pax_global_header00006660000000000000000000000064142727203060014515gustar00rootroot0000000000000052 comment=3bb42bc940a04ccc2a315ca3d68da61a2f97e36b cython-blis-0.9.1/000077500000000000000000000000001427272030600137575ustar00rootroot00000000000000cython-blis-0.9.1/.appveyor.yml000066400000000000000000000017121427272030600164260ustar00rootroot00000000000000environment: matrix: - BLIS_ARCH: "generic" - BLIS_ARCH: "x86_64" install: - git submodule update --init --recursive - cd flame-blis - set "CC=clang" - set "PATH=C:\msys64\mingw64\bin;C:\msys64\bin;%PATH%" - set "PATH=C:\Program Files\LLVM\bin;%PATH%" - set "AR=llvm-ar" - set "AS=llvm-as" - call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 build_script: - set RANLIB=echo - set LIBPTHREAD= - set "PATH=%PATH%;C:\blis\lib" - set "CFLAGS=-Wno-macro-redefined" - cd %APPVEYOR_BUILD_FOLDER% - cd flame-blis - bash -lc "ln -s $APPVEYOR_BUILD_FOLDER /c/projects/cython-blis" - bash -lc "cd /c/projects/cython-blis/ && ./bin/generate-make-jsonl windows $BLIS_ARCH --export" artifacts: - path: blis/_src/make name: windows-generic.jsonl - path: blis/_src/make name: windows-x86_64.jsonl - path: blis/_src/include/windows-generic name: blis.h - path: blis/_src/include/windows-x86_64 name: blis.h cython-blis-0.9.1/.gitignore000066400000000000000000000003331427272030600157460ustar00rootroot00000000000000.*.sw* blis/blis.c *.pyc *.so blis/cy.c blis/py.c .eggs .env/ env3.6 .hypothesis/ build/ cache/ __pycache__/ .python-version cythonize.dat dist/ .pytest_cache blis.egg-info/ tmp/ # Blis stuff blis/include .fragment.mk cython-blis-0.9.1/.gitmodules000066400000000000000000000001251427272030600161320ustar00rootroot00000000000000[submodule "flame-blis"] path = flame-blis url = https://github.com/explosion/blis cython-blis-0.9.1/.travis.yml000077500000000000000000000045431427272030600161010ustar00rootroot00000000000000sudo: required dist: focal env: global: - PLAT=x86_64 - UNICODE_WIDTH=32 - CC=gcc-9 matrix: 
include: - os: osx language: generic env: - MB_PYTHON_VERSION="3.7" - BLIS_ARCH="generic" - os: osx language: generic env: - MB_PYTHON_VERSION="3.7" - BLIS_ARCH="x86_64" - os: linux python: '3.7' env: - BLIS_ARCH="generic" - os: linux language: python python: '3.7' env: - BLIS_ARCH="x86_64" - os: linux language: python arch: arm64 python: '3.7' env: - BLIS_ARCH="cortexa57" - os: linux language: shell arch: ppc64le dist: focal env: - BLIS_ARCH="power9" before_install: - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then source ./bin/travis/before_install_osx.sh; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then source ./bin/travis/before_install_linux.sh; fi - before_install install: - python -m pip install -r requirements.txt - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then ./bin/generate-make-jsonl darwin $BLIS_ARCH --export; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./bin/generate-make-jsonl linux $BLIS_ARCH --export; fi script: - python setup.py build_ext --inplace - PYTHONPATH=`pwd` python -m pytest blis/ notifications: email: false slack: secure: VSqtxg7u4NTZRfoZqjxPRPVS92KTy/mp62egfDZ9ujTP4VPxNe15QZuTB6r/ICPgEYqBtdhLc/aetuBcemt0bHfentV0F7bz7iDY/AFQC1h1i4G0D0wKMufuqOJFw9MOp2tSpuvCVzhCxR+Ymx/F9SaeYBAiwBawce4wu+qu3lA= deploy: skip_cleanup: true provider: gcs access_key_id: GOOGAYJSXD24MLFQGHMJ6TQC secret_access_key: secure: 
8SbYhu799pawZfC0a/Jq7eQklvfRNn1hJRnuEEpRdBO6fnFNMeYtTaSb867dwNl00i4VuQAjfcE8RXleY3EeP18qtmqfknCnOLCrSHphqWCYo/nx2wx/zC0E1xC4pefB2sO9nHEuKQVsi5OziNXunWedTh7n6CANoLRJmiypflvlLcOYp5eCLUcsoDbOtb7m2DDYXiCe8NM3ymZ2k42GmXqV2pvx14b0kl6okmAZJ3IMqfRLMXow5TxXwZx/AwW/N3FpitbhOAM2t10MWEdP4egkZlS+b2QKKnwvkocXAXstjokLsYBei8/9/AA2+ldtzT4HiBv6osPy4Y3MB68uyy3x+Q/4PZv7plxP2UPspyUCUCeYeY8CU3S8+8EjQhZYRphx2CibBLCOOpC68GxDcxMjXAgdm0FW1MLpbp/1NJRHgPQrpFvnKjjt01ysha50UGppigX6ebvH5fz4IIhMTRzMEhchCZR4GZvfHx0RaVJz21M5ngLGBQaV7pp99wCy8g/vtztOzwIKVP0VuCl4n31/Cit8QzNIQOQ0YoHJO1alr3SGyXmwnxx0r4DtRsPB70cGq45d7TuMVi7qTe7/gvHCG5rwC1X5YNTYiUae92j9niMLBMeuD5bToAnJIMYIwllTgyDuyo+u1a+fN5jJTtWgK/dBhrXvPaBZovupsmU= bucket: cython-blis-artifacts local-dir: artifacts on: repo: explosion/cython-blis branch: master cython-blis-0.9.1/.vscode/000077500000000000000000000000001427272030600153205ustar00rootroot00000000000000cython-blis-0.9.1/.vscode/settings.json000066400000000000000000000000571427272030600200550ustar00rootroot00000000000000{ "python.pythonPath": ".env/bin/python3" }cython-blis-0.9.1/LICENSE000066400000000000000000000037611427272030600147730ustar00rootroot00000000000000NOTE: Portions of this project's code are copyrighted by The University of Texas at Austin while other portions are copyrighted by Hewlett Packard Enterprise Development LP Advanced Micro Devices, Inc. ExplosionAI GmbH with some overlap. Please see file-level license headers for file-specific copyright info. All parties provide their portions of the code under the 3-clause BSD license, found below. --- Copyright (C) 2018, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cython-blis-0.9.1/MANIFEST.in000066400000000000000000000002521427272030600155140ustar00rootroot00000000000000include LICENSE include README.md exclude blis/cy.c exclude blis/py.c recursive-include blis/_src *.c recursive-include blis/_src *.h recursive-include blis/_src *.jsonl cython-blis-0.9.1/README.md000066400000000000000000000150501427272030600152370ustar00rootroot00000000000000 # Cython BLIS: Fast BLAS-like operations from Python and Cython, without the tears This repository provides the [Blis linear algebra](https://github.com/flame/blis) routines as a self-contained Python C-extension. Currently, we only supports single-threaded execution, as this is actually best for our workloads (ML inference). 
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/6/master.svg?logo=azure-pipelines&style=flat-square)](https://dev.azure.com/explosion-ai/public/_build?definitionId=6) [![pypi Version](https://img.shields.io/pypi/v/blis.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.python.org/pypi/blis) [![conda](https://img.shields.io/conda/vn/conda-forge/cython-blis.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/cython-blis) [![Python wheels](https://img.shields.io/badge/wheels-%E2%9C%93-4c1.svg?longCache=true&style=flat-square&logo=python&logoColor=white)](https://github.com/explosion/wheelwright/releases) ## Installation You can install the package via pip, first making sure that `pip`, `setuptools`, and `wheel` are up-to-date: ```bash pip install -U pip setuptools wheel pip install blis ``` Wheels should be available, so installation should be fast. If you want to install from source and you're on Windows, you'll need to install LLVM. ### Building BLIS for alternative architectures The provided wheels should work on x86_64 and osx/arm64 architectures. Unfortunately we do not currently know a way to provide different wheels for alternative architectures, and we cannot provide a single binary that works everywhere. So if the wheel doesn't work for your CPU, you'll need to specify source distribution, and tell Blis your CPU architecture using the `BLIS_ARCH` environment variable. #### a) Install with auto-detected CPU support ```bash pip install spacy --no-binary blis ``` #### b) Install using an existing configuration Provide an architecture from the [supported configurations](https://github.com/explosion/cython-blis/tree/v0.9.0/blis/_src/make). ```bash BLIS_ARCH="power9" pip install spacy --no-binary blis ``` #### c) Install with generic arch support > ⚠️ `generic` is not optimized for any particular CPU and is extremely slow. Only recommended for testing! 
```bash BLIS_ARCH="generic" pip install spacy --no-binary blis ``` #### d) Build specific support In order to compile Blis, `cython-blis` bundles makefile scripts for specific architectures, that are compiled by running the Blis build system and logging the commands. We do not yet have logs for every architecture, as there are some architectures we have not had access to. [See here](https://github.com/flame/blis/blob/0.9.0/config_registry) for list of architectures. For example, here's how to build support for the Intel architecture `knl`: ```bash git clone https://github.com/explosion/cython-blis && cd cython-blis git pull && git submodule init && git submodule update && git submodule status python3 -m venv venv source venv/bin/activate pip install -U pip setuptools wheel pip install -r requirements.txt ./bin/generate-make-jsonl linux knl BLIS_ARCH="knl" python setup.py build_ext --inplace BLIS_ARCH="knl" python setup.py bdist_wheel ``` Fingers crossed, this will build you a wheel that supports your platform. You could then [submit a PR](https://github.com/explosion/cython-blis/pulls) with the `blis/_src/make/linux-knl.jsonl` and `blis/_src/include/linux-knl/blis.h` files so that you can run: ```bash BLIS_ARCH="knl" pip install --no-binary=blis ``` ## Usage Two APIs are provided: a high-level Python API, and direct [Cython](http://cython.org) access, which provides fused-type, nogil Cython bindings to the underlying Blis linear algebra library. Fused types are a simple template mechanism, allowing just a touch of compile-time generic programming: ```python cimport blis.cy A = calloc(nN * nI, sizeof(float)) B = calloc(nO * nI, sizeof(float)) C = calloc(nr_b0 * nr_b1, sizeof(float)) blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.NO_TRANSPOSE, nO, nI, nN, 1.0, A, nI, 1, B, nO, 1, 1.0, C, nO, 1) ``` Bindings have been added as we've needed them. Please submit pull requests if the library is missing some functions you require. 
## Development To build the source package, you should run the following command: ```bash ./bin/update-vendored-source ``` This populates the `blis/_src` folder for the various architectures, using the `flame-blis` submodule. ## Updating the build files In order to compile the Blis sources, we use jsonl files that provide the explicit compiler flags. We build these jsonl files by running Blis's build system, and then converting the log. This avoids us having to replicate the build system within Python: we just use the jsonl to make a bunch of subprocess calls. To support a new OS/architecture combination, we have to provide the jsonl file and the header. ### Linux The Linux build files need to be produced from within the manylinux2014 Docker container, so that they will be compatible with the wheel building process. First, install docker. Then do the following to start the container: sudo docker run -it quay.io/pypa/manylinux2014_x86_64:latest Once within the container, the following commands should check out the repo and build the jsonl files for the generic arch: mkdir /usr/local/repos cd /usr/local/repos git clone https://github.com/explosion/cython-blis && cd cython-blis git pull && git submodule init && git submodule update && git submodule status /opt/python/cp36-cp36m/bin/python -m venv env3.6 source env3.6/bin/activate pip install -r requirements.txt ./bin/generate-make-jsonl linux generic --export BLIS_ARCH=generic python setup.py build_ext --inplace # N.B.: don't copy to /tmp, docker cp doesn't work from there. cp blis/_src/include/linux-generic/blis.h /linux-generic-blis.h cp blis/_src/make/linux-generic.jsonl / Then from a new terminal, retrieve the two files we need out of the container: sudo docker ps -l # Get the container ID # When I'm in Vagrant, I need to go via cat -- but then I end up with dummy # lines at the top and bottom. Sigh. If you don't have that problem and # sudo docker cp just works, just copy the file. 
sudo docker cp aa9d42588791:/linux-generic-blis.h - | cat > linux-generic-blis.h sudo docker cp aa9d42588791:/linux-generic.jsonl - | cat > linux-generic.jsonl cython-blis-0.9.1/azure-pipelines.yml000066400000000000000000000154371427272030600176300ustar00rootroot00000000000000trigger: batch: true branches: include: - '*' jobs: - job: 'JSONL' # Manually enable for generating JSONL condition: false strategy: matrix: Python38Mac: imageName: 'macos-latest' python.version: '3.8' Python38Windows: imageName: 'windows-latest' python.version: '3.8' maxParallel: 4 pool: vmImage: $(imageName) steps: - task: UsePythonVersion@0 inputs: versionSpec: '$(python.version)' architecture: 'x64' - script: choco install llvm condition: eq( variables['Agent.OS'], 'Windows_NT') displayName: 'Preinstall (Windows)' - script: git config --global core.autocrlf false displayName: 'Disable automatic crlf conversion' - script: git submodule update --init --recursive displayName: 'Update git modules' - script: | python -m pip install --upgrade pip wheel setuptools pip install -r requirements.txt displayName: 'Install dependencies' # Set the correct paths and includes. Only the env variables set here are # used, not the ones defined in the .jsonl. 
- script: | set "PATH=C:\Program Files\LLVM\bin;%PATH%" set "AR=llvm-ar" set "AS=llvm-as" set "CC=clang" set RANLIB=echo call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 clang --version bash -lc "./bin/generate-make-jsonl windows generic --export" bash -lc "./bin/generate-make-jsonl windows x86_64 --export" condition: eq( variables['Agent.OS'], 'Windows_NT') displayName: 'Generate JSONL (Windows)' - script: | bin/generate-make-jsonl darwin generic --export bin/generate-make-jsonl darwin x86_64 --export bin/generate-make-jsonl darwin x86_64_no_zen3 --export bin/generate-make-jsonl darwin x86_64_no_zen2 --export bin/generate-make-jsonl darwin x86_64_no_skx --export condition: eq(variables['Agent.OS'], 'Darwin') displayName: 'Generate JSONL (Mac)' - publish: $(System.DefaultWorkingDirectory)/artifacts artifact: '$(Agent.JobName)' - job: 'Test' strategy: matrix: Python27Linux: imageName: 'ubuntu-18.04' python.version: '2.7' Python27Mac: imageName: 'macos-10.15' python.version: '2.7' Python36Linux: imageName: 'ubuntu-18.04' python.version: '3.6' Python36Mac: imageName: 'macos-10.15' python.version: '3.6' Python36Windows: imageName: 'windows-2019' python.version: '3.6' Python37Linux: imageName: 'ubuntu-18.04' python.version: '3.7' Python37Mac: imageName: 'macos-latest' python.version: '3.7' Python37Windows: imageName: 'windows-latest' python.version: '3.7' Python38Linux: imageName: 'ubuntu-latest' python.version: '3.8' Python38Mac: imageName: 'macos-latest' python.version: '3.8' Python38Windows: imageName: 'windows-latest' python.version: '3.8' Python39Linux: imageName: 'ubuntu-latest' python.version: '3.9' Python39Mac: imageName: 'macos-latest' python.version: '3.9' Python39Windows: imageName: 'windows-latest' python.version: '3.9' Python310Linux: imageName: 'ubuntu-latest' python.version: '3.10' Python310Mac: imageName: 'macos-latest' python.version: '3.10' Python310Windows: imageName: 'windows-latest' python.version: '3.10' 
maxParallel: 4 pool: vmImage: $(imageName) steps: - task: UsePythonVersion@0 inputs: versionSpec: '$(python.version)' architecture: 'x64' - script: choco install llvm condition: eq( variables['Agent.OS'], 'Windows_NT') displayName: 'Preinstall (Windows)' - script: | python -m pip install --upgrade pip wheel setuptools pip install -r requirements.txt displayName: 'Install dependencies' # Set the correct paths and includes. Only the env variables set here are # used, not the ones defined in the .jsonl. - script: | set "PATH=C:\Program Files\LLVM\bin;%PATH%" set "AR=llvm-ar" set "AS=llvm-as" set "CC=clang" set RANLIB=echo call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 clang --version python setup.py bdist_wheel condition: eq( variables['Agent.OS'], 'Windows_NT') displayName: 'Build wheel (Windows)' - script: python setup.py bdist_wheel condition: eq(variables['Agent.OS'], 'Darwin') displayName: 'Build wheel (Mac)' - script: | gcc --version python setup.py bdist_wheel condition: and(eq(variables['Agent.OS'], 'Linux'), or(eq(variables['python.version'], '2.7'), eq(variables['python.version'], '3.5'))) displayName: 'Build wheel (Linux)' - script: | clang --version CC=clang python setup.py bdist_wheel condition: and(eq(variables['Agent.OS'], 'Linux'), eq(variables['python.version'], '3.6')) displayName: 'Build wheel (Linux / clang)' - script: | gcc-7 --version CC=gcc-7 python setup.py bdist_wheel condition: and(eq(variables['Agent.OS'], 'Linux'), eq(variables['python.version'], '3.7')) displayName: 'Build wheel (Linux / gcc-7)' - script: | gcc-9 --version CC=gcc-9 python setup.py bdist_wheel condition: and(eq(variables['Agent.OS'], 'Linux'), eq(variables['python.version'], '3.8')) displayName: 'Build wheel (Linux / gcc-9)' - script: | gcc-10 --version CC=gcc-10 python setup.py bdist_wheel condition: and(eq(variables['Agent.OS'], 'Linux'), eq(variables['python.version'], '3.9')) displayName: 'Build wheel (Linux / gcc-10)' - script: | 
clang-12 --version CC=clang-12 python setup.py bdist_wheel condition: and(eq(variables['Agent.OS'], 'Linux'), eq(variables['python.version'], '3.10')) displayName: 'Build wheel (Linux / clang-12)' - task: PythonScript@0 inputs: scriptSource: inline script: | # https://github.com/pypa/pip/issues/6951 import os wheel_file = os.listdir('./dist')[0] wheel_path = os.path.join('./dist', wheel_file) os.rename(wheel_path, wheel_path.replace("cp38m-win", "cp38-win")) print(os.listdir('./dist')) failOnStderr: true condition: and(eq(variables['Agent.OS'], 'Windows_NT'), eq(variables['python.version'], '3.8')) displayName: Fix wheel name (Windows, Python 3.8) - script: | pip freeze > installed.txt pip uninstall -y -r installed.txt displayName: 'Uninstall all packages' - bash: | rm -rf blis* python -m pip install dist/*.whl displayName: 'Install wheel' - script: | pip install -r requirements.txt python -m pytest --pyargs blis displayName: 'Run tests' cython-blis-0.9.1/bin/000077500000000000000000000000001427272030600145275ustar00rootroot00000000000000cython-blis-0.9.1/bin/.appveyor_compile_jsonl.yml000066400000000000000000000040411427272030600221110ustar00rootroot00000000000000#environment: # # matrix: # - PYTHON: "/c/Python35-x64" # - PYTHON: "/c/Python36-x64" # - PYTHON: "/c/Python37-x64" install: - git submodule update --init --recursive - cd flame-blis - set "CC=clang" - set "PATH=C:\msys64\mingw64\bin;C:\msys64\bin;%PATH%" - set "PATH=C:\Program Files\LLVM\bin;%PATH%" - set "AR=llvm-ar" - set "AS=llvm-as" - call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 build_script: - set RANLIB=echo - set LIBPTHREAD= - set "PATH=%PATH%;C:\blis\lib" - set "CFLAGS=-Wno-macro-redefined" - cd %APPVEYOR_BUILD_FOLDER% - cd flame-blis - bash -lc "ln -s $APPVEYOR_BUILD_FOLDER /c/projects/cython-blis" - bash -lc "cd /c/projects/cython-blis/flame-blis && ./configure --disable-shared --disable-cblas --disable-blas --disable-threading --enable-verbose-make 
--enable-arg-max-hack --prefix=/c/blis x86_64" - bash -lc "cd /c/projects/cython-blis/flame-blis && mingw32-make -j4 > make.log" - bash -lc "mkdir -p /c/projects/cython-blis/blis/_src/include/windows-x86_64" - bash -lc "cd /c/projects/cython-blis && cat flame-blis/make.log | python bin/munge_make_log.py windows x86_64 > blis/_src/make/windows-x86_64.jsonl" - bash -lc "cp /c/projects/cython-blis/flame-blis/include/x86_64/blis.h /c/projects/cython-blis/blis/_src/include/windows-x86_64/blis.h" - bash -lc "cp /c/projects/cython-blis/blis/_src/make/windows-x86_64.jsonl $APPVEYOR_BUILD_FOLDER/blis/_src/make/windows-x86_64.jsonl" - bash -lc "mkdir -p $APPVEYOR_BUILD_FOLDER/blis/_src/include/windows-x86_64" - bash -lc "cp /c/projects/cython-blis/blis/_src/include/windows-x86_64/blis.h $APPVEYOR_BUILD_FOLDER/blis/_src/include/windows-x86_64/blis.h" #- python -m pip install -U pip wheel #- python -m pip install -r requirements.txt #- python setup.py bdist_wheel #- cd .. #- bash -lc "cp -r $APPVEYOR_BUILD_FOLDER /c/build" #- bash -lc "python -m pip install /c/build/dist/*.whl" #test_script: #- python -m pytest --pyargs blis artifacts: - path: blis/_src/make name: windows-x86_64.jsonl - path: blis/_src/include/windows-x86_64 name: blis.h cython-blis-0.9.1/bin/.appveyor_run_tests.yml000066400000000000000000000007561427272030600213130ustar00rootroot00000000000000environment: matrix: - PYTHON: "C:\\Python35-x64" - PYTHON: "C:\\Python36-x64" - PYTHON: "C:\\Python37-x64" install: - set "PATH=%PYTHON%;%path%" build_script: - python -m pip install -U pip wheel - python -m pip install -r requirements.txt - python setup.py bdist_wheel - cd .. 
- bash -lc "cp -r $APPVEYOR_BUILD_FOLDER /c/build" - bash -lc "python -m pip install /c/build/dist/*.whl" test_script: - python -m pytest --pyargs blis artifacts: - path: dist/ name: wheels cython-blis-0.9.1/bin/.update_blis_travis.yml000077500000000000000000000043251427272030600212220ustar00rootroot00000000000000language: python sudo: required dist: xenial env: global: - PLAT=x86_64 - UNICODE_WIDTH=32 matrix: include: - os: osx language: generic env: - MB_PYTHON_VERSION="2.7" - os: osx language: generic env: - MB_PYTHON_VERSION="3.5" - os: osx language: generic env: - MB_PYTHON_VERSION="3.6" - os: osx language: generic env: - MB_PYTHON_VERSION="3.7" - os: linux python: '2.7' - os: linux python: '3.5' - os: linux python: '3.6' - os: linux python: '3.7' before_install: - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then source ./travis/before_install_osx.sh; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then source ./travis/before_install_linux.sh; fi - before_install install: - python -m pip install -r requirements.txt - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then ./bin/generate-make-jsonl darwin x86_64 --export; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./bin/generate-make-jsonl linux x86_64 --export; fi - python setup.py bdist_wheel - rm -rf blis* - python -m pip install dist/*.whl script: - python -m pytest tests/ notifications: email: false slack: secure: VSqtxg7u4NTZRfoZqjxPRPVS92KTy/mp62egfDZ9ujTP4VPxNe15QZuTB6r/ICPgEYqBtdhLc/aetuBcemt0bHfentV0F7bz7iDY/AFQC1h1i4G0D0wKMufuqOJFw9MOp2tSpuvCVzhCxR+Ymx/F9SaeYBAiwBawce4wu+qu3lA= deploy: skip_cleanup: true provider: gcs access_key_id: GOOGAYJSXD24MLFQGHMJ6TQC secret_access_key: secure: 
8SbYhu799pawZfC0a/Jq7eQklvfRNn1hJRnuEEpRdBO6fnFNMeYtTaSb867dwNl00i4VuQAjfcE8RXleY3EeP18qtmqfknCnOLCrSHphqWCYo/nx2wx/zC0E1xC4pefB2sO9nHEuKQVsi5OziNXunWedTh7n6CANoLRJmiypflvlLcOYp5eCLUcsoDbOtb7m2DDYXiCe8NM3ymZ2k42GmXqV2pvx14b0kl6okmAZJ3IMqfRLMXow5TxXwZx/AwW/N3FpitbhOAM2t10MWEdP4egkZlS+b2QKKnwvkocXAXstjokLsYBei8/9/AA2+ldtzT4HiBv6osPy4Y3MB68uyy3x+Q/4PZv7plxP2UPspyUCUCeYeY8CU3S8+8EjQhZYRphx2CibBLCOOpC68GxDcxMjXAgdm0FW1MLpbp/1NJRHgPQrpFvnKjjt01ysha50UGppigX6ebvH5fz4IIhMTRzMEhchCZR4GZvfHx0RaVJz21M5ngLGBQaV7pp99wCy8g/vtztOzwIKVP0VuCl4n31/Cit8QzNIQOQ0YoHJO1alr3SGyXmwnxx0r4DtRsPB70cGq45d7TuMVi7qTe7/gvHCG5rwC1X5YNTYiUae92j9niMLBMeuD5bToAnJIMYIwllTgyDuyo+u1a+fN5jJTtWgK/dBhrXvPaBZovupsmU= bucket: cython-blis-artifacts local-dir: artifacts on: repo: explosion/cython-blis branch: update-blis cython-blis-0.9.1/bin/generate-make-jsonl000077500000000000000000000020311427272030600203010ustar00rootroot00000000000000#!/usr/bin/env bash set -e OS="$1" ARCH="$2" EXPORT="$3" JSONL="blis/_src/make/$OS-$ARCH.jsonl" cd flame-blis if [ ! 
-f $JSONL ]; then echo "Compile" if [[ "$OS" == "windows" ]]; then mingw32-make clean ./configure --disable-blas --disable-cblas --disable-shared --disable-threading --int-size=64 --enable-verbose-make --enable-arg-max-hack $ARCH mingw32-make -j 4 > make.log else make clean ./configure --disable-blas --disable-cblas --disable-shared --disable-threading --int-size=64 --enable-verbose-make --export-shared=all $ARCH make > make.log fi echo "Preprocess make log" cat make.log | python ../bin/munge_make_log.py $OS $ARCH > ../$JSONL mkdir -p ../blis/_src/include/$OS-$ARCH/ cp include/$ARCH/blis.h ../blis/_src/include/$OS-$ARCH/blis.h fi if [[ "$EXPORT" == "--export" ]]; then mkdir -p ../artifacts/ cp ../blis/_src/include/$OS-$ARCH/blis.h ../artifacts/blis-$OS-$ARCH.h cp ../blis/_src/make/$OS-$ARCH.jsonl ../artifacts/$OS-$ARCH.jsonl; fi cython-blis-0.9.1/bin/munge_make_log.py000066400000000000000000000021471427272030600200560ustar00rootroot00000000000000import os import sys import json os_name = sys.argv[1] arch_name = sys.argv[2] print(json.dumps({"environment": dict(os.environ)})) for line in sys.stdin: if 'flatten-headers.py' in line: continue line = line.replace('include/' + arch_name, 'include/' + os_name + '-' + arch_name) pieces = line.split() args = {} flags = [] macros = [] includes = [] for i, piece in enumerate(pieces): if i == 0: args['compiler'] = piece elif piece == '-c': args['source'] = pieces[i+1] elif piece == '-o': args['target'] = pieces[i+1] elif piece.startswith('-f') or piece.startswith('-m') or piece.startswith('-O'): flags.append(piece) elif piece.startswith('-std'): flags.append(piece) elif piece.startswith('-D'): macros.append(piece.replace('\\', '')) elif piece.startswith('-I'): includes.append(piece) if 'source' in args: args['flags'] = flags args['macros'] = macros args['include'] = includes print(json.dumps(args)) cython-blis-0.9.1/bin/push-tag.sh000077500000000000000000000005621427272030600166210ustar00rootroot00000000000000#!/usr/bin/env 
bash set -e # Insist repository is clean git diff-index --quiet HEAD git checkout $1 git pull origin $1 git push origin $1 version=$(grep "__version__ = " blis/about.py) version=${version/__version__ = } version=${version/\'/} version=${version/\'/} version=${version/\"/} version=${version/\"/} git tag "v$version" git push origin "v$version" --tags cython-blis-0.9.1/bin/travis/000077500000000000000000000000001427272030600160375ustar00rootroot00000000000000cython-blis-0.9.1/bin/travis/before_install_linux.sh000077500000000000000000000001541427272030600226050ustar00rootroot00000000000000#!/usr/bin/env bash set -e function before_install { local passed=1 sudo apt-get install python-dev } cython-blis-0.9.1/bin/travis/before_install_osx.sh000077500000000000000000000233171427272030600222650ustar00rootroot00000000000000#!/bin/bash # Use with ``source osx_utils.sh`` set -e # Get our own location on this filesystem, load common utils MULTIBUILD_DIR=$(dirname "${BASH_SOURCE[0]}") source $MULTIBUILD_DIR/common_utils.sh export MACOSX_DEPLOYMENT_TARGET=10.7 MACPYTHON_URL=https://www.python.org/ftp/python MACPYTHON_PY_PREFIX=/Library/Frameworks/Python.framework/Versions GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py DOWNLOADS_SDIR=downloads WORKING_SDIR=working # As of 28 June 2018 - latest Python of each version with binary download # available. 
# See: https://www.python.org/downloads/mac-osx/ LATEST_2p7=2.7.15 LATEST_2p6=2.6.6 LATEST_3p2=3.2.5 LATEST_3p3=3.3.5 LATEST_3p4=3.4.4 LATEST_3p5=3.5.4 LATEST_3p6=3.6.6 LATEST_3p7=3.7.0 function check_python { if [ -z "$PYTHON_EXE" ]; then echo "PYTHON_EXE variable not defined" exit 1 fi } function check_pip { if [ -z "$PIP_CMD" ]; then echo "PIP_CMD variable not defined" exit 1 fi } function check_var { if [ -z "$1" ]; then echo "required variable not defined" exit 1 fi } function get_py_digit { check_python $PYTHON_EXE -c "import sys; print(sys.version_info[0])" } function get_py_mm { check_python $PYTHON_EXE -c "import sys; print('{0}.{1}'.format(*sys.version_info[0:2]))" } function get_py_mm_nodot { check_python $PYTHON_EXE -c "import sys; print('{0}{1}'.format(*sys.version_info[0:2]))" } function get_py_prefix { check_python $PYTHON_EXE -c "import sys; print(sys.prefix)" } function fill_pyver { # Convert major or major.minor format to major.minor.micro # # Hence: # 2 -> 2.7.11 (depending on LATEST_2p7 value) # 2.7 -> 2.7.11 (depending on LATEST_2p7 value) local ver=$1 check_var $ver if [[ $ver =~ [0-9]+\.[0-9]+\.[0-9]+ ]]; then # Major.minor.micro format already echo $ver elif [ $ver == 2 ] || [ $ver == "2.7" ]; then echo $LATEST_2p7 elif [ $ver == "2.6" ]; then echo $LATEST_2p6 elif [ $ver == 3 ] || [ $ver == "3.7" ]; then echo $LATEST_3p7 elif [ $ver == "3.6" ]; then echo $LATEST_3p6 elif [ $ver == "3.5" ]; then echo $LATEST_3p5 elif [ $ver == "3.4" ]; then echo $LATEST_3p4 elif [ $ver == "3.3" ]; then echo $LATEST_3p3 elif [ $ver == "3.2" ]; then echo $LATEST_3p2 else echo "Can't fill version $ver" 1>&2 exit 1 fi } function pyinst_ext_for_version { # echo "pkg" or "dmg" depending on the passed Python version # Parameters # $py_version (python version in major.minor.extra format) # # Earlier Python installers are .dmg, later are .pkg. 
local py_version=$1 check_var $py_version py_version=$(fill_pyver $py_version) local py_0=${py_version:0:1} if [ $py_0 -eq 2 ]; then if [ "$(lex_ver $py_version)" -ge "$(lex_ver 2.7.9)" ]; then echo "pkg" else echo "dmg" fi elif [ $py_0 -ge 3 ]; then if [ "$(lex_ver $py_version)" -ge "$(lex_ver 3.4.2)" ]; then echo "pkg" else echo "dmg" fi fi } function pyinst_fname_for_version { # echo filename for OSX installer file given Python version # Parameters # $py_version (python version in major.minor.extra format) local py_version=$1 local inst_ext=$(pyinst_ext_for_version $py_version) # Python 2.6 has OSX 10.3 suffix if [ "$(lex_ver $py_version)" -le "$(lex_ver 2.6.6)" ]; then local osx_ver=10.3 else local osx_ver=10.6 fi echo "python-$py_version-macosx${osx_ver}.$inst_ext" } function install_mac_cpython { # Installs Python.org Python # Parameter $version # Version given in major or major.minor or major.minor.micro e.g # "3" or "3.4" or "3.4.1". # sets $PYTHON_EXE variable to python executable local py_version=$(fill_pyver $1) local py_stripped=$(strip_ver_suffix $py_version) local py_inst=$(pyinst_fname_for_version $py_version) local inst_path=$DOWNLOADS_SDIR/$py_inst mkdir -p $DOWNLOADS_SDIR curl $MACPYTHON_URL/$py_stripped/${py_inst} > $inst_path if [ "${py_inst: -3}" == "dmg" ]; then hdiutil attach $inst_path -mountpoint /Volumes/Python inst_path=/Volumes/Python/Python.mpkg fi sudo installer -pkg $inst_path -target / local py_mm=${py_version:0:3} PYTHON_EXE=$MACPYTHON_PY_PREFIX/$py_mm/bin/python$py_mm # Install certificates for Python 3.6 local inst_cmd="/Applications/Python ${py_mm}/Install Certificates.command" if [ -e "$inst_cmd" ]; then sh "$inst_cmd" fi } function install_pip { # Generic install pip # Gets needed version from version implied by $PYTHON_EXE # Installs pip into python given by $PYTHON_EXE # Assumes pip will be installed into same directory as $PYTHON_EXE check_python mkdir -p $DOWNLOADS_SDIR curl $GET_PIP_URL > $DOWNLOADS_SDIR/get-pip.py # 
Python 2.6 will fail SSL check local py_mm=`get_py_mm` if [ "$py_mm" == "2.6" ]; then local pip_args="--trusted-host=pypi.org" fi # Travis VMS now install pip for system python by default - force install # even if installed already. sudo $PYTHON_EXE $DOWNLOADS_SDIR/get-pip.py --ignore-installed $pip_args PIP_CMD="sudo $(dirname $PYTHON_EXE)/pip$py_mm" # Append pip_args if present (avoiding trailing space cf using variable # above). if [ -n "$pip_args" ]; then PIP_CMD="$PIP_CMD $pip_args" fi } function install_virtualenv { # Generic install of virtualenv # Installs virtualenv into python given by $PYTHON_EXE # Assumes virtualenv will be installed into same directory as $PYTHON_EXE check_pip # Travis VMS install virtualenv for system python by default - force # install even if installed already $PIP_CMD install virtualenv --ignore-installed check_python VIRTUALENV_CMD="$(dirname $PYTHON_EXE)/virtualenv" } function make_workon_venv { # Make a virtualenv in given directory ('venv' default) # Set $PYTHON_EXE, $PIP_CMD to virtualenv versions # Parameter $venv_dir # directory for virtualenv local venv_dir=$1 if [ -z "$venv_dir" ]; then venv_dir="venv" fi venv_dir=`abspath $venv_dir` check_python $PYTHON_EXE -m virtualenv $venv_dir PYTHON_EXE=$venv_dir/bin/python PIP_CMD=$venv_dir/bin/pip } function remove_travis_ve_pip { # Remove travis installs of virtualenv and pip # FIXME: What if virtualenv is installed but pip is not? 
if [ "$(sudo which virtualenv)" == /usr/local/bin/virtualenv ] && [ "$(sudo which pip)" == /usr/local/bin/pip ]; then sudo pip uninstall -y virtualenv; fi if [ "$(sudo which pip)" == /usr/local/bin/pip ]; then sudo pip uninstall -y pip; fi } function set_py_vars { # Used by terryfy project; left here for back-compatibility export PATH="`dirname $PYTHON_EXE`:$PATH" export PYTHON_EXE PIP_CMD } function get_macpython_environment { # Set up MacPython environment # Parameters: # $version : [implementation-]major[.minor[.patch]] # The Python implementation to install, e.g. "3.6" or "pypy-5.4" # $venv_dir : {directory_name|not defined} # If defined - make virtualenv in this directory, set python / pip # commands accordingly # # Installs Python # Sets $PYTHON_EXE to path to Python executable # Sets $PIP_CMD to full command for pip (including sudo if necessary) # If $venv_dir defined, Sets $VIRTUALENV_CMD to virtualenv executable # Puts directory of $PYTHON_EXE on $PATH local version=$1 local venv_dir=$2 if [ "$USE_CCACHE" == "1" ]; then activate_ccache fi remove_travis_ve_pip install_mac_cpython $version install_pip if [ -n "$venv_dir" ]; then install_virtualenv make_workon_venv $venv_dir source $venv_dir/bin/activate else export PATH="`dirname $PYTHON_EXE`:$PATH" fi export PYTHON_EXE PIP_CMD } function install_delocate { check_pip if [ $(lex_ver $(get_py_mm)) -lt $(lex_ver 2.7) ]; then # Wheel 0.30 doesn't work for Python 2.6; see: # https://github.com/pypa/wheel/issues/193 $PIP_CMD install "wheel<=0.29" fi $PIP_CMD install delocate } function repair_wheelhouse { local wheelhouse=$1 install_delocate delocate-wheel $wheelhouse/*.whl # copies library dependencies into wheel # Add platform tags to label wheels as compatible with OSX 10.9 and # 10.10. The wheels will be built against Python.org Python, and so will # in fact be compatible with OSX >= 10.6. 
pip < 6.0 doesn't realize # this, so, in case users have older pip, add platform tags to specify # compatibility with later OSX. Not necessary for OSX released well # after pip 6.0. See: # https://github.com/MacPython/wiki/wiki/Spinning-wheels#question-will-pip-give-me-a-broken-wheel delocate-addplat --rm-orig -x 10_9 -x 10_10 $wheelhouse/*.whl } function install_pkg_config { # Install pkg-config avoiding error from homebrew # See : # https://github.com/matthew-brett/multibuild/issues/24#issue-221951587 command -v pkg-config > /dev/null 2>&1 || brew install pkg-config } function activate_ccache { brew install ccache export PATH=/usr/local/opt/ccache/libexec:$PATH export CCACHE_CPP2=1 # Prove to the developer that ccache is activated echo "Using C compiler: $(which clang)" } function before_install { # Uninstall oclint. See Travis-CI gh-8826 brew cask uninstall oclint || true export CC=clang export CXX=clang++ get_macpython_environment $MB_PYTHON_VERSION venv source venv/bin/activate pip install --upgrade pip wheel } cython-blis-0.9.1/bin/travis/common_utils.sh000077500000000000000000000316101427272030600211070ustar00rootroot00000000000000#!/bin/bash # Utilities for both OSX and Docker Linux # Python should be on the PATH # Only source common_utils once if [ -n "$COMMON_UTILS_SOURCED" ]; then return fi COMMON_UTILS_SOURCED=1 # Turn on exit-if-error set -e MULTIBUILD_DIR=$(dirname "${BASH_SOURCE[0]}") if [ $(uname) == "Darwin" ]; then IS_OSX=1; fi # Work round bug in travis xcode image described at # https://github.com/direnv/direnv/issues/210 shell_session_update() { :; } # Workaround for https://github.com/travis-ci/travis-ci/issues/8703 # suggested by Thomas K at # https://github.com/travis-ci/travis-ci/issues/8703#issuecomment-347881274 unset -f cd unset -f pushd unset -f popd function start_spinner { if [ -n "$MB_SPINNER_PID" ]; then return fi >&2 echo "Building libraries..." 
# Start a process that runs as a keep-alive # to avoid travis quitting if there is no output (while true; do sleep 60 >&2 echo "Still building..." done) & MB_SPINNER_PID=$! disown } function stop_spinner { if [ ! -n "$MB_SPINNER_PID" ]; then return fi kill $MB_SPINNER_PID unset MB_SPINNER_PID >&2 echo "Building libraries finished." } function abspath { python -c "import os.path; print(os.path.abspath('$1'))" } function relpath { # Path of first input relative to second (or $PWD if not specified) python -c "import os.path; print(os.path.relpath('$1','${2:-$PWD}'))" } function realpath { python -c "import os; print(os.path.realpath('$1'))" } function lex_ver { # Echoes dot-separated version string padded with zeros # Thus: # 3.2.1 -> 003002001 # 3 -> 003000000 echo $1 | awk -F "." '{printf "%03d%03d%03d", $1, $2, $3}' } function unlex_ver { # Reverses lex_ver to produce major.minor.micro # Thus: # 003002001 -> 3.2.1 # 003000000 -> 3.0.0 echo "$((10#${1:0:3}+0)).$((10#${1:3:3}+0)).$((10#${1:6:3}+0))" } function strip_ver_suffix { echo $(unlex_ver $(lex_ver $1)) } function is_function { # Echo "true" if input argument string is a function # Allow errors during "set -e" blocks. (set +e; $(declare -Ff "$1" > /dev/null) && echo true) } function gh-clone { git clone https://github.com/$1 } function set_opts { # Set options from input options string (in $- format). local opts=$1 local chars="exhimBH" for (( i=0; i<${#chars}; i++ )); do char=${chars:$i:1} [ -n "${opts//[^${char}]/}" ] && set -$char || set +$char done } function suppress { # Run a command, show output only if return code not 0. # Takes into account state of -e option. 
# Compare # https://unix.stackexchange.com/questions/256120/how-can-i-suppress-output-only-if-the-command-succeeds#256122 # Set -e stuff agonized over in # https://unix.stackexchange.com/questions/296526/set-e-in-a-subshell local tmp=$(mktemp tmp.XXXXXXXXX) || return local opts=$- echo "Running $@" set +e ( set_opts $opts ; $@ > "$tmp" 2>&1 ) ; ret=$? [ "$ret" -eq 0 ] || cat "$tmp" rm -f "$tmp" set_opts $opts return "$ret" } function rm_mkdir { # Remove directory if present, then make directory local path=$1 if [ -z "$path" ]; then echo "Need not-empty path"; exit 1; fi if [ -d "$path" ]; then rm -rf $path; fi mkdir $path } function untar { local in_fname=$1 if [ -z "$in_fname" ];then echo "in_fname not defined"; exit 1; fi local extension=${in_fname##*.} case $extension in tar) tar -xf $in_fname ;; gz|tgz) tar -zxf $in_fname ;; bz2) tar -jxf $in_fname ;; zip) unzip -qq $in_fname ;; xz) unxz -c $in_fname | tar -xf ;; *) echo Did not recognize extension $extension; exit 1 ;; esac } function install_rsync { if [ -z "$IS_OSX" ]; then [[ $(type -P rsync) ]] || yum install -y rsync fi } function fetch_unpack { # Fetch input archive name from input URL # Parameters # url - URL from which to fetch archive # archive_fname (optional) archive name # # Echos unpacked directory and file names. # # If `archive_fname` not specified then use basename from `url` # If `archive_fname` already present at download location, use that instead. local url=$1 if [ -z "$url" ];then echo "url not defined"; exit 1; fi local archive_fname=${2:-$(basename $url)} local arch_sdir="${ARCHIVE_SDIR:-archives}" # Make the archive directory in case it doesn't exist mkdir -p $arch_sdir local out_archive="${arch_sdir}/${archive_fname}" # If the archive is not already in the archives directory, get it. if [ ! -f "$out_archive" ]; then # Source it from multibuild archives if available. 
local our_archive="${MULTIBUILD_DIR}/archives/${archive_fname}" if [ -f "$our_archive" ]; then ln -s $our_archive $out_archive else # Otherwise download it. curl -L $url > $out_archive fi fi # Unpack archive, refreshing contents, echoing dir and file # names. rm_mkdir arch_tmp install_rsync (cd arch_tmp && \ untar ../$out_archive && \ ls -1d * && rsync --delete -ah * ..) } function clean_code { local repo_dir=${1:-$REPO_DIR} local build_commit=${2:-$BUILD_COMMIT} [ -z "$repo_dir" ] && echo "repo_dir not defined" && exit 1 [ -z "$build_commit" ] && echo "build_commit not defined" && exit 1 # The package $repo_dir may be a submodule. git submodules do not # have a .git directory. If $repo_dir is copied around, tools like # Versioneer which require that it be a git repository are unable # to determine the version. Give submodule proper git directory fill_submodule "$repo_dir" (cd $repo_dir \ && git fetch origin \ && git checkout $build_commit \ && git clean -fxd \ && git reset --hard \ && git submodule update --init --recursive) } function build_wheel_cmd { # Builds wheel with named command, puts into $WHEEL_SDIR # # Parameters: # cmd (optional, default "pip_wheel_cmd" # Name of command for building wheel # repo_dir (optional, default $REPO_DIR) # # Depends on # REPO_DIR (or via input argument) # WHEEL_SDIR (optional, default "wheelhouse") # BUILD_DEPENDS (optional, default "") # MANYLINUX_URL (optional, default "") (via pip_opts function) local cmd=${1:-pip_wheel_cmd} local repo_dir=${2:-$REPO_DIR} [ -z "$repo_dir" ] && echo "repo_dir not defined" && exit 1 local wheelhouse=$(abspath ${WHEEL_SDIR:-wheelhouse}) start_spinner if [ -n "$(is_function "pre_build")" ]; then pre_build; fi stop_spinner if [ -n "$BUILD_DEPENDS" ]; then pip install $(pip_opts) $BUILD_DEPENDS fi (cd $repo_dir && $cmd $wheelhouse) repair_wheelhouse $wheelhouse } function pip_wheel_cmd { local abs_wheelhouse=$1 pip wheel $(pip_opts) -w $abs_wheelhouse --no-deps . 
} function bdist_wheel_cmd { # Builds wheel with bdist_wheel, puts into wheelhouse # # It may sometimes be useful to use bdist_wheel for the wheel building # process. For example, versioneer has problems with versions which are # fixed with bdist_wheel: # https://github.com/warner/python-versioneer/issues/121 local abs_wheelhouse=$1 python setup.py bdist_wheel cp dist/*.whl $abs_wheelhouse } function build_pip_wheel { # Standard wheel building command with pip wheel build_wheel_cmd "pip_wheel_cmd" $@ } function build_bdist_wheel { # Wheel building with bdist_wheel. See bdist_wheel_cmd build_wheel_cmd "bdist_wheel_cmd" $@ } function build_wheel { # Set default building method to pip build_pip_wheel $@ } function build_index_wheel { # Builds wheel from some index, usually pypi # # Parameters: # project_spec # requirement to install, e.g. "tornado" or "tornado==4.4.1" # *args # Any other arguments to be passed to pip `install` and `wheel` # commands. # # Depends on # WHEEL_SDIR (optional, default "wheelhouse") # BUILD_DEPENDS (optional, default "") # MANYLINUX_URL (optional, default "") (via pip_opts function) # # You can also override `pip_opts` command to set indices other than pypi local project_spec=$1 [ -z "$project_spec" ] && echo "project_spec not defined" && exit 1 # Discard first argument to pass remainder to pip shift local wheelhouse=$(abspath ${WHEEL_SDIR:-wheelhouse}) start_spinner if [ -n "$(is_function "pre_build")" ]; then pre_build; fi stop_spinner if [ -n "$BUILD_DEPENDS" ]; then pip install $(pip_opts) $@ $BUILD_DEPENDS fi pip wheel $(pip_opts) $@ -w $wheelhouse --no-deps $project_spec repair_wheelhouse $wheelhouse } function pip_opts { [ -n "$MANYLINUX_URL" ] && echo "--find-links $MANYLINUX_URL" } function get_platform { # Report platform as given by uname python -c 'import platform; print(platform.uname()[4])' } function get_distutils_platform { # Report platform as given by distutils get_platform. # This is the platform tag that pip will use. 
python -c "import distutils.util; print(distutils.util.get_platform())" } function install_wheel { # Install test dependencies and built wheel # # Pass any input flags to pip install steps # # Depends on: # WHEEL_SDIR (optional, default "wheelhouse") # TEST_DEPENDS (optional, default "") # MANYLINUX_URL (optional, default "") (via pip_opts function) local wheelhouse=$(abspath ${WHEEL_SDIR:-wheelhouse}) if [ -n "$TEST_DEPENDS" ]; then pip install $(pip_opts) $@ $TEST_DEPENDS fi # Install compatible wheel pip install $(pip_opts) $@ \ $(python $MULTIBUILD_DIR/supported_wheels.py $wheelhouse/*.whl) } function install_run { # Depends on function `run_tests` defined in `config.sh` install_wheel mkdir tmp_for_test (cd tmp_for_test && run_tests) } function fill_submodule { # Restores .git directory to submodule, if necessary # See: # https://stackoverflow.com/questions/41776331/is-there-a-way-to-reconstruct-a-git-directory-for-a-submodule local repo_dir="$1" [ -z "$repo_dir" ] && echo "repo_dir not defined" && exit 1 local git_loc="$repo_dir/.git" # For ordinary submodule, .git is a file. [ -d "$git_loc" ] && return # Need to recreate .git directory for submodule local origin_url=$(cd "$repo_dir" && git config --get remote.origin.url) local repo_copy="$repo_dir-$RANDOM" git clone --recursive "$repo_dir" "$repo_copy" rm -rf "$repo_dir" mv "${repo_copy}" "$repo_dir" (cd "$repo_dir" && git remote set-url origin $origin_url) } PYPY_URL=https://bitbucket.org/pypy/pypy/downloads # As of 2018-04-25, the latest verions of PyPy. 
LATEST_PP_1=1.9 LATEST_PP_2p0=2.0.2 # No minor version numbers for 2.1 LATEST_PP_2p1=2.1 LATEST_PP_2p2=2.2.1 LATEST_PP_2p3=2.3.1 LATEST_PP_2p4=2.4.0 LATEST_PP_2p5=2.5.1 LATEST_PP_2p6=2.6.1 LATEST_PP_2=$LATEST_PP_2p6 LATEST_PP_4p0=4.0.1 LATEST_PP_4=$LATEST_PP_4p0 LATEST_PP_5p0=5.0.1 LATEST_PP_5p1=5.1.1 LATEST_PP_5p3=5.3.1 LATEST_PP_5p4=5.4.1 LATEST_PP_5p6=5.6.0 LATEST_PP_5p7=5.7.1 LATEST_PP_5p8=5.8.0 LATEST_PP_5p9=5.9.0 LATEST_PP_5p10=5.10.1 LATEST_PP_5=$LATEST_PP_5p10 LATEST_PP_6p0=6.0.0 LATEST_PP_6=$LATEST_PP_6p0 function unroll_version { # Convert major or major.minor format to major.minor.micro using the above # values recursively # Parameters: # $prefix : one of LATEST_PP or LATEST_PP3 # $version : major[.minor[.patch]] # Hence: # LATEST_PP 5 -> 5.7.0 # LATEST 2.7 -> 2.7.11 local prefix=$1 local ver=$2 local latest=${prefix}_${ver//./p} if [ -n "${!latest}" ]; then echo $(unroll_version ${prefix} ${!latest}) else echo $ver fi } function fill_pypy_ver { # Convert major or major.minor format to major.minor.micro # Parameters: # $version : major[.minor[.patch]] # Hence: # 5 -> 5.7.0 echo $(unroll_version LATEST_PP $1) } function get_pypy_build_prefix { # Return the file prefix of a PyPy file # Parameters: # $version : pypy2 version number local version=$1 if [[ $version =~ ([0-9]+)\.([0-9]+) ]]; then local major=${BASH_REMATCH[1]} local minor=${BASH_REMATCH[2]} if (( $major > 5 || ($major == 5 && $minor >= 3) )); then echo "pypy2-v" else echo "pypy-" fi else echo "error: expected version number, got $1" 1>&2 exit 1 fi } retry () { # Retry command (with arguments) up to 5 times # https://gist.github.com/fungusakafungus/1026804 local retry_max=5 local count=$retry_max while [ $count -gt 0 ]; do "$@" && break count=$(($count - 1)) sleep 1 done [ $count -eq 0 ] && { echo "Retry failed [$retry_max]: $@" >&2 return 1 } return 0 } cython-blis-0.9.1/bin/travis_setup.sh000077500000000000000000000006701427272030600176210ustar00rootroot00000000000000#!/usr/bin/env bash set 
-e if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo systemctl disable apt-daily.timer sudo killall apt.systemd.daily sleep 5 sudo -E apt-add-repository -y "ppa:ubuntu-toolchain-r/test" sleep 5 sudo apt-get update -y sleep 5 sudo apt-get install -y gcc-6 binutils clang sed -i 's/"gcc"/"gcc-6"/' blis/_src/make/linux-x86_64.jsonl export CC="gcc-6" fi #if [ "$TRAVIS_OS_NAME" = "osx" ]; then #fi cython-blis-0.9.1/bin/update-vendored-source000077500000000000000000000012011427272030600210330ustar00rootroot00000000000000#!/usr/bin/env bash set -e rm -rf blis/_new_src mkdir blis/_new_src cp -r flame-blis/config blis/_new_src/config cp -r flame-blis/frame blis/_new_src/frame cp -r flame-blis/kernels blis/_new_src/kernels cp -r flame-blis/ref_kernels blis/_new_src/ref_kernels mkdir blis/_new_src/include mkdir blis/_new_src/include/darwin-x86_64_no_skx mkdir blis/_new_src/include/linux-x86_64_no_skx mkdir blis/_new_src/include/windows-x86_64_no_skx mkdir blis/_new_src/include/darwin-generic mkdir blis/_new_src/include/linux-generic mkdir blis/_new_src/include/windows-generic mkdir blis/_new_src/make mv blis/_src _old_src mv blis/_new_src blis/_src cython-blis-0.9.1/blis/000077500000000000000000000000001427272030600147105ustar00rootroot00000000000000cython-blis-0.9.1/blis/__init__.pxd000066400000000000000000000000001427272030600171520ustar00rootroot00000000000000cython-blis-0.9.1/blis/__init__.py000066400000000000000000000001171427272030600170200ustar00rootroot00000000000000# Copyright ExplsionAI GmbH, released under BSD. 
from .cy import init init() cython-blis-0.9.1/blis/_src/000077500000000000000000000000001427272030600156365ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/000077500000000000000000000000001427272030600171035ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/README.md000066400000000000000000000014671427272030600203720ustar00rootroot00000000000000 For more information on sub-configurations and configuration families in BLIS, please read the Configuration Guide, which can be viewed in markdown-rendered form [from the BLIS wiki page](https://github.com/flame/blis/wiki/). If you don't have time, or are impatient, take a look at the `config_registry` file in the top-level directory of the BLIS distribution. It contains a grammar-like mapping of configuration names, or families, to sub-configurations, which may be other families. Keep in mind that the `/` notation: ``` : / ``` means that the kernel set associated with `` should be made available to the configuration `` if `` is targeted at configure-time. (Some configurations borrow kernels from other configurations, and this is how we specify that requirement.) cython-blis-0.9.1/blis/_src/config/a64fx/000077500000000000000000000000001427272030600200335ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/a64fx/bli_a64fx_sector_cache.h000066400000000000000000000104571427272030600244730ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Forschunszentrum Juelich Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // A64FX: set up cache sizes // // Reference: A64FX (TM) specification Fujitsu HPC Extension // Link: https://github.com/fujitsu/A64FX/blob/master/doc/A64FX_Specification_HPC_Extension_v1_EN.pdf // // 63:15 | 14:12 | 11 | 10:08 | 07 | 06:04 | 03 | 02:00 | // RES0 | l1_sec3_max | RES0 | l1_sec2_max | RES0 | l1_sec1_max | RES0 | l1_sec0_max | // // the bits set number of maximum sectors from 0-7 // 000 - 0 // 001 - 1 // 010 - 2 // 011 - 3 // 100 - 4 // 101 - 5 // 110 - 6 // 111 - 7 // // For L1 we want to maximize the number of sectors for B // Configuration 1: 1 sector for C (sector 3) // 1 sector for A (sector 1) // 6 sectors for B (sector 2) // 0 sectors for the rest (sector 0) // // 16b bitfield conf. 
1: 0b0 001 0 110 0 001 0 000 // // Configuration 2: 1 sector for C (sector 3) // 1 sector for A (sector 1) // 5 sectors for B (sector 2) // 1 sectors for the rest (sector 0) // // 16b bitfield conf. 2: 0b0 001 0 101 0 001 0 001 // // accessing the control register: // // MRS , S3_3_C11_C8_2 // MSR S3_3_C11_C8_2, // // TODO: First tests showed no change in performance, a deeper investigation // is necessary #define A64FX_SETUP_SECTOR_CACHE_SIZES(config_bitfield)\ {\ uint64_t sector_cache_config = config_bitfield;\ __asm__ volatile(\ "msr s3_3_c11_c8_2,%[sector_cache_config]"\ :\ : [sector_cache_config] "r" (sector_cache_config)\ :\ );\ } #define A64FX_SETUP_SECTOR_CACHE_SIZES_L2(config_bitfield)\ {\ uint64_t sector_cache_config = config_bitfield;\ __asm__ volatile(\ "msr s3_3_c15_c8_2,%[sector_cache_config]"\ :\ : [sector_cache_config] "r" (sector_cache_config)\ :\ );\ } #define A64FX_SET_CACHE_SECTOR(areg, tag, sparereg)\ " mov "#sparereg", "#tag" \n\t"\ " lsl "#sparereg", "#sparereg", 56 \n\t"\ " orr "#areg", "#areg", "#sparereg" \n\t" #define A64FX_READ_SECTOR_CACHE_SIZES(output_uint64)\ __asm__ volatile(\ "mrs %["#output_uint64"],s3_3_c11_c8_2"\ : [output_uint64] "=r" (output_uint64)\ : \ :\ ); #define A64FX_SCC(sec0,sec1,sec2,sec3)\ (uint64_t)((sec0 & 0x7LU) | ((sec1 & 0x7LU) << 4) | ((sec2 & 0x7LU) << 8) | ((sec3 & 0x7LU) << 12)) #define A64FX_SCC_L2(sec02,sec13)\ (uint64_t)((sec02 & 0x1FLU) | ((sec13 & 0x1FLU) << 8)) cython-blis-0.9.1/blis/_src/config/a64fx/bli_cntx_init_a64fx.c000066400000000000000000000132541427272030600240410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "bli_a64fx_sector_cache.h" void bli_cntx_init_a64fx( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_a64fx_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 4, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE, cntx ); // Set SVE-512 packing routine. 
bli_cntx_set_packm_kers ( 2, BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, // 12xk is not used and disabled for GCC 8-9 compatibility. // BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_int_12xk, BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, 16, 8 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 10, 10, 10, 10 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 192, 96 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 2048, 2048, 1536, 1536 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 23040, 26880, 11520, 11760 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); #if 0 // Initialize sup thresholds with architecture-appropriate values. // s d c z bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 ); bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 ); bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 ); // Initialize the context with the sup thresholds. bli_cntx_set_l3_sup_thresh ( 3, BLIS_MT, &thresh[ BLIS_MT ], BLIS_NT, &thresh[ BLIS_NT ], BLIS_KT, &thresh[ BLIS_KT ], cntx ); // Update the context with optimized small/unpacked gemm kernels. bli_cntx_set_l3_sup_kers ( 4, BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, cntx ); // Initialize level-3 sup blocksize objects with architecture-specific // values. 
// s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes for small/unpacked level-3 problems. bli_cntx_set_l3_sup_blkszs ( 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_KC, &blkszs[ BLIS_KC ], BLIS_MC, &blkszs[ BLIS_MC ], BLIS_NR, &blkszs[ BLIS_NR ], BLIS_MR, &blkszs[ BLIS_MR ], cntx ); #endif // Set A64FX cache sector sizes for each PE/CMG // SC Fugaku might disable users' setting cache sizes. #if !defined(CACHE_SECTOR_SIZE_READONLY) #pragma omp parallel { A64FX_SETUP_SECTOR_CACHE_SIZES(A64FX_SCC(0,1,3,0)) A64FX_SETUP_SECTOR_CACHE_SIZES_L2(A64FX_SCC_L2(9,28)) } #endif } cython-blis-0.9.1/blis/_src/config/a64fx/bli_family_a64fx.h000066400000000000000000000042031427272030600233220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 256 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 // SVE-specific configs. #define N_L1_SVE_DEFAULT 64 #define W_L1_SVE_DEFAULT 4 #define C_L1_SVE_DEFAULT 256 #define N_L2_SVE_DEFAULT 2048 #define W_L2_SVE_DEFAULT 16 #define C_L2_SVE_DEFAULT 256 #define N_L3_SVE_DEFAULT 8192 #define W_L3_SVE_DEFAULT 16 #define C_L3_SVE_DEFAULT 256 //#endif cython-blis-0.9.1/blis/_src/config/a64fx/make_defs.mk000066400000000000000000000055521427272030600223110ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. 
# - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := a64fx #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := -D_GNU_SOURCE -D_A64FX CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O3 -ftree-vectorize -march=armv8-a+sve endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) CKVECFLAGS := # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/amd64/000077500000000000000000000000001427272030600200165ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/amd64/bli_family_amd64.h000066400000000000000000000033321427272030600232720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_FAMILY_AMD64_H #define BLIS_FAMILY_AMD64_H #endif cython-blis-0.9.1/blis/_src/config/amd64/make_defs.mk000066400000000000000000000050271427272030600222710ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := amd64 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Setting for reference and optimized kernels are taken from individual # subconfiguration makefile fragments in this family. # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/amd64_legacy/000077500000000000000000000000001427272030600213425ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/amd64_legacy/bli_family_amd64_legacy.h000066400000000000000000000035131427272030600261430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_FAMILY_AMD64_LEGACY_H #define BLIS_FAMILY_AMD64_LEGACY_H // Placeholder for bundle configuration. #endif cython-blis-0.9.1/blis/_src/config/amd64_legacy/make_defs.mk000066400000000000000000000051331427272030600236130ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := amd64_legacy #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. 
CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Setting for reference and optimized kernels are taken from individual # subconfiguration makefile fragments in this family. # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/arm32/000077500000000000000000000000001427272030600200275ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/arm32/bli_family_arm32.h000066400000000000000000000034721427272030600233210ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 //#endif cython-blis-0.9.1/blis/_src/config/arm32/make_defs.mk000066400000000000000000000056661427272030600223130ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := arm32 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -mfloat-abi=hard -mfpu=neon CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -march=armv7-a else $(error gcc is required for this configuration.) endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. 
$(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/arm64/000077500000000000000000000000001427272030600200345ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/arm64/bli_family_arm64.h000066400000000000000000000041741427272030600233330ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 // SVE-specific configs. #define N_L1_SVE_DEFAULT 64 #define W_L1_SVE_DEFAULT 4 #define C_L1_SVE_DEFAULT 256 #define N_L2_SVE_DEFAULT 2048 #define W_L2_SVE_DEFAULT 16 #define C_L2_SVE_DEFAULT 256 #define N_L3_SVE_DEFAULT 8192 #define W_L3_SVE_DEFAULT 16 #define C_L3_SVE_DEFAULT 256 //#endif cython-blis-0.9.1/blis/_src/config/arm64/make_defs.mk000066400000000000000000000057671427272030600223220ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := arm64 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := -D_GNU_SOURCE CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -march=armv8-a else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -march=armv8-a else $(error gcc or clang is required for this configuration.) endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. 
$(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/armsve/000077500000000000000000000000001427272030600204005ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/armsve/bli_cntx_init_armsve.c000066400000000000000000000140361427272030600247520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #include #ifndef HWCAP_SVE #define HWCAP_SVE (1 << 22) #endif void bli_cntx_init_armsve( cntx_t* cntx ) { if (!(getauxval( AT_HWCAP ) & HWCAP_SVE)) return; blksz_t blkszs[ BLIS_NUM_BLKSZS ]; #if 0 blksz_t thresh[ BLIS_NUM_THRESH ]; #endif // Set default kernel blocksizes and functions. bli_cntx_init_armsve_ref( cntx ); // ------------------------------------------------------------------------- // Block size. dim_t m_r_s, n_r_s, k_c_s, m_c_s, n_c_s; dim_t m_r_d, n_r_d, k_c_d, m_c_d, n_c_d; dim_t m_r_c, n_r_c, k_c_c, m_c_c, n_c_c; dim_t m_r_z, n_r_z, k_c_z, m_c_z, n_c_z; bli_s_blksz_armsve(&m_r_s, &n_r_s, &k_c_s, &m_c_s, &n_c_s); bli_d_blksz_armsve(&m_r_d, &n_r_d, &k_c_d, &m_c_d, &n_c_d); bli_c_blksz_armsve(&m_r_c, &n_r_c, &k_c_c, &m_c_c, &n_c_c); bli_z_blksz_armsve(&m_r_z, &n_r_z, &k_c_z, &m_c_z, &n_c_z); // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 4, // These are vector-length agnostic kernels. Yet knowing mr is required at runtime. BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE, cntx ); // Set VL-specific packing routines if applicable. if (m_r_d==16) bli_cntx_set_packm_kers ( 2, BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, cntx ); else if (m_r_d==8) bli_cntx_set_packm_kers ( 1, BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. 
// s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], m_r_s, m_r_d, m_r_c, m_r_z ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], n_r_s, n_r_d, n_r_c, n_r_z ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], m_c_s, m_c_d, m_c_c, m_c_z ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], k_c_s, k_c_d, k_c_c, k_c_z ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], n_c_s, n_c_d, n_c_c, n_c_z ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); #if 0 // Initialize sup thresholds with architecture-appropriate values. // s d c z bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 ); bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 ); bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 ); // Initialize the context with the sup thresholds. bli_cntx_set_l3_sup_thresh ( 3, BLIS_MT, &thresh[ BLIS_MT ], BLIS_NT, &thresh[ BLIS_NT ], BLIS_KT, &thresh[ BLIS_KT ], cntx ); // Update the context with optimized small/unpacked gemm kernels. bli_cntx_set_l3_sup_kers ( 4, BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, cntx ); // Initialize level-3 sup blocksize objects with architecture-specific // values. 
// s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, n_r_d, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, m_r_d, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes for small/unpacked level-3 problems. bli_cntx_set_l3_sup_blkszs ( 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_KC, &blkszs[ BLIS_KC ], BLIS_MC, &blkszs[ BLIS_MC ], BLIS_NR, &blkszs[ BLIS_NR ], BLIS_MR, &blkszs[ BLIS_MR ], cntx ); #endif } cython-blis-0.9.1/blis/_src/config/armsve/bli_family_armsve.h000066400000000000000000000042031427272030600242340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 256 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 // SVE-specific configs. #define N_L1_SVE_DEFAULT 64 #define W_L1_SVE_DEFAULT 4 #define C_L1_SVE_DEFAULT 256 #define N_L2_SVE_DEFAULT 2048 #define W_L2_SVE_DEFAULT 16 #define C_L2_SVE_DEFAULT 256 #define N_L3_SVE_DEFAULT 8192 #define W_L3_SVE_DEFAULT 16 #define C_L3_SVE_DEFAULT 256 //#endif cython-blis-0.9.1/blis/_src/config/armsve/make_defs.mk000066400000000000000000000055421427272030600226550ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := armsve #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := -D_GNU_SOURCE CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O3 -ftree-vectorize -march=armv8-a+sve endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) CKVECFLAGS := # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. 
$(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/bgq/000077500000000000000000000000001427272030600176545ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/bgq/bli_cntx_init_bgq.c000066400000000000000000000060311427272030600234760ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_cntx_init_bgq( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_bgq_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 2, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bgq_int_8x8, FALSE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4, FALSE, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 8, 0, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 8, 0, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 1024, 0, 768 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 2048, 0, 1536 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 10240, 0, 10240 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/bgq/bli_family_bgq.h000066400000000000000000000062131427272030600227670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #undef restrict #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MC_S 1024 #define BLIS_DEFAULT_KC_S 2048 #define BLIS_DEFAULT_NC_S 8192 // 1 MPI RANK CASE: #define BLIS_DGEMM_UKERNEL bli_dgemm_int_8x8 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DEFAULT_MC_D 1024 #define BLIS_DEFAULT_KC_D 2048 #define BLIS_DEFAULT_NC_D 10240 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_DEFAULT_MC_C 1024 #define BLIS_DEFAULT_KC_C 2048 #define BLIS_DEFAULT_NC_C 8192 #define BLIS_ZGEMM_UKERNEL bli_zgemm_int_8x8 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_DEFAULT_MC_Z 768 #define BLIS_DEFAULT_KC_Z 1536 #define BLIS_DEFAULT_NC_Z 10240 // -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ #define BLIS_DEFAULT_AF_D 8 #define BLIS_DAXPYF_KERNEL bli_daxpyf_opt_var1 // -- LEVEL-1V KERNEL 
DEFINITIONS ---------------------------------------------- #define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 #define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 #endif //#endif cython-blis-0.9.1/blis/_src/config/bgq/make_defs.mk000066400000000000000000000071361427272030600221320ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := bgq #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # #ifeq ($(CC),) #CC := /bgsys/drivers/ppcfloor/comm/gcc.legacy/bin/mpixlc_r #CC_VENDOR := ibm #endif # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := -I/bgsys/drivers/ppcfloor -I/bgsys/drivers/ppcfloor/spi/include/kernel/cnk ifeq ($(CC_VENDOR),ibm) CMISCFLAGS := -qthreaded -qsmp=omp -qasm=gcc -qkeyword=asm # -qreport -qsource -qlistopt -qlist else ifeq ($(CC_VENDOR),clang) CMISCFLAGS := -fopenmp else $(error xlc or bgclang is required for this configuration.) endif CPICFLAGS := CWARNFLAGS := -w ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),ibm) CKVECFLAGS := -qarch=qp -qtune=qp -qsimd=auto -qhot=level=1 -qprefetch -qunroll=yes -qnoipa endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Override the default value for LDFLAGS. ifeq ($(CC_VENDOR),ibm) LDFLAGS := -L/bgsys/drivers/ppcfloor/spi/lib -lSPI -lSPI_cnk -qthreaded -qsmp=omp else ifeq ($(CC_VENDOR),clang) LDFLAGS := -L/bgsys/drivers/ppcfloor/spi/lib -lSPI -lSPI_cnk -fopenmp endif # Store all of the variables here to new variables containing the # configuration name. 
$(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/bulldozer/000077500000000000000000000000001427272030600211055ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/bulldozer/bli_cntx_init_bulldozer.c000066400000000000000000000063171427272030600261670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_cntx_init_bulldozer( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_bulldozer_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 4, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_bulldozer_asm_8x8_fma4, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bulldozer_asm_4x6_fma4, FALSE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4, FALSE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4, FALSE, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 4, 8, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 8, 6, 4, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 128, 1080, 96, 64 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 384, 120, 256, 192 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 8400, 4096, 4096 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/bulldozer/bli_family_bulldozer.h000066400000000000000000000054231427272030600254530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8_fma4 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x6_fma4 #define BLIS_DEFAULT_MC_D 1080 #define BLIS_DEFAULT_KC_D 120 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 6 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4_fma4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4_fma4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif cython-blis-0.9.1/blis/_src/config/bulldozer/make_defs.mk000066400000000000000000000061261427272030600233610ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := bulldozer #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mfpmath=sse -mavx -mfma4 -march=bdver1 -mno-tbm -mno-xop -mno-lwp else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mfpmath=sse -mavx -mfma4 -march=bdver1 -mno-tbm -mno-xop -mno-lwp else $(error gcc or clang are required for this configuration.) endif endif # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/cortexa15/000077500000000000000000000000001427272030600207165ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/cortexa15/bli_cntx_init_cortexa15.c000066400000000000000000000066461427272030600256160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_cntx_init_cortexa15( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_cortexa15_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 2, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z #if 1 bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 336, 176, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 528, 368, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, -1, -1 ); #else bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 4, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 4, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 176, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 368, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4096, -1, -1 ); #endif // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/cortexa15/bli_family_cortexa15.h000066400000000000000000000055721427272030600251020ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_armv7a_int_4x4 #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MC_S 336 #define BLIS_DEFAULT_KC_S 528 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DGEMM_UKERNEL bli_dgemm_armv7a_int_4x4 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 4 #define BLIS_DEFAULT_MC_D 176 #define BLIS_DEFAULT_KC_D 368 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_DEFAULT_MC_C 64 #define BLIS_DEFAULT_KC_C 128 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_Z 8 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 128 #define BLIS_DEFAULT_NC_Z 4096 #endif //#endif cython-blis-0.9.1/blis/_src/config/cortexa15/make_defs.mk000066400000000000000000000056741427272030600232010ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := cortexa15 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -mfloat-abi=hard -mfpu=neon CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mcpu=cortex-a15 else $(error gcc is required for this configuration.) endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. 
$(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/cortexa53/000077500000000000000000000000001427272030600207205ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/cortexa53/bli_cntx_init_cortexa53.c000066400000000000000000000060551427272030600256140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_cntx_init_cortexa53( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_cortexa53_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 2, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 6, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 120, 120, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 240, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 3072, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/cortexa53/bli_family_cortexa53.h000066400000000000000000000034101427272030600250730ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 cython-blis-0.9.1/blis/_src/config/cortexa53/make_defs.mk000066400000000000000000000060451427272030600231740ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. 
# - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := cortexa53 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := -D_GNU_SOURCE CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 -mcpu=cortex-a53 endif # Flags specific to optimized kernels. 
CKOPTFLAGS := $(COPTFLAGS) -O3 -ftree-vectorize ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mcpu=cortex-a53 else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mcpu=cortex-a53 else $(error gcc or clang is required for this configuration.) endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/cortexa57/000077500000000000000000000000001427272030600207245ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/cortexa57/bli_cntx_init_cortexa57.c000066400000000000000000000060551427272030600256240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_cntx_init_cortexa57( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_cortexa57_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 2, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 6, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 120, 120, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 240, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 3072, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/cortexa57/bli_family_cortexa57.h000066400000000000000000000060241427272030600251070ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x12 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 12 #define BLIS_DEFAULT_MC_S 120 //1536 //336 //416 // 1280 //160 // 160 // 160 //2048 //336 #define BLIS_DEFAULT_KC_S 640 //1536 //336 //704 //1280 //672 //528 // 856 //2048 //528 #define BLIS_DEFAULT_NC_S 3072 #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_6x8 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DEFAULT_MC_D 120 //1536 //160 //80 //176 #define BLIS_DEFAULT_KC_D 240 //1536 //304 //336 //368 #define BLIS_DEFAULT_NC_D 3072 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_DEFAULT_MC_C 64 #define BLIS_DEFAULT_KC_C 128 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_Z 8 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 128 #define BLIS_DEFAULT_NC_Z 4096 #endif //#endif cython-blis-0.9.1/blis/_src/config/cortexa57/make_defs.mk000066400000000000000000000060411427272030600231740ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := cortexa57 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := -D_GNU_SOURCE CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 -mcpu=cortex-a57 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 -ftree-vectorize ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mcpu=cortex-a57 else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mcpu=cortex-a57 else $(error gcc or clang is required for this configuration.) endif endif # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/cortexa9/000077500000000000000000000000001427272030600206415ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/cortexa9/bli_cntx_init_cortexa9.c000066400000000000000000000060511427272030600254520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_cntx_init_cortexa9( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_cortexa9_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 2, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 432, 176, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 352, 368, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 0, 0 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/cortexa9/bli_family_cortexa9.h000066400000000000000000000055661427272030600247530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_armv7a_int_4x4 #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MC_S 432 #define BLIS_DEFAULT_KC_S 352 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DGEMM_UKERNEL bli_dgemm_armv7a_int_4x4 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 4 #define BLIS_DEFAULT_MC_D 176 #define BLIS_DEFAULT_KC_D 368 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_DEFAULT_MC_C 64 #define BLIS_DEFAULT_KC_C 128 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_Z 8 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 128 #define BLIS_DEFAULT_NC_Z 4096 #endif //#endif cython-blis-0.9.1/blis/_src/config/cortexa9/make_defs.mk000066400000000000000000000056721427272030600231220ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := cortexa9 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -mfloat-abi=hard -mfpu=neon CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mcpu=cortex-a9 else $(error gcc is required for this configuration.) endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. 
$(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/excavator/000077500000000000000000000000001427272030600210775ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/excavator/bli_cntx_init_excavator.c000066400000000000000000000063031427272030600261460ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_cntx_init_excavator( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_excavator_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 4, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 16, 8, 4, 2 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 3, 3, 2, 2 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 528, 264, 264, 100 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 320 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8400, 8400, 8400, 8400 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/excavator/bli_family_excavator.h000066400000000000000000000057351427272030600254450ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 3 #define BLIS_DEFAULT_MC_S 528 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_DEFAULT_MC_D 264 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_DEFAULT_MC_C 264 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #define BLIS_DEFAULT_MC_Z 100 #define BLIS_DEFAULT_KC_Z 320 #define BLIS_DEFAULT_NC_Z 8400 #endif //#endif cython-blis-0.9.1/blis/_src/config/excavator/make_defs.mk000066400000000000000000000061501427272030600233500ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := excavator #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp else $(error gcc or clang are required for this configuration.) endif endif # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/firestorm/000077500000000000000000000000001427272030600211155ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/firestorm/bli_cntx_init_firestorm.c000066400000000000000000000127621427272030600262100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_cntx_init_firestorm( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_firestorm_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 2, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, cntx ); // Update the context with optimized packm kernels. bli_cntx_set_packm_kers ( 4, BLIS_PACKM_8XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk, BLIS_PACKM_12XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk, BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 6, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 120, 252, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 3072, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 8192, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); // ------------------------------------------------------------------------- // Initialize sup thresholds with architecture-appropriate values. // s d c z bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 99, -1, -1 ); bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 99, -1, -1 ); bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 99, -1, -1 ); // Initialize the context with the sup thresholds. bli_cntx_set_l3_sup_thresh ( 3, BLIS_MT, &thresh[ BLIS_MT ], BLIS_NT, &thresh[ BLIS_NT ], BLIS_KT, &thresh[ BLIS_KT ], cntx ); // Update the context with optimized small/unpacked gemm kernels. bli_cntx_set_l3_sup_kers ( 8, BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m, TRUE, BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n, TRUE, BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, cntx ); // Initialize level-3 sup blocksize objects with architecture-specific // values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 6, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 240, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1024, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 3072, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes for small/unpacked level-3 problems. 
bli_cntx_set_l3_sup_blkszs ( 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_KC, &blkszs[ BLIS_KC ], BLIS_MC, &blkszs[ BLIS_MC ], BLIS_NR, &blkszs[ BLIS_NR ], BLIS_MR, &blkszs[ BLIS_MR ], cntx ); } cython-blis-0.9.1/blis/_src/config/firestorm/bli_family_firestorm.h000066400000000000000000000060241427272030600254710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x12 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 12 #define BLIS_DEFAULT_MC_S 120 //1536 //336 //416 // 1280 //160 // 160 // 160 //2048 //336 #define BLIS_DEFAULT_KC_S 640 //1536 //336 //704 //1280 //672 //528 // 856 //2048 //528 #define BLIS_DEFAULT_NC_S 3072 #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_6x8 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DEFAULT_MC_D 120 //1536 //160 //80 //176 #define BLIS_DEFAULT_KC_D 240 //1536 //304 //336 //368 #define BLIS_DEFAULT_NC_D 3072 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_DEFAULT_MC_C 64 #define BLIS_DEFAULT_KC_C 128 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_Z 8 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 128 #define BLIS_DEFAULT_NC_Z 4096 #endif //#endif cython-blis-0.9.1/blis/_src/config/firestorm/make_defs.mk000066400000000000000000000055701427272030600233730ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := firestorm #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := -D_GNU_SOURCE CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 -march=armv8-a endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 -ftree-vectorize CKVECFLAGS := -march=armv8-a # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/generic/000077500000000000000000000000001427272030600205175ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/generic/bli_cntx_init_generic.c000066400000000000000000000034421427272030600252070ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_cntx_init_generic( cntx_t* cntx ) { // Set default kernel blocksizes and functions. bli_cntx_init_generic_ref( cntx ); } cython-blis-0.9.1/blis/_src/config/generic/bli_family_generic.h000066400000000000000000000033071427272030600244760ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif cython-blis-0.9.1/blis/_src/config/generic/make_defs.mk000066400000000000000000000060101427272030600227630ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := generic #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. 
$(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/haswell/000077500000000000000000000000001427272030600205425ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/haswell/bli_cntx_init_haswell.c000066400000000000000000000232141427272030600252540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" //GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) void bli_cntx_init_haswell( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_haswell_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 8, // gemm #if 1 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, #else BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6, FALSE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3, FALSE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3, FALSE, #endif // gemmtrsm_l BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, // gemmtrsm_u BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, cntx ); #if 1 // Update the context with optimized packm kernels. bli_cntx_set_packm_kers ( 8, BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, cntx ); #endif // Update the context with optimized level-1f kernels. 
bli_cntx_set_l1f_kers ( 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, cntx ); // Update the context with optimized level-1v kernels. bli_cntx_set_l1v_kers ( 10, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int, #else BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, #endif // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, // dotxv BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, // scalv #if 0 BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int, #else BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif cntx ); // Initialize level-3 blocksize objects with architecture-specific values. 
// s d c z #if 1 bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); //bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1008, 1008, 1008, 1008 ); //bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 72, 36 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 75, 192 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); #else bli_blksz_init_easy( &blkszs[ BLIS_MR ], 16, 8, 8, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 6, 6, 3, 3 ); //bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1024, 1024, 1024, 1024 ); //bli_blksz_init_easy( &blkszs[ BLIS_MC ], 112, 64, 56, 32 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 112, 72, 56, 44 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); #endif bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( BLIS_NAT, 7, // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, cntx ); // ------------------------------------------------------------------------- // Initialize sup thresholds with architecture-appropriate values. // s d c z bli_blksz_init_easy( &thresh[ BLIS_MT ], 201, 201, -1, -1 ); bli_blksz_init_easy( &thresh[ BLIS_NT ], 201, 201, -1, -1 ); bli_blksz_init_easy( &thresh[ BLIS_KT ], 201, 201, -1, -1 ); // Initialize the context with the sup thresholds. bli_cntx_set_l3_sup_thresh ( 3, BLIS_MT, &thresh[ BLIS_MT ], BLIS_NT, &thresh[ BLIS_NT ], BLIS_KT, &thresh[ BLIS_KT ], cntx ); #if 0 // Initialize the context with the sup handlers. 
bli_cntx_set_l3_sup_handlers ( 1, BLIS_GEMM, bli_gemmsup_ref, cntx ); #endif // Update the context with optimized small/unpacked gemm kernels. bli_cntx_set_l3_sup_kers ( 16, //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, cntx ); // Initialize level-3 sup blocksize objects with architecture-specific // values. // s d c z bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1, 9, 9, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes for small/unpacked level-3 problems. 
bli_cntx_set_l3_sup_blkszs ( 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_KC, &blkszs[ BLIS_KC ], BLIS_MC, &blkszs[ BLIS_MC ], BLIS_NR, &blkszs[ BLIS_NR ], BLIS_MR, &blkszs[ BLIS_MR ], cntx ); } cython-blis-0.9.1/blis/_src/config/haswell/bli_family_haswell.h000066400000000000000000000114211427272030600245400ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- // -- sgemm micro-kernel -- #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24 #define BLIS_DEFAULT_MC_S 256 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 24 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 6 #define BLIS_DEFAULT_NR_S 16 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 6 #endif // -- dgemm micro-kernel -- #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12 #define BLIS_DEFAULT_MC_D 152 #define BLIS_DEFAULT_KC_D 160 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 12 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 6 #endif // -- cgemm micro-kernel -- #if 1 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 3 #define BLIS_DEFAULT_NR_C 8 #define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x3 #define BLIS_DEFAULT_MC_C 144 #define 
BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 3 #endif // -- zgemm micro-kernel -- #if 1 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 3 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x3 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 3 #endif #endif //#endif cython-blis-0.9.1/blis/_src/config/haswell/make_defs.mk000066400000000000000000000066321427272030600230200ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := haswell #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. # NOTE: The -fomit-frame-pointer option is needed for some kernels because # they make explicit use of the rbp register. CKOPTFLAGS := $(COPTFLAGS) -O3 -fomit-frame-pointer ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell ifeq ($(GCC_OT_4_9_0),yes) # If gcc is older than 4.9.0, we must use a different label for -march. CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2 endif else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := -xCORE-AVX2 else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/intel64/000077500000000000000000000000001427272030600203705ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/intel64/bli_family_intel64.h000066400000000000000000000033061427272030600242170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif cython-blis-0.9.1/blis/_src/config/intel64/make_defs.mk000066400000000000000000000061241427272030600226420ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := intel64 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2 else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := -xSSSE3 else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2 else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. 
$(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/intel64_no_skx/000077500000000000000000000000001427272030600217515ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/intel64_no_skx/bli_family_intel64.h000066400000000000000000000033061427272030600256000ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif cython-blis-0.9.1/blis/_src/config/intel64_no_skx/bli_family_intel64_no_skx.h000066400000000000000000000033061427272030600271610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif cython-blis-0.9.1/blis/_src/config/intel64_no_skx/make_defs.mk000066400000000000000000000056641427272030600242330ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. 
THIS_CONFIG := intel64_no_skx #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O3 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2 else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := -xSSSE3 else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2 else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) else CRVECFLAGS := $(CKVECFLAGS) endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/knc/000077500000000000000000000000001427272030600176565ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/knc/bli_cntx_init_knc.c000066400000000000000000000061501427272030600235040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_cntx_init_knc( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_knc_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 1, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8, TRUE, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 30, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 8, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 120, 0, 0, 0, 160, 0, 0 ); bli_blksz_init ( &blkszs[ BLIS_KC ], 0, 240, 0, 0, 0, 300, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 14400, 0, 0 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/knc/bli_family_knc.h000066400000000000000000000067241427272030600230020ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- THREADING PARAMETERS ----------------------------------------------------- #define BLIS_TREE_BARRIER #define BLIS_TREE_BARRIER_ARITY 4 // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 64 #define BLIS_SIMD_MAX_SIZE 64 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_30x16 #define BLIS_DEFAULT_MR_S 30 #define BLIS_DEFAULT_NR_S 16 #define BLIS_DEFAULT_MC_S 240 #define BLIS_DEFAULT_KC_S 240 #define BLIS_DEFAULT_NC_S 14400 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_30x8 #define BLIS_DEFAULT_MR_D 30 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 240 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) #define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) #define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0) #define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) #define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) #define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0) #define BLIS_PACKDIM_MR_S (BLIS_DEFAULT_MR_S + 2) //#define BLIS_PACKDIM_NR_S (BLIS_DEFAULT_NR_S + ...) #define BLIS_PACKDIM_MR_D (BLIS_DEFAULT_MR_D + 2) //#define BLIS_PACKDIM_NR_D (BLIS_DEFAULT_NR_D + ...) #endif //#endif cython-blis-0.9.1/blis/_src/config/knc/make_defs.mk000066400000000000000000000061011427272030600221230ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. 
# # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := knc #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. 
CPPROCFLAGS := CMISCFLAGS := -mmic -fasm-blocks CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),icc) CKVECFLAGS := else $(error icc is required for this configuration.) endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Override the default value for LDFLAGS. LDFLAGS := -mmic # Never use libm with Intel compilers. ifneq ($(CC_VENDOR),icc) LDFLAGS += $(LIBM) endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/knl/000077500000000000000000000000001427272030600176675ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/knl/bli_cntx_init_knl.c000066400000000000000000000116701427272030600235310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

// Initialize the context for the knl configuration. The context is first
// filled with the reference defaults and then overridden, in order, with:
// gemm micro-kernels, packm kernels, level-1f kernels, level-1v kernels,
// and the level-3 / level-1f blocksizes.
//
// In every bli_cntx_set_*() call below the leading integer equals the number
// of entries listed after it; keep the two in sync when adding or removing
// an entry (including entries selected by the #if/#else branches).
void bli_cntx_init_knl( cntx_t* cntx )
{
	blksz_t blkszs[ BLIS_NUM_BLKSZS ];

	// Set default kernel blocksizes and functions.
	bli_cntx_init_knl_ref( cntx );

	// -------------------------------------------------------------------------

	// Update the context with optimized native gemm micro-kernels and
	// their storage preferences.
	bli_cntx_set_l3_nat_ukrs
	(
	  2,
	  BLIS_GEMM_UKR, BLIS_FLOAT,  bli_sgemm_knl_asm_24x16, FALSE,
	  BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8,  FALSE,
	  cntx
	);

	// Update the context with optimized packm kernels.
	// Only double-precision packm kernels are registered here.
	bli_cntx_set_packm_kers
	(
	  2,
	  BLIS_PACKM_8XK_KER,  BLIS_DOUBLE, bli_dpackm_knl_asm_8xk,
	  BLIS_PACKM_24XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk,
	  cntx
	);

	// Update the context with optimized level-1f kernels.
	// Note that these are the kernels named *_zen_*, not knl-specific ones.
	bli_cntx_set_l1f_kers
	(
	  4,
	  // axpyf
	  BLIS_AXPYF_KER, BLIS_FLOAT,  bli_saxpyf_zen_int_8,
	  BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8,
	  // dotxf
	  BLIS_DOTXF_KER, BLIS_FLOAT,  bli_sdotxf_zen_int_8,
	  BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
	  cntx
	);

	// Update the context with optimized level-1v kernels.
	// Each preprocessor branch below contributes exactly two entries, so the
	// count of 10 holds for the branches currently enabled.
	bli_cntx_set_l1v_kers
	(
	  10,
#if 1
	  // amaxv
	  BLIS_AMAXV_KER, BLIS_FLOAT,  bli_samaxv_zen_int,
	  BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
#endif
	  // axpyv
#if 0
	  BLIS_AXPYV_KER, BLIS_FLOAT,  bli_saxpyv_zen_int,
	  BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int,
#else
	  // Active branch: the *_int10 variants.
	  BLIS_AXPYV_KER, BLIS_FLOAT,  bli_saxpyv_zen_int10,
	  BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
#endif
	  // dotv
	  BLIS_DOTV_KER,  BLIS_FLOAT,  bli_sdotv_zen_int,
	  BLIS_DOTV_KER,  BLIS_DOUBLE, bli_ddotv_zen_int,
	  // dotxv
	  BLIS_DOTXV_KER, BLIS_FLOAT,  bli_sdotxv_zen_int,
	  BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
	  // scalv
#if 0
	  BLIS_SCALV_KER, BLIS_FLOAT,  bli_sscalv_zen_int,
	  BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int,
#else
	  // Active branch: the *_int10 variants.
	  BLIS_SCALV_KER, BLIS_FLOAT,  bli_sscalv_zen_int10,
	  BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
#endif
	  cntx
	);

	// Initialize level-3 blocksize objects with architecture-specific values.
	// NOTE(review): the -1 entries in the c and z columns presumably mean
	// "leave the reference default in place" -- confirm against
	// bli_blksz_init(). MC and KC pass a second row of four values.
	//                                           s      d      c      z
	bli_blksz_init_easy( &blkszs[ BLIS_MR ],    24,    24,    -1,    -1 );
	bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     8,    -1,    -1 );
	bli_blksz_init     ( &blkszs[ BLIS_MC ],   240,   120,    -1,    -1,
	                                           288,   144,    -1,    -1 );
	bli_blksz_init     ( &blkszs[ BLIS_KC ],   336,   336,    -1,    -1,
	                                           408,   408,    -1,    -1 );
	bli_blksz_init_easy( &blkszs[ BLIS_NC ], 14400, 14400,    -1,    -1 );
	bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     8,    -1,    -1 );
	bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,    -1,    -1 );

	// Update the context with the current architecture's register and cache
	// blocksizes (and multiples) for native execution.
	// Each entry is (blocksize id, blksz_t pointer, multiple id); 7 entries.
	bli_cntx_set_blkszs
	(
	  BLIS_NAT, 7,
	  // level-3
	  BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
	  BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
	  BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
	  BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
	  BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
	  // level-1f
	  BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
	  BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
	  cntx
	);
}
cython-blis-0.9.1/blis/_src/config/knl/bli_family_knl.h000066400000000000000000000116171427272030600230210ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- THREADING PARAMETERS ----------------------------------------------------- #define BLIS_THREAD_RATIO_M 4 #define BLIS_THREAD_RATIO_N 1 #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 // -- MEMORY ALLOCATION -------------------------------------------------------- //#define BLIS_TREE_BARRIER //#define BLIS_TREE_BARRIER_ARITY 4 #define BLIS_SIMD_ALIGN_SIZE 64 #define BLIS_SIMD_MAX_SIZE 64 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 /* #ifdef BLIS_NO_HBWMALLOC #include <stdlib.h> #define BLIS_MALLOC_POOL malloc #define BLIS_FREE_POOL free #else #include <hbwmalloc.h> #define BLIS_MALLOC_POOL hbw_malloc #define BLIS_FREE_POOL hbw_free #endif */ //#define BLIS_MALLOC_INTL hbw_malloc //#define BLIS_FREE_INTL hbw_free #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_SGEMM_UKERNEL bli_sgemm_opt_30x16_knc #define BLIS_DEFAULT_MC_S 240 #define BLIS_DEFAULT_KC_S 240 #define BLIS_DEFAULT_NC_S 14400 #define BLIS_DEFAULT_MR_S 30 #define BLIS_DEFAULT_NR_S 16 #define BLIS_PACKDIM_MR_S 32 #define BLIS_PACKDIM_NR_S 16 #if 0 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8_knc #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 240 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DEFAULT_MR_D 30 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 32 #define BLIS_PACKDIM_NR_D 8 #elif 0 #define 
BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8 #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 240 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DEFAULT_MR_D 30 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 32 #define BLIS_PACKDIM_NR_D 8 #define BLIS_DPACKM_8XK_KERNEL bli_dpackm_8xk_opt #define BLIS_DPACKM_30XK_KERNEL bli_dpackm_30xk_opt #else #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_24x8 #define BLIS_DEFAULT_MR_D 24 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 24 #define BLIS_PACKDIM_NR_D 8 #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 336 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DPACKM_8XK_KERNEL bli_dpackm_8xk_opt #define BLIS_DPACKM_24XK_KERNEL bli_dpackm_24xk_opt #endif #define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) #define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) #define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0) #define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) #define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) #define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0) #endif //#endif cython-blis-0.9.1/blis/_src/config/knl/make_defs.mk000066400000000000000000000077761427272030600221570ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := knl #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif ifeq ($(DEBUG_TYPE),sde) # Unconditionally disable use of libmemkind in Intel SDE. # Note: The BLIS_DISABLE_MEMKIND macro definition will override # (undefine) the BLIS_ENABLE_MEMKIND macro definition. 
CPPROCFLAGS += -DBLIS_DISABLE_MEMKIND # This value is normally set by configure and communicated to make via # config.mk, however, the make_defs.mk files (this file) get included # after config.mk, so this definition will override that earlier # definition. MK_ENABLE_MEMKIND := no endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mavx512f -mavx512pf -mfpmath=sse -march=knl else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := -xMIC-AVX512 else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mavx512f -mavx512pf -mfpmath=sse -march=knl else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # The assembler on OS X won't recognize AVX512 without help. ifneq ($(CC_VENDOR),icc) ifeq ($(OS_NAME),Darwin) CKVECFLAGS += -Wa,-march=knl endif endif # Flags specific to reference kernels. # Note: We use AVX2 for reference kernels instead of AVX-512. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),icc) CRVECFLAGS := -xMIC-AVX512 else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := -march=knl -mno-avx512f -mno-avx512pf -mno-avx512er -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # Store all of the variables here to new variables containing the # configuration name. 
$(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/old/000077500000000000000000000000001427272030600176615ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/old/armv7a/000077500000000000000000000000001427272030600210565ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/old/armv7a/bli_cntx_init_armv7a.c000066400000000000000000000062431427272030600253310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_cntx_init_armv7a( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_armv7a_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_asm_4x4, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_asm_4x4, FALSE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armv7a_asm_2x2, FALSE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armv7a_asm_2x2, FALSE, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, 2, 2 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, 2, 2 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 432, 192, 64, 64 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 352, 256, 128, 128 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/old/armv7a/bli_family_armv7a.h000066400000000000000000000057701427272030600246240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_FAMILY_H #define BLIS_FAMILY_H // -- ARCHITECTURE-SPECIFIC PROTOTYPES ----------------------------------------- // Define the current architecture's name. #define archname armv7a // Include the context initialization function API template. #include "bli_cntx_init_arch.h" #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_opt_4x4 #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MC_S 432 #define BLIS_DEFAULT_KC_S 352 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 4 #define BLIS_DEFAULT_MC_D 192 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_CGEMM_UKERNEL bli_cgemm_opt_4x4 #define BLIS_DEFAULT_MR_C 2 #define BLIS_DEFAULT_NR_C 2 #define BLIS_DEFAULT_MC_C 64 #define BLIS_DEFAULT_KC_C 128 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_ZGEMM_UKERNEL bli_zgemm_opt_4x4 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 128 #define BLIS_DEFAULT_NC_Z 4096 #endif #endif cython-blis-0.9.1/blis/_src/config/old/armv7a/make_defs.mk000066400000000000000000000052131427272030600233260ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := armv7a #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # ifeq ($(CC),) CC := gcc CC_VENDOR := gcc endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L CMISCFLAGS := -std=c99 -mfloat-abi=hard CPICFLAGS := -fPIC CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O3 endif CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mfpu=vfpv3 -marm -march=armv7-a else $(error gcc is required for this configuration.) endif # Store all of the variables here to new variables containing the # configuration name. 
$(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/old/emscripten/000077500000000000000000000000001427272030600220325ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/old/emscripten/bli_kernel.h000066400000000000000000000146161427272030600243210ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_KERNEL_H #define BLIS_KERNEL_H /* Use the same parameters as non-SIMD PNaCl */ // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 // -- Cache blocksizes -- // // Constraints: // // (1) MC must be a multiple of: // (a) MR (for zero-padding purposes) // (b) NR (for zero-padding purposes when MR and NR are "swapped") // (2) NC must be a multiple of // (a) NR (for zero-padding purposes) // (b) MR (for zero-padding purposes when MR and NR are "swapped") // #define BLIS_DEFAULT_MC_S 252 #define BLIS_DEFAULT_KC_S 264 #define BLIS_DEFAULT_NC_S 8196 #define BLIS_DEFAULT_MC_D 1080 #define BLIS_DEFAULT_KC_D 120 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MC_C 120 #define BLIS_DEFAULT_KC_C 264 #define BLIS_DEFAULT_NC_C 4092 #define BLIS_DEFAULT_MC_Z 60 #define BLIS_DEFAULT_KC_Z 264 #define BLIS_DEFAULT_NC_Z 2040 // -- Register blocksizes -- #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 3 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 3 #define BLIS_DEFAULT_MR_C 2 #define BLIS_DEFAULT_NR_C 3 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 3 // NOTE: If the micro-kernel, which is typically unrolled to a factor // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. //#define BLIS_DEFAULT_KR_S 1 //#define BLIS_DEFAULT_KR_D 1 //#define BLIS_DEFAULT_KR_C 1 //#define BLIS_DEFAULT_KR_Z 1 // -- Maximum cache blocksizes (for optimizing edge cases) -- // NOTE: These cache blocksize "extensions" have the same constraints as // the corresponding default blocksizes above. When these values are // larger than the default blocksizes, blocksizes used at edge cases are // enlarged if such an extension would encompass the remaining portion of // the matrix dimension. 
//#define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) //#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) //#define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + BLIS_DEFAULT_NC_S/4) //#define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) //#define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) //#define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + BLIS_DEFAULT_NC_D/4) //#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4) //#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4) //#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4) //#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4) //#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4) //#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4) // -- Packing register blocksize (for packed micro-panels) -- // NOTE: These register blocksize "extensions" determine whether the // leading dimensions used within the packed micro-panels are equal to // or greater than their corresponding register blocksizes above. //#define BLIS_PACKDIM_MR_S (BLIS_DEFAULT_MR_S + ...) //#define BLIS_PACKDIM_NR_S (BLIS_DEFAULT_NR_S + ...) //#define BLIS_PACKDIM_MR_D (BLIS_DEFAULT_MR_D + ...) //#define BLIS_PACKDIM_NR_D (BLIS_DEFAULT_NR_D + ...) //#define BLIS_PACKDIM_MR_C (BLIS_DEFAULT_MR_C + ...) //#define BLIS_PACKDIM_NR_C (BLIS_DEFAULT_NR_C + ...) //#define BLIS_PACKDIM_MR_Z (BLIS_DEFAULT_MR_Z + ...) //#define BLIS_PACKDIM_NR_Z (BLIS_DEFAULT_NR_Z + ...) 
// -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- // -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ // -- LEVEL-3 KERNEL DEFINITIONS ----------------------------------------------- // -- gemm -- // -- trsm-related -- // -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- // -- packm -- // -- unpackm -- // -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- // -- axpy2v -- // -- dotaxpyv -- // -- axpyf -- // -- dotxf -- // -- dotxaxpyf -- // -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- // -- addv -- // -- axpyv -- // -- copyv -- // -- dotv -- // -- dotxv -- // -- invertv -- // -- scal2v -- // -- scalv -- // -- setv -- // -- subv -- // -- swapv -- #endif cython-blis-0.9.1/blis/_src/config/old/emscripten/make_defs.mk000066400000000000000000000053051427272030600243040ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Only include this block of code once. ifndef MAKE_DEFS_MK_INCLUDED MAKE_DEFS_MK_INCLUDED := yes # # --- Development tools definitions -------------------------------------------- # # --- Determine the C compiler and related flags --- CC := emcc CC_VENDOR := emcc # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). 
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L CMISCFLAGS := -std=c99 CPICFLAGS := -fPIC CDBGFLAGS := #-g4 CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors COPTFLAGS := -O2 CKOPTFLAGS := -O3 CKVECFLAGS := # --- Determine the archiver and related flags --- AR := emar RANLIB := emranlib ARFLAGS := cr # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := -shared LDFLAGS := -O3 -s TOTAL_MEMORY=67108864 -s FORCE_ALIGNED_MEMORY=1 -s PRECISE_F32=2 -s GC_SUPPORT=0 # --- Determine JS interpreter --- JSINT := node # end of ifndef MAKE_DEFS_MK_INCLUDED conditional block endif cython-blis-0.9.1/blis/_src/config/old/haswellbb/000077500000000000000000000000001427272030600216245ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/old/haswellbb/bli_cntx_init_haswell.c000066400000000000000000000257011427272030600263410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Instantiate prototypes for packm kernels. PACKM_KER_PROT( float, s, packm_6xk_bb4_haswell_ref ) PACKM_KER_PROT( double, d, packm_6xk_bb2_haswell_ref ) // Instantiate prototypes for level-3 kernels. GEMM_UKR_PROT( float, s, gemmbb_haswell_ref ) GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_haswell_ref ) GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_haswell_ref ) TRSM_UKR_PROT( float, s, trsmbb_l_haswell_ref ) TRSM_UKR_PROT( float, s, trsmbb_u_haswell_ref ) GEMM_UKR_PROT( double, d, gemmbb_haswell_ref ) GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_haswell_ref ) GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_haswell_ref ) TRSM_UKR_PROT( double, d, trsmbb_l_haswell_ref ) TRSM_UKR_PROT( double, d, trsmbb_u_haswell_ref ) GEMM_UKR_PROT( scomplex, c, gemmbb_haswell_ref ) GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_haswell_ref ) GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_haswell_ref ) TRSM_UKR_PROT( scomplex, c, trsmbb_l_haswell_ref ) TRSM_UKR_PROT( scomplex, c, trsmbb_u_haswell_ref ) GEMM_UKR_PROT( dcomplex, z, gemmbb_haswell_ref ) GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_haswell_ref ) GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_haswell_ref ) TRSM_UKR_PROT( dcomplex, z, trsmbb_l_haswell_ref ) TRSM_UKR_PROT( dcomplex, z, trsmbb_u_haswell_ref ) void bli_cntx_init_haswell( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. 
bli_cntx_init_haswell_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( #if 0 8, // gemm BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, // gemmtrsm_l BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, // gemmtrsm_u BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, #else 12, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemmbb_haswell_ref, FALSE, BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_haswell_ref, FALSE, BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_haswell_ref, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemmbb_haswell_ref, FALSE, BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_haswell_ref, FALSE, BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_haswell_ref, FALSE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_haswell_ref, FALSE, BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_haswell_ref, FALSE, BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_haswell_ref, FALSE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_haswell_ref, FALSE, BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_haswell_ref, FALSE, BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_haswell_ref, FALSE, #endif cntx ); // Update the context with customized virtual [gemm]trsm micro-kernels. 
bli_cntx_set_l3_vir_ukrs ( 8, BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_haswell_ref, BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_haswell_ref, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_haswell_ref, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_haswell_ref, BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_haswell_ref, BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_haswell_ref, BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_haswell_ref, BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_haswell_ref, cntx ); // Update the context with optimized packm kernels. bli_cntx_set_packm_kers ( 2, BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_haswell_ref, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_haswell_ref, cntx ); // Update the context with optimized level-1f kernels. bli_cntx_set_l1f_kers ( 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, cntx ); // Update the context with optimized level-1v kernels. 
bli_cntx_set_l1v_kers ( 10, #if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, #endif // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int, #else BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, #endif // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, // dotxv BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, // scalv #if 0 BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int, #else BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z #if 0 bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, 75, 192 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); #else bli_blksz_init_easy( &blkszs[ BLIS_MR ], 24, 12, 12, 6 ); bli_blksz_init ( &blkszs[ BLIS_NR ], 6, 6, 6, 6, 24, 12, 6, 6 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 2076 ); #endif bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( BLIS_NAT, 7, // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, cntx ); // ------------------------------------------------------------------------- // Initialize sup thresholds with architecture-appropriate values. // s d c z bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 1, -1, -1 ); bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 1, -1, -1 ); bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 1, -1, -1 ); // Initialize the context with the sup thresholds. bli_cntx_set_l3_sup_thresh ( 3, BLIS_MT, &thresh[ BLIS_MT ], BLIS_NT, &thresh[ BLIS_NT ], BLIS_KT, &thresh[ BLIS_KT ], cntx ); // Update the context with optimized small/unpacked gemm kernels. bli_cntx_set_l3_sup_kers ( 8, //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, cntx ); // Initialize level-3 sup blocksize objects with architecture-specific // values. 
// s d c z bli_blksz_init ( &blkszs[ BLIS_MR ], -1, 6, -1, -1, -1, 9, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 72, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes for small/unpacked level-3 problems. bli_cntx_set_l3_sup_blkszs ( 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_KC, &blkszs[ BLIS_KC ], BLIS_MC, &blkszs[ BLIS_MC ], BLIS_NR, &blkszs[ BLIS_NR ], BLIS_MR, &blkszs[ BLIS_MR ], cntx ); } cython-blis-0.9.1/blis/_src/config/old/haswellbb/bli_family_haswell.h000066400000000000000000000122601427272030600256240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #define BLIS_POOL_ADDR_ALIGN_SIZE_A 4096 #define BLIS_POOL_ADDR_ALIGN_SIZE_B 4096 #define BLIS_POOL_ADDR_OFFSET_SIZE_A 32 #define BLIS_POOL_ADDR_OFFSET_SIZE_B 64 // Disable right-side hemm, symm, and trmm[3] to accommodate the broadcasting of // elements within the packed matrix B. #define BLIS_DISABLE_HEMM_RIGHT #define BLIS_DISABLE_SYMM_RIGHT #define BLIS_DISABLE_TRMM_RIGHT #define BLIS_DISABLE_TRMM3_RIGHT #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- // -- sgemm micro-kernel -- #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24 #define BLIS_DEFAULT_MC_S 256 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 24 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 6 #define BLIS_DEFAULT_NR_S 16 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 6 #endif // -- dgemm micro-kernel -- #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12 #define BLIS_DEFAULT_MC_D 152 #define BLIS_DEFAULT_KC_D 160 #define BLIS_DEFAULT_NC_D 4080 #define 
BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 12 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 6 #endif // -- cgemm micro-kernel -- #if 1 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 3 #define BLIS_DEFAULT_NR_C 8 #define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x3 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 3 #endif // -- zgemm micro-kernel -- #if 1 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 3 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x3 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 3 #endif #endif //#endif cython-blis-0.9.1/blis/_src/config/old/haswellbb/make_defs.mk000066400000000000000000000064101427272030600240740ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. 
# # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := haswell #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. 
CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O3 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell ifeq ($(GCC_OT_4_9_0),yes) # If gcc is older than 4.9.0, we must use a different label for -march. CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=core-avx2 endif else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := -xCORE-AVX2 else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mavx2 -mfma -mfpmath=sse -march=haswell else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/old/loongson3a/000077500000000000000000000000001427272030600217435ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/old/loongson3a/bli_kernel.h000066400000000000000000000146231427272030600242300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_KERNEL_H #define BLIS_KERNEL_H // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 // -- Cache blocksizes -- // // Constraints: // // (1) MC must be a multiple of: // (a) MR (for zero-padding purposes) // (b) NR (for zero-padding purposes when MR and NR are "swapped") // (2) NC must be a multiple of // (a) NR (for zero-padding purposes) // (b) MR (for zero-padding purposes when MR and NR are "swapped") // #define BLIS_DEFAULT_MC_S 256 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 8192 #define BLIS_DEFAULT_MC_D 32 #define BLIS_DEFAULT_KC_D 128 #define BLIS_DEFAULT_NC_D 1024 #define BLIS_DEFAULT_MC_C 128 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 2048 // -- Register blocksizes -- #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 4 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_DEFAULT_MR_Z 8 #define BLIS_DEFAULT_NR_Z 4 // NOTE: If the micro-kernel, which is typically unrolled to a factor // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. //#define BLIS_DEFAULT_KR_S 1 //#define BLIS_DEFAULT_KR_D 1 //#define BLIS_DEFAULT_KR_C 1 //#define BLIS_DEFAULT_KR_Z 1 // -- Maximum cache blocksizes (for optimizing edge cases) -- // NOTE: These cache blocksize "extensions" have the same constraints as // the corresponding default blocksizes above. When these values are // larger than the default blocksizes, blocksizes used at edge cases are // enlarged if such an extension would encompass the remaining portion of // the matrix dimension. 
//#define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) //#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) //#define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + BLIS_DEFAULT_NC_S/4) //#define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) //#define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) //#define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + BLIS_DEFAULT_NC_D/4) //#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4) //#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4) //#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4) //#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4) //#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4) //#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4) // -- Packing register blocksize (for packed micro-panels) -- // NOTE: These register blocksize "extensions" determine whether the // leading dimensions used within the packed micro-panels are equal to // or greater than their corresponding register blocksizes above. //#define BLIS_PACKDIM_MR_S (BLIS_DEFAULT_MR_S + ...) //#define BLIS_PACKDIM_NR_S (BLIS_DEFAULT_NR_S + ...) //#define BLIS_PACKDIM_MR_D (BLIS_DEFAULT_MR_D + ...) //#define BLIS_PACKDIM_NR_D (BLIS_DEFAULT_NR_D + ...) //#define BLIS_PACKDIM_MR_C (BLIS_DEFAULT_MR_C + ...) //#define BLIS_PACKDIM_NR_C (BLIS_DEFAULT_NR_C + ...) //#define BLIS_PACKDIM_MR_Z (BLIS_DEFAULT_MR_Z + ...) //#define BLIS_PACKDIM_NR_Z (BLIS_DEFAULT_NR_Z + ...) 
// -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- // -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ // -- LEVEL-3 KERNEL DEFINITIONS ----------------------------------------------- // -- gemm -- #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_4x4 // -- trsm-related -- // -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- // -- packm -- // -- unpackm -- // -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- // -- axpy2v -- // -- dotaxpyv -- // -- axpyf -- // -- dotxf -- // -- dotxaxpyf -- // -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- // -- addv -- // -- axpyv -- // -- copyv -- // -- dotv -- // -- dotxv -- // -- invertv -- // -- scal2v -- // -- scalv -- // -- setv -- // -- subv -- // -- swapv -- #endif cython-blis-0.9.1/blis/_src/config/old/loongson3a/make_defs.mk000066400000000000000000000052121427272030600242120ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := loongson3a #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # ifeq ($(CC),) CC := gcc CC_VENDOR := gcc endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L -mabi=64 CMISCFLAGS := -std=c99 CPICFLAGS := -fPIC CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O3 -mtune=loongson3a endif CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -march=loongson3a else $(error gcc is required for this configuration.) endif # Store all of the variables here to new variables containing the # configuration name. 
$(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/old/newarch/000077500000000000000000000000001427272030600213105ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/old/newarch/bli_kernel.h000066400000000000000000000033001427272030600235630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_KERNEL_H #define BLIS_KERNEL_H #endif cython-blis-0.9.1/blis/_src/config/old/newarch/make_defs.mk000066400000000000000000000053421427272030600235630ustar00rootroot00000000000000#!/bin/bash # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. 
THIS_CONFIG := newarch #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # ifeq ($(CC),) CC := gcc CC_VENDOR := gcc endif # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L CMISCFLAGS := -std=c99 CPICFLAGS := -fPIC CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/old/pnacl/000077500000000000000000000000001427272030600207565ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/old/pnacl/bli_kernel.h000066400000000000000000000172151427272030600232430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_KERNEL_H #define BLIS_KERNEL_H /* * SIMD-enabled (SP only) PNaCl shipped in Chrome 36 and it is not backward-compatible. * Therefore, if compilation targets an older Chrome release, we use scalar kernels. * The target Chrome version is indicated by PPAPI_MACRO defined in the header below. 
*/ #include // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 // -- Cache blocksizes -- // // Constraints: // // (1) MC must be a multiple of: // (a) MR (for zero-padding purposes) // (b) NR (for zero-padding purposes when MR and NR are "swapped") // (2) NC must be a multiple of // (a) NR (for zero-padding purposes) // (b) MR (for zero-padding purposes when MR and NR are "swapped") // #if PPAPI_RELEASE >= 36 #define BLIS_DEFAULT_MC_S 256 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 8192 #else #define BLIS_DEFAULT_MC_S 252 #define BLIS_DEFAULT_KC_S 264 #define BLIS_DEFAULT_NC_S 8196 #endif #define BLIS_DEFAULT_MC_D 1080 #define BLIS_DEFAULT_KC_D 120 #define BLIS_DEFAULT_NC_D 8400 #if PPAPI_RELEASE >= 36 #define BLIS_DEFAULT_MC_C 128 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #else #define BLIS_DEFAULT_MC_C 120 #define BLIS_DEFAULT_KC_C 264 #define BLIS_DEFAULT_NC_C 4092 #endif #define BLIS_DEFAULT_MC_Z 60 #define BLIS_DEFAULT_KC_Z 264 #define BLIS_DEFAULT_NC_Z 2040 // -- Register blocksizes -- #if PPAPI_RELEASE >= 36 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 #else #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 3 #endif #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 3 #if PPAPI_RELEASE >= 36 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 4 #else #define BLIS_DEFAULT_MR_C 2 #define BLIS_DEFAULT_NR_C 3 #endif #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 3 // NOTE: If the micro-kernel, which is typically unrolled to a factor // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. 
//#define BLIS_DEFAULT_KR_S 1 //#define BLIS_DEFAULT_KR_D 1 //#define BLIS_DEFAULT_KR_C 1 //#define BLIS_DEFAULT_KR_Z 1 // -- Maximum cache blocksizes (for optimizing edge cases) -- // NOTE: These cache blocksize "extensions" have the same constraints as // the corresponding default blocksizes above. When these values are // larger than the default blocksizes, blocksizes used at edge cases are // enlarged if such an extension would encompass the remaining portion of // the matrix dimension. //#define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) //#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) //#define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + BLIS_DEFAULT_NC_S/4) //#define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) //#define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) //#define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + BLIS_DEFAULT_NC_D/4) //#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4) //#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4) //#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4) //#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4) //#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4) //#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4) // -- Packing register blocksize (for packed micro-panels) -- // NOTE: These register blocksize "extensions" determine whether the // leading dimensions used within the packed micro-panels are equal to // or greater than their corresponding register blocksizes above. //#define BLIS_PACKDIM_MR_S (BLIS_DEFAULT_MR_S + ...) //#define BLIS_PACKDIM_NR_S (BLIS_DEFAULT_NR_S + ...) //#define BLIS_PACKDIM_MR_D (BLIS_DEFAULT_MR_D + ...) //#define BLIS_PACKDIM_NR_D (BLIS_DEFAULT_NR_D + ...) //#define BLIS_PACKDIM_MR_C (BLIS_DEFAULT_MR_C + ...) //#define BLIS_PACKDIM_NR_C (BLIS_DEFAULT_NR_C + ...) //#define BLIS_PACKDIM_MR_Z (BLIS_DEFAULT_MR_Z + ...) 
//#define BLIS_PACKDIM_NR_Z (BLIS_DEFAULT_NR_Z + ...) // -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- // -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ // -- LEVEL-3 KERNEL DEFINITIONS ----------------------------------------------- // -- gemm -- #if PPAPI_RELEASE >= 36 #define BLIS_SGEMM_UKERNEL bli_sgemm_opt #define BLIS_CGEMM_UKERNEL bli_cgemm_opt #endif // -- trsm-related -- // -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- // -- packm -- // -- unpackm -- // -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- // -- axpy2v -- // -- dotaxpyv -- // -- axpyf -- // -- dotxf -- // -- dotxaxpyf -- // -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- // -- addv -- // -- axpyv -- #if PPAPI_RELEASE >= 36 #define BLIS_SAXPYV_KERNEL bli_saxpyv_opt #define BLIS_CAXPYV_KERNEL bli_caxpyv_opt #endif // -- copyv -- // -- dotv -- #define BLIS_SDOTV_KERNEL bli_sdotv_opt #define BLIS_DDOTV_KERNEL bli_ddotv_opt #define BLIS_CDOTV_KERNEL bli_cdotv_opt #define BLIS_ZDOTV_KERNEL bli_zdotv_opt // -- dotxv -- // -- invertv -- // -- scal2v -- // -- scalv -- // -- setv -- // -- subv -- // -- swapv -- #endif cython-blis-0.9.1/blis/_src/config/old/pnacl/make_defs.mk000066400000000000000000000056411427272030600232330ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. 
# - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Only include this block of code once. ifndef MAKE_DEFS_MK_INCLUDED MAKE_DEFS_MK_INCLUDED := yes # # --- Development tools definitions -------------------------------------------- # # --- Determine the C compiler and related flags --- CC := pnacl-clang CC_VENDOR := pnacl-clang # Enable IEEE Standard 1003.1-2004 (POSIX.1d). # NOTE: This is needed to enable posix_memalign(). 
CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L CMISCFLAGS := -std=gnu11 -I$(NACL_SDK_ROOT)/include CPICFLAGS := CDBGFLAGS := -g CWARNFLAGS := -Wall -Wno-unused-function -Wfatal-errors COPTFLAGS := -O3 CKOPTFLAGS := $(COPTFLAGS) -ffast-math CKVECFLAGS := # --- Determine the archiver and related flags --- AR := pnacl-ar ARFLAGS := rcs # --- Determine the linker and related flags --- LINKER := $(CC) SOFLAGS := ifneq ($(CC_VENDOR),icc) LDFLAGS := -lm endif # --- Determine the finalizer and related flags --- FINALIZER := pnacl-finalize FINFLAGS := # --- Determine the translator and related flags --- TRANSLATOR := pnacl-translate TRNSFLAGS := -O3 TRNSAMD64FLAGS := -arch x86-64 TRNSX86FLAGS := -arch i686 TRNSARMFLAGS := -arch armv7 # end of ifndef MAKE_DEFS_MK_INCLUDED conditional block endif cython-blis-0.9.1/blis/_src/config/penryn/000077500000000000000000000000001427272030600204165ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/penryn/bli_cntx_init_penryn.c000066400000000000000000000065111427272030600250050ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_cntx_init_penryn( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_penryn_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 4, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_penryn_asm_8x4, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_penryn_asm_4x4, FALSE, //BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4, FALSE, //BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4, FALSE, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_penryn_asm_4x4, FALSE, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_penryn_asm_4x4, FALSE, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. 
// s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 4, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 4, 4, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 768, 384, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 384, 384, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 0, 0 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/penryn/bli_family_penryn.h000066400000000000000000000061321427272030600242730ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x4 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MC_S 768 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x4 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 4 #define BLIS_DEFAULT_MC_D 384 #define BLIS_DEFAULT_KC_D 384 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_asm_4x4 #define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_asm_4x4 // -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPY2V_KERNEL bli_daxpy2v_int_var1 #define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_int_var1 #define BLIS_DAXPYF_KERNEL bli_daxpyf_int_var1 #define BLIS_DDOTXF_KERNEL bli_ddotxf_int_var1 #define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_int_var1 // -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 #define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 #endif //#endif cython-blis-0.9.1/blis/_src/config/penryn/make_defs.mk000066400000000000000000000061231427272030600226670ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like 
# libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := penryn #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. 
CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2 else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := -xSSSE3 else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2 else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/piledriver/000077500000000000000000000000001427272030600212505ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/piledriver/bli_cntx_init_piledriver.c000066400000000000000000000063051427272030600264720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_cntx_init_piledriver( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_piledriver_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 4, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. 
// s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 16, 8, 4, 2 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 3, 3, 2, 2 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 2016, 1008, 512, 400 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 128, 128, 256, 160 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8400, 8400, 8400, 8400 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/piledriver/bli_family_piledriver.h000066400000000000000000000057171427272030600257670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MC_S 2016 #define BLIS_DEFAULT_KC_S 128 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 3 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MC_D 1008 #define BLIS_DEFAULT_KC_D 128 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MC_C 512 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MC_Z 400 #define BLIS_DEFAULT_KC_Z 160 #define BLIS_DEFAULT_NC_Z 8400 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #endif //#endif cython-blis-0.9.1/blis/_src/config/piledriver/make_defs.mk000066400000000000000000000061511427272030600235220ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. 
# # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := piledriver #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. 
CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 -mno-fma4 -mno-tbm -mno-xop -mno-lwp else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver2 -mno-fma4 -mno-tbm -mno-xop -mno-lwp else $(error gcc or clang are required for this configuration.) endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/power10/000077500000000000000000000000001427272030600204005ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/power10/bli_cntx_init_power10.c000066400000000000000000000137301427272030600247520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Instantiate prototypes for packm kernels. PACKM_KER_PROT( float, s, packm_6xk_bb4_power10_ref ) PACKM_KER_PROT( double, d, packm_6xk_bb2_power10_ref ) // Instantiate prototypes for level-3 kernels. 
GEMM_UKR_PROT( float, s, gemmbb_power10_ref ) GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power10_ref ) GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power10_ref ) TRSM_UKR_PROT( float, s, trsmbb_l_power10_ref ) TRSM_UKR_PROT( float, s, trsmbb_u_power10_ref ) GEMM_UKR_PROT( double, d, gemmbb_power10_ref ) GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power10_ref ) GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power10_ref ) TRSM_UKR_PROT( double, d, trsmbb_l_power10_ref ) TRSM_UKR_PROT( double, d, trsmbb_u_power10_ref ) GEMM_UKR_PROT( scomplex, c, gemmbb_power10_ref ) GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power10_ref ) GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power10_ref ) TRSM_UKR_PROT( scomplex, c, trsmbb_l_power10_ref ) TRSM_UKR_PROT( scomplex, c, trsmbb_u_power10_ref ) GEMM_UKR_PROT( dcomplex, z, gemmbb_power10_ref ) GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power10_ref ) GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power10_ref ) TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power10_ref ) TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power10_ref ) void bli_cntx_init_power10( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_power10_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. 
bli_cntx_set_l3_nat_ukrs ( 12, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16, TRUE, BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power10_ref, FALSE, BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power10_ref, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8, TRUE, BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power10_ref, FALSE, BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power10_ref, FALSE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power10_ref, FALSE, BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power10_ref, FALSE, BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power10_ref, FALSE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power10_ref, FALSE, BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power10_ref, FALSE, BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power10_ref, FALSE, cntx ); // Update the context with customized virtual [gemm]trsm micro-kernels. bli_cntx_set_l3_vir_ukrs ( 8, BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power10_ref, BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power10_ref, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power10_ref, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_power10_ref, BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power10_ref, BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power10_ref, BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power10_ref, BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power10_ref, cntx ); // Update the context with optimized packm kernels. 
bli_cntx_set_packm_kers ( 2, BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power10_ref, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power10_ref, cntx ); // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 832, 320, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1026, 960, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( BLIS_NAT, 5, // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/power10/bli_family_power10.h000066400000000000000000000034561427272030600242450ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define BLIS_POOL_ADDR_ALIGN_SIZE_A 4096 #define BLIS_POOL_ADDR_ALIGN_SIZE_B 4096 #define BLIS_POOL_ADDR_OFFSET_SIZE_A 192 #define BLIS_POOL_ADDR_OFFSET_SIZE_B 152 cython-blis-0.9.1/blis/_src/config/power10/make_defs.mk000066400000000000000000000054701427272030600226550ustar00rootroot00000000000000 # # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2019, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := power10 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mcpu=power10 -mtune=power10 else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mcpu=power10 -mtune=power10 else $(info $(CC_VENDOR)) $(error gcc, clang is required for this configuration.) endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) CRVECFLAGS := $(CKVECFLAGS) # Store all of the variables here to new variables containing the # configuration name. 
$(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/power7/000077500000000000000000000000001427272030600203265ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/power7/bli_cntx_init_power7.c000066400000000000000000000057441427272030600246340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_cntx_init_power7( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_power7_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 1, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, FALSE, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 8, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 4, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 64, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 256, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 4096, 0, 0 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/power7/bli_family_power7.h000066400000000000000000000040621427272030600241130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_8x4 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 4 #define BLIS_DEFAULT_MC_D 64 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4096 #endif //#endif cython-blis-0.9.1/blis/_src/config/power7/make_defs.mk000066400000000000000000000056551427272030600226100ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. 
# - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := power7 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := -mcpu=power7 CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 -mtune=power7 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mvsx else $(error gcc is required for this configuration.) 
endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/power9/000077500000000000000000000000001427272030600203305ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/power9/bli_cntx_init_power9.c000066400000000000000000000136501427272030600246330ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Instantiate prototypes for packm kernels. PACKM_KER_PROT( float, s, packm_6xk_bb4_power9_ref ) PACKM_KER_PROT( double, d, packm_6xk_bb2_power9_ref ) // Instantiate prototypes for level-3 kernels. GEMM_UKR_PROT( float, s, gemmbb_power9_ref ) GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power9_ref ) GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power9_ref ) TRSM_UKR_PROT( float, s, trsmbb_l_power9_ref ) TRSM_UKR_PROT( float, s, trsmbb_u_power9_ref ) GEMM_UKR_PROT( double, d, gemmbb_power9_ref ) GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power9_ref ) GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power9_ref ) TRSM_UKR_PROT( double, d, trsmbb_l_power9_ref ) TRSM_UKR_PROT( double, d, trsmbb_u_power9_ref ) GEMM_UKR_PROT( scomplex, c, gemmbb_power9_ref ) GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power9_ref ) GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power9_ref ) TRSM_UKR_PROT( scomplex, c, trsmbb_l_power9_ref ) TRSM_UKR_PROT( scomplex, c, trsmbb_u_power9_ref ) GEMM_UKR_PROT( dcomplex, z, gemmbb_power9_ref ) GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power9_ref ) GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power9_ref ) TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power9_ref ) TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power9_ref ) void bli_cntx_init_power9( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. 
bli_cntx_init_power9_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 12, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemmbb_power9_ref, FALSE, BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power9_ref, FALSE, BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power9_ref, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE, BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power9_ref, FALSE, BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power9_ref, FALSE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power9_ref, FALSE, BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power9_ref, FALSE, BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power9_ref, FALSE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power9_ref, FALSE, BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power9_ref, FALSE, BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power9_ref, FALSE, cntx ); // Update the context with customized virtual [gemm]trsm micro-kernels. bli_cntx_set_l3_vir_ukrs ( 8, BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power9_ref, BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power9_ref, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power9_ref, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_power9_ref, BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power9_ref, BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power9_ref, BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power9_ref, BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power9_ref, cntx ); // Update the context with optimized packm kernels. 
bli_cntx_set_packm_kers ( 2, BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power9_ref, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref, cntx ); bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 12, -1, -1 ); bli_blksz_init ( &blkszs[ BLIS_NR ], -1, 6, -1, -1, -1, 12, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 576, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1408, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 8190, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( BLIS_NAT, 5, // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/power9/bli_family_power9.h000066400000000000000000000040511427272030600241150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define BLIS_POOL_ADDR_ALIGN_SIZE_A 4096 #define BLIS_POOL_ADDR_ALIGN_SIZE_B 4096 #define BLIS_POOL_ADDR_OFFSET_SIZE_A 192 #define BLIS_POOL_ADDR_OFFSET_SIZE_B 152 // Disable right-side hemm, symm, and trmm[3] to accommodate the broadcasting of // elements within the packed matrix B. #define BLIS_DISABLE_HEMM_RIGHT #define BLIS_DISABLE_SYMM_RIGHT #define BLIS_DISABLE_TRMM_RIGHT #define BLIS_DISABLE_TRMM3_RIGHT cython-blis-0.9.1/blis/_src/config/power9/make_defs.mk000066400000000000000000000054741427272030600226110ustar00rootroot00000000000000 # # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2019, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := power9 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mcpu=power9 -mtune=power9 -DXLC=0 else ifeq ($(CC_VENDOR),IBM) CKVECFLAGS := -qarch=pwr9 -qtune=pwr9 -DXLC=1 else $(info $(CC_VENDOR)) $(error gcc/xlc is required for this configuration.) endif endif # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) CRVECFLAGS := $(CKVECFLAGS) # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/sandybridge/000077500000000000000000000000001427272030600213765ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/sandybridge/bli_cntx_init_sandybridge.c000066400000000000000000000063071427272030600267500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_cntx_init_sandybridge( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_sandybridge_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 4, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sandybridge_asm_8x8, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sandybridge_asm_8x4, FALSE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, FALSE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, FALSE, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 8, 8, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 8, 4, 4, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 128, 96, 96, 64 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 384, 256, 256, 192 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/sandybridge/bli_family_sandybridge.h000066400000000000000000000053741427272030600262420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x4 #define BLIS_DEFAULT_MC_D 96 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 4 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif cython-blis-0.9.1/blis/_src/config/sandybridge/make_defs.mk000066400000000000000000000063761427272030600236610ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := sandybridge #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mavx -mfpmath=sse -march=sandybridge ifeq ($(GCC_OT_4_9_0),yes) # If gcc is older than 4.9.0, we must use a different label for -march. CKVECFLAGS := -mavx -mfpmath=sse -march=corei7-avx endif else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := -xAVX else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mavx -mfpmath=sse -march=sandybridge else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/skx/000077500000000000000000000000001427272030600177105ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/skx/bli_cntx_init_skx.c000066400000000000000000000112501427272030600235650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_cntx_init_skx( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_skx_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 2, // gemm BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, FALSE, cntx ); // Update the context with optimized level-1f kernels. bli_cntx_set_l1f_kers ( 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, cntx ); // Update the context with optimized level-1v kernels. 
bli_cntx_set_l1v_kers ( 10, #if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, #endif // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int, #else BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, #endif // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, // dotxv BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, // scalv #if 0 BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int, #else BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 32, 16, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 14, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 480, 240, -1, -1 ); bli_blksz_init ( &blkszs[ BLIS_KC ], 384, 256, -1, -1, 480, 320, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 3752, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( BLIS_NAT, 7, // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, cntx ); } cython-blis-0.9.1/blis/_src/config/skx/bli_family_skx.h000066400000000000000000000113171427272030600230600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- THREADING PARAMETERS ----------------------------------------------------- #define BLIS_THREAD_RATIO_M 3 #define BLIS_THREAD_RATIO_N 2 #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 4 // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 64 #define BLIS_SIMD_MAX_SIZE 64 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 //#include //#define BLIS_MALLOC_POOL malloc //#define BLIS_FREE_POOL free #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- // -- Cache and register blocksizes -- // // Constraints: // // (1) MC must be a multiple of: // (a) MR (for zero-padding purposes) // (b) NR (for zero-padding purposes when MR and NR are "swapped") // (2) NC must be a multiple of // (a) NR (for zero-padding purposes) // (b) MR (for zero-padding purposes when MR and NR are "swapped") // #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_16x12_l2 #define BLIS_DEFAULT_MC_D 144 #define BLIS_DEFAULT_KC_D 336 #define BLIS_DEFAULT_NC_D 5760 #define BLIS_DEFAULT_MR_D 16 #define BLIS_DEFAULT_NR_D 12 #define BLIS_PACKDIM_MR_D 16 #define BLIS_PACKDIM_NR_D 12 // NOTE: If the micro-kernel, which is typically unrolled to a factor // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. 
//#define BLIS_DEFAULT_KR_S 1 //#define BLIS_DEFAULT_KR_D 1 //#define BLIS_DEFAULT_KR_C 1 //#define BLIS_DEFAULT_KR_Z 1 // -- Maximum cache blocksizes (for optimizing edge cases) -- // NOTE: These cache blocksize "extensions" have the same constraints as // the corresponding default blocksizes above. When these values are // larger than the default blocksizes, blocksizes used at edge cases are // enlarged if such an extension would encompass the remaining portion of // the matrix dimension. #define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) #define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) #define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0) #define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) #define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) #define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0) //#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4) //#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4) //#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4) //#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4) //#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4) //#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4) #endif //#endif cython-blis-0.9.1/blis/_src/config/skx/make_defs.mk000066400000000000000000000112761427272030600221660ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. 
# - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := skx #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. # NOTE: The -fomit-frame-pointer option is needed for some kernels because # they make explicit use of the rbp register. 
CKOPTFLAGS := $(COPTFLAGS) -O3 -fomit-frame-pointer ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=skylake-avx512 else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := -xCORE-AVX512 else ifeq ($(CC_VENDOR),clang) # NOTE: We have to use -march=haswell on Windows because apparently AVX512 # uses an alternate calling convention where xmm registers are not callee-saved # on the stack. When this is mixed with framework code compiled for general # x86_64 mode then chaos ensues (e.g. #514). ifeq ($(IS_WIN),yes) CKVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=haswell else CKVECFLAGS := -mavx512f -mavx512dq -mavx512bw -mavx512vl -mfpmath=sse -march=skylake-avx512 endif else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # The assembler on OS X won't recognize AVX512 without help ifneq ($(CC_VENDOR),icc) ifeq ($(OS_NAME),Darwin) CKVECFLAGS += -Wa,-march=skylake-avx512 endif endif # Flags specific to reference kernels. # Note: We use AVX2 for reference kernels because, as Jeff Hammond says, # reference kernel code "is not going to achieve high enough SIMD utilization # to overcome the AVX-512 frequency drop". (Issue #187) CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),icc) CRVECFLAGS := -xCORE-AVX2 else ifeq ($(CC_VENDOR),clang) # NOTE: We have to use -march=haswell on Windows because apparently AVX512 # uses an alternate calling convention where xmm registers are not callee-saved # on the stack. When this is mixed with framework code compiled for general # x86_64 mode then chaos ensues (e.g. #514). 
ifeq ($(IS_WIN),yes) CRVECFLAGS := -march=haswell -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := -march=skylake-avx512 -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast endif else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/steamroller/000077500000000000000000000000001427272030600214345ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/steamroller/bli_cntx_init_steamroller.c000066400000000000000000000063071427272030600270440ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_cntx_init_steamroller( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_steamroller_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 4, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 16, 8, 4, 2 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 3, 3, 2, 2 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 2016, 1008, 512, 400 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 128, 128, 256, 160 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8400, 8400, 8400, 8400 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/steamroller/bli_family_steamroller.h000066400000000000000000000035051427272030600263300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 //#endif cython-blis-0.9.1/blis/_src/config/steamroller/make_defs.mk000066400000000000000000000061521427272030600237070ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := steamroller #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver3 -mno-fma4 -mno-tbm -mno-xop -mno-lwp else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mfpmath=sse -mavx -mfma -march=bdver3 -mno-fma4 -mno-tbm -mno-xop -mno-lwp else $(error gcc or clang are required for this configuration.) endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/template/000077500000000000000000000000001427272030600207165ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/template/bli_cntx_init_template.c000066400000000000000000000076571427272030600256210ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_cntx_init_template( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_template_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. 
bli_cntx_set_l3_nat_ukrs ( 5, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_template_noopt, FALSE, BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt, FALSE, BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt, FALSE, BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt, FALSE, BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt, FALSE, cntx ); // Update the context with optimized level-1f kernels. bli_cntx_set_l1f_kers ( BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_template_noopt, BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_template_noopt, BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_template_noopt, BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_template_noopt, BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_template_noopt, cntx ); // Update the context with optimized level-1v kernels. bli_cntx_set_l1v_kers ( BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_template_noopt, BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_template_noopt, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 0, 0, 0, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 0, 0, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 0, 0, 128 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 0, 0, 0, 256 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 0, 0, 4096 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/template/bli_family_template.h000066400000000000000000000033071427272030600250740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif cython-blis-0.9.1/blis/_src/config/template/kernels/000077500000000000000000000000001427272030600223615ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/template/kernels/1/000077500000000000000000000000001427272030600225215ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/template/kernels/1/bli_axpyv_template_noopt_var1.c000066400000000000000000000154101427272030600307260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_zaxpyv_template_noopt ( conj_t conjx, dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, cntx_t* restrict cntx ) { /* Template axpyv kernel implementation This function contains a template implementation for a double-precision complex kernel, coded in C, which can serve as the starting point for one to write an optimized kernel on an arbitrary architecture. (We show a template implementation for only double-precision complex because the templates for the other three floating-point types would be similar, with the real instantiations being noticeably simpler due to the disappearance of conjugation in the real domain.) This kernel performs a vector scale and accumulate (axpy) operation: y := y + alpha * conjx( x ) where x and y are vectors of length n and alpha is a scalar. Parameters: - conjx: Compute with conjugated values of x? - n: The number of elements in vectors x and y. - alpha: The address of a scalar. - x: The address of vector x. - incx: The vector increment of x. incx should be unit unless the implementation makes special accomodation for non-unit values. - y: The address of vector y. - incy: The vector increment of y. incy should be unit unless the implementation makes special accomodation for non-unit values. This template code calls the reference implementation if any of the following conditions are true: - Either of the strides incx or incy is non-unit. 
- Vectors x and y are unaligned with different offsets. If the vectors are aligned, or unaligned by the same offset, then optimized code can be used for the bulk of the computation. This template shows how the front-edge case can be handled so that the remaining computation is aligned. (This template guarantees alignment to be BLIS_SIMD_ALIGN_SIZE.) Additional things to consider: - Because conjugation disappears in the real domain, real instances of this kernel can safely ignore the values of any conjugation parameters, thereby simplifying the implementation. For more info, please refer to the BLIS website and/or contact the blis-devel mailing list. -FGVZ */ const dim_t n_elem_per_reg = 1; const dim_t n_iter_unroll = 1; const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll; const siz_t type_size = sizeof( *x ); dcomplex* xp; dcomplex* yp; bool use_ref = FALSE; dim_t n_pre = 0; dim_t n_iter; dim_t n_left; dim_t off_x, off_y; dim_t i; if ( bli_zero_dim1( n ) ) return; if ( bli_zeq0( *alpha ) ) return; // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( bli_has_nonunit_inc2( incx, incy ) ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) || bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) ) { use_ref = TRUE; // If a, the second column of a, and y are unaligned by the same // offset, then we can still use an implementation that depends on // alignment for most of the operation. off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE ); off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE ); if ( off_x == off_y ) { use_ref = FALSE; n_pre = off_x / type_size; } } // Call the reference implementation if needed. if ( use_ref == TRUE ) { zaxpyv_ft f = bli_zaxpyv_template_ref; f ( conjx, n, alpha, x, incx, y, incy, cntx ); return; } // Compute the number of unrolled and leftover (edge) iterations. 
n_iter = ( n - n_pre ) / n_elem_per_iter; n_left = ( n - n_pre ) % n_elem_per_iter; // Initialize pointers into x and y. xp = x; yp = y; // Iterate over elements of x and y to compute: // y += alpha * conjx( x ); if ( bli_is_noconj( conjx ) ) { // Compute front edge cases if x and y were unaligned. for ( i = 0; i < n_pre; ++i ) { bli_zaxpys( *alpha, *xp, *yp ); xp += 1; yp += 1; } // The bulk of the operation is executed here. The addresses xp and // yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { bli_zaxpys( *alpha, *xp, *yp ); xp += n_elem_per_iter; yp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { bli_zaxpys( *alpha, *xp, *yp ); xp += 1; yp += 1; } } else // if ( bli_is_conj( conjx ) ) { // Compute front edge cases if x and y were unaligned. for ( i = 0; i < n_pre; ++i ) { bli_zaxpyjs( *alpha, *xp, *yp ); xp += 1; yp += 1; } // The bulk of the operation is executed here. The addresses xp and // yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { bli_zaxpyjs( *alpha, *xp, *yp ); xp += n_elem_per_iter; yp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { bli_zaxpyjs( *alpha, *xp, *yp ); xp += 1; yp += 1; } } } cython-blis-0.9.1/blis/_src/config/template/kernels/1/bli_dotv_template_noopt_var1.c000066400000000000000000000172501427272030600305370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_zdotv_template_noopt ( conj_t conjx, conj_t conjy, dim_t n, dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, dcomplex* restrict rho, cntx_t* restrict cntx ) { /* Template dotv kernel implementation This function contains a template implementation for a double-precision complex kernel, coded in C, which can serve as the starting point for one to write an optimized kernel on an arbitrary architecture. (We show a template implementation for only double-precision complex because the templates for the other three floating-point types would be similar, with the real instantiations being noticeably simpler due to the disappearance of conjugation in the real domain.) 
This kernel performs an inner (dot) product operation: rho := conjx( x^T ) * conjy( y ) where x and y are vectors of length n and rho is a scalar. Parameters: - conjx: Compute with conjugated values of x? - conjy: Compute with conjugated values of y? - n: The number of elements in vectors x and y. - x: The address of vector x. - incx: The vector increment of x. incx should be unit unless the implementation makes special accomodation for non-unit values. - y: The address of vector y. - incy: The vector increment of y. incy should be unit unless the implementation makes special accomodation for non-unit values. - rho: The address of the output scalar. This template code calls the reference implementation if any of the following conditions are true: - Either of the strides incx or incy is non-unit. - Vectors x and y are unaligned with different offsets. If the vectors are aligned, or unaligned by the same offset, then optimized code can be used for the bulk of the computation. This template shows how the front-edge case can be handled so that the remaining computation is aligned. (This template guarantees alignment to be BLIS_SIMD_ALIGN_SIZE.) Additional things to consider: - While four combinations of possible values of conjx and conjy exist, we implement only conjugation on x explicitly; we induce the other two cases by toggling the effective conjugation on x and then conjugating the dot product result. - Because conjugation disappears in the real domain, real instances of this kernel can safely ignore the values of any conjugation parameters, thereby simplifying the implementation. For more info, please refer to the BLIS website and/or contact the blis-devel mailing list. 
-FGVZ */ const dim_t n_elem_per_reg = 1; const dim_t n_iter_unroll = 1; const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll; const siz_t type_size = sizeof( *x ); dcomplex* xp; dcomplex* yp; dcomplex dotxy; bool use_ref = FALSE; dim_t n_pre = 0; dim_t n_iter; dim_t n_left; dim_t off_x, off_y; dim_t i; conj_t conjx_use; // If the vector lengths are zero, set rho to zero and return. if ( bli_zero_dim1( n ) ) { bli_zset0s( *rho ); return; } // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( bli_has_nonunit_inc2( incx, incy ) ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) || bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) ) { use_ref = TRUE; // If a, the second column of a, and y are unaligned by the same // offset, then we can still use an implementation that depends on // alignment for most of the operation. off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE ); off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE ); if ( off_x == off_y ) { use_ref = FALSE; n_pre = off_x / type_size; } } // Call the reference implementation if needed. if ( use_ref == TRUE ) { zdotv_ft f = bli_zdotv_template_ref; f ( conjx, conjy, n, x, incx, y, incy, rho, cntx ); return; } // Compute the number of unrolled and leftover (edge) iterations. n_iter = ( n - n_pre ) / n_elem_per_iter; n_left = ( n - n_pre ) % n_elem_per_iter; // Initialize pointers into x and y. xp = x; yp = y; // Initialize accumulator to zero. bli_zset0s( dotxy ); conjx_use = conjx; // If y must be conjugated, we compute the result indirectly by first // toggling the effective conjugation of x and then conjugating the // resulting dot product. if ( bli_is_conj( conjy ) ) bli_toggle_conj( &conjx_use ); // Iterate over elements of x and y to compute: // rho = conjx( x^T ) * conjy( y ); if ( bli_is_noconj( conjx_use ) ) { // Compute front edge cases if x and y were unaligned. 
for ( i = 0; i < n_pre; ++i ) { bli_zdots( *xp, *yp, dotxy ); xp += 1; yp += 1; } // The bulk of the operation is executed here. The addresses xp and // yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { bli_zdots( *xp, *yp, dotxy ); xp += n_elem_per_iter; yp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { bli_zdots( *xp, *yp, dotxy ); xp += 1; yp += 1; } } else // if ( bli_is_conj( conjx_use ) ) { // Compute front edge cases if x and y were unaligned. for ( i = 0; i < n_pre; ++i ) { bli_zdotjs( *xp, *yp, dotxy ); xp += 1; yp += 1; } // The bulk of the operation is executed here. The addresses xp and // yp are guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { bli_zdotjs( *xp, *yp, dotxy ); xp += n_elem_per_iter; yp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { bli_zdotjs( *xp, *yp, dotxy ); xp += 1; yp += 1; } } // If conjugation on y was requested, we induce it by conjugating // the contents of dotxy. if ( bli_is_conj( conjy ) ) bli_zconjs( dotxy ); bli_zcopys( dotxy, *rho ); } cython-blis-0.9.1/blis/_src/config/template/kernels/1f/000077500000000000000000000000001427272030600226675ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c000066400000000000000000000240661427272030600311650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_zaxpy2v_template_noopt ( conj_t conjx, conj_t conjy, dim_t n, dcomplex* restrict alpha1, dcomplex* restrict alpha2, dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, dcomplex* restrict z, inc_t incz, cntx_t* restrict cntx ) { /* Template axpy2v kernel implementation This function contains a template implementation for a double-precision complex kernel, coded in C, which can serve as the starting point for one to write an optimized kernel on an arbitrary architecture. (We show a template implementation for only double-precision complex because the templates for the other three floating-point types would be similar, with the real instantiations being noticeably simpler due to the disappearance of conjugation in the real domain.) 
This kernel fuses two axpyv operations: z := z + alpha1 * conjx( x ) z := z + alpha2 * conjy( y ) where x, y, and z are vectors of length n and alpha1 and alpha2 are scalars. Parameters: - conjx: Compute with conjugated values of x? - conjy: Compute with conjugated values of y? - n: The number of elements in vectors x, y, and z. - alpha1: The address of the scalar to be applied to x. - alpha2: The address of the scalar to be applied to y. - x: The address of vector x. - incx: The vector increment of x. incx should be unit unless the implementation makes special accomodation for non-unit values. - y: The address of vector y. - incy: The vector increment of y. incy should be unit unless the implementation makes special accomodation for non-unit values. - z: The address of vector z. - incz: The vector increment of z. incz should be unit unless the implementation makes special accomodation for non-unit values. This template code calls the reference implementation if any of the following conditions are true: - Any of the strides incx, incy, or incz is non-unit. - Vectors x, y, and z are unaligned with different offsets. If the vectors are aligned, or unaligned by the same offset, then optimized code can be used for the bulk of the computation. This template shows how the front-edge case can be handled so that the remaining computation is aligned. (This template guarantees alignment in the main loops to be BLIS_SIMD_ALIGN_SIZE.) Here are a few additional things to consider: - Because conjugation disappears in the real domain, real instances of this kernel can safely ignore the values of any conjugation parameters, thereby simplifying the implementation. For more info, please refer to the BLIS website and/or contact the blis-devel mailing list. 
-FGVZ */ const dim_t n_elem_per_reg = 1; const dim_t n_iter_unroll = 1; const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll; const siz_t type_size = sizeof( *x ); dcomplex* xp; dcomplex* yp; dcomplex* zp; bool use_ref = FALSE; dim_t n_pre = 0; dim_t n_iter; dim_t n_left; dim_t off_x, off_y, off_z; dim_t i; // Return early if possible. if ( bli_zero_dim1( n ) ) return; // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( bli_has_nonunit_inc3( incx, incy, incz ) ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) || bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) || bli_is_unaligned_to( z, BLIS_SIMD_ALIGN_SIZE ) ) { use_ref = TRUE; // If a, the second column of a, and y are unaligned by the same // offset, then we can still use an implementation that depends on // alignment for most of the operation. off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE ); off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE ); off_z = bli_offset_from_alignment( z, BLIS_SIMD_ALIGN_SIZE ); if ( off_x == off_y && off_x == off_z ) { use_ref = FALSE; n_pre = off_x / type_size; } } // Call the reference implementation if needed. if ( use_ref == TRUE ) { zaxpy2v_ft f = bli_zaxpy2v_template_ref; f ( conjx, conjy, n, alpha1, alpha2, x, incx, y, incy, z, incz, cntx ); return; } // Compute the number of unrolled and leftover (edge) iterations. n_iter = ( n - n_pre ) / n_elem_per_iter; n_left = ( n - n_pre ) % n_elem_per_iter; // Initialize pointers into x, y, and z. xp = x; yp = y; zp = z; // Iterate over rows of x, y, and z to compute: // z += alpha1 * conjx( x ) + alpha2 * conjy( y ); if ( bli_is_noconj( conjx ) && bli_is_noconj( conjy ) ) { // Compute front edge cases if x, y, and z were unaligned. 
for ( i = 0; i < n_pre; ++i ) { bli_zaxpys( *alpha1, *xp, *zp ); bli_zaxpys( *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } // The bulk of the operation is executed here. For best performance, // alpha1 and alpha2 should be loaded once prior to the n_iter // loop and the elements of z should be loaded and stored only once // each. The addresses xp, yp, and zp are guaranteed to be aligned // to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { bli_zaxpys( *alpha1, *xp, *zp ); bli_zaxpys( *alpha2, *yp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; zp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { bli_zaxpys( *alpha1, *xp, *zp ); bli_zaxpys( *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } } else if ( bli_is_noconj( conjx ) && bli_is_conj( conjy ) ) { // Compute front edge cases if x, y, and z were unaligned. for ( i = 0; i < n_pre; ++i ) { bli_zaxpys( *alpha1, *xp, *zp ); bli_zaxpyjs( *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } // The bulk of the operation is executed here. For best performance, // alpha1 and alpha2 should be loaded once prior to the n_iter // loop and the elements of z should be loaded and stored only once // each. The addresses xp, yp, and zp are guaranteed to be aligned // to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { bli_zaxpys( *alpha1, *xp, *zp ); bli_zaxpyjs( *alpha2, *yp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; zp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { bli_zaxpys( *alpha1, *xp, *zp ); bli_zaxpyjs( *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } } else if ( bli_is_conj( conjx ) && bli_is_noconj( conjy ) ) { // Compute front edge cases if x, y, and z were unaligned. for ( i = 0; i < n_pre; ++i ) { bli_zaxpyjs( *alpha1, *xp, *zp ); bli_zaxpys( *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } // The bulk of the operation is executed here. 
For best performance, // alpha1 and alpha2 should be loaded once prior to the n_iter // loop and the elements of z should be loaded and stored only once // each. The addresses xp, yp, and zp are guaranteed to be aligned // to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { bli_zaxpyjs( *alpha1, *xp, *zp ); bli_zaxpys( *alpha2, *yp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; zp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { bli_zaxpyjs( *alpha1, *xp, *zp ); bli_zaxpys( *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } } else // if ( bli_is_conj( conjx ) && bli_is_conj( conjy ) ) { // Compute front edge cases if x, y, and z were unaligned. for ( i = 0; i < n_pre; ++i ) { bli_zaxpyjs( *alpha1, *xp, *zp ); bli_zaxpyjs( *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } // The bulk of the operation is executed here. For best performance, // alpha1 and alpha2 should be loaded once prior to the n_iter // loop and the elements of z should be loaded and stored only once // each. The addresses xp, yp, and zp are guaranteed to be aligned // to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { bli_zaxpyjs( *alpha1, *xp, *zp ); bli_zaxpyjs( *alpha2, *yp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; zp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { bli_zaxpyjs( *alpha1, *xp, *zp ); bli_zaxpyjs( *alpha2, *yp, *zp ); xp += 1; yp += 1; zp += 1; } } } cython-blis-0.9.1/blis/_src/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c000066400000000000000000000223431427272030600310570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_zaxpyf_template_noopt ( conj_t conja, conj_t conjx, dim_t m, dim_t b_n, dcomplex* restrict alpha, dcomplex* restrict a, inc_t inca, inc_t lda, dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, cntx_t* restrict cntx ) { /* Template axpyf kernel implementation This function contains a template implementation for a double-precision complex kernel, coded in C, which can serve as the starting point for one to write an optimized kernel on an arbitrary architecture. 
(We show a template implementation for only double-precision complex because the templates for the other three floating-point types would be similar, with the real instantiations being noticeably simpler due to the disappearance of conjugation in the real domain.) This kernel performs the following gemv-like operation: y := y + alpha * conja( A ) * conjx( x ) where A is an m x b_n matrix, x is a vector of length b_n, y is a vector of length m, and alpha is a scalar. The operation is performed as a series of fused axpyv operations, and therefore A should be column-stored. Parameters: - conja: Compute with conjugated values of A? - conjx: Compute with conjugated values of x? - m: The number of rows in matrix A. - b_n: The number of columns in matrix A. Must be equal to or less than the fusing factor. - alpha: The address of a scalar. - a: The address of matrix A. - inca: The row stride of A. inca should be unit unless the implementation makes special accomodation for non-unit values. - lda: The column stride of A. - x: The address of vector x. - incx: The vector increment of x. - y: The address of vector y. - incy: The vector increment of y. incy should be unit unless the implementation makes special accomodation for non-unit values. This template code calls the reference implementation if any of the following conditions are true: - Either of the strides inca or incy is non-unit. - The address of A, the second column of A, and y are unaligned with different offsets. If the first/second columns of A and address of y are aligned, or unaligned by the same offset, then optimized code can be used for the bulk of the computation. This template shows how the front-edge case can be handled so that the remaining computation is aligned. (This template guarantees alignment in the main loops to be BLIS_SIMD_ALIGN_SIZE.) Additional things to consider: - When optimizing, you should fully unroll the loops over b_n. This is the dimension across which we are fusing axpyv operations. 
- This template code chooses to call the reference implementation whenever b_n is less than the fusing factor, so as to avoid having to handle edge cases. One may choose to optimize this edge case, if desired. - Because conjugation disappears in the real domain, real instances of this kernel can safely ignore the values of any conjugation parameters, thereby simplifying the implementation. For more info, please refer to the BLIS website and/or contact the blis-devel mailing list. -FGVZ */ const dim_t n_elem_per_reg = 1; const dim_t n_iter_unroll = 1; const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll; const siz_t type_size = sizeof( *a ); dcomplex* ap[ bli_zaxpyf_fusefac ]; dcomplex* xp[ bli_zaxpyf_fusefac ]; dcomplex* yp; dcomplex alpha_x[ bli_zaxpyf_fusefac ]; bool use_ref = FALSE; dim_t m_pre = 0; dim_t m_iter; dim_t m_left; dim_t off_a, off_a2, off_y; dim_t i, j; // Return early if possible. if ( bli_zero_dim2( m, b_n ) ) return; // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( b_n < bli_zaxpyf_fusefac ) { use_ref = TRUE; } else if ( bli_has_nonunit_inc3( inca, incx, incy ) ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( a, BLIS_SIMD_ALIGN_SIZE ) || bli_is_unaligned_to( a+lda, BLIS_SIMD_ALIGN_SIZE ) || bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) ) { use_ref = TRUE; // If a, the second column of a, and y are unaligned by the same // offset, then we can still use an implementation that depends on // alignment for most of the operation. off_a = bli_offset_from_alignment( a, BLIS_SIMD_ALIGN_SIZE ); off_a2 = bli_offset_from_alignment( a+lda, BLIS_SIMD_ALIGN_SIZE ); off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE ); if ( off_a == off_y && off_a == off_a2 ) { use_ref = FALSE; m_pre = off_a / type_size; } } // Call the reference implementation if needed. 
if ( use_ref == TRUE ) { zaxpyf_ft f = bli_zaxpyf_template_ref; f ( conja, conjx, m, b_n, alpha, a, inca, lda, x, incx, y, incy, cntx ); return; } // Compute the number of unrolled and leftover (edge) iterations. m_iter = ( m - m_pre ) / n_elem_per_iter; m_left = ( m - m_pre ) % n_elem_per_iter; // Initialize pointers into the columns of A and elements of x. for ( j = 0; j < b_n; ++j ) { ap[ j ] = a + (j )*lda; xp[ j ] = x + (j )*incx; } yp = y; // Load elements of x or conj(x) into alpha_x and scale by alpha. if ( bli_is_noconj( conjx ) ) { for ( j = 0; j < b_n; ++j ) { bli_zcopys( *xp[ j ], alpha_x[ j ] ); bli_zscals( *alpha, alpha_x[ j ] ); } } else // if ( bli_is_conj( conjx ) ) { for ( j = 0; j < b_n; ++j ) { bli_zcopyjs( *xp[ j ], alpha_x[ j ] ); bli_zscals( *alpha, alpha_x[ j ] ); } } // Iterate over rows of A and y to compute: // y += conja( A )*conjx( x ); if ( bli_is_noconj( conja ) ) { // Compute front edge cases if a and y were unaligned. for ( i = 0; i < m_pre; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zaxpys( alpha_x[ j ], *ap[ j ], *yp ); ap[ j ] += 1; } yp += 1; } // The bulk of the operation is executed here. For best performance, // the elements of alpha_x should be loaded once prior to the m_iter // loop, and the b_n loop should be fully unrolled. The addresses in // ap[] and yp are guaranteed to be aligned to // BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < m_iter; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zaxpys( alpha_x[ j ], *ap[ j ], *yp ); ap[ j ] += n_elem_per_iter; } yp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < m_left; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zaxpys( alpha_x[ j ], *ap[ j ], *yp ); ap[ j ] += 1; } yp += 1; } } else // if ( bli_is_conj( conja ) ) { // Compute front edge cases if a and y were unaligned. for ( i = 0; i < m_pre; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zaxpyjs( alpha_x[ j ], *ap[ j ], *yp ); ap[ j ] += 1; } yp += 1; } // The bulk of the operation is executed here. 
For best performance, // the elements of alpha_x should be loaded once prior to the m_iter // loop, and the b_n loop should be fully unrolled. The addresses in // ap[] and yp are guaranteed to be aligned to // BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < m_iter; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zaxpyjs( alpha_x[ j ], *ap[ j ], *yp ); ap[ j ] += n_elem_per_iter; } yp += n_elem_per_iter; } // Compute tail edge cases. for ( i = 0; i < m_left; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zaxpyjs( alpha_x[ j ], *ap[ j ], *yp ); ap[ j ] += 1; } yp += 1; } } } cython-blis-0.9.1/blis/_src/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c000066400000000000000000000260401427272030600316040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_zdotaxpyv_template_noopt ( conj_t conjxt, conj_t conjx, conj_t conjy, dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, dcomplex* restrict rho, dcomplex* restrict z, inc_t incz, cntx_t* restrict cntx ) { /* Template dotaxpyv kernel implementation This function contains a template implementation for a double-precision complex kernel, coded in C, which can serve as the starting point for one to write an optimized kernel on an arbitrary architecture. (We show a template implementation for only double-precision complex because the templates for the other three floating-point types would be similar, with the real instantiations being noticeably simpler due to the disappearance of conjugation in the real domain.) This kernel fuses a dotv and axpyv operation: rho := conjxt( x^T ) * conjy( y ) z := z + alpha * conjx( x ) where x, y, and z are vectors of length n and alpha1 and alpha2 are scalars. Parameters: - conjxt: Compute with conjugated values of x^T? - conjx: Compute with conjugated values of x? - conjy: Compute with conjugated values of y? - n: The number of elements in vectors x, y, and z. - alpha: The address of the scalar to be applied to x. - x: The address of vector x. - incx: The vector increment of x. incx should be unit unless the implementation makes special accomodation for non-unit values. - y: The address of vector y. - incy: The vector increment of y. 
incy should be unit unless the implementation makes special accomodation for non-unit values. - rho: The address of the output scalar of the dotv subproblem. - z: The address of vector z. - incz: The vector increment of z. incz should be unit unless the implementation makes special accomodation for non-unit values. This template code calls the reference implementation if any of the following conditions are true: - Any of the strides incx, incy, or incz is non-unit. - Vectors x, y, and z are unaligned with different offsets. If the vectors are aligned, or unaligned by the same offset, then optimized code can be used for the bulk of the computation. This template shows how the front-edge case can be handled so that the remaining computation is aligned. (This template guarantees alignment in the main loops to be BLIS_SIMD_ALIGN_SIZE.) Here are a few additional things to consider: - While four combinations of possible values of conjx and conjy exist, we implement only conjugation on x explicitly; we induce the other two cases by toggling the effective conjugation on x and then conjugating the dot product result. - Because conjugation disappears in the real domain, real instances of this kernel can safely ignore the values of any conjugation parameters, thereby simplifying the implementation. For more info, please refer to the BLIS website and/or contact the blis-devel mailing list. -FGVZ */ const dim_t n_elem_per_reg = 1; const dim_t n_iter_unroll = 1; const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll; const siz_t type_size = sizeof( *x ); dcomplex* xp; dcomplex* yp; dcomplex* zp; dcomplex dotxy; bool use_ref = FALSE; dim_t n_pre = 0; dim_t n_iter; dim_t n_left; dim_t off_x, off_y, off_z; dim_t i; conj_t conjxt_use; // If the vector lengths are zero, set rho to zero and return. 
if ( bli_zero_dim1( n ) ) { bli_zset0s( *rho ); return; } // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( bli_has_nonunit_inc3( incx, incy, incz ) ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) || bli_is_unaligned_to( y, BLIS_SIMD_ALIGN_SIZE ) || bli_is_unaligned_to( z, BLIS_SIMD_ALIGN_SIZE ) ) { use_ref = TRUE; // If x, y, and z are unaligned by the same offset, then we can // still use an implementation that depends on alignment for most // of the operation. off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE ); off_y = bli_offset_from_alignment( y, BLIS_SIMD_ALIGN_SIZE ); off_z = bli_offset_from_alignment( z, BLIS_SIMD_ALIGN_SIZE ); if ( off_x == off_y && off_x == off_z ) { use_ref = FALSE; n_pre = off_x / type_size; } } // Call the reference implementation if needed. if ( use_ref == TRUE ) { zdotaxpyv_ft f = bli_zdotaxpyv_template_ref; f ( conjxt, conjx, conjy, n, alpha, x, incx, y, incy, rho, z, incz, cntx ); return; } // Compute the number of unrolled and leftover (edge) iterations. n_iter = ( n - n_pre ) / n_elem_per_iter; n_left = ( n - n_pre ) % n_elem_per_iter; // Initialize pointers into x, y, and z. xp = x; yp = y; zp = z; // Initialize accumulator to zero. bli_zset0s( dotxy ); conjxt_use = conjxt; // If y must be conjugated, we compute the result indirectly by first // toggling the effective conjugation of xt and then conjugating the // resulting dot product. if ( bli_is_conj( conjy ) ) bli_toggle_conj( &conjxt_use ); // Iterate over elements of x, y, and z to compute: // r = conjxt( x^T ) * conjy( y ); // z += alpha * conjx( x ); if ( bli_is_noconj( conjx ) && bli_is_noconj( conjxt_use ) ) { // Compute front edge cases if x, y, and z were unaligned. for ( i = 0; i < n_pre; ++i ) { bli_zdots( *xp, *yp, dotxy ); bli_zaxpys( *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } // The bulk of the operation is executed here. 
For best performance, // alpha should be loaded once prior to the n_iter loop, dotxy // should be and kept in registers, and each element of x should be // loaded only once each. The addresses xp, yp, and zp are // guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { bli_zdots( *xp, *yp, dotxy ); bli_zaxpys( *alpha, *xp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; zp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { bli_zdots( *xp, *yp, dotxy ); bli_zaxpys( *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } } else if ( bli_is_noconj( conjx ) && bli_is_conj( conjxt_use ) ) { // Compute front edge cases if x, y, and z were unaligned. for ( i = 0; i < n_pre; ++i ) { bli_zdotjs( *xp, *yp, dotxy ); bli_zaxpys( *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } // The bulk of the operation is executed here. For best performance, // alpha should be loaded once prior to the n_iter loop, dotxy // should be and kept in registers, and each element of x should be // loaded only once each. The addresses xp, yp, and zp are // guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { bli_zdotjs( *xp, *yp, dotxy ); bli_zaxpys( *alpha, *xp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; zp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { bli_zdotjs( *xp, *yp, dotxy ); bli_zaxpys( *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } } else if ( bli_is_conj( conjx ) && bli_is_noconj( conjxt_use ) ) { // Compute front edge cases if x, y, and z were unaligned. for ( i = 0; i < n_pre; ++i ) { bli_zdots( *xp, *yp, dotxy ); bli_zaxpyjs( *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } // The bulk of the operation is executed here. For best performance, // alpha should be loaded once prior to the n_iter loop, dotxy // should be and kept in registers, and each element of x should be // loaded only once each. 
The addresses xp, yp, and zp are // guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { bli_zdots( *xp, *yp, dotxy ); bli_zaxpyjs( *alpha, *xp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; zp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { bli_zdots( *xp, *yp, dotxy ); bli_zaxpyjs( *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } } else // if ( bli_is_conj( conjx ) && bli_is_conj( conjxt_use ) ) { // Compute front edge cases if x, y, and z were unaligned. for ( i = 0; i < n_pre; ++i ) { bli_zdotjs( *xp, *yp, dotxy ); bli_zaxpyjs( *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } // The bulk of the operation is executed here. For best performance, // alpha should be loaded once prior to the n_iter loop, dotxy // should be and kept in registers, and each element of x should be // loaded only once each. The addresses xp, yp, and zp are // guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < n_iter; ++i ) { bli_zdotjs( *xp, *yp, dotxy ); bli_zaxpyjs( *alpha, *xp, *zp ); xp += n_elem_per_iter; yp += n_elem_per_iter; zp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < n_left; ++i ) { bli_zdotjs( *xp, *yp, dotxy ); bli_zaxpyjs( *alpha, *xp, *zp ); xp += 1; yp += 1; zp += 1; } } // If conjugation on y was requested, we induce it by conjugating // the contents of rho. if ( bli_is_conj( conjy ) ) bli_zconjs( dotxy ); bli_zcopys( dotxy, *rho ); } cython-blis-0.9.1/blis/_src/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c000066400000000000000000000340001427272030600317470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_zdotxaxpyf_template_noopt ( conj_t conjat, conj_t conja, conj_t conjw, conj_t conjx, dim_t m, dim_t b_n, dcomplex* restrict alpha, dcomplex* restrict a, inc_t inca, inc_t lda, dcomplex* restrict w, inc_t incw, dcomplex* restrict x, inc_t incx, dcomplex* restrict beta, dcomplex* restrict y, inc_t incy, dcomplex* restrict z, inc_t incz, cntx_t* restrict cntx ) { /* Template dotxaxpyf kernel implementation This function contains a template implementation for a double-precision complex kernel, coded in C, which can serve as the starting point for one to write an optimized kernel on an arbitrary architecture. (We show a template implementation for only double-precision complex because the templates for the other three floating-point types would be similar, with the real instantiations being noticeably simpler due to the disappearance of conjugation in the real domain.) This kernel performs the following two gemv-like operations: y := beta * y + alpha * conjat( A^T ) * conjw( w ) z := z + alpha * conja( A ) * conjx( x ) where A is an m x b_n matrix, x and y are vector of length b_n, w and z are vectors of length m, and alpha and beta are scalars. The operation fuses a dotxf and an axpyf operation, and therefore A should be column- stored. Parameters: - conjat: Compute with conjugated values of A^T? - conja: Compute with conjugated values of A? - conjw: Compute with conjugated values of w? - conjx: Compute with conjugated values of x? - m: The number of rows in matrix A. - b_n: The number of columns in matrix A. Must be equal to or less than the fusing factor. - alpha: The address of the scalar to be applied to A^T*w and A*x. - a: The address of matrix A. - inca: The row stride of A. inca should be unit unless the implementation makes special accomodation for non-unit values. - lda: The column stride of A. - w: The address of vector w. - incw: The vector increment of w. 
incw should be unit unless the implementation makes special accomodation for non-unit values. - x: The address of vector x. - incx: The vector increment of x. - beta: The address of the scalar to be applied to y. - y: The address of vector y. - incy: The vector increment of y. - z: The address of vector z. - incz: The vector increment of z. incz should be unit unless the implementation makes special accomodation for non-unit values. This template code calls the reference implementation if any of the following conditions are true: - Any of the strides inca, incw, or incz is non-unit. - The address of A, the second column of A, w, and z are unaligned with different offsets. If the first/second rows of A and addresses of w and z are aligned, or unaligned by the same offset, then optimized code can be used for the bulk of the computation. This template shows how the front-edge case can be handled so that the remaining computation is aligned. (This template guarantees alignment in the main loops to be BLIS_SIMD_ALIGN_SIZE.) Additional things to consider: - When optimizing, you should fully unroll the loops over b_n. This is the dimension across which we are fusing dotxv operations. - This template code chooses to call the reference implementation whenever b_n is less than the fusing factor, so as to avoid having to handle edge cases. One may choose to optimize this edge case, if desired. - Because conjugation disappears in the real domain, real instances of this kernel can safely ignore the values of any conjugation parameters, thereby simplifying the implementation. For more info, please refer to the BLIS website and/or contact the blis-devel mailing list. 
-FGVZ */ const dim_t n_elem_per_reg = 1; const dim_t n_iter_unroll = 1; const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll; const siz_t type_size = sizeof( *a ); dcomplex* ap[ bli_zdotxaxpyf_fusefac ]; dcomplex* xp[ bli_zdotxaxpyf_fusefac ]; dcomplex* yp[ bli_zdotxaxpyf_fusefac ]; dcomplex* wp; dcomplex* zp; dcomplex At_w[ bli_zdotxaxpyf_fusefac ]; dcomplex alpha_x[ bli_zdotxaxpyf_fusefac ]; bool use_ref = FALSE; dim_t m_pre = 0; dim_t m_iter; dim_t m_left; dim_t off_a, off_a2, off_w, off_z; dim_t i, j; conj_t conjat_use; // Return early if possible. if ( bli_zero_dim2( m, b_n ) ) return; // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( b_n < bli_zdotxaxpyf_fusefac ) { use_ref = TRUE; } else if ( bli_has_nonunit_inc3( inca, incw, incz ) ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( a, BLIS_SIMD_ALIGN_SIZE ) || bli_is_unaligned_to( a+lda, BLIS_SIMD_ALIGN_SIZE ) || bli_is_unaligned_to( w, BLIS_SIMD_ALIGN_SIZE ) || bli_is_unaligned_to( z, BLIS_SIMD_ALIGN_SIZE ) ) { use_ref = TRUE; // If a, the second column of a, w, and z are unaligned by the same // offset, then we can still use an implementation that depends on // alignment for most of the operation. off_a = bli_offset_from_alignment( a, BLIS_SIMD_ALIGN_SIZE ); off_a2 = bli_offset_from_alignment( a+lda, BLIS_SIMD_ALIGN_SIZE ); off_w = bli_offset_from_alignment( w, BLIS_SIMD_ALIGN_SIZE ); off_z = bli_offset_from_alignment( z, BLIS_SIMD_ALIGN_SIZE ); if ( off_a == off_a2 && off_a == off_w && off_a == off_z ) { use_ref = FALSE; m_pre = off_a / type_size; } } // Call the reference implementation if needed. if ( use_ref == TRUE ) { zdotxaxpyf_ft f = bli_zdotxaxpyf_template_ref; f ( conjat, conja, conjw, conjx, m, b_n, alpha, a, inca, lda, w, incw, x, incx, beta, y, incy, z, incz, cntx ); return; } // Compute the number of unrolled and leftover (edge) iterations. 
m_iter = ( m - m_pre ) / n_elem_per_iter; m_left = ( m - m_pre ) % n_elem_per_iter; // Initialize pointers into the columns of A and elements of x. for ( j = 0; j < b_n; ++j ) { ap[ j ] = a + (j )*lda; xp[ j ] = x + (j )*incx; yp[ j ] = y + (j )*incy; } wp = w; zp = z; // Load elements of x or conj(x) into alpha_x and scale by alpha. if ( bli_is_noconj( conjx ) ) { for ( j = 0; j < b_n; ++j ) { bli_zcopys( *xp[ j ], alpha_x[ j ] ); bli_zscals( *alpha, alpha_x[ j ] ); } } else // if ( bli_is_conj( conjx ) ) { for ( j = 0; j < b_n; ++j ) { bli_zcopyjs( *xp[ j ], alpha_x[ j ] ); bli_zscals( *alpha, alpha_x[ j ] ); } } // Initialize our accumulators to zero. for ( j = 0; j < b_n; ++j ) { bli_zset0s( At_w[ j ] ); } conjat_use = conjat; // If w must be conjugated, we compute the result indirectly by first // toggling the effective conjugation of At and then conjugating the // resulting dot products. if ( bli_is_conj( conjw ) ) bli_toggle_conj( &conjat_use ); // Iterate over the columns of A and elements of w and z to compute: // y = beta * y + alpha * conjat( A^T ) * conjw( w ); // z = z + alpha * conja( A ) * conjx( x ); // where A is m x b_n. if ( bli_is_noconj( conja ) && bli_is_noconj( conjat_use ) ) { // Compute front edge cases if A, w, and z were unaligned. for ( i = 0; i < m_pre; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zdots( *ap[ j ], *wp, At_w[ j ] ); bli_zdots( *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } wp += 1; zp += 1; } // The bulk of the operation is executed here. For best performance, // the elements of alpha_x should be loaded once prior to the m_iter // loop, At_w should be kept in registers, and the b_n loop should // be fully unrolled. The addresses in ap[], wp, and zp are // guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. 
for ( i = 0; i < m_iter; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zdots( *ap[ j ], *wp, At_w[ j ] ); bli_zdots( *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += n_elem_per_iter; } wp += n_elem_per_iter; zp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < m_left; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zdots( *ap[ j ], *wp, At_w[ j ] ); bli_zdots( *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } wp += 1; zp += 1; } } else if ( bli_is_noconj( conja ) && bli_is_conj( conjat_use ) ) { // Compute front edge cases if A, w, and z were unaligned. for ( i = 0; i < m_pre; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zdotjs( *ap[ j ], *wp, At_w[ j ] ); bli_zdots( *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } wp += 1; zp += 1; } // The bulk of the operation is executed here. For best performance, // the elements of alpha_x should be loaded once prior to the m_iter // loop, At_w should be kept in registers, and the b_n loop should // be fully unrolled. The addresses in ap[], wp, and zp are // guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < m_iter; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zdotjs( *ap[ j ], *wp, At_w[ j ] ); bli_zdots( *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += n_elem_per_iter; } wp += n_elem_per_iter; zp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < m_left; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zdotjs( *ap[ j ], *wp, At_w[ j ] ); bli_zdots( *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } wp += 1; zp += 1; } } else if ( bli_is_conj( conja ) && bli_is_noconj( conjat_use ) ) { // Compute front edge cases if A, w, and z were unaligned. for ( i = 0; i < m_pre; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zdots( *ap[ j ], *wp, At_w[ j ] ); bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } wp += 1; zp += 1; } // The bulk of the operation is executed here. 
For best performance, // the elements of alpha_x should be loaded once prior to the m_iter // loop, At_w should be kept in registers, and the b_n loop should // be fully unrolled. The addresses in ap[], wp, and zp are // guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < m_iter; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zdots( *ap[ j ], *wp, At_w[ j ] ); bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += n_elem_per_iter; } wp += n_elem_per_iter; zp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < m_left; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zdots( *ap[ j ], *wp, At_w[ j ] ); bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } wp += 1; zp += 1; } } else if ( bli_is_conj( conja ) && bli_is_conj( conjat_use ) ) { // Compute front edge cases if A, w, and z were unaligned. for ( i = 0; i < m_pre; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zdotjs( *ap[ j ], *wp, At_w[ j ] ); bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } wp += 1; zp += 1; } // The bulk of the operation is executed here. For best performance, // the elements of alpha_x should be loaded once prior to the m_iter // loop, At_w should be kept in registers, and the b_n loop should // be fully unrolled. The addresses in ap[], wp, and zp are // guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( i = 0; i < m_iter; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zdotjs( *ap[ j ], *wp, At_w[ j ] ); bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += n_elem_per_iter; } wp += n_elem_per_iter; zp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( i = 0; i < m_left; ++i ) { for ( j = 0; j < b_n; ++j ) { bli_zdotjs( *ap[ j ], *wp, At_w[ j ] ); bli_zdotjs( *ap[ j ], alpha_x[ j ], *zp ); ap[ j ] += 1; } wp += 1; zp += 1; } } // If conjugation on w was requested, we induce it by conjugating // the contents of At_w. 
if ( bli_is_conj( conjw ) ) { for ( j = 0; j < b_n; ++j ) { bli_zconjs( At_w[ j ] ); } } // Scale the At_w product by alpha and accumulate into y after // scaling by beta. for ( j = 0; j < b_n; ++j ) { bli_zscals( *beta, *yp[ j ] ); bli_zaxpys( *alpha, At_w[ j ], *yp[ j ] ); } } cython-blis-0.9.1/blis/_src/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c000066400000000000000000000236251427272030600310600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_zdotxf_template_noopt ( conj_t conjat, conj_t conjx, dim_t m, dim_t b_n, dcomplex* restrict alpha, dcomplex* restrict a, inc_t inca, inc_t lda, dcomplex* restrict x, inc_t incx, dcomplex* restrict beta, dcomplex* restrict y, inc_t incy, cntx_t* restrict cntx ) { /* Template dotxf kernel implementation This function contains a template implementation for a double-precision complex kernel, coded in C, which can serve as the starting point for one to write an optimized kernel on an arbitrary architecture. (We show a template implementation for only double-precision complex because the templates for the other three floating-point types would be similar, with the real instantiations being noticeably simpler due to the disappearance of conjugation in the real domain.) This kernel performs the following gemv-like operation: y := beta * y + alpha * conjat( A^T ) * conjx( x ) where A is an m x b_n matrix, x is a vector of length m, y is a vector of length b_n, and alpha and beta are scalars. The operation is performed as a series of fused dotxv operations, and therefore A should be column- stored. Parameters: - conjat: Compute with conjugated values of A^T? - conjx: Compute with conjugated values of x? - m: The number of rows in matrix A. - b_n: The number of columns in matrix A. Must be equal to or less than the fusing factor. - alpha: The address of the scalar to be applied to A*x. - a: The address of matrix A. 
- inca: The row stride of A. inca should be unit unless the implementation makes special accomodation for non-unit values. - lda: The column stride of A. - x: The address of vector x. - incx: The vector increment of x. incx should be unit unless the implementation makes special accomodation for non-unit values. - beta: The address of the scalar to be applied to y. - y: The address of vector y. - incy: The vector increment of y. This template code calls the reference implementation if any of the following conditions are true: - Either of the strides inca or incx is non-unit. - The address of A, the second column of A, and x are unaligned with different offsets. If the first/second columns of A and address of x are aligned, or unaligned by the same offset, then optimized code can be used for the bulk of the computation. This template shows how the front-edge case can be handled so that the remaining computation is aligned. (This template guarantees alignment in the main loops to be BLIS_SIMD_ALIGN_SIZE.) Additional things to consider: - When optimizing, you should fully unroll the loops over b_n. This is the dimension across which we are fusing dotxv operations. - This template code chooses to call the reference implementation whenever b_n is less than the fusing factor, so as to avoid having to handle edge cases. One may choose to optimize this edge case, if desired. - Because conjugation disappears in the real domain, real instances of this kernel can safely ignore the values of any conjugation parameters, thereby simplifying the implementation. For more info, please refer to the BLIS website and/or contact the blis-devel mailing list. 
-FGVZ */ const dim_t n_elem_per_reg = 1; const dim_t n_iter_unroll = 1; const dim_t n_elem_per_iter = n_elem_per_reg * n_iter_unroll; const siz_t type_size = sizeof( *x ); dcomplex* ap[ bli_zdotxf_fusefac ]; dcomplex* xp; dcomplex* yp[ bli_zdotxf_fusefac ]; dcomplex Atx[ bli_zdotxf_fusefac ]; bool use_ref = FALSE; dim_t m_pre = 0; dim_t m_iter; dim_t m_left; dim_t off_a, off_a2, off_x; dim_t i, j; conj_t conjat_use; // Return early if possible. if ( bli_zero_dim1( b_n ) ) return; // If the vector lengths are zero, scale r by beta and return. if ( bli_zero_dim1( m ) ) { bli_zscalv_ex ( BLIS_NO_CONJUGATE, b_n, beta, y, incy, cntx ); return; } // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( b_n < bli_zdotxf_fusefac ) { use_ref = TRUE; } else if ( bli_has_nonunit_inc2( inca, incx ) ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( a, BLIS_SIMD_ALIGN_SIZE ) || bli_is_unaligned_to( a+lda, BLIS_SIMD_ALIGN_SIZE ) || bli_is_unaligned_to( x, BLIS_SIMD_ALIGN_SIZE ) ) { use_ref = TRUE; // If a, the second column of a, and x are unaligned by the same // offset, then we can still use an implementation that depends on // alignment for most of the operation. off_a = bli_offset_from_alignment( a, BLIS_SIMD_ALIGN_SIZE ); off_a2 = bli_offset_from_alignment( a+lda, BLIS_SIMD_ALIGN_SIZE ); off_x = bli_offset_from_alignment( x, BLIS_SIMD_ALIGN_SIZE ); if ( off_a == off_a2 && off_a == off_x ) { use_ref = FALSE; m_pre = off_x / type_size; } } // Call the reference implementation if needed. if ( use_ref == TRUE ) { zdotxf_ft f = bli_zdotxf_template_ref; f ( conjat, conjx, m, b_n, alpha, a, inca, lda, x, incx, beta, y, incy, cntx ); return; } // Compute the number of unrolled and leftover (edge) iterations. m_iter = ( m - m_pre ) / n_elem_per_iter; m_left = ( m - m_pre ) % n_elem_per_iter; // Initialize pointers into the rows of A and elements of y. 
for ( i = 0; i < b_n; ++i ) { ap[ i ] = a + (i )*lda; yp[ i ] = y + (i )*incy; } xp = x; // Initialize our accumulators to zero. for ( i = 0; i < b_n; ++i ) { bli_zset0s( Atx[ i ] ); } conjat_use = conjat; // If x must be conjugated, we compute the result indirectly by first // toggling the effective conjugation of A and then conjugating the // resulting product A^T*x. if ( bli_is_conj( conjx ) ) bli_toggle_conj( &conjat_use ); // Iterate over columns of A and rows of x to compute: // Atx = conjat_use( A^T ) * x; if ( bli_is_noconj( conjat_use ) ) { // Compute front edge cases if A and y were unaligned. for ( j = 0; j < m_pre; ++j ) { for ( i = 0; i < b_n; ++i ) { bli_zzzdots( *ap[ i ], *xp, Atx[ i ] ); ap[ i ] += 1; } xp += 1; } // The bulk of the operation is executed here. For best performance, // the elements of Atx should be kept in registers, and the b_n loop // should be fully unrolled. The addresses in ap[] and xp are // guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. for ( j = 0; j < m_iter; ++j ) { for ( i = 0; i < b_n; ++i ) { bli_zzzdots( *ap[ i ], *xp, Atx[ i ] ); ap[ i ] += n_elem_per_iter; } xp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( j = 0; j < m_left; ++j ) { for ( i = 0; i < b_n; ++i ) { bli_zzzdots( *ap[ i ], *xp, Atx[ i ] ); ap[ i ] += 1; } xp += 1; } } else // if ( bli_is_conj( conjat_use ) ) { // Compute front edge cases if A and y were unaligned. for ( j = 0; j < m_pre; ++j ) { for ( i = 0; i < b_n; ++i ) { bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] ); ap[ i ] += 1; } xp += 1; } // The bulk of the operation is executed here. For best performance, // the elements of Atx should be kept in registers, and the b_n loop // should be fully unrolled. The addresses in ap[] and xp are // guaranteed to be aligned to BLIS_SIMD_ALIGN_SIZE. 
for ( j = 0; j < m_iter; ++j ) { for ( i = 0; i < b_n; ++i ) { bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] ); ap[ i ] += n_elem_per_iter; } xp += n_elem_per_iter; } // Compute tail edge cases, if applicable. for ( j = 0; j < m_left; ++j ) { for ( i = 0; i < b_n; ++i ) { bli_zzzdotjs( *ap[ i ], *xp, Atx[ i ] ); ap[ i ] += 1; } xp += 1; } } // If conjugation on y was requested, we induce it by conjugating // the contents of Atx. if ( bli_is_conj( conjx ) ) { for ( i = 0; i < b_n; ++i ) { bli_zconjs( Atx[ i ] ); } } // Scale the Atx product by alpha and accumulate into y after // scaling by beta. for ( i = 0; i < b_n; ++i ) { bli_zzscals( *beta, *yp[ i ] ); bli_zzzaxpys( *alpha, Atx[ i ], *yp[ i ] ); } } cython-blis-0.9.1/blis/_src/config/template/kernels/3/000077500000000000000000000000001427272030600225235ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/template/kernels/3/bli_gemm_template_noopt_mxn.c000066400000000000000000000110661427272030600304420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_zgemm_template_noopt ( dim_t m, dim_t n, dim_t k, dcomplex* restrict alpha, dcomplex* restrict a1, dcomplex* restrict b1, dcomplex* restrict beta, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { /* Template gemm micro-kernel implementation This function contains a template implementation for a double-precision complex micro-kernel, coded in C, which can serve as the starting point for one to write an optimized micro-kernel on an arbitrary architecture. (We show a template implementation for only double-precision complex because the templates for the other three floating-point types would be nearly identical.) This micro-kernel performs a matrix-matrix multiplication of the form: C11 := beta * C11 + alpha * A1 * B1 where A1 is MR x k, B1 is k x NR, C11 is MR x NR, and alpha and beta are scalars. For more info, please refer to the BLIS website's wiki on kernels: https://github.com/flame/blis/wiki/KernelsHowTo and/or contact the blis-devel mailing list. 
-FGVZ */ const num_t dt = BLIS_DCOMPLEX; const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); const inc_t cs_a = packmr; const inc_t rs_b = packnr; const inc_t rs_ab = 1; const inc_t cs_ab = mr; dim_t l, j, i; dcomplex ab[ mr * nr ]; dcomplex* abij; dcomplex ai, bj; /* Initialize the accumulator elements in ab to zero. */ for ( i = 0; i < mr * nr; ++i ) { bli_zset0s( *(ab + i) ); } /* Perform a series of k rank-1 updates into ab. */ for ( l = 0; l < k; ++l ) { abij = ab; /* In an optimized implementation, these two loops over MR and NR are typically fully unrolled. */ for ( j = 0; j < nr; ++j ) { bj = *(b1 + j); for ( i = 0; i < mr; ++i ) { ai = *(a1 + i); bli_zdots( ai, bj, *abij ); abij += rs_ab; } } a1 += cs_a; b1 += rs_b; } /* Scale each element of ab by alpha. */ for ( i = 0; i < mr * nr; ++i ) { bli_zscals( *alpha, *(ab + i) ); } /* If beta is zero, overwrite c11 with the scaled result in ab. Otherwise, scale c11 by beta and then add the scaled result in ab. */ if ( bli_zeq0( *beta ) ) { /* c11 := ab */ bli_zcopys_mxn( m, n, ab, rs_ab, cs_ab, c11, rs_c, cs_c ); } else { /* c11 := beta * c11 + ab */ bli_zxpbys_mxn( m, n, ab, rs_ab, cs_ab, beta, c11, rs_c, cs_c ); } } cython-blis-0.9.1/blis/_src/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c000066400000000000000000000065761427272030600316750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_zgemmtrsm_l_template_noopt ( dim_t k, dcomplex* restrict alpha, dcomplex* restrict a10, dcomplex* restrict a11, dcomplex* restrict b01, dcomplex* restrict b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { /* Template gemmtrsm_l micro-kernel implementation This function contains a template implementation for a double-precision complex micro-kernel that fuses a gemm with a trsm_l subproblem. This micro-kernel performs the following compound operation: B11 := alpha * B11 - A10 * B01 (gemm) B11 := inv(A11) * B11 (trsm) C11 := B11 where A11 is MR x MR and lower triangular, A10 is MR x k, B01 is k x NR, B11 is MR x NR, and alpha is a scalar. Here, inv() denotes matrix inverse. 
For more info, please refer to the BLIS website's wiki on kernels: https://github.com/flame/blis/wiki/KernelsHowTo and/or contact the blis-devel mailing list. -FGVZ */ const num_t dt = BLIS_DCOMPLEX; const inc_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); const inc_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); const inc_t rs_b = packnr; const inc_t cs_b = 1; dcomplex* restrict minus_one = bli_zm1; /* b11 = alpha * b11 - a10 * b01; */ bli_zgemm_template_noopt ( mr, nr, k, minus_one, a10, b01, alpha, b11, rs_b, cs_b, data ); /* b11 = inv(a11) * b11; c11 = b11; */ bli_ztrsm_l_template_noopt ( a11, b11, c11, rs_c, cs_c, data ); } cython-blis-0.9.1/blis/_src/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c000066400000000000000000000065751427272030600317050ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_zgemmtrsm_u_template_noopt ( dim_t k, dcomplex* restrict alpha, dcomplex* restrict a10, dcomplex* restrict a11, dcomplex* restrict b01, dcomplex* restrict b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { /* Template gemmtrsm_u micro-kernel implementation This function contains a template implementation for a double-precision complex micro-kernel that fuses a gemm with a trsm_u subproblem. This micro-kernel performs the following compound operation: B11 := alpha * B11 - A12 * B21 (gemm) B11 := inv(A11) * B11 (trsm) C11 := B11 where A11 is MR x MR and upper triangular, A12 is MR x k, B21 is k x NR, B11 is MR x NR, and alpha is a scalar. Here, inv() denotes matrix inverse. For more info, please refer to the BLIS website's wiki on kernels: https://github.com/flame/blis/wiki/KernelsHowTo and/or contact the blis-devel mailing list. 
-FGVZ */ const num_t dt = BLIS_DCOMPLEX; const inc_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); const inc_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); const inc_t rs_b = packnr; const inc_t cs_b = 1; dcomplex* restrict minus_one = bli_zm1; /* b11 = alpha * b11 - a12 * b21; */ bli_zgemm_template_noopt ( mr, nr, k, minus_one, a10, b01, alpha, b11, rs_b, cs_b, data ); /* b11 = inv(a11) * b11; c11 = b11; */ bli_ztrsm_u_template_noopt ( a11, b11, c11, rs_c, cs_c, data ); } cython-blis-0.9.1/blis/_src/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c000066400000000000000000000111321427272030600310070ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_ztrsm_l_template_noopt ( dcomplex* restrict a11, dcomplex* restrict b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { /* Template trsm_l micro-kernel implementation This function contains a template implementation for a double-precision complex trsm micro-kernel, coded in C, which can serve as the starting point for one to write an optimized micro-kernel on an arbitrary architecture. (We show a template implementation for only double-precision complex because the templates for the other three floating-point types would be nearly identical.) This micro-kernel performs the following operation: C11 := inv(A11) * B11 where A11 is MR x MR and lower triangular, B11 is MR x NR, and C11 is MR x NR. For more info, please refer to the BLIS website's wiki on kernels: https://github.com/flame/blis/wiki/KernelsHowTo and/or contact the blis-devel mailing list. 
-FGVZ */ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); const dim_t m = mr; const dim_t n = nr; const inc_t rs_a = 1; const inc_t cs_a = packmr; const inc_t rs_b = packnr; const inc_t cs_b = 1; dim_t iter, i, j, l; dim_t n_behind; dcomplex* restrict alpha11; dcomplex* restrict a10t; dcomplex* restrict alpha10; dcomplex* restrict X0; dcomplex* restrict x1; dcomplex* restrict x01; dcomplex* restrict chi01; dcomplex* restrict chi11; dcomplex* restrict gamma11; dcomplex rho11; for ( iter = 0; iter < m; ++iter ) { i = iter; n_behind = i; alpha11 = a11 + (i )*rs_a + (i )*cs_a; a10t = a11 + (i )*rs_a + (0 )*cs_a; X0 = b11 + (0 )*rs_b + (0 )*cs_b; x1 = b11 + (i )*rs_b + (0 )*cs_b; /* x1 = x1 - a10t * X0; */ /* x1 = x1 / alpha11; */ for ( j = 0; j < n; ++j ) { x01 = X0 + (0 )*rs_b + (j )*cs_b; chi11 = x1 + (0 )*rs_b + (j )*cs_b; gamma11 = c11 + (i )*rs_c + (j )*cs_c; /* chi11 = chi11 - a10t * x01; */ bli_zset0s( rho11 ); for ( l = 0; l < n_behind; ++l ) { alpha10 = a10t + (l )*cs_a; chi01 = x01 + (l )*rs_b; bli_zaxpys( *alpha10, *chi01, rho11 ); } bli_zsubs( rho11, *chi11 ); /* chi11 = chi11 / alpha11; */ /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead of alpha11, so we can multiply rather than divide. We store the inverse of alpha11 intentionally to avoid expensive division instructions within the micro-kernel. */ bli_zscals( *alpha11, *chi11 ); /* Output final result to matrix C. */ bli_zcopys( *chi11, *gamma11 ); } } } cython-blis-0.9.1/blis/_src/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c000066400000000000000000000111451427272030600310240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

void bli_ztrsm_u_template_noopt
     (
       dcomplex*  restrict a11,
       dcomplex*  restrict b11,
       dcomplex*  restrict c11,
       inc_t               rs_c,
       inc_t               cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
/*
  Template trsm_u micro-kernel implementation

  This function contains a template implementation for a double-precision
  complex trsm micro-kernel, coded in C, which can serve as the starting
  point for one to write an optimized micro-kernel on an arbitrary
  architecture. (We show a template implementation for only double-precision
  complex because the templates for the other three floating-point types
  would be nearly identical.)

  This micro-kernel performs the following operation:

    C11 := inv(A11) * B11

  where A11 is MR x MR and upper triangular, B11 is MR x NR, and C11 is
  MR x NR. B11 is overwritten with the solution as well.

  Parameters:

  - a11:        The address of A11. Its diagonal holds the INVERSE of each
                diagonal element (see the note in the loop body).
  - b11:        The address of B11, which is updated in place.
  - c11:        The address of C11, which receives a copy of the result.
  - rs_c, cs_c: The row and column strides of C11.
  - data:       Auxiliary kernel information; not used by this template.
  - cntx:       The context, queried for the register blocksizes and
                packing dimensions.

  For more info, please refer to the BLIS website's wiki on kernels:

    https://github.com/flame/blis/wiki/KernelsHowTo

  and/or contact the blis-devel mailing list.

  -FGVZ
*/
    /* The datatype used for the context queries below; this kernel is the
       double-precision complex instance. */
    const num_t        dt     = BLIS_DCOMPLEX;

    const dim_t        mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx );
    const dim_t        nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx );

    const inc_t        packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx );
    const inc_t        packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx );

    const dim_t        m      = mr;
    const dim_t        n      = nr;

    const inc_t        rs_a   = 1;
    const inc_t        cs_a   = packmr;

    const inc_t        rs_b   = packnr;
    const inc_t        cs_b   = 1;

    dim_t              iter, i, j, l;
    dim_t              n_behind;

    dcomplex* restrict alpha11;
    dcomplex* restrict a12t;
    dcomplex* restrict alpha12;
    dcomplex* restrict X2;
    dcomplex* restrict x1;
    dcomplex* restrict x21;
    dcomplex* restrict chi21;
    dcomplex* restrict chi11;
    dcomplex* restrict gamma11;
    dcomplex           rho11;

    /* Back substitution: rows are processed bottom to top, so row i
       depends on the iter rows already solved below it. */
    for ( iter = 0; iter < m; ++iter )
    {
        i        = m - iter - 1;
        n_behind = iter;
        alpha11  = a11 + (i  )*rs_a + (i  )*cs_a;
        a12t     = a11 + (i  )*rs_a + (i+1)*cs_a;
        x1       = b11 + (i  )*rs_b + (0  )*cs_b;
        X2       = b11 + (i+1)*rs_b + (0  )*cs_b;

        /* x1 = x1 - a12t * X2; */
        /* x1 = x1 / alpha11; */
        for ( j = 0; j < n; ++j )
        {
            chi11   = x1  + (0  )*rs_b + (j  )*cs_b;
            x21     = X2  + (0  )*rs_b + (j  )*cs_b;
            gamma11 = c11 + (i  )*rs_c + (j  )*cs_c;

            /* chi11 = chi11 - a12t * x21; */
            bli_zset0s( rho11 );
            for ( l = 0; l < n_behind; ++l )
            {
                alpha12 = a12t + (l  )*cs_a;
                chi21   = x21  + (l  )*rs_b;

                bli_zaxpys( *alpha12, *chi21, rho11 );
            }
            bli_zsubs( rho11, *chi11 );

            /* chi11 = chi11 / alpha11; */
            /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead
               of alpha11, so we can multiply rather than divide. We store
               the inverse of alpha11 intentionally to avoid expensive
               division instructions within the micro-kernel. */
            bli_zscals( *alpha11, *chi11 );

            /* Output final result to matrix C. */
            bli_zcopys( *chi11, *gamma11 );
        }
    }
}

cython-blis-0.9.1/blis/_src/config/template/make_defs.mk000066400000000000000000000051241427272030600231670ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := template #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 CKVECFLAGS := # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) CRVECFLAGS := $(CKVECFLAGS) # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/thunderx2/000077500000000000000000000000001427272030600210265ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/thunderx2/bli_cntx_init_thunderx2.c000066400000000000000000000060551427272030600260300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_cntx_init_thunderx2( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; // Set default kernel blocksizes and functions. bli_cntx_init_thunderx2_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 2, BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 8, 6, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 12, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 120, 120, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 240, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 3072, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( BLIS_NAT, 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, cntx ); } cython-blis-0.9.1/blis/_src/config/thunderx2/bli_family_thunderx2.h000066400000000000000000000034701427272030600253150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 cython-blis-0.9.1/blis/_src/config/thunderx2/make_defs.mk000066400000000000000000000060431427272030600233000ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := thunderx2 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := -D_GNU_SOURCE CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 -mcpu=thunderx2t99 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 -ftree-vectorize ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mcpu=thunderx2t99 else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mcpu=thunderx2t99 else $(error gcc or clang is required for this configuration.) endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/x86_64/000077500000000000000000000000001427272030600200415ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/x86_64/bli_family_x86_64.h000066400000000000000000000033061427272030600233410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif cython-blis-0.9.1/blis/_src/config/x86_64/make_defs.mk000066400000000000000000000061221427272030600223110ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. 
# # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := x86_64 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. 
CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2 else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := -xSSE3 else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2 else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/x86_64_no_skx/000077500000000000000000000000001427272030600214225ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/x86_64_no_skx/bli_family_x86_64_no_skx.h000066400000000000000000000033061427272030600263030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif cython-blis-0.9.1/blis/_src/config/x86_64_no_skx/make_defs.mk000066400000000000000000000061271427272030600236770ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := x86_64_no_skx #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O3 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2 else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := -xSSE3 else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2 else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # Flags specific to reference kernels. 
CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/x86_64_no_zen2/000077500000000000000000000000001427272030600214735ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/x86_64_no_zen2/bli_family_x86_64_no_zen2.h000066400000000000000000000033061427272030600264250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif cython-blis-0.9.1/blis/_src/config/x86_64_no_zen2/make_defs.mk000066400000000000000000000061301427272030600237420ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := x86_64_no_zen2 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O3 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2 else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := -xSSE3 else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2 else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. 
$(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/x86_64_no_zen3/000077500000000000000000000000001427272030600214745ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/x86_64_no_zen3/bli_family_x86_64_no_zen3.h000066400000000000000000000033061427272030600264270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif cython-blis-0.9.1/blis/_src/config/x86_64_no_zen3/make_defs.mk000066400000000000000000000061301427272030600237430ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. 
THIS_CONFIG := x86_64_no_zen3 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O3 endif # Flags specific to optimized kernels. CKOPTFLAGS := $(COPTFLAGS) ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2 else ifeq ($(CC_VENDOR),icc) CKVECFLAGS := -xSSE3 else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mssse3 -mfpmath=sse -march=core2 else $(error gcc, icc, or clang is required for this configuration.) endif endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/zen/000077500000000000000000000000001427272030600176775ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/zen/amd_config.mk000066400000000000000000000057101427272030600223210ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2019, Advanced Micro Devices, Inc. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. 
# - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # All the common flags for AMD architectures will be added here # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 -fomit-frame-pointer endif # Flags specific to optimized kernels. # NOTE: The -fomit-frame-pointer option is needed for some kernels because # they make explicit use of the rbp register. 
CKOPTFLAGS := $(COPTFLAGS) -O3 ifeq ($(CC_VENDOR),gcc) CKVECFLAGS := -mavx2 -mfpmath=sse -mfma else ifeq ($(CC_VENDOR),clang) CKVECFLAGS := -mavx2 -mfpmath=sse -mfma ifeq ($(strip $(shell clang -v |& head -1 | grep -c 'AOCC.LLVM')),1) CKVECFLAGS += -mllvm -disable-licm-vrp endif else $(error gcc or clang are required for this configuration.) endif endif # Flags specific to reference kernels. CROPTFLAGS := $(CKOPTFLAGS) ifeq ($(CC_VENDOR),gcc) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else ifeq ($(CC_VENDOR),clang) CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast else CRVECFLAGS := $(CKVECFLAGS) endif endif cython-blis-0.9.1/blis/_src/config/zen/bli_cntx_init_zen.c000066400000000000000000000276271427272030600235620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020-2022, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" //GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) void bli_cntx_init_zen( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_zen_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. bli_cntx_set_l3_nat_ukrs ( 8, // gemm BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, // gemmtrsm_l BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, // gemmtrsm_u BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, cntx ); #if 1 // Update the context with optimized packm kernels. 
bli_cntx_set_packm_kers ( 8, BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, cntx ); #endif // Update the context with optimized level-1f kernels. bli_cntx_set_l1f_kers ( 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, cntx ); // Update the context with optimized level-1v kernels. bli_cntx_set_l1v_kers ( 16, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int, #else BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, #endif #if 1 // copyv BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, #endif // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, // dotxv BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, // scalv #if 0 BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int, #else BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif #if 1 // setv BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, // swapv BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, BLIS_SWAPV_KER, 
BLIS_DOUBLE, bli_dswapv_zen_int8, #endif cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); /* Multi Instance performance improvement of DGEMM when binded to a CCX In Multi instance each thread runs a sequential DGEMM. a) If BLIS is run in a multi-instance mode with CPU freq 2.6/2.2 Ghz DDR4 clock frequency 2400Mhz mc = 240, kc = 512, and nc = 2040 has better performance on EPYC server, over the default block sizes. b) If BLIS is run in Single Instance mode mc = 510, kc = 1024 and nc = 4080 */ #ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES // Zen optmized level 3 cache block sizes #if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES bli_blksz_init_easy( &blkszs[ BLIS_MC ], 1020, 510, 510, 255 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 1024, 1024, 1024, 1024 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 ); #else bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 2040, 1528 ); #endif #else bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 4080, 3056 ); #endif bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( BLIS_NAT, 7, // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, cntx ); // ------------------------------------------------------------------------- // Initialize sup thresholds with architecture-appropriate values. // s d c z bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 ); bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, -1, -1 ); bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, -1, -1 ); // Initialize the context with the sup thresholds. bli_cntx_set_l3_sup_thresh ( 3, BLIS_MT, &thresh[ BLIS_MT ], BLIS_NT, &thresh[ BLIS_NT ], BLIS_KT, &thresh[ BLIS_KT ], cntx ); // Initialize the context with the sup handlers. bli_cntx_set_l3_sup_handlers ( 1, BLIS_GEMM, bli_gemmsup_ref, //BLIS_GEMMT, bli_gemmtsup_ref, cntx ); // Update the context with optimized small/unpacked gemm kernels. 
bli_cntx_set_l3_sup_kers ( 16, //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, #if 0 BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, #endif #if 0 // NOTE: This set of kernels is likely broken and therefore disabled. 
BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, #endif cntx ); // Initialize level-3 sup blocksize objects with architecture-specific // values. // s d c z bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1, 9, 9, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 ); #if 0 bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, 9, 9, 3, 3 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); #endif // Update the context with the current architecture's register and cache // blocksizes for small/unpacked level-3 problems. 
bli_cntx_set_l3_sup_blkszs ( 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_KC, &blkszs[ BLIS_KC ], BLIS_MC, &blkszs[ BLIS_MC ], BLIS_NR, &blkszs[ BLIS_NR ], BLIS_MR, &blkszs[ BLIS_MR ], cntx ); } cython-blis-0.9.1/blis/_src/config/zen/bli_family_zen.h000066400000000000000000000064001427272030600230330ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

// Family header for the zen sub-configuration: compile-time switches only.

// By default, it is effective to parallelize the outer loops.
// Setting these macros to 1 will force the JR and IR inner loops
// to not be parallelized.
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1

// Defined (with no value) so that the zen context-initialization code takes
// its "#ifdef BLIS_ENABLE_ZEN_BLOCK_SIZES" branch when it chooses the level-3
// cache blocksizes (MC, KC, NC).
#define BLIS_ENABLE_ZEN_BLOCK_SIZES

// Vanilla BLIS disables AMD's small matrix handling by default.
// NOTE: Everything between this "#if 0" and the matching "#endif" is
// compiled out, so none of the macros below are defined by this header.
#if 0
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM

// This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
#define BLIS_SMALL_K_RECT_MATRIX_THRES 128

#define BLIS_SMALL_MATRIX_THRES_TRSM 32768 // 128*(128+128) => m*(m+n)
#define BLIS_SMALL_MATRIX_A_THRES_TRSM 128
#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96
#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128

// This macro will enable BLIS DGEMM to choose block sizes for a single
// instance mode.
// NOTE: Because this definition sits inside the disabled region, the macro
// is left undefined by this header; a "#if BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES"
// test then evaluates it as 0 unless it is defined somewhere else.
#define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 0

#define D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES 250
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_NAPLES 90
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22
#endif

// This region is disabled as well, so neither SUP extension macro is
// defined by this header.
#if 0
// Allow the sup implementation to combine some small edge case iterations in
// the 2nd loop of the panel-block algorithm (MR) and/or the 2nd loop of the
// block-panel algorithm (NR) with the last full iteration that precedes it.
// NOTE: These cpp macros need to be explicitly set to an integer since they
// are used at compile-time to create unconditional branches or dead code
// regions.
#define BLIS_ENABLE_SUP_MR_EXT 1
#define BLIS_ENABLE_SUP_NR_EXT 0
#endif

cython-blis-0.9.1/blis/_src/config/zen/make_defs.mk000066400000000000000000000065411427272030600221540ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# - Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# - Neither the name(s) of the copyright holder(s) nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#

# Declare the name of the current configuration and add it to the
# running list of configurations included by common.mk.
# (The CONFIGS_INCL append below is commented out, so only THIS_CONFIG is
# actually set here.)
THIS_CONFIG := zen
#CONFIGS_INCL += $(THIS_CONFIG)

#
# --- Determine the C compiler and related flags ---
#

# NOTE: The build system will append these variables with various
# general-purpose/configuration-agnostic flags in common.mk. You
# may specify additional flags here as needed.
CPPROCFLAGS :=
CMISCFLAGS :=
CPICFLAGS :=
CWARNFLAGS :=

# Emit debug symbols for every DEBUG_TYPE except "off".
ifneq ($(DEBUG_TYPE),off)
CDBGFLAGS := -g
endif

# Base optimization level: none for DEBUG_TYPE=noopt, -O2 otherwise.
ifeq ($(DEBUG_TYPE),noopt)
COPTFLAGS := -O0
else
COPTFLAGS := -O2 -fomit-frame-pointer
endif

# Flags specific to optimized and reference kernels.
# NOTE: The -fomit-frame-pointer option is needed for some kernels because
# they make explicit use of the rbp register.
# NOTE(review): CKOPTFLAGS is COPTFLAGS with -O3 appended, so for
# DEBUG_TYPE=noopt it expands to "-O0 -O3" (and carries no
# -fomit-frame-pointer). With gcc the last -O option is the effective one;
# confirm that building kernels at -O3 in noopt mode is intended.
CKOPTFLAGS := $(COPTFLAGS) -O3
CROPTFLAGS := $(CKOPTFLAGS)
# Vector flags: optimized kernels get CKVECFLAGS; reference kernels get the
# same set plus the relaxed floating-point options below.
CKVECFLAGS := -mavx2 -mfma -mfpmath=sse
CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast

# Pick the -march (and related) flags per compiler vendor. Any vendor other
# than gcc, clang, or aocc aborts the build via $(error ...).
ifeq ($(CC_VENDOR),gcc)
  ifeq ($(GCC_OT_6_1_0),yes)
    # gcc versions older than 6.1.
    CVECFLAGS_VER := -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp
  else
    CVECFLAGS_VER := -march=znver1 -mno-avx256-split-unaligned-store
  endif
else
ifeq ($(CC_VENDOR),clang)
  CVECFLAGS_VER := -march=znver1
else
ifeq ($(CC_VENDOR),aocc)
  CVECFLAGS_VER := -march=znver1 -mllvm -disable-licm-vrp
else
  $(error gcc, clang, or aocc is required for this configuration.)
endif
endif
endif

# Apply the vendor-specific flags to both kernel flag sets.
CKVECFLAGS += $(CVECFLAGS_VER)
CRVECFLAGS += $(CVECFLAGS_VER)

# Store all of the variables here to new variables containing the
# configuration name.
$(eval $(call store-make-defs,$(THIS_CONFIG)))

cython-blis-0.9.1/blis/_src/config/zen/old/000077500000000000000000000000001427272030600204555ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/zen/old/bli_kernel.h000066400000000000000000000165661427272030600227520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc. Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_KERNEL_H #define BLIS_KERNEL_H // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- // // Constraints: // // (1) MC must be a multiple of: // (a) MR (for zero-padding purposes) // (b) NR (for zero-padding purposes when MR and NR are "swapped") // (2) NC must be a multiple of // (a) NR (for zero-padding purposes) // (b) MR (for zero-padding purposes when MR and NR are "swapped") // // threading related // By default it is effective to paralleize the // outerloops. Setting these macros to 1 will force // JR and NR inner loops to be not paralleized. 
#define BLIS_DEFAULT_MR_THREAD_MAX 1 #define BLIS_DEFAULT_NR_THREAD_MAX 1 // sgemm micro-kernel #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_24x4 #define BLIS_DEFAULT_MC_S 264 #define BLIS_DEFAULT_KC_S 128 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 24 #define BLIS_DEFAULT_NR_S 4 #endif #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 6 #endif #if 1 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 6 #define BLIS_DEFAULT_NR_S 16 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif // dgemm micro-kernel #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_12x4 #define BLIS_DEFAULT_MC_D 96 #define BLIS_DEFAULT_KC_D 192 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 12 #define BLIS_DEFAULT_NR_D 4 #endif #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 6 #endif #if 1 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 #define BLIS_DEFAULT_MC_D 510 // 72 /* Improves performance for large Matrices */ #define BLIS_DEFAULT_KC_D 1024 // 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif // cgemm micro-kernel #if 1 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 3 #define BLIS_DEFAULT_NR_C 8 #define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif // zgemm micro-kernel #if 1 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 3 #define BLIS_DEFAULT_NR_Z 
4 #define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif // zgemm micro-kernel #if 1 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 3 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif // -- trsm-related -- #define BLIS_STRSM_L_UKERNEL bli_strsm_l_int_6x16 #define BLIS_DTRSM_L_UKERNEL bli_dtrsm_l_int_6x8 // --gemmtrsm-related -- #define BLIS_SGEMMTRSM_L_UKERNEL bli_sgemmtrsm_l_6x16 #define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_6x8 #define BLIS_SMALL_MATRIX_ENABLE //This will select the threshold below which small matrix code will be called. #define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 gint_t bli_gemm_small_matrix ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); // -- LEVEL-2 KERNEL CONSTANTS ------------------------------------------------- // -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------ // -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- // -- packm -- // -- unpackm -- #define BLIS_DEFAULT_1F_S 8 #define BLIS_DEFAULT_1F_D 4 // -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- // -- axpy2v -- // -- dotaxpyv -- // -- axpyf -- #define BLIS_SAXPYF_KERNEL bli_saxpyf_int_var1 #define BLIS_DAXPYF_KERNEL bli_daxpyf_int_var1 // -- dotxf -- #define BLIS_SDOTXF_KERNEL bli_sdotxf_int_var1 #define BLIS_DDOTXF_KERNEL bli_ddotxf_int_var1 // -- dotxaxpyf -- // -- LEVEL-1M KERNEL DEFINITIONS ---------------------------------------------- // -- packm -- // -- unpackm -- // -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- // -- amax -- #define BLIS_SAMAXV_KERNEL bli_samaxv_opt_var1 #define BLIS_DAMAXV_KERNEL bli_damaxv_opt_var1 // -- addv -- // -- axpyv -- #define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var10 
#define BLIS_SAXPYV_KERNEL bli_saxpyv_opt_var10 // -- copyv -- // -- dotv -- #define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 #define BLIS_SDOTV_KERNEL bli_sdotv_opt_var1 // -- dotxv -- #define BLIS_SDOTXV_KERNEL bli_sdotxv_unb_var1 #define BLIS_DDOTXV_KERNEL bli_ddotxv_unb_var1 // -- invertv -- // -- scal2v -- // -- scalv -- #define BLIS_SSCALV_KERNEL bli_sscalv_opt_var2 #define BLIS_DSCALV_KERNEL bli_dscalv_opt_var2 // -- setv -- // -- subv -- // -- swapv -- #endif cython-blis-0.9.1/blis/_src/config/zen2/000077500000000000000000000000001427272030600177615ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/zen2/bli_cntx_init_zen2.c000066400000000000000000000251231427272030600237130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020-2022, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

// Populate cntx for the zen2 sub-configuration.
//
// The function first installs the defaults via bli_cntx_init_zen2_ref() and
// then overrides, in this order:
//   1. native level-3 micro-kernels and packm kernels,
//   2. level-1f and level-1v kernels,
//   3. native register/cache blocksizes,
//   4. sup thresholds, sup kernels, and sup blocksizes.
//
// In every bli_cntx_set_*() call below the leading integer equals the number
// of entries listed before the trailing cntx argument; keep the two in sync
// when adding or removing entries.
//
// The local blkszs[] array is filled twice: once for the native blocksizes
// and again, after those have been handed to bli_cntx_set_blkszs(), for the
// sup blocksizes. The order of those two phases must therefore be preserved.
void bli_cntx_init_zen2( cntx_t* cntx )
{
    blksz_t blkszs[ BLIS_NUM_BLKSZS ];
    blksz_t thresh[ BLIS_NUM_THRESH ];

    // Set default kernel blocksizes and functions.
    bli_cntx_init_zen2_ref( cntx );

    // -------------------------------------------------------------------------

    // Update the context with optimized native gemm micro-kernels and
    // their storage preferences.
    // (gemmtrsm entries are registered for float and double only.)
    bli_cntx_set_l3_nat_ukrs
    (
      8,
      // gemm
      BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE,
      BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE,
      BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE,
      BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE,
      // gemmtrsm_l
      BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
      BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE,
      // gemmtrsm_u
      BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
      BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE,
      cntx
    );

// Always enabled; switching this to 0 would skip the packm registration
// (presumably leaving whatever bli_cntx_init_zen2_ref() installed -- confirm).
#if 1
    // Update the context with optimized packm kernels.
    bli_cntx_set_packm_kers
    (
      8,
      BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk,
      BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk,
      BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk,
      BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk,
      BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
      BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
      BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
      BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
      cntx
    );
#endif

    // Update the context with optimized level-1f kernels.
    bli_cntx_set_l1f_kers
    (
      4,
      // axpyf
      BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5,
      BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5,
      // dotxf
      BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8,
      BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8,
      cntx
    );

    // Update the context with optimized level-1v kernels.
    bli_cntx_set_l1v_kers
    (
      16,
      // amaxv
      BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int,
      BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int,
      // axpyv
      BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10,
      BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10,
      // dotv
      BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10,
      BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10,
      // dotxv
      BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int,
      BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int,
      // scalv
      BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10,
      BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10,
      // swapv
      BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8,
      BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8,
      // copyv
      BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int,
      BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int,
      // setv
      BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int,
      BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int,
      cntx
    );

    // Initialize level-3 blocksize objects with architecture-specific values.
    // The four values per call are for the s, d, c, z datatypes, in that order.
    //                                           s      d      c      z
    bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 );
// NOTE: In bli_family_zen2.h the only definition of AOCL_BLIS_MULTIINSTANCE
// sits inside a disabled "#if 0" region, so unless the macro is defined
// elsewhere (e.g. on the compiler command line) it is undefined here, the
// preprocessor evaluates it as 0, and the #else branch is compiled.
#if AOCL_BLIS_MULTIINSTANCE
    bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 240, 144, 72 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 512, 256, 256 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 2040, 4080, 4080 );
#else
    bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 );
#endif
    // AF is 5 for s and d, in line with the *axpyf_zen_int_5 kernels
    // registered above (presumably their fusing factor -- confirm). The c and
    // z slots are -1; no complex level-1f kernels are registered above.
    bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 );

    // Update the context with the current architecture's register and cache
    // blocksizes (and multiples) for native execution.
    // Each entry is: blocksize id, pointer to its values, id of the blocksize
    // it is paired with as a multiple.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 7,
      // level-3
      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
      // level-1f
      BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
      BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
      cntx
    );

    // -------------------------------------------------------------------------

    // Initialize sup thresholds with architecture-appropriate values.
    // The #else branch (all thresholds 100000) is a disabled alternative.
    //                                          s     d     c     z
#if 1
    bli_blksz_init_easy( &thresh[ BLIS_MT ], 500, 249, -1, -1 );
    bli_blksz_init_easy( &thresh[ BLIS_NT ], 500, 249, -1, -1 );
    bli_blksz_init_easy( &thresh[ BLIS_KT ], 500, 249, -1, -1 );
#else
    bli_blksz_init_easy( &thresh[ BLIS_MT ], 100000, 100000, -1, -1 );
    bli_blksz_init_easy( &thresh[ BLIS_NT ], 100000, 100000, -1, -1 );
    bli_blksz_init_easy( &thresh[ BLIS_KT ], 100000, 100000, -1, -1 );
#endif

    // Initialize the context with the sup thresholds.
    bli_cntx_set_l3_sup_thresh
    (
      3,
      BLIS_MT, &thresh[ BLIS_MT ],
      BLIS_NT, &thresh[ BLIS_NT ],
      BLIS_KT, &thresh[ BLIS_KT ],
      cntx
    );

// Disabled: this function does not register a sup handler itself.
#if 0
    // Initialize the context with the sup handlers.
    bli_cntx_set_l3_sup_handlers
    (
      1,
      BLIS_GEMM, bli_gemmsup_ref,
      cntx
    );
#endif

    // Update the context with optimized small/unpacked gemm kernels.
    // Only the 8 double and 8 float haswell entries are active (16 total);
    // the two "#if 0" groups inside the argument list are compiled out.
    bli_cntx_set_l3_sup_kers
    (
      16,
      //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
      BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
      BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
      BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
      BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE,
      BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
      BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
      BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE,
      BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE,
      BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
      BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE,
#if 0
      BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
      BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
      BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
      BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
      BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
      BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
      BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
      BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
#endif

#if 0
      // NOTE: This set of kernels is likely broken and therefore disabled.
      BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
      BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
      BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
      BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
      BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
      BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
      BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
      BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
      BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
      BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
      BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
      BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
#endif
      cntx
    );

    // Initialize level-3 sup blocksize objects with architecture-specific
    // values.
    // NOTE: This overwrites the blkszs[] entries that were already passed to
    // bli_cntx_set_blkszs() above. The c and z slots are -1 throughout; only
    // float and double sup kernels are active above.
    // The MR call uses bli_blksz_init() with a second group of four values
    // (9, 9, -1, -1) -- presumably per-datatype maximums; confirm against the
    // bli_blksz_init() declaration.
    //                                           s      d      c      z
    bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1,
                     9, 9, -1, -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 );

    // Update the context with the current architecture's register and cache
    // blocksizes for small/unpacked level-3 problems.
    bli_cntx_set_l3_sup_blkszs
    (
      5,
      BLIS_NC, &blkszs[ BLIS_NC ],
      BLIS_KC, &blkszs[ BLIS_KC ],
      BLIS_MC, &blkszs[ BLIS_MC ],
      BLIS_NR, &blkszs[ BLIS_NR ],
      BLIS_MR, &blkszs[ BLIS_MR ],
      cntx
    );
}

cython-blis-0.9.1/blis/_src/config/zen2/bli_family_zen2.h000066400000000000000000000073631427272030600232100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2019, Advanced Micro Devices, Inc

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// Family header for the zen2 sub-configuration: compile-time switches only.
// The two thread-limit macros below are the only definitions that are active;
// everything else is inside a disabled "#if 0" region.

// By default, it is effective to parallelize the outer loops.
// Setting these macros to 1 will force the JR and IR inner loops
// to not be parallelized.
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1

// Vanilla BLIS disables AMD's small matrix handling by default.
// NOTE: Everything between this "#if 0" and the closing "#endif" is
// compiled out, so none of the macros below are defined by this header.
#if 0
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM

// This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
#define BLIS_SMALL_K_RECT_MATRIX_THRES 128

#define BLIS_SMALL_MATRIX_THRES_TRSM 32768 // 128*(128+128) => m*(m+n)
#define BLIS_SMALL_MATRIX_A_THRES_TRSM 128
#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96
#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128

#define BLIS_ENABLE_SMALL_MATRIX_ROME
#define BLIS_SMALL_MATRIX_THRES_ROME 400

#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10

#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130

#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100

#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30

#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50

// When running HPL with pure MPI without DGEMM threading (Single-threaded
// BLIS), defining this macro as 1 yields better performance.
// NOTE: Because this definition sits inside the disabled region, the macro
// is left undefined by this header; the "#if AOCL_BLIS_MULTIINSTANCE" test in
// bli_cntx_init_zen2() then evaluates it as 0 unless it is defined elsewhere.
#define AOCL_BLIS_MULTIINSTANCE 0

#endif

cython-blis-0.9.1/blis/_src/config/zen2/make_defs.mk000066400000000000000000000075561427272030600222450ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := zen2 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. 
CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O2 -fomit-frame-pointer endif # Flags specific to optimized and reference kernels. # NOTE: The -fomit-frame-pointer option is needed for some kernels because # they make explicit use of the rbp register. CKOPTFLAGS := $(COPTFLAGS) -O3 CROPTFLAGS := $(CKOPTFLAGS) CKVECFLAGS := -mavx2 -mfma -mfpmath=sse CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ifeq ($(CC_VENDOR),gcc) ifeq ($(GCC_OT_6_1_0),yes) # gcc versions older than 6.1. CVECFLAGS_VER := -march=bdver4 -mno-fma4 -mno-tbm -mno-xop -mno-lwp else ifeq ($(GCC_OT_9_1_0),yes) # gcc versions 6.1 or newer, but older than 9.1. CVECFLAGS_VER := -march=znver1 -mno-avx256-split-unaligned-store else # gcc versions 9.1 or newer. CVECFLAGS_VER := -march=znver2 endif endif else ifeq ($(CC_VENDOR),clang) ifeq ($(CLANG_OT_9_0_0),yes) # clang versions older than 9.0. CVECFLAGS_VER := -march=znver1 else # clang versions 9.0 or newer. CVECFLAGS_VER := -march=znver2 endif else ifeq ($(CC_VENDOR),aocc) ifeq ($(AOCC_OT_2_0_0),yes) # aocc versions older than 2.0. CVECFLAGS_VER := -march=znver1 -mllvm -disable-licm-vrp else # aocc versions 2.0 or newer. CVECFLAGS_VER := -march=znver2 endif else $(error gcc, clang, or aocc is required for this configuration.) endif endif endif CKVECFLAGS += $(CVECFLAGS_VER) CRVECFLAGS += $(CVECFLAGS_VER) # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/config/zen3/000077500000000000000000000000001427272030600177625ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/config/zen3/bli_cntx_init_zen3.c000066400000000000000000000261571427272030600237250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_cntx_init_zen3( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_zen3_ref( cntx ); // ------------------------------------------------------------------------- // Update the context with optimized native gemm micro-kernels and // their storage preferences. 
bli_cntx_set_l3_nat_ukrs ( 8, // gemm BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, // gemmtrsm_l BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, // gemmtrsm_u BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, cntx ); #if 0 // AMD: This will be enabled in other PRs. // packm kernels bli_cntx_set_packm_kers ( 2, BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen, cntx ); #else // Update the context with optimized packm kernels. bli_cntx_set_packm_kers ( 8, BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, cntx ); #endif // Update the context with optimized level-1f kernels. bli_cntx_set_l1f_kers ( 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, cntx ); // Update the context with optimized level-1v kernels. 
bli_cntx_set_l1v_kers ( 16, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, // axpyv BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int10, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int10, // dotxv BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, // scalv BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, //swap BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, //copy BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, //set BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, cntx ); // Initialize level-3 blocksize objects with architecture-specific values. // // These are reference block sizes and may be overridden based on // number of threads used at runtime. // s d c z bli_blksz_init_easy( &blkszs[ BLIS_MR ], 6, 6, 3, 3 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, 4080, 4080 ); bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( BLIS_NAT, 7, // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, cntx ); // ------------------------------------------------------------------------- // Initialize sup thresholds with architecture-appropriate values. // s d c z bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 ); bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, -1, -1 ); bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, -1, -1 ); // Initialize the context with the sup thresholds. bli_cntx_set_l3_sup_thresh ( 3, BLIS_MT, &thresh[ BLIS_MT ], BLIS_NT, &thresh[ BLIS_NT ], BLIS_KT, &thresh[ BLIS_KT ], cntx ); #if 0 // Initialize the context with the sup handlers. bli_cntx_set_l3_sup_handlers ( 2, BLIS_GEMM, bli_gemmsup_ref, BLIS_GEMMT, bli_gemmtsup_ref, cntx ); #endif #if 0 // AMD: This should be enabled in the PR which has added these kernels // Update the context with optimized small/unpacked gemm kernels. 
bli_cntx_set_l3_sup_kers ( 28, //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, cntx ); #else // Update the context with optimized small/unpacked gemm kernels. 
bli_cntx_set_l3_sup_kers ( 16, //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, cntx ); #endif // Initialize level-3 sup blocksize objects with architecture-specific // values. // s d c z bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, 9, 9, 3, 3 ); bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); // Update the context with the current architecture's register and cache // blocksizes for small/unpacked level-3 problems. 
bli_cntx_set_l3_sup_blkszs ( 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_KC, &blkszs[ BLIS_KC ], BLIS_MC, &blkszs[ BLIS_MC ], BLIS_NR, &blkszs[ BLIS_NR ], BLIS_MR, &blkszs[ BLIS_MR ], cntx ); } cython-blis-0.9.1/blis/_src/config/zen3/bli_family_zen3.h000066400000000000000000000074601427272030600232100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLI_FAMILY_ZEN3_ #define BLI_FAMILY_ZEN3_ // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops // to be not paralleized. // #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 // To enable framework optimizations for zen3 platform // All zen3 specific code should be included in this macro #define BLIS_CONFIG_ZEN3 // To enable framework optimizations for zen3 platform // All zen3 specific code should be included in this macro #define BLIS_CONFIG_ZEN3 #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. #define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 #define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 #define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 #define BLIS_ENABLE_SMALL_MATRIX_ROME #define BLIS_SMALL_MATRIX_THRES_ROME 400 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110 #define 
D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50 #endif cython-blis-0.9.1/blis/_src/config/zen3/make_defs.mk000066400000000000000000000101001427272030600222210ustar00rootroot00000000000000# # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# # # Declare the name of the current configuration and add it to the # running list of configurations included by common.mk. THIS_CONFIG := zen3 #CONFIGS_INCL += $(THIS_CONFIG) # # --- Determine the C compiler and related flags --- # # NOTE: The build system will append these variables with various # general-purpose/configuration-agnostic flags in common.mk. You # may specify additional flags here as needed. CPPROCFLAGS := CMISCFLAGS := CPICFLAGS := CWARNFLAGS := ifneq ($(DEBUG_TYPE),off) CDBGFLAGS := -g endif ifeq ($(DEBUG_TYPE),noopt) COPTFLAGS := -O0 else COPTFLAGS := -O3 endif # Flags specific to optimized and reference kernels. # NOTE: The -fomit-frame-pointer option is needed for some kernels because # they make explicit use of the rbp register. CKOPTFLAGS := $(COPTFLAGS) -fomit-frame-pointer CROPTFLAGS := $(CKOPTFLAGS) CKVECFLAGS := -mavx2 -mfma -mfpmath=sse CRVECFLAGS := $(CKVECFLAGS) -funsafe-math-optimizations -ffp-contract=fast ifeq ($(CC_VENDOR),gcc) ifeq ($(GCC_OT_9_1_0),yes) # gcc versions older than 9.1. CVECFLAGS_VER := -march=znver1 -mno-avx256-split-unaligned-store else ifeq ($(GCC_OT_10_1_0),yes) # gcc versions 9.1 or newer, but older than 10.1. CVECFLAGS_VER := -march=znver2 else # gcc versions 10.1 or newer. CVECFLAGS_VER := -march=znver3 endif endif else ifeq ($(CC_VENDOR),clang) ifeq ($(CLANG_OT_9_0_0),yes) # clang versions older than 9.0. CVECFLAGS_VER := -march=znver1 else ifeq ($(CLANG_OT_12_0_0),yes) # clang versions 9.0 or newer, but older than 12.0. CVECFLAGS_VER := -march=znver2 else # clang versions 12.0 or newer. CVECFLAGS_VER := -march=znver3 endif endif else ifeq ($(CC_VENDOR),aocc) ifeq ($(AOCC_OT_2_0_0),yes) # aocc versions older than 2.0. CVECFLAGS_VER := -march=znver1 else ifeq ($(AOCC_OT_3_0_0),yes) # aocc versions 2.0 or newer, but older than 3.0. CVECFLAGS_VER := -march=znver2 else # aocc versions 3.0 or newer. CVECFLAGS_VER := -march=znver3 endif endif else $(error gcc, clang, or aocc is required for this configuration.) 
endif endif endif CKVECFLAGS += $(CVECFLAGS_VER) CRVECFLAGS += $(CVECFLAGS_VER) # Store all of the variables here to new variables containing the # configuration name. $(eval $(call store-make-defs,$(THIS_CONFIG))) cython-blis-0.9.1/blis/_src/frame/000077500000000000000000000000001427272030600167305ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/0/000077500000000000000000000000001427272030600170675ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/0/bli_l0.h000066400000000000000000000035411427272030600204040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_l0_check.h" #include "bli_l0_oapi.h" #include "bli_l0_tapi.h" #include "bli_l0_ft.h" // Generate function pointer arrays for tapi functions. #include "bli_l0_fpa.h" // copysc #include "bli_copysc.h" cython-blis-0.9.1/blis/_src/frame/0/bli_l0_check.c000066400000000000000000000210111427272030600215240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define object-based check functions. // #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ) \ { \ bli_l0_xxsc_check( chi, psi ); \ } GENFRONT( addsc ) GENFRONT( copysc ) GENFRONT( divsc ) GENFRONT( mulsc ) GENFRONT( sqrtsc ) GENFRONT( subsc ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ) \ { \ bli_l0_xsc_check( chi ); \ } GENFRONT( invertsc ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* norm \ ) \ { \ bli_l0_xx2sc_check( chi, norm ); \ } GENFRONT( absqsc ) GENFRONT( normfsc ) // ----------------------------------------------------------------------------- void bli_getsc_check ( obj_t* chi, double* zeta_r, double* zeta_i ) { err_t e_val; // Check object datatypes. //e_val = bli_check_noninteger_object( chi ); //bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( chi ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( chi ); bli_check_error_code( e_val ); } void bli_setsc_check ( double zeta_r, double zeta_i, obj_t* chi ) { err_t e_val; // Check object datatypes. //e_val = bli_check_floating_object( chi ); //bli_check_error_code( e_val ); // Check object dimensions. 
e_val = bli_check_scalar_object( chi ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( chi ); bli_check_error_code( e_val ); } void bli_unzipsc_check ( obj_t* chi, obj_t* zeta_r, obj_t* zeta_i ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( chi ); bli_check_error_code( e_val ); e_val = bli_check_real_object( zeta_r ); bli_check_error_code( e_val ); e_val = bli_check_real_object( zeta_i ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( zeta_r ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( zeta_i ); bli_check_error_code( e_val ); e_val = bli_check_object_real_proj_of( chi, zeta_r ); bli_check_error_code( e_val ); e_val = bli_check_object_real_proj_of( chi, zeta_i ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( chi ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( zeta_r ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( zeta_i ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( chi ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( zeta_r ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( zeta_i ); bli_check_error_code( e_val ); } void bli_zipsc_check ( obj_t* zeta_r, obj_t* zeta_i, obj_t* chi ) { err_t e_val; // Check object datatypes. e_val = bli_check_real_object( zeta_r ); bli_check_error_code( e_val ); e_val = bli_check_real_object( zeta_i ); bli_check_error_code( e_val ); e_val = bli_check_noninteger_object( chi ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( chi ); bli_check_error_code( e_val ); e_val = bli_check_object_real_proj_of( chi, zeta_r ); bli_check_error_code( e_val ); e_val = bli_check_object_real_proj_of( chi, zeta_i ); bli_check_error_code( e_val ); // Check object dimensions. 
e_val = bli_check_scalar_object( zeta_r ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( zeta_i ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( chi ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( zeta_r ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( zeta_i ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( chi ); bli_check_error_code( e_val ); } // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( chi ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( chi ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( chi ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( chi ); bli_check_error_code( e_val ); } void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( chi ); bli_check_error_code( e_val ); e_val = bli_check_noninteger_object( psi ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( psi ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( chi ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( psi ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( chi ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( psi ); bli_check_error_code( e_val ); } void bli_l0_xx2sc_check ( obj_t* chi, obj_t* absq ) { err_t e_val; // Check object datatypes. 
e_val = bli_check_noninteger_object( chi ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( absq ); bli_check_error_code( e_val ); e_val = bli_check_real_object( absq ); bli_check_error_code( e_val ); e_val = bli_check_object_real_proj_of( chi, absq ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( chi ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( absq ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( chi ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( absq ); bli_check_error_code( e_val ); } void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( chi ); bli_check_error_code( e_val ); e_val = bli_check_noninteger_object( psi ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( chi ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( psi ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( chi ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( psi ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/0/bli_l0_check.h000066400000000000000000000066021427272030600215420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based check functions. 
// /* GENTPROT is redefined once per distinct parameter list; each GENTPROT( opname ) instantiation below declares the prototype of that operation's _check function, with the name built by PASTEMAC(opname,_check). Note that the zipsc variant takes ( zeta_r, zeta_i, chi ) while the unzipsc variant takes ( chi, zeta_r, zeta_i ). */ #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- /* Prototypes of the non-macro check routines, named by operand pattern (xsc, xxsc, xx2sc, xxbsc). NOTE(review): presumably these are the shared helpers the per-operation _check functions call; their callers are not visible in this header. */ void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); cython-blis-0.9.1/blis/_src/frame/0/bli_l0_fpa.c000066400000000000000000000045061427272030600212270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define function pointer query interfaces. 
// /* For each opname, GENFRONT instantiates a function-pointer array through GENARRAY_FPA (element type <opname>_vft) and defines the query function <opname>_qfp( dt ), which returns the array entry indexed by the datatype dt. No bounds check is performed on dt here. */ #undef GENFRONT #define GENFRONT( opname ) \ \ GENARRAY_FPA( PASTECH(opname,_vft), opname ); \ \ PASTECH(opname,_vft) PASTEMAC(opname,_qfp)( num_t dt ) \ { \ return PASTECH(opname,_fpa)[ dt ]; \ } GENFRONT( absqsc ) GENFRONT( normfsc ) GENFRONT( addsc ) GENFRONT( divsc ) GENFRONT( mulsc ) GENFRONT( subsc ) GENFRONT( invertsc ) GENFRONT( sqrtsc ) GENFRONT( unzipsc ) GENFRONT( zipsc ) /* Same query interface, but the array is built with GENARRAY_FPA_I. NOTE(review): presumably the _I variant also registers the integer-typed function (bli_igetsc / bli_isetsc exist in the typed API) -- confirm against the GENARRAY_FPA_I definition. */ #undef GENFRONT #define GENFRONT( opname ) \ \ GENARRAY_FPA_I( PASTECH(opname,_vft), opname ); \ \ PASTECH(opname,_vft) PASTEMAC(opname,_qfp)( num_t dt ) \ { \ return PASTECH(opname,_fpa)[ dt ]; \ } GENFRONT( getsc ) GENFRONT( setsc ) cython-blis-0.9.1/blis/_src/frame/0/bli_l0_fpa.h000066400000000000000000000037771427272030600212450ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype function pointer query interface. // /* GENPROT( opname ) declares <opname>_qfp( num_t dt ), which returns a function pointer of type <opname>_vft. One prototype is emitted for each of the twelve level-0 operations listed below. */ #undef GENPROT #define GENPROT( opname ) \ \ PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( absqsc ) GENPROT( normfsc ) GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( subsc ) GENPROT( invertsc ) GENPROT( sqrtsc ) GENPROT( unzipsc ) GENPROT( zipsc ) GENPROT( getsc ) GENPROT( setsc ) cython-blis-0.9.1/blis/_src/frame/0/bli_l0_ft.h000066400000000000000000000100301427272030600210640ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // -- Level-0 function types --------------------------------------------------- // /* Each GENTDEF / GENTDEFR definition below declares a function-pointer typedef whose name is pasted together from ch, opname and tsuf via PASTECH2. GENTDEF variants use a single element type ctype; GENTDEFR variants additionally use ctype_r for the absq, norm and zeta_r/zeta_i parameters. The INSERT_GENTDEF / INSERT_GENTDEFR macros that instantiate them are defined elsewhere. */ // addsc, divsc, subsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( addsc ) INSERT_GENTDEF( divsc ) INSERT_GENTDEF( subsc ) // invertsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTDEF( invertsc ) // mulsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( mulsc ) // absqsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTDEFR( absqsc ) // normfsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* norm \ ); INSERT_GENTDEFR( normfsc ) // sqrtsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype* 
psi \ ); INSERT_GENTDEF( sqrtsc ) // getsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTDEF( getsc ) // setsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTDEF( setsc ) // unzipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTDEFR( unzipsc ) // zipsc /* Parameter order here is ( zeta_r, zeta_i, chi ): real part first, then imaginary part, then the destination chi. */ #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype_r* zeta_r, \ ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTDEFR( zipsc ) cython-blis-0.9.1/blis/_src/frame/0/bli_l0_oapi.c000066400000000000000000000205741427272030600214140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define object-based interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ) \ { \ bli_init_once(); \ \ num_t dt_chi; \ num_t dt_absq_c = bli_obj_dt_proj_to_complex( absq ); \ \ void* buf_chi; \ void* buf_absq = bli_obj_buffer_at_off( absq ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( chi, absq ); \ \ /* If chi is a scalar constant, use dt_absq_c to extract the address of the corresponding constant value; otherwise, use the datatype encoded within the chi object and extract the buffer at the chi offset. */ \ bli_obj_scalar_set_dt_buffer( chi, dt_absq_c, &dt_chi, &buf_chi ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \ \ f \ ( \ buf_chi, \ buf_absq \ ); \ } GENFRONT( absqsc ) GENFRONT( normfsc ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ) \ { \ bli_init_once(); \ \ num_t dt = bli_obj_dt( psi ); \ \ conj_t conjchi = bli_obj_conj_status( chi ); \ \ void* buf_chi = bli_obj_buffer_for_1x1( dt, chi ); \ void* buf_psi = bli_obj_buffer_at_off( psi ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( chi, psi ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt ); \ \ f \ ( \ conjchi, \ buf_chi, \ buf_psi \ ); \ } GENFRONT( addsc ) GENFRONT( divsc ) GENFRONT( mulsc ) GENFRONT( subsc ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* chi \ ) \ { \ bli_init_once(); \ \ num_t dt = bli_obj_dt( chi ); \ \ conj_t conjchi = bli_obj_conj_status( chi ); \ \ void* buf_chi = bli_obj_buffer_for_1x1( dt, chi ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( chi ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt ); \ \ f \ ( \ conjchi, \ buf_chi \ ); \ } GENFRONT( invertsc ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ) \ { \ bli_init_once(); \ \ num_t dt = bli_obj_dt( psi ); \ \ void* buf_chi = bli_obj_buffer_for_1x1( dt, chi ); \ void* buf_psi = bli_obj_buffer_at_off( psi ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( chi, psi ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt ); \ \ f \ ( \ buf_chi, \ buf_psi \ ); \ } GENFRONT( sqrtsc ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ) \ { \ bli_init_once(); \ \ num_t dt_chi = bli_obj_dt( chi ); \ num_t dt_def = BLIS_DCOMPLEX; \ num_t dt_use; \ \ /* If chi is a constant object, default to using the dcomplex value to maximize precision, and since we don't know if the caller needs just the real or the real and imaginary parts. */ \ void* buf_chi = bli_obj_buffer_for_1x1( dt_def, chi ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \ \ /* The _check() routine prevents integer types, so we know that chi is either a constant or an actual floating-point type. */ \ if ( bli_is_constant( dt_chi ) ) dt_use = dt_def; \ else dt_use = dt_chi; \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_use ); \ \ f \ ( \ buf_chi, \ zeta_r, \ zeta_i \ ); \ } GENFRONT( getsc ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ) \ { \ bli_init_once(); \ \ num_t dt_chi = bli_obj_dt( chi ); \ \ void* buf_chi = bli_obj_buffer_at_off( chi ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( zeta_r, zeta_i, chi ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \ \ f \ ( \ zeta_r, \ zeta_i, \ buf_chi \ ); \ } GENFRONT( setsc ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ) \ { \ bli_init_once(); \ \ num_t dt_chi; \ num_t dt_zeta_c = bli_obj_dt_proj_to_complex( zeta_r ); \ \ void* buf_chi; \ \ void* buf_zeta_r = bli_obj_buffer_at_off( zeta_r ); \ void* buf_zeta_i = bli_obj_buffer_at_off( zeta_i ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \ \ /* If chi is a scalar constant, use dt_zeta_c to extract the address of the corresponding constant value; otherwise, use the datatype encoded within the chi object and extract the buffer at the chi offset. */ \ bli_obj_scalar_set_dt_buffer( chi, dt_zeta_c, &dt_chi, &buf_chi ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \ \ f \ ( \ buf_chi, \ buf_zeta_r, \ buf_zeta_i \ ); \ } GENFRONT( unzipsc ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ) \ { \ bli_init_once(); \ \ num_t dt_chi = bli_obj_dt( chi ); \ \ void* buf_zeta_r = bli_obj_buffer_for_1x1( dt_chi, zeta_r ); \ void* buf_zeta_i = bli_obj_buffer_for_1x1( dt_chi, zeta_i ); \ \ void* buf_chi = bli_obj_buffer_at_off( chi ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( chi, zeta_r, zeta_i ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH(opname,_vft) f = PASTEMAC(opname,_qfp)( dt_chi ); \ \ f \ ( \ buf_zeta_i, \ buf_zeta_r, \ buf_chi \ ); \ } GENFRONT( zipsc ) cython-blis-0.9.1/blis/_src/frame/0/bli_l0_oapi.h000066400000000000000000000060161427272030600214140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces. 
// /* GENPROT is redefined once per distinct parameter list; each GENPROT( opname ) instantiation declares the exported (BLIS_EXPORT_BLIS) object-based entry point whose name is produced by PASTEMAC0(opname). The parameter lists mirror the definitions in the corresponding .c file: zipsc takes ( zeta_r, zeta_i, chi ) while unzipsc takes ( chi, zeta_r, zeta_i ). */ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) cython-blis-0.9.1/blis/_src/frame/0/bli_l0_tapi.c000066400000000000000000000134071427272030600214160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-like interfaces with typed operands. 
// /* addsc, divsc, subsc: copy *chi into the local chi_conj with copycjs (which receives conjchi), then apply the scalar kernel named by kername to ( chi_conj, *psi ). *chi itself is not written. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kername ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ) \ { \ bli_init_once(); \ \ ctype chi_conj; \ \ PASTEMAC(ch,copycjs)( conjchi, *chi, chi_conj ); \ PASTEMAC(ch,kername)( chi_conj, *psi ); \ } INSERT_GENTFUNC_BASIC( addsc, adds ) INSERT_GENTFUNC_BASIC( divsc, invscals ) INSERT_GENTFUNC_BASIC( subsc, subs ) /* invertsc: works on a local copy of *chi (made with copycjs), applies the kernel to that copy, then stores the copy back into *chi with copys. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kername ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi \ ) \ { \ bli_init_once(); \ \ ctype chi_conj; \ \ PASTEMAC(ch,copycjs)( conjchi, *chi, chi_conj ); \ PASTEMAC(ch,kername)( chi_conj ); \ PASTEMAC(ch,copys)( chi_conj, *chi ); \ } INSERT_GENTFUNC_BASIC( invertsc, inverts ) /* mulsc: when *chi tests equal to zero (eq0), *psi is set to zero with set0s instead of being scaled; otherwise the kernel is applied to a copy of *chi made with copycjs. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kername ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ) \ { \ bli_init_once(); \ \ if ( PASTEMAC(ch,eq0)( *chi ) ) \ { \ /* Overwrite potential Infs and NaNs. 
*/ \ PASTEMAC(ch,set0s)( *psi ); \ } \ else \ { \ ctype chi_conj; \ \ PASTEMAC(ch,copycjs)( conjchi, *chi, chi_conj ); \ PASTEMAC(ch,kername)( chi_conj, *psi ); \ } \ } INSERT_GENTFUNC_BASIC( mulsc, scals ) /* absqsc: split *chi into chi_r and chi_i with gets, then call absq2ris, which writes its first output to *absq and its second to the local absq_i. The ( void ) casts reference absq_i and chi_i without using their values (NOTE(review): presumably to silence unused-variable warnings for some type instantiations -- confirm). */ #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* absq \ ) \ { \ bli_init_once(); \ \ ctype_r chi_r; \ ctype_r chi_i; \ ctype_r absq_i; \ \ ( void )absq_i; \ \ PASTEMAC2(ch,chr,gets)( *chi, chi_r, chi_i ); \ \ /* absq = chi_r * chi_r + chi_i * chi_i; \ absq_r = 0.0; (thrown away) */ \ PASTEMAC(ch,absq2ris)( chi_r, chi_i, *absq, absq_i ); \ \ ( void )chi_i; \ } INSERT_GENTFUNCR_BASIC0( absqsc ) /* normfsc: delegate to abval2s with ( *chi, *norm ). */ #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* norm \ ) \ { \ bli_init_once(); \ \ /* norm = sqrt( chi_r * chi_r + chi_i * chi_i ); */ \ PASTEMAC2(ch,chr,abval2s)( *chi, *norm ); \ } INSERT_GENTFUNCR_BASIC0( normfsc ) /* sqrtsc: delegate to sqrt2s with ( *chi, *psi ). */ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype* psi \ ) \ { \ bli_init_once(); \ \ /* NOTE: sqrtsc/sqrt2s differs from normfsc/abval2s in the complex domain. 
*/ \ PASTEMAC(ch,sqrt2s)( *chi, *psi ); \ } INSERT_GENTFUNC_BASIC0( sqrtsc ) /* getsc: pass *chi and the caller's two doubles to the gets macro selected by PASTEMAC2(ch,d,...). */ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ) \ { \ bli_init_once(); \ \ PASTEMAC2(ch,d,gets)( *chi, *zeta_r, *zeta_i ); \ } INSERT_GENTFUNC_BASIC0( getsc ) /* setsc: pass the two doubles and *chi to the sets macro selected by PASTEMAC2(d,ch,...). */ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ) \ { \ bli_init_once(); \ \ PASTEMAC2(d,ch,sets)( zeta_r, zeta_i, *chi ); \ } INSERT_GENTFUNC_BASIC0( setsc ) /* unzipsc: same gets call as getsc, but zeta_r and zeta_i have type ctype_r instead of double. */ #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ) \ { \ bli_init_once(); \ \ PASTEMAC2(ch,chr,gets)( *chi, *zeta_r, *zeta_i ); \ } INSERT_GENTFUNCR_BASIC0( unzipsc ) /* zipsc: same sets call as setsc, but zeta_r and zeta_i are ctype_r pointers; the argument order is ( zeta_r, zeta_i, chi ). */ #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ ctype_r* zeta_r, \ ctype_r* zeta_i, \ ctype* chi \ ) \ { \ bli_init_once(); \ \ PASTEMAC2(chr,ch,sets)( *zeta_r, *zeta_i, *chi ); \ } INSERT_GENTFUNCR_BASIC0( zipsc ) // ----------------------------------------------------------------------------- /* bli_igetsc() / bli_isetsc(): hand-written getsc and setsc variants whose chi operand is a dim_t; they use the i/d forms of the gets and sets macros. */ void bli_igetsc ( dim_t* chi, double* zeta_r, double* zeta_i ) { bli_init_once(); PASTEMAC2(i,d,gets)( *chi, *zeta_r, *zeta_i ); } void bli_isetsc ( double zeta_r, double zeta_i, dim_t* chi ) { bli_init_once(); PASTEMAC2(d,i,sets)( zeta_r, zeta_i, *chi ); } cython-blis-0.9.1/blis/_src/frame/0/bli_l0_tapi.h000066400000000000000000000076111427272030600214230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-like interfaces with typed operands. 
// /* GENTPROT / GENTPROTR are redefined once per distinct parameter list; each INSERT_GENTPROT*_BASIC0( opname ) line declares the exported typed entry points for that operation, named by PASTEMAC(ch,opname). The GENTPROTR forms use ctype_r for the absq and zeta_r/zeta_i parameters. */ #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( addsc ) INSERT_GENTPROT_BASIC0( divsc ) INSERT_GENTPROT_BASIC0( mulsc ) INSERT_GENTPROT_BASIC0( subsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( invertsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTPROTR_BASIC0( absqsc ) INSERT_GENTPROTR_BASIC0( normfsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( sqrtsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTPROT_BASIC0( getsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( setsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTPROTR_BASIC0( unzipsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype_r* zeta_r, \ ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTPROTR_BASIC0( zipsc ) // ----------------------------------------------------------------------------- /* Prototypes of the getsc / setsc variants whose chi operand is a dim_t. */ BLIS_EXPORT_BLIS void bli_igetsc ( dim_t* chi, double* zeta_r, double* zeta_i ); BLIS_EXPORT_BLIS void bli_isetsc ( double zeta_r, double zeta_i, dim_t* chi ); 
cython-blis-0.9.1/blis/_src/frame/0/copysc/000077500000000000000000000000001427272030600203675ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/0/copysc/bli_copysc.c000066400000000000000000000072601427272030600226660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // NOTE: This is one of the few functions in BLIS that is defined // with heterogeneous type support. 
This is done so that we have // an operation that can be used to typecast (copy-cast) a scalar // of one datatype to a scalar of another datatype. /* FUNCPTR_T is the void*-based signature shared by every mixed-type copysc kernel; ftypes is the file-local table of such kernels, indexed below as ftypes[dt_chi][dt_psi]. NOTE(review): the table contents come from GENARRAY2_ALL, which is defined elsewhere. */ typedef void (*FUNCPTR_T) ( conj_t conjchi, void* chi, void* psi ); static FUNCPTR_T GENARRAY2_ALL(ftypes,copysc); // // Define object-based interfaces. // /* Object front-end for copysc: reads chi's conjugation status and psi's datatype and buffer, runs the _check routine when error checking is enabled, resolves chi's datatype and buffer with bli_obj_scalar_set_dt_buffer(), then calls the kernel selected by the ( dt_chi, dt_psi ) pair. */ #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ) \ { \ bli_init_once(); \ \ conj_t conjchi = bli_obj_conj_status( chi ); \ \ num_t dt_psi = bli_obj_dt( psi ); \ void* buf_psi = bli_obj_buffer_at_off( psi ); \ \ num_t dt_chi; \ void* buf_chi; \ \ FUNCPTR_T f; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( chi, psi ); \ \ /* If chi is a scalar constant, use dt_psi to extract the address of the corresponding constant value; otherwise, use the datatype encoded within the chi object and extract the buffer at the chi offset. */ \ bli_obj_scalar_set_dt_buffer( chi, dt_psi, &dt_chi, &buf_chi ); \ \ /* Index into the type combination array to extract the correct function pointer. */ \ f = ftypes[dt_chi][dt_psi]; \ \ /* Invoke the void pointer-based function. */ \ f( \ conjchi, \ buf_chi, \ buf_psi \ ); \ } GENFRONT( copysc ) // // Define BLAS-like interfaces with typed operands. // /* Mixed-type kernel: casts the void* operands to ctype_x* and ctype_y*, then copies *chi into *psi with copyjs when conjchi requests conjugation (bli_is_conj) and with copys otherwise. */ #undef GENTFUNC2 #define GENTFUNC2( ctype_x, ctype_y, chx, chy, varname ) \ \ void PASTEMAC2(chx,chy,varname) \ ( \ conj_t conjchi, \ void* chi, \ void* psi \ ) \ { \ bli_init_once(); \ \ ctype_x* chi_cast = chi; \ ctype_y* psi_cast = psi; \ \ if ( bli_is_conj( conjchi ) ) \ { \ PASTEMAC2(chx,chy,copyjs)( *chi_cast, *psi_cast ); \ } \ else \ { \ PASTEMAC2(chx,chy,copys)( *chi_cast, *psi_cast ); \ } \ } INSERT_GENTFUNC2_BASIC0( copysc ) INSERT_GENTFUNC2_MIX_D0( copysc ) INSERT_GENTFUNC2_MIX_P0( copysc ) cython-blis-0.9.1/blis/_src/frame/0/copysc/bli_copysc.h000066400000000000000000000043541427272030600226740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENFRONT( copysc ) // // Prototype BLAS-like interfaces with heterogeneous-typed operands. 
// #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \ ( \ conj_t conjchi, \ void* chi, \ void* psi \ ); INSERT_GENTPROT2_BASIC0( copysc ) INSERT_GENTPROT2_MIX_D0( copysc ) INSERT_GENTPROT2_MIX_P0( copysc ) cython-blis-0.9.1/blis/_src/frame/1/000077500000000000000000000000001427272030600170705ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/1/bli_l1v.h000066400000000000000000000050541427272030600205750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_l1v_check.h" // Define kernel function types. //#include "bli_l1v_ft_ex.h" #include "bli_l1v_ft_ker.h" // Prototype object APIs (expert and non-expert). #include "bli_oapi_ex.h" #include "bli_l1v_oapi.h" #include "bli_xapi_undef.h" #include "bli_oapi_ba.h" #include "bli_l1v_oapi.h" #include "bli_xapi_undef.h" // Prototype typed APIs (expert and non-expert). #include "bli_tapi_ex.h" #include "bli_l1v_tapi.h" #include "bli_l1v_ft.h" #include "bli_xapi_undef.h" #include "bli_tapi_ba.h" #include "bli_l1v_tapi.h" #include "bli_l1v_ft.h" #include "bli_xapi_undef.h" // Generate function pointer arrays for tapi functions (expert only). #include "bli_l1v_fpa.h" // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_check.c000066400000000000000000000267021427272030600217300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define object-based check functions. 
// #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ) \ { \ bli_l1v_xy_check( x, y ); \ } GENFRONT( addv ) GENFRONT( copyv ) GENFRONT( subv ) GENFRONT( swapv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ) \ { \ bli_l1v_xi_check( x, index ); \ } GENFRONT( amaxv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ) \ { \ bli_l1v_axby_check( alpha, x, beta, y ); \ } GENFRONT( axpbyv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ) \ { \ bli_l1v_axy_check( alpha, x, y ); \ } GENFRONT( axpyv ) GENFRONT( scal2v ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ) \ { \ bli_l1v_dot_check( &BLIS_ONE, x, y, &BLIS_ONE, rho ); \ } GENFRONT( dotv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ) \ { \ bli_l1v_dot_check( alpha, x, y, beta, rho ); \ } GENFRONT( dotxv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ) \ { \ bli_l1v_x_check( x ); \ } GENFRONT( invertv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ) \ { \ bli_l1v_ax_check( alpha, x ); \ } GENFRONT( scalv ) GENFRONT( setv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ) \ { \ bli_l1v_xby_check( x, beta, y ); \ } GENFRONT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ) { err_t e_val; // Check object datatypes. 
e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( x, y ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( x, y ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); } void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( x, y ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( x, y ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( alpha ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); } void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ) { err_t e_val; // Check object datatypes. 
e_val = bli_check_noninteger_object( beta ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( x, y ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( beta ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( x, y ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( beta ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); } void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_noninteger_object( beta ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( x, y ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( beta ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( x, y ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). 
e_val = bli_check_object_buffer( alpha ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( beta ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); } void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); e_val = bli_check_noninteger_object( beta ); bli_check_error_code( e_val ); e_val = bli_check_noninteger_object( rho ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( rho ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( x, y ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( beta ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( rho ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( x, y ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( alpha ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( beta ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( rho ); bli_check_error_code( e_val ); } void bli_l1v_x_check ( obj_t* x ) { err_t e_val; // Check object datatypes. 
e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); } void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( alpha ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); } void bli_l1v_xi_check ( obj_t* x, obj_t* index ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_integer_object( index ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( index ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( index ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( index ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_check.h000066400000000000000000000101671427272030600217330ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); 
cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_fpa.c000066400000000000000000000044121427272030600214130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define function pointer query interfaces. 
// #undef GENFRONT #define GENFRONT( opname ) \ \ GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \ PASTECH(opname,BLIS_TAPI_EX_SUF) ); \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \ { \ return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \ } GENFRONT( addv ) GENFRONT( copyv ) GENFRONT( subv ) GENFRONT( amaxv ) GENFRONT( axpbyv ) GENFRONT( axpyv ) GENFRONT( scal2v ) GENFRONT( dotv ) GENFRONT( dotxv ) GENFRONT( invertv ) GENFRONT( scalv ) GENFRONT( setv ) GENFRONT( swapv ) GENFRONT( xpbyv ) cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_fpa.h000066400000000000000000000040731427272030600214230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_ft.h000066400000000000000000000116641427272030600212720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( 
ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_ft_ker.h000066400000000000000000000125271427272030600221320ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_L1V_FT_KER_H #define BLIS_L1V_FT_KER_H // // -- Level-1v kernel function types ------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict index, \ cntx_t* cntx \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define 
GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( xpbyv ) #endif cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_ker.h000066400000000000000000000063141427272030600214360ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Define template prototypes for level-1v kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l1v_ker_prot.h. 
#undef GENTPROT #define GENTPROT ADDV_KER_PROT INSERT_GENTPROT_BASIC0( addv_ker_name ) #undef GENTPROT #define GENTPROT AMAXV_KER_PROT INSERT_GENTPROT_BASIC0( amaxv_ker_name ) #undef GENTPROT #define GENTPROT AXPBYV_KER_PROT INSERT_GENTPROT_BASIC0( axpbyv_ker_name ) #undef GENTPROT #define GENTPROT AXPYV_KER_PROT INSERT_GENTPROT_BASIC0( axpyv_ker_name ) #undef GENTPROT #define GENTPROT COPYV_KER_PROT INSERT_GENTPROT_BASIC0( copyv_ker_name ) #undef GENTPROT #define GENTPROT DOTV_KER_PROT INSERT_GENTPROT_BASIC0( dotv_ker_name ) #undef GENTPROT #define GENTPROT DOTXV_KER_PROT INSERT_GENTPROT_BASIC0( dotxv_ker_name ) #undef GENTPROT #define GENTPROT INVERTV_KER_PROT INSERT_GENTPROT_BASIC0( invertv_ker_name ) #undef GENTPROT #define GENTPROT SCALV_KER_PROT INSERT_GENTPROT_BASIC0( scalv_ker_name ) #undef GENTPROT #define GENTPROT SCAL2V_KER_PROT INSERT_GENTPROT_BASIC0( scal2v_ker_name ) #undef GENTPROT #define GENTPROT SETV_KER_PROT INSERT_GENTPROT_BASIC0( setv_ker_name ) #undef GENTPROT #define GENTPROT SUBV_KER_PROT INSERT_GENTPROT_BASIC0( subv_ker_name ) #undef GENTPROT #define GENTPROT SWAPV_KER_PROT INSERT_GENTPROT_BASIC0( swapv_ker_name ) #undef GENTPROT #define GENTPROT XPBYV_KER_PROT INSERT_GENTPROT_BASIC0( xpbyv_ker_name ) cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_ker_prot.h000066400000000000000000000131731427272030600225030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Define template prototypes for level-1v kernels. 
// #define ADDV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define AMAXV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict index, \ cntx_t* restrict cntx \ ); \ #define AXPBYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); \ #define AXPYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); \ #define COPYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define DOTV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ cntx_t* restrict cntx \ ); \ #define DOTXV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ cntx_t* restrict cntx \ ); \ #define INVERTV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ cntx_t* restrict cntx \ ); \ #define SCALV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ cntx_t* restrict cntx \ ); \ #define SCAL2V_KER_PROT( ctype, ch, opname ) \ \ void 
PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); \ #define SETV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ cntx_t* restrict cntx \ ); \ #define SUBV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define SWAPV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); \ #define XPBYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); \ cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_oapi.c000066400000000000000000000340111427272030600215730ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Guard the function definitions so that they are only compiled when // #included from files that define the object API macros. #ifdef BLIS_ENABLE_OAPI // // Define object-based interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ conj_t conjx = bli_obj_conj_status( x ); \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t inc_x = bli_obj_vector_inc( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t inc_y = bli_obj_vector_inc( y ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x, y ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ conjx, \ n, \ buf_x, inc_x, \ buf_y, inc_y, \ cntx, \ rntm \ ); \ } GENFRONT( addv ) GENFRONT( copyv ) GENFRONT( subv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ \ void* buf_index = bli_obj_buffer_at_off( index ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x, index ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ n, \ buf_x, incx, \ buf_index, \ cntx, \ rntm \ ); \ } GENFRONT( amaxv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ conj_t conjx = bli_obj_conj_status( x ); \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t inc_x = bli_obj_vector_inc( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t inc_y = bli_obj_vector_inc( y ); \ \ void* buf_alpha; \ void* buf_beta; \ \ obj_t alpha_local; \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, x, beta, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). 
*/ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ beta, &beta_local ); \ buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ buf_beta = bli_obj_buffer_for_1x1( dt, &beta_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ conjx, \ n, \ buf_alpha, \ buf_x, inc_x, \ buf_beta, \ buf_y, inc_y, \ cntx, \ rntm \ ); \ } GENFRONT( axpbyv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ conj_t conjx = bli_obj_conj_status( x ); \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t inc_x = bli_obj_vector_inc( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t inc_y = bli_obj_vector_inc( y ); \ \ void* buf_alpha; \ \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, x, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ conjx, \ n, \ buf_alpha, \ buf_x, inc_x, \ buf_y, inc_y, \ cntx, \ rntm \ ); \ } GENFRONT( axpyv ) GENFRONT( scal2v ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ conj_t conjx = bli_obj_conj_status( x ); \ conj_t conjy = bli_obj_conj_status( y ); \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t inc_x = bli_obj_vector_inc( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t inc_y = bli_obj_vector_inc( y ); \ void* buf_rho = bli_obj_buffer_at_off( rho ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x, y, rho ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ conjx, \ conjy, \ n, \ buf_x, inc_x, \ buf_y, inc_y, \ buf_rho, \ cntx, \ rntm \ ); \ } GENFRONT( dotv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ conj_t conjx = bli_obj_conj_status( x ); \ conj_t conjy = bli_obj_conj_status( y ); \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t inc_x = bli_obj_vector_inc( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t inc_y = bli_obj_vector_inc( y ); \ void* buf_rho = bli_obj_buffer_at_off( rho ); \ \ void* buf_alpha; \ void* buf_beta; \ \ obj_t alpha_local; \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, x, y, beta, rho ); \ \ /* Create 
local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ beta, &beta_local ); \ buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ buf_beta = bli_obj_buffer_for_1x1( dt, &beta_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ conjx, \ conjy, \ n, \ buf_alpha, \ buf_x, inc_x, \ buf_y, inc_y, \ buf_beta, \ buf_rho, \ cntx, \ rntm \ ); \ } GENFRONT( dotxv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t inc_x = bli_obj_vector_inc( x ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ n, \ buf_x, inc_x, \ cntx, \ rntm \ ); \ } GENFRONT( invertv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ /* conj_t conjalpha = bli_obj_conj_status( alpha ); */ \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t inc_x = bli_obj_vector_inc( x ); \ \ void* buf_alpha; \ \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, x ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ BLIS_NO_CONJUGATE, /* internal conjugation applied during copy-cast. */ \ n, \ buf_alpha, \ buf_x, inc_x, \ cntx, \ rntm \ ); \ } GENFRONT( scalv ) GENFRONT( setv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t inc_x = bli_obj_vector_inc( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t inc_y = bli_obj_vector_inc( y ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x, y ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ n, \ buf_x, inc_x, \ buf_y, inc_y, \ cntx, \ rntm \ ); \ } GENFRONT( swapv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ conj_t conjx = bli_obj_conj_status( x ); \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t inc_x = bli_obj_vector_inc( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t inc_y = bli_obj_vector_inc( y ); \ \ void* buf_beta; \ \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x, beta, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ beta, &beta_local ); \ buf_beta = bli_obj_buffer_for_1x1( dt, &beta_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ conjx, \ n, \ buf_x, inc_x, \ buf_beta, \ buf_y, inc_y, \ cntx, \ rntm \ ); \ } GENFRONT( xpbyv ) #endif cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_oapi.h000066400000000000000000000076521427272030600216130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyv ) cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_oapi_ba.c000066400000000000000000000036701427272030600222440ustar00rootroot00000000000000/* BLIS An object-based 
framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // omitting expert parameters. #include "bli_oapi_ba.h" // Define the macro protecting the object API definitions. #define BLIS_ENABLE_OAPI // Include the object API definitions here. 
#include "bli_l1v_oapi.c" cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_oapi_ex.c000066400000000000000000000036661427272030600223030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // having expert parameters. #include "bli_oapi_ex.h" // Define the macro protecting the object API definitions. 
#define BLIS_ENABLE_OAPI // Include the object API definitions here. #include "bli_l1v_oapi.c" cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_tapi.c000066400000000000000000000214661427272030600216120ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Guard the function definitions so that they are only compiled when // #included from files that define the typed API macros. 
#ifdef BLIS_ENABLE_TAPI // // Define BLAS-like interfaces with typed operands. // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ f \ ( \ conjx, \ n, \ x, incx, \ y, incy, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC( addv, BLIS_ADDV_KER ) INSERT_GENTFUNC_BASIC( copyv, BLIS_COPYV_KER ) INSERT_GENTFUNC_BASIC( subv, BLIS_SUBV_KER ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ f \ ( \ n, \ x, incx, \ index, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC( amaxv, BLIS_AMAXV_KER ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ f \ ( \ conjx, \ n, \ alpha, \ x, incx, \ beta, \ y, incy, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC( axpbyv, BLIS_AXPBYV_KER ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) \ cntx = bli_gks_query_cntx(); \ \ PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ f \ ( \ conjx, \ n, \ alpha, \ x, incx, \ y, incy, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC( axpyv, BLIS_AXPYV_KER ) INSERT_GENTFUNC_BASIC( scal2v, BLIS_SCAL2V_KER ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ f \ ( \ conjx, \ conjy, \ n, \ x, incx, \ y, incy, \ rho, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC( dotv, BLIS_DOTV_KER ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ f \ ( \ conjx, \ conjy, \ n, \ alpha, \ x, incx, \ y, incy, \ beta, \ rho, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC( dotxv, BLIS_DOTXV_KER ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ f \ ( \ n, \ x, incx, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC( invertv, BLIS_INVERTV_KER ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ f \ ( \ conjalpha, \ n, \ alpha, \ x, incx, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC( scalv, BLIS_SCALV_KER ) INSERT_GENTFUNC_BASIC( setv, BLIS_SETV_KER ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ f \ ( \ n, \ x, incx, \ y, incy, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC( swapv, BLIS_SWAPV_KER ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ f \ ( \ conjx, \ n, \ x, incx, \ beta, \ y, incy, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC( xpbyv, BLIS_XPBYV_KER ) #endif cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_tapi.h000066400000000000000000000116261427272030600216140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS 
void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_tapi_ba.c000066400000000000000000000036661427272030600222560ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // omitting expert parameters. #include "bli_tapi_ba.h" // Define the macro protecting the typed API definitions. #define BLIS_ENABLE_TAPI // Include the typed API definitions here. 
#include "bli_l1v_tapi.c" cython-blis-0.9.1/blis/_src/frame/1/bli_l1v_tapi_ex.c000066400000000000000000000036641427272030600223060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // having expert parameters. #include "bli_tapi_ex.h" // Define the macro protecting the typed API definitions. #define BLIS_ENABLE_TAPI // Include the typed API definitions here. 
#include "bli_l1v_tapi.c" cython-blis-0.9.1/blis/_src/frame/1/other/000077500000000000000000000000001427272030600202115ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/1/other/packv/000077500000000000000000000000001427272030600213155ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/1/other/packv/bli_packv.c000066400000000000000000000032341427272030600234150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" cython-blis-0.9.1/blis/_src/frame/1/other/packv/bli_packv.h000066400000000000000000000034331427272030600234230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "bli_packv_cntl.h" #include "bli_packv_check.h" #include "bli_packv_init.h" #include "bli_packv_int.h" #include "bli_packv_unb_var1.h" cython-blis-0.9.1/blis/_src/frame/1/other/packv/bli_packv_check.c000066400000000000000000000037711427272030600245600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_packv_check ( obj_t* c, obj_t* p, cntx_t* cntx ) { err_t e_val; // Check object datatypes. 
e_val = bli_check_floating_object( c ); bli_check_error_code( e_val ); // Check object dimensions. // We don't check for conformal dimensions between c and p because // p has not yet been initialized. } cython-blis-0.9.1/blis/_src/frame/1/other/packv/bli_packv_check.h000066400000000000000000000033451427272030600245620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ void bli_packv_check ( obj_t* c, obj_t* p, cntx_t* cntx ); cython-blis-0.9.1/blis/_src/frame/1/other/packv/bli_packv_cntl.c000066400000000000000000000050561427272030600244410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" cntl_t* bli_packv_cntl_obj_create ( void_fp var_func, void_fp packv_var_func, bszid_t bmid, pack_t pack_schema, cntl_t* sub_node ) { cntl_t* cntl; packv_params_t* params; // Allocate a packv_params_t struct. params = bli_malloc_intl( sizeof( packv_params_t ) ); // Initialize the packv_params_t struct. params->size = sizeof( packv_params_t ); params->packv_var_func = packv_var_func; params->bmid = bmid; params->pack_schema = pack_schema; // It's important that we set the bszid field to BLIS_NO_PART to indicate // that no blocksize partitioning is performed. bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. cntl = bli_cntl_create_node ( BLIS_NO_PART, var_func, params, sub_node ); return cntl; } cython-blis-0.9.1/blis/_src/frame/1/other/packv/bli_packv_cntl.h000066400000000000000000000045641427272030600244510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ struct packv_params_s { uint64_t size packv_var_oft* var_func; bszid_t bmid; pack_t pack_schema; }; typedef struct packv_params_s packv_params_t; #define bli_cntl_packv_params_var_func( cntl ) \ \ ( (packv_params_t*)( cntl->params )->var_func ) #define bli_cntl_packv_params_bmid( cntl ) \ \ ( (packv_params_t*)( cntl->params )->bmid_m ) #define bli_cntl_packv_params_pack_schema( cntl ) \ \ ( (packv_params_t*)( cntl->params )->pack_schema ) // ----------------------------------------------------------------------------- cntl_t* bli_packv_cntl_obj_create ( void_fp var_func, void_fp packv_var_func, bszid_t bmid, pack_t pack_schema, cntl_t* sub_node ); cython-blis-0.9.1/blis/_src/frame/1/other/packv/bli_packv_init.c000066400000000000000000000145371427272030600244500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_packv_init ( obj_t* a, obj_t* p, cntx_t* cntx, packv_t* cntl ) { // The purpose of packm_init() is to initialize an object P so that // a source object A can be packed into P via one of the packv // implementations. This initialization includes acquiring a suitable // block of memory from the memory allocator, if such a block of memory // has not already been allocated previously. pack_t pack_schema; bszid_t bmult_id; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_packv_check( a, p, cntx ); // First check if we are to skip this operation because the control tree // is NULL, and if so, simply alias the object to its packed counterpart. if ( bli_cntl_is_noop( cntl ) ) { bli_obj_alias_to( a, p ); return; } // At this point, we can be assured that cntl is not NULL. Let us now // check to see if the object has already been packed to the desired // schema (as encoded in the control tree). If so, we can alias and // return, as above. 
// Note that in most cases, bli_obj_pack_schema() will return // BLIS_NOT_PACKED and thus packing will be called for (but in some // cases packing has already taken place). Also, not all combinations // of current pack status and desired pack schema are valid. if ( bli_obj_pack_schema( a ) == cntl_pack_schema( cntl ) ) { bli_obj_alias_to( a, p ); return; } // Now, if we are not skipping the pack operation, then the only question // left is whether we are to typecast vector a before packing. if ( bli_obj_dt( a ) != bli_obj_target_dt( a ) ) bli_abort(); // Extract various fields from the control tree and pass them in // explicitly into _init_pack(). This allows external code generators // the option of bypassing usage of control trees altogether. pack_schema = cntl_pack_schema( cntl ); bmult_id = cntl_bmid( cntl ); // Initialize object p for the final packed vector. bli_packv_init_pack ( pack_schema, bmult_id, &a, p, cntx ); // Now p is ready to be packed. } siz_t bli_packv_init_pack ( pack_t schema, bszid_t bmult_id, obj_t* a, obj_t* p, cntx_t* cntx ) { num_t dt = bli_obj_dt( a ); dim_t dim_a = bli_obj_vector_dim( a ); dim_t bmult = bli_cntx_get_blksz_def_dt( dt, bmult_id, cntx ); pba_t* pba = bli_cntx_pba( cntx ); #if 0 mem_t* mem_p; #endif dim_t m_p_pad; siz_t size_p; inc_t rs_p, cs_p; void* buf; // We begin by copying the basic fields of c. bli_obj_alias_to( a, p ); // Update the dimensions. bli_obj_set_dims( dim_a, 1, p ); // Reset the view offsets to (0,0). bli_obj_set_offs( 0, 0, p ); // Set the pack schema in the p object to the value in the control tree // node. bli_obj_set_pack_schema( schema, p ); // Compute the dimensions padded by the dimension multiples. m_p_pad = bli_align_dim_to_mult( bli_obj_vector_dim( p ), bmult ); // Compute the size of the packed buffer. size_p = m_p_pad * 1 * bli_obj_elem_size( p ); #if 0 // Extract the address of the mem_t object within p that will track // properties of the packed buffer. 
mem_p = bli_obj_pack_mem( *p ); if ( bli_mem_is_unalloc( mem_p ) ) { // If the mem_t object of p has not yet been allocated, then acquire // a memory block suitable for a vector. bli_pba_acquire_v( pba, size_p, mem_p ); } else { // If the mem_t object has already been allocated, then release and // re-acquire the memory so there is sufficient space. if ( bli_mem_size( mem_p ) < size_p ) { bli_pba_release( mem_p ); bli_pba_acquire_v( pba, size_p, mem_p ); } } // Grab the buffer address from the mem_t object and copy it to the // main object buffer field. (Sometimes this buffer address will be // copied when the value is already up-to-date, because it persists // in the main object buffer field across loop iterations.) buf = bli_mem_buffer( mem_p ); bli_obj_set_buffer( buf, p ); #endif // Save the padded (packed) dimensions into the packed object. bli_obj_set_padded_dims( m_p_pad, 1, p ); // Set the row and column strides of p based on the pack schema. if ( schema == BLIS_PACKED_VECTOR ) { // Set the strides to reflect a column-stored vector. Note that the // column stride may never be used, and is only useful to determine // how much space beyond the vector would need to be zero-padded, if // zero-padding was needed. rs_p = 1; cs_p = bli_obj_padded_length( p ); bli_obj_set_strides( rs_p, cs_p, p ); } return size_p; } #if 0 void bli_packv_release ( obj_t* p, packv_t* cntl ) { if ( !bli_cntl_is_noop( cntl ) ) bli_obj_release_pack( p ); } #endif cython-blis-0.9.1/blis/_src/frame/1/other/packv/bli_packv_init.h000066400000000000000000000036321427272030600244470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_packv_init ( obj_t* a, obj_t* p, cntx_t* cntx, packv_t* cntl ); siz_t bli_packv_init_pack ( pack_t pack_schema, bszid_t bmult_id, obj_t* a, obj_t* p, cntx_t* cntx ); cython-blis-0.9.1/blis/_src/frame/1/other/packv/bli_packv_int.c000066400000000000000000000104341427272030600242670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T packv_fp typedef void (*FUNCPTR_T)( obj_t* a, obj_t* p, cntx_t* cntx, packv_t* cntl ); static FUNCPTR_T vars[1][3] = { // unblocked optimized unblocked blocked { bli_packv_unb_var1, NULL, NULL } }; void bli_packv_int ( obj_t* a, obj_t* p, cntx_t* cntx, cntl_t* cntl ) { #if 0 varnum_t n; impl_t i; #endif packv_var_oft f; // !!! // DEFINE packv_var_oft type. // !!! // Check parameters. 
if ( bli_error_checking_is_enabled() ) bli_packv_check( a, p, cntx ); // Sanity check; A should never have a zero dimension. If we must support // it, then we should fold it into the next alias-and-early-exit block. //if ( bli_obj_has_zero_dim( a ) ) bli_abort(); // First check if we are to skip this operation because the control tree // is NULL. We return without taking any action because a was already // aliased to p in packv_init(). if ( bli_cntl_is_noop( cntl ) ) { return; } // Let us now check to see if the object has already been packed. First // we check if it has been packed to an unspecified (row or column) // format, in which case we can return, since by now aliasing has already // taken place in packv_init(). // NOTE: The reason we don't need to even look at the control tree in // this case is as follows: an object's pack status is only set to // BLIS_PACKED_UNSPEC for situations when the actual format used is // not important, as long as its packed into contiguous rows or // contiguous columns. A good example of this is packing for matrix // operands in the level-2 operations. if ( bli_obj_pack_schema( a ) == BLIS_PACKED_UNSPEC ) { return; } // At this point, we can be assured that cntl is not NULL. Now we check // if the object has already been packed to the desired schema (as en- // coded in the control tree). If so, we can return, as above. // NOTE: In most cases, an object's pack status will be BLIS_NOT_PACKED // and thus packing will be called for (but in some cases packing has // already taken place, or does not need to take place, and so that will // be indicated by the pack status). Also, not all combinations of // current pack status and desired pack schema are valid. if ( bli_obj_pack_schema( a ) == cntl_pack_schema( cntl ) ) { return; } // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. 
f = vars[n][i]; // Invoke the variant. f( a, p, cntx, cntl ); } cython-blis-0.9.1/blis/_src/frame/1/other/packv/bli_packv_int.h000066400000000000000000000033751427272030600243020ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ void bli_packv_int ( obj_t* c, obj_t* p, cntx_t* cntx, packv_t* cntl ); cython-blis-0.9.1/blis/_src/frame/1/other/packv/bli_packv_unb_var1.c000066400000000000000000000061171427272030600252150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define FUNCPTR_T packv_fp typedef void (*FUNCPTR_T)( dim_t m, void* c, inc_t incc, void* p, inc_t incp, cntx_t* cntx ); static FUNCPTR_T GENARRAY(ftypes,packv_unb_var1); void bli_packv_unb_var1( obj_t* c, obj_t* p, cntx_t* cntx, packv_t* cntl ) { num_t dt_cp = bli_obj_dt( c ); dim_t dim_p = bli_obj_vector_dim( p ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t incc = bli_obj_vector_inc( c ); void* buf_p = bli_obj_buffer_at_off( p ); inc_t incp = bli_obj_vector_inc( p ); FUNCPTR_T f; // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_cp]; // Invoke the function. f ( dim_p, buf_c, incc, buf_p, incp, cntx ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t m, \ void* c, inc_t incc, \ void* p, inc_t incp, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ BLIS_NO_CONJUGATE, \ m, \ c, incc, \ p, incp, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC0( packv_unb_var1 ) cython-blis-0.9.1/blis/_src/frame/1/other/packv/bli_packv_unb_var1.h000066400000000000000000000040441427272030600252170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_packv_unb_var1( obj_t* c, obj_t* p, cntx_t* cntx, packv_t* cntl ); #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t m, \ void* c, inc_t incc, \ void* p, inc_t incp, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC( packv_unb_var1 ) cython-blis-0.9.1/blis/_src/frame/1/other/scalv/000077500000000000000000000000001427272030600213215ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/1/other/scalv/bli_scalv_cntl.c000066400000000000000000000045651427272030600244550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" scalv_t* scalv_cntl = NULL; void bli_scalv_cntl_init() { scalv_cntl = bli_scalv_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1 ); } void bli_scalv_cntl_finalize() { bli_cntl_free_node( scalv_cntl ); } scalv_t* bli_scalv_cntl_obj_create( impl_t impl_type, varnum_t var_num ) { scalv_t* cntl; cntl = ( scalv_t* ) bli_malloc_intl( sizeof(scalv_t) ); cntl->impl_type = impl_type; cntl->var_num = var_num; return cntl; } void bli_scalv_cntl_obj_init( scalv_t* cntl, impl_t impl_type, varnum_t var_num ) { cntl->impl_type = impl_type; cntl->var_num = var_num; } cython-blis-0.9.1/blis/_src/frame/1/other/scalv/bli_scalv_cntl.h000066400000000000000000000042131427272030600244500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ struct scalv_s { impl_t impl_type; varnum_t var_num; }; typedef struct scalv_s scalv_t; #define bli_cntl_sub_scalv( cntl ) cntl->sub_scalv void bli_scalv_cntl_init( void ); void bli_scalv_cntl_finalize( void ); scalv_t* bli_scalv_cntl_obj_create( impl_t impl_type, varnum_t var_num ); void bli_scalv_cntl_obj_init( scalv_t* cntl, impl_t impl_type, varnum_t var_num ); cython-blis-0.9.1/blis/_src/frame/1/other/scalv/bli_scalv_int.c000066400000000000000000000053741427272030600243060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" typedef void (*FUNCPTR_T)( obj_t* alpha, obj_t* x, cntx_t* cntx ); static FUNCPTR_T vars[1][3] = { // unblocked optimized unblocked blocked { bli_scalv_ex, bli_scalv_ex, NULL } }; void bli_scalv_int( obj_t* alpha, obj_t* x, cntx_t* cntx, scalv_t* cntl ) { varnum_t n; impl_t i; FUNCPTR_T f; // Return early if one of the matrix operands has a zero dimension. if ( bli_obj_has_zero_dim( x ) ) return; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_scalv_check( alpha, x ); // First check if we are to skip this operation. if ( bli_cntl_is_noop( cntl ) ) return; // Return early if the alpha scalar equals one. if ( bli_obj_equals( alpha, &BLIS_ONE ) ) return; // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; // Invoke the variant. f( alpha, x, cntx ); } cython-blis-0.9.1/blis/_src/frame/1/other/scalv/bli_scalv_int.h000066400000000000000000000034261427272030600243070ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_scalv_int( obj_t* alpha, obj_t* x, cntx_t* cntx, scalv_t* cntl ); cython-blis-0.9.1/blis/_src/frame/1/other/unpackv/000077500000000000000000000000001427272030600216605ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/1/other/unpackv/bli_unpackv.c000066400000000000000000000032341427272030600243230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" cython-blis-0.9.1/blis/_src/frame/1/other/unpackv/bli_unpackv.h000066400000000000000000000034101427272030600243240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_unpackv_cntl.h" #include "bli_unpackv_check.h" #include "bli_unpackv_int.h" #include "bli_unpackv_unb_var1.h" cython-blis-0.9.1/blis/_src/frame/1/other/unpackv/bli_unpackv_check.c000066400000000000000000000042271427272030600254630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_unpackv_check ( obj_t* p, obj_t* a, cntx_t* cntx ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( p ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( a ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_equal_vector_lengths( p, a ); bli_check_error_code( e_val ); // Check pack status. 
e_val = bli_check_packv_schema_on_unpack( p ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/1/other/unpackv/bli_unpackv_check.h000066400000000000000000000033471427272030600254720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ void bli_unpackv_check ( obj_t* p, obj_t* a, cntx_t* cntx ); cython-blis-0.9.1/blis/_src/frame/1/other/unpackv/bli_unpackv_cntl.c000066400000000000000000000046321427272030600253460ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" unpackv_t* unpackv_cntl = NULL; void bli_unpackv_cntl_init() { unpackv_cntl = bli_unpackv_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1 ); } void bli_unpackv_cntl_finalize() { bli_cntl_free_node( unpackv_cntl ); } unpackv_t* bli_unpackv_cntl_obj_create( impl_t impl_type, varnum_t var_num ) { unpackv_t* cntl; cntl = ( unpackv_t* ) bli_malloc_intl( sizeof(unpackv_t) ); cntl->impl_type = impl_type; cntl->var_num = var_num; return cntl; } void bli_unpackv_cntl_obj_init( unpackv_t* cntl, impl_t impl_type, varnum_t var_num ) { cntl->impl_type = impl_type; cntl->var_num = var_num; } cython-blis-0.9.1/blis/_src/frame/1/other/unpackv/bli_unpackv_cntl.h000066400000000000000000000046371427272030600253600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ struct unpackv_s { impl_t impl_type; varnum_t var_num; }; typedef struct unpackv_s unpackv_t; #define bli_cntl_sub_unpackv( cntl ) cntl->sub_unpackv #define bli_cntl_sub_unpackv_x( cntl ) cntl->sub_unpackv_x #define bli_cntl_sub_unpackv_x1( cntl ) cntl->sub_unpackv_x1 #define bli_cntl_sub_unpackv_y( cntl ) cntl->sub_unpackv_y #define bli_cntl_sub_unpackv_y1( cntl ) cntl->sub_unpackv_y1 void bli_unpackv_cntl_init( void ); void bli_unpackv_cntl_finalize( void ); unpackv_t* bli_unpackv_cntl_obj_create( impl_t impl_type, varnum_t var_num ); void bli_unpackv_cntl_obj_init( unpackv_t* cntl, impl_t impl_type, varnum_t var_num ); cython-blis-0.9.1/blis/_src/frame/1/other/unpackv/bli_unpackv_int.c000066400000000000000000000174301427272030600252000ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T unpackv_fp typedef void (*FUNCPTR_T)( obj_t* p, obj_t* a, cntx_t* cntx, unpackv_t* cntl ); static FUNCPTR_T vars[1][3] = { // unblocked optimized unblocked blocked { bli_unpackv_unb_var1, NULL, NULL } }; void bli_unpackv_int( obj_t* p, obj_t* a, cntx_t* cntx, unpackv_t* cntl ) { // The unpackv operation consists of an optional casting post-process. // (This post-process is analogous to the cast pre-process in packv.) // Here are the following possible ways unpackv can execute: // 1. unpack and cast: Unpack to a temporary vector c and then cast // c to a. // 2. unpack only: Unpack directly to vector a since typecasting is // not needed. // 3. cast only: Not yet supported / not used. // 4. no-op: The control tree directs us to skip the unpack operation // entirely. No action is taken. obj_t c; varnum_t n; impl_t i; FUNCPTR_T f; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_unpackv_check( p, a, cntx ); // Sanity check; A should never have a zero dimension. 
If we must support // it, then we should fold it into the next alias-and-early-exit block. if ( bli_obj_has_zero_dim( a ) ) bli_abort(); // First check if we are to skip this operation because the control tree // is NULL, and if so, simply return. if ( bli_cntl_is_noop( cntl ) ) { return; } // If p was aliased to a during the pack stage (because it was already // in an acceptable packed/contiguous format), then no unpack is actually // necessary, so we return. if ( bli_obj_is_alias_of( p, a ) ) { return; } // Now, if we are not skipping the unpack operation, then the only // question left is whether we are to typecast vector a after unpacking. if ( bli_obj_dt( p ) != bli_obj_dt( a ) ) bli_abort(); /* if ( bli_obj_dt( p ) != bli_obj_dt( a ) ) { // Initialize an object c for the intermediate typecast vector. bli_unpackv_init_cast( p, a, &c ); } else */ { // If no cast is needed, then aliasing object c to the original // vector serves as a minor optimization. This causes the unpackv // implementation to unpack directly into vector a. bli_obj_alias_to( a, &c ); } // Now we are ready to proceed with the unpacking. // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; // Invoke the variant. f( p, &c, cntx, cntl ); // Now, if necessary, we cast the contents of c to vector a. If casting // was not necessary, then we are done because the call to the unpackv // implementation would have unpacked directly to vector a. /* if ( bli_obj_dt( p ) != bli_obj_dt( a ) ) { // Copy/typecast vector c to vector a. // NOTE: Here, we use copynzv instead of copym because, in the cases // where we are unpacking/typecasting a real vector c to a complex // vector a, we want to touch only the real components of a, rather // than also set the imaginary components to zero. 
This comes about // because of the fact that, if we are unpacking real-to-complex, // then it is because all of the computation occurred in the real // domain, and so we would want to leave whatever imaginary values // there are in vector a untouched. Notice that for unpackings that // entail complex-to-complex data movements, the copynzv operation // behaves exactly as copym, so no use cases are lost (at least none // that I can think of). bli_copynzv( &c, a ); // NOTE: The above code/comment is outdated. What should happen is // as follows: // - If dt(a) is complex and dt(p) is real, then create an alias of // a and then tweak it so that it looks like a real domain object. // This will involve: // - projecting the datatype to real domain // - scaling both the row and column strides by 2 // ALL OF THIS should be done in the front-end, NOT here, as // unpackv() won't even be needed in that case. } */ } /* void bli_unpackv_init_cast( obj_t* p, obj_t* a, obj_t* c ) { // The idea here is that we want to create an object c that is identical // to object a, except that: // (1) the storage datatype of c is equal to the target datatype of a, // with the element size of c adjusted accordingly, // (2) object c is marked as being stored in a standard, contiguous // format (ie: a column vector), // (3) the view offset of c is reset to (0,0), and // (4) object c's main buffer is set to a new memory region acquired // from the memory manager, or extracted from p if a mem entry is // already available. (After acquring a mem entry from the memory // manager, it is cached within p for quick access later on.) num_t dt_targ_a = bli_obj_target_dt( a ); dim_t dim_a = bli_obj_vector_dim( a ); siz_t elem_size_c = bli_dt_size( dt_targ_a ); // We begin by copying the basic fields of a. bli_obj_alias_to( a, c ); // Update datatype and element size fields. bli_obj_set_dt( dt_targ_a, c ); bli_obj_set_elem_size( elem_size_c, c ); // Update the strides and dimensions. 
We set the increments to reflect a // column-stored vector. Note that the column stride is set to dim(a), // though it should never be used because there is no second column to // index into (and therefore it also does not need to be aligned). bli_obj_set_dims( dim_a, 1, c ); bli_obj_set_strides( 1, dim_a, c ); // Reset the view offsets to (0,0). bli_obj_set_offs( 0, 0, c ); // Check the mem_t entry of p associated with the cast buffer. If it is // NULL, then acquire memory sufficient to hold the object data and cache // it to p. (Otherwise, if it is non-NULL, then memory has already been // acquired from the memory manager and cached.) We then set the main // buffer of c to the cached address of the cast memory. bli_obj_set_buffer_with_cached_cast_mem( *p, *c ); } */ cython-blis-0.9.1/blis/_src/frame/1/other/unpackv/bli_unpackv_int.h000066400000000000000000000036371427272030600252110ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_unpackv_int( obj_t* p, obj_t* a, cntx_t* cntx, unpackv_t* cntl ); /* void bli_unpackv_init_cast( obj_t* p, obj_t* a, obj_t* c ); */ cython-blis-0.9.1/blis/_src/frame/1/other/unpackv/bli_unpackv_unb_var1.c000066400000000000000000000061451427272030600261240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T unpackv_fp typedef void (*FUNCPTR_T)( dim_t m, void* p, inc_t incp, void* c, inc_t incc, cntx_t* cntx ); static FUNCPTR_T GENARRAY(ftypes,unpackv_unb_var1); void bli_unpackv_unb_var1( obj_t* p, obj_t* c, cntx_t* cntx, unpackv_t* cntl ) { num_t dt_pc = bli_obj_dt( p ); dim_t dim_c = bli_obj_vector_dim( c ); void* buf_p = bli_obj_buffer_at_off( p ); inc_t incp = bli_obj_vector_inc( p ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t incc = bli_obj_vector_inc( c ); FUNCPTR_T f; // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_pc]; // Invoke the function. f ( dim_c, buf_p, incp, buf_c, incc, cntx ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t m, \ void* p, inc_t incp, \ void* c, inc_t incc, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ BLIS_NO_CONJUGATE, \ m, \ p, incp, \ c, incc, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC0( unpackv_unb_var1 ) cython-blis-0.9.1/blis/_src/frame/1/other/unpackv/bli_unpackv_unb_var1.h000066400000000000000000000040661427272030600261310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ void bli_unpackv_unb_var1( obj_t* p, obj_t* c, cntx_t* cntx, unpackv_t* cntl ); #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t m, \ void* p, inc_t incp, \ void* c, inc_t incc, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC( unpackv_unb_var1 ) cython-blis-0.9.1/blis/_src/frame/1d/000077500000000000000000000000001427272030600172345ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/1d/bli_l1d.h000066400000000000000000000043301427272030600207130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_l1d_check.h" // Prototype object APIs (expert and non-expert). #include "bli_oapi_ex.h" #include "bli_l1d_oapi.h" #include "bli_xapi_undef.h" #include "bli_oapi_ba.h" #include "bli_l1d_oapi.h" #include "bli_xapi_undef.h" // Prototype typed APIs (expert and non-expert). #include "bli_tapi_ex.h" #include "bli_l1d_tapi.h" #include "bli_l1d_ft.h" #include "bli_xapi_undef.h" #include "bli_tapi_ba.h" #include "bli_l1d_tapi.h" #include "bli_l1d_ft.h" #include "bli_xapi_undef.h" // Generate function pointer arrays for tapi functions (expert only). #include "bli_l1d_fpa.h" cython-blis-0.9.1/blis/_src/frame/1d/bli_l1d_check.c000066400000000000000000000134251427272030600220500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define object-based check functions. 
// #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ) \ { \ bli_l1d_xy_check( x, y ); \ } GENFRONT( addd ) GENFRONT( copyd ) GENFRONT( subd ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ) \ { \ bli_l1d_axy_check( alpha, x, y ); \ } GENFRONT( axpyd ) GENFRONT( scal2d ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ) \ { \ bli_l1d_x_check( x ); \ } GENFRONT( invertd ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ) \ { \ bli_l1d_ax_check( alpha, x ); \ } GENFRONT( scald ) GENFRONT( setd ) GENFRONT( setid ) GENFRONT( shiftd ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ) \ { \ bli_l1d_axy_check( beta, x, y ); \ } GENFRONT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( x, y ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_matrix_object( x ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( y ); bli_check_error_code( e_val ); e_val = bli_check_conformal_dims( x, y ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); } void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ) { err_t e_val; // Check object datatypes. 
e_val = bli_check_noninteger_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( x, y ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( x ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( y ); bli_check_error_code( e_val ); e_val = bli_check_conformal_dims( x, y ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( alpha ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); } void bli_l1d_x_check ( obj_t* x ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_matrix_object( x ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); } void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( x ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). 
e_val = bli_check_object_buffer( alpha ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/1d/bli_l1d_check.h000066400000000000000000000057261427272030600220620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); cython-blis-0.9.1/blis/_src/frame/1d/bli_l1d_fpa.c000066400000000000000000000043241427272030600215370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define function pointer query interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \ PASTECH(opname,BLIS_TAPI_EX_SUF) ); \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \ { \ return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \ } GENFRONT( addd ) GENFRONT( copyd ) GENFRONT( subd ) GENFRONT( axpyd ) GENFRONT( scal2d ) GENFRONT( invertd ) GENFRONT( scald ) GENFRONT( setd ) GENFRONT( setid ) GENFRONT( shiftd ) GENFRONT( xpbyd ) cython-blis-0.9.1/blis/_src/frame/1d/bli_l1d_fpa.h000066400000000000000000000040111427272030600215350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addd ) GENPROT( copyd ) GENPROT( subd ) GENPROT( axpyd ) GENPROT( scal2d ) GENPROT( invertd ) GENPROT( scald ) GENPROT( setd ) GENPROT( setid ) GENPROT( shiftd ) GENPROT( xpbyd ) cython-blis-0.9.1/blis/_src/frame/1d/bli_l1d_ft.h000066400000000000000000000106051427272030600214060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ // // -- Level-1d function types -------------------------------------------------- // // addd, copyd, subd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addd ) INSERT_GENTDEF( copyd ) INSERT_GENTDEF( subd ) // axpyd, scal2d #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyd ) INSERT_GENTDEF( scal2d ) // invertd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertd ) // scald, setd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scald ) INSERT_GENTDEF( setd ) // setid #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( setid ) // shiftd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( shiftd ) // xpbyd #undef GENTDEF #define GENTDEF( 
ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyd ) cython-blis-0.9.1/blis/_src/frame/1d/bli_l1d_oapi.c000066400000000000000000000263431427272030600217260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ // Guard the function definitions so that they are only compiled when // #included from files that define the object API macros. #ifdef BLIS_ENABLE_OAPI // // Define object-based interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ diag_t diagx = bli_obj_diag( x ); \ trans_t transx = bli_obj_conjtrans_status( x ); \ dim_t m = bli_obj_length( y ); \ dim_t n = bli_obj_width( y ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t rs_y = bli_obj_row_stride( y ); \ inc_t cs_y = bli_obj_col_stride( y ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x, y ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ diagoffx, \ diagx, \ transx, \ m, \ n, \ buf_x, rs_x, cs_x, \ buf_y, rs_y, cs_y, \ cntx, \ rntm \ ); \ } GENFRONT( addd ) GENFRONT( copyd ) GENFRONT( subd ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ diag_t diagx = bli_obj_diag( x ); \ trans_t transx = bli_obj_conjtrans_status( x ); \ dim_t m = bli_obj_length( y ); \ dim_t n = bli_obj_width( y ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t rs_y = bli_obj_row_stride( y ); \ inc_t cs_y = bli_obj_col_stride( y ); \ \ void* buf_alpha; \ \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, x, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ diagoffx, \ diagx, \ transx, \ m, \ n, \ buf_alpha, \ buf_x, rs_x, cs_x, \ buf_y, rs_y, cs_y, \ cntx, \ rntm \ ); \ } GENFRONT( axpyd ) GENFRONT( scal2d ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ dim_t m = bli_obj_length( x ); \ dim_t n = bli_obj_width( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ diagoffx, \ m, \ n, \ buf_x, rs_x, cs_x, \ cntx, \ rntm \ ); \ } GENFRONT( invertd ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ /* conj_t conjalpha = bli_obj_conj_status( alpha ); */ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ dim_t m = bli_obj_length( x ); \ dim_t n = bli_obj_width( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ \ void* buf_alpha; \ \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, x ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). 
*/ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ BLIS_NO_CONJUGATE, /* internal conjugation applied during copy-cast. */ \ diagoffx, \ m, \ n, \ buf_alpha, \ buf_x, rs_x, cs_x, \ cntx, \ rntm \ ); \ } GENFRONT( scald ) GENFRONT( setd ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ dim_t m = bli_obj_length( x ); \ dim_t n = bli_obj_width( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ \ void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, x ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ diagoffx, \ m, \ n, \ buf_alpha, \ buf_x, rs_x, cs_x, \ cntx, \ rntm \ ); \ } GENFRONT( setid ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ dim_t m = bli_obj_length( x ); \ dim_t n = bli_obj_width( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ \ void* buf_alpha; \ \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, x ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ diagoffx, \ m, \ n, \ buf_alpha, \ buf_x, rs_x, cs_x, \ cntx, \ rntm \ ); \ } GENFRONT( shiftd ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ diag_t diagx = bli_obj_diag( x ); \ trans_t transx = bli_obj_conjtrans_status( x ); \ dim_t m = bli_obj_length( y ); \ dim_t n = bli_obj_width( y ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t rs_y = bli_obj_row_stride( y ); \ inc_t cs_y = bli_obj_col_stride( y ); \ \ void* buf_beta; \ \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x, beta, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ beta, &beta_local ); \ buf_beta = bli_obj_buffer_for_1x1( dt, &beta_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ diagoffx, \ diagx, \ transx, \ m, \ n, \ buf_x, rs_x, cs_x, \ buf_beta, \ buf_y, rs_y, cs_y, \ cntx, \ rntm \ ); \ } GENFRONT( xpbyd ) #endif cython-blis-0.9.1/blis/_src/frame/1d/bli_l1d_oapi.h000066400000000000000000000055021427272030600217250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) cython-blis-0.9.1/blis/_src/frame/1d/bli_l1d_oapi_ba.c000066400000000000000000000036701427272030600223660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // omitting expert parameters. #include "bli_oapi_ba.h" // Define the macro protecting the object API definitions. #define BLIS_ENABLE_OAPI // Include the object API definitions here. #include "bli_l1d_oapi.c" cython-blis-0.9.1/blis/_src/frame/1d/bli_l1d_oapi_ex.c000066400000000000000000000036661427272030600224250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // having expert parameters. #include "bli_oapi_ex.h" // Define the macro protecting the object API definitions. #define BLIS_ENABLE_OAPI // Include the object API definitions here. #include "bli_l1d_oapi.c" cython-blis-0.9.1/blis/_src/frame/1d/bli_l1d_tapi.c000066400000000000000000000321051427272030600217240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Guard the function definitions so that they are only compiled when // #included from files that define the typed API macros. #ifdef BLIS_ENABLE_TAPI // // Define BLAS-like interfaces with typed operands. // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kername, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* x1; \ ctype* y1; \ conj_t conjx; \ dim_t n_elem; \ dim_t offx, offy; \ inc_t incx, incy; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ if ( bli_is_outside_diag( diagoffx, transx, m, n ) ) return; \ \ /* Determine the distance to the diagonals, the number of diagonal elements, and the diagonal increments. 
*/ \ bli_set_dims_incs_2d \ ( \ diagoffx, transx, \ m, n, rs_x, cs_x, rs_y, cs_y, \ &offx, &offy, &n_elem, &incx, &incy \ ); \ \ conjx = bli_extract_conj( transx ); \ \ if ( bli_is_nonunit_diag( diagx ) ) \ { \ x1 = x + offx; \ y1 = y + offy; \ } \ else /* if ( bli_is_unit_diag( diagx ) ) */ \ { \ /* Simulate a unit diagonal for x with a zero increment over a unit scalar. */ \ x1 = PASTEMAC(ch,1); \ incx = 0; \ y1 = y + offy; \ } \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ f( \ conjx, \ n_elem, \ x1, incx, \ y1, incy, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC2( addd, addv, BLIS_ADDV_KER ) INSERT_GENTFUNC_BASIC2( copyd, copyv, BLIS_COPYV_KER ) INSERT_GENTFUNC_BASIC2( subd, subv, BLIS_SUBV_KER ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kername, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* x1; \ ctype* y1; \ conj_t conjx; \ dim_t n_elem; \ dim_t offx, offy; \ inc_t incx, incy; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ if ( bli_is_outside_diag( diagoffx, transx, m, n ) ) return; \ \ /* Determine the distance to the diagonals, the number of diagonal elements, and the diagonal increments. 
*/ \ bli_set_dims_incs_2d \ ( \ diagoffx, transx, \ m, n, rs_x, cs_x, rs_y, cs_y, \ &offx, &offy, &n_elem, &incx, &incy \ ); \ \ conjx = bli_extract_conj( transx ); \ \ if ( bli_is_nonunit_diag( diagx ) ) \ { \ x1 = x + offx; \ y1 = y + offy; \ } \ else /* if ( bli_is_unit_diag( diagx ) ) */ \ { \ /* Simulate a unit diagonal for x with a zero increment over a unit scalar. */ \ x1 = PASTEMAC(ch,1); \ incx = 0; \ y1 = y + offy; \ } \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ f( \ conjx, \ n_elem, \ alpha, \ x1, incx, \ y1, incy, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC2( axpyd, axpyv, BLIS_AXPYV_KER ) INSERT_GENTFUNC_BASIC2( scal2d, scal2v, BLIS_SCAL2V_KER ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kername, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* x1; \ dim_t n_elem; \ dim_t offx; \ inc_t incx; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ if ( bli_is_outside_diag( diagoffx, BLIS_NO_TRANSPOSE, m, n ) ) return; \ \ /* Determine the distance to the diagonals, the number of diagonal elements, and the diagonal increments. */ \ bli_set_dims_incs_1d \ ( \ diagoffx, \ m, n, rs_x, cs_x, \ &offx, &n_elem, &incx \ ); \ \ x1 = x + offx; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. 
*/ \ f( \ n_elem, \ x1, incx, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC2( invertd, invertv, BLIS_INVERTV_KER ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kername, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* x1; \ dim_t n_elem; \ dim_t offx; \ inc_t incx; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ if ( bli_is_outside_diag( diagoffx, BLIS_NO_TRANSPOSE, m, n ) ) return; \ \ /* Determine the distance to the diagonals, the number of diagonal elements, and the diagonal increments. */ \ bli_set_dims_incs_1d \ ( \ diagoffx, \ m, n, rs_x, cs_x, \ &offx, &n_elem, &incx \ ); \ \ x1 = x + offx; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ f( \ conjalpha, \ n_elem, \ alpha, \ x1, incx, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC2( scald, scalv, BLIS_SCALV_KER ) INSERT_GENTFUNC_BASIC2( setd, setv, BLIS_SETV_KER ) #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname, kername, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ const num_t dt_r = PASTEMAC(chr,type); \ \ ctype_r* x1; \ dim_t n_elem; \ dim_t offx; \ inc_t incx; \ \ /* If the datatype is real, the entire operation is a no-op. 
*/ \ if ( bli_is_real( dt ) ) return; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ if ( bli_is_outside_diag( diagoffx, BLIS_NO_TRANSPOSE, m, n ) ) return; \ \ /* Determine the distance to the diagonals, the number of diagonal elements, and the diagonal increments. */ \ bli_set_dims_incs_1d \ ( \ diagoffx, \ m, n, rs_x, cs_x, \ &offx, &n_elem, &incx \ ); \ \ /* Alternate implementation. (Substitute for remainder of function). */ \ /* for ( i = 0; i < n_elem; ++i ) \ { \ ctype* chi11 = x1 + (i )*incx; \ \ PASTEMAC(ch,setis)( *alpha, *chi11 ); \ } */ \ \ /* Acquire the addres of the imaginary component of the first element, and scale the increment for use in the real domain. Note that the indexing into the imaginary field only needs to work for complex datatypes since we return early for real domain types. */ \ x1 = ( ctype_r* )( x + offx ) + 1; \ incx = 2*incx; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(chr,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt_r, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ f( \ BLIS_NO_CONJUGATE, \ n_elem, \ alpha, \ x1, incx, \ cntx \ ); \ } INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kername, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* x1; \ dim_t n_elem; \ dim_t offx; \ inc_t incx; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ if ( bli_is_outside_diag( diagoffx, BLIS_NO_TRANSPOSE, m, n ) ) return; \ \ /* Determine the distance to the diagonals, the number of diagonal elements, and the diagonal increments. 
*/ \ bli_set_dims_incs_1d \ ( \ diagoffx, \ m, n, rs_x, cs_x, \ &offx, &n_elem, &incx \ ); \ \ x1 = x + offx; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ f( \ BLIS_NO_CONJUGATE, \ n_elem, \ alpha, 0, \ x1, incx, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC2( shiftd, addv, BLIS_ADDV_KER ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, kername, kerid ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* x1; \ ctype* y1; \ conj_t conjx; \ dim_t n_elem; \ dim_t offx, offy; \ inc_t incx, incy; \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ if ( bli_is_outside_diag( diagoffx, transx, m, n ) ) return; \ \ /* Determine the distance to the diagonals, the number of diagonal elements, and the diagonal increments. */ \ bli_set_dims_incs_2d \ ( \ diagoffx, transx, \ m, n, rs_x, cs_x, rs_y, cs_y, \ &offx, &offy, &n_elem, &incx, &incy \ ); \ \ conjx = bli_extract_conj( transx ); \ \ if ( bli_is_nonunit_diag( diagx ) ) \ { \ x1 = x + offx; \ y1 = y + offy; \ } \ else /* if ( bli_is_unit_diag( diagx ) ) */ \ { \ /* Simulate a unit diagonal for x with a zero increment over a unit scalar. */ \ x1 = PASTEMAC(ch,1); \ incx = 0; \ y1 = y + offy; \ } \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. 
*/ \ PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ f( \ conjx, \ n_elem, \ x1, incx, \ beta, \ y1, incy, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC2( xpbyd, xpbyv, BLIS_XPBYV_KER ) #endif cython-blis-0.9.1/blis/_src/frame/1d/bli_l1d_tapi.h000066400000000000000000000105301427272030600217270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addd ) INSERT_GENTPROT_BASIC0( copyd ) INSERT_GENTPROT_BASIC0( subd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyd ) INSERT_GENTPROT_BASIC0( scal2d ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( invertd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scald ) INSERT_GENTPROT_BASIC0( setd ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( setid ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( shiftd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( xpbyd ) cython-blis-0.9.1/blis/_src/frame/1d/bli_l1d_tapi_ba.c000066400000000000000000000036661427272030600224000ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // omitting expert parameters. #include "bli_tapi_ba.h" // Define the macro protecting the typed API definitions. #define BLIS_ENABLE_TAPI // Include the typed API definitions here. #include "bli_l1d_tapi.c" cython-blis-0.9.1/blis/_src/frame/1d/bli_l1d_tapi_ex.c000066400000000000000000000036641427272030600224300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // having expert parameters. #include "bli_tapi_ex.h" // Define the macro protecting the typed API definitions. #define BLIS_ENABLE_TAPI // Include the typed API definitions here. #include "bli_l1d_tapi.c" cython-blis-0.9.1/blis/_src/frame/1f/000077500000000000000000000000001427272030600172365ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/1f/bli_l1f.h000066400000000000000000000044261427272030600207250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_l1f_check.h" // Define kernel function types. #include "bli_l1f_ft_ker.h" // Prototype object APIs (expert and non-expert). #include "bli_oapi_ex.h" #include "bli_l1f_oapi.h" #include "bli_xapi_undef.h" #include "bli_oapi_ba.h" #include "bli_l1f_oapi.h" #include "bli_xapi_undef.h" // Prototype typed APIs (expert and non-expert). #include "bli_tapi_ex.h" #include "bli_l1f_tapi.h" #include "bli_l1f_ft.h" #include "bli_xapi_undef.h" #include "bli_tapi_ba.h" #include "bli_l1f_tapi.h" #include "bli_l1f_ft.h" #include "bli_xapi_undef.h" // Generate function pointer arrays for tapi functions (expert only). #include "bli_l1f_fpa.h" cython-blis-0.9.1/blis/_src/frame/1f/bli_l1f_check.c000066400000000000000000000303741427272030600220560ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define object-based check functions. // void bli_axpy2v_check ( obj_t* alphax, obj_t* alphay, obj_t* x, obj_t* y, obj_t* z ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( alphax ); bli_check_error_code( e_val ); e_val = bli_check_noninteger_object( alphay ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( z ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( x, y ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( x, z ); bli_check_error_code( e_val ); // Check object dimensions. 
e_val = bli_check_scalar_object( alphax ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( alphay ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( z ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( x, y ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( x, z ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( alphax ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( alphay ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( z ); bli_check_error_code( e_val ); } void bli_axpyf_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* y ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( a ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( a, x ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( a, y ); bli_check_error_code( e_val ); // Check object dimensions. 
e_val = bli_check_scalar_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( a ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_vector_dim_equals( x, bli_obj_width_after_trans( a ) ); bli_check_error_code( e_val ); e_val = bli_check_vector_dim_equals( y, bli_obj_length_after_trans( a ) ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( alpha ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( a ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); } void bli_dotaxpyv_check ( obj_t* alpha, obj_t* xt, obj_t* x, obj_t* y, obj_t* rho, obj_t* z ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( xt ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); e_val = bli_check_noninteger_object( rho ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( rho ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( z ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( x, xt ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( x, y ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( x, z ); bli_check_error_code( e_val ); // Check object dimensions. 
e_val = bli_check_scalar_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( xt ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( rho ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( z ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( x, xt ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( x, y ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( x, z ); bli_check_error_code( e_val ); // Check object aliases. e_val = bli_check_object_alias_of( xt, x ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( alpha ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( xt ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( rho ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( z ); bli_check_error_code( e_val ); } void bli_dotxaxpyf_check ( obj_t* alpha, obj_t* at, obj_t* a, obj_t* w, obj_t* x, obj_t* beta, obj_t* y, obj_t* z ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( at ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( a ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( w ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_noninteger_object( beta ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( z ); bli_check_error_code( e_val ); // Check for consistent datatypes. 
e_val = bli_check_consistent_object_datatypes( a, at ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( a, w ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( a, x ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( a, y ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( a, z ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( at ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( a ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( w ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( beta ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( z ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( w, z ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( x, y ); bli_check_error_code( e_val ); e_val = bli_check_conformal_dims( at, a ); bli_check_error_code( e_val ); e_val = bli_check_object_length_equals( at, bli_obj_vector_dim( w ) ); bli_check_error_code( e_val ); e_val = bli_check_object_width_equals( at, bli_obj_vector_dim( y ) ); bli_check_error_code( e_val ); e_val = bli_check_object_length_equals( a, bli_obj_vector_dim( z ) ); bli_check_error_code( e_val ); e_val = bli_check_object_width_equals( a, bli_obj_vector_dim( x ) ); bli_check_error_code( e_val ); // Check object aliases. e_val = bli_check_object_alias_of( at, a ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). 
e_val = bli_check_object_buffer( alpha ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( at ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( a ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( w ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( beta ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( z ); bli_check_error_code( e_val ); } void bli_dotxf_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( a ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_noninteger_object( beta ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( a, x ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( a, y ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( a ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( beta ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_vector_dim_equals( x, bli_obj_length_after_trans( a ) ); bli_check_error_code( e_val ); e_val = bli_check_vector_dim_equals( y, bli_obj_width_after_trans( a ) ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). 
e_val = bli_check_object_buffer( alpha ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( a ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( beta ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/1f/bli_l1f_check.h000066400000000000000000000055331427272030600220620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based check functions. // #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( dotxf ) cython-blis-0.9.1/blis/_src/frame/1f/bli_l1f_fpa.c000066400000000000000000000041571427272030600215470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define function pointer query interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \ PASTECH(opname,BLIS_TAPI_EX_SUF) ); \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \ { \ return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \ } GENFRONT( axpy2v ) GENFRONT( axpyf ) GENFRONT( dotaxpyv ) GENFRONT( dotxaxpyf ) GENFRONT( dotxf ) cython-blis-0.9.1/blis/_src/frame/1f/bli_l1f_fpa.h000066400000000000000000000036521427272030600215530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( axpy2v ) GENPROT( axpyf ) GENPROT( dotaxpyv ) GENPROT( dotxaxpyf ) GENPROT( dotxf ) cython-blis-0.9.1/blis/_src/frame/1f/bli_l1f_ft.h000066400000000000000000000076771427272030600214310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ // // -- Level-1f function types -------------------------------------------------- // // axpy2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha1, \ ctype* alpha2, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpy2v ) // axpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyf ) // dotaxpyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotaxpyv ) // dotxf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxf ) // dotxaxpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxaxpyf ) cython-blis-0.9.1/blis/_src/frame/1f/bli_l1f_ft_ker.h000066400000000000000000000105601427272030600222530ustar00rootroot00000000000000/* BLIS 
An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_L1F_FT_KER_H #define BLIS_L1F_FT_KER_H // // -- Level-1f kernel function types ------------------------------------------- // // axpy2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha1, \ ctype* restrict alpha2, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpy2v ) // axpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpyf ) // dotaxpyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotaxpyv ) // dotxf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotxf ) // dotxaxpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* 
restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotxaxpyf ) #endif cython-blis-0.9.1/blis/_src/frame/1f/bli_l1f_ker.h000066400000000000000000000046271427272030600215710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Define template prototypes for level-1f kernels. 
// // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l1f_ker_prot.h. #undef GENTPROT #define GENTPROT AXPY2V_KER_PROT INSERT_GENTPROT_BASIC0( axpy2v_ker_name ) #undef GENTPROT #define GENTPROT AXPYF_KER_PROT INSERT_GENTPROT_BASIC0( axpyf_ker_name ) #undef GENTPROT #define GENTPROT DOTAXPYV_KER_PROT INSERT_GENTPROT_BASIC0( dotaxpyv_ker_name ) #undef GENTPROT #define GENTPROT DOTXAXPYF_KER_PROT INSERT_GENTPROT_BASIC0( dotxaxpyf_ker_name ) #undef GENTPROT #define GENTPROT DOTXF_KER_PROT INSERT_GENTPROT_BASIC0( dotxf_ker_name ) cython-blis-0.9.1/blis/_src/frame/1f/bli_l1f_ker_prot.h000066400000000000000000000077411427272030600226350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
//
// Define template prototypes for level-1f kernels.
//
// Each *_KER_PROT( ctype, ch, opname ) macro expands to one function
// prototype returning void whose name is produced by PASTEMAC(ch,opname)
// (PASTEMAC is defined elsewhere). All pointer parameters, including the
// context, are restrict-qualified.
//

// axpy2v: scalars alphax/alphay and operands x, y, z, each with an increment.
#define AXPY2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alphax, \
       ctype*  restrict alphay, \
       ctype*  restrict x, \
       inc_t            incx, \
       ctype*  restrict y, \
       inc_t            incy, \
       ctype*  restrict z, \
       inc_t            incz, \
       cntx_t* restrict cntx \
     );

// axpyf: scalar alpha, matrix operand (a, inca, lda), operands x and y.
#define AXPYF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conja, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, \
       inc_t            inca, \
       inc_t            lda, \
       ctype*  restrict x, \
       inc_t            incx, \
       ctype*  restrict y, \
       inc_t            incy, \
       cntx_t* restrict cntx \
     );

// dotaxpyv: scalar alpha, operands x and y, the rho pointer, operand z.
#define DOTAXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjxt, \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, \
       inc_t            incx, \
       ctype*  restrict y, \
       inc_t            incy, \
       ctype*  restrict rho, \
       ctype*  restrict z, \
       inc_t            incz, \
       cntx_t* restrict cntx \
     );

// dotxaxpyf: scalars alpha/beta, matrix operand (a, inca, lda), operands
// w, x, y and z.
#define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjat, \
       conj_t           conja, \
       conj_t           conjw, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, \
       inc_t            inca, \
       inc_t            lda, \
       ctype*  restrict w, \
       inc_t            incw, \
       ctype*  restrict x, \
       inc_t            incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, \
       inc_t            incy, \
       ctype*  restrict z, \
       inc_t            incz, \
       cntx_t* restrict cntx \
     );

// dotxf: scalars alpha/beta, matrix operand (a, inca, lda), operands x and y.
#define DOTXF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjat, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, \
       inc_t            inca, \
       inc_t            lda, \
       ctype*  restrict x, \
       inc_t            incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, \
       inc_t            incy, \
       cntx_t* restrict cntx \
     );
// The definitions below are templates. They are compiled only when the
// including file has defined BLIS_ENABLE_OAPI, and they rely on that file
// having also supplied EX_SUF, BLIS_OAPI_EX_PARAMS, BLIS_OAPI_EX_DECLS and
// BLIS_TAPI_EX_SUF.
#ifdef BLIS_ENABLE_OAPI

//
// Define object-based interfaces.
//
// Every front-end follows the same sequence: initialize the library, pull
// conjugation flags, dimensions, buffers and strides out of the operand
// objects, optionally run the operation's _check function, make local
// copy-casts of the scalar objects, look up a void*-based typed function for
// the datatype of x, and call it.
//

// axpy2v front-end.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alphax, \
       obj_t*  alphay, \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  z  \
       BLIS_OAPI_EX_PARAMS  \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x selects the typed function and the scalar casts. */ \
    num_t   dt     = bli_obj_dt( x ); \
\
    /* Conjugation flags and vector length. */ \
    conj_t  x_conj = bli_obj_conj_status( x ); \
    conj_t  y_conj = bli_obj_conj_status( y ); \
    dim_t   n      = bli_obj_vector_dim( x ); \
\
    /* Buffer address and increment of each vector operand. */ \
    void*   x_ptr  = bli_obj_buffer_at_off( x ); \
    inc_t   x_inc  = bli_obj_vector_inc( x ); \
    void*   y_ptr  = bli_obj_buffer_at_off( y ); \
    inc_t   y_inc  = bli_obj_vector_inc( y ); \
    void*   z_ptr  = bli_obj_buffer_at_off( z ); \
    inc_t   z_inc  = bli_obj_vector_inc( z ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alphax, alphay, x, y, z ); \
    } \
\
    /* Create local copy-casts of the scalars (and apply internal */ \
    /* conjugation as needed), then fetch their 1x1 buffers. */ \
    obj_t   alphax_cast; \
    obj_t   alphay_cast; \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
                                          alphax, &alphax_cast ); \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
                                          alphay, &alphay_cast ); \
    void*   alphax_ptr = bli_obj_buffer_for_1x1( dt, &alphax_cast ); \
    void*   alphay_ptr = bli_obj_buffer_for_1x1( dt, &alphay_cast ); \
\
    /* Query a type-specific function pointer, except one that uses */ \
    /* void* for function arguments instead of typed pointers. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fn = \
    PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    typed_fn \
    ( \
      x_conj, \
      y_conj, \
      n, \
      alphax_ptr, \
      alphay_ptr, \
      x_ptr, x_inc, \
      y_ptr, y_inc, \
      z_ptr, z_inc, \
      cntx, \
      rntm  \
    ); \
}

GENFRONT( axpy2v )


// axpyf front-end.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  x, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x selects the typed function and the scalar cast. */ \
    num_t   dt     = bli_obj_dt( x ); \
\
    /* Conjugation flags; m is taken from y and b_n from x. */ \
    conj_t  a_conj = bli_obj_conj_status( a ); \
    conj_t  x_conj = bli_obj_conj_status( x ); \
    dim_t   m      = bli_obj_vector_dim( y ); \
    dim_t   b_n    = bli_obj_vector_dim( x ); \
\
    /* Matrix buffer with its row and column strides. */ \
    void*   a_ptr  = bli_obj_buffer_at_off( a ); \
    inc_t   a_rs   = bli_obj_row_stride( a ); \
    inc_t   a_cs   = bli_obj_col_stride( a ); \
\
    /* Vector buffers with their increments. */ \
    void*   x_ptr  = bli_obj_buffer_at_off( x ); \
    inc_t   x_inc  = bli_obj_vector_inc( x ); \
    void*   y_ptr  = bli_obj_buffer_at_off( y ); \
    inc_t   y_inc  = bli_obj_vector_inc( y ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, a, x, y ); \
    } \
\
    /* Create a local copy-cast of the scalar (and apply internal */ \
    /* conjugation as needed), then fetch its 1x1 buffer. */ \
    obj_t   alpha_cast; \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
                                          alpha, &alpha_cast ); \
    void*   alpha_ptr = bli_obj_buffer_for_1x1( dt, &alpha_cast ); \
\
    /* Support cases where matrix A requires a transposition. */ \
    if ( bli_obj_has_trans( a ) ) \
    { \
        bli_swap_incs( &a_rs, &a_cs ); \
    } \
\
    /* Query a type-specific function pointer, except one that uses */ \
    /* void* for function arguments instead of typed pointers. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fn = \
    PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    typed_fn \
    ( \
      a_conj, \
      x_conj, \
      m, \
      b_n, \
      alpha_ptr, \
      a_ptr, a_rs, a_cs, \
      x_ptr, x_inc, \
      y_ptr, y_inc, \
      cntx, \
      rntm  \
    ); \
}

GENFRONT( axpyf )


// dotaxpyv front-end. Only the conjugation status of xt is read; the buffer
// and increment passed on are those of x.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  xt, \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  rho, \
       obj_t*  z  \
       BLIS_OAPI_EX_PARAMS  \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x selects the typed function and the scalar cast. */ \
    num_t   dt      = bli_obj_dt( x ); \
\
    /* Conjugation flags and vector length. */ \
    conj_t  xt_conj = bli_obj_conj_status( xt ); \
    conj_t  x_conj  = bli_obj_conj_status( x ); \
    conj_t  y_conj  = bli_obj_conj_status( y ); \
    dim_t   n       = bli_obj_vector_dim( x ); \
\
    /* Vector buffers with their increments, plus the rho buffer. */ \
    void*   x_ptr   = bli_obj_buffer_at_off( x ); \
    inc_t   x_inc   = bli_obj_vector_inc( x ); \
    void*   y_ptr   = bli_obj_buffer_at_off( y ); \
    inc_t   y_inc   = bli_obj_vector_inc( y ); \
    void*   z_ptr   = bli_obj_buffer_at_off( z ); \
    inc_t   z_inc   = bli_obj_vector_inc( z ); \
    void*   rho_ptr = bli_obj_buffer_at_off( rho ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, xt, x, y, rho, z ); \
    } \
\
    /* Create a local copy-cast of the scalar (and apply internal */ \
    /* conjugation as needed), then fetch its 1x1 buffer. */ \
    obj_t   alpha_cast; \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
                                          alpha, &alpha_cast ); \
    void*   alpha_ptr = bli_obj_buffer_for_1x1( dt, &alpha_cast ); \
\
    /* Query a type-specific function pointer, except one that uses */ \
    /* void* for function arguments instead of typed pointers. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fn = \
    PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    typed_fn \
    ( \
      xt_conj, \
      x_conj, \
      y_conj, \
      n, \
      alpha_ptr, \
      x_ptr, x_inc, \
      y_ptr, y_inc, \
      rho_ptr, \
      z_ptr, z_inc, \
      cntx, \
      rntm  \
    ); \
}

GENFRONT( dotaxpyv )


// dotxaxpyf front-end. Only the conjugation status of at is read; the
// buffer and strides passed on are those of a.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  at, \
       obj_t*  a, \
       obj_t*  w, \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y, \
       obj_t*  z  \
       BLIS_OAPI_EX_PARAMS  \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x selects the typed function and the scalar casts. */ \
    num_t   dt      = bli_obj_dt( x ); \
\
    /* Conjugation flags; m is taken from z and b_n from y. */ \
    conj_t  at_conj = bli_obj_conj_status( at ); \
    conj_t  a_conj  = bli_obj_conj_status( a ); \
    conj_t  w_conj  = bli_obj_conj_status( w ); \
    conj_t  x_conj  = bli_obj_conj_status( x ); \
    dim_t   m       = bli_obj_vector_dim( z ); \
    dim_t   b_n     = bli_obj_vector_dim( y ); \
\
    /* Matrix buffer with its row and column strides. */ \
    void*   a_ptr   = bli_obj_buffer_at_off( a ); \
    inc_t   a_rs    = bli_obj_row_stride( a ); \
    inc_t   a_cs    = bli_obj_col_stride( a ); \
\
    /* Vector buffers with their increments. */ \
    void*   w_ptr   = bli_obj_buffer_at_off( w ); \
    inc_t   w_inc   = bli_obj_vector_inc( w ); \
    void*   x_ptr   = bli_obj_buffer_at_off( x ); \
    inc_t   x_inc   = bli_obj_vector_inc( x ); \
    void*   y_ptr   = bli_obj_buffer_at_off( y ); \
    inc_t   y_inc   = bli_obj_vector_inc( y ); \
    void*   z_ptr   = bli_obj_buffer_at_off( z ); \
    inc_t   z_inc   = bli_obj_vector_inc( z ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, at, a, w, x, beta, y, z ); \
    } \
\
    /* Create local copy-casts of the scalars (and apply internal */ \
    /* conjugation as needed), then fetch their 1x1 buffers. */ \
    obj_t   alpha_cast; \
    obj_t   beta_cast; \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
                                          alpha, &alpha_cast ); \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
                                          beta, &beta_cast ); \
    void*   alpha_ptr = bli_obj_buffer_for_1x1( dt, &alpha_cast ); \
    void*   beta_ptr  = bli_obj_buffer_for_1x1( dt, &beta_cast ); \
\
    /* Support cases where matrix A requires a transposition. */ \
    if ( bli_obj_has_trans( a ) ) \
    { \
        bli_swap_incs( &a_rs, &a_cs ); \
    } \
\
    /* Query a type-specific function pointer, except one that uses */ \
    /* void* for function arguments instead of typed pointers. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fn = \
    PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    typed_fn \
    ( \
      at_conj, \
      a_conj, \
      w_conj, \
      x_conj, \
      m, \
      b_n, \
      alpha_ptr, \
      a_ptr, a_rs, a_cs, \
      w_ptr, w_inc, \
      x_ptr, x_inc, \
      beta_ptr, \
      y_ptr, y_inc, \
      z_ptr, z_inc, \
      cntx, \
      rntm  \
    ); \
}

GENFRONT( dotxaxpyf )


// dotxf front-end. The flag forwarded as the first conjugation argument is
// the conjugation status of a.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_OAPI_EX_DECLS \
\
    /* The datatype of x selects the typed function and the scalar casts. */ \
    num_t   dt      = bli_obj_dt( x ); \
\
    /* Conjugation flags; m is taken from x and b_n from y. */ \
    conj_t  at_conj = bli_obj_conj_status( a ); \
    conj_t  x_conj  = bli_obj_conj_status( x ); \
    dim_t   m       = bli_obj_vector_dim( x ); \
    dim_t   b_n     = bli_obj_vector_dim( y ); \
\
    /* Matrix buffer with its row and column strides. */ \
    void*   a_ptr   = bli_obj_buffer_at_off( a ); \
    inc_t   a_rs    = bli_obj_row_stride( a ); \
    inc_t   a_cs    = bli_obj_col_stride( a ); \
\
    /* Vector buffers with their increments. */ \
    void*   x_ptr   = bli_obj_buffer_at_off( x ); \
    inc_t   x_inc   = bli_obj_vector_inc( x ); \
    void*   y_ptr   = bli_obj_buffer_at_off( y ); \
    inc_t   y_inc   = bli_obj_vector_inc( y ); \
\
    if ( bli_error_checking_is_enabled() ) \
    { \
        PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \
    } \
\
    /* Create local copy-casts of the scalars (and apply internal */ \
    /* conjugation as needed), then fetch their 1x1 buffers. */ \
    obj_t   alpha_cast; \
    obj_t   beta_cast; \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
                                          alpha, &alpha_cast ); \
    bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \
                                          beta, &beta_cast ); \
    void*   alpha_ptr = bli_obj_buffer_for_1x1( dt, &alpha_cast ); \
    void*   beta_ptr  = bli_obj_buffer_for_1x1( dt, &beta_cast ); \
\
    /* Support cases where matrix A requires a transposition. */ \
    if ( bli_obj_has_trans( a ) ) \
    { \
        bli_swap_incs( &a_rs, &a_cs ); \
    } \
\
    /* Query a type-specific function pointer, except one that uses */ \
    /* void* for function arguments instead of typed pointers. */ \
    PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) typed_fn = \
    PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \
\
    typed_fn \
    ( \
      at_conj, \
      x_conj, \
      m, \
      b_n, \
      alpha_ptr, \
      a_ptr, a_rs, a_cs, \
      x_ptr, x_inc, \
      beta_ptr, \
      y_ptr, y_inc, \
      cntx, \
      rntm  \
    ); \
}

GENFRONT( dotxf )


#endif
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) cython-blis-0.9.1/blis/_src/frame/1f/bli_l1f_oapi_ba.c000066400000000000000000000036701427272030600223720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // omitting expert parameters. #include "bli_oapi_ba.h" // Define the macro protecting the object API definitions. #define BLIS_ENABLE_OAPI // Include the object API definitions here. 
#include "bli_l1f_oapi.c" cython-blis-0.9.1/blis/_src/frame/1f/bli_l1f_oapi_ex.c000066400000000000000000000036661427272030600224310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // having expert parameters. #include "bli_oapi_ex.h" // Define the macro protecting the object API definitions. 
#define BLIS_ENABLE_OAPI // Include the object API definitions here. #include "bli_l1f_oapi.c" cython-blis-0.9.1/blis/_src/frame/1f/bli_l1f_tapi.c000066400000000000000000000143621427272030600217350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Guard the function definitions so that they are only compiled when // #included from files that define the typed API macros. 
#ifdef BLIS_ENABLE_TAPI

//
// Define BLAS-like interfaces with typed operands.
//
// Every wrapper below has the same shape: initialize the library, emit
// BLIS_TAPI_EX_DECLS, fall back to bli_gks_query_cntx() when cntx is NULL,
// fetch the level-1f kernel registered in the context under kerid for this
// datatype, and forward all operands plus the context to that kernel.
// EX_SUF, BLIS_TAPI_EX_PARAMS and BLIS_TAPI_EX_DECLS are supplied by the
// including file.
//

// axpy2v wrapper.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   n, \
       ctype*  alphax, \
       ctype*  alphay, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  z, inc_t incz  \
       BLIS_TAPI_EX_PARAMS  \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Obtain a valid context from the gks if necessary. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Look up the kernel registered for this datatype and kernel id. */ \
    PASTECH2(ch,opname,_ker_ft) ker_fp = \
        bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
\
    ker_fp \
    ( \
      conjx, \
      conjy, \
      n, \
      alphax, \
      alphay, \
      x, incx, \
      y, incy, \
      z, incz, \
      cntx  \
    ); \
}

INSERT_GENTFUNC_BASIC( axpy2v, BLIS_AXPY2V_KER )


// axpyf wrapper.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conja, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   b_n, \
       ctype*  alpha, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Obtain a valid context from the gks if necessary. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Look up the kernel registered for this datatype and kernel id. */ \
    PASTECH2(ch,opname,_ker_ft) ker_fp = \
        bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
\
    ker_fp \
    ( \
      conja, \
      conjx, \
      m, \
      b_n, \
      alpha, \
      a, inca, lda, \
      x, incx, \
      y, incy, \
      cntx  \
    ); \
}

INSERT_GENTFUNC_BASIC( axpyf, BLIS_AXPYF_KER )


// dotaxpyv wrapper.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjxt, \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  rho, \
       ctype*  z, inc_t incz  \
       BLIS_TAPI_EX_PARAMS  \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Obtain a valid context from the gks if necessary. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Look up the kernel registered for this datatype and kernel id. */ \
    PASTECH2(ch,opname,_ker_ft) ker_fp = \
        bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
\
    ker_fp \
    ( \
      conjxt, \
      conjx, \
      conjy, \
      n, \
      alpha, \
      x, incx, \
      y, incy, \
      rho, \
      z, incz, \
      cntx  \
    ); \
}

INSERT_GENTFUNC_BASIC( dotaxpyv, BLIS_DOTAXPYV_KER )


// dotxaxpyf wrapper.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjat, \
       conj_t  conja, \
       conj_t  conjw, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   b_n, \
       ctype*  alpha, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  w, inc_t incw, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       ctype*  z, inc_t incz  \
       BLIS_TAPI_EX_PARAMS  \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Obtain a valid context from the gks if necessary. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Look up the kernel registered for this datatype and kernel id. */ \
    PASTECH2(ch,opname,_ker_ft) ker_fp = \
        bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
\
    ker_fp \
    ( \
      conjat, \
      conja, \
      conjw, \
      conjx, \
      m, \
      b_n, \
      alpha, \
      a, inca, lda, \
      w, incw, \
      x, incx, \
      beta, \
      y, incy, \
      z, incz, \
      cntx  \
    ); \
}

INSERT_GENTFUNC_BASIC( dotxaxpyf, BLIS_DOTXAXPYF_KER )


// dotxf wrapper.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, kerid ) \
\
void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjat, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   b_n, \
       ctype*  alpha, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     ) \
{ \
    bli_init_once(); \
\
    BLIS_TAPI_EX_DECLS \
\
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Obtain a valid context from the gks if necessary. */ \
    if ( cntx == NULL ) \
    { \
        cntx = bli_gks_query_cntx(); \
    } \
\
    /* Look up the kernel registered for this datatype and kernel id. */ \
    PASTECH2(ch,opname,_ker_ft) ker_fp = \
        bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \
\
    ker_fp \
    ( \
      conjat, \
      conjx, \
      m, \
      b_n, \
      alpha, \
      a, inca, lda, \
      x, incx, \
      beta, \
      y, incy, \
      cntx  \
    ); \
}

INSERT_GENTFUNC_BASIC( dotxf, BLIS_DOTXF_KER )


#endif
*/ // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alphax, \ ctype* alphay, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpy2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotaxpyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxaxpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxf ) cython-blis-0.9.1/blis/_src/frame/1f/bli_l1f_tapi_ba.c000066400000000000000000000036661427272030600224040ustar00rootroot00000000000000/* BLIS An object-based framework for developing 
high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // omitting expert parameters. #include "bli_tapi_ba.h" // Define the macro protecting the typed API definitions. #define BLIS_ENABLE_TAPI // Include the typed API definitions here. 
#include "bli_l1f_tapi.c" cython-blis-0.9.1/blis/_src/frame/1f/bli_l1f_tapi_ex.c000066400000000000000000000036641427272030600224340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // having expert parameters. #include "bli_tapi_ex.h" // Define the macro protecting the typed API definitions. #define BLIS_ENABLE_TAPI // Include the typed API definitions here. 
#include "bli_l1f_tapi.c" cython-blis-0.9.1/blis/_src/frame/1m/000077500000000000000000000000001427272030600172455ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m.h000066400000000000000000000047511427272030600207440ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_l1m_check.h" // Define kernel function types. #include "bli_l1m_ft_ker.h" // Define object function types for variants. 
#include "bli_l1m_oft_var.h" // Prototype object APIs (expert and non-expert). #include "bli_oapi_ex.h" #include "bli_l1m_oapi.h" #include "bli_xapi_undef.h" #include "bli_oapi_ba.h" #include "bli_l1m_oapi.h" #include "bli_xapi_undef.h" // Prototype typed APIs (expert and non-expert). #include "bli_tapi_ex.h" #include "bli_l1m_tapi.h" #include "bli_l1m_ft.h" #include "bli_xapi_undef.h" #include "bli_tapi_ba.h" #include "bli_l1m_tapi.h" #include "bli_l1m_ft.h" #include "bli_xapi_undef.h" // Generate function pointer arrays for tapi functions (expert only). #include "bli_l1m_fpa.h" // Prototype level-1m implementations. #include "bli_l1m_unb_var1.h" // Pack-related #include "bli_packm.h" #include "bli_unpackm.h" cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_check.c000066400000000000000000000124431427272030600220710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
   IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

//
// Define object-based check functions.
//

// Each GENFRONT template below defines a function whose name is built by
// PASTEMAC from the operation name and the suffix _check. Every generated
// function is a thin wrapper that forwards its operands to one of the shared
// bli_l1m_*_check() routines defined at the bottom of this file.

// Two-operand operations (x, y): forwarded to bli_l1m_xy_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  y  \
     ) \
{ \
	bli_l1m_xy_check( x, y ); \
}

GENFRONT( addm )
GENFRONT( copym )
GENFRONT( subm )


// Scalar-plus-two-operand operations (alpha, x, y): forwarded to
// bli_l1m_axy_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  y  \
     ) \
{ \
	bli_l1m_axy_check( alpha, x, y ); \
}

GENFRONT( axpym )
GENFRONT( scal2m )


// Scalar-plus-one-operand operations (alpha, x): forwarded to
// bli_l1m_ax_check().
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  alpha, \
       obj_t*  x  \
     ) \
{ \
	bli_l1m_ax_check( alpha, x ); \
}

GENFRONT( scalm )
GENFRONT( setm )


// xpbym takes its scalar (beta) between x and y. The wrapper reuses
// bli_l1m_axy_check(), passing beta in the position of that routine's alpha
// parameter.
#undef GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y  \
     ) \
{ \
	bli_l1m_axy_check( beta, x, y ); \
}

GENFRONT( xpbym )


// -----------------------------------------------------------------------------

// Validate a pair of operands x and y.
//
// The checks run in a fixed order: datatypes, datatype consistency,
// dimensions, then buffers. The result of every bli_check_*() call is handed
// to bli_check_error_code(); how a failing code is handled (abort, print,
// return) is decided there and is not visible in this file.
void bli_l1m_xy_check
     (
       obj_t*  x,
       obj_t*  y
     )
{
	err_t e_val;

	// Check object datatypes.

	e_val = bli_check_floating_object( x );
	bli_check_error_code( e_val );

	e_val = bli_check_floating_object( y );
	bli_check_error_code( e_val );

	// Check for consistent datatypes.

	e_val = bli_check_consistent_object_datatypes( x, y );
	bli_check_error_code( e_val );

	// Check object dimensions.

	e_val = bli_check_matrix_object( x );
	bli_check_error_code( e_val );

	e_val = bli_check_matrix_object( y );
	bli_check_error_code( e_val );

	e_val = bli_check_conformal_dims( x, y );
	bli_check_error_code( e_val );

	// Check object buffers (for non-NULLness).

	e_val = bli_check_object_buffer( x );
	bli_check_error_code( e_val );

	e_val = bli_check_object_buffer( y );
	bli_check_error_code( e_val );
}

// Validate a scalar alpha together with a pair of operands x and y.
//
// Performs the same checks on x and y as bli_l1m_xy_check(), and additionally
// checks alpha: non-integer datatype, scalar dimensions, and buffer. Note that
// alpha is NOT included in the datatype-consistency check; only x and y are
// compared with each other.
void bli_l1m_axy_check
     (
       obj_t*  alpha,
       obj_t*  x,
       obj_t*  y
     )
{
	err_t e_val;

	// Check object datatypes.

	e_val = bli_check_noninteger_object( alpha );
	bli_check_error_code( e_val );

	e_val = bli_check_floating_object( x );
	bli_check_error_code( e_val );

	e_val = bli_check_floating_object( y );
	bli_check_error_code( e_val );

	// Check for consistent datatypes.

	e_val = bli_check_consistent_object_datatypes( x, y );
	bli_check_error_code( e_val );

	// Check object dimensions.

	e_val = bli_check_scalar_object( alpha );
	bli_check_error_code( e_val );

	e_val = bli_check_matrix_object( x );
	bli_check_error_code( e_val );

	e_val = bli_check_matrix_object( y );
	bli_check_error_code( e_val );

	e_val = bli_check_conformal_dims( x, y );
	bli_check_error_code( e_val );

	// Check object buffers (for non-NULLness).

	e_val = bli_check_object_buffer( alpha );
	bli_check_error_code( e_val );

	e_val = bli_check_object_buffer( x );
	bli_check_error_code( e_val );

	e_val = bli_check_object_buffer( y );
	bli_check_error_code( e_val );
}

// Validate a scalar alpha together with a single operand x: datatypes,
// dimensions, then buffers, each result passed to bli_check_error_code().
void bli_l1m_ax_check
     (
       obj_t*  alpha,
       obj_t*  x
     )
{
	err_t e_val;

	// Check object datatypes.

	e_val = bli_check_noninteger_object( alpha );
	bli_check_error_code( e_val );

	e_val = bli_check_floating_object( x );
	bli_check_error_code( e_val );

	// Check object dimensions.

	e_val = bli_check_scalar_object( alpha );
	bli_check_error_code( e_val );

	e_val = bli_check_matrix_object( x );
	bli_check_error_code( e_val );

	// Check object properties.
	// NOTE(review): the non-unit-diagonal check below is disabled; the reason
	// is not recorded in this file.

	//e_val = bli_check_nonunit_diag( x );
	//bli_check_error_code( e_val );

	// Check object buffers (for non-NULLness).

	e_val = bli_check_object_buffer( alpha );
	bli_check_error_code( e_val );

	e_val = bli_check_object_buffer( x );
	bli_check_error_code( e_val );
}

cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_check.h000066400000000000000000000053401427272030600220740ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing
   high-performance BLAS-like libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/


//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_fpa.c000066400000000000000000000051251427272030600215610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

//
// Define function pointer query interfaces.
//

// For each operation this template does two things:
// 1. Invokes GENARRAY_FPA with the operation's _vft function-pointer type and
//    its BLIS_TAPI_EX_SUF-suffixed name. (Presumably this defines the
//    <opname><suf>_fpa array indexed below -- TODO confirm against the
//    GENARRAY_FPA definition, which is not in this file.)
// 2. Defines a _qfp query function that returns element [ dt ] of that _fpa
//    array. dt is used as an index without any range check here.
#undef GENFRONT
#define GENFRONT( opname ) \
\
GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \
              PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \
{ \
	return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \
}

GENFRONT( addm )
GENFRONT( copym )
GENFRONT( subm )
GENFRONT( axpym )
GENFRONT( scal2m )
GENFRONT( scalm )
GENFRONT( setm )
GENFRONT( xpbym )

//
// Define function pointer query interfaces for two-datatype operations.
//

// Same pattern as above, but built with GENARRAY_FPA2. The _qfp2 query
// function returns element [ dtx ][ dty ] of the _fpa2 array, again without
// range checks on either index.
#undef GENFRONT
#define GENFRONT( opname ) \
\
GENARRAY_FPA2( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \
               PASTECH(opname,BLIS_TAPI_EX_SUF) ); \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty ) \
{ \
	return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa2)[ dtx ][ dty ]; \
}

GENFRONT( xpbym_md )

cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_fpa.h000066400000000000000000000042021427272030600215610ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing
   high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype function pointer query interface. 
// #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) GENPROT( axpym ) GENPROT( scal2m ) GENPROT( scalm ) GENPROT( setm ) GENPROT( xpbym ) #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty ); GENPROT( xpbym_md ) cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_ft.h000066400000000000000000000077151427272030600214400ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
   IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/


//
// -- Level-1m function types --------------------------------------------------
//

// Each GENTDEF template below is a function-pointer typedef. The type name is
// built by PASTECH3 from the datatype character `ch`, the operation name,
// EX_SUF and the type suffix `tsuf`. BLIS_TAPI_EX_PARAMS follows the last
// operand without a separating comma, so the macro must supply its own comma
// or expand to nothing. INSERT_GENTDEF( op ) instantiates the template for
// `op`; its definition is not in this file.

// addm, subm, copym
// Operands: diagonal offset, diag/uplo/trans properties of x, dimensions m and
// n, and two operands x and y, each with a row stride and a column stride.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym
// Same operands as above, plus a scalar alpha ahead of x.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( axpym )

// scal2m
// The parameter list is identical to the axpym template above.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm
// Single-operand form: a conjugation flag for alpha instead of a trans_t for
// x, the scalar alpha, and one operand x with row and column strides.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjalpha, \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym, xpbym_md
// Two-operand form with the scalar beta placed between x and y. Both xpbym
// and xpbym_md are instantiated from this one single-ctype template.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  beta, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )

cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_ft_ker.h000066400000000000000000000103151427272030600222670ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing
   high-performance BLAS-like libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
   IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#ifndef BLIS_L1M_FT_KER_H
#define BLIS_L1M_FT_KER_H


//
// -- Level-1m kernel function types -------------------------------------------
//

// Each GENTDEF template below is a function-pointer typedef whose name is
// built by PASTECH3 from the datatype character `ch`, the operation name, the
// literal _ker and the type suffix `tsuf`. Unlike the typed-API templates,
// these take no BLIS_TAPI_EX_PARAMS.

// packm

// NOTE: This is the function type for the structure-aware "kernel".
// Besides the structure/diag/uplo/conj properties of c and the pack schema,
// it receives panel dimensions, their maxima and offsets, the scalar kappa,
// the source c (incc, ldc), the destination p (ldp, is_p), a context and an
// opaque params pointer. Only kappa, c and p are restrict-qualified.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       struc_t         strucc, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool            invdiag, \
       dim_t           panel_dim, \
       dim_t           panel_len, \
       dim_t           panel_dim_max, \
       dim_t           panel_len_max, \
       dim_t           panel_dim_off, \
       dim_t           panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p,             inc_t ldp, \
                          inc_t is_p, \
       cntx_t*         cntx, \
       void*           params \
     );

INSERT_GENTDEF( packm )


// NOTE: the following macros generate packm kernel function type definitions
// that are "ctyped" and void-typed, for each of the floating-point datatypes.

// packm_ker
// Source operand a (inca, lda) is packed into p (ldp); cdim, n and n_max are
// the dimension arguments and kappa is the scalar.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     );

INSERT_GENTDEF( packm_cxk )

// unpackm_ker
// Reverse direction: p (ldp) is listed first and a (inca, lda) second, and the
// conjugation flag is named for p (conjp).

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjp, \
       dim_t            n, \
       ctype*  restrict kappa, \
       ctype*  restrict p,             inc_t ldp, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       cntx_t* restrict cntx  \
     );

INSERT_GENTDEF( unpackm_cxk )

// packm_1er_ker
// The parameter list is identical to the packm_ker template above.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     );

INSERT_GENTDEF( packm_cxk_1er )


#endif

cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_ker.h000066400000000000000000000065631427272030600216080ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing
   high-performance BLAS-like libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Define template prototypes for level-1m kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l1m_ker_prot.h. 
// native packm kernels #undef GENTPROT #define GENTPROT PACKM_KER_PROT INSERT_GENTPROT_BASIC0( packm_2xk_ker_name ) INSERT_GENTPROT_BASIC0( packm_3xk_ker_name ) INSERT_GENTPROT_BASIC0( packm_4xk_ker_name ) INSERT_GENTPROT_BASIC0( packm_6xk_ker_name ) INSERT_GENTPROT_BASIC0( packm_8xk_ker_name ) INSERT_GENTPROT_BASIC0( packm_10xk_ker_name ) INSERT_GENTPROT_BASIC0( packm_12xk_ker_name ) INSERT_GENTPROT_BASIC0( packm_14xk_ker_name ) INSERT_GENTPROT_BASIC0( packm_16xk_ker_name ) INSERT_GENTPROT_BASIC0( packm_24xk_ker_name ) // native unpackm kernels #undef GENTPROT #define GENTPROT UNPACKM_KER_PROT INSERT_GENTPROT_BASIC0( unpackm_2xk_ker_name ) INSERT_GENTPROT_BASIC0( unpackm_4xk_ker_name ) INSERT_GENTPROT_BASIC0( unpackm_6xk_ker_name ) INSERT_GENTPROT_BASIC0( unpackm_8xk_ker_name ) INSERT_GENTPROT_BASIC0( unpackm_10xk_ker_name ) INSERT_GENTPROT_BASIC0( unpackm_12xk_ker_name ) INSERT_GENTPROT_BASIC0( unpackm_14xk_ker_name ) INSERT_GENTPROT_BASIC0( unpackm_16xk_ker_name ) // 1e/1r packm kernels #undef GENTPROT #define GENTPROT PACKM_1ER_KER_PROT INSERT_GENTPROT_BASIC0( packm_2xk_1er_ker_name ) INSERT_GENTPROT_BASIC0( packm_4xk_1er_ker_name ) INSERT_GENTPROT_BASIC0( packm_6xk_1er_ker_name ) INSERT_GENTPROT_BASIC0( packm_8xk_1er_ker_name ) INSERT_GENTPROT_BASIC0( packm_10xk_1er_ker_name ) INSERT_GENTPROT_BASIC0( packm_12xk_1er_ker_name ) INSERT_GENTPROT_BASIC0( packm_14xk_1er_ker_name ) INSERT_GENTPROT_BASIC0( packm_16xk_1er_ker_name ) cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_ker_prot.h000066400000000000000000000056611427272030600226520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Define template prototypes for level-1m kernels. 
// // native packm kernels #define PACKM_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); // native unpackm kernels #define UNPACKM_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); // 1e/1r packm kernels #define PACKM_1ER_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_oapi.c000066400000000000000000000274741427272030600217560ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Guard the function definitions so that they are only compiled when // #included from files that define the object API macros. #ifdef BLIS_ENABLE_OAPI // // Define object-based interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ diag_t diagx = bli_obj_diag( x ); \ uplo_t uplox = bli_obj_uplo( x ); \ trans_t transx = bli_obj_conjtrans_status( x ); \ dim_t m = bli_obj_length( y ); \ dim_t n = bli_obj_width( y ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t rs_y = bli_obj_row_stride( y ); \ inc_t cs_y = bli_obj_col_stride( y ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x, y ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ diagoffx, \ diagx, \ uplox, \ transx, \ m, \ n, \ buf_x, rs_x, cs_x, \ buf_y, rs_y, cs_y, \ cntx, \ rntm \ ); \ } GENFRONT( addm ) GENFRONT( copym ) GENFRONT( subm ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ diag_t diagx = bli_obj_diag( x ); \ uplo_t uplox = bli_obj_uplo( x ); \ trans_t transx = bli_obj_conjtrans_status( x ); \ dim_t m = bli_obj_length( y ); \ dim_t n = bli_obj_width( y ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t rs_y = bli_obj_row_stride( y ); \ inc_t cs_y = bli_obj_col_stride( y ); \ \ void* buf_alpha; \ \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, x, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ diagoffx, \ diagx, \ uplox, \ transx, \ m, \ n, \ buf_alpha, \ buf_x, rs_x, cs_x, \ buf_y, rs_y, cs_y, \ cntx, \ rntm \ ); \ } GENFRONT( axpym ) GENFRONT( scal2m ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ /* conj_t conjalpha = bli_obj_conj_status( alpha ); */ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ diag_t diagx = bli_obj_diag( x ); \ uplo_t uplox = bli_obj_uplo( x ); \ dim_t m = bli_obj_length( x ); \ dim_t n = bli_obj_width( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ \ void* buf_alpha; \ \ obj_t alpha_local; \ obj_t x_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, x ); \ \ /* Alias x to x_local so we can apply alpha if it is non-unit. */ \ bli_obj_alias_to( x, &x_local ); \ \ /* If alpha is non-unit, apply it to the scalar attached to x. */ \ if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) \ { \ /* Create a local copy-cast of alpha (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ \ bli_obj_scalar_apply_scalar( &alpha_local, &x_local ); \ } \ \ /* Grab the address of the internal scalar buffer for the scalar attached to x. */ \ buf_alpha = bli_obj_internal_scalar_buffer( &x_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ BLIS_NO_CONJUGATE, /* internal conjugation applied during copy-cast. 
*/ \ diagoffx, \ diagx, \ uplox, \ m, \ n, \ buf_alpha, \ buf_x, rs_x, cs_x, \ cntx, \ rntm \ ); \ } GENFRONT( scalm ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ /* conj_t conjalpha = bli_obj_conj_status( alpha ); */ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ diag_t diagx = bli_obj_diag( x ); \ uplo_t uplox = bli_obj_uplo( x ); \ dim_t m = bli_obj_length( x ); \ dim_t n = bli_obj_width( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ \ void* buf_alpha; \ \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, x ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ BLIS_NO_CONJUGATE, /* internal conjugation applied during copy-cast. 
*/ \ diagoffx, \ diagx, \ uplox, \ m, \ n, \ buf_alpha, \ buf_x, rs_x, cs_x, \ cntx, \ rntm \ ); \ } GENFRONT( setm ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ if ( bli_obj_dt( x ) != bli_obj_dt( y ) ) \ return bli_xpbym_md( x, beta, y ); \ \ num_t dt = bli_obj_dt( x ); \ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ diag_t diagx = bli_obj_diag( x ); \ uplo_t uplox = bli_obj_uplo( x ); \ trans_t transx = bli_obj_conjtrans_status( x ); \ dim_t m = bli_obj_length( y ); \ dim_t n = bli_obj_width( y ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t rs_y = bli_obj_row_stride( y ); \ inc_t cs_y = bli_obj_col_stride( y ); \ \ void* buf_beta; \ \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x, beta, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ beta, &beta_local ); \ buf_beta = bli_obj_buffer_for_1x1( dt, &beta_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ diagoffx, \ diagx, \ uplox, \ transx, \ m, \ n, \ buf_x, rs_x, cs_x, \ buf_beta, \ buf_y, rs_y, cs_y, \ cntx, \ rntm \ ); \ } GENFRONT( xpbym ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dtx = bli_obj_dt( x ); \ num_t dty = bli_obj_dt( y ); \ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ diag_t diagx = bli_obj_diag( x ); \ uplo_t uplox = bli_obj_uplo( x ); \ trans_t transx = bli_obj_conjtrans_status( x ); \ dim_t m = bli_obj_length( y ); \ dim_t n = bli_obj_width( y ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t rs_y = bli_obj_row_stride( y ); \ inc_t cs_y = bli_obj_col_stride( y ); \ \ void* buf_beta; \ \ obj_t beta_local; \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dty, BLIS_NO_CONJUGATE, \ beta, &beta_local ); \ buf_beta = bli_obj_buffer_for_1x1( dty, &beta_local ); \ \ /* Query a (multi) type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( dtx, dty ); \ \ f \ ( \ diagoffx, \ diagx, \ uplox, \ transx, \ m, \ n, \ buf_x, rs_x, cs_x, \ buf_beta, \ buf_y, rs_y, cs_y, \ cntx, \ rntm \ ); \ } GENFRONT( xpbym_md ) #endif cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_oapi.h000066400000000000000000000051431427272030600217500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_oapi_ba.c000066400000000000000000000036701427272030600224100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // omitting expert parameters. #include "bli_oapi_ba.h" // Define the macro protecting the object API definitions. #define BLIS_ENABLE_OAPI // Include the object API definitions here. #include "bli_l1m_oapi.c" cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_oapi_ex.c000066400000000000000000000036661427272030600224470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // having expert parameters. #include "bli_oapi_ex.h" // Define the macro protecting the object API definitions. #define BLIS_ENABLE_OAPI // Include the object API definitions here. #include "bli_l1m_oapi.c" cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_oft_var.h000066400000000000000000000043201427272030600224540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_tapi.c000066400000000000000000000271371427272030600217570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Guard the function definitions so that they are only compiled when // #included from files that define the typed API macros. #ifdef BLIS_ENABLE_TAPI // // Define BLAS-like interfaces with typed operands. 
// #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, auxker ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ diagoffx, \ diagx, \ uplox, \ transx, \ m, \ n, \ x, rs_x, cs_x, \ y, rs_y, cs_y, \ cntx, \ rntm \ ); \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, we handle it with a separate post-processing step. */ \ if ( bli_is_upper_or_lower( uplox ) && \ bli_is_unit_diag( diagx ) ) \ { \ PASTEMAC2(ch,auxker,BLIS_TAPI_EX_SUF) \ ( \ diagoffx, \ diagx, \ transx, \ m, \ n, \ x, rs_x, cs_x, \ y, rs_y, cs_y, \ cntx, \ rntm \ ); \ } \ } INSERT_GENTFUNC_BASIC( addm, addd ) INSERT_GENTFUNC_BASIC( subm, subd ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. 
*/ \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ diagoffx, \ diagx, \ uplox, \ transx, \ m, \ n, \ x, rs_x, cs_x, \ y, rs_y, cs_y, \ cntx, \ rntm \ ); \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, we handle it with a separate post-processing step. */ \ if ( bli_is_upper_or_lower( uplox ) && \ bli_is_unit_diag( diagx ) ) \ { \ doff_t diagoffy = diagoffx; \ ctype* one = PASTEMAC(ch,1); \ \ if ( bli_does_trans( transx ) ) \ bli_negate_diag_offset( &diagoffy ); \ \ PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ diagoffy, \ m, \ n, \ one, \ y, rs_y, cs_y, \ cntx, \ rntm \ ); \ } \ } INSERT_GENTFUNC_BASIC0( copym ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* If alpha is zero, then the entire operation is a no-op. */ \ if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ diagoffx, \ diagx, \ uplox, \ transx, \ m, \ n, \ alpha, \ x, rs_x, cs_x, \ y, rs_y, cs_y, \ cntx, \ rntm \ ); \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, we handle it with a separate post-processing step. 
*/ \ if ( bli_is_upper_or_lower( uplox ) && \ bli_is_unit_diag( diagx ) ) \ { \ PASTEMAC2(ch,axpyd,BLIS_TAPI_EX_SUF) \ ( \ diagoffx, \ diagx, \ transx, \ m, \ n, \ alpha, \ x, rs_x, cs_x, \ y, rs_y, cs_y, \ cntx, \ rntm \ ); \ } \ } INSERT_GENTFUNC_BASIC0( axpym ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* If alpha is zero, then we set the output matrix to zero. This seemingly minor optimization is important because it will clear any NaNs and Infs in x that would otherwise propogate. */ \ if ( PASTEMAC(ch,eq0)( *alpha ) ) \ { \ \ PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ diagoffx, \ diagx, \ uplox, \ m, \ n, \ alpha, \ y, rs_y, cs_y, \ cntx, \ rntm \ ); \ return; \ } \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ diagoffx, \ diagx, \ uplox, \ transx, \ m, \ n, \ alpha, \ x, rs_x, cs_x, \ y, rs_y, cs_y, \ cntx, \ rntm \ ); \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, we handle it with a separate post-processing step. 
*/ \ if ( bli_is_upper_or_lower( uplox ) && \ bli_is_unit_diag( diagx ) ) \ { \ doff_t diagoffy = diagoffx; \ \ if ( bli_does_trans( transx ) ) \ bli_negate_diag_offset( &diagoffy ); \ \ PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ diagoffy, \ m, \ n, \ alpha, \ y, rs_y, cs_y, \ cntx, \ rntm \ ); \ } \ } INSERT_GENTFUNC_BASIC0( scal2m ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ conjalpha, \ diagoffx, \ diagx, \ uplox, \ m, \ n, \ alpha, \ x, rs_x, cs_x, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNC_BASIC0( scalm ) INSERT_GENTFUNC_BASIC0( setm ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* If beta is zero, then the operation reduces to copym. 
*/ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC2(ch,copym,_unb_var1) \ ( \ diagoffx, \ diagx, \ uplox, \ transx, \ m, \ n, \ x, rs_x, cs_x, \ y, rs_y, cs_y, \ cntx, \ rntm \ ); \ \ return; \ } \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ diagoffx, \ diagx, \ uplox, \ transx, \ m, \ n, \ x, rs_x, cs_x, \ beta, \ y, rs_y, cs_y, \ cntx, \ rntm \ ); \ \ /* When the diagonal of an upper- or lower-stored matrix is unit, we handle it with a separate post-processing step. */ \ if ( bli_is_upper_or_lower( uplox ) && \ bli_is_unit_diag( diagx ) ) \ { \ PASTEMAC2(ch,xpbyd,BLIS_TAPI_EX_SUF) \ ( \ diagoffx, \ diagx, \ transx, \ m, \ n, \ x, rs_x, cs_x, \ beta, \ y, rs_y, cs_y, \ cntx, \ rntm \ ); \ } \ } INSERT_GENTFUNC_BASIC0( xpbym ) #undef GENTFUNC2 #define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \ \ void PASTEMAC3(chx,chy,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* If beta is zero, then the operation reduces to copym. */ \ if ( PASTEMAC(chy,eq0)( *beta ) ) \ { \ PASTEMAC2(chx,chy,castm) \ ( \ transx, \ m, \ n, \ x, rs_x, cs_x, \ y, rs_y, cs_y \ ); \ \ return; \ } \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. 
*/ \ PASTEMAC3(chx,chy,opname,_unb_var1) \ ( \ diagoffx, \ diagx, \ uplox, \ transx, \ m, \ n, \ x, rs_x, cs_x, \ beta, \ y, rs_y, cs_y, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNC2_BASIC0( xpbym_md ) INSERT_GENTFUNC2_MIXDP0( xpbym_md ) #endif cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_tapi.h000066400000000000000000000077741427272030600217710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addm ) INSERT_GENTPROT_BASIC0( copym ) INSERT_GENTPROT_BASIC0( subm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpym ) INSERT_GENTPROT_BASIC0( scal2m ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( xpbym ) #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT2_BASIC0( xpbym_md ) INSERT_GENTPROT2_MIXDP0( xpbym_md ) 
cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_tapi_ba.c000066400000000000000000000036661427272030600224220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // omitting expert parameters. #include "bli_tapi_ba.h" // Define the macro protecting the typed API definitions. #define BLIS_ENABLE_TAPI // Include the typed API definitions here. 
#include "bli_l1m_tapi.c" cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_tapi_ex.c000066400000000000000000000036641427272030600224520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // having expert parameters. #include "bli_tapi_ex.h" // Define the macro protecting the typed API definitions. #define BLIS_ENABLE_TAPI // Include the typed API definitions here. 
#include "bli_l1m_tapi.c" cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_unb_var1.c000066400000000000000000000354441427272030600225370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-like interfaces with typed operands. 
//

// Template 1: unblocked variant for operations on x and y that take no
// scalar (instantiated below for addm, copym and subm).
//
// bli_set_dims_incs_uplo_2m() fills in the loop parameters (effective
// uplo, iteration count, maximum vector length, increments, leading
// dimensions, ij0 and n_shift). The function returns immediately if the
// effective uplo is "zeros". Otherwise it queries the level-1v kernel
// identified by kerid from the context and calls it once per iteration j
// on a vector of x and the corresponding vector of y:
//   - dense: n_elem_max elements, starting at offset 0 within vector j;
//   - upper: min( n_shift + j + 1, n_elem_max ) elements of vector ij0+j;
//   - lower: n_elem_max - offi elements of vector j, starting at offset
//     ij0+offi, where offi = max( 0, j - n_shift ).
// The rntm argument is accepted but not referenced in the body.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
\
void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    uplo_t   uplox_eff; \
    conj_t   conjx; \
    dim_t    n_iter; \
    dim_t    n_elem_max; \
    inc_t    ldx, incx; \
    inc_t    ldy, incy; \
    dim_t    ij0, n_shift; \
\
    /* Set various loop parameters. */ \
    bli_set_dims_incs_uplo_2m \
    ( \
      diagoffx, diagx, transx, \
      uplox, m, n, rs_x, cs_x, rs_y, cs_y, \
      &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \
      &ij0, &n_shift \
    ); \
\
    if ( bli_is_zeros( uplox_eff ) ) return; \
\
    /* Extract the conjugation component from the transx parameter. */ \
    conjx = bli_extract_conj( transx ); \
\
    /* Query the kernel needed for this operation. */ \
    PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
\
    /* Handle dense and upper/lower storage cases separately. */ \
    if ( bli_is_dense( uplox_eff ) ) \
    { \
        for ( dim_t j = 0; j < n_iter; ++j ) \
        { \
            const dim_t n_elem = n_elem_max; \
\
            ctype* x1 = x + (j  )*ldx + (0  )*incx; \
            ctype* y1 = y + (j  )*ldy + (0  )*incy; \
\
            /* Invoke the kernel with the appropriate parameters. */ \
            f \
            ( \
              conjx, \
              n_elem, \
              x1, incx, \
              y1, incy, \
              cntx  \
            ); \
        } \
    } \
    else \
    { \
        if ( bli_is_upper( uplox_eff ) ) \
        { \
            for ( dim_t j = 0; j < n_iter; ++j ) \
            { \
                const dim_t n_elem = bli_min( n_shift + j + 1, n_elem_max ); \
\
                ctype* x1 = x + (ij0+j  )*ldx + (0  )*incx; \
                ctype* y1 = y + (ij0+j  )*ldy + (0  )*incy; \
\
                /* Invoke the kernel with the appropriate parameters. */ \
                f \
                ( \
                  conjx, \
                  n_elem, \
                  x1, incx, \
                  y1, incy, \
                  cntx  \
                ); \
            } \
        } \
        else if ( bli_is_lower( uplox_eff ) ) \
        { \
            for ( dim_t j = 0; j < n_iter; ++j ) \
            { \
                /* offi is the number of leading elements skipped in
                   iteration j; it is clamped so it never goes negative. */ \
                const dim_t offi   = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
                const dim_t n_elem = n_elem_max - offi; \
\
                ctype* x1 = x + (j  )*ldx + (ij0+offi  )*incx; \
                ctype* y1 = y + (j  )*ldy + (ij0+offi  )*incy; \
\
                /* Invoke the kernel with the appropriate parameters. */ \
                f \
                ( \
                  conjx, \
                  n_elem, \
                  x1, incx, \
                  y1, incy, \
                  cntx  \
                ); \
            } \
        } \
    } \
}

INSERT_GENTFUNC_BASIC2( addm_unb_var1,  addv,  BLIS_ADDV_KER )
INSERT_GENTFUNC_BASIC2( copym_unb_var1, copyv, BLIS_COPYV_KER )
INSERT_GENTFUNC_BASIC2( subm_unb_var1,  subv,  BLIS_SUBV_KER )


// Template 2: same loop structure as the template above, but the function
// takes a scalar alpha and forwards it to the kernel between n_elem and
// the x operand (instantiated below for axpym and scal2m).
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
\
void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    uplo_t   uplox_eff; \
    conj_t   conjx; \
    dim_t    n_iter; \
    dim_t    n_elem_max; \
    inc_t    ldx, incx; \
    inc_t    ldy, incy; \
    dim_t    ij0, n_shift; \
\
    /* Set various loop parameters. */ \
    bli_set_dims_incs_uplo_2m \
    ( \
      diagoffx, diagx, transx, \
      uplox, m, n, rs_x, cs_x, rs_y, cs_y, \
      &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \
      &ij0, &n_shift \
    ); \
\
    if ( bli_is_zeros( uplox_eff ) ) return; \
\
    /* Extract the conjugation component from the transx parameter. */ \
    conjx = bli_extract_conj( transx ); \
\
    /* Query the kernel needed for this operation. */ \
    PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
\
    /* Handle dense and upper/lower storage cases separately. */ \
    if ( bli_is_dense( uplox_eff ) ) \
    { \
        for ( dim_t j = 0; j < n_iter; ++j ) \
        { \
            const dim_t n_elem = n_elem_max; \
\
            ctype* x1 = x + (j  )*ldx + (0  )*incx; \
            ctype* y1 = y + (j  )*ldy + (0  )*incy; \
\
            /* Invoke the kernel with the appropriate parameters. */ \
            f \
            ( \
              conjx, \
              n_elem, \
              alpha, \
              x1, incx, \
              y1, incy, \
              cntx  \
            ); \
        } \
    } \
    else \
    { \
        if ( bli_is_upper( uplox_eff ) ) \
        { \
            for ( dim_t j = 0; j < n_iter; ++j ) \
            { \
                const dim_t n_elem = bli_min( n_shift + j + 1, n_elem_max ); \
\
                ctype* x1 = x + (ij0+j  )*ldx + (0  )*incx; \
                ctype* y1 = y + (ij0+j  )*ldy + (0  )*incy; \
\
                /* Invoke the kernel with the appropriate parameters. */ \
                f \
                ( \
                  conjx, \
                  n_elem, \
                  alpha, \
                  x1, incx, \
                  y1, incy, \
                  cntx  \
                ); \
            } \
        } \
        else if ( bli_is_lower( uplox_eff ) ) \
        { \
            for ( dim_t j = 0; j < n_iter; ++j ) \
            { \
                const dim_t offi   = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
                const dim_t n_elem = n_elem_max - offi; \
\
                ctype* x1 = x + (j  )*ldx + (ij0+offi  )*incx; \
                ctype* y1 = y + (j  )*ldy + (ij0+offi  )*incy; \
\
                /* Invoke the kernel with the appropriate parameters. */ \
                f \
                ( \
                  conjx, \
                  n_elem, \
                  alpha, \
                  x1, incx, \
                  y1, incy, \
                  cntx  \
                ); \
            } \
        } \
    } \
}

INSERT_GENTFUNC_BASIC2( axpym_unb_var1,  axpyv,  BLIS_AXPYV_KER )
INSERT_GENTFUNC_BASIC2( scal2m_unb_var1, scal2v, BLIS_SCAL2V_KER )


// Template 3: single-operand variant (instantiated below for scalm and
// setm). There is no y and no transx; the loop parameters come from
// bli_set_dims_incs_uplo_1m(), and the caller-supplied conjalpha is passed
// to the kernel as its first argument, followed by n_elem, alpha and the
// current vector of x. The dense/upper/lower partitioning of x matches
// the two-operand templates above.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjalpha, \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    uplo_t   uplox_eff; \
    dim_t    n_iter; \
    dim_t    n_elem_max; \
    inc_t    ldx, incx; \
    dim_t    ij0, n_shift; \
\
    /* Set various loop parameters. */ \
    bli_set_dims_incs_uplo_1m \
    ( \
      diagoffx, diagx, \
      uplox, m, n, rs_x, cs_x, \
      &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, \
      &ij0, &n_shift \
    ); \
\
    if ( bli_is_zeros( uplox_eff ) ) return; \
\
    /* Query the kernel needed for this operation. */ \
    PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
\
    /* Handle dense and upper/lower storage cases separately. */ \
    if ( bli_is_dense( uplox_eff ) ) \
    { \
        for ( dim_t j = 0; j < n_iter; ++j ) \
        { \
            const dim_t n_elem = n_elem_max; \
\
            ctype* x1 = x + (j  )*ldx + (0  )*incx; \
\
            /* Invoke the kernel with the appropriate parameters. */ \
            f \
            ( \
              conjalpha, \
              n_elem, \
              alpha, \
              x1, incx, \
              cntx  \
            ); \
        } \
    } \
    else \
    { \
        if ( bli_is_upper( uplox_eff ) ) \
        { \
            for ( dim_t j = 0; j < n_iter; ++j ) \
            { \
                const dim_t n_elem = bli_min( n_shift + j + 1, n_elem_max ); \
\
                ctype* x1 = x + (ij0+j  )*ldx + (0  )*incx; \
\
                /* Invoke the kernel with the appropriate parameters. */ \
                f \
                ( \
                  conjalpha, \
                  n_elem, \
                  alpha, \
                  x1, incx, \
                  cntx  \
                ); \
            } \
        } \
        else if ( bli_is_lower( uplox_eff ) ) \
        { \
            for ( dim_t j = 0; j < n_iter; ++j ) \
            { \
                const dim_t offi   = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
                const dim_t n_elem = n_elem_max - offi; \
\
                ctype* x1 = x + (j  )*ldx + (ij0+offi  )*incx; \
\
                /* Invoke the kernel with the appropriate parameters. */ \
                f \
                ( \
                  conjalpha, \
                  n_elem, \
                  alpha, \
                  x1, incx, \
                  cntx  \
                ); \
            } \
        } \
    } \
}

INSERT_GENTFUNC_BASIC2( scalm_unb_var1, scalv, BLIS_SCALV_KER )
INSERT_GENTFUNC_BASIC2( setm_unb_var1,  setv,  BLIS_SETV_KER )


// Template 4: two-operand variant whose scalar (beta) is passed to the
// kernel between the x and y operands (instantiated below for xpbym).
// Loop structure is otherwise the same as templates 1 and 2.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, kername, kerid ) \
\
void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  beta, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    uplo_t   uplox_eff; \
    conj_t   conjx; \
    dim_t    n_iter; \
    dim_t    n_elem_max; \
    inc_t    ldx, incx; \
    inc_t    ldy, incy; \
    dim_t    ij0, n_shift; \
\
    /* Set various loop parameters. */ \
    bli_set_dims_incs_uplo_2m \
    ( \
      diagoffx, diagx, transx, \
      uplox, m, n, rs_x, cs_x, rs_y, cs_y, \
      &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \
      &ij0, &n_shift \
    ); \
\
    if ( bli_is_zeros( uplox_eff ) ) return; \
\
    /* Extract the conjugation component from the transx parameter. */ \
    conjx = bli_extract_conj( transx ); \
\
    /* Query the kernel needed for this operation. */ \
    PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
\
    /* Handle dense and upper/lower storage cases separately. */ \
    if ( bli_is_dense( uplox_eff ) ) \
    { \
        for ( dim_t j = 0; j < n_iter; ++j ) \
        { \
            const dim_t n_elem = n_elem_max; \
\
            ctype* x1 = x + (j  )*ldx + (0  )*incx; \
            ctype* y1 = y + (j  )*ldy + (0  )*incy; \
\
            /* Invoke the kernel with the appropriate parameters. */ \
            f \
            ( \
              conjx, \
              n_elem, \
              x1, incx, \
              beta, \
              y1, incy, \
              cntx  \
            ); \
        } \
    } \
    else \
    { \
        if ( bli_is_upper( uplox_eff ) ) \
        { \
            for ( dim_t j = 0; j < n_iter; ++j ) \
            { \
                const dim_t n_elem = bli_min( n_shift + j + 1, n_elem_max ); \
\
                ctype* x1 = x + (ij0+j  )*ldx + (0  )*incx; \
                ctype* y1 = y + (ij0+j  )*ldy + (0  )*incy; \
\
                /* Invoke the kernel with the appropriate parameters. */ \
                f \
                ( \
                  conjx, \
                  n_elem, \
                  x1, incx, \
                  beta, \
                  y1, incy, \
                  cntx  \
                ); \
            } \
        } \
        else if ( bli_is_lower( uplox_eff ) ) \
        { \
            for ( dim_t j = 0; j < n_iter; ++j ) \
            { \
                const dim_t offi   = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
                const dim_t n_elem = n_elem_max - offi; \
\
                ctype* x1 = x + (j  )*ldx + (ij0+offi  )*incx; \
                ctype* y1 = y + (j  )*ldy + (ij0+offi  )*incy; \
\
                /* Invoke the kernel with the appropriate parameters. */ \
                f \
                ( \
                  conjx, \
                  n_elem, \
                  x1, incx, \
                  beta, \
                  y1, incy, \
                  cntx  \
                ); \
            } \
        } \
    } \
}

INSERT_GENTFUNC_BASIC2( xpbym_unb_var1,  xpbyv,  BLIS_XPBYV_KER )


// Template 5: two-datatype xpbym (x is ctype_x; beta and y are ctype_y).
// Unlike the templates above, no kernel is queried from the context: the
// element loops are written out here using the adds and xpbys scalar
// macros. When *beta compares equal to one (eq1) the adds form is used,
// otherwise xpbys is used with *beta. Each form has a path that indexes
// x1[i]/y1[i] when both increments are 1, and a general path that steps
// chi1/psi1 by incx/incy.
//
// NOTE(review): as written, every path iterates over n_iter vectors of
// n_elem_max elements starting at offset 0; uplox_eff, ij0 and n_shift
// are computed but not consulted afterwards, there is no early return for
// the "zeros" case, the conjugation extraction is commented out, and cntx
// and rntm are not referenced. Confirm this is intentional before relying
// on this variant for non-dense or conjugated operands.
#undef  GENTFUNC2
#define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \
\
void PASTEMAC2(chx,chy,opname) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       trans_t  transx, \
       dim_t    m, \
       dim_t    n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     ) \
{ \
    uplo_t   uplox_eff; \
    dim_t    n_iter; \
    dim_t    n_elem_max; \
    inc_t    ldx, incx; \
    inc_t    ldy, incy; \
    dim_t    ij0, n_shift; \
\
    /* Set various loop parameters. */ \
    bli_set_dims_incs_uplo_2m \
    ( \
      diagoffx, diagx, transx, \
      uplox, m, n, rs_x, cs_x, rs_y, cs_y, \
      &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \
      &ij0, &n_shift \
    ); \
\
    /* Extract the conjugation component from the transx parameter. */ \
    /*conjx = bli_extract_conj( transx );*/ \
\
    /* Handle dense and upper/lower storage cases separately. */ \
    if ( PASTEMAC(chy,eq1)( *beta ) ) \
    { \
        if ( incx == 1 && incy == 1 ) \
        { \
            const dim_t n_elem = n_elem_max; \
\
            for ( dim_t j = 0; j < n_iter; ++j ) \
            { \
                ctype_x* restrict x1 = x + (j  )*ldx + (0  )*incx; \
                ctype_y* restrict y1 = y + (j  )*ldy + (0  )*incy; \
\
                for ( dim_t i = 0; i < n_elem; ++i ) \
                { \
                    PASTEMAC2(chx,chy,adds)( x1[i], y1[i] ); \
                } \
            } \
        } \
        else \
        { \
            const dim_t n_elem = n_elem_max; \
\
            for ( dim_t j = 0; j < n_iter; ++j ) \
            { \
                ctype_x* restrict x1 = x + (j  )*ldx + (0  )*incx; \
                ctype_y* restrict y1 = y + (j  )*ldy + (0  )*incy; \
\
                ctype_x* restrict chi1 = x1; \
                ctype_y* restrict psi1 = y1; \
\
                for ( dim_t i = 0; i < n_elem; ++i ) \
                { \
                    PASTEMAC2(chx,chy,adds)( *chi1, *psi1 ); \
\
                    chi1 += incx; \
                    psi1 += incy; \
                } \
            } \
        } \
    } \
    else /* ( !PASTEMAC(chy,eq1)( *beta ) ) */ \
    { \
        if ( incx == 1 && incy == 1 ) \
        { \
            const dim_t n_elem = n_elem_max; \
\
            for ( dim_t j = 0; j < n_iter; ++j ) \
            { \
                ctype_x* restrict x1 = x + (j  )*ldx + (0  )*incx; \
                ctype_y* restrict y1 = y + (j  )*ldy + (0  )*incy; \
\
                for ( dim_t i = 0; i < n_elem; ++i ) \
                { \
                    PASTEMAC3(chx,chy,chy,xpbys)( x1[i], *beta, y1[i] ); \
                } \
            } \
        } \
        else \
        { \
            const dim_t n_elem = n_elem_max; \
\
            for ( dim_t j = 0; j < n_iter; ++j ) \
            { \
                ctype_x* restrict x1 = x + (j  )*ldx + (0  )*incx; \
                ctype_y* restrict y1 = y + (j  )*ldy + (0  )*incy; \
\
                ctype_x* restrict chi1 = x1; \
                ctype_y* restrict psi1 = y1; \
\
                for ( dim_t i = 0; i < n_elem; ++i ) \
                { \
                    PASTEMAC3(chx,chy,chy,xpbys)( *chi1, *beta, *psi1 ); \
\
                    chi1 += incx; \
                    psi1 += incy; \
                } \
            } \
        } \
    } \
}

INSERT_GENTFUNC2_BASIC0( xpbym_md_unb_var1 )
INSERT_GENTFUNC2_MIXDP0( xpbym_md_unb_var1 )

cython-blis-0.9.1/blis/_src/frame/1m/bli_l1m_unb_var1.h000066400000000000000000000100101427272030600225220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( addm ) INSERT_GENTPROT_BASIC0( copym ) INSERT_GENTPROT_BASIC0( subm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( axpym ) INSERT_GENTPROT_BASIC0( scal2m ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( xpbym ) #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ void PASTEMAC3(chx,chy,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT2_BASIC0( xpbym_md ) INSERT_GENTPROT2_MIXDP0( xpbym_md ) 
cython-blis-0.9.1/blis/_src/frame/1m/other/000077500000000000000000000000001427272030600203665ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/1m/other/bli_scalm.h000066400000000000000000000032461427272030600224710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "bli_scalm_cntl.h" cython-blis-0.9.1/blis/_src/frame/1m/other/bli_scalm_cntl.c000066400000000000000000000042051427272030600235000ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" cntl_t* bli_scalm_cntl_create_node ( void_fp var_func, cntl_t* sub_node ) { cntl_t* cntl; // It's important that we set the bszid field to BLIS_NO_PART to indicate // that no blocksize partitioning is performed. 
bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. cntl = bli_cntl_create_node ( BLIS_NOID, BLIS_NO_PART, var_func, NULL, sub_node ); return cntl; } cython-blis-0.9.1/blis/_src/frame/1m/other/bli_scalm_cntl.h000066400000000000000000000033551427272030600235120ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ cntl_t* bli_scalm_cntl_create_node ( void_fp var_func, cntl_t* sub_node ); cython-blis-0.9.1/blis/_src/frame/1m/other/bli_scalm_int.c000066400000000000000000000064341427272030600233400ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define FUNCPTR_T scalm_fp typedef void (*FUNCPTR_T)( obj_t* alpha, obj_t* x, cntx_t* cntx ); static FUNCPTR_T vars[1][3] = { // unblocked optimized unblocked blocked { bli_scalm_ex, bli_scalm_ex, NULL } }; void bli_scalm_int( obj_t* alpha, obj_t* x, cntx_t* cntx, scalm_t* cntl ) { //obj_t x_local; varnum_t n; impl_t i; FUNCPTR_T f; // Return early if one of the matrix operands has a zero dimension. if ( bli_obj_has_zero_dim( x ) ) return; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_scalm_check( alpha, x ); // First check if we are to skip this operation. if ( bli_cntl_is_noop( cntl ) ) return; // Return early if both alpha and the scalar attached to x are unit. if ( bli_obj_equals( alpha, &BLIS_ONE ) && bli_obj_scalar_equals( x, &BLIS_ONE ) ) return; // // This code has been disabled since we've now added the alpha // parameter back to the object interface to the underlying // scalm variant. // // Alias x to x_local so we can apply alpha if it is non-unit. //bli_obj_alias_to( *x, x_local ); // If alpha is non-unit, apply it to the scalar attached to x. //if ( !bli_obj_equals( alpha, &BLIS_ONE ) ) //{ // bli_obj_scalar_apply_scalar( alpha, &x_local ); //} // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; // Invoke the variant. f( alpha, x, cntx ); } cython-blis-0.9.1/blis/_src/frame/1m/other/bli_scalm_int.h000066400000000000000000000034261427272030600233430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_scalm_int( obj_t* alpha, obj_t* x, cntx_t* cntx, scalm_t* cntl ); cython-blis-0.9.1/blis/_src/frame/1m/packm/000077500000000000000000000000001427272030600203405ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm.h000066400000000000000000000042221427272030600224320ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_packm_alloc.h" #include "bli_packm_cntl.h" #include "bli_packm_check.h" #include "bli_packm_init.h" #include "bli_packm_int.h" #include "bli_packm_scalar.h" #include "bli_packm_part.h" #include "bli_packm_struc_cxk.h" #include "bli_packm_struc_cxk_1er.h" #include "bli_packm_cxk.h" #include "bli_packm_cxk_1er.h" // Mixed datatype support. 
#ifdef BLIS_ENABLE_GEMM_MD #include "bli_packm_struc_cxk_md.h" #endif #include "bli_packm_blk_var1.h" cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_alloc.c000066400000000000000000000067441427272030600236120ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

// Convenience wrapper around bli_packm_alloc_ex(): the pack buffer type is
// read from the packm parameters of the control tree node and every other
// argument is forwarded unchanged. Returns whatever bli_packm_alloc_ex()
// returns.
void* bli_packm_alloc
     (
       siz_t      size_needed,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Query the pack buffer type from the control tree node.
	packbuf_t pack_buf_type = bli_cntl_packm_params_pack_buf_type( cntl );

	return bli_packm_alloc_ex
	(
	  size_needed,
	  pack_buf_type,
	  rntm,
	  cntl,
	  thread
	);
}

// Return the buffer of the mem_t entry stored in the control tree node,
// first replacing that entry if it is unallocated or if its recorded size
// is smaller than size_needed.
//
// When a replacement is needed, only the thread for which
// bli_thread_am_ochief() is true releases the old block (if one was
// allocated) and acquires a new one of type pack_buf_type for size_needed.
// The address of that thread's local mem_t is then handed to the other
// threads through bli_thread_broadcast(), and each caller copies the mem_t
// into its own control tree node.
//
// NOTE(review): the broadcast/barrier pair suggests that all threads sharing
// this thrinfo_t node must make this call together -- confirm with callers.
void* bli_packm_alloc_ex
     (
       siz_t      size_needed,
       packbuf_t  pack_buf_type,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Query the address of the mem_t entry within the control tree node.
	mem_t* cntl_mem_p = bli_cntl_pack_mem( cntl );

	mem_t* local_mem_p;
	mem_t  local_mem_s;

	// An unallocated entry is treated as having size zero, so that any
	// non-zero request takes the (re)allocation path below.
	siz_t cntl_mem_size = 0;

	if ( bli_mem_is_alloc( cntl_mem_p ) )
		cntl_mem_size = bli_mem_size( cntl_mem_p );

	if ( cntl_mem_size < size_needed )
	{
		if ( bli_thread_am_ochief( thread ) )
		{
			// The chief thread releases the existing block associated with
			// the mem_t entry in the control tree, and then re-acquires a
			// new block, saving the associated mem_t entry to local_mem_s.
			if ( bli_mem_is_alloc( cntl_mem_p ) )
			{
				bli_pba_release
				(
				  rntm,
				  cntl_mem_p
				);
			}

			bli_pba_acquire_m
			(
			  rntm,
			  size_needed,
			  pack_buf_type,
			  &local_mem_s
			);
		}

		// Broadcast the address of the chief thread's local mem_t entry to
		// all threads.
		local_mem_p = bli_thread_broadcast( thread, &local_mem_s );

		// Save the chief thread's local mem_t entry to the mem_t field in
		// this thread's control tree node.
		*cntl_mem_p = *local_mem_p;

		// Barrier so that the master thread doesn't return from the function
		// before we are done reading. (local_mem_s is a local variable of the
		// chief thread's call, so it must outlive every other thread's copy
		// above.)
		bli_thread_barrier( thread );
	}

	return bli_mem_buffer( cntl_mem_p );
}

cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_alloc.h000066400000000000000000000037531427272030600236140ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ BLIS_EXPORT_BLIS void* bli_packm_alloc ( siz_t size_needed, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void* bli_packm_alloc_ex ( siz_t size_needed, packbuf_t pack_buf_type, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_blk_var1.c000066400000000000000000000263761427272030600242240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

// Default packm kernels, one func_t per pack schema index. Within each
// func_t the entries are ordered by datatype as shown in the column header
// below; the 1e/1r rows provide kernels for the complex datatypes only
// (the real-domain slots are NULL).
static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
{
	/* float (0)  scomplex (1)  double (2)  dcomplex (3) */
	// 0000 row/col panels
	{ { bli_spackm_struc_cxk,      bli_cpackm_struc_cxk,
	    bli_dpackm_struc_cxk,      bli_zpackm_struc_cxk,      } },
	// 0001 row/col panels: 1m-expanded (1e)
	{ { NULL,                      bli_cpackm_struc_cxk_1er,
	    NULL,                      bli_zpackm_struc_cxk_1er,  } },
	// 0010 row/col panels: 1m-reordered (1r)
	{ { NULL,                      bli_cpackm_struc_cxk_1er,
	    NULL,                      bli_zpackm_struc_cxk_1er,  } },
};

// Table of mixed-datatype packm kernels generated by GENARRAY2_ALL; it is
// indexed below as packm_struc_cxk_md[ dt_c ][ dt_p ].
static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md);

// Pack the matrix object c into the packed object p, one micropanel per
// loop iteration.
//
// Outline, as implemented below:
//   1. Read the schema and the invert-diagonal / reverse-iteration flags
//      from the packm control tree node.
//   2. Call bli_packm_init(); return immediately if it returns false.
//   3. Select the packing kernel: the default table entry for the schema
//      and the datatype of p, replaced by the mixed-datatype table entry
//      when the datatypes of c and p differ, and finally replaced by a
//      kernel stored in the obj_t's packm params if one is present.
//   4. Walk n_iter = ceil( iter_dim / panel_dim_max ) micropanels, in
//      reverse order when reverse iteration was requested for a triangular
//      matrix stored in the matching triangle. Every thread walks all
//      iterations so that p_begin is advanced consistently, but a thread
//      only invokes the kernel for iterations that my_iter assigns to it.
//
// For triangular matrices, completely unstored panels are skipped without
// advancing p_begin, and diagonal-intersecting panels are packed with a
// shortened panel length and advance p_begin by a recomputed stride rather
// than by the full panel stride ps_p.
void bli_packm_blk_var1
     (
       obj_t*     c,
       obj_t*     p,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     )
{
	// Extract various fields from the control tree.
	pack_t schema  = bli_cntl_packm_params_pack_schema( cntl );
	bool   invdiag = bli_cntl_packm_params_does_invert_diag( cntl );
	bool   revifup = bli_cntl_packm_params_rev_iter_if_upper( cntl );
	bool   reviflo = bli_cntl_packm_params_rev_iter_if_lower( cntl );

	// Every thread initializes p and determines the size of memory
	// block needed (which gets embedded into the otherwise "blank" mem_t
	// entry in the control tree node). Return early if no packing is required.
	if ( !bli_packm_init( c, p, cntx, rntm, cntl, thread ) )
		return;

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_packm_int_check( c, p, cntx );

	// Datatypes of the source and packed objects and their element sizes.
	// The sizes are used below to convert element offsets into offsets on
	// the char* buffers.
	num_t   dt_c           = bli_obj_dt( c );
	dim_t   dt_c_size      = bli_dt_size( dt_c );

	num_t   dt_p           = bli_obj_dt( p );
	dim_t   dt_p_size      = bli_dt_size( dt_p );

	struc_t strucc         = bli_obj_struc( c );
	doff_t  diagoffc       = bli_obj_diag_offset( c );
	diag_t  diagc          = bli_obj_diag( c );
	uplo_t  uploc          = bli_obj_uplo( c );
	conj_t  conjc          = bli_obj_conj_status( c );

	// The loop below iterates over the length of p in steps of
	// panel_dim_max; the width of p gives the (full and padded) length of
	// each micropanel.
	dim_t   iter_dim       = bli_obj_length( p );
	dim_t   panel_len_full = bli_obj_width( p );
	dim_t   panel_len_max  = bli_obj_padded_width( p );

	char*   c_cast         = bli_obj_buffer_at_off( c );
	inc_t   incc           = bli_obj_row_stride( c );
	inc_t   ldc            = bli_obj_col_stride( c );
	dim_t   panel_dim_off  = bli_obj_row_off( c );
	dim_t   panel_len_off  = bli_obj_col_off( c );

	char*   p_cast         = bli_obj_buffer( p );
	inc_t   ldp            = bli_obj_col_stride( p );
	inc_t   is_p           = bli_obj_imag_stride( p );
	dim_t   panel_dim_max  = bli_obj_panel_dim( p );
	inc_t   ps_p           = bli_obj_panel_stride( p );

	// The diagonal offset changes by one panel dimension per micropanel.
	doff_t  diagoffc_inc   = ( doff_t )panel_dim_max;

	obj_t   kappa_local;
	char*   kappa_cast     = bli_packm_scalar( &kappa_local, p );

	// We use the default lookup table to determine the right func_t
	// for the current schema.
	func_t* packm_kers = &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ];

	// Query the datatype-specific function pointer from the func_t object.
	packm_ker_vft packm_ker_cast = bli_func_get_dt( dt_p, packm_kers );

	// For mixed-precision gemm, select the proper kernel (only dense panels).
	if ( dt_c != dt_p )
	{
		packm_ker_cast = packm_struc_cxk_md[ dt_c ][ dt_p ];
	}

	// Query the address of the packm params field of the obj_t. The user might
	// have set this field in order to specify a custom packm kernel.
	packm_blk_var1_params_t* params = bli_obj_pack_params( c );

	if ( params && params->ukr_fn[ dt_c ][ dt_p ] )
	{
		// Query the user-provided packing kernel from the obj_t. If provided,
		// this overrides the kernel determined above.
		packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ];
	}

	/* Compute the total number of iterations we'll need. */
	dim_t n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 );

	/* Set the initial values and increments for indices related to C and P
	   based on whether reverse iteration was requested. */
	dim_t  ic0, ip0;
	doff_t ic_inc, ip_inc;

	if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) ||
	     ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) )
	{
		ic0    = (n_iter - 1) * panel_dim_max;
		ic_inc = -panel_dim_max;
		ip0    = n_iter - 1;
		ip_inc = -1;
	}
	else
	{
		ic0    = 0;
		ic_inc = panel_dim_max;
		ip0    = 0;
		ip_inc = 1;
	}

	// Query the number of threads and thread ids from the current thread's
	// packm thrinfo_t node.
	const dim_t nt  = bli_thread_n_way( thread );
	const dim_t tid = bli_thread_work_id( thread );

	// Determine the thread range and increment using the current thread's
	// packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
	// will depend on whether slab or round-robin partitioning was requested
	// at configure-time.
	// (it_inc is filled in by the call but is not referenced again in this
	// function; the loop below always steps it by one.)
	dim_t it_start, it_end, it_inc;
	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );

	char* p_begin = p_cast;

	// Iterate over every logical micropanel in the source matrix.
	for ( dim_t ic  = ic0,    ip  = ip0,    it  = 0; it < n_iter;
	            ic += ic_inc, ip += ip_inc, it += 1 )
	{
		dim_t  panel_dim_i     = bli_min( panel_dim_max, iter_dim - ic );
		dim_t  panel_dim_off_i = panel_dim_off + ic;

		doff_t diagoffc_i      = diagoffc + (ip  )*diagoffc_inc;
		char*  c_begin         = c_cast   + (ic  )*incc*dt_c_size;

		// Default advance of p_begin for this iteration: one full panel
		// stride. The diagonal-intersecting case below overrides it.
		inc_t  p_inc           = ps_p;

		// NOTE: We MUST use round-robin partitioning when packing
		// micropanels of a triangular matrix. Hermitian/symmetric
		// and general packing may use slab or round-robin, depending
		// on which was selected at configure-time.
		// The definition of bli_packm_my_iter() will depend on whether slab
		// or round-robin partitioning was requested at configure-time.
		bool my_iter = bli_is_triangular( strucc )
			? bli_packm_my_iter_rr( it, it_start, it_end, tid, nt )
			: bli_packm_my_iter   ( it, it_start, it_end, tid, nt );

		if ( bli_is_triangular( strucc ) &&
		     bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i, panel_len_full ) )
		{
			// This case executes if the panel belongs to a triangular
			// matrix AND is completely unstored (ie: zero). If the panel
			// is unstored, we do nothing. (Notice that we don't even
			// increment p_begin.)
			continue;
		}
		else if ( bli_is_triangular( strucc ) &&
		          bli_intersects_diag_n( diagoffc_i, panel_dim_i, panel_len_full ) )
		{
			// This case executes if the panel belongs to a triangular
			// matrix AND is diagonal-intersecting. Notice that we
			// cannot bury the following conditional logic into
			// packm_struc_cxk() because we need to know the value of
			// panel_len_max_i so we can properly increment p_inc.

			// Sanity check. Diagonals should not intersect the short end of
			// a micro-panel. If they do, then the constraint that cache
			// blocksizes be whole multiples of the register blocksizes was
			// somehow violated.
			if ( diagoffc_i < 0 )
				bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );

			dim_t panel_off_i;
			dim_t panel_len_i;
			dim_t panel_len_max_i;

			if ( bli_is_lower( uploc ) )
			{
				// Lower case: the panel starts at column 0 and runs through
				// the end of its diagonal block.
				panel_off_i     = 0;
				panel_len_i     = bli_abs( diagoffc_i ) + panel_dim_i;
				panel_len_max_i = bli_min( bli_abs( diagoffc_i ) + panel_dim_max,
				                           panel_len_max );
			}
			else // if ( bli_is_upper( uploc ) )
			{
				// Upper case: the panel starts at the diagonal offset and
				// runs to the end of the full/padded panel length.
				panel_off_i     = bli_abs( diagoffc_i );
				panel_len_i     = panel_len_full - panel_off_i;
				panel_len_max_i = panel_len_max  - panel_off_i;
			}

			dim_t panel_len_off_i = panel_off_i + panel_len_off;

			char* c_use           = c_begin + (panel_off_i  )*ldc*dt_c_size;
			char* p_use           = p_begin;

			// We need to re-compute the imaginary stride as a function of
			// panel_len_max_i since triangular packed matrices have panels
			// of varying lengths. NOTE: This imaginary stride value is
			// only referenced by the packm kernels for induced methods.
			inc_t is_p_use = ldp * panel_len_max_i;

			// We nudge the imaginary stride up by one if it is odd.
			is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 );

			if ( my_iter )
			{
				packm_ker_cast( strucc,
				                diagc,
				                uploc,
				                conjc,
				                schema,
				                invdiag,
				                panel_dim_i,
				                panel_len_i,
				                panel_dim_max,
				                panel_len_max_i,
				                panel_dim_off_i,
				                panel_len_off_i,
				                kappa_cast,
				                c_use, incc, ldc,
				                p_use,       ldp,
				                       is_p_use,
				                cntx,
				                params );
			}

			// NOTE: This value is usually LESS than ps_p because triangular
			// matrices usually have several micro-panels that are shorter
			// than a "full" micro-panel.
			p_inc = is_p_use;
		}
		else
		{
			// This case executes if the panel is either dense, or belongs
			// to a Hermitian or symmetric matrix, which includes stored,
			// unstored, and diagonal-intersecting panels.
			// A triangular matrix can only reach this branch with a panel
			// that passed neither test above, so its structure is passed to
			// the kernel as BLIS_GENERAL.

			if ( my_iter )
			{
				packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc,
				                diagc,
				                uploc,
				                conjc,
				                schema,
				                invdiag,
				                panel_dim_i,
				                panel_len_full,
				                panel_dim_max,
				                panel_len_max,
				                panel_dim_off_i,
				                panel_len_off,
				                kappa_cast,
				                c_begin, incc, ldc,
				                p_begin,       ldp, is_p,
				                cntx,
				                params );
			}
		}

		// Advance to the next micropanel of p (p_inc is in elements of p).
		p_begin += p_inc*dt_p_size;
	}
}

cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_blk_var1.h000066400000000000000000000041471427272030600242210ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
  contributors may be used to endorse or promote products derived
  from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

//
// packm params types.
//

// Table of packm kernels that can be attached to an obj_t. In
// bli_packm_blk_var1() a non-NULL entry replaces the kernel that would
// otherwise be selected for that datatype pair.
typedef struct
{
	// The first index is the datatype of C (the source object) and the
	// second index is the datatype of P (the packed object).
	//            Type of C          Type of P
	packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;

//
// Prototype object-based interfaces.
//

// Exported packm variant; the last parameter is named "t" here and "thread"
// in the definition.
BLIS_EXPORT_BLIS void bli_packm_blk_var1
     (
       obj_t*     c,
       obj_t*     p,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* t
     );

cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_check.c000066400000000000000000000054021427272030600235630ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_packm_init_check ( obj_t* a, obj_t* p, cntx_t* cntx ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( a ); bli_check_error_code( e_val ); // Check control tree pointer. // NOTE: We can't check the control tree because we interpret a NULL // value (in bli_packm_int()) as a request to skip the operation. //e_val = bli_check_valid_cntl( ( void* )cntl ); //bli_check_error_code( e_val ); } void bli_packm_int_check ( obj_t* a, obj_t* p, cntx_t* cntx ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( a ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( p ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_conformal_dims( a, p ); bli_check_error_code( e_val ); // Check control tree pointer. 
// NOTE: We can't check the control tree because we interpret a NULL // value (in bli_packm_int()) as a request to skip the operation. //e_val = bli_check_valid_cntl( ( void* )cntl ); //bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_check.h000066400000000000000000000035141427272030600235720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ void bli_packm_init_check ( obj_t* a, obj_t* p, cntx_t* cntx ); void bli_packm_int_check ( obj_t* a, obj_t* p, cntx_t* cntx ); cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_cntl.c000066400000000000000000000062251427272030600234520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, bszid_t bmid_m, bszid_t bmid_n, bool does_invert_diag, bool rev_iter_if_upper, bool rev_iter_if_lower, pack_t pack_schema, packbuf_t pack_buf_type, cntl_t* sub_node ) { cntl_t* cntl; packm_params_t* params; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_packm_cntl_create_node(): " ); #endif // Allocate a packm_params_t struct. params = bli_sba_acquire( rntm, sizeof( packm_params_t ) ); // Initialize the packm_params_t struct. params->size = sizeof( packm_params_t ); params->bmid_m = bmid_m; params->bmid_n = bmid_n; params->does_invert_diag = does_invert_diag; params->rev_iter_if_upper = rev_iter_if_upper; params->rev_iter_if_lower = rev_iter_if_lower; params->pack_schema = pack_schema; params->pack_buf_type = pack_buf_type; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_packm_cntl_create_node(): " ); #endif // It's important that we set the bszid field to BLIS_NO_PART to indicate // that no blocksize partitioning is performed. bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. cntl = bli_cntl_create_node ( rntm, BLIS_NOID, BLIS_NO_PART, var_func, params, sub_node ); return cntl; } cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_cntl.h000066400000000000000000000070301427272030600234520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ struct packm_params_s { uint64_t size; // size field must be present and come first. 
bszid_t bmid_m; bszid_t bmid_n; bool does_invert_diag; bool rev_iter_if_upper; bool rev_iter_if_lower; pack_t pack_schema; packbuf_t pack_buf_type; }; typedef struct packm_params_s packm_params_t; BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m; } BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n; } BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower; } BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema; } BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type; } // ----------------------------------------------------------------------------- cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, bszid_t bmid_m, bszid_t bmid_n, bool does_invert_diag, bool rev_iter_if_upper, bool rev_iter_if_lower, pack_t pack_schema, packbuf_t pack_buf_type, cntl_t* sub_node ); cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_cxk.c000066400000000000000000000155441427272030600233030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

// Template for the typed packm_cxk functions; INSERT_GENTFUNC_BASIC0 below
// instantiates it for opname = packm_cxk.
//
// Each instance packs one micropanel: a panel_dim x panel_len submatrix of a
// (strides inca, lda) is written to p (leading dimension ldp), scaled by
// *kappa and with the conjugation given by conja. The context is queried for
// a packm kernel using panel_dim_max as the kernel id:
//   - if a kernel is returned, it is called and is responsible for the
//     whole micropanel, including the zero-filled edges;
//   - otherwise the copy is done with scal2m, and the rows beyond panel_dim
//     and the columns beyond panel_len are then zeroed explicitly with
//     set0s_mxn.
// NOTE: comments inside the macro body must use the block-comment form,
// because every line of the body ends in a line continuation.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t  conja, \
       pack_t  schema, \
       dim_t   panel_dim, \
       dim_t   panel_dim_max, \
       dim_t   panel_len, \
       dim_t   panel_len_max, \
       ctype*  kappa, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  p,             inc_t ldp, \
       cntx_t* cntx  \
     ) \
{ \
	/* Note that we use panel_dim_max, not panel_dim, to query the packm
	   kernel function pointer. This means that we always use the same
	   kernel, even for edge cases. */ \
	num_t     dt     = PASTEMAC(ch,type); \
	l1mkr_t   ker_id = panel_dim_max; \
\
	PASTECH2(ch,opname,_ker_ft) f; \
\
	/* Query the context for the packm kernel corresponding to the current
	   panel dimension, or kernel id. If the id is invalid, the function will
	   return NULL. */ \
	f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \
\
	/* If there exists a kernel implementation for the micro-panel dimension
	   provided, we invoke the implementation. Otherwise, we use scal2m. */ \
	if ( f != NULL ) \
	{ \
		/* Under normal circumstances, the packm kernel will copy over a
		   panel_dim x panel_len submatrix of A into P. However, the kernel
		   now handles zero-filling at edge cases, which typically consist of
		   the outer (panel_dim_max - panel_dim) rows or columns of the
		   micropanel. (Note that these rows/columns correspond to values
		   beyond the edge of matrix A.) The kernel intrinsically knows its
		   own panel_dim_max, since that corresponds to the packm micropanel's
		   normal width (corresponding to the gemm microkernel's register
		   blocksize (mr or nr)). However, we *do* need to pass in
		   panel_len_max because the bottom-right edge case of trsm_lu will
		   need all elements above the extended diagonal and beyond (to the
		   right of) the bottom-right element to be initialized to zero so the
		   trsm portion of the computational kernel will operate with zeros for
		   those iterations.

		   For example, if trsm_lu is executed on a 10x10 triangular matrix,
		   and the gemmtrsm kernel uses MR = 6, the computation will begin
		   with the edge case, which is the bottom-right 4x4 upper triangular
		   matrix. Code in bli_packm_tri_cxk() will extend the diagonal as
		   identity into the remaining portion of the micropanel. But before
		   that happens, the packm kernel must have set the 0's added in
		   step (3) below.

		     packm kernel     packm kernel     packm kernel     packm_tri_cxk
		     step 1:          step 2:          step 3:          step 4:

		     x x x x . .      x x x x . .      x x x x 0 0      x x x x 0 0
		     ? x x x . .      ? x x x . .      ? x x x 0 0      ? x x x 0 0
		     ? ? x x . .  ->  ? ? x x . .  ->  ? ? x x 0 0  ->  ? ? x x 0 0
		     ? ? ? x . .      ? ? ? x . .      ? ? ? x 0 0      ? ? ? x 0 0
		     . . . . . .      0 0 0 0 0 0      0 0 0 0 0 0      0 0 0 0 1 0
		     . . . . . .      0 0 0 0 0 0      0 0 0 0 0 0      0 0 0 0 0 1

		     x  Copied from A; valid element.
		     ?  Copied from A, but value is unknown and unused.
		     .  Uninitialized.
		     0  Initialized to zero.
		     1  Initialized to one.

		   NOTE: In step 5 (not shown), bli_packm_tri_cxk() sets the ?'s
		   to zero. This is not needed to support trsm, but rather to
		   support trmm. (Both use the same packing format and code.)

		   In this case, panel_dim will be 4 because four rows of data are
		   copied from A, panel_len will be 4 because those four rows span
		   four columns of A, and panel_len_max will be 6 because there are a
		   total of 6 columns that can be written to in the packed micropanel,
		   2 of which lie beyond the values copied from A. */ \
		f \
		( \
		  conja, \
		  schema, \
		  panel_dim, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  a, inca, lda, \
		  p,       ldp, \
		  cntx  \
		); \
	} \
	else \
	{ \
		/* Treat the micro-panel as panel_dim x panel_len and column-stored
		   (unit row stride). */ \
		PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
		( \
		  0, \
		  BLIS_NONUNIT_DIAG, \
		  BLIS_DENSE, \
		  ( trans_t )conja, \
		  panel_dim, \
		  panel_len, \
		  kappa, \
		  a, inca, lda, \
		  p, 1,    ldp, \
		  cntx, \
		  /* The rntm_t* can safely be NULL as long as it's not used by
		     scal2m_ex(). */ \
		  NULL \
		); \
\
		/* If panel_dim < panel_dim_max, then we zero those unused rows. */ \
		if ( panel_dim < panel_dim_max ) \
		{ \
			const dim_t     i      = panel_dim; \
			const dim_t     m_edge = panel_dim_max - panel_dim; \
			const dim_t     n_edge = panel_len_max; \
			ctype* restrict p_edge = p + (i  )*1; \
\
			PASTEMAC(ch,set0s_mxn) \
			( \
			  m_edge, \
			  n_edge, \
			  p_edge, 1, ldp  \
			); \
		} \
\
		/* If panel_len < panel_len_max, then we zero those unused columns. */ \
		if ( panel_len < panel_len_max ) \
		{ \
			const dim_t     j      = panel_len; \
			const dim_t     m_edge = panel_dim_max; \
			const dim_t     n_edge = panel_len_max - panel_len; \
			ctype* restrict p_edge = p + (j  )*ldp; \
\
			PASTEMAC(ch,set0s_mxn) \
			( \
			  m_edge, \
			  n_edge, \
			  p_edge, 1, ldp  \
			); \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( packm_cxk )

cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_cxk.h000066400000000000000000000041071427272030600233010ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t panel_dim, \ dim_t panel_dim_max, \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_cxk ) cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_cxk_1er.c000066400000000000000000000103241427272030600240410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t panel_dim, \ dim_t panel_dim_max, \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ ) \ { \ /* Note that we use panel_dim_max, not panel_dim, to query the packm kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ num_t dt = PASTEMAC(ch,type); \ l1mkr_t ker_id = panel_dim_max; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the packm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. */ \ f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. */ \ if ( f != NULL ) \ { \ f \ ( \ conja, \ schema, \ panel_dim, \ panel_len, \ panel_len_max, \ kappa, \ a, inca, lda, \ p, ldp, \ cntx \ ); \ } \ else \ { \ /* Treat the micro-panel as panel_dim x panel_len and column-stored (unit row stride). 
*/ \ \ PASTEMAC(ch,scal21ms_mxn) \ ( \ schema, \ conja, \ panel_dim, \ panel_len, \ kappa, \ a, inca, lda, \ p, 1, ldp, ldp \ ); \ \ /* If panel_dim < panel_dim_max, then we zero those unused rows. */ \ if ( panel_dim < panel_dim_max ) \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = panel_dim; \ const dim_t offn = 0; \ const dim_t m_edge = panel_dim_max - panel_dim; \ const dim_t n_edge = panel_len_max; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ \ /* If panel_len < panel_len_max, then we zero those unused columns. */ \ if ( panel_len < panel_len_max ) \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = 0; \ const dim_t offn = panel_len; \ const dim_t m_edge = panel_dim_max; \ const dim_t n_edge = panel_len_max - panel_len; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ } \ } INSERT_GENTFUNCCO_BASIC0( packm_cxk_1er ) cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_cxk_1er.h000066400000000000000000000041371427272030600240530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t panel_dim, \ dim_t panel_dim_max, \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ ); INSERT_GENTPROTCO_BASIC0( packm_cxk_1er ) cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_init.c000066400000000000000000000170341427272030600234550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" bool bli_packm_init ( obj_t* c, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { bli_init_once(); // The purpose of packm_init() is to initialize an object P so that // a source object A can be packed into P via one of the packm // implementations. This initialization precedes the acquisition of a // suitable block of memory from the memory allocator (if such a block // of memory has not already been allocated previously). // Check parameters. if ( bli_error_checking_is_enabled() ) bli_packm_init_check( c, p, cntx ); // We begin by copying the fields of A. bli_obj_alias_to( c, p ); // If the object is marked as being filled with zeros, then we can skip // the packm operation entirely and alias. if ( bli_obj_is_zeros( c ) ) return false; // Extract various fields from the control tree. 
bszid_t bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); bszid_t bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); pack_t schema = bli_cntl_packm_params_pack_schema( cntl ); num_t dt_tar = bli_obj_target_dt( c ); num_t dt_scalar = bli_obj_scalar_dt( c ); dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); // Typecast the internal scalar value to the target datatype. // Note that if the typecasting is needed, this must happen BEFORE we // change the datatype of P to reflect the target_dt. if ( dt_scalar != dt_tar ) { bli_obj_scalar_cast_to( dt_tar, p ); } // Update the storage datatype of P to be the target datatype of A. bli_obj_set_dt( dt_tar, p ); bli_obj_set_elem_size( bli_dt_size( dt_tar ), p ); // Store the pack schema to the object. bli_obj_set_pack_schema( schema, p ); // Clear the conjugation field from the object since matrix packing // in BLIS is deemed to take care of all conjugation necessary. bli_obj_set_conj( BLIS_NO_CONJUGATE, p ); // Since we are packing micropanels, mark P as dense. bli_obj_set_uplo( BLIS_DENSE, p ); // Reset the view offsets to (0,0). bli_obj_set_offs( 0, 0, p ); // Compute the dimensions padded by the dimension multiples. These // dimensions will be the dimensions of the packed matrices, including // zero-padding, and will be used by the macro- and micro-kernels. // We compute them by starting with the effective dimensions of A (now // in P) and aligning them to the dimension multiples (typically equal // to register blocksizes). This does waste a little bit of space for // level-2 operations, but that's okay with us. dim_t m_p = bli_obj_length( p ); dim_t n_p = bli_obj_width( p ); dim_t m_p_pad = bli_align_dim_to_mult( m_p, bmult_m_def ); dim_t n_p_pad = bli_align_dim_to_mult( n_p, bmult_n_def ); // Save the padded dimensions into the packed object. 
It is important // to save these dimensions since they represent the actual dimensions // of the zero-padded matrix. bli_obj_set_padded_dims( m_p_pad, n_p_pad, p ); // Now we prepare to compute strides, align them, and compute the // total number of bytes needed for the packed buffer. Then we use // that value to acquire an appropriate block of memory from the // memory allocator. // Extract the element size for the packed object. siz_t elem_size_p = bli_obj_elem_size( p ); // The panel dimension (for each datatype) should be equal to the // default (logical) blocksize multiple in the m dimension. dim_t m_panel = bmult_m_def; // The "column stride" of a row-micropanel packed object is interpreted // as the column stride WITHIN a micropanel. Thus, this is equal to the // packing (storage) blocksize multiple, which may be equal to the // default (logical) blocksize multiple). inc_t cs_p = bmult_m_pack; // The "row stride" of a row-micropanel packed object is interpreted // as the row stride WITHIN a micropanel. Thus, it is unit. inc_t rs_p = 1; // The "panel stride" of a micropanel packed object is interpreted as // the distance between the (0,0) element of panel k and the (0,0) // element of panel k+1. We use the padded width computed above to // allow for zero-padding (if necessary/desired) along the far end // of each micropanel (ie: the right edge of the matrix). Zero-padding // can also occur along the long edge of the last micropanel if the m // dimension of the matrix is not a whole multiple of MR. inc_t ps_p = cs_p * n_p_pad; // As a general rule, we don't want micropanel strides to be odd. There // are very few instances where this can happen, but we've seen it happen // more than zero times (such as for certain small problems), and so we // check for it here. if ( bli_is_odd( ps_p ) ) ps_p += 1; // Set the imaginary stride (in units of fundamental elements). 
// This is the number of real elements that must be traversed before // reaching the imaginary part of the packed micropanel. NOTE: the // imaginary stride is mostly vestigial and left over from the 3m // and 4m implementations. inc_t is_p = 1; // Store the strides and panel dimension in P. bli_obj_set_strides( rs_p, cs_p, p ); bli_obj_set_imag_stride( is_p, p ); bli_obj_set_panel_dim( m_panel, p ); bli_obj_set_panel_stride( ps_p, p ); bli_obj_set_panel_length( m_panel, p ); bli_obj_set_panel_width( n_p, p ); // Compute the size of the packed buffer. siz_t size_p = ps_p * ( m_p_pad / m_panel ) * elem_size_p; // If the requested size is zero, then we don't need to do any allocation. if ( size_p == 0 ) return false; // Update the buffer address in p to point to the buffer associated // with the mem_t entry acquired from the memory broker (now cached in // the control tree node). void* buffer = bli_packm_alloc( size_p, rntm, cntl, thread ); bli_obj_set_buffer( buffer, p ); return true; } cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_init.h000066400000000000000000000034721427272030600234630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ BLIS_EXPORT_BLIS bool bli_packm_init ( obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_int.c000066400000000000000000000044101427272030600232760ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_packm_int ( obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { bli_init_once(); // Extract the function pointer from the object. packm_var_oft f = bli_obj_pack_fn( a ); // Barrier so that we know threads are done with previous computation // with the same packing buffer before starting to pack. bli_thread_barrier( thread ); // Invoke the variant with kappa_use. f ( a, p, cntx, rntm, cntl, thread ); // Barrier so that packing is done before computation. bli_thread_barrier( thread ); } cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_int.h000066400000000000000000000034471427272030600233140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_packm_int ( obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_part.c000066400000000000000000000220411427272030600234520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // -- Matrix partitioning ------------------------------------------------------ void bli_packm_acquire_mpart_t2b( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ) { dim_t m, n; // For now, we only support acquiring the middle subpartition. if ( requested_part != BLIS_SUBPART1 ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } // Partitioning top-to-bottom through packed column panels (which are // row-stored) is not yet supported. if ( bli_obj_is_col_packed( obj ) ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } // Query the dimensions of the parent object. m = bli_obj_length( obj ); n = bli_obj_width( obj ); // Foolproofing: do not let b exceed what's left of the m dimension at // row offset i. 
if ( b > m - i ) b = m - i; // Begin by copying the info, elem size, buffer, row stride, and column // stride fields of the parent object. Note that this omits copying view // information because the new partition will have its own dimensions // and offsets. bli_obj_init_subpart_from( obj, sub_obj ); // Modify offsets and dimensions of requested partition. bli_obj_set_dims( b, n, sub_obj ); // Tweak the padded length of the subpartition to trick the underlying // implementation into only zero-padding for the narrow submatrix of // interest. Usually, the value we want is b (for non-edge cases), but // at the edges, we want the remainder of the mem_t region in the m // dimension. Edge cases are defined as occurring when i + b is exactly // equal to the inherited sub-object's length (which happens since the // determine_blocksize function would have returned a smaller value of // b for the edge iteration). In these cases, we arrive at the new // packed length by simply subtracting off i. { dim_t m_pack_max = bli_obj_padded_length( sub_obj ); dim_t m_pack_cur; if ( i + b == m ) m_pack_cur = m_pack_max - i; else m_pack_cur = b; bli_obj_set_padded_length( m_pack_cur, sub_obj ); } // Translate the desired offsets to a panel offset and adjust the // buffer pointer of the subpartition object. { char* buf_p = bli_obj_buffer( sub_obj ); siz_t elem_size = bli_obj_elem_size( sub_obj ); dim_t off_to_panel = bli_packm_offset_to_panel_for( i, sub_obj ); buf_p = buf_p + elem_size * off_to_panel; bli_obj_set_buffer( buf_p, sub_obj ); } } void bli_packm_acquire_mpart_l2r( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ) { dim_t m, n; // Check parameters. //if ( bli_error_checking_is_enabled() ) // bli_packm_acquire_mpart_l2r_check( requested_part, j, b, obj, sub_obj ); // For now, we only support acquiring the middle subpartition. 
if ( requested_part != BLIS_SUBPART1 ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } // Partitioning left-to-right through packed row panels (which are // column-stored) is not yet supported. if ( bli_obj_is_row_packed( obj ) ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } // Query the dimensions of the parent object. m = bli_obj_length( obj ); n = bli_obj_width( obj ); // Foolproofing: do not let b exceed what's left of the n dimension at // column offset j. if ( b > n - j ) b = n - j; // Begin by copying the info, elem size, buffer, row stride, and column // stride fields of the parent object. Note that this omits copying view // information because the new partition will have its own dimensions // and offsets. bli_obj_init_subpart_from( obj, sub_obj ); // Modify offsets and dimensions of requested partition. bli_obj_set_dims( m, b, sub_obj ); // Tweak the padded width of the subpartition to trick the underlying // implementation into only zero-padding for the narrow submatrix of // interest. Usually, the value we want is b (for non-edge cases), but // at the edges, we want the remainder of the mem_t region in the n // dimension. Edge cases are defined as occurring when j + b is exactly // equal to the inherited sub-object's width (which happens since the // determine_blocksize function would have returned a smaller value of // b for the edge iteration). In these cases, we arrive at the new // packed width by simply subtracting off j. { dim_t n_pack_max = bli_obj_padded_width( sub_obj ); dim_t n_pack_cur; if ( j + b == n ) n_pack_cur = n_pack_max - j; else n_pack_cur = b; bli_obj_set_padded_width( n_pack_cur, sub_obj ); } // Translate the desired offsets to a panel offset and adjust the // buffer pointer of the subpartition object. 
{ char* buf_p = bli_obj_buffer( sub_obj ); siz_t elem_size = bli_obj_elem_size( sub_obj ); dim_t off_to_panel = bli_packm_offset_to_panel_for( j, sub_obj ); buf_p = buf_p + elem_size * off_to_panel; bli_obj_set_buffer( buf_p, sub_obj ); } } void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p ) { dim_t panel_off; if ( bli_obj_pack_schema( p ) == BLIS_PACKED_ROWS ) { // For the "packed rows" schema, a single row is effectively one // row panel, and so we use the row offset as the panel offset. // Then we multiply this offset by the effective panel stride // (ie: the row stride) to arrive at the desired offset. panel_off = offmn * bli_obj_row_stride( p ); } else if ( bli_obj_pack_schema( p ) == BLIS_PACKED_COLUMNS ) { // For the "packed columns" schema, a single column is effectively one // column panel, and so we use the column offset as the panel offset. // Then we multiply this offset by the effective panel stride // (ie: the column stride) to arrive at the desired offset. panel_off = offmn * bli_obj_col_stride( p ); } else if ( bli_obj_pack_schema( p ) == BLIS_PACKED_ROW_PANELS ) { // For the "packed row panels" schema, the column stride is equal to // the panel dimension (length). So we can divide it into offmn // (interpreted as a row offset) to arrive at a panel offset. Then // we multiply this offset by the panel stride to arrive at the total // offset to the panel (in units of elements). panel_off = offmn / bli_obj_col_stride( p ); panel_off = panel_off * bli_obj_panel_stride( p ); // Sanity check. if ( offmn % bli_obj_col_stride( p ) > 0 ) bli_abort(); } else if ( bli_obj_pack_schema( p ) == BLIS_PACKED_COL_PANELS ) { // For the "packed column panels" schema, the row stride is equal to // the panel dimension (width). 
So we can divide it into offmn // (interpreted as a column offset) to arrive at a panel offset. Then // we multiply this offset by the panel stride to arrive at the total // offset to the panel (in units of elements). panel_off = offmn / bli_obj_row_stride( p ); panel_off = panel_off * bli_obj_panel_stride( p ); // Sanity check. if ( offmn % bli_obj_row_stride( p ) > 0 ) bli_abort(); } else { panel_off = 0; bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } return panel_off; } cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_part.h000066400000000000000000000050561427272030600234660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // -- Matrix partitioning ------------------------------------------------------ void bli_packm_acquire_mpart_t2b( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_packm_acquire_mpart_l2r( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p ); cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_scalar.c000066400000000000000000000062031427272030600237530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void* bli_packm_scalar( obj_t* kappa, obj_t* p ) { num_t dt_p = bli_obj_dt( p ); pack_t schema = bli_obj_pack_schema( p ); // The value for kappa we use will depends on whether the scalar // attached to A has a nonzero imaginary component. If it does, // then we will apply the scalar during packing to facilitate // implementing induced complex domain algorithms in terms of // real domain micro-kernels. (In the aforementioned situation, // applying a real scalar is easy, but applying a complex one is // harder, so we avoid the need altogether with the code below.) if ( bli_obj_scalar_has_nonzero_imag( p ) && !bli_is_nat_packed( schema ) ) { //printf( "applying non-zero imag kappa\n_p" ); // Detach the scalar. bli_obj_scalar_detach( p, kappa ); // Reset the attached scalar (to 1.0). bli_obj_scalar_reset( p ); return bli_obj_buffer_for_1x1( dt_p, kappa ); } // This branch is also for native execution, where we assume that // the micro-kernel will always apply the alpha scalar of the // higher-level operation. 
Thus, we use BLIS_ONE for kappa so // that the underlying packm implementation does not perform // any scaling during packing. else { // If the internal scalar of A has only a real component, then // we will apply it later (in the micro-kernel), and so we will // use BLIS_ONE to indicate no scaling during packing. return bli_obj_buffer_for_1x1( dt_p, &BLIS_ONE ); } } cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_scalar.h000066400000000000000000000033151427272030600237610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p ); cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_struc_cxk.c000066400000000000000000000334551427272030600245240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

/*
   packm_struc_cxk

   Function template, instantiated once per datatype by
   INSERT_GENTFUNC_BASIC, that packs a single micro-panel of the source
   matrix c into the packed buffer p. It dispatches on strucc:

     - general:              calls the pack kernel front-end (kername)
                             directly;
     - Hermitian/symmetric:  forwards every argument to packm_herm_cxk;
     - otherwise:            forwards every argument to packm_tri_cxk.

   In the general case diagc, uploc, invdiag, panel_dim_off, panel_len_off
   and is_p are not referenced; they exist so that all three paths share a
   single signature.

   NOTE: comments placed inside the macro body must be block comments
   followed by a line continuation, since the whole definition is one
   logical preprocessor line.
*/
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t         strucc, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool            invdiag, \
       dim_t           panel_dim, \
       dim_t           panel_len, \
       dim_t           panel_dim_max, \
       dim_t           panel_len_max, \
       dim_t           panel_dim_off, \
       dim_t           panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p,             inc_t ldp, \
                          inc_t is_p, \
       cntx_t*         cntx  \
     ) \
{ \
	/* Handle micro-panel packing based on the structure of the matrix
	   being packed. */ \
	if      ( bli_is_general( strucc ) ) \
	{ \
		/* For micro-panels of general matrices, we can call the pack
		   kernel front-end directly. */ \
		PASTEMAC(ch,kername) \
		( \
		  conjc, \
		  schema, \
		  panel_dim, \
		  panel_dim_max, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  c, incc, ldc, \
		  p,       ldp, \
		  cntx  \
		); \
	} \
	else if ( bli_is_herm_or_symm( strucc ) ) \
	{ \
		/* Call a helper function for micro-panels of Hermitian/symmetric
		   matrices. */ \
		PASTEMAC(ch,packm_herm_cxk) \
		( \
		  strucc, \
		  diagc, \
		  uploc, \
		  conjc, \
		  schema, \
		  invdiag, \
		  panel_dim, \
		  panel_len, \
		  panel_dim_max, \
		  panel_len_max, \
		  panel_dim_off, \
		  panel_len_off, \
		  kappa, \
		  c, incc, ldc, \
		  p,       ldp, \
		     is_p, \
		  cntx  \
		); \
	} \
	else /* ( bli_is_triangular( strucc ) ) */ \
	{ \
		/* Call a helper function for micro-panels of triangular
		   matrices. */ \
		PASTEMAC(ch,packm_tri_cxk) \
		( \
		  strucc, \
		  diagc, \
		  uploc, \
		  conjc, \
		  schema, \
		  invdiag, \
		  panel_dim, \
		  panel_len, \
		  panel_dim_max, \
		  panel_len_max, \
		  panel_dim_off, \
		  panel_len_off, \
		  kappa, \
		  c, incc, ldc, \
		  p,       ldp, \
		     is_p, \
		  cntx  \
		); \
	} \
}

INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk )


/*
   packm_herm_cxk

   Function template that packs one micro-panel of a Hermitian or symmetric
   matrix. The diagonal offset of the panel is computed as
   panel_dim_off - panel_len_off.

     - If the panel does not intersect the diagonal, it is packed with a
       single kernel call. When the panel lies in the unstored region, c is
       first redirected to where the data is actually stored (pointer
       shifted by the diagonal offset, incc and ldc swapped), and conjc is
       toggled if strucc is Hermitian.
     - If the panel intersects the diagonal, it is packed in three pieces:
       p10 (columns before the diagonal block) and p12 (the remaining
       columns) via the pack kernel, then the stored triangle of the
       diagonal block (p11) via copym. For Hermitian input the imaginary
       parts of the diagonal of p11 are zeroed, and finally the stored
       triangle of p11 is scaled by kappa.

   diagc, invdiag and is_p are not referenced in this body.
*/
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t         strucc, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool            invdiag, \
       dim_t           panel_dim, \
       dim_t           panel_len, \
       dim_t           panel_dim_max, \
       dim_t           panel_len_max, \
       dim_t           panel_dim_off, \
       dim_t           panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p,             inc_t ldp, \
                          inc_t is_p, \
       cntx_t*         cntx  \
     ) \
{ \
	doff_t diagoffc = panel_dim_off - panel_len_off; \
	doff_t diagoffc_abs; \
	dim_t  i, j; \
\
	/* Handle the case where the micro-panel does NOT intersect the
	   diagonal separately from the case where it does intersect. */ \
	if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \
	{ \
		/* If the current panel is unstored, we need to make a few
		   adjustments so we refer to the data where it is actually
		   stored, also taking conjugation into account. (Note this
		   implicitly assumes we are operating on a dense panel
		   within a larger symmetric or Hermitian matrix, since a
		   general matrix would not contain any unstored region.) */ \
		if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \
		{ \
			c = c + diagoffc * ( doff_t )ldc + \
			       -diagoffc * ( doff_t )incc;  \
			bli_swap_incs( &incc, &ldc ); \
\
			if ( bli_is_hermitian( strucc ) ) \
				bli_toggle_conj( &conjc ); \
		} \
\
		/* Pack the full panel. */ \
		PASTEMAC(ch,kername) \
		( \
		  conjc, \
		  schema, \
		  panel_dim, \
		  panel_dim_max, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  c, incc, ldc, \
		  p,       ldp, \
		  cntx  \
		); \
	} \
	else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \
	{ \
		ctype* restrict c10; \
		ctype* restrict p10; \
		dim_t           p10_dim, p10_len; \
		inc_t           incc10, ldc10; \
		doff_t          diagoffc10; \
		conj_t          conjc10; \
\
		ctype* restrict c12; \
		ctype* restrict p12; \
		dim_t           p12_dim, p12_len; \
		inc_t           incc12, ldc12; \
		doff_t          diagoffc12; \
		conj_t          conjc12; \
\
		/* Sanity check. Diagonals should not intersect the short end of
		   a micro-panel. If they do, then the constraint that cache
		   blocksizes be a whole multiple of the register blocksizes was
		   somehow violated. */ \
		if ( diagoffc < 0 ) \
			bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
		diagoffc_abs = bli_abs( diagoffc ); \
\
		if ( bli_is_lower( uploc ) ) \
		{ \
			p10_dim    = panel_dim; \
			p10_len    = diagoffc_abs; \
			p10        = p; \
			c10        = c; \
			incc10     = incc; \
			ldc10      = ldc; \
			conjc10    = conjc; \
\
			p12_dim    = panel_dim; \
			p12_len    = panel_len - p10_len; \
			j          = p10_len; \
			diagoffc12 = diagoffc_abs - j; \
			p12        = p + (j  )*ldp; \
			c12        = c + (j  )*ldc; \
			c12        = c12 + diagoffc12 * ( doff_t )ldc + \
			                  -diagoffc12 * ( doff_t )incc;  \
			incc12     = ldc; \
			ldc12      = incc; \
			conjc12    = conjc; \
\
			if ( bli_is_hermitian( strucc ) ) \
				bli_toggle_conj( &conjc12 ); \
		} \
		else /* if ( bli_is_upper( uploc ) ) */ \
		{ \
			p10_dim    = panel_dim; \
			p10_len    = diagoffc_abs + panel_dim; \
			diagoffc10 = diagoffc; \
			p10        = p; \
			c10        = c; \
			c10        = c10 + diagoffc10 * ( doff_t )ldc + \
			                  -diagoffc10 * ( doff_t )incc;  \
			incc10     = ldc; \
			ldc10      = incc; \
			conjc10    = conjc; \
\
			p12_dim    = panel_dim; \
			p12_len    = panel_len - p10_len; \
			j          = p10_len; \
			p12        = p + (j  )*ldp; \
			c12        = c + (j  )*ldc; \
			incc12     = incc; \
			ldc12      = ldc; \
			conjc12    = conjc; \
\
			if ( bli_is_hermitian( strucc ) ) \
				bli_toggle_conj( &conjc10 ); \
		} \
\
		/* Pack to p10. For upper storage, this includes the unstored
		   triangle of c11. */ \
		/* NOTE: Since we're only packing partial panels here, we pass in
		   p1x_len as panel_len_max; otherwise, the packm kernel will zero-
		   fill the columns up to panel_len_max, which is not what we need
		   or want to happen. */ \
		PASTEMAC(ch,kername) \
		( \
		  conjc10, \
		  schema, \
		  p10_dim, \
		  panel_dim_max, \
		  p10_len, \
		  p10_len, \
		  kappa, \
		  c10, incc10, ldc10, \
		  p10,         ldp, \
		  cntx  \
		); \
\
		/* Pack to p12. For lower storage, this includes the unstored
		   triangle of c11. */ \
		/* NOTE: Since we're only packing partial panels here, we pass in
		   p1x_len as panel_len_max; otherwise, the packm kernel will zero-
		   fill the columns up to panel_len_max, which is not what we need
		   or want to happen. */ \
		PASTEMAC(ch,kername) \
		( \
		  conjc12, \
		  schema, \
		  p12_dim, \
		  panel_dim_max, \
		  p12_len, \
		  p12_len, \
		  kappa, \
		  c12, incc12, ldc12, \
		  p12,         ldp, \
		  cntx  \
		); \
\
		/* Pack the stored triangle of c11 to p11. */ \
		{ \
			dim_t           p11_m  = panel_dim; \
			dim_t           p11_n  = panel_dim; \
			dim_t           j2     = diagoffc_abs; \
			ctype* restrict c11    = c + (j2 )*ldc; \
			ctype* restrict p11    = p + (j2 )*ldp; \
			trans_t         transc = ( trans_t )conjc; \
\
			PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
			( \
			  0, \
			  BLIS_NONUNIT_DIAG, \
			  uploc, \
			  transc, \
			  p11_m, \
			  p11_n, \
			  c11, incc, ldc, \
			  p11, 1,    ldp, \
			  cntx, \
			  NULL  \
			); \
\
			/* If source matrix c is Hermitian, we have to zero out the
			   imaginary components of the diagonal of p11 in case the
			   corresponding elements in c11 were not already zero. */ \
			if ( bli_is_hermitian( strucc ) ) \
			{ \
				ctype* restrict pi11 = p11; \
\
				for ( i = 0; i < p11_m; ++i ) \
				{ \
					PASTEMAC(ch,seti0s)( *pi11 ); \
\
					pi11 += 1 + ldp; \
				} \
			} \
\
			/* Now that the diagonal has been made explicitly Hermitian
			   (if applicable), we can now safely scale the stored
			   triangle specified by uploc. */ \
			PASTEMAC2(ch,scalm,BLIS_TAPI_EX_SUF) \
			( \
			  BLIS_NO_CONJUGATE, \
			  0, \
			  BLIS_NONUNIT_DIAG, \
			  uploc, \
			  p11_m, \
			  p11_n, \
			  kappa, \
			  p11, 1, ldp, \
			  cntx, \
			  NULL  \
			); \
		} \
	} \
}

INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk )


/*
   packm_tri_cxk

   Function template that packs one micro-panel of a triangular matrix.
   The panel is first packed in full by the pack kernel and then adjusted
   in place, in this order:

     1. if diagc is unit, the diagonal of p is set to kappa;
     2. if invdiag is TRUE, the diagonal of p is inverted;
     3. the region on the opposite side of the diagonal from uploc is set
        to zero;
     4. if the panel is an edge case in both dimension and length, the
        diagonal of the bottom-right padding region is set to one.

   Step 3 modifies the local diagoffc, so it must come after steps 1 and 2,
   which use the unshifted offset. strucc and is_p are not referenced in
   this body.
*/
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t         strucc, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool            invdiag, \
       dim_t           panel_dim, \
       dim_t           panel_len, \
       dim_t           panel_dim_max, \
       dim_t           panel_len_max, \
       dim_t           panel_dim_off, \
       dim_t           panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p,             inc_t ldp, \
                          inc_t is_p, \
       cntx_t*         cntx  \
     ) \
{ \
	doff_t diagoffc = panel_dim_off - panel_len_off; \
\
	/* Pack the panel. */ \
	PASTEMAC(ch,kername) \
	( \
	  conjc, \
	  schema, \
	  panel_dim, \
	  panel_dim_max, \
	  panel_len, \
	  panel_len_max, \
	  kappa, \
	  c, incc, ldc, \
	  p,       ldp, \
	  cntx  \
	); \
\
\
	/* If the diagonal of c is implicitly unit, explicitly set the
	   diagonal of the packed panel to kappa. */ \
	if ( bli_is_unit_diag( diagc ) ) \
	{ \
		PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  diagoffc, \
		  panel_dim, \
		  panel_len, \
		  kappa, \
		  p, 1, ldp, \
		  cntx, \
		  NULL  \
		); \
	} \
\
	/* If requested, invert the diagonal of the packed panel. */ \
	if ( invdiag == TRUE ) \
	{ \
		PASTEMAC2(ch,invertd,BLIS_TAPI_EX_SUF) \
		( \
		  diagoffc, \
		  panel_dim, \
		  panel_len, \
		  p, 1, ldp, \
		  cntx, \
		  NULL  \
		); \
	} \
\
	/* Set the region opposite the diagonal of p to zero. To do this,
	   we need to reference the "unstored" region on the other side of
	   the diagonal. This amounts to toggling uploc and then shifting
	   the diagonal offset to shrink the newly referenced region (by
	   one diagonal). Note that this zero-filling is not needed for
	   trsm, since the unstored region is not referenced by the trsm
	   micro-kernel; however, zero-filling is needed for trmm, which
	   uses the gemm micro-kernel. */ \
	{ \
		ctype* restrict zero  = PASTEMAC(ch,0); \
		uplo_t          uplop = uploc; \
\
		bli_toggle_uplo( &uplop ); \
		bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \
\
		PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  diagoffc, \
		  BLIS_NONUNIT_DIAG, \
		  uplop, \
		  panel_dim, \
		  panel_len, \
		  zero, \
		  p, 1, ldp, \
		  cntx, \
		  NULL  \
		); \
	} \
\
	/* If this panel is an edge case in both panel dimension and length,
	   then it must be a bottom-right corner case. Set the part of the
	   diagonal that extends into the zero-padded region to identity.
	   NOTE: This is actually only necessary when packing for trsm, as
	   it helps prevent NaNs and Infs from creeping into the computation.
	   However, we set the region to identity for trmm as well. Those
	   1.0's end up getting multiplied by the 0.0's in the zero-padded
	   region of the other matrix, so there is no harm in this. */ \
	if ( panel_dim != panel_dim_max && \
	     panel_len != panel_len_max ) \
	{ \
		ctype* restrict one  = PASTEMAC(ch,1); \
		dim_t           i    = panel_dim; \
		dim_t           j    = panel_len; \
		dim_t           m_br = panel_dim_max - i; \
		dim_t           n_br = panel_len_max - j; \
		ctype*          p_br = p + (i  ) + (j  )*ldp; \
\
		PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  0, \
		  m_br, \
		  n_br, \
		  one, \
		  p_br, 1, ldp, \
		  cntx, \
		  NULL  \
		); \
	} \
}

INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk )

cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_struc_cxk.h000066400000000000000000000047501427272030600245250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_struc_cxk ) INSERT_GENTPROT_BASIC0( packm_herm_cxk ) INSERT_GENTPROT_BASIC0( packm_tri_cxk ) cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_struc_cxk_1er.c000066400000000000000000000354321427272030600252700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

/*
   packm_struc_cxk_1er

   Function template, instantiated by INSERT_GENTFUNCCO_BASIC, that packs a
   single micro-panel of the source matrix c into the packed buffer p using
   the packm_cxk_1er kernel front-end. It dispatches on strucc:

     - general:              calls the pack kernel front-end (kername)
                             directly;
     - Hermitian/symmetric:  forwards every argument to packm_herm_cxk_1er;
     - otherwise:            forwards every argument to packm_tri_cxk_1er.

   params is forwarded to the two helpers but is not passed to the pack
   kernel. In the general case diagc, uploc, invdiag, panel_dim_off,
   panel_len_off, is_p and params are not referenced.

   NOTE: comments placed inside the macro body must be block comments
   followed by a line continuation, since the whole definition is one
   logical preprocessor line.
*/
#undef  GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t         strucc, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool            invdiag, \
       dim_t           panel_dim, \
       dim_t           panel_len, \
       dim_t           panel_dim_max, \
       dim_t           panel_len_max, \
       dim_t           panel_dim_off, \
       dim_t           panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p,             inc_t ldp, \
                          inc_t is_p, \
       cntx_t*         cntx, \
       void*           params \
     ) \
{ \
	/* Handle micro-panel packing based on the structure of the matrix
	   being packed. */ \
	if      ( bli_is_general( strucc ) ) \
	{ \
		/* For micro-panels of general matrices, we can call the pack
		   kernel front-end directly. */ \
		PASTEMAC(ch,kername) \
		( \
		  conjc, \
		  schema, \
		  panel_dim, \
		  panel_dim_max, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  c, incc, ldc, \
		  p,       ldp, \
		  cntx  \
		); \
	} \
	else if ( bli_is_herm_or_symm( strucc ) ) \
	{ \
		/* Call a helper function for micro-panels of Hermitian/symmetric
		   matrices. */ \
		PASTEMAC(ch,packm_herm_cxk_1er) \
		( \
		  strucc, \
		  diagc, \
		  uploc, \
		  conjc, \
		  schema, \
		  invdiag, \
		  panel_dim, \
		  panel_len, \
		  panel_dim_max, \
		  panel_len_max, \
		  panel_dim_off, \
		  panel_len_off, \
		  kappa, \
		  c, incc, ldc, \
		  p,       ldp, \
		     is_p, \
		  cntx, \
		  params \
		); \
	} \
	else /* ( bli_is_triangular( strucc ) ) */ \
	{ \
		/* Call a helper function for micro-panels of triangular
		   matrices. */ \
		PASTEMAC(ch,packm_tri_cxk_1er) \
		( \
		  strucc, \
		  diagc, \
		  uploc, \
		  conjc, \
		  schema, \
		  invdiag, \
		  panel_dim, \
		  panel_len, \
		  panel_dim_max, \
		  panel_len_max, \
		  panel_dim_off, \
		  panel_len_off, \
		  kappa, \
		  c, incc, ldc, \
		  p,       ldp, \
		     is_p, \
		  cntx, \
		  params \
		); \
	} \
}

INSERT_GENTFUNCCO_BASIC( packm_struc_cxk_1er, packm_cxk_1er )


/*
   packm_herm_cxk_1er

   Function template that packs one micro-panel of a Hermitian or symmetric
   matrix with the packm_cxk_1er kernel. The diagonal offset of the panel
   is computed as panel_dim_off - panel_len_off.

     - If the panel does not intersect the diagonal, it is packed with a
       single kernel call. When the panel lies in the unstored region, c is
       first redirected to where the data is actually stored (pointer
       shifted by the diagonal offset, incc and ldc swapped), and conjc is
       toggled if strucc is Hermitian.
     - If the panel intersects the diagonal, it is packed in three pieces:
       p10 and p12 via the pack kernel, then the stored triangle of the
       diagonal block (p11) via scal21ms_mxn_uplo. For Hermitian input the
       diagonal of p11 is then rewritten via scal21ms_mxn_diag, reading c11
       through a real-typed pointer with doubled strides.

   diagc, invdiag, is_p and params are not referenced in this body.
*/
#undef  GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t         strucc, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool            invdiag, \
       dim_t           panel_dim, \
       dim_t           panel_len, \
       dim_t           panel_dim_max, \
       dim_t           panel_len_max, \
       dim_t           panel_dim_off, \
       dim_t           panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p,             inc_t ldp, \
                          inc_t is_p, \
       cntx_t*         cntx, \
       void*           params \
     ) \
{ \
	doff_t diagoffc = panel_dim_off - panel_len_off; \
	doff_t diagoffc_abs; \
	dim_t  j; \
\
	/* Handle the case where the micro-panel does NOT intersect the
	   diagonal separately from the case where it does intersect. */ \
	if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \
	{ \
		/* If the current panel is unstored, we need to make a few
		   adjustments so we refer to the data where it is actually
		   stored, also taking conjugation into account. (Note this
		   implicitly assumes we are operating on a dense panel
		   within a larger symmetric or Hermitian matrix, since a
		   general matrix would not contain any unstored region.) */ \
		if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \
		{ \
			c = c + diagoffc * ( doff_t )ldc + \
			       -diagoffc * ( doff_t )incc;  \
			bli_swap_incs( &incc, &ldc ); \
\
			if ( bli_is_hermitian( strucc ) ) \
				bli_toggle_conj( &conjc ); \
		} \
\
		/* Pack the full panel. */ \
		PASTEMAC(ch,kername) \
		( \
		  conjc, \
		  schema, \
		  panel_dim, \
		  panel_dim_max, \
		  panel_len, \
		  panel_len_max, \
		  kappa, \
		  c, incc, ldc, \
		  p,       ldp, \
		  cntx  \
		); \
	} \
	else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \
	{ \
		ctype* restrict c10; \
		ctype* restrict p10; \
		dim_t           p10_dim, p10_len; \
		inc_t           incc10, ldc10; \
		doff_t          diagoffc10; \
		conj_t          conjc10; \
\
		ctype* restrict c12; \
		ctype* restrict p12; \
		dim_t           p12_dim, p12_len; \
		inc_t           incc12, ldc12; \
		doff_t          diagoffc12; \
		conj_t          conjc12; \
\
\
		/* Sanity check. Diagonals should not intersect the short end of
		   a micro-panel. If they do, then the constraint that cache
		   blocksizes be a whole multiple of the register blocksizes was
		   somehow violated. */ \
		if ( diagoffc < 0 ) \
			bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
		diagoffc_abs = bli_abs( diagoffc ); \
\
		if ( bli_is_lower( uploc ) ) \
		{ \
			p10_dim    = panel_dim; \
			p10_len    = diagoffc_abs; \
			p10        = p; \
			c10        = c; \
			incc10     = incc; \
			ldc10      = ldc; \
			conjc10    = conjc; \
\
			p12_dim    = panel_dim; \
			p12_len    = panel_len - p10_len; \
			j          = p10_len; \
			diagoffc12 = diagoffc_abs - j; \
			p12        = p + (j  )*ldp; \
			c12        = c + (j  )*ldc; \
			c12        = c12 + diagoffc12 * ( doff_t )ldc + \
			                  -diagoffc12 * ( doff_t )incc;  \
			incc12     = ldc; \
			ldc12      = incc; \
			conjc12    = conjc; \
\
			if ( bli_is_hermitian( strucc ) ) \
				bli_toggle_conj( &conjc12 ); \
		} \
		else /* if ( bli_is_upper( uploc ) ) */ \
		{ \
			p10_dim    = panel_dim; \
			p10_len    = diagoffc_abs + panel_dim; \
			diagoffc10 = diagoffc; \
			p10        = p; \
			c10        = c; \
			c10        = c10 + diagoffc10 * ( doff_t )ldc + \
			                  -diagoffc10 * ( doff_t )incc;  \
			incc10     = ldc; \
			ldc10      = incc; \
			conjc10    = conjc; \
\
			p12_dim    = panel_dim; \
			p12_len    = panel_len - p10_len; \
			j          = p10_len; \
			p12        = p + (j  )*ldp; \
			c12        = c + (j  )*ldc; \
			incc12     = incc; \
			ldc12      = ldc; \
			conjc12    = conjc; \
\
			if ( bli_is_hermitian( strucc ) ) \
				bli_toggle_conj( &conjc10 ); \
		} \
\
		/* Pack to p10. For upper storage, this includes the unstored
		   triangle of c11. */ \
		/* NOTE: Since we're only packing partial panels here, we pass in
		   p1x_len as panel_len_max; otherwise, the packm kernel will zero-
		   fill the columns up to panel_len_max, which is not what we need
		   or want to happen. */ \
		PASTEMAC(ch,kername) \
		( \
		  conjc10, \
		  schema, \
		  p10_dim, \
		  panel_dim_max, \
		  p10_len, \
		  p10_len, \
		  kappa, \
		  c10, incc10, ldc10, \
		  p10,         ldp, \
		  cntx  \
		); \
\
		/* Pack to p12. For lower storage, this includes the unstored
		   triangle of c11. */ \
		/* NOTE: Since we're only packing partial panels here, we pass in
		   p1x_len as panel_len_max; otherwise, the packm kernel will zero-
		   fill the columns up to panel_len_max, which is not what we need
		   or want to happen. */ \
		PASTEMAC(ch,kername) \
		( \
		  conjc12, \
		  schema, \
		  p12_dim, \
		  panel_dim_max, \
		  p12_len, \
		  p12_len, \
		  kappa, \
		  c12, incc12, ldc12, \
		  p12,         ldp, \
		  cntx  \
		); \
\
		/* Pack the stored triangle of c11 to p11. */ \
		/* NOTE(review): the j declared in this inner block shadows the
		   function-level j used above. */ \
		{ \
			dim_t           j   = diagoffc_abs; \
			ctype* restrict c11 = c + (j  )*ldc; \
			ctype* restrict p11 = p + (j  )*ldp; \
\
			PASTEMAC(ch,scal21ms_mxn_uplo) \
			( \
			  schema, \
			  uploc, \
			  conjc, \
			  panel_dim, \
			  kappa, \
			  c11, incc, ldc, \
			  p11, 1,    ldp, ldp  \
			); \
\
			/* If we are packing a micro-panel with Hermitian structure,
			   we must take special care of the diagonal. Now, if kappa
			   were guaranteed to be unit, all we would need to do is
			   explicitly zero out the imaginary part of the diagonal of
			   p11, in case the diagonal of the source matrix contained
			   garbage (non-zero) imaginary values. HOWEVER, since kappa
			   can be non-unit, things become a little more complicated.
			   In general, we must re-apply the kappa scalar to ONLY the
			   real part of the diagonal of the source matrix and save
			   the result to the diagonal of p11. */ \
			/* NOTE(review): incc2 and ldc2 below hold strides but are
			   typed dim_t rather than inc_t - presumably harmless;
			   confirm the two types are interchangeable. */ \
			if ( bli_is_hermitian( strucc ) ) \
			{ \
				ctype_r* restrict c11_r = ( ctype_r* )c11; \
				const dim_t       incc2 = 2*incc; \
				const dim_t       ldc2  = 2*ldc; \
\
				PASTEMAC3(ch,chr,ch,scal21ms_mxn_diag) \
				( \
				  schema, \
				  panel_dim, \
				  panel_dim, \
				  kappa, \
				  c11_r, incc2, ldc2, \
				  p11,   1,     ldp, ldp  \
				); \
			} \
		} \
	} \
}

INSERT_GENTFUNCCO_BASIC( packm_herm_cxk_1er, packm_cxk_1er )


/*
   packm_tri_cxk_1er

   Function template that packs one micro-panel of a triangular matrix with
   the packm_cxk_1er kernel. The panel is first packed in full and then
   adjusted in place. The adjustments to the diagonal block operate on p11,
   which is p advanced by |panel_dim_off - panel_len_off| columns:

     1. if diagc is unit, the diagonal of p11 is set to kappa;
     2. if invdiag is TRUE, the diagonal of p11 is inverted;
     3. the triangle of p11 opposite uploc is set to zero;
     4. if the panel is an edge case in both dimension and length, the
        diagonal of the bottom-right padding region of p is set to one.

   strucc, is_p and params are not referenced in this body.
*/
#undef  GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t         strucc, \
       diag_t          diagc, \
       uplo_t          uploc, \
       conj_t          conjc, \
       pack_t          schema, \
       bool            invdiag, \
       dim_t           panel_dim, \
       dim_t           panel_len, \
       dim_t           panel_dim_max, \
       dim_t           panel_len_max, \
       dim_t           panel_dim_off, \
       dim_t           panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p,             inc_t ldp, \
                          inc_t is_p, \
       cntx_t*         cntx, \
       void*           params \
     ) \
{ \
	doff_t diagoffc     = panel_dim_off - panel_len_off; \
	doff_t diagoffc_abs = bli_abs( diagoffc ); \
	ctype* p11          = p + (diagoffc_abs  )*ldp; \
\
\
	/* Pack the panel. */ \
	PASTEMAC(ch,kername) \
	( \
	  conjc, \
	  schema, \
	  panel_dim, \
	  panel_dim_max, \
	  panel_len, \
	  panel_len_max, \
	  kappa, \
	  c, incc, ldc, \
	  p,       ldp, \
	  cntx  \
	); \
\
\
	/* Tweak the panel according to its triangular structure */ \
	{ \
		/* If the diagonal of c is implicitly unit, explicitly set the
		   diagonal of the packed panel to kappa. */ \
		if ( bli_is_unit_diag( diagc ) ) \
		{ \
			PASTEMAC(ch,set1ms_mxn_diag) \
			( \
			  schema, \
			  0, \
			  0, \
			  panel_dim, \
			  panel_dim, \
			  kappa, \
			  p11, 1, ldp, ldp  \
			); \
		} \
\
\
		/* If requested, invert the diagonal of the packed panel. */ \
		if ( invdiag == TRUE ) \
		{ \
			PASTEMAC(ch,invert1ms_mxn_diag) \
			( \
			  schema, \
			  0, \
			  0, \
			  panel_dim, \
			  panel_dim, \
			  p11, 1, ldp, ldp  \
			); \
		} \
\
\
		/* Set the region opposite the diagonal of p to zero. To do this,
		   we need to reference the "unstored" region on the other side of
		   the diagonal. This amounts to toggling uploc and then shifting
		   the diagonal offset to shrink the newly referenced region (by
		   one diagonal). Note that this zero-filling is not needed for
		   trsm, since the unstored region is not referenced by the trsm
		   micro-kernel; however, zero-filling is needed for trmm, which
		   uses the gemm micro-kernel. */ \
		{ \
			ctype* restrict zero         = PASTEMAC(ch,0); \
			uplo_t          uplop        = uploc; \
			doff_t          diagoffc11_0 = 0; \
			dim_t           p11_0_dim    = panel_dim - 1; \
\
			bli_toggle_uplo( &uplop ); \
			bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc11_0 ); \
\
			/* Note that this macro works a little differently than the setm
			   operation. Here, we pass in the dimensions of only p11, rather
			   than the whole micro-panel, and furthermore we pass in the
			   "shrunken" dimensions of p11, corresponding to the toggling
			   and shrinking of the diagonal above. The macro will do the
			   right thing, incrementing the pointer to p11 by the appropriate
			   leading dimension (ldp or rs_p), and setting only the lower
			   or upper triangle to zero. */ \
			PASTEMAC(ch,set1ms_mxn_uplo) \
			( \
			  schema, \
			  diagoffc11_0, \
			  uplop, \
			  p11_0_dim, \
			  p11_0_dim, \
			  zero, \
			  p11, 1, ldp, ldp  \
			); \
		} \
	} \
\
	/* If this micro-panel is an edge case in both panel dimension and
	   length, then it must be a bottom-right corner case, which
	   typically only happens for micro-panels being packed for trsm.
	   (It also happens for trmm if kr > 1.) Here, we set the part of
	   the diagonal that extends into the zero-padded region to
	   identity. This prevents NaNs and Infs from creeping into the
	   computation. If this code does execute for trmm, it is okay,
	   because those 1.0's that extend into the bottom-right region
	   end up getting multiplied by the 0.0's in the zero-padded region
	   of the other matrix. */ \
	if ( panel_dim != panel_dim_max && \
	     panel_len != panel_len_max ) \
	{ \
		ctype* restrict one    = PASTEMAC(ch,1); \
		dim_t           offm   = panel_dim; \
		dim_t           offn   = panel_len; \
		dim_t           m_edge = panel_dim_max - panel_dim; \
		dim_t           n_edge = panel_len_max - panel_len; \
\
		PASTEMAC(ch,set1ms_mxn_diag) \
		( \
		  schema, \
		  offm, \
		  offn, \
		  m_edge, \
		  n_edge, \
		  one, \
		  p, 1, ldp, ldp  \
		); \
	} \
}

INSERT_GENTFUNCCO_BASIC( packm_tri_cxk_1er, packm_cxk_1er )

cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_struc_cxk_1er.h000066400000000000000000000050541427272030600252720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
   GENTPROTCO template: declares the common prototype of the three packing
   routines instantiated below (packm_struc_cxk_1er, packm_herm_cxk_1er and
   packm_tri_cxk_1er). PASTEMAC(ch,varname) builds the function name from the
   type character ch and the routine name; the template parameters ctype_r
   and chr are accepted but not used by the prototype itself.

   All three routines return void and share one parameter list: structure,
   diagonal, uplo and conjugation descriptors, the pack schema, an invdiag
   flag, the panel dimension/length with their maxima and offsets, the scalar
   kappa, the (c, incc, ldc) and (p, ldp, is_p) buffer descriptions, a context
   and an opaque params pointer.
   NOTE(review): the meaning of each argument is defined by the matching
   implementations, which are not part of this header -- confirm there.
*/
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

/* One instantiation per routine name; the INSERT_ macro is defined elsewhere. */
INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er )
cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_struc_cxk_md.c000066400000000000000000000322431427272030600251760ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

#ifdef BLIS_ENABLE_GEMM_MD

/*
   packm_struc_cxk_md: pack one micro-panel of c into p when the datatype of
   c (chc) may differ from the datatype of p (chp). The work done depends on
   schema:

   - bli_is_nat_packed(): kappa is expected to equal 1.0; if it does not,
     BLIS_NOT_YET_IMPLEMENTED is passed to bli_check_error_code(). The
     panel_dim x panel_len region is then copied with castm, with conjc cast
     to trans_t.
   - bli_is_1r_packed() / bli_is_1e_packed(): the copy, including any scaling
     by kappa, is delegated to packm_cxk_1r_md / packm_cxk_1e_md.
   - any other schema: BLIS_NOT_YET_IMPLEMENTED is passed to
     bli_check_error_code().

   In each handled case the rows [panel_dim, panel_dim_max) and the columns
   [panel_len, panel_len_max) of p are afterwards set to zero.

   The arguments strucc, diagc, uploc, invdiag, panel_dim_off, panel_len_off,
   is_p and params are not referenced by this implementation.
*/
#undef GENTFUNC2
#define GENTFUNC2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype_p* restrict kappa, \
       ctype_c* restrict c, inc_t incc, inc_t ldc, \
       ctype_p* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     ) \
{ \
    if ( bli_is_nat_packed( schema ) ) \
    { \
        /* Sanity check: Make sure that kappa is 1.0. Mixed-datatype alpha
           values are never handled when packing for native execution;
           instead, they are passed along to the micro-kernel. */ \
        if ( !PASTEMAC(chp,eq1)( *kappa ) ) \
            bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
\
        /* Treat the micro-panel as panel_dim x panel_len and column-stored
           (unit row stride). */ \
\
        /* NOTE: We ignore kappa for now, since it should be 1.0. */ \
        PASTEMAC2(chc,chp,castm) \
        ( \
          ( trans_t )conjc, \
          panel_dim, \
          panel_len, \
          c, incc, ldc, \
          p, 1, ldp \
        ); \
\
        /* If panel_dim < panel_dim_max, then we zero those unused rows. The
           edge region starts panel_dim rows into p (unit row stride). */ \
        if ( panel_dim < panel_dim_max ) \
        { \
            ctype_p* restrict zero = PASTEMAC(chp,0); \
            const dim_t i = panel_dim; \
            const dim_t m_edge = panel_dim_max - i; \
            const dim_t n_edge = panel_len_max; \
            ctype_p* p_edge = p + (i )*1; \
\
            PASTEMAC2(chp,setm,BLIS_TAPI_EX_SUF) \
            ( \
              BLIS_NO_CONJUGATE, \
              0, \
              BLIS_NONUNIT_DIAG, \
              BLIS_DENSE, \
              m_edge, \
              n_edge, \
              zero, \
              p_edge, 1, ldp, \
              cntx, \
              NULL \
            ); \
        } \
\
        /* If panel_len < panel_len_max, then we zero those unused columns.
           The edge region starts panel_len columns (of stride ldp) into p. */ \
        if ( panel_len < panel_len_max ) \
        { \
            ctype_p* restrict zero = PASTEMAC(chp,0); \
            const dim_t j = panel_len; \
            const dim_t m_edge = panel_dim_max; \
            const dim_t n_edge = panel_len_max - j; \
            ctype_p* p_edge = p + (j )*ldp; \
\
            PASTEMAC2(chp,setm,BLIS_TAPI_EX_SUF) \
            ( \
              BLIS_NO_CONJUGATE, \
              0, \
              BLIS_NONUNIT_DIAG, \
              BLIS_DENSE, \
              m_edge, \
              n_edge, \
              zero, \
              p_edge, 1, ldp, \
              cntx, \
              NULL \
            ); \
        } \
    } \
    else if ( bli_is_1r_packed( schema ) ) \
    { \
        /* Treat the micro-panel as panel_dim x panel_len and column-stored
           (unit row stride). */ \
\
        PASTEMAC2(chc,chp,packm_cxk_1r_md) \
        ( \
          conjc, \
          panel_dim, \
          panel_len, \
          kappa, \
          c, incc, ldc, \
          p, ldp \
        ); \
\
        /* If panel_dim < panel_dim_max, then we zero those unused rows. */ \
        if ( panel_dim < panel_dim_max ) \
        { \
            ctype_p* restrict zero = PASTEMAC(chp,0); \
            const dim_t offm = panel_dim; \
            const dim_t offn = 0; \
            const dim_t m_edge = panel_dim_max - panel_dim; \
            const dim_t n_edge = panel_len_max; \
\
            /* The void casts mark the locals as used (presumably for type
               instantiations in which set1ms_mxn does not reference them --
               TODO confirm). */ \
            ( void ) zero; \
            ( void ) m_edge; ( void )offm; \
            ( void ) n_edge; ( void )offn; \
\
            PASTEMAC(chp,set1ms_mxn) \
            ( \
              schema, \
              offm, \
              offn, \
              m_edge, \
              n_edge, \
              zero, \
              p, 1, ldp, ldp \
            ); \
        } \
\
        /* If panel_len < panel_len_max, then we zero those unused columns. */ \
        if ( panel_len < panel_len_max ) \
        { \
            ctype_p* restrict zero = PASTEMAC(chp,0); \
            const dim_t offm = 0; \
            const dim_t offn = panel_len; \
            const dim_t m_edge = panel_dim_max; \
            const dim_t n_edge = panel_len_max - panel_len; \
\
            ( void ) zero; \
            ( void ) m_edge; ( void )offm; \
            ( void ) n_edge; ( void )offn; \
\
            PASTEMAC(chp,set1ms_mxn) \
            ( \
              schema, \
              offm, \
              offn, \
              m_edge, \
              n_edge, \
              zero, \
              p, 1, ldp, ldp \
            ); \
        } \
    } \
    else if ( bli_is_1e_packed( schema ) ) \
    { \
        /* Treat the micro-panel as panel_dim x panel_len and column-stored
           (unit row stride). */ \
\
        PASTEMAC2(chc,chp,packm_cxk_1e_md) \
        ( \
          conjc, \
          panel_dim, \
          panel_len, \
          kappa, \
          c, incc, ldc, \
          p, ldp \
        ); \
\
        /* If panel_dim < panel_dim_max, then we zero those unused rows. */ \
        if ( panel_dim < panel_dim_max ) \
        { \
            ctype_p* restrict zero = PASTEMAC(chp,0); \
            const dim_t offm = panel_dim; \
            const dim_t offn = 0; \
            const dim_t m_edge = panel_dim_max - panel_dim; \
            const dim_t n_edge = panel_len_max; \
\
            ( void ) zero; \
            ( void ) m_edge; ( void )offm; \
            ( void ) n_edge; ( void )offn; \
\
            PASTEMAC(chp,set1ms_mxn) \
            ( \
              schema, \
              offm, \
              offn, \
              m_edge, \
              n_edge, \
              zero, \
              p, 1, ldp, ldp \
            ); \
        } \
\
        /* If panel_len < panel_len_max, then we zero those unused columns. */ \
        if ( panel_len < panel_len_max ) \
        { \
            ctype_p* restrict zero = PASTEMAC(chp,0); \
            const dim_t offm = 0; \
            const dim_t offn = panel_len; \
            const dim_t m_edge = panel_dim_max; \
            const dim_t n_edge = panel_len_max - panel_len; \
\
            ( void ) zero; \
            ( void ) m_edge; ( void )offm; \
            ( void ) n_edge; ( void )offn; \
\
            PASTEMAC(chp,set1ms_mxn) \
            ( \
              schema, \
              offm, \
              offn, \
              m_edge, \
              n_edge, \
              zero, \
              p, 1, ldp, ldp \
            ); \
        } \
    } \
    else \
    { \
        /* Mixed-datatype packing should not occur for any other schemas. */ \
        bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
    } \
\
\
/* Disabled debugging output, kept for reference.
    if ( bli_is_col_packed( schema ) ) \
    PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: bp copied", m_panel_max, n_panel_max, \
                          p, rs_p, cs_p, "%4.1f", "" ); \
    else if ( bli_is_row_packed( schema ) ) \
    PASTEMAC(ch,fprintm)( stdout, "packm_struc_cxk: ap copied", m_panel_max, n_panel_max, \
                          p, rs_p, cs_p, "%4.1f", "" ); \
*/ \
}

INSERT_GENTFUNC2_BASIC0( packm_struc_cxk_md )
INSERT_GENTFUNC2_MIXDP0( packm_struc_cxk_md )

// -----------------------------------------------------------------------------

/*
   packm_cxk_1r_md: copy (kappa == 1.0) or scale by kappa an m x n panel of a
   into p, where the datatypes of a (cha) and p (chp) may differ.

   Both buffers are addressed through pointers to their real projection type
   (PASTEMAC(..,ctyper)), which is why inca, lda and ldp are doubled. For each
   of the n columns, the element-wise macro receives a[i]'s real and imaginary
   parts (alpha1_r, alpha1_i) and two destinations: pi1_r[i] and pi1_i[i],
   where pi1_i starts ldp real-typed elements after pi1_r. After each column
   the source pointers advance by 2*lda and the destination pointers by 2*ldp.

   conja selects the conjugating (copyjris / scal2jris) or non-conjugating
   (copyris / scal2ris) element-wise macro.
*/
#undef GENTFUNC2
#define GENTFUNC2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t conja, \
       dim_t m, \
       dim_t n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p, inc_t ldp \
     ) \
{ \
    /* Strides expressed in units of the real projection type. */ \
    const inc_t inca2 = 2 * inca; \
    const inc_t lda2 = 2 * lda; \
    const inc_t ldp2 = 2 * ldp; \
\
    PASTEMAC(chp,ctyper)* restrict kappa_r = ( PASTEMAC(chp,ctyper)* )kappa; \
    PASTEMAC(chp,ctyper)* restrict kappa_i = ( PASTEMAC(chp,ctyper)* )kappa + 1; \
    PASTEMAC(cha,ctyper)* restrict alpha1_r = ( PASTEMAC(cha,ctyper)* )a; \
    PASTEMAC(cha,ctyper)* restrict alpha1_i = ( PASTEMAC(cha,ctyper)* )a + 1; \
    PASTEMAC(chp,ctyper)* restrict pi1_r = ( PASTEMAC(chp,ctyper)* )p; \
    PASTEMAC(chp,ctyper)* restrict pi1_i = ( PASTEMAC(chp,ctyper)* )p + ldp; \
\
    /* Marks kappa_i as used (presumably to silence unused-variable warnings
       in some instantiations -- TODO confirm). */ \
    ( void )kappa_i; \
\
    if ( PASTEMAC(chp,eq1)( *kappa ) ) \
    { \
        /* kappa == 1.0: plain (optionally conjugating) copy. */ \
        if ( bli_is_conj( conja ) ) \
        { \
            for ( dim_t k = n; k != 0; --k ) \
            { \
                for ( dim_t i = 0; i < m; ++i ) \
                { \
                    PASTEMAC2(cha,chp,copyjris) \
                    ( \
                      *(alpha1_r + i*inca2), \
                      *(alpha1_i + i*inca2), \
                      *(pi1_r + i* 1), \
                      *(pi1_i + i* 1) \
                    ); \
                } \
\
                alpha1_r += lda2; \
                alpha1_i += lda2; \
                pi1_r += ldp2; \
                pi1_i += ldp2; \
            } \
        } \
        else \
        { \
            for ( dim_t k = n; k != 0; --k ) \
            { \
                for ( dim_t i = 0; i < m; ++i ) \
                { \
                    PASTEMAC2(cha,chp,copyris) \
                    ( \
                      *(alpha1_r + i*inca2), \
                      *(alpha1_i + i*inca2), \
                      *(pi1_r + i* 1), \
                      *(pi1_i + i* 1) \
                    ); \
                } \
\
                alpha1_r += lda2; \
                alpha1_i += lda2; \
                pi1_r += ldp2; \
                pi1_i += ldp2; \
            } \
        } \
    } \
    else \
    { \
        /* General kappa: scale while copying. */ \
        if ( bli_is_conj( conja ) ) \
        { \
            for ( dim_t k = n; k != 0; --k ) \
            { \
                for ( dim_t i = 0; i < m; ++i ) \
                { \
                    PASTEMAC3(chp,cha,chp,scal2jris) \
                    ( \
                      *kappa_r, \
                      *kappa_i, \
                      *(alpha1_r + i*inca2), \
                      *(alpha1_i + i*inca2), \
                      *(pi1_r + i* 1), \
                      *(pi1_i + i* 1) \
                    ); \
                } \
\
                alpha1_r += lda2; \
                alpha1_i += lda2; \
                pi1_r += ldp2; \
                pi1_i += ldp2; \
            } \
        } \
        else \
        { \
            for ( dim_t k = n; k != 0; --k ) \
            { \
                for ( dim_t i = 0; i < m; ++i ) \
                { \
                    PASTEMAC3(chp,cha,chp,scal2ris) \
                    ( \
                      *kappa_r, \
                      *kappa_i, \
                      *(alpha1_r + i*inca2), \
                      *(alpha1_i + i*inca2), \
                      *(pi1_r + i* 1), \
                      *(pi1_i + i* 1) \
                    ); \
                } \
\
                alpha1_r += lda2; \
                alpha1_i += lda2; \
                pi1_r += ldp2; \
                pi1_i += ldp2; \
            } \
        } \
    } \
}

INSERT_GENTFUNC2_BASIC0( packm_cxk_1r_md )
INSERT_GENTFUNC2_MIXDP0( packm_cxk_1r_md )

// -----------------------------------------------------------------------------

/*
   packm_cxk_1e_md: copy (kappa == 1.0) or scale by kappa an m x n panel of a
   into p, where the datatypes of a (cha) and p (chp) may differ.

   Unlike the 1r variant, a and p are addressed with their own element types
   and unscaled strides. Each source element a[i] is handed to the
   element-wise macro together with two destinations: pi1_ri[i] and
   pi1_ir[i], where pi1_ir starts ldp/2 elements after pi1_ri. After each of
   the n columns the source advances by lda and both destinations by ldp.

   conja selects the conjugating (copyj1es / scal2j1es) or non-conjugating
   (copy1es / scal21es) element-wise macro.
*/
#undef GENTFUNC2
#define GENTFUNC2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t conja, \
       dim_t m, \
       dim_t n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p, inc_t ldp \
     ) \
{ \
    const inc_t inca1 = inca; \
    const inc_t lda1 = lda; \
    const inc_t ldp1 = ldp; \
\
    ctype_a* restrict alpha1_ri = ( ctype_a* )a; \
    ctype_p* restrict pi1_ri = ( ctype_p* )p; \
    ctype_p* restrict pi1_ir = ( ctype_p* )p + ldp1/2; \
\
    /* Marks inca1 as used (presumably to silence unused-variable warnings in
       some instantiations -- TODO confirm). */ \
    ( void )inca1; \
\
    if ( PASTEMAC(chp,eq1)( *kappa ) ) \
    { \
        /* kappa == 1.0: plain (optionally conjugating) copy. */ \
        if ( bli_is_conj( conja ) ) \
        { \
            for ( dim_t k = n; k != 0; --k ) \
            { \
                for ( dim_t i = 0; i < m; ++i ) \
                { \
                    PASTEMAC2(cha,chp,copyj1es) \
                    ( \
                      *(alpha1_ri + i*inca1), \
                      *(pi1_ri + i* 1), \
                      *(pi1_ir + i* 1) \
                    ); \
                } \
\
                alpha1_ri += lda1; \
                pi1_ri += ldp1; \
                pi1_ir += ldp1; \
            } \
        } \
        else \
        { \
            for ( dim_t k = n; k != 0; --k ) \
            { \
                for ( dim_t i = 0; i < m; ++i ) \
                { \
                    PASTEMAC2(cha,chp,copy1es) \
                    ( \
                      *(alpha1_ri + i*inca1), \
                      *(pi1_ri + i* 1), \
                      *(pi1_ir + i* 1) \
                    ); \
                } \
\
                alpha1_ri += lda1; \
                pi1_ri += ldp1; \
                pi1_ir += ldp1; \
            } \
        } \
    } \
    else \
    { \
        /* General kappa: scale while copying. */ \
        if ( bli_is_conj( conja ) ) \
        { \
            for ( dim_t k = n; k != 0; --k ) \
            { \
                for ( dim_t i = 0; i < m; ++i ) \
                { \
                    PASTEMAC3(chp,cha,chp,scal2j1es) \
                    ( \
                      *kappa, \
                      *(alpha1_ri + i*inca1), \
                      *(pi1_ri + i* 1), \
                      *(pi1_ir + i* 1) \
                    ); \
                } \
\
                alpha1_ri += lda1; \
                pi1_ri += ldp1; \
                pi1_ir += ldp1; \
            } \
        } \
        else \
        { \
            for ( dim_t k = n; k != 0; --k ) \
            { \
                for ( dim_t i = 0; i < m; ++i ) \
                { \
                    PASTEMAC3(chp,cha,chp,scal21es) \
                    ( \
                      *kappa, \
                      *(alpha1_ri + i*inca1), \
                      *(pi1_ri + i* 1), \
                      *(pi1_ir + i* 1) \
                    ); \
                } \
\
                alpha1_ri += lda1; \
                pi1_ri += ldp1; \
                pi1_ir += ldp1; \
            } \
        } \
    } \
}

INSERT_GENTFUNC2_BASIC0( packm_cxk_1e_md )
INSERT_GENTFUNC2_MIXDP0( packm_cxk_1e_md )

#endif
cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_struc_cxk_md.h000066400000000000000000000061001427272030600251740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
   Prototype template for packm_struc_cxk_md. The function name is built by
   PASTEMAC2 from the two type characters chc and chp; kappa and p use
   ctype_p while c uses ctype_c.
   NOTE(review): unlike the definitions, these prototypes are not wrapped in
   an #ifdef within this header -- confirm whether that is intentional.
*/
#undef GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype_p* restrict kappa, \
       ctype_c* restrict c, inc_t incc, inc_t ldc, \
       ctype_p* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )

/*
   Prototype template shared by packm_cxk_1e_md and packm_cxk_1r_md: an
   m x n panel described by (a, inca, lda) and a destination (p, ldp), with
   kappa typed like p.
*/
#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t conja, \
       dim_t m, \
       dim_t n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p, inc_t ldp \
     );

INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )

INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )
cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_thrinfo.c000066400000000000000000000043461427272030600241650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

/*
   Initialize a packm thrinfo_t node by forwarding to bli_thrinfo_init().

   thread, ocomm, ocomm_id, n_way, work_id and sub_node are passed through
   unchanged. The two remaining arguments of bli_thrinfo_init() are fixed to
   FALSE and BLIS_NO_PART.
   NOTE(review): the bszid parameter is accepted but not forwarded; the
   callee always receives BLIS_NO_PART. Confirm this is intentional before
   relying on bszid.
*/
void bli_packm_thrinfo_init
     (
       thrinfo_t* thread,
       thrcomm_t* ocomm,
       dim_t ocomm_id,
       dim_t n_way,
       dim_t work_id,
       bszid_t bszid,
       thrinfo_t* sub_node
     )
{
    bli_thrinfo_init
    (
      thread,
      ocomm, ocomm_id,
      n_way, work_id,
      FALSE,
      BLIS_NO_PART,
      sub_node
    );
}

/*
   Initialize a packm thrinfo_t node for single-threaded use: the
   communicator is &BLIS_SINGLE_COMM with id 0, n_way is 1, work_id is 0,
   bszid is BLIS_NO_PART and there is no sub-node (NULL).
*/
void bli_packm_thrinfo_init_single
     (
       thrinfo_t* thread
     )
{
    bli_packm_thrinfo_init
    (
      thread,
      &BLIS_SINGLE_COMM, 0,
      1,
      0,
      BLIS_NO_PART,
      NULL
    );
}
cython-blis-0.9.1/blis/_src/frame/1m/packm/bli_packm_thrinfo.h000066400000000000000000000057621427272030600241750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

//
// thrinfo_t macros specific to packm.
//

/*
#define bli_packm_thread_my_iter( index, thread ) \
\
    ( index % thread->n_way == thread->work_id % thread->n_way )
*/

// Round-robin ownership test: evaluates to nonzero when iteration i and
// work_id are congruent modulo n_way. The start and end arguments are
// accepted for signature compatibility with the slab variant but are not
// used.
// Every argument is parenthesized in the expansion so that expression
// arguments (e.g. "a + b") are not re-associated with the % and ==
// operators.
#define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \
\
    ( (i) % (n_way) == (work_id) % (n_way) )

// Slab ownership test: evaluates to nonzero when start <= i < end. The
// work_id and n_way arguments are accepted for signature compatibility with
// the round-robin variant but are not used.
// Every argument is parenthesized in the expansion for the same reason as
// above.
#define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \
\
    ( (start) <= (i) && (i) < (end) )

// Define a general-purpose version of bli_packm_my_iter() whose definition
// depends on whether slab or round-robin partitioning was requested at
// configure-time.
#ifdef BLIS_ENABLE_JRIR_SLAB
  #define bli_packm_my_iter bli_packm_my_iter_sl
#else // BLIS_ENABLE_JRIR_RR
  #define bli_packm_my_iter bli_packm_my_iter_rr
#endif

//
// thrinfo_t APIs specific to packm.
// #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif cython-blis-0.9.1/blis/_src/frame/1m/unpackm/000077500000000000000000000000001427272030600207035ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/1m/unpackm/bli_unpackm.h000066400000000000000000000034441427272030600233450ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_unpackm_cntl.h" #include "bli_unpackm_check.h" #include "bli_unpackm_int.h" #include "bli_unpackm_blk_var1.h" #include "bli_unpackm_cxk.h" cython-blis-0.9.1/blis/_src/frame/1m/unpackm/bli_unpackm_blk_var1.c000066400000000000000000000206031427272030600251150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

// Signature of the type-specific unpackm_blk_var1 implementations generated
// by the GENTFUNC template below.
#define FUNCPTR_T unpackm_fp

typedef void (*FUNCPTR_T)(
                           struc_t strucc,
                           doff_t diagoffc,
                           diag_t diagc,
                           uplo_t uploc,
                           trans_t transc,
                           dim_t m,
                           dim_t n,
                           dim_t m_panel,
                           dim_t n_panel,
                           void* p, inc_t rs_p, inc_t cs_p,
                           dim_t pd_p, inc_t ps_p,
                           void* c, inc_t rs_c, inc_t cs_c,
                           cntx_t* cntx
                         );

// Table of implementations, indexed by datatype (see ftypes[dt_cp] below).
static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1);


// Object-based front end: unpacks the packed object p into c.
// Structure, diagonal offset, diag, uplo, trans status, dimensions and panel
// dimensions are queried from c; buffer, strides, panel dimension and panel
// stride are queried from p. The typed implementation is selected by the
// datatype of c. The cntl and thread arguments are not referenced.
void bli_unpackm_blk_var1
     (
       obj_t* p,
       obj_t* c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
    num_t dt_cp = bli_obj_dt( c );

    // Normally we take the parameters from the source argument. But here,
    // the packm/unpackm framework is not yet solidified enough for us to
    // assume that at this point struc(P) == struc(C), (ie: since
    // densification may have marked P's structure as dense when the root
    // is upper or lower). So, we take the struc field from C, not P.
    struc_t strucc = bli_obj_struc( c );
    doff_t diagoffc = bli_obj_diag_offset( c );
    diag_t diagc = bli_obj_diag( c );
    uplo_t uploc = bli_obj_uplo( c );

    // Again, normally the trans argument is on the source matrix. But we
    // know that the packed matrix is not transposed. If there is to be a
    // transposition, it is because C was originally transposed when packed.
    // Thus, we query C for the trans status, not P. Also, we only query
    // the trans status (not the conjugation status), since we probably
    // don't want to un-conjugate if the original matrix was conjugated
    // when packed.
    trans_t transc = bli_obj_onlytrans_status( c );

    dim_t m_c = bli_obj_length( c );
    dim_t n_c = bli_obj_width( c );
    dim_t m_panel = bli_obj_panel_length( c );
    dim_t n_panel = bli_obj_panel_width( c );

    void* buf_p = bli_obj_buffer_at_off( p );
    inc_t rs_p = bli_obj_row_stride( p );
    inc_t cs_p = bli_obj_col_stride( p );
    dim_t pd_p = bli_obj_panel_dim( p );
    inc_t ps_p = bli_obj_panel_stride( p );

    void* buf_c = bli_obj_buffer_at_off( c );
    inc_t rs_c = bli_obj_row_stride( c );
    inc_t cs_c = bli_obj_col_stride( c );

    FUNCPTR_T f;

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_cp];

    // Invoke the function.
    f( strucc,
       diagoffc,
       diagc,
       uploc,
       transc,
       m_c,
       n_c,
       m_panel,
       n_panel,
       buf_p, rs_p, cs_p,
              pd_p, ps_p,
       buf_c, rs_c, cs_c,
       cntx );
}


/*
   Typed implementation template. Walks the packed buffer p one panel at a
   time (panel ip starts at p + ip*ps_p) and writes each panel to the
   corresponding slice of c (c + ic*vs_c), where ic advances by
   panel_dim_max per iteration and the last panel may be narrower
   (panel_dim_i = min( panel_dim_max, iter_dim - ic )).

   A panel that intersects the diagonal while uploc is upper or lower is
   written with scal2m (alpha = 1); every other panel is written with
   unpackm_cxk. The strucc argument is not referenced.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       doff_t diagoffc, \
       diag_t diagc, \
       uplo_t uploc, \
       trans_t transc, \
       dim_t m, \
       dim_t n, \
       dim_t m_panel, \
       dim_t n_panel, \
       void* p, inc_t rs_p, inc_t cs_p, \
       dim_t pd_p, inc_t ps_p, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     ) \
{ \
    ctype* restrict one = PASTEMAC(ch,1); \
    ctype* restrict c_cast = c; \
    ctype* restrict p_cast = p; \
    ctype* restrict c_begin; \
    ctype* restrict p_begin; \
\
    dim_t iter_dim; \
    dim_t num_iter; \
    dim_t it, ic, ip; \
    dim_t ic0, ip0; \
    doff_t ic_inc, ip_inc; \
    doff_t diagoffc_i; \
    doff_t diagoffc_inc; \
    dim_t panel_len; \
    dim_t panel_dim_i; \
    dim_t panel_dim_max; \
    inc_t vs_c; \
    inc_t incc, ldc; \
    inc_t ldp; \
    dim_t* m_panel_full; \
    dim_t* n_panel_full; \
\
\
    /* If c needs a transposition, induce it so that we can more simply
       express the remaining parameters and code. */ \
    if ( bli_does_trans( transc ) ) \
    { \
        bli_swap_incs( &rs_c, &cs_c ); \
        bli_negate_diag_offset( &diagoffc ); \
        bli_toggle_uplo( &uploc ); \
        bli_toggle_trans( &transc ); \
    } \
\
    /* If the strides of p indicate row storage, then p is treated as column
       panels; otherwise, if the strides indicate column storage, p is
       treated as row panels. m_panel_full / n_panel_full point at whichever
       of m, n and panel_dim_i gives the current panel's dimensions, so they
       track panel_dim_i as it is updated inside the loop. */ \
    if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \
    { \
        /* Prepare to unpack from column panels. */ \
        iter_dim = n; \
        panel_len = m; \
        panel_dim_max = pd_p; \
        incc = cs_c; \
        ldc = rs_c; \
        vs_c = cs_c; \
        diagoffc_inc = -( doff_t)panel_dim_max; \
        ldp = rs_p; \
        m_panel_full = &m; \
        n_panel_full = &panel_dim_i; \
    } \
    else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \
    { \
        /* Prepare to unpack from row panels. */ \
        iter_dim = m; \
        panel_len = n; \
        panel_dim_max = pd_p; \
        incc = rs_c; \
        ldc = cs_c; \
        vs_c = rs_c; \
        diagoffc_inc = ( doff_t )panel_dim_max; \
        ldp = cs_p; \
        m_panel_full = &panel_dim_i; \
        n_panel_full = &n; \
    } \
\
    /* Compute the total number of iterations we'll need (iter_dim divided
       by panel_dim_max, rounded up). */ \
    num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
    { \
        ic0 = 0; \
        ic_inc = panel_dim_max; \
        ip0 = 0; \
        ip_inc = 1; \
    } \
\
    for ( ic = ic0, ip = ip0, it = 0; it < num_iter; \
          ic += ic_inc, ip += ip_inc, it += 1 ) \
    { \
        panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
        diagoffc_i = diagoffc + (ip )*diagoffc_inc; \
\
        p_begin = p_cast + ip * ps_p; \
        c_begin = c_cast + ic * vs_c; \
\
        /* If the current panel of C intersects the diagonal AND is upper or
           lower stored, then we must call scal2m. Otherwise, we can use a
           variant that is oblivious to structure and storage (and thus tends
           to be faster). */ \
        if ( bli_intersects_diag_n( diagoffc_i, *m_panel_full, *n_panel_full ) && \
             bli_is_upper_or_lower( uploc ) ) \
        { \
            PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
            ( \
              diagoffc_i, \
              diagc, \
              uploc, \
              transc, \
              *m_panel_full, \
              *n_panel_full, \
              one, \
              p_begin, rs_p, cs_p, \
              c_begin, rs_c, cs_c, \
              cntx, \
              NULL \
            ); \
        } \
        else \
        { \
            /* Unpack the current panel. */ \
            PASTEMAC(ch,unpackm_cxk) \
            ( \
              BLIS_NO_CONJUGATE, \
              panel_dim_i, \
              panel_len, \
              one, \
              p_begin, ldp, \
              c_begin, incc, ldc, \
              cntx \
            ); \
        } \
\
        /*PASTEMAC(ch,fprintm)( stdout, "p copied", *m_panel_full, *n_panel_full, \
                                p_begin, rs_p, cs_p, "%4.1f", "" );*/ \
    } \
\
}

INSERT_GENTFUNC_BASIC0( unpackm_blk_var1 )
cython-blis-0.9.1/blis/_src/frame/1m/unpackm/bli_unpackm_blk_var1.h000066400000000000000000000044451427272030600251280ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

// Object-based unpackm variant 1: takes the packed object p, the
// destination object c, a context, a control-tree node and a thrinfo_t node.
void bli_unpackm_blk_var1
     (
       obj_t* p,
       obj_t* c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

/*
   Prototype template for the type-specific unpackm_blk_var1 functions.
   PASTEMAC(ch,varname) builds the function name from the type character ch.
   The p and c buffers are passed as void* together with their row/column
   strides; p additionally carries a panel dimension (pd_p) and panel stride
   (ps_p).
*/
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       doff_t diagoffc, \
       diag_t diagc, \
       uplo_t uploc, \
       trans_t transc, \
       dim_t m, \
       dim_t n, \
       dim_t m_panel, \
       dim_t n_panel, \
       void* p, inc_t rs_p, inc_t cs_p, \
       dim_t pd_p, inc_t ps_p, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )
cython-blis-0.9.1/blis/_src/frame/1m/unpackm/bli_unpackm_check.c000066400000000000000000000046321427272030600244750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_unpackm_int_check ( obj_t* p, obj_t* a, cntx_t* cntx ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( p ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( a ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_conformal_dims( p, a ); bli_check_error_code( e_val ); // Check pack status. e_val = bli_check_packm_schema_on_unpack( p ); bli_check_error_code( e_val ); // Check control tree pointer // NOTE: We can't check the control tree until we stop interpreting a // NULL value (in bli_unpackm_int()) as a request to skip the operation. //e_val = bli_check_valid_cntl( ( void* )cntl ); //bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/1m/unpackm/bli_unpackm_check.h000066400000000000000000000033541427272030600245020ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_unpackm_int_check ( obj_t* p, obj_t* a, cntx_t* cntx ); cython-blis-0.9.1/blis/_src/frame/1m/unpackm/bli_unpackm_cntl.c000066400000000000000000000053221427272030600243550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" cntl_t* bli_unpackm_cntl_create_node ( rntm_t* rntm, void_fp var_func, void_fp unpackm_var_func, cntl_t* sub_node ) { cntl_t* cntl; unpackm_params_t* params; err_t r_val; // NOTE: If this function is ever called, figure out whether the // bli_malloc_intl() below needs to be changed to bli_sba_acquire(). bli_abort(); // Allocate an unpackm_params_t struct. params = bli_malloc_intl( sizeof( unpackm_params_t ), &r_val ); // Initialize the unpackm_params_t struct. 
params->size = sizeof( unpackm_params_t ); params->var_func = unpackm_var_func; // It's important that we set the bszid field to BLIS_NO_PART to indicate // that no blocksize partitioning is performed. bli_cntl_free() will rely // on this information to know how to step through the thrinfo_t tree in // sync with the cntl_t tree. cntl = bli_cntl_create_node ( rntm, BLIS_NOID, BLIS_NO_PART, var_func, params, sub_node ); return cntl; } cython-blis-0.9.1/blis/_src/frame/1m/unpackm/bli_unpackm_cntl.h000066400000000000000000000043251427272030600243640ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ struct unpackm_params_s { uint64_t size; // size field must be present and come first. unpackm_var_oft var_func; }; typedef struct unpackm_params_s unpackm_params_t; #define bli_cntl_unpackm_params_var_func( cntl ) \ \ ( ( (unpackm_params_t*)(cntl)->params )->var_func ) // ----------------------------------------------------------------------------- cntl_t* bli_unpackm_cntl_create_node ( rntm_t* rntm, void_fp var_func, void_fp unpackm_var_func, cntl_t* sub_node ); cython-blis-0.9.1/blis/_src/frame/1m/unpackm/bli_unpackm_cxk.c000066400000000000000000000060071427272030600242030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjp, \ dim_t panel_dim, \ dim_t panel_len, \ ctype* kappa, \ ctype* p, inc_t ldp, \ ctype* a, inc_t inca, inc_t lda, \ cntx_t* cntx \ ) \ { \ num_t dt = PASTEMAC(ch,type); \ l1mkr_t ker_id = panel_dim; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the unpackm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. */ \ f = bli_cntx_get_unpackm_ker_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. */ \ if ( f != NULL ) \ { \ f \ ( \ conjp, \ panel_len, \ kappa, \ p, ldp, \ a, inca, lda, \ cntx \ ); \ } \ else \ { \ trans_t transp = ( trans_t )conjp; \ \ /* Treat the micro-panel as panel_dim x panel_len and column-stored (unit row stride). 
*/ \ PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ ( \ 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ transp, \ panel_dim, \ panel_len, \ kappa, \ p, 1, ldp, \ a, inca, lda, \ cntx, \ NULL \ ); \ } \ } INSERT_GENTFUNC_BASIC0( unpackm_cxk ) cython-blis-0.9.1/blis/_src/frame/1m/unpackm/bli_unpackm_cxk.h000066400000000000000000000037601427272030600242130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjp, \ dim_t panel_dim, \ dim_t panel_len, \ ctype* kappa, \ ctype* p, inc_t ldp, \ ctype* a, inc_t inca, inc_t lda, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( unpackm_cxk ) cython-blis-0.9.1/blis/_src/frame/1m/unpackm/bli_unpackm_int.c000066400000000000000000000047771427272030600242240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_unpackm_int ( obj_t* p, obj_t* a, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ) { bli_init_once(); unpackm_var_oft f; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_unpackm_int_check( p, a, cntx ); // If p was aliased to a during the pack stage (because it was already // in an acceptable packed/contiguous format), then no unpack is actually // necessary, so we return. if ( bli_obj_is_alias_of( p, a ) ) return; // Extract the function pointer from the current control tree node. f = bli_cntl_unpackm_params_var_func( cntl ); // Invoke the variant. if ( bli_thread_am_ochief( thread ) ) { f ( p, a, cntx, cntl, thread ); } // Barrier so that unpacking is done before computation. bli_thread_barrier( thread ); } cython-blis-0.9.1/blis/_src/frame/1m/unpackm/bli_unpackm_int.h000066400000000000000000000034251427272030600242160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_unpackm_int ( obj_t* p, obj_t* a, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ); cython-blis-0.9.1/blis/_src/frame/2/000077500000000000000000000000001427272030600170715ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/bli_l2.h000066400000000000000000000047761427272030600204230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_l2_check.h" // Define function types. #include "bli_l2_ft_unb.h" // Prototype object APIs (expert and non-expert). #include "bli_oapi_ex.h" #include "bli_l2_oapi.h" #include "bli_xapi_undef.h" #include "bli_oapi_ba.h" #include "bli_l2_oapi.h" #include "bli_xapi_undef.h" // Prototype typed APIs (expert and non-expert). #include "bli_tapi_ex.h" #include "bli_l2_tapi.h" #include "bli_l2_ft.h" #include "bli_xapi_undef.h" #include "bli_tapi_ba.h" #include "bli_l2_tapi.h" #include "bli_l2_ft.h" #include "bli_xapi_undef.h" // Generate function pointer arrays for tapi functions (expert only). #include "bli_l2_fpa.h" // Operation-specific headers #include "bli_gemv.h" #include "bli_ger.h" #include "bli_hemv.h" #include "bli_her.h" #include "bli_her2.h" #include "bli_symv.h" #include "bli_syr.h" #include "bli_syr2.h" #include "bli_trmv.h" #include "bli_trsv.h" cython-blis-0.9.1/blis/_src/frame/2/bli_l2_check.c000066400000000000000000000244151427272030600215430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_gemv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ) { err_t e_val; // Perform checks common to gemv/hemv/symv/trmv/trsv. bli_xxmv_check( alpha, a, x, beta, y ); // Check object structure. e_val = bli_check_general_object( a ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( a, x ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( a, y ); bli_check_error_code( e_val ); } void bli_hemv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ) { err_t e_val; // Perform checks common to gemv/hemv/symv/trmv/trsv. bli_xxmv_check( alpha, a, x, beta, y ); // Check squareness. e_val = bli_check_square_object( a ); bli_check_error_code( e_val ); // Check object structure. 
e_val = bli_check_hermitian_object( a ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( a, x ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( a, y ); bli_check_error_code( e_val ); } void bli_symv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ) { err_t e_val; // Perform checks common to gemv/hemv/symv/trmv/trsv. bli_xxmv_check( alpha, a, x, beta, y ); // Check squareness. e_val = bli_check_square_object( a ); bli_check_error_code( e_val ); // Check object structure. e_val = bli_check_symmetric_object( a ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( a, x ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( a, y ); bli_check_error_code( e_val ); } void bli_trmv_check ( obj_t* alpha, obj_t* a, obj_t* x ) { err_t e_val; // Perform checks common to gemv/hemv/symv/trmv/trsv. bli_xxmv_check( alpha, a, x, alpha, x ); // Check squareness. e_val = bli_check_square_object( a ); bli_check_error_code( e_val ); // Check object structure. e_val = bli_check_triangular_object( a ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( a, x ); bli_check_error_code( e_val ); } void bli_trsv_check ( obj_t* alpha, obj_t* a, obj_t* x ) { err_t e_val; // Perform checks common to gemv/hemv/symv/trmv/trsv. bli_xxmv_check( alpha, a, x, alpha, x ); // Check squareness. e_val = bli_check_square_object( a ); bli_check_error_code( e_val ); // Check object structure. e_val = bli_check_triangular_object( a ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( a, x ); bli_check_error_code( e_val ); } void bli_ger_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ) { err_t e_val; // Perform checks common to ger/her/her2/syr/syr2. 
bli_xxr_check( alpha, x, y, a ); // Check object structure. e_val = bli_check_general_object( a ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( a, x ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( a, y ); bli_check_error_code( e_val ); } void bli_her_check ( obj_t* alpha, obj_t* x, obj_t* a ) { err_t e_val; // Perform checks common to ger/her/her2/syr/syr2. bli_xxr_check( alpha, x, x, a ); // Check squareness. e_val = bli_check_square_object( a ); bli_check_error_code( e_val ); // Check object structure. e_val = bli_check_hermitian_object( a ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( a, x ); bli_check_error_code( e_val ); } void bli_her2_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ) { err_t e_val; // Perform checks common to ger/her/her2/syr/syr2. bli_xxr_check( alpha, x, y, a ); // Check squareness. e_val = bli_check_square_object( a ); bli_check_error_code( e_val ); // Check object structure. e_val = bli_check_hermitian_object( a ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( a, x ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( a, y ); bli_check_error_code( e_val ); } void bli_syr_check ( obj_t* alpha, obj_t* x, obj_t* a ) { err_t e_val; // Perform checks common to ger/her/her2/syr/syr2. bli_xxr_check( alpha, x, x, a ); // Check squareness. e_val = bli_check_square_object( a ); bli_check_error_code( e_val ); // Check object structure. e_val = bli_check_symmetric_object( a ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( a, x ); bli_check_error_code( e_val ); } void bli_syr2_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ) { err_t e_val; // Perform checks common to ger/her/her2/syr/syr2. 
bli_xxr_check( alpha, x, y, a ); // Check squareness. e_val = bli_check_square_object( a ); bli_check_error_code( e_val ); // Check object structure. e_val = bli_check_symmetric_object( a ); bli_check_error_code( e_val ); // Check for consistent datatypes. e_val = bli_check_consistent_object_datatypes( a, x ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_datatypes( a, y ); bli_check_error_code( e_val ); } // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_noninteger_object( beta ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( a ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( beta ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( a ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_vector_dim_equals( x, bli_obj_width_after_trans( a ) ); bli_check_error_code( e_val ); e_val = bli_check_vector_dim_equals( y, bli_obj_length_after_trans( a ) ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). 
e_val = bli_check_object_buffer( alpha ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( a ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( beta ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); } void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( a ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_scalar_object( alpha ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( a ); bli_check_error_code( e_val ); e_val = bli_check_vector_dim_equals( x, bli_obj_length_after_trans( a ) ); bli_check_error_code( e_val ); e_val = bli_check_vector_dim_equals( y, bli_obj_width_after_trans( a ) ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( alpha ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( a ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/2/bli_l2_check.h000066400000000000000000000055351427272030600215520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based check functions. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); cython-blis-0.9.1/blis/_src/frame/2/bli_l2_fpa.c000066400000000000000000000066671427272030600212450ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define function pointer query interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \ PASTECH(opname,BLIS_TAPI_EX_SUF) ); \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \ { \ return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \ } GENFRONT( gemv ) GENFRONT( ger ) GENFRONT( hemv ) GENFRONT( symv ) GENFRONT( her ) GENFRONT( syr ) GENFRONT( her2 ) GENFRONT( syr2 ) GENFRONT( trmv ) GENFRONT( trsv ) // // Define function pointer query interfaces for level-2 implementations. 
// #undef GENFRONT #define GENFRONT( opname, varname ) \ \ GENARRAY_FPA( PASTECH2(opname,_unb,_vft), \ varname ); \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ) \ { \ return PASTECH(varname,_fpa)[ dt ]; \ } GENFRONT( gemv, gemv_unb_var1 ) GENFRONT( gemv, gemv_unb_var2 ) GENFRONT( gemv, gemv_unf_var1 ) GENFRONT( gemv, gemv_unf_var2 ) GENFRONT( ger, ger_unb_var1 ) GENFRONT( ger, ger_unb_var2 ) GENFRONT( hemv, hemv_unb_var1 ) GENFRONT( hemv, hemv_unb_var2 ) GENFRONT( hemv, hemv_unb_var3 ) GENFRONT( hemv, hemv_unb_var4 ) GENFRONT( hemv, hemv_unf_var1 ) GENFRONT( hemv, hemv_unf_var3 ) GENFRONT( hemv, hemv_unf_var1a ) GENFRONT( hemv, hemv_unf_var3a ) GENFRONT( her, her_unb_var1 ) GENFRONT( her, her_unb_var2 ) GENFRONT( her2, her2_unb_var1 ) GENFRONT( her2, her2_unb_var2 ) GENFRONT( her2, her2_unb_var3 ) GENFRONT( her2, her2_unb_var4 ) GENFRONT( her2, her2_unf_var1 ) GENFRONT( her2, her2_unf_var4 ) GENFRONT( trmv, trmv_unb_var1 ) GENFRONT( trmv, trmv_unb_var2 ) GENFRONT( trmv, trmv_unf_var1 ) GENFRONT( trmv, trmv_unf_var2 ) GENFRONT( trsv, trsv_unb_var1 ) GENFRONT( trsv, trsv_unb_var2 ) GENFRONT( trsv, trsv_unf_var1 ) GENFRONT( trsv, trsv_unf_var2 ) cython-blis-0.9.1/blis/_src/frame/2/bli_l2_fpa.h000066400000000000000000000061311427272030600212340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. 
// #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) cython-blis-0.9.1/blis/_src/frame/2/bli_l2_ft.h000066400000000000000000000111261427272030600210770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( ger ) // hemv, symv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) // her #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void 
(*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( her ) // syr #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( syr ) // her2, syr2 #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) // trmv, trsv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) cython-blis-0.9.1/blis/_src/frame/2/bli_l2_ft_unb.h000066400000000000000000000105371427272030600217500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, 
\ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, /* complex alpha allows her variants to also perform syr. */ \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif cython-blis-0.9.1/blis/_src/frame/2/bli_l2_oapi.c000066400000000000000000000301501427272030600214070ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Guard the function definitions so that they are only compiled when // #included from files that define the object API macros. #ifdef BLIS_ENABLE_OAPI // // Define object-based interfaces. 
// #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( a ); \ \ trans_t transa = bli_obj_conjtrans_status( a ); \ conj_t conjx = bli_obj_conj_status( x ); \ dim_t m = bli_obj_length( a ); \ dim_t n = bli_obj_width( a ); \ void* buf_a = bli_obj_buffer_at_off( a ); \ inc_t rs_a = bli_obj_row_stride( a ); \ inc_t cs_a = bli_obj_col_stride( a ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t incy = bli_obj_vector_inc( y ); \ \ void* buf_alpha; \ void* buf_beta; \ \ obj_t alpha_local; \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ beta, &beta_local ); \ buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ buf_beta = bli_obj_buffer_for_1x1( dt, &beta_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ transa, \ conjx, \ m, \ n, \ buf_alpha, \ buf_a, rs_a, cs_a, \ buf_x, incx, \ buf_beta, \ buf_y, incy, \ cntx, \ rntm \ ); \ } GENFRONT( gemv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( a ); \ \ conj_t conjx = bli_obj_conj_status( x ); \ conj_t conjy = bli_obj_conj_status( y ); \ dim_t m = bli_obj_length( a ); \ dim_t n = bli_obj_width( a ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t incy = bli_obj_vector_inc( y ); \ void* buf_a = bli_obj_buffer_at_off( a ); \ inc_t rs_a = bli_obj_row_stride( a ); \ inc_t cs_a = bli_obj_col_stride( a ); \ \ void* buf_alpha; \ \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, x, y, a ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ conjx, \ conjy, \ m, \ n, \ buf_alpha, \ buf_x, incx, \ buf_y, incy, \ buf_a, rs_a, cs_a, \ cntx, \ rntm \ ); \ } GENFRONT( ger ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( a ); \ \ uplo_t uploa = bli_obj_uplo( a ); \ conj_t conja = bli_obj_conj_status( a ); \ conj_t conjx = bli_obj_conj_status( x ); \ dim_t m = bli_obj_length( a ); \ void* buf_a = bli_obj_buffer_at_off( a ); \ inc_t rs_a = bli_obj_row_stride( a ); \ inc_t cs_a = bli_obj_col_stride( a ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t incy = bli_obj_vector_inc( y ); \ \ void* buf_alpha; \ void* buf_beta; \ \ obj_t alpha_local; \ obj_t beta_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, a, x, beta, y ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ beta, &beta_local ); \ buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ buf_beta = bli_obj_buffer_for_1x1( dt, &beta_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ uploa, \ conja, \ conjx, \ m, \ buf_alpha, \ buf_a, rs_a, cs_a, \ buf_x, incx, \ buf_beta, \ buf_y, incy, \ cntx, \ rntm \ ); \ } GENFRONT( hemv ) GENFRONT( symv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( a ); \ \ uplo_t uploa = bli_obj_uplo( a ); \ conj_t conjx = bli_obj_conj_status( x ); \ dim_t m = bli_obj_length( a ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ void* buf_a = bli_obj_buffer_at_off( a ); \ inc_t rs_a = bli_obj_row_stride( a ); \ inc_t cs_a = bli_obj_col_stride( a ); \ \ void* buf_alpha; \ \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, x, a ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ uploa, \ conjx, \ m, \ buf_alpha, \ buf_x, incx, \ buf_a, rs_a, cs_a, \ cntx, \ rntm \ ); \ } GENFRONT( her ) GENFRONT( syr ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( a ); \ \ uplo_t uploa = bli_obj_uplo( a ); \ conj_t conjx = bli_obj_conj_status( x ); \ conj_t conjy = bli_obj_conj_status( y ); \ dim_t m = bli_obj_length( a ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t incy = bli_obj_vector_inc( y ); \ void* buf_a = bli_obj_buffer_at_off( a ); \ inc_t rs_a = bli_obj_row_stride( a ); \ inc_t cs_a = bli_obj_col_stride( a ); \ \ void* buf_alpha; \ \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, x, y, a ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ uploa, \ conjx, \ conjy, \ m, \ buf_alpha, \ buf_x, incx, \ buf_y, incy, \ buf_a, rs_a, cs_a, \ cntx, \ rntm \ ); \ } GENFRONT( her2 ) GENFRONT( syr2 ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( a ); \ \ uplo_t uploa = bli_obj_uplo( a ); \ trans_t transa = bli_obj_conjtrans_status( a ); \ diag_t diaga = bli_obj_diag( a ); \ dim_t m = bli_obj_length( a ); \ void* buf_a = bli_obj_buffer_at_off( a ); \ inc_t rs_a = bli_obj_row_stride( a ); \ inc_t cs_a = bli_obj_col_stride( a ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ \ void* buf_alpha; \ \ obj_t alpha_local; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( alpha, a, x ); \ \ /* Create local copy-casts of scalars (and apply internal conjugation as needed). */ \ bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, \ alpha, &alpha_local ); \ buf_alpha = bli_obj_buffer_for_1x1( dt, &alpha_local ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ uploa, \ transa, \ diaga, \ m, \ buf_alpha, \ buf_a, rs_a, cs_a, \ buf_x, incx, \ cntx, \ rntm \ ); \ } GENFRONT( trmv ) GENFRONT( trsv ) #endif cython-blis-0.9.1/blis/_src/frame/2/bli_l2_oapi.h000066400000000000000000000053221427272030600214170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) cython-blis-0.9.1/blis/_src/frame/2/bli_l2_oapi_ba.c000066400000000000000000000036671427272030600220660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // omitting expert parameters. #include "bli_oapi_ba.h" // Define the macro protecting the object API definitions. #define BLIS_ENABLE_OAPI // Include the object API definitions here. #include "bli_l2_oapi.c" cython-blis-0.9.1/blis/_src/frame/2/bli_l2_oapi_ex.c000066400000000000000000000036651427272030600221160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // having expert parameters. #include "bli_oapi_ex.h" // Define the macro protecting the object API definitions. #define BLIS_ENABLE_OAPI // Include the object API definitions here. #include "bli_l2_oapi.c" cython-blis-0.9.1/blis/_src/frame/2/bli_l2_tapi.c000066400000000000000000000332531427272030600214230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Guard the function definitions so that they are only compiled when // #included from files that define the typed API macros. #ifdef BLIS_ENABLE_TAPI // // Define BLAS-like interfaces with typed operands. // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, ftname, rvarname, cvarname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ dim_t m_y, n_x; \ \ /* Determine the dimensions of y and x. */ \ bli_set_dims_with_trans( transa, m, n, &m_y, &n_x ); \ \ /* If y has zero elements, return early. */ \ if ( bli_zero_dim1( m_y ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* If x has zero elements, or if alpha is zero, scale y by beta and return early. 
*/ \ if ( bli_zero_dim1( n_x ) || PASTEMAC(ch,eq0)( *alpha ) ) \ { \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m_y, \ beta, \ y, incy, \ cntx, \ NULL \ ); \ return; \ } \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_unb_ft) f; \ \ /* Choose the underlying implementation. */ \ if ( bli_does_notrans( transa ) ) \ { \ if ( bli_is_row_stored( rs_a, cs_a ) ) f = PASTEMAC(ch,rvarname); \ else /* column or general stored */ f = PASTEMAC(ch,cvarname); \ } \ else /* if ( bli_does_trans( transa ) ) */ \ { \ if ( bli_is_row_stored( rs_a, cs_a ) ) f = PASTEMAC(ch,cvarname); \ else /* column or general stored */ f = PASTEMAC(ch,rvarname); \ } \ \ /* Invoke the variant chosen above, which loops over a level-1v or level-1f kernel to implement the current operation. */ \ f \ ( \ transa, \ conjx, \ m, \ n, \ alpha, \ a, rs_a, cs_a, \ x, incx, \ beta, \ y, incy, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC3( gemv, gemv, gemv_unf_var1, gemv_unf_var2 ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, ftname, rvarname, cvarname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ /* If x or y has zero elements, or if alpha is zero, return early. */ \ if ( bli_zero_dim2( m, n ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_unb_ft) f; \ \ /* Choose the underlying implementation. 
*/ \ if ( bli_is_row_stored( rs_a, cs_a ) ) f = PASTEMAC(ch,rvarname); \ else /* column or general stored */ f = PASTEMAC(ch,cvarname); \ \ /* Invoke the variant chosen above, which loops over a level-1v or level-1f kernel to implement the current operation. */ \ f \ ( \ conjx, \ conjy, \ m, \ n, \ alpha, \ x, incx, \ y, incy, \ a, rs_a, cs_a, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC3( ger, ger, ger_unb_var1, ger_unb_var2 ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, ftname, conjh, rvarname, cvarname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* If x has zero elements, or if alpha is zero, scale y by beta and return early. */ \ if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) \ { \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ beta, \ y, incy, \ cntx, \ NULL \ ); \ return; \ } \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_unb_ft) f; \ \ /* Choose the underlying implementation. */ \ if ( bli_is_lower( uploa ) ) \ { \ if ( bli_is_row_stored( rs_a, cs_a ) ) f = PASTEMAC(ch,rvarname); \ else /* column or general stored */ f = PASTEMAC(ch,cvarname); \ } \ else /* if ( bli_is_upper( uploa ) ) */ \ { \ if ( bli_is_row_stored( rs_a, cs_a ) ) f = PASTEMAC(ch,cvarname); \ else /* column or general stored */ f = PASTEMAC(ch,rvarname); \ } \ \ /* Invoke the variant chosen above, which loops over a level-1v or level-1f kernel to implement the current operation. 
*/ \ f \ ( \ uploa, \ conja, \ conjx, \ conjh, /* used by variants to distinguish hemv from symv */ \ m, \ alpha, \ a, rs_a, cs_a, \ x, incx, \ beta, \ y, incy, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC4( hemv, hemv, BLIS_CONJUGATE, hemv_unf_var1, hemv_unf_var3 ) INSERT_GENTFUNC_BASIC4( symv, hemv, BLIS_NO_CONJUGATE, hemv_unf_var1, hemv_unf_var3 ) #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname, ftname, conjh, rvarname, cvarname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ ctype alpha_local; \ \ /* If x has zero elements, or if alpha is zero, return early. */ \ if ( bli_zero_dim1( m ) || PASTEMAC(chr,eq0)( *alpha ) ) return; \ \ /* Make a local copy of alpha, cast into the complex domain. This allows us to use the same underlying her variants to implement both her and syr operations. */ \ PASTEMAC2(chr,ch,copys)( *alpha, alpha_local ); \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_unb_ft) f; \ \ /* Choose the underlying implementation. */ \ if ( bli_is_lower( uploa ) ) \ { \ if ( bli_is_row_stored( rs_a, cs_a ) ) f = PASTEMAC(ch,rvarname); \ else /* column or general stored */ f = PASTEMAC(ch,cvarname); \ } \ else /* if ( bli_is_upper( uploa ) ) */ \ { \ if ( bli_is_row_stored( rs_a, cs_a ) ) f = PASTEMAC(ch,cvarname); \ else /* column or general stored */ f = PASTEMAC(ch,rvarname); \ } \ \ /* Invoke the variant chosen above, which loops over a level-1v or level-1f kernel to implement the current operation. 
*/ \ f \ ( \ uploa, \ conjx, \ conjh, /* used by variants to distinguish her from syr */ \ m, \ &alpha_local, \ x, incx, \ a, rs_a, cs_a, \ cntx \ ); \ } INSERT_GENTFUNCR_BASIC4( her, her, BLIS_CONJUGATE, her_unb_var1, her_unb_var2 ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, ftname, conjh, rvarname, cvarname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ /* If x has zero elements, or if alpha is zero, return early. */ \ if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_unb_ft) f; \ \ /* Choose the underlying implementation. */ \ if ( bli_is_lower( uploa ) ) \ { \ if ( bli_is_row_stored( rs_a, cs_a ) ) f = PASTEMAC(ch,rvarname); \ else /* column or general stored */ f = PASTEMAC(ch,cvarname); \ } \ else /* if ( bli_is_upper( uploa ) ) */ \ { \ if ( bli_is_row_stored( rs_a, cs_a ) ) f = PASTEMAC(ch,cvarname); \ else /* column or general stored */ f = PASTEMAC(ch,rvarname); \ } \ \ /* Invoke the variant chosen above, which loops over a level-1v or level-1f kernel to implement the current operation. 
*/ \ f \ ( \ uploa, \ conjx, \ conjh, /* used by variants to distinguish her2 from syr2 */ \ m, \ alpha, \ x, incx, \ a, rs_a, cs_a, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC4( syr, her, BLIS_NO_CONJUGATE, her_unb_var1, her_unb_var2 ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, ftname, conjh, rvarname, cvarname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ /* If x has zero elements, or if alpha is zero, return early. */ \ if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_unb_ft) f; \ \ /* Choose the underlying implementation. */ \ if ( bli_is_lower( uploa ) ) \ { \ if ( bli_is_row_stored( rs_a, cs_a ) ) f = PASTEMAC(ch,rvarname); \ else /* column or general stored */ f = PASTEMAC(ch,cvarname); \ } \ else /* if ( bli_is_upper( uploa ) ) */ \ { \ if ( bli_is_row_stored( rs_a, cs_a ) ) f = PASTEMAC(ch,cvarname); \ else /* column or general stored */ f = PASTEMAC(ch,rvarname); \ } \ \ /* Invoke the variant chosen above, which loops over a level-1v or level-1f kernel to implement the current operation. 
*/ \ f \ ( \ uploa, \ conjx, \ conjy, \ conjh, \ m, \ alpha, \ x, incx, \ y, incy, \ a, rs_a, cs_a, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC4( her2, her2, BLIS_CONJUGATE, her2_unf_var1, her2_unf_var4 ) INSERT_GENTFUNC_BASIC4( syr2, her2, BLIS_NO_CONJUGATE, her2_unf_var1, her2_unf_var4 ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, ftname, rvarname, cvarname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ /* If x has zero elements, return early. */ \ if ( bli_zero_dim1( m ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* If alpha is zero, set x to zero and return early. */ \ if ( PASTEMAC(ch,eq0)( *alpha ) ) \ { \ PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ alpha, \ x, incx, \ cntx, \ NULL \ ); \ return; \ } \ \ /* Declare a void function pointer for the current operation. */ \ PASTECH2(ch,ftname,_unb_ft) f; \ \ /* Choose the underlying implementation. */ \ if ( bli_does_notrans( transa ) ) \ { \ if ( bli_is_row_stored( rs_a, cs_a ) ) f = PASTEMAC(ch,rvarname); \ else /* column or general stored */ f = PASTEMAC(ch,cvarname); \ } \ else /* if ( bli_does_trans( transa ) ) */ \ { \ if ( bli_is_row_stored( rs_a, cs_a ) ) f = PASTEMAC(ch,cvarname); \ else /* column or general stored */ f = PASTEMAC(ch,rvarname); \ } \ \ /* Invoke the variant chosen above, which loops over a level-1v or level-1f kernel to implement the current operation. 
*/ \ f \ ( \ uploa, \ transa, \ diaga, \ m, \ alpha, \ a, rs_a, cs_a, \ x, incx, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC3( trmv, trmv, trmv_unf_var1, trmv_unf_var2 ) INSERT_GENTFUNC_BASIC3( trsv, trmv, trsv_unf_var1, trsv_unf_var2 ) #endif cython-blis-0.9.1/blis/_src/frame/2/bli_l2_tapi.h000066400000000000000000000110651427272030600214250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( gemv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( ger ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( hemv ) INSERT_GENTPROT_BASIC0( symv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( her ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( syr ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( her2 ) INSERT_GENTPROT_BASIC0( syr2 ) #undef 
GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( trmv ) INSERT_GENTPROT_BASIC0( trsv ) cython-blis-0.9.1/blis/_src/frame/2/bli_l2_tapi_ba.c000066400000000000000000000036651427272030600220710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // omitting expert parameters. #include "bli_tapi_ba.h" // Define the macro protecting the typed API definitions. #define BLIS_ENABLE_TAPI // Include the typed API definitions here. #include "bli_l2_tapi.c" cython-blis-0.9.1/blis/_src/frame/2/bli_l2_tapi_ex.c000066400000000000000000000036631427272030600221210ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // having expert parameters. #include "bli_tapi_ex.h" // Define the macro protecting the typed API definitions. #define BLIS_ENABLE_TAPI // Include the typed API definitions here. #include "bli_l2_tapi.c" cython-blis-0.9.1/blis/_src/frame/2/gemv/000077500000000000000000000000001427272030600200275ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/gemv/amd/000077500000000000000000000000001427272030600205705ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/gemv/amd/bli_gemv_unf_var2_amd.c000066400000000000000000000134651427272030600251540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2022, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname, scalvsuf, axpyfsuf, fusefac ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ /*const num_t dt = PASTEMAC(ch,type);*/ \ \ ctype* A1; \ ctype* x1; \ ctype* y1; \ dim_t i; \ dim_t b_fuse, f; \ dim_t n_elem, n_iter; \ inc_t rs_at, cs_at; \ conj_t conja; \ \ bli_set_dims_incs_with_trans( transa, \ m, n, rs_a, cs_a, \ &n_elem, &n_iter, &rs_at, &cs_at ); \ \ conja = bli_extract_conj( transa ); \ \ /* y = beta * y; */ \ /* NOTE: We don't explicitly handle the case where beta == 0 here since that behavior is handled within the scalv kernel itself. */ \ PASTEMAC2(ch,scalv,scalvsuf) \ ( \ BLIS_NO_CONJUGATE, \ n_elem, \ beta, \ y, incy, \ cntx \ ); \ \ /* If alpha == 0, then we are done. 
*/ \ if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /*PASTECH(ch,axpyf_ker_ft) kfp_af;*/ \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ /*kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );*/ \ /*b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );*/ \ b_fuse = fusefac; \ \ for ( i = 0; i < n_iter; i += f ) \ { \ f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \ \ A1 = a + (0 )*rs_at + (i )*cs_at; \ x1 = x + (i )*incx; \ y1 = y + (0 )*incy; \ \ /* y = y + alpha * A1 * x1; */ \ /*kfp_af*/ \ PASTEMAC2(ch,axpyf,axpyfsuf) \ ( \ conja, \ conjx, \ n_elem, \ f, \ alpha, \ A1, rs_at, cs_at, \ x1, incx, \ y1, incy, \ cntx \ ); \ } \ } //INSERT_GENTFUNC_BASIC0( gemv_unf_var2 ) GENTFUNC( float, s, gemv_unf_var2, _zen_int10, _zen_int_5, 5 ) GENTFUNC( double, d, gemv_unf_var2, _zen_int10, _zen_int_16x4, 4 ) GENTFUNC( scomplex, c, gemv_unf_var2, _zen_int10, _zen_int_4, 4 ) //GENTFUNC( dcomplex, z, gemv_unf_var2, _zen_int10, _ex, 1 ) #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* zero = PASTEMAC(ch,0); \ ctype* A1; \ ctype* x1; \ ctype* y1; \ dim_t i; \ dim_t b_fuse, f; \ dim_t n_elem, n_iter; \ inc_t rs_at, cs_at; \ conj_t conja; \ \ bli_set_dims_incs_with_trans( transa, \ m, n, rs_a, cs_a, \ &n_elem, &n_iter, &rs_at, &cs_at ); \ \ conja = bli_extract_conj( transa ); \ \ /* If beta is zero, use setv. Otherwise, scale by beta. 
*/ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* y = 0; */ \ PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ n_elem, \ zero, \ y, incy, \ cntx, \ NULL \ ); \ } \ else \ { \ /* y = beta * y; */ \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ n_elem, \ beta, \ y, incy, \ cntx, \ NULL \ ); \ } \ \ PASTECH(ch,axpyf_ker_ft) kfp_af; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ \ for ( i = 0; i < n_iter; i += f ) \ { \ f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \ \ A1 = a + (0 )*rs_at + (i )*cs_at; \ x1 = x + (i )*incx; \ y1 = y + (0 )*incy; \ \ /* y = y + alpha * A1 * x1; */ \ kfp_af \ ( \ conja, \ conjx, \ n_elem, \ f, \ alpha, \ A1, rs_at, cs_at, \ x1, incx, \ y1, incy, \ cntx \ ); \ } \ } //INSERT_GENTFUNC_BASIC0( gemv_unf_var2 ) GENTFUNC( dcomplex, z, gemv_unf_var2 ) cython-blis-0.9.1/blis/_src/frame/2/gemv/bli_gemv.h000066400000000000000000000034701427272030600217700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" #include "bli_gemv_var.h" cython-blis-0.9.1/blis/_src/frame/2/gemv/bli_gemv_unb_var1.c000066400000000000000000000056641427272030600235670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* a1t; \ ctype* x1; \ ctype* psi1; \ dim_t i; \ dim_t n_elem, n_iter; \ inc_t rs_at, cs_at; \ conj_t conja; \ \ bli_set_dims_incs_with_trans( transa, \ m, n, rs_a, cs_a, \ &n_iter, &n_elem, &rs_at, &cs_at ); \ \ conja = bli_extract_conj( transa ); \ \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointer. 
*/ \ kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < n_iter; ++i ) \ { \ a1t = a + (i )*rs_at + (0 )*cs_at; \ x1 = x + (0 )*incy; \ psi1 = y + (i )*incy; \ \ /* psi1 = beta * psi1 + alpha * a1t * x1; */ \ kfp_dv \ ( \ conja, \ conjx, \ n_elem, \ alpha, \ a1t, cs_at, \ x1, incx, \ beta, \ psi1, \ cntx \ ); \ } \ } INSERT_GENTFUNC_BASIC0( gemv_unb_var1 ) cython-blis-0.9.1/blis/_src/frame/2/gemv/bli_gemv_unb_var2.c000066400000000000000000000067631427272030600235710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* zero = PASTEMAC(ch,0); \ ctype* a1; \ ctype* chi1; \ ctype* y1; \ ctype alpha_chi1; \ dim_t i; \ dim_t n_elem, n_iter; \ inc_t rs_at, cs_at; \ conj_t conja; \ \ bli_set_dims_incs_with_trans( transa, \ m, n, rs_a, cs_a, \ &n_elem, &n_iter, &rs_at, &cs_at ); \ \ conja = bli_extract_conj( transa ); \ \ /* If beta is zero, use setv. Otherwise, scale by beta. */ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* y = 0; */ \ PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ n_elem, \ zero, \ y, incy, \ cntx, \ NULL \ ); \ } \ else \ { \ /* y = beta * y; */ \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ n_elem, \ beta, \ y, incy, \ cntx, \ NULL \ ); \ } \ \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. 
*/ \ kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < n_iter; ++i ) \ { \ a1 = a + (0 )*rs_at + (i )*cs_at; \ chi1 = x + (i )*incx; \ y1 = y + (0 )*incy; \ \ /* y = y + alpha * chi1 * a1; */ \ PASTEMAC(ch,copycjs)( conjx, *chi1, alpha_chi1 ); \ PASTEMAC(ch,scals)( *alpha, alpha_chi1 ); \ \ kfp_av \ ( \ conja, \ n_elem, \ &alpha_chi1, \ a1, rs_at, \ y1, incy, \ cntx \ ); \ } \ } INSERT_GENTFUNC_BASIC0( gemv_unb_var2 ) cython-blis-0.9.1/blis/_src/frame/2/gemv/bli_gemv_unf_var1.c000066400000000000000000000061451427272030600235660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* A1; \ ctype* x1; \ ctype* y1; \ dim_t i; \ dim_t b_fuse, f; \ dim_t n_elem, n_iter; \ inc_t rs_at, cs_at; \ conj_t conja; \ \ bli_set_dims_incs_with_trans( transa, \ m, n, rs_a, cs_a, \ &n_iter, &n_elem, &rs_at, &cs_at ); \ \ conja = bli_extract_conj( transa ); \ \ PASTECH(ch,dotxf_ker_ft) kfp_df; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ \ for ( i = 0; i < n_iter; i += f ) \ { \ f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \ \ A1 = a + (i )*rs_at + (0 )*cs_at; \ x1 = x + (0 )*incy; \ y1 = y + (i )*incy; \ \ /* y1 = beta * y1 + alpha * A1 * x; */ \ kfp_df \ ( \ conja, \ conjx, \ n_elem, \ f, \ alpha, \ A1, cs_at, rs_at, \ x1, incx, \ beta, \ y1, incy, \ cntx \ ); \ \ } \ } INSERT_GENTFUNC_BASIC0( gemv_unf_var1 ) cython-blis-0.9.1/blis/_src/frame/2/gemv/bli_gemv_unf_var2.c000066400000000000000000000070711427272030600235660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* zero = PASTEMAC(ch,0); \ ctype* A1; \ ctype* x1; \ ctype* y1; \ dim_t i; \ dim_t b_fuse, f; \ dim_t n_elem, n_iter; \ inc_t rs_at, cs_at; \ conj_t conja; \ \ bli_set_dims_incs_with_trans( transa, \ m, n, rs_a, cs_a, \ &n_elem, &n_iter, &rs_at, &cs_at ); \ \ conja = bli_extract_conj( transa ); \ \ /* If beta is zero, use setv. Otherwise, scale by beta. */ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* y = 0; */ \ PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ n_elem, \ zero, \ y, incy, \ cntx, \ NULL \ ); \ } \ else \ { \ /* y = beta * y; */ \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ n_elem, \ beta, \ y, incy, \ cntx, \ NULL \ ); \ } \ \ PASTECH(ch,axpyf_ker_ft) kfp_af; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ \ for ( i = 0; i < n_iter; i += f ) \ { \ f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \ \ A1 = a + (0 )*rs_at + (i )*cs_at; \ x1 = x + (i )*incx; \ y1 = y + (0 )*incy; \ \ /* y = y + alpha * A1 * x1; */ \ kfp_af \ ( \ conja, \ conjx, \ n_elem, \ f, \ alpha, \ A1, rs_at, cs_at, \ x1, incx, \ y1, incy, \ cntx \ ); \ } \ } INSERT_GENTFUNC_BASIC0( gemv_unf_var2 ) cython-blis-0.9.1/blis/_src/frame/2/gemv/bli_gemv_var.h000066400000000000000000000052301427272030600226340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THE SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

//
// Prototype object-based interfaces.
//

// GENPROT( opname ) declares one object-based gemv variant: a void function
// named via PASTEMAC0(opname) that takes the operands alpha, a, x, beta and
// y as obj_t pointers, followed by a context and a control-tree node.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y, \
       cntx_t* cntx, \
       cntl_t* cntl \
     );

// One declaration per variant: blocked (blk), unblocked (unb) and
// unblocked-fused (unf), each in a var1 and a var2 form.
GENPROT( gemv_blk_var1 )
GENPROT( gemv_blk_var2 )

GENPROT( gemv_unb_var1 )
GENPROT( gemv_unb_var2 )

GENPROT( gemv_unf_var1 )
GENPROT( gemv_unf_var2 )

//
// Prototype BLAS-like interfaces with typed operands.
//

// GENTPROT( ctype, ch, varname ) declares one typed gemv variant: a void
// function named via PASTEMAC(ch,varname) that takes transa/conjx flags,
// the dimensions m and n, pointers of type ctype* to alpha, a, x, beta and
// y together with their strides/increments, and a context.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       cntx_t* cntx \
     );

// Typed declarations for the four unblocked variants; no typed blk
// variants are declared here.
INSERT_GENTPROT_BASIC0( gemv_unb_var1 )
INSERT_GENTPROT_BASIC0( gemv_unb_var2 )

INSERT_GENTPROT_BASIC0( gemv_unf_var1 )
INSERT_GENTPROT_BASIC0( gemv_unf_var2 )
cython-blis-0.9.1/blis/_src/frame/2/gemv/bli_gemv_var_oapi.c000066400000000000000000000062001427272030600236350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THE SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENFRONT #define GENFRONT( opname, varname ) \ \ void PASTEMAC0(varname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ) \ { \ bli_init_once(); \ \ num_t dt = bli_obj_dt( a ); \ \ trans_t transa = bli_obj_conjtrans_status( a ); \ conj_t conjx = bli_obj_conj_status( x ); \ \ dim_t m = bli_obj_length( a ); \ dim_t n = bli_obj_width( a ); \ \ void* buf_a = bli_obj_buffer_at_off( a ); \ inc_t rs_a = bli_obj_row_stride( a ); \ inc_t cs_a = bli_obj_col_stride( a ); \ \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t incy = bli_obj_vector_inc( y ); \ \ void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \ void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,_unb,_vft) f = \ PASTEMAC(varname,_qfp)( dt ); \ \ f \ ( \ transa, \ conjx, \ m, \ n, \ buf_alpha, \ buf_a, rs_a, cs_a, \ buf_x, incx, \ buf_beta, \ buf_y, incy, \ cntx \ ); \ } \ GENFRONT( gemv, gemv_unb_var1 ) GENFRONT( gemv, gemv_unb_var2 ) GENFRONT( gemv, gemv_unf_var1 ) GENFRONT( gemv, gemv_unf_var2 ) cython-blis-0.9.1/blis/_src/frame/2/gemv/bli_gemv_var_oapi.c.prev000066400000000000000000000062061427272030600246160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENFRONT #define GENFRONT( ftname, opname ) \ \ /*static gemv_vft GENARRAY(ftypes,gemv_unb_var1);*/ \ static GENARRAY_VFP(ftname,opname); \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ gemv_t* cntl \ ) \ { \ num_t dt = bli_obj_dt( a ); \ \ trans_t transa = bli_obj_conjtrans_status( a ); \ conj_t conjx = bli_obj_conj_status( x ); \ \ dim_t m = bli_obj_length( a ); \ dim_t n = bli_obj_width( a ); \ \ void* buf_a = bli_obj_buffer_at_off( a ); \ inc_t rs_a = bli_obj_row_stride( a ); \ inc_t cs_a = bli_obj_col_stride( a ); \ \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t incy = bli_obj_vector_inc( y ); \ \ void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \ void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); \ \ PASTECH(ftname,_vft) f = PASTECH(opname,_vfp)[dt]; \ \ /* Invoke the void pointer-based function for the given datatype. 
*/ \ f( \ transa, \ conjx, \ m, \ n, \ buf_alpha, \ buf_a, rs_a, cs_a, \ buf_x, incx, \ buf_beta, \ buf_y, incy, \ cntx \ ); \ } \ GENFRONT( gemv, gemv_unb_var1 ) GENFRONT( gemv, gemv_unb_var2 ) GENFRONT( gemv, gemv_unf_var1 ) GENFRONT( gemv, gemv_unf_var2 ) cython-blis-0.9.1/blis/_src/frame/2/gemv/other/000077500000000000000000000000001427272030600211505ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/gemv/other/bli_gemv_blk_var1.c000066400000000000000000000074061427272030600246700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_gemv_blk_var1( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, cntx_t* cntx, gemv_t* cntl ) { obj_t a1, a1_pack; obj_t y1, y1_pack; dim_t m_trans; dim_t i; dim_t b_alg; // Initialize objects for packing. bli_obj_init_pack( &a1_pack ); bli_obj_init_pack( &y1_pack ); // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( a ); // Partition along the m dimension. for ( i = 0; i < m_trans; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( i, m_trans, a, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and y1. bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, i, b_alg, y, &y1 ); // Initialize objects for packing A1 and y1 (if needed). bli_packm_init( &a1, &a1_pack, cntx, bli_cntl_sub_packm_a( cntl ) ); bli_packv_init( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y( cntl ) ); // Copy/pack A1, y1 (if needed). bli_packm_int( &a1, &a1_pack, cntx, bli_cntl_sub_packm_a( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y( cntl ) ); // y1 = beta * y1 + alpha * A1 * x; bli_gemv_int( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, alpha, &a1_pack, x, beta, &y1_pack, cntx, bli_cntl_sub_gemv( cntl ) ); // Copy/unpack y1 (if y1 was packed). 
bli_unpackv_int( &y1_pack, &y1, cntx, bli_cntl_sub_unpackv_y( cntl ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. bli_packm_release( &a1_pack, bli_cntl_sub_packm_a( cntl ) ); bli_packv_release( &y1_pack, bli_cntl_sub_packv_y( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/gemv/other/bli_gemv_blk_var2.c000066400000000000000000000074011427272030600246640ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_gemv_blk_var2( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, cntx_t* cntx, gemv_t* cntl ) { obj_t a1, a1_pack; obj_t x1, x1_pack; dim_t n_trans; dim_t i; dim_t b_alg; // Initialize objects for packing. bli_obj_init_pack( &a1_pack ); bli_obj_init_pack( &x1_pack ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( a ); // y = beta * y; bli_scalv_int( beta, y, cntx, bli_cntl_sub_scalv( cntl ) ); // Partition along the "k" dimension (n dimension of A). for ( i = 0; i < n_trans; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( i, n_trans, a, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and x1. bli_acquire_mpart_l2r( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, i, b_alg, x, &x1 ); // Initialize objects for packing A1 and x1 (if needed). bli_packm_init( &a1, &a1_pack, cntx, bli_cntl_sub_packm_a( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x( cntl ) ); // Copy/pack A1, x1 (if needed). 
bli_packm_int( &a1, &a1_pack, cntx, bli_cntl_sub_packm_a( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x( cntl ) ); // y = y + alpha * A1 * x1; bli_gemv_int( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, alpha, &a1_pack, &x1_pack, &BLIS_ONE, y, cntx, bli_cntl_sub_gemv( cntl ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. bli_packm_release( &a1_pack, bli_cntl_sub_packm_a( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/gemv/other/bli_gemv_cntl.c000066400000000000000000000205241427272030600241230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern scalv_t* scalv_cntl; extern packm_t* packm_cntl; extern packv_t* packv_cntl; extern unpackv_t* unpackv_cntl; gemv_t* gemv_cntl_bs_ke_dot = NULL; gemv_t* gemv_cntl_bs_ke_axpy = NULL; gemv_t* gemv_cntl_rp_bs_dot = NULL; gemv_t* gemv_cntl_rp_bs_axpy = NULL; gemv_t* gemv_cntl_cp_bs_dot = NULL; gemv_t* gemv_cntl_cp_bs_axpy = NULL; gemv_t* gemv_cntl_ge_dot = NULL; gemv_t* gemv_cntl_ge_axpy = NULL; void bli_gemv_cntl_init() { // Create control trees for the lowest-level kernels. These trees induce // operations on (persumably) relatively small block-subvector problems. gemv_cntl_bs_ke_dot = bli_gemv_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT1, 0, NULL, NULL, NULL, NULL, NULL, NULL ); gemv_cntl_bs_ke_axpy = bli_gemv_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT2, 0, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control trees for problems with relatively small m dimension // (ie: where trans(A) is a row panel problem). 
gemv_cntl_rp_bs_dot = bli_gemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, BLIS_N2, scalv_cntl, // scale y up-front packm_cntl, // pack A1 (if needed) packv_cntl, // pack x1 (if needed) NULL, // y is not partitioned in var2 gemv_cntl_bs_ke_dot, NULL ); // y is not partitioned in var2 gemv_cntl_rp_bs_axpy = bli_gemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, BLIS_N2, scalv_cntl, // scale y up-front packm_cntl, // pack A1 (if needed) packv_cntl, // pack x1 (if needed) NULL, // y is not partitioned in var2 gemv_cntl_bs_ke_axpy, NULL ); // y is not partitioned in var2 // Create control trees for problems with relatively small n dimension // (ie: where trans(A) is a column panel problem). gemv_cntl_cp_bs_dot = bli_gemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_M2, NULL, // no scaling in blk_var1 packm_cntl, // pack A1 (if needed) NULL, // x is not partitioned in var1 packv_cntl, // pack y1 (if needed) gemv_cntl_bs_ke_dot, unpackv_cntl ); // unpack y1 (if packed) gemv_cntl_cp_bs_axpy = bli_gemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_M2, NULL, // no scaling in blk_var1 packm_cntl, // pack A1 (if needed) NULL, // x is not partitioned in var1 packv_cntl, // pack y1 (if needed) gemv_cntl_bs_ke_axpy, unpackv_cntl ); // unpack y1 (if packed) // Create control trees for generally large problems. Here, we choose a // variant that partitions subproblems into row panels. 
gemv_cntl_ge_dot = bli_gemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_M2, NULL, // no scaling in blk_var1 NULL, // do not pack A1 NULL, // x is not partitioned in var1 packv_cntl, // pack y1 (if needed) gemv_cntl_rp_bs_dot, unpackv_cntl ); // unpack y1 (if packed) gemv_cntl_ge_axpy = bli_gemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_M2, NULL, // no scaling in blk_var1 NULL, // do not pack A1 NULL, // x is not partitioned in var1 packv_cntl, // pack y1 (if needed) gemv_cntl_rp_bs_axpy, unpackv_cntl ); // unpack y1 (if packed) } void bli_gemv_cntl_finalize() { bli_cntl_free_node( gemv_cntl_bs_ke_dot ); bli_cntl_free_node( gemv_cntl_bs_ke_axpy ); bli_cntl_free_node( gemv_cntl_rp_bs_dot ); bli_cntl_free_node( gemv_cntl_rp_bs_axpy ); bli_cntl_free_node( gemv_cntl_cp_bs_dot ); bli_cntl_free_node( gemv_cntl_cp_bs_axpy ); bli_cntl_free_node( gemv_cntl_ge_dot ); bli_cntl_free_node( gemv_cntl_ge_axpy ); } gemv_t* bli_gemv_cntl_obj_create( impl_t impl_type, varnum_t var_num, bszid_t bszid, scalv_t* sub_scalv, packm_t* sub_packm_a, packv_t* sub_packv_x, packv_t* sub_packv_y, gemv_t* sub_gemv, unpackv_t* sub_unpackv_y ) { gemv_t* cntl; cntl = ( gemv_t* ) bli_malloc_intl( sizeof(gemv_t) ); cntl->impl_type = impl_type; cntl->var_num = var_num; cntl->bszid = bszid; cntl->sub_scalv = sub_scalv; cntl->sub_packm_a = sub_packm_a; cntl->sub_packv_x = sub_packv_x; cntl->sub_packv_y = sub_packv_y; cntl->sub_gemv = sub_gemv; cntl->sub_unpackv_y = sub_unpackv_y; return cntl; } void bli_gemv_cntl_obj_init( gemv_t* cntl, impl_t impl_type, varnum_t var_num, bszid_t bszid, scalv_t* sub_scalv, packm_t* sub_packm_a, packv_t* sub_packv_x, packv_t* sub_packv_y, gemv_t* sub_gemv, unpackv_t* sub_unpackv_y ) { cntl->impl_type = impl_type; cntl->var_num = var_num; cntl->bszid = bszid; cntl->sub_scalv = sub_scalv; cntl->sub_packm_a = sub_packm_a; cntl->sub_packv_x = sub_packv_x; cntl->sub_packv_y = sub_packv_y; cntl->sub_gemv = sub_gemv; cntl->sub_unpackv_y = sub_unpackv_y; } 
cython-blis-0.9.1/blis/_src/frame/2/gemv/other/bli_gemv_cntl.h000066400000000000000000000066721427272030600241400ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ struct gemv_s { impl_t impl_type; varnum_t var_num; bszid_t bszid; struct scalv_s* sub_scalv; struct packm_s* sub_packm_a; struct packv_s* sub_packv_x; struct packv_s* sub_packv_y; struct gemv_s* sub_gemv; struct unpackv_s* sub_unpackv_y; }; typedef struct gemv_s gemv_t; #define bli_cntl_sub_gemv( cntl ) cntl->sub_gemv #define bli_cntl_sub_gemv_rp( cntl ) cntl->sub_gemv_rp #define bli_cntl_sub_gemv_cp( cntl ) cntl->sub_gemv_cp #define bli_cntl_sub_gemv_n_rp( cntl ) cntl->sub_gemv_n_rp #define bli_cntl_sub_gemv_n_cp( cntl ) cntl->sub_gemv_n_cp #define bli_cntl_sub_gemv_t_rp( cntl ) cntl->sub_gemv_t_rp #define bli_cntl_sub_gemv_t_cp( cntl ) cntl->sub_gemv_t_cp void bli_gemv_cntl_init( void ); void bli_gemv_cntl_finalize( void ); gemv_t* bli_gemv_cntl_obj_create( impl_t impl_type, varnum_t var_num, bszid_t bszid, scalv_t* sub_scalv, packm_t* sub_packm_a, packv_t* sub_packv_x, packv_t* sub_packv_y, gemv_t* sub_gemv, unpackv_t* sub_unpackv_y ); void bli_gemv_cntl_obj_init( gemv_t* cntl, impl_t impl_type, varnum_t var_num, bszid_t bszid, scalv_t* sub_scalv, packm_t* sub_packm_a, packv_t* sub_packv_x, packv_t* sub_packv_y, gemv_t* sub_gemv, unpackv_t* sub_unpackv_y ); cython-blis-0.9.1/blis/_src/frame/2/gemv/other/bli_gemv_front.c000066400000000000000000000166721427272030600243240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern gemv_t* gemv_cntl_bs_ke_axpy; extern gemv_t* gemv_cntl_bs_ke_dot; extern gemv_t* gemv_cntl_ge_axpy; extern gemv_t* gemv_cntl_ge_dot; void bli_gemv_front ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, cntx_t* cntx ) { gemv_t* gemv_cntl; num_t dt_targ_a; num_t dt_targ_x; num_t dt_targ_y; bool a_has_unit_inc; bool x_has_unit_inc; bool y_has_unit_inc; obj_t alpha_local; obj_t beta_local; num_t dt_alpha; num_t dt_beta; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_gemv_check( alpha, a, x, beta, y ); // Query the target datatypes of each object. dt_targ_a = bli_obj_target_dt( a ); dt_targ_x = bli_obj_target_dt( x ); dt_targ_y = bli_obj_target_dt( y ); // Determine whether each operand is stored with unit stride. a_has_unit_inc = ( bli_obj_is_row_stored( a ) || bli_obj_is_col_stored( a ) ); x_has_unit_inc = ( bli_obj_vector_inc( x ) == 1 ); y_has_unit_inc = ( bli_obj_vector_inc( y ) == 1 ); // Create an object to hold a copy-cast of alpha. 
Notice that we use // the type union of the target datatypes of a and x to prevent any // unnecessary loss of information during the computation. dt_alpha = bli_dt_union( dt_targ_a, dt_targ_x ); bli_obj_scalar_init_detached_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); // Create an object to hold a copy-cast of beta. Notice that we use // the datatype of y. Here's why: If y is real and beta is complex, // there is no reason to keep beta_local in the complex domain since // the complex part of beta*y will not be stored. If y is complex and // beta is real then beta is harmlessly promoted to complex. dt_beta = dt_targ_y; bli_obj_scalar_init_detached_copy_of( dt_beta, BLIS_NO_CONJUGATE, beta, &beta_local ); // If all operands have unit stride, we choose a control tree for calling // the unblocked implementation directly without any blocking. if ( a_has_unit_inc && x_has_unit_inc && y_has_unit_inc ) { // A row-major layout with no transpose is typically best served by // a dot-based implementation (and the same goes for a column-major // layout with a transposition) because it engenders unit stride // within matrix A. Similarly, an axpy-based code is better for // row-major cases with a transpose and column-major without a // transpose. For the general stride case, we mimic that of column- // major storage since that is the format into which we copy/pack. if ( bli_obj_has_notrans( a ) ) { if ( bli_obj_is_row_stored( a ) ) gemv_cntl = gemv_cntl_bs_ke_dot; else gemv_cntl = gemv_cntl_bs_ke_axpy; } else // if ( bli_obj_has_trans( a ) ) { if ( bli_obj_is_row_stored( a ) ) gemv_cntl = gemv_cntl_bs_ke_axpy; else gemv_cntl = gemv_cntl_bs_ke_dot; } } else { // Mark objects with unit stride as already being packed. This prevents // unnecessary packing from happening within the blocked algorithm. 
if ( a_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, a ); if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x ); if ( y_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, y ); // Here, we make a similar choice as above, except that (1) we look // at storage tilt, and (2) we choose a tree that performs blocking. if ( bli_obj_has_notrans( a ) ) { if ( bli_obj_is_row_tilted( a ) ) gemv_cntl = gemv_cntl_ge_dot; else gemv_cntl = gemv_cntl_ge_axpy; } else // if ( bli_obj_has_trans( a ) ) { if ( bli_obj_is_row_tilted( a ) ) gemv_cntl = gemv_cntl_ge_axpy; else gemv_cntl = gemv_cntl_ge_dot; } } // Invoke the internal back-end with the copy-casts of scalars and the // chosen control tree. bli_gemv_int( BLIS_NO_TRANSPOSE, BLIS_NO_TRANSPOSE, &alpha_local, a, x, &beta_local, y, cntx, gemv_cntl ); } // // Define BLAS-like interfaces with homogeneous-typed operands. // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao, ao, xo, betao, yo; \ \ dim_t m_a, n_a; \ dim_t m_x; \ dim_t m_y; \ inc_t rs_x, cs_x; \ inc_t rs_y, cs_y; \ \ bli_set_dims_with_trans( BLIS_NO_TRANSPOSE, m, n, &m_a, &n_a ); \ bli_set_dims_with_trans( transa, m, n, &m_y, &m_x ); \ \ rs_x = incx; cs_x = m_x * incx; \ rs_y = incy; cs_y = m_y * incy; \ \ bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ \ bli_obj_create_with_attached_buffer( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ bli_obj_create_with_attached_buffer( dt, m_x, 1, x, rs_x, cs_x, &xo ); \ bli_obj_create_with_attached_buffer( dt, m_y, 1, y, rs_y, cs_y, &yo ); \ \ bli_obj_set_conjtrans( transa, &ao ); \ bli_obj_set_conj( conjx, &xo ); \ \ PASTEMAC0(opname)( 
&alphao, \ &ao, \ &xo, \ &betao, \ &yo, \ cntx ); \ } INSERT_GENTFUNC_BASIC0( gemv_front ) cython-blis-0.9.1/blis/_src/frame/2/gemv/other/bli_gemv_front.h000066400000000000000000000042721427272030600243220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ void bli_gemv_front ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, cntx_t* cntx ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC( gemv_front ) cython-blis-0.9.1/blis/_src/frame/2/gemv/other/bli_gemv_int.c000066400000000000000000000070331427272030600237550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T gemv_fp typedef void (*FUNCPTR_T)( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, cntx_t* cntx, gemv_t* cntl ); static FUNCPTR_T vars[3][3] = { // unblocked unblocked with fusing blocked { bli_gemv_unb_var1, bli_gemv_unf_var1, bli_gemv_blk_var1 }, { bli_gemv_unb_var2, bli_gemv_unf_var2, bli_gemv_blk_var2 }, { NULL, NULL, NULL }, }; void bli_gemv_int( trans_t transa, conj_t conjx, obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, cntx_t* cntx, gemv_t* cntl ) { varnum_t n; impl_t i; FUNCPTR_T f; obj_t a_local; obj_t x_local; // Apply the trans and/or conj parameters to aliases of the objects. bli_obj_alias_with_trans( transa, a, &a_local ); bli_obj_alias_with_conj( conjx, x, &x_local ); // Check parameters. We use the aliased copy of A so the transa parameter // is taken into account for dimension checking. if ( bli_error_checking_is_enabled() ) bli_gemv_check( alpha, &a_local, &x_local, beta, y ); // If y has a zero dimension, return early. if ( bli_obj_has_zero_dim( y ) ) return; // If x has a zero dimension, scale y by beta and return early. if ( bli_obj_has_zero_dim( x ) ) { bli_scalm( beta, y ); return; } // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; // Invoke the variant. 
f( alpha, &a_local, &x_local, beta, y, cntx, cntl ); } cython-blis-0.9.1/blis/_src/frame/2/gemv/other/bli_gemv_int.h000066400000000000000000000035411427272030600237620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

/* Internal gemv back-end.
   transa and conjx are applied to a and x respectively; cntl is the gemv
   control tree node that determines which variant is run. */
void bli_gemv_int
     (
       trans_t transa,
       conj_t  conjx,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  x,
       obj_t*  beta,
       obj_t*  y,
       cntx_t* cntx,
       gemv_t* cntl
     );

cython-blis-0.9.1/blis/_src/frame/2/ger/000077500000000000000000000000001427272030600176465ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/ger/bli_ger.h000066400000000000000000000034631427272030600214300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" #include "bli_ger_var.h" cython-blis-0.9.1/blis/_src/frame/2/ger/bli_ger_unb_var1.c000066400000000000000000000053211427272030600232130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* a1t; \ ctype* chi1; \ ctype* y1; \ ctype alpha_chi1; \ dim_t i; \ \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ a1t = a + (i )*rs_a + (0 )*cs_a; \ chi1 = x + (i )*incx; \ y1 = y + (0 )*incy; \ \ /* a1t = a1t + alpha * chi1 * y; */ \ PASTEMAC(ch,copycjs)( conjx, *chi1, alpha_chi1 ); \ PASTEMAC(ch,scals)( *alpha, alpha_chi1 ); \ \ kfp_av \ ( \ conjy, \ n, \ &alpha_chi1, \ y1, incy, \ a1t, cs_a, \ cntx \ ); \ } \ } INSERT_GENTFUNC_BASIC0( ger_unb_var1 ) cython-blis-0.9.1/blis/_src/frame/2/ger/bli_ger_unb_var2.c000066400000000000000000000053141427272030600232160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* a1; \ ctype* x1; \ ctype* psi1; \ ctype alpha_psi1; \ dim_t j; \ \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( j = 0; j < n; ++j ) \ { \ a1 = a + (0 )*rs_a + (j )*cs_a; \ x1 = x + (0 )*incx; \ psi1 = y + (j )*incy; \ \ /* a1 = a1 + alpha * psi1 * x; */ \ PASTEMAC(ch,copycjs)( conjy, *psi1, alpha_psi1 ); \ PASTEMAC(ch,scals)( *alpha, alpha_psi1 ); \ \ kfp_av \ ( \ conjx, \ m, \ &alpha_psi1, \ x1, incx, \ a1, rs_a, \ cntx \ ); \ } \ } INSERT_GENTFUNC_BASIC0( ger_unb_var2 ) cython-blis-0.9.1/blis/_src/frame/2/ger/bli_ger_var.h000066400000000000000000000047371427272030600223050ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */


//
// Prototype object-based interfaces.
//
// GENPROT( opname ) declares one object-based ger variant taking alpha, x, y
// and a as obj_t objects, plus a context and a control tree node.
// NOTE(review): prototypes are emitted for ger_blk_var1/ger_blk_var2 as well
// as the unblocked variants -- confirm that the blocked variants are defined
// somewhere in the build.
//

#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  a, \
       cntx_t* cntx, \
       cntl_t* cntl  \
     );

GENPROT( ger_blk_var1 )
GENPROT( ger_blk_var2 )

GENPROT( ger_unb_var1 )
GENPROT( ger_unb_var2 )


//
// Prototype BLAS-like interfaces with typed operands.
//
// GENTPROT( ctype, ch, varname ) declares one typed ger variant operating on
// raw buffers with explicit increments/strides; the BASIC0 inserter is
// invoked once per variant name below.
//

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx  \
     );

INSERT_GENTPROT_BASIC0( ger_unb_var1 )
INSERT_GENTPROT_BASIC0( ger_unb_var2 )
cython-blis-0.9.1/blis/_src/frame/2/ger/bli_ger_var_oapi.c000066400000000000000000000057221427272030600233030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENFRONT #define GENFRONT( opname, varname ) \ \ void PASTEMAC0(varname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ) \ { \ bli_init_once(); \ \ num_t dt = bli_obj_dt( a ); \ \ conj_t conjx = bli_obj_conj_status( x ); \ conj_t conjy = bli_obj_conj_status( y ); \ \ dim_t m = bli_obj_length( a ); \ dim_t n = bli_obj_width( a ); \ \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t incy = bli_obj_vector_inc( y ); \ \ void* buf_a = bli_obj_buffer_at_off( a ); \ inc_t rs_a = bli_obj_row_stride( a ); \ inc_t cs_a = bli_obj_col_stride( a ); \ \ void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,_unb,_vft) f = \ PASTEMAC(varname,_qfp)( dt ); \ \ f \ ( \ conjx, \ conjy, \ m, \ n, \ buf_alpha, \ buf_x, incx, \ buf_y, incy, \ buf_a, rs_a, cs_a, \ cntx \ ); \ } \ GENFRONT( ger, ger_unb_var1 ) GENFRONT( ger, ger_unb_var2 ) cython-blis-0.9.1/blis/_src/frame/2/ger/other/000077500000000000000000000000001427272030600207675ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/ger/other/bli_ger_blk_var1.c000066400000000000000000000073451427272030600243300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_ger_blk_var1( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a, cntx_t* cntx, ger_t* cntl ) { obj_t a1, a1_pack; obj_t x1, x1_pack; dim_t i; dim_t b_alg; dim_t m_trans; // Initialize objects for packing. bli_obj_init_pack( &a1_pack ); bli_obj_init_pack( &x1_pack ); // Query dimension in partitioning direction. m_trans = bli_obj_length_after_trans( a ); // Partition along the m dimension. for ( i = 0; i < m_trans; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( i, m_trans, a, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and x1. bli_acquire_mpart_t2b( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, i, b_alg, x, &x1 ); // Initialize objects for packing A1 and x1 (if needed). bli_packm_init( &a1, &a1_pack, cntx, bli_cntl_sub_packm_a( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x( cntl ) ); // Copy/pack A1, x1 (if needed). bli_packm_int( &a1, &a1_pack, cntx, bli_cntl_sub_packm_a( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x( cntl ) ); // A1 = A1 + alpha * x1 * y; bli_ger_int( BLIS_NO_CONJUGATE, BLIS_NO_CONJUGATE, alpha, &x1_pack, y, &a1_pack, cntx, bli_cntl_sub_ger( cntl ) ); // Copy/unpack A1 (if A1 was packed). 
bli_unpackm_int( &a1_pack, &a1, cntx, bli_cntl_sub_unpackm_a( cntl ), &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. bli_packm_release( &a1_pack, bli_cntl_sub_packm_a( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/ger/other/bli_ger_blk_var2.c000066400000000000000000000073441427272030600243300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_ger_blk_var2( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a, cntx_t* cntx, ger_t* cntl ) { obj_t a1, a1_pack; obj_t y1, y1_pack; dim_t i; dim_t b_alg; dim_t n_trans; // Initialize objects for packing. bli_obj_init_pack( &a1_pack ); bli_obj_init_pack( &y1_pack ); // Query dimension in partitioning direction. n_trans = bli_obj_width_after_trans( a ); // Partition along the n dimension. for ( i = 0; i < n_trans; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( i, n_trans, a, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and y1. bli_acquire_mpart_l2r( BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, i, b_alg, y, &y1 ); // Initialize objects for packing A1 and y1 (if needed). bli_packm_init( &a1, &a1_pack, cntx, bli_cntl_sub_packm_a( cntl ) ); bli_packv_init( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y( cntl ) ); // Copy/pack A1, y1 (if needed). bli_packm_int( &a1, &a1_pack, cntx, bli_cntl_sub_packm_a( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y( cntl ) ); // A1 = A1 + alpha * x * y1; bli_ger_int( BLIS_NO_CONJUGATE, BLIS_NO_CONJUGATE, alpha, x, &y1_pack, &a1_pack, cntx, bli_cntl_sub_ger( cntl ) ); // Copy/unpack A1 (if A1 was packed). 
bli_unpackm_int( &a1_pack, &a1, cntx, bli_cntl_sub_unpackm_a( cntl ), &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. bli_packm_release( &a1_pack, bli_cntl_sub_packm_a( cntl ) ); bli_packv_release( &y1_pack, bli_cntl_sub_packv_y( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/ger/other/bli_ger_cntl.c000066400000000000000000000171041427272030600235610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern packm_t* packm_cntl; extern packv_t* packv_cntl; extern unpackm_t* unpackm_cntl; ger_t* ger_cntl_bs_ke_row = NULL; ger_t* ger_cntl_bs_ke_col = NULL; ger_t* ger_cntl_rp_bs_row = NULL; ger_t* ger_cntl_rp_bs_col = NULL; ger_t* ger_cntl_cp_bs_row = NULL; ger_t* ger_cntl_cp_bs_col = NULL; ger_t* ger_cntl_ge_row = NULL; ger_t* ger_cntl_ge_col = NULL; void bli_ger_cntl_init() { // Create control trees for the lowest-level kernels. These trees induce // operations on (persumably) relatively small block-subvector problems. ger_cntl_bs_ke_row = bli_ger_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, 0, NULL, NULL, NULL, NULL, NULL ); ger_cntl_bs_ke_col = bli_ger_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT2, 0, NULL, NULL, NULL, NULL, NULL ); // Create control trees for problems with relatively small m dimension // (ie: where A is a row panel problem). 
ger_cntl_rp_bs_row = bli_ger_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, BLIS_N2, NULL, // x is not partitioned in var2 packv_cntl, // pack y1 (if needed) packm_cntl, // pack A1 (if needed) ger_cntl_bs_ke_row, unpackm_cntl ); // unpack A1 (if packed) ger_cntl_rp_bs_col = bli_ger_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, BLIS_N2, NULL, // x is not partitioned in var2 packv_cntl, // pack y1 (if needed) packm_cntl, // pack A1 (if needed) ger_cntl_bs_ke_col, unpackm_cntl ); // unpack A1 (if packed) // Create control trees for problems with relatively small n dimension // (ie: where A is a column panel problem). ger_cntl_cp_bs_row = bli_ger_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_M2, packv_cntl, // pack x1 (if needed) NULL, // y is not partitioned in var1 packm_cntl, // pack A1 (if needed) ger_cntl_bs_ke_row, unpackm_cntl ); // unpack A1 (if packed) ger_cntl_cp_bs_col = bli_ger_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_M2, packv_cntl, // pack x1 (if needed) NULL, // y is not partitioned in var1 packm_cntl, // pack A1 (if needed) ger_cntl_bs_ke_col, unpackm_cntl ); // unpack A1 (if packed) // Create control trees for generally large problems. Here, we choose a // variant that partitions subproblems into column panels. 
ger_cntl_ge_row = bli_ger_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, BLIS_N2, NULL, // x is not partitioned in var2 packv_cntl, // pack y1 (if needed) NULL, // do not pack A1 ger_cntl_cp_bs_row, NULL ); // do not unpack A1 ger_cntl_ge_col = bli_ger_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, BLIS_N2, NULL, // x is not partitioned in var2 packv_cntl, // pack y1 (if needed) NULL, // do not pack A1 ger_cntl_cp_bs_col, NULL ); // do not unpack A1 } void bli_ger_cntl_finalize() { bli_cntl_free_node( ger_cntl_bs_ke_row ); bli_cntl_free_node( ger_cntl_bs_ke_col ); bli_cntl_free_node( ger_cntl_rp_bs_row ); bli_cntl_free_node( ger_cntl_rp_bs_col ); bli_cntl_free_node( ger_cntl_cp_bs_row ); bli_cntl_free_node( ger_cntl_cp_bs_col ); bli_cntl_free_node( ger_cntl_ge_row ); bli_cntl_free_node( ger_cntl_ge_col ); } ger_t* bli_ger_cntl_obj_create( impl_t impl_type, varnum_t var_num, bszid_t bszid, packv_t* sub_packv_x, packv_t* sub_packv_y, packm_t* sub_packm_a, ger_t* sub_ger, unpackm_t* sub_unpackm_a ) { ger_t* cntl; cntl = ( ger_t* ) bli_malloc_intl( sizeof(ger_t) ); cntl->impl_type = impl_type; cntl->var_num = var_num; cntl->bszid = bszid; cntl->sub_packv_x = sub_packv_x; cntl->sub_packv_y = sub_packv_y; cntl->sub_packm_a = sub_packm_a; cntl->sub_ger = sub_ger; cntl->sub_unpackm_a = sub_unpackm_a; return cntl; } void bli_ger_cntl_obj_init( ger_t* cntl, impl_t impl_type, varnum_t var_num, bszid_t bszid, packv_t* sub_packv_x, packv_t* sub_packv_y, packm_t* sub_packm_a, ger_t* sub_ger, unpackm_t* sub_unpackm_a ) { cntl->impl_type = impl_type; cntl->var_num = var_num; cntl->bszid = bszid; cntl->sub_packv_x = sub_packv_x; cntl->sub_packv_y = sub_packv_y; cntl->sub_packm_a = sub_packm_a; cntl->sub_ger = sub_ger; cntl->sub_unpackm_a = sub_unpackm_a; } cython-blis-0.9.1/blis/_src/frame/2/ger/other/bli_ger_cntl.h000066400000000000000000000060251427272030600235660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ struct ger_s { impl_t impl_type; varnum_t var_num; bszid_t bszid; struct packv_s* sub_packv_x; struct packv_s* sub_packv_y; struct packm_s* sub_packm_a; struct ger_s* sub_ger; struct unpackm_s* sub_unpackm_a; }; typedef struct ger_s ger_t; #define bli_cntl_sub_ger( cntl ) cntl->sub_ger #define bli_cntl_sub_ger_rp( cntl ) cntl->sub_ger_rp #define bli_cntl_sub_ger_cp( cntl ) cntl->sub_ger_cp void bli_ger_cntl_init( void ); void bli_ger_cntl_finalize( void ); ger_t* bli_ger_cntl_obj_create( impl_t impl_type, varnum_t var_num, bszid_t bszid, packv_t* sub_packv_x, packv_t* sub_packv_y, packm_t* sub_packm_a, ger_t* sub_ger, unpackm_t* sub_unpackm_a ); void bli_ger_cntl_obj_init( ger_t* cntl, impl_t impl_type, varnum_t var_num, bszid_t bszid, packv_t* sub_packv_x, packv_t* sub_packv_y, packm_t* sub_packm_a, ger_t* sub_ger, unpackm_t* sub_unpackm_a ); cython-blis-0.9.1/blis/_src/frame/2/ger/other/bli_ger_front.c000066400000000000000000000132261427272030600237520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern ger_t* ger_cntl_bs_ke_row; extern ger_t* ger_cntl_bs_ke_col; extern ger_t* ger_cntl_ge_row; extern ger_t* ger_cntl_ge_col; void bli_ger_front ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a, cntx_t* cntx ) { ger_t* ger_cntl; num_t dt_targ_x; num_t dt_targ_y; //num_t dt_targ_a; bool x_has_unit_inc; bool y_has_unit_inc; bool a_has_unit_inc; obj_t alpha_local; num_t dt_alpha; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_ger_check( alpha, x, y, a ); // Query the target datatypes of each object. dt_targ_x = bli_obj_target_dt( x ); dt_targ_y = bli_obj_target_dt( y ); //dt_targ_a = bli_obj_target_dt( a ); // Determine whether each operand with unit stride. x_has_unit_inc = ( bli_obj_vector_inc( x ) == 1 ); y_has_unit_inc = ( bli_obj_vector_inc( y ) == 1 ); a_has_unit_inc = ( bli_obj_is_row_stored( a ) || bli_obj_is_col_stored( a ) ); // Create an object to hold a copy-cast of alpha. Notice that we use // the type union of the target datatypes of x and y to prevent any // unnecessary loss of information during the computation. 
dt_alpha = bli_dt_union( dt_targ_x, dt_targ_y ); bli_obj_scalar_init_detached_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); // If all operands have unit stride, we choose a control tree for calling // the unblocked implementation directly without any blocking. if ( x_has_unit_inc && y_has_unit_inc && a_has_unit_inc ) { // Use different control trees depending on storage of the matrix // operand. if ( bli_obj_is_row_stored( a ) ) ger_cntl = ger_cntl_bs_ke_row; else ger_cntl = ger_cntl_bs_ke_col; } else { // Mark objects with unit stride as already being packed. This prevents // unnecessary packing from happening within the blocked algorithm. if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x ); if ( y_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, y ); if ( a_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, a ); // Here, we make a similar choice as above, except that (1) we look // at storage tilt, and (2) we choose a tree that performs blocking. if ( bli_obj_is_row_tilted( a ) ) ger_cntl = ger_cntl_ge_row; else ger_cntl = ger_cntl_ge_col; } // Invoke the internal back-end with the copy-cast scalar and the // chosen control tree. bli_ger_int( BLIS_NO_CONJUGATE, BLIS_NO_CONJUGATE, &alpha_local, x, y, a, cntx, ger_cntl ); } // // Define BLAS-like interfaces with homogeneous-typed operands. 
// #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao, xo, yo, ao; \ \ dim_t m_x; \ dim_t m_y; \ inc_t rs_x, cs_x; \ inc_t rs_y, cs_y; \ \ bli_set_dims_with_trans( BLIS_NO_TRANSPOSE, m, n, &m_x, &m_y ); \ \ rs_x = incx; cs_x = m_x * incx; \ rs_y = incy; cs_y = m_y * incy; \ \ bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ \ bli_obj_create_with_attached_buffer( dt, m_x, 1, x, rs_x, cs_x, &xo ); \ bli_obj_create_with_attached_buffer( dt, m_y, 1, y, rs_y, cs_y, &yo ); \ bli_obj_create_with_attached_buffer( dt, m, n, a, rs_a, cs_a, &ao ); \ \ bli_obj_set_conj( conjx, &xo ); \ bli_obj_set_conj( conjy, &yo ); \ \ PASTEMAC0(opname)( &alphao, \ &xo, \ &yo, \ &ao, \ cntx ); \ } INSERT_GENTFUNC_BASIC0( ger_front ) cython-blis-0.9.1/blis/_src/frame/2/ger/other/bli_ger_front.h000066400000000000000000000042131427272030600237530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_ger_front ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a, cntx_t* cntx ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC( ger_front ) cython-blis-0.9.1/blis/_src/frame/2/ger/other/bli_ger_int.c000066400000000000000000000105161427272030600234130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T ger_fp typedef void (*FUNCPTR_T)( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a, cntx_t* cntx, ger_t* cntl ); static FUNCPTR_T vars[4][3] = { // unblocked unblocked with fusing blocked { bli_ger_unb_var1, NULL, bli_ger_blk_var1, }, { bli_ger_unb_var2, NULL, bli_ger_blk_var2, }, { NULL, NULL, NULL, }, { NULL, NULL, NULL, }, }; void bli_ger_int( conj_t conjx, conj_t conjy, obj_t* alpha, obj_t* x, obj_t* y, obj_t* a, cntx_t* cntx, ger_t* cntl ) { varnum_t n; impl_t i; FUNCPTR_T f; obj_t alpha_local; obj_t x_local; obj_t y_local; obj_t a_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_ger_check( alpha, x, y, a ); // If A has a zero dimension, return early. if ( bli_obj_has_zero_dim( a ) ) return; // If x or y has a zero dimension, return early. if ( bli_obj_has_zero_dim( x ) || bli_obj_has_zero_dim( y ) ) return; // Alias the objects, applying conjx and conjy to x and y, respectively. 
bli_obj_alias_with_conj( conjx, x, &x_local ); bli_obj_alias_with_conj( conjy, y, &y_local ); bli_obj_alias_to( a, &a_local ); // If matrix A is marked for conjugation, we interpret this as a request // to apply a conjugation to the other operands. if ( bli_obj_has_conj( &a_local ) ) { bli_obj_toggle_conj( &a_local ); bli_obj_toggle_conj( &x_local ); bli_obj_toggle_conj( &y_local ); bli_obj_scalar_init_detached_copy_of( bli_obj_dt( alpha ), BLIS_CONJUGATE, alpha, &alpha_local ); } else { bli_obj_alias_to( *alpha, alpha_local ); } // If we are about the call a leaf-level implementation, and matrix A // still needs a transposition, then we must induce one by swapping the // strides and dimensions. if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( &a_local ) ) { bli_obj_induce_trans( &a_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); } // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; // Invoke the variant. f( &alpha_local, &x_local, &y_local, &a_local, cntx, cntl ); } cython-blis-0.9.1/blis/_src/frame/2/ger/other/bli_ger_int.h000066400000000000000000000036061427272030600234220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

// Internal ger back-end: takes conjugation flags for x and y, the scalar
// object alpha, the operand objects x, y and a, a context pointer and the
// ger control-tree node for the computation. Returns nothing.
void bli_ger_int( conj_t  conjx,
                  conj_t  conjy,
                  obj_t*  alpha,
                  obj_t*  x,
                  obj_t*  y,
                  obj_t*  a,
                  cntx_t* cntx,
                  ger_t*  cntl );
cython-blis-0.9.1/blis/_src/frame/2/hemv/000077500000000000000000000000001427272030600200305ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/hemv/bli_hemv.h000066400000000000000000000034701427272030600217720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" #include "bli_hemv_var.h" cython-blis-0.9.1/blis/_src/frame/2/hemv/bli_hemv_unb_var1.c000066400000000000000000000114111427272030600235540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* one = PASTEMAC(ch,1); \ ctype* zero = PASTEMAC(ch,0); \ ctype* a10t; \ ctype* alpha11; \ ctype* x0; \ ctype* chi1; \ ctype* y0; \ ctype* psi1; \ ctype conjx_chi1; \ ctype alpha_chi1; \ ctype alpha11_temp; \ dim_t i; \ dim_t n_behind; \ inc_t rs_at, cs_at; \ conj_t conj0, conj1; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. 
*/ \ if ( bli_is_lower( uplo ) ) \ { \ rs_at = rs_a; \ cs_at = cs_a; \ \ conj0 = bli_apply_conj( conjh, conja ); \ conj1 = conja; \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ rs_at = cs_a; \ cs_at = rs_a; \ \ conj0 = conja; \ conj1 = bli_apply_conj( conjh, conja ); \ } \ \ /* If beta is zero, use setv. Otherwise, scale by beta. */ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* y = 0; */ \ PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ zero, \ y, incy, \ cntx, \ NULL \ ); \ } \ else \ { \ /* y = beta * y; */ \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ beta, \ y, incy, \ cntx, \ NULL \ ); \ } \ \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointers. */ \ kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ n_behind = i; \ a10t = a + (i )*rs_at + (0 )*cs_at; \ alpha11 = a + (i )*rs_at + (i )*cs_at; \ x0 = x + (0 )*incx; \ chi1 = x + (i )*incx; \ y0 = y + (0 )*incy; \ psi1 = y + (i )*incy; \ \ /* Apply conjx to chi1 and and scale by alpha. */ \ PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \ PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ \ /* y0 = y0 + alpha * a10t' * chi1; */ \ kfp_av \ ( \ conj0, \ n_behind, \ &alpha_chi1, \ a10t, cs_at, \ y0, incy, \ cntx \ ); \ \ /* psi1 = psi1 + alpha * a10t * x0; */ \ kfp_dv \ ( \ conj1, \ conjx, \ n_behind, \ alpha, \ a10t, cs_at, \ x0, incx, \ one, \ psi1, \ cntx \ ); \ \ /* For hemv, explicitly set the imaginary component of alpha11 to zero. 
*/ \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ if ( bli_is_conj( conjh ) ) \ PASTEMAC(ch,seti0s)( alpha11_temp ); \ \ /* psi1 = psi1 + alpha * alpha11 * chi1; */ \ PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \ \ } \ } INSERT_GENTFUNC_BASIC0( hemv_unb_var1 ) cython-blis-0.9.1/blis/_src/frame/2/hemv/bli_hemv_unb_var2.c000066400000000000000000000114731427272030600235650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* one = PASTEMAC(ch,1); \ ctype* zero = PASTEMAC(ch,0); \ ctype* a10t; \ ctype* alpha11; \ ctype* a21; \ ctype* x0; \ ctype* chi1; \ ctype* x2; \ ctype* psi1; \ ctype conjx_chi1; \ ctype alpha_chi1; \ ctype alpha11_temp; \ dim_t i; \ dim_t n_behind; \ dim_t n_ahead; \ inc_t rs_at, cs_at; \ conj_t conj0, conj1; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ rs_at = rs_a; \ cs_at = cs_a; \ \ conj0 = conja; \ conj1 = bli_apply_conj( conjh, conja ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ rs_at = cs_a; \ cs_at = rs_a; \ \ conj0 = bli_apply_conj( conjh, conja ); \ conj1 = conja; \ } \ \ /* If beta is zero, use setv. Otherwise, scale by beta. */ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* y = 0; */ \ PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ zero, \ y, incy, \ cntx, \ NULL \ ); \ } \ else \ { \ /* y = beta * y; */ \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ beta, \ y, incy, \ cntx, \ NULL \ ); \ } \ \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointer. 
*/ \ kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ n_behind = i; \ n_ahead = m - i - 1; \ a10t = a + (i )*rs_at + (0 )*cs_at; \ alpha11 = a + (i )*rs_at + (i )*cs_at; \ a21 = a + (i+1)*rs_at + (i )*cs_at; \ x0 = x + (0 )*incx; \ chi1 = x + (i )*incx; \ x2 = x + (i+1)*incx; \ psi1 = y + (i )*incy; \ \ /* Apply conjx to chi1 and and scale by alpha. */ \ PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \ PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ \ /* psi1 = psi1 + alpha * a10t * x0; */ \ kfp_dv \ ( \ conj0, \ conjx, \ n_behind, \ alpha, \ a10t, cs_at, \ x0, incx, \ one, \ psi1, \ cntx \ ); \ \ /* psi1 = psi1 + alpha * a21' * x2; */ \ kfp_dv \ ( \ conj1, \ conjx, \ n_ahead, \ alpha, \ a21, rs_at, \ x2, incx, \ one, \ psi1, \ cntx \ ); \ \ /* For hemv, explicitly set the imaginary component of alpha11 to zero. */ \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ if ( bli_is_conj( conjh ) ) \ PASTEMAC(ch,seti0s)( alpha11_temp ); \ \ /* psi1 = psi1 + alpha * alpha11 * chi1; */ \ PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \ } \ } INSERT_GENTFUNC_BASIC0( hemv_unb_var2 ) cython-blis-0.9.1/blis/_src/frame/2/hemv/bli_hemv_unb_var3.c000066400000000000000000000114051427272030600235610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* one = PASTEMAC(ch,1); \ ctype* zero = PASTEMAC(ch,0); \ ctype* alpha11; \ ctype* a21; \ ctype* chi1; \ ctype* x2; \ ctype* psi1; \ ctype* y2; \ ctype conjx_chi1; \ ctype alpha_chi1; \ ctype alpha11_temp; \ dim_t i; \ dim_t n_ahead; \ inc_t rs_at, cs_at; \ conj_t conj0, conj1; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. 
*/ \ if ( bli_is_lower( uplo ) ) \ { \ rs_at = rs_a; \ cs_at = cs_a; \ \ conj0 = bli_apply_conj( conjh, conja ); \ conj1 = conja; \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ rs_at = cs_a; \ cs_at = rs_a; \ \ conj0 = conja; \ conj1 = bli_apply_conj( conjh, conja ); \ } \ \ /* If beta is zero, use setv. Otherwise, scale by beta. */ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* y = 0; */ \ PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ zero, \ y, incy, \ cntx, \ NULL \ ); \ } \ else \ { \ /* y = beta * y; */ \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ beta, \ y, incy, \ cntx, \ NULL \ ); \ } \ \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointers. */ \ kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ n_ahead = m - i - 1; \ alpha11 = a + (i )*rs_at + (i )*cs_at; \ a21 = a + (i+1)*rs_at + (i )*cs_at; \ chi1 = x + (i )*incx; \ x2 = x + (i+1)*incx; \ psi1 = y + (i )*incy; \ y2 = y + (i+1)*incy; \ \ /* Apply conjx to chi1 and and scale by alpha. */ \ PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \ PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ \ /* For hemv, explicitly set the imaginary component of alpha11 to zero. 
*/ \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ if ( bli_is_conj( conjh ) ) \ PASTEMAC(ch,seti0s)( alpha11_temp ); \ \ /* psi1 = psi1 + alpha * alpha11 * chi1; */ \ PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \ \ /* psi1 = psi1 + alpha * a21' * x2; */ \ kfp_dv \ ( \ conj0, \ conjx, \ n_ahead, \ alpha, \ a21, rs_at, \ x2, incx, \ one, \ psi1, \ cntx \ ); \ \ /* y2 = y2 + alpha * a21 * chi1; */ \ kfp_av \ ( \ conj1, \ n_ahead, \ &alpha_chi1, \ a21, rs_at, \ y2, incy, \ cntx \ ); \ } \ } INSERT_GENTFUNC_BASIC0( hemv_unb_var3 ) cython-blis-0.9.1/blis/_src/frame/2/hemv/bli_hemv_unb_var4.c000066400000000000000000000113021427272030600235560ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* zero = PASTEMAC(ch,0); \ ctype* a10t; \ ctype* alpha11; \ ctype* a21; \ ctype* chi1; \ ctype* y0; \ ctype* psi1; \ ctype* y2; \ ctype conjx_chi1; \ ctype alpha_chi1; \ ctype alpha11_temp; \ dim_t i; \ dim_t n_behind; \ dim_t n_ahead; \ inc_t rs_at, cs_at; \ conj_t conj0, conj1; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ rs_at = rs_a; \ cs_at = cs_a; \ \ conj0 = bli_apply_conj( conjh, conja ); \ conj1 = conja; \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ rs_at = cs_a; \ cs_at = rs_a; \ \ conj0 = conja; \ conj1 = bli_apply_conj( conjh, conja ); \ } \ \ /* If beta is zero, use setv. Otherwise, scale by beta. 
*/ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* y = 0; */ \ PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ zero, \ y, incy, \ cntx, \ NULL \ ); \ } \ else \ { \ /* y = beta * y; */ \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ beta, \ y, incy, \ cntx, \ NULL \ ); \ } \ \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointers. */ \ kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ n_behind = i; \ n_ahead = m - i - 1; \ a10t = a + (i )*rs_at + (0 )*cs_at; \ alpha11 = a + (i )*rs_at + (i )*cs_at; \ a21 = a + (i+1)*rs_at + (i )*cs_at; \ chi1 = x + (i )*incx; \ y0 = y + (0 )*incy; \ psi1 = y + (i )*incy; \ y2 = y + (i+1)*incy; \ \ /* Apply conjx to chi1 and and scale by alpha. */ \ PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \ PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ \ /* y0 = y0 + alpha * a10t' * chi1; */ \ kfp_av \ ( \ conj0, \ n_behind, \ &alpha_chi1, \ a10t, cs_at, \ y0, incy, \ cntx \ ); \ \ /* For hemv, explicitly set the imaginary component of alpha11 to zero. */ \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ if ( bli_is_conj( conjh ) ) \ PASTEMAC(ch,seti0s)( alpha11_temp ); \ \ /* psi1 = psi1 + alpha * alpha11 * chi1; */ \ PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \ \ /* y2 = y2 + alpha * a21 * chi1; */ \ kfp_av \ ( \ conj1, \ n_ahead, \ &alpha_chi1, \ a21, rs_at, \ y2, incy, \ cntx \ ); \ } \ } INSERT_GENTFUNC_BASIC0( hemv_unb_var4 ) cython-blis-0.9.1/blis/_src/frame/2/hemv/bli_hemv_unf_var1.c000066400000000000000000000140031427272030600235600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* one = PASTEMAC(ch,1); \ ctype* zero = PASTEMAC(ch,0); \ ctype* A10; \ ctype* A11; \ ctype* a10t; \ ctype* alpha11; \ ctype* a21; \ ctype* x0; \ ctype* x1; \ ctype* chi11; \ ctype* y0; \ ctype* y1; \ ctype* y01; \ ctype* psi11; \ ctype* y21; \ ctype conjx_chi11; \ ctype alpha_chi11; \ ctype alpha11_temp; \ dim_t i, k, j; \ dim_t b_fuse, f; \ dim_t n_behind; \ dim_t f_ahead, f_behind; \ inc_t rs_at, cs_at; \ conj_t conj0, conj1; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ rs_at = rs_a; \ cs_at = cs_a; \ \ conj0 = conja; \ conj1 = bli_apply_conj( conjh, conja ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ rs_at = cs_a; \ cs_at = rs_a; \ \ conj0 = bli_apply_conj( conjh, conja ); \ conj1 = conja; \ } \ \ /* If beta is zero, use setv. Otherwise, scale by beta. */ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* y = 0; */ \ PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ zero, \ y, incy, \ cntx, \ NULL \ ); \ } \ else \ { \ /* y = beta * y; */ \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ beta, \ y, incy, \ cntx, \ NULL \ ); \ } \ \ PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \ \ /* Query the context for the kernel function pointer and fusing factor. 
*/ \ kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \ \ for ( i = 0; i < m; i += f ) \ { \ f = bli_determine_blocksize_dim_f( i, m, b_fuse ); \ n_behind = i; \ A10 = a + (i )*rs_at + (0 )*cs_at; \ A11 = a + (i )*rs_at + (i )*cs_at; \ x0 = x + (0 )*incx; \ x1 = x + (i )*incx; \ y0 = y + (0 )*incy; \ y1 = y + (i )*incy; \ \ /* y1 = y1 + alpha * A10 * x0; (dotxf) */ \ /* y0 = y0 + alpha * A10' * x1; (axpyf) */ \ kfp_xf \ ( \ conj0, \ conj1, \ conjx, \ conjx, \ n_behind, \ f, \ alpha, \ A10, cs_at, rs_at, \ x0, incx, \ x1, incx, \ one, \ y1, incy, \ y0, incy, \ cntx \ ); \ \ /* y1 = y1 + alpha * A11 * x1; (variant 4) */ \ for ( k = 0; k < f; ++k ) \ { \ f_behind = k; \ f_ahead = f - k - 1; \ a10t = A11 + (k )*rs_at + (0 )*cs_at; \ alpha11 = A11 + (k )*rs_at + (k )*cs_at; \ a21 = A11 + (k+1)*rs_at + (k )*cs_at; \ chi11 = x1 + (k )*incx; \ y01 = y1 + (0 )*incy; \ psi11 = y1 + (k )*incy; \ y21 = y1 + (k+1)*incy; \ \ /* y01 = y01 + alpha * a10t' * chi11; */ \ PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \ PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \ if ( bli_is_conj( conj1 ) ) \ { \ for ( j = 0; j < f_behind; ++j ) \ PASTEMAC(ch,axpyjs)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \ } \ else \ { \ for ( j = 0; j < f_behind; ++j ) \ PASTEMAC(ch,axpys)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \ } \ \ /* For hemv, explicitly set the imaginary component of alpha11 to zero. 
*/ \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ if ( bli_is_conj( conjh ) ) \ PASTEMAC(ch,seti0s)( alpha11_temp ); \ \ /* psi11 = psi11 + alpha * alpha11 * chi11; */ \ PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \ \ /* y21 = y21 + alpha * a21 * chi11; */ \ if ( bli_is_conj( conj0 ) ) \ { \ for ( j = 0; j < f_ahead; ++j ) \ PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \ } \ else \ { \ for ( j = 0; j < f_ahead; ++j ) \ PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( hemv_unf_var1 ) cython-blis-0.9.1/blis/_src/frame/2/hemv/bli_hemv_unf_var1a.c000066400000000000000000000111671427272030600237310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* zero = PASTEMAC(ch,0); \ ctype* a10t; \ ctype* alpha11; \ ctype* x0; \ ctype* chi1; \ ctype* y0; \ ctype* psi1; \ ctype rho; \ ctype conjx_chi1; \ ctype alpha_chi1; \ ctype alpha11_temp; \ dim_t i; \ dim_t n_behind; \ inc_t rs_at, cs_at; \ conj_t conj0, conj1; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ rs_at = rs_a; \ cs_at = cs_a; \ \ conj0 = conja; \ conj1 = bli_apply_conj( conjh, conja ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ rs_at = cs_a; \ cs_at = rs_a; \ \ conj0 = bli_apply_conj( conjh, conja ); \ conj1 = conja; \ } \ \ /* If beta is zero, use setv. Otherwise, scale by beta. 
*/ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* y = 0; */ \ PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ zero, \ y, incy, \ cntx, \ NULL \ ); \ } \ else \ { \ /* y = beta * y; */ \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ beta, \ y, incy, \ cntx, \ NULL \ ); \ } \ \ PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \ \ /* Query the context for the kernel function pointer. */ \ kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ n_behind = i; \ a10t = a + (i )*rs_at + (0 )*cs_at; \ alpha11 = a + (i )*rs_at + (i )*cs_at; \ x0 = x + (0 )*incx; \ chi1 = x + (i )*incx; \ y0 = y + (0 )*incy; \ psi1 = y + (i )*incy; \ \ /* Apply conjx to chi1 and and scale by alpha. */ \ PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \ PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ \ /* psi1 = psi1 + alpha * a10t * x0; (dotv) */ \ /* y0 = y0 + alpha * a10t' * chi1; (axpyv) */ \ kfp_vf \ ( \ conj0, \ conj1, \ conjx, \ n_behind, \ &alpha_chi1, \ a10t, cs_at, \ x0, incx, \ &rho, \ y0, incy, \ cntx \ ); \ PASTEMAC(ch,axpys)( *alpha, rho, *psi1 ); \ \ /* For hemv, explicitly set the imaginary component of alpha11 to zero. */ \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ if ( bli_is_conj( conjh ) ) \ PASTEMAC(ch,seti0s)( alpha11_temp ); \ \ /* psi1 = psi1 + alpha * alpha11 * chi1; */ \ PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \ \ } \ } INSERT_GENTFUNC_BASIC0( hemv_unf_var1a ) cython-blis-0.9.1/blis/_src/frame/2/hemv/bli_hemv_unf_var3.c000066400000000000000000000140111427272030600235610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* one = PASTEMAC(ch,1); \ ctype* zero = PASTEMAC(ch,0); \ ctype* A11; \ ctype* A21; \ ctype* a10t; \ ctype* alpha11; \ ctype* a21; \ ctype* x1; \ ctype* x2; \ ctype* chi11; \ ctype* y1; \ ctype* y2; \ ctype* y01; \ ctype* psi11; \ ctype* y21; \ ctype conjx_chi11; \ ctype alpha_chi11; \ ctype alpha11_temp; \ dim_t i, k, j; \ dim_t b_fuse, f; \ dim_t n_ahead; \ dim_t f_ahead, f_behind; \ inc_t rs_at, cs_at; \ conj_t conj0, conj1; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of A and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ rs_at = rs_a; \ cs_at = cs_a; \ \ conj0 = bli_apply_conj( conjh, conja ); \ conj1 = conja; \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ rs_at = cs_a; \ cs_at = rs_a; \ \ conj0 = conja; \ conj1 = bli_apply_conj( conjh, conja ); \ } \ \ /* If beta is zero, use setv. Otherwise, scale by beta. */ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* y = 0; */ \ PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ zero, \ y, incy, \ cntx, \ NULL \ ); \ } \ else \ { \ /* y = beta * y; */ \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ beta, \ y, incy, \ cntx, \ NULL \ ); \ } \ \ PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \ \ /* Query the context for the kernel function pointer and fusing factor. 
*/ \ kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \ \ for ( i = 0; i < m; i += f ) \ { \ f = bli_determine_blocksize_dim_f( i, m, b_fuse ); \ n_ahead = m - i - f; \ A11 = a + (i )*rs_at + (i )*cs_at; \ A21 = a + (i+f)*rs_at + (i )*cs_at; \ x1 = x + (i )*incx; \ x2 = x + (i+f)*incx; \ y1 = y + (i )*incy; \ y2 = y + (i+f)*incy; \ \ /* y1 = y1 + alpha * A11 * x1; (variant 4) */ \ for ( k = 0; k < f; ++k ) \ { \ f_behind = k; \ f_ahead = f - k - 1; \ a10t = A11 + (k )*rs_at + (0 )*cs_at; \ alpha11 = A11 + (k )*rs_at + (k )*cs_at; \ a21 = A11 + (k+1)*rs_at + (k )*cs_at; \ chi11 = x1 + (k )*incx; \ y01 = y1 + (0 )*incy; \ psi11 = y1 + (k )*incy; \ y21 = y1 + (k+1)*incy; \ \ /* y01 = y01 + alpha * a10t' * chi11; */ \ PASTEMAC(ch,copycjs)( conjx, *chi11, conjx_chi11 ); \ PASTEMAC(ch,scal2s)( *alpha, conjx_chi11, alpha_chi11 ); \ if ( bli_is_conj( conj0 ) ) \ { \ for ( j = 0; j < f_behind; ++j ) \ PASTEMAC(ch,axpyjs)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \ } \ else \ { \ for ( j = 0; j < f_behind; ++j ) \ PASTEMAC(ch,axpys)( alpha_chi11, *(a10t + j*cs_at), *(y01 + j*incy) ); \ } \ \ /* For hemv, explicitly set the imaginary component of alpha11 to zero. 
*/ \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \ if ( bli_is_conj( conjh ) ) \ PASTEMAC(ch,seti0s)( alpha11_temp ); \ \ /* psi11 = psi11 + alpha * alpha11 * chi11; */ \ PASTEMAC(ch,axpys)( alpha_chi11, alpha11_temp, *psi11 ); \ \ /* y21 = y21 + alpha * a21 * chi11; */ \ if ( bli_is_conj( conj1 ) ) \ { \ for ( j = 0; j < f_ahead; ++j ) \ PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \ } \ else \ { \ for ( j = 0; j < f_ahead; ++j ) \ PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(y21 + j*incy) ); \ } \ } \ \ /* y1 = y1 + alpha * A21' * x2; (dotxf) */ \ /* y2 = y2 + alpha * A21 * x1; (axpyf) */ \ kfp_xf \ ( \ conj0, \ conj1, \ conjx, \ conjx, \ n_ahead, \ f, \ alpha, \ A21, rs_at, cs_at, \ x2, incx, \ x1, incx, \ one, \ y1, incy, \ y2, incy, \ cntx \ ); \ } \ } INSERT_GENTFUNC_BASIC0( hemv_unf_var3 ) cython-blis-0.9.1/blis/_src/frame/2/hemv/bli_hemv_unf_var3a.c000066400000000000000000000111631427272030600237270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

/*
   Typed template for hemv_unf_var3a.

   The generated function first overwrites y with beta * y (or with zero
   when beta compares equal to zero) and then, for each of the m diagonal
   positions i, accumulates

     psi1 += alpha * alpha11 * chi1          (diagonal element)
     psi1 += alpha * a21' * x2               (via the fused kernel)
     y2   += alpha * a21  * chi1             (via the fused kernel)

   so that only the diagonal element and the a21 vector starting just past
   it are read from A in each iteration.

   Parameters of the generated function:
     uplo    selects whether (rs_a, cs_a) are used as given (lower) or
             swapped (upper), and which of conj0/conj1 receives conjh.
     conja   conjugation applied to elements of A.
     conjx   conjugation applied to elements of x.
     conjh   when it is a conjugation, the imaginary part of the diagonal
             element is zeroed and conjh is combined with conja for one of
             the two conj arguments handed to the fused kernel.
     m       number of diagonal elements visited (loop trip count).
     alpha, beta   pointers to the scalars.
     a, rs_a, cs_a   matrix buffer and its row/column strides.
     x, incx ; y, incy   vector buffers and their increments.
     cntx    context queried for the BLIS_DOTAXPYV_KER kernel and passed
             through to setv/scalv and to the kernel.

   NOTE(review): INSERT_GENTFUNC_BASIC0 is defined elsewhere; presumably it
   instantiates this template once per basic datatype -- confirm.
*/
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t uplo, \
       conj_t conja, \
       conj_t conjx, \
       conj_t conjh, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       cntx_t* cntx \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	ctype* zero = PASTEMAC(ch,0); \
	ctype* alpha11; \
	ctype* a21; \
	ctype* chi1; \
	ctype* x2; \
	ctype* psi1; \
	ctype* y2; \
	ctype rho; \
	ctype conjx_chi1; \
	ctype alpha_chi1; \
	ctype alpha11_temp; \
	dim_t i; \
	dim_t n_ahead; \
	inc_t rs_at, cs_at; \
	conj_t conj0, conj1; \
\
	/* The algorithm will be expressed in terms of the lower triangular case;
	   the upper triangular case is supported by swapping the row and column
	   strides of A and toggling some conj parameters. */ \
	if ( bli_is_lower( uplo ) ) \
	{ \
		rs_at = rs_a; \
		cs_at = cs_a; \
\
		conj0 = bli_apply_conj( conjh, conja ); \
		conj1 = conja; \
	} \
	else /* if ( bli_is_upper( uplo ) ) */ \
	{ \
		rs_at = cs_a; \
		cs_at = rs_a; \
\
		conj0 = conja; \
		conj1 = bli_apply_conj( conjh, conja ); \
	} \
\
	/* If beta is zero, use setv. Otherwise, scale by beta. */ \
	if ( PASTEMAC(ch,eq0)( *beta ) ) \
	{ \
		/* y = 0; */ \
		PASTEMAC2(ch,setv,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  m, \
		  zero, \
		  y, incy, \
		  cntx, \
		  NULL \
		); \
	} \
	else \
	{ \
		/* y = beta * y; */ \
		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  m, \
		  beta, \
		  y, incy, \
		  cntx, \
		  NULL \
		); \
	} \
\
	PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \
\
	/* Query the context for the kernel function pointer. */ \
	kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \
\
	/* Visit the diagonal positions in order; n_ahead is the number of elements that follow position i. */ \
	for ( i = 0; i < m; ++i ) \
	{ \
		n_ahead = m - i - 1; \
		alpha11 = a + (i )*rs_at + (i )*cs_at; \
		a21 = a + (i+1)*rs_at + (i )*cs_at; \
		chi1 = x + (i )*incx; \
		x2 = x + (i+1)*incx; \
		psi1 = y + (i )*incy; \
		y2 = y + (i+1)*incy; \
\
		/* For hemv, explicitly set the imaginary component of alpha11 to
		   zero. */ \
		PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_temp ); \
		if ( bli_is_conj( conjh ) ) \
			PASTEMAC(ch,seti0s)( alpha11_temp ); \
\
		/* Apply conjx to chi1 and scale by alpha. */ \
		PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \
		PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \
\
		/* psi1 = psi1 + alpha * alpha11 * chi1; */ \
		PASTEMAC(ch,axpys)( alpha_chi1, alpha11_temp, *psi1 ); \
\
		/* psi1 = psi1 + alpha * a21' * x2;   (dotv) */ \
		/* y2   = y2   + alpha * a21  * chi1; (axpyv) */ \
		kfp_vf \
		( \
		  conj0, \
		  conj1, \
		  conjx, \
		  n_ahead, \
		  &alpha_chi1, \
		  a21, rs_at, \
		  x2, incx, \
		  &rho, \
		  y2, incy, \
		  cntx \
		); \
		/* rho is produced by the kernel call above; alpha is applied to it here before it is added to psi1. */ \
		PASTEMAC(ch,axpys)( *alpha, rho, *psi1 ); \
	} \
}

INSERT_GENTFUNC_BASIC0( hemv_unf_var3a )
cython-blis-0.9.1/blis/_src/frame/2/hemv/bli_hemv_var.h000066400000000000000000000060041427272030600226360ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

//
// Prototype object-based interfaces.
//
// GENPROT( opname ) declares one function, named by PASTEMAC0(opname), that
// takes the hemv operands as obj_t pointers plus a context and a control
// tree node. One prototype is emitted for every variant listed below.
//
// NOTE(review): this prototype declares the last parameter as cntl_t*,
// whereas the blocked-variant definitions (e.g. bli_hemv_blk_var1) spell it
// hemv_t*. Confirm which spelling is intended before compiling both.
//

#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       conj_t conjh, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y, \
       cntx_t* cntx, \
       cntl_t* cntl \
     );

// Blocked variants.
GENPROT( hemv_blk_var1 )
GENPROT( hemv_blk_var2 )
GENPROT( hemv_blk_var3 )
GENPROT( hemv_blk_var4 )

// Unblocked variants.
GENPROT( hemv_unb_var1 )
GENPROT( hemv_unb_var2 )
GENPROT( hemv_unb_var3 )
GENPROT( hemv_unb_var4 )

// "unf" variants (these are the ones that query fused kernels from the
// context in their typed implementations).
GENPROT( hemv_unf_var1 )
GENPROT( hemv_unf_var3 )
GENPROT( hemv_unf_var1a )
GENPROT( hemv_unf_var3a )

//
// Prototype BLAS-like interfaces with typed operands.
//
// GENTPROT( ctype, ch, varname ) declares the typed counterpart, named by
// PASTEMAC(ch,varname), that takes raw ctype buffers with explicit strides
// and increments. Only the unb/unf variants have typed prototypes here.
//

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t uplo, \
       conj_t conja, \
       conj_t conjx, \
       conj_t conjh, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( hemv_unb_var1 )
INSERT_GENTPROT_BASIC0( hemv_unb_var2 )
INSERT_GENTPROT_BASIC0( hemv_unb_var3 )
INSERT_GENTPROT_BASIC0( hemv_unb_var4 )

INSERT_GENTPROT_BASIC0( hemv_unf_var1 )
INSERT_GENTPROT_BASIC0( hemv_unf_var3 )
INSERT_GENTPROT_BASIC0( hemv_unf_var1a )
INSERT_GENTPROT_BASIC0( hemv_unf_var3a )
cython-blis-0.9.1/blis/_src/frame/2/hemv/bli_hemv_var_oapi.c000066400000000000000000000064421427272030600236470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

/*
   GENFRONT( opname, varname ) generates the object-based entry point for
   one hemv variant. The generated function:

     1. calls bli_init_once();
     2. reads the datatype, uplo and conjugation status from a, the
        conjugation status from x, the length of a, and the buffers and
        strides/increments of a, x and y;
     3. obtains 1x1 buffers for alpha and beta for the datatype of a;
     4. looks up the void*-typed function pointer for the variant with
        PASTEMAC(varname,_qfp)( dt ) and calls it with the unpacked
        operands and cntx.

   The cntl parameter is accepted but not referenced in the body.

   NOTE: the template ends with "} \" followed by an empty line; that empty
   line is what terminates the macro definition, so it must be kept.
*/
#undef  GENFRONT
#define GENFRONT( opname, varname ) \
\
void PASTEMAC0(varname) \
     ( \
       conj_t conjh, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y, \
       cntx_t* cntx, \
       cntl_t* cntl \
     ) \
{ \
	bli_init_once(); \
\
	/* The datatype of A selects both the scalar buffers and the typed function. */ \
	num_t dt = bli_obj_dt( a ); \
\
	uplo_t uplo = bli_obj_uplo( a ); \
	conj_t conja = bli_obj_conj_status( a ); \
	conj_t conjx = bli_obj_conj_status( x ); \
\
	dim_t m = bli_obj_length( a ); \
\
	void* buf_a = bli_obj_buffer_at_off( a ); \
	inc_t rs_a = bli_obj_row_stride( a ); \
	inc_t cs_a = bli_obj_col_stride( a ); \
\
	void* buf_x = bli_obj_buffer_at_off( x ); \
	inc_t incx = bli_obj_vector_inc( x ); \
\
	void* buf_y = bli_obj_buffer_at_off( y ); \
	inc_t incy = bli_obj_vector_inc( y ); \
\
	void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
	void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); \
\
	/* Query a type-specific function pointer, except one that uses
	   void* for function arguments instead of typed pointers. */ \
	PASTECH2(opname,_unb,_vft) f = \
	PASTEMAC(varname,_qfp)( dt ); \
\
	f \
	( \
	  uplo, \
	  conja, \
	  conjx, \
	  conjh, \
	  m, \
	  buf_alpha, \
	  buf_a, rs_a, cs_a, \
	  buf_x, incx, \
	  buf_beta, \
	  buf_y, incy, \
	  cntx \
	); \
} \

GENFRONT( hemv, hemv_unb_var1 )
GENFRONT( hemv, hemv_unb_var2 )
GENFRONT( hemv, hemv_unb_var3 )
GENFRONT( hemv, hemv_unb_var4 )

GENFRONT( hemv, hemv_unf_var1 )
GENFRONT( hemv, hemv_unf_var3 )
GENFRONT( hemv, hemv_unf_var1a )
GENFRONT( hemv, hemv_unf_var3a )
cython-blis-0.9.1/blis/_src/frame/2/hemv/other/000077500000000000000000000000001427272030600211515ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/hemv/other/bli_hemv_blk_var1.c000066400000000000000000000133101427272030600246610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_hemv_blk_var1( conj_t conjh, obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, cntx_t* cntx, hemv_t* cntl ) { obj_t a11, a11_pack; obj_t a10; obj_t x1, x1_pack; obj_t x0; obj_t y1, y1_pack; obj_t y0; dim_t mn; dim_t ij; dim_t b_alg; // Even though this blocked algorithm is expressed only in terms of the // lower triangular case, the upper triangular case is still supported: // when bli_acquire_mpart_tl2br() is passed a matrix that is stored in // in the upper triangle, and the requested subpartition resides in the // lower triangle (as is the case for this algorithm), the routine fills // the request as if the caller had actually requested the corresponding // "mirror" subpartition in the upper triangle, except that it marks the // subpartition for transposition (and conjugation). // Initialize objects for packing. bli_obj_init_pack( &a11_pack ); bli_obj_init_pack( &x1_pack ); bli_obj_init_pack( &y1_pack ); // Query dimension. mn = bli_obj_length( a ); // y = beta * y; bli_scalv_int( beta, y, cntx, bli_cntl_sub_scalv( cntl ) ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( ij, mn, a, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A11, A10, x1, x0, y1, and y0. 
bli_acquire_mpart_tl2br( BLIS_SUBPART11, ij, b_alg, a, &a11 ); bli_acquire_mpart_tl2br( BLIS_SUBPART10, ij, b_alg, a, &a10 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_f2b( BLIS_SUBPART0, ij, b_alg, x, &x0 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, y, &y1 ); bli_acquire_vpart_f2b( BLIS_SUBPART0, ij, b_alg, y, &y0 ); // Initialize objects for packing A11, x1, and y1 (if needed). bli_packm_init( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_init( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // Copy/pack A11, x1, y1 (if needed). bli_packm_int( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // y0 = y0 + alpha * A10' * x1; bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ), BLIS_NO_CONJUGATE, alpha, &a10, &x1_pack, &BLIS_ONE, &y0, cntx, bli_cntl_sub_gemv_t_rp( cntl ) ); // y1 = y1 + alpha * A11 * x1; bli_hemv_int( conjh, alpha, &a11_pack, &x1_pack, &BLIS_ONE, &y1_pack, cntx, bli_cntl_sub_hemv( cntl ) ); // y1 = y1 + alpha * A10 * x0; bli_gemv_int( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, alpha, &a10, &x0, &BLIS_ONE, &y1_pack, cntx, bli_cntl_sub_gemv_n_rp( cntl ) ); // Copy/unpack y1 (if y1 was packed). bli_unpackv_int( &y1_pack, &y1, cntx, bli_cntl_sub_unpackv_y1( cntl ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/hemv/other/bli_hemv_blk_var2.c000066400000000000000000000134731427272030600246740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_hemv_blk_var2( conj_t conjh, obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, cntx_t* cntx, hemv_t* cntl ) { obj_t a11, a11_pack; obj_t a10; obj_t a21; obj_t x1, x1_pack; obj_t x0; obj_t x2; obj_t y1, y1_pack; dim_t mn; dim_t ij; dim_t b_alg; // Even though this blocked algorithm is expressed only in terms of the // lower triangular case, the upper triangular case is still supported: // when bli_acquire_mpart_tl2br() is passed a matrix that is stored in // in the upper triangle, and the requested subpartition resides in the // lower triangle (as is the case for this algorithm), the routine fills // the request as if the caller had actually requested the corresponding // "mirror" subpartition in the upper triangle, except that it marks the // subpartition for transposition (and conjugation). // Initialize objects for packing. bli_obj_init_pack( &a11_pack ); bli_obj_init_pack( &x1_pack ); bli_obj_init_pack( &y1_pack ); // Query dimension. mn = bli_obj_length( a ); // y = beta * y; bli_scalv_int( beta, y, cntx, bli_cntl_sub_scalv( cntl ) ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( ij, mn, a, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A11, A10, A21, x1, x0, x2, y1, and y0. bli_acquire_mpart_tl2br( BLIS_SUBPART11, ij, b_alg, a, &a11 ); bli_acquire_mpart_tl2br( BLIS_SUBPART10, ij, b_alg, a, &a10 ); bli_acquire_mpart_tl2br( BLIS_SUBPART21, ij, b_alg, a, &a21 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_f2b( BLIS_SUBPART0, ij, b_alg, x, &x0 ); bli_acquire_vpart_f2b( BLIS_SUBPART2, ij, b_alg, x, &x2 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, y, &y1 ); // Initialize objects for packing A11, x1, and y1 (if needed). 
bli_packm_init( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_init( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // Copy/pack A11, x1, y1 (if needed). bli_packm_int( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // y1 = y1 + alpha * A10 * x0; bli_gemv_int( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, alpha, &a10, &x0, &BLIS_ONE, &y1_pack, cntx, bli_cntl_sub_gemv_n_rp( cntl ) ); // y1 = y1 + alpha * A11 * x1; bli_hemv_int( conjh, alpha, &a11_pack, &x1_pack, &BLIS_ONE, &y1_pack, cntx, bli_cntl_sub_hemv( cntl ) ); // y1 = y1 + alpha * A21' * x2; bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ), BLIS_NO_CONJUGATE, alpha, &a21, &x2, &BLIS_ONE, &y1_pack, cntx, bli_cntl_sub_gemv_t_cp( cntl ) ); // Copy/unpack y1 (if y1 was packed). bli_unpackv_int( &y1_pack, &y1, cntx, bli_cntl_sub_unpackv_y1( cntl ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/hemv/other/bli_hemv_blk_var3.c000066400000000000000000000133101427272030600246630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_hemv_blk_var3( conj_t conjh, obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, cntx_t* cntx, hemv_t* cntl ) { obj_t a11, a11_pack; obj_t a21; obj_t x1, x1_pack; obj_t x2; obj_t y1, y1_pack; obj_t y2; dim_t mn; dim_t ij; dim_t b_alg; // Even though this blocked algorithm is expressed only in terms of the // lower triangular case, the upper triangular case is still supported: // when bli_acquire_mpart_tl2br() is passed a matrix that is stored in // in the upper triangle, and the requested subpartition resides in the // lower triangle (as is the case for this algorithm), the routine fills // the request as if the caller had actually requested the corresponding // "mirror" subpartition in the upper triangle, except that it marks the // subpartition for transposition (and conjugation). 
// Initialize objects for packing. bli_obj_init_pack( &a11_pack ); bli_obj_init_pack( &x1_pack ); bli_obj_init_pack( &y1_pack ); // Query dimension. mn = bli_obj_length( a ); // y = beta * y; bli_scalv_int( beta, y, cntx, bli_cntl_sub_scalv( cntl ) ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( ij, mn, a, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A11, A10, x1, x0, y1, and y0. bli_acquire_mpart_tl2br( BLIS_SUBPART11, ij, b_alg, a, &a11 ); bli_acquire_mpart_tl2br( BLIS_SUBPART21, ij, b_alg, a, &a21 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_f2b( BLIS_SUBPART2, ij, b_alg, x, &x2 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, y, &y1 ); bli_acquire_vpart_f2b( BLIS_SUBPART2, ij, b_alg, y, &y2 ); // Initialize objects for packing A11, x1, and y1 (if needed). bli_packm_init( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_init( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // Copy/pack A11, x1, y1 (if needed). bli_packm_int( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // y1 = y1 + alpha * A21' * x2; bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ), BLIS_NO_CONJUGATE, alpha, &a21, &x2, &BLIS_ONE, &y1_pack, cntx, bli_cntl_sub_gemv_t_cp( cntl ) ); // y1 = y1 + alpha * A11 * x1; bli_hemv_int( conjh, alpha, &a11_pack, &x1_pack, &BLIS_ONE, &y1_pack, cntx, bli_cntl_sub_hemv( cntl ) ); // y2 = y2 + alpha * A21 * x1; bli_gemv_int( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, alpha, &a21, &x1_pack, &BLIS_ONE, &y2, cntx, bli_cntl_sub_gemv_n_cp( cntl ) ); // Copy/unpack y1 (if y1 was packed). 
bli_unpackv_int( &y1_pack, &y1, cntx, bli_cntl_sub_unpackv_y1( cntl ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/hemv/other/bli_hemv_blk_var4.c000066400000000000000000000134671427272030600247010ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_hemv_blk_var4( conj_t conjh, obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, cntx_t* cntx, hemv_t* cntl ) { obj_t a11, a11_pack; obj_t a10; obj_t a21; obj_t x1, x1_pack; obj_t y1, y1_pack; obj_t y0; obj_t y2; dim_t mn; dim_t ij; dim_t b_alg; // Even though this blocked algorithm is expressed only in terms of the // lower triangular case, the upper triangular case is still supported: // when bli_acquire_mpart_tl2br() is passed a matrix that is stored in // in the upper triangle, and the requested subpartition resides in the // lower triangle (as is the case for this algorithm), the routine fills // the request as if the caller had actually requested the corresponding // "mirror" subpartition in the upper triangle, except that it marks the // subpartition for transposition (and conjugation). // Initialize objects for packing. bli_obj_init_pack( &a11_pack ); bli_obj_init_pack( &x1_pack ); bli_obj_init_pack( &y1_pack ); // Query dimension. mn = bli_obj_length( a ); // y = beta * y; bli_scalv_int( beta, y, cntx, bli_cntl_sub_scalv( cntl ) ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( ij, mn, a, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A11, A10, A21, x1, y1, y0, and y2. 
bli_acquire_mpart_tl2br( BLIS_SUBPART11, ij, b_alg, a, &a11 ); bli_acquire_mpart_tl2br( BLIS_SUBPART10, ij, b_alg, a, &a10 ); bli_acquire_mpart_tl2br( BLIS_SUBPART21, ij, b_alg, a, &a21 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, y, &y1 ); bli_acquire_vpart_f2b( BLIS_SUBPART0, ij, b_alg, y, &y0 ); bli_acquire_vpart_f2b( BLIS_SUBPART2, ij, b_alg, y, &y2 ); // Initialize objects for packing A11, x1, and y1 (if needed). bli_packm_init( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_init( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // Copy/pack A11, x1, y1 (if needed). bli_packm_int( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // y0 = y0 + alpha * A10' * x1; bli_gemv_int( bli_apply_conj( conjh, BLIS_TRANSPOSE ), BLIS_NO_CONJUGATE, alpha, &a10, &x1_pack, &BLIS_ONE, &y0, cntx, bli_cntl_sub_gemv_t_rp( cntl ) ); // y1 = y1 + alpha * A11 * x1; bli_hemv_int( conjh, alpha, &a11_pack, &x1_pack, &BLIS_ONE, &y1_pack, cntx, bli_cntl_sub_hemv( cntl ) ); // y2 = y2 + alpha * A21 * x1; bli_gemv_int( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, alpha, &a21, &x1_pack, &BLIS_ONE, &y2, cntx, bli_cntl_sub_gemv_n_cp( cntl ) ); // Copy/unpack y1 (if y1 was packed). bli_unpackv_int( &y1_pack, &y1, cntx, bli_cntl_sub_unpackv_y1( cntl ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/hemv/other/bli_hemv_cntl.c000066400000000000000000000166441427272030600241350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" extern scalv_t* scalv_cntl; extern packm_t* packm_cntl; extern packv_t* packv_cntl; extern unpackv_t* unpackv_cntl; extern gemv_t* gemv_cntl_rp_bs_dot; extern gemv_t* gemv_cntl_rp_bs_axpy; extern gemv_t* gemv_cntl_cp_bs_dot; extern gemv_t* gemv_cntl_cp_bs_axpy; hemv_t* hemv_cntl_bs_ke_lrow_ucol = NULL; hemv_t* hemv_cntl_bs_ke_lcol_urow = NULL; hemv_t* hemv_cntl_ge_lrow_ucol = NULL; hemv_t* hemv_cntl_ge_lcol_urow = NULL; void bli_hemv_cntl_init() { // Create control trees for the lowest-level kernels. These trees induce // operations on (presumably) relatively small block-subvector problems. hemv_cntl_bs_ke_lrow_ucol = bli_hemv_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT1, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); hemv_cntl_bs_ke_lcol_urow = bli_hemv_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT3, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control trees for generally large problems. Here, we choose a // variant that prioritizes keeping a subvector of y in cache. 
hemv_cntl_ge_lrow_ucol = bli_hemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, BLIS_M2, scalv_cntl, // scale y up-front packm_cntl, // pack A11 (if needed) packv_cntl, // pack x1 (if needed) packv_cntl, // pack y1 (if needed) gemv_cntl_rp_bs_dot, // gemv_n_rp needed by var2 NULL, // gemv_n_cp not used by var2 NULL, // gemv_t_rp not used by var2 gemv_cntl_rp_bs_axpy, // gemv_t_cp needed by var2 hemv_cntl_bs_ke_lrow_ucol, unpackv_cntl ); // unpack y1 (if packed) hemv_cntl_ge_lcol_urow = bli_hemv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, BLIS_M2, scalv_cntl, // scale y up-front packm_cntl, // pack A11 (if needed) packv_cntl, // pack x1 (if needed) packv_cntl, // pack y1 (if needed) gemv_cntl_rp_bs_axpy, // gemv_n_rp needed by var2 NULL, // gemv_n_cp not used by var2 NULL, // gemv_t_rp not used by var2 gemv_cntl_rp_bs_dot, // gemv_t_cp needed by var2 hemv_cntl_bs_ke_lcol_urow, unpackv_cntl ); // unpack y1 (if packed) } void bli_hemv_cntl_finalize() { bli_cntl_free_node( hemv_cntl_bs_ke_lrow_ucol ); bli_cntl_free_node( hemv_cntl_bs_ke_lcol_urow ); bli_cntl_free_node( hemv_cntl_ge_lrow_ucol ); bli_cntl_free_node( hemv_cntl_ge_lcol_urow ); } hemv_t* bli_hemv_cntl_obj_create( impl_t impl_type, varnum_t var_num, bszid_t bszid, scalv_t* sub_scalv, packm_t* sub_packm_a11, packv_t* sub_packv_x1, packv_t* sub_packv_y1, gemv_t* sub_gemv_n_rp, gemv_t* sub_gemv_n_cp, gemv_t* sub_gemv_t_rp, gemv_t* sub_gemv_t_cp, hemv_t* sub_hemv, unpackv_t* sub_unpackv_y1 ) { hemv_t* cntl; cntl = ( hemv_t* ) bli_malloc_intl( sizeof(hemv_t) ); cntl->impl_type = impl_type; cntl->var_num = var_num; cntl->bszid = bszid; cntl->sub_scalv = sub_scalv; cntl->sub_packm_a11 = sub_packm_a11; cntl->sub_packv_x1 = sub_packv_x1; cntl->sub_packv_y1 = sub_packv_y1; cntl->sub_gemv_n_rp = sub_gemv_n_rp; cntl->sub_gemv_n_cp = sub_gemv_n_cp; cntl->sub_gemv_t_rp = sub_gemv_t_rp; cntl->sub_gemv_t_cp = sub_gemv_t_cp; cntl->sub_hemv = sub_hemv; cntl->sub_unpackv_y1 = sub_unpackv_y1; return cntl; } void 
bli_hemv_cntl_obj_init( hemv_t* cntl, impl_t impl_type, varnum_t var_num, bszid_t bszid, scalv_t* sub_scalv, packm_t* sub_packm_a11, packv_t* sub_packv_x1, packv_t* sub_packv_y1, gemv_t* sub_gemv_n_rp, gemv_t* sub_gemv_n_cp, gemv_t* sub_gemv_t_rp, gemv_t* sub_gemv_t_cp, hemv_t* sub_hemv, unpackv_t* sub_unpackv_y1 ) { cntl->impl_type = impl_type; cntl->var_num = var_num; cntl->bszid = bszid; cntl->sub_scalv = sub_scalv; cntl->sub_packm_a11 = sub_packm_a11; cntl->sub_packv_x1 = sub_packv_x1; cntl->sub_packv_y1 = sub_packv_y1; cntl->sub_gemv_n_rp = sub_gemv_n_rp; cntl->sub_gemv_n_cp = sub_gemv_n_cp; cntl->sub_gemv_t_rp = sub_gemv_t_rp; cntl->sub_gemv_t_cp = sub_gemv_t_cp; cntl->sub_hemv = sub_hemv; cntl->sub_unpackv_y1 = sub_unpackv_y1; } cython-blis-0.9.1/blis/_src/frame/2/hemv/other/bli_hemv_cntl.h000066400000000000000000000072741427272030600241410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ struct hemv_s { impl_t impl_type; varnum_t var_num; bszid_t bszid; struct scalv_s* sub_scalv; struct packm_s* sub_packm_a11; struct packv_s* sub_packv_x1; struct packv_s* sub_packv_y1; struct gemv_s* sub_gemv_n_rp; struct gemv_s* sub_gemv_n_cp; struct gemv_s* sub_gemv_t_rp; struct gemv_s* sub_gemv_t_cp; struct hemv_s* sub_hemv; struct unpackv_s* sub_unpackv_y1; }; typedef struct hemv_s hemv_t; #define bli_cntl_sub_hemv( cntl ) cntl->sub_hemv void bli_hemv_cntl_init( void ); void bli_hemv_cntl_finalize( void ); hemv_t* bli_hemv_cntl_obj_create( impl_t impl_type, varnum_t var_num, bszid_t bszid, scalv_t* sub_scalv, packm_t* sub_packm_a11, packv_t* sub_packv_x1, packv_t* sub_packv_y1, gemv_t* sub_gemv_n_rp, gemv_t* sub_gemv_n_cp, gemv_t* sub_gemv_t_rp, gemv_t* sub_gemv_t_cp, hemv_t* sub_hemv, unpackv_t* sub_unpackv_y1 ); void bli_hemv_cntl_obj_init( hemv_t* cntl, impl_t impl_type, varnum_t var_num, bszid_t bszid, scalv_t* sub_scalv, packm_t* sub_packm_a11, packv_t* sub_packv_x1, packv_t* sub_packv_y1, gemv_t* sub_gemv_n_rp, gemv_t* sub_gemv_n_cp, gemv_t* sub_gemv_t_rp, gemv_t* sub_gemv_t_cp, hemv_t* sub_hemv, unpackv_t* sub_unpackv_y1 ); cython-blis-0.9.1/blis/_src/frame/2/hemv/other/bli_hemv_front.c000066400000000000000000000162551427272030600243230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern hemv_t* hemv_cntl_bs_ke_lrow_ucol; extern hemv_t* hemv_cntl_bs_ke_lcol_urow; extern hemv_t* hemv_cntl_ge_lrow_ucol; extern hemv_t* hemv_cntl_ge_lcol_urow; void bli_hemv_front ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, cntx_t* cntx ) { hemv_t* hemv_cntl; num_t dt_targ_a; num_t dt_targ_x; num_t dt_targ_y; bool a_has_unit_inc; bool x_has_unit_inc; bool y_has_unit_inc; obj_t alpha_local; obj_t beta_local; num_t dt_alpha; num_t dt_beta; // Check parameters. 
if ( bli_error_checking_is_enabled() ) bli_hemv_check( alpha, a, x, beta, y ); // Query the target datatypes of each object. dt_targ_a = bli_obj_target_dt( a ); dt_targ_x = bli_obj_target_dt( x ); dt_targ_y = bli_obj_target_dt( y ); // Determine whether each operand with unit stride. a_has_unit_inc = ( bli_obj_is_row_stored( a ) || bli_obj_is_col_stored( a ) ); x_has_unit_inc = ( bli_obj_vector_inc( x ) == 1 ); y_has_unit_inc = ( bli_obj_vector_inc( y ) == 1 ); // Create an object to hold a copy-cast of alpha. Notice that we use // the type union of the target datatypes of a and x to prevent any // unnecessary loss of information during the computation. dt_alpha = bli_dt_union( dt_targ_a, dt_targ_x ); bli_obj_scalar_init_detached_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); // Create an object to hold a copy-cast of beta. Notice that we use // the datatype of y. Here's why: If y is real and beta is complex, // there is no reason to keep beta_local in the complex domain since // the complex part of beta*y will not be stored. If y is complex and // beta is real then beta is harmlessly promoted to complex. dt_beta = dt_targ_y; bli_obj_scalar_init_detached_copy_of( dt_beta, BLIS_NO_CONJUGATE, beta, &beta_local ); // If all operands have unit stride, we choose a control tree for calling // the unblocked implementation directly without any blocking. if ( a_has_unit_inc && x_has_unit_inc && y_has_unit_inc ) { // We use two control trees to handle the four cases corresponding to // combinations of upper/lower triangular storage and row/column-storage. // The row-stored lower triangular and column-stored upper triangular // trees are identical. Same for the remaining two trees. 
if ( bli_obj_is_lower( a ) ) { if ( bli_obj_is_row_stored( a ) ) hemv_cntl = hemv_cntl_bs_ke_lrow_ucol; else hemv_cntl = hemv_cntl_bs_ke_lcol_urow; } else // if ( bli_obj_is_upper( a ) ) { if ( bli_obj_is_row_stored( a ) ) hemv_cntl = hemv_cntl_bs_ke_lcol_urow; else hemv_cntl = hemv_cntl_bs_ke_lrow_ucol; } } else { // Mark objects with unit stride as already being packed. This prevents // unnecessary packing from happening within the blocked algorithm. if ( a_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, a ); if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x ); if ( y_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, y ); // Here, we make a similar choice as above, except that (1) we look // at storage tilt, and (2) we choose a tree that performs blocking. if ( bli_obj_is_lower( a ) ) { if ( bli_obj_is_row_tilted( a ) ) hemv_cntl = hemv_cntl_ge_lrow_ucol; else hemv_cntl = hemv_cntl_ge_lcol_urow; } else // if ( bli_obj_is_upper( a ) ) { if ( bli_obj_is_row_tilted( a ) ) hemv_cntl = hemv_cntl_ge_lcol_urow; else hemv_cntl = hemv_cntl_ge_lrow_ucol; } } // Invoke the internal back-end with the copy-casts of scalars and the // chosen control tree. Set conjh to BLIS_CONJUGATE to invoke the // Hermitian (and not symmetric) algorithms. bli_hemv_int( BLIS_CONJUGATE, &alpha_local, a, x, &beta_local, y, cntx, hemv_cntl ); } // // Define BLAS-like interfaces with homogeneous-typed operands. 
// #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao, ao, xo, betao, yo; \ \ inc_t rs_x, cs_x; \ inc_t rs_y, cs_y; \ \ rs_x = incx; cs_x = m * incx; \ rs_y = incy; cs_y = m * incy; \ \ bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ \ bli_obj_create_with_attached_buffer( dt, m, m, a, rs_a, cs_a, &ao ); \ bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \ bli_obj_create_with_attached_buffer( dt, m, 1, y, rs_y, cs_y, &yo ); \ \ bli_obj_set_uplo( uploa, &ao ); \ bli_obj_set_conj( conja, &ao ); \ bli_obj_set_conj( conjx, &xo ); \ \ bli_obj_set_struc( BLIS_HERMITIAN, &ao ); \ \ PASTEMAC0(opname)( &alphao, \ &ao, \ &xo, \ &betao, \ &yo, \ cntx ); \ } INSERT_GENTFUNC_BASIC0( hemv_front ) cython-blis-0.9.1/blis/_src/frame/2/hemv/other/bli_hemv_front.h000066400000000000000000000044101427272030600243160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// Object-based hemv front-end (defined in bli_hemv_front.c).
void bli_hemv_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  x,
       obj_t*  beta,
       obj_t*  y,
       cntx_t* cntx
     );


//
// Prototype BLAS-like interfaces with homogeneous-typed operands.
//
// One prototype is generated per instantiation of GENTPROT; the function
// name is formed by PASTEMAC(ch,opname).
// NOTE(review): the matching definitions in bli_hemv_front.c are
// instantiated with INSERT_GENTFUNC_BASIC0, whereas this header uses
// INSERT_GENTPROT_BASIC (no trailing 0) -- confirm that is intended.
//
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploa, \
       conj_t  conja, \
       conj_t  conjx, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx  \
     );

INSERT_GENTPROT_BASIC( hemv_front )
cython-blis-0.9.1/blis/_src/frame/2/hemv/other/bli_hemv_int.c000066400000000000000000000102441427272030600237550ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T hemv_fp typedef void (*FUNCPTR_T)( conj_t conjh, obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, cntx_t* cntx, hemv_t* cntl ); static FUNCPTR_T vars[4][3] = { // unblocked unblocked with fusing blocked { bli_hemv_unb_var1, bli_hemv_unf_var1, bli_hemv_blk_var1, }, { bli_hemv_unb_var2, NULL, bli_hemv_blk_var2, }, { bli_hemv_unb_var3, bli_hemv_unf_var3, bli_hemv_blk_var3, }, { bli_hemv_unb_var4, NULL, bli_hemv_blk_var4, }, }; void bli_hemv_int( conj_t conjh, obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, cntx_t* cntx, hemv_t* cntl ) { varnum_t n; impl_t i; FUNCPTR_T f; obj_t a_local; // Check parameters. 
if ( bli_error_checking_is_enabled() ) { if ( bli_is_conj( conjh ) ) bli_hemv_check( alpha, a, x, beta, y ); else bli_symv_check( alpha, a, x, beta, y ); } // If y has a zero dimension, return early. if ( bli_obj_has_zero_dim( y ) ) return; // If x has a zero dimension, scale y by beta and return early. if ( bli_obj_has_zero_dim( x ) ) { bli_scalm( beta, y ); return; } // Alias A in case we need to induce the upper triangular case. bli_obj_alias_to( a, &a_local ); /* // Our blocked algorithms only [explicitly] implement the lower triangular // case, so if matrix A is stored as upper triangular, we must toggle the // transposition (and conjugation) bits so that the diagonal partitioning // routines grab the correct partitions corresponding to the upper // triangular case. But we only need to do this for blocked algorithms, // since unblocked algorithms are responsible for handling the upper case // explicitly (and they should not be inspecting the transposition bit anyway). if ( bli_cntl_is_blocked( cntl ) && bli_obj_is_upper( a ) ) { bli_obj_toggle_conj( &a_local ); bli_obj_toggle_trans( &a_local ); } */ // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; // Invoke the variant. f( conjh, alpha, &a_local, x, beta, y, cntx, cntl ); } cython-blis-0.9.1/blis/_src/frame/2/hemv/other/bli_hemv_int.h000066400000000000000000000036141427272030600237650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// Internal hemv/symv back-end (defined in bli_hemv_int.c). conjh selects
// between the Hermitian and symmetric parameter checks; cntl is the control
// tree whose variant number and implementation type pick the variant to run.
void bli_hemv_int( conj_t  conjh,
                   obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  x,
                   obj_t*  beta,
                   obj_t*  y,
                   cntx_t* cntx,
                   hemv_t* cntl );
cython-blis-0.9.1/blis/_src/frame/2/her/000077500000000000000000000000001427272030600176475ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/her/bli_her.h000066400000000000000000000034631427272030600214320ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" #include "bli_her_var.h" cython-blis-0.9.1/blis/_src/frame/2/her/bli_her_unb_var1.c000066400000000000000000000113531427272030600232170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

//
// Typed unblocked variant 1 of her (conjh conjugating) / syr (conjh not
// conjugating). Iteration i updates the n_behind = i elements of row i that
// precede the diagonal (c10t, traversed with stride cs_ct) through the
// context's axpyv kernel, and then updates the diagonal element gamma11.
// The upper-triangular case is handled by swapping the strides of c.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conjx, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, /* complex alpha allows her variants to also perform syr. */ \
       ctype*  x, inc_t incx, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx  \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	ctype*  x0; \
	ctype*  chi1; \
	ctype*  c10t; \
	ctype*  gamma11; \
	ctype   alpha_local; \
	ctype   alpha_chi1; \
	ctype   alpha_chi1_chi1; \
	ctype   conjx0_chi1; \
	ctype   conjx1_chi1; \
	dim_t   i; \
	dim_t   n_behind; \
	inc_t   rs_ct, cs_ct; \
	conj_t  conj0, conj1; \
\
	/* Eliminate unused variable warnings. */ \
	( void )conj0; \
\
	/* Make a local copy of alpha and zero out the imaginary component if
	   we are being invoked as her, since her requires alpha to be real. */ \
	PASTEMAC(ch,copys)( *alpha, alpha_local ); \
	if ( bli_is_conj( conjh ) ) \
	{ \
		PASTEMAC(ch,seti0s)( alpha_local ); \
	} \
\
	/* The algorithm will be expressed in terms of the lower triangular case;
	   the upper triangular case is supported by swapping the row and column
	   strides of A and toggling some conj parameters. */ \
	if ( bli_is_lower( uplo ) ) \
	{ \
		rs_ct = rs_c; \
		cs_ct = cs_c; \
	} \
	else /* if ( bli_is_upper( uplo ) ) */ \
	{ \
		rs_ct = cs_c; \
		cs_ct = rs_c; \
\
		/* Toggle conjugation of conjx, but only if we are being invoked
		   as her; for syr, conjx is unchanged. */ \
		conjx = bli_apply_conj( conjh, conjx ); \
	} \
\
	/* Apply conjh (which carries the conjugation component of the Hermitian
	   transpose, if applicable) to conjx as needed to arrive at the effective
	   conjugation for the scalar and vector subproblems. */ \
	conj0 = conjx; \
	conj1 = bli_apply_conj( conjh, conjx ); \
\
	PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
	/* Query the context for the kernel function pointer. */ \
	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
	for ( i = 0; i < m; ++i ) \
	{ \
		n_behind = i; \
		x0       = x + (0  )*incx; \
		chi1     = x + (i  )*incx; \
		c10t     = c + (i  )*rs_ct + (0  )*cs_ct; \
		gamma11  = c + (i  )*rs_ct + (i  )*cs_ct; \
\
		/* Apply conjx to chi1. */ \
		PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \
		PASTEMAC(ch,copycjs)( conj1, *chi1, conjx1_chi1 ); \
\
		/* Compute scalar for vector subproblem. */ \
		PASTEMAC(ch,scal2s)( alpha_local, conjx0_chi1, alpha_chi1 ); \
\
		/* Compute alpha * chi1 * conj(chi1) after chi1 has already been
		   conjugated, if needed, by conjx. */ \
		PASTEMAC(ch,scal2s)( alpha_chi1, conjx1_chi1, alpha_chi1_chi1 ); \
\
		/* c10t = c10t + alpha * chi1 * x0'; */ \
		kfp_av \
		( \
		  conj1, \
		  n_behind, \
		  &alpha_chi1, \
		  x0,   incx, \
		  c10t, cs_ct, \
		  cntx  \
		); \
\
		/* gamma11 = gamma11 + alpha * chi1 * conj(chi1); */ \
		PASTEMAC(ch,adds)( alpha_chi1_chi1, *gamma11 ); \
\
		/* For her, explicitly set the imaginary component of gamma11 to
		   zero. */ \
		if ( bli_is_conj( conjh ) ) \
			PASTEMAC(ch,seti0s)( *gamma11 ); \
	} \
}

INSERT_GENTFUNC_BASIC0( her_unb_var1 )
cython-blis-0.9.1/blis/_src/frame/2/her/bli_her_unb_var2.c000066400000000000000000000113551427272030600232220ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "blis.h"

//
// Typed unblocked variant 2 of her (conjh conjugating) / syr (conjh not
// conjugating). Iteration i updates the n_ahead = m - i - 1 elements of
// column i that follow the diagonal (c21, traversed with stride rs_ct)
// through the context's axpyv kernel, and then updates the diagonal element
// gamma11. The upper-triangular case is handled by swapping the strides of c.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conjx, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, /* complex alpha allows her variants to also perform syr. */ \
       ctype*  x, inc_t incx, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx  \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	ctype*  chi1; \
	ctype*  x2; \
	ctype*  gamma11; \
	ctype*  c21; \
	ctype   alpha_local; \
	ctype   alpha_chi1; \
	ctype   alpha_chi1_chi1; \
	ctype   conjx0_chi1; \
	ctype   conjx1_chi1; \
	dim_t   i; \
	dim_t   n_ahead; \
	inc_t   rs_ct, cs_ct; \
	conj_t  conj0, conj1; \
\
	/* Eliminate unused variable warnings. */ \
	( void )conj0; \
\
	/* Make a local copy of alpha and zero out the imaginary component if
	   we are being invoked as her, since her requires alpha to be real. */ \
	PASTEMAC(ch,copys)( *alpha, alpha_local ); \
	if ( bli_is_conj( conjh ) ) \
	{ \
		PASTEMAC(ch,seti0s)( alpha_local ); \
	} \
\
	/* The algorithm will be expressed in terms of the lower triangular case;
	   the upper triangular case is supported by swapping the row and column
	   strides of A and toggling some conj parameters. */ \
	if ( bli_is_lower( uplo ) ) \
	{ \
		rs_ct = rs_c; \
		cs_ct = cs_c; \
	} \
	else /* if ( bli_is_upper( uplo ) ) */ \
	{ \
		rs_ct = cs_c; \
		cs_ct = rs_c; \
\
		/* Toggle conjugation of conjx, but only if we are being invoked
		   as her; for syr, conjx is unchanged. */ \
		conjx = bli_apply_conj( conjh, conjx ); \
	} \
\
	/* Apply conjh (which carries the conjugation component of the Hermitian
	   transpose, if applicable) to conjx as needed to arrive at the effective
	   conjugation for the scalar and vector subproblems. Note that, relative
	   to variant 1, the roles of conj0 and conj1 are exchanged here. */ \
	conj0 = bli_apply_conj( conjh, conjx ); \
	conj1 = conjx; \
\
	PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
	/* Query the context for the kernel function pointer. */ \
	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
	for ( i = 0; i < m; ++i ) \
	{ \
		n_ahead  = m - i - 1; \
		chi1     = x + (i  )*incx; \
		x2       = x + (i+1)*incx; \
		gamma11  = c + (i  )*rs_ct + (i  )*cs_ct; \
		c21      = c + (i+1)*rs_ct + (i  )*cs_ct; \
\
		/* Apply conjx to chi1. */ \
		PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \
		PASTEMAC(ch,copycjs)( conj1, *chi1, conjx1_chi1 ); \
\
		/* Compute scalar for vector subproblem. */ \
		PASTEMAC(ch,scal2s)( alpha_local, conjx0_chi1, alpha_chi1 ); \
\
		/* Compute alpha * chi1 * conj(chi1) after chi1 has already been
		   conjugated, if needed, by conjx. */ \
		PASTEMAC(ch,scal2s)( alpha_chi1, conjx1_chi1, alpha_chi1_chi1 ); \
\
		/* c21 = c21 + alpha * x2 * conj(chi1); */ \
		kfp_av \
		( \
		  conj1, \
		  n_ahead, \
		  &alpha_chi1, \
		  x2,  incx, \
		  c21, rs_ct, \
		  cntx  \
		); \
\
		/* gamma11 = gamma11 + alpha * chi1 * conj(chi1); */ \
		PASTEMAC(ch,adds)( alpha_chi1_chi1, *gamma11 ); \
\
		/* For her, explicitly set the imaginary component of gamma11 to
		   zero. */ \
		if ( bli_is_conj( conjh ) ) \
			PASTEMAC(ch,seti0s)( *gamma11 ); \
	} \
}

INSERT_GENTFUNC_BASIC0( her_unb_var2 )
cython-blis-0.9.1/blis/_src/frame/2/her/bli_her_var.h000066400000000000000000000050251427272030600222760ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/


//
// Prototype object-based interfaces.
//
// Each GENPROT( opname ) line declares one object-based her variant whose
// name is formed by PASTEMAC0(opname).
// NOTE(review): the last parameter is declared here as cntl_t*, while the
// definition of bli_her_blk_var1 in other/bli_her_blk_var1.c takes her_t*
// -- confirm which one is current.
//

#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       conj_t  conjh, \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  c, \
       cntx_t* cntx, \
       cntl_t* cntl  \
     );

GENPROT( her_blk_var1 )
GENPROT( her_blk_var2 )

GENPROT( her_unb_var1 )
GENPROT( her_unb_var2 )


//
// Prototype BLAS-like interfaces with typed operands.
//
// The ctype_r and chr macro parameters are accepted but not used by the
// prototype body below.
//

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uplo, \
       conj_t  conjx, \
       conj_t  conjh, \
       dim_t   m, \
       ctype*  alpha, /* complex alpha allows her variants to also perform syr. */ \
       ctype*  x, inc_t incx, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx  \
     );

INSERT_GENTPROTR_BASIC0( her_unb_var1 )
INSERT_GENTPROTR_BASIC0( her_unb_var2 )
cython-blis-0.9.1/blis/_src/frame/2/her/bli_her_var_oapi.c000066400000000000000000000054521427272030600233050ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENFRONT #define GENFRONT( opname, varname ) \ \ void PASTEMAC0(varname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ) \ { \ bli_init_once(); \ \ num_t dt = bli_obj_dt( c ); \ \ uplo_t uplo = bli_obj_uplo( c ); \ conj_t conjx = bli_obj_conj_status( x ); \ \ dim_t m = bli_obj_length( c ); \ \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ \ void* buf_c = bli_obj_buffer_at_off( c ); \ inc_t rs_c = bli_obj_row_stride( c ); \ inc_t cs_c = bli_obj_col_stride( c ); \ \ void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,_unb,_vft) f = \ PASTEMAC(varname,_qfp)( dt ); \ \ f \ ( \ uplo, \ conjx, \ conjh, \ m, \ buf_alpha, \ buf_x, incx, \ buf_c, rs_c, cs_c, \ cntx \ ); \ } \ GENFRONT( her, her_unb_var1 ) GENFRONT( her, her_unb_var2 ) cython-blis-0.9.1/blis/_src/frame/2/her/other/000077500000000000000000000000001427272030600207705ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/her/other/bli_her_blk_var1.c000066400000000000000000000112451427272030600243240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_her_blk_var1( conj_t conjh, obj_t* alpha, obj_t* x, obj_t* c, cntx_t* cntx, her_t* cntl ) { obj_t c11, c11_pack; obj_t c10; obj_t x1, x1_pack; obj_t x0; dim_t mn; dim_t ij; dim_t b_alg; // Even though this blocked algorithm is expressed only in terms of the // lower triangular case, the upper triangular case is still supported: // when bli_acquire_mpart_tl2br() is passed a matrix that is stored in // in the upper triangle, and the requested subpartition resides in the // lower triangle (as is the case for this algorithm), the routine fills // the request as if the caller had actually requested the corresponding // "mirror" subpartition in the upper triangle, except that it marks the // subpartition for transposition (and conjugation). // Initialize objects for packing. bli_obj_init_pack( &c11_pack ); bli_obj_init_pack( &x1_pack ); // Query dimension. mn = bli_obj_length( c ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. 
b_alg = bli_determine_blocksize_f( ij, mn, c, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for C11, C10, x1, and x0. bli_acquire_mpart_tl2br( BLIS_SUBPART11, ij, b_alg, c, &c11 ); bli_acquire_mpart_tl2br( BLIS_SUBPART10, ij, b_alg, c, &c10 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_f2b( BLIS_SUBPART0, ij, b_alg, x, &x0 ); // Initialize objects for packing C11 and x1 (if needed). bli_packm_init( &c11, &c11_pack, cntx, bli_cntl_sub_packm_c11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // Copy/pack C11, x1 (if needed). bli_packm_int( &c11, &c11_pack, cntx, bli_cntl_sub_packm_c11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // C10 = C10 + alpha * x1 * x0'; bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha, &x1_pack, &x0, &c10, cntx, bli_cntl_sub_ger( cntl ) ); // C11 = C11 + alpha * x1 * x1'; bli_her_int( conjh, alpha, &x1_pack, &c11_pack, cntx, bli_cntl_sub_her( cntl ) ); // Copy/unpack C11 (if C11 was packed). bli_unpackm_int( &c11_pack, &c11, cntx, bli_cntl_sub_unpackm_c11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/her/other/bli_her_blk_var2.c000066400000000000000000000112451427272030600243250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_her_blk_var2( conj_t conjh, obj_t* alpha, obj_t* x, obj_t* c, cntx_t* cntx, her_t* cntl ) { obj_t c11, c11_pack; obj_t c21; obj_t x1, x1_pack; obj_t x2; dim_t mn; dim_t ij; dim_t b_alg; // Even though this blocked algorithm is expressed only in terms of the // lower triangular case, the upper triangular case is still supported: // when bli_acquire_mpart_tl2br() is passed a matrix that is stored in // in the upper triangle, and the requested subpartition resides in the // lower triangle (as is the case for this algorithm), the routine fills // the request as if the caller had actually requested the corresponding // "mirror" subpartition in the upper triangle, except that it marks the // subpartition for transposition (and conjugation). // Initialize objects for packing. 
bli_obj_init_pack( &c11_pack ); bli_obj_init_pack( &x1_pack ); // Query dimension. mn = bli_obj_length( c ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( ij, mn, c, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for C11, C21, x1, and x2. bli_acquire_mpart_tl2br( BLIS_SUBPART11, ij, b_alg, c, &c11 ); bli_acquire_mpart_tl2br( BLIS_SUBPART21, ij, b_alg, c, &c21 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_f2b( BLIS_SUBPART2, ij, b_alg, x, &x2 ); // Initialize objects for packing C11 and x1 (if needed). bli_packm_init( &c11, &c11_pack, cntx, bli_cntl_sub_packm_c11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // Copy/pack C11, x1 (if needed). bli_packm_int( &c11, &c11_pack, cntx, bli_cntl_sub_packm_c11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // C21 = C21 + alpha * x2 * x1'; bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha, &x2, &x1_pack, &c21, cntx, bli_cntl_sub_ger( cntl ) ); // C11 = C11 + alpha * x1 * x1'; bli_her_int( conjh, alpha, &x1_pack, &c11_pack, cntx, bli_cntl_sub_her( cntl ) ); // Copy/unpack C11 (if C11 was packed). bli_unpackm_int( &c11_pack, &c11, cntx, bli_cntl_sub_unpackm_c11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/her/other/bli_her_cntl.c000066400000000000000000000127371427272030600235720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern packm_t* packm_cntl; extern packv_t* packv_cntl; extern unpackm_t* unpackm_cntl; extern ger_t* ger_cntl_rp_bs_row; extern ger_t* ger_cntl_cp_bs_col; extern ger_t* ger_cntl_bs_ke_row; extern ger_t* ger_cntl_bs_ke_col; her_t* her_cntl_bs_ke_lrow_ucol = NULL; her_t* her_cntl_bs_ke_lcol_urow = NULL; her_t* her_cntl_ge_lrow_ucol = NULL; her_t* her_cntl_ge_lcol_urow = NULL; void bli_her_cntl_init() { // Create control trees for the lowest-level kernels. 
These trees induce // operations on (persumably) relatively small block-subvector problems. her_cntl_bs_ke_lrow_ucol = bli_her_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT1, 0, NULL, NULL, NULL, NULL, NULL ); her_cntl_bs_ke_lcol_urow = bli_her_cntl_obj_create( BLIS_UNBLOCKED, BLIS_VARIANT2, 0, NULL, NULL, NULL, NULL, NULL ); // Create control trees for generally large problems. Here, we choose // variants that partition for ger subproblems in the same direction // as the assumed storage. her_cntl_ge_lrow_ucol = bli_her_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_M2, packv_cntl, // pack x1 (if needed) NULL, // do NOT pack C11 ger_cntl_rp_bs_row, her_cntl_bs_ke_lrow_ucol, NULL ); // no unpacking needed her_cntl_ge_lcol_urow = bli_her_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT2, BLIS_M2, packv_cntl, // pack x1 (if needed) NULL, // do NOT pack C11 ger_cntl_cp_bs_col, her_cntl_bs_ke_lcol_urow, NULL ); // no unpacking needed } void bli_her_cntl_finalize() { bli_cntl_free_node( her_cntl_bs_ke_lrow_ucol ); bli_cntl_free_node( her_cntl_bs_ke_lcol_urow ); bli_cntl_free_node( her_cntl_ge_lrow_ucol ); bli_cntl_free_node( her_cntl_ge_lcol_urow ); } her_t* bli_her_cntl_obj_create( impl_t impl_type, varnum_t var_num, bszid_t bszid, packv_t* sub_packv_x1, packm_t* sub_packm_c11, ger_t* sub_ger, her_t* sub_her, unpackm_t* sub_unpackm_c11 ) { her_t* cntl; cntl = ( her_t* ) bli_malloc_intl( sizeof(her_t) ); cntl->impl_type = impl_type; cntl->var_num = var_num; cntl->bszid = bszid; cntl->sub_packv_x1 = sub_packv_x1; cntl->sub_packm_c11 = sub_packm_c11; cntl->sub_ger = sub_ger; cntl->sub_her = sub_her; cntl->sub_unpackm_c11 = sub_unpackm_c11; return cntl; } void bli_her_cntl_obj_init( her_t* cntl, impl_t impl_type, varnum_t var_num, bszid_t bszid, packv_t* sub_packv_x1, packm_t* sub_packm_c11, ger_t* sub_ger, her_t* sub_her, unpackm_t* sub_unpackm_c11 ) { cntl->impl_type = impl_type; cntl->var_num = var_num; cntl->bszid = bszid; cntl->sub_packv_x1 = sub_packv_x1; 
cntl->sub_packm_c11 = sub_packm_c11; cntl->sub_ger = sub_ger; cntl->sub_her = sub_her; cntl->sub_unpackm_c11 = sub_unpackm_c11; } cython-blis-0.9.1/blis/_src/frame/2/her/other/bli_her_cntl.h000066400000000000000000000056521427272030600235750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ struct her_s { impl_t impl_type; varnum_t var_num; bszid_t bszid; struct packv_s* sub_packv_x1; struct packm_s* sub_packm_c11; struct ger_s* sub_ger; struct her_s* sub_her; struct unpackm_s* sub_unpackm_c11; }; typedef struct her_s her_t; #define bli_cntl_sub_her( cntl ) cntl->sub_her void bli_her_cntl_init( void ); void bli_her_cntl_finalize( void ); her_t* bli_her_cntl_obj_create( impl_t impl_type, varnum_t var_num, bszid_t bszid, packv_t* sub_packv_x1, packm_t* sub_packm_c11, ger_t* sub_ger, her_t* sub_her, unpackm_t* sub_unpackm_c11 ); void bli_her_cntl_obj_init( her_t* cntl, impl_t impl_type, varnum_t var_num, bszid_t bszid, packv_t* sub_packv_x1, packm_t* sub_packm_c11, ger_t* sub_ger, her_t* sub_her, unpackm_t* sub_unpackm_c11 ); cython-blis-0.9.1/blis/_src/frame/2/her/other/bli_her_front.c000066400000000000000000000133731427272030600237570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern her_t* her_cntl_bs_ke_lrow_ucol; extern her_t* her_cntl_bs_ke_lcol_urow; extern her_t* her_cntl_ge_lrow_ucol; extern her_t* her_cntl_ge_lcol_urow; void bli_her_front ( obj_t* alpha, obj_t* x, obj_t* c, cntx_t* cntx ) { her_t* her_cntl; num_t dt_targ_x; //num_t dt_targ_c; bool x_has_unit_inc; bool c_has_unit_inc; obj_t alpha_local; num_t dt_alpha; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_her_check( alpha, x, c ); // Query the target datatypes of each object. dt_targ_x = bli_obj_target_dt( x ); //dt_targ_c = bli_obj_target_dt( c ); // Determine whether each operand with unit stride. x_has_unit_inc = ( bli_obj_vector_inc( x ) == 1 ); c_has_unit_inc = ( bli_obj_is_row_stored( c ) || bli_obj_is_col_stored( c ) ); // Create object to hold a copy-cast of alpha. dt_alpha = dt_targ_x; bli_obj_scalar_init_detached_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); // If all operands have unit stride, we choose a control tree for calling // the unblocked implementation directly without any blocking. if ( x_has_unit_inc && c_has_unit_inc ) { // We use two control trees to handle the four cases corresponding to // combinations of upper/lower triangular storage and row/column-storage. // The row-stored lower triangular and column-stored upper triangular // trees are identical. Same for the remaining two trees. 
if ( bli_obj_is_lower( c ) ) { if ( bli_obj_is_row_stored( c ) ) her_cntl = her_cntl_bs_ke_lrow_ucol; else her_cntl = her_cntl_bs_ke_lcol_urow; } else // if ( bli_obj_is_upper( c ) ) { if ( bli_obj_is_row_stored( c ) ) her_cntl = her_cntl_bs_ke_lcol_urow; else her_cntl = her_cntl_bs_ke_lrow_ucol; } } else { // Mark objects with unit stride as already being packed. This prevents // unnecessary packing from happening within the blocked algorithm. if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x ); if ( c_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, c ); // Here, we make a similar choice as above, except that (1) we look // at storage tilt, and (2) we choose a tree that performs blocking. if ( bli_obj_is_lower( c ) ) { if ( bli_obj_is_row_stored( c ) ) her_cntl = her_cntl_ge_lrow_ucol; else her_cntl = her_cntl_ge_lcol_urow; } else // if ( bli_obj_is_upper( c ) ) { if ( bli_obj_is_row_stored( c ) ) her_cntl = her_cntl_ge_lcol_urow; else her_cntl = her_cntl_ge_lrow_ucol; } } // Invoke the internal back-end with the copy-cast scalar and the // chosen control tree. Set conjh to BLIS_CONJUGATE to invoke the // Hermitian (and not symmetric) algorithms. bli_her_int( BLIS_CONJUGATE, &alpha_local, x, c, cntx, her_cntl ); } // // Define BLAS-like interfaces with homogeneous-typed operands. 
// #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ) \ { \ const num_t dt_r = PASTEMAC(chr,type); \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao, xo, co; \ \ inc_t rs_x, cs_x; \ \ rs_x = incx; cs_x = m * incx; \ \ bli_obj_create_1x1_with_attached_buffer( dt_r, alpha, &alphao ); \ \ bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \ bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_conj( conjx, &xo ); \ bli_obj_set_uplo( uploc, &co ); \ \ bli_obj_set_struc( BLIS_HERMITIAN, &co ); \ \ PASTEMAC0(opname)( &alphao, \ &xo, \ &co, \ cntx ); \ } INSERT_GENTFUNCR_BASIC0( her_front ) cython-blis-0.9.1/blis/_src/frame/2/her/other/bli_her_front.h000066400000000000000000000041351427272030600237600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_her_front ( obj_t* alpha, obj_t* x, obj_t* c, cntx_t* cntx ); #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC( her_front ) cython-blis-0.9.1/blis/_src/frame/2/her/other/bli_her_int.c000066400000000000000000000071601427272030600234160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T her_fp typedef void (*FUNCPTR_T)( conj_t conjh, obj_t* alpha, obj_t* x, obj_t* c, cntx_t* cntx, her_t* cntl ); static FUNCPTR_T vars[4][3] = { // unblocked unblocked with fusing blocked { bli_her_unb_var1, NULL, bli_her_blk_var1, }, { bli_her_unb_var2, NULL, bli_her_blk_var2, }, { NULL, NULL, NULL, }, { NULL, NULL, NULL, }, }; void bli_her_int( conj_t conjh, obj_t* alpha, obj_t* x, obj_t* c, cntx_t* cntx, her_t* cntl ) { varnum_t n; impl_t i; FUNCPTR_T f; obj_t x_local; obj_t c_local; // Check parameters. if ( bli_error_checking_is_enabled() ) { if ( bli_is_conj( conjh ) ) bli_her_check( alpha, x, c ); else bli_syr_check( alpha, x, c ); } // If C or x has a zero dimension, return early. if ( bli_obj_has_zero_dim( c ) ) return; if ( bli_obj_has_zero_dim( x ) ) return; // Alias the operands in case we need to apply conjugations. 
bli_obj_alias_to( x, &x_local ); bli_obj_alias_to( c, &c_local ); // If matrix C is marked for conjugation, we interpret this as a request // to apply a conjugation to the other operands. if ( bli_obj_has_conj( &c_local ) ) { bli_obj_toggle_conj( &c_local ); // Notice that we don't need to conjugate alpha since it is guaranteed // to be real. bli_obj_toggle_conj( &x_local ); } // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; // Invoke the variant. f( conjh, alpha, &x_local, &c_local, cntx, cntl ); } cython-blis-0.9.1/blis/_src/frame/2/her/other/bli_her_int.h000066400000000000000000000035101427272030600234160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
// Internal back-end for her/syr: applies the rank-1 update described by
// alpha and x to the matrix c, using the variant selected by the control
// tree node cntl. conjh distinguishes the Hermitian (conjugating) form from
// the symmetric one.
void bli_her_int( conj_t  conjh,
                  obj_t*  alpha,
                  obj_t*  x,
                  obj_t*  c,
                  cntx_t* cntx,
                  her_t*  cntl );
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" #include "bli_her2_var.h" cython-blis-0.9.1/blis/_src/frame/2/her2/bli_her2_unb_var1.c000066400000000000000000000124511427272030600233630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" /* her2_unb_var1 template: for each index i in [0,m), updates the length-i row segment c10t (stepped by cs_ct) with two axpyv kernel calls, one reading y0 and one reading x0, and then updates the diagonal element gamma11. conjh distinguishes the her2 and syr2 forms (see the comments in the body). NOTE(review): INSERT_GENTFUNC_BASIC0 presumably instantiates this body once per supported datatype -- confirm against its definition, which is not part of this file. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* x0; \ ctype* chi1; \ ctype* y0; \ ctype* psi1; \ ctype* c10t; \ ctype* gamma11; \ ctype alpha0; \ ctype alpha1; \ ctype alpha0_chi1; \ ctype alpha1_psi1; \ ctype alpha0_chi1_psi1; \ ctype conjx0_chi1; \ ctype conjy1_psi1; \ ctype conjy0_psi1; \ dim_t i; \ dim_t n_behind; \ inc_t rs_ct, cs_ct; \ conj_t conj0, conj1; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of c and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ rs_ct = rs_c; \ cs_ct = cs_c; \ \ PASTEMAC(ch,copys)( *alpha, alpha0 ); \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ rs_ct = cs_c; \ cs_ct = rs_c; \ \ /* Toggle conjugation of conjx/conjy, but only if we are being invoked as her2; for syr2, conjx/conjy are unchanged. 
*/ \ conjx = bli_apply_conj( conjh, conjx ); \ conjy = bli_apply_conj( conjh, conjy ); \ \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \ PASTEMAC(ch,copys)( *alpha, alpha1 ); \ } \ \ /* Apply conjh (which carries the conjugation component of the Hermitian transpose, if applicable) to conjx and/or conjy as needed to arrive at the effective conjugation for the vector subproblems. */ \ conj0 = bli_apply_conj( conjh, conjy ); \ conj1 = bli_apply_conj( conjh, conjx ); \ \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ n_behind = i; \ x0 = x + (0 )*incx; \ chi1 = x + (i )*incx; \ y0 = y + (0 )*incy; \ psi1 = y + (i )*incy; \ c10t = c + (i )*rs_ct + (0 )*cs_ct; \ gamma11 = c + (i )*rs_ct + (i )*cs_ct; \ \ /* Apply conjx and/or conjy to chi1 and/or psi1. */ \ PASTEMAC(ch,copycjs)( conjx, *chi1, conjx0_chi1 ); \ PASTEMAC(ch,copycjs)( conjy, *psi1, conjy1_psi1 ); \ PASTEMAC(ch,copycjs)( conj0, *psi1, conjy0_psi1 ); \ \ /* Compute scalars for vector subproblems. */ \ PASTEMAC(ch,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); \ PASTEMAC(ch,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); \ \ /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have already been conjugated, if needed, by conjx and conjy. */ \ PASTEMAC(ch,scal2s)( alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \ \ /* c10t = c10t + alpha * chi1 * y0'; */ \ kfp_av \ ( \ conj0, \ n_behind, \ &alpha0_chi1, \ y0, incy, \ c10t, cs_ct, \ cntx \ ); \ \ /* c10t = c10t + conj(alpha) * psi1 * x0'; */ \ kfp_av \ ( \ conj1, \ n_behind, \ &alpha1_psi1, \ x0, incx, \ c10t, cs_ct, \ cntx \ ); \ \ /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ + conj(alpha) * psi1 * conj(chi1); Both terms are accumulated by adding alpha0_chi1_psi1 twice. NOTE(review): this presumably relies on the two terms being complex conjugates of each other for her2 (where the imaginary part is zeroed just below) and identical for syr2 -- confirm. */ \ PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ \ /* For her2, explicitly set the imaginary component of gamma11 to zero. 
*/ \ if ( bli_is_conj( conjh ) ) \ PASTEMAC(ch,seti0s)( *gamma11 ); \ } \ } INSERT_GENTFUNC_BASIC0( her2_unb_var1 ) cython-blis-0.9.1/blis/_src/frame/2/her2/bli_her2_unb_var2.c000066400000000000000000000130271427272030600233640ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "blis.h" /* her2_unb_var2 template: for each index i in [0,m), updates c21 (n_ahead = m-i-1 elements starting at offset (i+1,i), stepped by rs_ct) with an axpyv kernel call reading x2, updates the length-i row segment c10t (stepped by cs_ct) with an axpyv kernel call reading x0, and then updates the diagonal element gamma11. conjh distinguishes the her2 and syr2 forms (see the comments in the body). NOTE(review): INSERT_GENTFUNC_BASIC0 presumably instantiates this body once per supported datatype -- confirm against its definition, which is not part of this file. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* x0; \ ctype* chi1; \ ctype* x2; \ ctype* psi1; \ ctype* c10t; \ ctype* gamma11; \ ctype* c21; \ ctype alpha0; \ ctype alpha1; \ ctype alpha0_psi1; \ ctype alpha1_psi1; \ ctype alpha0_chi1_psi1; \ ctype conjy0_psi1; \ ctype conjy1_psi1; \ ctype conjx0_chi1; \ dim_t i; \ dim_t n_behind; \ dim_t n_ahead; \ inc_t rs_ct, cs_ct; \ conj_t conj0, conj1; \ conj_t conjh_conjy; \ \ /* Eliminate unused variable warnings. (conjh_conjy is nevertheless assigned and used further below.) */ \ ( void )conjh_conjy; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of c and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ rs_ct = rs_c; \ cs_ct = cs_c; \ \ PASTEMAC(ch,copys)( *alpha, alpha0 ); \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ rs_ct = cs_c; \ cs_ct = rs_c; \ \ /* Toggle conjugation of conjx/conjy, but only if we are being invoked as her2; for syr2, conjx/conjy are unchanged. */ \ conjx = bli_apply_conj( conjh, conjx ); \ conjy = bli_apply_conj( conjh, conjy ); \ \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \ PASTEMAC(ch,copys)( *alpha, alpha1 ); \ } \ \ /* Apply conjh (which carries the conjugation component of the Hermitian transpose, if applicable) to conjx and/or conjy as needed to arrive at the effective conjugation for the vector subproblems. */ \ conj0 = conjx; \ conj1 = bli_apply_conj( conjh, conjx ); \ conjh_conjy = bli_apply_conj( conjh, conjy ); \ \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. 
*/ \ kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ n_behind = i; \ n_ahead = m - i - 1; \ x0 = x + (0 )*incx; \ chi1 = x + (i )*incx; \ x2 = x + (i+1)*incx; \ psi1 = y + (i )*incy; \ c10t = c + (i )*rs_ct + (0 )*cs_ct; \ gamma11 = c + (i )*rs_ct + (i )*cs_ct; \ c21 = c + (i+1)*rs_ct + (i )*cs_ct; \ \ /* Apply conjx and/or conjy to chi1 and/or psi1. */ \ PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \ PASTEMAC(ch,copycjs)( conjy, *psi1, conjy1_psi1 ); \ PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \ \ /* Compute scalars for vector subproblems. */ \ PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \ PASTEMAC(ch,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); \ \ /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have already been conjugated, if needed, by conjx and conjy. */ \ PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \ \ /* c21 = c21 + alpha * x2 * conj(psi1); */ \ kfp_av \ ( \ conj0, \ n_ahead, \ &alpha0_psi1, \ x2, incx, \ c21, rs_ct, \ cntx \ ); \ \ /* c10t = c10t + conj(alpha) * psi1 * x0'; */ \ kfp_av \ ( \ conj1, \ n_behind, \ &alpha1_psi1, \ x0, incx, \ c10t, cs_ct, \ cntx \ ); \ \ /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ + conj(alpha) * psi1 * conj(chi1); Both terms are accumulated by adding alpha0_chi1_psi1 twice. NOTE(review): this presumably relies on the two terms being complex conjugates of each other for her2 (where the imaginary part is zeroed just below) and identical for syr2 -- confirm. */ \ PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ \ /* For her2, explicitly set the imaginary component of gamma11 to zero. */ \ if ( bli_is_conj( conjh ) ) \ PASTEMAC(ch,seti0s)( *gamma11 ); \ } \ } INSERT_GENTFUNC_BASIC0( her2_unb_var2 ) cython-blis-0.9.1/blis/_src/frame/2/her2/bli_her2_unb_var3.c000066400000000000000000000130251427272030600233630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" /* her2_unb_var3 template: for each index i in [0,m), updates the length-i row segment c10t (stepped by cs_ct) with an axpyv kernel call reading y0, updates c21 (n_ahead = m-i-1 elements starting at offset (i+1,i), stepped by rs_ct) with an axpyv kernel call reading y2, and then updates the diagonal element gamma11. conjh distinguishes the her2 and syr2 forms (see the comments in the body). NOTE(review): INSERT_GENTFUNC_BASIC0 presumably instantiates this body once per supported datatype -- confirm against its definition, which is not part of this file. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* chi1; \ ctype* y0; \ ctype* psi1; \ ctype* y2; \ ctype* c10t; \ ctype* gamma11; \ ctype* c21; \ ctype alpha0; \ ctype alpha1; \ ctype alpha0_chi1; \ ctype alpha1_chi1; \ ctype alpha0_chi1_psi1; \ ctype conjx0_chi1; \ ctype conjx1_chi1; \ ctype conjy0_psi1; \ dim_t i; \ dim_t n_behind; \ dim_t n_ahead; \ inc_t rs_ct, cs_ct; \ conj_t conj0, conj1; \ conj_t conjh_conjx; \ \ /* Eliminate unused variable warnings. (conjh_conjx is nevertheless assigned and used further below.) */ \ ( void )conjh_conjx; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of c and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ rs_ct = rs_c; \ cs_ct = cs_c; \ \ PASTEMAC(ch,copys)( *alpha, alpha0 ); \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ rs_ct = cs_c; \ cs_ct = rs_c; \ \ /* Toggle conjugation of conjx/conjy, but only if we are being invoked as her2; for syr2, conjx/conjy are unchanged. */ \ conjx = bli_apply_conj( conjh, conjx ); \ conjy = bli_apply_conj( conjh, conjy ); \ \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \ PASTEMAC(ch,copys)( *alpha, alpha1 ); \ } \ \ /* Apply conjh (which carries the conjugation component of the Hermitian transpose, if applicable) to conjx and/or conjy as needed to arrive at the effective conjugation for the vector subproblems. */ \ conj0 = bli_apply_conj( conjh, conjy ); \ conj1 = conjy; \ conjh_conjx = bli_apply_conj( conjh, conjx ); \ \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. 
*/ \ kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ n_behind = i; \ n_ahead = m - i - 1; \ chi1 = x + (i )*incx; \ y0 = y + (0 )*incy; \ psi1 = y + (i )*incy; \ y2 = y + (i+1)*incy; \ c10t = c + (i )*rs_ct + (0 )*cs_ct; \ gamma11 = c + (i )*rs_ct + (i )*cs_ct; \ c21 = c + (i+1)*rs_ct + (i )*cs_ct; \ \ /* Apply conjx and/or conjy to chi1 and/or psi1. */ \ PASTEMAC(ch,copycjs)( conjx, *chi1, conjx0_chi1 ); \ PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \ PASTEMAC(ch,copycjs)( conj0, *psi1, conjy0_psi1 ); \ \ /* Compute scalars for vector subproblems. */ \ PASTEMAC(ch,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); \ PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \ \ /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have already been conjugated, if needed, by conjx and conjy. */ \ PASTEMAC(ch,scal2s)( alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \ \ /* c10t = c10t + alpha * chi1 * y0'; */ \ kfp_av \ ( \ conj0, \ n_behind, \ &alpha0_chi1, \ y0, incy, \ c10t, cs_ct, \ cntx \ ); \ \ /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ \ kfp_av \ ( \ conj1, \ n_ahead, \ &alpha1_chi1, \ y2, incy, \ c21, rs_ct, \ cntx \ ); \ \ /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ + conj(alpha) * psi1 * conj(chi1); Both terms are accumulated by adding alpha0_chi1_psi1 twice. NOTE(review): this presumably relies on the two terms being complex conjugates of each other for her2 (where the imaginary part is zeroed just below) and identical for syr2 -- confirm. */ \ PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ \ /* For her2, explicitly set the imaginary component of gamma11 to zero. */ \ if ( bli_is_conj( conjh ) ) \ PASTEMAC(ch,seti0s)( *gamma11 ); \ } \ } INSERT_GENTFUNC_BASIC0( her2_unb_var3 ) cython-blis-0.9.1/blis/_src/frame/2/her2/bli_her2_unb_var4.c000066400000000000000000000127671427272030600234000ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" /* her2_unb_var4 template: for each index i in [0,m), updates c21 (n_ahead = m-i-1 elements starting at offset (i+1,i), stepped by rs_ct) with two axpyv kernel calls, one reading x2 and one reading y2, and then updates the diagonal element gamma11. conjh distinguishes the her2 and syr2 forms (see the comments in the body). NOTE(review): INSERT_GENTFUNC_BASIC0 presumably instantiates this body once per supported datatype -- confirm against its definition, which is not part of this file. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* chi1; \ ctype* x2; \ ctype* psi1; \ ctype* y2; \ ctype* gamma11; \ ctype* c21; \ ctype alpha0; \ ctype alpha1; \ ctype alpha0_psi1; \ ctype alpha1_chi1; \ ctype alpha0_chi1_psi1; \ ctype conjy0_psi1; \ ctype conjx1_chi1; \ ctype conjx0_chi1; \ dim_t i; \ dim_t n_ahead; \ inc_t rs_ct, cs_ct; \ conj_t conj0, conj1; \ conj_t conjh_conjx; \ conj_t conjh_conjy; \ \ /* Eliminate unused variable warnings. (Both variables are nevertheless assigned and used further below.) */ \ ( void )conjh_conjx; \ ( void )conjh_conjy; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of c and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ rs_ct = rs_c; \ cs_ct = cs_c; \ \ PASTEMAC(ch,copys)( *alpha, alpha0 ); \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ rs_ct = cs_c; \ cs_ct = rs_c; \ \ /* Toggle conjugation of conjx/conjy, but only if we are being invoked as her2; for syr2, conjx/conjy are unchanged. */ \ conjx = bli_apply_conj( conjh, conjx ); \ conjy = bli_apply_conj( conjh, conjy ); \ \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \ PASTEMAC(ch,copys)( *alpha, alpha1 ); \ } \ \ /* Apply conjh (which carries the conjugation component of the Hermitian transpose, if applicable) to conjx and/or conjy as needed to arrive at the effective conjugation for the vector subproblems. */ \ conj0 = conjx; \ conj1 = conjy; \ conjh_conjx = bli_apply_conj( conjh, conjx ); \ conjh_conjy = bli_apply_conj( conjh, conjy ); \ \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. 
*/ \ kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ n_ahead = m - i - 1; \ chi1 = x + (i )*incx; \ x2 = x + (i+1)*incx; \ psi1 = y + (i )*incy; \ y2 = y + (i+1)*incy; \ gamma11 = c + (i )*rs_ct + (i )*cs_ct; \ c21 = c + (i+1)*rs_ct + (i )*cs_ct; \ \ /* Apply conjx and/or conjy to chi1 and/or psi1. */ \ PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \ PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \ PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \ \ /* Compute scalars for vector subproblems. */ \ PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \ PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \ \ /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have already been conjugated, if needed, by conjx and conjy. */ \ PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \ \ /* c21 = c21 + alpha * x2 * conj(psi1); */ \ kfp_av \ ( \ conj0, \ n_ahead, \ &alpha0_psi1, \ x2, incx, \ c21, rs_ct, \ cntx \ ); \ \ /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ \ kfp_av \ ( \ conj1, \ n_ahead, \ &alpha1_chi1, \ y2, incy, \ c21, rs_ct, \ cntx \ ); \ \ /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ + conj(alpha) * psi1 * conj(chi1); Both terms are accumulated by adding alpha0_chi1_psi1 twice. NOTE(review): this presumably relies on the two terms being complex conjugates of each other for her2 (where the imaginary part is zeroed just below) and identical for syr2 -- confirm. */ \ PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ \ /* For her2, explicitly set the imaginary component of gamma11 to zero. */ \ if ( bli_is_conj( conjh ) ) \ PASTEMAC(ch,seti0s)( *gamma11 ); \ } \ } INSERT_GENTFUNC_BASIC0( her2_unb_var4 ) cython-blis-0.9.1/blis/_src/frame/2/her2/bli_her2_unf_var1.c000066400000000000000000000123501427272030600233650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" /* her2_unf_var1 template: for each index i in [0,m), updates the length-i row segment c10t (stepped by cs_ct) with a single axpy2v kernel call that reads both y0 and x0, and then updates the diagonal element gamma11. conjh distinguishes the her2 and syr2 forms (see the comments in the body). NOTE(review): INSERT_GENTFUNC_BASIC0 presumably instantiates this body once per supported datatype -- confirm against its definition, which is not part of this file. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* x0; \ ctype* chi1; \ ctype* y0; \ ctype* psi1; \ ctype* c10t; \ ctype* gamma11; \ ctype alpha0; \ ctype alpha1; \ ctype alpha0_chi1; \ ctype alpha1_psi1; \ ctype alpha0_chi1_psi1; \ ctype conjx0_chi1; \ ctype conjy1_psi1; \ ctype conjy0_psi1; \ dim_t i; \ dim_t n_behind; \ inc_t rs_ct, cs_ct; \ conj_t conj0, conj1; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of c and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ rs_ct = rs_c; \ cs_ct = cs_c; \ \ PASTEMAC(ch,copys)( *alpha, alpha0 ); \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ rs_ct = cs_c; \ cs_ct = rs_c; \ \ /* Toggle conjugation of conjx/conjy, but only if we are being invoked as her2; for syr2, conjx/conjy are unchanged. */ \ conjx = bli_apply_conj( conjh, conjx ); \ conjy = bli_apply_conj( conjh, conjy ); \ \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \ PASTEMAC(ch,copys)( *alpha, alpha1 ); \ } \ \ /* Apply conjh (which carries the conjugation component of the Hermitian transpose, if applicable) to conjx and/or conjy as needed to arrive at the effective conjugation for the vector subproblems. */ \ conj0 = bli_apply_conj( conjh, conjy ); \ conj1 = bli_apply_conj( conjh, conjx ); \ \ PASTECH(ch,axpy2v_ker_ft) kfp_2v; \ \ /* Query the context for the kernel function pointer. 
*/ \ kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ n_behind = i; \ x0 = x + (0 )*incx; \ chi1 = x + (i )*incx; \ y0 = y + (0 )*incy; \ psi1 = y + (i )*incy; \ c10t = c + (i )*rs_ct + (0 )*cs_ct; \ gamma11 = c + (i )*rs_ct + (i )*cs_ct; \ \ /* Apply conjx and/or conjy to chi1 and/or psi1. */ \ PASTEMAC(ch,copycjs)( conjx, *chi1, conjx0_chi1 ); \ PASTEMAC(ch,copycjs)( conjy, *psi1, conjy1_psi1 ); \ PASTEMAC(ch,copycjs)( conj0, *psi1, conjy0_psi1 ); \ \ /* Compute scalars for vector subproblems. */ \ PASTEMAC(ch,scal2s)( alpha0, conjx0_chi1, alpha0_chi1 ); \ PASTEMAC(ch,scal2s)( alpha1, conjy1_psi1, alpha1_psi1 ); \ \ /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have already been conjugated, if needed, by conjx and conjy. */ \ PASTEMAC(ch,scal2s)( alpha0_chi1, conjy0_psi1, alpha0_chi1_psi1 ); \ \ /* c10t = c10t + alpha * chi1 * y0'; */ \ /* c10t = c10t + conj(alpha) * psi1 * x0'; */ \ kfp_2v \ ( \ conj0, \ conj1, \ n_behind, \ &alpha0_chi1, \ &alpha1_psi1, \ y0, incy, \ x0, incx, \ c10t, cs_ct, \ cntx \ ); \ \ /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ + conj(alpha) * psi1 * conj(chi1); Both terms are accumulated by adding alpha0_chi1_psi1 twice. NOTE(review): this presumably relies on the two terms being complex conjugates of each other for her2 (where the imaginary part is zeroed just below) and identical for syr2 -- confirm. */ \ PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ \ /* For her2, explicitly set the imaginary component of gamma11 to zero. */ \ if ( bli_is_conj( conjh ) ) \ PASTEMAC(ch,seti0s)( *gamma11 ); \ } \ } INSERT_GENTFUNC_BASIC0( her2_unf_var1 ) cython-blis-0.9.1/blis/_src/frame/2/her2/bli_her2_unf_var4.c000066400000000000000000000126701427272030600233750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" /* her2_unf_var4 template: for each index i in [0,m), updates c21 (n_ahead = m-i-1 elements starting at offset (i+1,i), stepped by rs_ct) with a single axpy2v kernel call that reads both x2 and y2, and then updates the diagonal element gamma11. conjh distinguishes the her2 and syr2 forms (see the comments in the body). NOTE(review): INSERT_GENTFUNC_BASIC0 presumably instantiates this body once per supported datatype -- confirm against its definition, which is not part of this file. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* chi1; \ ctype* x2; \ ctype* psi1; \ ctype* y2; \ ctype* gamma11; \ ctype* c21; \ ctype alpha0; \ ctype alpha1; \ ctype alpha0_psi1; \ ctype alpha1_chi1; \ ctype alpha0_chi1_psi1; \ ctype conjy0_psi1; \ ctype conjx1_chi1; \ ctype conjx0_chi1; \ dim_t i; \ dim_t n_ahead; \ inc_t rs_ct, cs_ct; \ conj_t conj0, conj1; \ conj_t conjh_conjx; \ conj_t conjh_conjy; \ \ /* Eliminate unused variable warnings. (Both variables are nevertheless assigned and used further below.) */ \ ( void )conjh_conjx; \ ( void )conjh_conjy; \ \ /* The algorithm will be expressed in terms of the lower triangular case; the upper triangular case is supported by swapping the row and column strides of c and toggling some conj parameters. */ \ if ( bli_is_lower( uplo ) ) \ { \ rs_ct = rs_c; \ cs_ct = cs_c; \ \ PASTEMAC(ch,copys)( *alpha, alpha0 ); \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha1 ); \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ rs_ct = cs_c; \ cs_ct = rs_c; \ \ /* Toggle conjugation of conjx/conjy, but only if we are being invoked as her2; for syr2, conjx/conjy are unchanged. */ \ conjx = bli_apply_conj( conjh, conjx ); \ conjy = bli_apply_conj( conjh, conjy ); \ \ PASTEMAC(ch,copycjs)( conjh, *alpha, alpha0 ); \ PASTEMAC(ch,copys)( *alpha, alpha1 ); \ } \ \ /* Apply conjh (which carries the conjugation component of the Hermitian transpose, if applicable) to conjx and/or conjy as needed to arrive at the effective conjugation for the vector subproblems. */ \ conj0 = conjx; \ conj1 = conjy; \ conjh_conjx = bli_apply_conj( conjh, conjx ); \ conjh_conjy = bli_apply_conj( conjh, conjy ); \ \ PASTECH(ch,axpy2v_ker_ft) kfp_2v; \ \ /* Query the context for the kernel function pointer. 
*/ \ kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ n_ahead = m - i - 1; \ chi1 = x + (i )*incx; \ x2 = x + (i+1)*incx; \ psi1 = y + (i )*incy; \ y2 = y + (i+1)*incy; \ gamma11 = c + (i )*rs_ct + (i )*cs_ct; \ c21 = c + (i+1)*rs_ct + (i )*cs_ct; \ \ /* Apply conjx and/or conjy to chi1 and/or psi1. */ \ PASTEMAC(ch,copycjs)( conjh_conjy, *psi1, conjy0_psi1 ); \ PASTEMAC(ch,copycjs)( conjh_conjx, *chi1, conjx1_chi1 ); \ PASTEMAC(ch,copycjs)( conj0, *chi1, conjx0_chi1 ); \ \ /* Compute scalars for vector subproblems. */ \ PASTEMAC(ch,scal2s)( alpha0, conjy0_psi1, alpha0_psi1 ); \ PASTEMAC(ch,scal2s)( alpha1, conjx1_chi1, alpha1_chi1 ); \ \ /* Compute alpha * chi1 * conj(psi1) after both chi1 and psi1 have already been conjugated, if needed, by conjx and conjy. */ \ PASTEMAC(ch,scal2s)( alpha0_psi1, conjx0_chi1, alpha0_chi1_psi1 ); \ \ /* c21 = c21 + alpha * x2 * conj(psi1); */ \ /* c21 = c21 + conj(alpha) * y2 * conj(chi1); */ \ kfp_2v \ ( \ conj0, \ conj1, \ n_ahead, \ &alpha0_psi1, \ &alpha1_chi1, \ x2, incx, \ y2, incy, \ c21, rs_ct, \ cntx \ ); \ \ /* gamma11 = gamma11 + alpha * chi1 * conj(psi1) \ + conj(alpha) * psi1 * conj(chi1); Both terms are accumulated by adding alpha0_chi1_psi1 twice. NOTE(review): this presumably relies on the two terms being complex conjugates of each other for her2 (where the imaginary part is zeroed just below) and identical for syr2 -- confirm. */ \ PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ PASTEMAC(ch,adds)( alpha0_chi1_psi1, *gamma11 ); \ \ /* For her2, explicitly set the imaginary component of gamma11 to zero. */ \ if ( bli_is_conj( conjh ) ) \ PASTEMAC(ch,seti0s)( *gamma11 ); \ } \ } INSERT_GENTFUNC_BASIC0( her2_unf_var4 ) cython-blis-0.9.1/blis/_src/frame/2/her2/bli_her2_var.h000066400000000000000000000055551427272030600224520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) cython-blis-0.9.1/blis/_src/frame/2/her2/bli_her2_var_oapi.c000066400000000000000000000062321427272030600234460ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

/*
   GENFRONT( opname, varname ) defines the object-based wrapper for one
   typed her2 variant. The generated function:
     - calls bli_init_once();
     - reads the datatype, uplo and length from c;
     - reads the conjugation status, buffer and increment of x and y;
     - reads the buffer and row/column strides of c;
     - obtains a buffer for alpha via bli_obj_buffer_for_1x1( dt, alpha );
     - looks up the void*-based function pointer PASTEMAC(varname,_qfp)( dt )
       and calls it with the values gathered above plus conjh and cntx.
   The alpha_conj and cntl parameters are accepted but not referenced in
   the generated body.
*/
#undef GENFRONT
#define GENFRONT( opname, varname ) \
\
void PASTEMAC0(varname) \
     ( \
       conj_t  conjh, \
       obj_t*  alpha, \
       obj_t*  alpha_conj, \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  c, \
       cntx_t* cntx, \
       cntl_t* cntl \
     ) \
{ \
	bli_init_once(); \
\
	num_t     dt        = bli_obj_dt( c ); \
\
	uplo_t    uplo      = bli_obj_uplo( c ); \
	conj_t    conjx     = bli_obj_conj_status( x ); \
	conj_t    conjy     = bli_obj_conj_status( y ); \
\
	dim_t     m         = bli_obj_length( c ); \
\
	void*     buf_x     = bli_obj_buffer_at_off( x ); \
	inc_t     incx      = bli_obj_vector_inc( x ); \
\
	void*     buf_y     = bli_obj_buffer_at_off( y ); \
	inc_t     incy      = bli_obj_vector_inc( y ); \
\
	void*     buf_c     = bli_obj_buffer_at_off( c ); \
	inc_t     rs_c      = bli_obj_row_stride( c ); \
	inc_t     cs_c      = bli_obj_col_stride( c ); \
\
	void*     buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
\
	/* Query a type-specific function pointer, except one that uses
	   void* for function arguments instead of typed pointers. */ \
	PASTECH2(opname,_unb,_vft) f = \
	PASTEMAC(varname,_qfp)( dt ); \
\
	f \
	( \
	  uplo, \
	  conjx, \
	  conjy, \
	  conjh, \
	  m, \
	  buf_alpha, \
	  buf_x, incx, \
	  buf_y, incy, \
	  buf_c, rs_c, cs_c, \
	  cntx \
	); \
} \

/* One wrapper per unblocked / unblocked-fused variant. */
GENFRONT( her2, her2_unb_var1 )
GENFRONT( her2, her2_unb_var2 )
GENFRONT( her2, her2_unb_var3 )
GENFRONT( her2, her2_unb_var4 )

GENFRONT( her2, her2_unf_var1 )
GENFRONT( her2, her2_unf_var4 )

cython-blis-0.9.1/blis/_src/frame/2/her2/other/000077500000000000000000000000001427272030600210525ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/her2/other/bli_her2_blk_var1.c000066400000000000000000000131031427272030600244630ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_her2_blk_var1( conj_t conjh, obj_t* alpha, obj_t* alpha_conj, obj_t* x, obj_t* y, obj_t* c, cntx_t* cntx, her2_t* cntl ) { obj_t c11, c11_pack; obj_t c10; obj_t x1, x1_pack; obj_t x0; obj_t y1, y1_pack; obj_t y0; dim_t mn; dim_t ij; dim_t b_alg; // Even though this blocked algorithm is expressed only in terms of the // lower triangular case, the upper triangular case is still supported: // when bli_acquire_mpart_tl2br() is passed a matrix that is stored in // in the upper triangle, and the requested subpartition resides in the // lower triangle (as is the case for this algorithm), the routine fills // the request as if the caller had actually requested the corresponding // "mirror" subpartition in the upper triangle, except that it marks the // subpartition for transposition (and conjugation). // Initialize objects for packing. bli_obj_init_pack( &c11_pack ); bli_obj_init_pack( &x1_pack ); bli_obj_init_pack( &y1_pack ); // Query dimension. mn = bli_obj_length( c ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( ij, mn, c, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for C11, C10, x1, x0, y1, and y0. 
bli_acquire_mpart_tl2br( BLIS_SUBPART11, ij, b_alg, c, &c11 ); bli_acquire_mpart_tl2br( BLIS_SUBPART10, ij, b_alg, c, &c10 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_f2b( BLIS_SUBPART0, ij, b_alg, x, &x0 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, y, &y1 ); bli_acquire_vpart_f2b( BLIS_SUBPART0, ij, b_alg, y, &y0 ); // Initialize objects for packing C11, x1, and y1 (if needed). bli_packm_init( &c11, &c11_pack, cntx, bli_cntl_sub_packm_c11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_init( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // Copy/pack C11, x1, y1 (if needed). bli_packm_int( &c11, &c11_pack, cntx, bli_cntl_sub_packm_c11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // C10 = C10 + alpha * x1 * y0'; bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha, &x1_pack, &y0, &c10, cntx, bli_cntl_sub_ger_rp( cntl ) ); // C10 = C10 + conj(alpha) * y1 * x0'; bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha_conj, &y1_pack, &x0, &c10, cntx, bli_cntl_sub_ger_rp( cntl ) ); // C11 = C11 + alpha * x1 * y1' + conj(alpha) * y1 * x1'; bli_her2_int( conjh, alpha, alpha_conj, &x1_pack, &y1_pack, &c11_pack, cntx, bli_cntl_sub_her2( cntl ) ); // Copy/unpack C11 (if C11 was packed). bli_unpackm_int( &c11_pack, &c11, cntx, bli_cntl_sub_unpackm_c11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/her2/other/bli_her2_blk_var2.c000066400000000000000000000132621427272030600244720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_her2_blk_var2( conj_t conjh, obj_t* alpha, obj_t* alpha_conj, obj_t* x, obj_t* y, obj_t* c, cntx_t* cntx, her2_t* cntl ) { obj_t c11, c11_pack; obj_t c10; obj_t c21; obj_t x1, x1_pack; obj_t x0; obj_t x2; obj_t y1, y1_pack; dim_t mn; dim_t ij; dim_t b_alg; // Even though this blocked algorithm is expressed only in terms of the // lower triangular case, the upper triangular case is still supported: // when bli_acquire_mpart_tl2br() is passed a matrix that is stored in // in the upper triangle, and the requested subpartition resides in the // lower triangle (as is the case for this algorithm), the routine fills // the request as if the caller had actually requested the corresponding // "mirror" subpartition in the upper triangle, except that it marks the // subpartition for transposition (and conjugation). // Initialize objects for packing. bli_obj_init_pack( &c11_pack ); bli_obj_init_pack( &x1_pack ); bli_obj_init_pack( &y1_pack ); // Query dimension. mn = bli_obj_length( c ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( ij, mn, c, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for C11, C10, C21, x1, x0, x2, and y1. bli_acquire_mpart_tl2br( BLIS_SUBPART11, ij, b_alg, c, &c11 ); bli_acquire_mpart_tl2br( BLIS_SUBPART10, ij, b_alg, c, &c10 ); bli_acquire_mpart_tl2br( BLIS_SUBPART21, ij, b_alg, c, &c21 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_f2b( BLIS_SUBPART0, ij, b_alg, x, &x0 ); bli_acquire_vpart_f2b( BLIS_SUBPART2, ij, b_alg, x, &x2 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, y, &y1 ); // Initialize objects for packing C11, x1, and y1 (if needed). 
bli_packm_init( &c11, &c11_pack, cntx, bli_cntl_sub_packm_c11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_init( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // Copy/pack C11, x1, y1 (if needed). bli_packm_int( &c11, &c11_pack, cntx, bli_cntl_sub_packm_c11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // C10 = C10 + conj(alpha) * y1 * x0'; bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha_conj, &y1_pack, &x0, &c10, cntx, bli_cntl_sub_ger_rp( cntl ) ); // C21 = C21 + alpha * x2 * y1'; bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha, &x2, &y1_pack, &c21, cntx, bli_cntl_sub_ger_cp( cntl ) ); // C11 = C11 + alpha * x1 * y1' + conj(alpha) * y1 * x1'; bli_her2_int( conjh, alpha, alpha_conj, &x1_pack, &y1_pack, &c11_pack, cntx, bli_cntl_sub_her2( cntl ) ); // Copy/unpack C11 (if C11 was packed). bli_unpackm_int( &c11_pack, &c11, cntx, bli_cntl_sub_unpackm_c11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/her2/other/bli_her2_blk_var3.c000066400000000000000000000132621427272030600244730ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_her2_blk_var3( conj_t conjh, obj_t* alpha, obj_t* alpha_conj, obj_t* x, obj_t* y, obj_t* c, cntx_t* cntx, her2_t* cntl ) { obj_t c11, c11_pack; obj_t c10; obj_t c21; obj_t x1, x1_pack; obj_t y1, y1_pack; obj_t y0; obj_t y2; dim_t mn; dim_t ij; dim_t b_alg; // Even though this blocked algorithm is expressed only in terms of the // lower triangular case, the upper triangular case is still supported: // when bli_acquire_mpart_tl2br() is passed a matrix that is stored in // in the upper triangle, and the requested subpartition resides in the // lower triangle (as is the case for this algorithm), the routine fills // the request as if the caller had actually requested the corresponding // "mirror" subpartition in the upper triangle, except that it marks the // subpartition for transposition (and conjugation). // Initialize objects for packing. bli_obj_init_pack( &c11_pack ); bli_obj_init_pack( &x1_pack ); bli_obj_init_pack( &y1_pack ); // Query dimension. mn = bli_obj_length( c ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( ij, mn, c, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for C11, C10, C21, x1, y1, y0, and y2. bli_acquire_mpart_tl2br( BLIS_SUBPART11, ij, b_alg, c, &c11 ); bli_acquire_mpart_tl2br( BLIS_SUBPART10, ij, b_alg, c, &c10 ); bli_acquire_mpart_tl2br( BLIS_SUBPART21, ij, b_alg, c, &c21 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, y, &y1 ); bli_acquire_vpart_f2b( BLIS_SUBPART0, ij, b_alg, y, &y0 ); bli_acquire_vpart_f2b( BLIS_SUBPART2, ij, b_alg, y, &y2 ); // Initialize objects for packing C11, x1, and y1 (if needed). 
bli_packm_init( &c11, &c11_pack, cntx, bli_cntl_sub_packm_c11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_init( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // Copy/pack C11, x1, y1 (if needed). bli_packm_int( &c11, &c11_pack, cntx, bli_cntl_sub_packm_c11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // C10 = C10 + alpha * x1 * y0'; bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha, &x1_pack, &y0, &c10, cntx, bli_cntl_sub_ger_rp( cntl ) ); // C21 = C21 + conj(alpha) * y2 * x1'; bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha_conj, &y2, &x1_pack, &c21, cntx, bli_cntl_sub_ger_cp( cntl ) ); // C11 = C11 + alpha * x1 * y1' + conj(alpha) * y1 * x1'; bli_her2_int( conjh, alpha, alpha_conj, &x1_pack, &y1_pack, &c11_pack, cntx, bli_cntl_sub_her2( cntl ) ); // Copy/unpack C11 (if C11 was packed). bli_unpackm_int( &c11_pack, &c11, cntx, bli_cntl_sub_unpackm_c11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/her2/other/bli_her2_blk_var4.c000066400000000000000000000131031427272030600244660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_her2_blk_var4( conj_t conjh, obj_t* alpha, obj_t* alpha_conj, obj_t* x, obj_t* y, obj_t* c, cntx_t* cntx, her2_t* cntl ) { obj_t c11, c11_pack; obj_t c21; obj_t x1, x1_pack; obj_t x2; obj_t y1, y1_pack; obj_t y2; dim_t mn; dim_t ij; dim_t b_alg; // Even though this blocked algorithm is expressed only in terms of the // lower triangular case, the upper triangular case is still supported: // when bli_acquire_mpart_tl2br() is passed a matrix that is stored in // in the upper triangle, and the requested subpartition resides in the // lower triangle (as is the case for this algorithm), the routine fills // the request as if the caller had actually requested the corresponding // "mirror" subpartition in the upper triangle, except that it marks the // subpartition for transposition (and conjugation). 
// Initialize objects for packing. bli_obj_init_pack( &c11_pack ); bli_obj_init_pack( &x1_pack ); bli_obj_init_pack( &y1_pack ); // Query dimension. mn = bli_obj_length( c ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( ij, mn, c, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for C11, C21, x1, x2, y1, and y2. bli_acquire_mpart_tl2br( BLIS_SUBPART11, ij, b_alg, c, &c11 ); bli_acquire_mpart_tl2br( BLIS_SUBPART21, ij, b_alg, c, &c21 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_f2b( BLIS_SUBPART2, ij, b_alg, x, &x2 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, y, &y1 ); bli_acquire_vpart_f2b( BLIS_SUBPART2, ij, b_alg, y, &y2 ); // Initialize objects for packing C11, x1, and y1 (if needed). bli_packm_init( &c11, &c11_pack, cntx, bli_cntl_sub_packm_c11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_init( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // Copy/pack C11, x1, y1 (if needed). bli_packm_int( &c11, &c11_pack, cntx, bli_cntl_sub_packm_c11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_int( &y1, &y1_pack, cntx, bli_cntl_sub_packv_y1( cntl ) ); // C21 = C21 + alpha * x2 * y1'; bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha, &x2, &y1_pack, &c21, cntx, bli_cntl_sub_ger_cp( cntl ) ); // C21 = C21 + conj(alpha) * y2 * x1'; bli_ger_int( BLIS_NO_CONJUGATE, conjh, alpha_conj, &y2, &x1_pack, &c21, cntx, bli_cntl_sub_ger_cp( cntl ) ); // C11 = C11 + alpha * x1 * y1' + conj(alpha) * y1 * x1'; bli_her2_int( conjh, alpha, alpha_conj, &x1_pack, &y1_pack, &c11_pack, cntx, bli_cntl_sub_her2( cntl ) ); // Copy/unpack C11 (if C11 was packed). 
bli_unpackm_int( &c11_pack, &c11, cntx, bli_cntl_sub_unpackm_c11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. bli_packm_release( &c11_pack, bli_cntl_sub_packm_c11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); bli_packv_release( &y1_pack, bli_cntl_sub_packv_y1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/her2/other/bli_her2_cntl.c000066400000000000000000000142261427272030600237310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern packm_t* packm_cntl; extern packv_t* packv_cntl; extern unpackm_t* unpackm_cntl; extern ger_t* ger_cntl_rp_bs_row; extern ger_t* ger_cntl_cp_bs_col; her2_t* her2_cntl_bs_ke_lrow_ucol = NULL; her2_t* her2_cntl_bs_ke_lcol_urow = NULL; her2_t* her2_cntl_ge_lrow_ucol = NULL; her2_t* her2_cntl_ge_lcol_urow = NULL; void bli_her2_cntl_init() { // Create control trees for the lowest-level kernels. These trees induce // operations on (persumably) relatively small block-subvector problems. her2_cntl_bs_ke_lrow_ucol = bli_her2_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT1, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); her2_cntl_bs_ke_lcol_urow = bli_her2_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT4, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control trees for generally large problems. Here, we choose // variants that partition for ger subproblems in the same direction // as the assumed storage. 
her2_cntl_ge_lrow_ucol = bli_her2_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, BLIS_M2, packv_cntl, // pack x1 (if needed) packv_cntl, // pack y1 (if needed) packm_cntl, // pack C11 (if needed) ger_cntl_rp_bs_row, ger_cntl_rp_bs_row, her2_cntl_bs_ke_lrow_ucol, unpackm_cntl ); // unpack C11 (if packed) her2_cntl_ge_lcol_urow = bli_her2_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT4, BLIS_M2, packv_cntl, // pack x1 (if needed) packv_cntl, // pack y1 (if needed) packm_cntl, // pack C11 (if needed) ger_cntl_cp_bs_col, ger_cntl_cp_bs_col, her2_cntl_bs_ke_lcol_urow, unpackm_cntl ); // unpack C11 (if packed) } void bli_her2_cntl_finalize() { bli_cntl_free_node( her2_cntl_bs_ke_lrow_ucol ); bli_cntl_free_node( her2_cntl_bs_ke_lcol_urow ); bli_cntl_free_node( her2_cntl_ge_lrow_ucol ); bli_cntl_free_node( her2_cntl_ge_lcol_urow ); } her2_t* bli_her2_cntl_obj_create( impl_t impl_type, varnum_t var_num, bszid_t bszid, packv_t* sub_packv_x1, packv_t* sub_packv_y1, packm_t* sub_packm_c11, ger_t* sub_ger_rp, ger_t* sub_ger_cp, her2_t* sub_her2, unpackm_t* sub_unpackm_c11 ) { her2_t* cntl; cntl = ( her2_t* ) bli_malloc_intl( sizeof(her2_t) ); cntl->impl_type = impl_type; cntl->var_num = var_num; cntl->bszid = bszid; cntl->sub_packv_x1 = sub_packv_x1; cntl->sub_packv_y1 = sub_packv_y1; cntl->sub_packm_c11 = sub_packm_c11; cntl->sub_ger_rp = sub_ger_rp; cntl->sub_ger_cp = sub_ger_cp; cntl->sub_her2 = sub_her2; cntl->sub_unpackm_c11 = sub_unpackm_c11; return cntl; } void bli_her2_cntl_obj_init( her2_t* cntl, impl_t impl_type, varnum_t var_num, bszid_t bszid, packv_t* sub_packv_x1, packv_t* sub_packv_y1, packm_t* sub_packm_c11, ger_t* sub_ger_rp, ger_t* sub_ger_cp, her2_t* sub_her2, unpackm_t* sub_unpackm_c11 ) { cntl->impl_type = impl_type; cntl->var_num = var_num; cntl->bszid = bszid; cntl->sub_packv_x1 = sub_packv_x1; cntl->sub_packv_y1 = sub_packv_y1; cntl->sub_packm_c11 = sub_packm_c11; cntl->sub_ger_rp = sub_ger_rp; cntl->sub_ger_cp = sub_ger_cp; cntl->sub_her2 = sub_her2; 
cntl->sub_unpackm_c11 = sub_unpackm_c11; } cython-blis-0.9.1/blis/_src/frame/2/her2/other/bli_her2_cntl.h000066400000000000000000000063701427272030600237370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ struct her2_s { impl_t impl_type; varnum_t var_num; bszid_t bszid; struct packv_s* sub_packv_x1; struct packv_s* sub_packv_y1; struct packm_s* sub_packm_c11; struct ger_s* sub_ger_rp; struct ger_s* sub_ger_cp; struct her2_s* sub_her2; struct unpackm_s* sub_unpackm_c11; }; typedef struct her2_s her2_t; #define bli_cntl_sub_her2( cntl ) cntl->sub_her2 void bli_her2_cntl_init( void ); void bli_her2_cntl_finalize( void ); her2_t* bli_her2_cntl_obj_create( impl_t impl_type, varnum_t var_num, bszid_t bszid, packv_t* sub_packv_x1, packv_t* sub_packv_y1, packm_t* sub_packm_c11, ger_t* sub_ger_rp, ger_t* sub_ger_cp, her2_t* sub_her2, unpackm_t* sub_unpackm_c11 ); void bli_her2_cntl_obj_init( her2_t* cntl, impl_t impl_type, varnum_t var_num, bszid_t bszid, packv_t* sub_packv_x1, packv_t* sub_packv_y1, packm_t* sub_packm_c11, ger_t* sub_ger_rp, ger_t* sub_ger_cp, her2_t* sub_her2, unpackm_t* sub_unpackm_c11 ); cython-blis-0.9.1/blis/_src/frame/2/her2/other/bli_her2_front.c000066400000000000000000000152131427272030600241160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern her2_t* her2_cntl_bs_ke_lrow_ucol; extern her2_t* her2_cntl_bs_ke_lcol_urow; extern her2_t* her2_cntl_ge_lrow_ucol; extern her2_t* her2_cntl_ge_lcol_urow; void bli_her2_front ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* c, cntx_t* cntx ) { her2_t* her2_cntl; num_t dt_targ_x; num_t dt_targ_y; //num_t dt_targ_c; bool x_has_unit_inc; bool y_has_unit_inc; bool c_has_unit_inc; obj_t alpha_local; obj_t alpha_conj_local; num_t dt_alpha; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_her2_check( alpha, x, y, c ); // Query the target datatypes of each object. dt_targ_x = bli_obj_target_dt( x ); dt_targ_y = bli_obj_target_dt( y ); //dt_targ_c = bli_obj_target_dt( c ); // Determine whether each operand with unit stride. x_has_unit_inc = ( bli_obj_vector_inc( x ) == 1 ); y_has_unit_inc = ( bli_obj_vector_inc( y ) == 1 ); c_has_unit_inc = ( bli_obj_is_row_stored( c ) || bli_obj_is_col_stored( c ) ); // Create an object to hold a copy-cast of alpha. Notice that we use // the type union of the datatypes of x and y. dt_alpha = bli_dt_union( dt_targ_x, dt_targ_y ); bli_obj_scalar_init_detached_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); // Also create a conjugated copy of alpha. 
bli_obj_scalar_init_detached_copy_of( dt_alpha, BLIS_CONJUGATE, alpha, &alpha_conj_local ); // If all operands have unit stride, we choose a control tree for calling // the unblocked implementation directly without any blocking. if ( x_has_unit_inc && y_has_unit_inc && c_has_unit_inc ) { // We use two control trees to handle the four cases corresponding to // combinations of upper/lower triangular storage and row/column-storage. // The row-stored lower triangular and column-stored upper triangular // trees are identical. Same for the remaining two trees. if ( bli_obj_is_lower( c ) ) { if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_bs_ke_lrow_ucol; else her2_cntl = her2_cntl_bs_ke_lcol_urow; } else // if ( bli_obj_is_upper( c ) ) { if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_bs_ke_lcol_urow; else her2_cntl = her2_cntl_bs_ke_lrow_ucol; } } else { // Mark objects with unit stride as already being packed. This prevents // unnecessary packing from happening within the blocked algorithm. if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x ); if ( y_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, y ); if ( c_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, c ); // Here, we make a similar choice as above, except that (1) we look // at storage tilt, and (2) we choose a tree that performs blocking. if ( bli_obj_is_lower( c ) ) { if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_ge_lrow_ucol; else her2_cntl = her2_cntl_ge_lcol_urow; } else // if ( bli_obj_is_upper( c ) ) { if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_ge_lcol_urow; else her2_cntl = her2_cntl_ge_lrow_ucol; } } // Invoke the internal back-end with the copy-cast scalar and the // chosen control tree. Set conjh to BLIS_CONJUGATE to invoke the // Hermitian (and not symmetric) algorithms. 
bli_her2_int( BLIS_CONJUGATE, &alpha_local, &alpha_conj_local, x, y, c, cntx, her2_cntl ); } // // Define BLAS-like interfaces with homogeneous-typed operands. // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao, xo, yo, co; \ \ inc_t rs_x, cs_x; \ inc_t rs_y, cs_y; \ \ rs_x = incx; cs_x = m * incx; \ rs_y = incy; cs_y = m * incy; \ \ bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ \ bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \ bli_obj_create_with_attached_buffer( dt, m, 1, y, rs_y, cs_y, &yo ); \ bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_conj( conjx, &xo ); \ bli_obj_set_conj( conjy, &yo ); \ bli_obj_set_uplo( uploc, &co ); \ \ bli_obj_set_struc( BLIS_HERMITIAN, &co ); \ \ PASTEMAC0(opname)( &alphao, \ &xo, \ &yo, \ &co, \ cntx ); \ } INSERT_GENTFUNC_BASIC0( her2_front ) cython-blis-0.9.1/blis/_src/frame/2/her2/other/bli_her2_front.h000066400000000000000000000042321427272030600241220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

// Object-based front-end for the Hermitian rank-2 update; the definition
// lives in bli_her2_front.c. Operands are alpha (scalar), x and y (vectors),
// c (matrix) and the context.
void bli_her2_front
     (
       obj_t*  alpha,
       obj_t*  x,
       obj_t*  y,
       obj_t*  c,
       cntx_t* cntx
     );

// Prototype template for the typed (raw-buffer) interface. The function
// name is formed by PASTEMAC(ch,opname) and the buffer element type is
// ctype; vectors carry an increment, the matrix carries row and column
// strides.
// NOTE(review): INSERT_GENTPROT_BASIC presumably instantiates the template
// once per supported datatype -- confirm against the macro definition.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploc, \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx  \
     );

INSERT_GENTPROT_BASIC( her2_front )
cython-blis-0.9.1/blis/_src/frame/2/her2/other/bli_her2_int.c000066400000000000000000000107341427272030600235630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T her2_fp typedef void (*FUNCPTR_T)( conj_t conjh, obj_t* alpha, obj_t* alpha_conj, obj_t* x, obj_t* y, obj_t* c, cntx_t* cntx, her2_t* cntl ); static FUNCPTR_T vars[4][3] = { // unblocked unblocked with fusing blocked { bli_her2_unb_var1, bli_her2_unf_var1, bli_her2_blk_var1 }, { bli_her2_unb_var2, NULL, bli_her2_blk_var2 }, { bli_her2_unb_var3, NULL, bli_her2_blk_var3 }, { bli_her2_unb_var4, bli_her2_unf_var4, bli_her2_blk_var4 }, }; void bli_her2_int( conj_t conjh, obj_t* alpha, obj_t* alpha_conj, obj_t* x, obj_t* y, obj_t* c, cntx_t* cntx, her2_t* cntl ) { varnum_t n; impl_t i; FUNCPTR_T f; obj_t alpha_local; obj_t alpha_conj_local; obj_t x_local; obj_t y_local; obj_t c_local; // Check parameters. 
if ( bli_error_checking_is_enabled() ) { if ( bli_is_conj( conjh ) ) bli_her2_check( alpha, x, y, c ); else bli_syr2_check( alpha, x, y, c ); } // If C, x, or y has a zero dimension, return early. if ( bli_obj_has_zero_dim( c ) ) return; if ( bli_obj_has_zero_dim( x ) ) return; if ( bli_obj_has_zero_dim( y ) ) return; // Alias the operands in case we need to apply conjugations. bli_obj_alias_to( x, &x_local ); bli_obj_alias_to( y, &y_local ); bli_obj_alias_to( c, &c_local ); // If matrix C is marked for conjugation, we interpret this as a request // to apply a conjugation to the other operands. if ( bli_obj_has_conj( &c_local ) ) { bli_obj_toggle_conj( &c_local ); bli_obj_toggle_conj( &x_local ); bli_obj_toggle_conj( &y_local ); bli_obj_scalar_init_detached_copy_of( bli_obj_dt( alpha ), BLIS_CONJUGATE, alpha, &alpha_local ); bli_obj_scalar_init_detached_copy_of( bli_obj_dt( alpha_conj ), BLIS_CONJUGATE, alpha_conj, &alpha_conj_local ); } else { bli_obj_alias_to( *alpha, alpha_local ); bli_obj_alias_to( *alpha_conj, alpha_conj_local ); } // Extract the variant number and implementation type. n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[n][i]; // Invoke the variant. f( conjh, &alpha_local, &alpha_conj_local, &x_local, &y_local, &c_local, cntx, cntl ); } cython-blis-0.9.1/blis/_src/frame/2/her2/other/bli_her2_int.h000066400000000000000000000036231427272030600235670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

// Internal back-end for the rank-2 update; the definition lives in
// bli_her2_int.c. conjh distinguishes the Hermitian from the symmetric
// case, alpha and alpha_conj are the two scalars, x, y and c are the
// operands, and cntl is the control tree node that selects the variant
// to run.
void bli_her2_int( conj_t  conjh,
                   obj_t*  alpha,
                   obj_t*  alpha_conj,
                   obj_t*  x,
                   obj_t*  y,
                   obj_t*  c,
                   cntx_t* cntx,
                   her2_t* cntl );
cython-blis-0.9.1/blis/_src/frame/2/symv/000077500000000000000000000000001427272030600200675ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/symv/bli_symv.h000066400000000000000000000033441427272030600220700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" cython-blis-0.9.1/blis/_src/frame/2/symv/other/000077500000000000000000000000001427272030600212105ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/symv/other/bli_symv_front.c000066400000000000000000000162641427272030600244210ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern hemv_t* hemv_cntl_bs_ke_lrow_ucol; extern hemv_t* hemv_cntl_bs_ke_lcol_urow; extern hemv_t* hemv_cntl_ge_lrow_ucol; extern hemv_t* hemv_cntl_ge_lcol_urow; void bli_symv_front ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y, cntx_t* cntx ) { hemv_t* hemv_cntl; num_t dt_targ_a; num_t dt_targ_x; num_t dt_targ_y; bool a_has_unit_inc; bool x_has_unit_inc; bool y_has_unit_inc; obj_t alpha_local; obj_t beta_local; num_t dt_alpha; num_t dt_beta; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_symv_check( alpha, a, x, beta, y ); // Query the target datatypes of each object. dt_targ_a = bli_obj_target_dt( a ); dt_targ_x = bli_obj_target_dt( x ); dt_targ_y = bli_obj_target_dt( y ); // Determine whether each operand with unit stride. 
a_has_unit_inc = ( bli_obj_is_row_stored( a ) || bli_obj_is_col_stored( a ) ); x_has_unit_inc = ( bli_obj_vector_inc( x ) == 1 ); y_has_unit_inc = ( bli_obj_vector_inc( y ) == 1 ); // Create an object to hold a copy-cast of alpha. Notice that we use // the type union of the target datatypes of a and x to prevent any // unnecessary loss of information during the computation. dt_alpha = bli_dt_union( dt_targ_a, dt_targ_x ); bli_obj_scalar_init_detached_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); // Create an object to hold a copy-cast of beta. Notice that we use // the datatype of y. Here's why: If y is real and beta is complex, // there is no reason to keep beta_local in the complex domain since // the complex part of beta*y will not be stored. If y is complex and // beta is real then beta is harmlessly promoted to complex. dt_beta = dt_targ_y; bli_obj_scalar_init_detached_copy_of( dt_beta, BLIS_NO_CONJUGATE, beta, &beta_local ); // If all operands have unit stride, we choose a control tree for calling // the unblocked implementation directly without any blocking. if ( a_has_unit_inc && x_has_unit_inc && y_has_unit_inc ) { // We use two control trees to handle the four cases corresponding to // combinations of upper/lower triangular storage and row/column-storage. // The row-stored lower triangular and column-stored upper triangular // trees are identical. Same for the remaining two trees. if ( bli_obj_is_lower( a ) ) { if ( bli_obj_is_row_stored( a ) ) hemv_cntl = hemv_cntl_bs_ke_lrow_ucol; else hemv_cntl = hemv_cntl_bs_ke_lcol_urow; } else // if ( bli_obj_is_upper( a ) ) { if ( bli_obj_is_row_stored( a ) ) hemv_cntl = hemv_cntl_bs_ke_lcol_urow; else hemv_cntl = hemv_cntl_bs_ke_lrow_ucol; } } else { // Mark objects with unit stride as already being packed. This prevents // unnecessary packing from happening within the blocked algorithm. 
if ( a_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, a ); if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x ); if ( y_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, y ); // Here, we make a similar choice as above, except that (1) we look // at storage tilt, and (2) we choose a tree that performs blocking. if ( bli_obj_is_lower( a ) ) { if ( bli_obj_is_row_tilted( a ) ) hemv_cntl = hemv_cntl_ge_lrow_ucol; else hemv_cntl = hemv_cntl_ge_lcol_urow; } else // if ( bli_obj_is_upper( a ) ) { if ( bli_obj_is_row_tilted( a ) ) hemv_cntl = hemv_cntl_ge_lcol_urow; else hemv_cntl = hemv_cntl_ge_lrow_ucol; } } // Invoke the internal back-end with the copy-casts of scalars and the // chosen control tree. Set conjh to BLIS_NO_CONJUGATE to invoke the // symmetric (and not Hermitian) algorithms. bli_hemv_int( BLIS_NO_CONJUGATE, &alpha_local, a, x, &beta_local, y, cntx, hemv_cntl ); } // // Define BLAS-like interfaces with homogeneous-typed operands. // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao, ao, xo, betao, yo; \ \ inc_t rs_x, cs_x; \ inc_t rs_y, cs_y; \ \ rs_x = incx; cs_x = m * incx; \ rs_y = incy; cs_y = m * incy; \ \ bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ bli_obj_create_1x1_with_attached_buffer( dt, beta, &betao ); \ \ bli_obj_create_with_attached_buffer( dt, m, m, a, rs_a, cs_a, &ao ); \ bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \ bli_obj_create_with_attached_buffer( dt, m, 1, y, rs_y, cs_y, &yo ); \ \ bli_obj_set_uplo( uploa, &ao ); \ bli_obj_set_conj( conja, &ao ); \ bli_obj_set_conj( conjx, &xo ); \ \ bli_obj_set_struc( BLIS_SYMMETRIC, &ao ); \ \ PASTEMAC0(opname)( &alphao, \ &ao, \ 
&xo, \ &betao, \ &yo, \ cntx ); \ } INSERT_GENTFUNC_BASIC0( symv_front ) cython-blis-0.9.1/blis/_src/frame/2/symv/other/bli_symv_front.h000066400000000000000000000042761427272030600244260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

// Object-based front-end for the symmetric matrix-vector product; the
// definition lives in bli_symv_front.c. Operands are the scalars alpha and
// beta, the matrix a, the vectors x and y, and the context.
void bli_symv_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  x,
       obj_t*  beta,
       obj_t*  y,
       cntx_t* cntx
     );

// Prototype template for the typed (raw-buffer) interface. The function
// name is formed by PASTEMAC(ch,opname) and the buffer element type is
// ctype; the matrix carries row and column strides, the vectors carry an
// increment.
// NOTE(review): INSERT_GENTPROT_BASIC presumably instantiates the template
// once per supported datatype -- confirm against the macro definition.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploa, \
       conj_t  conja, \
       conj_t  conjx, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       cntx_t* cntx  \
     );

INSERT_GENTPROT_BASIC( symv_front )
cython-blis-0.9.1/blis/_src/frame/2/syr/000077500000000000000000000000001427272030600177065ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/syr/bli_syr.h000066400000000000000000000033431427272030600215250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" cython-blis-0.9.1/blis/_src/frame/2/syr/other/000077500000000000000000000000001427272030600210275ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/syr/other/bli_syr_front.c000066400000000000000000000135601427272030600240530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern her_t* her_cntl_bs_ke_lrow_ucol; extern her_t* her_cntl_bs_ke_lcol_urow; extern her_t* her_cntl_ge_lrow_ucol; extern her_t* her_cntl_ge_lcol_urow; void bli_syr_front ( obj_t* alpha, obj_t* x, obj_t* c, cntx_t* cntx ) { her_t* her_cntl; num_t dt_targ_x; num_t dt_targ_c; bool x_has_unit_inc; bool c_has_unit_inc; obj_t alpha_local; num_t dt_alpha; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_syr_check( alpha, x, c ); // Query the target datatypes of each object. dt_targ_x = bli_obj_target_dt( x ); dt_targ_c = bli_obj_target_dt( c ); // Determine whether each operand with unit stride. x_has_unit_inc = ( bli_obj_vector_inc( x ) == 1 ); c_has_unit_inc = ( bli_obj_is_row_stored( c ) || bli_obj_is_col_stored( c ) ); // Create an object to hold a copy-cast of alpha. Notice that we use // the type union of the target datatypes of x and c to prevent any // unnecessary loss of information during the computation. dt_alpha = bli_dt_union( dt_targ_x, dt_targ_c ); bli_obj_scalar_init_detached_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); // If all operands have unit stride, we choose a control tree for calling // the unblocked implementation directly without any blocking. if ( x_has_unit_inc && c_has_unit_inc ) { // We use two control trees to handle the four cases corresponding to // combinations of upper/lower triangular storage and row/column-storage. 
// The row-stored lower triangular and column-stored upper triangular // trees are identical. Same for the remaining two trees. if ( bli_obj_is_lower( c ) ) { if ( bli_obj_is_row_stored( c ) ) her_cntl = her_cntl_bs_ke_lrow_ucol; else her_cntl = her_cntl_bs_ke_lcol_urow; } else // if ( bli_obj_is_upper( c ) ) { if ( bli_obj_is_row_stored( c ) ) her_cntl = her_cntl_bs_ke_lcol_urow; else her_cntl = her_cntl_bs_ke_lrow_ucol; } } else { // Mark objects with unit stride as already being packed. This prevents // unnecessary packing from happening within the blocked algorithm. if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x ); if ( c_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, c ); // Here, we make a similar choice as above, except that (1) we look // at storage tilt, and (2) we choose a tree that performs blocking. if ( bli_obj_is_lower( c ) ) { if ( bli_obj_is_row_stored( c ) ) her_cntl = her_cntl_ge_lrow_ucol; else her_cntl = her_cntl_ge_lcol_urow; } else // if ( bli_obj_is_upper( c ) ) { if ( bli_obj_is_row_stored( c ) ) her_cntl = her_cntl_ge_lcol_urow; else her_cntl = her_cntl_ge_lrow_ucol; } } // Invoke the internal back-end with the copy-cast scalar and the // chosen control tree. Set conjh to BLIS_NO_CONJUGATE to invoke the // symmetric (and not Hermitian) algorithms. bli_her_int( BLIS_NO_CONJUGATE, &alpha_local, x, c, cntx, her_cntl ); } // // Define BLAS-like interfaces with homogeneous-typed operands. 
// #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao, xo, co; \ \ inc_t rs_x, cs_x; \ \ rs_x = incx; cs_x = m * incx; \ \ bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ \ bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \ bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_conj( conjx, &xo ); \ bli_obj_set_uplo( uploc, &co ); \ \ bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \ \ PASTEMAC0(opname)( &alphao, \ &xo, \ &co, \ cntx ); \ } INSERT_GENTFUNC_BASIC0( syr_front ) cython-blis-0.9.1/blis/_src/frame/2/syr/other/bli_syr_front.h000066400000000000000000000041141427272030600240530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_syr_front ( obj_t* alpha, obj_t* x, obj_t* c, cntx_t* cntx ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC( syr_front ) cython-blis-0.9.1/blis/_src/frame/2/syr2/000077500000000000000000000000001427272030600177705ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/syr2/bli_syr2.h000066400000000000000000000033441427272030600216720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" cython-blis-0.9.1/blis/_src/frame/2/syr2/other/000077500000000000000000000000001427272030600211115ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/syr2/other/bli_syr2_front.c000066400000000000000000000145621427272030600242220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern her2_t* her2_cntl_bs_ke_lrow_ucol; extern her2_t* her2_cntl_bs_ke_lcol_urow; extern her2_t* her2_cntl_ge_lrow_ucol; extern her2_t* her2_cntl_ge_lcol_urow; void bli_syr2_front ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* c, cntx_t* cntx ) { her2_t* her2_cntl; num_t dt_targ_x; num_t dt_targ_y; //num_t dt_targ_c; bool x_has_unit_inc; bool y_has_unit_inc; bool c_has_unit_inc; obj_t alpha_local; num_t dt_alpha; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_syr2_check( alpha, x, y, c ); // Query the target datatypes of each object. dt_targ_x = bli_obj_target_dt( x ); dt_targ_y = bli_obj_target_dt( y ); //dt_targ_c = bli_obj_target_dt( c ); // Determine whether each operand with unit stride. x_has_unit_inc = ( bli_obj_vector_inc( x ) == 1 ); y_has_unit_inc = ( bli_obj_vector_inc( y ) == 1 ); c_has_unit_inc = ( bli_obj_is_row_stored( c ) || bli_obj_is_col_stored( c ) ); // Create an object to hold a copy-cast of alpha. Notice that we use // the type union of the datatypes of x and y. 
dt_alpha = bli_dt_union( dt_targ_x, dt_targ_y ); bli_obj_scalar_init_detached_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); // If all operands have unit stride, we choose a control tree for calling // the unblocked implementation directly without any blocking. if ( x_has_unit_inc && y_has_unit_inc && c_has_unit_inc ) { // We use two control trees to handle the four cases corresponding to // combinations of upper/lower triangular storage and row/column-storage. // The row-stored lower triangular and column-stored upper triangular // trees are identical. Same for the remaining two trees. if ( bli_obj_is_lower( c ) ) { if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_bs_ke_lrow_ucol; else her2_cntl = her2_cntl_bs_ke_lcol_urow; } else // if ( bli_obj_is_upper( c ) ) { if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_bs_ke_lcol_urow; else her2_cntl = her2_cntl_bs_ke_lrow_ucol; } } else { // Mark objects with unit stride as already being packed. This prevents // unnecessary packing from happening within the blocked algorithm. if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x ); if ( y_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, y ); if ( c_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, c ); // Here, we make a similar choice as above, except that (1) we look // at storage tilt, and (2) we choose a tree that performs blocking. if ( bli_obj_is_lower( c ) ) { if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_ge_lrow_ucol; else her2_cntl = her2_cntl_ge_lcol_urow; } else // if ( bli_obj_is_upper( c ) ) { if ( bli_obj_is_row_stored( c ) ) her2_cntl = her2_cntl_ge_lcol_urow; else her2_cntl = her2_cntl_ge_lrow_ucol; } } // Invoke the internal back-end with the copy-cast scalar and the // chosen control tree. Set conjh to BLIS_NO_CONJUGATE to invoke the // symmetric (and not Hermitian) algorithms. 
bli_her2_int( BLIS_NO_CONJUGATE, &alpha_local, &alpha_local, x, y, c, cntx, her2_cntl ); } // // Define BLAS-like interfaces with homogeneous-typed operands. // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao, xo, yo, co; \ \ inc_t rs_x, cs_x; \ inc_t rs_y, cs_y; \ \ rs_x = incx; cs_x = m * incx; \ rs_y = incy; cs_y = m * incy; \ \ bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ \ bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \ bli_obj_create_with_attached_buffer( dt, m, 1, y, rs_y, cs_y, &yo ); \ bli_obj_create_with_attached_buffer( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_conj( conjx, &xo ); \ bli_obj_set_conj( conjy, &yo ); \ bli_obj_set_uplo( uploc, &co ); \ \ bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \ \ PASTEMAC0(opname)( &alphao, \ &xo, \ &yo, \ &co, \ cntx ); \ } INSERT_GENTFUNC_BASIC0( syr2_front ) cython-blis-0.9.1/blis/_src/frame/2/syr2/other/bli_syr2_front.h000066400000000000000000000042321427272030600242200ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_syr2_front ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* c, cntx_t* cntx ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC( syr2_front ) cython-blis-0.9.1/blis/_src/frame/2/trmv/000077500000000000000000000000001427272030600200615ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/trmv/bli_trmv.h000066400000000000000000000034701427272030600220540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" #include "bli_trmv_var.h" cython-blis-0.9.1/blis/_src/frame/2/trmv/bli_trmv_unb_var1.c000066400000000000000000000106521427272030600236440ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

//
// trmv_unb_var1: unblocked trmv variant built on the dotv kernel.
//
// GENTFUNC is a function template that INSERT_GENTFUNC_BASIC0 (defined
// elsewhere) expands -- presumably once per supported datatype. Every
// generated function overwrites the m-element vector x in place, one element
// (chi1) per loop iteration, following the inline formulas:
//
//   chi1 = alpha * alpha11 * chi1;
//   chi1 = chi1 + alpha * (row part of a) * (not yet updated part of x);
//
// Parameters:
//   uploa          - which triangle of a is stored, before transposition.
//   transa         - transposition and/or conjugation to apply to a.
//   diaga          - the diagonal element alpha11 is only read when this is
//                    a non-unit diagonal.
//   m              - loop trip count; x has m elements.
//   alpha          - pointer to the scalar.
//   a, rs_a, cs_a  - matrix buffer with its row and column strides.
//   x, incx        - vector buffer and its increment; updated in place.
//   cntx           - context that is queried for the dotv kernel.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	ctype*  a10t; \
	ctype*  alpha11; \
	ctype*  a12t; \
	ctype*  x0; \
	ctype*  chi1; \
	ctype*  x2; \
	ctype   alpha_alpha11_conj; \
	ctype   rho; \
	dim_t   iter, i; \
	dim_t   n_ahead; \
	inc_t   rs_at, cs_at; \
	uplo_t  uploa_trans; \
	conj_t  conja; \
\
	/* A transposition is realized by swapping the two strides and toggling */ \
	/* uplo, so the code below only distinguishes effective upper/lower. */ \
	if ( bli_does_notrans( transa ) ) \
	{ \
		rs_at = rs_a; \
		cs_at = cs_a; \
		uploa_trans = uploa; \
	} \
	else /* if ( bli_does_trans( transa ) ) */ \
	{ \
		rs_at = cs_a; \
		cs_at = rs_a; \
		uploa_trans = bli_uplo_toggled( uploa ); \
	} \
\
	/* Only the conjugation part of transa is still needed from here on. */ \
	conja = bli_extract_conj( transa ); \
\
	PASTECH(ch,dotv_ker_ft) kfp_dv; \
\
	/* Query the context for the kernel function pointer. */ \
	kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \
\
	/* We reduce all of the possible cases down to just lower/upper. */ \
	if ( bli_is_upper( uploa_trans ) ) \
	{ \
		/* Walk from the first element to the last: x2 (the elements after */ \
		/* chi1) has not been written by any earlier iteration. */ \
		for ( iter = 0; iter < m; ++iter ) \
		{ \
			i        = iter; \
			n_ahead  = m - iter - 1; \
			alpha11  = a + (i  )*rs_at + (i  )*cs_at; \
			a12t     = a + (i  )*rs_at + (i+1)*cs_at; \
			chi1     = x + (i  )*incx; \
			x2       = x + (i+1)*incx; \
\
			/* chi1 = alpha * alpha11 * chi1; */ \
			PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
			if ( bli_is_nonunit_diag( diaga ) ) \
				PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
			PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi1 ); \
\
			/* chi1 = chi1 + alpha * a12t * x2; */ \
			kfp_dv \
			( \
			  conja, \
			  BLIS_NO_CONJUGATE, \
			  n_ahead, \
			  a12t, cs_at, \
			  x2,   incx, \
			  &rho, \
			  cntx \
			); \
			PASTEMAC(ch,axpys)( *alpha, rho, *chi1 ); \
		} \
	} \
	else /* if ( bli_is_lower( uploa_trans ) ) */ \
	{ \
		/* Walk from the last element to the first: x0 (the elements before */ \
		/* chi1) has not been written by any earlier iteration. */ \
		for ( iter = 0; iter < m; ++iter ) \
		{ \
			i        = m - iter - 1; \
			n_ahead  = i; \
			alpha11  = a + (i  )*rs_at + (i  )*cs_at; \
			a10t     = a + (i  )*rs_at + (0  )*cs_at; \
			chi1     = x + (i  )*incx; \
			x0       = x + (0  )*incx; \
\
			/* chi1 = alpha * alpha11 * chi1; */ \
			PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
			if ( bli_is_nonunit_diag( diaga ) ) \
				PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
			PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi1 ); \
\
			/* chi1 = chi1 + alpha * a10t * x0; */ \
			kfp_dv \
			( \
			  conja, \
			  BLIS_NO_CONJUGATE, \
			  n_ahead, \
			  a10t, cs_at, \
			  x0,   incx, \
			  &rho, \
			  cntx \
			); \
			PASTEMAC(ch,axpys)( *alpha, rho, *chi1 ); \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( trmv_unb_var1 )
cython-blis-0.9.1/blis/_src/frame/2/trmv/bli_trmv_unb_var2.c000066400000000000000000000106141427272030600236430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

//
// trmv_unb_var2: unblocked trmv variant built on the axpyv kernel.
//
// GENTFUNC is a function template that INSERT_GENTFUNC_BASIC0 (defined
// elsewhere) expands -- presumably once per supported datatype. Every
// generated function overwrites the m-element vector x in place. In each
// iteration the current element chi1 first contributes to the part of x that
// was already visited (x0 or x2), and is then itself scaled:
//
//   x0   = x0 + alpha * chi1 * a01;     (upper; x2/a21 in the lower case)
//   chi1 = alpha * alpha11 * chi1;
//
// Parameters:
//   uploa          - which triangle of a is stored, before transposition.
//   transa         - transposition and/or conjugation to apply to a.
//   diaga          - the diagonal element alpha11 is only read when this is
//                    a non-unit diagonal.
//   m              - loop trip count; x has m elements.
//   alpha          - pointer to the scalar.
//   a, rs_a, cs_a  - matrix buffer with its row and column strides.
//   x, incx        - vector buffer and its increment; updated in place.
//   cntx           - context that is queried for the axpyv kernel.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	ctype*  a01; \
	ctype*  alpha11; \
	ctype*  a21; \
	ctype*  x0; \
	ctype*  chi1; \
	ctype*  x2; \
	ctype   alpha_alpha11_conj; \
	ctype   alpha_chi1; \
	dim_t   iter, i; \
	dim_t   n_behind; \
	inc_t   rs_at, cs_at; \
	uplo_t  uploa_trans; \
	conj_t  conja; \
\
	/* A transposition is realized by swapping the two strides and toggling */ \
	/* uplo, so the code below only distinguishes effective upper/lower. */ \
	if ( bli_does_notrans( transa ) ) \
	{ \
		rs_at = rs_a; \
		cs_at = cs_a; \
		uploa_trans = uploa; \
	} \
	else /* if ( bli_does_trans( transa ) ) */ \
	{ \
		rs_at = cs_a; \
		cs_at = rs_a; \
		uploa_trans = bli_uplo_toggled( uploa ); \
	} \
\
	/* Only the conjugation part of transa is still needed from here on. */ \
	conja = bli_extract_conj( transa ); \
\
	PASTECH(ch,axpyv_ker_ft) kfp_av; \
\
	/* Query the context for the kernel function pointer. */ \
	kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \
\
	/* We reduce all of the possible cases down to just lower/upper. */ \
	if ( bli_is_upper( uploa_trans ) ) \
	{ \
		/* Walk from the first element to the last. chi1 is read for the */ \
		/* axpyv update of x0 before it is overwritten at the bottom of */ \
		/* the iteration. */ \
		for ( iter = 0; iter < m; ++iter ) \
		{ \
			i        = iter; \
			n_behind = i; \
			alpha11  = a + (i  )*rs_at + (i  )*cs_at; \
			a01      = a + (0  )*rs_at + (i  )*cs_at; \
			chi1     = x + (i  )*incx; \
			x0       = x + (0  )*incx; \
\
			/* x0 = x0 + alpha * chi1 * a01; */ \
			PASTEMAC(ch,scal2s)( *alpha, *chi1, alpha_chi1 ); \
			kfp_av \
			( \
			  conja, \
			  n_behind, \
			  &alpha_chi1, \
			  a01, rs_at, \
			  x0,  incx, \
			  cntx \
			); \
\
			/* chi1 = alpha * alpha11 * chi1; */ \
			PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
			if ( bli_is_nonunit_diag( diaga ) ) \
				PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
			PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi1 ); \
		} \
	} \
	else /* if ( bli_is_lower( uploa_trans ) ) */ \
	{ \
		/* Walk from the last element to the first; n_behind counts the */ \
		/* elements after chi1, which were visited in earlier iterations. */ \
		for ( iter = 0; iter < m; ++iter ) \
		{ \
			i        = m - iter - 1; \
			n_behind = iter; \
			alpha11  = a + (i  )*rs_at + (i  )*cs_at; \
			a21      = a + (i+1)*rs_at + (i  )*cs_at; \
			chi1     = x + (i  )*incx; \
			x2       = x + (i+1)*incx; \
\
			/* x2 = x2 + alpha * chi1 * a21; */ \
			PASTEMAC(ch,scal2s)( *alpha, *chi1, alpha_chi1 ); \
			kfp_av \
			( \
			  conja, \
			  n_behind, \
			  &alpha_chi1, \
			  a21, rs_at, \
			  x2,  incx, \
			  cntx \
			); \
\
			/* chi1 = alpha * alpha11 * chi1; */ \
			PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
			if ( bli_is_nonunit_diag( diaga ) ) \
				PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
			PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi1 ); \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( trmv_unb_var2 )
cython-blis-0.9.1/blis/_src/frame/2/trmv/bli_trmv_unf_var1.c000066400000000000000000000143571427272030600236560ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

//
// trmv_unf_var1: fused trmv variant built on the dotxf kernel.
//
// GENTFUNC is a function template that INSERT_GENTFUNC_BASIC0 (defined
// elsewhere) expands -- presumably once per supported datatype. Every
// generated function overwrites the m-element vector x in place, f elements
// (the subvector x1) per outer iteration, where f is obtained from
// bli_determine_blocksize_dim_f/_b() using the fusing factor b_fuse. Per
// outer iteration, following the inline formulas:
//
//   x1 = alpha * A11 * x1;        (diagonal block, scalar loops below)
//   x1 = x1 + alpha * A12 * x2;   (upper; A10 and x0 in the lower case,
//                                  done by the dotxf kernel)
//
// Parameters:
//   uploa          - which triangle of a is stored, before transposition.
//   transa         - transposition and/or conjugation to apply to a.
//   diaga          - the diagonal element alpha11 is only read when this is
//                    a non-unit diagonal.
//   m              - loop bound; x has m elements.
//   alpha          - pointer to the scalar.
//   a, rs_a, cs_a  - matrix buffer with its row and column strides.
//   x, incx        - vector buffer and its increment; updated in place.
//   cntx           - context that is queried for the dotxf kernel and for
//                    the BLIS_DF blocksize.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	ctype*  one        = PASTEMAC(ch,1); \
	ctype*  A10; \
	ctype*  A11; \
	ctype*  A12; \
	ctype*  a10t; \
	ctype*  alpha11; \
	ctype*  a12t; \
	ctype*  x0; \
	ctype*  x1; \
	ctype*  x2; \
	ctype*  x01; \
	ctype*  chi11; \
	ctype*  x21; \
	ctype   alpha_alpha11_conj; \
	ctype   rho1; \
	dim_t   iter, i, k, j, l; \
	dim_t   b_fuse, f; \
	dim_t   n_ahead, f_ahead; \
	inc_t   rs_at, cs_at; \
	uplo_t  uploa_trans; \
	conj_t  conja; \
\
	/* A transposition is realized by swapping the two strides and toggling */ \
	/* uplo, so the code below only distinguishes effective upper/lower. */ \
	if ( bli_does_notrans( transa ) ) \
	{ \
		rs_at = rs_a; \
		cs_at = cs_a; \
		uploa_trans = uploa; \
	} \
	else /* if ( bli_does_trans( transa ) ) */ \
	{ \
		rs_at = cs_a; \
		cs_at = rs_a; \
		uploa_trans = bli_uplo_toggled( uploa ); \
	} \
\
	/* Only the conjugation part of transa is still needed from here on. */ \
	conja = bli_extract_conj( transa ); \
\
	PASTECH(ch,dotxf_ker_ft) kfp_df; \
\
	/* Query the context for the kernel function pointer and fusing factor. */ \
	kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
\
	/* We reduce all of the possible cases down to just lower/upper. */ \
	if ( bli_is_upper( uploa_trans ) ) \
	{ \
		/* Walk forward in steps of f: x2 (the elements after x1) has not */ \
		/* been written by any earlier outer iteration. */ \
		for ( iter = 0; iter < m; iter += f ) \
		{ \
			f        = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
			i        = iter; \
			n_ahead  = m - iter - f; \
			A11      = a + (i  )*rs_at + (i  )*cs_at; \
			A12      = a + (i  )*rs_at + (i+f)*cs_at; \
			x1       = x + (i  )*incx; \
			x2       = x + (i+f)*incx; \
\
			/* x1 = alpha * A11 * x1; */ \
			for ( k = 0; k < f; ++k ) \
			{ \
				l        = k; \
				f_ahead  = f - l - 1; \
				alpha11  = A11 + (l  )*rs_at + (l  )*cs_at; \
				a12t     = A11 + (l  )*rs_at + (l+1)*cs_at; \
				chi11    = x1  + (l  )*incx; \
				x21      = x1  + (l+1)*incx; \
\
				/* chi11 = alpha * alpha11 * chi11; */ \
				PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
				if ( bli_is_nonunit_diag( diaga ) ) \
					PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
				PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi11 ); \
\
				/* chi11 = chi11 + alpha * a12t * x21; */ \
				PASTEMAC(ch,set0s)( rho1 ); \
				if ( bli_is_conj( conja ) ) \
				{ \
					for ( j = 0; j < f_ahead; ++j ) \
						PASTEMAC(ch,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
				} \
				else \
				{ \
					for ( j = 0; j < f_ahead; ++j ) \
						PASTEMAC(ch,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \
				} \
				PASTEMAC(ch,axpys)( *alpha, rho1, *chi11 ); \
			} \
\
			/* x1 = x1 + alpha * A12 * x2; */ \
			/* NOTE(review): the strides of A12 are passed as (cs_at, rs_at), */ \
			/* presumably so that the kernel sees the transposed block -- */ \
			/* confirm against the dotxf kernel interface. */ \
			kfp_df \
			( \
			  conja, \
			  BLIS_NO_CONJUGATE, \
			  n_ahead, \
			  f, \
			  alpha, \
			  A12, cs_at, rs_at, \
			  x2,  incx, \
			  one, \
			  x1,  incx, \
			  cntx \
			); \
		} \
	} \
	else /* if ( bli_is_lower( uploa_trans ) ) */ \
	{ \
		/* Walk backward in steps of f: x0 (the elements before x1) has not */ \
		/* been written by any earlier outer iteration. */ \
		for ( iter = 0; iter < m; iter += f ) \
		{ \
			f        = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
			i        = m - iter - f; \
			n_ahead  = i; \
			A11      = a + (i  )*rs_at + (i  )*cs_at; \
			A10      = a + (i  )*rs_at + (0  )*cs_at; \
			x1       = x + (i  )*incx; \
			x0       = x + (0  )*incx; \
\
			/* x1 = alpha * A11 * x1; */ \
			for ( k = 0; k < f; ++k ) \
			{ \
				l        = f - k - 1; \
				f_ahead  = l; \
				alpha11  = A11 + (l  )*rs_at + (l  )*cs_at; \
				a10t     = A11 + (l  )*rs_at + (0  )*cs_at; \
				chi11    = x1  + (l  )*incx; \
				x01      = x1  + (0  )*incx; \
\
				/* chi11 = alpha * alpha11 * chi11; */ \
				PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
				if ( bli_is_nonunit_diag( diaga ) ) \
					PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
				PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi11 ); \
\
				/* chi11 = chi11 + alpha * a10t * x01; */ \
				PASTEMAC(ch,set0s)( rho1 ); \
				if ( bli_is_conj( conja ) ) \
				{ \
					for ( j = 0; j < f_ahead; ++j ) \
						PASTEMAC(ch,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
				} \
				else \
				{ \
					for ( j = 0; j < f_ahead; ++j ) \
						PASTEMAC(ch,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \
				} \
				PASTEMAC(ch,axpys)( *alpha, rho1, *chi11 ); \
			} \
\
			/* x1 = x1 + alpha * A10 * x0; */ \
			kfp_df \
			( \
			  conja, \
			  BLIS_NO_CONJUGATE, \
			  n_ahead, \
			  f, \
			  alpha, \
			  A10, cs_at, rs_at, \
			  x0,  incx, \
			  one, \
			  x1,  incx, \
			  cntx \
			); \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( trmv_unf_var1 )
cython-blis-0.9.1/blis/_src/frame/2/trmv/bli_trmv_unf_var2.c000066400000000000000000000142161427272030600236510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

//
// trmv_unf_var2: fused trmv variant built on the axpyf kernel.
//
// GENTFUNC is a function template that INSERT_GENTFUNC_BASIC0 (defined
// elsewhere) expands -- presumably once per supported datatype. Every
// generated function overwrites the m-element vector x in place, f elements
// (the subvector x1) per outer iteration, where f is obtained from
// bli_determine_blocksize_dim_f/_b() using the fusing factor b_fuse. Per
// outer iteration, following the inline formulas:
//
//   x0 = x0 + alpha * A01 * x1;   (upper; A21 and x2 in the lower case,
//                                  done by the axpyf kernel)
//   x1 = alpha * A11 * x1;        (diagonal block, scalar loops below)
//
// The kernel call comes first so that it reads x1 before the diagonal-block
// loop overwrites it.
//
// Parameters:
//   uploa          - which triangle of a is stored, before transposition.
//   transa         - transposition and/or conjugation to apply to a.
//   diaga          - the diagonal element alpha11 is only read when this is
//                    a non-unit diagonal.
//   m              - loop bound; x has m elements.
//   alpha          - pointer to the scalar.
//   a, rs_a, cs_a  - matrix buffer with its row and column strides.
//   x, incx        - vector buffer and its increment; updated in place.
//   cntx           - context that is queried for the axpyf kernel and for
//                    the BLIS_AF blocksize.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	ctype*  A01; \
	ctype*  A11; \
	ctype*  A21; \
	ctype*  a01; \
	ctype*  alpha11; \
	ctype*  a21; \
	ctype*  x0; \
	ctype*  x1; \
	ctype*  x2; \
	ctype*  x01; \
	ctype*  chi11; \
	ctype*  x21; \
	ctype   alpha_alpha11_conj; \
	ctype   alpha_chi11; \
	dim_t   iter, i, k, j, l; \
	dim_t   b_fuse, f; \
	dim_t   n_behind, f_behind; \
	inc_t   rs_at, cs_at; \
	uplo_t  uploa_trans; \
	conj_t  conja; \
\
	/* A transposition is realized by swapping the two strides and toggling */ \
	/* uplo, so the code below only distinguishes effective upper/lower. */ \
	if ( bli_does_notrans( transa ) ) \
	{ \
		rs_at = rs_a; \
		cs_at = cs_a; \
		uploa_trans = uploa; \
	} \
	else /* if ( bli_does_trans( transa ) ) */ \
	{ \
		rs_at = cs_a; \
		cs_at = rs_a; \
		uploa_trans = bli_uplo_toggled( uploa ); \
	} \
\
	/* Only the conjugation part of transa is still needed from here on. */ \
	conja = bli_extract_conj( transa ); \
\
	PASTECH(ch,axpyf_ker_ft) kfp_af; \
\
	/* Query the context for the kernel function pointer and fusing factor. */ \
	kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
	b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \
\
	/* We reduce all of the possible cases down to just lower/upper. */ \
	if ( bli_is_upper( uploa_trans ) ) \
	{ \
		/* Walk forward in steps of f; x0 is the part of x that was already */ \
		/* visited by earlier outer iterations. */ \
		for ( iter = 0; iter < m; iter += f ) \
		{ \
			f        = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \
			i        = iter; \
			n_behind = i; \
			A11      = a + (i  )*rs_at + (i  )*cs_at; \
			A01      = a + (0  )*rs_at + (i  )*cs_at; \
			x1       = x + (i  )*incx; \
			x0       = x + (0  )*incx; \
\
			/* x0 = x0 + alpha * A01 * x1; */ \
			kfp_af \
			( \
			  conja, \
			  BLIS_NO_CONJUGATE, \
			  n_behind, \
			  f, \
			  alpha, \
			  A01, rs_at, cs_at, \
			  x1,  incx, \
			  x0,  incx, \
			  cntx \
			); \
\
			/* x1 = alpha * A11 * x1; */ \
			for ( k = 0; k < f; ++k ) \
			{ \
				l        = k; \
				f_behind = l; \
				alpha11  = A11 + (l  )*rs_at + (l  )*cs_at; \
				a01      = A11 + (0  )*rs_at + (l  )*cs_at; \
				chi11    = x1  + (l  )*incx; \
				x01      = x1  + (0  )*incx; \
\
				/* x01 = x01 + alpha * chi11 * a01; */ \
				PASTEMAC(ch,scal2s)( *alpha, *chi11, alpha_chi11 ); \
				if ( bli_is_conj( conja ) ) \
				{ \
					for ( j = 0; j < f_behind; ++j ) \
						PASTEMAC(ch,axpyjs)( alpha_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \
				} \
				else \
				{ \
					for ( j = 0; j < f_behind; ++j ) \
						PASTEMAC(ch,axpys)( alpha_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \
				} \
\
				/* chi11 = alpha * alpha11 * chi11; */ \
				PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
				if ( bli_is_nonunit_diag( diaga ) ) \
					PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
				PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi11 ); \
			} \
		} \
	} \
	else /* if ( bli_is_lower( uploa_trans ) ) */ \
	{ \
		/* Walk backward in steps of f; x2 is the part of x that was already */ \
		/* visited by earlier outer iterations (n_behind = iter elements). */ \
		for ( iter = 0; iter < m; iter += f ) \
		{ \
			f        = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \
			i        = m - iter - f; \
			n_behind = iter; \
			A11      = a + (i  )*rs_at + (i  )*cs_at; \
			A21      = a + (i+f)*rs_at + (i  )*cs_at; \
			x1       = x + (i  )*incx; \
			x2       = x + (i+f)*incx; \
\
			/* x2 = x2 + alpha * A21 * x1; */ \
			kfp_af \
			( \
			  conja, \
			  BLIS_NO_CONJUGATE, \
			  n_behind, \
			  f, \
			  alpha, \
			  A21, rs_at, cs_at, \
			  x1,  incx, \
			  x2,  incx, \
			  cntx \
			); \
\
			/* x1 = alpha * A11 * x1; */ \
			for ( k = 0; k < f; ++k ) \
			{ \
				l        = f - k - 1; \
				f_behind = k; \
				alpha11  = A11 + (l  )*rs_at + (l  )*cs_at; \
				a21      = A11 + (l+1)*rs_at + (l  )*cs_at; \
				chi11    = x1  + (l  )*incx; \
				x21      = x1  + (l+1)*incx; \
\
				/* x21 = x21 + alpha * chi11 * a21; */ \
				PASTEMAC(ch,scal2s)( *alpha, *chi11, alpha_chi11 ); \
				if ( bli_is_conj( conja ) ) \
				{ \
					for ( j = 0; j < f_behind; ++j ) \
						PASTEMAC(ch,axpyjs)( alpha_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \
				} \
				else \
				{ \
					for ( j = 0; j < f_behind; ++j ) \
						PASTEMAC(ch,axpys)( alpha_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \
				} \
\
				/* chi11 = alpha * alpha11 * chi11; */ \
				PASTEMAC(ch,copys)( *alpha, alpha_alpha11_conj ); \
				if ( bli_is_nonunit_diag( diaga ) ) \
					PASTEMAC(ch,scalcjs)( conja, *alpha11, alpha_alpha11_conj ); \
				PASTEMAC(ch,scals)( alpha_alpha11_conj, *chi11 ); \
			} \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( trmv_unf_var2 )
cython-blis-0.9.1/blis/_src/frame/2/trmv/bli_trmv_var.h000066400000000000000000000051641427272030600227260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/


//
// Prototype object-based interfaces.
//
// GENPROT( opname ) declares one object-based trmv variant taking the scalar
// alpha, the matrix a, the vector x, a context and a control-tree node.
// One prototype is emitted for each blocked (lower/upper), unblocked and
// unblocked-with-fusing variant listed below.
//

#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  x, \
       cntx_t* cntx, \
       cntl_t* cntl  \
     );

GENPROT( trmv_l_blk_var1 )
GENPROT( trmv_l_blk_var2 )
GENPROT( trmv_u_blk_var1 )
GENPROT( trmv_u_blk_var2 )

GENPROT( trmv_unb_var1 )
GENPROT( trmv_unb_var2 )

GENPROT( trmv_unf_var1 )
GENPROT( trmv_unf_var2 )


//
// Prototype BLAS-like interfaces with typed operands.
//
// GENTPROT( ctype, ch, varname ) declares the typed counterpart of a variant:
// raw buffers with explicit strides/increments instead of objects, and no
// control-tree argument. INSERT_GENTPROT_BASIC0 instantiates it once per
// basic datatype (presumably s, d, c, z -- the expansion is defined outside
// this header).
//

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx  \
     );

INSERT_GENTPROT_BASIC0( trmv_unb_var1 )
INSERT_GENTPROT_BASIC0( trmv_unb_var2 )

INSERT_GENTPROT_BASIC0( trmv_unf_var1 )
INSERT_GENTPROT_BASIC0( trmv_unf_var2 )

cython-blis-0.9.1/blis/_src/frame/2/trmv/bli_trmv_var_oapi.c000066400000000000000000000056121427272030600237270ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

/*
   Object-based front ends for the unblocked trmv variants.

   GENFRONT( opname, varname ) defines the object-based function for one
   variant. It unpacks the datatype, uplo/trans/diag attributes, dimension,
   buffers and strides from the objects, looks up the datatype-specific
   implementation of the same variant, and calls it with raw pointers.

   The cntl argument is accepted but not referenced in the body.
*/
#undef  GENFRONT
#define GENFRONT( opname, varname ) \
\
void PASTEMAC0(varname) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  x, \
       cntx_t* cntx, \
       cntl_t* cntl  \
     ) \
{ \
	bli_init_once(); \
\
	/* The datatype of A selects the typed implementation. */ \
	num_t     dt        = bli_obj_dt( a ); \
\
	uplo_t    uploa     = bli_obj_uplo( a ); \
	trans_t   transa    = bli_obj_conjtrans_status( a ); \
	diag_t    diaga     = bli_obj_diag( a ); \
\
	dim_t     m         = bli_obj_length( a ); \
\
	void*     buf_a     = bli_obj_buffer_at_off( a ); \
	inc_t     rs_a      = bli_obj_row_stride( a ); \
	inc_t     cs_a      = bli_obj_col_stride( a ); \
\
	void*     buf_x     = bli_obj_buffer_at_off( x ); \
	inc_t     incx      = bli_obj_vector_inc( x ); \
\
	/* alpha is read through a buffer of datatype dt. */ \
	void*     buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \
\
	/* Query a type-specific function pointer, except one that uses
	   void* for function arguments instead of typed pointers. */ \
	PASTECH2(opname,_unb,_vft) f = \
	PASTEMAC(varname,_qfp)( dt ); \
\
	f \
	( \
	  uploa, \
	  transa, \
	  diaga, \
	  m, \
	  buf_alpha, \
	  buf_a, rs_a, cs_a, \
	  buf_x, incx, \
	  cntx  \
	); \
} \

GENFRONT( trmv, trmv_unb_var1 )
GENFRONT( trmv, trmv_unb_var2 )

GENFRONT( trmv, trmv_unf_var1 )
GENFRONT( trmv, trmv_unf_var2 )

cython-blis-0.9.1/blis/_src/frame/2/trmv/other/000077500000000000000000000000001427272030600212025ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/trmv/other/bli_trmv_cntl.c000066400000000000000000000140401427272030600242030ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern packm_t* packm_cntl; extern packv_t* packv_cntl; extern unpackv_t* unpackv_cntl; extern gemv_t* gemv_cntl_rp_bs_dot; extern gemv_t* gemv_cntl_rp_bs_axpy; extern gemv_t* gemv_cntl_cp_bs_dot; extern gemv_t* gemv_cntl_cp_bs_axpy; trmv_t* trmv_cntl_bs_ke_nrow_tcol = NULL; trmv_t* trmv_cntl_bs_ke_ncol_trow = NULL; trmv_t* trmv_cntl_ge_nrow_tcol = NULL; trmv_t* trmv_cntl_ge_ncol_trow = NULL; void bli_trmv_cntl_init() { // Create control trees for the lowest-level kernels. These trees induce // operations on (presumably) relatively small block-subvector problems. trmv_cntl_bs_ke_nrow_tcol = bli_trmv_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT1, 0, NULL, NULL, NULL, NULL, NULL, NULL ); trmv_cntl_bs_ke_ncol_trow = bli_trmv_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT2, 0, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control trees for generally large problems. Here we choose a // variant that prioritizes keeping a subvector of x in cache. 
trmv_cntl_ge_nrow_tcol = bli_trmv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, // use var1 to maximize x1 usage BLIS_M2, packm_cntl, // pack A11 (if needed) packv_cntl, // pack x1 (if needed) gemv_cntl_rp_bs_dot, // gemv_rp needed by var1 NULL, // gemv_cp not needed by var1 trmv_cntl_bs_ke_nrow_tcol, unpackv_cntl ); // unpack x1 (if packed) trmv_cntl_ge_ncol_trow = bli_trmv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, // use var1 to maximize x1 usage BLIS_M2, packm_cntl, // pack A11 (if needed) packv_cntl, // pack x1 (if needed) gemv_cntl_rp_bs_axpy, // gemv_rp needed by var1 NULL, // gemv_cp not needed by var1 trmv_cntl_bs_ke_ncol_trow, unpackv_cntl ); // unpack x1 (if packed) } void bli_trmv_cntl_finalize() { bli_cntl_free_node( trmv_cntl_bs_ke_nrow_tcol ); bli_cntl_free_node( trmv_cntl_bs_ke_ncol_trow ); bli_cntl_free_node( trmv_cntl_ge_nrow_tcol ); bli_cntl_free_node( trmv_cntl_ge_ncol_trow ); } trmv_t* bli_trmv_cntl_obj_create( impl_t impl_type, varnum_t var_num, bszid_t bszid, packm_t* sub_packm_a11, packv_t* sub_packv_x1, gemv_t* sub_gemv_rp, gemv_t* sub_gemv_cp, trmv_t* sub_trmv, unpackv_t* sub_unpackv_x1 ) { trmv_t* cntl; cntl = ( trmv_t* ) bli_malloc_intl( sizeof(trmv_t) ); cntl->impl_type = impl_type; cntl->var_num = var_num; cntl->bszid = bszid; cntl->sub_packm_a11 = sub_packm_a11; cntl->sub_packv_x1 = sub_packv_x1; cntl->sub_gemv_rp = sub_gemv_rp; cntl->sub_gemv_cp = sub_gemv_cp; cntl->sub_trmv = sub_trmv; cntl->sub_unpackv_x1 = sub_unpackv_x1; return cntl; } void bli_trmv_cntl_obj_init( trmv_t* cntl, impl_t impl_type, varnum_t var_num, bszid_t bszid, packm_t* sub_packm_a11, packv_t* sub_packv_x1, gemv_t* sub_gemv_rp, gemv_t* sub_gemv_cp, trmv_t* sub_trmv, unpackv_t* sub_unpackv_x1 ) { cntl->impl_type = impl_type; cntl->var_num = var_num; cntl->bszid = bszid; cntl->sub_packm_a11 = sub_packm_a11; cntl->sub_packv_x1 = sub_packv_x1; cntl->sub_gemv_rp = sub_gemv_rp; cntl->sub_gemv_cp = sub_gemv_cp; cntl->sub_trmv = sub_trmv; cntl->sub_unpackv_x1 = 
sub_unpackv_x1; } cython-blis-0.9.1/blis/_src/frame/2/trmv/other/bli_trmv_cntl.h000066400000000000000000000061501427272030600242130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ struct trmv_s { impl_t impl_type; varnum_t var_num; bszid_t bszid; struct packm_s* sub_packm_a11; struct packv_s* sub_packv_x1; struct gemv_s* sub_gemv_rp; struct gemv_s* sub_gemv_cp; struct trmv_s* sub_trmv; struct unpackv_s* sub_unpackv_x1; }; typedef struct trmv_s trmv_t; #define bli_cntl_sub_trmv( cntl ) cntl->sub_trmv void bli_trmv_cntl_init( void ); void bli_trmv_cntl_finalize( void ); trmv_t* bli_trmv_cntl_obj_create( impl_t impl_type, varnum_t var_num, bszid_t bszid, packm_t* sub_packm_a11, packv_t* sub_packv_x1, gemv_t* sub_gemv_rp, gemv_t* sub_gemv_cp, trmv_t* sub_trmv, unpackv_t* sub_unpackv_x1 ); void bli_trmv_cntl_obj_init( trmv_t* cntl, impl_t impl_type, varnum_t var_num, bszid_t bszid, packm_t* sub_packm_a11, packv_t* sub_packv_x1, gemv_t* sub_gemv_rp, gemv_t* sub_gemv_cp, trmv_t* sub_trmv, unpackv_t* sub_unpackv_x1 ); cython-blis-0.9.1/blis/_src/frame/2/trmv/other/bli_trmv_front.c000066400000000000000000000135221427272030600243770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern trmv_t* trmv_cntl_bs_ke_nrow_tcol; extern trmv_t* trmv_cntl_bs_ke_ncol_trow; extern trmv_t* trmv_cntl_ge_nrow_tcol; extern trmv_t* trmv_cntl_ge_ncol_trow; void bli_trmv_front ( obj_t* alpha, obj_t* a, obj_t* x, cntx_t* cntx ) { trmv_t* trmv_cntl; num_t dt_targ_a; num_t dt_targ_x; bool a_has_unit_inc; bool x_has_unit_inc; obj_t alpha_local; num_t dt_alpha; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_trmv_check( alpha, a, x ); // Query the target datatypes of each object. dt_targ_a = bli_obj_target_dt( a ); dt_targ_x = bli_obj_target_dt( x ); // Determine whether each operand with unit stride. a_has_unit_inc = ( bli_obj_is_row_stored( a ) || bli_obj_is_col_stored( a ) ); x_has_unit_inc = ( bli_obj_vector_inc( x ) == 1 ); // Create an object to hold a copy-cast of alpha. Notice that we use // the type union of the target datatypes of a and x to prevent any // unnecessary loss of information during the computation. 
dt_alpha = bli_dt_union( dt_targ_a, dt_targ_x ); bli_obj_scalar_init_detached_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); // If all operands have unit stride, we choose a control tree for calling // the unblocked implementation directly without any blocking. if ( a_has_unit_inc && x_has_unit_inc ) { // We use two control trees to handle the four cases corresponding to // combinations of transposition and row/column-storage. // The row-stored without transpose and column-stored with transpose // trees are identical. Same for the remaining two trees. if ( bli_obj_has_notrans( a ) ) { if ( bli_obj_is_row_stored( a ) ) trmv_cntl = trmv_cntl_bs_ke_nrow_tcol; else trmv_cntl = trmv_cntl_bs_ke_ncol_trow; } else // if ( bli_obj_has_trans( a ) ) { if ( bli_obj_is_row_stored( a ) ) trmv_cntl = trmv_cntl_bs_ke_ncol_trow; else trmv_cntl = trmv_cntl_bs_ke_nrow_tcol; } } else { // Mark objects with unit stride as already being packed. This prevents // unnecessary packing from happening within the blocked algorithm. if ( a_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, a ); if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x ); // Here, we make a similar choice as above, except that (1) we look // at storage tilt, and (2) we choose a tree that performs blocking. if ( bli_obj_has_notrans( a ) ) { if ( bli_obj_is_row_tilted( a ) ) trmv_cntl = trmv_cntl_ge_nrow_tcol; else trmv_cntl = trmv_cntl_ge_ncol_trow; } else // if ( bli_obj_has_trans( a ) ) { if ( bli_obj_is_row_tilted( a ) ) trmv_cntl = trmv_cntl_ge_ncol_trow; else trmv_cntl = trmv_cntl_ge_nrow_tcol; } } // Invoke the internal back-end with the copy-cast of alpha and the // chosen control tree. bli_trmv_int( &alpha_local, a, x, cntx, trmv_cntl ); } // // Define BLAS-like interfaces with homogeneous-typed operands. 
// #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao, ao, xo; \ \ inc_t rs_x, cs_x; \ \ rs_x = incx; cs_x = m * incx; \ \ bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ \ bli_obj_create_with_attached_buffer( dt, m, m, a, rs_a, cs_a, &ao ); \ bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \ \ bli_obj_set_uplo( uploa, &ao ); \ bli_obj_set_conjtrans( transa, &ao ); \ bli_obj_set_diag( diaga, &ao ); \ \ bli_obj_set_struc( BLIS_TRIANGULAR, &ao ); \ \ PASTEMAC0(opname)( &alphao, \ &ao, \ &xo, \ cntx ); \ } INSERT_GENTFUNC_BASIC0( trmv_front ) cython-blis-0.9.1/blis/_src/frame/2/trmv/other/bli_trmv_front.h000066400000000000000000000041501427272030600244010ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_trmv_front ( obj_t* alpha, obj_t* a, obj_t* x, cntx_t* cntx ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC( trmv_front ) cython-blis-0.9.1/blis/_src/frame/2/trmv/other/bli_trmv_int.c000066400000000000000000000120171427272030600240370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T trmv_fp typedef void (*FUNCPTR_T)( obj_t* alpha, obj_t* a, obj_t* x, cntx_t* cntx, trmv_t* cntl ); static FUNCPTR_T vars[2][3][3] = { // lower triangular { // unblocked unblocked with fusing blocked { bli_trmv_unb_var1, bli_trmv_unf_var1, bli_trmv_l_blk_var1 }, { bli_trmv_unb_var2, bli_trmv_unf_var2, bli_trmv_l_blk_var2 }, { NULL, NULL, NULL }, }, // upper triangular { // unblocked unblocked with fusing blocked { bli_trmv_unb_var1, bli_trmv_unf_var1, bli_trmv_u_blk_var1 }, { bli_trmv_unb_var2, bli_trmv_unf_var2, bli_trmv_u_blk_var2 }, { NULL, NULL, NULL }, } }; void bli_trmv_int( obj_t* alpha, obj_t* a, obj_t* x, cntx_t* cntx, trmv_t* cntl ) { varnum_t n; impl_t i; bool uplo; FUNCPTR_T f; obj_t a_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_trmv_check( alpha, a, x ); // If A or x has a zero dimension, return early. 
if ( bli_obj_has_zero_dim( a ) ) return; if ( bli_obj_has_zero_dim( x ) ) return; // Alias A in case we need to induce a transformation (ie: transposition). bli_obj_alias_to( a, &a_local ); // NOTE: to support cases where B is complex and A is real, we will // need to have the default side case be BLIS_RIGHT and then express // the left case in terms of it, rather than the other way around. // Determine uplo (for indexing to the correct function pointer). if ( bli_obj_is_lower( &a_local ) ) uplo = 0; else uplo = 1; // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply toggle the uplo value to cause the // correct algorithm to be induced. When that algorithm partitions into // A, it will grab the correct subpartitions, which will inherit A's // transposition bit and thus downstream subproblems will do the right // thing. Alternatively, we could accomplish the same end goal by // inducing a transposition, via bli_obj_induce_trans(), in the code // block below. That macro function swaps dimensions, strides, and // offsets. As an example, given a lower triangular, column-major matrix // that needs a transpose, we would induce that transposition by recasting // the object as an upper triangular, row-major matrix (with no transpose // needed). Note that how we choose to handle transposition here does NOT // affect the optimal choice of kernel (ie: a column-major column panel // matrix with transpose times a vector would use the same kernel as a // row-major row panel matrix with no transpose times a vector). if ( bli_obj_has_trans( &a_local ) ) { //bli_obj_induce_trans( &a_local ); //bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); if ( uplo == 1 ) uplo = 0; else uplo = 1; } // Extract the variant number and implementation type. 
n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[uplo][n][i]; // Invoke the variant. f( alpha, &a_local, x, cntx, cntl ); } cython-blis-0.9.1/blis/_src/frame/2/trmv/other/bli_trmv_int.h000066400000000000000000000034541427272030600240510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

// Internal trmv back-end. Takes the scalar alpha, the matrix a, the vector
// x, a context, and the trmv control-tree node that selects the variant to
// run.
void bli_trmv_int( obj_t*  alpha,
                   obj_t*  a,
                   obj_t*  x,
                   cntx_t* cntx,
                   trmv_t* cntl );

cython-blis-0.9.1/blis/_src/frame/2/trmv/other/bli_trmv_l_blk_var1.c000066400000000000000000000100751427272030600252630ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "blis.h" void bli_trmv_l_blk_var1( obj_t* alpha, obj_t* a, obj_t* x, cntx_t* cntx, trmv_t* cntl ) { obj_t a11, a11_pack; obj_t a10; obj_t x1, x1_pack; obj_t x0; dim_t mn; dim_t ij; dim_t b_alg; // Initialize objects for packing. bli_obj_init_pack( &a11_pack ); bli_obj_init_pack( &x1_pack ); // Query dimension. mn = bli_obj_length( a ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_b( ij, mn, a, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A11, A10, x1, and x0. bli_acquire_mpart_br2tl( BLIS_SUBPART11, ij, b_alg, a, &a11 ); bli_acquire_mpart_br2tl( BLIS_SUBPART10, ij, b_alg, a, &a10 ); bli_acquire_vpart_b2f( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_b2f( BLIS_SUBPART0, ij, b_alg, x, &x0 ); // Initialize objects for packing A11 and x1 (if needed). bli_packm_init( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // x1 = alpha * tril( A11 ) * x1; bli_trmv_int( alpha, &a11_pack, &x1_pack, cntx, bli_cntl_sub_trmv( cntl ) ); // x1 = x1 + alpha * A10 * x0; bli_gemv_int( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, alpha, &a10, &x0, &BLIS_ONE, &x1_pack, cntx, bli_cntl_sub_gemv_rp( cntl ) ); // Copy/unpack x1 (if x1 was packed). bli_unpackv_int( &x1_pack, &x1, cntx, bli_cntl_sub_unpackv_x1( cntl ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/trmv/other/bli_trmv_l_blk_var2.c000066400000000000000000000100751427272030600252640ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_trmv_l_blk_var2( obj_t* alpha, obj_t* a, obj_t* x, cntx_t* cntx, trmv_t* cntl ) { obj_t a11, a11_pack; obj_t a21; obj_t x1, x1_pack; obj_t x2; dim_t mn; dim_t ij; dim_t b_alg; // Initialize objects for packing. bli_obj_init_pack( &a11_pack ); bli_obj_init_pack( &x1_pack ); // Query dimension. mn = bli_obj_length( a ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_b( ij, mn, a, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A11, A21, x1, and x2. bli_acquire_mpart_br2tl( BLIS_SUBPART11, ij, b_alg, a, &a11 ); bli_acquire_mpart_br2tl( BLIS_SUBPART21, ij, b_alg, a, &a21 ); bli_acquire_vpart_b2f( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_b2f( BLIS_SUBPART2, ij, b_alg, x, &x2 ); // Initialize objects for packing A11 and x1 (if needed). bli_packm_init( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // x2 = x2 + alpha * A21 * x1; bli_gemv_int( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, alpha, &a21, &x1_pack, &BLIS_ONE, &x2, cntx, bli_cntl_sub_gemv_cp( cntl ) ); // x1 = alpha * tril( A11 ) * x1; bli_trmv_int( alpha, &a11_pack, &x1_pack, cntx, bli_cntl_sub_trmv( cntl ) ); // Copy/unpack x1 (if x1 was packed). bli_unpackv_int( &x1_pack, &x1, cntx, bli_cntl_sub_unpackv_x1( cntl ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/trmv/other/bli_trmv_u_blk_var1.c000066400000000000000000000100751427272030600252740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_trmv_u_blk_var1( obj_t* alpha, obj_t* a, obj_t* x, cntx_t* cntx, trmv_t* cntl ) { obj_t a11, a11_pack; obj_t a12; obj_t x1, x1_pack; obj_t x2; dim_t mn; dim_t ij; dim_t b_alg; // Initialize objects for packing. bli_obj_init_pack( &a11_pack ); bli_obj_init_pack( &x1_pack ); // Query dimension. mn = bli_obj_length( a ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( ij, mn, a, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A11, A12, x1, and x2. bli_acquire_mpart_tl2br( BLIS_SUBPART11, ij, b_alg, a, &a11 ); bli_acquire_mpart_tl2br( BLIS_SUBPART12, ij, b_alg, a, &a12 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_f2b( BLIS_SUBPART2, ij, b_alg, x, &x2 ); // Initialize objects for packing A11 and x1 (if needed). bli_packm_init( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // x1 = alpha * triu( A11 ) * x1; bli_trmv_int( alpha, &a11_pack, &x1_pack, cntx, bli_cntl_sub_trmv( cntl ) ); // x1 = x1 + alpha * A12 * x2; bli_gemv_int( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, alpha, &a12, &x2, &BLIS_ONE, &x1_pack, cntx, bli_cntl_sub_gemv_rp( cntl ) ); // Copy/unpack x1 (if x1 was packed). bli_unpackv_int( &x1_pack, &x1, cntx, bli_cntl_sub_unpackv_x1( cntl ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/trmv/other/bli_trmv_u_blk_var2.c000066400000000000000000000100751427272030600252750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_trmv_u_blk_var2( obj_t* alpha, obj_t* a, obj_t* x, cntx_t* cntx, trmv_t* cntl ) { obj_t a11, a11_pack; obj_t a01; obj_t x1, x1_pack; obj_t x0; dim_t mn; dim_t ij; dim_t b_alg; // Initialize objects for packing. bli_obj_init_pack( &a11_pack ); bli_obj_init_pack( &x1_pack ); // Query dimension. mn = bli_obj_length( a ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_b( ij, mn, a, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A11, A21, x1, and x2. bli_acquire_mpart_br2tl( BLIS_SUBPART11, ij, b_alg, a, &a11 ); bli_acquire_mpart_br2tl( BLIS_SUBPART01, ij, b_alg, a, &a01 ); bli_acquire_vpart_b2f( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_b2f( BLIS_SUBPART0, ij, b_alg, x, &x0 ); // Initialize objects for packing A11 and x1 (if needed). bli_packm_init( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // x0 = x0 + alpha * A01 * x1; bli_gemv_int( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, alpha, &a01, &x1_pack, &BLIS_ONE, &x0, cntx, bli_cntl_sub_gemv_cp( cntl ) ); // x1 = alpha * triu( A11 ) * x1; bli_trmv_int( alpha, &a11_pack, &x1_pack, cntx, bli_cntl_sub_trmv( cntl ) ); // Copy/unpack x1 (if x1 was packed). bli_unpackv_int( &x1_pack, &x1, cntx, bli_cntl_sub_unpackv_x1( cntl ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/trsv/000077500000000000000000000000001427272030600200675ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/trsv/bli_trsv.h000066400000000000000000000034701427272030600220700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" #include "bli_trsv_var.h" cython-blis-0.9.1/blis/_src/frame/2/trsv/bli_trsv_unb_var1.c000066400000000000000000000106501427272030600236560ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* a10t; \ ctype* alpha11; \ ctype* a12t; \ ctype* x0; \ ctype* chi1; \ ctype* x2; \ ctype alpha11_conj; \ ctype rho; \ dim_t iter, i; \ dim_t n_behind; \ inc_t rs_at, cs_at; \ uplo_t uploa_trans; \ conj_t conja; \ \ if ( bli_does_notrans( transa ) ) \ { \ rs_at = rs_a; \ cs_at = cs_a; \ uploa_trans = uploa; \ } \ else /* if ( bli_does_trans( transa ) ) */ \ { \ rs_at = cs_a; \ cs_at = rs_a; \ uploa_trans = bli_uplo_toggled( uploa ); \ } \ \ conja = bli_extract_conj( transa ); \ \ /* x = alpha * x; */ \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ alpha, \ x, incx, \ cntx, \ NULL \ ); \ \ PASTECH(ch,dotv_ker_ft) kfp_tv; \ \ /* Query the context for the kernel function pointer. */ \ kfp_tv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. 
*/ \ if ( bli_is_upper( uploa_trans ) ) \ { \ for ( iter = 0; iter < m; ++iter ) \ { \ i = m - iter - 1; \ n_behind = iter; \ alpha11 = a + (i )*rs_at + (i )*cs_at; \ a12t = a + (i )*rs_at + (i+1)*cs_at; \ chi1 = x + (i )*incx; \ x2 = x + (i+1)*incx; \ \ /* chi1 = chi1 - a12t * x2; */ \ kfp_tv \ ( \ conja, \ BLIS_NO_CONJUGATE, \ n_behind, \ a12t, cs_at, \ x2, incx, \ &rho, \ cntx \ ); \ PASTEMAC(ch,subs)( rho, *chi1 ); \ \ /* chi1 = chi1 / alpha11; */ \ if ( bli_is_nonunit_diag( diaga ) ) \ { \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \ PASTEMAC(ch,invscals)( alpha11_conj, *chi1 ); \ } \ } \ } \ else /* if ( bli_is_lower( uploa_trans ) ) */ \ { \ for ( iter = 0; iter < m; ++iter ) \ { \ i = iter; \ n_behind = i; \ alpha11 = a + (i )*rs_at + (i )*cs_at; \ a10t = a + (i )*rs_at + (0 )*cs_at; \ chi1 = x + (i )*incx; \ x0 = x + (0 )*incx; \ \ /* chi1 = chi1 - a10t * x0; */ \ kfp_tv \ ( \ conja, \ BLIS_NO_CONJUGATE, \ n_behind, \ a10t, cs_at, \ x0, incx, \ &rho, \ cntx \ ); \ PASTEMAC(ch,subs)( rho, *chi1 ); \ \ /* chi1 = chi1 / alpha11; */ \ if ( bli_is_nonunit_diag( diaga ) ) \ { \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \ PASTEMAC(ch,invscals)( alpha11_conj, *chi1 ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( trsv_unb_var1 ) cython-blis-0.9.1/blis/_src/frame/2/trsv/bli_trsv_unb_var2.c000066400000000000000000000106241427272030600236600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* a01; \ ctype* alpha11; \ ctype* a21; \ ctype* x0; \ ctype* chi1; \ ctype* x2; \ ctype alpha11_conj; \ ctype minus_chi1; \ dim_t iter, i; \ dim_t n_ahead; \ inc_t rs_at, cs_at; \ uplo_t uploa_trans; \ conj_t conja; \ \ if ( bli_does_notrans( transa ) ) \ { \ rs_at = rs_a; \ cs_at = cs_a; \ uploa_trans = uploa; \ } \ else /* if ( bli_does_trans( transa ) ) */ \ { \ rs_at = cs_a; \ cs_at = rs_a; \ uploa_trans = bli_uplo_toggled( uploa ); \ } \ \ conja = bli_extract_conj( transa ); \ \ /* x = alpha * x; */ \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ alpha, \ x, incx, \ cntx, \ NULL \ ); \ \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. 
*/ \ if ( bli_is_upper( uploa_trans ) ) \ { \ for ( iter = 0; iter < m; ++iter ) \ { \ i = m - iter - 1; \ n_ahead = i; \ alpha11 = a + (i )*rs_at + (i )*cs_at; \ a01 = a + (0 )*rs_at + (i )*cs_at; \ chi1 = x + (i )*incx; \ x0 = x + (0 )*incx; \ \ /* chi1 = chi1 / alpha11; */ \ if ( bli_is_nonunit_diag( diaga ) ) \ { \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \ PASTEMAC(ch,invscals)( alpha11_conj, *chi1 ); \ } \ \ /* x0 = x0 - chi1 * a01; */ \ PASTEMAC(ch,neg2s)( *chi1, minus_chi1 ); \ kfp_av \ ( \ conja, \ n_ahead, \ &minus_chi1, \ a01, rs_at, \ x0, incx, \ cntx \ ); \ } \ } \ else /* if ( bli_is_lower( uploa_trans ) ) */ \ { \ for ( iter = 0; iter < m; ++iter ) \ { \ i = iter; \ n_ahead = m - iter - 1; \ alpha11 = a + (i )*rs_at + (i )*cs_at; \ a21 = a + (i+1)*rs_at + (i )*cs_at; \ chi1 = x + (i )*incx; \ x2 = x + (i+1)*incx; \ \ /* chi1 = chi1 / alpha11; */ \ if ( bli_is_nonunit_diag( diaga ) ) \ { \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \ PASTEMAC(ch,invscals)( alpha11_conj, *chi1 ); \ } \ \ /* x2 = x2 - chi1 * a21; */ \ PASTEMAC(ch,neg2s)( *chi1, minus_chi1 ); \ kfp_av \ ( \ conja, \ n_ahead, \ &minus_chi1, \ a21, rs_at, \ x2, incx, \ cntx \ ); \ } \ } \ } INSERT_GENTFUNC_BASIC0( trsv_unb_var2 ) cython-blis-0.9.1/blis/_src/frame/2/trsv/bli_trsv_unf_var1.c000066400000000000000000000144151427272030600236650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* one = PASTEMAC(ch,1); \ ctype* minus_one = PASTEMAC(ch,m1); \ ctype* A10; \ ctype* A11; \ ctype* A12; \ ctype* a10t; \ ctype* alpha11; \ ctype* a12t; \ ctype* x0; \ ctype* x1; \ ctype* x2; \ ctype* x01; \ ctype* chi11; \ ctype* x21; \ ctype alpha11_conj; \ ctype rho1; \ dim_t iter, i, k, j, l; \ dim_t b_fuse, f; \ dim_t n_behind, f_behind; \ inc_t rs_at, cs_at; \ uplo_t uploa_trans; \ conj_t conja; \ \ /* x = alpha * x; */ \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ alpha, \ x, incx, \ cntx, \ NULL \ ); \ \ if ( bli_does_notrans( transa ) ) \ { \ rs_at = rs_a; \ cs_at = cs_a; \ uploa_trans = uploa; \ } \ else /* if ( bli_does_trans( transa ) ) */ \ { \ rs_at = cs_a; \ cs_at = rs_a; \ uploa_trans = bli_uplo_toggled( uploa ); \ } \ \ conja = bli_extract_conj( transa ); \ \ PASTECH(ch,dotxf_ker_ft) kfp_df; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. 
*/ \ if ( bli_is_upper( uploa_trans ) ) \ { \ for ( iter = 0; iter < m; iter += f ) \ { \ f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \ i = m - iter - f; \ n_behind = iter; \ A11 = a + (i )*rs_at + (i )*cs_at; \ A12 = a + (i )*rs_at + (i+f)*cs_at; \ x1 = x + (i )*incx; \ x2 = x + (i+f)*incx; \ \ /* x1 = x1 - A12 * x2; */ \ kfp_df \ ( \ conja, \ BLIS_NO_CONJUGATE, \ n_behind, \ f, \ minus_one, \ A12, cs_at, rs_at, \ x2, incx, \ one, \ x1, incx, \ cntx \ ); \ \ /* x1 = x1 / triu( A11 ); */ \ for ( k = 0; k < f; ++k ) \ { \ l = f - k - 1; \ f_behind = k; \ alpha11 = A11 + (l )*rs_at + (l )*cs_at; \ a12t = A11 + (l )*rs_at + (l+1)*cs_at; \ chi11 = x1 + (l )*incx; \ x21 = x1 + (l+1)*incx; \ \ /* chi11 = chi11 - a12t * x21; */ \ PASTEMAC(ch,set0s)( rho1 ); \ if ( bli_is_conj( conja ) ) \ { \ for ( j = 0; j < f_behind; ++j ) \ PASTEMAC(ch,dotjs)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \ } \ else \ { \ for ( j = 0; j < f_behind; ++j ) \ PASTEMAC(ch,dots)( *(a12t + j*cs_at), *(x21 + j*incx), rho1 ); \ } \ PASTEMAC(ch,subs)( rho1, *chi11 ); \ \ /* chi11 = chi11 / alpha11; */ \ if ( bli_is_nonunit_diag( diaga ) ) \ { \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \ PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \ } \ } \ } \ } \ else /* if ( bli_is_lower( uploa_trans ) ) */ \ { \ for ( iter = 0; iter < m; iter += f ) \ { \ f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \ i = iter; \ n_behind = i; \ A11 = a + (i )*rs_at + (i )*cs_at; \ A10 = a + (i )*rs_at + (0 )*cs_at; \ x1 = x + (i )*incx; \ x0 = x + (0 )*incx; \ \ /* x1 = x1 - A10 * x0; */ \ kfp_df \ ( \ conja, \ BLIS_NO_CONJUGATE, \ n_behind, \ f, \ minus_one, \ A10, cs_at, rs_at, \ x0, incx, \ one, \ x1, incx, \ cntx \ ); \ \ /* x1 = x1 / tril( A11 ); */ \ for ( k = 0; k < f; ++k ) \ { \ l = k; \ f_behind = l; \ alpha11 = A11 + (l )*rs_at + (l )*cs_at; \ a10t = A11 + (l )*rs_at + (0 )*cs_at; \ chi11 = x1 + (l )*incx; \ x01 = x1 + (0 )*incx; \ \ /* chi11 = chi11 - a10t * x01; */ \ 
PASTEMAC(ch,set0s)( rho1 ); \ if ( bli_is_conj( conja ) ) \ { \ for ( j = 0; j < f_behind; ++j ) \ PASTEMAC(ch,dotjs)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \ } \ else \ { \ for ( j = 0; j < f_behind; ++j ) \ PASTEMAC(ch,dots)( *(a10t + j*cs_at), *(x01 + j*incx), rho1 ); \ } \ PASTEMAC(ch,subs)( rho1, *chi11 ); \ \ /* chi11 = chi11 / alpha11; */ \ if ( bli_is_nonunit_diag( diaga ) ) \ { \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \ PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \ } \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( trsv_unf_var1 ) cython-blis-0.9.1/blis/_src/frame/2/trsv/bli_trsv_unf_var2.c000066400000000000000000000142741427272030600236710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ ctype* minus_one = PASTEMAC(ch,m1); \ ctype* A01; \ ctype* A11; \ ctype* A21; \ ctype* a01; \ ctype* alpha11; \ ctype* a21; \ ctype* x0; \ ctype* x1; \ ctype* x2; \ ctype* x01; \ ctype* chi11; \ ctype* x21; \ ctype alpha11_conj; \ ctype minus_chi11; \ dim_t iter, i, k, j, l; \ dim_t b_fuse, f; \ dim_t n_ahead, f_ahead; \ inc_t rs_at, cs_at; \ uplo_t uploa_trans; \ conj_t conja; \ \ /* x = alpha * x; */ \ PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ m, \ alpha, \ x, incx, \ cntx, \ NULL \ ); \ \ if ( bli_does_notrans( transa ) ) \ { \ rs_at = rs_a; \ cs_at = cs_a; \ uploa_trans = uploa; \ } \ else /* if ( bli_does_trans( transa ) ) */ \ { \ rs_at = cs_a; \ cs_at = rs_a; \ uploa_trans = bli_uplo_toggled( uploa ); \ } \ \ conja = bli_extract_conj( transa ); \ \ PASTECH(ch,axpyf_ker_ft) kfp_af; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. 
*/ \ if ( bli_is_upper( uploa_trans ) ) \ { \ for ( iter = 0; iter < m; iter += f ) \ { \ f = bli_determine_blocksize_dim_b( iter, m, b_fuse ); \ i = m - iter - f; \ n_ahead = i; \ A11 = a + (i )*rs_at + (i )*cs_at; \ A01 = a + (0 )*rs_at + (i )*cs_at; \ x1 = x + (i )*incx; \ x0 = x + (0 )*incx; \ \ /* x1 = x1 / triu( A11 ); */ \ for ( k = 0; k < f; ++k ) \ { \ l = f - k - 1; \ f_ahead = l; \ alpha11 = A11 + (l )*rs_at + (l )*cs_at; \ a01 = A11 + (0 )*rs_at + (l )*cs_at; \ chi11 = x1 + (l )*incx; \ x01 = x1 + (0 )*incx; \ \ /* chi11 = chi11 / alpha11; */ \ if ( bli_is_nonunit_diag( diaga ) ) \ { \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \ PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \ } \ \ /* x01 = x01 - chi11 * a01; */ \ PASTEMAC(ch,neg2s)( *chi11, minus_chi11 ); \ if ( bli_is_conj( conja ) ) \ { \ for ( j = 0; j < f_ahead; ++j ) \ PASTEMAC(ch,axpyjs)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \ } \ else \ { \ for ( j = 0; j < f_ahead; ++j ) \ PASTEMAC(ch,axpys)( minus_chi11, *(a01 + j*rs_at), *(x01 + j*incx) ); \ } \ } \ \ /* x0 = x0 - A01 * x1; */ \ kfp_af \ ( \ conja, \ BLIS_NO_CONJUGATE, \ n_ahead, \ f, \ minus_one, \ A01, rs_at, cs_at, \ x1, incx, \ x0, incx, \ cntx \ ); \ } \ } \ else /* if ( bli_is_lower( uploa_trans ) ) */ \ { \ for ( iter = 0; iter < m; iter += f ) \ { \ f = bli_determine_blocksize_dim_f( iter, m, b_fuse ); \ i = iter; \ n_ahead = m - iter - f; \ A11 = a + (i )*rs_at + (i )*cs_at; \ A21 = a + (i+f)*rs_at + (i )*cs_at; \ x1 = x + (i )*incx; \ x2 = x + (i+f)*incx; \ \ /* x1 = x1 / tril( A11 ); */ \ for ( k = 0; k < f; ++k ) \ { \ l = k; \ f_ahead = f - k - 1; \ alpha11 = A11 + (l )*rs_at + (l )*cs_at; \ a21 = A11 + (l+1)*rs_at + (l )*cs_at; \ chi11 = x1 + (l )*incx; \ x21 = x1 + (l+1)*incx; \ \ /* chi11 = chi11 / alpha11; */ \ if ( bli_is_nonunit_diag( diaga ) ) \ { \ PASTEMAC(ch,copycjs)( conja, *alpha11, alpha11_conj ); \ PASTEMAC(ch,invscals)( alpha11_conj, *chi11 ); \ } \ \ /* x21 = x21 - chi11 * a21; */ \ 
PASTEMAC(ch,neg2s)( *chi11, minus_chi11 ); \ if ( bli_is_conj( conja ) ) \ { \ for ( j = 0; j < f_ahead; ++j ) \ PASTEMAC(ch,axpyjs)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \ } \ else \ { \ for ( j = 0; j < f_ahead; ++j ) \ PASTEMAC(ch,axpys)( minus_chi11, *(a21 + j*rs_at), *(x21 + j*incx) ); \ } \ } \ \ /* x2 = x2 - A21 * x1; */ \ kfp_af \ ( \ conja, \ BLIS_NO_CONJUGATE, \ n_ahead, \ f, \ minus_one, \ A21, rs_at, cs_at, \ x1, incx, \ x2, incx, \ cntx \ ); \ } \ } \ } INSERT_GENTFUNC_BASIC0( trsv_unf_var2 ) cython-blis-0.9.1/blis/_src/frame/2/trsv/bli_trsv_var.h000066400000000000000000000051641427272030600227420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) cython-blis-0.9.1/blis/_src/frame/2/trsv/bli_trsv_var_oapi.c000066400000000000000000000056121427272030600237430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENFRONT #define GENFRONT( opname, varname ) \ \ void PASTEMAC0(varname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ) \ { \ bli_init_once(); \ \ num_t dt = bli_obj_dt( a ); \ \ uplo_t uploa = bli_obj_uplo( a ); \ trans_t transa = bli_obj_conjtrans_status( a ); \ diag_t diaga = bli_obj_diag( a ); \ \ dim_t m = bli_obj_length( a ); \ \ void* buf_a = bli_obj_buffer_at_off( a ); \ inc_t rs_a = bli_obj_row_stride( a ); \ inc_t cs_a = bli_obj_col_stride( a ); \ \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ \ void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,_unb,_vft) f = \ PASTEMAC(varname,_qfp)( dt ); \ \ f \ ( \ uploa, \ transa, \ diaga, \ m, \ buf_alpha, \ buf_a, rs_a, cs_a, \ buf_x, incx, \ cntx \ ); \ } \ GENFRONT( trsv, trsv_unb_var1 ) GENFRONT( trsv, trsv_unb_var2 ) GENFRONT( trsv, trsv_unf_var1 ) GENFRONT( trsv, trsv_unf_var2 ) cython-blis-0.9.1/blis/_src/frame/2/trsv/other/000077500000000000000000000000001427272030600212105ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/2/trsv/other/bli_trsv_cntl.c000066400000000000000000000147571427272030600242360ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern scalv_t* scalv_cntl; extern packm_t* packm_cntl; extern packv_t* packv_cntl; extern unpackv_t* unpackv_cntl; extern gemv_t* gemv_cntl_rp_bs_dot; extern gemv_t* gemv_cntl_rp_bs_axpy; extern gemv_t* gemv_cntl_cp_bs_dot; extern gemv_t* gemv_cntl_cp_bs_axpy; trsv_t* trsv_cntl_bs_ke_nrow_tcol = NULL; trsv_t* trsv_cntl_bs_ke_ncol_trow = NULL; trsv_t* trsv_cntl_ge_nrow_tcol = NULL; trsv_t* trsv_cntl_ge_ncol_trow = NULL; void bli_trsv_cntl_init() { // Create control trees for the lowest-level kernels. These trees induce // operations on (presumably) relatively small block-subvector problems. trsv_cntl_bs_ke_nrow_tcol = bli_trsv_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT1, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); trsv_cntl_bs_ke_ncol_trow = bli_trsv_cntl_obj_create( BLIS_UNB_FUSED, BLIS_VARIANT2, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL ); // Create control trees for generally large problems. Here we choose a // variant that prioritizes keeping a subvector of x in cache. 
trsv_cntl_ge_nrow_tcol = bli_trsv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, // use var1 to maximize x1 usage BLIS_M2, scalv_cntl, // scale x up-front packm_cntl, // pack A11 (if needed) packv_cntl, // pack x1 (if needed) gemv_cntl_rp_bs_dot, // gemv_rp needed by var1 NULL, // gemv_cp not needed by var1 trsv_cntl_bs_ke_nrow_tcol, unpackv_cntl ); // unpack x1 (if needed) trsv_cntl_ge_ncol_trow = bli_trsv_cntl_obj_create( BLIS_BLOCKED, BLIS_VARIANT1, // use var1 to maximize x1 usage BLIS_M2, scalv_cntl, // scale x up-front packm_cntl, // pack A11 (if needed) packv_cntl, // pack x1 (if needed) gemv_cntl_rp_bs_axpy, // gemv_rp needed by var1 NULL, // gemv_cp not needed by var1 trsv_cntl_bs_ke_ncol_trow, unpackv_cntl ); // unpack x1 (if needed) } void bli_trsv_cntl_finalize() { bli_cntl_free_node( trsv_cntl_bs_ke_nrow_tcol ); bli_cntl_free_node( trsv_cntl_bs_ke_ncol_trow ); bli_cntl_free_node( trsv_cntl_ge_nrow_tcol ); bli_cntl_free_node( trsv_cntl_ge_ncol_trow ); } trsv_t* bli_trsv_cntl_obj_create( impl_t impl_type, varnum_t var_num, bszid_t bszid, scalv_t* sub_scalv, packm_t* sub_packm_a11, packv_t* sub_packv_x1, gemv_t* sub_gemv_rp, gemv_t* sub_gemv_cp, trsv_t* sub_trsv, unpackv_t* sub_unpackv_x1 ) { trsv_t* cntl; cntl = ( trsv_t* ) bli_malloc_intl( sizeof(trsv_t) ); cntl->impl_type = impl_type; cntl->var_num = var_num; cntl->bszid = bszid; cntl->sub_scalv = sub_scalv; cntl->sub_packm_a11 = sub_packm_a11; cntl->sub_packv_x1 = sub_packv_x1; cntl->sub_gemv_rp = sub_gemv_rp; cntl->sub_gemv_cp = sub_gemv_cp; cntl->sub_trsv = sub_trsv; cntl->sub_unpackv_x1 = sub_unpackv_x1; return cntl; } void bli_trsv_cntl_obj_init( trsv_t* cntl, impl_t impl_type, varnum_t var_num, bszid_t bszid, scalv_t* sub_scalv, packm_t* sub_packm_a11, packv_t* sub_packv_x1, gemv_t* sub_gemv_rp, gemv_t* sub_gemv_cp, trsv_t* sub_trsv, unpackv_t* sub_unpackv_x1 ) { cntl->impl_type = impl_type; cntl->var_num = var_num; cntl->bszid = bszid; cntl->sub_scalv = sub_scalv; cntl->sub_packm_a11 = 
sub_packm_a11; cntl->sub_packv_x1 = sub_packv_x1; cntl->sub_gemv_rp = sub_gemv_rp; cntl->sub_gemv_cp = sub_gemv_cp; cntl->sub_trsv = sub_trsv; cntl->sub_unpackv_x1 = sub_unpackv_x1; } cython-blis-0.9.1/blis/_src/frame/2/trsv/other/bli_trsv_cntl.h000066400000000000000000000063621427272030600242340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ struct trsv_s { impl_t impl_type; varnum_t var_num; bszid_t bszid; struct scalv_s* sub_scalv; struct packm_s* sub_packm_a11; struct packv_s* sub_packv_x1; struct gemv_s* sub_gemv_rp; struct gemv_s* sub_gemv_cp; struct trsv_s* sub_trsv; struct unpackv_s* sub_unpackv_x1; }; typedef struct trsv_s trsv_t; #define bli_cntl_sub_trsv( cntl ) cntl->sub_trsv void bli_trsv_cntl_init( void ); void bli_trsv_cntl_finalize( void ); trsv_t* bli_trsv_cntl_obj_create( impl_t impl_type, varnum_t var_num, bszid_t bszid, scalv_t* sub_scalv, packm_t* sub_packm_a11, packv_t* sub_packv_x1, gemv_t* sub_gemv_rp, gemv_t* sub_gemv_cp, trsv_t* sub_trsv, unpackv_t* sub_unpackv_x1 ); void bli_trsv_cntl_obj_init( trsv_t* cntl, impl_t impl_type, varnum_t var_num, bszid_t bszid, scalv_t* sub_scalv, packm_t* sub_packm_a11, packv_t* sub_packv_x1, gemv_t* sub_gemv_rp, gemv_t* sub_gemv_cp, trsv_t* sub_trsv, unpackv_t* sub_unpackv_x1 ); cython-blis-0.9.1/blis/_src/frame/2/trsv/other/bli_trsv_front.c000066400000000000000000000130761427272030600244170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern trsv_t* trsv_cntl_bs_ke_nrow_tcol; extern trsv_t* trsv_cntl_bs_ke_ncol_trow; extern trsv_t* trsv_cntl_ge_nrow_tcol; extern trsv_t* trsv_cntl_ge_ncol_trow; void bli_trsv_front ( obj_t* alpha, obj_t* a, obj_t* x, cntx_t* cntx ) { trsv_t* trsv_cntl; num_t dt_targ_a; num_t dt_targ_x; bool a_has_unit_inc; bool x_has_unit_inc; obj_t alpha_local; num_t dt_alpha; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_trsv_check( alpha, a, x ); // Query the target datatypes of each object. dt_targ_a = bli_obj_dt( a ); dt_targ_x = bli_obj_dt( x ); // Determine whether each operand with unit stride. a_has_unit_inc = ( bli_obj_is_row_stored( a ) || bli_obj_is_col_stored( a ) ); x_has_unit_inc = ( bli_obj_vector_inc( x ) == 1 ); // Create an object to hold a copy-cast of alpha. Notice that we use // the type union of the target datatypes of a and x to prevent any // unnecessary loss of information during the computation. dt_alpha = bli_dt_union( dt_targ_a, dt_targ_x ); bli_obj_scalar_init_detached_copy_of( dt_alpha, BLIS_NO_CONJUGATE, alpha, &alpha_local ); // If all operands have unit stride, we choose a control tree for calling // the unblocked implementation directly without any blocking. 
if ( a_has_unit_inc && x_has_unit_inc ) { if ( bli_obj_has_notrans( a ) ) { if ( bli_obj_is_row_stored( a ) ) trsv_cntl = trsv_cntl_bs_ke_nrow_tcol; else trsv_cntl = trsv_cntl_bs_ke_ncol_trow; } else // if ( bli_obj_has_trans( a ) ) { if ( bli_obj_is_row_stored( a ) ) trsv_cntl = trsv_cntl_bs_ke_ncol_trow; else trsv_cntl = trsv_cntl_bs_ke_nrow_tcol; } } else { // Mark objects with unit stride as already being packed. This prevents // unnecessary packing from happening within the blocked algorithm. if ( a_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_UNSPEC, a ); if ( x_has_unit_inc ) bli_obj_set_pack_schema( BLIS_PACKED_VECTOR, x ); // Here, we make a similar choice as above, except that (1) we look // at storage tilt, and (2) we choose a tree that performs blocking. if ( bli_obj_has_notrans( a ) ) { if ( bli_obj_is_row_tilted( a ) ) trsv_cntl = trsv_cntl_ge_nrow_tcol; else trsv_cntl = trsv_cntl_ge_ncol_trow; } else // if ( bli_obj_has_trans( a ) ) { if ( bli_obj_is_row_tilted( a ) ) trsv_cntl = trsv_cntl_ge_ncol_trow; else trsv_cntl = trsv_cntl_ge_nrow_tcol; } } // Invoke the internal back-end with the copy-cast of alpha and the // chosen control tree. bli_trsv_int( &alpha_local, a, x, cntx, trsv_cntl ); } // // Define BLAS-like interfaces with homogeneous-typed operands. 
// #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao, ao, xo; \ \ inc_t rs_x, cs_x; \ \ rs_x = incx; cs_x = m * incx; \ \ bli_obj_create_1x1_with_attached_buffer( dt, alpha, &alphao ); \ \ bli_obj_create_with_attached_buffer( dt, m, m, a, rs_a, cs_a, &ao ); \ bli_obj_create_with_attached_buffer( dt, m, 1, x, rs_x, cs_x, &xo ); \ \ bli_obj_set_uplo( uploa, &ao ); \ bli_obj_set_conjtrans( transa, &ao ); \ bli_obj_set_diag( diaga, &ao ); \ \ bli_obj_set_struc( BLIS_TRIANGULAR, &ao ); \ \ PASTEMAC0(opname)( &alphao, \ &ao, \ &xo, \ cntx ); \ } INSERT_GENTFUNC_BASIC0( trsv_front ) cython-blis-0.9.1/blis/_src/frame/2/trsv/other/bli_trsv_front.h000066400000000000000000000041471427272030600244230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_trsv_front ( obj_t* alpha, obj_t* a, obj_t* x, cntx_t* cntx ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC( trsv_front ) cython-blis-0.9.1/blis/_src/frame/2/trsv/other/bli_trsv_int.c000066400000000000000000000120201427272030600240450ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T trsv_fp typedef void (*FUNCPTR_T)( obj_t* alpha, obj_t* a, obj_t* x, cntx_t* cntx, trsv_t* cntl ); static FUNCPTR_T vars[2][3][3] = { // lower triangular { // unblocked unblocked with fusing blocked { bli_trsv_unb_var1, bli_trsv_unf_var1, bli_trsv_l_blk_var1 }, { bli_trsv_unb_var2, bli_trsv_unf_var2, bli_trsv_l_blk_var2 }, { NULL, NULL, NULL }, }, // upper triangular { // unblocked unblocked with fusing blocked { bli_trsv_unb_var1, bli_trsv_unf_var1, bli_trsv_u_blk_var1 }, { bli_trsv_unb_var2, bli_trsv_unf_var2, bli_trsv_u_blk_var2 }, { NULL, NULL, NULL }, } }; void bli_trsv_int( obj_t* alpha, obj_t* a, obj_t* x, cntx_t* cntx, trsv_t* cntl ) { varnum_t n; impl_t i; bool uplo; FUNCPTR_T f; obj_t a_local; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_trsv_check( alpha, a, x ); // If A or x has a zero dimension, return early. 
if ( bli_obj_has_zero_dim( a ) ) return; if ( bli_obj_has_zero_dim( x ) ) return; // Alias A in case we need to induce a transformation (ie: transposition). bli_obj_alias_to( a, &a_local ); // NOTE: to support cases where B is complex and A is real, we will // need to have the default side case be BLIS_RIGHT and then express // the left case in terms of it, rather than the other way around. // Determine uplo (for indexing to the correct function pointer). if ( bli_obj_is_lower( &a_local ) ) uplo = 0; else uplo = 1; // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply toggle the uplo value to cause the // correct algorithm to be induced. When that algorithm partitions into // A, it will grab the correct subpartitions, which will inherit A's // transposition bit and thus downstream subproblems will do the right // thing. Alternatively, we could accomplish the same end goal by // inducing a transposition, via bli_obj_induce_trans(), in the code // block below. That macro function swaps dimensions, strides, and // offsets. As an example, given a lower triangular, column-major matrix // that needs a transpose, we would induce that transposition by recasting // the object as an upper triangular, row-major matrix (with no transpose // needed). Note that how we choose to handle transposition here does NOT // affect the optimal choice of kernel (ie: a column-major column panel // matrix with transpose times a vector would use the same kernel as a // row-major row panel matrix with no transpose times a vector). if ( bli_obj_has_trans( &a_local ) ) { //bli_obj_induce_trans( &a_local ); //bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); if ( uplo == 1 ) uplo = 0; else uplo = 1; } // Extract the variant number and implementation type. 
n = bli_cntl_var_num( cntl ); i = bli_cntl_impl_type( cntl ); // Index into the variant array to extract the correct function pointer. f = vars[uplo][n][i]; // Invoke the variant. f( alpha, &a_local, x, cntx, cntl ); } cython-blis-0.9.1/blis/_src/frame/2/trsv/other/bli_trsv_int.h000066400000000000000000000034541427272030600240650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ void bli_trsv_int( obj_t* alpha, obj_t* a, obj_t* x, cntx_t* cntx, trsv_t* cntl ); cython-blis-0.9.1/blis/_src/frame/2/trsv/other/bli_trsv_l_blk_var1.c000066400000000000000000000102551427272030600252770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_trsv_l_blk_var1( obj_t* alpha, obj_t* a, obj_t* x, cntx_t* cntx, trsv_t* cntl ) { obj_t a11, a11_pack; obj_t a10; obj_t x1, x1_pack; obj_t x0; dim_t mn; dim_t ij; dim_t b_alg; // Initialize objects for packing. bli_obj_init_pack( &a11_pack ); bli_obj_init_pack( &x1_pack ); // Query dimension. mn = bli_obj_length( a ); // x = alpha * x; bli_scalv_int( alpha, x, cntx, bli_cntl_sub_scalv( cntl ) ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( ij, mn, a, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A11, A10, x1, and x0. bli_acquire_mpart_tl2br( BLIS_SUBPART11, ij, b_alg, a, &a11 ); bli_acquire_mpart_tl2br( BLIS_SUBPART10, ij, b_alg, a, &a10 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_f2b( BLIS_SUBPART0, ij, b_alg, x, &x0 ); // Initialize objects for packing A11 and x1 (if needed). bli_packm_init( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // x1 = x1 - A10 * x0; bli_gemv_int( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, &BLIS_MINUS_ONE, &a10, &x0, &BLIS_ONE, &x1_pack, cntx, bli_cntl_sub_gemv_rp( cntl ) ); // x1 = x1 / tril( A11 ); bli_trsv_int( &BLIS_ONE, &a11_pack, &x1_pack, cntx, bli_cntl_sub_trsv( cntl ) ); // Copy/unpack x1 (if x1 was packed). bli_unpackv_int( &x1_pack, &x1, cntx, bli_cntl_sub_unpackv_x1( cntl ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/trsv/other/bli_trsv_l_blk_var2.c000066400000000000000000000102551427272030600253000ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_trsv_l_blk_var2( obj_t* alpha, obj_t* a, obj_t* x, cntx_t* cntx, trsv_t* cntl ) { obj_t a11, a11_pack; obj_t a21; obj_t x1, x1_pack; obj_t x2; dim_t mn; dim_t ij; dim_t b_alg; // Initialize objects for packing. bli_obj_init_pack( &a11_pack ); bli_obj_init_pack( &x1_pack ); // Query dimension. mn = bli_obj_length( a ); // x = alpha * x; bli_scalv_int( alpha, x, cntx, bli_cntl_sub_scalv( cntl ) ); // Partition diagonally. for ( ij = 0; ij < mn; ij += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize_f( ij, mn, a, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A11, A21, x1, and x2. bli_acquire_mpart_tl2br( BLIS_SUBPART11, ij, b_alg, a, &a11 ); bli_acquire_mpart_tl2br( BLIS_SUBPART21, ij, b_alg, a, &a21 ); bli_acquire_vpart_f2b( BLIS_SUBPART1, ij, b_alg, x, &x1 ); bli_acquire_vpart_f2b( BLIS_SUBPART2, ij, b_alg, x, &x2 ); // Initialize objects for packing A11 and x1 (if needed). bli_packm_init( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_init( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // Copy/pack A11, x1 (if needed). bli_packm_int( &a11, &a11_pack, cntx, bli_cntl_sub_packm_a11( cntl ), &BLIS_PACKM_SINGLE_THREADED ); bli_packv_int( &x1, &x1_pack, cntx, bli_cntl_sub_packv_x1( cntl ) ); // x1 = x1 / tril( A11 ); bli_trsv_int( &BLIS_ONE, &a11_pack, &x1_pack, cntx, bli_cntl_sub_trsv( cntl ) ); // x2 = x2 - A21 * x1; bli_gemv_int( BLIS_NO_TRANSPOSE, BLIS_NO_CONJUGATE, &BLIS_MINUS_ONE, &a21, &x1_pack, &BLIS_ONE, &x2, cntx, bli_cntl_sub_gemv_cp( cntl ) ); // Copy/unpack x1 (if x1 was packed). bli_unpackv_int( &x1_pack, &x1, cntx, bli_cntl_sub_unpackv_x1( cntl ) ); } // If any packing buffers were acquired within packm, release them back // to the memory manager. 
bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) ); bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) ); } cython-blis-0.9.1/blis/_src/frame/2/trsv/other/bli_trsv_u_blk_var1.c000066400000000000000000000102551427272030600253100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

//
// bli_trsv_u_blk_var1()
//
// Blocked trsv variant that walks the diagonal of A from bottom-right to
// top-left (backward blocksize query, br2tl/b2f partitioning).
// x is first scaled by alpha. Then, for each diagonal block:
//   1. A11 and x1 are packed (if the control tree requests it),
//   2. x1 is updated with x1 - A12 * x2, where x2 is the part of x
//      handled by earlier iterations,
//   3. x1 is solved against the diagonal block A11,
//   4. x1 is unpacked back into x.
// x is overwritten in place; nothing is returned.
//
//   alpha  scalar applied to x before the solve.
//   a      matrix object whose diagonal blocks are solved against.
//   x      vector object, updated in place.
//   cntx   context forwarded to every sub-operation.
//   cntl   control tree node supplying the blocksize id and the
//          sub-nodes for scalv, packm, packv, gemv, trsv, and unpackv.
//
void bli_trsv_u_blk_var1( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  x,
                          cntx_t* cntx,
                          trsv_t* cntl )
{
	obj_t   a11, a11_pack;
	obj_t   a12;
	obj_t   x1, x1_pack;
	obj_t   x2;

	dim_t   mn;
	dim_t   ij;
	dim_t   b_alg;

	// Initialize objects for packing.
	bli_obj_init_pack( &a11_pack );
	bli_obj_init_pack( &x1_pack );

	// Query dimension.
	mn = bli_obj_length( a );

	// x = alpha * x;
	bli_scalv_int( alpha,
	               x,
	               cntx,
	               bli_cntl_sub_scalv( cntl ) );

	// Partition diagonally.
	// NOTE: b_alg is assigned at the top of the loop body, so it is always
	// set before the increment expression reads it.
	for ( ij = 0; ij < mn; ij += b_alg )
	{
		// Determine the current algorithmic blocksize.
		b_alg = bli_determine_blocksize_b( ij, mn, a,
		                                   bli_cntl_bszid( cntl ), cntx );

		// Acquire partitions for A11, A12, x1, and x2.
		bli_acquire_mpart_br2tl( BLIS_SUBPART11,
		                         ij, b_alg, a, &a11 );
		bli_acquire_mpart_br2tl( BLIS_SUBPART12,
		                         ij, b_alg, a, &a12 );
		bli_acquire_vpart_b2f( BLIS_SUBPART1,
		                       ij, b_alg, x, &x1 );
		bli_acquire_vpart_b2f( BLIS_SUBPART2,
		                       ij, b_alg, x, &x2 );

		// Initialize objects for packing A11 and x1 (if needed).
		bli_packm_init( &a11, &a11_pack,
		                cntx, bli_cntl_sub_packm_a11( cntl ) );
		bli_packv_init( &x1, &x1_pack,
		                cntx, bli_cntl_sub_packv_x1( cntl ) );

		// Copy/pack A11, x1 (if needed).
		bli_packm_int( &a11, &a11_pack,
		               cntx, bli_cntl_sub_packm_a11( cntl ),
		               &BLIS_PACKM_SINGLE_THREADED );
		bli_packv_int( &x1, &x1_pack,
		               cntx, bli_cntl_sub_packv_x1( cntl ) );

		// x1 = x1 - A12 * x2;
		bli_gemv_int( BLIS_NO_TRANSPOSE,
		              BLIS_NO_CONJUGATE,
		              &BLIS_MINUS_ONE,
		              &a12,
		              &x2,
		              &BLIS_ONE,
		              &x1_pack,
		              cntx,
		              bli_cntl_sub_gemv_rp( cntl ) );

		// x1 = x1 / triu( A11 );
		// (A11 is a diagonal block of the upper-triangular operand.)
		bli_trsv_int( &BLIS_ONE,
		              &a11_pack,
		              &x1_pack,
		              cntx,
		              bli_cntl_sub_trsv( cntl ) );

		// Copy/unpack x1 (if x1 was packed).
		bli_unpackv_int( &x1_pack, &x1,
		                 cntx, bli_cntl_sub_unpackv_x1( cntl ) );
	}

	// If any packing buffers were acquired within packm, release them back
	// to the memory manager.
	bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
	bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
}
cython-blis-0.9.1/blis/_src/frame/2/trsv/other/bli_trsv_u_blk_var2.c000066400000000000000000000102551427272030600253110ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "blis.h"

//
// bli_trsv_u_blk_var2()
//
// Blocked trsv variant that walks the diagonal of A from bottom-right to
// top-left (backward blocksize query, br2tl/b2f partitioning).
// x is first scaled by alpha. Then, for each diagonal block:
//   1. A11 and x1 are packed (if the control tree requests it),
//   2. x1 is solved against the diagonal block A11,
//   3. the not-yet-solved subvector x0 is updated with x0 - A01 * x1,
//   4. x1 is unpacked back into x.
// x is overwritten in place; nothing is returned.
//
//   alpha  scalar applied to x before the solve.
//   a      matrix object whose diagonal blocks are solved against.
//   x      vector object, updated in place.
//   cntx   context forwarded to every sub-operation.
//   cntl   control tree node supplying the blocksize id and the
//          sub-nodes for scalv, packm, packv, trsv, gemv, and unpackv.
//
void bli_trsv_u_blk_var2( obj_t*  alpha,
                          obj_t*  a,
                          obj_t*  x,
                          cntx_t* cntx,
                          trsv_t* cntl )
{
	obj_t   a11, a11_pack;
	obj_t   a01;
	obj_t   x1, x1_pack;
	obj_t   x0;

	dim_t   mn;
	dim_t   ij;
	dim_t   b_alg;

	// Initialize objects for packing.
	bli_obj_init_pack( &a11_pack );
	bli_obj_init_pack( &x1_pack );

	// Query dimension.
	mn = bli_obj_length( a );

	// x = alpha * x;
	bli_scalv_int( alpha,
	               x,
	               cntx,
	               bli_cntl_sub_scalv( cntl ) );

	// Partition diagonally.
	// NOTE: b_alg is assigned at the top of the loop body, so it is always
	// set before the increment expression reads it.
	for ( ij = 0; ij < mn; ij += b_alg )
	{
		// Determine the current algorithmic blocksize.
		b_alg = bli_determine_blocksize_b( ij, mn, a,
		                                   bli_cntl_bszid( cntl ), cntx );

		// Acquire partitions for A11, A01, x1, and x0.
		bli_acquire_mpart_br2tl( BLIS_SUBPART11,
		                         ij, b_alg, a, &a11 );
		bli_acquire_mpart_br2tl( BLIS_SUBPART01,
		                         ij, b_alg, a, &a01 );
		bli_acquire_vpart_b2f( BLIS_SUBPART1,
		                       ij, b_alg, x, &x1 );
		bli_acquire_vpart_b2f( BLIS_SUBPART0,
		                       ij, b_alg, x, &x0 );

		// Initialize objects for packing A11 and x1 (if needed).
		bli_packm_init( &a11, &a11_pack,
		                cntx, bli_cntl_sub_packm_a11( cntl ) );
		bli_packv_init( &x1, &x1_pack,
		                cntx, bli_cntl_sub_packv_x1( cntl ) );

		// Copy/pack A11, x1 (if needed).
		bli_packm_int( &a11, &a11_pack,
		               cntx, bli_cntl_sub_packm_a11( cntl ),
		               &BLIS_PACKM_SINGLE_THREADED );
		bli_packv_int( &x1, &x1_pack,
		               cntx, bli_cntl_sub_packv_x1( cntl ) );

		// x1 = x1 / triu( A11 );
		// (A11 is a diagonal block of the upper-triangular operand.)
		bli_trsv_int( &BLIS_ONE,
		              &a11_pack,
		              &x1_pack,
		              cntx,
		              bli_cntl_sub_trsv( cntl ) );

		// x0 = x0 - A01 * x1;
		// The freshly solved x1 is read from its packed copy.
		bli_gemv_int( BLIS_NO_TRANSPOSE,
		              BLIS_NO_CONJUGATE,
		              &BLIS_MINUS_ONE,
		              &a01,
		              &x1_pack,
		              &BLIS_ONE,
		              &x0,
		              cntx,
		              bli_cntl_sub_gemv_cp( cntl ) );

		// Copy/unpack x1 (if x1 was packed).
		bli_unpackv_int( &x1_pack, &x1,
		                 cntx, bli_cntl_sub_unpackv_x1( cntl ) );
	}

	// If any packing buffers were acquired within packm, release them back
	// to the memory manager.
	bli_packm_release( &a11_pack, bli_cntl_sub_packm_a11( cntl ) );
	bli_packv_release( &x1_pack, bli_cntl_sub_packv_x1( cntl ) );
}
cython-blis-0.9.1/blis/_src/frame/3/000077500000000000000000000000001427272030600170725ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/3/bli_l3.h000066400000000000000000000062201427272030600204070ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "bli_l3_cntl.h" #include "bli_l3_check.h" #include "bli_l3_int.h" #include "bli_l3_packab.h" // Define function types. //#include "bli_l3_ft_ex.h" #include "bli_l3_ft_ukr.h" #include "bli_l3_oft.h" #include "bli_l3_oft_var.h" #include "bli_l3_blocksize.h" #include "bli_l3_direct.h" #include "bli_l3_prune.h" #include "bli_l3_schema.h" // Prototype object APIs (basic and expert). #include "bli_l3_oapi.h" #include "bli_l3_oapi_ex.h" // Prototype typed APIs (basic and expert). #include "bli_l3_tapi.h" #include "bli_l3_tapi_ex.h" // Define function types for small/unpacked handlers/kernels. #include "bli_l3_sup_oft.h" #include "bli_l3_sup_ft_ker.h" // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. #include "bli_l3_sup.h" // Prototype reference implementation of small/unpacked matrix handler. #include "bli_l3_sup_ref.h" #include "bli_l3_sup_int.h" #include "bli_l3_sup_vars.h" #include "bli_l3_sup_packm_a.h" #include "bli_l3_sup_packm_b.h" #include "bli_l3_sup_packm_var.h" // Prototype microkernel wrapper APIs. #include "bli_l3_ukr_oapi.h" #include "bli_l3_ukr_tapi.h" // Generate function pointer arrays for tapi microkernel functions. #include "bli_l3_ukr_fpa.h" // Operation-specific headers. #include "bli_gemm.h" #include "bli_hemm.h" #include "bli_symm.h" #include "bli_trmm.h" #include "bli_trmm3.h" #include "bli_trsm.h" #include "bli_gemmt.h" cython-blis-0.9.1/blis/_src/frame/3/bli_l3_blocksize.c000066400000000000000000000246151427272030600224570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

//
// bli_l3_determine_kc()
//
// Return the kc blocksize to use at offset i of a dimension of size dim.
// The operation family is read from the control tree node via
// bli_cntl_family( cntl ) and the request is forwarded to the matching
// gemm, gemmt, trmm, or trsm routine. Any other family falls through to
// the gemm routine.
//
dim_t bli_l3_determine_kc
     (
       dir_t    direct,
       dim_t    i,
       dim_t    dim,
       obj_t*   a,
       obj_t*   b,
       bszid_t  bszid,
       cntx_t*  cntx,
       cntl_t*  cntl
     )
{
	opid_t family = bli_cntl_family( cntl );

	if      ( family == BLIS_GEMM )
		return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx );
	else if ( family == BLIS_GEMMT )
		return bli_gemmt_determine_kc( direct, i, dim, a, b, bszid, cntx );
	else if ( family == BLIS_TRMM )
		return bli_trmm_determine_kc( direct, i, dim, a, b, bszid, cntx );
	else if ( family == BLIS_TRSM )
		return bli_trsm_determine_kc( direct, i, dim, a, b, bszid, cntx );

	// This should never execute.
	return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx );
}

// -----------------------------------------------------------------------------

//
// NOTE: We call a gemm/hemm/symm, trmm, or trsm-specific blocksize
// function to determine the kc blocksize so that we can implement the
// "nudging" of kc to be a multiple of mr or nr, as needed.
//

// Generator for the direction-dispatching front-ends: the generated
// function calls the operation's _determine_kc_f() routine when
// direct == BLIS_FWD and its _determine_kc_b() routine otherwise.
#undef  GENFRONT
#define GENFRONT( opname, l3op ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dir_t    direct, \
       dim_t    i, \
       dim_t    dim, \
       obj_t*   a, \
       obj_t*   b, \
       bszid_t  bszid, \
       cntx_t*  cntx  \
     ) \
{ \
	if ( direct == BLIS_FWD ) \
		return PASTEMAC(l3op,_determine_kc_f)( i, dim, a, b, bszid, cntx ); \
	else \
		return PASTEMAC(l3op,_determine_kc_b)( i, dim, a, b, bszid, cntx ); \
}

GENFRONT( gemm_determine_kc, gemm )
GENFRONT( gemmt_determine_kc, gemmt )
GENFRONT( trmm_determine_kc, trmm )
GENFRONT( trsm_determine_kc, trsm )

// -----------------------------------------------------------------------------

// Generator for the gemm kc routines. kc is aligned to MR or NR only when
// the root of A or B (respectively) is Hermitian or symmetric.
#undef  GENFRONT
#define GENFRONT( opname, chdir ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t    i, \
       dim_t    dim, \
       obj_t*   a, \
       obj_t*   b, \
       bszid_t  bszid, \
       cntx_t*  cntx  \
     ) \
{ \
	num_t    dt; \
	blksz_t* bsize; \
	dim_t    mnr; \
	dim_t    b_alg, b_max; \
	dim_t    b_use; \
\
	/* bli_*_determine_kc_f(): We assume that this function is being called
	   from an algorithm that is moving "forward" (ie: top to bottom, left
	   to right, top-left to bottom-right). */ \
\
	/* bli_*_determine_kc_b(): We assume that this function is being called
	   from an algorithm that is moving "backward" (ie: bottom to top, right
	   to left, bottom-right to top-left). */ \
\
	/* Extract the execution datatype and use it to query the corresponding
	   blocksize and blocksize maximum values from the blksz_t object. */ \
	dt    = bli_obj_exec_dt( a ); \
	bsize = bli_cntx_get_blksz( bszid, cntx ); \
	b_alg = bli_blksz_get_def( dt, bsize ); \
	b_max = bli_blksz_get_max( dt, bsize ); \
\
	/* Nudge the default and maximum kc blocksizes up to the nearest
	   multiple of MR if A is Hermitian or symmetric, or NR if B is
	   Hermitian or symmetric. If neither case applies, then we leave
	   the blocksizes unchanged. */ \
	if      ( bli_obj_root_is_herm_or_symm( a ) ) \
	{ \
		mnr   = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
		b_alg = bli_align_dim_to_mult( b_alg, mnr ); \
		b_max = bli_align_dim_to_mult( b_max, mnr ); \
	} \
	else if ( bli_obj_root_is_herm_or_symm( b ) ) \
	{ \
		mnr   = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
		b_alg = bli_align_dim_to_mult( b_alg, mnr ); \
		b_max = bli_align_dim_to_mult( b_max, mnr ); \
	} \
\
	/* Call the bli_determine_blocksize_[fb]_sub() helper routine defined
	   in bli_blksz.c */ \
	b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
\
	return b_use; \
}

GENFRONT( gemm_determine_kc_f, f )
GENFRONT( gemm_determine_kc_b, b )

// -----------------------------------------------------------------------------

// Generator for the gemmt kc routines. The queried default/maximum
// blocksizes are used without any alignment.
#undef  GENFRONT
#define GENFRONT( opname, chdir ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t    i, \
       dim_t    dim, \
       obj_t*   a, \
       obj_t*   b, \
       bszid_t  bszid, \
       cntx_t*  cntx  \
     ) \
{ \
	num_t    dt; \
	blksz_t* bsize; \
	dim_t    b_alg, b_max; \
	dim_t    b_use; \
\
	/* bli_*_determine_kc_f(): We assume that this function is being called
	   from an algorithm that is moving "forward" (ie: top to bottom, left
	   to right, top-left to bottom-right). */ \
\
	/* bli_*_determine_kc_b(): We assume that this function is being called
	   from an algorithm that is moving "backward" (ie: bottom to top, right
	   to left, bottom-right to top-left). */ \
\
	/* Extract the execution datatype and use it to query the corresponding
	   blocksize and blocksize maximum values from the blksz_t object. */ \
	dt    = bli_obj_exec_dt( a ); \
	bsize = bli_cntx_get_blksz( bszid, cntx ); \
	b_alg = bli_blksz_get_def( dt, bsize ); \
	b_max = bli_blksz_get_max( dt, bsize ); \
\
	/* Notice that for gemmt, we do not need to perform any special handling
	   for the default and maximum kc blocksizes vis-a-vis MR or NR. */ \
\
	/* Call the bli_determine_blocksize_[fb]_sub() helper routine defined
	   in bli_blksz.c */ \
	b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
\
	return b_use; \
}

GENFRONT( gemmt_determine_kc_f, f )
GENFRONT( gemmt_determine_kc_b, b )

// -----------------------------------------------------------------------------

// Generator for the trmm kc routines. kc is always aligned: to MR when
// the root of A is triangular, to NR otherwise.
#undef  GENFRONT
#define GENFRONT( opname, chdir ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t    i, \
       dim_t    dim, \
       obj_t*   a, \
       obj_t*   b, \
       bszid_t  bszid, \
       cntx_t*  cntx  \
     ) \
{ \
	num_t    dt; \
	blksz_t* bsize; \
	dim_t    mnr; \
	dim_t    b_alg, b_max; \
	dim_t    b_use; \
\
	/* bli_*_determine_kc_f(): We assume that this function is being called
	   from an algorithm that is moving "forward" (ie: top to bottom, left
	   to right, top-left to bottom-right). */ \
\
	/* bli_*_determine_kc_b(): We assume that this function is being called
	   from an algorithm that is moving "backward" (ie: bottom to top, right
	   to left, bottom-right to top-left). */ \
\
	/* Extract the execution datatype and use it to query the corresponding
	   blocksize and blocksize maximum values from the blksz_t object. */ \
	dt    = bli_obj_exec_dt( a ); \
	bsize = bli_cntx_get_blksz( bszid, cntx ); \
	b_alg = bli_blksz_get_def( dt, bsize ); \
	b_max = bli_blksz_get_max( dt, bsize ); \
\
	/* Nudge the default and maximum kc blocksizes up to the nearest
	   multiple of MR if the triangular matrix is on the left, or NR
	   if the triangular matrix is on the right. */ \
	if ( bli_obj_root_is_triangular( a ) ) \
		mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
	else \
		mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
\
	b_alg = bli_align_dim_to_mult( b_alg, mnr ); \
	b_max = bli_align_dim_to_mult( b_max, mnr ); \
\
	/* Call the bli_determine_blocksize_[fb]_sub() helper routine defined
	   in bli_blksz.c */ \
	b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
\
	return b_use; \
}

GENFRONT( trmm_determine_kc_f, f )
GENFRONT( trmm_determine_kc_b, b )

// -----------------------------------------------------------------------------

// Generator for the trsm kc routines. kc is always aligned to MR.
#undef  GENFRONT
#define GENFRONT( opname, chdir ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t    i, \
       dim_t    dim, \
       obj_t*   a, \
       obj_t*   b, \
       bszid_t  bszid, \
       cntx_t*  cntx  \
     ) \
{ \
	num_t    dt; \
	blksz_t* bsize; \
	dim_t    mnr; \
	dim_t    b_alg, b_max; \
	dim_t    b_use; \
\
	/* bli_*_determine_kc_f(): We assume that this function is being called
	   from an algorithm that is moving "forward" (ie: top to bottom, left
	   to right, top-left to bottom-right). */ \
\
	/* bli_*_determine_kc_b(): We assume that this function is being called
	   from an algorithm that is moving "backward" (ie: bottom to top, right
	   to left, bottom-right to top-left). */ \
\
	/* Extract the execution datatype and use it to query the corresponding
	   blocksize and blocksize maximum values from the blksz_t object. */ \
	dt    = bli_obj_exec_dt( a ); \
	bsize = bli_cntx_get_blksz( bszid, cntx ); \
	b_alg = bli_blksz_get_def( dt, bsize ); \
	b_max = bli_blksz_get_max( dt, bsize ); \
\
	/* Nudge the default and maximum kc blocksizes up to the nearest
	   multiple of MR. We always use MR (rather than sometimes using NR)
	   because even when the triangle is on the right, packing of that
	   matrix uses MR, since only left-side trsm micro-kernels are
	   supported. */ \
	mnr   = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
	b_alg = bli_align_dim_to_mult( b_alg, mnr ); \
	b_max = bli_align_dim_to_mult( b_max, mnr ); \
\
	/* Call the bli_determine_blocksize_[fb]_sub() helper routine defined
	   in bli_blksz.c */ \
	b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \
\
	return b_use; \
}

GENFRONT( trsm_determine_kc_f, f )
GENFRONT( trsm_determine_kc_b, b )
cython-blis-0.9.1/blis/_src/frame/3/bli_l3_blocksize.h000066400000000000000000000053001427272030600224520ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// Prototype for the kc blocksize query that takes a control tree node in
// addition to the direction, offset, dimension, operands, blocksize id,
// and context.
dim_t bli_l3_determine_kc
     (
       dir_t    direct,
       dim_t    i,
       dim_t    dim,
       obj_t*   a,
       obj_t*   b,
       bszid_t  bszid,
       cntx_t*  cntx,
       cntl_t*  cntl
     );


// Prototypes for the per-operation kc queries that take a direction
// argument.
#undef  GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dir_t    direct, \
       dim_t    i, \
       dim_t    dim, \
       obj_t*   a, \
       obj_t*   b, \
       bszid_t  bszid, \
       cntx_t*  cntx  \
     );

GENPROT( gemm_determine_kc )
GENPROT( gemmt_determine_kc )
GENPROT( trmm_determine_kc )
GENPROT( trsm_determine_kc )


// Prototypes for the per-operation, per-direction (_f / _b) kc queries,
// which take no direction argument.
#undef  GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t    i, \
       dim_t    dim, \
       obj_t*   a, \
       obj_t*   b, \
       bszid_t  bszid, \
       cntx_t*  cntx  \
     );

GENPROT( gemm_determine_kc_f )
GENPROT( gemm_determine_kc_b )

GENPROT( gemmt_determine_kc_f )
GENPROT( gemmt_determine_kc_b )

GENPROT( trmm_determine_kc_f )
GENPROT( trmm_determine_kc_b )

GENPROT( trsm_determine_kc_f )
GENPROT( trsm_determine_kc_b )
cython-blis-0.9.1/blis/_src/frame/3/bli_l3_check.c000066400000000000000000000320431427272030600215410ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

// Validate the operands of gemm. Only the basic checks are run; the
// structure checks on A and B are intentionally disabled (see NOTE below).
void bli_gemm_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	//err_t e_val;

	// Check basic properties of the operation.

	bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

	// Check object structure.

	// NOTE: Can't perform these checks as long as bli_gemm_check() is called
	// from bli_l3_int(), which is in the execution path for structured
	// level-3 operations such as hemm.

	//e_val = bli_check_general_object( a );
	//bli_check_error_code( e_val );

	//e_val = bli_check_general_object( b );
	//bli_check_error_code( e_val );
}

// Validate the operands of gemmt: basic checks plus squareness of C.
void bli_gemmt_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t e_val;

	// Check basic properties of the operation.

	bli_gemmt_basic_check( alpha, a, b, beta, c, cntx );

	// Check matrix squareness.

	e_val = bli_check_square_object( c );
	bli_check_error_code( e_val );
}

// Validate the operands of hemm: shared side-aware checks plus a check
// that A is Hermitian.
void bli_hemm_check
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t e_val;

	// Perform checks common to hemm/symm/trmm/trsm.

	bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx );

	// Check object structure.

	e_val = bli_check_hermitian_object( a );
	bli_check_error_code( e_val );
}

// Validate the operands of herk: alpha and beta must be real-valued and C
// must be Hermitian, in addition to the basic herk checks.
void bli_herk_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t e_val;
	obj_t ah;

	// Alias A to A^H so we can perform dimension checks.
	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &ah );

	// Check basic properties of the operation.

	bli_herk_basic_check( alpha, a, &ah, beta, c, cntx );

	// Check for real-valued alpha and beta.

	e_val = bli_check_real_valued_object( alpha );
	bli_check_error_code( e_val );

	e_val = bli_check_real_valued_object( beta );
	bli_check_error_code( e_val );

	// Check matrix structure.

	e_val = bli_check_hermitian_object( c );
	bli_check_error_code( e_val );
}

// Validate the operands of her2k: beta must be real-valued and C must be
// Hermitian, in addition to the basic her2k checks.
void bli_her2k_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t e_val;
	obj_t ah, bh;

	// Alias A and B to A^H and B^H so we can perform dimension checks.
	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &ah );
	bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, b, &bh );

	// Check basic properties of the operation.

	bli_her2k_basic_check( alpha, a, &bh, b, &ah, beta, c, cntx );

	// Check for real-valued beta.

	e_val = bli_check_real_valued_object( beta );
	bli_check_error_code( e_val );

	// Check matrix structure.

	e_val = bli_check_hermitian_object( c );
	bli_check_error_code( e_val );
}

// Validate the operands of symm: shared side-aware checks plus a check
// that A is symmetric.
void bli_symm_check
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t e_val;

	// Check basic properties of the operation.

	bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx );

	// Check object structure.

	e_val = bli_check_symmetric_object( a );
	bli_check_error_code( e_val );
}

// Validate the operands of syrk: basic herk checks (with A^T in place of
// A^H) plus a check that C is symmetric.
void bli_syrk_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t e_val;
	obj_t at;

	// Alias A to A^T so we can perform dimension checks.
	bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &at );

	// Check basic properties of the operation.

	bli_herk_basic_check( alpha, a, &at, beta, c, cntx );

	// Check matrix structure.

	e_val = bli_check_symmetric_object( c );
	bli_check_error_code( e_val );
}

// Validate the operands of syr2k: basic her2k checks (with A^T and B^T in
// place of A^H and B^H) plus a check that C is symmetric.
void bli_syr2k_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t e_val;
	obj_t at, bt;

	// Alias A and B to A^T and B^T so we can perform dimension checks.
	bli_obj_alias_with_trans( BLIS_TRANSPOSE, a, &at );
	bli_obj_alias_with_trans( BLIS_TRANSPOSE, b, &bt );

	// Check basic properties of the operation.

	bli_her2k_basic_check( alpha, a, &bt, b, &at, beta, c, cntx );

	// Check matrix structure.

	e_val = bli_check_symmetric_object( c );
	bli_check_error_code( e_val );
}

// Validate the operands of trmm3: shared side-aware checks plus a check
// that A is triangular.
void bli_trmm3_check
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t e_val;

	// Perform checks common to hemm/symm/trmm/trsm.

	bli_hemm_basic_check( side, alpha, a, b, beta, c, cntx );

	// Check object structure.

	e_val = bli_check_triangular_object( a );
	bli_check_error_code( e_val );
}

// Validate the operands of trmm. trmm takes no beta or C, so B is passed
// in the C slot and BLIS_ZERO in the beta slot of the shared check.
void bli_trmm_check
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       cntx_t* cntx
     )
{
	err_t e_val;

	// Perform checks common to hemm/symm/trmm/trsm.

	bli_hemm_basic_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );

	// Check object structure.

	e_val = bli_check_triangular_object( a );
	bli_check_error_code( e_val );
}

// Validate the operands of trsm. trsm takes no beta or C, so B is passed
// in the C slot and BLIS_ZERO in the beta slot of the shared check.
void bli_trsm_check
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       cntx_t* cntx
     )
{
	err_t e_val;

	// Perform checks common to hemm/symm/trmm/trsm.

	bli_hemm_basic_check( side, alpha, a, b, &BLIS_ZERO, b, cntx );

	// Check object structure.

	e_val = bli_check_triangular_object( a );
	bli_check_error_code( e_val );
}

// -----------------------------------------------------------------------------

// Basic gemm checks: standard level-3 checks, dimension conformality, and
// either the mixed-datatype alpha restriction (BLIS_ENABLE_GEMM_MD) or
// datatype consistency of A and B with C.
void bli_gemm_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t e_val;

	// Perform standard checks.

	bli_l3_basic_check( alpha, a, b, beta, c, cntx );

	// Check object dimensions.

	e_val = bli_check_level3_dims( a, b, c );
	bli_check_error_code( e_val );

#ifdef BLIS_ENABLE_GEMM_MD
	// Skip checking for consistent datatypes between A, B, and C since
	// that is totally valid for mixed-datatype gemm.

	// When mixing datatypes, make sure that alpha does not have a non-zero
	// imaginary component.
	if ( bli_obj_dt( c ) != bli_obj_dt( a ) ||
	     bli_obj_dt( c ) != bli_obj_dt( b ) ||
	     bli_obj_comp_prec( c ) != bli_obj_prec( c ) )
	if ( !bli_obj_imag_is_zero( alpha ) )
	{
		bli_print_msg( "Mixed-datatype gemm does not yet support alpha with a non-zero imaginary component. Please contact BLIS developers for further support.", __FILE__, __LINE__ );
		bli_abort();
	}

#else // BLIS_DISABLE_GEMM_MD

	// Check for consistent datatypes.
	// NOTE: We only perform these tests when mixed datatype support is
	// disabled.

	e_val = bli_check_consistent_object_datatypes( c, a );
	bli_check_error_code( e_val );

	e_val = bli_check_consistent_object_datatypes( c, b );
	bli_check_error_code( e_val );
#endif
}

// Basic gemmt checks: standard level-3 checks plus dimension conformality.
void bli_gemmt_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t e_val;

	// Perform standard checks.

	bli_l3_basic_check( alpha, a, b, beta, c, cntx );

	// Check object dimensions.

	e_val = bli_check_level3_dims( a, b, c );
	bli_check_error_code( e_val );
}

// Basic checks shared by the side-aware operations: standard level-3
// checks, dimension conformality (operand order depends on side),
// squareness of A, and datatype consistency of A and B with C.
void bli_hemm_basic_check
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t e_val;

	// Perform standard checks.

	bli_l3_basic_check( alpha, a, b, beta, c, cntx );

	// Check object dimensions.

	if ( bli_is_left( side ) )
	{
		e_val = bli_check_level3_dims( a, b, c );
		bli_check_error_code( e_val );
	}
	else // if ( bli_is_right( side ) )
	{
		e_val = bli_check_level3_dims( b, a, c );
		bli_check_error_code( e_val );
	}

	// Check matrix squareness.

	e_val = bli_check_square_object( a );
	bli_check_error_code( e_val );

	// Check for consistent datatypes.

	e_val = bli_check_consistent_object_datatypes( c, a );
	bli_check_error_code( e_val );

	e_val = bli_check_consistent_object_datatypes( c, b );
	bli_check_error_code( e_val );
}

// Basic herk/syrk checks. ah is the caller-supplied transposed (or
// conjugate-transposed) alias of a.
void bli_herk_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  ah,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t e_val;

	// Perform standard checks.

	bli_l3_basic_check( alpha, a, ah, beta, c, cntx );

	// Check object dimensions.

	e_val = bli_check_level3_dims( a, ah, c );
	bli_check_error_code( e_val );

	// Check matrix squareness.

	e_val = bli_check_square_object( c );
	bli_check_error_code( e_val );

	// Check matrix structure.

	e_val = bli_check_general_object( a );
	bli_check_error_code( e_val );

	e_val = bli_check_general_object( ah );
	bli_check_error_code( e_val );

	// Check for consistent datatypes.

	e_val = bli_check_consistent_object_datatypes( c, a );
	bli_check_error_code( e_val );

	e_val = bli_check_consistent_object_datatypes( c, ah );
	bli_check_error_code( e_val );
}

// Basic her2k/syr2k checks. ah and bh are the caller-supplied transposed
// (or conjugate-transposed) aliases of a and b; both products
// (a * bh and b * ah) are checked against c.
void bli_her2k_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  bh,
       obj_t*  b,
       obj_t*  ah,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t e_val;

	// Perform standard checks.

	bli_l3_basic_check( alpha, a, bh, beta, c, cntx );
	bli_l3_basic_check( alpha, b, ah, beta, c, cntx );

	// Check object dimensions.

	e_val = bli_check_level3_dims( a, bh, c );
	bli_check_error_code( e_val );

	e_val = bli_check_level3_dims( b, ah, c );
	bli_check_error_code( e_val );

	// Check matrix squareness.

	e_val = bli_check_square_object( c );
	bli_check_error_code( e_val );

	// Check matrix structure.

	e_val = bli_check_general_object( a );
	bli_check_error_code( e_val );

	e_val = bli_check_general_object( bh );
	bli_check_error_code( e_val );

	e_val = bli_check_general_object( b );
	bli_check_error_code( e_val );

	e_val = bli_check_general_object( ah );
	bli_check_error_code( e_val );

	// Check for consistent datatypes.

	e_val = bli_check_consistent_object_datatypes( c, a );
	bli_check_error_code( e_val );

	e_val = bli_check_consistent_object_datatypes( c, ah );
	bli_check_error_code( e_val );

	e_val = bli_check_consistent_object_datatypes( c, b );
	bli_check_error_code( e_val );

	e_val = bli_check_consistent_object_datatypes( c, bh );
	bli_check_error_code( e_val );
}

// Standard checks applied to every level-3 operation: datatype class of
// each operand, scalar/matrix shape, and buffer validity.
void bli_l3_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     )
{
	err_t e_val;

	// Check object datatypes.

	e_val = bli_check_noninteger_object( alpha );
	bli_check_error_code( e_val );

	e_val = bli_check_noninteger_object( beta );
	bli_check_error_code( e_val );

	e_val = bli_check_floating_object( a );
	bli_check_error_code( e_val );

	e_val = bli_check_floating_object( b );
	bli_check_error_code( e_val );

	e_val = bli_check_floating_object( c );
	bli_check_error_code( e_val );

	// Check object dimensions.

	e_val = bli_check_scalar_object( alpha );
	bli_check_error_code( e_val );

	e_val = bli_check_scalar_object( beta );
	bli_check_error_code( e_val );

	e_val = bli_check_matrix_object( a );
	bli_check_error_code( e_val );

	e_val = bli_check_matrix_object( b );
	bli_check_error_code( e_val );

	e_val = bli_check_matrix_object( c );
	bli_check_error_code( e_val );

	// Check object buffers (for non-NULLness).

	e_val = bli_check_object_buffer( alpha );
	bli_check_error_code( e_val );

	e_val = bli_check_object_buffer( a );
	bli_check_error_code( e_val );

	e_val = bli_check_object_buffer( b );
	bli_check_error_code( e_val );

	e_val = bli_check_object_buffer( beta );
	bli_check_error_code( e_val );

	e_val = bli_check_object_buffer( c );
	bli_check_error_code( e_val );
}
cython-blis-0.9.1/blis/_src/frame/3/bli_l3_check.h000066400000000000000000000074561427272030600215600ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/


//
// Prototype object-based check functions.
//

// GENPROT is redefined four times below, once per argument-list shape, and
// each GENPROT( op ) line stamps out one prototype of that shape.
// NOTE(review): PASTEMAC is defined outside this section; presumably
// PASTEMAC(gemm,_check) expands to a bli_-prefixed name such as
// bli_gemm_check -- confirm against the macro definition.

// Shape 1: ( alpha, a, b, beta, c, cntx ).
#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx  \
    );

GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )


// Shape 2: shape 1 with a leading side_t argument.
#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t  side, \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx  \
    );

GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )


// Shape 3: a single input matrix, ( alpha, a, beta, c, cntx ).
#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx  \
    );

GENPROT( herk )
GENPROT( syrk )


// Shape 4: sided, with no beta and no separate output object,
// ( side, alpha, a, b, cntx ).
#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t  side, \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       cntx_t* cntx  \
    );

GENPROT( trmm )
GENPROT( trsm )


// -----------------------------------------------------------------------------

// Prototypes of the *_basic_check helpers. The herk and her2k variants take
// their operands as explicit pairs (a, ah) and (a, bh), (b, ah), so their
// argument lists differ from the generated *_check prototypes above.

void bli_gemm_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     );

void bli_gemmt_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     );

void bli_hemm_basic_check
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     );

void bli_herk_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  ah,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     );

void bli_her2k_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  bh,
       obj_t*  b,
       obj_t*  ah,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     );

void bli_l3_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     );
cython-blis-0.9.1/blis/_src/frame/3/bli_l3_cntl.c000066400000000000000000000071311427272030600214240ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/ #include "blis.h" void bli_l3_cntl_create_if ( opid_t family, pack_t schema_a, pack_t schema_b, obj_t* a, obj_t* b, obj_t* c, rntm_t* rntm, cntl_t* cntl_orig, cntl_t** cntl_use ) { // If the control tree pointer is NULL, we construct a default // tree as a function of the operation family. if ( cntl_orig == NULL ) { if ( family == BLIS_GEMM || family == BLIS_GEMMT || family == BLIS_TRMM ) { *cntl_use = bli_gemm_cntl_create ( rntm, family, schema_a, schema_b, bli_obj_ker_fn( c ) ); } else // if ( family == BLIS_TRSM ) { side_t side; if ( bli_obj_is_triangular( a ) ) side = BLIS_LEFT; else side = BLIS_RIGHT; *cntl_use = bli_trsm_cntl_create ( rntm, side, schema_a, schema_b, bli_obj_ker_fn( c ) ); } } else { // If the user provided a control tree, create a copy and use it // instead (so that threads can use its local tree as a place to // cache things like pack mem_t entries). *cntl_use = bli_cntl_copy( rntm, cntl_orig ); // Recursively set the family fields of the newly copied control tree // nodes. bli_cntl_mark_family( family, *cntl_use ); } } void bli_l3_cntl_free ( rntm_t* rntm, cntl_t* cntl_use, thrinfo_t* thread ) { // NOTE: We don't actually need to call separate _cntl_free() functions // for gemm and trsm; it is merely an unnecessary mirroring of behavior // from the _create() side (which must call different functions based // on the family). opid_t family = bli_cntl_family( cntl_use ); if ( family == BLIS_GEMM || family == BLIS_GEMMT || family == BLIS_TRMM ) { bli_gemm_cntl_free( rntm, cntl_use, thread ); } else // if ( family == BLIS_TRSM ) { bli_trsm_cntl_free( rntm, cntl_use, thread ); } } cython-blis-0.9.1/blis/_src/frame/3/bli_l3_cntl.h000066400000000000000000000041631427272030600214330ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/


//
// Prototype conditional control tree creation functions.
//

// Stores in *cntl_use the control tree to run with: a copy of cntl_orig when
// one is given, otherwise a default tree chosen by 'family'.
void bli_l3_cntl_create_if
     (
       opid_t   family,
       pack_t   schema_a,
       pack_t   schema_b,
       obj_t*   a,
       obj_t*   b,
       obj_t*   c,
       rntm_t*  rntm,
       cntl_t*  cntl_orig,
       cntl_t** cntl_use
     );

// Frees a tree produced by bli_l3_cntl_create_if().
void bli_l3_cntl_free
     (
       rntm_t*    rntm,
       cntl_t*    cntl_use,
       thrinfo_t* thread
     );
cython-blis-0.9.1/blis/_src/frame/3/bli_l3_direct.c000066400000000000000000000074241427272030600217430ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" dir_t bli_l3_direct ( obj_t* a, obj_t* b, obj_t* c, cntl_t* cntl ) { // Query the operation family. opid_t family = bli_cntl_family( cntl ); if ( family == BLIS_GEMM ) return bli_gemm_direct( a, b, c ); else if ( family == BLIS_GEMMT ) return bli_gemmt_direct( a, b, c ); else if ( family == BLIS_TRMM ) return bli_trmm_direct( a, b, c ); else if ( family == BLIS_TRSM ) return bli_trsm_direct( a, b, c ); // This should never execute. 
return BLIS_FWD; } // ----------------------------------------------------------------------------- dir_t bli_gemm_direct ( obj_t* a, obj_t* b, obj_t* c ) { // For gemm, movement may be forwards (or backwards). return BLIS_FWD; } dir_t bli_gemmt_direct ( obj_t* a, obj_t* b, obj_t* c ) { // For gemmt, movement may be forwards (or backwards). return BLIS_FWD; } dir_t bli_trmm_direct ( obj_t* a, obj_t* b, obj_t* c ) { dir_t direct; // For trmm, movement for the parameter cases is as follows: // - left,lower: backwards // - left,upper: forwards // - right,lower: forwards // - right,upper: backwards if ( bli_obj_root_is_triangular( a ) ) { if ( bli_obj_root_is_lower( a ) ) direct = BLIS_BWD; else direct = BLIS_FWD; } else // if ( bli_obj_root_is_triangular( b ) ) { if ( bli_obj_root_is_lower( b ) ) direct = BLIS_FWD; else direct = BLIS_BWD; } return direct; } dir_t bli_trsm_direct ( obj_t* a, obj_t* b, obj_t* c ) { dir_t direct; // For trsm, movement for the parameter cases is as follows: // - left,lower: forwards // - left,upper: backwards // - right,lower: backwards // - right,upper: forwards if ( bli_obj_root_is_triangular( a ) ) { if ( bli_obj_root_is_lower( a ) ) direct = BLIS_FWD; else direct = BLIS_BWD; } else // if ( bli_obj_root_is_triangular( b ) ) { if ( bli_obj_root_is_lower( b ) ) direct = BLIS_BWD; else direct = BLIS_FWD; } return direct; } cython-blis-0.9.1/blis/_src/frame/3/bli_l3_direct.h000066400000000000000000000041051427272030600217410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// Family-dispatching entry point: takes the control tree in addition to the
// three operands.
dir_t bli_l3_direct
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// Per-family direction queries, all with the signature ( a, b, c ) -> dir_t.
// NOTE(review): PASTEMAC0 is defined outside this section; presumably it
// prefixes the operation name with bli_ (e.g. bli_gemm_direct) -- confirm.
#undef  GENPROT
#define GENPROT( opname ) \
\
dir_t PASTEMAC0(opname) \
     ( \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  c  \
     );

GENPROT( gemm_direct )
GENPROT( gemmt_direct )
GENPROT( trmm_direct )
GENPROT( trsm_direct )

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_ft_ukr.h000066400000000000000000000063071427272030600217670ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_L3_FT_UKR_H
#define BLIS_L3_FT_UKR_H

//
// -- Level-3 micro-kernel function types --------------------------------------
//

// Each GENTDEF below describes one function-pointer type whose name is built
// by PASTECH3 from ( ch, opname, _ukr, tsuf ).
// NOTE(review): INSERT_GENTDEF and PASTECH3 are defined outside this section;
// presumably INSERT_GENTDEF instantiates GENTDEF once per datatype -- confirm.

// gemm
// Arguments: dimensions m, n, k; scalars alpha and beta; input panels a and
// b; output c with its row and column strides; auxiliary info; context.

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

INSERT_GENTDEF( gemm )


// gemmtrsm_[lu]
// Same leading dimensions and alpha as gemm, but with four operand pointers
// (a1x, a11, bx1, b11), no beta, and output c11 with its strides.

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a1x, \
       ctype*     restrict a11, \
       ctype*     restrict bx1, \
       ctype*     restrict b11, \
       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

INSERT_GENTDEF( gemmtrsm )


// trsm_[lu]
// Takes no dimension or scalar arguments: only a, b, c with its strides,
// auxiliary info and context.

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

INSERT_GENTDEF( trsm )


#endif

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_ind.c000066400000000000000000000172431427272030600212430ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // This array tracks whether a particular operation is implemented for each of // the induced methods. static bool bli_l3_ind_oper_impl[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS] = { /* gemm gemmt hemm herk her2k symm syrk syr2k trmm3 trmm trsm */ /* 1m */ { TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE }, /* nat */ { TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE } }; // // NOTE: "2" is used instead of BLIS_NUM_FP_TYPES/2. // // BLIS provides APIs to modify this state during runtime. So, it's possible for one // application thread to modify the state before another starts the corresponding // BLIS operation. This is solved by making the induced method status array local to // threads. 
static BLIS_THREAD_LOCAL bool bli_l3_ind_oper_st[BLIS_NUM_IND_METHODS][BLIS_NUM_LEVEL3_OPS][2] = { /* gemm gemmt hemm herk her2k symm syrk syr2k trmm3 trmm trsm */ /* c z */ /* 1m */ { {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE}, {FALSE,FALSE} }, /* nat */ { {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE}, {TRUE,TRUE} }, }; // ----------------------------------------------------------------------------- #undef GENFUNC #define GENFUNC( opname, optype ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ) \ { \ return bli_l3_ind_oper_find_avail( optype, dt ); \ } //bool PASTEMAC(opname,ind_has_avail)( num_t dt ) //{ // return bli_ind_oper_has_avail( optype, dt ); //} GENFUNC( gemm, BLIS_GEMM ) GENFUNC( gemmt, BLIS_GEMMT ) GENFUNC( hemm, BLIS_HEMM ) GENFUNC( symm, BLIS_SYMM ) GENFUNC( trmm3, BLIS_TRMM3 ) GENFUNC( trmm, BLIS_TRMM ) GENFUNC( trsm, BLIS_TRSM ) // ----------------------------------------------------------------------------- #if 0 bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ) { bool enabled; bool stat; // If the datatype is real, it is never available. if ( !bli_is_complex( dt ) ) return FALSE; enabled = bli_l3_ind_oper_is_impl( oper, method ); stat = bli_l3_ind_oper_get_enable( oper, method, dt ); return ( enabled == TRUE && stat == TRUE ); } #endif // ----------------------------------------------------------------------------- ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ) { bli_init_once(); ind_t im; // If the datatype is real, return native execution. if ( !bli_is_complex( dt ) ) return BLIS_NAT; // If the operation is not level-3, return native execution. 
if ( !bli_opid_is_level3( oper ) ) return BLIS_NAT; // Iterate over all induced methods and search for the first one // that is available (ie: both implemented and enabled) for the // current operation and datatype. for ( im = 0; im < BLIS_NUM_IND_METHODS; ++im ) { bool enabled = bli_l3_ind_oper_is_impl( oper, im ); bool stat = bli_l3_ind_oper_get_enable( oper, im, dt ); if ( enabled == TRUE && stat == TRUE ) return im; } // This return statement should never execute since the native index // should be found even if all induced methods are unavailable. We // include it simply to avoid a compiler warning. return BLIS_NAT; } // ----------------------------------------------------------------------------- void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ) { opid_t iop; if ( !bli_is_complex( dt ) ) return; // Iterate over all level-3 operation ids. for ( iop = 0; iop < BLIS_NUM_LEVEL3_OPS; ++iop ) { bli_l3_ind_oper_set_enable( iop, method, dt, status ); } } // ----------------------------------------------------------------------------- void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ) { ind_t im; if ( !bli_is_complex( dt ) ) return; if ( !bli_opid_is_level3( oper ) ) return; for ( im = 0; im < BLIS_NUM_IND_METHODS; ++im ) { // Native execution should always stay enabled. if ( im == BLIS_NAT ) continue; // When we come upon the requested method, enable it for the given // operation and datatype. Otherwise, disable it. if ( im == method ) bli_l3_ind_oper_set_enable( oper, im, dt, TRUE ); else bli_l3_ind_oper_set_enable( oper, im, dt, FALSE ); } } void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ) { ind_t im; if ( !bli_is_complex( dt ) ) return; if ( !bli_opid_is_level3( oper ) ) return; for ( im = 0; im < BLIS_NUM_IND_METHODS; ++im ) { // Native execution should always stay enabled. 
if ( im != BLIS_NAT ) bli_l3_ind_oper_set_enable( oper, im, dt, status ); } } // ----------------------------------------------------------------------------- // A mutex to allow synchronous access to the bli_l3_ind_oper_st array. static bli_pthread_mutex_t oper_st_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER; void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ) { num_t idt; if ( !bli_is_complex( dt ) ) return; if ( !bli_opid_is_level3( oper ) ) return; // Disallow changing status of native execution. if ( method == BLIS_NAT ) return; idt = bli_ind_map_cdt_to_index( dt ); // Acquire the mutex protecting bli_l3_ind_oper_st. bli_pthread_mutex_lock( &oper_st_mutex ); // BEGIN CRITICAL SECTION { bli_l3_ind_oper_st[ method ][ oper ][ idt ] = status; } // END CRITICAL SECTION // Release the mutex protecting bli_l3_ind_oper_st. bli_pthread_mutex_unlock( &oper_st_mutex ); } bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ) { num_t idt = bli_ind_map_cdt_to_index( dt ); bool r_val; { r_val = bli_l3_ind_oper_st[ method ][ oper ][ idt ]; } return r_val; } // ----------------------------------------------------------------------------- bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ) { return bli_l3_ind_oper_impl[ method ][ oper ]; } cython-blis-0.9.1/blis/_src/frame/3/bli_l3_ind.h000066400000000000000000000053631427272030600212500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_L3_IND_H
#define BLIS_L3_IND_H

// -----------------------------------------------------------------------------

// Declares one <opname>ind_find_avail( dt ) query per listed operation.
// NOTE(review): PASTEMAC is defined outside this section; the generated
// names are presumably bli_gemmind_find_avail and so on -- confirm.
#undef  GENPROT
#define GENPROT( opname ) \
\
ind_t PASTEMAC(opname,ind_find_avail)( num_t dt );
/*bool  PASTEMAC(opname,ind_has_avail)( num_t dt ); */

GENPROT( gemm )
GENPROT( gemmt )
GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )
GENPROT( trmm )
GENPROT( trsm )

// -----------------------------------------------------------------------------

//bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt );

// Lookup: which induced method to use for an operation and datatype.
ind_t   bli_l3_ind_oper_find_avail( opid_t oper, num_t dt );

// Setters for the per-method, per-operation, per-datatype enable status.
void    bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status );

void    bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt );
void    bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status );

void    bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status );

// Getters: the current enable status, and whether a method implements an
// operation at all.
bool    bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt );

bool    bli_l3_ind_oper_is_impl( opid_t oper, ind_t method );

#endif

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_ind_ukr.h000066400000000000000000000062551427272030600221320ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

//
// Define template prototypes for level-3 micro-kernels.
//

// NOTE(review): INSERT_GENTPROT_BASIC0 and the *_ukr_name arguments are
// defined outside this section; presumably each INSERT_ line declares the
// named micro-kernel once per datatype -- confirm.

// 1m micro-kernels

// gemm-shaped prototype: m, n, k, alpha, a, b, beta, c with strides, then
// auxiliary info and context.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

INSERT_GENTPROT_BASIC0( gemm1m_ukr_name )


// gemmtrsm-shaped prototype: four operand pointers (a1x, a11, bx1, b11),
// no beta, output c11 with strides. Declared for the _l and _u kernels.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a1x, \
       ctype*     restrict a11, \
       ctype*     restrict bx1, \
       ctype*     restrict b11, \
       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

INSERT_GENTPROT_BASIC0( gemmtrsm1m_l_ukr_name )
INSERT_GENTPROT_BASIC0( gemmtrsm1m_u_ukr_name )


// trsm-shaped prototype: a, b, c with strides, auxiliary info and context;
// no dimensions or scalars. Declared for the _l and _u kernels.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

INSERT_GENTPROT_BASIC0( trsm1m_l_ukr_name )
INSERT_GENTPROT_BASIC0( trsm1m_u_ukr_name )
cython-blis-0.9.1/blis/_src/frame/3/bli_l3_int.c000066400000000000000000000111311427272030600212510ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

//
// Execute one control-tree node of a level-3 operation.
//
// Outline:
//   1. return at once if cntl is a null node;
//   2. optionally run the parameter checks;
//   3. handle the zero-dimension shortcuts;
//   4. alias a, b and c locally, so that the adjustments in steps 5-6 are
//      made on the aliases and not on the caller's objects;
//   5. ensure a pack function is set and fold any transposition of c into
//      its alias;
//   6. fold alpha and beta into the scalars attached to the aliases;
//   7. grow the thrinfo_t structure and call the variant function stored
//      in the node with the aliases.
//
// Nothing is returned; the operation's effect is whatever the invoked
// variant (or bli_scalm() in the shortcut path) does to c.
//
void bli_l3_int
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	obj_t a_local;
	obj_t b_local;
	obj_t c_local;

	// Return early if the current control tree node is NULL.
	if ( bli_cntl_is_null( cntl ) ) return;

	// Check parameters. Note that the gemm checks are applied here
	// regardless of which operation family the control tree carries.
	if ( bli_error_checking_is_enabled() )
		bli_gemm_basic_check( alpha, a, b, beta, c, cntx );

	// If C has a zero dimension, return early.
	if ( bli_obj_has_zero_dim( c ) )
	{
		return;
	}

	// If A or B has a zero dimension, scale C by beta and return early.
	// Only the thread for which bli_thread_am_ochief() holds performs the
	// scaling; every thread then meets at the barrier before returning.
	if ( bli_obj_has_zero_dim( a ) ||
	     bli_obj_has_zero_dim( b ) )
	{
		if ( bli_thread_am_ochief( thread ) )
			bli_scalm( beta, c );
		bli_thread_barrier( thread );
		return;
	}

	// If A or B is marked as being filled with zeros, scale C by beta and
	// return early.
	if ( bli_obj_is_zeros( a ) ||
	     bli_obj_is_zeros( b ) )
	{
		// This should never execute.
		// NOTE(review): bli_abort() is called before the scaling below;
		// presumably it does not return, which would make the remaining
		// statements of this branch unreachable -- confirm.
		bli_abort();

		if ( bli_thread_am_ochief( thread ) )
			bli_scalm( beta, c );
		bli_thread_barrier( thread );
		return;
	}

	// Alias A, B, and C in case we need to update attached scalars.
	bli_obj_alias_to( a, &a_local );
	bli_obj_alias_to( b, &b_local );
	bli_obj_alias_to( c, &c_local );

	// Ensure that a valid packing function is set on A and B; an object
	// that already carries one keeps it.
	if ( !bli_obj_pack_fn( &a_local ) )
		bli_obj_set_pack_fn( bli_packm_blk_var1, &a_local );

	if ( !bli_obj_pack_fn( &b_local ) )
		bli_obj_set_pack_fn( bli_packm_blk_var1, &b_local );

	// If we are about to call a leaf-level implementation, and matrix C
	// still needs a transposition, then we must induce one by swapping the
	// strides and dimensions. Note that this transposition would normally
	// be handled explicitly in the packing of C, but if C is not being
	// packed, this is our last chance to handle the transposition.
	// (The leaf-node condition is disabled below, so the transposition is
	// induced at whichever node first sees a transposed C.)
	//if ( bli_cntl_is_leaf( cntl ) && bli_obj_has_trans( c ) )
	if ( bli_obj_has_trans( c ) )
	{
		bli_obj_induce_trans( &c_local );
		bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &c_local );
	}

	// If alpha is non-unit, typecast and apply it to the scalar attached
	// to B, unless B's root object is triangular, in which case alpha is
	// applied to the scalar attached to A instead.
	if ( bli_obj_root_is_triangular( b ) )
	{
		if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
			bli_obj_scalar_apply_scalar( alpha, &a_local );
	}
	else // if ( !bli_obj_root_is_triangular( b ) )
	{
		if ( !bli_obj_equals( alpha, &BLIS_ONE ) )
			bli_obj_scalar_apply_scalar( alpha, &b_local );
	}

	// If beta is non-unit, typecast and apply it to the scalar attached
	// to C.
	if ( !bli_obj_equals( beta, &BLIS_ONE ) )
		bli_obj_scalar_apply_scalar( beta, &c_local );

	// Create the next node in the thrinfo_t structure.
	bli_thrinfo_grow( rntm, cntl, thread );

	// Extract the function pointer from the current control tree node.
	l3_var_oft f = bli_cntl_var_func( cntl );

	// Invoke the variant. It receives the local aliases, which now carry
	// the adjusted scalars, pack functions and transposition state.
	f
	(
	  &a_local,
	  &b_local,
	  &c_local,
	  cntx,
	  rntm,
	  cntl,
	  thread
	);
}

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_int.h000066400000000000000000000035421427272030600212650ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_l3_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); cython-blis-0.9.1/blis/_src/frame/3/bli_l3_oapi.c000066400000000000000000000064131427272030600214160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2021, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define object-based interfaces (basic). // #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t objects. */ \ PASTEMAC(opname,_ex)( alpha, a, b, beta, c, NULL, NULL ); \ } GENFRONT( gemm ) GENFRONT( gemmt ) GENFRONT( her2k ) GENFRONT( syr2k ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t objects. */ \ PASTEMAC(opname,_ex)( side, alpha, a, b, beta, c, NULL, NULL ); \ } GENFRONT( hemm ) GENFRONT( symm ) GENFRONT( trmm3 ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t objects. */ \ PASTEMAC(opname,_ex)( alpha, a, beta, c, NULL, NULL ); \ } GENFRONT( herk ) GENFRONT( syrk ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t objects. 
*/ \ PASTEMAC(opname,_ex)( side, alpha, a, b, NULL, NULL ); \ } GENFRONT( trmm ) GENFRONT( trsm ) cython-blis-0.9.1/blis/_src/frame/3/bli_l3_oapi.h000066400000000000000000000053611427272030600214240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces (basic). 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) cython-blis-0.9.1/blis/_src/frame/3/bli_l3_oapi_ex.c000066400000000000000000000411611427272030600221110ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2021, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define object-based interfaces (expert). // // If a sandbox was enabled, we forgo defining bli_gemm_ex() since it will be // defined in the sandbox environment. #ifndef BLIS_ENABLE_SANDBOX void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { bli_init_once(); // If the rntm is non-NULL, it may indicate that we should forgo sup // handling altogether. bool enable_sup = TRUE; if ( rntm != NULL ) enable_sup = bli_rntm_l3_sup( rntm ); if ( enable_sup ) { // Execute the small/unpacked oapi handler. If it finds that the problem // does not fall within the thresholds that define "small", or for some // other reason decides not to use the small/unpacked implementation, // the function returns with BLIS_FAILURE, which causes execution to // proceed towards the conventional implementation. err_t result = bli_gemmsup( alpha, a, b, beta, c, cntx, rntm ); if ( result == BLIS_SUCCESS ) { return; } } // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } // Default to using native execution. 
num_t dt = bli_obj_dt( c ); ind_t im = BLIS_NAT; // If each matrix operand has a complex storage datatype, try to get an // induced method (if one is available and enabled). NOTE: Allowing // precisions to vary while using 1m, which is what we do here, is unique // to gemm; other level-3 operations use 1m only if all storage datatypes // are equal (and they ignore the computation precision). if ( bli_obj_is_complex( c ) && bli_obj_is_complex( a ) && bli_obj_is_complex( b ) ) { // Find the highest priority induced method that is both enabled and // available for the current operation. (If an induced method is // available but not enabled, or simply unavailable, BLIS_NAT will // be returned here.) im = bli_gemmind_find_avail( dt ); } // If necessary, obtain a valid context from the gks using the induced // method id determined above. if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); // Check the operands. if ( bli_error_checking_is_enabled() ) bli_gemm_check( alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. bli_gemm_front( alpha, a, b, beta, c, cntx, rntm, NULL ); } #endif void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { bli_init_once(); // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } // Default to using native execution. num_t dt = bli_obj_dt( c ); ind_t im = BLIS_NAT; // If all matrix operands are complex and of the same storage datatype, try // to get an induced method (if one is available and enabled). 
if ( bli_obj_dt( a ) == bli_obj_dt( c ) && bli_obj_dt( b ) == bli_obj_dt( c ) && bli_obj_is_complex( c ) ) { // Find the highest priority induced method that is both enabled and // available for the current operation. (If an induced method is // available but not enabled, or simply unavailable, BLIS_NAT will // be returned here.) im = bli_gemmtind_find_avail( dt ); } // If necessary, obtain a valid context from the gks using the induced // method id determined above. if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); // Check the operands. if ( bli_error_checking_is_enabled() ) bli_gemmt_check( alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. bli_gemmt_front( alpha, a, b, beta, c, cntx, rntm, NULL ); } void PASTEMAC(her2k,BLIS_OAPI_EX_SUF) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { bli_init_once(); obj_t ah; obj_t bh; obj_t alphah; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_her2k_check( alpha, a, b, beta, c, cntx ); bli_obj_alias_to( alpha, &alphah ); bli_obj_toggle_conj( &alphah ); bli_obj_alias_to( a, &ah ); bli_obj_toggle_trans( &ah ); bli_obj_toggle_conj( &ah ); bli_obj_alias_to( b, &bh ); bli_obj_toggle_trans( &bh ); bli_obj_toggle_conj( &bh ); // Invoke gemmt twice, using beta only the first time. PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &bh, beta, c, cntx, rntm ); PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( &alphah, b, &ah, &BLIS_ONE, c, cntx, rntm ); // The Hermitian rank-2k product was computed as alpha*A*B'+alpha'*B*A', even for // the diagonal elements. Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-2k product should always be // zero. However, in practice, they sometimes accumulate meaningless // non-zero values. To prevent this, we explicitly set those values // to zero before returning. 
bli_setid( &BLIS_ZERO, c ); } void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { bli_init_once(); obj_t at; obj_t bt; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_syr2k_check( alpha, a, b, beta, c, cntx ); bli_obj_alias_to( b, &bt ); bli_obj_toggle_trans( &bt ); bli_obj_alias_to( a, &at ); bli_obj_toggle_trans( &at ); // Invoke gemmt twice, using beta only the first time. PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &bt, beta, c, cntx, rntm ); PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, b, &at, &BLIS_ONE, c, cntx, rntm ); } void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { bli_init_once(); // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } // Default to using native execution. num_t dt = bli_obj_dt( c ); ind_t im = BLIS_NAT; // If all matrix operands are complex and of the same storage datatype, try // to get an induced method (if one is available and enabled). if ( bli_obj_dt( a ) == bli_obj_dt( c ) && bli_obj_dt( b ) == bli_obj_dt( c ) && bli_obj_is_complex( c ) ) { // Find the highest priority induced method that is both enabled and // available for the current operation. (If an induced method is // available but not enabled, or simply unavailable, BLIS_NAT will // be returned here.) im = bli_hemmind_find_avail( dt ); } // If necessary, obtain a valid context from the gks using the induced // method id determined above. if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); // Check the operands. if ( bli_error_checking_is_enabled() ) bli_hemm_check( side, alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. 
bli_hemm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); } void PASTEMAC(symm,BLIS_OAPI_EX_SUF) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { bli_init_once(); // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } // Default to using native execution. num_t dt = bli_obj_dt( c ); ind_t im = BLIS_NAT; // If all matrix operands are complex and of the same storage datatype, try // to get an induced method (if one is available and enabled). if ( bli_obj_dt( a ) == bli_obj_dt( c ) && bli_obj_dt( b ) == bli_obj_dt( c ) && bli_obj_is_complex( c ) ) { // Find the highest priority induced method that is both enabled and // available for the current operation. (If an induced method is // available but not enabled, or simply unavailable, BLIS_NAT will // be returned here.) im = bli_symmind_find_avail( dt ); } // If necessary, obtain a valid context from the gks using the induced // method id determined above. if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); // Check the operands. if ( bli_error_checking_is_enabled() ) bli_symm_check( side, alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. bli_symm_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); } void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { bli_init_once(); // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } // Default to using native execution. 
num_t dt = bli_obj_dt( c ); ind_t im = BLIS_NAT; // If all matrix operands are complex and of the same storage datatype, try // to get an induced method (if one is available and enabled). if ( bli_obj_dt( a ) == bli_obj_dt( c ) && bli_obj_dt( b ) == bli_obj_dt( c ) && bli_obj_is_complex( c ) ) { // Find the highest priority induced method that is both enabled and // available for the current operation. (If an induced method is // available but not enabled, or simply unavailable, BLIS_NAT will // be returned here.) im = bli_trmm3ind_find_avail( dt ); } // If necessary, obtain a valid context from the gks using the induced // method id determined above. if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); // Check the operands. if ( bli_error_checking_is_enabled() ) bli_trmm3_check( side, alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. bli_trmm3_front( side, alpha, a, b, beta, c, cntx, rntm, NULL ); } void PASTEMAC(herk,BLIS_OAPI_EX_SUF) ( obj_t* alpha, obj_t* a, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { bli_init_once(); obj_t ah; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_herk_check( alpha, a, beta, c, cntx ); bli_obj_alias_to( a, &ah ); bli_obj_toggle_trans( &ah ); bli_obj_toggle_conj( &ah ); PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &ah, beta, c, cntx, rntm ); // The Hermitian rank-k product was computed as Re(alpha)*A*A', even for the // diagonal elements. Mathematically, the imaginary components of // diagonal elements of a Hermitian rank-k product should always be // zero. However, in practice, they sometimes accumulate meaningless // non-zero values. To prevent this, we explicitly set those values // to zero before returning. bli_setid( &BLIS_ZERO, c ); } void PASTEMAC(syrk,BLIS_OAPI_EX_SUF) ( obj_t* alpha, obj_t* a, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { bli_init_once(); obj_t at; // Check parameters. 
if ( bli_error_checking_is_enabled() ) bli_syrk_check( alpha, a, beta, c, cntx ); bli_obj_alias_to( a, &at ); bli_obj_toggle_trans( &at ); PASTEMAC(gemmt,BLIS_OAPI_EX_SUF)( alpha, a, &at, beta, c, cntx, rntm ); } void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm ) { bli_init_once(); // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } // Default to using native execution. num_t dt = bli_obj_dt( b ); ind_t im = BLIS_NAT; // If all matrix operands are complex and of the same storage datatype, try // to get an induced method (if one is available and enabled). if ( bli_obj_dt( a ) == bli_obj_dt( b ) && bli_obj_is_complex( b ) ) { // Find the highest priority induced method that is both enabled and // available for the current operation. (If an induced method is // available but not enabled, or simply unavailable, BLIS_NAT will // be returned here.) im = bli_trmmind_find_avail( dt ); } // If necessary, obtain a valid context from the gks using the induced // method id determined above. if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); // Check the operands. if ( bli_error_checking_is_enabled() ) bli_trmm_check( side, alpha, a, b, cntx ); // Invoke the operation's front-end and request the default control tree. bli_trmm_front( side, alpha, a, b, cntx, rntm, NULL ); } void PASTEMAC(trsm,BLIS_OAPI_EX_SUF) ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm ) { bli_init_once(); // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. 
rntm_t rntm_l; if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } // Default to using native execution. num_t dt = bli_obj_dt( b ); ind_t im = BLIS_NAT; // If all matrix operands are complex and of the same storage datatype, try // to get an induced method (if one is available and enabled). if ( bli_obj_dt( a ) == bli_obj_dt( b ) && bli_obj_is_complex( b ) ) { // Find the highest priority induced method that is both enabled and // available for the current operation. (If an induced method is // available but not enabled, or simply unavailable, BLIS_NAT will // be returned here.) im = bli_trsmind_find_avail( dt ); } // If necessary, obtain a valid context from the gks using the induced // method id determined above. if ( cntx == NULL ) cntx = bli_gks_query_ind_cntx( im, dt ); // Check the operands. if ( bli_error_checking_is_enabled() ) bli_trsm_check( side, alpha, a, b, cntx ); // Invoke the operation's front-end and request the default control tree. bli_trsm_front( side, alpha, a, b, cntx, rntm, NULL ); } cython-blis-0.9.1/blis/_src/frame/3/bli_l3_oapi_ex.h000066400000000000000000000057521427272030600221240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces (expert). 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) cython-blis-0.9.1/blis/_src/frame/3/bli_l3_oft.h000066400000000000000000000056051427272030600212650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#ifndef BLIS_L3_OFT_H
#define BLIS_L3_OFT_H


//
// -- Level-3 object function types --------------------------------------------
//

// Each GENTDEF( opname ) invocation below defines one function-pointer type
// whose name is built by PASTECH(opname,_oft). GENTDEF is redefined once for
// every distinct argument list.

// gemm, gemmt, her2k, syr2k

#undef  GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  obj_t*  alpha, \
  obj_t*  a, \
  obj_t*  b, \
  obj_t*  beta, \
  obj_t*  c, \
  cntx_t* cntx, \
  rntm_t* rntm  \
);

GENTDEF( gemm )
GENTDEF( gemmt )
GENTDEF( her2k )
GENTDEF( syr2k )


// hemm, symm, trmm3 (same operands, preceded by a side_t argument)

#undef  GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  side_t  side, \
  obj_t*  alpha, \
  obj_t*  a, \
  obj_t*  b, \
  obj_t*  beta, \
  obj_t*  c, \
  cntx_t* cntx, \
  rntm_t* rntm  \
);

GENTDEF( hemm )
GENTDEF( symm )
GENTDEF( trmm3 )


// herk, syrk (no b operand)

#undef  GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  obj_t*  alpha, \
  obj_t*  a, \
  obj_t*  beta, \
  obj_t*  c, \
  cntx_t* cntx, \
  rntm_t* rntm  \
);

GENTDEF( herk )
GENTDEF( syrk )


// trmm, trsm (a side_t argument; no beta or c operand)

#undef  GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  side_t  side, \
  obj_t*  alpha, \
  obj_t*  a, \
  obj_t*  b, \
  cntx_t* cntx, \
  rntm_t* rntm  \
);

GENTDEF( trmm )
GENTDEF( trsm )


#endif

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_oft_var.h000066400000000000000000000040131427272030600221250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_L3_OFT_VAR_H
#define BLIS_L3_OFT_VAR_H


//
// -- Level-3 variant function types -------------------------------------------
//

// GENTDEF( l3 ) defines a single function-pointer type whose name is built
// by PASTECH(l3,_var_oft). A function of this type receives the three
// operand objects (a, b, c) followed by a cntx_t*, a rntm_t*, a cntl_t*,
// and a thrinfo_t*, and returns nothing.

#undef  GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
( \
  obj_t*     a, \
  obj_t*     b, \
  obj_t*     c, \
  cntx_t*    cntx, \
  rntm_t*    rntm, \
  cntl_t*    cntl, \
  thrinfo_t* thread  \
);

GENTDEF( l3 )


#endif

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_packab.c000066400000000000000000000064011427272030600217040ustar00rootroot00000000000000/*

BLIS
An object-based framework for developing high-performance BLAS-like
libraries.

Copyright (C) 2014, The University of Texas at Austin

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_l3_packa ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { obj_t a_local, a_pack; bli_obj_alias_to( a, &a_local ); if ( bli_obj_has_trans( a ) ) { bli_obj_induce_trans( &a_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); } // Pack matrix A according to the control tree node. bli_packm_int ( &a_local, &a_pack, cntx, rntm, cntl, thread ); // Proceed with execution using packed matrix A. bli_l3_int ( &BLIS_ONE, &a_pack, b, &BLIS_ONE, c, cntx, rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); } // ----------------------------------------------------------------------------- void bli_l3_packb ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { obj_t bt_local, bt_pack; // We always pass B^T to bli_l3_packm. bli_obj_alias_to( b, &bt_local ); if ( bli_obj_has_trans( b ) ) { bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &bt_local ); } else { bli_obj_induce_trans( &bt_local ); } // Pack matrix B according to the control tree node. bli_packm_int ( &bt_local, &bt_pack, cntx, rntm, cntl, thread ); // Transpose packed object back to B. bli_obj_induce_trans( &bt_pack ); // Proceed with execution using packed matrix B. 
bli_l3_int ( &BLIS_ONE, a, &bt_pack, &BLIS_ONE, c, cntx, rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); } cython-blis-0.9.1/blis/_src/frame/3/bli_l3_packab.h000066400000000000000000000037511427272030600217160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

// Packs matrix A and then continues the level-3 operation with the packed
// copy of A. Both functions in this header share one argument list: the
// operand objects a, b, c, followed by the context, runtime, control tree
// node, and thread info.
void bli_l3_packa
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

// Packs matrix B and then continues the level-3 operation with the packed
// copy of B.
void bli_l3_packb
     (
       obj_t*     a,
       obj_t*     b,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_prune.c000066400000000000000000000140171427272030600216160ustar00rootroot00000000000000/*

BLIS
An object-based framework for developing high-performance BLAS-like
libraries.

Copyright (C) 2014, The University of Texas at Austin

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "blis.h"

// NOTE(review): the disabled function below appears to be a hand-written
// m-dimension instance of the GENFRONT( dim ) macro that follows it; it is
// kept only as a readable reference and is never compiled.
/*
void bli_l3_prune_unref_mparts_m
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntl_t* cntl
     )
{
	// Query the operation family.
	opid_t family = bli_cntl_family( cntl );

	if      ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm.
	else if ( family == BLIS_GEMMT ) bli_gemmt_prune_unref_mparts_m( a, b, c );
	else if ( family == BLIS_TRMM ) bli_trmm_prune_unref_mparts_m( a, b, c );
	else if ( family == BLIS_TRSM ) bli_trsm_prune_unref_mparts_m( a, b, c );
}
*/

// Dispatchers: for each dimension (m, n, k), define a function that reads
// the operation family from the control tree node and forwards a, b, c to
// the pruning function of that family for the same dimension. The gemm
// family returns immediately; a family not listed in the chain falls
// through without any call.
#undef  GENFRONT
#define GENFRONT( dim ) \
\
void PASTEMAC(l3_prune_unref_mparts_,dim) \
     ( \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  c, \
       cntl_t* cntl \
     ) \
{ \
	/* Query the operation family. */ \
	opid_t family = bli_cntl_family( cntl ); \
\
	if      ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. */ \
	else if ( family == BLIS_GEMMT ) PASTEMAC(gemmt_prune_unref_mparts_,dim)( a, b, c ); \
	else if ( family == BLIS_TRMM ) PASTEMAC(trmm_prune_unref_mparts_,dim)( a, b, c ); \
	else if ( family == BLIS_TRSM ) PASTEMAC(trsm_prune_unref_mparts_,dim)( a, b, c ); \
}

GENFRONT( m )
GENFRONT( n )
GENFRONT( k )

// -----------------------------------------------------------------------------

// gemm: all three per-dimension pruning functions are intentionally empty.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_prune_unref_mparts_m) \
     ( \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  c \
     ) \
{ \
	/* No pruning is necessary for gemm. */ \
} \
void PASTEMAC(opname,_prune_unref_mparts_n) \
     ( \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  c \
     ) \
{ \
	/* No pruning is necessary for gemm. */ \
} \
void PASTEMAC(opname,_prune_unref_mparts_k) \
     ( \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  c \
     ) \
{ \
	/* No pruning is necessary for gemm. */ \
}

GENFRONT( gemm )

// -----------------------------------------------------------------------------

// gemmt: the second operand is named ah in these functions. Pruning is
// driven by C in the m and n dimensions; the k-dimension function is empty.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_prune_unref_mparts_m) \
     ( \
       obj_t*  a, \
       obj_t*  ah, \
       obj_t*  c \
     ) \
{ \
	/* Prune any unreferenced part from the subpartition of C (that would
	   be encountered from partitioning in the m dimension) and adjust the
	   subpartition of A accordingly. */ \
	bli_prune_unref_mparts( c, BLIS_M, a, BLIS_M ); \
} \
void PASTEMAC(opname,_prune_unref_mparts_n) \
     ( \
       obj_t*  a, \
       obj_t*  ah, \
       obj_t*  c \
     ) \
{ \
	/* Prune any unreferenced part from the subpartition of C (that would
	   be encountered from partitioning in the n dimension) and adjust the
	   subpartition of Ah accordingly. */ \
	bli_prune_unref_mparts( c, BLIS_N, ah, BLIS_N ); \
} \
void PASTEMAC(opname,_prune_unref_mparts_k) \
     ( \
       obj_t*  a, \
       obj_t*  ah, \
       obj_t*  c \
     ) \
{ \
	/* As long as A and Ah are general in structure, no pruning should be
	   needed for the k dimension, so this function does nothing. */ \
}

GENFRONT( gemmt )

// -----------------------------------------------------------------------------

// trmm and trsm: pruning is driven by A in the m dimension and by B in the
// n dimension; in the k dimension A and B are pruned against each other,
// once in each direction.
#undef  GENFRONT
#define GENFRONT( opname ) \
\
void PASTEMAC(opname,_prune_unref_mparts_m) \
     ( \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  c \
     ) \
{ \
	/* Prune any unreferenced part from the subpartition of A (that would
	   be encountered from partitioning in the m dimension) and adjust the
	   subpartition of C accordingly. */ \
	bli_prune_unref_mparts( a, BLIS_M, c, BLIS_M ); \
} \
void PASTEMAC(opname,_prune_unref_mparts_n) \
     ( \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  c \
     ) \
{ \
	/* Prune any unreferenced part from the subpartition of B (that would
	   be encountered from partitioning in the n dimension) and adjust the
	   subpartition of C accordingly. */ \
	bli_prune_unref_mparts( b, BLIS_N, c, BLIS_N ); \
} \
void PASTEMAC(opname,_prune_unref_mparts_k) \
     ( \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  c \
     ) \
{ \
	/* Prune any unreferenced part from the subpartition of A (that would
	   be encountered from partitioning in the k dimension) and adjust the
	   subpartition of B accordingly. */ \
	bli_prune_unref_mparts( a, BLIS_N, b, BLIS_M ); \
\
	/* Prune any unreferenced part from the subpartition of B (that would
	   be encountered from partitioning in the k dimension) and adjust the
	   subpartition of A accordingly. */ \
	bli_prune_unref_mparts( b, BLIS_M, a, BLIS_N ); \
}

GENFRONT( trmm )
GENFRONT( trsm )

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_prune.h000066400000000000000000000045361427272030600216280ustar00rootroot00000000000000/*

BLIS
An object-based framework for developing high-performance BLAS-like
libraries.

Copyright (C) 2014, The University of Texas at Austin

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// Prototypes of the per-dimension dispatchers, one for each of m, n, and k.
// Their names are built by PASTEMAC(l3_prune_unref_mparts_,dim) and they
// take the operand objects plus a control tree node.
#undef  GENPROT
#define GENPROT( dim ) \
\
void PASTEMAC(l3_prune_unref_mparts_,dim) \
     ( \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  c, \
       cntl_t* cntl  \
     );

GENPROT( m )
GENPROT( n )
GENPROT( k )

// -----------------------------------------------------------------------------

// Prototypes of the operation-specific pruning functions, one for every
// (operation, dimension) pair listed below. Their names are built by
// PASTEMAC2(opname,_prune_unref_mparts_,dim) and they take only the three
// operand objects.
#undef  GENPROT
#define GENPROT( opname, dim ) \
\
void PASTEMAC2(opname,_prune_unref_mparts_,dim) \
     ( \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  c  \
     );

GENPROT( gemm, m )
GENPROT( gemm, n )
GENPROT( gemm, k )

GENPROT( gemmt, m )
GENPROT( gemmt, n )
GENPROT( gemmt, k )

GENPROT( trmm, m )
GENPROT( trmm, n )
GENPROT( trmm, k )

GENPROT( trsm, m )
GENPROT( trsm, n )
GENPROT( trsm, k )

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_schema.c000066400000000000000000000062071427272030600217270ustar00rootroot00000000000000/*

BLIS
An object-based framework for developing high-performance BLAS-like
libraries.

Copyright (C) 2021, The University of Texas at Austin

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_l3_set_schemas ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx ) { // Begin with pack schemas for native execution. pack_t schema_a = BLIS_PACKED_ROW_PANELS; pack_t schema_b = BLIS_PACKED_COL_PANELS; // When executing the 1m method, choose the appropriate pack schemas based // on the microkernel preference encoded within the current cntx_t (which // was presumably returned by the gks). if ( bli_cntx_method( cntx ) == BLIS_1M ) { num_t dt = bli_obj_domain( c ) | bli_obj_comp_prec( c ); // Note that bli_cntx_l3_vir_ukr_prefers_cols_dt() will use the real // projection of dt to query the preference of the corresponding native // real-domain microkernel. This is what ultimately determines which // variant of 1m is applicable. if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) { schema_a = BLIS_PACKED_ROW_PANELS_1E; schema_b = BLIS_PACKED_COL_PANELS_1R; } else { schema_a = BLIS_PACKED_ROW_PANELS_1R; schema_b = BLIS_PACKED_COL_PANELS_1E; } } // Embed the schemas into the objects for A and B. 
This is a sort of hack // for communicating the desired pack schemas to bli_gemm_cntl_create() // (via bli_l3_thread_decorator() and bli_l3_cntl_create_if()). This allows // us to subsequently access the schemas from the control tree, which // hopefully reduces some confusion, particularly in bli_packm_init(). bli_obj_set_pack_schema( schema_a, a ); bli_obj_set_pack_schema( schema_b, b ); } cython-blis-0.9.1/blis/_src/frame/3/bli_l3_schema.h000066400000000000000000000033721427272030600217340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2021, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// Selects the pack schemas for A and B based on the execution method
// recorded in cntx and stores them in the objects a and b.
void bli_l3_set_schemas
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx
     );

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup.c000066400000000000000000000157361427272030600213050ustar00rootroot00000000000000/*

BLIS
An object-based framework for developing high-performance BLAS-like
libraries.

Copyright (C) 2019, Advanced Micro Devices, Inc.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" /* bli_gemmsup(): Object-API entry point for the small/unpacked (sup) gemm path. Returns BLIS_FAILURE without invoking a handler if sup handling is disabled at configure-time (BLIS_DISABLE_SUP_HANDLING), if a, b, and c do not share a datatype or the computation precision of c differs from its storage precision, or if bli_cntx_l3_sup_thresh_is_met() reports false for the problem dimensions (with m and n swapped when the gemm ukernel dislikes the storage of c). Otherwise, queries the gemm sup handler from the context and returns its result. A NULL cntx is replaced with the result of bli_gks_query_cntx(). The handler always receives the address of a local rntm_t that is either initialized from the global settings (rntm == NULL) or a copy of *rntm. */ err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { // Return early if small matrix handling is disabled at configure-time. #ifdef BLIS_DISABLE_SUP_HANDLING return BLIS_FAILURE; #endif // Return early if this is a mixed-datatype computation. if ( bli_obj_dt( c ) != bli_obj_dt( a ) || bli_obj_dt( c ) != bli_obj_dt( b ) || bli_obj_comp_prec( c ) != bli_obj_prec( c ) ) return BLIS_FAILURE; // Obtain a valid (native) context from the gks if necessary. // NOTE: This must be done before the storage-preference and threshold // queries below, since they are passed the context pointer. (No _check() // function is called in this function.) if ( cntx == NULL ) cntx = bli_gks_query_cntx(); // Return early if a microkernel preference-induced transposition would // have been performed and shifted the dimensions outside of the space // of sup-handled problems. if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( c, BLIS_GEMM_UKR, cntx ) ) { const num_t dt = bli_obj_dt( c ); const dim_t m = bli_obj_length( c ); const dim_t n = bli_obj_width( c ); const dim_t k = bli_obj_width_after_trans( a ); // Pass in m and n reversed, which simulates a transposition of the // entire operation pursuant to the microkernel storage preference. if ( !bli_cntx_l3_sup_thresh_is_met( dt, n, m, k, cntx ) ) return BLIS_FAILURE; } else // ukr_prefers_storage_of( c, ...
) { const num_t dt = bli_obj_dt( c ); const dim_t m = bli_obj_length( c ); const dim_t n = bli_obj_width( c ); const dim_t k = bli_obj_width_after_trans( a ); if ( !bli_cntx_l3_sup_thresh_is_met( dt, m, n, k, cntx ) ) return BLIS_FAILURE; } // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } #if 0 const num_t dt = bli_obj_dt( c ); const dim_t m = bli_obj_length( c ); const dim_t n = bli_obj_width( c ); const dim_t k = bli_obj_width_after_trans( a ); const dim_t tm = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ); const dim_t tn = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ); const dim_t tk = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ); printf( "dims: %d %d %d (threshs: %d %d %d)\n", (int)m, (int)n, (int)k, (int)tm, (int)tn, (int)tk ); #endif // We've now ruled out the following two possibilities: // - the ukernel prefers the operation as-is, and the sup thresholds are // unsatisfied. // - the ukernel prefers a transposed operation, and the sup thresholds are // unsatisfied after taking into account the transposition. // This implies that the sup thresholds (at least one of them) are met, // and the small/unpacked handler should be called. // NOTE: The sup handler is free to enforce a stricter threshold regime // if it so chooses, in which case it can/should return BLIS_FAILURE. // Query the small/unpacked handler from the context and invoke it. gemmsup_oft gemmsup_fp = bli_cntx_get_l3_sup_handler( BLIS_GEMM, cntx ); return gemmsup_fp ( alpha, a, b, beta, c, cntx, rntm ); } /* bli_gemmtsup(): Object-API entry point for the small/unpacked (sup) gemmt path. Performs the same configure-time and mixed-datatype early returns as bli_gemmsup(), then makes a single threshold test in which m is passed as both the m and n dimensions (no storage-preference test is made). If the test passes, queries the gemmt sup handler from the context and returns its result, passing it the address of a local rntm_t. */ err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { // Return early if small matrix handling is disabled at configure-time.
#ifdef BLIS_DISABLE_SUP_HANDLING return BLIS_FAILURE; #endif // Return early if this is a mixed-datatype computation. if ( bli_obj_dt( c ) != bli_obj_dt( a ) || bli_obj_dt( c ) != bli_obj_dt( b ) || bli_obj_comp_prec( c ) != bli_obj_prec( c ) ) return BLIS_FAILURE; // Obtain a valid (native) context from the gks if necessary. // NOTE: This must be done before the threshold query below, since it is // passed the context pointer. (No _check() function is called in this // function.) if ( cntx == NULL ) cntx = bli_gks_query_cntx(); // Return early if the problem dimensions exceed their sup thresholds. // Notice that we do not bother to check whether the microkernel // prefers or dislikes the storage of C, since the same check is called // for either way (m is passed as both the m and n dimensions). { const num_t dt = bli_obj_dt( c ); const dim_t m = bli_obj_length( c ); const dim_t k = bli_obj_width_after_trans( a ); if ( !bli_cntx_l3_sup_thresh_is_met( dt, m, m, k, cntx ) ) return BLIS_FAILURE; } // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } else { rntm_l = *rntm; rntm = &rntm_l; } // We've now ruled out the possibility that the sup thresholds are // unsatisfied. // This implies that the sup thresholds (at least one of them) are met, // and the small/unpacked handler should be called. // NOTE: The sup handler is free to enforce a stricter threshold regime // if it so chooses, in which case it can/should return BLIS_FAILURE. // Query the small/unpacked handler from the context and invoke it. gemmtsup_oft gemmtsup_fp = bli_cntx_get_l3_sup_handler( BLIS_GEMMT, cntx ); return gemmtsup_fp ( alpha, a, b, beta, c, cntx, rntm ); } cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup.h000066400000000000000000000037441427272030600213060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup_ft_ker.h000066400000000000000000000046461427272030600226420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup: typedef of a pointer to a void-returning sup gemm kernel; the type // name is formed by PASTECH3(ch,opname,_ker,tsuf). #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup_int.c000066400000000000000000000341301427272030600221440ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" /* bli_gemmsup_int(): Chooses and runs a reference sup gemm variant. The operation is run with BLIS_NO_TRANSPOSE when the stor3_t id of (c, a, b) agrees with the row/column preference reported by bli_cntx_l3_sup_ker_prefers_rows_dt() (the "primary" case) and with BLIS_TRANSPOSE otherwise. Within each case, bli_gemmsup_ref_var2m is called when mu >= nu and bli_gemmsup_ref_var1n otherwise, where mu and nu are the m and n dimensions (swapped in the non-primary case) divided by MR and NR. If bli_rntm_auto_factor( rntm ) is true, the jc and ic ways of parallelism in rntm are overwritten and bli_l3_sup_thrinfo_update_root() is called before the variant runs. Always returns BLIS_SUCCESS. */ err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ) { #if 0 //bli_gemmsup_ref_var2 //bli_gemmsup_ref_var1 #if 0 bli_gemmsup_ref_var1n #else #endif const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b ); const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR || stor_id == BLIS_RRC || stor_id == BLIS_RCR || stor_id == BLIS_CRR ); if ( is_rrr_rrc_rcr_crr ) { bli_gemmsup_ref_var2m ( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm ); } else { bli_gemmsup_ref_var2m ( BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm ); } return BLIS_SUCCESS; #endif const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b ); const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR || stor_id == BLIS_RRC || stor_id == BLIS_RCR || stor_id == BLIS_CRR ); const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr; const num_t dt = bli_obj_dt( c ); const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool is_primary = ( row_pref ?
is_rrr_rrc_rcr_crr : is_rcc_crc_ccr_ccc ); const dim_t m = bli_obj_length( c ); const dim_t n = bli_obj_width( c ); const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); const bool auto_factor = bli_rntm_auto_factor( rntm ); const dim_t n_threads = bli_rntm_num_threads( rntm ); bool use_bp = TRUE; dim_t jc_new; dim_t ic_new; if ( is_primary ) { // This branch handles: // - rrr rrc rcr crr for row-preferential kernels // - rcc crc ccr ccc for column-preferential kernels const dim_t mu = m / MR; const dim_t nu = n / NR; // Decide which algorithm to use (block-panel var2m or panel-block // var1n) based on the number of micropanels in the m and n dimensions. // Also, recalculate the automatic thread factorization. if ( mu >= nu ) use_bp = TRUE; else /* if ( mu < nu ) */ use_bp = FALSE; // If the parallel thread factorization was automatic, we update it // with a new factorization based on the matrix dimensions in units // of micropanels. if ( auto_factor ) { if ( use_bp ) { // In the block-panel algorithm, the m dimension is parallelized // with ic_nt and the n dimension is parallelized with jc_nt. bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new ); } else // if ( !use_bp ) { // In the panel-block algorithm, the m dimension is parallelized // with jc_nt and the n dimension is parallelized with ic_nt. bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new ); } // Update the ways of parallelism for the jc and ic loops, and then // update the current thread's root thrinfo_t node according to the // new ways of parallelism value for the jc loop.
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); bli_l3_sup_thrinfo_update_root( rntm, thread ); } if ( use_bp ) { #ifdef TRACEVAR if ( bli_thread_am_ochief( thread ) ) printf( "bli_l3_sup_int(): var2m primary\n" ); #endif // block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2() bli_gemmsup_ref_var2m( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm, thread ); } else // use_pb { #ifdef TRACEVAR if ( bli_thread_am_ochief( thread ) ) printf( "bli_l3_sup_int(): var1n primary\n" ); #endif // panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1() bli_gemmsup_ref_var1n( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm, thread ); // *requires nudging of nc up to be a multiple of mr. } } else { // This branch handles: // - rrr rrc rcr crr for column-preferential kernels // - rcc crc ccr ccc for row-preferential kernels const dim_t mu = n / MR; // the n becomes m after a transposition const dim_t nu = m / NR; // the m becomes n after a transposition // Decide which algorithm to use (block-panel var2m or panel-block // var1n) based on the number of micropanels in the m and n dimensions. // Also, recalculate the automatic thread factorization. if ( mu >= nu ) use_bp = TRUE; else /* if ( mu < nu ) */ use_bp = FALSE; // If the parallel thread factorization was automatic, we update it // with a new factorization based on the matrix dimensions in units // of micropanels. if ( auto_factor ) { if ( use_bp ) { // In the block-panel algorithm, the m dimension is parallelized // with ic_nt and the n dimension is parallelized with jc_nt. bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new ); } else // if ( !use_bp ) { // In the panel-block algorithm, the m dimension is parallelized // with jc_nt and the n dimension is parallelized with ic_nt.
bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new ); } // Update the ways of parallelism for the jc and ic loops, and then // update the current thread's root thrinfo_t node according to the // new ways of parallelism value for the jc loop. bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); bli_l3_sup_thrinfo_update_root( rntm, thread ); } if ( use_bp ) { #ifdef TRACEVAR if ( bli_thread_am_ochief( thread ) ) printf( "bli_l3_sup_int(): var2m non-primary\n" ); #endif // panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans bli_gemmsup_ref_var2m( BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm, thread ); } else // use_pb { #ifdef TRACEVAR if ( bli_thread_am_ochief( thread ) ) printf( "bli_l3_sup_int(): var1n non-primary\n" ); #endif // block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans bli_gemmsup_ref_var1n( BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm, thread ); // *requires nudging of mc up to be a multiple of nr. } } // Return success so that the caller knows that we computed the solution. return BLIS_SUCCESS; } // ----------------------------------------------------------------------------- /* bli_gemmtsup_int(): Follows the same control flow as bli_gemmsup_int() with n set equal to m. NOTE: every call to a bli_gemmtsup_ref_var* variant in this function is enclosed in "#if 0", so as written the function may update the thread factorization in rntm and the root thrinfo_t node (when bli_rntm_auto_factor( rntm ) is true) but performs no computation on c, and it still returns BLIS_SUCCESS. */ err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ) { const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b ); const bool is_rrr_rrc_rcr_crr = ( stor_id == BLIS_RRR || stor_id == BLIS_RRC || stor_id == BLIS_RCR || stor_id == BLIS_CRR ); const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr; const num_t dt = bli_obj_dt( c ); const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool is_primary = ( row_pref ?
is_rrr_rrc_rcr_crr : is_rcc_crc_ccr_ccc ); const dim_t m = bli_obj_length( c ); const dim_t n = m; const dim_t MR = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); const dim_t NR = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); const bool auto_factor = bli_rntm_auto_factor( rntm ); const dim_t n_threads = bli_rntm_num_threads( rntm ); bool use_bp = TRUE; dim_t jc_new; dim_t ic_new; if ( is_primary ) { // This branch handles: // - rrr rrc rcr crr for row-preferential kernels // - rcc crc ccr ccc for column-preferential kernels const dim_t mu = m / MR; const dim_t nu = n / NR; // Decide which algorithm to use (block-panel var2m or panel-block // var1n) based on the number of micropanels in the m and n dimensions. // Also, recalculate the automatic thread factorization. if ( mu >= nu ) use_bp = TRUE; else /* if ( mu < nu ) */ use_bp = FALSE; // If the parallel thread factorization was automatic, we update it // with a new factorization based on the matrix dimensions in units // of micropanels. if ( auto_factor ) { if ( use_bp ) { // In the block-panel algorithm, the m dimension is parallelized // with ic_nt and the n dimension is parallelized with jc_nt. bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new ); } else // if ( !use_bp ) { // In the panel-block algorithm, the m dimension is parallelized // with jc_nt and the n dimension is parallelized with ic_nt. bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new ); } // Update the ways of parallelism for the jc and ic loops, and then // update the current thread's root thrinfo_t node according to the // new ways of parallelism value for the jc loop.
bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); bli_l3_sup_thrinfo_update_root( rntm, thread ); } if ( use_bp ) { #ifdef TRACEVAR if ( bli_thread_am_ochief( thread ) ) printf( "bli_l3_sup_int(): var2m primary\n" ); #endif // block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2() #if 0 bli_gemmtsup_ref_var2m( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm, thread ); #endif } else // use_pb { #ifdef TRACEVAR if ( bli_thread_am_ochief( thread ) ) printf( "bli_l3_sup_int(): var1n primary\n" ); #endif // panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1() #if 0 bli_gemmtsup_ref_var1n( BLIS_NO_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm, thread ); #endif // *requires nudging of nc up to be a multiple of mr. } } else { // This branch handles: // - rrr rrc rcr crr for column-preferential kernels // - rcc crc ccr ccc for row-preferential kernels const dim_t mu = n / MR; // the n becomes m after a transposition const dim_t nu = m / NR; // the m becomes n after a transposition // Decide which algorithm to use (block-panel var2m or panel-block // var1n) based on the number of micropanels in the m and n dimensions. // Also, recalculate the automatic thread factorization. if ( mu >= nu ) use_bp = TRUE; else /* if ( mu < nu ) */ use_bp = FALSE; // If the parallel thread factorization was automatic, we update it // with a new factorization based on the matrix dimensions in units // of micropanels. if ( auto_factor ) { if ( use_bp ) { // In the block-panel algorithm, the m dimension is parallelized // with ic_nt and the n dimension is parallelized with jc_nt. bli_thread_partition_2x2( n_threads, mu, nu, &ic_new, &jc_new ); } else // if ( !use_bp ) { // In the panel-block algorithm, the m dimension is parallelized // with jc_nt and the n dimension is parallelized with ic_nt.
bli_thread_partition_2x2( n_threads, mu, nu, &jc_new, &ic_new ); } // Update the ways of parallelism for the jc and ic loops, and then // update the current thread's root thrinfo_t node according to the // new ways of parallelism value for the jc loop. bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm ); bli_l3_sup_thrinfo_update_root( rntm, thread ); } if ( use_bp ) { #ifdef TRACEVAR if ( bli_thread_am_ochief( thread ) ) printf( "bli_l3_sup_int(): var2m non-primary\n" ); #endif // panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans #if 0 bli_gemmtsup_ref_var2m( BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm, thread ); #endif } else // use_pb { #ifdef TRACEVAR if ( bli_thread_am_ochief( thread ) ) printf( "bli_l3_sup_int(): var1n non-primary\n" ); #endif // block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans #if 0 bli_gemmtsup_ref_var1n( BLIS_TRANSPOSE, alpha, a, b, beta, c, stor_id, cntx, rntm, thread ); #endif // *requires nudging of mc up to be a multiple of nr. } } // NOTE(review): BLIS_SUCCESS is returned even though all of the variant // calls above are compiled out (#if 0), so no solution is computed here. return BLIS_SUCCESS; } cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup_int.h000066400000000000000000000040371427272030600221540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019 - 2000, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup_ker.h000066400000000000000000000045031427272030600221410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Define template prototypes for level-3 kernels on small/unpacked matrices. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the sup kernel prototype template (GEMMSUP_KER_PROT) as // defined in bli_l3_sup_ker_prot.h. #undef GENTPROT #define GENTPROT GEMMSUP_KER_PROT INSERT_GENTPROT_BASIC0( gemmsup_rv_ukr_name ) INSERT_GENTPROT_BASIC0( gemmsup_rg_ukr_name ) INSERT_GENTPROT_BASIC0( gemmsup_cv_ukr_name ) INSERT_GENTPROT_BASIC0( gemmsup_cg_ukr_name ) INSERT_GENTPROT_BASIC0( gemmsup_rd_ukr_name ) INSERT_GENTPROT_BASIC0( gemmsup_cd_ukr_name ) INSERT_GENTPROT_BASIC0( gemmsup_gx_ukr_name ) cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup_ker_prot.h000066400000000000000000000044251427272030600232100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Define template prototypes for level-3 kernels on small/unpacked matrices. 
// /* GEMMSUP_KER_PROT( ctype, ch, opname ): expands to the prototype of a void-returning sup gemm kernel named PASTEMAC(ch,opname). The kernel takes conja/conjb, the m, n, and k dimensions, ctype pointers alpha, a, b, beta, and c (each matrix followed by its row and column strides), an auxinfo_t pointer, and a cntx_t pointer. */ #define GEMMSUP_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup_oft.h000066400000000000000000000040411427272030600221450ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019-20, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemmsup, gemmtsup: typedefs of pointers to err_t-returning object-API sup // handlers; the type name is formed by PASTECH(opname,_oft). #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup_packm_a.c000066400000000000000000000305161427272030600227510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" /* packm_sup_init_mem_a: If will_pack is TRUE, ensures that mem refers to a block of at least sizeof( ctype ) * m_pack * k bytes, where m_pack is m rounded up to a multiple of mr. Only the chief thread acquires (or releases and re-acquires) the block via bli_pba_acquire_m()/bli_pba_release(); the other threads copy the chief thread's mem_t after a broadcast. If will_pack is FALSE, nothing is done. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ /* Inspect whether we are going to be packing matrix A. */ \ if ( will_pack == FALSE ) \ { \ /* If we aren't going to pack matrix A, there is no buffer to set up. */ \ } \ else /* if ( will_pack == TRUE ) */ \ { \ /* NOTE: This "rounding up" of the last upanel is actually optional for the rrc/crc cases, but absolutely necessary for the other cases since we NEED that last micropanel to have the same ldim (cs_p) as the other micropanels. Why? So that millikernels can use the same upanel ldim for all iterations of the ir loop. */ \ const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \ const dim_t k_pack = k; \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ bli_thread_barrier( thread ); \ \ /* Compute the size of the memory block needed. */ \ siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ \ /* Check the mem_t entry provided by the caller. If it is unallocated, then we need to acquire a block from the memory broker.
*/ \ if ( bli_mem_is_unalloc( mem ) ) \ { \ if ( bli_thread_am_ochief( thread ) ) \ { \ /* Acquire directly to the chief thread's mem_t that was passed in. It needs to be that mem_t struct, and not a local (temporary) mem_t, since there is no barrier until after packing is finished, which could allow a race condition whereby the chief thread exits the current function before the other threads have a chance to copy from it. (A barrier would fix that race condition, but then again, I prefer to keep barriers to a minimum.) */ \ bli_pba_acquire_m \ ( \ rntm, \ size_needed, \ pack_buf_type, \ mem \ ); \ } \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) */ \ if ( !bli_thread_am_ochief( thread ) ) \ { \ *mem = *mem_p; \ } \ } \ else /* if ( bli_mem_is_alloc( mem ) ) */ \ { \ /* If the mem_t entry provided by the caller does NOT contain a NULL buffer, then a block has already been acquired from the memory broker and cached by the caller. */ \ \ /* As a sanity check, we should make sure that the mem_t object isn't associated with a block that is too small compared to the size of the packed matrix buffer that is needed, according to the value computed above. */ \ siz_t mem_size = bli_mem_size( mem ); \ \ if ( mem_size < size_needed ) \ { \ if ( bli_thread_am_ochief( thread ) ) \ { \ /* The chief thread releases the existing block associated with the mem_t, and then re-acquires a new block, saving the associated mem_t to its passed-in mem_t. (See comment above for why the acquisition needs to be directly to the chief thread's passed-in mem_t and not a local (temporary) mem_t.)
*/ \ bli_pba_release \ ( \ rntm, \ mem \ ); \ bli_pba_acquire_m \ ( \ rntm, \ size_needed, \ pack_buf_type, \ mem \ ); \ } \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) */ \ if ( !bli_thread_am_ochief( thread ) ) \ { \ *mem = *mem_p; \ } \ } \ else \ { \ /* If the mem_t entry is already allocated and sufficiently large, then we use it as-is. No action is needed. */ \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_a ) /* packm_sup_finalize_mem_a: If did_pack is TRUE, thread is non-NULL, and the calling thread is the chief thread, releases the block referenced by mem (when it is allocated) via bli_pba_release(). Otherwise does nothing. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ /* Inspect whether we previously packed matrix A. */ \ if ( did_pack == FALSE ) \ { \ /* If we didn't pack matrix A, there's nothing to be done. */ \ } \ else /* if ( did_pack == TRUE ) */ \ { \ if ( thread != NULL ) \ if ( bli_thread_am_ochief( thread ) ) \ { \ /* Check the mem_t entry provided by the caller. Only proceed if it is allocated, which it should be. */ \ if ( bli_mem_is_alloc( mem ) ) \ { \ bli_pba_release \ ( \ rntm, \ mem \ ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_a ) /* packm_sup_init_a: Sets the output parameters that describe the matrix to be used as A. Without packing, they describe x itself (schema BLIS_NOT_PACKED, *p = x, strides of x). With packing, *p is set to the buffer of mem and the strides describe either plain row storage (stor_id BLIS_RRC or BLIS_CRC) or column-stored row panels (all other stor3_t ids). No matrix data is copied by this function. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ /* Inspect whether we are going to be packing matrix A.
*/ \ if ( will_pack == FALSE ) \ { \ *m_max = m; \ *k_max = k; \ \ /* Set the parameters for use with no packing of A (ie: using the source matrix A directly). */ \ { \ /* Use the strides of the source matrix as the final values. */ \ *rs_p = rs_x; \ *cs_p = cs_x; \ \ *pd_p = mr; \ *ps_p = mr * rs_x; \ \ /* Set the schema to "not packed" to indicate that packing will be skipped. */ \ *schema = BLIS_NOT_PACKED; \ } \ \ /* Since we won't be packing, simply update the buffer address provided by the caller to point to source matrix. */ \ *p = x; \ } \ else /* if ( will_pack == TRUE ) */ \ { \ /* NOTE: This "rounding up" of the last upanel is actually optional for the rrc/crc cases, but absolutely necessary for the other cases since we NEED that last micropanel to have the same ldim (cs_p) as the other micropanels. Why? So that millikernels can use the same upanel ldim for all iterations of the ir loop. */ \ *m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \ *k_max = k; \ \ /* Determine the dimensions and strides for the packed matrix A. */ \ if ( stor_id == BLIS_RRC || \ stor_id == BLIS_CRC ) \ { \ /* stor3_t id values _RRC and _CRC: pack A to plain row storage. */ \ *rs_p = k; \ *cs_p = 1; \ \ *pd_p = mr; \ *ps_p = mr * k; \ \ /* Set the schema to "row packed" to indicate packing to plain row storage. */ \ *schema = BLIS_PACKED_ROWS; \ } \ else \ { \ /* All other stor3_t ids: pack A to column-stored row-panels. */ \ *rs_p = 1; \ *cs_p = mr; \ \ *pd_p = mr; \ *ps_p = mr * k; \ \ /* Set the schema to "packed row panels" to indicate packing to conventional column-stored row panels. */ \ *schema = BLIS_PACKED_ROW_PANELS; \ } \ \ /* Set the buffer address provided by the caller to point to the memory associated with the mem_t entry acquired from the memory broker. */ \ *p = bli_mem_buffer( mem ); \ } \ } INSERT_GENTFUNC_BASIC0( packm_sup_init_a ) // // Define BLAS-like interfaces to the variant chooser.
// /* packm_sup_a: Driver that calls packm_sup_init_mem_a and packm_sup_init_a and then, if will_pack is TRUE, packs a via packm_sup_var2 (schema BLIS_PACKED_ROWS) or packm_sup_var1 (any other schema), followed by a thread barrier. If will_pack is FALSE, no data is copied. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ pack_t schema; \ dim_t m_max; \ dim_t k_max; \ dim_t pd_p; \ \ /* Prepare the packing destination buffer. If packing is not requested, this function will reduce to a no-op. */ \ PASTEMAC(ch,packm_sup_init_mem_a) \ ( \ will_pack, \ pack_buf_type, \ m_alloc, k_alloc, mr, \ cntx, \ rntm, \ mem, \ thread \ ); \ \ /* Determine the packing buffer and related parameters for matrix A. If A will not be packed, then *p will be set to point to a and *rs_p and *cs_p will be set to the strides of a. */ \ PASTEMAC(ch,packm_sup_init_a) \ ( \ will_pack, \ stor_id, \ &schema, \ m, k, mr, \ &m_max, &k_max, \ a, rs_a, cs_a, \ p, rs_p, cs_p, \ &pd_p, ps_p, \ cntx, \ mem, \ thread \ ); \ \ /* Inspect whether we are going to be packing matrix A. */ \ if ( will_pack == FALSE ) \ { \ /* If we aren't going to pack matrix A, then there's nothing to do. */ \ \ /* printf( "blis_ packm_sup_a: not packing A.\n" ); \ */ \ } \ else /* if ( will_pack == TRUE ) */ \ { \ if ( schema == BLIS_PACKED_ROWS ) \ { \ /* printf( "blis_ packm_sup_a: packing A to rows.\n" ); \ */ \ \ /* For plain packing by rows, use var2. */ \ PASTEMAC(ch,packm_sup_var2) \ ( \ transc, \ schema, \ m, \ k, \ kappa, \ a, rs_a, cs_a, \ *p, *rs_p, *cs_p, \ cntx, \ thread \ ); \ } \ else /* if ( schema == BLIS_PACKED_ROW_PANELS ) */ \ { \ /* printf( "blis_ packm_sup_a: packing A to row panels.\n" ); \ */ \ \ /* For packing to column-stored row panels, use var1.
*/ \ PASTEMAC(ch,packm_sup_var1) \ ( \ transc, \ schema, \ m, \ k, \ m_max, \ k_max, \ kappa, \ a, rs_a, cs_a, \ *p, *rs_p, *cs_p, \ pd_p, *ps_p, \ cntx, \ thread \ ); \ } \ \ /* Barrier so that packing is done before computation. */ \ bli_thread_barrier( thread ); \ } \ } INSERT_GENTFUNC_BASIC0( packm_sup_a ) cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup_packm_a.h000066400000000000000000000077051427272030600227620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict 
cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup_packm_b.c000066400000000000000000000305431427272030600227520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ /* Inspect whether we are going to be packing matrix B. */ \ if ( will_pack == FALSE ) \ { \ } \ else /* if ( will_pack == TRUE ) */ \ { \ /* NOTE: This "rounding up" of the last upanel is actually optional for the rrc/crc cases, but absolutely necessary for the other cases since we NEED that last micropanel to have the same ldim (cs_p) as the other micropanels. Why? So that millikernels can use the same upanel ldim for all iterations of the ir loop. */ \ const dim_t k_pack = k; \ const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \ \ /* Barrier to make sure all threads are caught up and ready to begin the packm stage. */ \ bli_thread_barrier( thread ); \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ \ /* Check the mem_t entry provided by the caller. If it is unallocated, then we need to acquire a block from the memory broker. */ \ if ( bli_mem_is_unalloc( mem ) ) \ { \ if ( bli_thread_am_ochief( thread ) ) \ { \ /* Acquire directly to the chief thread's mem_t that was passed in. It needs to be that mem_t struct, and not a local (temporary) mem_t, since there is no barrier until after packing is finished, which could allow a race condition whereby the chief thread exits the current function before the other threads have a chance to copy from it. (A barrier would fix that race condition, but then again, I prefer to keep barriers to a minimum.) */ \ bli_pba_acquire_m \ ( \ rntm, \ size_needed, \ pack_buf_type, \ mem \ ); \ } \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. 
*/ \ mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) */ \ if ( !bli_thread_am_ochief( thread ) ) \ { \ *mem = *mem_p; \ } \ } \ else /* if ( bli_mem_is_alloc( mem ) ) */ \ { \ /* If the mem_t entry provided by the caller does NOT contain a NULL buffer, then a block has already been acquired from the memory broker and cached by the caller. */ \ \ /* As a sanity check, we should make sure that the mem_t object isn't associated with a block that is too small compared to the size of the packed matrix buffer that is needed, according to the value computed above. */ \ siz_t mem_size = bli_mem_size( mem ); \ \ if ( mem_size < size_needed ) \ { \ if ( bli_thread_am_ochief( thread ) ) \ { \ /* The chief thread releases the existing block associated with the mem_t, and then re-acquires a new block, saving the associated mem_t to its passed-in mem_t. (See coment above for why the acquisition needs to be directly to the chief thread's passed-in mem_t and not a local (temporary) mem_t. */ \ bli_pba_release \ ( \ rntm, \ mem \ ); \ bli_pba_acquire_m \ ( \ rntm, \ size_needed, \ pack_buf_type, \ mem \ ); \ } \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) */ \ if ( !bli_thread_am_ochief( thread ) ) \ { \ *mem = *mem_p; \ } \ } \ else \ { \ /* If the mem_t entry is already allocated and sufficiently large, then we use it as-is. No action is needed. 
*/ \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_b ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ /* Inspect whether we previously packed matrix A. */ \ if ( did_pack == FALSE ) \ { \ /* If we didn't pack matrix A, there's nothing to be done. */ \ } \ else /* if ( did_pack == TRUE ) */ \ { \ if ( thread != NULL ) \ if ( bli_thread_am_ochief( thread ) ) \ { \ /* Check the mem_t entry provided by the caller. Only proceed if it is allocated, which it should be. */ \ if ( bli_mem_is_alloc( mem ) ) \ { \ bli_pba_release \ ( \ rntm, \ mem \ ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_b ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ /* Inspect whether we are going to be packing matrix B. */ \ if ( will_pack == FALSE ) \ { \ *k_max = k; \ *n_max = n; \ \ /* Set the parameters for use with no packing of B (ie: using the source matrix B directly). */ \ { \ /* Use the strides of the source matrix as the final values. */ \ *rs_p = rs_x; \ *cs_p = cs_x; \ \ *pd_p = nr; \ *ps_p = nr * cs_x; \ \ /* Set the schema to "not packed" to indicate that packing will be skipped. */ \ *schema = BLIS_NOT_PACKED; \ } \ \ /* Since we won't be packing, simply update the buffer address provided by the caller to point to source matrix. 
*/ \ *p = x; \ } \ else /* if ( will_pack == TRUE ) */ \ { \ /* NOTE: This is "rounding up" of the last upanel is actually optional for the rrc/crc cases, but absolutely necessary for the other cases since we NEED that last micropanel to have the same ldim (cs_p) as the other micropanels. Why? So that millikernels can use the same upanel ldim for all iterations of the ir loop. */ \ *k_max = k; \ *n_max = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \ \ /* Determine the dimensions and strides for the packed matrix B. */ \ if ( stor_id == BLIS_RRC || \ stor_id == BLIS_CRC ) \ { \ /* stor3_t id values _RRC and _CRC: pack B to plain row storage. */ \ *rs_p = 1; \ *cs_p = k; \ \ *pd_p = nr; \ *ps_p = k * nr; \ \ /* Set the schema to "column packed" to indicate packing to plain column storage. */ \ *schema = BLIS_PACKED_COLUMNS; \ } \ else \ { \ /* All other stor3_t ids: pack B to row-stored column-panels. */ \ *rs_p = nr; \ *cs_p = 1; \ \ *pd_p = nr; \ *ps_p = k * nr; \ \ /* Set the schema to "packed column panels" to indicate packing to conventional row-stored column panels. */ \ *schema = BLIS_PACKED_COL_PANELS; \ } \ \ /* Set the buffer address provided by the caller to point to the memory associated with the mem_t entry acquired from the memory broker. */ \ *p = bli_mem_buffer( mem ); \ } \ } INSERT_GENTFUNC_BASIC0( packm_sup_init_b ) // // Define BLAS-like interfaces to the variant chooser. 
// #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ pack_t schema; \ dim_t k_max; \ dim_t n_max; \ dim_t pd_p; \ \ /* Prepare the packing destination buffer. If packing is not requested, this function will reduce to a no-op. */ \ PASTEMAC(ch,packm_sup_init_mem_b) \ ( \ will_pack, \ pack_buf_type, \ k_alloc, n_alloc, nr, \ cntx, \ rntm, \ mem, \ thread \ ); \ \ /* Determine the packing buffer and related parameters for matrix B. If B will not be packed, then b_use will be set to point to b and the _b_use strides will be set accordingly. */ \ PASTEMAC(ch,packm_sup_init_b) \ ( \ will_pack, \ stor_id, \ &schema, \ k, n, nr, \ &k_max, &n_max, \ b, rs_b, cs_b, \ p, rs_p, cs_p, \ &pd_p, ps_p, \ cntx, \ mem, \ thread \ ); \ \ /* Inspect whether we are going to be packing matrix B. */ \ if ( will_pack == FALSE ) \ { \ /* If we aren't going to pack matrix B, then there's nothing to do. */ \ \ /* printf( "blis_ packm_sup_b: not packing B.\n" ); \ */ \ } \ else /* if ( will_pack == TRUE ) */ \ { \ if ( schema == BLIS_PACKED_COLUMNS ) \ { \ /* printf( "blis_ packm_sup_b: packing B to columns.\n" ); \ */ \ \ /* For plain packing by columns, use var2. */ \ PASTEMAC(ch,packm_sup_var2) \ ( \ transc, \ schema, \ k, \ n, \ kappa, \ b, rs_b, cs_b, \ *p, *rs_p, *cs_p, \ cntx, \ thread \ ); \ } \ else /* if ( schema == BLIS_PACKED_COL_PANELS ) */ \ { \ /* printf( "blis_ packm_sup_b: packing B to col panels.\n" ); \ */ \ \ /* For packing to row-stored column panels, use var1. 
*/ \ PASTEMAC(ch,packm_sup_var1) \ ( \ transc, \ schema, \ k, \ n, \ k_max, \ n_max, \ kappa, \ b, rs_b, cs_b, \ *p, *rs_p, *cs_p, \ pd_p, *ps_p, \ cntx, \ thread \ ); \ } \ \ /* Barrier so that packing is done before computation. */ \ bli_thread_barrier( thread ); \ } \ } INSERT_GENTFUNC_BASIC0( packm_sup_b ) cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup_packm_b.h000066400000000000000000000077051427272030600227630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict 
cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup_packm_var.c000066400000000000000000000356261427272030600233300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-like interfaces to the variants. 
//

//
// packm_sup_var1: pack the m x n matrix c into p one micropanel at a time.
//
//   transc        transposition/conjugation of c. The conjugation bit is
//                 passed on to packm_cxk; a transposition is applied by
//                 swapping rs_c and cs_c.
//   schema        pack schema; bli_is_col_packed( schema ) selects which
//                 dimension is split into panels (see the body).
//   m, n          dimensions of c.
//   m_max, n_max  padded dimensions; whichever one is the panel length is
//                 handed to packm_cxk as the maximum panel length.
//   kappa         scalar forwarded to packm_cxk.
//   c, rs_c, cs_c source matrix and strides.
//   p, rs_p, cs_p destination buffer and strides.
//   pd_p, ps_p    panel dimension and panel stride of p.
//   cntx          forwarded to packm_cxk.
//   thread        thrinfo_t used to decide which panels this thread packs.
//
// Every thread walks all n_iter panels and advances its destination pointer
// by ps_p on each one, but only calls packm_cxk for the panels that
// bli_packm_my_iter() assigns to it.
//
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       trans_t transc, \
       pack_t schema, \
       dim_t m, \
       dim_t n, \
       dim_t m_max, \
       dim_t n_max, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
       dim_t pd_p, inc_t ps_p, \
       cntx_t* restrict cntx, \
       thrinfo_t* restrict thread \
     ) \
{ \
    ctype* restrict kappa_cast = kappa; \
    ctype* restrict c_cast = c; \
    ctype* restrict p_cast = p; \
\
    dim_t iter_dim; \
    dim_t n_iter; \
    dim_t it, ic; \
    dim_t ic0; \
    doff_t ic_inc; \
    dim_t panel_len_full; \
    dim_t panel_len_i; \
    dim_t panel_len_max; \
    dim_t panel_len_max_i; \
    dim_t panel_dim_i; \
    dim_t panel_dim_max; \
    inc_t vs_c; \
    inc_t ldc; \
    inc_t ldp, p_inc; \
    conj_t conjc; \
\
\
    /* Extract the conjugation bit from the transposition argument. */ \
    conjc = bli_extract_conj( transc ); \
\
    /* If c needs a transposition, induce it so that we can more simply
       express the remaining parameters and code. */ \
    if ( bli_does_trans( transc ) ) \
    { \
        bli_swap_incs( &rs_c, &cs_c ); \
        bli_toggle_trans( &transc ); \
    } \
\
    /* Create flags to indicate row or column storage. Note that the
       schema bit that encodes row or column is describing the form of
       micro-panel, not the storage in the micro-panel. Hence the
       mismatch in "row" and "column" semantics: row_stored is set from
       bli_is_col_packed(). */ \
    bool row_stored = bli_is_col_packed( schema ); \
    /*bool col_stored = bli_is_row_packed( schema );*/ \
\
    /* If the row storage flag indicates row storage, then we are packing
       to column panels; otherwise we are packing to row panels. */ \
    if ( row_stored ) \
    { \
        /* Prepare to pack to row-stored column panels: iterate over n,
           with each panel m long. */ \
        iter_dim = n; \
        panel_len_full = m; \
        panel_len_max = m_max; \
        panel_dim_max = pd_p; \
        vs_c = cs_c; \
        ldc = rs_c; \
        ldp = rs_p; \
    } \
    else /* if ( col_stored ) */ \
    { \
        /* Prepare to pack to column-stored row panels: iterate over m,
           with each panel n long. */ \
        iter_dim = m; \
        panel_len_full = n; \
        panel_len_max = n_max; \
        panel_dim_max = pd_p; \
        vs_c = rs_c; \
        ldc = cs_c; \
        ldp = cs_p; \
    } \
\
    /* Compute the total number of iterations we'll need: one per panel,
       counting a final partial panel. */ \
    n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
\
    /* Set the initial value and increment for the index into C. This
       variant always iterates forward from the first panel. */ \
    { \
        ic0 = 0; \
        ic_inc = panel_dim_max; \
    } \
\
    ctype* restrict p_begin = p_cast; \
\
    /* Query the number of threads and thread ids from the current thread's
       packm thrinfo_t node. */ \
    const dim_t nt = bli_thread_n_way( thread ); \
    const dim_t tid = bli_thread_work_id( thread ); \
\
    /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
    ( void )nt; \
    ( void )tid; \
\
    dim_t it_start, it_end, it_inc; \
\
    /* Determine the thread range and increment using the current thread's
       packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
       will depend on whether slab or round-robin partitioning was requested
       at configure-time. */ \
    bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
    /* Iterate over every logical micropanel in the source matrix. */ \
    for ( ic = ic0, it = 0; it < n_iter; \
          ic += ic_inc, it += 1 ) \
    { \
        /* The last panel may be narrower than panel_dim_max. */ \
        panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
\
        ctype* restrict c_begin = c_cast + (ic )*vs_c; \
\
        ctype* restrict c_use = c_begin; \
        ctype* restrict p_use = p_begin; \
\
        { \
            panel_len_i = panel_len_full; \
            panel_len_max_i = panel_len_max; \
\
            /* The definition of bli_packm_my_iter() will depend on whether slab
               or round-robin partitioning was requested at configure-time. */ \
            if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
            { \
                PASTEMAC(ch,packm_cxk) \
                ( \
                  conjc, \
                  schema, \
                  panel_dim_i, \
                  panel_dim_max, \
                  panel_len_i, \
                  panel_len_max_i, \
                  kappa_cast, \
                  c_use, vs_c, ldc, \
                  p_use, ldp, \
                  cntx \
                ); \
            } \
\
            /* NOTE: This value is equivalent to ps_p. */ \
            p_inc = ps_p; \
        } \
\
        /* Advance past this panel whether or not this thread packed it. */ \
        p_begin += p_inc; \
\
/*
        if ( row_stored ) \
        PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: b packed", panel_len_max, panel_dim_max, \
                              p_use, rs_p, cs_p, "%5.2f", "" ); \
        if ( !row_stored ) \
        PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: a packed", panel_dim_max, panel_len_max, \
                              p_use, rs_p, cs_p, "%5.2f", "" ); \
*/ \
    } \
\
}

INSERT_GENTFUNCR_BASIC( packm, packm_sup_var1 )



/*
        if ( row_stored ) \
        PASTEMAC(ch,fprintm)( stdout, "packm_var2: b", m, n, \
                              c_cast, rs_c, cs_c, "%4.1f", "" ); \
        if ( col_stored ) \
        PASTEMAC(ch,fprintm)( stdout, "packm_var2: a", m, n, \
                              c_cast, rs_c, cs_c, "%4.1f", "" ); \
*/
/*
        if ( row_stored ) \
        PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b packed", *m_panel_max, *n_panel_max, \
                              p_use, rs_p, cs_p, "%5.2f", "" ); \
        else \
        PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a packed", *m_panel_max, *n_panel_max, \
                              p_use, rs_p, cs_p, "%5.2f", "" ); \
*/ \
\
/*
        if ( col_stored ) { \
        if ( bli_thread_work_id( thread ) == 0 ) \
        { \
        printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
        fflush( stdout ); \
        PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
                              ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
        PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
                              ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
        fflush( stdout ); \
        } \
        bli_thread_barrier( thread ); \
        if ( bli_thread_work_id( thread ) == 1 ) \
        { \
        printf( "packm_blk_var1: thread %lu (a = %p, ap = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \
        fflush( stdout ); \
        PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: a", *m_panel_use, *n_panel_use, \
                              ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \
        PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: ap", *m_panel_max, *n_panel_max, \
                              ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \
        fflush( stdout ); \
        } \
        bli_thread_barrier( thread ); \
        } \
        else { \
        if ( bli_thread_work_id( thread ) == 0 ) \
        { \
        printf( "packm_blk_var1: 
thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ fflush( stdout ); \ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ bli_thread_barrier( thread ); \ if ( bli_thread_work_id( thread ) == 1 ) \ { \ printf( "packm_blk_var1: thread %lu (b = %p, bp = %p)\n", bli_thread_work_id( thread ), c_use, p_use ); \ fflush( stdout ); \ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: b", *m_panel_use, *n_panel_use, \ ( ctype* )c_use, rs_c, cs_c, "%4.1f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "packm_blk_var1: bp", *m_panel_max, *n_panel_max, \ ( ctype* )p_use, rs_p, cs_p, "%4.1f", "" ); \ fflush( stdout ); \ } \ bli_thread_barrier( thread ); \ } \ */ /* PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_rpi", *m_panel_max, *n_panel_max, \ ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ */ /* if ( row_stored ) { \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_r", *m_panel_max, *n_panel_max, \ ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: b_i", *m_panel_max, *n_panel_max, \ (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_r", *m_panel_max, *n_panel_max, \ ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \ inc_t is_b = rs_p * *m_panel_max; \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: bp_i", *m_panel_max, *n_panel_max, \ ( ctype_r* )p_use + is_b, rs_p, cs_p, "%4.1f", "" ); \ } \ */ /* if ( col_stored ) { \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_r", *m_panel_max, *n_panel_max, \ ( ctype_r* )c_use, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: a_i", *m_panel_max, *n_panel_max, \ (( ctype_r* )c_use)+rs_c, 2*rs_c, 2*cs_c, "%4.1f", "" ); \ PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_r", 
*m_panel_max, *n_panel_max, \
                               ( ctype_r* )p_use, rs_p, cs_p, "%4.1f", "" ); \
        PASTEMAC(chr,fprintm)( stdout, "packm_var2: ap_i", *m_panel_max, *n_panel_max, \
                               ( ctype_r* )p_use + p_inc, rs_p, cs_p, "%4.1f", "" ); \
        } \
*/


//
// packm_sup_var2: pack the m x n matrix c into the plain (non-panel) row- or
// column-stored matrix p, one vector at a time.
//
//   transc        transposition/conjugation of c. The conjugation bit is
//                 passed on to scal2v; a transposition is applied by
//                 swapping rs_c and cs_c.
//   schema        pack schema; bli_is_col_packed( schema ) selects packing
//                 by columns, otherwise packing is by rows.
//   m, n          dimensions of c.
//   kappa         scalar forwarded to scal2v.
//   c, rs_c, cs_c source matrix and strides.
//   p, rs_p, cs_p destination buffer and strides. The destination vector
//                 increment is always 1; only the leading dimension
//                 (cs_p or rs_p) is taken from these.
//   cntx          forwarded to scal2v.
//   thread        thrinfo_t used to decide which vectors this thread packs.
//
// Every thread walks all n_iter vectors and advances its destination pointer
// by ldp on each one, but only calls scal2v for the vectors that
// bli_packm_my_iter() assigns to it.
//
#undef GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       trans_t transc, \
       pack_t schema, \
       dim_t m, \
       dim_t n, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       ctype* restrict p, inc_t rs_p, inc_t cs_p, \
       cntx_t* restrict cntx, \
       thrinfo_t* restrict thread \
     ) \
{ \
    ctype* restrict kappa_cast = kappa; \
    ctype* restrict c_cast = c; \
    ctype* restrict p_cast = p; \
\
    dim_t iter_dim; \
    dim_t n_iter; \
    dim_t it; \
    dim_t vector_len; \
    inc_t incc, ldc; \
    inc_t incp, ldp; \
    conj_t conjc; \
\
\
    /* Extract the conjugation bit from the transposition argument. */ \
    conjc = bli_extract_conj( transc ); \
\
    /* If c needs a transposition, induce it so that we can more simply
       express the remaining parameters and code. */ \
    if ( bli_does_trans( transc ) ) \
    { \
        bli_swap_incs( &rs_c, &cs_c ); \
        bli_toggle_trans( &transc ); \
    } \
\
    /* Create a flag to indicate column or row storage of the packed
       matrix. In this variant the flag name and the schema query agree:
       col_stored is set from bli_is_col_packed(). */ \
    bool col_stored = bli_is_col_packed( schema ); \
    /*bool row_stored = bli_is_row_packed( schema );*/ \
\
    if ( col_stored ) \
    { \
        /* Prepare to pack to a column-stored matrix: one iteration per
           column, each copying a vector of m elements. */ \
        iter_dim = n; \
        vector_len = m; \
        incc = rs_c; \
        ldc = cs_c; \
        incp = 1; \
        ldp = cs_p; \
    } \
    else /* if ( row_stored ) */ \
    { \
        /* Prepare to pack to a row-stored matrix: one iteration per row,
           each copying a vector of n elements. */ \
        iter_dim = m; \
        vector_len = n; \
        incc = cs_c; \
        ldc = rs_c; \
        incp = 1; \
        ldp = rs_p; \
    } \
\
    /* Compute the total number of iterations we'll need: one per vector. */ \
    n_iter = iter_dim; \
\
\
    ctype* restrict p_begin = p_cast; \
\
    /* Query the number of threads and thread ids from the current thread's
       packm thrinfo_t node. */ \
    const dim_t nt = bli_thread_n_way( thread ); \
    const dim_t tid = bli_thread_work_id( thread ); \
\
    /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \
    ( void )nt; \
    ( void )tid; \
\
    dim_t it_start, it_end, it_inc; \
\
    /* Determine the thread range and increment using the current thread's
       packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
       will depend on whether slab or round-robin partitioning was requested
       at configure-time. */ \
    bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
\
    /* Iterate over every row or column vector of the source matrix. */ \
    for ( it = 0; it < n_iter; it += 1 ) \
    { \
        ctype* restrict c_begin = c_cast + (it )*ldc; \
\
        ctype* restrict c_use = c_begin; \
        ctype* restrict p_use = p_begin; \
\
        { \
            /* The definition of bli_packm_my_iter() will depend on whether slab
               or round-robin partitioning was requested at configure-time. */ \
            if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
            { \
                /* Scale-and-copy one vector of c into p. The last argument
                   is passed as NULL. */ \
                PASTEMAC2(ch,scal2v,BLIS_TAPI_EX_SUF) \
                ( \
                  conjc, \
                  vector_len, \
                  kappa_cast, \
                  c_use, incc, \
                  p_use, incp, \
                  cntx, \
                  NULL \
                ); \
            } \
\
        } \
\
        /* Advance past this vector whether or not this thread packed it. */ \
        p_begin += ldp; \
\
/*
        NOTE(review): the disabled debug prints below use names (row_stored,
        panel_len_max, panel_dim_max) that this variant does not declare.

        if ( row_stored ) \
        PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: b packed", panel_len_max, panel_dim_max, \
                              p_use, rs_p, cs_p, "%5.2f", "" ); \
        if ( !row_stored ) \
        PASTEMAC(ch,fprintm)( stdout, "packm_sup_var1: a packed", panel_dim_max, panel_len_max, \
                              p_use, rs_p, cs_p, "%5.2f", "" ); \
*/ \
    } \
}

INSERT_GENTFUNCR_BASIC( packm, packm_sup_var2 )

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup_packm_var.h000066400000000000000000000054621427272030600233300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup_ref.c000066400000000000000000000126101427272030600221250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { // This function implements the default gemmsup handler. If you are a // BLIS developer and wish to use a different gemmsup handler, please // register a different function pointer in the context in your // sub-configuration's bli_cntx_init_*() function. // Check parameters. if ( bli_error_checking_is_enabled() ) bli_gemm_check( alpha, a, b, beta, c, cntx ); #if 0 // NOTE: This special case handling is done within the variants. // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // If A or B has a zero dimension, scale C by beta and return early. if ( bli_obj_has_zero_dim( a ) || bli_obj_has_zero_dim( b ) ) { bli_scalm( beta, c ); return BLIS_SUCCESS; } #endif const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b ); // Don't use the small/unpacked implementation if one of the matrices // uses general stride. NOTE: We check for this here, in bli_gemmsup_ref() // (and not in the calling function, bli_gemmsup()), because we consider // this way of handling general stride to be part of the implementation // and not necessarily a general-purpose solution that would apply to all // possible gemmsup handlers. 
Similarly, we check for it here (and not in // the internal thread entry point, bli_gemmsup_int()) because we don't // want to have to manage the multiple return values from the threads, // which we would have to process into a single return value and then // return from the parallel/threaded region. if ( stor_id == BLIS_XXX ) return BLIS_FAILURE; // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop. bli_rntm_set_ways_from_rntm_sup ( bli_obj_length( c ), bli_obj_width( c ), bli_obj_width( a ), rntm ); #if 0 printf( "rntm.pack_a = %d\n", ( int )bli_rntm_pack_a( rntm ) ); printf( "rntm.pack_b = %d\n", ( int )bli_rntm_pack_b( rntm ) ); //bli_rntm_set_pack_a( 0, rntm ); //bli_rntm_set_pack_b( 0, rntm ); #endif return bli_l3_sup_thread_decorator ( bli_gemmsup_int, BLIS_GEMM, // operation family id alpha, a, b, beta, c, cntx, rntm ); } // ----------------------------------------------------------------------------- err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { // This function implements the default gemmtsup handler. If you are a // BLIS developer and wish to use a different gemmtsup handler, please // register a different function pointer in the context in your // sub-configuration's bli_cntx_init_*() function. // Check parameters. if ( bli_error_checking_is_enabled() ) bli_gemmt_check( alpha, a, b, beta, c, cntx ); #if 0 // NOTE: This special case handling is done within the variants. // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // If A or B has a zero dimension, scale C by beta and return early. if ( bli_obj_has_zero_dim( a ) || bli_obj_has_zero_dim( b ) ) { bli_scalm( beta, c ); return BLIS_SUCCESS; } #endif // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop. 
bli_rntm_set_ways_from_rntm_sup ( bli_obj_length( c ), bli_obj_width( c ), bli_obj_width( a ), rntm ); return bli_l3_sup_thread_decorator ( bli_gemmtsup_int, BLIS_GEMMT, // operation family id alpha, a, b, beta, c, cntx, rntm ); } cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup_ref.h000066400000000000000000000037541427272030600221430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019 - 2000, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// Default gemmsup handler. Returns an err_t status; the implementation in
// bli_l3_sup_ref.c returns BLIS_FAILURE when the stor3_t id derived from the
// operands' strides is BLIS_XXX, and otherwise returns the result of the sup
// thread decorator.
err_t bli_gemmsup_ref
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     );

// Default gemmtsup handler. Same parameter list as bli_gemmsup_ref(); returns
// the err_t status produced by the sup thread decorator.
err_t bli_gemmtsup_ref
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm
     );
#include "blis.h"

#define FUNCPTR_T gemmsup_fp

// Signature shared by the type-specific var1/var2 implementations below:
// conjugation flags, the m/n/k dimensions, type-erased alpha/beta scalars,
// type-erased a/b/c buffers with their row/column strides, a stor3_t id, and
// the context and runtime objects.
typedef void (*FUNCPTR_T)
     (
       conj_t           conja,
       conj_t           conjb,
       dim_t            m,
       dim_t            n,
       dim_t            k,
       void*   restrict alpha,
       void*   restrict a, inc_t rs_a, inc_t cs_a,
       void*   restrict b, inc_t rs_b, inc_t cs_b,
       void*   restrict beta,
       void*   restrict c, inc_t rs_c, inc_t cs_c,
       stor3_t          eff_id,
       cntx_t* restrict cntx,
       rntm_t* restrict rntm
     );

// NOTE: Everything from here to the final #endif is compiled out.
#if 0
//
// -- var2 ---------------------------------------------------------------------
//

static FUNCPTR_T GENARRAY(ftypes_var2,gemmsup_ref_var2);

// Object-level front end for variant 2: unpacks dimensions, strides, buffers
// and scalars from the obj_t operands (swapping row/column strides for an
// operand marked as transposed), then calls the type-specific function
// selected from ftypes_var2 by the datatype of c.
// NOTE: the 'trans' parameter is not referenced in this function body.
void bli_gemmsup_ref_var2
     (
       trans_t trans,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       stor3_t eff_id,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
#if 0
	obj_t at, bt;

	bli_obj_alias_to( a, &at );
	bli_obj_alias_to( b, &bt );

	// Induce transpositions on A and/or B if either object is marked for
	// transposition. We can induce "fast" transpositions since the objects
	// are guaranteed to not have structure or be packed.
	if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
	if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }

	const num_t    dt_exec   = bli_obj_dt( c );

	const conj_t   conja     = bli_obj_conj_status( a );
	const conj_t   conjb     = bli_obj_conj_status( b );

	const dim_t    m         = bli_obj_length( c );
	const dim_t    n         = bli_obj_width( c );

	const dim_t    k         = bli_obj_width( &at );

	void* restrict buf_a     = bli_obj_buffer_at_off( &at );
	const inc_t    rs_a      = bli_obj_row_stride( &at );
	const inc_t    cs_a      = bli_obj_col_stride( &at );

	void* restrict buf_b     = bli_obj_buffer_at_off( &bt );
	const inc_t    rs_b      = bli_obj_row_stride( &bt );
	const inc_t    cs_b      = bli_obj_col_stride( &bt );

	void* restrict buf_c     = bli_obj_buffer_at_off( c );
	const inc_t    rs_c      = bli_obj_row_stride( c );
	const inc_t    cs_c      = bli_obj_col_stride( c );

	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );

#else

	const num_t    dt_exec   = bli_obj_dt( c );

	const conj_t   conja     = bli_obj_conj_status( a );
	const conj_t   conjb     = bli_obj_conj_status( b );

	const dim_t    m         = bli_obj_length( c );
	const dim_t    n         = bli_obj_width( c );
	      dim_t    k;

	void* restrict buf_a = bli_obj_buffer_at_off( a );
	      inc_t    rs_a;
	      inc_t    cs_a;

	void* restrict buf_b = bli_obj_buffer_at_off( b );
	      inc_t    rs_b;
	      inc_t    cs_b;

	if ( bli_obj_has_notrans( a ) )
	{
		k     = bli_obj_width( a );

		rs_a  = bli_obj_row_stride( a );
		cs_a  = bli_obj_col_stride( a );
	}
	else // if ( bli_obj_has_trans( a ) )
	{
		// Assign the variables with an implicit transposition.
		k     = bli_obj_length( a );

		rs_a  = bli_obj_col_stride( a );
		cs_a  = bli_obj_row_stride( a );
	}

	if ( bli_obj_has_notrans( b ) )
	{
		rs_b  = bli_obj_row_stride( b );
		cs_b  = bli_obj_col_stride( b );
	}
	else // if ( bli_obj_has_trans( b ) )
	{
		// Assign the variables with an implicit transposition.
		rs_b  = bli_obj_col_stride( b );
		cs_b  = bli_obj_row_stride( b );
	}

	void* restrict buf_c     = bli_obj_buffer_at_off( c );
	const inc_t    rs_c      = bli_obj_row_stride( c );
	const inc_t    cs_c      = bli_obj_col_stride( c );

	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );

#endif

	// Index into the type combination array to extract the correct
	// function pointer.
	FUNCPTR_T f = ftypes_var2[dt_exec];

	// Invoke the function.
	f
	(
	  conja,
	  conjb,
	  m,
	  n,
	  k,
	  buf_alpha,
	  buf_a, rs_a, cs_a,
	  buf_b, rs_b, cs_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  eff_id,
	  cntx,
	  rntm
	);
}


// Type-specific variant 2. Loop nest, outermost to innermost:
//   jj over n in steps of NC, pp over k in steps of KC, ii over m in steps of
//   MC, j over the current NC block in steps of NR, i over the current MC
//   block in steps of MR; the sup microkernel is called in the innermost loop.
// beta is applied only on the first pp iteration; later iterations use one.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       conj_t           conjb, \
       dim_t            m, \
       dim_t            n, \
       dim_t            k, \
       void*   restrict alpha, \
       void*   restrict a, inc_t rs_a, inc_t cs_a, \
       void*   restrict b, inc_t rs_b, inc_t cs_b, \
       void*   restrict beta, \
       void*   restrict c, inc_t rs_c, inc_t cs_c, \
       stor3_t          eff_id, \
       cntx_t* restrict cntx, \
       rntm_t* restrict rntm  \
     ) \
{ \
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* If alpha is zero, scale by beta and return. */ \
	if ( PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
	{ \
		PASTEMAC(ch,scalm) \
		( \
		  BLIS_NO_CONJUGATE, \
		  0, \
		  BLIS_NONUNIT_DIAG, \
		  BLIS_DENSE, \
		  m, n, \
		  beta, \
		  c, rs_c, cs_c \
		); \
		return; \
	} \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Query the context for various blocksizes. */ \
	const dim_t NC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
	const dim_t KC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
	const dim_t MC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
	const dim_t NR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
	const dim_t MR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
\
	/* Compute partitioning step values for each matrix of each loop. */ \
	const inc_t jcstep_c = cs_c * NC; \
	const inc_t jcstep_b = cs_b * NC; \
\
	const inc_t pcstep_a = cs_a * KC; \
	const inc_t pcstep_b = rs_b * KC; \
\
	const inc_t icstep_c = rs_c * MC; \
	const inc_t icstep_a = rs_a * MC; \
\
	const inc_t jrstep_c = cs_c * NR; \
	const inc_t jrstep_b = cs_b * NR; \
\
	const inc_t irstep_c = rs_c * MR; \
	const inc_t irstep_a = rs_a * MR; \
\
	/* Query a stor3_t enum value to characterize the problem.
	   Examples: BLIS_RRR, BLIS_RRC, BLIS_RCR, BLIS_RCC, etc.
	   NOTE: If any matrix is general-stored, we use the all-purpose sup
	   microkernel corresponding to the stor3_t enum value BLIS_XXX. */ \
	const stor3_t stor_id = bli_stor3_from_strides( rs_c, cs_c, \
	                                                rs_a, cs_a, rs_b, cs_b ); \
\
	/* Query the context for the sup microkernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemmsup_ker_ft) \
               gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
	ctype* restrict a_00       = a; \
	ctype* restrict b_00       = b; \
	ctype* restrict c_00       = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
\
	auxinfo_t       aux; \
\
	/* Compute number of primary and leftover components of the outer
	   dimensions.
	   NOTE: Functionally speaking, we compute jc_iter as:
	     jc_iter = n / NC; if ( jc_left ) ++jc_iter;
	   However, this is implemented as:
	     jc_iter = ( n + NC - 1 ) / NC;
	   This avoids a branch at the cost of two additional integer instructions.
	   The pc_iter, mc_iter, nr_iter, and mr_iter variables are computed in
	   similar manner. */ \
	const dim_t jc_iter = ( n + NC - 1 ) / NC; \
	const dim_t jc_left =   n % NC; \
\
	const dim_t pc_iter = ( k + KC - 1 ) / KC; \
	const dim_t pc_left =   k % KC; \
\
	const dim_t ic_iter = ( m + MC - 1 ) / MC; \
	const dim_t ic_left =   m % MC; \
\
	const dim_t jc_inc = 1; \
	const dim_t pc_inc = 1; \
	const dim_t ic_inc = 1; \
	const dim_t jr_inc = 1; \
	const dim_t ir_inc = 1; \
\
	/* Loop over the n dimension (NC rows/columns at a time). */ \
	for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
	{ \
		const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
\
		ctype* restrict b_jc = b_00 + jj * jcstep_b; \
		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
		const dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
		const dim_t jr_left =   nc_cur % NR; \
\
		/* Loop over the k dimension (KC rows/columns at a time). */ \
		for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
		{ \
			const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
\
			ctype* restrict a_pc = a_00 + pp * pcstep_a; \
			ctype* restrict b_pc = b_jc + pp * pcstep_b; \
\
			/* Only apply beta to the first iteration of the pc loop. */ \
			ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
\
			/* Loop over the m dimension (MC rows at a time). */ \
			for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
			{ \
				const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
\
				ctype* restrict a_ic = a_pc + ii * icstep_a; \
				ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
				const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
				const dim_t ir_left =   mc_cur % MR; \
\
				/* Loop over the n dimension (NR columns at a time). */ \
				for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
				{ \
					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
					ctype* restrict b_jr = b_pc + j * jrstep_b; \
					ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
					/*
					ctype* restrict b2 = b_jr; \
					*/ \
\
					/* Loop over the m dimension (MR rows at a time). */ \
					for ( dim_t i = 0; i < ir_iter; i += ir_inc ) \
					{ \
						const dim_t mr_cur = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
\
						ctype* restrict a_ir = a_ic + i * irstep_a; \
						ctype* restrict c_ir = c_jr + i * irstep_c; \
\
						/* Save addresses of next panels of A and B to the auxinfo_t
						   object. */ \
						/*
						ctype* restrict a2 = bli_gemm_get_next_a_upanel( a_ir, irstep_a, ir_inc ); \
						if ( bli_is_last_iter( i, ir_iter, 0, 1 ) ) \
						{ \
							a2 = a_00; \
							b2 = bli_gemm_get_next_b_upanel( b_jr, jrstep_b, jr_inc ); \
							if ( bli_is_last_iter( j, jr_iter, 0, 1 ) ) \
								b2 = b_00; \
						} \
\
						bli_auxinfo_set_next_a( a2, &aux ); \
						bli_auxinfo_set_next_b( b2, &aux ); \
						*/ \
\
						/* Invoke the gemmsup micro-kernel. */ \
						gemmsup_ker \
						( \
						  conja, \
						  conjb, \
						  mr_cur, \
						  nr_cur, \
						  kc_cur, \
						  alpha_cast, \
						  a_ir, rs_a, cs_a, \
						  b_jr, rs_b, cs_b, \
						  beta_use, \
						  c_ir, rs_c, cs_c, \
						  &aux, \
						  cntx  \
						); \
					} \
				} \
			} \
		} \
	} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}

INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2 )


//
// -- var1 ---------------------------------------------------------------------
//

static FUNCPTR_T GENARRAY(ftypes_var1,gemmsup_ref_var1);

// Object-level front end for variant 1. Identical in structure to
// bli_gemmsup_ref_var2() above, except that it dispatches through
// ftypes_var1.
// NOTE: the 'trans' parameter is not referenced in this function body.
void bli_gemmsup_ref_var1
     (
       trans_t trans,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       stor3_t eff_id,
       cntx_t* cntx,
       rntm_t* rntm
     )
{
#if 0
	obj_t at, bt;

	bli_obj_alias_to( a, &at );
	bli_obj_alias_to( b, &bt );

	// Induce transpositions on A and/or B if either object is marked for
	// transposition. We can induce "fast" transpositions since the objects
	// are guaranteed to not have structure or be packed.
	if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
	if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }

	const num_t    dt_exec   = bli_obj_dt( c );

	const conj_t   conja     = bli_obj_conj_status( a );
	const conj_t   conjb     = bli_obj_conj_status( b );

	const dim_t    m         = bli_obj_length( c );
	const dim_t    n         = bli_obj_width( c );

	const dim_t    k         = bli_obj_width( &at );

	void* restrict buf_a     = bli_obj_buffer_at_off( &at );
	const inc_t    rs_a      = bli_obj_row_stride( &at );
	const inc_t    cs_a      = bli_obj_col_stride( &at );

	void* restrict buf_b     = bli_obj_buffer_at_off( &bt );
	const inc_t    rs_b      = bli_obj_row_stride( &bt );
	const inc_t    cs_b      = bli_obj_col_stride( &bt );

	void* restrict buf_c     = bli_obj_buffer_at_off( c );
	const inc_t    rs_c      = bli_obj_row_stride( c );
	const inc_t    cs_c      = bli_obj_col_stride( c );

	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );

#else

	const num_t    dt_exec   = bli_obj_dt( c );

	const conj_t   conja     = bli_obj_conj_status( a );
	const conj_t   conjb     = bli_obj_conj_status( b );

	const dim_t    m         = bli_obj_length( c );
	const dim_t    n         = bli_obj_width( c );
	      dim_t    k;

	void* restrict buf_a = bli_obj_buffer_at_off( a );
	      inc_t    rs_a;
	      inc_t    cs_a;

	void* restrict buf_b = bli_obj_buffer_at_off( b );
	      inc_t    rs_b;
	      inc_t    cs_b;

	if ( bli_obj_has_notrans( a ) )
	{
		k     = bli_obj_width( a );

		rs_a  = bli_obj_row_stride( a );
		cs_a  = bli_obj_col_stride( a );
	}
	else // if ( bli_obj_has_trans( a ) )
	{
		// Assign the variables with an implicit transposition.
		k     = bli_obj_length( a );

		rs_a  = bli_obj_col_stride( a );
		cs_a  = bli_obj_row_stride( a );
	}

	if ( bli_obj_has_notrans( b ) )
	{
		rs_b  = bli_obj_row_stride( b );
		cs_b  = bli_obj_col_stride( b );
	}
	else // if ( bli_obj_has_trans( b ) )
	{
		// Assign the variables with an implicit transposition.
		rs_b  = bli_obj_col_stride( b );
		cs_b  = bli_obj_row_stride( b );
	}

	void* restrict buf_c     = bli_obj_buffer_at_off( c );
	const inc_t    rs_c      = bli_obj_row_stride( c );
	const inc_t    cs_c      = bli_obj_col_stride( c );

	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );

#endif

	// Index into the type combination array to extract the correct
	// function pointer.
	FUNCPTR_T f = ftypes_var1[dt_exec];

	// Invoke the function.
	f
	(
	  conja,
	  conjb,
	  m,
	  n,
	  k,
	  buf_alpha,
	  buf_a, rs_a, cs_a,
	  buf_b, rs_b, cs_b,
	  buf_beta,
	  buf_c, rs_c, cs_c,
	  eff_id,
	  cntx,
	  rntm
	);
}


// Type-specific variant 1. Relative to variant 2, the roles of m and n in
// the loop nest are exchanged: jj walks m in steps of NC (stepping a and c by
// rows), ii walks n in steps of MC (stepping b and c by columns), j walks the
// current NC block in steps of MR and i walks the current MC block in steps
// of NR. NC and MC are first rounded up to multiples of MR and NR.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       conj_t           conjb, \
       dim_t            m, \
       dim_t            n, \
       dim_t            k, \
       void*   restrict alpha, \
       void*   restrict a, inc_t rs_a, inc_t cs_a, \
       void*   restrict b, inc_t rs_b, inc_t cs_b, \
       void*   restrict beta, \
       void*   restrict c, inc_t rs_c, inc_t cs_c, \
       stor3_t          eff_id, \
       cntx_t* restrict cntx, \
       rntm_t* restrict rntm  \
     ) \
{ \
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* If alpha is zero, scale by beta and return. */ \
	if ( PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
	{ \
		PASTEMAC(ch,scalm) \
		( \
		  BLIS_NO_CONJUGATE, \
		  0, \
		  BLIS_NONUNIT_DIAG, \
		  BLIS_DENSE, \
		  m, n, \
		  beta, \
		  c, rs_c, cs_c \
		); \
		return; \
	} \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Query the context for various blocksizes. */ \
	const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
	const dim_t KC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
	const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
	const dim_t NR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
	const dim_t MR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
\
	/* Nudge NC up to a multiple of MR and MC up to a multiple of NR. */ \
	const dim_t NC  = bli_align_dim_to_mult( NC0, MR ); \
	const dim_t MC  = bli_align_dim_to_mult( MC0, NR ); \
\
	/* Compute partitioning step values for each matrix of each loop. */ \
	const inc_t jcstep_c = rs_c * NC; \
	const inc_t jcstep_a = rs_a * NC; \
\
	const inc_t pcstep_a = cs_a * KC; \
	const inc_t pcstep_b = rs_b * KC; \
\
	const inc_t icstep_c = cs_c * MC; \
	const inc_t icstep_b = cs_b * MC; \
\
	const inc_t jrstep_c = rs_c * MR; \
	const inc_t jrstep_a = rs_a * MR; \
\
	const inc_t irstep_c = cs_c * NR; \
	const inc_t irstep_b = cs_b * NR; \
\
	/* Query a stor3_t enum value to characterize the problem.
	   Examples: BLIS_RRR, BLIS_RRC, BLIS_RCR, BLIS_RCC, etc.
	   NOTE: If any matrix is general-stored, we use the all-purpose sup
	   microkernel corresponding to the stor3_t enum value BLIS_XXX. */ \
	const stor3_t stor_id = bli_stor3_from_strides( rs_c, cs_c, \
	                                                rs_a, cs_a, rs_b, cs_b ); \
\
	/* Query the context for the sup microkernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemmsup_ker_ft) \
               gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
	ctype* restrict a_00       = a; \
	ctype* restrict b_00       = b; \
	ctype* restrict c_00       = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
\
	auxinfo_t       aux; \
\
	/* Compute number of primary and leftover components of the outer
	   dimensions.
	   NOTE: Functionally speaking, we compute jc_iter as:
	     jc_iter = m / NC; if ( jc_left ) ++jc_iter;
	   However, this is implemented as:
	     jc_iter = ( m + NC - 1 ) / NC;
	   This avoids a branch at the cost of two additional integer instructions.
	   The pc_iter, mc_iter, nr_iter, and mr_iter variables are computed in
	   similar manner. */ \
	const dim_t jc_iter = ( m + NC - 1 ) / NC; \
	const dim_t jc_left =   m % NC; \
\
	const dim_t pc_iter = ( k + KC - 1 ) / KC; \
	const dim_t pc_left =   k % KC; \
\
	const dim_t ic_iter = ( n + MC - 1 ) / MC; \
	const dim_t ic_left =   n % MC; \
\
	const dim_t jc_inc = 1; \
	const dim_t pc_inc = 1; \
	const dim_t ic_inc = 1; \
	const dim_t jr_inc = 1; \
	const dim_t ir_inc = 1; \
\
	/* Loop over the m dimension (NC rows/columns at a time). */ \
	for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
	{ \
		const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
\
		ctype* restrict a_jc = a_00 + jj * jcstep_a; \
		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
		const dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
		const dim_t jr_left =   nc_cur % MR; \
\
		/* Loop over the k dimension (KC rows/columns at a time). */ \
		for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
		{ \
			const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
\
			ctype* restrict a_pc = a_jc + pp * pcstep_a; \
			ctype* restrict b_pc = b_00 + pp * pcstep_b; \
\
			/* Only apply beta to the first iteration of the pc loop. */ \
			ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
\
			/* Loop over the n dimension (MC rows at a time). */ \
			for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
			{ \
				const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
\
				ctype* restrict b_ic = b_pc + ii * icstep_b; \
				ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
				const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \
				const dim_t ir_left =   mc_cur % NR; \
\
				/* Loop over the m dimension (NR columns at a time). */ \
				/* NOTE(review): this loop advances a and c by MR rows per
				   iteration and jr_iter/jr_left are computed from MR, yet
				   nr_cur selects NR for full tiles. Likewise the inner loop
				   is NR-strided but mr_cur selects MR. The kernel then gets
				   mr_cur as its m argument and nr_cur as its n argument, so
				   the edge-case sizes look crossed between the two loops.
				   Confirm before re-enabling this code. */ \
				for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
				{ \
					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
					ctype* restrict a_jr = a_pc + j * jrstep_a; \
					ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
					/* Loop over the n dimension (MR rows at a time). */ \
					for ( dim_t i = 0; i < ir_iter; i += ir_inc ) \
					{ \
						const dim_t mr_cur = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \
\
						ctype* restrict b_ir = b_ic + i * irstep_b; \
						ctype* restrict c_ir = c_jr + i * irstep_c; \
\
						/* Invoke the gemmsup micro-kernel. */ \
						gemmsup_ker \
						( \
						  conja, \
						  conjb, \
						  mr_cur, \
						  nr_cur, \
						  kc_cur, \
						  alpha_cast, \
						  a_jr, rs_a, cs_a, \
						  b_ir, rs_b, cs_b, \
						  beta_use, \
						  c_ir, rs_c, cs_c, \
						  &aux, \
						  cntx  \
						); \
					} \
				} \
			} \
		} \
	} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}

INSERT_GENTFUNC_BASIC0( gemmsup_ref_var1 )

#endif
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T gemmsup_fp typedef void (*FUNCPTR_T) ( bool packa, bool packb, conj_t conja, conj_t conjb, dim_t m, dim_t n, dim_t k, void* restrict alpha, void* restrict a, inc_t rs_a, inc_t cs_a, void* restrict b, inc_t rs_b, inc_t cs_b, void* restrict beta, void* restrict c, inc_t rs_c, inc_t cs_c, stor3_t eff_id, cntx_t* restrict cntx, rntm_t* restrict rntm, thrinfo_t* restrict thread ); // // -- var1n -------------------------------------------------------------------- // static FUNCPTR_T GENARRAY(ftypes_var1n,gemmsup_ref_var1n); void bli_gemmsup_ref_var1n ( trans_t trans, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, stor3_t eff_id, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ) { #if 0 obj_t at, bt; bli_obj_alias_to( a, &at ); bli_obj_alias_to( b, &bt ); // Induce transpositions on A and/or B if either object is marked for // transposition. We can induce "fast" transpositions since they objects // are guaranteed to not have structure or be packed. 
if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); } if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); } const num_t dt = bli_obj_dt( c ); const conj_t conja = bli_obj_conj_status( a ); const conj_t conjb = bli_obj_conj_status( b ); const dim_t m = bli_obj_length( c ); const dim_t n = bli_obj_width( c ); const dim_t k = bli_obj_width( &at ); void* restrict buf_a = bli_obj_buffer_at_off( &at ); const inc_t rs_a = bli_obj_row_stride( &at ); const inc_t cs_a = bli_obj_col_stride( &at ); void* restrict buf_b = bli_obj_buffer_at_off( &bt ); const inc_t rs_b = bli_obj_row_stride( &bt ); const inc_t cs_b = bli_obj_col_stride( &bt ); void* restrict buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); #else const num_t dt = bli_obj_dt( c ); const bool packa = bli_rntm_pack_a( rntm ); const bool packb = bli_rntm_pack_b( rntm ); const conj_t conja = bli_obj_conj_status( a ); const conj_t conjb = bli_obj_conj_status( b ); const dim_t m = bli_obj_length( c ); const dim_t n = bli_obj_width( c ); dim_t k; void* restrict buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a; inc_t cs_a; void* restrict buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b; inc_t cs_b; if ( bli_obj_has_notrans( a ) ) { k = bli_obj_width( a ); rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else // if ( bli_obj_has_trans( a ) ) { // Assign the variables with an implicit transposition. k = bli_obj_length( a ); rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else // if ( bli_obj_has_trans( b ) ) { // Assign the variables with an implicit transposition. 
rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } void* restrict buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); #endif // Index into the type combination array to extract the correct // function pointer. FUNCPTR_T f = ftypes_var1n[dt]; #if 1 // Optimize some storage/packing cases by transforming them into others. // These optimizations are expressed by changing trans and/or eff_id. bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx ); #endif if ( bli_is_notrans( trans ) ) { // Invoke the function. f ( packa, packb, conja, conjb, m, n, k, buf_alpha, buf_a, rs_a, cs_a, buf_b, rs_b, cs_b, buf_beta, buf_c, rs_c, cs_c, eff_id, cntx, rntm, thread ); } else { // Invoke the function (transposing the operation). f ( packb, packa, conjb, // swap the conj values. conja, n, // swap the m and n dimensions. m, k, buf_alpha, buf_b, cs_b, rs_b, // swap the positions of A and B. buf_a, cs_a, rs_a, // swap the strides of A and B. buf_beta, buf_c, cs_c, rs_c, // swap the strides of C. bli_stor3_trans( eff_id ), // transpose the stor3_t id. cntx, rntm, thread ); } } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t stor_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* If m or n is zero, return immediately. */ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* If k < 1 or alpha is zero, scale by beta and return. 
*/ \
	/* Only a thread for which bli_thread_am_ochief() is true performs the
	   scaling of C; every thread then returns. */ \
	if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
	{ \
		if ( bli_thread_am_ochief( thread ) ) \
		{ \
			PASTEMAC(ch,scalm) \
			( \
			  BLIS_NO_CONJUGATE, \
			  0, \
			  BLIS_NONUNIT_DIAG, \
			  BLIS_DENSE, \
			  m, n, \
			  beta, \
			  c, rs_c, cs_c \
			); \
		} \
		return; \
	} \
\
	/* This transposition of the stor3_t id value is inherent to variant 1.
	   The reason: we assume that variant 2 is the "main" variant. The
	   consequence of this is that we assume that the millikernels that
	   iterate over m are registered to the "primary" kernel group associated
	   with the kernel IO preference; similarly, mkernels that iterate over n
	   are assumed to be registered to the "non-primary" group associated
	   with the ("non-primary") anti-preference. (Note that this pattern holds
	   regardless of whether the mkernel set has a row or column preference.)
	   See bli_l3_sup_int.c for a higher-level view of how this choice is
	   made. */ \
	stor_id = bli_stor3_trans( stor_id ); \
\
	/* Query the context for various blocksizes. */ \
	const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
	const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
	const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
	const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
	const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
	/* Select the KC actually used. It starts from the context's default
	   (KC0) and is reduced for certain combinations of which operands are
	   packed, the (already transposed) stor_id, and -- when nothing is
	   packed -- how large m and n are relative to MR and NR. */ \
	dim_t KC; \
	if ( packa && packb ) \
	{ \
		KC = KC0; \
	} \
	else if ( packb ) \
	{ \
		if      ( stor_id == BLIS_RRR || \
		          stor_id == BLIS_CCC ) KC = KC0; \
		else if ( stor_id == BLIS_RRC || \
		          stor_id == BLIS_CRC ) KC = KC0; \
		else if ( stor_id == BLIS_RCR || \
		          stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \
		else KC = KC0; \
	} \
	else if ( packa ) \
	{ \
		if      ( stor_id == BLIS_RRR || \
		          stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; \
		else if ( stor_id == BLIS_RRC || \
		          stor_id == BLIS_CRC ) KC = KC0; \
		else if ( stor_id == BLIS_RCR || \
		          stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \
		else KC = KC0; \
	} \
	else /* if ( !packa && !packb ) */ \
	{ \
		/* The first branch is never taken: its condition is the constant
		   FALSE. */ \
		if      ( FALSE ) KC = KC0; \
		else if ( stor_id == BLIS_RRC || \
		          stor_id == BLIS_CRC ) KC = KC0; \
		else if ( m <= MR && n <= NR ) KC = KC0; \
		else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
		else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
		else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
		else KC = (( KC0 / 5 ) / 4 ) * 4; \
	} \
\
	/* Nudge NC up to a multiple of MR and MC up to a multiple of NR.
	   NOTE: This is unique to variant 1 (i.e., not performed in variant 2)
	   because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. */ \
	const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \
	const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \
\
	/* Query the maximum blocksize for MR, which implies a maximum blocksize
	   extension for the final iteration. */ \
	const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \
	const dim_t MRE = MRM - MR; \
\
	/* Compute partitioning step values for each matrix of each loop. In this
	   variant the jc loop steps through rows of C and A, the pc loop through
	   columns of A and rows of B, and the ic loop through columns of C and
	   B. */ \
	const inc_t jcstep_c = rs_c; \
	const inc_t jcstep_a = rs_a; \
\
	const inc_t pcstep_a = cs_a; \
	const inc_t pcstep_b = rs_b; \
\
	const inc_t icstep_c = cs_c; \
	const inc_t icstep_b = cs_b; \
\
	const inc_t jrstep_c = rs_c * MR; \
\
	/*
	const inc_t jrstep_a = rs_a * MR; \
	\
	const inc_t irstep_c = cs_c * NR; \
	const inc_t irstep_b = cs_b * NR; \
	*/ \
\
	/* Query the context for the sup microkernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemmsup_ker_ft) \
	           gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
	ctype* restrict a_00 = a; \
	ctype* restrict b_00 = b; \
	ctype* restrict c_00 = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast = beta; \
\
	/* Make local copies of beta and one scalars to prevent any unnecessary
	   sharing of cache lines between the cores' caches. */ \
	ctype beta_local = *beta_cast; \
	ctype one_local = *PASTEMAC(ch,1); \
\
	auxinfo_t aux; \
\
	/* Parse and interpret the contents of the rntm_t object to properly
	   set the ways of parallelism for each loop. (The call is currently
	   commented out.) */ \
	/*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \
\
	/* Initialize a mem_t entry for A and B. Strictly speaking, this is only
	   needed for the matrix we will be packing (if any), but we do it
	   unconditionally to be safe. An alternative way of initializing the
	   mem_t entries is:

	   bli_mem_clear( &mem_a ); \
	   bli_mem_clear( &mem_b ); \
	*/ \
	mem_t mem_a = BLIS_MEM_INITIALIZER; \
	mem_t mem_b = BLIS_MEM_INITIALIZER; \
\
	/* Define an array of bszid_t ids, which will act as our substitute for
	   the cntl_t tree.
	   NOTE: These bszid_t values, and their order, match that of the bp
	   algorithm (variant 2) because they are not used to query actual
	   blocksizes but rather query the ways of parallelism for the various
	   loops. For example, the 2nd loop in variant 1 partitions in the m
	   dimension (in increments of MR), but parallelizes that m dimension
	   with BLIS_JR_NT. The only difference is that the _packa and _packb
	   arrays have been adjusted for the semantic difference in order in
	   which packa and packb nodes are encountered in the thrinfo tree.
	   That is, this panel-block algorithm partitions an NC x KC submatrix
	   of A to be packed in the 4th loop, and a KC x MC submatrix of B
	   to be packed in the 3rd loop. */ \
	/* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ \
	bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
	bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \
	bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
	bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \
	bszid_t* restrict bszids; \
\
	/* Set the bszids pointer to the correct bszids array above based on which
	   matrices (if any) are being packed. */ \
	if ( packa ) { if ( packb ) bszids = bszids_packab; \
	               else bszids = bszids_packa; } \
	else { if ( packb ) bszids = bszids_packb; \
	       else bszids = bszids_nopack; } \
\
	/* Determine whether we are using more than one thread. */ \
	const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \
\
	thrinfo_t* restrict thread_jc = NULL; \
	thrinfo_t* restrict thread_pc = NULL; \
	thrinfo_t* restrict thread_pa = NULL; \
	thrinfo_t* restrict thread_ic = NULL; \
	thrinfo_t* restrict thread_pb = NULL; \
	thrinfo_t* restrict thread_jr = NULL; \
\
	/* Grow the thrinfo_t tree. */ \
	bszid_t* restrict bszids_jc = bszids; \
	thread_jc = thread; \
	bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \
\
	/* Compute the JC loop thread range for the current thread. */ \
	dim_t jc_start, jc_end; \
	bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); \
	const dim_t m_local = jc_end - jc_start; \
\
	/* Compute number of primary and leftover components of the JC loop. */ \
	/*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \
	const dim_t jc_left = m_local % NC; \
\
	/* Loop over the m dimension (NC rows/columns at a time). */ \
	/*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \
	for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \
	{ \
		/* Calculate the thread's current JC block dimension. */ \
		const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \
\
		ctype* restrict a_jc = a_00 + jj * jcstep_a; \
		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
		/* Grow the thrinfo_t tree. */ \
		bszid_t* restrict bszids_pc = &bszids_jc[1]; \
		thread_pc = bli_thrinfo_sub_node( thread_jc ); \
		bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \
\
		/* Compute the PC loop thread range for the current thread. (The k
		   dimension is not divided among threads here: the range is always
		   [0,k).) */ \
		const dim_t pc_start = 0, pc_end = k; \
		const dim_t k_local = k; \
\
		/* Compute number of primary and leftover components of the PC loop. */ \
		/*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \
		const dim_t pc_left = k_local % KC; \
\
		/* Loop over the k dimension (KC rows/columns at a time). */ \
		/*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \
		for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \
		{ \
			/* Calculate the thread's current PC block dimension. */ \
			const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \
\
			ctype* restrict a_pc = a_jc + pp * pcstep_a; \
			ctype* restrict b_pc = b_00 + pp * pcstep_b; \
\
			/* Only apply beta to the first iteration of the pc loop. */ \
			ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \
\
			ctype* a_use; \
			inc_t rs_a_use, cs_a_use, ps_a_use; \
\
			/* Set the bszid_t array and thrinfo_t pointer based on whether
			   we will be packing A. If we won't be packing A, we alias to
			   the _pc variables so that code further down can unconditionally
			   reference the _pa variables. Note that *if* we will be packing
			   A, the thrinfo_t node will have already been created by a
			   previous call to bli_thrinfo_sup_grow(), since bszid values of
			   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
			   bszid that is a normal bszid_t value). */ \
			bszid_t* restrict bszids_pa; \
			if ( packa ) { bszids_pa = &bszids_pc[1]; \
			               thread_pa = bli_thrinfo_sub_node( thread_pc ); } \
			else { bszids_pa = &bszids_pc[0]; \
			       thread_pa = thread_pc; } \
\
			/* Determine the packing buffer and related parameters for matrix
			   A. (If A will not be packed, then a_use will be set to point to
			   a and the _a_use strides will be set accordingly.) Then call
			   the packm sup variant chooser, which will call the appropriate
			   implementation based on the schema deduced from the stor_id.
			   NOTE: packing matrix A in this panel-block algorithm corresponds
			   to packing matrix B in the block-panel algorithm. */ \
			PASTEMAC(ch,packm_sup_a) \
			( \
			  packa, \
			  BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to */ \
			  stor_id, /* a "panel of B". */ \
			  BLIS_NO_TRANSPOSE, \
			  NC, KC, /* This "panel of B" is (at most) NC x KC. */ \
			  nc_cur, kc_cur, MR, \
			  &one_local, \
			  a_pc, rs_a, cs_a, \
			  &a_use, &rs_a_use, &cs_a_use, \
			  &ps_a_use, \
			  cntx, \
			  rntm, \
			  &mem_a, \
			  thread_pa \
			); \
\
			/* Alias a_use so that it's clear this is our current block of
			   matrix A. */ \
			ctype* restrict a_pc_use = a_use; \
\
			/* We don't need to embed the panel stride of A within the auxinfo_t
			   object because this variant iterates through A in the jr loop,
			   which occurs here, within the macrokernel, not within the
			   millikernel. */ \
			/*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \
\
			/* Grow the thrinfo_t tree. */ \
			bszid_t* restrict bszids_ic = &bszids_pa[1]; \
			thread_ic = bli_thrinfo_sub_node( thread_pa ); \
			bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \
\
			/* Compute the IC loop thread range for the current thread. */ \
			dim_t ic_start, ic_end; \
			bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); \
			const dim_t n_local = ic_end - ic_start; \
\
			/* Compute number of primary and leftover components of the IC loop. */ \
			/*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \
			const dim_t ic_left = n_local % MC; \
\
			/* Loop over the n dimension (MC columns at a time). */ \
			/*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \
			for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \
			{ \
				/* Calculate the thread's current IC block dimension. */ \
				const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \
\
				ctype* restrict b_ic = b_pc + ii * icstep_b; \
				ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
				ctype* b_use; \
				inc_t rs_b_use, cs_b_use, ps_b_use; \
\
				/* Set the bszid_t array and thrinfo_t pointer based on whether
				   we will be packing B. If we won't be packing B, we alias to
				   the _ic variables so that code further down can unconditionally
				   reference the _pb variables. Note that *if* we will be packing
				   B, the thrinfo_t node will have already been created by a
				   previous call to bli_thrinfo_sup_grow(), since bszid values of
				   BLIS_NO_PART cause the tree to grow by two (e.g. to the next
				   bszid that is a normal bszid_t value). */ \
				bszid_t* restrict bszids_pb; \
				if ( packb ) { bszids_pb = &bszids_ic[1]; \
				               thread_pb = bli_thrinfo_sub_node( thread_ic ); } \
				else { bszids_pb = &bszids_ic[0]; \
				       thread_pb = thread_ic; } \
\
				/* Determine the packing buffer and related parameters for matrix
				   B. (If B will not be packed, then b_use will be set to point to
				   b and the _b_use strides will be set accordingly.) Then call
				   the packm sup variant chooser, which will call the appropriate
				   implementation based on the schema deduced from the stor_id.
				   NOTE: packing matrix B in this panel-block algorithm corresponds
				   to packing matrix A in the block-panel algorithm. */ \
				PASTEMAC(ch,packm_sup_b) \
				( \
				  packb, \
				  BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to */ \
				  stor_id, /* a "block of A". */ \
				  BLIS_NO_TRANSPOSE, \
				  KC, MC, /* This "block of A" is (at most) KC x MC. */ \
				  kc_cur, mc_cur, NR, \
				  &one_local, \
				  b_ic, rs_b, cs_b, \
				  &b_use, &rs_b_use, &cs_b_use, \
				  &ps_b_use, \
				  cntx, \
				  rntm, \
				  &mem_b, \
				  thread_pb \
				); \
\
				/* Alias b_use so that it's clear this is our current block of
				   matrix B. */ \
				ctype* restrict b_ic_use = b_use; \
\
				/* Embed the panel stride of B within the auxinfo_t object. The
				   millikernel will query and use this to iterate through
				   micropanels of B. */ \
				bli_auxinfo_set_ps_b( ps_b_use, &aux ); \
\
				/* Grow the thrinfo_t tree. */ \
				bszid_t* restrict bszids_jr = &bszids_pb[1]; \
				thread_jr = bli_thrinfo_sub_node( thread_pb ); \
				bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \
\
				/* Compute number of primary and leftover components of the JR loop. */ \
				dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
				dim_t jr_left = nc_cur % MR; \
\
				/* An optimization: allow the last jr iteration to contain up to MRE
				   extra rows of C and A. (A nonzero MRE, i.e. a maximum MR that is
				   larger than the default MR, is taken to mean that the mkernel has
				   agreed to handle these cases.) Note that this prevents us from
				   declaring jr_iter and jr_left as const. NOTE: We forgo this
				   optimization when packing A since packing an extended edge case
				   is not yet supported. The condition below also skips it whenever
				   more than one thread is in use (is_mt). */ \
				if ( !packa && !is_mt ) \
				if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \
				{ \
					jr_iter--; jr_left += MR; \
				} \
\
				/* Compute the JR loop thread range for the current thread. */ \
				dim_t jr_start, jr_end; \
				bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \
\
				/* Loop over the m dimension (MR rows at a time). */ \
				/*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \
				for ( dim_t j = jr_start; j < jr_end; j += 1 ) \
				{ \
					/* Despite its name, nr_cur counts rows here: it is MR, or
					   jr_left on the edge iteration. */ \
					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); \
\
					/*
					ctype* restrict a_jr = a_pc + j * jrstep_a; \
					*/ \
					ctype* restrict a_jr = a_pc_use + j * ps_a_use; \
					ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
					/*
					const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \
					const dim_t ir_left = mc_cur % NR; \
					*/ \
\
					/* The loop over the n dimension (presumably NR columns at a
					   time, per the disabled code above) is not written out
					   here; the whole mc_cur extent is handed to the
					   millikernel. */ \
					{ \
						/* Invoke the gemmsup millikernel. */ \
						gemmsup_ker \
						( \
						  conja, \
						  conjb, \
						  nr_cur, /* Notice: nr_cur <= MR. */ \
						  mc_cur, /* Recall: mc_cur partitions the n dimension! */ \
						  kc_cur, \
						  alpha_cast, \
						  a_jr, rs_a_use, cs_a_use, \
						  b_ic_use, rs_b_use, cs_b_use, \
						  beta_use, \
						  c_jr, rs_c, cs_c, \
						  &aux, \
						  cntx \
						); \
					} \
				} \
			} \
\
			/* NOTE: This barrier is only needed if we are packing A (since
			   that matrix is packed within the pc loop of this variant). */ \
			if ( packa ) bli_thread_barrier( thread_pa ); \
		} \
	} \
\
	/* Release any memory that was acquired for packing matrices A and B.
*/ \ PASTEMAC(ch,packm_sup_finalize_mem_a) \ ( \ packa, \ rntm, \ &mem_a, \ thread_pa \ ); \ PASTEMAC(ch,packm_sup_finalize_mem_b) \ ( \ packb, \ rntm, \ &mem_b, \ thread_pb \ ); \ \ /* PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \ */ \ } INSERT_GENTFUNC_BASIC0( gemmsup_ref_var1n ) // // -- var2m -------------------------------------------------------------------- // static FUNCPTR_T GENARRAY(ftypes_var2m,gemmsup_ref_var2m); void bli_gemmsup_ref_var2m ( trans_t trans, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, stor3_t eff_id, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ) { #if 0 obj_t at, bt; bli_obj_alias_to( a, &at ); bli_obj_alias_to( b, &bt ); // Induce transpositions on A and/or B if either object is marked for // transposition. We can induce "fast" transpositions since they objects // are guaranteed to not have structure or be packed. 
if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); } if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); } const num_t dt = bli_obj_dt( c ); const conj_t conja = bli_obj_conj_status( a ); const conj_t conjb = bli_obj_conj_status( b ); const dim_t m = bli_obj_length( c ); const dim_t n = bli_obj_width( c ); const dim_t k = bli_obj_width( &at ); void* restrict buf_a = bli_obj_buffer_at_off( &at ); const inc_t rs_a = bli_obj_row_stride( &at ); const inc_t cs_a = bli_obj_col_stride( &at ); void* restrict buf_b = bli_obj_buffer_at_off( &bt ); const inc_t rs_b = bli_obj_row_stride( &bt ); const inc_t cs_b = bli_obj_col_stride( &bt ); void* restrict buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); #else const num_t dt = bli_obj_dt( c ); const bool packa = bli_rntm_pack_a( rntm ); const bool packb = bli_rntm_pack_b( rntm ); const conj_t conja = bli_obj_conj_status( a ); const conj_t conjb = bli_obj_conj_status( b ); const dim_t m = bli_obj_length( c ); const dim_t n = bli_obj_width( c ); dim_t k; void* restrict buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a; inc_t cs_a; void* restrict buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b; inc_t cs_b; if ( bli_obj_has_notrans( a ) ) { k = bli_obj_width( a ); rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else // if ( bli_obj_has_trans( a ) ) { // Assign the variables with an implicit transposition. k = bli_obj_length( a ); rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else // if ( bli_obj_has_trans( b ) ) { // Assign the variables with an implicit transposition. 
rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } void* restrict buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); #endif // Index into the type combination array to extract the correct // function pointer. FUNCPTR_T f = ftypes_var2m[dt]; #if 1 // Optimize some storage/packing cases by transforming them into others. // These optimizations are expressed by changing trans and/or eff_id. bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx ); #endif if ( bli_is_notrans( trans ) ) { // Invoke the function. f ( packa, packb, conja, conjb, m, n, k, buf_alpha, buf_a, rs_a, cs_a, buf_b, rs_b, cs_b, buf_beta, buf_c, rs_c, cs_c, eff_id, cntx, rntm, thread ); } else { // Invoke the function (transposing the operation). f ( packb, // swap the pack values. packa, conjb, // swap the conj values. conja, n, // swap the m and n dimensions. m, k, buf_alpha, buf_b, cs_b, rs_b, // swap the positions of A and B. buf_a, cs_a, rs_a, // swap the strides of A and B. buf_beta, buf_c, cs_c, rs_c, // swap the strides of C. bli_stor3_trans( eff_id ), // transpose the stor3_t id. cntx, rntm, thread ); } } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t stor_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* If m or n is zero, return immediately. */ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* If k < 1 or alpha is zero, scale by beta and return. 
*/ \ if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \ { \ if ( bli_thread_am_ochief( thread ) ) \ { \ PASTEMAC(ch,scalm) \ ( \ BLIS_NO_CONJUGATE, \ 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ m, n, \ beta, \ c, rs_c, cs_c \ ); \ } \ return; \ } \ \ /* Query the context for various blocksizes. */ \ const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ \ dim_t KC; \ if ( packa && packb ) \ { \ KC = KC0; \ } \ else if ( packb ) \ { \ if ( stor_id == BLIS_RRR || \ stor_id == BLIS_CCC ) KC = KC0; \ else if ( stor_id == BLIS_RRC || \ stor_id == BLIS_CRC ) KC = KC0; \ else if ( stor_id == BLIS_RCR || \ stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ else KC = KC0; \ } \ else if ( packa ) \ { \ if ( stor_id == BLIS_RRR || \ stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; \ else if ( stor_id == BLIS_RRC || \ stor_id == BLIS_CRC ) KC = KC0; \ else if ( stor_id == BLIS_RCR || \ stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ else KC = KC0; \ } \ else /* if ( !packa && !packb ) */ \ { \ if ( stor_id == BLIS_RRR || \ stor_id == BLIS_CCC ) KC = KC0; \ else if ( stor_id == BLIS_RRC || \ stor_id == BLIS_CRC ) KC = KC0; \ else if ( m <= MR && n <= NR ) KC = KC0; \ else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \ else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \ else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ else KC = (( KC0 / 5 ) / 4 ) * 4; \ } \ \ /* Query the maximum blocksize for NR, which implies a maximum blocksize extension for the final iteration. */ \ const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \ const dim_t NRE = NRM - NR; \ \ /* Compute partitioning step values for each matrix of each loop. 
*/ \ const inc_t jcstep_c = cs_c; \ const inc_t jcstep_b = cs_b; \ \ const inc_t pcstep_a = cs_a; \ const inc_t pcstep_b = rs_b; \ \ const inc_t icstep_c = rs_c; \ const inc_t icstep_a = rs_a; \ \ const inc_t jrstep_c = cs_c * NR; \ \ /* const inc_t jrstep_b = cs_b * NR; \ ( void )jrstep_b; \ \ const inc_t irstep_c = rs_c * MR; \ const inc_t irstep_a = rs_a * MR; \ */ \ \ /* Query the context for the sup microkernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemmsup_ker_ft) \ gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ \ ctype* restrict a_00 = a; \ ctype* restrict b_00 = b; \ ctype* restrict c_00 = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ \ /* Make local copies of beta and one scalars to prevent any unnecessary sharing of cache lines between the cores' caches. */ \ ctype beta_local = *beta_cast; \ ctype one_local = *PASTEMAC(ch,1); \ \ auxinfo_t aux; \ \ /* Parse and interpret the contents of the rntm_t object to properly set the ways of parallelism for each loop. */ \ /*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \ \ /* Initialize a mem_t entry for A and B. Strictly speaking, this is only needed for the matrix we will be packing (if any), but we do it unconditionally to be safe. An alternative way of initializing the mem_t entries is: bli_mem_clear( &mem_a ); \ bli_mem_clear( &mem_b ); \ */ \ mem_t mem_a = BLIS_MEM_INITIALIZER; \ mem_t mem_b = BLIS_MEM_INITIALIZER; \ \ /* Define an array of bszid_t ids, which will act as our substitute for the cntl_t tree. 
*/ \ /* 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop */ \ bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ bszid_t* restrict bszids; \ \ /* Set the bszids pointer to the correct bszids array above based on which matrices (if any) are being packed. */ \ if ( packa ) { if ( packb ) bszids = bszids_packab; \ else bszids = bszids_packa; } \ else { if ( packb ) bszids = bszids_packb; \ else bszids = bszids_nopack; } \ \ /* Determine whether we are using more than one thread. */ \ const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \ \ thrinfo_t* restrict thread_jc = NULL; \ thrinfo_t* restrict thread_pc = NULL; \ thrinfo_t* restrict thread_pb = NULL; \ thrinfo_t* restrict thread_ic = NULL; \ thrinfo_t* restrict thread_pa = NULL; \ thrinfo_t* restrict thread_jr = NULL; \ \ /* Grow the thrinfo_t tree. */ \ bszid_t* restrict bszids_jc = bszids; \ thread_jc = thread; \ bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ \ /* Compute the JC loop thread range for the current thread. */ \ dim_t jc_start, jc_end; \ bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \ const dim_t n_local = jc_end - jc_start; \ \ /* Compute number of primary and leftover components of the JC loop. */ \ /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ const dim_t jc_left = n_local % NC; \ \ /* Loop over the n dimension (NC rows/columns at a time). */ \ /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ { \ /* Calculate the thread's current JC block dimension. */ \ const dim_t nc_cur = ( NC <= jc_end - jj ? 
NC : jc_left ); \ \ ctype* restrict b_jc = b_00 + jj * jcstep_b; \ ctype* restrict c_jc = c_00 + jj * jcstep_c; \ \ /* Grow the thrinfo_t tree. */ \ bszid_t* restrict bszids_pc = &bszids_jc[1]; \ thread_pc = bli_thrinfo_sub_node( thread_jc ); \ bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ \ /* Compute the PC loop thread range for the current thread. */ \ const dim_t pc_start = 0, pc_end = k; \ const dim_t k_local = k; \ \ /* Compute number of primary and leftover components of the PC loop. */ \ /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ const dim_t pc_left = k_local % KC; \ \ /* Loop over the k dimension (KC rows/columns at a time). */ \ /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \ for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ { \ /* Calculate the thread's current PC block dimension. */ \ const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ \ ctype* restrict a_pc = a_00 + pp * pcstep_a; \ ctype* restrict b_pc = b_jc + pp * pcstep_b; \ \ /* Only apply beta to the first iteration of the pc loop. */ \ ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \ \ ctype* b_use; \ inc_t rs_b_use, cs_b_use, ps_b_use; \ \ /* Set the bszid_t array and thrinfo_t pointer based on whether we will be packing B. If we won't be packing B, we alias to the _pc variables so that code further down can unconditionally reference the _pb variables. Note that *if* we will be packing B, the thrinfo_t node will have already been created by a previous call to bli_thrinfo_grow(), since bszid values of BLIS_NO_PART cause the tree to grow by two (e.g. to the next bszid that is a normal bszid_t value). */ \ bszid_t* restrict bszids_pb; \ if ( packb ) { bszids_pb = &bszids_pc[1]; \ thread_pb = bli_thrinfo_sub_node( thread_pc ); } \ else { bszids_pb = &bszids_pc[0]; \ thread_pb = thread_pc; } \ \ /* Determine the packing buffer and related parameters for matrix B. 
(If B will not be packed, then a_use will be set to point to b and the _b_use strides will be set accordingly.) Then call the packm sup variant chooser, which will call the appropriate implementation based on the schema deduced from the stor_id. */ \ PASTEMAC(ch,packm_sup_b) \ ( \ packb, \ BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to */ \ stor_id, /* a "panel of B." */ \ BLIS_NO_TRANSPOSE, \ KC, NC, /* This "panel of B" is (at most) KC x NC. */ \ kc_cur, nc_cur, NR, \ &one_local, \ b_pc, rs_b, cs_b, \ &b_use, &rs_b_use, &cs_b_use, \ &ps_b_use, \ cntx, \ rntm, \ &mem_b, \ thread_pb \ ); \ \ /* Alias b_use so that it's clear this is our current block of matrix B. */ \ ctype* restrict b_pc_use = b_use; \ \ /* We don't need to embed the panel stride of B within the auxinfo_t object because this variant iterates through B in the jr loop, which occurs here, within the macrokernel, not within the millikernel. */ \ /*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \ \ /* Grow the thrinfo_t tree. */ \ bszid_t* restrict bszids_ic = &bszids_pb[1]; \ thread_ic = bli_thrinfo_sub_node( thread_pb ); \ bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ \ /* Compute the IC loop thread range for the current thread. */ \ dim_t ic_start, ic_end; \ bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \ const dim_t m_local = ic_end - ic_start; \ \ /* Compute number of primary and leftover components of the IC loop. */ \ /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \ const dim_t ic_left = m_local % MC; \ \ /* Loop over the m dimension (MC rows at a time). */ \ /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \ for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ { \ /* Calculate the thread's current IC block dimension. */ \ const dim_t mc_cur = ( MC <= ic_end - ii ? 
MC : ic_left ); \ \ ctype* restrict a_ic = a_pc + ii * icstep_a; \ ctype* restrict c_ic = c_jc + ii * icstep_c; \ \ ctype* a_use; \ inc_t rs_a_use, cs_a_use, ps_a_use; \ \ /* Set the bszid_t array and thrinfo_t pointer based on whether we will be packing B. If we won't be packing A, we alias to the _ic variables so that code further down can unconditionally reference the _pa variables. Note that *if* we will be packing A, the thrinfo_t node will have already been created by a previous call to bli_thrinfo_grow(), since bszid values of BLIS_NO_PART cause the tree to grow by two (e.g. to the next bszid that is a normal bszid_t value). */ \ bszid_t* restrict bszids_pa; \ if ( packa ) { bszids_pa = &bszids_ic[1]; \ thread_pa = bli_thrinfo_sub_node( thread_ic ); } \ else { bszids_pa = &bszids_ic[0]; \ thread_pa = thread_ic; } \ \ /* Determine the packing buffer and related parameters for matrix A. (If A will not be packed, then a_use will be set to point to a and the _a_use strides will be set accordingly.) Then call the packm sup variant chooser, which will call the appropriate implementation based on the schema deduced from the stor_id. */ \ PASTEMAC(ch,packm_sup_a) \ ( \ packa, \ BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \ stor_id, /* a "block of A." */ \ BLIS_NO_TRANSPOSE, \ MC, KC, /* This "block of A" is (at most) MC x KC. */ \ mc_cur, kc_cur, MR, \ &one_local, \ a_ic, rs_a, cs_a, \ &a_use, &rs_a_use, &cs_a_use, \ &ps_a_use, \ cntx, \ rntm, \ &mem_a, \ thread_pa \ ); \ \ /* Alias a_use so that it's clear this is our current block of matrix A. */ \ ctype* restrict a_ic_use = a_use; \ \ /* Embed the panel stride of A within the auxinfo_t object. The millikernel will query and use this to iterate through micropanels of A (if needed). */ \ bli_auxinfo_set_ps_a( ps_a_use, &aux ); \ \ /* Grow the thrinfo_t tree. 
*/ \ bszid_t* restrict bszids_jr = &bszids_pa[1]; \ thread_jr = bli_thrinfo_sub_node( thread_pa ); \ bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ \ /* Compute number of primary and leftover components of the JR loop. */ \ dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ dim_t jr_left = nc_cur % NR; \ \ /* An optimization: allow the last jr iteration to contain up to NRE columns of C and B. (If NRE > NR, the mkernel has agreed to handle these cases.) Note that this prevents us from declaring jr_iter and jr_left as const. NOTE: We forgo this optimization when packing B since packing an extended edge case is not yet supported. */ \ if ( !packb && !is_mt ) \ if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \ { \ jr_iter--; jr_left += NR; \ } \ \ /* Compute the JR loop thread range for the current thread. */ \ dim_t jr_start, jr_end; \ bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \ for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ { \ const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ \ /* ctype* restrict b_jr = b_pc_use + j * jrstep_b; \ */ \ ctype* restrict b_jr = b_pc_use + j * ps_b_use; \ ctype* restrict c_jr = c_ic + j * jrstep_c; \ \ /* const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ const dim_t ir_left = mc_cur % MR; \ */ \ \ /* Loop over the m dimension (MR rows at a time). */ \ { \ /* Invoke the gemmsup millikernel. */ \ gemmsup_ker \ ( \ conja, \ conjb, \ mc_cur, \ nr_cur, \ kc_cur, \ alpha_cast, \ a_ic_use, rs_a_use, cs_a_use, \ b_jr, rs_b_use, cs_b_use, \ beta_use, \ c_jr, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ } \ } \ \ /* NOTE: This barrier is only needed if we are packing B (since that matrix is packed within the pc loop of this variant). */ \ if ( packb ) bli_thread_barrier( thread_pb ); \ } \ } \ \ /* Release any memory that was acquired for packing matrices A and B. 
*/ \ PASTEMAC(ch,packm_sup_finalize_mem_a) \ ( \ packa, \ rntm, \ &mem_a, \ thread_pa \ ); \ PASTEMAC(ch,packm_sup_finalize_mem_b) \ ( \ packb, \ rntm, \ &mem_b, \ thread_pb \ ); \ \ /* PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \ */ \ } INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2m ) cython-blis-0.9.1/blis/_src/frame/3/bli_l3_sup_vars.h000066400000000000000000000142141427272030600223330ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( 
gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. 
// BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. } } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } cython-blis-0.9.1/blis/_src/frame/3/bli_l3_tapi.c000066400000000000000000000201701427272030600214170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2021, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

//
// Define BLAS-like interfaces with typed operands (basic).
//
// Every wrapper in this file forwards its arguments unchanged to the
// function of the same name carrying the BLIS_TAPI_EX_SUF suffix and
// appends two NULL arguments, which request default cntx_t and rntm_t
// objects.
//

// gemm: basic typed wrapper.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     ) \
{ \
    /* Invoke the expert interface and request default cntx_t and rntm_t objects. */ \
    PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
    ( \
      transa, \
      transb, \
      m, n, k, \
      alpha, \
      a, rs_a, cs_a, \
      b, rs_b, cs_b, \
      beta, \
      c, rs_c, cs_c, \
      NULL, \
      NULL  \
    ); \
}

INSERT_GENTFUNC_BASIC0( gemm )

// gemmt: basic typed wrapper (takes uploc and only the m and k dimensions).
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     ) \
{ \
    /* Invoke the expert interface and request default cntx_t and rntm_t objects. */ \
    PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
    ( \
      uploc, \
      transa, \
      transb, \
      m, k, \
      alpha, \
      a, rs_a, cs_a, \
      b, rs_b, cs_b, \
      beta, \
      c, rs_c, cs_c, \
      NULL, \
      NULL  \
    ); \
}

INSERT_GENTFUNC_BASIC0( gemmt )

// hemm, symm: basic typed wrappers. The struca macro argument is accepted
// but not referenced by this wrapper body.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, struca ) \
\
void PASTEMAC(ch,opname) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       conj_t  conja, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     ) \
{ \
    /* Invoke the expert interface and request default cntx_t and rntm_t objects. */ \
    PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
    ( \
      side, \
      uploa, \
      conja, \
      transb, \
      m, n, \
      alpha, \
      a, rs_a, cs_a, \
      b, rs_b, cs_b, \
      beta, \
      c, rs_c, cs_c, \
      NULL, \
      NULL  \
    ); \
}

INSERT_GENTFUNC_BASIC( hemm, BLIS_HERMITIAN )
INSERT_GENTFUNC_BASIC( symm, BLIS_SYMMETRIC )

// herk: basic typed wrapper. Both alpha and beta use the ctype_r type.
#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       dim_t    m, \
       dim_t    k, \
       ctype_r* alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c  \
     ) \
{ \
    /* Invoke the expert interface and request default cntx_t and rntm_t objects. */ \
    PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
    ( \
      uploc, \
      transa, \
      m, k, \
      alpha, \
      a, rs_a, cs_a, \
      beta, \
      c, rs_c, cs_c, \
      NULL, \
      NULL  \
    ); \
}

INSERT_GENTFUNCR_BASIC0( herk )

// her2k: basic typed wrapper. alpha uses ctype; beta uses ctype_r.
#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       trans_t  transb, \
       dim_t    m, \
       dim_t    k, \
       ctype*   alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype*   b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c  \
     ) \
{ \
    /* Invoke the expert interface and request default cntx_t and rntm_t objects. */ \
    PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
    ( \
      uploc, \
      transa, \
      transb, \
      m, k, \
      alpha, \
      a, rs_a, cs_a, \
      b, rs_b, cs_b, \
      beta, \
      c, rs_c, cs_c, \
      NULL, \
      NULL  \
    ); \
}

INSERT_GENTFUNCR_BASIC0( her2k )

// syrk: basic typed wrapper.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     ) \
{ \
    /* Invoke the expert interface and request default cntx_t and rntm_t objects. */ \
    PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
    ( \
      uploc, \
      transa, \
      m, k, \
      alpha, \
      a, rs_a, cs_a, \
      beta, \
      c, rs_c, cs_c, \
      NULL, \
      NULL  \
    ); \
}

INSERT_GENTFUNC_BASIC0( syrk )

// syr2k: basic typed wrapper.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     ) \
{ \
    /* Invoke the expert interface and request default cntx_t and rntm_t objects. */ \
    PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
    ( \
      uploc, \
      transa, \
      transb, \
      m, k, \
      alpha, \
      a, rs_a, cs_a, \
      b, rs_b, cs_b, \
      beta, \
      c, rs_c, cs_c, \
      NULL, \
      NULL  \
    ); \
}

INSERT_GENTFUNC_BASIC0( syr2k )

// trmm3: basic typed wrapper (three-operand form with beta and c).
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     ) \
{ \
    /* Invoke the expert interface and request default cntx_t and rntm_t objects. */ \
    PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
    ( \
      side, \
      uploa, \
      transa, \
      diaga, \
      transb, \
      m, n, \
      alpha, \
      a, rs_a, cs_a, \
      b, rs_b, cs_b, \
      beta, \
      c, rs_c, cs_c, \
      NULL, \
      NULL  \
    ); \
}

INSERT_GENTFUNC_BASIC0( trmm3 )

// trmm, trsm: basic typed wrappers (no beta/c operands; only a and b are
// passed).
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b  \
     ) \
{ \
    /* Invoke the expert interface and request default cntx_t and rntm_t objects. */ \
    PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
    ( \
      side, \
      uploa, \
      transa, \
      diaga, \
      m, n, \
      alpha, \
      a, rs_a, cs_a, \
      b, rs_b, cs_b, \
      NULL, \
      NULL  \
    ); \
}

INSERT_GENTFUNC_BASIC0( trmm )
INSERT_GENTFUNC_BASIC0( trsm )

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_tapi.h000066400000000000000000000124311427272030600214250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

//
// Prototype BLAS-like interfaces with typed operands (basic).
//
// Each GENTPROT/GENTPROTR definition below is instantiated immediately by
// the INSERT_* invocation(s) that follow it, then redefined for the next
// operation.
//

// gemm
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROT_BASIC0( gemm )

// hemm, symm (identical parameter lists)
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       conj_t  conja, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )

// herk: alpha and beta are both ctype_r.
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       dim_t    m, \
       dim_t    k, \
       ctype_r* alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROTR_BASIC0( herk )

// her2k: alpha is ctype, beta is ctype_r.
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       trans_t  transb, \
       dim_t    m, \
       dim_t    k, \
       ctype*   alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype*   b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROTR_BASIC0( her2k )

// syrk
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROT_BASIC0( syrk )

// gemmt, syr2k (identical parameter lists, so one prototype serves both)
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )

// trmm3
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROT_BASIC0( trmm3 )

// trmm, trsm (identical parameter lists; no beta/c operands)
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b  \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_tapi_ex.c000066400000000000000000000354121427272030600221200ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-like interfaces with typed operands (expert). 
// #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ bli_init_once(); \ \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t m_a, n_a; \ dim_t m_b, n_b; \ \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \ \ bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ bli_obj_init_finish_1x1( dt, beta, &betao ); \ \ bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \ \ bli_obj_set_conjtrans( transa, &ao ); \ bli_obj_set_conjtrans( transb, &bo ); \ \ PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ &alphao, \ &ao, \ &bo, \ &betao, \ &co, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNC_BASIC0( gemm ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, struca ) \ \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ conj_t conja, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ bli_init_once(); \ \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t mn_a; \ dim_t m_b, n_b; \ \ bli_set_dim_with_side( side, m, n, &mn_a ); \ 
bli_set_dims_with_trans( transb, m, n, &m_b, &n_b ); \ \ bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ bli_obj_init_finish_1x1( dt, beta, &betao ); \ \ bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploa, &ao ); \ bli_obj_set_conj( conja, &ao ); \ bli_obj_set_conjtrans( transb, &bo ); \ \ bli_obj_set_struc( struca, &ao ); \ \ PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side, \ &alphao, \ &ao, \ &bo, \ &betao, \ &co, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNC_BASIC( hemm, BLIS_HERMITIAN ) INSERT_GENTFUNC_BASIC( symm, BLIS_SYMMETRIC ) #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype_r* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ bli_init_once(); \ \ const num_t dt_r = PASTEMAC(chr,type); \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t m_a, n_a; \ \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ \ bli_obj_init_finish_1x1( dt_r, alpha, &alphao ); \ bli_obj_init_finish_1x1( dt_r, beta, &betao ); \ \ bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploc, &co ); \ bli_obj_set_conjtrans( transa, &ao ); \ \ bli_obj_set_struc( BLIS_HERMITIAN, &co ); \ \ PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ &alphao, \ &ao, \ &betao, \ &co, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNCR_BASIC0( herk ) #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, 
\ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ bli_init_once(); \ \ const num_t dt_r = PASTEMAC(chr,type); \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t m_a, n_a; \ dim_t m_b, n_b; \ \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \ \ bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ bli_obj_init_finish_1x1( dt_r, beta, &betao ); \ \ bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploc, &co ); \ bli_obj_set_conjtrans( transa, &ao ); \ bli_obj_set_conjtrans( transb, &bo ); \ \ bli_obj_set_struc( BLIS_HERMITIAN, &co ); \ \ PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ &alphao, \ &ao, \ &bo, \ &betao, \ &co, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNCR_BASIC0( her2k ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ bli_init_once(); \ \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t m_a, n_a; \ \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ \ bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ bli_obj_init_finish_1x1( dt, beta, &betao ); \ \ bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, 
&ao ); \ bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploc, &co ); \ bli_obj_set_conjtrans( transa, &ao ); \ \ bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \ \ PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ &alphao, \ &ao, \ &betao, \ &co, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNC_BASIC0( syrk ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ bli_init_once(); \ \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t m_a, n_a; \ dim_t m_b, n_b; \ \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \ \ bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ bli_obj_init_finish_1x1( dt, beta, &betao ); \ \ bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploc, &co ); \ bli_obj_set_conjtrans( transa, &ao ); \ bli_obj_set_conjtrans( transb, &bo ); \ \ bli_obj_set_struc( BLIS_SYMMETRIC, &co ); \ \ PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ &alphao, \ &ao, \ &bo, \ &betao, \ &co, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNC_BASIC0( syr2k ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ 
rntm_t* rntm \ ) \ { \ bli_init_once(); \ \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t m_a, n_a; \ dim_t m_b, n_b; \ \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ bli_set_dims_with_trans( transb, k, m, &m_b, &n_b ); \ \ bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ bli_obj_init_finish_1x1( dt, beta, &betao ); \ \ bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploc, &co ); \ bli_obj_set_conjtrans( transa, &ao ); \ bli_obj_set_conjtrans( transb, &bo ); \ \ PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ &alphao, \ &ao, \ &bo, \ &betao, \ &co, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNC_BASIC0( gemmt ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ bli_init_once(); \ \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t mn_a; \ dim_t m_b, n_b; \ \ bli_set_dim_with_side( side, m, n, &mn_a ); \ bli_set_dims_with_trans( transb, m, n, &m_b, &n_b ); \ \ bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ bli_obj_init_finish_1x1( dt, beta, &betao ); \ \ bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ bli_obj_init_finish( 
dt, m, n, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploa, &ao ); \ bli_obj_set_diag( diaga, &ao ); \ bli_obj_set_conjtrans( transa, &ao ); \ bli_obj_set_conjtrans( transb, &bo ); \ \ bli_obj_set_struc( BLIS_TRIANGULAR, &ao ); \ \ PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side, \ &alphao, \ &ao, \ &bo, \ &betao, \ &co, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNC_BASIC0( trmm3 ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ bli_init_once(); \ \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ \ dim_t mn_a; \ \ bli_set_dim_with_side( side, m, n, &mn_a ); \ \ bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ \ bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m, n, b, rs_b, cs_b, &bo ); \ \ bli_obj_set_uplo( uploa, &ao ); \ bli_obj_set_diag( diaga, &ao ); \ bli_obj_set_conjtrans( transa, &ao ); \ \ bli_obj_set_struc( BLIS_TRIANGULAR, &ao ); \ \ PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side, \ &alphao, \ &ao, \ &bo, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNC_BASIC0( trmm ) INSERT_GENTFUNC_BASIC0( trsm ) cython-blis-0.9.1/blis/_src/frame/3/bli_l3_tapi_ex.h000066400000000000000000000134361427272030600221270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-like interfaces with typed operands (expert). 
//

// Each GENTPROT/GENTPROTR definition below is a prototype template. The
// function name is formed by PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF), and every
// template ends with the same two trailing parameters: a cntx_t* and a
// rntm_t*. The INSERT_GENTPROT*_BASIC0( op ) line that follows a template
// expands it for the named operation.
// NOTE(review): presumably one prototype is emitted per datatype character
// `ch`; confirm against the INSERT_GENTPROT_BASIC0 definition.

// Template for gemm: trans flags for a and b; dimensions m, n, k; scalars
// alpha and beta; operands a, b, c, each followed by two inc_t values
// (rs_*, cs_*).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemm )

// Template shared by hemm and symm: side and uplo/conj flags for a, a trans
// flag for b, dimensions m and n, scalars alpha and beta, operands a, b, c.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       conj_t conja, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )

// Template for herk. This is a GENTPROTR template: it takes a second type
// parameter, ctype_r, and both alpha and beta are ctype_r* while a and c are
// ctype*.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype_r* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( herk )

// Template for her2k. Also a GENTPROTR template, but here only beta is
// ctype_r*; alpha is ctype*.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( her2k )

// Template for syrk: same parameter list as herk, except that alpha and beta
// are ctype*.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( syrk )

// Template shared by gemmt and syr2k: same parameter list as her2k, except
// that beta is ctype*.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )

// Template for trmm3: side/uplo/trans/diag flags for a, a trans flag for b,
// dimensions m and n, scalars alpha and beta, operands a, b, c.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm3 )

// Template shared by trmm and trsm: like trmm3 but without transb, beta, or
// a separate c operand; only a and b are passed.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_thrinfo.c000066400000000000000000000546211427272030600221430ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "assert.h" void bli_l3_thrinfo_init_single ( thrinfo_t* thread ) { bli_thrinfo_init_single( thread ); } void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ) { bli_thrinfo_free( rntm, thread ); } void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ) { bli_thrinfo_free( rntm, thread ); } // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ) { // Query the global communicator for the total number of threads to use. dim_t n_threads = bli_thrcomm_num_threads( gl_comm ); // Use the thread id passed in as the global communicator id. 
dim_t gl_comm_id = id; // Use the blocksize id of the current (root) control tree node to // query the top-most ways of parallelism to obtain. bszid_t bszid = bli_cntl_bszid( cntl ); dim_t xx_way = bli_rntm_ways_for( bszid, rntm ); // Determine the work id for this thrinfo_t node. dim_t work_id = gl_comm_id / ( n_threads / xx_way ); // Create the root thrinfo_t node. *thread = bli_thrinfo_create ( rntm, gl_comm, gl_comm_id, xx_way, work_id, TRUE, bszid, NULL ); } // ----------------------------------------------------------------------------- void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ) { // Query the global communicator for the total number of threads to use. dim_t n_threads = bli_thrcomm_num_threads( gl_comm ); // Use the thread id passed in as the global communicator id. dim_t gl_comm_id = id; // Use the BLIS_NC blocksize id to query the top-most ways of parallelism // to obtain. Note that hard-coding BLIS_NC like this is a little bit of a // hack, but it works fine since both of the sup algorithms (bp and pb) use // the cache blocksizes down to the 3rd loop. (See the definitions of // bli_rntm_calc_num_threads_bp() and bli_rntm_calc_num_threads_pb() for // a concise enumeration of these bszid_t ids.) const bszid_t bszid = BLIS_NC; dim_t xx_way = bli_rntm_ways_for( BLIS_NC, rntm ); // Determine the work id for this thrinfo_t node. dim_t work_id = gl_comm_id / ( n_threads / xx_way ); // Create the root thrinfo_t node. *thread = bli_thrinfo_create ( rntm, gl_comm, gl_comm_id, xx_way, work_id, TRUE, bszid, NULL ); } // ----------------------------------------------------------------------------- void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ) { // Query the current root for the total number of threads to use. const dim_t n_threads = bli_thread_num_threads( thread ); // Query the current root for the (global) comm id. 
const dim_t gl_comm_id = bli_thread_ocomm_id( thread ); // Query the rntm_t for the updated number of ways of parallelism. const dim_t xx_way = bli_rntm_ways_for( BLIS_NC, rntm ); // Recompute the work id for this thrinfo_t node using the updated // number of ways of parallelism. dim_t work_id = gl_comm_id / ( n_threads / xx_way ); // Save the updated ways of parallelism and work id to the thrinfo_t node. bli_thrinfo_set_n_way( xx_way, thread ); bli_thrinfo_set_work_id( work_id, thread ); } // ----------------------------------------------------------------------------- void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ) { // In order to query the number of threads, we query the only thread we // know exists: thread 0. dim_t n_threads = bli_thread_num_threads( threads[0] ); // For the purposes of printing the "header" information that is common // to the various instances of a thrinfo_t (ie: across all threads), we // choose the last thread in case the problem is so small that there is // only an "edge" case, which will always be assigned to the last thread // (at least for higher levels of partitioning). thrinfo_t* jc_info = threads[n_threads-1]; thrinfo_t* pc_info = NULL; thrinfo_t* pb_info = NULL; thrinfo_t* ic_info = NULL; thrinfo_t* pa_info = NULL; thrinfo_t* jr_info = NULL; thrinfo_t* ir_info = NULL; // Initialize the n_ways and n_threads fields of each thrinfo_t "level" // to -1. More than likely, these will all be overwritten with meaningful // values, but in case some thrinfo_t trees are not fully built (see // next commnet), these will be the placeholder values. dim_t jc_way = -1, pc_way = -1, pb_way = -1, ic_way = -1, pa_way = -1, jr_way = -1, ir_way = -1; dim_t jc_nt = -1, pc_nt = -1, pb_nt = -1, ic_nt = -1, pa_nt = -1, jr_nt = -1, ir_nt = -1; // NOTE: We must check each thrinfo_t pointer for NULLness. 
Certain threads // may not fully build their thrinfo_t structures--specifically when the // dimension being parallelized is not large enough for each thread to have // even one unit of work (where as unit is usually a single micropanel's // width, MR or NR). if ( !jc_info ) goto print_header; jc_way = bli_thread_n_way( jc_info ); jc_nt = bli_thread_num_threads( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); if ( !pc_info ) goto print_header; pc_way = bli_thread_n_way( pc_info ); pc_nt = bli_thread_num_threads( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); if ( !pb_info ) goto print_header; pb_way = bli_thread_n_way( pb_info ); pb_nt = bli_thread_num_threads( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); if ( !ic_info ) goto print_header; ic_way = bli_thread_n_way( ic_info ); ic_nt = bli_thread_num_threads( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); if ( !pa_info ) goto print_header; pa_way = bli_thread_n_way( pa_info ); pa_nt = bli_thread_num_threads( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); if ( !jr_info ) goto print_header; jr_way = bli_thread_n_way( jr_info ); jr_nt = bli_thread_num_threads( jr_info ); ir_info = bli_thrinfo_sub_node( jr_info ); if ( !ir_info ) goto print_header; ir_way = bli_thread_n_way( ir_info ); ir_nt = bli_thread_num_threads( ir_info ); print_header: printf( " jc kc pb ic pa jr ir\n" ); printf( "xx_nt: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n", ( unsigned long )jc_nt, ( unsigned long )pc_nt, ( unsigned long )pb_nt, ( unsigned long )ic_nt, ( unsigned long )pa_nt, ( unsigned long )jr_nt, ( unsigned long )ir_nt ); printf( "xx_way: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n", ( unsigned long )jc_way, ( unsigned long )pc_way, ( unsigned long )pb_way, ( unsigned long )ic_way, ( unsigned long )pa_way, ( unsigned long )jr_way, ( unsigned long )ir_way ); printf( "============================================\n" ); for ( dim_t gl_id = 0; gl_id < n_threads; ++gl_id ) { jc_info = threads[gl_id]; dim_t jc_comm_id = 
-1, pc_comm_id = -1, pb_comm_id = -1, ic_comm_id = -1, pa_comm_id = -1, jr_comm_id = -1, ir_comm_id = -1; dim_t jc_work_id = -1, pc_work_id = -1, pb_work_id = -1, ic_work_id = -1, pa_work_id = -1, jr_work_id = -1, ir_work_id = -1; if ( !jc_info ) goto print_thrinfo; jc_comm_id = bli_thread_ocomm_id( jc_info ); jc_work_id = bli_thread_work_id( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); if ( !pc_info ) goto print_thrinfo; pc_comm_id = bli_thread_ocomm_id( pc_info ); pc_work_id = bli_thread_work_id( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); if ( !pb_info ) goto print_thrinfo; pb_comm_id = bli_thread_ocomm_id( pb_info ); pb_work_id = bli_thread_work_id( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); if ( !ic_info ) goto print_thrinfo; ic_comm_id = bli_thread_ocomm_id( ic_info ); ic_work_id = bli_thread_work_id( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); if ( !pa_info ) goto print_thrinfo; pa_comm_id = bli_thread_ocomm_id( pa_info ); pa_work_id = bli_thread_work_id( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); if ( !jr_info ) goto print_thrinfo; jr_comm_id = bli_thread_ocomm_id( jr_info ); jr_work_id = bli_thread_work_id( jr_info ); ir_info = bli_thrinfo_sub_node( jr_info ); if ( !ir_info ) goto print_thrinfo; ir_comm_id = bli_thread_ocomm_id( ir_info ); ir_work_id = bli_thread_work_id( ir_info ); print_thrinfo: printf( "comm ids: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n", ( long )jc_comm_id, ( long )pc_comm_id, ( long )pb_comm_id, ( long )ic_comm_id, ( long )pa_comm_id, ( long )jr_comm_id, ( long )ir_comm_id ); printf( "work ids: %4ld %4ld %4ld %4ld %4ld %4ld %4ld\n", ( long )jc_work_id, ( long )pc_work_id, ( long )pb_work_id, ( long )ic_work_id, ( long )pa_work_id, ( long )jr_work_id, ( long )ir_work_id ); printf( "--------------------------------------------\n" ); } } // ----------------------------------------------------------------------------- // 
----------------------------------------------------------------------------- // ----------------------------------------------------------------------------- void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ) { // In order to query the number of threads, we query the only thread we // know exists: thread 0. dim_t n_threads = bli_thread_num_threads( threads[0] ); // For the purposes of printing the "header" information that is common // to the various instances of a thrinfo_t (ie: across all threads), we // choose the last thread in case the problem is so small that there is // only an "edge" case, which will always be assigned to the last thread // (at least for higher levels of partitioning). thrinfo_t* jc_info = threads[n_threads-1]; thrinfo_t* pc_info = NULL; thrinfo_t* pb_info = NULL; thrinfo_t* ic_info = NULL; thrinfo_t* pa_info = NULL; thrinfo_t* pa_info0 = NULL; thrinfo_t* jr_info = NULL; thrinfo_t* jr_info0 = NULL; thrinfo_t* ir_info = NULL; thrinfo_t* ir_info0 = NULL; // Initialize the n_ways and n_threads fields of each thrinfo_t "level" // to -1. More than likely, these will all be overwritten with meaningful // values, but in case some thrinfo_t trees are not fully built (see // next commnet), these will be the placeholder values. dim_t jc_way = -1, pc_way = -1, pb_way = -1, ic_way = -1, pa_way = -1, jr_way = -1, ir_way = -1, pa_way0 = -1, jr_way0 = -1, ir_way0 = -1; dim_t jc_nt = -1, pc_nt = -1, pb_nt = -1, ic_nt = -1, pa_nt = -1, jr_nt = -1, ir_nt = -1, pa_nt0 = -1, jr_nt0 = -1, ir_nt0 = -1; // NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads // may not fully build their thrinfo_t structures--specifically when the // dimension being parallelized is not large enough for each thread to have // even one unit of work (where as unit is usually a single micropanel's // width, MR or NR). 
if ( !jc_info ) goto print_header; jc_way = bli_thread_n_way( jc_info ); jc_nt = bli_thread_num_threads( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); if ( !pc_info ) goto print_header; pc_way = bli_thread_n_way( pc_info ); pc_nt = bli_thread_num_threads( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); if ( !pb_info ) goto print_header; pb_way = bli_thread_n_way( pb_info ); pb_nt = bli_thread_num_threads( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); if ( !ic_info ) goto print_header; ic_way = bli_thread_n_way( ic_info ); ic_nt = bli_thread_num_threads( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); pa_info0 = bli_thrinfo_sub_prenode( ic_info ); // check_header_prenode: if ( !pa_info0 ) goto check_header_node; pa_way0 = bli_thread_n_way( pa_info0 ); pa_nt0 = bli_thread_num_threads( pa_info0 ); jr_info0 = bli_thrinfo_sub_node( pa_info0 ); if ( !jr_info0 ) goto check_header_node; jr_way0 = bli_thread_n_way( jr_info0 ); jr_nt0 = bli_thread_num_threads( jr_info0 ); ir_info0 = bli_thrinfo_sub_node( jr_info0 ); if ( !ir_info0 ) goto check_header_node; ir_way0 = bli_thread_n_way( ir_info0 ); ir_nt0 = bli_thread_num_threads( ir_info0 ); check_header_node: if ( !pa_info ) goto print_header; pa_way = bli_thread_n_way( pa_info ); pa_nt = bli_thread_num_threads( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); if ( !jr_info ) goto print_header; jr_way = bli_thread_n_way( jr_info ); jr_nt = bli_thread_num_threads( jr_info ); ir_info = bli_thrinfo_sub_node( jr_info ); if ( !ir_info ) goto print_header; ir_way = bli_thread_n_way( ir_info ); ir_nt = bli_thread_num_threads( ir_info ); print_header: printf( " jc kc pb ic pa jr ir\n" ); printf( "xx_nt: %4ld %4ld %4ld %4ld %2ld|%2ld %2ld|%2ld %2ld|%2ld\n", ( long )jc_nt, ( long )pc_nt, ( long )pb_nt, ( long )ic_nt, ( long )pa_nt0, ( long )pa_nt, ( long )jr_nt0, ( long )jr_nt, ( long )ir_nt0, ( long )ir_nt ); printf( "xx_way: %4ld %4ld %4ld %4ld %2ld|%2ld %2ld|%2ld %2ld|%2ld\n", ( long )jc_way, ( long 
)pc_way, ( long )pb_way, ( long )ic_way, ( long )pa_way0, ( long )pa_way, ( long )jr_way0, ( long )jr_way, ( long )ir_way0, ( long )ir_way ); printf( "==================================================\n" ); for ( dim_t gl_id = 0; gl_id < n_threads; ++gl_id ) { jc_info = threads[gl_id]; #if 1 // NOTE: This cpp branch contains code that is safe to execute // for small problems that are parallelized enough that one or // more threads gets no work. dim_t jc_comm_id = -1, pc_comm_id = -1, pb_comm_id = -1, ic_comm_id = -1, pa_comm_id = -1, jr_comm_id = -1, ir_comm_id = -1, pa_comm_id0 = -1, jr_comm_id0 = -1, ir_comm_id0 = -1; dim_t jc_work_id = -1, pc_work_id = -1, pb_work_id = -1, ic_work_id = -1, pa_work_id = -1, jr_work_id = -1, ir_work_id = -1, pa_work_id0 = -1, jr_work_id0 = -1, ir_work_id0 = -1; if ( !jc_info ) goto print_thrinfo; jc_comm_id = bli_thread_ocomm_id( jc_info ); jc_work_id = bli_thread_work_id( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); if ( !pc_info ) goto print_thrinfo; pc_comm_id = bli_thread_ocomm_id( pc_info ); pc_work_id = bli_thread_work_id( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); if ( !pb_info ) goto print_thrinfo; pb_comm_id = bli_thread_ocomm_id( pb_info ); pb_work_id = bli_thread_work_id( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); if ( !ic_info ) goto print_thrinfo; ic_comm_id = bli_thread_ocomm_id( ic_info ); ic_work_id = bli_thread_work_id( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); pa_info0 = bli_thrinfo_sub_prenode( ic_info ); // check_thrinfo_prenode: if ( !pa_info0 ) goto check_thrinfo_node; pa_comm_id0 = bli_thread_ocomm_id( pa_info0 ); pa_work_id0 = bli_thread_work_id( pa_info0 ); jr_info0 = bli_thrinfo_sub_node( pa_info0 ); if ( !jr_info0 ) goto check_thrinfo_node; jr_comm_id0 = bli_thread_ocomm_id( jr_info0 ); jr_work_id0 = bli_thread_work_id( jr_info0 ); ir_info0 = bli_thrinfo_sub_node( jr_info0 ); if ( !ir_info0 ) goto check_thrinfo_node; ir_comm_id0 = bli_thread_ocomm_id( ir_info0 
); ir_work_id0 = bli_thread_work_id( ir_info0 ); check_thrinfo_node: if ( !pa_info ) goto print_thrinfo; pa_comm_id = bli_thread_ocomm_id( pa_info ); pa_work_id = bli_thread_work_id( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); if ( !jr_info ) goto print_thrinfo; jr_comm_id = bli_thread_ocomm_id( jr_info ); jr_work_id = bli_thread_work_id( jr_info ); ir_info = bli_thrinfo_sub_node( jr_info ); if ( !ir_info ) goto print_thrinfo; ir_comm_id = bli_thread_ocomm_id( ir_info ); ir_work_id = bli_thread_work_id( ir_info ); print_thrinfo: #else dim_t jc_comm_id; dim_t pc_comm_id; dim_t pb_comm_id; dim_t ic_comm_id; dim_t pa_comm_id0, pa_comm_id; dim_t jr_comm_id0, jr_comm_id; dim_t ir_comm_id0, ir_comm_id; dim_t jc_work_id; dim_t pc_work_id; dim_t pb_work_id; dim_t ic_work_id; dim_t pa_work_id0, pa_work_id; dim_t jr_work_id0, jr_work_id; dim_t ir_work_id0, ir_work_id; // NOTE: We must check each thrinfo_t pointer for NULLness. Certain threads // may not fully build their thrinfo_t structures--specifically when the // dimension being parallelized is not large enough for each thread to have // even one unit of work (where as unit is usually a single micropanel's // width, MR or NR). 
if ( !jc_info ) { jc_comm_id = pc_comm_id = pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1; jc_work_id = pc_work_id = pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1; } else { jc_comm_id = bli_thread_ocomm_id( jc_info ); jc_work_id = bli_thread_work_id( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); if ( !pc_info ) { pc_comm_id = pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1; pc_work_id = pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1; } else { pc_comm_id = bli_thread_ocomm_id( pc_info ); pc_work_id = bli_thread_work_id( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); if ( !pb_info ) { pb_comm_id = ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1; pb_work_id = ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1; } else { pb_comm_id = bli_thread_ocomm_id( pb_info ); pb_work_id = bli_thread_work_id( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); if ( !ic_info ) { ic_comm_id = pa_comm_id = jr_comm_id = ir_comm_id = -1; ic_work_id = pa_work_id = jr_work_id = ir_work_id = -1; } else { ic_comm_id = bli_thread_ocomm_id( ic_info ); ic_work_id = bli_thread_work_id( ic_info ); pa_info0 = bli_thrinfo_sub_prenode( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); // Prenode if ( !pa_info0 ) { pa_comm_id0 = jr_comm_id0 = ir_comm_id0 = -1; pa_work_id0 = jr_work_id0 = ir_work_id0 = -1; } else { pa_comm_id0 = bli_thread_ocomm_id( pa_info0 ); pa_work_id0 = bli_thread_work_id( pa_info0 ); jr_info0 = bli_thrinfo_sub_node( pa_info0 ); if ( !jr_info0 ) { jr_comm_id0 = ir_comm_id0 = -1; jr_work_id0 = ir_work_id0 = -1; } else { jr_comm_id0 = bli_thread_ocomm_id( jr_info0 ); jr_work_id0 = bli_thread_work_id( jr_info0 ); ir_info0 = bli_thrinfo_sub_node( jr_info0 ); if ( !ir_info0 ) { ir_comm_id0 = -1; ir_work_id0 = -1; } else { ir_comm_id0 = bli_thread_ocomm_id( ir_info0 ); ir_work_id0 = bli_thread_work_id( ir_info0 ); } } } // Main node if ( !pa_info ) { pa_comm_id = 
jr_comm_id = ir_comm_id = -1; pa_work_id = jr_work_id = ir_work_id = -1; } else { pa_comm_id = bli_thread_ocomm_id( pa_info ); pa_work_id = bli_thread_work_id( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); if ( !jr_info ) { jr_comm_id = ir_comm_id = -1; jr_work_id = ir_work_id = -1; } else { jr_comm_id = bli_thread_ocomm_id( jr_info ); jr_work_id = bli_thread_work_id( jr_info ); ir_info = bli_thrinfo_sub_node( jr_info ); if ( !ir_info ) { ir_comm_id = -1; ir_work_id = -1; } else { ir_comm_id = bli_thread_ocomm_id( ir_info ); ir_work_id = bli_thread_work_id( ir_info ); } } } } } } } #endif printf( "comm ids: %4ld %4ld %4ld %4ld %2ld|%2ld %2ld|%2ld %2ld|%2ld\n", ( long )jc_comm_id, ( long )pc_comm_id, ( long )pb_comm_id, ( long )ic_comm_id, ( long )pa_comm_id0, ( long )pa_comm_id, ( long )jr_comm_id0, ( long )jr_comm_id, ( long )ir_comm_id0, ( long )ir_comm_id ); printf( "work ids: %4ld %4ld %4ld %4ld %2ld|%2ld %2ld|%2ld %2ld|%2ld\n", ( long )jc_work_id, ( long )pc_work_id, ( long )pb_work_id, ( long )ic_work_id, ( long )pa_work_id0, ( long )pa_work_id, ( long )jr_work_id0, ( long )jr_work_id, ( long )ir_work_id0, ( long )ir_work_id ); printf( "--------------------------------------------------\n" ); } } // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ) { dim_t n_threads = bli_thread_num_threads( threads[0] ); dim_t i; for ( i = 0; i < n_threads; ++i ) bli_l3_thrinfo_free( rntm, threads[i] ); bli_free_intl( threads ); } cython-blis-0.9.1/blis/_src/frame/3/bli_l3_thrinfo.h000066400000000000000000000102771427272030600221470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); cython-blis-0.9.1/blis/_src/frame/3/bli_l3_ukr.h000066400000000000000000000044561427272030600213010ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
#undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) cython-blis-0.9.1/blis/_src/frame/3/bli_l3_ukr_fpa.c000066400000000000000000000041731427272030600221160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
   IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

//
// Define function pointer query interfaces.
//

// GENFRONT( tname, opname ) emits two things for each instantiation:
//   1. a GENARRAY_FPA() invocation for 'opname' whose element type is the
//      pasted name <tname>_ukr_vft, and
//   2. a query function, named by PASTEMAC(opname,_qfp), that takes a
//      num_t datatype and returns the element of the array named by
//      PASTECH(opname,_fpa) at index dt.
// NOTE(review): dt is used directly as the array index; no range check is
// performed here, so callers are presumably expected to pass a valid
// datatype -- confirm against GENARRAY_FPA.
#undef  GENFRONT
#define GENFRONT( tname, opname ) \
\
GENARRAY_FPA( PASTECH2(tname,_ukr,_vft), \
              opname ); \
\
PASTECH2(tname,_ukr,_vft) \
PASTEMAC(opname,_qfp)( num_t dt ) \
{ \
    return PASTECH(opname,_fpa)[ dt ]; \
}

// One query function per level-3 micro-kernel wrapper. The lower and upper
// variants share a tname and therefore a function-pointer type.
GENFRONT( gemm,     gemm_ukernel )
GENFRONT( gemmtrsm, gemmtrsm_l_ukernel )
GENFRONT( gemmtrsm, gemmtrsm_u_ukernel )
GENFRONT( trsm,     trsm_l_ukernel )
GENFRONT( trsm,     trsm_u_ukernel )

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_ukr_fpa.h000066400000000000000000000037571427272030600221320ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

//
// Prototype function pointer query interface.
//

// GENPROT( tname, opname ) declares the query function named by
// PASTEMAC(opname,_qfp): it takes a num_t datatype and returns a value of
// the pasted type <tname>_ukr_vft.
#undef  GENPROT
#define GENPROT( tname, opname ) \
\
PASTECH2(tname,_ukr,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

// One prototype per (tname, opname) pair instantiated below.
GENPROT( gemm,     gemm_ukernel )
GENPROT( gemmtrsm, gemmtrsm_l_ukernel )
GENPROT( gemmtrsm, gemmtrsm_u_ukernel )
GENPROT( trsm,     trsm_l_ukernel )
GENPROT( trsm,     trsm_u_ukernel )

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_ukr_oapi.c000066400000000000000000000155361427272030600223050ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENFRONT #define GENFRONT( tname, opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ) \ { \ bli_init_once(); \ \ num_t dt = bli_obj_dt( c ); \ \ dim_t m = bli_obj_length( c ); \ dim_t n = bli_obj_width( c ); \ dim_t k = bli_obj_width( a ); \ void* buf_a = bli_obj_buffer_at_off( a ); \ void* buf_b = bli_obj_buffer_at_off( b ); \ void* buf_c = bli_obj_buffer_at_off( c ); \ inc_t rs_c = bli_obj_row_stride( c ); \ inc_t cs_c = bli_obj_col_stride( c ); \ void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \ void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); \ \ auxinfo_t data; \ \ /* Fill the auxinfo_t struct in case the micro-kernel uses it. */ \ bli_auxinfo_set_next_a( buf_a, &data ); \ bli_auxinfo_set_next_b( buf_b, &data ); \ bli_auxinfo_set_is_a( 1, &data ); \ bli_auxinfo_set_is_b( 1, &data ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(tname,_ukr,_vft) f = \ PASTEMAC(opname,_qfp)( dt ); \ \ f \ ( \ m, \ n, \ k, \ buf_alpha, \ buf_a, \ buf_b, \ buf_beta, \ buf_c, rs_c, cs_c, \ &data, \ cntx \ ); \ } \ GENFRONT( gemm, gemm_ukernel ) #undef GENFRONT #define GENFRONT( tname, opname, opnamel, opnameu ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ) \ { \ bli_init_once(); \ \ num_t dt = bli_obj_dt( c11 ); \ \ dim_t m = bli_obj_length( c11 ); \ dim_t n = bli_obj_width( c11 ); \ dim_t k = bli_obj_width( a1x ); \ void* buf_a1x = bli_obj_buffer_at_off( a1x ); \ void* buf_a11 = bli_obj_buffer_at_off( a11 ); \ void* buf_bx1 = bli_obj_buffer_at_off( bx1 ); \ void* buf_b11 = bli_obj_buffer_at_off( b11 ); \ void* buf_c11 = bli_obj_buffer_at_off( c11 ); \ inc_t rs_c = bli_obj_row_stride( c11 ); \ inc_t cs_c = bli_obj_col_stride( c11 ); \ void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); \ \ auxinfo_t data; \ \ /* Fill the auxinfo_t struct in case the micro-kernel uses it. */ \ if ( bli_obj_is_lower( a11 ) ) \ { bli_auxinfo_set_next_a( buf_a1x, &data ); } \ else /* if ( bli_obj_is_upper( a11 ) ) */ \ { bli_auxinfo_set_next_a( buf_a11, &data ); } \ bli_auxinfo_set_next_b( buf_bx1, &data ); \ \ /* Invoke the void pointer-based function for the given datatype. */ \ if ( bli_obj_is_lower( a11 ) ) \ { \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(tname,_ukr,_vft) f = \ PASTEMAC(opnamel,_qfp)( dt ); \ \ f \ ( \ m, \ n, \ k, \ buf_alpha, \ buf_a1x, \ buf_a11, \ buf_bx1, \ buf_b11, \ buf_c11, rs_c, cs_c, \ &data, \ cntx \ ); \ } \ else /* if ( bli_obj_is_upper( a11 ) ) */ \ { \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(tname,_ukr,_vft) f = \ PASTEMAC(opnameu,_qfp)( dt ); \ \ f \ ( \ m, \ n, \ k, \ buf_alpha, \ buf_a1x, \ buf_a11, \ buf_bx1, \ buf_b11, \ buf_c11, rs_c, cs_c, \ &data, \ cntx \ ); \ } \ } \ GENFRONT( gemmtrsm, gemmtrsm_ukernel, gemmtrsm_l_ukernel, gemmtrsm_u_ukernel ) #undef GENFRONT #define GENFRONT( tname, opname, opnamel, opnameu ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ) \ { \ bli_init_once(); \ \ num_t dt = bli_obj_dt( c ); \ \ void* buf_a = bli_obj_buffer_at_off( a ); \ void* buf_b = bli_obj_buffer_at_off( b ); \ void* buf_c = bli_obj_buffer_at_off( c ); \ inc_t rs_c = bli_obj_row_stride( c ); \ inc_t cs_c = bli_obj_col_stride( c ); \ \ auxinfo_t data; \ \ /* Fill the auxinfo_t struct in case the micro-kernel uses it. */ \ bli_auxinfo_set_next_a( buf_a, &data ); \ bli_auxinfo_set_next_b( buf_b, &data ); \ bli_auxinfo_set_is_a( 1, &data ); \ bli_auxinfo_set_is_b( 1, &data ); \ \ /* Invoke the void pointer-based function for the given datatype. */ \ if ( bli_obj_is_lower( a ) ) \ { \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(tname,_ukr,_vft) f = \ PASTEMAC(opnamel,_qfp)( dt ); \ \ f \ ( \ buf_a, \ buf_b, \ buf_c, rs_c, cs_c, \ &data, \ cntx \ ); \ } \ else /* if ( bli_obj_is_upper( a ) ) */ \ { \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(tname,_ukr,_vft) f = \ PASTEMAC(opnameu,_qfp)( dt ); \ \ f \ ( \ buf_a, \ buf_b, \ buf_c, rs_c, cs_c, \ &data, \ cntx \ ); \ } \ } \ GENFRONT( trsm, trsm_ukernel, trsm_l_ukernel, trsm_u_ukernel ) cython-blis-0.9.1/blis/_src/frame/3/bli_l3_ukr_oapi.h000066400000000000000000000046701427272030600223070ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) cython-blis-0.9.1/blis/_src/frame/3/bli_l3_ukr_prot.h000066400000000000000000000057661427272030600223520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
   IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

//
// Define template prototypes for level-3 micro-kernels.
//

// Single-type convenience form: forwards to GEMM_UKR_PROT2 with 'ctype'
// used as both the input and the output type.
#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)

// Two-type gemm micro-kernel prototype: a and b are typed ctype_in, while
// alpha, beta and c are typed ctype_out. All pointer parameters are
// restrict-qualified.
#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype_out* restrict alpha, \
       ctype_in*  restrict a, \
       ctype_in*  restrict b, \
       ctype_out* restrict beta, \
       ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );


// gemmtrsm micro-kernel prototype: a single element type for all operands,
// no beta, and five matrix pointers (a1x, a11, bx1, b11, c11).
#define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a1x, \
       ctype*     restrict a11, \
       ctype*     restrict bx1, \
       ctype*     restrict b11, \
       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );


// trsm micro-kernel prototype: no dimensions or scalars are passed; only
// the three matrix pointers, the strides of c, and the auxinfo/context.
#define TRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

cython-blis-0.9.1/blis/_src/frame/3/bli_l3_ukr_tapi.c000066400000000000000000000107631427272030600223070ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, tname, kerid ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ bli_init_once(); \ \ const num_t dt = PASTEMAC(ch,type); \ \ /* Query the context for the function address of the current datatype's micro-kernel. 
*/ \ PASTECH2(ch,tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the typed function for the given datatype. */ \ f \ ( \ m, \ n, \ k, \ alpha, \ a, \ b, \ beta, \ c, rs_c, cs_c, \ data, \ cntx \ ); \ } \ INSERT_GENTFUNC_BASIC2( gemm_ukernel, gemm, BLIS_GEMM_UKR ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, tname, kerid ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ ctype* restrict a11, \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ bli_init_once(); \ \ const num_t dt = PASTEMAC(ch,type); \ \ /* Query the context for the function address of the current datatype's micro-kernel. */ \ PASTECH2(ch,tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the typed function for the given datatype. */ \ f \ ( \ m, \ n, \ k, \ alpha, \ a1x, \ a11, \ bx1, \ b11, \ c11, rs_c, cs_c, \ data, \ cntx \ ); \ } \ INSERT_GENTFUNC_BASIC2( gemmtrsm_l_ukernel, gemmtrsm, BLIS_GEMMTRSM_L_UKR ) INSERT_GENTFUNC_BASIC2( gemmtrsm_u_ukernel, gemmtrsm, BLIS_GEMMTRSM_U_UKR ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, tname, kerid ) \ \ void PASTEMAC(ch,opname) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ bli_init_once(); \ \ const num_t dt = PASTEMAC(ch,type); \ \ /* Query the context for the function address of the current datatype's micro-kernel. */ \ PASTECH2(ch,tname,_ukr_ft) f = bli_cntx_get_l3_vir_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the typed function for the given datatype. 
*/ \ f \ ( \ a, \ b, \ c, rs_c, cs_c, \ data, \ cntx \ ); \ } \ INSERT_GENTFUNC_BASIC2( trsm_l_ukernel, trsm, BLIS_TRSM_L_UKR ) INSERT_GENTFUNC_BASIC2( trsm_u_ukernel, trsm, BLIS_TRSM_U_UKR ) cython-blis-0.9.1/blis/_src/frame/3/bli_l3_ukr_tapi.h000066400000000000000000000041651427272030600223130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. #include "bli_l3_ukr.h" cython-blis-0.9.1/blis/_src/frame/3/gemm/000077500000000000000000000000001427272030600200175ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/3/gemm/bli_gemm.h000066400000000000000000000035211427272030600217440ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_gemm_cntl.h" #include "bli_gemm_front.h" #include "bli_gemm_var.h" #include "bli_gemm_ind_opt.h" // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD #include "bli_gemm_md.h" #endif cython-blis-0.9.1/blis/_src/frame/3/gemm/bli_gemm_blk_var1.c000066400000000000000000000060031427272030600235160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_gemm_blk_var1 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { obj_t a1, c1; dim_t my_start, my_end; dim_t b_alg; // Determine the direction in which to partition (forwards or backwards). dir_t direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Determine the current thread's subpartition range. bli_thread_range_mdim ( direct, thread, a, b, c, cntl, cntx, &my_start, &my_end ); // Partition along the m dimension. for ( dim_t i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize( direct, i, my_end, a, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and C1. bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, c, &c1 ); // Perform gemm subproblem. bli_l3_int ( &BLIS_ONE, &a1, b, &BLIS_ONE, &c1, cntx, rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); } } cython-blis-0.9.1/blis/_src/frame/3/gemm/bli_gemm_blk_var2.c000066400000000000000000000060031427272030600235170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_gemm_blk_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { obj_t b1, c1; dim_t my_start, my_end; dim_t b_alg; // Determine the direction in which to partition (forwards or backwards). dir_t direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. bli_l3_prune_unref_mparts_n( a, b, c, cntl ); // Determine the current thread's subpartition range. 
bli_thread_range_ndim ( direct, thread, a, b, c, cntl, cntx, &my_start, &my_end ); // Partition along the n dimension. for ( dim_t i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize( direct, i, my_end, b, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for B1 and C1. bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, i, b_alg, b, &b1 ); bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, i, b_alg, c, &c1 ); // Perform gemm subproblem. bli_l3_int ( &BLIS_ONE, a, &b1, &BLIS_ONE, &c1, cntx, rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); } } cython-blis-0.9.1/blis/_src/frame/3/gemm/bli_gemm_blk_var3.c000066400000000000000000000104421427272030600235220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_gemm_blk_var3 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { obj_t a1, b1; dim_t b_alg; // Determine the direction in which to partition (forwards or backwards). dir_t direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. bli_l3_prune_unref_mparts_k( a, b, c, cntl ); // Query dimension in partitioning direction. dim_t k_trans = bli_obj_width_after_trans( a ); // Partition along the k dimension. for ( dim_t i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_l3_determine_kc( direct, i, k_trans, a, b, bli_cntl_bszid( cntl ), cntx, cntl ); // Acquire partitions for A1 and B1. bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, b, &b1 ); // Perform gemm subproblem. bli_l3_int ( &BLIS_ONE, &a1, &b1, &BLIS_ONE, c, cntx, rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); bli_thread_barrier( bli_thrinfo_sub_node( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal beta scalar on matrix C is non-zero, we must use it // only for the first iteration (and then BLIS_ONE for all others). // And since c is a locally aliased obj_t (see _int() function), we // can simply overwrite the internal beta scalar with BLIS_ONE once // it has been used in the first iteration. 
However... // Unlike variant 3 of gemm and gemmt, which reset the internal scalar // on C at the end of the first iteration so that subsequent iterations // do not erroneously apply beta more than once, it is important that // this behavior not be applied to trmm. That is because the order of // computation is always such that the beta that is passed into the // macro-kernel must be zero, since the macro-kernel only applies that // beta to (and thus overwrites) the row-panel of C that corresponds to // the current block intersecting the diagonal. It turns out that this // same pattern holds for trmm3 as well--except there, the beta scalar // is potentially non-zero, but is still applied only to the current // row-panel of C, and thus beta is applied to all of C exactly once. // Thus, for neither trmm nor trmm3 should we reset the scalar on C // after the first iteration. if ( bli_cntl_family( cntl ) != BLIS_TRMM ) if ( i == 0 ) bli_obj_scalar_reset( c ); } } cython-blis-0.9.1/blis/_src/frame/3/gemm/bli_gemm_cntl.c000066400000000000000000000172701427272030600227650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ) { return bli_gemmbp_cntl_create( rntm, family, schema_a, schema_b, ker ); } // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ) { void_fp macro_kernel_fp; // Choose the default macrokernel based on the operation family... if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2; else if ( family == BLIS_GEMMT ) macro_kernel_fp = bli_gemmt_x_ker_var2; else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2; else /* should never execute */ macro_kernel_fp = NULL; // ...unless a non-NULL kernel function pointer is passed in, in which // case we use that instead. if ( ker ) macro_kernel_fp = ker; // Create two nodes for the macro-kernel. cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node ( rntm, // the thread's runtime structure family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. 
); cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_create_node ( rntm, // the thread's runtime structure family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_fp, gemm_cntl_bu_ke ); // Create a node for packing matrix A. cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( rntm, bli_l3_packa, // pack the left-hand operand BLIS_MR, BLIS_KR, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, gemm_cntl_bp_bu ); // Create a node for partitioning the m dimension by MC. cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_create_node ( rntm, family, BLIS_MC, bli_gemm_blk_var1, gemm_cntl_packa ); // Create a node for packing matrix B. cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( rntm, bli_l3_packb, // pack the right-hand operand BLIS_NR, BLIS_KR, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? schema_b, // normally BLIS_PACKED_COL_PANELS BLIS_BUFFER_FOR_B_PANEL, gemm_cntl_op_bp ); // Create a node for partitioning the k dimension by KC. cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node ( rntm, family, BLIS_KC, bli_gemm_blk_var3, gemm_cntl_packb ); // Create a node for partitioning the n dimension by NC. cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node ( rntm, family, BLIS_NC, bli_gemm_blk_var2, gemm_cntl_mm_op ); return gemm_cntl_vl_mm; } // ----------------------------------------------------------------------------- // This control tree creation function is disabled because it is no longer used. // (It was originally created in the run up to publishing the 1m journal article, // but was disabled to reduce complexity.) #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family ) { void_fp macro_kernel_p = bli_gemm_ker_var1; // Change the macro-kernel if the operation family is gemmt or trmm. 
//if ( family == BLIS_GEMMT ) macro_kernel_p = bli_gemmt_x_ker_var2; //else if ( family == BLIS_TRMM ) macro_kernel_p = bli_trmm_xx_ker_var2; // Create two nodes for the macro-kernel. cntl_t* gemm_cntl_ub_ke = bli_gemm_cntl_create_node ( family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); cntl_t* gemm_cntl_pb_ub = bli_gemm_cntl_create_node ( family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, gemm_cntl_ub_ke ); // Create a node for packing matrix A (which is really the right-hand // operand "B"). cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node ( bli_gemm_packb, // pack the right-hand operand bli_packm_blk_var1, BLIS_MR, BLIS_KR, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_A_BLOCK, gemm_cntl_pb_ub ); // Create a node for partitioning the n dimension by MC. cntl_t* gemm_cntl_op_pb = bli_gemm_cntl_create_node ( family, BLIS_MC, bli_gemm_blk_var2, gemm_cntl_packb ); // Create a node for packing matrix B (which is really the left-hand // operand "A"). cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( bli_gemm_packa, // pack the left-hand operand bli_packm_blk_var1, BLIS_NR, BLIS_KR, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_B_PANEL, gemm_cntl_op_pb ); // Create a node for partitioning the k dimension by KC. cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node ( family, BLIS_KC, bli_gemm_blk_var3, gemm_cntl_packa ); // Create a node for partitioning the m dimension by NC. 
cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node ( family, BLIS_NC, bli_gemm_blk_var1, gemm_cntl_mm_op ); return gemm_cntl_vl_mm; } #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { bli_cntl_free( rntm, cntl, thread ); } // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ) { return bli_cntl_create_node( rntm, family, bszid, var_func, NULL, sub_node ); } cython-blis-0.9.1/blis/_src/frame/3/gemm/bli_gemm_cntl.h000066400000000000000000000051401427272030600227630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); cython-blis-0.9.1/blis/_src/frame/3/gemm/bli_gemm_front.c000066400000000000000000000231521427272030600231510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ) { bli_init_once(); obj_t a_local; obj_t b_local; obj_t c_local; // If C has a zero dimension, return early. if ( bli_obj_has_zero_dim( c ) ) { return; } // If alpha is zero, or if A or B has a zero dimension, scale C by beta // and return early. if ( bli_obj_equals( alpha, &BLIS_ZERO ) || bli_obj_has_zero_dim( a ) || bli_obj_has_zero_dim( b ) ) { bli_scalm( beta, c ); return; } #if 0 #ifdef BLIS_ENABLE_SMALL_MATRIX // Only handle small problems separately for homogeneous datatypes. 
if ( bli_obj_dt( a ) == bli_obj_dt( b ) && bli_obj_dt( a ) == bli_obj_dt( c ) && bli_obj_comp_prec( c ) == bli_obj_prec( c ) ) { err_t status = bli_gemm_small( alpha, a, b, beta, c, cntx, cntl ); if ( status == BLIS_SUCCESS ) return; } #endif #endif // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); // Set the obj_t buffer field to the location currently implied by the row // and column offsets and then zero the offsets. If any of the original // obj_t's were views into larger matrices, this step effectively makes // those obj_t's "forget" their lineage. bli_obj_reset_origin( &a_local ); bli_obj_reset_origin( &b_local ); bli_obj_reset_origin( &c_local ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); bli_obj_induce_trans( &a_local ); bli_obj_induce_trans( &b_local ); bli_obj_induce_trans( &c_local ); } // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); #ifdef BLIS_ENABLE_GEMM_MD cntx_t cntx_local; // If any of the storage datatypes differ, or if the computation precision // differs from the storage precision of C, utilize the mixed datatype // code path. // NOTE: If we ever want to support the caller setting the computation // domain explicitly, we will need to check the computation dt against the // storage dt of C (instead of the computation precision against the // storage precision of C). 
if ( bli_obj_dt( &c_local ) != bli_obj_dt( &a_local ) || bli_obj_dt( &c_local ) != bli_obj_dt( &b_local ) || bli_obj_comp_prec( &c_local ) != bli_obj_prec( &c_local ) ) { // Handle mixed datatype cases in bli_gemm_md(), which may modify // the objects or the context. (If the context is modified, cntx // is adjusted to point to cntx_local.) bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx ); } #endif // Next, we handle the possibility of needing to typecast alpha to the // computation datatype and/or beta to the storage datatype of C. // Attach alpha to B, and in the process typecast alpha to the target // datatype of the matrix (which in this case is equal to the computation // datatype). bli_obj_scalar_attach( BLIS_NO_CONJUGATE, alpha, &b_local ); // Attach beta to C, and in the process typecast beta to the target // datatype of the matrix (which in this case is equal to the storage // datatype of C). bli_obj_scalar_attach( BLIS_NO_CONJUGATE, beta, &c_local ); // Change the alpha and beta pointers to BLIS_ONE since the values have // now been typecast and attached to the matrices above. alpha = &BLIS_ONE; beta = &BLIS_ONE; // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. bli_rntm_set_ways_for_op ( BLIS_GEMM, BLIS_LEFT, // ignored for gemm/hemm/symm bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ), rntm ); obj_t* cp = &c_local; obj_t* betap = beta; #ifdef BLIS_ENABLE_GEMM_MD #ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM // If any of the following conditions are met, create a temporary matrix // conformal to C into which we will accumulate the matrix product: // - the storage precision of C differs from the computation precision; // - the domains are mixed as crr; // - the storage format of C does not match the preferred orientation // of the ccr or crc cases. 
// Then, after the computation is complete, this matrix will be copied // or accumulated back to C. const bool is_ccr_mismatch = ( bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ) && !bli_obj_is_col_stored( &c_local ) ); const bool is_crc_mismatch = ( bli_gemm_md_is_crc( &a_local, &b_local, &c_local ) && !bli_obj_is_row_stored( &c_local ) ); obj_t ct; bool use_ct = FALSE; // FGVZ: Consider adding another guard here that only creates and uses a // temporary matrix for accumulation if k < c * kc, where c is some small // constant like 2. And don't forget to use the same conditional for the // castm() and free() at the end. if ( bli_obj_prec( &c_local ) != bli_obj_comp_prec( &c_local ) || bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) || is_ccr_mismatch || is_crc_mismatch ) { use_ct = TRUE; } // If we need a temporary matrix conformal to C for whatever reason, // we create it and prepare to use it now. if ( use_ct ) { const dim_t m = bli_obj_length( &c_local ); const dim_t n = bli_obj_width( &c_local ); inc_t rs = bli_obj_row_stride( &c_local ); inc_t cs = bli_obj_col_stride( &c_local ); num_t dt_ct = bli_obj_domain( &c_local ) | bli_obj_comp_prec( &c_local ); // When performing the crr case, accumulate to a contiguously-stored // real matrix so we do not have to repeatedly update C with general // stride. if ( bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) ) dt_ct = BLIS_REAL | bli_obj_comp_prec( &c_local ); // When performing the mismatched ccr or crc cases, now is the time // to specify the appropriate storage so the gemm_md_c2r_ref() virtual // microkernel can output directly to C (instead of using a temporary // microtile). 
if ( is_ccr_mismatch ) { rs = 1; cs = m; } else if ( is_crc_mismatch ) { rs = n; cs = 1; } bli_obj_create( dt_ct, m, n, rs, cs, &ct ); const num_t dt_exec = bli_obj_exec_dt( &c_local ); const num_t dt_comp = bli_obj_comp_dt( &c_local ); bli_obj_set_target_dt( dt_ct, &ct ); bli_obj_set_exec_dt( dt_exec, &ct ); bli_obj_set_comp_dt( dt_comp, &ct ); // A naive approach would cast C to the comptuation datatype, // compute with beta, and then cast the result back to the // user-provided output matrix. However, we employ a different // approach that halves the number of memops on C (or its // typecast temporary) by writing the A*B product directly to // temporary storage, and then using xpbym to scale the // output matrix by beta and accumulate/cast the A*B product. //bli_castm( &c_local, &ct ); betap = &BLIS_ZERO; cp = &ct; } #endif #endif // Invoke the internal back-end via the thread handler. bli_l3_thread_decorator ( bli_l3_int, BLIS_GEMM, // operation family id alpha, &a_local, &b_local, betap, cp, cntx, rntm, cntl ); #ifdef BLIS_ENABLE_GEMM_MD #ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM // If we created a temporary matrix conformal to C for whatever reason, // we copy/accumulate the result back to C and then release the object. if ( use_ct ) { obj_t beta_local; bli_obj_scalar_detach( &c_local, &beta_local ); //bli_castnzm( &ct, &c_local ); bli_xpbym( &ct, &beta_local, &c_local ); bli_obj_free( &ct ); } #endif #endif } cython-blis-0.9.1/blis/_src/frame/3/gemm/bli_gemm_front.h000066400000000000000000000040421427272030600231530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif cython-blis-0.9.1/blis/_src/frame/3/gemm/bli_gemm_ker_var1.c000066400000000000000000000042621427272030600235340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #if 0 #include "blis.h" void bli_gemm_ker_var1 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { // Implement _ker_var1() in terms of _ker_var2() by transposing the // entire suboperation (which also requires swapping A and B). 
bli_obj_induce_trans( a ); bli_obj_induce_trans( b ); bli_obj_induce_trans( c ); bli_gemm_ker_var2( b, a, c, cntx, rntm, cntl, thread ); } #endif cython-blis-0.9.1/blis/_src/frame/3/gemm/bli_gemm_ker_var2.c000066400000000000000000000261531427272030600235400ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" typedef void (*xpbys_mxn_vft) ( dim_t m, dim_t n, void* x, inc_t rs_x, inc_t cs_x, void* b, void* y, inc_t rs_y, inc_t cs_y ); #undef GENTFUNC2 #define GENTFUNC2(ctypex,ctypey,chx,chy,op) \ \ void PASTEMAC2(chx,chy,op) \ ( \ dim_t m, \ dim_t n, \ void* x, inc_t rs_x, inc_t cs_x, \ void* b, \ void* y, inc_t rs_y, inc_t cs_y \ ) \ { \ ctypex* restrict x_cast = x; \ ctypey* restrict b_cast = b; \ ctypey* restrict y_cast = y; \ \ PASTEMAC3(chx,chy,chy,xpbys_mxn) \ ( \ m, n, \ x_cast, rs_x, cs_x, \ b_cast, \ y_cast, rs_y, cs_y \ ); \ } INSERT_GENTFUNC2_BASIC0(xbpys_mxn_fn); INSERT_GENTFUNC2_MIXDP0(xbpys_mxn_fn); static xpbys_mxn_vft GENARRAY2_ALL(xbpys_mxn, xbpys_mxn_fn); void bli_gemm_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); num_t dt_c = bli_obj_dt( c ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); char* a_cast = bli_obj_buffer_at_off( a ); inc_t is_a = bli_obj_imag_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); char* b_cast = bli_obj_buffer_at_off( b ); inc_t is_b = bli_obj_imag_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); char* c_cast = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); // If any dimension is zero, return immediately. if ( bli_zero_dim3( m, n, k ) ) return; // Detach and multiply the scalars attached to A and B. // NOTE: We know that the internal scalars of A and B are already of the // target datatypes because the necessary typecasting would have already // taken place during bli_packm_init(). 
obj_t scalar_a; obj_t scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. // NOTE: We know that scalar_b is of type dt_exec due to the above code // that casts the scalars of A and B to dt_exec via scalar_a and scalar_b, // and we know that the internal scalar in C is already of the type dt_c // due to the casting in the implementation of bli_obj_scalar_attach(). char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b ); char* beta_cast = bli_obj_internal_scalar_buffer( c ); // If 1m is being employed on a column- or row-stored matrix with a // real-valued beta, we can use the real domain macro-kernel, which // eliminates a little overhead associated with the 1m virtual // micro-kernel. // Only employ this optimization if the storage datatype of C is // equal to the execution/computation datatype. #if 1 if ( bli_cntx_method( cntx ) == BLIS_1M ) { bli_gemm_ind_recast_1m_params ( &dt_exec, &dt_c, schema_a, c, &m, &n, &k, &pd_a, &ps_a, &pd_b, &ps_b, &rs_c, &cs_c ); } #endif #ifdef BLIS_ENABLE_GEMM_MD // Tweak parameters in select mixed domain cases (rcc, crc, ccr). if ( bli_cntx_method( cntx ) == BLIS_NAT ) { bli_gemm_md_ker_var2_recast ( &dt_exec, bli_obj_dt( a ), bli_obj_dt( b ), &dt_c, &m, &n, &k, &pd_a, &ps_a, &pd_b, &ps_b, c, &rs_c, &cs_c ); } #endif siz_t dt_size = bli_dt_size( dt_exec ); siz_t dt_c_size = bli_dt_size( dt_c ); // Alias some constants to simpler names. const dim_t MR = pd_a; const dim_t NR = pd_b; //const dim_t PACKMR = cs_a; //const dim_t PACKNR = rs_b; // Query the context for the micro-kernel address and cast it to its // function pointer type. gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx ); // Query the params field from the obj_t. If it is non-NULL, grab the ukr // field of the params struct. 
If that function pointer is non-NULL, use it // as our microkernel instead of the default microkernel queried from the // cntx above. gemm_ker_params_t* params = bli_obj_ker_params( c ); gemm_ukr_vft user_ukr = params ? params->ukr : NULL; if ( user_ukr ) gemm_ukr = user_ukr; // Temporary C buffer for edge cases. Note that the strides of this // temporary buffer are set so that they match the storage of the // original C matrix. For example, if C is column-stored, ct will be // column-stored as well. char ct[ BLIS_STACK_BUF_MAX_SIZE ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_UKR, cntx ); const inc_t rs_ct = ( col_pref ? 1 : NR ); const inc_t cs_ct = ( col_pref ? MR : 1 ); char* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); // // Assumptions/assertions: // rs_a == 1 // cs_a == PACKMR // pd_a == MR // ps_a == stride to next micro-panel of A // rs_b == PACKNR // cs_b == 1 // pd_b == NR // ps_b == stride to next micro-panel of B // rs_c == (no assumptions) // cs_c == (no assumptions) // // Compute number of primary and leftover components of the m and n // dimensions. dim_t n_iter = n / NR; dim_t n_left = n % NR; dim_t m_iter = m / MR; dim_t m_left = m % MR; if ( n_left ) ++n_iter; if ( m_left ) ++m_iter; // Determine some increments used to step through A, B, and C. inc_t rstep_a = ps_a * dt_size; inc_t cstep_b = ps_b * dt_size; inc_t rstep_c = rs_c * MR * dt_c_size; inc_t cstep_c = cs_c * NR * dt_c_size; auxinfo_t aux; // Save the pack schemas of A and B to the auxinfo_t object. bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); // Save the imaginary stride of A and B to the auxinfo_t object. bli_auxinfo_set_is_a( is_a, &aux ); bli_auxinfo_set_is_b( is_b, &aux ); // Save the virtual microkernel address and the params. 
bli_auxinfo_set_ukr( gemm_ukr, &aux ); bli_auxinfo_set_params( params, &aux ); // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) // loop around the microkernel. Here we query the thrinfo_t node for the // 1st (ir) loop around the microkernel. thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. dim_t jr_nt = bli_thread_n_way( thread ); dim_t jr_tid = bli_thread_work_id( thread ); dim_t ir_nt = bli_thread_n_way( caucus ); dim_t ir_tid = bli_thread_work_id( caucus ); dim_t jr_start, jr_end; dim_t ir_start, ir_end; dim_t jr_inc, ir_inc; // Determine the thread range and increment for the 2nd and 1st loops. // NOTE: The definition of bli_thread_range_jrir() will depend on whether // slab or round-robin partitioning was requested at configure-time. bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) { char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. char* b2 = b1; // Loop over the m dimension (MR rows at a time). for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) { char* a1 = a_cast + i * rstep_a; char* c11 = c1 + i * rstep_c; dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); // Compute the addresses of the next panels of A and B. char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) { a2 = a_cast; b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) b2 = b_cast; } // Save addresses of next panels of A and B to the auxinfo_t // object. 
bli_auxinfo_set_next_a( a2, &aux ); bli_auxinfo_set_next_b( b2, &aux ); // Edge case handling now occurs within the microkernel itself, but // we must still explicitly accumulate to a temporary microtile in // situations where a virtual microkernel is being used, such as // during the 1m method or some cases of mixed datatypes. if ( dt_exec == dt_c ) { // Invoke the gemm micro-kernel. gemm_ukr ( m_cur, n_cur, k, alpha_cast, a1, b1, beta_cast, c11, rs_c, cs_c, &aux, cntx ); } else { // Invoke the gemm micro-kernel. gemm_ukr ( MR, NR, k, alpha_cast, a1, b1, zero, &ct, rs_ct, cs_ct, &aux, cntx ); // Accumulate to C with type-casting. xbpys_mxn[ dt_exec ][ dt_c ] ( m_cur, n_cur, &ct, rs_ct, cs_ct, beta_cast, c11, rs_c, cs_c ); } } } /* PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); */ } cython-blis-0.9.1/blis/_src/frame/3/gemm/bli_gemm_md.c000066400000000000000000000517431427272030600224300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its
     contributors may be used to endorse or promote products derived
     from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

#ifdef BLIS_ENABLE_GEMM_MD

// Prepare a mixed-domain gemm operation C += A*B.
//
// The real/complex domains of C, A, and B select one of eight case handlers
// (named bli_gemm_md_<cab>, where the three suffix letters give the domains
// of C, A, and B in that order). The handler returns the computation and
// execution domains, which are then combined with the computation precision
// of C to set the target precision of A and B and the execution and
// computation datatypes of A, B, and C.
//
// Parameters:
//   a, b        - operand objects. Some handlers modify them in place (swap,
//                 induced transposition, aliasing to the real part, pack
//                 schema, conjugation).
//   beta        - scalar object; forwarded to the handlers (only the crr
//                 handler inspects it).
//   c           - output object; its computation precision is read here and
//                 it may be modified by the handlers.
//   cntx_local  - caller-provided context storage; the ccr, crc, and rcc
//                 handlers overwrite it with a copy of **cntx.
//   cntx        - in/out pointer to the context pointer; the ccr, crc, and
//                 rcc handlers redirect *cntx to cntx_local.
void bli_gemm_md
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
    mddm_t doms;

    const bool a_is_real = bli_obj_is_real( a );
    const bool a_is_comp = bli_obj_is_complex( a );
    const bool b_is_real = bli_obj_is_real( b );
    const bool b_is_comp = bli_obj_is_complex( b );
    const bool c_is_real = bli_obj_is_real( c );
    const bool c_is_comp = bli_obj_is_complex( c );

    if      ( c_is_real && a_is_real && b_is_real )
    {
        // C_real += A_real * B_real
        doms = bli_gemm_md_rrr( a, b, beta, c, cntx_local, cntx );
    }
    else if ( c_is_comp && a_is_comp && b_is_comp )
    {
        // C_complex += A_complex * B_complex
        doms = bli_gemm_md_ccc( a, b, beta, c, cntx_local, cntx );
    }
    else if ( c_is_comp && a_is_comp && b_is_real )
    {
        // C_complex += A_complex * B_real
        doms = bli_gemm_md_ccr( a, b, beta, c, cntx_local, cntx );
    }
    else if ( c_is_comp && a_is_real && b_is_comp )
    {
        // C_complex += A_real * B_complex
        doms = bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx );
    }
    else if ( c_is_real && a_is_comp && b_is_comp )
    {
        // C_real += A_complex * B_complex
        doms = bli_gemm_md_rcc( a, b, beta, c, cntx_local, cntx );
    }
    else if ( c_is_comp && a_is_real && b_is_real )
    {
        // C_complex += A_real * B_real
        doms = bli_gemm_md_crr( a, b, beta, c, cntx_local, cntx );
    }
    else if ( c_is_real && a_is_comp && b_is_real )
    {
        // C_real += A_complex * B_real
        doms = bli_gemm_md_rcr( a, b, beta, c, cntx_local, cntx );
    }
    else if ( c_is_real && a_is_real && b_is_comp )
    {
        // C_real += A_real * B_complex
        doms = bli_gemm_md_rrc( a, b, beta, c, cntx_local, cntx );
    }
    else
    {
        // The domains are assigned before aborting so that doms is never
        // read uninitialized below.
        doms.comp = BLIS_REAL;
        doms.exec = BLIS_REAL;

        // This should never execute.
        bli_abort();
    }

    // Extract the computation and execution domains from the struct
    // returned above.
    dom_t dom_comp = doms.comp;
    dom_t dom_exec = doms.exec;

    // Inspect the computation precision of C. (The user may have set
    // this explicitly to request the precision in which the computation
    // should take place.)
    prec_t prec_comp = bli_obj_comp_prec( c );

    // The computation precision tells us the target precision of A and B.
    // NOTE: We don't set the target domain here. The target domain would
    // either be unchanged, or would have been changed in one of the eight
    // domain cases above.
    bli_obj_set_target_prec( prec_comp, a );
    bli_obj_set_target_prec( prec_comp, b );

    // Combine the execution domain with the computation precision to form
    // the execution datatype. (The computation precision and execution
    // precision are always equal.)
    num_t dt_exec = dom_exec | prec_comp;

    // Set the execution datatypes of A, B, and C.
    bli_obj_set_exec_dt( dt_exec, a );
    bli_obj_set_exec_dt( dt_exec, b );
    bli_obj_set_exec_dt( dt_exec, c );

    // Combine the computation precision and computation domain to form the
    // computation datatype.
    num_t dt_comp = dom_comp | prec_comp;

    // Set the computation datatypes of A, B, and C.
    bli_obj_set_comp_dt( dt_comp, a );
    bli_obj_set_comp_dt( dt_comp, b );
    bli_obj_set_comp_dt( dt_comp, c );
}

// -----------------------------------------------------------------------------

// Handler for the ccr case (suffix letters are the domains of C, A, B in
// "cab" order): C_complex += A_complex * B_real.
//
// If the native real gemm microkernel prefers row output, the operation is
// transposed (A and B swapped, all three objects transposed) and handed to
// bli_gemm_md_crc(). Otherwise *cntx is redirected to a local copy of the
// context in which the real blocksizes replace the complex ones, MR and MC
// are halved, and the complex virtual gemm microkernel slots are set to the
// c2r reference microkernels.
//
// Returns the computation domain (real) and execution domain (complex).
mddm_t bli_gemm_md_ccr
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
    mddm_t doms;

    // We assume that the requested computation domain is complex.
    //dom_t dom_comp_in = bli_obj_comp_domain( c );
    //dom_t dom_comp_in = BLIS_COMPLEX;

    // For ccr, the computation (ukernel) will be real, but the execution
    // will appear complex to other parts of the implementation.
    doms.comp = BLIS_REAL;
    doms.exec = BLIS_COMPLEX;

    // Here we construct the computation datatype, which for the ccr case
    // is equal to the real projection of the execution datatype, and use
    // that computation datatype to query the corresponding ukernel output
    // preference.
    const num_t dt = BLIS_REAL | bli_obj_comp_prec( c );
    const bool row_pref
          = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, *cntx );

    // We can only perform this case of mixed-domain gemm, C += A*B where
    // B is real, if the microkernel prefers column output. If it prefers
    // row output, we must induce a transposition and perform C += A*B
    // where A (formerly B) is real.
    if ( row_pref )
    {
        bli_obj_swap( a, b );

        bli_obj_induce_trans( a );
        bli_obj_induce_trans( b );
        bli_obj_induce_trans( c );

        // We must swap the pack schemas because the schemas were set before
        // the objects were swapped.
        bli_obj_swap_pack_schemas( a, b );

        return bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx );
    }

    // Create a local copy of the context and then prepare to use this
    // context instead of the one passed in.
    *cntx_local = **cntx;
    *cntx = cntx_local;

    // Copy the real domain blocksizes into the slots of their complex
    // counterparts.
    blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx );
    blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx );
    blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx );
    blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx );
    blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx );

    bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mr, BLIS_SCOMPLEX, blksz_mr );
    bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr );

    bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nr, BLIS_SCOMPLEX, blksz_nr );
    bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr );

    bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mc, BLIS_SCOMPLEX, blksz_mc );
    bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc );

    bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nc, BLIS_SCOMPLEX, blksz_nc );
    bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc );

    bli_blksz_copy_dt( BLIS_FLOAT,  blksz_kc, BLIS_SCOMPLEX, blksz_kc );
    bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc );

    // Halve both the real and complex MR's (which are both real MR's).
    bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_mr );
    bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_mr );
    bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mr );
    bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mr );

    // Halve both the real and complex MC's (which are both real MC's).
    bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_mc );
    bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_mc );
    bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mc );
    bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mc );

    // Use the default pack schemas in the objects.

    // static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
    func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx );

    // Rather than check which complex datatype dt_comp refers to, we set
    // the mixed-domain virtual microkernel for both types.
    bli_func_set_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, l3_vir_ukrs );
    bli_func_set_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, l3_vir_ukrs );

    // Return the computation and execution domains.
    return doms;
}

// -----------------------------------------------------------------------------

// Handler for the crc case (suffix letters are the domains of C, A, B in
// "cab" order): C_complex += A_real * B_complex.
//
// Mirror image of bli_gemm_md_ccr(): if the native real gemm microkernel
// prefers column output, the operation is transposed and handed to
// bli_gemm_md_ccr(). Otherwise *cntx is redirected to a local copy of the
// context in which the real blocksizes replace the complex ones, NR and NC
// are halved, and the complex virtual gemm microkernel slots are set to the
// c2r reference microkernels.
//
// Returns the computation domain (real) and execution domain (complex).
mddm_t bli_gemm_md_crc
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
    mddm_t doms;

    // We assume that the requested computation domain is complex.
    //dom_t dom_comp_in = bli_obj_comp_domain( c );
    //dom_t dom_comp_in = BLIS_COMPLEX;

    // For crc, the computation (ukernel) will be real, but the execution
    // will appear complex to other parts of the implementation.
    doms.comp = BLIS_REAL;
    doms.exec = BLIS_COMPLEX;

    // Here we construct the computation datatype, which for the crc case
    // is equal to the real projection of the execution datatype, and use
    // that computation datatype to query the corresponding ukernel output
    // preference.
    const num_t dt = BLIS_REAL | bli_obj_comp_prec( c );
    const bool col_pref
          = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, *cntx );

    // We can only perform this case of mixed-domain gemm, C += A*B where
    // A is real, if the microkernel prefers row output. If it prefers
    // column output, we must induce a transposition and perform C += A*B
    // where B (formerly A) is real.
    if ( col_pref )
    {
        bli_obj_swap( a, b );

        bli_obj_induce_trans( a );
        bli_obj_induce_trans( b );
        bli_obj_induce_trans( c );

        // We must swap the pack schemas because the schemas were set before
        // the objects were swapped.
        bli_obj_swap_pack_schemas( a, b );

        return bli_gemm_md_ccr( a, b, beta, c, cntx_local, cntx );
    }

    // Create a local copy of the context and then prepare to use this
    // context instead of the one passed in.
    *cntx_local = **cntx;
    *cntx = cntx_local;

    // Copy the real domain blocksizes into the slots of their complex
    // counterparts.
    blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx );
    blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx );
    blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx );
    blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx );
    blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx );

    bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mr, BLIS_SCOMPLEX, blksz_mr );
    bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr );

    bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nr, BLIS_SCOMPLEX, blksz_nr );
    bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr );

    bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mc, BLIS_SCOMPLEX, blksz_mc );
    bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc );

    bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nc, BLIS_SCOMPLEX, blksz_nc );
    bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc );

    bli_blksz_copy_dt( BLIS_FLOAT,  blksz_kc, BLIS_SCOMPLEX, blksz_kc );
    bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc );

    // Halve both the real and complex NR's (which are both real NR's).
    bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_nr );
    bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_nr );
    bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_nr );
    bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_nr );

    // Halve both the real and complex NC's (which are both real NC's).
    bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_nc );
    bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_nc );
    bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_nc );
    bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_nc );

    // Use the default pack schemas in the objects.

    // static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
    func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx );

    // Rather than check which complex datatype dt_comp refers to, we set
    // the mixed-domain virtual microkernel for both types.
    bli_func_set_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, l3_vir_ukrs );
    bli_func_set_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, l3_vir_ukrs );

    // Return the computation and execution domains.
    return doms;
}

// -----------------------------------------------------------------------------

// Handler for the rcc case (suffix letters are the domains of C, A, B in
// "cab" order): C_real += A_complex * B_complex.
//
// Redirects *cntx to a local copy of the context in which the real
// blocksizes replace the complex ones and KC is halved, sets the 1r pack
// schemas on A and B, toggles the conjugation of B, and copies the packm
// kernels from the 1m induced-method context into the local context.
//
// Returns the computation and execution domains (both real).
mddm_t bli_gemm_md_rcc
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
    mddm_t doms;

    // We assume that the requested computation domain is complex.
    //dom_t dom_comp_in = bli_obj_comp_domain( c );
    //dom_t dom_comp_in = BLIS_COMPLEX;

    // For rcc, the computation (ukernel) will be real, and since the output
    // matrix C is also real, so must be the execution domain.
    doms.comp = BLIS_REAL;
    doms.exec = BLIS_REAL;

    // Create a local copy of the context and then prepare to use this
    // context instead of the one passed in.
    *cntx_local = **cntx;
    *cntx = cntx_local;

    // Copy the real domain blocksizes into the slots of their complex
    // counterparts.
    blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx );
    blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx );
    blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx );
    blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx );
    blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx );

    bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mr, BLIS_SCOMPLEX, blksz_mr );
    bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr );

    bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nr, BLIS_SCOMPLEX, blksz_nr );
    bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr );

    bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mc, BLIS_SCOMPLEX, blksz_mc );
    bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc );

    bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nc, BLIS_SCOMPLEX, blksz_nc );
    bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc );

    bli_blksz_copy_dt( BLIS_FLOAT,  blksz_kc, BLIS_SCOMPLEX, blksz_kc );
    bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc );

    // Halve both the real and complex KC's (which are both real KC's).
    bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_kc );
    bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_kc );
    bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_kc );
    bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_kc );

    // Use the 1r pack schema for both A and B with the conjugation
    // of A or B toggled (to produce ar * br - ai * bi).
    bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS_1R, a );
    bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS_1R, b );

    bli_obj_toggle_conj( b );

    // We also need to copy over the packm kernels from the 1m
    // context. We query the address of that context here.
    // NOTE: This is needed for situations where the rcc case does not
    // involve any casting to different precisions, since currently
    // bli_packm_blk_var1() is coded to hand off control to
    // bli_packm_blk_var1_md() only when the storage datatype differs from
    // the target datatype. (The packm_blk_var1_md() function has "built-in"
    // support for packing to 1r (and 1e) schemas, whereas the
    // packm_blk_var1() function relies on packm kernels for packing to 1r.)
    const num_t dt_complex = bli_obj_dt( a );
    cntx_t* cntx_1m = bli_gks_query_ind_cntx( BLIS_1M, dt_complex );

    func_t* cntx_funcs    = bli_cntx_packm_kers_buf( *cntx );
    func_t* cntx_1m_funcs = bli_cntx_packm_kers_buf( cntx_1m );

    // Copy every packm kernel slot up to and including BLIS_PACKM_31XK_KER.
    for ( dim_t i = 0; i <= BLIS_PACKM_31XK_KER; ++i )
    {
        cntx_funcs[ i ] = cntx_1m_funcs[ i ];
    }

    // Return the computation and execution domains.
    return doms;
}

// -----------------------------------------------------------------------------

// Handler for the crr case (suffix letters are the domains of C, A, B in
// "cab" order): C_complex += A_real * B_real.
//
// Unless BLIS_ENABLE_GEMM_MD_EXTRA_MEM is defined, and only when beta equals
// BLIS_ONE, *c is overwritten with an alias of its real part so that only
// the real part of C is updated. The context is not modified.
//
// Returns the computation and execution domains (both real).
mddm_t bli_gemm_md_crr
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
    mddm_t doms;
#ifndef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
    obj_t  c_real;
#endif

    // We assume that the requested computation domain is real.
    //dom_t dom_comp_in = bli_obj_comp_domain( c );
    //dom_t dom_comp_in = BLIS_REAL;

    // For crr, the computation (ukernel) will be real, and since we will
    // be updating only the real part of the output matrix C, the execution
    // domain is also real.
    doms.comp = BLIS_REAL;
    doms.exec = BLIS_REAL;

    // Since the A*B product is real, we can update only the real part of
    // C. Thus, we convert the obj_t for the complex matrix to one that
    // represents only the real part. HOWEVER, there are two situations in
    // which we forgo this trick:
    // - If extra memory optimizations are enabled, we should leave C alone
    //   since we'll be computing A*B to a temporary matrix and accumulating
    //   that result back to C, and in order for that to work, we need to
    //   allow that code to continue accessing C as a complex matrix.
    // - Even if extra memory optimizations are disabled, logically projecting
    //   C as a real matrix can still cause problems if beta is non-unit. In
    //   that situation, the implementation won't get a chance to scale the
    //   imaginary components of C by beta, and thus it would compute the
    //   wrong answer. Thus, if beta is non-unit, we must leave C alone.
#ifndef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
    if ( bli_obj_equals( beta, &BLIS_ONE ) )
    {
        bli_obj_real_part( c, &c_real );

        // Overwrite the complex obj_t with its real-only alias.
        *c = c_real;
    }
#endif

    // Use the default pack schemas in the objects.

    // Return the computation and execution domains.
    return doms;
}

// -----------------------------------------------------------------------------

// Handler for the rcr case (suffix letters are the domains of C, A, B in
// "cab" order): C_real += A_complex * B_real.
//
// Overwrites *a with an alias of its real part. The context is not modified.
//
// Returns the computation and execution domains (both real).
mddm_t bli_gemm_md_rcr
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
    mddm_t doms;
    obj_t  a_real;

    // We assume that the requested computation domain is real.
    //dom_t dom_comp_in = bli_obj_comp_domain( c );
    //dom_t dom_comp_in = BLIS_REAL;

    // For rcr, the computation (ukernel) will be real, and since the output
    // matrix C is also real, so must be the execution domain.
    doms.comp = BLIS_REAL;
    doms.exec = BLIS_REAL;

    // Convert the obj_t for the complex matrix to one that represents only
    // the real part.
    bli_obj_real_part( a, &a_real );

    // Overwrite the complex obj_t with its real-only alias.
    *a = a_real;

    // Use the default pack schemas in the objects.

    // Return the computation and execution domains.
    return doms;
}

// -----------------------------------------------------------------------------

// Handler for the rrc case (suffix letters are the domains of C, A, B in
// "cab" order): C_real += A_real * B_complex.
//
// Overwrites *b with an alias of its real part. The context is not modified.
//
// Returns the computation and execution domains (both real).
mddm_t bli_gemm_md_rrc
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
    mddm_t doms;
    obj_t  b_real;

    // We assume that the requested computation domain is real.
    //dom_t dom_comp_in = bli_obj_comp_domain( c );
    //dom_t dom_comp_in = BLIS_REAL;

    // For rrc, the computation (ukernel) will be real, and since the output
    // matrix C is also real, so must be the execution domain.
    doms.comp = BLIS_REAL;
    doms.exec = BLIS_REAL;

    // Convert the obj_t for the complex matrix to one that represents only
    // the real part.
    bli_obj_real_part( b, &b_real );

    // Overwrite the complex obj_t with its real-only alias.
    *b = b_real;

    // Use the default pack schemas in the objects.

    // Return the computation and execution domains.
    return doms;
}

// -----------------------------------------------------------------------------

// Handler for the rrr case (suffix letters are the domains of C, A, B in
// "cab" order): C_real += A_real * B_real.
//
// Modifies none of its arguments.
//
// Returns the computation and execution domains (both real).
mddm_t bli_gemm_md_rrr
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
    mddm_t doms;

    // We assume that the requested computation domain is real.
    //dom_t dom_comp_in = bli_obj_comp_domain( c );
    //dom_t dom_comp_in = BLIS_REAL;

    // For rrr, the computation (ukernel) and execution domains are both
    // real.
    doms.comp = BLIS_REAL;
    doms.exec = BLIS_REAL;

    // Use the default pack schemas in the objects.

    // Return the computation and execution domains.
    return doms;
}

// -----------------------------------------------------------------------------

// Handler for the ccc case (suffix letters are the domains of C, A, B in
// "cab" order): C_complex += A_complex * B_complex.
//
// Modifies none of its arguments.
//
// Returns the computation and execution domains (both complex).
mddm_t bli_gemm_md_ccc
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     )
{
    mddm_t doms;

    // We assume that the requested computation domain is complex.
    //dom_t dom_comp_in = bli_obj_comp_domain( c );
    //dom_t dom_comp_in = BLIS_COMPLEX;

    // For ccc, the computation (ukernel) and execution domains are both
    // complex.
    doms.comp = BLIS_COMPLEX;
    doms.exec = BLIS_COMPLEX;

    // Use the default pack schemas in the objects.

    // Return the computation and execution domains.
    return doms;
}

#endif
cython-blis-0.9.1/blis/_src/frame/3/gemm/bli_gemm_md.h000066400000000000000000000221301427272030600224210ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "bli_gemm_md_c2r_ref.h"

// Define a local struct type that makes returning two values easier.
// It is the return type of every bli_gemm_md_<cab>() case handler declared
// below.
typedef struct mddm_s
{
    dom_t comp; // computation domain
    dom_t exec; // execution domain
} mddm_t;

// Top-level mixed-domain setup routine: takes the operands, beta, the output
// object, storage for a local context, and an in/out context pointer.
void bli_gemm_md
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     );

// Per-case handlers. All share the argument list of bli_gemm_md(). The
// three-letter suffix presumably encodes the real (r) / complex (c) domains
// of C, A, and B in that order -- confirm against the definitions.
mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );

// -----------------------------------------------------------------------------

// NOTE(review): the two routines below are only declared here; their
// definitions are not visible from this header, so their behavior should be
// confirmed at the definition site.
void bli_gemm_md_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

void bli_gemm_md_zgemm
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------
BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crr is already unconditionally associated with an // execution domain of BLIS_REAL.) if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_REAL ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since ccr is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) if ( bli_obj_is_complex( c ) && bli_obj_is_complex( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crc is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) 
if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_complex( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemm_md_ker_var2_recast ( num_t* dt_comp, num_t dt_a, num_t dt_b, num_t* dt_c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, obj_t* c, inc_t* rs_c, inc_t* cs_c ) { if ( bli_is_real( *dt_c ) && bli_is_complex( dt_a ) && bli_is_complex( dt_b ) ) { // The rcc case is executed with a real macrokernel, so we need to // double the k dimension (because both A and B are packed to the 1r // schema), and also the panel strides of A and B since they were // packed as complex matrices and we now need to convert them to // units of real elements. *k *= 2; *ps_a *= 2; *ps_b *= 2; } else if ( bli_is_complex( *dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_row_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *n *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; } else #endif { // Generally speaking, the crc case is executed with a complex // macrokernel, so we need to halve the panel stride of A (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. 
*ps_a /= 2; } } else if ( bli_is_complex( *dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_col_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *m *= 2; *pd_a *= 2; *ps_a *= 2; *cs_c *= 2; } else #endif { // Generally speaking, the ccr case is executed with a complex // macrokernel, so we need to halve the panel stride of B (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. *ps_b /= 2; } } #if 0 else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. //printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k ); } else if ( bli_is_complex( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { // No action needed. } #endif } cython-blis-0.9.1/blis/_src/frame/3/gemm/bli_gemm_md_c2r_ref.c000066400000000000000000000205241427272030600240230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #ifdef BLIS_ENABLE_GEMM_MD #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, suf ) \ \ void PASTEMAC2(ch,opname,suf) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ const num_t dt_r = PASTEMAC(chr,type); \ \ PASTECH(chr,gemm_ukr_ft) \ rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ const bool row_pref = !col_pref; \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ \ dim_t mr_r = mr; \ dim_t nr_r = nr; \ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype_r ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ inc_t rs_ct; \ inc_t cs_ct; \ \ ctype_r* restrict a_r = ( ctype_r* )a; \ \ ctype_r* restrict b_r = ( ctype_r* )b; \ \ ctype_r* restrict zero_r = PASTEMAC(chr,0); \ \ ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ /* ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ */ \ \ ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \ ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \ \ dim_t m_use; \ dim_t n_use; \ \ ctype_r* c_use; \ inc_t rs_c_use; \ inc_t cs_c_use; \ \ bool using_ct; \ \ /* This virtual microkernel is used by ccr and crc mixed-domain cases when any of the following conditions are met: - beta is complex (ie: has a non-zero imaginary component) - C is general-stored - the computation precision differs from the storage of C If, however, none of the above conditions are met, then the real domain macrokernel can be (and will be) called instead of calling the complex macrokernel (and this virtual microkernel). 
*/ \ \ /* PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: a", mr, k, \ a_r, 1, mr, "%5.2f", "" ); \ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: b", k, nr, \ b_r, nr, 1, "%5.2f", "" ); \ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c before", mr, nr, \ c_use, rs_c_use, cs_c_use, "%5.2f", "" ); \ */ \ \ /* SAFETY CHECK: The higher level implementation should never allow an alpha with non-zero imaginary component to be passed in, because it can't be applied properly using the 1m method. If alpha is not real, then something is very wrong. */ \ /* if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ */ \ \ /* If beta has a non-zero imaginary component OR if c is stored with general stride, then we compute the alpha*a*b product into temporary storage and then accumulate that result into c afterwards. Note that the other two cases concerning disagreement between the storage of C and the output preference of the micro-kernel, should ONLY occur in the context of trsm, whereby this virtual micro-kernel is called directly from the trsm macro-kernel to update the micro-tile b11 that exists within the packed row-panel of B. Indeed that is the reason those cases MUST be explicitly handled. */ \ if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \ else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \ else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ else using_ct = FALSE; \ \ \ if ( using_ct ) \ { \ /* In the atypical cases, we compute the result into temporary workspace ct and then accumulate it back to c at the end. */ \ \ /* Set the strides of ct based on the preference of the underlying native real domain gemm micro-kernel. Note that we set the ct strides in units of complex elements. 
*/ \ if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \ else { rs_ct = nr; cs_ct = 1; } \ \ c_use = ( ctype_r* )ct; \ rs_c_use = rs_ct; \ cs_c_use = cs_ct; \ \ /* Convert the strides and corresponding microtile dimension from being in units of complex elements to be in units of real elements. */ \ if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) { cs_c_use *= 2; mr_r *= 2; } \ else { rs_c_use *= 2; nr_r *= 2; }\ \ /* c = beta * c + alpha_r * a * b; */ \ rgemm_ukr \ ( \ mr_r, \ nr_r, \ k, \ alpha_r, \ a_r, \ b_r, \ zero_r, \ c_use, rs_c_use, cs_c_use, \ data, \ cntx \ ); \ \ /* Accumulate the final result in ct back to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,adds)( *(ct + i*rs_ct + j*cs_ct), \ *(c + i*rs_c + j*cs_c ) ); \ } \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,copys)( *(ct + i*rs_ct + j*cs_ct), \ *(c + i*rs_c + j*cs_c ) ); \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \ *beta, \ *(c + i*rs_c + j*cs_c ) ); \ } \ } \ } \ else \ { \ /* In the typical cases, we use the real part of beta and accumulate directly into the output matrix c. */ \ \ c_use = ( ctype_r* )c; \ rs_c_use = rs_c; \ cs_c_use = cs_c; \ m_use = m; \ n_use = n; \ \ /* Convert the strides and corresponding microtile dimension from being in units of complex elements to be in units of real elements. 
*/ \ if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) { cs_c_use *= 2; m_use *= 2; } \ else { rs_c_use *= 2; n_use *= 2; } \ \ /* c = beta * c + alpha_r * a * b; */ \ rgemm_ukr \ ( \ m_use, \ n_use, \ k, \ alpha_r, \ a_r, \ b_r, \ beta_r, \ c_use, rs_c_use, cs_c_use, \ data, \ cntx \ ); \ } \ } INSERT_GENTFUNCCO_BASIC( gemm_md_c2r, BLIS_REF_SUFFIX ) #endif cython-blis-0.9.1/blis/_src/frame/3/gemm/bli_gemm_md_c2r_ref.h000066400000000000000000000035431427272030600240320ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. #include "bli_l3_ukr.h" cython-blis-0.9.1/blis/_src/frame/3/gemm/bli_gemm_var.h000066400000000000000000000043121427272030600226130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/3/gemm/ind/000077500000000000000000000000001427272030600205715ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/3/gemm/ind/bli_gemm_ind_opt.h000066400000000000000000000053071427272030600242360ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; /* Detach the beta scalar from c so that we can test its imaginary component. */ bli_obj_scalar_detach( c, &beta ); /* If beta is in the real domain, and c is row- or column-stored, then we may proceed with the optimization. 
*/ if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else /* if ( bli_is_1r_packed( schema_a ) ) */ { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } cython-blis-0.9.1/blis/_src/frame/3/gemm/other/000077500000000000000000000000001427272030600211405ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/3/gemm/other/bli_gemm_ker_var2.c000066400000000000000000000251011427272030600246510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha, void* a, inc_t cs_a, inc_t is_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, inc_t is_b, dim_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2); void bli_gemm_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); inc_t is_a = bli_obj_imag_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); inc_t is_b = bli_obj_imag_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. 
bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // If 1m is being employed on a column- or row-stored matrix with a // real-valued beta, we can use the real domain macro-kernel, which // eliminates a little overhead associated with the 1m virtual // micro-kernel. #if 1 if ( bli_is_1m_packed( schema_a ) ) { bli_l3_ind_recast_1m_params ( dt_exec, schema_a, c, m, n, k, pd_a, ps_a, pd_b, ps_b, rs_c, cs_c ); } #endif // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, is_a, pd_a, ps_a, buf_b, rs_b, is_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, rntm, thread ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, inc_t is_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, inc_t is_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ /*const dim_t PACKMR = cs_a;*/ \ /*const dim_t PACKNR = rs_b;*/ \ \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. 
For example, if C is column-stored, ct will be column-stored as well. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ ctype* restrict b1; \ ctype* restrict c1; \ \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t i, j; \ dim_t m_cur; \ dim_t n_cur; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of A and B to the auxinfo_t object. 
*/ \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ dim_t ir_thread_id = bli_thread_work_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ b1 = b_cast + j * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ { \ ctype* restrict a2; \ \ a1 = a_cast + i * rstep_a; \ c11 = c1 + i * rstep_c; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = bli_gemm_get_next_a_upanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ { \ a2 = a_cast; \ b2 = bli_gemm_get_next_b_upanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Scale the bottom edge of C and add the result from above. 
*/ \ PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ beta_cast, \ c11, rs_c, cs_c ); \ } \ } \ } \ \ /* PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ */ \ } INSERT_GENTFUNC_BASIC0( gemm_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/3/gemm/other/bli_gemm_ker_var2rr.c000066400000000000000000000262261427272030600252260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha, void* a, inc_t cs_a, inc_t is_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, inc_t is_b, dim_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2rr); // // -- Macrokernel functions for round-robin partitioning ----------------------- // void bli_gemm_ker_var2rr ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); inc_t is_a = bli_obj_imag_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); inc_t is_b = bli_obj_imag_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. 
bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // If 1m is being employed on a column- or row-stored matrix with a // real-valued beta, we can use the real domain macro-kernel, which // eliminates a little overhead associated with the 1m virtual // micro-kernel. if ( bli_is_1m_packed( schema_a ) ) { bli_l3_ind_recast_1m_params ( dt_exec, schema_a, c, m, n, k, pd_a, ps_a, pd_b, ps_b, rs_c, cs_c ); } // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, is_a, pd_a, ps_a, buf_b, rs_b, is_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, rntm, thread ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, inc_t is_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, inc_t is_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ /*const dim_t PACKMR = cs_a;*/ \ /*const dim_t PACKNR = rs_b;*/ \ \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. 
For example, if C is column-stored, ct will be column-stored as well. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ ctype* restrict b1; \ ctype* restrict c1; \ \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t i, j; \ dim_t m_cur; \ dim_t n_cur; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of A and B to the auxinfo_t object. 
*/ \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ \ /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) loop around the microkernel. Here we query the thrinfo_t node for the 1st (ir) loop around the microkernel. */ \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ /* Query the number of threads and thread ids for each loop. */ \ dim_t jr_nt = bli_thread_n_way( thread ); \ dim_t jr_tid = bli_thread_work_id( thread ); \ dim_t ir_nt = bli_thread_n_way( caucus ); \ dim_t ir_tid = bli_thread_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ dim_t jr_inc, ir_inc; \ \ /* Determine the thread range and increment for each thrinfo_t node. */ \ bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ b1 = b_cast + j * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ a1 = a_cast + i * rstep_a; \ c11 = c1 + i * rstep_c; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter_rr( i, ir_end, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter_rr( j, jr_end, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. 
*/ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Scale the bottom edge of C and add the result from above. */ \ PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ beta_cast, \ c11, rs_c, cs_c ); \ } \ } \ } \ \ /* PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2rr: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \ */ \ } INSERT_GENTFUNC_BASIC0( gemm_ker_var2rr ) cython-blis-0.9.1/blis/_src/frame/3/gemm/other/bli_gemm_ker_var2sl.c000066400000000000000000000262201427272030600252130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

#define FUNCPTR_T gemm_fp

// Signature shared by the datatype-specific macro-kernels generated by
// GENTFUNC below. The parameter list mirrors, one for one, the values that
// bli_gemm_ker_var2sl() extracts from its obj_t arguments.
typedef void (*FUNCPTR_T)
     (
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, inc_t is_a,
                  dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, inc_t is_b,
                  dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Table of macro-kernel function pointers; it is indexed by the execution
// datatype (see ftypes[dt_exec] below).
// NOTE(review): GENARRAY is defined elsewhere -- presumably it fills the
// table with the per-datatype instances of gemm_ker_var2sl; confirm.
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var2sl);

//
// -- Macrokernel functions for slab partitioning ------------------------------
//

//
// Object-based entry point for the gemm macro-kernel (slab variant).
//
// Unpacks the execution datatype, pack schemas, dimensions, buffers and
// strides from the operand objects, merges the scalars attached to A and B
// into a single alpha, and calls the datatype-specific macro-kernel found
// in ftypes[].
//
//   a, b    - operands; their pack schema, column/row stride, imaginary
//             stride, panel dimension and panel stride are queried here.
//   c       - output matrix; supplies m, n, its strides and the beta scalar.
//   cntx, rntm, thread
//           - forwarded unchanged to the typed macro-kernel.
//   cntl    - not referenced in this function.
//
void bli_gemm_ker_var2sl
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
    num_t     dt_exec   = bli_obj_exec_dt( c );

    pack_t    schema_a  = bli_obj_pack_schema( a );
    pack_t    schema_b  = bli_obj_pack_schema( b );

    // m and n come from C; k is taken from the width of A.
    dim_t     m         = bli_obj_length( c );
    dim_t     n         = bli_obj_width( c );
    dim_t     k         = bli_obj_width( a );

    void*     buf_a     = bli_obj_buffer_at_off( a );
    inc_t     cs_a      = bli_obj_col_stride( a );
    inc_t     is_a      = bli_obj_imag_stride( a );
    dim_t     pd_a      = bli_obj_panel_dim( a );
    inc_t     ps_a      = bli_obj_panel_stride( a );

    void*     buf_b     = bli_obj_buffer_at_off( b );
    inc_t     rs_b      = bli_obj_row_stride( b );
    inc_t     is_b      = bli_obj_imag_stride( b );
    dim_t     pd_b      = bli_obj_panel_dim( b );
    inc_t     ps_b      = bli_obj_panel_stride( b );

    void*     buf_c     = bli_obj_buffer_at_off( c );
    inc_t     rs_c      = bli_obj_row_stride( c );
    inc_t     cs_c      = bli_obj_col_stride( c );

    obj_t     scalar_a;
    obj_t     scalar_b;

    void*     buf_alpha;
    void*     buf_beta;

    FUNCPTR_T f;

    // Detach and multiply the scalars attached to A and B. The merged
    // value is read back from scalar_b below.
    bli_obj_scalar_detach( a, &scalar_a );
    bli_obj_scalar_detach( b, &scalar_b );
    bli_mulsc( &scalar_a, &scalar_b );

    // Grab the addresses of the internal scalar buffers for the scalar
    // merged above and the scalar attached to C.
    buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
    buf_beta  = bli_obj_internal_scalar_buffer( c );

    // If 1m is being employed on a column- or row-stored matrix with a
    // real-valued beta, we can use the real domain macro-kernel, which
    // eliminates a little overhead associated with the 1m virtual
    // micro-kernel.
    // NOTE(review): the arguments are passed by name, so
    // bli_l3_ind_recast_1m_params is presumably a macro that rewrites
    // dt_exec and the listed dimensions/strides in place -- confirm.
    if ( bli_is_1m_packed( schema_a ) )
    {
        bli_l3_ind_recast_1m_params
        (
          dt_exec,
          schema_a,
          c,
          m, n, k,
          pd_a, ps_a,
          pd_b, ps_b,
          rs_c, cs_c
        );
    }

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_exec];

    // Invoke the function.
    f( schema_a,
       schema_b,
       m,
       n,
       k,
       buf_alpha,
       buf_a, cs_a, is_a,
              pd_a, ps_a,
       buf_b, rs_b, is_b,
              pd_b, ps_b,
       buf_beta,
       buf_c, rs_c, cs_c,
       cntx,
       rntm,
       thread );
}


//
// Datatype-templated macro-kernel, instantiated through
// INSERT_GENTFUNC_BASIC0 at the end of this definition.
//
// The generated function walks C in NR-wide column steps (the "jr" loop)
// and MR-tall row steps (the "ir" loop) over the iteration ranges assigned
// to the calling thread, and invokes the gemm micro-kernel once per tile:
//   - full MR x NR tiles are updated directly in C using beta;
//   - edge tiles are computed into the stack buffer ct with a zero beta and
//     then folded into the m_cur x n_cur corner of C via xpbys_mxn.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Alias some constants to simpler names. */ \
    const dim_t MR = pd_a; \
    const dim_t NR = pd_b; \
    /*const dim_t PACKMR = cs_a;*/ \
    /*const dim_t PACKNR = rs_b;*/ \
\
    /* Query the context for the micro-kernel address and cast it to its
       function pointer type. */ \
    PASTECH(ch,gemm_ukr_ft) \
    gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
    /* Temporary C buffer for edge cases. Its strides follow the storage
       preference the context reports for the micro-kernel (col_pref), not
       the strides of C: column storage (rs_ct = 1, cs_ct = MR) when
       columns are preferred, row storage (rs_ct = NR, cs_ct = 1)
       otherwise. */ \
    ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
              / sizeof( ctype ) ] \
              __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
    const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
    const inc_t rs_ct = ( col_pref ? 1 : NR ); \
    const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
    ctype* restrict zero = PASTEMAC(ch,0); \
    ctype* restrict a_cast = a; \
    ctype* restrict b_cast = b; \
    ctype* restrict c_cast = c; \
    ctype* restrict alpha_cast = alpha; \
    ctype* restrict beta_cast = beta; \
    ctype* restrict b1; \
    ctype* restrict c1; \
\
    dim_t m_iter, m_left; \
    dim_t n_iter, n_left; \
    dim_t i, j; \
    dim_t m_cur; \
    dim_t n_cur; \
    inc_t rstep_a; \
    inc_t cstep_b; \
    inc_t rstep_c, cstep_c; \
    auxinfo_t aux; \
\
    /*
       Assumptions/assertions:
         rs_a == 1
         cs_a == PACKMR
         pd_a == MR
         ps_a == stride to next micro-panel of A
         rs_b == PACKNR
         cs_b == 1
         pd_b == NR
         ps_b == stride to next micro-panel of B
         rs_c == (no assumptions)
         cs_c == (no assumptions)
    */ \
\
    /* If any dimension is zero, return immediately. */ \
    if ( bli_zero_dim3( m, n, k ) ) return; \
\
    /* Clear the temporary C buffer in case it has any infs or NaNs. */ \
    PASTEMAC(ch,set0s_mxn)( MR, NR, \
                            ct, rs_ct, cs_ct ); \
\
    /* Compute number of primary and leftover components of the m and n
       dimensions. */ \
    n_iter = n / NR; \
    n_left = n % NR; \
\
    m_iter = m / MR; \
    m_left = m % MR; \
\
    /* A partial tile at the edge counts as one extra iteration. */ \
    if ( n_left ) ++n_iter; \
    if ( m_left ) ++m_iter; \
\
    /* Determine some increments used to step through A, B, and C. */ \
    rstep_a = ps_a; \
\
    cstep_b = ps_b; \
\
    rstep_c = rs_c * MR; \
    cstep_c = cs_c * NR; \
\
    /* Save the pack schemas of A and B to the auxinfo_t object. */ \
    bli_auxinfo_set_schema_a( schema_a, &aux ); \
    bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
    /* Save the imaginary stride of A and B to the auxinfo_t object. */ \
    bli_auxinfo_set_is_a( is_a, &aux ); \
    bli_auxinfo_set_is_b( is_b, &aux ); \
\
    /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
       loop around the microkernel. Here we query the thrinfo_t node for the
       1st (ir) loop around the microkernel. */ \
    thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
    /* Query the number of threads and thread ids for each loop. */ \
    dim_t jr_nt = bli_thread_n_way( thread ); \
    dim_t jr_tid = bli_thread_work_id( thread ); \
    dim_t ir_nt = bli_thread_n_way( caucus ); \
    dim_t ir_tid = bli_thread_work_id( caucus ); \
\
    dim_t jr_start, jr_end; \
    dim_t ir_start, ir_end; \
    dim_t jr_inc, ir_inc; \
\
    /* Determine the thread range and increment for each thrinfo_t node:
       n_iter column steps for the jr loop, m_iter row steps for the ir
       loop. */ \
    bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
    bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
    /* Loop over the n dimension (NR columns at a time). */ \
    for ( j = jr_start; j < jr_end; j += jr_inc ) \
    { \
        ctype* restrict a1; \
        ctype* restrict c11; \
        ctype* restrict b2; \
\
        b1 = b_cast + j * cstep_b; \
        c1 = c_cast + j * cstep_c; \
\
        /* Width of this column step: NR, or n_left on the edge. */ \
        n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
        /* Initialize our next panel of B to be the current panel of B. */ \
        b2 = b1; \
\
        /* Loop over the m dimension (MR rows at a time). */ \
        for ( i = ir_start; i < ir_end; i += ir_inc ) \
        { \
            ctype* restrict a2; \
\
            a1 = a_cast + i * rstep_a; \
            c11 = c1 + i * rstep_c; \
\
            /* Height of this row step: MR, or m_left on the edge. */ \
            m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
            /* Compute the addresses of the next panels of A and B. On the
               last ir iteration a2 wraps back to the start of A and b2
               moves on to the next panel of B; if that is also the last jr
               iteration, b2 wraps back to the start of B. */ \
            a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
            if ( bli_is_last_iter_sl( i, ir_end, ir_tid, ir_nt ) ) \
            { \
                a2 = a_cast; \
                b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
                if ( bli_is_last_iter_sl( j, jr_end, jr_tid, jr_nt ) ) \
                    b2 = b_cast; \
            } \
\
            /* Save addresses of next panels of A and B to the auxinfo_t
               object. */ \
            bli_auxinfo_set_next_a( a2, &aux ); \
            bli_auxinfo_set_next_b( b2, &aux ); \
\
            /* Handle interior and edge cases separately. */ \
            if ( m_cur == MR && n_cur == NR ) \
            { \
                /* Invoke the gemm micro-kernel. */ \
                gemm_ukr \
                ( \
                  k, \
                  alpha_cast, \
                  a1, \
                  b1, \
                  beta_cast, \
                  c11, rs_c, cs_c, \
                  &aux, \
                  cntx \
                ); \
            } \
            else \
            { \
                /* Invoke the gemm micro-kernel, writing a full MR x NR
                   result into ct (beta is zero here). */ \
                gemm_ukr \
                ( \
                  k, \
                  alpha_cast, \
                  a1, \
                  b1, \
                  zero, \
                  ct, rs_ct, cs_ct, \
                  &aux, \
                  cntx \
                ); \
\
                /* Scale the bottom edge of C and add the result from above. */ \
                PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
                                        ct, rs_ct, cs_ct, \
                                        beta_cast, \
                                        c11, rs_c, cs_c ); \
            } \
        } \
    } \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: a1", MR, k, a1, 1, MR, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2sl: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}

INSERT_GENTFUNC_BASIC0( gemm_ker_var2sl )

cython-blis-0.9.1/blis/_src/frame/3/gemm/other/bli_gemm_ker_var5.c000066400000000000000000000237361427272030600246700ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

#define FUNCPTR_T gemm_fp

// Signature shared by the datatype-specific macro-kernels generated by
// GENTFUNC below. Unlike the context-driven variants, the micro-kernel
// address is passed in explicitly as the last argument.
typedef void (*FUNCPTR_T)(
                           dim_t   m,
                           dim_t   n,
                           dim_t   k,
                           void*   alpha,
                           void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
                           void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
                           void*   beta,
                           void*   c, inc_t rs_c, inc_t cs_c,
                           void_fp gemm_ukr
                         );

// Table of macro-kernel function pointers; it is indexed by the execution
// datatype (see ftypes[dt_exec] below).
// NOTE(review): GENARRAY is defined elsewhere -- presumably it fills the
// table with the per-datatype instances of gemm_ker_var5; confirm.
static FUNCPTR_T GENARRAY(ftypes,gemm_ker_var5);


//
// Object-based entry point for gemm macro-kernel variant 5.
//
// Unpacks dimensions, buffers and strides from the operand objects, merges
// the scalars attached to A and B into a single alpha, looks up the gemm
// micro-kernel for the execution datatype in the context, and calls the
// datatype-specific macro-kernel found in ftypes[].
//
//   a, b    - operands; column/row stride, panel dimension and panel stride
//             are queried here.
//   c       - output matrix; supplies m, n, its strides and the beta scalar.
//   cntx    - context queried for the gemm micro-kernel.
//   cntl, thread
//           - not referenced in this function.
//
void bli_gemm_ker_var5( obj_t*  a,
                        obj_t*  b,
                        obj_t*  c,
                        cntx_t* cntx,
                        gemm_t* cntl,
                        gemm_thrinfo_t* thread )
{
    num_t     dt_exec   = bli_obj_exec_dt( c );

    // m and n come from C; k is taken from the width of A.
    dim_t     m         = bli_obj_length( c );
    dim_t     n         = bli_obj_width( c );
    dim_t     k         = bli_obj_width( a );

    void*     buf_a     = bli_obj_buffer_at_off( a );
    inc_t     cs_a      = bli_obj_col_stride( a );
    dim_t     pd_a      = bli_obj_panel_dim( a );
    inc_t     ps_a      = bli_obj_panel_stride( a );

    void*     buf_b     = bli_obj_buffer_at_off( b );
    inc_t     rs_b      = bli_obj_row_stride( b );
    dim_t     pd_b      = bli_obj_panel_dim( b );
    inc_t     ps_b      = bli_obj_panel_stride( b );

    void*     buf_c     = bli_obj_buffer_at_off( c );
    inc_t     rs_c      = bli_obj_row_stride( c );
    inc_t     cs_c      = bli_obj_col_stride( c );

    obj_t     scalar_a;
    obj_t     scalar_b;

    void*     buf_alpha;
    void*     buf_beta;

    FUNCPTR_T f;

    func_t*   gemm_ukrs;
    void_fp   gemm_ukr;

    // Detach and multiply the scalars attached to A and B. The merged
    // value is read back from scalar_b below.
    bli_obj_scalar_detach( a, &scalar_a );
    bli_obj_scalar_detach( b, &scalar_b );
    bli_mulsc( &scalar_a, &scalar_b );

    // Grab the addresses of the internal scalar buffers for the scalar
    // merged above and the scalar attached to C.
    buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
    buf_beta  = bli_obj_internal_scalar_buffer( c );

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_exec];

    // Extract from the context the func_t object containing
    // the gemm micro-kernel function addresses, and then query the
    // function address corresponding to the current datatype.
    gemm_ukrs = bli_cntx_get_l3_ukr( BLIS_GEMM_UKR, cntx );
    gemm_ukr  = bli_func_get_dt( dt_exec, gemm_ukrs );

    // Invoke the function.
    f( m,
       n,
       k,
       buf_alpha,
       buf_a, cs_a, pd_a, ps_a,
       buf_b, rs_b, pd_b, ps_b,
       buf_beta,
       buf_c, rs_c, cs_c,
       gemm_ukr );
}


//
// Datatype-templated macro-kernel, instantiated through
// INSERT_GENTFUNC_BASIC at the end of this definition with gemm_ukr_t
// supplied as ukrtype.
//
// For each NR-wide column step the generated function first packs the
// current micro-panel of B into the stack buffer bp, then loops over the m
// dimension in MR-tall steps, calling the micro-kernel on (a1, bp) for each
// tile. Full tiles are updated directly in C using beta; edge tiles are
// computed into ct with a zero beta and folded into C via xpbys_mxn.
// All iterations are executed by the caller; no thread partitioning is
// performed here.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname, ukrtype ) \
\
void PASTEMAC(ch,varname)( \
                           dim_t   m, \
                           dim_t   n, \
                           dim_t   k, \
                           void*   alpha, \
                           void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
                           void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
                           void*   beta, \
                           void*   c, inc_t rs_c, inc_t cs_c, \
                           void_fp gemm_ukr \
                         ) \
{ \
    /* Cast the micro-kernel address to its function pointer type. */ \
    PASTECH(ch,ukrtype) gemm_ukr_cast = gemm_ukr; \
\
    /* Temporary buffer for incremental packing of B. */ \
    ctype bp[ PASTEMAC(ch,maxkc) * \
              /* !!!! NOTE: This packnr actually needs to be something like maxpacknr if it is to be guaranteed to work in all situations !!!! The right place to define maxpackmr/nr would be in bli_kernel_post_macro_defs.h */ \
              PASTEMAC(ch,packnr) ] \
              __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
\
    /* Temporary C buffer for edge cases; column storage with a column
       stride of maxmr. */ \
    ctype ct[ PASTEMAC(ch,maxmr) * \
              PASTEMAC(ch,maxnr) ] \
              __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
    const inc_t rs_ct = 1; \
    const inc_t cs_ct = PASTEMAC(ch,maxmr); \
\
    /* Alias some constants to simpler names. */ \
    const dim_t MR = pd_a; \
    const dim_t NR = pd_b; \
    const dim_t PACKNR = rs_b; \
\
    ctype* restrict one = PASTEMAC(ch,1); \
    ctype* restrict zero = PASTEMAC(ch,0); \
    ctype* restrict a_cast = a; \
    ctype* restrict b_cast = b; \
    ctype* restrict c_cast = c; \
    ctype* restrict alpha_cast = alpha; \
    ctype* restrict beta_cast = beta; \
    ctype* restrict b1; \
    ctype* restrict c1; \
    ctype* restrict b2; \
\
    dim_t m_iter, m_left; \
    dim_t n_iter, n_left; \
    dim_t i, j; \
    dim_t m_cur; \
    dim_t n_cur; \
    inc_t rstep_a; \
    inc_t cstep_b; \
    inc_t rstep_c, cstep_c; \
    auxinfo_t aux; \
\
    /*
       Assumptions/assertions:
         rs_a == 1
         cs_a == PACKMR
         pd_a == MR
         ps_a == stride to next micro-panel of A
         rs_b == PACKNR
         cs_b == 1
         pd_b == NR
         ps_b == stride to next micro-panel of B
         rs_c == (no assumptions)
         cs_c == (no assumptions)
    */ \
\
    /* If any dimension is zero, return immediately. */ \
    if ( bli_zero_dim3( m, n, k ) ) return; \
\
    /* Clear the temporary C buffer in case it has any infs or NaNs. */ \
    PASTEMAC(ch,set0s_mxn)( MR, NR, \
                            ct, rs_ct, cs_ct ); \
\
    /* Compute number of primary and leftover components of the m and n
       dimensions. */ \
    n_iter = n / NR; \
    n_left = n % NR; \
\
    m_iter = m / MR; \
    m_left = m % MR; \
\
    /* A partial tile at the edge counts as one extra iteration. */ \
    if ( n_left ) ++n_iter; \
    if ( m_left ) ++m_iter; \
\
    /* Determine some increments used to step through A, B, and C. */ \
    rstep_a = ps_a; \
\
    cstep_b = ps_b; \
\
    rstep_c = rs_c * MR; \
    cstep_c = cs_c * NR; \
\
    /* Save the panel strides of A and B to the auxinfo_t object. */ \
    bli_auxinfo_set_ps_a( ps_a, &aux ); \
    bli_auxinfo_set_ps_b( ps_b, &aux ); \
\
    b1 = b_cast; \
    c1 = c_cast; \
\
    /* Since we pack micro-panels of B incrementally, one at a time, the
       address of the next micro-panel of B remains constant. */ \
    b2 = bp; \
\
    /* Save address of next panel of B to the auxinfo_t object. */ \
    bli_auxinfo_set_next_b( b2, &aux ); \
\
    /* Loop over the n dimension (NR columns at a time). */ \
    for ( j = 0; j < n_iter; ++j ) \
    { \
        ctype* restrict a1; \
        ctype* restrict c11; \
\
        a1 = a_cast; \
        c11 = c1; \
\
        /* Width of this column step: NR, or n_left on the edge. */ \
        n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
        /* Incrementally pack a single micro-panel of B. */ \
        PASTEMAC(ch,packm_cxk)( BLIS_NO_CONJUGATE, \
                                n_cur, \
                                k, \
                                one, \
                                b1, 1, rs_b, \
                                bp, PACKNR ); \
\
        /* Loop over the m dimension (MR rows at a time). */ \
        for ( i = 0; i < m_iter; ++i ) \
        { \
            ctype* restrict a2; \
\
            /* Height of this row step: MR, or m_left on the edge. */ \
            m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
            /* Compute the address of the next panel of A; on the last
               iteration it wraps back to the start of A. */ \
            a2 = a1 + rstep_a; \
            if ( bli_is_last_iter( i, m_iter ) ) \
            { \
                a2 = a_cast; \
            } \
\
            /* Save address of next panel of A to the auxinfo_t object. */ \
            bli_auxinfo_set_next_a( a2, &aux ); \
\
            /* Handle interior and edge cases separately. */ \
            if ( m_cur == MR && n_cur == NR ) \
            { \
                /* Invoke the gemm micro-kernel. */ \
                gemm_ukr_cast( k, \
                               alpha_cast, \
                               a1, \
                               bp, \
                               beta_cast, \
                               c11, rs_c, cs_c, \
                               &aux ); \
            } \
            else \
            { \
                /* Invoke the gemm micro-kernel, writing the result into ct
                   (beta is zero here). */ \
                gemm_ukr_cast( k, \
                               alpha_cast, \
                               a1, \
                               bp, \
                               zero, \
                               ct, rs_ct, cs_ct, \
                               &aux ); \
\
                /* Scale the bottom edge of C and add the result from above. */ \
                PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
                                        ct, rs_ct, cs_ct, \
                                        beta_cast, \
                                        c11, rs_c, cs_c ); \
            } \
\
            a1 += rstep_a; \
            c11 += rstep_c; \
        } \
\
        b1 += cstep_b; \
        c1 += cstep_c; \
    } \
\
/*PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var5: b1", k, NR, b1, NR, 1, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var5: a1", MR, k, a1, 1, MR, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC( gemm_ker_var5, gemm_ukr_t )

cython-blis-0.9.1/blis/_src/frame/3/gemm/other/bli_gemm_ker_var5.h000066400000000000000000000050731427272030600246670ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.
Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/


//
// Prototype object-based interface.
//
// Takes the operands a and b, the output c, a context, a control-tree node
// and a thread-info node, all as pointers to framework types.
//
void bli_gemm_ker_var5( obj_t*  a,
                        obj_t*  b,
                        obj_t*  c,
                        cntx_t* cntx,
                        gemm_t* cntl,
                        gemm_thrinfo_t* thread );


//
// Prototype BLAS-like interfaces.
//
// GENTPROT declares one datatype-specific function per expansion: its name
// is formed by PASTEMAC(ch,varname) and it receives raw buffers, strides,
// panel dimensions and the micro-kernel address.
// NOTE(review): INSERT_GENTPROT_BASIC is defined elsewhere -- presumably it
// expands GENTPROT once per supported datatype; confirm.
//
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname)( \
                           dim_t   m, \
                           dim_t   n, \
                           dim_t   k, \
                           void*   alpha, \
                           void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
                           void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
                           void*   beta, \
                           void*   c, inc_t rs_c, inc_t cs_c, \
                           void_fp gemm_ukr \
                         );

INSERT_GENTPROT_BASIC( gemm_ker_var5 )

cython-blis-0.9.1/blis/_src/frame/3/gemmt/000077500000000000000000000000001427272030600202035ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/3/gemmt/bli_gemmt.h000066400000000000000000000032761427272030600223230ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_gemmt_front.h" #include "bli_gemmt_var.h" cython-blis-0.9.1/blis/_src/frame/3/gemmt/bli_gemmt_front.c000066400000000000000000000101151427272030600235140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ) { bli_init_once(); obj_t a_local; obj_t b_local; obj_t c_local; // If C has a zero dimension, return early. if ( bli_obj_has_zero_dim( c ) ) { return; } // If alpha is zero, or if A or B has a zero dimension, scale C by beta // and return early. if ( bli_obj_equals( alpha, &BLIS_ZERO ) || bli_obj_has_zero_dim( a ) || bli_obj_has_zero_dim( b ) ) { bli_scalm( beta, c ); return; } // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); // Set the obj_t buffer field to the location currently implied by the row // and column offsets and then zero the offsets. If any of the original // obj_t's were views into larger matrices, this step effectively makes // those obj_t's "forget" their lineage. bli_obj_reset_origin( &a_local ); bli_obj_reset_origin( &b_local ); bli_obj_reset_origin( &c_local ); // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. 
if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); bli_obj_induce_trans( &a_local ); bli_obj_induce_trans( &b_local ); bli_obj_induce_trans( &c_local ); } // Set the pack schemas within the objects, as appropriate. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. bli_rntm_set_ways_for_op ( BLIS_GEMM, BLIS_LEFT, // ignored for gemm/hemm/symm/gemmt bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ), rntm ); // Invoke the internal back-end via the thread handler. bli_l3_thread_decorator ( bli_l3_int, BLIS_GEMMT, // operation family id alpha, &a_local, &b_local, beta, &c_local, cntx, rntm, cntl ); } cython-blis-0.9.1/blis/_src/frame/3/gemmt/bli_gemmt_front.h000066400000000000000000000036001427272030600235220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

//
// Prototype for the gemmt front end. Takes the two scalars (alpha, beta),
// the three operand objects (a, b, c), and the context, runtime and
// control-tree pointers.
//
void bli_gemmt_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

cython-blis-0.9.1/blis/_src/frame/3/gemmt/bli_gemmt_l_ker_var2.c000066400000000000000000000376101427272030600244230ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

#define FUNCPTR_T gemmt_fp

// Signature shared by the datatype-specific macro-kernels generated by
// GENTFUNC below. It matches the plain gemm macro-kernel signature with the
// diagonal offset of C added as the first argument.
typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffc,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, inc_t is_a,
                  dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, inc_t is_b,
                  dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Table of macro-kernel function pointers; it is indexed by the execution
// datatype (see ftypes[dt_exec] below).
// NOTE(review): GENARRAY is defined elsewhere -- presumably it fills the
// table with the per-datatype instances of gemmt_l_ker_var2; confirm.
static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2);


//
// Object-based entry point for the gemmt macro-kernel (the "_l" variant).
//
// Unpacks the execution datatype, the diagonal offset of C, pack schemas,
// dimensions, buffers and strides from the operand objects, merges the
// scalars attached to A and B into a single alpha, and calls the
// datatype-specific macro-kernel found in ftypes[].
//
//   a, b    - operands; their pack schema, column/row stride, imaginary
//             stride, panel dimension and panel stride are queried here.
//   c       - output matrix; supplies m, n, its diagonal offset, its
//             strides and the beta scalar.
//   cntx, rntm, thread
//           - forwarded unchanged to the typed macro-kernel.
//   cntl    - not referenced in this function.
//
void bli_gemmt_l_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
    num_t     dt_exec   = bli_obj_exec_dt( c );

    doff_t    diagoffc  = bli_obj_diag_offset( c );

    pack_t    schema_a  = bli_obj_pack_schema( a );
    pack_t    schema_b  = bli_obj_pack_schema( b );

    // m and n come from C; k is taken from the width of A.
    dim_t     m         = bli_obj_length( c );
    dim_t     n         = bli_obj_width( c );
    dim_t     k         = bli_obj_width( a );

    void*     buf_a     = bli_obj_buffer_at_off( a );
    inc_t     cs_a      = bli_obj_col_stride( a );
    inc_t     is_a      = bli_obj_imag_stride( a );
    dim_t     pd_a      = bli_obj_panel_dim( a );
    inc_t     ps_a      = bli_obj_panel_stride( a );

    void*     buf_b     = bli_obj_buffer_at_off( b );
    inc_t     rs_b      = bli_obj_row_stride( b );
    inc_t     is_b      = bli_obj_imag_stride( b );
    dim_t     pd_b      = bli_obj_panel_dim( b );
    inc_t     ps_b      = bli_obj_panel_stride( b );

    void*     buf_c     = bli_obj_buffer_at_off( c );
    inc_t     rs_c      = bli_obj_row_stride( c );
    inc_t     cs_c      = bli_obj_col_stride( c );

    obj_t     scalar_a;
    obj_t     scalar_b;

    void*     buf_alpha;
    void*     buf_beta;

    FUNCPTR_T f;

    // Detach and multiply the scalars attached to A and B. The merged
    // value is read back from scalar_b below.
    bli_obj_scalar_detach( a, &scalar_a );
    bli_obj_scalar_detach( b, &scalar_b );
    bli_mulsc( &scalar_a, &scalar_b );

    // Grab the addresses of the internal scalar buffers for the scalar
    // merged above and the scalar attached to C.
    buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
    buf_beta  = bli_obj_internal_scalar_buffer( c );

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_exec];

    // Invoke the function.
    f( diagoffc,
       schema_a,
       schema_b,
       m,
       n,
       k,
       buf_alpha,
       buf_a, cs_a, is_a,
              pd_a, ps_a,
       buf_b, rs_b, is_b,
              pd_b, ps_b,
       buf_beta,
       buf_c, rs_c, cs_c,
       cntx,
       rntm,
       thread );
}


#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffc, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Alias some constants to simpler names. */ \
    const dim_t MR = pd_a; \
    const dim_t NR = pd_b; \
    /*const dim_t PACKMR = cs_a;*/ \
    /*const dim_t PACKNR = rs_b;*/ \
\
    /* Query the context for the micro-kernel address and cast it to its
       function pointer type. */ \
    PASTECH(ch,gemm_ukr_ft) \
    gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
    /* Temporary C buffer for edge cases. Note that the strides of this
       temporary buffer are set so that they match the storage of the
       original C matrix. For example, if C is column-stored, ct will be
       column-stored as well.
*/ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffc_ij; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t i, j, ip; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current panel of C is entirely above the diagonal, it is not stored. So we do nothing. */ \ if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ \ /* If there is a zero region above where the diagonal of C intersects the left edge of the panel, adjust the pointer to C and A and treat this case as if the diagonal offset were zero. */ \ if ( diagoffc < 0 ) \ { \ ip = -diagoffc / MR; \ i = ip * MR; \ m = m - i; \ diagoffc = -diagoffc % MR; \ c_cast = c_cast + (i )*rs_c; \ a_cast = a_cast + (ip )*ps_a; \ } \ \ /* If there is a zero region to the right of where the diagonal of C intersects the bottom of the panel, shrink it to prevent "no-op" iterations from executing. */ \ if ( diagoffc + m < n ) \ { \ n = diagoffc + m; \ } \ \ /* Clear the temporary C buffer in case it has any infs or NaNs. 
*/ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ \ /* Save the desired output datatype (indicating no typecasting). */ \ /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ \ /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) loop around the microkernel. Here we query the thrinfo_t node for the 1st (ir) loop around the microkernel. */ \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ /* Query the number of threads and thread ids for each loop. */ \ dim_t jr_nt = bli_thread_n_way( thread ); \ dim_t jr_tid = bli_thread_work_id( thread ); \ dim_t ir_nt = bli_thread_n_way( caucus ); \ dim_t ir_tid = bli_thread_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ dim_t jr_inc, ir_inc; \ \ /* Note that we partition the 2nd loop into two regions: the rectangular part of C, and the triangular portion. */ \ dim_t n_iter_rct; \ dim_t n_iter_tri; \ \ if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ { \ /* If the entire panel of C does not intersect the diagonal, there is no triangular region, and therefore we can skip the second set of loops. 
*/ \ n_iter_rct = n_iter; \ n_iter_tri = 0; \ } \ else \ { \ /* If the panel of C does intersect the diagonal, compute the number of iterations in the rectangular region by dividing NR into the diagonal offset. Any remainder from this integer division is discarded, which is what we want. That is, we want the rectangular region to contain as many columns of whole microtiles as possible without including any microtiles that intersect the diagonal. The number of iterations in the triangular (or trapezoidal) region is computed as the remaining number of iterations in the n dimension. */ \ n_iter_rct = diagoffc / NR; \ n_iter_tri = n_iter - n_iter_rct; \ } \ \ /* Determine the thread range and increment for the 2nd and 1st loops for the initial rectangular region of C (if it exists). NOTE: The definition of bli_thread_range_jrir() will depend on whether slab or round-robin partitioning was requested at configure-time. */ \ bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ b1 = b_cast + j * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ a1 = a_cast + i * rstep_a; \ c11 = c1 + i * rstep_c; \ \ /* No need to compute the diagonal offset for the rectangular region. */ \ /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* If the diagonal intersects the current MR x NR submatrix, we compute it the temporary buffer and then add in the elements on or below the diagonal. Otherwise, if the submatrix is strictly below the diagonal, we compute and store as we normally would. And if we're strictly above the diagonal, we do nothing and continue. */ \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ m_cur, \ n_cur, \ k, \ alpha_cast, \ a1, \ b1, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ } \ } \ \ /* If there is no triangular region, then we're done. */ \ if ( n_iter_tri == 0 ) return; \ \ /* Use round-robin assignment of micropanels to threads in the 2nd loop and the default (slab or rr) partitioning in the 1st loop for the remaining triangular region of C. */ \ bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ \ /* Advance the start and end iteration offsets for the triangular region by the number of iterations used for the rectangular region. */ \ jr_start += n_iter_rct; \ jr_end += n_iter_rct; \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ b1 = b_cast + j * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). 
*/ \ for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ a1 = a_cast + i * rstep_a; \ c11 = c1 + i * rstep_c; \ \ /* Compute the diagonal offset for the submatrix at (i,j). */ \ diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* If the diagonal intersects the current MR x NR submatrix, we compute it the temporary buffer and then add in the elements on or below the diagonal. Otherwise, if the submatrix is strictly below the diagonal, we compute and store as we normally would. And if we're strictly above the diagonal, we do nothing and continue. */ \ if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ MR, \ NR, \ k, \ alpha_cast, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Scale C and add the result to only the stored part. */ \ PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ m_cur, n_cur, \ ct, rs_ct, cs_ct, \ beta_cast, \ c11, rs_c, cs_c ); \ } \ else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ { \ /* Invoke the gemm micro-kernel. 
*/ \ gemm_ukr \ ( \ m_cur, \ n_cur, \ k, \ alpha_cast, \ a1, \ b1, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/3/gemmt/bli_gemmt_u_ker_var2.c000066400000000000000000000401641427272030600244320ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

// Signature shared by the datatype-specific macrokernels that GENTFUNC
// instantiates below; the object-based wrapper calls through it.
#define FUNCPTR_T gemmt_fp

typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffc,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, inc_t is_a,
                  dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, inc_t is_b,
                  dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Table of typed macrokernels; indexed below by the execution datatype of C.
static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2);


// Object-based front end for the upper-stored gemmt macrokernel. It reads
// the dimensions, strides, pack schemas and panel information from the
// objects a, b and c, merges the scalars attached to a and b into a single
// alpha, takes beta from c, and dispatches to the typed macrokernel chosen
// by C's execution datatype. The cntl argument is not referenced here.
void bli_gemmt_u_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	num_t dt_exec = bli_obj_exec_dt( c );

	doff_t diagoffc = bli_obj_diag_offset( c );

	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	dim_t m = bli_obj_length( c );
	dim_t n = bli_obj_width( c );
	dim_t k = bli_obj_width( a );

	void* buf_a = bli_obj_buffer_at_off( a );
	inc_t cs_a = bli_obj_col_stride( a );
	inc_t is_a = bli_obj_imag_stride( a );
	dim_t pd_a = bli_obj_panel_dim( a );
	inc_t ps_a = bli_obj_panel_stride( a );

	void* buf_b = bli_obj_buffer_at_off( b );
	inc_t rs_b = bli_obj_row_stride( b );
	inc_t is_b = bli_obj_imag_stride( b );
	dim_t pd_b = bli_obj_panel_dim( b );
	inc_t ps_b = bli_obj_panel_stride( b );

	void* buf_c = bli_obj_buffer_at_off( c );
	inc_t rs_c = bli_obj_row_stride( c );
	inc_t cs_c = bli_obj_col_stride( c );

	obj_t scalar_a;
	obj_t scalar_b;

	void* buf_alpha;
	void* buf_beta;

	FUNCPTR_T f;

	// Detach and multiply the scalars attached to A and B.
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	// Grab the addresses of the internal scalar buffers for the scalar
	// merged above and the scalar attached to C.
	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	buf_beta = bli_obj_internal_scalar_buffer( c );

	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_exec];

	// Invoke the function.
	f( diagoffc,
	   schema_a,
	   schema_b,
	   m,
	   n,
	   k,
	   buf_alpha,
	   buf_a, cs_a, is_a,
	          pd_a, ps_a,
	   buf_b, rs_b, is_b,
	          pd_b, ps_b,
	   buf_beta,
	   buf_c, rs_c, cs_c,
	   cntx,
	   rntm,
	   thread );
}


//
// Typed macrokernel for gemmt where only the upper-stored part of the
// current panel of C is updated. pd_a and pd_b are used as the register
// blocksizes MR and NR; ps_a and ps_b are the strides between micro-panels
// of A and B; rntm is not referenced by the body.
//
// The n dimension is split into a triangular region (the leading columns of
// micro-tiles, each tested against the diagonal) followed by a rectangular
// region in which every micro-tile is computed and stored without a
// diagonal test.
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffc, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t MR = pd_a; \
	const dim_t NR = pd_b; \
	/*const dim_t PACKMR = cs_a;*/ \
	/*const dim_t PACKNR = rs_b;*/ \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	          gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for micro-tiles that intersect the diagonal. Its
	   strides follow the storage preference reported for the micro-kernel
	   (column-major when col_pref is set, row-major otherwise), not the
	   strides of C. */ \
	ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
	          / sizeof( ctype ) ] \
	          __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t rs_ct = ( col_pref ? 1 : NR ); \
	const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
	ctype* restrict zero = PASTEMAC(ch,0); \
	ctype* restrict a_cast = a; \
	ctype* restrict b_cast = b; \
	ctype* restrict c_cast = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t diagoffc_ij; \
	dim_t m_iter, m_left; \
	dim_t n_iter, n_left; \
	dim_t m_cur; \
	dim_t n_cur; \
	dim_t i, j, jp; \
	inc_t rstep_a; \
	inc_t cstep_b; \
	inc_t rstep_c, cstep_c; \
	auxinfo_t aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current panel of C is entirely below the diagonal,
	   it is not stored. So we do nothing. */ \
	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
\
	/* If there is a zero region to the left of where the diagonal of C
	   intersects the top edge of the panel, adjust the pointer to C and B
	   and treat this case as if the diagonal offset were zero.
	   NOTE: It's possible that after this pruning that the diagonal offset
	   is still positive (though it is guaranteed to be less than NR). */ \
	if ( diagoffc > 0 ) \
	{ \
		jp       = diagoffc / NR; \
		j        = jp * NR; \
		n        = n - j; \
		diagoffc = diagoffc % NR; \
		c_cast   = c_cast + (j  )*cs_c; \
		b_cast   = b_cast + (jp )*ps_b; \
	} \
\
	/* If there is a zero region below where the diagonal of C intersects
	   the right edge of the panel, shrink it to prevent "no-op" iterations
	   from executing. */ \
	if ( -diagoffc + n < m ) \
	{ \
		m = -diagoffc + n; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_a( is_a, &aux ); \
	bli_auxinfo_set_is_b( is_b, &aux ); \
\
	/* Save the desired output datatype (indicating no typecasting). */ \
	/*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \
\
	/* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
	   loop around the microkernel. Here we query the thrinfo_t node for the
	   1st (ir) loop around the microkernel. */ \
	thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \
\
	/* Query the number of threads and thread ids for each loop. */ \
	dim_t jr_nt  = bli_thread_n_way( thread ); \
	dim_t jr_tid = bli_thread_work_id( thread ); \
	dim_t ir_nt  = bli_thread_n_way( caucus ); \
	dim_t ir_tid = bli_thread_work_id( caucus ); \
\
	dim_t jr_start, jr_end; \
	dim_t ir_start, ir_end; \
	dim_t jr_inc,   ir_inc; \
\
	/* Note that we partition the 2nd loop into two regions: the triangular
	   part of C, and the rectangular portion. */ \
	dim_t n_iter_tri; \
	dim_t n_iter_rct; \
\
	if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \
	{ \
		/* If the entire panel of C does not intersect the diagonal, there is
		   no triangular region, and therefore we can skip the first set of
		   loops. */ \
		n_iter_tri = 0; \
		n_iter_rct = n_iter; \
	} \
	else \
	{ \
		/* If the panel of C does intersect the diagonal, compute the number of
		   iterations in the triangular (or trapezoidal) region by dividing NR
		   into the number of rows in C (plus the diagonal offset). A non-zero
		   remainder means we need to add one additional iteration. That is, we
		   want the triangular region to contain as few columns of whole
		   microtiles as possible while still including all microtiles that
		   intersect the diagonal. The number of iterations in the rectangular
		   region is computed as the remaining number of iterations in the n
		   dimension. */ \
		n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \
		n_iter_rct = n_iter - n_iter_tri; \
	} \
\
	/* Use round-robin assignment of micropanels to threads in the 2nd loop
	   and the default (slab or rr) partitioning in the 1st loop for the
	   initial triangular region of C (if it exists). */ \
	bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
	bli_thread_range_jrir   ( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < jr_end; j += jr_inc ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* Interior loop over the m dimension (MR rows at a time). */ \
		for ( i = ir_start; i < ir_end; i += ir_inc ) \
		{ \
			ctype* restrict a2; \
\
			a1  = a_cast + i * rstep_a; \
			c11 = c1     + i * rstep_c; \
\
			/* Compute the diagonal offset for the submatrix at (i,j). */ \
			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* Compute the addresses of the next panels of A and B. */ \
			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
			{ \
				a2 = a_cast; \
				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
				if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
					b2 = b_cast; \
			} \
\
			/* Save addresses of next panels of A and B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_next_a( a2, &aux ); \
			bli_auxinfo_set_next_b( b2, &aux ); \
\
			/* If the diagonal intersects the current MR x NR submatrix, we
			   compute it in the temporary buffer and then add in the elements
			   on or above the diagonal.
			   Otherwise, if the submatrix is strictly above the diagonal,
			   we compute and store as we normally would.
			   And if we're strictly below the diagonal, we do nothing and
			   continue. */ \
			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
			{ \
				/* Invoke the gemm micro-kernel. */ \
				gemm_ukr \
				( \
				  MR, \
				  NR, \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  zero, \
				  ct, rs_ct, cs_ct, \
				  &aux, \
				  cntx \
				); \
\
				/* Scale C and add the result to only the stored part. */ \
				PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
				                          m_cur, n_cur, \
				                          ct,  rs_ct, cs_ct, \
				                          beta_cast, \
				                          c11, rs_c,  cs_c ); \
			} \
			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
			{ \
				/* Invoke the gemm micro-kernel. */ \
				gemm_ukr \
				( \
				  m_cur, \
				  n_cur, \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  beta_cast, \
				  c11, rs_c, cs_c, \
				  &aux, \
				  cntx \
				); \
			} \
		} \
	} \
\
	/* If there is no rectangular region, then we're done. */ \
	if ( n_iter_rct == 0 ) return; \
\
	/* Determine the thread range and increment for the 2nd loop of the
	   remaining rectangular region of C (and also use default partitioning
	   for the 1st loop).
	   NOTE: The definition of bli_thread_range_jrir() will depend on whether
	   slab or round-robin partitioning was requested at configure-time. */ \
	bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
	/* Advance the start and end iteration offsets for the rectangular region
	   by the number of iterations used for the triangular region. */ \
	jr_start += n_iter_tri; \
	jr_end   += n_iter_tri; \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < jr_end; j += jr_inc ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* Interior loop over the m dimension (MR rows at a time). */ \
		for ( i = ir_start; i < ir_end; i += ir_inc ) \
		{ \
			ctype* restrict a2; \
\
			a1  = a_cast + i * rstep_a; \
			c11 = c1     + i * rstep_c; \
\
			/* No need to compute the diagonal offset for the rectangular
			   region. */ \
			/*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \
\
			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* Compute the addresses of the next panels of A and B. */ \
			a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \
			if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \
			{ \
				a2 = a_cast; \
				b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \
				if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
					b2 = b_cast; \
			} \
\
			/* Save addresses of next panels of A and B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_next_a( a2, &aux ); \
			bli_auxinfo_set_next_b( b2, &aux ); \
\
			/* Every microtile in the rectangular region is treated as lying
			   strictly above the diagonal, so no diagonal test is made: we
			   compute and store directly into C. */ \
			{ \
				/* Invoke the gemm micro-kernel. */ \
				gemm_ukr \
				( \
				  m_cur, \
				  n_cur, \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  beta_cast, \
				  c11, rs_c, cs_c, \
				  &aux, \
				  cntx \
				); \
			} \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 )

cython-blis-0.9.1/blis/_src/frame/3/gemmt/bli_gemmt_var.h000066400000000000000000000054501427272030600231670ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.
Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/


//
// Prototype object-based interfaces.
//

// GENPROT( opname ) declares one object-based macrokernel entry point. The
// name is formed by PASTEMAC0 from opname, and all three variants declared
// below share this seven-argument signature.
// NOTE: The second operand is named 'ah' in these prototypes, whereas the
// definition of bli_gemmt_u_ker_var2() names it 'b'. Parameter names in a
// prototype do not have to match the definition.
#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t*  a, \
       obj_t*  ah, \
       obj_t*  c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       cntl_t* cntl, \
       thrinfo_t* thread \
     );

// Dispatcher that selects between the two variants below.
GENPROT( gemmt_x_ker_var2 )

// Lower- and upper-stored variants.
GENPROT( gemmt_l_ker_var2 )
GENPROT( gemmt_u_ker_var2 )


//
// Prototype BLAS-like interfaces with void pointer operands.
//

// GENTPROT( ctype, ch, varname ) declares one datatype-specific macrokernel.
// The name is formed by PASTEMAC from the type character ch and varname.
// Operands are passed as void pointers together with their strides and
// panel information.
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffc, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

cython-blis-0.9.1/blis/_src/frame/3/gemmt/bli_gemmt_x_ker_var2.c000066400000000000000000000045221427272030600244330ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" static l3_var_oft vars[2] = { bli_gemmt_l_ker_var2, bli_gemmt_u_ker_var2, }; void bli_gemmt_x_ker_var2 ( obj_t* a, obj_t* ah, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { dim_t uplo; l3_var_oft f; // Set a bool based on the uplo field of C's root object. if ( bli_obj_root_is_lower( c ) ) uplo = 0; else uplo = 1; // Index into the variant array to extract the correct function pointer. f = vars[uplo]; // Call the macrokernel. f ( a, ah, c, cntx, rntm, cntl, thread ); } cython-blis-0.9.1/blis/_src/frame/3/gemmt/other/000077500000000000000000000000001427272030600213245ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c000066400000000000000000000300611427272030600255350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T gemmt_fp typedef void (*FUNCPTR_T) ( doff_t diagoffc, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha, void* a, inc_t cs_a, inc_t is_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, inc_t is_b, dim_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2); void bli_gemmt_l_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffc = bli_obj_diag_offset( c ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); inc_t is_a = bli_obj_imag_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); 
inc_t rs_b = bli_obj_row_stride( b ); inc_t is_b = bli_obj_imag_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffc, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, is_a, pd_a, ps_a, buf_b, rs_b, is_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, rntm, thread ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffc, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, inc_t is_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, inc_t is_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ /*const dim_t PACKMR = cs_a;*/ \ /*const dim_t PACKNR = rs_b;*/ \ \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. 
Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffc_ij; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t i, j, ip; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current panel of C is entirely above the diagonal, it is not stored. So we do nothing. */ \ if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ \ /* If there is a zero region above where the diagonal of C intersects the left edge of the panel, adjust the pointer to C and A and treat this case as if the diagonal offset were zero. */ \ if ( diagoffc < 0 ) \ { \ ip = -diagoffc / MR; \ i = ip * MR; \ m = m - i; \ diagoffc = -diagoffc % MR; \ c_cast = c_cast + (i )*rs_c; \ a_cast = a_cast + (ip )*ps_a; \ } \ \ /* If there is a zero region to the right of where the diagonal of C intersects the bottom of the panel, shrink it to prevent "no-op" iterations from executing. 
*/ \ if ( diagoffc + m < n ) \ { \ n = diagoffc + m; \ } \ \ /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( is_a, &aux ); \ bli_auxinfo_set_is_b( is_b, &aux ); \ \ b1 = b_cast; \ c1 = c_cast; \ \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ dim_t jr_num_threads = bli_thread_n_way( thread ); \ dim_t jr_thread_id = bli_thread_work_id( thread ); \ dim_t ir_num_threads = bli_thread_n_way( caucus ); \ dim_t ir_thread_id = bli_thread_work_id( caucus ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ b1 = b_cast + j * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* Interior loop over the m dimension (MR rows at a time). */ \ for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \ { \ ctype* restrict a2; \ \ a1 = a_cast + i * rstep_a; \ c11 = c1 + i * rstep_c; \ \ /* Compute the diagonal offset for the submatrix at (i,j). */ \ diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? 
MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = bli_gemmt_get_next_a_upanel( caucus, a1, rstep_a ); \ if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \ { \ a2 = a_cast; \ b2 = bli_gemmt_get_next_b_upanel( thread, b1, cstep_b ); \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* If the diagonal intersects the current MR x NR submatrix, we compute it the temporary buffer and then add in the elements on or below the diagonal. Otherwise, if the submatrix is strictly below the diagonal, we compute and store as we normally would. And if we're strictly above the diagonal, we do nothing and continue. */ \ if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Scale C and add the result to only the stored part. */ \ PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ m_cur, n_cur, \ ct, rs_ct, cs_ct, \ beta_cast, \ c11, rs_c, cs_c ); \ } \ else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ { \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Scale the edge of C and add the result. 
*/ \ PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ beta_cast, \ c11, rs_c, cs_c ); \ } \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c000066400000000000000000000300631427272030600255500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

#define FUNCPTR_T gemmt_fp

// Signature shared by the typed macro-kernels generated by GENTFUNC below;
// bli_gemmt_u_ker_var2() selects one of them from the ftypes array.
typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffc,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, inc_t is_a,
                  dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, inc_t is_b,
                  dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Table of typed implementations, indexed by execution datatype (see the
// ftypes[dt_exec] lookup below).
static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2);


// Object-based front end of the upper-stored gemmt macro-kernel.
//
// Reads the dimensions, strides, panel metadata, pack schemas and diagonal
// offset out of the a, b and c objects, folds the scalars attached to A and B
// into a single alpha, uses the scalar attached to C as beta, and forwards
// everything to the typed kernel selected by C's execution datatype.
//
// The cntl argument is not read by this function.
void bli_gemmt_u_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	num_t     dt_exec   = bli_obj_exec_dt( c );

	doff_t    diagoffc  = bli_obj_diag_offset( c );

	pack_t    schema_a  = bli_obj_pack_schema( a );
	pack_t    schema_b  = bli_obj_pack_schema( b );

	// m and n come from C; k is the width of A.
	dim_t     m         = bli_obj_length( c );
	dim_t     n         = bli_obj_width( c );
	dim_t     k         = bli_obj_width( a );

	void*     buf_a     = bli_obj_buffer_at_off( a );
	inc_t     cs_a      = bli_obj_col_stride( a );
	inc_t     is_a      = bli_obj_imag_stride( a );
	dim_t     pd_a      = bli_obj_panel_dim( a );
	inc_t     ps_a      = bli_obj_panel_stride( a );

	void*     buf_b     = bli_obj_buffer_at_off( b );
	inc_t     rs_b      = bli_obj_row_stride( b );
	inc_t     is_b      = bli_obj_imag_stride( b );
	dim_t     pd_b      = bli_obj_panel_dim( b );
	inc_t     ps_b      = bli_obj_panel_stride( b );

	void*     buf_c     = bli_obj_buffer_at_off( c );
	inc_t     rs_c      = bli_obj_row_stride( c );
	inc_t     cs_c      = bli_obj_col_stride( c );

	obj_t     scalar_a;
	obj_t     scalar_b;

	void*     buf_alpha;
	void*     buf_beta;

	FUNCPTR_T f;

	// Detach and multiply the scalars attached to A and B.
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	// Grab the addresses of the internal scalar buffers for the scalar
	// merged above and the scalar attached to C.
	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	buf_beta  = bli_obj_internal_scalar_buffer( c );

	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_exec];

	// Invoke the function.
	f( diagoffc,
	   schema_a,
	   schema_b,
	   m,
	   n,
	   k,
	   buf_alpha,
	   buf_a, cs_a, is_a,
	          pd_a, ps_a,
	   buf_b, rs_b, is_b,
	          pd_b, ps_b,
	   buf_beta,
	   buf_c, rs_c, cs_c,
	   cntx,
	   rntm,
	   thread );
}


// -----------------------------------------------------------------------------
// GENTFUNC: typed macro-kernel for the upper-stored gemmt variant.
//
// Each instantiation (via INSERT_GENTFUNC_BASIC0 below) defines a function
// named PASTEMAC(ch,varname) that updates the part of the m x n panel C lying
// on or above the diagonal described by diagoffc. The (j,i) loop nest walks
// NR x MR tiles of C, with iterations distributed round-robin using the work
// ids / n-way values read from `thread` (jr loop) and its sub-node (ir loop).
// Tiles crossed by the diagonal go through the stack buffer ct and are merged
// with xpbys_mxn_u; tiles strictly above the diagonal are written directly
// (full tiles) or via ct (edge tiles); tiles strictly below are skipped.
//
// cs_a, rs_b and rntm are not read in the body.
// -----------------------------------------------------------------------------

#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffc, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread  \
     ) \
{ \
	const num_t     dt         = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t     MR         = pd_a; \
	const dim_t     NR         = pd_b; \
	/*const dim_t     PACKMR     = cs_a;*/ \
	/*const dim_t     PACKNR     = rs_b;*/ \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Note that the strides of this
	   temporary buffer are set so that they match the storage of the
	   original C matrix. For example, if C is column-stored, ct will be
	   column-stored as well. */ \
	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool      col_pref   = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t     rs_ct      = ( col_pref ? 1 : NR ); \
	const inc_t     cs_ct      = ( col_pref ? MR : 1 ); \
\
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t          diagoffc_ij; \
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           m_cur; \
	dim_t           n_cur; \
	dim_t           i, j, jp; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
	auxinfo_t       aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current panel of C is entirely below the diagonal,
	   it is not stored. So we do nothing. */ \
	if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \
\
	/* If there is a zero region to the left of where the diagonal of C
	   intersects the top edge of the panel, skip the whole micro-panels of
	   B (and the matching columns of C) that fall inside it; the diagonal
	   offset that remains is diagoffc % NR. */ \
	if ( diagoffc > 0 ) \
	{ \
		jp       = diagoffc / NR; \
		j        = jp * NR; \
		n        = n - j; \
		diagoffc = diagoffc % NR; \
		c_cast   = c_cast + (j  )*cs_c; \
		b_cast   = b_cast + (jp )*ps_b; \
	} \
\
	/* If there is a zero region below where the diagonal of C intersects
	   the right edge of the panel, shrink it to prevent "no-op" iterations
	   from executing. */ \
	if ( -diagoffc + n < m ) \
	{ \
		m = -diagoffc + n; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	/* A partial (edge) tile counts as one more iteration. */ \
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_a( is_a, &aux ); \
	bli_auxinfo_set_is_b( is_b, &aux ); \
\
	b1 = b_cast; \
	c1 = c_cast; \
\
	/* The jr loop is partitioned using `thread`; the ir loop uses its
	   sub-node. */ \
	thrinfo_t* caucus    = bli_thrinfo_sub_node( thread ); \
	dim_t jr_num_threads = bli_thread_n_way( thread ); \
	dim_t jr_thread_id   = bli_thread_work_id( thread ); \
	dim_t ir_num_threads = bli_thread_n_way( caucus ); \
	dim_t ir_thread_id   = bli_thread_work_id( caucus ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_thread_id; j < n_iter; j += jr_num_threads ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* Interior loop over the m dimension (MR rows at a time). */ \
		for ( i = ir_thread_id; i < m_iter; i += ir_num_threads ) \
		{ \
			ctype* restrict a2; \
\
			a1  = a_cast + i * rstep_a; \
			c11 = c1     + i * rstep_c; \
\
			/* Compute the diagonal offset for the submatrix at (i,j). */ \
			diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \
\
			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* Compute the addresses of the next panels of A and B. On the
			   last ir iteration A wraps to its first panel and B advances;
			   on the last jr iteration B wraps as well. */ \
			a2 = bli_gemmt_get_next_a_upanel( caucus, a1, rstep_a ); \
			if ( bli_is_last_iter( i, m_iter, ir_thread_id, ir_num_threads ) ) \
			{ \
				a2 = a_cast; \
				b2 = bli_gemmt_get_next_b_upanel( thread, b1, cstep_b ); \
				if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
					b2 = b_cast; \
			} \
\
			/* Save addresses of next panels of A and B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_next_a( a2, &aux ); \
			bli_auxinfo_set_next_b( b2, &aux ); \
\
			/* If the diagonal intersects the current MR x NR submatrix, we
			   compute it in the temporary buffer and then add in the
			   elements on or above the diagonal (the stored part, merged
			   via xpbys_mxn_u).
			   Otherwise, if the submatrix is strictly above the diagonal,
			   we compute and store as we normally would.
			   And if we're strictly below the diagonal, we do nothing and
			   continue. */ \
			if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
			{ \
				/* Invoke the gemm micro-kernel. */ \
				gemm_ukr \
				( \
				  k, \
				  alpha_cast, \
				  a1, \
				  b1, \
				  zero, \
				  ct, rs_ct, cs_ct, \
				  &aux, \
				  cntx  \
				); \
\
				/* Scale C and add the result to only the stored part. */ \
				PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \
				                          m_cur, n_cur, \
				                          ct,  rs_ct, cs_ct, \
				                          beta_cast, \
				                          c11, rs_c,  cs_c ); \
			} \
			else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \
			{ \
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  beta_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx  \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx  \
					); \
\
					/* Scale the edge of C and add the result. */ \
					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
					                        ct,  rs_ct, cs_ct, \
					                        beta_cast, \
					                        c11, rs_c,  cs_c ); \
				} \
			} \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 )

cython-blis-0.9.1/blis/_src/frame/3/hemm/000077500000000000000000000000001427272030600200205ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/3/hemm/bli_hemm.h000066400000000000000000000032461427272030600217520ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_hemm_front.h" cython-blis-0.9.1/blis/_src/frame/3/hemm/bli_hemm_front.c000066400000000000000000000137661427272030600231650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_hemm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ) { bli_init_once(); obj_t a_local; obj_t b_local; obj_t c_local; // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // Alias A, B, and C in case we need to apply transformations. 
bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); // Set the obj_t buffer field to the location currently implied by the row // and column offsets and then zero the offsets. If any of the original // obj_t's were views into larger matrices, this step effectively makes // those obj_t's "forget" their lineage. bli_obj_reset_origin( &a_local ); bli_obj_reset_origin( &b_local ); bli_obj_reset_origin( &c_local ); #ifdef BLIS_DISABLE_HEMM_RIGHT // NOTE: This case casts right-side hemm in terms of left side. This is // necessary when the current subconfiguration uses a gemm microkernel // that assumes that the packing kernel will have already duplicated // (broadcast) element of B in the packed copy of B. Supporting // duplication within the logic that packs micropanels from Hermitian/ // matrices would be ugly, and so we simply don't support it. As a // consequence, those subconfigurations need a way to force the Hermitian // matrix to be on the left (and thus the general matrix to the on the // right). So our solution is that in those cases, the subconfigurations // simply #define BLIS_DISABLE_HEMM_RIGHT. // NOTE: This case casts right-side hemm in terms of left side. This can // lead to the microkernel being executed on an output matrix with the // microkernel's general stride IO case (unless the microkernel supports // both both row and column IO cases as well). // If A is being multiplied from the right, transpose all operands // so that we can perform the computation as if A were being multiplied // from the left. 
if ( bli_is_right( side ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &a_local ); bli_obj_induce_trans( &b_local ); bli_obj_induce_trans( &c_local ); } #else // NOTE: This case computes right-side hemm/symm natively by packing // elements of the Hermitian/symmetric matrix A to micropanels of the // right-hand packed matrix operand "B", and elements of the general // matrix B to micropanels of the left-hand packed matrix operand "A". // This code path always gives us the opportunity to transpose the // entire operation so that the effective storage format of the output // matrix matches the microkernel's output preference. Thus, from a // performance perspective, this case is preferred. // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. //if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT // be enabled. See issue #342 comments. if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_toggle_conj( &a_local ); bli_obj_induce_trans( &b_local ); bli_obj_induce_trans( &c_local ); } // If the Hermitian/symmetric matrix A is being multiplied from the right, // swap A and B so that the Hermitian/symmetric matrix will actually be on // the right. if ( bli_is_right( side ) ) { bli_obj_swap( &a_local, &b_local ); } #endif // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. 
bli_rntm_set_ways_for_op ( BLIS_HEMM, BLIS_LEFT, // ignored for gemm/hemm/symm bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ), rntm ); // Invoke the internal back-end. bli_l3_thread_decorator ( bli_l3_int, BLIS_GEMM, // operation family id alpha, &a_local, &b_local, beta, &c_local, cntx, rntm, cntl ); } cython-blis-0.9.1/blis/_src/frame/3/hemm/bli_hemm_front.h000066400000000000000000000035401427272030600231570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_hemm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); cython-blis-0.9.1/blis/_src/frame/3/old/000077500000000000000000000000001427272030600176505ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/3/old/bli_l3_ft_ex.h000066400000000000000000000134701427272030600223570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
   IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#ifndef BLIS_L3_FT_EX_H
#define BLIS_L3_FT_EX_H


//
// -- Level-3 expert function types --------------------------------------------
//

// Each section below redefines GENTDEF (or GENTDEFR) as a function-pointer
// typedef for one operation signature and then instantiates it with
// INSERT_GENTDEF / INSERT_GENTDEFR for the named operation(s). The typedef
// name is formed by pasting ch, opname, BLIS_TAPI_EX_SUF and tsuf. Every
// signature ends with the cntx_t* / rntm_t* pair.

// gemm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTDEF( gemm )


// hemm, symm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       conj_t  conja, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTDEF( hemm )
INSERT_GENTDEF( symm )


// herk (alpha and beta use the ctype_r type)

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       dim_t    m, \
       dim_t    k, \
       ctype_r* alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTDEFR( herk )


// her2k (alpha uses ctype; beta uses ctype_r)

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       trans_t  transb, \
       dim_t    m, \
       dim_t    k, \
       ctype*   alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype*   b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTDEFR( her2k )


// syrk (declared through GENTDEFR, although ctype_r is not used in the
// signature)

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       dim_t    m, \
       dim_t    k, \
       ctype*   alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype*   beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTDEFR( syrk )


// syr2k

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       trans_t  transb, \
       dim_t    m, \
       dim_t    k, \
       ctype*   alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype*   b, inc_t rs_b, inc_t cs_b, \
       ctype*   beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTDEF( syr2k )


// trmm3

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTDEF( trmm3 )


// trmm, trsm (no beta and no separate C operand in this signature)

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,BLIS_TAPI_EX_SUF,tsuf)) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTDEF( trmm )
INSERT_GENTDEF( trsm )


#endif
cython-blis-0.9.1/blis/_src/frame/3/old/bli_l3_sup_edge.h000066400000000000000000000076521427272030600230520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ static void bli_dgemmsup_ker_edge_dispatcher ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx, const dim_t num_mr, const dim_t num_nr, dim_t* restrict mrs, dim_t* restrict nrs, dgemmsup_ker_ft* kmap ) { #if 1 // outer loop = mr; inner loop = nr dim_t n_left = n0; double* restrict cj = c; double* restrict bj = b; for ( dim_t j = 0; n_left != 0; ++j ) { const dim_t nr_cur = nrs[ j ]; if ( nr_cur <= n_left ) { dim_t m_left = m0; double* restrict cij = cj; double* restrict ai = a; for ( dim_t i = 0; m_left != 0; ++i ) { const dim_t mr_cur = mrs[ i ]; if ( mr_cur <= m_left ) { dgemmsup_ker_ft ker_fp = kmap[ i*num_nr + j*1 ]; ker_fp ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } } cj += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } } #else // outer loop = nr; inner loop = mr dim_t m_left = m0; double* restrict ci = c; double* restrict ai = a; for ( dim_t i = 0; m_left != 0; ++i ) { const dim_t mr_cur = mrs[ i ]; if ( mr_cur <= m_left ) { dim_t n_left = n0; double* restrict cij = ci; double* restrict bj = b; for ( dim_t j = 0; n_left != 0; ++j ) { const dim_t nr_cur = nrs[ j ]; if ( nr_cur <= n_left ) { dgemmsup_ker_ft ker_fp = kmap[ i*num_nr + j*1 ]; ker_fp ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } } ci += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } } #endif } cython-blis-0.9.1/blis/_src/frame/3/old/bli_l3_sup_var1n2m.c000066400000000000000000000602601427272030600234210ustar00rootroot00000000000000/* BLIS An object-based framework for developing 
high-performance BLAS-like libraries. Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

#define FUNCPTR_T gemmsup_fp

// Signature shared by the datatype-specific var1n and var2m implementations
// that are generated further below via GENTFUNC. All operands are passed as
// untyped buffers together with their row and column strides.
typedef void (*FUNCPTR_T)
     (
       conj_t           conja,
       conj_t           conjb,
       dim_t            m,
       dim_t            n,
       dim_t            k,
       void*   restrict alpha,
       void*   restrict a, inc_t rs_a, inc_t cs_a,
       void*   restrict b, inc_t rs_b, inc_t cs_b,
       void*   restrict beta,
       void*   restrict c, inc_t rs_c, inc_t cs_c,
       stor3_t          eff_id,
       cntx_t* restrict cntx,
       rntm_t* restrict rntm,
       cntl_t* restrict cntl,
       thrinfo_t* restrict thread
     );

//
// -- var1n --------------------------------------------------------------------
//

// Table of the datatype-specific var1n functions, indexed by num_t.
static FUNCPTR_T GENARRAY(ftypes_var1n,gemmsup_ref_var1n);

// Object-based entry point for variant 1n.
//
// Extracts dimensions, strides, buffers and conjugation flags from the obj_t
// operands (reading A and B with swapped strides when they are marked as
// transposed), selects the typed implementation from ftypes_var1n using the
// datatype of C, and invokes it. When trans is not BLIS_NO_TRANSPOSE the
// whole operation is transposed instead: A and B (and their conj flags)
// trade places, m and n are swapped, all row/column strides are swapped, and
// eff_id is passed through bli_stor3_trans().
void bli_gemmsup_ref_var1n
     (
       trans_t trans,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       stor3_t eff_id,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
#if 0
	obj_t at, bt;

	bli_obj_alias_to( a, &at );
	bli_obj_alias_to( b, &bt );

	// Induce transpositions on A and/or B if either object is marked for
	// transposition. We can induce "fast" transpositions since the objects
	// are guaranteed to not have structure or be packed.
	if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
	if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }

	const num_t    dt_exec   = bli_obj_dt( c );

	const conj_t   conja     = bli_obj_conj_status( a );
	const conj_t   conjb     = bli_obj_conj_status( b );

	const dim_t    m         = bli_obj_length( c );
	const dim_t    n         = bli_obj_width( c );

	const dim_t    k         = bli_obj_width( &at );

	void* restrict buf_a     = bli_obj_buffer_at_off( &at );
	const inc_t    rs_a      = bli_obj_row_stride( &at );
	const inc_t    cs_a      = bli_obj_col_stride( &at );

	void* restrict buf_b     = bli_obj_buffer_at_off( &bt );
	const inc_t    rs_b      = bli_obj_row_stride( &bt );
	const inc_t    cs_b      = bli_obj_col_stride( &bt );

	void* restrict buf_c     = bli_obj_buffer_at_off( c );
	const inc_t    rs_c      = bli_obj_row_stride( c );
	const inc_t    cs_c      = bli_obj_col_stride( c );

	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );

#else

	// The execution datatype is taken from C.
	const num_t    dt_exec   = bli_obj_dt( c );

	const conj_t   conja     = bli_obj_conj_status( a );
	const conj_t   conjb     = bli_obj_conj_status( b );

	const dim_t    m         = bli_obj_length( c );
	const dim_t    n         = bli_obj_width( c );
	      dim_t    k;

	void* restrict buf_a = bli_obj_buffer_at_off( a );
	      inc_t    rs_a;
	      inc_t    cs_a;

	void* restrict buf_b = bli_obj_buffer_at_off( b );
	      inc_t    rs_b;
	      inc_t    cs_b;

	if ( bli_obj_has_notrans( a ) )
	{
		k     = bli_obj_width( a );

		rs_a  = bli_obj_row_stride( a );
		cs_a  = bli_obj_col_stride( a );
	}
	else // if ( bli_obj_has_trans( a ) )
	{
		// Assign the variables with an implicit transposition.
		k     = bli_obj_length( a );

		rs_a  = bli_obj_col_stride( a );
		cs_a  = bli_obj_row_stride( a );
	}

	if ( bli_obj_has_notrans( b ) )
	{
		rs_b  = bli_obj_row_stride( b );
		cs_b  = bli_obj_col_stride( b );
	}
	else // if ( bli_obj_has_trans( b ) )
	{
		// Assign the variables with an implicit transposition.
		rs_b  = bli_obj_col_stride( b );
		cs_b  = bli_obj_row_stride( b );
	}

	void* restrict buf_c     = bli_obj_buffer_at_off( c );
	const inc_t    rs_c      = bli_obj_row_stride( c );
	const inc_t    cs_c      = bli_obj_col_stride( c );

	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );

#endif

	// Index into the type combination array to extract the correct
	// function pointer.
	FUNCPTR_T f = ftypes_var1n[dt_exec];

	if ( bli_is_notrans( trans ) )
	{
		// Invoke the function.
		f
		(
		  conja,
		  conjb,
		  m,
		  n,
		  k,
		  buf_alpha,
		  buf_a, rs_a, cs_a,
		  buf_b, rs_b, cs_b,
		  buf_beta,
		  buf_c, rs_c, cs_c,
		  eff_id,
		  cntx,
		  rntm,
		  cntl,
		  thread
		);
	}
	else
	{
		// Invoke the function (transposing the operation).
		f
		(
		  conjb,             // swap the conj values.
		  conja,
		  n,                 // swap the m and n dimensions.
		  m,
		  k,
		  buf_alpha,
		  buf_b, cs_b, rs_b, // swap the positions of A and B.
		  buf_a, cs_a, rs_a, // swap the strides of A and B.
		  buf_beta,
		  buf_c, cs_c, rs_c, // swap the strides of C.
		  bli_stor3_trans( eff_id ), // transpose the stor3_t id.
		  cntx,
		  rntm,
		  cntl,
		  thread
		);
	}
}


// Datatype-generic body of variant 1n. The outermost (jc) loop partitions
// the m dimension in steps of NC, the pc loop partitions k in steps of KC,
// the ic loop partitions the n dimension in steps of MC, and the jr loop
// partitions the current NC-sized piece of m in steps of MR. The innermost
// iteration over n is not written out here: each jr iteration hands the full
// mc_cur extent to the gemmsup kernel queried from the context.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       conj_t           conjb, \
       dim_t            m, \
       dim_t            n, \
       dim_t            k, \
       void*   restrict alpha, \
       void*   restrict a, inc_t rs_a, inc_t cs_a, \
       void*   restrict b, inc_t rs_b, inc_t cs_b, \
       void*   restrict beta, \
       void*   restrict c, inc_t rs_c, inc_t cs_c, \
       stor3_t          stor_id, \
       cntx_t* restrict cntx, \
       rntm_t* restrict rntm, \
       cntl_t* restrict cntl, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	/* If m or n is zero, return immediately. */ \
	if ( bli_zero_dim2( m, n ) ) return; \
\
	/* If k < 1 or alpha is zero, scale by beta and return. */ \
	if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
	{ \
		PASTEMAC(ch,scalm) \
		( \
		  BLIS_NO_CONJUGATE, \
		  0, \
		  BLIS_NONUNIT_DIAG, \
		  BLIS_DENSE, \
		  m, n, \
		  beta, \
		  c, rs_c, cs_c \
		); \
		return; \
	} \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	/* This transposition of the stor3_t id value is inherent to variant 1.
	   The reason: we assume that variant 2 is the "main" variant. The
	   consequence of this is that we assume that the millikernels that
	   iterate over m are registered to the kernel group associated with
	   the kernel preference. So, regardless of whether the mkernels are
	   row- or column-preferential, millikernels that iterate over n are
	   always placed in the slots for the opposite kernel group. */ \
	stor_id = bli_stor3_trans( stor_id ); \
\
	/* Query the context for various blocksizes. */ \
	const dim_t NR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
	const dim_t MR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
	const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
	const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
	const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
	dim_t KC; \
	if      ( FALSE                  ) KC = KC0; /* placeholder branch; never taken */ \
	else if ( stor_id == BLIS_RRC || \
	          stor_id == BLIS_CRC    ) KC = KC0; \
	else if ( m <=   MR && n <=   NR ) KC = KC0; \
	else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
	else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
	else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
	else                               KC = (( KC0 / 5 ) / 4 ) * 4; \
\
	/* Nudge NC up to a multiple of MR and MC up to a multiple of NR. */ \
	const dim_t NC  = bli_align_dim_to_mult( NC0, MR ); \
	const dim_t MC  = bli_align_dim_to_mult( MC0, NR ); \
\
	/* Query the maximum blocksize for MR, which implies a maximum blocksize
	   extension for the final iteration. */ \
	const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \
	const dim_t MRE = MRM - MR; \
\
	/* Compute partitioning step values for each matrix of each loop. In
	   this variant the jc and jr loops step through rows (rs_*) of A and C,
	   while the ic loop steps through columns (cs_*) of B and C. */ \
	const inc_t jcstep_c = rs_c * NC; \
	const inc_t jcstep_a = rs_a * NC; \
\
	const inc_t pcstep_a = cs_a * KC; \
	const inc_t pcstep_b = rs_b * KC; \
\
	const inc_t icstep_c = cs_c * MC; \
	const inc_t icstep_b = cs_b * MC; \
\
	const inc_t jrstep_c = rs_c * MR; \
	const inc_t jrstep_a = rs_a * MR; \
\
	/*
	const inc_t irstep_c = cs_c * NR; \
	const inc_t irstep_b = cs_b * NR; \
	*/ \
\
	/* Query the context for the sup microkernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemmsup_ker_ft) \
               gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
	ctype* restrict a_00       = a; \
	ctype* restrict b_00       = b; \
	ctype* restrict c_00       = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
\
	auxinfo_t       aux; \
\
	/* Compute number of primary and leftover components of the outer
	   dimensions.
	   NOTE: Functionally speaking, we compute jc_iter as:
	     jc_iter = m / NC; if ( jc_left ) ++jc_iter;
	   However, this is implemented as:
	     jc_iter = ( m + NC - 1 ) / NC;
	   This avoids a branch at the cost of two additional integer instructions.
	   The pc_iter and ic_iter variables (and jr_iter, inside the jc loop)
	   are computed in similar manner. */ \
	const dim_t jc_iter = ( m + NC - 1 ) / NC; \
	const dim_t jc_left =   m % NC; \
\
	const dim_t pc_iter = ( k + KC - 1 ) / KC; \
	const dim_t pc_left =   k % KC; \
\
	const dim_t ic_iter = ( n + MC - 1 ) / MC; \
	const dim_t ic_left =   n % MC; \
\
	const dim_t jc_inc = 1; \
	const dim_t pc_inc = 1; \
	const dim_t ic_inc = 1; \
	const dim_t jr_inc = 1; \
	/*
	const dim_t ir_inc = 1; \
	*/ \
\
	/* Loop over the m dimension (NC rows at a time). */ \
	for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
	{ \
		const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
\
		ctype* restrict a_jc = a_00 + jj * jcstep_a; \
		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
		dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \
		dim_t jr_left =   nc_cur % MR; \
\
		/* An optimization: allow the last jr iteration to contain up to MRE
		   rows of C and A beyond MR. (If MRM > MR, i.e. MRE != 0, the mkernel
		   has agreed to handle these cases.) Note that this prevents us from
		   declaring jr_iter and jr_left as const. */ \
		if ( 1 ) \
		if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \
		{ \
			jr_iter--; jr_left += MR; \
		} \
\
		/* Loop over the k dimension (KC rows/columns at a time). */ \
		for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
		{ \
			const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
\
			ctype* restrict a_pc = a_jc + pp * pcstep_a; \
			ctype* restrict b_pc = b_00 + pp * pcstep_b; \
\
			/* Only apply beta to the first iteration of the pc loop. */ \
			ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
\
			/* Loop over the n dimension (MC columns at a time). */ \
			for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
			{ \
				const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
\
				ctype* restrict b_ic = b_pc + ii * icstep_b; \
				ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
				/*
				const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \
				const dim_t ir_left =   mc_cur % NR; \
				*/ \
\
				/* Loop over the m dimension (MR rows at a time). */ \
				for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
				{ \
					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); \
\
					ctype* restrict a_jr = a_pc + j * jrstep_a; \
					ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
					/* The loop over the n dimension is not written out here;
					   the millikernel is given the whole mc_cur extent. */ \
					{ \
						/* Invoke the gemmsup millikernel. */ \
						gemmsup_ker \
						( \
						  conja, \
						  conjb, \
						  nr_cur, /* Notice: nr_cur <= MR, or <= MR + MRE on an extended final iteration. */ \
						  mc_cur, /* Recall: mc_cur partitions the n dimension! */ \
						  kc_cur, \
						  alpha_cast, \
						  a_jr,     rs_a, cs_a, \
						  b_ic,     rs_b, cs_b, \
						  beta_use, \
						  c_jr,     rs_c, cs_c, \
						  &aux, \
						  cntx  \
						); \
					} \
				} \
			} \
		} \
	} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}

INSERT_GENTFUNC_BASIC0( gemmsup_ref_var1n )


//
// -- var2m --------------------------------------------------------------------
//

// Table of the datatype-specific var2m functions, indexed by num_t.
static FUNCPTR_T GENARRAY(ftypes_var2m,gemmsup_ref_var2m);

// Object-based entry point for variant 2m.
//
// Mirrors bli_gemmsup_ref_var1n: extracts dimensions, strides, buffers and
// conjugation flags from the obj_t operands (reading A and B with swapped
// strides when they are marked as transposed), selects the typed
// implementation from ftypes_var2m using the datatype of C, and invokes it
// either as-is or, when trans is not BLIS_NO_TRANSPOSE, with the whole
// operation transposed.
void bli_gemmsup_ref_var2m
     (
       trans_t trans,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       stor3_t eff_id,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
#if 0
	obj_t at, bt;

	bli_obj_alias_to( a, &at );
	bli_obj_alias_to( b, &bt );

	// Induce transpositions on A and/or B if either object is marked for
	// transposition. We can induce "fast" transpositions since the objects
	// are guaranteed to not have structure or be packed.
	if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); }
	if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); }

	const num_t    dt_exec   = bli_obj_dt( c );

	const conj_t   conja     = bli_obj_conj_status( a );
	const conj_t   conjb     = bli_obj_conj_status( b );

	const dim_t    m         = bli_obj_length( c );
	const dim_t    n         = bli_obj_width( c );

	const dim_t    k         = bli_obj_width( &at );

	void* restrict buf_a     = bli_obj_buffer_at_off( &at );
	const inc_t    rs_a      = bli_obj_row_stride( &at );
	const inc_t    cs_a      = bli_obj_col_stride( &at );

	void* restrict buf_b     = bli_obj_buffer_at_off( &bt );
	const inc_t    rs_b      = bli_obj_row_stride( &bt );
	const inc_t    cs_b      = bli_obj_col_stride( &bt );

	void* restrict buf_c     = bli_obj_buffer_at_off( c );
	const inc_t    rs_c      = bli_obj_row_stride( c );
	const inc_t    cs_c      = bli_obj_col_stride( c );

	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );

#else

	// The execution datatype is taken from C.
	const num_t    dt_exec   = bli_obj_dt( c );

	const conj_t   conja     = bli_obj_conj_status( a );
	const conj_t   conjb     = bli_obj_conj_status( b );

	const dim_t    m         = bli_obj_length( c );
	const dim_t    n         = bli_obj_width( c );
	      dim_t    k;

	void* restrict buf_a = bli_obj_buffer_at_off( a );
	      inc_t    rs_a;
	      inc_t    cs_a;

	void* restrict buf_b = bli_obj_buffer_at_off( b );
	      inc_t    rs_b;
	      inc_t    cs_b;

	if ( bli_obj_has_notrans( a ) )
	{
		k     = bli_obj_width( a );

		rs_a  = bli_obj_row_stride( a );
		cs_a  = bli_obj_col_stride( a );
	}
	else // if ( bli_obj_has_trans( a ) )
	{
		// Assign the variables with an implicit transposition.
		k     = bli_obj_length( a );

		rs_a  = bli_obj_col_stride( a );
		cs_a  = bli_obj_row_stride( a );
	}

	if ( bli_obj_has_notrans( b ) )
	{
		rs_b  = bli_obj_row_stride( b );
		cs_b  = bli_obj_col_stride( b );
	}
	else // if ( bli_obj_has_trans( b ) )
	{
		// Assign the variables with an implicit transposition.
		rs_b  = bli_obj_col_stride( b );
		cs_b  = bli_obj_row_stride( b );
	}

	void* restrict buf_c     = bli_obj_buffer_at_off( c );
	const inc_t    rs_c      = bli_obj_row_stride( c );
	const inc_t    cs_c      = bli_obj_col_stride( c );

	void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha );
	void* restrict buf_beta  = bli_obj_buffer_for_1x1( dt_exec, beta );

#endif

	// Index into the type combination array to extract the correct
	// function pointer.
	FUNCPTR_T f = ftypes_var2m[dt_exec];

	if ( bli_is_notrans( trans ) )
	{
		// Invoke the function.
		f
		(
		  conja,
		  conjb,
		  m,
		  n,
		  k,
		  buf_alpha,
		  buf_a, rs_a, cs_a,
		  buf_b, rs_b, cs_b,
		  buf_beta,
		  buf_c, rs_c, cs_c,
		  eff_id,
		  cntx,
		  rntm,
		  cntl,
		  thread
		);
	}
	else
	{
		// Invoke the function (transposing the operation).
		f
		(
		  conjb,             // swap the conj values.
		  conja,
		  n,                 // swap the m and n dimensions.
		  m,
		  k,
		  buf_alpha,
		  buf_b, cs_b, rs_b, // swap the positions of A and B.
		  buf_a, cs_a, rs_a, // swap the strides of A and B.
		  buf_beta,
		  buf_c, cs_c, rs_c, // swap the strides of C.
		  bli_stor3_trans( eff_id ), // transpose the stor3_t id.
		  cntx,
		  rntm,
		  cntl,
		  thread
		);
	}
}


// Datatype-generic body of variant 2m. The outermost (jc) loop partitions
// the n dimension in steps of NC, the pc loop partitions k in steps of KC,
// the ic loop partitions the m dimension in steps of MC, and the jr loop
// partitions the current NC-sized piece of n in steps of NR. The innermost
// iteration over m is not written out here: each jr iteration hands the full
// mc_cur extent to the gemmsup kernel queried from the context.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       conj_t           conjb, \
       dim_t            m, \
       dim_t            n, \
       dim_t            k, \
       void*   restrict alpha, \
       void*   restrict a, inc_t rs_a, inc_t cs_a, \
       void*   restrict b, inc_t rs_b, inc_t cs_b, \
       void*   restrict beta, \
       void*   restrict c, inc_t rs_c, inc_t cs_c, \
       stor3_t          stor_id, \
       cntx_t* restrict cntx, \
       rntm_t* restrict rntm, \
       cntl_t* restrict cntl, \
       thrinfo_t* restrict thread  \
     ) \
{ \
	/* If m or n is zero, return immediately. */ \
	if ( bli_zero_dim2( m, n ) ) return; \
\
	/* If k < 1 or alpha is zero, scale by beta and return. */ \
	if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \
	{ \
		PASTEMAC(ch,scalm) \
		( \
		  BLIS_NO_CONJUGATE, \
		  0, \
		  BLIS_NONUNIT_DIAG, \
		  BLIS_DENSE, \
		  m, n, \
		  beta, \
		  c, rs_c, cs_c \
		); \
		return; \
	} \
\
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Query the context for various blocksizes. */ \
	const dim_t NR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \
	const dim_t MR  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \
	const dim_t NC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \
	const dim_t MC  = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \
	const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \
\
	dim_t KC; \
	if      ( stor_id == BLIS_RRR || \
	          stor_id == BLIS_CCC    ) KC = KC0; \
	else if ( stor_id == BLIS_RRC || \
	          stor_id == BLIS_CRC    ) KC = KC0; \
	else if ( m <=   MR && n <=   NR ) KC = KC0; \
	else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \
	else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \
	else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \
	else                               KC = (( KC0 / 5 ) / 4 ) * 4; \
\
	/* Query the maximum blocksize for NR, which implies a maximum blocksize
	   extension for the final iteration. */ \
	const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \
	const dim_t NRE = NRM - NR; \
\
	/* Compute partitioning step values for each matrix of each loop. In
	   this variant the jc and jr loops step through columns (cs_*) of B and
	   C, while the ic loop steps through rows (rs_*) of A and C. */ \
	const inc_t jcstep_c = cs_c * NC; \
	const inc_t jcstep_b = cs_b * NC; \
\
	const inc_t pcstep_a = cs_a * KC; \
	const inc_t pcstep_b = rs_b * KC; \
\
	const inc_t icstep_c = rs_c * MC; \
	const inc_t icstep_a = rs_a * MC; \
\
	const inc_t jrstep_c = cs_c * NR; \
	const inc_t jrstep_b = cs_b * NR; \
\
	/*
	const inc_t irstep_c = rs_c * MR; \
	const inc_t irstep_a = rs_a * MR; \
	*/ \
\
	/* Query the context for the sup microkernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemmsup_ker_ft) \
               gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \
\
	ctype* restrict a_00       = a; \
	ctype* restrict b_00       = b; \
	ctype* restrict c_00       = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
\
	auxinfo_t       aux; \
\
	/* Compute number of primary and leftover components of the outer
	   dimensions.
	   NOTE: Functionally speaking, we compute jc_iter as:
	     jc_iter = n / NC; if ( jc_left ) ++jc_iter;
	   However, this is implemented as:
	     jc_iter = ( n + NC - 1 ) / NC;
	   This avoids a branch at the cost of two additional integer instructions.
	   The pc_iter and ic_iter variables (and jr_iter, inside the jc loop)
	   are computed in similar manner. */ \
	const dim_t jc_iter = ( n + NC - 1 ) / NC; \
	const dim_t jc_left =   n % NC; \
\
	const dim_t pc_iter = ( k + KC - 1 ) / KC; \
	const dim_t pc_left =   k % KC; \
\
	const dim_t ic_iter = ( m + MC - 1 ) / MC; \
	const dim_t ic_left =   m % MC; \
\
	const dim_t jc_inc = 1; \
	const dim_t pc_inc = 1; \
	const dim_t ic_inc = 1; \
	const dim_t jr_inc = 1; \
	/*
	const dim_t ir_inc = 1; \
	*/ \
\
	/* Loop over the n dimension (NC columns at a time). */ \
	for ( dim_t jj = 0; jj < jc_iter; jj += jc_inc ) \
	{ \
		const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \
\
		ctype* restrict b_jc = b_00 + jj * jcstep_b; \
		ctype* restrict c_jc = c_00 + jj * jcstep_c; \
\
		dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \
		dim_t jr_left =   nc_cur % NR; \
\
		/* An optimization: allow the last jr iteration to contain up to NRE
		   columns of C and B beyond NR. (If NRM > NR, i.e. NRE != 0, the
		   mkernel has agreed to handle these cases.) Note that this prevents
		   us from declaring jr_iter and jr_left as const. */ \
		if ( 1 ) \
		if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \
		{ \
			jr_iter--; jr_left += NR; \
		} \
\
		/* Loop over the k dimension (KC rows/columns at a time). */ \
		for ( dim_t pp = 0; pp < pc_iter; pp += pc_inc ) \
		{ \
			const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \
\
			ctype* restrict a_pc = a_00 + pp * pcstep_a; \
			ctype* restrict b_pc = b_jc + pp * pcstep_b; \
\
			/* Only apply beta to the first iteration of the pc loop. */ \
			ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \
\
			/* Loop over the m dimension (MC rows at a time). */ \
			for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \
			{ \
				const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \
\
				ctype* restrict a_ic = a_pc + ii * icstep_a; \
				ctype* restrict c_ic = c_jc + ii * icstep_c; \
\
				/*
				const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \
				const dim_t ir_left =   mc_cur % MR; \
				*/ \
\
				/* Loop over the n dimension (NR columns at a time). */ \
				for ( dim_t j = 0; j < jr_iter; j += jr_inc ) \
				{ \
					const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \
\
					ctype* restrict b_jr = b_pc + j * jrstep_b; \
					ctype* restrict c_jr = c_ic + j * jrstep_c; \
\
					/* The loop over the m dimension is not written out here;
					   the millikernel is given the whole mc_cur extent. */ \
					{ \
						/* Invoke the gemmsup millikernel. */ \
						gemmsup_ker \
						( \
						  conja, \
						  conjb, \
						  mc_cur, \
						  nr_cur, \
						  kc_cur, \
						  alpha_cast, \
						  a_ic,     rs_a, cs_a, \
						  b_jr,     rs_b, cs_b, \
						  beta_use, \
						  c_jr,     rs_c, cs_c, \
						  &aux, \
						  cntx  \
						); \
					} \
				} \
			} \
		} \
	} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \
*/ \
}

INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2m )

cython-blis-0.9.1/blis/_src/frame/3/symm/000077500000000000000000000000001427272030600200575ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/3/symm/bli_symm.h000066400000000000000000000032461427272030600220500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_symm_front.h" cython-blis-0.9.1/blis/_src/frame/3/symm/bli_symm_front.c000066400000000000000000000137221427272030600232530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

// Front-end for symm.
//
// Returns early (after scaling C by beta) when alpha equals BLIS_ZERO.
// Otherwise it works on local aliases of A, B and C, so the caller's objects
// are not modified: the aliases have their origins reset, are optionally
// transposed and/or swapped depending on `side`, on the build-time switch
// BLIS_DISABLE_SYMM_RIGHT and on the microkernel's storage preference for C,
// and receive their pack schemas. Finally the ways of parallelism are set on
// rntm and the operation is handed to bli_l3_thread_decorator() with
// bli_l3_int as the function to run and BLIS_GEMM as the operation family.
void bli_symm_front
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
	bli_init_once();

	obj_t   a_local;
	obj_t   b_local;
	obj_t   c_local;

	// If alpha is zero, scale by beta and return.
	if ( bli_obj_equals( alpha, &BLIS_ZERO ) )
	{
		bli_scalm( beta, c );
		return;
	}

	// Alias A, B, and C in case we need to apply transformations.
	bli_obj_alias_to( a, &a_local );
	bli_obj_alias_to( b, &b_local );
	bli_obj_alias_to( c, &c_local );

	// Set the obj_t buffer field to the location currently implied by the row
	// and column offsets and then zero the offsets. If any of the original
	// obj_t's were views into larger matrices, this step effectively makes
	// those obj_t's "forget" their lineage.
	bli_obj_reset_origin( &a_local );
	bli_obj_reset_origin( &b_local );
	bli_obj_reset_origin( &c_local );

#ifdef BLIS_DISABLE_SYMM_RIGHT
	// NOTE: This case casts right-side symm in terms of left side. This is
	// necessary when the current subconfiguration uses a gemm microkernel
	// that assumes that the packing kernel will have already duplicated
	// (broadcast) elements of B in the packed copy of B. Supporting
	// duplication within the logic that packs micropanels from symmetric
	// matrices would be ugly, and so we simply don't support it. As a
	// consequence, those subconfigurations need a way to force the symmetric
	// matrix to be on the left (and thus the general matrix to be on the
	// right). So our solution is that in those cases, the subconfigurations
	// simply #define BLIS_DISABLE_SYMM_RIGHT.

	// NOTE: This case casts right-side symm in terms of left side. This can
	// lead to the microkernel being executed on an output matrix with the
	// microkernel's general stride IO case (unless the microkernel supports
	// both row and column IO cases as well).

	// If A is being multiplied from the right, transpose all operands
	// so that we can perform the computation as if A were being multiplied
	// from the left.
	if ( bli_is_right( side ) )
	{
		bli_toggle_side( &side );
		bli_obj_induce_trans( &a_local );
		bli_obj_induce_trans( &b_local );
		bli_obj_induce_trans( &c_local );
	}

#else
	// NOTE: This case computes right-side hemm/symm natively by packing
	// elements of the Hermitian/symmetric matrix A to micropanels of the
	// right-hand packed matrix operand "B", and elements of the general
	// matrix B to micropanels of the left-hand packed matrix operand "A".
	// This code path always gives us the opportunity to transpose the
	// entire operation so that the effective storage format of the output
	// matrix matches the microkernel's output preference. Thus, from a
	// performance perspective, this case is preferred.

	// An optimization: If C is stored by rows and the micro-kernel prefers
	// contiguous columns, or if C is stored by columns and the micro-kernel
	// prefers contiguous rows, transpose the entire operation to allow the
	// micro-kernel to access elements of C in its preferred manner.
	// Only B and C are transposed below; A is left as-is (presumably
	// because it is symmetric -- NOTE(review): confirm).
	//if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT
	                                     // be enabled. See issue #342 comments.
	if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) )
	{
		bli_toggle_side( &side );
		bli_obj_induce_trans( &b_local );
		bli_obj_induce_trans( &c_local );
	}

	// If the Hermitian/symmetric matrix A is being multiplied from the right,
	// swap A and B so that the Hermitian/symmetric matrix will actually be on
	// the right.
	if ( bli_is_right( side ) )
	{
		bli_obj_swap( &a_local, &b_local );
	}

#endif

	// Set the pack schemas within the objects.
	bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx );

	// Parse and interpret the contents of the rntm_t object to properly
	// set the ways of parallelism for each loop, and then make any
	// additional modifications necessary for the current operation.
	// The m, n and k dimensions passed here are taken from the (possibly
	// transformed) local objects: length and width of C, and width of A.
	bli_rntm_set_ways_for_op
	(
	  BLIS_SYMM,
	  BLIS_LEFT, // ignored for gemm/hemm/symm
	  bli_obj_length( &c_local ),
	  bli_obj_width( &c_local ),
	  bli_obj_width( &a_local ),
	  rntm
	);

	// Invoke the internal back-end.
	bli_l3_thread_decorator
	(
	  bli_l3_int,
	  BLIS_GEMM, // operation family id
	  alpha,
	  &a_local,
	  &b_local,
	  beta,
	  &c_local,
	  cntx,
	  rntm,
	  cntl
	);
}

cython-blis-0.9.1/blis/_src/frame/3/symm/bli_symm_front.h000066400000000000000000000035401427272030600232550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_symm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); cython-blis-0.9.1/blis/_src/frame/3/trmm/000077500000000000000000000000001427272030600200515ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/3/trmm/bli_trmm.h000066400000000000000000000033011427272030600220240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_trmm_front.h" #include "bli_trmm_var.h" cython-blis-0.9.1/blis/_src/frame/3/trmm/bli_trmm_front.c000066400000000000000000000155531427272030600232430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_trmm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ) { bli_init_once(); obj_t a_local; obj_t b_local; obj_t c_local; // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( alpha, b ); return; } // Alias A and B so we can tweak the objects if necessary. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( b, &c_local ); // Set the obj_t buffer field to the location currently implied by the row // and column offsets and then zero the offsets. If any of the original // obj_t's were views into larger matrices, this step effectively makes // those obj_t's "forget" their lineage. 
bli_obj_reset_origin( &a_local ); bli_obj_reset_origin( &b_local ); bli_obj_reset_origin( &c_local ); // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This // allows us to only explicitly implement the no-transpose cases. Once // the transposition is induced, the correct algorithm will be called, // since, for example, an algorithm over a transposed lower triangular // matrix A moves in the same direction (forwards) as a non-transposed // upper triangular matrix. And with the transposition induced, the // matrix now appears to be upper triangular, so the upper triangular // algorithm will grab the correct partitions, as if it were upper // triangular (with no transpose) all along. if ( bli_obj_has_trans( &a_local ) ) { bli_obj_induce_trans( &a_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); } #ifdef BLIS_DISABLE_TRMM_RIGHT // NOTE: This case casts right-side trmm in terms of left side. This is // necessary when the current subconfiguration uses a gemm microkernel // that assumes that the packing kernel will have already duplicated // (broadcast) element of B in the packed copy of B. Supporting // duplication within the logic that packs micropanels from triangular // matrices would be ugly, and so we simply don't support it. As a // consequence, those subconfigurations need a way to force the triangular // matrix to be on the left (and thus the general matrix to the on the // right). So our solution is that in those cases, the subconfigurations // simply #define BLIS_DISABLE_TRMM_RIGHT. // NOTE: This case casts right-side trmm in terms of left side. This can // lead to the microkernel being executed on an output matrix with the // microkernel's general stride IO case (unless the microkernel supports // both both row and column IO cases as well). 
// NOTE: Casting right-side trmm in terms of left side reduces the number // of macrokernels exercised to two (trmm_ll and trmm_lu). // If A is being multiplied from the right, transpose all operands // so that we can perform the computation as if A were being multiplied // from the left. if ( bli_is_right( side ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &a_local ); bli_obj_induce_trans( &b_local ); bli_obj_induce_trans( &c_local ); } #else // NOTE: This case computes right-side trmm natively with trmm_rl and // trmm_ru macrokernels. This code path always gives us the opportunity // to transpose the entire operation so that the effective storage format // of the output matrix matches the microkernel's output preference. // Thus, from a performance perspective, this case is preferred. // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. // NOTE: We disable the optimization for 1x1 matrices since the concept // of row- vs. column storage breaks down. //if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT // be enabled. See issue #342 comments. if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &a_local ); bli_obj_induce_trans( &b_local ); bli_obj_induce_trans( &c_local ); } // If A is being multiplied from the right, swap A and B so that // the matrix will actually be on the right. if ( bli_is_right( side ) ) { bli_obj_swap( &a_local, &b_local ); } #endif // Set the pack schemas within the objects. 
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. bli_rntm_set_ways_for_op ( BLIS_TRMM, side, bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ), rntm ); // Invoke the internal back-end. bli_l3_thread_decorator ( bli_l3_int, BLIS_TRMM, // operation family id alpha, &a_local, &b_local, &BLIS_ZERO, &c_local, cntx, rntm, cntl ); } cython-blis-0.9.1/blis/_src/frame/3/trmm/bli_trmm_front.h000066400000000000000000000034711427272030600232440ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_trmm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); cython-blis-0.9.1/blis/_src/frame/3/trmm/bli_trmm_ll_ker_var2.c000066400000000000000000000313701427272030600243100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

// Signature of the datatype-specific macro-kernels generated at the bottom
// of this file. The ftypes[] array declared below is indexed by datatype
// (presumably filled in by GENARRAY -- see that macro's definition).
#define FUNCPTR_T gemm_fp

typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffa,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2);

//
// Object-based entry point of this macro-kernel.
//
// Reads the diagonal offset of A, the pack schemas of A and B, the m and n
// dimensions of C, the k dimension (width of A), and the buffer, stride and
// panel parameters of A, B and C from the obj_t operands. The scalars
// attached to A and B are merged into a single alpha, beta is taken from
// the scalar attached to C, and everything is forwarded to the
// implementation selected by the execution datatype of C.
//
// NOTE: cntl is accepted but not referenced in this function.
//
void bli_trmm_ll_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
    num_t     dt_exec   = bli_obj_exec_dt( c );

    doff_t    diagoffa  = bli_obj_diag_offset( a );

    pack_t    schema_a  = bli_obj_pack_schema( a );
    pack_t    schema_b  = bli_obj_pack_schema( b );

    dim_t     m         = bli_obj_length( c );
    dim_t     n         = bli_obj_width( c );
    dim_t     k         = bli_obj_width( a );

    void*     buf_a     = bli_obj_buffer_at_off( a );
    inc_t     cs_a      = bli_obj_col_stride( a );
    dim_t     pd_a      = bli_obj_panel_dim( a );
    inc_t     ps_a      = bli_obj_panel_stride( a );

    void*     buf_b     = bli_obj_buffer_at_off( b );
    inc_t     rs_b      = bli_obj_row_stride( b );
    dim_t     pd_b      = bli_obj_panel_dim( b );
    inc_t     ps_b      = bli_obj_panel_stride( b );

    void*     buf_c     = bli_obj_buffer_at_off( c );
    inc_t     rs_c      = bli_obj_row_stride( c );
    inc_t     cs_c      = bli_obj_col_stride( c );

    obj_t     scalar_a;
    obj_t     scalar_b;

    void*     buf_alpha;
    void*     buf_beta;

    FUNCPTR_T f;

    // Detach and multiply the scalars attached to A and B.
    bli_obj_scalar_detach( a, &scalar_a );
    bli_obj_scalar_detach( b, &scalar_b );
    bli_mulsc( &scalar_a, &scalar_b );

    // Grab the addresses of the internal scalar buffers for the scalar
    // merged above and the scalar attached to C.
    buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
    buf_beta  = bli_obj_internal_scalar_buffer( c );

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_exec];

    // Invoke the function.
    f( diagoffa,
       schema_a,
       schema_b,
       m,
       n,
       k,
       buf_alpha,
       buf_a, cs_a, pd_a, ps_a,
       buf_b, rs_b, pd_b, ps_b,
       buf_beta,
       buf_c, rs_c, cs_c,
       cntx,
       rntm,
       thread );
}

//
// Datatype-generic body of the macro-kernel; INSERT_GENTFUNC_BASIC0 at the
// bottom of the file instantiates it under the name PASTEMAC(ch,varname).
//
// Summary of the control flow below:
// - The function returns early if any dimension is zero or if the block of
//   A lies strictly above the diagonal.
// - A negative diagonal offset is absorbed by shrinking m and advancing the
//   pointer into C, after which the offset is treated as zero.
// - The outer (jr) loop walks micro-panels of B/columns of C over the range
//   returned by bli_thread_range_jrir(); the inner (ir) loop walks all
//   micro-panels of A.
// - A micro-panel of A that intersects the diagonal is multiplied over a
//   shortened k (k_a1011) and C is scaled by beta; a micro-panel strictly
//   below the diagonal is multiplied over the full k and C is scaled by
//   one. Any other micro-panel is skipped (only the C pointer advances).
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffa, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Alias some constants to simpler names. */ \
    const dim_t MR = pd_a; \
    const dim_t NR = pd_b; \
    const dim_t PACKMR = cs_a; \
    const dim_t PACKNR = rs_b; \
\
    /* Query the context for the micro-kernel address and cast it to its
       function pointer type. */ \
    PASTECH(ch,gemm_ukr_ft) \
        gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
    ctype* restrict one = PASTEMAC(ch,1); \
    ctype* restrict a_cast = a; \
    ctype* restrict b_cast = b; \
    ctype* restrict c_cast = c; \
    ctype* restrict alpha_cast = alpha; \
    ctype* restrict beta_cast = beta; \
    ctype* restrict b1; \
    ctype* restrict c1; \
\
    doff_t diagoffa_i; \
    dim_t k_full; \
    dim_t m_iter, m_left; \
    dim_t n_iter, n_left; \
    dim_t m_cur; \
    dim_t n_cur; \
    dim_t k_a1011; \
    dim_t off_a1011; \
    dim_t i, j; \
    inc_t rstep_a; \
    inc_t cstep_b; \
    inc_t rstep_c, cstep_c; \
    inc_t istep_a; \
    inc_t istep_b; \
    inc_t ps_a_cur; \
    inc_t is_a_cur; \
    auxinfo_t aux; \
\
    /*
       Assumptions/assertions:
         rs_a == 1
         cs_a == PACKMR
         pd_a == MR
         ps_a == stride to next micro-panel of A
         rs_b == PACKNR
         cs_b == 1
         pd_b == NR
         ps_b == stride to next micro-panel of B
         rs_c == (no assumptions)
         cs_c == (no assumptions)
    */ \
\
    /* Safety trap: Certain indexing within this macro-kernel does not
       work as intended if both MR and NR are odd. The condition actually
       tested is (PACKMR odd and NR odd) or (PACKNR odd and MR odd). */ \
    if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
         ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
    /* If any dimension is zero, return immediately. */ \
    if ( bli_zero_dim3( m, n, k ) ) return; \
\
    /* Safeguard: If the current block of A is entirely above the diagonal,
       it is implicitly zero. So we do nothing. */ \
    if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
    /* Compute k_full. For all trmm, k_full is simply k. This is
       needed because some parameter combinations of trmm reduce k
       to advance past zero regions in the triangular matrix, and
       when computing the imaginary stride of B (the non-triangular
       matrix), which is used by 4m1/3m1 implementations, we need
       this unreduced value of k. */ \
    k_full = k; \
\
    /* If there is a zero region above where the diagonal of A intersects the
       left edge of the block, adjust the pointer to C and treat this case as
       if the diagonal offset were zero. This skips over the region that was
       not packed. (Note we assume the diagonal offset is a multiple of MR;
       this assumption will hold as long as the cache blocksizes are each a
       multiple of MR and NR.) */ \
    if ( diagoffa < 0 ) \
    { \
        i = -diagoffa; \
        m = m - i; \
        diagoffa = 0; \
        c_cast = c_cast + (i )*rs_c; \
    } \
\
    /* Compute number of primary and leftover components of the m and n
       dimensions. The iteration counts are rounded up so that a partial
       (edge) micro-panel gets its own iteration. */ \
    n_iter = n / NR; \
    n_left = n % NR; \
\
    m_iter = m / MR; \
    m_left = m % MR; \
\
    if ( n_left ) ++n_iter; \
    if ( m_left ) ++m_iter; \
\
    /* Determine some increments used to step through A, B, and C. */ \
    rstep_a = ps_a; \
\
    cstep_b = ps_b; \
\
    rstep_c = rs_c * MR; \
    cstep_c = cs_c * NR; \
\
    istep_a = PACKMR * k; /* NOTE(review): istep_a is computed here but not referenced again below. */ \
    istep_b = PACKNR * k_full; \
\
    if ( bli_is_odd( istep_a ) ) istep_a += 1; \
    if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
    /* Save the pack schemas of A and B to the auxinfo_t object. */ \
    bli_auxinfo_set_schema_a( schema_a, &aux ); \
    bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
    /* Save the imaginary stride of B to the auxinfo_t object. */ \
    bli_auxinfo_set_is_b( istep_b, &aux ); \
\
    /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
       loop around the microkernel. Here we query the thrinfo_t node for the
       1st (ir) loop around the microkernel. (The query itself is currently
       commented out.) */ \
    /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
    /* Query the number of threads and thread ids for each loop. */ \
    dim_t jr_nt = bli_thread_n_way( thread ); \
    dim_t jr_tid = bli_thread_work_id( thread ); \
    /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
    dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
    dim_t jr_start, jr_end; \
    /*dim_t ir_start, ir_end;*/ \
    dim_t jr_inc; \
\
    /* Determine the thread range and increment for the 2nd loop.
       NOTE: The definition of bli_thread_range_jrir() will depend on whether
       slab or round-robin partitioning was requested at configure-time. \
       NOTE: Parallelism in the 1st loop is disabled for now. */ \
    bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
    /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
    /* Loop over the n dimension (NR columns at a time). */ \
    for ( j = jr_start; j < jr_end; j += jr_inc ) \
    { \
        ctype* restrict a1; \
        ctype* restrict c11; \
        ctype* restrict b2; \
\
        b1 = b_cast + j * cstep_b; \
        c1 = c_cast + j * cstep_c; \
\
        n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
        /* Initialize our next panel of B to be the current panel of B. */ \
        b2 = b1; \
\
        a1 = a_cast; \
        c11 = c1; \
\
        /* Loop over the m dimension (MR rows at a time). */ \
        for ( i = 0; i < m_iter; ++i ) \
        { \
            diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
            m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
            /* If the current panel of A intersects the diagonal, scale C
               by beta. If it is strictly below the diagonal, scale by one.
               This allows the current macro-kernel to work for both trmm
               and trmm3. */ \
            if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
            { \
                ctype* restrict b1_i; \
                ctype* restrict a2; \
\
                /* Determine the offset to and length of the panel that was
                   packed so we can index into the corresponding location in
                   b1. In this macro-kernel the offset is always zero, so
                   b1_i below coincides with b1; only the length is
                   shortened. */ \
                off_a1011 = 0; \
                k_a1011 = bli_min( diagoffa_i + MR, k ); \
\
                /* Compute the panel stride for the current diagonal-
                   intersecting micro-panel (rounded up to an even value). */ \
                is_a_cur = k_a1011 * PACKMR; \
                is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
                ps_a_cur = is_a_cur; \
\
                /* NOTE: ir loop parallelism disabled for now. */ \
                /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
                b1_i = b1 + off_a1011 * PACKNR; \
\
                /* Compute the addresses of the next panels of A and B. */ \
                a2 = a1; \
                if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
                { \
                    a2 = a_cast; \
                    b2 = b1; \
                    if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
                        b2 = b_cast; \
                } \
\
                /* Save addresses of next panels of A and B to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_next_a( a2, &aux ); \
                bli_auxinfo_set_next_b( b2, &aux ); \
\
                /* Invoke the gemm micro-kernel. */ \
                gemm_ukr \
                ( \
                  m_cur, \
                  n_cur, \
                  k_a1011, \
                  alpha_cast, \
                  a1, \
                  b1_i, \
                  beta_cast, \
                  c11, rs_c, cs_c, \
                  &aux, \
                  cntx \
                ); \
                /*}*/ \
\
                a1 += ps_a_cur; \
            } \
            else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
            { \
                /* NOTE: ir loop parallelism disabled for now. */ \
                /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
                ctype* restrict a2; \
\
                /* Compute the addresses of the next panels of A and B. */ \
                a2 = a1; \
                if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
                { \
                    a2 = a_cast; \
                    b2 = b1; \
                    if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
                        b2 = b_cast; \
                } \
\
                /* Save addresses of next panels of A and B to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_next_a( a2, &aux ); \
                bli_auxinfo_set_next_b( b2, &aux ); \
\
                /* Invoke the gemm micro-kernel over the full k dimension,
                   passing one in the beta slot. */ \
                gemm_ukr \
                ( \
                  m_cur, \
                  n_cur, \
                  k, \
                  alpha_cast, \
                  a1, \
                  b1, \
                  one, \
                  c11, rs_c, cs_c, \
                  &aux, \
                  cntx \
                ); \
                /*}*/ \
\
                a1 += rstep_a; \
            } \
\
            c11 += rstep_c; \
        } \
    } \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 )

cython-blis-0.9.1/blis/_src/frame/3/trmm/bli_trmm_lu_ker_var2.c000066400000000000000000000316261427272030600243250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

// Signature of the datatype-specific macro-kernels generated at the bottom
// of this file. The ftypes[] array declared below is indexed by datatype
// (presumably filled in by GENARRAY -- see that macro's definition).
#define FUNCPTR_T gemm_fp

typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffa,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);

//
// Object-based entry point of this macro-kernel.
//
// Reads the diagonal offset of A, the pack schemas of A and B, the m and n
// dimensions of C, the k dimension (width of A), and the buffer, stride and
// panel parameters of A, B and C from the obj_t operands. The scalars
// attached to A and B are merged into a single alpha, beta is taken from
// the scalar attached to C, and everything is forwarded to the
// implementation selected by the execution datatype of C.
//
// NOTE: cntl is accepted but not referenced in this function.
//
void bli_trmm_lu_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
    num_t     dt_exec   = bli_obj_exec_dt( c );

    doff_t    diagoffa  = bli_obj_diag_offset( a );

    pack_t    schema_a  = bli_obj_pack_schema( a );
    pack_t    schema_b  = bli_obj_pack_schema( b );

    dim_t     m         = bli_obj_length( c );
    dim_t     n         = bli_obj_width( c );
    dim_t     k         = bli_obj_width( a );

    void*     buf_a     = bli_obj_buffer_at_off( a );
    inc_t     cs_a      = bli_obj_col_stride( a );
    dim_t     pd_a      = bli_obj_panel_dim( a );
    inc_t     ps_a      = bli_obj_panel_stride( a );

    void*     buf_b     = bli_obj_buffer_at_off( b );
    inc_t     rs_b      = bli_obj_row_stride( b );
    dim_t     pd_b      = bli_obj_panel_dim( b );
    inc_t     ps_b      = bli_obj_panel_stride( b );

    void*     buf_c     = bli_obj_buffer_at_off( c );
    inc_t     rs_c      = bli_obj_row_stride( c );
    inc_t     cs_c      = bli_obj_col_stride( c );

    obj_t     scalar_a;
    obj_t     scalar_b;

    void*     buf_alpha;
    void*     buf_beta;

    FUNCPTR_T f;

    // Detach and multiply the scalars attached to A and B.
    bli_obj_scalar_detach( a, &scalar_a );
    bli_obj_scalar_detach( b, &scalar_b );
    bli_mulsc( &scalar_a, &scalar_b );

    // Grab the addresses of the internal scalar buffers for the scalar
    // merged above and the scalar attached to C.
    buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
    buf_beta  = bli_obj_internal_scalar_buffer( c );

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_exec];

    // Invoke the function.
    f( diagoffa,
       schema_a,
       schema_b,
       m,
       n,
       k,
       buf_alpha,
       buf_a, cs_a, pd_a, ps_a,
       buf_b, rs_b, pd_b, ps_b,
       buf_beta,
       buf_c, rs_c, cs_c,
       cntx,
       rntm,
       thread );
}

//
// Datatype-generic body of the macro-kernel; INSERT_GENTFUNC_BASIC0 at the
// bottom of the file instantiates it under the name PASTEMAC(ch,varname).
//
// Summary of the control flow below:
// - The function returns early if any dimension is zero or if the block of
//   A lies strictly below the diagonal.
// - A positive diagonal offset is absorbed by reducing k and advancing the
//   pointer into B, after which the offset is treated as zero; m is then
//   clipped to -diagoffa + k so that rows below the diagonal's exit point
//   are not iterated over.
// - The outer (jr) loop walks micro-panels of B/columns of C over the range
//   returned by bli_thread_range_jrir(); the inner (ir) loop walks all
//   micro-panels of A.
// - A micro-panel of A that intersects the diagonal is multiplied over a
//   shortened k (k_a1112), starting off_a1112 rows into the panel of B, and
//   C is scaled by beta; a micro-panel strictly above the diagonal is
//   multiplied over the full k and C is scaled by one. Any other
//   micro-panel is skipped (only the C pointer advances).
//
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffa, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Alias some constants to simpler names. */ \
    const dim_t MR = pd_a; \
    const dim_t NR = pd_b; \
    const dim_t PACKMR = cs_a; \
    const dim_t PACKNR = rs_b; \
\
    /* Query the context for the micro-kernel address and cast it to its
       function pointer type. */ \
    PASTECH(ch,gemm_ukr_ft) \
        gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
    ctype* restrict one = PASTEMAC(ch,1); \
    ctype* restrict a_cast = a; \
    ctype* restrict b_cast = b; \
    ctype* restrict c_cast = c; \
    ctype* restrict alpha_cast = alpha; \
    ctype* restrict beta_cast = beta; \
    ctype* restrict b1; \
    ctype* restrict c1; \
\
    doff_t diagoffa_i; \
    dim_t k_full; \
    dim_t m_iter, m_left; \
    dim_t n_iter, n_left; \
    dim_t m_cur; \
    dim_t n_cur; \
    dim_t k_a1112; \
    dim_t off_a1112; \
    dim_t i, j; \
    inc_t rstep_a; \
    inc_t cstep_b; \
    inc_t rstep_c, cstep_c; \
    inc_t istep_a; \
    inc_t istep_b; \
    inc_t ps_a_cur; \
    inc_t is_a_cur; \
    auxinfo_t aux; \
\
    /*
       Assumptions/assertions:
         rs_a == 1
         cs_a == PACKMR
         pd_a == MR
         ps_a == stride to next micro-panel of A
         rs_b == PACKNR
         cs_b == 1
         pd_b == NR
         ps_b == stride to next micro-panel of B
         rs_c == (no assumptions)
         cs_c == (no assumptions)
    */ \
\
    /* Safety trap: Certain indexing within this macro-kernel does not
       work as intended if both MR and NR are odd. The condition actually
       tested is (PACKMR odd and NR odd) or (PACKNR odd and MR odd). */ \
    if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
         ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
    /* If any dimension is zero, return immediately. */ \
    if ( bli_zero_dim3( m, n, k ) ) return; \
\
    /* Safeguard: If the current block of A is entirely below the diagonal,
       it is implicitly zero. So we do nothing. */ \
    if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
    /* Compute k_full. For all trmm, k_full is simply k. This is
       needed because some parameter combinations of trmm reduce k
       to advance past zero regions in the triangular matrix, and
       when computing the imaginary stride of B (the non-triangular
       matrix), which is used by 4m1/3m1 implementations, we need
       this unreduced value of k. (k itself may be reduced just below.) */ \
    k_full = k; \
\
    /* If there is a zero region to the left of where the diagonal of A
       intersects the top edge of the block, adjust the pointer to B and
       treat this case as if the diagonal offset were zero. Note that we
       don't need to adjust the pointer to A since packm would have simply
       skipped over the region that was not stored. */ \
    if ( diagoffa > 0 ) \
    { \
        i = diagoffa; \
        k = k - i; \
        diagoffa = 0; \
        b_cast = b_cast + i * PACKNR; \
    } \
\
    /* If there is a zero region below where the diagonal of A intersects the
       right side of the block, shrink it to prevent "no-op" iterations from
       executing. */ \
    if ( -diagoffa + k < m ) \
    { \
        m = -diagoffa + k; \
    } \
\
    /* Compute number of primary and leftover components of the m and n
       dimensions. The iteration counts are rounded up so that a partial
       (edge) micro-panel gets its own iteration. */ \
    n_iter = n / NR; \
    n_left = n % NR; \
\
    m_iter = m / MR; \
    m_left = m % MR; \
\
    if ( n_left ) ++n_iter; \
    if ( m_left ) ++m_iter; \
\
    /* Determine some increments used to step through A, B, and C. */ \
    rstep_a = ps_a; \
\
    cstep_b = ps_b; \
\
    rstep_c = rs_c * MR; \
    cstep_c = cs_c * NR; \
\
    istep_a = PACKMR * k; /* NOTE(review): istep_a is computed here but not referenced again below. */ \
    istep_b = PACKNR * k_full; \
\
    if ( bli_is_odd( istep_a ) ) istep_a += 1; \
    if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
    /* Save the pack schemas of A and B to the auxinfo_t object. */ \
    bli_auxinfo_set_schema_a( schema_a, &aux ); \
    bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
    /* Save the imaginary stride of B to the auxinfo_t object. */ \
    bli_auxinfo_set_is_b( istep_b, &aux ); \
\
    /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
       loop around the microkernel. Here we query the thrinfo_t node for the
       1st (ir) loop around the microkernel. (The query itself is currently
       commented out.) */ \
    /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
    /* Query the number of threads and thread ids for each loop. */ \
    dim_t jr_nt = bli_thread_n_way( thread ); \
    dim_t jr_tid = bli_thread_work_id( thread ); \
    /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
    dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
    dim_t jr_start, jr_end; \
    /*dim_t ir_start, ir_end;*/ \
    dim_t jr_inc; \
\
    /* Determine the thread range and increment for the 2nd loop.
       NOTE: The definition of bli_thread_range_jrir() will depend on whether
       slab or round-robin partitioning was requested at configure-time. \
       NOTE: Parallelism in the 1st loop is disabled for now. */ \
    bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
    /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
    /* Loop over the n dimension (NR columns at a time). */ \
    for ( j = jr_start; j < jr_end; j += jr_inc ) \
    { \
        ctype* restrict a1; \
        ctype* restrict c11; \
        ctype* restrict b2; \
\
        b1 = b_cast + j * cstep_b; \
        c1 = c_cast + j * cstep_c; \
\
        n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
        /* Initialize our next panel of B to be the current panel of B. */ \
        b2 = b1; \
\
        a1 = a_cast; \
        c11 = c1; \
\
        /* Loop over the m dimension (MR rows at a time). */ \
        for ( i = 0; i < m_iter; ++i ) \
        { \
            diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
            m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
            /* If the current panel of A intersects the diagonal, scale C
               by beta. If it is strictly above the diagonal, scale by one.
               This allows the current macro-kernel to work for both trmm
               and trmm3. */ \
            if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
            { \
                ctype* restrict b1_i; \
                ctype* restrict a2; \
\
                /* Determine the offset to and length of the panel that was
                   packed so we can index into the corresponding location in
                   b1. Here the offset equals the panel's diagonal offset and
                   the length is whatever remains of k past that offset. */ \
                off_a1112 = diagoffa_i; \
                k_a1112 = k - off_a1112; \
\
                /* Compute the panel stride for the current diagonal-
                   intersecting micro-panel (rounded up to an even value). */ \
                is_a_cur = k_a1112 * PACKMR; \
                is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
                ps_a_cur = is_a_cur; \
\
                /* NOTE: ir loop parallelism disabled for now. */ \
                /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
                b1_i = b1 + off_a1112 * PACKNR; \
\
                /* Compute the addresses of the next panels of A and B. */ \
                a2 = a1; \
                if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
                { \
                    a2 = a_cast; \
                    b2 = b1; \
                    if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
                        b2 = b_cast; \
                } \
\
                /* Save addresses of next panels of A and B to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_next_a( a2, &aux ); \
                bli_auxinfo_set_next_b( b2, &aux ); \
\
                /* Invoke the gemm micro-kernel. */ \
                gemm_ukr \
                ( \
                  m_cur, \
                  n_cur, \
                  k_a1112, \
                  alpha_cast, \
                  a1, \
                  b1_i, \
                  beta_cast, \
                  c11, rs_c, cs_c, \
                  &aux, \
                  cntx \
                ); \
                /*}*/ \
\
                a1 += ps_a_cur; \
            } \
            else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
            { \
                /* NOTE: ir loop parallelism disabled for now. */ \
                /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
                ctype* restrict a2; \
\
                /* Compute the addresses of the next panels of A and B. */ \
                a2 = a1; \
                if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
                { \
                    a2 = a_cast; \
                    b2 = b1; \
                    if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \
                        b2 = b_cast; \
                } \
\
                /* Save addresses of next panels of A and B to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_next_a( a2, &aux ); \
                bli_auxinfo_set_next_b( b2, &aux ); \
\
                /* Invoke the gemm micro-kernel over the full k dimension,
                   passing one in the beta slot. */ \
                gemm_ukr \
                ( \
                  m_cur, \
                  n_cur, \
                  k, \
                  alpha_cast, \
                  a1, \
                  b1, \
                  one, \
                  c11, rs_c, cs_c, \
                  &aux, \
                  cntx \
                ); \
                /*}*/ \
\
                a1 += rstep_a; \
            } \
\
            c11 += rstep_c; \
        } \
    } \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 )

cython-blis-0.9.1/blis/_src/frame/3/trmm/bli_trmm_rl_ker_var2.c000066400000000000000000000354621427272030600243240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffb, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); /* Object-based front end of the trmm macro-kernel that bli_trmm_xx_ker_var2 selects when the root of A is not triangular and the root of B is lower. It reads the dimensions, strides, panel sizes and buffers from a, b and c, merges the scalars attached to a and b into alpha, takes beta from the scalar attached to c, and calls the typed function stored in ftypes for the execution datatype of c. The cntl argument is not referenced in this function. */ void bli_trmm_rl_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffb = bli_obj_diag_offset( b ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. 
bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, rntm, thread ); } /* Typed macro-kernel template, instantiated at the end of this file through INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 ). It walks the packed micro-panels of A and B and invokes the gemm micro-kernel on each MR x NR tile of C, first over the rectangular part of B and then over the part that intersects the diagonal. The rntm parameter is not referenced in the body. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffb, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Query the context for the micro-kernel address and cast it to its function pointer type. 
*/ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffb_j; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_b1121; \ dim_t off_b1121; \ dim_t i, j; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t ps_b_cur; \ inc_t is_b_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. The check below aborts when PACKMR and NR are both odd, or when PACKNR and MR are both odd. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current panel of B is entirely above the diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ \ /* Compute k_full. For all trmm, k_full is simply k. This is needed because some parameter combinations of trmm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of A (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = k; \ \ /* If there is a zero region above where the diagonal of B intersects the left edge of the panel, adjust the pointer to A and treat this case as if the diagonal offset were zero. 
Note that we don't need to adjust the pointer to B since packm would have simply skipped over the region that was not stored. */ \ if ( diagoffb < 0 ) \ { \ j = -diagoffb; \ k = k - j; \ diagoffb = 0; \ a_cast = a_cast + j * PACKMR; \ } \ \ /* If there is a zero region to the right of where the diagonal of B intersects the bottom of the panel, shrink it to prevent "no-op" iterations from executing. */ \ if ( diagoffb + k < n ) \ { \ n = diagoffb + k; \ } \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. NOTE: istep_b is computed and rounded up to an even value below, but it is not referenced again in this macro-kernel. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k_full; \ istep_b = PACKNR * k; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ \ /* The sub-node of 'thread' (caucus) supplies the thread count and work id used for the ir loop; 'thread' itself supplies those used for the jr loop. */ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ dim_t jr_nt = bli_thread_n_way( thread ); \ dim_t jr_tid = bli_thread_work_id( thread ); \ dim_t ir_nt = bli_thread_n_way( caucus ); \ dim_t ir_tid = bli_thread_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ dim_t jr_inc, ir_inc; \ \ /* Note that we partition the 2nd loop into two regions: the rectangular part of B, and the triangular portion. 
*/ \ dim_t n_iter_rct; \ dim_t n_iter_tri; \ \ /* NOTE(review): this test passes m as the row count, whereas the safeguard near the top of this function passes k for the same panel of B -- confirm which is intended. */ if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \ { \ /* If the entire panel of B does not intersect the diagonal, there is no triangular region, and therefore we can skip the second set of loops. 
*/ \ n_iter_rct = n_iter; \ n_iter_tri = 0; \ } \ else \ { \ /* If the panel of B does intersect the diagonal, compute the number of iterations in the rectangular region by dividing NR into the diagonal offset. (There should never be any remainder in this division.) The number of iterations in the triangular (or trapezoidal) region is computed as the remaining number of iterations in the n dimension. */ \ n_iter_rct = diagoffb / NR; \ n_iter_tri = n_iter - n_iter_rct; \ } \ \ /* Determine the thread range and increment for the 2nd and 1st loops for the initial rectangular region of B (if it exists). NOTE: The definition of bli_thread_range_jrir() will depend on whether slab or round-robin partitioning was requested at configure-time. \ NOTE: Parallelism in the 1st loop is disabled for now. */ \ bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ b1 = b_cast + j * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ { \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ a1 = a_cast + i * rstep_a; \ c11 = c1 + i * rstep_c; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ m_cur, \ n_cur, \ k, \ alpha_cast, \ a1, \ b1, \ one, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ } \ } \ \ /* If there is no triangular region, then we're done. */ \ if ( n_iter_tri == 0 ) return; \ \ /* Use round-robin assignment of micropanels to threads in the 2nd and 1st loops for the remaining triangular region of B (if it exists). NOTE: We don't need to call bli_thread_range_jrir_rr() here since we employ a hack that calls for each thread to execute every iteration of the jr and ir loops but skip all but the pointer increment for iterations that are not assigned to it. */ \ \ /* Advance the starting b1 and c1 pointers to the positions corresponding to the start of the triangular region of B. */ \ jr_start = n_iter_rct; \ b1 = b_cast + jr_start * cstep_b; \ c1 = c_cast + jr_start * cstep_c; \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_start; j < n_iter; ++j ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ diagoffb_j = diagoffb - ( doff_t )j*NR; \ \ /* Determine the offset to the beginning of the panel that was packed so we can index into the corresponding location in A. Then compute the length of that panel. */ \ off_b1121 = bli_max( -diagoffb_j, 0 ); \ k_b1121 = k - off_b1121; \ \ a1 = a_cast; \ c11 = c1; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* If the current panel of B intersects the diagonal, scale C by beta. 
If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ is_b_cur = k_b1121 * PACKNR; \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ ps_b_cur = is_b_cur; \ \ if ( bli_trmm_my_iter_rr( j, thread ) ) { \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ if ( bli_trmm_my_iter_rr( i, caucus ) ) { \ \ ctype* restrict a1_i; \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ a1_i = a1 + off_b1121 * PACKMR; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ m_cur, \ n_cur, \ k_b1121, \ alpha_cast, \ a1_i, \ b1, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ \ a1 += rstep_a; \ c11 += rstep_c; \ } \ } \ \ b1 += ps_b_cur; \ } \ \ c1 += cstep_c; \ } \ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ } INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/3/trmm/bli_trmm_ru_ker_var2.c000066400000000000000000000373201427272030600243300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffb, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); /* Object-based front end of the trmm macro-kernel that bli_trmm_xx_ker_var2 selects when the root of A is not triangular and the root of B is not lower. It reads the dimensions, strides, panel sizes and buffers from a, b and c, merges the scalars attached to a and b into alpha, takes beta from the scalar attached to c, and calls the typed function stored in ftypes for the execution datatype of c. The cntl argument is not referenced in this function. */ void bli_trmm_ru_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffb = bli_obj_diag_offset( b ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. 
f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, rntm, thread ); } /* Typed macro-kernel template, instantiated at the end of this file through INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 ). It walks the packed micro-panels of A and B and invokes the gemm micro-kernel on each MR x NR tile of C, first over the part of B that intersects the diagonal and then over the rectangular part. The rntm parameter is not referenced in the body. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffb, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffb_j; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_b0111; \ dim_t off_b0111; \ dim_t i, j, jb0; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t ps_b_cur; \ inc_t is_b_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. The check below aborts when PACKMR and NR are both odd, or when PACKNR and MR are both odd. 
*/ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current panel of B is entirely below its diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ \ /* Compute k_full. For all trmm, k_full is simply k. This is needed because some parameter combinations of trmm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of A (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = k; \ \ /* If there is a zero region to the left of where the diagonal of B intersects the top edge of the panel, adjust the pointer to C and treat this case as if the diagonal offset were zero. This skips over the region that was not packed. (Note we assume the diagonal offset is a multiple of MR; this assumption will hold as long as the cache blocksizes are each a multiple of MR and NR.) */ \ if ( diagoffb > 0 ) \ { \ j = diagoffb; \ n = n - j; \ diagoffb = 0; \ c_cast = c_cast + (j )*cs_c; \ } \ \ /* If there is a zero region below where the diagonal of B intersects the right side of the block, shrink it to prevent "no-op" iterations from executing. */ \ if ( -diagoffb + n < k ) \ { \ k = -diagoffb + n; \ } \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. NOTE: istep_b is computed and rounded up to an even value below, but it is not referenced again in this macro-kernel. 
*/ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k_full; \ istep_b = PACKNR * k; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ \ /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) loop around the microkernel. Here we query the thrinfo_t node for the 1st (ir) loop around the microkernel. */ \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ /* Query the number of threads and thread ids for each loop. */ \ dim_t jr_nt = bli_thread_n_way( thread ); \ dim_t jr_tid = bli_thread_work_id( thread ); \ dim_t ir_nt = bli_thread_n_way( caucus ); \ dim_t ir_tid = bli_thread_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ dim_t jr_inc, ir_inc; \ \ /* Note that we partition the 2nd loop into two regions: the triangular part of C, and the rectangular portion. */ \ dim_t n_iter_tri; \ dim_t n_iter_rct; \ \ if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \ { \ /* If the entire panel of B does not intersect the diagonal, there is no triangular region, and therefore we can skip the first set of loops. */ \ n_iter_tri = 0; \ n_iter_rct = n_iter; \ } \ else \ { \ /* If the panel of B does intersect the diagonal, compute the number of iterations in the triangular (or trapezoidal) region by dividing NR into the number of rows in B. (The expression below rounds up, so any remainder in this division adds one more iteration.) The number of iterations in the rectangular region is computed as the remaining number of iterations in the n dimension. */ \ n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 
1 : 0 ); \ n_iter_rct = n_iter - n_iter_tri; \ } \ \ /* Use round-robin assignment of micropanels to threads in the 2nd and 1st loops for the initial triangular region of B (if it exists). NOTE: We don't need to call bli_thread_range_jrir_rr() here since we employ a hack that calls for each thread to execute every iteration of the jr and ir loops but skip all but the pointer increment for iterations that are not assigned to it. */ \ \ b1 = b_cast; \ c1 = c_cast; \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter_tri; ++j ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ diagoffb_j = diagoffb - ( doff_t )j*NR; \ \ /* Determine the offset to and length of the panel that was packed so we can index into the corresponding location in A. */ \ off_b0111 = 0; \ k_b0111 = bli_min( k, -diagoffb_j + NR ); \ \ a1 = a_cast; \ c11 = c1; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* If the current panel of B intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ is_b_cur = k_b0111 * PACKNR; \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ ps_b_cur = is_b_cur; \ \ if ( bli_trmm_my_iter_rr( j, thread ) ) { \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ if ( bli_trmm_my_iter_rr( i, caucus ) ) { \ \ ctype* restrict a1_i; \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ a1_i = a1 + off_b0111 * PACKMR; \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ a2 = a1; \ if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ m_cur, \ n_cur, \ k_b0111, \ alpha_cast, \ a1_i, \ b1, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ \ a1 += rstep_a; \ c11 += rstep_c; \ } \ } \ \ b1 += ps_b_cur; \ } \ \ c1 += cstep_c; \ } \ \ /* If there is no rectangular region, then we're done. */ \ if ( n_iter_rct == 0 ) return; \ \ /* Determine the thread range and increment for the 2nd and 1st loops for the remaining rectangular region of B. NOTE: The definition of bli_thread_range_jrir() will depend on whether slab or round-robin partitioning was requested at configure-time. \ NOTE: Parallelism in the 1st loop is disabled for now. */ \ bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Advance the start and end iteration offsets for the rectangular region by the number of iterations used for the triangular region. */ \ jr_start += n_iter_tri; \ jr_end += n_iter_tri; \ jb0 = n_iter_tri; \ \ /* Save the resulting value of b1 from the previous loop since it represents the starting point for the rectangular region. */ \ b_cast = b1; \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ /* NOTE: We must index through b_cast differently since it contains the starting address of the rectangular region (which is already n_iter_tri logical iterations through B). */ \ b1 = b_cast + (j-jb0) * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* If the current panel of B intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. NOTE: in this rectangular loop the micro-kernel call below always passes 'one' rather than beta_cast. */ \ { \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ a1 = a_cast + i * rstep_a; \ c11 = c1 + i * rstep_c; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ m_cur, \ n_cur, \ k, \ alpha_cast, \ a1, \ b1, \ one, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ } \ } \ \ \ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ } INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/3/trmm/bli_trmm_var.h000066400000000000000000000057561427272030600227140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces. // /* GENPROT( opname ) declares one object-based trmm macro-kernel variant named PASTEMAC0(opname). Every variant takes the operands a, b and c followed by the context, runtime, control-tree node and thread-info pointers. The blk_var1/2/3 prototypes below are commented out; only the ker_var2 variants are declared. */ #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); //GENPROT( trmm_blk_var1 ) //GENPROT( trmm_blk_var2 ) //GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) GENPROT( trmm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
// /* GENTPROT( ctype, ch, varname ) declares the typed macro-kernel PASTEMAC(ch,varname) that takes raw buffers, strides, panel dimensions and panel strides instead of obj_t operands. The first parameter is named diagoff here, while the rl and ru definitions name it diagoffb; parameter names in a prototype do not have to match the definition. No typed prototype is emitted for trmm_xx_ker_var2. */ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/3/trmm/bli_trmm_xx_ker_var2.c000066400000000000000000000054231427272030600243400ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" static l3_var_oft vars[2][2] = { { bli_trmm_ll_ker_var2, bli_trmm_lu_ker_var2 }, { bli_trmm_rl_ker_var2, bli_trmm_ru_ker_var2 } }; void bli_trmm_xx_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { dim_t side; dim_t uplo; l3_var_oft f; // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular // matrix's root object (whether that is matrix A or matrix B). if ( bli_obj_root_is_triangular( a ) ) { side = 0; if ( bli_obj_root_is_lower( a ) ) uplo = 0; else uplo = 1; } else // if ( bli_obj_root_is_triangular( b ) ) { side = 1; if ( bli_obj_root_is_lower( b ) ) uplo = 0; else uplo = 1; } // Index into the variant array to extract the correct function pointer. f = vars[side][uplo]; // Call the macrokernel. f ( a, b, c, cntx, rntm, cntl, thread ); } cython-blis-0.9.1/blis/_src/frame/3/trmm/other/000077500000000000000000000000001427272030600211725ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/3/trmm/other/bli_trmm_ll_ker_var2.c000066400000000000000000000372031427272030600254320ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffa, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); /* Object-based front end. Reads the diagonal offset, pack schemas, dimensions, strides, panel dimensions/strides and buffers out of a, b and c, multiplies the scalars attached to A and B into a single alpha, takes beta from c, and calls the typed macrokernel stored in ftypes[] at the index given by the execution datatype of c. The cntl argument is not used here. */ void bli_trmm_ll_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffa = bli_obj_diag_offset( a ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. 
f( diagoffa, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, rntm, thread ); } /* Typed macrokernel template. GENTFUNC( ctype, ch, varname ) expands to one function named by PASTEMAC(ch,varname). The function walks C in NR-wide column panels (index j) and MR-tall row panels (index i) and calls the gemm micro-kernel for every micro-panel of A that intersects the diagonal (using beta and a shortened k) or lies strictly below it (using one and the full k). Micro-panels that satisfy neither test are skipped. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffa, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* jr_thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are chosen from the storage preference of the gemm micro-kernel (col_pref), not from the strides of C: ct is column-stored (rs_ct = 1, cs_ct = MR) when the micro-kernel prefers columns, and row-stored (rs_ct = NR, cs_ct = 1) otherwise. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffa_i; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_a1011; \ dim_t off_a1011; \ dim_t i, j; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t off_scl; \ inc_t ss_a_num; \ inc_t ss_a_den; \ inc_t ps_a_cur; \ inc_t is_a_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if PACKMR and NR are both odd, or if PACKNR and MR are both odd. Abort in either case. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current block of A is entirely above the diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ \ /* Compute k_full. For all trmm, k_full is simply k. This is needed because some parameter combinations of trmm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of B (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = k; \ \ /* Compute indexing scaling factor for 4m or 3m. 
This is needed because one of the packing register blocksizes (PACKMR or PACKNR) is used to index into the micro-panels of the non- triangular matrix when computing with a diagonal-intersecting micro-panel of the triangular matrix. In the case of 4m or 3m, real values are stored in both sub-panels, and so the indexing needs to occur in units of real values. The value computed here is divided into the complex pointer offset to cause the pointer to be advanced by the correct value. */ \ if ( bli_is_4mi_packed( schema_a ) || \ bli_is_3mi_packed( schema_a ) || \ bli_is_rih_packed( schema_a ) ) off_scl = 2; \ else off_scl = 1; \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the offset by 3/2. And if we are packing real-only, imag-only, or summed-only, we need to scale the computed panel sizes by 1/2 to compensate for the fact that the pointer arithmetic occurs in terms of complex elements rather than real elements. */ \ if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ else { ss_a_num = 1; ss_a_den = 1; } \ \ /* If there is a zero region above where the diagonal of A intersects the left edge of the block, adjust the pointer to C and treat this case as if the diagonal offset were zero. This skips over the region that was not packed. (Note we assume the diagonal offset is a multiple of MR; this assumption will hold as long as the cache blocksizes are each a multiple of MR and NR.) */ \ if ( diagoffa < 0 ) \ { \ i = -diagoffa; \ m = m - i; \ diagoffa = 0; \ c_cast = c_cast + (i )*rs_c; \ } \ \ /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. 
*/ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k; \ istep_b = PACKNR * k_full; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ b1 = b_cast; \ c1 = c_cast; \ \ /* Query the sub-node of jr_thread (passed to bli_trmm_my_iter() for the i loop below) and the thread count and work id of jr_thread. */ \ thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ { \ if ( bli_trmm_my_iter( j, jr_thread ) ) { \ \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ a1 = a_cast; \ c11 = c1; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ diagoffa_i = diagoffa + ( doff_t )i*MR; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* If the current panel of A intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ { \ ctype* restrict b1_i; \ ctype* restrict a2; \ \ /* Determine the offset to and length of the panel that was packed so we can index into the corresponding location in b1. 
*/ \ off_a1011 = 0; \ k_a1011 = bli_min( diagoffa_i + MR, k ); \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ is_a_cur = k_a1011 * PACKMR; \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ if ( bli_trmm_my_iter( i, ir_thread ) ) { \ \ b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( is_a_cur, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_a1011, \ alpha_cast, \ a1, \ b1_i, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Copy edge elements of C to the temporary buffer. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ c11, rs_c, cs_c, \ ct, rs_ct, cs_ct ); \ \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_a1011, \ alpha_cast, \ a1, \ b1_i, \ beta_cast, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Copy the result to the edge of C. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ } \ \ a1 += ps_a_cur; \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ if ( bli_trmm_my_iter( i, ir_thread ) ) { \ \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ a2 = a1; \ if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ one, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the gemm micro-kernel with a beta of zero so that ct receives only the product. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Add the result to the edge of C. */ \ PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ } \ \ a1 += rstep_a; \ } \ \ c11 += rstep_c; \ } \ } \ \ b1 += cstep_b; \ c1 += cstep_c; \ } \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ } INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c000066400000000000000000000410551427272030600257760ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffa, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2rr); // // -- Macrokernel functions for round-robin partitioning ----------------------- // /* Object-based front end. Reads the diagonal offset, pack schemas, dimensions, strides, panel dimensions/strides and buffers out of a, b and c, multiplies the scalars attached to A and B into a single alpha, takes beta from c, and calls the typed macrokernel stored in ftypes[] at the index given by the execution datatype of c. The cntl argument is not used here. */ void bli_trmm_ll_ker_var2rr ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffa = bli_obj_diag_offset( a ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. 
f( diagoffa, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, rntm, thread ); } /* Typed macrokernel template. GENTFUNC( ctype, ch, varname ) expands to one function named by PASTEMAC(ch,varname). The j loop visits the NR-wide column panels in the range returned by bli_thread_range_jrir_rr(); the i loop visits every MR-tall row panel. The gemm micro-kernel is called for every micro-panel of A that intersects the diagonal (using beta and a shortened k) or lies strictly below it (using one and the full k). Micro-panels that satisfy neither test are skipped. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffa, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are chosen from the storage preference of the gemm micro-kernel (col_pref), not from the strides of C: ct is column-stored (rs_ct = 1, cs_ct = MR) when the micro-kernel prefers columns, and row-stored (rs_ct = NR, cs_ct = 1) otherwise. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffa_i; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_a1011; \ dim_t off_a1011; \ dim_t i, j; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t off_scl; \ inc_t ss_a_num; \ inc_t ss_a_den; \ inc_t ps_a_cur; \ inc_t is_a_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if PACKMR and NR are both odd, or if PACKNR and MR are both odd. Abort in either case. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current block of A is entirely above the diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ \ /* Compute k_full. For all trmm, k_full is simply k. This is needed because some parameter combinations of trmm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of B (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = k; \ \ /* Compute indexing scaling factor for 4m or 3m. 
This is needed because one of the packing register blocksizes (PACKMR or PACKNR) is used to index into the micro-panels of the non- triangular matrix when computing with a diagonal-intersecting micro-panel of the triangular matrix. In the case of 4m or 3m, real values are stored in both sub-panels, and so the indexing needs to occur in units of real values. The value computed here is divided into the complex pointer offset to cause the pointer to be advanced by the correct value. */ \ if ( bli_is_4mi_packed( schema_a ) || \ bli_is_3mi_packed( schema_a ) || \ bli_is_rih_packed( schema_a ) ) off_scl = 2; \ else off_scl = 1; \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the offset by 3/2. And if we are packing real-only, imag-only, or summed-only, we need to scale the computed panel sizes by 1/2 to compensate for the fact that the pointer arithmetic occurs in terms of complex elements rather than real elements. */ \ if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ else { ss_a_num = 1; ss_a_den = 1; } \ \ /* If there is a zero region above where the diagonal of A intersects the left edge of the block, adjust the pointer to C and treat this case as if the diagonal offset were zero. This skips over the region that was not packed. (Note we assume the diagonal offset is a multiple of MR; this assumption will hold as long as the cache blocksizes are each a multiple of MR and NR.) */ \ if ( diagoffa < 0 ) \ { \ i = -diagoffa; \ m = m - i; \ diagoffa = 0; \ c_cast = c_cast + (i )*rs_c; \ } \ \ /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. 
*/ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k; \ istep_b = PACKNR * k_full; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) loop around the microkernel. Here we query the thrinfo_t node for the 1st (ir) loop around the microkernel. */ \ /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ \ /* Query the number of threads and thread ids for each loop. */ \ dim_t jr_nt = bli_thread_n_way( thread ); \ dim_t jr_tid = bli_thread_work_id( thread ); \ /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ \ dim_t jr_start, jr_end; \ /*dim_t ir_start, ir_end;*/ \ dim_t jr_inc; \ \ /* Use round-robin assignment of micropanels to threads in the 2nd loop for the initial rectangular region of C (if it exists). NOTE: Parallelism in the 1st loop is disabled for now. */ \ bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ b1 = b_cast + j * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ a1 = a_cast; \ c11 = c1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ diagoffa_i = diagoffa + ( doff_t )i*MR; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* If the current panel of A intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ { \ ctype* restrict b1_i; \ ctype* restrict a2; \ \ /* Determine the offset to and length of the panel that was packed so we can index into the corresponding location in b1. */ \ off_a1011 = 0; \ k_a1011 = bli_min( diagoffa_i + MR, k ); \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ is_a_cur = k_a1011 * PACKMR; \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ /* NOTE: ir loop parallelism disabled for now. */ \ /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( is_a_cur, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_a1011, \ alpha_cast, \ a1, \ b1_i, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Copy edge elements of C to the temporary buffer. 
*/ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ c11, rs_c, cs_c, \ ct, rs_ct, cs_ct ); \ \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_a1011, \ alpha_cast, \ a1, \ b1_i, \ beta_cast, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Copy the result to the edge of C. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ /*}*/ \ \ a1 += ps_a_cur; \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ /* NOTE: ir loop parallelism disabled for now. */ \ /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ one, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the gemm micro-kernel with a beta of zero so that ct receives only the product. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Add the result to the edge of C. 
*/ \ PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ /*}*/ \ \ a1 += rstep_a; \ } \ \ c11 += rstep_c; \ } \ } \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2rr: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2rr: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ } INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2rr ) cython-blis-0.9.1/blis/_src/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c000066400000000000000000000410451427272030600257700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

// Function pointer type shared by the datatype-specific macrokernels that the
// GENTFUNC template further down generates. The dispatcher below picks one of
// them out of the ftypes table and calls it with exactly this argument list.
#define FUNCPTR_T gemm_fp

typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffa,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Table of macrokernel function pointers, indexed by the execution datatype
// in the dispatcher below.
// NOTE(review): GENARRAY is defined elsewhere; presumably it fills the table
// with the instances created by INSERT_GENTFUNC_BASIC0 at the bottom of this
// file -- confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2sl);

//
// -- Macrokernel functions for slab partitioning ------------------------------
//

// bli_trmm_ll_ker_var2sl
//
// Object-level entry point. It unpacks everything the typed macrokernel needs
// from the three operand objects and forwards it to ftypes[dt_exec]:
//   - from c: the execution datatype, m (length), n (width), the buffer, the
//     row/column strides and the internal scalar used as beta;
//   - from a: the diagonal offset, pack schema, k (width), buffer, column
//     stride, panel dimension and panel stride;
//   - from b: the pack schema, buffer, row stride, panel dimension and panel
//     stride.
// The scalars attached to a and b are detached and multiplied together; the
// product is read back from scalar_b and passed as alpha.
// cntx, rntm and thread are forwarded unchanged; cntl is not referenced here.
void bli_trmm_ll_ker_var2sl
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
    // Datatype used to select the typed macrokernel.
    num_t dt_exec = bli_obj_exec_dt( c );

    doff_t diagoffa = bli_obj_diag_offset( a );

    pack_t schema_a = bli_obj_pack_schema( a );
    pack_t schema_b = bli_obj_pack_schema( b );

    // Problem dimensions: C is m x n and k is the width of A.
    dim_t m = bli_obj_length( c );
    dim_t n = bli_obj_width( c );
    dim_t k = bli_obj_width( a );

    void* buf_a = bli_obj_buffer_at_off( a );
    inc_t cs_a = bli_obj_col_stride( a );
    dim_t pd_a = bli_obj_panel_dim( a );
    inc_t ps_a = bli_obj_panel_stride( a );

    void* buf_b = bli_obj_buffer_at_off( b );
    inc_t rs_b = bli_obj_row_stride( b );
    dim_t pd_b = bli_obj_panel_dim( b );
    inc_t ps_b = bli_obj_panel_stride( b );

    void* buf_c = bli_obj_buffer_at_off( c );
    inc_t rs_c = bli_obj_row_stride( c );
    inc_t cs_c = bli_obj_col_stride( c );

    obj_t scalar_a;
    obj_t scalar_b;

    void* buf_alpha;
    void* buf_beta;

    FUNCPTR_T f;

    // Detach and multiply the scalars attached to A and B.
    bli_obj_scalar_detach( a, &scalar_a );
    bli_obj_scalar_detach( b, &scalar_b );
    bli_mulsc( &scalar_a, &scalar_b );

    // Grab the addresses of the internal scalar buffers for the scalar
    // merged above and the scalar attached to C.
    buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
    buf_beta = bli_obj_internal_scalar_buffer( c );

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_exec];

    // Invoke the function.
    f( diagoffa,
       schema_a,
       schema_b,
       m,
       n,
       k,
       buf_alpha,
       buf_a, cs_a, pd_a, ps_a,
       buf_b, rs_b, pd_b, ps_b,
       buf_beta,
       buf_c, rs_c, cs_c,
       cntx,
       rntm,
       thread );
}

/*
   GENTFUNC( ctype, ch, varname )

   Template for the datatype-specific macrokernel; it is instantiated by
   INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2sl ) at the bottom of this file.

   The generated function walks C in MR x NR tiles. The outer (jr) loop runs
   over the column-panel range [jr_start, jr_end) with step jr_inc obtained
   from bli_thread_range_jrir_sl(); the inner (ir) loop runs over all m_iter
   row panels and is not parallelized here. For every tile the micro-panel of
   A is classified by its diagonal offset diagoffa_i:

     - intersects the diagonal: gemm_ukr is called with the shortened inner
       dimension k_a1011 and with beta_cast, then a1 advances by ps_a_cur;
     - strictly below the diagonal: gemm_ukr is called with the full k and
       accumulates into C (beta of one), then a1 advances by rstep_a;
     - anything else: no micro-kernel call and a1 is not advanced.

   Tiles with m_cur != MR or n_cur != NR are routed through the stack buffer
   ct so that the micro-kernel always writes a full MR x NR tile.

   Parameters:
     diagoffa            diagonal offset of the block of A; a negative value
                         is handled by skipping -diagoffa rows of C.
     schema_a, schema_b  pack schemas; both are stored in the auxinfo_t and
                         schema_a also selects off_scl and ss_a_num/ss_a_den.
     m, n, k             dimensions; the function returns at once if
                         bli_zero_dim3( m, n, k ) holds.
     alpha, beta         scalars, cast to ctype*.
     a, cs_a, pd_a, ps_a buffer of A; cs_a is aliased to PACKMR, pd_a to MR
                         and ps_a is the step between micro-panels (rstep_a).
     b, rs_b, pd_b, ps_b buffer of B; rs_b is aliased to PACKNR, pd_b to NR
                         and ps_b is the step between micro-panels (cstep_b).
     c, rs_c, cs_c       buffer and strides of C.
     cntx                queried for the gemm micro-kernel and its storage
                         preference, and passed on to the micro-kernel.
     rntm                not referenced in this function.
     thread              thrinfo_t node of the jr loop.

   Other early exits: bli_abort() on the odd-blocksize combinations checked
   below, and a plain return when the block of A is strictly above the
   diagonal.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffa, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Alias some constants to simpler names. */ \
    const dim_t MR = pd_a; \
    const dim_t NR = pd_b; \
    const dim_t PACKMR = cs_a; \
    const dim_t PACKNR = rs_b; \
\
    /* Query the context for the micro-kernel address and cast it to its
       function pointer type. */ \
    PASTECH(ch,gemm_ukr_ft) \
               gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
    /* Temporary C buffer for edge cases. Its strides follow the storage
       preference reported for the gemm micro-kernel (col_pref), not the
       strides of C: column storage (rs_ct = 1, cs_ct = MR) when columns are
       preferred, row storage (rs_ct = NR, cs_ct = 1) otherwise. */ \
    ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
              / sizeof( ctype ) ] \
              __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
    const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
    const inc_t rs_ct = ( col_pref ? 1 : NR ); \
    const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
    ctype* restrict one = PASTEMAC(ch,1); \
    ctype* restrict zero = PASTEMAC(ch,0); \
    ctype* restrict a_cast = a; \
    ctype* restrict b_cast = b; \
    ctype* restrict c_cast = c; \
    ctype* restrict alpha_cast = alpha; \
    ctype* restrict beta_cast = beta; \
    ctype* restrict b1; \
    ctype* restrict c1; \
\
    doff_t diagoffa_i; \
    dim_t k_full; \
    dim_t m_iter, m_left; \
    dim_t n_iter, n_left; \
    dim_t m_cur; \
    dim_t n_cur; \
    dim_t k_a1011; \
    dim_t off_a1011; \
    dim_t i, j; \
    inc_t rstep_a; \
    inc_t cstep_b; \
    inc_t rstep_c, cstep_c; \
    inc_t istep_a; \
    inc_t istep_b; \
    inc_t off_scl; \
    inc_t ss_a_num; \
    inc_t ss_a_den; \
    inc_t ps_a_cur; \
    inc_t is_a_cur; \
    auxinfo_t aux; \
\
    /*
       Assumptions/assertions:
         rs_a == 1
         cs_a == PACKMR
         pd_a == MR
         ps_a == stride to next micro-panel of A
         rs_b == PACKNR
         cs_b == 1
         pd_b == NR
         ps_b == stride to next micro-panel of B
         rs_c == (no assumptions)
         cs_c == (no assumptions)
    */ \
\
    /* Safety trap: abort if PACKMR and NR are both odd, or if PACKNR and MR
       are both odd. Certain indexing within this macro-kernel does not work
       as intended for those combinations. */ \
    if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
         ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
    /* If any dimension is zero, return immediately. */ \
    if ( bli_zero_dim3( m, n, k ) ) return; \
\
    /* Safeguard: If the current block of A is entirely above the diagonal,
       it is implicitly zero. So we do nothing. */ \
    if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
    /* Compute k_full. For all trmm, k_full is simply k. This is
       needed because some parameter combinations of trmm reduce k
       to advance past zero regions in the triangular matrix, and
       when computing the imaginary stride of B (the non-triangular
       matrix), which is used by 4m1/3m1 implementations, we need
       this unreduced value of k. */ \
    k_full = k; \
\
    /* Compute the indexing scaling factor for 4m or 3m. This is
       needed because one of the packing register blocksizes (PACKMR
       or PACKNR) is used to index into the micro-panels of the non-
       triangular matrix when computing with a diagonal-intersecting
       micro-panel of the triangular matrix. In the case of 4m or 3m,
       real values are stored in both sub-panels, and so the indexing
       needs to occur in units of real values. The value computed
       here is divided into the complex pointer offset to cause the
       pointer to be advanced by the correct value. */ \
    if ( bli_is_4mi_packed( schema_a ) || \
         bli_is_3mi_packed( schema_a ) || \
         bli_is_rih_packed( schema_a ) ) off_scl = 2; \
    else off_scl = 1; \
\
    /* Compute the storage stride scaling. Usually this is just 1.
       However, in the case of interleaved 3m, we need to scale the
       offset by 3/2. And if we are packing real-only, imag-only, or
       summed-only, we need to scale the computed panel sizes by 1/2
       to compensate for the fact that the pointer arithmetic occurs
       in terms of complex elements rather than real elements. */ \
    if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
    else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
    else { ss_a_num = 1; ss_a_den = 1; } \
\
    /* If there is a zero region above where the diagonal of A intersects the
       left edge of the block, adjust the pointer to C and treat this case as
       if the diagonal offset were zero. This skips over the region that was
       not packed. (Note we assume the diagonal offset is a multiple of MR;
       this assumption will hold as long as the cache blocksizes are each a
       multiple of MR and NR.) */ \
    if ( diagoffa < 0 ) \
    { \
        i = -diagoffa; \
        m = m - i; \
        diagoffa = 0; \
        c_cast = c_cast + (i )*rs_c; \
    } \
\
    /* Clear the temporary C buffer in case it has any infs or NaNs. */ \
    PASTEMAC(ch,set0s_mxn)( MR, NR, \
                            ct, rs_ct, cs_ct ); \
\
    /* Compute number of primary and leftover components of the m and n
       dimensions. A non-zero remainder adds one extra (edge) iteration. */ \
    n_iter = n / NR; \
    n_left = n % NR; \
\
    m_iter = m / MR; \
    m_left = m % MR; \
\
    if ( n_left ) ++n_iter; \
    if ( m_left ) ++m_iter; \
\
    /* Determine some increments used to step through A, B, and C. */ \
    rstep_a = ps_a; \
\
    cstep_b = ps_b; \
\
    rstep_c = rs_c * MR; \
    cstep_c = cs_c * NR; \
\
    istep_a = PACKMR * k; \
    istep_b = PACKNR * k_full; \
\
    if ( bli_is_odd( istep_a ) ) istep_a += 1; /* round up to an even stride */ \
    if ( bli_is_odd( istep_b ) ) istep_b += 1; /* round up to an even stride */ \
\
    /* Save the pack schemas of A and B to the auxinfo_t object. */ \
    bli_auxinfo_set_schema_a( schema_a, &aux ); \
    bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
    /* Save the imaginary stride of B to the auxinfo_t object. */ \
    bli_auxinfo_set_is_b( istep_b, &aux ); \
\
    /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
       loop around the microkernel. Here we query the thrinfo_t node for the
       1st (ir) loop around the microkernel. */ \
    /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \
\
    /* Query the number of threads and thread ids for each loop. */ \
    dim_t jr_nt = bli_thread_n_way( thread ); \
    dim_t jr_tid = bli_thread_work_id( thread ); \
    /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \
    dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \
\
    dim_t jr_start, jr_end; \
    /*dim_t ir_start, ir_end;*/ \
    dim_t jr_inc; \
\
    /* Use slab assignment of micropanels to threads in the 2nd loop for
       the initial rectangular region of C (if it exists).
       NOTE: Parallelism in the 1st loop is disabled for now. */ \
    bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
    /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \
\
    /* Loop over the n dimension (NR columns at a time). */ \
    for ( j = jr_start; j < jr_end; j += jr_inc ) \
    { \
        ctype* restrict a1; \
        ctype* restrict c11; \
        ctype* restrict b2; \
\
        b1 = b_cast + j * cstep_b; \
        c1 = c_cast + j * cstep_c; \
\
        n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
        /* Initialize our next panel of B to be the current panel of B. */ \
        b2 = b1; \
\
        a1 = a_cast; \
        c11 = c1; \
\
        /* Loop over the m dimension (MR rows at a time). */ \
        for ( i = 0; i < m_iter; ++i ) \
        { \
            diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
            m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
            /* If the current panel of A intersects the diagonal, scale C
               by beta. If it is strictly below the diagonal, scale by one.
               This allows the current macro-kernel to work for both trmm
               and trmm3. A panel matching neither test triggers no
               micro-kernel call and does not advance a1. */ \
            if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
            { \
                ctype* restrict b1_i; \
                ctype* restrict a2; \
\
                /* Determine the offset to and length of the panel that was
                   packed so we can index into the corresponding location in
                   b1. */ \
                off_a1011 = 0; \
                k_a1011 = bli_min( diagoffa_i + MR, k ); \
\
                /* Compute the panel stride for the current diagonal-
                   intersecting micro-panel. */ \
                is_a_cur = k_a1011 * PACKMR; \
                is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
                ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
                /* NOTE: ir loop parallelism disabled for now. */ \
                /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
                b1_i = b1 + ( off_a1011 * PACKNR ) / off_scl; \
\
                /* Compute the addresses of the next panels of A and B. The
                   ir loop is queried as thread 0 of 1 because it is not
                   parallelized in this kernel. */ \
                a2 = a1; \
                if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
                { \
                    a2 = a_cast; \
                    b2 = b1; \
                    if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
                        b2 = b_cast; \
                } \
\
                /* Save addresses of next panels of A and B to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_next_a( a2, &aux ); \
                bli_auxinfo_set_next_b( b2, &aux ); \
\
                /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
                /* Handle interior and edge cases separately. */ \
                if ( m_cur == MR && n_cur == NR ) \
                { \
                    /* Invoke the gemm micro-kernel. */ \
                    gemm_ukr \
                    ( \
                      k_a1011, \
                      alpha_cast, \
                      a1, \
                      b1_i, \
                      beta_cast, \
                      c11, rs_c, cs_c, \
                      &aux, \
                      cntx \
                    ); \
                } \
                else \
                { \
                    /* Copy edge elements of C to the temporary buffer. */ \
                    PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
                                            c11, rs_c, cs_c, \
                                            ct, rs_ct, cs_ct ); \
\
                    /* Invoke the gemm micro-kernel. */ \
                    gemm_ukr \
                    ( \
                      k_a1011, \
                      alpha_cast, \
                      a1, \
                      b1_i, \
                      beta_cast, \
                      ct, rs_ct, cs_ct, \
                      &aux, \
                      cntx \
                    ); \
\
                    /* Copy the result to the edge of C. */ \
                    PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
                                            ct, rs_ct, cs_ct, \
                                            c11, rs_c, cs_c ); \
                } \
                /*}*/ \
\
                a1 += ps_a_cur; \
            } \
            else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
            { \
                /* NOTE: ir loop parallelism disabled for now. */ \
                /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \
\
                ctype* restrict a2; \
\
                /* Compute the addresses of the next panels of A and B. */ \
                a2 = a1; \
                if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
                { \
                    a2 = a_cast; \
                    b2 = b1; \
                    if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
                        b2 = b_cast; \
                } \
\
                /* Save addresses of next panels of A and B to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_next_a( a2, &aux ); \
                bli_auxinfo_set_next_b( b2, &aux ); \
\
                /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_is_a( istep_a, &aux ); \
\
                /* Handle interior and edge cases separately. */ \
                if ( m_cur == MR && n_cur == NR ) \
                { \
                    /* Invoke the gemm micro-kernel. */ \
                    gemm_ukr \
                    ( \
                      k, \
                      alpha_cast, \
                      a1, \
                      b1, \
                      one, \
                      c11, rs_c, cs_c, \
                      &aux, \
                      cntx \
                    ); \
                } \
                else \
                { \
                    /* Invoke the gemm micro-kernel. The product is written
                       to ct with a beta of zero and accumulated into C
                       afterwards. */ \
                    gemm_ukr \
                    ( \
                      k, \
                      alpha_cast, \
                      a1, \
                      b1, \
                      zero, \
                      ct, rs_ct, cs_ct, \
                      &aux, \
                      cntx \
                    ); \
\
                    /* Add the result to the edge of C. */ \
                    PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
                                           ct, rs_ct, cs_ct, \
                                           c11, rs_c, cs_c ); \
                } \
                /*}*/ \
\
                a1 += rstep_a; \
            } \
\
            c11 += rstep_c; \
        } \
    } \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2sl: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2sl: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2sl )

cython-blis-0.9.1/blis/_src/frame/3/trmm/other/bli_trmm_lu_ker_var2.c000066400000000000000000000374571427272030600254560ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

// Function pointer type shared by the datatype-specific macrokernels that the
// GENTFUNC template further down generates. The dispatcher below picks one of
// them out of the ftypes table and calls it with exactly this argument list.
#define FUNCPTR_T gemm_fp

typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffa,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Table of macrokernel function pointers, indexed by the execution datatype
// in the dispatcher below.
// NOTE(review): GENARRAY is defined elsewhere; presumably it fills the table
// with the instances created by INSERT_GENTFUNC_BASIC0 at the bottom of this
// file -- confirm against its definition.
static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2);


// bli_trmm_lu_ker_var2
//
// Object-level entry point. It unpacks everything the typed macrokernel needs
// from the three operand objects and forwards it to ftypes[dt_exec]:
//   - from c: the execution datatype, m (length), n (width), the buffer, the
//     row/column strides and the internal scalar used as beta;
//   - from a: the diagonal offset, pack schema, k (width), buffer, column
//     stride, panel dimension and panel stride;
//   - from b: the pack schema, buffer, row stride, panel dimension and panel
//     stride.
// The scalars attached to a and b are detached and multiplied together; the
// product is read back from scalar_b and passed as alpha.
// cntx, rntm and thread are forwarded unchanged; cntl is not referenced here.
void bli_trmm_lu_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
    // Datatype used to select the typed macrokernel.
    num_t dt_exec = bli_obj_exec_dt( c );

    doff_t diagoffa = bli_obj_diag_offset( a );

    pack_t schema_a = bli_obj_pack_schema( a );
    pack_t schema_b = bli_obj_pack_schema( b );

    // Problem dimensions: C is m x n and k is the width of A.
    dim_t m = bli_obj_length( c );
    dim_t n = bli_obj_width( c );
    dim_t k = bli_obj_width( a );

    void* buf_a = bli_obj_buffer_at_off( a );
    inc_t cs_a = bli_obj_col_stride( a );
    dim_t pd_a = bli_obj_panel_dim( a );
    inc_t ps_a = bli_obj_panel_stride( a );

    void* buf_b = bli_obj_buffer_at_off( b );
    inc_t rs_b = bli_obj_row_stride( b );
    dim_t pd_b = bli_obj_panel_dim( b );
    inc_t ps_b = bli_obj_panel_stride( b );

    void* buf_c = bli_obj_buffer_at_off( c );
    inc_t rs_c = bli_obj_row_stride( c );
    inc_t cs_c = bli_obj_col_stride( c );

    obj_t scalar_a;
    obj_t scalar_b;

    void* buf_alpha;
    void* buf_beta;

    FUNCPTR_T f;

    // Detach and multiply the scalars attached to A and B.
    bli_obj_scalar_detach( a, &scalar_a );
    bli_obj_scalar_detach( b, &scalar_b );
    bli_mulsc( &scalar_a, &scalar_b );

    // Grab the addresses of the internal scalar buffers for the scalar
    // merged above and the scalar attached to C.
    buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
    buf_beta = bli_obj_internal_scalar_buffer( c );

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_exec];

    // Invoke the function.
    f( diagoffa,
       schema_a,
       schema_b,
       m,
       n,
       k,
       buf_alpha,
       buf_a, cs_a, pd_a, ps_a,
       buf_b, rs_b, pd_b, ps_b,
       buf_beta,
       buf_c, rs_c, cs_c,
       cntx,
       rntm,
       thread );
}

/*
   GENTFUNC( ctype, ch, varname )

   Template for the datatype-specific macrokernel; it is instantiated by
   INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 ) at the bottom of this file.

   The generated function walks C in MR x NR tiles. The outer (jr) loop runs
   over all n_iter column panels and processes only those for which
   bli_trmm_my_iter( j, jr_thread ) holds; b1 and c1 advance on every
   iteration regardless. The inner (ir) loop runs over all m_iter row panels.
   For every tile the micro-panel of A is classified by its diagonal offset
   diagoffa_i:

     - intersects the diagonal: the panel begins off_a1112 columns in, so the
       micro-kernel is called with k_a1112 = k - off_a1112 and a matching
       offset into b1, using beta_cast; a1 then advances by ps_a_cur;
     - strictly above the diagonal: the micro-kernel is called with the full
       k and accumulates into C (beta of one); a1 then advances by rstep_a;
     - anything else: no micro-kernel call and a1 is not advanced.

   In both branches the micro-kernel call is guarded by
   bli_trmm_my_iter( i, ir_thread ), while the advance of a1 happens outside
   that guard.

   Tiles with m_cur != MR or n_cur != NR are routed through the stack buffer
   ct so that the micro-kernel always writes a full MR x NR tile.

   Parameters:
     diagoffa            diagonal offset of the block of A; a positive value
                         is handled by reducing k and advancing into B.
     schema_a, schema_b  pack schemas; both are stored in the auxinfo_t and
                         schema_a also selects off_scl and ss_a_num/ss_a_den.
     m, n, k             dimensions; the function returns at once if
                         bli_zero_dim3( m, n, k ) holds. m is shrunk to
                         -diagoffa + k when that is smaller.
     alpha, beta         scalars, cast to ctype*.
     a, cs_a, pd_a, ps_a buffer of A; cs_a is aliased to PACKMR, pd_a to MR
                         and ps_a is the step between micro-panels (rstep_a).
     b, rs_b, pd_b, ps_b buffer of B; rs_b is aliased to PACKNR, pd_b to NR
                         and ps_b is the step between micro-panels (cstep_b).
     c, rs_c, cs_c       buffer and strides of C.
     cntx                queried for the gemm micro-kernel and its storage
                         preference, and passed on to the micro-kernel.
     rntm                not referenced in this function.
     jr_thread           thrinfo_t node of the jr loop; its sub-node is used
                         for the ir loop.

   Other early exits: bli_abort() on the odd-blocksize combinations checked
   below, and a plain return when the block of A is strictly below the
   diagonal.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffa, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* jr_thread \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Alias some constants to simpler names. */ \
    const dim_t MR = pd_a; \
    const dim_t NR = pd_b; \
    const dim_t PACKMR = cs_a; \
    const dim_t PACKNR = rs_b; \
\
    /* Query the context for the micro-kernel address and cast it to its
       function pointer type. */ \
    PASTECH(ch,gemm_ukr_ft) \
               gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
    /* Temporary C buffer for edge cases. Its strides follow the storage
       preference reported for the gemm micro-kernel (col_pref), not the
       strides of C: column storage (rs_ct = 1, cs_ct = MR) when columns are
       preferred, row storage (rs_ct = NR, cs_ct = 1) otherwise. */ \
    ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
              / sizeof( ctype ) ] \
              __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
    const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
    const inc_t rs_ct = ( col_pref ? 1 : NR ); \
    const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
    ctype* restrict one = PASTEMAC(ch,1); \
    ctype* restrict zero = PASTEMAC(ch,0); \
    ctype* restrict a_cast = a; \
    ctype* restrict b_cast = b; \
    ctype* restrict c_cast = c; \
    ctype* restrict alpha_cast = alpha; \
    ctype* restrict beta_cast = beta; \
    ctype* restrict b1; \
    ctype* restrict c1; \
\
    doff_t diagoffa_i; \
    dim_t k_full; \
    dim_t m_iter, m_left; \
    dim_t n_iter, n_left; \
    dim_t m_cur; \
    dim_t n_cur; \
    dim_t k_a1112; \
    dim_t off_a1112; \
    dim_t i, j; \
    inc_t rstep_a; \
    inc_t cstep_b; \
    inc_t rstep_c, cstep_c; \
    inc_t istep_a; \
    inc_t istep_b; \
    inc_t off_scl; \
    inc_t ss_a_num; \
    inc_t ss_a_den; \
    inc_t ps_a_cur; \
    inc_t is_a_cur; \
    auxinfo_t aux; \
\
    /*
       Assumptions/assertions:
         rs_a == 1
         cs_a == PACKMR
         pd_a == MR
         ps_a == stride to next micro-panel of A
         rs_b == PACKNR
         cs_b == 1
         pd_b == NR
         ps_b == stride to next micro-panel of B
         rs_c == (no assumptions)
         cs_c == (no assumptions)
    */ \
\
    /* Safety trap: abort if PACKMR and NR are both odd, or if PACKNR and MR
       are both odd. Certain indexing within this macro-kernel does not work
       as intended for those combinations. */ \
    if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
         ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
    /* If any dimension is zero, return immediately. */ \
    if ( bli_zero_dim3( m, n, k ) ) return; \
\
    /* Safeguard: If the current block of A is entirely below the diagonal,
       it is implicitly zero. So we do nothing. */ \
    if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
    /* Compute k_full. For all trmm, k_full is simply k. This is
       needed because some parameter combinations of trmm reduce k
       to advance past zero regions in the triangular matrix, and
       when computing the imaginary stride of B (the non-triangular
       matrix), which is used by 4m1/3m1 implementations, we need
       this unreduced value of k. */ \
    k_full = k; \
\
    /* Compute the indexing scaling factor for 4m or 3m. This is
       needed because one of the packing register blocksizes (PACKMR
       or PACKNR) is used to index into the micro-panels of the non-
       triangular matrix when computing with a diagonal-intersecting
       micro-panel of the triangular matrix. In the case of 4m or 3m,
       real values are stored in both sub-panels, and so the indexing
       needs to occur in units of real values. The value computed
       here is divided into the complex pointer offset to cause the
       pointer to be advanced by the correct value. */ \
    if ( bli_is_4mi_packed( schema_a ) || \
         bli_is_3mi_packed( schema_a ) || \
         bli_is_rih_packed( schema_a ) ) off_scl = 2; \
    else off_scl = 1; \
\
    /* Compute the storage stride scaling. Usually this is just 1.
       However, in the case of interleaved 3m, we need to scale the
       offset by 3/2. And if we are packing real-only, imag-only, or
       summed-only, we need to scale the computed panel sizes by 1/2
       to compensate for the fact that the pointer arithmetic occurs
       in terms of complex elements rather than real elements. */ \
    if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
    else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \
    else { ss_a_num = 1; ss_a_den = 1; } \
\
    /* If there is a zero region to the left of where the diagonal of A
       intersects the top edge of the block, adjust the pointer to B and
       treat this case as if the diagonal offset were zero. Note that we
       don't need to adjust the pointer to A since packm would have simply
       skipped over the region that was not stored. */ \
    if ( diagoffa > 0 ) \
    { \
        i = diagoffa; \
        k = k - i; \
        diagoffa = 0; \
        b_cast = b_cast + ( i * PACKNR ) / off_scl; \
    } \
\
    /* If there is a zero region below where the diagonal of A intersects the
       right side of the block, shrink it to prevent "no-op" iterations from
       executing. */ \
    if ( -diagoffa + k < m ) \
    { \
        m = -diagoffa + k; \
    } \
\
    /* Clear the temporary C buffer in case it has any infs or NaNs. */ \
    PASTEMAC(ch,set0s_mxn)( MR, NR, \
                            ct, rs_ct, cs_ct ); \
\
    /* Compute number of primary and leftover components of the m and n
       dimensions. A non-zero remainder adds one extra (edge) iteration. */ \
    n_iter = n / NR; \
    n_left = n % NR; \
\
    m_iter = m / MR; \
    m_left = m % MR; \
\
    if ( n_left ) ++n_iter; \
    if ( m_left ) ++m_iter; \
\
    /* Determine some increments used to step through A, B, and C. */ \
    rstep_a = ps_a; \
\
    cstep_b = ps_b; \
\
    rstep_c = rs_c * MR; \
    cstep_c = cs_c * NR; \
\
    istep_a = PACKMR * k; \
    istep_b = PACKNR * k_full; \
\
    if ( bli_is_odd( istep_a ) ) istep_a += 1; /* round up to an even stride */ \
    if ( bli_is_odd( istep_b ) ) istep_b += 1; /* round up to an even stride */ \
\
    /* Save the pack schemas of A and B to the auxinfo_t object. */ \
    bli_auxinfo_set_schema_a( schema_a, &aux ); \
    bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
    /* Save the imaginary stride of B to the auxinfo_t object. */ \
    bli_auxinfo_set_is_b( istep_b, &aux ); \
\
    b1 = b_cast; \
    c1 = c_cast; \
\
    thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \
    dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \
    dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \
\
    /* Loop over the n dimension (NR columns at a time). */ \
    for ( j = 0; j < n_iter; ++j ) \
    { \
        if ( bli_trmm_my_iter( j, jr_thread ) ) { \
\
        ctype* restrict a1; \
        ctype* restrict c11; \
        ctype* restrict b2; \
\
        a1 = a_cast; \
        c11 = c1; \
\
        n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
        /* Initialize our next panel of B to be the current panel of B. */ \
        b2 = b1; \
\
        /* Loop over the m dimension (MR rows at a time). */ \
        for ( i = 0; i < m_iter; ++i ) \
        { \
            diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
            m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
            /* If the current panel of A intersects the diagonal, scale C
               by beta. If it is strictly above the diagonal, scale by one.
               This allows the current macro-kernel to work for both trmm
               and trmm3. A panel matching neither test triggers no
               micro-kernel call and does not advance a1. */ \
            if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
            { \
                ctype* restrict b1_i; \
                ctype* restrict a2; \
\
                /* Determine the offset to and length of the panel that was
                   packed so we can index into the corresponding location in
                   b1. */ \
                off_a1112 = diagoffa_i; \
                k_a1112 = k - off_a1112; \
\
                /* Compute the panel stride for the current diagonal-
                   intersecting micro-panel. */ \
                is_a_cur = k_a1112 * PACKMR; \
                is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
                ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
                if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
                b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \
\
                /* Compute the addresses of the next panels of A and B.
                   NOTE(review): the ir loop is filtered through ir_thread
                   above, yet this last-iteration test is made with thread
                   id 0 of 1 -- confirm this is intended. */ \
                a2 = a1; \
                if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
                { \
                    a2 = a_cast; \
                    b2 = b1; \
                    if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
                        b2 = b_cast; \
                } \
\
                /* Save addresses of next panels of A and B to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_next_a( a2, &aux ); \
                bli_auxinfo_set_next_b( b2, &aux ); \
\
                /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
                /* Handle interior and edge cases separately. */ \
                if ( m_cur == MR && n_cur == NR ) \
                { \
                    /* Invoke the gemm micro-kernel. */ \
                    gemm_ukr \
                    ( \
                      k_a1112, \
                      alpha_cast, \
                      a1, \
                      b1_i, \
                      beta_cast, \
                      c11, rs_c, cs_c, \
                      &aux, \
                      cntx \
                    ); \
                } \
                else \
                { \
                    /* Copy edge elements of C to the temporary buffer. */ \
                    PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
                                            c11, rs_c, cs_c, \
                                            ct, rs_ct, cs_ct ); \
\
                    /* Invoke the gemm micro-kernel. */ \
                    gemm_ukr \
                    ( \
                      k_a1112, \
                      alpha_cast, \
                      a1, \
                      b1_i, \
                      beta_cast, \
                      ct, rs_ct, cs_ct, \
                      &aux, \
                      cntx \
                    ); \
\
                    /* Copy the result to the edge of C. */ \
                    PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
                                            ct, rs_ct, cs_ct, \
                                            c11, rs_c, cs_c ); \
                } \
                } \
\
                a1 += ps_a_cur; \
            } \
            else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
            { \
                if ( bli_trmm_my_iter( i, ir_thread ) ) { \
\
                ctype* restrict a2; \
\
                /* Compute the addresses of the next panels of A and B. */ \
                a2 = a1; \
                if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
                { \
                    a2 = a_cast; \
                    b2 = b1; \
                    if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \
                        b2 = b_cast; \
                } \
\
                /* Save addresses of next panels of A and B to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_next_a( a2, &aux ); \
                bli_auxinfo_set_next_b( b2, &aux ); \
\
                /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_is_a( istep_a, &aux ); \
\
                /* Handle interior and edge cases separately. */ \
                if ( m_cur == MR && n_cur == NR ) \
                { \
                    /* Invoke the gemm micro-kernel. */ \
                    gemm_ukr \
                    ( \
                      k, \
                      alpha_cast, \
                      a1, \
                      b1, \
                      one, \
                      c11, rs_c, cs_c, \
                      &aux, \
                      cntx \
                    ); \
                } \
                else \
                { \
                    /* Invoke the gemm micro-kernel. The product is written
                       to ct with a beta of zero and accumulated into C
                       afterwards. */ \
                    gemm_ukr \
                    ( \
                      k, \
                      alpha_cast, \
                      a1, \
                      b1, \
                      zero, \
                      ct, rs_ct, cs_ct, \
                      &aux, \
                      cntx \
                    ); \
\
                    /* Add the result to the edge of C. */ \
                    PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
                                           ct, rs_ct, cs_ct, \
                                           c11, rs_c, cs_c ); \
                } \
                } \
\
                a1 += rstep_a; \
            } \
\
            c11 += rstep_c; \
        } \
        } \
\
        b1 += cstep_b; \
        c1 += cstep_c; \
    } \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 )

cython-blis-0.9.1/blis/_src/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c000066400000000000000000000412351427272030600260070ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffa, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2rr); // // -- Macrokernel functions for round-robin partitioning ----------------------- // /* Object-level entry point for the round-robin variant: reads the diagonal offset of A, the pack schemas, the dimensions (m and n from C, k from the width of A), and the buffers and strides of A, B and C; folds the scalars attached to A and B into a single alpha (beta is the scalar attached to C); then calls the datatype-specific macrokernel selected by ftypes[dt_exec]. The cntl argument is not referenced in this function. */ void bli_trmm_lu_ker_var2rr ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffa = bli_obj_diag_offset( a ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. 
f( diagoffa, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, rntm, thread ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ /* Typed macrokernel template; INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2rr ) at the end of this file expands it (presumably once per supported datatype). A is the packed triangular operand: its micro-panels are classified with diagoffa, and a block strictly below the diagonal is skipped as implicitly zero. B is the packed non-triangular operand, and C is updated in MR x NR tiles by the gemm micro-kernel. Iterations of the loop over n are divided among threads by bli_thread_range_jrir_rr(); the loop over m is not parallelized here. The rntm argument is not referenced in this macro. */ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffa, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are chosen from the storage preference of the gemm micro-kernel (col_pref), not from the strides of C: if the micro-kernel prefers columns, ct is column-stored (rs_ct = 1, cs_ct = MR); otherwise it is row-stored (rs_ct = NR, cs_ct = 1). */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffa_i; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_a1112; \ dim_t off_a1112; \ dim_t i, j; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t off_scl; \ inc_t ss_a_num; \ inc_t ss_a_den; \ inc_t ps_a_cur; \ inc_t is_a_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. The test below aborts when PACKMR and NR are both odd, or when PACKNR and MR are both odd. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current block of A is entirely below the diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ \ /* Compute k_full. For all trmm, k_full is simply k. This is needed because some parameter combinations of trmm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of B (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = k; \ \ /* Compute indexing scaling factor for 4m or 3m. 
This is needed because one of the packing register blocksizes (PACKMR or PACKNR) is used to index into the micro-panels of the non- triangular matrix when computing with a diagonal-intersecting micro-panel of the triangular matrix. In the case of 4m or 3m, real values are stored in both sub-panels, and so the indexing needs to occur in units of real values. The value computed here is divided into the complex pointer offset to cause the pointer to be advanced by the correct value. */ \ if ( bli_is_4mi_packed( schema_a ) || \ bli_is_3mi_packed( schema_a ) || \ bli_is_rih_packed( schema_a ) ) off_scl = 2; \ else off_scl = 1; \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the offset by 3/2. And if we are packing real-only, imag-only, or summed-only, we need to scale the computed panel sizes by 1/2 to compensate for the fact that the pointer arithmetic occurs in terms of complex elements rather than real elements. */ \ if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ else { ss_a_num = 1; ss_a_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of A intersects the top edge of the block, adjust the pointer to B and treat this case as if the diagonal offset were zero. Note that we don't need to adjust the pointer to A since packm would have simply skipped over the region that was not stored. */ \ if ( diagoffa > 0 ) \ { \ i = diagoffa; \ k = k - i; \ diagoffa = 0; \ b_cast = b_cast + ( i * PACKNR ) / off_scl; \ } \ \ /* If there is a zero region below where the diagonal of A intersects the right side of the block, shrink it to prevent "no-op" iterations from executing. */ \ if ( -diagoffa + k < m ) \ { \ m = -diagoffa + k; \ } \ \ /* Clear the temporary C buffer in case it has any infs or NaNs. 
*/ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k; \ istep_b = PACKNR * k_full; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) loop around the microkernel. Here we query the thrinfo_t node for the 1st (ir) loop around the microkernel. */ \ /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ \ /* Query the number of threads and thread ids for each loop. */ \ dim_t jr_nt = bli_thread_n_way( thread ); \ dim_t jr_tid = bli_thread_work_id( thread ); \ /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ \ dim_t jr_start, jr_end; \ /*dim_t ir_start, ir_end;*/ \ dim_t jr_inc; \ \ /* Use round-robin assignment of micropanels to threads in the 2nd loop for the initial rectangular region of C (if it exists). */ \ bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ \ /* Loop over the n dimension (NR columns at a time). 
*/ \ for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ b1 = b_cast + j * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ a1 = a_cast; \ c11 = c1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ diagoffa_i = diagoffa + ( doff_t )i*MR; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* If the current panel of A intersects the diagonal, scale C by beta. If it is strictly above the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ { \ ctype* restrict b1_i; \ ctype* restrict a2; \ \ /* Determine the offset to and length of the panel that was packed so we can index into the corresponding location in b1. */ \ off_a1112 = diagoffa_i; \ k_a1112 = k - off_a1112; \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ is_a_cur = k_a1112 * PACKMR; \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ /* NOTE: ir loop parallelism disabled for now. */ \ /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( is_a_cur, &aux ); \ \ /* Handle interior and edge cases separately. 
*/ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_a1112, \ alpha_cast, \ a1, \ b1_i, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Copy edge elements of C to the temporary buffer. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ c11, rs_c, cs_c, \ ct, rs_ct, cs_ct ); \ \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_a1112, \ alpha_cast, \ a1, \ b1_i, \ beta_cast, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Copy the result to the edge of C. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ /*}*/ \ /* Advance A by the stride computed for this diagonal-intersecting panel (ps_a_cur, derived from k_a1112) rather than by the full panel stride rstep_a. */ \ a1 += ps_a_cur; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ /* NOTE: ir loop parallelism disabled for now. */ \ /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ one, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Add the result to the edge of C. 
*/ \ PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ /*}*/ \ \ a1 += rstep_a; \ } \ \ c11 += rstep_c; \ } \ } \ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2rr: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2rr: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ } INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2rr ) cython-blis-0.9.1/blis/_src/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c000066400000000000000000000412261427272030600260020ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffa, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2sl); // // -- Macrokernel functions for slab partitioning ------------------------------ // /* Object-level entry point for the slab variant: reads the diagonal offset of A, the pack schemas, the dimensions (m and n from C, k from the width of A), and the buffers and strides of A, B and C; folds the scalars attached to A and B into a single alpha (beta is the scalar attached to C); then calls the datatype-specific macrokernel selected by ftypes[dt_exec]. The cntl argument is not referenced in this function. */ void bli_trmm_lu_ker_var2sl ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffa = bli_obj_diag_offset( a ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. 
bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffa, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, rntm, thread ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ /* Typed macrokernel template; INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2sl ) at the end of this file expands it (presumably once per supported datatype). A is the packed triangular operand: its micro-panels are classified with diagoffa, and a block strictly below the diagonal is skipped as implicitly zero. B is the packed non-triangular operand, and C is updated in MR x NR tiles by the gemm micro-kernel. Iterations of the loop over n are divided among threads by bli_thread_range_jrir_sl(); the loop over m is not parallelized here. The rntm argument is not referenced in this macro. */ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffa, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are chosen from the storage preference of the gemm micro-kernel (col_pref), not from the strides of C: if the micro-kernel prefers columns, ct is column-stored (rs_ct = 1, cs_ct = MR); otherwise it is row-stored (rs_ct = NR, cs_ct = 1). */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffa_i; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_a1112; \ dim_t off_a1112; \ dim_t i, j; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t off_scl; \ inc_t ss_a_num; \ inc_t ss_a_den; \ inc_t ps_a_cur; \ inc_t is_a_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. The test below aborts when PACKMR and NR are both odd, or when PACKNR and MR are both odd. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current block of A is entirely below the diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ \ /* Compute k_full. For all trmm, k_full is simply k. This is needed because some parameter combinations of trmm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of B (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = k; \ \ /* Compute indexing scaling factor for 4m or 3m. 
This is needed because one of the packing register blocksizes (PACKMR or PACKNR) is used to index into the micro-panels of the non- triangular matrix when computing with a diagonal-intersecting micro-panel of the triangular matrix. In the case of 4m or 3m, real values are stored in both sub-panels, and so the indexing needs to occur in units of real values. The value computed here is divided into the complex pointer offset to cause the pointer to be advanced by the correct value. */ \ if ( bli_is_4mi_packed( schema_a ) || \ bli_is_3mi_packed( schema_a ) || \ bli_is_rih_packed( schema_a ) ) off_scl = 2; \ else off_scl = 1; \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the offset by 3/2. And if we are packing real-only, imag-only, or summed-only, we need to scale the computed panel sizes by 1/2 to compensate for the fact that the pointer arithmetic occurs in terms of complex elements rather than real elements. */ \ if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ else if ( bli_is_rih_packed( schema_a ) ) { ss_a_num = 1; ss_a_den = 2; } \ else { ss_a_num = 1; ss_a_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of A intersects the top edge of the block, adjust the pointer to B and treat this case as if the diagonal offset were zero. Note that we don't need to adjust the pointer to A since packm would have simply skipped over the region that was not stored. */ \ if ( diagoffa > 0 ) \ { \ i = diagoffa; \ k = k - i; \ diagoffa = 0; \ b_cast = b_cast + ( i * PACKNR ) / off_scl; \ } \ \ /* If there is a zero region below where the diagonal of A intersects the right side of the block, shrink it to prevent "no-op" iterations from executing. */ \ if ( -diagoffa + k < m ) \ { \ m = -diagoffa + k; \ } \ \ /* Clear the temporary C buffer in case it has any infs or NaNs. 
*/ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k; \ istep_b = PACKNR * k_full; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) loop around the microkernel. Here we query the thrinfo_t node for the 1st (ir) loop around the microkernel. */ \ /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ \ /* Query the number of threads and thread ids for each loop. */ \ dim_t jr_nt = bli_thread_n_way( thread ); \ dim_t jr_tid = bli_thread_work_id( thread ); \ /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ \ dim_t jr_start, jr_end; \ /*dim_t ir_start, ir_end;*/ \ dim_t jr_inc; \ \ /* Use slab assignment of micropanels to threads in the 2nd loop for the initial rectangular region of C (if it exists). */ \ bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ \ /* Loop over the n dimension (NR columns at a time). 
*/ \ for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ b1 = b_cast + j * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ a1 = a_cast; \ c11 = c1; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ diagoffa_i = diagoffa + ( doff_t )i*MR; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* If the current panel of A intersects the diagonal, scale C by beta. If it is strictly above the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ { \ ctype* restrict b1_i; \ ctype* restrict a2; \ \ /* Determine the offset to and length of the panel that was packed so we can index into the corresponding location in b1. */ \ off_a1112 = diagoffa_i; \ k_a1112 = k - off_a1112; \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ is_a_cur = k_a1112 * PACKMR; \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ /* NOTE: ir loop parallelism disabled for now. */ \ /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ b1_i = b1 + ( off_a1112 * PACKNR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( is_a_cur, &aux ); \ \ /* Handle interior and edge cases separately. 
*/ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_a1112, \ alpha_cast, \ a1, \ b1_i, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Copy edge elements of C to the temporary buffer. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ c11, rs_c, cs_c, \ ct, rs_ct, cs_ct ); \ \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_a1112, \ alpha_cast, \ a1, \ b1_i, \ beta_cast, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Copy the result to the edge of C. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ /*}*/ \ /* Advance A by the stride computed for this diagonal-intersecting panel (ps_a_cur, derived from k_a1112) rather than by the full panel stride rstep_a. */ \ a1 += ps_a_cur; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ /* NOTE: ir loop parallelism disabled for now. */ \ /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ one, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Add the result to the edge of C. 
*/ \ PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ /*}*/ \ \ a1 += rstep_a; \ } \ \ c11 += rstep_c; \ } \ } \ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2sl: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2sl: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ } INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2sl ) cython-blis-0.9.1/blis/_src/frame/3/trmm/other/bli_trmm_rl_ker_var2.c000066400000000000000000000401511427272030600254340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffb, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); /* Object-level entry point: reads the diagonal offset of B, the pack schemas, the dimensions (m and n from C, k from the width of A), and the buffers and strides of A, B and C; folds the scalars attached to A and B into a single alpha (beta is the scalar attached to C); then calls the datatype-specific macrokernel selected by ftypes[dt_exec]. The cntl argument is not referenced in this function. */ void bli_trmm_rl_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffb = bli_obj_diag_offset( b ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. 
bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, rntm, thread ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffb, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* jr_thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffb_j; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_b1121; \ dim_t off_b1121; \ dim_t i, j; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t off_scl; \ inc_t ss_b_num; \ inc_t ss_b_den; \ inc_t ps_b_cur; \ inc_t is_b_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current panel of B is entirely above the diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ \ /* Compute k_full. For all trmm, k_full is simply k. This is needed because some parameter combinations of trmm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of A (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = k; \ \ /* Compute indexing scaling factor for for 4m or 3m. 
This is needed because one of the packing register blocksizes (PACKMR or PACKNR) is used to index into the micro-panels of the non- triangular matrix when computing with a diagonal-intersecting micro-panel of the triangular matrix. In the case of 4m or 3m, real values are stored in both sub-panels, and so the indexing needs to occur in units of real values. The value computed here is divided into the complex pointer offset to cause the pointer to be advanced by the correct value. */ \ if ( bli_is_4mi_packed( schema_b ) || \ bli_is_3mi_packed( schema_b ) || \ bli_is_rih_packed( schema_b ) ) off_scl = 2; \ else off_scl = 1; \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the offset by 3/2. And if we are packing real-only, imag-only, or summed-only, we need to scale the computed panel sizes by 1/2 to compensate for the fact that the pointer arithmetic occurs in terms of complex elements rather than real elements. */ \ if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ else { ss_b_num = 1; ss_b_den = 1; } \ \ /* If there is a zero region above where the diagonal of B intersects the left edge of the panel, adjust the pointer to A and treat this case as if the diagonal offset were zero. Note that we don't need to adjust the pointer to B since packm would have simply skipped over the region that was not stored. */ \ if ( diagoffb < 0 ) \ { \ j = -diagoffb; \ k = k - j; \ diagoffb = 0; \ a_cast = a_cast + ( j * PACKMR ) / off_scl; \ } \ \ /* If there is a zero region to the right of where the diagonal of B intersects the bottom of the panel, shrink it to prevent "no-op" iterations from executing. */ \ if ( diagoffb + k < n ) \ { \ n = diagoffb + k; \ } \ \ /* Clear the temporary C buffer in case it has any infs or NaNs. 
*/ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k_full; \ istep_b = PACKNR * k; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ \ b1 = b_cast; \ c1 = c_cast; \ \ thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ diagoffb_j = diagoffb - ( doff_t )j*NR; \ \ /* Determine the offset to the beginning of the panel that was packed so we can index into the corresponding location in A. Then compute the length of that panel. */ \ off_b1121 = bli_max( -diagoffb_j, 0 ); \ k_b1121 = k - off_b1121; \ \ a1 = a_cast; \ c11 = c1; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* If the current panel of B intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. 
*/ \ if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ is_b_cur = k_b1121 * PACKNR; \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ if ( bli_trmm_my_iter( j, jr_thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( is_b_cur, &aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ if ( bli_trmm_my_iter( i, ir_thread ) ) { \ \ ctype* restrict a1_i; \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_b1121, \ alpha_cast, \ a1_i, \ b1, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Copy edge elements of C to the temporary buffer. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ c11, rs_c, cs_c, \ ct, rs_ct, cs_ct ); \ \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_b1121, \ alpha_cast, \ a1_i, \ b1, \ beta_cast, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Copy the result to the edge of C. 
*/ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ } \ \ a1 += rstep_a; \ c11 += rstep_c; \ } \ } \ \ b1 += ps_b_cur; \ } \ else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \ { \ if ( bli_trmm_my_iter( j, jr_thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ if ( bli_trmm_my_iter( i, ir_thread ) ) { \ \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ one, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Add the result to the edge of C. 
*/ \ PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ } \ \ a1 += rstep_a; \ c11 += rstep_c; \ } \ } \ \ b1 += cstep_b; \ } \ \ c1 += cstep_c; \ } \ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ } INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c000066400000000000000000000447521427272030600260130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

#define FUNCPTR_T gemm_fp

// Signature shared by the datatype-specific macro-kernels generated by the
// GENTFUNC macro further down in this file. The arguments mirror, one for
// one, the values that bli_trmm_rl_ker_var2rr() extracts from its operands.
typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffb,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   beta,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Table of macro-kernel function pointers; it is indexed by the execution
// datatype in bli_trmm_rl_ker_var2rr() below.
// NOTE(review): GENARRAY is defined elsewhere; presumably it fills in one
// entry per datatype for the given function name -- confirm.
static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2rr);

//
// -- Macrokernel functions for round-robin partitioning -----------------------
//

// Object-based front end of the trmm macro-kernel for a triangular matrix B
// on the right-hand side ("rl" variant, round-robin thread partitioning).
//
// a, b, c  Operand objects. Dimensions, strides, panel dimensions, panel
//          strides, pack schemas and buffer addresses are queried from them;
//          A and B are accessed through their packed-panel properties.
// cntx     Context, forwarded unchanged to the typed macro-kernel.
// rntm     Runtime object, forwarded unchanged to the typed macro-kernel.
// cntl     Not referenced by this function.
// thread   Thread info node, forwarded unchanged to the typed macro-kernel.
//
// The scalars attached to A and B are multiplied together and passed as
// alpha; the scalar attached to C is passed as beta. The typed macro-kernel
// is chosen by the execution datatype of C.
void bli_trmm_rl_ker_var2rr
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	// Datatype used to select the typed macro-kernel.
	num_t     dt_exec   = bli_obj_exec_dt( c );

	// Diagonal offset of the triangular operand B.
	doff_t    diagoffb  = bli_obj_diag_offset( b );

	pack_t    schema_a  = bli_obj_pack_schema( a );
	pack_t    schema_b  = bli_obj_pack_schema( b );

	// Problem dimensions: m and n come from C, k from the width of A.
	dim_t     m         = bli_obj_length( c );
	dim_t     n         = bli_obj_width( c );
	dim_t     k         = bli_obj_width( a );

	// Packed A: buffer, column stride, panel dimension and panel stride.
	void*     buf_a     = bli_obj_buffer_at_off( a );
	inc_t     cs_a      = bli_obj_col_stride( a );
	dim_t     pd_a      = bli_obj_panel_dim( a );
	inc_t     ps_a      = bli_obj_panel_stride( a );

	// Packed B: buffer, row stride, panel dimension and panel stride.
	void*     buf_b     = bli_obj_buffer_at_off( b );
	inc_t     rs_b      = bli_obj_row_stride( b );
	dim_t     pd_b      = bli_obj_panel_dim( b );
	inc_t     ps_b      = bli_obj_panel_stride( b );

	// Output matrix C with its row and column strides.
	void*     buf_c     = bli_obj_buffer_at_off( c );
	inc_t     rs_c      = bli_obj_row_stride( c );
	inc_t     cs_c      = bli_obj_col_stride( c );

	obj_t     scalar_a;
	obj_t     scalar_b;

	void*     buf_alpha;
	void*     buf_beta;

	FUNCPTR_T f;

	// Detach and multiply the scalars attached to A and B.
	bli_obj_scalar_detach( a, &scalar_a );
	bli_obj_scalar_detach( b, &scalar_b );
	bli_mulsc( &scalar_a, &scalar_b );

	// Grab the addresses of the internal scalar buffers for the scalar
	// merged above and the scalar attached to C.
	buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b );
	buf_beta  = bli_obj_internal_scalar_buffer( c );

	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_exec];

	// Invoke the function.
	f( diagoffb,
	   schema_a,
	   schema_b,
	   m,
	   n,
	   k,
	   buf_alpha,
	   buf_a, cs_a, pd_a, ps_a,
	   buf_b, rs_b, pd_b, ps_b,
	   buf_beta,
	   buf_c, rs_c, cs_c,
	   cntx,
	   rntm,
	   thread );
}

// Template for the typed macro-kernels. GENTFUNC( ctype, ch, varname )
// defines a function named PASTEMAC(ch,varname) that operates on elements of
// type ctype.
//
// The generated function walks the micro-panels of B in two passes:
//   1. the panels counted by n_iter_rct (the "rectangular" region), which
//      are distributed over threads with bli_thread_range_jrir_rr() and
//      whose micro-kernel calls pass `one` as the scalar on C (or `zero`
//      into a temporary tile that is then added to C at the edges);
//   2. the remaining panels (the "triangular" region), where every thread
//      executes every loop iteration but only performs the micro-kernel call
//      for iterations accepted by bli_trmm_my_iter(), and where beta is
//      passed as the scalar on C.
//
// NOTE(review): PASTEMAC, PASTECH and INSERT_GENTFUNC_BASIC0 are defined
// elsewhere; presumably the latter instantiates this template once per basic
// datatype -- confirm.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffb, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread  \
     ) \
{ \
	const num_t     dt         = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t     MR         = pd_a; \
	const dim_t     NR         = pd_b; \
	const dim_t     PACKMR     = cs_a; \
	const dim_t     PACKNR     = rs_b; \
\
	/* Query the context for the micro-kernel address and cast it to its
	   function pointer type. */ \
	PASTECH(ch,gemm_ukr_ft) \
	                gemm_ukr   = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Its strides follow the storage
	   preference reported for the gemm micro-kernel (col_pref), not the
	   strides of C: when columns are preferred the buffer is laid out with
	   rs_ct = 1 and cs_ct = MR, otherwise with rs_ct = NR and cs_ct = 1. */ \
	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
\
	ctype* restrict one        = PASTEMAC(ch,1); \
	ctype* restrict zero       = PASTEMAC(ch,0); \
	ctype* restrict a_cast     = a; \
	ctype* restrict b_cast     = b; \
	ctype* restrict c_cast     = c; \
	ctype* restrict alpha_cast = alpha; \
	ctype* restrict beta_cast  = beta; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t          diagoffb_j; \
	dim_t           k_full; \
	dim_t           m_iter, m_left; \
	dim_t           n_iter, n_left; \
	dim_t           m_cur; \
	dim_t           n_cur; \
	dim_t           k_b1121; \
	dim_t           off_b1121; \
	dim_t           i, j; \
	inc_t           rstep_a; \
	inc_t           cstep_b; \
	inc_t           rstep_c, cstep_c; \
	inc_t           istep_a; \
	inc_t           istep_b; \
	inc_t           off_scl; \
	inc_t           ss_b_num; \
	inc_t           ss_b_den; \
	inc_t           ps_b_cur; \
	inc_t           is_b_cur; \
	auxinfo_t       aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current panel of B is entirely above the diagonal,
	   it is implicitly zero. So we do nothing. */ \
	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
	/* Compute k_full. For all trmm, k_full is simply k. This is
	   needed because some parameter combinations of trmm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of A (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = k; \
\
	/* Compute the indexing scaling factor for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_b ) || \
	     bli_is_3mi_packed( schema_b ) || \
	     bli_is_rih_packed( schema_b ) ) off_scl = 2; \
	else                                 off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. And if we are packing real-only, imag-only, or
	   summed-only, we need to scale the computed panel sizes by 1/2
	   to compensate for the fact that the pointer arithmetic occurs
	   in terms of complex elements rather than real elements. */ \
	if      ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
	else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \
	else                                      { ss_b_num = 1; ss_b_den = 1; } \
\
	/* If there is a zero region above where the diagonal of B intersects
	   the left edge of the panel, adjust the pointer to A and treat this
	   case as if the diagonal offset were zero. Note that we don't need to
	   adjust the pointer to B since packm would have simply skipped over
	   the region that was not stored. */ \
	if ( diagoffb < 0 ) \
	{ \
		j        = -diagoffb; \
		k        = k - j; \
		diagoffb = 0; \
		a_cast   = a_cast + ( j * PACKMR ) / off_scl; \
	} \
\
	/* If there is a zero region to the right of where the diagonal
	   of B intersects the bottom of the panel, shrink it to prevent
	   "no-op" iterations from executing. */ \
	if ( diagoffb + k < n ) \
	{ \
		n = diagoffb + k; \
	} \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. A partial trailing panel counts as one more iteration. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k_full; \
	istep_b = PACKNR * k; \
\
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of A to the auxinfo_t object. */ \
	bli_auxinfo_set_is_a( istep_a, &aux ); \
\
	thrinfo_t* caucus    = bli_thrinfo_sub_node( thread ); \
\
	dim_t      jr_nt     = bli_thread_n_way( thread ); \
	dim_t      jr_tid    = bli_thread_work_id( thread ); \
	dim_t      ir_nt     = bli_thread_n_way( caucus ); \
	dim_t      ir_tid    = bli_thread_work_id( caucus ); \
\
	dim_t      jr_start, jr_end; \
	dim_t      ir_start, ir_end; \
	dim_t      jr_inc,   ir_inc; \
\
	/* Note that we partition the 2nd loop into two regions: the rectangular
	   part of B, and the triangular portion. */ \
	dim_t      n_iter_rct; \
	dim_t      n_iter_tri; \
\
	if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \
	{ \
		/* If the entire panel of B does not intersect the diagonal, there is
		   no triangular region, and therefore we can skip the second set of
		   loops.
		   NOTE(review): the test above passes m as the row dimension,
		   whereas the earlier strictly-above-diagonal test on B passes k.
		   Confirm whether k was intended here. */ \
		n_iter_rct = n_iter; \
		n_iter_tri = 0; \
	} \
	else \
	{ \
		/* If the panel of B does intersect the diagonal, compute the number of
		   iterations in the rectangular region by dividing NR into the diagonal
		   offset. (There should never be any remainder in this division.) The
		   number of iterations in the triangular (or trapezoidal) region is
		   computed as the remaining number of iterations in the n dimension. */ \
		n_iter_rct = diagoffb / NR; \
		n_iter_tri = n_iter - n_iter_rct; \
	} \
\
	/* Use round-robin assignment of micropanels to threads in the 2nd and
	   1st loops for the initial rectangular region of B (if it exists). */ \
	bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
	bli_thread_range_jrir_rr( caucus, m_iter,     1, FALSE, &ir_start, &ir_end, &ir_inc ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < jr_end; j += jr_inc ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		{ \
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( istep_b, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = ir_start; i < ir_end; i += ir_inc ) \
			{ \
				ctype* restrict a2; \
\
				a1  = a_cast + i * rstep_a; \
				c11 = c1     + i * rstep_c; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \
				if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \
				{ \
					a2 = a_cast; \
					b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \
					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  one, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx  \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  alpha_cast, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx  \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \
					                       ct,  rs_ct, cs_ct, \
					                       c11, rs_c,  cs_c ); \
				} \
			} \
		} \
	} \
\
	/* If there is no triangular region, then we're done. */ \
	if ( n_iter_tri == 0 ) return; \
\
	/* Use round-robin assignment of micropanels to threads in the 2nd
	   loop for the remaining triangular region of B (if it exists).
	   NOTE: We don't need to call bli_thread_range_jrir*() here since we
	   employ a hack that calls for each thread to execute every iteration
	   of the jr and ir loops but skip all but the pointer increment for
	   iterations that are not assigned to it. */ \
\
	/* Advance the starting b1 and c1 pointers to the positions corresponding
	   to the start of the triangular region of B. */ \
	jr_start = n_iter_rct; \
	b1 = b_cast + jr_start * cstep_b; \
	c1 = c_cast + jr_start * cstep_c; \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < n_iter; ++j ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		diagoffb_j = diagoffb - ( doff_t )j*NR; \
\
		/* Determine the offset to the beginning of the panel that
		   was packed so we can index into the corresponding location
		   in A. Then compute the length of that panel. */ \
		off_b1121 = bli_max( -diagoffb_j, 0 ); \
		k_b1121   = k - off_b1121; \
\
		a1  = a_cast; \
		c11 = c1; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* Every panel visited by this loop is handled as a diagonal-
		   intersecting panel: the micro-kernel is given beta as the scalar
		   on C and only the k_b1121 stored rows of the panel. (The panels
		   of the rectangular region were already processed by the first
		   loop, which passes one rather than beta. This allows the current
		   macro-kernel to work for both trmm and trmm3.) */ \
		{ \
			/* Compute the panel stride for the current diagonal-
			   intersecting micro-panel. */ \
			is_b_cur  = k_b1121 * PACKNR; \
			is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
			ps_b_cur  = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
			if ( bli_trmm_my_iter( j, thread ) ) { \
\
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object. */ \
			bli_auxinfo_set_is_b( is_b_cur, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = 0; i < m_iter; ++i ) \
			{ \
				if ( bli_trmm_my_iter( i, caucus ) ) { \
\
				ctype* restrict a1_i; \
				ctype* restrict a2; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b1121, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx  \
					); \
				} \
				else \
				{ \
					/* Copy edge elements of C to the temporary buffer. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        c11, rs_c,  cs_c, \
					                        ct,  rs_ct, cs_ct ); \
\
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k_b1121, \
					  alpha_cast, \
					  a1_i, \
					  b1, \
					  beta_cast, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx  \
					); \
\
					/* Copy the result to the edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct,  rs_ct, cs_ct, \
					                        c11, rs_c,  cs_c ); \
				} \
				} \
\
				a1  += rstep_a; \
				c11 += rstep_c; \
			} \
			} \
\
			b1 += ps_b_cur; \
		} \
\
		c1 += cstep_c; \
	} \
\
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2rr: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \
/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2rr: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \
}

INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2rr )

cython-blis-0.9.1/blis/_src/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c000066400000000000000000000447431427272030600260060ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffb, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2sl); // // -- Macrokernel functions for slab partitioning ------------------------------ // void bli_trmm_rl_ker_var2sl ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffb = bli_obj_diag_offset( b ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. 
bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, rntm, thread ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffb, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffb_j; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_b1121; \ dim_t off_b1121; \ dim_t i, j; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t off_scl; \ inc_t ss_b_num; \ inc_t ss_b_den; \ inc_t ps_b_cur; \ inc_t is_b_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current panel of B is entirely above the diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ \ /* Compute k_full. For all trmm, k_full is simply k. This is needed because some parameter combinations of trmm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of A (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = k; \ \ /* Compute indexing scaling factor for for 4m or 3m. 
This is needed because one of the packing register blocksizes (PACKMR or PACKNR) is used to index into the micro-panels of the non- triangular matrix when computing with a diagonal-intersecting micro-panel of the triangular matrix. In the case of 4m or 3m, real values are stored in both sub-panels, and so the indexing needs to occur in units of real values. The value computed here is divided into the complex pointer offset to cause the pointer to be advanced by the correct value. */ \ if ( bli_is_4mi_packed( schema_b ) || \ bli_is_3mi_packed( schema_b ) || \ bli_is_rih_packed( schema_b ) ) off_scl = 2; \ else off_scl = 1; \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the offset by 3/2. And if we are packing real-only, imag-only, or summed-only, we need to scale the computed panel sizes by 1/2 to compensate for the fact that the pointer arithmetic occurs in terms of complex elements rather than real elements. */ \ if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ else { ss_b_num = 1; ss_b_den = 1; } \ \ /* If there is a zero region above where the diagonal of B intersects the left edge of the panel, adjust the pointer to A and treat this case as if the diagonal offset were zero. Note that we don't need to adjust the pointer to B since packm would have simply skipped over the region that was not stored. */ \ if ( diagoffb < 0 ) \ { \ j = -diagoffb; \ k = k - j; \ diagoffb = 0; \ a_cast = a_cast + ( j * PACKMR ) / off_scl; \ } \ \ /* If there is a zero region to the right of where the diagonal of B intersects the bottom of the panel, shrink it to prevent "no-op" iterations from executing. */ \ if ( diagoffb + k < n ) \ { \ n = diagoffb + k; \ } \ \ /* Clear the temporary C buffer in case it has any infs or NaNs. 
*/ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k_full; \ istep_b = PACKNR * k; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ dim_t jr_nt = bli_thread_n_way( thread ); \ dim_t jr_tid = bli_thread_work_id( thread ); \ dim_t ir_nt = bli_thread_n_way( caucus ); \ dim_t ir_tid = bli_thread_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ dim_t jr_inc, ir_inc; \ \ /* Note that we partition the 2nd loop into two regions: the rectangular part of B, and the triangular portion. */ \ dim_t n_iter_rct; \ dim_t n_iter_tri; \ \ if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \ { \ /* If the entire panel of B does not intersect the diagonal, there is no triangular region, and therefore we can skip the second set of loops. */ \ n_iter_rct = n_iter; \ n_iter_tri = 0; \ } \ else \ { \ /* If the panel of B does intersect the diagonal, compute the number of iterations in the rectangular region by dividing NR into the diagonal offset. (There should never be any remainder in this division.) The number of iterations in the triangular (or trapezoidal) region is computed as the remaining number of iterations in the n dimension. 
*/ \ n_iter_rct = diagoffb / NR; \ n_iter_tri = n_iter - n_iter_rct; \ } \ \ /* Use slab assignment of micropanels to threads in the 2nd and 1st loops for the initial rectangular region of B (if it exists). */ \ bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ b1 = b_cast + j * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ { \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ a1 = a_cast + i * rstep_a; \ c11 = c1 + i * rstep_c; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ one, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the gemm micro-kernel. 
*/ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Add the result to the edge of C. */ \ PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ } \ } \ } \ \ /* If there is no triangular region, then we're done. */ \ if ( n_iter_tri == 0 ) return; \ \ /* Use round-robin assignment of micropanels to threads in the 2nd loop for the remaining triangular region of B (if it exists). NOTE: We don't need to call bli_thread_range_jrir*() here since we employ a hack that calls for each thread to execute every iteration of the jr and ir loops but skip all but the pointer increment for iterations that are not assigned to it. */ \ \ /* Advance the starting b1 and c1 pointers to the positions corresponding to the start of the triangular region of B. */ \ jr_start = n_iter_rct; \ b1 = b_cast + jr_start * cstep_b; \ c1 = c_cast + jr_start * cstep_c; \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_start; j < n_iter; ++j ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ diagoffb_j = diagoffb - ( doff_t )j*NR; \ \ /* Determine the offset to the beginning of the panel that was packed so we can index into the corresponding location in A. Then compute the length of that panel. */ \ off_b1121 = bli_max( -diagoffb_j, 0 ); \ k_b1121 = k - off_b1121; \ \ a1 = a_cast; \ c11 = c1; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* If the current panel of B intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ is_b_cur = k_b1121 * PACKNR; \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 
1 : 0 ); \ ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ if ( bli_trmm_my_iter( j, thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( is_b_cur, &aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ if ( bli_trmm_my_iter( i, caucus ) ) { \ \ ctype* restrict a1_i; \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ a1_i = a1 + ( off_b1121 * PACKMR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_b1121, \ alpha_cast, \ a1_i, \ b1, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Copy edge elements of C to the temporary buffer. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ c11, rs_c, cs_c, \ ct, rs_ct, cs_ct ); \ \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_b1121, \ alpha_cast, \ a1_i, \ b1, \ beta_cast, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Copy the result to the edge of C. 
*/ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ } \ \ a1 += rstep_a; \ c11 += rstep_c; \ } \ } \ \ b1 += ps_b_cur; \ } \ \ c1 += cstep_c; \ } \ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2sl: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2sl: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ } INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2sl ) cython-blis-0.9.1/blis/_src/frame/3/trmm/other/bli_trmm_ru_ker_var2.c000066400000000000000000000401641427272030600254510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffb, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); /* Table of typed macro-kernel function pointers (all sharing the FUNCPTR_T signature above). bli_trmm_ru_ker_var2() below indexes it with the execution datatype. */ static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); /* bli_trmm_ru_ker_var2(): object-based entry point for this macro-kernel. It reads the execution datatype of C, the diagonal offset of B, the pack schemas of A and B, the dimensions (m and n from the length and width of C, k from the width of A), and the buffers and strides of A, B and C (plus the panel dimension and panel stride of A and of B). The scalars attached to A and B are detached and multiplied together, and the buffer of the merged scalar is passed on as alpha; beta is the internal scalar buffer of C. The call is then forwarded, with these raw values, to the typed function stored at ftypes[dt_exec]. The cntl argument is not referenced in this function. */ void bli_trmm_ru_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffb = bli_obj_diag_offset( b ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B.
bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, rntm, thread ); } /* GENTFUNC( ctype, ch, varname ): template for the typed macro-kernel PASTEMAC(ch,varname); it is instantiated by the INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 ) line that follows the definition. The generated function walks the n dimension NR columns at a time (index j) and, inside that, the m dimension MR rows at a time (index i), invoking the gemm micro-kernel for each block of C. Micro-panels of B that intersect the diagonal use the shortened inner dimension k_b0111 and scale C by beta; micro-panels strictly above the diagonal use k and accumulate into C. Partial (edge) blocks are routed through the temporary buffer ct. bli_trmm_my_iter() decides, for each j and each i, whether the calling thread executes that iteration; the panel pointers are advanced on every iteration regardless. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffb, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* jr_thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. The strides of this temporary buffer are selected by col_pref, the value returned by bli_cntx_l3_vir_ukr_prefers_cols_dt() for the gemm micro-kernel: the row stride is 1 when col_pref is true, and the column stride is 1 otherwise. NOTE(review): an earlier version of this comment said the strides match the storage of C; nothing in this function compares rs_c/cs_c with col_pref, so that would have to be guaranteed by the caller -- TODO confirm. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ?
MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffb_j; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_b0111; \ dim_t off_b0111; \ dim_t i, j; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t off_scl; \ inc_t ss_b_num; \ inc_t ss_b_den; \ inc_t ps_b_cur; \ inc_t is_b_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. Concretely, the check below aborts when PACKMR and NR are both odd, or when PACKNR and MR are both odd. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current panel of B is entirely below its diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ \ /* Compute k_full. For all trmm, k_full is simply k. This is needed because some parameter combinations of trmm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of A (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = k; \ \ /* Compute indexing scaling factor for 4m or 3m.
This is needed because one of the packing register blocksizes (PACKMR or PACKNR) is used to index into the micro-panels of the non- triangular matrix when computing with a diagonal-intersecting micro-panel of the triangular matrix. In the case of 4m or 3m, real values are stored in both sub-panels, and so the indexing needs to occur in units of real values. The value computed here is divided into the complex pointer offset to cause the pointer to be advanced by the correct value. */ \ if ( bli_is_4mi_packed( schema_b ) || \ bli_is_3mi_packed( schema_b ) || \ bli_is_rih_packed( schema_b ) ) off_scl = 2; \ else off_scl = 1; \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the offset by 3/2. And if we are packing real-only, imag-only, or summed-only, we need to scale the computed panel sizes by 1/2 to compensate for the fact that the pointer arithmetic occurs in terms of complex elements rather than real elements. */ \ if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ else { ss_b_num = 1; ss_b_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of B intersects the top edge of the panel, adjust the pointer to C and treat this case as if the diagonal offset were zero. This skips over the region that was not packed. (Note we assume the diagonal offset is a multiple of MR; this assumption will hold as long as the cache blocksizes are each a multiple of MR and NR.) */ \ if ( diagoffb > 0 ) \ { \ j = diagoffb; \ n = n - j; \ diagoffb = 0; \ c_cast = c_cast + (j )*cs_c; \ } \ \ /* If there is a zero region below where the diagonal of B intersects the right side of the block, shrink it to prevent "no-op" iterations from executing. */ \ if ( -diagoffb + n < k ) \ { \ k = -diagoffb + n; \ } \ \ /* Clear the temporary C buffer in case it has any infs or NaNs.
*/ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k_full; \ istep_b = PACKNR * k; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ \ b1 = b_cast; \ c1 = c_cast; \ \ /* ir_thread is the sub-node of jr_thread; it is used below to decide which i iterations this thread executes. The number of ways and the work id of jr_thread are passed to bli_is_last_iter() when choosing the next panel of B. */ \ thrinfo_t* ir_thread = bli_thrinfo_sub_node( jr_thread ); \ dim_t jr_num_threads = bli_thread_n_way( jr_thread ); \ dim_t jr_thread_id = bli_thread_work_id( jr_thread ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ diagoffb_j = diagoffb - ( doff_t )j*NR; \ \ /* Determine the offset to and length of the panel that was packed so we can index into the corresponding location in A. The offset is always zero in this variant, so a1_i computed in the loop below equals a1. */ \ off_b0111 = 0; \ k_b0111 = bli_min( k, -diagoffb_j + NR ); \ \ a1 = a_cast; \ c11 = c1; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* If the current panel of B intersects the diagonal, scale C by beta. If it is strictly above the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel.
*/ \ is_b_cur = k_b0111 * PACKNR; \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ if ( bli_trmm_my_iter( j, jr_thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( is_b_cur, &aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ if ( bli_trmm_my_iter( i, ir_thread ) ) { \ \ ctype* restrict a1_i; \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel directly on C, using the shortened inner dimension k_b0111 and scaling C by beta. */ \ gemm_ukr \ ( \ k_b0111, \ alpha_cast, \ a1_i, \ b1, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Copy edge elements of C to the temporary buffer. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ c11, rs_c, cs_c, \ ct, rs_ct, cs_ct ); \ \ /* Invoke the gemm micro-kernel on the temporary buffer, which now holds the edge elements of C. */ \ gemm_ukr \ ( \ k_b0111, \ alpha_cast, \ a1_i, \ b1, \ beta_cast, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Copy the result to the edge of C. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ } \ \ /* Advance to the next micro-panel of A and the next block row of C on every iteration, including those skipped by this thread. */ \ a1 += rstep_a; \ c11 += rstep_c; \ } \ } \ \ /* Advance to the next micro-panel of B using the stride computed for this diagonal-intersecting panel; this happens whether or not this thread executed iteration j. */ \ b1 += ps_b_cur; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \ { \ if ( bli_trmm_my_iter( j, jr_thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object.
*/ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ if ( bli_trmm_my_iter( i, ir_thread ) ) { \ \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter( j, n_iter, jr_thread_id, jr_num_threads ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel directly on C with the scalar one in the beta position, so the product is accumulated into C. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ one, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the gemm micro-kernel on the temporary buffer with the scalar zero in the beta position. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Add the result to the edge of C. */ \ PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ } \ \ a1 += rstep_a; \ c11 += rstep_c; \ } \ } \ \ b1 += cstep_b; \ } \ \ c1 += cstep_c; \ } \ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ } INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c000066400000000000000000000466041427272030600260220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffb, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); /* Table of typed macro-kernel function pointers (all sharing the FUNCPTR_T signature above). bli_trmm_ru_ker_var2rr() below indexes it with the execution datatype. */ static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2rr); // // -- Macrokernel functions for round-robin partitioning ----------------------- // /* bli_trmm_ru_ker_var2rr(): object-based entry point for this macro-kernel. It reads the execution datatype of C, the diagonal offset of B, the pack schemas of A and B, the dimensions (m and n from the length and width of C, k from the width of A), and the buffers and strides of A, B and C (plus the panel dimension and panel stride of A and of B). The scalars attached to A and B are detached and multiplied together, and the buffer of the merged scalar is passed on as alpha; beta is the internal scalar buffer of C. The call is then forwarded, with these raw values, to the typed function stored at ftypes[dt_exec]. The cntl argument is not referenced in this function. */ void bli_trmm_ru_ker_var2rr ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffb = bli_obj_diag_offset( b ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function.
f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, rntm, thread ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffb, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ?
MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffb_j; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_b0111; \ dim_t off_b0111; \ dim_t i, j, jb0; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t off_scl; \ inc_t ss_b_num; \ inc_t ss_b_den; \ inc_t ps_b_cur; \ inc_t is_b_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current panel of B is entirely below its diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ \ /* Compute k_full. For all trmm, k_full is simply k. This is needed because some parameter combinations of trmm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of A (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = k; \ \ /* Compute indexing scaling factor for for 4m or 3m. 
This is needed because one of the packing register blocksizes (PACKMR or PACKNR) is used to index into the micro-panels of the non- triangular matrix when computing with a diagonal-intersecting micro-panel of the triangular matrix. In the case of 4m or 3m, real values are stored in both sub-panels, and so the indexing needs to occur in units of real values. The value computed here is divided into the complex pointer offset to cause the pointer to be advanced by the correct value. */ \ if ( bli_is_4mi_packed( schema_b ) || \ bli_is_3mi_packed( schema_b ) || \ bli_is_rih_packed( schema_b ) ) off_scl = 2; \ else off_scl = 1; \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the offset by 3/2. And if we are packing real-only, imag-only, or summed-only, we need to scale the computed panel sizes by 1/2 to compensate for the fact that the pointer arithmetic occurs in terms of complex elements rather than real elements. */ \ if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ else { ss_b_num = 1; ss_b_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of B intersects the top edge of the panel, adjust the pointer to C and treat this case as if the diagonal offset were zero. This skips over the region that was not packed. (Note we assume the diagonal offset is a multiple of MR; this assumption will hold as long as the cache blocksizes are each a multiple of MR and NR.) */ \ if ( diagoffb > 0 ) \ { \ j = diagoffb; \ n = n - j; \ diagoffb = 0; \ c_cast = c_cast + (j )*cs_c; \ } \ \ /* If there is a zero region below where the diagonal of B intersects the right side of the block, shrink it to prevent "no-op" iterations from executing. */ \ if ( -diagoffb + n < k ) \ { \ k = -diagoffb + n; \ } \ \ /* Clear the temporary C buffer in case it has any infs or NaNs. 
*/ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k_full; \ istep_b = PACKNR * k; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ \ /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) loop around the microkernel. Here we query the thrinfo_t node for the 1st (ir) loop around the microkernel. */ \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ /* Query the number of threads and thread ids for each loop. */ \ dim_t jr_nt = bli_thread_n_way( thread ); \ dim_t jr_tid = bli_thread_work_id( thread ); \ dim_t ir_nt = bli_thread_n_way( caucus ); \ dim_t ir_tid = bli_thread_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ dim_t jr_inc, ir_inc; \ \ /* Note that we partition the 2nd loop into two regions: the triangular part of C, and the rectangular portion. */ \ dim_t n_iter_tri; \ dim_t n_iter_rct; \ \ if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \ { \ /* If the entire panel of B does not intersect the diagonal, there is no triangular region, and therefore we can skip the first set of loops. 
*/ \ n_iter_tri = 0; \ n_iter_rct = n_iter; \ } \ else \ { \ /* If the panel of B does intersect the diagonal, compute the number of iterations in the triangular (or trapezoidal) region by dividing NR into the number of rows in B. (There should never be any remainder in this division.) The number of iterations in the rectangular region is computed as the remaining number of iterations in the n dimension. */ \ n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \ n_iter_rct = n_iter - n_iter_tri; \ } \ \ /* Use round-robin assignment of micropanels to threads in the 2nd loop for the initial triangular region of B (if it exists). NOTE: We don't need to call bli_thread_range_jrir*() here since we employ a hack that calls for each thread to execute every iteration of the jr and ir loops but skip all but the pointer increment for iterations that are not assigned to it. */ \ \ b1 = b_cast; \ c1 = c_cast; \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter_tri; ++j ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ diagoffb_j = diagoffb - ( doff_t )j*NR; \ \ /* Determine the offset to and length of the panel that was packed so we can index into the corresponding location in A. */ \ off_b0111 = 0; \ k_b0111 = bli_min( k, -diagoffb_j + NR ); \ \ a1 = a_cast; \ c11 = c1; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* If the current panel of B intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ is_b_cur = k_b0111 * PACKNR; \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 
1 : 0 ); \ ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ if ( bli_trmm_my_iter( j, thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( is_b_cur, &aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ if ( bli_trmm_my_iter( i, caucus ) ) { \ \ ctype* restrict a1_i; \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_b0111, \ alpha_cast, \ a1_i, \ b1, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Copy edge elements of C to the temporary buffer. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ c11, rs_c, cs_c, \ ct, rs_ct, cs_ct ); \ \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_b0111, \ alpha_cast, \ a1_i, \ b1, \ beta_cast, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Copy the result to the edge of C. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ } \ \ a1 += rstep_a; \ c11 += rstep_c; \ } \ } \ \ b1 += ps_b_cur; \ } \ \ c1 += cstep_c; \ } \ \ /* If there is no rectangular region, then we're done. */ \ if ( n_iter_rct == 0 ) return; \ \ /* Use round-robin assignment of micropanels to threads in the 2nd and 1st loops the remaining triangular region of B. 
*/ \ bli_thread_range_jrir_rr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Advance the start and end iteration offsets for the rectangular region by the number of iterations used for the triangular region. */ \ jr_start += n_iter_tri; \ jr_end += n_iter_tri; \ jb0 = n_iter_tri; \ \ /* Save the resulting value of b1 from the previous loop since it represents the starting point for the rectangular region. */ \ b_cast = b1; \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ /* NOTE: We must index through b_cast differently since it contains the starting address of the rectangular region (which is already n_iter_tri logical iterations through B). */ \ b1 = b_cast + (j-jb0) * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* If the current panel of B intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ { \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ a1 = a_cast + i * rstep_a; \ c11 = c1 + i * rstep_c; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter_rr( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ one, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Add the result to the edge of C. */ \ PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ } \ } \ } \ \ \ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2rr: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2rr: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ } INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2rr ) cython-blis-0.9.1/blis/_src/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c000066400000000000000000000465751427272030600260240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffb, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* beta, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2sl); // // -- Macrokernel functions for slab partitioning ------------------------------ // void bli_trmm_ru_ker_var2sl ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffb = bli_obj_diag_offset( b ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); obj_t scalar_a; obj_t scalar_b; void* buf_alpha; void* buf_beta; FUNCPTR_T f; // Detach and multiply the scalars attached to A and B. bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. 
f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_beta, buf_c, rs_c, cs_c, cntx, rntm, thread ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffb, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Query the context for the micro-kernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ ctype* restrict one = PASTEMAC(ch,1); \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha_cast = alpha; \ ctype* restrict beta_cast = beta; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffb_j; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_b0111; \ dim_t off_b0111; \ dim_t i, j, jb0; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t off_scl; \ inc_t ss_b_num; \ inc_t ss_b_den; \ inc_t ps_b_cur; \ inc_t is_b_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current panel of B is entirely below its diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ \ /* Compute k_full. For all trmm, k_full is simply k. This is needed because some parameter combinations of trmm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of A (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = k; \ \ /* Compute indexing scaling factor for for 4m or 3m. 
This is needed because one of the packing register blocksizes (PACKMR or PACKNR) is used to index into the micro-panels of the non- triangular matrix when computing with a diagonal-intersecting micro-panel of the triangular matrix. In the case of 4m or 3m, real values are stored in both sub-panels, and so the indexing needs to occur in units of real values. The value computed here is divided into the complex pointer offset to cause the pointer to be advanced by the correct value. */ \ if ( bli_is_4mi_packed( schema_b ) || \ bli_is_3mi_packed( schema_b ) || \ bli_is_rih_packed( schema_b ) ) off_scl = 2; \ else off_scl = 1; \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the offset by 3/2. And if we are packing real-only, imag-only, or summed-only, we need to scale the computed panel sizes by 1/2 to compensate for the fact that the pointer arithmetic occurs in terms of complex elements rather than real elements. */ \ if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ else if ( bli_is_rih_packed( schema_b ) ) { ss_b_num = 1; ss_b_den = 2; } \ else { ss_b_num = 1; ss_b_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of B intersects the top edge of the panel, adjust the pointer to C and treat this case as if the diagonal offset were zero. This skips over the region that was not packed. (Note we assume the diagonal offset is a multiple of MR; this assumption will hold as long as the cache blocksizes are each a multiple of MR and NR.) */ \ if ( diagoffb > 0 ) \ { \ j = diagoffb; \ n = n - j; \ diagoffb = 0; \ c_cast = c_cast + (j )*cs_c; \ } \ \ /* If there is a zero region below where the diagonal of B intersects the right side of the block, shrink it to prevent "no-op" iterations from executing. */ \ if ( -diagoffb + n < k ) \ { \ k = -diagoffb + n; \ } \ \ /* Clear the temporary C buffer in case it has any infs or NaNs. 
*/ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k_full; \ istep_b = PACKNR * k; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ \ /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) loop around the microkernel. Here we query the thrinfo_t node for the 1st (ir) loop around the microkernel. */ \ thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ \ /* Query the number of threads and thread ids for each loop. */ \ dim_t jr_nt = bli_thread_n_way( thread ); \ dim_t jr_tid = bli_thread_work_id( thread ); \ dim_t ir_nt = bli_thread_n_way( caucus ); \ dim_t ir_tid = bli_thread_work_id( caucus ); \ \ dim_t jr_start, jr_end; \ dim_t ir_start, ir_end; \ dim_t jr_inc, ir_inc; \ \ /* Note that we partition the 2nd loop into two regions: the triangular part of C, and the rectangular portion. */ \ dim_t n_iter_tri; \ dim_t n_iter_rct; \ \ if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \ { \ /* If the entire panel of B does not intersect the diagonal, there is no triangular region, and therefore we can skip the first set of loops. 
*/ \ n_iter_tri = 0; \ n_iter_rct = n_iter; \ } \ else \ { \ /* If the panel of B does intersect the diagonal, compute the number of iterations in the triangular (or trapezoidal) region by dividing NR into the number of rows in B. (There should never be any remainder in this division.) The number of iterations in the rectangular region is computed as the remaining number of iterations in the n dimension. */ \ n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \ n_iter_rct = n_iter - n_iter_tri; \ } \ \ /* Use round-robin assignment of micropanels to threads in the 2nd loop for the initial triangular region of B (if it exists). NOTE: We don't need to call bli_thread_range_jrir*() here since we employ a hack that calls for each thread to execute every iteration of the jr and ir loops but skip all but the pointer increment for iterations that are not assigned to it. */ \ \ b1 = b_cast; \ c1 = c_cast; \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter_tri; ++j ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ diagoffb_j = diagoffb - ( doff_t )j*NR; \ \ /* Determine the offset to and length of the panel that was packed so we can index into the corresponding location in A. */ \ off_b0111 = 0; \ k_b0111 = bli_min( k, -diagoffb_j + NR ); \ \ a1 = a_cast; \ c11 = c1; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* If the current panel of B intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ { \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ is_b_cur = k_b0111 * PACKNR; \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 
1 : 0 ); \ ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ if ( bli_trmm_my_iter( j, thread ) ) { \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( is_b_cur, &aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ if ( bli_trmm_my_iter( i, caucus ) ) { \ \ ctype* restrict a1_i; \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ a1_i = a1 + ( off_b0111 * PACKMR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_b0111, \ alpha_cast, \ a1_i, \ b1, \ beta_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Copy edge elements of C to the temporary buffer. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ c11, rs_c, cs_c, \ ct, rs_ct, cs_ct ); \ \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k_b0111, \ alpha_cast, \ a1_i, \ b1, \ beta_cast, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Copy the result to the edge of C. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ } \ \ a1 += rstep_a; \ c11 += rstep_c; \ } \ } \ \ b1 += ps_b_cur; \ } \ \ c1 += cstep_c; \ } \ \ /* If there is no rectangular region, then we're done. */ \ if ( n_iter_rct == 0 ) return; \ \ /* Use slab assignment of micropanels to threads in the 2nd and 1st loops the remaining triangular region of B. 
*/ \ bli_thread_range_jrir_sl( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ bli_thread_range_jrir_sl( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ \ /* Advance the start and end iteration offsets for the rectangular region by the number of iterations used for the triangular region. */ \ jr_start += n_iter_tri; \ jr_end += n_iter_tri; \ jb0 = n_iter_tri; \ \ /* Save the resulting value of b1 from the previous loop since it represents the starting point for the rectangular region. */ \ b_cast = b1; \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ /* NOTE: We must index through b_cast differently since it contains the starting address of the rectangular region (which is already n_iter_tri logical iterations through B). */ \ b1 = b_cast + (j-jb0) * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* If the current panel of B intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ { \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = ir_start; i < ir_end; i += ir_inc ) \ { \ ctype* restrict a2; \ \ a1 = a_cast + i * rstep_a; \ c11 = c1 + i * rstep_c; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ if ( bli_is_last_iter_sl( i, m_iter, ir_tid, ir_nt ) ) \ { \ a2 = a_cast; \ b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ one, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ alpha_cast, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Add the result to the edge of C. */ \ PASTEMAC(ch,adds_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ } \ } \ } \ \ \ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2sl: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ /*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2sl: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ } INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2sl ) cython-blis-0.9.1/blis/_src/frame/3/trmm3/000077500000000000000000000000001427272030600201345ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/3/trmm3/bli_trmm3.h000066400000000000000000000032471427272030600222030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_trmm3_front.h" cython-blis-0.9.1/blis/_src/frame/3/trmm3/bli_trmm3_front.c000066400000000000000000000143521427272030600234050ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ) { bli_init_once(); obj_t a_local; obj_t b_local; obj_t c_local; // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( beta, c ); return; } // Alias A, B, and C so we can tweak the objects if necessary. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( c, &c_local ); // Set the obj_t buffer field to the location currently implied by the row // and column offsets and then zero the offsets. If any of the original // obj_t's were views into larger matrices, this step effectively makes // those obj_t's "forget" their lineage. bli_obj_reset_origin( &a_local ); bli_obj_reset_origin( &b_local ); bli_obj_reset_origin( &c_local ); // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. 
Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This // allows us to only explicitly implement the no-transpose cases. Once // the transposition is induced, the correct algorithm will be called, // since, for example, an algorithm over a transposed lower triangular // matrix A moves in the same direction (forwards) as a non-transposed // upper triangular matrix. And with the transposition induced, the // matrix now appears to be upper triangular, so the upper triangular // algorithm will grab the correct partitions, as if it were upper // triangular (with no transpose) all along. if ( bli_obj_has_trans( &a_local ) ) { bli_obj_induce_trans( &a_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); } #ifdef BLIS_DISABLE_TRMM3_RIGHT // NOTE: This case casts right-side trmm3 in terms of left side. This is // necessary when the current subconfiguration uses a gemm microkernel // that assumes that the packing kernel will have already duplicated // (broadcast) element of B in the packed copy of B. Supporting // duplication within the logic that packs micropanels from triangular // matrices would be ugly, and so we simply don't support it. As a // consequence, those subconfigurations need a way to force the triangular // matrix to be on the left (and thus the general matrix to the on the // right). So our solution is that in those cases, the subconfigurations // simply #define BLIS_DISABLE_TRMM3_RIGHT. // NOTE: This case casts right-side trmm3 in terms of left side. This can // lead to the microkernel being executed on an output matrix with the // microkernel's general stride IO case (unless the microkernel supports // both both row and column IO cases as well). // NOTE: Casting right-side trmm3 in terms of left side reduces the number // of macrokernels exercised to two (trmm_ll and trmm_lu). 
// If A is being multiplied from the right, transpose all operands // so that we can perform the computation as if A were being multiplied // from the left. if ( bli_is_right( side ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &a_local ); bli_obj_induce_trans( &b_local ); bli_obj_induce_trans( &c_local ); } #else // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &a_local ); bli_obj_induce_trans( &b_local ); bli_obj_induce_trans( &c_local ); } // If A is being multiplied from the right, swap A and B so that // the matrix will actually be on the right. if ( bli_is_right( side ) ) { bli_obj_swap( &a_local, &b_local ); } #endif // Set the pack schemas within the objects. bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. bli_rntm_set_ways_for_op ( BLIS_TRMM3, side, bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ), rntm ); // Invoke the internal back-end. bli_l3_thread_decorator ( bli_l3_int, BLIS_TRMM, // operation family id alpha, &a_local, &b_local, beta, &c_local, cntx, rntm, cntl ); } cython-blis-0.9.1/blis/_src/frame/3/trmm3/bli_trmm3_front.h000066400000000000000000000035411427272030600234100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); cython-blis-0.9.1/blis/_src/frame/3/trsm/000077500000000000000000000000001427272030600200575ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/3/trsm/bli_trsm.h000066400000000000000000000033331427272030600220450ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_trsm_cntl.h" #include "bli_trsm_front.h" #include "bli_trsm_var.h" cython-blis-0.9.1/blis/_src/frame/3/trsm/bli_trsm_blk_var1.c000066400000000000000000000142561427272030600236270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" //#define PRINT void bli_trsm_blk_var1 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { dim_t my_start, my_end; dim_t b_alg; // Determine the direction in which to partition (forwards or backwards). dir_t direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. bli_l3_prune_unref_mparts_m( a, b, c, cntl ); // Isolate the diagonal block A11 and its corresponding row panel C1. 
const dim_t kc = bli_obj_width_after_trans( a ); obj_t a11, c1; bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, 0, kc, a, &a11 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, 0, kc, c, &c1 ); // All threads iterate over the entire diagonal block A11. my_start = 0; my_end = kc; #ifdef PRINT printf( "bli_trsm_blk_var1(): a11 is %d x %d at offsets (%3d, %3d)\n", (int)bli_obj_length( &a11 ), (int)bli_obj_width( &a11 ), (int)bli_obj_row_off( &a11 ), (int)bli_obj_col_off( &a11 ) ); printf( "bli_trsm_blk_var1(): entering trsm subproblem loop.\n" ); #endif // Partition along the m dimension for the trsm subproblem. for ( dim_t i = my_start; i < my_end; i += b_alg ) { obj_t a11_1, c1_1; b_alg = bli_determine_blocksize( direct, i, my_end, &a11, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and C1. bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, &a11, &a11_1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, &c1, &c1_1 ); #ifdef PRINT printf( "bli_trsm_blk_var1(): a11_1 is %d x %d at offsets (%3d, %3d)\n", (int)bli_obj_length( &a11_1 ), (int)bli_obj_width( &a11_1 ), (int)bli_obj_row_off( &a11_1 ), (int)bli_obj_col_off( &a11_1 ) ); #endif // Perform trsm subproblem. bli_l3_int ( &BLIS_ONE, &a11_1, b, &BLIS_ONE, &c1_1, cntx, rntm, bli_cntl_sub_prenode( cntl ), bli_thrinfo_sub_prenode( thread ) ); } #ifdef PRINT printf( "bli_trsm_blk_var1(): finishing trsm subproblem loop.\n" ); #endif // We must execute a barrier here because the upcoming rank-k update // requires the packed matrix B to be fully updated by the trsm // subproblem. bli_thread_barrier( thread ); // Isolate the remaining part of the column panel matrix A, which we do by // acquiring the subpartition ahead of A11 (that is, A21 or A01, depending // on whether we are moving forwards or backwards, respectively). 
obj_t ax1, cx1; bli_acquire_mpart_mdim( direct, BLIS_SUBPART1A, 0, kc, a, &ax1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1A, 0, kc, c, &cx1 ); #ifdef PRINT printf( "bli_trsm_blk_var1(): ax1 is %d x %d at offsets (%3d, %3d)\n", (int)bli_obj_length( &ax1 ), (int)bli_obj_width( &ax1 ), (int)bli_obj_row_off( &ax1 ), (int)bli_obj_col_off( &ax1 ) ); #endif // Determine the current thread's subpartition range for the gemm // subproblem over Ax1. bli_thread_range_mdim ( direct, thread, &ax1, b, &cx1, cntl, cntx, &my_start, &my_end ); #ifdef PRINT printf( "bli_trsm_blk_var1(): entering gemm subproblem loop (%d->%d).\n", (int)my_start, (int)my_end ); #endif // Partition along the m dimension for the gemm subproblem. for ( dim_t i = my_start; i < my_end; i += b_alg ) { obj_t a11, c1; // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize( direct, i, my_end, &ax1, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and C1. bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, &ax1, &a11 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, &cx1, &c1 ); #ifdef PRINT printf( "bli_trsm_blk_var1(): a11 is %d x %d at offsets (%3d, %3d)\n", (int)bli_obj_length( &a11 ), (int)bli_obj_width( &a11 ), (int)bli_obj_row_off( &a11 ), (int)bli_obj_col_off( &a11 ) ); #endif // Perform gemm subproblem. (Note that we use the same backend // function as before, since we're calling the same macrokernel.) bli_l3_int ( &BLIS_ONE, &a11, b, &BLIS_ONE, &c1, cntx, rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); } #ifdef PRINT printf( "bli_trsm_blk_var1(): finishing gemm subproblem loop.\n" ); #endif } cython-blis-0.9.1/blis/_src/frame/3/trsm/bli_trsm_blk_var2.c000066400000000000000000000060031427272030600236170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_trsm_blk_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { obj_t b1, c1; dim_t my_start, my_end; dim_t b_alg; // Determine the direction in which to partition (forwards or backwards). dir_t direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. bli_l3_prune_unref_mparts_n( a, b, c, cntl ); // Determine the current thread's subpartition range. 
bli_thread_range_ndim ( direct, thread, a, b, c, cntl, cntx, &my_start, &my_end ); // Partition along the n dimension. for ( dim_t i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize( direct, i, my_end, b, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for B1 and C1. bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, i, b_alg, b, &b1 ); bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, i, b_alg, c, &c1 ); // Perform trsm subproblem. bli_l3_int ( &BLIS_ONE, a, &b1, &BLIS_ONE, &c1, cntx, rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); } } cython-blis-0.9.1/blis/_src/frame/3/trsm/bli_trsm_blk_var3.c000066400000000000000000000064031427272030600236240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_trsm_blk_var3 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { obj_t a1, b1; dim_t b_alg; // Determine the direction in which to partition (forwards or backwards). dir_t direct = bli_l3_direct( a, b, c, cntl ); // Prune any zero region that exists along the partitioning dimension. bli_l3_prune_unref_mparts_k( a, b, c, cntl ); // Query dimension in partitioning direction. dim_t k_trans = bli_obj_width_after_trans( a ); // Partition along the k dimension. for ( dim_t i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. b_alg = bli_trsm_determine_kc( direct, i, k_trans, a, b, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and B1. bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, i, b_alg, a, &a1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, b, &b1 ); // Perform trsm subproblem. bli_l3_int ( &BLIS_ONE, &a1, &b1, &BLIS_ONE, c, cntx, rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); //bli_thread_ibarrier( thread ); bli_thread_barrier( bli_thrinfo_sub_node( thread ) ); // This variant executes multiple rank-k updates. Therefore, if the // internal alpha scalars on A/B and C are non-zero, we must ensure // that they are only used in the first iteration. 
if ( i == 0 ) { bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b ); bli_obj_scalar_reset( c ); } } } cython-blis-0.9.1/blis/_src/frame/3/trsm/bli_trsm_cntl.c000066400000000000000000000205621427272030600230630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ) { if ( bli_is_left( side ) ) return bli_trsm_l_cntl_create( rntm, schema_a, schema_b, ker ); else return bli_trsm_r_cntl_create( rntm, schema_a, schema_b, ker ); } cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ) { void_fp macro_kernel_p; // Set the default macrokernel. If a non-NULL kernel function pointer is // passed in, we use that instead. macro_kernel_p = bli_trsm_xx_ker_var2; if ( ker ) macro_kernel_p = ker; const opid_t family = BLIS_TRSM; // // Create nodes for packing A and the macro-kernel (gemm branch). // cntl_t* gemm_cntl_bu_ke = bli_trsm_cntl_create_node ( rntm, // the thread's runtime structure family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); cntl_t* gemm_cntl_bp_bu = bli_trsm_cntl_create_node ( rntm, family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, gemm_cntl_bu_ke ); // Create a node for packing matrix A. cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node ( rntm, bli_l3_packa, // trsm operation's packm function for A. BLIS_MR, BLIS_MR, FALSE, // do NOT invert diagonal TRUE, // reverse iteration if upper? FALSE, // reverse iteration if lower? schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, gemm_cntl_bp_bu ); // // Create nodes for packing A and the macro-kernel (trsm branch). // cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node ( rntm, // the thread's runtime structure family, // the operation family BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. 
); cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node ( rntm, family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, trsm_cntl_bu_ke ); // Create a node for packing matrix A. cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( rntm, bli_l3_packa, // trsm operation's packm function for A. BLIS_MR, BLIS_MR, #ifdef BLIS_ENABLE_TRSM_PREINVERSION TRUE, // invert diagonal #else FALSE, // do NOT invert diagonal #endif TRUE, // reverse iteration if upper? FALSE, // reverse iteration if lower? schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, trsm_cntl_bp_bu ); // ------------------------------------------------------------------------- // Create a node for partitioning the m dimension by MC. // NOTE: We attach the gemm sub-tree as the main branch. cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node ( rntm, family, BLIS_MC, bli_trsm_blk_var1, gemm_cntl_packa ); // Attach the trsm sub-tree as the auxiliary "prenode" branch. bli_cntl_set_sub_prenode( trsm_cntl_packa, trsm_cntl_op_bp ); // ------------------------------------------------------------------------- // Create a node for packing matrix B. cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( rntm, bli_l3_packb, BLIS_NR, BLIS_MR, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? schema_b, // normally BLIS_PACKED_COL_PANELS BLIS_BUFFER_FOR_B_PANEL, trsm_cntl_op_bp ); // Create a node for partitioning the k dimension by KC. cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node ( rntm, family, BLIS_KC, bli_trsm_blk_var3, trsm_cntl_packb ); // Create a node for partitioning the n dimension by NC. 
cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node ( rntm, family, BLIS_NC, bli_trsm_blk_var2, trsm_cntl_mm_op ); return trsm_cntl_vl_mm; } cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ) { // NOTE: trsm macrokernels are presently disabled for right-side execution. // Set the default macrokernel. If a non-NULL kernel function pointer is // passed in, we use that instead. void_fp macro_kernel_p = bli_trsm_xx_ker_var2; if ( ker ) macro_kernel_p = ker; const opid_t family = BLIS_TRSM; // Create two nodes for the macro-kernel. cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node ( rntm, family, BLIS_MR, // needed for bli_thrinfo_rgrow() NULL, // variant function pointer not used NULL // no sub-node; this is the leaf of the tree. ); cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node ( rntm, family, BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() macro_kernel_p, trsm_cntl_bu_ke ); // Create a node for packing matrix A. cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node ( rntm, bli_l3_packa, BLIS_NR, BLIS_MR, FALSE, // do NOT invert diagonal FALSE, // reverse iteration if upper? FALSE, // reverse iteration if lower? schema_a, // normally BLIS_PACKED_ROW_PANELS BLIS_BUFFER_FOR_A_BLOCK, trsm_cntl_bp_bu ); // Create a node for partitioning the m dimension by MC. cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node ( rntm, family, BLIS_MC, bli_trsm_blk_var1, trsm_cntl_packa ); // Create a node for packing matrix B. cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node ( rntm, bli_l3_packb, BLIS_MR, BLIS_MR, TRUE, // do NOT invert diagonal FALSE, // reverse iteration if upper? TRUE, // reverse iteration if lower? schema_b, // normally BLIS_PACKED_COL_PANELS BLIS_BUFFER_FOR_B_PANEL, trsm_cntl_op_bp ); // Create a node for partitioning the k dimension by KC. 
cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node ( rntm, family, BLIS_KC, bli_trsm_blk_var3, trsm_cntl_packb ); // Create a node for partitioning the n dimension by NC. cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node ( rntm, family, BLIS_NC, bli_trsm_blk_var2, trsm_cntl_mm_op ); return trsm_cntl_vl_mm; } void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { bli_cntl_free( rntm, cntl, thread ); } // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ) { return bli_cntl_create_node( rntm, family, bszid, var_func, NULL, sub_node ); } cython-blis-0.9.1/blis/_src/frame/3/trsm/bli_trsm_cntl.h000066400000000000000000000047321427272030600230710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); cython-blis-0.9.1/blis/_src/frame/3/trsm/bli_trsm_front.c000066400000000000000000000121241427272030600232460ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ) { bli_init_once(); obj_t a_local; obj_t b_local; obj_t c_local; #if 0 #ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM gint_t status = bli_trsm_small( side, alpha, a, b, cntx, cntl ); if ( status == BLIS_SUCCESS ) return; #endif #endif // If alpha is zero, scale by beta and return. if ( bli_obj_equals( alpha, &BLIS_ZERO ) ) { bli_scalm( alpha, b ); return; } // Alias A and B so we can tweak the objects if necessary. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); bli_obj_alias_to( b, &c_local ); // Set the obj_t buffer field to the location currently implied by the row // and column offsets and then zero the offsets. If any of the original // obj_t's were views into larger matrices, this step effectively makes // those obj_t's "forget" their lineage. 
bli_obj_reset_origin( &a_local ); bli_obj_reset_origin( &b_local ); bli_obj_reset_origin( &c_local ); // We do not explicitly implement the cases where A is transposed. // However, we can still handle them. Specifically, if A is marked as // needing a transposition, we simply induce a transposition. This // allows us to only explicitly implement the no-transpose cases. Once // the transposition is induced, the correct algorithm will be called, // since, for example, an algorithm over a transposed lower triangular // matrix A moves in the same direction (forwards) as a non-transposed // upper triangular matrix. And with the transposition induced, the // matrix now appears to be upper triangular, so the upper triangular // algorithm will grab the correct partitions, as if it were upper // triangular (with no transpose) all along. if ( bli_obj_has_trans( &a_local ) ) { bli_obj_induce_trans( &a_local ); bli_obj_set_onlytrans( BLIS_NO_TRANSPOSE, &a_local ); } #if 1 // If A is being solved against from the right, transpose all operands // so that we can perform the computation as if A were being solved // from the left. if ( bli_is_right( side ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &a_local ); bli_obj_induce_trans( &b_local ); bli_obj_induce_trans( &c_local ); } #else // NOTE: Enabling this code requires that BLIS NOT be configured with // BLIS_RELAX_MCNR_NCMR_CONSTRAINTS defined. #ifdef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #error "BLIS_RELAX_MCNR_NCMR_CONSTRAINTS must not be defined for current trsm_r implementation." #endif // If A is being solved against from the right, swap A and B so that // the triangular matrix will actually be on the right. if ( bli_is_right( side ) ) { bli_obj_swap( &a_local, &b_local ); } #endif // Set the pack schemas within the objects. 
bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); // Parse and interpret the contents of the rntm_t object to properly // set the ways of parallelism for each loop, and then make any // additional modifications necessary for the current operation. bli_rntm_set_ways_for_op ( BLIS_TRSM, side, bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ), rntm ); // Invoke the internal back-end. bli_l3_thread_decorator ( bli_l3_int, BLIS_TRSM, // operation family id alpha, &a_local, &b_local, alpha, &c_local, cntx, rntm, cntl ); } cython-blis-0.9.1/blis/_src/frame/3/trsm/bli_trsm_front.h000066400000000000000000000040711427272030600232550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Prototype of the trsm front-end. It receives the side on which the triangular operand is applied, the scalar alpha, the operands a and b, and the context, runtime and control-tree objects that are forwarded to the level-3 back-end. */ void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); /* Prototype of the small-matrix trsm routine; it is declared only when BLIS_ENABLE_SMALL_MATRIX is defined. Unlike bli_trsm_front() it returns an err_t and takes no rntm_t argument. */ #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif cython-blis-0.9.1/blis/_src/frame/3/trsm/bli_trsm_ll_ker_var2.c000066400000000000000000000370641427272030600243320ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffa, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha1, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* alpha2, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); /* Object-based entry point of the trsm_ll_ker_var2 macro-kernel. It reads the execution datatype of C, the diagonal offset of A, the pack schemas of A and B, the dimensions (m and n from C, k from the width of A), the buffers, strides and panel parameters of A, B and C, and the internal scalars attached to B and C, and then calls the datatype-specific function selected from ftypes[] by the execution datatype. The cntl argument is not referenced in this function. */ void bli_trsm_ll_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffa = bli_obj_diag_offset( a ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); void* buf_alpha1; void* 
buf_alpha2; FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar // attached to B (the non-triangular matrix). This will be the alpha // scalar used in the gemmtrsm subproblems (ie: the scalar that would // be applied to the packed copy of B prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( b ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only // subproblems that correspond to micro-panels that do not intersect // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffa, schema_a, schema_b, m, n, k, buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_alpha2, buf_c, rs_c, cs_c, cntx, rntm, thread ); } /* Datatype-templated body of the trsm_ll_ker_var2 macro-kernel; the INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 ) line that follows the macro expands it (presumably once per supported datatype). For every micro-panel of B in the range returned by bli_thread_range_jrir(), it walks the micro-panels of A from the first to the last, calling the fused gemmtrsm micro-kernel on panels that intersect the diagonal and the gemm micro-kernel (with a scalar of minus one) on panels strictly below the diagonal. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffa, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Cast the micro-kernel address to its function pointer type. 
*/ \ PASTECH(ch,gemmtrsm_ukr_ft) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ /* ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ */ \ \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha1_cast = alpha1; \ ctype* restrict alpha2_cast = alpha2; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffa_i; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_a1011; \ dim_t k_a10; \ dim_t off_a10; \ dim_t off_a11; \ dim_t i, j; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t ps_a_cur; \ inc_t is_a_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If matrix A is above the diagonal, it is implicitly zero. 
So we do nothing. */ \ if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ \ /* Compute k_full as k inflated up to a multiple of MR. This is needed because some parameter combinations of trsm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of B (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ \ /* If there is a zero region above where the diagonal of A intersects the left edge of the block, adjust the pointer to C and treat this case as if the diagonal offset were zero. This skips over the region that was not packed. (Note we assume the diagonal offset is a multiple of MR; this assumption will hold as long as the cache blocksizes are each a multiple of MR and NR.) */ \ if ( diagoffa < 0 ) \ { \ i = -diagoffa; \ m = m - i; \ diagoffa = 0; \ c_cast = c_cast + (i )*rs_c; \ } \ \ /* Check the k dimension, which needs to be a multiple of MR. If k isn't a multiple of MR, we adjust it higher to satisfy the micro- kernel, which is expecting to perform an MR x MR triangular solve. This adjustment of k is consistent with what happened when A was packed: all of its bottom/right edges were zero-padded, and furthermore, the panel that stores the bottom-right corner of the matrix has its diagonal extended into the zero-padded region (as identity). This allows the trsm of that bottom-right panel to proceed without producing any infs or NaNs that would infect the "good" values of the corresponding block of B. */ \ if ( k % MR != 0 ) k += MR - ( k % MR ); \ \ /* NOTE: We don't need to check that m is a multiple of PACKMR since we know that the underlying buffer was already allocated to have an m dimension that is a multiple of PACKMR, with the region between the last row and the next multiple of MR zero-padded accordingly. 
*/ \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. NOTE(review): istep_a is computed and rounded up to an even value below, but it is not read again anywhere in this function. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k; \ istep_b = PACKNR * k_full; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ /* We don't bother querying the thrinfo_t node for the 1st loop because we can't parallelize that loop in trsm due to the inter-iteration dependencies that exist. */ \ /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ \ /* Query the number of threads and thread ids for each loop. */ \ dim_t jr_nt = bli_thread_n_way( thread ); \ dim_t jr_tid = bli_thread_work_id( thread ); \ \ dim_t jr_start, jr_end; \ dim_t jr_inc; \ \ /* Determine the thread range and increment for the 2nd loop. NOTE: The definition of bli_thread_range_jrir() will depend on whether slab or round-robin partitioning was requested at configure-time. NOTE: Parallelism in the 1st loop is unattainable due to the inter-iteration dependencies present in trsm. */ \ bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ b1 = b_cast + j * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. 
*/ \ b2 = b1; \ \ a1 = a_cast; \ c11 = c1 + (0 )*rstep_c; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ diagoffa_i = diagoffa + ( doff_t )i*MR; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* If the current panel of A intersects the diagonal, use a special micro-kernel that performs a fused gemm and trsm. If the current panel of A resides below the diagonal, use a regular gemm micro-kernel. Otherwise, if it is above the diagonal, it was not packed (because it is implicitly zero) and so we do nothing. */ \ if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ { \ ctype* restrict a10; \ ctype* restrict a11; \ ctype* restrict b01; \ ctype* restrict b11; \ ctype* restrict a2; \ \ /* Compute various offsets into and lengths of parts of A. */ \ off_a10 = 0; \ k_a1011 = diagoffa_i + MR; \ k_a10 = k_a1011 - MR; \ off_a11 = k_a10; \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ is_a_cur = k_a1011 * PACKMR; \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ ps_a_cur = is_a_cur; \ \ /* Compute the addresses of the panel A10 and the triangular block A11. */ \ a10 = a1; \ a11 = a1 + k_a10 * PACKMR; \ /*a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, 1 );*/ \ \ /* Compute the addresses of the panel B01 and the block B11. */ \ b01 = b1 + off_a10 * PACKNR; \ b11 = b1 + off_a11 * PACKNR; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1 + ps_a_cur; \ if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. 
*/ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ gemmtrsm_ukr \ ( \ m_cur, \ n_cur, \ k_a10, \ alpha1_cast, \ a10, \ a11, \ b01, \ b11, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ \ a1 += ps_a_cur; \ } \ else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ { \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1 + rstep_a; \ if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ m_cur, \ n_cur, \ k, \ minus_one, \ a1, \ b1, \ alpha2_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ \ a1 += rstep_a; \ } \ \ c11 += rstep_c; \ } \ } \ \ /* PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \ ( double* )a11, 1, PACKMR, "%4.1f", "" ); \ */ \ \ /* PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \ */ \ \ /* PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \ */ \ } INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/3/trsm/bli_trsm_lu_ker_var2.c000066400000000000000000000400331427272030600243310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffa, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha1, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* alpha2, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); /* Object-based entry point of the trsm_lu_ker_var2 macro-kernel. It reads the execution datatype of C, the diagonal offset of A, the pack schemas of A and B, the dimensions (m and n from C, k from the width of A), the buffers, strides and panel parameters of A, B and C, and the internal scalars attached to B and C, and then calls the datatype-specific function selected from ftypes[] by the execution datatype. The cntl argument is not referenced in this function. */ void bli_trsm_lu_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffa = bli_obj_diag_offset( a ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); void* buf_alpha1; void* buf_alpha2; FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar // attached to B (the non-triangular matrix). This will be the alpha // scalar used in the gemmtrsm subproblems (ie: the scalar that would // be applied to the packed copy of B prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( b ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only // subproblems that correspond to micro-panels that do not intersect // the diagonal. 
We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffa, schema_a, schema_b, m, n, k, buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_alpha2, buf_c, rs_c, cs_c, cntx, rntm, thread ); } /* Datatype-templated body of the trsm_lu_ker_var2 macro-kernel; the INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 ) line that follows the macro expands it (presumably once per supported datatype). For every micro-panel of B in the range returned by bli_thread_range_jrir(), it visits the row blocks of C from the last to the first (i = m_iter - 1 - ib), calling the fused gemmtrsm micro-kernel on panels of A that intersect the diagonal and the gemm micro-kernel (with a scalar of minus one) on panels strictly above the diagonal. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffa, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Cast the micro-kernel address to its function pointer type. */ \ PASTECH(ch,gemmtrsm_ukr_ft) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ /* ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ */ \ \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha1_cast = alpha1; \ ctype* restrict alpha2_cast = alpha2; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffa_i; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_a1112; \ dim_t k_a11; \ dim_t k_a12; \ dim_t off_a11; \ dim_t off_a12; \ dim_t i, j, ib; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t ps_a_cur; \ inc_t is_a_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If matrix A is below the diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ \ /* Compute k_full as k inflated up to a multiple of MR. This is needed because some parameter combinations of trsm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of B (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ \ /* If there is a zero region to the left of where the diagonal of A intersects the top edge of the block, adjust the pointer to B and treat this case as if the diagonal offset were zero. 
Note that we don't need to adjust the pointer to A since packm would have simply skipped over the region that was not stored. */ \ if ( diagoffa > 0 ) \ { \ i = diagoffa; \ k = k - i; \ diagoffa = 0; \ b_cast = b_cast + i * PACKNR; \ } \ \ /* If there is a zero region below where the diagonal of A intersects the right side of the block, shrink it to prevent "no-op" iterations from executing. */ \ if ( -diagoffa + k < m ) \ { \ m = -diagoffa + k; \ } \ \ /* Check the k dimension, which needs to be a multiple of MR. If k isn't a multiple of MR, we adjust it higher to satisfy the micro- kernel, which is expecting to perform an MR x MR triangular solve. This adjustment of k is consistent with what happened when A was packed: all of its bottom/right edges were zero-padded, and furthermore, the panel that stores the bottom-right corner of the matrix has its diagonal extended into the zero-padded region (as identity). This allows the trsm of that bottom-right panel to proceed without producing any infs or NaNs that would infect the "good" values of the corresponding block of B. */ \ if ( k % MR != 0 ) k += MR - ( k % MR ); \ \ /* NOTE: We don't need to check that m is a multiple of PACKMR since we know that the underlying buffer was already allocated to have an m dimension that is a multiple of PACKMR, with the region between the last row and the next multiple of MR zero-padded accordingly. */ \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. NOTE(review): istep_a is computed and rounded up to an even value below, but it is not read again anywhere in this function. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k; \ istep_b = PACKNR * k_full; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. 
*/ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ /* We don't bother querying the thrinfo_t node for the 1st loop because we can't parallelize that loop in trsm due to the inter-iteration dependencies that exist. */ \ /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ \ /* Query the number of threads and thread ids for each loop. */ \ dim_t jr_nt = bli_thread_n_way( thread ); \ dim_t jr_tid = bli_thread_work_id( thread ); \ \ dim_t jr_start, jr_end; \ dim_t jr_inc; \ \ /* Determine the thread range and increment for the 2nd loop. NOTE: The definition of bli_thread_range_jrir() will depend on whether slab or round-robin partitioning was requested at configure-time. NOTE: Parallelism in the 1st loop is unattainable due to the inter-iteration dependencies present in trsm. */ \ bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ b1 = b_cast + j * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ a1 = a_cast; \ c11 = c1 + (m_iter-1)*rstep_c; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( ib = 0; ib < m_iter; ++ib ) \ { \ i = m_iter - 1 - ib; \ diagoffa_i = diagoffa + ( doff_t )i*MR; \ \ m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \ \ /* If the current panel of A intersects the diagonal, use a special micro-kernel that performs a fused gemm and trsm. If the current panel of A resides above the diagonal, use a regular gemm micro-kernel. 
Otherwise, if it is below the diagonal, it was not packed (because it is implicitly zero) and so we do nothing. */ \ if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ { \ ctype* restrict a11; \ ctype* restrict a12; \ ctype* restrict b11; \ ctype* restrict b21; \ ctype* restrict a2; \ \ /* Compute various offsets into and lengths of parts of A. */ \ off_a11 = diagoffa_i; \ k_a1112 = k - off_a11;; \ k_a11 = MR; \ k_a12 = k_a1112 - MR; \ off_a12 = off_a11 + k_a11; \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ is_a_cur = k_a1112 * PACKMR; \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ ps_a_cur = is_a_cur; \ \ /* Compute the addresses of the triangular block A11 and the panel A12. */ \ a11 = a1; \ a12 = a1 + k_a11 * PACKMR; \ /*a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, 1 );*/ \ \ /* Compute the addresses of the block B11 and the panel B21. */ \ b11 = b1 + off_a11 * PACKNR; \ b21 = b1 + off_a12 * PACKNR; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1 + ps_a_cur; \ if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ gemmtrsm_ukr \ ( \ m_cur, \ n_cur, \ k_a12, \ alpha1_cast, \ a12, \ a11, \ b21, \ b11, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ \ a1 += ps_a_cur; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1 + rstep_a; \ if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. 
*/ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ m_cur, \ n_cur, \ k, \ minus_one, \ a1, \ b1, \ alpha2_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ \ a1 += rstep_a; \ } \ \ c11 -= rstep_c; \ } \ } \ \ /* PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \ printf( "m_iter = %lu\n", m_iter ); \ printf( "m_cur = %lu\n", m_cur ); \ printf( "k = %lu\n", k ); \ printf( "diagoffa_i = %lu\n", diagoffa_i ); \ printf( "off_a1112 = %lu\n", off_a1112 ); \ printf( "k_a1112 = %lu\n", k_a1112 ); \ printf( "k_a12 = %lu\n", k_a12 ); \ printf( "k_a11 = %lu\n", k_a11 ); \ printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \ printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \ */ \ \ /* PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \ */ \ } INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/3/trsm/bli_trsm_rl_ker_var2.c000066400000000000000000000404001427272030600243240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffb, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha1, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* alpha2, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); void bli_trsm_rl_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffb = bli_obj_diag_offset( b ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); void* buf_alpha1; void* buf_alpha2; FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar // attached to A (the non-triangular matrix). This will be the alpha // scalar used in the gemmtrsm subproblems (ie: the scalar that would // be applied to the packed copy of A prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only // subproblems that correspond to micro-panels that do not intersect // the diagonal. 
We need this separate scalar because it's possible // that the alpha attached to A was reset, if it was applied during // packing. buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_alpha2, buf_c, rs_c, cs_c, cntx, rntm, thread ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffb, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Cast the micro-kernel address to its function pointer type. */ \ /* NOTE: We use the upper-triangular gemmtrsm ukernel because, while the current macro-kernel targets the "rl" case (right-side/lower- triangular), it becomes upper-triangular after the kernel operation is transposed so that all kernel instances are of the "left" variety (since those are the only trsm ukernels that exist). */ \ PASTECH(ch,gemmtrsm_ukr_ft) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. 
*/ \ /* ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ */ \ \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha1_cast = alpha1; \ ctype* restrict alpha2_cast = alpha2; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffb_j; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_b1121; \ dim_t k_b11; \ dim_t k_b21; \ dim_t off_b11; \ dim_t off_b21; \ dim_t i, j, jb; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t ps_b_cur; \ inc_t is_b_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKNR pd_a == NR ps_a == stride to next micro-panel of A rs_b == PACKMR cs_b == 1 pd_b == MR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the swapping of values in the control tree (ie: those values used when packing). This swapping is needed since we cast right-hand trsm in terms of transposed left-hand trsm. So, if we're going to be transposing the operation, then A needs to be packed with NR and B needs to be packed with MR (remember: B is the triangular matrix in the right-hand side parameter case). */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. 
*/ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current panel of B is entirely above its diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ \ /* Compute k_full as k inflated up to a multiple of NR. This is needed because some parameter combinations of trsm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of B (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ \ /* If there is a zero region above where the diagonal of B intersects the left edge of the panel, adjust the pointer to A and treat this case as if the diagonal offset were zero. Note that we don't need to adjust the pointer to B since packm would have simply skipped over the region that was not stored. */ \ if ( diagoffb < 0 ) \ { \ j = -diagoffb; \ k = k - j; \ diagoffb = 0; \ a_cast = a_cast + j * PACKMR; \ } \ \ /* If there is a zero region to the right of where the diagonal of B intersects the bottom of the panel, shrink it so that we can index to the correct place in C (corresponding to the part of the panel of B that was packed). NOTE: This is NOT being done to skip over "no-op" iterations, as with the trsm_lu macro-kernel. This MUST be done for correct execution because we use n (via n_iter) to compute diagonal and index offsets for backwards movement through B. */ \ if ( diagoffb + k < n ) \ { \ n = diagoffb + k; \ } \ \ /* Check the k dimension, which needs to be a multiple of NR. If k isn't a multiple of NR, we adjust it higher to satisfy the micro- kernel, which is expecting to perform an NR x NR triangular solve. 
This adjustment of k is consistent with what happened when B was packed: all of its bottom/right edges were zero-padded, and furthermore, the panel that stores the bottom-right corner of the matrix has its diagonal extended into the zero-padded region (as identity). This allows the trsm of that bottom-right panel to proceed without producing any infs or NaNs that would infect the "good" values of the corresponding block of A. */ \ if ( k % NR != 0 ) k += NR - ( k % NR ); \ \ /* NOTE: We don't need to check that n is a multiple of PACKNR since we know that the underlying buffer was already allocated to have an n dimension that is a multiple of PACKNR, with the region between the last column and the next multiple of NR zero-padded accordingly. */ \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k_full; \ istep_b = PACKNR * k; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_schema_a( schema_b, &aux ); \ bli_auxinfo_set_schema_b( schema_a, &aux ); \ \ /* Save the imaginary stride of A to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_is_b( istep_a, &aux ); \ \ b1 = b_cast; \ c1 = c_cast; \ \ /* Loop over the n dimension (NR columns at a time). 
*/ \ for ( jb = 0; jb < n_iter; ++jb ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b11; \ ctype* restrict b21; \ ctype* restrict b2; \ \ j = n_iter - 1 - jb; \ diagoffb_j = diagoffb - ( doff_t )j*NR; \ a1 = a_cast; \ c11 = c1 + (n_iter-1)*cstep_c; \ \ n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* If the current panel of B intersects the diagonal, use a special micro-kernel that performs a fused gemm and trsm. If the current panel of B resides below the diagonal, use a regular gemm micro-kernel. Otherwise, if it is above the diagonal, it was not packed (because it is implicitly zero) and so we do nothing. */ \ if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ { \ /* Determine the offset to and length of the panel that was packed so we can index into the corresponding location in A. */ \ off_b11 = bli_max( -diagoffb_j, 0 ); \ k_b1121 = k - off_b11; \ k_b11 = NR; \ k_b21 = k_b1121 - NR; \ off_b21 = off_b11 + k_b11; \ \ /* Compute the addresses of the triangular block B11 and the panel B21. */ \ b11 = b1; \ b21 = b1 + k_b11 * PACKNR; \ /*b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 );*/ \ \ /* Compute the panel stride for the current micro-panel. */ \ is_b_cur = k_b1121 * PACKNR; \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ ps_b_cur = is_b_cur; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ if ( bli_trsm_my_iter_rr( i, thread ) ){ \ \ ctype* restrict a11; \ ctype* restrict a12; \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the A11 block and A12 panel. */ \ a11 = a1 + off_b11 * PACKMR; \ a12 = a1 + off_b21 * PACKMR; \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ a2 = a1; \ /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ if ( i + bli_thread_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + ps_b_cur; \ if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_next_a( b2, &aux ); \ bli_auxinfo_set_next_b( a2, &aux ); \ \ gemmtrsm_ukr \ ( \ m_cur, \ n_cur, \ k_b21, \ alpha1_cast, \ b21, \ b11, \ a12, \ a11, \ c11, cs_c, rs_c, \ &aux, \ cntx \ ); \ \ } \ \ a1 += rstep_a; \ c11 += rstep_c; \ } \ \ b1 += ps_b_cur; \ } \ else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \ { \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ if ( bli_trsm_my_iter_rr( i, thread ) ){ \ \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ if ( i + bli_thread_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + cstep_b; \ if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_next_a( b2, &aux ); \ bli_auxinfo_set_next_b( a2, &aux ); \ \ /* Invoke the gemm micro-kernel. 
*/ \ gemm_ukr \ ( \ m_cur, \ n_cur, \ k, \ minus_one, \ b1, \ a1, \ alpha2_cast, \ c11, cs_c, rs_c, \ &aux, \ cntx \ ); \ \ } \ \ a1 += rstep_a; \ c11 += rstep_c; \ } \ \ b1 += cstep_b; \ } \ \ c1 -= cstep_c; \ } \ } INSERT_GENTFUNC_BASIC0( trsm_rl_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/3/trsm/bli_trsm_ru_ker_var2.c000066400000000000000000000376161427272030600243540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffb, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha1, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* alpha2, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); void bli_trsm_ru_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffb = bli_obj_diag_offset( b ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); void* buf_alpha1; void* buf_alpha2; FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar // attached to A (the non-triangular matrix). 
This will be the alpha // scalar used in the gemmtrsm subproblems (ie: the scalar that would // be applied to the packed copy of A prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only // subproblems that correspond to micro-panels that do not intersect // the diagonal. We need this separate scalar because it's possible // that the alpha attached to A was reset, if it was applied during // packing. buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_alpha2, buf_c, rs_c, cs_c, cntx, rntm, thread ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffb, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Cast the micro-kernel address to its function pointer type. */ \ /* NOTE: We use the lower-triangular gemmtrsm ukernel because, while the current macro-kernel targets the "ru" case (right-side/upper- triangular), it becomes lower-triangular after the kernel operation is transposed so that all kernel instances are of the "left" variety (since those are the only trsm ukernels that exist). 
*/ \ PASTECH(ch,gemmtrsm_ukr_ft) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ /* ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ */ \ \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha1_cast = alpha1; \ ctype* restrict alpha2_cast = alpha2; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffb_j; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_b0111; \ dim_t k_b01; \ dim_t off_b01; \ dim_t off_b11; \ dim_t i, j; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t ps_b_cur; \ inc_t is_b_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKNR pd_a == NR ps_a == stride to next micro-panel of A rs_b == PACKMR cs_b == 1 pd_b == MR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the swapping of values in the control tree (ie: those values used when packing). This swapping is needed since we cast right-hand trsm in terms of transposed left-hand trsm. 
So, if we're going to be transposing the operation, then A needs to be packed with NR and B needs to be packed with MR (remember: B is the triangular matrix in the right-hand side parameter case). */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current panel of B is entirely below its diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ \ /* Compute k_full as k inflated up to a multiple of NR. This is needed because some parameter combinations of trsm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of B (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ \ /* If there is a zero region to the left of where the diagonal of B intersects the top edge of the panel, adjust the pointer to C and treat this case as if the diagonal offset were zero. This skips over the region that was not packed. (Note we assume the diagonal offset is a multiple of MR; this assumption will hold as long as the cache blocksizes are each a multiple of MR and NR.) */ \ if ( diagoffb > 0 ) \ { \ j = diagoffb; \ n = n - j; \ diagoffb = 0; \ c_cast = c_cast + (j )*cs_c; \ } \ \ /* If there is a zero region below where the diagonal of B intersects the right side of the block, shrink it to prevent "no-op" iterations from executing. */ \ if ( -diagoffb + n < k ) \ { \ k = -diagoffb + n; \ } \ \ /* Check the k dimension, which needs to be a multiple of NR. 
If k isn't a multiple of NR, we adjust it higher to satisfy the micro- kernel, which is expecting to perform an NR x NR triangular solve. This adjustment of k is consistent with what happened when B was packed: all of its bottom/right edges were zero-padded, and furthermore, the panel that stores the bottom-right corner of the matrix has its diagonal extended into the zero-padded region (as identity). This allows the trsm of that bottom-right panel to proceed without producing any infs or NaNs that would infect the "good" values of the corresponding block of A. */ \ if ( k % NR != 0 ) k += NR - ( k % NR ); \ \ /* NOTE: We don't need to check that n is a multiple of PACKNR since we know that the underlying buffer was already allocated to have an n dimension that is a multiple of PACKNR, with the region between the last column and the next multiple of NR zero-padded accordingly. */ \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k_full; \ istep_b = PACKNR * k; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_schema_a( schema_b, &aux ); \ bli_auxinfo_set_schema_b( schema_a, &aux ); \ \ /* Save the imaginary stride of A to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_is_b( istep_a, &aux ); \ \ b1 = b_cast; \ c1 = c_cast; \ \ /* Loop over the n dimension (NR columns at a time). 
*/ \ for ( j = 0; j < n_iter; ++j ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b01; \ ctype* restrict b11; \ ctype* restrict b2; \ \ diagoffb_j = diagoffb - ( doff_t )j*NR; \ a1 = a_cast; \ c11 = c1; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* If the current panel of B intersects the diagonal, use a special micro-kernel that performs a fused gemm and trsm. If the current panel of B resides above the diagonal, use a regular gemm micro-kernel. Otherwise, if it is below the diagonal, it was not packed (because it is implicitly zero) and so we do nothing. */ \ if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ { \ /* Determine the offset to and length of the panel that was packed so we can index into the corresponding location in A. */ \ off_b01 = 0; \ k_b0111 = bli_min( k, -diagoffb_j + NR ); \ k_b01 = k_b0111 - NR; \ off_b11 = k_b01; \ \ /* Compute the addresses of the panel B01 and the triangular block B11. */ \ b01 = b1; \ b11 = b1 + k_b01 * PACKNR; \ /*b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 );*/ \ \ /* Compute the panel stride for the current micro-panel. */ \ is_b_cur = k_b0111 * PACKNR; \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ ps_b_cur = is_b_cur; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ if ( bli_trsm_my_iter_rr( i, thread ) ){ \ \ ctype* restrict a10; \ ctype* restrict a11; \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the A10 panel and A11 block. */ \ a10 = a1 + off_b01 * PACKMR; \ a11 = a1 + off_b11 * PACKMR; \ \ /* Compute the addresses of the next panels of A and B. 
*/ \ a2 = a1; \ /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ if ( i + bli_thread_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + ps_b_cur; \ if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_next_a( b2, &aux ); \ bli_auxinfo_set_next_b( a2, &aux ); \ \ gemmtrsm_ukr \ ( \ m_cur, \ n_cur, \ k_b01, \ alpha1_cast, \ b01, \ b11, \ a10, \ a11, \ c11, cs_c, rs_c, \ &aux, \ cntx \ ); \ \ } \ \ a1 += rstep_a; \ c11 += rstep_c; \ } \ \ b1 += ps_b_cur; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \ { \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ if ( bli_trsm_my_iter_rr( i, thread ) ){ \ \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ if ( i + bli_thread_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + cstep_b; \ if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_next_a( b2, &aux ); \ bli_auxinfo_set_next_b( a2, &aux ); \ \ /* Invoke the gemm micro-kernel. 
*/ \ gemm_ukr \ ( \ m_cur, \ n_cur, \ k, \ minus_one, \ b1, \ a1, \ alpha2_cast, \ c11, cs_c, rs_c, \ &aux, \ cntx \ ); \ \ } \ \ a1 += rstep_a; \ c11 += rstep_c; \ } \ \ b1 += cstep_b; \ } \ \ c1 += cstep_c; \ } \ } INSERT_GENTFUNC_BASIC0( trsm_ru_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/3/trsm/bli_trsm_var.h000066400000000000000000000057531427272030600227250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/3/trsm/bli_trsm_xx_ker_var2.c000066400000000000000000000054231427272030600243540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" static l3_var_oft vars[2][2] = { { bli_trsm_ll_ker_var2, bli_trsm_lu_ker_var2 }, { bli_trsm_rl_ker_var2, bli_trsm_ru_ker_var2 } }; void bli_trsm_xx_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { dim_t side; dim_t uplo; l3_var_oft f; // Set two bools: one based on the implied side parameter (the structure // of the root object) and one based on the uplo field of the triangular // matrix's root object (whether that is matrix A or matrix B). 
if ( bli_obj_root_is_triangular( a ) ) { side = 0; if ( bli_obj_root_is_lower( a ) ) uplo = 0; else uplo = 1; } else // if ( bli_obj_root_is_triangular( b ) ) { side = 1; if ( bli_obj_root_is_lower( b ) ) uplo = 0; else uplo = 1; } // Index into the variant array to extract the correct function pointer. f = vars[side][uplo]; // Call the macrokernel. f ( a, b, c, cntx, rntm, cntl, thread ); } cython-blis-0.9.1/blis/_src/frame/3/trsm/other/000077500000000000000000000000001427272030600212005ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/3/trsm/other/bli_trsm_ll_ker_var2.c000066400000000000000000000464421427272030600254530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

#define FUNCPTR_T gemm_fp

// Signature shared by the datatype-specific macro-kernels generated by the
// GENTFUNC macro further down in this file.
typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffa,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha1,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   alpha2,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Table of macro-kernel function pointers; it is indexed by the execution
// datatype below.
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2);


// Object-based entry point of this macro-kernel. It reads the diagonal
// offset, pack schemas, dimensions, buffers, strides and panel parameters
// out of the objects a, b and c, fetches the internal scalars attached to
// b and c, and forwards everything to the typed macro-kernel selected by
// the execution datatype of c. The cntl argument is not referenced in this
// function.
void bli_trsm_ll_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
    num_t dt_exec = bli_obj_exec_dt( c );

    doff_t diagoffa = bli_obj_diag_offset( a );

    pack_t schema_a = bli_obj_pack_schema( a );
    pack_t schema_b = bli_obj_pack_schema( b );

    // m and n come from C; k is the width of A.
    dim_t m = bli_obj_length( c );
    dim_t n = bli_obj_width( c );
    dim_t k = bli_obj_width( a );

    void* buf_a = bli_obj_buffer_at_off( a );
    inc_t cs_a = bli_obj_col_stride( a );
    dim_t pd_a = bli_obj_panel_dim( a );
    inc_t ps_a = bli_obj_panel_stride( a );

    void* buf_b = bli_obj_buffer_at_off( b );
    inc_t rs_b = bli_obj_row_stride( b );
    dim_t pd_b = bli_obj_panel_dim( b );
    inc_t ps_b = bli_obj_panel_stride( b );

    void* buf_c = bli_obj_buffer_at_off( c );
    inc_t rs_c = bli_obj_row_stride( c );
    inc_t cs_c = bli_obj_col_stride( c );

    void* buf_alpha1;
    void* buf_alpha2;

    FUNCPTR_T f;

    // Grab the address of the internal scalar buffer for the scalar
    // attached to B (the non-triangular matrix). This will be the alpha
    // scalar used in the gemmtrsm subproblems (ie: the scalar that would
    // be applied to the packed copy of B prior to it being updated by
    // the trsm subproblem). This scalar may be unit, if for example it
    // was applied during packing.
    buf_alpha1 = bli_obj_internal_scalar_buffer( b );

    // Grab the address of the internal scalar buffer for the scalar
    // attached to C. This will be the "beta" scalar used in the gemm-only
    // subproblems that correspond to micro-panels that do not intersect
    // the diagonal. We need this separate scalar because it's possible
    // that the alpha attached to B was reset, if it was applied during
    // packing.
    buf_alpha2 = bli_obj_internal_scalar_buffer( c );

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_exec];

    // Invoke the function.
    f( diagoffa,
       schema_a,
       schema_b,
       m,
       n,
       k,
       buf_alpha1,
       buf_a, cs_a, pd_a, ps_a,
       buf_b, rs_b, pd_b, ps_b,
       buf_alpha2,
       buf_c, rs_c, cs_c,
       cntx,
       rntm,
       thread );
}


// Typed macro-kernel body. For every NR-wide column panel that
// bli_trsm_my_iter() reports as belonging to this thread, it walks the
// MR-tall micro-panels of A from top to bottom: micro-panels intersecting
// the diagonal go through the fused gemmtrsm micro-kernel, micro-panels
// strictly below the diagonal go through the plain gemm micro-kernel, and
// anything else is skipped. NOTE(review): INSERT_GENTFUNC_BASIC0 at the
// bottom presumably instantiates this body once per datatype; confirm
// against the macro's definition.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffa, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha1, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   alpha2, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Alias some constants to simpler names. */ \
    const dim_t MR = pd_a; \
    const dim_t NR = pd_b; \
    const dim_t PACKMR = cs_a; \
    const dim_t PACKNR = rs_b; \
\
    /* Cast the micro-kernel address to its function pointer type. */ \
    PASTECH(ch,gemmtrsm_ukr_ft) \
    gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
    PASTECH(ch,gemm_ukr_ft) \
    gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
    /* Temporary C buffer for edge cases. Note that the strides of this
       temporary buffer follow the storage preference of the gemm
       micro-kernel (col_pref below), not the strides of C: unit row
       stride and column stride MR if the micro-kernel prefers columns,
       otherwise row stride NR and unit column stride. */ \
    ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
              / sizeof( ctype ) ] \
              __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
    const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
    const inc_t rs_ct = ( col_pref ? 1 : NR ); \
    const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
    ctype* restrict zero = PASTEMAC(ch,0); \
    ctype* restrict minus_one = PASTEMAC(ch,m1); \
    ctype* restrict a_cast = a; \
    ctype* restrict b_cast = b; \
    ctype* restrict c_cast = c; \
    ctype* restrict alpha1_cast = alpha1; \
    ctype* restrict alpha2_cast = alpha2; \
    ctype* restrict b1; \
    ctype* restrict c1; \
\
    doff_t diagoffa_i; \
    dim_t k_full; \
    dim_t m_iter, m_left; \
    dim_t n_iter, n_left; \
    dim_t m_cur; \
    dim_t n_cur; \
    dim_t k_a1011; \
    dim_t k_a10; \
    dim_t off_a10; \
    dim_t off_a11; \
    dim_t i, j; \
    inc_t rstep_a; \
    inc_t cstep_b; \
    inc_t rstep_c, cstep_c; \
    inc_t istep_a; \
    inc_t istep_b; \
    inc_t off_scl; \
    inc_t ss_a_num; \
    inc_t ss_a_den; \
    inc_t ps_a_cur; \
    inc_t is_a_cur; \
    auxinfo_t aux; \
\
    /*
       Assumptions/assertions:
         rs_a == 1
         cs_a == PACKMR
         pd_a == MR
         ps_a == stride to next micro-panel of A
         rs_b == PACKNR
         cs_b == 1
         pd_b == NR
         ps_b == stride to next micro-panel of B
         rs_c == (no assumptions)
         cs_c == (no assumptions)
    */ \
\
    /* Safety trap: Certain indexing within this macro-kernel does not
       work as intended if both PACKMR and NR are odd, or if both PACKNR
       and MR are odd. */ \
    if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
         ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
    /* If any dimension is zero, return immediately. */ \
    if ( bli_zero_dim3( m, n, k ) ) return; \
\
    /* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
       So we do nothing. */ \
    if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
    /* Compute k_full as k inflated up to a multiple of MR. This is
       needed because some parameter combinations of trsm reduce k
       to advance past zero regions in the triangular matrix, and
       when computing the imaginary stride of B (the non-triangular
       matrix), which is used by 4m1/3m1 implementations, we need
       this unreduced value of k. */ \
    k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
    /* Compute indexing scaling factor for 4m or 3m. This is
       needed because one of the packing register blocksizes (PACKMR
       or PACKNR) is used to index into the micro-panels of the non-
       triangular matrix when computing with a diagonal-intersecting
       micro-panel of the triangular matrix. In the case of 4m or 3m,
       real values are stored in both sub-panels, and so the indexing
       needs to occur in units of real values. The value computed
       here is divided into the complex pointer offset to cause the
       pointer to be advanced by the correct value. */ \
    if ( bli_is_4mi_packed( schema_a ) || \
         bli_is_3mi_packed( schema_a ) || \
         bli_is_rih_packed( schema_a ) ) off_scl = 2; \
    else off_scl = 1; \
\
    /* Compute the storage stride scaling. Usually this is just 1.
       However, in the case of interleaved 3m, we need to scale the
       offset by 3/2. Note that real-only, imag-only, and summed-only
       packing formats are not applicable here since trsm is a two-
       operand operation only (unlike trmm, which is capable of three-
       operand). */ \
    if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
    else { ss_a_num = 1; ss_a_den = 1; } \
\
    /* If there is a zero region above where the diagonal of A intersects
       the left edge of the block, adjust the pointer to C and treat this
       case as if the diagonal offset were zero. This skips over the region
       that was not packed. (Note we assume the diagonal offset is a multiple
       of MR; this assumption will hold as long as the cache blocksizes are
       each a multiple of MR and NR.) */ \
    if ( diagoffa < 0 ) \
    { \
        i = -diagoffa; \
        m = m - i; \
        diagoffa = 0; \
        c_cast = c_cast + (i )*rs_c; \
    } \
\
    /* Check the k dimension, which needs to be a multiple of MR. If k
       isn't a multiple of MR, we adjust it higher to satisfy the micro-
       kernel, which is expecting to perform an MR x MR triangular solve.
       This adjustment of k is consistent with what happened when A was
       packed: all of its bottom/right edges were zero-padded, and
       furthermore, the panel that stores the bottom-right corner of the
       matrix has its diagonal extended into the zero-padded region (as
       identity). This allows the trsm of that bottom-right panel to
       proceed without producing any infs or NaNs that would infect the
       "good" values of the corresponding block of B. */ \
    if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
    /* NOTE: We don't need to check that m is a multiple of PACKMR since we
       know that the underlying buffer was already allocated to have an m
       dimension that is a multiple of PACKMR, with the region between the
       last row and the next multiple of MR zero-padded accordingly. */ \
\
    /* Clear the temporary C buffer in case it has any infs or NaNs. */ \
    PASTEMAC(ch,set0s_mxn)( MR, NR, \
                            ct, rs_ct, cs_ct ); \
\
    /* Compute number of primary and leftover components of the m and n
       dimensions. */ \
    n_iter = n / NR; \
    n_left = n % NR; \
\
    m_iter = m / MR; \
    m_left = m % MR; \
\
    if ( n_left ) ++n_iter; \
    if ( m_left ) ++m_iter; \
\
    /* Determine some increments used to step through A, B, and C. */ \
    rstep_a = ps_a; \
\
    cstep_b = ps_b; \
\
    rstep_c = rs_c * MR; \
    cstep_c = cs_c * NR; \
\
    istep_a = PACKMR * k; \
    istep_b = PACKNR * k_full; \
\
    /* Round odd imaginary strides up to the next even value. */ \
    if ( bli_is_odd( istep_a ) ) istep_a += 1; \
    if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
    /* Save the pack schemas of A and B to the auxinfo_t object. */ \
    bli_auxinfo_set_schema_a( schema_a, &aux ); \
    bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
    /* Save the imaginary stride of B to the auxinfo_t object. */ \
    bli_auxinfo_set_is_b( istep_b, &aux ); \
\
    b1 = b_cast; \
    c1 = c_cast; \
\
    /* Loop over the n dimension (NR columns at a time). */ \
    for ( j = 0; j < n_iter; ++j ) \
    { \
        /* Skip column panels that bli_trsm_my_iter() does not report as
           belonging to this thread; b1 and c1 are still advanced at the
           bottom of the loop for every j. */ \
        if( bli_trsm_my_iter( j, thread ) ) { \
\
        ctype* restrict a1; \
        ctype* restrict c11; \
        ctype* restrict b2; \
\
        a1 = a_cast; \
        c11 = c1 + (0 )*rstep_c; \
\
        n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
        /* Initialize our next panel of B to be the current panel of B. */ \
        b2 = b1; \
\
        /* Loop over the m dimension (MR rows at a time). */ \
        for ( i = 0; i < m_iter; ++i ) \
        { \
            diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
            m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
            /* If the current panel of A intersects the diagonal, use a
               special micro-kernel that performs a fused gemm and trsm.
               If the current panel of A resides below the diagonal, use a
               regular gemm micro-kernel. Otherwise, if it is above the
               diagonal, it was not packed (because it is implicitly zero)
               and so we do nothing. */ \
            if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
            { \
                ctype* restrict a10; \
                ctype* restrict a11; \
                ctype* restrict b01; \
                ctype* restrict b11; \
                ctype* restrict a2; \
\
                /* Compute various offsets into and lengths of parts of A. */ \
                off_a10 = 0; \
                k_a1011 = diagoffa_i + MR; \
                k_a10 = k_a1011 - MR; \
                off_a11 = k_a10; \
\
                /* Compute the panel stride for the current diagonal-
                   intersecting micro-panel. */ \
                is_a_cur = k_a1011 * PACKMR; \
                is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
                ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
                /* Compute the addresses of the panel A10 and the triangular
                   block A11. */ \
                a10 = a1; \
                /* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \
                a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \
\
                /* Compute the addresses of the panel B01 and the block
                   B11. */ \
                b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \
                b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
\
                /* Compute the addresses of the next panels of A and B. */ \
                a2 = a1 + ps_a_cur; \
                if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
                { \
                    a2 = a_cast; \
                    b2 = b1; \
                    /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
                    /* NOTE(review): this presumably detects the last column
                       panel handled by this thread; confirm that it agrees
                       with the assignment made by bli_trsm_my_iter(). */ \
                    if ( j + bli_thread_num_threads(thread) >= n_iter ) \
                        b2 = b_cast; \
                } \
\
                /* Save addresses of next panels of A and B to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_next_a( a2, &aux ); \
                bli_auxinfo_set_next_b( b2, &aux ); \
\
                /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
                /* Handle interior and edge cases separately. */ \
                if ( m_cur == MR && n_cur == NR ) \
                { \
                    /* Invoke the fused gemm/trsm micro-kernel. */ \
                    gemmtrsm_ukr \
                    ( \
                      k_a10, \
                      alpha1_cast, \
                      a10, \
                      a11, \
                      b01, \
                      b11, \
                      c11, rs_c, cs_c, \
                      &aux, \
                      cntx \
                    ); \
                } \
                else \
                { \
                    /* Invoke the fused gemm/trsm micro-kernel, directing its
                       output to the temporary buffer ct. */ \
                    gemmtrsm_ukr \
                    ( \
                      k_a10, \
                      alpha1_cast, \
                      a10, \
                      a11, \
                      b01, \
                      b11, \
                      ct, rs_ct, cs_ct, \
                      &aux, \
                      cntx \
                    ); \
\
                    /* Copy the result to the bottom edge of C. */ \
                    PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
                                            ct, rs_ct, cs_ct, \
                                            c11, rs_c, cs_c ); \
                } \
\
                a1 += ps_a_cur; \
            } \
            else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
            { \
                ctype* restrict a2; \
\
                /* Compute the addresses of the next panels of A and B. */ \
                a2 = a1 + rstep_a; \
                if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) \
                { \
                    a2 = a_cast; \
                    b2 = b1; \
                    /*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
                    if ( j + bli_thread_num_threads(thread) >= n_iter ) \
                        b2 = b_cast; \
                } \
\
                /* Save addresses of next panels of A and B to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_next_a( a2, &aux ); \
                bli_auxinfo_set_next_b( b2, &aux ); \
\
                /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_is_a( istep_a, &aux ); \
\
                /* Handle interior and edge cases separately. */ \
                if ( m_cur == MR && n_cur == NR ) \
                { \
                    /* Invoke the gemm micro-kernel. */ \
                    gemm_ukr \
                    ( \
                      k, \
                      minus_one, \
                      a1, \
                      b1, \
                      alpha2_cast, \
                      c11, rs_c, cs_c, \
                      &aux, \
                      cntx \
                    ); \
                } \
                else \
                { \
                    /* Invoke the gemm micro-kernel, writing into ct with a
                       zero scalar in place of alpha2. */ \
                    gemm_ukr \
                    ( \
                      k, \
                      minus_one, \
                      a1, \
                      b1, \
                      zero, \
                      ct, rs_ct, cs_ct, \
                      &aux, \
                      cntx \
                    ); \
\
                    /* Add the result to the edge of C. */ \
                    PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
                                            ct, rs_ct, cs_ct, \
                                            alpha2_cast, \
                                            c11, rs_c, cs_c ); \
                } \
\
                a1 += rstep_a; \
            } \
\
            c11 += rstep_c; \
        } \
        } \
\
        b1 += cstep_b; \
        c1 += cstep_c; \
    } \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \
                     ( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \
                     ( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \
                     ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \
                     ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
*/ \
\
/*
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
                     ( double* )a11, 1, PACKMR, "%4.1f", "" ); \
*/ \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \
                     ( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \
                     ( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \
                     ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \
                     ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \
                     ( double* )c, 1, cs_c, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \
                     ( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
*/ \
}

INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 )

cython-blis-0.9.1/blis/_src/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c000066400000000000000000000477131427272030600260210ustar00rootroot00000000000000/*

BLIS
An object-based framework for developing high-performance BLAS-like
libraries.

Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
 - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
 - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
 - Neither the name(s) of the copyright holder(s) nor the names of its
   contributors may be used to endorse or promote products derived
   from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

#define FUNCPTR_T gemm_fp

// Signature shared by the datatype-specific macro-kernels generated by the
// GENTFUNC macro further down in this file.
typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffa,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha1,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   alpha2,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Table of macro-kernel function pointers; it is indexed by the execution
// datatype below.
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2rr);

//
// -- Macrokernel functions for round-robin partitioning -----------------------
//

// Object-based entry point of this macro-kernel. It reads the diagonal
// offset, pack schemas, dimensions, buffers, strides and panel parameters
// out of the objects a, b and c, fetches the internal scalars attached to
// b and c, and forwards everything to the typed macro-kernel selected by
// the execution datatype of c. The cntl argument is not referenced in this
// function.
void bli_trsm_ll_ker_var2rr
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
    num_t dt_exec = bli_obj_exec_dt( c );

    doff_t diagoffa = bli_obj_diag_offset( a );

    pack_t schema_a = bli_obj_pack_schema( a );
    pack_t schema_b = bli_obj_pack_schema( b );

    // m and n come from C; k is the width of A.
    dim_t m = bli_obj_length( c );
    dim_t n = bli_obj_width( c );
    dim_t k = bli_obj_width( a );

    void* buf_a = bli_obj_buffer_at_off( a );
    inc_t cs_a = bli_obj_col_stride( a );
    dim_t pd_a = bli_obj_panel_dim( a );
    inc_t ps_a = bli_obj_panel_stride( a );

    void* buf_b = bli_obj_buffer_at_off( b );
    inc_t rs_b = bli_obj_row_stride( b );
    dim_t pd_b = bli_obj_panel_dim( b );
    inc_t ps_b = bli_obj_panel_stride( b );

    void* buf_c = bli_obj_buffer_at_off( c );
    inc_t rs_c = bli_obj_row_stride( c );
    inc_t cs_c = bli_obj_col_stride( c );

    void* buf_alpha1;
    void* buf_alpha2;

    FUNCPTR_T f;

    // Grab the address of the internal scalar buffer for the scalar
    // attached to B (the non-triangular matrix). This will be the alpha
    // scalar used in the gemmtrsm subproblems (ie: the scalar that would
    // be applied to the packed copy of B prior to it being updated by
    // the trsm subproblem). This scalar may be unit, if for example it
    // was applied during packing.
    buf_alpha1 = bli_obj_internal_scalar_buffer( b );

    // Grab the address of the internal scalar buffer for the scalar
    // attached to C. This will be the "beta" scalar used in the gemm-only
    // subproblems that correspond to micro-panels that do not intersect
    // the diagonal. We need this separate scalar because it's possible
    // that the alpha attached to B was reset, if it was applied during
    // packing.
    buf_alpha2 = bli_obj_internal_scalar_buffer( c );

    // Index into the type combination array to extract the correct
    // function pointer.
    f = ftypes[dt_exec];

    // Invoke the function.
    f( diagoffa,
       schema_a,
       schema_b,
       m,
       n,
       k,
       buf_alpha1,
       buf_a, cs_a, pd_a, ps_a,
       buf_b, rs_b, pd_b, ps_b,
       buf_alpha2,
       buf_c, rs_c, cs_c,
       cntx,
       rntm,
       thread );
}


// Typed macro-kernel body. The range of NR-wide column panels handled by
// this thread (jr_start, jr_end, jr_inc) is obtained from
// bli_thread_range_jrir_rr(). For each such panel it walks the MR-tall
// micro-panels of A from top to bottom: micro-panels intersecting the
// diagonal go through the fused gemmtrsm micro-kernel, micro-panels
// strictly below the diagonal go through the plain gemm micro-kernel, and
// anything else is skipped. NOTE(review): INSERT_GENTFUNC_BASIC0 at the
// bottom presumably instantiates this body once per datatype; confirm
// against the macro's definition.
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffa, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha1, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   alpha2, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    /* Alias some constants to simpler names. */ \
    const dim_t MR = pd_a; \
    const dim_t NR = pd_b; \
    const dim_t PACKMR = cs_a; \
    const dim_t PACKNR = rs_b; \
\
    /* Cast the micro-kernel address to its function pointer type. */ \
    PASTECH(ch,gemmtrsm_ukr_ft) \
    gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
    PASTECH(ch,gemm_ukr_ft) \
    gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
    /* Temporary C buffer for edge cases. Note that the strides of this
       temporary buffer follow the storage preference of the gemm
       micro-kernel (col_pref below), not the strides of C: unit row
       stride and column stride MR if the micro-kernel prefers columns,
       otherwise row stride NR and unit column stride. */ \
    ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
              / sizeof( ctype ) ] \
              __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
    const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
    const inc_t rs_ct = ( col_pref ? 1 : NR ); \
    const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
    ctype* restrict zero = PASTEMAC(ch,0); \
    ctype* restrict minus_one = PASTEMAC(ch,m1); \
    ctype* restrict a_cast = a; \
    ctype* restrict b_cast = b; \
    ctype* restrict c_cast = c; \
    ctype* restrict alpha1_cast = alpha1; \
    ctype* restrict alpha2_cast = alpha2; \
    ctype* restrict b1; \
    ctype* restrict c1; \
\
    doff_t diagoffa_i; \
    dim_t k_full; \
    dim_t m_iter, m_left; \
    dim_t n_iter, n_left; \
    dim_t m_cur; \
    dim_t n_cur; \
    dim_t k_a1011; \
    dim_t k_a10; \
    dim_t off_a10; \
    dim_t off_a11; \
    dim_t i, j; \
    inc_t rstep_a; \
    inc_t cstep_b; \
    inc_t rstep_c, cstep_c; \
    inc_t istep_a; \
    inc_t istep_b; \
    inc_t off_scl; \
    inc_t ss_a_num; \
    inc_t ss_a_den; \
    inc_t ps_a_cur; \
    inc_t is_a_cur; \
    auxinfo_t aux; \
\
    /*
       Assumptions/assertions:
         rs_a == 1
         cs_a == PACKMR
         pd_a == MR
         ps_a == stride to next micro-panel of A
         rs_b == PACKNR
         cs_b == 1
         pd_b == NR
         ps_b == stride to next micro-panel of B
         rs_c == (no assumptions)
         cs_c == (no assumptions)
    */ \
\
    /* Safety trap: Certain indexing within this macro-kernel does not
       work as intended if both PACKMR and NR are odd, or if both PACKNR
       and MR are odd. */ \
    if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
         ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
    /* If any dimension is zero, return immediately. */ \
    if ( bli_zero_dim3( m, n, k ) ) return; \
\
    /* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
       So we do nothing. */ \
    if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
    /* Compute k_full as k inflated up to a multiple of MR. This is
       needed because some parameter combinations of trsm reduce k
       to advance past zero regions in the triangular matrix, and
       when computing the imaginary stride of B (the non-triangular
       matrix), which is used by 4m1/3m1 implementations, we need
       this unreduced value of k. */ \
    k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
    /* Compute indexing scaling factor for 4m or 3m. This is
       needed because one of the packing register blocksizes (PACKMR
       or PACKNR) is used to index into the micro-panels of the non-
       triangular matrix when computing with a diagonal-intersecting
       micro-panel of the triangular matrix. In the case of 4m or 3m,
       real values are stored in both sub-panels, and so the indexing
       needs to occur in units of real values. The value computed
       here is divided into the complex pointer offset to cause the
       pointer to be advanced by the correct value. */ \
    if ( bli_is_4mi_packed( schema_a ) || \
         bli_is_3mi_packed( schema_a ) || \
         bli_is_rih_packed( schema_a ) ) off_scl = 2; \
    else off_scl = 1; \
\
    /* Compute the storage stride scaling. Usually this is just 1.
       However, in the case of interleaved 3m, we need to scale the
       offset by 3/2. Note that real-only, imag-only, and summed-only
       packing formats are not applicable here since trsm is a two-
       operand operation only (unlike trmm, which is capable of three-
       operand). */ \
    if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
    else { ss_a_num = 1; ss_a_den = 1; } \
\
    /* If there is a zero region above where the diagonal of A intersects
       the left edge of the block, adjust the pointer to C and treat this
       case as if the diagonal offset were zero. This skips over the region
       that was not packed. (Note we assume the diagonal offset is a multiple
       of MR; this assumption will hold as long as the cache blocksizes are
       each a multiple of MR and NR.) */ \
    if ( diagoffa < 0 ) \
    { \
        i = -diagoffa; \
        m = m - i; \
        diagoffa = 0; \
        c_cast = c_cast + (i )*rs_c; \
    } \
\
    /* Check the k dimension, which needs to be a multiple of MR. If k
       isn't a multiple of MR, we adjust it higher to satisfy the micro-
       kernel, which is expecting to perform an MR x MR triangular solve.
       This adjustment of k is consistent with what happened when A was
       packed: all of its bottom/right edges were zero-padded, and
       furthermore, the panel that stores the bottom-right corner of the
       matrix has its diagonal extended into the zero-padded region (as
       identity). This allows the trsm of that bottom-right panel to
       proceed without producing any infs or NaNs that would infect the
       "good" values of the corresponding block of B. */ \
    if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
    /* NOTE: We don't need to check that m is a multiple of PACKMR since we
       know that the underlying buffer was already allocated to have an m
       dimension that is a multiple of PACKMR, with the region between the
       last row and the next multiple of MR zero-padded accordingly. */ \
\
    /* Clear the temporary C buffer in case it has any infs or NaNs. */ \
    PASTEMAC(ch,set0s_mxn)( MR, NR, \
                            ct, rs_ct, cs_ct ); \
\
    /* Compute number of primary and leftover components of the m and n
       dimensions. */ \
    n_iter = n / NR; \
    n_left = n % NR; \
\
    m_iter = m / MR; \
    m_left = m % MR; \
\
    if ( n_left ) ++n_iter; \
    if ( m_left ) ++m_iter; \
\
    /* Determine some increments used to step through A, B, and C. */ \
    rstep_a = ps_a; \
\
    cstep_b = ps_b; \
\
    rstep_c = rs_c * MR; \
    cstep_c = cs_c * NR; \
\
    istep_a = PACKMR * k; \
    istep_b = PACKNR * k_full; \
\
    /* Round odd imaginary strides up to the next even value. */ \
    if ( bli_is_odd( istep_a ) ) istep_a += 1; \
    if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
    /* Save the pack schemas of A and B to the auxinfo_t object. */ \
    bli_auxinfo_set_schema_a( schema_a, &aux ); \
    bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
    /* Save the imaginary stride of B to the auxinfo_t object. */ \
    bli_auxinfo_set_is_b( istep_b, &aux ); \
\
    /* We don't bother querying the thrinfo_t node for the 1st loop because
       we can't parallelize that loop in trsm due to the inter-iteration
       dependencies that exist. */ \
    /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
    /* Query the number of threads and thread ids for each loop. */ \
    dim_t jr_nt = bli_thread_n_way( thread ); \
    dim_t jr_tid = bli_thread_work_id( thread ); \
\
    dim_t jr_start, jr_end; \
    dim_t jr_inc; \
\
    /* Use round-robin assignment of micropanels to threads in the 2nd
       loop. NOTE: Parallelism in the 1st loop is unattainable due to the
       inter-iteration dependencies present in trsm. */ \
    bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
    /* Loop over the n dimension (NR columns at a time). */ \
    for ( j = jr_start; j < jr_end; j += jr_inc ) \
    { \
        ctype* restrict a1; \
        ctype* restrict c11; \
        ctype* restrict b2; \
\
        /* Locate the j-th column panel directly, since j may advance by
           more than one panel per iteration. */ \
        b1 = b_cast + j * cstep_b; \
        c1 = c_cast + j * cstep_c; \
\
        n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
        /* Initialize our next panel of B to be the current panel of B. */ \
        b2 = b1; \
\
        a1 = a_cast; \
        c11 = c1 + (0 )*rstep_c; \
\
        /* Loop over the m dimension (MR rows at a time). */ \
        for ( i = 0; i < m_iter; ++i ) \
        { \
            diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
            m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
            /* If the current panel of A intersects the diagonal, use a
               special micro-kernel that performs a fused gemm and trsm.
               If the current panel of A resides below the diagonal, use a
               regular gemm micro-kernel. Otherwise, if it is above the
               diagonal, it was not packed (because it is implicitly zero)
               and so we do nothing. */ \
            if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
            { \
                ctype* restrict a10; \
                ctype* restrict a11; \
                ctype* restrict b01; \
                ctype* restrict b11; \
                ctype* restrict a2; \
\
                /* Compute various offsets into and lengths of parts of A. */ \
                off_a10 = 0; \
                k_a1011 = diagoffa_i + MR; \
                k_a10 = k_a1011 - MR; \
                off_a11 = k_a10; \
\
                /* Compute the panel stride for the current diagonal-
                   intersecting micro-panel. */ \
                is_a_cur = k_a1011 * PACKMR; \
                is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
                ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
                /* Compute the addresses of the panel A10 and the triangular
                   block A11. */ \
                a10 = a1; \
                /* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \
                a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \
\
                /* Compute the addresses of the panel B01 and the block
                   B11. */ \
                b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \
                b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
\
                /* Compute the addresses of the next panels of A and B. */ \
                a2 = a1 + ps_a_cur; \
                if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
                { \
                    a2 = a_cast; \
                    b2 = b1; \
                    if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
                        b2 = b_cast; \
                } \
\
                /* Save addresses of next panels of A and B to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_next_a( a2, &aux ); \
                bli_auxinfo_set_next_b( b2, &aux ); \
\
                /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
                /* Handle interior and edge cases separately. */ \
                if ( m_cur == MR && n_cur == NR ) \
                { \
                    /* Invoke the fused gemm/trsm micro-kernel. */ \
                    gemmtrsm_ukr \
                    ( \
                      k_a10, \
                      alpha1_cast, \
                      a10, \
                      a11, \
                      b01, \
                      b11, \
                      c11, rs_c, cs_c, \
                      &aux, \
                      cntx \
                    ); \
                } \
                else \
                { \
                    /* Invoke the fused gemm/trsm micro-kernel, directing its
                       output to the temporary buffer ct. */ \
                    gemmtrsm_ukr \
                    ( \
                      k_a10, \
                      alpha1_cast, \
                      a10, \
                      a11, \
                      b01, \
                      b11, \
                      ct, rs_ct, cs_ct, \
                      &aux, \
                      cntx \
                    ); \
\
                    /* Copy the result to the bottom edge of C. */ \
                    PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
                                            ct, rs_ct, cs_ct, \
                                            c11, rs_c, cs_c ); \
                } \
\
                a1 += ps_a_cur; \
            } \
            else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
            { \
                ctype* restrict a2; \
\
                /* Compute the addresses of the next panels of A and B. */ \
                a2 = a1 + rstep_a; \
                if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
                { \
                    a2 = a_cast; \
                    b2 = b1; \
                    if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \
                        b2 = b_cast; \
                } \
\
                /* Save addresses of next panels of A and B to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_next_a( a2, &aux ); \
                bli_auxinfo_set_next_b( b2, &aux ); \
\
                /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
                   object. */ \
                bli_auxinfo_set_is_a( istep_a, &aux ); \
\
                /* Handle interior and edge cases separately. */ \
                if ( m_cur == MR && n_cur == NR ) \
                { \
                    /* Invoke the gemm micro-kernel. */ \
                    gemm_ukr \
                    ( \
                      k, \
                      minus_one, \
                      a1, \
                      b1, \
                      alpha2_cast, \
                      c11, rs_c, cs_c, \
                      &aux, \
                      cntx \
                    ); \
                } \
                else \
                { \
                    /* Invoke the gemm micro-kernel, writing into ct with a
                       zero scalar in place of alpha2. */ \
                    gemm_ukr \
                    ( \
                      k, \
                      minus_one, \
                      a1, \
                      b1, \
                      zero, \
                      ct, rs_ct, cs_ct, \
                      &aux, \
                      cntx \
                    ); \
\
                    /* Add the result to the edge of C. */ \
                    PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
                                            ct, rs_ct, cs_ct, \
                                            alpha2_cast, \
                                            c11, rs_c, cs_c ); \
                } \
\
                a1 += rstep_a; \
            } \
\
            c11 += rstep_c; \
        } \
    } \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \
                     ( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \
                     ( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \
                     ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \
                     ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
*/ \
\
/*
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
                     ( double* )a11, 1, PACKMR, "%4.1f", "" ); \
*/ \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \
                     ( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \
                     ( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \
                     ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \
                     ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \
                     ( double* )c, 1, cs_c, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \
                     ( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
*/ \
}

INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2rr )

cython-blis-0.9.1/blis/_src/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c000066400000000000000000000477041427272030600260140ustar00rootroot00000000000000/*

BLIS
An object-based framework for developing high-performance BLAS-like
libraries.

Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
 - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
 - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
  contributors may be used to endorse or promote products derived
  from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

/*
   FUNCPTR_T is the pointer type of the datatype-specific macrokernels
   generated by the GENTFUNC macro below. Its parameter list is exactly
   the argument list that bli_trsm_ll_ker_var2sl() passes to f().
*/
#define FUNCPTR_T gemm_fp

typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffa,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha1,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   alpha2,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

/* Table of macrokernel pointers, indexed below by the execution datatype.
   NOTE(review): presumably GENARRAY fills it with the per-datatype
   instantiations of trsm_ll_ker_var2sl -- confirm against the macro. */
static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2sl);

//
// -- Macrokernel functions for slab partitioning ------------------------------
//

/*
   Object-based front end for the trsm_ll_ker_var2sl macrokernel.

   Unpacks the diagonal offset, pack schemas, dimensions, buffers and
   strides from the obj_t operands and calls the entry of ftypes[]
   selected by the execution datatype of c.

   a      - supplies diagoffa, schema_a, k (its width), the buffer, the
            column stride, the panel dimension and the panel stride.
   b      - supplies schema_b, the buffer, the row stride, the panel
            dimension and the panel stride; its internal scalar is
            passed as alpha1.
   c      - supplies m (length), n (width), the buffer and both strides;
            its internal scalar is passed as alpha2.
   cntx   - forwarded unchanged to the macrokernel.
   rntm   - forwarded unchanged to the macrokernel.
   cntl   - not referenced by this function.
   thread - forwarded unchanged to the macrokernel.
*/
void bli_trsm_ll_ker_var2sl
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	num_t dt_exec = bli_obj_exec_dt( c );

	doff_t diagoffa = bli_obj_diag_offset( a );

	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	dim_t m = bli_obj_length( c );
	dim_t n = bli_obj_width( c );
	dim_t k = bli_obj_width( a );

	void* buf_a = bli_obj_buffer_at_off( a );
	inc_t cs_a = bli_obj_col_stride( a );
	dim_t pd_a = bli_obj_panel_dim( a );
	inc_t ps_a = bli_obj_panel_stride( a );

	void* buf_b = bli_obj_buffer_at_off( b );
	inc_t rs_b = bli_obj_row_stride( b );
	dim_t pd_b = bli_obj_panel_dim( b );
	inc_t ps_b = bli_obj_panel_stride( b );

	void* buf_c = bli_obj_buffer_at_off( c );
	inc_t rs_c = bli_obj_row_stride( c );
	inc_t cs_c = bli_obj_col_stride( c );

	void* buf_alpha1;
	void* buf_alpha2;

	FUNCPTR_T f;

	// Grab the address of the internal scalar buffer for the scalar
	// attached to B (the non-triangular matrix). This will be the alpha
	// scalar used in the gemmtrsm subproblems (ie: the scalar that would
	// be applied to the packed copy of B prior to it being updated by
	// the trsm subproblem). This scalar may be unit, if for example it
	// was applied during packing.
	buf_alpha1 = bli_obj_internal_scalar_buffer( b );

	// Grab the address of the internal scalar buffer for the scalar
	// attached to C. This will be the "beta" scalar used in the gemm-only
	// subproblems that correspond to micro-panels that do not intersect
	// the diagonal. We need this separate scalar because it's possible
	// that the alpha attached to B was reset, if it was applied during
	// packing.
	buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_exec];

	// Invoke the function.
	f( diagoffa,
	   schema_a,
	   schema_b,
	   m,
	   n,
	   k,
	   buf_alpha1,
	   buf_a, cs_a, pd_a, ps_a,
	   buf_b, rs_b, pd_b, ps_b,
	   buf_alpha2,
	   buf_c, rs_c, cs_c,
	   cntx,
	   rntm,
	   thread );
}


/*
   Datatype-specific macrokernel template (instantiated through
   INSERT_GENTFUNC_BASIC0 at the bottom of this file).

   Outer loop: j runs over the NR-wide micro-panels of B/C in the range
   [jr_start, jr_end) with step jr_inc, as returned by
   bli_thread_range_jrir_sl().
   Inner loop: i runs over the MR-tall micro-panels of A from i = 0 up to
   m_iter - 1.
     - A micro-panel that intersects the diagonal is handed to the fused
       gemmtrsm micro-kernel (scalar alpha1).
     - A micro-panel strictly below the diagonal is handed to the gemm
       micro-kernel with scalar minus_one and alpha2 as the scalar on C.
     - Any other micro-panel is skipped.
   Tiles with m_cur != MR or n_cur != NR are computed into the local
   buffer ct and then copied (diagonal case) or accumulated (gemm case)
   into C.
*/
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffa, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha1, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   alpha2, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t MR = pd_a; \
	const dim_t NR = pd_b; \
	const dim_t PACKMR = cs_a; \
	const dim_t PACKNR = rs_b; \
\
	/* Cast the micro-kernel address to its function pointer type. */ \
	PASTECH(ch,gemmtrsm_ukr_ft) \
	gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \
	PASTECH(ch,gemm_ukr_ft) \
	gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Its strides are chosen from the
	   gemm micro-kernel's storage preference (col_pref), not from the
	   strides of C: unit row stride when the micro-kernel prefers
	   columns, unit column stride otherwise. */ \
	ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
	          / sizeof( ctype ) ] \
	          __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t rs_ct = ( col_pref ? 1 : NR ); \
	const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
	ctype* restrict zero = PASTEMAC(ch,0); \
	ctype* restrict minus_one = PASTEMAC(ch,m1); \
	ctype* restrict a_cast = a; \
	ctype* restrict b_cast = b; \
	ctype* restrict c_cast = c; \
	ctype* restrict alpha1_cast = alpha1; \
	ctype* restrict alpha2_cast = alpha2; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t diagoffa_i; \
	dim_t k_full; \
	dim_t m_iter, m_left; \
	dim_t n_iter, n_left; \
	dim_t m_cur; \
	dim_t n_cur; \
	dim_t k_a1011; \
	dim_t k_a10; \
	dim_t off_a10; \
	dim_t off_a11; \
	dim_t i, j; \
	inc_t rstep_a; \
	inc_t cstep_b; \
	inc_t rstep_c, cstep_c; \
	inc_t istep_a; \
	inc_t istep_b; \
	inc_t off_scl; \
	inc_t ss_a_num; \
	inc_t ss_a_den; \
	inc_t ps_a_cur; \
	inc_t is_a_cur; \
	auxinfo_t aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended with odd blocksizes. Abort if PACKMR and NR are
	   both odd, or if PACKNR and MR are both odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If matrix A is above the diagonal, it is implicitly zero.
	   So we do nothing. */ \
	if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \
\
	/* Compute k_full as k inflated up to a multiple of MR. This is
	   needed because some parameter combinations of trsm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of B (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
	/* Compute indexing scaling factor for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_a ) || \
	     bli_is_3mi_packed( schema_a ) || \
	     bli_is_rih_packed( schema_a ) ) off_scl = 2; \
	else off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. Note that real-only, imag-only, and summed-only
	   packing formats are not applicable here since trsm is a two-
	   operand operation only (unlike trmm, which is capable of three-
	   operand). */ \
	if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
	else { ss_a_num = 1; ss_a_den = 1; } \
\
	/* If there is a zero region above where the diagonal of A intersects the
	   left edge of the block, adjust the pointer to C and treat this case as
	   if the diagonal offset were zero. This skips over the region that was
	   not packed. (Note we assume the diagonal offset is a multiple of MR;
	   this assumption will hold as long as the cache blocksizes are each a
	   multiple of MR and NR.) */ \
	if ( diagoffa < 0 ) \
	{ \
		i = -diagoffa; \
		m = m - i; \
		diagoffa = 0; \
		c_cast = c_cast + (i )*rs_c; \
	} \
\
	/* Check the k dimension, which needs to be a multiple of MR. If k
	   isn't a multiple of MR, we adjust it higher to satisfy the micro-
	   kernel, which is expecting to perform an MR x MR triangular solve.
	   This adjustment of k is consistent with what happened when A was
	   packed: all of its bottom/right edges were zero-padded, and
	   furthermore, the panel that stores the bottom-right corner of the
	   matrix has its diagonal extended into the zero-padded region (as
	   identity). This allows the trsm of that bottom-right panel to
	   proceed without producing any infs or NaNs that would infect the
	   "good" values of the corresponding block of B. */ \
	if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
	/* NOTE: We don't need to check that m is a multiple of PACKMR since we
	   know that the underlying buffer was already allocated to have an m
	   dimension that is a multiple of PACKMR, with the region between the
	   last row and the next multiple of MR zero-padded accordingly. */ \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k; \
	istep_b = PACKNR * k_full; \
\
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_b( istep_b, &aux ); \
\
	/* We don't bother querying the thrinfo_t node for the 1st loop because
	   we can't parallelize that loop in trsm due to the inter-iteration
	   dependencies that exist. */ \
	/*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \
\
	/* Query the number of threads and thread ids for each loop. */ \
	dim_t jr_nt = bli_thread_n_way( thread ); \
	dim_t jr_tid = bli_thread_work_id( thread ); \
\
	dim_t jr_start, jr_end; \
	dim_t jr_inc; \
\
	/* Use slab assignment of micropanels to threads in the 2nd loop.
	   NOTE: Parallelism in the 1st loop is unattainable due to the
	   inter-iteration dependencies present in trsm. */ \
	bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = jr_start; j < jr_end; j += jr_inc ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		b1 = b_cast + j * cstep_b; \
		c1 = c_cast + j * cstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		a1 = a_cast; \
		c11 = c1 + (0 )*rstep_c; \
\
		/* Loop over the m dimension (MR rows at a time). */ \
		for ( i = 0; i < m_iter; ++i ) \
		{ \
			diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
			m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
			/* If the current panel of A intersects the diagonal, use a
			   special micro-kernel that performs a fused gemm and trsm.
			   If the current panel of A resides below the diagonal, use a
			   regular gemm micro-kernel. Otherwise, if it is above the
			   diagonal, it was not packed (because it is implicitly zero)
			   and so we do nothing. */ \
			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				ctype* restrict a10; \
				ctype* restrict a11; \
				ctype* restrict b01; \
				ctype* restrict b11; \
				ctype* restrict a2; \
\
				/* Compute various offsets into and lengths of parts of A. */ \
				off_a10 = 0; \
				k_a1011 = diagoffa_i + MR; \
				k_a10 = k_a1011 - MR; \
				off_a11 = k_a10; \
\
				/* Compute the panel stride for the current diagonal-
				   intersecting micro-panel. */ \
				is_a_cur = k_a1011 * PACKMR; \
				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
				ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
				/* Compute the addresses of the panel A10 and the triangular
				   block A11. */ \
				a10 = a1; \
				/* a11 = a1 + ( k_a10 * PACKMR ) / off_scl; */ \
				a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, off_scl ); \
\
				/* Compute the addresses of the panel B01 and the block
				   B11. */ \
				b01 = b1 + ( off_a10 * PACKNR ) / off_scl; \
				b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1 + ps_a_cur; \
				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the fused gemm/trsm micro-kernel. */ \
					gemmtrsm_ukr \
					( \
					  k_a10, \
					  alpha1_cast, \
					  a10, \
					  a11, \
					  b01, \
					  b11, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the fused gemm/trsm micro-kernel. */ \
					gemmtrsm_ukr \
					( \
					  k_a10, \
					  alpha1_cast, \
					  a10, \
					  a11, \
					  b01, \
					  b11, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the bottom edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct, rs_ct, cs_ct, \
					                        c11, rs_c, cs_c ); \
				} \
\
				a1 += ps_a_cur; \
			} \
			else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				ctype* restrict a2; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1 + rstep_a; \
				if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( istep_a, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  minus_one, \
					  a1, \
					  b1, \
					  alpha2_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  minus_one, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
					                        ct, rs_ct, cs_ct, \
					                        alpha2_cast, \
					                        c11, rs_c, cs_c ); \
				} \
\
				a1 += rstep_a; \
			} \
\
			c11 += rstep_c; \
		} \
	} \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r before", k, n, \
                     ( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i before", k, n, \
                     ( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r before", k, n, \
                     ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i before", k, n, \
                     ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
*/ \
\
/*
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \
                     ( double* )a11, 1, PACKMR, "%4.1f", "" ); \
*/ \
\
/*
if ( bli_is_4mi_packed( schema_a ) ){ \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_r after", k, n, \
                     ( double* )b, rs_b, 1, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm4m1_ll_ker_var2: b_i after", k, n, \
                     ( double* )b+72, rs_b, 1, "%4.1f", "" ); \
}else{ \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_r after", k, n, \
                     ( double* )b, 2*rs_b, 2, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsmnat_ll_ker_var2: b_i after", k, n, \
                     ( double* )b+1, 2*rs_b, 2, "%4.1f", "" ); \
} \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_r", m, n, \
                     ( double* )c, 1, cs_c, "%4.1f", "" ); \
PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: b_i", m, n, \
                     ( double* )c + 8*9, 1, cs_c, "%4.1f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \
*/ \
}

INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2sl )
cython-blis-0.9.1/blis/_src/frame/3/trsm/other/bli_trsm_lu_ker_var2.c000066400000000000000000000445311427272030600254610ustar00rootroot00000000000000/*

BLIS
An object-based framework for developing high-performance BLAS-like
libraries.

Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
  contributors may be used to endorse or promote products derived
  from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "blis.h"

/*
   FUNCPTR_T is the pointer type of the datatype-specific macrokernels
   generated by the GENTFUNC macro below. Its parameter list is exactly
   the argument list that bli_trsm_lu_ker_var2() passes to f().
*/
#define FUNCPTR_T gemm_fp

typedef void (*FUNCPTR_T)
     (
       doff_t  diagoffa,
       pack_t  schema_a,
       pack_t  schema_b,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       void*   alpha1,
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a,
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b,
       void*   alpha2,
       void*   c, inc_t rs_c, inc_t cs_c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

/* Table of macrokernel pointers, indexed below by the execution datatype.
   NOTE(review): presumably GENARRAY fills it with the per-datatype
   instantiations of trsm_lu_ker_var2 -- confirm against the macro. */
static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2);


/*
   Object-based front end for the trsm_lu_ker_var2 macrokernel.

   Unpacks the diagonal offset, pack schemas, dimensions, buffers and
   strides from the obj_t operands and calls the entry of ftypes[]
   selected by the execution datatype of c.

   a      - supplies diagoffa, schema_a, k (its width), the buffer, the
            column stride, the panel dimension and the panel stride.
   b      - supplies schema_b, the buffer, the row stride, the panel
            dimension and the panel stride; its internal scalar is
            passed as alpha1.
   c      - supplies m (length), n (width), the buffer and both strides;
            its internal scalar is passed as alpha2.
   cntx   - forwarded unchanged to the macrokernel.
   rntm   - forwarded unchanged to the macrokernel.
   cntl   - not referenced by this function.
   thread - forwarded unchanged to the macrokernel.
*/
void bli_trsm_lu_ker_var2
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     )
{
	num_t dt_exec = bli_obj_exec_dt( c );

	doff_t diagoffa = bli_obj_diag_offset( a );

	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );

	dim_t m = bli_obj_length( c );
	dim_t n = bli_obj_width( c );
	dim_t k = bli_obj_width( a );

	void* buf_a = bli_obj_buffer_at_off( a );
	inc_t cs_a = bli_obj_col_stride( a );
	dim_t pd_a = bli_obj_panel_dim( a );
	inc_t ps_a = bli_obj_panel_stride( a );

	void* buf_b = bli_obj_buffer_at_off( b );
	inc_t rs_b = bli_obj_row_stride( b );
	dim_t pd_b = bli_obj_panel_dim( b );
	inc_t ps_b = bli_obj_panel_stride( b );

	void* buf_c = bli_obj_buffer_at_off( c );
	inc_t rs_c = bli_obj_row_stride( c );
	inc_t cs_c = bli_obj_col_stride( c );

	void* buf_alpha1;
	void* buf_alpha2;

	FUNCPTR_T f;

	// Grab the address of the internal scalar buffer for the scalar
	// attached to B (the non-triangular matrix). This will be the alpha
	// scalar used in the gemmtrsm subproblems (ie: the scalar that would
	// be applied to the packed copy of B prior to it being updated by
	// the trsm subproblem). This scalar may be unit, if for example it
	// was applied during packing.
	buf_alpha1 = bli_obj_internal_scalar_buffer( b );

	// Grab the address of the internal scalar buffer for the scalar
	// attached to C. This will be the "beta" scalar used in the gemm-only
	// subproblems that correspond to micro-panels that do not intersect
	// the diagonal. We need this separate scalar because it's possible
	// that the alpha attached to B was reset, if it was applied during
	// packing.
	buf_alpha2 = bli_obj_internal_scalar_buffer( c );

	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_exec];

	// Invoke the function.
	f( diagoffa,
	   schema_a,
	   schema_b,
	   m,
	   n,
	   k,
	   buf_alpha1,
	   buf_a, cs_a, pd_a, ps_a,
	   buf_b, rs_b, pd_b, ps_b,
	   buf_alpha2,
	   buf_c, rs_c, cs_c,
	   cntx,
	   rntm,
	   thread );
}


/*
   Datatype-specific macrokernel template (instantiated through
   INSERT_GENTFUNC_BASIC0 at the bottom of this file).

   Outer loop: j runs over all n_iter NR-wide micro-panels of B/C; an
   iteration does work only when bli_trsm_my_iter( j, thread ) is true.
   Inner loop: ib counts up from 0 while i = m_iter - 1 - ib counts down,
   so the MR-tall micro-panels of A are visited from the bottom row block
   to the top one, and c11 starts at the last row block of C and is
   decremented by rstep_c each iteration.
     - A micro-panel that intersects the diagonal is handed to the fused
       gemmtrsm micro-kernel (scalar alpha1).
     - A micro-panel strictly above the diagonal is handed to the gemm
       micro-kernel with scalar minus_one and alpha2 as the scalar on C.
     - Any other micro-panel is skipped.
   Tiles with m_cur != MR or n_cur != NR are computed into the local
   buffer ct and then copied (diagonal case) or accumulated (gemm case)
   into C.
*/
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffa, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha1, \
       void*   a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \
       void*   alpha2, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     ) \
{ \
	const num_t dt = PASTEMAC(ch,type); \
\
	/* Alias some constants to simpler names. */ \
	const dim_t MR = pd_a; \
	const dim_t NR = pd_b; \
	const dim_t PACKMR = cs_a; \
	const dim_t PACKNR = rs_b; \
\
	/* Cast the micro-kernel address to its function pointer type. */ \
	PASTECH(ch,gemmtrsm_ukr_ft) \
	gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \
	PASTECH(ch,gemm_ukr_ft) \
	gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \
\
	/* Temporary C buffer for edge cases. Its strides are chosen from the
	   gemm micro-kernel's storage preference (col_pref), not from the
	   strides of C: unit row stride when the micro-kernel prefers
	   columns, unit column stride otherwise. */ \
	ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
	          / sizeof( ctype ) ] \
	          __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t rs_ct = ( col_pref ? 1 : NR ); \
	const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
	ctype* restrict zero = PASTEMAC(ch,0); \
	ctype* restrict minus_one = PASTEMAC(ch,m1); \
	ctype* restrict a_cast = a; \
	ctype* restrict b_cast = b; \
	ctype* restrict c_cast = c; \
	ctype* restrict alpha1_cast = alpha1; \
	ctype* restrict alpha2_cast = alpha2; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t diagoffa_i; \
	dim_t k_full; \
	dim_t m_iter, m_left; \
	dim_t n_iter, n_left; \
	dim_t m_cur; \
	dim_t n_cur; \
	dim_t k_a1112; \
	dim_t k_a11; \
	dim_t k_a12; \
	dim_t off_a11; \
	dim_t off_a12; \
	dim_t i, j, ib; \
	inc_t rstep_a; \
	inc_t cstep_b; \
	inc_t rstep_c, cstep_c; \
	inc_t istep_a; \
	inc_t istep_b; \
	inc_t off_scl; \
	inc_t ss_a_num; \
	inc_t ss_a_den; \
	inc_t ps_a_cur; \
	inc_t is_a_cur; \
	auxinfo_t aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKMR
	     pd_a == MR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKNR
	     cs_b == 1
	     pd_b == NR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended with odd blocksizes. Abort if PACKMR and NR are
	   both odd, or if PACKNR and MR are both odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If matrix A is below the diagonal, it is implicitly zero.
	   So we do nothing. */ \
	if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \
\
	/* Compute k_full as k inflated up to a multiple of MR. This is
	   needed because some parameter combinations of trsm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of B (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \
\
	/* Compute indexing scaling factor for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_a ) || \
	     bli_is_3mi_packed( schema_a ) || \
	     bli_is_rih_packed( schema_a ) ) off_scl = 2; \
	else off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. Note that real-only, imag-only, and summed-only
	   packing formats are not applicable here since trsm is a two-
	   operand operation only (unlike trmm, which is capable of three-
	   operand). */ \
	if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \
	else { ss_a_num = 1; ss_a_den = 1; } \
\
	/* If there is a zero region to the left of where the diagonal of A
	   intersects the top edge of the block, adjust the pointer to B and
	   treat this case as if the diagonal offset were zero. Note that we
	   don't need to adjust the pointer to A since packm would have simply
	   skipped over the region that was not stored. */ \
	if ( diagoffa > 0 ) \
	{ \
		i = diagoffa; \
		k = k - i; \
		diagoffa = 0; \
		b_cast = b_cast + ( i * PACKNR ) / off_scl; \
	} \
\
	/* If there is a zero region below where the diagonal of A intersects the
	   right side of the block, shrink it to prevent "no-op" iterations from
	   executing. */ \
	if ( -diagoffa + k < m ) \
	{ \
		m = -diagoffa + k; \
	} \
\
	/* Check the k dimension, which needs to be a multiple of MR. If k
	   isn't a multiple of MR, we adjust it higher to satisfy the micro-
	   kernel, which is expecting to perform an MR x MR triangular solve.
	   This adjustment of k is consistent with what happened when A was
	   packed: all of its bottom/right edges were zero-padded, and
	   furthermore, the panel that stores the bottom-right corner of the
	   matrix has its diagonal extended into the zero-padded region (as
	   identity). This allows the trsm of that bottom-right panel to
	   proceed without producing any infs or NaNs that would infect the
	   "good" values of the corresponding block of B. */ \
	if ( k % MR != 0 ) k += MR - ( k % MR ); \
\
	/* NOTE: We don't need to check that m is a multiple of PACKMR since we
	   know that the underlying buffer was already allocated to have an m
	   dimension that is a multiple of PACKMR, with the region between the
	   last row and the next multiple of MR zero-padded accordingly. */ \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	istep_a = PACKMR * k; \
	istep_b = PACKNR * k_full; \
\
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object. */ \
	bli_auxinfo_set_schema_a( schema_a, &aux ); \
	bli_auxinfo_set_schema_b( schema_b, &aux ); \
\
	/* Save the imaginary stride of B to the auxinfo_t object. */ \
	bli_auxinfo_set_is_b( istep_b, &aux ); \
\
	b1 = b_cast; \
	c1 = c_cast; \
\
	/* Loop over the n dimension (NR columns at a time). */ \
	for ( j = 0; j < n_iter; ++j ) \
	{ \
		/* Every thread walks all j so that b1/c1 advance uniformly; only
		   iterations accepted by bli_trsm_my_iter() do any work. */ \
		if( bli_trsm_my_iter( j, thread ) ) { \
\
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b2; \
\
		a1 = a_cast; \
		c11 = c1 + (m_iter-1)*rstep_c; \
\
		n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* Loop over the m dimension (MR rows at a time). */ \
		for ( ib = 0; ib < m_iter; ++ib ) \
		{ \
			/* ib counts forward; i is the row-block index, counting
			   down from m_iter - 1 to 0. */ \
			i = m_iter - 1 - ib; \
			diagoffa_i = diagoffa + ( doff_t )i*MR; \
\
			m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \
\
			/* If the current panel of A intersects the diagonal, use a
			   special micro-kernel that performs a fused gemm and trsm.
			   If the current panel of A resides above the diagonal, use a
			   regular gemm micro-kernel. Otherwise, if it is below the
			   diagonal, it was not packed (because it is implicitly zero)
			   and so we do nothing. */ \
			if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				ctype* restrict a11; \
				ctype* restrict a12; \
				ctype* restrict b11; \
				ctype* restrict b21; \
				ctype* restrict a2; \
\
				/* Compute various offsets into and lengths of parts of A. */ \
				off_a11 = diagoffa_i; \
				k_a1112 = k - off_a11;; \
				k_a11 = MR; \
				k_a12 = k_a1112 - MR; \
				off_a12 = off_a11 + k_a11; \
\
				/* Compute the panel stride for the current diagonal-
				   intersecting micro-panel. */ \
				is_a_cur = k_a1112 * PACKMR; \
				is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \
				ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \
\
				/* Compute the addresses of the triangular block A11 and the
				   panel A12. */ \
				a11 = a1; \
				/* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \
				a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \
\
				/* Compute the addresses of the block B11 and the panel
				   B21. */ \
				b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \
				b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1 + ps_a_cur; \
				if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
					if ( j + bli_thread_num_threads(thread) >= n_iter ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( is_a_cur, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the fused gemm/trsm micro-kernel. */ \
					gemmtrsm_ukr \
					( \
					  k_a12, \
					  alpha1_cast, \
					  a12, \
					  a11, \
					  b21, \
					  b11, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the fused gemm/trsm micro-kernel. */ \
					gemmtrsm_ukr \
					( \
					  k_a12, \
					  alpha1_cast, \
					  a12, \
					  a11, \
					  b21, \
					  b11, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy the result to the bottom edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct, rs_ct, cs_ct, \
					                        c11, rs_c, cs_c ); \
				} \
\
				a1 += ps_a_cur; \
			} \
			else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \
			{ \
				ctype* restrict a2; \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1 + rstep_a; \
				if ( bli_is_last_iter( ib, m_iter, 0, 1 ) ) \
				{ \
					a2 = a_cast; \
					b2 = b1; \
					/*if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) */\
					if ( j + bli_thread_num_threads(thread) >= n_iter ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_next_a( a2, &aux ); \
				bli_auxinfo_set_next_b( b2, &aux ); \
\
				/* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t
				   object. */ \
				bli_auxinfo_set_is_a( istep_a, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  minus_one, \
					  a1, \
					  b1, \
					  alpha2_cast, \
					  c11, rs_c, cs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel. */ \
					gemm_ukr \
					( \
					  k, \
					  minus_one, \
					  a1, \
					  b1, \
					  zero, \
					  ct, rs_ct, cs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
					                        ct, rs_ct, cs_ct, \
					                        alpha2_cast, \
					                        c11, rs_c, cs_c ); \
				} \
\
				a1 += rstep_a; \
			} \
\
			c11 -= rstep_c; \
		} \
		} \
\
		b1 += cstep_b; \
		c1 += cstep_c; \
	} \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \
printf( "m_iter = %lu\n", m_iter ); \
printf( "m_cur = %lu\n", m_cur ); \
printf( "k = %lu\n", k ); \
printf( "diagoffa_i = %lu\n", diagoffa_i ); \
printf( "off_a1112 = %lu\n", off_a1112 ); \
printf( "k_a1112 = %lu\n", k_a1112 ); \
printf( "k_a12 = %lu\n", k_a12 ); \
printf( "k_a11 = %lu\n", k_a11 ); \
printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \
printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \
*/ \
\
/*
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \
PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \
*/ \
}

INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 )
cython-blis-0.9.1/blis/_src/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c000066400000000000000000000460021427272030600260200ustar00rootroot00000000000000/*

BLIS
An object-based framework for developing high-performance BLAS-like
libraries.

Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffa, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha1, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* alpha2, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2rr); // // -- Macrokernel functions for round-robin partitioning ----------------------- // void bli_trsm_lu_ker_var2rr ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffa = bli_obj_diag_offset( a ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); void* buf_alpha1; void* buf_alpha2; FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar // attached to B (the non-triangular matrix). This will be the alpha // scalar used in the gemmtrsm subproblems (ie: the scalar that would // be applied to the packed copy of B prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( b ); // Grab the address of the internal scalar buffer for the scalar // attached to C. 
This will be the "beta" scalar used in the gemm-only // subproblems that correspond to micro-panels that do not intersect // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffa, schema_a, schema_b, m, n, k, buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_alpha2, buf_c, rs_c, cs_c, cntx, rntm, thread ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffa, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Cast the micro-kernel address to its function pointer type. */ \ PASTECH(ch,gemmtrsm_ukr_ft) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha1_cast = alpha1; \ ctype* restrict alpha2_cast = alpha2; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffa_i; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_a1112; \ dim_t k_a11; \ dim_t k_a12; \ dim_t off_a11; \ dim_t off_a12; \ dim_t i, j, ib; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t off_scl; \ inc_t ss_a_num; \ inc_t ss_a_den; \ inc_t ps_a_cur; \ inc_t is_a_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If matrix A is below the diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ \ /* Compute k_full as k inflated up to a multiple of MR. This is needed because some parameter combinations of trsm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of B (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ \ /* Compute indexing scaling factor for for 4m or 3m. 
This is needed because one of the packing register blocksizes (PACKMR or PACKNR) is used to index into the micro-panels of the non- triangular matrix when computing with a diagonal-intersecting micro-panel of the triangular matrix. In the case of 4m or 3m, real values are stored in both sub-panels, and so the indexing needs to occur in units of real values. The value computed here is divided into the complex pointer offset to cause the pointer to be advanced by the correct value. */ \ if ( bli_is_4mi_packed( schema_a ) || \ bli_is_3mi_packed( schema_a ) || \ bli_is_rih_packed( schema_a ) ) off_scl = 2; \ else off_scl = 1; \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the offset by 3/2. Note that real-only, imag-only, and summed-only packing formats are not applicable here since trsm is a two- operand operation only (unlike trmm, which is capable of three- operand). */ \ if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ else { ss_a_num = 1; ss_a_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of A intersects the top edge of the block, adjust the pointer to B and treat this case as if the diagonal offset were zero. Note that we don't need to adjust the pointer to A since packm would have simply skipped over the region that was not stored. */ \ if ( diagoffa > 0 ) \ { \ i = diagoffa; \ k = k - i; \ diagoffa = 0; \ b_cast = b_cast + ( i * PACKNR ) / off_scl; \ } \ \ /* If there is a zero region below where the diagonal of A intersects the right side of the block, shrink it to prevent "no-op" iterations from executing. */ \ if ( -diagoffa + k < m ) \ { \ m = -diagoffa + k; \ } \ \ /* Check the k dimension, which needs to be a multiple of MR. If k isn't a multiple of MR, we adjust it higher to satisfy the micro- kernel, which is expecting to perform an MR x MR triangular solve. 
This adjustment of k is consistent with what happened when A was packed: all of its bottom/right edges were zero-padded, and furthermore, the panel that stores the bottom-right corner of the matrix has its diagonal extended into the zero-padded region (as identity). This allows the trsm of that bottom-right panel to proceed without producing any infs or NaNs that would infect the "good" values of the corresponding block of B. */ \ if ( k % MR != 0 ) k += MR - ( k % MR ); \ \ /* NOTE: We don't need to check that m is a multiple of PACKMR since we know that the underlying buffer was already allocated to have an m dimension that is a multiple of PACKMR, with the region between the last row and the next multiple of MR zero-padded accordingly. */ \ \ /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k; \ istep_b = PACKNR * k_full; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ /* We don't bother querying the thrinfo_t node for the 1st loop because we can't parallelize that loop in trsm due to the inter-iteration dependencies that exist. */ \ /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ \ /* Query the number of threads and thread ids for each loop. 
*/ \ dim_t jr_nt = bli_thread_n_way( thread ); \ dim_t jr_tid = bli_thread_work_id( thread ); \ \ dim_t jr_start, jr_end; \ dim_t jr_inc; \ \ /* Use round-robin assignment of micropanels to threads in the 2nd loop. NOTE: Parallelism in the 1st loop is unattainable due to the inter-iteration dependencies present in trsm. */ \ bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ b1 = b_cast + j * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ a1 = a_cast; \ c11 = c1 + (m_iter-1)*rstep_c; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( ib = 0; ib < m_iter; ++ib ) \ { \ i = m_iter - 1 - ib; \ diagoffa_i = diagoffa + ( doff_t )i*MR; \ \ m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \ \ /* If the current panel of A intersects the diagonal, use a special micro-kernel that performs a fused gemm and trsm. If the current panel of A resides above the diagonal, use a a regular gemm micro-kernel. Otherwise, if it is below the diagonal, it was not packed (because it is implicitly zero) and so we do nothing. */ \ if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ { \ ctype* restrict a11; \ ctype* restrict a12; \ ctype* restrict b11; \ ctype* restrict b21; \ ctype* restrict a2; \ \ /* Compute various offsets into and lengths of parts of A. */ \ off_a11 = diagoffa_i; \ k_a1112 = k - off_a11;; \ k_a11 = MR; \ k_a12 = k_a1112 - MR; \ off_a12 = off_a11 + k_a11; \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ is_a_cur = k_a1112 * PACKMR; \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 
1 : 0 ); \ ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ /* Compute the addresses of the triangular block A11 and the panel A12. */ \ a11 = a1; \ /* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \ a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \ \ /* Compute the addresses of the panel B01 and the block B11. */ \ b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1 + ps_a_cur; \ if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( is_a_cur, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the fused gemm/trsm micro-kernel. */ \ gemmtrsm_ukr \ ( \ k_a12, \ alpha1_cast, \ a12, \ a11, \ b21, \ b11, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the fused gemm/trsm micro-kernel. */ \ gemmtrsm_ukr \ ( \ k_a12, \ alpha1_cast, \ a12, \ a11, \ b21, \ b11, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Copy the result to the bottom edge of C. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ \ a1 += ps_a_cur; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1 + rstep_a; \ if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. 
*/ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ minus_one, \ a1, \ b1, \ alpha2_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ minus_one, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Add the result to the edge of C. */ \ PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ alpha2_cast, \ c11, rs_c, cs_c ); \ } \ \ a1 += rstep_a; \ } \ \ c11 -= rstep_c; \ } \ } \ \ /* PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \ printf( "m_iter = %lu\n", m_iter ); \ printf( "m_cur = %lu\n", m_cur ); \ printf( "k = %lu\n", k ); \ printf( "diagoffa_i = %lu\n", diagoffa_i ); \ printf( "off_a1112 = %lu\n", off_a1112 ); \ printf( "k_a1112 = %lu\n", k_a1112 ); \ printf( "k_a12 = %lu\n", k_a12 ); \ printf( "k_a11 = %lu\n", k_a11 ); \ printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \ printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \ */ \ \ /* PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \ */ \ } INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2rr ) cython-blis-0.9.1/blis/_src/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c000066400000000000000000000457731427272030600260310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like 
libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffa, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha1, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* alpha2, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2sl); // // -- Macrokernel functions for slab partitioning ------------------------------ // void bli_trsm_lu_ker_var2sl ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffa = bli_obj_diag_offset( a ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); void* buf_alpha1; void* buf_alpha2; FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar // attached to B (the non-triangular matrix). This will be the alpha // scalar used in the gemmtrsm subproblems (ie: the scalar that would // be applied to the packed copy of B prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( b ); // Grab the address of the internal scalar buffer for the scalar // attached to C. 
This will be the "beta" scalar used in the gemm-only // subproblems that correspond to micro-panels that do not intersect // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffa, schema_a, schema_b, m, n, k, buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_alpha2, buf_c, rs_c, cs_c, cntx, rntm, thread ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffa, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Cast the micro-kernel address to its function pointer type. */ \ PASTECH(ch,gemmtrsm_ukr_ft) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha1_cast = alpha1; \ ctype* restrict alpha2_cast = alpha2; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffa_i; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_a1112; \ dim_t k_a11; \ dim_t k_a12; \ dim_t off_a11; \ dim_t off_a12; \ dim_t i, j, ib; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t off_scl; \ inc_t ss_a_num; \ inc_t ss_a_den; \ inc_t ps_a_cur; \ inc_t is_a_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKMR pd_a == MR ps_a == stride to next micro-panel of A rs_b == PACKNR cs_b == 1 pd_b == NR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If matrix A is below the diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ \ /* Compute k_full as k inflated up to a multiple of MR. This is needed because some parameter combinations of trsm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of B (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ \ /* Compute indexing scaling factor for for 4m or 3m. 
This is needed because one of the packing register blocksizes (PACKMR or PACKNR) is used to index into the micro-panels of the non- triangular matrix when computing with a diagonal-intersecting micro-panel of the triangular matrix. In the case of 4m or 3m, real values are stored in both sub-panels, and so the indexing needs to occur in units of real values. The value computed here is divided into the complex pointer offset to cause the pointer to be advanced by the correct value. */ \ if ( bli_is_4mi_packed( schema_a ) || \ bli_is_3mi_packed( schema_a ) || \ bli_is_rih_packed( schema_a ) ) off_scl = 2; \ else off_scl = 1; \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the offset by 3/2. Note that real-only, imag-only, and summed-only packing formats are not applicable here since trsm is a two- operand operation only (unlike trmm, which is capable of three- operand). */ \ if ( bli_is_3mi_packed( schema_a ) ) { ss_a_num = 3; ss_a_den = 2; } \ else { ss_a_num = 1; ss_a_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of A intersects the top edge of the block, adjust the pointer to B and treat this case as if the diagonal offset were zero. Note that we don't need to adjust the pointer to A since packm would have simply skipped over the region that was not stored. */ \ if ( diagoffa > 0 ) \ { \ i = diagoffa; \ k = k - i; \ diagoffa = 0; \ b_cast = b_cast + ( i * PACKNR ) / off_scl; \ } \ \ /* If there is a zero region below where the diagonal of A intersects the right side of the block, shrink it to prevent "no-op" iterations from executing. */ \ if ( -diagoffa + k < m ) \ { \ m = -diagoffa + k; \ } \ \ /* Check the k dimension, which needs to be a multiple of MR. If k isn't a multiple of MR, we adjust it higher to satisfy the micro- kernel, which is expecting to perform an MR x MR triangular solve. 
This adjustment of k is consistent with what happened when A was packed: all of its bottom/right edges were zero-padded, and furthermore, the panel that stores the bottom-right corner of the matrix has its diagonal extended into the zero-padded region (as identity). This allows the trsm of that bottom-right panel to proceed without producing any infs or NaNs that would infect the "good" values of the corresponding block of B. */ \ if ( k % MR != 0 ) k += MR - ( k % MR ); \ \ /* NOTE: We don't need to check that m is a multiple of PACKMR since we know that the underlying buffer was already allocated to have an m dimension that is a multiple of PACKMR, with the region between the last row and the next multiple of MR zero-padded accordingly. */ \ \ /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k; \ istep_b = PACKNR * k_full; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_schema_a( schema_a, &aux ); \ bli_auxinfo_set_schema_b( schema_b, &aux ); \ \ /* Save the imaginary stride of B to the auxinfo_t object. */ \ bli_auxinfo_set_is_b( istep_b, &aux ); \ \ /* We don't bother querying the thrinfo_t node for the 1st loop because we can't parallelize that loop in trsm due to the inter-iteration dependencies that exist. */ \ /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ \ /* Query the number of threads and thread ids for each loop. 
*/ \ dim_t jr_nt = bli_thread_n_way( thread ); \ dim_t jr_tid = bli_thread_work_id( thread ); \ \ dim_t jr_start, jr_end; \ dim_t jr_inc; \ \ /* Use slab assignment of micropanels to threads in the 2nd loop. NOTE: Parallelism in the 1st loop is unattainable due to the inter-iteration dependencies present in trsm. */ \ bli_thread_range_jrir_sl( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = jr_start; j < jr_end; j += jr_inc ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b2; \ \ b1 = b_cast + j * cstep_b; \ c1 = c_cast + j * cstep_c; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ a1 = a_cast; \ c11 = c1 + (m_iter-1)*rstep_c; \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( ib = 0; ib < m_iter; ++ib ) \ { \ i = m_iter - 1 - ib; \ diagoffa_i = diagoffa + ( doff_t )i*MR; \ \ m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \ \ /* If the current panel of A intersects the diagonal, use a special micro-kernel that performs a fused gemm and trsm. If the current panel of A resides above the diagonal, use a a regular gemm micro-kernel. Otherwise, if it is below the diagonal, it was not packed (because it is implicitly zero) and so we do nothing. */ \ if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ { \ ctype* restrict a11; \ ctype* restrict a12; \ ctype* restrict b11; \ ctype* restrict b21; \ ctype* restrict a2; \ \ /* Compute various offsets into and lengths of parts of A. */ \ off_a11 = diagoffa_i; \ k_a1112 = k - off_a11;; \ k_a11 = MR; \ k_a12 = k_a1112 - MR; \ off_a12 = off_a11 + k_a11; \ \ /* Compute the panel stride for the current diagonal- intersecting micro-panel. */ \ is_a_cur = k_a1112 * PACKMR; \ is_a_cur += ( bli_is_odd( is_a_cur ) ? 
1 : 0 ); \ ps_a_cur = ( is_a_cur * ss_a_num ) / ss_a_den; \ \ /* Compute the addresses of the triangular block A11 and the panel A12. */ \ a11 = a1; \ /* a12 = a1 + ( k_a11 * PACKMR ) / off_scl; */ \ a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, off_scl ); \ \ /* Compute the addresses of the panel B01 and the block B11. */ \ b11 = b1 + ( off_a11 * PACKNR ) / off_scl; \ b21 = b1 + ( off_a12 * PACKNR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1 + ps_a_cur; \ if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. */ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( is_a_cur, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the fused gemm/trsm micro-kernel. */ \ gemmtrsm_ukr \ ( \ k_a12, \ alpha1_cast, \ a12, \ a11, \ b21, \ b11, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the fused gemm/trsm micro-kernel. */ \ gemmtrsm_ukr \ ( \ k_a12, \ alpha1_cast, \ a12, \ a11, \ b21, \ b11, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Copy the result to the bottom edge of C. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ \ a1 += ps_a_cur; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ { \ ctype* restrict a2; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1 + rstep_a; \ if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ { \ a2 = a_cast; \ b2 = b1; \ if ( bli_is_last_iter_sl( j, n_iter, jr_tid, jr_nt ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. 
*/ \ bli_auxinfo_set_next_a( a2, &aux ); \ bli_auxinfo_set_next_b( b2, &aux ); \ \ /* Save the 4m1/3m1 imaginary stride of A to the auxinfo_t object. */ \ bli_auxinfo_set_is_a( istep_a, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ minus_one, \ a1, \ b1, \ alpha2_cast, \ c11, rs_c, cs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ minus_one, \ a1, \ b1, \ zero, \ ct, rs_ct, cs_ct, \ &aux, \ cntx \ ); \ \ /* Add the result to the edge of C. */ \ PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ alpha2_cast, \ c11, rs_c, cs_c ); \ } \ \ a1 += rstep_a; \ } \ \ c11 -= rstep_c; \ } \ } \ \ /* PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \ printf( "m_iter = %lu\n", m_iter ); \ printf( "m_cur = %lu\n", m_cur ); \ printf( "k = %lu\n", k ); \ printf( "diagoffa_i = %lu\n", diagoffa_i ); \ printf( "off_a1112 = %lu\n", off_a1112 ); \ printf( "k_a1112 = %lu\n", k_a1112 ); \ printf( "k_a12 = %lu\n", k_a12 ); \ printf( "k_a11 = %lu\n", k_a11 ); \ printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \ printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \ */ \ \ /* PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \ */ \ } INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2sl ) cython-blis-0.9.1/blis/_src/frame/3/trsm/other/bli_trsm_rl_ker_var2.c000066400000000000000000000467211427272030600254610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like 
libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffb, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha1, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* alpha2, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); void bli_trsm_rl_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffb = bli_obj_diag_offset( b ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); void* buf_alpha1; void* buf_alpha2; FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar // attached to A (the non-triangular matrix). This will be the alpha // scalar used in the gemmtrsm subproblems (ie: the scalar that would // be applied to the packed copy of A prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only // subproblems that correspond to micro-panels that do not intersect // the diagonal. 
We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_alpha2, buf_c, rs_c, cs_c, cntx, rntm, thread ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffb, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Cast the micro-kernel address to its function pointer type. */ \ /* NOTE: We use the upper-triangular gemmtrsm ukernel because, while the current macro-kernel targets the "rl" case (right-side/lower- triangular), it becomes upper-triangular after the kernel operation is transposed so that all kernel instances are of the "left" variety (since those are the only trsm ukernels that exist). */ \ PASTECH(ch,gemmtrsm_ukr_ft) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. 
*/ \
	ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
	          / sizeof( ctype ) ] \
	          __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	/* ct uses unit row stride (column-major, cs = MR) when the gemm
	   micro-kernel prefers column storage, and unit column stride
	   (row-major, rs = NR) otherwise. */ \
	const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
	const inc_t rs_ct = ( col_pref ? 1 : NR ); \
	const inc_t cs_ct = ( col_pref ? MR : 1 ); \
\
	ctype* restrict zero = PASTEMAC(ch,0); \
	ctype* restrict minus_one = PASTEMAC(ch,m1); \
	ctype* restrict a_cast = a; \
	ctype* restrict b_cast = b; \
	ctype* restrict c_cast = c; \
	ctype* restrict alpha1_cast = alpha1; \
	ctype* restrict alpha2_cast = alpha2; \
	ctype* restrict b1; \
	ctype* restrict c1; \
\
	doff_t diagoffb_j; \
	dim_t k_full; \
	dim_t m_iter, m_left; \
	dim_t n_iter, n_left; \
	dim_t m_cur; \
	dim_t n_cur; \
	dim_t k_b1121; \
	dim_t k_b11; \
	dim_t k_b21; \
	dim_t off_b11; \
	dim_t off_b21; \
	dim_t i, j, jb; \
	inc_t rstep_a; \
	inc_t cstep_b; \
	inc_t rstep_c, cstep_c; \
	inc_t istep_a; \
	inc_t istep_b; \
	inc_t off_scl; \
	inc_t ss_b_num; \
	inc_t ss_b_den; \
	inc_t ps_b_cur; \
	inc_t is_b_cur; \
	auxinfo_t aux; \
\
	/*
	   Assumptions/assertions:
	     rs_a == 1
	     cs_a == PACKNR
	     pd_a == NR
	     ps_a == stride to next micro-panel of A
	     rs_b == PACKMR
	     cs_b == 1
	     pd_b == MR
	     ps_b == stride to next micro-panel of B
	     rs_c == (no assumptions)
	     cs_c == (no assumptions)

	  Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the
	  swapping of values in the control tree (ie: those values used when
	  packing). This swapping is needed since we cast right-hand trsm in
	  terms of transposed left-hand trsm. So, if we're going to be
	  transposing the operation, then A needs to be packed with NR and B
	  needs to be packed with MR (remember: B is the triangular matrix in
	  the right-hand side parameter case).
	*/ \
\
	/* Safety trap: Certain indexing within this macro-kernel does not
	   work as intended if both MR and NR are odd. */ \
	if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \
	     ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \
\
	/* If any dimension is zero, return immediately. */ \
	if ( bli_zero_dim3( m, n, k ) ) return; \
\
	/* Safeguard: If the current panel of B is entirely above its diagonal,
	   it is implicitly zero. So we do nothing. */ \
	if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \
\
	/* Compute k_full as k inflated up to a multiple of NR. This is
	   needed because some parameter combinations of trsm reduce k
	   to advance past zero regions in the triangular matrix, and
	   when computing the imaginary stride of B (the non-triangular
	   matrix), which is used by 4m1/3m1 implementations, we need
	   this unreduced value of k. */ \
	k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \
\
	/* Compute the indexing scaling factor for 4m or 3m. This is
	   needed because one of the packing register blocksizes (PACKMR
	   or PACKNR) is used to index into the micro-panels of the non-
	   triangular matrix when computing with a diagonal-intersecting
	   micro-panel of the triangular matrix. In the case of 4m or 3m,
	   real values are stored in both sub-panels, and so the indexing
	   needs to occur in units of real values. The value computed
	   here is divided into the complex pointer offset to cause the
	   pointer to be advanced by the correct value. */ \
	if ( bli_is_4mi_packed( schema_b ) || \
	     bli_is_3mi_packed( schema_b ) || \
	     bli_is_rih_packed( schema_b ) ) off_scl = 2; \
	else off_scl = 1; \
\
	/* Compute the storage stride scaling. Usually this is just 1.
	   However, in the case of interleaved 3m, we need to scale the
	   offset by 3/2. Note that real-only, imag-only, and summed-only
	   packing formats are not applicable here since trsm is a two-
	   operand operation only (unlike trmm, which is capable of three-
	   operand). */ \
	if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \
	else { ss_b_num = 1; ss_b_den = 1; } \
\
	/* If there is a zero region above where the diagonal of B intersects
	   the left edge of the panel, adjust the pointer to A and treat this
	   case as if the diagonal offset were zero. Note that we don't need to
	   adjust the pointer to B since packm would have simply skipped over
	   the region that was not stored. */ \
	if ( diagoffb < 0 ) \
	{ \
		j = -diagoffb; \
		k = k - j; \
		diagoffb = 0; \
		a_cast = a_cast + ( j * PACKMR ) / off_scl; \
	} \
\
	/* If there is a zero region to the right of where the diagonal
	   of B intersects the bottom of the panel, shrink it so that
	   we can index to the correct place in C (corresponding to the
	   part of the panel of B that was packed).
	   NOTE: This is NOT being done to skip over "no-op" iterations,
	   as with the trsm_lu macro-kernel. This MUST be done for correct
	   execution because we use n (via n_iter) to compute diagonal and
	   index offsets for backwards movement through B. */ \
	if ( diagoffb + k < n ) \
	{ \
		n = diagoffb + k; \
	} \
\
	/* Check the k dimension, which needs to be a multiple of NR. If k
	   isn't a multiple of NR, we adjust it higher to satisfy the micro-
	   kernel, which is expecting to perform an NR x NR triangular solve.
	   This adjustment of k is consistent with what happened when B was
	   packed: all of its bottom/right edges were zero-padded, and
	   furthermore, the panel that stores the bottom-right corner of the
	   matrix has its diagonal extended into the zero-padded region (as
	   identity). This allows the trsm of that bottom-right panel to
	   proceed without producing any infs or NaNs that would infect the
	   "good" values of the corresponding block of A. */ \
	if ( k % NR != 0 ) k += NR - ( k % NR ); \
\
	/* NOTE: We don't need to check that n is a multiple of PACKNR since we
	   know that the underlying buffer was already allocated to have an n
	   dimension that is a multiple of PACKNR, with the region between the
	   last column and the next multiple of NR zero-padded accordingly. */ \
\
	/* Clear the temporary C buffer in case it has any infs or NaNs. */ \
	PASTEMAC(ch,set0s_mxn)( MR, NR, \
	                        ct, rs_ct, cs_ct ); \
\
	/* Compute number of primary and leftover components of the m and n
	   dimensions. A non-zero remainder adds one (partial) iteration. */ \
	n_iter = n / NR; \
	n_left = n % NR; \
\
	m_iter = m / MR; \
	m_left = m % MR; \
\
	if ( n_left ) ++n_iter; \
	if ( m_left ) ++m_iter; \
\
	/* Determine some increments used to step through A, B, and C. */ \
	rstep_a = ps_a; \
\
	cstep_b = ps_b; \
\
	rstep_c = rs_c * MR; \
	cstep_c = cs_c * NR; \
\
	/* istep_a uses k_full (computed before k was reduced above), while
	   istep_b uses the current, adjusted k. Both are rounded up to an
	   even value. */ \
	istep_a = PACKMR * k_full; \
	istep_b = PACKNR * k; \
\
	if ( bli_is_odd( istep_a ) ) istep_a += 1; \
	if ( bli_is_odd( istep_b ) ) istep_b += 1; \
\
	/* Save the pack schemas of A and B to the auxinfo_t object.
	   NOTE: We swap the values for A and B since the triangular
	   "A" matrix is actually contained within B. */ \
	bli_auxinfo_set_schema_a( schema_b, &aux ); \
	bli_auxinfo_set_schema_b( schema_a, &aux ); \
\
	/* Save the imaginary stride of A to the auxinfo_t object.
	   NOTE: We swap the values for A and B since the triangular
	   "A" matrix is actually contained within B. */ \
	bli_auxinfo_set_is_b( istep_a, &aux ); \
\
	b1 = b_cast; \
	c1 = c_cast; \
\
	/* Loop over the n dimension (NR columns at a time). jb counts the
	   iterations; j (computed below) is the matching column-panel index
	   and runs from n_iter-1 down to 0. */ \
	for ( jb = 0; jb < n_iter; ++jb ) \
	{ \
		ctype* restrict a1; \
		ctype* restrict c11; \
		ctype* restrict b11; \
		ctype* restrict b21; \
		ctype* restrict b2; \
\
		j = n_iter - 1 - jb; \
		diagoffb_j = diagoffb - ( doff_t )j*NR; \
		a1 = a_cast; \
		/* c1 is moved back by cstep_c at the end of every iteration, so
		   this addresses the last column panel of C first and then steps
		   towards the first. */ \
		c11 = c1 + (n_iter-1)*cstep_c; \
\
		n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); \
\
		/* Initialize our next panel of B to be the current panel of B. */ \
		b2 = b1; \
\
		/* If the current panel of B intersects the diagonal, use a
		   special micro-kernel that performs a fused gemm and trsm.
		   If the current panel of B resides below the diagonal, use
		   a regular gemm micro-kernel. Otherwise, if it is above the
		   diagonal, it was not packed (because it is implicitly zero)
		   and so we do nothing. */ \
		if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \
		{ \
			/* Determine the offset to and length of the panel that was
			   packed so we can index into the corresponding location in A. */ \
			off_b11 = bli_max( -diagoffb_j, 0 ); \
			k_b1121 = k - off_b11; \
			k_b11 = NR; \
			k_b21 = k_b1121 - NR; \
			off_b21 = off_b11 + k_b11; \
\
			/* Compute the addresses of the triangular block B11 and the
			   panel B21. */ \
			b11 = b1; \
			/* b21 = b1 + ( k_b11 * PACKNR ) / off_scl; */ \
			b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, off_scl ); \
\
			/* Compute the panel stride for the current micro-panel. */ \
			is_b_cur = k_b1121 * PACKNR; \
			is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \
			ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \
\
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object.
			   NOTE: We swap the values for A and B since the triangular
			   "A" matrix is actually contained within B. */ \
			bli_auxinfo_set_is_a( is_b_cur, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). Every thread
			   walks all iterations so that a1 and c11 advance uniformly,
			   but only does work for those bli_trsm_my_iter() accepts. */ \
			for ( i = 0; i < m_iter; ++i ) \
			{ \
				if( bli_trsm_my_iter( i, thread ) ){ \
\
				ctype* restrict a11; \
				ctype* restrict a12; \
				ctype* restrict a2; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				/* Compute the addresses of the A11 block and A12 panel. */ \
				a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \
				a12 = a1 + ( off_b21 * PACKMR ) / off_scl; \
\
				/* Compute the addresses of the next panels of A and B.
				   The test on the thread count replaces the commented-out
				   bli_is_last_iter() check kept below for reference. */ \
				a2 = a1; \
				/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
				if ( i + bli_thread_num_threads(thread) >= m_iter ) \
				{ \
					a2 = a_cast; \
					b2 = b1 + ps_b_cur; \
					if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. NOTE: We swap the values for A and B since the
				   triangular "A" matrix is actually contained within B. */ \
				bli_auxinfo_set_next_a( b2, &aux ); \
				bli_auxinfo_set_next_b( a2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the fused gemm/trsm micro-kernel. The operands
					   are passed in swapped roles (B first, then A) and the
					   strides of C as (cs_c, rs_c). */ \
					gemmtrsm_ukr \
					( \
					  k_b21, \
					  alpha1_cast, \
					  b21, \
					  b11, \
					  a12, \
					  a11, \
					  c11, cs_c, rs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the fused gemm/trsm micro-kernel, writing
					   to the temporary buffer ct instead of C. */ \
					gemmtrsm_ukr \
					( \
					  k_b21, \
					  alpha1_cast, \
					  b21, \
					  b11, \
					  a12, \
					  a11, \
					  ct, cs_ct, rs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Copy only the valid m_cur x n_cur part of the result
					   to the edge of C. */ \
					PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \
					                        ct, rs_ct, cs_ct, \
					                        c11, rs_c, cs_c ); \
				} \
				} \
\
				a1 += rstep_a; \
				c11 += rstep_c; \
			} \
\
			b1 += ps_b_cur; \
		} \
		else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \
		{ \
			/* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t
			   object.
			   NOTE: We swap the values for A and B since the triangular
			   "A" matrix is actually contained within B. */ \
			bli_auxinfo_set_is_a( istep_b, &aux ); \
\
			/* Loop over the m dimension (MR rows at a time). */ \
			for ( i = 0; i < m_iter; ++i ) \
			{ \
				if( bli_trsm_my_iter( i, thread ) ){ \
\
				ctype* restrict a2; \
\
				m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \
\
				/* Compute the addresses of the next panels of A and B. */ \
				a2 = a1; \
				/*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\
				if ( i + bli_thread_num_threads(thread) >= m_iter ) \
				{ \
					a2 = a_cast; \
					b2 = b1 + cstep_b; \
					if ( bli_is_last_iter( jb, n_iter, 0, 1 ) ) \
						b2 = b_cast; \
				} \
\
				/* Save addresses of next panels of A and B to the auxinfo_t
				   object. NOTE: We swap the values for A and B since the
				   triangular "A" matrix is actually contained within B. */ \
				bli_auxinfo_set_next_a( b2, &aux ); \
				bli_auxinfo_set_next_b( a2, &aux ); \
\
				/* Handle interior and edge cases separately. */ \
				if ( m_cur == MR && n_cur == NR ) \
				{ \
					/* Invoke the gemm micro-kernel: C is scaled by alpha2
					   and updated with minus_one times the product. */ \
					gemm_ukr \
					( \
					  k, \
					  minus_one, \
					  b1, \
					  a1, \
					  alpha2_cast, \
					  c11, cs_c, rs_c, \
					  &aux, \
					  cntx \
					); \
				} \
				else \
				{ \
					/* Invoke the gemm micro-kernel with a zero beta so that
					   ct receives just the product term. */ \
					gemm_ukr \
					( \
					  k, \
					  minus_one, \
					  b1, \
					  a1, \
					  zero, \
					  ct, cs_ct, rs_ct, \
					  &aux, \
					  cntx \
					); \
\
					/* Add the result to the edge of C. */ \
					PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \
					                        ct, rs_ct, cs_ct, \
					                        alpha2_cast, \
					                        c11, rs_c, cs_c ); \
				} \
				} \
\
				a1 += rstep_a; \
				c11 += rstep_c; \
			} \
\
			b1 += cstep_b; \
		} \
\
		c1 -= cstep_c; \
	} \
}

INSERT_GENTFUNC_BASIC0( trsm_rl_ker_var2 )
cython-blis-0.9.1/blis/_src/frame/3/trsm/other/bli_trsm_ru_ker_var2.c000066400000000000000000000461211427272030600254640ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define FUNCPTR_T gemm_fp typedef void (*FUNCPTR_T) ( doff_t diagoffb, pack_t schema_a, pack_t schema_b, dim_t m, dim_t n, dim_t k, void* alpha1, void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, void* alpha2, void* c, inc_t rs_c, inc_t cs_c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); void bli_trsm_ru_ker_var2 ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { num_t dt_exec = bli_obj_exec_dt( c ); doff_t diagoffb = bli_obj_diag_offset( b ); pack_t schema_a = bli_obj_pack_schema( a ); pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t cs_a = bli_obj_col_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); void* buf_c = bli_obj_buffer_at_off( c ); inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); void* buf_alpha1; void* buf_alpha2; FUNCPTR_T f; // Grab the address of the internal scalar buffer for the scalar // attached to A (the non-triangular matrix). 
This will be the alpha // scalar used in the gemmtrsm subproblems (ie: the scalar that would // be applied to the packed copy of A prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. buf_alpha1 = bli_obj_internal_scalar_buffer( a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only // subproblems that correspond to micro-panels that do not intersect // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_exec]; // Invoke the function. f( diagoffb, schema_a, schema_b, m, n, k, buf_alpha1, buf_a, cs_a, pd_a, ps_a, buf_b, rs_b, pd_b, ps_b, buf_alpha2, buf_c, rs_c, cs_c, cntx, rntm, thread ); } #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffb, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ /* Alias some constants to simpler names. */ \ const dim_t MR = pd_a; \ const dim_t NR = pd_b; \ const dim_t PACKMR = cs_a; \ const dim_t PACKNR = rs_b; \ \ /* Cast the micro-kernel address to its function pointer type. */ \ /* NOTE: We use the lower-triangular gemmtrsm ukernel because, while the current macro-kernel targets the "ru" case (right-side/upper- triangular), it becomes lower-triangular after the kernel operation is transposed so that all kernel instances are of the "left" variety (since those are the only trsm ukernels that exist). 
*/ \ PASTECH(ch,gemmtrsm_ukr_ft) \ gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be column-stored as well. */ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ ctype* restrict zero = PASTEMAC(ch,0); \ ctype* restrict minus_one = PASTEMAC(ch,m1); \ ctype* restrict a_cast = a; \ ctype* restrict b_cast = b; \ ctype* restrict c_cast = c; \ ctype* restrict alpha1_cast = alpha1; \ ctype* restrict alpha2_cast = alpha2; \ ctype* restrict b1; \ ctype* restrict c1; \ \ doff_t diagoffb_j; \ dim_t k_full; \ dim_t m_iter, m_left; \ dim_t n_iter, n_left; \ dim_t m_cur; \ dim_t n_cur; \ dim_t k_b0111; \ dim_t k_b01; \ dim_t off_b01; \ dim_t off_b11; \ dim_t i, j; \ inc_t rstep_a; \ inc_t cstep_b; \ inc_t rstep_c, cstep_c; \ inc_t istep_a; \ inc_t istep_b; \ inc_t off_scl; \ inc_t ss_b_num; \ inc_t ss_b_den; \ inc_t ps_b_cur; \ inc_t is_b_cur; \ auxinfo_t aux; \ \ /* Assumptions/assertions: rs_a == 1 cs_a == PACKNR pd_a == NR ps_a == stride to next micro-panel of A rs_b == PACKMR cs_b == 1 pd_b == MR ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) Note that MR/NR and PACKMR/PACKNR have been swapped to reflect the swapping of values in the control tree (ie: those values used when packing). This swapping is needed since we cast right-hand trsm in terms of transposed left-hand trsm. 
So, if we're going to be transposing the operation, then A needs to be packed with NR and B needs to be packed with MR (remember: B is the triangular matrix in the right-hand side parameter case). */ \ \ /* Safety trap: Certain indexing within this macro-kernel does not work as intended if both MR and NR are odd. */ \ if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ \ /* If any dimension is zero, return immediately. */ \ if ( bli_zero_dim3( m, n, k ) ) return; \ \ /* Safeguard: If the current panel of B is entirely below its diagonal, it is implicitly zero. So we do nothing. */ \ if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ \ /* Compute k_full as k inflated up to a multiple of NR. This is needed because some parameter combinations of trsm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of B (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need this unreduced value of k. */ \ k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ \ /* Compute indexing scaling factor for for 4m or 3m. This is needed because one of the packing register blocksizes (PACKMR or PACKNR) is used to index into the micro-panels of the non- triangular matrix when computing with a diagonal-intersecting micro-panel of the triangular matrix. In the case of 4m or 3m, real values are stored in both sub-panels, and so the indexing needs to occur in units of real values. The value computed here is divided into the complex pointer offset to cause the pointer to be advanced by the correct value. */ \ if ( bli_is_4mi_packed( schema_b ) || \ bli_is_3mi_packed( schema_b ) || \ bli_is_rih_packed( schema_b ) ) off_scl = 2; \ else off_scl = 1; \ \ /* Compute the storage stride scaling. Usually this is just 1. However, in the case of interleaved 3m, we need to scale the offset by 3/2. 
Note that real-only, imag-only, and summed-only packing formats are not applicable here since trsm is a two- operand operation only (unlike trmm, which is capable of three- operand). */ \ if ( bli_is_3mi_packed( schema_b ) ) { ss_b_num = 3; ss_b_den = 2; } \ else { ss_b_num = 1; ss_b_den = 1; } \ \ /* If there is a zero region to the left of where the diagonal of B intersects the top edge of the panel, adjust the pointer to C and treat this case as if the diagonal offset were zero. This skips over the region that was not packed. (Note we assume the diagonal offset is a multiple of MR; this assumption will hold as long as the cache blocksizes are each a multiple of MR and NR.) */ \ if ( diagoffb > 0 ) \ { \ j = diagoffb; \ n = n - j; \ diagoffb = 0; \ c_cast = c_cast + (j )*cs_c; \ } \ \ /* If there is a zero region below where the diagonal of B intersects the right side of the block, shrink it to prevent "no-op" iterations from executing. */ \ if ( -diagoffb + n < k ) \ { \ k = -diagoffb + n; \ } \ \ /* Check the k dimension, which needs to be a multiple of NR. If k isn't a multiple of NR, we adjust it higher to satisfy the micro- kernel, which is expecting to perform an NR x NR triangular solve. This adjustment of k is consistent with what happened when B was packed: all of its bottom/right edges were zero-padded, and furthermore, the panel that stores the bottom-right corner of the matrix has its diagonal extended into the zero-padded region (as identity). This allows the trsm of that bottom-right panel to proceed without producing any infs or NaNs that would infect the "good" values of the corresponding block of A. */ \ if ( k % NR != 0 ) k += NR - ( k % NR ); \ \ /* NOTE: We don't need to check that n is a multiple of PACKNR since we know that the underlying buffer was already allocated to have an n dimension that is a multiple of PACKNR, with the region between the last column and the next multiple of NR zero-padded accordingly. 
*/ \ \ /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ PASTEMAC(ch,set0s_mxn)( MR, NR, \ ct, rs_ct, cs_ct ); \ \ /* Compute number of primary and leftover components of the m and n dimensions. */ \ n_iter = n / NR; \ n_left = n % NR; \ \ m_iter = m / MR; \ m_left = m % MR; \ \ if ( n_left ) ++n_iter; \ if ( m_left ) ++m_iter; \ \ /* Determine some increments used to step through A, B, and C. */ \ rstep_a = ps_a; \ \ cstep_b = ps_b; \ \ rstep_c = rs_c * MR; \ cstep_c = cs_c * NR; \ \ istep_a = PACKMR * k_full; \ istep_b = PACKNR * k; \ \ if ( bli_is_odd( istep_a ) ) istep_a += 1; \ if ( bli_is_odd( istep_b ) ) istep_b += 1; \ \ /* Save the pack schemas of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_schema_a( schema_b, &aux ); \ bli_auxinfo_set_schema_b( schema_a, &aux ); \ \ /* Save the imaginary stride of A to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_is_b( istep_a, &aux ); \ \ b1 = b_cast; \ c1 = c_cast; \ \ /* Loop over the n dimension (NR columns at a time). */ \ for ( j = 0; j < n_iter; ++j ) \ { \ ctype* restrict a1; \ ctype* restrict c11; \ ctype* restrict b01; \ ctype* restrict b11; \ ctype* restrict b2; \ \ diagoffb_j = diagoffb - ( doff_t )j*NR; \ a1 = a_cast; \ c11 = c1; \ \ n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ \ /* Initialize our next panel of B to be the current panel of B. */ \ b2 = b1; \ \ /* If the current panel of B intersects the diagonal, use a special micro-kernel that performs a fused gemm and trsm. If the current panel of B resides above the diagonal, use a a regular gemm micro-kernel. Otherwise, if it is below the diagonal, it was not packed (because it is implicitly zero) and so we do nothing. 
*/ \ if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ { \ /* Determine the offset to and length of the panel that was packed so we can index into the corresponding location in A. */ \ off_b01 = 0; \ k_b0111 = bli_min( k, -diagoffb_j + NR ); \ k_b01 = k_b0111 - NR; \ off_b11 = k_b01; \ \ /* Compute the addresses of the panel B10 and the triangular block B11. */ \ b01 = b1; \ /* b11 = b1 + ( k_b01 * PACKNR ) / off_scl; */ \ b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, off_scl ); \ \ /* Compute the panel stride for the current micro-panel. */ \ is_b_cur = k_b0111 * PACKNR; \ is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ ps_b_cur = ( is_b_cur * ss_b_num ) / ss_b_den; \ \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_is_a( is_b_cur, &aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ if( bli_trsm_my_iter( i, thread ) ){ \ \ ctype* restrict a10; \ ctype* restrict a11; \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the A10 panel and A11 block. */ \ a10 = a1 + ( off_b01 * PACKMR ) / off_scl; \ a11 = a1 + ( off_b11 * PACKMR ) / off_scl; \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ if ( i + bli_thread_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + ps_b_cur; \ if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_next_a( b2, &aux ); \ bli_auxinfo_set_next_b( a2, &aux ); \ \ /* Handle interior and edge cases separately. 
*/ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the fused gemm/trsm micro-kernel. */ \ gemmtrsm_ukr \ ( \ k_b01, \ alpha1_cast, \ b01, \ b11, \ a10, \ a11, \ c11, cs_c, rs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the fused gemm/trsm micro-kernel. */ \ gemmtrsm_ukr \ ( \ k_b01, \ alpha1_cast, \ b01, \ b11, \ a10, \ a11, \ ct, cs_ct, rs_ct, \ &aux, \ cntx \ ); \ \ /* Copy the result to the bottom edge of C. */ \ PASTEMAC(ch,copys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c ); \ } \ } \ \ a1 += rstep_a; \ c11 += rstep_c; \ } \ \ b1 += ps_b_cur; \ } \ else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \ { \ /* Save the 4m1/3m1 imaginary stride of B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_is_a( istep_b, &aux ); \ \ /* Loop over the m dimension (MR rows at a time). */ \ for ( i = 0; i < m_iter; ++i ) \ { \ if( bli_trsm_my_iter( i, thread ) ){ \ \ ctype* restrict a2; \ \ m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ \ /* Compute the addresses of the next panels of A and B. */ \ a2 = a1; \ /*if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) */\ if ( i + bli_thread_num_threads(thread) >= m_iter ) \ { \ a2 = a_cast; \ b2 = b1 + cstep_b; \ if ( bli_is_last_iter( j, n_iter, 0, 1 ) ) \ b2 = b_cast; \ } \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular "A" matrix is actually contained within B. */ \ bli_auxinfo_set_next_a( b2, &aux ); \ bli_auxinfo_set_next_b( a2, &aux ); \ \ /* Handle interior and edge cases separately. */ \ if ( m_cur == MR && n_cur == NR ) \ { \ /* Invoke the gemm micro-kernel. */ \ gemm_ukr \ ( \ k, \ minus_one, \ b1, \ a1, \ alpha2_cast, \ c11, cs_c, rs_c, \ &aux, \ cntx \ ); \ } \ else \ { \ /* Invoke the gemm micro-kernel. 
*/ \ gemm_ukr \ ( \ k, \ minus_one, \ b1, \ a1, \ zero, \ ct, cs_ct, rs_ct, \ &aux, \ cntx \ ); \ \ /* Add the result to the edge of C. */ \ PASTEMAC(ch,xpbys_mxn)( m_cur, n_cur, \ ct, rs_ct, cs_ct, \ alpha2_cast, \ c11, rs_c, cs_c ); \ } \ } \ \ a1 += rstep_a; \ c11 += rstep_c; \ } \ \ b1 += cstep_b; \ } \ \ c1 += cstep_c; \ } \ } INSERT_GENTFUNC_BASIC0( trsm_ru_ker_var2 ) cython-blis-0.9.1/blis/_src/frame/base/000077500000000000000000000000001427272030600176425ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/base/bli_apool.c000066400000000000000000000436421427272030600217570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_apool_init ( apool_t* restrict apool ) { err_t r_val; // NOTE: The apool_t is only used in one place; it is the type used to // define the sba. We've switched to static initialization of the mutex // field to remove one more thing that could possibly go wrong during // library initialization. // Query the mutex from the apool_t. //bli_pthread_mutex_t* restrict mutex = bli_apool_mutex( apool ); // Initialize the mutex. //*mutex = BLIS_PTHREAD_MUTEX_INITIALIZER; //bli_pthread_mutex_init( mutex, NULL ); // We choose to start with: // - an empty pool // - an initial block_ptrs_len of 8 // - a single element in each initial array_t (though this is moot with // num_blocks = 0). const siz_t num_blocks = 0; siz_t block_ptrs_len = 8; const siz_t num_elem = 1; // NOTE: Unlike in the bli_pool API, apool_t allocates block_ptrs as an // array of array_t* instead of an array of pblk_t. Why? We don't need to // track the size of each block, thus we don't need the block_size field // of pblk_t. That leaves only the void* field, and since we know apool_t // will always contain "blocks" that are really array_t structs, we can // make block_ptrs an array of array_t*. // We formally set the block_size and align_size fields of the underlying // pool, even though they won't be queried. (They are used from hard-coded // values in bli_apool_alloc_block().) 
const siz_t block_size = sizeof( array_t ); const siz_t align_size = 64; // Query the underlying pool_t from the apool_t. pool_t* restrict pool = bli_apool_pool( apool ); // Set the default array_t length of the apool_t. bli_apool_set_def_array_len( num_elem, apool ); // ------------------------------------------------------------------------- // Make sure that block_ptrs_len is at least num_blocks. block_ptrs_len = bli_max( block_ptrs_len, num_blocks ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_init(): allocating block_ptrs (length %d): ", ( int )block_ptrs_len ); #endif // Allocate the block_ptrs array. array_t** restrict block_ptrs = bli_malloc_intl( block_ptrs_len * sizeof( array_t* ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_init(): allocating %d array_t.\n", ( int )num_blocks ); fflush( stdout ); #endif // Allocate and initialize each entry in the block_ptrs array. for ( dim_t i = 0; i < num_blocks; ++i ) { // Pass in num_elem so the function knows how many elements to // initially have in each array_t. bli_apool_alloc_block ( num_elem, &(block_ptrs[i]) ); } // NOTE: The semantics of top_index approximate a stack, where a "full" // stack (no blocks checked out) is one where top_index == 0 and an empty // stack (all blocks checked out) one where top_index == num_blocks. // (Here, num_blocks tracks the number of blocks currently allocated as // part of the pool.) This "orientation" of the stack was chosen // intentionally, in contrast to one where top_index == -1 means the // stack is empty and top_index = num_blocks - 1 means the stack is // full. The chosen scheme allows one to conceptualize the stack as a // number line in which blocks are checked out from lowest to highest, // and additional blocks are added at the higher end. // Initialize the pool_t structure. // NOTE: We don't use the malloc_fp and free_fp fields at the apool_t // level. Nevertheless, we set them to NULL. 
bli_pool_set_block_ptrs( block_ptrs, pool ); bli_pool_set_block_ptrs_len( block_ptrs_len, pool ); bli_pool_set_top_index( 0, pool ); bli_pool_set_num_blocks( num_blocks, pool ); bli_pool_set_block_size( block_size, pool ); bli_pool_set_align_size( align_size, pool ); bli_pool_set_malloc_fp( NULL, pool ); bli_pool_set_free_fp( NULL, pool ); } void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ) { err_t r_val; // Since the apool_t is defined as a pool of array_t, we can hard-code // the block_size parameter. const siz_t block_size = sizeof( array_t ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_alloc_block(): allocating array_t: " ); #endif // Allocate the array_t via the bli_fmalloc_align() wrapper, which performs // alignment logic and opaquely saves the original pointer so that it can // be recovered when it's time to free the block. array_t* restrict array = bli_malloc_intl( block_size, &r_val ); // Initialize an array_t struct within the newly allocated memory region. bli_array_init( num_elem, sizeof( pool_t* ), array ); // Save the pointer in the caller's array_t*. *array_p = array; } void bli_apool_free_block ( array_t* restrict array ) { const siz_t num_elem = bli_array_num_elem( array ); pool_t** restrict buf = bli_array_buf( array ); // Step through the array and finalize each pool_t. for ( dim_t i = 0; i < num_elem; ++i ) { pool_t* restrict pool = buf[ i ]; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_free_block(): freeing pool_t %d within array_t.\n", ( int )i ); fflush( stdout ); #endif // Finalize and free the current pool_t, if it was created/allocated. if ( pool != NULL ) { // Finalize the pool. bli_pool_finalize( pool ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_free_block(): pool_t %d: ", ( int )i ); #endif // Free the pool_t struct. bli_free_intl( pool ); } } #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_free_block(): " ); #endif // Free the array buffer. 
bli_array_finalize( array ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_free_block(): freeing array_t: " ); #endif // Free the array. bli_free_intl( array ); } void bli_apool_finalize ( apool_t* restrict apool ) { // NOTE: Since the apool_t's mutex is now initialized statically, we no // longer need to explicitly destroy it. // Query the mutex from the apool_t. //bli_pthread_mutex_t* restrict mutex = bli_apool_mutex( apool ); // Destroy the mutex. //bli_pthread_mutex_destroy( mutex ); // Query the underlying pool_t and mutex from the apool_t. pool_t* restrict pool = bli_apool_pool( apool ); // ---------------------------------------------------------------- // Query the block_ptrs array. array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); // Query the total number of blocks currently allocated. siz_t num_blocks = bli_pool_num_blocks( pool ); // Query the top_index of the pool. siz_t top_index = bli_pool_top_index( pool ); // Sanity check: The top_index should be zero. if ( top_index != 0 ) bli_abort(); // Free the individual blocks (each an array_t) currently in the pool. for ( dim_t i = 0; i < num_blocks; ++i ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_finalize(): freeing array_t %d within apool_t.\n", ( int )i ); fflush( stdout ); #endif bli_apool_free_block( block_ptrs[i] ); } #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_finalize(): freeing block_ptrs (length %d): ", ( int )( bli_pool_block_ptrs_len( pool ) ) ); #endif // Free the block_ptrs array. bli_free_intl( block_ptrs ); } array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ) { // Acquire the apool_t's mutex. bli_apool_lock( apool ); // ---------------------------------------------------------------------------- // NOTE: Unlike with the bli_pool API, we do not need to handle potential // reinitialization since the apool_t's block_size (corresponding to the // size of an array_t struct) will never grow. // If the apool_t is exhausted, add a block (e.g. 
an array_t). if ( bli_apool_is_exhausted( apool ) ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_checkout_block(): apool_t is exhausted; " "growing by 1 array_t.\n" ); fflush( stdout ); #endif bli_apool_grow( 1, apool ); } // At this point, at least one array_t is guaranteed to be available. // Query the underlying pool_t from the apool_t. pool_t* restrict pool = bli_apool_pool( apool ); // Query the block_ptrs array. array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_checkout_array(): checking out array_t %d.\n", ( int )top_index ); fflush( stdout ); #endif // Select the array_t* at top_index to return to the caller. array_t* restrict array = block_ptrs[ top_index ]; // Increment the pool's top_index. bli_pool_set_top_index( top_index + 1, pool ); // ---------------------------------------------------------------------------- // Release the apool_t's mutex. bli_apool_unlock( apool ); // Resize the array_t according to the number of threads specified by the // caller. (We need one element in the array_t per thread.) bli_array_resize( n_threads, array ); // Return the selected array_t*. return array; } void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ) { // Acquire the apool_t's mutex. bli_apool_lock( apool ); // Query the underlying pool_t from the apool_t. pool_t* restrict pool = bli_apool_pool( apool ); // ---------------------------------------------------------------------------- // NOTE: Unlike with the bli_pool API, we do not need to handle potential // freeing of the blocks upon checkin due to the block_size having since // changed due to reinitialization since the apool's block_size will never // change. // Query the block_ptrs array. array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); // Query the top_index of the pool. 
const siz_t top_index = bli_pool_top_index( pool ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_checkin_block(): checking in array_t %d.\n", ( int )top_index - 1 ); fflush( stdout ); #endif // Copy the caller's array_t address to the element at top_index - 1. block_ptrs[ top_index - 1 ] = array; // Decrement the pool's top_index. bli_pool_set_top_index( top_index - 1, pool ); // ---------------------------------------------------------------------------- // Release the apool_t's mutex. bli_apool_unlock( apool ); } pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ) { err_t r_val; // Query the array element corresponding to index. // NOTE: If we knew that the array_t contained elements of size // sizeof( void* ) or sizeof( whatever ), we could return the *value* // stored in the array. But since array_t is general-purpose, it can't // return the element itself. So instead, bli_array_elem() returns the // address of the element in the array. Since the elements that apool_t // stores in the array_t are pool_t*, that means that the function is // actually returning the address of a pool_t*, or pool_t**, hence the // dereferencing below. pool_t** restrict pool_p = bli_array_elem( index, array ); pool_t* pool = *pool_p; // If the element is NULL, then it means a pool_t has not yet been created // and allocated for the given index (thread id). if ( pool == NULL ) { // Settle on the parameters to use when initializing the pool_t for // the current index within the array_t. const siz_t num_blocks = 1; const siz_t block_ptrs_len = 25; const siz_t align_size = 16; const siz_t offset_size = 0; malloc_ft malloc_fp = BLIS_MALLOC_POOL; free_ft free_fp = BLIS_FREE_POOL; // Each small block pool should contain blocks large enough to // accommodate any of the data structures for which they will be // used. 
const siz_t n_sizes = 4; siz_t sizes[4] = { sizeof( cntl_t ), sizeof( packm_params_t ), sizeof( thrcomm_t ), sizeof( thrinfo_t ) }; siz_t block_size = 0; // Find the largest of the sizes above and use that as the block_size // for the pool. for ( dim_t i = 0; i < n_sizes; ++i ) { if ( block_size < sizes[i] ) block_size = sizes[i]; } #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_array_elem(): pool_t for tid %d is NULL; allocating pool_t.\n", ( int )index ); printf( "bli_apool_array_elem(): allocating pool_t: " ); #endif // Allocate the pool_t. pool = bli_malloc_intl( sizeof( pool_t ), &r_val ); // Initialize the pool_t. bli_pool_init ( num_blocks, block_ptrs_len, block_size, align_size, offset_size, malloc_fp, free_fp, pool ); // Update the array element with the address to the new pool_t. // NOTE: We pass in the address of the pool_t* since the bli_array // API is generalized for arbitrarily-sized elements, and therefore // it must always take the address of the data, rather than the // value (which it can only do if the elem size were fixed). bli_array_set_elem( &pool, index, array ); } // The array element is now guaranteed to refer to an allocated and // initialized pool_t. // Return the array element. return pool; } void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool ) { err_t r_val; // If the requested increase is zero, return early. if ( num_blocks_add == 0 ) return; // Query the underlying pool_t from the apool_t. pool_t* restrict pool = bli_apool_pool( apool ); // Query the default initial array length from the apool_t. const siz_t num_elem = bli_apool_def_array_len( apool ); // ---------------------------------------------------------------------------- // Query the allocated length of the block_ptrs array and also the // total number of blocks currently allocated. 
const siz_t block_ptrs_len_cur = bli_pool_block_ptrs_len( pool ); const siz_t num_blocks_cur = bli_pool_num_blocks( pool ); // Compute the total number of allocated blocks that will exist // after we grow the pool. const siz_t num_blocks_new = num_blocks_cur + num_blocks_add; // If adding num_blocks_add new blocks will exceed the current capacity // of the block_ptrs array, we need to first put in place a new (larger) // array. if ( block_ptrs_len_cur < num_blocks_new ) { // To prevent this from happening often, we double the current // length of the block_ptrs array. const siz_t block_ptrs_len_new = 2 * block_ptrs_len_cur; // Query the current block_ptrs array. array_t** restrict block_ptrs_cur = bli_pool_block_ptrs( pool ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_grow(): growing block_ptrs_len (%d -> %d): ", ( int )block_ptrs_len_cur, ( int )block_ptrs_len_new ); #endif // Allocate a new block_ptrs array. array_t** restrict block_ptrs_new = bli_malloc_intl( block_ptrs_len_new * sizeof( array_t* ), &r_val ); // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); // Copy the contents of the old block_ptrs array to the new/resized // array. Notice that we can begin with top_index since all entries // from 0 to top_index-1 have been (and are currently) checked out // to threads. for ( dim_t i = top_index; i < num_blocks_cur; ++i ) { block_ptrs_new[i] = block_ptrs_cur[i]; } #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_grow(): freeing prev block_ptrs: " ); #endif // Free the old block_ptrs array. bli_free_intl( block_ptrs_cur ); // Update the pool_t struct with the new block_ptrs array and // record its allocated length. bli_pool_set_block_ptrs( block_ptrs_new, pool ); bli_pool_set_block_ptrs_len( block_ptrs_len_new, pool ); } // At this point, we are guaranteed to have enough unused elements // in the block_ptrs array to accommodate an additional num_blocks_add // blocks. 
// Query the current block_ptrs array (which was maybe just resized). array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_grow(): growing apool_t (%d -> %d).\n", ( int )num_blocks_cur, ( int )num_blocks_new ); fflush( stdout ); #endif // Allocate the requested additional blocks in the resized array. for ( dim_t i = num_blocks_cur; i < num_blocks_new; ++i ) { bli_apool_alloc_block ( num_elem, &(block_ptrs[i]) ); } // Update the pool_t struct with the new number of allocated blocks. // Notice that top_index remains unchanged, as do the block_size and // align_size fields. bli_pool_set_num_blocks( num_blocks_new, pool ); } cython-blis-0.9.1/blis/_src/frame/base/bli_apool.h000066400000000000000000000067731427272030600217700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_APOOL_H #define BLIS_APOOL_H // -- Locked pool-of-arrays type -- /* typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; */ // apool entry query BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool ) { return &(apool->pool); } BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) { return &(apool->mutex); } BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) { return pool->def_array_len; } BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) { pool_t* restrict pool = bli_apool_pool( apool ); return bli_pool_is_exhausted( pool ); } // apool action BLIS_INLINE void bli_apool_lock( apool_t* apool ) { bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); } BLIS_INLINE void bli_apool_unlock( apool_t* apool ) { bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); } // apool entry modification BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ { pool->def_array_len = def_array_len; } // ----------------------------------------------------------------------------- void bli_apool_init ( apool_t* restrict apool ); void bli_apool_finalize ( apool_t* restrict apool ); array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ); void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ); pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ); void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool 
); void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ); void bli_apool_free_block ( array_t* restrict array ); #endif cython-blis-0.9.1/blis/_src/frame/base/bli_arch.c000066400000000000000000000221251427272030600215530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018-2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef BLIS_CONFIGURETIME_CPUID // NOTE: If you need to make any changes to this cpp branch, it's probably // the case that you also need to modify bli_arch.c, bli_cpuid.c, and // bli_env.c. Don't forget to update these other files as needed! // The BLIS_ENABLE_SYSTEM macro must be defined so that the correct cpp // branch in bli_system.h is processed. (This macro is normally defined in // bli_config.h.) #define BLIS_ENABLE_SYSTEM // Use C-style static inline functions for any static inline functions that // happen to be defined by the headers below. (This macro is normally defined // in bli_config_macro_defs.h.) #define BLIS_INLINE static // Since we're not building a shared library, we can forgo the use of the // BLIS_EXPORT_BLIS annotations by #defining them to be nothing. (This macro // is normally defined in bli_config_macro_defs.h.) #define BLIS_EXPORT_BLIS #include "bli_system.h" #include "bli_type_defs.h" #include "bli_arch.h" #include "bli_cpuid.h" #include "bli_env.h" #else #include "blis.h" #endif // ----------------------------------------------------------------------------- // The arch_t id for the currently running hardware. We initialize to -1, // which will be overwritten upon calling bli_arch_set_id(). static arch_t id = -1; arch_t bli_arch_query_id( void ) { bli_arch_set_id_once(); // Simply return the id that was previously cached. return id; } // ----------------------------------------------------------------------------- // A pthread structure used in pthread_once(). pthread_once() is guaranteed to // execute exactly once among all threads that pass in this control object. 
static bli_pthread_once_t once_id = BLIS_PTHREAD_ONCE_INIT; void bli_arch_set_id_once( void ) { #ifndef BLIS_CONFIGURETIME_CPUID bli_pthread_once( &once_id, bli_arch_set_id ); #endif } // ----------------------------------------------------------------------------- void bli_arch_set_id( void ) { // Check the environment variable BLIS_ARCH_DEBUG to see if the user // requested that we echo the result of the subconfiguration selection. bool do_logging = bli_env_get_var( "BLIS_ARCH_DEBUG", 0 ); bli_arch_set_logging( do_logging ); // Check the environment variable BLIS_ARCH_TYPE to see if the user // requested that we use a specific subconfiguration. dim_t req_id = bli_env_get_var( "BLIS_ARCH_TYPE", -1 ); #ifndef BLIS_CONFIGURETIME_CPUID if ( req_id != -1 ) { // BLIS_ARCH_TYPE was set. Cautiously check whether its value is usable. // If req_id was set to an invalid arch_t value (ie: outside the range // [0,BLIS_NUM_ARCHS-1]), output an error message and abort. if ( bli_error_checking_is_enabled() ) { err_t e_val = bli_check_valid_arch_id( req_id ); bli_check_error_code( e_val ); } // At this point, we know that req_id is in the valid range, but we // don't yet know if it refers to a context that was actually // initialized. Query the address of an internal context data structure // corresponding to req_id. This pointer will be NULL if the associated // subconfig is not available. cntx_t** req_cntx = bli_gks_lookup_id( req_id ); // This function checks the context pointer and aborts with a useful // error message if the pointer is found to be NULL. if ( bli_error_checking_is_enabled() ) { err_t e_val = bli_check_initialized_gks_cntx( req_cntx ); bli_check_error_code( e_val ); } // Finally, we can be confident that req_id (1) is in range and (2) // refers to a context that has been initialized. id = req_id; } else #endif { // BLIS_ARCH_TYPE was unset. Proceed with normal subconfiguration // selection behavior. // Architecture families. 
#if defined BLIS_FAMILY_INTEL64 || \ defined BLIS_FAMILY_AMD64 || \ defined BLIS_FAMILY_X86_64 || \ defined BLIS_FAMILY_ARM64 || \ defined BLIS_FAMILY_ARM32 || \ defined BLIS_FAMILY_X86_64_NO_SKX || \ defined BLIS_FAMILY_X86_64_NO_ZEN2 || \ defined BLIS_FAMILY_X86_64_NO_ZEN3 id = bli_cpuid_query_id(); #endif // Intel microarchitectures. #ifdef BLIS_FAMILY_SKX id = BLIS_ARCH_SKX; #endif #ifdef BLIS_FAMILY_KNL id = BLIS_ARCH_KNL; #endif #ifdef BLIS_FAMILY_KNC id = BLIS_ARCH_KNC; #endif #ifdef BLIS_FAMILY_HASWELL id = BLIS_ARCH_HASWELL; #endif #ifdef BLIS_FAMILY_SANDYBRIDGE id = BLIS_ARCH_SANDYBRIDGE; #endif #ifdef BLIS_FAMILY_PENRYN id = BLIS_ARCH_PENRYN; #endif // AMD microarchitectures. #ifdef BLIS_FAMILY_ZEN3 id = BLIS_ARCH_ZEN3; #endif #ifdef BLIS_FAMILY_ZEN2 id = BLIS_ARCH_ZEN2; #endif #ifdef BLIS_FAMILY_ZEN id = BLIS_ARCH_ZEN; #endif #ifdef BLIS_FAMILY_EXCAVATOR id = BLIS_ARCH_EXCAVATOR; #endif #ifdef BLIS_FAMILY_STEAMROLLER id = BLIS_ARCH_STEAMROLLER; #endif #ifdef BLIS_FAMILY_PILEDRIVER id = BLIS_ARCH_PILEDRIVER; #endif #ifdef BLIS_FAMILY_BULLDOZER id = BLIS_ARCH_BULLDOZER; #endif // ARM microarchitectures. #ifdef BLIS_FAMILY_ARMSVE id = BLIS_ARCH_ARMSVE; #endif #ifdef BLIS_FAMILY_A64FX id = BLIS_ARCH_A64FX; #endif #ifdef BLIS_FAMILY_FIRESTORM id = BLIS_ARCH_FIRESTORM; #endif #ifdef BLIS_FAMILY_THUNDERX2 id = BLIS_ARCH_THUNDERX2; #endif #ifdef BLIS_FAMILY_CORTEXA57 id = BLIS_ARCH_CORTEXA57; #endif #ifdef BLIS_FAMILY_CORTEXA53 id = BLIS_ARCH_CORTEXA53; #endif #ifdef BLIS_FAMILY_CORTEXA15 id = BLIS_ARCH_CORTEXA15; #endif #ifdef BLIS_FAMILY_CORTEXA9 id = BLIS_ARCH_CORTEXA9; #endif // IBM microarchitectures. #ifdef BLIS_FAMILY_POWER10 id = BLIS_ARCH_POWER10; #endif #ifdef BLIS_FAMILY_POWER9 id = BLIS_ARCH_POWER9; #endif #ifdef BLIS_FAMILY_POWER7 id = BLIS_ARCH_POWER7; #endif #ifdef BLIS_FAMILY_BGQ id = BLIS_ARCH_BGQ; #endif // Generic microarchitecture. 
#ifdef BLIS_FAMILY_GENERIC id = BLIS_ARCH_GENERIC; #endif } if ( bli_arch_get_logging() ) fprintf( stderr, "libblis: selecting sub-configuration '%s'.\n", bli_arch_string( id ) ); //printf( "blis_arch_query_id(): id = %u\n", id ); //exit(1); } // ----------------------------------------------------------------------------- // NOTE: This string array must be kept up-to-date with the arch_t // enumeration that is typedef'ed in bli_type_defs.h. That is, the // index order of each string should correspond to the implied/assigned // enum value given to the corresponding BLIS_ARCH_ value. static char* config_name[ BLIS_NUM_ARCHS ] = { "skx", "knl", "knc", "haswell", "sandybridge", "penryn", "zen3", "zen2", "zen", "excavator", "steamroller", "piledriver", "bulldozer", "armsve", "a64fx", "firestorm", "thunderx2", "cortexa57", "cortexa53", "cortexa15", "cortexa9", "power10", "power9", "power7", "bgq", "generic" }; char* bli_arch_string( arch_t id ) { return config_name[ id ]; } // ----------------------------------------------------------------------------- static bool arch_dolog = 0; void bli_arch_set_logging( bool dolog ) { arch_dolog = dolog; } bool bli_arch_get_logging( void ) { return arch_dolog; } void bli_arch_log( char* fmt, ... ) { char prefix[] = "libblis: "; int n_chars = strlen( prefix ) + strlen( fmt ) + 1; if ( bli_arch_get_logging() && fmt ) { char* prefix_fmt = malloc( n_chars ); snprintf( prefix_fmt, n_chars, "%s%s", prefix, fmt ); va_list ap; va_start( ap, fmt ); vfprintf( stderr, prefix_fmt, ap ); va_end( ap ); free( prefix_fmt ); } } cython-blis-0.9.1/blis/_src/frame/base/bli_arch.h000066400000000000000000000037251427272030600215650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_ARCH_H #define BLIS_ARCH_H BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void ); void bli_arch_set_id_once( void ); void bli_arch_set_id( void ); BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id ); void bli_arch_set_logging( bool dolog ); bool bli_arch_get_logging( void ); void bli_arch_log( char*, ... 
); #endif cython-blis-0.9.1/blis/_src/frame/base/bli_array.c000066400000000000000000000151141427272030600217540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" //#define BLIS_ENABLE_MEM_TRACING void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ) { err_t r_val; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_array_init(): allocating array [%d * %d]: ", ( int )num_elem, ( int )elem_size ); #endif // Compute the total size (in bytes) of the array. const size_t array_size = num_elem * elem_size; // Allocate the array buffer. void* restrict buf = bli_malloc_intl( array_size, &r_val ); // Initialize the array elements to zero. THIS IS IMPORANT because // consumer threads will use the NULL-ness of the array elements to // determine if the corresponding block (data structure) needs to be // created/allocated and initialized. memset( buf, 0, array_size ); // Initialize the array_t structure. bli_array_set_buf( buf, array ); bli_array_set_num_elem( num_elem, array ); bli_array_set_elem_size( elem_size, array ); } void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ) { err_t r_val; // Query the number of elements in the array. const siz_t num_elem_prev = bli_array_num_elem( array ); // If the new requested size (number of elements) is less than or equal to // the current size, no action is needed; return early. if ( num_elem_new <= num_elem_prev ) return; // At this point, we know that num_elem_prev < num_elem_new, which means // we need to proceed with the resizing. // Query the size of each element in the array. const siz_t elem_size = bli_array_elem_size( array ); // Compute the total size (in bytes) of the array before and after resizing. const size_t array_size_prev = num_elem_prev * elem_size; const size_t array_size_new = num_elem_new * elem_size; // Query the previous array buffer. void* restrict buf_prev = bli_array_buf( array ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_array_resize(): allocating array [%d * %d]: ", ( int )num_elem_new, ( int )elem_size ); #endif // Allocate a new array buffer. 
char* restrict buf_new = bli_malloc_intl( array_size_new, &r_val ); // Copy the previous array contents to the new array. memcpy( buf_new, buf_prev, array_size_prev ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_array_resize(): freeing array [%d * %d]: ", ( int )num_elem_prev, ( int )elem_size ); #endif // Now that the elements have been copied over to the new buffer, we can // free the previous array buffer. bli_free_intl( buf_prev ); // Initialize the new elements' contents to zero. (Note that we advance // the new buffer address by the size of the previous array so that we // arrive at the first byte of the new segment.) memset( &buf_new[ array_size_prev ], 0, array_size_new - array_size_prev ); // Update the array_t structure. // NOTE: The array elem_size field does not need updating. bli_array_set_buf( buf_new, array ); bli_array_set_num_elem( num_elem_new, array ); } void bli_array_finalize ( array_t* restrict array ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_array_finalize(): freeing buf (length %d): ", ( int )bli_array_num_elem( array ) ); #endif // Query the buffer from the array. void* restrict buf = bli_array_buf( array ); // Free the buffer. bli_free_intl( buf ); } void* bli_array_elem ( const siz_t index, array_t* restrict array ) { // Query the number of elements in the array. const siz_t num_elem = bli_array_num_elem( array ); // Sanity check: disallow access beyond the bounds of the array. if ( num_elem <= index ) bli_abort(); // Query the size of each element in the array. const siz_t elem_size = bli_array_elem_size( array ); // Query the buffer from the array, but store it as a char* so we can use // it to easily perform byte pointer arithmetic. char* restrict buf = bli_array_buf( array ); // Advance the pointer by (index * elem_size) bytes. buf += index * elem_size; // Return the address of the element computed above. 
return ( void* )buf; } void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ) { // Query the size of each element in the array. const siz_t elem_size = bli_array_elem_size( array ); // Query the buffer from the array as a char*. char* restrict buf = bli_array_buf( array ); if ( elem_size == sizeof( void* ) ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_array_set_elem(): elem_size is %d; setting index %d.\n", ( int )elem_size, ( int )index ); fflush( stdout ); #endif // Special case: Handle elem_size = sizeof( void* ) without calling // memcpy(). void** restrict buf_vvp = ( void** )buf; void** restrict elem_vvp = ( void** )elem; buf_vvp[ index ] = *elem_vvp; } else { // General case: Copy the elem_size bytes from elem to buf at the // element index specified by index. memcpy( &buf[ index * elem_size ], elem, ( size_t )elem_size ); } } cython-blis-0.9.1/blis/_src/frame/base/bli_array.h000066400000000000000000000060161427272030600217620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- /* typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; */ // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // ----------------------------------------------------------------------------- void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ); void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ); void bli_array_finalize ( array_t* restrict array ); void* bli_array_elem ( const siz_t index, array_t* restrict array ); void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ); #endif 
cython-blis-0.9.1/blis/_src/frame/base/bli_auxinfo.h000066400000000000000000000070471427272030600223220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_AUXINFO_MACRO_DEFS_H #define BLIS_AUXINFO_MACRO_DEFS_H // auxinfo_t field query BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) { return ai->schema_a; } BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) { return ai->schema_b; } BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) { return ai->a_next; } BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) { return ai->b_next; } BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) { return ai->is_a; } BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) { return ai->is_b; } BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) { return ai->ps_a; } BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) { return ai->ps_b; } BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) { return ai->ukr; } BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) { return ai->params; } // auxinfo_t field modification BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai ) { ai->schema_a = schema; } BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) { ai->schema_b = schema; } BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) { ai->a_next = p; } BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) { ai->b_next = p; } BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; } BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai ) { ai->is_a = is; } BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) { ai->is_b = is; } BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai ) { ai->ps_a = ps; } BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) { ai->ps_b = ps; } BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { ai->ukr = ukr; } BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) { ai->params = params; } #endif 
cython-blis-0.9.1/blis/_src/frame/base/bli_blksz.c000066400000000000000000000245131427272030600217660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ) { err_t r_val; blksz_t* b = bli_malloc_intl( sizeof( blksz_t ), &r_val ); bli_blksz_init_ed ( b, b_s, be_s, b_d, be_d, b_c, be_c, b_z, be_z ); return b; } blksz_t* bli_blksz_create ( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ) { err_t r_val; blksz_t* b = bli_malloc_intl( sizeof( blksz_t ), &r_val ); bli_blksz_init ( b, b_s, b_d, b_c, b_z, be_s, be_d, be_c, be_z ); return b; } void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ) { b->v[BLIS_FLOAT] = b_s; b->v[BLIS_DOUBLE] = b_d; b->v[BLIS_SCOMPLEX] = b_c; b->v[BLIS_DCOMPLEX] = b_z; b->e[BLIS_FLOAT] = be_s; b->e[BLIS_DOUBLE] = be_d; b->e[BLIS_SCOMPLEX] = be_c; b->e[BLIS_DCOMPLEX] = be_z; } void bli_blksz_init ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ) { b->v[BLIS_FLOAT] = b_s; b->v[BLIS_DOUBLE] = b_d; b->v[BLIS_SCOMPLEX] = b_c; b->v[BLIS_DCOMPLEX] = b_z; b->e[BLIS_FLOAT] = be_s; b->e[BLIS_DOUBLE] = be_d; b->e[BLIS_SCOMPLEX] = be_c; b->e[BLIS_DCOMPLEX] = be_z; } void bli_blksz_init_easy ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z ) { b->v[BLIS_FLOAT] = b->e[BLIS_FLOAT] = b_s; b->v[BLIS_DOUBLE] = b->e[BLIS_DOUBLE] = b_d; b->v[BLIS_SCOMPLEX] = b->e[BLIS_SCOMPLEX] = b_c; b->v[BLIS_DCOMPLEX] = b->e[BLIS_DCOMPLEX] = b_z; } void bli_blksz_free ( blksz_t* b ) { bli_free_intl( b ); } // ----------------------------------------------------------------------------- #if 0 void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ) { dim_t blksz_def = bli_blksz_get_def( dt_bs, blksz ); dim_t blksz_max = bli_blksz_get_max( dt_bs, blksz ); dim_t bmult_val = bli_blksz_get_def( dt_bm, bmult ); // If the blocksize multiple is zero, we do nothing. 
if ( bmult_val == 0 ) return; // Round the default and maximum blocksize values down to their // respective nearest multiples of bmult_val. (Notice that we // ignore the "max" entry in the bmult object since that would // correspond to the packing dimension, which plays no role // as a blocksize multiple.) blksz_def = ( blksz_def / bmult_val ) * bmult_val; blksz_max = ( blksz_max / bmult_val ) * bmult_val; // Make sure the new blocksize values are at least the blocksize // multiple. if ( blksz_def == 0 ) blksz_def = bmult_val; if ( blksz_max == 0 ) blksz_max = bmult_val; // Store the new blocksizes back to the object. bli_blksz_set_def( blksz_def, dt_bs, blksz ); bli_blksz_set_max( blksz_max, dt_bs, blksz ); } #endif // ----------------------------------------------------------------------------- void bli_blksz_reduce_def_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ) { dim_t blksz_def = bli_blksz_get_def( dt_bs, blksz ); dim_t bmult_val = bli_blksz_get_def( dt_bm, bmult ); // If the blocksize multiple is zero, we do nothing. if ( bmult_val == 0 ) return; // Round the default and maximum blocksize values down to their // respective nearest multiples of bmult_val. (Notice that we // ignore the "max" entry in the bmult object since that would // correspond to the packing dimension, which plays no role // as a blocksize multiple.) blksz_def = ( blksz_def / bmult_val ) * bmult_val; // Make sure the new blocksize values are at least the blocksize // multiple. if ( blksz_def == 0 ) blksz_def = bmult_val; // Store the new blocksizes back to the object. bli_blksz_set_def( blksz_def, dt_bs, blksz ); } // ----------------------------------------------------------------------------- void bli_blksz_reduce_max_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ) { dim_t blksz_max = bli_blksz_get_max( dt_bs, blksz ); dim_t bmult_val = bli_blksz_get_def( dt_bm, bmult ); // If the blocksize multiple is zero, we do nothing. 
if ( bmult_val == 0 ) return; // Round the blocksize values down to its nearest multiple of // of bmult_val. (Notice that we ignore the "max" entry in the // bmult object since that would correspond to the packing // dimension, which plays no role as a blocksize multiple.) blksz_max = ( blksz_max / bmult_val ) * bmult_val; // Make sure the new blocksize value is at least the blocksize // multiple. if ( blksz_max == 0 ) blksz_max = bmult_val; // Store the new blocksize back to the object. bli_blksz_set_max( blksz_max, dt_bs, blksz ); } // ----------------------------------------------------------------------------- dim_t bli_determine_blocksize ( dir_t direct, dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ) { if ( direct == BLIS_FWD ) return bli_determine_blocksize_f( i, dim, obj, bszid, cntx ); else return bli_determine_blocksize_b( i, dim, obj, bszid, cntx ); } dim_t bli_determine_blocksize_f ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ) { num_t dt; blksz_t* bsize; dim_t b_alg, b_max; dim_t b_use; // Extract the execution datatype and use it to query the corresponding // blocksize and blocksize maximum values from the blksz_t object. dt = bli_obj_exec_dt( obj ); bsize = bli_cntx_get_blksz( bszid, cntx ); b_alg = bli_blksz_get_def( dt, bsize ); b_max = bli_blksz_get_max( dt, bsize ); b_use = bli_determine_blocksize_f_sub( i, dim, b_alg, b_max ); return b_use; } dim_t bli_determine_blocksize_b ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ) { num_t dt; blksz_t* bsize; dim_t b_alg, b_max; dim_t b_use; // Extract the execution datatype and use it to query the corresponding // blocksize and blocksize maximum values from the blksz_t object. 
dt = bli_obj_exec_dt( obj ); bsize = bli_cntx_get_blksz( bszid, cntx ); b_alg = bli_blksz_get_def( dt, bsize ); b_max = bli_blksz_get_max( dt, bsize ); b_use = bli_determine_blocksize_b_sub( i, dim, b_alg, b_max ); return b_use; } dim_t bli_determine_blocksize_f_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ) { dim_t b_now; dim_t dim_left_now; // We assume that this function is being called from an algorithm that // is moving "forward" (ie: top to bottom, left to right, top-left // to bottom-right). // Compute how much of the matrix dimension is left, including the // chunk that will correspond to the blocksize we are computing now. dim_left_now = dim - i; // If the dimension currently remaining is less than the maximum // blocksize, use it instead of the default blocksize b_alg. // Otherwise, use b_alg. if ( dim_left_now <= b_max ) { b_now = dim_left_now; } else { b_now = b_alg; } return b_now; } dim_t bli_determine_blocksize_b_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ) { dim_t b_now; dim_t dim_left_now; dim_t dim_at_edge; // We assume that this function is being called from an algorithm that // is moving "backward" (ie: bottom to top, right to left, bottom-right // to top-left). // Compute how much of the matrix dimension is left, including the // chunk that will correspond to the blocksize we are computing now. dim_left_now = dim - i; // Sanity check: if dim_left_now is zero, then we can return zero // without going any further. if ( dim_left_now == 0 ) return 0; dim_at_edge = dim_left_now % b_alg; // If dim_left_now is a multiple of b_alg, we can safely return b_alg // without going any further. if ( dim_at_edge == 0 ) return b_alg; // If the dimension currently remaining is less than the maximum // blocksize, use it as the chosen blocksize. If this is not the case, // then we know dim_left_now is greater than the maximum blocksize. 
// To determine how much of it we should use for the current blocksize, // we inspect dim_at_edge; if it is smaller than (or equal to) b_max - // b_alg, then we use b_alg + dim_at_edge. Otherwise, dim_at_edge is // greater than b_max - b_alg, in which case we use dim_at_edge. if ( dim_left_now <= b_max ) { b_now = dim_left_now; } else // if ( dim_left_now > b_max ) { if ( dim_at_edge <= b_max - b_alg ) { b_now = b_alg + dim_at_edge; } else // if ( dim_at_edge > b_max - b_alg ) { b_now = dim_at_edge; } } return b_now; } cython-blis-0.9.1/blis/_src/frame/base/bli_blksz.h000066400000000000000000000160561427272030600217760ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // blksz_t query BLIS_INLINE dim_t bli_blksz_get_def ( num_t dt, blksz_t* b ) { return b->v[ dt ]; } BLIS_INLINE dim_t bli_blksz_get_max ( num_t dt, blksz_t* b ) { return b->e[ dt ]; } // blksz_t modification BLIS_INLINE void bli_blksz_set_def ( dim_t val, num_t dt, blksz_t* b ) { b->v[ dt ] = val; } BLIS_INLINE void bli_blksz_set_max ( dim_t val, num_t dt, blksz_t* b ) { b->e[ dt ] = val; } BLIS_INLINE void bli_blksz_copy ( blksz_t* b_src, blksz_t* b_dst ) { *b_dst = *b_src; } BLIS_INLINE void bli_blksz_copy_if_pos ( blksz_t* b_src, blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that // we can skip the ones that are non-positive. 
const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); } BLIS_INLINE void bli_blksz_copy_def_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); bli_blksz_set_def( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_max_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); bli_blksz_set_max( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_scale_def ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_def( dt, b ); bli_blksz_set_def( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_max( dt, b ); bli_blksz_set_max( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_def_max ( 
// (Parameter list and body of bli_blksz_scale_def_max.)
dim_t num, dim_t den, num_t dt, blksz_t* b
     )
{
	// Apply the same num/den scaling to the def value and then to the
	// max value of b for datatype dt.
	bli_blksz_scale_def( num, den, dt, b );
	bli_blksz_scale_max( num, den, dt, b );
}

// -----------------------------------------------------------------------------

// Creation/initialization prototypes. Note the differing parameter order:
// the "_ed" variants interleave ( b_X, be_X ) per datatype, while the
// non-"_ed" variants take all four b_X values first and then all four be_X
// values. (Presumably b_X is the def value and be_X the max value for
// datatype X -- confirm against the definitions.)

BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed
     (
       dim_t b_s, dim_t be_s,
       dim_t b_d, dim_t be_d,
       dim_t b_c, dim_t be_c,
       dim_t b_z, dim_t be_z
     );

BLIS_EXPORT_BLIS blksz_t* bli_blksz_create
     (
       dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z,
       dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
     );

BLIS_EXPORT_BLIS void bli_blksz_init_ed
     (
       blksz_t* b,
       dim_t    b_s, dim_t be_s,
       dim_t    b_d, dim_t be_d,
       dim_t    b_c, dim_t be_c,
       dim_t    b_z, dim_t be_z
     );

BLIS_EXPORT_BLIS void bli_blksz_init
     (
       blksz_t* b,
       dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z,
       dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
     );

// Takes only the four b_X values; no be_X values are passed.
BLIS_EXPORT_BLIS void bli_blksz_init_easy
     (
       blksz_t* b,
       dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z
     );

BLIS_EXPORT_BLIS void bli_blksz_free
     (
       blksz_t* b
     );

// -----------------------------------------------------------------------------

// Disabled prototype (compiled out by the preprocessor).
#if 0
BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to
     (
       num_t dt_bm, blksz_t* bmult,
       num_t dt_bs, blksz_t* blksz
     );
#endif

// The remaining prototypes are not marked BLIS_EXPORT_BLIS.

void bli_blksz_reduce_def_to
     (
       num_t dt_bm, blksz_t* bmult,
       num_t dt_bs, blksz_t* blksz
     );

void bli_blksz_reduce_max_to
     (
       num_t dt_bm, blksz_t* bmult,
       num_t dt_bs, blksz_t* blksz
     );

// -----------------------------------------------------------------------------

// Blocksize determination. The generic entry point takes a dir_t; the
// "_f" and "_b" variants omit it (presumably forward/backward -- confirm).
// The "_sub" variants take the b_alg and b_max values directly instead of
// an object, blocksize id and context.

dim_t bli_determine_blocksize
     (
       dir_t    direct,
       dim_t    i,
       dim_t    dim,
       obj_t*   obj,
       bszid_t  bszid,
       cntx_t*  cntx
     );

dim_t bli_determine_blocksize_f
     (
       dim_t    i,
       dim_t    dim,
       obj_t*   obj,
       bszid_t  bszid,
       cntx_t*  cntx
     );

dim_t bli_determine_blocksize_b
     (
       dim_t    i,
       dim_t    dim,
       obj_t*   obj,
       bszid_t  bszid,
       cntx_t*  cntx
     );

dim_t bli_determine_blocksize_f_sub
     (
       dim_t  i,
       dim_t  dim,
       dim_t  b_alg,
       dim_t  b_max
     );

dim_t bli_determine_blocksize_b_sub
     (
       dim_t  i,
       dim_t  dim,
       dim_t  b_alg,
       dim_t  b_max
     );
cython-blis-0.9.1/blis/_src/frame/base/bli_check.c000066400000000000000000000532121427272030600217140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" // -- General stuff ------------------------------------------------------------ err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ) { if ( code == BLIS_SUCCESS ) return code; if ( BLIS_ERROR_CODE_MAX < code && code < BLIS_ERROR_CODE_MIN ) { bli_print_msg( bli_error_string_for_code( code ), file, line ); bli_abort(); } else { bli_print_msg( bli_error_string_for_code( BLIS_UNDEFINED_ERROR_CODE ), file, line ); bli_abort(); } return code; } err_t bli_check_valid_error_level( errlev_t level ) { err_t e_val = BLIS_SUCCESS; if ( level != BLIS_NO_ERROR_CHECKING && level != BLIS_FULL_ERROR_CHECKING ) e_val = BLIS_INVALID_ERROR_CHECKING_LEVEL; return e_val; } err_t bli_check_null_pointer( void* ptr ) { err_t e_val = BLIS_SUCCESS; if ( ptr == NULL ) e_val = BLIS_NULL_POINTER; return e_val; } // -- Parameter-related checks ------------------------------------------------- err_t bli_check_valid_side( side_t side ) { err_t e_val = BLIS_SUCCESS; if ( side != BLIS_LEFT && side != BLIS_RIGHT /*&& side != BLIS_TOP && side != BLIS_BOTTOM*/ ) e_val = BLIS_INVALID_SIDE; return e_val; } err_t bli_check_valid_uplo( uplo_t uplo ) { err_t e_val = BLIS_SUCCESS; if ( !bli_is_lower( uplo ) && !bli_is_upper( uplo ) ) e_val = BLIS_INVALID_UPLO; return e_val; } err_t bli_check_valid_trans( trans_t trans ) { err_t e_val = BLIS_SUCCESS; if ( trans != BLIS_NO_TRANSPOSE && trans != BLIS_TRANSPOSE && trans != BLIS_CONJ_NO_TRANSPOSE && trans != BLIS_CONJ_TRANSPOSE ) e_val = BLIS_INVALID_TRANS; return e_val; } err_t bli_check_valid_diag( diag_t diag ) { err_t e_val = BLIS_SUCCESS; if ( diag != BLIS_NONUNIT_DIAG && diag != BLIS_UNIT_DIAG ) e_val = BLIS_INVALID_DIAG; return e_val; } err_t bli_check_nonunit_diag( obj_t* a ) { err_t e_val = BLIS_SUCCESS; if ( !bli_obj_has_nonunit_diag( a ) ) e_val = BLIS_EXPECTED_NONUNIT_DIAG; return e_val; } // -- Datatype-related checks -------------------------------------------------- err_t bli_check_valid_datatype( num_t 
dt ) { err_t e_val = BLIS_SUCCESS; if ( dt != BLIS_FLOAT && dt != BLIS_DOUBLE && dt != BLIS_SCOMPLEX && dt != BLIS_DCOMPLEX && dt != BLIS_INT && dt != BLIS_CONSTANT ) e_val = BLIS_INVALID_DATATYPE; return e_val; } err_t bli_check_object_valid_datatype( obj_t* a ) { err_t e_val; num_t dt; dt = bli_obj_dt( a ); e_val = bli_check_valid_datatype( dt ); return e_val; } err_t bli_check_noninteger_datatype( num_t dt ) { err_t e_val = BLIS_SUCCESS; if ( dt == BLIS_INT ) e_val = BLIS_EXPECTED_NONINTEGER_DATATYPE; return e_val; } err_t bli_check_noninteger_object( obj_t* a ) { err_t e_val; num_t dt; dt = bli_obj_dt( a ); e_val = bli_check_noninteger_datatype( dt ); return e_val; } err_t bli_check_nonconstant_datatype( num_t dt ) { err_t e_val = BLIS_SUCCESS; if ( dt == BLIS_CONSTANT ) e_val = BLIS_EXPECTED_NONCONSTANT_DATATYPE; return e_val; } err_t bli_check_nonconstant_object( obj_t* a ) { err_t e_val; num_t dt; dt = bli_obj_dt( a ); e_val = bli_check_nonconstant_datatype( dt ); return e_val; } err_t bli_check_floating_datatype( num_t dt ) { err_t e_val = BLIS_SUCCESS; if ( dt != BLIS_FLOAT && dt != BLIS_DOUBLE && dt != BLIS_SCOMPLEX && dt != BLIS_DCOMPLEX ) e_val = BLIS_EXPECTED_FLOATING_POINT_DATATYPE; return e_val; } err_t bli_check_floating_object( obj_t* a ) { err_t e_val; num_t dt; dt = bli_obj_dt( a ); e_val = bli_check_floating_datatype( dt ); return e_val; } err_t bli_check_real_datatype( num_t dt ) { err_t e_val = BLIS_SUCCESS; if ( dt != BLIS_FLOAT && dt != BLIS_DOUBLE ) e_val = BLIS_EXPECTED_REAL_DATATYPE; return e_val; } err_t bli_check_real_object( obj_t* a ) { err_t e_val; num_t dt; dt = bli_obj_dt( a ); e_val = bli_check_real_datatype( dt ); return e_val; } err_t bli_check_integer_datatype( num_t dt ) { err_t e_val = BLIS_SUCCESS; if ( dt != BLIS_INT ) e_val = BLIS_EXPECTED_INTEGER_DATATYPE; return e_val; } err_t bli_check_integer_object( obj_t* a ) { err_t e_val; num_t dt; dt = bli_obj_dt( a ); e_val = bli_check_integer_datatype( dt ); return e_val; } 
err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ) { err_t e_val = BLIS_SUCCESS; if ( dt_a != BLIS_CONSTANT && dt_b != BLIS_CONSTANT ) if ( dt_a != dt_b ) e_val = BLIS_INCONSISTENT_DATATYPES; return e_val; } err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ) { err_t e_val; num_t dt_a; num_t dt_b; dt_a = bli_obj_dt( a ); dt_b = bli_obj_dt( b ); e_val = bli_check_consistent_datatypes( dt_a, dt_b ); return e_val; } err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ) { err_t e_val = BLIS_SUCCESS; if ( ( dt_c == BLIS_CONSTANT && bli_is_complex( dt_r ) ) || ( dt_c == BLIS_FLOAT && dt_r != BLIS_FLOAT ) || ( dt_c == BLIS_DOUBLE && dt_r != BLIS_DOUBLE ) || ( dt_c == BLIS_SCOMPLEX && dt_r != BLIS_FLOAT ) || ( dt_c == BLIS_DCOMPLEX && dt_r != BLIS_DOUBLE ) ) e_val = BLIS_EXPECTED_REAL_PROJ_OF; return e_val; } err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ) { err_t e_val; num_t dt_c; num_t dt_r; dt_c = bli_obj_dt( c ); dt_r = bli_obj_dt( r ); e_val = bli_check_datatype_real_proj_of( dt_c, dt_r ); return e_val; } err_t bli_check_real_valued_object( obj_t* a ) { err_t e_val = BLIS_SUCCESS; double a_real; double a_imag; bli_getsc( a, &a_real, &a_imag ); if ( a_imag != 0.0 ) e_val = BLIS_EXPECTED_REAL_VALUED_OBJECT; return e_val; } err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ) { err_t e_val = BLIS_SUCCESS; if ( dt_a == BLIS_FLOAT ) { if ( dt_b != BLIS_FLOAT && dt_b != BLIS_SCOMPLEX ) e_val = BLIS_INCONSISTENT_PRECISIONS; } else if ( dt_a == BLIS_DOUBLE ) { if ( dt_b != BLIS_DOUBLE && dt_b != BLIS_DCOMPLEX ) e_val = BLIS_INCONSISTENT_PRECISIONS; } return e_val; } err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ) { err_t e_val; num_t dt_a; num_t dt_b; dt_a = bli_obj_dt( a ); dt_b = bli_obj_dt( b ); e_val = bli_check_consistent_precisions( dt_a, dt_b ); return e_val; } // -- Dimension-related checks ------------------------------------------------- err_t bli_check_conformal_dims( obj_t* a, obj_t* b ) 
{ err_t e_val = BLIS_SUCCESS; dim_t m_a, n_a; dim_t m_b, n_b; m_a = bli_obj_length_after_trans( a ); n_a = bli_obj_width_after_trans( a ); m_b = bli_obj_length_after_trans( b ); n_b = bli_obj_width_after_trans( b ); if ( m_a != m_b || n_a != n_b ) e_val = BLIS_NONCONFORMAL_DIMENSIONS; return e_val; } err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ) { err_t e_val = BLIS_SUCCESS; dim_t m_c, n_c; dim_t m_a, k_a; dim_t k_b, n_b; m_c = bli_obj_length_after_trans( c ); n_c = bli_obj_width_after_trans( c ); m_a = bli_obj_length_after_trans( a ); k_a = bli_obj_width_after_trans( a ); k_b = bli_obj_length_after_trans( b ); n_b = bli_obj_width_after_trans( b ); if ( m_c != m_a || n_c != n_b || k_a != k_b ) e_val = BLIS_NONCONFORMAL_DIMENSIONS; return e_val; } err_t bli_check_scalar_object( obj_t* a ) { err_t e_val = BLIS_SUCCESS; if ( bli_obj_length( a ) < 0 || bli_obj_width( a ) < 0 ) return BLIS_NEGATIVE_DIMENSION; if ( bli_obj_length( a ) != 1 || bli_obj_width( a ) != 1 ) return BLIS_EXPECTED_SCALAR_OBJECT; return e_val; } err_t bli_check_vector_object( obj_t* a ) { err_t e_val = BLIS_SUCCESS; if ( bli_obj_length( a ) < 0 || bli_obj_width( a ) < 0 ) return BLIS_NEGATIVE_DIMENSION; if ( !bli_obj_is_vector( a ) ) return BLIS_EXPECTED_VECTOR_OBJECT; return e_val; } err_t bli_check_matrix_object( obj_t* a ) { err_t e_val = BLIS_SUCCESS; if ( bli_obj_length( a ) < 0 || bli_obj_width( a ) < 0 ) e_val = BLIS_NEGATIVE_DIMENSION; return e_val; } err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ) { err_t e_val = BLIS_SUCCESS; dim_t dim_x; dim_t dim_y; dim_x = bli_obj_vector_dim( x ); dim_y = bli_obj_vector_dim( y ); if ( dim_x != dim_y ) e_val = BLIS_UNEQUAL_VECTOR_LENGTHS; return e_val; } err_t bli_check_square_object( obj_t* a ) { err_t e_val = BLIS_SUCCESS; if ( bli_obj_length( a ) != bli_obj_width( a ) ) e_val = BLIS_EXPECTED_SQUARE_OBJECT; return e_val; } err_t bli_check_object_length_equals( obj_t* a, dim_t m ) { err_t e_val = BLIS_SUCCESS; if ( 
bli_obj_length( a ) != m ) e_val = BLIS_UNEXPECTED_OBJECT_LENGTH; return e_val; } err_t bli_check_object_width_equals( obj_t* a, dim_t n ) { err_t e_val = BLIS_SUCCESS; if ( bli_obj_width( a ) != n ) e_val = BLIS_UNEXPECTED_OBJECT_WIDTH; return e_val; } err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ) { err_t e_val = BLIS_SUCCESS; if ( bli_obj_vector_dim( a ) != n ) e_val = BLIS_UNEXPECTED_VECTOR_DIM; return e_val; } err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ) { err_t e_val = BLIS_SUCCESS; if ( offset != bli_obj_diag_offset( a ) ) e_val = BLIS_UNEXPECTED_DIAG_OFFSET; return e_val; } // -- Stride-related checks ---------------------------------------------------- err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ) { err_t e_val = BLIS_SUCCESS; // Note: A lot of thought went into designing these checks. Do NOT change // them unless you absolutely know what you are doing! Particularly, do // not try to merge the general and row-/column-major sections. It might // be possible, but it would be a lot less readable. // Prohibit negative dimensions. if ( m < 0 || n < 0 ) return BLIS_NEGATIVE_DIMENSION; // Overwrite rs and cs with the absolute value of each. We can do this // since the checks below are not dependent on the sign of the strides. rs = bli_abs( rs ); cs = bli_abs( cs ); is = bli_abs( is ); // The default case (whereby we interpret rs == cs == 0 as a request for // column-major order) is handled prior to calling this function, so the // only time we should see zero strides here is if the matrix is empty. if ( m == 0 || n == 0 ) return e_val; // Disallow row, column, or imaginary strides of zero. if ( ( rs == 0 || cs == 0 || is == 0 ) ) return BLIS_INVALID_DIM_STRIDE_COMBINATION; // Check stride consistency in cases of general stride. if ( rs != 1 && cs != 1 ) { // We apply different tests depending on which way the strides // "tilt". 
if ( rs == cs )
		{
			// If rs == cs, then we must be dealing with an m-by-1 or a
			// 1-by-n matrix and thus at least one of the dimensions, m
			// or n, must be unit. (Empty matrices returned earlier, so
			// neither m nor n is zero at this point.)
			if ( m != 1 && n != 1 )
				return BLIS_INVALID_DIM_STRIDE_COMBINATION;
		}
		else if ( rs < cs )
		{
			// For column-major tilt, cs must be equal or larger than m * rs,
			// i.e. one column of m elements spaced rs apart must fit within
			// a single column stride. rs and cs are absolute values here.
			if ( m * rs > cs )
				return BLIS_INVALID_DIM_STRIDE_COMBINATION;
		}
		else if ( cs < rs )
		{
			// For row-major tilt, rs must be equal or larger than n * cs,
			// i.e. one row of n elements spaced cs apart must fit within
			// a single row stride.
			if ( n * cs > rs )
				return BLIS_INVALID_DIM_STRIDE_COMBINATION;
		}
	}
	else // check stride consistency of row-/column-storage cases.
	{
		// At least one of rs, cs is unit in this branch.
		if ( rs == 1 && cs == 1 )
		{
			// If rs == cs == 1, then we must be dealing with an m-by-1, a
			// 1-by-n, or a 1-by-1 matrix and thus at least one of the
			// dimensions, m or n, must be unit. (Empty matrices returned
			// earlier, so neither m nor n is zero at this point.)
			if ( m != 1 && n != 1 )
				return BLIS_INVALID_DIM_STRIDE_COMBINATION;
		}
		else if ( rs == 1 )
		{
			// For column-major storage, don't allow the column stride to be
			// less than the m dimension.
			if ( cs < m )
				return BLIS_INVALID_COL_STRIDE;
		}
		else if ( cs == 1 )
		{
			// For row-major storage, don't allow the row stride to be less
			// than the n dimension.
if ( rs < n ) return BLIS_INVALID_ROW_STRIDE; } } return e_val; } // -- Structure-related checks ------------------------------------------------- err_t bli_check_general_object( obj_t* a ) { err_t e_val = BLIS_SUCCESS; if ( !bli_obj_is_general( a ) ) e_val = BLIS_EXPECTED_GENERAL_OBJECT; return e_val; } err_t bli_check_hermitian_object( obj_t* a ) { err_t e_val = BLIS_SUCCESS; if ( !bli_obj_is_hermitian( a ) ) e_val = BLIS_EXPECTED_HERMITIAN_OBJECT; return e_val; } err_t bli_check_symmetric_object( obj_t* a ) { err_t e_val = BLIS_SUCCESS; if ( !bli_obj_is_symmetric( a ) ) e_val = BLIS_EXPECTED_SYMMETRIC_OBJECT; return e_val; } err_t bli_check_triangular_object( obj_t* a ) { err_t e_val = BLIS_SUCCESS; if ( !bli_obj_is_triangular( a ) ) e_val = BLIS_EXPECTED_TRIANGULAR_OBJECT; return e_val; } err_t bli_check_object_struc( obj_t* a, struc_t struc ) { err_t e_val = BLIS_SUCCESS; if ( bli_is_general( struc ) ) e_val = bli_check_general_object( a ); else if ( bli_is_hermitian( struc ) ) e_val = bli_check_hermitian_object( a ); else if ( bli_is_symmetric( struc ) ) e_val = bli_check_symmetric_object( a ); else if ( bli_is_triangular( struc ) ) e_val = bli_check_triangular_object( a ); return e_val; } // -- Storage-related checks --------------------------------------------------- err_t bli_check_upper_or_lower_object( obj_t* a ) { err_t e_val = BLIS_SUCCESS; if ( !bli_obj_is_lower( a ) && !bli_obj_is_upper( a ) ) e_val = BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT; return e_val; } // -- Partitioning-related checks ---------------------------------------------- err_t bli_check_valid_3x1_subpart( subpart_t part ) { err_t e_val = BLIS_SUCCESS; if ( part != BLIS_SUBPART0 && part != BLIS_SUBPART1AND0 && part != BLIS_SUBPART1 && part != BLIS_SUBPART1AND2 && part != BLIS_SUBPART2 && part != BLIS_SUBPART1A && part != BLIS_SUBPART1B ) e_val = BLIS_INVALID_3x1_SUBPART; return e_val; } err_t bli_check_valid_1x3_subpart( subpart_t part ) { err_t e_val = BLIS_SUCCESS; if ( part != 
BLIS_SUBPART0 && part != BLIS_SUBPART1AND0 && part != BLIS_SUBPART1 && part != BLIS_SUBPART1AND2 && part != BLIS_SUBPART2 && part != BLIS_SUBPART1A && part != BLIS_SUBPART1B ) e_val = BLIS_INVALID_1x3_SUBPART; return e_val; } err_t bli_check_valid_3x3_subpart( subpart_t part ) { err_t e_val = BLIS_SUCCESS; if ( part != BLIS_SUBPART00 && part != BLIS_SUBPART10 && part != BLIS_SUBPART20 && part != BLIS_SUBPART01 && part != BLIS_SUBPART11 && part != BLIS_SUBPART21 && part != BLIS_SUBPART02 && part != BLIS_SUBPART12 && part != BLIS_SUBPART22 ) e_val = BLIS_INVALID_3x3_SUBPART; return e_val; } // -- Control tree-related checks ---------------------------------------------- err_t bli_check_valid_cntl( void* cntl ) { err_t e_val = BLIS_SUCCESS; if ( cntl == NULL ) e_val = BLIS_UNEXPECTED_NULL_CONTROL_TREE; return e_val; } // -- Packing-related checks --------------------------------------------------- err_t bli_check_packm_schema_on_unpack( obj_t* a ) { err_t e_val = BLIS_SUCCESS; if ( bli_obj_pack_schema( a ) != BLIS_PACKED_ROWS && bli_obj_pack_schema( a ) != BLIS_PACKED_COLUMNS && bli_obj_pack_schema( a ) != BLIS_PACKED_ROW_PANELS && bli_obj_pack_schema( a ) != BLIS_PACKED_COL_PANELS ) e_val = BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK; return e_val; } err_t bli_check_packv_schema_on_unpack( obj_t* a ) { err_t e_val = BLIS_SUCCESS; if ( bli_obj_pack_schema( a ) != BLIS_PACKED_VECTOR ) e_val = BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK; return e_val; } // -- Buffer-related checks ---------------------------------------------------- err_t bli_check_object_buffer( obj_t* a ) { err_t e_val = BLIS_SUCCESS; // We are only concerned with NULL buffers in objects where BOTH // dimensions are non-zero. 
if ( bli_obj_buffer( a ) == NULL ) if ( bli_obj_length( a ) > 0 && bli_obj_width( a ) > 0 ) e_val = BLIS_EXPECTED_NONNULL_OBJECT_BUFFER; return e_val; } // -- Memory checks ------------------------------------------------------------ err_t bli_check_valid_malloc_buf( void* ptr ) { err_t e_val = BLIS_SUCCESS; if ( ptr == NULL ) e_val = BLIS_MALLOC_RETURNED_NULL; return e_val; } // -- Internal memory pool checks ---------------------------------------------- err_t bli_check_valid_packbuf( packbuf_t buf_type ) { err_t e_val = BLIS_SUCCESS; if ( buf_type != BLIS_BUFFER_FOR_A_BLOCK && buf_type != BLIS_BUFFER_FOR_B_PANEL && buf_type != BLIS_BUFFER_FOR_C_PANEL && buf_type != BLIS_BUFFER_FOR_GEN_USE ) e_val = BLIS_INVALID_PACKBUF; return e_val; } err_t bli_check_if_exhausted_pool( pool_t* pool ) { err_t e_val = BLIS_SUCCESS; if ( bli_pool_is_exhausted( pool ) ) e_val = BLIS_EXHAUSTED_CONTIG_MEMORY_POOL; return e_val; } err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ) { err_t e_val = BLIS_SUCCESS; num_t dt; for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) { dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); siz_t dt_size = bli_dt_size( dt ); // NOTE: For induced methods, we use the size of the complex datatypes // (rather than the size of the native micro-kernels' datatype) because // the macro-kernel needs this larger micro-tile footprint, even if the // virtual micro-kernel implementation will only ever be writing to half // of it (real or imaginary part) at a time. if ( mr * nr * dt_size > BLIS_STACK_BUF_MAX_SIZE ) e_val = BLIS_INSUFFICIENT_STACK_BUF_SIZE; } return e_val; } err_t bli_check_alignment_is_power_of_two( size_t align_size ) { err_t e_val = BLIS_SUCCESS; // This function returns an error code if align_size is zero or not // a power of two. 
if ( align_size == 0 ) e_val = BLIS_ALIGNMENT_NOT_POWER_OF_TWO; else if ( ( align_size & ( align_size - 1 ) ) ) e_val = BLIS_ALIGNMENT_NOT_POWER_OF_TWO; return e_val; } err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ) { err_t e_val = BLIS_SUCCESS; // This function returns an error code if align_size is not a whole // multiple of the size of a pointer. if ( align_size % sizeof( void* ) != 0 ) e_val = BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE; return e_val; } // -- Object-related errors ---------------------------------------------------- err_t bli_check_object_alias_of( obj_t* a, obj_t* b ) { err_t e_val = BLIS_SUCCESS; if ( !bli_obj_is_alias_of( a, b ) ) e_val = BLIS_EXPECTED_OBJECT_ALIAS; return e_val; } // -- Architecture-related errors ---------------------------------------------- err_t bli_check_valid_arch_id( arch_t id ) { err_t e_val = BLIS_SUCCESS; if ( ( gint_t )id < 0 || BLIS_NUM_ARCHS <= ( gint_t )id ) e_val = BLIS_INVALID_ARCH_ID; return e_val; } err_t bli_check_initialized_gks_cntx( cntx_t** cntx ) { err_t e_val = BLIS_SUCCESS; if ( cntx == NULL ) e_val = BLIS_UNINITIALIZED_GKS_CNTX; return e_val; } // -- Architecture-related errors ---------------------------------------------- err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ) { num_t dt; for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) { dim_t mc_def_dt = bli_blksz_get_def( dt, mc ); dim_t mc_max_dt = bli_blksz_get_max( dt, mc ); dim_t mr_dt = bli_blksz_get_def( dt, mr ); if ( mc_def_dt % mr_dt != 0 ) return BLIS_MC_DEF_NONMULTIPLE_OF_MR; else if ( mc_max_dt % mr_dt != 0 ) return BLIS_MC_MAX_NONMULTIPLE_OF_MR; } return BLIS_SUCCESS; } err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ) { num_t dt; for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) { dim_t nc_def_dt = bli_blksz_get_def( dt, nc ); dim_t nc_max_dt = bli_blksz_get_max( dt, nc ); dim_t nr_dt = bli_blksz_get_def( dt, nr ); if ( nc_def_dt % nr_dt != 0 ) return BLIS_NC_DEF_NONMULTIPLE_OF_NR; else if ( nc_max_dt 
% nr_dt != 0 ) return BLIS_NC_MAX_NONMULTIPLE_OF_NR; } return BLIS_SUCCESS; } err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ) { num_t dt; for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) { dim_t kc_def_dt = bli_blksz_get_def( dt, kc ); dim_t kc_max_dt = bli_blksz_get_max( dt, kc ); dim_t kr_dt = bli_blksz_get_def( dt, kr ); if ( kc_def_dt % kr_dt != 0 ) return BLIS_KC_DEF_NONMULTIPLE_OF_KR; else if ( kc_max_dt % kr_dt != 0 ) return BLIS_KC_MAX_NONMULTIPLE_OF_KR; } return BLIS_SUCCESS; } cython-blis-0.9.1/blis/_src/frame/base/bli_check.h000066400000000000000000000120301427272030600217120ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( 
obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); err_t bli_check_valid_cntl( void* cntl ); err_t bli_check_packm_schema_on_unpack( obj_t* a ); err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); 
cython-blis-0.9.1/blis/_src/frame/base/bli_clock.c000066400000000000000000000110121427272030600217220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" static double gtod_ref_time_sec = 0.0; double bli_clock( void ) { return bli_clock_helper(); } double bli_clock_min_diff( double time_min, double time_start ) { double time_min_prev; double time_diff; // Save the old value. 
time_min_prev = time_min; time_diff = bli_clock() - time_start; time_min = bli_fmin( time_min, time_diff ); // Assume that anything: // - under or equal to zero, // - under a nanosecond // is actually garbled due to the clocks being taken too closely together. if ( time_min <= 0.0 ) time_min = time_min_prev; else if ( time_min < 1.0e-9 ) time_min = time_min_prev; return time_min; } #ifdef BLIS_DISABLE_SYSTEM // --- Begin systemless definitions -------------------------------------------- double bli_clock_helper() { return 0.0; } // --- End systemless definitions ---------------------------------------------- #else // --- Begin system definitions ------------------------------------------------ #if BLIS_OS_WINDOWS // --- Begin Windows build definitions ----------------------------------------- double bli_clock_helper() { LARGE_INTEGER clock_freq = {0}; LARGE_INTEGER clock_val; BOOL r_val; r_val = QueryPerformanceFrequency( &clock_freq ); if ( r_val == 0 ) { bli_print_msg( "QueryPerformanceFrequency() failed", __FILE__, __LINE__ ); bli_abort(); } r_val = QueryPerformanceCounter( &clock_val ); if ( r_val == 0 ) { bli_print_msg( "QueryPerformanceCounter() failed", __FILE__, __LINE__ ); bli_abort(); } return ( ( double) clock_val.QuadPart / ( double) clock_freq.QuadPart ); } // --- End Windows build definitions ------------------------------------------- #elif BLIS_OS_OSX // --- Begin OSX build definitions ------------------------------------------- double bli_clock_helper() { mach_timebase_info_data_t timebase; mach_timebase_info( &timebase ); uint64_t nsec = mach_absolute_time(); double the_time = (double) nsec * 1.0e-9 * timebase.numer / timebase.denom; if ( gtod_ref_time_sec == 0.0 ) gtod_ref_time_sec = the_time; return the_time - gtod_ref_time_sec; } // --- End OSX build definitions --------------------------------------------- #else // --- Begin Linux build definitions ------------------------------------------- double bli_clock_helper() { double the_time, 
norm_sec; struct timespec ts; clock_gettime( CLOCK_MONOTONIC, &ts ); if ( gtod_ref_time_sec == 0.0 ) gtod_ref_time_sec = ( double ) ts.tv_sec; norm_sec = ( double ) ts.tv_sec - gtod_ref_time_sec; the_time = norm_sec + ts.tv_nsec * 1.0e-9; return the_time; } // --- End Linux build definitions --------------------------------------------- #endif // --- End system definitions -------------------------------------------------- #endif cython-blis-0.9.1/blis/_src/frame/base/bli_clock.h000066400000000000000000000034511427272030600217370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); cython-blis-0.9.1/blis/_src/frame/base/bli_cntl.c000066400000000000000000000256001427272030600215770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" cntl_t* bli_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, void* params, cntl_t* sub_node ) { cntl_t* cntl; mem_t* pack_mem; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntl_create_node(): " ); #endif // Allocate the cntl_t struct. cntl = bli_sba_acquire( rntm, sizeof( cntl_t ) ); bli_cntl_set_family( family, cntl ); bli_cntl_set_bszid( bszid, cntl ); bli_cntl_set_var_func( var_func, cntl ); bli_cntl_set_params( params, cntl ); bli_cntl_set_sub_prenode( NULL, cntl ); bli_cntl_set_sub_node( sub_node, cntl ); // Query the address of the node's packed mem_t entry so we can initialize // key fields (to NULL or 0). // NOTE: This initialization is important, since it allows threads to // discern whether blocks have been acquired from the memory allocator. pack_mem = bli_cntl_pack_mem( cntl ); bli_mem_clear( pack_mem ); return cntl; } void bli_cntl_free_node ( rntm_t* rntm, cntl_t* cntl ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntl_free_node(): " ); #endif bli_sba_release( rntm, cntl ); } void bli_cntl_clear_node ( cntl_t* cntl ) { mem_t* pack_mem; // Clear various fields in the control tree. Clearing these fields // actually is not needed, but we do it for debugging/completeness. 
bli_cntl_set_var_func( NULL, cntl ); bli_cntl_set_params( NULL, cntl ); bli_cntl_set_sub_prenode( NULL, cntl ); bli_cntl_set_sub_node( NULL, cntl ); // Clearing these fields is potentially more important if the control // tree is cached somewhere and reused. pack_mem = bli_cntl_pack_mem( cntl ); bli_mem_clear( pack_mem ); } // ----------------------------------------------------------------------------- void bli_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { if ( thread != NULL ) bli_cntl_free_w_thrinfo( rntm, cntl, thread ); else bli_cntl_free_wo_thrinfo( rntm, cntl ); } void bli_cntl_free_w_thrinfo ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { // Base case: simply return when asked to free NULL nodes. if ( cntl == NULL ) return; cntl_t* cntl_sub_prenode = bli_cntl_sub_prenode( cntl ); cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl ); void* cntl_params = bli_cntl_params( cntl ); mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl ); // Don't immediately dereference the prenode and subnode of the thrinfo_t // node. In some cases, the thrinfo_t tree is not built out all the way, // perhaps because there are more ways of parallelization than micropanels // of data in this dimension, or because the problem is small enough that // there is no gemm subproblem in bli_trsm_blk_var1(). Thus, we start with // NULL values for these variables and only dereference the fields of the // thrinfo_t struct if the thrinfo_t exists (ie: is non-NULL). We will also // have to check the thrinfo_t pointer for NULLness before using it below, // when checking if we need to free the pack_mem field of the cntl_t node // (see below). thrinfo_t* thread_sub_prenode = NULL; thrinfo_t* thread_sub_node = NULL; if ( thread != NULL ) { thread_sub_prenode = bli_thrinfo_sub_prenode( thread ); thread_sub_node = bli_thrinfo_sub_node( thread ); } // Only recurse into prenode branch if it exists. 
if ( cntl_sub_prenode != NULL ) { // Recursively free all memory associated with the sub-prenode and its // children. bli_cntl_free_w_thrinfo( rntm, cntl_sub_prenode, thread_sub_prenode ); } // Only recurse into the child node if it exists. if ( cntl_sub_node != NULL ) { // Recursively free all memory associated with the sub-node and its // children. bli_cntl_free_w_thrinfo( rntm, cntl_sub_node, thread_sub_node ); } // Free the current node's params field, if it is non-NULL. if ( cntl_params != NULL ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntl_free_w_thrinfo(): " ); #endif bli_sba_release( rntm, cntl_params ); } // Release the current node's pack mem_t entry back to the memory // broker from which it originated, but only if the mem_t entry is // allocated, and only if the current thread is chief for its group. // Also note that we don't proceed with either of the above tests if // the thrinfo_t pointer is NULL. (See above for background on when // this can happen.) if ( thread != NULL ) if ( bli_thread_am_ochief( thread ) ) if ( bli_mem_is_alloc( cntl_pack_mem ) ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntl_free_w_thrinfo(): releasing mem pool block.\n" ); #endif bli_pba_release( rntm, cntl_pack_mem ); } // Free the current node. bli_cntl_free_node( rntm, cntl ); } void bli_cntl_free_wo_thrinfo ( rntm_t* rntm, cntl_t* cntl ) { // Base case: simply return when asked to free NULL nodes. if ( cntl == NULL ) return; cntl_t* cntl_sub_prenode = bli_cntl_sub_prenode( cntl ); cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl ); void* cntl_params = bli_cntl_params( cntl ); mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl ); { // Recursively free all memory associated with the sub-prenode and its // children. bli_cntl_free_wo_thrinfo( rntm, cntl_sub_prenode ); } { // Recursively free all memory associated with the sub-node and its // children. bli_cntl_free_wo_thrinfo( rntm, cntl_sub_node ); } // Free the current node's params field, if it is non-NULL. 
if ( cntl_params != NULL ) { bli_sba_release( rntm, cntl_params ); } // Release the current node's pack mem_t entry back to the memory // broker from which it originated, but only if the mem_t entry is // allocated. if ( bli_mem_is_alloc( cntl_pack_mem ) ) { bli_pba_release( rntm, cntl_pack_mem ); } // Free the current node. bli_cntl_free_node( rntm, cntl ); } // ----------------------------------------------------------------------------- cntl_t* bli_cntl_copy ( rntm_t* rntm, cntl_t* cntl ) { // Make a copy of the current node. Notice that the source node // should NOT have any allocated/cached mem_t entries, and that // bli_cntl_create_node() creates a node with a cleared mem_t // field. cntl_t* cntl_copy = bli_cntl_create_node ( rntm, bli_cntl_family( cntl ), bli_cntl_bszid( cntl ), bli_cntl_var_func( cntl ), NULL, NULL ); // Check the params field of the existing control tree; if it's non-NULL, // copy it. if ( bli_cntl_params( cntl ) != NULL ) { // Detect the size of the params struct by reading the first field // as a uint64_t, and then allocate this many bytes for a new params // struct. uint64_t params_size = bli_cntl_params_size( cntl ); void* params_orig = bli_cntl_params( cntl ); void* params_copy = bli_sba_acquire( rntm, ( size_t )params_size ); // Copy the original params struct to the new memory region. memcpy( params_copy, params_orig, params_size ); // Save the address of the new params struct into the new control // tree node. bli_cntl_set_params( params_copy, cntl_copy ); } // If the sub-prenode exists, copy it recursively. if ( bli_cntl_sub_prenode( cntl ) != NULL ) { cntl_t* sub_prenode_copy = bli_cntl_copy ( rntm, bli_cntl_sub_prenode( cntl ) ); // Save the address of the new sub-node (sub-tree) to the existing // node. bli_cntl_set_sub_prenode( sub_prenode_copy, cntl_copy ); } // If the sub-node exists, copy it recursively. 
if ( bli_cntl_sub_node( cntl ) != NULL ) { cntl_t* sub_node_copy = bli_cntl_copy ( rntm, bli_cntl_sub_node( cntl ) ); // Save the address of the new sub-node (sub-tree) to the existing // node. bli_cntl_set_sub_node( sub_node_copy, cntl_copy ); } // Return the address of the newly created node. return cntl_copy; } void bli_cntl_mark_family ( opid_t family, cntl_t* cntl ) { // This function sets the family field of all cntl tree nodes that are // children of cntl. It's used by bli_l3_cntl_create_if() after making // a copy of a user-given cntl tree, if the user provided one, to mark // the operation family, which is used to determine appropriate behavior // by various functions when executing the blocked variants. // Set the family of the root node. bli_cntl_set_family( family, cntl ); // Recursively set the family field of the sub-tree rooted at the sub-node, // if it exists. if ( bli_cntl_sub_prenode( cntl ) != NULL ) { bli_cntl_mark_family( family, bli_cntl_sub_prenode( cntl ) ); } // Recursively set the family field of the sub-tree rooted at the prenode, // if it exists. if ( bli_cntl_sub_node( cntl ) != NULL ) { bli_cntl_mark_family( family, bli_cntl_sub_node( cntl ) ); } } // ----------------------------------------------------------------------------- dim_t bli_cntl_calc_num_threads_in ( rntm_t* rntm, cntl_t* cntl ) { dim_t n_threads_in = 1; for ( ; cntl != NULL; cntl = bli_cntl_sub_node( cntl ) ) { bszid_t bszid = bli_cntl_bszid( cntl ); dim_t cur_way; // We assume bszid is in {NC,KC,MC,NR,MR,KR} if it is not // BLIS_NO_PART. if ( bszid != BLIS_NO_PART ) cur_way = bli_rntm_ways_for( bszid, rntm ); else cur_way = 1; n_threads_in *= cur_way; } return n_threads_in; } cython-blis-0.9.1/blis/_src/frame/base/bli_cntl.h000066400000000000000000000125221427272030600216030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/


/*
// -- Control tree node definition --

struct cntl_s
{
	// Basic fields (usually required).
	opid_t         family;
	bszid_t        bszid;
	void_fp        var_func;
	struct cntl_s* sub_prenode;
	struct cntl_s* sub_node;

	// Optional fields (needed only by some operations such as packm).
	// NOTE: first field of params must be a uint64_t containing the size
	// of the struct.
	void*          params;

	// Internal fields that track "cached" data.
	mem_t          pack_mem;
};
typedef struct cntl_s cntl_t;
*/


// -- Control tree prototypes --

// Allocate a node and initialize its fields from the arguments.
BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node
     (
       rntm_t* rntm,
       opid_t  family,
       bszid_t bszid,
       void_fp var_func,
       void*   params,
       cntl_t* sub_node
     );

// Release a single node (not its children).
BLIS_EXPORT_BLIS void bli_cntl_free_node
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

// Reset a node's fields without freeing anything.
BLIS_EXPORT_BLIS void bli_cntl_clear_node
     (
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// Free a whole control tree; takes a thrinfo_t pointer that may be NULL.
BLIS_EXPORT_BLIS void bli_cntl_free
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo
     (
       rntm_t*    rntm,
       cntl_t*    cntl
     );

// Return a copy of a control tree.
BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

// Set the family field throughout a control tree.
BLIS_EXPORT_BLIS void bli_cntl_mark_family
     (
       opid_t  family,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// NOTE: Unlike the prototypes above, this one is not marked
// BLIS_EXPORT_BLIS.
dim_t bli_cntl_calc_num_threads_in
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// cntl_t query (fields only)
// Each accessor dereferences cntl without checking it for NULL.

BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl )
{
	return cntl->family;
}

BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl )
{
	return cntl->bszid;
}

BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl )
{
	return cntl->var_func;
}

BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl )
{
	return cntl->sub_prenode;
}

BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl )
{
	return cntl->sub_node;
}

BLIS_INLINE void* bli_cntl_params( cntl_t* cntl )
{
	return cntl->params;
}

BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl )
{
	// The first 64 bits (one uint64_t) of the params structure always hold
	// the size of that structure. The params pointer is dereferenced
	// unconditionally, so it must not be NULL here.
	return *( ( uint64_t* )(cntl->params) );
}

// Return the address of the mem_t embedded in the node (not a copy of it).
BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl )
{
	return &(cntl->pack_mem);
}

// cntl_t query (complex)

BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl )
{
	return ( bool )
	       ( cntl == NULL );
}

// A node is a leaf when it has no sub-node; the sub-prenode is not consulted.
BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl )
{
	return ( bool )
	       ( bli_cntl_sub_node( cntl ) == NULL );
}

BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl )
{
	return ( bool )
	       ( bli_cntl_bszid( cntl ) != BLIS_NO_PART );
}

// cntl_t modification
// NOTE: The setters take the new value first and the node second.

BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl )
{
	cntl->family = family;
}

BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl )
{
	cntl->bszid = bszid;
}

BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl )
{
	cntl->var_func = var_func;
}

BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl )
{
	cntl->sub_prenode = sub_prenode;
}

BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl )
{
	cntl->sub_node = sub_node;
}

BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl )
{
	cntl->params = params;
}

// Copies the mem_t pointed to by pack_mem into the node by value.
BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl )
{
	cntl->pack_mem = *pack_mem;
}

cython-blis-0.9.1/blis/_src/frame/base/bli_cntx.c000066400000000000000000001447071427272030600216250ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

void bli_cntx_clear( cntx_t* cntx )
{
	// Fill the entire cntx_t structure with zeros.
	memset( ( void* )cntx, 0, sizeof( cntx_t ) );
}

// -----------------------------------------------------------------------------

// Store blocksizes and blocksize multiples into a context.
//
// The variable argument list must contain exactly n_bs tuples followed by a
// cntx_t*. The layout of each tuple depends on method (see the example
// prototypes below). The arguments are read with va_arg(), so their number
// and types are not checked; a mismatch is undefined behavior.
void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... )
{
	// This function can be called from the bli_cntx_init_*() function for
	// a particular architecture if the kernel developer wishes to use
	// non-default blocksizes. It should be called after
	// bli_cntx_init_defaults() so that the context begins with default
	// blocksizes across all datatypes.

	/* Example prototypes:

	   void bli_cntx_set_blkszs
	   (
	     ind_t   method = BLIS_NAT,
	     dim_t   n_bs,
	     bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id,
	     bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id,
	     bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id,
	     ...
	     cntx_t* cntx
	   );

	   void bli_cntx_set_blkszs
	   (
	     ind_t   method != BLIS_NAT,
	     dim_t   n_bs,
	     bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, double def_scalr0, double max_scalr0,
	     bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, double def_scalr1, double max_scalr1,
	     bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, double def_scalr2, double max_scalr2,
	     ...
	     cntx_t* cntx
	   );

	   NOTE: The scalars are read below with va_arg( args, double ), so
	   callers must pass them as doubles (e.g. 1.0, not 1).
	*/

	va_list   args;
	dim_t     i;
	err_t     r_val;

	// Allocate some temporary local arrays.
	// NOTE(review): r_val is handed to each bli_malloc_intl() call but is
	// never inspected in this function, and the returned pointers are used
	// without a NULL check -- confirm that bli_malloc_intl() handles
	// allocation failure itself.
	// The two scalar arrays are allocated (and later freed) for both
	// methods, although only the induced-method path reads or writes them.

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_blkszs(): " );
	#endif
	bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_blkszs(): " );
	#endif
	blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_blkszs(): " );
	#endif
	bszid_t* bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_blkszs(): " );
	#endif
	double* dsclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_blkszs(): " );
	#endif
	double* msclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val );

	// -- Begin variable argument section --

	// Initialize variable argument environment.
	va_start( args, n_bs );

	// Handle native and induced method cases separately.
	if ( method == BLIS_NAT )
	{
		// Process n_bs tuples.
		for ( i = 0; i < n_bs; ++i )
		{
			// Here, we query the variable argument list for:
			// - the bszid_t of the blocksize we're about to process,
			// - the address of the blksz_t object,
			// - the bszid_t of the multiple we need to associate with
			//   the blksz_t object.
			bszid_t  bs_id = ( bszid_t  )va_arg( args, bszid_t  );
			blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* );
			bszid_t  bm_id = ( bszid_t  )va_arg( args, bszid_t  );

			// Store the values in our temporary arrays.
			bszids[ i ] = bs_id;
			blkszs[ i ] = blksz;
			bmults[ i ] = bm_id;
		}
	}
	else // if induced method execution was indicated
	{
		// Process n_bs tuples.
		for ( i = 0; i < n_bs; ++i )
		{
			// Here, we query the variable argument list for:
			// - the bszid_t of the blocksize we're about to process,
			// - the address of the blksz_t object,
			// - the bszid_t of the multiple we need to associate with
			//   the blksz_t object,
			// - the scalars we wish to apply to the real blocksizes to
			//   come up with the induced complex blocksizes (for default
			//   and maximum blocksizes).
			bszid_t  bs_id = ( bszid_t  )va_arg( args, bszid_t  );
			blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* );
			bszid_t  bm_id = ( bszid_t  )va_arg( args, bszid_t  );
			double   dsclr = ( double   )va_arg( args, double   );
			double   msclr = ( double   )va_arg( args, double   );

			// Store the values in our temporary arrays.
			bszids[ i ] = bs_id;
			blkszs[ i ] = blksz;
			bmults[ i ] = bm_id;
			dsclrs[ i ] = dsclr;
			msclrs[ i ] = msclr;
		}
	}

	// The last argument should be the context pointer.
	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );

	// Shutdown variable argument environment and clean up stack.
	va_end( args );

	// -- End variable argument section --

	// Save the execution type into the context.
	bli_cntx_set_method( method, cntx );

	// Query the context for the addresses of:
	// - the blocksize object array
	// - the blocksize multiple array
	blksz_t* cntx_blkszs = bli_cntx_blkszs_buf( cntx );
	bszid_t* cntx_bmults = bli_cntx_bmults_buf( cntx );

	// Now that we have the context address, we want to copy the values
	// from the temporary buffers into the corresponding buffers in the
	// context. Notice that the blksz_t* pointers were saved, rather than
	// the objects themselves, but we copy the contents of the objects
	// when copying into the context.

	// Handle native and induced method cases separately.
	if ( method == BLIS_NAT )
	{
		// Process each blocksize id tuple provided.
		for ( i = 0; i < n_bs; ++i )
		{
			// Read the current blocksize id, blksz_t* pointer, and
			// blocksize multiple id. (No scalars exist in the native case.)
			bszid_t  bs_id = bszids[ i ];
			bszid_t  bm_id = bmults[ i ];

			blksz_t* blksz = blkszs[ i ];

			blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];

			// Copy the blksz_t object contents into the appropriate
			// location within the context's blksz_t array. Do the same
			// for the blocksize multiple id.
			//cntx_blkszs[ bs_id ] = *blksz;
			//bli_blksz_copy( blksz, cntx_blksz );
			bli_blksz_copy_if_pos( blksz, cntx_blksz );

			// Copy the blocksize multiple id into the context.
			cntx_bmults[ bs_id ] = bm_id;
		}
	}
	else
	{
		// Process each blocksize id tuple provided.
		for ( i = 0; i < n_bs; ++i )
		{
			// Read the current blocksize id, blksz_t pointer, blocksize
			// multiple id, and blocksize scalar.
			bszid_t  bs_id = bszids[ i ];
			bszid_t  bm_id = bmults[ i ];
			double   dsclr = dsclrs[ i ];
			double   msclr = msclrs[ i ];

			blksz_t* blksz = blkszs[ i ];

			blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];

			// Copy the real domain values of the source blksz_t object into
			// the context, duplicating into the complex domain fields.
			bli_blksz_copy_dt( BLIS_FLOAT,  blksz, BLIS_FLOAT,    cntx_blksz );
			bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DOUBLE,   cntx_blksz );
			bli_blksz_copy_dt( BLIS_FLOAT,  blksz, BLIS_SCOMPLEX, cntx_blksz );
			bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DCOMPLEX, cntx_blksz );

			// If the default blocksize scalar is non-unit, we need to scale
			// the complex domain default blocksizes.
			if ( dsclr != 1.0 )
			{
				// Scale the complex domain default blocksize values in the
				// blocksize object.
				// NOTE: The scalar is truncated to dim_t before being
				// passed on, so any fractional part is discarded.
				bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz );
				bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz );
			}

			// Similarly, if the maximum blocksize scalar is non-unit, we need
			// to scale the complex domain maximum blocksizes.
			if ( msclr != 1.0 )
			{
				// Scale the complex domain maximum blocksize values in the
				// blocksize object.
				bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz );
				bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz );
			}

			// Copy the blocksize multiple id into the context.
			cntx_bmults[ bs_id ] = bm_id;
		}
	}

	// Free the temporary local arrays.

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_blkszs(): " );
	#endif
	bli_free_intl( blkszs );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_blkszs(): " );
	#endif
	bli_free_intl( bszids );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_blkszs(): " );
	#endif
	bli_free_intl( bmults );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_blkszs(): " );
	#endif
	bli_free_intl( dsclrs );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_blkszs(): " );
	#endif
	bli_free_intl( msclrs );
}

// -----------------------------------------------------------------------------

// Derive the blocksizes of datatype dt from those of its real projection
// within an existing context, optionally scaling them.
//
// The variable argument list must contain exactly n_bs
// ( bszid_t, double, double ) tuples followed by a cntx_t*. If method is
// BLIS_NAT the function returns immediately without reading any variable
// arguments or touching a context.
void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... )
{
	/* Example prototypes:

	   void bli_cntx_set_ind_blkszs
	   (
	     ind_t   method != BLIS_NAT,
	     num_t   dt,
	     dim_t   n_bs,
	     bszid_t bs0_id, double def_scalr0, double max_scalr0,
	     bszid_t bs1_id, double def_scalr1, double max_scalr1,
	     bszid_t bs2_id, double def_scalr2, double max_scalr2,
	     ...
	     cntx_t* cntx
	   );

	   NOTE: This function modifies an existing context that is presumed
	   to have been initialized for native execution.

	   NOTE: The scalars are read below with va_arg( args, double ), so
	   callers must pass them as doubles (e.g. 1.0, not 1).
	*/

	va_list   args;
	dim_t     i;
	err_t     r_val;

	// Project the given datatype to the real domain. This will be used later on.
	num_t dt_real = bli_dt_proj_to_real( dt );

	// Return early if called with BLIS_NAT.
	if ( method == BLIS_NAT ) return;

	// Allocate some temporary local arrays.
	// NOTE(review): as in bli_cntx_set_blkszs(), r_val is never inspected
	// here.

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_ind_blkszs(): " );
	#endif
	bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_ind_blkszs(): " );
	#endif
	double* dsclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_ind_blkszs(): " );
	#endif
	double* msclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val );

	// -- Begin variable argument section --

	// Initialize variable argument environment.
	va_start( args, n_bs );

	{
		// Process n_bs tuples.
		for ( i = 0; i < n_bs; ++i )
		{
			// Here, we query the variable argument list for:
			// - the bszid_t of the blocksize we're about to process,
			// - the scalars we wish to apply to the real blocksizes to
			//   come up with the induced complex blocksizes (for default
			//   and maximum blocksizes).
			bszid_t  bs_id = ( bszid_t  )va_arg( args, bszid_t  );
			double   dsclr = ( double   )va_arg( args, double   );
			double   msclr = ( double   )va_arg( args, double   );

			// Store the values in our temporary arrays.
			bszids[ i ] = bs_id;
			dsclrs[ i ] = dsclr;
			msclrs[ i ] = msclr;
		}
	}

	// The last argument should be the context pointer.
	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );

	// Shutdown variable argument environment and clean up stack.
	va_end( args );

	// -- End variable argument section --

	// Save the execution type into the context.
	bli_cntx_set_method( method, cntx );

	// Now that we have the context address, we want to copy the values
	// from the temporary buffers into the corresponding buffers in the
	// context.

	{
		// Process each blocksize id tuple provided.
		for ( i = 0; i < n_bs; ++i )
		{
			// Read the current blocksize id and the two blocksize scalars.
			bszid_t  bs_id = bszids[ i ];
			double   dsclr = dsclrs[ i ];
			double   msclr = msclrs[ i ];

			//blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ];

			// Query the context for the blksz_t object associated with the
			// current blocksize id.
			blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx );

			// Copy the real domain value of the blksz_t object into the
			// corresponding complex domain slot of the same object.
			bli_blksz_copy_dt( dt_real, cntx_blksz, dt, cntx_blksz );

			// If the default blocksize scalar is non-unit, we need to scale
			// the complex domain default blocksizes.
			if ( dsclr != 1.0 )
			{
				// Scale the default blocksize value corresponding to the given
				// datatype.
				bli_blksz_scale_def( 1, ( dim_t )dsclr, dt, cntx_blksz );
			}

			// Similarly, if the maximum blocksize scalar is non-unit, we need
			// to scale the complex domain maximum blocksizes.
			if ( msclr != 1.0 )
			{
				// Scale the maximum blocksize value corresponding to the given
				// datatype.
				bli_blksz_scale_max( 1, ( dim_t )msclr, dt, cntx_blksz );
			}
		}
	}

	// Free the temporary local arrays.

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_ind_blkszs(): " );
	#endif
	bli_free_intl( bszids );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_ind_blkszs(): " );
	#endif
	bli_free_intl( dsclrs );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_ind_blkszs(): " );
	#endif
	bli_free_intl( msclrs );
}

// -----------------------------------------------------------------------------

// Store level-3 native microkernel function pointers and their storage
// preferences into a context.
//
// The variable argument list must contain exactly n_ukrs
// ( l3ukr_t, num_t, void_fp, bool ) tuples followed by a cntx_t*.
void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... )
{
	// This function can be called from the bli_cntx_init_*() function for
	// a particular architecture if the kernel developer wishes to use
	// non-default level-3 microkernels. It should be called after
	// bli_cntx_init_defaults() so that the context begins with default
	// microkernels across all datatypes.

	/* Example prototypes:

	   void bli_cntx_set_l3_nat_ukrs
	   (
	     dim_t   n_ukrs,
	     l3ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp, bool pref0,
	     l3ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp, bool pref1,
	     l3ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp, bool pref2,
	     ...
	     cntx_t* cntx
	   );
	*/

	va_list   args;
	dim_t     i;
	err_t     r_val;

	// Allocate some temporary local arrays.
	// NOTE(review): r_val is never inspected in this function.

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_nat_ukrs(): " );
	#endif
	l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_nat_ukrs(): " );
	#endif
	num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_nat_ukrs(): " );
	#endif
	void_fp* ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void_fp ), &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_nat_ukrs(): " );
	#endif
	bool* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool ), &r_val );

	// -- Begin variable argument section --

	// Initialize variable argument environment.
	va_start( args, n_ukrs );

	// Process n_ukrs tuples.
	for ( i = 0; i < n_ukrs; ++i )
	{
		// Here, we query the variable argument list for:
		// - the l3ukr_t of the kernel we're about to process,
		// - the datatype of the kernel,
		// - the kernel function pointer, and
		// - the kernel function storage preference
		// that we need to store to the context.

		// NOTE: Though bool_t is no longer used, the following comment is
		// being kept for historical reasons.
		// The type that we pass into the va_arg() macro for the ukr
		// preference matters. Using 'bool_t' may cause breakage on 64-bit
		// systems that define int as 32 bits and long int and pointers as
		// 64 bits. The problem is that TRUE or FALSE are defined as 1 and
		// 0, respectively, and when "passed" into the variadic function
		// they come with no contextual typecast. Thus, default rules of
		// argument promotion kick in to treat these integer literals as
		// being of type int. Thus, we need to let va_arg() treat the TRUE
		// or FALSE value as an int, even if we cast it to and store it
		// within a bool_t afterwards.
		const l3ukr_t ukr_id   = ( l3ukr_t )va_arg( args, l3ukr_t );
		const num_t   ukr_dt   = ( num_t   )va_arg( args, num_t   );
		      void_fp ukr_fp   = ( void_fp )va_arg( args, void_fp );
		const bool    ukr_pref = ( bool    )va_arg( args, int     );

		// Store the values in our temporary arrays.
		ukr_ids[ i ]   = ukr_id;
		ukr_dts[ i ]   = ukr_dt;
		ukr_fps[ i ]   = ukr_fp;
		ukr_prefs[ i ] = ukr_pref;
	}

	// The last argument should be the context pointer.
	cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* );

	// Shutdown variable argument environment and clean up stack.
	va_end( args );

	// -- End variable argument section --

	// Query the context for the addresses of:
	// - the l3 virtual ukernel func_t array
	// - the l3 native ukernel func_t array
	// - the l3 native ukernel preferences array
	func_t*  cntx_l3_vir_ukrs       = bli_cntx_l3_vir_ukrs_buf( cntx );
	func_t*  cntx_l3_nat_ukrs       = bli_cntx_l3_nat_ukrs_buf( cntx );
	mbool_t* cntx_l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );

	// Now that we have the context address, we want to copy the values
	// from the temporary buffers into the corresponding buffers in the
	// context.

	// Process each ukernel tuple provided.
	for ( i = 0; i < n_ukrs; ++i )
	{
		// Read the current ukernel id, ukernel datatype, ukernel function
		// pointer, and ukernel preference.
		const l3ukr_t ukr_id   = ukr_ids[ i ];
		const num_t   ukr_dt   = ukr_dts[ i ];
		      void_fp ukr_fp   = ukr_fps[ i ];
		const bool    ukr_pref = ukr_prefs[ i ];

		// Index into the func_t and mbool_t for the current kernel id
		// being processed.
		func_t*       vukrs    = &cntx_l3_vir_ukrs[ ukr_id ];
		func_t*       ukrs     = &cntx_l3_nat_ukrs[ ukr_id ];
		mbool_t*      prefs    = &cntx_l3_nat_ukrs_prefs[ ukr_id ];

		// Store the ukernel function pointer and preference values into
		// the context. Notice that we redundantly store the native
		// ukernel address in both the native and virtual ukernel slots
		// in the context. This is standard practice when creating a
		// native context. (Induced method contexts will overwrite the
		// virtual function pointer with the address of the appropriate
		// virtual ukernel.)
		bli_func_set_dt( ukr_fp, ukr_dt, vukrs );
		bli_func_set_dt( ukr_fp, ukr_dt, ukrs );
		bli_mbool_set_dt( ukr_pref, ukr_dt, prefs );
	}

	// Free the temporary local arrays.

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_nat_ukrs(): " );
	#endif
	bli_free_intl( ukr_ids );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_nat_ukrs(): " );
	#endif
	bli_free_intl( ukr_dts );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_nat_ukrs(): " );
	#endif
	bli_free_intl( ukr_fps );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_cntx_set_l3_nat_ukrs(): " );
	#endif
	bli_free_intl( ukr_prefs );
}

// -----------------------------------------------------------------------------

void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... )
{
	// This function can be called from the bli_cntx_init_*() function for
	// a particular architecture if the kernel developer wishes to use
	// non-default level-3 virtual microkernels. It should be called after
	// bli_cntx_init_defaults() so that the context begins with default
	// microkernels across all datatypes.

	/* Example prototypes:

	   void bli_cntx_set_l3_vir_ukrs
	   (
	     dim_t   n_ukrs,
	     l3ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp,
	     l3ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp,
	     l3ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp,
	     ...
	     cntx_t* cntx
	   );
	*/

	va_list   args;
	dim_t     i;
	err_t     r_val;

	// Allocate some temporary local arrays.
#ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_vir_ukrs(): " ); #endif l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_vir_ukrs(): " ); #endif num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_vir_ukrs(): " ); #endif void_fp* ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void_fp ), &r_val ); // -- Begin variable argument section -- // Initialize variable argument environment. va_start( args, n_ukrs ); // Process n_ukrs tuples. for ( i = 0; i < n_ukrs; ++i ) { // Here, we query the variable argument list for: // - the l3ukr_t of the kernel we're about to process, // - the datatype of the kernel, and // - the kernel function pointer. // that we need to store to the context. const l3ukr_t ukr_id = ( l3ukr_t )va_arg( args, l3ukr_t ); const num_t ukr_dt = ( num_t )va_arg( args, num_t ); void_fp ukr_fp = ( void_fp )va_arg( args, void_fp ); // Store the values in our temporary arrays. ukr_ids[ i ] = ukr_id; ukr_dts[ i ] = ukr_dt; ukr_fps[ i ] = ukr_fp; } // The last argument should be the context pointer. cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); // Shutdown variable argument environment and clean up stack. va_end( args ); // -- End variable argument section -- // Query the context for the addresses of: // - the l3 virtual ukernel func_t array func_t* cntx_l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); // Now that we have the context address, we want to copy the values // from the temporary buffers into the corresponding buffers in the // context. // Process each blocksize id tuple provided. for ( i = 0; i < n_ukrs; ++i ) { // Read the current ukernel id, ukernel datatype, ukernel function // pointer, and ukernel preference. 
const l3ukr_t ukr_id = ukr_ids[ i ]; const num_t ukr_dt = ukr_dts[ i ]; void_fp ukr_fp = ukr_fps[ i ]; // Index into the func_t and mbool_t for the current kernel id // being processed. func_t* vukrs = &cntx_l3_vir_ukrs[ ukr_id ]; // Store the ukernel function pointer and preference values into // the context. Notice that we redundantly store the native // ukernel address in both the native and virtual ukernel slots // in the context. This is standard practice when creating a // native context. (Induced method contexts will overwrite the // virtual function pointer with the address of the appropriate // virtual ukernel.) bli_func_set_dt( ukr_fp, ukr_dt, vukrs ); } // Free the temporary local arrays. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_vir_ukrs(): " ); #endif bli_free_intl( ukr_ids ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_vir_ukrs(): " ); #endif bli_free_intl( ukr_dts ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_vir_ukrs(): " ); #endif bli_free_intl( ukr_fps ); } // ----------------------------------------------------------------------------- void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use // non-default thresholds for small/unpacked matrix handling. It should // be called after bli_cntx_init_defaults() so that the context begins // with default thresholds. /* Example prototypes: void bli_cntx_set_l3_sup_thresh ( dim_t n_thresh, threshid_t th0_id, blksz_t* blksz0, threshid_t th1_id, blksz_t* blksz1, ... cntx_t* cntx ); */ va_list args; dim_t i; err_t r_val; // Allocate some temporary local arrays. 
#ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_thresh(): " ); #endif threshid_t* threshids = bli_malloc_intl( n_thresh * sizeof( threshid_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_thresh(): " ); #endif blksz_t** threshs = bli_malloc_intl( n_thresh * sizeof( blksz_t* ), &r_val ); // -- Begin variable argument section -- // Initialize variable argument environment. va_start( args, n_thresh ); // Process n_thresh tuples. for ( i = 0; i < n_thresh; ++i ) { // Here, we query the variable argument list for: // - the threshid_t of the threshold we're about to process, // - the address of the blksz_t object, threshid_t th_id = ( threshid_t )va_arg( args, threshid_t ); blksz_t* thresh = ( blksz_t* )va_arg( args, blksz_t* ); // Store the values in our temporary arrays. threshids[ i ] = th_id; threshs[ i ] = thresh; } // The last argument should be the context pointer. cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); // Shutdown variable argument environment and clean up stack. va_end( args ); // -- End variable argument section -- // Query the context for the addresses of: // - the threshold array blksz_t* cntx_threshs = bli_cntx_l3_sup_thresh_buf( cntx ); // Now that we have the context address, we want to copy the values // from the temporary buffers into the corresponding buffers in the // context. Notice that the blksz_t* pointers were saved, rather than // the objects themselves, but we copy the contents of the objects // when copying into the context. // Process each blocksize id tuple provided. for ( i = 0; i < n_thresh; ++i ) { // Read the current blocksize id, blksz_t* pointer, blocksize // multiple id, and blocksize scalar. threshid_t th_id = threshids[ i ]; blksz_t* thresh = threshs[ i ]; blksz_t* cntx_thresh = &cntx_threshs[ th_id ]; // Copy the blksz_t object contents into the appropriate // location within the context's blksz_t array. 
//cntx_threshs[ th_id ] = *thresh; //bli_blksz_copy( thresh, cntx_thresh ); bli_blksz_copy_if_pos( thresh, cntx_thresh ); } // Free the temporary local arrays. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_thresh(): " ); #endif bli_free_intl( threshs ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_thresh(): " ); #endif bli_free_intl( threshids ); } // ----------------------------------------------------------------------------- void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use // non-default level-3 operation handler for small/unpacked matrices. It // should be called after bli_cntx_init_defaults() so that the context // begins with default sup handlers across all datatypes. /* Example prototypes: void bli_cntx_set_l3_sup_handlers ( dim_t n_ops, opid_t op0_id, void* handler0_fp, opid_t op1_id, void* handler1_fp, opid_t op2_id, void* handler2_fp, ... cntx_t* cntx ); */ va_list args; dim_t i; err_t r_val; // Allocate some temporary local arrays. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_handlers(): " ); #endif opid_t* op_ids = bli_malloc_intl( n_ops * sizeof( opid_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_handlers(): " ); #endif void** op_fps = bli_malloc_intl( n_ops * sizeof( void* ), &r_val ); // -- Begin variable argument section -- // Initialize variable argument environment. va_start( args, n_ops ); // Process n_ukrs tuples. for ( i = 0; i < n_ops; ++i ) { // Here, we query the variable argument list for: // - the opid_t of the operation we're about to process, // - the sup handler function pointer // that we need to store to the context. const opid_t op_id = ( opid_t )va_arg( args, opid_t ); void* op_fp = ( void* )va_arg( args, void* ); // Store the values in our temporary arrays. 
op_ids[ i ] = op_id; op_fps[ i ] = op_fp; } // The last argument should be the context pointer. cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); // Shutdown variable argument environment and clean up stack. va_end( args ); // -- End variable argument section -- // Query the context for the addresses of: // - the l3 small/unpacked handlers array void** cntx_l3_sup_handlers = bli_cntx_l3_sup_handlers_buf( cntx ); // Now that we have the context address, we want to copy the values // from the temporary buffers into the corresponding buffers in the // context. // Process each operation id tuple provided. for ( i = 0; i < n_ops; ++i ) { // Read the current operation id and handler function pointer. const opid_t op_id = op_ids[ i ]; void* op_fp = op_fps[ i ]; // Store the sup handler function pointer into the slot for the // specified operation id. cntx_l3_sup_handlers[ op_id ] = op_fp; } // Free the temporary local arrays. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_handlers(): " ); #endif bli_free_intl( op_ids ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_handlers(): " ); #endif bli_free_intl( op_fps ); } // ----------------------------------------------------------------------------- void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use // non-default l3 sup blocksizes. It should be called after // bli_cntx_init_defaults() so that the context begins with default // blocksizes across all datatypes. /* Example prototypes: void bli_cntx_set_blkszs ( dim_t n_bs, bszid_t bs0_id, blksz_t* blksz0, bszid_t bs1_id, blksz_t* blksz1, bszid_t bs2_id, blksz_t* blksz2, ... cntx_t* cntx ); */ va_list args; dim_t i; err_t r_val; // Allocate some temporary local arrays. 
#ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_blkszs(): " ); #endif bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_blkszs(): " ); #endif blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ), &r_val ); // -- Begin variable argument section -- // Initialize variable argument environment. va_start( args, n_bs ); // Process n_bs tuples. for ( i = 0; i < n_bs; ++i ) { // Here, we query the variable argument list for: // - the bszid_t of the blocksize we're about to process, // - the address of the blksz_t object. bszid_t bs_id = ( bszid_t )va_arg( args, bszid_t ); blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); // Store the values in our temporary arrays. bszids[ i ] = bs_id; blkszs[ i ] = blksz; } // The last argument should be the context pointer. cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); // Shutdown variable argument environment and clean up stack. va_end( args ); // -- End variable argument section -- // Query the context for the addresses of: // - the blocksize object array blksz_t* cntx_l3_sup_blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); // Now that we have the context address, we want to copy the values // from the temporary buffers into the corresponding buffers in the // context. Notice that the blksz_t* pointers were saved, rather than // the objects themselves, but we copy the contents of the objects // when copying into the context. // Process each blocksize id tuple provided. for ( i = 0; i < n_bs; ++i ) { // Read the current blocksize id, blksz_t* pointer, blocksize // multiple id, and blocksize scalar. bszid_t bs_id = bszids[ i ]; blksz_t* blksz = blkszs[ i ]; blksz_t* cntx_l3_sup_blksz = &cntx_l3_sup_blkszs[ bs_id ]; // Copy the blksz_t object contents into the appropriate // location within the context's blksz_t array. 
//cntx_l3_sup_blkszs[ bs_id ] = *blksz; //bli_blksz_copy( blksz, cntx_l3_sup_blksz ); bli_blksz_copy_if_pos( blksz, cntx_l3_sup_blksz ); } // Free the temporary local arrays. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_blkszs(): " ); #endif bli_free_intl( blkszs ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_blkszs(): " ); #endif bli_free_intl( bszids ); } // ----------------------------------------------------------------------------- void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use // non-default level-3 microkernels for small/unpacked matrices. It // should be called after bli_cntx_init_defaults() so that the context // begins with default sup micro/millikernels across all datatypes. /* Example prototypes: void bli_cntx_set_l3_sup_kers ( dim_t n_ukrs, stor3_t stor_id0, num_t dt0, void* ukr0_fp, bool pref0, stor3_t stor_id1, num_t dt1, void* ukr1_fp, bool pref1, stor3_t stor_id2, num_t dt2, void* ukr2_fp, bool pref2, ... cntx_t* cntx ); */ va_list args; dim_t i; err_t r_val; // Allocate some temporary local arrays. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_kers(): " ); #endif stor3_t* st3_ids = bli_malloc_intl( n_ukrs * sizeof( stor3_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_kers(): " ); #endif num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_kers(): " ); #endif void** ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void* ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_kers(): " ); #endif bool* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool ), &r_val ); // -- Begin variable argument section -- // Initialize variable argument environment. va_start( args, n_ukrs ); // Process n_ukrs tuples. 
for ( i = 0; i < n_ukrs; ++i ) { // Here, we query the variable argument list for: // - the stor3_t storage case being assigned to the kernel we're // about to process, // - the datatype of the kernel, // - the kernel function pointer, and // - the kernel function storage preference // that we need to store to the context. const stor3_t st3_id = ( stor3_t )va_arg( args, stor3_t ); const num_t ukr_dt = ( num_t )va_arg( args, num_t ); void* ukr_fp = ( void* )va_arg( args, void* ); const bool ukr_pref = ( bool )va_arg( args, int ); // Store the values in our temporary arrays. st3_ids[ i ] = st3_id; ukr_dts[ i ] = ukr_dt; ukr_fps[ i ] = ukr_fp; ukr_prefs[ i ] = ukr_pref; } // The last argument should be the context pointer. cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); // Shutdown variable argument environment and clean up stack. va_end( args ); // -- End variable argument section -- // Query the context for the addresses of: // - the l3 small/unpacked ukernel func_t array // - the l3 small/unpacked ukernel preferences array func_t* cntx_l3_sup_kers = bli_cntx_l3_sup_kers_buf( cntx ); mbool_t* cntx_l3_sup_kers_prefs = bli_cntx_l3_sup_kers_prefs_buf( cntx ); // Now that we have the context address, we want to copy the values // from the temporary buffers into the corresponding buffers in the // context. #if 0 dim_t sup_map[ BLIS_NUM_LEVEL3_SUP_UKRS ][2]; // Create the small/unpacked ukernel mappings: // - rv -> rrr 0, rcr 2 // - rg -> rrc 1, rcc 3 // - cv -> ccr 6, ccc 7 // - cg -> crr 4, crc 5 // - rd -> rrc 1 // - cd -> crc 5 // - rc -> rcc 3 // - cr -> crr 4 // - gx -> xxx 8 // NOTE: We only need to set one slot in the context l3_sup_kers array // for the general-stride/generic ukernel type, but since the loop below // needs to be set up to set two slots to accommodate the RV, RG, CV, and // CG, ukernel types, we will just be okay with the GX ukernel being set // redundantly. (The RD, CD, CR, and RC ukernel types are set redundantly // for the same reason.) 
sup_map[ BLIS_GEMMSUP_RV_UKR ][0] = BLIS_RRR; sup_map[ BLIS_GEMMSUP_RV_UKR ][1] = BLIS_RCR; sup_map[ BLIS_GEMMSUP_RG_UKR ][0] = BLIS_RRC; sup_map[ BLIS_GEMMSUP_RG_UKR ][1] = BLIS_RCC; sup_map[ BLIS_GEMMSUP_CV_UKR ][0] = BLIS_CCR; sup_map[ BLIS_GEMMSUP_CV_UKR ][1] = BLIS_CCC; sup_map[ BLIS_GEMMSUP_CG_UKR ][0] = BLIS_CRR; sup_map[ BLIS_GEMMSUP_CG_UKR ][1] = BLIS_CRC; sup_map[ BLIS_GEMMSUP_RD_UKR ][0] = BLIS_RRC; sup_map[ BLIS_GEMMSUP_RD_UKR ][1] = BLIS_RRC; sup_map[ BLIS_GEMMSUP_CD_UKR ][0] = BLIS_CRC; sup_map[ BLIS_GEMMSUP_CD_UKR ][1] = BLIS_CRC; sup_map[ BLIS_GEMMSUP_RC_UKR ][0] = BLIS_RCC; sup_map[ BLIS_GEMMSUP_RC_UKR ][1] = BLIS_RCC; sup_map[ BLIS_GEMMSUP_CR_UKR ][0] = BLIS_CRR; sup_map[ BLIS_GEMMSUP_CR_UKR ][1] = BLIS_CRR; sup_map[ BLIS_GEMMSUP_GX_UKR ][0] = BLIS_XXX; sup_map[ BLIS_GEMMSUP_GX_UKR ][1] = BLIS_XXX; #endif // Process each blocksize id tuple provided. for ( i = 0; i < n_ukrs; ++i ) { // Read the current stor3_t id, ukernel datatype, ukernel function // pointer, and ukernel preference. const stor3_t st3_id = st3_ids[ i ]; const num_t ukr_dt = ukr_dts[ i ]; void* ukr_fp = ukr_fps[ i ]; const bool ukr_pref = ukr_prefs[ i ]; // Index to the func_t and mbool_t for the current stor3_t id // being processed. func_t* ukrs = &cntx_l3_sup_kers[ st3_id ]; mbool_t* prefs = &cntx_l3_sup_kers_prefs[ st3_id ]; // Store the ukernel function pointer and preference values into // the stor3_t location in the context. bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); bli_mbool_set_dt( ukr_pref, ukr_dt, prefs ); } // Free the temporary local arrays. 
#ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_kers(): " ); #endif bli_free_intl( st3_ids ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_kers(): " ); #endif bli_free_intl( ukr_dts ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_kers(): " ); #endif bli_free_intl( ukr_fps ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l3_sup_kers(): " ); #endif bli_free_intl( ukr_prefs ); } // ----------------------------------------------------------------------------- void bli_cntx_set_l1f_kers( dim_t n_kers, ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use // non-default level-1f kernels. It should be called after // bli_cntx_init_defaults() so that the context begins with default l1f // kernels across all datatypes. /* Example prototypes: void bli_cntx_set_l1f_kers ( dim_t n_ukrs, l1fkr_t ker0_id, num_t ker0_dt, void_fp ker0_fp, l1fkr_t ker1_id, num_t ker1_dt, void_fp ker1_fp, l1fkr_t ker2_id, num_t ker2_dt, void_fp ker2_fp, ... cntx_t* cntx ); */ va_list args; dim_t i; err_t r_val; // Allocate some temporary local arrays. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1f_kers(): " ); #endif l1fkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1fkr_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1f_kers(): " ); #endif num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1f_kers(): " ); #endif void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val ); // -- Begin variable argument section -- // Initialize variable argument environment. va_start( args, n_kers ); // Process n_kers tuples. 
for ( i = 0; i < n_kers; ++i ) { // Here, we query the variable argument list for: // - the l1fkr_t of the kernel we're about to process, // - the datatype of the kernel, and // - the kernel function pointer // that we need to store to the context. const l1fkr_t ker_id = ( l1fkr_t )va_arg( args, l1fkr_t ); const num_t ker_dt = ( num_t )va_arg( args, num_t ); void_fp ker_fp = ( void_fp )va_arg( args, void_fp ); // Store the values in our temporary arrays. ker_ids[ i ] = ker_id; ker_dts[ i ] = ker_dt; ker_fps[ i ] = ker_fp; } // The last argument should be the context pointer. cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); // Shutdown variable argument environment and clean up stack. va_end( args ); // -- End variable argument section -- // Query the context for the address of: // - the level-1f kernels func_t array func_t* cntx_l1f_kers = bli_cntx_l1f_kers_buf( cntx ); // Now that we have the context address, we want to copy the values // from the temporary buffers into the corresponding buffers in the // context. // Process each blocksize id tuple provided. for ( i = 0; i < n_kers; ++i ) { // Read the current kernel id, kernel datatype, and kernel function // pointer. const l1fkr_t ker_id = ker_ids[ i ]; const num_t ker_dt = ker_dts[ i ]; void_fp ker_fp = ker_fps[ i ]; // Index into the func_t and mbool_t for the current kernel id // being processed. func_t* kers = &cntx_l1f_kers[ ker_id ]; // Store the ukernel function pointer and preference values into // the context. bli_func_set_dt( ker_fp, ker_dt, kers ); } // Free the temporary local arrays. 
#ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1f_kers(): " ); #endif bli_free_intl( ker_ids ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1f_kers(): " ); #endif bli_free_intl( ker_dts ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1f_kers(): " ); #endif bli_free_intl( ker_fps ); } // ----------------------------------------------------------------------------- void bli_cntx_set_l1v_kers( dim_t n_kers, ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use // non-default level-1v kernels. It should be called after // bli_cntx_init_defaults() so that the context begins with default l1v // kernels across all datatypes. /* Example prototypes: void bli_cntx_set_l1v_kers ( dim_t n_ukrs, l1vkr_t ker0_id, num_t ker0_dt, void_fp ker0_fp, l1vkr_t ker1_id, num_t ker1_dt, void_fp ker1_fp, l1vkr_t ker2_id, num_t ker2_dt, void_fp ker2_fp, ... cntx_t* cntx ); */ va_list args; dim_t i; err_t r_val; // Allocate some temporary local arrays. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1v_kers(): " ); #endif l1vkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1vkr_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1v_kers(): " ); #endif num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1v_kers(): " ); #endif void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val ); // -- Begin variable argument section -- // Initialize variable argument environment. va_start( args, n_kers ); // Process n_kers tuples. for ( i = 0; i < n_kers; ++i ) { // Here, we query the variable argument list for: // - the l1vkr_t of the kernel we're about to process, // - the datatype of the kernel, and // - the kernel function pointer // that we need to store to the context. 
const l1vkr_t ker_id = ( l1vkr_t )va_arg( args, l1vkr_t ); const num_t ker_dt = ( num_t )va_arg( args, num_t ); void_fp ker_fp = ( void_fp )va_arg( args, void_fp ); // Store the values in our temporary arrays. ker_ids[ i ] = ker_id; ker_dts[ i ] = ker_dt; ker_fps[ i ] = ker_fp; } // The last argument should be the context pointer. cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); // Shutdown variable argument environment and clean up stack. va_end( args ); // -- End variable argument section -- // Query the context for the address of: // - the level-1v kernels func_t array func_t* cntx_l1v_kers = bli_cntx_l1v_kers_buf( cntx ); // Now that we have the context address, we want to copy the values // from the temporary buffers into the corresponding buffers in the // context. // Process each blocksize id tuple provided. for ( i = 0; i < n_kers; ++i ) { // Read the current kernel id, kernel datatype, and kernel function // pointer. const l1vkr_t ker_id = ker_ids[ i ]; const num_t ker_dt = ker_dts[ i ]; void_fp ker_fp = ker_fps[ i ]; // Index into the func_t and mbool_t for the current kernel id // being processed. func_t* kers = &cntx_l1v_kers[ ker_id ]; // Store the ukernel function pointer and preference values into // the context. bli_func_set_dt( ker_fp, ker_dt, kers ); } // Free the temporary local arrays. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1v_kers(): " ); #endif bli_free_intl( ker_ids ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1v_kers(): " ); #endif bli_free_intl( ker_dts ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_l1v_kers(): " ); #endif bli_free_intl( ker_fps ); } // ----------------------------------------------------------------------------- void bli_cntx_set_packm_kers( dim_t n_kers, ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use // non-default packing kernels. 
It should be called after // bli_cntx_init_defaults() so that the context begins with default packm // kernels across all datatypes. /* Example prototypes: void bli_cntx_set_packm_kers ( dim_t n_ukrs, l1mkr_t ker0_id, num_t ker0_dt, void_fp ker0_fp, l1mkr_t ker1_id, num_t ker1_dt, void_fp ker1_fp, l1mkr_t ker2_id, num_t ker2_dt, void_fp ker2_fp, ... cntx_t* cntx ); */ va_list args; dim_t i; err_t r_val; // Allocate some temporary local arrays. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_packm_kers(): " ); #endif l1mkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1mkr_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_packm_kers(): " ); #endif num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_cntx_set_packm_kers(): " ); #endif void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val ); // -- Begin variable argument section -- // Initialize variable argument environment. va_start( args, n_kers ); // Process n_kers tuples. for ( i = 0; i < n_kers; ++i ) { // Here, we query the variable argument list for: // - the l1mkr_t of the kernel we're about to process, // - the datatype of the kernel, and // - the kernel function pointer // that we need to store to the context. const l1mkr_t ker_id = ( l1mkr_t )va_arg( args, l1mkr_t ); const num_t ker_dt = ( num_t )va_arg( args, num_t ); void_fp ker_fp = ( void_fp )va_arg( args, void_fp ); // Store the values in our temporary arrays. ker_ids[ i ] = ker_id; ker_dts[ i ] = ker_dt; ker_fps[ i ] = ker_fp; } // The last argument should be the context pointer. cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); // Shutdown variable argument environment and clean up stack. 
void bli_cntx_print( cntx_t* cntx )
{
	// Print a human-readable dump of a context to stdout. In order, it
	// prints: the default blocksize and blocksize multiple for every
	// blocksize id, the level-3 virtual microkernel addresses, the level-3
	// sup kernel addresses, the level-1f and level-1v kernel addresses, and
	// finally the context's induced-method id. Every table row has one
	// column per datatype, in the order BLIS_FLOAT, BLIS_DOUBLE,
	// BLIS_SCOMPLEX, BLIS_DCOMPLEX (the "s d c z" of the header line).
	//
	// NOTE(review): the %p conversions below receive the values returned by
	// bli_func_get_dt() without a cast; %p formally expects a void*, so
	// confirm that function's return type.

	dim_t i;

	// Print the values stored in the blksz_t objects.
	printf( " s d c z\n" );

	// One row per blocksize id: "default blocksize / blocksize multiple"
	// for each of the four datatypes.
	for ( i = 0; i < BLIS_NUM_BLKSZS; ++i )
	{
		printf( "blksz/mult %2lu: %13lu/%2lu %13lu/%2lu %13lu/%2lu %13lu/%2lu\n",
		        ( unsigned long )i,
		        ( unsigned long )bli_cntx_get_blksz_def_dt( BLIS_FLOAT, i, cntx ),
		        ( unsigned long )bli_cntx_get_bmult_dt ( BLIS_FLOAT, i, cntx ),
		        ( unsigned long )bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, i, cntx ),
		        ( unsigned long )bli_cntx_get_bmult_dt ( BLIS_DOUBLE, i, cntx ),
		        ( unsigned long )bli_cntx_get_blksz_def_dt( BLIS_SCOMPLEX, i, cntx ),
		        ( unsigned long )bli_cntx_get_bmult_dt ( BLIS_SCOMPLEX, i, cntx ),
		        ( unsigned long )bli_cntx_get_blksz_def_dt( BLIS_DCOMPLEX, i, cntx ),
		        ( unsigned long )bli_cntx_get_bmult_dt ( BLIS_DCOMPLEX, i, cntx )
		      );
	}

	// One row per level-3 microkernel id: the virtual microkernel address
	// registered for each datatype.
	for ( i = 0; i < BLIS_NUM_LEVEL3_UKRS; ++i )
	{
		func_t* ukr = bli_cntx_get_l3_vir_ukrs( i, cntx );

		printf( "l3 vir ukr %2lu: %16p %16p %16p %16p\n",
		        ( unsigned long )i,
		        bli_func_get_dt( BLIS_FLOAT, ukr ),
		        bli_func_get_dt( BLIS_DOUBLE, ukr ),
		        bli_func_get_dt( BLIS_SCOMPLEX, ukr ),
		        bli_func_get_dt( BLIS_DCOMPLEX, ukr )
		      );
	}

	// One row per sup kernel slot (BLIS_NUM_3OP_RC_COMBOS of them): the sup
	// kernel address registered for each datatype.
	for ( i = 0; i < BLIS_NUM_3OP_RC_COMBOS; ++i )
	{
		func_t* ukr = bli_cntx_get_l3_sup_kers( i, cntx );

		printf( "l3 sup ukr %2lu: %16p %16p %16p %16p\n",
		        ( unsigned long )i,
		        bli_func_get_dt( BLIS_FLOAT, ukr ),
		        bli_func_get_dt( BLIS_DOUBLE, ukr ),
		        bli_func_get_dt( BLIS_SCOMPLEX, ukr ),
		        bli_func_get_dt( BLIS_DCOMPLEX, ukr )
		      );
	}

	// One row per level-1f kernel id.
	for ( i = 0; i < BLIS_NUM_LEVEL1F_KERS; ++i )
	{
		func_t* ker = bli_cntx_get_l1f_kers( i, cntx );

		printf( "l1f ker %2lu: %16p %16p %16p %16p\n",
		        ( unsigned long )i,
		        bli_func_get_dt( BLIS_FLOAT, ker ),
		        bli_func_get_dt( BLIS_DOUBLE, ker ),
		        bli_func_get_dt( BLIS_SCOMPLEX, ker ),
		        bli_func_get_dt( BLIS_DCOMPLEX, ker )
		      );
	}

	// One row per level-1v kernel id.
	for ( i = 0; i < BLIS_NUM_LEVEL1V_KERS; ++i )
	{
		func_t* ker = bli_cntx_get_l1v_kers( i, cntx );

		printf( "l1v ker %2lu: %16p %16p %16p %16p\n",
		        ( unsigned long )i,
		        bli_func_get_dt( BLIS_FLOAT, ker ),
		        bli_func_get_dt( BLIS_DOUBLE, ker ),
		        bli_func_get_dt( BLIS_SCOMPLEX, ker ),
		        bli_func_get_dt( BLIS_DCOMPLEX, ker )
		      );
	}

	// Finally, print the induced-method id stored in the context, as a
	// plain integer.
	{
		ind_t method = bli_cntx_method( cntx );

		printf( "ind method : %lu\n", ( unsigned long )method );
	}
}
method = bli_cntx_method( cntx ); printf( "ind method : %lu\n", ( unsigned long )method ); } } cython-blis-0.9.1/blis/_src/frame/base/bli_cntx.h000066400000000000000000000522751427272030600216300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_CNTX_H #define BLIS_CNTX_H // Context object type (defined in bli_type_defs.h) /* typedef struct cntx_s { blksz_t* blkszs; bszid_t* bmults; func_t* l3_vir_ukrs; func_t* l3_nat_ukrs; mbool_t* l3_nat_ukrs_prefs; blksz_t* l3_sup_thresh; void** l3_sup_handlers; blksz_t* l3_sup_blkszs; func_t* l3_sup_kers; mbool_t* l3_sup_kers_prefs; func_t* l1f_kers; func_t* l1v_kers; func_t* packm_kers; func_t* unpackm_kers; ind_t method; } cntx_t; */ // ----------------------------------------------------------------------------- // // -- cntx_t query (fields only) ----------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx ) { return cntx->blkszs; } BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) { return cntx->bmults; } BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) { return cntx->l3_vir_ukrs; } BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs; } BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs_prefs; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) { return cntx->l3_sup_thresh; } BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) { return cntx->l3_sup_blkszs; } BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) { return cntx->l3_sup_kers; } BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) { return cntx->l3_sup_kers_prefs; } BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) { return cntx->l1f_kers; } BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) { return cntx->l1v_kers; } BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) { return cntx->packm_kers; } BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) { return cntx->unpackm_kers; } BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) { return cntx->method; } // 
----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* 
bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* 
cntx ) { func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested packm func_t if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* funcs = bli_cntx_packm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the packm func_t (and then extract the // datatype-specific function pointer) if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested unpackm func_t if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the unpackm func_t (and then extract the // datatype-specific function pointer) if the unpackm kernel being // requested is one that is explicitly supported. 
if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. 
// NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. // NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. 
return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } #if 0 // NOTE: These static functions aren't needed yet. BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { const num_t dt = bli_obj_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); } #endif // ----------------------------------------------------------------------------- // // -- cntx_t modification (complex) -------------------------------------------- // // NOTE: The framework does not use any of the following functions. We provide // them in order to facilitate creating/modifying custom contexts. 
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif cython-blis-0.9.1/blis/_src/frame/base/bli_const.c000066400000000000000000000064101427272030600217630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Statically initialize structs containing representations of various // constants for each datatype supported in BLIS. static constdata_t bli_two_buffer = bli_obj_init_constdata( 2.0 ); static constdata_t bli_one_buffer = bli_obj_init_constdata( 1.0 ); static constdata_t bli_zero_buffer = bli_obj_init_constdata( 0.0 ); static constdata_t bli_mone_buffer = bli_obj_init_constdata( -1.0 ); static constdata_t bli_mtwo_buffer = bli_obj_init_constdata( -2.0 ); // Statically initialize global scalar constants, attaching the addresses // of the corresponding structs above. 
obj_t BLIS_TWO = bli_obj_init_const( &bli_two_buffer ); obj_t BLIS_ONE = bli_obj_init_const( &bli_one_buffer ); obj_t BLIS_ZERO = bli_obj_init_const( &bli_zero_buffer ); obj_t BLIS_MINUS_ONE = bli_obj_init_const( &bli_mone_buffer ); obj_t BLIS_MINUS_TWO = bli_obj_init_const( &bli_mtwo_buffer ); #if 0 obj_t BLIS_TWO = {}; obj_t BLIS_ONE = {}; obj_t BLIS_ZERO = {}; obj_t BLIS_MINUS_ONE = {}; obj_t BLIS_MINUS_TWO = {}; void bli_const_init( void ) { bli_obj_create_const( 2.0, &BLIS_TWO ); bli_obj_create_const( 1.0, &BLIS_ONE ); bli_obj_create_const( 0.5, &BLIS_ONE_HALF ); bli_obj_create_const( 0.0, &BLIS_ZERO ); bli_obj_create_const( -0.5, &BLIS_MINUS_ONE_HALF ); bli_obj_create_const( -1.0, &BLIS_MINUS_ONE ); bli_obj_create_const( -2.0, &BLIS_MINUS_TWO ); } void bli_const_finalize( void ) { bli_obj_free( &BLIS_TWO ); bli_obj_free( &BLIS_ONE ); bli_obj_free( &BLIS_ONE_HALF ); bli_obj_free( &BLIS_ZERO ); bli_obj_free( &BLIS_MINUS_ONE_HALF ); bli_obj_free( &BLIS_MINUS_ONE ); bli_obj_free( &BLIS_MINUS_TWO ); } #endif cython-blis-0.9.1/blis/_src/frame/base/bli_const.h000066400000000000000000000033101427272030600217640ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_const_init( void ); void bli_const_finalize( void ); cython-blis-0.9.1/blis/_src/frame/base/bli_cpuid.c000066400000000000000000001164171427272030600217520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018-2020, Advanced Micro Devices, Inc. Copyright (C) 2019, Dave Love, University of Manchester Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #if 0 // Used only during standalone testing of ARM support. #include "bli_system.h" #include "bli_type_defs.h" #include "bli_cpuid.h" #undef __x86_64__ #undef _M_X64 #undef __i386 #undef _M_IX86 #define __arm__ #endif #ifdef BLIS_CONFIGURETIME_CPUID // NOTE: If you need to make any changes to this cpp branch, it's probably // the case that you also need to modify bli_arch.c, bli_cpuid.c, and // bli_env.c. Don't forget to update these other files as needed! // The BLIS_ENABLE_SYSTEM macro must be defined so that the correct cpp // branch in bli_system.h is processed. (This macro is normally defined in // bli_config.h.) #define BLIS_ENABLE_SYSTEM // Use C-style static inline functions for any static inline functions that // happen to be defined by the headers below. (This macro is normally defined // in bli_config_macro_defs.h.) #define BLIS_INLINE static // Since we're not building a shared library, we can forgo the use of the // BLIS_EXPORT_BLIS annotations by #defining them to be nothing. (This macro // is normally defined in bli_config_macro_defs.h.) 
#define BLIS_EXPORT_BLIS #include "bli_system.h" #include "bli_type_defs.h" #include "bli_arch.h" #include "bli_cpuid.h" //#include "bli_env.h" #else #include "blis.h" #endif // ----------------------------------------------------------------------------- #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) #include "cpuid.h" arch_t bli_cpuid_query_id( void ) { uint32_t vendor, family, model, features; // Call the CPUID instruction and parse its results into a family id, // model id, and a feature bit field. The return value encodes the // vendor. vendor = bli_cpuid_query( &family, &model, &features ); #if 0 printf( "vendor = %s\n", vendor==1 ? "AMD": "INTEL" ); printf("family = %x\n", family ); printf( "model = %x\n", model ); printf( "features = %x\n", features ); #endif if ( vendor == VENDOR_INTEL ) { // Check for each Intel configuration that is enabled, check for that // microarchitecture. We check from most recent to most dated. #ifdef BLIS_CONFIG_SKX if ( bli_cpuid_is_skx( family, model, features ) ) return BLIS_ARCH_SKX; #endif #ifdef BLIS_CONFIG_KNL if ( bli_cpuid_is_knl( family, model, features ) ) return BLIS_ARCH_KNL; #endif #ifdef BLIS_CONFIG_HASWELL if ( bli_cpuid_is_haswell( family, model, features ) ) return BLIS_ARCH_HASWELL; #endif #ifdef BLIS_CONFIG_SANDYBRIDGE if ( bli_cpuid_is_sandybridge( family, model, features ) ) return BLIS_ARCH_SANDYBRIDGE; #endif #ifdef BLIS_CONFIG_PENRYN if ( bli_cpuid_is_penryn( family, model, features ) ) return BLIS_ARCH_PENRYN; #endif // If none of the other sub-configurations were detected, return // the 'generic' arch_t id value. return BLIS_ARCH_GENERIC; } else if ( vendor == VENDOR_AMD ) { // Check for each AMD configuration that is enabled, check for that // microarchitecture. We check from most recent to most dated. 
#ifdef BLIS_CONFIG_ZEN3 if ( bli_cpuid_is_zen3( family, model, features ) ) return BLIS_ARCH_ZEN3; #endif #ifdef BLIS_CONFIG_ZEN2 if ( bli_cpuid_is_zen2( family, model, features ) ) return BLIS_ARCH_ZEN2; #endif #ifdef BLIS_CONFIG_ZEN if ( bli_cpuid_is_zen( family, model, features ) ) return BLIS_ARCH_ZEN; #endif #ifdef BLIS_CONFIG_EXCAVATOR if ( bli_cpuid_is_excavator( family, model, features ) ) return BLIS_ARCH_EXCAVATOR; #endif #ifdef BLIS_CONFIG_STEAMROLLER if ( bli_cpuid_is_steamroller( family, model, features ) ) return BLIS_ARCH_STEAMROLLER; #endif #ifdef BLIS_CONFIG_PILEDRIVER if ( bli_cpuid_is_piledriver( family, model, features ) ) return BLIS_ARCH_PILEDRIVER; #endif #ifdef BLIS_CONFIG_BULLDOZER if ( bli_cpuid_is_bulldozer( family, model, features ) ) return BLIS_ARCH_BULLDOZER; #endif // If none of the other sub-configurations were detected, return // the 'generic' arch_t id value. return BLIS_ARCH_GENERIC; } else if ( vendor == VENDOR_UNKNOWN ) { return BLIS_ARCH_GENERIC; } return BLIS_ARCH_GENERIC; } // ----------------------------------------------------------------------------- bool bli_cpuid_is_skx ( uint32_t family, uint32_t model, uint32_t features ) { // Check for expected CPU features. const uint32_t expected = FEATURE_AVX | FEATURE_FMA3 | FEATURE_AVX2 | FEATURE_AVX512F | FEATURE_AVX512DQ | FEATURE_AVX512BW | FEATURE_AVX512VL ; int nvpu = vpu_count(); if ( bli_cpuid_has_features( features, expected ) ) { switch ( nvpu ) { case 1: bli_arch_log( "Hardware has 1 FMA unit; using 'haswell' (not 'skx') sub-config.\n" ); return FALSE; case 2: bli_arch_log( "Hardware has 2 FMA units; using 'skx' sub-config.\n" ); return TRUE; default: bli_arch_log( "Number of FMA units unknown; using 'haswell' (not 'skx') config.\n" ); return FALSE; } } else return FALSE; return TRUE; } bool bli_cpuid_is_knl ( uint32_t family, uint32_t model, uint32_t features ) { // Check for expected CPU features. 
const uint32_t expected = FEATURE_AVX | FEATURE_FMA3 | FEATURE_AVX2 | FEATURE_AVX512F | FEATURE_AVX512PF; if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; return TRUE; } bool bli_cpuid_is_haswell ( uint32_t family, uint32_t model, uint32_t features ) { // Check for expected CPU features. const uint32_t expected = FEATURE_AVX | FEATURE_FMA3 | FEATURE_AVX2; if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; return TRUE; } bool bli_cpuid_is_sandybridge ( uint32_t family, uint32_t model, uint32_t features ) { // Check for expected CPU features. const uint32_t expected = FEATURE_AVX; if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; return TRUE; } bool bli_cpuid_is_penryn ( uint32_t family, uint32_t model, uint32_t features ) { // Check for expected CPU features. const uint32_t expected = FEATURE_SSE3 | FEATURE_SSSE3; if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; return TRUE; } // ----------------------------------------------------------------------------- bool bli_cpuid_is_zen3 ( uint32_t family, uint32_t model, uint32_t features ) { // Check for expected CPU features. const uint32_t expected = FEATURE_AVX | FEATURE_FMA3 | FEATURE_AVX2; if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; // All Zen3 cores have a family of 0x19. if ( family != 0x19 ) return FALSE; // Finally, check for specific models: // - 0x00 ~ 0xff // NOTE: We accept any model because the family 25 (0x19) is unique. const bool is_arch = ( 0x00 <= model && model <= 0xff ); if ( !is_arch ) return FALSE; return TRUE; } bool bli_cpuid_is_zen2 ( uint32_t family, uint32_t model, uint32_t features ) { // Check for expected CPU features. const uint32_t expected = FEATURE_AVX | FEATURE_FMA3 | FEATURE_AVX2; if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; // All Zen2 cores have a family of 0x17. 
if ( family == 0x17 ) { return 0x30 <= model && model <= 0xff; } #ifndef BLIS_CONFIG_ZEN3 // Fallback to Zen 2 kernels on Zen 3, when blis is compiled without // Zen 3 support (e.g. because it requires a newer compiler). if ( family == 0x19 ) { return 0x00 <= model && model <= 0xff; } #endif return FALSE; } bool bli_cpuid_is_zen ( uint32_t family, uint32_t model, uint32_t features ) { // Check for expected CPU features. const uint32_t expected = FEATURE_AVX | FEATURE_FMA3 | FEATURE_AVX2; if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; // All Zen cores have a family of 0x17. if ( family != 0x17 ) return FALSE; // Finally, check for specific models: // - 0x00 ~ 0x2f // NOTE: We must check model because the family 23 (0x17) is shared with // zen2. const bool is_arch = ( 0x00 <= model && model <= 0x2f ); if ( !is_arch ) return FALSE; return TRUE; } bool bli_cpuid_is_excavator ( uint32_t family, uint32_t model, uint32_t features ) { // Check for expected CPU features. const uint32_t expected = FEATURE_AVX | FEATURE_FMA3 | FEATURE_AVX2; if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; // All Excavator cores have a family of 0x15. if ( family != 0x15 ) return FALSE; // Finally, check for specific models: // - 0x60 ~ 0x7f const bool is_arch = ( 0x60 <= model && model <= 0x7f ); if ( !is_arch ) return FALSE; return TRUE; } bool bli_cpuid_is_steamroller ( uint32_t family, uint32_t model, uint32_t features ) { // Check for expected CPU features. const uint32_t expected = FEATURE_AVX | FEATURE_FMA3 | FEATURE_FMA4; if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; // All Steamroller cores have a family of 0x15. if ( family != 0x15 ) return FALSE; // Finally, check for specific models: // - 0x30 ~ 0x3f const bool is_arch = ( 0x30 <= model && model <= 0x3f ); if ( !is_arch ) return FALSE; return TRUE; } bool bli_cpuid_is_piledriver ( uint32_t family, uint32_t model, uint32_t features ) { // Check for expected CPU features. 
const uint32_t expected = FEATURE_AVX | FEATURE_FMA3 | FEATURE_FMA4; if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; // All Piledriver cores have a family of 0x15. if ( family != 0x15 ) return FALSE; // Finally, check for specific models: // - 0x02 // - 0x10 ~ 0x1f const bool is_arch = model == 0x02 || ( 0x10 <= model && model <= 0x1f ); if ( !is_arch ) return FALSE; return TRUE; } bool bli_cpuid_is_bulldozer ( uint32_t family, uint32_t model, uint32_t features ) { // Check for expected CPU features. const uint32_t expected = FEATURE_AVX | FEATURE_FMA4; if ( !bli_cpuid_has_features( features, expected ) ) return FALSE; // All Bulldozer cores have a family of 0x15. if ( family != 0x15 ) return FALSE; // Finally, check for specific models: // - 0x00 // - 0x01 const bool is_arch = ( model == 0x00 || model == 0x01 ); if ( !is_arch ) return FALSE; return TRUE; } #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) arch_t bli_cpuid_query_id( void ) { uint32_t vendor, model, part, features; vendor = bli_cpuid_query( &model, &part, &features ); #if 0 printf( "vendor = %u\n", vendor ); printf( "model = %u\n", model ); printf( "part = 0x%x\n", part ); printf( "features = %u\n", features ); #endif if ( vendor == VENDOR_ARM ) { if ( model == MODEL_ARMV8 ) { return part; // Check for each ARMv8 configuration that is enabled, check for that // microarchitecture. We check from most recent to most dated. // If none of the other sub-configurations were detected, return // the 'generic' arch_t id value. return BLIS_ARCH_GENERIC; } else if ( model == MODEL_ARMV7 ) { // Check for each ARMv7 configuration that is enabled, check for that // microarchitecture. We check from most recent to most dated. 
#ifdef BLIS_CONFIG_CORTEXA15 if ( bli_cpuid_is_cortexa15( model, part, features ) ) return BLIS_ARCH_CORTEXA15; #endif #ifdef BLIS_CONFIG_CORTEXA9 if ( bli_cpuid_is_cortexa9( model, part, features ) ) return BLIS_ARCH_CORTEXA9; #endif // If none of the other sub-configurations were detected, return // the 'generic' arch_t id value. return BLIS_ARCH_GENERIC; } } else if ( vendor == VENDOR_UNKNOWN ) { return BLIS_ARCH_GENERIC; } return BLIS_ARCH_GENERIC; } bool bli_cpuid_is_cortexa15 ( uint32_t family, uint32_t model, uint32_t features ) { // Check for expected CPU features. const uint32_t expected = FEATURE_NEON; return bli_cpuid_has_features( features, expected ) && model == 0xc0f; } bool bli_cpuid_is_cortexa9 ( uint32_t family, uint32_t model, uint32_t features ) { // Check for expected CPU features. const uint32_t expected = FEATURE_NEON; return bli_cpuid_has_features( features, expected ) && model == 0xc09; } #endif // ----------------------------------------------------------------------------- // // This section of the file was based off of cpuid.cxx from TBLIS [1]. // // [1] https://github.com/devinamatthews/tblis // /* Copyright (C) 2017, The University of Texas at Austin Copyright (C) 2017, Devin Matthews Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) enum { // input register(s) output register FEATURE_MASK_SSE3 = (1u<< 0), // cpuid[eax=1] :ecx[0] FEATURE_MASK_SSSE3 = (1u<< 9), // cpuid[eax=1] :ecx[9] FEATURE_MASK_SSE41 = (1u<<19), // cpuid[eax=1] :ecx[19] FEATURE_MASK_SSE42 = (1u<<20), // cpuid[eax=1] :ecx[20] FEATURE_MASK_AVX = (1u<<28), // cpuid[eax=1] :ecx[28] FEATURE_MASK_AVX2 = (1u<< 5), // cpuid[eax=7,ecx=0] :ebx[5] FEATURE_MASK_FMA3 = (1u<<12), // cpuid[eax=1] :ecx[12] FEATURE_MASK_FMA4 = (1u<<16), // cpuid[eax=0x80000001]:ecx[16] FEATURE_MASK_AVX512F = (1u<<16), // cpuid[eax=7,ecx=0] :ebx[16] FEATURE_MASK_AVX512DQ = (1u<<17), // cpuid[eax=7,ecx=0] :ebx[17] FEATURE_MASK_AVX512PF = (1u<<26), // cpuid[eax=7,ecx=0] :ebx[26] FEATURE_MASK_AVX512ER = (1u<<27), // cpuid[eax=7,ecx=0] :ebx[27] FEATURE_MASK_AVX512CD = (1u<<28), // cpuid[eax=7,ecx=0] :ebx[28] FEATURE_MASK_AVX512BW = (1u<<30), // cpuid[eax=7,ecx=0] :ebx[30] FEATURE_MASK_AVX512VL = (1u<<31), // cpuid[eax=7,ecx=0] :ebx[31] FEATURE_MASK_XGETBV = (1u<<26)| (1u<<27), // cpuid[eax=1] :ecx[27:26] XGETBV_MASK_XMM = 0x02u, // xcr0[1] XGETBV_MASK_YMM = 0x04u, // xcr0[2] XGETBV_MASK_ZMM = 0xe0u // xcr0[7:5] }; uint32_t bli_cpuid_query 
( uint32_t* family, uint32_t* model, uint32_t* features ) { uint32_t eax, ebx, ecx, edx; uint32_t old_model = 0; uint32_t old_family = 0; uint32_t ext_model = 0; uint32_t ext_family = 0; *family = 0; *model = 0; *features = 0; //fprintf( stderr, "checking cpuid\n" ); uint32_t cpuid_max = __get_cpuid_max( 0, 0 ); uint32_t cpuid_max_ext = __get_cpuid_max( 0x80000000u, 0 ); //fprintf( stderr, "max cpuid leaf: %d\n", cpuid_max ); //fprintf( stderr, "max extended cpuid leaf: %08x\n", cpuid_max_ext ); if ( cpuid_max < 1 ) return VENDOR_UNKNOWN; // The fourth '0' serves as the NULL-terminator for the vendor string. uint32_t vendor_string[4] = { 0, 0, 0, 0 }; // This is actually a macro that modifies the last four operands, // hence why they are not passed by address. __cpuid( 0, eax, vendor_string[0], vendor_string[2], vendor_string[1] ); // Check extended feature bits for post-AVX2 features. if ( cpuid_max >= 7 ) { // This is actually a macro that modifies the last four operands, // hence why they are not passed by address. 
__cpuid_count( 7, 0, eax, ebx, ecx, edx ); //fprintf( stderr, "cpuid leaf 7:\n" ); //print_binary( eax ); //print_binary( ebx ); //print_binary( ecx ); //print_binary( edx ); if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX2 ) ) *features |= FEATURE_AVX2; if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512F ) ) *features |= FEATURE_AVX512F; if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512DQ ) ) *features |= FEATURE_AVX512DQ; if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512PF ) ) *features |= FEATURE_AVX512PF; if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512ER ) ) *features |= FEATURE_AVX512ER; if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512CD ) ) *features |= FEATURE_AVX512CD; if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512BW ) ) *features |= FEATURE_AVX512BW; if ( bli_cpuid_has_features( ebx, FEATURE_MASK_AVX512VL ) ) *features |= FEATURE_AVX512VL; } // Check extended processor info / features bits for AMD-specific features. if ( cpuid_max_ext >= 0x80000001u ) { // This is actually a macro that modifies the last four operands, // hence why they are not passed by address. __cpuid( 0x80000001u, eax, ebx, ecx, edx ); //fprintf(stderr, "extended cpuid leaf 0x80000001:\n"); //print_binary(eax); //print_binary(ebx); //print_binary(ecx); //print_binary(edx); if ( bli_cpuid_has_features( ecx, FEATURE_MASK_FMA4 ) ) *features |= FEATURE_FMA4; } // Unconditionally check processor info / features bits. { // This is actually a macro that modifies the last four operands, // hence why they are not passed by address. 
__cpuid( 1, eax, ebx, ecx, edx ); //fprintf(stderr, "cpuid leaf 1:\n"); //print_binary(eax); //print_binary(ebx); //print_binary(ecx); //print_binary(edx); /* cpuid(eax=1): eax[27:0] 3: 0 - Stepping 7: 4 - Model 11: 8 - Family 13:12 - Processor Type 19:16 - Extended Model 27:20 - Extended Family Intel and AMD have suggested applications to display the family of a CPU as the sum of the "Family" and the "Extended Family" fields shown above, and the model as the sum of the "Model" and the 4-bit left-shifted "Extended Model" fields. If "Family" is different than 6 or 15, only the "Family" and "Model" fields should be used while the "Extended Family" and "Extended Model" bits are reserved. If "Family" is set to 15, then "Extended Family" and the 4-bit left-shifted "Extended Model" should be added to the respective base values, and if "Family" is set to 6, then only the 4-bit left-shifted "Extended Model" should be added to "Model". */ old_model = ( eax >> 4 ) & ( 0xF ); // bits 7:4 old_family = ( eax >> 8 ) & ( 0xF ); // bits 11:8 ext_model = ( eax >> 16 ) & ( 0xF ); // bits 19:16 ext_family = ( eax >> 20 ) & ( 0xFF ); // bits 27:20 // Set the display model and family values based on the original family // value. See explanation above. if ( old_family == 6 ) { *model = ( ext_model << 4 ) + old_model; *family = old_family; } else if ( old_family == 15 ) { *model = ( ext_model << 4 ) + old_model; *family = ( ext_family ) + old_family; } else { *model = old_model; *family = old_family; } // Check for SSE, AVX, and FMA3 features. 
if ( bli_cpuid_has_features( ecx, FEATURE_MASK_SSE3 ) ) *features |= FEATURE_SSE3; if ( bli_cpuid_has_features( ecx, FEATURE_MASK_SSSE3 ) ) *features |= FEATURE_SSSE3; if ( bli_cpuid_has_features( ecx, FEATURE_MASK_SSE41 ) ) *features |= FEATURE_SSE41; if ( bli_cpuid_has_features( ecx, FEATURE_MASK_SSE42 ) ) *features |= FEATURE_SSE42; if ( bli_cpuid_has_features( ecx, FEATURE_MASK_AVX ) ) *features |= FEATURE_AVX; if ( bli_cpuid_has_features( ecx, FEATURE_MASK_FMA3 ) ) *features |= FEATURE_FMA3; // Check whether the hardware supports xsave/xrestor/xsetbv/xgetbv AND // support for these is enabled by the OS. If so, then we proceed with // checking that various register-state saving features are available. if ( bli_cpuid_has_features( ecx, FEATURE_MASK_XGETBV ) ) { uint32_t xcr = 0; // Call xgetbv to get xcr0 (the extended control register) copied // to [edx:eax]. This encodes whether software supports various // register state-saving features. __asm__ __volatile__ ( ".byte 0x0F, 0x01, 0xD0" : "=a" (eax), "=d" (edx) : "c" (xcr) : "cc" ); //fprintf(stderr, "xcr0:\n"); //print_binary(eax); //print_binary(edx); //fprintf(stderr, "xgetbv: xmm: %d\n", bli_cpuid_has_features(eax, XGETBV_MASK_XMM)); //fprintf(stderr, "xgetbv: ymm: %d\n", bli_cpuid_has_features(eax, XGETBV_MASK_XMM| // XGETBV_MASK_YMM)); //fprintf(stderr, "xgetbv: zmm: %d\n", bli_cpuid_has_features(eax, XGETBV_MASK_XMM| // XGETBV_MASK_YMM| // XGETBV_MASK_ZMM)); // The OS can manage the state of 512-bit zmm (AVX-512) registers // only if the xcr[7:5] bits are set. If they are not set, then // clear all feature bits related to AVX-512. if ( !bli_cpuid_has_features( eax, XGETBV_MASK_XMM | XGETBV_MASK_YMM | XGETBV_MASK_ZMM ) ) { *features &= ~( FEATURE_AVX512F | FEATURE_AVX512DQ | FEATURE_AVX512PF | FEATURE_AVX512ER | FEATURE_AVX512CD | FEATURE_AVX512BW | FEATURE_AVX512VL ); } // The OS can manage the state of 256-bit ymm (AVX) registers // only if the xcr[2] bit is set. 
If it is not set, then // clear all feature bits related to AVX. if ( !bli_cpuid_has_features( eax, XGETBV_MASK_XMM | XGETBV_MASK_YMM ) ) { *features &= ~( FEATURE_AVX | FEATURE_AVX2 | FEATURE_FMA3 | FEATURE_FMA4 ); } // The OS can manage the state of 128-bit xmm (SSE) registers // only if the xcr[1] bit is set. If it is not set, then // clear all feature bits related to SSE (which means the // entire bitfield is clear). if ( !bli_cpuid_has_features( eax, XGETBV_MASK_XMM ) ) { *features = 0; } } else { // If the hardware does not support xsave/xrestor/xsetbv/xgetbv, // OR these features are not enabled by the OS, then we clear // the bitfield, because it means that not even xmm support is // present. //fprintf(stderr, "xgetbv: no\n"); features = 0; } } //fprintf(stderr, "vendor: %12s\n", vendor_string); //fprintf(stderr, "family: %d\n", family); //fprintf(stderr, "model: %d\n", model); //fprintf(stderr, "sse3: %d\n", bli_cpuid_has_features(features, FEATURE_SSE3)); //fprintf(stderr, "ssse3: %d\n", bli_cpuid_has_features(features, FEATURE_SSSE3)); //fprintf(stderr, "sse4.1: %d\n", bli_cpuid_has_features(features, FEATURE_SSE41)); //fprintf(stderr, "sse4.2: %d\n", bli_cpuid_has_features(features, FEATURE_SSE42)); //fprintf(stderr, "avx: %d\n", bli_cpuid_has_features(features, FEATURE_AVX)); //fprintf(stderr, "avx2: %d\n", bli_cpuid_has_features(features, FEATURE_AVX2)); //fprintf(stderr, "fma3: %d\n", bli_cpuid_has_features(features, FEATURE_FMA3)); //fprintf(stderr, "fma4: %d\n", bli_cpuid_has_features(features, FEATURE_FMA4)); //fprintf(stderr, "avx512f: %d\n", bli_cpuid_has_features(features, FEATURE_AVX512F)); //fprintf(stderr, "avx512pf: %d\n", bli_cpuid_has_features(features, FEATURE_AVX512PF)); //fprintf(stderr, "avx512dq: %d\n", bli_cpuid_has_features(features, FEATURE_AVX512DQ)); // Check the vendor string and return a value to indicate Intel or AMD. 
if ( strcmp( ( char* )vendor_string, "AuthenticAMD" ) == 0 ) return VENDOR_AMD; else if ( strcmp( ( char* )vendor_string, "GenuineIntel" ) == 0 ) return VENDOR_INTEL; else return VENDOR_UNKNOWN; } void get_cpu_name( char *cpu_name ) { uint32_t eax, ebx, ecx, edx; __cpuid( 0x80000002u, eax, ebx, ecx, edx ); //printf("%x %x %x %x\n", eax, ebx, ecx, edx); *( uint32_t* )&cpu_name[0 + 0] = eax; *( uint32_t* )&cpu_name[0 + 4] = ebx; *( uint32_t* )&cpu_name[0 + 8] = ecx; *( uint32_t* )&cpu_name[0 +12] = edx; __cpuid( 0x80000003u, eax, ebx, ecx, edx ); //printf("%x %x %x %x\n", eax, ebx, ecx, edx); *( uint32_t* )&cpu_name[16+ 0] = eax; *( uint32_t* )&cpu_name[16+ 4] = ebx; *( uint32_t* )&cpu_name[16+ 8] = ecx; *( uint32_t* )&cpu_name[16+12] = edx; __cpuid( 0x80000004u, eax, ebx, ecx, edx ); //printf("%x %x %x %x\n", eax, ebx, ecx, edx); *( uint32_t* )&cpu_name[32+ 0] = eax; *( uint32_t* )&cpu_name[32+ 4] = ebx; *( uint32_t* )&cpu_name[32+ 8] = ecx; *( uint32_t* )&cpu_name[32+12] = edx; } // Return the number of FMA units _assuming avx512 is supported_. // This needs updating for new processor types, sigh. 
// See https://ark.intel.com/content/www/us/en/ark.html#@Processors // and also https://github.com/jeffhammond/vpu-count int vpu_count( void ) { char cpu_name[48] = {}; char* loc; char model_num[5]; int sku; get_cpu_name( cpu_name ); if ( strstr( cpu_name, "Intel(R) Xeon(R)" ) != NULL ) { if (( loc = strstr( cpu_name, "Platinum" ) )) return 2; if ( loc == NULL ) loc = strstr( cpu_name, "Gold" ); // 1 or 2, tested below if ( loc == NULL ) if (( loc = strstr( cpu_name, "Silver" ) )) return 1; if ( loc == NULL ) if (( loc = strstr( cpu_name, "Bronze" ) )) return 1; if ( loc == NULL ) loc = strstr( cpu_name, "W" ); if ( loc == NULL ) if (( loc = strstr( cpu_name, "D" ) )) // Fixme: May be wrong // return 1; if ( loc == NULL ) return -1; // We may have W-nnnn rather than, say, Gold nnnn if ( 'W' == *loc && '-' == *(loc+1) ) loc++; else loc = strstr( loc+1, " " ); if ( loc == NULL ) return -1; strncpy( model_num, loc+1, 4 ); model_num[4] = '\0'; // Things like i9-10900X matched above sku = atoi( model_num ); // These were derived from ARK listings as of 2019-10-09, but // may not be complete, especially as the ARK Skylake listing // seems to be limited. if ( 8199 >= sku && sku >= 8100 ) return 2; else if ( 6199 >= sku && sku >= 6100 ) return 2; else if ( sku == 5122 ) return 2; else if ( 6299 >= sku && sku >= 6200 ) return 2; // Cascade Lake Gold else if ( 5299 >= sku && sku >= 5200 ) return 1; // Cascade Lake Gold else if ( 5199 >= sku && sku >= 5100 ) return 1; else if ( 4199 >= sku && sku >= 4100 ) return 1; else if ( 3199 >= sku && sku >= 3100 ) return 1; else if ( 3299 >= sku && sku >= 3200 ) return 2; // Cascade Lake W else if ( 2299 >= sku && sku >= 2200 ) return 2; // Cascade Lake W else if ( 2199 >= sku && sku >= 2120 ) return 2; else if ( 2102 == sku || sku == 2104 ) return 2; // Gold exceptions else if ( 2119 >= sku && sku >= 2100 ) return 1; else return -1; } else if ( strstr( cpu_name, "Intel(R) Core(TM)" ) != NULL ) return 2; // All i7/i9 with avx512? 
else { return -1; } } #elif defined(__aarch64__) #ifdef __linux__ // This is adapted from OpenBLAS. See // https://www.kernel.org/doc/html/latest/arm64/cpu-feature-registers.html // for the mechanism, but not the magic numbers. // Fixme: Could these be missing in older Linux? #include #include #ifndef HWCAP_CPUID #define HWCAP_CPUID (1 << 11) #endif /* From https://www.kernel.org/doc/html/latest/arm64/sve.html and the aarch64 hwcap.h */ #ifndef HWCAP_SVE #define HWCAP_SVE (1 << 22) #endif /* Maybe also for AT_HWCAP2 #define HWCAP2_SVE2(1 << 1) et al ) */ #endif //__linux__ #ifdef __APPLE__ #include // #include #endif static uint32_t get_coretype ( uint32_t* features ) { int implementer = 0x00, part = 0x000; *features = FEATURE_NEON; #ifdef __linux__ if ( getauxval( AT_HWCAP ) & HWCAP_CPUID ) { // Also available from // /sys/devices/system/cpu/cpu0/regs/identification/midr_el1 // and split out in /proc/cpuinfo (with a tab before the colon): // CPU part : 0x0a1 uint64_t midr_el1; __asm("mrs %0, MIDR_EL1" : "=r" (midr_el1)); /* * MIDR_EL1 * * 31 24 23 20 19 16 15 4 3 0 * ----------------------------------------------------------------- * | Implementer | Variant | Architecture | Part Number | Revision | * ----------------------------------------------------------------- */ implementer = (midr_el1 >> 24) & 0xFF; part = (midr_el1 >> 4) & 0xFFF; } bool has_sve = getauxval( AT_HWCAP ) & HWCAP_SVE; if (has_sve) *features |= FEATURE_SVE; #endif //__linux__ #ifdef __APPLE__ // Better values could be obtained from sysctlbyname() implementer = 0x61; //Apple part = 0x023; //Firestorm #endif //__APPLE__ // From Linux arch/arm64/include/asm/cputype.h // ARM_CPU_IMP_ARM 0x41 // ARM_CPU_IMP_APM 0x50 // ARM_CPU_IMP_CAVIUM 0x43 // ARM_CPU_IMP_BRCM 0x42 // ARM_CPU_IMP_QCOM 0x51 // ARM_CPU_IMP_NVIDIA 0x4E // ARM_CPU_IMP_FUJITSU 0x46 // ARM_CPU_IMP_HISI 0x48 // ARM_CPU_IMP_APPLE 0x61 // // ARM_CPU_PART_AEM_V8 0xD0F // ARM_CPU_PART_FOUNDATION 0xD00 // ARM_CPU_PART_CORTEX_A57 0xD07 // 
ARM_CPU_PART_CORTEX_A72 0xD08 // ARM_CPU_PART_CORTEX_A53 0xD03 // ARM_CPU_PART_CORTEX_A73 0xD09 // ARM_CPU_PART_CORTEX_A75 0xD0A // ARM_CPU_PART_CORTEX_A35 0xD04 // ARM_CPU_PART_CORTEX_A55 0xD05 // ARM_CPU_PART_CORTEX_A76 0xD0B // ARM_CPU_PART_NEOVERSE_N1 0xD0C // ARM_CPU_PART_CORTEX_A77 0xD0D // from GCC: // ARM_CPU_PART_CORTEX_A78 0xd41 // ARM_CPU_PART_CORTEX_X1 0xd44 // ARM_CPU_PART_CORTEX_V1 0xd40 // ARM_CPU_PART_CORTEX_N2 0xd49 // ARM_CPU_PART_CORTEX_R82 0xd15 // // APM_CPU_PART_POTENZA 0x000 // // CAVIUM_CPU_PART_THUNDERX 0x0A1 // CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2 // CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3 // CAVIUM_CPU_PART_THUNDERX2 0x0AF // CAVIUM_CPU_PART_THUNDERX3 0x0B8 // taken from OpenBLAS // // BRCM_CPU_PART_BRAHMA_B53 0x100 // BRCM_CPU_PART_VULCAN 0x516 // // QCOM_CPU_PART_FALKOR_V1 0x800 // QCOM_CPU_PART_FALKOR 0xC00 // QCOM_CPU_PART_KRYO 0x200 // QCOM_CPU_PART_KRYO_3XX_SILVER 0x803 // QCOM_CPU_PART_KRYO_4XX_GOLD 0x804 // QCOM_CPU_PART_KRYO_4XX_SILVER 0x805 // // NVIDIA_CPU_PART_DENVER 0x003 // NVIDIA_CPU_PART_CARMEL 0x004 // // FUJITSU_CPU_PART_A64FX 0x001 // // HISI_CPU_PART_TSV110 0xD01 // APPLE_CPU_PART_M1_ICESTORM 0x022 // APPLE_CPU_PART_M1_FIRESTORM 0x023 // Fixme: After merging the vpu_count branch we could report the // part here with bli_dolog. switch(implementer) { case 0x41: // ARM switch (part) { #ifdef BLIS_CONFIG_CORTEXA57 case 0xd07: // Cortex A57 return BLIS_ARCH_CORTEXA57; #endif #ifdef BLIS_CONFIG_CORTEXA53 case 0xd03: // Cortex A53 return BLIS_ARCH_CORTEXA53; #endif #ifdef BLIS_CONFIG_THUNDERX2 case 0xd0c: // Neoverse N1 (and Graviton G2?) 
return BLIS_ARCH_THUNDERX2; //placeholder for N1 #endif } break; case 0x42: // Broadcom switch (part) { #ifdef BLIS_CONFIG_THUNDERX2 case 0x516: // Vulcan return BLIS_ARCH_THUNDERX2; #endif } break; case 0x43: // Cavium switch (part) { #ifdef BLIS_CONFIG_THUNDERX2 case 0x0af: // ThunderX2 case 0x0b8: // ThunderX3 return BLIS_ARCH_THUNDERX2; #endif } break; case 0x46: // Fujitsu switch (part) { #ifdef BLIS_CONFIG_A64FX case 0x001: // A64FX return BLIS_ARCH_A64FX; #endif } break; case 0x61: // Apple switch (part) { #ifdef BLIS_CONFIG_FIRESTORM case 0x022: // Icestorm (M1.LITTLE) case 0x023: // Firestorm (M1.big) return BLIS_ARCH_FIRESTORM; #endif } break; } #ifdef BLIS_CONFIG_ARMSVE if (has_sve) return BLIS_ARCH_ARMSVE; #endif // Can't use #if defined(...) here because of parsing done for autoconfiguration #ifdef BLIS_CONFIG_CORTEXA57 return BLIS_ARCH_CORTEXA57; #else #ifdef BLIS_CONFIG_CORTEXA53 return BLIS_ARCH_CORTEXA53; #else return BLIS_ARCH_GENERIC; #endif #endif } uint32_t bli_cpuid_query ( uint32_t* model, uint32_t* part, uint32_t* features ) { *model = MODEL_ARMV8; *part = get_coretype(features); return VENDOR_ARM; } #elif defined(__arm__) || defined(_M_ARM) /* I can't easily find documentation to do this as for aarch64, though it presumably could be unearthed from Linux code. However, on Linux 5.2 (and Androids's 3.4), /proc/cpuinfo has this sort of thing, used below: CPU implementer : 0x41 CPU architecture: 7 CPU variant : 0x3 CPU part : 0xc09 The complication for family selection is that Neon is optional for CortexA9, for instance. That's tested in bli_cpuid_is_cortexa9. 
*/ #define TEMP_BUFFER_SIZE 200 uint32_t bli_cpuid_query ( uint32_t* model, uint32_t* part, uint32_t* features ) { *model = MODEL_UNKNOWN; *part = 0; *features = 0; char* pci_str = "/proc/cpuinfo"; char proc_str[ TEMP_BUFFER_SIZE ]; char ptno_str[ TEMP_BUFFER_SIZE ]; char feat_str[ TEMP_BUFFER_SIZE ]; char* r_val; //printf( "bli_cpuid_query(): beginning search\n" ); // Search /proc/cpuinfo for the 'Processor' entry. r_val = find_string_in( "Processor", proc_str, TEMP_BUFFER_SIZE, pci_str ); if ( r_val == NULL ) return VENDOR_ARM; // Search /proc/cpuinfo for the 'CPU part' entry. r_val = find_string_in( "CPU part", ptno_str, TEMP_BUFFER_SIZE, pci_str ); if ( r_val == NULL ) return VENDOR_ARM; // Search /proc/cpuinfo for the 'Features' entry. r_val = find_string_in( "Features", feat_str, TEMP_BUFFER_SIZE, pci_str ); if ( r_val == NULL ) return VENDOR_ARM; #if 0 printf( "bli_cpuid_query(): full processor string: %s\n", proc_str ); printf( "bli_cpuid_query(): full part num string: %s\n", ptno_str ); printf( "bli_cpuid_query(): full features string: %s\n", feat_str ); #endif // Parse the feature string to check for SIMD features. if ( strstr( feat_str, "neon" ) != NULL || strstr( feat_str, "asimd" ) != NULL ) *features |= FEATURE_NEON; // Parse the feature string to check for SVE features. if ( strstr( feat_str, "sve" ) != NULL ) *features |= FEATURE_SVE; //printf( "bli_cpuid_query(): features var: %u\n", *features ); // Parse the processor string to uncover the model. if ( strstr( proc_str, "ARMv7" ) != NULL ) *model = MODEL_ARMV7; else if ( strstr( proc_str, "AArch64" ) != NULL || strstr( proc_str, "ARMv8" ) ) *model = MODEL_ARMV8; //printf( "bli_cpuid_query(): model: %u\n", *model ); // Parse the part number string. 
r_val = strstr( ptno_str, "0x" ); if ( r_val != NULL) { *part = strtol( r_val, NULL, 16 ); } //printf( "bli_cpuid_query(): part#: %x\n", *part ); return VENDOR_ARM; } char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ) { // This function searches for the first line of the file located at // 'filepath' that contains the string 'target' and then copies that // line (actually, the substring of the line starting with 'target') // to 'buffer', which is 'buf_len' bytes long. char* r_val = NULL; // Allocate a temporary local buffer equal to the size of buffer. char* buf_local = malloc( buf_len * sizeof( char ) ); // Open the file stream. FILE* stream = fopen( filepath, "r" ); // Repeatedly read in a line from the stream, storing the contents of // the stream into buf_local. while ( !feof( stream ) ) { // Read in the current line, up to buf_len-1 bytes. r_val = fgets( buf_local, buf_len-1, stream ); //printf( "read line: %s", buf_local ); // fgets() returns the pointer specified by the first argument (in // this case, buf_local) on success and NULL on error. if ( r_val == NULL ) break; // Since fgets() was successful, we can search for the target string // within the current line, as captured in buf_local. r_val = strstr( buf_local, target ); // If the target string was found in buf_local, we save it to buffer. if ( r_val != NULL ) { //printf( " found match to '%s'\n", target ); // Copy the string read by fgets() to the caller's buffer. strncpy( buffer, buf_local, buf_len ); // Make sure that we have a terminating null character by the // end of the buffer. if ( buf_len > 0 ) buffer[ buf_len - 1 ] = '\0'; // Leave the loop since we found the target string. break; } } // Close the file stream. fclose( stream ); // Free the temporary local buffer. free( buf_local ); // Return r_val so the caller knows if we failed. 
return r_val; } #endif cython-blis-0.9.1/blis/_src/frame/base/bli_cpuid.h000066400000000000000000000153231427272030600217510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018-2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #if 0 // Used only during standalone testing of ARM support. 
#define FALSE 0 #define TRUE 1 typedef enum { BLIS_ARCH_CORTEXA57 = 10, BLIS_ARCH_CORTEXA15 = 11, BLIS_ARCH_CORTEXA9 = 12, BLIS_ARCH_GENERIC = 13 } arch_t; typedef uint64_t bool; #define bli_abort abort #endif #ifndef BLIS_CPUID_H #define BLIS_CPUID_H arch_t bli_cpuid_query_id( void ); // Intel bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features ); // AMD bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features ); // ARM bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features ); uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features ); // 
----------------------------------------------------------------------------- // // This section of the file was based off of cpuid.hpp from TBLIS [1]. // // [1] https://github.com/devinamatthews/tblis // /* Copyright (C) 2017, The University of Texas at Austin Copyright (C) 2017, Devin Matthews Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want ) { return ( have & want ) == want; } // ----------------------------------------------------------------------------- #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) // cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393 // for more information why this move was made. //#include "cpuid.h" void get_cpu_name( char *cpu_name ); int vpu_count( void ); enum { VENDOR_INTEL = 0, VENDOR_AMD, VENDOR_UNKNOWN }; enum { FEATURE_SSE3 = 0x0001, FEATURE_SSSE3 = 0x0002, FEATURE_SSE41 = 0x0004, FEATURE_SSE42 = 0x0008, FEATURE_AVX = 0x0010, FEATURE_AVX2 = 0x0020, FEATURE_FMA3 = 0x0040, FEATURE_FMA4 = 0x0080, FEATURE_AVX512F = 0x0100, FEATURE_AVX512DQ = 0x0200, FEATURE_AVX512PF = 0x0400, FEATURE_AVX512ER = 0x0800, FEATURE_AVX512CD = 0x1000, FEATURE_AVX512BW = 0x2000, FEATURE_AVX512VL = 0x4000 }; #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ); enum { VENDOR_ARM = 0, VENDOR_UNKNOWN }; enum { MODEL_ARMV7 = 0, MODEL_ARMV8, MODEL_UNKNOWN }; enum { FEATURE_NEON = 0x01, FEATURE_SVE = 0x02 }; #endif #endif cython-blis-0.9.1/blis/_src/frame/base/bli_env.c000066400000000000000000000106131427272030600214250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_CONFIGURETIME_CPUID // NOTE: If you need to make any changes to this cpp branch, it's probably // the case that you also need to modify bli_arch.c, bli_cpuid.c, and // bli_env.c. Don't forget to update these other files as needed! // The BLIS_ENABLE_SYSTEM macro must be defined so that the correct cpp // branch in bli_system.h is processed. (This macro is normally defined in // bli_config.h.) #define BLIS_ENABLE_SYSTEM // Use C-style static inline functions for any static inline functions that // happen to be defined by the headers below. (This macro is normally defined // in bli_config_macro_defs.h.) #define BLIS_INLINE static // Since we're not building a shared library, we can forgo the use of the // BLIS_EXPORT_BLIS annotations by #defining them to be nothing. 
(This macro // is normally defined in bli_config_macro_defs.h.) #define BLIS_EXPORT_BLIS #include "bli_system.h" #include "bli_type_defs.h" //#include "bli_arch.h" //#include "bli_cpuid.h" #include "bli_env.h" #else #include "blis.h" #endif // ----------------------------------------------------------------------------- gint_t bli_env_get_var( const char* env, gint_t fallback ) { gint_t r_val; char* str; // Query the environment variable and store the result in str. str = getenv( env ); // Set the return value based on the string obtained from getenv(). if ( str != NULL ) { // If there was no error, convert the string to an integer and // prepare to return that integer. r_val = ( gint_t )strtol( str, NULL, 10 ); } else { // If there was an error, use the "fallback" as the return value. r_val = fallback; } return r_val; } #if 0 #ifdef _MSC_VER #define strerror_r(errno,buf,len) strerror_s(buf,len,errno) #endif void bli_env_set_var( const char* env, dim_t value ) { dim_t r_val; char value_str[32]; const char* fs_32 = "%u"; const char* fs_64 = "%lu"; // Convert the string to an integer, but vary the format specifier // depending on the integer type size. if ( bli_info_get_int_type_size() == 32 ) sprintf( value_str, fs_32, value ); else sprintf( value_str, fs_64, value ); // Set the environment variable using the string we just wrote to via // sprintf(). (The 'TRUE' argument means we want to overwrite the current // value if the environment variable already exists.) r_val = bli_setenv( env, value_str, TRUE ); // Check the return value in case something went horribly wrong. if ( r_val == -1 ) { char err_str[128]; // Query the human-readable error string corresponding to errno. strerror_r( errno, err_str, 128 ); // Print the error message. 
bli_print_msg( err_str, __FILE__, __LINE__ ); } } #endif cython-blis-0.9.1/blis/_src/frame/base/bli_env.h000066400000000000000000000036431427272030600214370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_ENV_H #define BLIS_ENV_H gint_t bli_env_get_var( const char* env, gint_t fallback ); //void bli_env_set_var( const char* env, dim_t value ); #endif cython-blis-0.9.1/blis/_src/frame/base/bli_error.c000066400000000000000000000206771427272030600220010ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Internal array to hold error strings. 
// Table of human-readable messages, one per error code. Each entry is placed
// at the index given by the *negated* error code, and the table is sized by
// the negated BLIS_ERROR_CODE_MAX constant; bli_error_string_for_code() in
// this file looks messages up with the same negation.
// NOTE(review): any index below -BLIS_ERROR_CODE_MAX that has no designated
// initializer here is implicitly NULL (static storage), so callers should not
// assume every slot holds a string.
// NOTE(review): presumably all BLIS_* error codes are negative integers, which
// is what makes the negated values valid array indices -- confirm against the
// err_t definition.
static char *bli_error_string[-BLIS_ERROR_CODE_MAX] =
{
	[-BLIS_INVALID_ERROR_CHECKING_LEVEL] = "Invalid error checking level.",
	[-BLIS_UNDEFINED_ERROR_CODE] = "Undefined error code.",
	[-BLIS_NULL_POINTER] = "Encountered unexpected null pointer.",
	[-BLIS_NOT_YET_IMPLEMENTED] = "Requested functionality not yet implemented.",

	// Parameter (side/uplo/trans/conj/diag) errors.
	[-BLIS_INVALID_SIDE] = "Invalid side parameter value.",
	[-BLIS_INVALID_UPLO] = "Invalid uplo_t parameter value.",
	[-BLIS_INVALID_TRANS] = "Invalid trans_t parameter value.",
	[-BLIS_INVALID_CONJ] = "Invalid conj_t parameter value.",
	[-BLIS_INVALID_DIAG] = "Invalid diag_t parameter value.",
	[-BLIS_EXPECTED_NONUNIT_DIAG] = "Expected object with non-unit diagonal.",

	// Datatype errors.
	[-BLIS_INVALID_DATATYPE] = "Invalid datatype value.",
	[-BLIS_EXPECTED_FLOATING_POINT_DATATYPE] = "Expected floating-point datatype value.",
	[-BLIS_EXPECTED_NONINTEGER_DATATYPE] = "Expected non-integer datatype value.",
	[-BLIS_EXPECTED_NONCONSTANT_DATATYPE] = "Expected non-constant datatype value.",
	[-BLIS_EXPECTED_REAL_DATATYPE] = "Expected real datatype value.",
	[-BLIS_EXPECTED_INTEGER_DATATYPE] = "Expected integer datatype value.",
	[-BLIS_INCONSISTENT_DATATYPES] = "Expected consistent datatypes (equal, or one being constant).",
	[-BLIS_EXPECTED_REAL_PROJ_OF] = "Expected second datatype to be real projection of first.",
	[-BLIS_EXPECTED_REAL_VALUED_OBJECT] = "Expected real-valued object (ie: if complex, imaginary component equals zero).",
	[-BLIS_INCONSISTENT_PRECISIONS] = "Expected consistent precisions (both single or both double).",

	// Dimension errors.
	[-BLIS_NONCONFORMAL_DIMENSIONS] = "Encountered non-conformal dimensions between objects.",
	[-BLIS_EXPECTED_SCALAR_OBJECT] = "Expected scalar object.",
	[-BLIS_EXPECTED_VECTOR_OBJECT] = "Expected vector object.",
	[-BLIS_UNEQUAL_VECTOR_LENGTHS] = "Encountered unequal vector lengths.",
	[-BLIS_EXPECTED_SQUARE_OBJECT] = "Expected square object.",
	[-BLIS_UNEXPECTED_OBJECT_LENGTH] = "Unexpected object length.",
	[-BLIS_UNEXPECTED_OBJECT_WIDTH] = "Unexpected object width.",
	[-BLIS_UNEXPECTED_VECTOR_DIM] = "Unexpected vector dimension.",
	[-BLIS_UNEXPECTED_DIAG_OFFSET] = "Unexpected object diagonal offset.",
	[-BLIS_NEGATIVE_DIMENSION] = "Encountered negative dimension.",

	// Stride errors.
	[-BLIS_INVALID_ROW_STRIDE] = "Encountered invalid row stride relative to n dimension.",
	[-BLIS_INVALID_COL_STRIDE] = "Encountered invalid col stride relative to m dimension.",
	[-BLIS_INVALID_DIM_STRIDE_COMBINATION] = "Encountered invalid stride/dimension combination.",

	// Structure errors.
	[-BLIS_EXPECTED_GENERAL_OBJECT] = "Expected general object.",
	[-BLIS_EXPECTED_HERMITIAN_OBJECT] = "Expected Hermitian object.",
	[-BLIS_EXPECTED_SYMMETRIC_OBJECT] = "Expected symmetric object.",
	[-BLIS_EXPECTED_TRIANGULAR_OBJECT] = "Expected triangular object.",
	[-BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT] = "Expected upper or lower triangular object.",

	// Partitioning errors.
	[-BLIS_INVALID_3x1_SUBPART] = "Encountered invalid 3x1 (vertical) subpartition label.",
	[-BLIS_INVALID_1x3_SUBPART] = "Encountered invalid 1x3 (horizontal) subpartition label.",
	[-BLIS_INVALID_3x3_SUBPART] = "Encountered invalid 3x3 (diagonal) subpartition label.",

	// Control tree, packing and buffer errors.
	[-BLIS_UNEXPECTED_NULL_CONTROL_TREE] = "Encountered unexpected null control tree node.",
	[-BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK] = "Pack schema not yet supported/implemented for use with unpacking.",
	[-BLIS_EXPECTED_NONNULL_OBJECT_BUFFER] = "Encountered object with non-zero dimensions containing null buffer.",

	// Memory allocation and alignment errors.
	[-BLIS_MALLOC_RETURNED_NULL] = "malloc() returned NULL; heap memory is likely exhausted.",
	[-BLIS_INVALID_PACKBUF] = "Invalid packbuf_t value.",
	[-BLIS_EXHAUSTED_CONTIG_MEMORY_POOL] = "Attempted to allocate more memory from contiguous pool than is available.",
	[-BLIS_INSUFFICIENT_STACK_BUF_SIZE] = "Configured maximum stack buffer size is insufficient for register blocksizes currently in use.",
	[-BLIS_ALIGNMENT_NOT_POWER_OF_TWO] = "Encountered memory alignment value that is either zero or not a power of two.",
	[-BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE] = "Encountered memory alignment value that is not a multiple of sizeof(void*).",

	// Object, architecture and context errors.
	[-BLIS_EXPECTED_OBJECT_ALIAS] = "Expected object to be alias.",
	[-BLIS_INVALID_ARCH_ID] = "Invalid architecture id value.",
	[-BLIS_UNINITIALIZED_GKS_CNTX] = "Accessed uninitialized context in gks; BLIS_ARCH_TYPE is probably set to an invalid architecture id.",

	// Blocksize consistency errors.
	[-BLIS_MC_DEF_NONMULTIPLE_OF_MR] = "Default MC is non-multiple of MR for one or more datatypes.",
	[-BLIS_MC_MAX_NONMULTIPLE_OF_MR] = "Maximum MC is non-multiple of MR for one or more datatypes.",
	[-BLIS_NC_DEF_NONMULTIPLE_OF_NR] = "Default NC is non-multiple of NR for one or more datatypes.",
	[-BLIS_NC_MAX_NONMULTIPLE_OF_NR] = "Maximum NC is non-multiple of NR for one or more datatypes.",
	[-BLIS_KC_DEF_NONMULTIPLE_OF_KR] = "Default KC is non-multiple of KR for one or more datatypes.",
	[-BLIS_KC_MAX_NONMULTIPLE_OF_KR] = "Maximum KC is non-multiple of KR for one or more datatypes.",
};

// -----------------------------------------------------------------------------

// Writes a diagnostic to stderr: a blank line, then a "libblis: " line naming
// the source file and line number, then a "libblis: " line with the message.
// stderr is flushed before returning.
//
//   str   message text, printed with %s (must be a valid C string).
//   file  source file name, printed with %s (must be a valid C string).
//   line  source line number; cast to unsigned long to match the %lu
//         conversion regardless of how guint_t is defined.
void bli_print_msg( char* str, char* file, guint_t line )
{
	fprintf( stderr, "\n" );
	fprintf( stderr, "libblis: %s (line %lu):\n", file, ( long unsigned int )line );
	fprintf( stderr, "libblis: %s\n", str );
	fflush( stderr );
}

// Prints "libblis: Aborting." to stderr and then terminates the process by
// calling abort(); this function does not return to its caller.
void bli_abort( void )
{
	fprintf( stderr, "libblis: Aborting.\n" );

	// Disabled alternative that raised SIGABRT directly instead of
	// going through abort().
	//raise( SIGABRT );
	abort();
}

// -----------------------------------------------------------------------------

// Current error checking level.
static BLIS_THREAD_LOCAL errlev_t bli_err_chk_level = BLIS_FULL_ERROR_CHECKING; errlev_t bli_error_checking_level( void ) { return bli_err_chk_level; } void bli_error_checking_level_set( errlev_t new_level ) { err_t e_val; e_val = bli_check_valid_error_level( new_level ); bli_check_error_code( e_val ); bli_err_chk_level = new_level; } bool bli_error_checking_is_enabled( void ) { return bli_error_checking_level() != BLIS_NO_ERROR_CHECKING; } char* bli_error_string_for_code( gint_t code ) { return bli_error_string[-code]; } cython-blis-0.9.1/blis/_src/frame/base/bli_error.h000066400000000000000000000041161427272030600217740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); void bli_print_msg( char* str, char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); char* bli_error_string_for_code( gint_t code ); cython-blis-0.9.1/blis/_src/frame/base/bli_func.c000066400000000000000000000061421427272030600215720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ) { func_t* f; err_t r_val; f = ( func_t* )bli_malloc_intl( sizeof( func_t ), &r_val ); bli_func_init ( f, ptr_s, ptr_d, ptr_c, ptr_z ); return f; } void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ) { bli_func_set_dt( ptr_s, BLIS_FLOAT, f ); bli_func_set_dt( ptr_d, BLIS_DOUBLE, f ); bli_func_set_dt( ptr_c, BLIS_SCOMPLEX, f ); bli_func_set_dt( ptr_z, BLIS_DCOMPLEX, f ); } void bli_func_init_null ( func_t* f ) { bli_func_set_dt( NULL, BLIS_FLOAT, f ); bli_func_set_dt( NULL, BLIS_DOUBLE, f ); bli_func_set_dt( NULL, BLIS_SCOMPLEX, f ); bli_func_set_dt( NULL, BLIS_DCOMPLEX, f ); } void bli_func_free( func_t* f ) { bli_free_intl( f ); } // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ) { return ( bli_func_get_dt( dt, f ) == NULL ); } bool bli_func_is_null( func_t* f ) { bool r_val = TRUE; num_t dt; // Iterate over all floating-point datatypes. If any is non-null, // return FALSE. Otherwise, if they are all null, return TRUE. 
for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) { if ( bli_func_get_dt( dt, f ) != NULL ) { r_val = FALSE; break; } } return r_val; } cython-blis-0.9.1/blis/_src/frame/base/bli_func.h000066400000000000000000000055421427272030600216020ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ // ----------------------------------------------------------------------------- // func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); cython-blis-0.9.1/blis/_src/frame/base/bli_getopt.c000066400000000000000000000141141427272030600221370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" static const char OPT_MARKER = '-'; void bli_getopt_init_state( int opterr, getopt_t* state ) { state->optarg = NULL; state->optind = 1; state->opterr = opterr; state->optopt = 0; } int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ) { static char* nextchar = NULL; char* elem_str; char* optstr_char; // If argv contains no more arguments to process, return. if ( state->optind == argc ) return -1; // Get a pointer to the current argv element string to process. If // nextchar is non-NULL, then it means the previous call processed // an element of argv with more than one option character, in which // case we need to pick up where we left off (which is the address // contained in nextchar). if ( nextchar == NULL ) { elem_str = argv[ state->optind ]; // elem_str[0] should be an OPT_MARKER if it is an option. In the // event that it is not an option, argv should be permuted so that // the non-option argument moves back toward the end of the list. 
// This functionality is not supported/implemented here. Therefore, // we require all of the program's option arguments to precede all of // its non-option arguments. if ( elem_str[0] != OPT_MARKER ) { state->optarg = NULL; //state->optind += 1; return -1; } // Skip over the OPT_MARKER. elem_str++; } else { // Note we don't need to skip the OPT_MARKER here since we are // continuing processing of a string with more than one option // character. // Use the nextchar pointer as our element string. elem_str = nextchar; // Reset nextchar to NULL. nextchar = NULL; } // Find the first occurrence of elem_str[0] in optstring. optstr_char = strchr( optstring, elem_str[0] ); // If the option character in elem_str[0] is absent from the option // string, store it and return '?'. if ( optstr_char == NULL ) { if ( state->opterr == 1 ) fprintf( stderr, "bli_getopt(): **error**: option character '%c' missing from option string \"%s\"\n", elem_str[0], optstring ); // We can't dereference optstr_char since it is NULL, so we use // elem_str[0] instead. state->optopt = elem_str[0]; state->optind += 1; return '?'; } // We can now safely assume that an option characer was found in the // option string. Now we need to check if the option takes an argument. if ( optstr_char[1] == ':' ) { // If the current element string ends after the option character, // then the companion argument must be stored in the next element // of argv. Otherwise, the argument begins immediately after the // option character. if ( elem_str[1] == '\0' ) { // If there are no more elements in argv, the argument was // omitted. Store the corresponding option character and // return '?'. 
if ( state->optind + 1 >= argc ) { if ( state->opterr == 1 ) fprintf( stderr, "bli_getopt(): **error**: option character '%c' is missing an argument (end of argv)\n", elem_str[0] ); state->optopt = *optstr_char; state->optind += 1; return '?'; } // If there are still more elements in argv yet to process AND // the next one is an option, then the argument was omitted. else if ( argv[ state->optind + 1 ][0] == OPT_MARKER ) { if ( state->opterr == 1 ) fprintf( stderr, "bli_getopt(): **error**: option character '%c' is missing an argument (next element of argv is option '%c')\n", elem_str[0], argv[ state->optind + 1 ][1] ); state->optopt = *optstr_char; state->optind += 1; return '?'; } // If no error was deteced above, we can safely assign optarg // to be the next element in argv and increment optind by two. state->optarg = argv[ state->optind + 1 ]; state->optind += 2; } else { // We don't need to check for missing arguments since we know // that because the char after the option character is not NULL, // the character(s) after it must constitute the argument. state->optarg = &elem_str[1]; state->optind += 1; } return *optstr_char; } // The current option character does NOT take an argument. However, we // still need to check if the next char is an option argument (such as // occurs when the user runs "program -rv" instead of "program -r -v"). if ( elem_str[1] != '\0' ) { if ( strchr( optstring, elem_str[1] ) != NULL ) { nextchar = &elem_str[1]; return *optstr_char; } } state->optarg = NULL; state->optind += 1; return *optstr_char; } cython-blis-0.9.1/blis/_src/frame/base/bli_getopt.h000066400000000000000000000036421427272030600221500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
// Parser state shared between bli_getopt_init_state() and successive calls
// to bli_getopt().
typedef struct getopt_s
{
	// Argument string of an option that takes one; set by bli_getopt() and
	// initialized to NULL by bli_getopt_init_state().
	char* optarg;
	// Index of the argv element to examine next; initialized to 1.
	int   optind;
	// Diagnostics switch: bli_getopt() prints to stderr only when this is 1.
	int   opterr;
	// Option character recorded when bli_getopt() returns '?'; initialized
	// to 0.
	int   optopt;
} getopt_t;

// Initialize state for a fresh parse, storing opterr in state->opterr.
BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state );

// Return the next option character from argv, '?' on an unknown option or
// missing argument, or -1 when no options remain.
BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state );
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018-2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // The array of cntx_t* pointers to cache modified contexts used by // induced methods. static cntx_t** gks[ BLIS_NUM_ARCHS ]; // The array of function pointers holding the registered context initialization // functions for induced methods. static void_fp cntx_ind_init[ BLIS_NUM_ARCHS ]; // The array of function pointers holding the registered context initialization // functions for reference kernels. 
// Initialize the global kernel structure (gks): clear the internal index
// arrays and then register a context for every sub-configuration that was
// #define'd in bli_config.h. Each architecture id is registered exactly once.
void bli_gks_init( void )
{
	// Initialize the internal data structure we use to track registered
	// contexts.
	bli_gks_init_index();

	// Register a context for each architecture that was #define'd in
	// bli_config.h.

	// Intel architectures
#ifdef BLIS_CONFIG_SKX
	bli_gks_register_cntx( BLIS_ARCH_SKX,         bli_cntx_init_skx,
	                                              bli_cntx_init_skx_ref,
	                                              bli_cntx_init_skx_ind );
#endif
#ifdef BLIS_CONFIG_KNL
	bli_gks_register_cntx( BLIS_ARCH_KNL,         bli_cntx_init_knl,
	                                              bli_cntx_init_knl_ref,
	                                              bli_cntx_init_knl_ind );
#endif
#ifdef BLIS_CONFIG_KNC
	bli_gks_register_cntx( BLIS_ARCH_KNC,         bli_cntx_init_knc,
	                                              bli_cntx_init_knc_ref,
	                                              bli_cntx_init_knc_ind );
#endif
#ifdef BLIS_CONFIG_HASWELL
	bli_gks_register_cntx( BLIS_ARCH_HASWELL,     bli_cntx_init_haswell,
	                                              bli_cntx_init_haswell_ref,
	                                              bli_cntx_init_haswell_ind );
#endif
#ifdef BLIS_CONFIG_SANDYBRIDGE
	bli_gks_register_cntx( BLIS_ARCH_SANDYBRIDGE, bli_cntx_init_sandybridge,
	                                              bli_cntx_init_sandybridge_ref,
	                                              bli_cntx_init_sandybridge_ind );
#endif
#ifdef BLIS_CONFIG_PENRYN
	bli_gks_register_cntx( BLIS_ARCH_PENRYN,      bli_cntx_init_penryn,
	                                              bli_cntx_init_penryn_ref,
	                                              bli_cntx_init_penryn_ind );
#endif

	// AMD architectures
#ifdef BLIS_CONFIG_ZEN3
	bli_gks_register_cntx( BLIS_ARCH_ZEN3,        bli_cntx_init_zen3,
	                                              bli_cntx_init_zen3_ref,
	                                              bli_cntx_init_zen3_ind );
#endif
#ifdef BLIS_CONFIG_ZEN2
	bli_gks_register_cntx( BLIS_ARCH_ZEN2,        bli_cntx_init_zen2,
	                                              bli_cntx_init_zen2_ref,
	                                              bli_cntx_init_zen2_ind );
#endif
#ifdef BLIS_CONFIG_ZEN
	bli_gks_register_cntx( BLIS_ARCH_ZEN,         bli_cntx_init_zen,
	                                              bli_cntx_init_zen_ref,
	                                              bli_cntx_init_zen_ind );
#endif
#ifdef BLIS_CONFIG_EXCAVATOR
	bli_gks_register_cntx( BLIS_ARCH_EXCAVATOR,   bli_cntx_init_excavator,
	                                              bli_cntx_init_excavator_ref,
	                                              bli_cntx_init_excavator_ind );
#endif
#ifdef BLIS_CONFIG_STEAMROLLER
	bli_gks_register_cntx( BLIS_ARCH_STEAMROLLER, bli_cntx_init_steamroller,
	                                              bli_cntx_init_steamroller_ref,
	                                              bli_cntx_init_steamroller_ind );
#endif
#ifdef BLIS_CONFIG_PILEDRIVER
	bli_gks_register_cntx( BLIS_ARCH_PILEDRIVER,  bli_cntx_init_piledriver,
	                                              bli_cntx_init_piledriver_ref,
	                                              bli_cntx_init_piledriver_ind );
#endif
#ifdef BLIS_CONFIG_BULLDOZER
	bli_gks_register_cntx( BLIS_ARCH_BULLDOZER,   bli_cntx_init_bulldozer,
	                                              bli_cntx_init_bulldozer_ref,
	                                              bli_cntx_init_bulldozer_ind );
#endif

	// ARM architectures
	// NOTE: A64FX is registered only here; a second, identical registration
	// further down the ARM list was redundant and has been removed.
#ifdef BLIS_CONFIG_A64FX
	bli_gks_register_cntx( BLIS_ARCH_A64FX,       bli_cntx_init_a64fx,
	                                              bli_cntx_init_a64fx_ref,
	                                              bli_cntx_init_a64fx_ind );
#endif
#ifdef BLIS_CONFIG_THUNDERX2
	bli_gks_register_cntx( BLIS_ARCH_THUNDERX2,   bli_cntx_init_thunderx2,
	                                              bli_cntx_init_thunderx2_ref,
	                                              bli_cntx_init_thunderx2_ind );
#endif
#ifdef BLIS_CONFIG_CORTEXA57
	bli_gks_register_cntx( BLIS_ARCH_CORTEXA57,   bli_cntx_init_cortexa57,
	                                              bli_cntx_init_cortexa57_ref,
	                                              bli_cntx_init_cortexa57_ind );
#endif
#ifdef BLIS_CONFIG_CORTEXA53
	bli_gks_register_cntx( BLIS_ARCH_CORTEXA53,   bli_cntx_init_cortexa53,
	                                              bli_cntx_init_cortexa53_ref,
	                                              bli_cntx_init_cortexa53_ind );
#endif
#ifdef BLIS_CONFIG_ARMSVE
	bli_gks_register_cntx( BLIS_ARCH_ARMSVE,      bli_cntx_init_armsve,
	                                              bli_cntx_init_armsve_ref,
	                                              bli_cntx_init_armsve_ind );
#endif
#ifdef BLIS_CONFIG_FIRESTORM
	bli_gks_register_cntx( BLIS_ARCH_FIRESTORM,   bli_cntx_init_firestorm,
	                                              bli_cntx_init_firestorm_ref,
	                                              bli_cntx_init_firestorm_ind );
#endif
#ifdef BLIS_CONFIG_CORTEXA15
	bli_gks_register_cntx( BLIS_ARCH_CORTEXA15,   bli_cntx_init_cortexa15,
	                                              bli_cntx_init_cortexa15_ref,
	                                              bli_cntx_init_cortexa15_ind );
#endif
#ifdef BLIS_CONFIG_CORTEXA9
	bli_gks_register_cntx( BLIS_ARCH_CORTEXA9,    bli_cntx_init_cortexa9,
	                                              bli_cntx_init_cortexa9_ref,
	                                              bli_cntx_init_cortexa9_ind );
#endif

	// IBM architectures
#ifdef BLIS_CONFIG_POWER10
	bli_gks_register_cntx( BLIS_ARCH_POWER10,     bli_cntx_init_power10,
	                                              bli_cntx_init_power10_ref,
	                                              bli_cntx_init_power10_ind );
#endif
#ifdef BLIS_CONFIG_POWER9
	bli_gks_register_cntx( BLIS_ARCH_POWER9,      bli_cntx_init_power9,
	                                              bli_cntx_init_power9_ref,
	                                              bli_cntx_init_power9_ind );
#endif
#ifdef BLIS_CONFIG_POWER7
	bli_gks_register_cntx( BLIS_ARCH_POWER7,      bli_cntx_init_power7,
	                                              bli_cntx_init_power7_ref,
	                                              bli_cntx_init_power7_ind );
#endif
#ifdef BLIS_CONFIG_BGQ
	bli_gks_register_cntx( BLIS_ARCH_BGQ,         bli_cntx_init_bgq,
	                                              bli_cntx_init_bgq_ref,
	                                              bli_cntx_init_bgq_ind );
#endif

	// Generic architectures
#ifdef BLIS_CONFIG_GENERIC
	bli_gks_register_cntx( BLIS_ARCH_GENERIC,     bli_cntx_init_generic,
	                                              bli_cntx_init_generic_ref,
	                                              bli_cntx_init_generic_ind );
#endif
}
bli_free_intl( gks_id ); } } } // END CRITICAL SECTION } // ----------------------------------------------------------------------------- void bli_gks_init_index( void ) { // This function is called by bli_gks_init(). It simply initializes all // architecture id elements of the internal arrays to NULL. const size_t gks_size = sizeof( cntx_t* ) * BLIS_NUM_ARCHS; const size_t fpa_size = sizeof( void_fp ) * BLIS_NUM_ARCHS; // Set every entry in gks and context init function pointer arrays to // zero/NULL. This is done so that later on we know which ones were // allocated. memset( gks, 0, gks_size ); memset( cntx_ref_init, 0, fpa_size ); memset( cntx_ind_init, 0, fpa_size ); } // ----------------------------------------------------------------------------- cntx_t* bli_gks_lookup_nat_cntx ( arch_t id ) { // Return the address of the (native) context for a given architecture id. // This function assumes the architecture has already been registered. return bli_gks_lookup_ind_cntx( id, BLIS_NAT ); } // ----------------------------------------------------------------------------- cntx_t* bli_gks_lookup_ind_cntx ( arch_t id, ind_t ind ) { // Return the address of the context for a given architecture id and // induced method. This function assumes the architecture has already // been registered. Note that this function returns NULL if the induced // method hasn't yet been called (and thus its context pointer is still // NULL). // Sanity check: verify that the arch_t id is valid. if ( bli_error_checking_is_enabled() ) { err_t e_val = bli_check_valid_arch_id( id ); bli_check_error_code( e_val ); } // Index into the array of context pointers for the given architecture id, // and then index into the subarray for the given induced method. cntx_t** restrict gks_id = gks[ id ]; cntx_t* restrict gks_id_ind = gks_id[ ind ]; // Return the context pointer at gks_id_ind. 
return gks_id_ind; } // ----------------------------------------------------------------------------- cntx_t** bli_gks_lookup_id ( arch_t id ) { // Return the address of the array of context pointers for a given // architecture id. This function is only used for sanity check purposes // to ensure that the underlying data structures for a particular id are // initialized. // Index into the array of context pointers for the given architecture id. cntx_t** restrict gks_id = gks[ id ]; // Return the context pointer at gks_id_ind. return gks_id; } // ----------------------------------------------------------------------------- void bli_gks_register_cntx ( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ) { err_t r_val; // This function is called by bli_gks_init() for each architecture that // will be supported by BLIS. It takes an architecture id and three // function pointers, one to a function that initializes a native context // (supplied by the kernel developer), one to a function that initializes // a reference context (with function pointers specific to the architecture // associated with id), and one to a function that initializes a // context for use with induced methods (again, with function pointers // to the architecture). The latter two functions are automatically // generated by the framework. Unlike with native contexts, we don't // actually store the induced contexts until that induced method is // called, and we don't ever store reference contexts. For this reason, we // can get away with only storing the pointers to the initialization // functions for those latter two types of contexts, which we can then // call at a later time when those contexts are needed. // Sanity check: verify that the arch_t id is valid. 
if ( bli_error_checking_is_enabled() ) { err_t e_val = bli_check_valid_arch_id( id ); bli_check_error_code( e_val ); } nat_cntx_init_ft f = nat_fp; // First, store the function pointers to the context initialization // functions for reference kernels and induced method execution. The // former will be used whenever we need to obtain reference kernels and // latter will be used later on if the user calls a level-3 function // with induced execution enabled. cntx_ref_init[ id ] = ref_fp; cntx_ind_init[ id ] = ind_fp; // If the the context array pointer isn't NULL, then it means the given // architecture id has already registered (and the underlying memory // allocations and context initializations have already been performed). // This is really just a safety feature to prevent memory leaks; this // early return should never occur, because the caller should never try // to register with an architecture id that has already been registered. if ( gks[ id ] != NULL ) return; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_gks_register_cntx(): " ); #endif // At this point, we know the pointer to the array of cntx_t* is NULL and // needs to be allocated. Allocate the memory and initialize it to // zeros/NULL, storing the address of the alloacted memory at the element // for the current architecture id. gks[ id ] = bli_calloc_intl( sizeof( cntx_t* ) * BLIS_NUM_IND_METHODS, &r_val ); // Alias the allocated array for readability. cntx_t** restrict gks_id = gks[ id ]; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_gks_register_cntx(): " ); #endif // Allocate memory for a single context and store the address at // the element in the gks[ id ] array that is reserved for native // execution. gks_id[ BLIS_NAT ] = bli_calloc_intl( sizeof( cntx_t ), &r_val ); // Alias the allocated context address for readability. cntx_t* restrict gks_id_nat = gks_id[ BLIS_NAT ]; // Call the context initialization function on the element of the newly // allocated array corresponding to native execution. 
f( gks_id_nat ); // Verify that cache blocksizes are whole multiples of register blocksizes. // Specifically, verify that: // - MC is a whole multiple of MR. // - NC is a whole multiple of NR. // - KC is a whole multiple of KR. // These constraints are enforced because it makes it easier to handle diagonals // in the macro-kernel implementations. Additionally, we optionally verify that: // - MC is a whole multiple of NR. // - NC is a whole multiple of MR. // These latter constraints, guarded by #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS // below, are only enforced when we wish to be able to handle the trsm right- // side case handling that swaps A and B, so that B is the triangular matrix, // with NR blocking used to pack A and MR blocking used to pack B, with the // arguments to the gemmtrsm microkernel swapped at the last minute, as the // kernel is called. err_t e_val; blksz_t* restrict mc = bli_cntx_get_blksz( BLIS_MC, gks_id_nat ); blksz_t* restrict nc = bli_cntx_get_blksz( BLIS_NC, gks_id_nat ); blksz_t* restrict kc = bli_cntx_get_blksz( BLIS_KC, gks_id_nat ); blksz_t* restrict mr = bli_cntx_get_blksz( BLIS_MR, gks_id_nat ); blksz_t* restrict nr = bli_cntx_get_blksz( BLIS_NR, gks_id_nat ); blksz_t* restrict kr = bli_cntx_get_blksz( BLIS_KR, gks_id_nat ); e_val = bli_check_valid_mc_mod_mult( mc, mr ); bli_check_error_code( e_val ); e_val = bli_check_valid_nc_mod_mult( nc, nr ); bli_check_error_code( e_val ); e_val = bli_check_valid_kc_mod_mult( kc, kr ); bli_check_error_code( e_val ); #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS e_val = bli_check_valid_mc_mod_mult( mc, nr ); bli_check_error_code( e_val ); e_val = bli_check_valid_nc_mod_mult( nc, mr ); bli_check_error_code( e_val ); #endif // Verify that the register blocksizes in the context are sufficiently large // relative to the maximum stack buffer size defined at configure-time. 
e_val = bli_check_sufficient_stack_buf_size( gks_id_nat ); bli_check_error_code( e_val ); } // ----------------------------------------------------------------------------- cntx_t* bli_gks_query_cntx( void ) { return bli_gks_query_nat_cntx(); } cntx_t* bli_gks_query_nat_cntx( void ) { bli_init_once(); // Return the address of the native context for the architecture id // corresponding to the current hardware, as determined by // bli_arch_query_id(). // Query the architecture id. arch_t id = bli_arch_query_id(); // Use the architecture id to look up a pointer to its context. cntx_t* cntx = bli_gks_lookup_nat_cntx( id ); return cntx; } // ----------------------------------------------------------------------------- cntx_t* bli_gks_query_cntx_noinit( void ) { // This function is identical to bli_gks_query_cntx(), except that it // does not call bli_init_once(). // Query the architecture id. arch_t id = bli_arch_query_id(); // Use the architecture id to look up a pointer to its context. cntx_t* cntx = bli_gks_lookup_nat_cntx( id ); return cntx; } // ----------------------------------------------------------------------------- // A mutex to allow synchronous access to the gks when it needs to be updated // with a new entry corresponding to a context for an ind_t value. static bli_pthread_mutex_t gks_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER; cntx_t* bli_gks_query_ind_cntx ( ind_t ind, num_t dt ) { bli_init_once(); cntx_t* gks_id_ind; err_t r_val; // Return the address of a context that will be suited for executing a // level-3 operation via the requested induced method (and datatype) for // the architecture id corresponding to the current hardware, as // determined by bli_arch_query_id(). // This function is called when a level-3 operation via induced method is // called, e.g. bli_gemm1m(). If this is the first time that induced method // is being executed since bli_gks_init(), the necessary context structure // is allocated and initialized. 
If this is not the first time, then the // address of a previously-allocated and initialized (cached) context is // returned. Note that much of this must be done with mutual exclusion to // ensure thread safety and deterministic behavior. // Query the architecture id. arch_t id = bli_arch_query_id(); // Sanity check: verify that the arch_t id is valid. if ( bli_error_checking_is_enabled() ) { err_t e_val = bli_check_valid_arch_id( id ); bli_check_error_code( e_val ); } // NOTE: These initial statements can reside outside of the critical section // because gks[ id ] should have already been allocated, and the native // context in that array should have already been allocated/initialized. // Query the gks for the array of context pointers corresponding to the // given architecture id. cntx_t** restrict gks_id = gks[ id ]; cntx_t* restrict gks_id_nat = gks_id[ BLIS_NAT ]; // If for some reason the native context was requested, we can return // its address early. if ( ind == BLIS_NAT ) return gks_id_nat; // This function assumes that the architecture idenified by id has // already been registered with the gks (which guarantees that // gks[ id ] is non-NULL and gks[ id ][ BLIS_NAT ] is also non-NULL // and refers to a context initialized with valid data). // Acquire the mutex protecting the gks. bli_pthread_mutex_lock( &gks_mutex ); // BEGIN CRITICAL SECTION { // Alias for readability the element of gks_id associated with the // requested induced method. gks_id_ind = gks_id[ ind ]; // If the context pointer is NULL, then we know we must allocate and // then initialize the context before returning its address. if ( gks_id_ind == NULL ) { // If gks_id_ind is NULL, then we know we must allocate and then // initialize the context, storing its address back to // gks_id[ ind ]. 
gks_id_ind = bli_calloc_intl( sizeof( cntx_t ), &r_val ); gks_id[ ind ] = gks_id_ind; // Before we can call the induced method context initialization // function on the newly allocated structure, we must first copy // over the contents of the native context. *gks_id_ind = *gks_id_nat; // Use the architecture id to look up the function pointer to the // context initialization function for induced methods. ind_cntx_init_ft f = cntx_ind_init[ id ]; // Now we modify the context (so that it contains the proper values // for its induced method) by calling the context initialization // function for the current induced method. (That function assumes // that the context is pre- initialized with values for native // execution.) f( ind, gks_id_ind ); } } // END CRITICAL SECTION // Release the mutex protecting the gks. bli_pthread_mutex_unlock( &gks_mutex ); // Return the address of the newly-allocated/initialized context. return gks_id_ind; } // ----------------------------------------------------------------------------- void bli_gks_init_ref_cntx ( cntx_t* cntx ) { // Query the architecture id. arch_t id = bli_arch_query_id(); // Sanity check: verify that the arch_t id is valid. if ( bli_error_checking_is_enabled() ) { err_t e_val = bli_check_valid_arch_id( id ); bli_check_error_code( e_val ); } // Obtain the function pointer to the context initialization function for // reference kernels. ref_cntx_init_ft f = cntx_ref_init[ id ]; // Initialize the caller's context with reference kernels and related values. f( cntx ); } // ----------------------------------------------------------------------------- bool bli_gks_cntx_l3_nat_ukr_is_ref ( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { cntx_t ref_cntx; // Initialize a context with reference kernels for the arch_t id queried // via bli_arch_query_id(). bli_gks_init_ref_cntx( &ref_cntx ); // Query each context for the micro-kernel function pointer for the // specified datatype. 
void_fp ref_fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr_id, &ref_cntx ); void_fp fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr_id, cntx ); // Return the result. return fp == ref_fp; } // // -- level-3 micro-kernel implementation strings ------------------------------ // static char* bli_gks_l3_ukr_impl_str[BLIS_NUM_UKR_IMPL_TYPES] = { "refrnce", "virtual", "optimzd", "notappl", }; // ----------------------------------------------------------------------------- char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ) { kimpl_t ki; // Query the context for the current induced method and datatype, and // then query the ukernel function pointer for the given datatype from // that context. cntx_t* cntx = bli_gks_query_ind_cntx( method, dt ); void_fp fp = bli_cntx_get_l3_vir_ukr_dt( dt, ukr, cntx ); // Check whether the ukernel function pointer is NULL for the given // datatype. If it is NULL, return the string for not applicable. // Otherwise, query the ukernel implementation type using the method // provided and return the associated string. if ( fp == NULL ) ki = BLIS_NOTAPPLIC_UKERNEL; else ki = bli_gks_l3_ukr_impl_type( ukr, method, dt ); return bli_gks_l3_ukr_impl_str[ ki ]; } #if 0 char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ) { opid_t oper; ind_t method; kimpl_t ki; // We need to decide which operation we will use to query the // current available induced method. If the ukr type given is // BLIS_GEMM_UKR, we use gemm. Otherwise, we use trsm (since // the four other defined ukr types are trsm-related). if ( ukr == BLIS_GEMM_UKR ) oper = BLIS_GEMM; else oper = BLIS_TRSM; // Query the current available induced method using the // chosen operation id type. method = bli_l3_ind_oper_find_avail( oper, dt ); // Query the ukernel implementation type using the current // available method. 
ki = bli_gks_l3_ukr_impl_type( ukr, method, dt ); return bli_ukr_impl_str[ ki ]; } #endif kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ) { // If the current available induced method is not native, it // must be virtual. if ( method != BLIS_NAT ) return BLIS_VIRTUAL_UKERNEL; else { // If the current available induced method for the gemm // operation is native, then it might be reference or // optimized. To determine which, we compare the // datatype-specific function pointer within the ukrs // object corresponding to the current available induced // method to the typed function pointer within the known // reference ukrs object. cntx_t ref_cntx_l; // Query the architecture id. arch_t id = bli_arch_query_id(); // Sanity check: verify that the arch_t id is valid. if ( bli_error_checking_is_enabled() ) { err_t e_val = bli_check_valid_arch_id( id ); bli_check_error_code( e_val ); } // Obtain the function pointer to the context initialization function // for reference kernels. ref_cntx_init_ft f = cntx_ref_init[ id ]; // Initialize a local context with reference kernels and related values. f( &ref_cntx_l ); // Query the native context from the gks. cntx_t* nat_cntx = bli_gks_lookup_nat_cntx( id ); // Query the native ukernel func_t from both the native and reference // contexts. void_fp nat_fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr, nat_cntx ); void_fp ref_fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr, &ref_cntx_l ); if ( nat_fp == ref_fp ) return BLIS_REFERENCE_UKERNEL; else return BLIS_OPTIMIZED_UKERNEL; } } cython-blis-0.9.1/blis/_src/frame/base/bli_gks.h000066400000000000000000000052011427272030600214230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif cython-blis-0.9.1/blis/_src/frame/base/bli_ind.c000066400000000000000000000136541427272030600214170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" static char* bli_ind_impl_str[BLIS_NUM_IND_METHODS] = { /* 1m */ "1m", /* nat */ "native", }; // ----------------------------------------------------------------------------- void bli_ind_init( void ) { // NOTE: Instead of calling bli_gks_query_cntx(), we call // bli_gks_query_cntx_noinit() to avoid the call to bli_init_once(). cntx_t* cntx = bli_gks_query_cntx_noinit(); // For each precision, enable the default induced method (1m) if both of // the following conditions are met: // - the complex domain kernel is the (unoptimized) reference kernel // - the real domain kernel is NOT the (unoptimized) reference kernel // The second condition means that BLIS will not bother to use an induced // method if both the real and complex domain kernels are reference. 
bool s_is_ref = bli_gks_cntx_l3_nat_ukr_is_ref( BLIS_FLOAT, BLIS_GEMM_UKR, cntx ); bool d_is_ref = bli_gks_cntx_l3_nat_ukr_is_ref( BLIS_DOUBLE, BLIS_GEMM_UKR, cntx ); bool c_is_ref = bli_gks_cntx_l3_nat_ukr_is_ref( BLIS_SCOMPLEX, BLIS_GEMM_UKR, cntx ); bool z_is_ref = bli_gks_cntx_l3_nat_ukr_is_ref( BLIS_DCOMPLEX, BLIS_GEMM_UKR, cntx ); if ( c_is_ref && !s_is_ref ) bli_ind_enable_dt( BLIS_1M, BLIS_SCOMPLEX ); if ( z_is_ref && !d_is_ref ) bli_ind_enable_dt( BLIS_1M, BLIS_DCOMPLEX ); } void bli_ind_finalize( void ) { } // ----------------------------------------------------------------------------- void bli_ind_enable( ind_t method ) { bli_ind_enable_dt( method, BLIS_SCOMPLEX ); bli_ind_enable_dt( method, BLIS_DCOMPLEX ); } void bli_ind_disable( ind_t method ) { bli_ind_disable_dt( method, BLIS_SCOMPLEX ); bli_ind_disable_dt( method, BLIS_DCOMPLEX ); } void bli_ind_disable_all( void ) { bli_ind_disable_all_dt( BLIS_SCOMPLEX ); bli_ind_disable_all_dt( BLIS_DCOMPLEX ); } // ----------------------------------------------------------------------------- void bli_ind_enable_dt( ind_t method, num_t dt ) { if ( !bli_is_complex( dt ) ) return; bli_l3_ind_set_enable_dt( method, dt, TRUE ); } void bli_ind_disable_dt( ind_t method, num_t dt ) { if ( !bli_is_complex( dt ) ) return; bli_l3_ind_set_enable_dt( method, dt, FALSE ); } void bli_ind_disable_all_dt( num_t dt ) { ind_t im; for ( im = 0; im < BLIS_NUM_IND_METHODS; ++im ) { // Never disable native execution. if ( im != BLIS_NAT ) bli_ind_disable_dt( im, dt ); } } // ----------------------------------------------------------------------------- void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ) { if ( !bli_is_complex( dt ) ) return; if ( bli_opid_is_level3( oper ) ) { bli_l3_ind_oper_enable_only( oper, method, dt ); } else { // Other operations are not implemented, so requests to enable // them for any given induced method are currently no-ops. 
; } } // ----------------------------------------------------------------------------- bool bli_ind_oper_is_impl( opid_t oper, ind_t method ) { bool is_impl = FALSE; if ( bli_opid_is_level3( oper ) ) { // Look up whether the operation is implemented for the given induced // method id. is_impl = bli_l3_ind_oper_is_impl( oper, method ); } else { // All other operations should be reported as not implemented, // unless the requested check was for BLIS_NAT, in which case // all operations are implemented. if ( method == BLIS_NAT ) is_impl = TRUE; else is_impl = FALSE; } return is_impl; } ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ) { ind_t method; if ( bli_opid_is_level3( oper ) ) { method = bli_l3_ind_oper_find_avail( oper, dt ); } else { // Currently, any operation that is not level-3 is guaranteed // to be native. method = BLIS_NAT; } return method; } char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ) { ind_t method = bli_ind_oper_find_avail( oper, dt ); return bli_ind_get_impl_string( method ); } // ----------------------------------------------------------------------------- char* bli_ind_get_impl_string( ind_t method ) { return bli_ind_impl_str[ method ]; } num_t bli_ind_map_cdt_to_index( num_t dt ) { // A non-complex datatype should never be passed in. if ( !bli_is_complex( dt ) ) bli_abort(); // Map the complex datatype to a zero-based index. if ( bli_is_scomplex( dt ) ) return 0; else /* if ( bli_is_dcomplex( dt ) ) */ return 1; } cython-blis-0.9.1/blis/_src/frame/base/bli_ind.h000066400000000000000000000051221427272030600214130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_IND_H
#define BLIS_IND_H

// level-3 induced method management
#include "bli_l3_ind.h"

// Sub-API lifecycle. In bli_ind.c, bli_ind_init() enables the 1m method for a
// complex precision when that precision's native gemm microkernel is the
// reference kernel while the matching real-domain one is not;
// bli_ind_finalize() has an empty body.
void bli_ind_init( void );
void bli_ind_finalize( void );

// Enable/disable an induced method for both BLIS_SCOMPLEX and BLIS_DCOMPLEX.
// bli_ind_disable_all() leaves native execution (BLIS_NAT) untouched.
BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method );
BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method );
BLIS_EXPORT_BLIS void bli_ind_disable_all( void );

// Per-datatype variants of the above. A non-complex dt is silently ignored.
BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t method, num_t dt );
BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt );
BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt );

// Acts only when dt is complex and oper is a level-3 operation; otherwise a
// no-op.
BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt );

// Queries. For operations that are not level-3, bli_ind_oper_is_impl()
// reports TRUE only for BLIS_NAT and bli_ind_oper_find_avail() returns
// BLIS_NAT.
BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method );
BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt );
BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt );

// Returns the label stored for the method ("1m" or "native" in bli_ind.c).
// The method id is used as an array index without a range check.
char* bli_ind_get_impl_string( ind_t method );

// Returns 0 for a single-precision complex dt and 1 otherwise; calls
// bli_abort() when dt is not complex.
num_t bli_ind_map_cdt_to_index( num_t dt );

#endif

cython-blis-0.9.1/blis/_src/frame/base/bli_info.c000066400000000000000000000170501427272030600215720ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // -- General library information ---------------------------------------------- // This string gets defined via -D on the command line when BLIS is compiled. // This string is (or rather, should be) only used here. static char* bli_version_str = BLIS_VERSION_STRING; static char* bli_int_type_size_str = STRINGIFY_INT( BLIS_INT_TYPE_SIZE ); char* bli_info_get_version_str( void ) { return bli_version_str; } char* bli_info_get_int_type_size_str( void ) { return bli_int_type_size_str; } // -- General configuration-related -------------------------------------------- gint_t bli_info_get_int_type_size( void ) { return BLIS_INT_TYPE_SIZE; } gint_t bli_info_get_num_fp_types( void ) { return BLIS_NUM_FP_TYPES; } gint_t bli_info_get_max_type_size( void ) { return BLIS_MAX_TYPE_SIZE; } gint_t bli_info_get_page_size( void ) { return BLIS_PAGE_SIZE; } gint_t bli_info_get_simd_num_registers( void ) { return BLIS_SIMD_MAX_NUM_REGISTERS; } gint_t bli_info_get_simd_size( void ) { return BLIS_SIMD_MAX_SIZE; } gint_t bli_info_get_simd_align_size( void ) { return BLIS_SIMD_ALIGN_SIZE; } gint_t bli_info_get_stack_buf_max_size( void ) { return BLIS_STACK_BUF_MAX_SIZE; } gint_t bli_info_get_stack_buf_align_size( void ) { 
return BLIS_STACK_BUF_ALIGN_SIZE; } gint_t bli_info_get_heap_addr_align_size( void ) { return BLIS_HEAP_ADDR_ALIGN_SIZE; } gint_t bli_info_get_heap_stride_align_size( void ) { return BLIS_HEAP_STRIDE_ALIGN_SIZE; } gint_t bli_info_get_pool_addr_align_size_a( void ) { return BLIS_POOL_ADDR_ALIGN_SIZE_A; } gint_t bli_info_get_pool_addr_align_size_b( void ) { return BLIS_POOL_ADDR_ALIGN_SIZE_B; } gint_t bli_info_get_pool_addr_align_size_c( void ) { return BLIS_POOL_ADDR_ALIGN_SIZE_C; } gint_t bli_info_get_pool_addr_align_size_gen( void ) { return BLIS_POOL_ADDR_ALIGN_SIZE_GEN; } gint_t bli_info_get_pool_addr_offset_size_a( void ) { return BLIS_POOL_ADDR_OFFSET_SIZE_A; } gint_t bli_info_get_pool_addr_offset_size_b( void ) { return BLIS_POOL_ADDR_OFFSET_SIZE_B; } gint_t bli_info_get_pool_addr_offset_size_c( void ) { return BLIS_POOL_ADDR_OFFSET_SIZE_C; } gint_t bli_info_get_pool_addr_offset_size_gen( void ) { return BLIS_POOL_ADDR_OFFSET_SIZE_GEN; } gint_t bli_info_get_enable_blas( void ) { #ifdef BLIS_ENABLE_BLAS return 1; #else return 0; #endif } gint_t bli_info_get_enable_cblas( void ) { #ifdef BLIS_ENABLE_CBLAS return 1; #else return 0; #endif } gint_t bli_info_get_blas_int_type_size( void ) { return BLIS_BLAS_INT_TYPE_SIZE; } gint_t bli_info_get_enable_pba_pools( void ) { #ifdef BLIS_ENABLE_PBA_POOLS return 1; #else return 0; #endif } gint_t bli_info_get_enable_sba_pools( void ) { #ifdef BLIS_ENABLE_SBA_POOLS return 1; #else return 0; #endif } gint_t bli_info_get_enable_threading( void ) { if ( bli_info_get_enable_openmp() || bli_info_get_enable_pthreads() ) return 1; else return 0; } gint_t bli_info_get_enable_openmp( void ) { #ifdef BLIS_ENABLE_OPENMP return 1; #else return 0; #endif } gint_t bli_info_get_enable_pthreads( void ) { #ifdef BLIS_ENABLE_PTHREADS return 1; #else return 0; #endif } gint_t bli_info_get_thread_part_jrir_slab( void ) { #ifdef BLIS_ENABLE_JRIR_SLAB return 1; #else return 0; #endif } gint_t bli_info_get_thread_part_jrir_rr( void ) { #ifdef 
BLIS_ENABLE_JRIR_RR return 1; #else return 0; #endif } gint_t bli_info_get_enable_memkind( void ) { #ifdef BLIS_ENABLE_MEMKIND return 1; #else return 0; #endif } gint_t bli_info_get_enable_sandbox( void ) { #ifdef BLIS_ENABLE_SANDBOX return 1; #else return 0; #endif } // -- Kernel implementation-related -------------------------------------------- // -- Level-3 kernel definitions -- char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ) { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_GEMM_UKR, method, dt ); } char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ) { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_GEMMTRSM_L_UKR, method, dt ); } char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ) { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_GEMMTRSM_U_UKR, method, dt ); } char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ) { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_TRSM_L_UKR, method, dt ); } char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ) { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_TRSM_U_UKR, method, dt ); } // -- BLIS implementation query (level-3) -------------------------------------- char* bli_info_get_gemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMM, dt ); } char* bli_info_get_gemmt_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } char* bli_info_get_hemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HEMM, dt ); } char* bli_info_get_herk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } char* bli_info_get_symm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYMM, dt ); } char* bli_info_get_syrk_impl_string( num_t dt ) { return 
bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } char* bli_info_get_trmm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM, dt ); } char* bli_info_get_trmm3_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM3, dt ); } char* bli_info_get_trsm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRSM, dt ); } cython-blis-0.9.1/blis/_src/frame/base/bli_info.h000066400000000000000000000125171427272030600216020ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/


// -- General library information ----------------------------------------------

// Version string, and the STRINGIFY_INT() form of BLIS_INT_TYPE_SIZE.
BLIS_EXPORT_BLIS char* bli_info_get_version_str( void );
BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void );


// -- General configuration-related --------------------------------------------

// In bli_info.c each of these getters returns the value of the
// correspondingly named BLIS_* configuration macro.
BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void );
// NOTE(review): bli_info.c in this release does not appear to define
// bli_info_get_enable_stay_auto_init() -- confirm whether this prototype is
// stale before calling it.
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void );
// Feature switches: in bli_info.c these return 1 when the matching
// BLIS_ENABLE_* macro is defined and 0 otherwise. The exceptions are
// bli_info_get_blas_int_type_size(), which returns a macro value, and
// bli_info_get_enable_threading(), which is 1 when either the OpenMP or the
// pthreads switch is 1.
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void );


// -- Kernel implementation-related --------------------------------------------


// -- Level-3 kernel definitions --

// In bli_info.c each of these calls bli_init_once() and then returns the
// result of bli_gks_l3_ukr_impl_string() for the named microkernel.
BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt );


// -- BLIS implementation query (level-3) --------------------------------------

// In bli_info.c each of these returns
// bli_ind_oper_get_avail_impl_string( <operation id>, dt ). The herk, her2k,
// syrk and syr2k getters pass BLIS_GEMMT as the operation id.
BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm3_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt );

cython-blis-0.9.1/blis/_src/frame/base/bli_init.c000066400000000000000000000105341427272030600216020ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "blis.h"

// -----------------------------------------------------------------------------

// Public entry point: initialize the library. Delegates to bli_init_once().
void bli_init( void )
{
	bli_init_once();
}

// Public entry point: finalize the library. Delegates to
// bli_finalize_once().
void bli_finalize( void )
{
	bli_finalize_once();
}

// -----------------------------------------------------------------------------

// Automatic initialization, used when initializing the BLAS compatibility
// layer. Delegates to bli_init_once().
void bli_init_auto( void )
{
	bli_init_once();
}

// Automatic finalization. Intentionally does nothing; see the comment in the
// body.
void bli_finalize_auto( void )
{
	// The _auto() functions are used when initializing the BLAS compatibility
	// layer. It would not make much sense to automatically initialize and
	// finalize for every BLAS routine call; therefore, we remain initialized
	// unless and until the application explicitly calls bli_finalize().
}

// -----------------------------------------------------------------------------

// A pthread_once_t variable is a pthread structure used in pthread_once().
// pthread_once() is guaranteed to execute exactly once among all threads that
// pass in this control object (until/unless the variable is reset).
static bli_pthread_once_t once_init     = BLIS_PTHREAD_ONCE_INIT;
static bli_pthread_once_t once_finalize = BLIS_PTHREAD_ONCE_INIT;

// Run bli_init_apis() through the once_init control object.
void bli_init_once( void )
{
	bli_pthread_once( &once_init, bli_init_apis );
}

// Run bli_finalize_apis() through the once_finalize control object.
void bli_finalize_once( void )
{
	bli_pthread_once( &once_finalize, bli_finalize_apis );
}

// -----------------------------------------------------------------------------

// Initialize every sub-API, then re-arm the finalize control object so that
// a subsequent bli_finalize_once() will run bli_finalize_apis().
void bli_init_apis( void )
{
	// Initialize various sub-APIs.
	// bli_finalize_apis() tears these down in the reverse of this order.
	bli_gks_init();
	bli_ind_init();
	bli_thread_init();
	bli_pack_init();
	bli_memsys_init();

	// Reset the control variable that will allow finalization.
	// NOTE: We must initialize a fresh pthread_once_t object and THEN copy the
	// contents to the static control variable because some implementations of
	// pthreads define pthread_once_t as a struct and BLIS_PTHREAD_ONCE_INIT as
	// a struct initializer expression (i.e. { ... }), which cannot be used in
	// post-declaration struct assignment in strict C99.
	const bli_pthread_once_t once_new = BLIS_PTHREAD_ONCE_INIT;
	once_finalize = once_new;
}

// Finalize every sub-API in the reverse of the order used by
// bli_init_apis(), then re-arm the init control object so that the library
// can be initialized again.
void bli_finalize_apis( void )
{
	// Finalize various sub-APIs.
	bli_memsys_finalize();
	bli_pack_finalize();
	bli_thread_finalize();
	bli_ind_finalize();
	bli_gks_finalize();

	// Reset the control variable that will allow (re-)initialization.
	// NOTE: We must initialize a fresh pthread_once_t object and THEN copy the
	// contents to the static control variable because some implementations of
	// pthreads define pthread_once_t as a struct and BLIS_PTHREAD_ONCE_INIT as
	// a struct initializer expression (i.e. { ... }), which cannot be used in
	// post-declaration struct assignment in strict C99.
	const bli_pthread_once_t once_new = BLIS_PTHREAD_ONCE_INIT;
	once_init = once_new;
}

cython-blis-0.9.1/blis/_src/frame/base/bli_init.h000066400000000000000000000036251427272030600216120ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// Public initialization/finalization entry points. In bli_init.c they
// delegate to bli_init_once() and bli_finalize_once() respectively.
BLIS_EXPORT_BLIS void bli_init( void );
BLIS_EXPORT_BLIS void bli_finalize( void );

// Variants used when initializing the BLAS compatibility layer. In
// bli_init.c, bli_init_auto() calls bli_init_once() and bli_finalize_auto()
// has an empty body, so the library stays initialized until bli_finalize()
// is called explicitly.
void bli_init_auto( void );
void bli_finalize_auto( void );

// Initialize / finalize the sub-APIs (gks, ind, thread, pack, memsys). Each
// one also resets the control object that gates the opposite operation.
void bli_init_apis( void );
void bli_finalize_apis( void );

// Run bli_init_apis() / bli_finalize_apis() through bli_pthread_once().
void bli_init_once( void );
void bli_finalize_once( void );

cython-blis-0.9.1/blis/_src/frame/base/bli_machval.c000066400000000000000000000067351427272030600222620ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

// Type of the per-datatype implementations generated at the bottom of this
// file: each takes the parameter id and an untyped output buffer.
#define FUNCPTR_T machval_fp

typedef void (*FUNCPTR_T)( machval_t mval, void* v );

// Table of implementations, indexed by datatype in bli_machval() below.
static FUNCPTR_T GENARRAY(ftypes,machval);


//
// Define object-based interface.
//

// Write the machine parameter identified by mval into the object v,
// dispatching on the datatype of v.
void bli_machval( machval_t mval, obj_t* v )
{
	num_t     dt_v  = bli_obj_dt( v );

	void*     buf_v = bli_obj_buffer_at_off( v );

	FUNCPTR_T f;

	// Index into the function pointer array.
	f = ftypes[dt_v];

	// Invoke the function.
	f( mval, buf_v );
}


//
// Define BLAS-like interfaces.
//

// Generates one typed implementation per datatype. Each instance computes
// its table of parameter values on the first call, caches it in function-local
// statics, and on every call copies the requested entry to *v.
// NOTE(review): the first_time flag and the pvals cache are plain statics
// with no synchronization visible in this file -- confirm that concurrent
// first calls are not possible or are otherwise harmless.
#undef  GENTFUNCR
#define GENTFUNCR( ctype_v, ctype_vr, chv, chvr, opname, varname ) \
\
void PASTEMAC(chv,opname) \
     ( \
       machval_t mval, \
       void*     v     \
     ) \
{ \
	static ctype_vr pvals[ BLIS_NUM_MACH_PARAMS ]; \
\
	static bool     first_time = TRUE; \
\
	/* Zero-based position of the requested parameter within pvals. */ \
	dim_t           val_i      = mval - BLIS_MACH_PARAM_FIRST; \
	ctype_v*        v_cast     = v; \
\
	/* If this is the first time through, call the underlying code to discover each machine parameter. */ \
	if ( first_time ) \
	{ \
		char  lapack_mval; \
		dim_t m, i; \
\
		/* All entries except the last are queried one at a time; the loop leaves i at the index of the last entry. */ \
		for( i = 0, m = BLIS_MACH_PARAM_FIRST; \
		     i < BLIS_NUM_MACH_PARAMS - 1; \
		     ++i, ++m ) \
		{ \
			bli_param_map_blis_to_netlib_machval( m, &lapack_mval ); \
\
			/*printf( "bli_machval: querying %u %c\n", m, lapack_mval );*/ \
\
			pvals[i] = PASTEMAC(chvr,varname)( &lapack_mval, 1 ); \
\
			/*printf( "bli_machval: got back %34.29e\n", pvals[i] ); */ \
		} \
\
		/* Store epsilon^2 in the last element. */ \
		pvals[i] = pvals[0] * pvals[0]; \
\
		first_time = FALSE; \
	} \
\
	/* Copy the requested parameter value to the output buffer, which may involve a demotion from the complex to real domain. */ \
	/* NOTE(review): the source here is ctype_vr and the destination ctype_v, so this looks like a promotion rather than a demotion -- confirm. */ \
	PASTEMAC2(chvr,chv,copys)( pvals[ val_i ], *v_cast ); \
}

INSERT_GENTFUNCR_BASIC( machval, lamch )

cython-blis-0.9.1/blis/_src/frame/base/bli_machval.h000066400000000000000000000041131427272030600222530ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// NOTE(review): presumably these headers declare the helper routines that
// the implementations in bli_machval.c rely on -- confirm.
#include "bli_lsame.h"
#include "bli_slamch.h"
#include "bli_dlamch.h"


//
// Prototype object-based interface.
//

// Writes the machine parameter identified by mval into the object v; the
// implementation in bli_machval.c dispatches on the datatype of v.
BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v );


//
// Prototype BLAS-like interfaces.
//

// Declares one typed variant per instantiation; each takes the parameter id
// and an untyped pointer to the output value.
#undef  GENTPROTR
#define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \
     ( \
       machval_t mval, \
       void*     v     \
     );

INSERT_GENTPROTR_BASIC0( machval )

cython-blis-0.9.1/blis/_src/frame/base/bli_malloc.c000066400000000000000000000167611427272030600221160ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" //#define BLIS_ENABLE_MEM_TRACING // ----------------------------------------------------------------------------- // NOTE: These functions are no longer used. Instead, the relevant sections // of code call bli_fmalloc_align() and pass in the desired malloc()-like // function, such as BLIS_MALLOC_POOL. #if 0 void* bli_malloc_pool( size_t size ) { const malloc_ft malloc_fp = BLIS_MALLOC_POOL; const size_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_malloc_pool(): size %ld, align size %ld\n", ( long )size, ( long )align_size ); fflush( stdout ); #endif return bli_fmalloc_align( malloc_fp, size, align_size ); } void bli_free_pool( void* p ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_free_pool(): freeing block\n" ); fflush( stdout ); #endif bli_ffree_align( BLIS_FREE_POOL, p ); } #endif // ----------------------------------------------------------------------------- void* bli_malloc_user( size_t size, err_t* r_val ) { const malloc_ft malloc_fp = BLIS_MALLOC_USER; const size_t align_size = BLIS_HEAP_ADDR_ALIGN_SIZE; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_malloc_user(): size %ld, align size %ld\n", ( long )size, ( long )align_size ); fflush( stdout ); #endif void* p = bli_fmalloc_align( malloc_fp, size, align_size, r_val ); return p; } void bli_free_user( void* p ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_free_user(): freeing block\n" ); fflush( stdout ); #endif bli_ffree_align( 
BLIS_FREE_USER, p ); } // ----------------------------------------------------------------------------- void* bli_malloc_intl( size_t size, err_t* r_val ) { const malloc_ft malloc_fp = BLIS_MALLOC_INTL; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_malloc_intl(): size %ld\n", ( long )size ); fflush( stdout ); #endif void* p = bli_fmalloc_noalign( malloc_fp, size, r_val ); return p; } void* bli_calloc_intl( size_t size, err_t* r_val ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_calloc_intl(): " ); #endif void* p = bli_malloc_intl( size, r_val ); if ( bli_is_success( *r_val ) ) memset( p, 0, size ); return p; } void bli_free_intl( void* p ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_free_intl(): freeing block\n" ); fflush( stdout ); #endif bli_ffree_noalign( BLIS_FREE_INTL, p ); } // ----------------------------------------------------------------------------- void* bli_fmalloc_align ( malloc_ft f, size_t size, size_t align_size, err_t* r_val ) { const size_t ptr_size = sizeof( void* ); size_t align_offset = 0; void* p_orig; int8_t* p_byte; void** p_addr; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_fmalloc_align_check( f, size, align_size ); // Return early if zero bytes were requested. if ( size == 0 ) return NULL; // Add the alignment size and the size of a pointer to the number // of bytes to allocate. size += align_size + ptr_size; // Call the allocation function. p_orig = f( size ); // Check the pointer returned by malloc(). if ( bli_error_checking_is_enabled() ) bli_fmalloc_post_check( p_orig ); // The pseudo-return value isn't used yet. *r_val = BLIS_SUCCESS; // Advance the pointer by one pointer element. p_byte = p_orig; p_byte += ptr_size; // Compute the offset to the desired alignment. 
if ( bli_is_unaligned_to( ( siz_t )p_byte, ( siz_t )align_size ) ) { align_offset = align_size - bli_offset_past_alignment( ( siz_t )p_byte, ( siz_t )align_size ); } // Advance the pointer using the difference between the alignment // size and the alignment offset. p_byte += align_offset; // Compute the address of the pointer element just before the start // of the aligned address, and store the original address there. p_addr = ( void** )(p_byte - ptr_size); *p_addr = p_orig; // Return the aligned pointer. return p_byte; } void bli_ffree_align ( free_ft f, void* p ) { const size_t ptr_size = sizeof( void* ); void* p_orig; int8_t* p_byte; void** p_addr; // If the pointer to free is NULL, it was obviously not aligned and // does not need to be freed. if ( p == NULL ) return; // Since the bli_fmalloc_align() function returned the aligned pointer, // we have to first recover the original pointer before we can free the // memory. // Start by casting the pointer to a byte pointer. p_byte = p; // Compute the address of the pointer element just before the start // of the aligned address, and recover the original address. p_addr = ( void** )( p_byte - ptr_size ); p_orig = *p_addr; // Free the original pointer. f( p_orig ); } // ----------------------------------------------------------------------------- void* bli_fmalloc_noalign ( malloc_ft f, size_t size, err_t* r_val ) { void* p = f( size ); // Check the pointer returned by malloc(). if ( bli_error_checking_is_enabled() ) bli_fmalloc_post_check( p ); // The pseudo-return value isn't used yet. *r_val = BLIS_SUCCESS; return p; } void bli_ffree_noalign ( free_ft f, void* p ) { f( p ); } // ----------------------------------------------------------------------------- void bli_fmalloc_align_check ( malloc_ft f, size_t size, size_t align_size ) { err_t e_val; // Check for valid alignment. 
e_val = bli_check_alignment_is_power_of_two( align_size ); bli_check_error_code( e_val ); e_val = bli_check_alignment_is_mult_of_ptr_size( align_size ); bli_check_error_code( e_val ); } void bli_fmalloc_post_check ( void* p ) { err_t e_val; // Check for valid values from malloc(). e_val = bli_check_valid_malloc_buf( p ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/base/bli_malloc.h000066400000000000000000000053641427272030600221200ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Typedef function pointer types for malloc() and free() substitutes. //typedef void* (*malloc_ft) ( size_t size ); //typedef void (*free_ft) ( void* p ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size ); BLIS_EXPORT_BLIS void bli_free_pool( void* p ); #endif void* bli_malloc_intl( size_t size, err_t* r_val ); void* bli_calloc_intl( size_t size, err_t* r_val ); void bli_free_intl( void* p ); BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val ); BLIS_EXPORT_BLIS void bli_free_user( void* p ); // ----------------------------------------------------------------------------- void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val ); void bli_ffree_align( free_ft f, void* p ); void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val ); void bli_ffree_noalign( free_ft f, void* p ); void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size ); void bli_fmalloc_post_check( void* p ); cython-blis-0.9.1/blis/_src/frame/base/bli_mbool.c000066400000000000000000000044531427272030600217520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" mbool_t* bli_mbool_create ( bool b_s, bool b_d, bool b_c, bool b_z ) { mbool_t* b; err_t r_val; b = ( mbool_t* ) bli_malloc_intl( sizeof( mbool_t ), &r_val ); bli_mbool_init ( b, b_s, b_d, b_c, b_z ); return b; } void bli_mbool_init ( mbool_t* b, bool b_s, bool b_d, bool b_c, bool b_z ) { bli_mbool_set_dt( b_s, BLIS_FLOAT, b ); bli_mbool_set_dt( b_d, BLIS_DOUBLE, b ); bli_mbool_set_dt( b_c, BLIS_SCOMPLEX, b ); bli_mbool_set_dt( b_z, BLIS_DCOMPLEX, b ); } void bli_mbool_free( mbool_t* b ) { bli_free_intl( b ); } cython-blis-0.9.1/blis/_src/frame/base/bli_mbool.h000066400000000000000000000044621427272030600217570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // ----------------------------------------------------------------------------- mbool_t* bli_mbool_create ( bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_init ( mbool_t* b, bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_free( mbool_t* b ); cython-blis-0.9.1/blis/_src/frame/base/bli_mem.h000066400000000000000000000102521427272030600214170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_MEM_H #define BLIS_MEM_H // mem_t object type (defined in bli_type_defs.h) /* typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; typedef struct { void* buf; siz_t block_size; } pblk_t; */ // // -- mem_t query -------------------------------------------------------------- // BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) { return &(mem->pblk); } BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) { return bli_pblk_buf( bli_mem_pblk( mem ) ); } BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) { return mem->buf_type; } BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) { return mem->pool; } BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) { return mem->size; } BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); } // // -- mem_t modification ------------------------------------------------------- // BLIS_INLINE void bli_mem_set_pblk( 
pblk_t* pblk, mem_t* mem ) { mem->pblk = *pblk; } BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem ) { bli_pblk_set_buf( buf, &(mem->pblk) ); } BLIS_INLINE void bli_mem_set_buf_type( packbuf_t buf_type, mem_t* mem ) { mem->buf_type = buf_type; } BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem ) { mem->pool = pool; } BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; } // // -- mem_t initialization ----------------------------------------------------- // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the mem_t type definition. An alternative to the initializer is // calling bli_mem_clear() at runtime. #define BLIS_MEM_INITIALIZER \ { \ .pblk = BLIS_PBLK_INITIALIZER, \ .buf_type = -1, \ .pool = NULL, \ .size = 0, \ } \ BLIS_INLINE void bli_mem_clear( mem_t* mem ) { bli_mem_set_buffer( NULL, mem ); #ifdef __cplusplus const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE; // When using C++, which is strongly typed, we avoid use of -1 as a // packbuf_t value since it will result in a compile-time error. bli_mem_set_buf_type( pb, mem ); #else bli_mem_set_buf_type( ( packbuf_t )-1, mem ); #endif bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); } #endif cython-blis-0.9.1/blis/_src/frame/base/bli_memsys.c000066400000000000000000000051151427272030600221530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_memsys_init( void ) { // Query a native context so we have something to pass into // bli_pba_init_pools(). We use BLIS_DOUBLE for the datatype, // but the dt argument is actually only used when initializing // contexts for induced methods. // NOTE: Instead of calling bli_gks_query_cntx(), we call // bli_gks_query_cntx_noinit() to avoid the call to bli_init_once(). cntx_t* cntx_p = bli_gks_query_cntx_noinit(); // Initialize the packing block allocator and its data structures. bli_pba_init( cntx_p ); // Initialize the small block allocator and its data structures. bli_sba_init(); } void bli_memsys_finalize( void ) { // Finalize the small block allocator and its data structures. bli_sba_finalize(); // Finalize the packing block allocator and its data structures. 
bli_pba_finalize(); } cython-blis-0.9.1/blis/_src/frame/base/bli_memsys.h000066400000000000000000000037161427272030600221650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_MEMSYS_H #define BLIS_MEMSYS_H // ----------------------------------------------------------------------------- void bli_memsys_init( void ); void bli_memsys_finalize( void ); #endif cython-blis-0.9.1/blis/_src/frame/base/bli_obj.c000066400000000000000000000470621427272030600214170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_obj_create ( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj ) { bli_init_once(); bli_obj_create_without_buffer( dt, m, n, obj ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_obj_create(): " ); #endif bli_obj_alloc_buffer( rs, cs, 1, obj ); } void bli_obj_create_with_attached_buffer ( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_init_once(); bli_obj_create_without_buffer( dt, m, n, obj ); bli_obj_attach_buffer( p, rs, cs, 1, obj ); } void bli_obj_create_without_buffer ( num_t dt, dim_t m, dim_t n, obj_t* obj ) { siz_t elem_size; void* s; bli_init_once(); if ( bli_error_checking_is_enabled() ) bli_obj_create_without_buffer_check( dt, m, n, obj ); // Query the size of one element of the object's pre-set datatype. elem_size = bli_dt_size( dt ); // Set any default properties that are appropriate. bli_obj_set_defaults( obj ); // Set the object root to itself, since obj is not presumed to be a view // into a larger matrix. This is typically the only time this field is // ever set; henceforth, subpartitions and aliases to this object will // get copies of this field, and thus always have access to its // "greatest-grand" parent (ie: the original parent, or "root", object). // However, there ARE a few places where it is convenient to reset the // root field explicitly via bli_obj_set_as_root(). (We do not list // those places here. Just grep for bli_obj_set_as_root within the // top-level 'frame' directory to see them. bli_obj_set_as_root( obj ); // Set individual fields. 
bli_obj_set_buffer( NULL, obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_diag_offset( 0, obj ); bli_obj_set_pack_fn( NULL, obj ); bli_obj_set_pack_params( NULL, obj ); bli_obj_set_ker_fn( NULL, obj ); bli_obj_set_ker_params( NULL, obj ); // Set the internal scalar to 1.0. bli_obj_set_scalar_dt( dt, obj ); s = bli_obj_internal_scalar_buffer( obj ); // Always writing the imaginary component is needed in mixed-domain // scenarios. Failing to do this can lead to reading uninitialized // memory just before calling the macrokernel (as the internal scalars // for A and B are merged). //if ( bli_is_float( dt ) ) { bli_sset1s( *(( float* )s) ); } //else if ( bli_is_double( dt ) ) { bli_dset1s( *(( double* )s) ); } if ( bli_is_float( dt ) ) { bli_cset1s( *(( scomplex* )s) ); } else if ( bli_is_double( dt ) ) { bli_zset1s( *(( dcomplex* )s) ); } else if ( bli_is_scomplex( dt ) ) { bli_cset1s( *(( scomplex* )s) ); } else if ( bli_is_dcomplex( dt ) ) { bli_zset1s( *(( dcomplex* )s) ); } } void bli_obj_alloc_buffer ( inc_t rs, inc_t cs, inc_t is, obj_t* obj ) { dim_t n_elem = 0; dim_t m, n; siz_t elem_size; siz_t buffer_size; void* p; err_t r_val; bli_init_once(); // Query the dimensions of the object we are allocating. m = bli_obj_length( obj ); n = bli_obj_width( obj ); // Query the size of one element. elem_size = bli_obj_elem_size( obj ); // Adjust the strides, if needed, before doing anything else // (particularly, before doing any error checking). bli_adjust_strides( m, n, elem_size, &rs, &cs, &is ); if ( bli_error_checking_is_enabled() ) bli_obj_alloc_buffer_check( rs, cs, is, obj ); // Determine how much object to allocate. if ( m == 0 || n == 0 ) { // For empty objects, set n_elem to zero. Row and column strides // should remain unchanged (because alignment is not needed). 
n_elem = 0; } else { // The number of elements to allocate is given by the distance from // the element with the lowest address (usually {0, 0}) to the element // with the highest address (usually {m-1, n-1}), plus one for the // highest element itself. n_elem = (m-1) * bli_abs( rs ) + (n-1) * bli_abs( cs ) + 1; } // Handle the special case where imaginary stride is larger than // normal. if ( bli_obj_is_complex( obj ) ) { // Notice that adding is/2 works regardless of whether the // imaginary stride is unit, something between unit and // 2*n_elem, or something bigger than 2*n_elem. n_elem = bli_abs( is ) / 2 + n_elem; } // Compute the size of the total buffer to be allocated, which includes // padding if the leading dimension was increased for alignment purposes. buffer_size = ( siz_t )n_elem * elem_size; // Allocate the buffer. p = bli_malloc_user( buffer_size, &r_val ); // Set individual fields. bli_obj_set_buffer( p, obj ); bli_obj_set_strides( rs, cs, obj ); bli_obj_set_imag_stride( is, obj ); } void bli_obj_attach_buffer ( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ) { bli_init_once(); // Interpret is = 0 as a request for the default, which is is = 1; if ( is == 0 ) is = 1; // Check that the strides and lengths are compatible. Note that the // user *must* specify valid row and column strides when attaching an // external buffer. if ( bli_error_checking_is_enabled() ) bli_obj_attach_buffer_check( p, rs, cs, is, obj ); // Update the object. 
bli_obj_set_buffer( p, obj ); bli_obj_set_strides( rs, cs, obj ); bli_obj_set_imag_stride( is, obj ); } void bli_obj_create_1x1 ( num_t dt, obj_t* obj ) { bli_obj_create_without_buffer( dt, 1, 1, obj ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_obj_create_1x1(): " ); #endif bli_obj_alloc_buffer( 1, 1, 1, obj ); } void bli_obj_create_1x1_with_attached_buffer ( num_t dt, void* p, obj_t* obj ) { bli_obj_create_without_buffer( dt, 1, 1, obj ); bli_obj_attach_buffer( p, 1, 1, 1, obj ); } void bli_obj_create_conf_to ( obj_t* s, obj_t* d ) { const num_t dt = bli_obj_dt( s ); const dim_t m = bli_obj_length( s ); const dim_t n = bli_obj_width( s ); const inc_t rs = bli_obj_row_stride( s ); const inc_t cs = bli_obj_col_stride( s ); bli_obj_create( dt, m, n, rs, cs, d ); } void bli_obj_free ( obj_t* obj ) { if ( bli_error_checking_is_enabled() ) bli_obj_free_check( obj ); // Don't dereference obj if it is NULL. if ( obj != NULL ) { // Idiot safety: Don't try to free the buffer field if the object // is a detached scalar (ie: if the buffer pointer refers to the // address of the internal scalar buffer). 
if ( bli_obj_buffer( obj ) != bli_obj_internal_scalar_buffer( obj ) ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_obj_free(): " ); #endif bli_free_user( bli_obj_buffer( obj ) ); } } } #if 0 //void bli_obj_create_const ( double value, obj_t* obj ) { gint_t* temp_i; float* temp_s; double* temp_d; scomplex* temp_c; dcomplex* temp_z; if ( bli_error_checking_is_enabled() ) bli_obj_create_const_check( value, obj ); bli_obj_create( BLIS_CONSTANT, 1, 1, 1, 1, obj ); //temp_s = bli_obj_buffer_for_const( BLIS_FLOAT, obj ); //temp_d = bli_obj_buffer_for_const( BLIS_DOUBLE, obj ); //temp_c = bli_obj_buffer_for_const( BLIS_SCOMPLEX, obj ); //temp_z = bli_obj_buffer_for_const( BLIS_DCOMPLEX, obj ); //temp_i = bli_obj_buffer_for_const( BLIS_INT, obj ); bli_dssets( value, 0.0, *temp_s ); bli_ddsets( value, 0.0, *temp_d ); bli_dcsets( value, 0.0, *temp_c ); bli_dzsets( value, 0.0, *temp_z ); *temp_i = ( gint_t ) value; } //void bli_obj_create_const_copy_of ( obj_t* a, obj_t* b ) { gint_t* temp_i; float* temp_s; double* temp_d; scomplex* temp_c; dcomplex* temp_z; void* buf_a; dcomplex value; if ( bli_error_checking_is_enabled() ) bli_obj_create_const_copy_of_check( a, b ); bli_obj_create( BLIS_CONSTANT, 1, 1, 1, 1, b ); //temp_s = bli_obj_buffer_for_const( BLIS_FLOAT, b ); //temp_d = bli_obj_buffer_for_const( BLIS_DOUBLE, b ); //temp_c = bli_obj_buffer_for_const( BLIS_SCOMPLEX, b ); //temp_z = bli_obj_buffer_for_const( BLIS_DCOMPLEX, b ); //temp_i = bli_obj_buffer_for_const( BLIS_INT, b ); buf_a = bli_obj_buffer_at_off( a ); bli_zzsets( 0.0, 0.0, value ); if ( bli_obj_is_float( a ) ) { bli_szcopys( *(( float* )buf_a), value ); } else if ( bli_obj_is_double( a ) ) { bli_dzcopys( *(( double* )buf_a), value ); } else if ( bli_obj_is_scomplex( a ) ) { bli_czcopys( *(( scomplex* )buf_a), value ); } else if ( bli_obj_is_dcomplex( a ) ) { bli_zzcopys( *(( dcomplex* )buf_a), value ); } else { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } bli_zscopys( value, *temp_s ); bli_zdcopys( 
value, *temp_d );
	bli_zccopys( value, *temp_c );
	bli_zzcopys( value, *temp_z );

	*temp_i = ( gint_t ) bli_zreal( value );
}
#endif

//
// bli_adjust_strides()
//
// Normalizes, in place, the strides a caller supplied for an m x n object.
//
//   m, n       Dimensions of the object. If either is zero the strides are
//              left untouched.
//   elem_size  Element size, forwarded to bli_align_dim_to_size() when the
//              leading dimension is aligned.
//   rs, cs     In/out row and column strides.
//   is         In/out imaginary stride.
//
// Three stride patterns are rewritten; every other combination is returned
// exactly as it was passed in:
//   - rs == cs ==  0 (and is == 0 or 1): column storage is chosen.
//   - rs == cs == -1 (and is == 0 or 1): row storage is chosen.
//   - rs == cs ==  1: a vector-shaped object gets a non-unit leading
//     dimension; a 1x1 object is left alone.
// In the first two cases *is is set to 1 and the leading dimension is
// rounded up using BLIS_HEAP_STRIDE_ALIGN_SIZE.
//
void bli_adjust_strides
     (
       dim_t  m,
       dim_t  n,
       siz_t  elem_size,
       inc_t* rs,
       inc_t* cs,
       inc_t* is
     )
{
	// Here, we check the strides that were input from the user and modify
	// them if needed.

	// Handle the special "empty" case first. If either dimension is zero,
	// do nothing (this could represent a zero-length "slice" of another
	// matrix).
	if ( m == 0 || n == 0 ) return;

	// Interpret rs = cs = 0 as request for column storage and -1 as a request
	// for row storage.
	if ( *rs == 0 && *cs == 0 && ( *is == 0 || *is == 1 ) )
	{
		// First we handle the 1x1 scalar case explicitly.
		if ( m == 1 && n == 1 )
		{
			*rs = 1;
			*cs = 1;
		}
		// We use column-major storage, except when m == 1, in which case we
		// use what amounts to row-major storage because we don't want both
		// strides to be unit.
		else if ( m == 1 && n > 1 )
		{
			*rs = n;
			*cs = 1;
		}
		else
		{
			*rs = 1;
			*cs = m;
		}

		// Use default complex storage.
		*is = 1;

		// Align the strides depending on the tilt of the matrix. Note that
		// scalars are neither row nor column tilted. Also note that alignment
		// is only done for rs = cs = 0, and any user-supplied row and column
		// strides are preserved.
		if ( bli_is_col_tilted( m, n, *rs, *cs ) )
		{
			*cs = bli_align_dim_to_size( *cs, elem_size,
			                             BLIS_HEAP_STRIDE_ALIGN_SIZE );
		}
		else if ( bli_is_row_tilted( m, n, *rs, *cs ) )
		{
			*rs = bli_align_dim_to_size( *rs, elem_size,
			                             BLIS_HEAP_STRIDE_ALIGN_SIZE );
		}
	}
	else if ( *rs == -1 && *cs == -1 && ( *is == 0 || *is == 1 ) )
	{
		// First we handle the 1x1 scalar case explicitly.
		if ( m == 1 && n == 1 )
		{
			*rs = 1;
			*cs = 1;
		}
		// We use row-major storage, except when n == 1, in which case we
		// use what amounts to column-major storage because we don't want both
		// strides to be unit.
		else if ( n == 1 && m > 1 )
		{
			*rs = 1;
			*cs = m;
		}
		else
		{
			*rs = n;
			*cs = 1;
		}

		// Use default complex storage.
		*is = 1;

		// Align the strides depending on the tilt of the matrix. Note that
		// scalars are neither row nor column tilted. Also note that alignment
		// is only done for rs = cs = -1, and any user-supplied row and column
		// strides are preserved.
		if ( bli_is_col_tilted( m, n, *rs, *cs ) )
		{
			*cs = bli_align_dim_to_size( *cs, elem_size,
			                             BLIS_HEAP_STRIDE_ALIGN_SIZE );
		}
		else if ( bli_is_row_tilted( m, n, *rs, *cs ) )
		{
			*rs = bli_align_dim_to_size( *rs, elem_size,
			                             BLIS_HEAP_STRIDE_ALIGN_SIZE );
		}
	}
	else if ( *rs == 1 && *cs == 1 )
	{
		// If both strides are unit, this is probably a "lazy" request for a
		// single vector (but could also be a request for a 1xn matrix in
		// column-major order or an mx1 matrix in row-major order). In BLIS,
		// we have decided to "reserve" the case where rs = cs = 1 for
		// 1x1 scalars only.
		if ( m > 1 && n == 1 )
		{
			// Set the column stride to indicate that this is a column vector
			// stored in column-major order. This is done for legacy reasons,
			// because at one time we had to satisfy the error checking
			// in the underlying BLAS library, which expects the leading
			// dimension to be set to at least m, even if it will never be
			// used for indexing since it is a vector and thus only has one
			// column of data.
			*cs = m;
		}
		else if ( m == 1 && n > 1 )
		{
			// Set the row stride to indicate that this is a row vector stored
			// in row-major order.
			*rs = n;
		}

		// Nothing needs to be done for the 1x1 scalar case where m == n == 1.
} } static siz_t dt_sizes[6] = { sizeof( float ), sizeof( scomplex ), sizeof( double ), sizeof( dcomplex ), sizeof( gint_t ), sizeof( constdata_t ) }; siz_t bli_dt_size ( num_t dt ) { if ( bli_error_checking_is_enabled() ) bli_dt_size_check( dt ); return dt_sizes[dt]; } static char* dt_names[ BLIS_NUM_FP_TYPES+1 ] = { "float", "scomplex", "double", "dcomplex", "int" }; char* bli_dt_string ( num_t dt ) { if ( bli_error_checking_is_enabled() ) bli_dt_string_check( dt ); return dt_names[dt]; } dim_t bli_align_dim_to_mult ( dim_t dim, dim_t dim_mult ) { // We return the dimension unmodified if the multiple is zero // (to avoid division by zero). if ( dim_mult == 0 ) return dim; dim = ( ( dim + dim_mult - 1 ) / dim_mult ) * dim_mult; return dim; } dim_t bli_align_dim_to_size ( dim_t dim, siz_t elem_size, siz_t align_size ) { dim = ( ( dim * ( dim_t )elem_size + ( dim_t )align_size - 1 ) / ( dim_t )align_size ) * ( dim_t )align_size / ( dim_t )elem_size; return dim; } dim_t bli_align_ptr_to_size ( void* p, size_t align_size ) { dim_t dim; dim = ( ( ( uintptr_t )p + align_size - 1 ) / align_size ) * align_size; return dim; } #if 0 static num_t type_union[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = { // s c d z /* s */ { BLIS_FLOAT, BLIS_SCOMPLEX, BLIS_DOUBLE, BLIS_DCOMPLEX }, /* c */ { BLIS_SCOMPLEX, BLIS_SCOMPLEX, BLIS_DCOMPLEX, BLIS_DCOMPLEX }, /* d */ { BLIS_DOUBLE, BLIS_DCOMPLEX, BLIS_DOUBLE, BLIS_DCOMPLEX }, /* z */ { BLIS_DCOMPLEX, BLIS_DCOMPLEX, BLIS_DCOMPLEX, BLIS_DCOMPLEX } }; num_t bli_dt_union( num_t dt1, num_t dt2 ) { if ( bli_error_checking_is_enabled() ) bli_dt_union_check( dt1, dt2 ); return type_union[dt1][dt2]; } #endif void bli_obj_print ( char* label, obj_t* obj ) { bli_init_once(); FILE* file = stdout; if ( bli_error_checking_is_enabled() ) bli_obj_print_check( label, obj ); fprintf( file, "\n" ); fprintf( file, "%s\n", label ); fprintf( file, "\n" ); fprintf( file, " m x n %lu x %lu\n", ( unsigned long )bli_obj_length( obj ), ( unsigned long 
)bli_obj_width( obj ) );
	fprintf( file, "\n" );

	// Offsets of this view: row/column offsets and the diagonal offset.
	fprintf( file, " offm, offn %lu, %lu\n", ( unsigned long )bli_obj_row_off( obj ),
	                                         ( unsigned long )bli_obj_col_off( obj ) );
	fprintf( file, " diagoff %ld\n", ( signed long int )bli_obj_diag_offset( obj ) );
	fprintf( file, "\n" );

	// Buffer address, element size and strides. Strides and the diagonal
	// offset are printed as signed values; everything else as unsigned.
	fprintf( file, " buf %p\n", ( void* )bli_obj_buffer( obj ) );
	fprintf( file, " elem size %lu\n", ( unsigned long )bli_obj_elem_size( obj ) );
	fprintf( file, " rs, cs %ld, %ld\n", ( signed long int )bli_obj_row_stride( obj ),
	                                     ( signed long int )bli_obj_col_stride( obj ) );
	fprintf( file, " is %ld\n", ( signed long int )bli_obj_imag_stride( obj ) );

	// Padded dimensions and panel dimension/stride.
	fprintf( file, " m_padded %lu\n", ( unsigned long )bli_obj_padded_length( obj ) );
	fprintf( file, " n_padded %lu\n", ( unsigned long )bli_obj_padded_width( obj ) );
	fprintf( file, " pd %lu\n", ( unsigned long )bli_obj_panel_dim( obj ) );
	fprintf( file, " ps %lu\n", ( unsigned long )bli_obj_panel_stride( obj ) );
	fprintf( file, "\n" );

	// The raw info field in hexadecimal, followed by the individual
	// properties queried from the object.
	fprintf( file, " info %lX\n", ( unsigned long )(*obj).info );
	fprintf( file, " - is complex %lu\n", ( unsigned long )bli_obj_is_complex( obj ) );
	fprintf( file, " - is d. prec %lu\n", ( unsigned long )bli_obj_is_double_prec( obj ) );
	fprintf( file, " - datatype %lu\n", ( unsigned long )bli_obj_dt( obj ) );
	fprintf( file, " - target dt %lu\n", ( unsigned long )bli_obj_target_dt( obj ) );
	fprintf( file, " - exec dt %lu\n", ( unsigned long )bli_obj_exec_dt( obj ) );
	fprintf( file, " - comp dt %lu\n", ( unsigned long )bli_obj_comp_dt( obj ) );
	fprintf( file, " - scalar dt %lu\n", ( unsigned long )bli_obj_scalar_dt( obj ) );
	fprintf( file, " - has trans %lu\n", ( unsigned long )bli_obj_has_trans( obj ) );
	fprintf( file, " - has conj %lu\n", ( unsigned long )bli_obj_has_conj( obj ) );
	fprintf( file, " - unit diag? %lu\n", ( unsigned long )bli_obj_has_unit_diag( obj ) );
	// NOTE: In the shifted values below the cast binds tighter than >>, so
	// the queried value is converted to unsigned long first and then shifted
	// right by the corresponding *_SHIFT constant.
	fprintf( file, " - struc type %lu\n", ( unsigned long )bli_obj_struc( obj ) >> BLIS_STRUC_SHIFT );
	fprintf( file, " - uplo type %lu\n", ( unsigned long )bli_obj_uplo( obj ) >> BLIS_UPLO_SHIFT );
	fprintf( file, " - is upper %lu\n", ( unsigned long )bli_obj_is_upper( obj ) );
	fprintf( file, " - is lower %lu\n", ( unsigned long )bli_obj_is_lower( obj ) );
	fprintf( file, " - is dense %lu\n", ( unsigned long )bli_obj_is_dense( obj ) );
	fprintf( file, " - pack schema %lu\n", ( unsigned long )bli_obj_pack_schema( obj ) >> BLIS_PACK_SCHEMA_SHIFT );
	fprintf( file, " - packinv diag? %lu\n", ( unsigned long )bli_obj_has_inverted_diag( obj ) );
	fprintf( file, " - pack ordifup %lu\n", ( unsigned long )bli_obj_is_pack_rev_if_upper( obj ) );
	fprintf( file, " - pack ordiflo %lu\n", ( unsigned long )bli_obj_is_pack_rev_if_lower( obj ) );
	fprintf( file, " - packbuf type %lu\n", ( unsigned long )bli_obj_pack_buffer_type( obj ) >> BLIS_PACK_BUFFER_SHIFT );
	fprintf( file, "\n" );
}
cython-blis-0.9.1/blis/_src/frame/base/bli_obj.h000066400000000000000000000067641427272030600214300ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_obj_check.h" BLIS_EXPORT_BLIS void bli_obj_create ( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer ( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_without_buffer ( num_t dt, dim_t m, dim_t n, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_alloc_buffer ( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_attach_buffer ( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1 ( num_t dt, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer ( num_t dt, void* p, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_conf_to ( obj_t* s, obj_t* d ); BLIS_EXPORT_BLIS void bli_obj_free ( obj_t* obj ); void bli_adjust_strides ( dim_t m, dim_t n, siz_t elem_size, inc_t* rs, inc_t* cs, inc_t* is ); BLIS_EXPORT_BLIS siz_t bli_dt_size ( num_t dt ); BLIS_EXPORT_BLIS char* bli_dt_string ( num_t dt ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult ( dim_t dim, dim_t dim_mult ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size ( dim_t dim, siz_t elem_size, siz_t align_size ); BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size ( 
void* p, size_t align_size ); BLIS_EXPORT_BLIS void bli_obj_print ( char* label, obj_t* obj ); cython-blis-0.9.1/blis/_src/frame/base/bli_obj_scalar.c000066400000000000000000000166271427272030600227470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_obj_scalar_init_detached ( num_t dt, obj_t* beta ) { void* p; // Initialize beta without a buffer and then attach its internal buffer. 
// NOTE: This initializes both the storage datatype and scalar datatype // bitfields within beta to dt. bli_obj_create_without_buffer( dt, 1, 1, beta ); // Query the address of the object's internal scalar buffer. p = bli_obj_internal_scalar_buffer( beta ); // Update the object. bli_obj_set_buffer( p, beta ); bli_obj_set_strides( 1, 1, beta ); bli_obj_set_imag_stride( 1, beta ); } void bli_obj_scalar_init_detached_copy_of ( num_t dt, conj_t conj, obj_t* alpha, obj_t* beta ) { obj_t alpha_local; // Make a local copy of alpha so we can apply the conj parameter. bli_obj_alias_to( alpha, &alpha_local ); bli_obj_apply_conj( conj, &alpha_local ); // Initialize beta without a buffer and then attach its internal buffer. bli_obj_scalar_init_detached( dt, beta ); // Copy the scalar value in a to object b, conjugating and/or // typecasting if needed. bli_copysc( &alpha_local, beta ); } void bli_obj_scalar_detach ( obj_t* a, obj_t* alpha ) { // Use the scalar datatype of A as the storage datatype of the detached // object alpha. num_t dt_a = bli_obj_scalar_dt( a ); // Initialize alpha to be a bufferless internal scalar of the same // datatype as the scalar attached to A. bli_obj_scalar_init_detached( dt_a, alpha ); // Copy the internal scalar in A to alpha. // NOTE: This is simply a field-to-field copy with no typecasting. But // that's okay since bli_obj_scalar_init_detached() initializes the // storage datatype of alpha to be the same as the datatype of the // scalar queried from bli_obj_scalar_dt() above. bli_obj_copy_internal_scalar( a, alpha ); } void bli_obj_scalar_attach ( conj_t conj, obj_t* alpha, obj_t* a ) { obj_t alpha_cast; // Use the target datatype of A as the datatype to which we cast // alpha locally. const num_t dt_targ = bli_obj_target_dt( a ); // Make a copy-cast of alpha to the target datatype of A, queried // above. This step gives us the opportunity to conjugate and/or // typecast alpha. 
bli_obj_scalar_init_detached_copy_of( dt_targ, conj, alpha, &alpha_cast ); // Copy the internal scalar in alpha_cast to A. bli_obj_copy_internal_scalar( &alpha_cast, a ); // Update the scalar datatype of A. bli_obj_set_scalar_dt( dt_targ, a ); } void bli_obj_scalar_cast_to ( num_t dt, obj_t* a ) { obj_t alpha; obj_t alpha_cast; // Initialize an object alpha to be a bufferless scalar whose // storage datatype is equal to the scalar datatype of A. bli_obj_scalar_init_detached( bli_obj_scalar_dt( a ), &alpha ); // Copy the internal scalar in A to alpha. // NOTE: Since alpha was initialized with the scalar datatype of A, // a simple field-to-field copy is sufficient (no casting is needed // here). bli_obj_copy_internal_scalar( a, &alpha ); // Make a copy-cast of alpha, alpha_cast, with the datatype given by // the caller. (This is where the typecasting happens.) bli_obj_scalar_init_detached_copy_of( dt, BLIS_NO_CONJUGATE, &alpha, &alpha_cast ); // Copy the newly-typecasted value in alpha_cast back to A. bli_obj_copy_internal_scalar( &alpha_cast, a ); // Update the scalar datatype of A to reflect to new datatype used // in the typecast. bli_obj_set_scalar_dt( dt, a ); } void bli_obj_scalar_apply_scalar ( obj_t* alpha, obj_t* a ) { obj_t alpha_cast; obj_t scalar_a; // Make a copy of alpha, alpha_cast, with the same datatype as the // scalar datatype of A. (This is where the typecasting happens.) bli_obj_scalar_init_detached_copy_of( bli_obj_scalar_dt( a ), BLIS_NO_CONJUGATE, alpha, &alpha_cast ); // Detach the scalar from A. bli_obj_scalar_detach( a, &scalar_a ); // Scale the detached scalar by alpha. bli_mulsc( &alpha_cast, &scalar_a ); // Copy the internal scalar in scalar_a to A. 
bli_obj_copy_internal_scalar( &scalar_a, a ); } void bli_obj_scalar_reset ( obj_t* a ) { num_t dt = bli_obj_scalar_dt( a ); void* scalar_a = bli_obj_internal_scalar_buffer( a ); void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); if ( bli_is_float( dt ) ) *(( float* )scalar_a) = *(( float* )one); else if ( bli_is_double( dt ) ) *(( double* )scalar_a) = *(( double* )one); else if ( bli_is_scomplex( dt ) ) *(( scomplex* )scalar_a) = *(( scomplex* )one); else if ( bli_is_dcomplex( dt ) ) *(( dcomplex* )scalar_a) = *(( dcomplex* )one); // Alternate implementation: //bli_obj_scalar_attach( BLIS_NO_CONJUGATE, &BLIS_ONE, a ); } bool bli_obj_scalar_has_nonzero_imag ( obj_t* a ) { bool r_val = FALSE; num_t dt = bli_obj_scalar_dt( a ); void* scalar_a = bli_obj_internal_scalar_buffer( a ); // FGVZ: Reimplement by using bli_obj_imag_part() and then // bli_obj_equals( &BLIS_ZERO, ... ). if ( bli_is_real( dt ) ) { r_val = FALSE; } else if ( bli_is_scomplex( dt ) ) { r_val = ( bli_cimag( *(( scomplex* )scalar_a) ) != 0.0F ); } else if ( bli_is_dcomplex( dt ) ) { r_val = ( bli_zimag( *(( dcomplex* )scalar_a) ) != 0.0 ); } return r_val; } bool bli_obj_scalar_equals ( obj_t* a, obj_t* beta ) { obj_t scalar_a; bool r_val; bli_obj_scalar_detach( a, &scalar_a ); r_val = bli_obj_equals( &scalar_a, beta ); return r_val; } cython-blis-0.9.1/blis/_src/frame/base/bli_obj_scalar.h000066400000000000000000000050471427272030600227460ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached ( num_t dt, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of ( num_t dt, conj_t conj, obj_t* alpha, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_detach ( obj_t* a, obj_t* alpha ); BLIS_EXPORT_BLIS void bli_obj_scalar_attach ( conj_t conj, obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to ( num_t dt, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar ( obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_reset ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_equals ( obj_t* a, obj_t* beta ); cython-blis-0.9.1/blis/_src/frame/base/bli_opid.h000066400000000000000000000034061427272030600215770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ BLIS_INLINE bool bli_opid_is_level3( opid_t opid ) { return ( bool ) ( BLIS_GEMM <= opid && opid <= BLIS_TRSM ); } cython-blis-0.9.1/blis/_src/frame/base/bli_pack.c000066400000000000000000000116671427272030600215650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // The global rntm_t structure. (The definition resides in bli_rntm.c.) extern rntm_t global_rntm; // A mutex to allow synchronous access to global_rntm. (The definition // resides in bli_rntm.c.) extern bli_pthread_mutex_t global_rntm_mutex; // ----------------------------------------------------------------------------- void bli_pack_init( void ) { // Read the environment variables and use them to initialize the // global runtime object. bli_pack_init_rntm_from_env( &global_rntm ); } void bli_pack_finalize( void ) { } // ----------------------------------------------------------------------------- void bli_pack_get_pack_a( bool* pack_a ) { // We must ensure that global_rntm has been initialized. bli_init_once(); *pack_a = bli_rntm_pack_a( &global_rntm ); } // ----------------------------------------------------------------------------- void bli_pack_get_pack_b( bool* pack_b ) { // We must ensure that global_rntm has been initialized. bli_init_once(); *pack_b = bli_rntm_pack_b( &global_rntm ); } // ---------------------------------------------------------------------------- void bli_pack_set_pack_a( bool pack_a ) { // We must ensure that global_rntm has been initialized. bli_init_once(); // Acquire the mutex protecting global_rntm. bli_pthread_mutex_lock( &global_rntm_mutex ); bli_rntm_set_pack_a( pack_a, &global_rntm ); // Release the mutex protecting global_rntm. 
bli_pthread_mutex_unlock( &global_rntm_mutex ); } // ---------------------------------------------------------------------------- void bli_pack_set_pack_b( bool pack_b ) { // We must ensure that global_rntm has been initialized. bli_init_once(); // Acquire the mutex protecting global_rntm. bli_pthread_mutex_lock( &global_rntm_mutex ); bli_rntm_set_pack_b( pack_b, &global_rntm ); // Release the mutex protecting global_rntm. bli_pthread_mutex_unlock( &global_rntm_mutex ); } // ---------------------------------------------------------------------------- void bli_pack_init_rntm_from_env ( rntm_t* rntm ) { // NOTE: We don't need to acquire the global_rntm_mutex here because this // function is only called from bli_pack_init(), which is only called // by bli_init_once(). bool pack_a; bool pack_b; #if 1 //def BLIS_ENABLE_SELECTIVE_PACKING // Try to read BLIS_PACK_A and BLIS_PACK_B. For each variable, default to // -1 if it is unset. gint_t pack_a_env = bli_env_get_var( "BLIS_PACK_A", -1 ); gint_t pack_b_env = bli_env_get_var( "BLIS_PACK_B", -1 ); // Enforce the default behavior first, then check for affirmative FALSE, and // finally assume anything else is TRUE. if ( pack_a_env == -1 ) pack_a = FALSE; // default behavior else if ( pack_a_env == 0 ) pack_a = FALSE; // zero is FALSE else pack_a = TRUE; // anything else is TRUE if ( pack_b_env == -1 ) pack_b = FALSE; // default behavior else if ( pack_b_env == 0 ) pack_b = FALSE; // zero is FALSE else pack_b = TRUE; // anything else is TRUE #else pack_a = TRUE; pack_b = TRUE; #endif // Save the results back in the runtime object. bli_rntm_set_pack_a( pack_a, rntm ); bli_rntm_set_pack_b( pack_b, rntm ); #if 0 printf( "bli_pack_init_rntm_from_env()\n" ); bli_rntm_print( rntm ); #endif } cython-blis-0.9.1/blis/_src/frame/base/bli_pack.h000066400000000000000000000040301427272030600215540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_PACK_H #define BLIS_PACK_H void bli_pack_init( void ); void bli_pack_finalize( void ); BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a ); BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b ); BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a ); BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b ); void bli_pack_init_rntm_from_env( rntm_t* rntm ); #endif cython-blis-0.9.1/blis/_src/frame/base/bli_param_map.c000066400000000000000000000170011427272030600225700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // --- BLIS to BLAS/LAPACK mappings -------------------------------------------- void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side ) { if ( side == BLIS_LEFT ) *blas_side = 'L'; else if ( side == BLIS_RIGHT ) *blas_side = 'R'; else { bli_check_error_code( BLIS_INVALID_SIDE ); } } void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo ) { if ( uplo == BLIS_LOWER ) *blas_uplo = 'L'; else if ( uplo == BLIS_UPPER ) *blas_uplo = 'U'; else { bli_check_error_code( BLIS_INVALID_UPLO ); } } void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans ) { if ( trans == BLIS_NO_TRANSPOSE ) *blas_trans = 'N'; else if ( trans == BLIS_TRANSPOSE ) *blas_trans = 'T'; else if ( trans == BLIS_CONJ_TRANSPOSE ) *blas_trans = 'C'; else { bli_check_error_code( BLIS_INVALID_TRANS ); } } void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag ) { if ( diag == BLIS_NONUNIT_DIAG ) *blas_diag = 'N'; else if ( diag == BLIS_UNIT_DIAG ) *blas_diag = 'U'; else { bli_check_error_code( BLIS_INVALID_DIAG ); } } void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval ) { if ( machval == BLIS_MACH_EPS ) *blas_machval = 'E'; else if ( machval == BLIS_MACH_SFMIN ) *blas_machval = 'S'; else if ( machval == BLIS_MACH_BASE ) *blas_machval = 'B'; else if ( machval == BLIS_MACH_PREC ) *blas_machval = 'P'; else if ( machval == BLIS_MACH_NDIGMANT ) *blas_machval = 'N'; else if ( machval == 
BLIS_MACH_RND ) *blas_machval = 'R'; else if ( machval == BLIS_MACH_EMIN ) *blas_machval = 'M'; else if ( machval == BLIS_MACH_RMIN ) *blas_machval = 'U'; else if ( machval == BLIS_MACH_EMAX ) *blas_machval = 'L'; else if ( machval == BLIS_MACH_RMAX ) *blas_machval = 'O'; else { bli_check_error_code( BLIS_INVALID_MACHVAL ); } } // --- BLAS/LAPACK to BLIS mappings -------------------------------------------- // NOTE: These functions were converted into static functions. Please see this // file's corresponding header for those definitions. // --- BLIS char to BLIS mappings ---------------------------------------------- void bli_param_map_char_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { bli_check_error_code( BLIS_INVALID_SIDE ); } } void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else if ( uplo == 'e' || uplo == 'E' ) *blis_uplo = BLIS_DENSE; else { bli_check_error_code( BLIS_INVALID_UPLO ); } } void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_NO_TRANSPOSE; else if ( trans == 'h' || trans == 'H' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { bli_check_error_code( BLIS_INVALID_TRANS ); } } void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ) { if ( conj == 'n' || conj == 'N' ) *blis_conj = BLIS_NO_CONJUGATE; else if ( conj == 'c' || conj == 'C' ) *blis_conj = BLIS_CONJUGATE; else { bli_check_error_code( BLIS_INVALID_CONJ ); } } void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = 
BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { bli_check_error_code( BLIS_INVALID_DIAG ); } } void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ) { if ( dt == 's' ) *blis_dt = BLIS_FLOAT; else if ( dt == 'd' ) *blis_dt = BLIS_DOUBLE; else if ( dt == 'c' ) *blis_dt = BLIS_SCOMPLEX; else if ( dt == 'z' ) *blis_dt = BLIS_DCOMPLEX; else if ( dt == 'i' ) *blis_dt = BLIS_INT; else { bli_check_error_code( BLIS_INVALID_DATATYPE ); } } // --- BLIS to BLIS char mappings ---------------------------------------------- void bli_param_map_blis_to_char_side( side_t blis_side, char* side ) { if ( blis_side == BLIS_LEFT ) *side = 'l'; else if ( blis_side == BLIS_RIGHT ) *side = 'r'; else { bli_check_error_code( BLIS_INVALID_SIDE ); } } void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ) { if ( blis_uplo == BLIS_LOWER ) *uplo = 'l'; else if ( blis_uplo == BLIS_UPPER ) *uplo = 'u'; else { bli_check_error_code( BLIS_INVALID_UPLO ); } } void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ) { if ( blis_trans == BLIS_NO_TRANSPOSE ) *trans = 'n'; else if ( blis_trans == BLIS_TRANSPOSE ) *trans = 't'; else if ( blis_trans == BLIS_CONJ_NO_TRANSPOSE ) *trans = 'c'; else if ( blis_trans == BLIS_CONJ_TRANSPOSE ) *trans = 'h'; else { bli_check_error_code( BLIS_INVALID_TRANS ); } } void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ) { if ( blis_conj == BLIS_NO_CONJUGATE ) *conj = 'n'; else if ( blis_conj == BLIS_CONJUGATE ) *conj = 'c'; else { bli_check_error_code( BLIS_INVALID_CONJ ); } } void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ) { if ( blis_diag == BLIS_NONUNIT_DIAG ) *diag = 'n'; else if ( blis_diag == BLIS_UNIT_DIAG ) *diag = 'u'; else { bli_check_error_code( BLIS_INVALID_DIAG ); } } void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ) { if ( blis_dt == BLIS_FLOAT ) *dt = 's'; else if ( blis_dt == BLIS_DOUBLE ) *dt = 'd'; else if ( blis_dt == 
BLIS_SCOMPLEX ) *dt = 'c'; else if ( blis_dt == BLIS_DCOMPLEX ) *dt = 'z'; else if ( blis_dt == BLIS_INT ) *dt = 'i'; else { bli_check_error_code( BLIS_INVALID_DATATYPE ); } } cython-blis-0.9.1/blis/_src/frame/base/bli_param_map.h000066400000000000000000000131051427272030600225760ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ // --- BLIS to BLAS/LAPACK mappings -------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval ); // --- BLAS/LAPACK to BLIS mappings -------------------------------------------- // NOTE: These static functions were converted from regular functions in order // to reduce function call overhead within the BLAS compatibility layer. BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t 
blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); cython-blis-0.9.1/blis/_src/frame/base/bli_part.c000066400000000000000000000603121427272030600216040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

// -- Matrix partitioning ------------------------------------------------------

// Set up child as an alias of parent restricted to the bm x bn region whose
// top-left element sits at offset (i,j) within parent. Offsets and dimensions
// that fall outside parent are clamped so the child never extends past it.
void bli_acquire_mpart
     (
       dim_t  i,
       dim_t  j,
       dim_t  bm,
       dim_t  bn,
       obj_t* parent,
       obj_t* child
     )
{
	// Query the dimensions of the parent object.
	const dim_t m_par = bli_obj_length( parent );
	const dim_t n_par = bli_obj_width( parent );

	// If either i or j is already beyond what exists of the parent matrix,
	// slide them back to the outer dimensions. What will happen in this
	// scenario is that bm and/or bn will be reduced to zero so that the
	// child matrix does not refer to anything beyond the bounds of the
	// parent. (Note: This is a safety measure and generally should never
	// be needed if the caller is passing in sane arguments.)
	if ( i > m_par ) i = m_par;
	if ( j > n_par ) j = n_par;

	// If either bm or bn spills out over the edge of the parent matrix,
	// reduce them so that the child matrix fits within the bounds of the
	// parent. (Note: This is a safety measure and generally should never
	// be needed if the caller is passing in sane arguments, though this
	// code is somewhat more likely to be needed than the code above.)
	if ( bm > m_par - i ) bm = m_par - i;
	if ( bn > n_par - j ) bn = n_par - j;

	// Alias the parent object's contents into the child object.
	bli_obj_alias_to( parent, child );

	// Set the offsets and dimensions of the child object. Note that we
	// increment, rather than overwrite, the offsets of the child object
	// in case the parent object already had non-zero offsets (usually
	// because the parent was itself a child of a larger grandparent object).
	bli_obj_inc_offs( i, j, child );
	bli_obj_set_dims( bm, bn, child );
}

// Forward (BLIS_FWD) wrapper around bli_acquire_mpart_mdim().
void bli_acquire_mpart_t2b
     (
       subpart_t req_part,
       dim_t     i,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	bli_acquire_mpart_mdim( BLIS_FWD, req_part, i, b, obj, sub_obj );
}

// Backward (BLIS_BWD) wrapper around bli_acquire_mpart_mdim().
void bli_acquire_mpart_b2t
     (
       subpart_t req_part,
       dim_t     i,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	bli_acquire_mpart_mdim( BLIS_BWD, req_part, i, b, obj, sub_obj );
}

// Acquire into sub_obj the row-wise subpartition req_part of obj, where the
// middle partition (SUBPART1) holds b rows starting at row offset i (measured
// from the far end when direct is BLIS_BWD). Dimensions are taken with obj's
// transposition applied. Panel-packed objects are delegated to
// bli_packm_acquire_mpart_t2b() regardless of direct.
void bli_acquire_mpart_mdim
     (
       dir_t     direct,
       subpart_t req_part,
       dim_t     i,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	dim_t m;
	dim_t n;
	dim_t m_part = 0;
	dim_t n_part = 0;
	inc_t offm_inc = 0;
	inc_t offn_inc = 0;
	doff_t diag_off_inc;

	// Call a special function for partitioning packed objects. (By only
	// catching those objects packed to panels, we omit cases where the
	// object is packed to row or column storage, as such objects can be
	// partitioned through normally.) Note that the function called below
	// assumes forward partitioning.
	if ( bli_obj_is_panel_packed( obj ) )
	{
		bli_packm_acquire_mpart_t2b( req_part, i, b, obj, sub_obj );
		return;
	}

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_acquire_mpart_t2b_check( req_part, i, b, obj, sub_obj );

	// Query the m and n dimensions of the object (accounting for
	// transposition, if indicated).
	if ( bli_obj_has_notrans( obj ) )
	{
		m = bli_obj_length( obj );
		n = bli_obj_width( obj );
	}
	else // if ( bli_obj_has_trans( obj ) )
	{
		m = bli_obj_width( obj );
		n = bli_obj_length( obj );
	}

	// Foolproofing: do not let b exceed what's left of the m dimension at
	// row offset i.
	if ( b > m - i ) b = m - i;

	// NOTE: Most of this function implicitly assumes moving forward.
	// When moving backward, we have to relocate i.
	if ( direct == BLIS_BWD )
	{
		// Modify i to account for the fact that we are moving backwards.
		i = m - i - b;
	}

	// Support SUBPART1B (behind SUBPART1) and SUBPART1A (ahead of SUBPART1),
	// to refer to subpartitions 0 and 2 when moving forward, and 2 and 0 when
	// moving backward.
	subpart_t subpart0_alias;
	subpart_t subpart2_alias;

	if ( direct == BLIS_FWD ) { subpart0_alias = BLIS_SUBPART1B;
	                            subpart2_alias = BLIS_SUBPART1A; }
	else                      { subpart0_alias = BLIS_SUBPART1A;
	                            subpart2_alias = BLIS_SUBPART1B; }

	// Compute offset increments and dimensions based on which
	// subpartition is being requested, assuming no transposition.
	// If req_part matches none of the cases below, the zero initializers
	// above are kept (a 0 x 0 partition with no offset change).
	if ( req_part == BLIS_SUBPART0 ||
	     req_part == subpart0_alias )
	{
		// A0 (offm,offn) unchanged.
		// A0 is i x n.
		offm_inc = 0;
		offn_inc = 0;
		m_part = i;
		n_part = n;
	}
	else if ( req_part == BLIS_SUBPART1AND0 )
	{
		// A1+A0 (offm,offn) unchanged.
		// A1+A0 is (i+b) x n.
		offm_inc = 0;
		offn_inc = 0;
		m_part = i + b;
		n_part = n;
	}
	else if ( req_part == BLIS_SUBPART1 )
	{
		// A1 (offm,offn) += (i,0).
		// A1 is b x n.
		offm_inc = i;
		offn_inc = 0;
		m_part = b;
		n_part = n;
	}
	else if ( req_part == BLIS_SUBPART1AND2 )
	{
		// A1+A2 (offm,offn) += (i,0).
		// A1+A2 is (m-i) x n.
		offm_inc = i;
		offn_inc = 0;
		m_part = m - i;
		n_part = n;
	}
	else if ( req_part == BLIS_SUBPART2 ||
	          req_part == subpart2_alias )
	{
		// A2 (offm,offn) += (i+b,0).
		// A2 is (m-i-b) x n.
		offm_inc = i + b;
		offn_inc = 0;
		m_part = m - i - b;
		n_part = n;
	}

	// Compute the diagonal offset based on the m and n offsets.
	diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;

	// Begin by copying the info, elem size, buffer, row stride, and column
	// stride fields of the parent object. Note that this omits copying view
	// information because the new partition will have its own dimensions
	// and offsets.
	bli_obj_init_subpart_from( obj, sub_obj );

	// Modify offsets and dimensions of requested partition based on
	// whether it needs to be transposed.
	if ( bli_obj_has_notrans( obj ) )
	{
		bli_obj_set_dims( m_part, n_part, sub_obj );
		bli_obj_inc_offs( offm_inc, offn_inc, sub_obj );
		bli_obj_inc_diag_offset( diag_off_inc, sub_obj );
	}
	else // if ( bli_obj_has_trans( obj ) )
	{
		bli_obj_set_dims( n_part, m_part, sub_obj );
		bli_obj_inc_offs( offn_inc, offm_inc, sub_obj );
		bli_obj_inc_diag_offset( -diag_off_inc, sub_obj );
	}

	// If the root matrix is not general (ie: has structure defined by the
	// diagonal), and the subpartition does not intersect the root matrix's
	// diagonal, then we might need to modify some of the subpartition's
	// properties, depending on its structure type.
	if ( !bli_obj_root_is_general( sub_obj ) &&
	      bli_obj_is_outside_diag( sub_obj ) )
	{
		// NOTE: This comment may be out-of-date since we now distinguish
		// between uplo properties for the current and root objects...
		// Note that we cannot mark the subpartition object as general/dense
		// here since it makes sense to preserve the existing uplo information
		// a while longer so that the correct kernels are invoked. (Example:
		// incremental packing/computing in gemmt produces subpartitions that
		// appear general/dense, but their uplo fields are needed to be either
		// lower or upper, to determine which macro-kernel gets called in the
		// gemmt_int() back-end.)

		// If the subpartition lies entirely in an "unstored" triangle of the
		// root matrix, then we need to tweak the subpartition. If the root
		// matrix is Hermitian or symmetric, then we reflect the partition to
		// the other side of the diagonal, toggling the transposition bit (and
		// conjugation bit if the root matrix is Hermitian). Or, if the root
		// matrix is triangular, the subpartition should be marked as zero.
		if ( bli_obj_is_unstored_subpart( sub_obj ) )
		{
			if ( bli_obj_root_is_hermitian( sub_obj ) )
			{
				bli_obj_reflect_about_diag( sub_obj );
				bli_obj_toggle_conj( sub_obj );
			}
			else if ( bli_obj_root_is_symmetric( sub_obj ) )
			{
				bli_obj_reflect_about_diag( sub_obj );
			}
			else if ( bli_obj_root_is_triangular( sub_obj ) )
			{
				bli_obj_set_uplo( BLIS_ZEROS, sub_obj );
			}
		}
	}
}

// Forward (BLIS_FWD) wrapper around bli_acquire_mpart_ndim().
void bli_acquire_mpart_l2r
     (
       subpart_t req_part,
       dim_t     i,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	bli_acquire_mpart_ndim( BLIS_FWD, req_part, i, b, obj, sub_obj );
}

// Backward (BLIS_BWD) wrapper around bli_acquire_mpart_ndim().
void bli_acquire_mpart_r2l
     (
       subpart_t req_part,
       dim_t     j,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	bli_acquire_mpart_ndim( BLIS_BWD, req_part, j, b, obj, sub_obj );
}

// Acquire into sub_obj the column-wise subpartition req_part of obj, where
// the middle partition (SUBPART1) holds b columns starting at column offset j
// (measured from the far end when direct is BLIS_BWD). Dimensions are taken
// with obj's transposition applied. Panel-packed objects are delegated to
// bli_packm_acquire_mpart_l2r() regardless of direct.
void bli_acquire_mpart_ndim
     (
       dir_t     direct,
       subpart_t req_part,
       dim_t     j,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	dim_t m;
	dim_t n;
	dim_t m_part = 0;
	dim_t n_part = 0;
	inc_t offm_inc = 0;
	inc_t offn_inc = 0;
	doff_t diag_off_inc;

	// Call a special function for partitioning packed objects. (By only
	// catching those objects packed to panels, we omit cases where the
	// object is packed to row or column storage, as such objects can be
	// partitioned through normally.) Note that the function called below
	// assumes forward partitioning.
	if ( bli_obj_is_panel_packed( obj ) )
	{
		bli_packm_acquire_mpart_l2r( req_part, j, b, obj, sub_obj );
		return;
	}

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_acquire_mpart_l2r_check( req_part, j, b, obj, sub_obj );

	// Query the m and n dimensions of the object (accounting for
	// transposition, if indicated).
	if ( bli_obj_has_notrans( obj ) )
	{
		m = bli_obj_length( obj );
		n = bli_obj_width( obj );
	}
	else // if ( bli_obj_has_trans( obj ) )
	{
		m = bli_obj_width( obj );
		n = bli_obj_length( obj );
	}

	// Foolproofing: do not let b exceed what's left of the n dimension at
	// column offset j.
	if ( b > n - j ) b = n - j;

	// NOTE: Most of this function implicitly assumes moving forward.
	// When moving backward, we have to relocate j.
	if ( direct == BLIS_BWD )
	{
		// Modify j to account for the fact that we are moving backwards.
		j = n - j - b;
	}

	// Support SUBPART1B (behind SUBPART1) and SUBPART1A (ahead of SUBPART1),
	// to refer to subpartitions 0 and 2 when moving forward, and 2 and 0 when
	// moving backward.
	subpart_t subpart0_alias;
	subpart_t subpart2_alias;

	if ( direct == BLIS_FWD ) { subpart0_alias = BLIS_SUBPART1B;
	                            subpart2_alias = BLIS_SUBPART1A; }
	else                      { subpart0_alias = BLIS_SUBPART1A;
	                            subpart2_alias = BLIS_SUBPART1B; }

	// Compute offset increments and dimensions based on which
	// subpartition is being requested, assuming no transposition.
	// If req_part matches none of the cases below, the zero initializers
	// above are kept (a 0 x 0 partition with no offset change).
	if ( req_part == BLIS_SUBPART0 ||
	     req_part == subpart0_alias )
	{
		// A0 (offm,offn) unchanged.
		// A0 is m x j.
		offm_inc = 0;
		offn_inc = 0;
		m_part = m;
		n_part = j;
	}
	else if ( req_part == BLIS_SUBPART1AND0 )
	{
		// A1+A0 (offm,offn) unchanged.
		// A1+A0 is m x (j+b).
		offm_inc = 0;
		offn_inc = 0;
		m_part = m;
		n_part = j + b;
	}
	else if ( req_part == BLIS_SUBPART1 )
	{
		// A1 (offm,offn) += (0,j).
		// A1 is m x b.
		offm_inc = 0;
		offn_inc = j;
		m_part = m;
		n_part = b;
	}
	else if ( req_part == BLIS_SUBPART1AND2 )
	{
		// A1+A2 (offm,offn) += (0,j).
		// A1+A2 is m x (n-j).
		offm_inc = 0;
		offn_inc = j;
		m_part = m;
		n_part = n - j;
	}
	else if ( req_part == BLIS_SUBPART2 ||
	          req_part == subpart2_alias )
	{
		// A2 (offm,offn) += (0,j+b).
		// A2 is m x (n-j-b).
		offm_inc = 0;
		offn_inc = j + b;
		m_part = m;
		n_part = n - j - b;
	}

	// Compute the diagonal offset based on the m and n offsets.
	diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;

	// Begin by copying the info, elem size, buffer, row stride, and column
	// stride fields of the parent object. Note that this omits copying view
	// information because the new partition will have its own dimensions
	// and offsets.
	bli_obj_init_subpart_from( obj, sub_obj );

	// Modify offsets and dimensions of requested partition based on
	// whether it needs to be transposed.
	if ( bli_obj_has_notrans( obj ) )
	{
		bli_obj_set_dims( m_part, n_part, sub_obj );
		bli_obj_inc_offs( offm_inc, offn_inc, sub_obj );
		bli_obj_inc_diag_offset( diag_off_inc, sub_obj );
	}
	else // if ( bli_obj_has_trans( obj ) )
	{
		bli_obj_set_dims( n_part, m_part, sub_obj );
		bli_obj_inc_offs( offn_inc, offm_inc, sub_obj );
		bli_obj_inc_diag_offset( -diag_off_inc, sub_obj );
	}

	// If the root matrix is not general (ie: has structure defined by the
	// diagonal), and the subpartition does not intersect the root matrix's
	// diagonal, then we might need to modify some of the subpartition's
	// properties, depending on its structure type.
	if ( !bli_obj_root_is_general( sub_obj ) &&
	      bli_obj_is_outside_diag( sub_obj ) )
	{
		// NOTE: This comment may be out-of-date since we now distinguish
		// between uplo properties for the current and root objects...
		// Note that we cannot mark the subpartition object as general/dense
		// here since it makes sense to preserve the existing uplo information
		// a while longer so that the correct kernels are invoked. (Example:
		// incremental packing/computing in gemmt produces subpartitions that
		// appear general/dense, but their uplo fields are needed to be either
		// lower or upper, to determine which macro-kernel gets called in the
		// gemmt_int() back-end.)

		// If the subpartition lies entirely in an "unstored" triangle of the
		// root matrix, then we need to tweak the subpartition. If the root
		// matrix is Hermitian or symmetric, then we reflect the partition to
		// the other side of the diagonal, toggling the transposition bit (and
		// conjugation bit if the root matrix is Hermitian). Or, if the root
		// matrix is triangular, the subpartition should be marked as zero.
		if ( bli_obj_is_unstored_subpart( sub_obj ) )
		{
			if ( bli_obj_root_is_hermitian( sub_obj ) )
			{
				bli_obj_reflect_about_diag( sub_obj );
				bli_obj_toggle_conj( sub_obj );
			}
			else if ( bli_obj_root_is_symmetric( sub_obj ) )
			{
				bli_obj_reflect_about_diag( sub_obj );
			}
			else if ( bli_obj_root_is_triangular( sub_obj ) )
			{
				bli_obj_set_uplo( BLIS_ZEROS, sub_obj );
			}
		}
	}
}

// Forward (BLIS_FWD) wrapper around bli_acquire_mpart_mndim().
void bli_acquire_mpart_tl2br
     (
       subpart_t req_part,
       dim_t     i,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	bli_acquire_mpart_mndim( BLIS_FWD, req_part, i, b, obj, sub_obj );
}

// Backward (BLIS_BWD) wrapper around bli_acquire_mpart_mndim().
void bli_acquire_mpart_br2tl
     (
       subpart_t req_part,
       dim_t     j,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	bli_acquire_mpart_mndim( BLIS_BWD, req_part, j, b, obj, sub_obj );
}

// Acquire into sub_obj one of the nine subpartitions (A00 .. A22) of obj that
// result from partitioning along the diagonal, where the A11 block is b x b
// and begins at row/column offset ij (measured from the far end of
// min(m,n) when direct is BLIS_BWD). Panel-packed objects are delegated to
// bli_packm_acquire_mpart_tl2br() regardless of direct. Any req_part value
// not matched explicitly is treated as BLIS_SUBPART22.
void bli_acquire_mpart_mndim
     (
       dir_t     direct,
       subpart_t req_part,
       dim_t     ij,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	dim_t m;
	dim_t n;
	dim_t min_m_n;
	dim_t m_part = 0;
	dim_t n_part = 0;
	inc_t offm_inc = 0;
	inc_t offn_inc = 0;
	doff_t diag_off_inc;

	// Call a special function for partitioning packed objects. (By only
	// catching those objects packed to panels, we omit cases where the
	// object is packed to row or column storage, as such objects can be
	// partitioned through normally.) Note that the function called below
	// assumes forward partitioning.
	if ( bli_obj_is_panel_packed( obj ) )
	{
		bli_packm_acquire_mpart_tl2br( req_part, ij, b, obj, sub_obj );
		return;
	}

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_acquire_mpart_tl2br_check( req_part, ij, b, obj, sub_obj );

	// Query the m and n dimensions of the object (accounting for
	// transposition, if indicated).
	if ( bli_obj_has_notrans( obj ) )
	{
		m = bli_obj_length( obj );
		n = bli_obj_width( obj );
	}
	else // if ( bli_obj_has_trans( obj ) )
	{
		m = bli_obj_width( obj );
		n = bli_obj_length( obj );
	}

	// Foolproofing: do not let b exceed what's left of min(m,n) at
	// row/column offset ij.
	min_m_n = bli_min( m, n );
	if ( b > min_m_n - ij ) b = min_m_n - ij;

	// NOTE: Most of this function implicitly assumes moving forward.
	// When moving backward, we have to relocate ij.
	if ( direct == BLIS_BWD )
	{
		// Modify ij to account for the fact that we are moving backwards.
		ij = min_m_n - ij - b;
	}

	// Compute offset increments and dimensions based on which
	// subpartition is being requested, assuming no transposition.

	// Left column of subpartitions.
	if ( req_part == BLIS_SUBPART00 )
	{
		// A00 (offm,offn) unchanged.
		// A00 is ij x ij.
		offm_inc = 0;
		offn_inc = 0;
		m_part = ij;
		n_part = ij;
	}
	else if ( req_part == BLIS_SUBPART10 )
	{
		// A10 (offm,offn) += (ij,0).
		// A10 is b x ij.
		offm_inc = ij;
		offn_inc = 0;
		m_part = b;
		n_part = ij;
	}
	else if ( req_part == BLIS_SUBPART20 )
	{
		// A20 (offm,offn) += (ij+b,0).
		// A20 is (m-ij-b) x ij.
		offm_inc = ij + b;
		offn_inc = 0;
		m_part = m - ij - b;
		n_part = ij;
	}

	// Middle column of subpartitions.
	else if ( req_part == BLIS_SUBPART01 )
	{
		// A01 (offm,offn) += (0,ij).
		// A01 is ij x b.
		offm_inc = 0;
		offn_inc = ij;
		m_part = ij;
		n_part = b;
	}
	else if ( req_part == BLIS_SUBPART11 )
	{
		// A11 (offm,offn) += (ij,ij).
		// A11 is b x b.
		offm_inc = ij;
		offn_inc = ij;
		m_part = b;
		n_part = b;
	}
	else if ( req_part == BLIS_SUBPART21 )
	{
		// A21 (offm,offn) += (ij+b,ij).
		// A21 is (m-ij-b) x b.
		offm_inc = ij + b;
		offn_inc = ij;
		m_part = m - ij - b;
		n_part = b;
	}

	// Right column of subpartitions.
	else if ( req_part == BLIS_SUBPART02 )
	{
		// A02 (offm,offn) += (0,ij+b).
		// A02 is ij x (n-ij-b).
		offm_inc = 0;
		offn_inc = ij + b;
		m_part = ij;
		n_part = n - ij - b;
	}
	else if ( req_part == BLIS_SUBPART12 )
	{
		// A12 (offm,offn) += (ij,ij+b).
		// A12 is b x (n-ij-b).
		offm_inc = ij;
		offn_inc = ij + b;
		m_part = b;
		n_part = n - ij - b;
	}
	else // if ( req_part == BLIS_SUBPART22 )
	{
		// A22 (offm,offn) += (ij+b,ij+b).
		// A22 is (m-ij-b) x (n-ij-b).
		offm_inc = ij + b;
		offn_inc = ij + b;
		m_part = m - ij - b;
		n_part = n - ij - b;
	}

	// Compute the diagonal offset based on the m and n offsets.
	diag_off_inc = ( doff_t )offm_inc - ( doff_t )offn_inc;

	// Begin by copying the info, elem size, buffer, row stride, and column
	// stride fields of the parent object. Note that this omits copying view
	// information because the new partition will have its own dimensions
	// and offsets.
	bli_obj_init_subpart_from( obj, sub_obj );

	// Modify offsets and dimensions of requested partition based on
	// whether it needs to be transposed.
	if ( bli_obj_has_notrans( obj ) )
	{
		bli_obj_set_dims( m_part, n_part, sub_obj );
		bli_obj_inc_offs( offm_inc, offn_inc, sub_obj );
		bli_obj_inc_diag_offset( diag_off_inc, sub_obj );
	}
	else // if ( bli_obj_has_trans( obj ) )
	{
		bli_obj_set_dims( n_part, m_part, sub_obj );
		bli_obj_inc_offs( offn_inc, offm_inc, sub_obj );
		bli_obj_inc_diag_offset( -diag_off_inc, sub_obj );
	}

	// If the root matrix is not general (ie: has structure defined by the
	// diagonal), and the requested subpartition is not one of the three
	// diagonal blocks (A00, A11, A22), then we might need to modify some of
	// the subpartition's properties, depending on its structure type.
	if ( !bli_obj_root_is_general( sub_obj ) &&
	     req_part != BLIS_SUBPART00 &&
	     req_part != BLIS_SUBPART11 &&
	     req_part != BLIS_SUBPART22 )
	{
		// FGVZ: Fix me. This needs to be cleaned up. Either non-diagonal
		// intersecting subpartitions should inherit their root object's
		// uplo field, or it should not. Right now, they DO inherit the
		// uplo (because they are not set to BLIS_DENSE when the diagonal
		// does not intersect). But the whole point of being able to query
		// the root object's properties (e.g. uplo field) was so that we
		// COULD mark such subpartitions as dense, to make it easier for
		// certain subproblems on those subpartitions--subproblems that
		// are agnostic to where the subpartition came from.

		// NOTE: This comment may be out-of-date since we now distinguish
		// between uplo properties for the current and root objects...
		// Note that we cannot mark the subpartition object as general/dense
		// here since it makes sense to preserve the existing uplo information
		// a while longer so that the correct kernels are invoked. (Example:
		// incremental packing/computing in gemmt produces subpartitions that
		// appear general/dense, but their uplo fields are needed to be either
		// lower or upper, to determine which macro-kernel gets called in the
		// gemmt_int() back-end.)

		// If the subpartition lies entirely in an "unstored" triangle of the
		// root matrix, then we need to tweak the subpartition. If the root
		// matrix is Hermitian or symmetric, then we reflect the partition to
		// the other side of the diagonal, toggling the transposition bit (and
		// conjugation bit if the root matrix is Hermitian). Or, if the root
		// matrix is triangular, the subpartition should be marked as zero.
		if ( bli_obj_is_unstored_subpart( sub_obj ) )
		{
			if ( bli_obj_root_is_hermitian( sub_obj ) )
			{
				bli_obj_reflect_about_diag( sub_obj );
				bli_obj_toggle_conj( sub_obj );
			}
			else if ( bli_obj_root_is_symmetric( sub_obj ) )
			{
				bli_obj_reflect_about_diag( sub_obj );
			}
			else if ( bli_obj_root_is_triangular( sub_obj ) )
			{
				bli_obj_set_uplo( BLIS_ZEROS, sub_obj );
			}
		}
	}
}

// -- Vector partitioning ------------------------------------------------------

// Forward vector partitioning: column vectors are partitioned along the m
// dimension; everything else is treated as a row vector and partitioned
// along the n dimension.
void bli_acquire_vpart_f2b
     (
       subpart_t req_part,
       dim_t     i,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	if ( bli_obj_is_col_vector( obj ) )
		bli_acquire_mpart_mdim( BLIS_FWD, req_part, i, b, obj, sub_obj );
	else // if ( bli_obj_is_row_vector( obj ) )
		bli_acquire_mpart_ndim( BLIS_FWD, req_part, i, b, obj, sub_obj );
}

// Backward counterpart of bli_acquire_vpart_f2b(): same dispatch, but with
// BLIS_BWD as the direction.
void bli_acquire_vpart_b2f
     (
       subpart_t req_part,
       dim_t     i,
       dim_t     b,
       obj_t*    obj,
       obj_t*    sub_obj
     )
{
	if ( bli_obj_is_col_vector( obj ) )
		bli_acquire_mpart_mdim( BLIS_BWD, req_part, i, b, obj, sub_obj );
	else // if ( bli_obj_is_row_vector( obj ) )
		bli_acquire_mpart_ndim( BLIS_BWD, req_part, i, b, obj, sub_obj );
}

// -- Scalar acquisition
------------------------------------------------------- void bli_acquire_mij ( dim_t i, dim_t j, obj_t* obj, obj_t* sub_obj ) { obj_t tmp_obj; bli_acquire_mpart_ndim( BLIS_FWD, BLIS_SUBPART1, j, 1, obj, &tmp_obj ); bli_acquire_mpart_mdim( BLIS_FWD, BLIS_SUBPART1, i, 1, &tmp_obj, sub_obj ); } void bli_acquire_vi ( dim_t i, obj_t* obj, obj_t* sub_obj ) { if ( bli_obj_is_col_vector( obj ) ) bli_acquire_mpart_mdim( BLIS_FWD, BLIS_SUBPART1, i, 1, obj, sub_obj ); else // if ( bli_obj_is_row_vector( obj ) ) bli_acquire_mpart_ndim( BLIS_FWD, BLIS_SUBPART1, i, 1, obj, sub_obj ); } cython-blis-0.9.1/blis/_src/frame/base/bli_part.h000066400000000000000000000065671427272030600216250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "bli_part_check.h"

// -- Matrix partitioning ------------------------------------------------------

// Acquire an m x n view of obj at offset (i,j).
// NOTE(review): the definition in bli_part.c names these parameters
// bm, bn, parent, and child; the names here differ but the types and order
// are the same.
BLIS_EXPORT_BLIS void bli_acquire_mpart
     (
       dim_t     i,
       dim_t     j,
       dim_t     m,
       dim_t     n,
       obj_t*    obj,
       obj_t*    sub_obj
     );

// Prototype generator for the direction-specific partitioning wrappers.
// PASTEMAC0( opname ) presumably expands to bli_<opname> -- confirm against
// the macro's definition.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
     ( \
       subpart_t req_part, \
       dim_t     i, \
       dim_t     b, \
       obj_t*    obj, \
       obj_t*    sub_obj \
     );

GENPROT( acquire_mpart_t2b )
GENPROT( acquire_mpart_b2t )
GENPROT( acquire_mpart_l2r )
GENPROT( acquire_mpart_r2l )
GENPROT( acquire_mpart_tl2br )
GENPROT( acquire_mpart_br2tl )

// Prototype generator for the underlying routines that additionally take
// the partitioning direction as their first argument.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
     ( \
       dir_t     direct, \
       subpart_t req_part, \
       dim_t     i, \
       dim_t     b, \
       obj_t*    obj, \
       obj_t*    sub_obj \
     );

GENPROT( acquire_mpart_mdim )
GENPROT( acquire_mpart_ndim )
GENPROT( acquire_mpart_mndim )

// -- Vector partitioning ------------------------------------------------------

// Prototype generator for the vector partitioning routines.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
     ( \
       subpart_t req_part, \
       dim_t     i, \
       dim_t     b, \
       obj_t*    obj, \
       obj_t*    sub_obj \
     );

GENPROT( acquire_vpart_f2b )
GENPROT( acquire_vpart_b2f )

// -- Scalar acquisition -------------------------------------------------------

// Acquire the single element at (i,j) of obj.
BLIS_EXPORT_BLIS void bli_acquire_mij
     (
       dim_t     i,
       dim_t     j,
       obj_t*    obj,
       obj_t*    sub_obj
     );

// Acquire the single element at index i of vector obj.
BLIS_EXPORT_BLIS void bli_acquire_vi
     (
       dim_t     i,
       obj_t*    obj,
       obj_t*    sub_obj
     );
cython-blis-0.9.1/blis/_src/frame/base/bli_pba.c000066400000000000000000000422461427272030600214060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Statically initialize the mutex within the packing block allocator object. 
static pba_t pba = { .mutex = BLIS_PTHREAD_MUTEX_INITIALIZER }; // ----------------------------------------------------------------------------- /* Return the address of the file-scope pba_t object defined above. */ pba_t* bli_pba_query( void ) { return &pba; } /* Store the general-purpose alignment size and malloc()/free() function pointers in the pba_t and, when BLIS_ENABLE_PBA_POOLS is defined, initialize its pools via bli_pba_init_pools(). */ void bli_pba_init ( cntx_t* restrict cntx ) { pba_t* restrict pba = bli_pba_query(); const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE_GEN; malloc_ft malloc_fp = BLIS_MALLOC_POOL; free_ft free_fp = BLIS_FREE_POOL; // These fields are used for general-purpose allocation (ie: buf_type // equal to BLIS_BUFFER_FOR_GEN_USE) within bli_pba_acquire_m(). bli_pba_set_align_size( align_size, pba ); bli_pba_set_malloc_fp( malloc_fp, pba ); bli_pba_set_free_fp( free_fp, pba ); // The mutex field of pba is initialized statically above. This // keeps bli_pba_init() simpler and removes the possibility of // something going wrong during mutex initialization. #ifdef BLIS_ENABLE_PBA_POOLS bli_pba_init_pools( cntx, pba ); #endif } /* Finalize the pools (when BLIS_ENABLE_PBA_POOLS is defined) and reset the malloc()/free() function pointers to NULL. */ void bli_pba_finalize ( void ) { pba_t* restrict pba = bli_pba_query(); #ifdef BLIS_ENABLE_PBA_POOLS bli_pba_finalize_pools( pba ); #endif // The mutex field of pba is initialized statically above, and // therefore never destroyed. bli_pba_set_malloc_fp( NULL, pba ); bli_pba_set_free_fp( NULL, pba ); } /* Fill in mem for a request of req_size bytes: general-use requests are allocated directly with bli_fmalloc_align(); every other buf_type checks a block out of the matching pool while holding the pba mutex. */ void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ) { pool_t* pool; pblk_t* pblk; dim_t pi; err_t r_val; // If the internal memory pools for packing block allocator are disabled, // we spoof the buffer type as BLIS_BUFFER_FOR_GEN_USE to induce the // immediate usage of bli_fmalloc_align(). #ifndef BLIS_ENABLE_PBA_POOLS buf_type = BLIS_BUFFER_FOR_GEN_USE; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pba_acquire_m(): bli_fmalloc_align(): size %ld\n", ( long )req_size ); #endif #endif // Query the packing block allocator from the runtime. 
pba_t* pba = bli_rntm_pba( rntm ); if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) { malloc_ft malloc_fp = bli_pba_malloc_fp( pba ); siz_t align_size = bli_pba_align_size( pba ); // For general-use buffer requests, dynamically allocating memory // is assumed to be sufficient. /* NOTE(review): r_val is never inspected after this call -- confirm that bli_fmalloc_align() deals with allocation failure itself. */ void* buf = bli_fmalloc_align( malloc_fp, req_size, align_size, &r_val ); // Initialize the mem_t object with: // - the address of the memory block, // - the buffer type (a packbuf_t value), // - the pool field (NULL; see NOTE below), // - the size of the requested region. // NOTE: We initialize the pool field to NULL since this block did not // come from a memory pool. bli_mem_set_buffer( buf, mem ); bli_mem_set_buf_type( buf_type, mem ); bli_mem_set_pool( NULL, mem ); bli_mem_set_size( req_size, mem ); } else { // This branch handles cases where the memory block needs to come // from an internal memory pool, in which blocks are allocated once // and then recycled. // Map the requested packed buffer type to a zero-based index, which // we then use to select the corresponding memory pool. pi = bli_packbuf_index( buf_type ); pool = bli_pba_pool( pi, pba ); // Extract the address of the pblk_t struct within the mem_t. pblk = bli_mem_pblk( mem ); // Acquire the mutex associated with the pba object. bli_pba_lock( pba ); // BEGIN CRITICAL SECTION { // Checkout a block from the pool. If the pool's blocks are too // small, it will be reinitialized with blocks large enough to // accommodate the requested block size. If the pool is exhausted, // either because it is still empty or because all blocks have // been checked out already, additional blocks will be allocated // automatically, as-needed. Note that the addresses are stored // directly into the mem_t struct since pblk is the address of // the struct's pblk_t field. bli_pool_checkout_block( req_size, pblk, pool ); } // END CRITICAL SECTION // Release the mutex associated with the pba object. 
bli_pba_unlock( pba ); // Query the block_size from the pblk_t. This will be at least // req_size, perhaps larger. siz_t block_size = bli_pblk_block_size( pblk ); // Initialize the mem_t object with: // - the buffer type (a packbuf_t value), // - the address of the memory pool to which it belongs, // - the size of the contiguous memory block (NOT the size of the // requested region). // The actual (aligned) address is already stored in the mem_t // struct's pblk_t field. bli_mem_set_buf_type( buf_type, mem ); bli_mem_set_pool( pool, mem ); bli_mem_set_size( block_size, mem ); } } /* Give back the buffer held by mem: general-use buffers are freed with bli_ffree_align(), while pool blocks are checked back into their pool under the pba mutex; mem is cleared afterwards. */ void bli_pba_release ( rntm_t* rntm, mem_t* mem ) { packbuf_t buf_type; pool_t* pool; pblk_t* pblk; // Query the packing block allocator from the runtime. pba_t* pba = bli_rntm_pba( rntm ); // Extract the buffer type so we know what kind of memory was allocated. buf_type = bli_mem_buf_type( mem ); #ifndef BLIS_ENABLE_PBA_POOLS #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pba_release(): bli_ffree_align(): size %ld\n", ( long )bli_mem_size( mem ) ); #endif #endif if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) { free_ft free_fp = bli_pba_free_fp( pba ); void* buf = bli_mem_buffer( mem ); // For general-use buffers, we dynamically allocate memory, and so // here we need to free it. bli_ffree_align( free_fp, buf ); } else { // Extract the address of the pool from which the memory was // allocated. pool = bli_mem_pool( mem ); // Extract the address of the pblk_t struct within the mem_t struct. pblk = bli_mem_pblk( mem ); // Acquire the mutex associated with the pba object. bli_pba_lock( pba ); // BEGIN CRITICAL SECTION { // Check the block back into the pool. bli_pool_checkin_block( pblk, pool ); } // END CRITICAL SECTION // Release the mutex associated with the pba object. bli_pba_unlock( pba ); } // Clear the mem_t object so that it appears unallocated. 
This clears: // - the pblk_t struct's fields (ie: the buffer addresses) // - the pool field // - the size field // - the pba field // NOTE: We do not clear the buf_type field since there is no // "uninitialized" value for packbuf_t. bli_mem_clear( mem ); } /* Disabled code: it passes a pba_t* as the first argument, whereas bli_pba_acquire_m() takes a rntm_t*. */ #if 0 void bli_pba_acquire_v ( pba_t* pba, siz_t req_size, mem_t* mem ) { bli_pba_acquire_m ( pba, req_size, BLIS_BUFFER_FOR_GEN_USE, mem ); } #endif /* Return the pool's block size multiplied by its number of blocks, or 0 when buf_type is BLIS_BUFFER_FOR_GEN_USE. NOTE(review): the pool is read without acquiring the pba mutex -- confirm callers do not need a consistent snapshot. */ siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ) { siz_t r_val; if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) { // We don't (yet) track the amount of general-purpose // memory that is currently allocated. r_val = 0; } else { dim_t pool_index; pool_t* pool; // Acquire the pointer to the pool corresponding to the buf_type // provided. pool_index = bli_packbuf_index( buf_type ); pool = bli_pba_pool( pool_index, pba ); // Compute the pool "size" as the product of the block size // and the number of blocks in the pool. r_val = bli_pool_block_size( pool ) * bli_pool_num_blocks( pool ); } return r_val; } // ----------------------------------------------------------------------------- /* Initialize the A, B, and C pools of pba with zero blocks each, using block sizes obtained from bli_pba_compute_pool_block_sizes(). */ void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ) { // Map each of the packbuf_t values to an index starting at zero. const dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); const dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); const dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); // Alias the pool addresses to convenient identifiers. pool_t* pool_a = bli_pba_pool( index_a, pba ); pool_t* pool_b = bli_pba_pool( index_b, pba ); pool_t* pool_c = bli_pba_pool( index_c, pba ); // Start with empty pools. const dim_t num_blocks_a = 0; const dim_t num_blocks_b = 0; const dim_t num_blocks_c = 0; siz_t block_size_a = 0; siz_t block_size_b = 0; siz_t block_size_c = 0; // For blocks of A and panels of B, start off with block_ptrs arrays that // are of a decent length. For C, we can start off with an empty array. 
const dim_t block_ptrs_len_a = 80; const dim_t block_ptrs_len_b = 80; const dim_t block_ptrs_len_c = 0; // Use the address alignment sizes designated (at configure-time) for pools. const siz_t align_size_a = BLIS_POOL_ADDR_ALIGN_SIZE_A; const siz_t align_size_b = BLIS_POOL_ADDR_ALIGN_SIZE_B; const siz_t align_size_c = BLIS_POOL_ADDR_ALIGN_SIZE_C; // Use the offsets from the above alignments. const siz_t offset_size_a = BLIS_POOL_ADDR_OFFSET_SIZE_A; const siz_t offset_size_b = BLIS_POOL_ADDR_OFFSET_SIZE_B; const siz_t offset_size_c = BLIS_POOL_ADDR_OFFSET_SIZE_C; // Use the malloc() and free() designated (at configure-time) for pools. malloc_ft malloc_fp = BLIS_MALLOC_POOL; free_ft free_fp = BLIS_FREE_POOL; // Determine the block size for each memory pool. bli_pba_compute_pool_block_sizes( &block_size_a, &block_size_b, &block_size_c, cntx ); // Initialize the memory pools for A, B, and C. bli_pool_init( num_blocks_a, block_ptrs_len_a, block_size_a, align_size_a, offset_size_a, malloc_fp, free_fp, pool_a ); bli_pool_init( num_blocks_b, block_ptrs_len_b, block_size_b, align_size_b, offset_size_b, malloc_fp, free_fp, pool_b ); bli_pool_init( num_blocks_c, block_ptrs_len_c, block_size_c, align_size_c, offset_size_c, malloc_fp, free_fp, pool_c ); } /* Finalize the A, B, and C pools of pba via bli_pool_finalize(). */ void bli_pba_finalize_pools ( pba_t* pba ) { // Map each of the packbuf_t values to an index starting at zero. dim_t index_a = bli_packbuf_index( BLIS_BUFFER_FOR_A_BLOCK ); dim_t index_b = bli_packbuf_index( BLIS_BUFFER_FOR_B_PANEL ); dim_t index_c = bli_packbuf_index( BLIS_BUFFER_FOR_C_PANEL ); // Alias the pool addresses to convenient identifiers. pool_t* pool_a = bli_pba_pool( index_a, pba ); pool_t* pool_b = bli_pba_pool( index_b, pba ); pool_t* pool_c = bli_pba_pool( index_c, pba ); // Finalize the memory pools for A, B, and C. 
bli_pool_finalize( pool_a ); bli_pool_finalize( pool_b ); bli_pool_finalize( pool_c ); } // ----------------------------------------------------------------------------- /* Store in bs_a, bs_b, and bs_c the largest block size reported by bli_pba_compute_pool_block_sizes_dt() over the datatypes BLIS_DT_LO through BLIS_DT_HI; real datatypes are skipped when the context method is not BLIS_NAT. */ void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ) { const ind_t im = bli_cntx_method( cntx ); siz_t bs_cand_a = 0; siz_t bs_cand_b = 0; siz_t bs_cand_c = 0; num_t dt; // Compute pool block sizes for each datatype and find the maximum // size for each pool. This is done so that new pools do not need // to be allocated if the user switches datatypes. for ( dt = BLIS_DT_LO; dt <= BLIS_DT_HI; ++dt ) { siz_t bs_dt_a; siz_t bs_dt_b; siz_t bs_dt_c; // Avoid considering induced methods for real datatypes. if ( bli_is_real( dt ) && im != BLIS_NAT ) continue; bli_pba_compute_pool_block_sizes_dt( dt, &bs_dt_a, &bs_dt_b, &bs_dt_c, cntx ); bs_cand_a = bli_max( bs_dt_a, bs_cand_a ); bs_cand_b = bli_max( bs_dt_b, bs_cand_b ); bs_cand_c = bli_max( bs_dt_c, bs_cand_c ); } // Save the results. *bs_a = bs_cand_a; *bs_b = bs_cand_b; *bs_c = bs_cand_c; } // ----------------------------------------------------------------------------- /* Compute, for datatype dt, the pool block sizes for A, B, and C from the register (MR, NR) and cache (MC, KC, NC) blocksizes stored in cntx. */ void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ) { siz_t size_dt = bli_dt_size( dt ); blksz_t* mr; blksz_t* nr; blksz_t* mc; blksz_t* kc; blksz_t* nc; dim_t mr_dt; dim_t nr_dt; dim_t max_mnr_dt; dim_t mc_max_dt; dim_t kc_max_dt; dim_t nc_max_dt; dim_t packmr_dt; dim_t packnr_dt; dim_t max_packmnr_dt; dim_t scale_num_dt; dim_t scale_den_dt; dim_t pool_mc_dt, left_mc_dt; dim_t pool_nc_dt, left_nc_dt; dim_t pool_kc_dt; // // Find the larger of the two register blocksizes. // // Query the mr and nr blksz_t objects for the given method of // execution. mr = bli_cntx_get_blksz( BLIS_MR, cntx ); nr = bli_cntx_get_blksz( BLIS_NR, cntx ); // Extract the mr and nr values specific to the current datatype. 
mr_dt = bli_blksz_get_def( dt, mr ); nr_dt = bli_blksz_get_def( dt, nr ); // Find the maximum of mr and nr. max_mnr_dt = bli_max( mr_dt, nr_dt ); // // Define local maximum cache blocksizes. // // Query the mc, kc, and nc blksz_t objects for native execution. mc = bli_cntx_get_blksz( BLIS_MC, cntx ); kc = bli_cntx_get_blksz( BLIS_KC, cntx ); nc = bli_cntx_get_blksz( BLIS_NC, cntx ); // Extract the maximum mc, kc, and nc values specific to the current // datatype. mc_max_dt = bli_blksz_get_max( dt, mc ); kc_max_dt = bli_blksz_get_max( dt, kc ); nc_max_dt = bli_blksz_get_max( dt, nc ); // Add max(mr,nr) to kc to make room for the nudging of kc at // runtime to be a multiple of mr or nr for triangular operations // trmm, trmm3, and trsm. kc_max_dt += max_mnr_dt; // // Compute scaling factors. // // Compute integer scaling factors (numerator and denominator) used // to account for situations when the packing register blocksizes are // larger than the regular register blocksizes. // In order to compute the scaling factors, we first have to determine // whether ( packmr / mr ) is greater than ( packnr / nr ). This is // needed ONLY because the amount of space allocated for a block of A // and a panel of B needs to be such that MR and NR can be swapped (ie: // A is packed with NR and B is packed with MR). This transformation is // needed for right-side trsm when inducing an algorithm that (a) has // favorable access patterns for column-stored C and (b) allows the // macro-kernel to reuse the existing left-side fused gemmtrsm micro- // kernels. We avoid integer division by cross-multiplying: // // ( packmr / mr ) >= ( packnr / nr ) // ( packmr / mr ) * nr >= packnr // packmr * nr >= packnr * mr // // So, if packmr * nr >= packnr * mr, then we will use packmr and mr as // our scaling factors. Otherwise, we'll use packnr and nr. 
packmr_dt = bli_blksz_get_max( dt, mr ); packnr_dt = bli_blksz_get_max( dt, nr ); if ( packmr_dt * nr_dt >= packnr_dt * mr_dt ) { scale_num_dt = packmr_dt; scale_den_dt = mr_dt; } else { scale_num_dt = packnr_dt; scale_den_dt = nr_dt; } // // Compute pool block dimensions. // /* Scale mc and nc by scale_num/scale_den; the remainders are used below to round the quotients up. NOTE(review): assumes scale_den_dt (the default mr or nr for dt) is nonzero -- confirm. */ pool_mc_dt = ( mc_max_dt * scale_num_dt ) / scale_den_dt; left_mc_dt = ( mc_max_dt * scale_num_dt ) % scale_den_dt; pool_nc_dt = ( nc_max_dt * scale_num_dt ) / scale_den_dt; left_nc_dt = ( nc_max_dt * scale_num_dt ) % scale_den_dt; pool_kc_dt = ( kc_max_dt ); if ( left_mc_dt > 0 ) pool_mc_dt += 1; if ( left_nc_dt > 0 ) pool_nc_dt += 1; // // Compute pool block sizes // // We add an extra micro-panel of space to the block sizes for A and B // just to be sure any pre-loading performed by the micro-kernel does // not cause a segmentation fault. max_packmnr_dt = bli_max( packmr_dt, packnr_dt ); *bs_a = ( pool_mc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt; *bs_b = ( pool_nc_dt + max_packmnr_dt ) * pool_kc_dt * size_dt; *bs_c = ( pool_mc_dt ) * pool_nc_dt * size_dt; } cython-blis-0.9.1/blis/_src/frame/base/bli_pba.h000066400000000000000000000106061427272030600214060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H // Packing block allocator (formerly memory broker) /* typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. 
siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; */ // pba init (disabled: the mutex is initialized statically in bli_pba.c) //BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ // bli_pthread_mutex_init( &(pba->mutex), NULL ); //} //BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ // bli_pthread_mutex_destroy( &(pba->mutex) ); //} // pba query /* Field accessors; pool_index is used as given, with no range check. */ BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { return &(pba->pools[ pool_index ]); } BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { return pba->align_size; } BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { return pba->malloc_fp; } BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { return pba->free_fp; } // pba modification BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { pba->align_size = align_size; } BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { pba->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { pba->free_fp = free_fp; } // pba action /* Lock and unlock the mutex embedded in the pba_t. */ BLIS_INLINE void bli_pba_lock( pba_t* pba ) { bli_pthread_mutex_lock( &(pba->mutex) ); } BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( cntx_t* cntx ); void bli_pba_finalize ( void ); void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); /* Store the pba_t returned by bli_pba_query() in rntm. */ BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void 
bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif cython-blis-0.9.1/blis/_src/frame/base/bli_pool.c000066400000000000000000000530411427272030600216100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" //#define BLIS_ENABLE_MEM_TRACING /* Initialize pool with num_blocks newly allocated blocks of block_size bytes each and a block_ptrs array of max(block_ptrs_len, num_blocks, 1) entries; top_index starts at 0. */ void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ) { err_t r_val; // Make sure that block_ptrs_len is at least num_blocks. block_ptrs_len = bli_max( block_ptrs_len, num_blocks ); // If block_ptrs_len is zero, explicitly set it to 1. This avoids // calling malloc() with a size of zero, whose behavior is // implementation-defined, and guards against subsequent memory corruption. block_ptrs_len = ( block_ptrs_len == 0 ) ? 1 : block_ptrs_len; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_init(): allocating block_ptrs (length %d): ", ( int )block_ptrs_len ); #endif // Allocate the block_ptrs array. // FGVZ: Do we want to call malloc_fp() for internal data structures as // well as pool blocks? If so, don't forget to s/bli_free_intl/free_fp/g. /* NOTE(review): r_val is not checked after this allocation -- confirm bli_malloc_intl() handles failure itself. */ pblk_t* restrict block_ptrs = bli_malloc_intl( block_ptrs_len * sizeof( pblk_t ), &r_val ); // Allocate and initialize the first num_blocks entries of the block_ptrs array. for ( dim_t i = 0; i < num_blocks; ++i ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_init(): allocating block %d of size %d (align %d, offset %d).\n", ( int )i, ( int )block_size, ( int )align_size, ( int )offset_size ); fflush( stdout ); #endif bli_pool_alloc_block ( block_size, align_size, offset_size, malloc_fp, &(block_ptrs[i]) ); } // NOTE: The semantics of top_index approximate a stack, where a "full" // stack (no blocks checked out) is one where top_index == 0 and an empty // stack (all blocks checked out) one where top_index == num_blocks. // (Here, num_blocks tracks the number of blocks currently allocated as // part of the pool.) This "orientation" of the stack was chosen // intentionally, in contrast to one where top_index == -1 means the // stack is empty and top_index = num_blocks - 1 means the // stack is // full. 
The chosen scheme allows one to conceptualize the stack as a // number line in which blocks are checked out from lowest to highest, // and additional blocks are added at the higher end. // Initialize the pool_t structure. bli_pool_set_block_ptrs( block_ptrs, pool ); bli_pool_set_block_ptrs_len( block_ptrs_len, pool ); bli_pool_set_top_index( 0, pool ); bli_pool_set_num_blocks( num_blocks, pool ); bli_pool_set_block_size( block_size, pool ); bli_pool_set_align_size( align_size, pool ); bli_pool_set_offset_size( offset_size, pool ); bli_pool_set_malloc_fp( malloc_fp, pool ); bli_pool_set_free_fp( free_fp, pool ); } /* Free the blocks recorded in the pool's block_ptrs array and then the array itself; the pool_t fields are left as they were. */ void bli_pool_finalize ( pool_t* restrict pool ) { // NOTE: This implementation assumes that either: // - all blocks have been checked in by all threads, or // - some subset of blocks have been checked in and the caller // is bli_pool_reinit(). // Query the block_ptrs array. pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); // Query the total number of blocks currently allocated. const siz_t num_blocks = bli_pool_num_blocks( pool ); // NOTE: This sanity check has been disabled because bli_pool_reinit() // is currently implemented in terms of bli_pool_finalize() followed by // bli_pool_init(). If that _reinit() takes place when some blocks are // checked out, then we would expect top_index != 0, and therefore this // check is not universally appropriate. #if 0 // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); // Sanity check: The top_index should be zero. if ( top_index != 0 ) { printf( "bli_pool_finalize(): final top_index == %d (expected 0); block_size: %d.\n", ( int )top_index, ( int )bli_pool_block_size( pool ) ); printf( "bli_pool_finalize(): Implication: not all blocks were checked back in!\n" ); bli_abort(); } #endif // Query the free() function pointer for the pool. 
free_ft free_fp = bli_pool_free_fp( pool ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_finalize(): freeing %d blocks of size %d (align %d, offset %d).\n", ( int )num_blocks, ( int )bli_pool_block_size( pool ), ( int )bli_pool_align_size( pool ), ( int )bli_pool_offset_size( pool ) ); fflush( stdout ); #endif // Query the offset size of the pool. const siz_t offset_size = bli_pool_offset_size( pool ); // Free the individual blocks currently in the pool. /* NOTE(review): the loop starts at 0, so it also visits entries below top_index, which bli_pool_checkout_block() cleared and which bli_pool_grow() does not copy into a resized array -- confirm bli_pool_free_block() is safe on such entries. */ for ( dim_t i = 0; i < num_blocks; ++i ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_finalize(): block %d: ", ( int )i ); #endif bli_pool_free_block( offset_size, free_fp, &(block_ptrs[i]) ); } #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_finalize(): freeing block_ptrs (length %d): ", ( int )( bli_pool_block_ptrs_len( pool ) ) ); #endif // Free the block_ptrs array. bli_free_intl( block_ptrs ); // This explicit clearing of the pool_t struct is not strictly // necessary and so it has been disabled. #if 0 // Clear the contents of the pool_t struct. bli_pool_set_block_ptrs( NULL, pool ); bli_pool_set_block_ptrs_len( 0, pool ); bli_pool_set_num_blocks( 0, pool ); bli_pool_set_top_index( 0, pool ); bli_pool_set_block_size( 0, pool ); bli_pool_set_align_size( 0, pool ); bli_pool_set_offset_size( 0, pool ); #endif } /* Finalize the pool and initialize it again with the given parameters, reusing the malloc()/free() function pointers already stored in it. */ void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ) { // Preserve the pointers to malloc() and free() provided when the pool // was first initialized. malloc_ft malloc_fp = bli_pool_malloc_fp( pool ); free_ft free_fp = bli_pool_free_fp( pool ); // Finalize the pool as it is currently configured. If some blocks // are still checked out to threads, those blocks are not freed // here, and instead will be freed when the threads attempt to check // those blocks back into the pool. 
(This condition can be detected // since the block size is encoded into each pblk, which is copied // upon checkout.) bli_pool_finalize( pool ); // Reinitialize the pool with the new parameters, in particular, // the new block size. bli_pool_init ( num_blocks_new, block_ptrs_len_new, block_size_new, align_size_new, offset_size_new, malloc_fp, free_fp, pool ); } /* Copy the pblk_t at the pool's top_index into *block and advance top_index, first reinitializing the pool if its blocks are smaller than req_size and growing it by one block if it is exhausted. */ void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ) { // If the pool's current block size is smaller than the requested // size, reinitialize the pool to contain blocks of the // requested size. if ( bli_pool_block_size( pool ) < req_size ) { const siz_t num_blocks_new = bli_pool_num_blocks( pool ); const siz_t block_ptrs_len_new = bli_pool_block_ptrs_len( pool ); const siz_t align_size_new = bli_pool_align_size( pool ); const siz_t offset_size_new = bli_pool_offset_size( pool ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_checkout_block(): old block size %d < req size %d; " "reiniting.\n", ( int )bli_pool_block_size( pool ), ( int )req_size ); fflush( stdout ); #endif bli_pool_reinit ( num_blocks_new, block_ptrs_len_new, req_size, align_size_new, offset_size_new, pool ); } // If the pool is exhausted, add a block. if ( bli_pool_is_exhausted( pool ) ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_checkout_block(): pool is exhausted (block size %d); " "growing by 1.\n", ( int )bli_pool_block_size( pool ) ); fflush( stdout ); #endif bli_pool_grow( 1, pool ); } // At this point, at least one block is guaranteed to be available. // Query the block_ptrs array. pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); // Query the top_index of the pool. 
const siz_t top_index = bli_pool_top_index( pool ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_checkout_block(): checking out block %d of size %d " "(align %d).\n", ( int )top_index, ( int )bli_pool_block_size( pool ), ( int )bli_pool_align_size( pool ) ); fflush( stdout ); #endif // Copy the pblk_t at top_index to the caller's pblk_t struct. *block = block_ptrs[ top_index ]; // Clear the pool's copy of the entry at block_ptrs[top_index]. This is // not strictly required, since the entry is overwritten when a block is // checked back in, but it is cleared here anyway. bli_pblk_clear( &block_ptrs[top_index] ); // Increment the pool's top_index. bli_pool_set_top_index( top_index + 1, pool ); } /* Return *block to the pool by storing it at top_index - 1 and decrementing top_index, unless its block size differs from the pool's current block size, in which case the block is freed instead. */ void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ) { // If the pblk_t being checked in was allocated with a different block // size than is currently in use in the pool, we simply free it and // return. These "orphaned" blocks are no longer of use because the pool // has since been reinitialized to a different (larger) block size. if ( bli_pblk_block_size( block ) != bli_pool_block_size( pool ) ) { // Query the offset size of the pool. const siz_t offset_size = bli_pool_offset_size( pool ); // Query the free() function pointer for the pool. free_ft free_fp = bli_pool_free_fp( pool ); bli_pool_free_block( offset_size, free_fp, block ); return; } // Query the block_ptrs array. pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); // Query the top_index of the pool. /* NOTE(review): top_index is assumed to be greater than zero below; this is not checked. */ const siz_t top_index = bli_pool_top_index( pool ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_checkin_block(): checking in block %d of size %d " "(align %d, offset %d).\n", ( int )top_index - 1, ( int )bli_pool_block_size( pool ), ( int )bli_pool_align_size( pool ), ( int )bli_pool_offset_size( pool ) ); fflush( stdout ); #endif // Copy the caller's pblk_t struct to the block at top_index - 1. block_ptrs[ top_index - 1 ] = *block; // Decrement the pool's top_index. 
bli_pool_set_top_index( top_index - 1, pool ); } /* Add num_blocks_add newly allocated blocks to the pool, first replacing the block_ptrs array with a larger one when its current length cannot hold the new total. */ void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ) { err_t r_val; // If the requested increase is zero, return early. if ( num_blocks_add == 0 ) return; // Query the allocated length of the block_ptrs array and also the // total number of blocks currently allocated. const siz_t block_ptrs_len_cur = bli_pool_block_ptrs_len( pool ); const siz_t num_blocks_cur = bli_pool_num_blocks( pool ); // Compute the total number of allocated blocks that will exist // after we grow the pool. const siz_t num_blocks_new = num_blocks_cur + num_blocks_add; // If adding num_blocks_add new blocks will exceed the current capacity // of the block_ptrs array, we need to first put in place a new (larger) // array. if ( block_ptrs_len_cur < num_blocks_new ) { // To prevent this from happening often, we double the current // length of the block_ptrs array. // Sanity: make sure that the block_ptrs_len_new will be at least // num_blocks_new, in case doubling the block_ptrs_len_cur is not enough. // Example 1: // - block_ptrs_len_cur == num_blocks_cur == 0 and num_blocks_add = 1 // - So doubling: 2 * block_ptrs_len_cur = 0, whereas 1 is expected // Example 2: // - block_ptrs_len_cur == num_blocks_cur == 10 and num_blocks_add = 30 // - So doubling: 2 * block_ptrs_len_cur = 20, whereas 40 is expected const siz_t block_ptrs_len_new = bli_max( (2 * block_ptrs_len_cur), num_blocks_new ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_grow(): growing block_ptrs_len (%d -> %d): ", ( int )block_ptrs_len_cur, ( int )block_ptrs_len_new ); #endif // Query the current block_ptrs array. pblk_t* restrict block_ptrs_cur = bli_pool_block_ptrs( pool ); // Allocate a new block_ptrs array. // FGVZ: Do we want to call malloc_fp() for internal data structures as // well as pool blocks? If so, don't forget to s/bli_free_intl/free_fp/g. // NOTE(review): r_val is not checked after the allocation that follows. 
pblk_t* restrict block_ptrs_new = bli_malloc_intl( block_ptrs_len_new * sizeof( pblk_t ), &r_val ); // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); // Copy the contents of the old block_ptrs array to the new/resized // array. Notice that we can begin with top_index since all entries // from 0 to top_index-1 have been (and are currently) checked out // to threads. Entries 0 to top_index-1 of the new array are therefore // not written here. for ( dim_t i = top_index; i < num_blocks_cur; ++i ) { block_ptrs_new[i] = block_ptrs_cur[i]; } #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_grow(): freeing prev block_ptrs: " ); #endif // Free the old block_ptrs array. bli_free_intl( block_ptrs_cur ); // Update the pool_t struct with the new block_ptrs array and // record its allocated length. bli_pool_set_block_ptrs( block_ptrs_new, pool ); bli_pool_set_block_ptrs_len( block_ptrs_len_new, pool ); } // At this point, we are guaranteed to have enough unused elements // in the block_ptrs array to accommodate an additional num_blocks_add // blocks. // Query the current block_ptrs array (which was maybe just resized). pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); // Query the block size, alignment size, and offset size of the pool. const siz_t block_size = bli_pool_block_size( pool ); const siz_t align_size = bli_pool_align_size( pool ); const siz_t offset_size = bli_pool_offset_size( pool ); // Query the malloc() function pointer for the pool. malloc_ft malloc_fp = bli_pool_malloc_fp( pool ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_grow(): growing pool from (%d -> %d).\n", ( int )num_blocks_cur, ( int )num_blocks_new ); fflush( stdout ); #endif // Allocate the requested additional blocks in the resized array. for ( dim_t i = num_blocks_cur; i < num_blocks_new; ++i ) { bli_pool_alloc_block ( block_size, align_size, offset_size, malloc_fp, &(block_ptrs[i]) ); } // Update the pool_t struct with the new number of allocated blocks. 
// Notice that top_index remains unchanged, as do the block_size and // align_size fields. bli_pool_set_num_blocks( num_blocks_new, pool ); } /* Free up to num_blocks_sub blocks from the high end of the pool, limited to the num_blocks - top_index blocks that are not checked out; the block_ptrs array itself is not resized. */ void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ) { // If the requested decrease is zero, return early. if ( num_blocks_sub == 0 ) return; // Query the total number of blocks currently allocated. const siz_t num_blocks = bli_pool_num_blocks( pool ); // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); // Compute the number of blocks available to be checked out // (and thus available for removal). const siz_t num_blocks_avail = num_blocks - top_index; // If the requested decrease is more than the number of available // blocks in the pool, only remove the number of blocks actually // available. num_blocks_sub = bli_min( num_blocks_sub, num_blocks_avail ); // Query the block_ptrs array. pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); // Compute the new total number of blocks. const siz_t num_blocks_new = num_blocks - num_blocks_sub; // Query the offset size of the pool. const siz_t offset_size = bli_pool_offset_size( pool ); // Query the free() function pointer for the pool. free_ft free_fp = bli_pool_free_fp( pool ); // Free the individual blocks. for ( dim_t i = num_blocks_new; i < num_blocks; ++i ) { bli_pool_free_block( offset_size, free_fp, &(block_ptrs[i]) ); } // Update the pool_t struct. bli_pool_set_num_blocks( num_blocks_new, pool ); // Note that after shrinking the pool, num_blocks < block_ptrs_len. // This means the pool can grow again by num_blocks_sub before // a re-allocation of block_ptrs is triggered. 
} void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ) { err_t r_val; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_alloc_block(): calling fmalloc_align(): size %d (align %d, offset %d)\n", ( int )block_size, ( int )align_size, ( int )offset_size ); fflush( stdout ); #endif // Allocate the block via the bli_fmalloc_align() wrapper, which performs // alignment logic and opaquely saves the original pointer so that it can // be recovered when it's time to free the block. Note that we have to // add offset_size to the number of bytes requested since we will skip // that many bytes at the beginning of the allocated memory. void* restrict buf = bli_fmalloc_align( malloc_fp, block_size + offset_size, align_size, &r_val ); #if 0 // NOTE: This code is disabled because it is not needed, since // bli_fmalloc_align() is guaranteed to return an aligned address. // Advance the pointer to achieve the necessary alignment, if it is not // already aligned. if ( bli_is_unaligned_to( ( siz_t )buf_sys, ( siz_t )align_size ) ) { // C99's stdint.h guarantees that a void* can be safely cast to a // uintptr_t and then back to a void*, hence the casting of buf_sys // and align_size to uintptr_t. buf_align is initially cast to char* // to allow pointer arithmetic in units of bytes, and then advanced // to the next nearest alignment boundary, and finally cast back to // void* before being stored. Notice that the arithmetic works even // if the alignment value is not a power of two. buf_align = ( void* )( ( char* )buf_align + ( ( uintptr_t )align_size - ( uintptr_t )buf_sys % ( uintptr_t )align_size ) ); } #endif // Advance the pointer by offset_size bytes. buf = ( void* )( ( char* )buf + offset_size ); // Save the results in the pblk_t structure. 
bli_pblk_set_buf( buf, block ); bli_pblk_set_block_size( block_size, block ); } void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_free_block(): calling ffree_align(): size %d.\n", ( int )bli_pblk_block_size( block ) ); fflush( stdout ); #endif // Extract the pblk_t buffer, which is the aligned address returned from // bli_fmalloc_align() when the block was allocated. void* restrict buf = bli_pblk_buf( block ); // Undo the pointer advancement by offset_size bytes performed previously // by bli_pool_alloc_block(). buf = ( void* )( ( char* )buf - offset_size ); // Free the block via the bli_ffree_align() wrapper, which recovers the // original pointer that was returned by the pool's malloc() function when // the block was allocated. bli_ffree_align( free_fp, buf ); } void bli_pool_print ( pool_t* restrict pool ) { pblk_t* block_ptrs = bli_pool_block_ptrs( pool ); siz_t block_ptrs_len = bli_pool_block_ptrs_len( pool ); siz_t top_index = bli_pool_top_index( pool ); siz_t num_blocks = bli_pool_num_blocks( pool ); siz_t block_size = bli_pool_block_size( pool ); siz_t align_size = bli_pool_align_size( pool ); siz_t offset_size = bli_pool_offset_size( pool ); printf( "pool struct ---------------\n" ); printf( " block_ptrs: %p\n", block_ptrs ); printf( " block_ptrs_len: %d\n", ( int )block_ptrs_len ); printf( " top_index: %d\n", ( int )top_index ); printf( " num_blocks: %d\n", ( int )num_blocks ); printf( " block_size: %d\n", ( int )block_size ); printf( " align_size: %d\n", ( int )align_size ); printf( " offset_size: %d\n", ( int )offset_size ); printf( " pblks sys align\n" ); for ( dim_t i = 0; i < num_blocks; ++i ) { printf( " %d: %p\n", ( int )i, bli_pblk_buf( &block_ptrs[i] ) ); } } void bli_pblk_print ( pblk_t* restrict pblk ) { void* buf = bli_pblk_buf( pblk ); printf( "pblk struct ---------------\n" ); printf( " block address (aligned): %p\n", buf ); } 
cython-blis-0.9.1/blis/_src/frame/base/bli_pool.h000066400000000000000000000151061427272030600216150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_POOL_H
#define BLIS_POOL_H

// -- Pool block type --

/*
typedef struct
{
    void* buf;
    siz_t block_size;
} pblk_t;
*/

// -- Pool type --

// NOTE(review): the accessors below also read and write pool->offset_size,
// so the field is listed here; its exact position within the real struct
// definition should be confirmed against the type's declaration.

/*
typedef struct
{
    void* block_ptrs;
    siz_t block_ptrs_len;
    siz_t top_index;
    siz_t num_blocks;
    siz_t block_size;
    siz_t align_size;
    siz_t offset_size;
    malloc_ft malloc_fp;
    free_ft free_fp;
} pool_t;
*/

// Pool block query

BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk )
{
    return pblk->buf;
}

BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk )
{
    return pblk->block_size;
}

// Pool block modification

BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk )
{
    pblk->buf = buf;
}

BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk )
{
    pblk->block_size = block_size;
}

//
// -- pool block initialization ------------------------------------------------
//

// NOTE: This initializer macro must be updated whenever fields are added or
// removed from the pblk_t type definition. An alternative to the initializer is
// calling bli_pblk_clear() at runtime.
// The backslash after the closing brace continues the macro onto the line
// that follows it, so that line is kept blank.

#define BLIS_PBLK_INITIALIZER \
{ \
    .buf = NULL, \
    .block_size = 0, \
} \

BLIS_INLINE void bli_pblk_clear( pblk_t* pblk )
{
    bli_pblk_set_buf( NULL, pblk );
    bli_pblk_set_block_size( 0, pblk );
}

// Pool entry query

BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool )
{
    return pool->block_ptrs;
}

BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool )
{
    return pool->block_ptrs_len;
}

BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool )
{
    return pool->num_blocks;
}

BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool )
{
    return pool->block_size;
}

BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool )
{
    return pool->align_size;
}

BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool )
{
    return pool->offset_size;
}

BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool )
{
    return pool->malloc_fp;
}

BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool )
{
    return pool->free_fp;
}

BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool )
{
    return pool->top_index;
}

// Returns true when top_index equals num_blocks.
BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool )
{
    return ( bool )
           ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) );
}

// Pool entry modification

BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \
{
    pool->block_ptrs = block_ptrs;
}

BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \
{
    pool->block_ptrs_len = block_ptrs_len;
}

BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \
{
    pool->num_blocks = num_blocks;
}

BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \
{
    pool->block_size = block_size;
}

BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \
{
    pool->align_size = align_size;
}

BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \
{
    pool->offset_size = offset_size;
}

BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \
{
    pool->malloc_fp = malloc_fp;
}

BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* pool ) \
{
    pool->free_fp = free_fp;
}

BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \
{
    pool->top_index = top_index;
}

// -----------------------------------------------------------------------------

void bli_pool_init
     (
       siz_t            num_blocks,
       siz_t            block_ptrs_len,
       siz_t            block_size,
       siz_t            align_size,
       siz_t            offset_size,
       malloc_ft        malloc_fp,
       free_ft          free_fp,
       pool_t* restrict pool
     );
void bli_pool_finalize
     (
       pool_t* restrict pool
     );
void bli_pool_reinit
     (
       siz_t            num_blocks_new,
       siz_t            block_ptrs_len_new,
       siz_t            block_size_new,
       siz_t            align_size_new,
       siz_t            offset_size_new,
       pool_t* restrict pool
     );

void bli_pool_checkout_block
     (
       siz_t            req_size,
       pblk_t* restrict block,
       pool_t* restrict pool
     );
void bli_pool_checkin_block
     (
       pblk_t* restrict block,
       pool_t* restrict pool
     );

void bli_pool_grow
     (
       siz_t            num_blocks_add,
       pool_t* restrict pool
     );
void bli_pool_shrink
     (
       siz_t            num_blocks_sub,
       pool_t* restrict pool
     );

void bli_pool_alloc_block
     (
       siz_t            block_size,
       siz_t            align_size,
       siz_t            offset_size,
       malloc_ft        malloc_fp,
       pblk_t* restrict block
     );
void bli_pool_free_block
     (
       siz_t            offset_size,
       free_ft          free_fp,
       pblk_t* restrict block
     );

void bli_pool_print
     (
       pool_t* restrict pool
     );
void bli_pblk_print
     (
       pblk_t* restrict pblk
     );

#endif

cython-blis-0.9.1/blis/_src/frame/base/bli_prune.c000066400000000000000000000131041427272030600217640ustar00rootroot00000000000000/* BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ) { // If the primary object is general, it has no structure, and // therefore, no unreferenced parts. if ( bli_obj_is_general( p ) ) return; // If the primary object is BLIS_ZEROS, set the dimensions so that the // matrix is empty. This is not strictly needed but rather a minor // optimization, as it would prevent threads that would otherwise get // subproblems on BLIS_ZEROS operands from calling the macro-kernel, // because bli_thread_range*() would return empty ranges, which would // cause the variant's for loop from executing any iterations. // NOTE: this should only ever execute if the primary object is // triangular because that is the only structure type with subpartitions // that can be marked as BLIS_ZEROS. 
if ( bli_obj_is_triangular( p ) && bli_obj_is_zeros( p ) ) { bli_obj_set_dim( mdim_p, 0, p ); bli_obj_set_dim( mdim_s, 0, s ); return; } // If the primary object is hermitian, symmetric, or triangular, we // assume that the unstored region will be unreferenced (otherwise, // the caller should not be invoking this function on that object). //if ( bli_obj_is_herm_or_symm( p ) || // bli_obj_is_triangular( p ) ) { doff_t diagoff_p = bli_obj_diag_offset( p ); dim_t m = bli_obj_length( p ); dim_t n = bli_obj_width( p ); uplo_t uplo = bli_obj_uplo( p ); dim_t off_inc = 0; dim_t q; // Support implicit transposition on p and s. if ( bli_obj_has_trans( p ) ) { bli_reflect_about_diag( &diagoff_p, &uplo, &m, &n ); bli_toggle_dim( &mdim_p ); } if ( bli_obj_has_trans( s ) ) { bli_toggle_dim( &mdim_s ); } // Prune away any zero region of the matrix depending on the // dimension of the primary object being partitioned and the // triangle in which it is stored. if ( bli_obj_is_lower( p ) ) { if ( bli_is_m_dim( mdim_p ) ) { bli_prune_unstored_region_top_l( &diagoff_p, &m, &n, &off_inc ); } else // if ( bli_is_n_dim( mdim_p ) ) { bli_prune_unstored_region_right_l( &diagoff_p, &m, &n, &off_inc ); } } else if ( bli_obj_is_upper( p ) ) { if ( bli_is_m_dim( mdim_p ) ) { bli_prune_unstored_region_bottom_u( &diagoff_p, &m, &n, &off_inc ); } else // if ( bli_is_n_dim( mdim_p ) ) { bli_prune_unstored_region_left_u( &diagoff_p, &m, &n, &off_inc ); } } else if ( bli_obj_is_dense( p ) ) { // Hermitian, symmetric, and triangular matrices are almost // never dense, but if one were found to be dense, it would // have no unreferenced regions to prune. return; } else // if ( bli_obj_is_zeros( p ) ) { // Sanity check. Hermitian/symmetric matrices should never have // zero subpartitions. bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } // Select the (potentially modified) dimension along which we are // partitioning. 
if ( bli_is_m_dim( mdim_p ) ) q = m; else /* if ( bli_is_n_dim( mdim_p ) ) */ q = n; // Update the affected objects in case anything changed. Notice that // it is okay to update the dimension and diagonal offset fields of // packed primary objects, as long as we do so in tandem with the // secondary object to maintain conformality. This just means that // the "ignore-able" zero region is skipped over here, rather than // within the macro-kernel. bli_obj_set_diag_offset( diagoff_p, p ); bli_obj_set_dim( mdim_p, q, p ); bli_obj_set_dim( mdim_s, q, s ); // Only update the affected offset fields if the object in question // is NOT a packed object. Otherwise, bli_obj_buffer_at_off() will // compute the wrong address within the macro-kernel object wrapper. if ( !bli_obj_is_packed( p ) ) { bli_obj_inc_off( mdim_p, off_inc, p ); } if ( !bli_obj_is_packed( s ) ) { bli_obj_inc_off( mdim_s, off_inc, s ); } } } cython-blis-0.9.1/blis/_src/frame/base/bli_prune.h000066400000000000000000000033671427272030600220030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ); cython-blis-0.9.1/blis/_src/frame/base/bli_query.c000066400000000000000000000134501427272030600220040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" bool bli_obj_equals( obj_t* a, obj_t* b ) { #if 0 bool r_val = FALSE; num_t dt_a; num_t dt_b; num_t dt; // The function is not yet implemented for vectors and matrices. if ( !bli_obj_is_1x1( a ) || !bli_obj_is_1x1( b ) ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); dt_a = bli_obj_dt( a ); dt_b = bli_obj_dt( b ); // If B is BLIS_CONSTANT, then we need to test equality based on the // datatype of A--this works even if A is also BLIS_CONSTANT. If B // is a regular non-constant type, then we should use its datatype // to test equality. if ( dt_b == BLIS_CONSTANT ) dt = dt_a; else dt = dt_b; // Now test equality based on the chosen datatype. if ( dt == BLIS_CONSTANT ) { dcomplex* ap_z = bli_obj_buffer_for_const( BLIS_DCOMPLEX, a ); dcomplex* bp_z = bli_obj_buffer_for_const( BLIS_DCOMPLEX, b ); // We only test equality for one datatype (double complex) since // we expect either all fields within the constant to be equal or // none to be equal. Therefore, we can just test one of them. 
r_val = bli_zeqa( ap_z, bp_z ); } else { void* buf_a = bli_obj_buffer_for_1x1( dt, a ); void* buf_b = bli_obj_buffer_for_1x1( dt, b ); if ( dt == BLIS_FLOAT ) r_val = bli_seqa( buf_a, buf_b ); else if ( dt == BLIS_DOUBLE ) r_val = bli_deqa( buf_a, buf_b ); else if ( dt == BLIS_SCOMPLEX ) r_val = bli_ceqa( buf_a, buf_b ); else if ( dt == BLIS_DCOMPLEX ) r_val = bli_zeqa( buf_a, buf_b ); else if ( dt == BLIS_INT ) r_val = bli_ieqa( buf_a, buf_b ); } return r_val; #else bool r_val; if ( bli_obj_is_1x1( a ) && bli_obj_is_1x1( b ) ) bli_eqsc( a, b, &r_val ); else if ( bli_obj_is_vector( a ) && bli_obj_is_vector( b ) ) bli_eqv( a, b, &r_val ); else bli_eqm( a, b, &r_val ); return r_val; #endif } bool bli_obj_imag_equals( obj_t* a, obj_t* b ) { #if 0 bool r_val = FALSE; num_t dt_a; num_t dt_b; dt_a = bli_obj_dt( a ); dt_b = bli_obj_dt( b ); // The function is not yet implemented for vectors and matrices. if ( !bli_obj_is_1x1( a ) || !bli_obj_is_1x1( b ) || bli_is_constant( dt_a ) || bli_is_complex( dt_b ) ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); // Handle the special (trivial) case where a is real, in which // case all we have to do is test whether b is zero. if ( bli_is_real( dt_a ) ) { r_val = bli_obj_equals( &BLIS_ZERO, b ); } else // if ( bli_is_complex( dt_a ) ) { num_t dt_a_real = bli_dt_proj_to_real( dt_a ); // Now we compare the imaginary part of a to b. Notice that since // we are using bli_obj_buffer_for_1x1() to acquire the buffer for // b, this works regardless of whether b is BLIS_CONSTANT. if ( dt_a == BLIS_SCOMPLEX ) { scomplex* ap_c = bli_obj_buffer_at_off( a ); float* bp_c = bli_obj_buffer_for_1x1( dt_a_real, b ); r_val = bli_seq( bli_cimag( *ap_c ), *bp_c ); } else if ( dt_a == BLIS_DCOMPLEX ) { dcomplex* ap_z = bli_obj_buffer_at_off( a ); double* bp_z = bli_obj_buffer_for_1x1( dt_a_real, b ); r_val = bli_deq( bli_zimag( *ap_z ), *bp_z ); } } #endif bool r_val = FALSE; // The function is not yet implemented for vectors and matrices. 
if ( !bli_obj_is_1x1( a ) || !bli_obj_is_1x1( b ) || bli_obj_is_complex( b ) ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); double a_r, a_i; double b_r, b_i; // Get the real and imaginary parts of a and cast them to local doubles. bli_getsc( a, &a_r, &a_i ); // Get the value of b and cast to a local double. (Note: the imaginary part // of b is ignored since we know b is real.) bli_getsc( b, &b_r, &b_i ); // Compare the imaginary part of a to the real part of b. if ( a_i == b_r ) r_val = TRUE; return r_val; } bool bli_obj_imag_is_zero( obj_t* a ) { bool r_val = TRUE; // The function is not yet implemented for vectors and matrices. if ( !bli_obj_is_1x1( a ) ) bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); if ( bli_obj_is_complex( a ) ) { double a_r, a_i; // Get the real and imaginary parts and cast them to local doubles. bli_getsc( a, &a_r, &a_i ); // Compare the imaginary part of a to double-precision zero. if ( !bli_deq0( a_i ) ) r_val = FALSE; } return r_val; } cython-blis-0.9.1/blis/_src/frame/base/bli_query.h000066400000000000000000000035001427272030600220040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a ); cython-blis-0.9.1/blis/_src/frame/base/bli_rntm.c000066400000000000000000000346001427272030600216170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

// The global rntm_t structure, which holds the global thread settings
// along with a few other key parameters.
rntm_t global_rntm;

// A mutex to allow synchronous access to global_rntm.
bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;

// ----------------------------------------------------------------------------

// Copy the contents of global_rntm into *rntm. bli_init_once() is called
// first, and the copy is made while holding global_rntm_mutex.
void bli_rntm_init_from_global( rntm_t* rntm )
{
    // We must ensure that global_rntm has been initialized.
    bli_init_once();

    // Acquire the mutex protecting global_rntm.
    bli_pthread_mutex_lock( &global_rntm_mutex );

    *rntm = global_rntm;

    // Release the mutex protecting global_rntm.
    bli_pthread_mutex_unlock( &global_rntm_mutex );
}

// -----------------------------------------------------------------------------

// Fill in the per-loop ways of parallelism in rntm via
// bli_rntm_set_ways_from_rntm(), then, for BLIS_TRMM and BLIS_TRSM only,
// redistribute those ways among the loops depending on side. Other
// operation ids leave the ways as computed.
void bli_rntm_set_ways_for_op
     (
       opid_t  l3_op,
       side_t  side,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       rntm_t* rntm
     )
{
    // Set the number of ways for each loop, if needed, depending on what
    // kind of information is already stored in the rntm_t object.
    bli_rntm_set_ways_from_rntm( m, n, k, rntm );

#if 0
printf( "bli_rntm_set_ways_for_op()\n" );
bli_rntm_print( rntm );
#endif

    // Now modify the number of ways, if necessary, based on the operation.
    if ( l3_op == BLIS_TRMM ||
         l3_op == BLIS_TRSM )
    {
        dim_t jc = bli_rntm_jc_ways( rntm );
        dim_t pc = bli_rntm_pc_ways( rntm );
        dim_t ic = bli_rntm_ic_ways( rntm );
        dim_t jr = bli_rntm_jr_ways( rntm );
        dim_t ir = bli_rntm_ir_ways( rntm );

        // Notice that, if we do need to update the ways, we don't need to
        // update the num_threads field since we only reshuffle where the
        // parallelism is extracted, not the total amount of parallelism.

        if ( l3_op == BLIS_TRMM )
        {
            // We reconfigure the parallelism extracted from trmm_r due to a
            // dependency in the jc loop. (NOTE: This dependency does not exist
            // for trmm3.)
            if ( bli_is_left( side ) )
            {
                bli_rntm_set_ways_only
                (
                  jc,
                  pc,
                  ic,
                  jr,
                  ir,
                  rntm
                );
            }
            else // if ( bli_is_right( side ) )
            {
                bli_rntm_set_ways_only
                (
                  1,
                  pc,
                  ic,
                  jr * jc,
                  ir,
                  rntm
                );
            }
        }
        else if ( l3_op == BLIS_TRSM )
        {
//printf( "bli_rntm_set_ways_for_op(): jc%d ic%d jr%d\n", (int)jc, (int)ic, (int)jr );
            if ( bli_is_left( side ) )
            {
                bli_rntm_set_ways_only
                (
                  jc,
                  1,
                  ic * pc,
                  jr * ir,
                  1,
                  rntm
                );
            }
            else // if ( bli_is_right( side ) )
            {
                bli_rntm_set_ways_only
                (
                  1,
                  1,
                  ic * pc * jc * ir * jr,
                  1,
                  1,
                  rntm
                );
            }
        }
    }
}

// Bring the num_threads and per-loop ways fields of rntm into a consistent
// state and store them back, along with the auto_factor flag. m and n are
// passed (scaled) to bli_thread_partition_2x2() when a factorization is
// generated automatically; k is not referenced in this function body.
void bli_rntm_set_ways_from_rntm
     (
       dim_t   m,
       dim_t   n,
       dim_t   k,
       rntm_t* rntm
     )
{
    dim_t nt = bli_rntm_num_threads( rntm );

    dim_t jc = bli_rntm_jc_ways( rntm );
    dim_t pc = bli_rntm_pc_ways( rntm );
    dim_t ic = bli_rntm_ic_ways( rntm );
    dim_t jr = bli_rntm_jr_ways( rntm );
    dim_t ir = bli_rntm_ir_ways( rntm );

    bool auto_factor = FALSE;

#ifdef BLIS_ENABLE_MULTITHREADING

    bool nt_set   = FALSE;
    bool ways_set = FALSE;

    // If the rntm was fed in as a copy of the global runtime via
    // bli_rntm_init_from_global(), we know that either:
    // - the num_threads field is -1 and all of the ways are -1;
    // - the num_threads field is -1 and all of the ways are set;
    // - the num_threads field is set and all of the ways are -1.
    // However, we can't be sure that a user-provided rntm_t isn't
    // initialized uncleanly. So here we have to enforce some rules
    // to get the rntm_t into a predictable state.

    // First, we establish whether or not the number of threads is set.
    if ( nt > 0 ) nt_set = TRUE;

    // Take this opportunity to set the auto_factor field.
    if ( nt_set ) auto_factor = TRUE;

    // Next, we establish whether or not any of the ways of parallelism
    // for each loop were set. If any of the ways are set (positive),
    // then we assume the user wanted to use those positive values and
    // default the non-positive values to 1.
    if ( jc > 0 || pc > 0 || ic > 0 || jr > 0 || ir > 0 )
    {
        ways_set = TRUE;

        if ( jc < 1 ) jc = 1;
        if ( pc < 1 ) pc = 1;
        if ( ic < 1 ) ic = 1;
        if ( jr < 1 ) jr = 1;
        if ( ir < 1 ) ir = 1;
    }

    // Now we use the values of nt_set and ways_set to determine how to
    // interpret the original values we found in the rntm_t object.

    if ( ways_set == TRUE )
    {
        // If the ways were set, then we use the values that were given
        // and interpreted above (we set any non-positive value to 1).
        // The only thing left to do is calculate the correct number of
        // threads.

        nt = jc * pc * ic * jr * ir;
    }
    else if ( ways_set == FALSE && nt_set == TRUE )
    {
        // If the ways were not set but the number of threads was set, then
        // we attempt to automatically generate a thread factorization that
        // will work given the problem size.

#ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
        // If use of prime numbers is disallowed for automatic thread
        // factorizations, we first check if the number of threads requested
        // is prime. If it is prime, and it exceeds a minimum threshold, then
        // we reduce the number of threads by one so that the number is not
        // prime. This will allow for automatic thread factorizations to span
        // two dimensions (loops), which tends to be more efficient.
        if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1;
#endif

        pc = 1;

        //printf( "m n = %d %d BLIS_THREAD_RATIO_M _N = %d %d\n", (int)m, (int)n, (int)BLIS_THREAD_RATIO_M, (int)BLIS_THREAD_RATIO_N );

        bli_thread_partition_2x2( nt, m*BLIS_THREAD_RATIO_M,
                                      n*BLIS_THREAD_RATIO_N, &ic, &jc );

        //printf( "jc ic = %d %d\n", (int)jc, (int)ic );

        // Split off the largest factor of ic, from BLIS_THREAD_MAX_IR down
        // to 2, into ir; ir ends at 1 if none divides ic.
        for ( ir = BLIS_THREAD_MAX_IR ; ir > 1 ; ir-- )
        {
            if ( ic % ir == 0 ) { ic /= ir; break; }
        }

        // Likewise split a factor of jc into jr.
        for ( jr = BLIS_THREAD_MAX_JR ; jr > 1 ; jr-- )
        {
            if ( jc % jr == 0 ) { jc /= jr; break; }
        }
    }
    else // if ( ways_set == FALSE && nt_set == FALSE )
    {
        // If neither the ways nor the number of threads were set, then
        // the rntm was not meaningfully changed since initialization,
        // and thus we'll default to single-threaded execution.

        nt = 1;
        jc = pc = ic = jr = ir = 1;
    }

#else

    // When multithreading is disabled, always set the rntm_t ways
    // values to 1.
    nt = 1;
    jc = pc = ic = jr = ir = 1;

#endif

    // Save the results back in the runtime object.
    bli_rntm_set_auto_factor_only( auto_factor, rntm );
    bli_rntm_set_num_threads_only( nt, rntm );
    bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
}

void bli_rntm_set_ways_from_rntm_sup
     (
       dim_t   m,
       dim_t   n,
       dim_t   k,
       rntm_t* rntm
     )
{
    dim_t nt = bli_rntm_num_threads( rntm );

    dim_t jc = bli_rntm_jc_ways( rntm );
    dim_t pc = bli_rntm_pc_ways( rntm );
    dim_t ic = bli_rntm_ic_ways( rntm );
    dim_t jr = bli_rntm_jr_ways( rntm );
    dim_t ir = bli_rntm_ir_ways( rntm );

    bool auto_factor = FALSE;

#ifdef BLIS_ENABLE_MULTITHREADING

    bool nt_set   = FALSE;
    bool ways_set = FALSE;

    // If the rntm was fed in as a copy of the global runtime via
    // bli_rntm_init_from_global(), we know that either:
    // - the num_threads field is -1 and all of the ways are -1;
    // - the num_threads field is -1 and all of the ways are set;
    // - the num_threads field is set and all of the ways are -1.
    // However, we can't be sure that a user-provided rntm_t isn't
    // initialized uncleanly.
So here we have to enforce some rules // to get the rntm_t into a predictable state. // First, we establish whether or not the number of threads is set. if ( nt > 0 ) nt_set = TRUE; // Take this opportunity to set the auto_factor field. if ( nt_set ) auto_factor = TRUE; // Next, we establish whether or not any of the ways of parallelism // for each loop were set. If any of the ways are set (positive), we // then we assume the user wanted to use those positive values and // default the non-positive values to 1. if ( jc > 0 || pc > 0 || ic > 0 || jr > 0 || ir > 0 ) { ways_set = TRUE; if ( jc < 1 ) jc = 1; if ( pc < 1 ) pc = 1; if ( ic < 1 ) ic = 1; if ( jr < 1 ) jr = 1; if ( ir < 1 ) ir = 1; } // Now we use the values of nt_set and ways_set to determine how to // interpret the original values we found in the rntm_t object. if ( ways_set == TRUE ) { // If the ways were set, then we use the values that were given // and interpreted above (we set any non-positive value to 1). // The only thing left to do is calculate the correct number of // threads. nt = jc * pc * ic * jr * ir; } else if ( ways_set == FALSE && nt_set == TRUE ) { // If the ways were not set but the number of thread was set, then // we attempt to automatically generate a thread factorization that // will work given the problem size. #ifdef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // If use of prime numbers is disallowed for automatic thread // factorizations, we first check if the number of threads requested // is prime. If it is prime, and it exceeds a minimum threshold, then // we reduce the number of threads by one so that the number is not // prime. This will allow for automatic thread factorizations to span // two dimensions (loops), which tends to be more efficient. 
if ( bli_is_prime( nt ) && BLIS_NT_MAX_PRIME < nt ) nt -= 1; #endif pc = 1; //bli_thread_partition_2x2( nt, m*BLIS_THREAD_SUP_RATIO_M, // n*BLIS_THREAD_SUP_RATIO_N, &ic, &jc ); bli_thread_partition_2x2( nt, m, n, &ic, &jc ); //printf( "bli_rntm_set_ways_from_rntm_sup(): jc = %d ic = %d\n", (int)jc, (int)ic ); #if 0 for ( ir = BLIS_THREAD_SUP_MAX_IR ; ir > 1 ; ir-- ) { if ( ic % ir == 0 ) { ic /= ir; break; } } for ( jr = BLIS_THREAD_SUP_MAX_JR ; jr > 1 ; jr-- ) { if ( jc % jr == 0 ) { jc /= jr; break; } } #else ir = 1; jr = 1; #endif } else // if ( ways_set == FALSE && nt_set == FALSE ) { // If neither the ways nor the number of threads were set, then // the rntm was not meaningfully changed since initialization, // and thus we'll default to single-threaded execution. nt = 1; jc = pc = ic = jr = ir = 1; } #else // When multithreading is disabled, always set the rntm_t ways // values to 1. nt = 1; jc = pc = ic = jr = ir = 1; #endif // Save the results back in the runtime object. bli_rntm_set_auto_factor_only( auto_factor, rntm ); bli_rntm_set_num_threads_only( nt, rntm ); bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm ); } void bli_rntm_print ( rntm_t* rntm ) { dim_t af = bli_rntm_auto_factor( rntm ); dim_t nt = bli_rntm_num_threads( rntm ); dim_t jc = bli_rntm_jc_ways( rntm ); dim_t pc = bli_rntm_pc_ways( rntm ); dim_t ic = bli_rntm_ic_ways( rntm ); dim_t jr = bli_rntm_jr_ways( rntm ); dim_t ir = bli_rntm_ir_ways( rntm ); printf( "rntm contents nt jc pc ic jr ir\n" ); printf( "autofac? 
%1d | %4d%4d%4d%4d%4d%4d\n", (int)af, (int)nt, (int)jc, (int)pc, (int)ic, (int)jr, (int)ir ); } // ----------------------------------------------------------------------------- dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ) { /* // bp algorithm: bszid_t bszids[7] = { BLIS_NC, // level 0: 5th loop BLIS_KC, // level 1: 4th loop BLIS_NO_PART, // level 2: pack B BLIS_MC, // level 3: 3rd loop BLIS_NO_PART, // level 4: pack A BLIS_NR, // level 5: 2nd loop BLIS_MR, // level 6: 1st loop BLIS_KR // level 7: ukr loop ... // pb algorithm: BLIS_NR, // level 5: 2nd loop BLIS_MR, // level 6: 1st loop BLIS_KR // level 7: ukr loop }; */ dim_t n_threads_in = 1; // Starting with the current element of the bszids array (pointed // to by bszid_cur), multiply all of the corresponding ways of // parallelism. for ( ; *bszid_cur != BLIS_KR; bszid_cur++ ) { const bszid_t bszid = *bszid_cur; //if ( bszid == BLIS_KR ) break; // We assume bszid is in {NC,KC,MC,NR,MR,KR} if it is not // BLIS_NO_PART. if ( bszid != BLIS_NO_PART ) { const dim_t cur_way = bli_rntm_ways_for( bszid, rntm ); n_threads_in *= cur_way; } } return n_threads_in; } #if 0 for ( ; *bszid_cur != BLIS_KR; bszid_cur++ ) { const bszid_t bszid = *bszid_cur; dim_t cur_way = 1; // We assume bszid is in {NC,KC,MC,NR,MR,KR} if it is not // BLIS_NO_PART. if ( bszid != BLIS_NO_PART ) cur_way = bli_rntm_ways_for( bszid, rntm ); else cur_way = 1; n_threads_in *= cur_way; } #endif cython-blis-0.9.1/blis/_src/frame/base/bli_rntm.h000066400000000000000000000245401427272030600216260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) /* typedef struct rntm_s { bool auto_factor; dim_t num_threads; dim_t* thrloop; bool pack_a; bool pack_b; bool l3_sup; pool_t* sba_pool; pba_t* pba; } rntm_t; */ // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic 
= bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif cython-blis-0.9.1/blis/_src/frame/base/bli_sba.c000066400000000000000000000134321427272030600214040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Statically initialize the mutex within the small block allocator. // Note that the sba is an apool_t of array_t of pool_t. 
static apool_t sba = { .mutex = BLIS_PTHREAD_MUTEX_INITIALIZER }; apool_t* bli_sba_query( void ) { return &sba; } // ----------------------------------------------------------------------------- void bli_sba_init( void ) { bli_apool_init( &sba ); } void bli_sba_finalize( void ) { bli_apool_finalize( &sba ); } void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size ) { void* block; err_t r_val; #ifdef BLIS_ENABLE_SBA_POOLS if ( rntm == NULL ) { block = bli_malloc_intl( req_size, &r_val ); } else { pblk_t pblk; // Query the small block pool from the rntm. pool_t* restrict pool = bli_rntm_sba_pool( rntm ); // We don't expect NULL sba_pool pointers in the normal course of BLIS // operation. However, there are rare instances where it is convenient // to support use of bli_sba_acquire() without having to pass in a valid // sba pool data structure. The case that inspired this branch was the // gemm_ukr and related test modules in the BLIS testsuite. (There, it // is convenient to not have to checkout an array_t from the sba, and it // does no harm since the malloc() happens outside of the region that // would be timed.) if ( pool == NULL ) { block = bli_malloc_intl( req_size, &r_val ); } else { // Query the block_size of the pool_t so that we can request the exact // size present. const siz_t block_size = bli_pool_block_size( pool ); // Sanity check: Make sure the requested size is no larger than the // block_size field of the pool. if ( block_size < req_size ) { printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", ( int )block_size, ( int )req_size ); bli_abort(); } // Check out a block using the block_size queried above. bli_pool_checkout_block( block_size, &pblk, pool ); // The block address is stored within the pblk_t. block = bli_pblk_buf( &pblk ); } } #else block = bli_malloc_intl( req_size, &r_val ); #endif // Return the address obtained from the pblk_t. 
return block; } void bli_sba_release ( rntm_t* restrict rntm, void* restrict block ) { #ifdef BLIS_ENABLE_SBA_POOLS if ( rntm == NULL ) { bli_free_intl( block ); } else { pblk_t pblk; // Query the small block pool from the rntm. pool_t* restrict pool = bli_rntm_sba_pool( rntm ); if ( pool == NULL ) { bli_free_intl( block ); } else { // Query the block_size field from the pool. This is not super-important // for this particular application of the pool_t (that is, the "leaf" // component of the sba), but it seems like good housekeeping to maintain // the block_size field of the pblk_t in case its ever needed/read. const siz_t block_size = bli_pool_block_size( pool ); // Embed the block's memory address into a pblk_t, along with the // block_size queried from the pool. bli_pblk_set_buf( block, &pblk ); bli_pblk_set_block_size( block_size, &pblk ); // Check the pblk_t back into the pool_t. (It's okay that the pblk_t is // a local variable since its contents are copied into the pool's internal // data structure--an array of pblk_t.) bli_pool_checkin_block( &pblk, pool ); } } #else bli_free_intl( block ); #endif } array_t* bli_sba_checkout_array ( const siz_t n_threads ) { #ifndef BLIS_ENABLE_SBA_POOLS return NULL; #endif return bli_apool_checkout_array( n_threads, &sba ); } void bli_sba_checkin_array ( array_t* restrict array ) { #ifndef BLIS_ENABLE_SBA_POOLS return; #endif bli_apool_checkin_array( array, &sba ); } void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm ) { #ifndef BLIS_ENABLE_SBA_POOLS bli_rntm_set_sba_pool( NULL, rntm ); return; #endif // Query the pool_t* in the array_t corresponding to index. pool_t* restrict pool = bli_apool_array_elem( index, array ); // Embed the pool_t* into the rntm_t. 
bli_rntm_set_sba_pool( pool, rntm ); } cython-blis-0.9.1/blis/_src/frame/base/bli_sba.h000066400000000000000000000045161427272030600214140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_SBA_H #define BLIS_SBA_H apool_t* bli_sba_query( void ); // ----------------------------------------------------------------------------- void bli_sba_init( void ); void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( const siz_t n_threads ); void bli_sba_checkin_array ( array_t* restrict array ); void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm ); void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size ); void bli_sba_release ( rntm_t* restrict rntm, void* restrict block ); #endif cython-blis-0.9.1/blis/_src/frame/base/bli_setgetijm.c000066400000000000000000000113271427272030600226330ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" typedef void (*setijm_fp) ( double ar, double ai, dim_t i, dim_t j, void* restrict b, inc_t rs, inc_t cs ); static setijm_fp GENARRAY(ftypes_setijm,setijm); err_t bli_setijm ( double ar, double ai, dim_t i, dim_t j, obj_t* b ) { dim_t m = bli_obj_length( b ); dim_t n = bli_obj_width( b ); dim_t rs = bli_obj_row_stride( b ); dim_t cs = bli_obj_col_stride( b ); num_t dt = bli_obj_dt( b ); // Return error if i or j is beyond bounds of the matrix/vector. if ( i < 0 || m <= i ) return BLIS_FAILURE; if ( j < 0 || n <= j ) return BLIS_FAILURE; // Don't modify scalar constants. if ( dt == BLIS_CONSTANT ) return BLIS_FAILURE; // Query the pointer to the buffer at the adjusted offsets. void* b_p = bli_obj_buffer_at_off( b ); // Index into the function pointer array. setijm_fp f = ftypes_setijm[ dt ]; // Invoke the type-specific function. 
f ( ar, ai, i, j, b_p, rs, cs ); return BLIS_SUCCESS; } #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs \ ) \ { \ ctype* restrict b_cast = ( ctype* )b; \ \ ctype* restrict b_ij = b_cast + (i )*rs + (j )*cs; \ \ PASTEMAC2(z,ch,sets)( ar, ai, *b_ij ); \ } INSERT_GENTFUNC_BASIC0( setijm ) // ----------------------------------------------------------------------------- typedef void (*getijm_fp) ( dim_t i, dim_t j, void* restrict b, inc_t rs, inc_t cs, double* ar, double* ai ); static getijm_fp GENARRAY(ftypes_getijm,getijm); err_t bli_getijm ( dim_t i, dim_t j, obj_t* b, double* ar, double* ai ) { dim_t m = bli_obj_length( b ); dim_t n = bli_obj_width( b ); dim_t rs = bli_obj_row_stride( b ); dim_t cs = bli_obj_col_stride( b ); num_t dt = bli_obj_dt( b ); // Return error if i or j is beyond bounds of the matrix/vector. if ( i < 0 || m <= i ) return BLIS_FAILURE; if ( j < 0 || n <= j ) return BLIS_FAILURE; // Disallow access into scalar constants. if ( dt == BLIS_CONSTANT ) return BLIS_FAILURE; // Query the pointer to the buffer at the adjusted offsets. void* b_p = bli_obj_buffer_at_off( b ); // Index into the function pointer array. getijm_fp f = ftypes_getijm[ dt ]; // Invoke the type-specific function. f ( i, j, b_p, rs, cs, ar, ai ); return BLIS_SUCCESS; } #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs, \ double* ar, \ double* ai \ ) \ { \ ctype* restrict b_cast = ( ctype* )b; \ \ ctype* restrict b_ij = b_cast + (i )*rs + (j )*cs; \ \ PASTEMAC2(ch,z,gets)( *b_ij, *ar, *ai ); \ } INSERT_GENTFUNC_BASIC0( getijm ) cython-blis-0.9.1/blis/_src/frame/base/bli_setgetijm.h000066400000000000000000000051541427272030600226410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ BLIS_EXPORT_BLIS err_t bli_setijm ( double ar, double ai, dim_t i, dim_t j, obj_t* b ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs \ ); INSERT_GENTPROT_BASIC0( setijm ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijm ( dim_t i, dim_t j, obj_t* b, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijm ) cython-blis-0.9.1/blis/_src/frame/base/bli_setgetijv.c000066400000000000000000000104341427272030600226420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" typedef void (*setijv_fp) ( double ar, double ai, dim_t i, void* restrict x, inc_t incx ); static setijv_fp GENARRAY(ftypes_setijv,setijv); err_t bli_setijv ( double ar, double ai, dim_t i, obj_t* x ) { dim_t n = bli_obj_vector_dim( x ); dim_t incx = bli_obj_vector_inc( x ); num_t dt = bli_obj_dt( x ); // Return error if i is beyond bounds of the vector. if ( i < 0 || n <= i ) return BLIS_FAILURE; // Don't modify scalar constants. if ( dt == BLIS_CONSTANT ) return BLIS_FAILURE; // Query the pointer to the buffer at the adjusted offsets. void* x_p = bli_obj_buffer_at_off( x ); // Index into the function pointer array. setijv_fp f = ftypes_setijv[ dt ]; // Invoke the type-specific function. 
f ( ar, ai, i, x_p, incx ); return BLIS_SUCCESS; } #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ void* restrict x, inc_t incx \ ) \ { \ ctype* restrict x_cast = ( ctype* )x; \ \ ctype* restrict x_i = x_cast + (i )*incx; \ \ PASTEMAC2(z,ch,sets)( ar, ai, *x_i ); \ } INSERT_GENTFUNC_BASIC0( setijv ) // ----------------------------------------------------------------------------- typedef void (*getijv_fp) ( dim_t i, void* restrict x, inc_t incx, double* ar, double* ai ); static getijv_fp GENARRAY(ftypes_getijv,getijv); err_t bli_getijv ( dim_t i, obj_t* x, double* ar, double* ai ) { dim_t n = bli_obj_vector_dim( x ); dim_t incx = bli_obj_vector_inc( x ); num_t dt = bli_obj_dt( x ); // Return error if i is beyond bounds of the vector. if ( i < 0 || n <= i ) return BLIS_FAILURE; // Disallow access into scalar constants. if ( dt == BLIS_CONSTANT ) return BLIS_FAILURE; // Query the pointer to the buffer at the adjusted offsets. void* x_p = bli_obj_buffer_at_off( x ); // Index into the function pointer array. getijv_fp f = ftypes_getijv[ dt ]; // Invoke the type-specific function. f ( i, x_p, incx, ar, ai ); return BLIS_SUCCESS; } #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t i, \ void* restrict x, inc_t incx, \ double* ar, \ double* ai \ ) \ { \ ctype* restrict x_cast = ( ctype* )x; \ \ ctype* restrict x_i = x_cast + (i )*incx; \ \ PASTEMAC2(ch,z,gets)( *x_i, *ar, *ai ); \ } INSERT_GENTFUNC_BASIC0( getijv ) cython-blis-0.9.1/blis/_src/frame/base/bli_setgetijv.h000066400000000000000000000050011427272030600226410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ BLIS_EXPORT_BLIS err_t bli_setijv ( double ar, double ai, dim_t i, obj_t* x ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ void* restrict x, inc_t incx \ ); INSERT_GENTPROT_BASIC0( setijv ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijv ( dim_t i, obj_t* x, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ void* restrict b, inc_t incx, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijv ) cython-blis-0.9.1/blis/_src/frame/base/bli_setri.c000066400000000000000000000113361427272030600217660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // -- setr --------------------------------------------------------------------- void bli_setrm ( obj_t* alpha, obj_t* b ) { obj_t alpha_real; obj_t br; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_setm_check( alpha, b ); // Initialize a local scalar, alpha_real, using the real projection // of the datatype of b. bli_obj_scalar_init_detached( bli_obj_dt_proj_to_real( b ), &alpha_real ); // Copy/typecast alpha to alpha_real. This discards the imaginary // part of alpha (if it is complex). bli_copysc( alpha, &alpha_real ); // Acquire an alias to the real part of b. bli_obj_real_part( b, &br ); // Use setm to set the real part of b to alpha_real. bli_setm( &alpha_real, &br ); } void bli_setrv ( obj_t* alpha, obj_t* x ) { obj_t alpha_real; obj_t xr; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_setv_check( alpha, x ); // Initialize a local scalar, alpha_real, using the real projection // of the datatype of x. bli_obj_scalar_init_detached( bli_obj_dt_proj_to_real( x ), &alpha_real ); // Copy/typecast alpha to alpha_real. This discards the imaginary // part of alpha (if it is complex). bli_copysc( alpha, &alpha_real ); // Acquire an alias to the real part of x. bli_obj_real_part( x, &xr ); // Use setv to set the real part of x to alpha_real. 
bli_setv( &alpha_real, &xr ); } // -- seti --------------------------------------------------------------------- void bli_setim ( obj_t* alpha, obj_t* b ) { obj_t alpha_real; obj_t bi; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_setm_check( alpha, b ); // If the object is real, return early. if ( bli_obj_is_real( b ) ) return; // Initialize a local scalar, alpha_real, using the real projection // of the datatype of b. bli_obj_scalar_init_detached( bli_obj_dt_proj_to_real( b ), &alpha_real ); // Copy/typecast alpha to alpha_real. This discards the imaginary // part of alpha (if it is complex). bli_copysc( alpha, &alpha_real ); // Acquire an alias to the imaginary part of b. bli_obj_imag_part( b, &bi ); // Use setm to set the imaginary part of b to alpha_real. bli_setm( &alpha_real, &bi ); } void bli_setiv ( obj_t* alpha, obj_t* x ) { obj_t alpha_real; obj_t xi; // Check parameters. if ( bli_error_checking_is_enabled() ) bli_setv_check( alpha, x ); // If the object is real, return early. if ( bli_obj_is_real( x ) ) return; // Initialize a local scalar, alpha_real, using the real projection // of the datatype of x. bli_obj_scalar_init_detached( bli_obj_dt_proj_to_real( x ), &alpha_real ); // Copy/typecast alpha to alpha_real. This discards the imaginary // part of alpha (if it is complex). bli_copysc( alpha, &alpha_real ); // Acquire an alias to the imaginary part of x. bli_obj_imag_part( x, &xi ); // Use setm to set the imaginary part of x to alpha_real. bli_setm( &alpha_real, &xi ); } cython-blis-0.9.1/blis/_src/frame/base/bli_setri.h000066400000000000000000000042011427272030600217640ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

// -- setr ---------------------------------------------------------------------

// Set the real part of every element of matrix b to alpha (the imaginary
// part of alpha, if any, is discarded).
BLIS_EXPORT_BLIS void bli_setrm
     (
       obj_t* alpha,
       obj_t* b
     );

// Set the real part of every element of vector x to alpha (the imaginary
// part of alpha, if any, is discarded).
BLIS_EXPORT_BLIS void bli_setrv
     (
       obj_t* alpha,
       obj_t* x
     );

// -- seti ---------------------------------------------------------------------

// Set the imaginary part of every element of matrix b to alpha (the
// imaginary part of alpha, if any, is discarded). Returns early, leaving b
// unmodified, when b is real.
BLIS_EXPORT_BLIS void bli_setim
     (
       obj_t* alpha,
       obj_t* b
     );

// Set the imaginary part of every element of vector x to alpha (the
// imaginary part of alpha, if any, is discarded). Returns early, leaving x
// unmodified, when x is real.
BLIS_EXPORT_BLIS void bli_setiv
     (
       obj_t* alpha,
       obj_t* x
     );
cython-blis-0.9.1/blis/_src/frame/base/bli_string.c000066400000000000000000000035621427272030600221500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_string_mkupper( char* s ) { // Convert the string to uppercase. for ( ; *s != '\0'; s++ ) { // Convert to unsigned in case one of the chars is negative. *s = toupper( ( unsigned char ) *s ); } } cython-blis-0.9.1/blis/_src/frame/base/bli_string.h000066400000000000000000000032551427272030600221540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

// Convert the NUL-terminated string s to uppercase, in place.
void bli_string_mkupper( char* s );
cython-blis-0.9.1/blis/_src/frame/base/bli_winsys.c000066400000000000000000000045111427272030600221710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef _MSC_VER #include #else #include #endif #if 0 // NOTE: This function is no longer needed by BLIS since BLIS no longer // makes any attempt to change environment variables; rather, it only // reads them. We can keep it here for some time before removing it, // though. int bli_setenv( const char *name, const char *value, int overwrite ) { #ifdef _MSC_VER // Windows. _putenv_s( name, value ); #else // Everything else: Linux, OS X, etc. setenv( name, value, overwrite ); #endif } #endif void bli_sleep( unsigned int secs ) { #ifdef _MSC_VER // Windows. Sleep( secs * 1000 ); #else // Everything else: Linux, OS X, etc. sleep( secs ); #endif } cython-blis-0.9.1/blis/_src/frame/base/bli_winsys.h000066400000000000000000000034111427272030600221740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

// Prototype kept commented out: the corresponding definition in
// bli_winsys.c is disabled with #if 0.
//int bli_setenv( const char *name, const char *value, int overwrite );

// Suspend the caller for secs seconds (Sleep() under MSVC, sleep()
// elsewhere).
BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs );
cython-blis-0.9.1/blis/_src/frame/base/cast/000077500000000000000000000000001427272030600205745ustar00rootroot00000000000000
cython-blis-0.9.1/blis/_src/frame/base/cast/bli_castm.c000066400000000000000000000153071427272030600227030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // NOTE: This is one of the few functions in BLIS that is defined // with heterogeneous type support. This is done so that we have // an operation that can be used to typecast (copy-cast) a matrix // of one datatype to a scalar of another datatype. typedef void (*FUNCPTR_T) ( trans_t transa, dim_t m, dim_t n, void* restrict a, inc_t rs_a, inc_t cs_a, void* restrict b, inc_t rs_b, inc_t cs_b ); static FUNCPTR_T GENARRAY2_ALL(ftypes,castm); // // Define object-based interface. // void bli_castm ( obj_t* a, obj_t* b ) { num_t dt_a = bli_obj_dt( a ); num_t dt_b = bli_obj_dt( b ); trans_t transa = bli_obj_conjtrans_status( a ); dim_t m = bli_obj_length( b ); dim_t n = bli_obj_width( b ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a = bli_obj_row_stride( a ); inc_t cs_a = bli_obj_col_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); inc_t cs_b = bli_obj_col_stride( b ); FUNCPTR_T f; // Check parameters. 
if ( bli_error_checking_is_enabled() ) bli_castm_check( a, b ); #if 0 if ( bli_obj_dt( a ) == bli_obj_dt( b ) ) { // If a and b share the same datatype, we can simply use copym. bli_copym( a, b ); return; } #endif // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_a][dt_b]; // Invoke the void pointer-based function. f ( transa, m, n, buf_a, rs_a, cs_a, buf_b, rs_b, cs_b ); } // ----------------------------------------------------------------------------- // // Define BLAS-like interfaces with typed operands. // #undef GENTFUNC2 #define GENTFUNC2( ctype_a, ctype_b, cha, chb, opname ) \ \ void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b \ ) \ { \ ctype_a* restrict a_cast = a; \ ctype_b* restrict b_cast = b; \ conj_t conja; \ dim_t n_iter; \ dim_t n_elem; \ inc_t lda, inca; \ inc_t ldb, incb; \ dim_t j, i; \ \ /* Set various loop parameters. */ \ bli_set_dims_incs_2m \ ( \ transa, \ m, n, rs_a, cs_a, rs_b, cs_b, \ &n_elem, &n_iter, &inca, &lda, &incb, &ldb \ ); \ \ /* Extract the conjugation component from the transa parameter. 
*/ \ conja = bli_extract_conj( transa ); \ \ if ( bli_is_conj( conja ) ) \ { \ if ( inca == 1 && incb == 1 ) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ PASTEMAC2(cha,chb,copyjs)( a1[i], b1[i] ); \ } \ } \ } \ else \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ PASTEMAC2(cha,chb,copyjs)( *a1, *b1 ); \ \ a1 += inca; \ b1 += incb; \ } \ } \ } \ } \ else \ { \ if ( inca == 1 && incb == 1 ) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ PASTEMAC2(cha,chb,copys)( a1[i], b1[i] ); \ } \ } \ } \ else \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ PASTEMAC2(cha,chb,copys)( *a1, *b1 ); \ \ a1 += inca; \ b1 += incb; \ } \ } \ } \ } \ } INSERT_GENTFUNC2_BASIC0( castm ) INSERT_GENTFUNC2_MIXDP0( castm ) // ----------------------------------------------------------------------------- // // Define object-based _check() function. // void bli_castm_check ( obj_t* a, obj_t* b ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( a ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( b ); bli_check_error_code( e_val ); // Check structure. // NOTE: We enforce general structure for now in order to simplify the // implementation. bli_check_general_object( a ); bli_check_error_code( e_val ); bli_check_general_object( b ); bli_check_error_code( e_val ); // Check object dimensions. 
e_val = bli_check_matrix_object( a ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( b ); bli_check_error_code( e_val ); e_val = bli_check_conformal_dims( a, b ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( a ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( b ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/base/cast/bli_castm.h000066400000000000000000000044721427272030600227110ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

//
// Prototype object-based interface.
//

// Copy matrix a into matrix b, converting elements from the datatype of a
// to the datatype of b.
BLIS_EXPORT_BLIS void bli_castm
     (
       obj_t* a,
       obj_t* b
     );

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//

// Prototype template for the typed castm kernels; expanded below for the
// type combinations generated by INSERT_GENTPROT2_BASIC0 and
// INSERT_GENTPROT2_MIXDP0.
#undef  GENTPROT2
#define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \
     ( \
       trans_t transa, \
       dim_t   m, \
       dim_t   n, \
       void*   a, inc_t rs_a, inc_t cs_a, \
       void*   b, inc_t rs_b, inc_t cs_b  \
     );

INSERT_GENTPROT2_BASIC0( castm )
INSERT_GENTPROT2_MIXDP0( castm )

//
// Prototype object-based _check() function.
//

// Validate the operands of bli_castm() (datatypes, structure, dimensions,
// buffers).
void bli_castm_check
     (
       obj_t* a,
       obj_t* b
     );
cython-blis-0.9.1/blis/_src/frame/base/cast/bli_castnzm.c000066400000000000000000000153331427272030600232520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // NOTE: This is one of the few functions in BLIS that is defined // with heterogeneous type support. This is done so that we have // an operation that can be used to typecast (copy-cast) a matrix // of one datatype to a scalar of another datatype. typedef void (*FUNCPTR_T) ( trans_t transa, dim_t m, dim_t n, void* restrict a, inc_t rs_a, inc_t cs_a, void* restrict b, inc_t rs_b, inc_t cs_b ); static FUNCPTR_T GENARRAY2_ALL(ftypes,castnzm); // // Define object-based interface. // void bli_castnzm ( obj_t* a, obj_t* b ) { num_t dt_a = bli_obj_dt( a ); num_t dt_b = bli_obj_dt( b ); trans_t transa = bli_obj_conjtrans_status( a ); dim_t m = bli_obj_length( b ); dim_t n = bli_obj_width( b ); void* buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a = bli_obj_row_stride( a ); inc_t cs_a = bli_obj_col_stride( a ); void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b = bli_obj_row_stride( b ); inc_t cs_b = bli_obj_col_stride( b ); FUNCPTR_T f; // Check parameters. 
if ( bli_error_checking_is_enabled() ) bli_castnzm_check( a, b ); #if 0 if ( bli_obj_dt( a ) == bli_obj_dt( b ) ) { // If a and b share the same datatype, we can simply use copym. bli_copym( a, b ); return; } #endif // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_a][dt_b]; // Invoke the void pointer-based function. f ( transa, m, n, buf_a, rs_a, cs_a, buf_b, rs_b, cs_b ); } // ----------------------------------------------------------------------------- // // Define BLAS-like interfaces with typed operands. // #undef GENTFUNC2 #define GENTFUNC2( ctype_a, ctype_b, cha, chb, opname ) \ \ void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b \ ) \ { \ ctype_a* restrict a_cast = a; \ ctype_b* restrict b_cast = b; \ conj_t conja; \ dim_t n_iter; \ dim_t n_elem; \ inc_t lda, inca; \ inc_t ldb, incb; \ dim_t j, i; \ \ /* Set various loop parameters. */ \ bli_set_dims_incs_2m \ ( \ transa, \ m, n, rs_a, cs_a, rs_b, cs_b, \ &n_elem, &n_iter, &inca, &lda, &incb, &ldb \ ); \ \ /* Extract the conjugation component from the transa parameter. 
*/ \ conja = bli_extract_conj( transa ); \ \ if ( bli_is_conj( conja ) ) \ { \ if ( inca == 1 && incb == 1 ) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ PASTEMAC2(cha,chb,copyjnzs)( a1[i], b1[i] ); \ } \ } \ } \ else \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ PASTEMAC2(cha,chb,copyjnzs)( *a1, *b1 ); \ \ a1 += inca; \ b1 += incb; \ } \ } \ } \ } \ else \ { \ if ( inca == 1 && incb == 1 ) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ PASTEMAC2(cha,chb,copynzs)( a1[i], b1[i] ); \ } \ } \ } \ else \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ PASTEMAC2(cha,chb,copynzs)( *a1, *b1 ); \ \ a1 += inca; \ b1 += incb; \ } \ } \ } \ } \ } INSERT_GENTFUNC2_BASIC0( castnzm ) INSERT_GENTFUNC2_MIXDP0( castnzm ) // ----------------------------------------------------------------------------- // // Define object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( a ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( b ); bli_check_error_code( e_val ); // Check structure. // NOTE: We enforce general structure for now in order to simplify the // implementation. bli_check_general_object( a ); bli_check_error_code( e_val ); bli_check_general_object( b ); bli_check_error_code( e_val ); // Check object dimensions. 
e_val = bli_check_matrix_object( a ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( b ); bli_check_error_code( e_val ); e_val = bli_check_conformal_dims( a, b ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( a ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( b ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/base/cast/bli_castnzm.h000066400000000000000000000045021427272030600232530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castnzm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); cython-blis-0.9.1/blis/_src/frame/base/cast/bli_castv.c000066400000000000000000000120511427272030600227050ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
// NOTE: This is one of the few functions in BLIS that is defined
// with heterogeneous type support. This is done so that we have
// an operation that can be used to typecast (copy-cast) a vector
// of one datatype to a vector of another datatype.
bli_copyv( x, y ); return; } #endif // Index into the type combination array to extract the correct // function pointer. f = ftypes[dt_x][dt_y]; // Invoke the void pointer-based function. f ( conjx, n, buf_x, inc_x, buf_y, inc_y ); } // ----------------------------------------------------------------------------- // // Define BLAS-like interfaces with typed operands. // #undef GENTFUNC2 #define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \ \ void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* restrict x, inc_t incx, \ void* restrict y, inc_t incy \ ) \ { \ ctype_x* restrict x1 = x; \ ctype_y* restrict y1 = y; \ dim_t i; \ \ if ( bli_is_conj( conjx ) ) \ { \ if ( incx == 1 && incy == 1 ) \ { \ for ( i = 0; i < n; ++i ) \ { \ PASTEMAC2(chx,chy,copyjs)( x1[i], y1[i] ); \ } \ } \ else \ { \ for ( i = 0; i < n; ++i ) \ { \ PASTEMAC2(chx,chy,copyjs)( *x1, *y1 ); \ \ x1 += incx; \ y1 += incy; \ } \ } \ } \ else \ { \ if ( incx == 1 && incy == 1 ) \ { \ for ( i = 0; i < n; ++i ) \ { \ PASTEMAC2(chx,chy,copys)( x1[i], y1[i] ); \ } \ } \ else \ { \ for ( i = 0; i < n; ++i ) \ { \ PASTEMAC2(chx,chy,copys)( *x1, *y1 ); \ \ x1 += incx; \ y1 += incy; \ } \ } \ } \ } INSERT_GENTFUNC2_BASIC0( castv ) INSERT_GENTFUNC2_MIXDP0( castv ) // ----------------------------------------------------------------------------- // // Define object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( x, y ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). 
e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/base/cast/bli_castv.h000066400000000000000000000044151427272030600227170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interface. 
// BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); cython-blis-0.9.1/blis/_src/frame/base/cast/old/000077500000000000000000000000001427272030600213525ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/base/cast/old/bli_cast_check.c000066400000000000000000000064261427272030600244430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_castm_check ( obj_t* a, obj_t* b ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( a ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( b ); bli_check_error_code( e_val ); // Check structure. // NOTE: We enforce general structure for now in order to simplify the // implementation. bli_check_general_object( a ); bli_check_error_code( e_val ); bli_check_general_object( b ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_matrix_object( a ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( b ); bli_check_error_code( e_val ); e_val = bli_check_conformal_dims( a, b ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( a ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( b ); bli_check_error_code( e_val ); } void bli_castv_check ( obj_t* x, obj_t* y ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( x, y ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). 
e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/base/cast/old/bli_cast_check.h000066400000000000000000000034241427272030600244430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ void bli_castm_check ( obj_t* a, obj_t* b ); void bli_castv_check ( obj_t* x, obj_t* y ); cython-blis-0.9.1/blis/_src/frame/base/check/000077500000000000000000000000001427272030600207175ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/base/check/bli_obj_check.c000066400000000000000000000131441427272030600236230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_obj_create_check( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj ) { err_t e_val; e_val = bli_check_valid_datatype( dt ); bli_check_error_code( e_val ); e_val = bli_check_matrix_strides( m, n, rs, cs, 1 ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( obj ); bli_check_error_code( e_val ); } void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj ) { err_t e_val; e_val = bli_check_valid_datatype( dt ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( obj ); bli_check_error_code( e_val ); } void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj ) { err_t e_val; e_val = bli_check_matrix_strides( bli_obj_length( obj ), bli_obj_width( obj ), rs, cs, is ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( obj ); bli_check_error_code( e_val ); } void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ) { err_t e_val; // NOTE: We allow the caller to attach NULL to an object because // the buffer contains NULL after _create_wihout_buffer() anyway. // Thus, we're not opening a window for undefined behavior because // that window is already open. Instead of checking for NULL here, // we check the object buffers for all objects in all of the // computational operations' _check()/_int_check() functions. 
//e_val = bli_check_null_pointer( p ); //bli_check_error_code( e_val ); e_val = bli_check_matrix_strides( bli_obj_length( obj ), bli_obj_width( obj ), rs, cs, is ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( obj ); bli_check_error_code( e_val ); } void bli_obj_create_scalar_check( num_t dt, obj_t* obj ) { err_t e_val; e_val = bli_check_valid_datatype( dt ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( obj ); bli_check_error_code( e_val ); } void bli_obj_free_check( obj_t* obj ) { //err_t e_val; // We don't bother checking for null-ness since bli_obj_free() // handles null pointers safely. //e_val = bli_check_null_pointer( obj ); //bli_check_error_code( e_val ); } void bli_obj_create_const_check( double value, obj_t* obj ) { err_t e_val; e_val = bli_check_null_pointer( obj ); bli_check_error_code( e_val ); } #if 0 void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b ) { err_t e_val; e_val = bli_check_null_pointer( a ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( b ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( a ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( a ); bli_check_error_code( e_val ); } #endif void bli_dt_size_check( num_t dt ) { err_t e_val; e_val = bli_check_valid_datatype( dt ); bli_check_error_code( e_val ); } void bli_dt_string_check( num_t dt ) { err_t e_val; e_val = bli_check_nonconstant_datatype( dt ); bli_check_error_code( e_val ); } void bli_dt_union_check( num_t dt1, num_t dt2 ) { err_t e_val; e_val = bli_check_floating_datatype( dt1 ); bli_check_error_code( e_val ); e_val = bli_check_floating_datatype( dt2 ); bli_check_error_code( e_val ); } void bli_obj_print_check( char* label, obj_t* obj ) { err_t e_val; e_val = bli_check_null_pointer( label ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( obj ); bli_check_error_code( e_val ); } 
cython-blis-0.9.1/blis/_src/frame/base/check/bli_obj_check.h000066400000000000000000000056251427272030600236350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ void bli_obj_create_check( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj ); void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj ); void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_create_scalar_check( num_t dt, obj_t* obj ); void bli_obj_free_check( obj_t* obj ); void bli_obj_create_const_check( double value, obj_t* obj ); void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b ); void bli_dt_size_check( num_t dt ); void bli_dt_string_check( num_t dt ); void bli_dt_union_check( num_t dt1, num_t dt2 ); void bli_obj_print_check( char* label, obj_t* obj ); cython-blis-0.9.1/blis/_src/frame/base/check/bli_part_check.c000066400000000000000000000062771427272030600240300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ) { err_t e_val; e_val = bli_check_valid_3x1_subpart( requested_part ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( obj ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( sub_obj ); bli_check_error_code( e_val ); } void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ) { err_t e_val; e_val = bli_check_valid_1x3_subpart( requested_part ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( obj ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( sub_obj ); bli_check_error_code( e_val ); } void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ) { err_t e_val; e_val = bli_check_valid_3x3_subpart( requested_part ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( obj ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( sub_obj ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/base/check/bli_part_check.h000066400000000000000000000046541427272030600240320ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); cython-blis-0.9.1/blis/_src/frame/base/noopt/000077500000000000000000000000001427272030600210015ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/base/noopt/bli_dlamch.c000066400000000000000000000711621427272030600232320ustar00rootroot00000000000000#include "blis.h" #include #include #include #ifdef __cplusplus extern "C" { #endif #ifdef BLIS_ENABLE_LEGACY_LAMCH double bli_pow_di( bla_double* a, bla_integer* n ); /* Table of constant values */ //static bla_integer c__1 = 1; static bla_double c_b32 = 0.; double bli_pow_di(bla_double *ap, bla_integer *bp) { double pow, x; bla_integer n; unsigned long u; pow = 1; x = *ap; n = *bp; if( n != 0 ) { if( n < 0 ) { n = -n; x = 1/x; } for( u = n; ; ) { if( u & 01 ) pow *= x; if( u >>= 1 ) x *= x; else break; } } return pow; } bla_double bli_dlamch(bla_character *cmach, ftnlen cmach_len) { /* Initialized data */ static bla_logical first = TRUE_; /* System generated locals */ bla_integer i__1; bla_double ret_val; /* Builtin functions */ double bli_pow_di(bla_double *, bla_integer *); /* Local variables */ static bla_double base; static bla_integer beta; static bla_double emin, prec, emax; static bla_integer imin, imax; static bla_logical lrnd; static bla_double rmin, rmax, t, rmach; extern bla_logical bli_lsame(bla_character *, bla_character *, ftnlen, ftnlen); static bla_double smnum, sfmin; extern /* Subroutine */ int bli_dlamc2(bla_integer *, bla_integer *, bla_logical *, bla_double *, bla_integer *, bla_double *, bla_integer *, bla_double *); static bla_integer it; static bla_double rnd, eps; /* -- LAPACK auxiliary routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. 
of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* DLAMCH determines double precision machine parameters. */ /* Arguments */ /* ========= */ /* CMACH (input) CHARACTER*1 */ /* Specifies the value to be returned by DLAMCH: */ /* = 'E' or 'e', DLAMCH := eps */ /* = 'S' or 's , DLAMCH := sfmin */ /* = 'B' or 'b', DLAMCH := base */ /* = 'P' or 'p', DLAMCH := eps*base */ /* = 'N' or 'n', DLAMCH := t */ /* = 'R' or 'r', DLAMCH := rnd */ /* = 'M' or 'm', DLAMCH := emin */ /* = 'U' or 'u', DLAMCH := rmin */ /* = 'L' or 'l', DLAMCH := emax */ /* = 'O' or 'o', DLAMCH := rmax */ /* where */ /* eps = relative machine precision */ /* sfmin = safe minimum, such that 1/sfmin does not overflow */ /* base = base of the machine */ /* prec = eps*base */ /* t = number of (base) digits in the mantissa */ /* rnd = 1.0 when rounding occurs in addition, 0.0 otherwise */ /* emin = minimum exponent before (gradual) underflow */ /* rmin = underflow threshold - base**(emin-1) */ /* emax = largest exponent before overflow */ /* rmax = overflow threshold - (base**emax)*(1-eps) */ /* ===================================================================== */ /* .. Parameters .. */ /* .. */ /* .. Local Scalars .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. External Subroutines .. */ /* .. */ /* .. Save statement .. */ /* .. */ /* .. Data statements .. */ /* .. */ /* .. Executable Statements .. */ if (first) { bli_dlamc2(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax); base = (bla_double) beta; t = (bla_double) it; if (lrnd) { rnd = 1.; i__1 = 1 - it; eps = bli_pow_di(&base, &i__1) / 2; } else { rnd = 0.; i__1 = 1 - it; eps = bli_pow_di(&base, &i__1); } prec = eps * base; emin = (bla_double) imin; emax = (bla_double) imax; sfmin = rmin; smnum = 1. / rmax; if (smnum >= sfmin) { /* Use SMALL plus a bit, to avoid the possibility of rounding */ /* causing overflow when computing 1/sfmin. 
*/ sfmin = smnum * (eps + 1.); } } if (bli_lsame(cmach, "E", (ftnlen)1, (ftnlen)1)) { rmach = eps; } else if (bli_lsame(cmach, "S", (ftnlen)1, (ftnlen)1)) { rmach = sfmin; } else if (bli_lsame(cmach, "B", (ftnlen)1, (ftnlen)1)) { rmach = base; } else if (bli_lsame(cmach, "P", (ftnlen)1, (ftnlen)1)) { rmach = prec; } else if (bli_lsame(cmach, "N", (ftnlen)1, (ftnlen)1)) { rmach = t; } else if (bli_lsame(cmach, "R", (ftnlen)1, (ftnlen)1)) { rmach = rnd; } else if (bli_lsame(cmach, "M", (ftnlen)1, (ftnlen)1)) { rmach = emin; } else if (bli_lsame(cmach, "U", (ftnlen)1, (ftnlen)1)) { rmach = rmin; } else if (bli_lsame(cmach, "L", (ftnlen)1, (ftnlen)1)) { rmach = emax; } else if (bli_lsame(cmach, "O", (ftnlen)1, (ftnlen)1)) { rmach = rmax; } ret_val = rmach; first = FALSE_; return ret_val; /* End of DLAMCH */ } /* bli_dlamch_ */ /* *********************************************************************** */ /* Subroutine */ int bli_dlamc1(bla_integer *beta, bla_integer *t, bla_logical *rnd, bla_logical *ieee1) { /* Initialized data */ static bla_logical first = TRUE_; /* System generated locals */ bla_double d__1, d__2; /* Local variables */ static bla_logical lrnd; static bla_double a, b, c__, f; static bla_integer lbeta; static bla_double savec; extern bla_double bli_dlamc3(bla_double *, bla_double *); static bla_logical lieee1; static bla_double t1, t2; static bla_integer lt; static bla_double one, qtr; /* -- LAPACK auxiliary routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* DLAMC1 determines the machine parameters given by BETA, T, RND, and */ /* IEEE1. */ /* Arguments */ /* ========= */ /* BETA (output) INTEGER */ /* The base of the machine. */ /* T (output) INTEGER */ /* The number of ( BETA ) digits in the mantissa. */ /* RND (output) LOGICAL */ /* Specifies whether proper rounding ( RND = .TRUE. ) or */ /* chopping ( RND = .FALSE. 
) occurs in addition. This may not */ /* be a reliable guide to the way in which the machine performs */ /* its arithmetic. */ /* IEEE1 (output) LOGICAL */ /* Specifies whether rounding appears to be done in the IEEE */ /* 'round to nearest' style. */ /* Further Details */ /* =============== */ /* The routine is based on the routine ENVRON by Malcolm and */ /* incorporates suggestions by Gentleman and Marovich. See */ /* Malcolm M. A. (1972) Algorithms to reveal properties of */ /* floating-point arithmetic. Comms. of the ACM, 15, 949-951. */ /* Gentleman W. M. and Marovich S. B. (1974) More on algorithms */ /* that reveal properties of floating point arithmetic units. */ /* Comms. of the ACM, 17, 276-277. */ /* ===================================================================== */ /* .. Local Scalars .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. Save statement .. */ /* .. */ /* .. Data statements .. */ /* .. */ /* .. Executable Statements .. */ if (first) { one = 1.; /* LBETA, LIEEE1, LT and LRND are the local values of BETA, */ /* IEEE1, T and RND. */ /* Throughout this routine we use the function DLAMC3 to ensure */ /* that relevant values are stored and not held in registers, or */ /* are not affected by optimizers. */ /* Compute a = 2.0**m with the smallest positive bla_integer m such */ /* that */ /* fl( a + 1.0 ) = a. */ a = 1.; c__ = 1.; /* + WHILE( C.EQ.ONE )LOOP */ L10: if (c__ == one) { a *= 2; c__ = bli_dlamc3(&a, &one); d__1 = -a; c__ = bli_dlamc3(&c__, &d__1); goto L10; } /* + END WHILE */ /* Now compute b = 2.0**m with the smallest positive bla_integer m */ /* such that */ /* fl( a + b ) .gt. a. */ b = 1.; c__ = bli_dlamc3(&a, &b); /* + WHILE( C.EQ.A )LOOP */ L20: if (c__ == a) { b *= 2; c__ = bli_dlamc3(&a, &b); goto L20; } /* + END WHILE */ /* Now compute the base. a and c are neighbouring floating point */ /* numbers in the interval ( beta**t, beta**( t + 1 ) ) and so */ /* their difference is beta. 
Adding 0.25 to c is to ensure that it */ /* is truncated to beta and not ( beta - 1 ). */ qtr = one / 4; savec = c__; d__1 = -a; c__ = bli_dlamc3(&c__, &d__1); lbeta = (bla_integer) (c__ + qtr); /* Now determine whether rounding or chopping occurs, by adding a */ /* bit less than beta/2 and a bit more than beta/2 to a. */ b = (bla_double) lbeta; d__1 = b / 2; d__2 = -b / 100; f = bli_dlamc3(&d__1, &d__2); c__ = bli_dlamc3(&f, &a); if (c__ == a) { lrnd = TRUE_; } else { lrnd = FALSE_; } d__1 = b / 2; d__2 = b / 100; f = bli_dlamc3(&d__1, &d__2); c__ = bli_dlamc3(&f, &a); if (lrnd && c__ == a) { lrnd = FALSE_; } /* Try and decide whether rounding is done in the IEEE 'round to */ /* nearest' style. B/2 is half a unit in the last place of the two */ /* numbers A and SAVEC. Furthermore, A is even, i.e. has last bit */ /* zero, and SAVEC is odd. Thus adding B/2 to A should not change */ /* A, but adding B/2 to SAVEC should change SAVEC. */ d__1 = b / 2; t1 = bli_dlamc3(&d__1, &a); d__1 = b / 2; t2 = bli_dlamc3(&d__1, &savec); lieee1 = t1 == a && t2 > savec && lrnd; /* Now find the mantissa, t. It should be the bla_integer part of */ /* log to the base beta of a, however it is safer to determine t */ /* by powering. So we find t as the smallest positive bla_integer for */ /* which */ /* fl( beta**t + 1.0 ) = 1.0. 
*/ lt = 0; a = 1.; c__ = 1.; /* + WHILE( C.EQ.ONE )LOOP */ L30: if (c__ == one) { ++lt; a *= lbeta; c__ = bli_dlamc3(&a, &one); d__1 = -a; c__ = bli_dlamc3(&c__, &d__1); goto L30; } /* + END WHILE */ } *beta = lbeta; *t = lt; *rnd = lrnd; *ieee1 = lieee1; first = FALSE_; return 0; /* End of DLAMC1 */ } /* bli_dlamc1_ */ /* *********************************************************************** */ /* Subroutine */ int bli_dlamc2(bla_integer *beta, bla_integer *t, bla_logical *rnd, bla_double *eps, bla_integer *emin, bla_double *rmin, bla_integer *emax, bla_double *rmax) { /* Initialized data */ static bla_logical first = TRUE_; static bla_logical iwarn = FALSE_; /* Format strings */ static bla_character fmt_9999[] = "(//\002 WARNING. The value EMIN may be incorre\ ct:-\002,\002 EMIN = \002,i8,/\002 If, after inspection, the value EMIN loo\ ks\002,\002 acceptable please comment out \002,/\002 the IF block as marked \ within the code of routine\002,\002 DLAMC2,\002,/\002 otherwise supply EMIN \ explicitly.\002,/)"; /* System generated locals */ bla_integer i__1; bla_double d__1, d__2, d__3, d__4, d__5; /* Builtin functions */ double bli_pow_di(bla_double *, bla_integer *); //bla_integer s_wsfe(cilist *), do_fio(bla_integer *, bla_character *, ftnlen), e_wsfe(); /* Local variables */ static bla_logical ieee; static bla_double half; static bla_logical lrnd; static bla_double leps, zero, a, b, c__; static bla_integer i__, lbeta; static bla_double rbase; static bla_integer lemin, lemax, gnmin; static bla_double smnum; static bla_integer gpmin; static bla_double third, lrmin, lrmax, sixth; extern /* Subroutine */ int bli_dlamc1(bla_integer *, bla_integer *, bla_logical *, bla_logical *); extern bla_double bli_dlamc3(bla_double *, bla_double *); static bla_logical lieee1; extern /* Subroutine */ int bli_dlamc4(bla_integer *, bla_double *, bla_integer *), bli_dlamc5(bla_integer *, bla_integer *, bla_integer *, bla_logical *, bla_integer *, bla_double *); static bla_integer 
lt, ngnmin, ngpmin; static bla_double one, two; /* Fortran I/O blocks */ //static cilist io___58 = { 0, 6, 0, fmt_9999, 0 }; /* -- LAPACK auxiliary routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* DLAMC2 determines the machine parameters specified in its argument */ /* list. */ /* Arguments */ /* ========= */ /* BETA (output) INTEGER */ /* The base of the machine. */ /* T (output) INTEGER */ /* The number of ( BETA ) digits in the mantissa. */ /* RND (output) LOGICAL */ /* Specifies whether proper rounding ( RND = .TRUE. ) or */ /* chopping ( RND = .FALSE. ) occurs in addition. This may not */ /* be a reliable guide to the way in which the machine performs */ /* its arithmetic. */ /* EPS (output) DOUBLE PRECISION */ /* The smallest positive number such that */ /* fl( 1.0 - EPS ) .LT. 1.0, */ /* where fl denotes the computed value. */ /* EMIN (output) INTEGER */ /* The minimum exponent before (gradual) underflow occurs. */ /* RMIN (output) DOUBLE PRECISION */ /* The smallest normalized number for the machine, given by */ /* BASE**( EMIN - 1 ), where BASE is the floating point value */ /* of BETA. */ /* EMAX (output) INTEGER */ /* The maximum exponent before overflow occurs. */ /* RMAX (output) DOUBLE PRECISION */ /* The largest positive number for the machine, given by */ /* BASE**EMAX * ( 1 - EPS ), where BASE is the floating point */ /* value of BETA. */ /* Further Details */ /* =============== */ /* The computation of EPS is based on a routine PARANOIA by */ /* W. Kahan of the University of California at Berkeley. */ /* ===================================================================== */ /* .. Local Scalars .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. External Subroutines .. */ /* .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Save statement .. */ /* .. */ /* .. Data statements .. */ /* .. */ /* .. 
Executable Statements .. */ if (first) { zero = 0.; one = 1.; two = 2.; /* LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values of */ /* BETA, T, RND, EPS, EMIN and RMIN. */ /* Throughout this routine we use the function DLAMC3 to ensure */ /* that relevant values are stored and not held in registers, or */ /* are not affected by optimizers. */ /* DLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1. */ bli_dlamc1(&lbeta, <, &lrnd, &lieee1); /* Start to find EPS. */ b = (bla_double) lbeta; i__1 = -lt; a = bli_pow_di(&b, &i__1); leps = a; /* Try some tricks to see whether or not this is the correct EPS. */ b = two / 3; half = one / 2; d__1 = -half; sixth = bli_dlamc3(&b, &d__1); third = bli_dlamc3(&sixth, &sixth); d__1 = -half; b = bli_dlamc3(&third, &d__1); b = bli_dlamc3(&b, &sixth); b = f2c_abs(b); if (b < leps) { b = leps; } leps = 1.; /* + WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */ L10: if (leps > b && b > zero) { leps = b; d__1 = half * leps; /* Computing 5th power */ d__3 = two, d__4 = d__3, d__3 *= d__3; /* Computing 2nd power */ d__5 = leps; d__2 = d__4 * (d__3 * d__3) * (d__5 * d__5); c__ = bli_dlamc3(&d__1, &d__2); d__1 = -c__; c__ = bli_dlamc3(&half, &d__1); b = bli_dlamc3(&half, &c__); d__1 = -b; c__ = bli_dlamc3(&half, &d__1); b = bli_dlamc3(&half, &c__); goto L10; } /* + END WHILE */ if (a < leps) { leps = a; } /* Computation of EPS complete. */ /* Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3)). */ /* Keep dividing A by BETA until (gradual) underflow occurs. This */ /* is detected when we cannot recover the previous A. 
*/ rbase = one / lbeta; smnum = one; for (i__ = 1; i__ <= 3; ++i__) { d__1 = smnum * rbase; smnum = bli_dlamc3(&d__1, &zero); /* L20: */ } a = bli_dlamc3(&one, &smnum); bli_dlamc4(&ngpmin, &one, &lbeta); d__1 = -one; bli_dlamc4(&ngnmin, &d__1, &lbeta); bli_dlamc4(&gpmin, &a, &lbeta); d__1 = -a; bli_dlamc4(&gnmin, &d__1, &lbeta); ieee = FALSE_; if (ngpmin == ngnmin && gpmin == gnmin) { if (ngpmin == gpmin) { lemin = ngpmin; /* ( Non twos-complement machines, no gradual underflow; */ /* e.g., VAX ) */ } else if (gpmin - ngpmin == 3) { lemin = ngpmin - 1 + lt; ieee = TRUE_; /* ( Non twos-complement machines, with gradual underflow; */ /* e.g., IEEE standard followers ) */ } else { lemin = f2c_min(ngpmin,gpmin); /* ( A guess; no known machine ) */ iwarn = TRUE_; } } else if (ngpmin == gpmin && ngnmin == gnmin) { if ((i__1 = ngpmin - ngnmin, f2c_abs(i__1)) == 1) { lemin = f2c_max(ngpmin,ngnmin); /* ( Twos-complement machines, no gradual underflow; */ /* e.g., CYBER 205 ) */ } else { lemin = f2c_min(ngpmin,ngnmin); /* ( A guess; no known machine ) */ iwarn = TRUE_; } } else if ((i__1 = ngpmin - ngnmin, f2c_abs(i__1)) == 1 && gpmin == gnmin) { if (gpmin - f2c_min(ngpmin,ngnmin) == 3) { lemin = f2c_max(ngpmin,ngnmin) - 1 + lt; /* ( Twos-complement machines with gradual underflow; */ /* no known machine ) */ } else { lemin = f2c_min(ngpmin,ngnmin); /* ( A guess; no known machine ) */ iwarn = TRUE_; } } else { /* Computing MIN */ i__1 = f2c_min(ngpmin,ngnmin), i__1 = f2c_min(i__1,gpmin); lemin = f2c_min(i__1,gnmin); /* ( A guess; no known machine ) */ iwarn = TRUE_; } first = FALSE_; /* ** */ /* Comment out this if block if EMIN is ok */ if (iwarn) { first = TRUE_; /* s_wsfe(&io___58); do_fio(&c__1, (bla_character *)&lemin, (ftnlen)sizeof(bla_integer)); e_wsfe(); */ printf( "%s", fmt_9999 ); } /* ** */ /* Assume IEEE arithmetic if we found denormalised numbers above, */ /* or if arithmetic seems to round in the IEEE style, determined */ /* in routine DLAMC1. 
A true IEEE machine should have both things */ /* true; however, faulty machines may have one or the other. */ ieee = ieee || lieee1; /* Compute RMIN by successive division by BETA. We could compute */ /* RMIN as BASE**( EMIN - 1 ), but some machines underflow during */ /* this computation. */ lrmin = 1.; i__1 = 1 - lemin; for (i__ = 1; i__ <= i__1; ++i__) { d__1 = lrmin * rbase; lrmin = bli_dlamc3(&d__1, &zero); /* L30: */ } /* Finally, call DLAMC5 to compute EMAX and RMAX. */ bli_dlamc5(&lbeta, <, &lemin, &ieee, &lemax, &lrmax); } *beta = lbeta; *t = lt; *rnd = lrnd; *eps = leps; *emin = lemin; *rmin = lrmin; *emax = lemax; *rmax = lrmax; return 0; /* End of DLAMC2 */ } /* bli_dlamc2_ */ /* *********************************************************************** */ bla_double bli_dlamc3(bla_double *a, bla_double *b) { /* System generated locals */ bla_double ret_val; /* -- LAPACK auxiliary routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* DLAMC3 is intended to force A and B to be stored prior to doing */ /* the addition of A and B , for use in situations where optimizers */ /* might hold one of these in a register. */ /* Arguments */ /* ========= */ /* A (input) DOUBLE PRECISION */ /* B (input) DOUBLE PRECISION */ /* The values A and B. */ /* ===================================================================== */ /* .. Executable Statements .. 
*/ ret_val = *a + *b; return ret_val; /* End of DLAMC3 */ } /* bli_dlamc3_ */ /* *********************************************************************** */ /* Subroutine */ int bli_dlamc4(bla_integer *emin, bla_double *start, bla_integer *base) { /* System generated locals */ bla_integer i__1; bla_double d__1; /* Local variables */ static bla_double zero, a; static bla_integer i__; static bla_double rbase, b1, b2, c1, c2, d1, d2; extern bla_double bli_dlamc3(bla_double *, bla_double *); static bla_double one; /* -- LAPACK auxiliary routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* DLAMC4 is a service routine for DLAMC2. */ /* Arguments */ /* ========= */ /* EMIN (output) INTEGER */ /* The minimum exponent before (gradual) underflow, computed by */ /* setting A = START and dividing by BASE until the previous A */ /* can not be recovered. */ /* START (input) DOUBLE PRECISION */ /* The starting point for determining EMIN. */ /* BASE (input) INTEGER */ /* The base of the machine. */ /* ===================================================================== */ /* .. Local Scalars .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. Executable Statements .. */ a = *start; one = 1.; rbase = one / *base; zero = 0.; *emin = 1; d__1 = a * rbase; b1 = bli_dlamc3(&d__1, &zero); c1 = a; c2 = a; d1 = a; d2 = a; /* + WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND. 
*/ /* $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP */ L10: if (c1 == a && c2 == a && d1 == a && d2 == a) { --(*emin); a = b1; d__1 = a / *base; b1 = bli_dlamc3(&d__1, &zero); d__1 = b1 * *base; c1 = bli_dlamc3(&d__1, &zero); d1 = zero; i__1 = *base; for (i__ = 1; i__ <= i__1; ++i__) { d1 += b1; /* L20: */ } d__1 = a * rbase; b2 = bli_dlamc3(&d__1, &zero); d__1 = b2 / rbase; c2 = bli_dlamc3(&d__1, &zero); d2 = zero; i__1 = *base; for (i__ = 1; i__ <= i__1; ++i__) { d2 += b2; /* L30: */ } goto L10; } /* + END WHILE */ return 0; /* End of DLAMC4 */ } /* bli_dlamc4_ */ /* *********************************************************************** */ /* Subroutine */ int bli_dlamc5(bla_integer *beta, bla_integer *p, bla_integer *emin, bla_logical *ieee, bla_integer *emax, bla_double *rmax) { /* System generated locals */ bla_integer i__1; bla_double d__1; /* Local variables */ static bla_integer lexp; static bla_double oldy; static bla_integer uexp, i__; static bla_double y, z__; static bla_integer nbits; extern bla_double bli_dlamc3(bla_double *, bla_double *); static bla_double recbas; static bla_integer exbits, expsum, try__; /* -- LAPACK auxiliary routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* DLAMC5 attempts to compute RMAX, the largest machine floating-point */ /* number, without overflow. It assumes that EMAX + f2c_abs(EMIN) sum */ /* approximately to a power of 2. It will fail on machines where this */ /* assumption does not hold, for example, the Cyber 205 (EMIN = -28625, */ /* EMAX = 28718). It will also fail if the value supplied for EMIN is */ /* too large (i.e. too close to zero), probably with overflow. */ /* Arguments */ /* ========= */ /* BETA (input) INTEGER */ /* The base of floating-point arithmetic. */ /* P (input) INTEGER */ /* The number of base BETA digits in the mantissa of a */ /* floating-point value. 
*/ /* EMIN (input) INTEGER */ /* The minimum exponent before (gradual) underflow. */ /* IEEE (input) LOGICAL */ /* A bla_logical flag specifying whether or not the arithmetic */ /* system is thought to comply with the IEEE standard. */ /* EMAX (output) INTEGER */ /* The largest exponent before overflow */ /* RMAX (output) DOUBLE PRECISION */ /* The largest machine floating-point number. */ /* ===================================================================== */ /* .. Parameters .. */ /* .. */ /* .. Local Scalars .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* First compute LEXP and UEXP, two powers of 2 that bound */ /* f2c_abs(EMIN). We then assume that EMAX + f2c_abs(EMIN) will sum */ /* approximately to the bound that is closest to f2c_abs(EMIN). */ /* (EMAX is the exponent of the required number RMAX). */ lexp = 1; exbits = 1; L10: try__ = lexp << 1; if (try__ <= -(*emin)) { lexp = try__; ++exbits; goto L10; } if (lexp == -(*emin)) { uexp = lexp; } else { uexp = try__; ++exbits; } /* Now -LEXP is less than or equal to EMIN, and -UEXP is greater */ /* than or equal to EMIN. EXBITS is the number of bits needed to */ /* store the exponent. */ if (uexp + *emin > -lexp - *emin) { expsum = lexp << 1; } else { expsum = uexp << 1; } /* EXPSUM is the exponent range, approximately equal to */ /* EMAX - EMIN + 1 . */ *emax = expsum + *emin - 1; nbits = exbits + 1 + *p; /* NBITS is the total number of bits needed to store a */ /* floating-point number. */ if (nbits % 2 == 1 && *beta == 2) { /* Either there are an odd number of bits used to store a */ /* floating-point number, which is unlikely, or some bits are */ /* not used in the representation of numbers, which is possible, */ /* (e.g. Cray machines) or the mantissa has an implicit bit, */ /* (e.g. IEEE machines, Dec Vax machines), which is perhaps the */ /* most likely. We have to assume the last alternative. 
*/ /* If this is true, then we need to reduce EMAX by one because */ /* there must be some way of representing zero in an implicit-bit */ /* system. On machines like Cray, we are reducing EMAX by one */ /* unnecessarily. */ --(*emax); } if (*ieee) { /* Assume we are on an IEEE machine which reserves one exponent */ /* for infinity and NaN. */ --(*emax); } /* Now create RMAX, the largest machine number, which should */ /* be equal to (1.0 - BETA**(-P)) * BETA**EMAX . */ /* First compute 1.0 - BETA**(-P), being careful that the */ /* result is less than 1.0 . */ recbas = 1. / *beta; z__ = *beta - 1.; y = 0.; i__1 = *p; for (i__ = 1; i__ <= i__1; ++i__) { z__ *= recbas; if (y < 1.) { oldy = y; } y = bli_dlamc3(&y, &z__); /* L20: */ } if (y >= 1.) { y = oldy; } /* Now multiply by BETA**EMAX to get RMAX. */ i__1 = *emax; for (i__ = 1; i__ <= i__1; ++i__) { d__1 = y * *beta; y = bli_dlamc3(&d__1, &c_b32); /* L30: */ } *rmax = y; return 0; /* End of DLAMC5 */ } /* bli_dlamc5_ */ #else bla_double bli_dlamch(bla_character *cmach, ftnlen cmach_len) { /* = 'E' or 'e', DLAMCH := eps */ /* = 'S' or 's , DLAMCH := sfmin */ /* = 'B' or 'b', DLAMCH := base */ /* = 'P' or 'p', DLAMCH := eps*base */ /* = 'N' or 'n', DLAMCH := t */ /* = 'R' or 'r', DLAMCH := rnd */ /* = 'M' or 'm', DLAMCH := emin */ /* = 'U' or 'u', DLAMCH := rmin */ /* = 'L' or 'l', DLAMCH := emax */ /* = 'O' or 'o', DLAMCH := rmax */ /* where */ /* eps = relative machine precision */ /* sfmin = safe minimum, such that 1/sfmin does not overflow */ /* base = base of the machine */ /* prec = eps*base */ /* t = number of (base) digits in the mantissa */ /* rnd = 1.0 when rounding occurs in addition, 0.0 otherwise */ /* emin = minimum exponent before (gradual) underflow */ /* rmin = underflow threshold - base**(emin-1) */ /* emax = largest exponent before overflow */ /* rmax = overflow threshold - (base**emax)*(1-eps) */ double safe_min = DBL_MIN; double small = 1.0f / DBL_MAX; if ( small >= safe_min ) safe_min = small 
* ( 1.0 + DBL_EPSILON ); switch ( toupper( *cmach ) ) { case 'E': return DBL_EPSILON; case 'S': return safe_min; case 'B': return FLT_RADIX; case 'P': return FLT_RADIX*DBL_EPSILON; case 'N': return DBL_MANT_DIG; case 'R': return FLT_ROUNDS == FE_TONEAREST ? 1.0 : 0.0; case 'M': return DBL_MIN_EXP; case 'U': return DBL_MIN; case 'L': return DBL_MAX_EXP; case 'O': return DBL_MAX; } return 0.0; } #endif #ifdef __cplusplus } #endif cython-blis-0.9.1/blis/_src/frame/base/noopt/bli_dlamch.h000066400000000000000000000033121427272030600232270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len ); cython-blis-0.9.1/blis/_src/frame/base/noopt/bli_lsame.c000066400000000000000000000053551427272030600231040ustar00rootroot00000000000000/* lsame.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ #ifdef __cplusplus extern "C" { #endif #include "blis.h" bla_logical bli_lsame(bla_character *ca, bla_character *cb, ftnlen ca_len, ftnlen cb_len) { /* System generated locals */ bla_logical ret_val; /* Local variables */ static bla_integer inta, intb, zcode; /* -- LAPACK auxiliary routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* LSAME returns .TRUE. if CA is the same letter as CB regardless of */ /* case. */ /* Arguments */ /* ========= */ /* CA (input) CHARACTER*1 */ /* CB (input) CHARACTER*1 */ /* CA and CB specify the single bla_characters to be compared. */ /* ===================================================================== */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Local Scalars .. */ /* .. */ /* .. Executable Statements .. */ /* Test if the bla_characters are equal */ ret_val = *(unsigned char *)ca == *(unsigned char *)cb; if (ret_val) { return ret_val; } /* Now test for equivalence if both bla_characters are alphabetic. 
*/ zcode = 'Z'; /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime */ /* machines, on which ICHAR returns a value with bit 8 set. */ /* ICHAR('A') on Prime machines returns 193 which is the same as */ /* ICHAR('A') on an EBCDIC machine. */ inta = *(unsigned char *)ca; intb = *(unsigned char *)cb; if (zcode == 90 || zcode == 122) { /* ASCII is assumed - ZCODE is the ASCII code of either lower or */ /* upper case 'Z'. */ if (inta >= 97 && inta <= 122) { inta += -32; } if (intb >= 97 && intb <= 122) { intb += -32; } } else if (zcode == 233 || zcode == 169) { /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */ /* upper case 'Z'. */ if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta >= 162 && inta <= 169)) { inta += 64; } if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb >= 162 && intb <= 169)) { intb += 64; } } else if (zcode == 218 || zcode == 250) { /* ASCII is assumed, on Prime machines - ZCODE is the ASCII code */ /* plus 128 of either lower or upper case 'Z'. */ if (inta >= 225 && inta <= 250) { inta += -32; } if (intb >= 225 && intb <= 250) { intb += -32; } } ret_val = inta == intb; /* RETURN */ /* End of LSAME */ return ret_val; } /* bli_lsame */ #ifdef __cplusplus } #endif cython-blis-0.9.1/blis/_src/frame/base/noopt/bli_lsame.h000066400000000000000000000033461427272030600231070ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len ); cython-blis-0.9.1/blis/_src/frame/base/noopt/bli_slamch.c000066400000000000000000000710531427272030600232500ustar00rootroot00000000000000#include "blis.h" #include #include #include #ifdef __cplusplus extern "C" { #endif #ifdef BLIS_ENABLE_LEGACY_LAMCH double bli_pow_ri( bla_real* a, bla_integer* n ); /* Table of constant values */ //static bla_integer c__1 = 1; static bla_real c_b32 = (float)0.; double bli_pow_ri(bla_real *ap, bla_integer *bp) { double pow, x; bla_integer n; unsigned long u; pow = 1; x = *ap; n = *bp; if( n != 0 ) { if( n < 0 ) { n = -n; x = 1/x; } for( u = n; ; ) { if( u & 01 ) pow *= x; if( u >>= 1 ) x *= x; else break; } } return pow; } bla_real bli_slamch(bla_character *cmach, ftnlen cmach_len) { /* Initialized data */ static bla_logical first = TRUE_; /* System generated locals */ bla_integer i__1; bla_real ret_val; /* Builtin functions */ double bli_pow_ri(bla_real *, bla_integer *); /* Local variables */ static bla_real base; static bla_integer beta; static bla_real emin, prec, emax; static bla_integer imin, imax; static bla_logical lrnd; static bla_real rmin, rmax, t, rmach; extern bla_logical bli_lsame(bla_character *, bla_character *, ftnlen, ftnlen); static bla_real smnum, sfmin; extern /* Subroutine */ int bli_slamc2(bla_integer *, bla_integer *, bla_logical *, bla_real *, bla_integer *, bla_real *, bla_integer *, bla_real *); static bla_integer it; static bla_real rnd, eps; /* -- LAPACK auxiliary routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* SLAMCH determines single precision machine parameters. 
*/ /* Arguments */ /* ========= */ /* CMACH (input) CHARACTER*1 */ /* Specifies the value to be returned by SLAMCH: */ /* = 'E' or 'e', SLAMCH := eps */ /* = 'S' or 's , SLAMCH := sfmin */ /* = 'B' or 'b', SLAMCH := base */ /* = 'P' or 'p', SLAMCH := eps*base */ /* = 'N' or 'n', SLAMCH := t */ /* = 'R' or 'r', SLAMCH := rnd */ /* = 'M' or 'm', SLAMCH := emin */ /* = 'U' or 'u', SLAMCH := rmin */ /* = 'L' or 'l', SLAMCH := emax */ /* = 'O' or 'o', SLAMCH := rmax */ /* where */ /* eps = relative machine precision */ /* sfmin = safe minimum, such that 1/sfmin does not overflow */ /* base = base of the machine */ /* prec = eps*base */ /* t = number of (base) digits in the mantissa */ /* rnd = 1.0 when rounding occurs in addition, 0.0 otherwise */ /* emin = minimum exponent before (gradual) underflow */ /* rmin = underflow threshold - base**(emin-1) */ /* emax = largest exponent before overflow */ /* rmax = overflow threshold - (base**emax)*(1-eps) */ /* ===================================================================== */ /* .. Parameters .. */ /* .. */ /* .. Local Scalars .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. External Subroutines .. */ /* .. */ /* .. Save statement .. */ /* .. */ /* .. Data statements .. */ /* .. */ /* .. Executable Statements .. */ if (first) { bli_slamc2(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax); base = (bla_real) beta; t = (bla_real) it; if (lrnd) { rnd = (float)1.; i__1 = 1 - it; eps = bli_pow_ri(&base, &i__1) / 2; } else { rnd = (float)0.; i__1 = 1 - it; eps = bli_pow_ri(&base, &i__1); } prec = eps * base; emin = (bla_real) imin; emax = (bla_real) imax; sfmin = rmin; smnum = (float)1. / rmax; if (smnum >= sfmin) { /* Use SMALL plus a bit, to avoid the possibility of rounding */ /* causing overflow when computing 1/sfmin. 
*/ sfmin = smnum * (eps + (float)1.); } } if (bli_lsame(cmach, "E", (ftnlen)1, (ftnlen)1)) { rmach = eps; } else if (bli_lsame(cmach, "S", (ftnlen)1, (ftnlen)1)) { rmach = sfmin; } else if (bli_lsame(cmach, "B", (ftnlen)1, (ftnlen)1)) { rmach = base; } else if (bli_lsame(cmach, "P", (ftnlen)1, (ftnlen)1)) { rmach = prec; } else if (bli_lsame(cmach, "N", (ftnlen)1, (ftnlen)1)) { rmach = t; } else if (bli_lsame(cmach, "R", (ftnlen)1, (ftnlen)1)) { rmach = rnd; } else if (bli_lsame(cmach, "M", (ftnlen)1, (ftnlen)1)) { rmach = emin; } else if (bli_lsame(cmach, "U", (ftnlen)1, (ftnlen)1)) { rmach = rmin; } else if (bli_lsame(cmach, "L", (ftnlen)1, (ftnlen)1)) { rmach = emax; } else if (bli_lsame(cmach, "O", (ftnlen)1, (ftnlen)1)) { rmach = rmax; } ret_val = rmach; first = FALSE_; return ret_val; /* End of SLAMCH */ } /* bli_slamch_ */ /* *********************************************************************** */ /* Subroutine */ int bli_slamc1(bla_integer *beta, bla_integer *t, bla_logical *rnd, bla_logical *ieee1) { /* Initialized data */ static bla_logical first = TRUE_; /* System generated locals */ bla_real r__1, r__2; /* Local variables */ static bla_logical lrnd; static bla_real a, b, c__, f; static bla_integer lbeta; static bla_real savec; static bla_logical lieee1; static bla_real t1, t2; extern bla_real bli_slamc3(bla_real *, bla_real *); static bla_integer lt; static bla_real one, qtr; /* -- LAPACK auxiliary routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* SLAMC1 determines the machine parameters given by BETA, T, RND, and */ /* IEEE1. */ /* Arguments */ /* ========= */ /* BETA (output) INTEGER */ /* The base of the machine. */ /* T (output) INTEGER */ /* The number of ( BETA ) digits in the mantissa. */ /* RND (output) LOGICAL */ /* Specifies whether proper rounding ( RND = .TRUE. ) or */ /* chopping ( RND = .FALSE. 
) occurs in addition. This may not */ /* be a reliable guide to the way in which the machine performs */ /* its arithmetic. */ /* IEEE1 (output) LOGICAL */ /* Specifies whether rounding appears to be done in the IEEE */ /* 'round to nearest' style. */ /* Further Details */ /* =============== */ /* The routine is based on the routine ENVRON by Malcolm and */ /* incorporates suggestions by Gentleman and Marovich. See */ /* Malcolm M. A. (1972) Algorithms to reveal properties of */ /* floating-point arithmetic. Comms. of the ACM, 15, 949-951. */ /* Gentleman W. M. and Marovich S. B. (1974) More on algorithms */ /* that reveal properties of floating point arithmetic units. */ /* Comms. of the ACM, 17, 276-277. */ /* ===================================================================== */ /* .. Local Scalars .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. Save statement .. */ /* .. */ /* .. Data statements .. */ /* .. */ /* .. Executable Statements .. */ if (first) { one = (float)1.; /* LBETA, LIEEE1, LT and LRND are the local values of BETA, */ /* IEEE1, T and RND. */ /* Throughout this routine we use the function SLAMC3 to ensure */ /* that relevant values are stored and not held in registers, or */ /* are not affected by optimizers. */ /* Compute a = 2.0**m with the smallest positive bla_integer m such */ /* that */ /* fl( a + 1.0 ) = a. */ a = (float)1.; c__ = (float)1.; /* + WHILE( C.EQ.ONE )LOOP */ L10: if (c__ == one) { a *= 2; c__ = bli_slamc3(&a, &one); r__1 = -a; c__ = bli_slamc3(&c__, &r__1); goto L10; } /* + END WHILE */ /* Now compute b = 2.0**m with the smallest positive bla_integer m */ /* such that */ /* fl( a + b ) .gt. a. */ b = (float)1.; c__ = bli_slamc3(&a, &b); /* + WHILE( C.EQ.A )LOOP */ L20: if (c__ == a) { b *= 2; c__ = bli_slamc3(&a, &b); goto L20; } /* + END WHILE */ /* Now compute the base. a and c are neighbouring floating point */ /* numbers in the interval ( beta**t, beta**( t + 1 ) ) and so */ /* their difference is beta. 
Adding 0.25 to c is to ensure that it */ /* is truncated to beta and not ( beta - 1 ). */ qtr = one / 4; savec = c__; r__1 = -a; c__ = bli_slamc3(&c__, &r__1); lbeta = c__ + qtr; /* Now determine whether rounding or chopping occurs, by adding a */ /* bit less than beta/2 and a bit more than beta/2 to a. */ b = (bla_real) lbeta; r__1 = b / 2; r__2 = -b / 100; f = bli_slamc3(&r__1, &r__2); c__ = bli_slamc3(&f, &a); if (c__ == a) { lrnd = TRUE_; } else { lrnd = FALSE_; } r__1 = b / 2; r__2 = b / 100; f = bli_slamc3(&r__1, &r__2); c__ = bli_slamc3(&f, &a); if (lrnd && c__ == a) { lrnd = FALSE_; } /* Try and decide whether rounding is done in the IEEE 'round to */ /* nearest' style. B/2 is half a unit in the last place of the two */ /* numbers A and SAVEC. Furthermore, A is even, i.e. has last bit */ /* zero, and SAVEC is odd. Thus adding B/2 to A should not change */ /* A, but adding B/2 to SAVEC should change SAVEC. */ r__1 = b / 2; t1 = bli_slamc3(&r__1, &a); r__1 = b / 2; t2 = bli_slamc3(&r__1, &savec); lieee1 = t1 == a && t2 > savec && lrnd; /* Now find the mantissa, t. It should be the bla_integer part of */ /* log to the base beta of a, however it is safer to determine t */ /* by powering. So we find t as the smallest positive bla_integer for */ /* which */ /* fl( beta**t + 1.0 ) = 1.0. 
*/ lt = 0; a = (float)1.; c__ = (float)1.; /* + WHILE( C.EQ.ONE )LOOP */ L30: if (c__ == one) { ++lt; a *= lbeta; c__ = bli_slamc3(&a, &one); r__1 = -a; c__ = bli_slamc3(&c__, &r__1); goto L30; } /* + END WHILE */ } *beta = lbeta; *t = lt; *rnd = lrnd; *ieee1 = lieee1; first = FALSE_; return 0; /* End of SLAMC1 */ } /* bli_slamc1_ */ /* *********************************************************************** */ /* Subroutine */ int bli_slamc2(bla_integer *beta, bla_integer *t, bla_logical *rnd, bla_real * eps, bla_integer *emin, bla_real *rmin, bla_integer *emax, bla_real *rmax) { /* Initialized data */ static bla_logical first = TRUE_; static bla_logical iwarn = FALSE_; /* Format strings */ static bla_character fmt_9999[] = "(//\002 WARNING. The value EMIN may be incorre\ ct:-\002,\002 EMIN = \002,i8,/\002 If, after inspection, the value EMIN loo\ ks\002,\002 acceptable please comment out \002,/\002 the IF block as marked \ within the code of routine\002,\002 SLAMC2,\002,/\002 otherwise supply EMIN \ explicitly.\002,/)"; /* System generated locals */ bla_integer i__1; bla_real r__1, r__2, r__3, r__4, r__5; /* Builtin functions */ double bli_pow_ri(bla_real *, bla_integer *); //bla_integer s_wsfe(cilist *), do_fio(bla_integer *, bla_character *, ftnlen), e_wsfe(); /* Local variables */ static bla_logical ieee; static bla_real half; static bla_logical lrnd; static bla_real leps, zero, a, b, c__; static bla_integer i__, lbeta; static bla_real rbase; static bla_integer lemin, lemax, gnmin; static bla_real smnum; static bla_integer gpmin; static bla_real third, lrmin, lrmax, sixth; static bla_logical lieee1; extern /* Subroutine */ int bli_slamc1(bla_integer *, bla_integer *, bla_logical *, bla_logical *); extern bla_real bli_slamc3(bla_real *, bla_real *); extern /* Subroutine */ int bli_slamc4(bla_integer *, bla_real *, bla_integer *), bli_slamc5(bla_integer *, bla_integer *, bla_integer *, bla_logical *, bla_integer *, bla_real *); static bla_integer lt, ngnmin, 
ngpmin; static bla_real one, two; /* Fortran I/O blocks */ //static cilist io___58 = { 0, 6, 0, fmt_9999, 0 }; /* -- LAPACK auxiliary routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* SLAMC2 determines the machine parameters specified in its argument */ /* list. */ /* Arguments */ /* ========= */ /* BETA (output) INTEGER */ /* The base of the machine. */ /* T (output) INTEGER */ /* The number of ( BETA ) digits in the mantissa. */ /* RND (output) LOGICAL */ /* Specifies whether proper rounding ( RND = .TRUE. ) or */ /* chopping ( RND = .FALSE. ) occurs in addition. This may not */ /* be a reliable guide to the way in which the machine performs */ /* its arithmetic. */ /* EPS (output) REAL */ /* The smallest positive number such that */ /* fl( 1.0 - EPS ) .LT. 1.0, */ /* where fl denotes the computed value. */ /* EMIN (output) INTEGER */ /* The minimum exponent before (gradual) underflow occurs. */ /* RMIN (output) REAL */ /* The smallest normalized number for the machine, given by */ /* BASE**( EMIN - 1 ), where BASE is the floating point value */ /* of BETA. */ /* EMAX (output) INTEGER */ /* The maximum exponent before overflow occurs. */ /* RMAX (output) REAL */ /* The largest positive number for the machine, given by */ /* BASE**EMAX * ( 1 - EPS ), where BASE is the floating point */ /* value of BETA. */ /* Further Details */ /* =============== */ /* The computation of EPS is based on a routine PARANOIA by */ /* W. Kahan of the University of California at Berkeley. */ /* ===================================================================== */ /* .. Local Scalars .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. External Subroutines .. */ /* .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Save statement .. */ /* .. */ /* .. Data statements .. */ /* .. */ /* .. Executable Statements .. 
*/ if (first) { zero = (float)0.; one = (float)1.; two = (float)2.; /* LBETA, LT, LRND, LEPS, LEMIN and LRMIN are the local values of */ /* BETA, T, RND, EPS, EMIN and RMIN. */ /* Throughout this routine we use the function SLAMC3 to ensure */ /* that relevant values are stored and not held in registers, or */ /* are not affected by optimizers. */ /* SLAMC1 returns the parameters LBETA, LT, LRND and LIEEE1. */ bli_slamc1(&lbeta, <, &lrnd, &lieee1); /* Start to find EPS. */ b = (bla_real) lbeta; i__1 = -lt; a = bli_pow_ri(&b, &i__1); leps = a; /* Try some tricks to see whether or not this is the correct EPS. */ b = two / 3; half = one / 2; r__1 = -half; sixth = bli_slamc3(&b, &r__1); third = bli_slamc3(&sixth, &sixth); r__1 = -half; b = bli_slamc3(&third, &r__1); b = bli_slamc3(&b, &sixth); b = f2c_abs(b); if (b < leps) { b = leps; } leps = (float)1.; /* + WHILE( ( LEPS.GT.B ).AND.( B.GT.ZERO ) )LOOP */ L10: if (leps > b && b > zero) { leps = b; r__1 = half * leps; /* Computing 5th power */ r__3 = two, r__4 = r__3, r__3 *= r__3; /* Computing 2nd power */ r__5 = leps; r__2 = r__4 * (r__3 * r__3) * (r__5 * r__5); c__ = bli_slamc3(&r__1, &r__2); r__1 = -c__; c__ = bli_slamc3(&half, &r__1); b = bli_slamc3(&half, &c__); r__1 = -b; c__ = bli_slamc3(&half, &r__1); b = bli_slamc3(&half, &c__); goto L10; } /* + END WHILE */ if (a < leps) { leps = a; } /* Computation of EPS complete. */ /* Now find EMIN. Let A = + or - 1, and + or - (1 + BASE**(-3)). */ /* Keep dividing A by BETA until (gradual) underflow occurs. This */ /* is detected when we cannot recover the previous A. 
*/ rbase = one / lbeta; smnum = one; for (i__ = 1; i__ <= 3; ++i__) { r__1 = smnum * rbase; smnum = bli_slamc3(&r__1, &zero); /* L20: */ } a = bli_slamc3(&one, &smnum); bli_slamc4(&ngpmin, &one, &lbeta); r__1 = -one; bli_slamc4(&ngnmin, &r__1, &lbeta); bli_slamc4(&gpmin, &a, &lbeta); r__1 = -a; bli_slamc4(&gnmin, &r__1, &lbeta); ieee = FALSE_; if (ngpmin == ngnmin && gpmin == gnmin) { if (ngpmin == gpmin) { lemin = ngpmin; /* ( Non twos-complement machines, no gradual underflow; */ /* e.g., VAX ) */ } else if (gpmin - ngpmin == 3) { lemin = ngpmin - 1 + lt; ieee = TRUE_; /* ( Non twos-complement machines, with gradual underflow; */ /* e.g., IEEE standard followers ) */ } else { lemin = f2c_min(ngpmin,gpmin); /* ( A guess; no known machine ) */ iwarn = TRUE_; } } else if (ngpmin == gpmin && ngnmin == gnmin) { if ((i__1 = ngpmin - ngnmin, f2c_abs(i__1)) == 1) { lemin = f2c_max(ngpmin,ngnmin); /* ( Twos-complement machines, no gradual underflow; */ /* e.g., CYBER 205 ) */ } else { lemin = f2c_min(ngpmin,ngnmin); /* ( A guess; no known machine ) */ iwarn = TRUE_; } } else if ((i__1 = ngpmin - ngnmin, f2c_abs(i__1)) == 1 && gpmin == gnmin) { if (gpmin - f2c_min(ngpmin,ngnmin) == 3) { lemin = f2c_max(ngpmin,ngnmin) - 1 + lt; /* ( Twos-complement machines with gradual underflow; */ /* no known machine ) */ } else { lemin = f2c_min(ngpmin,ngnmin); /* ( A guess; no known machine ) */ iwarn = TRUE_; } } else { /* Computing MIN */ i__1 = f2c_min(ngpmin,ngnmin), i__1 = f2c_min(i__1,gpmin); lemin = f2c_min(i__1,gnmin); /* ( A guess; no known machine ) */ iwarn = TRUE_; } first = FALSE_; /* ** */ /* Comment out this if block if EMIN is ok */ if (iwarn) { first = TRUE_; /* s_wsfe(&io___58); do_fio(&c__1, (bla_character *)&lemin, (ftnlen)sizeof(bla_integer)); e_wsfe(); */ printf( "%s", fmt_9999 ); } /* ** */ /* Assume IEEE arithmetic if we found denormalised numbers above, */ /* or if arithmetic seems to round in the IEEE style, determined */ /* in routine SLAMC1. 
A true IEEE machine should have both things */ /* true; however, faulty machines may have one or the other. */ ieee = ieee || lieee1; /* Compute RMIN by successive division by BETA. We could compute */ /* RMIN as BASE**( EMIN - 1 ), but some machines underflow during */ /* this computation. */ lrmin = (float)1.; i__1 = 1 - lemin; for (i__ = 1; i__ <= i__1; ++i__) { r__1 = lrmin * rbase; lrmin = bli_slamc3(&r__1, &zero); /* L30: */ } /* Finally, call SLAMC5 to compute EMAX and RMAX. */ bli_slamc5(&lbeta, <, &lemin, &ieee, &lemax, &lrmax); } *beta = lbeta; *t = lt; *rnd = lrnd; *eps = leps; *emin = lemin; *rmin = lrmin; *emax = lemax; *rmax = lrmax; return 0; /* End of SLAMC2 */ } /* bli_slamc2_ */ /* *********************************************************************** */ bla_real bli_slamc3(bla_real *a, bla_real *b) { /* System generated locals */ bla_real ret_val; /* -- LAPACK auxiliary routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* SLAMC3 is intended to force A and B to be stored prior to doing */ /* the addition of A and B , for use in situations where optimizers */ /* might hold one of these in a register. */ /* Arguments */ /* ========= */ /* A (input) REAL */ /* B (input) REAL */ /* The values A and B. */ /* ===================================================================== */ /* .. Executable Statements .. 
*/ ret_val = *a + *b; return ret_val; /* End of SLAMC3 */ } /* bli_slamc3_ */ /* *********************************************************************** */ /* Subroutine */ int bli_slamc4(bla_integer *emin, bla_real *start, bla_integer *base) { /* System generated locals */ bla_integer i__1; bla_real r__1; /* Local variables */ static bla_real zero, a; static bla_integer i__; static bla_real rbase, b1, b2, c1, c2, d1, d2; extern bla_real bli_slamc3(bla_real *, bla_real *); static bla_real one; /* -- LAPACK auxiliary routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* SLAMC4 is a service routine for SLAMC2. */ /* Arguments */ /* ========= */ /* EMIN (output) INTEGER */ /* The minimum exponent before (gradual) underflow, computed by */ /* setting A = START and dividing by BASE until the previous A */ /* can not be recovered. */ /* START (input) REAL */ /* The starting point for determining EMIN. */ /* BASE (input) INTEGER */ /* The base of the machine. */ /* ===================================================================== */ /* .. Local Scalars .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. Executable Statements .. */ a = *start; one = (float)1.; rbase = one / *base; zero = (float)0.; *emin = 1; r__1 = a * rbase; b1 = bli_slamc3(&r__1, &zero); c1 = a; c2 = a; d1 = a; d2 = a; /* + WHILE( ( C1.EQ.A ).AND.( C2.EQ.A ).AND. 
*/ /* $ ( D1.EQ.A ).AND.( D2.EQ.A ) )LOOP */ L10: if (c1 == a && c2 == a && d1 == a && d2 == a) { --(*emin); a = b1; r__1 = a / *base; b1 = bli_slamc3(&r__1, &zero); r__1 = b1 * *base; c1 = bli_slamc3(&r__1, &zero); d1 = zero; i__1 = *base; for (i__ = 1; i__ <= i__1; ++i__) { d1 += b1; /* L20: */ } r__1 = a * rbase; b2 = bli_slamc3(&r__1, &zero); r__1 = b2 / rbase; c2 = bli_slamc3(&r__1, &zero); d2 = zero; i__1 = *base; for (i__ = 1; i__ <= i__1; ++i__) { d2 += b2; /* L30: */ } goto L10; } /* + END WHILE */ return 0; /* End of SLAMC4 */ } /* bli_slamc4_ */ /* *********************************************************************** */ /* Subroutine */ int bli_slamc5(bla_integer *beta, bla_integer *p, bla_integer *emin, bla_logical *ieee, bla_integer *emax, bla_real *rmax) { /* System generated locals */ bla_integer i__1; bla_real r__1; /* Local variables */ static bla_integer lexp; static bla_real oldy; static bla_integer uexp, i__; static bla_real y, z__; static bla_integer nbits; extern bla_real bli_slamc3(bla_real *, bla_real *); static bla_real recbas; static bla_integer exbits, expsum, try__; /* -- LAPACK auxiliary routine (version 3.2) -- */ /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */ /* November 2006 */ /* .. Scalar Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* SLAMC5 attempts to compute RMAX, the largest machine floating-point */ /* number, without overflow. It assumes that EMAX + f2c_abs(EMIN) sum */ /* approximately to a power of 2. It will fail on machines where this */ /* assumption does not hold, for example, the Cyber 205 (EMIN = -28625, */ /* EMAX = 28718). It will also fail if the value supplied for EMIN is */ /* too large (i.e. too close to zero), probably with overflow. */ /* Arguments */ /* ========= */ /* BETA (input) INTEGER */ /* The base of floating-point arithmetic. */ /* P (input) INTEGER */ /* The number of base BETA digits in the mantissa of a */ /* floating-point value. 
*/ /* EMIN (input) INTEGER */ /* The minimum exponent before (gradual) underflow. */ /* IEEE (input) LOGICAL */ /* A bla_logical flag specifying whether or not the arithmetic */ /* system is thought to comply with the IEEE standard. */ /* EMAX (output) INTEGER */ /* The largest exponent before overflow */ /* RMAX (output) REAL */ /* The largest machine floating-point number. */ /* ===================================================================== */ /* .. Parameters .. */ /* .. */ /* .. Local Scalars .. */ /* .. */ /* .. External Functions .. */ /* .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* First compute LEXP and UEXP, two powers of 2 that bound */ /* f2c_abs(EMIN). We then assume that EMAX + f2c_abs(EMIN) will sum */ /* approximately to the bound that is closest to f2c_abs(EMIN). */ /* (EMAX is the exponent of the required number RMAX). */ lexp = 1; exbits = 1; L10: try__ = lexp << 1; if (try__ <= -(*emin)) { lexp = try__; ++exbits; goto L10; } if (lexp == -(*emin)) { uexp = lexp; } else { uexp = try__; ++exbits; } /* Now -LEXP is less than or equal to EMIN, and -UEXP is greater */ /* than or equal to EMIN. EXBITS is the number of bits needed to */ /* store the exponent. */ if (uexp + *emin > -lexp - *emin) { expsum = lexp << 1; } else { expsum = uexp << 1; } /* EXPSUM is the exponent range, approximately equal to */ /* EMAX - EMIN + 1 . */ *emax = expsum + *emin - 1; nbits = exbits + 1 + *p; /* NBITS is the total number of bits needed to store a */ /* floating-point number. */ if (nbits % 2 == 1 && *beta == 2) { /* Either there are an odd number of bits used to store a */ /* floating-point number, which is unlikely, or some bits are */ /* not used in the representation of numbers, which is possible, */ /* (e.g. Cray machines) or the mantissa has an implicit bit, */ /* (e.g. IEEE machines, Dec Vax machines), which is perhaps the */ /* most likely. We have to assume the last alternative. 
*/ /* If this is true, then we need to reduce EMAX by one because */ /* there must be some way of representing zero in an implicit-bit */ /* system. On machines like Cray, we are reducing EMAX by one */ /* unnecessarily. */ --(*emax); } if (*ieee) { /* Assume we are on an IEEE machine which reserves one exponent */ /* for infinity and NaN. */ --(*emax); } /* Now create RMAX, the largest machine number, which should */ /* be equal to (1.0 - BETA**(-P)) * BETA**EMAX . */ /* First compute 1.0 - BETA**(-P), being careful that the */ /* result is less than 1.0 . */ recbas = (float)1. / *beta; z__ = *beta - (float)1.; y = (float)0.; i__1 = *p; for (i__ = 1; i__ <= i__1; ++i__) { z__ *= recbas; if (y < (float)1.) { oldy = y; } y = bli_slamc3(&y, &z__); /* L20: */ } if (y >= (float)1.) { y = oldy; } /* Now multiply by BETA**EMAX to get RMAX. */ i__1 = *emax; for (i__ = 1; i__ <= i__1; ++i__) { r__1 = y * *beta; y = bli_slamc3(&r__1, &c_b32); /* L30: */ } *rmax = y; return 0; /* End of SLAMC5 */ } /* bli_slamc5_ */ #else bla_real bli_slamch(bla_character *cmach, ftnlen cmach_len) { /* = 'E' or 'e', SLAMCH := eps */ /* = 'S' or 's , SLAMCH := sfmin */ /* = 'B' or 'b', SLAMCH := base */ /* = 'P' or 'p', SLAMCH := eps*base */ /* = 'N' or 'n', SLAMCH := t */ /* = 'R' or 'r', SLAMCH := rnd */ /* = 'M' or 'm', SLAMCH := emin */ /* = 'U' or 'u', SLAMCH := rmin */ /* = 'L' or 'l', SLAMCH := emax */ /* = 'O' or 'o', SLAMCH := rmax */ /* where */ /* eps = relative machine precision */ /* sfmin = safe minimum, such that 1/sfmin does not overflow */ /* base = base of the machine */ /* prec = eps*base */ /* t = number of (base) digits in the mantissa */ /* rnd = 1.0 when rounding occurs in addition, 0.0 otherwise */ /* emin = minimum exponent before (gradual) underflow */ /* rmin = underflow threshold - base**(emin-1) */ /* emax = largest exponent before overflow */ /* rmax = overflow threshold - (base**emax)*(1-eps) */ float safe_min = FLT_MIN; float small = 1.0f / FLT_MAX; if ( small 
>= safe_min ) safe_min = small * ( 1.0f + FLT_EPSILON ); switch ( toupper( *cmach ) ) { case 'E': return FLT_EPSILON; case 'S': return safe_min; case 'B': return FLT_RADIX; case 'P': return FLT_RADIX*FLT_EPSILON; case 'N': return FLT_MANT_DIG; case 'R': return FLT_ROUNDS == FE_TONEAREST ? 1.0f : 0.0f; case 'M': return FLT_MIN_EXP; case 'U': return FLT_MIN; case 'L': return FLT_MAX_EXP; case 'O': return FLT_MAX; } return 0.0f; } #endif #ifdef __cplusplus } #endif cython-blis-0.9.1/blis/_src/frame/base/noopt/bli_slamch.h000066400000000000000000000033101427272030600232440ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len ); cython-blis-0.9.1/blis/_src/frame/base/proj/000077500000000000000000000000001427272030600206145ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/base/proj/bli_projm.c000066400000000000000000000071161427272030600227420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_projm ( obj_t* a, obj_t* b ) { // Check parameters. if ( bli_error_checking_is_enabled() ) bli_projm_check( a, b ); if ( ( bli_obj_is_real( a ) && bli_obj_is_real( b ) ) || ( bli_obj_is_complex( a ) && bli_obj_is_complex( b ) ) ) { // If a and b are both real or both complex, we can simply use // copym. bli_copym( a, b ); } else { // This branch handles the case where one operand is real and // the other is complex. if ( bli_obj_is_real( a ) /* && bli_obj_is_complex( b ) */ ) { // If a is real and b is complex, we must obtain the real part // of b so that we can copy a into the real part (after // initializing all of b, including imaginary components, to // zero). obj_t br; bli_obj_real_part( b, &br ); bli_setm( &BLIS_ZERO, b ); bli_copym( a, &br ); } else // bli_obj_is_complex( a ) && bli_obj_is_real( b ) { // If a is complex and b is real, we can simply copy the // real part of a into b. obj_t ar; bli_obj_real_part( a, &ar ); bli_copym( &ar, b ); } } } // ----------------------------------------------------------------------------- void bli_projm_check ( obj_t* a, obj_t* b ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( a ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( b ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_precisions( a, b ); bli_check_error_code( e_val ); // Check object dimensions. 
e_val = bli_check_matrix_object( a ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( b ); bli_check_error_code( e_val ); e_val = bli_check_conformal_dims( a, b ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( a ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( b ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/base/proj/bli_projm.h000066400000000000000000000034401427272030600227430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); cython-blis-0.9.1/blis/_src/frame/base/proj/bli_projv.c000066400000000000000000000071241427272030600227520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_projv ( obj_t* x, obj_t* y ) { // Check parameters. if ( bli_error_checking_is_enabled() ) bli_projv_check( x, y ); if ( ( bli_obj_is_real( x ) && bli_obj_is_real( y ) ) || ( bli_obj_is_complex( x ) && bli_obj_is_complex( y ) ) ) { // If x and y are both real or both complex, we can simply use // copyv. bli_copyv( x, y ); } else { // This branch handles the case where one operand is real and // the other is complex. if ( bli_obj_is_real( x ) /* && bli_obj_is_complex( y ) */ ) { // If x is real and y is complex, we must obtain the real part // of y so that we can copy x into the real part (after // initializing all of y, including imaginary components, to // zero). obj_t yr; bli_obj_real_part( y, &yr ); bli_setv( &BLIS_ZERO, y ); bli_copyv( x, &yr ); } else // bli_obj_is_complex( x ) && bli_obj_is_real( y ) { // If x is complex and y is real, we can simply copy the // real part of x into y. obj_t xr; bli_obj_real_part( x, &xr ); bli_copyv( &xr, y ); } } } // ----------------------------------------------------------------------------- void bli_projv_check ( obj_t* x, obj_t* y ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_precisions( x, y ); bli_check_error_code( e_val ); // Check object dimensions. 
e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( x, y ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/base/proj/bli_projv.h000066400000000000000000000034401427272030600227540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Projection of vector object x into vector object y. This is the only
   declaration here that carries the BLIS_EXPORT_BLIS qualifier
   (presumably the library's symbol-export annotation -- confirm against
   its definition). */
BLIS_EXPORT_BLIS void bli_projv
     (
       obj_t* x,
       obj_t* y
     );

/* Operand validation for bli_projv(); declared without the export
   qualifier. */
void bli_projv_check
     (
       obj_t* x,
       obj_t* y
     );
cython-blis-0.9.1/blis/_src/frame/base/proj/old/000077500000000000000000000000001427272030600213725ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/base/proj/old/bli_proj_check.c000066400000000000000000000063261427272030600245020ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_projm_check ( obj_t* a, obj_t* b ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( a ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( b ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_precisions( a, b ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_matrix_object( a ); bli_check_error_code( e_val ); e_val = bli_check_matrix_object( b ); bli_check_error_code( e_val ); e_val = bli_check_conformal_dims( a, b ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( a ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( b ); bli_check_error_code( e_val ); } void bli_projv_check ( obj_t* x, obj_t* y ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_floating_object( y ); bli_check_error_code( e_val ); e_val = bli_check_consistent_object_precisions( x, y ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_vector_object( y ); bli_check_error_code( e_val ); e_val = bli_check_equal_vector_lengths( x, y ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). 
e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( y ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/base/proj/old/bli_proj_check.h000066400000000000000000000034241427272030600245030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

/* Operand validation for a pair of matrix objects (a, b). */
void bli_projm_check
     (
       obj_t* a,
       obj_t* b
     );

/* Operand validation for a pair of vector objects (x, y). */
void bli_projv_check
     (
       obj_t* x,
       obj_t* y
     );
cython-blis-0.9.1/blis/_src/frame/compat/000077500000000000000000000000001427272030600202135ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/compat/amd/000077500000000000000000000000001427272030600207545ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/compat/amd/bla_copy_amd.c000066400000000000000000000111161427272030600235310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2022, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname, isuf ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ) \ { \ dim_t n0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Initialize BLIS. */ \ /*bli_init_auto()*/; \ \ /* Convert/typecast negative values of n to zero. */ \ bli_convert_blas_dim1( *n, n0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \ bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ \ /* Call BLIS interface. */ \ /* NOTE: While we skip explicit initialization for real domain instances since we call the microkernel directly, the complex domain instances still need initialization so that they can query valid contexts from gks. However, the expert API will self-initialize before attempting to query a context, so the complex domain cases should work fine. */ \ PASTEMAC2(ch,blisname,isuf) \ ( \ BLIS_NO_CONJUGATE, \ n0, \ x0, incx0, \ y0, incy0, \ NULL \ ); \ \ /* Finalize BLIS. 
*/ \ /*bli_finalize_auto();*/ \ } #ifdef BLIS_ENABLE_BLAS //INSERT_GENTFUNC_BLAS( copy, copyv ) GENTFUNC( float, s, copy, copyv, _zen_int ) GENTFUNC( double, d, copy, copyv, _zen_int ) #endif #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname, isuf ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ) \ { \ dim_t n0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Initialize BLIS. */ \ /*bli_init_auto()*/; \ \ /* Convert/typecast negative values of n to zero. */ \ bli_convert_blas_dim1( *n, n0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \ bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ \ /* Call BLIS interface. */ \ /* NOTE: While we skip explicit initialization for real domain instances since we call the microkernel directly, the complex domain instances still need initialization so that they can query valid contexts from gks. However, the expert API will self-initialize before attempting to query a context, so the complex domain cases should work fine. */ \ PASTEMAC2(ch,blisname,isuf) \ ( \ BLIS_NO_CONJUGATE, \ n0, \ x0, incx0, \ y0, incy0, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ /*bli_finalize_auto();*/ \ } #ifdef BLIS_ENABLE_BLAS //INSERT_GENTFUNC_BLAS( copy, copyv ) GENTFUNC( scomplex, c, copy, copyv, _ex ) GENTFUNC( dcomplex, z, copy, copyv, _ex ) #endif cython-blis-0.9.1/blis/_src/frame/compat/amd/bla_gemv_amd.c000066400000000000000000000130661427272030600235230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2022, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC generates the Fortran-77-style ?gemv entry point. The generated
// function proceeds in this order:
//   1. BLAS parameter checking via PASTEBLACHK(blasname).
//   2. Immediate return, without touching y, when *m == 0 or *n == 0.
//   3. Conversion of transa, m, n and the x/y increments to BLIS form.
//   4. When alpha equals zero: y is scaled by beta and the function returns.
//   5. Otherwise an unblocked fused variant is called directly
//      (gemv_unf_var2 for the no-transpose case, gemv_unf_var1 otherwise)
//      with a context obtained from bli_gks_query_cntx(). A is passed with
//      row stride 1 and column stride *lda.
//
#undef  GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    x, const f77_int* incx, \
       const ftype*    beta, \
             ftype*    y, const f77_int* incy \
     ) \
{ \
	trans_t blis_transa; \
	dim_t   m0, n0; \
	dim_t   m_y, n_x; \
	ftype*  x0; \
	ftype*  y0; \
	inc_t   incx0; \
	inc_t   incy0; \
\
	/* Initialize BLIS. (The call is deliberately left commented out.) */ \
	/*bli_init_auto();*/ \
\
	/* Perform BLAS parameter checking. */ \
	PASTEBLACHK(blasname) \
	( \
	  MKSTR(ch), \
	  MKSTR(blasname), \
	  transa, \
	  m, \
	  n, \
	  lda, \
	  incx, \
	  incy \
	); \
\
	/* BLAS handles cases where y has no elements as well as those where x */ \
	/* has no elements. In the case of the former, it cannot do any work */ \
	/* since the output vector is empty; but in the latter case, BLAS has */ \
	/* peculiar semantics. When x has no elements (and transa(A) has no */ \
	/* columns), BLAS returns immediately without performing any */ \
	/* computation even if the number of elements of y (and rows of */ \
	/* transa(A)) is non-zero, in which case any sane interpretation of */ \
	/* gemv would have the operation reduce to y := beta * y. Here, we */ \
	/* emulate the BLAS exactly so as to provide "bug-for-bug" */ \
	/* compatibility. Note that this extreme level of compatibility would */ \
	/* not be contemplated if it weren't for the fact that some BLAS unit */ \
	/* tests actually check for this behavior. Also, it should be */ \
	/* emphasized that BLIS, when called natively, does NOT exhibit this */ \
	/* quirky behavior; it will scale y by beta as one would expect. */ \
	if ( *m == 0 || *n == 0 ) \
	{ \
		/* Finalize BLIS. */ \
		/*bli_finalize_auto();*/ \
\
		return; \
	} \
\
	/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
	bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
\
	/* Convert/typecast negative values of m and n to zero. */ \
	bli_convert_blas_dim1( *m, m0 ); \
	bli_convert_blas_dim1( *n, n0 ); \
\
	/* Determine the dimensions of x and y so we can adjust the increments, */ \
	/* if necessary. */ \
	bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
\
	/* If the input increments are negative, adjust the pointers so we can */ \
	/* use positive increments instead. */ \
	bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \
	bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \
\
	/* If alpha is zero, scale y by beta and return early. */ \
	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
	{ \
		PASTEMAC2(ch,scalv,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  m_y, \
		  ( ftype* )beta, \
		  ( ftype* )y0, incy0, \
		  NULL, \
		  NULL \
		); \
		return; \
	} \
\
	/* Set the row and column strides of A: unit row stride, with *lda */ \
	/* as the column stride. */ \
	const inc_t rs_a = 1; \
	const inc_t cs_a = *lda; \
\
	/* Declare a void function pointer for the current operation. */ \
	PASTECH2(ch,blisname,_unb_ft) f; \
\
	/* Choose the underlying implementation. */ \
	if ( bli_does_notrans( blis_transa ) ) f = PASTEMAC(ch,gemv_unf_var2); \
	else /* if ( bli_does_trans( blis_transa ) ) */ f = PASTEMAC(ch,gemv_unf_var1); \
\
	/* Obtain a valid context from the gks. This is needed because these */ \
	/* implementations of ?gemv_() skip calling gemv_ex() and instead call */ \
	/* the unblocked fused variants directly. */ \
	cntx_t* cntx = bli_gks_query_cntx(); \
\
	/* Invoke the variant chosen above, which loops over a level-1v or */ \
	/* level-1f kernel to implement the current operation. */ \
	f \
	( \
	  blis_transa, \
	  BLIS_NO_CONJUGATE, \
	  m0, \
	  n0, \
	  (ftype*)alpha, \
	  (ftype*)a, rs_a, cs_a, \
	  x0, incx0, \
	  (ftype*)beta, \
	  y0, incy0, \
	  cntx \
	); \
\
	/* Finalize BLIS. (The call is deliberately left commented out.) */ \
	/*bli_finalize_auto();*/ \
}

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( gemv, gemv )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/attic/000077500000000000000000000000001427272030600213175ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_gbmv.c000066400000000000000000000063111427272030600232350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname)( \ f77_char* transa, \ f77_int* m, \ f77_int* n, \ f77_int* kl, \ f77_int* ku, \ ftype* alpha, \ ftype* a, f77_int* lda, \ ftype* x, f77_int* incx, \ ftype* beta, \ ftype* y, f77_int* incy \ ) \ { \ trans_t blis_transa; \ dim_t m0, n0; \ dim_t m_y, n_x; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ \ /* Convert/typecast negative values of m and n to zero. 
*/ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *n, n0 ); \ \ /* Determine the dimensions of x and y so we can adjust the increments, if necessary.*/ \ bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( n_x, x, *incx, x0, incx0 ); \ bli_convert_blas_incv( m_y, y, *incy, y0, incy0 ); \ \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( gbmv, gbmv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_gbmv.h000066400000000000000000000045451427272030600232510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT expands to the declaration of the ?gbmv entry point whose name
// is formed by PASTEF77(ch,blasname); ftype is the element type used for
// alpha, a, x, beta and y. The prototypes are only emitted when
// BLIS_ENABLE_BLAS is defined. INSERT_GENTPROT_BLAS presumably
// instantiates GENTPROT once per supported datatype -- confirm against its
// definition.
//
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
       f77_char* transa, \
       f77_int*  m, \
       f77_int*  n, \
       f77_int*  kl, \
       f77_int*  ku, \
       ftype*    alpha, \
       ftype*    a, f77_int* lda, \
       ftype*    x, f77_int* incx, \
       ftype*    beta, \
       ftype*    y, f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gbmv )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_hbmv.c000066400000000000000000000056331427272030600232440ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCCO #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \ \ void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_int* m, \ f77_int* k, \ ftype* alpha, \ ftype* a, f77_int* lda, \ ftype* x, f77_int* incx, \ ftype* beta, \ ftype* y, f77_int* incy \ ) \ { \ uplo_t blis_uploa; \ dim_t m0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Convert/typecast negative values of m to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, x, *incx, x0, incx0 ); \ bli_convert_blas_incv( m0, y, *incy, y0, incy0 ); \ \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCCO_BLAS( hbmv, hbmv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_hbmv.h000066400000000000000000000044401427272030600232440ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTCO expands to the declaration of the ?hbmv entry point whose
// name is formed by PASTEF77(ch,blasname); ftype is the element type used
// for alpha, a, x, beta and y. The ftype_r and chr macro parameters are
// not used by this prototype. The prototypes are only emitted when
// BLIS_ENABLE_BLAS is defined. INSERT_GENTPROTCO_BLAS presumably
// instantiates GENTPROTCO for the complex datatypes -- confirm against its
// definition.
//
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
       f77_char* uploa, \
       f77_int*  m, \
       f77_int*  k, \
       ftype*    alpha, \
       ftype*    a, f77_int* lda, \
       ftype*    x, f77_int* incx, \
       ftype*    beta, \
       ftype*    y, f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hbmv )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_hpmv.c000066400000000000000000000055421427272030600232610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCCO #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \ \ void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_int* m, \ ftype* alpha, \ ftype* a, \ ftype* x, f77_int* incx, \ ftype* beta, \ ftype* y, f77_int* incy \ ) \ { \ uplo_t blis_uploa; \ dim_t m0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Convert/typecast negative values of m to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, x, *incx, x0, incx0 ); \ bli_convert_blas_incv( m0, y, *incy, y0, incy0 ); \ \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCCO_BLAS( hpmv, hpmv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_hpmv.h000066400000000000000000000043471427272030600232700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTCO expands to the declaration of the ?hpmv entry point whose
// name is formed by PASTEF77(ch,blasname); ftype is the element type used
// for alpha, a, x, beta and y. Unlike the banded variant, this signature
// has no k or lda argument. The ftype_r and chr macro parameters are not
// used by this prototype. The prototypes are only emitted when
// BLIS_ENABLE_BLAS is defined. INSERT_GENTPROTCO_BLAS presumably
// instantiates GENTPROTCO for the complex datatypes -- confirm against its
// definition.
//
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
       f77_char* uploa, \
       f77_int*  m, \
       ftype*    alpha, \
       ftype*    a, \
       ftype*    x, f77_int* incx, \
       ftype*    beta, \
       ftype*    y, f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hpmv )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_hpr.c000066400000000000000000000052421427272030600230750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCCO #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \ \ void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_int* m, \ ftype_r* alpha, \ ftype* x, f77_int* incx, \ ftype* a \ ) \ { \ uplo_t blis_uploa; \ dim_t m0; \ ftype* x0; \ inc_t incx0; \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Convert/typecast negative values of m to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, x, *incx, x0, incx0 ); \ \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCCO_BLAS( hpr, hpr ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_hpr.h000066400000000000000000000041761427272030600231070ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_int* m, \ ftype_r* alpha, \ ftype* x, f77_int* incx, \ ftype* a \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hpr ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_hpr2.c000066400000000000000000000054641427272030600231650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCCO #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \ \ void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_int* m, \ ftype* alpha, \ ftype* x, f77_int* incx, \ ftype* y, f77_int* incy, \ ftype* a \ ) \ { \ uplo_t blis_uploa; \ dim_t m0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. 
*/ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Convert/typecast negative values of m to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, x, *incx, x0, incx0 ); \ bli_convert_blas_incv( m0, y, *incy, y0, incy0 ); \ \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCCO_BLAS( hpr2, hpr2 ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_hpr2.h000066400000000000000000000042711427272030600231650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_int* m, \ ftype* alpha, \ ftype* x, f77_int* incx, \ ftype* y, f77_int* incy, \ ftype* a \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hpr2 ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_rot.c000066400000000000000000000052451427272030600231130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCR2 #define GENTFUNCR2( ftype_xy, ftype_r, chxy, chr, blasname, blisname ) \ \ void PASTEF772(chxy,chr,blasname)( \ f77_int* n, \ ftype_xy* x, f77_int* incx, \ ftype_xy* y, f77_int* incy, \ ftype_r* c, \ ftype_r* s \ ) \ { \ dim_t n0; \ ftype_xy* x0; \ ftype_xy* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Convert/typecast negative values of n to zero. */ \ bli_convert_blas_dim1( *n, n0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( n0, x, *incx, x0, incx0 ); \ bli_convert_blas_incv( n0, y, *incy, y0, incy0 ); \ \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCR2_BLAS( rot, ROT_KERNEL ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_rot.h000066400000000000000000000042731427272030600231200ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTR2 #define GENTPROTR2( ftype_xy, ftype_r, chxy, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chxy,chr,blasname)( \ f77_int* n, \ ftype_xy* x, f77_int* incx, \ ftype_xy* y, f77_int* incy, \ ftype_r* c, \ ftype_r* s \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( rot ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_rotg.c000066400000000000000000000042471427272030600232630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCR #define GENTFUNCR( ftype_xy, ftype_r, chxy, chr, blasname, blisname ) \ \ void PASTEF77(chxy,blasname)( \ ftype_xy* x, \ ftype_xy* y, \ ftype_r* c, \ ftype_r* s \ ) \ { \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCR_BLAS( rotg, rotg, ROTG_KERNEL ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_rotg.h000066400000000000000000000041251427272030600232630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTR #define GENTPROTR( ftype_xy, ftype_r, chxy, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(chxy,blasname)( \ ftype_xy* x, \ ftype_xy* y, \ ftype_r* c, \ ftype_r* s \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR_BLAS( rotg, rotg ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_rotm.c000066400000000000000000000050521427272030600232640ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCRO #define GENTFUNCRO( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname)( \ f77_int* n, \ ftype* x, f77_int* incx, \ ftype* y, f77_int* incy, \ ftype* dparam \ ) \ { \ dim_t n0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Convert/typecast negative values of n to zero. */ \ bli_convert_blas_dim1( *n, n0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( n0, x, *incx, x0, incx0 ); \ bli_convert_blas_incv( n0, y, *incy, y0, incy0 ); \ \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCRO_BLAS( rotm, ROTM_KERNEL ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_rotm.h000066400000000000000000000041161427272030600232710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \ f77_int* n, \ ftype* x, f77_int* incx, \ ftype* y, f77_int* incy, \ ftype* dparam \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( rotm ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_rotmg.c000066400000000000000000000042531427272030600234350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCRO #define GENTFUNCRO( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname)( \ ftype* d1, \ ftype* d2, \ ftype* x, \ ftype* y, \ ftype* dparam \ ) \ { \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCRO_BLAS( rotmg, ROTMG_KERNEL ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_rotmg.h000066400000000000000000000041301427272030600234340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \ ftype* d1, \ ftype* d2, \ ftype* x, \ ftype* y, \ ftype* dparam \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( rotmg ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_sbmv.c000066400000000000000000000056151427272030600232570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCRO #define GENTFUNCRO( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_int* m, \ f77_int* k, \ ftype* alpha, \ ftype* a, f77_int* lda, \ ftype* x, f77_int* incx, \ ftype* beta, \ ftype* y, f77_int* incy \ ) \ { \ uplo_t blis_uploa; \ dim_t m0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Convert/typecast negative values of m to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, x, *incx, x0, incx0 ); \ bli_convert_blas_incv( m0, y, *incy, y0, incy0 ); \ \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCRO_BLAS( sbmv, sbmv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_sbmv.h000066400000000000000000000044221427272030600232570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_int* m, \ f77_int* k, \ ftype* alpha, \ ftype* a, f77_int* lda, \ ftype* x, f77_int* incx, \ ftype* beta, \ ftype* y, f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( sbmv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_spmv.c000066400000000000000000000055241427272030600232740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCRO #define GENTFUNCRO( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_int* m, \ ftype* alpha, \ ftype* a, \ ftype* x, f77_int* incx, \ ftype* beta, \ ftype* y, f77_int* incy \ ) \ { \ uplo_t blis_uploa; \ dim_t m0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Convert/typecast negative values of m to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, x, *incx, x0, incx0 ); \ bli_convert_blas_incv( m0, y, *incy, y0, incy0 ); \ \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCRO_BLAS( spmv, spmv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_spmv.h000066400000000000000000000043311427272030600232740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_int* m, \ ftype* alpha, \ ftype* a, \ ftype* x, f77_int* incx, \ ftype* beta, \ ftype* y, f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( spmv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_spr.c000066400000000000000000000052241427272030600231100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCRO #define GENTFUNCRO( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_int* m, \ ftype* alpha, \ ftype* x, f77_int* incx, \ ftype* a \ ) \ { \ uplo_t blis_uploa; \ dim_t m0; \ ftype* x0; \ inc_t incx0; \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Convert/typecast negative values of m to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, x, *incx, x0, incx0 ); \ \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCRO_BLAS( spr, spr ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_spr.h000066400000000000000000000041601427272030600231130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_int* m, \ ftype* alpha, \ ftype* x, f77_int* incx, \ ftype* a \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( spr ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_spr2.c000066400000000000000000000054461427272030600232000ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCRO #define GENTFUNCRO( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_int* m, \ ftype* alpha, \ ftype* x, f77_int* incx, \ ftype* y, f77_int* incy, \ ftype* a \ ) \ { \ uplo_t blis_uploa; \ dim_t m0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. 
*/ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Convert/typecast negative values of m to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, x, *incx, x0, incx0 ); \ bli_convert_blas_incv( m0, y, *incy, y0, incy0 ); \ \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCRO_BLAS( spr2, spr2 ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_spr2.h000066400000000000000000000042531427272030600232000ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_int* m, \ ftype* alpha, \ ftype* x, f77_int* incx, \ ftype* y, f77_int* incy, \ ftype* a \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( spr2 ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_tbmv.c000066400000000000000000000056451427272030600232630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_char* transa, \ f77_char* diaga, \ f77_int* m, \ f77_int* k, \ ftype* a, f77_int* lda, \ ftype* x, f77_int* incx \ ) \ { \ uplo_t blis_uploa; \ trans_t blis_transa; \ diag_t blis_diaga; \ dim_t m0; \ ftype* x0; \ inc_t incx0; \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); \ \ /* Convert/typecast negative values of m to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. 
*/ \ bli_convert_blas_incv( m0, x, *incx, x0, incx0 ); \ \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( tbmv, tbmv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_tbmv.h000066400000000000000000000043241427272030600232610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_char* transa, \ f77_char* diaga, \ f77_int* m, \ f77_int* k, \ ftype* a, f77_int* lda, \ ftype* x, f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( tbmv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_tbsv.c000066400000000000000000000056451427272030600232710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_char* transa, \ f77_char* diaga, \ f77_int* m, \ f77_int* k, \ ftype* a, f77_int* lda, \ ftype* x, f77_int* incx \ ) \ { \ uplo_t blis_uploa; \ trans_t blis_transa; \ diag_t blis_diaga; \ dim_t m0; \ ftype* x0; \ inc_t incx0; \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); \ \ /* Convert/typecast negative values of m to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, x, *incx, x0, incx0 ); \ \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( tbsv, tbsv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_tbsv.h000066400000000000000000000043241427272030600232670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_char* transa, \ f77_char* diaga, \ f77_int* m, \ f77_int* k, \ ftype* a, f77_int* lda, \ ftype* x, f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( tbsv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_tpmv.c000066400000000000000000000055541427272030600233000ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_char* transa, \ f77_char* diaga, \ f77_int* m, \ ftype* a, \ ftype* x, f77_int* incx \ ) \ { \ uplo_t blis_uploa; \ trans_t blis_transa; \ diag_t blis_diaga; \ dim_t m0; \ ftype* x0; \ inc_t incx0; \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); \ \ /* Convert/typecast negative values of m to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, x, *incx, x0, incx0 ); \ \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( tpmv, tpmv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_tpmv.h000066400000000000000000000042331427272030600232760ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \ f77_char* uploa, \ f77_char* transa, \ f77_char* diaga, \ f77_int* m, \ ftype* a, \ ftype* x, f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( tpmv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_tpsv.c000066400000000000000000000055541427272030600233060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC (tpsv): template for the BLAS-style `tpsv` entry point named
// PASTEF77(ch,blasname). All arguments are passed by pointer.
//
// The generated function maps the uplo/trans/diag characters to BLIS
// enumerated values, converts *m and (x, *incx) into m0 and (x0, incx0),
// and then unconditionally calls
// bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ). The matrix argument
// `a` is never read and none of the converted values are used afterwards,
// so this stub performs no computation.
//
// NOTE(review): INSERT_GENTFUNC_BLAS is defined elsewhere; presumably it
// instantiates GENTFUNC once per supported datatype -- confirm.
//
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname)( \
                            f77_char* uploa, \
                            f77_char* transa, \
                            f77_char* diaga, \
                            f77_int* m, \
                            ftype* a, \
                            ftype* x, f77_int* incx \
                          ) \
{ \
    uplo_t blis_uploa; \
    trans_t blis_transa; \
    diag_t blis_diaga; \
    dim_t m0; \
    ftype* x0; \
    inc_t incx0; \
\
    /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
    bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \
    bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
    bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); \
\
    /* Convert/typecast negative values of m to zero. */ \
    bli_convert_blas_dim1( *m, m0 ); \
\
    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */ \
    bli_convert_blas_incv( m0, x, *incx, x0, incx0 ); \
\
    bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \
}

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( tpsv, tpsv )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/attic/bla_tpsv.h000066400000000000000000000042331427272030600233040ustar00rootroot00000000000000
/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT (tpsv): prototype template for the `tpsv` entry point. It
// declares a BLIS_EXPORT_BLAS-qualified void function named
// PASTEF77(ch,blasname) with the same seven pointer parameters as the
// matching GENTFUNC definition in bla_tpsv.c.
//
// NOTE(review): INSERT_GENTPROT_BLAS is defined elsewhere; presumably it
// emits one prototype per supported datatype -- confirm.
//
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname)( \
                            f77_char* uploa, \
                            f77_char* transa, \
                            f77_char* diaga, \
                            f77_int* m, \
                            ftype* a, \
                            ftype* x, f77_int* incx \
                          );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( tpsv )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_amax.c000066400000000000000000000061371427272030600221320ustar00rootroot00000000000000
/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC (amax): template for the BLAS-style index-of-max entry point
// named PASTEF772(i,chx,blasname).
//
// Parameters of the generated function:
//   n    - pointer to the vector length.
//   x    - pointer to the input vector (const is cast away before the
//          pointer is handed to the BLIS call).
//   incx - pointer to the vector increment.
//
// Returns 0 immediately when *n < 1 or *incx <= 0. Otherwise it calls the
// BLIS routine PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) and returns its
// zero-based result plus one, stored in an f77_int.
//
// Because the early return already rejects *incx <= 0, the
// negative-increment adjustment further down is never reached with a
// negative increment in this function.
//
// NOTE(review): INSERT_GENTFUNC_BLAS is defined elsewhere; presumably it
// instantiates GENTFUNC once per supported datatype -- confirm.
//
#undef GENTFUNC
#define GENTFUNC( ftype_x, chx, blasname, blisname ) \
\
f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     ) \
{ \
    dim_t n0; \
    ftype_x* x0; \
    inc_t incx0; \
    gint_t bli_index; \
    f77_int f77_index; \
\
    /* If the vector is empty, return an index of zero. This early check
       is needed to emulate netlib BLAS. Without it, bli_?amaxv() will
       return 0, which ends up getting incremented to 1 (below) before
       being returned, which is not what we want. */ \
    if ( *n < 1 || *incx <= 0 ) return 0; \
\
    /* Initialize BLIS. */ \
    bli_init_auto(); \
\
    /* Convert/typecast negative values of n to zero. */ \
    bli_convert_blas_dim1( *n, n0 ); \
\
    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */ \
    bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \
\
    /* Call BLIS interface. */ \
    PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      n0, \
      x0, incx0, \
      &bli_index, \
      NULL, \
      NULL \
    ); \
\
    /* Convert zero-based BLIS (C) index to one-based BLAS (Fortran)
       index. Also, if the BLAS integer size differs from the BLIS
       integer size, that typecast occurs here. */ \
    f77_index = bli_index + 1; \
\
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
\
    return f77_index; \
}

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( amax, amaxv )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_amax.h000066400000000000000000000037051427272030600221350ustar00rootroot00000000000000
/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
   IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
   OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
   ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT (amax): prototype template for the index-of-max entry point.
// It declares a BLIS_EXPORT_BLAS-qualified function named
// PASTEF772(i,chx,blasname) that takes const pointers to n, x and incx and
// returns an f77_int.
//
// NOTE(review): INSERT_GENTPROT_BLAS is defined elsewhere; presumably it
// emits one prototype per supported datatype -- confirm.
//
#undef GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_asum.c000066400000000000000000000051161427272030600221450ustar00rootroot00000000000000
/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
   IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
   OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
   ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNCR2 (asum): template for the BLAS-style `asum` entry point named
// PASTEF772(chr,chx,blasname). The input vector has element type ftype_x
// and the result is returned by value as ftype_r.
//
// The generated function initializes BLIS, converts *n and (x, *incx) to
// n0 and (x0, incx0), calls PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) with
// the address of the local `asum`, finalizes BLIS, and returns `asum`.
// The const qualifier on x is cast away before the BLIS call. There is no
// early return for empty vectors or non-positive increments here.
//
// NOTE(review): INSERT_GENTFUNCR2_BLAS is defined elsewhere; presumably it
// instantiates GENTFUNCR2 once per (input type, real result type) pair --
// confirm.
//
#undef GENTFUNCR2
#define GENTFUNCR2( ftype_x, ftype_r, chx, chr, blasname, blisname ) \
\
ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     ) \
{ \
    dim_t n0; \
    ftype_x* x0; \
    inc_t incx0; \
    ftype_r asum; \
\
    /* Initialize BLIS. */ \
    bli_init_auto(); \
\
    /* Convert/typecast negative values of n to zero. */ \
    bli_convert_blas_dim1( *n, n0 ); \
\
    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */ \
    bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \
\
    /* Call BLIS interface. */ \
    PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      n0, \
      x0, incx0, \
      &asum, \
      NULL, \
      NULL \
    ); \
\
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
\
    return asum; \
}

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCR2_BLAS( asum, asumv )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_asum.h000066400000000000000000000037331427272030600221550ustar00rootroot00000000000000
/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTR2 (asum): prototype template for the `asum` entry point. It
// declares a BLIS_EXPORT_BLAS-qualified function named
// PASTEF772(chr,chx,blasname) that takes const pointers to n, x and incx
// and returns ftype_r by value.
//
// NOTE(review): INSERT_GENTPROTR2_BLAS is defined elsewhere; presumably it
// emits one prototype per (input type, real result type) pair -- confirm.
//
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_axpy.c000066400000000000000000000053341427272030600221630ustar00rootroot00000000000000
/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.
Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC (axpy): template for the BLAS-style `axpy` entry point named
// PASTEF77(ch,blasname).
//
// Parameters of the generated function:
//   n       - pointer to the vector length.
//   alpha   - pointer to the scalar (const is cast away for the BLIS call).
//   x, incx - input vector and pointer to its increment (const is cast
//             away on x).
//   y, incy - vector handed to BLIS as a non-const pointer, and pointer to
//             its increment.
//
// The generated function initializes BLIS, converts *n and both
// (vector, increment) pairs, calls
// PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) with BLIS_NO_CONJUGATE as the
// first argument, and finalizes BLIS. It returns nothing.
//
// NOTE(review): INSERT_GENTFUNC_BLAS is defined elsewhere; presumably it
// instantiates GENTFUNC once per supported datatype -- confirm.
//
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
             ftype* y, const f77_int* incy \
     ) \
{ \
    dim_t n0; \
    ftype* x0; \
    ftype* y0; \
    inc_t incx0; \
    inc_t incy0; \
\
    /* Initialize BLIS. */ \
    bli_init_auto(); \
\
    /* Convert/typecast negative values of n to zero. */ \
    bli_convert_blas_dim1( *n, n0 ); \
\
    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */ \
    bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
    bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
    /* Call BLIS interface. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      BLIS_NO_CONJUGATE, \
      n0, \
      (ftype*)alpha, \
      x0, incx0, \
      y0, incy0, \
      NULL, \
      NULL \
    ); \
\
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( axpy, axpyv )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_axpy.h000066400000000000000000000040111427272030600221570ustar00rootroot00000000000000
/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
   IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
   OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
   ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT (axpy): prototype template for the `axpy` entry point. It
// declares a BLIS_EXPORT_BLAS-qualified void function named
// PASTEF77(ch,blasname) taking const pointers to n, alpha, x, incx and
// incy, and a non-const pointer to y.
//
// NOTE(review): INSERT_GENTPROT_BLAS is defined elsewhere; presumably it
// emits one prototype per supported datatype -- confirm.
//
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
             ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_copy.c000066400000000000000000000052511427272030600221520ustar00rootroot00000000000000
/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNC (copy): template for the BLAS-style `copy` entry point named
// PASTEF77(ch,blasname).
//
// Parameters of the generated function:
//   n       - pointer to the vector length.
//   x, incx - input vector and pointer to its increment (const is cast
//             away on x).
//   y, incy - vector handed to BLIS as a non-const pointer, and pointer to
//             its increment.
//
// The generated function initializes BLIS, converts *n and both
// (vector, increment) pairs, calls
// PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) with BLIS_NO_CONJUGATE as the
// first argument, and finalizes BLIS. It returns nothing.
//
// NOTE(review): INSERT_GENTFUNC_BLAS is defined elsewhere; presumably it
// instantiates GENTFUNC once per supported datatype -- confirm.
//
#undef GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* x, const f77_int* incx, \
             ftype* y, const f77_int* incy \
     ) \
{ \
    dim_t n0; \
    ftype* x0; \
    ftype* y0; \
    inc_t incx0; \
    inc_t incy0; \
\
    /* Initialize BLIS. */ \
    bli_init_auto(); \
\
    /* Convert/typecast negative values of n to zero. */ \
    bli_convert_blas_dim1( *n, n0 ); \
\
    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */ \
    bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
    bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
    /* Call BLIS interface. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      BLIS_NO_CONJUGATE, \
      n0, \
      x0, incx0, \
      y0, incy0, \
      NULL, \
      NULL \
    ); \
\
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( copy, copyv )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_copy.h000066400000000000000000000037521427272030600221630ustar00rootroot00000000000000
/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( copy ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_dot.c000066400000000000000000000131741427272030600217710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

#ifdef BLIS_ENABLE_BLAS

//
// Define BLAS-to-BLIS interfaces.
//
// GENTFUNCDOT (return-by-value form): template for a dot-product entry
// point named PASTEF772(ch,blasname,chc) that returns the result `rho` as
// an ftype value.
//
// The generated function initializes BLIS, converts *n and both
// (vector, increment) pairs, and calls
// PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) with `blis_conjx` as the first
// conjugation argument and BLIS_NO_CONJUGATE as the second. It then
// finalizes BLIS and returns `rho`. The const qualifiers on x and y are
// cast away before the BLIS call.
//
// NOTE(review): INSERT_GENTFUNCDOTR_BLAS / INSERT_GENTFUNCDOTC_BLAS are
// defined elsewhere; presumably they instantiate GENTFUNCDOT for the real
// and complex datatypes respectively -- confirm.
//
#undef GENTFUNCDOT
#define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
\
ftype PASTEF772(ch,blasname,chc) \
     ( \
       const f77_int* n, \
       const ftype* x, const f77_int* incx, \
       const ftype* y, const f77_int* incy \
     ) \
{ \
    dim_t n0; \
    ftype* x0; \
    ftype* y0; \
    inc_t incx0; \
    inc_t incy0; \
    ftype rho; \
\
    /* Initialize BLIS. */ \
    bli_init_auto(); \
\
    /* Convert/typecast negative values of n to zero. */ \
    bli_convert_blas_dim1( *n, n0 ); \
\
    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */ \
    bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
    bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
    /* Call BLIS interface. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      blis_conjx, \
      BLIS_NO_CONJUGATE, \
      n0, \
      x0, incx0, \
      y0, incy0, \
      &rho, \
      NULL, \
      NULL \
    ); \
\
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
\
    return rho; \
}

INSERT_GENTFUNCDOTR_BLAS( dot, dotv )

// When BLIS_DISABLE_COMPLEX_RETURN_INTEL is defined, the
// INSERT_GENTFUNCDOTC_BLAS instantiation reuses the return-by-value
// template above. Otherwise GENTFUNCDOT is redefined below so that the
// result is written through a leading `rhop` pointer parameter and the
// function itself returns void.
#ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL

INSERT_GENTFUNCDOTC_BLAS( dot, dotv )

#else // #ifdef BLIS_ENABLE_COMPLEX_RETURN_INTEL

// For the "intel" complex return type, use a hidden preceding parameter to
// return the result rather than an actual return value.
#undef GENTFUNCDOT
#define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
\
void PASTEF772(ch,blasname,chc) \
     ( \
       ftype* rhop, \
       const f77_int* n, \
       const ftype* x, const f77_int* incx, \
       const ftype* y, const f77_int* incy \
     ) \
{ \
    dim_t n0; \
    ftype* x0; \
    ftype* y0; \
    inc_t incx0; \
    inc_t incy0; \
    ftype rho; \
\
    /* Initialize BLIS. */ \
    bli_init_auto(); \
\
    /* Convert/typecast negative values of n to zero. */ \
    bli_convert_blas_dim1( *n, n0 ); \
\
    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */ \
    bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \
    bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \
\
    /* Call BLIS interface. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      blis_conjx, \
      BLIS_NO_CONJUGATE, \
      n0, \
      x0, incx0, \
      y0, incy0, \
      &rho, \
      NULL, \
      NULL \
    ); \
\
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
\
    *rhop = rho; \
}

INSERT_GENTFUNCDOTC_BLAS( dot, dotv )

#endif


// -- "Black sheep" dot product function definitions --

// Input vectors stored in single precision, computed in double precision,
// with result returned in single precision.
//
// Adds the scalar *sb (widened to double) to the double-precision result
// of PASTEF77(d,sdot) for the same n/x/incx/y/incy, then narrows the sum
// to float. All of the work, including the handling of n and the
// increments, is delegated to PASTEF77(d,sdot).
float PASTEF77(sd,sdot)
     (
       const f77_int* n,
       const float* sb,
       const float* x, const f77_int* incx,
       const float* y, const f77_int* incy
     )
{
    return ( float )
           (
             ( double )(*sb) +
             PASTEF77(d,sdot)
             (
               n,
               x, incx,
               y, incy
             )
           );
}

// Input vectors stored in single precision, computed in double precision,
// with result returned in double precision.
//
// Walks n0 elements of x and y using the converted pointers and
// increments, widening each float element to double before handing the
// pair to bli_ddots together with the accumulator `rho`, which starts at
// 0.0. Unlike the templated wrappers in this file, this function does not
// call bli_init_auto() / bli_finalize_auto().
//
// NOTE(review): bli_ddots is defined elsewhere; presumably it performs
// rho += chi1 * psi1 -- confirm.
double PASTEF77(d,sdot)
     (
       const f77_int* n,
       const float* x, const f77_int* incx,
       const float* y, const f77_int* incy
     )
{
    dim_t n0;
    float* x0;
    float* y0;
    inc_t incx0;
    inc_t incy0;
    double rho;
    dim_t i;

    /* Initialization of BLIS is not required. */

    /* Convert/typecast negative values of n to zero. */
    bli_convert_blas_dim1( *n, n0 );

    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */
    bli_convert_blas_incv( n0, (float*)x, *incx, x0, incx0 );
    bli_convert_blas_incv( n0, (float*)y, *incy, y0, incy0 );

    rho = 0.0;

    for ( i = 0; i < n0; i++ )
    {
        float* chi1 = x0 + (i )*incx0;
        float* psi1 = y0 + (i )*incy0;

        bli_ddots( (( double )(*chi1)),
                   (( double )(*psi1)), rho );
    }

    /* Finalization of BLIS is not required, because initialization was
       not required. */

    return rho;
}

#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_dot.h000066400000000000000000000057021427272030600217740ustar00rootroot00000000000000
/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTR_BLAS( dot ) #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL INSERT_GENTPROTDOTC_BLAS( dot ) #else // For the "intel" complex return type, we use a hidden parameter (passed by // address) to return the result. #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \ ( \ ftype* rhop, \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTC_BLAS( dot ) #endif // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS float PASTEF77(sd,sdot) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); BLIS_EXPORT_BLAS double PASTEF77(d,sdot) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_gemm.c000066400000000000000000000156771427272030600221420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019-2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ trans_t blis_transa; \ trans_t blis_transb; \ dim_t m0, n0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ transa, \ transb, \ m, \ n, \ k, \ lda, \ ldb, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \ \ /* Typecast BLAS integers to BLIS integers. 
*/ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *n, n0 ); \ bli_convert_blas_dim1( *k, k0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_transa, \ blis_transb, \ m0, \ n0, \ k0, \ (ftype*)alpha, \ (ftype*)a, rs_a, cs_a, \ (ftype*)b, rs_b, cs_b, \ (ftype*)beta, \ (ftype*)c, rs_c, cs_c, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #else #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ trans_t blis_transa; \ trans_t blis_transb; \ dim_t m0, n0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ transa, \ transb, \ m, \ n, \ k, \ lda, \ ldb, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *n, n0 ); \ bli_convert_blas_dim1( *k, k0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ /* Handle special cases of m == 1 or n == 1 via gemv. 
*/ \ if ( n0 == 1 ) \ { \ dim_t m0t, k0t; \ bli_set_dims_with_trans( blis_transa, m0, k0, &m0t, &k0t ); \ \ PASTEMAC2(ch,gemv,BLIS_TAPI_EX_SUF) \ ( \ blis_transa, \ bli_extract_conj( blis_transb ), \ m0t, k0t, \ ( ftype* )alpha, \ ( ftype* )a, rs_a, cs_a, \ ( ftype* )b, ( bli_does_notrans( blis_transb ) ? rs_b : cs_b ), \ ( ftype* )beta, \ c, rs_c, \ NULL, \ NULL \ ); \ return; \ } \ else if ( m0 == 1 ) \ { \ dim_t n0t, k0t; \ bli_set_dims_with_trans( blis_transb, n0, k0, &n0t, &k0t ); \ \ PASTEMAC2(ch,gemv,BLIS_TAPI_EX_SUF) \ ( \ blis_transb, \ bli_extract_conj( blis_transa ), \ n0t, k0t, \ ( ftype* )alpha, \ ( ftype* )b, cs_b, rs_b, \ ( ftype* )a, ( bli_does_notrans( blis_transa ) ? cs_a : rs_a ), \ ( ftype* )beta, \ c, cs_c, \ NULL, \ NULL \ ); \ return; \ } \ \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t m0_a, n0_a; \ dim_t m0_b, n0_b; \ \ bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \ bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); \ \ bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \ bli_obj_init_finish_1x1( dt, (ftype*)beta, &betao ); \ \ bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \ bli_obj_init_finish( dt, m0, n0, (ftype*)c, rs_c, cs_c, &co ); \ \ bli_obj_set_conjtrans( blis_transa, &ao ); \ bli_obj_set_conjtrans( blis_transb, &bo ); \ \ PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \ ( \ &alphao, \ &ao, \ &bo, \ &betao, \ &co, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. 
*/ \ bli_finalize_auto(); \ } #endif #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( gemm, gemm ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_gemm.h000066400000000000000000000043251427272030600221330ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemm ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_gemv.c000066400000000000000000000112641427272030600221370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2022, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"


//
// Define BLAS-to-BLIS interfaces.
//

// GENTFUNC( ftype, ch, blasname, blisname ) expands to one BLAS gemv entry
// point that forwards to the BLIS typed expert interface. The wrapper checks
// the BLAS arguments, maps transa and the dimensions to BLIS types, returns
// early (without touching y) when x is empty but y is not, converts the
// vector increments, and treats A as having unit row stride and column
// stride *lda.
#undef  GENTFUNC
#define GENTFUNC( ftype, ch, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    x, const f77_int* incx, \
       const ftype*    beta, \
             ftype*    y, const f77_int* incy  \
     ) \
{ \
    trans_t blis_transa; \
    dim_t   m0, n0; \
    dim_t   m_y, n_x; \
    ftype*  x0; \
    ftype*  y0; \
    inc_t   incx0; \
    inc_t   incy0; \
\
    /* Initialize BLIS. */ \
    bli_init_auto(); \
\
    /* Perform BLAS parameter checking. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), \
      MKSTR(blasname), \
      transa, \
      m, \
      n, \
      lda, \
      incx, \
      incy  \
    ); \
\
    /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
    bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
\
    /* Convert/typecast negative values of m and n to zero. */ \
    bli_convert_blas_dim1( *m, m0 ); \
    bli_convert_blas_dim1( *n, n0 ); \
\
    /* Determine the dimensions of x and y so we can adjust the increments,
       if necessary. */ \
    bli_set_dims_with_trans( blis_transa, m0, n0, &m_y, &n_x ); \
\
    /* BLAS handles cases where y has no elements as well as those where x has
       no elements. In the case of the former, it cannot do any work since
       the output vector is empty; but in the latter case, BLAS has peculiar
       semantics. When x has no elements (and transa(A) has no columns), BLAS
       returns immediately without performing any computation even if the
       number of elements of y (and rows of transa(A)) is non-zero, in which
       case any sane interpretation of gemv would have the operation reduce
       to y := beta * y. Here, we emulate the BLAS exactly so as to provide
       "bug-for-bug" compatibility. Note that this extreme level of
       compatibility would not be contemplated if it weren't for the fact
       that some BLAS unit tests actually check for this behavior. Also, it
       should be emphasized that BLIS, when called natively, does NOT exhibit
       this quirky behavior; it will scale y by beta as one would expect. */ \
    if ( m_y > 0 && n_x == 0 ) \
    { \
        /* Finalize BLIS. */ \
        bli_finalize_auto(); \
\
        return; \
    } \
\
    /* If the input increments are negative, adjust the pointers so we can
       use positive increments instead. */ \
    bli_convert_blas_incv( n_x, (ftype*)x, *incx, x0, incx0 ); \
    bli_convert_blas_incv( m_y, (ftype*)y, *incy, y0, incy0 ); \
\
    /* Set the row and column strides of A. */ \
    const inc_t rs_a = 1; \
    const inc_t cs_a = *lda; \
\
    /* Call BLIS interface. The untransposed dimensions m0 and n0 are passed
       together with blis_transa; x is never conjugated. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      blis_transa, \
      BLIS_NO_CONJUGATE, \
      m0, \
      n0, \
      (ftype*)alpha, \
      (ftype*)a,  rs_a, cs_a, \
      x0, incx0, \
      (ftype*)beta, \
      y0, incy0, \
      NULL, \
      NULL  \
    ); \
\
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNC_BLAS( gemv, gemv )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_gemv.h000066400000000000000000000042321427272030600221410ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/


//
// Prototype BLAS-to-BLIS interfaces.
//

// GENTPROT( ftype, ch, blasname ) expands to the exported prototype of one
// BLAS gemv routine. Every argument is passed by address; only y is
// non-const.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    x, const f77_int* incx, \
       const ftype*    beta, \
             ftype*    y, const f77_int* incy  \
     );

// The prototypes are only emitted when the BLAS compatibility layer is
// enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemv )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_ger.c000066400000000000000000000063101427272030600217520ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCDOT #define GENTFUNCDOT( ftype, ch, chc, blis_conjy, blasname, blisname ) \ \ void PASTEF772(ch,blasname,chc) \ ( \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ) \ { \ dim_t m0, n0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Initialize BLIS. 
*/ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ MKSTR(chc), \ m, \ n, \ incx, \ incy, \ lda \ ); \ \ /* Convert/typecast negative values of m and n to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *n, n0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \ bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ \ /* Set the row and column strides of A. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ blis_conjy, \ m0, \ n0, \ (ftype*)alpha, \ x0, incx0, \ y0, incy0, \ (ftype*)a, rs_a, cs_a, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCDOT_BLAS( ger, ger ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_ger.h000066400000000000000000000041521427272030600217610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/


//
// Prototype BLAS-to-BLIS interfaces.
//

// GENTPROTDOT( ftype, chxy, chc, blasname ) expands to the exported
// prototype of one BLAS ger-family routine; the symbol name is pasted from
// chxy, blasname and chc. Every argument is passed by address; only a is
// non-const.
#undef  GENTPROTDOT
#define GENTPROTDOT( ftype, chxy, chc, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \
     ( \
       const f77_int* m, \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
       const ftype*   y, const f77_int* incy, \
             ftype*   a, const f77_int* lda  \
     );

// The prototypes are only emitted when the BLAS compatibility layer is
// enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTDOT_BLAS( ger )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_hemm.c000066400000000000000000000140761427272030600221330ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"


//
// Define BLAS-to-BLIS interfaces.
//

#ifdef BLIS_BLAS3_CALLS_TAPI

// Variant 1: the BLAS hemm entry point forwards to the BLIS typed expert
// interface. side and uploa are mapped to BLIS enums, A is passed without
// conjugation and B without transposition, and all three matrices use unit
// row stride with the leading dimension as column stride.
#undef  GENTFUNCCO
#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc  \
     ) \
{ \
    side_t  blis_side; \
    uplo_t  blis_uploa; \
    dim_t   m0, n0; \
\
    /* Initialize BLIS. */ \
    bli_init_auto(); \
\
    /* Perform BLAS parameter checking. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), \
      MKSTR(blasname), \
      side, \
      uploa, \
      m, \
      n, \
      lda, \
      ldb, \
      ldc  \
    ); \
\
    /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
    bli_param_map_netlib_to_blis_side( *side,  &blis_side ); \
    bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \
\
    /* Typecast BLAS integers to BLIS integers. */ \
    bli_convert_blas_dim1( *m, m0 ); \
    bli_convert_blas_dim1( *n, n0 ); \
\
    /* Set the row and column strides of the matrix operands. */ \
    const inc_t rs_a = 1; \
    const inc_t cs_a = *lda; \
    const inc_t rs_b = 1; \
    const inc_t cs_b = *ldb; \
    const inc_t rs_c = 1; \
    const inc_t cs_c = *ldc; \
\
    /* Call BLIS interface. */ \
    PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
    ( \
      blis_side, \
      blis_uploa, \
      BLIS_NO_CONJUGATE, \
      BLIS_NO_TRANSPOSE, \
      m0, \
      n0, \
      (ftype*)alpha, \
      (ftype*)a, rs_a, cs_a, \
      (ftype*)b, rs_b, cs_b, \
      (ftype*)beta, \
      (ftype*)c, rs_c, cs_c, \
      NULL, \
      NULL  \
    ); \
\
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}

#else

// Variant 2: the BLAS hemm entry point forwards to the BLIS object API.
// The raw buffers are wrapped in obj_t descriptors: A is square with its
// dimension chosen by bli_set_dim_with_side and is tagged with the given
// uplo and BLIS_HERMITIAN structure; B and C are m0 x n0 (transb is fixed
// to BLIS_NO_TRANSPOSE).
#undef  GENTFUNCCO
#define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
\
void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc  \
     ) \
{ \
    side_t  blis_side; \
    uplo_t  blis_uploa; \
    dim_t   m0, n0; \
\
    /* Initialize BLIS. */ \
    bli_init_auto(); \
\
    /* Perform BLAS parameter checking. */ \
    PASTEBLACHK(blasname) \
    ( \
      MKSTR(ch), \
      MKSTR(blasname), \
      side, \
      uploa, \
      m, \
      n, \
      lda, \
      ldb, \
      ldc  \
    ); \
\
    /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
    bli_param_map_netlib_to_blis_side( *side,  &blis_side ); \
    bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \
\
    /* Typecast BLAS integers to BLIS integers. */ \
    bli_convert_blas_dim1( *m, m0 ); \
    bli_convert_blas_dim1( *n, n0 ); \
\
    /* Set the row and column strides of the matrix operands. */ \
    const inc_t rs_a = 1; \
    const inc_t cs_a = *lda; \
    const inc_t rs_b = 1; \
    const inc_t cs_b = *ldb; \
    const inc_t rs_c = 1; \
    const inc_t cs_c = *ldc; \
\
    const num_t dt     = PASTEMAC(ch,type); \
\
    /* Fixed properties of the operands for this operation. */ \
    const conj_t  conja  = BLIS_NO_CONJUGATE; \
    const trans_t transb = BLIS_NO_TRANSPOSE; \
    const struc_t struca = BLIS_HERMITIAN; \
\
    obj_t       alphao = BLIS_OBJECT_INITIALIZER_1X1; \
    obj_t       ao     = BLIS_OBJECT_INITIALIZER; \
    obj_t       bo     = BLIS_OBJECT_INITIALIZER; \
    obj_t       betao  = BLIS_OBJECT_INITIALIZER_1X1; \
    obj_t       co     = BLIS_OBJECT_INITIALIZER; \
\
    dim_t       mn0_a; \
    dim_t       m0_b, n0_b; \
\
    /* mn0_a is the (square) dimension of A, selected from m0/n0 by side. */ \
    bli_set_dim_with_side( blis_side, m0, n0, &mn0_a ); \
    bli_set_dims_with_trans( transb, m0, n0, &m0_b, &n0_b ); \
\
    bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \
    bli_obj_init_finish_1x1( dt, (ftype*)beta,  &betao  ); \
\
    bli_obj_init_finish( dt, mn0_a, mn0_a, (ftype*)a, rs_a, cs_a, &ao ); \
    bli_obj_init_finish( dt, m0_b,  n0_b,  (ftype*)b, rs_b, cs_b, &bo ); \
    bli_obj_init_finish( dt, m0,    n0,    (ftype*)c, rs_c, cs_c, &co ); \
\
    bli_obj_set_uplo( blis_uploa, &ao ); \
    bli_obj_set_conj( conja, &ao ); \
    bli_obj_set_conjtrans( transb, &bo ); \
\
    bli_obj_set_struc( struca, &ao ); \
\
    PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \
    ( \
      blis_side, \
      &alphao, \
      &ao, \
      &bo, \
      &betao, \
      &co, \
      NULL, \
      NULL  \
    ); \
\
    /* Finalize BLIS. */ \
    bli_finalize_auto(); \
}

#endif

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTFUNCCO_BLAS( hemm, hemm )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_hemm.h000066400000000000000000000043121427272030600221300ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/


//
// Prototype BLAS-to-BLIS interfaces.
//

// GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) expands to the exported
// prototype of one BLAS hemm routine. ftype_r and chr are accepted but not
// used by this prototype. Every argument is passed by address; only c is
// non-const.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc  \
     );

// The prototypes are only emitted when the BLAS compatibility layer is
// enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hemm )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_hemv.c000066400000000000000000000065601427272030600221430ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCCO #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ) \ { \ uplo_t blis_uploa; \ dim_t m0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Initialize BLIS. 
*/ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploa, \ m, \ lda, \ incx, \ incy \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Convert/typecast negative values of m to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \ bli_convert_blas_incv( m0, (ftype*)y, *incy, y0, incy0 ); \ \ /* Set the row and column strides of A. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_uploa, \ BLIS_NO_CONJUGATE, \ BLIS_NO_CONJUGATE, \ m0, \ (ftype*)alpha, \ (ftype*)a, rs_a, cs_a, \ x0, incx0, \ (ftype*)beta, \ y0, incy0, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCCO_BLAS( hemv, hemv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_hemv.h000066400000000000000000000042211427272030600221400ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_her.c000066400000000000000000000061741427272030600217630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCCO #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype_r* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ) \ { \ uplo_t blis_uploa; \ dim_t m0; \ ftype* x0; \ inc_t incx0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploa, \ m, \ incx, \ lda \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Convert/typecast negative values of m to zero. 
*/ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \ \ /* Set the row and column strides of A. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_uploa, \ BLIS_NO_CONJUGATE, \ m0, \ (ftype_r*)alpha, \ x0, incx0, \ (ftype*)a, rs_a, cs_a, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCCO_BLAS( her, her ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_her.h000066400000000000000000000041001427272030600217530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype_r* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_her2.c000066400000000000000000000064761427272030600220520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCCO #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ) \ { \ uplo_t blis_uploa; \ dim_t m0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploa, \ m, \ incx, \ incy, \ lda \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Convert/typecast negative values of m to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \ bli_convert_blas_incv( m0, (ftype*)y, *incy, y0, incy0 ); \ \ /* Set the row and column strides of A. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ \ /* Call BLIS interface. 
*/ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_uploa, \ BLIS_NO_CONJUGATE, \ BLIS_NO_CONJUGATE, \ m0, \ (ftype*)alpha, \ x0, incx0, \ y0, incy0, \ (ftype*)a, rs_a, cs_a, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCCO_BLAS( her2, her2 ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_her2.h000066400000000000000000000041621427272030600220450ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2 ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_her2k.c000066400000000000000000000160031427272030600222100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNCCO #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ uplo_t blis_uploc; \ trans_t blis_transa; \ dim_t m0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploc, \ transa, \ m, \ k, \ lda, \ ldb, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ \ /* Typecast BLAS integers to BLIS integers. 
*/ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *k, k0 ); \ \ /* We emulate the BLAS early return behavior with the following conditional, which returns if one of the following is true: - matrix C is empty - the rank-2k product is empty (either because alpha is zero or k is zero) AND matrix C is not scaled. */ \ if ( m0 == 0 || \ ( ( PASTEMAC(ch,eq0)( *alpha ) || k0 == 0 ) \ && PASTEMAC(chr,eq1)( *beta ) \ ) \ ) \ { \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ \ return; \ } \ \ /* Set the row and column strides of the matrix operands. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_uploc, \ blis_transa, \ blis_transa, \ m0, \ k0, \ (ftype*)alpha, \ (ftype*)a, rs_a, cs_a, \ (ftype*)b, rs_b, cs_b, \ (ftype_r*)beta, \ (ftype*)c, rs_c, cs_c, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #else #undef GENTFUNCCO #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ uplo_t blis_uploc; \ trans_t blis_transa; \ dim_t m0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploc, \ transa, \ m, \ k, \ lda, \ ldb, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ \ /* Typecast BLAS integers to BLIS integers. 
*/ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *k, k0 ); \ \ /* We emulate the BLAS early return behavior with the following conditional, which returns if one of the following is true: - matrix C is empty - the rank-2k product is empty (either because alpha is zero or k is zero) AND matrix C is not scaled. */ \ if ( m0 == 0 || \ ( ( PASTEMAC(ch,eq0)( *alpha ) || k0 == 0 ) \ && PASTEMAC(chr,eq1)( *beta ) \ ) \ ) \ { \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ \ return; \ } \ \ /* Set the row and column strides of the matrix operands. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ const num_t dt_r = PASTEMAC(chr,type); \ const num_t dt = PASTEMAC(ch,type); \ \ const trans_t transb = blis_transa; \ const struc_t strucc = BLIS_HERMITIAN; \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t m0_a, n0_a; \ dim_t m0_b, n0_b; \ \ bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \ bli_set_dims_with_trans( transb, m0, k0, &m0_b, &n0_b ); \ \ bli_obj_init_finish_1x1( dt, (ftype* )alpha, &alphao ); \ bli_obj_init_finish_1x1( dt_r, (ftype_r*)beta, &betao ); \ \ bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \ bli_obj_init_finish( dt, m0, m0, (ftype*)c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( blis_uploc, &co ); \ bli_obj_set_conjtrans( blis_transa, &ao ); \ bli_obj_set_conjtrans( transb, &bo ); \ \ bli_obj_set_struc( strucc, &co ); \ \ PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \ ( \ &alphao, \ &ao, \ &bo, \ &betao, \ &co, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. 
*/ \ bli_finalize_auto(); \ } #endif #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCCO_BLAS( her2k, her2k ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_her2k.h000066400000000000000000000043151427272030600222200ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2k ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_herk.c000066400000000000000000000147121427272030600221330ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNCCO #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype_r* alpha, \ const ftype* a, const f77_int* lda, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ uplo_t blis_uploc; \ trans_t blis_transa; \ dim_t m0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploc, \ transa, \ m, \ k, \ lda, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *k, k0 ); \ \ /* We emulate the BLAS early return behavior with the following conditional, which returns if one of the following is true: - matrix C is empty - the rank-k product is empty (either because alpha is zero or k is zero) AND matrix C is not scaled. */ \ if ( m0 == 0 || \ ( ( PASTEMAC(chr,eq0)( *alpha ) || k0 == 0 ) \ && PASTEMAC(chr,eq1)( *beta ) \ ) \ ) \ { \ /* Finalize BLIS. 
*/ \ bli_finalize_auto(); \ \ return; \ } \ \ /* Set the row and column strides of the matrix operands. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_uploc, \ blis_transa, \ m0, \ k0, \ (ftype_r*)alpha, \ (ftype*)a, rs_a, cs_a, \ (ftype_r*)beta, \ (ftype*)c, rs_c, cs_c, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #else #undef GENTFUNCCO #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype_r* alpha, \ const ftype* a, const f77_int* lda, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ uplo_t blis_uploc; \ trans_t blis_transa; \ dim_t m0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploc, \ transa, \ m, \ k, \ lda, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *k, k0 ); \ \ /* We emulate the BLAS early return behavior with the following conditional, which returns if one of the following is true: - matrix C is empty - the rank-k product is empty (either because alpha is zero or k is zero) AND matrix C is not scaled. */ \ if ( m0 == 0 || \ ( ( PASTEMAC(chr,eq0)( *alpha ) || k0 == 0 ) \ && PASTEMAC(chr,eq1)( *beta ) \ ) \ ) \ { \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ \ return; \ } \ \ /* Set the row and column strides of the matrix operands. 
*/ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ const num_t dt_r = PASTEMAC(chr,type); \ const num_t dt = PASTEMAC(ch,type); \ \ const struc_t strucc = BLIS_HERMITIAN; \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t m0_a, n0_a; \ \ bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \ \ bli_obj_init_finish_1x1( dt_r, (ftype_r*)alpha, &alphao ); \ bli_obj_init_finish_1x1( dt_r, (ftype_r*)beta, &betao ); \ \ bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m0, m0, (ftype*)c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( blis_uploc, &co ); \ bli_obj_set_conjtrans( blis_transa, &ao ); \ \ bli_obj_set_struc( strucc, &co ); \ \ PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \ ( \ &alphao, \ &ao, \ &betao, \ &co, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #endif #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCCO_BLAS( herk, herk ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_herk.h000066400000000000000000000042341427272030600221360ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype_r* alpha, \ const ftype* a, const f77_int* lda, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( herk ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_nrm2.c000066400000000000000000000051171427272030600220570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCR2 #define GENTFUNCR2( ftype_x, ftype_r, chx, chr, blasname, blisname ) \ \ ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ) \ { \ dim_t n0; \ ftype_x* x0; \ inc_t incx0; \ ftype_r norm; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Convert/typecast negative values of n to zero. */ \ bli_convert_blas_dim1( *n, n0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \ \ /* Call BLIS interface. */ \ PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \ ( \ n0, \ x0, incx0, \ &norm, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. 
*/ \ bli_finalize_auto(); \ \ return norm; \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCR2_BLAS( nrm2, normfv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_nrm2.h000066400000000000000000000037331427272030600220660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_scal.c000066400000000000000000000057041427272030600221250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCSCAL #define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \ \ void PASTEF772(chx,cha,blasname) \ ( \ const f77_int* n, \ const ftype_a* alpha, \ ftype_x* x, const f77_int* incx \ ) \ { \ dim_t n0; \ ftype_x* x0; \ inc_t incx0; \ ftype_x alpha_cast; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Convert/typecast negative values of n to zero. */ \ bli_convert_blas_dim1( *n, n0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( n0, (ftype_x*)x, *incx, x0, incx0 ); \ \ /* NOTE: We do not natively implement BLAS's csscal/zdscal in BLIS. that is, we just always sub-optimally implement those cases by casting alpha to ctype_x (potentially the complex domain) and using the homogeneous datatype instance according to that type. */ \ PASTEMAC2(cha,chx,copys)( *alpha, alpha_cast ); \ \ /* Call BLIS interface. */ \ PASTEMAC2(chx,blisname,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ n0, \ &alpha_cast, \ x0, incx0, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCSCAL_BLAS( scal, scalv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_scal.h000066400000000000000000000037671427272030600221410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTSCAL #define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \ ( \ const f77_int* n, \ const ftype_a* alpha, \ ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTSCAL_BLAS( scal ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_swap.c000066400000000000000000000052051427272030600221510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ) \ { \ dim_t n0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Convert/typecast negative values of n to zero. */ \ bli_convert_blas_dim1( *n, n0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \ bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ \ /* Call BLIS interface. 
*/ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ n0, \ x0, incx0, \ y0, incy0, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( swap, swapv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_swap.h000066400000000000000000000037361427272030600221650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( swap ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_symm.c000066400000000000000000000140301427272030600221600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ side_t blis_side; \ uplo_t blis_uploa; \ dim_t m0, n0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ side, \ uploa, \ m, \ n, \ lda, \ ldb, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_side( *side, &blis_side ); \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *n, n0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ /* Call BLIS interface. 
*/ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_side, \ blis_uploa, \ BLIS_NO_CONJUGATE, \ BLIS_NO_TRANSPOSE, \ m0, \ n0, \ (ftype*)alpha, \ (ftype*)a, rs_a, cs_a, \ (ftype*)b, rs_b, cs_b, \ (ftype*)beta, \ (ftype*)c, rs_c, cs_c, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #else #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ side_t blis_side; \ uplo_t blis_uploa; \ dim_t m0, n0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ side, \ uploa, \ m, \ n, \ lda, \ ldb, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_side( *side, &blis_side ); \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *n, n0 ); \ \ /* Set the row and column strides of the matrix operands. 
*/ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ const num_t dt = PASTEMAC(ch,type); \ \ const conj_t conja = BLIS_NO_CONJUGATE; \ const trans_t transb = BLIS_NO_TRANSPOSE; \ const struc_t struca = BLIS_SYMMETRIC; \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t mn0_a; \ dim_t m0_b, n0_b; \ \ bli_set_dim_with_side( blis_side, m0, n0, &mn0_a ); \ bli_set_dims_with_trans( transb, m0, n0, &m0_b, &n0_b ); \ \ bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \ bli_obj_init_finish_1x1( dt, (ftype*)beta, &betao ); \ \ bli_obj_init_finish( dt, mn0_a, mn0_a, (ftype*)a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \ bli_obj_init_finish( dt, m0, n0, (ftype*)c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( blis_uploa, &ao ); \ bli_obj_set_conj( conja, &ao ); \ bli_obj_set_conjtrans( transb, &bo ); \ \ bli_obj_set_struc( struca, &ao ); \ \ PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \ ( \ blis_side, \ &alphao, \ &ao, \ &bo, \ &betao, \ &co, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #endif #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( symm, symm ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_symm.h000066400000000000000000000042661427272030600221770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( symm ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_symv.c000066400000000000000000000065421427272030600222020ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCRO #define GENTFUNCRO( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ) \ { \ uplo_t blis_uploa; \ dim_t m0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Initialize BLIS. 
*/ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploa, \ m, \ lda, \ incx, \ incy \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Convert/typecast negative values of m to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \ bli_convert_blas_incv( m0, (ftype*)y, *incy, y0, incy0 ); \ \ /* Set the row and column strides of A. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_uploa, \ BLIS_NO_CONJUGATE, \ BLIS_NO_CONJUGATE, \ m0, \ (ftype*)alpha, \ (ftype*)a, rs_a, cs_a, \ x0, incx0, \ (ftype*)beta, \ y0, incy0, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCRO_BLAS( symv, symv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_symv.h000066400000000000000000000042031427272030600221770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( symv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_syr.c000066400000000000000000000061541427272030600220200ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCRO #define GENTFUNCRO( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ) \ { \ uplo_t blis_uploa; \ dim_t m0; \ ftype* x0; \ inc_t incx0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploa, \ m, \ incx, \ lda \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Convert/typecast negative values of m to zero. 
*/ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \ \ /* Set the row and column strides of A. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_uploa, \ BLIS_NO_CONJUGATE, \ m0, \ (ftype*)alpha, \ x0, incx0, \ (ftype*)a, rs_a, cs_a, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCRO_BLAS( syr, syr ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_syr.h000066400000000000000000000040621427272030600220210ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_syr2.c000066400000000000000000000064621427272030600221040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNCRO #define GENTFUNCRO( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ) \ { \ uplo_t blis_uploa; \ dim_t m0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploa, \ m, \ incx, \ incy, \ lda \ ); \ \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ \ /* Convert/typecast negative values of m to zero. */ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \ bli_convert_blas_incv( m0, (ftype*)y, *incy, y0, incy0 ); \ \ /* Set the row and column strides of A. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ \ /* Call BLIS interface. 
*/ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_uploa, \ BLIS_NO_CONJUGATE, \ BLIS_NO_CONJUGATE, \ m0, \ (ftype*)alpha, \ x0, incx0, \ y0, incy0, \ (ftype*)a, rs_a, cs_a, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCRO_BLAS( syr2, syr2 ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_syr2.h000066400000000000000000000041441427272030600221040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

//
// Prototype BLAS-to-BLIS interfaces.
//

/*
   GENTPROTRO( ftype, ch, blasname )

   Expands to the prototype of the BLAS-compatible syr2 entry point: a void
   function named PASTEF77(ch,blasname), marked BLIS_EXPORT_BLAS, whose
   arguments are all passed by pointer. Only the matrix a is non-const.

   The parameter list mirrors the definition generated by GENTFUNCRO in
   bla_syr2.c (uploa, m, alpha, x/incx, y/incy, a/lda).

   Prototypes are emitted through INSERT_GENTPROTRO_BLAS( syr2 ) and only
   when BLIS_ENABLE_BLAS is defined. The set of type characters is chosen by
   that macro, which is not visible here.
*/
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int* m, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       const ftype* y, const f77_int* incy, \
       ftype* a, const f77_int* lda \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( syr2 )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_syr2k.c000066400000000000000000000152171427272030600222550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ uplo_t blis_uploc; \ trans_t blis_transa; \ dim_t m0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploc, \ transa, \ m, \ k, \ lda, \ ldb, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ \ /* The real domain ssyr2k and dsyr2k in netlib BLAS treat a trans value of 'C' (conjugate-transpose) as 'T' (transpose only). So, we have to go out of our way a little to support this behavior. 
*/ \ if ( bli_is_real( PASTEMAC(ch,type) ) && \ bli_is_conjtrans( blis_transa ) ) \ { \ blis_transa = BLIS_TRANSPOSE; \ } \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *k, k0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_uploc, \ blis_transa, \ blis_transa, \ m0, \ k0, \ (ftype*)alpha, \ (ftype*)a, rs_a, cs_a, \ (ftype*)b, rs_b, cs_b, \ (ftype*)beta, \ (ftype*)c, rs_c, cs_c, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #else #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ uplo_t blis_uploc; \ trans_t blis_transa; \ dim_t m0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploc, \ transa, \ m, \ k, \ lda, \ ldb, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ \ /* The real domain ssyr2k and dsyr2k in netlib BLAS treat a trans value of 'C' (conjugate-transpose) as 'T' (transpose only). So, we have to go out of our way a little to support this behavior. */ \ if ( bli_is_real( PASTEMAC(ch,type) ) && \ bli_is_conjtrans( blis_transa ) ) \ { \ blis_transa = BLIS_TRANSPOSE; \ } \ \ /* Typecast BLAS integers to BLIS integers. 
*/ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *k, k0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ const num_t dt = PASTEMAC(ch,type); \ \ const trans_t transb = blis_transa; \ const struc_t strucc = BLIS_SYMMETRIC; \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t m0_a, n0_a; \ dim_t m0_b, n0_b; \ \ bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \ bli_set_dims_with_trans( transb, m0, k0, &m0_b, &n0_b ); \ \ bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \ bli_obj_init_finish_1x1( dt, (ftype*)beta, &betao ); \ \ bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \ bli_obj_init_finish( dt, m0, m0, (ftype*)c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( blis_uploc, &co ); \ bli_obj_set_conjtrans( blis_transa, &ao ); \ bli_obj_set_conjtrans( transb, &bo ); \ \ bli_obj_set_struc( strucc, &co ); \ \ PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \ ( \ &alphao, \ &ao, \ &bo, \ &betao, \ &co, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #endif #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( syr2k, syr2k ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_syr2k.h000066400000000000000000000042711427272030600222600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syr2k ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_syrk.c000066400000000000000000000141201427272030600221630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ uplo_t blis_uploc; \ trans_t blis_transa; \ dim_t m0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploc, \ transa, \ m, \ k, \ lda, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ \ /* The real domain ssyrk and dsyrk in netlib BLAS treat a trans value of 'C' (conjugate-transpose) as 'T' (transpose only). So, we have to go out of our way a little to support this behavior. */ \ if ( bli_is_real( PASTEMAC(ch,type) ) && \ bli_is_conjtrans( blis_transa ) ) \ { \ blis_transa = BLIS_TRANSPOSE; \ } \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *k, k0 ); \ \ /* Set the row and column strides of the matrix operands. 
*/ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_uploc, \ blis_transa, \ m0, \ k0, \ (ftype*)alpha, \ (ftype*)a, rs_a, cs_a, \ (ftype*)beta, \ (ftype*)c, rs_c, cs_c, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #else #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ uplo_t blis_uploc; \ trans_t blis_transa; \ dim_t m0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploc, \ transa, \ m, \ k, \ lda, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ \ /* The real domain ssyrk and dsyrk in netlib BLAS treat a trans value of 'C' (conjugate-transpose) as 'T' (transpose only). So, we have to go out of our way a little to support this behavior. */ \ if ( bli_is_real( PASTEMAC(ch,type) ) && \ bli_is_conjtrans( blis_transa ) ) \ { \ blis_transa = BLIS_TRANSPOSE; \ } \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *k, k0 ); \ \ /* Set the row and column strides of the matrix operands. 
*/ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ const num_t dt = PASTEMAC(ch,type); \ \ const struc_t strucc = BLIS_SYMMETRIC; \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t m0_a, n0_a; \ \ bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \ \ bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \ bli_obj_init_finish_1x1( dt, (ftype*)beta, &betao ); \ \ bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m0, m0, (ftype*)c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( blis_uploc, &co ); \ bli_obj_set_conjtrans( blis_transa, &ao ); \ \ bli_obj_set_struc( strucc, &co ); \ \ PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \ ( \ &alphao, \ &ao, \ &betao, \ &co, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #endif #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( syrk, syrk ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_syrk.h000066400000000000000000000042101427272030600221670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

//
// Prototype BLAS-to-BLIS interfaces.
//

/*
   GENTPROT( ftype, ch, blasname )

   Expands to the prototype of the BLAS-compatible syrk entry point: a void
   function named PASTEF77(ch,blasname), marked BLIS_EXPORT_BLAS, whose
   arguments are all passed by pointer. Only the matrix c is non-const.

   The parameter list mirrors the definition generated by GENTFUNC in
   bla_syrk.c (uploc, transa, m, k, alpha, a/lda, beta, c/ldc).

   Prototypes are emitted through INSERT_GENTPROT_BLAS( syrk ) and only when
   BLIS_ENABLE_BLAS is defined. The set of type characters is chosen by that
   macro, which is not visible here.
*/
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int* m, \
       const f77_int* k, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* beta, \
       ftype* c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( syrk )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_trmm.c000066400000000000000000000134461427272030600221640ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ ftype* b, const f77_int* ldb \ ) \ { \ side_t blis_side; \ uplo_t blis_uploa; \ trans_t blis_transa; \ diag_t blis_diaga; \ dim_t m0, n0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ side, \ uploa, \ transa, \ diaga, \ m, \ n, \ lda, \ ldb \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. 
*/ \ bli_param_map_netlib_to_blis_side( *side, &blis_side ); \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *n, n0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_side, \ blis_uploa, \ blis_transa, \ blis_diaga, \ m0, \ n0, \ (ftype*)alpha, \ (ftype*)a, rs_a, cs_a, \ (ftype*)b, rs_b, cs_b, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #else #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ ftype* b, const f77_int* ldb \ ) \ { \ side_t blis_side; \ uplo_t blis_uploa; \ trans_t blis_transa; \ diag_t blis_diaga; \ dim_t m0, n0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ side, \ uploa, \ transa, \ diaga, \ m, \ n, \ lda, \ ldb \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_side( *side, &blis_side ); \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *n, n0 ); \ \ /* Set the row and column strides of the matrix operands. 
*/ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ \ const num_t dt = PASTEMAC(ch,type); \ \ const struc_t struca = BLIS_TRIANGULAR; \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ \ dim_t mn0_a; \ \ bli_set_dim_with_side( blis_side, m0, n0, &mn0_a ); \ \ bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \ \ bli_obj_init_finish( dt, mn0_a, mn0_a, (ftype*)a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m0, n0, (ftype*)b, rs_b, cs_b, &bo ); \ \ bli_obj_set_uplo( blis_uploa, &ao ); \ bli_obj_set_diag( blis_diaga, &ao ); \ bli_obj_set_conjtrans( blis_transa, &ao ); \ \ bli_obj_set_struc( struca, &ao ); \ \ PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \ ( \ blis_side, \ &alphao, \ &ao, \ &bo, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #endif #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( trmm, trmm ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_trmm.h000066400000000000000000000042501427272030600221620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

//
// Prototype BLAS-to-BLIS interfaces.
//

/*
   GENTPROT( ftype, ch, blasname )

   Expands to the prototype of the BLAS-compatible trmm entry point: a void
   function named PASTEF77(ch,blasname), marked BLIS_EXPORT_BLAS, whose
   arguments are all passed by pointer. Only the matrix b is non-const.

   The parameter list mirrors the definition generated by GENTFUNC in
   bla_trmm.c (side, uploa, transa, diaga, m, n, alpha, a/lda, b/ldb).

   Prototypes are emitted through INSERT_GENTPROT_BLAS( trmm ) and only when
   BLIS_ENABLE_BLAS is defined. The set of type characters is chosen by that
   macro, which is not visible here.
*/
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int* m, \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       ftype* b, const f77_int* ldb \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmm )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_trmv.c000066400000000000000000000067031427272030600221730ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ) \ { \ uplo_t blis_uploa; \ trans_t blis_transa; \ diag_t blis_diaga; \ dim_t m0; \ ftype* x0; \ inc_t incx0; \ ftype* one_p; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploa, \ transa, \ diaga, \ m, \ lda, \ incx \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); \ \ /* Convert/typecast negative values of m to zero. 
*/ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \ \ /* Set the row and column strides of A. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ \ /* Acquire a pointer to the global scalar constant BLIS_ONE. */ \ one_p = PASTEMAC(ch,1); \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_uploa, \ blis_transa, \ blis_diaga, \ m0, \ one_p, \ (ftype*)a, rs_a, cs_a, \ x0, incx0, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( trmv, trmv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_trmv.h000066400000000000000000000041161427272030600221740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

//
// Prototype BLAS-to-BLIS interfaces.
//

/*
   GENTPROT( ftype, ch, blasname )

   Expands to the prototype of the BLAS-compatible trmv entry point: a void
   function named PASTEF77(ch,blasname), marked BLIS_EXPORT_BLAS, whose
   arguments are all passed by pointer. Only the vector x is non-const.

   The parameter list mirrors the definition generated by GENTFUNC in
   bla_trmv.c (uploa, transa, diaga, m, a/lda, x/incx).

   Prototypes are emitted through INSERT_GENTPROT_BLAS( trmv ) and only when
   BLIS_ENABLE_BLAS is defined. The set of type characters is chosen by that
   macro, which is not visible here.
*/
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int* m, \
       const ftype* a, const f77_int* lda, \
       ftype* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmv )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/bla_trsm.c000066400000000000000000000134461427272030600221720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ ftype* b, const f77_int* ldb \ ) \ { \ side_t blis_side; \ uplo_t blis_uploa; \ trans_t blis_transa; \ diag_t blis_diaga; \ dim_t m0, n0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ side, \ uploa, \ transa, \ diaga, \ m, \ n, \ lda, \ ldb \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_side( *side, &blis_side ); \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); \ \ /* Typecast BLAS integers to BLIS integers. 
*/ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *n, n0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_side, \ blis_uploa, \ blis_transa, \ blis_diaga, \ m0, \ n0, \ (ftype*)alpha, \ (ftype*)a, rs_a, cs_a, \ (ftype*)b, rs_b, cs_b, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #else #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ ftype* b, const f77_int* ldb \ ) \ { \ side_t blis_side; \ uplo_t blis_uploa; \ trans_t blis_transa; \ diag_t blis_diaga; \ dim_t m0, n0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ side, \ uploa, \ transa, \ diaga, \ m, \ n, \ lda, \ ldb \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_side( *side, &blis_side ); \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *n, n0 ); \ \ /* Set the row and column strides of the matrix operands. 
*/ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ \ const num_t dt = PASTEMAC(ch,type); \ \ const struc_t struca = BLIS_TRIANGULAR; \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ \ dim_t mn0_a; \ \ bli_set_dim_with_side( blis_side, m0, n0, &mn0_a ); \ \ bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \ \ bli_obj_init_finish( dt, mn0_a, mn0_a, (ftype*)a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m0, n0, (ftype*)b, rs_b, cs_b, &bo ); \ \ bli_obj_set_uplo( blis_uploa, &ao ); \ bli_obj_set_diag( blis_diaga, &ao ); \ bli_obj_set_conjtrans( blis_transa, &ao ); \ \ bli_obj_set_struc( struca, &ao ); \ \ PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \ ( \ blis_side, \ &alphao, \ &ao, \ &bo, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #endif #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( trsm, trsm ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_trsm.h000066400000000000000000000042501427272030600221700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ ftype* b, const f77_int* ldb \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trsm ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_trsv.c000066400000000000000000000067031427272030600222010ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ) \ { \ uplo_t blis_uploa; \ trans_t blis_transa; \ diag_t blis_diaga; \ dim_t m0; \ ftype* x0; \ inc_t incx0; \ ftype* one_p; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploa, \ transa, \ diaga, \ m, \ lda, \ incx \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploa, &blis_uploa ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ bli_param_map_netlib_to_blis_diag( *diaga, &blis_diaga ); \ \ /* Convert/typecast negative values of m to zero. 
*/ \ bli_convert_blas_dim1( *m, m0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( m0, (ftype*)x, *incx, x0, incx0 ); \ \ /* Set the row and column strides of A. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ \ /* Acquire a pointer to the global scalar constant BLIS_ONE. */ \ one_p = PASTEMAC(ch,1); \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_uploa, \ blis_transa, \ blis_diaga, \ m0, \ one_p, \ (ftype*)a, rs_a, cs_a, \ x0, incx0, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( trsv, trsv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bla_trsv.h000066400000000000000000000041161427272030600222020ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trsv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/bli_blas.h000066400000000000000000000126431427272030600221410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // If the CBLAS compatibility layer was enabled while the BLAS layer // was not enabled, we must enable it here. #ifdef BLIS_ENABLE_CBLAS #ifndef BLIS_ENABLE_BLAS #define BLIS_ENABLE_BLAS #endif #endif // BLIS_ENABLE_CBLAS // By default, if the BLAS compatibility layer is enabled, we define // (include) all of the BLAS prototypes. However, if the user is // #including "blis.h" and also #including another header that also // declares the BLAS functions, then we provide an opportunity to // #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below). #ifdef BLIS_ENABLE_BLAS #define BLIS_ENABLE_BLAS_DEFS #else #undef BLIS_ENABLE_BLAS_DEFS #endif // Skip prototyping all of the BLAS if the BLAS test drivers are being // compiled. #ifdef BLIS_VIA_BLASTEST #undef BLIS_ENABLE_BLAS_DEFS #endif // Skip prototyping all of the BLAS if the environment has defined the // macro BLIS_DISABLE_BLAS_DEFS. #ifdef BLIS_DISABLE_BLAS_DEFS #undef BLIS_ENABLE_BLAS_DEFS #endif // Begin including all BLAS prototypes. 
#ifdef BLIS_ENABLE_BLAS_DEFS // -- System headers needed by BLAS compatibility layer -- #include // for toupper(), used in xerbla() // -- Constants -- #define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1) // -- Utility macros -- #include "bla_r_sign.h" #include "bla_d_sign.h" #include "bla_r_cnjg.h" #include "bla_d_cnjg.h" #include "bla_r_imag.h" #include "bla_d_imag.h" #include "bla_c_div.h" #include "bla_z_div.h" #include "bla_f__cabs.h" // needed by c_abs, z_abs #include "bla_r_abs.h" #include "bla_d_abs.h" #include "bla_c_abs.h" #include "bla_z_abs.h" #include "bla_lsame.h" #include "bla_xerbla.h" #include "bla_xerbla_array.h" // -- Level-0 BLAS prototypes -- #include "bla_cabs1.h" // -- Level-1 BLAS prototypes -- #include "bla_amax.h" #include "bla_asum.h" #include "bla_axpy.h" #include "bla_copy.h" #include "bla_dot.h" #include "bla_nrm2.h" #include "bla_rot.h" #include "bla_rotg.h" #include "bla_rotm.h" #include "bla_rotmg.h" #include "bla_scal.h" #include "bla_swap.h" #include "f77_amax_sub.h" #include "f77_asum_sub.h" #include "f77_dot_sub.h" #include "f77_nrm2_sub.h" // -- Level-2 BLAS prototypes -- // dense #include "bla_gemv.h" #include "bla_ger.h" #include "bla_hemv.h" #include "bla_her.h" #include "bla_her2.h" #include "bla_symv.h" #include "bla_syr.h" #include "bla_syr2.h" #include "bla_trmv.h" #include "bla_trsv.h" #include "bla_gemv_check.h" #include "bla_ger_check.h" #include "bla_hemv_check.h" #include "bla_her_check.h" #include "bla_her2_check.h" #include "bla_symv_check.h" #include "bla_syr_check.h" #include "bla_syr2_check.h" #include "bla_trmv_check.h" #include "bla_trsv_check.h" // packed #include "bla_hpmv.h" #include "bla_hpr.h" #include "bla_hpr2.h" #include "bla_spmv.h" #include "bla_spr.h" #include "bla_spr2.h" #include "bla_tpmv.h" #include "bla_tpsv.h" // banded #include "bla_gbmv.h" #include "bla_hbmv.h" #include "bla_sbmv.h" #include "bla_tbmv.h" #include "bla_tbsv.h" // -- Level-3 BLAS prototypes -- #include "bla_gemm.h" #include 
"bla_hemm.h" #include "bla_herk.h" #include "bla_her2k.h" #include "bla_symm.h" #include "bla_syrk.h" #include "bla_syr2k.h" #include "bla_trmm.h" #include "bla_trsm.h" #include "bla_gemm_check.h" #include "bla_hemm_check.h" #include "bla_herk_check.h" #include "bla_her2k_check.h" #include "bla_symm_check.h" #include "bla_syrk_check.h" #include "bla_syr2k_check.h" #include "bla_trmm_check.h" #include "bla_trsm_check.h" // -- BLAS extension prototypes -- // unique to BLIS #include "bla_axpby.h" // level-3 #include "bla_gemmt.h" #include "bla_gemmt_check.h" // batch #include "bla_gemm_batch.h" // 3m #include "bla_gemm3m.h" #include "bla_gemm3m_check.h" // -- Fortran-compatible APIs to BLIS functions -- #include "b77_thread.h" #endif // BLIS_ENABLE_BLAS cython-blis-0.9.1/blis/_src/frame/compat/blis/000077500000000000000000000000001427272030600211445ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/compat/blis/thread/000077500000000000000000000000001427272030600224135ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/compat/blis/thread/b77_thread.c000066400000000000000000000053021427272030600245050ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define Fortran-compatible BLIS interfaces. // void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ) { dim_t jc0 = *jc; dim_t pc0 = *pc; dim_t ic0 = *ic; dim_t jr0 = *jr; dim_t ir0 = *ir; // Initialize BLIS. bli_init_auto(); // Convert/typecast negative values to zero. //bli_convert_blas_dim1( *jc, jc0 ); //bli_convert_blas_dim1( *pc, pc0 ); //bli_convert_blas_dim1( *ic, ic0 ); //bli_convert_blas_dim1( *jr, jr0 ); //bli_convert_blas_dim1( *ir, ir0 ); // Call the BLIS function. bli_thread_set_ways( jc0, pc0, ic0, jr0, ir0 ); // Finalize BLIS. bli_finalize_auto(); } void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ) { dim_t nt0 = *nt; // Initialize BLIS. bli_init_auto(); // Convert/typecast negative values to zero. //bli_convert_blas_dim1( *nt, nt0 ); // Call the BLIS function. bli_thread_set_num_threads( nt0 ); // Finalize BLIS. bli_finalize_auto(); } cython-blis-0.9.1/blis/_src/frame/compat/blis/thread/b77_thread.h000066400000000000000000000037551427272030600245240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype Fortran-compatible BLIS interfaces. 
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); cython-blis-0.9.1/blis/_src/frame/compat/cblas/000077500000000000000000000000001427272030600212775ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/compat/cblas/bli_cblas.h000066400000000000000000000041551427272030600233670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. #include "cblas.h" #endif // BLIS_ENABLE_CBLAS #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/cblas.tgz000066400000000000000000006015341427272030600231220ustar00rootroot00000000000000 8M}80N>  y>BwMH 9$L_JmٖeR*JrU%ӫ\.yxH?f}ƁqhJ吔 88 9,v)hh)A{s>sM:kydͲ!_9::x0z2ll?hx ?Y'~8R qXͿQ)Ua+j rJ/ 8yEJ-.Cxk?Jp dk=Gejo҇ϗp@̉ LԔ̾Sgڜ32>["f/O 6lɗhrg!$dl ٬?Ff?#O}U  P=A'|8DfO{3,3twv#HN5hB3|:)GM=d<+%4ӼwxB `F? 
d843Z}0EhpH(Q?o<>t8J?MlݹO3L`4|yZk|'cWq"a#vJk F%NV\vfy4P8l5zĨkU*ʪH[]7TCaJqYI eCP[O2S23*j(*REV#-֓"BZEi\Hu!u3J ]5+6Cɘ>QP`ܙӌHkFS0tƩ,Eh$A2lj8جf%\S4Xhw/c^̭8}` D0'ǦZ`dvr҄6c} #VhM:uC[_`qE'XO 66*nT$z-0U(Bg7!@>*(Į.l`Jdp;>Be,ze2$lMFB ^hC: o֍xN~>'\q(]vlS^%H9o5x8{ *cWk `5eM8 ^n}(aV@W;8#t6bѲ77?]?e?fbQV6?y\Mwq,r`#hyGk4J٭FE^xܗ\$u/[@f*}[tc?gDpFRi  !ʓR 0<,Qg&f݁鏔^ [huQ;kt^w2QVMF7d2ڮCdӢ4|[wa7n 3wV;;ک|O^(F Qab|Oq:Vj[VJ٣nujmFSj[Ue'^^./+գJU6 e[ORI_.d޾4/+Ҥ{)LeUeq}DÊ_c#4|Ԡlϫ gߣ0Xe/کQ@W6\^/Agj#B{DPem\${2S#r_@ɍ xtTNE@x6${]<އ>,p`sD:F켑UjT|Rnsnڼhˑya,;3Σaï5)PhqqQ?=kv &NFeenlKԟ,/6XvѲo,K%OC=Zc$1d"h893}F$MIyY8fu9gvVQ5oHm^e K<|Ɣ⭻A[-Îgߟ fiĝ Pb+G JJJt/NA@cHZ:*ψf_ ?P4Gx<_EsۡVxb"R87';ٍtj1|ǒtQ2`eF8p 扊jI8U"`2Jq,5S"dv+>,]tɜVChāk씔T4cl'xr+l: `䮷;べo8tNpm $3tig*!1D@|k f y rǖ] lz2Sh#:ž6Aڡ2vCcl:lP淶(bYd:R''PcD# ?oV˕4_W6 Ea+Aw s$fv 5@(C,NL i "}C{b/y+|ͷ'cOC9>fS<=g{5>;w6#6ͰSq`?Y7@A<.Q 'Pݚ|*o#k:hpe\5~bCCo[ufux7gBB߶9PVc|7VT)w׈unMъ3Roya ORp[%^_ j ?EVMEYk]5krYFOow}#gpk ĈwgbЛn9|P 7ᘦ19J$aXѤ?'oK"g8kCz/ \l=qYvA82yv_c`=}l0셛R+)%I i{,4۰gEV_$eOX 3E("= vy x\OE@)|y(BՔ@- fQJY$=|YN|2(9Ua7 .ok+?KYI!5s\Iaug d䪇ږV_q W\m: JmR p) N$$Azs$)l`^a#K1XيZR&eIH-rtI +jDNe1D..߼e5UQAȘǠe{/_0J.Um'"/kIFA&82 1Vfv=Fm`[ Id $xKC`۾}=m_h.3m^a;Z?SX+ Е9XyHĥcv>IJTqZNiNR1 Y_Ey^PEyNV9y^`CZ",^%[9r^e'8$g*2%-@e\ l}>fl:ث%h Ō+Re0z^[F hY\l<+ыLД5aNPR.ɺծ 2{^YRtVzp,(uFߐ>_А>5vʢJi=+050c UE wI ;O"LZ?}jZ86;LuR, 9V(2S~=)\N[k|yH":}Ru5G" HFwӜ ([Utau0^:hBE\Led5rDCf CVjPA(Ca19Zp1Tfy *CR 29 Şļz fԳ6cTLJ_]PMO*U&c,RkX33f.!?f{<}||f!f.!7fy\{њ)e뇦ɃSe0Hy(`rȐG{E4y 8 \4,ȂXPd'H]:N7 <8UH(kZ7bp(xƓ&v{ցX]b0vzo"ȵ K :ϚC. ? 
Drly1Vb𴴕$ -m-7 '=%Ʒ3Ëp ۯ:X[';U,t]+]~:c7Roya= [Ev `x iZ7|@rYFOow}#gp&tz7Ac䚤LGrn3ˋ2SMBi%p4.Xq$9Dw.G#2@Ngs2P2&Xūڋ Qr[n=.OO?sWl~mX8>>ﯲO$5N/̀^aŽJXS,+teZ> # `/jo^YX8Va|M_/Kú^O|z/{&Яa]ב5 X uҰ+> 咹jZPˠPփ9>X7"$#_e\|`YtYmBDsNUD.윀֊@v]VҼkc}l+)zX_!JZ}:1֗iSɐo=['R#Ɖ^X5OǢ0ig"wp@ S52 8u23]|0m_32*1ċPQ >1/3|xs<&_x0ò]5\Ћe;Cɡe(  C-R- uZ.Vy%e\j eA^0kucPv"Kwo0rn3&9Y]:pz+]^`K0v&xUPw%8%M?'/ "TxcQdE6^_bT/s-JKُ,Lkl%isE)ymл]_ L9sEMFKXʊWdI ӥTOQRND+ 0ވ8A}l;Akr@WY V9إocB*^dMd r,^Cx`aq^Xiz'ȱy}B'ipǛhxN&UA gCofDˁx5O3NxYzN5Qb'ȱzU5Q@"[zUNcQ8cOcQڠwuN2epNKx%OSOӦ83'Qp'id 2 nF ù R=Pf7\31ȡ/G&F/۹g qQ,#Q{FRrc l(j S6DPXǩZAGT #A-]V0Wmk5eJcXQ+=<鴑0N׆TDL JA:,òIhAlS+JE_Ě 0Wc2@?u?u X HS/sz iz(i +z iz({jX!B2bưG ʈUª(+#2:OYH,5>ÄKeQ2?FēAR?sm67鞃7X&9f`E%0Hi_>ވ K%K KmXlÒd _EZXR #QsXR{QQV0a V0QV0abXRv"OdJaUOs+1=CWVV1gYkXRvB~zTCVrbMXRaKXRV2aMXR ]e&ʀ_d_d2WJ%e&_mށ~Òb bưG b bÒ9g%#VazX1B2b ʈ<Ò 2M_,2_d Q*%e)_ӳ> B y %sM’2 rXRn_>ވKz%u’t$c[,B2:hwC e^fzEz:xUEw@Oƿ2F g,jVdϫ &]fF8[`kf)6W!fSqMڴo,J R]U܆sLpk0۬Y 2Mnrp]>9HMJnk-ˮg ٱ&lXHĴb- K g^z =`5+z:i"& Z:}…v!+T-)zFhFKh,R1*Լ0jv5X[TQ3!&3fI 6MNj8ͅ.ּ?f#5Lf&3_\tm\SOk{@iYAȵ|VV7E+=쿏>OwA`2 D"c\KY*jCOv離=>"S)cZy`ۆ1j= i?z,&bЩ ^❏VQ!/_~Q=G ֣b){HVY(hIdh1`.Ls!ޠ"k(_DXOY ;D5mM/{?vx/اDŠ*j6O$@XgOg'͵(6crGwX92̀wxĝ.K?Yߊl*݈4ɂ\L'|8ڗS~պ#??LtEp{7[@Nɝ5Y噭?u>W"oKlR$qt4 X3?_~q />QMa;^ ː-7*o\ŠF|93!!yqVĎkO#o7S*/qpPEg( (nŀT>0#fN)UnܡG|科VO͠\6|k_QT$ ;N'_&%1O/1mJw I=H9|z]8Wb! `G˙ >'/E\⯬cJ9ǀ_% ?=3|(AjGg+tlQX`PXذ=U3fh@eJ10 & JP) J[H3t/ :GqkQۋ#;1Լ*+ OuYS"_1*=*61/5T6WLBD,0:rY UakѿkZ\¢& ha =Hki?ʍ5/#h-E)o];hnӒm.[ې1<?v(3 ԉ /xhK!c\ohɧH*nxH&j ۽y;Ʃ¥ p[!Jǝ߅a<.⨉ L>#"m_+J7.;&ڕ?kʟ5VΟ#w类ru`1_̬Hh8n E0 af 1F<.j O=a&cUHMZq]=l7zI]txZK:Dc]~> n&;ȸR&Ո_SHinڕlxl@ K7MlOQ|ܧ" F\U0 ˘ cCd*@lӀ  DғJ㻶Oj}W}s62>\ˬg*!/Z"E+RGzn `cX}K4LJ9GH6}nvuA*7G6哦7tuuK)I4ʈPuXJjHYt$q9:*Nahbąf)IH̰1o F͕sM#>z}jZGHrJ:=Jiz5fâ>2Xwc_`j%9j,'`J>מ\%JSߜ.%)f? Mj꙾F#-wkeeM;8ߡ~Y)<`w}Dˤ/ðH]W"a1OHx;`Dv,}#[5zuSN|h{$1BO)2Nrd܆OAN+1M!'2Mo)%{kSO]mu]_^?YYu:@ 8EeVrMQx? 
lHN$CH#!&W>\^ƇbLls$`(5cATp=(rCb#2OK)dzt>Ya\qe䊞T0nAf;K#-.>Ţ)Ew,+8e%OiQhfۭB?p=bvh5)E"ӎBg[-L^B ;_ Qy q{L&mT0*N#C5FꝊJۢJƂj"6FLHF8JBzhlqeH_@W6Fd <ՔBQ@ W^Oyt)+F"DĥR<YHe-/htNBrbtDN9? s\5_\$~="8Um:yM.ƫCL'G#xBOazٷ{DoJۈC2GnF0OGdFJ! x%Ϸ}11H>d./Odw.'e<)l%ik b|EL6*QgLdJNt\| S T6W.j^!%#^1;ñ9boqUȞ)3 qoVEIUh7n>hgGk5zWJajQf 3(5zG)4Jf>OL.DFWt<-fR|S,|y:lKrɕ9In=78Kq<8W>^}O~$&D}^3KMҦ)k7v- +Fnw'pgK,mLY_t͍*UK< :lqٹxө:Cڥw{@y:y%a/b+(ֈL*Jukn2'pB FPy|_^"uKJ5/η^M㲼U?=i:q>7 oYuE_o y(rk\ֺ]:szt4*pu_05*ݗ['lmEiFqIhf-w`V9:2x~֟ 䄶jZ`'ÃCWGk ^[f&x-V.YN7ʮxͺ{`wE?;dræO2Pr`I|^,rnXX/ш-ة :r[~~Q ߊ1nsr7! Ɵ'BggtqLRs( 6yI|M ǥVtCzomlFyA7n`j4J@3o 9wsL_{UW..myG ۸pg}弎Ouh>.;PQËmgCuuA ,a֦7!(q)bd6 .z𢦧蕪h-ZQEnڥS[owMU#l{u*{,$*z"* ai>]o/"Rln1dNwi86c{2{5ӭ5.6+œb8Phs ;ter{x!"0%OH(ACW-"y3vо4"QWA)_-?\ֻ2[p>äeת[X2>,QAEx@S*瞮eCww|òӐbդJ/M9k[Qnp ClwAtv_d*{ar(I\^tnZPlbQ| Z[0Je(f_ E|aأՊgƍ أ`:` 6sՌgF4I@?txD[=SLSF,>8f*e(H<Ŕ?5tSj,ݪ~ۈS+E)q#~*4W\hT11qqg̜+xa@9Pt 'ά:Sy̪jIAMvjdEx4 %zxG6ʅf#AQ7omE$,:>$+$JGW[L;k(ED'k=2E5XK(ApL@Myabi]ߟ.\k ^_p%keC8TԶHșzok KxB ˄wI{v"1?cճ rX}K-axT-{r%چh>1wkׅ6(kgk0G٦1: n[>/ׅ. 0| 0= >Djrtx4 .pAvK5:Q > ]֍=Q{66vv[p Oj@N^Fw9ȣ5_צa.ެR CvSIϔRݟ;g޵ډ_#|t".f;v2B*Yb{')t@0lȗ@Bv=@cݸ!!=:r֌rFhÆUhs|mGNGD*CÆo?4p2 ~O2jwAY9Χm/1ta肻EZ-Y lYkFrCYł#,e-WW)JPVËca`)ȍP`CpQp|.ŅC.GD<VX~9nUr:cpY;A)(VT̼zDX`|)(ԳbAc;>q©$5ҡmsCۙ͹G\Gr]ae>cVq9YGWUw][GցI g:8oCU􁪘I%lhZ^!Gf1(m* wmpMEh_i8iD"T=rU*ڌtH-OT|W Xcvzm\e{=:yRa͈ > L0*O8\<}bKUENj-M հ.Mwh=2iG A]؎Y4b.VI$5[=#(sT֞U&pe ȑp, X 1,^kk} #kVwC)nt=lZ[yu:csvze; fMx}- \9>oOONǏD FsARhѡ)ϋM'FJӁ3sC'RJе' #ZPe'ԥTv8@FڱtnBj7ioTIK;Oe0x P3J:5 <Wf<֠t{1΀f.Ӟ'Xv-Bo7;bpLq.EStrԋ}x~x|#> vݼ}L΃7ށߍS9<"B74Fg;FbHiXAR aViFы Q0OZr\WQz\G U :.'ZDMF no}5z;!z@pV@G] |ET}tqၮ@lY+ڶ5NaDiFс_]'.E,rإ1Jb4nZRs^{RhDSZSbhS!5'_F&]V J`@S蜓y|i jf xW7ni54O[\DZ{.w"1NQw K8;dZ !tF5IA`W:g$LEbW? =tWU7mi|3H=N;EBIF1͝_/ NhAY26@"GzMVҾz m`z^+B֟Y&G^ך-rvko3~~;$ώy1tΛEM x|f/i60I|ޅ8g70ށ_VI(NqmZĞmK lAxgϑE_Qi٨0A@55QހZze()@ zP I!4U8o}?nFjΛ+@s[kpBH~ rp*Ѣ}rV"f=lhq{.$|e?";!X֌2#xݷͷ#6$xwv5^_&-&b4:$`ŽQM} F[hrL{8%ARv TۺF La=%f n:K4WۈhM\\_ ? 
j]W}5DTiK:(+,'_ERԚ7E lcH4>e"jEHK1xLhLRaۘmJ LI&;MQIiV|6wبi` ާ,VwSrXA2 fوV,i\{DyfƌѤ^n!Mdzl%{nBw]3X'!y@Ee-%mE@^ử٬)n咾zX>ܼ}I[Q&cQog{INGߴ;zO^<?[ >=Nsr#D+P8"Sk0Ͱ= ԂG;2B%lw7sD6}60&'ӧM0A5'Ƥ?OP~@SP! Ƞ?Mn}PpHwȚ;MӁTԜr'%h W0 PKv16,`ns bƾQłڟ"YaVUFqF=j"0FyJqr|d\gڛME}D﬩&"E)R|i7*(P" |;$9sI{uUШ{4L$L\$f242١u=ùG\h:꼭^GD#֡?XM\3;U80oDp5nb.#~bDAW]w}!eZyknvlm:״B?u:N 6gwZP %SH,E*.}zf[:u6\x< >.nzeÛ N)A;Fa 61`Ů"@T\ί6BPV%7NKn]N@GNo [O 'Y3ܝBc} 4w [n v@"0G}T8TinB X&ΘS*Mu^_حbjC©"E+IW}e|ŭvlmmⶸ~>k6Cn~33o3ޟ3f>ѭG pgZp[ܱ:Ba% AikAx8'㗧[-ܒ}#a_ Pn 2 n>SS_Kv` ?"/7&)l8Xl \%vWLPzäQͲ9aZ_îgq2 U &u#,g% rt߳vo?L2&iF ;6ܳua!d4zϡ]%ѮQ)O| Wf 00 YpUa84Jv>n޺U- _#Gd@P!AN}k,U.iee:5qvE3]ztpm@4%ԬtګfS@S1{#q2mlk{81pXi SE(.\Ze`!:}^@{2Ep&fB(M!3k8:^Pji2ϛ7̌ɸ1ٺu8bЁ}H]g$ifV074^x0X ,U/bVՉ$K'",*tghA[,hxwev0(࢕iX| a9(z. Fhò4ɡ(Ѕ Oyp<|zy{L/ E|y $p@&5 e _J>,|>XS+4Zc7^.5nkZ vN8X$a iZɷn-sqoۚX ǔc9z[S:\OrUN`:H4#Y VвٷÈ* Zd;')Bn#B,vOm޺fhf0A"HQvpFs]pS*?kd˖s9Y9YW>I*^@W@WײxQr_Z2‚em{gk0LɵdYYwKtW&e ,A% fǿNs 9 R@ a;nA(i3q e  nh-`պ( º'NV͍ -J~55~Q҄W<T]2Z eRJAݸS!0^hf9DJ.Exj5^>[o֭uZFE+8a&?6&[F9M >LˋXxgu4SQ@b.OچMnnn"gRNXfaUⲝSX%!IN;unJ=3RE!V !H0iF&QB6*Xc H957kQ11Qؕ%ػ=yFC._>gђ{4KFO֘ ǃ=2z^x(.'1 ).=au&C;?~o8 DaMkby 6M8)HP?J6sC 8}lBDTesZ/JU* *jcKEEc-ڇV=`)(0t7p ;VܚGnxjS?F_uOhS1 Ō|q;3Hg *w:# EAkak7g_t?NP[Uj5.%j"{H&rF|+ 6n^΍VE}>NO\ԜjNy-k3em8˺Yi#{&l^R$S.tZD"amҾ <e}^5"¦rQRQ %(\] ]k0 $7qͥX'4wdnYOx )._{_9a#`459Vl  ޴IJ$ϓ!rgHO 4Je2FҦEי΢K`C!LBCY)o@I9Dyo{װ~H6^< ='yY -֩!Hu>O!c.g ! 
(`X!F5 @@Vv6-@qy| ̳^oAC7 6v4=А1+bġ3OV 9$ չbɣ(QJ{{սvbX4Ðw 1uhTUA%Ի$ %vq" RݞV 7NR PFBmf u +QGXZ36w~r7!QaS6R4`nMsm䵉DO[SOE)֊7D}Z="Kn< "y$-"L7* jʰU!-rC_3N*Wa )lNjUaUuSy O # NL4$)}+9y/-XECB2pYٶ XAqAp:ш!PvY{H# ~7L1*Dch1W!4uޖ>slMq/a3Y3êA: ?З`܋(FKTKT~K E7҅GF$2%$E'Eۤעhi+itYℲċcĒ$E;|%V[q+mD/fE;`ehmzݕ4DUB3$f,I@x#Gt ы0+ZDs0q"DbWיjQ_K ?.ofh$)L|̠ w/t83 .J>bJxLfZ7qmB5Q/!hJ(, Qt MM_U n'șOވmwoLLw~w{\oW Ow=7wA2z JAK=f#.,z$_7 c-8,aWu0+Q|_9/q[0)AC: Ĭi)EE%h)iG(OJ00)YT+vÝĕ>u w A j.sȿ|E+|ZQ UX5* +9tYNH#PcD*PF|SD0 5*'ʷpSAG vxE%ȥ+D Dv1MՌi:ۈB"^8vD1:/k:=Q[ ƙ^ϵ2me;XڊLmb4CRp>}(Ŗ洩=?Y1ن1gΘƼ990f92洗`:9ʭ>XѤBIǣv#qFhQO;<=uTTrIb$7.c `Y 0Ɖ -׽o % 8KQs$!] b`2ġOLfXH!F0^t$^^dxIEUe8E¦4-Fq]E1NqpientuDe2> E OȗoDa"'^q@6eW/I"J؛5I_IIzɤEˑ/a@N4N~31SobGU.v4`P H^&>F-vQFCb+'^͆ѡ^+>t {DOƏT)pˊFe5pKrŅѩKv ^:W9OyEsI0+«n#=}\/;{p ޅC_U|>g.3u<(8P1ӇM@qbxOpIeU}^7> u1!!Tyf1ԙ_sߎ<ũy5sރB>3[ l`/ZD߱X@ 3yJyH TDEM#^/QDD $HmswF윫B ((Eq^ozرΫd N6CxIru2!@kDAu  ER[G&*UT{48֛4 N{Hj8T%ڂ-l:UVz޹:~#wG$y?K4 +x#IPy|J=݈COBkhprƀpEs130ǍOû,0T^?0ၨ^|vcR*zE 9 ʩ7?_ߞ^ z}:E.z)%5ud0<7-*W>yϯ^\WUUc/ϖ0Q4Y tPRp3r9#Ķ3Z[[};9F;4;:e uz&[}ELu٧e'%ý5b! X[;yU(<]S8םS:2FE؉lX@w@= 8ܲBޱd.w:m-L;?x}^d<+ NU `ȭ6mE=,.0;%&˜rqjp`.\oz{8pmk%Ɓ+[@9|vC!?xi#Rtz'MدpΊHҏ<."կ?XLqC_0y麂殽aH3>˪x|+NzL޽a*~le<aj_G0]/0 $A2CdjPK_IensLLy}ѩεZ %&L*J  :''1'N'XU69idQ'UBTpX,aW Y;&f@[q)yεOvlO(74|"ThuV'$^?K9KaVeid8d),gTvC[xlJǨHw ʳ?`݈饾gЪ\b$xJ -8vH{hA*M窘uT-], cT_w ]bQ88Oz/r&mIc9(MC\H?Mr+|:DRBU􅻮Tj,DCSr8.O^bD _?G?L$JϘOVϓ_ԋ)[m#޽ ]-,.D;uuǑ*#U m炓S!a?E ܜpqjJ<,ң)FFc} tߡU."Ӝ-<w3(IKA}$pK?L9wH]&YO! s]F5 AQ;qfQ3b-/O 9 ?ɛfWE\av0yh]|R@ӖoZ ϧt̍Wƫ͹6^m|w^mEbL&uhLf&]dnLq&C(m}N}v.>^UW ?Mz[S7q{MG䇓 x(:$ۜrbbt#v.«lm;Z|mt:Θ$^:A6IOF6΅$&LIqlk<՛e:2k^۶j{N a[,Lb002=[՘)dJd~$yF6B}NjGg}|x eɚ068iI\!1[XjJ}@5\Fl&qR,}RZʞsq~>*%r~>\h P7tIK%zAhozNqx*=" u|(j ׷{2ڙEkpR-MFiCK3aS>C`2ܝS]H"DxN ~ʬ0t}CfaÖRf簉k{)ӆ/sII~A]Qwm_ ܁A붮@iTC)~pGY{ь:c,SYk ¢?@wám mCUܡ*:_-UN!iF"MJpwg{#~FV̧mm;zs]ԮQ+gKbw{:0[Ԟ!햖7 E7ѩ@7m\"gk0lp pGmҽMW`Q_7f!yokŞq<3С+mƹ>x}/n#1\0GۀAY*s[hsiH?ʜRk4~S].Di~%6h|G. 
ڀNu7h4p8\B>4@ļ>PuLWFsQ sH ~"6(5gv7q݁KwnUܮ8Ou·oP2 1V _+(([ ҒD˦p"j#TL _TLuBwTOnAV?cnun B)uWxR-DMzoQ{vprE.6]X(>TܰՓ+̯Z'ghư>/_(H:= [_b %σ5IOޫѐop: o^`qg_]ʢw*[5|q6P{NJCT7ҫ z^tߚ *_prT6 /*~a!I bz?"ҟ-9Hq!!azn]qFjor et'ѵDMOx(jO,<8ٝ w됞o\GCmR5bU[;k^tD1/YsL Awu#*JI}!<9 9 X,Pn?S,ߞoQE53I@!='n|]n7?I!&u,NZߵ`)7qX;x#,X2ѫqFs%zxvKl9T FT⢃::=ֈc:k#8©y˼5cr;{/É{mF&ϼ^F_#6ug)-7ll.\[qNq*]bӡLo83pW@J##5k;_U'‡p1vW 2)aI$[''vf&EV#|ρf:~>|v7_iN 'p0fC[=G$E:S.ExGKLʌvaaNAL2t3ha W6|na}4&S2 "D`n i|#s͑Z8A  jgWqF 5ދ b6tkpMԓ/UV} rChéB([IoU*h6}3JSS#Ugycۚo茍J[8\/iL()pz 9BiR*ʷ찫B X Qp̪TsG@RRqTS@t;G n7DhPb܎r ۿzP2>4h T<[聇/'+ME A3}N9cM kui*щT qݻ.DA]owxU?~aJ&tRȓ\q-'njWř,Ebm1l^énacmDʼ L-ch4+ :bTIPIm׮2CKrذKX[w'ok KŒJ7ϼ_r?ǯ}J7_9CTpi62Ƒ2~i IAL;aCߑf 6>9Qc'Ns⟌w=/+4ZoT_nor>>~B`%wX.T~)WKA!EZ98 )f 2 &__nD9Z> +8"7h𒮀n5˾bJHleنcQwo.K4 kF-`ᄖtdގ'wShMB!t;8v7=̘}V5i0ɶKd@j#2Ơ$#pRh`<@Gw5(U\-ZR''xm_pD1r?-BSdN.N rJ~CG^^>exLfms_A~X8uճWM8r~)8^D5!~v`(1&P]`aǫ<$2l4X&_%qKK^,u3(miVc{*֑^OO ^&};- զ,|E߽}A9wTD:!u߆Sruv>P#&w;D!x4r9`6|˓bQh/LF7׫JEe%{;c ¾|4hm&蔕m6 >|NN#a`IR1*ے)K[gK(+OB\ 6jxP¦GpȈM(/3)y0ˌ\*g*fkX`޷K Xw'PhKT5erc;xLG D pa?f25|qAi _r=T;`2ma,&B>!vEW_.'&$i247J8qy'ax1&0M!v [Ngqf]Eg/ERAPUfd㻨TN{Y4Y.i#$tDDL?K, Ka "KD}-F?rZ^@Sek1);Oa%Y4fE9GkCO[8X4L/8p;2;3qúܧcݧi'bDP#'pGA}HÅ#%6lآޖ+}M#>肆%e{YxF/Pr=_3BbmX .N=".h^ LDZT] b&o/(0 WpS%}g*$x[2_ۀi± Le.4nc@ӷ_t%և/#bu_.dx<7sAfnu2stTR^^pHmđ!)葼 ykyפ֙=08N׎4e@7 3<&P6wr跄 BRg:َDߖ0̀XhYt#CE2ڞtkTvrLRϭH`xE"͝U,4gtLC.o2:&Y5 ̈́x.R/"]}錸T\nW\d@ wF*DOlINHnX}B+\~n^*2L IR0/yO@x*I@1,(> ˚~.4g:"`Tμ)^sѲcR|C}Jt CSΜ\|2=XLmYfhۼ .% uC p۰y|//'_"?0h8Ɉ@vf'ל"E*~2,/Vl0̅4ƿ[f]`cn=Դ\ a$#ʟcgFz|O% CF0yV>nH $B s#LϐϪW6o^JK TM kzkvu޳7{x;o%O(G`?1;dels1dW`FK/I$PkJJ\&զlM<+E((f96"h'6snn}r3[oK  !a.Ir%Y(6k}[%0% gb:jEP UF2 M(6k=ֱN0^[<eb&y G+JQQ 먵Oy>,;TZ >L#'ZBH-S%>VAsvkפ/VGģƲ"4I`ջVĄ<ʋ>0^|Fvw(-KHLxm9QHԛ@)M P\{y߾.0X{×V(ﱤ26Lp-0}ۆ× 4[<̱S*@WQ]jt(YGI 3$Q&.V^3J Blz3Ķ>)?aj'0Х y;ygyO/OZ4] TwJE@w-/4=^ޛzKTX;8t-Jsd āc9|ΐ\֨6Dj{-#N|5V݃=hl :V?ӿÎC'ƫEs*9]Wad[KAu؀Lr o9kj + 8H;'+u̳\)hI{ajքkl1 =wzʯn+HNu|*EK:I=jԣ Ca1׌hPQkָxX?NP݌7Z"nyDR% Dx 
xuJuU]HҪ3Yx~$fUdEk~𜑴$^ZZg,J?-~CK!aD.DO+8eT?Y}PpQ'j͝|n{XT~7FU]cŽNW5,r9wv"#"r!1ڏ|Y0 c#F|>pfp ~01P=(A¡,eAsQ.:'Ѡ2vXX@ H5K#z5bbwJ.X6n䘎\ԅq&Y^,+b C{2Fon@*rb!2\^zl`;naяT@]:;"x<~ 8(Ssm{?D\29~zK)WBgQw:W4I\? ʢbɯX6,",^½|LU,#3^~m>mJ,BqЋ[]4𪇬*ݥØm9s%mԊ"7QE!%n 6@cD%HhXl=s.xI(={\?>NB]OIOI̾ٺgkxϓ ^\}0q@,퇻ij4*L ꎿ>岍Z6|_mJlhC[m翱asYNW0V)6dGlG@nU9X BN Qu0x-ZxuAuAuA'.]+^nƘ߹VnnG) G0ZǐKl_{^V՟CV lv. \{{[s*|ˑVUK.2^L~tP??=4܍Sg31"<4X1$+/x@R4BZW@c$_z `FG,2뛖e8*#7p/\^́G:'q7o3 FW@W?bqBi8RdCjoODW>ʆ!\sYeo{lx8qbc{gc5O"Oc;7y|<&QPXhZb*8nҨ\{3~Nޕ{{8fXd8:HKa&rnN٬5[)+ |Uf eգ;YjLV}mܒ}Uq|*p|&RdKHxc;dn|86/3f[б8V- c~X-s?)Ͽ8q*?F3O6; ID̗mB`CQҥ{^n gn{Od ?m6nCNۯw.@Bٷ1&;#1yzfzf= 8b.`w(ɞ*eWl[6]kv5; 2dg.a7x o&9Lt 0sGf.m~fr 2O燙}^R۹z~vȁWW.ҜD:@/Ya3tU/fx9;syID}>"ޥRt1fҜK3~El>s69kAE.#q$~ɧkSzs QQI[5ʏ^^uH7@{.#<`I9̒S6p{nnrm㘾:ͥێxإ#ݰN dvo\sʍ> Lq#R_伔7Klg$诣"y7r.]`ɇX><;A@WT pśˊF^|oGFG! qc3 0XX)C˛ާw\HsaRgrꑵ?9Iyt1wI Nv,sBNY%m땘] KZ{-Y- 3VTZl;κ\b'Z|uvE:puV|B/Pc'1 Za4~ )`ٔ$oI N$$^?.Y%.Fu—>&vY8dC$ghّXtHanJΣ42bћ<V<ڪy^=FM%2+kf j4TGSë\{b$nX ޓ[&R H˖¼*Xߐ{&\t^#;*<ÿX8ؔ״Zi/} 3J;vwY@\N /̋=; S?EYzH Ebi5fn'QZP,oAԽ֚&;\\}<?S.OFW JI֠T H)LĒ54C~N2Ed{W(٨t6щ.ڳ() g3IgT }C r&]Ќ @rhh=C YÊE@jjh~SΟrM̀RϚ`W-fņӸ-fO}k:ר-Ԑ p`UvdK}je+Uͮ)¬F$ӓfJU)&KU'Akkw6Gb-E1kZp])1 J?l9MZ..!FHQZG/bD_WHm&7,xiw車9*cwZd ah;^X_ޒDO"h2, EOe#^1}P /EbW6}eu%MhZ"MMacHӒjZz5샯AbN9RڱݔS5auȨPp`iTpiD36;5ӠZTCGğTF1kZǢFI|T5WL9QZI4RYרTM+QIRgAaSnJ&t,$#%tc }- F0f NCtƳc2Loљ'fb Яgk\nX" ^Je@6GF/%8Ipkl=o0S3mdTd3Ìs ؼ0EŜ8'ӭ4.+Y#S=i(oE$+U&/$?k0sDZ`f`ˋg4VH̃ZL据$4Y<6oe?}I֌0%W6/(Yd#נdT|Pill5p94pYɲl(GOSI&A',%zɵϠO>V 5`nL7n÷uRFp|g%HݔF}li1!,Oh=Xolp~=*zL5eieA0]5p {FvjazF4zkثԽ6|E|^eʍ P}^KO1A__6G8n pl( w7*oBAS7/#r94Oh +#k!X v2' '.hp@^F_{GG:ug7~;* d8C5E([5p͇>6K[l:Qs}H!e!v lѠ-PӮc"Қwd}p."j;6g~n &E#W UwCxpDݾUB"x5=ߵ1Q/,"(o/>`{A"`/f^.bs(& *69QLG:m:M@~%C e|,)칥^xd~8XZ|sDg- ݜI߿Mު ^U`'?>?Iq-"i)zuLV^[mdToa99OˆCk0\O%9 N"<$%)^lkcvr^./_PI_=mA $) TCc&J~bpCh!` }.ܑ'7TtTuķmJZri Ύ 81nLŰ`9` N}QfYiZEjWڷ-]hdAEqzzEk C veh7jau0긪xLۆY<ҮRy=)cB'wB~){g:>& ԅx+9Fĝrbu)>"x;Ky'~K6qN6k݁1n+\~;>.i 3:C(k=)a9.7vym̀iNTmrG@CP+GPm0dn| {ęv4uQ,\W 
74M/M5wY>voAo˶c(3J֯H=d;ua1/r)k7ѝ\=Q=1Z|Kr~?x6#_1ʰCmR5bU[;k^tD1/RT~>?+y%f}˻sZN;bW÷1H.=Ub@J5DչY-.B7s1K䝤Mx=:xw:C'ׂѿWf[8>ѹ^,_Өk=-:_J__2_>bqʕejeRnL ʯ/C7-v xptrLgwq?ȏYy-D~>h==_d"~G㟰]I_~ݝ@](;~{rrpK}xx9M'8aߠ}/S CWaHF, {[%afkN>S<{M~$|MHO3TׯߦLF?0O;1Pإ>Go|L3n119%aawu.}L#/]g@>Ezw- 10  w/f)&T"m@ҍ0ʐ= P],mgV! e^5c (g>=>g]q;7;U  Rb\?;:H eQyXA^l]ok)-} dqp$^{dbM@2a)DXܞFó~B`%w qreTRtj/%t6PP~}yh8"7%Q*H$dg ]J?x%_~m12dSL,.Q}~-SN@P9_$ gԫ4 !y&F%sr 5Zuz m@V5mKCGPѵy[Ƶq7͂}2[&1OLEefx_m]LSл4 ~`d vw$R}7/YA]ZT`@t|>[l/ EBDUy(7k[ɄWuπ7 Uﳡ }J=3v"32OXshT&텟&'嬈z'l8ԙSW* :q&4,$cWZj\!ix)oP!'Ì=# V@EZ~\Ī2r_VFsC-S-!MZ.Q%e/97Ô P7EmٝSXY" اl/ʖ8 {D.(NRASvx*^ sPQ<摩]7Y`2ի"e2Mº&aMM blFZ’h햺Gը=bư&0u5(,wX=aɛĜ!/ʘ02x*_)aԿ|Fē!a%ԟll >yJ%a`M-l+,>< H& H& Hnl@ttK_e[@RcQ䀤,AWQLب10al j(6j j7hb+b j;\^XSrúW,5 iq y<âU1IV $-jOX5ȝ [e $-lX5=`+`$N󥵇zorDRAlպ(6i kj[d4u,d35HZDE0u*,wX1Ma{: a]- #Mb΃܂7%"i*ߐEkD`-z@*ߔ%!I < sDx &"im>"IHҡ$[HDG6RQ!:YiUg,RzЕ۲5yۅe@f]JAmxMA^Yr=xH2 } kRjbXM@+Ǫ. QG뢸W[r] Zsع8KbY,L Pוt.ڥXj0 NU>L!UJ2] HTi.T[+thPeF"x$GܖM9KZc1?Gܚ%n({3V,P5cs:A,87I{]lIыx*n2:ٚ :#<۹-9R9[F95{^%SL4)ġW,zNɣtX(ĴJ:7zUx^2^WEK+\BL%3BޱGEOϓhvm)Mek&7{8!n臼d(a%|ߦ'vDd}a?v7s/w@N6>~B`%7|ʕ2qreTRt_H1K Ll08t#bX!7dNr d@dOpS?N;"7ș; Hߥt8}(s={&^ nr蒓_ӗx4N'OO%/h3ߠQy^&#` Pg˽m<]-Kt4x|~~o|h^sh-uW]` h-趐b_2B# nw\,'T{$ \x2 ǣ}9qw/?٘]8JG. 
Gwd>X婐s>i~|(1=W{Gy{ktkפNʒ7ɑ&Sݧr2Ɠ;P > &dgZzN~vg/onNHD**/݋v=7MyGqV^`vq|-=m;'l{zȑOd'۷bWy =7@3|+wRtovHݠ^ yZCy{^Loy1%b /x46.ڈFXj(*,_!tt9n: :C]<eĬz @Gyݲ_K(ڂxI.@gQLnesw+$4"'5&ZV8쯻HP`jx9Yy{uپ iq1\rKAS - /x% FɈ~#@@,߽OClrkO3Mt|%!kzOYn%f ~/ $ͷ F>d¼/(aJD=ǐwLt8ZA 4whQЫ66IM{$!>׾Yr2 PFoY/$h#NՇ2K!+ R ։vKI7t #nˣkBrA<j"(X)Jʿ?fK&y[XP0‰ӏ;'9`]ؖr℗q).~ =DhW(zU.v8:N2 w-Fy1F jƬOT7`9IZr*ҬjO@ )T3=OITb<&$Z@~/bAo˭ 0ВW %Fxnêr+UrSeheöx ٮcoN80Jä0>)?HL$=yXѤ9V/BG:`S2ie.at [`ĊEAOKL,ZSl"M6 o),>v##쐨?".HQJQr+c( &(Q6 vҪ<9ľ[Fߩ]\ոq (/.M\-:׶Js6$Kĺ9|UMhMaJ:u_=J4_%>Xfcb?kck^Eui$"-6H­G9 I}앑̌,scffy:(l*+zeSkϦ^.i6|j(/gv7׳؟7ȋO`$ɬPF!aXyu9JH0YavÍ'&>c"-u)dWf_D̷e?JJ=hB=ros.?m]'pȓ1`yF0'Iɍ&xގ{U E#PT7dY:ӈVp^&Zs9 /V[F:kۭ ~l!N "&Dk7Bpl;< ï;0z+yR!hXčixsu=Wu5\~!R=!8: b!3hcLg7ՉhgM#&4Ȉ@-fȜ+GB@4!8+vϑېNc cob.pJx4L~2\Þpz %v#[NWqg) x%,+9Oͳ 2Ї0 X4t ^^B7dĒ:X2 xu;:-|<% r9ܙ;ҿ8huͮ:W}Fosdv8 G1~z[n5jf>o|>LۯcڙףOZ ^w @rsm~H4>4gIHH[o'8l{ᝳ[l8x2<h<._-]$QnoX]HG >dk,Ν; "$_fyoylZoSh R?j6MgeRݫfA>!(Cq8'$/4P&͎V$4t֔H[ ߇o[S Y ]a?R<ʡ:ǴO :K0ۡuuո[&_\RHN.>mm vu/V~Z]0ş[b|tpzM.d2&rJ]%-~T?oKO$l/۽迏,[QFE1 e{key)졄yd#  6DqikMWu%U {[FHwoG5M"Ҏ\MW e:&ԥb )BJGOg,[>9b@pTp`?QG]ۃ%`Ltx$ eb|Lb_B~1^`43uVyOzЙܬ,FDM׮m5,yZ`<g \'/2%_1 =CL(/O;^2;vD+-ɂ3ge C}[`eq<ߡr3ywg~x$Rm,ú|Vx/nA*E@4ATmsFQQN:Trc q|- y{T+x/ltܣTmEzǻR;qqE9aKE(Wف6`9yCMOT`lҝs5  gYi䐽()_R;6J[r2XhiJLN4RTD#hSKT-\Y8WHUXjfh:աZ a ٺ gvmSDW[f_ϴg XW_Ǝ¼2KelH[' 1V`#RT VY%ݗQk!6zR /(!C/b* 96DĔU@ZG"1e1s$DApel8Oijေ FM ~F"n\<>oPHI`5(XW`HV]>w1udAVRZBmq߂РCufLmFᣬȾ?DR׈y2Yo2 ++L(gЀ 7UOxK9r1`،7t  ru~F?|UNZ [q/D;Y9TXudC=^ JF_S?,u,E_Rh* V}]N8KUIޣdꬫ$Bxc.DƲPeF(k?e*+*W8}XBsOܤt$b\;{.(|5 *y PےũNTʡ{p".r z( .|,f@Dg9uG#UsYwMf$^iT&Tϣ,' IBRae .Z:i,HqnXȵʁ)a0,P(7cp5mAn!Xxmm*]ǟ4ʚgփ)+x)xyv S.zffl%پ jx͇ ㇸ((B4Px;l e ,[J{׺_S 4wC> WZ~^Q~DG/B4@bБD$^}%j?\g0|Xt)?ĕV <K^ՂOKՂٚ> uu-UAuqI_W >֌> NtjF' = O&-RQ89FdH~lлL g6mVSR04u}},)·}G[K& &$(E|J-"f;#vpC^dnܪW鸼_8_NPۻmI7Zu: Zrq\$!juMJt?͵Fϯ _pwC+vѿ+v~Ew~J*ƾ}V6/ﲉ>Ϧ Fȗvr(5yrrѷB:s\,-I0pri֛@J2$zE:su^ov{$f8մ$~ cѽw^$f,n)›y1ڊ*a\8f ϵnFQAy<]n ԹlfوSwCR'3rGZ9 v>nTd''6JXNpS\zW*B B+<)0rB:iӓ*j' 56k}{\k|lyEaЏjfKO>P^E꼳{^;R{޽(h/0 
ˇ1Wk%Cۮ\҃_(Ktҫ Xu%Wk xWyNK)y/UuOg'_d/`; hJ]( % (7gO]xB 2!#: eudhyr>iw.8$7qN!'Wy:tntWNHN6!ۡ;u阇Z6<]7x-7tt@e@Ԩ8Euz|8dX)^b"թu)XW;<@ I<,E6 gOO/tن;RHݼLxp'SNS,ѦwBSRHٙzRap\h$!Z7HBVթ:2}zի!P%ig[{Le]@\wBI=r,9l4lhduX&ܽ7>AFJ>[6CLmffTmFyj Q|S|KRDR {G(D.GTCe44ӇL$fIZpLSTbJfq ' meV_f M/^Ew n<\\6;9٣! ,;z# PXfQd~R6~ qa{:??c#̭|?>L";e≹i-%HDWN&t2߇CP Ơۇ{p4}J{C5o0ql<ޱb92#OȩP;{y[h}p3{K;D4ߠuv } )D3VgK~in.Uj7ÖI 淉I)zT^h0"Ҍ8,z( I#,jJ頿l.Nٖ8E5 *SqV'k$XJq]5a <: i!?phƌzE]k3 ]]<& xeL>S* r*d['XV9b"bO%?K?%s@5vhhe;X((%?ڎ hb O kTOtޞ5/=ed-:qLL_!a}@O1&X :-Q8;&*?/9&wwzCcuދnG@Ӌ qߧcݧi'bDPGOq T 3OvMѵA7ʜxj#xIVa6DQk*RhGM}bB1{rec8W-<J ]%ؾ`\Qvtvo(R7Jo1?¿&"[Fip膆7ڀ"aQdE]F7e'y`[S'-a`bT gvU7yrKMۖzK GzRTO*񪑶R0@,J&IiMZmR;~j'WwJ0wJ$2y؀ pH@x ﹼG|UX6!KsL⿕9?loᦟ՚ҕ&_'gZ ޅ320.q(ITs1ArtN``GDŽ1N8q  wŀt;(P up#y=?AݧWR^þW8 0ܹ¯8CEjcvߌm# #⣜&ǘ>WlKs7o$f!MK׿55NnzdF<Vijo>]ɀ:Q;x!zk5mY(kk5;kB}72q<<;5!z`m*}d#ȊAg cXp`K`tp(\cȾuLwUQ#ΆBN.zSߋ~.|C#Jx䁎HPF|p` 6偢p x_QMe[0luGKqqlӚY 1FcD/_'FSNYBHR&W)IxdBUiތfJ`So}xvEI4ڕ؅/\`XoʾUS1/+00fh'(AnG߁0xh*@%E?@yporD-$ۯ!Hr > ;Y5@wS V-)_:a*3eiMżg5~"|NUEz/c u%WGT)6%8,*' WrZX*4+=?wWTO(n eMr39fhȍc9Ms9?z8r#7Y} X+h8)6WH)|_ 48]& QOeZ-ʁ> ~iyQU7l65ֲEkaAh % [uz 0GΆ jUU\rvq-fFIwMECMƇ*p`K Ƌ@d)Q+?i0<ۃ&dn4cD:b^^r-H9} 7ÔBfo~|l`~ī4&{y8t?ە+/l|6g)aVmb/i=K5ϺB1?3)]p (cvyW F&G %+Q~c[a=U16~eTZH\ZjjjXڊ7-S[MV#Z#߶%]IvXƺ"Fw]϶"hR:O'5dļuqnӺLRiF*l "GR }5`Q'I '.c]XwnB3քadݽ rYH$IlH(NXZQ, A({ӉWNT읈l龀a,(PqzTbɗXHRPyD97gVEJf$sn/'yhl/IY!<7?<:ХR/7ϷM9 SA3؂1. *z8(df-5˜GBw5`^gWUgBRIKE ~{"b4]1o3{n0Y6(@P;dBĭld|t)Ѝx߀+^Cv=v6&8r \gK(\wz|B w RސuVʎ'A/J݊] f;~aKA 8{zz"k{d/Z y}jwn;2HqقCvMZuq0e<1*膻j)ї-.$e_/F^M׽\R#{T3@.`rT0m|ȉ B-KRVE~VnyգdHQ{ܨ1?A6Ms_+k,Téؼgƶvc f[6Ov[S"&v\$\\_5\}wQ&{?ZU\fH1Ѯi}sEciؤЦyN_ͻ_-i5? 
mW6Ny }=ĂFo5v{Ui.>^\6zViKG.K2QE꟥w29 =R<ٍ E{<4xN`TwbXmR`t(bLU^tL>L]~iimL@cR+fBrpA ibl^GNG!#W#DBX`[au42x8E9ZnMNEq<%JHEdU~))r}HaZB\ftrve=I[ :]gOT G.=y\` nS Ts*t;eUdؒ/["4^J3IqK^o}b (>IOxDU Aد"AAl^8N!?7?Dʋ< .N?oz%tś/^B&]ud?ubz%RaHIp> }=?Bp:GO1_Xla ]Ò9٦b{ep3X|0k /ϡ+DU]oQQWrܽ|b9z1+¿9mMh-U+Oeҹg|~X*np,A~X%lW&c:>X{URLl;i͟VK_akTx| O.G|l՝j94\I#DB@qd.M3bcY_X~a s;w^u~e(s1|r=?G_O^4h<8;䳏 pɧ%1D vr ץ)R[n#`:򿒲i2k_B`_^w^kq^Y}u׾ B+hMWWg?V7V=rk?ݵO-mo*se- U!?37z8QͲ.ίW_t9c:嘜oϨ]jZ0owV|lCm]8*&IiAQ(pDّ|w!D;27^}7aAXMas/{kvvWOF/UwwLʾ[;C͝4:!֛hq[ N0랟h]o15ǛԢOmc9ޘM,f"3;ϬX9v)e,ALjm9>)orQ̥Y͎Yv,ϥ97\3F& W9op- _ٮ pD"XH Ea'ealIZѸA1 aq1e*0eMA=< 7tSl {|ek0<0G|@IpD+q%0CŢŜ;8FP6i B`pwT+~,tFw\8)yIYzP9 q/9E7;Qt-$sCr0>:mk"ԎVНq'Kڣ(8ݱ Nqs 8]q [n?oia,^2 M7XSƔ1e)caq)kYjm)! (vn,Ag Cr"G kӵa:7mvgBj ZY4..`8eu!e|\ydբ§"4GHh< Ǯw \";Eq ~t'JXbw2Ns2]+! DzN:awh`X4!8&f"j;%\+NXĠwBjy0Vl @5ЊFgA\pY,'8 +kng̓K(%R[< QT ԡ_| #VZPX}^9uLM%0$E<96 3 ]Ӯ-沬Dc}1B ` l8 */Ю$H vWrWx 3&#Z`W0|X$hf21-̥EY2.FϬ.\h[`fӀ6LytvQ3W  aD-6-{7|V!8&s0#>;cα`u*eE3q""RG]*:Gp[iSQ<D G`WX''HZQ鎴0: !0E+Ć-.]b DA w&}n [@>OD^ U(oj0 < EX\נ@#Š}: 8!P>S|("8GxGɏ{^|=p-?8+$Ā`}k>Ė'֧M/JO>4YB'eJ֑ב幝;ޭ< Hu0[txE!h &weLC[K4C2Ѹf4,6cd|%}qW*34E}Ft-NjwK2J,I̋gcIڐq;p1ik'd83N" $mm?{()m>ʨDeT1)uQJml+vAg9In1*{9̧T)w-B"P8uǖDb ̉Kq<=ٿQ8_dW[1c{Gg<ۼ  њdl]]E!r{*/^J_pY$#owZ~ph# ιJ@TH.4}wlx*VL6Z̈́'<"HT !׉sxl6ui 8kv1dr!o/ 9p7̱4v)Uٟme›l*ҙlp8ٞ94`J|wBNIHV `qmyf>H4hF44ڴ"ٰnaA}ڇ+1Ѕ V#)8pT3z+nvqaeՇ#_!q 7e*yxĽ6+'ڬUȇ 8Sӆ0| a+O1"/a;d?7o}01Xu!NzD`xxJ ӤaBCPv̀]x߫²=PZAyl]]$+TdUmh`jIv=͵bB8y!}Tk&Tf!J('9N<R&/ %N Gvt&de+Bt~?P@+z6!|*-}|߶t?׸i63f<#va!G7$Itjn'$qؖȆc&(D/r^972%51L m|LB`1 *NR]|;S%m &AQ0/qWUF^ = a0Z1n߶AoMk9b Ô5GLr|G~IoJ=p}=> xBXǸNzT&=@WZ7 =X:2<AKːP1ĈFbւշt@ &?@uj'BҨu:JwFqJuܭyHOp9*X"fB a G;0at<^?bmpۆ[_F0q*Aü訍v2]uM"L1%A?"M+ؘ<&U]n-wKN^r |%Y+=~%@.%K:ioJ64T'!]#fBAH]oN{W؟zEc;JPiܻ1Ո;,c8k1mİ?~`i|z)8^u]vuKNL{ăK)gdE?]g9!T> p’ߩOV𧻓[18ׁK5mPȀ. 
y,g{Id_U=_Y"~\v*ۜ1 x]c_ Rb|B6Ok/A'{I;k_=էyGOOrFugҘދ\Xas [p[Zx>KH?D]a\W:Aydԓ"GR?7X"XDNANљҡrX- 8{MyO|wk,ۧU<ݰnb3mT@e8`>ym,K;C3Uep_=Gx8==ԥv|ɴAx[&M^ȿ+ŘW̗WT6o"B׺n{ F~ÓEo7n{Hhlݾ߃^/ =z; uO#fy _Ƒ\4|WcW }4yjbAƣ/ozFɹ>~hray6t+y]K8oiHY[Sp .7~ `8xzy>-3cVs'p?L/B>W TpfQ˙K=Rս@ .D&~{x S u"jQ '<gLeRR!*@9F&~OWntYDq40-'`[@^@:6\dN;?=HBR*{WPv-pd"q "i Eެx֬V3H@2>I*?c$+bT5O?U*QS>M)0L&2r 'x"W?H$`NDu АWmSET7~^UXViˬ"E#ʾ-w/+$"d0!7@E3dےO{CޢAaOW@t}䎪1dp/ϟ -eːCƢ"2V\2$2{̏ G$O/0}0u /8 t Z4G^cY~JVEM "f@b~ bhɸB6h\J87d0/ dsSiCGaM;ΆL'`z(Jeg3tx+t.Dǃº`M8͆PI[Yԝ0ظYwЬ!ju΂uCz145& c-Xk;ՊOp@Ɓ[("[`q0V1q Vhj7mf"d-q.UX=X55I؅P1RcaRR!F`qXqiP* v%J+S%RBƂ*xS'8 N6!*/U16h0v@Cqh1;(  jb?3V\11MR ū(6 I4cHӀB4!FA.6S{2+>!vV]oqZ3aiC@5hvb>Yf bffZV11zh5Zvˊq( 28HY E Fm]ZeurʴĦM %Nጐ<ԒM.(a6S\KF(4kxs7DդїTi<"G- *v7kj,GƠ+]&*.BYg CMP,**s!!3P%xnEz.IMcHؼQGOe* NNE!|-3Uv): CwHGx{ fB~QpbW J0(mӃҶ Qnt3b Pp3QƓA7ip%֠<5ݟ1O*3*5I9Hع+tjF[u‘G()mFK#t{`ҬYg uoD3Es ~F`OOn1ID%*THfP pNQQs`T!MaG GB4to`Q #F1Y 3 I!dXw)\q~AJtFT f$y₪θj0m>. v2Wb#w+iuvjqe v(]tV,$Kby=( :h7vA@7n7v)]! =I'I˞6[ g'^a͐ˬ==L<}gV'<0]|.>kGt]|ZNc9Eb|z!J~; CG\lqkOk?'o_&yM]x^ST =-Q̻Ş ZF,8O1+ofhPa8kgP|v;A)O3vYLA cXa1*!GtP}nK_m<}%抂 cՅwX&B:M:](RW.Dt6ք͔uxM,J QsutDz$ n!.%]97~p Uô4!̴G\t =*4}b^)"{p EY%x-UzJS޾Ti$jCV)z)(e+dz*^L8ә&6 U2E??V}$1R ~_ ~"L+mIӐ~,sPk Q)ײ#g[G~4 ݹ%ܥ(Qemm% 5h9 P\xU `{(f R2 s$PRP)n˝$CY8~7+$<%9x51z?7)%Na84f5h"]n&ح 2K5S;7] ICF #@k<=`H]c_\7(7b@!\9; ǃO sONi˲pa7\{ N F4O-2k#pyypĂãjG#$JH*bPˆHa#fJܟ?a5x#^q6\} ~+C5H+78 #b| {">?/Gݿ@M @\t  \,{͗|,bQ 1_ËE)kxt!w 21/@ ߸&AChCx7 ԃؤ‚S_'d9BSQq;9ǖwIg1,Kw˜mr_-Zt =VŽ/IdW?iP}#dhJq",,6w gd T-_ }Li t\~-}ٜ/j q63x>tPB;l4߃= 0t4e:\ Y$UpD 9J\(* gC$I8MrLkryhq" kJڢ=7ij87@[T9*456Vkҙ|z4;vJ>ُ$}r}2d'}`>9?'E 4?B IDwGwl,;B,"wǑ;## 'hх*XZrVۅ~I^)X߄ė͠Y>4OIYCp ^bAbXr颵o7qj;M*kXd,입ΕcɷBПaVwV].T¢PРtF:SGg90Z}o܇uߣjݽгz]w٨x)ǂ`s,=!ؔf7#'8' *9K JLI$o` GGz4j+Ȼr#8%j]<i$.Dt M0Gg01#jݦ[(ϧ׭*e0mEQ]cellR ۺqwklNvt:߂ϓQ9xT*B1a2&95,pޔre`aqy2a;\[hJ)??+b(#ۢϋ7stb@(:8Jz8;H7bd3[L8#q&K̑0WU&-4!89~6 GOdt & o~0/_T죝]&3&Ft{@HNMGǒrM"E ?Bw},H?A %n. 
+Ӎ:TB깸+J!(P9?Qݓ/SaȻyy<*eB0qP~~#σgxF!9йuY/wJ_Am*Dw % } VU`ظ#r% C5x܉O/' > t1z<ŀJ Ev14(#t4d{u}**r? ~C# 2o ûM2y?\m~+@7[$U RŔ'--!4z? +twzUy%2Tu)]p?6SJS;Q6#RR"5/Q)Z̲?O7" ,$@)JiG"⦾#yK0Q}R:%:? l̊˭rkV\ݭMY֪,j,2%Q ,$)\ ѼXͱ Ym!r5%ũT*V n+U\]RC҈D.P&wQ6 3 [΃T7gUΠC*#eBvqre-04ImR/CcvL`fjO^Zm3$ țȌ4LwdF$l[5nZi¢\6Bρ‚C$./5[&.m"gv:΁yd..5PQUU{l6٪%U]-$:_yu-3l3]25-P8?ˆt,1 b:~՚V`=lx$$@Q:fmjڙMbf(M2mS&TfTWVNʺH"'{=׳ 1=t6uݐ>ѐ>57;C&ʢ>BWzVd:52cɺ UY4Ȼ$Lq~!v>Ņ:Z8'6;,`Q"ˑKT.@`eAe*{SWs!8%i5ZP-@3L 4ʨItT'g8@GEB`Ms*bhzMAg# 4 ɂLZgj~ 5jP8 S9*Ƹ#ci4KlR%7cT j^1TEN${gAbvS={j3AnkCX;53>mqkFALA8ϹbNҤ3Hpo R0Y,Xhjj6b$c{TbMP0#a.LnM:<ϸ:<\RTiH9 BÄKwEC? "_ +f6?JERΦ ,aga_erYqT Lgᖁv_eVjq48Wi<4{vfi+n,m-K[U%=d='e]L^lmo ʼnѸFN7fڇz#]_fܤ}Z7"$<)^4ʝ]}0{ۤD;sf|`oÃ/| /BwP?,HwO2Ɣ_t4C{lLxW`xr'LOJrG7|)oC+g% ~TUo⮹6oh^oYM]8?9 `F~ !~o>?ta|TKRW+OPD7rP}]sUnnMZ {Һ[KU'$âϓxK|]+g~O|8{IXwͲIPbE> Ɠ)I.dQuM>~~Kfs??|[b1ġiȆBX\c~ '@M_# 3oy4* ,d%ư1'nH[^賬rRPUM!yNtt]htiI77:qTE*zW"ԯ uWi~"ԯ uP+p]*.U:NjNT#Fv9>O# _fg 4]rOEtVP77d\~S]"CvDm&>$e)+Rp/q%^_?^_ WחȰ/d+4dy|ļY7OǤ23kgobyoӑTjO\\ ՋfIDHt/v5ٸκ`ܠH7Qf:2k>~67F_ZT$ʈ"|S˺0].Xs圙jWl[5bi*8{̙p*UZfMgw NcgWm`>g3s . ϰ` gKݙxY8KDqJ~".NٝYx1ŋYtqA Ȓ,]_^oV^Ȇdf^d73W/d7+Wyc?s7+'L.hۗPsVz ֊%7@cdt988x!'%ݳJ7sx,RCqxgqk7O\R=Kf:-RdMv6u{Iw.֮2tۓS%]zM8ΥiJAPR8VT4*l eiZRm}XfT#f^++Ն׊n<r(j;Wyf XSY"V?°hqMX4,K3?555 Xg_sx-XS_8X9\߂ w|iN@%G1?]*X&,B:X6,B*X&,B:Xœ׈lvs]# z05bڰg s]# z05"9/3n2E4akWEY"?c#F̓IB?gFo=`ԧ/g/Vϙ46KKqhٿ@O#ȇIXIXIHGz#Btk#)Mծ'?n+V zfR3VKzjN+V znR3#)b֊hòyXF,|׊hòyGRv"u։E2IY U7" $쁂r|?Jٝ'o`]u=oȾS;8$S?,H? ɧIyL{cJ/~D:!=k ȳl7OD#Ue܃Uɧn%}/xv>~zxIoy7N7vﴐ[@yܟb MR{hpk-P=>~_.xI XT&h͑4lGwNѝ:㧢ՄD (z8v0 r4*}ѫv$oZ%8r&uƣMox}>M1m.7 uR&q{mw@"´B/|ȱjnro;v:L| :F'IH{vW3a3 waB&D v7eW+:OaׅK1y{b)<#)q';W;>#UI^O}:hܛFj|ߓD#|E`#99'O{,D4G-!vɈww i'ӎKs2|iR9%&Lwzc=s 'w HWU%_'|Eל'kbO: r ^r'?|y"8˭eFpDL~OcMM=7EPĦ־|[3iu-mk,onMSoO.Hb !n?_!n[vy |Q Z3aLawl8tZfAuP WhPjF6NȊ_=V s$\ B*sx}uqZJ! 
]I3g ܷ7_ߡo7-WΔ66ɶ^C fFR=xGŻ.~.ca1wN]#l1f gH2x 1DDj&iH`eIlk-Z@*ӤvoC,f-6 Q5 Bџ\dVw)WP_B#'ʆEbezeV[ޘpAAa"RE+u1|G(6܌" ,&ȭ*.o-3yWZI2FF)B.ʉZ~"W(YWؙ Qi?td$R3I2N ¡ lKfdbYEk-o41_B8duT`Tǂ'<.xN~tZ.V&ɔ._%WYI.4-n*D̑Q{9Fsg7 ?6٧i c Ysb)Vk̼Hv֊1~i%ʬHuGA>ԭn"%J|8Գ&K}Avk^^S t" :vY55˪0V9QDS:hZMkل|^ilE3l,I+ E0`2<)b[ ~ JX'!+^cERJHEZEҊ.'E~Gvh!mo&U]nCjRn]+7!$NޤOhF`7 %]f?Ye03?OIkO8)OCz?o\[Y׺wWeZ /lEvz=Z`H'JzܩM4/O/-qyFH7u5F2FPwNU3 ~y+;/+6%]}W^so-yW+~?nRY*zl^$rw/twyt=vIo#t~M/>)wˡ=96#4 loizBh>܏pW1z#T-.T;Jh۾H}GI?CoB?bsB)O&vN?wJ[ߊq` L!"5Cs!cJx-(#i(i(,[i8$h'a/R ›7T ͟:J0[uyUlu^^Q HN/rlnv[pݟ/fryJ \vƮ9iOZpum_v")i#Iȴ~VYL{$uCX8(*-e)76fYuW7\]^[!ѰL^n*:|y7rRC\؏*onGNnSK#BJɅ!I:TUHЅMK>)(>VO*Yy)xX&(@ `ri7SQ==o;yS5)d!媵*TV`-U/ ŚLeԴ}XF,q:w@ :]&/|2!_> >ѭt1|46 L{TqT́%y'bwtad-A:F/Í Oq]6_2f///A)*5 T `9NUzSzSʖM9딅nF^6EXxKOz+ ?Ɗļc,H'1E^G?{oߝF | =sмe;؀sO>Md{SUnInN&LlL$RկJ"1L_"j@F_cDCZ)%(; RRE,PZ*\(,",e~ BsHVw )%T%%#i¹Y_;nʯef}ʺ ?Ò\JGPr.N{CR>= Y`+A9/ӶhXա>2G2]Q^~e5pR}rY5e +LdzX3, :}JMM;_@fxS+ۉ3(w\?M2yZ+F )3 oO B@Dɗ -n3RDS ar>PǁGYt'HȟgҖvh'VV҉-C쑝X)dXP@v+<$)US;Zy(q̌h*Vв8d~)D Q>$($_j j*K;LXe=u" ť8+su'Ao-.'<*pPJ#寃}nu(pJ;PA*? A+`8r40`T =._r|1t+MdO mډG4RURo mڽoU_p|_siδ#J =-&%W_Ϟ R\q&@ IH9cב.(y4 NE Q:b('Ty qPbÚv( AՐ~Wa|nadKK=Qp"y fb k"Ҙ,6I'rU;AJiqtU RE:HY<[\* +346yqPVJBR:H[kBrkTl1M%}GT r[@kߍk3IܿV׈w>ۯ Q&MhUǚ֪dUy{5T[qAJ J#/KI&V }E,zwW'l=[rҔtLgV;#HNl()MBӹyZE8rjkZog(( <~[̟?L  NT1]:򼽭7؛.7Ť;m23a*zq[-KR#(a/T(AWI , ~ i0oGm&kb`y 5%,o"Yd)FE"С`x!2E6E7,.Ap<7ϝQ<?θupVtr#$+J@ԯWj^*tCpcD#m x$kEuǾÖ+v[HA!C̬x?uR=/)q8 2iln6̮@+¡㧑@6ۣdkcq64v߿kDTJi8Gj) Wy,eڽn(dg#TL:.vYRi>.MJ+3}Hi5Y|(;{8o]˟][[TŊ:նGi*]N*FwyҦՖ4mjDX.kHVպky_zǤN̋@y!֋ .,\Wj,(pٸSjWjՔR\2]x) oNӶ=<(l9a7;.)+YM%69GVg*'r7mf\{2RVٜbF65T_B`WWʾU 643 "呯HW]U}+)ՄO~)i, 5F͂FIiJB ж& % AȺRAJ|W6|W1|W W;2eF_D H&Z82p$#_8M,JLp[/đhb ΅AtDToxvUӊV[rdE#D vĻ6ۏZ**wT\HHTth-l$bRQ8j/eUP_30"RfZ5e-r$+Rg }[F]Ԡd.Sj mo o %54O֦LrɸpYQYQ,D\& f78.o$_⯅HdH1$aboujX)D((.(x)TDMd_1&~drrO7* &c؝Hm[V6ŝ ÷`x!JU9]퓓Vk uuڭc|)B]-QDuipuEuˏ;y"+{];-Fv\"!4cJAN2=q)J Vs9×%`# c(: x++ sgp]^mず'#Dٱ-LmNKIbȚV ʐdjR+Rj{6d'vu[AE+'u8*[/l0;Twai㘷ahj"XNf)q죭xx[`=. 
@w%9kv=2$ nD?I`'bPJʭ! {Ug0(m4Ȳv}R]o)u$$xovmB߿͵Ύ/l@sK%a% 3~ա}ְ6Z>χ F(vvLQ\`@N}W6fϳ#Ad2|!epf>Rx5aO `Tv;]reb 8tchy{9 sAg0L|Υ,DZ^A#4UyT!,E5J-M$xQ{['OW;H&+ƊLvL#.49< Xl%r9k,9jyȜq=KvNX,|YϞ!O-xfƿgy< *|zOw9NͧgO>E=8ǏkkWOpon>/ϋL5 X_@ Ǩ|2a v8A]y=JBwHZP rP>#f*sPڭ?}Yr78:<<ן%@t B='w_%sdOˮVc6^JR szY&@cTAVnsCk Ĥ+ F]$[E5Ǧdb8 n{c;wVe[Tz KG$g,jDf ?UŜ (iot‚'T^"~دé>3Aq|X@2+|eA9͖`Hc@ ~TFM ZJ< +w[^|J fG|ao! @A2fQ-HqsZ+|ʼn9*ÇxZ6 Kg%`OQ" ,`a҂oO?y 'ඝ5bNaAPQgaUL-RdJ6h=G+*_}ǀ5eHkm"-A&|ur1^ޯ玡*\ok {7_$ x#N|HlK=nn9FoK=dng1t$=<Ћ|XC-_?̳oण+3=xp_^5 t?}˓sq?~X|=ʛ.@2%bHxEAom-j6[>읶Jv n~xfO7|da=nvv#OLArи BC=B^wG$׽x=1 Q8C.=&7|Po_q"[ W[xqK#^\F<ȓ[|T+ \¸7t'[)$+VoKQja̮tnVI$ CNaګ:lM[TE` m6 o==&i Tؐ伡y#d yw0η!j@QP:hklۿˎ&]9!ҕ;k3T&A34'zK )kQ)obBddhBoqmM1Zhqc$DunI]RCOmtPCh&YDljj%VSo+ʭ"&_#5FG E$} m϶`-blIgxxWCWGLA@1.x0₨Ga>; gH 5_#H]j ްiQa[h^v$e]]::MEK3Q) E;&J(+O?(h }nT]9 FhJeZV\)QqLR/E{v-I^<ڗT2AIoF,V=~q">a)C5+!n|=i,fs&z# GKNW\I\ 9``;t4H#4Oti xI9X1I<4NWWdaҦ 5.):#JYꗕD9A|349^KAi1+3yBb,n$v `;2P}w3E )F O fޗp}}ct:H)!O-23)A(\ȼqQr9i`*ĆEV/  'mf#<^95F>2; -O7.P.,q1aP^% v$nAMEEa wѮGN :P2OP462Nd4w4)l\ǩmѴG#.̬SrDE"(U\eQit*Hmi&FBYݣ+YLݩ iD 1a05210KQ?ta[8?َ-)ߒMA eF.E9NdJ\4S,,nZ XKqaG8+Zo`/W77V\XdW}M/H>T2`Dgݪ;F\r(V?f->>:7sN>f?#J`;3 gp$f!CM//+?M,ɜ TpӅT''WtX yJy5k?Bg5Z51l-Ms(sݽ2!zœm*"*p dšP1 l2OCdU5T" lT< =fHFrPPqQCpKxE`O>S5 (5z'¬U"h}K`dX7CpQ| hEu,|2!C.VlQ8t?A8Jvק5wFp.w;8B )F yVigi彯h;—UɅvx&pآRx. 0%E;_v4N8객o7qD\a{[9 N0hOܰ"9 ;R[)"߉ PHGp7R|w)HQCݸ"b7jTt٩Bޏ`fW,mَ,Y%tt<ʤŐ0*)ߠӧTNr^"2LFMj|a}{۪#w%asRZ!fbxb-7^w`nMI1/*80fH$ B.l`\;w~ ?pTCrEAWLA*yX}Q(bBPJWDS3gkzB?O)ZPeSO89t"6bѳc3ivACӴUI6RdJ^f&DDɊ|MA9/Vor@Y%~fG 5X,>iD=Ǔ(Z N)b2D/X%X n[zyM>C5k *)Dgo(?7eDn#D@a7+]Z'e]lkX\e.[r# wxBQ@OԨʛ/KB4L7wdA=F±k|pFG!.`w3D߳NLN?L>K5X@<_}rT 'q~>pMl+'ONuAN 봸;X>%#N'sX!ǘF z'/k@C`҂'-KJcDc!&]҇!>ʹ#hM>*&6k͟1i6Z۲1# *H3zf9 02#$*{[(]vZ1TDsv{1xA*Qͭ䈰0гNhxĥwi2<řLU6R\~0y]OtlS Tul!Ƶiq6g Ӓ?TTG[&[Ħu$ޥ `v'%/pgۖu'J)Xy+0b`Q0Bft< p۱UBzhhMĹ'źi^3+5c\F1611rbO {IIY;|PyyRRCb!$ÇFiD4h8yۣE?6 ۆHѶ*6m&C'4 \ #"[0/3h>\! 
腕^Q΃j ߱j )?Pt%Y= cR cUmRBiP2i]VSq.QL>tȭ jӪ@HڞwW'9gS08uQcM\`I}{R-9Ղ'kZmZi]>ĈG.@ȥ[R(9lK)tSؤB vz)zV.l{M7)AF2k(nuyŁ@wV]DYYI(PjEF,x'}F!T W6Ⱥ/!k%}`a4øN[X(d.WE!syjMq{d2}̺{KQRydg5TTzDady~mױgW qQՏ.G)Ѱ(GղA2.xRVJXUh`7xhPN࿑BϮEr@/Z h6 D:S.K;-!oеylq':p`[&j\ {~J!ePb)lp9xjUnͿ[;0эvJcD78zqtw=qMP!1Rz,5&板h^kVѝx05 Pwm㻦fwmy5_U]BwP;<?\\?]g>:p?S m xH&S˻hU8}s@i0yXZxZ©MjI<%|_6E9&[XJ:l#X<DdsO:̙\+~l~M.r$xHGT)DqrS'}k27̮`43zYEynR^|?hE1#[Y dtJLoj.[\;=@rPd8%_&ЧDH4^Z;$U&ޕSmV28htG$o('@3dt`]"Yٛ7$2@99 Nsϛ.x9ry.pi\ϜOro`yyp|=mGCtKs v %,?X>g0/Tf75|KuE4zqe• /?L]~LpZ~ꢕtβ{fYܲL>Y ;_UJI7` WɤԏOU^U-՟)H,o{~5ZNOkҪxzI5.T/Ɨ+Ow$|׼(?3 hgq&YKqMxì-l'azˁxbE#gpl(핞w/4S/b3_',rSn`0I|%]&9Fi]Zc?.mYPw뙝uX ucO38bE&HzeQǽLQ/x[CẅCYkq+f`u1^V56 ?.?y^guБs2I8 ?4CU=Mq7Bq]8m˴vn˸ceg%]Li/9d>wKs3\7<0H"c))V ݌EG}y?}4dWGQN[#<\R]l/}GN<# +些oNUpռt}ԼκiY^~y3Dw_nt 3DYjhₙy_oKMbgeu3:;m0Q Y1r f3KMrn~>y;qu瘮T)6]KϹZ%i&QDHOeMj'l01#2Qf)b/}LDtK1}S3J$-^7ٸv2tʟd]R*~":}'+-/+xaV>rޗǹ߳cMMuYq8꾐nBXqt6W祛^ZKw㥻t+LE}ZWd~kDˍ!7Yζ/U[𪭮ӫVq5~k\oOA ~>UT?Sm[Wapg.Z\}K.{bK;]'Y/Nkͻs^1ۥ;λ{=u|g޲M\._N8J1^lq<|"b7AзS 5_½^UH.4?O78׏'ǧh;.+\]ptno`IMqxIoNxN"Y;Vw#"ox^쥎hU"gB.9xy]j7&S2=V$6 Yjwyޢ6$ޙy[WF׭gzY5ΝO)k Fd]4t*Z`A]5htb !xeYʚp#D_yW]Y32^㐼!=`lI_5q\U@$DH<d(:3)~'Ƶpo`BMexaruAǮgOwsSR:PØYfpTVXN ms%9!T,an6ijuq\Wrj *xrjy ;6u oЯQsA ~AWYW}+y( ω"Kw,1_6fu͑>5ϒBcq0@Xo{7i`QX' , ;\1`Hyl]/'J7[wJBi@Lʄ#D @,@;cyb| zZiU@R…[Yn\(088X^@,Xh'qPhZCHiCEa )YlZew;h@%qd .}PH!.2S@KOXXfX&+{B;h5θK$+eoǹؠ!{,0a|*WK ;_Ya_0_nWzlqАʏ}dωBca\FY Qb8,ªiq*N:̚" v.+Qb: U GWqPlSrptF@%v˅ 2^.]\-t"YC&"·m |i>>nm+Z({ﲄvU|n50,hngp" )"0k7;hb7FLAG~c0yڥK ?0- Q]A)hCb)H&@ߟ5@sD3&ˁeM}l;P|;qRlo\:cYp nNehg0 RԐdE2Ð,9.Ḻ5ȯZϧ|re@m"ɭ-&>=MV .O>Qz|0*ZjY"%rvV@|ϯ8Hb4Gip1}n0rܒ[p_pk7Z@u7/NN}~ntT=]0=E^C r M)r3@mXtVhq-4x vQCOJM7_@|/, !Fa*Qĝd ߦȖ7?D* Xt-M:I z z nuPKSKnap (?^C_M89ڨ֢sNf%%|,*" V^B!&Ash$I!ixF$+ K6*v@"HRNÙlJ٠DV"J"!lDP I 01يw._u*q%s%~U!]TqFjfԈ>2"Lj3m6ImKz?kUFBP7zJTV.|3*"l!|C11, (qU}*'gD pTLdbz8~)BLh6ۖB\L&{.2Je(!$!FCuJP4KpmY!DdU+R[kX+A;͋MNT0dƔp~hI&d…͋9fF:Դs7m2+Z惢uq̱*q+ժ-7sNZjl 78llxje+&I݉LqYVôM_Hc??i@Oi\I00IYP{%4!q ]*HQ %b(0\q5N&k8Vs0?UN Pp 
=<=JE7*EUaeԀbA4cx"R 7$ *t=Gkh mƧQwa!sScBCS[ˇ[C݄7VMxke=[ IeVm <B=Ep'ZJ$GO 5F|&T~U]OwCm6mTgSʋjkڼuqQԂ^/8Xi\ }C-= L18 Qbu;=u ʢgTfd G6~B+Ih*6&)k-1p2g1WGa5mn@2OrDA:fyml&# X/ A:cca6| +z'8af_B Av7QA/(JD1vʧ޾G O'&⎸SZan+p e ˧7 MAsAF gmtHXfs_5̨\-HNΪCV^yHyr?}o|ń!,<ݰ/S8^M`[V1ϡ1OfS3&*؊4qƮoAs-~><2.q3yYm7uJvT*j: 4eM<ذ[eL7:~ aXU5 Zv>J 7k74`;g':j})_lt1J?V>e~>.V)_A_*+ZVZ%v<Rt;8A^ƔX;y83_]oɂO] 3g׶ gTב vG>N2z6Y`ANp 0@܂ {3鑯7WKA?yɱNAVM~S NQ;IqR,c{fWcQsǰhjGd8Y QZA] ۋ!v=wjF =tV%V3I8RGA!."J^Pĭz-o^Ҋ /ޡ :]rxđx4\ ߇C~ٸR`ksaב܏wҹeH ;Ic *^e`aXЈA0_hP *"ekŧx'XN,-)Ȏx#s p#c8b(~'ONzY&JY>vj#?3<2~YHX^7ZiR4\xJ9sNNOKVpB‹ ,'R.#bɧS: E–xoW1 1M^dC[Յ7s#N )YQ޿㾗HrԻJ2o${F#V-bA3c>f'{f35Q}JPlGyt+r\kS8a%TټX 'T|\1/]cnHW_5 |pͷAsn -]EKU? TWu,Wq{hT-ǀmҭD׭*C¨E>Qh@{OGyO(|e 6@ءzX6̳Q8dB+0${%61T*JݟN\0w^@oVShX^Բk0hLa"L:Nc͝M^4;_>O\08ޣyBZɍ&.di>{7FzW`7d|q{k[6.e"_s\+Ǯ_CC5 (:z/oJ>1@"/̮?1DRn+w( \ED] 娈{US7BQyD< }zȶ1In jz;}pknFT?L8WRe0PJG, csD[6/21"gM*> AcՈ6.HS p"a o&h>M6w/]= OsSUd6Nǹ- C"L=<{Cxzcyc:&2n6M$XLwq @iTp/ ) Ay [3ȚĚX]tsԧ{ +⟸OaIƢ 3|( ˁ PNG,r2JlȺ0@#PPQC`KDI$JӮV#N S-8v..3f=zbkx 5t@7oj~:"P<)$2[ou -֭7bwV=@_R.mLңp߆wZl,:C M'RХWBߤ^bI-1Qz5n{_[t3\;y\oNh6n9טUkȇF_/~"1pSScPNG4 W9I@H@5_薵~}A= 8޵L^8~ : cI} }藇t=X.Uߡ:4[vQeAV? 
˦k 3qMz߿vF{6^];">+~Q.; R`Q2#x3^G/(u8^ 'hх6<SNA ;'k[v_G ܎Z1>؃8RxM} Ux%de-Kq.8N^9;]qeFkH^L#A]_Bc0uv]v:=z0eFJ5o`V}̾2HZCNxzȊ"`jdEdMa;+ [SXX>_aQY|-^olSh/@Gap*˧D)6~Iu VϻB%| 1qY{1:ٍ&%znz?{n2ӏ t34zcIGWuLH!M VgCͲS[-2nkE׽+pA`^0I(]0 A>Ʉ}#VES]LoԙH$I : #c9ؐ>+3tY"};MQzH5JNH/n- tѣ`u#Ka)۔sUh}97EwJC)P3d Idknf)Q$P!|AzElu1&M1&)g#*ˠE6p"0t|{4aO_tW~ (ܼ5&:Άy#ud^fu GsZ5n>t*jLwoGp%J)*t4QP!њTGGFne4r~?fH>0?ad"]-Lr+AmnC$ϔ̗5o(O$I`hyX9J\CR)]`9T0q& #8!q  cXDpq@5k ] )+%ŬY؋ѶFEkpp 1ꉏ;'jdq)/q#:"i!"!"N6 `(`C:ƭq9 >vJ!i+дX.\M[[` ݁nD w6GEƈFQ7M8:ǻ8 ULa,o)k^ЁQo2l"^/z|*CpMGR~b?4V`eAP(Է c^dc-%I0 lL)_1o;jY"~RXoiqrlR- QݲpS\b)Qr.LkyʿSz90+(6Vlnuc >`@l8Icif{[ d 9VZ6t7lt72@*7N*?Ő:մ٭6"/rwT=v|4PPS|)˛*gXdgWŭ;qەF 7+7@ncW!i{Hv b:m2`1b@?h7RgCU6Jl)PN5[|b v)lhڎm')Q -p`P.)) S*кr?L_) EPs3M:pAchǀߥ~t6nRD)fRt+%L>t'&=np豷}iaxP0\p\p3%7w4˖&YіH{넷{op@p[K< k]e$VKD$B"mMM`eeQ|lp?2փTL 6  A6#BVφ| EBY5W-ºښ%ՕV3aVsVH>Q8UT5*-t/keװc: Q"5Tz Ku(?Tz@JuLcU3 I`W!MGcV/'nԉ%K!FEaEEsDSеͩbB3BSZ tnƤ|u#,hMh4!f9!CAwWEjSwe0e1\m:Avĭnk&Pj˥$'-jbc4fCnд4MkF Uq(2,HZ_})4m9H,m]!vUوd oؼ)­]9QL>>ڀaR~hŎb3DZ1+tTOڙ}.ZU'VV *O7j-GE+eS|j: u0ԢbQ .?,uHH]@Lq0g t3k6zܮB?ECE_TMBi<:xRxMp6`' E@a^g^̓XW|gF<29$]k# 8b~‚įcuxjǕpJ|(:k+Qw8i{Ǩ3[p3bیQ&f?nn&J|sR]l<.L$v2EB*xXϔ)%gTfFqؓg>rq\jvHtkz{\Nй/1l E<,SW!%v[EWe.^3kuـDSkJQL$ڹ"yQeR`92O*|I͔*Pq^Tw@ULeƙ.:׹A}*%2b vX̥**ㄍ?w@l>4ߦuw^ӝ{K# x)fjT W@y,K'Y%eګ,4dK0O2tQg5|E9=ׇƷ>/4-x^Gn "2es7 z ߠ}T[7ȡݵ+o5f7EgП3( n Ame*~??ǟ+7e W-ۏ\L?L!"i~@[$ Ya||y?&2Q5efo R+2y(UR{ʴHUXP&?ϱڄOo sC?3XBskP6[VͿet/^g8GQ$|N { (Bd^W܍6nUF*͕jRlǑ(g~{_7[Bz M^Ik W\Σ;Ǝ"dr1hxqB)Lf]]cbYJ1u71:|l)yħx,ߔ6m7}"; H ą4iw{,+(" Ob#[azKa&])]qyADP=Em8j{[ڞ){9t7Di*YBssJl_@T>њL ā + ^~W`VYw0u%3]}Rg(Ê',~t0YJ:?OQۅJ% Nu=Dt(C8*҄jQ* Qcc8pY>7?6Kb!wf[p=C@J*Og667l=ryPs,iMwf #*Z`VDz# τ+ܒ+-Vx "@1 QՕEUWPU; O[SRfkTiԄIMfg3;1HNc`gOqE {ܽw׾58qC}vs*mφ?7^a7'GvÇ}S(yYj+œh L\gXvQ| )]{alTxyi~Dפn58wrÏg'2*U/9 z:p7G*Wg+F1VP84V-ϫB^ XEI]a$ oo oqy,}A@m0qn?;Q^;|cbN25?'ʝd2]Ɣet~)X } G{:C+:rP-)e|Ϻ*5 q4iE=5 u Ο*|'tցRI+ Ee=?NnVjGgDeuM4m1 xkݬ麗W$)7c-qz,@9PP+ܶ Y_ܠr tNK^䣘 . 
ȴK^ #oE4Z@ibh0W ] F]ܖ!e=a.Cc@\ > V)ڂD+푰ȅsU_ ^@+`[XfdTᆲާ&%CM u["iɄB,<\I9AxBޏd.)}oPuJ^pmfIQ%tBI|fP@ FeKW%ЎHB3^{1ʳPeN tݘ@"H"NÙlJYEVbd{^lP‘ jlPHH@ |w.7xm0q4aw<ד\Ih#cUDE(^Q)VMLтp>:Q= vZylq͖Ň %b4HY(؉Rk`uV'F:NO@4盄ܩ,?9lg;fM: NaIoV9I'<}rN}3[j铚GĘ'>_Or|r&$XOqX$Gw$XDCKw9#&KXEN*wFuCrhE.Ia)T$,ZQ$)6GYKXxe4Sq˕r=AkvT:$3^,z}t=TJW'N_&\@ ;m$MVTI!l@2-ȝ"CgB)яWo'c?o?j CgQ;}}^2z*5GawD0zCx IjWFQqQb}/), ,0D$8I!WLBZi lco8L՛~} Q"b%^P" 2!AOý} 7#qô-^Q2 }Xm;hzS)_f\_'_}}aBR$/^IF&N <"Q2RpR{B6k(ߔ E/ "\%ibe1G4U=| gjE'9*7jJZَ5U Ɲ׀0;)1c2ucz%Bs('Cc8vS&*adD1K, FJc9KQ,E$1|q7o1Ú'e $6m4>fBۯfxOp<~[ګn~ovw+(犍EMa@(k+>aSVr3XJGy紐xEϧXBql>ƀߔߧߥ2r̂R/WkZmVZ% 9đ}x?׷O7oǛl)D Ŕyz"{jOϦ7Ǚ0ezqVd׳{X?B ʮEXw/\B&Yfw;ϋ(>~%DW)f846z^H+W?6[5-/-;JT /)d=YkoЦFj'sl ߹9t(k >Ӛ=Ch`dR52]fh9ϔ>cRu{yO IBY $8EEv{3FڇbSϋۛ+.*(qfh9S p_+6䯂D9<9=\T؛7WfKprpaªJQ#W£ XjqaL{9&qn'>g!@}E澒L`3RfJIճb(kQ}ghW%WZ/ZŦڼlѻڅozAA7jjW-٨hã@d$beے~(_o_1<-ׅ%lؗQ܃xe~ Z?o&IjEH*b~?F$%:'qђfx%/2e;_[[ŭ+w oo)5}8IU($ KTdͻxq$;stCQ,7> | u&Z\ʞ%lfO,R*)ߣ^L2s0}S c`W|q$gs.97{wwhgf8ι[LQIGfA`F2 H+(Vg*kJ |6κ>O\2ZQZRS*o?8?;f[Ѣ"ͧT,)yW d)mf8Zؘ[GB%6HW,l8/UbȎ;6FEڏ̦)h[d4_M "OP/#YH͋=L :].*hˑ%a !xd=+z4ӅVUx\m=H#FNr}D\VgH)R'0DT΢Fr=XK?,] 7oJ1=0-ΓR K) E*jDX)^A_tqU%aHHmlH<-^+]+}靨R&R+"P&RVDjyy# Z,/$Q)_j{ \Ki}^uگYBUp\삋rw&| BUP-6* Z pR .=R[gHWDp9Ah=o K^V8 ]&NNx*E~ofE y}6/.ѿϽ.AsA D:98//NQJn"Nu/~d;5 fM{IiDΐD>A޳Ώѷ _yI+j:6m'Z4B# \_vfҹ38s8s:qffJ44CPYjr:PmiZа~3Ԫ)^jĮ RlfJfk4CP8[j=P}nZ54C֕Eˇ%Eˇj`Zl f`Zl fi`ZifUVoZif*VoZif5no&'\K^!+/ąZ=FhڰXm\xI4s)_f~v<♹ 1UM)e k,SU0(`Q,c ? k/L3K*B `Y,BKg{Ի35+r `s[E}7 M^U=lT:;d:~ڹ"ଟM|>[*|ݥ 傓'S,J形CNBzL{cϗAP& o_?&ӻ(?XO oXab}<㬴EK= ?c7&[ s׿'{{T/*fd<=]7(Y_ûVs̥{۴?KcU*r+O\]HѨTU e]iֺ6i]5HKjx52$H0ˍi!*7\>+;+8,m Vm Rq_ us_I/,@X} RD2 >"LBi%p4.Xq$9D7=>Cx2%%cUX\ޠ<꒓qBxz[K'v`錧L>j>/M^aJX3l^p W> /`[? 
~Ԇ.^Y 86a|M_/Kú]O|z&Яa֏5KX uҰn+>ʥr^ 4^ R̉!*>؅va>]l/ ymB D2ꃝ\ *_sYI,kB@c}*iRX_Me:7N, Ut,*3wrZD~:T) [Kz98u23]-l5.׋xȠDAEk9A˔}:.VʈUª*,VFVHa2b s:OXq7'y&` ^TX&s:JX/ކ*yO ltA7X& ?PJa./|!2,i ,i+,iaIKRA/aÒ"jaI1@DÒݏEzZaUOSZ bEZaUOS*%OV,^b%ư' +1=C\X1)eBÒ',gG5KN, KtX*4/l&,i ]e&ʀE6aIsˆ]’%V:,in2`6-]9,) iz(i +z iz(V0,i~2b0U=TXX1Be* aU+#4?!1>nN 4OL$,i^`Ò)fҳ~tA7X&0޿`$,iUUKZ7?D%=ǟ%:vaIX:JR-xGk7ZٝP0)@CjoTL]t.^ͷϵZ".,&=KYjyaxI׃9YyToUoaNbFhuP0 lrԦ}cQ¢p$Uŭ3f;E&xBaBv9] ]> f3Ûty vv,QΎgcMbQ ӊ5v6,%z3Uki]Y֫xڙ&bү0: UOxdRC%heZ*F(cず$.w1irqs!&3ϒJTk@ K4:]Ba]/.VL8RxR Ma缹Mf8`3G=el N>+{ B`[{Ģuqb!vZXVc]+4OO!-@/ ϱ]Ǭsv1_7Xz1]+Lp΁Ic:Uq tR6LL'bzA:]T/S/IYRqWςi'^L*.$$73)Ky^(^_Ң%zֽbZLJ֮s82|NL_s>3Y.kvJ%|q`[{pd),Łm99@\p |*p qxfKqW8Y q* ļq2~*ĐSEB82<'πe+kv\H-nü$X(O^gEc;f{EpzSJS;":8&v]Pv׬tzc`s[">{aþv~R^F3O᧝?iqriwK%{=_`U, qBWtH)|@^/io 8A{s6⾯kg "޼7Ї=4=H|PLјwxt?Hm0y)8Ύsp /MrǟwxozߞgnGɔ![-L-{|~eJJQ:xrrN{xP׿1ӗy(1 j; i?z,F[Ra e7? YB & {`~oG*IַkN &-\b@!ӭrEޠYuV:7k5Q_jE?l^2yOd|PM '"(Go[r5; :Y_JHo__!-?|IJQ2Gc{ >JHj2x~ϣqo: s?ֿ#VLGn09#Ӈ`L=Vy"G|5Gw4ws4GFYRgI社SmWI?Ʒ9.&UzN唤˧ӗ ePozc5ōs'>ia=TzHEQx3w=?7uY7m7+~{u Xp38_×'Q]iu./:5BGK?&?ߟT(QEiͳ0 ݑwB{FSqt66dWtQ j5~$C"zozO|l />vP/qkO߹q_L֟oi zBUG,(s BHbH^\+d,Dlww~?S߰G50CՋ!P݊P`?F.J=ӑӫQުܡOytkyϏ _ AMm־$Iv0f=.1|~y ~[jo2I=(9|~[8w큷b!2`S3|O2ncͷE\lc*9ƀ>X% }39|(A8jMgK;Wn 37 W4c%D ʘ(S 70`2J =t/,!n"ҡ?~I<0p}`<[^ٌ%U)xǂ͂PAp4O$_j jB w!3"c TbzbpQ߮͢kq دj{.ץmo粴oZjc#lf,Enᛨ ,iI6W vwemoETDzpY< 4%fT< J~Zb,״_jK`2# _W3X-DKn:}EYl2dV"X$>߯MZwbq.iR̛MJR6yw[F##㝌@.25A5mLD$8RdlB?b辠IWZ % 69I]n`@OJ#̯Kz0G,2dV4+dli @xz/q- G|߅']ެ{) إR;j|ǫsaȕKY}<9R30UmV>fx v iU.# ֡:Nas$1@rdPZ7T`fH*oxH&*@cP|H4 8e$ϟtz0HbqĄK&D/Uad+J./'ڕ?[ʟ-6Ο#w类qw`OCA`?Y͕6&qfSZTYa aG 1FFBMTi7Y!-`ܘ[7~/_e~2X>O'pJYi_ &}is>|PnlJm`APIzh|I}vx$eKQ1l7HBo T_R6Z_ ɾ%x^U^W4G>՛-x]XMM,Wa:t S Y:Hҥp$i {eDO,ILe5$$]I:IA8NNBu0w1BӉ~I3lBrEӈ^&|$ҙNRZDnu ⊈ZO(. 
]X98dUZIN:d6[8ڟDɂzYݤCDEJY{1)xC4fQCjkQ`mU՘=pB yRW942VIOTj52$@+<=v\e 3؁!OA(ƚP=MAeB42嘚uȂAQ⨕F TS)O !ZC@Aѥ~ F(Lj) [ ĖKD JjdfLoJgd^`!:T ͠![ [ %D\p^X`:Lf\A,1K-1dUò$5]V1mIggUӕ]s[O0[I6[ZDrMƻ1k"*}i)}[s).O II{LFw1.qKݘru%& LWRe!qzaW\7eQ&5KuagWm )ޡAIY;>n1vukpBad~F#`sݘIЯ0ʷGO~Hq#F>EF\"; 4!T78,MO>U A0.?:tdU|dj\H)"ŕ!x=VrMVY(6lx$\TFĻ/NqU1~k&vVM50'ʌi7X7Y0\fC[so*o<}W`DW hl*Ԓ/T㴔 EIBnPb x6XyYR9QK9B|҉%DZXe+l/htBW8dF2wo a, 13ű V@؉[5t1PQD+xBOdbo]_^!cgss8|pe!}60 _E0U'i?Q0N|gdǬϠ (!y J7O {II,Dֶ;XAYL*&~- ٷLpTea*B<֬R[R:uSa3\.WB3nͪ77xV^ xk5vٰWJajVU YfQjNSi*tlf)\ Z&bRV<Ɯe's[.sɕ9In׻E;{8<erO B>wKw<wO*Ǥ'n۔e~O |LNjmߵ.eFWzO3d؃CR==J߫,i^/-^qqe^JAgnn?GWLFZ/Zs\aګ}T ;/7n5xKzZ;Y^ȒZYjf @yfuˆGiJ1\^ sѲWri*=j7 {RJi*/$a*z +,(\XiV>l"RE7[k]^[!ΰL^nF5|y7bC┍+k֮lXn^P'$` vjW/K`kZN$3 Bϖ,}bߒЫځ~4GK^aDK-Mk{qϟ%g}tIEBŬ\5K(7a1n>Wy >0ܪPcl[P)ݹ|/~*a|xϒL>W|n8V&ZϞel=7Ђ-п8<+_ G60Mߋq)_197&` ̾;{6ՌkF,I@ʅG[ݏWպF,8fU`_)%fQZ,BeL1[ݚ.2D݋jWu΍\EkF,I_U]J=?}ZżJTʾx˘u ñ0aAz -#ٛ`%L1\hT11qqgŃ̜챥a@d9P@O\N>j%lF,c(ک4Q., 3=~;-+Y 2ku; IVI89™)0+]LPz yj¾K,UK#0-kwKl ޲t_;U L;,Y,EB,fZ q]bX&Nڳc4 dI?Z= y_ qG<'7r-m{O [ʇL Oɗtn(ĺmq>;wAKEaWr/x!FvrwGzۖ";K?I(bK_ A ]^grBMe"X!rB!TH-z JZc[ 1A 0:l؀# w5;K< 0 h2U7|)Ȱ×jۊїGm 3q~žp6 tM+%sM~֚zP9F/ ӎ.PHt2%cniqJ^:P۵m7#>Ovz }s5ïz4Tup'|c~n 1tcݫ]#Ji=*zkЪ[3:;l7ckZ 7}Y,Z.%{JPږŀZq|`ߵGJ0M9 dȍR!`1 G]n]f(G#|ܱ=,?~ 2}zA|ذǬࠌFW ˚νzDY`ɕF٘\j.rew"i1 f]=>_#qדOlĴ*i9eۻZ:^G&D^DCNa,yl1F=~*I&Ng* \vpkd/Hv i 1,~-@%dؑjJ)nt}lZyu:cHwvzU;Wꝗ6&rE==u:9?&<]r8ALy^h81ZU>/|1T?R†=QBaAkC+ŤmƖY=/v4c*M++^k-U1^ì%Up2@]JUk{MNn 6&=궨vt갌7WpjFhN͌_U7޸]`""}.e5*L y~)gPGkB-fGz+S;<^lC:9r0:~x~/NeH1`>O!;x/-[RmVyr؁U֢ŲYqdy̓Ue8q]{\{ UԏKk% J"0 9Y~vЗЋh-@xV@G]|Q 4]6 }(//,m@0`{qP iFс_7>E,;ۡ1Jr4Vԋd(^=w(4ҮuHݮ\w0%I*0N |8Fq=NMrxޏǵvUaU[lkGǓjVQ?6j9j4_}碕%g,qJVFb%*4tL&(2&)2h0g:yů9B0=zI`{6/ ԯEqU7!:OF~d$z6 "!$[ ss=DA8PO.`a$5=),m XIK `AHI$WGv %Ǖz4kNF9EUʝ w=Fg|^VzŇo"pĒfXI8v/FR BoeݿЭ ](l6mӋH E ÃtϞ E (V8*))H Q' _¿Jr6PT ר~WvY߃nF* J>b=PVxJ{-ּAknr>/Y*>-p<$/͠ Fdk@!l9e(N0H9G$; KڭV3k! 
_]Wvf:l0:$`ŽQM쾿 a)49SNJb9ѽ4XψY99Ç6"::kĒaqήT%}*U!MgwfP'_YR՛,9P5)۔`*ɛbB,0onU11E)qcBTK$X2YU,R[fh*+{X {k:U+zPNuY * *z:粂T%Qe|A̲XEOG 晅І3F{1c7avr=%2rxiw]#zy,fF^L",2YݖmE@U^֝Z]R^XE\C[=x{1@{GkE'0W܍_ @QP! moL{4H>VsKI}?z|}4gz9m*@)9 b4m'XZ.*b C]rթ)5@Ʊ;\?9nE1ۮq1]JW/Gf,dhBP7ke B-Rh\5[EK{c{89v:=XN/G6֮tHrKt݀Lz%ν vmQINtt*vAnpѪJ=n6i2Q/Pwn=nk]}/NHýj &."+@3>5FMDU򐠗UU6M$>`<Ǯ?_%6Iُ_?.Ų8W~G'"g9%G/:%zGht m, yށ%>K1brCVD)n?oˎ{$'> &Mr̸e0M о=ݠ. n'^Cߠxvnm<` @h8Z=4A 8g@a^ڛLH~VT#h4|x!X֪ȃlMsw(v >|ƃÓ4.VC8`~p By?yw%@P {:rzև؟A):Pg:A˃L;4u*I*;m[_:37m%PyvەOn;{=fI|9ewAY*)\|[iy W(Dޑ2+v>E.iu;ְpf Z&U350q9ˊeod3KXE/hVq|U\P*mK8(ET۬{jb6x׻ g, $ƆVJ֢ND+@Dףu+Ӯ3>QKE=Ul뀝h|4uߥC7Uc0mW`WK ۏnPARIF}3hZ3ttpSk#X,0+U(ؼ~,4EeBۛT2N)A_d1`Ŏ"A v>d?6B0V,KݖծC]MPRSh@@7)p(|=?̈'YϟsBx%#~ 5IZSQpذe[-s'K Jv sZX?6&ΤS*Su(?_t|zqnZPS:͵>!;yuG!+u@SΆȒ yCa yݐgf׫773ofoyb PzfQnpL n[;7&pHQBUmt*U_)<h ;5  nrgPQWre>{D-ul2?"ZN\R9ŞmKV`Q"QOxolCmoJUyBPa{Гay0G}"-Wi(}Og5el0q,0?Eb0om4giAqzn?ިǺ?>0wK/&^3ƍiL/:C3mYlX ۗL yDϨ HK]sI7'?Ac ]Iz2W~jÖJj)̇B$)@M=ѸqDy J1&` G1$ߨ'0%)9?'Л`sv27Vk@7Dx~VxM: .M{\ ' ܷZUi!T4G􂮉kuhU('+2U!vT">l ` @y7Kgmg!,3ݥf9a (7%|F_p|k;Jr#`I6H ~AR mB>h]5/5Ei|^ NSWG/:e=*pu>clTo>Z4فX4LU!wgcjk-^@&D̯קcݧit>t,%c>t"BP`〉UyE?ß ]'kBBga9B@,;u+{W7V .TuϪw9QDV/]/%!e sX, ;X񦏟]dpԻ}#ϰY}R zF@PSY9er* BBH_lm4+u~@ 0F,q0tLkܧw\$A, XTegh<VçT26L:x;چo$[Xrn@aVZ~n Dh*0g[aP-mP~bv}Mp';hf4G.t|Ȫ]VWfS-TF ^F"^2 RL=e0h/K*Y~~ڮk{G?DSwh*Zd+,ub^U Bۇ] ;As[EJ.q-#iĒ(>@I9|2dR2)CPs&ږR׃ &d8R[L(raq6T<̊*2L I5^*uJAF&O`nJ0zJ 2A0~8WIىM:vz!SϞg_Wm?{F'bO_M:̔Ġ(&\/ueLGtZvPm+$9:;L|(qCEH^MiO+s'b =e䋧tKO/2rޛ !Q)'"@y0QO;' wF) w GFY_ (ha9w/Wn/.xiOfzz/񏃏0B?^"%8UZ 2]F0:\یt@l$yn'q3`:)P53QpcQ%~ݣdy,w\C?d&& D[:b^PoAuR^p,+\$¶cue#+CHq x91jĵ6o ;cI24bK],ZN0 kƆՎ6Iv絢?U򺎵#5ڃh^t2J ŶNrSTзx?Wˣ55ҬTO1UBUiYrA%,˃Û`UnUbp" kJK@tWԍkn[-Z(Z8 _ W;0iݚW:@_447(6q,ao0ڡQ #`) xh4.['R G:pM K^>yuP奝PP=t>T/*Mh}#ldtK9'^Mo~L =$e򥾸l@XzGL˲%PzTQڑOG1CӂfQ44&-8iAGH/RA1wWc `xnm d G@rtU,4XC vkEqcW:Tz;fa, 6pCfxNf1I,T9tDG. b( Kl De5U$GU71MFYPۊxzXSg7^ҙ IT4ZW^ s Gv0M" cvHq fDktTmC`ђdLB}UOUE>*eS5l;"5|iBt0ͱ.  
f*"0ս"04'ޔc.RĥEHjPyP,CHEq@ӪCѯ45h%_`™/ M8ΧCWѯ k7P]UGW+`#mmՑ4T,ԁp0\(ܑmHm4Tfec1f`W8a@);n*UANBr9`}^KH'Č箝aE+{CgDߺEOIuo U!נ7cvJ ,-eʣ <7)+e9& >C5 iR)G[# zx (dh+Ԅ|TB"hA_,lyy""N@i| a|n}/ DpG۰e`)J)ts}zC4^x,v Н`!To ԃdA( l$4>IĿXIwkK ^ qw++jt4&˒)Y)/yn|o+&KxIci S5p@(LE 165l+%`2ny]al@u  V ^kV랸\/R[ 16O#/+*vh zA҄w<>P^*ZҲ2J+qLCd"){9T!JMF1Z\USxIKû@úߣG:/Jr::BDnv|}Q_uP/-vQB[Cu/_f P"fch4P_haZ^4U? pafIq( tHt/_Vuخݱخخ#g/7ىb,3T>[bx IaCveNRlTggvQjHJJ1Ln<=4҅6 ?y|1gZalALD"x{C<˓ȖrdK+ռXESn`诽 >d01gw;GCJ/m7*D%/n),00;Jsʡ7DoJ daOby %M8HP?K6M榪/qnBDTe&*EZu5ݱ"ԱFY s7* @{Σ75-ihm >U pzW=dPǘ3'z\rg3R(6eK[X9IpҮj:\%%j";g$Q#7Z/jF +ѢޛuQ'^3.jNV5E-–uYN3=57IgzIi6V)p'ѽO(+q֭PDJ17GM4pivۃVvU]ӵY$QHhȎ2%:at_n!*u%׎)|Pvȉ|a#y?|t+)DFY7kRE fhcEi<[iѢ̱]g%AU ?5O72s 9{;?O H8k.e=ɥu dnOI摄e$ȟ( Of#qz3oi imjm[c:Qcu;pFa!@/aa9Uł(@B"lkZA'bIƕ]۵j4dL9@wрb,t H޼jJ!<2ƴM:WK8(w f CnL_Y%bhl+JVwQ&0FE JpoY^U nM XFB]71.F?#fa~4GldA0 C,ª ?{XKV4a, 3U6U*?*"55JzFX%j*'GT kFu?JR=ߤh \]6_S>=voINy<}fR?IUcaR [2)Ri-CBWVM~>%ʱ ƓpSP%I\8SqCğ4t8T;9^v1&+b +. v`3i VQ/"h^S6Xc7#LMT`O51}=A3P>8~>_~{av.^qW dwv+_P #{"F^,$_7 Ɗ(8J,aXu08,1_9_:1C)}+ ?nR[ AϟI$f 2)*p*:)vS𥄱Kl\C3FR%9@iˉ$7(eHMŴ!o>шeTR삑bE"at !E-sDQdPT1LH# REB~0b>dj~0? 
Gopx:!@%M# UҤ1j)(pSZ>K#v K/#PG`/F6!%ECU?\2U\H4T I׊6m@UaMx~s<5Oo.b@FB7P@_k!A(ʋ}R8Е@Ҋ(T=?]sGG+oMDRecDey C(NBe;?~J]OCZ+| p pbNV(}fӾ~=dDZLGQ:`<@rmyL^PE&"khkhoU,*2 ~e&*;Lv͝PQF˿^ k&2}Pٖ崙CY +[;[gi+[VC;GL-Gyާs+4QoyOo$N҈0M^-#otUdj/gKJ+'yНE OqevP$#LL1N`l,:r3A`M0$sjA f &3Add iEE K$@/At^A5^\\Hfwa|\;g'ola|=p77WB 5==L<}4`鑣5 ȬReW^wH?Z ˈXHXL}bTnN+؆yc 3sw,T6:a LSйh>`mj4- F7~i< :s[.Ԗh皚6[iL H š4#2N"MR/dEiuN6$4GDA"pȫL)/ q]STVz3d"k TyN"} Ty'7.bfw^dv.5"H*f'$'<( Lw[Yjey`+*~*{btX?eyT$8MʼnڵfKJۂ.BDUUˀ.wͅ۔SMD ]z]N.EՂgtX{a*ߒԌKbe&u0jb^- ڛg؋NtrJMoBpR4/ON;1.^&HBnjgé/G_lw|2=_| U=< w)*XW>yN>1 s_wXUU/O0QY !2Yx"̨fY]pu!GjϳSFPwfΈhR;W>Ӳ 'u/Չ{wSr6r۳*]3O"{=~x'&u#q;^/Mv:G;QQ8~Gr^羥u;c5J\l|{^v^~6x2\F*& Wmҍ,|LtwUNe q*Yz2WSzz^_^^^+P-᫪I#Rv:S&78qE$Ǥ ZQz,utD1)>\ǎ~w,f~4ŧw"Sqy[g't %c7i|lbTICDY-ҧ3\[eLWBR+T/O٥Nq&VB:;, I.\" ePsRjݲTe<͊9T5B Ro[J18)SSӭ\Vl(4"r]k̛;q2Ǎ.\V[6["SM[;I$=KEWJg6:QExEq!П [^.>.#_TZ.k=<_lc׉h\Q H3bY:bLh`ʕalIcst]㕖vse ?__4M)[b9a%d .du@iyQEP| SqyNݦa*5b!I*T(Q:F%H?g%;C)"2 ߊ }.Ol{JuvXL["ʻ\]tz];2;\~.9DeM{;_$'zD>Knc\ "bdH' i rm-<|mX`$9/F,_2KG2!bdZ1(F1fB b Iis+]ƸYȻGqٜ.c&mԃD4hF5Iګ,nk"?&Q4ڥ/-LoKuVxz^ \N%Aʯ;đxWDhJ8Dh"Arìhs‹#ċ J9ڞsv[GQ|?g+ĞsUýO:Gsέ GGG$S 9^Z'G2$vގbct#.5f{vnx*t<:Ixdps=;Bg~D1% ƺ3/f2{/t QC6D9L"=Ckd#ڵؤT:8GDU嗔Rl 18:[cݙ%XI(: VJ6Swu*KNAI KʢzRP,*|K?^^2ʇzw>Բ1G6A[]N:L}{կ3zV3㿚2q=SuŦnQLA7 %č?u+8BTbxhNSZ-H B_oGO/SLO5͟qtcΒvϙk>"Ҏ>A'B"*s'F3wfn. _Rҟчy;oq->_?a]ru:ٚn0wќD3׆UDs͘տެ(vW"<|j\h1&ws842l ²UUw߬$S&8_Уx[fFЯ7GӅ]gCa;K-NljQvK;⎡ku|m:gTIsvڿV3yN'xI7׹@OO}|"zkVy<4й+ys?8CK]` 4"y 3ǷΓEKp3ډW!*Tf&a:~=rEsQ{co}cd6hhdj # $ș-Y4C.J[)ʲ!V:倌Q=dz`y˷DIsb[:lpuLEzNg'nU&/_7s8vd ]mlΠݜ@7-lNaX3Eq4z#Ɠ,AW)7%({һ}A7F!SY#> yM՞rֿ+5+4a0:M~x eiQs!vM!ZCO+1TWJ1UD}@sm<K 2!j0أw@h8qʹ]n-؊6[jU&!BicZ x+V1s;uKl֮-Yn˥9Y~ݶq (?|9ccDkɯieqy.6U((([~ Wpb|Uq=Ssg*etO,lwo_qJY3m+RqAu*N}.m4ڰ!B<. 
(+g[| *2ΫHmODVI?#VGrv/&\CzOso0f10ѓe8N[L kv>W&xqR&Ujm?F9^# kkey:Zdż;]f{Gŗx͸ajb?H9cEgu9 H"!b|@oH4x{53TfnF/;Jհ[|-fw{A|@ ?++ʦee묊z#r3>gw2ܝ,-==?&*0A$?G&}l]~;'\ ԜTaˍ0ج^ nL2af$BGbѣ2͌%#Oit9IUwznp`;\-=W;dr;}I ;lp̅e7m&)}ޙz/cf\}ww2-"ә6rqrevmlՁf{-E(`%tZeRUzvDipޝq}~ }a0&tk7cA81nsDf^lx˯5ZKP錚kEI)F'f4N9f㬣L !A3=xGr#4Pmٜ`%W'GeZ\=uMsyJW K_/(Nz糣P ή5R$nWř@X5Od뤣MGؒ\5r iKu`1$S,4ؼ 3Pզz&QaqxibPy~ΐ 5snMPAmW.3Lal!6Ď ͚/敢My5/๻ˑ] @Y߳\ρZhH:Uk (Co A &TOPw$ۂ7O6JF(#^s|/6*vn7-#Jvw{|JU,J形CN\BzL{cϗAPݛB'ؚٮ4GnafvSO`$\T} xO.~ 1tdܿ&B?phd}73ec@n%P8? 1Q 9Rp8B MDOv(!gt3퉳cLQ 144O#{y"Qnd/ : 6{n'?jL:Ku4`ux9sz`:ϒ|^G/RM&RV=y/~0—;:Yg.v>R '(¡t]d/O1Ex<hR_r毰穧]MOi%Ļb2EU*l^ Oc9X7Ӊ&>ڸrafI]*.}gPeySPf #Nx%ٽe6, J%{o%g Wbu/Έ}EK<z7;m4?ݲYi{#, X<2@s W]d _J^_5r츏ς!nB膻DŽv#VSQj- 0WрZUԶYt&C>|" <zCeT`kee&xQ3;KCpz..]3<ɡQ1Wr6@d/ jˉ>tWP)H<-E9XYG٩O4xx.Ls7qlJ]+6Jh_x(GB<}eq^<2+k:5 I5;q$ܧ>P܏GG Ϗ`xGnoHBKpLGs.?^vTm\\ysXYnk lzSt8`E) rϗR] [o)wۥpD^cgvf G-4;1u W׭R ɲCoTU=Z%UrIfzy3=WR^b 3{XKX9KU(K4徻AyҮUV?8uCZrܛN70D_wn~50FQ$;qƋF}u"9!/;=ՒU3A}.8uKː>h2SSfSo#\LSJeGjK7?n* \]&G@U%VJ7"hՎJsu(_i% Ǔz俈XcwҲda0v.# 9rrXCJ!j޷/Z'G//`Heel4Zbx}l[pBuOa5aڀ`#8Ыa<Tɟ¢WqBJh`[}/}"a}؜#p}\uc$(e#3A+ݻP(T4v8"1{##{H{^I?2Hw &I"\'gΥ..,*LF^/,E]GL̘׍ ocsV00а6rKXGBY-=-ĉȪ7I+鍎01'asD="M?2Vspb|D@JVzG!C?GUz&JNE` ~Cm<IpC`T>ɰ"QT:7M9r:$RfA` :u\_#Dr8M.6\$i,=6nAaqn/i ghu⾐͠(~ qϭ2' goN {7xߡG& 8 `VX'#<} `8xzy1@{PFO?{oƍ>&3ǖZI-I<{8ž{#XyX|,nOj]  *[ I7N xw&:7`Q@9>nV~ѵ"cmo~C"܋a6'aYt<(a*H FQG@0OJM ȳk(vSRqʏh#m6ɨ:薨:): Ör/k֏ay$=4bQ͊T yZz(1?Cd!"7-(Iʱ7z]pPaF]?C 'I'z 4IBz4O<LzT4@3s&WscTYA?)fQ\?j:Q|}= ӒBa^s`bnC\VRYLF3JEFlf* 1 {C$ָDi'X 0)H1frү쬜R#.GT1plEx2|5Y㴼|:י0Kȸx&J@s5anhԡh' ~rD `·G2ɓLJ*롛ʴƋPtg3,2:F} k!I"r?654xQȊ6>"ZÊlVє*įJ-ͷ: ը H V@XInG7Mn)u wX(O}!.?YHczC޴BIŴR>b[`U4L4WKD+6i I' :cJCodZ$D7tJT; 84=[RD<]5|%d7lYIhАIEv -l&) ،LCi}b6&J Ƨvw0TE3\|.EEL szq}< >?.iu gx zT<b-~L7$"cs\'Bg α&GFG<1ŧ-HƯ%^ }ylܼ}}qlB:y!J ~رc 1ƩGGOS·SeS.Au`N)N=rӚK)8MK *ST2bfy&ԮB 0BؐI7U **;d"Si} ) nGk6)R ( ` ?o6^5_h.D`(9 W]w0aRl6a#/,rC'+_-#~l=[!ǚDjA& +-U1%gM6`asp&#l6G,l AÁ :oDB][ef&L^(^4٨_/Z VH"[!tuYX𩈕LO-m΅~69ɓ"ܙ%R:ܒ`ZA,-",1","MdlKΒSJ 
LV*@dCr*>1l?Cg*UhH̾"-ř.nLOc$eAC dU5Ly,갆!gGK,M#XRepvO6t&Nq17V]VQ]#SmnuKFÑ{|gp|XY'plM*<[6dh9+fm DfNRC[EHh&?@ {ܽ -nFZP2yX`Ղ%. J^\ 4NRa1ЂBW2 r% Hlkt:B!^ L€\y$fz ^6:@9{|=y HSF9"71\B$@zNx0u# l ?nְ$&oD(2Q=Pߩl2e_\)<竏k]<ל@:k/Mi9>I].XE]K;G(/מ,nM!_H QɋeeÈ(RGtB^ baA00'1T 8f(5"Rrf,6B#C@1ʀLunLIlD1| l6F;v?vgi4v :f# #K[ỳ]6Xjꚁ5Mr_#`dY&ϓ[yrHr3v湤ӑu\6bǠwe>_D.iSm=l?Zfm[Iȃ%O ld@EG Bd 7 Wʒ?sAƦ\? F]t5 Ͻ !w{Ϯ&9 (%U #ߣu A7m_%)?_&푳ʦI=ju~*yRes™u8ٜ#8&Iă$~ƪ}x^AcZHFGNe*W#'@,ӎeG,^˴;m,S7W;OL>i%# dڱ dcڈEϜBf!KU}`EGZ "iUJo;>`A=(mÞC\,Z| 1:C Fcj.Qy?s 8^J=7?"%qvqiن5x&GK ,N̈|Sebryήi]h0 I?"ns^t| l#]Q¤#1 0*. ád)BRXd@GbTWvأG .x+$jQY(e 4R)5G:lM3ű%VBj#*l0YTY15]@}/d%xS:Hm#(t:/[y(sX8҆vTCpױUzEFaZa  k>j쏔3UXe9bew,ha==LYY/,,|N1 ˘3{L̰̈́:2YБ'3%aX)<蟙W%>Q aX|QrR$FBkCД'y;Er+y$]+%"^rJBW4JT4Q*q,]+˺jsD ;E(+BTY67bف{ J eTZPgT4:1?| CЄ@YqNS]sb7sbL֜̾='Oz~27S(;cfK$U.gLDG(Q,/P!̉hxCVtf&L=M.EԖF}0F[`)Պ0'ZQ=1 ?SVb"' .B>]+p,)q<[bH>D$V&u,A"LvxF!x>!x<͙a|U;x  [hh\V2 Ѫ|tYIcJJQ4P||v{ /` /XM1 . %,twbN\Щ.ftZIN+>t ̊CIIjӄ*ѦuT4#SԺigjVG7M +M淈|ZhY4jr2"yDP.GV CmGurM?%pQFx8 RmREfrU 0{z5!b&Qƴ'ѳHnQ5ثױ%œ[(}bQǬ~i1_>W&&GCeM|7WM?<3oLl|d9S XYE} KkD*uƤ* kƻsI`Wxa)g{0f0f@bwfgsJ&Zjo IYI+KPrthQI;`(ccAX\8reyÑw/ ܙ{8ڑ7VV 6 Y6?ч,ېeeBbO =iS(+G_g>Y:lBɦx2qhKn|4#TФ9㱮;ECʌʦet7;-dFݖK[%VQR]wn!PۮByDTnOXH񏧒? L8?pKhU BPl \REF)O#6}8 `ފ&ءuȩ'"C֜ f㝂f<9Yy,9$oL҉YQfG:,>fѰ>ZBoyF m5ׅ wVњ-<Au`G/`d8]fHS<#*}yrm/lLBTĈkJ aYjO痟G"yX#t*g }H мW脀Np0ܩQ) $=oW,A-açiH@7RܭFwA05Hް6f l@ 5~= >?W^}1#Б\vJߐZ˸Z7t"M3$3֭a$BVPpS )9D)?6ifu &E=!,3Wi]rqzxe1'DW*_㪧M MH뢪`,fpFE>JI!77>&2zEs_E4R2N@KƐ8|FXTbh'%gQ?Ȣ2# {K B)i2P, {q 9:0\ ~dq ;u!^@o+ k5}#  ⚡]Ps^԰ƴ3O> p5TwbuY>b9/@mxڒs ~x+p{C yFd%E>kTE'*qi8%*B,މ2hhlPFMUE'dSZu]h$Tpr[45<'+.`MFf"E(FD+.gѮ.t)ymT&&ZIԦsb?†N1-#/2 ,W(VTe 5NS-ǛL6F[ ' @Dk)G;$~ !K!(g<(iaΙ 4 k0ih"7Z/Zag%bJ. 
9s[ 1zHvjGc<4!F8%s´ny1i73sbڌ`nf46Sa!JC P"댡!_mlOt97>#xb\y9n{REuIDlWͮ>G|ʴ(7T`t|#Gu.71ޠσnxzPRE]%=$ Ze"h>ƖT-sI"/eӘdh__baF(ےl{QL0O fQ,ѧCUk<$>:@J\g@ęF\<6@|Z!)bw+\Lgd\^yGzEbC 3i΋U<=U=UigUGl!'y:ubL& [A~9V |;k4yl)ĺȔr9h5Q8$+nA7ǴI BHfDh˭' _4&'`̅Yfq5]C; &H"Re.(k]i/iiTk,2n7Se'nIߩ i2Մi}wb Df ͒vZe[ڌ{ڊI!q2w]/PBuj( x$N"6ayN}ILC)%.WLEME I|s}VjfUhJY@}{;%DMFaݫ`oUDvckyw[]ݳ^s 8g"IJ[.q/Yk+kce((JF_jMB7`tB[-tkZ;)~FFv>ӡSpyDhk@:1,> cAKywk/hFIRx~( e+B8m*ΰ?4)6lu#S SÂי7C~dWL*}&J4ҕhq%Y E. VϬQv0z8}x5,XN[tJМ~< :@-ci{n'xb( $\\R5G¦I}+* d(y % ፈzc=蠹Tp~$C"'dʋZFX |xo SKL CM=>%ޓ>?>Z$yC,Wa3z|z&P zC^=vOO=L<܏I ^0Ju}bf{knt~z> f8O>pM#JZ~/O0m%ɞ_1~Hp W /}5j.i ?I"wv9_~" d"%99kjv̦0 :$]; *-xޠ{."+jiO.XދnֻǕaVxce7]rXh<f(ts߿b߅{WПb~_c-ieDj/U)GG28~nc=Jהu>ut}$}h߮aGC"&zG\=\=S"yҩSSm:I:NuNuj'jk;N:vtT;T[۩IҩSSm&I&NMNM :ɄgB$u?jj:ZvBMIJM1U[ߩ ׫IJ?"D턨i0ii4<B8S$G$5u5u:ښv\b'q]죆*}̆x0旱~vÑD?H:0I#i$d kp;D u#!68\CVQP):3: 3:3: 3:3: 3:3:3:3:3:3:3:3: 3:3: 3wAg⚣Ѿp3ژ A6Gцz|"wU n 8 ___________________X$RWpI[P  Ppu:85hp`+5H1(EGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGDŽDŽDŽDŽDŽDŽDŽDŽDŽWz)WUXH:0I#i$d/K%kB1 /M_J2ᚙLf&`3 Q33L3}M(PkBuܸERt>L{0a:t>L{60maôi ?L{60maL3f>̄{ 0a&L3fo{yQk9?gX;?[g!W&3-J*jh+JhN us|6H[OBzZ>4׬Ifgsy3kN'j:9<qΦCl:9;qΆCP<l55C1!iHjz8gը4M lh8}=|Eښ.})qJ6_h (8]=oε:=;>Gj X%4up~ͦέ吶GmK m&\h{5Eۯ0kl*_+( l4@ ΩRp>M~_@T ΗhBL\`8Gf 0c.U&p^Lz?`l4~}>8&&90 ̄ 4 %Ty/Icg€#Ĺ._ 6qKLs\bM߲b綤 8%iI &eI+P\! \+:?8tIBhAMpu`|R 8O%`D OhB|ܔN8'%ix M4pJ2!Iр'1X' >pI8$iB%/z)$ Rw)|ኔX)NYS?\dWZO,=i=3"WZ~<ԕ,곏9}wbGXaonX._qW~xD];f?lXJ[6* Vawя!Q-Ga-<5Y-ϨF}6.H}xl5UIk[%oO[7[?n( gR7孲Z? 
-sDg~Fj MewgAoٟuXS̊n<>7ZZ|_e{oW*U(Lzy'E,a'Xx[outKɊ9tKe;z栵ߌ~B˗Šw(}ҀZj$㴏Gc a^yP:GcXzt:G'ټ_Kz;M{h|(SW8)L"ojLtVULҭl<#KOMSӱt,=/kw&s _)W$[IlћG joëءgb8,:hZZ𷭷[ka/_'W7{\Ԇ޿mw5Gqe_-71@nxG= & OJAa zǵ|,oreO>$*: ^fɍU*q#clǝǝǝwwwr8qt0f6v6vvV|I٬vzwzv IIV'|'I'Y *UD.BtQi?TB #0 #jHQeUFQeUOk>77M.$}?&t1& g ~瀼Q Dr^hGIta^`TJ*J\IUR+J*q%j\IUR+Jq%UVI5sK{վȰk%g-z䟰ȊF(/FdbkgDODmm7]P "0j4QI5QȅZ&IդQU\%loh`;:&ll$l(l$lpl똰ñc¶N̶N¶6PEsx6];];][];]]ۄwwwmwwmwm޵c޵޵e%c.Z"qCZpnnnnnnb»I̻I»»I» ǻ &1& &ʸCT:8KkmD۾޼'|x –߄cAwgƑ$2Ѥh:յvK5+N jAԲJJvl$uvX:I*mJvfi/@\_Bf]5al11]g&cՈuf3[klVcшue/k[>²bmֶn?9sȌ#FܺYj?7o =Wȋ,'?᤼ny8x"ā8X@KH` ,q %$ā8X@KH` ,q ǁ/ ݊\yǂ?#X?KGEy_PqcU'^`bcbcbcb%Xz &^`b%Xz &?&?&?&?&?&>jE[wSc/NecO,L, Kҫ01BL,LkKQ6LCNXjF/2Z}(,3rD  5Pq\4dRĿ {Q]Q7Bc/m!Ty[.-"w᭻K44׻u?#tZ PͫݕxsiMYXIj~.%ˍt74)==HnVY||G H~m'"v"d+Z_ =xUWBɷˍwk"Qz+zW翋"M?v8I{)_;WdǏz$7ДON=x.PdoIRO'Eɻ7"+[nCrXE@7$o\IR,'@iߢHNZ[tap$[}"+˒H?{ۊ){ nxx\`Q W%XV'`Q?*8bWw=I?{CR7'mvV2oSt߻Yvґu&>5ߟr]!ON;'x~R ]mףcBKXMUz"4qAU7>9_΢Ń|( Ye缆^$E[vҗ“&G? @Jw@?]]h:wN T:ycmg5ǃ'd5Bh=BFw%S#2.vO_%0b=o>mz3NoMg,*il~5А>;?l7~\*Ղ(.ߠu:ji#ӝ pcH3τrNiSi%jաY^qR>i[Ჟ[uߐ"T}@7nw&D'WO;h΁r}0viR GӄIPgw.|Q݇ݽ݊ͩP{AM!Aí}Tf zzuO=/0 H|}iЕM "[{mӇՠ;Du??z_'@k\_G ~B|=ɗ4>A>m5(3ef#CkL8H?!vK#IH04g K^sqKL+L03:ĺ-LLϒ9.eCj-eK(t5)R}ב҈/CcF?NyMڴ񦙴iM3iBMZM|9&4sptβ 1e7Bq^U[|I+:>>I;/dKR!o(ⵄ"\|:.D "~.a?'.+aLaeɳ8 +Y1q$$0c!_E^ϝ N[zm^{=Ƌ]G}q,i<Me8;65 DMJיKἙx$%CQT b$~iE"5w}D]66Hg{C4v_gn@dl9Eɳ/88)r;wN?ܙn6ߣVӵ`Ps<;RS<%@)iGD?%C)ȝYfL@y(nF>"1:Krqy|$傪{"Fldž nuϐOmL ˾²aVNS(h' `h'N cg Lt#I:/sN[3MWD򛽱b[j`cWku;iVVZvW"d{~/U$zG٢~yvu!QE?vrK+L3Z\%_"Cc#F1bSth0D)a8;p5J/,WAա+,}=*;\jXԌ* 8sfeu beɧf Ԥ` 03.-&' ,0%6tf}]o՛)6-Ti"MP<'c o,ˢ8S쿊uSdbnV`mhB6|EA[؜L=Ay2 Lq5yj$ 3J*M3a*dVt{۱iEmpx&mZvŘ(֥],$~M(V2296K3Wmf,< x˂XsQ$&Ŕ]/F2eo2HBjMI+qx1+ΏR#HC+_:wwegg.K_i0!v=><yt 3~fvYF,zM N@"=='+;86n?+Y_Hjύ;C)CgC  }6q YJRIt2I&Pi a̰ ˘1'݁4yd~YFzQ7!l\YՇ4+_U?_'l?___V8.V̒sk-GƦZԚzyV ?.!<'X…嵴~Y&*j~Mi>y8N J3;Gwcʙ?)? 
<Og f㈵?$PAJO{mB?;w ˪w yd w=SID.(`r: ^yvo74H4w0O)3Qn97G'i %s'sZ]H$*p>4{uf/8y 3iϼ}ޤeEno{E??N3gdʬ'D@ v'yG>ϴz R V`zreIk晘K'ˣt;CyFxs%.qg }D*o6Ɠ(xxSw`]g$uR^(G%ouEٻB3]f+$ǀn!Ty[. 1L}zf,\\Ν4%X2̙TI96\4u/翂߭'EZ kku jx߯qwo< ُۓ&n$.9n'&*GB!/?~xpE=cgۯN~/YҚma=j:] @N1LCWAvOr*@ э2 )ő18a!yQ`3eN/|caٚ)v?a 6S9)gcv7 Vǐsky#H6t6 4ъVT3 t}Lo-BQѵsR7{FoJj#M%5kfV[`Z~!xK 1$0{@MlWg}dۈ9T[P hĈ6L0;L_fC"钡m`Ii'3wmȜ /MH@dʠyrZ]I!p6A~ ^ FsDƢ([XJD{N2# vg< RFa8s\d+<:NJ!^~&rX.V $QIzެo ys6Ĝ;HHm򨒻8{ҋ8,DPQA/$ o)gBSɺ/92?euKO?3+R`^xO?"ERZJyiB(;E<ѓ}hk)\A : {pR7ʖ'3 /^E)S& e"X,cmG3p[{/y_h<'14Y5Hs`g$&Ҍ3ə19^D; !eY˘l#zS<%Z b({H'wʏR6U/[e~%@sK`;3,'p٫(*'N?cˁ #_Jiug)'Xԍi`qe[:^mm` Rd A\rBy% ɝOtC";ma V`\vqR$y 6I>iլlz YFO!4ǧF,2AKq^IJBb/"м8e,&C":Wc9LTI?AY. FM(GJFp 6a]|K dKJ m!Ty[. !s/Rt}jքd@b^T.o*d%ˍa.drvz=|VQlpR(쫹(R`z.87z3汀KF9|)R%k;㯗e.YpI ʗx+d((.oX^"vV߯N0ٻ7'e㚹2S)n(<+awo: N #3@ɑX U{ L+xY?bjȠ'_kv;9)RWI^i, rcďEd蘧=XF\J;j׭Z1(U_z𻠟..fut_rVE-&A PٰXmD. d]'Gb2p|4BL%l:$sPqSi|p\i*nTivjCdv)gg^Tݽ_]_5y6HSHV9Cdq|{C-Zx1[~9N6B D氾ZUeዊR-h7}(oօI%nBj0``+ /LFT 0 *H׸xoJ:NPv1¢Is~U.KxJ[@u}%ζwRrOE9~9b{"}4EyH<Xypd=eI;UH,tʉlT8u4k %W ̓,QQ͚^!3ZzߜO},zm񒱫 v8WGzFqp3a·o Yv }&I%9@׾w۩(ߎuR{SR;H{9GsRyH,Nu6c@?IU@{c6p.Pr@Z& ZO*;ewR7C=y9>D7V3r Lrv}yri]Tn8N9@sY-+;)gIH?AMO&"øg2.ʜJ' *Q~+MlR\;ԮRL VP.=*M5R^̫wR qo ^sRJ_%6i:JZBo+o-8Q)@2> :pyQtTYȊ!TYMur߬O{_.FQoW_O"Kn/O) ǜ.<uUSQZglj6ߣV͍ OVx/;H!_f؟/BjY/gg{A^t'Tn3{3B614Ψ˺=8㓸l@&!R HĈPr-Rc:0$'h5SDf%(gnN10g:a &h'wOudne2WP"J|Yn/H\vWN `N S~çQxi>G<i~߳I1^@sn++UIQ?s% zZW"/p]j2F'z*⽖Jp|/5vGSHsUn U,S pj[81v屯*k^ԩ"K =˜AjK%4oA7T_fH}uz 9A9eHj~u~Yc=:N(hϸ,H3#0y- |~^ӂ3NPǚ\u>:g6GFԍ u#~Dq"X2̄){!3Gp~e|Av &M3\_sQ쿑U}FɿF:<o(r7QZgٿIkZts ,00p7'OkӇ+ypef8Upj{H'']o5ѫVJ;98))EFL8F#S"1VzwXk5^1 <9/+sb!^U˄'}=~"j)Y%٦r0%9، u\PP ܎ZV\xY Z,2K˗0;8cK-_tB2R i)&}Ki/#R:!ҧ6n5jKq7'vY w-/ d|29#>t* T2A%^ބS۽><%ZdɼEA!#"2fpE.Ww*Uo(X?1c5Q Q 'ÂL  `1bdܐ_֡nfc&@{/Gd8c!An` AN ܜ!#s" 825nċt/ҽH_.Hd>-Dx}sG@ݪ:)o%q 8iUy{sL9D4x[F1O d,Sx&RLZdYbO ~Uw"?%m?irOn'MB ٜ}u*;)+޲s\wW}A}pxUN GpmYL6J `:J *MD)F<[:)Nߙh/P(?S?Ais߿-f(N0'voO4'˩OrLu`oA;e;)\?seꇾܶ/o?qwo< h{)_ȰK 4JKI\Vd_/\DG-)rbsQ\&. 
ERzmim*`;/k*`{::RUwQo 8¿X-sCkevR߷(CKјmb1@rNT+J8)tb?ʏ+Ddw#pʂK1;RԂ~AOݻV KwgNE+@5*XmҠ.ٛ~XkTm/) u_N󟴖N9,R_yڣ -~}1HNJ#[0I沓e$7L_I@?]]"1~ Zi1=reKJQyRsS(|c7٨>\$ڙ)㏰s@7D'[d[/j\JxJ3Pia#rZT]'E7 C!?n,ھeV0dW?]4?ӱ)~|'pC,{[1].=ݴH=<;,eiUN(zGA[)]IJ}_'E!;4: DtpPWJTU5 NVXVKC5'sъ]&u另uP zV|'EK/faHQyWE&o8 >*|'xltʺ̷Aqy!Q%9A8cʍU@S%qyuWUNٟJ:Ѳ7OSnEvt]j'&Q߻5wRoRr)TT`f \>mF` k{IQ濝Zx*@8G;xw*6POy\3W+{6Ֆ(?pWIA~AE]|:.}aplIZ)ZtDq6@Z1zG_e]MV-orEFaLyA<Յh@g-H~ڼVq *b/%.&~F𪢠yw~l/fE ̃N{=5cwS{SR;H{I9]4ix$'F:ݲUIQ쿁7pi J ©C>3XSp1 eɡ+d >?ѵP;Oc7wK./%FCAv`fv2njy_9)gCHƊ39vGhAZ^6QXK|SV"<\jK\5X 3"tiVP.}*MER^̫w#R qWWMXqVfGgዢ$:pL2꒕e@debHZ.Ut|[CUq+j<_}"?7jܞ} ,ً=uRV/Nb!ٞ:lG^2!/6C^Z17!/4l5+Ov$!/v gf-kq=zw ^ ݛbmaY'g| 4D!{q h ( ZT8;^D8!e+LRL9fMy0-c [2&obSSn3/QOH 90^2HcK^ȘZ~ݗEnݝ]9)?}YoҚma=j:\gC: *L dʓ J8\UY"d%8RI[MՄ]NNJ)q q'o*|1ͱqAlH,VZ̓g@~&{3B6/aN5˺ 8VsHW2a@IZ0H~ZJn=Şc jvL5ckk)"< k+$cŮ 4'kˍ՜#z~iͲC湤E%Ii(/NJ-6n33r( sӐNib9M"1{7§4pXU,}gz4>_5*so%%V+@:& PףcBK۱0!ЫzuwU @{c0N$~,st|tqpt1,G5+/gl="p|~>@ї 2K"oHWjǟ~O_s]ԋSɟg1߰H7Gr„,WIBnOR1zHXfB;*+4>\(i/awGPs )P:bC?pQ@쒯[u!'N?9AG=VpcHճτ"rNiWSi% kY`qR>%y[Ÿ[ߐ" f?ʔyM:PO~fqćqQesR7hMP@_ښm#3%`"ej-e (tND5)`hėgLٴᦙiM3e"MYMz&43htƲ 1aAIy+?𐎂 C6́rHs1ԩ:&YrldzeڌςCwwmsJcX2V0v1zȀ1oLQf`T_<_R^` 3bpC@aED)te1\Xt݌ 8D^+y hT]UbJ>5bvI+N)FO.JL!Cv§ | _0wpv/tAr!& MC$B>[uOV40 +؞J~de6]>CPU+Ij+pmA^ x6x ,B<Т~=&p";0lsQ8|%spTj² u @_8'aA_qp>4thvŹܒ3*@H[,;fsaw^VDH0nb${K0OC  G_GEάU #$=Q?*ˈj~Wٖo;ʷu'*Uk8Z3&awrz'{+FLbf3Ӆ= ٗ;}\G:onV[$_NSU$E3NX}f,k5GPy~k^^ߑޫ&V}]չ xC5`yUn >a.^/t>6sKO DhBhV(`ASACA0] 0`JAc٬۪)4Ԩu6Ds]N+(Q SY/$u+YNWIVfW݌?br˻R< ?+j+1:|(<%ϻ.`?Ĝ:O'J6h@6{G3|--Ө%Eo\Ti[{= -m}i_Z$)uyJJO'~5-^M[]3?C>DyVG2QړҸ !|lȭPuhH~#6U?qI][4?!@0dh>m;c`C/GX-`XQ"faj7OF(oe-JW֪ʢ5м)ּ1MU kf4B(eFL܆C|_J 0فҢhP.S[$H*5ƴI7@'Lubۿ"x\ț'ELiayDw^,̎, 67NJ[ѶIw0Vou4H~@h O#gh!`l5tLpW緋V7CGް%ௗ<6"E{M,"EТ zL-)[K htsb|΁#RWë &AS<4:P媃I!FnjԨ:c2׈#S/7ˤ`+a"(:`-T#&ѥ)"k l_aE!PmݬjjPmm x# WVu jj$M[\V?*IwYr)|2;d)|@{ӟQo4llPs~&7vQ{LYvFT,Vշ32^Kt1A4~:f1>dKzbjd(irW#)9,bu-J:`TzR0bܼQ6EZ!蝥dĻc^;nhaPߗ&kD֬(\RۈF(AAbJ$_ERKC1P[_|bTB 96ؓ}1HQD@o;m,(l*K:T)-hAJ!SD\ 
ad2Ͷ?mNm$)L7?kK+Xl?Amq?AmNd͊AjѲ )EsdNEĮ`zN1%-g $KКD-(ϷnAQ03Y*4tSQ?s{[|ДҦ Mf56۴ka=]r xKm^4K+m*p[byV[h&uKo|(W>0I>4C Tq=GoOmYG8UҔ'g9OcCxP,5k9֔fc-ϷnPTlM{DIUx|?J5v_Yު1Vd 7U|-p,i^B; X2,O)eᵒ&U򕒫:r3#Άn5DR"^ߝw \%*wWQQ #VV'%BO=00%B544Du.x6AZT<.͸VCFE3_p>p #ʥ_c*ބQvc#zj^]M W׼R7B:1[V~%7n9?դ'_/k->IFBu\| q  E[rgȚmQ}L|NϢuKZ,Ho3Zd@ȯ` *9tjOI r<UoaRSMnr~?n&Wl^l%VVrVrۏ&jWd+rp]P͹k]sn^ ]uC^@XwbQJ1)+TskJc_}k6̽ TlOBh9:w7^?΁YV:/o^HnAZlp@>&npsf(2?Ösp>?U$Q;NJ]O`~T&'}'vFKzg7 ǽ7&V;I&5B%/+tgxxk \ sHS6vwI kIJBJ4$V8R@7)dF7Ap3Y j&}i^gfTG_ &,s莂lf62 0U9}JroI% [ gv[z_I?9ɟT[þ!qxYϖGq)pBۻ YWs2Zoyi7R?l % Z5@ĿHA*RDA@P ȷ4i-pmeUٮ鿚?9<jyl!PbU @'kH1Up9Vp( *~(:; nBG?c*RH·Gať?@XTmX)AgN&͋Iome߳1ӿgOH"F= F 6?G5W"P??|h^Lvj?.JAǹ'zex5@Nq۝vuV[I.|Fo@;xctoзMwH  `91 }" |.〴( "pdl4hvF; oQLPzFfC̜r=S`l'x@;O s}ȏryBQO9/1-=ܲ' u} (7,F;|̋;@Ko"f7 !T#Bo-{ĩй4NE4zCbӗ2K+'ϗ$|<#/܉wR;߁[H#Z\\ʛP~Xl$y<w;π;O'ΓpI $y< w;π;/]a!αJs%R޵cύ?Iu$hd7Ky#=hAA- n$0%Lb Z΃1%Lb X$0 &I,aKLb 8dvĒˉygv֪d mEj;OESUr sirVN"mT}vrHrd/ғ#<ҫgX% Վ$TP\8bǽ; gT%'fIZiTK1ahg˳/O!}_dVvYџ\޾}ʾ˲b;R|'X_aQI 7n?RN?`ǁq$п_g۩W"w2ZW0 x@9鿊_fo{+,)ӿՙ/aH>8jJRD?!_!CHC=hqN=*EDZWb/JyEHZ Lu{_GӫntZݔg.tXَp<ˠ,.ɿ17 V=I8s~$\IT˱ Di AL`{t?ߟ7DmW^/]0GDctzFwC~7Z k:8V¥m5?8 m@2?$NEn{ ao ?Z?V%ND$L"IYIw=bz3!$zo,G_o3$ٛދ(BOЂ ׁ.?}ǃa-3$}Dߞ^tƷp[f|8~ KMrt}I׍r#*yבۨv!Iۉ[aw^gIܓv'=*UgxWA\|>;;N=>"M뤼D{3P m>\LE|B"*T|8b=ٹ 92ƔLU7M>@~gNAoU֔#_,a#֧|ZP@rl8 }FW&d] s[2)/St|Ǧ3%'1CPCc@цƶ'y­z.Ş4 kCF\wQ/l!Yk4ʜ5PiFh40\( NÃk J`paƜyZe)۾X Noanm%pb\C#Â,3%_2Hm2ĸ1`1aن6,?DJzrIilzrQB1YャFW{&oM5 0i?'emݮ\ն## VA<څAaz 242꤉rYTl.uZN;,hI]j#ƍe"3$zJl}Pr{c"zLˣT=濕һ,s֍ h>K)c@-Km. &6e䃒|K1 jl.Mt|-W!q-ʸR k\TƵT^Z*;uoɯeMq z[hw+Z5bRZp IWl7/5fnX'4st>ڣ7to]M ߈-=:M';OB__G::8@rgux|A9<70)vu[BSb8Ewh1@ߌɥ>̽boxu~ӡOs^COl-癿 };d}=aD;Tǟ~ {Sw7Yf-g.m9tzbFy2]@W,fD&j]'hoJ 1~<(ÑyUyƳg|TA|7C=?djQgRqy^lq}ڍ_ 2i"b((I36'X`9)0K ϟh<>c8<8\>p8\x.k80ǁ80<ǁM<2huO"Ќgj]sOEh%)\^ `^Z ބಟD#./んM`6La?2f?LxN r#r!1aaP8*, I=qk<O`_`9/0 |(*@~ %'ϟ^pxz#q$x <DǑAH8< G!H@$x q%b"'Ps $Q,.4 '4Yy} s^Bh 0Gф90&фA4a& s4aM ChMDh 5a(gQ([X-pδ9 %t]`9/0 xq8@0<q0< ` ` .p9. 
pA0\ 9s00`` 0A00`|7 ﺧyas^nꟿ}3hk ykm{P!/z@/XU}<'G=_0O_~xSo24փ ؿzg`Bc_]h κ1 h `^g1f~{k=C wO_m$=(_0xu|u[pд6*Z }uʅuahE0tߜv"?x }9JЊ`Z\i4 0ջ0Hp.Lc~_P]s$=@^]Z=xՃZ=xlZ=x,Z=xՃY_^Y_kVx%]m ,2> |E @?v3uaG0b]@lXvu`G0`]WvzEz5~ϡ|^=Xs :iCsNV=~hՃlV=~dՃ,V=~`ՃU=~\ՃU]~XՅlU]~TՅ,U]~PՅT]~LՅT_Q]@ & ~Bu/. ``NQidq2_A%wi@L]@ .ES&\>^(g;tЃ R=ՃQ=ՃQ=ՅLQ]Յ Q]ՅP] u . `,P ۟޼r|6Y,(_rǂ XgObA+T<[_E KWke/,5r V,])=1X:WZ~+W+ld+-_q+m 㕮~W5+{^W6d+{.W:.e+p[imVPJGs(V>uW+T5O+ 뺣Nl37+eu9Y+}^C.V=|Jϗ%^P=-9W9^;~B1ѳJϗ|ٯ XnUj%ْS,PQUzO,%%w*(_O_^;~BH~TzFS%/*(_tt)O_q51^;!Sz>Y/;OS@о8yߔ/MAj5NSz3yо1/9LAj/R'zK/JAZ*/yJKR)I$^Rz$>R)HKR@TA()Q  ,:GA|M163 Z.QЊDW,[E(NQg}%*D(p!:DAMXJndg( _%W(7Po AK^PP1 b zgy[*66~HnkMqL !"Rz R΅TWTxK^tƥNgqSQQ.KMLLKy(P٭YJaX!,γ֥.󔁬h$tCGI׀JOwtz@qLeFǚdX>)|7ʙ (":ƜK]}$SEI^i6]˲'$TX穒a/R3N293yǛClNX.;ic`ySlgQ7(!bǶQU*x&S !)ϒH&.wi!{A\kTQ~ q,Ŋ1HѬ{ي59TrqFA.$>NT1olAS2xJ:53В*(φ&7A^\?<?x5&~pW!~Aʀ+? l 4e @S{>RD ]FR/!;]b#JYmoii_yfW3 (HkGU_ 涖m8p7@y Պ2 ]չAT**Űd1qP5z]8Q]nR۱*FRW(Jw48\kݗ?-sx;L5.n)}rV|A9FqiVėF|ߍ hFo֔^GһGG;;~`4E %F6 /7` MwK=FӻkuMc޲ߑͱ vg׃?fHN73vrXiBjԪ%^ ? awoIv.y`XJ(C+LNmddo2[Q;%@(EW#=[PަNЇN/aJvOm~"6?-?:c?:Y\|=i1|r_2BQ*f7Z#7 ;^Ύh-[v V*Ws݅uZC[|pćBV܌~ |]P2E軚 4b# Ϗ?ZWG)Ldm^ac ~߅ƟR||I&jaS**ʫp.C?Wg=S[\+_ی#_g7jvjN&wI4d"[M[ <-y/'4 =z&d^ 9Etb6_ ۍ+Rp<'#4@..NZC ]=mN`4&l|@x:Y+@"o!ra+& )-mO1&( OIbC^ &`$(KY$| ^gYs~odU diAQDs?Ne=i U$z,I!{LVHl$OP@]{:ЁN-Y*!@3+<|S;_I#T D tAJvUCMhk+r=BVi㴅povөU$ouL֟Owu r~2GӝxŦzv}E?W?i(]ږiWoO{/IYV hc_=%??N\:N )%~w<L)&vOOWholNuST:թNuST:թNuST:թN[?6;cython-blis-0.9.1/blis/_src/frame/compat/cblas/f77_sub/000077500000000000000000000000001427272030600225535ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/compat/cblas/f77_sub/f77_amax_sub.c000066400000000000000000000041451427272030600252050ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "f77_amax_sub.h" // // Define CBLAS subrotine wrapper interfaces. 
// #undef GENTFUNC #define GENTFUNC( ftype_x, chx, blasname, blisname ) \ \ void PASTEF773(i,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ f77_int* rval \ ) \ { \ *rval = PASTEF772(i,chx,blasname) \ ( \ n, \ x, incx \ ); \ } #ifdef BLIS_ENABLE_CBLAS INSERT_GENTFUNC_BLAS( amax, NULL ) #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/f77_sub/f77_amax_sub.h000066400000000000000000000037601427272030600252140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROT #define GENTPROT( ftype_x, chx, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ f77_int* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROT_BLAS( amax ) #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/f77_sub/f77_asum_sub.c000066400000000000000000000041751427272030600252270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "f77_asum_sub.h" // // Define CBLAS subrotine wrapper interfaces. // #undef GENTFUNCR2 #define GENTFUNCR2( ftype_x, ftype_r, chx, chr, blasname, blisname ) \ \ void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ) \ { \ *rval = PASTEF772(chr,chx,blasname) \ ( \ n, \ x, incx \ ); \ } #ifdef BLIS_ENABLE_CBLAS INSERT_GENTFUNCR2_BLAS( asum, NULL ) #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/f77_sub/f77_asum_sub.h000066400000000000000000000040061427272030600252250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTR2_BLAS( asum ) #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/f77_sub/f77_dot_sub.c000066400000000000000000000072601427272030600250460ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "f77_dot_sub.h" #ifdef BLIS_ENABLE_CBLAS // // Define CBLAS subrotine wrapper interfaces. // #undef GENTFUNCDOT #define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \ \ void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ) \ { \ *rval = PASTEF772(ch,blasname,chc) \ ( \ n, \ x, incx, \ y, incy \ ); \ } INSERT_GENTFUNCDOTR_BLAS( dot, NULL ) #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL INSERT_GENTFUNCDOTC_BLAS( dot, NULL ) #else // // Define CBLAS subrotine wrapper interfaces for complex types. // For the "intel" complex return type, pass a hidden first parameter // (by address). 
// #undef GENTFUNCDOT #define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \ \ void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ) \ { \ PASTEF772(ch,blasname,chc) \ ( \ rval, \ n, \ x, incx, \ y, incy \ ); \ } INSERT_GENTFUNCDOTC_BLAS( dot, NULL ) #endif // -- "Black sheep" dot product function definitions -- // Input vectors stored in single precision, computed in double precision, // with result returned in single precision. void PASTEF772(sds,dot,sub) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval ) { *rval = PASTEF77(sds,dot) ( n, sb, x, incx, y, incy ); } // Input vectors stored in single precision, computed in double precision, // with result returned in double precision. void PASTEF772(ds,dot,sub) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* rval ) { *rval = PASTEF77(ds,dot) ( n, x, incx, y, incy ); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/f77_sub/f77_dot_sub.h000066400000000000000000000050261427272030600250510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTDOT_BLAS( dot ) // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval ); BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* rval ); #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/f77_sub/f77_nrm2_sub.c000066400000000000000000000041751427272030600251400ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "f77_nrm2_sub.h" // // Define CBLAS subrotine wrapper interfaces. 
// #undef GENTFUNCR2 #define GENTFUNCR2( ftype_x, ftype_r, chx, chr, blasname, blisname ) \ \ void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ) \ { \ *rval = PASTEF772(chr,chx,blasname) \ ( \ n, \ x, incx \ ); \ } #ifdef BLIS_ENABLE_CBLAS INSERT_GENTFUNCR2_BLAS( nrm2, NULL ) #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/f77_sub/f77_nrm2_sub.h000066400000000000000000000040061427272030600251360ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

//
// Prototype CBLAS subroutine wrapper interfaces.
//
// GENTPROTR2 declares one exported, void-returning "sub" wrapper per
// instantiation. The input vector x has element type ftype_x while the
// result pointer rval has the (possibly different) type ftype_r; n and
// incx are passed by pointer as f77_int. The function name is assembled
// by PASTEF773 from chr, chx, blasname and the literal suffix "sub".
// NOTE(review): PASTEF773 and INSERT_GENTPROTR2_BLAS are defined
// elsewhere; presumably the latter instantiates this prototype for each
// input/result datatype pairing -- confirm.
//
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       ftype_r* rval \
     );

// The prototypes are only emitted when CBLAS support is enabled.
#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( nrm2 )
#endif
cython-blis-0.9.1/blis/_src/frame/compat/cblas/integrate-cblas-tarball.sh000077500000000000000000000204211427272030600263200ustar00rootroot00000000000000#!/bin/bash # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # - Neither the name(s) of the copyright holder(s) nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#

#
# integrate-cblas-tarball.sh
#
# Field G. Van Zee
#

# Print the usage text and terminate the script with exit status 1.
# Reads the global script_name, which main() sets before any call.
print_usage()
{
    #local script_name

    # Get the script name
    #script_name=${0##*/}

    # Echo usage info
    echo " "
    echo " ${script_name}"
    echo " "
    echo " Field G. Van Zee"
    echo " "
    echo " Unpacks a CBLAS tarball and performs whatever preprocessing is"
    echo " necessary and appropriate in order to integrate the CBLAS source"
    echo " code into BLIS."
    echo " "
    echo " IMPORTANT: This script is designed to be run from the following"
    echo " directory:"
    echo " "
    echo " frame/compat/cblas"
    echo " "
    echo " Usage:"
    echo " ${script_name} tarball"
    echo " "
    echo " Arguments:"
    echo " "
    echo " tarball The name of the CBLAS package that will be unpacked."
    echo " If tarball is not in the current directory, the full"
    echo " directory path should be given."
    echo " "

    # Exit with non-zero exit status
    exit 1
}

# Unpack the CBLAS tarball named by the single positional argument, copy
# its headers and cblas_*.c sources into ${src_dir} (wrapping each source
# in BLIS includes and an '#ifdef BLIS_ENABLE_CBLAS' guard), patch known
# problems with sed, and remove the unpacked directory.
# Returns 0 on success and 1 if the work cannot be started or the tarball
# cannot be unpacked.
main()
{
    # -- BEGIN GLOBAL VARIABLE DECLARATIONS --

    # The name of the script, stripped of any preceding path.
    script_name=${0##*/}

    # The name and path of the CBLAS tarball.
    tarball_path=

    # The name of the CBLAS directory after it is unpacked.
    cblas_dir=CBLAS

    # The name of the sub-directory that we will create and into which
    # we will copy the source code for CBLAS wrappers.
    src_dir=src

    # -- END GLOBAL VARIABLE DECLARATIONS --

    # Process our command line options.
    while getopts ":h" opt; do
        case $opt in
            h  ) print_usage ;;
            \? ) print_usage ;;
        esac
    done
    shift $((OPTIND - 1))

    # Check the number of arguments after command line option processing.
    if [ $# -eq 1 ]; then
        tarball_path=$1
        echo "${script_name}: preparing to extract from '${tarball_path}'."
    else
        print_usage
    fi

    # Check that src_dir does not already exist. If it does, abort with a
    # non-zero status so that callers can tell nothing was done.
    if [ -d "${src_dir}" ]; then
        echo "${script_name}: found '${src_dir}' directory; please remove before proceeding."
        return 1
    fi

    # Un-tar and un-gzip the tarball.
    echo "${script_name}: extracting '${tarball_path}'."
    echo "${script_name}: expecting unpacked directory to be named '${cblas_dir}'."
    if ! tar xzf "${tarball_path}"; then
        echo "${script_name}: failed to extract '${tarball_path}'." >&2
        return 1
    fi

    # Everything below reads from ${cblas_dir}; stop early if the tarball
    # did not unpack into the expected directory.
    if [ ! -d "${cblas_dir}" ]; then
        echo "${script_name}: expected directory '${cblas_dir}' not found after extraction." >&2
        return 1
    fi

    # Create the directory into which we will copy the source code for the
    # CBLAS wrappers.
    echo "${script_name}: creating local '${src_dir}' directory."
    mkdir -p "${src_dir}"

    # Copy the cblas.h header file.
    echo "${script_name}: copying cblas.h from '${cblas_dir}/include' to '${src_dir}'."
    cp "${cblas_dir}/include/cblas.h" "${src_dir}/cblas.h"

    # Copy the cblas_f77.h header file.
    # NOTE(review): the original comment said prototypes are removed here,
    # but this step only copies the file -- confirm whether the prototype
    # removal is expected to happen elsewhere.
    echo "${script_name}: copying cblas_f77.h from '${cblas_dir}/include' to '${src_dir}'"
    cp "${cblas_dir}/include/cblas_f77.h" "${src_dir}/cblas_f77.h"

    # Create some temporary files to facilitate #including BLIS-specific
    # cpp macros.
    echo "${script_name}: creating temporary files."
    echo "#include \"bli_config.h\"" > include_bli_config.h
    echo "#include \"bli_system.h\"" > include_bli_system.h
    echo "#include \"bli_type_defs.h\"" > include_bli_type_defs.h
    echo "#include \"bli_cblas.h\"" > include_bli_cblas.h
    echo "#ifdef BLIS_ENABLE_CBLAS" > ifdef_cblas.h
    echo "#endif" > endif_cblas.h

    # Process each CBLAS source file.
    echo "${script_name}: copying source from '${cblas_dir}/src' to '${src_dir}' with"
    echo "${script_name}: '#ifdef BLIS_ENABLE_CBLAS' guard:"
    for cbl_src_filepath in "${cblas_dir}"/src/cblas_*.c; do

        # If the glob matched nothing, the loop variable holds the literal
        # pattern; skip it rather than feeding a bogus path to cat.
        [ -e "${cbl_src_filepath}" ] || continue

        # Strip the path to obtain just the filename.
        cbl_src_file=${cbl_src_filepath##*/}

        # Prepend the includes and the ifdef, and append the endif, to the
        # current file and output to its new location in ${src_dir}.
        echo "${script_name}: ...copying/BLIS-ifying ${cbl_src_file}"
        cat include_bli_config.h \
            include_bli_system.h \
            include_bli_type_defs.h \
            include_bli_cblas.h \
            ifdef_cblas.h \
            "${cbl_src_filepath}" \
            endif_cblas.h > "${src_dir}/${cbl_src_file}"
    done

    # Remove the temporary files.
    echo "${script_name}: cleaning up temporary files."
    rm -f include_bli_config.h
    rm -f include_bli_system.h
    rm -f include_bli_type_defs.h
    rm -f include_bli_cblas.h
    rm -f ifdef_cblas.h
    rm -f endif_cblas.h

    # Process some bugfixes to syntax errors present in the CBLAS source.
    echo "${script_name}: fixing syntax errors in CBLAS source:"
    fix_file "${src_dir}/cblas_chpmv.c" "s/ F77_K=K,//g"
    fix_file "${src_dir}/cblas_chpmv.c" "s/ F77_lda=lda,//g"
    fix_file "${src_dir}/cblas_zhpmv.c" "s/ F77_K=K,//g"
    fix_file "${src_dir}/cblas_zhpmv.c" "s/ F77_lda=lda,//g"
    fix_file "${src_dir}/cblas_ssyr2.c" "s/F77__lda/F77_lda/g"
    fix_file "${src_dir}/cblas_dsyr2.c" "s/F77__lda/F77_lda/g"
    fix_file "${src_dir}/cblas_strsm.c" "s/F77_N=M/F77_M=M/g"

    # Now process some optional fixes that eliminate compiler warnings.
    echo "${script_name}: fixing compiler warnings in CBLAS source:"
    incx_string="s/, incx=incX//g"
    incy_string="s/, incy=incY//g"
    fix_file "${src_dir}/cblas_cgbmv.c" "${incx_string}"
    fix_file "${src_dir}/cblas_cgemv.c" "${incx_string}"
    fix_file "${src_dir}/cblas_cgerc.c" "${incy_string}"
    fix_file "${src_dir}/cblas_chbmv.c" "${incx_string}"
    fix_file "${src_dir}/cblas_chemv.c" "${incx_string}"
    fix_file "${src_dir}/cblas_cher.c" "${incx_string}"
    fix_file "${src_dir}/cblas_cher2.c" "${incx_string}"
    fix_file "${src_dir}/cblas_cher2.c" "${incy_string}"
    fix_file "${src_dir}/cblas_chpmv.c" "${incx_string}"
    fix_file "${src_dir}/cblas_chpr.c" "${incx_string}"
    fix_file "${src_dir}/cblas_chpr2.c" "${incx_string}"
    fix_file "${src_dir}/cblas_chpr2.c" "${incy_string}"
    fix_file "${src_dir}/cblas_zgbmv.c" "${incx_string}"
    fix_file "${src_dir}/cblas_zgemv.c" "${incx_string}"
    fix_file "${src_dir}/cblas_zgerc.c" "${incy_string}"
    fix_file "${src_dir}/cblas_zhbmv.c" "${incx_string}"
    fix_file "${src_dir}/cblas_zhemv.c" "${incx_string}"
    fix_file "${src_dir}/cblas_zher.c" "${incx_string}"
    fix_file "${src_dir}/cblas_zher2.c" "${incx_string}"
    fix_file "${src_dir}/cblas_zher2.c" "${incy_string}"
    fix_file "${src_dir}/cblas_zhpmv.c" "${incx_string}"
    fix_file "${src_dir}/cblas_zhpr.c" "${incx_string}"
    fix_file "${src_dir}/cblas_zhpr2.c" "${incx_string}"
    fix_file "${src_dir}/cblas_zhpr2.c" "${incy_string}"

    # Now that we're done with everything, we can remove the CBLAS directory.
    echo "${script_name}: removing '${cblas_dir}' directory."
    rm -rf "${cblas_dir}"

    # Exit peacefully.
    return 0
}

# Apply one sed expression to a file, replacing the file with the result.
#   $1  path of the file to fix
#   $2  sed expression to apply
# The result is written to "<file>.new" first and only moved over the
# original when sed succeeds, so a failed sed never clobbers the source.
# Returns 1 (after removing the partial output) if sed fails.
fix_file()
{
    # Get the first function argument: the filename and path to fix.
    local filepath="$1"

    # Get the second function argument: the sed command to apply.
    local sedstring="$2"

    # The bare filename, used only for the progress message.
    local filename=${filepath##*/}

    echo "${script_name}: ...fixing ${filename} with 'sed -e ${sedstring}'"

    if sed -e "${sedstring}" "${filepath}" > "${filepath}.new"; then
        mv "${filepath}.new" "${filepath}"
    else
        echo "${script_name}: failed to apply fix to '${filepath}'." >&2
        rm -f "${filepath}.new"
        return 1
    fi
}

# The script's main entry point, passing all parameters given.
main "$@" cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/000077500000000000000000000000001427272030600220665ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas.h000066400000000000000000001103551427272030600233300ustar00rootroot00000000000000 #ifndef CBLAS_H #define CBLAS_H #include // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. #include "bli_system.h" #include "bli_config.h" #include "bli_config_macro_defs.h" #include "bli_type_defs.h" /* * Enumerated and derived types */ enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif /* * =========================================================================== * Prototypes for level 1 BLAS functions (complex are recast as routines) * =========================================================================== */ BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); /* * Functions having prefixes Z and C only */ BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int 
N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); /* * Functions having prefixes S D SC DZ */ BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); /* * Functions having standard 4 prefixes (S D C Z) */ BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); /* * =========================================================================== * Prototypes for level 1 BLAS routines * =========================================================================== */ /* * Routines with standard 4 prefixes (s, d, c, z) */ void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void 
BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); /* * Routines with S and D prefix only */ void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); /* * Routines with S D C Z CS and ZD prefixes */ void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); /* * =========================================================================== * Prototypes for level 2 BLAS * =========================================================================== */ /* * Routines with standard 4 prefixes (S, D, C, Z) */ void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int 
N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum 
CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, 
void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); /* * Routines with S and D prefixes only */ void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS 
cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); /* * Routines with C and Z prefixes only 
*/ void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void 
*A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); /* * =========================================================================== * Prototypes for level 3 BLAS * =========================================================================== */ /* * Routines with standard 4 prefixes (S, D, C, Z) */ void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, 
f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum 
CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE 
TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); /* * Routines with prefixes C and Z only */ void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const 
void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); /* * =========================================================================== * BLAS Extension prototypes * =========================================================================== */ // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum 
CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum 
CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_caxpy.c000066400000000000000000000011001427272030600245120ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_caxpy.c * * The program is a C interface to caxpy. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_caxpy( f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_caxpy( &F77_N, (scomplex*)alpha, (scomplex*)X, &F77_incX, (scomplex*)Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ccopy.c000066400000000000000000000010311427272030600245060ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_ccopy.c * * The program is a C interface to ccopy. * * Written by Keita Teranishi. 
2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ccopy( f77_int N, const void *X, f77_int incX, void *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_ccopy( &F77_N, (scomplex*)X, &F77_incX, (scomplex*)Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_cdotc_sub.c000066400000000000000000000011731427272030600253450ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_cdotc_sub.c * * The program is a C interface to cdotc. * It calls the fortran wrapper before calling cdotc. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_cdotc_sub( f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY,void *dotc) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_cdotc_sub( &F77_N, (scomplex*)X, &F77_incX, (scomplex*)Y, &F77_incY, (scomplex*)dotc); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_cdotu_sub.c000066400000000000000000000011751427272030600253710ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_cdotu_sub.f * * The program is a C interface to cdotu. * It calls the forteran wrapper before calling cdotu. * * Written by Keita Teranishi. 
2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_cdotu_sub( f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY,void *dotu) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_cdotu_sub( &F77_N, (scomplex*)X, &F77_incX, (scomplex*)Y, &F77_incY, (scomplex*)dotu); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_cgbmv.c000066400000000000000000000103761427272030600245030ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_cgbmv.c * The program is a C interface of cgbmv * * Keita Teranishi 5/20/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY) { char TA; #ifdef F77_CHAR F77_CHAR F77_TA; #else #define F77_TA &TA #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; F77_INT F77_KL=KL,F77_KU=KU; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_KL KL #define F77_KU KU #define F77_incX incX #define F77_incY incY #endif int n=0, i=0; const float *xx= (float *)X, *alp= (float *)alpha, *bet = (float *)beta; float ALPHA[2],BETA[2]; int tincY, tincx; float *x=(float *)X, *y=(float *)Y, *st=0, *tx=0; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(2, "cblas_cgbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); #endif F77_cgbmv(F77_TA, &F77_M, &F77_N, &F77_KL, &F77_KU, (scomplex*)alpha, (scomplex*)A, &F77_lda, 
(scomplex*)X, &F77_incX, (scomplex*)beta, (scomplex*)Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { ALPHA[0]= *alp; ALPHA[1]= -alp[1]; BETA[0]= *bet; BETA[1]= -bet[1]; TA = 'N'; if (M > 0) { n = M << 1; x = malloc(n*sizeof(float)); tx = x; if( incX > 0 ) { i = incX << 1 ; tincx = 2; st= x+n; } else { i = incX *(-2); tincx = -2; st = x-2; x +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; xx += i; } while (x != st); x=tx; #ifdef F77_INT F77_incX = 1; #else incX = 1; #endif if( incY > 0 ) tincY = incY; else tincY = -incY; y++; if (N > 0) { i = tincY << 1; n = i * N ; st = y + n; do { *y = -(*y); y += i; } while(y != st); y -= n; } } else x = (float *) X; } else { cblas_xerbla(2, "cblas_cgbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); #endif if (TransA == CblasConjTrans) F77_cgbmv(F77_TA, &F77_N, &F77_M, &F77_KU, &F77_KL, (scomplex*)ALPHA, (scomplex*)A ,&F77_lda, (scomplex*)x,&F77_incX, (scomplex*)BETA, (scomplex*)Y, &F77_incY); else F77_cgbmv(F77_TA, &F77_N, &F77_M, &F77_KU, &F77_KL, (scomplex*)alpha, (scomplex*)A ,&F77_lda, (scomplex*)x,&F77_incX, (scomplex*)beta, (scomplex*)Y, &F77_incY); if (TransA == CblasConjTrans) { if (x != X) free(x); if (N > 0) { do { *y = -(*y); y += i; } while (y != st); } } } else cblas_xerbla(1, "cblas_cgbmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_cgemm.c000066400000000000000000000060541427272030600244730ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_cgemm.c * This program is a C interface to cgemm. 
* Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc) { char TA, TB; #ifdef F77_CHAR F77_CHAR F77_TA, F77_TB; #else #define F77_TA &TA #define F77_TB &TB #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if(TransA == CblasTrans) TA='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(2, "cblas_cgemm", "Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TB='T'; else if ( TransB == CblasConjTrans ) TB='C'; else if ( TransB == CblasNoTrans ) TB='N'; else { cblas_xerbla(3, "cblas_cgemm", "Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_cgemm(F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)B, &F77_ldb, (scomplex*)beta, (scomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if(TransA == CblasTrans) TB='T'; else if ( TransA == CblasConjTrans ) TB='C'; else if ( TransA == CblasNoTrans ) TB='N'; else { cblas_xerbla(2, "cblas_cgemm", "Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TA='T'; else if ( TransB == CblasConjTrans ) TA='C'; else if ( TransB == CblasNoTrans ) TA='N'; else { cblas_xerbla(2, "cblas_cgemm", "Illegal TransB setting, 
%d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_cgemm(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, (scomplex*)alpha, (scomplex*)B, &F77_ldb, (scomplex*)A, &F77_lda, (scomplex*)beta, (scomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_cgemm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_cgemv.c000066400000000000000000000102371427272030600245020ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_cgemv.c * The program is a C interface of cgemv * * Keita Teranishi 5/20/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY) { char TA; #ifdef F77_CHAR F77_CHAR F77_TA; #else #define F77_TA &TA #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_incX incX #define F77_incY incY #endif int n=0, i=0; const float *xx= (const float *)X; float ALPHA[2],BETA[2]; int tincY, tincx; float *x=(float *)X, *y=(float *)Y, *st=0, *tx=0; const float *stx = x; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(2, "cblas_cgemv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); #endif F77_cgemv(F77_TA, &F77_M, &F77_N, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)X, &F77_incX, (scomplex*)beta, (scomplex*)Y, &F77_incY); } else if (order == 
CblasRowMajor) { RowMajorStrg = 1; if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { ALPHA[0]= *( (const float *) alpha ); ALPHA[1]= -( *( (const float *) alpha+1) ); BETA[0]= *( (const float *) beta ); BETA[1]= -( *( (const float *) beta+1 ) ); TA = 'N'; if (M > 0) { n = M << 1; x = malloc(n*sizeof(float)); tx = x; if( incX > 0 ) { i = incX << 1 ; tincx = 2; st= x+n; } else { i = incX *(-2); tincx = -2; st = x-2; x +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; xx += i; } while (x != st); x=tx; F77_incX = 1; if(incY > 0) tincY = incY; else tincY = -incY; y++; if (N > 0) { i = tincY << 1; n = i * N ; st = y + n; do { *y = -(*y); y += i; } while(y != st); y -= n; } stx = x; } else stx = (const float *)X; } else { cblas_xerbla(2, "cblas_cgemv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); #endif if (TransA == CblasConjTrans) F77_cgemv(F77_TA, &F77_N, &F77_M, (scomplex*)ALPHA, (scomplex*)A, &F77_lda, (scomplex*)stx, &F77_incX, (scomplex*)BETA, (scomplex*)Y, &F77_incY); else F77_cgemv(F77_TA, &F77_N, &F77_M, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)x, &F77_incX, (scomplex*)beta, (scomplex*)Y, &F77_incY); if (TransA == CblasConjTrans) { if (x != (const float *)X) free(x); if (N > 0) { do { *y = -(*y); y += i; } while (y != st); } } } else cblas_xerbla(1, "cblas_cgemv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_cgerc.c000066400000000000000000000037651427272030600244740ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_cgerc.c * The program is a C interface to cgerc. 
* * Keita Teranishi 5/20/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda) { #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_M M #define F77_N N #define F77_incX incX #define F77_incY incY #define F77_lda lda #endif int n, i, tincy; float *y=(float *)Y, *yy=(float *)Y, *ty, *st; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { F77_cgerc( &F77_M, &F77_N, (scomplex*)alpha, (scomplex*)X, &F77_incX, (scomplex*)Y, &F77_incY, (scomplex*)A, &F77_lda); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (N > 0) { n = N << 1; y = malloc(n*sizeof(float)); ty = y; if( incY > 0 ) { i = incY << 1; tincy = 2; st= y+n; } else { i = incY *(-2); tincy = -2; st = y-2; y +=(n-2); } do { *y = *yy; y[1] = -yy[1]; y += tincy ; yy += i; } while (y != st); y = ty; #ifdef F77_INT F77_incY = 1; #else incY = 1; #endif } else y = (float *) Y; F77_cgeru( &F77_N, &F77_M, (scomplex*)alpha, (scomplex*)y, &F77_incY, (scomplex*)X, &F77_incX, (scomplex*)A, &F77_lda); if(Y!=y) free(y); } else cblas_xerbla(1, "cblas_cgerc", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_cgeru.c000066400000000000000000000023431427272030600245050ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_cgeru.c * The program is a C interface to cgeru. 
* * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_cgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda) { #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_M M #define F77_N N #define F77_incX incX #define F77_incY incY #define F77_lda lda #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { F77_cgeru( &F77_M, &F77_N, (scomplex*)alpha, (scomplex*)X, &F77_incX, (scomplex*)Y, &F77_incY, (scomplex*)A, &F77_lda); } else if (order == CblasRowMajor) { RowMajorStrg = 1; F77_cgeru( &F77_N, &F77_M, (scomplex*)alpha, (scomplex*)Y, &F77_incY, (scomplex*)X, &F77_incX, (scomplex*)A, &F77_lda); } else cblas_xerbla(1, "cblas_cgeru","Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_chbmv.c000066400000000000000000000071131427272030600244770ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_chbmv.c * The program is a C interface to chbmv * * Keita Teranishi 5/18/98 * */ #include "cblas.h" #include "cblas_f77.h" #include #include void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,f77_int N,f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_incX incX #define F77_incY incY #endif int n, i=0; const float *xx= (float *)X, *alp= (float *)alpha, *bet = (float *)beta; float ALPHA[2],BETA[2]; int tincY, tincx; float *x=(float *)X, *y=(float *)Y, *st=0, *tx; extern int CBLAS_CallFromC; extern 
int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_chbmv","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_chbmv(F77_UL, &F77_N, &F77_K, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)X, &F77_incX, (scomplex*)beta, (scomplex*)Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; ALPHA[0]= *alp; ALPHA[1]= -alp[1]; BETA[0]= *bet; BETA[1]= -bet[1]; if (N > 0) { n = N << 1; x = malloc(n*sizeof(float)); tx = x; if( incX > 0 ) { i = incX << 1 ; tincx = 2; st= x+n; } else { i = incX *(-2); tincx = -2; st = x-2; x +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; xx += i; } while (x != st); x=tx; #ifdef F77_INT F77_incX = 1; #else incX = 1; #endif if(incY > 0) tincY = incY; else tincY = -incY; y++; i = tincY << 1; n = i * N ; st = y + n; do { *y = -(*y); y += i; } while(y != st); y -= n; } else x = (float *) X; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_chbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_chbmv(F77_UL, &F77_N, &F77_K, (scomplex*)ALPHA, (scomplex*)A ,&F77_lda, (scomplex*)x,&F77_incX, (scomplex*)BETA, (scomplex*)Y, &F77_incY); } else { cblas_xerbla(1, "cblas_chbmv","Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if ( order == CblasRowMajor ) { RowMajorStrg = 1; if(X!=x) free(x); if (N > 0) { do { *y = -(*y); y += i; } while (y != st); } } CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_chemm.c000066400000000000000000000053561427272030600245000ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_chemm.c * This program is a C 
interface to chemm. * Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_chemm(enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc) { char SD, UL; #ifdef F77_CHAR F77_CHAR F77_SD, F77_UL; #else #define F77_SD &SD #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Side == CblasRight) SD='R'; else if ( Side == CblasLeft ) SD='L'; else { cblas_xerbla(2, "cblas_chemm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(3, "cblas_chemm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_SD = C2F_CHAR(&SD); #endif F77_chemm(F77_SD, F77_UL, &F77_M, &F77_N, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)B, &F77_ldb, (scomplex*)beta, (scomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Side == CblasRight) SD='L'; else if ( Side == CblasLeft ) SD='R'; else { cblas_xerbla(2, "cblas_chemm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_chemm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_SD = C2F_CHAR(&SD); #endif F77_chemm(F77_SD, F77_UL, &F77_N, &F77_M, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)B, &F77_ldb, 
(scomplex*)beta, (scomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_chemm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_chemv.c000066400000000000000000000070411427272030600245020ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_chemv.c * The program is a C interface to chemv * * Keita Teranishi 5/18/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_lda lda #define F77_incX incX #define F77_incY incY #endif int n=0, i=0; const float *xx= (float *)X, *alp= (float *)alpha, *bet = (float *)beta; float ALPHA[2],BETA[2]; int tincY, tincx; float *x=(float *)X, *y=(float *)Y, *st=0, *tx; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_chemv","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_chemv(F77_UL, &F77_N, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)X, &F77_incX, (scomplex*)beta, (scomplex*)Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; ALPHA[0]= *alp; ALPHA[1]= -alp[1]; BETA[0]= *bet; BETA[1]= -bet[1]; if (N > 0) { n = N << 1; x = malloc(n*sizeof(float)); tx = x; if( incX > 0 ) { i = incX << 1 ; tincx = 2; st= x+n; } else { i = incX *(-2); tincx = -2; st = x-2; x +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; xx += i; } while (x != st); 
x=tx; #ifdef F77_INT F77_incX = 1; #else incX = 1; #endif if(incY > 0) tincY = incY; else tincY = -incY; y++; i = tincY << 1; n = i * N ; st = y + n; do { *y = -(*y); y += i; } while(y != st); y -= n; } else x = (float *) X; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_chemv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_chemv(F77_UL, &F77_N, (scomplex*)ALPHA, (scomplex*)A, &F77_lda, (scomplex*)x, &F77_incX, (scomplex*)BETA, (scomplex*)Y, &F77_incY); } else { cblas_xerbla(1, "cblas_chemv","Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if ( order == CblasRowMajor ) { RowMajorStrg = 1; if ( X != x ) free(x); if (N > 0) { do { *y = -(*y); y += i; } while (y != st); } } CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_cher.c000066400000000000000000000051131427272030600243170ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_cher.c * The program is a C interface to cher. 
* * Keita Teranishi 5/20/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX ,void *A, f77_int lda) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; #else #define F77_N N #define F77_lda lda #define F77_incX incX #endif int n, i, tincx; float *x=(float *)X, *xx=(float *)X, *tx, *st; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_cher","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_cher(F77_UL, &F77_N, &alpha, (scomplex*)X, &F77_incX, (scomplex*)A, &F77_lda); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_cher","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif if (N > 0) { n = N << 1; x = malloc(n*sizeof(float)); tx = x; if( incX > 0 ) { i = incX << 1 ; tincx = 2; st= x+n; } else { i = incX *(-2); tincx = -2; st = x-2; x +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; xx += i; } while (x != st); x=tx; #ifdef F77_INT F77_incX = 1; #else incX = 1; #endif } else x = (float *) X; F77_cher(F77_UL, &F77_N, &alpha, (scomplex*)x, &F77_incX, (scomplex*)A, &F77_lda); } else { cblas_xerbla(1, "cblas_cher","Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(X!=x) free(x); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif 
cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_cher2.c000066400000000000000000000066621427272030600244130ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_cher2.c * The program is a C interface to cher2. * * Keita Teranishi 3/23/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_lda lda #define F77_incX incX #define F77_incY incY #endif int n, i, j, tincx, tincy; float *x=(float *)X, *xx=(float *)X, *y=(float *)Y, *yy=(float *)Y, *tx, *ty, *stx, *sty; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_cher2","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_cher2(F77_UL, &F77_N, (scomplex*)alpha, (scomplex*)X, &F77_incX, (scomplex*)Y, &F77_incY, (scomplex*)A, &F77_lda); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_cher2","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif if (N > 0) { n = N << 1; x = malloc(n*sizeof(float)); y = malloc(n*sizeof(float)); tx = x; ty = y; if( incX > 0 ) { i = incX << 1 ; tincx = 2; stx= x+n; } else { i = incX *(-2); tincx = -2; stx = x-2; x +=(n-2); } if( incY > 0 ) { j = incY << 1; tincy = 2; sty= y+n; } else { j = incY *(-2); tincy = -2; sty = y-2; y +=(n-2); } do { *x = *xx; 
x[1] = -xx[1]; x += tincx ; xx += i; } while (x != stx); do { *y = *yy; y[1] = -yy[1]; y += tincy ; yy += j; } while (y != sty); x=tx; y=ty; #ifdef F77_INT F77_incX = 1; F77_incY = 1; #else incX = 1; incY = 1; #endif } else { x = (float *) X; y = (float *) Y; } F77_cher2(F77_UL, &F77_N, (scomplex*)alpha, (scomplex*)y, &F77_incY, (scomplex*)x, &F77_incX, (scomplex*)A, &F77_lda); } else { cblas_xerbla(1, "cblas_cher2","Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(X!=x) free(x); if(Y!=y) free(y); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_cher2k.c000066400000000000000000000056451427272030600245660ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_cher2k.c * This program is a C interface to cher2k. * Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc) { char UL, TR; #ifdef F77_CHAR F77_CHAR F77_TR, F77_UL; #else #define F77_TR &TR #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; float ALPHA[2]; const float *alp=(float *)alpha; CBLAS_CallFromC = 1; RowMajorStrg = 0; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_cher2k", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='T'; else if ( Trans == CblasConjTrans ) TR='C'; else if ( Trans == CblasNoTrans ) TR='N'; else { cblas_xerbla(3, "cblas_cher2k", "Illegal 
Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_cher2k(F77_UL, F77_TR, &F77_N, &F77_K, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)B, &F77_ldb, &beta, (scomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(2, "cblas_cher2k", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='N'; else if ( Trans == CblasConjTrans ) TR='N'; else if ( Trans == CblasNoTrans ) TR='C'; else { cblas_xerbla(3, "cblas_cher2k", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif ALPHA[0]= *alp; ALPHA[1]= -alp[1]; F77_cher2k(F77_UL,F77_TR, &F77_N, &F77_K, (scomplex*)ALPHA, (scomplex*)A, &F77_lda, (scomplex*)B, &F77_ldb, &beta, (scomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_cher2k", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_cherk.c000066400000000000000000000052631427272030600245000ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_cherk.c * This program is a C interface to cherk. 
* Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc) { char UL, TR; #ifdef F77_CHAR F77_CHAR F77_TR, F77_UL; #else #define F77_TR &TR #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda; F77_INT F77_ldc=ldc; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_cherk", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='T'; else if ( Trans == CblasConjTrans ) TR='C'; else if ( Trans == CblasNoTrans ) TR='N'; else { cblas_xerbla(3, "cblas_cherk", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_cherk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, (scomplex*)A, &F77_lda, &beta, (scomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_cherk", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='N'; else if ( Trans == CblasConjTrans ) TR='N'; else if ( Trans == CblasNoTrans ) TR='C'; else { cblas_xerbla(3, "cblas_cherk", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_SD = C2F_CHAR(&SD); #endif F77_cherk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, (scomplex*)A, &F77_lda, &beta, (scomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_cherk", 
"Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_chpmv.c000066400000000000000000000067271427272030600245270ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_chpmv.c * The program is a C interface of chpmv * * Keita Teranishi 5/18/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,f77_int N, const void *alpha, const void *AP, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif int n, i=0; const float *xx= (float *)X, *alp= (float *)alpha, *bet = (float *)beta; float ALPHA[2],BETA[2]; int tincY, tincx; float *x=(float *)X, *y=(float *)Y, *st=0, *tx; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_chpmv","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_chpmv(F77_UL, &F77_N, (scomplex*)alpha, (scomplex*)AP, (scomplex*)X, &F77_incX, (scomplex*)beta, (scomplex*)Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; ALPHA[0]= *alp; ALPHA[1]= -alp[1]; BETA[0]= *bet; BETA[1]= -bet[1]; if (N > 0) { n = N << 1; x = malloc(n*sizeof(float)); tx = x; if( incX > 0 ) { i = incX << 1; tincx = 2; st= x+n; } else { i = incX *(-2); tincx = -2; st = x-2; x +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; xx += i; } while (x != st); x=tx; #ifdef F77_INT F77_incX = 1; #else incX = 1; #endif if(incY > 0) tincY = incY; else tincY = -incY; y++; i = tincY << 1; n = i * N ; st 
= y + n; do { *y = -(*y); y += i; } while(y != st); y -= n; } else x = (float *) X; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_chpmv","Illegal Uplo setting, %d\n", Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_chpmv(F77_UL, &F77_N, (scomplex*)ALPHA, (scomplex*)AP, (scomplex*)x, &F77_incX, (scomplex*)BETA, (scomplex*)Y, &F77_incY); } else { cblas_xerbla(1, "cblas_chpmv","Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if ( order == CblasRowMajor ) { RowMajorStrg = 1; if(X!=x) free(x); if (N > 0) { do { *y = -(*y); y += i; } while (y != st); } } CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_chpr.c000066400000000000000000000050021427272030600243270ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_chpr.c * The program is a C interface to chpr. 
* * Keita Teranishi 3/23/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif int n, i, tincx; float *x=(float *)X, *xx=(float *)X, *tx, *st; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_chpr","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_chpr(F77_UL, &F77_N, &alpha, (scomplex*)X, &F77_incX, (scomplex*)A); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_chpr","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif if (N > 0) { n = N << 1; x = malloc(n*sizeof(float)); tx = x; if( incX > 0 ) { i = incX << 1; tincx = 2; st= x+n; } else { i = incX *(-2); tincx = -2; st = x-2; x +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; xx += i; } while (x != st); x=tx; #ifdef F77_INT F77_incX = 1; #else incX = 1; #endif } else x = (float *) X; F77_chpr(F77_UL, &F77_N, &alpha, (scomplex*)x, &F77_incX, (scomplex*)A); } else { cblas_xerbla(1, "cblas_chpr","Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(X!=x) free(x); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_chpr2.c000066400000000000000000000064571427272030600244300ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_chpr2.c * 
The program is a C interface to chpr2. * * Keita Teranishi 5/20/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N,const void *alpha, const void *X, f77_int incX,const void *Y, f77_int incY, void *Ap) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif int n, i, j, tincx, tincy; float *x=(float *)X, *xx=(float *)X, *y=(float *)Y, *yy=(float *)Y, *tx, *ty, *stx, *sty; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_chpr2","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_chpr2(F77_UL, &F77_N, (scomplex*)alpha, (scomplex*)X, &F77_incX, (scomplex*)Y, &F77_incY, (scomplex*)Ap); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_chpr2","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif if (N > 0) { n = N << 1; x = malloc(n*sizeof(float)); y = malloc(n*sizeof(float)); tx = x; ty = y; if( incX > 0 ) { i = incX << 1 ; tincx = 2; stx= x+n; } else { i = incX *(-2); tincx = -2; stx = x-2; x +=(n-2); } if( incY > 0 ) { j = incY << 1; tincy = 2; sty= y+n; } else { j = incY *(-2); tincy = -2; sty = y-2; y +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; xx += i; } while (x != stx); do { *y = *yy; y[1] = -yy[1]; y += tincy ; yy += j; } while (y != sty); x=tx; y=ty; #ifdef F77_INT F77_incX = 1; F77_incY = 1; #else incX = 1; incY = 1; #endif } else { x = (float *) X; y = (void *) Y; } 
F77_chpr2(F77_UL, &F77_N, (scomplex*)alpha, (scomplex*)y, &F77_incY, (scomplex*)x, &F77_incX, (scomplex*)Ap); } else { cblas_xerbla(1, "cblas_chpr2","Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(X!=x) free(x); if(Y!=y) free(y); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_cscal.c000066400000000000000000000007441427272030600244700ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_cscal.c * * The program is a C interface to cscal.f. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_cscal( f77_int N, const void *alpha, void *X, f77_int incX) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_cscal( &F77_N, (scomplex*)alpha, (scomplex*)X, &F77_incX); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_csscal.c000066400000000000000000000007251427272030600246520ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_csscal.c * * The program is a C interface to csscal. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_csscal( f77_int N, float alpha, void *X, f77_int incX) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_csscal( &F77_N, &alpha, (scomplex*)X, &F77_incX); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_cswap.c000066400000000000000000000010241427272030600245100ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_cswap.c * * The program is a C interface to cswap. * * Written by Keita Teranishi. 
2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_cswap( f77_int N, void *X, f77_int incX, void *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_cswap( &F77_N, (scomplex*)X, &F77_incX, (scomplex*)Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_csymm.c000066400000000000000000000053511427272030600245320ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_csymm.c * This program is a C interface to csymm. * Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc) { char SD, UL; #ifdef F77_CHAR F77_CHAR F77_SD, F77_UL; #else #define F77_SD &SD #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Side == CblasRight) SD='R'; else if ( Side == CblasLeft ) SD='L'; else { cblas_xerbla(2, "cblas_csymm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(3, "cblas_csymm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_SD = C2F_CHAR(&SD); #endif F77_csymm(F77_SD, F77_UL, &F77_M, &F77_N, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)B, &F77_ldb, (scomplex*)beta, (scomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( 
Side == CblasRight) SD='L'; else if ( Side == CblasLeft ) SD='R'; else { cblas_xerbla(2, "cblas_csymm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_csymm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_SD = C2F_CHAR(&SD); #endif F77_csymm(F77_SD, F77_UL, &F77_N, &F77_M, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)B, &F77_ldb, (scomplex*)beta, (scomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_csymm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_csyr2k.c000066400000000000000000000055471427272030600246260ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_csyr2k.c * This program is a C interface to csyr2k. * Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc) { char UL, TR; #ifdef F77_CHAR F77_CHAR F77_TR, F77_UL; #else #define F77_TR &TR #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_csyr2k", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='T'; else if ( Trans == CblasConjTrans ) 
TR='C'; else if ( Trans == CblasNoTrans ) TR='N'; else { cblas_xerbla(3, "cblas_csyr2k", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_csyr2k(F77_UL, F77_TR, &F77_N, &F77_K, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)B, &F77_ldb, (scomplex*)beta, (scomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_csyr2k", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='N'; else if ( Trans == CblasConjTrans ) TR='N'; else if ( Trans == CblasNoTrans ) TR='T'; else { cblas_xerbla(3, "cblas_csyr2k", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_csyr2k(F77_UL, F77_TR, &F77_N, &F77_K, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)B, &F77_ldb, (scomplex*)beta, (scomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_csyr2k", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_csyrk.c000066400000000000000000000053461427272030600245410ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_csyrk.c * This program is a C interface to csyrk. 
* Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc) { char UL, TR; #ifdef F77_CHAR F77_CHAR F77_TR, F77_UL; #else #define F77_TR &TR #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda; F77_INT F77_ldc=ldc; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_csyrk", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='T'; else if ( Trans == CblasConjTrans ) TR='C'; else if ( Trans == CblasNoTrans ) TR='N'; else { cblas_xerbla(3, "cblas_csyrk", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_csyrk(F77_UL, F77_TR, &F77_N, &F77_K, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)beta, (scomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_csyrk", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='N'; else if ( Trans == CblasConjTrans ) TR='N'; else if ( Trans == CblasNoTrans ) TR='T'; else { cblas_xerbla(3, "cblas_csyrk", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_csyrk(F77_UL, F77_TR, &F77_N, &F77_K, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)beta, (scomplex*)C, 
&F77_ldc); } else cblas_xerbla(1, "cblas_csyrk", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ctbmv.c000066400000000000000000000077001427272030600245150ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_ctbmv.c * The program is a C interface to ctbmv. * * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_incX incX #endif int n, i=0, tincX; float *st=0, *x=(float *)X; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_ctbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_ctbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ctbmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ctbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, (scomplex*)A, &F77_lda, (scomplex*)X, &F77_incX); } else if (order == 
CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_ctbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { TA = 'N'; if ( N > 0) { if(incX > 0) tincX = incX; else tincX = -incX; i = tincX << 1; n = i * N; x++; st = x + n; do { *x = -(*x); x+= i; } while (x != st); x -= n; } } else { cblas_xerbla(3, "cblas_ctbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ctbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ctbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, (scomplex*)A, &F77_lda, (scomplex*)X, &F77_incX); if (TransA == CblasConjTrans) { if (N > 0) { do { *x = -(*x); x += i; } while (x != st); } } } else cblas_xerbla(1, "cblas_ctbmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ctbsv.c000066400000000000000000000077111427272030600245250ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_ctbsv.c * The program is a C interface to ctbsv. 
* * Keita Teranishi 3/23/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_incX incX #endif int n, i=0, tincX; float *st=0,*x=(float *)X; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_ctbsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_ctbsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ctbsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ctbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, (scomplex*)A, &F77_lda, (scomplex*)X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_ctbsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { TA = 'N'; if ( N > 0) { if ( incX > 0 ) tincX = incX; else 
tincX = -incX; n = N*2*(tincX); x++; st=x+n; i = tincX << 1; do { *x = -(*x); x+=i; } while (x != st); x -= n; } } else { cblas_xerbla(3, "cblas_ctbsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ctbsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ctbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, (scomplex*)A, &F77_lda, (scomplex*)X, &F77_incX); if (TransA == CblasConjTrans) { if (N > 0) { do { *x = -(*x); x+= i; } while (x != st); } } } else cblas_xerbla(1, "cblas_ctbsv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ctpmv.c000066400000000000000000000074101427272030600245310ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_ctpmv.c * The program is a C interface to ctpmv. 
* * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif int n, i=0, tincX; float *st=0,*x=(float *)X; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_ctpmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_ctpmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ctpmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ctpmv( F77_UL, F77_TA, F77_DI, &F77_N, (scomplex*)Ap, (scomplex*)X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_ctpmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { TA = 'N'; if ( N > 0) { if(incX > 0) tincX = incX; else tincX = -incX; i = tincX << 1; n = i * N; x++; st = x + n; do { *x = -(*x); x += i; } while (x != 
st); x -= n; } } else { cblas_xerbla(3, "cblas_ctpmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ctpmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ctpmv( F77_UL, F77_TA, F77_DI, &F77_N, (scomplex*)Ap, (scomplex*)X,&F77_incX); if (TransA == CblasConjTrans) { if (N > 0) { do { *x = -(*x); x += i; } while (x != st); } } } else cblas_xerbla(1, "cblas_ctpmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ctpsv.c000066400000000000000000000074221427272030600245420ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_ctpsv.c * The program is a C interface to ctpsv. * * Keita Teranishi 3/23/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif int n, i=0, tincX; float *st=0, *x=(float*)X; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_ctpsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, 
"cblas_ctpsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ctpsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ctpsv( F77_UL, F77_TA, F77_DI, &F77_N, (scomplex*)Ap, (scomplex*)X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_ctpsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { TA = 'N'; if ( N > 0) { if ( incX > 0 ) tincX = incX; else tincX = -incX; n = N*2*(tincX); x++; st=x+n; i = tincX << 1; do { *x = -(*x); x+=i; } while (x != st); x -= n; } } else { cblas_xerbla(3, "cblas_ctpsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ctpsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ctpsv( F77_UL, F77_TA, F77_DI, &F77_N, (scomplex*)Ap, (scomplex*)X,&F77_incX); if (TransA == CblasConjTrans) { if (N > 0) { do { *x = -(*x); x += i; } while (x != st); } } } else cblas_xerbla(1, "cblas_ctpsv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ctrmm.c000066400000000000000000000076221427272030600245270ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_ctrmm.c * This program is a C interface to ctrmm. 
* Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb) { char UL, TA, SD, DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_SD &SD #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_ldb ldb #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Side == CblasRight ) SD='R'; else if ( Side == CblasLeft ) SD='L'; else { cblas_xerbla(2, "cblas_ctrmm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper ) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(3, "cblas_ctrmm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( TransA == CblasTrans ) TA ='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_ctrmm", "Illegal Trans setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit ) DI='N'; else cblas_xerbla(5, "cblas_ctrmm", "Illegal Diag setting, %d\n", Diag); #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_ctrmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)B, &F77_ldb); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Side == CblasRight ) SD='L'; else if ( Side == CblasLeft ) SD='R'; else { cblas_xerbla(2, "cblas_ctrmm", "Illegal Side setting, %d\n", Side); 
CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper ) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_ctrmm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( TransA == CblasTrans ) TA ='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_ctrmm", "Illegal Trans setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit ) DI='N'; else { cblas_xerbla(5, "cblas_ctrmm", "Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_ctrmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)B, &F77_ldb); } else cblas_xerbla(1, "cblas_ctrmm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ctrmv.c000066400000000000000000000075751427272030600245470ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_ctrmv.c * The program is a C interface to ctrmv. 
* * Keita Teranishi 3/23/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; #else #define F77_N N #define F77_lda lda #define F77_incX incX #endif int n, i=0, tincX; float *st=0,*x=(float *)X; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_ctrmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_ctrmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ctrmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ctrmv( F77_UL, F77_TA, F77_DI, &F77_N, (scomplex*)A, &F77_lda, (scomplex*)X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_ctrmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { TA = 'N'; if ( N > 0) { if(incX > 0) tincX = incX; else tincX = -incX; i = tincX << 1; n = i * N; st = 
x + n; do { x[1] = -x[1]; x+= i; } while (x != st); x -= n; } } else { cblas_xerbla(3, "cblas_ctrmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ctrmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ctrmv( F77_UL, F77_TA, F77_DI, &F77_N, (scomplex*)A, &F77_lda, (scomplex*)X, &F77_incX); if (TransA == CblasConjTrans) { if (N > 0) { do { x[1] = -x[1]; x += i; } while (x != st); } } } else cblas_xerbla(1, "cblas_ctrmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ctrsm.c000066400000000000000000000077141427272030600245370ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_ctrsm.c * This program is a C interface to ctrsm. 
* Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb) { char UL, TA, SD, DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_SD &SD #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_ldb ldb #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Side == CblasRight) SD='R'; else if ( Side == CblasLeft ) SD='L'; else { cblas_xerbla(2, "cblas_ctrsm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(3, "cblas_ctrsm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( TransA == CblasTrans) TA ='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_ctrsm", "Illegal Trans setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit ) DI='N'; else { cblas_xerbla(5, "cblas_ctrsm", "Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_ctrsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, alpha, A, &F77_lda, B, &F77_ldb); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Side == CblasRight) SD='L'; else if ( Side == CblasLeft ) SD='R'; else { cblas_xerbla(2, "cblas_ctrsm", "Illegal Side setting, %d\n", 
Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_ctrsm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( TransA == CblasTrans) TA ='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_ctrsm", "Illegal Trans setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit ) DI='N'; else { cblas_xerbla(5, "cblas_ctrsm", "Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_ctrsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)B, &F77_ldb); } else cblas_xerbla(1, "cblas_ctrsm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ctrsv.c000066400000000000000000000076151427272030600245500ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_ctrsv.c * The program is a C interface to ctrsv. 
* * Keita Teranishi 3/23/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; #else #define F77_N N #define F77_lda lda #define F77_incX incX #endif int n, i=0, tincX; float *st=0,*x=(float *)X; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_ctrsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_ctrsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ctrsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ctrsv( F77_UL, F77_TA, F77_DI, &F77_N, (scomplex*)A, &F77_lda, (scomplex*)X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_ctrsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { TA = 'N'; if ( N > 0) { if ( incX > 0 ) tincX = incX; else tincX = -incX; n = N*2*(tincX); x++; 
st=x+n; i = tincX << 1; do { *x = -(*x); x+=i; } while (x != st); x -= n; } } else { cblas_xerbla(3, "cblas_ctrsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ctrsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ctrsv( F77_UL, F77_TA, F77_DI, &F77_N, (scomplex*)A, &F77_lda, (scomplex*)X, &F77_incX); if (TransA == CblasConjTrans) { if (N > 0) { do { *x = -(*x); x += i; } while (x != st); } } } else cblas_xerbla(1, "cblas_ctrsv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dasum.c000066400000000000000000000010061427272030600245040ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_dasum.c * * The program is a C interface to dasum. * It calls the fortran wrapper before calling dasum. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" double cblas_dasum( f77_int N, const double *X, f77_int incX) { double asum; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_dasum_sub( &F77_N, X, &F77_incX, &asum); return asum; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_daxpy.c000066400000000000000000000010371427272030600245240ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_daxpy.c * * The program is a C interface to daxpy. * * Written by Keita Teranishi. 
2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_daxpy( f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_daxpy( &F77_N, &alpha, X, &F77_incX, Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dcopy.c000066400000000000000000000010071427272030600245120ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_dcopy.c * * The program is a C interface to dcopy. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dcopy( f77_int N, const double *X, f77_int incX, double *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_dcopy( &F77_N, X, &F77_incX, Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ddot.c000066400000000000000000000011531427272030600243300ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_ddot.c * * The program is a C interface to ddot. * It calls the fortran wrapper before calling ddot. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" double cblas_ddot( f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY) { double dot; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_ddot_sub( &F77_N, X, &F77_incX, Y, &F77_incY, &dot); return dot; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dgbmv.c000066400000000000000000000043701427272030600245010ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dgbmv.c * This program is a C interface to dgbmv. 
* Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY) { char TA; #ifdef F77_CHAR F77_CHAR F77_TA; #else #define F77_TA &TA #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; F77_INT F77_KL=KL,F77_KU=KU; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_KL KL #define F77_KU KU #define F77_incX incX #define F77_incY incY #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(2, "cblas_dgbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); #endif F77_dgbmv(F77_TA, &F77_M, &F77_N, &F77_KL, &F77_KU, &alpha, A, &F77_lda, X, &F77_incX, &beta, Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(2, "cblas_dgbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); #endif F77_dgbmv(F77_TA, &F77_N, &F77_M, &F77_KU, &F77_KL, &alpha, A ,&F77_lda, X,&F77_incX, &beta, Y, &F77_incY); } else cblas_xerbla(1, "cblas_dgbmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dgemm.c000066400000000000000000000056551427272030600245020ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dgemm.c * This program is a C 
interface to dgemm. * Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc) { char TA, TB; #ifdef F77_CHAR F77_CHAR F77_TA, F77_TB; #else #define F77_TA &TA #define F77_TB &TB #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if(TransA == CblasTrans) TA='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(2, "cblas_dgemm","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TB='T'; else if ( TransB == CblasConjTrans ) TB='C'; else if ( TransB == CblasNoTrans ) TB='N'; else { cblas_xerbla(3, "cblas_dgemm","Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_dgemm(F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if(TransA == CblasTrans) TB='T'; else if ( TransA == CblasConjTrans ) TB='C'; else if ( TransA == CblasNoTrans ) TB='N'; else { cblas_xerbla(2, "cblas_dgemm","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TA='T'; else if ( TransB == CblasConjTrans ) TA='C'; else if ( TransB == CblasNoTrans ) TA='N'; else { cblas_xerbla(2, "cblas_dgemm","Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; 
RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_dgemm(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, &alpha, B, &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc); } else cblas_xerbla(1, "cblas_dgemm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dgemv.c000066400000000000000000000041411427272030600245000ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dgemv.c * This program is a C interface to dgemv. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY) { char TA; #ifdef F77_CHAR F77_CHAR F77_TA; #else #define F77_TA &TA #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_incX incX #define F77_incY incY #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(2, "cblas_dgemv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); #endif F77_dgemv(F77_TA, &F77_M, &F77_N, &alpha, A, &F77_lda, X, &F77_incX, &beta, Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(2, "cblas_dgemv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = 
C2F_CHAR(&TA); #endif F77_dgemv(F77_TA, &F77_N, &F77_M, &alpha, A, &F77_lda, X, &F77_incX, &beta, Y, &F77_incY); } else cblas_xerbla(1, "cblas_dgemv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dger.c000066400000000000000000000022331427272030600243170ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dger.c * This program is a C interface to dger. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda) { #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_M M #define F77_N N #define F77_incX incX #define F77_incY incY #define F77_lda lda #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { F77_dger( &F77_M, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A, &F77_lda); } else if (order == CblasRowMajor) { RowMajorStrg = 1; F77_dger( &F77_N, &F77_M ,&alpha, Y, &F77_incY, X, &F77_incX, A, &F77_lda); } else cblas_xerbla(1, "cblas_dger", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dnrm2.c000066400000000000000000000010051427272030600244140ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_dnrm2.c * * The program is a C interface to dnrm2. * It calls the fortranwrapper before calling dnrm2. * * Written by Keita Teranishi. 
2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" double cblas_dnrm2( f77_int N, const double *X, f77_int incX) { double nrm2; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_dnrm2_sub( &F77_N, X, &F77_incX, &nrm2); return nrm2; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_drot.c000066400000000000000000000010351427272030600243450ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_drot.c * * The program is a C interface to drot. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_drot(&F77_N, X, &F77_incX, Y, &F77_incY, &c, &s); return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_drotg.c000066400000000000000000000004571427272030600245230ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_drotg.c * * The program is a C interface to drotg. * * Written by Keita Teranishi. 
2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_drotg( double *a, double *b, double *c, double *s) { F77_drotg(a,b,c,s); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_drotm.c000066400000000000000000000006441427272030600245270ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS #include "cblas.h" #include "cblas_f77.h" void cblas_drotm( f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_drotm( &F77_N, X, &F77_incX, Y, &F77_incY, P); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_drotmg.c000066400000000000000000000005421427272030600246730ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_drotmg.c * * The program is a C interface to drotmg. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_drotmg( double *d1, double *d2, double *b1, const double b2, double *p) { F77_drotmg(d1,d2,b1,&b2,p); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dsbmv.c000066400000000000000000000037541427272030600245220ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dsbmv.c * This program is a C interface to dsbmv. 
* Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_incX incX #define F77_incY incY #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_dsbmv","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_dsbmv(F77_UL, &F77_N, &F77_K, &alpha, A, &F77_lda, X, &F77_incX, &beta, Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_dsbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_dsbmv(F77_UL, &F77_N, &F77_K, &alpha, A ,&F77_lda, X,&F77_incX, &beta, Y, &F77_incY); } else cblas_xerbla(1, "cblas_dsbmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dscal.c000066400000000000000000000007121427272030600244640ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_dscal.c * * The program is a C interface to dscal. * * Written by Keita Teranishi. 
2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dscal( f77_int N, double alpha, double *X, f77_int incX) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_dscal( &F77_N, &alpha, X, &F77_incX); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dsdot.c000066400000000000000000000011601427272030600245110ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_dsdot.c * * The program is a C interface to dsdot. * It calls fthe fortran wrapper before calling dsdot. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" double cblas_dsdot( f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY) { double dot; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_dsdot_sub( &F77_N, X, &F77_incX, Y, &F77_incY, &dot); return dot; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dspmv.c000066400000000000000000000035651427272030600245400ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dspmv.c * This program is a C interface to dspmv. 
* Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *AP, const double *X, f77_int incX, double beta, double *Y, f77_int incY) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_dspmv","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_dspmv(F77_UL, &F77_N, &alpha, AP, X, &F77_incX, &beta, Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_dspmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_dspmv(F77_UL, &F77_N, &alpha, AP, X,&F77_incX, &beta, Y, &F77_incY); } else cblas_xerbla(1, "cblas_dspmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dspr.c000066400000000000000000000032301427272030600243440ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dspr.c * This program is a C interface to dspr. 
* Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_dspr","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_dspr(F77_UL, &F77_N, &alpha, X, &F77_incX, Ap); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasLower) UL = 'U'; else if (Uplo == CblasUpper) UL = 'L'; else { cblas_xerbla(2, "cblas_dspr","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_dspr(F77_UL, &F77_N, &alpha, X, &F77_incX, Ap); } else cblas_xerbla(1, "cblas_dspr", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dspr2.c000066400000000000000000000033741427272030600244370ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_dspr2.c * The program is a C interface to dspr2. 
* * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_dspr2","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_dspr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasLower) UL = 'U'; else if (Uplo == CblasUpper) UL = 'L'; else { cblas_xerbla(2, "cblas_dspr2","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_dspr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A); } else cblas_xerbla(1, "cblas_dspr2", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dswap.c000066400000000000000000000010021427272030600245050ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_dswap.c * * The program is a C interface to dswap. * * Written by Keita Teranishi. 
2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dswap( f77_int N, double *X, f77_int incX, double *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_dswap( &F77_N, X, &F77_incX, Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dsymm.c000066400000000000000000000051761427272030600245400ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dsymm.c * This program is a C interface to dsymm. * Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc) { char SD, UL; #ifdef F77_CHAR F77_CHAR F77_SD, F77_UL; #else #define F77_SD &SD #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Side == CblasRight) SD='R'; else if ( Side == CblasLeft ) SD='L'; else { cblas_xerbla(2, "cblas_dsymm","Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(3, "cblas_dsymm","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_SD = C2F_CHAR(&SD); #endif F77_dsymm(F77_SD, F77_UL, &F77_M, &F77_N, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Side == CblasRight) SD='L'; else if ( Side == CblasLeft ) SD='R'; else { 
cblas_xerbla(2, "cblas_dsymm","Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_dsymm","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_SD = C2F_CHAR(&SD); #endif F77_dsymm(F77_SD, F77_UL, &F77_N, &F77_M, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); } else cblas_xerbla(1, "cblas_dsymm","Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dsymv.c000066400000000000000000000036651427272030600245520ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dsymv.c * This program is a C interface to dsymv. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_lda lda #define F77_incX incX #define F77_incY incY #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_dsymv","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_dsymv(F77_UL, &F77_N, &alpha, A, &F77_lda, X, &F77_incX, &beta, Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_dsymv","Illegal 
Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_dsymv(F77_UL, &F77_N, &alpha, A ,&F77_lda, X,&F77_incX, &beta, Y, &F77_incY); } else cblas_xerbla(1, "cblas_dsymv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dsyr.c000066400000000000000000000033451427272030600243640ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dsyr.c * This program is a C interface to dsyr. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const double alpha, const double *X, f77_int incX, double *A, f77_int lda) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_lda=lda; #else #define F77_N N #define F77_incX incX #define F77_lda lda #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_dsyr","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_dsyr(F77_UL, &F77_N, &alpha, X, &F77_incX, A, &F77_lda); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasLower) UL = 'U'; else if (Uplo == CblasUpper) UL = 'L'; else { cblas_xerbla(2, "cblas_dsyr","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_dsyr(F77_UL, &F77_N, &alpha, X, &F77_incX, A, &F77_lda); } else cblas_xerbla(1, "cblas_dsyr", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif 
cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dsyr2.c000066400000000000000000000036141427272030600244450ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dsyr2.c * This program is a C interface to dsyr2. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY, F77_lda=lda; #else #define F77_N N #define F77_incX incX #define F77_incY incY #define F77_lda lda #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_dsyr2","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_dsyr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A, &F77_lda); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasLower) UL = 'U'; else if (Uplo == CblasUpper) UL = 'L'; else { cblas_xerbla(2, "cblas_dsyr2","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_dsyr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A, &F77_lda); } else cblas_xerbla(1, "cblas_dsyr2", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dsyr2k.c000066400000000000000000000054111427272030600246150ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dsyr2k.c * This program is a C interface to dsyr2k. 
* Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc) { char UL, TR; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL; #else #define F77_TR &TR #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_dsyr2k","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='T'; else if ( Trans == CblasConjTrans ) TR='C'; else if ( Trans == CblasNoTrans ) TR='N'; else { cblas_xerbla(3, "cblas_dsyr2k","Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_dsyr2k(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_dsyr2k","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='N'; else if ( Trans == CblasConjTrans ) TR='N'; else if ( Trans == CblasNoTrans ) TR='T'; else { cblas_xerbla(3, "cblas_dsyr2k","Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_dsyr2k(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, 
&F77_ldc); } else cblas_xerbla(1, "cblas_dsyr2k","Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dsyrk.c000066400000000000000000000052141427272030600245340ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dsyrk.c * This program is a C interface to dsyrk. * Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc) { char UL, TR; #ifdef F77_CHAR F77_CHAR F77_TR, F77_UL; #else #define F77_TR &TR #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda; F77_INT F77_ldc=ldc; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_dsyrk","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='T'; else if ( Trans == CblasConjTrans ) TR='C'; else if ( Trans == CblasNoTrans ) TR='N'; else { cblas_xerbla(3, "cblas_dsyrk","Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_dsyrk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, &beta, C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_dsyrk","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='N'; else if ( Trans == CblasConjTrans ) TR='N'; else if ( 
Trans == CblasNoTrans ) TR='T'; else { cblas_xerbla(3, "cblas_dsyrk","Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_dsyrk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, &beta, C, &F77_ldc); } else cblas_xerbla(1, "cblas_dsyrk","Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dtbmv.c000066400000000000000000000063541427272030600245220ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_dtbmv.c * The program is a C interface to dtbmv. * * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_incX incX #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_dtbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_dtbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_dtbmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; 
RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_dtbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_dtbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(3, "cblas_dtbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_dtbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_dtbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, &F77_incX); } else cblas_xerbla(1, "cblas_dtbmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dtbsv.c000066400000000000000000000063661427272030600245330ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_dtbsv.c * The program is a C interface to dtbsv. 
* * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_incX incX #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_dtbsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_dtbsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_dtbsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_dtbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_dtbsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(3, "cblas_dtbsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 
0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_dtbsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_dtbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, &F77_incX); } else cblas_xerbla(1, "cblas_dtbsv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dtpmv.c000066400000000000000000000060771427272030600245420ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_dtpmv.c * The program is a C interface to dtpmv. * * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_dtpmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_dtpmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_dtpmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; 
RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_dtpmv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_dtpmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(3, "cblas_dtpmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_dtpmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_dtpmv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X,&F77_incX); } else cblas_xerbla(1, "cblas_dtpmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dtpsv.c000066400000000000000000000061001427272030600245330ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_dtpsv.c * The program is a C interface to dtpsv. 
* * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_dtpsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_dtpsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_dtpsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_dtpsv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_dtpsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(3, "cblas_dtpsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { 
cblas_xerbla(4, "cblas_dtpsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_dtpsv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X,&F77_incX); } else cblas_xerbla(1, "cblas_dtpsv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dtrmm.c000066400000000000000000000076151427272030600245320ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dtrmm.c * This program is a C interface to dtrmm. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb) { char UL, TA, SD, DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_SD &SD #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_ldb ldb #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Side == CblasRight) SD='R'; else if ( Side == CblasLeft ) SD='L'; else { cblas_xerbla(2, "cblas_dtrmm","Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(3, "cblas_dtrmm","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( TransA == CblasTrans) TA ='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_dtrmm","Illegal Trans setting, %d\n", 
TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit ) DI='N'; else { cblas_xerbla(5, "cblas_dtrmm","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_dtrmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, &alpha, A, &F77_lda, B, &F77_ldb); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Side == CblasRight) SD='L'; else if ( Side == CblasLeft ) SD='R'; else { cblas_xerbla(2, "cblas_dtrmm","Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_dtrmm","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( TransA == CblasTrans) TA ='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_dtrmm","Illegal Trans setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit ) DI='N'; else { cblas_xerbla(5, "cblas_dtrmm","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_dtrmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, &alpha, A, &F77_lda, B, &F77_ldb); } else cblas_xerbla(1, "cblas_dtrmm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dtrmv.c000066400000000000000000000063151427272030600245370ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dtrmv.c * This program is a C interface to sgemv. 
* Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; #else #define F77_N N #define F77_lda lda #define F77_incX incX #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_dtrmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_dtrmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_dtrmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_dtrmv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_dtrmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(3, "cblas_dtrmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == 
CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_dtrmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_dtrmv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, &F77_incX); } else cblas_xerbla(1, "cblas_dtrmv", "Illegal order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dtrsm.c000066400000000000000000000077351427272030600245430ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dtrsm.c * This program is a C interface to dtrsm. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb) { char UL, TA, SD, DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_SD &SD #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_ldb ldb #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if ( Side == CblasRight) SD='R'; else if ( Side == CblasLeft ) SD='L'; else { cblas_xerbla(2, "cblas_dtrsm","Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if ( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower) UL='L'; else { cblas_xerbla(3, "cblas_dtrsm","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if ( TransA == CblasTrans ) TA='T'; else if ( TransA == CblasConjTrans) TA='C'; else if ( TransA == CblasNoTrans ) 
TA='N'; else { cblas_xerbla(4, "cblas_dtrsm","Illegal Trans setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if ( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit) DI='N'; else { cblas_xerbla(5, "cblas_dtrsm","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_dtrsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, &alpha, A, &F77_lda, B, &F77_ldb); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if ( Side == CblasRight) SD='L'; else if ( Side == CblasLeft ) SD='R'; else { cblas_xerbla(2, "cblas_dtrsm","Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if ( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower) UL='U'; else { cblas_xerbla(3, "cblas_dtrsm","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if ( TransA == CblasTrans ) TA='T'; else if ( TransA == CblasConjTrans) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_dtrsm","Illegal Trans setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if ( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit) DI='N'; else { cblas_xerbla(5, "cblas_dtrsm","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_dtrsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, &alpha, A, &F77_lda, B, &F77_ldb); } else cblas_xerbla(1, "cblas_dtrsm","Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dtrsv.c000066400000000000000000000062771427272030600245540ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_dtrsv.c * The program is a 
C interface to dtrsv. * * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; #else #define F77_N N #define F77_lda lda #define F77_incX incX #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_dtrsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_dtrsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_dtrsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_dtrsv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_dtrsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(3, "cblas_dtrsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag 
== CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_dtrsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_dtrsv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, &F77_incX); } else cblas_xerbla(1, "cblas_dtrsv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dzasum.c000066400000000000000000000010111427272030600246720ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_dzasum.c * * The program is a C interface to dzasum. * It calls the fortran wrapper before calling dzasum. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" double cblas_dzasum( f77_int N, const void *X, f77_int incX) { double asum; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_dzasum_sub( &F77_N, X, &F77_incX, &asum); return asum; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_dznrm2.c000066400000000000000000000010111427272030600246030ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_dznrm2.c * * The program is a C interface to dznrm2. * It calls the fortran wrapper before calling dznrm2. * * Written by Keita Teranishi. 
2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" double cblas_dznrm2( f77_int N, const void *X, f77_int incX) { double nrm2; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_dznrm2_sub( &F77_N, X, &F77_incX, &nrm2); return nrm2; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_f77.h000066400000000000000000000143401427272030600240100ustar00rootroot00000000000000/* cblas_f77.h Written by Keita Teranishi Updated by Jeff Horner Merged cblas_f77.h and cblas_fortran_header.h (Heavily hacked down from the original) */ /* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef CBLAS_F77_H #define CBLAS_F77_H /* * Level 1 BLAS */ #define F77_xerbla xerbla_ #define F77_srotg srotg_ #define F77_srotmg srotmg_ #define F77_srot srot_ #define F77_srotm srotm_ #define F77_drotg drotg_ #define F77_drotmg drotmg_ #define F77_drot drot_ #define F77_drotm drotm_ #define F77_sswap sswap_ #define F77_scopy scopy_ #define F77_saxpy saxpy_ #define F77_isamax_sub isamaxsub_ #define F77_dswap dswap_ #define F77_dcopy dcopy_ #define F77_daxpy daxpy_ #define F77_idamax_sub idamaxsub_ #define F77_cswap cswap_ #define F77_ccopy ccopy_ #define F77_caxpy caxpy_ #define F77_icamax_sub icamaxsub_ #define F77_zswap zswap_ #define F77_zcopy zcopy_ #define F77_zaxpy zaxpy_ #define F77_izamax_sub izamaxsub_ #define F77_sdot_sub sdotsub_ #define F77_ddot_sub ddotsub_ #define F77_dsdot_sub dsdotsub_ #define F77_sscal sscal_ #define F77_dscal dscal_ #define F77_cscal cscal_ #define F77_zscal zscal_ #define F77_csscal csscal_ #define F77_zdscal zdscal_ #define F77_cdotu_sub cdotusub_ #define F77_cdotc_sub cdotcsub_ #define F77_zdotu_sub zdotusub_ #define F77_zdotc_sub zdotcsub_ #define F77_snrm2_sub snrm2sub_ #define F77_sasum_sub sasumsub_ #define F77_dnrm2_sub dnrm2sub_ #define F77_dasum_sub dasumsub_ #define F77_scnrm2_sub scnrm2sub_ #define F77_scasum_sub scasumsub_ #define F77_dznrm2_sub dznrm2sub_ #define F77_dzasum_sub dzasumsub_ #define F77_sdsdot_sub sdsdotsub_ /* * Level 2 BLAS */ #define F77_ssymv ssymv_ #define F77_ssbmv ssbmv_ #define 
F77_sspmv sspmv_ #define F77_sger sger_ #define F77_ssyr ssyr_ #define F77_sspr sspr_ #define F77_ssyr2 ssyr2_ #define F77_sspr2 sspr2_ #define F77_dsymv dsymv_ #define F77_dsbmv dsbmv_ #define F77_dspmv dspmv_ #define F77_dger dger_ #define F77_dsyr dsyr_ #define F77_dspr dspr_ #define F77_dsyr2 dsyr2_ #define F77_dspr2 dspr2_ #define F77_chemv chemv_ #define F77_chbmv chbmv_ #define F77_chpmv chpmv_ #define F77_cgeru cgeru_ #define F77_cgerc cgerc_ #define F77_cher cher_ #define F77_chpr chpr_ #define F77_cher2 cher2_ #define F77_chpr2 chpr2_ #define F77_zhemv zhemv_ #define F77_zhbmv zhbmv_ #define F77_zhpmv zhpmv_ #define F77_zgeru zgeru_ #define F77_zgerc zgerc_ #define F77_zher zher_ #define F77_zhpr zhpr_ #define F77_zher2 zher2_ #define F77_zhpr2 zhpr2_ #define F77_sgemv sgemv_ #define F77_sgbmv sgbmv_ #define F77_strmv strmv_ #define F77_stbmv stbmv_ #define F77_stpmv stpmv_ #define F77_strsv strsv_ #define F77_stbsv stbsv_ #define F77_stpsv stpsv_ #define F77_dgemv dgemv_ #define F77_dgbmv dgbmv_ #define F77_dtrmv dtrmv_ #define F77_dtbmv dtbmv_ #define F77_dtpmv dtpmv_ #define F77_dtrsv dtrsv_ #define F77_dtbsv dtbsv_ #define F77_dtpsv dtpsv_ #define F77_cgemv cgemv_ #define F77_cgbmv cgbmv_ #define F77_ctrmv ctrmv_ #define F77_ctbmv ctbmv_ #define F77_ctpmv ctpmv_ #define F77_ctrsv ctrsv_ #define F77_ctbsv ctbsv_ #define F77_ctpsv ctpsv_ #define F77_zgemv zgemv_ #define F77_zgbmv zgbmv_ #define F77_ztrmv ztrmv_ #define F77_ztbmv ztbmv_ #define F77_ztpmv ztpmv_ #define F77_ztrsv ztrsv_ #define F77_ztbsv ztbsv_ #define F77_ztpsv ztpsv_ /* * Level 3 BLAS */ #define F77_chemm chemm_ #define F77_cherk cherk_ #define F77_cher2k cher2k_ #define F77_zhemm zhemm_ #define F77_zherk zherk_ #define F77_zher2k zher2k_ #define F77_sgemm sgemm_ #define F77_ssymm ssymm_ #define F77_ssyrk ssyrk_ #define F77_ssyr2k ssyr2k_ #define F77_strmm strmm_ #define F77_strsm strsm_ #define F77_dgemm dgemm_ #define F77_dsymm dsymm_ #define F77_dsyrk dsyrk_ #define F77_dsyr2k 
dsyr2k_ #define F77_dtrmm dtrmm_ #define F77_dtrsm dtrsm_ #define F77_cgemm cgemm_ #define F77_csymm csymm_ #define F77_csyrk csyrk_ #define F77_csyr2k csyr2k_ #define F77_ctrmm ctrmm_ #define F77_ctrsm ctrsm_ #define F77_zgemm zgemm_ #define F77_zsymm zsymm_ #define F77_zsyrk zsyrk_ #define F77_zsyr2k zsyr2k_ #define F77_ztrmm ztrmm_ #define F77_ztrsm ztrsm_ /* * BLAS extensions */ #define F77_saxpby saxpby_ #define F77_daxpby daxpby_ #define F77_caxpby caxpby_ #define F77_zaxpby zaxpby_ #define F77_sgemmt sgemmt_ #define F77_dgemmt dgemmt_ #define F77_cgemmt cgemmt_ #define F77_zgemmt zgemmt_ #define F77_sgemm_batch sgemm_batch_ #define F77_dgemm_batch dgemm_batch_ #define F77_cgemm_batch cgemm_batch_ #define F77_zgemm_batch zgemm_batch_ #define F77_cgemm3m cgemm3m_ #define F77_zgemm3m zgemm3m_ #endif /* CBLAS_F77_H */ cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_globals.c000066400000000000000000000001351427272030600250200ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS int CBLAS_CallFromC=0; int RowMajorStrg=0; #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_icamax.c000066400000000000000000000010461427272030600246410ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_icamax.c * * The program is a C interface to icamax. * It calls the fortran wrapper before calling icamax. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" f77_int cblas_icamax( f77_int N, const void *X, f77_int incX) { f77_int iamax; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_icamax_sub( &F77_N, (scomplex*)X, &F77_incX, &iamax); return iamax ? iamax-1 : 0; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_idamax.c000066400000000000000000000010351427272030600246400ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_idamax.c * * The program is a C interface to idamax. 
* It calls the fortran wrapper before calling idamax. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" f77_int cblas_idamax( f77_int N, const double *X, f77_int incX) { f77_int iamax; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_idamax_sub( &F77_N, X, &F77_incX, &iamax); return iamax ? iamax-1 : 0; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_isamax.c000066400000000000000000000010341427272030600246560ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_isamax.c * * The program is a C interface to isamax. * It calls the fortran wrapper before calling isamax. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" f77_int cblas_isamax( f77_int N, const float *X, f77_int incX) { f77_int iamax; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_isamax_sub( &F77_N, X, &F77_incX, &iamax); return iamax ? iamax-1 : 0; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_izamax.c000066400000000000000000000010501427272030600246630ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_izamax.c * * The program is a C interface to izamax. * It calls the fortran wrapper before calling izamax. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" f77_int cblas_izamax( f77_int N, const void *X, f77_int incX) { f77_int iamax; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_izamax_sub( &F77_N, (dcomplex*)X, &F77_incX, &iamax); return (iamax ? iamax-1 : 0); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_sasum.c000066400000000000000000000010031427272030600245200ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_sasum.c * * The program is a C interface to sasum. 
* It calls the fortran wrapper before calling sasum. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" float cblas_sasum( f77_int N, const float *X, f77_int incX) { float asum; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_sasum_sub( &F77_N, X, &F77_incX, &asum); return asum; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_saxpy.c000066400000000000000000000011221427272030600245360ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_saxpy.c * * The program is a C interface to saxpy. * It calls the fortran wrapper before calling saxpy. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_saxpy( f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_saxpy( &F77_N, &alpha, X, &F77_incX, Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_scasum.c000066400000000000000000000010071427272030600246670ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_scasum.c * * The program is a C interface to scasum. * It calls the fortran wrapper before calling scasum. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" float cblas_scasum( f77_int N, const void *X, f77_int incX) { float asum; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_scasum_sub( &F77_N, X, &F77_incX, &asum); return asum; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_scnrm2.c000066400000000000000000000010071427272030600246000ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_scnrm2.c * * The program is a C interface to scnrm2. 
* It calls the fortran wrapper before calling scnrm2. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" float cblas_scnrm2( f77_int N, const void *X, f77_int incX) { float nrm2; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_scnrm2_sub( &F77_N, X, &F77_incX, &nrm2); return nrm2; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_scopy.c000066400000000000000000000010051427272030600245270ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_scopy.c * * The program is a C interface to scopy. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_scopy( f77_int N, const float *X, f77_int incX, float *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_scopy( &F77_N, X, &F77_incX, Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_sdot.c000066400000000000000000000011471427272030600243520ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_sdot.c * * The program is a C interface to sdot. * It calls the fortran wrapper before calling sdot. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" float cblas_sdot( f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY) { float dot; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_sdot_sub( &F77_N, X, &F77_incX, Y, &F77_incY, &dot); return dot; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_sdsdot.c000066400000000000000000000012061427272030600246750ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_sdsdot.c * * The program is a C interface to sdsdot. 
* It calls the fortran wrapper before calling sdsdot. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" float cblas_sdsdot( f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY) { float dot; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_sdsdot_sub( &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, &dot); return dot; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_sgbmv.c000066400000000000000000000043771427272030600245270ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_sgbmv.c * This program is a C interface to sgbmv. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY) { char TA; #ifdef F77_CHAR F77_CHAR F77_TA; #else #define F77_TA &TA #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; F77_INT F77_KL=KL,F77_KU=KU; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_KL KL #define F77_KU KU #define F77_incX incX #define F77_incY incY #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(2, "cblas_sgbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); #endif F77_sgbmv(F77_TA, &F77_M, &F77_N, &F77_KL, &F77_KU, &alpha, A, &F77_lda, X, &F77_incX, &beta, Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (TransA == CblasNoTrans) TA = 'T'; 
else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(2, "cblas_sgbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); #endif F77_sgbmv(F77_TA, &F77_N, &F77_M, &F77_KU, &F77_KL, &alpha, A ,&F77_lda, X, &F77_incX, &beta, Y, &F77_incY); } else cblas_xerbla(1, "cblas_sgbmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_sgemm.c000066400000000000000000000060711427272030600245120ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_sgemm.c * This program is a C interface to sgemm. * Written by Keita Teranishi * 4/8/1998 * * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. * */ #include "cblas.h" #include "cblas_f77.h" void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc) { char TA, TB; #ifdef F77_CHAR F77_CHAR F77_TA, F77_TB; #else #define F77_TA &TA #define F77_TB &TB #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if(TransA == CblasTrans) TA='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(2, "cblas_sgemm", "Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TB='T'; else if ( TransB == CblasConjTrans ) TB='C'; else if ( TransB == CblasNoTrans ) TB='N'; else { cblas_xerbla(3, 
"cblas_sgemm", "Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_sgemm(F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if(TransA == CblasTrans) TB='T'; else if ( TransA == CblasConjTrans ) TB='C'; else if ( TransA == CblasNoTrans ) TB='N'; else { cblas_xerbla(2, "cblas_sgemm", "Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TA='T'; else if ( TransB == CblasConjTrans ) TA='C'; else if ( TransB == CblasNoTrans ) TA='N'; else { cblas_xerbla(2, "cblas_sgemm", "Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_sgemm(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, &alpha, B, &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc); } else cblas_xerbla(1, "cblas_sgemm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_sgemv.c000066400000000000000000000041161427272030600245210ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_sgemv.c * This program is a C interface to sgemv. 
* Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY) { char TA; #ifdef F77_CHAR F77_CHAR F77_TA; #else #define F77_TA &TA #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_incX incX #define F77_incY incY #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(2, "cblas_sgemv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); #endif F77_sgemv(F77_TA, &F77_M, &F77_N, &alpha, A, &F77_lda, X, &F77_incX, &beta, Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(2, "cblas_sgemv", "Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); #endif F77_sgemv(F77_TA, &F77_N, &F77_M, &alpha, A, &F77_lda, X, &F77_incX, &beta, Y, &F77_incY); } else cblas_xerbla(1, "cblas_sgemv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_sger.c000066400000000000000000000021771427272030600243450ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_sger.c * This program is a C interface to sger. 
* Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, const float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda) { #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_M M #define F77_N N #define F77_incX incX #define F77_incY incY #define F77_lda lda #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { F77_sger( &F77_M, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A, &F77_lda); } else if (order == CblasRowMajor) { RowMajorStrg = 1; F77_sger( &F77_N, &F77_M, &alpha, Y, &F77_incY, X, &F77_incX, A, &F77_lda); } else cblas_xerbla(1, "cblas_sger", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_snrm2.c000066400000000000000000000010031427272030600244310ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_snrm2.c * * The program is a C interface to snrm2. * It calls the fortran wrapper before calling snrm2. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" float cblas_snrm2( f77_int N, const float *X, f77_int incX) { float nrm2; #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_snrm2_sub( &F77_N, X, &F77_incX, &nrm2); return nrm2; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_srot.c000066400000000000000000000010441427272030600243640ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_srot.c * * The program is a C interface to srot. * * Written by Keita Teranishi. 
2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_srot( f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_srot(&F77_N, X, &F77_incX, Y, &F77_incY, &c, &s); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_srotg.c000066400000000000000000000004531427272030600245360ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_srotg.c * * The program is a C interface to srotg. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_srotg( float *a, float *b, float *c, float *s) { F77_srotg(a,b,c,s); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_srotm.c000066400000000000000000000010261427272030600245410ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_srotm.c * * The program is a C interface to srotm. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_srotm( f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_srotm( &F77_N, X, &F77_incX, Y, &F77_incY, P); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_srotmg.c000066400000000000000000000005351427272030600247140ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_srotmg.c * * The program is a C interface to srotmg. * * Written by Keita Teranishi. 
2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_srotmg( float *d1, float *d2, float *b1, const float b2, float *p) { F77_srotmg(d1,d2,b1,&b2,p); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ssbmv.c000066400000000000000000000036411427272030600245340ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_ssbmv.c * This program is a C interface to ssbmv. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_incX incX #define F77_incY incY #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_ssbmv","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_ssbmv(F77_UL, &F77_N, &F77_K, &alpha, A, &F77_lda, X, &F77_incX, &beta, Y, &F77_incY); }else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_ssbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_ssbmv(F77_UL, &F77_N, &F77_K, &alpha, A, &F77_lda, X, &F77_incX, &beta, Y, &F77_incY); } else cblas_xerbla(1, "cblas_ssbmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif 
cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_sscal.c000066400000000000000000000007101427272030600245010ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_sscal.c * * The program is a C interface to sscal. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_sscal( f77_int N, float alpha, float *X, f77_int incX) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_sscal( &F77_N, &alpha, X, &F77_incX); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_sspmv.c000066400000000000000000000035421427272030600245520ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_sspmv.c * This program is a C interface to sspmv. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *AP, const float *X, f77_int incX, float beta, float *Y, f77_int incY) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_sspmv","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_sspmv(F77_UL, &F77_N, &alpha, AP, X, &F77_incX, &beta, Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_sspmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = 
C2F_CHAR(&UL); #endif F77_sspmv(F77_UL, &F77_N, &alpha, AP, X,&F77_incX, &beta, Y, &F77_incY); } else cblas_xerbla(1, "cblas_sspmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_sspr.c000066400000000000000000000032361427272030600243710ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_sspr.c * This program is a C interface to sspr. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const float alpha, const float *X, f77_int incX, float *Ap) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_sspr","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_sspr(F77_UL, &F77_N, &alpha, X, &F77_incX, Ap); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasLower) UL = 'U'; else if (Uplo == CblasUpper) UL = 'L'; else { cblas_xerbla(2, "cblas_sspr","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_sspr(F77_UL, &F77_N, &alpha, X, &F77_incX, Ap); } else cblas_xerbla(1, "cblas_sspr", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_sspr2.c000066400000000000000000000033751427272030600244570ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_sspr2.c * This program is a C interface 
to sspr2. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_sspr2","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_sspr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasLower) UL = 'U'; else if (Uplo == CblasUpper) UL = 'L'; else { cblas_xerbla(2, "cblas_sspr2","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_sspr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A); } else cblas_xerbla(1, "cblas_sspr2", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_sswap.c000066400000000000000000000010001427272030600245220ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_sswap.c * * The program is a C interface to sswap. * * Written by Keita Teranishi. 
2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_sswap( f77_int N, float *X, f77_int incX, float *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_sswap( &F77_N, X, &F77_incX, Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ssymm.c000066400000000000000000000053101427272030600245450ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_ssymm.c * This program is a C interface to ssymm. * Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc) { char SD, UL; #ifdef F77_CHAR F77_CHAR F77_SD, F77_UL; #else #define F77_SD &SD #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Side == CblasRight) SD='R'; else if ( Side == CblasLeft ) SD='L'; else { cblas_xerbla(2, "cblas_ssymm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(3, "cblas_ssymm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_SD = C2F_CHAR(&SD); #endif F77_ssymm(F77_SD, F77_UL, &F77_M, &F77_N, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Side == CblasRight) SD='L'; else if ( Side == CblasLeft ) SD='R'; else { 
cblas_xerbla(2, "cblas_ssymm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_ssymm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_SD = C2F_CHAR(&SD); #endif F77_ssymm(F77_SD, F77_UL, &F77_N, &F77_M, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); } else cblas_xerbla(1, "cblas_ssymm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ssymv.c000066400000000000000000000036601427272030600245640ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_ssymv.c * This program is a C interface to ssymv. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_lda lda #define F77_incX incX #define F77_incY incY #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_ssymv","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_ssymv(F77_UL, &F77_N, &alpha, A, &F77_lda, X, &F77_incX, &beta, Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_ssymv","Illegal 
Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_ssymv(F77_UL, &F77_N, &alpha, A ,&F77_lda, X,&F77_incX, &beta, Y, &F77_incY); } else cblas_xerbla(1, "cblas_ssymv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ssyr.c000066400000000000000000000033411427272030600243770ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_ssyr.c * This program is a C interface to ssyr. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const float alpha, const float *X, f77_int incX, float *A, f77_int lda) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_lda=lda; #else #define F77_N N #define F77_incX incX #define F77_lda lda #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_ssyr","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_ssyr(F77_UL, &F77_N, &alpha, X, &F77_incX, A, &F77_lda); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasLower) UL = 'U'; else if (Uplo == CblasUpper) UL = 'L'; else { cblas_xerbla(2, "cblas_ssyr","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_ssyr(F77_UL, &F77_N, &alpha, X, &F77_incX, A, &F77_lda); } else cblas_xerbla(1, "cblas_ssyr", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif 
cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ssyr2.c000066400000000000000000000036101427272030600244600ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_ssyr2.c * This program is a C interface to ssyr2. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY, F77_lda=lda; #else #define F77_N N #define F77_incX incX #define F77_incY incY #define F77_lda lda #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_ssyr2","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_ssyr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A, &F77_lda); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasLower) UL = 'U'; else if (Uplo == CblasUpper) UL = 'L'; else { cblas_xerbla(2, "cblas_ssyr2","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_ssyr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A, &F77_lda); } else cblas_xerbla(1, "cblas_ssyr2", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ssyr2k.c000066400000000000000000000055241427272030600246410ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_ssyr2k.c * This program is a C interface to ssyr2k. 
* Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc) { char UL, TR; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL; #else #define F77_TR &TR #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_ssyr2k", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='T'; else if ( Trans == CblasConjTrans ) TR='C'; else if ( Trans == CblasNoTrans ) TR='N'; else { cblas_xerbla(3, "cblas_ssyr2k", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_ssyr2k(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_ssyr2k", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='N'; else if ( Trans == CblasConjTrans ) TR='N'; else if ( Trans == CblasNoTrans ) TR='T'; else { cblas_xerbla(3, "cblas_ssyr2k", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_ssyr2k(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, 
&F77_ldc); } else cblas_xerbla(1, "cblas_ssyr2k", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ssyrk.c000066400000000000000000000053321427272030600245540ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_ssyrk.c * This program is a C interface to ssyrk. * Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc) { char UL, TR; #ifdef F77_CHAR F77_CHAR F77_TR, F77_UL; #else #define F77_TR &TR #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda; F77_INT F77_ldc=ldc; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_ssyrk", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='T'; else if ( Trans == CblasConjTrans ) TR='C'; else if ( Trans == CblasNoTrans ) TR='N'; else { cblas_xerbla(3, "cblas_ssyrk", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_ssyrk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, &beta, C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_ssyrk", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='N'; else if ( Trans == CblasConjTrans ) TR='N'; else if ( 
Trans == CblasNoTrans ) TR='T'; else { cblas_xerbla(3, "cblas_ssyrk", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_ssyrk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, &beta, C, &F77_ldc); } else cblas_xerbla(1, "cblas_ssyrk", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_stbmv.c000066400000000000000000000064051427272030600245360ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_stbmv.c * This program is a C interface to stbmv. * Written by Keita Teranishi * 3/3/1998 */ #include "cblas.h" #include "cblas_f77.h" void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_incX incX #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_stbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_stbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_stbmv","Illegal Diag setting, %d\n", Diag); 
CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_stbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_stbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(3, "cblas_stbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_stbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_stbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, &F77_incX); } else cblas_xerbla(1, "cblas_stbmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_stbsv.c000066400000000000000000000063651427272030600245510ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_stbsv.c * The program is a C interface to stbsv. 
* * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_incX incX #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_stbsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_stbsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_stbsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_stbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_stbsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(3, "cblas_stbsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; 
return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_stbsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_stbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, &F77_incX); } else cblas_xerbla(1, "cblas_stbsv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_stpmv.c000066400000000000000000000061141427272030600245510ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_stpmv.c * This program is a C interface to stpmv. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_stpmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_stpmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_stpmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 
0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_stpmv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_stpmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(3, "cblas_stpmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_stpmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_stpmv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X,&F77_incX); } else cblas_xerbla(1, "cblas_stpmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_stpsv.c000066400000000000000000000060761427272030600245660ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_stpsv.c * The program is a C interface to stpsv. 
* * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_stpsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_stpsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_stpsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_stpsv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_stpsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(3, "cblas_stpsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { 
cblas_xerbla(4, "cblas_stpsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_stpsv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X,&F77_incX); } else cblas_xerbla(1, "cblas_stpsv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_strmm.c000066400000000000000000000075611427272030600245510ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_strmm.c * This program is a C interface to strmm. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb) { char UL, TA, SD, DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_SD &SD #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_ldb ldb #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Side == CblasRight) SD='R'; else if ( Side == CblasLeft ) SD='L'; else { cblas_xerbla(2, "cblas_strmm","Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(3, "cblas_strmm","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( TransA == CblasTrans) TA ='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_strmm","Illegal Trans setting, %d\n", 
TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit ) DI='N'; else { cblas_xerbla(5, "cblas_strmm", "Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_strmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, &alpha, A, &F77_lda, B, &F77_ldb); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Side == CblasRight) SD='L'; else if ( Side == CblasLeft ) SD='R'; else { cblas_xerbla(2, "cblas_strmm","Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_strmm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( TransA == CblasTrans) TA ='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_strmm", "Illegal Trans setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit ) DI='N'; else { cblas_xerbla(5, "cblas_strmm","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_strmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, &alpha, A, &F77_lda, B, &F77_ldb); } else cblas_xerbla(1, "cblas_strmm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_strmv.c000066400000000000000000000063141427272030600245550ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_strmv.c * This program is a C interface to strmv. 
* Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; #else #define F77_N N #define F77_lda lda #define F77_incX incX #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_strmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_strmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_strmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_strmv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_strmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(3, "cblas_strmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == 
CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_strmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_strmv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, &F77_incX); } else cblas_xerbla(1, "cblas_strmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_strsm.c000066400000000000000000000076071427272030600245600ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_strsm.c * This program is a C interface to strsm. * Written by Keita Teranishi * 4/6/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb) { char UL, TA, SD, DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_SD &SD #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_ldb ldb #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Side == CblasRight) SD='R'; else if ( Side == CblasLeft ) SD='L'; else { cblas_xerbla(2, "cblas_strsm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(3, "cblas_strsm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( TransA == CblasTrans) TA ='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; 
else { cblas_xerbla(4, "cblas_strsm", "Illegal Trans setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit ) DI='N'; else { cblas_xerbla(5, "cblas_strsm", "Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_strsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, &alpha, A, &F77_lda, B, &F77_ldb); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Side == CblasRight) SD='L'; else if ( Side == CblasLeft ) SD='R'; else { cblas_xerbla(2, "cblas_strsm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_strsm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( TransA == CblasTrans) TA ='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_strsm", "Illegal Trans setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit ) DI='N'; else { cblas_xerbla(5, "cblas_strsm", "Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_strsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, &alpha, A, &F77_lda, B, &F77_ldb); } else cblas_xerbla(1, "cblas_strsm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_strsv.c000066400000000000000000000062751427272030600245710ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_strsv.c * The program is a C 
interface to strsv. * * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; #else #define F77_N N #define F77_lda lda #define F77_incX incX #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_strsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_strsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_strsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_strsv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_strsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) TA = 'N'; else { cblas_xerbla(3, "cblas_strsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == 
CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_strsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_strsv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, &F77_incX); } else cblas_xerbla(1, "cblas_strsv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_xerbla.c000066400000000000000000000036761427272030600246670ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS #include #include #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_xerbla(f77_int info, const char *rout, const char *form, ...) { extern int RowMajorStrg; char empty[1] = ""; va_list argptr; va_start(argptr, form); if (RowMajorStrg) { if (strstr(rout,"gemm") != 0) { if (info == 5 ) info = 4; else if (info == 4 ) info = 5; else if (info == 11) info = 9; else if (info == 9 ) info = 11; } else if (strstr(rout,"symm") != 0 || strstr(rout,"hemm") != 0) { if (info == 5 ) info = 4; else if (info == 4 ) info = 5; } else if (strstr(rout,"trmm") != 0 || strstr(rout,"trsm") != 0) { if (info == 7 ) info = 6; else if (info == 6 ) info = 7; } else if (strstr(rout,"gemv") != 0) { if (info == 4) info = 3; else if (info == 3) info = 4; } else if (strstr(rout,"gbmv") != 0) { if (info == 4) info = 3; else if (info == 3) info = 4; else if (info == 6) info = 5; else if (info == 5) info = 6; } else if (strstr(rout,"ger") != 0) { if (info == 3) info = 2; else if (info == 2) info = 3; else if (info == 8) info = 6; else if (info == 6) info = 8; } else if ( (strstr(rout,"her2") != 0 || strstr(rout,"hpr2") != 0) && strstr(rout,"her2k") == 0 ) { if (info == 8) info = 6; else if (info == 6) info = 8; } } if (info) fprintf(stderr, "Parameter %jd to routine %s was incorrect\n", ( intmax_t )info, rout); 
vfprintf(stderr, form, argptr); va_end(argptr); if (info && !info) F77_xerbla(empty, &info, 0); /* Force link of our F77 error handler */ exit(-1); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zaxpy.c000066400000000000000000000011001427272030600245410ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zaxpy.c * * The program is a C interface to zaxpy. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zaxpy( f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_zaxpy( &F77_N, (dcomplex*)alpha, (dcomplex*)X, &F77_incX, (dcomplex*)Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zcopy.c000066400000000000000000000010311427272030600245350ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zcopy.c * * The program is a C interface to zcopy. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zcopy( f77_int N, const void *X, f77_int incX, void *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_zcopy( &F77_N, (dcomplex*)X, &F77_incX, (dcomplex*)Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zdotc_sub.c000066400000000000000000000012071427272030600253720ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zdotc_sub.c * * The program is a C interface to zdotc. * It calls the fortran wrapper before calling zdotc. * * Written by Keita Teranishi. 
2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zdotc_sub( f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_zdotc_sub( &F77_N, (dcomplex*)X, &F77_incX, (dcomplex*)Y, &F77_incY, (dcomplex*)dotc); return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zdotu_sub.c000066400000000000000000000012111427272030600254070ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zdotu_sub.c * * The program is a C interface to zdotu. * It calls the fortran wrapper before calling zdotu. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zdotu_sub( f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_zdotu_sub( &F77_N, (dcomplex*)X, &F77_incX, (dcomplex*)Y, &F77_incY, (dcomplex*)dotu); return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zdscal.c000066400000000000000000000007271427272030600246640ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zdscal.c * * The program is a C interface to zdscal. * * Written by Keita Teranishi. 
2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zdscal( f77_int N, double alpha, void *X, f77_int incX) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_zdscal( &F77_N, &alpha, (dcomplex*)X, &F77_incX); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zgbmv.c000066400000000000000000000104171427272030600245260ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zgbmv.c * The program is a C interface of zgbmv * * Keita Teranishi 5/20/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY) { char TA; #ifdef F77_CHAR F77_CHAR F77_TA; #else #define F77_TA &TA #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; F77_INT F77_KL=KL,F77_KU=KU; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_KL KL #define F77_KU KU #define F77_incX incX #define F77_incY incY #endif int n, i=0; const double *xx= (double *)X, *alp= (double *)alpha, *bet = (double *)beta; double ALPHA[2],BETA[2]; int tincY, tincx; double *x=(double *)X, *y=(double *)Y, *st=0, *tx; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(2, "cblas_zgbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); #endif F77_zgbmv(F77_TA, &F77_M, &F77_N, &F77_KL, &F77_KU, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)X, &F77_incX, (dcomplex*)beta, (dcomplex*)Y, &F77_incY); } else if (order == CblasRowMajor) { 
RowMajorStrg = 1; if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { ALPHA[0]= *alp; ALPHA[1]= -alp[1]; BETA[0]= *bet; BETA[1]= -bet[1]; TA = 'N'; if (M > 0) { n = M << 1; x = malloc(n*sizeof(double)); tx = x; if( incX > 0 ) { i = incX << 1 ; tincx = 2; st= x+n; } else { i = incX *(-2); tincx = -2; st = x-2; x +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; xx += i; } while (x != st); x=tx; #ifdef F77_INT F77_incX = 1; #else incX = 1; #endif if( incY > 0 ) tincY = incY; else tincY = -incY; y++; if (N > 0) { i = tincY << 1; n = i * N ; st = y + n; do { *y = -(*y); y += i; } while(y != st); y -= n; } } else x = (double *) X; } else { cblas_xerbla(2, "cblas_zgbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); #endif if (TransA == CblasConjTrans) F77_zgbmv(F77_TA, &F77_N, &F77_M, &F77_KU, &F77_KL, (dcomplex*)ALPHA, (dcomplex*)A ,&F77_lda, (dcomplex*)x,&F77_incX, (dcomplex*)BETA, (dcomplex*)Y, &F77_incY); else F77_zgbmv(F77_TA, &F77_N, &F77_M, &F77_KU, &F77_KL, (dcomplex*)alpha, (dcomplex*)A ,&F77_lda, (dcomplex*)x,&F77_incX, (dcomplex*)beta, (dcomplex*)Y, &F77_incY); if (TransA == CblasConjTrans) { if (x != X) free(x); if (N > 0) { do { *y = -(*y); y += i; } while (y != st); } } } else cblas_xerbla(1, "cblas_zgbmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zgemm.c000066400000000000000000000060501427272030600245160ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_zgemm.c * This program is a C interface to zgemm. 
* Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc) { char TA, TB; #ifdef F77_CHAR F77_CHAR F77_TA, F77_TB; #else #define F77_TA &TA #define F77_TB &TB #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if(TransA == CblasTrans) TA='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(2, "cblas_zgemm","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TB='T'; else if ( TransB == CblasConjTrans ) TB='C'; else if ( TransB == CblasNoTrans ) TB='N'; else { cblas_xerbla(3, "cblas_zgemm","Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_zgemm(F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)B, &F77_ldb, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if(TransA == CblasTrans) TB='T'; else if ( TransA == CblasConjTrans ) TB='C'; else if ( TransA == CblasNoTrans ) TB='N'; else { cblas_xerbla(2, "cblas_zgemm","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TA='T'; else if ( TransB == CblasConjTrans ) TA='C'; else if ( TransB == CblasNoTrans ) TA='N'; else { cblas_xerbla(2, "cblas_zgemm","Illegal TransB setting, %d\n", 
TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_zgemm(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, (dcomplex*)alpha, (dcomplex*)B, &F77_ldb, (dcomplex*)A, &F77_lda, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_zgemm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zgemv.c000066400000000000000000000101731427272030600245300ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zgemv.c * The program is a C interface of zgemv * * Keita Teranishi 5/20/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY) { char TA; #ifdef F77_CHAR F77_CHAR F77_TA; #else #define F77_TA &TA #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_incX incX #define F77_incY incY #endif int n, i=0; const double *xx= (double *)X, *alp= (double *)alpha, *bet = (double *)beta; double ALPHA[2],BETA[2]; int tincY, tincx; double *x=(double *)X, *y=(double *)Y, *st=0, *tx; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(2, "cblas_zgemv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); #endif F77_zgemv(F77_TA, &F77_M, &F77_N, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)X, &F77_incX, (dcomplex*)beta, (dcomplex*)Y, &F77_incY); } else if 
(order == CblasRowMajor) { RowMajorStrg = 1; if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { ALPHA[0]= *alp; ALPHA[1]= -alp[1]; BETA[0]= *bet; BETA[1]= -bet[1]; TA = 'N'; if (M > 0) { n = M << 1; x = malloc(n*sizeof(double)); tx = x; if( incX > 0 ) { i = incX << 1 ; tincx = 2; st= x+n; } else { i = incX *(-2); tincx = -2; st = x-2; x +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; xx += i; } while (x != st); x=tx; #ifdef F77_INT F77_incX = 1; #else incX = 1; #endif if(incY > 0) tincY = incY; else tincY = -incY; y++; if (N > 0) { i = tincY << 1; n = i * N ; st = y + n; do { *y = -(*y); y += i; } while(y != st); y -= n; } } else x = (double *) X; } else { cblas_xerbla(2, "cblas_zgemv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); #endif if (TransA == CblasConjTrans) F77_zgemv(F77_TA, &F77_N, &F77_M, (dcomplex*)ALPHA, (dcomplex*)A, &F77_lda, (dcomplex*)x, &F77_incX, (dcomplex*)BETA, (dcomplex*)Y, &F77_incY); else F77_zgemv(F77_TA, &F77_N, &F77_M, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)x, &F77_incX, (dcomplex*)beta, (dcomplex*)Y, &F77_incY); if (TransA == CblasConjTrans) { if (x != (double *)X) free(x); if (N > 0) { do { *y = -(*y); y += i; } while (y != st); } } } else cblas_xerbla(1, "cblas_zgemv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zgerc.c000066400000000000000000000037721427272030600245210ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zgerc.c * The program is a C interface to zgerc. 
* * Keita Teranishi 5/20/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda) { #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_M M #define F77_N N #define F77_incX incX #define F77_incY incY #define F77_lda lda #endif int n, i, tincy; double *y=(double *)Y, *yy=(double *)Y, *ty, *st; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { F77_zgerc( &F77_M, &F77_N, (dcomplex*)alpha, (dcomplex*)X, &F77_incX, (dcomplex*)Y, &F77_incY, (dcomplex*)A, &F77_lda); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (N > 0) { n = N << 1; y = malloc(n*sizeof(double)); ty = y; if( incY > 0 ) { i = incY << 1; tincy = 2; st= y+n; } else { i = incY *(-2); tincy = -2; st = y-2; y +=(n-2); } do { *y = *yy; y[1] = -yy[1]; y += tincy ; yy += i; } while (y != st); y = ty; #ifdef F77_INT F77_incY = 1; #else incY = 1; #endif } else y = (double *) Y; F77_zgeru( &F77_N, &F77_M, (dcomplex*)alpha, (dcomplex*)y, &F77_incY, (dcomplex*)X, &F77_incX, (dcomplex*)A, &F77_lda); if(Y!=y) free(y); } else cblas_xerbla(1, "cblas_zgerc", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zgeru.c000066400000000000000000000023431427272030600245340ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zgeru.c * The program is a C interface to zgeru. 
* * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda) { #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_M M #define F77_N N #define F77_incX incX #define F77_incY incY #define F77_lda lda #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { F77_zgeru( &F77_M, &F77_N, (dcomplex*)alpha, (dcomplex*)X, &F77_incX, (dcomplex*)Y, &F77_incY, (dcomplex*)A, &F77_lda); } else if (order == CblasRowMajor) { RowMajorStrg = 1; F77_zgeru( &F77_N, &F77_M, (dcomplex*)alpha, (dcomplex*)Y, &F77_incY, (dcomplex*)X, &F77_incX, (dcomplex*)A, &F77_lda); } else cblas_xerbla(1, "cblas_zgeru", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zhbmv.c000066400000000000000000000071251427272030600245310ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zhbmv.c * The program is a C interface to zhbmv * * Keita Teranishi 5/18/98 * */ #include "cblas.h" #include "cblas_f77.h" #include #include void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,f77_int N,f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_incX incX #define F77_incY incY #endif int n, i=0; const double *xx= (double *)X, *alp= (double *)alpha, *bet = (double *)beta; double ALPHA[2],BETA[2]; int tincY, tincx; double *x=(double *)X, *y=(double *)Y, *st=0, *tx; extern int 
CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_zhbmv","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_zhbmv(F77_UL, &F77_N, &F77_K, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)X, &F77_incX, (dcomplex*)beta, (dcomplex*)Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; ALPHA[0]= *alp; ALPHA[1]= -alp[1]; BETA[0]= *bet; BETA[1]= -bet[1]; if (N > 0) { n = N << 1; x = malloc(n*sizeof(double)); tx = x; if( incX > 0 ) { i = incX << 1 ; tincx = 2; st= x+n; } else { i = incX *(-2); tincx = -2; st = x-2; x +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; xx += i; } while (x != st); x=tx; #ifdef F77_INT F77_incX = 1; #else incX = 1; #endif if(incY > 0) tincY = incY; else tincY = -incY; y++; i = tincY << 1; n = i * N ; st = y + n; do { *y = -(*y); y += i; } while(y != st); y -= n; } else x = (double *) X; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_zhbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_zhbmv(F77_UL, &F77_N, &F77_K, (dcomplex*)ALPHA, (dcomplex*)A ,&F77_lda, (dcomplex*)x,&F77_incX, (dcomplex*)BETA, (dcomplex*)Y, &F77_incY); } else { cblas_xerbla(1, "cblas_zhbmv","Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if ( order == CblasRowMajor ) { RowMajorStrg = 1; if(X!=x) free(x); if (N > 0) { do { *y = -(*y); y += i; } while (y != st); } } CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zhemm.c000066400000000000000000000053571427272030600245300ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_zhemm.c 
* This program is a C interface to zhemm. * Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zhemm(enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc) { char SD, UL; #ifdef F77_CHAR F77_CHAR F77_SD, F77_UL; #else #define F77_SD &SD #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Side == CblasRight) SD='R'; else if ( Side == CblasLeft ) SD='L'; else { cblas_xerbla(2, "cblas_zhemm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(3, "cblas_zhemm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_SD = C2F_CHAR(&SD); #endif F77_zhemm(F77_SD, F77_UL, &F77_M, &F77_N, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)B, &F77_ldb, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Side == CblasRight) SD='L'; else if ( Side == CblasLeft ) SD='R'; else { cblas_xerbla(2, "cblas_zhemm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_zhemm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_SD = C2F_CHAR(&SD); #endif F77_zhemm(F77_SD, F77_UL, &F77_N, &F77_M, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, 
(dcomplex*)B, &F77_ldb, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_zhemm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zhemv.c000066400000000000000000000070511427272030600245320ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zhemv.c * The program is a C interface to zhemv * * Keita Teranishi 5/18/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_lda lda #define F77_incX incX #define F77_incY incY #endif int n, i=0; const double *xx= (double *)X, *alp= (double *)alpha, *bet = (double *)beta; double ALPHA[2],BETA[2]; int tincY, tincx; double *x=(double *)X, *y=(double *)Y, *st=0, *tx; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_zhemv","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_zhemv(F77_UL, &F77_N, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)X, &F77_incX, (dcomplex*)beta, (dcomplex*)Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; ALPHA[0]= *alp; ALPHA[1]= -alp[1]; BETA[0]= *bet; BETA[1]= -bet[1]; if (N > 0) { n = N << 1; x = malloc(n*sizeof(double)); tx = x; if( incX > 0 ) { i = incX << 1 ; tincx = 2; st= x+n; } else { i = incX *(-2); tincx = -2; st = x-2; x +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; 
xx += i; } while (x != st); x=tx; #ifdef F77_INT F77_incX = 1; #else incX = 1; #endif if(incY > 0) tincY = incY; else tincY = -incY; y++; i = tincY << 1; n = i * N ; st = y + n; do { *y = -(*y); y += i; } while(y != st); y -= n; } else x = (double *) X; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_zhemv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_zhemv(F77_UL, &F77_N, (dcomplex*)ALPHA, (dcomplex*)A, &F77_lda, (dcomplex*)x, &F77_incX, (dcomplex*)BETA, (dcomplex*)Y, &F77_incY); } else { cblas_xerbla(1, "cblas_zhemv","Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if ( order == CblasRowMajor ) { RowMajorStrg = 1; if ( X != x ) free(x); if (N > 0) { do { *y = -(*y); y += i; } while (y != st); } } CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zher.c000066400000000000000000000050001427272030600243410ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zher.c * The program is a C interface to zher. 
* * Keita Teranishi 5/20/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX ,void *A, f77_int lda) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; #else #define F77_N N #define F77_lda lda #define F77_incX incX #endif int n, i, tincx; double *x=(double *)X, *xx=(double *)X, *tx, *st; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_zher","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_zher(F77_UL, &F77_N, &alpha, (dcomplex*)X, &F77_incX, (dcomplex*)A, &F77_lda); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_zher","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif if (N > 0) { n = N << 1; x = malloc(n*sizeof(double)); tx = x; if( incX > 0 ) { i = incX << 1 ; tincx = 2; st= x+n; } else { i = incX *(-2); tincx = -2; st = x-2; x +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; xx += i; } while (x != st); x=tx; #ifdef F77_INT F77_incX = 1; #else incX = 1; #endif } else x = (double *) X; F77_zher(F77_UL, &F77_N, &alpha, (dcomplex*)x, &F77_incX, (dcomplex*)A, &F77_lda); } else cblas_xerbla(1, "cblas_zher", "Illegal Order setting, %d\n", order); if(X!=x) free(x); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zher2.c000066400000000000000000000067021427272030600244350ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS 
/* * cblas_zher2.c * The program is a C interface to zher2. * * Keita Teranishi 3/23/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_lda lda #define F77_incX incX #define F77_incY incY #endif int n, i, j, tincx, tincy; double *x=(double *)X, *xx=(double *)X, *y=(double *)Y, *yy=(double *)Y, *tx, *ty, *stx, *sty; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_zher2", "Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_zher2(F77_UL, &F77_N, (dcomplex*)alpha, (dcomplex*)X, &F77_incX, (dcomplex*)Y, &F77_incY, (dcomplex*)A, &F77_lda); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_zher2", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif if (N > 0) { n = N << 1; x = malloc(n*sizeof(double)); y = malloc(n*sizeof(double)); tx = x; ty = y; if( incX > 0 ) { i = incX << 1 ; tincx = 2; stx= x+n; } else { i = incX *(-2); tincx = -2; stx = x-2; x +=(n-2); } if( incY > 0 ) { j = incY << 1; tincy = 2; sty= y+n; } else { j = incY *(-2); tincy = -2; sty = y-2; y +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; xx += i; } while (x != stx); do { *y = *yy; y[1] = -yy[1]; y += tincy ; yy += j; } while (y != sty); x=tx; y=ty; #ifdef F77_INT F77_incX = 1; F77_incY 
= 1; #else incX = 1; incY = 1; #endif } else { x = (double *) X; y = (double *) Y; } F77_zher2(F77_UL, &F77_N, (dcomplex*)alpha, (dcomplex*)y, &F77_incY, (dcomplex*)x, &F77_incX, (dcomplex*)A, &F77_lda); } else { cblas_xerbla(1, "cblas_zher2", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(X!=x) free(x); if(Y!=y) free(y); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zher2k.c000066400000000000000000000056451427272030600246150ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_zher2k.c * This program is a C interface to zher2k. * Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc) { char UL, TR; #ifdef F77_CHAR F77_CHAR F77_TR, F77_UL; #else #define F77_TR &TR #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; double ALPHA[2]; const double *alp=(double *)alpha; CBLAS_CallFromC = 1; RowMajorStrg = 0; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_zher2k", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='T'; else if ( Trans == CblasConjTrans ) TR='C'; else if ( Trans == CblasNoTrans ) TR='N'; else { cblas_xerbla(3, "cblas_zher2k", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_zher2k(F77_UL, 
F77_TR, &F77_N, &F77_K, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)B, &F77_ldb, &beta, (dcomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(2, "cblas_zher2k", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='N'; else if ( Trans == CblasConjTrans ) TR='N'; else if ( Trans == CblasNoTrans ) TR='C'; else { cblas_xerbla(3, "cblas_zher2k", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif ALPHA[0]= *alp; ALPHA[1]= -alp[1]; F77_zher2k(F77_UL,F77_TR, &F77_N, &F77_K, (dcomplex*)ALPHA, (dcomplex*)A, &F77_lda, (dcomplex*)B, &F77_ldb, &beta, (dcomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_zher2k", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zherk.c000066400000000000000000000052651427272030600245310ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_zherk.c * This program is a C interface to zherk. 
* Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc) { char UL, TR; #ifdef F77_CHAR F77_CHAR F77_TR, F77_UL; #else #define F77_TR &TR #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda; F77_INT F77_ldc=ldc; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_zherk", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='T'; else if ( Trans == CblasConjTrans ) TR='C'; else if ( Trans == CblasNoTrans ) TR='N'; else { cblas_xerbla(3, "cblas_zherk", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_zherk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, (dcomplex*)A, &F77_lda, &beta, (dcomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_zherk", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='N'; else if ( Trans == CblasConjTrans ) TR='N'; else if ( Trans == CblasNoTrans ) TR='C'; else { cblas_xerbla(3, "cblas_zherk", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_SD = C2F_CHAR(&SD); #endif F77_zherk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, (dcomplex*)A, &F77_lda, &beta, (dcomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_zherk", 
"Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zhpmv.c000066400000000000000000000067411427272030600245520ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zhpmv.c * The program is a C interface of zhpmv * * Keita Teranishi 5/18/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,f77_int N, const void *alpha, const void *AP, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif int n, i=0; const double *xx= (double *)X, *alp= (double *)alpha, *bet = (double *)beta; double ALPHA[2],BETA[2]; int tincY, tincx; double *x=(double *)X, *y=(double *)Y, *st=0, *tx; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_zhpmv","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_zhpmv(F77_UL, &F77_N, (dcomplex*)alpha, (dcomplex*)AP, (dcomplex*)X, &F77_incX, (dcomplex*)beta, (dcomplex*)Y, &F77_incY); } else if (order == CblasRowMajor) { RowMajorStrg = 1; ALPHA[0]= *alp; ALPHA[1]= -alp[1]; BETA[0]= *bet; BETA[1]= -bet[1]; if (N > 0) { n = N << 1; x = malloc(n*sizeof(double)); tx = x; if( incX > 0 ) { i = incX << 1; tincx = 2; st= x+n; } else { i = incX *(-2); tincx = -2; st = x-2; x +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; xx += i; } while (x != st); x=tx; #ifdef F77_INT F77_incX = 1; #else incX = 1; #endif if(incY > 0) tincY = incY; else tincY = -incY; y++; i = tincY << 1; n = i 
* N ; st = y + n; do { *y = -(*y); y += i; } while(y != st); y -= n; } else x = (double *) X; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_zhpmv","Illegal Uplo setting, %d\n", Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_zhpmv(F77_UL, &F77_N, (dcomplex*)ALPHA, (dcomplex*)AP, (dcomplex*)x, &F77_incX, (dcomplex*)BETA, (dcomplex*)Y, &F77_incY); } else { cblas_xerbla(1, "cblas_zhpmv","Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if ( order == CblasRowMajor ) { RowMajorStrg = 1; if(X!=x) free(x); if (N > 0) { do { *y = -(*y); y += i; } while (y != st); } } CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zhpr.c000066400000000000000000000050101427272030600243550ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zhpr.c * The program is a C interface to zhpr. 
* * Keita Teranishi 3/23/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif int n, i, tincx; double *x=(double *)X, *xx=(double *)X, *tx, *st; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_zhpr","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_zhpr(F77_UL, &F77_N, &alpha, (dcomplex*)X, &F77_incX, (dcomplex*)A); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_zhpr","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif if (N > 0) { n = N << 1; x = malloc(n*sizeof(double)); tx = x; if( incX > 0 ) { i = incX << 1; tincx = 2; st= x+n; } else { i = incX *(-2); tincx = -2; st = x-2; x +=(n-2); } do { *x = *xx; x[1] = -xx[1]; x += tincx ; xx += i; } while (x != st); x=tx; #ifdef F77_INT F77_incX = 1; #else incX = 1; #endif } else x = (double *) X; F77_zhpr(F77_UL, &F77_N, &alpha, (dcomplex*)x, &F77_incX, (dcomplex*)A); } else { cblas_xerbla(1, "cblas_zhpr","Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(X!=x) free(x); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zhpr2.c000066400000000000000000000065071427272030600244530ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * 
cblas_zhpr2.c * The program is a C interface to zhpr2. * * Keita Teranishi 5/20/98 * */ #include #include #include "cblas.h" #include "cblas_f77.h" void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N,const void *alpha, const void *X, f77_int incX,const void *Y, f77_int incY, void *Ap) { char UL; #ifdef F77_CHAR F77_CHAR F77_UL; #else #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif int n, i, j; double *x=(double *)X, *xx=(double *)X, *y=(double *)Y, *yy=(double *)Y, *stx, *sty; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasLower) UL = 'L'; else if (Uplo == CblasUpper) UL = 'U'; else { cblas_xerbla(2, "cblas_zhpr2","Illegal Uplo setting, %d\n",Uplo ); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif F77_zhpr2(F77_UL, &F77_N, (dcomplex*)alpha, (dcomplex*)X, &F77_incX, (dcomplex*)Y, &F77_incY, (dcomplex*)Ap); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_zhpr2","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); #endif if (N > 0) { n = N << 1; x = malloc(n*sizeof(double)); y = malloc(n*sizeof(double)); stx = x + n; sty = y + n; if( incX > 0 ) i = incX << 1; else i = incX *(-2); if( incY > 0 ) j = incY << 1; else j = incY *(-2); do { *x = *xx; x[1] = -xx[1]; x += 2; xx += i; } while (x != stx); do { *y = *yy; y[1] = -yy[1]; y += 2; yy += j; } while (y != sty); x -= n; y -= n; #ifdef F77_INT if(incX > 0 ) F77_incX = 1; else F77_incX = -1; if(incY > 0 ) F77_incY = 1; else F77_incY = -1; #else if(incX > 0 ) incX = 1; else incX = -1; if(incY > 0 ) incY = 1; else incY = -1; #endif } else { x = (double *) X; y = (void *) Y; 
} F77_zhpr2(F77_UL, &F77_N, (dcomplex*)alpha, (dcomplex*)y, &F77_incY, (dcomplex*)x, &F77_incX, (dcomplex*)Ap); } else { cblas_xerbla(1, "cblas_zhpr2","Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(X!=x) free(x); if(Y!=y) free(y); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zscal.c000066400000000000000000000007421427272030600245150ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zscal.c * * The program is a C interface to zscal. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zscal( f77_int N, const void *alpha, void *X, f77_int incX) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif F77_zscal( &F77_N, (dcomplex*)alpha, (dcomplex*)X, &F77_incX); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zswap.c000066400000000000000000000010261427272030600245410ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zswap.c * * The program is a C interface to zswap. * * Written by Keita Teranishi. 2/11/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zswap( f77_int N, void *X, f77_int incX, void *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_zswap( &F77_N, (dcomplex*)X, &F77_incX, (dcomplex*)Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zsymm.c000066400000000000000000000053521427272030600245620ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_zsymm.c * This program is a C interface to zsymm. 
* Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc) { char SD, UL; #ifdef F77_CHAR F77_CHAR F77_SD, F77_UL; #else #define F77_SD &SD #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Side == CblasRight) SD='R'; else if ( Side == CblasLeft ) SD='L'; else { cblas_xerbla(2, "cblas_zsymm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(3, "cblas_zsymm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_SD = C2F_CHAR(&SD); #endif F77_zsymm(F77_SD, F77_UL, &F77_M, &F77_N, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)B, &F77_ldb, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Side == CblasRight) SD='L'; else if ( Side == CblasLeft ) SD='R'; else { cblas_xerbla(2, "cblas_zsymm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_zsymm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_SD = C2F_CHAR(&SD); #endif F77_zsymm(F77_SD, F77_UL, &F77_N, &F77_M, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)B, &F77_ldb, (dcomplex*)beta, (dcomplex*)C, 
&F77_ldc); } else cblas_xerbla(1, "cblas_zsymm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zsyr2k.c000066400000000000000000000055471427272030600246550ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_zsyr2k.c * This program is a C interface to zsyr2k. * Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc) { char UL, TR; #ifdef F77_CHAR F77_CHAR F77_TR, F77_UL; #else #define F77_TR &TR #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_zsyr2k", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='T'; else if ( Trans == CblasConjTrans ) TR='C'; else if ( Trans == CblasNoTrans ) TR='N'; else { cblas_xerbla(3, "cblas_zsyr2k", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_zsyr2k(F77_UL, F77_TR, &F77_N, &F77_K, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)B, &F77_ldb, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_zsyr2k", "Illegal Uplo setting, %d\n", 
Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='N'; else if ( Trans == CblasConjTrans ) TR='N'; else if ( Trans == CblasNoTrans ) TR='T'; else { cblas_xerbla(3, "cblas_zsyr2k", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_zsyr2k(F77_UL, F77_TR, &F77_N, &F77_K, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)B, &F77_ldb, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_zsyr2k", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_zsyrk.c000066400000000000000000000053461427272030600245700ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_zsyrk.c * This program is a C interface to zsyrk. * Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc) { char UL, TR; #ifdef F77_CHAR F77_CHAR F77_TR, F77_UL; #else #define F77_TR &TR #define F77_UL &UL #endif #ifdef F77_INT F77_INT F77_N=N, F77_K=K, F77_lda=lda; F77_INT F77_ldc=ldc; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_zsyrk", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='T'; else if ( Trans == CblasConjTrans ) TR='C'; else if ( Trans == CblasNoTrans ) TR='N'; else { cblas_xerbla(3, "cblas_zsyrk", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; 
RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_zsyrk(F77_UL, F77_TR, &F77_N, &F77_K, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_zsyrk", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Trans == CblasTrans) TR ='N'; else if ( Trans == CblasConjTrans ) TR='N'; else if ( Trans == CblasNoTrans ) TR='T'; else { cblas_xerbla(3, "cblas_zsyrk", "Illegal Trans setting, %d\n", Trans); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TR = C2F_CHAR(&TR); #endif F77_zsyrk(F77_UL, F77_TR, &F77_N, &F77_K, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_zsyrk", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ztbmv.c000066400000000000000000000077021427272030600245460ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_ztbmv.c * The program is a C interface to ztbmv. 
* * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_incX incX #endif int n, i=0, tincX; double *st=0, *x=(double *)X; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_ztbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_ztbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ztbmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ztbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, (dcomplex*)A, &F77_lda, (dcomplex*)X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_ztbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { TA = 'N'; if ( N > 0) { if(incX > 0) tincX = incX; else 
tincX = -incX; i = tincX << 1; n = i * N; x++; st = x + n; do { *x = -(*x); x+= i; } while (x != st); x -= n; } } else { cblas_xerbla(3, "cblas_ztbmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ztbmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ztbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, (dcomplex*)A, &F77_lda, (dcomplex*)X, &F77_incX); if (TransA == CblasConjTrans) { if (N > 0) { do { *x = -(*x); x += i; } while (x != st); } } } else cblas_xerbla(1, "cblas_ztbmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ztbsv.c000066400000000000000000000077131427272030600245560ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_ztbsv.c * The program is a C interface to ztbsv. 
* * Keita Teranishi 3/23/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; #else #define F77_N N #define F77_K K #define F77_lda lda #define F77_incX incX #endif int n, i=0, tincX; double *st=0,*x=(double *)X; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_ztbsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_ztbsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ztbsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ztbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, (dcomplex*)A, &F77_lda, (dcomplex*)X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_ztbsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { TA = 'N'; if ( N > 0) { if ( incX > 0 ) tincX = incX; 
else tincX = -incX; n = N*2*(tincX); x++; st=x+n; i = tincX << 1; do { *x = -(*x); x+=i; } while (x != st); x -= n; } } else { cblas_xerbla(3, "cblas_ztbsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ztbsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ztbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, (dcomplex*)A, &F77_lda, (dcomplex*)X, &F77_incX); if (TransA == CblasConjTrans) { if (N > 0) { do { *x = -(*x); x+= i; } while (x != st); } } } else cblas_xerbla(1, "cblas_ztbsv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ztpmv.c000066400000000000000000000074121427272030600245620ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_ztpmv.c * The program is a C interface to ztpmv. 
* * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif int n, i=0, tincX; double *st=0,*x=(double *)X; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_ztpmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_ztpmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ztpmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ztpmv( F77_UL, F77_TA, F77_DI, &F77_N, (dcomplex*)Ap, (dcomplex*)X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_ztpmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { TA = 'N'; if ( N > 0) { if(incX > 0) tincX = incX; else tincX = -incX; i = tincX << 1; n = i * N; x++; st = x + n; do { *x = -(*x); x += i; } while (x != 
st); x -= n; } } else { cblas_xerbla(3, "cblas_ztpmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ztpmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ztpmv( F77_UL, F77_TA, F77_DI, &F77_N, (dcomplex*)Ap, (dcomplex*)X,&F77_incX); if (TransA == CblasConjTrans) { if (N > 0) { do { *x = -(*x); x += i; } while (x != st); } } } else cblas_xerbla(1, "cblas_ztpmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ztpsv.c000066400000000000000000000074241427272030600245730ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_ztpsv.c * The program is a C interface to ztpsv. * * Keita Teranishi 3/23/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX; #else #define F77_N N #define F77_incX incX #endif int n, i=0, tincX; double *st=0, *x=(double*)X; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_ztpsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, 
"cblas_ztpsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ztpsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ztpsv( F77_UL, F77_TA, F77_DI, &F77_N, (dcomplex*)Ap, (dcomplex*)X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_ztpsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { TA = 'N'; if ( N > 0) { if ( incX > 0 ) tincX = incX; else tincX = -incX; n = N*2*(tincX); x++; st=x+n; i = tincX << 1; do { *x = -(*x); x+=i; } while (x != st); x -= n; } } else { cblas_xerbla(3, "cblas_ztpsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ztpsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ztpsv( F77_UL, F77_TA, F77_DI, &F77_N, (dcomplex*)Ap, (dcomplex*)X,&F77_incX); if (TransA == CblasConjTrans) { if (N > 0) { do { *x = -(*x); x += i; } while (x != st); } } } else cblas_xerbla(1, "cblas_ztpsv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ztrmm.c000066400000000000000000000077361427272030600245640ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_ztrmm.c * This program is a C interface to ztrmm. 
* Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb) { char UL, TA, SD, DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_SD &SD #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_ldb ldb #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Side == CblasRight ) SD='R'; else if ( Side == CblasLeft ) SD='L'; else { cblas_xerbla(2, "cblas_ztrmm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper ) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(3, "cblas_ztrmm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( TransA == CblasTrans ) TA ='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_ztrmm", "Illegal Trans setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit ) DI='N'; else { cblas_xerbla(5, "cblas_ztrmm", "Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_ztrmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)B, &F77_ldb); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Side == CblasRight ) SD='L'; else if ( Side == CblasLeft ) SD='R'; else { cblas_xerbla(2, 
"cblas_ztrmm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper ) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_ztrmm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( TransA == CblasTrans ) TA ='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_ztrmm", "Illegal Trans setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit ) DI='N'; else { cblas_xerbla(5, "cblas_ztrmm", "Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_ztrmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)B, &F77_ldb); } else cblas_xerbla(1, "cblas_ztrmm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ztrmv.c000066400000000000000000000076151427272030600245710ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_ztrmv.c * The program is a C interface to ztrmv. 
* * Keita Teranishi 5/20/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; #else #define F77_N N #define F77_lda lda #define F77_incX incX #endif int n, i=0, tincX; double *st=0,*x=(double *)X; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_ztrmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_ztrmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ztrmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ztrmv( F77_UL, F77_TA, F77_DI, &F77_N, (dcomplex*)A, &F77_lda, (dcomplex*)X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_ztrmv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { TA = 'N'; if ( N > 0) { if(incX > 0) tincX = incX; else tincX = -incX; i = tincX << 1; n = i * N; 
x++; st = x + n; do { *x = -(*x); x += i; } while (x != st); x -= n; } } else { cblas_xerbla(3, "cblas_ztrmv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ztrmv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ztrmv( F77_UL, F77_TA, F77_DI, &F77_N, (dcomplex*)A, &F77_lda, (dcomplex*)X, &F77_incX); if (TransA == CblasConjTrans) { if (N > 0) { do { *x = -(*x); x += i; } while (x != st); } } } else cblas_xerbla(1, "cblas_ztrmv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ztrsm.c000066400000000000000000000077561427272030600245740ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_ztrsm.c * This program is a C interface to ztrsm. 
* Written by Keita Teranishi * 4/8/1998 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb) { char UL, TA, SD, DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_SD &SD #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; #else #define F77_M M #define F77_N N #define F77_lda lda #define F77_ldb ldb #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Side == CblasRight) SD='R'; else if ( Side == CblasLeft ) SD='L'; else { cblas_xerbla(2, "cblas_ztrsm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(3, "cblas_ztrsm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( TransA == CblasTrans) TA ='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_ztrsm", "Illegal Trans setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit ) DI='N'; else { cblas_xerbla(5, "cblas_ztrsm", "Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_ztrsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)B, &F77_ldb); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Side == CblasRight) SD='L'; else if ( Side == CblasLeft ) SD='R'; else { cblas_xerbla(2, 
"cblas_ztrsm", "Illegal Side setting, %d\n", Side); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(3, "cblas_ztrsm", "Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( TransA == CblasTrans) TA ='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_ztrsm", "Illegal Trans setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if( Diag == CblasUnit ) DI='U'; else if ( Diag == CblasNonUnit ) DI='N'; else { cblas_xerbla(5, "cblas_ztrsm", "Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_SD = C2F_CHAR(&SD); F77_DI = C2F_CHAR(&DI); #endif F77_ztrsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)B, &F77_ldb); } else cblas_xerbla(1, "cblas_ztrsm", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/cblas_ztrsv.c000066400000000000000000000076171427272030600246010ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_ztrsv.c * The program is a C interface to ztrsv. 
* * Keita Teranishi 3/23/98 * */ #include "cblas.h" #include "cblas_f77.h" void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX) { char TA; char UL; char DI; #ifdef F77_CHAR F77_CHAR F77_TA, F77_UL, F77_DI; #else #define F77_TA &TA #define F77_UL &UL #define F77_DI &DI #endif #ifdef F77_INT F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; #else #define F77_N N #define F77_lda lda #define F77_incX incX #endif int n, i=0, tincX; double *st=0,*x=(double *)X; extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if (order == CblasColMajor) { if (Uplo == CblasUpper) UL = 'U'; else if (Uplo == CblasLower) UL = 'L'; else { cblas_xerbla(2, "cblas_ztrsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'N'; else if (TransA == CblasTrans) TA = 'T'; else if (TransA == CblasConjTrans) TA = 'C'; else { cblas_xerbla(3, "cblas_ztrsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ztrsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ztrsv( F77_UL, F77_TA, F77_DI, &F77_N, (dcomplex*)A, &F77_lda, (dcomplex*)X, &F77_incX); } else if (order == CblasRowMajor) { RowMajorStrg = 1; if (Uplo == CblasUpper) UL = 'L'; else if (Uplo == CblasLower) UL = 'U'; else { cblas_xerbla(2, "cblas_ztrsv","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (TransA == CblasNoTrans) TA = 'T'; else if (TransA == CblasTrans) TA = 'N'; else if (TransA == CblasConjTrans) { TA = 'N'; if ( N > 0) { if ( incX > 0 ) tincX = incX; else tincX = -incX; n = N*2*(tincX); x++; 
st=x+n; i = tincX << 1; do { *x = -(*x); x+=i; } while (x != st); x -= n; } } else { cblas_xerbla(3, "cblas_ztrsv","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if (Diag == CblasUnit) DI = 'U'; else if (Diag == CblasNonUnit) DI = 'N'; else { cblas_xerbla(4, "cblas_ztrsv","Illegal Diag setting, %d\n", Diag); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_DI = C2F_CHAR(&DI); #endif F77_ztrsv( F77_UL, F77_TA, F77_DI, &F77_N, (dcomplex*)A, &F77_lda, (dcomplex*)X, &F77_incX); if (TransA == CblasConjTrans) { if (N > 0) { do { *x = -(*x); x += i; } while (x != st); } } } else cblas_xerbla(1, "cblas_ztrsv", "Illegal Order setting, %d\n", order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/extra/000077500000000000000000000000001427272030600232115ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/extra/cblas_caxpby.c000066400000000000000000000012061427272030600260060ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_caxpby.c * * The program is a C interface to caxpby. * * Copyright (C) 2020, Advanced Micro Devices, Inc * */ #include "cblas.h" #include "cblas_f77.h" void cblas_caxpby( f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_caxpby( &F77_N, (scomplex*)alpha, (scomplex*)X, &F77_incX, (scomplex*)beta, (scomplex*)Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/extra/cblas_cgemm3m.c000066400000000000000000000061551427272030600260600ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_cgemm3m.c * * This program is a C interface to cgemm3m. * * Copyright (C) 2020, Advanced Micro Devices, Inc. 
All rights reserved. * */ #include "cblas.h" #include "cblas_f77.h" void cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc) { char TA, TB; #ifdef F77_CHAR F77_CHAR F77_TA, F77_TB; #else #define F77_TA &TA #define F77_TB &TB #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if(TransA == CblasTrans) TA='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(2, "cblas_cgemm3m", "Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TB='T'; else if ( TransB == CblasConjTrans ) TB='C'; else if ( TransB == CblasNoTrans ) TB='N'; else { cblas_xerbla(3, "cblas_cgemm3m", "Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_cgemm3m(F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)B, &F77_ldb, (scomplex*)beta, (scomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if(TransA == CblasTrans) TB='T'; else if ( TransA == CblasConjTrans ) TB='C'; else if ( TransA == CblasNoTrans ) TB='N'; else { cblas_xerbla(2, "cblas_cgemm3m", "Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TA='T'; else if ( TransB == CblasConjTrans ) TA='C'; else if ( TransB == CblasNoTrans ) TA='N'; else { cblas_xerbla(2, "cblas_cgemm3m", "Illegal TransB setting, %d\n", 
TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_cgemm3m(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, (scomplex*)alpha, (scomplex*)B, &F77_ldb, (scomplex*)A, &F77_lda, (scomplex*)beta, (scomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_cgemm3m", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/extra/cblas_cgemm_batch.c000066400000000000000000000127311427272030600267560ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_cgemm_batch.c * This program is a C interface to cgemm_batch. * * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. * */ #include "cblas.h" #include "cblas_f77.h" void cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A_array, f77_int *lda_array, const void **B_array, f77_int *ldb_array, const void *beta_array, void **C_array, f77_int *ldc_array, f77_int group_count, f77_int *group_size) { char TA[group_count], TB[group_count]; #ifdef F77_CHAR F77_CHAR F77_TA[group_count], F77_TB[group_count]; #else #define F77_TA TA #define F77_TB TB #endif #ifdef F77_INT F77_INT F77_GRP_COUNT = group_count; F77_INT F77_M[F77_GRP_COUNT], F77_N[F77_GRP_COUNT], F77_K[F77_GRP_COUNT]; F77_INT F77_lda[F77_GRP_COUNT], F77_ldb[F77_GRP_COUNT], F77_ldc[F77_GRP_COUNT]; F77_INT F77_GRP_SIZE[F77_GRP_COUNT]; #else #define F77_GRP_COUNT group_count #define F77_M M_array #define F77_N N_array #define F77_K K_array #define F77_lda lda_array #define F77_ldb ldb_array #define F77_ldc ldc_array #define F77_GRP_SIZE group_size #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; dim_t i; if( Order == CblasColMajor ) { for(i = 0; i < group_count; i++) { 
if(TransA_array[i] == CblasTrans) TA[i]='T'; else if ( TransA_array[i] == CblasConjTrans ) TA[i]='C'; else if ( TransA_array[i] == CblasNoTrans ) TA[i]='N'; else { cblas_xerbla(2, "cblas_cgemm_batch", "Illegal TransA setting %d for group %d\n", TransA_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB_array[i] == CblasTrans) TB[i]='T'; else if ( TransB_array[i] == CblasConjTrans ) TB[i]='C'; else if ( TransB_array[i] == CblasNoTrans ) TB[i]='N'; else { cblas_xerbla(3, "cblas_cgemm_batch", "Illegal TransB setting %d for group %d\n", TransB_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA[i] = C2F_CHAR(TA+i); F77_TB[i] = C2F_CHAR(TB+i); #endif #ifdef F77_INT F77_M[i] = M_array[i]; F77_N[i] = N_array[i]; F77_K[i] = K_array[i]; F77_lda[i] = lda_array[i]; F77_ldb[i] = ldb_array[i]; F77_ldc[i] = ldc_array[i]; F77_GRP_SIZE[i] = group_size[i]; #endif } F77_cgemm_batch(F77_TA, F77_TB, F77_M, F77_N, F77_K, (const scomplex*)alpha_array, (const scomplex**)A_array, F77_lda, (const scomplex**)B_array, F77_ldb, (const scomplex*)beta_array, (scomplex**)C_array, F77_ldc, &F77_GRP_COUNT, F77_GRP_SIZE); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; dim_t i; for(i = 0; i < group_count; i++) { if(TransA_array[i] == CblasTrans) TB[i]='T'; else if ( TransA_array[i] == CblasConjTrans ) TB[i]='C'; else if ( TransA_array[i] == CblasNoTrans ) TB[i]='N'; else { cblas_xerbla(2, "cblas_cgemm_batch", "Illegal TransA setting %d for group %d\n", TransA_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB_array[i] == CblasTrans) TA[i]='T'; else if ( TransB_array[i] == CblasConjTrans ) TA[i]='C'; else if ( TransB_array[i] == CblasNoTrans ) TA[i]='N'; else { cblas_xerbla(2, "cblas_cgemm_batch", "Illegal TransB setting %d for group %d\n", TransB_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif #ifdef F77_INT F77_M[i] = M_array[i]; 
F77_N[i] = N_array[i]; F77_K[i] = K_array[i]; F77_lda[i] = lda_array[i]; F77_ldb[i] = ldb_array[i]; F77_ldc[i] = ldc_array[i]; F77_GRP_SIZE = group_size[i]; #endif } F77_cgemm_batch(F77_TA, F77_TB, F77_N, F77_M, F77_K, (const scomplex*)alpha_array, (const scomplex**)B_array, F77_ldb, (const scomplex**)A_array, F77_lda, (const scomplex*)beta_array, (scomplex**)C_array, F77_ldc, &F77_GRP_COUNT, F77_GRP_SIZE); } else cblas_xerbla(1, "cblas_cgemm_batch", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/extra/cblas_cgemmt.c000066400000000000000000000123451427272030600260020ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* cblas_cgemmt.c Based off of cblas_cgemm.c. */ /* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "cblas.h" #include "cblas_f77.h" void cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc) { char UL, TA, TB; #ifdef F77_CHAR F77_CHAR F77_UL, F77_TA, F77_TB; #else #define F77_UL &UL #define F77_TA &TA #define F77_TB &TB #endif #ifdef F77_INT F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_cgemmt","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransA == CblasTrans) TA='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(3, "cblas_cgemmt","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TB='T'; else if ( TransB == CblasConjTrans ) TB='C'; else if ( TransB == CblasNoTrans ) TB='N'; else { cblas_xerbla(4, "cblas_cgemmt","Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = 
C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_cgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, (scomplex*)alpha, (scomplex*)A, &F77_lda, (scomplex*)B, &F77_ldb, (scomplex*)beta, (scomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(2, "cblas_cgemmt","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransA == CblasTrans) TB='T'; else if ( TransA == CblasConjTrans ) TB='C'; else if ( TransA == CblasNoTrans ) TB='N'; else { cblas_xerbla(3, "cblas_cgemmt","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TA='T'; else if ( TransB == CblasConjTrans ) TA='C'; else if ( TransB == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_cgemmt","Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_cgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, (scomplex*)alpha, (scomplex*)B, &F77_ldb, (scomplex*)A, &F77_lda, (scomplex*)beta, (scomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_cgemmt", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/extra/cblas_daxpby.c000066400000000000000000000011221427272030600260040ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_daxpby.c * * The program is a C interface to daxpby. * * Copyright (C) 2020, Advanced Micro Devices, Inc. 
*/ #include "cblas.h" #include "cblas_f77.h" void cblas_daxpby( f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_daxpby( &F77_N, &alpha, X, &F77_incX, &beta, Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/extra/cblas_dgemm_batch.c000066400000000000000000000124771427272030600267660ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_dgemm_batch.c * This program is a C interface to dgemm_batch. * * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. * */ #include "cblas.h" #include "cblas_f77.h" void cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A_array, f77_int *lda_array, const double **B_array, f77_int *ldb_array, const double *beta_array, double **C_array, f77_int *ldc_array, f77_int group_count, f77_int *group_size) { char TA[group_count], TB[group_count]; #ifdef F77_CHAR F77_CHAR F77_TA[group_count], F77_TB[group_count]; #else #define F77_TA TA #define F77_TB TB #endif #ifdef F77_INT F77_INT F77_GRP_COUNT = group_count; F77_INT F77_M[F77_GRP_COUNT], F77_N[F77_GRP_COUNT], F77_K[F77_GRP_COUNT]; F77_INT F77_lda[F77_GRP_COUNT], F77_ldb[F77_GRP_COUNT], F77_ldc[F77_GRP_COUNT]; F77_INT F77_GRP_SIZE[F77_GRP_COUNT]; #else #define F77_GRP_COUNT group_count #define F77_M M_array #define F77_N N_array #define F77_K K_array #define F77_lda lda_array #define F77_ldb ldb_array #define F77_ldc ldc_array #define F77_GRP_SIZE group_size #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; dim_t i; if( Order == CblasColMajor ) { for(i = 0; i < group_count; i++) { if(TransA_array[i] == CblasTrans) TA[i]='T'; else if 
( TransA_array[i] == CblasConjTrans ) TA[i]='C'; else if ( TransA_array[i] == CblasNoTrans ) TA[i]='N'; else { cblas_xerbla(2, "cblas_dgemm_batch", "Illegal TransA setting %d for group %d\n", TransA_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB_array[i] == CblasTrans) TB[i]='T'; else if ( TransB_array[i] == CblasConjTrans ) TB[i]='C'; else if ( TransB_array[i] == CblasNoTrans ) TB[i]='N'; else { cblas_xerbla(3, "cblas_dgemm_batch", "Illegal TransB setting %d for group %d\n", TransB_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA[i] = C2F_CHAR(TA+i); F77_TB[i] = C2F_CHAR(TB+i); #endif #ifdef F77_INT F77_M[i] = M_array[i]; F77_N[i] = N_array[i]; F77_K[i] = K_array[i]; F77_lda[i] = lda_array[i]; F77_ldb[i] = ldb_array[i]; F77_ldc[i] = ldc_array[i]; F77_GRP_SIZE[i] = group_size[i]; #endif } F77_dgemm_batch(F77_TA, F77_TB, F77_M, F77_N, F77_K, alpha_array, A_array, F77_lda, B_array, F77_ldb, beta_array, C_array, F77_ldc, &F77_GRP_COUNT, F77_GRP_SIZE); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; dim_t i; for(i = 0; i < group_count; i++) { if(TransA_array[i] == CblasTrans) TB[i]='T'; else if ( TransA_array[i] == CblasConjTrans ) TB[i]='C'; else if ( TransA_array[i] == CblasNoTrans ) TB[i]='N'; else { cblas_xerbla(2, "cblas_dgemm_batch", "Illegal TransA setting %d for group %d\n", TransA_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB_array[i] == CblasTrans) TA[i]='T'; else if ( TransB_array[i] == CblasConjTrans ) TA[i]='C'; else if ( TransB_array[i] == CblasNoTrans ) TA[i]='N'; else { cblas_xerbla(2, "cblas_dgemm_batch", "Illegal TransB setting %d for group %d\n", TransB_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif #ifdef F77_INT F77_M[i] = M_array[i]; F77_N[i] = N_array[i]; F77_K[i] = K_array[i]; F77_lda[i] = lda_array[i]; F77_ldb[i] = ldb_array[i]; F77_ldc[i] = ldc_array[i]; 
F77_GRP_SIZE = group_size[i]; #endif } F77_dgemm_batch(F77_TA, F77_TB, F77_N, F77_M, F77_K, alpha_array, B_array, F77_ldb, A_array, F77_lda, beta_array, C_array, F77_ldc, &F77_GRP_COUNT, F77_GRP_SIZE); } else cblas_xerbla(1, "cblas_dgemm_batch", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/extra/cblas_dgemmt.c000066400000000000000000000121671427272030600260050ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* cblas_dgemmt.c Based off of cblas_dgemm.c. */ /* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "cblas.h" #include "cblas_f77.h" void cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc) { char UL, TA, TB; #ifdef F77_CHAR F77_CHAR F77_UL, F77_TA, F77_TB; #else #define F77_UL &UL #define F77_TA &TA #define F77_TB &TB #endif #ifdef F77_INT F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_dgemmt","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransA == CblasTrans) TA='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(3, "cblas_dgemmt","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TB='T'; else if ( TransB == CblasConjTrans ) TB='C'; else if ( TransB == CblasNoTrans ) TB='N'; else { cblas_xerbla(4, "cblas_dgemmt","Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = 
C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_dgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(2, "cblas_dgemmt","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransA == CblasTrans) TB='T'; else if ( TransA == CblasConjTrans ) TB='C'; else if ( TransA == CblasNoTrans ) TB='N'; else { cblas_xerbla(3, "cblas_dgemmt","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TA='T'; else if ( TransB == CblasConjTrans ) TA='C'; else if ( TransB == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_dgemmt","Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_dgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha, B, &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc); } else cblas_xerbla(1, "cblas_dgemmt", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/extra/cblas_saxpby.c000066400000000000000000000012071427272030600260270ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_saxpby.c * * The program is a C interface to saxpby. * It calls the fortran wrapper before calling saxpby. * * Copyright (C) 2020, Advanced Micro Devices, Inc. 
*/ #include "cblas.h" #include "cblas_f77.h" void cblas_saxpby( f77_int N, float alpha, const float *X, f77_int incX, float beta, float *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_saxpby( &F77_N, &alpha, X, &F77_incX, &beta, Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/extra/cblas_sgemm_batch.c000066400000000000000000000124731427272030600270010ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_sgemm_batch.c * This program is a C interface to sgemm_batch. * * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. * */ #include "cblas.h" #include "cblas_f77.h" void cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A_array, f77_int *lda_array, const float **B_array, f77_int *ldb_array, const float *beta_array, float **C_array, f77_int *ldc_array, f77_int group_count, f77_int *group_size) { char TA[group_count], TB[group_count]; #ifdef F77_CHAR F77_CHAR F77_TA[group_count], F77_TB[group_count]; #else #define F77_TA TA #define F77_TB TB #endif #ifdef F77_INT F77_INT F77_GRP_COUNT = group_count; F77_INT F77_M[F77_GRP_COUNT], F77_N[F77_GRP_COUNT], F77_K[F77_GRP_COUNT]; F77_INT F77_lda[F77_GRP_COUNT], F77_ldb[F77_GRP_COUNT], F77_ldc[F77_GRP_COUNT]; F77_INT F77_GRP_SIZE[F77_GRP_COUNT]; #else #define F77_GRP_COUNT group_count #define F77_M M_array #define F77_N N_array #define F77_K K_array #define F77_lda lda_array #define F77_ldb ldb_array #define F77_ldc ldc_array #define F77_GRP_SIZE group_size #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; dim_t i; if( Order == CblasColMajor ) { for(i = 0; i < group_count; i++) { if(TransA_array[i] == CblasTrans) TA[i]='T'; else if ( 
TransA_array[i] == CblasConjTrans ) TA[i]='C'; else if ( TransA_array[i] == CblasNoTrans ) TA[i]='N'; else { cblas_xerbla(2, "cblas_sgemm_batch", "Illegal TransA setting %d for group %d\n", TransA_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB_array[i] == CblasTrans) TB[i]='T'; else if ( TransB_array[i] == CblasConjTrans ) TB[i]='C'; else if ( TransB_array[i] == CblasNoTrans ) TB[i]='N'; else { cblas_xerbla(3, "cblas_sgemm_batch", "Illegal TransB setting %d for group %d\n", TransB_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA[i] = C2F_CHAR(TA+i); F77_TB[i] = C2F_CHAR(TB+i); #endif #ifdef F77_INT F77_M[i] = M_array[i]; F77_N[i] = N_array[i]; F77_K[i] = K_array[i]; F77_lda[i] = lda_array[i]; F77_ldb[i] = ldb_array[i]; F77_ldc[i] = ldc_array[i]; F77_GRP_SIZE[i] = group_size[i]; #endif } F77_sgemm_batch(F77_TA, F77_TB, F77_M, F77_N, F77_K, alpha_array, A_array, F77_lda, B_array, F77_ldb, beta_array, C_array, F77_ldc, &F77_GRP_COUNT, F77_GRP_SIZE); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; dim_t i; for(i = 0; i < group_count; i++) { if(TransA_array[i] == CblasTrans) TB[i]='T'; else if ( TransA_array[i] == CblasConjTrans ) TB[i]='C'; else if ( TransA_array[i] == CblasNoTrans ) TB[i]='N'; else { cblas_xerbla(2, "cblas_sgemm_batch", "Illegal TransA setting %d for group %d\n", TransA_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB_array[i] == CblasTrans) TA[i]='T'; else if ( TransB_array[i] == CblasConjTrans ) TA[i]='C'; else if ( TransB_array[i] == CblasNoTrans ) TA[i]='N'; else { cblas_xerbla(2, "cblas_sgemm_batch", "Illegal TransB setting %d for group %d\n", TransB_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif #ifdef F77_INT F77_M[i] = M_array[i]; F77_N[i] = N_array[i]; F77_K[i] = K_array[i]; F77_lda[i] = lda_array[i]; F77_ldb[i] = ldb_array[i]; F77_ldc[i] = ldc_array[i]; 
F77_GRP_SIZE = group_size[i]; #endif } F77_sgemm_batch(F77_TA, F77_TB, F77_N, F77_M, F77_K, alpha_array, B_array, F77_ldb, A_array, F77_lda, beta_array, C_array, F77_ldc, &F77_GRP_COUNT, F77_GRP_SIZE); } else cblas_xerbla(1, "cblas_sgemm_batch", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/extra/cblas_sgemmt.c000066400000000000000000000121621427272030600260170ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* cblas_sgemmt.c Based off of cblas_sgemm.c. */ /* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "cblas.h" #include "cblas_f77.h" void cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc) { char UL, TA, TB; #ifdef F77_CHAR F77_CHAR F77_UL, F77_TA, F77_TB; #else #define F77_UL &UL #define F77_TA &TA #define F77_TB &TB #endif #ifdef F77_INT F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_sgemmt","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransA == CblasTrans) TA='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(3, "cblas_sgemmt","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TB='T'; else if ( TransB == CblasConjTrans ) TB='C'; else if ( TransB == CblasNoTrans ) TB='N'; else { cblas_xerbla(4, "cblas_sgemmt","Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); 
F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_sgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(2, "cblas_sgemmt","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransA == CblasTrans) TB='T'; else if ( TransA == CblasConjTrans ) TB='C'; else if ( TransA == CblasNoTrans ) TB='N'; else { cblas_xerbla(3, "cblas_sgemmt","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TA='T'; else if ( TransB == CblasConjTrans ) TA='C'; else if ( TransB == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_sgemmt","Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_sgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, &alpha, B, &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc); } else cblas_xerbla(1, "cblas_sgemmt", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/extra/cblas_zaxpby.c000066400000000000000000000012101427272030600260300ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * cblas_zaxpby.c * * The program is a C interface to zaxpby. * * Copyright (C) 2020, Advanced Micro Devices, Inc. 
* */ #include "cblas.h" #include "cblas_f77.h" void cblas_zaxpby( f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY) { #ifdef F77_INT F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; #else #define F77_N N #define F77_incX incX #define F77_incY incY #endif F77_zaxpby( &F77_N, (dcomplex*)alpha, (dcomplex*)X, &F77_incX, (dcomplex*)beta, (dcomplex*)Y, &F77_incY); } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/extra/cblas_zgemm3m.c000066400000000000000000000061531427272030600261050ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_zgemm3m.c * * This program is a C interface to zgemm3m. * * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. * */ #include "cblas.h" #include "cblas_f77.h" void cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc) { char TA, TB; #ifdef F77_CHAR F77_CHAR F77_TA, F77_TB; #else #define F77_TA &TA #define F77_TB &TB #endif #ifdef F77_INT F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_N N #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if(TransA == CblasTrans) TA='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(2, "cblas_zgemm3m", "Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TB='T'; else if ( TransB == CblasConjTrans ) TB='C'; else if ( TransB == CblasNoTrans ) TB='N'; else { cblas_xerbla(3, "cblas_zgemm3m", "Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; 
} #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_zgemm3m(F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)B, &F77_ldb, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if(TransA == CblasTrans) TB='T'; else if ( TransA == CblasConjTrans ) TB='C'; else if ( TransA == CblasNoTrans ) TB='N'; else { cblas_xerbla(2, "cblas_zgemm3m", "Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TA='T'; else if ( TransB == CblasConjTrans ) TA='C'; else if ( TransB == CblasNoTrans ) TA='N'; else { cblas_xerbla(2, "cblas_zgemm3m", "Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_zgemm3m(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, (dcomplex*)alpha, (dcomplex*)B, &F77_ldb, (dcomplex*)A, &F77_lda, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_zgemm3m", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/extra/cblas_zgemm_batch.c000066400000000000000000000127311427272030600270050ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* * * cblas_zgemm_batch.c * This program is a C interface to zgemm_batch. * * Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. 
* */ #include "cblas.h" #include "cblas_f77.h" void cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A_array, f77_int *lda_array, const void **B_array, f77_int *ldb_array, const void *beta_array, void **C_array, f77_int *ldc_array, f77_int group_count, f77_int *group_size) { char TA[group_count], TB[group_count]; #ifdef F77_CHAR F77_CHAR F77_TA[group_count], F77_TB[group_count]; #else #define F77_TA TA #define F77_TB TB #endif #ifdef F77_INT F77_INT F77_GRP_COUNT = group_count; F77_INT F77_M[F77_GRP_COUNT], F77_N[F77_GRP_COUNT], F77_K[F77_GRP_COUNT]; F77_INT F77_lda[F77_GRP_COUNT], F77_ldb[F77_GRP_COUNT], F77_ldc[F77_GRP_COUNT]; F77_INT F77_GRP_SIZE[F77_GRP_COUNT]; #else #define F77_GRP_COUNT group_count #define F77_M M_array #define F77_N N_array #define F77_K K_array #define F77_lda lda_array #define F77_ldb ldb_array #define F77_ldc ldc_array #define F77_GRP_SIZE group_size #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; dim_t i; if( Order == CblasColMajor ) { for(i = 0; i < group_count; i++) { if(TransA_array[i] == CblasTrans) TA[i]='T'; else if ( TransA_array[i] == CblasConjTrans ) TA[i]='C'; else if ( TransA_array[i] == CblasNoTrans ) TA[i]='N'; else { cblas_xerbla(2, "cblas_zgemm_batch", "Illegal TransA setting %d for group %d\n", TransA_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB_array[i] == CblasTrans) TB[i]='T'; else if ( TransB_array[i] == CblasConjTrans ) TB[i]='C'; else if ( TransB_array[i] == CblasNoTrans ) TB[i]='N'; else { cblas_xerbla(3, "cblas_zgemm_batch", "Illegal TransB setting %d for group %d\n", TransB_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA[i] = C2F_CHAR(TA+i); F77_TB[i] = C2F_CHAR(TB+i); #endif #ifdef F77_INT F77_M[i] = M_array[i]; F77_N[i] = N_array[i]; F77_K[i] 
= K_array[i]; F77_lda[i] = lda_array[i]; F77_ldb[i] = ldb_array[i]; F77_ldc[i] = ldc_array[i]; F77_GRP_SIZE[i] = group_size[i]; #endif } F77_zgemm_batch(F77_TA, F77_TB, F77_M, F77_N, F77_K, (const dcomplex*)alpha_array, (const dcomplex**)A_array, F77_lda, (const dcomplex**)B_array, F77_ldb, (const dcomplex*)beta_array, (dcomplex**)C_array, F77_ldc, &F77_GRP_COUNT, F77_GRP_SIZE); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; dim_t i; for(i = 0; i < group_count; i++) { if(TransA_array[i] == CblasTrans) TB[i]='T'; else if ( TransA_array[i] == CblasConjTrans ) TB[i]='C'; else if ( TransA_array[i] == CblasNoTrans ) TB[i]='N'; else { cblas_xerbla(2, "cblas_zgemm_batch", "Illegal TransA setting %d for group %d\n", TransA_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB_array[i] == CblasTrans) TA[i]='T'; else if ( TransB_array[i] == CblasConjTrans ) TA[i]='C'; else if ( TransB_array[i] == CblasNoTrans ) TA[i]='N'; else { cblas_xerbla(2, "cblas_zgemm_batch", "Illegal TransB setting %d for group %d\n", TransB_array[i], i); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif #ifdef F77_INT F77_M[i] = M_array[i]; F77_N[i] = N_array[i]; F77_K[i] = K_array[i]; F77_lda[i] = lda_array[i]; F77_ldb[i] = ldb_array[i]; F77_ldc[i] = ldc_array[i]; F77_GRP_SIZE = group_size[i]; #endif } F77_zgemm_batch(F77_TA, F77_TB, F77_N, F77_M, F77_K, (const dcomplex*)alpha_array, (const dcomplex**)B_array, F77_ldb, (const dcomplex**)A_array, F77_lda, (const dcomplex*)beta_array, (dcomplex**)C_array, F77_ldc, &F77_GRP_COUNT, F77_GRP_SIZE); } else cblas_xerbla(1, "cblas_zgemm_batch", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; } #endif cython-blis-0.9.1/blis/_src/frame/compat/cblas/src/extra/cblas_zgemmt.c000066400000000000000000000123451427272030600260310ustar00rootroot00000000000000#include "blis.h" #ifdef BLIS_ENABLE_CBLAS /* cblas_zgemmt.c Based off of 
cblas_zgemm.c. */ /* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "cblas.h" #include "cblas_f77.h" void cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc) { char UL, TA, TB; #ifdef F77_CHAR F77_CHAR F77_UL, F77_TA, F77_TB; #else #define F77_UL &UL #define F77_TA &TA #define F77_TB &TB #endif #ifdef F77_INT F77_INT F77_M=M, F77_K=K, F77_lda=lda, F77_ldb=ldb; F77_INT F77_ldc=ldc; #else #define F77_M M #define F77_K K #define F77_lda lda #define F77_ldb ldb #define F77_ldc ldc #endif extern int CBLAS_CallFromC; extern int RowMajorStrg; RowMajorStrg = 0; CBLAS_CallFromC = 1; if( Order == CblasColMajor ) { if( Uplo == CblasUpper) UL='U'; else if ( Uplo == CblasLower ) UL='L'; else { cblas_xerbla(2, "cblas_zgemmt","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransA == CblasTrans) TA='T'; else if ( TransA == CblasConjTrans ) TA='C'; else if ( TransA == CblasNoTrans ) TA='N'; else { cblas_xerbla(3, "cblas_zgemmt","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TB='T'; else if ( TransB == CblasConjTrans ) TB='C'; else if ( TransB == CblasNoTrans ) TB='N'; else { cblas_xerbla(4, "cblas_zgemmt","Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_zgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, (dcomplex*)alpha, (dcomplex*)A, &F77_lda, (dcomplex*)B, &F77_ldb, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); } else if (Order == CblasRowMajor) { RowMajorStrg = 1; if( Uplo == CblasUpper) UL='L'; else if ( Uplo == CblasLower ) UL='U'; else { cblas_xerbla(2, "cblas_zgemmt","Illegal Uplo setting, %d\n", Uplo); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransA == CblasTrans) TB='T'; else if ( 
TransA == CblasConjTrans ) TB='C'; else if ( TransA == CblasNoTrans ) TB='N'; else { cblas_xerbla(3, "cblas_zgemmt","Illegal TransA setting, %d\n", TransA); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } if(TransB == CblasTrans) TA='T'; else if ( TransB == CblasConjTrans ) TA='C'; else if ( TransB == CblasNoTrans ) TA='N'; else { cblas_xerbla(4, "cblas_zgemmt","Illegal TransB setting, %d\n", TransB); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #ifdef F77_CHAR F77_UL = C2F_CHAR(&UL); F77_TA = C2F_CHAR(&TA); F77_TB = C2F_CHAR(&TB); #endif F77_zgemmt(F77_UL, F77_TA, F77_TB, &F77_M, &F77_K, (dcomplex*)alpha, (dcomplex*)B, &F77_ldb, (dcomplex*)A, &F77_lda, (dcomplex*)beta, (dcomplex*)C, &F77_ldc); } else cblas_xerbla(1, "cblas_zgemmt", "Illegal Order setting, %d\n", Order); CBLAS_CallFromC = 0; RowMajorStrg = 0; return; } #endif cython-blis-0.9.1/blis/_src/frame/compat/check/000077500000000000000000000000001427272030600212705ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/compat/check/bla_gemm3m_check.h000066400000000000000000000061031427272030600246010ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#ifdef BLIS_ENABLE_BLAS

/*
 * bla_gemm3m_check: argument validation block for a gemm3m-style routine.
 *
 * The macro expands to a brace-enclosed block inside the calling function.
 * transa, transb are compared against "N", "C" and "T" with lsame; m, n,
 * k, lda, ldb and ldc are dereferenced, so they must be pointers.
 *
 * The first failing test selects info:
 *    1  transa is none of N/C/T        2  transb is none of N/C/T
 *    3  *m < 0                         4  *n < 0
 *    5  *k < 0                         8  *lda < max(1, nrowa)
 *   10  *ldb < max(1, nrowb)          13  *ldc < max(1, *m)
 * where nrowa is *m when transa is "N" and *k otherwise, and nrowb is *k
 * when transb is "N" and *n otherwise.
 *
 * When info is non-zero the routine name is built from dt_str and op_str
 * (op_str left-justified in a 5-character field), upper-cased, passed to
 * xerbla together with info, and the macro then executes `return;`, i.e.
 * it returns from the function it was expanded in.
 */
#define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, notb; \
    f77_int conja, conjb; \
    f77_int ta, tb; \
    f77_int nrowa, nrowb; \
\
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else { nrowb = *n; } \
\
    if ( !nota && !conja && !ta ) \
        info = 1; \
    else if ( !notb && !conjb && !tb ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *n < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
cython-blis-0.9.1/blis/_src/frame/compat/check/bla_gemm_check.h000066400000000000000000000057701427272030600243520ustar00rootroot00000000000000/*

BLIS
An object-based framework for developing high-performance BLAS-like
libraries.

Copyright (C) 2014, The University of Texas at Austin

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
 - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
 - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
 - Neither the name(s) of the copyright holder(s) nor the names of its
   contributors may be used to endorse or promote products derived
   from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #ifdef BLIS_ENABLE_BLAS #define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, notb; \ f77_int conja, conjb; \ f77_int ta, tb; \ f77_int nrowa, nrowb; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ if ( notb ) { nrowb = *k; } \ else { nrowb = *n; } \ \ if ( !nota && !conja && !ta ) \ info = 1; \ else if ( !notb && !conjb && !tb ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *k < 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 8; \ else if ( *ldb < bli_max( 1, nrowb ) ) \ info = 10; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 13; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_gemmt_check.h000066400000000000000000000062341427272030600245320ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef BLIS_ENABLE_BLAS #define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, notb; \ f77_int conja, conjb; \ f77_int ta, tb; \ f77_int lower, upper; \ f77_int nrowa, nrowb; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ \ lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ if ( notb ) { nrowb = *k; } \ else { nrowb = *m; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !conja && !ta ) \ info = 2; \ else if ( !notb && !conjb && !tb ) \ info = 3; \ else if ( *m < 0 ) \ info = 4; \ else if ( *k < 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 8; \ else if ( *ldb < bli_max( 1, nrowb ) ) \ info = 10; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 13; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_gemv_check.h000066400000000000000000000047531427272030600243630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef BLIS_ENABLE_BLAS #define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \ { \ f77_int info = 0; \ f77_int nota, ta, conja; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ \ if ( !nota && !ta && !conja ) \ info = 1; \ else if ( *m < 0 ) \ info = 2; \ else if ( *n < 0 ) \ info = 3; \ else if ( *lda < bli_max( 1, *m ) ) \ info = 6; \ else if ( *incx == 0 ) \ info = 8; \ else if ( *incy == 0 ) \ info = 11; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_ger_check.h000066400000000000000000000045021427272030600241720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS #define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \ { \ f77_int info = 0; \ \ if ( *m < 0 ) \ info = 1; \ else if ( *n < 0 ) \ info = 2; \ else if ( *incx == 0 ) \ info = 5; \ else if ( *incy == 0 ) \ info = 7; \ else if ( *lda < bli_max( 1, *m ) ) \ info = 9; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ /* We have to append an extra character to denote whether we are testing geru or gerc. */ \ sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_hemm_check.h000066400000000000000000000053361427272030600243510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef BLIS_ENABLE_BLAS #define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int left, right; \ f77_int lower, upper; \ f77_int nrowa; \ \ left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \ right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( left ) { nrowa = *m; } \ else { nrowa = *n; } \ \ if ( !left && !right ) \ info = 1; \ else if ( !lower && !upper ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldb < bli_max( 1, *m ) ) \ info = 9; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 12; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_hemv_check.h000066400000000000000000000045661427272030600243660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS #define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \ { \ f77_int info = 0; \ f77_int lower, upper; \ \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( !lower && !upper ) \ info = 1; \ else if ( *m < 0 ) \ info = 2; \ else if ( *lda < bli_max( 1, *m ) ) \ info = 5; \ else if ( *incx == 0 ) \ info = 7; \ else if ( *incy == 0 ) \ info = 10; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_her2_check.h000066400000000000000000000045651427272030600242660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef BLIS_ENABLE_BLAS #define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \ { \ f77_int info = 0; \ f77_int lower, upper; \ \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( !lower && !upper ) \ info = 1; \ else if ( *m < 0 ) \ info = 2; \ else if ( *incx == 0 ) \ info = 5; \ else if ( *incy == 0 ) \ info = 7; \ else if ( *lda < bli_max( 1, *m ) ) \ info = 9; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_her2k_check.h000066400000000000000000000053371427272030600244370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS #define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, conja; \ f77_int lower, upper; \ f77_int nrowa; \ \ nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !conja ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldb < bli_max( 1, nrowa ) ) \ info = 9; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 12; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_her_check.h000066400000000000000000000045061427272030600241770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef BLIS_ENABLE_BLAS #define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \ { \ f77_int info = 0; \ f77_int lower, upper; \ \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( !lower && !upper ) \ info = 1; \ else if ( *m < 0 ) \ info = 2; \ else if ( *incx == 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, *m ) ) \ info = 7; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_herk_check.h000066400000000000000000000052461427272030600243540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS #define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \ { \ f77_int info = 0; \ f77_int nota, conja; \ f77_int lower, upper; \ f77_int nrowa; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !conja ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 10; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_symm_check.h000066400000000000000000000033201427272030600243770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS #define bla_symm_check bla_hemm_check #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_symv_check.h000066400000000000000000000033201427272030600244100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS #define bla_symv_check bla_hemv_check #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_syr2_check.h000066400000000000000000000033201427272030600243110ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS #define bla_syr2_check bla_her2_check #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_syr2k_check.h000066400000000000000000000056351427272030600244770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef BLIS_ENABLE_BLAS #define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int is_r; \ f77_int nota, ta, cta; \ f77_int lower, upper; \ f77_int nrowa; \ \ static char* dt_cst = dt_str; \ \ is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \ nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \ cta = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !ta && (is_r ? !cta : 1) ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldb < bli_max( 1, nrowa ) ) \ info = 9; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 12; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_syr_check.h000066400000000000000000000033161427272030600242340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifdef BLIS_ENABLE_BLAS

/* Parameter checking for ?syr is not implemented separately:
   bla_syr_check is a plain alias for bla_her_check, so it takes whatever
   arguments bla_her_check takes. bla_her_check is defined outside this
   header. */
#define bla_syr_check bla_her_check

#endif
cython-blis-0.9.1/blis/_src/frame/compat/check/bla_syrk_check.h000066400000000000000000000055451427272030600244150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS #define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \ { \ f77_int info = 0; \ f77_int is_r; \ f77_int nota, ta, cta; \ f77_int lower, upper; \ f77_int nrowa; \ \ static char* dt_cst = dt_str; \ \ is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ cta = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !ta && (is_r ? 
!cta : 1) ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 10; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_trmm_check.h000066400000000000000000000062251427272030600244000ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS #define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \ { \ f77_int info = 0; \ f77_int left, right; \ f77_int lower, upper; \ f77_int nota, ta, conja; \ f77_int unita, nonua; \ f77_int nrowa; \ \ left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \ right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ unita = PASTEF770(lsame)( diaga, "U", (ftnlen)1, (ftnlen)1 ); \ nonua = PASTEF770(lsame)( diaga, "N", (ftnlen)1, (ftnlen)1 ); \ \ if ( left ) { nrowa = *m; } \ else { nrowa = *n; } \ \ if ( !left && !right ) \ info = 1; \ else if ( !lower && !upper ) \ info = 2; \ else if ( !nota && !ta && !conja ) \ info = 3; \ else if ( !unita && !nonua ) \ info = 4; \ else if ( *m < 0 ) \ info = 5; \ else if ( *n < 0 ) \ info = 6; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 9; \ else if ( *ldb < bli_max( 1, *m ) ) \ info = 11; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif 
cython-blis-0.9.1/blis/_src/frame/compat/check/bla_trmv_check.h000066400000000000000000000054711427272030600244130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef BLIS_ENABLE_BLAS #define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \ { \ f77_int info = 0; \ f77_int lower, upper; \ f77_int nota, ta, conja; \ f77_int unita, nonua; \ \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ unita = PASTEF770(lsame)( diaga, "U", (ftnlen)1, (ftnlen)1 ); \ nonua = PASTEF770(lsame)( diaga, "N", (ftnlen)1, (ftnlen)1 ); \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !ta && !conja ) \ info = 2; \ else if ( !unita && !nonua ) \ info = 3; \ else if ( *m < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, *m ) ) \ info = 6; \ else if ( *incx == 0 ) \ info = 8; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/compat/check/bla_trsm_check.h000066400000000000000000000033201427272030600243770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifdef BLIS_ENABLE_BLAS

/* Parameter checking for ?trsm is not implemented separately:
   bla_trsm_check is a plain alias for bla_trmm_check, so both take the
   same argument list and report the same info values. */
#define bla_trsm_check bla_trmm_check

#endif
cython-blis-0.9.1/blis/_src/frame/compat/check/bla_trsv_check.h000066400000000000000000000033201427272030600244100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifdef BLIS_ENABLE_BLAS

/* Parameter checking for ?trsv is not implemented separately:
   bla_trsv_check is a plain alias for bla_trmv_check, so both take the
   same argument list and report the same info values. */
#define bla_trsv_check bla_trmv_check

#endif
cython-blis-0.9.1/blis/_src/frame/compat/extra/000077500000000000000000000000001427272030600213365ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/compat/extra/bla_axpby.c000066400000000000000000000054151427272030600234500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ) \ { \ dim_t n0; \ ftype* x0; \ ftype* y0; \ inc_t incx0; \ inc_t incy0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Convert/typecast negative values of n to zero. */ \ bli_convert_blas_dim1( *n, n0 ); \ \ /* If the input increments are negative, adjust the pointers so we can use positive increments instead. */ \ bli_convert_blas_incv( n0, (ftype*)x, *incx, x0, incx0 ); \ bli_convert_blas_incv( n0, (ftype*)y, *incy, y0, incy0 ); \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ n0, \ (ftype*)alpha, \ x0, incx0, \ (ftype*)beta, \ y0, incy0, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. 
*/ \ bli_finalize_auto(); \ } #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( axpby, axpbyv ) #endif cython-blis-0.9.1/blis/_src/frame/compat/extra/bla_axpby.h000066400000000000000000000040431427272030600234510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( axpby ) #endif cython-blis-0.9.1/blis/_src/frame/compat/extra/bla_gemm3m.c000066400000000000000000000165701427272030600235160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #ifdef BLIS_BLAS3_CALLS_TAPI #undef GENTFUNCCO #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ trans_t blis_transa; \ trans_t blis_transb; \ dim_t m0, n0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blisname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ transa, \ transb, \ m, \ n, \ k, \ lda, \ ldb, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *n, n0 ); \ bli_convert_blas_dim1( *k, k0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ /* As a placeholder, invoke 1m since BLIS does no longer contains an official 3m implementation. 
Note that we do this by inlining an abbreviated version of bli_gemm_ex() so that we can bypass consideration of sup, which doesn't make sense in this context. */ \ { \ cntx_t* cntx = bli_gks_query_ind_cntx( BLIS_1M, dt ); \ \ rntm_t rntm_l; \ rntm_t* rntm = &rntm_l; \ bli_rntm_init_from_global( rntm ); \ \ /* Note that we MUST disable sup handling since it could redirect execution for some problem sizes to a non-3m implementation. */ \ bli_rntm_disable_l3_sup( rntm ); \ \ /* Call BLIS interface. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_transa, \ blis_transb, \ m0, \ n0, \ k0, \ (ftype*)alpha, \ (ftype*)a, rs_a, cs_a, \ (ftype*)b, rs_b, cs_b, \ (ftype*)beta, \ (ftype*)c, rs_c, cs_c, \ cntx, \ rntm \ ); \ } \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #else #undef GENTFUNCCO #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ trans_t blis_transa; \ trans_t blis_transb; \ dim_t m0, n0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blisname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ transa, \ transb, \ m, \ n, \ k, \ lda, \ ldb, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *n, n0 ); \ bli_convert_blas_dim1( *k, k0 ); \ \ /* Set the row and column strides of the matrix operands. 
*/ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ const num_t dt = PASTEMAC(ch,type); \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t m0_a, n0_a; \ dim_t m0_b, n0_b; \ \ bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \ bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); \ \ bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \ bli_obj_init_finish_1x1( dt, (ftype*)beta, &betao ); \ \ bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \ bli_obj_init_finish( dt, m0, n0, (ftype*)c, rs_c, cs_c, &co ); \ \ bli_obj_set_conjtrans( blis_transa, &ao ); \ bli_obj_set_conjtrans( blis_transb, &bo ); \ \ /* As a placeholder, invoke 1m since BLIS does no longer contains an official 3m implementation. Note that we do this by inlining an abbreviated version of bli_gemm_ex() so that we can bypass consideration of sup, which doesn't make sense in this context. */ \ { \ cntx_t* cntx = bli_gks_query_ind_cntx( BLIS_1M, dt ); \ \ rntm_t rntm_l; \ rntm_t* rntm = &rntm_l; \ bli_rntm_init_from_global( &rntm_l ); \ \ /* This is probably not needed given that we performed BLAS-style parameter checking above, but bli_gemm_check() is normally called in the normal course of bli_gemm_ex(). */ \ if ( bli_error_checking_is_enabled() ) \ bli_gemm_check( &alphao, &ao, &bo, &betao, &co, cntx ); \ \ PASTEMAC(blisname,_front) \ ( \ &alphao, \ &ao, \ &bo, \ &betao, \ &co, \ cntx, \ rntm, \ NULL \ ); \ } \ \ /* Finalize BLIS. 
*/ \ bli_finalize_auto(); \ } #endif #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNCCO_BLAS( gemm3m, gemm ) #endif cython-blis-0.9.1/blis/_src/frame/compat/extra/bla_gemm3m.h000066400000000000000000000043731427272030600235210ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( gemm3m ) #endif cython-blis-0.9.1/blis/_src/frame/compat/extra/bla_gemm_batch.c000066400000000000000000000162001427272030600244050ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #ifdef BLIS_BLAS3_CALLS_TAPI /* Variant used when BLIS_BLAS3_CALLS_TAPI is defined. GENTFUNC expands to the function named PASTEF77(ch,blasname): it checks every group's arguments with PASTEBLACHK(blisname) and then makes one PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) call per matrix. transa/transb, m/n/k, alpha/beta and lda/ldb/ldc are indexed by group; a_array, b_array and c_array hold one pointer per matrix, laid out group after group. */ #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* transa_array, \ const f77_char* transb_array, \ const f77_int* m_array, \ const f77_int* n_array, \ const f77_int* k_array, \ const ftype* alpha_array, \ const ftype** a_array, const f77_int* lda_array, \ const ftype** b_array, const f77_int* ldb_array, \ const ftype* beta_array, \ ftype** c_array, const f77_int* ldc_array, \ const f77_int* group_count, \ const f77_int* group_size \ ) \ { \ trans_t blis_transa; \ trans_t blis_transb; \ dim_t m0, n0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking, once per group; the checker is handed pointers to that group's entries. */ \ for ( f77_int gi = 0; gi < *group_count; gi++ ) \ { \ PASTEBLACHK(blisname) \ ( \ MKSTR(ch), \ MKSTR(blisname), \ transa_array+gi, \ transb_array+gi, \ m_array+gi, \ n_array+gi, \ k_array+gi, \ lda_array+gi, \ ldb_array+gi, \ ldc_array+gi \ ); \ } \ \ /* idx counts matrices across all groups and indexes a_array, b_array and c_array. */ \ f77_int idx = 0; \ \ for ( f77_int i = 0; i < *group_count; i++ ) \ { \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_trans( transa_array[i], &blis_transa ); \ bli_param_map_netlib_to_blis_trans( transb_array[i], &blis_transb ); \ \ /* Typecast BLAS integers to BLIS integers.
*/ \ bli_convert_blas_dim1( m_array[i], m0 ); \ bli_convert_blas_dim1( n_array[i], n0 ); \ bli_convert_blas_dim1( k_array[i], k0 ); \ \ /* Set the row and column strides of the matrix operands: unit row stride, with the group's leading dimension as the column stride. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = lda_array[i]; \ const inc_t rs_b = 1; \ const inc_t cs_b = ldb_array[i]; \ const inc_t rs_c = 1; \ const inc_t cs_c = ldc_array[i]; \ \ for ( f77_int j = 0; j < group_size[i]; j++ ) \ { \ /* Call BLIS interface once per matrix of group i; alpha and beta are shared by the whole group. */ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_transa, \ blis_transb, \ m0, \ n0, \ k0, \ (ftype*)(alpha_array + i), \ (ftype*)*(a_array + idx), rs_a, cs_a, \ (ftype*)*(b_array + idx), rs_b, cs_b, \ (ftype*)(beta_array + i), \ (ftype*)*(c_array + idx), rs_c, cs_c, \ NULL, \ NULL \ ); \ \ idx++; \ } \ } \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #else /* Variant used when BLIS_BLAS3_CALLS_TAPI is not defined: same entry point and argument checking, but each operand is wrapped in an obj_t and the product is computed through PASTEMAC(blisname,BLIS_OAPI_EX_SUF). */ #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* transa_array, \ const f77_char* transb_array, \ const f77_int* m_array, \ const f77_int* n_array, \ const f77_int* k_array, \ const ftype* alpha_array, \ const ftype** a_array, const f77_int* lda_array, \ const ftype** b_array, const f77_int* ldb_array, \ const ftype* beta_array, \ ftype** c_array, const f77_int* ldc_array, \ const f77_int* group_count, \ const f77_int* group_size ) \ { \ trans_t blis_transa; \ trans_t blis_transb; \ dim_t m0, n0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking, once per group. */ \ for ( f77_int gi = 0; gi < *group_count; gi++ ) \ { \ PASTEBLACHK(blisname) \ ( \ MKSTR(ch), \ MKSTR(blisname), \ transa_array+gi, \ transb_array+gi, \ m_array+gi, \ n_array+gi, \ k_array+gi, \ lda_array+gi, \ ldb_array+gi, \ ldc_array+gi \ ); \ } \ \ const num_t dt = PASTEMAC(ch,type); \ \ /* idx counts matrices across all groups; i walks groups, j walks the matrices of one group. */ \ f77_int idx = 0, i, j; \ \ for ( i = 0; i < *group_count; i++ ) \ { \ /* Map BLAS chars to their corresponding BLIS enumerated type value.
*/ \ bli_param_map_netlib_to_blis_trans( transa_array[i], &blis_transa ); \ bli_param_map_netlib_to_blis_trans( transb_array[i], &blis_transb ); \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( m_array[i], m0 ); \ bli_convert_blas_dim1( n_array[i], n0 ); \ bli_convert_blas_dim1( k_array[i], k0 ); \ \ /* Set the row and column strides of the matrix operands. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = lda_array[i]; \ const inc_t rs_b = 1; \ const inc_t cs_b = ldb_array[i]; \ const inc_t rs_c = 1; \ const inc_t cs_c = ldc_array[i]; \ \ /* alpha and beta are wrapped once per group as 1x1 objects. */ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ \ dim_t m0_a, n0_a; \ dim_t m0_b, n0_b; \ \ /* Derive the dimensions used to initialize A and B from (m0,k0) and (k0,n0); presumably bli_set_dims_with_trans swaps them when the operand is transposed - confirm against its definition. */ \ bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \ bli_set_dims_with_trans( blis_transb, k0, n0, &m0_b, &n0_b ); \ \ bli_obj_init_finish_1x1( dt, (ftype*)(alpha_array + i), &alphao ); \ bli_obj_init_finish_1x1( dt, (ftype*)(beta_array + i), &betao ); \ \ for( j = 0; j < group_size[i]; j++ ) \ { \ /* Fresh objects for every matrix; only the buffers differ within a group. */ \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)*(a_array + idx), rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)*(b_array + idx), rs_b, cs_b, &bo ); \ bli_obj_init_finish( dt, m0, n0, (ftype*)*(c_array + idx), rs_c, cs_c, &co ); \ bli_obj_set_conjtrans( blis_transa, &ao ); \ bli_obj_set_conjtrans( blis_transb, &bo ); \ \ PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \ ( \ &alphao, \ &ao, \ &bo, \ &betao, \ &co, \ NULL, \ NULL \ ); \ \ idx++; \ } \ } \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #endif #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( gemm_batch, gemm ) #endif cython-blis-0.9.1/blis/_src/frame/compat/extra/bla_gemm_batch.h000066400000000000000000000045571427272030600244260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces.
// /* GENTPROT expands to the exported (BLIS_EXPORT_BLAS) prototype of PASTEF77(ch,blasname) for one datatype. The arguments are per-group arrays (trans flags, m/n/k, alpha/beta, leading dimensions), arrays of matrix pointers for A, B and C, and the group_count / group_size descriptors. Prototypes are instantiated only when BLIS_ENABLE_BLAS is defined. */ #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa_array, \ const f77_char* transb_array, \ const f77_int* m_array, \ const f77_int* n_array, \ const f77_int* k_array, \ const ftype* alpha_array, \ const ftype** a_array, const f77_int* lda_array, \ const ftype** b_array, const f77_int* ldb_array, \ const ftype* beta_array, \ ftype** c_array, const f77_int* ldc_array, \ const f77_int* group_count, \ const f77_int* group_size \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemm_batch ) #endif cython-blis-0.9.1/blis/_src/frame/compat/extra/bla_gemmt.c000066400000000000000000000143171427272030600234370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-to-BLIS interfaces. // #ifdef BLIS_BLAS3_CALLS_TAPI /* Variant used when BLIS_BLAS3_CALLS_TAPI is defined. GENTFUNC expands to the function named PASTEF77(ch,blasname). The interface takes an uplo character for C and only the dimensions m and k (there is no n argument); the arguments are checked with PASTEBLACHK(blasname), converted to BLIS types and forwarded to PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF). */ #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ uplo_t blis_uploc; \ trans_t blis_transa; \ trans_t blis_transb; \ dim_t m0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploc, \ transa, \ transb, \ m, \ k, \ lda, \ ldb, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *k, k0 ); \ \ /* Set the row and column strides of the matrix operands: unit row stride, with the leading dimension as the column stride. */ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ /* Call BLIS interface.
*/ \ PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \ ( \ blis_uploc, \ blis_transa, \ blis_transb, \ m0, \ k0, \ (ftype*)alpha, \ (ftype*)a, rs_a, cs_a, \ (ftype*)b, rs_b, cs_b, \ (ftype*)beta, \ (ftype*)c, rs_c, cs_c, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #else /* Variant used when BLIS_BLAS3_CALLS_TAPI is not defined: same entry point and checks, but the operands are wrapped in obj_t objects and passed to PASTEMAC(blisname,BLIS_OAPI_EX_SUF). */ #undef GENTFUNC #define GENTFUNC( ftype, ch, blasname, blisname ) \ \ void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ) \ { \ uplo_t blis_uploc; \ trans_t blis_transa; \ trans_t blis_transb; \ dim_t m0, k0; \ \ /* Initialize BLIS. */ \ bli_init_auto(); \ \ /* Perform BLAS parameter checking. */ \ PASTEBLACHK(blasname) \ ( \ MKSTR(ch), \ MKSTR(blasname), \ uploc, \ transa, \ transb, \ m, \ k, \ lda, \ ldb, \ ldc \ ); \ \ /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \ bli_param_map_netlib_to_blis_uplo( *uploc, &blis_uploc ); \ bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \ bli_param_map_netlib_to_blis_trans( *transb, &blis_transb ); \ \ /* Typecast BLAS integers to BLIS integers. */ \ bli_convert_blas_dim1( *m, m0 ); \ bli_convert_blas_dim1( *k, k0 ); \ \ /* Set the row and column strides of the matrix operands.
*/ \ const inc_t rs_a = 1; \ const inc_t cs_a = *lda; \ const inc_t rs_b = 1; \ const inc_t cs_b = *ldb; \ const inc_t rs_c = 1; \ const inc_t cs_c = *ldc; \ \ const num_t dt = PASTEMAC(ch,type); \ \ /* C is tagged with symmetric structure below; together with the uplo setting this presumably restricts the update to one triangle - confirm against the gemmt implementation. */ \ const struc_t strucc = BLIS_SYMMETRIC; \ \ obj_t alphao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t ao = BLIS_OBJECT_INITIALIZER; \ obj_t bo = BLIS_OBJECT_INITIALIZER; \ obj_t betao = BLIS_OBJECT_INITIALIZER_1X1; \ obj_t co = BLIS_OBJECT_INITIALIZER; \ \ dim_t m0_a, n0_a; \ dim_t m0_b, n0_b; \ \ /* Derive the dimensions used to initialize A and B from (m0,k0) and (k0,m0); m0 plays the role of both outer dimensions. */ \ bli_set_dims_with_trans( blis_transa, m0, k0, &m0_a, &n0_a ); \ bli_set_dims_with_trans( blis_transb, k0, m0, &m0_b, &n0_b ); \ \ bli_obj_init_finish_1x1( dt, (ftype*)alpha, &alphao ); \ bli_obj_init_finish_1x1( dt, (ftype*)beta, &betao ); \ \ /* C is initialized as a square m0-by-m0 object. */ \ bli_obj_init_finish( dt, m0_a, n0_a, (ftype*)a, rs_a, cs_a, &ao ); \ bli_obj_init_finish( dt, m0_b, n0_b, (ftype*)b, rs_b, cs_b, &bo ); \ bli_obj_init_finish( dt, m0, m0, (ftype*)c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( blis_uploc, &co ); \ bli_obj_set_conjtrans( blis_transa, &ao ); \ bli_obj_set_conjtrans( blis_transb, &bo ); \ \ bli_obj_set_struc( strucc, &co ); \ \ PASTEMAC(blisname,BLIS_OAPI_EX_SUF) \ ( \ &alphao, \ &ao, \ &bo, \ &betao, \ &co, \ NULL, \ NULL \ ); \ \ /* Finalize BLIS. */ \ bli_finalize_auto(); \ } #endif #ifdef BLIS_ENABLE_BLAS INSERT_GENTFUNC_BLAS( gemmt, gemmt ) #endif cython-blis-0.9.1/blis/_src/frame/compat/extra/bla_gemmt.h000066400000000000000000000044161427272030600234430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-to-BLIS interfaces. // /* GENTPROT expands to the exported (BLIS_EXPORT_BLAS) prototype of PASTEF77(ch,blasname) for one datatype: an uplo character for C, two trans characters, the dimensions m and k, the scalars alpha and beta, and the A, B, C buffers with their leading dimensions. All arguments are passed by pointer. Prototypes are instantiated only when BLIS_ENABLE_BLAS is defined. */ #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemmt ) #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/000077500000000000000000000000001427272030600206655ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_cabs1.c000066400000000000000000000044531427272030600226460ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* scabs1.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Single-precision variant: returns bli_fabs of the real part of *z plus bli_fabs of its imaginary part, i.e. |Re(z)| + |Im(z)|. z is only read. */ /* Subroutine */ bla_real PASTEF77(s,cabs1)(bla_scomplex *z) { return bli_fabs( bli_creal( *z ) ) + bli_fabs( bli_cimag( *z ) ); } /* scabs1_ */ /* dcabs1.f -- translated by f2c (version 19991025).
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Double-precision variant: returns bli_fabs of the real part of *z plus bli_fabs of its imaginary part, i.e. |Re(z)| + |Im(z)|. z is only read. */ /* Subroutine */ bla_double PASTEF77(d,cabs1)(bla_dcomplex *z) { return bli_fabs( bli_zreal( *z ) ) + bli_fabs( bli_zimag( *z ) ); } /* dcabs1_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_cabs1.h000066400000000000000000000034521427272030600226510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #ifdef BLIS_ENABLE_BLAS /* Exported prototypes of the single- and double-precision cabs1 routines; each takes a pointer to one complex value and returns a real of the matching precision. Declared only when BLIS_ENABLE_BLAS is defined. */ BLIS_EXPORT_BLAS bla_real PASTEF77(s,cabs1)(bla_scomplex *z); BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_gbmv.c000066400000000000000000001450771427272030600226200ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* cgbmv.f -- translated by f2c (version 19991025).
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(c,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy) { /* System generated locals */ bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6; bla_scomplex q__1, q__2, q__3; /* Builtin functions */ //void bla_r_cnjg(bla_scomplex *, bla_scomplex *); /* Local variables */ bla_integer info; bla_scomplex temp; bla_integer lenx, leny, i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer ix, iy, jx, jy, kx, ky; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical noconj; bla_integer kup1; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* CGBMV performs one of the matrix-vector operations */ /* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or */ /* y := alpha*conjg( A' )*x + beta*y, */ /* where alpha and beta are scalars, x and y are vectors and A is an */ /* m by n band matrix, with kl sub-diagonals and ku super-diagonals. */ /* Parameters */ /* ========== */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the operation to be performed as */ /* follows: */ /* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. */ /* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. */ /* TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. */ /* Unchanged on exit. */ /* M - INTEGER. */ /* On entry, M specifies the number of rows of the matrix A. */ /* M must be at least zero. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the number of columns of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit.
*/ /* KL - INTEGER. */ /* On entry, KL specifies the number of sub-diagonals of the */ /* matrix A. KL must satisfy 0 .le. KL. */ /* Unchanged on exit. */ /* KU - INTEGER. */ /* On entry, KU specifies the number of super-diagonals of the */ /* matrix A. KU must satisfy 0 .le. KU. */ /* Unchanged on exit. */ /* ALPHA - COMPLEX . */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* A - COMPLEX array of DIMENSION ( LDA, n ). */ /* Before entry, the leading ( kl + ku + 1 ) by n part of the */ /* array A must contain the matrix of coefficients, supplied */ /* column by column, with the leading diagonal of the matrix in */ /* row ( ku + 1 ) of the array, the first super-diagonal */ /* starting at position 2 in row ku, the first sub-diagonal */ /* starting at position 1 in row ( ku + 2 ), and so on. */ /* Elements in the array A that do not correspond to elements */ /* in the band matrix (such as the top left ku by ku triangle) */ /* are not referenced. */ /* The following program segment will transfer a band matrix */ /* from conventional full matrix storage to band storage: */ /* DO 20, J = 1, N */ /* K = KU + 1 - J */ /* DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) */ /* A( K + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Unchanged on exit. */ /* LDA - INTEGER. */ /* On entry, LDA specifies the first dimension of A as declared */ /* in the calling (sub) program. LDA must be at least */ /* ( kl + ku + 1 ). */ /* Unchanged on exit. */ /* X - COMPLEX array of DIMENSION at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' */ /* and at least */ /* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. */ /* Before entry, the incremented array X must contain the */ /* vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* BETA - COMPLEX . */ /* On entry, BETA specifies the scalar beta.
When BETA is */ /* supplied as zero then Y need not be set on input. */ /* Unchanged on exit. */ /* Y - COMPLEX array of DIMENSION at least */ /* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' */ /* and at least */ /* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. */ /* Before entry, the incremented array Y must contain the */ /* vector y. On exit, Y is overwritten by the updated vector y. */ /* INCY - INTEGER. */ /* On entry, INCY specifies the increment for the elements of */ /* Y. INCY must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments: shift a, x and y so that the 1-based indices used below (a[i + j*a_dim1], x[i], y[i]) address the caller's arrays. */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; --y; /* Function Body */ info = 0; if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", ( ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (ftnlen)1) ) { info = 1; } else if (*m < 0) { info = 2; } else if (*n < 0) { info = 3; } else if (*kl < 0) { info = 4; } else if (*ku < 0) { info = 5; } else if (*lda < *kl + *ku + 1) { info = 8; } else if (*incx == 0) { info = 10; } else if (*incy == 0) { info = 13; } /* info is the 1-based position of the first offending argument; report it through xerbla and return without modifying y. */ if (info != 0) { PASTEF770(xerbla)("CGBMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*m == 0 || *n == 0 || (bli_creal(*alpha) == 0.f && bli_cimag(*alpha) == 0.f && (bli_creal(*beta) == 1.f && bli_cimag(*beta) == 0.f))) { return 0; } /* noconj holds the result of comparing trans with "T"; when it is false, the transposed branches below conjugate each element of A with bla_r_cnjg. */ noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1); /* Set LENX and LENY, the lengths of the vectors x and y, and set */ /* up the start points in X and Y.
*/ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { lenx = *n; leny = *m; } else { lenx = *m; leny = *n; } if (*incx > 0) { kx = 1; } else { kx = 1 - (lenx - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (leny - 1) * *incy; } /* Start the operations. In this version the elements of A are */ /* accessed sequentially with one pass through the band part of A. */ /* First form y := beta*y. */ if (bli_creal(*beta) != 1.f || bli_cimag(*beta) != 0.f) { if (*incy == 1) { if (bli_creal(*beta) == 0.f && bli_cimag(*beta) == 0.f) { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; bli_csets( (0.f), (0.f), y[i__2] ); /* L10: */ } } else { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; i__3 = i__; bli_csets( (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] ); /* L20: */ } } } else { iy = ky; if (bli_creal(*beta) == 0.f && bli_cimag(*beta) == 0.f) { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = iy; bli_csets( (0.f), (0.f), y[i__2] ); iy += *incy; /* L30: */ } } else { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = iy; i__3 = iy; bli_csets( (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] ); iy += *incy; /* L40: */ } } } } /* With alpha equal to zero, the y scaled by beta above is already the result. */ if (bli_creal(*alpha) == 0.f && bli_cimag(*alpha) == 0.f) { return 0; } kup1 = *ku + 1; if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form y := alpha*A*x + y.
*/ jx = kx; if (*incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) { i__2 = jx; bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); k = kup1 - j; /* Computing MAX */ i__2 = 1, i__3 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__4 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { i__2 = i__; i__3 = i__; i__5 = k + i__ + j * a_dim1; bli_csets( (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 ); bli_csets( (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] ); /* L50: */ } } jx += *incx; /* L60: */ } } else { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__4 = jx; if (bli_creal(x[i__4]) != 0.f || bli_cimag(x[i__4]) != 0.f) { i__4 = jx; bli_csets( (bli_creal(*alpha) * bli_creal(x[i__4]) - bli_cimag(*alpha) * bli_cimag(x[i__4])), (bli_creal(*alpha) * bli_cimag(x[i__4]) + bli_cimag(*alpha) * bli_creal(x[i__4])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); iy = ky; k = kup1 - j; /* Computing MAX */ i__4 = 1, i__2 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__3 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { i__4 = iy; i__2 = iy; i__5 = k + i__ + j * a_dim1; bli_csets( (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 ); bli_csets( (bli_creal(y[i__2]) + bli_creal(q__2)), (bli_cimag(y[i__2]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] ); iy += *incy; /* L70: */ }
} jx += *incx; if (j > *ku) { ky += *incy; } /* L80: */ } } } else { /* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. */ jy = ky; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { bli_csets( (0.f), (0.f), temp ); k = kup1 - j; if (noconj) { /* Computing MAX */ i__3 = 1, i__4 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__2 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__3,i__4); i__ <= i__2; ++i__) { i__3 = k + i__ + j * a_dim1; i__4 = i__; bli_csets( (bli_creal(a[i__3]) * bli_creal(x[i__4]) - bli_cimag(a[i__3]) * bli_cimag(x[i__4])), (bli_creal(a[i__3]) * bli_cimag(x[i__4]) + bli_cimag(a[i__3]) * bli_creal(x[i__4])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); /* L90: */ } } else { /* Computing MAX */ i__2 = 1, i__3 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__4 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { bla_r_cnjg(&q__3, &a[k + i__ + j * a_dim1]); i__2 = i__; bli_csets( (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); /* L100: */ } } i__4 = jy; i__2 = jy; bli_csets( (bli_creal(*alpha) * bli_creal(temp) - bli_cimag(*alpha) * bli_cimag(temp)), (bli_creal(*alpha) * bli_cimag(temp) + bli_cimag(*alpha) * bli_creal(temp)), q__2 ); bli_csets( (bli_creal(y[i__2]) + bli_creal(q__2)), (bli_cimag(y[i__2]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] ); jy += *incy; /* L110: */ } } else { i__1 = *n; for (j = 1; j <= i__1; ++j) { bli_csets( (0.f), (0.f), temp ); ix = kx; k = kup1 - j; if (noconj) { /* Computing MAX */ i__4 = 1, i__2 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j +
*kl; i__3 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { i__4 = k + i__ + j * a_dim1; i__2 = ix; bli_csets( (bli_creal(a[i__4]) * bli_creal(x[i__2]) - bli_cimag(a[i__4]) * bli_cimag(x[i__2])), (bli_creal(a[i__4]) * bli_cimag(x[i__2]) + bli_cimag(a[i__4]) * bli_creal(x[i__2])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ix += *incx; /* L120: */ } } else { /* Computing MAX */ i__3 = 1, i__4 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__2 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__3,i__4); i__ <= i__2; ++i__) { bla_r_cnjg(&q__3, &a[k + i__ + j * a_dim1]); i__3 = ix; bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ix += *incx; /* L130: */ } } i__2 = jy; i__3 = jy; bli_csets( (bli_creal(*alpha) * bli_creal(temp) - bli_cimag(*alpha) * bli_cimag(temp)), (bli_creal(*alpha) * bli_cimag(temp) + bli_cimag(*alpha) * bli_creal(temp)), q__2 ); bli_csets( (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] ); jy += *incy; if (j > *ku) { kx += *incx; } /* L140: */ } } } return 0; /* End of CGBMV . */ } /* cgbmv_ */ /* dgbmv.f -- translated by f2c (version 19991025).
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(d,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy) { /* System generated locals */ bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6; /* Local variables */ bla_integer info; bla_double temp; bla_integer lenx, leny, i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer ix, iy, jx, jy, kx, ky; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_integer kup1; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* DGBMV performs one of the matrix-vector operations */ /* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, */ /* where alpha and beta are scalars, x and y are vectors and A is an */ /* m by n band matrix, with kl sub-diagonals and ku super-diagonals. */ /* Parameters */ /* ========== */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the operation to be performed as */ /* follows: */ /* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. */ /* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. */ /* TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. */ /* Unchanged on exit. */ /* M - INTEGER. */ /* On entry, M specifies the number of rows of the matrix A. */ /* M must be at least zero. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the number of columns of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* KL - INTEGER. */ /* On entry, KL specifies the number of sub-diagonals of the */ /* matrix A. KL must satisfy 0 .le. KL. */ /* Unchanged on exit. */ /* KU - INTEGER. 
*/ /* On entry, KU specifies the number of super-diagonals of the */ /* matrix A. KU must satisfy 0 .le. KU. */ /* Unchanged on exit. */ /* ALPHA - DOUBLE PRECISION. */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */ /* Before entry, the leading ( kl + ku + 1 ) by n part of the */ /* array A must contain the matrix of coefficients, supplied */ /* column by column, with the leading diagonal of the matrix in */ /* row ( ku + 1 ) of the array, the first super-diagonal */ /* starting at position 2 in row ku, the first sub-diagonal */ /* starting at position 1 in row ( ku + 2 ), and so on. */ /* Elements in the array A that do not correspond to elements */ /* in the band matrix (such as the top left ku by ku triangle) */ /* are not referenced. */ /* The following program segment will transfer a band matrix */ /* from conventional full matrix storage to band storage: */ /* DO 20, J = 1, N */ /* K = KU + 1 - J */ /* DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) */ /* A( K + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Unchanged on exit. */ /* LDA - INTEGER. */ /* On entry, LDA specifies the first dimension of A as declared */ /* in the calling (sub) program. LDA must be at least */ /* ( kl + ku + 1 ). */ /* Unchanged on exit. */ /* X - DOUBLE PRECISION array of DIMENSION at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' */ /* and at least */ /* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. */ /* Before entry, the incremented array X must contain the */ /* vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* BETA - DOUBLE PRECISION. */ /* On entry, BETA specifies the scalar beta. When BETA is */ /* supplied as zero then Y need not be set on input. */ /* Unchanged on exit. 
*/ /* Y - DOUBLE PRECISION array of DIMENSION at least */ /* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' */ /* and at least */ /* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. */ /* Before entry, the incremented array Y must contain the */ /* vector y. On exit, Y is overwritten by the updated vector y. */ /* INCY - INTEGER. */ /* On entry, INCY specifies the increment for the elements of */ /* Y. INCY must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; --y; /* Function Body */ info = 0; if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", ( ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (ftnlen)1) ) { info = 1; } else if (*m < 0) { info = 2; } else if (*n < 0) { info = 3; } else if (*kl < 0) { info = 4; } else if (*ku < 0) { info = 5; } else if (*lda < *kl + *ku + 1) { info = 8; } else if (*incx == 0) { info = 10; } else if (*incy == 0) { info = 13; } if (info != 0) { PASTEF770(xerbla)("DGBMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*m == 0 || *n == 0 || (*alpha == 0. && *beta == 1.)) { return 0; } /* Set LENX and LENY, the lengths of the vectors x and y, and set */ /* up the start points in X and Y. 
*/ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { lenx = *n; leny = *m; } else { lenx = *m; leny = *n; } if (*incx > 0) { kx = 1; } else { kx = 1 - (lenx - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (leny - 1) * *incy; } /* Start the operations. In this version the elements of A are */ /* accessed sequentially with one pass through the band part of A. */ /* First form y := beta*y. */ if (*beta != 1.) { if (*incy == 1) { if (*beta == 0.) { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = 0.; /* L10: */ } } else { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = *beta * y[i__]; /* L20: */ } } } else { iy = ky; if (*beta == 0.) { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { y[iy] = 0.; iy += *incy; /* L30: */ } } else { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { y[iy] = *beta * y[iy]; iy += *incy; /* L40: */ } } } } if (*alpha == 0.) { return 0; } kup1 = *ku + 1; if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form y := alpha*A*x + y. */ jx = kx; if (*incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.) { temp = *alpha * x[jx]; k = kup1 - j; /* Computing MAX */ i__2 = 1, i__3 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__4 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { y[i__] += temp * a[k + i__ + j * a_dim1]; /* L50: */ } } jx += *incx; /* L60: */ } } else { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.) { temp = *alpha * x[jx]; iy = ky; k = kup1 - j; /* Computing MAX */ i__4 = 1, i__2 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__3 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { y[iy] += temp * a[k + i__ + j * a_dim1]; iy += *incy; /* L70: */ } } jx += *incx; if (j > *ku) { ky += *incy; } /* L80: */ } } } else { /* Form y := alpha*A'*x + y. 
*/ jy = ky; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = 0.; k = kup1 - j; /* Computing MAX */ i__3 = 1, i__4 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__2 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__3,i__4); i__ <= i__2; ++i__) { temp += a[k + i__ + j * a_dim1] * x[i__]; /* L90: */ } y[jy] += *alpha * temp; jy += *incy; /* L100: */ } } else { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = 0.; ix = kx; k = kup1 - j; /* Computing MAX */ i__2 = 1, i__3 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__4 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { temp += a[k + i__ + j * a_dim1] * x[ix]; ix += *incx; /* L110: */ } y[jy] += *alpha * temp; jy += *incy; if (j > *ku) { kx += *incx; } /* L120: */ } } } return 0; /* End of DGBMV . */ } /* dgbmv_ */ /* sgbmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(s,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer * incx, const bla_real *beta, bla_real *y, const bla_integer *incy) { /* System generated locals */ bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6; /* Local variables */ bla_integer info; bla_real temp; bla_integer lenx, leny, i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer ix, iy, jx, jy, kx, ky; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_integer kup1; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. 
*/ /* Purpose */ /* ======= */ /* SGBMV performs one of the matrix-vector operations */ /* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, */ /* where alpha and beta are scalars, x and y are vectors and A is an */ /* m by n band matrix, with kl sub-diagonals and ku super-diagonals. */ /* Parameters */ /* ========== */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the operation to be performed as */ /* follows: */ /* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. */ /* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. */ /* TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. */ /* Unchanged on exit. */ /* M - INTEGER. */ /* On entry, M specifies the number of rows of the matrix A. */ /* M must be at least zero. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the number of columns of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* KL - INTEGER. */ /* On entry, KL specifies the number of sub-diagonals of the */ /* matrix A. KL must satisfy 0 .le. KL. */ /* Unchanged on exit. */ /* KU - INTEGER. */ /* On entry, KU specifies the number of super-diagonals of the */ /* matrix A. KU must satisfy 0 .le. KU. */ /* Unchanged on exit. */ /* ALPHA - REAL . */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* A - REAL array of DIMENSION ( LDA, n ). */ /* Before entry, the leading ( kl + ku + 1 ) by n part of the */ /* array A must contain the matrix of coefficients, supplied */ /* column by column, with the leading diagonal of the matrix in */ /* row ( ku + 1 ) of the array, the first super-diagonal */ /* starting at position 2 in row ku, the first sub-diagonal */ /* starting at position 1 in row ( ku + 2 ), and so on. */ /* Elements in the array A that do not correspond to elements */ /* in the band matrix (such as the top left ku by ku triangle) */ /* are not referenced. 
*/ /* The following program segment will transfer a band matrix */ /* from conventional full matrix storage to band storage: */ /* DO 20, J = 1, N */ /* K = KU + 1 - J */ /* DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) */ /* A( K + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Unchanged on exit. */ /* LDA - INTEGER. */ /* On entry, LDA specifies the first dimension of A as declared */ /* in the calling (sub) program. LDA must be at least */ /* ( kl + ku + 1 ). */ /* Unchanged on exit. */ /* X - REAL array of DIMENSION at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' */ /* and at least */ /* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. */ /* Before entry, the incremented array X must contain the */ /* vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* BETA - REAL . */ /* On entry, BETA specifies the scalar beta. When BETA is */ /* supplied as zero then Y need not be set on input. */ /* Unchanged on exit. */ /* Y - REAL array of DIMENSION at least */ /* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' */ /* and at least */ /* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. */ /* Before entry, the incremented array Y must contain the */ /* vector y. On exit, Y is overwritten by the updated vector y. */ /* INCY - INTEGER. */ /* On entry, INCY specifies the increment for the elements of */ /* Y. INCY must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. 
*/ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; --y; /* Function Body */ info = 0; if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", ( ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (ftnlen)1) ) { info = 1; } else if (*m < 0) { info = 2; } else if (*n < 0) { info = 3; } else if (*kl < 0) { info = 4; } else if (*ku < 0) { info = 5; } else if (*lda < *kl + *ku + 1) { info = 8; } else if (*incx == 0) { info = 10; } else if (*incy == 0) { info = 13; } if (info != 0) { PASTEF770(xerbla)("SGBMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*m == 0 || *n == 0 || (*alpha == 0.f && *beta == 1.f)) { return 0; } /* Set LENX and LENY, the lengths of the vectors x and y, and set */ /* up the start points in X and Y. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { lenx = *n; leny = *m; } else { lenx = *m; leny = *n; } if (*incx > 0) { kx = 1; } else { kx = 1 - (lenx - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (leny - 1) * *incy; } /* Start the operations. In this version the elements of A are */ /* accessed sequentially with one pass through the band part of A. */ /* First form y := beta*y. */ if (*beta != 1.f) { if (*incy == 1) { if (*beta == 0.f) { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = 0.f; /* L10: */ } } else { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = *beta * y[i__]; /* L20: */ } } } else { iy = ky; if (*beta == 0.f) { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { y[iy] = 0.f; iy += *incy; /* L30: */ } } else { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { y[iy] = *beta * y[iy]; iy += *incy; /* L40: */ } } } } if (*alpha == 0.f) { return 0; } kup1 = *ku + 1; if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form y := alpha*A*x + y. 
*/ jx = kx; if (*incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.f) { temp = *alpha * x[jx]; k = kup1 - j; /* Computing MAX */ i__2 = 1, i__3 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__4 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { y[i__] += temp * a[k + i__ + j * a_dim1]; /* L50: */ } } jx += *incx; /* L60: */ } } else { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.f) { temp = *alpha * x[jx]; iy = ky; k = kup1 - j; /* Computing MAX */ i__4 = 1, i__2 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__3 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { y[iy] += temp * a[k + i__ + j * a_dim1]; iy += *incy; /* L70: */ } } jx += *incx; if (j > *ku) { ky += *incy; } /* L80: */ } } } else { /* Form y := alpha*A'*x + y. */ jy = ky; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = 0.f; k = kup1 - j; /* Computing MAX */ i__3 = 1, i__4 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__2 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__3,i__4); i__ <= i__2; ++i__) { temp += a[k + i__ + j * a_dim1] * x[i__]; /* L90: */ } y[jy] += *alpha * temp; jy += *incy; /* L100: */ } } else { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = 0.f; ix = kx; k = kup1 - j; /* Computing MAX */ i__2 = 1, i__3 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__4 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { temp += a[k + i__ + j * a_dim1] * x[ix]; ix += *incx; /* L110: */ } y[jy] += *alpha * temp; jy += *incy; if (j > *ku) { kx += *incx; } /* L120: */ } } } return 0; /* End of SGBMV . */ } /* sgbmv_ */ /* zgbmv.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(z,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex * y, const bla_integer *incy) { /* System generated locals */ bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6; bla_dcomplex z__1, z__2, z__3; /* Builtin functions */ //void bla_d_cnjg(bla_dcomplex *, bla_dcomplex *); /* Local variables */ bla_integer info; bla_dcomplex temp; bla_integer lenx, leny, i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer ix, iy, jx, jy, kx, ky; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical noconj; bla_integer kup1; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* ZGBMV performs one of the matrix-vector operations */ /* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or */ /* y := alpha*conjg( A' )*x + beta*y, */ /* where alpha and beta are scalars, x and y are vectors and A is an */ /* m by n band matrix, with kl sub-diagonals and ku super-diagonals. */ /* Parameters */ /* ========== */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the operation to be performed as */ /* follows: */ /* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. */ /* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. */ /* TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. */ /* Unchanged on exit. */ /* M - INTEGER. */ /* On entry, M specifies the number of rows of the matrix A. */ /* M must be at least zero. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the number of columns of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. 
*/ /* KL - INTEGER. */ /* On entry, KL specifies the number of sub-diagonals of the */ /* matrix A. KL must satisfy 0 .le. KL. */ /* Unchanged on exit. */ /* KU - INTEGER. */ /* On entry, KU specifies the number of super-diagonals of the */ /* matrix A. KU must satisfy 0 .le. KU. */ /* Unchanged on exit. */ /* ALPHA - COMPLEX*16 . */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */ /* Before entry, the leading ( kl + ku + 1 ) by n part of the */ /* array A must contain the matrix of coefficients, supplied */ /* column by column, with the leading diagonal of the matrix in */ /* row ( ku + 1 ) of the array, the first super-diagonal */ /* starting at position 2 in row ku, the first sub-diagonal */ /* starting at position 1 in row ( ku + 2 ), and so on. */ /* Elements in the array A that do not correspond to elements */ /* in the band matrix (such as the top left ku by ku triangle) */ /* are not referenced. */ /* The following program segment will transfer a band matrix */ /* from conventional full matrix storage to band storage: */ /* DO 20, J = 1, N */ /* K = KU + 1 - J */ /* DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) */ /* A( K + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Unchanged on exit. */ /* LDA - INTEGER. */ /* On entry, LDA specifies the first dimension of A as declared */ /* in the calling (sub) program. LDA must be at least */ /* ( kl + ku + 1 ). */ /* Unchanged on exit. */ /* X - COMPLEX*16 array of DIMENSION at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' */ /* and at least */ /* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. */ /* Before entry, the incremented array X must contain the */ /* vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* BETA - COMPLEX*16 . 
*/ /* On entry, BETA specifies the scalar beta. When BETA is */ /* supplied as zero then Y need not be set on input. */ /* Unchanged on exit. */ /* Y - COMPLEX*16 array of DIMENSION at least */ /* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' */ /* and at least */ /* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. */ /* Before entry, the incremented array Y must contain the */ /* vector y. On exit, Y is overwritten by the updated vector y. */ /* INCY - INTEGER. */ /* On entry, INCY specifies the increment for the elements of */ /* Y. INCY must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; --y; /* Function Body */ info = 0; if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", ( ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (ftnlen)1) ) { info = 1; } else if (*m < 0) { info = 2; } else if (*n < 0) { info = 3; } else if (*kl < 0) { info = 4; } else if (*ku < 0) { info = 5; } else if (*lda < *kl + *ku + 1) { info = 8; } else if (*incx == 0) { info = 10; } else if (*incy == 0) { info = 13; } if (info != 0) { PASTEF770(xerbla)("ZGBMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*m == 0 || *n == 0 || (bli_zreal(*alpha) == 0. && bli_zimag(*alpha) == 0. && (bli_zreal(*beta) == 1. 
&& bli_zimag(*beta) == 0.))) { return 0; } noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1); /* Set LENX and LENY, the lengths of the vectors x and y, and set */ /* up the start points in X and Y. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { lenx = *n; leny = *m; } else { lenx = *m; leny = *n; } if (*incx > 0) { kx = 1; } else { kx = 1 - (lenx - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (leny - 1) * *incy; } /* Start the operations. In this version the elements of A are */ /* accessed sequentially with one pass through the band part of A. */ /* First form y := beta*y. */ if (bli_zreal(*beta) != 1. || bli_zimag(*beta) != 0.) { if (*incy == 1) { if (bli_zreal(*beta) == 0. && bli_zimag(*beta) == 0.) { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; bli_zsets( (0.), (0.), y[i__2] ); /* L10: */ } } else { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; i__3 = i__; bli_zsets( (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] ); /* L20: */ } } } else { iy = ky; if (bli_zreal(*beta) == 0. && bli_zimag(*beta) == 0.) { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = iy; bli_zsets( (0.), (0.), y[i__2] ); iy += *incy; /* L30: */ } } else { i__1 = leny; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = iy; i__3 = iy; bli_zsets( (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] ); iy += *incy; /* L40: */ } } } } if (bli_zreal(*alpha) == 0. && bli_zimag(*alpha) == 0.) { return 0; } kup1 = *ku + 1; if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form y := alpha*A*x + y. 
*/ jx = kx; if (*incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) { i__2 = jx; bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); k = kup1 - j; /* Computing MAX */ i__2 = 1, i__3 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__4 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { i__2 = i__; i__3 = i__; i__5 = k + i__ + j * a_dim1; bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 ); bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] ); /* L50: */ } } jx += *incx; /* L60: */ } } else { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__4 = jx; if (bli_zreal(x[i__4]) != 0. || bli_zimag(x[i__4]) != 0.) 
{ i__4 = jx; bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__4]) - bli_zimag(*alpha) * bli_zimag(x[i__4])), (bli_zreal(*alpha) * bli_zimag(x[i__4]) + bli_zimag(*alpha) * bli_zreal(x[i__4])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); iy = ky; k = kup1 - j; /* Computing MAX */ i__4 = 1, i__2 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__3 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { i__4 = iy; i__2 = iy; i__5 = k + i__ + j * a_dim1; bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 ); bli_zsets( (bli_zreal(y[i__2]) + bli_zreal(z__2)), (bli_zimag(y[i__2]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] ); iy += *incy; /* L70: */ } } jx += *incx; if (j > *ku) { ky += *incy; } /* L80: */ } } } else { /* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. */ jy = ky; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { bli_zsets( (0.), (0.), temp ); k = kup1 - j; if (noconj) { /* Computing MAX */ i__3 = 1, i__4 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__2 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__3,i__4); i__ <= i__2; ++i__) { i__3 = k + i__ + j * a_dim1; i__4 = i__; bli_zsets( (bli_zreal(a[i__3]) * bli_zreal(x[i__4]) - bli_zimag(a[i__3]) * bli_zimag(x[i__4])), (bli_zreal(a[i__3]) * bli_zimag(x[i__4]) + bli_zimag(a[i__3]) * bli_zreal(x[i__4])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); /* L90: */ } } else { /* Computing MAX */ i__2 = 1, i__3 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__4 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { bla_d_cnjg(&z__3, &a[k + i__ + j * a_dim1]); i__2 = i__; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * 
bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); /* L100: */ } } i__4 = jy; i__2 = jy; bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp) - bli_zimag(*alpha) * bli_zimag(temp)), (bli_zreal(*alpha) * bli_zimag(temp) + bli_zimag(*alpha) * bli_zreal(temp)), z__2 ); bli_zsets( (bli_zreal(y[i__2]) + bli_zreal(z__2)), (bli_zimag(y[i__2]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] ); jy += *incy; /* L110: */ } } else { i__1 = *n; for (j = 1; j <= i__1; ++j) { bli_zsets( (0.), (0.), temp ); ix = kx; k = kup1 - j; if (noconj) { /* Computing MAX */ i__4 = 1, i__2 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__3 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { i__4 = k + i__ + j * a_dim1; i__2 = ix; bli_zsets( (bli_zreal(a[i__4]) * bli_zreal(x[i__2]) - bli_zimag(a[i__4]) * bli_zimag(x[i__2])), (bli_zreal(a[i__4]) * bli_zimag(x[i__2]) + bli_zimag(a[i__4]) * bli_zreal(x[i__2])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ix += *incx; /* L120: */ } } else { /* Computing MAX */ i__3 = 1, i__4 = j - *ku; /* Computing MIN */ i__5 = *m, i__6 = j + *kl; i__2 = f2c_min(i__5,i__6); for (i__ = f2c_max(i__3,i__4); i__ <= i__2; ++i__) { bla_d_cnjg(&z__3, &a[k + i__ + j * a_dim1]); i__3 = ix; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ix += *incx; /* L130: */ } } i__2 = jy; i__3 = jy; bli_zsets( (bli_zreal(*alpha) * 
bli_zreal(temp) - bli_zimag(*alpha) * bli_zimag(temp)), (bli_zreal(*alpha) * bli_zimag(temp) + bli_zimag(*alpha) * bli_zreal(temp)), z__2 ); bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] ); jy += *incy; if (j > *ku) { kx += *incx; } /* L140: */ } } } return 0; /* End of ZGBMV . */ } /* zgbmv_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_gbmv.h000066400000000000000000000057721427272030600226220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy); BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy); BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer * incx, const bla_real *beta, bla_real *y, const bla_integer *incy); BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex * y, const bla_integer *incy); #endif 
cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_hbmv.c000066400000000000000000001072731427272030600226150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* chbmv.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex * alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy) { /* System generated locals */ bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; bla_real r__1; bla_scomplex q__1, q__2, q__3, q__4; /* Builtin functions */ //void bla_r_cnjg(bla_scomplex *, bla_scomplex *); /* Local variables */ bla_integer info; bla_scomplex temp1, temp2; bla_integer i__, j, l; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kplus1, ix, iy, jx, jy, kx, ky; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* CHBMV performs the matrix-vector operation */ /* y := alpha*A*x + beta*y, */ /* where alpha and beta are scalars, x and y are n element vectors and */ /* A is an n by n hermitian band matrix, with k super-diagonals. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the upper or lower */ /* triangular part of the band matrix A is being supplied as */ /* follows: */ /* UPLO = 'U' or 'u' The upper triangular part of A is */ /* being supplied. */ /* UPLO = 'L' or 'l' The lower triangular part of A is */ /* being supplied. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* K - INTEGER. */ /* On entry, K specifies the number of super-diagonals of the */ /* matrix A. K must satisfy 0 .le. K. */ /* Unchanged on exit. */ /* ALPHA - COMPLEX . */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. 
*/ /* A - COMPLEX array of DIMENSION ( LDA, n ). */ /* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ /* by n part of the array A must contain the upper triangular */ /* band part of the hermitian matrix, supplied column by */ /* column, with the leading diagonal of the matrix in row */ /* ( k + 1 ) of the array, the first super-diagonal starting at */ /* position 2 in row k, and so on. The top left k by k triangle */ /* of the array A is not referenced. */ /* The following program segment will transfer the upper */ /* triangular part of a hermitian band matrix from conventional */ /* full matrix storage to band storage: */ /* DO 20, J = 1, N */ /* M = K + 1 - J */ /* DO 10, I = MAX( 1, J - K ), J */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ /* by n part of the array A must contain the lower triangular */ /* band part of the hermitian matrix, supplied column by */ /* column, with the leading diagonal of the matrix in row 1 of */ /* the array, the first sub-diagonal starting at position 1 in */ /* row 2, and so on. The bottom right k by k triangle of the */ /* array A is not referenced. */ /* The following program segment will transfer the lower */ /* triangular part of a hermitian band matrix from conventional */ /* full matrix storage to band storage: */ /* DO 20, J = 1, N */ /* M = 1 - J */ /* DO 10, I = J, MIN( N, J + K ) */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Note that the imaginary parts of the diagonal elements need */ /* not be set and are assumed to be zero. */ /* Unchanged on exit. */ /* LDA - INTEGER. */ /* On entry, LDA specifies the first dimension of A as declared */ /* in the calling (sub) program. LDA must be at least */ /* ( k + 1 ). */ /* Unchanged on exit. */ /* X - COMPLEX array of DIMENSION at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). 
*/ /* Before entry, the incremented array X must contain the */ /* vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* BETA - COMPLEX . */ /* On entry, BETA specifies the scalar beta. */ /* Unchanged on exit. */ /* Y - COMPLEX array of DIMENSION at least */ /* ( 1 + ( n - 1 )*abs( INCY ) ). */ /* Before entry, the incremented array Y must contain the */ /* vector y. On exit, Y is overwritten by the updated vector y. */ /* INCY - INTEGER. */ /* On entry, INCY specifies the increment for the elements of */ /* Y. INCY must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; --y; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (*n < 0) { info = 2; } else if (*k < 0) { info = 3; } else if (*lda < *k + 1) { info = 6; } else if (*incx == 0) { info = 8; } else if (*incy == 0) { info = 11; } if (info != 0) { PASTEF770(xerbla)("CHBMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0 || (bli_creal(*alpha) == 0.f && bli_cimag(*alpha) == 0.f && (bli_creal(*beta) == 1.f && bli_cimag(*beta) == 0.f))) { return 0; } /* Set up the start points in X and Y. 
*/ if (*incx > 0) { kx = 1; } else { kx = 1 - (*n - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (*n - 1) * *incy; } /* Start the operations. In this version the elements of the array A */ /* are accessed sequentially with one pass through A. */ /* First form y := beta*y. */ if (bli_creal(*beta) != 1.f || bli_cimag(*beta) != 0.f) { if (*incy == 1) { if (bli_creal(*beta) == 0.f && bli_cimag(*beta) == 0.f) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; bli_csets( (0.f), (0.f), y[i__2] ); /* L10: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; i__3 = i__; bli_csets( (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] ); /* L20: */ } } } else { iy = ky; if (bli_creal(*beta) == 0.f && bli_cimag(*beta) == 0.f) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = iy; bli_csets( (0.f), (0.f), y[i__2] ); iy += *incy; /* L30: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = iy; i__3 = iy; bli_csets( (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] ); iy += *incy; /* L40: */ } } } } if (bli_creal(*alpha) == 0.f && bli_cimag(*alpha) == 0.f) { return 0; } if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Form y when upper triangle of A is stored. 
*/ kplus1 = *k + 1; if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 ); bli_csets( (0.f), (0.f), temp2 ); l = kplus1 - j; /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__4 = j - 1; for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { i__2 = i__; i__3 = i__; i__5 = l + i__ + j * a_dim1; bli_csets( (bli_creal(temp1) * bli_creal(a[i__5]) - bli_cimag(temp1) * bli_cimag(a[i__5])), (bli_creal(temp1) * bli_cimag(a[i__5]) + bli_cimag(temp1) * bli_creal(a[i__5])), q__2 ); bli_csets( (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] ); bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); i__2 = i__; bli_csets( (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 ); bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 ); /* L50: */ } i__4 = j; i__2 = j; i__3 = kplus1 + j * a_dim1; r__1 = bli_creal(a[i__3]); bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__3 ); bli_csets( (bli_creal(y[i__2]) + bli_creal(q__3)), (bli_cimag(y[i__2]) + bli_cimag(q__3)), q__2 ); bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__4 ); bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] ); /* L60: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__4 = jx; bli_csets( (bli_creal(*alpha) 
* bli_creal(x[i__4]) - bli_cimag(*alpha) * bli_cimag(x[i__4])), (bli_creal(*alpha) * bli_cimag(x[i__4]) + bli_cimag(*alpha) * bli_creal(x[i__4])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 ); bli_csets( (0.f), (0.f), temp2 ); ix = kx; iy = ky; l = kplus1 - j; /* Computing MAX */ i__4 = 1, i__2 = j - *k; i__3 = j - 1; for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { i__4 = iy; i__2 = iy; i__5 = l + i__ + j * a_dim1; bli_csets( (bli_creal(temp1) * bli_creal(a[i__5]) - bli_cimag(temp1) * bli_cimag(a[i__5])), (bli_creal(temp1) * bli_cimag(a[i__5]) + bli_cimag(temp1) * bli_creal(a[i__5])), q__2 ); bli_csets( (bli_creal(y[i__2]) + bli_creal(q__2)), (bli_cimag(y[i__2]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] ); bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); i__4 = ix; bli_csets( (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 ); bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 ); ix += *incx; iy += *incy; /* L70: */ } i__3 = jy; i__4 = jy; i__2 = kplus1 + j * a_dim1; r__1 = bli_creal(a[i__2]); bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__3 ); bli_csets( (bli_creal(y[i__4]) + bli_creal(q__3)), (bli_cimag(y[i__4]) + bli_cimag(q__3)), q__2 ); bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__4 ); bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] ); jx += *incx; jy += *incy; if (j > *k) { kx += *incx; ky += *incy; } /* L80: */ } } } else { /* Form y when lower triangle of A is stored. 
*/ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__3 = j; bli_csets( (bli_creal(*alpha) * bli_creal(x[i__3]) - bli_cimag(*alpha) * bli_cimag(x[i__3])), (bli_creal(*alpha) * bli_cimag(x[i__3]) + bli_cimag(*alpha) * bli_creal(x[i__3])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 ); bli_csets( (0.f), (0.f), temp2 ); i__3 = j; i__4 = j; i__2 = j * a_dim1 + 1; r__1 = bli_creal(a[i__2]); bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__2 ); bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] ); l = 1 - j; /* Computing MIN */ i__4 = *n, i__2 = j + *k; i__3 = f2c_min(i__4,i__2); for (i__ = j + 1; i__ <= i__3; ++i__) { i__4 = i__; i__2 = i__; i__5 = l + i__ + j * a_dim1; bli_csets( (bli_creal(temp1) * bli_creal(a[i__5]) - bli_cimag(temp1) * bli_cimag(a[i__5])), (bli_creal(temp1) * bli_cimag(a[i__5]) + bli_cimag(temp1) * bli_creal(a[i__5])), q__2 ); bli_csets( (bli_creal(y[i__2]) + bli_creal(q__2)), (bli_cimag(y[i__2]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] ); bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); i__4 = i__; bli_csets( (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 ); bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 ); /* L90: */ } i__3 = j; i__4 = j; bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__2 ); bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] ); /* L100: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 
1; j <= i__1; ++j) { i__3 = jx; bli_csets( (bli_creal(*alpha) * bli_creal(x[i__3]) - bli_cimag(*alpha) * bli_cimag(x[i__3])), (bli_creal(*alpha) * bli_cimag(x[i__3]) + bli_cimag(*alpha) * bli_creal(x[i__3])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 ); bli_csets( (0.f), (0.f), temp2 ); i__3 = jy; i__4 = jy; i__2 = j * a_dim1 + 1; r__1 = bli_creal(a[i__2]); bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__2 ); bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] ); l = 1 - j; ix = jx; iy = jy; /* Computing MIN */ i__4 = *n, i__2 = j + *k; i__3 = f2c_min(i__4,i__2); for (i__ = j + 1; i__ <= i__3; ++i__) { ix += *incx; iy += *incy; i__4 = iy; i__2 = iy; i__5 = l + i__ + j * a_dim1; bli_csets( (bli_creal(temp1) * bli_creal(a[i__5]) - bli_cimag(temp1) * bli_cimag(a[i__5])), (bli_creal(temp1) * bli_cimag(a[i__5]) + bli_cimag(temp1) * bli_creal(a[i__5])), q__2 ); bli_csets( (bli_creal(y[i__2]) + bli_creal(q__2)), (bli_cimag(y[i__2]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__4] ); bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); i__4 = ix; bli_csets( (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 ); bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 ); /* L110: */ } i__3 = jy; i__4 = jy; bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__2 ); bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] ); jx += *incx; jy += *incy; /* L120: */ } } } return 0; /* End of CHBMV 
. */ } /* chbmv_ */ /* zhbmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer * incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy) { /* System generated locals */ bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; bla_double d__1; bla_dcomplex z__1, z__2, z__3, z__4; /* Builtin functions */ //void bla_d_cnjg(bla_dcomplex *, bla_dcomplex *); /* Local variables */ bla_integer info; bla_dcomplex temp1, temp2; bla_integer i__, j, l; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kplus1, ix, iy, jx, jy, kx, ky; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* ZHBMV performs the matrix-vector operation */ /* y := alpha*A*x + beta*y, */ /* where alpha and beta are scalars, x and y are n element vectors and */ /* A is an n by n hermitian band matrix, with k super-diagonals. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the upper or lower */ /* triangular part of the band matrix A is being supplied as */ /* follows: */ /* UPLO = 'U' or 'u' The upper triangular part of A is */ /* being supplied. */ /* UPLO = 'L' or 'l' The lower triangular part of A is */ /* being supplied. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* K - INTEGER. */ /* On entry, K specifies the number of super-diagonals of the */ /* matrix A. K must satisfy 0 .le. K. */ /* Unchanged on exit. */ /* ALPHA - COMPLEX*16 . 
*/ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */ /* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ /* by n part of the array A must contain the upper triangular */ /* band part of the hermitian matrix, supplied column by */ /* column, with the leading diagonal of the matrix in row */ /* ( k + 1 ) of the array, the first super-diagonal starting at */ /* position 2 in row k, and so on. The top left k by k triangle */ /* of the array A is not referenced. */ /* The following program segment will transfer the upper */ /* triangular part of a hermitian band matrix from conventional */ /* full matrix storage to band storage: */ /* DO 20, J = 1, N */ /* M = K + 1 - J */ /* DO 10, I = MAX( 1, J - K ), J */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ /* by n part of the array A must contain the lower triangular */ /* band part of the hermitian matrix, supplied column by */ /* column, with the leading diagonal of the matrix in row 1 of */ /* the array, the first sub-diagonal starting at position 1 in */ /* row 2, and so on. The bottom right k by k triangle of the */ /* array A is not referenced. */ /* The following program segment will transfer the lower */ /* triangular part of a hermitian band matrix from conventional */ /* full matrix storage to band storage: */ /* DO 20, J = 1, N */ /* M = 1 - J */ /* DO 10, I = J, MIN( N, J + K ) */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Note that the imaginary parts of the diagonal elements need */ /* not be set and are assumed to be zero. */ /* Unchanged on exit. */ /* LDA - INTEGER. */ /* On entry, LDA specifies the first dimension of A as declared */ /* in the calling (sub) program. LDA must be at least */ /* ( k + 1 ). */ /* Unchanged on exit. 
*/ /* X - COMPLEX*16 array of DIMENSION at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the */ /* vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* BETA - COMPLEX*16 . */ /* On entry, BETA specifies the scalar beta. */ /* Unchanged on exit. */ /* Y - COMPLEX*16 array of DIMENSION at least */ /* ( 1 + ( n - 1 )*abs( INCY ) ). */ /* Before entry, the incremented array Y must contain the */ /* vector y. On exit, Y is overwritten by the updated vector y. */ /* INCY - INTEGER. */ /* On entry, INCY specifies the increment for the elements of */ /* Y. INCY must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; --y; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (*n < 0) { info = 2; } else if (*k < 0) { info = 3; } else if (*lda < *k + 1) { info = 6; } else if (*incx == 0) { info = 8; } else if (*incy == 0) { info = 11; } if (info != 0) { PASTEF770(xerbla)("ZHBMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0 || (bli_zreal(*alpha) == 0. && bli_zimag(*alpha) == 0. && (bli_zreal(*beta) == 1. && bli_zimag(*beta) == 0.))) { return 0; } /* Set up the start points in X and Y. 
*/ if (*incx > 0) { kx = 1; } else { kx = 1 - (*n - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (*n - 1) * *incy; } /* Start the operations. In this version the elements of the array A */ /* are accessed sequentially with one pass through A. */ /* First form y := beta*y. */ if (bli_zreal(*beta) != 1. || bli_zimag(*beta) != 0.) { if (*incy == 1) { if (bli_zreal(*beta) == 0. && bli_zimag(*beta) == 0.) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; bli_zsets( (0.), (0.), y[i__2] ); /* L10: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; i__3 = i__; bli_zsets( (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] ); /* L20: */ } } } else { iy = ky; if (bli_zreal(*beta) == 0. && bli_zimag(*beta) == 0.) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = iy; bli_zsets( (0.), (0.), y[i__2] ); iy += *incy; /* L30: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = iy; i__3 = iy; bli_zsets( (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] ); iy += *incy; /* L40: */ } } } } if (bli_zreal(*alpha) == 0. && bli_zimag(*alpha) == 0.) { return 0; } if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Form y when upper triangle of A is stored. 
*/ kplus1 = *k + 1; if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 ); bli_zsets( (0.), (0.), temp2 ); l = kplus1 - j; /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__4 = j - 1; for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { i__2 = i__; i__3 = i__; i__5 = l + i__ + j * a_dim1; bli_zsets( (bli_zreal(temp1) * bli_zreal(a[i__5]) - bli_zimag(temp1) * bli_zimag(a[i__5])), (bli_zreal(temp1) * bli_zimag(a[i__5]) + bli_zimag(temp1) * bli_zreal(a[i__5])), z__2 ); bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] ); bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); i__2 = i__; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 ); bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 ); /* L50: */ } i__4 = j; i__2 = j; i__3 = kplus1 + j * a_dim1; d__1 = bli_zreal(a[i__3]); bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__3 ); bli_zsets( (bli_zreal(y[i__2]) + bli_zreal(z__3)), (bli_zimag(y[i__2]) + bli_zimag(z__3)), z__2 ); bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__4 ); bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] ); /* L60: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__4 = jx; bli_zsets( (bli_zreal(*alpha) * 
bli_zreal(x[i__4]) - bli_zimag(*alpha) * bli_zimag(x[i__4])), (bli_zreal(*alpha) * bli_zimag(x[i__4]) + bli_zimag(*alpha) * bli_zreal(x[i__4])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 ); bli_zsets( (0.), (0.), temp2 ); ix = kx; iy = ky; l = kplus1 - j; /* Computing MAX */ i__4 = 1, i__2 = j - *k; i__3 = j - 1; for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { i__4 = iy; i__2 = iy; i__5 = l + i__ + j * a_dim1; bli_zsets( (bli_zreal(temp1) * bli_zreal(a[i__5]) - bli_zimag(temp1) * bli_zimag(a[i__5])), (bli_zreal(temp1) * bli_zimag(a[i__5]) + bli_zimag(temp1) * bli_zreal(a[i__5])), z__2 ); bli_zsets( (bli_zreal(y[i__2]) + bli_zreal(z__2)), (bli_zimag(y[i__2]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] ); bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); i__4 = ix; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 ); bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 ); ix += *incx; iy += *incy; /* L70: */ } i__3 = jy; i__4 = jy; i__2 = kplus1 + j * a_dim1; d__1 = bli_zreal(a[i__2]); bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__3 ); bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__3)), (bli_zimag(y[i__4]) + bli_zimag(z__3)), z__2 ); bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__4 ); bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] ); jx += *incx; jy += *incy; if (j > *k) { kx += *incx; ky += *incy; } /* L80: */ } } } else { /* Form y when lower triangle of A is stored. 
*/ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__3 = j; bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__3]) - bli_zimag(*alpha) * bli_zimag(x[i__3])), (bli_zreal(*alpha) * bli_zimag(x[i__3]) + bli_zimag(*alpha) * bli_zreal(x[i__3])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 ); bli_zsets( (0.), (0.), temp2 ); i__3 = j; i__4 = j; i__2 = j * a_dim1 + 1; d__1 = bli_zreal(a[i__2]); bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__2 ); bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] ); l = 1 - j; /* Computing MIN */ i__4 = *n, i__2 = j + *k; i__3 = f2c_min(i__4,i__2); for (i__ = j + 1; i__ <= i__3; ++i__) { i__4 = i__; i__2 = i__; i__5 = l + i__ + j * a_dim1; bli_zsets( (bli_zreal(temp1) * bli_zreal(a[i__5]) - bli_zimag(temp1) * bli_zimag(a[i__5])), (bli_zreal(temp1) * bli_zimag(a[i__5]) + bli_zimag(temp1) * bli_zreal(a[i__5])), z__2 ); bli_zsets( (bli_zreal(y[i__2]) + bli_zreal(z__2)), (bli_zimag(y[i__2]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] ); bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); i__4 = i__; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 ); bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 ); /* L90: */ } i__3 = j; i__4 = j; bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__2 ); bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] ); /* L100: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 1; 
j <= i__1; ++j) { i__3 = jx; bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__3]) - bli_zimag(*alpha) * bli_zimag(x[i__3])), (bli_zreal(*alpha) * bli_zimag(x[i__3]) + bli_zimag(*alpha) * bli_zreal(x[i__3])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 ); bli_zsets( (0.), (0.), temp2 ); i__3 = jy; i__4 = jy; i__2 = j * a_dim1 + 1; d__1 = bli_zreal(a[i__2]); bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__2 ); bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] ); l = 1 - j; ix = jx; iy = jy; /* Computing MIN */ i__4 = *n, i__2 = j + *k; i__3 = f2c_min(i__4,i__2); for (i__ = j + 1; i__ <= i__3; ++i__) { ix += *incx; iy += *incy; i__4 = iy; i__2 = iy; i__5 = l + i__ + j * a_dim1; bli_zsets( (bli_zreal(temp1) * bli_zreal(a[i__5]) - bli_zimag(temp1) * bli_zimag(a[i__5])), (bli_zreal(temp1) * bli_zimag(a[i__5]) + bli_zimag(temp1) * bli_zreal(a[i__5])), z__2 ); bli_zsets( (bli_zreal(y[i__2]) + bli_zreal(z__2)), (bli_zimag(y[i__2]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__4] ); bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); i__4 = ix; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 ); bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 ); /* L110: */ } i__3 = jy; i__4 = jy; bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__2 ); bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] ); jx += *incx; jy += *incy; /* L120: */ } } } return 0; /* End of ZHBMV . 
*/ } /* zhbmv_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_hbmv.h000066400000000000000000000044021427272030600226100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifdef BLIS_ENABLE_BLAS

/*
 * Prototypes for the BLAS-compatibility HBMV entry points, exported with
 * BLIS_EXPORT_BLAS and only declared when BLIS_ENABLE_BLAS is defined.
 *
 * NOTE(review): presumably these are the Hermitian *band* matrix-vector
 * products (k and lda describe the band storage of a); the definitions
 * are in bla_hbmv.c -- confirm the argument semantics against them.
 *
 * All arguments are passed by pointer (Fortran calling convention); y is
 * the only argument that is not const-qualified.
 */

/* Single-precision complex variant. */
BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);

/* Double-precision complex variant. */
BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_hpmv.c000066400000000000000000000770761427272030600226420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* chpmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex * ap, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy) { /* System generated locals */ bla_integer i__1, i__2, i__3, i__4, i__5; bla_real r__1; bla_scomplex q__1, q__2, q__3, q__4; /* Builtin functions */ //void bla_r_cnjg(bla_scomplex *, bla_scomplex *); /* Local variables */ bla_integer info; bla_scomplex temp1, temp2; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, iy, jx, jy, kx, ky; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* CHPMV performs the matrix-vector operation */ /* y := alpha*A*x + beta*y, */ /* where alpha and beta are scalars, x and y are n element vectors and */ /* A is an n by n hermitian matrix, supplied in packed form. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. 
*/ /* On entry, UPLO specifies whether the upper or lower */ /* triangular part of the matrix A is supplied in the packed */ /* array AP as follows: */ /* UPLO = 'U' or 'u' The upper triangular part of A is */ /* supplied in AP. */ /* UPLO = 'L' or 'l' The lower triangular part of A is */ /* supplied in AP. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* ALPHA - COMPLEX . */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* AP - COMPLEX array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular part of the hermitian matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ /* and a( 2, 2 ) respectively, and so on. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular part of the hermitian matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ /* and a( 3, 1 ) respectively, and so on. */ /* Note that the imaginary parts of the diagonal elements need */ /* not be set and are assumed to be zero. */ /* Unchanged on exit. */ /* X - COMPLEX array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* BETA - COMPLEX . */ /* On entry, BETA specifies the scalar beta. When BETA is */ /* supplied as zero then Y need not be set on input. */ /* Unchanged on exit. */ /* Y - COMPLEX array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCY ) ). 
*/ /* Before entry, the incremented array Y must contain the n */ /* element vector y. On exit, Y is overwritten by the updated */ /* vector y. */ /* INCY - INTEGER. */ /* On entry, INCY specifies the increment for the elements of */ /* Y. INCY must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ --y; --x; --ap; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (*n < 0) { info = 2; } else if (*incx == 0) { info = 6; } else if (*incy == 0) { info = 9; } if (info != 0) { PASTEF770(xerbla)("CHPMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0 || (bli_creal(*alpha) == 0.f && bli_cimag(*alpha) == 0.f && (bli_creal(*beta) == 1.f && bli_cimag(*beta) == 0.f))) { return 0; } /* Set up the start points in X and Y. */ if (*incx > 0) { kx = 1; } else { kx = 1 - (*n - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (*n - 1) * *incy; } /* Start the operations. In this version the elements of the array AP */ /* are accessed sequentially with one pass through AP. */ /* First form y := beta*y. 
*/ if (bli_creal(*beta) != 1.f || bli_cimag(*beta) != 0.f) { if (*incy == 1) { if (bli_creal(*beta) == 0.f && bli_cimag(*beta) == 0.f) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; bli_csets( (0.f), (0.f), y[i__2] ); /* L10: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; i__3 = i__; bli_csets( (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] ); /* L20: */ } } } else { iy = ky; if (bli_creal(*beta) == 0.f && bli_cimag(*beta) == 0.f) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = iy; bli_csets( (0.f), (0.f), y[i__2] ); iy += *incy; /* L30: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = iy; i__3 = iy; bli_csets( (bli_creal(*beta) * bli_creal(y[i__3]) - bli_cimag(*beta) * bli_cimag(y[i__3])), (bli_creal(*beta) * bli_cimag(y[i__3]) + bli_cimag(*beta) * bli_creal(y[i__3])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] ); iy += *incy; /* L40: */ } } } } if (bli_creal(*alpha) == 0.f && bli_cimag(*alpha) == 0.f) { return 0; } kk = 1; if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Form y when AP contains the upper triangle. 
*/ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 ); bli_csets( (0.f), (0.f), temp2 ); k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { i__3 = i__; i__4 = i__; i__5 = k; bli_csets( (bli_creal(temp1) * bli_creal(ap[i__5]) - bli_cimag(temp1) * bli_cimag(ap[i__5])), (bli_creal(temp1) * bli_cimag(ap[i__5]) + bli_cimag(temp1) * bli_creal(ap[i__5])), q__2 ); bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] ); bla_r_cnjg(&q__3, &ap[k]); i__3 = i__; bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 ); bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 ); ++k; /* L50: */ } i__2 = j; i__3 = j; i__4 = kk + j - 1; r__1 = bli_creal(ap[i__4]); bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__3 ); bli_csets( (bli_creal(y[i__3]) + bli_creal(q__3)), (bli_cimag(y[i__3]) + bli_cimag(q__3)), q__2 ); bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__4 ); bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] ); kk += j; /* L60: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + 
bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 ); bli_csets( (0.f), (0.f), temp2 ); ix = kx; iy = ky; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { i__3 = iy; i__4 = iy; i__5 = k; bli_csets( (bli_creal(temp1) * bli_creal(ap[i__5]) - bli_cimag(temp1) * bli_cimag(ap[i__5])), (bli_creal(temp1) * bli_cimag(ap[i__5]) + bli_cimag(temp1) * bli_creal(ap[i__5])), q__2 ); bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] ); bla_r_cnjg(&q__3, &ap[k]); i__3 = ix; bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 ); bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 ); ix += *incx; iy += *incy; /* L70: */ } i__2 = jy; i__3 = jy; i__4 = kk + j - 1; r__1 = bli_creal(ap[i__4]); bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__3 ); bli_csets( (bli_creal(y[i__3]) + bli_creal(q__3)), (bli_cimag(y[i__3]) + bli_cimag(q__3)), q__2 ); bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__4 ); bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] ); jx += *incx; jy += *incy; kk += j; /* L80: */ } } } else { /* Form y when AP contains the lower triangle. 
*/ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 ); bli_csets( (0.f), (0.f), temp2 ); i__2 = j; i__3 = j; i__4 = kk; r__1 = bli_creal(ap[i__4]); bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__2 ); bli_csets( (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] ); k = kk + 1; i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { i__3 = i__; i__4 = i__; i__5 = k; bli_csets( (bli_creal(temp1) * bli_creal(ap[i__5]) - bli_cimag(temp1) * bli_cimag(ap[i__5])), (bli_creal(temp1) * bli_cimag(ap[i__5]) + bli_cimag(temp1) * bli_creal(ap[i__5])), q__2 ); bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] ); bla_r_cnjg(&q__3, &ap[k]); i__3 = i__; bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 ); bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 ); ++k; /* L90: */ } i__2 = j; i__3 = j; bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__2 ); bli_csets( (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] ); kk += *n - j + 1; /* L100: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) 
- bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 ); bli_csets( (0.f), (0.f), temp2 ); i__2 = jy; i__3 = jy; i__4 = kk; r__1 = bli_creal(ap[i__4]); bli_csets( (r__1 * bli_creal(temp1)), (r__1 * bli_cimag(temp1)), q__2 ); bli_csets( (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] ); ix = jx; iy = jy; i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; iy += *incy; i__3 = iy; i__4 = iy; i__5 = k; bli_csets( (bli_creal(temp1) * bli_creal(ap[i__5]) - bli_cimag(temp1) * bli_cimag(ap[i__5])), (bli_creal(temp1) * bli_cimag(ap[i__5]) + bli_cimag(temp1) * bli_creal(ap[i__5])), q__2 ); bli_csets( (bli_creal(y[i__4]) + bli_creal(q__2)), (bli_cimag(y[i__4]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__3] ); bla_r_cnjg(&q__3, &ap[k]); i__3 = ix; bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 ); bli_csets( (bli_creal(temp2) + bli_creal(q__2)), (bli_cimag(temp2) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 ); /* L110: */ } i__2 = jy; i__3 = jy; bli_csets( (bli_creal(*alpha) * bli_creal(temp2) - bli_cimag(*alpha) * bli_cimag(temp2)), (bli_creal(*alpha) * bli_cimag(temp2) + bli_cimag(*alpha) * bli_creal(temp2)), q__2 ); bli_csets( (bli_creal(y[i__3]) + bli_creal(q__2)), (bli_cimag(y[i__3]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), y[i__2] ); jx += *incx; jy += *incy; kk += *n - j + 1; /* L120: */ } } } return 0; /* End of CHPMV . */ } /* chpmv_ */ /* zhpmv.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy) { /* System generated locals */ bla_integer i__1, i__2, i__3, i__4, i__5; bla_double d__1; bla_dcomplex z__1, z__2, z__3, z__4; /* Builtin functions */ //void bla_d_cnjg(bla_dcomplex *, bla_dcomplex *); /* Local variables */ bla_integer info; bla_dcomplex temp1, temp2; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, iy, jx, jy, kx, ky; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* ZHPMV performs the matrix-vector operation */ /* y := alpha*A*x + beta*y, */ /* where alpha and beta are scalars, x and y are n element vectors and */ /* A is an n by n hermitian matrix, supplied in packed form. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the upper or lower */ /* triangular part of the matrix A is supplied in the packed */ /* array AP as follows: */ /* UPLO = 'U' or 'u' The upper triangular part of A is */ /* supplied in AP. */ /* UPLO = 'L' or 'l' The lower triangular part of A is */ /* supplied in AP. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* ALPHA - COMPLEX*16 . */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* AP - COMPLEX*16 array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). 
*/ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular part of the hermitian matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ /* and a( 2, 2 ) respectively, and so on. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular part of the hermitian matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ /* and a( 3, 1 ) respectively, and so on. */ /* Note that the imaginary parts of the diagonal elements need */ /* not be set and are assumed to be zero. */ /* Unchanged on exit. */ /* X - COMPLEX*16 array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* BETA - COMPLEX*16 . */ /* On entry, BETA specifies the scalar beta. When BETA is */ /* supplied as zero then Y need not be set on input. */ /* Unchanged on exit. */ /* Y - COMPLEX*16 array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCY ) ). */ /* Before entry, the incremented array Y must contain the n */ /* element vector y. On exit, Y is overwritten by the updated */ /* vector y. */ /* INCY - INTEGER. */ /* On entry, INCY specifies the increment for the elements of */ /* Y. INCY must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. 
*/ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ --y; --x; --ap; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (*n < 0) { info = 2; } else if (*incx == 0) { info = 6; } else if (*incy == 0) { info = 9; } if (info != 0) { PASTEF770(xerbla)("ZHPMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0 || (bli_zreal(*alpha) == 0. && bli_zimag(*alpha) == 0. && (bli_zreal(*beta) == 1. && bli_zimag(*beta) == 0.))) { return 0; } /* Set up the start points in X and Y. */ if (*incx > 0) { kx = 1; } else { kx = 1 - (*n - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (*n - 1) * *incy; } /* Start the operations. In this version the elements of the array AP */ /* are accessed sequentially with one pass through AP. */ /* First form y := beta*y. */ if (bli_zreal(*beta) != 1. || bli_zimag(*beta) != 0.) { if (*incy == 1) { if (bli_zreal(*beta) == 0. && bli_zimag(*beta) == 0.) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; bli_zsets( (0.), (0.), y[i__2] ); /* L10: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; i__3 = i__; bli_zsets( (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] ); /* L20: */ } } } else { iy = ky; if (bli_zreal(*beta) == 0. && bli_zimag(*beta) == 0.) 
{ i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = iy; bli_zsets( (0.), (0.), y[i__2] ); iy += *incy; /* L30: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = iy; i__3 = iy; bli_zsets( (bli_zreal(*beta) * bli_zreal(y[i__3]) - bli_zimag(*beta) * bli_zimag(y[i__3])), (bli_zreal(*beta) * bli_zimag(y[i__3]) + bli_zimag(*beta) * bli_zreal(y[i__3])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] ); iy += *incy; /* L40: */ } } } } if (bli_zreal(*alpha) == 0. && bli_zimag(*alpha) == 0.) { return 0; } kk = 1; if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Form y when AP contains the upper triangle. */ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 ); bli_zsets( (0.), (0.), temp2 ); k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { i__3 = i__; i__4 = i__; i__5 = k; bli_zsets( (bli_zreal(temp1) * bli_zreal(ap[i__5]) - bli_zimag(temp1) * bli_zimag(ap[i__5])), (bli_zreal(temp1) * bli_zimag(ap[i__5]) + bli_zimag(temp1) * bli_zreal(ap[i__5])), z__2 ); bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] ); bla_d_cnjg(&z__3, &ap[k]); i__3 = i__; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 ); bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 ); ++k; /* L50: */ } i__2 = j; i__3 = j; i__4 = kk + j - 1; d__1 = bli_zreal(ap[i__4]); bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__3 ); bli_zsets( 
(bli_zreal(y[i__3]) + bli_zreal(z__3)), (bli_zimag(y[i__3]) + bli_zimag(z__3)), z__2 ); bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__4 ); bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] ); kk += j; /* L60: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 ); bli_zsets( (0.), (0.), temp2 ); ix = kx; iy = ky; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { i__3 = iy; i__4 = iy; i__5 = k; bli_zsets( (bli_zreal(temp1) * bli_zreal(ap[i__5]) - bli_zimag(temp1) * bli_zimag(ap[i__5])), (bli_zreal(temp1) * bli_zimag(ap[i__5]) + bli_zimag(temp1) * bli_zreal(ap[i__5])), z__2 ); bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] ); bla_d_cnjg(&z__3, &ap[k]); i__3 = ix; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 ); bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 ); ix += *incx; iy += *incy; /* L70: */ } i__2 = jy; i__3 = jy; i__4 = kk + j - 1; d__1 = bli_zreal(ap[i__4]); bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__3 ); bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__3)), (bli_zimag(y[i__3]) + bli_zimag(z__3)), z__2 ); bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * 
bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__4 ); bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] ); jx += *incx; jy += *incy; kk += j; /* L80: */ } } } else { /* Form y when AP contains the lower triangle. */ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 ); bli_zsets( (0.), (0.), temp2 ); i__2 = j; i__3 = j; i__4 = kk; d__1 = bli_zreal(ap[i__4]); bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__2 ); bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] ); k = kk + 1; i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { i__3 = i__; i__4 = i__; i__5 = k; bli_zsets( (bli_zreal(temp1) * bli_zreal(ap[i__5]) - bli_zimag(temp1) * bli_zimag(ap[i__5])), (bli_zreal(temp1) * bli_zimag(ap[i__5]) + bli_zimag(temp1) * bli_zreal(ap[i__5])), z__2 ); bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] ); bla_d_cnjg(&z__3, &ap[k]); i__3 = i__; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 ); bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 ); ++k; /* L90: */ } i__2 = j; i__3 = j; bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__2 
); bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] ); kk += *n - j + 1; /* L100: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 ); bli_zsets( (0.), (0.), temp2 ); i__2 = jy; i__3 = jy; i__4 = kk; d__1 = bli_zreal(ap[i__4]); bli_zsets( (d__1 * bli_zreal(temp1)), (d__1 * bli_zimag(temp1)), z__2 ); bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] ); ix = jx; iy = jy; i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; iy += *incy; i__3 = iy; i__4 = iy; i__5 = k; bli_zsets( (bli_zreal(temp1) * bli_zreal(ap[i__5]) - bli_zimag(temp1) * bli_zimag(ap[i__5])), (bli_zreal(temp1) * bli_zimag(ap[i__5]) + bli_zimag(temp1) * bli_zreal(ap[i__5])), z__2 ); bli_zsets( (bli_zreal(y[i__4]) + bli_zreal(z__2)), (bli_zimag(y[i__4]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__3] ); bla_d_cnjg(&z__3, &ap[k]); i__3 = ix; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 ); bli_zsets( (bli_zreal(temp2) + bli_zreal(z__2)), (bli_zimag(temp2) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 ); /* L110: */ } i__2 = jy; i__3 = jy; bli_zsets( (bli_zreal(*alpha) * bli_zreal(temp2) - bli_zimag(*alpha) * bli_zimag(temp2)), (bli_zreal(*alpha) * bli_zimag(temp2) + bli_zimag(*alpha) * bli_zreal(temp2)), z__2 ); bli_zsets( (bli_zreal(y[i__3]) + bli_zreal(z__2)), (bli_zimag(y[i__3]) + bli_zimag(z__2)), z__1 ); 
bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), y[i__2] ); jx += *incx; jy += *incy; kk += *n - j + 1; /* L120: */ } } } return 0; /* End of ZHPMV . */ } /* zhpmv_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_hpmv.h000066400000000000000000000042501427272030600226270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifdef BLIS_ENABLE_BLAS

/*
 * Prototypes for the BLAS-compatibility HPMV entry points defined in
 * bla_hpmv.c, exported with BLIS_EXPORT_BLAS and only declared when
 * BLIS_ENABLE_BLAS is defined.
 *
 * Both compute  y := alpha*A*x + beta*y  where A is an n-by-n hermitian
 * matrix whose upper or lower triangle (selected by uplo) is supplied in
 * packed form in ap.  All arguments are passed by pointer (Fortran
 * calling convention); y is the only argument that is written.  The
 * definitions return 0 on every path.
 */

/* Single-precision complex variant (CHPMV). */
BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *ap, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);

/* Double-precision complex variant (ZHPMV). */
BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_hpr.c000066400000000000000000000546501427272030600224520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* chpr.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_scomplex *x, const bla_integer *incx, bla_scomplex *ap) { /* System generated locals */ bla_integer i__1, i__2, i__3, i__4, i__5; bla_real r__1; bla_scomplex q__1, q__2; /* Builtin functions */ //void bla_r_cnjg(bla_scomplex *, bla_scomplex *); /* Local variables */ bla_integer info; bla_scomplex temp; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* CHPR performs the hermitian rank 1 operation */ /* A := alpha*x*conjg( x' ) + A, */ /* where alpha is a bla_real scalar, x is an n element vector and A is an */ /* n by n hermitian matrix, supplied in packed form. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the upper or lower */ /* triangular part of the matrix A is supplied in the packed */ /* array AP as follows: */ /* UPLO = 'U' or 'u' The upper triangular part of A is */ /* supplied in AP. 
*/ /* UPLO = 'L' or 'l' The lower triangular part of A is */ /* supplied in AP. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* ALPHA - REAL . */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* X - COMPLEX array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* AP - COMPLEX array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular part of the hermitian matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ /* and a( 2, 2 ) respectively, and so on. On exit, the array */ /* AP is overwritten by the upper triangular part of the */ /* updated matrix. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular part of the hermitian matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ /* and a( 3, 1 ) respectively, and so on. On exit, the array */ /* AP is overwritten by the lower triangular part of the */ /* updated matrix. */ /* Note that the imaginary parts of the diagonal elements need */ /* not be set, they are assumed to be zero, and on exit they */ /* are set to zero. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. 
External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ --ap; --x; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (*n < 0) { info = 2; } else if (*incx == 0) { info = 5; } if (info != 0) { PASTEF770(xerbla)("CHPR ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0 || *alpha == 0.f) { return 0; } /* Set the start point in X if the increment is not unity. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of the array AP */ /* are accessed sequentially with one pass through AP. */ kk = 1; if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Form A when upper triangle is stored in AP. */ if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) { bla_r_cnjg(&q__2, &x[j]); bli_csets( (*alpha * bli_creal(q__2)), (*alpha * bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { i__3 = k; i__4 = k; i__5 = i__; bli_csets( (bli_creal(x[i__5]) * bli_creal(temp) - bli_cimag(x[i__5]) * bli_cimag(temp)), (bli_creal(x[i__5]) * bli_cimag(temp) + bli_cimag(x[i__5]) * bli_creal(temp)), q__2 ); bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__2)), (bli_cimag(ap[i__4]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] ); ++k; /* L10: */ } i__2 = kk + j - 1; i__3 = kk + j - 1; i__4 = j; bli_csets( (bli_creal(x[i__4]) * bli_creal(temp) - bli_cimag(x[i__4]) * bli_cimag(temp)), (bli_creal(x[i__4]) * bli_cimag(temp) + bli_cimag(x[i__4]) * bli_creal(temp)), q__1 ); r__1 = bli_creal(ap[i__3]) + bli_creal(q__1); bli_csets( (r__1), (0.f), 
ap[i__2] ); } else { i__2 = kk + j - 1; i__3 = kk + j - 1; r__1 = bli_creal(ap[i__3]); bli_csets( (r__1), (0.f), ap[i__2] ); } kk += j; /* L20: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) { bla_r_cnjg(&q__2, &x[jx]); bli_csets( (*alpha * bli_creal(q__2)), (*alpha * bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ix = kx; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { i__3 = k; i__4 = k; i__5 = ix; bli_csets( (bli_creal(x[i__5]) * bli_creal(temp) - bli_cimag(x[i__5]) * bli_cimag(temp)), (bli_creal(x[i__5]) * bli_cimag(temp) + bli_cimag(x[i__5]) * bli_creal(temp)), q__2 ); bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__2)), (bli_cimag(ap[i__4]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] ); ix += *incx; /* L30: */ } i__2 = kk + j - 1; i__3 = kk + j - 1; i__4 = jx; bli_csets( (bli_creal(x[i__4]) * bli_creal(temp) - bli_cimag(x[i__4]) * bli_cimag(temp)), (bli_creal(x[i__4]) * bli_cimag(temp) + bli_cimag(x[i__4]) * bli_creal(temp)), q__1 ); r__1 = bli_creal(ap[i__3]) + bli_creal(q__1); bli_csets( (r__1), (0.f), ap[i__2] ); } else { i__2 = kk + j - 1; i__3 = kk + j - 1; r__1 = bli_creal(ap[i__3]); bli_csets( (r__1), (0.f), ap[i__2] ); } jx += *incx; kk += j; /* L40: */ } } } else { /* Form A when lower triangle is stored in AP. 
*/ if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) { bla_r_cnjg(&q__2, &x[j]); bli_csets( (*alpha * bli_creal(q__2)), (*alpha * bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); i__2 = kk; i__3 = kk; i__4 = j; bli_csets( (bli_creal(temp) * bli_creal(x[i__4]) - bli_cimag(temp) * bli_cimag(x[i__4])), (bli_creal(temp) * bli_cimag(x[i__4]) + bli_cimag(temp) * bli_creal(x[i__4])), q__1 ); r__1 = bli_creal(ap[i__3]) + bli_creal(q__1); bli_csets( (r__1), (0.f), ap[i__2] ); k = kk + 1; i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { i__3 = k; i__4 = k; i__5 = i__; bli_csets( (bli_creal(x[i__5]) * bli_creal(temp) - bli_cimag(x[i__5]) * bli_cimag(temp)), (bli_creal(x[i__5]) * bli_cimag(temp) + bli_cimag(x[i__5]) * bli_creal(temp)), q__2 ); bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__2)), (bli_cimag(ap[i__4]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] ); ++k; /* L50: */ } } else { i__2 = kk; i__3 = kk; r__1 = bli_creal(ap[i__3]); bli_csets( (r__1), (0.f), ap[i__2] ); } kk = kk + *n - j + 1; /* L60: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) { bla_r_cnjg(&q__2, &x[jx]); bli_csets( (*alpha * bli_creal(q__2)), (*alpha * bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); i__2 = kk; i__3 = kk; i__4 = jx; bli_csets( (bli_creal(temp) * bli_creal(x[i__4]) - bli_cimag(temp) * bli_cimag(x[i__4])), (bli_creal(temp) * bli_cimag(x[i__4]) + bli_cimag(temp) * bli_creal(x[i__4])), q__1 ); r__1 = bli_creal(ap[i__3]) + bli_creal(q__1); bli_csets( (r__1), (0.f), ap[i__2] ); ix = jx; i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; i__3 = k; i__4 = k; i__5 = ix; bli_csets( (bli_creal(x[i__5]) * bli_creal(temp) - bli_cimag(x[i__5]) * bli_cimag(temp)), (bli_creal(x[i__5]) * bli_cimag(temp) + 
bli_cimag(x[i__5]) * bli_creal(temp)), q__2 ); bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__2)), (bli_cimag(ap[i__4]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] ); /* L70: */ } } else { i__2 = kk; i__3 = kk; r__1 = bli_creal(ap[i__3]); bli_csets( (r__1), (0.f), ap[i__2] ); } jx += *incx; kk = kk + *n - j + 1; /* L80: */ } } } return 0; /* End of CHPR . */ } /* chpr_ */ /* zhpr.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap) { /* System generated locals */ bla_integer i__1, i__2, i__3, i__4, i__5; bla_double d__1; bla_dcomplex z__1, z__2; /* Builtin functions */ //void bla_d_cnjg(bla_dcomplex *, bla_dcomplex *); /* Local variables */ bla_integer info; bla_dcomplex temp; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* ZHPR performs the hermitian rank 1 operation */ /* A := alpha*x*conjg( x' ) + A, */ /* where alpha is a bla_real scalar, x is an n element vector and A is an */ /* n by n hermitian matrix, supplied in packed form. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the upper or lower */ /* triangular part of the matrix A is supplied in the packed */ /* array AP as follows: */ /* UPLO = 'U' or 'u' The upper triangular part of A is */ /* supplied in AP. */ /* UPLO = 'L' or 'l' The lower triangular part of A is */ /* supplied in AP. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. 
*/ /* N must be at least zero. */ /* Unchanged on exit. */ /* ALPHA - DOUBLE PRECISION. */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* X - COMPLEX*16 array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* AP - COMPLEX*16 array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular part of the hermitian matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ /* and a( 2, 2 ) respectively, and so on. On exit, the array */ /* AP is overwritten by the upper triangular part of the */ /* updated matrix. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular part of the hermitian matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ /* and a( 3, 1 ) respectively, and so on. On exit, the array */ /* AP is overwritten by the lower triangular part of the */ /* updated matrix. */ /* Note that the imaginary parts of the diagonal elements need */ /* not be set, they are assumed to be zero, and on exit they */ /* are set to zero. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. 
*/ /* Parameter adjustments */ --ap; --x; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (*n < 0) { info = 2; } else if (*incx == 0) { info = 5; } if (info != 0) { PASTEF770(xerbla)("ZHPR ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0 || *alpha == 0.) { return 0; } /* Set the start point in X if the increment is not unity. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of the array AP */ /* are accessed sequentially with one pass through AP. */ kk = 1; if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Form A when upper triangle is stored in AP. */ if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) { bla_d_cnjg(&z__2, &x[j]); bli_zsets( (*alpha * bli_zreal(z__2)), (*alpha * bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { i__3 = k; i__4 = k; i__5 = i__; bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp) - bli_zimag(x[i__5]) * bli_zimag(temp)), (bli_zreal(x[i__5]) * bli_zimag(temp) + bli_zimag(x[i__5]) * bli_zreal(temp)), z__2 ); bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__2)), (bli_zimag(ap[i__4]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] ); ++k; /* L10: */ } i__2 = kk + j - 1; i__3 = kk + j - 1; i__4 = j; bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(temp) - bli_zimag(x[i__4]) * bli_zimag(temp)), (bli_zreal(x[i__4]) * bli_zimag(temp) + bli_zimag(x[i__4]) * bli_zreal(temp)), z__1 ); d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1); bli_zsets( (d__1), (0.), ap[i__2] ); } else { i__2 = kk + j - 1; i__3 = kk + j - 1; d__1 = bli_zreal(ap[i__3]); bli_zsets( (d__1), (0.), ap[i__2] ); } kk += j; /* L20: */ } } else { jx = kx; 
i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) { bla_d_cnjg(&z__2, &x[jx]); bli_zsets( (*alpha * bli_zreal(z__2)), (*alpha * bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ix = kx; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { i__3 = k; i__4 = k; i__5 = ix; bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp) - bli_zimag(x[i__5]) * bli_zimag(temp)), (bli_zreal(x[i__5]) * bli_zimag(temp) + bli_zimag(x[i__5]) * bli_zreal(temp)), z__2 ); bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__2)), (bli_zimag(ap[i__4]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] ); ix += *incx; /* L30: */ } i__2 = kk + j - 1; i__3 = kk + j - 1; i__4 = jx; bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(temp) - bli_zimag(x[i__4]) * bli_zimag(temp)), (bli_zreal(x[i__4]) * bli_zimag(temp) + bli_zimag(x[i__4]) * bli_zreal(temp)), z__1 ); d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1); bli_zsets( (d__1), (0.), ap[i__2] ); } else { i__2 = kk + j - 1; i__3 = kk + j - 1; d__1 = bli_zreal(ap[i__3]); bli_zsets( (d__1), (0.), ap[i__2] ); } jx += *incx; kk += j; /* L40: */ } } } else { /* Form A when lower triangle is stored in AP. */ if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) 
{ bla_d_cnjg(&z__2, &x[j]); bli_zsets( (*alpha * bli_zreal(z__2)), (*alpha * bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); i__2 = kk; i__3 = kk; i__4 = j; bli_zsets( (bli_zreal(temp) * bli_zreal(x[i__4]) - bli_zimag(temp) * bli_zimag(x[i__4])), (bli_zreal(temp) * bli_zimag(x[i__4]) + bli_zimag(temp) * bli_zreal(x[i__4])), z__1 ); d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1); bli_zsets( (d__1), (0.), ap[i__2] ); k = kk + 1; i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { i__3 = k; i__4 = k; i__5 = i__; bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp) - bli_zimag(x[i__5]) * bli_zimag(temp)), (bli_zreal(x[i__5]) * bli_zimag(temp) + bli_zimag(x[i__5]) * bli_zreal(temp)), z__2 ); bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__2)), (bli_zimag(ap[i__4]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] ); ++k; /* L50: */ } } else { i__2 = kk; i__3 = kk; d__1 = bli_zreal(ap[i__3]); bli_zsets( (d__1), (0.), ap[i__2] ); } kk = kk + *n - j + 1; /* L60: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) 
{ bla_d_cnjg(&z__2, &x[jx]); bli_zsets( (*alpha * bli_zreal(z__2)), (*alpha * bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); i__2 = kk; i__3 = kk; i__4 = jx; bli_zsets( (bli_zreal(temp) * bli_zreal(x[i__4]) - bli_zimag(temp) * bli_zimag(x[i__4])), (bli_zreal(temp) * bli_zimag(x[i__4]) + bli_zimag(temp) * bli_zreal(x[i__4])), z__1 ); d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1); bli_zsets( (d__1), (0.), ap[i__2] ); ix = jx; i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; i__3 = k; i__4 = k; i__5 = ix; bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp) - bli_zimag(x[i__5]) * bli_zimag(temp)), (bli_zreal(x[i__5]) * bli_zimag(temp) + bli_zimag(x[i__5]) * bli_zreal(temp)), z__2 ); bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__2)), (bli_zimag(ap[i__4]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] ); /* L70: */ } } else { i__2 = kk; i__3 = kk; d__1 = bli_zreal(ap[i__3]); bli_zsets( (d__1), (0.), ap[i__2] ); } jx += *incx; kk = kk + *n - j + 1; /* L80: */ } } } return 0; /* End of ZHPR . */ } /* zhpr_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_hpr.h000066400000000000000000000040141427272030600224440ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS /* BLAS-compatibility prototypes for the hermitian packed rank 1 update A := alpha*x*conjg( x' ) + A defined in bla_hpr.c. uplo selects whether ap holds the upper ('U') or lower ('L') triangle, n is the order of A, incx is the increment of x, and ap is overwritten with the updated triangle. Both routines return int. */ /* bla_real alpha with bla_scomplex x and ap. */ BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_scomplex *x, const bla_integer *incx, bla_scomplex *ap); /* bla_double alpha with bla_dcomplex x and ap. */ BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_hpr2.c000066400000000000000000001011571427272030600225270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* chpr2.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *y, const bla_integer *incy, bla_scomplex *ap) { /* System generated locals */ bla_integer i__1, i__2, i__3, i__4, i__5, i__6; bla_real r__1; bla_scomplex q__1, q__2, q__3, q__4; /* Builtin functions */ //void bla_r_cnjg(bla_scomplex *, bla_scomplex *); /* Local variables */ bla_integer info; bla_scomplex temp1, temp2; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, iy, jx = 0, jy = 0, kx = 0, ky = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* CHPR2 performs the hermitian rank 2 operation */ /* A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, */ /* where alpha is a scalar, x and y are n element vectors and A is an */ /* n by n hermitian matrix, supplied in packed form. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the upper or lower */ /* triangular part of the matrix A is supplied in the packed */ /* array AP as follows: */ /* UPLO = 'U' or 'u' The upper triangular part of A is */ /* supplied in AP. */ /* UPLO = 'L' or 'l' The lower triangular part of A is */ /* supplied in AP. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* ALPHA - COMPLEX . */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* X - COMPLEX array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. 
*/ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Y - COMPLEX array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCY ) ). */ /* Before entry, the incremented array Y must contain the n */ /* element vector y. */ /* Unchanged on exit. */ /* INCY - INTEGER. */ /* On entry, INCY specifies the increment for the elements of */ /* Y. INCY must not be zero. */ /* Unchanged on exit. */ /* AP - COMPLEX array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular part of the hermitian matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ /* and a( 2, 2 ) respectively, and so on. On exit, the array */ /* AP is overwritten by the upper triangular part of the */ /* updated matrix. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular part of the hermitian matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ /* and a( 3, 1 ) respectively, and so on. On exit, the array */ /* AP is overwritten by the lower triangular part of the */ /* updated matrix. */ /* Note that the imaginary parts of the diagonal elements need */ /* not be set, they are assumed to be zero, and on exit they */ /* are set to zero. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. 
*/ /* Parameter adjustments */ --ap; --y; --x; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (*n < 0) { info = 2; } else if (*incx == 0) { info = 5; } else if (*incy == 0) { info = 7; } if (info != 0) { PASTEF770(xerbla)("CHPR2 ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0 || (bli_creal(*alpha) == 0.f && bli_cimag(*alpha) == 0.f)) { return 0; } /* Set up the start points in X and Y if the increments are not both */ /* unity. */ if (*incx != 1 || *incy != 1) { if (*incx > 0) { kx = 1; } else { kx = 1 - (*n - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (*n - 1) * *incy; } jx = kx; jy = ky; } /* Start the operations. In this version the elements of the array AP */ /* are accessed sequentially with one pass through AP. */ kk = 1; if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Form A when upper triangle is stored in AP. 
*/ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; i__3 = j; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f || bli_cimag(y[i__3]) != 0.f)) { bla_r_cnjg(&q__2, &y[j]); bli_csets( (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 ); i__2 = j; bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__2 ); bla_r_cnjg(&q__1, &q__2); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 ); k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { i__3 = k; i__4 = k; i__5 = i__; bli_csets( (bli_creal(x[i__5]) * bli_creal(temp1) - bli_cimag(x[i__5]) * bli_cimag(temp1)), (bli_creal(x[i__5]) * bli_cimag(temp1) + bli_cimag(x[i__5]) * bli_creal(temp1)), q__3 ); bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__3)), (bli_cimag(ap[i__4]) + bli_cimag(q__3)), q__2 ); i__6 = i__; bli_csets( (bli_creal(y[i__6]) * bli_creal(temp2) - bli_cimag(y[i__6]) * bli_cimag(temp2)), (bli_creal(y[i__6]) * bli_cimag(temp2) + bli_cimag(y[i__6]) * bli_creal(temp2)), q__4 ); bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] ); ++k; /* L10: */ } i__2 = kk + j - 1; i__3 = kk + j - 1; i__4 = j; bli_csets( (bli_creal(x[i__4]) * bli_creal(temp1) - bli_cimag(x[i__4]) * bli_cimag(temp1)), (bli_creal(x[i__4]) * bli_cimag(temp1) + bli_cimag(x[i__4]) * bli_creal(temp1)), q__2 ); i__5 = j; bli_csets( (bli_creal(y[i__5]) * bli_creal(temp2) - bli_cimag(y[i__5]) * bli_cimag(temp2)), (bli_creal(y[i__5]) * bli_cimag(temp2) + bli_cimag(y[i__5]) * bli_creal(temp2)), q__3 ); bli_csets( (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), 
q__1 ); r__1 = bli_creal(ap[i__3]) + bli_creal(q__1); bli_csets( (r__1), (0.f), ap[i__2] ); } else { i__2 = kk + j - 1; i__3 = kk + j - 1; r__1 = bli_creal(ap[i__3]); bli_csets( (r__1), (0.f), ap[i__2] ); } kk += j; /* L20: */ } } else { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; i__3 = jy; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f || bli_cimag(y[i__3]) != 0.f)) { bla_r_cnjg(&q__2, &y[jy]); bli_csets( (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 ); i__2 = jx; bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__2 ); bla_r_cnjg(&q__1, &q__2); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 ); ix = kx; iy = ky; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { i__3 = k; i__4 = k; i__5 = ix; bli_csets( (bli_creal(x[i__5]) * bli_creal(temp1) - bli_cimag(x[i__5]) * bli_cimag(temp1)), (bli_creal(x[i__5]) * bli_cimag(temp1) + bli_cimag(x[i__5]) * bli_creal(temp1)), q__3 ); bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__3)), (bli_cimag(ap[i__4]) + bli_cimag(q__3)), q__2 ); i__6 = iy; bli_csets( (bli_creal(y[i__6]) * bli_creal(temp2) - bli_cimag(y[i__6]) * bli_cimag(temp2)), (bli_creal(y[i__6]) * bli_cimag(temp2) + bli_cimag(y[i__6]) * bli_creal(temp2)), q__4 ); bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] ); ix += *incx; iy += *incy; /* L30: */ } i__2 = kk + j - 1; i__3 = kk + j - 1; i__4 = jx; bli_csets( (bli_creal(x[i__4]) * bli_creal(temp1) - bli_cimag(x[i__4]) * bli_cimag(temp1)), (bli_creal(x[i__4]) * bli_cimag(temp1) + bli_cimag(x[i__4]) * bli_creal(temp1)), q__2 ); i__5 = jy; bli_csets( (bli_creal(y[i__5]) 
* bli_creal(temp2) - bli_cimag(y[i__5]) * bli_cimag(temp2)), (bli_creal(y[i__5]) * bli_cimag(temp2) + bli_cimag(y[i__5]) * bli_creal(temp2)), q__3 ); bli_csets( (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 ); r__1 = bli_creal(ap[i__3]) + bli_creal(q__1); bli_csets( (r__1), (0.f), ap[i__2] ); } else { i__2 = kk + j - 1; i__3 = kk + j - 1; r__1 = bli_creal(ap[i__3]); bli_csets( (r__1), (0.f), ap[i__2] ); } jx += *incx; jy += *incy; kk += j; /* L40: */ } } } else { /* Form A when lower triangle is stored in AP. */ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; i__3 = j; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f || bli_cimag(y[i__3]) != 0.f)) { bla_r_cnjg(&q__2, &y[j]); bli_csets( (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 ); i__2 = j; bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__2 ); bla_r_cnjg(&q__1, &q__2); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 ); i__2 = kk; i__3 = kk; i__4 = j; bli_csets( (bli_creal(x[i__4]) * bli_creal(temp1) - bli_cimag(x[i__4]) * bli_cimag(temp1)), (bli_creal(x[i__4]) * bli_cimag(temp1) + bli_cimag(x[i__4]) * bli_creal(temp1)), q__2 ); i__5 = j; bli_csets( (bli_creal(y[i__5]) * bli_creal(temp2) - bli_cimag(y[i__5]) * bli_cimag(temp2)), (bli_creal(y[i__5]) * bli_cimag(temp2) + bli_cimag(y[i__5]) * bli_creal(temp2)), q__3 ); bli_csets( (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 ); r__1 = bli_creal(ap[i__3]) + bli_creal(q__1); bli_csets( (r__1), (0.f), ap[i__2] ); k = kk + 1; i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { i__3 = k; i__4 = k; i__5 = i__; bli_csets( 
(bli_creal(x[i__5]) * bli_creal(temp1) - bli_cimag(x[i__5]) * bli_cimag(temp1)), (bli_creal(x[i__5]) * bli_cimag(temp1) + bli_cimag(x[i__5]) * bli_creal(temp1)), q__3 ); bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__3)), (bli_cimag(ap[i__4]) + bli_cimag(q__3)), q__2 ); i__6 = i__; bli_csets( (bli_creal(y[i__6]) * bli_creal(temp2) - bli_cimag(y[i__6]) * bli_cimag(temp2)), (bli_creal(y[i__6]) * bli_cimag(temp2) + bli_cimag(y[i__6]) * bli_creal(temp2)), q__4 ); bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] ); ++k; /* L50: */ } } else { i__2 = kk; i__3 = kk; r__1 = bli_creal(ap[i__3]); bli_csets( (r__1), (0.f), ap[i__2] ); } kk = kk + *n - j + 1; /* L60: */ } } else { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; i__3 = jy; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f || (bli_creal(y[i__3]) != 0.f || bli_cimag(y[i__3]) != 0.f)) { bla_r_cnjg(&q__2, &y[jy]); bli_csets( (bli_creal(*alpha) * bli_creal(q__2) - bli_cimag(*alpha) * bli_cimag(q__2)), (bli_creal(*alpha) * bli_cimag(q__2) + bli_cimag(*alpha) * bli_creal(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp1 ); i__2 = jx; bli_csets( (bli_creal(*alpha) * bli_creal(x[i__2]) - bli_cimag(*alpha) * bli_cimag(x[i__2])), (bli_creal(*alpha) * bli_cimag(x[i__2]) + bli_cimag(*alpha) * bli_creal(x[i__2])), q__2 ); bla_r_cnjg(&q__1, &q__2); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp2 ); i__2 = kk; i__3 = kk; i__4 = jx; bli_csets( (bli_creal(x[i__4]) * bli_creal(temp1) - bli_cimag(x[i__4]) * bli_cimag(temp1)), (bli_creal(x[i__4]) * bli_cimag(temp1) + bli_cimag(x[i__4]) * bli_creal(temp1)), q__2 ); i__5 = jy; bli_csets( (bli_creal(y[i__5]) * bli_creal(temp2) - bli_cimag(y[i__5]) * bli_cimag(temp2)), (bli_creal(y[i__5]) * bli_cimag(temp2) + bli_cimag(y[i__5]) * bli_creal(temp2)), q__3 ); bli_csets( (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 
); r__1 = bli_creal(ap[i__3]) + bli_creal(q__1); bli_csets( (r__1), (0.f), ap[i__2] ); ix = jx; iy = jy; i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; iy += *incy; i__3 = k; i__4 = k; i__5 = ix; bli_csets( (bli_creal(x[i__5]) * bli_creal(temp1) - bli_cimag(x[i__5]) * bli_cimag(temp1)), (bli_creal(x[i__5]) * bli_cimag(temp1) + bli_cimag(x[i__5]) * bli_creal(temp1)), q__3 ); bli_csets( (bli_creal(ap[i__4]) + bli_creal(q__3)), (bli_cimag(ap[i__4]) + bli_cimag(q__3)), q__2 ); i__6 = iy; bli_csets( (bli_creal(y[i__6]) * bli_creal(temp2) - bli_cimag(y[i__6]) * bli_cimag(temp2)), (bli_creal(y[i__6]) * bli_cimag(temp2) + bli_cimag(y[i__6]) * bli_creal(temp2)), q__4 ); bli_csets( (bli_creal(q__2) + bli_creal(q__4)), (bli_cimag(q__2) + bli_cimag(q__4)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ap[i__3] ); /* L70: */ } } else { i__2 = kk; i__3 = kk; r__1 = bli_creal(ap[i__3]); bli_csets( (r__1), (0.f), ap[i__2] ); } jx += *incx; jy += *incy; kk = kk + *n - j + 1; /* L80: */ } } } return 0; /* End of CHPR2 . */ } /* chpr2_ */ /* zhpr2.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap) { /* System generated locals */ bla_integer i__1, i__2, i__3, i__4, i__5, i__6; bla_double d__1; bla_dcomplex z__1, z__2, z__3, z__4; /* Builtin functions */ //void bla_d_cnjg(bla_dcomplex *, bla_dcomplex *); /* Local variables */ bla_integer info; bla_dcomplex temp1, temp2; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, iy, jx = 0, jy = 0, kx = 0, ky = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); /* .. Scalar Arguments .. 
*/ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* ZHPR2 performs the hermitian rank 2 operation */ /* A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, */ /* where alpha is a scalar, x and y are n element vectors and A is an */ /* n by n hermitian matrix, supplied in packed form. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the upper or lower */ /* triangular part of the matrix A is supplied in the packed */ /* array AP as follows: */ /* UPLO = 'U' or 'u' The upper triangular part of A is */ /* supplied in AP. */ /* UPLO = 'L' or 'l' The lower triangular part of A is */ /* supplied in AP. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* ALPHA - COMPLEX*16 . */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* X - COMPLEX*16 array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Y - COMPLEX*16 array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCY ) ). */ /* Before entry, the incremented array Y must contain the n */ /* element vector y. */ /* Unchanged on exit. */ /* INCY - INTEGER. */ /* On entry, INCY specifies the increment for the elements of */ /* Y. INCY must not be zero. */ /* Unchanged on exit. */ /* AP - COMPLEX*16 array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular part of the hermitian matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ /* and a( 2, 2 ) respectively, and so on. 
On exit, the array */ /* AP is overwritten by the upper triangular part of the */ /* updated matrix. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular part of the hermitian matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ /* and a( 3, 1 ) respectively, and so on. On exit, the array */ /* AP is overwritten by the lower triangular part of the */ /* updated matrix. */ /* Note that the imaginary parts of the diagonal elements need */ /* not be set, they are assumed to be zero, and on exit they */ /* are set to zero. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ --ap; --y; --x; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (*n < 0) { info = 2; } else if (*incx == 0) { info = 5; } else if (*incy == 0) { info = 7; } if (info != 0) { PASTEF770(xerbla)("ZHPR2 ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0 || (bli_zreal(*alpha) == 0. && bli_zimag(*alpha) == 0.)) { return 0; } /* Set up the start points in X and Y if the increments are not both */ /* unity. */ if (*incx != 1 || *incy != 1) { if (*incx > 0) { kx = 1; } else { kx = 1 - (*n - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (*n - 1) * *incy; } jx = kx; jy = ky; } /* Start the operations. In this version the elements of the array AP */ /* are accessed sequentially with one pass through AP. 
*/ kk = 1; if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Form A when upper triangle is stored in AP. */ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; i__3 = j; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. || bli_zimag(y[i__3]) != 0.)) { bla_d_cnjg(&z__2, &y[j]); bli_zsets( (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 ); i__2 = j; bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__2 ); bla_d_cnjg(&z__1, &z__2); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 ); k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { i__3 = k; i__4 = k; i__5 = i__; bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp1) - bli_zimag(x[i__5]) * bli_zimag(temp1)), (bli_zreal(x[i__5]) * bli_zimag(temp1) + bli_zimag(x[i__5]) * bli_zreal(temp1)), z__3 ); bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__3)), (bli_zimag(ap[i__4]) + bli_zimag(z__3)), z__2 ); i__6 = i__; bli_zsets( (bli_zreal(y[i__6]) * bli_zreal(temp2) - bli_zimag(y[i__6]) * bli_zimag(temp2)), (bli_zreal(y[i__6]) * bli_zimag(temp2) + bli_zimag(y[i__6]) * bli_zreal(temp2)), z__4 ); bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] ); ++k; /* L10: */ } i__2 = kk + j - 1; i__3 = kk + j - 1; i__4 = j; bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(temp1) - bli_zimag(x[i__4]) * bli_zimag(temp1)), (bli_zreal(x[i__4]) * bli_zimag(temp1) + bli_zimag(x[i__4]) * bli_zreal(temp1)), z__2 ); i__5 = j; bli_zsets( (bli_zreal(y[i__5]) * bli_zreal(temp2) - bli_zimag(y[i__5]) * bli_zimag(temp2)), (bli_zreal(y[i__5]) * bli_zimag(temp2) + bli_zimag(y[i__5]) * 
bli_zreal(temp2)), z__3 ); bli_zsets( (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 ); d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1); bli_zsets( (d__1), (0.), ap[i__2] ); } else { i__2 = kk + j - 1; i__3 = kk + j - 1; d__1 = bli_zreal(ap[i__3]); bli_zsets( (d__1), (0.), ap[i__2] ); } kk += j; /* L20: */ } } else { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; i__3 = jy; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. || bli_zimag(y[i__3]) != 0.)) { bla_d_cnjg(&z__2, &y[jy]); bli_zsets( (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 ); i__2 = jx; bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__2 ); bla_d_cnjg(&z__1, &z__2); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 ); ix = kx; iy = ky; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { i__3 = k; i__4 = k; i__5 = ix; bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp1) - bli_zimag(x[i__5]) * bli_zimag(temp1)), (bli_zreal(x[i__5]) * bli_zimag(temp1) + bli_zimag(x[i__5]) * bli_zreal(temp1)), z__3 ); bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__3)), (bli_zimag(ap[i__4]) + bli_zimag(z__3)), z__2 ); i__6 = iy; bli_zsets( (bli_zreal(y[i__6]) * bli_zreal(temp2) - bli_zimag(y[i__6]) * bli_zimag(temp2)), (bli_zreal(y[i__6]) * bli_zimag(temp2) + bli_zimag(y[i__6]) * bli_zreal(temp2)), z__4 ); bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] ); ix += *incx; iy += *incy; /* L30: */ } i__2 = kk + j - 1; i__3 = kk + j - 1; i__4 = jx; bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(temp1) - bli_zimag(x[i__4]) * bli_zimag(temp1)), (bli_zreal(x[i__4]) * 
bli_zimag(temp1) + bli_zimag(x[i__4]) * bli_zreal(temp1)), z__2 ); i__5 = jy; bli_zsets( (bli_zreal(y[i__5]) * bli_zreal(temp2) - bli_zimag(y[i__5]) * bli_zimag(temp2)), (bli_zreal(y[i__5]) * bli_zimag(temp2) + bli_zimag(y[i__5]) * bli_zreal(temp2)), z__3 ); bli_zsets( (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 ); d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1); bli_zsets( (d__1), (0.), ap[i__2] ); } else { i__2 = kk + j - 1; i__3 = kk + j - 1; d__1 = bli_zreal(ap[i__3]); bli_zsets( (d__1), (0.), ap[i__2] ); } jx += *incx; jy += *incy; kk += j; /* L40: */ } } } else { /* Form A when lower triangle is stored in AP. */ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; i__3 = j; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. || bli_zimag(y[i__3]) != 0.)) { bla_d_cnjg(&z__2, &y[j]); bli_zsets( (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 ); i__2 = j; bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__2 ); bla_d_cnjg(&z__1, &z__2); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 ); i__2 = kk; i__3 = kk; i__4 = j; bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(temp1) - bli_zimag(x[i__4]) * bli_zimag(temp1)), (bli_zreal(x[i__4]) * bli_zimag(temp1) + bli_zimag(x[i__4]) * bli_zreal(temp1)), z__2 ); i__5 = j; bli_zsets( (bli_zreal(y[i__5]) * bli_zreal(temp2) - bli_zimag(y[i__5]) * bli_zimag(temp2)), (bli_zreal(y[i__5]) * bli_zimag(temp2) + bli_zimag(y[i__5]) * bli_zreal(temp2)), z__3 ); bli_zsets( (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 ); d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1); bli_zsets( (d__1), (0.), ap[i__2] ); k = kk + 1; 
i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { i__3 = k; i__4 = k; i__5 = i__; bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp1) - bli_zimag(x[i__5]) * bli_zimag(temp1)), (bli_zreal(x[i__5]) * bli_zimag(temp1) + bli_zimag(x[i__5]) * bli_zreal(temp1)), z__3 ); bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__3)), (bli_zimag(ap[i__4]) + bli_zimag(z__3)), z__2 ); i__6 = i__; bli_zsets( (bli_zreal(y[i__6]) * bli_zreal(temp2) - bli_zimag(y[i__6]) * bli_zimag(temp2)), (bli_zreal(y[i__6]) * bli_zimag(temp2) + bli_zimag(y[i__6]) * bli_zreal(temp2)), z__4 ); bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] ); ++k; /* L50: */ } } else { i__2 = kk; i__3 = kk; d__1 = bli_zreal(ap[i__3]); bli_zsets( (d__1), (0.), ap[i__2] ); } kk = kk + *n - j + 1; /* L60: */ } } else { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; i__3 = jy; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0. || (bli_zreal(y[i__3]) != 0. 
|| bli_zimag(y[i__3]) != 0.)) { bla_d_cnjg(&z__2, &y[jy]); bli_zsets( (bli_zreal(*alpha) * bli_zreal(z__2) - bli_zimag(*alpha) * bli_zimag(z__2)), (bli_zreal(*alpha) * bli_zimag(z__2) + bli_zimag(*alpha) * bli_zreal(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp1 ); i__2 = jx; bli_zsets( (bli_zreal(*alpha) * bli_zreal(x[i__2]) - bli_zimag(*alpha) * bli_zimag(x[i__2])), (bli_zreal(*alpha) * bli_zimag(x[i__2]) + bli_zimag(*alpha) * bli_zreal(x[i__2])), z__2 ); bla_d_cnjg(&z__1, &z__2); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp2 ); i__2 = kk; i__3 = kk; i__4 = jx; bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(temp1) - bli_zimag(x[i__4]) * bli_zimag(temp1)), (bli_zreal(x[i__4]) * bli_zimag(temp1) + bli_zimag(x[i__4]) * bli_zreal(temp1)), z__2 ); i__5 = jy; bli_zsets( (bli_zreal(y[i__5]) * bli_zreal(temp2) - bli_zimag(y[i__5]) * bli_zimag(temp2)), (bli_zreal(y[i__5]) * bli_zimag(temp2) + bli_zimag(y[i__5]) * bli_zreal(temp2)), z__3 ); bli_zsets( (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 ); d__1 = bli_zreal(ap[i__3]) + bli_zreal(z__1); bli_zsets( (d__1), (0.), ap[i__2] ); ix = jx; iy = jy; i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; iy += *incy; i__3 = k; i__4 = k; i__5 = ix; bli_zsets( (bli_zreal(x[i__5]) * bli_zreal(temp1) - bli_zimag(x[i__5]) * bli_zimag(temp1)), (bli_zreal(x[i__5]) * bli_zimag(temp1) + bli_zimag(x[i__5]) * bli_zreal(temp1)), z__3 ); bli_zsets( (bli_zreal(ap[i__4]) + bli_zreal(z__3)), (bli_zimag(ap[i__4]) + bli_zimag(z__3)), z__2 ); i__6 = iy; bli_zsets( (bli_zreal(y[i__6]) * bli_zreal(temp2) - bli_zimag(y[i__6]) * bli_zimag(temp2)), (bli_zreal(y[i__6]) * bli_zimag(temp2) + bli_zimag(y[i__6]) * bli_zreal(temp2)), z__4 ); bli_zsets( (bli_zreal(z__2) + bli_zreal(z__4)), (bli_zimag(z__2) + bli_zimag(z__4)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ap[i__3] ); /* L70: */ } } else { i__2 = kk; i__3 = kk; d__1 = bli_zreal(ap[i__3]); bli_zsets( 
(d__1), (0.), ap[i__2] ); } jx += *incx; jy += *incy; kk = kk + *n - j + 1; /* L80: */ } } } return 0; /* End of ZHPR2 . */ } /* zhpr2_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_hpr2.h000066400000000000000000000041641427272030600225340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef BLIS_ENABLE_BLAS /* Prototypes for the packed Hermitian rank-2 update routines; they are only declared when the BLAS compatibility layer is enabled. Both return int and take the same argument list (uplo, n, alpha, x, incx, y, incy, ap), differing only in the complex element type. */ /* bla_scomplex variant. */ BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *y, const bla_integer *incy, bla_scomplex *ap); /* bla_dcomplex variant. */ BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_lsame.c000066400000000000000000000107621427272030600227560ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* lsame.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ #ifdef LAPACK_ILP64 long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len) #else int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len) #endif { /* System generated locals */ bla_logical ret_val; /* Local variables */ bla_integer inta, intb, zcode; /* -- LAPACK auxiliary routine (version 2.0) -- */ /* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., */ /* Courant Institute, Argonne National Lab, and Rice University */ /* January 31, 1994 */ /* .. Scalar Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* LSAME returns .TRUE. if CA is the same letter as CB regardless of */ /* case. */ /* Arguments */ /* ========= */ /* CA (input) CHARACTER*1 */ /* CB (input) CHARACTER*1 */ /* CA and CB specify the single bla_characters to be compared. */ /* ===================================================================== */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Local Scalars .. */ /* .. */ /* .. Executable Statements .. */ /* Test if the bla_characters are equal */ ret_val = *(unsigned char *)ca == *(unsigned char *)cb; if (ret_val) { return ret_val; } /* Now test for equivalence if both bla_characters are alphabetic. 
*/ zcode = 'Z'; /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime */ /* machines, on which ICHAR returns a value with bit 8 set. */ /* ICHAR('A') on Prime machines returns 193 which is the same as */ /* ICHAR('A') on an EBCDIC machine. */ inta = *(unsigned char *)ca; intb = *(unsigned char *)cb; if (zcode == 90 || zcode == 122) { /* ASCII is assumed - ZCODE is the ASCII code of either lower or */ /* upper case 'Z'. */ if (inta >= 97 && inta <= 122) { inta += -32; } if (intb >= 97 && intb <= 122) { intb += -32; } } else if (zcode == 233 || zcode == 169) { /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */ /* upper case 'Z'. */ if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta >= 162 && inta <= 169)) { inta += 64; } if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb >= 162 && intb <= 169)) { intb += 64; } } else if (zcode == 218 || zcode == 250) { /* ASCII is assumed, on Prime machines - ZCODE is the ASCII code */ /* plus 128 of either lower or upper case 'Z'. */ if (inta >= 225 && inta <= 250) { inta += -32; } if (intb >= 225 && intb <= 250) { intb += -32; } } ret_val = inta == intb; /* RETURN */ /* End of LSAME */ return ret_val; } /* lsame */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_lsame.h000066400000000000000000000035731427272030600227650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS #ifdef LAPACK_ILP64 long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len); #else BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len); #endif #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_rot.c000066400000000000000000000244561427272030600224660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* srot.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s) { /* System generated locals */ bla_integer i__1; /* Local variables */ bla_integer i__; bla_real stemp; bla_integer ix, iy; /* applies a plane rotation. */ /* jack dongarra, linpack, 3/11/78. 
*/ /* modified 12/3/93, array(1) declarations changed to array(*) */ /* Parameter adjustments */ --sy; --sx; /* Function Body */ if (*n <= 0) { return 0; } if (*incx == 1 && *incy == 1) { goto L20; } /* code for unequal increments or equal increments not equal */ /* to 1 */ ix = 1; iy = 1; if (*incx < 0) { ix = (-(*n) + 1) * *incx + 1; } if (*incy < 0) { iy = (-(*n) + 1) * *incy + 1; } i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { stemp = *c__ * sx[ix] + *s * sy[iy]; sy[iy] = *c__ * sy[iy] - *s * sx[ix]; sx[ix] = stemp; ix += *incx; iy += *incy; /* L10: */ } return 0; /* code for both increments equal to 1 */ L20: i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { stemp = *c__ * sx[i__] + *s * sy[i__]; sy[i__] = *c__ * sy[i__] - *s * sx[i__]; sx[i__] = stemp; /* L30: */ } return 0; } /* srot_ */ /* drot.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s) { /* System generated locals */ bla_integer i__1; /* Local variables */ bla_integer i__; bla_double dtemp; bla_integer ix, iy; /* applies a plane rotation. */ /* jack dongarra, linpack, 3/11/78. 
*/ /* modified 12/3/93, array(1) declarations changed to array(*) */ /* Parameter adjustments */ --dy; --dx; /* Function Body */ if (*n <= 0) { return 0; } if (*incx == 1 && *incy == 1) { goto L20; } /* code for unequal increments or equal increments not equal */ /* to 1 */ ix = 1; iy = 1; if (*incx < 0) { ix = (-(*n) + 1) * *incx + 1; } if (*incy < 0) { iy = (-(*n) + 1) * *incy + 1; } i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { dtemp = *c__ * dx[ix] + *s * dy[iy]; dy[iy] = *c__ * dy[iy] - *s * dx[ix]; dx[ix] = dtemp; ix += *incx; iy += *incy; /* L10: */ } return 0; /* code for both increments equal to 1 */ L20: i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { dtemp = *c__ * dx[i__] + *s * dy[i__]; dy[i__] = *c__ * dy[i__] - *s * dx[i__]; dx[i__] = dtemp; /* L30: */ } return 0; } /* drot_ */ /* csrot.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s) { /* System generated locals */ bla_integer i__1, i__2, i__3, i__4; bla_scomplex q__1, q__2, q__3; /* Local variables */ bla_integer i__; bla_scomplex ctemp; bla_integer ix, iy; /* applies a plane rotation, where the cos and sin (c and s) are bla_real */ /* and the vectors cx and cy are complex. */ /* jack dongarra, linpack, 3/11/78. 
*/ /* Parameter adjustments */ --cy; --cx; /* Function Body */ if (*n <= 0) { return 0; } if (*incx == 1 && *incy == 1) { goto L20; } /* code for unequal increments or equal increments not equal */ /* to 1 */ ix = 1; iy = 1; if (*incx < 0) { ix = (-(*n) + 1) * *incx + 1; } if (*incy < 0) { iy = (-(*n) + 1) * *incy + 1; } i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = ix; bli_csets( (*c__ * bli_creal(cx[i__2])), (*c__ * bli_cimag(cx[i__2])), q__2 ); i__3 = iy; bli_csets( (*s * bli_creal(cy[i__3])), (*s * bli_cimag(cy[i__3])), q__3 ); bli_csets( (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ctemp ); i__2 = iy; i__3 = iy; bli_csets( (*c__ * bli_creal(cy[i__3])), (*c__ * bli_cimag(cy[i__3])), q__2 ); i__4 = ix; bli_csets( (*s * bli_creal(cx[i__4])), (*s * bli_cimag(cx[i__4])), q__3 ); bli_csets( (bli_creal(q__2) - bli_creal(q__3)), (bli_cimag(q__2) - bli_cimag(q__3)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), cy[i__2] ); i__2 = ix; bli_csets( (bli_creal(ctemp)), (bli_cimag(ctemp)), cx[i__2] ); ix += *incx; iy += *incy; /* L10: */ } return 0; /* code for both increments equal to 1 */ L20: i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; bli_csets( (*c__ * bli_creal(cx[i__2])), (*c__ * bli_cimag(cx[i__2])), q__2 ); i__3 = i__; bli_csets( (*s * bli_creal(cy[i__3])), (*s * bli_cimag(cy[i__3])), q__3 ); bli_csets( (bli_creal(q__2) + bli_creal(q__3)), (bli_cimag(q__2) + bli_cimag(q__3)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), ctemp ); i__2 = i__; i__3 = i__; bli_csets( (*c__ * bli_creal(cy[i__3])), (*c__ * bli_cimag(cy[i__3])), q__2 ); i__4 = i__; bli_csets( (*s * bli_creal(cx[i__4])), (*s * bli_cimag(cx[i__4])), q__3 ); bli_csets( (bli_creal(q__2) - bli_creal(q__3)), (bli_cimag(q__2) - bli_cimag(q__3)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), cy[i__2] ); i__2 = i__; bli_csets( (bli_creal(ctemp)), (bli_cimag(ctemp)), cx[i__2] 
); /* L30: */ } return 0; } /* csrot_ */ /* zdrot.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s) { /* System generated locals */ bla_integer i__1, i__2, i__3, i__4; bla_dcomplex z__1, z__2, z__3; /* Local variables */ bla_integer i__; bla_dcomplex ztemp; bla_integer ix, iy; /* applies a plane rotation, where the cos and sin (c and s) are */ /* double precision and the vectors zx and zy are double complex. */ /* jack dongarra, linpack, 3/11/78. */ /* Parameter adjustments */ --zy; --zx; /* Function Body */ if (*n <= 0) { return 0; } if (*incx == 1 && *incy == 1) { goto L20; } /* code for unequal increments or equal increments not equal */ /* to 1 */ ix = 1; iy = 1; if (*incx < 0) { ix = (-(*n) + 1) * *incx + 1; } if (*incy < 0) { iy = (-(*n) + 1) * *incy + 1; } i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = ix; bli_zsets( (*c__ * bli_zreal(zx[i__2])), (*c__ * bli_zimag(zx[i__2])), z__2 ); i__3 = iy; bli_zsets( (*s * bli_zreal(zy[i__3])), (*s * bli_zimag(zy[i__3])), z__3 ); bli_zsets( (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ztemp ); i__2 = iy; i__3 = iy; bli_zsets( (*c__ * bli_zreal(zy[i__3])), (*c__ * bli_zimag(zy[i__3])), z__2 ); i__4 = ix; bli_zsets( (*s * bli_zreal(zx[i__4])), (*s * bli_zimag(zx[i__4])), z__3 ); bli_zsets( (bli_zreal(z__2) - bli_zreal(z__3)), (bli_zimag(z__2) - bli_zimag(z__3)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), zy[i__2] ); i__2 = ix; bli_zsets( (bli_zreal(ztemp)), (bli_zimag(ztemp)), zx[i__2] ); ix += *incx; iy += *incy; /* L10: */ } return 0; /* code for both increments equal to 1 */ L20: i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = i__; 
bli_zsets( (*c__ * bli_zreal(zx[i__2])), (*c__ * bli_zimag(zx[i__2])), z__2 ); i__3 = i__; bli_zsets( (*s * bli_zreal(zy[i__3])), (*s * bli_zimag(zy[i__3])), z__3 ); bli_zsets( (bli_zreal(z__2) + bli_zreal(z__3)), (bli_zimag(z__2) + bli_zimag(z__3)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), ztemp ); i__2 = i__; i__3 = i__; bli_zsets( (*c__ * bli_zreal(zy[i__3])), (*c__ * bli_zimag(zy[i__3])), z__2 ); i__4 = i__; bli_zsets( (*s * bli_zreal(zx[i__4])), (*s * bli_zimag(zx[i__4])), z__3 ); bli_zsets( (bli_zreal(z__2) - bli_zreal(z__3)), (bli_zimag(z__2) - bli_zimag(z__3)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), zy[i__2] ); i__2 = i__; bli_zsets( (bli_zreal(ztemp)), (bli_zimag(ztemp)), zx[i__2] ); /* L30: */ } return 0; } /* zdrot_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_rot.h000066400000000000000000000046201427272030600224620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifdef BLIS_ENABLE_BLAS

/*
   Prototypes for the "rot" entry points. They are declared only when
   BLIS_ENABLE_BLAS is defined. Each takes an element count n, two vectors
   with their strides, and pointers to the real scalars c__ and s; each
   returns int.
*/

/* Single-precision real vectors. */
BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s);

/* Double-precision real vectors. */
BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s);

/* Single-precision complex vectors with single-precision real c__ and s. */
BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s);

/* Double-precision complex vectors with double-precision real c__ and s. */
BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s);

#endif
cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_rotg.c000066400000000000000000000175331427272030600226330ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* srotg.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Table of constant values */ static bla_real sc_b4 = 1.f; /* Subroutine */ int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s) { /* System generated locals */ bla_real r__1, r__2; /* Builtin functions */ //double sqrt(bla_double), bla_r_sign(bla_real *, bla_real *); /* Local variables */ bla_real r__, scale, z__, roe; /* construct givens plane rotation. */ /* jack dongarra, linpack, 3/11/78. 
*/ roe = *sb; if (bli_fabs(*sa) > bli_fabs(*sb)) { roe = *sa; } scale = bli_fabs(*sa) + bli_fabs(*sb); if (scale != 0.f) { goto L10; } *c__ = 1.f; *s = 0.f; r__ = 0.f; z__ = 0.f; goto L20; L10: /* Computing 2nd power */ r__1 = *sa / scale; /* Computing 2nd power */ r__2 = *sb / scale; r__ = scale * sqrt(r__1 * r__1 + r__2 * r__2); r__ = bla_r_sign(&sc_b4, &roe) * r__; *c__ = *sa / r__; *s = *sb / r__; z__ = 1.f; if (bli_fabs(*sa) > bli_fabs(*sb)) { z__ = *s; } if (bli_fabs(*sb) >= bli_fabs(*sa) && *c__ != 0.f) { z__ = 1.f / *c__; } L20: *sa = r__; *sb = z__; return 0; } /* srotg_ */ /* drotg.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Table of constant values */ static bla_double dc_b4 = 1.; /* Subroutine */ int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s) { /* System generated locals */ bla_double d__1, d__2; /* Builtin functions */ //double sqrt(bla_double), bla_d_sign(bla_double *, bla_double *); /* Local variables */ bla_double r__, scale, z__, roe; /* construct givens plane rotation. */ /* jack dongarra, linpack, 3/11/78. */ roe = *db; if (bli_fabs(*da) > bli_fabs(*db)) { roe = *da; } scale = bli_fabs(*da) + bli_fabs(*db); if (scale != 0.) { goto L10; } *c__ = 1.; *s = 0.; r__ = 0.; z__ = 0.; goto L20; L10: /* Computing 2nd power */ d__1 = *da / scale; /* Computing 2nd power */ d__2 = *db / scale; r__ = scale * sqrt(d__1 * d__1 + d__2 * d__2); r__ = bla_d_sign(&dc_b4, &roe) * r__; *c__ = *da / r__; *s = *db / r__; z__ = 1.; if (bli_fabs(*da) > bli_fabs(*db)) { z__ = *s; } if (bli_fabs(*db) >= bli_fabs(*da) && *c__ != 0.) { z__ = 1. / *c__; } L20: *da = r__; *db = z__; return 0; } /* drotg_ */ /* crotg.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s) { /* System generated locals */ bla_real r__1, r__2; bla_scomplex q__1, q__2, q__3; /* Builtin functions */ //double bla_c_abs(bla_scomplex *), sqrt(bla_double); //void bla_r_cnjg(bla_scomplex *, bla_scomplex *); /* Local variables */ bla_real norm; bla_scomplex alpha; bla_real scale; if (bla_c_abs(ca) != 0.f) { goto L10; } *c__ = 0.f; bli_csets( 1.f, 0.f, *s ); bli_csets( bli_creal(*cb), bli_cimag(*cb), *ca ); goto L20; L10: scale = bla_c_abs(ca) + bla_c_abs(cb); bli_csets( (bli_creal(*ca) / scale), (bli_cimag(*ca) / scale), q__1 ); /* Computing 2nd power */ r__1 = bla_c_abs(&q__1); bli_csets( (bli_creal(*cb) / scale), (bli_cimag(*cb) / scale), q__2 ); /* Computing 2nd power */ r__2 = bla_c_abs(&q__2); norm = scale * sqrt(r__1 * r__1 + r__2 * r__2); r__1 = bla_c_abs(ca); bli_csets( (bli_creal(*ca) / r__1), (bli_cimag(*ca) / r__1), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), alpha ); *c__ = bla_c_abs(ca) / norm; bla_r_cnjg(&q__3, cb); bli_csets( (bli_creal(alpha) * bli_creal(q__3) - bli_cimag(alpha) * bli_cimag(q__3)), (bli_creal(alpha) * bli_cimag(q__3) + bli_cimag(alpha) * bli_creal(q__3)), q__2 ); bli_csets( (bli_creal(q__2) / norm), (bli_cimag(q__2) / norm), q__1 ); bli_csets( bli_creal(q__1), bli_cimag(q__1), *s ); bli_csets( (norm * bli_creal(alpha)), (norm * bli_cimag(alpha)), q__1 ); bli_csets( bli_creal(q__1), bli_cimag(q__1), *ca ); L20: return 0; } /* crotg_ */ /* zrotg.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s) { /* System generated locals */ bla_double d__1, d__2; bla_dcomplex z__1, z__2, z__3, z__4; /* Builtin functions */ //double bla_z_abs(bla_dcomplex *); //void bla_z_div(bla_dcomplex *, bla_dcomplex *, bla_dcomplex *); //double sqrt(bla_double); //void bla_d_cnjg(bla_dcomplex *, bla_dcomplex *); /* Local variables */ bla_double norm; bla_dcomplex alpha; bla_double scale; if (bla_z_abs(ca) != 0.) { goto L10; } *c__ = 0.; bli_zsets( 1., 0., *s ); bli_zsets( bli_zreal(*cb), bli_zimag(*cb), *ca ); goto L20; L10: scale = bla_z_abs(ca) + bla_z_abs(cb); bli_zsets( (scale), (0.), z__2 ); bla_z_div(&z__1, ca, &z__2); /* Computing 2nd power */ d__1 = bla_z_abs(&z__1); bli_zsets( (scale), (0.), z__4 ); bla_z_div(&z__3, cb, &z__4); /* Computing 2nd power */ d__2 = bla_z_abs(&z__3); norm = scale * sqrt(d__1 * d__1 + d__2 * d__2); d__1 = bla_z_abs(ca); bli_zsets( (bli_zreal(*ca) / d__1), (bli_zimag(*ca) / d__1), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), alpha ); *c__ = bla_z_abs(ca) / norm; bla_d_cnjg(&z__3, cb); bli_zsets( (bli_zreal(alpha) * bli_zreal(z__3) - bli_zimag(alpha) * bli_zimag(z__3)), (bli_zreal(alpha) * bli_zimag(z__3) + bli_zimag(alpha) * bli_zreal(z__3)), z__2 ); bli_zsets( (bli_zreal(z__2) / norm), (bli_zimag(z__2) / norm), z__1 ); bli_zsets( bli_zreal(z__1), bli_zimag(z__1), *s ); bli_zsets( (norm * bli_zreal(alpha)), (norm * bli_zimag(alpha)), z__1 ); bli_zsets( bli_zreal(z__1), bli_zimag(z__1), *ca ); L20: return 0; } /* zrotg_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_rotg.h000066400000000000000000000041101427272030600226230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifdef BLIS_ENABLE_BLAS

/*
   Prototypes for the "rotg" entry points. They are declared only when
   BLIS_ENABLE_BLAS is defined. Every argument is a non-const pointer;
   each function returns int.
*/

/* Single-precision real: all four arguments are bla_real. */
BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s);

/* Double-precision real: all four arguments are bla_double. */
BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s);

/* Single-precision complex: ca, cb and s are complex, c__ is real. */
BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s);

/* Double-precision complex: ca, cb and s are complex, c__ is real. */
BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s);

#endif
cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_rotm.c000066400000000000000000000215401427272030600226320ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* srotm.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam) { /* Initialized data */ static bla_real zero = 0.f; static bla_real two = 2.f; /* System generated locals */ bla_integer i__1, i__2; /* Local variables */ bla_integer i__; bla_real w, z__, sflag; bla_integer kx, ky, nsteps; bla_real sh11, sh12, sh21, sh22; /* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */ /* (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN */ /* (DX**T) */ /* SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */ /* LX = (-INCX)*N, AND SIMILARLY FOR SY USING USING LY AND INCY. */ /* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */ /* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */ /* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */ /* H=( ) ( ) ( ) ( ) */ /* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */ /* SEE SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. */ /* Parameter adjustments */ --sparam; --sy; --sx; /* Function Body */ sflag = sparam[1]; if (*n <= 0 || sflag + two == zero) { goto L140; } if (! 
(*incx == *incy && *incx > 0)) { goto L70; } nsteps = *n * *incx; if (sflag < 0.f) { goto L50; } else if (sflag == 0) { goto L10; } else { goto L30; } L10: sh12 = sparam[4]; sh21 = sparam[3]; i__1 = nsteps; i__2 = *incx; for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { w = sx[i__]; z__ = sy[i__]; sx[i__] = w + z__ * sh12; sy[i__] = w * sh21 + z__; /* L20: */ } goto L140; L30: sh11 = sparam[2]; sh22 = sparam[5]; i__2 = nsteps; i__1 = *incx; for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { w = sx[i__]; z__ = sy[i__]; sx[i__] = w * sh11 + z__; sy[i__] = -w + sh22 * z__; /* L40: */ } goto L140; L50: sh11 = sparam[2]; sh12 = sparam[4]; sh21 = sparam[3]; sh22 = sparam[5]; i__1 = nsteps; i__2 = *incx; for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { w = sx[i__]; z__ = sy[i__]; sx[i__] = w * sh11 + z__ * sh12; sy[i__] = w * sh21 + z__ * sh22; /* L60: */ } goto L140; L70: kx = 1; ky = 1; if (*incx < 0) { kx = (1 - *n) * *incx + 1; } if (*incy < 0) { ky = (1 - *n) * *incy + 1; } if (sflag < 0.f) { goto L120; } else if (sflag == 0) { goto L80; } else { goto L100; } L80: sh12 = sparam[4]; sh21 = sparam[3]; i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { w = sx[kx]; z__ = sy[ky]; sx[kx] = w + z__ * sh12; sy[ky] = w * sh21 + z__; kx += *incx; ky += *incy; /* L90: */ } goto L140; L100: sh11 = sparam[2]; sh22 = sparam[5]; i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { w = sx[kx]; z__ = sy[ky]; sx[kx] = w * sh11 + z__; sy[ky] = -w + sh22 * z__; kx += *incx; ky += *incy; /* L110: */ } goto L140; L120: sh11 = sparam[2]; sh12 = sparam[4]; sh21 = sparam[3]; sh22 = sparam[5]; i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { w = sx[kx]; z__ = sy[ky]; sx[kx] = w * sh11 + z__ * sh12; sy[ky] = w * sh21 + z__ * sh22; kx += *incx; ky += *incy; /* L130: */ } L140: return 0; } /* srotm_ */ /* drotm.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam) { /* Initialized data */ static bla_double zero = 0.; static bla_double two = 2.; /* System generated locals */ bla_integer i__1, i__2; /* Local variables */ bla_integer i__; bla_double dflag, w, z__; bla_integer kx, ky, nsteps; bla_double dh11, dh12, dh22, dh21; /* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */ /* (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN */ /* (DY**T) */ /* DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */ /* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. */ /* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */ /* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */ /* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */ /* H=( ) ( ) ( ) ( ) */ /* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */ /* SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. */ /* Parameter adjustments */ --dparam; --dy; --dx; /* Function Body */ dflag = dparam[1]; if (*n <= 0 || dflag + two == zero) { goto L140; } if (! (*incx == *incy && *incx > 0)) { goto L70; } nsteps = *n * *incx; if (dflag < 0.) { goto L50; } else if (dflag == 0) { goto L10; } else { goto L30; } L10: dh12 = dparam[4]; dh21 = dparam[3]; i__1 = nsteps; i__2 = *incx; for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { w = dx[i__]; z__ = dy[i__]; dx[i__] = w + z__ * dh12; dy[i__] = w * dh21 + z__; /* L20: */ } goto L140; L30: dh11 = dparam[2]; dh22 = dparam[5]; i__2 = nsteps; i__1 = *incx; for (i__ = 1; i__1 < 0 ? 
i__ >= i__2 : i__ <= i__2; i__ += i__1) { w = dx[i__]; z__ = dy[i__]; dx[i__] = w * dh11 + z__; dy[i__] = -w + dh22 * z__; /* L40: */ } goto L140; L50: dh11 = dparam[2]; dh12 = dparam[4]; dh21 = dparam[3]; dh22 = dparam[5]; i__1 = nsteps; i__2 = *incx; for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { w = dx[i__]; z__ = dy[i__]; dx[i__] = w * dh11 + z__ * dh12; dy[i__] = w * dh21 + z__ * dh22; /* L60: */ } goto L140; L70: kx = 1; ky = 1; if (*incx < 0) { kx = (1 - *n) * *incx + 1; } if (*incy < 0) { ky = (1 - *n) * *incy + 1; } if (dflag < 0.) { goto L120; } else if (dflag == 0) { goto L80; } else { goto L100; } L80: dh12 = dparam[4]; dh21 = dparam[3]; i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { w = dx[kx]; z__ = dy[ky]; dx[kx] = w + z__ * dh12; dy[ky] = w * dh21 + z__; kx += *incx; ky += *incy; /* L90: */ } goto L140; L100: dh11 = dparam[2]; dh22 = dparam[5]; i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { w = dx[kx]; z__ = dy[ky]; dx[kx] = w * dh11 + z__; dy[ky] = -w + dh22 * z__; kx += *incx; ky += *incy; /* L110: */ } goto L140; L120: dh11 = dparam[2]; dh12 = dparam[4]; dh21 = dparam[3]; dh22 = dparam[5]; i__2 = *n; for (i__ = 1; i__ <= i__2; ++i__) { w = dx[kx]; z__ = dy[ky]; dx[kx] = w * dh11 + z__ * dh12; dy[ky] = w * dh21 + z__ * dh22; kx += *incx; ky += *incy; /* L130: */ } L140: return 0; } /* drotm_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_rotm.h000066400000000000000000000037661427272030600226510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifdef BLIS_ENABLE_BLAS

/*
   Prototypes for the "rotm" entry points. They are declared only when
   BLIS_ENABLE_BLAS is defined. Each takes an element count n, two vectors
   with their strides, and a read-only parameter array; each returns int.
*/

/* Single-precision variant. */
BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam);

/* Double-precision variant. */
BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam);

#endif
cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_rotmg.c000066400000000000000000000264261427272030600230110ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* srotmg.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam) { /* Initialized data */ static bla_real zero = 0.f; static bla_real one = 1.f; static bla_real two = 2.f; static bla_real gam = 4096.f; static bla_real gamsq = 16777200.f; static bla_real rgamsq = 5.96046e-8f; /* Format strings */ /* System generated locals */ bla_real r__1; /* Local variables */ bla_real sflag, stemp, su, sp1, sp2, sq2, sq1, sh11 = 0.f, sh21 = 0.f, sh12 = 0.f, sh22 = 0.f; bla_integer igo; /* Assigned format variables */ /* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */ /* THE SECOND COMPONENT OF THE 2-VECTOR (SQRT(SD1)*SX1,SQRT(SD2)* */ /* SY2)**T. */ /* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */ /* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */ /* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */ /* H=( ) ( ) ( ) ( ) */ /* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */ /* LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22 */ /* RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE */ /* VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) */ /* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */ /* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */ /* OF SD1 AND SD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */ /* Parameter adjustments */ --sparam; /* Function Body */ if (! (*sd1 < zero)) { goto L10; } /* GO ZERO-H-D-AND-SX1.. */ goto L60; L10: /* CASE-SD1-NONNEGATIVE */ sp2 = *sd2 * *sy1; if (! (sp2 == zero)) { goto L20; } sflag = -two; goto L260; /* REGULAR-CASE.. */ L20: sp1 = *sd1 * *sx1; sq2 = sp2 * *sy1; sq1 = sp1 * *sx1; if (! (bli_fabs(sq1) > bli_fabs(sq2))) { goto L40; } sh21 = -(*sy1) / *sx1; sh12 = sp2 / sp1; su = one - sh12 * sh21; if (! (su <= zero)) { goto L30; } /* GO ZERO-H-D-AND-SX1.. 
*/ goto L60; L30: sflag = zero; *sd1 /= su; *sd2 /= su; *sx1 *= su; /* GO SCALE-CHECK.. */ goto L100; L40: if (! (sq2 < zero)) { goto L50; } /* GO ZERO-H-D-AND-SX1.. */ goto L60; L50: sflag = one; sh11 = sp1 / sp2; sh22 = *sx1 / *sy1; su = one + sh11 * sh22; stemp = *sd2 / su; *sd2 = *sd1 / su; *sd1 = stemp; *sx1 = *sy1 * su; /* GO SCALE-CHECK */ goto L100; /* PROCEDURE..ZERO-H-D-AND-SX1.. */ L60: sflag = -one; sh11 = zero; sh12 = zero; sh21 = zero; sh22 = zero; *sd1 = zero; *sd2 = zero; *sx1 = zero; /* RETURN.. */ goto L220; /* PROCEDURE..FIX-H.. */ L70: if (! (sflag >= zero)) { goto L90; } if (! (sflag == zero)) { goto L80; } sh11 = one; sh22 = one; sflag = -one; goto L90; L80: sh21 = -one; sh12 = one; sflag = -one; L90: switch (igo) { case 0: goto L120; case 1: goto L150; case 2: goto L180; case 3: goto L210; } /* PROCEDURE..SCALE-CHECK */ L100: L110: if (! (*sd1 <= rgamsq)) { goto L130; } if (*sd1 == zero) { goto L160; } igo = 0; /* FIX-H.. */ goto L70; L120: /* Computing 2nd power */ r__1 = gam; *sd1 *= r__1 * r__1; *sx1 /= gam; sh11 /= gam; sh12 /= gam; goto L110; L130: L140: if (! (*sd1 >= gamsq)) { goto L160; } igo = 1; /* FIX-H.. */ goto L70; L150: /* Computing 2nd power */ r__1 = gam; *sd1 /= r__1 * r__1; *sx1 *= gam; sh11 *= gam; sh12 *= gam; goto L140; L160: L170: if (! (bli_fabs(*sd2) <= rgamsq)) { goto L190; } if (*sd2 == zero) { goto L220; } igo = 2; /* FIX-H.. */ goto L70; L180: /* Computing 2nd power */ r__1 = gam; *sd2 *= r__1 * r__1; sh21 /= gam; sh22 /= gam; goto L170; L190: L200: if (! (bli_fabs(*sd2) >= gamsq)) { goto L220; } igo = 3; /* FIX-H.. 
*/ goto L70; L210: /* Computing 2nd power */ r__1 = gam; *sd2 /= r__1 * r__1; sh21 *= gam; sh22 *= gam; goto L200; L220: if (sflag < 0.f) { goto L250; } else if (sflag == 0) { goto L230; } else { goto L240; } L230: sparam[3] = sh21; sparam[4] = sh12; goto L260; L240: sparam[2] = sh11; sparam[5] = sh22; goto L260; L250: sparam[2] = sh11; sparam[3] = sh21; sparam[4] = sh12; sparam[5] = sh22; L260: sparam[1] = sflag; return 0; } /* srotmg_ */ /* drotmg.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine: double-precision ROTMG. Builds the modified Givens matrix H described below; updates *dd1, *dd2 and *dx1 in place, reads *dy1, writes dparam, and always returns 0. */ int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam) { /* Initialized data: gam is the rescaling factor; gamsq (= gam**2) and rgamsq (approximately 1/gamsq) are the size thresholds tested in SCALE-CHECK */ static bla_double zero = 0.; static bla_double one = 1.; static bla_double two = 2.; static bla_double gam = 4096.; static bla_double gamsq = 16777216.; static bla_double rgamsq = 5.9604645e-8; /* Format strings */ /* System generated locals: d__1 holds gam while its square is formed */ bla_double d__1; /* Local variables: dflag selects the form of H; the four dh** entries of H are zero-initialized at declaration */ bla_double dflag, dtemp, du, dp1, dp2, dq2, dq1, dh11 = 0.f, dh21 = 0.f, dh12 = 0.f, dh22 = 0.f; bla_integer igo; /* Assigned format variables: igo selects the label (L120, L150, L180 or L210) that FIX-H jumps back to */ /* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */ /* THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)* */ /* DY1)**T. */ /* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */ /* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */ /* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */ /* H=( ) ( ) ( ) ( ) */ /* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */ /* LOCATIONS 2-5 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 */ /* RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE */ /* VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM: DFLAG=0 WRITES ONLY DH21 AND DH12, DFLAG=1 ONLY DH11 AND DH22, DFLAG=-2 NONE OF THEM.) */ /* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */ /* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */ /* OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. 
*/ /* Parameter adjustments: shift dparam down one element so the 1-based indices dparam[1]..dparam[5] used below address the caller's first five entries */ --dparam; /* Function Body: a negative *dd1 is rejected via ZERO-H-D-AND-DX1 */ if (! (*dd1 < zero)) { goto L10; } /* GO ZERO-H-D-AND-DX1.. */ goto L60; L10: /* CASE-DD1-NONNEGATIVE: if dp2 = dd2*dy1 is exactly zero, set dflag = -2 and jump straight to L260, which writes only dparam[1] */ dp2 = *dd2 * *dy1; if (! (dp2 == zero)) { goto L20; } dflag = -two; goto L260; /* REGULAR-CASE: compare |dq1| = |dd1*dx1**2| with |dq2| = |dd2*dy1**2| to choose between the dflag = 0 form (L30) and the dflag = 1 form (L50) */ L20: dp1 = *dd1 * *dx1; dq2 = dp2 * *dy1; dq1 = dp1 * *dx1; if (! (bli_fabs(dq1) > bli_fabs(dq2))) { goto L40; } dh21 = -(*dy1) / *dx1; dh12 = dp2 / dp1; du = one - dh12 * dh21; if (! (du <= zero)) { goto L30; } /* GO ZERO-H-D-AND-DX1 (du <= 0).. */ goto L60; L30: dflag = zero; *dd1 /= du; *dd2 /= du; *dx1 *= du; /* GO SCALE-CHECK.. */ goto L100; L40: if (! (dq2 < zero)) { goto L50; } /* GO ZERO-H-D-AND-DX1 (dq2 < 0).. */ goto L60; L50: dflag = one; dh11 = dp1 / dp2; dh22 = *dx1 / *dy1; du = one + dh11 * dh22; dtemp = *dd2 / du; *dd2 = *dd1 / du; *dd1 = dtemp; *dx1 = *dy1 * du; /* GO SCALE-CHECK */ goto L100; /* PROCEDURE..ZERO-H-D-AND-DX1: set dflag = -1 and zero H, *dd1, *dd2 and *dx1 */ L60: dflag = -one; dh11 = zero; dh12 = zero; dh21 = zero; dh22 = zero; *dd1 = zero; *dd2 = zero; *dx1 = zero; /* RETURN via the store step at L220.. */ goto L220; /* PROCEDURE..FIX-H: if dflag is 0 or 1, fill in the implied unit entries of H and switch to the full form dflag = -1; then resume at the rescaling step selected by igo */ L70: if (! (dflag >= zero)) { goto L90; } if (! (dflag == zero)) { goto L80; } dh11 = one; dh22 = one; dflag = -one; goto L90; L80: dh21 = -one; dh12 = one; dflag = -one; L90: switch (igo) { case 0: goto L120; case 1: goto L150; case 2: goto L180; case 3: goto L210; } /* PROCEDURE..SCALE-CHECK: while *dd1 is nonzero and <= rgamsq multiply it by gam**2 (L120); while it is >= gamsq divide it by gam**2 (L150); then do the same for |*dd2| (L180, L210). dx1, dh11 and dh12 are rescaled by gam together with dd1, and dh21 and dh22 together with dd2 */ L100: L110: if (! (*dd1 <= rgamsq)) { goto L130; } if (*dd1 == zero) { goto L160; } igo = 0; /* FIX-H, resuming at L120.. */ goto L70; L120: /* Computing 2nd power */ d__1 = gam; *dd1 *= d__1 * d__1; *dx1 /= gam; dh11 /= gam; dh12 /= gam; goto L110; L130: L140: if (! (*dd1 >= gamsq)) { goto L160; } igo = 1; /* FIX-H, resuming at L150.. */ goto L70; L150: /* Computing 2nd power */ d__1 = gam; *dd1 /= d__1 * d__1; *dx1 *= gam; dh11 *= gam; dh12 *= gam; goto L140; L160: L170: if (! (bli_fabs(*dd2) <= rgamsq)) { goto L190; } if (*dd2 == zero) { goto L220; } igo = 2; /* FIX-H, resuming at L180.. 
*/ goto L70; L180: /* Computing 2nd power */ d__1 = gam; *dd2 *= d__1 * d__1; dh21 /= gam; dh22 /= gam; goto L170; L190: L200: if (! (bli_fabs(*dd2) >= gamsq)) { goto L220; } igo = 3; /* FIX-H, resuming at L210.. */ goto L70; L210: /* Computing 2nd power */ d__1 = gam; *dd2 /= d__1 * d__1; dh21 *= gam; dh22 *= gam; goto L200; L220: if (dflag < 0.) { goto L250; } else if (dflag == 0) { goto L230; } else { goto L240; } L230: dparam[3] = dh21; dparam[4] = dh12; goto L260; L240: dparam[2] = dh11; dparam[5] = dh22; goto L260; L250: dparam[2] = dh11; dparam[3] = dh21; dparam[4] = dh12; dparam[5] = dh22; L260: dparam[1] = dflag; return 0; } /* drotmg_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_rotmg.h000066400000000000000000000036541427272030600230140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS /* Prototypes for the f2c-translated rotmg routines taking bla_real (s) and bla_double (d) arguments; declared only when BLIS_ENABLE_BLAS is defined. Both update their first three arguments in place, read the fourth, write the parameter array and return int. */ BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam); BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_sbmv.c000066400000000000000000000522451427272030600226260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* dsbmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy) { /* System generated locals */ bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4; /* Local variables */ bla_integer info; bla_double temp1, temp2; bla_integer i__, j, l; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kplus1, ix, iy, jx, jy, kx, ky; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* DSBMV performs the matrix-vector operation */ /* y := alpha*A*x + beta*y, */ /* where alpha and beta are scalars, x and y are n element vectors and */ /* A is an n by n symmetric band matrix, with k super-diagonals. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the upper or lower */ /* triangular part of the band matrix A is being supplied as */ /* follows: */ /* UPLO = 'U' or 'u' The upper triangular part of A is */ /* being supplied. 
*/ /* UPLO = 'L' or 'l' The lower triangular part of A is */ /* being supplied. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* K - INTEGER. */ /* On entry, K specifies the number of super-diagonals of the */ /* matrix A. K must satisfy 0 .le. K. */ /* Unchanged on exit. */ /* ALPHA - DOUBLE PRECISION. */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */ /* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ /* by n part of the array A must contain the upper triangular */ /* band part of the symmetric matrix, supplied column by */ /* column, with the leading diagonal of the matrix in row */ /* ( k + 1 ) of the array, the first super-diagonal starting at */ /* position 2 in row k, and so on. The top left k by k triangle */ /* of the array A is not referenced. */ /* The following program segment will transfer the upper */ /* triangular part of a symmetric band matrix from conventional */ /* full matrix storage to band storage: */ /* DO 20, J = 1, N */ /* M = K + 1 - J */ /* DO 10, I = MAX( 1, J - K ), J */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ /* by n part of the array A must contain the lower triangular */ /* band part of the symmetric matrix, supplied column by */ /* column, with the leading diagonal of the matrix in row 1 of */ /* the array, the first sub-diagonal starting at position 1 in */ /* row 2, and so on. The bottom right k by k triangle of the */ /* array A is not referenced. 
*/ /* The following program segment will transfer the lower */ /* triangular part of a symmetric band matrix from conventional */ /* full matrix storage to band storage: */ /* DO 20, J = 1, N */ /* M = 1 - J */ /* DO 10, I = J, MIN( N, J + K ) */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Unchanged on exit. */ /* LDA - INTEGER. */ /* On entry, LDA specifies the first dimension of A as declared */ /* in the calling (sub) program. LDA must be at least */ /* ( k + 1 ). */ /* Unchanged on exit. */ /* X - DOUBLE PRECISION array of DIMENSION at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the */ /* vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* BETA - DOUBLE PRECISION. */ /* On entry, BETA specifies the scalar beta. */ /* Unchanged on exit. */ /* Y - DOUBLE PRECISION array of DIMENSION at least */ /* ( 1 + ( n - 1 )*abs( INCY ) ). */ /* Before entry, the incremented array Y must contain the */ /* vector y. On exit, Y is overwritten by the updated vector y. */ /* INCY - INTEGER. */ /* On entry, INCY specifies the increment for the elements of */ /* Y. INCY must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; --y; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! 
PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (*n < 0) { info = 2; } else if (*k < 0) { info = 3; } else if (*lda < *k + 1) { info = 6; } else if (*incx == 0) { info = 8; } else if (*incy == 0) { info = 11; } if (info != 0) { PASTEF770(xerbla)("DSBMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0 || (*alpha == 0. && *beta == 1.)) { return 0; } /* Set up the start points in X and Y. */ if (*incx > 0) { kx = 1; } else { kx = 1 - (*n - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (*n - 1) * *incy; } /* Start the operations. In this version the elements of the array A */ /* are accessed sequentially with one pass through A. */ /* First form y := beta*y. */ if (*beta != 1.) { if (*incy == 1) { if (*beta == 0.) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = 0.; /* L10: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = *beta * y[i__]; /* L20: */ } } } else { iy = ky; if (*beta == 0.) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[iy] = 0.; iy += *incy; /* L30: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[iy] = *beta * y[iy]; iy += *incy; /* L40: */ } } } } if (*alpha == 0.) { return 0; } if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Form y when upper triangle of A is stored. 
*/ kplus1 = *k + 1; if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[j]; temp2 = 0.; l = kplus1 - j; /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__4 = j - 1; for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { y[i__] += temp1 * a[l + i__ + j * a_dim1]; temp2 += a[l + i__ + j * a_dim1] * x[i__]; /* L50: */ } y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2; /* L60: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[jx]; temp2 = 0.; ix = kx; iy = ky; l = kplus1 - j; /* Computing MAX */ i__4 = 1, i__2 = j - *k; i__3 = j - 1; for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { y[iy] += temp1 * a[l + i__ + j * a_dim1]; temp2 += a[l + i__ + j * a_dim1] * x[ix]; ix += *incx; iy += *incy; /* L70: */ } y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2; jx += *incx; jy += *incy; if (j > *k) { kx += *incx; ky += *incy; } /* L80: */ } } } else { /* Form y when lower triangle of A is stored. */ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[j]; temp2 = 0.; y[j] += temp1 * a[j * a_dim1 + 1]; l = 1 - j; /* Computing MIN */ i__4 = *n, i__2 = j + *k; i__3 = f2c_min(i__4,i__2); for (i__ = j + 1; i__ <= i__3; ++i__) { y[i__] += temp1 * a[l + i__ + j * a_dim1]; temp2 += a[l + i__ + j * a_dim1] * x[i__]; /* L90: */ } y[j] += *alpha * temp2; /* L100: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[jx]; temp2 = 0.; y[jy] += temp1 * a[j * a_dim1 + 1]; l = 1 - j; ix = jx; iy = jy; /* Computing MIN */ i__4 = *n, i__2 = j + *k; i__3 = f2c_min(i__4,i__2); for (i__ = j + 1; i__ <= i__3; ++i__) { ix += *incx; iy += *incy; y[iy] += temp1 * a[l + i__ + j * a_dim1]; temp2 += a[l + i__ + j * a_dim1] * x[ix]; /* L110: */ } y[jy] += *alpha * temp2; jx += *incx; jy += *incy; /* L120: */ } } } return 0; /* End of DSBMV . */ } /* dsbmv_ */ /* ssbmv.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy) { /* System generated locals */ bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4; /* Local variables */ bla_integer info; bla_real temp1, temp2; bla_integer i__, j, l; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kplus1, ix, iy, jx, jy, kx, ky; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* SSBMV performs the matrix-vector operation */ /* y := alpha*A*x + beta*y, */ /* where alpha and beta are scalars, x and y are n element vectors and */ /* A is an n by n symmetric band matrix, with k super-diagonals. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the upper or lower */ /* triangular part of the band matrix A is being supplied as */ /* follows: */ /* UPLO = 'U' or 'u' The upper triangular part of A is */ /* being supplied. */ /* UPLO = 'L' or 'l' The lower triangular part of A is */ /* being supplied. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* K - INTEGER. */ /* On entry, K specifies the number of super-diagonals of the */ /* matrix A. K must satisfy 0 .le. K. */ /* Unchanged on exit. */ /* ALPHA - REAL . */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* A - REAL array of DIMENSION ( LDA, n ). 
*/ /* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ /* by n part of the array A must contain the upper triangular */ /* band part of the symmetric matrix, supplied column by */ /* column, with the leading diagonal of the matrix in row */ /* ( k + 1 ) of the array, the first super-diagonal starting at */ /* position 2 in row k, and so on. The top left k by k triangle */ /* of the array A is not referenced. */ /* The following program segment will transfer the upper */ /* triangular part of a symmetric band matrix from conventional */ /* full matrix storage to band storage: */ /* DO 20, J = 1, N */ /* M = K + 1 - J */ /* DO 10, I = MAX( 1, J - K ), J */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ /* by n part of the array A must contain the lower triangular */ /* band part of the symmetric matrix, supplied column by */ /* column, with the leading diagonal of the matrix in row 1 of */ /* the array, the first sub-diagonal starting at position 1 in */ /* row 2, and so on. The bottom right k by k triangle of the */ /* array A is not referenced. */ /* The following program segment will transfer the lower */ /* triangular part of a symmetric band matrix from conventional */ /* full matrix storage to band storage: */ /* DO 20, J = 1, N */ /* M = 1 - J */ /* DO 10, I = J, MIN( N, J + K ) */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Unchanged on exit. */ /* LDA - INTEGER. */ /* On entry, LDA specifies the first dimension of A as declared */ /* in the calling (sub) program. LDA must be at least */ /* ( k + 1 ). */ /* Unchanged on exit. */ /* X - REAL array of DIMENSION at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the */ /* vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. 
*/ /* Unchanged on exit. */ /* BETA - REAL . */ /* On entry, BETA specifies the scalar beta. */ /* Unchanged on exit. */ /* Y - REAL array of DIMENSION at least */ /* ( 1 + ( n - 1 )*abs( INCY ) ). */ /* Before entry, the incremented array Y must contain the */ /* vector y. On exit, Y is overwritten by the updated vector y. */ /* INCY - INTEGER. */ /* On entry, INCY specifies the increment for the elements of */ /* Y. INCY must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; --y; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (*n < 0) { info = 2; } else if (*k < 0) { info = 3; } else if (*lda < *k + 1) { info = 6; } else if (*incx == 0) { info = 8; } else if (*incy == 0) { info = 11; } if (info != 0) { PASTEF770(xerbla)("SSBMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) { return 0; } /* Set up the start points in X and Y. */ if (*incx > 0) { kx = 1; } else { kx = 1 - (*n - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (*n - 1) * *incy; } /* Start the operations. In this version the elements of the array A */ /* are accessed sequentially with one pass through A. */ /* First form y := beta*y. 
*/ if (*beta != 1.f) { if (*incy == 1) { if (*beta == 0.f) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = 0.f; /* L10: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = *beta * y[i__]; /* L20: */ } } } else { iy = ky; if (*beta == 0.f) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[iy] = 0.f; iy += *incy; /* L30: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[iy] = *beta * y[iy]; iy += *incy; /* L40: */ } } } } if (*alpha == 0.f) { return 0; } if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Form y when upper triangle of A is stored. */ kplus1 = *k + 1; if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[j]; temp2 = 0.f; l = kplus1 - j; /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__4 = j - 1; for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { y[i__] += temp1 * a[l + i__ + j * a_dim1]; temp2 += a[l + i__ + j * a_dim1] * x[i__]; /* L50: */ } y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2; /* L60: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[jx]; temp2 = 0.f; ix = kx; iy = ky; l = kplus1 - j; /* Computing MAX */ i__4 = 1, i__2 = j - *k; i__3 = j - 1; for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { y[iy] += temp1 * a[l + i__ + j * a_dim1]; temp2 += a[l + i__ + j * a_dim1] * x[ix]; ix += *incx; iy += *incy; /* L70: */ } y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2; jx += *incx; jy += *incy; if (j > *k) { kx += *incx; ky += *incy; } /* L80: */ } } } else { /* Form y when lower triangle of A is stored. 
*/ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[j]; temp2 = 0.f; y[j] += temp1 * a[j * a_dim1 + 1]; l = 1 - j; /* Computing MIN */ i__4 = *n, i__2 = j + *k; i__3 = f2c_min(i__4,i__2); for (i__ = j + 1; i__ <= i__3; ++i__) { y[i__] += temp1 * a[l + i__ + j * a_dim1]; temp2 += a[l + i__ + j * a_dim1] * x[i__]; /* L90: */ } y[j] += *alpha * temp2; /* L100: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[jx]; temp2 = 0.f; y[jy] += temp1 * a[j * a_dim1 + 1]; l = 1 - j; ix = jx; iy = jy; /* Computing MIN */ i__4 = *n, i__2 = j + *k; i__3 = f2c_min(i__4,i__2); for (i__ = j + 1; i__ <= i__3; ++i__) { ix += *incx; iy += *incy; y[iy] += temp1 * a[l + i__ + j * a_dim1]; temp2 += a[l + i__ + j * a_dim1] * x[ix]; /* L110: */ } y[jy] += *alpha * temp2; jx += *incx; jy += *incy; /* L120: */ } } } return 0; /* End of SSBMV . */ } /* ssbmv_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_sbmv.h000066400000000000000000000043441427272030600226300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS /* Prototypes for the f2c-translated symmetric band matrix-vector routines (y := alpha*A*x + beta*y) taking bla_double (d) and bla_real (s) data; declared only when BLIS_ENABLE_BLAS is defined. Every argument is passed by pointer and only y is non-const. */ BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy); BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_spmv.c000066400000000000000000000416511427272030600226430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* dspmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *ap, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy) { /* System generated locals */ bla_integer i__1, i__2; /* Local variables */ bla_integer info; bla_double temp1, temp2; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, iy, jx, jy, kx, ky; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); /* .. Scalar Arguments .. */ /* .. 
Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* DSPMV performs the matrix-vector operation */ /* y := alpha*A*x + beta*y, */ /* where alpha and beta are scalars, x and y are n element vectors and */ /* A is an n by n symmetric matrix, supplied in packed form. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the upper or lower */ /* triangular part of the matrix A is supplied in the packed */ /* array AP as follows: */ /* UPLO = 'U' or 'u' The upper triangular part of A is */ /* supplied in AP. */ /* UPLO = 'L' or 'l' The lower triangular part of A is */ /* supplied in AP. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* ALPHA - DOUBLE PRECISION. */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* AP - DOUBLE PRECISION array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular part of the symmetric matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ /* and a( 2, 2 ) respectively, and so on. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular part of the symmetric matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ /* and a( 3, 1 ) respectively, and so on. */ /* Unchanged on exit. */ /* X - DOUBLE PRECISION array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* BETA - DOUBLE PRECISION. 
*/ /* On entry, BETA specifies the scalar beta. When BETA is */ /* supplied as zero then Y need not be set on input. */ /* Unchanged on exit. */ /* Y - DOUBLE PRECISION array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCY ) ). */ /* Before entry, the incremented array Y must contain the n */ /* element vector y. On exit, Y is overwritten by the updated */ /* vector y. */ /* INCY - INTEGER. */ /* On entry, INCY specifies the increment for the elements of */ /* Y. INCY must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ --y; --x; --ap; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (*n < 0) { info = 2; } else if (*incx == 0) { info = 6; } else if (*incy == 0) { info = 9; } if (info != 0) { PASTEF770(xerbla)("DSPMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0 || (*alpha == 0. && *beta == 1.)) { return 0; } /* Set up the start points in X and Y. */ if (*incx > 0) { kx = 1; } else { kx = 1 - (*n - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (*n - 1) * *incy; } /* Start the operations. In this version the elements of the array AP */ /* are accessed sequentially with one pass through AP. */ /* First form y := beta*y. */ if (*beta != 1.) { if (*incy == 1) { if (*beta == 0.) 
{ i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = 0.; /* L10: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = *beta * y[i__]; /* L20: */ } } } else { iy = ky; if (*beta == 0.) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[iy] = 0.; iy += *incy; /* L30: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[iy] = *beta * y[iy]; iy += *incy; /* L40: */ } } } } if (*alpha == 0.) { return 0; } kk = 1; if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Form y when AP contains the upper triangle. */ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[j]; temp2 = 0.; k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { y[i__] += temp1 * ap[k]; temp2 += ap[k] * x[i__]; ++k; /* L50: */ } y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2; kk += j; /* L60: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[jx]; temp2 = 0.; ix = kx; iy = ky; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { y[iy] += temp1 * ap[k]; temp2 += ap[k] * x[ix]; ix += *incx; iy += *incy; /* L70: */ } y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2; jx += *incx; jy += *incy; kk += j; /* L80: */ } } } else { /* Form y when AP contains the lower triangle. 
*/ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[j]; temp2 = 0.; y[j] += temp1 * ap[kk]; k = kk + 1; i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { y[i__] += temp1 * ap[k]; temp2 += ap[k] * x[i__]; ++k; /* L90: */ } y[j] += *alpha * temp2; kk += *n - j + 1; /* L100: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[jx]; temp2 = 0.; y[jy] += temp1 * ap[kk]; ix = jx; iy = jy; i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; iy += *incy; y[iy] += temp1 * ap[k]; temp2 += ap[k] * x[ix]; /* L110: */ } y[jy] += *alpha * temp2; jx += *incx; jy += *incy; kk += *n - j + 1; /* L120: */ } } } return 0; /* End of DSPMV . */ } /* dspmv_ */ /* sspmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy) { /* System generated locals */ bla_integer i__1, i__2; /* Local variables */ bla_integer info; bla_real temp1, temp2; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, iy, jx, jy, kx, ky; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* SSPMV performs the matrix-vector operation */ /* y := alpha*A*x + beta*y, */ /* where alpha and beta are scalars, x and y are n element vectors and */ /* A is an n by n symmetric matrix, supplied in packed form. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. 
*/ /* On entry, UPLO specifies whether the upper or lower */ /* triangular part of the matrix A is supplied in the packed */ /* array AP as follows: */ /* UPLO = 'U' or 'u' The upper triangular part of A is */ /* supplied in AP. */ /* UPLO = 'L' or 'l' The lower triangular part of A is */ /* supplied in AP. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* ALPHA - REAL . */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* AP - REAL array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular part of the symmetric matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ /* and a( 2, 2 ) respectively, and so on. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular part of the symmetric matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ /* and a( 3, 1 ) respectively, and so on. */ /* Unchanged on exit. */ /* X - REAL array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* BETA - REAL . */ /* On entry, BETA specifies the scalar beta. When BETA is */ /* supplied as zero then Y need not be set on input. */ /* Unchanged on exit. */ /* Y - REAL array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCY ) ). */ /* Before entry, the incremented array Y must contain the n */ /* element vector y. On exit, Y is overwritten by the updated */ /* vector y. */ /* INCY - INTEGER. 
*/ /* On entry, INCY specifies the increment for the elements of */ /* Y. INCY must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ --y; --x; --ap; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (*n < 0) { info = 2; } else if (*incx == 0) { info = 6; } else if (*incy == 0) { info = 9; } if (info != 0) { PASTEF770(xerbla)("SSPMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) { return 0; } /* Set up the start points in X and Y. */ if (*incx > 0) { kx = 1; } else { kx = 1 - (*n - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (*n - 1) * *incy; } /* Start the operations. In this version the elements of the array AP */ /* are accessed sequentially with one pass through AP. */ /* First form y := beta*y. */ if (*beta != 1.f) { if (*incy == 1) { if (*beta == 0.f) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = 0.f; /* L10: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[i__] = *beta * y[i__]; /* L20: */ } } } else { iy = ky; if (*beta == 0.f) { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[iy] = 0.f; iy += *incy; /* L30: */ } } else { i__1 = *n; for (i__ = 1; i__ <= i__1; ++i__) { y[iy] = *beta * y[iy]; iy += *incy; /* L40: */ } } } } if (*alpha == 0.f) { return 0; } kk = 1; if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Form y when AP contains the upper triangle. 
*/ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[j]; temp2 = 0.f; k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { y[i__] += temp1 * ap[k]; temp2 += ap[k] * x[i__]; ++k; /* L50: */ } y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2; kk += j; /* L60: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[jx]; temp2 = 0.f; ix = kx; iy = ky; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { y[iy] += temp1 * ap[k]; temp2 += ap[k] * x[ix]; ix += *incx; iy += *incy; /* L70: */ } y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2; jx += *incx; jy += *incy; kk += j; /* L80: */ } } } else { /* Form y when AP contains the lower triangle. */ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[j]; temp2 = 0.f; y[j] += temp1 * ap[kk]; k = kk + 1; i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { y[i__] += temp1 * ap[k]; temp2 += ap[k] * x[i__]; ++k; /* L90: */ } y[j] += *alpha * temp2; kk += *n - j + 1; /* L100: */ } } else { jx = kx; jy = ky; i__1 = *n; for (j = 1; j <= i__1; ++j) { temp1 = *alpha * x[jx]; temp2 = 0.f; y[jy] += temp1 * ap[kk]; ix = jx; iy = jy; i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; iy += *incy; y[iy] += temp1 * ap[k]; temp2 += ap[k] * x[ix]; /* L110: */ } y[jy] += *alpha * temp2; jx += *incx; jy += *incy; kk += *n - j + 1; /* L120: */ } } } return 0; /* End of SSPMV . */ } /* sspmv_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_spmv.h000066400000000000000000000042121427272030600226400ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *ap, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy); BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_spr.c000066400000000000000000000331001427272030600224500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* dspr.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, bla_double *ap) { /* System generated locals */ bla_integer i__1, i__2; /* Local variables */ bla_integer info; bla_double temp; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* DSPR performs the symmetric rank 1 operation */ /* A := alpha*x*x' + A, */ /* where alpha is a bla_real scalar, x is an n element vector and A is an */ /* n by n symmetric matrix, supplied in packed form. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the upper or lower */ /* triangular part of the matrix A is supplied in the packed */ /* array AP as follows: */ /* UPLO = 'U' or 'u' The upper triangular part of A is */ /* supplied in AP. */ /* UPLO = 'L' or 'l' The lower triangular part of A is */ /* supplied in AP. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. 
*/ /* N must be at least zero. */ /* Unchanged on exit. */ /* ALPHA - DOUBLE PRECISION. */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* X - DOUBLE PRECISION array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* AP - DOUBLE PRECISION array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular part of the symmetric matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ /* and a( 2, 2 ) respectively, and so on. On exit, the array */ /* AP is overwritten by the upper triangular part of the */ /* updated matrix. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular part of the symmetric matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ /* and a( 3, 1 ) respectively, and so on. On exit, the array */ /* AP is overwritten by the lower triangular part of the */ /* updated matrix. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ --ap; --x; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! 
PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (*n < 0) { info = 2; } else if (*incx == 0) { info = 5; } if (info != 0) { PASTEF770(xerbla)("DSPR ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0 || *alpha == 0.) { return 0; } /* Set the start point in X if the increment is not unity. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of the array AP */ /* are accessed sequentially with one pass through AP. */ kk = 1; if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Form A when upper triangle is stored in AP. */ if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[j] != 0.) { temp = *alpha * x[j]; k = kk; i__2 = j; for (i__ = 1; i__ <= i__2; ++i__) { ap[k] += x[i__] * temp; ++k; /* L10: */ } } kk += j; /* L20: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.) { temp = *alpha * x[jx]; ix = kx; i__2 = kk + j - 1; for (k = kk; k <= i__2; ++k) { ap[k] += x[ix] * temp; ix += *incx; /* L30: */ } } jx += *incx; kk += j; /* L40: */ } } } else { /* Form A when lower triangle is stored in AP. */ if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[j] != 0.) { temp = *alpha * x[j]; k = kk; i__2 = *n; for (i__ = j; i__ <= i__2; ++i__) { ap[k] += x[i__] * temp; ++k; /* L50: */ } } kk = kk + *n - j + 1; /* L60: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.) { temp = *alpha * x[jx]; ix = jx; i__2 = kk + *n - j; for (k = kk; k <= i__2; ++k) { ap[k] += x[ix] * temp; ix += *incx; /* L70: */ } } jx += *incx; kk = kk + *n - j + 1; /* L80: */ } } } return 0; /* End of DSPR . */ } /* dspr_ */ /* sspr.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap) { /* System generated locals */ bla_integer i__1, i__2; /* Local variables */ bla_integer info; bla_real temp; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* SSPR performs the symmetric rank 1 operation */ /* A := alpha*x*x' + A, */ /* where alpha is a bla_real scalar, x is an n element vector and A is an */ /* n by n symmetric matrix, supplied in packed form. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the upper or lower */ /* triangular part of the matrix A is supplied in the packed */ /* array AP as follows: */ /* UPLO = 'U' or 'u' The upper triangular part of A is */ /* supplied in AP. */ /* UPLO = 'L' or 'l' The lower triangular part of A is */ /* supplied in AP. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* ALPHA - REAL . */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* X - REAL array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* AP - REAL array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). 
*/ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular part of the symmetric matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ /* and a( 2, 2 ) respectively, and so on. On exit, the array */ /* AP is overwritten by the upper triangular part of the */ /* updated matrix. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular part of the symmetric matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ /* and a( 3, 1 ) respectively, and so on. On exit, the array */ /* AP is overwritten by the lower triangular part of the */ /* updated matrix. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ --ap; --x; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (*n < 0) { info = 2; } else if (*incx == 0) { info = 5; } if (info != 0) { PASTEF770(xerbla)("SSPR ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0 || *alpha == 0.f) { return 0; } /* Set the start point in X if the increment is not unity. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of the array AP */ /* are accessed sequentially with one pass through AP. 
*/ kk = 1; if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Form A when upper triangle is stored in AP. */ if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[j] != 0.f) { temp = *alpha * x[j]; k = kk; i__2 = j; for (i__ = 1; i__ <= i__2; ++i__) { ap[k] += x[i__] * temp; ++k; /* L10: */ } } kk += j; /* L20: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.f) { temp = *alpha * x[jx]; ix = kx; i__2 = kk + j - 1; for (k = kk; k <= i__2; ++k) { ap[k] += x[ix] * temp; ix += *incx; /* L30: */ } } jx += *incx; kk += j; /* L40: */ } } } else { /* Form A when lower triangle is stored in AP. */ if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[j] != 0.f) { temp = *alpha * x[j]; k = kk; i__2 = *n; for (i__ = j; i__ <= i__2; ++i__) { ap[k] += x[i__] * temp; ++k; /* L50: */ } } kk = kk + *n - j + 1; /* L60: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.f) { temp = *alpha * x[jx]; ix = jx; i__2 = kk + *n - j; for (k = kk; k <= i__2; ++k) { ap[k] += x[ix] * temp; ix += *incx; /* L70: */ } } jx += *incx; kk = kk + *n - j + 1; /* L80: */ } } } return 0; /* End of SSPR . */ } /* sspr_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_spr.h000066400000000000000000000040001427272030600224520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, bla_double *ap); BLIS_EXPORT_BLAS int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_spr2.c000066400000000000000000000372721427272030600225500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* dspr2.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, const bla_double *y, const bla_integer *incy, bla_double *ap) { /* System generated locals */ bla_integer i__1, i__2; /* Local variables */ bla_integer info; bla_double temp1, temp2; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, iy, jx = 0, jy = 0, kx = 0, ky = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); /* .. Scalar Arguments .. */ /* .. Array Arguments .. 
*/ /* .. */ /* Purpose */ /* ======= */ /* DSPR2 performs the symmetric rank 2 operation */ /* A := alpha*x*y' + alpha*y*x' + A, */ /* where alpha is a scalar, x and y are n element vectors and A is an */ /* n by n symmetric matrix, supplied in packed form. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the upper or lower */ /* triangular part of the matrix A is supplied in the packed */ /* array AP as follows: */ /* UPLO = 'U' or 'u' The upper triangular part of A is */ /* supplied in AP. */ /* UPLO = 'L' or 'l' The lower triangular part of A is */ /* supplied in AP. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* ALPHA - DOUBLE PRECISION. */ /* On entry, ALPHA specifies the scalar alpha. */ /* Unchanged on exit. */ /* X - DOUBLE PRECISION array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. */ /* Unchanged on exit. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Y - DOUBLE PRECISION array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCY ) ). */ /* Before entry, the incremented array Y must contain the n */ /* element vector y. */ /* Unchanged on exit. */ /* INCY - INTEGER. */ /* On entry, INCY specifies the increment for the elements of */ /* Y. INCY must not be zero. */ /* Unchanged on exit. */ /* AP - DOUBLE PRECISION array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular part of the symmetric matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */ /* and a( 2, 2 ) respectively, and so on. 
On exit, the array */ /* AP is overwritten by the upper triangular part of the */ /* updated matrix. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular part of the symmetric matrix */ /* packed sequentially, column by column, so that AP( 1 ) */ /* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */ /* and a( 3, 1 ) respectively, and so on. On exit, the array */ /* AP is overwritten by the lower triangular part of the */ /* updated matrix. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ --ap; --y; --x; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (*n < 0) { info = 2; } else if (*incx == 0) { info = 5; } else if (*incy == 0) { info = 7; } if (info != 0) { PASTEF770(xerbla)("DSPR2 ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0 || *alpha == 0.) { return 0; } /* Set up the start points in X and Y if the increments are not both */ /* unity. */ if (*incx != 1 || *incy != 1) { if (*incx > 0) { kx = 1; } else { kx = 1 - (*n - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (*n - 1) * *incy; } jx = kx; jy = ky; } /* Start the operations. In this version the elements of the array AP */ /* are accessed sequentially with one pass through AP. */ kk = 1; if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Form A when upper triangle is stored in AP. */ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[j] != 0. || y[j] != 0.) 
{ temp1 = *alpha * y[j]; temp2 = *alpha * x[j]; k = kk; i__2 = j; for (i__ = 1; i__ <= i__2; ++i__) { ap[k] = ap[k] + x[i__] * temp1 + y[i__] * temp2; ++k; /* L10: */ } } kk += j; /* L20: */ } } else { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0. || y[jy] != 0.) { temp1 = *alpha * y[jy]; temp2 = *alpha * x[jx]; ix = kx; iy = ky; i__2 = kk + j - 1; for (k = kk; k <= i__2; ++k) { ap[k] = ap[k] + x[ix] * temp1 + y[iy] * temp2; ix += *incx; iy += *incy; /* L30: */ } } jx += *incx; jy += *incy; kk += j; /* L40: */ } } } else { /* Form A when lower triangle is stored in AP. */ if (*incx == 1 && *incy == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[j] != 0. || y[j] != 0.) { temp1 = *alpha * y[j]; temp2 = *alpha * x[j]; k = kk; i__2 = *n; for (i__ = j; i__ <= i__2; ++i__) { ap[k] = ap[k] + x[i__] * temp1 + y[i__] * temp2; ++k; /* L50: */ } } kk = kk + *n - j + 1; /* L60: */ } } else { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0. || y[jy] != 0.) { temp1 = *alpha * y[jy]; temp2 = *alpha * x[jx]; ix = jx; iy = jy; i__2 = kk + *n - j; for (k = kk; k <= i__2; ++k) { ap[k] = ap[k] + x[ix] * temp1 + y[iy] * temp2; ix += *incx; iy += *incy; /* L70: */ } } jx += *incx; jy += *incy; kk = kk + *n - j + 1; /* L80: */ } } } return 0; /* End of DSPR2 . */ } /* dspr2_ */

/* sspr2.f -- translated by f2c (version 19991025).
   You must link the resulting object file with the libraries:
	-lf2c -lm   (in that order)
*/

/*  SSPR2 -- single-precision symmetric packed rank-2 update.            */
/*                                                                       */
/*     A := alpha*x*y' + alpha*y*x' + A,                                 */
/*                                                                       */
/*  where alpha is a scalar, x and y are n element vectors and A is an   */
/*  n by n symmetric matrix supplied in packed form.                     */
/*                                                                       */
/*  Arguments (all passed by reference, Fortran style):                  */
/*                                                                       */
/*  uplo  - 'U'/'u': AP holds the upper triangle of A;                   */
/*          'L'/'l': AP holds the lower triangle of A.                   */
/*  n     - order of A; must be at least zero.                           */
/*  alpha - the scalar alpha.                                            */
/*  x     - vector of at least ( 1 + ( n - 1 )*abs( incx ) ) elements.   */
/*  incx  - increment between elements of x; must not be zero.           */
/*  y     - vector of at least ( 1 + ( n - 1 )*abs( incy ) ) elements.   */
/*  incy  - increment between elements of y; must not be zero.           */
/*  ap    - array of at least ( n*( n + 1 ) )/2 elements holding the     */
/*          selected triangle packed sequentially, column by column.     */
/*          For 'U', AP(1), AP(2), AP(3) hold a(1,1), a(1,2), a(2,2);    */
/*          for 'L', AP(1), AP(2), AP(3) hold a(1,1), a(2,1), a(3,1).    */
/*          Overwritten with the same triangle of the updated matrix.    */
/*                                                                       */
/*  Returns 0 in every case.  An invalid argument is reported through    */
/*  xerbla with its 1-based position (1, 2, 5 or 7) and nothing is       */
/*  updated.                                                             */
/*                                                                       */
/*  Level 2 Blas routine.                                                */
/*  -- Written on 22-October-1986.                                       */
/*     Jack Dongarra, Argonne National Lab.                              */
/*     Jeremy Du Croz, Nag Central Office.                               */
/*     Sven Hammarling, Nag Central Office.                              */
/*     Richard Hanson, Sandia National Labs.                             */

/* Subroutine */ int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, const bla_real *y, const bla_integer *incy, bla_real *ap)
{
    bla_integer info = 0;

    bla_integer order;               /* value of *n once it is validated    */
    bla_integer col, row, pos;       /* column, row and AP cursor (1-based) */
    bla_integer col_start = 1;       /* AP index where column `col` begins  */

    bla_integer x_first = 0, y_first = 0;  /* first logical element of x, y */
    bla_integer x_col = 0, y_col = 0;      /* element of x, y for `col`     */
    bla_integer x_row, y_row;              /* element of x, y for the row   */

    bla_real    ax, ay;              /* alpha*x(col) and alpha*y(col)       */
    int         unit_stride;

    /* Shift the base pointers so the 1-based Fortran subscripts used
       below address the caller's arrays. */
    --ap;
    --y;
    --x;

    /* Validate the arguments; the first offending one determines info. */
    if ( !( PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) ||
            PASTEF770(lsame)(uplo, "L", (ftnlen)1, (ftnlen)1) ) )
    {
        info = 1;
    }
    else if ( *n < 0 )
    {
        info = 2;
    }
    else if ( *incx == 0 )
    {
        info = 5;
    }
    else if ( *incy == 0 )
    {
        info = 7;
    }

    if ( info != 0 )
    {
        PASTEF770(xerbla)("SSPR2 ", &info, (ftnlen)6);
        return 0;
    }

    /* Nothing to do for an empty matrix or a zero scalar. */
    order = *n;
    if ( order == 0 )
    {
        return 0;
    }
    if ( *alpha == 0.f )
    {
        return 0;
    }

    /* Start points in x and y are only needed when the increments are
       not both unity; a negative increment walks the vector backwards. */
    unit_stride = ( *incx == 1 && *incy == 1 );
    if ( !unit_stride )
    {
        x_first = ( *incx > 0 ) ? 1 : 1 - (order - 1) * *incx;
        y_first = ( *incy > 0 ) ? 1 : 1 - (order - 1) * *incy;
        x_col = x_first;
        y_col = y_first;
    }

    /* AP is traversed sequentially in a single pass. */
    if ( PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) )
    {
        /* Upper triangle: column `col` holds rows 1..col. */
        for ( col = 1; col <= order; ++col )
        {
            if ( unit_stride )
            {
                if ( x[col] != 0.f || y[col] != 0.f )
                {
                    ay = *alpha * y[col];
                    ax = *alpha * x[col];
                    pos = col_start;
                    for ( row = 1; row <= col; ++row )
                    {
                        ap[pos] = ap[pos] + x[row] * ay + y[row] * ax;
                        ++pos;
                    }
                }
            }
            else
            {
                if ( x[x_col] != 0.f || y[y_col] != 0.f )
                {
                    ay = *alpha * y[y_col];
                    ax = *alpha * x[x_col];
                    x_row = x_first;
                    y_row = y_first;
                    for ( pos = col_start; pos < col_start + col; ++pos )
                    {
                        ap[pos] = ap[pos] + x[x_row] * ay + y[y_row] * ax;
                        x_row += *incx;
                        y_row += *incy;
                    }
                }
                x_col += *incx;
                y_col += *incy;
            }
            col_start += col;
        }
    }
    else
    {
        /* Lower triangle: column `col` holds rows col..n. */
        for ( col = 1; col <= order; ++col )
        {
            if ( unit_stride )
            {
                if ( x[col] != 0.f || y[col] != 0.f )
                {
                    ay = *alpha * y[col];
                    ax = *alpha * x[col];
                    pos = col_start;
                    for ( row = col; row <= order; ++row )
                    {
                        ap[pos] = ap[pos] + x[row] * ay + y[row] * ax;
                        ++pos;
                    }
                }
            }
            else
            {
                if ( x[x_col] != 0.f || y[y_col] != 0.f )
                {
                    ay = *alpha * y[y_col];
                    ax = *alpha * x[x_col];
                    x_row = x_col;
                    y_row = y_col;
                    for ( pos = col_start; pos <= col_start + order - col; ++pos )
                    {
                        ap[pos] = ap[pos] + x[x_row] * ay + y[y_row] * ax;
                        x_row += *incx;
                        y_row += *incy;
                    }
                }
                x_col += *incx;
                y_col += *incy;
            }
            col_start = col_start + order - col + 1;
        }
    }

    return 0;

/*     End of SSPR2 . */

} /* sspr2_ */

#endif
cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_spr2.h000066400000000000000000000041341427272030600225440ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS /* Prototypes for the symmetric packed rank-2 update routines, declared only when BLIS_ENABLE_BLAS is defined: a bla_double variant (d) and a bla_real variant (s). All arguments are passed by pointer; ap is the only non-const argument and receives the updated packed matrix. */ BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, const bla_double *y, const bla_integer *incy, bla_double *ap); BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, const bla_real *y, const bla_integer *incy, bla_real *ap); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_tbmv.c000066400000000000000000002041231427272030600226210ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* ctbmv: triangular band matrix-vector product on bla_scomplex data, overwriting x in place. Always returns 0; an invalid argument is reported through xerbla with its 1-based position (1, 2, 3, 4, 5, 7 or 9) and x is left untouched. */ /* ctbmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(c,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx) { /* System generated locals */ bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; bla_scomplex q__1, q__2, q__3; /* Builtin functions */ //void bla_r_cnjg(bla_scomplex *, bla_scomplex *); /* Local variables */ bla_integer info; bla_scomplex temp; bla_integer i__, j, l; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kplus1, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical noconj, nounit; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. 
*/ /* Purpose */ /* ======= */ /* CTBMV performs one of the matrix-vector operations */ /* x := A*x, or x := A'*x, or x := conjg( A' )*x, */ /* where x is an n element vector and A is an n by n unit, or non-unit, */ /* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the operation to be performed as */ /* follows: */ /* TRANS = 'N' or 'n' x := A*x. */ /* TRANS = 'T' or 't' x := A'*x. */ /* TRANS = 'C' or 'c' x := conjg( A' )*x. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. */ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* K - INTEGER. */ /* On entry with UPLO = 'U' or 'u', K specifies the number of */ /* super-diagonals of the matrix A. */ /* On entry with UPLO = 'L' or 'l', K specifies the number of */ /* sub-diagonals of the matrix A. */ /* K must satisfy 0 .le. K. */ /* Unchanged on exit. */ /* A - COMPLEX array of DIMENSION ( LDA, n ). */ /* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ /* by n part of the array A must contain the upper triangular */ /* band part of the matrix of coefficients, supplied column by */ /* column, with the leading diagonal of the matrix in row */ /* ( k + 1 ) of the array, the first super-diagonal starting at */ /* position 2 in row k, and so on. The top left k by k triangle */ /* of the array A is not referenced. 
*/ /* The following program segment will transfer an upper */ /* triangular band matrix from conventional full matrix storage */ /* to band storage: */ /* DO 20, J = 1, N */ /* M = K + 1 - J */ /* DO 10, I = MAX( 1, J - K ), J */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ /* by n part of the array A must contain the lower triangular */ /* band part of the matrix of coefficients, supplied column by */ /* column, with the leading diagonal of the matrix in row 1 of */ /* the array, the first sub-diagonal starting at position 1 in */ /* row 2, and so on. The bottom right k by k triangle of the */ /* array A is not referenced. */ /* The following program segment will transfer a lower */ /* triangular band matrix from conventional full matrix storage */ /* to band storage: */ /* DO 20, J = 1, N */ /* M = 1 - J */ /* DO 10, I = J, MIN( N, J + K ) */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Note that when DIAG = 'U' or 'u' the elements of the array A */ /* corresponding to the diagonal elements of the matrix are not */ /* referenced, but are assumed to be unity. */ /* Unchanged on exit. */ /* LDA - INTEGER. */ /* On entry, LDA specifies the first dimension of A as declared */ /* in the calling (sub) program. LDA must be at least */ /* ( k + 1 ). */ /* Unchanged on exit. */ /* X - COMPLEX array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ /* transformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. 
*/ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ /* Shift the base pointers so that the 1-based Fortran subscripts used below ( a[i + j*a_dim1], x[i] ) address the caller's arrays. */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, ( ftnlen)1)) { info = 2; } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1)) { info = 3; } else if (*n < 0) { info = 4; } else if (*k < 0) { info = 5; } else if (*lda < *k + 1) { info = 7; } else if (*incx == 0) { info = 9; } if (info != 0) { PASTEF770(xerbla)("CTBMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } /* noconj is set for TRANS = 'T' and selects the transposed branches that use A as stored; otherwise those branches conjugate each element of A through bla_r_cnjg. nounit is set for DIAG = 'N' and makes the stored diagonal element take part in the product. */ noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1); nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of A are */ /* accessed sequentially with one pass through A. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form x := A*x. 
*/ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kplus1 = *k + 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) { i__2 = j; bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp ); l = kplus1 - j; /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__4 = j - 1; for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { i__2 = i__; i__3 = i__; i__5 = l + i__ + j * a_dim1; bli_csets( (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 ); bli_csets( (bli_creal(x[i__3]) + bli_creal(q__2)), (bli_cimag(x[i__3]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] ); /* L10: */ } if (nounit) { i__4 = j; i__2 = j; i__3 = kplus1 + j * a_dim1; bli_csets( (bli_creal(x[i__2]) * bli_creal(a[i__3]) - bli_cimag(x[i__2]) * bli_cimag(a[i__3])), (bli_creal(x[i__2]) * bli_cimag(a[i__3]) + bli_cimag(x[i__2]) * bli_creal(a[i__3])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__4] ); } } /* L20: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__4 = jx; if (bli_creal(x[i__4]) != 0.f || bli_cimag(x[i__4]) != 0.f) { i__4 = jx; bli_csets( (bli_creal(x[i__4])), (bli_cimag(x[i__4])), temp ); ix = kx; l = kplus1 - j; /* Computing MAX */ i__4 = 1, i__2 = j - *k; i__3 = j - 1; for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { i__4 = ix; i__2 = ix; i__5 = l + i__ + j * a_dim1; bli_csets( (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 ); bli_csets( (bli_creal(x[i__2]) + bli_creal(q__2)), (bli_cimag(x[i__2]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__4] ); ix += *incx; /* L30: */ } if (nounit) { i__3 = jx; i__4 = jx; i__2 = kplus1 + j * a_dim1; bli_csets( (bli_creal(x[i__4]) * 
bli_creal(a[i__2]) - bli_cimag(x[i__4]) * bli_cimag(a[i__2])), (bli_creal(x[i__4]) * bli_cimag(a[i__2]) + bli_cimag(x[i__4]) * bli_creal(a[i__2])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] ); } } jx += *incx; if (j > *k) { kx += *incx; } /* L40: */ } } } else { if (*incx == 1) { for (j = *n; j >= 1; --j) { i__1 = j; if (bli_creal(x[i__1]) != 0.f || bli_cimag(x[i__1]) != 0.f) { i__1 = j; bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp ); l = 1 - j; /* Computing MIN */ i__1 = *n, i__3 = j + *k; i__4 = j + 1; for (i__ = f2c_min(i__1,i__3); i__ >= i__4; --i__) { i__1 = i__; i__3 = i__; i__2 = l + i__ + j * a_dim1; bli_csets( (bli_creal(temp) * bli_creal(a[i__2]) - bli_cimag(temp) * bli_cimag(a[i__2])), (bli_creal(temp) * bli_cimag(a[i__2]) + bli_cimag(temp) * bli_creal(a[i__2])), q__2 ); bli_csets( (bli_creal(x[i__3]) + bli_creal(q__2)), (bli_cimag(x[i__3]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] ); /* L50: */ } if (nounit) { i__4 = j; i__1 = j; i__3 = j * a_dim1 + 1; bli_csets( (bli_creal(x[i__1]) * bli_creal(a[i__3]) - bli_cimag(x[i__1]) * bli_cimag(a[i__3])), (bli_creal(x[i__1]) * bli_cimag(a[i__3]) + bli_cimag(x[i__1]) * bli_creal(a[i__3])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__4] ); } } /* L60: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { i__4 = jx; if (bli_creal(x[i__4]) != 0.f || bli_cimag(x[i__4]) != 0.f) { i__4 = jx; bli_csets( (bli_creal(x[i__4])), (bli_cimag(x[i__4])), temp ); ix = kx; l = 1 - j; /* Computing MIN */ i__4 = *n, i__1 = j + *k; i__3 = j + 1; for (i__ = f2c_min(i__4,i__1); i__ >= i__3; --i__) { i__4 = ix; i__1 = ix; i__2 = l + i__ + j * a_dim1; bli_csets( (bli_creal(temp) * bli_creal(a[i__2]) - bli_cimag(temp) * bli_cimag(a[i__2])), (bli_creal(temp) * bli_cimag(a[i__2]) + bli_cimag(temp) * bli_creal(a[i__2])), q__2 ); bli_csets( (bli_creal(x[i__1]) + bli_creal(q__2)), (bli_cimag(x[i__1]) + bli_cimag(q__2)), q__1 
); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__4] ); ix -= *incx; /* L70: */ } if (nounit) { i__3 = jx; i__4 = jx; i__1 = j * a_dim1 + 1; bli_csets( (bli_creal(x[i__4]) * bli_creal(a[i__1]) - bli_cimag(x[i__4]) * bli_cimag(a[i__1])), (bli_creal(x[i__4]) * bli_cimag(a[i__1]) + bli_cimag(x[i__4]) * bli_creal(a[i__1])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] ); } } jx -= *incx; if (*n - j >= *k) { kx -= *incx; } /* L80: */ } } } } else { /* Form x := A'*x or x := conjg( A' )*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kplus1 = *k + 1; if (*incx == 1) { for (j = *n; j >= 1; --j) { i__3 = j; bli_csets( (bli_creal(x[i__3])), (bli_cimag(x[i__3])), temp ); l = kplus1 - j; if (noconj) { if (nounit) { i__3 = kplus1 + j * a_dim1; bli_csets( (bli_creal(temp) * bli_creal(a[i__3]) - bli_cimag(temp) * bli_cimag(a[i__3])), (bli_creal(temp) * bli_cimag(a[i__3]) + bli_cimag(temp) * bli_creal(a[i__3])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } /* Computing MAX */ i__4 = 1, i__1 = j - *k; i__3 = f2c_max(i__4,i__1); for (i__ = j - 1; i__ >= i__3; --i__) { i__4 = l + i__ + j * a_dim1; i__1 = i__; bli_csets( (bli_creal(a[i__4]) * bli_creal(x[i__1]) - bli_cimag(a[i__4]) * bli_cimag(x[i__1])), (bli_creal(a[i__4]) * bli_cimag(x[i__1]) + bli_cimag(a[i__4]) * bli_creal(x[i__1])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); /* L90: */ } } else { if (nounit) { bla_r_cnjg(&q__2, &a[kplus1 + j * a_dim1]); bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } /* Computing MAX */ i__4 = 1, i__1 = j - *k; i__3 = f2c_max(i__4,i__1); for (i__ = j - 1; i__ >= i__3; --i__) { bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); i__4 = i__; bli_csets( 
(bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); /* L100: */ } } i__3 = j; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__3] ); /* L110: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { i__3 = jx; bli_csets( (bli_creal(x[i__3])), (bli_cimag(x[i__3])), temp ); kx -= *incx; ix = kx; l = kplus1 - j; if (noconj) { if (nounit) { i__3 = kplus1 + j * a_dim1; bli_csets( (bli_creal(temp) * bli_creal(a[i__3]) - bli_cimag(temp) * bli_cimag(a[i__3])), (bli_creal(temp) * bli_cimag(a[i__3]) + bli_cimag(temp) * bli_creal(a[i__3])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } /* Computing MAX */ i__4 = 1, i__1 = j - *k; i__3 = f2c_max(i__4,i__1); for (i__ = j - 1; i__ >= i__3; --i__) { i__4 = l + i__ + j * a_dim1; i__1 = ix; bli_csets( (bli_creal(a[i__4]) * bli_creal(x[i__1]) - bli_cimag(a[i__4]) * bli_cimag(x[i__1])), (bli_creal(a[i__4]) * bli_cimag(x[i__1]) + bli_cimag(a[i__4]) * bli_creal(x[i__1])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ix -= *incx; /* L120: */ } } else { if (nounit) { bla_r_cnjg(&q__2, &a[kplus1 + j * a_dim1]); bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } /* Computing MAX */ i__4 = 1, i__1 = j - *k; i__3 = f2c_max(i__4,i__1); for (i__ = j - 1; i__ >= i__3; --i__) { bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); i__4 = ix; bli_csets( (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + 
bli_cimag(q__3) * bli_creal(x[i__4])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ix -= *incx; /* L130: */ } } i__3 = jx; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__3] ); jx -= *incx; /* L140: */ } } } else { if (*incx == 1) { i__3 = *n; for (j = 1; j <= i__3; ++j) { i__4 = j; bli_csets( (bli_creal(x[i__4])), (bli_cimag(x[i__4])), temp ); l = 1 - j; if (noconj) { if (nounit) { i__4 = j * a_dim1 + 1; bli_csets( (bli_creal(temp) * bli_creal(a[i__4]) - bli_cimag(temp) * bli_cimag(a[i__4])), (bli_creal(temp) * bli_cimag(a[i__4]) + bli_cimag(temp) * bli_creal(a[i__4])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } /* Computing MIN */ i__1 = *n, i__2 = j + *k; i__4 = f2c_min(i__1,i__2); for (i__ = j + 1; i__ <= i__4; ++i__) { i__1 = l + i__ + j * a_dim1; i__2 = i__; bli_csets( (bli_creal(a[i__1]) * bli_creal(x[i__2]) - bli_cimag(a[i__1]) * bli_cimag(x[i__2])), (bli_creal(a[i__1]) * bli_cimag(x[i__2]) + bli_cimag(a[i__1]) * bli_creal(x[i__2])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); /* L150: */ } } else { if (nounit) { bla_r_cnjg(&q__2, &a[j * a_dim1 + 1]); bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } /* Computing MIN */ i__1 = *n, i__2 = j + *k; i__4 = f2c_min(i__1,i__2); for (i__ = j + 1; i__ <= i__4; ++i__) { bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); i__1 = i__; bli_csets( (bli_creal(q__3) * bli_creal(x[i__1]) - bli_cimag(q__3) * bli_cimag(x[i__1])), (bli_creal(q__3) * bli_cimag(x[i__1]) + bli_cimag(q__3) * bli_creal(x[i__1])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); 
bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); /* L160: */ } } i__4 = j; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__4] ); /* L170: */ } } else { jx = kx; i__3 = *n; for (j = 1; j <= i__3; ++j) { i__4 = jx; bli_csets( (bli_creal(x[i__4])), (bli_cimag(x[i__4])), temp ); kx += *incx; ix = kx; l = 1 - j; if (noconj) { if (nounit) { i__4 = j * a_dim1 + 1; bli_csets( (bli_creal(temp) * bli_creal(a[i__4]) - bli_cimag(temp) * bli_cimag(a[i__4])), (bli_creal(temp) * bli_cimag(a[i__4]) + bli_cimag(temp) * bli_creal(a[i__4])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } /* Computing MIN */ i__1 = *n, i__2 = j + *k; i__4 = f2c_min(i__1,i__2); for (i__ = j + 1; i__ <= i__4; ++i__) { i__1 = l + i__ + j * a_dim1; i__2 = ix; bli_csets( (bli_creal(a[i__1]) * bli_creal(x[i__2]) - bli_cimag(a[i__1]) * bli_cimag(x[i__2])), (bli_creal(a[i__1]) * bli_cimag(x[i__2]) + bli_cimag(a[i__1]) * bli_creal(x[i__2])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ix += *incx; /* L180: */ } } else { if (nounit) { bla_r_cnjg(&q__2, &a[j * a_dim1 + 1]); bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } /* Computing MIN */ i__1 = *n, i__2 = j + *k; i__4 = f2c_min(i__1,i__2); for (i__ = j + 1; i__ <= i__4; ++i__) { bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); i__1 = ix; bli_csets( (bli_creal(q__3) * bli_creal(x[i__1]) - bli_cimag(q__3) * bli_cimag(x[i__1])), (bli_creal(q__3) * bli_cimag(x[i__1]) + bli_cimag(q__3) * bli_creal(x[i__1])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ix += *incx; /* L190: */ } } i__4 = jx; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), 
x[i__4] ); jx += *incx; /* L200: */ } } } } return 0; /* End of CTBMV . */ } /* ctbmv_ */ /* dtbmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(d,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx) { /* System generated locals */ bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4; /* Local variables */ bla_integer info; bla_double temp; bla_integer i__, j, l; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kplus1, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical nounit; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* DTBMV performs one of the matrix-vector operations */ /* x := A*x, or x := A'*x, */ /* where x is an n element vector and A is an n by n unit, or non-unit, */ /* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the operation to be performed as */ /* follows: */ /* TRANS = 'N' or 'n' x := A*x. */ /* TRANS = 'T' or 't' x := A'*x. */ /* TRANS = 'C' or 'c' x := A'*x. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. */ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. 
*/ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* K - INTEGER. */ /* On entry with UPLO = 'U' or 'u', K specifies the number of */ /* super-diagonals of the matrix A. */ /* On entry with UPLO = 'L' or 'l', K specifies the number of */ /* sub-diagonals of the matrix A. */ /* K must satisfy 0 .le. K. */ /* Unchanged on exit. */ /* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */ /* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ /* by n part of the array A must contain the upper triangular */ /* band part of the matrix of coefficients, supplied column by */ /* column, with the leading diagonal of the matrix in row */ /* ( k + 1 ) of the array, the first super-diagonal starting at */ /* position 2 in row k, and so on. The top left k by k triangle */ /* of the array A is not referenced. */ /* The following program segment will transfer an upper */ /* triangular band matrix from conventional full matrix storage */ /* to band storage: */ /* DO 20, J = 1, N */ /* M = K + 1 - J */ /* DO 10, I = MAX( 1, J - K ), J */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ /* by n part of the array A must contain the lower triangular */ /* band part of the matrix of coefficients, supplied column by */ /* column, with the leading diagonal of the matrix in row 1 of */ /* the array, the first sub-diagonal starting at position 1 in */ /* row 2, and so on. The bottom right k by k triangle of the */ /* array A is not referenced. 
*/ /* The following program segment will transfer a lower */ /* triangular band matrix from conventional full matrix storage */ /* to band storage: */ /* DO 20, J = 1, N */ /* M = 1 - J */ /* DO 10, I = J, MIN( N, J + K ) */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Note that when DIAG = 'U' or 'u' the elements of the array A */ /* corresponding to the diagonal elements of the matrix are not */ /* referenced, but are assumed to be unity. */ /* Unchanged on exit. */ /* LDA - INTEGER. */ /* On entry, LDA specifies the first dimension of A as declared */ /* in the calling (sub) program. LDA must be at least */ /* ( k + 1 ). */ /* Unchanged on exit. */ /* X - DOUBLE PRECISION array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ /* tranformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, ( ftnlen)1)) { info = 2; } else if (! 
PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1)) { info = 3; } else if (*n < 0) { info = 4; } else if (*k < 0) { info = 5; } else if (*lda < *k + 1) { info = 7; } else if (*incx == 0) { info = 9; } if (info != 0) { PASTEF770(xerbla)("DTBMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of A are */ /* accessed sequentially with one pass through A. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form x := A*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kplus1 = *k + 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[j] != 0.) { temp = x[j]; l = kplus1 - j; /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__4 = j - 1; for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { x[i__] += temp * a[l + i__ + j * a_dim1]; /* L10: */ } if (nounit) { x[j] *= a[kplus1 + j * a_dim1]; } } /* L20: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.) { temp = x[jx]; ix = kx; l = kplus1 - j; /* Computing MAX */ i__4 = 1, i__2 = j - *k; i__3 = j - 1; for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { x[ix] += temp * a[l + i__ + j * a_dim1]; ix += *incx; /* L30: */ } if (nounit) { x[jx] *= a[kplus1 + j * a_dim1]; } } jx += *incx; if (j > *k) { kx += *incx; } /* L40: */ } } } else { if (*incx == 1) { for (j = *n; j >= 1; --j) { if (x[j] != 0.) 
{ temp = x[j]; l = 1 - j; /* Computing MIN */ i__1 = *n, i__3 = j + *k; i__4 = j + 1; for (i__ = f2c_min(i__1,i__3); i__ >= i__4; --i__) { x[i__] += temp * a[l + i__ + j * a_dim1]; /* L50: */ } if (nounit) { x[j] *= a[j * a_dim1 + 1]; } } /* L60: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { if (x[jx] != 0.) { temp = x[jx]; ix = kx; l = 1 - j; /* Computing MIN */ i__4 = *n, i__1 = j + *k; i__3 = j + 1; for (i__ = f2c_min(i__4,i__1); i__ >= i__3; --i__) { x[ix] += temp * a[l + i__ + j * a_dim1]; ix -= *incx; /* L70: */ } if (nounit) { x[jx] *= a[j * a_dim1 + 1]; } } jx -= *incx; if (*n - j >= *k) { kx -= *incx; } /* L80: */ } } } } else { /* Form x := A'*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kplus1 = *k + 1; if (*incx == 1) { for (j = *n; j >= 1; --j) { temp = x[j]; l = kplus1 - j; if (nounit) { temp *= a[kplus1 + j * a_dim1]; } /* Computing MAX */ i__4 = 1, i__1 = j - *k; i__3 = f2c_max(i__4,i__1); for (i__ = j - 1; i__ >= i__3; --i__) { temp += a[l + i__ + j * a_dim1] * x[i__]; /* L90: */ } x[j] = temp; /* L100: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { temp = x[jx]; kx -= *incx; ix = kx; l = kplus1 - j; if (nounit) { temp *= a[kplus1 + j * a_dim1]; } /* Computing MAX */ i__4 = 1, i__1 = j - *k; i__3 = f2c_max(i__4,i__1); for (i__ = j - 1; i__ >= i__3; --i__) { temp += a[l + i__ + j * a_dim1] * x[ix]; ix -= *incx; /* L110: */ } x[jx] = temp; jx -= *incx; /* L120: */ } } } else { if (*incx == 1) { i__3 = *n; for (j = 1; j <= i__3; ++j) { temp = x[j]; l = 1 - j; if (nounit) { temp *= a[j * a_dim1 + 1]; } /* Computing MIN */ i__1 = *n, i__2 = j + *k; i__4 = f2c_min(i__1,i__2); for (i__ = j + 1; i__ <= i__4; ++i__) { temp += a[l + i__ + j * a_dim1] * x[i__]; /* L130: */ } x[j] = temp; /* L140: */ } } else { jx = kx; i__3 = *n; for (j = 1; j <= i__3; ++j) { temp = x[jx]; kx += *incx; ix = kx; l = 1 - j; if (nounit) { temp *= a[j * a_dim1 + 1]; } /* Computing MIN */ i__1 = *n, i__2 = 
j + *k; i__4 = f2c_min(i__1,i__2); for (i__ = j + 1; i__ <= i__4; ++i__) { temp += a[l + i__ + j * a_dim1] * x[ix]; ix += *incx; /* L150: */ } x[jx] = temp; jx += *incx; /* L160: */ } } } } return 0; /* End of DTBMV . */ } /* dtbmv_ */ /* stbmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(s,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx) { /* System generated locals */ bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4; /* Local variables */ bla_integer info; bla_real temp; bla_integer i__, j, l; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kplus1, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical nounit; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* STBMV performs one of the matrix-vector operations */ /* x := A*x, or x := A'*x, */ /* where x is an n element vector and A is an n by n unit, or non-unit, */ /* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the operation to be performed as */ /* follows: */ /* TRANS = 'N' or 'n' x := A*x. */ /* TRANS = 'T' or 't' x := A'*x. */ /* TRANS = 'C' or 'c' x := A'*x. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. 
*/ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* K - INTEGER. */ /* On entry with UPLO = 'U' or 'u', K specifies the number of */ /* super-diagonals of the matrix A. */ /* On entry with UPLO = 'L' or 'l', K specifies the number of */ /* sub-diagonals of the matrix A. */ /* K must satisfy 0 .le. K. */ /* Unchanged on exit. */ /* A - REAL array of DIMENSION ( LDA, n ). */ /* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ /* by n part of the array A must contain the upper triangular */ /* band part of the matrix of coefficients, supplied column by */ /* column, with the leading diagonal of the matrix in row */ /* ( k + 1 ) of the array, the first super-diagonal starting at */ /* position 2 in row k, and so on. The top left k by k triangle */ /* of the array A is not referenced. */ /* The following program segment will transfer an upper */ /* triangular band matrix from conventional full matrix storage */ /* to band storage: */ /* DO 20, J = 1, N */ /* M = K + 1 - J */ /* DO 10, I = MAX( 1, J - K ), J */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ /* by n part of the array A must contain the lower triangular */ /* band part of the matrix of coefficients, supplied column by */ /* column, with the leading diagonal of the matrix in row 1 of */ /* the array, the first sub-diagonal starting at position 1 in */ /* row 2, and so on. The bottom right k by k triangle of the */ /* array A is not referenced. 
*/ /* The following program segment will transfer a lower */ /* triangular band matrix from conventional full matrix storage */ /* to band storage: */ /* DO 20, J = 1, N */ /* M = 1 - J */ /* DO 10, I = J, MIN( N, J + K ) */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Note that when DIAG = 'U' or 'u' the elements of the array A */ /* corresponding to the diagonal elements of the matrix are not */ /* referenced, but are assumed to be unity. */ /* Unchanged on exit. */ /* LDA - INTEGER. */ /* On entry, LDA specifies the first dimension of A as declared */ /* in the calling (sub) program. LDA must be at least */ /* ( k + 1 ). */ /* Unchanged on exit. */ /* X - REAL array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ /* tranformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, ( ftnlen)1)) { info = 2; } else if (! 
PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1)) { info = 3; } else if (*n < 0) { info = 4; } else if (*k < 0) { info = 5; } else if (*lda < *k + 1) { info = 7; } else if (*incx == 0) { info = 9; } if (info != 0) { PASTEF770(xerbla)("STBMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of A are */ /* accessed sequentially with one pass through A. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form x := A*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kplus1 = *k + 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[j] != 0.f) { temp = x[j]; l = kplus1 - j; /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__4 = j - 1; for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { x[i__] += temp * a[l + i__ + j * a_dim1]; /* L10: */ } if (nounit) { x[j] *= a[kplus1 + j * a_dim1]; } } /* L20: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.f) { temp = x[jx]; ix = kx; l = kplus1 - j; /* Computing MAX */ i__4 = 1, i__2 = j - *k; i__3 = j - 1; for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { x[ix] += temp * a[l + i__ + j * a_dim1]; ix += *incx; /* L30: */ } if (nounit) { x[jx] *= a[kplus1 + j * a_dim1]; } } jx += *incx; if (j > *k) { kx += *incx; } /* L40: */ } } } else { if (*incx == 1) { for (j = *n; j >= 1; --j) { if (x[j] != 0.f) { temp = x[j]; l = 1 - j; /* Computing MIN */ i__1 = *n, i__3 = j + *k; i__4 = j + 1; for (i__ = f2c_min(i__1,i__3); i__ >= i__4; --i__) { x[i__] += temp * a[l + i__ + j * a_dim1]; /* L50: */ } if (nounit) { x[j] *= a[j * a_dim1 + 1]; } } /* L60: */ } 
} else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { if (x[jx] != 0.f) { temp = x[jx]; ix = kx; l = 1 - j; /* Computing MIN */ i__4 = *n, i__1 = j + *k; i__3 = j + 1; for (i__ = f2c_min(i__4,i__1); i__ >= i__3; --i__) { x[ix] += temp * a[l + i__ + j * a_dim1]; ix -= *incx; /* L70: */ } if (nounit) { x[jx] *= a[j * a_dim1 + 1]; } } jx -= *incx; if (*n - j >= *k) { kx -= *incx; } /* L80: */ } } } } else { /* Form x := A'*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kplus1 = *k + 1; if (*incx == 1) { for (j = *n; j >= 1; --j) { temp = x[j]; l = kplus1 - j; if (nounit) { temp *= a[kplus1 + j * a_dim1]; } /* Computing MAX */ i__4 = 1, i__1 = j - *k; i__3 = f2c_max(i__4,i__1); for (i__ = j - 1; i__ >= i__3; --i__) { temp += a[l + i__ + j * a_dim1] * x[i__]; /* L90: */ } x[j] = temp; /* L100: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { temp = x[jx]; kx -= *incx; ix = kx; l = kplus1 - j; if (nounit) { temp *= a[kplus1 + j * a_dim1]; } /* Computing MAX */ i__4 = 1, i__1 = j - *k; i__3 = f2c_max(i__4,i__1); for (i__ = j - 1; i__ >= i__3; --i__) { temp += a[l + i__ + j * a_dim1] * x[ix]; ix -= *incx; /* L110: */ } x[jx] = temp; jx -= *incx; /* L120: */ } } } else { if (*incx == 1) { i__3 = *n; for (j = 1; j <= i__3; ++j) { temp = x[j]; l = 1 - j; if (nounit) { temp *= a[j * a_dim1 + 1]; } /* Computing MIN */ i__1 = *n, i__2 = j + *k; i__4 = f2c_min(i__1,i__2); for (i__ = j + 1; i__ <= i__4; ++i__) { temp += a[l + i__ + j * a_dim1] * x[i__]; /* L130: */ } x[j] = temp; /* L140: */ } } else { jx = kx; i__3 = *n; for (j = 1; j <= i__3; ++j) { temp = x[jx]; kx += *incx; ix = kx; l = 1 - j; if (nounit) { temp *= a[j * a_dim1 + 1]; } /* Computing MIN */ i__1 = *n, i__2 = j + *k; i__4 = f2c_min(i__1,i__2); for (i__ = j + 1; i__ <= i__4; ++i__) { temp += a[l + i__ + j * a_dim1] * x[ix]; ix += *incx; /* L150: */ } x[jx] = temp; jx += *incx; /* L160: */ } } } } return 0; /* End of STBMV . 
*/ } /* stbmv_ */ /* ztbmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(z,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx) { /* System generated locals */ bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; bla_dcomplex z__1, z__2, z__3; /* Builtin functions */ //void bla_d_cnjg(bla_dcomplex *, bla_dcomplex *); /* Local variables */ bla_integer info; bla_dcomplex temp; bla_integer i__, j, l; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kplus1, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical noconj, nounit; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* ZTBMV performs one of the matrix-vector operations */ /* x := A*x, or x := A'*x, or x := conjg( A' )*x, */ /* where x is an n element vector and A is an n by n unit, or non-unit, */ /* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the operation to be performed as */ /* follows: */ /* TRANS = 'N' or 'n' x := A*x. */ /* TRANS = 'T' or 't' x := A'*x. */ /* TRANS = 'C' or 'c' x := conjg( A' )*x. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. 
*/ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* K - INTEGER. */ /* On entry with UPLO = 'U' or 'u', K specifies the number of */ /* super-diagonals of the matrix A. */ /* On entry with UPLO = 'L' or 'l', K specifies the number of */ /* sub-diagonals of the matrix A. */ /* K must satisfy 0 .le. K. */ /* Unchanged on exit. */ /* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */ /* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ /* by n part of the array A must contain the upper triangular */ /* band part of the matrix of coefficients, supplied column by */ /* column, with the leading diagonal of the matrix in row */ /* ( k + 1 ) of the array, the first super-diagonal starting at */ /* position 2 in row k, and so on. The top left k by k triangle */ /* of the array A is not referenced. */ /* The following program segment will transfer an upper */ /* triangular band matrix from conventional full matrix storage */ /* to band storage: */ /* DO 20, J = 1, N */ /* M = K + 1 - J */ /* DO 10, I = MAX( 1, J - K ), J */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ /* by n part of the array A must contain the lower triangular */ /* band part of the matrix of coefficients, supplied column by */ /* column, with the leading diagonal of the matrix in row 1 of */ /* the array, the first sub-diagonal starting at position 1 in */ /* row 2, and so on. The bottom right k by k triangle of the */ /* array A is not referenced. 
*/ /* The following program segment will transfer a lower */ /* triangular band matrix from conventional full matrix storage */ /* to band storage: */ /* DO 20, J = 1, N */ /* M = 1 - J */ /* DO 10, I = J, MIN( N, J + K ) */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Note that when DIAG = 'U' or 'u' the elements of the array A */ /* corresponding to the diagonal elements of the matrix are not */ /* referenced, but are assumed to be unity. */ /* Unchanged on exit. */ /* LDA - INTEGER. */ /* On entry, LDA specifies the first dimension of A as declared */ /* in the calling (sub) program. LDA must be at least */ /* ( k + 1 ). */ /* Unchanged on exit. */ /* X - COMPLEX*16 array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ /* tranformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, ( ftnlen)1)) { info = 2; } else if (! 
PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1)) { info = 3; } else if (*n < 0) { info = 4; } else if (*k < 0) { info = 5; } else if (*lda < *k + 1) { info = 7; } else if (*incx == 0) { info = 9; } if (info != 0) { PASTEF770(xerbla)("ZTBMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1); nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of A are */ /* accessed sequentially with one pass through A. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form x := A*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kplus1 = *k + 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) 
{ i__2 = j; bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp ); l = kplus1 - j; /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__4 = j - 1; for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { i__2 = i__; i__3 = i__; i__5 = l + i__ + j * a_dim1; bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 ); bli_zsets( (bli_zreal(x[i__3]) + bli_zreal(z__2)), (bli_zimag(x[i__3]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] ); /* L10: */ } if (nounit) { i__4 = j; i__2 = j; i__3 = kplus1 + j * a_dim1; bli_zsets( (bli_zreal(x[i__2]) * bli_zreal(a[i__3]) - bli_zimag(x[i__2]) * bli_zimag(a[i__3])), (bli_zreal(x[i__2]) * bli_zimag(a[i__3]) + bli_zimag(x[i__2]) * bli_zreal(a[i__3])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__4] ); } } /* L20: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__4 = jx; if (bli_zreal(x[i__4]) != 0. || bli_zimag(x[i__4]) != 0.) 
{ i__4 = jx; bli_zsets( (bli_zreal(x[i__4])), (bli_zimag(x[i__4])), temp ); ix = kx; l = kplus1 - j; /* Computing MAX */ i__4 = 1, i__2 = j - *k; i__3 = j - 1; for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { i__4 = ix; i__2 = ix; i__5 = l + i__ + j * a_dim1; bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 ); bli_zsets( (bli_zreal(x[i__2]) + bli_zreal(z__2)), (bli_zimag(x[i__2]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__4] ); ix += *incx; /* L30: */ } if (nounit) { i__3 = jx; i__4 = jx; i__2 = kplus1 + j * a_dim1; bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(a[i__2]) - bli_zimag(x[i__4]) * bli_zimag(a[i__2])), (bli_zreal(x[i__4]) * bli_zimag(a[i__2]) + bli_zimag(x[i__4]) * bli_zreal(a[i__2])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] ); } } jx += *incx; if (j > *k) { kx += *incx; } /* L40: */ } } } else { if (*incx == 1) { for (j = *n; j >= 1; --j) { i__1 = j; if (bli_zreal(x[i__1]) != 0. || bli_zimag(x[i__1]) != 0.) 
{ i__1 = j; bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp ); l = 1 - j; /* Computing MIN */ i__1 = *n, i__3 = j + *k; i__4 = j + 1; for (i__ = f2c_min(i__1,i__3); i__ >= i__4; --i__) { i__1 = i__; i__3 = i__; i__2 = l + i__ + j * a_dim1; bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__2]) - bli_zimag(temp) * bli_zimag(a[i__2])), (bli_zreal(temp) * bli_zimag(a[i__2]) + bli_zimag(temp) * bli_zreal(a[i__2])), z__2 ); bli_zsets( (bli_zreal(x[i__3]) + bli_zreal(z__2)), (bli_zimag(x[i__3]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] ); /* L50: */ } if (nounit) { i__4 = j; i__1 = j; i__3 = j * a_dim1 + 1; bli_zsets( (bli_zreal(x[i__1]) * bli_zreal(a[i__3]) - bli_zimag(x[i__1]) * bli_zimag(a[i__3])), (bli_zreal(x[i__1]) * bli_zimag(a[i__3]) + bli_zimag(x[i__1]) * bli_zreal(a[i__3])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__4] ); } } /* L60: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { i__4 = jx; if (bli_zreal(x[i__4]) != 0. || bli_zimag(x[i__4]) != 0.) 
{ i__4 = jx; bli_zsets( (bli_zreal(x[i__4])), (bli_zimag(x[i__4])), temp ); ix = kx; l = 1 - j; /* Computing MIN */ i__4 = *n, i__1 = j + *k; i__3 = j + 1; for (i__ = f2c_min(i__4,i__1); i__ >= i__3; --i__) { i__4 = ix; i__1 = ix; i__2 = l + i__ + j * a_dim1; bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__2]) - bli_zimag(temp) * bli_zimag(a[i__2])), (bli_zreal(temp) * bli_zimag(a[i__2]) + bli_zimag(temp) * bli_zreal(a[i__2])), z__2 ); bli_zsets( (bli_zreal(x[i__1]) + bli_zreal(z__2)), (bli_zimag(x[i__1]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__4] ); ix -= *incx; /* L70: */ } if (nounit) { i__3 = jx; i__4 = jx; i__1 = j * a_dim1 + 1; bli_zsets( (bli_zreal(x[i__4]) * bli_zreal(a[i__1]) - bli_zimag(x[i__4]) * bli_zimag(a[i__1])), (bli_zreal(x[i__4]) * bli_zimag(a[i__1]) + bli_zimag(x[i__4]) * bli_zreal(a[i__1])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] ); } } jx -= *incx; if (*n - j >= *k) { kx -= *incx; } /* L80: */ } } } } else { /* Form x := A'*x or x := conjg( A' )*x. 
*/ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kplus1 = *k + 1; if (*incx == 1) { for (j = *n; j >= 1; --j) { i__3 = j; bli_zsets( (bli_zreal(x[i__3])), (bli_zimag(x[i__3])), temp ); l = kplus1 - j; if (noconj) { if (nounit) { i__3 = kplus1 + j * a_dim1; bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__3]) - bli_zimag(temp) * bli_zimag(a[i__3])), (bli_zreal(temp) * bli_zimag(a[i__3]) + bli_zimag(temp) * bli_zreal(a[i__3])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } /* Computing MAX */ i__4 = 1, i__1 = j - *k; i__3 = f2c_max(i__4,i__1); for (i__ = j - 1; i__ >= i__3; --i__) { i__4 = l + i__ + j * a_dim1; i__1 = i__; bli_zsets( (bli_zreal(a[i__4]) * bli_zreal(x[i__1]) - bli_zimag(a[i__4]) * bli_zimag(x[i__1])), (bli_zreal(a[i__4]) * bli_zimag(x[i__1]) + bli_zimag(a[i__4]) * bli_zreal(x[i__1])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); /* L90: */ } } else { if (nounit) { bla_d_cnjg(&z__2, &a[kplus1 + j * a_dim1]); bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } /* Computing MAX */ i__4 = 1, i__1 = j - *k; i__3 = f2c_max(i__4,i__1); for (i__ = j - 1; i__ >= i__3; --i__) { bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); i__4 = i__; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); /* L100: */ } } i__3 = j; bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__3] ); /* L110: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { i__3 = jx; bli_zsets( 
(bli_zreal(x[i__3])), (bli_zimag(x[i__3])), temp ); kx -= *incx; ix = kx; l = kplus1 - j; if (noconj) { if (nounit) { i__3 = kplus1 + j * a_dim1; bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__3]) - bli_zimag(temp) * bli_zimag(a[i__3])), (bli_zreal(temp) * bli_zimag(a[i__3]) + bli_zimag(temp) * bli_zreal(a[i__3])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } /* Computing MAX */ i__4 = 1, i__1 = j - *k; i__3 = f2c_max(i__4,i__1); for (i__ = j - 1; i__ >= i__3; --i__) { i__4 = l + i__ + j * a_dim1; i__1 = ix; bli_zsets( (bli_zreal(a[i__4]) * bli_zreal(x[i__1]) - bli_zimag(a[i__4]) * bli_zimag(x[i__1])), (bli_zreal(a[i__4]) * bli_zimag(x[i__1]) + bli_zimag(a[i__4]) * bli_zreal(x[i__1])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ix -= *incx; /* L120: */ } } else { if (nounit) { bla_d_cnjg(&z__2, &a[kplus1 + j * a_dim1]); bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } /* Computing MAX */ i__4 = 1, i__1 = j - *k; i__3 = f2c_max(i__4,i__1); for (i__ = j - 1; i__ >= i__3; --i__) { bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); i__4 = ix; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), (bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ix -= *incx; /* L130: */ } } i__3 = jx; bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__3] ); jx -= *incx; /* L140: */ } } } else { if (*incx == 1) { i__3 = *n; for (j = 1; j <= i__3; ++j) { i__4 = j; bli_zsets( (bli_zreal(x[i__4])), (bli_zimag(x[i__4])), temp ); l = 1 - j; if (noconj) { if (nounit) { i__4 
= j * a_dim1 + 1; bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__4]) - bli_zimag(temp) * bli_zimag(a[i__4])), (bli_zreal(temp) * bli_zimag(a[i__4]) + bli_zimag(temp) * bli_zreal(a[i__4])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } /* Computing MIN */ i__1 = *n, i__2 = j + *k; i__4 = f2c_min(i__1,i__2); for (i__ = j + 1; i__ <= i__4; ++i__) { i__1 = l + i__ + j * a_dim1; i__2 = i__; bli_zsets( (bli_zreal(a[i__1]) * bli_zreal(x[i__2]) - bli_zimag(a[i__1]) * bli_zimag(x[i__2])), (bli_zreal(a[i__1]) * bli_zimag(x[i__2]) + bli_zimag(a[i__1]) * bli_zreal(x[i__2])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); /* L150: */ } } else { if (nounit) { bla_d_cnjg(&z__2, &a[j * a_dim1 + 1]); bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } /* Computing MIN */ i__1 = *n, i__2 = j + *k; i__4 = f2c_min(i__1,i__2); for (i__ = j + 1; i__ <= i__4; ++i__) { bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); i__1 = i__; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__1]) - bli_zimag(z__3) * bli_zimag(x[i__1])), (bli_zreal(z__3) * bli_zimag(x[i__1]) + bli_zimag(z__3) * bli_zreal(x[i__1])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); /* L160: */ } } i__4 = j; bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__4] ); /* L170: */ } } else { jx = kx; i__3 = *n; for (j = 1; j <= i__3; ++j) { i__4 = jx; bli_zsets( (bli_zreal(x[i__4])), (bli_zimag(x[i__4])), temp ); kx += *incx; ix = kx; l = 1 - j; if (noconj) { if (nounit) { i__4 = j * a_dim1 + 1; bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__4]) - bli_zimag(temp) * bli_zimag(a[i__4])), (bli_zreal(temp) * bli_zimag(a[i__4]) + 
bli_zimag(temp) * bli_zreal(a[i__4])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } /* Computing MIN */ i__1 = *n, i__2 = j + *k; i__4 = f2c_min(i__1,i__2); for (i__ = j + 1; i__ <= i__4; ++i__) { i__1 = l + i__ + j * a_dim1; i__2 = ix; bli_zsets( (bli_zreal(a[i__1]) * bli_zreal(x[i__2]) - bli_zimag(a[i__1]) * bli_zimag(x[i__2])), (bli_zreal(a[i__1]) * bli_zimag(x[i__2]) + bli_zimag(a[i__1]) * bli_zreal(x[i__2])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ix += *incx; /* L180: */ } } else { if (nounit) { bla_d_cnjg(&z__2, &a[j * a_dim1 + 1]); bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } /* Computing MIN */ i__1 = *n, i__2 = j + *k; i__4 = f2c_min(i__1,i__2); for (i__ = j + 1; i__ <= i__4; ++i__) { bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); i__1 = ix; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__1]) - bli_zimag(z__3) * bli_zimag(x[i__1])), (bli_zreal(z__3) * bli_zimag(x[i__1]) + bli_zimag(z__3) * bli_zreal(x[i__1])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ix += *incx; /* L190: */ } } i__4 = jx; bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__4] ); jx += *incx; /* L200: */ } } } } return 0; /* End of ZTBMV . */ } /* ztbmv_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_tbmv.h000066400000000000000000000052261427272030600226310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx); BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx); BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx); BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_tbsv.c000066400000000000000000001761311427272030600226360ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* ctbsv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Summary: single-precision complex triangular band solve. The arguments are validated first; on the first invalid one the routine calls xerbla with the routine name and that argument's position and returns 0 without modifying x. When n is 0 it returns 0 immediately. Otherwise x is overwritten in place with the solution, selected by uplo (upper or lower band storage), trans (no transpose, transpose, or conjugate transpose) and diag (unit or non-unit diagonal). The return value is always 0. */ /* Subroutine */ int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx) { /* System generated locals */ bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; bla_scomplex q__1, q__2, q__3; /* Builtin functions */ //void bla_c_div(bla_scomplex *, bla_scomplex *, bla_scomplex *), bla_r_cnjg(bla_scomplex *, bla_scomplex *); /* Local variables */ bla_integer info; bla_scomplex temp; bla_integer i__, j, l; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kplus1, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical noconj, nounit; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. 
*/ /* Purpose */ /* ======= */ /* CTBSV solves one of the systems of equations */ /* A*x = b, or A'*x = b, or conjg( A' )*x = b, */ /* where b and x are n element vectors and A is an n by n unit, or */ /* non-unit, upper or lower triangular band matrix, with ( k + 1 ) */ /* diagonals. */ /* No test for singularity or near-singularity is included in this */ /* routine. Such tests must be performed before calling this routine. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the equations to be solved as */ /* follows: */ /* TRANS = 'N' or 'n' A*x = b. */ /* TRANS = 'T' or 't' A'*x = b. */ /* TRANS = 'C' or 'c' conjg( A' )*x = b. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. */ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* K - INTEGER. */ /* On entry with UPLO = 'U' or 'u', K specifies the number of */ /* super-diagonals of the matrix A. */ /* On entry with UPLO = 'L' or 'l', K specifies the number of */ /* sub-diagonals of the matrix A. */ /* K must satisfy 0 .le. K. */ /* Unchanged on exit. */ /* A - COMPLEX array of DIMENSION ( LDA, n ). 
*/ /* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ /* by n part of the array A must contain the upper triangular */ /* band part of the matrix of coefficients, supplied column by */ /* column, with the leading diagonal of the matrix in row */ /* ( k + 1 ) of the array, the first super-diagonal starting at */ /* position 2 in row k, and so on. The top left k by k triangle */ /* of the array A is not referenced. */ /* The following program segment will transfer an upper */ /* triangular band matrix from conventional full matrix storage */ /* to band storage: */ /* DO 20, J = 1, N */ /* M = K + 1 - J */ /* DO 10, I = MAX( 1, J - K ), J */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ /* by n part of the array A must contain the lower triangular */ /* band part of the matrix of coefficients, supplied column by */ /* column, with the leading diagonal of the matrix in row 1 of */ /* the array, the first sub-diagonal starting at position 1 in */ /* row 2, and so on. The bottom right k by k triangle of the */ /* array A is not referenced. */ /* The following program segment will transfer a lower */ /* triangular band matrix from conventional full matrix storage */ /* to band storage: */ /* DO 20, J = 1, N */ /* M = 1 - J */ /* DO 10, I = J, MIN( N, J + K ) */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Note that when DIAG = 'U' or 'u' the elements of the array A */ /* corresponding to the diagonal elements of the matrix are not */ /* referenced, but are assumed to be unity. */ /* Unchanged on exit. */ /* LDA - INTEGER. */ /* On entry, LDA specifies the first dimension of A as declared */ /* in the calling (sub) program. LDA must be at least */ /* ( k + 1 ). */ /* Unchanged on exit. */ /* X - COMPLEX array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). 
*/ /* Before entry, the incremented array X must contain the n */ /* element right-hand side vector b. On exit, X is overwritten */ /* with the solution vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments: a is moved back by a_offset (a_dim1 + 1) elements and x by one element, so the 1-based Fortran subscripts a[i + j * a_dim1] and x[i] used below address the caller's storage. */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; /* Function Body */ /* info is set to the 1-based position of the first invalid argument in the parameter list (lda is argument 7, incx is argument 9); 0 means every check passed. */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, ( ftnlen)1)) { info = 2; } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1)) { info = 3; } else if (*n < 0) { info = 4; } else if (*k < 0) { info = 5; } else if (*lda < *k + 1) { info = 7; } else if (*incx == 0) { info = 9; } if (info != 0) { PASTEF770(xerbla)("CTBSV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. 
*/ if (*n == 0) { return 0; } /* noconj is only consulted in the transposed branch below: true selects the plain transpose (TRANS = 'T'), false makes that branch conjugate each entry of A with bla_r_cnjg before use. */ noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1); nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ /* kx keeps its initializer 0 when incx == 1; the unit-stride branches below index x directly and never read kx. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. 
In this version the elements of A are */ /* accessed sequentially with one pass through A. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form x := inv( A )*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kplus1 = *k + 1; if (*incx == 1) { for (j = *n; j >= 1; --j) { i__1 = j; if (bli_creal(x[i__1]) != 0.f || bli_cimag(x[i__1]) != 0.f) { l = kplus1 - j; if (nounit) { i__1 = j; bla_c_div(&q__1, &x[j], &a[kplus1 + j * a_dim1]); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] ); } i__1 = j; bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp ); /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__1 = f2c_max(i__2,i__3); for (i__ = j - 1; i__ >= i__1; --i__) { i__2 = i__; i__3 = i__; i__4 = l + i__ + j * a_dim1; bli_csets( (bli_creal(temp) * bli_creal(a[i__4]) - bli_cimag(temp) * bli_cimag(a[i__4])), (bli_creal(temp) * bli_cimag(a[i__4]) + bli_cimag(temp) * bli_creal(a[i__4])), q__2 ); bli_csets( (bli_creal(x[i__3]) - bli_creal(q__2)), (bli_cimag(x[i__3]) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] ); /* L10: */ } } /* L20: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { kx -= *incx; i__1 = jx; if (bli_creal(x[i__1]) != 0.f || bli_cimag(x[i__1]) != 0.f) { ix = kx; l = kplus1 - j; if (nounit) { i__1 = jx; bla_c_div(&q__1, &x[jx], &a[kplus1 + j * a_dim1]); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] ); } i__1 = jx; bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp ); /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__1 = f2c_max(i__2,i__3); for (i__ = j - 1; i__ >= i__1; --i__) { i__2 = ix; i__3 = ix; i__4 = l + i__ + j * a_dim1; bli_csets( (bli_creal(temp) * bli_creal(a[i__4]) - bli_cimag(temp) * bli_cimag(a[i__4])), (bli_creal(temp) * bli_cimag(a[i__4]) + bli_cimag(temp) * bli_creal(a[i__4])), q__2 ); bli_csets( (bli_creal(x[i__3]) - bli_creal(q__2)), (bli_cimag(x[i__3]) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), 
(bli_cimag(q__1)), x[i__2] ); ix -= *incx; /* L30: */ } } jx -= *incx; /* L40: */ } } } else { if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) { l = 1 - j; if (nounit) { i__2 = j; bla_c_div(&q__1, &x[j], &a[j * a_dim1 + 1]); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] ); } i__2 = j; bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp ); /* Computing MIN */ i__3 = *n, i__4 = j + *k; i__2 = f2c_min(i__3,i__4); for (i__ = j + 1; i__ <= i__2; ++i__) { i__3 = i__; i__4 = i__; i__5 = l + i__ + j * a_dim1; bli_csets( (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 ); bli_csets( (bli_creal(x[i__4]) - bli_creal(q__2)), (bli_cimag(x[i__4]) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] ); /* L50: */ } } /* L60: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { kx += *incx; i__2 = jx; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) { ix = kx; l = 1 - j; if (nounit) { i__2 = jx; bla_c_div(&q__1, &x[jx], &a[j * a_dim1 + 1]); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] ); } i__2 = jx; bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp ); /* Computing MIN */ i__3 = *n, i__4 = j + *k; i__2 = f2c_min(i__3,i__4); for (i__ = j + 1; i__ <= i__2; ++i__) { i__3 = ix; i__4 = ix; i__5 = l + i__ + j * a_dim1; bli_csets( (bli_creal(temp) * bli_creal(a[i__5]) - bli_cimag(temp) * bli_cimag(a[i__5])), (bli_creal(temp) * bli_cimag(a[i__5]) + bli_cimag(temp) * bli_creal(a[i__5])), q__2 ); bli_csets( (bli_creal(x[i__4]) - bli_creal(q__2)), (bli_cimag(x[i__4]) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] ); ix += *incx; /* L70: */ } } jx += *incx; /* L80: */ } } } } else { /* Form x := inv( A' )*x or x := inv( conjg( A') )*x. 
*/ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kplus1 = *k + 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp ); l = kplus1 - j; if (noconj) { /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__4 = j - 1; for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { i__2 = l + i__ + j * a_dim1; i__3 = i__; bli_csets( (bli_creal(a[i__2]) * bli_creal(x[i__3]) - bli_cimag(a[i__2]) * bli_cimag(x[i__3])), (bli_creal(a[i__2]) * bli_cimag(x[i__3]) + bli_cimag(a[i__2]) * bli_creal(x[i__3])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); /* L90: */ } if (nounit) { bla_c_div(&q__1, &temp, &a[kplus1 + j * a_dim1]); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } } else { /* Computing MAX */ i__4 = 1, i__2 = j - *k; i__3 = j - 1; for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); i__4 = i__; bli_csets( (bli_creal(q__3) * bli_creal(x[i__4]) - bli_cimag(q__3) * bli_cimag(x[i__4])), (bli_creal(q__3) * bli_cimag(x[i__4]) + bli_cimag(q__3) * bli_creal(x[i__4])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); /* L100: */ } if (nounit) { bla_r_cnjg(&q__2, &a[kplus1 + j * a_dim1]); bla_c_div(&q__1, &temp, &q__2); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } } i__3 = j; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__3] ); /* L110: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__3 = jx; bli_csets( (bli_creal(x[i__3])), (bli_cimag(x[i__3])), temp ); ix = kx; l = kplus1 - j; if (noconj) { /* Computing MAX */ i__3 = 1, i__4 = j - *k; i__2 = j - 1; for (i__ = f2c_max(i__3,i__4); i__ <= i__2; ++i__) { i__3 = l + i__ + j * a_dim1; i__4 = ix; bli_csets( (bli_creal(a[i__3]) * bli_creal(x[i__4]) - 
bli_cimag(a[i__3]) * bli_cimag(x[i__4])), (bli_creal(a[i__3]) * bli_cimag(x[i__4]) + bli_cimag(a[i__3]) * bli_creal(x[i__4])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ix += *incx; /* L120: */ } if (nounit) { bla_c_div(&q__1, &temp, &a[kplus1 + j * a_dim1]); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } } else { /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__4 = j - 1; for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); i__2 = ix; bli_csets( (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ix += *incx; /* L130: */ } if (nounit) { bla_r_cnjg(&q__2, &a[kplus1 + j * a_dim1]); bla_c_div(&q__1, &temp, &q__2); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } } i__4 = jx; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__4] ); jx += *incx; if (j > *k) { kx += *incx; } /* L140: */ } } } else { if (*incx == 1) { for (j = *n; j >= 1; --j) { i__1 = j; bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp ); l = 1 - j; if (noconj) { /* Computing MIN */ i__1 = *n, i__4 = j + *k; i__2 = j + 1; for (i__ = f2c_min(i__1,i__4); i__ >= i__2; --i__) { i__1 = l + i__ + j * a_dim1; i__4 = i__; bli_csets( (bli_creal(a[i__1]) * bli_creal(x[i__4]) - bli_cimag(a[i__1]) * bli_cimag(x[i__4])), (bli_creal(a[i__1]) * bli_cimag(x[i__4]) + bli_cimag(a[i__1]) * bli_creal(x[i__4])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); /* L150: */ } if (nounit) { bla_c_div(&q__1, &temp, &a[j * a_dim1 + 1]); bli_csets( (bli_creal(q__1)), 
(bli_cimag(q__1)), temp ); } } else { /* Computing MIN */ i__2 = *n, i__1 = j + *k; i__4 = j + 1; for (i__ = f2c_min(i__2,i__1); i__ >= i__4; --i__) { bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); i__2 = i__; bli_csets( (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); /* L160: */ } if (nounit) { bla_r_cnjg(&q__2, &a[j * a_dim1 + 1]); bla_c_div(&q__1, &temp, &q__2); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } } i__4 = j; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__4] ); /* L170: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { i__4 = jx; bli_csets( (bli_creal(x[i__4])), (bli_cimag(x[i__4])), temp ); ix = kx; l = 1 - j; if (noconj) { /* Computing MIN */ i__4 = *n, i__2 = j + *k; i__1 = j + 1; for (i__ = f2c_min(i__4,i__2); i__ >= i__1; --i__) { i__4 = l + i__ + j * a_dim1; i__2 = ix; bli_csets( (bli_creal(a[i__4]) * bli_creal(x[i__2]) - bli_cimag(a[i__4]) * bli_cimag(x[i__2])), (bli_creal(a[i__4]) * bli_cimag(x[i__2]) + bli_cimag(a[i__4]) * bli_creal(x[i__2])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ix -= *incx; /* L180: */ } if (nounit) { bla_c_div(&q__1, &temp, &a[j * a_dim1 + 1]); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } } else { /* Computing MIN */ i__1 = *n, i__4 = j + *k; i__2 = j + 1; for (i__ = f2c_min(i__1,i__4); i__ >= i__2; --i__) { bla_r_cnjg(&q__3, &a[l + i__ + j * a_dim1]); i__1 = ix; bli_csets( (bli_creal(q__3) * bli_creal(x[i__1]) - bli_cimag(q__3) * bli_cimag(x[i__1])), (bli_creal(q__3) * bli_cimag(x[i__1]) + bli_cimag(q__3) * bli_creal(x[i__1])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), 
(bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ix -= *incx; /* L190: */ } if (nounit) { bla_r_cnjg(&q__2, &a[j * a_dim1 + 1]); bla_c_div(&q__1, &temp, &q__2); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } } i__2 = jx; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__2] ); jx -= *incx; if (*n - j >= *k) { kx -= *incx; } /* L200: */ } } } } return 0; /* End of CTBSV . */ } /* ctbsv_ */ /* dtbsv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Summary: double-precision real triangular band solve. The arguments are validated first; on the first invalid one the routine calls xerbla with the routine name and that argument's position and returns 0 without modifying x. When n is 0 it returns 0 immediately. Otherwise x is overwritten in place with the solution, selected by uplo (upper or lower band storage), trans (no transpose or transpose) and diag (unit or non-unit diagonal). The return value is always 0. */ /* Subroutine */ int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx) { /* System generated locals */ bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4; /* Local variables */ bla_integer info; bla_double temp; bla_integer i__, j, l; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kplus1, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical nounit; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. 
*/ /* Purpose */ /* ======= */ /* DTBSV solves one of the systems of equations */ /* A*x = b, or A'*x = b, */ /* where b and x are n element vectors and A is an n by n unit, or */ /* non-unit, upper or lower triangular band matrix, with ( k + 1 ) */ /* diagonals. */ /* No test for singularity or near-singularity is included in this */ /* routine. Such tests must be performed before calling this routine. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. 
*/ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the equations to be solved as */ /* follows: */ /* TRANS = 'N' or 'n' A*x = b. */ /* TRANS = 'T' or 't' A'*x = b. */ /* TRANS = 'C' or 'c' A'*x = b. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. */ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* K - INTEGER. */ /* On entry with UPLO = 'U' or 'u', K specifies the number of */ /* super-diagonals of the matrix A. */ /* On entry with UPLO = 'L' or 'l', K specifies the number of */ /* sub-diagonals of the matrix A. */ /* K must satisfy 0 .le. K. */ /* Unchanged on exit. */ /* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */ /* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ /* by n part of the array A must contain the upper triangular */ /* band part of the matrix of coefficients, supplied column by */ /* column, with the leading diagonal of the matrix in row */ /* ( k + 1 ) of the array, the first super-diagonal starting at */ /* position 2 in row k, and so on. The top left k by k triangle */ /* of the array A is not referenced. 
*/ /* The following program segment will transfer an upper */ /* triangular band matrix from conventional full matrix storage */ /* to band storage: */ /* DO 20, J = 1, N */ /* M = K + 1 - J */ /* DO 10, I = MAX( 1, J - K ), J */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ /* by n part of the array A must contain the lower triangular */ /* band part of the matrix of coefficients, supplied column by */ /* column, with the leading diagonal of the matrix in row 1 of */ /* the array, the first sub-diagonal starting at position 1 in */ /* row 2, and so on. The bottom right k by k triangle of the */ /* array A is not referenced. */ /* The following program segment will transfer a lower */ /* triangular band matrix from conventional full matrix storage */ /* to band storage: */ /* DO 20, J = 1, N */ /* M = 1 - J */ /* DO 10, I = J, MIN( N, J + K ) */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Note that when DIAG = 'U' or 'u' the elements of the array A */ /* corresponding to the diagonal elements of the matrix are not */ /* referenced, but are assumed to be unity. */ /* Unchanged on exit. */ /* LDA - INTEGER. */ /* On entry, LDA specifies the first dimension of A as declared */ /* in the calling (sub) program. LDA must be at least */ /* ( k + 1 ). */ /* Unchanged on exit. */ /* X - DOUBLE PRECISION array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element right-hand side vector b. On exit, X is overwritten */ /* with the solution vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. 
*/ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments: a is moved back by a_offset (a_dim1 + 1) elements and x by one element, so the 1-based Fortran subscripts a[i + j * a_dim1] and x[i] used below address the caller's storage. */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; /* Function Body */ /* info is set to the 1-based position of the first invalid argument in the parameter list (lda is argument 7, incx is argument 9); 0 means every check passed. */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, ( ftnlen)1)) { info = 2; } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1)) { info = 3; } else if (*n < 0) { info = 4; } else if (*k < 0) { info = 5; } else if (*lda < *k + 1) { info = 7; } else if (*incx == 0) { info = 9; } if (info != 0) { PASTEF770(xerbla)("DTBSV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ /* kx keeps its initializer 0 when incx == 1; the unit-stride branches below index x directly and never read kx. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of A are */ /* accessed sequentially with one pass through A. 
*/ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form x := inv( A )*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kplus1 = *k + 1; if (*incx == 1) { for (j = *n; j >= 1; --j) { if (x[j] != 0.) 
{ l = kplus1 - j; if (nounit) { x[j] /= a[kplus1 + j * a_dim1]; } temp = x[j]; /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__1 = f2c_max(i__2,i__3); for (i__ = j - 1; i__ >= i__1; --i__) { x[i__] -= temp * a[l + i__ + j * a_dim1]; /* L10: */ } } /* L20: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { kx -= *incx; if (x[jx] != 0.) { ix = kx; l = kplus1 - j; if (nounit) { x[jx] /= a[kplus1 + j * a_dim1]; } temp = x[jx]; /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__1 = f2c_max(i__2,i__3); for (i__ = j - 1; i__ >= i__1; --i__) { x[ix] -= temp * a[l + i__ + j * a_dim1]; ix -= *incx; /* L30: */ } } jx -= *incx; /* L40: */ } } } else { if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[j] != 0.) { l = 1 - j; if (nounit) { x[j] /= a[j * a_dim1 + 1]; } temp = x[j]; /* Computing MIN */ i__3 = *n, i__4 = j + *k; i__2 = f2c_min(i__3,i__4); for (i__ = j + 1; i__ <= i__2; ++i__) { x[i__] -= temp * a[l + i__ + j * a_dim1]; /* L50: */ } } /* L60: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { kx += *incx; if (x[jx] != 0.) { ix = kx; l = 1 - j; if (nounit) { x[jx] /= a[j * a_dim1 + 1]; } temp = x[jx]; /* Computing MIN */ i__3 = *n, i__4 = j + *k; i__2 = f2c_min(i__3,i__4); for (i__ = j + 1; i__ <= i__2; ++i__) { x[ix] -= temp * a[l + i__ + j * a_dim1]; ix += *incx; /* L70: */ } } jx += *incx; /* L80: */ } } } } else { /* Form x := inv( A')*x. 
*/ /* This branch is taken for every TRANS value other than 'N'; since validation above only admits 'N', 'T' and 'C', both 'T' and 'C' are handled here identically. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kplus1 = *k + 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = x[j]; l = kplus1 - j; /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__4 = j - 1; for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { temp -= a[l + i__ + j * a_dim1] * x[i__]; /* L90: */ } if (nounit) { temp /= a[kplus1 + j * a_dim1]; } x[j] = temp; /* L100: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = x[jx]; ix = kx; l = kplus1 - j; /* Computing MAX */ i__4 = 1, i__2 = j - *k; i__3 = j - 1; for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { temp -= a[l + i__ + j * a_dim1] * x[ix]; ix += *incx; /* L110: */ } if (nounit) { temp /= a[kplus1 + j * a_dim1]; } x[jx] = temp; jx += *incx; if (j > *k) { kx += *incx; } /* L120: */ } } } else { if (*incx == 1) { for (j = *n; j >= 1; --j) { temp = x[j]; l = 1 - j; /* Computing MIN */ i__1 = *n, i__3 = j + *k; i__4 = j + 1; for (i__ = f2c_min(i__1,i__3); i__ >= i__4; --i__) { temp -= a[l + i__ + j * a_dim1] * x[i__]; /* L130: */ } if (nounit) { temp /= a[j * a_dim1 + 1]; } x[j] = temp; /* L140: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { temp = x[jx]; ix = kx; l = 1 - j; /* Computing MIN */ i__4 = *n, i__1 = j + *k; i__3 = j + 1; for (i__ = f2c_min(i__4,i__1); i__ >= i__3; --i__) { temp -= a[l + i__ + j * a_dim1] * x[ix]; ix -= *incx; /* L150: */ } if (nounit) { temp /= a[j * a_dim1 + 1]; } x[jx] = temp; jx -= *incx; if (*n - j >= *k) { kx -= *incx; } /* L160: */ } } } } return 0; /* End of DTBSV . */ } /* dtbsv_ */ /* stbsv.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries:
	-lf2c -lm   (in that order)
*/

/* Subroutine */ int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx)
{
    /*
     * STBSV  --  Level 2 BLAS, single precision real.
     *
     * Solves, in place, one of the systems of equations
     *
     *     A*x = b,   or   A'*x = b,
     *
     * where b and x are n element vectors and A is an n by n unit or
     * non-unit, upper or lower triangular band matrix with ( k + 1 )
     * diagonals.  No test for singularity or near-singularity is made;
     * the caller must perform such tests beforehand.
     *
     * Arguments (every argument except x is unchanged on exit):
     *
     *   uplo   'U'/'u': A is upper triangular.  'L'/'l': A is lower
     *          triangular.
     *   trans  'N'/'n': solve A*x = b.  'T'/'t' or 'C'/'c': solve
     *          A'*x = b.
     *   diag   'U'/'u': A is assumed unit triangular and its diagonal
     *          entries are not referenced.  'N'/'n': non-unit.
     *   n      Order of A; must be at least zero.
     *   k      Number of super-diagonals (uplo = 'U') or sub-diagonals
     *          (uplo = 'L'); must be at least zero.
     *   a      Band storage of dimension ( lda, n ), column by column.
     *          Upper: the main diagonal is in row k + 1, the first
     *          super-diagonal starts at position 2 of row k, and so on;
     *          element ( i, j ) of the full matrix is stored at
     *          a( k + 1 - j + i, j ) for max( 1, j - k ) <= i <= j, and
     *          the top left k by k triangle is not referenced.
     *          Lower: the main diagonal is in row 1, the first
     *          sub-diagonal starts at position 1 of row 2, and so on;
     *          element ( i, j ) is stored at a( 1 - j + i, j ) for
     *          j <= i <= min( n, j + k ), and the bottom right k by k
     *          triangle is not referenced.
     *   lda    First dimension of a; must be at least ( k + 1 ).
     *   x      Array of at least ( 1 + ( n - 1 )*abs( incx ) ) elements.
     *          Holds b on entry and is overwritten with the solution.
     *   incx   Increment between elements of x; must not be zero.
     *
     * On the first invalid argument found, xerbla is called with the
     * routine name and the argument position (1, 2, 3, 4, 5, 7 or 9) and
     * the routine returns without touching x.  The return value is
     * always 0.
     */
    bla_integer ld;                 /* column stride of the band array      */
    bla_integer info;               /* position of the first bad argument   */
    bla_integer order, bw, inc;     /* cached *n, *k and *incx              */
    bla_integer i, j;               /* row and column counters (1-based)    */
    bla_integer shift;              /* maps full row i to band row shift+i  */
    bla_integer reach, bound;       /* clamped end of the band in column j  */
    bla_integer diag_row;           /* band row of the diagonal (upper)     */
    bla_integer ix, jx, kx = 0;     /* positions in x for general stride    */
    bla_real    acc;                /* x( j ) being applied or accumulated  */
    bla_logical nounit;
    const bla_real *col;            /* column j of the (shifted) band array */

    /* Shift both arrays so that Fortran-style 1-based subscripts apply. */
    ld = *lda;
    a -= 1 + ld;
    --x;

    /* Test the input parameters, reporting only the first offender. */
    info = 0;
    if (!(PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) ||
          PASTEF770(lsame)(uplo, "L", (ftnlen)1, (ftnlen)1))) {
        info = 1;
    } else if (!(PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) ||
                 PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) ||
                 PASTEF770(lsame)(trans, "C", (ftnlen)1, (ftnlen)1))) {
        info = 2;
    } else if (!(PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) ||
                 PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1))) {
        info = 3;
    } else if (*n < 0) {
        info = 4;
    } else if (*k < 0) {
        info = 5;
    } else if (*lda < *k + 1) {
        info = 7;
    } else if (*incx == 0) {
        info = 9;
    }
    if (info != 0) {
        PASTEF770(xerbla)("STBSV ", &info, (ftnlen)6);
        return 0;
    }

    /* Quick return if possible. */
    order = *n;
    if (order == 0) {
        return 0;
    }
    bw  = *k;
    inc = *incx;

    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);

    /* Start point in x when the increment is not unity.  For the loops
       that run backwards this is ( n - 1 )*incx too small and is
       corrected where those loops begin. */
    if (inc <= 0) {
        kx = 1 - (order - 1) * inc;
    } else if (inc != 1) {
        kx = 1;
    }

    /* The elements of A are accessed sequentially, in one pass. */
    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {

        /* Form  x := inv( A )*x  by column-oriented substitution. */
        if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
            diag_row = bw + 1;
            if (inc == 1) {
                for (j = order; j >= 1; --j) {
                    if (x[j] == 0.f) {
                        continue;       /* nothing to propagate */
                    }
                    col   = a + j * ld;
                    shift = diag_row - j;
                    if (nounit) {
                        x[j] /= col[diag_row];
                    }
                    acc   = x[j];
                    reach = j - bw;
                    bound = f2c_max(1, reach);
                    for (i = j - 1; i >= bound; --i) {
                        x[i] -= acc * col[shift + i];
                    }
                }
            } else {
                kx += (order - 1) * inc;
                jx = kx;
                for (j = order; j >= 1; --j, jx -= inc) {
                    kx -= inc;
                    if (x[jx] == 0.f) {
                        continue;
                    }
                    col   = a + j * ld;
                    shift = diag_row - j;
                    if (nounit) {
                        x[jx] /= col[diag_row];
                    }
                    acc   = x[jx];
                    reach = j - bw;
                    bound = f2c_max(1, reach);
                    for (i = j - 1, ix = kx; i >= bound; --i, ix -= inc) {
                        x[ix] -= acc * col[shift + i];
                    }
                }
            }
        } else {
            if (inc == 1) {
                for (j = 1; j <= order; ++j) {
                    if (x[j] == 0.f) {
                        continue;
                    }
                    col   = a + j * ld;
                    shift = 1 - j;
                    if (nounit) {
                        x[j] /= col[1];
                    }
                    acc   = x[j];
                    reach = j + bw;
                    bound = f2c_min(order, reach);
                    for (i = j + 1; i <= bound; ++i) {
                        x[i] -= acc * col[shift + i];
                    }
                }
            } else {
                jx = kx;
                for (j = 1; j <= order; ++j, jx += inc) {
                    kx += inc;
                    if (x[jx] == 0.f) {
                        continue;
                    }
                    col   = a + j * ld;
                    shift = 1 - j;
                    if (nounit) {
                        x[jx] /= col[1];
                    }
                    acc   = x[jx];
                    reach = j + bw;
                    bound = f2c_min(order, reach);
                    for (i = j + 1, ix = kx; i <= bound; ++i, ix += inc) {
                        x[ix] -= acc * col[shift + i];
                    }
                }
            }
        }

    } else {

        /* Form  x := inv( A' )*x  by dot-product-oriented substitution. */
        if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
            diag_row = bw + 1;
            if (inc == 1) {
                for (j = 1; j <= order; ++j) {
                    col   = a + j * ld;
                    shift = diag_row - j;
                    acc   = x[j];
                    reach = j - bw;
                    for (i = f2c_max(1, reach); i <= j - 1; ++i) {
                        acc -= col[shift + i] * x[i];
                    }
                    if (nounit) {
                        acc /= col[diag_row];
                    }
                    x[j] = acc;
                }
            } else {
                jx = kx;
                for (j = 1; j <= order; ++j) {
                    col   = a + j * ld;
                    shift = diag_row - j;
                    acc   = x[jx];
                    ix    = kx;
                    reach = j - bw;
                    for (i = f2c_max(1, reach); i <= j - 1; ++i) {
                        acc -= col[shift + i] * x[ix];
                        ix += inc;
                    }
                    if (nounit) {
                        acc /= col[diag_row];
                    }
                    x[jx] = acc;
                    jx += inc;
                    /* Once the band is full the window start slides on. */
                    if (j > bw) {
                        kx += inc;
                    }
                }
            }
        } else {
            if (inc == 1) {
                for (j = order; j >= 1; --j) {
                    col   = a + j * ld;
                    shift = 1 - j;
                    acc   = x[j];
                    reach = j + bw;
                    for (i = f2c_min(order, reach); i >= j + 1; --i) {
                        acc -= col[shift + i] * x[i];
                    }
                    if (nounit) {
                        acc /= col[1];
                    }
                    x[j] = acc;
                }
            } else {
                kx += (order - 1) * inc;
                jx = kx;
                for (j = order; j >= 1; --j) {
                    col   = a + j * ld;
                    shift = 1 - j;
                    acc   = x[jx];
                    ix    = kx;
                    reach = j + bw;
                    for (i = f2c_min(order, reach); i >= j + 1; --i) {
                        acc -= col[shift + i] * x[ix];
                        ix -= inc;
                    }
                    if (nounit) {
                        acc /= col[1];
                    }
                    x[jx] = acc;
                    jx -= inc;
                    /* Once the band is full the window start slides back. */
                    if (order - j >= bw) {
                        kx -= inc;
                    }
                }
            }
        }
    }

    return 0;

/*     End of STBSV .
*/ } /* stbsv_ */ /* ztbsv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx) { /* System generated locals */ bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5; bla_dcomplex z__1, z__2, z__3; /* Builtin functions */ //void bla_z_div(bla_dcomplex *, bla_dcomplex *, bla_dcomplex *), bla_d_cnjg( // bla_dcomplex *, bla_dcomplex *); /* Local variables */ bla_integer info; bla_dcomplex temp; bla_integer i__, j, l; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kplus1, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical noconj, nounit; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* ZTBSV solves one of the systems of equations */ /* A*x = b, or A'*x = b, or conjg( A' )*x = b, */ /* where b and x are n element vectors and A is an n by n unit, or */ /* non-unit, upper or lower triangular band matrix, with ( k + 1 ) */ /* diagonals. */ /* No test for singularity or near-singularity is included in this */ /* routine. Such tests must be performed before calling this routine. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the equations to be solved as */ /* follows: */ /* TRANS = 'N' or 'n' A*x = b. */ /* TRANS = 'T' or 't' A'*x = b. 
*/ /* TRANS = 'C' or 'c' conjg( A' )*x = b. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. */ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* K - INTEGER. */ /* On entry with UPLO = 'U' or 'u', K specifies the number of */ /* super-diagonals of the matrix A. */ /* On entry with UPLO = 'L' or 'l', K specifies the number of */ /* sub-diagonals of the matrix A. */ /* K must satisfy 0 .le. K. */ /* Unchanged on exit. */ /* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */ /* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */ /* by n part of the array A must contain the upper triangular */ /* band part of the matrix of coefficients, supplied column by */ /* column, with the leading diagonal of the matrix in row */ /* ( k + 1 ) of the array, the first super-diagonal starting at */ /* position 2 in row k, and so on. The top left k by k triangle */ /* of the array A is not referenced. */ /* The following program segment will transfer an upper */ /* triangular band matrix from conventional full matrix storage */ /* to band storage: */ /* DO 20, J = 1, N */ /* M = K + 1 - J */ /* DO 10, I = MAX( 1, J - K ), J */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */ /* by n part of the array A must contain the lower triangular */ /* band part of the matrix of coefficients, supplied column by */ /* column, with the leading diagonal of the matrix in row 1 of */ /* the array, the first sub-diagonal starting at position 1 in */ /* row 2, and so on. The bottom right k by k triangle of the */ /* array A is not referenced. 
*/ /* The following program segment will transfer a lower */ /* triangular band matrix from conventional full matrix storage */ /* to band storage: */ /* DO 20, J = 1, N */ /* M = 1 - J */ /* DO 10, I = J, MIN( N, J + K ) */ /* A( M + I, J ) = matrix( I, J ) */ /* 10 CONTINUE */ /* 20 CONTINUE */ /* Note that when DIAG = 'U' or 'u' the elements of the array A */ /* corresponding to the diagonal elements of the matrix are not */ /* referenced, but are assumed to be unity. */ /* Unchanged on exit. */ /* LDA - INTEGER. */ /* On entry, LDA specifies the first dimension of A as declared */ /* in the calling (sub) program. LDA must be at least */ /* ( k + 1 ). */ /* Unchanged on exit. */ /* X - COMPLEX*16 array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element right-hand side vector b. On exit, X is overwritten */ /* with the solution vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1 * 1; a -= a_offset; --x; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, ( ftnlen)1)) { info = 2; } else if (! 
PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1)) { info = 3; } else if (*n < 0) { info = 4; } else if (*k < 0) { info = 5; } else if (*lda < *k + 1) { info = 7; } else if (*incx == 0) { info = 9; } if (info != 0) { PASTEF770(xerbla)("ZTBSV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1); nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of A are */ /* accessed by sequentially with one pass through A. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form x := inv( A )*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kplus1 = *k + 1; if (*incx == 1) { for (j = *n; j >= 1; --j) { i__1 = j; if (bli_zreal(x[i__1]) != 0. || bli_zimag(x[i__1]) != 0.) { l = kplus1 - j; if (nounit) { i__1 = j; bla_z_div(&z__1, &x[j], &a[kplus1 + j * a_dim1]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] ); } i__1 = j; bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp ); /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__1 = f2c_max(i__2,i__3); for (i__ = j - 1; i__ >= i__1; --i__) { i__2 = i__; i__3 = i__; i__4 = l + i__ + j * a_dim1; bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__4]) - bli_zimag(temp) * bli_zimag(a[i__4])), (bli_zreal(temp) * bli_zimag(a[i__4]) + bli_zimag(temp) * bli_zreal(a[i__4])), z__2 ); bli_zsets( (bli_zreal(x[i__3]) - bli_zreal(z__2)), (bli_zimag(x[i__3]) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] ); /* L10: */ } } /* L20: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { kx -= *incx; i__1 = jx; if (bli_zreal(x[i__1]) != 0. 
|| bli_zimag(x[i__1]) != 0.) { ix = kx; l = kplus1 - j; if (nounit) { i__1 = jx; bla_z_div(&z__1, &x[jx], &a[kplus1 + j * a_dim1]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] ); } i__1 = jx; bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp ); /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__1 = f2c_max(i__2,i__3); for (i__ = j - 1; i__ >= i__1; --i__) { i__2 = ix; i__3 = ix; i__4 = l + i__ + j * a_dim1; bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__4]) - bli_zimag(temp) * bli_zimag(a[i__4])), (bli_zreal(temp) * bli_zimag(a[i__4]) + bli_zimag(temp) * bli_zreal(a[i__4])), z__2 ); bli_zsets( (bli_zreal(x[i__3]) - bli_zreal(z__2)), (bli_zimag(x[i__3]) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] ); ix -= *incx; /* L30: */ } } jx -= *incx; /* L40: */ } } } else { if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) { l = 1 - j; if (nounit) { i__2 = j; bla_z_div(&z__1, &x[j], &a[j * a_dim1 + 1]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] ); } i__2 = j; bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp ); /* Computing MIN */ i__3 = *n, i__4 = j + *k; i__2 = f2c_min(i__3,i__4); for (i__ = j + 1; i__ <= i__2; ++i__) { i__3 = i__; i__4 = i__; i__5 = l + i__ + j * a_dim1; bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 ); bli_zsets( (bli_zreal(x[i__4]) - bli_zreal(z__2)), (bli_zimag(x[i__4]) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] ); /* L50: */ } } /* L60: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { kx += *incx; i__2 = jx; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) 
{ ix = kx; l = 1 - j; if (nounit) { i__2 = jx; bla_z_div(&z__1, &x[jx], &a[j * a_dim1 + 1]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] ); } i__2 = jx; bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp ); /* Computing MIN */ i__3 = *n, i__4 = j + *k; i__2 = f2c_min(i__3,i__4); for (i__ = j + 1; i__ <= i__2; ++i__) { i__3 = ix; i__4 = ix; i__5 = l + i__ + j * a_dim1; bli_zsets( (bli_zreal(temp) * bli_zreal(a[i__5]) - bli_zimag(temp) * bli_zimag(a[i__5])), (bli_zreal(temp) * bli_zimag(a[i__5]) + bli_zimag(temp) * bli_zreal(a[i__5])), z__2 ); bli_zsets( (bli_zreal(x[i__4]) - bli_zreal(z__2)), (bli_zimag(x[i__4]) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] ); ix += *incx; /* L70: */ } } jx += *incx; /* L80: */ } } } } else { /* Form x := inv( A' )*x or x := inv( conjg( A') )*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kplus1 = *k + 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp ); l = kplus1 - j; if (noconj) { /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__4 = j - 1; for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { i__2 = l + i__ + j * a_dim1; i__3 = i__; bli_zsets( (bli_zreal(a[i__2]) * bli_zreal(x[i__3]) - bli_zimag(a[i__2]) * bli_zimag(x[i__3])), (bli_zreal(a[i__2]) * bli_zimag(x[i__3]) + bli_zimag(a[i__2]) * bli_zreal(x[i__3])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); /* L90: */ } if (nounit) { bla_z_div(&z__1, &temp, &a[kplus1 + j * a_dim1]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } else { /* Computing MAX */ i__4 = 1, i__2 = j - *k; i__3 = j - 1; for (i__ = f2c_max(i__4,i__2); i__ <= i__3; ++i__) { bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); i__4 = i__; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__4]) - bli_zimag(z__3) * bli_zimag(x[i__4])), 
(bli_zreal(z__3) * bli_zimag(x[i__4]) + bli_zimag(z__3) * bli_zreal(x[i__4])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); /* L100: */ } if (nounit) { bla_d_cnjg(&z__2, &a[kplus1 + j * a_dim1]); bla_z_div(&z__1, &temp, &z__2); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } i__3 = j; bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__3] ); /* L110: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__3 = jx; bli_zsets( (bli_zreal(x[i__3])), (bli_zimag(x[i__3])), temp ); ix = kx; l = kplus1 - j; if (noconj) { /* Computing MAX */ i__3 = 1, i__4 = j - *k; i__2 = j - 1; for (i__ = f2c_max(i__3,i__4); i__ <= i__2; ++i__) { i__3 = l + i__ + j * a_dim1; i__4 = ix; bli_zsets( (bli_zreal(a[i__3]) * bli_zreal(x[i__4]) - bli_zimag(a[i__3]) * bli_zimag(x[i__4])), (bli_zreal(a[i__3]) * bli_zimag(x[i__4]) + bli_zimag(a[i__3]) * bli_zreal(x[i__4])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ix += *incx; /* L120: */ } if (nounit) { bla_z_div(&z__1, &temp, &a[kplus1 + j * a_dim1]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } else { /* Computing MAX */ i__2 = 1, i__3 = j - *k; i__4 = j - 1; for (i__ = f2c_max(i__2,i__3); i__ <= i__4; ++i__) { bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); i__2 = ix; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ix += *incx; /* L130: */ } if (nounit) { bla_d_cnjg(&z__2, &a[kplus1 + j * a_dim1]); bla_z_div(&z__1, &temp, &z__2); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } i__4 = jx; 
bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__4] ); jx += *incx; if (j > *k) { kx += *incx; } /* L140: */ } } } else { if (*incx == 1) { for (j = *n; j >= 1; --j) { i__1 = j; bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp ); l = 1 - j; if (noconj) { /* Computing MIN */ i__1 = *n, i__4 = j + *k; i__2 = j + 1; for (i__ = f2c_min(i__1,i__4); i__ >= i__2; --i__) { i__1 = l + i__ + j * a_dim1; i__4 = i__; bli_zsets( (bli_zreal(a[i__1]) * bli_zreal(x[i__4]) - bli_zimag(a[i__1]) * bli_zimag(x[i__4])), (bli_zreal(a[i__1]) * bli_zimag(x[i__4]) + bli_zimag(a[i__1]) * bli_zreal(x[i__4])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); /* L150: */ } if (nounit) { bla_z_div(&z__1, &temp, &a[j * a_dim1 + 1]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } else { /* Computing MIN */ i__2 = *n, i__1 = j + *k; i__4 = j + 1; for (i__ = f2c_min(i__2,i__1); i__ >= i__4; --i__) { bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); i__2 = i__; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); /* L160: */ } if (nounit) { bla_d_cnjg(&z__2, &a[j * a_dim1 + 1]); bla_z_div(&z__1, &temp, &z__2); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } i__4 = j; bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__4] ); /* L170: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { i__4 = jx; bli_zsets( (bli_zreal(x[i__4])), (bli_zimag(x[i__4])), temp ); ix = kx; l = 1 - j; if (noconj) { /* Computing MIN */ i__4 = *n, i__2 = j + *k; i__1 = j + 1; for (i__ = f2c_min(i__4,i__2); i__ >= i__1; --i__) { i__4 = l + i__ + j * a_dim1; i__2 = ix; bli_zsets( 
(bli_zreal(a[i__4]) * bli_zreal(x[i__2]) - bli_zimag(a[i__4]) * bli_zimag(x[i__2])), (bli_zreal(a[i__4]) * bli_zimag(x[i__2]) + bli_zimag(a[i__4]) * bli_zreal(x[i__2])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ix -= *incx; /* L180: */ } if (nounit) { bla_z_div(&z__1, &temp, &a[j * a_dim1 + 1]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } else { /* Computing MIN */ i__1 = *n, i__4 = j + *k; i__2 = j + 1; for (i__ = f2c_min(i__1,i__4); i__ >= i__2; --i__) { bla_d_cnjg(&z__3, &a[l + i__ + j * a_dim1]); i__1 = ix; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__1]) - bli_zimag(z__3) * bli_zimag(x[i__1])), (bli_zreal(z__3) * bli_zimag(x[i__1]) + bli_zimag(z__3) * bli_zreal(x[i__1])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ix -= *incx; /* L190: */ } if (nounit) { bla_d_cnjg(&z__2, &a[j * a_dim1 + 1]); bla_z_div(&z__1, &temp, &z__2); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } i__2 = jx; bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__2] ); jx -= *incx; if (*n - j >= *k) { kx -= *incx; } /* L200: */ } } } } return 0; /* End of ZTBSV . */ } /* ztbsv_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_tbsv.h000066400000000000000000000052261427272030600226370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx); BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx); BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx); BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_tpmv.c000066400000000000000000001550311427272030600226420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* ctpmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(c,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx) { /* System generated locals */ bla_integer i__1, i__2, i__3, i__4, i__5; bla_scomplex q__1, q__2, q__3; /* Builtin functions */ //void bla_r_cnjg(bla_scomplex *, bla_scomplex *); /* Local variables */ bla_integer info; bla_scomplex temp; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical noconj, nounit; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* CTPMV performs one of the matrix-vector operations */ /* x := A*x, or x := A'*x, or x := conjg( A' )*x, */ /* where x is an n element vector and A is an n by n unit, or non-unit, */ /* upper or lower triangular matrix, supplied in packed form. 
*/ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the operation to be performed as */ /* follows: */ /* TRANS = 'N' or 'n' x := A*x. */ /* TRANS = 'T' or 't' x := A'*x. */ /* TRANS = 'C' or 'c' x := conjg( A' )*x. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. */ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* AP - COMPLEX array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) */ /* respectively, and so on. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) */ /* respectively, and so on. */ /* Note that when DIAG = 'U' or 'u', the diagonal elements of */ /* A are not referenced, but are assumed to be unity. */ /* Unchanged on exit. */ /* X - COMPLEX array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ /* transformed vector x. */ /* INCX - INTEGER. 
*/ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ --x; --ap; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, ( ftnlen)1)) { info = 2; } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1)) { info = 3; } else if (*n < 0) { info = 4; } else if (*incx == 0) { info = 7; } if (info != 0) { PASTEF770(xerbla)("CTPMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1); nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of AP are */ /* accessed sequentially with one pass through AP. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form x:= A*x. 
*/ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) { i__2 = j; bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp ); k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { i__3 = i__; i__4 = i__; i__5 = k; bli_csets( (bli_creal(temp) * bli_creal(ap[i__5]) - bli_cimag(temp) * bli_cimag(ap[i__5])), (bli_creal(temp) * bli_cimag(ap[i__5]) + bli_cimag(temp) * bli_creal(ap[i__5])), q__2 ); bli_csets( (bli_creal(x[i__4]) + bli_creal(q__2)), (bli_cimag(x[i__4]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] ); ++k; /* L10: */ } if (nounit) { i__2 = j; i__3 = j; i__4 = kk + j - 1; bli_csets( (bli_creal(x[i__3]) * bli_creal(ap[i__4]) - bli_cimag(x[i__3]) * bli_cimag(ap[i__4])), (bli_creal(x[i__3]) * bli_cimag(ap[i__4]) + bli_cimag(x[i__3]) * bli_creal(ap[i__4])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] ); } } kk += j; /* L20: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) { i__2 = jx; bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp ); ix = kx; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { i__3 = ix; i__4 = ix; i__5 = k; bli_csets( (bli_creal(temp) * bli_creal(ap[i__5]) - bli_cimag(temp) * bli_cimag(ap[i__5])), (bli_creal(temp) * bli_cimag(ap[i__5]) + bli_cimag(temp) * bli_creal(ap[i__5])), q__2 ); bli_csets( (bli_creal(x[i__4]) + bli_creal(q__2)), (bli_cimag(x[i__4]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] ); ix += *incx; /* L30: */ } if (nounit) { i__2 = jx; i__3 = jx; i__4 = kk + j - 1; bli_csets( (bli_creal(x[i__3]) * bli_creal(ap[i__4]) - bli_cimag(x[i__3]) * bli_cimag(ap[i__4])), (bli_creal(x[i__3]) * bli_cimag(ap[i__4]) + bli_cimag(x[i__3]) * bli_creal(ap[i__4])), q__1 ); bli_csets( (bli_creal(q__1)), 
(bli_cimag(q__1)), x[i__2] ); } } jx += *incx; kk += j; /* L40: */ } } } else { kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { i__1 = j; if (bli_creal(x[i__1]) != 0.f || bli_cimag(x[i__1]) != 0.f) { i__1 = j; bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp ); k = kk; i__1 = j + 1; for (i__ = *n; i__ >= i__1; --i__) { i__2 = i__; i__3 = i__; i__4 = k; bli_csets( (bli_creal(temp) * bli_creal(ap[i__4]) - bli_cimag(temp) * bli_cimag(ap[i__4])), (bli_creal(temp) * bli_cimag(ap[i__4]) + bli_cimag(temp) * bli_creal(ap[i__4])), q__2 ); bli_csets( (bli_creal(x[i__3]) + bli_creal(q__2)), (bli_cimag(x[i__3]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] ); --k; /* L50: */ } if (nounit) { i__1 = j; i__2 = j; i__3 = kk - *n + j; bli_csets( (bli_creal(x[i__2]) * bli_creal(ap[i__3]) - bli_cimag(x[i__2]) * bli_cimag(ap[i__3])), (bli_creal(x[i__2]) * bli_cimag(ap[i__3]) + bli_cimag(x[i__2]) * bli_creal(ap[i__3])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] ); } } kk -= *n - j + 1; /* L60: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { i__1 = jx; if (bli_creal(x[i__1]) != 0.f || bli_cimag(x[i__1]) != 0.f) { i__1 = jx; bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp ); ix = kx; i__1 = kk - (*n - (j + 1)); for (k = kk; k >= i__1; --k) { i__2 = ix; i__3 = ix; i__4 = k; bli_csets( (bli_creal(temp) * bli_creal(ap[i__4]) - bli_cimag(temp) * bli_cimag(ap[i__4])), (bli_creal(temp) * bli_cimag(ap[i__4]) + bli_cimag(temp) * bli_creal(ap[i__4])), q__2 ); bli_csets( (bli_creal(x[i__3]) + bli_creal(q__2)), (bli_cimag(x[i__3]) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] ); ix -= *incx; /* L70: */ } if (nounit) { i__1 = jx; i__2 = jx; i__3 = kk - *n + j; bli_csets( (bli_creal(x[i__2]) * bli_creal(ap[i__3]) - bli_cimag(x[i__2]) * bli_cimag(ap[i__3])), (bli_creal(x[i__2]) * bli_cimag(ap[i__3]) + bli_cimag(x[i__2]) * 
bli_creal(ap[i__3])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] ); } } jx -= *incx; kk -= *n - j + 1; /* L80: */ } } } } else { /* Form x := A'*x or x := conjg( A' )*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { i__1 = j; bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp ); k = kk - 1; if (noconj) { if (nounit) { i__1 = kk; bli_csets( (bli_creal(temp) * bli_creal(ap[i__1]) - bli_cimag(temp) * bli_cimag(ap[i__1])), (bli_creal(temp) * bli_cimag(ap[i__1]) + bli_cimag(temp) * bli_creal(ap[i__1])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } for (i__ = j - 1; i__ >= 1; --i__) { i__1 = k; i__2 = i__; bli_csets( (bli_creal(ap[i__1]) * bli_creal(x[i__2]) - bli_cimag(ap[i__1]) * bli_cimag(x[i__2])), (bli_creal(ap[i__1]) * bli_cimag(x[i__2]) + bli_cimag(ap[i__1]) * bli_creal(x[i__2])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); --k; /* L90: */ } } else { if (nounit) { bla_r_cnjg(&q__2, &ap[kk]); bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } for (i__ = j - 1; i__ >= 1; --i__) { bla_r_cnjg(&q__3, &ap[k]); i__1 = i__; bli_csets( (bli_creal(q__3) * bli_creal(x[i__1]) - bli_cimag(q__3) * bli_cimag(x[i__1])), (bli_creal(q__3) * bli_cimag(x[i__1]) + bli_cimag(q__3) * bli_creal(x[i__1])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); --k; /* L100: */ } } i__1 = j; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__1] ); kk -= j; /* L110: */ } } else { jx = kx + (*n - 1) * *incx; for (j = *n; j >= 1; --j) { i__1 = jx; bli_csets( 
(bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp ); ix = jx; if (noconj) { if (nounit) { i__1 = kk; bli_csets( (bli_creal(temp) * bli_creal(ap[i__1]) - bli_cimag(temp) * bli_cimag(ap[i__1])), (bli_creal(temp) * bli_cimag(ap[i__1]) + bli_cimag(temp) * bli_creal(ap[i__1])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } i__1 = kk - j + 1; for (k = kk - 1; k >= i__1; --k) { ix -= *incx; i__2 = k; i__3 = ix; bli_csets( (bli_creal(ap[i__2]) * bli_creal(x[i__3]) - bli_cimag(ap[i__2]) * bli_cimag(x[i__3])), (bli_creal(ap[i__2]) * bli_cimag(x[i__3]) + bli_cimag(ap[i__2]) * bli_creal(x[i__3])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); /* L120: */ } } else { if (nounit) { bla_r_cnjg(&q__2, &ap[kk]); bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } i__1 = kk - j + 1; for (k = kk - 1; k >= i__1; --k) { ix -= *incx; bla_r_cnjg(&q__3, &ap[k]); i__2 = ix; bli_csets( (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); /* L130: */ } } i__1 = jx; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__1] ); jx -= *incx; kk -= j; /* L140: */ } } } else { kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp ); k = kk + 1; if (noconj) { if (nounit) { i__2 = kk; bli_csets( (bli_creal(temp) * bli_creal(ap[i__2]) - bli_cimag(temp) * bli_cimag(ap[i__2])), (bli_creal(temp) * bli_cimag(ap[i__2]) + bli_cimag(temp) * bli_creal(ap[i__2])), q__1 ); 
bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { i__3 = k; i__4 = i__; bli_csets( (bli_creal(ap[i__3]) * bli_creal(x[i__4]) - bli_cimag(ap[i__3]) * bli_cimag(x[i__4])), (bli_creal(ap[i__3]) * bli_cimag(x[i__4]) + bli_cimag(ap[i__3]) * bli_creal(x[i__4])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ++k; /* L150: */ } } else { if (nounit) { bla_r_cnjg(&q__2, &ap[kk]); bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { bla_r_cnjg(&q__3, &ap[k]); i__3 = i__; bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ++k; /* L160: */ } } i__2 = j; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__2] ); kk += *n - j + 1; /* L170: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp ); ix = jx; if (noconj) { if (nounit) { i__2 = kk; bli_csets( (bli_creal(temp) * bli_creal(ap[i__2]) - bli_cimag(temp) * bli_cimag(ap[i__2])), (bli_creal(temp) * bli_cimag(ap[i__2]) + bli_cimag(temp) * bli_creal(ap[i__2])), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; i__3 = k; i__4 = ix; bli_csets( (bli_creal(ap[i__3]) * bli_creal(x[i__4]) - bli_cimag(ap[i__3]) * bli_cimag(x[i__4])), (bli_creal(ap[i__3]) * bli_cimag(x[i__4]) + bli_cimag(ap[i__3]) * 
bli_creal(x[i__4])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); /* L180: */ } } else { if (nounit) { bla_r_cnjg(&q__2, &ap[kk]); bli_csets( (bli_creal(temp) * bli_creal(q__2) - bli_cimag(temp) * bli_cimag(q__2)), (bli_creal(temp) * bli_cimag(q__2) + bli_cimag(temp) * bli_creal(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; bla_r_cnjg(&q__3, &ap[k]); i__3 = ix; bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 ); bli_csets( (bli_creal(temp) + bli_creal(q__2)), (bli_cimag(temp) + bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); /* L190: */ } } i__2 = jx; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__2] ); jx += *incx; kk += *n - j + 1; /* L200: */ } } } } return 0; /* End of CTPMV . */ } /* ctpmv_ */ /* dtpmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(d,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx) { /* System generated locals */ bla_integer i__1, i__2; /* Local variables */ bla_integer info; bla_double temp; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical nounit; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. 
*/ /* Purpose */ /* ======= */ /* DTPMV performs one of the matrix-vector operations */ /* x := A*x, or x := A'*x, */ /* where x is an n element vector and A is an n by n unit, or non-unit, */ /* upper or lower triangular matrix, supplied in packed form. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the operation to be performed as */ /* follows: */ /* TRANS = 'N' or 'n' x := A*x. */ /* TRANS = 'T' or 't' x := A'*x. */ /* TRANS = 'C' or 'c' x := A'*x. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. */ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* AP - DOUBLE PRECISION array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) */ /* respectively, and so on. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) */ /* respectively, and so on. */ /* Note that when DIAG = 'U' or 'u', the diagonal elements of */ /* A are not referenced, but are assumed to be unity. */ /* Unchanged on exit. 
*/ /* X - DOUBLE PRECISION array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ /* tranformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ --x; --ap; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, ( ftnlen)1)) { info = 2; } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1)) { info = 3; } else if (*n < 0) { info = 4; } else if (*incx == 0) { info = 7; } if (info != 0) { PASTEF770(xerbla)("DTPMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of AP are */ /* accessed sequentially with one pass through AP. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form x:= A*x. 
*/ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[j] != 0.) { temp = x[j]; k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { x[i__] += temp * ap[k]; ++k; /* L10: */ } if (nounit) { x[j] *= ap[kk + j - 1]; } } kk += j; /* L20: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.) { temp = x[jx]; ix = kx; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { x[ix] += temp * ap[k]; ix += *incx; /* L30: */ } if (nounit) { x[jx] *= ap[kk + j - 1]; } } jx += *incx; kk += j; /* L40: */ } } } else { kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { if (x[j] != 0.) { temp = x[j]; k = kk; i__1 = j + 1; for (i__ = *n; i__ >= i__1; --i__) { x[i__] += temp * ap[k]; --k; /* L50: */ } if (nounit) { x[j] *= ap[kk - *n + j]; } } kk -= *n - j + 1; /* L60: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { if (x[jx] != 0.) { temp = x[jx]; ix = kx; i__1 = kk - (*n - (j + 1)); for (k = kk; k >= i__1; --k) { x[ix] += temp * ap[k]; ix -= *incx; /* L70: */ } if (nounit) { x[jx] *= ap[kk - *n + j]; } } jx -= *incx; kk -= *n - j + 1; /* L80: */ } } } } else { /* Form x := A'*x. 
*/ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { temp = x[j]; if (nounit) { temp *= ap[kk]; } k = kk - 1; for (i__ = j - 1; i__ >= 1; --i__) { temp += ap[k] * x[i__]; --k; /* L90: */ } x[j] = temp; kk -= j; /* L100: */ } } else { jx = kx + (*n - 1) * *incx; for (j = *n; j >= 1; --j) { temp = x[jx]; ix = jx; if (nounit) { temp *= ap[kk]; } i__1 = kk - j + 1; for (k = kk - 1; k >= i__1; --k) { ix -= *incx; temp += ap[k] * x[ix]; /* L110: */ } x[jx] = temp; jx -= *incx; kk -= j; /* L120: */ } } } else { kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = x[j]; if (nounit) { temp *= ap[kk]; } k = kk + 1; i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { temp += ap[k] * x[i__]; ++k; /* L130: */ } x[j] = temp; kk += *n - j + 1; /* L140: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = x[jx]; ix = jx; if (nounit) { temp *= ap[kk]; } i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; temp += ap[k] * x[ix]; /* L150: */ } x[jx] = temp; jx += *incx; kk += *n - j + 1; /* L160: */ } } } } return 0; /* End of DTPMV . */ } /* dtpmv_ */ /* stpmv.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(s,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx) { /* System generated locals */ bla_integer i__1, i__2; /* Local variables */ bla_integer info; bla_real temp; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical nounit; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. 
*/ /* Purpose */ /* ======= */ /* STPMV performs one of the matrix-vector operations */ /* x := A*x, or x := A'*x, */ /* where x is an n element vector and A is an n by n unit, or non-unit, */ /* upper or lower triangular matrix, supplied in packed form. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the operation to be performed as */ /* follows: */ /* TRANS = 'N' or 'n' x := A*x. */ /* TRANS = 'T' or 't' x := A'*x. */ /* TRANS = 'C' or 'c' x := A'*x. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. */ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* AP - REAL array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) */ /* respectively, and so on. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) */ /* respectively, and so on. */ /* Note that when DIAG = 'U' or 'u', the diagonal elements of */ /* A are not referenced, but are assumed to be unity. */ /* Unchanged on exit. 
*/ /* X - REAL array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ /* tranformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ --x; --ap; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, ( ftnlen)1)) { info = 2; } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1)) { info = 3; } else if (*n < 0) { info = 4; } else if (*incx == 0) { info = 7; } if (info != 0) { PASTEF770(xerbla)("STPMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of AP are */ /* accessed sequentially with one pass through AP. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form x:= A*x. 
*/ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[j] != 0.f) { temp = x[j]; k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { x[i__] += temp * ap[k]; ++k; /* L10: */ } if (nounit) { x[j] *= ap[kk + j - 1]; } } kk += j; /* L20: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.f) { temp = x[jx]; ix = kx; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { x[ix] += temp * ap[k]; ix += *incx; /* L30: */ } if (nounit) { x[jx] *= ap[kk + j - 1]; } } jx += *incx; kk += j; /* L40: */ } } } else { kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { if (x[j] != 0.f) { temp = x[j]; k = kk; i__1 = j + 1; for (i__ = *n; i__ >= i__1; --i__) { x[i__] += temp * ap[k]; --k; /* L50: */ } if (nounit) { x[j] *= ap[kk - *n + j]; } } kk -= *n - j + 1; /* L60: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { if (x[jx] != 0.f) { temp = x[jx]; ix = kx; i__1 = kk - (*n - (j + 1)); for (k = kk; k >= i__1; --k) { x[ix] += temp * ap[k]; ix -= *incx; /* L70: */ } if (nounit) { x[jx] *= ap[kk - *n + j]; } } jx -= *incx; kk -= *n - j + 1; /* L80: */ } } } } else { /* Form x := A'*x. 
*/ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { temp = x[j]; if (nounit) { temp *= ap[kk]; } k = kk - 1; for (i__ = j - 1; i__ >= 1; --i__) { temp += ap[k] * x[i__]; --k; /* L90: */ } x[j] = temp; kk -= j; /* L100: */ } } else { jx = kx + (*n - 1) * *incx; for (j = *n; j >= 1; --j) { temp = x[jx]; ix = jx; if (nounit) { temp *= ap[kk]; } i__1 = kk - j + 1; for (k = kk - 1; k >= i__1; --k) { ix -= *incx; temp += ap[k] * x[ix]; /* L110: */ } x[jx] = temp; jx -= *incx; kk -= j; /* L120: */ } } } else { kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = x[j]; if (nounit) { temp *= ap[kk]; } k = kk + 1; i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { temp += ap[k] * x[i__]; ++k; /* L130: */ } x[j] = temp; kk += *n - j + 1; /* L140: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = x[jx]; ix = jx; if (nounit) { temp *= ap[kk]; } i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; temp += ap[k] * x[ix]; /* L150: */ } x[jx] = temp; jx += *incx; kk += *n - j + 1; /* L160: */ } } } } return 0; /* End of STPMV . */ } /* stpmv_ */ /* ztpmv.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(z,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx) { /* System generated locals */ bla_integer i__1, i__2, i__3, i__4, i__5; bla_dcomplex z__1, z__2, z__3; /* Builtin functions */ //void bla_d_cnjg(bla_dcomplex *, bla_dcomplex *); /* Local variables */ bla_integer info; bla_dcomplex temp; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical noconj, nounit; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* ZTPMV performs one of the matrix-vector operations */ /* x := A*x, or x := A'*x, or x := conjg( A' )*x, */ /* where x is an n element vector and A is an n by n unit, or non-unit, */ /* upper or lower triangular matrix, supplied in packed form. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the operation to be performed as */ /* follows: */ /* TRANS = 'N' or 'n' x := A*x. */ /* TRANS = 'T' or 't' x := A'*x. */ /* TRANS = 'C' or 'c' x := conjg( A' )*x. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. */ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. 
*/ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* AP - COMPLEX*16 array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) */ /* respectively, and so on. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) */ /* respectively, and so on. */ /* Note that when DIAG = 'U' or 'u', the diagonal elements of */ /* A are not referenced, but are assumed to be unity. */ /* Unchanged on exit. */ /* X - COMPLEX*16 array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element vector x. On exit, X is overwritten with the */ /* tranformed vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ --x; --ap; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! 
PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, ( ftnlen)1)) { info = 2; } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1)) { info = 3; } else if (*n < 0) { info = 4; } else if (*incx == 0) { info = 7; } if (info != 0) { PASTEF770(xerbla)("ZTPMV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1); nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of AP are */ /* accessed sequentially with one pass through AP. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form x:= A*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) 
{ i__2 = j; bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp ); k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { i__3 = i__; i__4 = i__; i__5 = k; bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__5]) - bli_zimag(temp) * bli_zimag(ap[i__5])), (bli_zreal(temp) * bli_zimag(ap[i__5]) + bli_zimag(temp) * bli_zreal(ap[i__5])), z__2 ); bli_zsets( (bli_zreal(x[i__4]) + bli_zreal(z__2)), (bli_zimag(x[i__4]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] ); ++k; /* L10: */ } if (nounit) { i__2 = j; i__3 = j; i__4 = kk + j - 1; bli_zsets( (bli_zreal(x[i__3]) * bli_zreal(ap[i__4]) - bli_zimag(x[i__3]) * bli_zimag(ap[i__4])), (bli_zreal(x[i__3]) * bli_zimag(ap[i__4]) + bli_zimag(x[i__3]) * bli_zreal(ap[i__4])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] ); } } kk += j; /* L20: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) { i__2 = jx; bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp ); ix = kx; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { i__3 = ix; i__4 = ix; i__5 = k; bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__5]) - bli_zimag(temp) * bli_zimag(ap[i__5])), (bli_zreal(temp) * bli_zimag(ap[i__5]) + bli_zimag(temp) * bli_zreal(ap[i__5])), z__2 ); bli_zsets( (bli_zreal(x[i__4]) + bli_zreal(z__2)), (bli_zimag(x[i__4]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] ); ix += *incx; /* L30: */ } if (nounit) { i__2 = jx; i__3 = jx; i__4 = kk + j - 1; bli_zsets( (bli_zreal(x[i__3]) * bli_zreal(ap[i__4]) - bli_zimag(x[i__3]) * bli_zimag(ap[i__4])), (bli_zreal(x[i__3]) * bli_zimag(ap[i__4]) + bli_zimag(x[i__3]) * bli_zreal(ap[i__4])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] ); } } jx += *incx; kk += j; /* L40: */ } } } else { kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { i__1 = j; if (bli_zreal(x[i__1]) != 0. 
|| bli_zimag(x[i__1]) != 0.) { i__1 = j; bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp ); k = kk; i__1 = j + 1; for (i__ = *n; i__ >= i__1; --i__) { i__2 = i__; i__3 = i__; i__4 = k; bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__4]) - bli_zimag(temp) * bli_zimag(ap[i__4])), (bli_zreal(temp) * bli_zimag(ap[i__4]) + bli_zimag(temp) * bli_zreal(ap[i__4])), z__2 ); bli_zsets( (bli_zreal(x[i__3]) + bli_zreal(z__2)), (bli_zimag(x[i__3]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] ); --k; /* L50: */ } if (nounit) { i__1 = j; i__2 = j; i__3 = kk - *n + j; bli_zsets( (bli_zreal(x[i__2]) * bli_zreal(ap[i__3]) - bli_zimag(x[i__2]) * bli_zimag(ap[i__3])), (bli_zreal(x[i__2]) * bli_zimag(ap[i__3]) + bli_zimag(x[i__2]) * bli_zreal(ap[i__3])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] ); } } kk -= *n - j + 1; /* L60: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { i__1 = jx; if (bli_zreal(x[i__1]) != 0. || bli_zimag(x[i__1]) != 0.) { i__1 = jx; bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp ); ix = kx; i__1 = kk - (*n - (j + 1)); for (k = kk; k >= i__1; --k) { i__2 = ix; i__3 = ix; i__4 = k; bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__4]) - bli_zimag(temp) * bli_zimag(ap[i__4])), (bli_zreal(temp) * bli_zimag(ap[i__4]) + bli_zimag(temp) * bli_zreal(ap[i__4])), z__2 ); bli_zsets( (bli_zreal(x[i__3]) + bli_zreal(z__2)), (bli_zimag(x[i__3]) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] ); ix -= *incx; /* L70: */ } if (nounit) { i__1 = jx; i__2 = jx; i__3 = kk - *n + j; bli_zsets( (bli_zreal(x[i__2]) * bli_zreal(ap[i__3]) - bli_zimag(x[i__2]) * bli_zimag(ap[i__3])), (bli_zreal(x[i__2]) * bli_zimag(ap[i__3]) + bli_zimag(x[i__2]) * bli_zreal(ap[i__3])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] ); } } jx -= *incx; kk -= *n - j + 1; /* L80: */ } } } } else { /* Form x := A'*x or x := conjg( A' )*x. 
*/ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { i__1 = j; bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp ); k = kk - 1; if (noconj) { if (nounit) { i__1 = kk; bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__1]) - bli_zimag(temp) * bli_zimag(ap[i__1])), (bli_zreal(temp) * bli_zimag(ap[i__1]) + bli_zimag(temp) * bli_zreal(ap[i__1])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } for (i__ = j - 1; i__ >= 1; --i__) { i__1 = k; i__2 = i__; bli_zsets( (bli_zreal(ap[i__1]) * bli_zreal(x[i__2]) - bli_zimag(ap[i__1]) * bli_zimag(x[i__2])), (bli_zreal(ap[i__1]) * bli_zimag(x[i__2]) + bli_zimag(ap[i__1]) * bli_zreal(x[i__2])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); --k; /* L90: */ } } else { if (nounit) { bla_d_cnjg(&z__2, &ap[kk]); bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } for (i__ = j - 1; i__ >= 1; --i__) { bla_d_cnjg(&z__3, &ap[k]); i__1 = i__; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__1]) - bli_zimag(z__3) * bli_zimag(x[i__1])), (bli_zreal(z__3) * bli_zimag(x[i__1]) + bli_zimag(z__3) * bli_zreal(x[i__1])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); --k; /* L100: */ } } i__1 = j; bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__1] ); kk -= j; /* L110: */ } } else { jx = kx + (*n - 1) * *incx; for (j = *n; j >= 1; --j) { i__1 = jx; bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp ); ix = jx; if (noconj) { if (nounit) { i__1 = kk; bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__1]) - bli_zimag(temp) * bli_zimag(ap[i__1])), 
(bli_zreal(temp) * bli_zimag(ap[i__1]) + bli_zimag(temp) * bli_zreal(ap[i__1])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } i__1 = kk - j + 1; for (k = kk - 1; k >= i__1; --k) { ix -= *incx; i__2 = k; i__3 = ix; bli_zsets( (bli_zreal(ap[i__2]) * bli_zreal(x[i__3]) - bli_zimag(ap[i__2]) * bli_zimag(x[i__3])), (bli_zreal(ap[i__2]) * bli_zimag(x[i__3]) + bli_zimag(ap[i__2]) * bli_zreal(x[i__3])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); /* L120: */ } } else { if (nounit) { bla_d_cnjg(&z__2, &ap[kk]); bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } i__1 = kk - j + 1; for (k = kk - 1; k >= i__1; --k) { ix -= *incx; bla_d_cnjg(&z__3, &ap[k]); i__2 = ix; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); /* L130: */ } } i__1 = jx; bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__1] ); jx -= *incx; kk -= j; /* L140: */ } } } else { kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp ); k = kk + 1; if (noconj) { if (nounit) { i__2 = kk; bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__2]) - bli_zimag(temp) * bli_zimag(ap[i__2])), (bli_zreal(temp) * bli_zimag(ap[i__2]) + bli_zimag(temp) * bli_zreal(ap[i__2])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { i__3 = k; i__4 = i__; bli_zsets( (bli_zreal(ap[i__3]) * bli_zreal(x[i__4]) - 
bli_zimag(ap[i__3]) * bli_zimag(x[i__4])), (bli_zreal(ap[i__3]) * bli_zimag(x[i__4]) + bli_zimag(ap[i__3]) * bli_zreal(x[i__4])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ++k; /* L150: */ } } else { if (nounit) { bla_d_cnjg(&z__2, &ap[kk]); bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { bla_d_cnjg(&z__3, &ap[k]); i__3 = i__; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ++k; /* L160: */ } } i__2 = j; bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__2] ); kk += *n - j + 1; /* L170: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp ); ix = jx; if (noconj) { if (nounit) { i__2 = kk; bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__2]) - bli_zimag(temp) * bli_zimag(ap[i__2])), (bli_zreal(temp) * bli_zimag(ap[i__2]) + bli_zimag(temp) * bli_zreal(ap[i__2])), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; i__3 = k; i__4 = ix; bli_zsets( (bli_zreal(ap[i__3]) * bli_zreal(x[i__4]) - bli_zimag(ap[i__3]) * bli_zimag(x[i__4])), (bli_zreal(ap[i__3]) * bli_zimag(x[i__4]) + bli_zimag(ap[i__3]) * bli_zreal(x[i__4])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); /* L180: */ } } else { 
if (nounit) { bla_d_cnjg(&z__2, &ap[kk]); bli_zsets( (bli_zreal(temp) * bli_zreal(z__2) - bli_zimag(temp) * bli_zimag(z__2)), (bli_zreal(temp) * bli_zimag(z__2) + bli_zimag(temp) * bli_zreal(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; bla_d_cnjg(&z__3, &ap[k]); i__3 = ix; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 ); bli_zsets( (bli_zreal(temp) + bli_zreal(z__2)), (bli_zimag(temp) + bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); /* L190: */ } } i__2 = jx; bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__2] ); jx += *incx; kk += *n - j + 1; /* L200: */ } } } } return 0; /* End of ZTPMV . */ } /* ztpmv_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_tpmv.h000066400000000000000000000047421427272030600226510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx); BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx); BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx); BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_tpsv.c000066400000000000000000001471011427272030600226470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* ctpsv.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Subroutine */ int PASTEF77(c,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx) { /* System generated locals */ bla_integer i__1, i__2, i__3, i__4, i__5; bla_scomplex q__1, q__2, q__3; /* Builtin functions */ //void bla_c_div(bla_scomplex *, bla_scomplex *, bla_scomplex *), bla_r_cnjg(bla_scomplex *, bla_scomplex *); /* Local variables */ bla_integer info; bla_scomplex temp; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical noconj, nounit; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* CTPSV solves one of the systems of equations */ /* A*x = b, or A'*x = b, or conjg( A' )*x = b, */ /* where b and x are n element vectors and A is an n by n unit, or */ /* non-unit, upper or lower triangular matrix, supplied in packed form. */ /* No test for singularity or near-singularity is included in this */ /* routine. Such tests must be performed before calling this routine. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the equations to be solved as */ /* follows: */ /* TRANS = 'N' or 'n' A*x = b. */ /* TRANS = 'T' or 't' A'*x = b. */ /* TRANS = 'C' or 'c' conjg( A' )*x = b. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. 
*/ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* AP - COMPLEX array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) */ /* respectively, and so on. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) */ /* respectively, and so on. */ /* Note that when DIAG = 'U' or 'u', the diagonal elements of */ /* A are not referenced, but are assumed to be unity. */ /* Unchanged on exit. */ /* X - COMPLEX array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element right-hand side vector b. On exit, X is overwritten */ /* with the solution vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. 
*/ /* Parameter adjustments */ --x; --ap; /* Function Body */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, ( ftnlen)1)) { info = 2; } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1)) { info = 3; } else if (*n < 0) { info = 4; } else if (*incx == 0) { info = 7; } if (info != 0) { PASTEF770(xerbla)("CTPSV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1); nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of AP are */ /* accessed sequentially with one pass through AP. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form x := inv( A )*x. 
*/ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { i__1 = j; if (bli_creal(x[i__1]) != 0.f || bli_cimag(x[i__1]) != 0.f) { if (nounit) { i__1 = j; bla_c_div(&q__1, &x[j], &ap[kk]); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] ); } i__1 = j; bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp ); k = kk - 1; for (i__ = j - 1; i__ >= 1; --i__) { i__1 = i__; i__2 = i__; i__3 = k; bli_csets( (bli_creal(temp) * bli_creal(ap[i__3]) - bli_cimag(temp) * bli_cimag(ap[i__3])), (bli_creal(temp) * bli_cimag(ap[i__3]) + bli_cimag(temp) * bli_creal(ap[i__3])), q__2 ); bli_csets( (bli_creal(x[i__2]) - bli_creal(q__2)), (bli_cimag(x[i__2]) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] ); --k; /* L10: */ } } kk -= j; /* L20: */ } } else { jx = kx + (*n - 1) * *incx; for (j = *n; j >= 1; --j) { i__1 = jx; if (bli_creal(x[i__1]) != 0.f || bli_cimag(x[i__1]) != 0.f) { if (nounit) { i__1 = jx; bla_c_div(&q__1, &x[jx], &ap[kk]); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__1] ); } i__1 = jx; bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp ); ix = jx; i__1 = kk - j + 1; for (k = kk - 1; k >= i__1; --k) { ix -= *incx; i__2 = ix; i__3 = ix; i__4 = k; bli_csets( (bli_creal(temp) * bli_creal(ap[i__4]) - bli_cimag(temp) * bli_cimag(ap[i__4])), (bli_creal(temp) * bli_cimag(ap[i__4]) + bli_cimag(temp) * bli_creal(ap[i__4])), q__2 ); bli_csets( (bli_creal(x[i__3]) - bli_creal(q__2)), (bli_cimag(x[i__3]) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] ); /* L30: */ } } jx -= *incx; kk -= j; /* L40: */ } } } else { kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) { if (nounit) { i__2 = j; bla_c_div(&q__1, &x[j], &ap[kk]); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] ); } i__2 = j; bli_csets( 
(bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp ); k = kk + 1; i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { i__3 = i__; i__4 = i__; i__5 = k; bli_csets( (bli_creal(temp) * bli_creal(ap[i__5]) - bli_cimag(temp) * bli_cimag(ap[i__5])), (bli_creal(temp) * bli_cimag(ap[i__5]) + bli_cimag(temp) * bli_creal(ap[i__5])), q__2 ); bli_csets( (bli_creal(x[i__4]) - bli_creal(q__2)), (bli_cimag(x[i__4]) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] ); ++k; /* L50: */ } } kk += *n - j + 1; /* L60: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; if (bli_creal(x[i__2]) != 0.f || bli_cimag(x[i__2]) != 0.f) { if (nounit) { i__2 = jx; bla_c_div(&q__1, &x[jx], &ap[kk]); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__2] ); } i__2 = jx; bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp ); ix = jx; i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; i__3 = ix; i__4 = ix; i__5 = k; bli_csets( (bli_creal(temp) * bli_creal(ap[i__5]) - bli_cimag(temp) * bli_cimag(ap[i__5])), (bli_creal(temp) * bli_cimag(ap[i__5]) + bli_cimag(temp) * bli_creal(ap[i__5])), q__2 ); bli_csets( (bli_creal(x[i__4]) - bli_creal(q__2)), (bli_cimag(x[i__4]) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), x[i__3] ); /* L70: */ } } jx += *incx; kk += *n - j + 1; /* L80: */ } } } } else { /* Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. 
*/ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp ); k = kk; if (noconj) { i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { i__3 = k; i__4 = i__; bli_csets( (bli_creal(ap[i__3]) * bli_creal(x[i__4]) - bli_cimag(ap[i__3]) * bli_cimag(x[i__4])), (bli_creal(ap[i__3]) * bli_cimag(x[i__4]) + bli_cimag(ap[i__3]) * bli_creal(x[i__4])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ++k; /* L90: */ } if (nounit) { bla_c_div(&q__1, &temp, &ap[kk + j - 1]); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } } else { i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { bla_r_cnjg(&q__3, &ap[k]); i__3 = i__; bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ++k; /* L100: */ } if (nounit) { bla_r_cnjg(&q__2, &ap[kk + j - 1]); bla_c_div(&q__1, &temp, &q__2); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } } i__2 = j; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__2] ); kk += j; /* L110: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; bli_csets( (bli_creal(x[i__2])), (bli_cimag(x[i__2])), temp ); ix = kx; if (noconj) { i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { i__3 = k; i__4 = ix; bli_csets( (bli_creal(ap[i__3]) * bli_creal(x[i__4]) - bli_cimag(ap[i__3]) * bli_cimag(x[i__4])), (bli_creal(ap[i__3]) * bli_cimag(x[i__4]) + bli_cimag(ap[i__3]) * bli_creal(x[i__4])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), 
(bli_cimag(q__1)), temp ); ix += *incx; /* L120: */ } if (nounit) { bla_c_div(&q__1, &temp, &ap[kk + j - 1]); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } } else { i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { bla_r_cnjg(&q__3, &ap[k]); i__3 = ix; bli_csets( (bli_creal(q__3) * bli_creal(x[i__3]) - bli_cimag(q__3) * bli_cimag(x[i__3])), (bli_creal(q__3) * bli_cimag(x[i__3]) + bli_cimag(q__3) * bli_creal(x[i__3])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ix += *incx; /* L130: */ } if (nounit) { bla_r_cnjg(&q__2, &ap[kk + j - 1]); bla_c_div(&q__1, &temp, &q__2); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } } i__2 = jx; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__2] ); jx += *incx; kk += j; /* L140: */ } } } else { kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { i__1 = j; bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp ); k = kk; if (noconj) { i__1 = j + 1; for (i__ = *n; i__ >= i__1; --i__) { i__2 = k; i__3 = i__; bli_csets( (bli_creal(ap[i__2]) * bli_creal(x[i__3]) - bli_cimag(ap[i__2]) * bli_cimag(x[i__3])), (bli_creal(ap[i__2]) * bli_cimag(x[i__3]) + bli_cimag(ap[i__2]) * bli_creal(x[i__3])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); --k; /* L150: */ } if (nounit) { bla_c_div(&q__1, &temp, &ap[kk - *n + j]); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } } else { i__1 = j + 1; for (i__ = *n; i__ >= i__1; --i__) { bla_r_cnjg(&q__3, &ap[k]); i__2 = i__; bli_csets( (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), 
(bli_cimag(q__1)), temp ); --k; /* L160: */ } if (nounit) { bla_r_cnjg(&q__2, &ap[kk - *n + j]); bla_c_div(&q__1, &temp, &q__2); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } } i__1 = j; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__1] ); kk -= *n - j + 1; /* L170: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { i__1 = jx; bli_csets( (bli_creal(x[i__1])), (bli_cimag(x[i__1])), temp ); ix = kx; if (noconj) { i__1 = kk - (*n - (j + 1)); for (k = kk; k >= i__1; --k) { i__2 = k; i__3 = ix; bli_csets( (bli_creal(ap[i__2]) * bli_creal(x[i__3]) - bli_cimag(ap[i__2]) * bli_cimag(x[i__3])), (bli_creal(ap[i__2]) * bli_cimag(x[i__3]) + bli_cimag(ap[i__2]) * bli_creal(x[i__3])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ix -= *incx; /* L180: */ } if (nounit) { bla_c_div(&q__1, &temp, &ap[kk - *n + j]); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } } else { i__1 = kk - (*n - (j + 1)); for (k = kk; k >= i__1; --k) { bla_r_cnjg(&q__3, &ap[k]); i__2 = ix; bli_csets( (bli_creal(q__3) * bli_creal(x[i__2]) - bli_cimag(q__3) * bli_cimag(x[i__2])), (bli_creal(q__3) * bli_cimag(x[i__2]) + bli_cimag(q__3) * bli_creal(x[i__2])), q__2 ); bli_csets( (bli_creal(temp) - bli_creal(q__2)), (bli_cimag(temp) - bli_cimag(q__2)), q__1 ); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); ix -= *incx; /* L190: */ } if (nounit) { bla_r_cnjg(&q__2, &ap[kk - *n + j]); bla_c_div(&q__1, &temp, &q__2); bli_csets( (bli_creal(q__1)), (bli_cimag(q__1)), temp ); } } i__1 = jx; bli_csets( (bli_creal(temp)), (bli_cimag(temp)), x[i__1] ); jx -= *incx; kk -= *n - j + 1; /* L200: */ } } } } return 0; /* End of CTPSV . */ } /* ctpsv_ */ /* dtpsv.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* dtpsv: double-precision packed triangular solve. Overwrites x with the solution of A*x = b or A'*x = b, where the triangular matrix A is supplied in packed form in ap. The uplo/trans/diag/n/incx arguments are validated first; the first invalid one is reported through xerbla and the routine returns without touching x. Every return path returns 0. */ /* Subroutine */ int PASTEF77(d,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx) { /* System generated locals */ bla_integer i__1, i__2; /* Local variables */ bla_integer info; bla_double temp; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical nounit; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* DTPSV solves one of the systems of equations */ /* A*x = b, or A'*x = b, */ /* where b and x are n element vectors and A is an n by n unit, or */ /* non-unit, upper or lower triangular matrix, supplied in packed form. */ /* No test for singularity or near-singularity is included in this */ /* routine. Such tests must be performed before calling this routine. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the equations to be solved as */ /* follows: */ /* TRANS = 'N' or 'n' A*x = b. */ /* TRANS = 'T' or 't' A'*x = b. */ /* TRANS = 'C' or 'c' A'*x = b. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. */ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. 
*/ /* N must be at least zero. */ /* Unchanged on exit. */ /* AP - DOUBLE PRECISION array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) */ /* respectively, and so on. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) */ /* respectively, and so on. */ /* Note that when DIAG = 'U' or 'u', the diagonal elements of */ /* A are not referenced, but are assumed to be unity. */ /* Unchanged on exit. */ /* X - DOUBLE PRECISION array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element right-hand side vector b. On exit, X is overwritten */ /* with the solution vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ /* Stepping the local copies of both pointers back by one element lets the loops below keep the 1-based indices of the Fortran original (x[1]..x[n], ap[1]..ap[n*(n+1)/2]). */ --x; --ap; /* Function Body */ /* info receives the 1-based position, in the argument list, of the first invalid argument (incx is argument 7 because ap and x are counted). */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) && ! 
PASTEF770(lsame)(trans, "C", (ftnlen)1, ( ftnlen)1)) { info = 2; } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1)) { info = 3; } else if (*n < 0) { info = 4; } else if (*incx == 0) { info = 7; } if (info != 0) { PASTEF770(xerbla)("DTPSV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ /* incx == 0 was rejected above, so the first branch is only taken for negative increments; kx keeps its initial value 0 when incx == 1, where it is not used. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of AP are */ /* accessed sequentially with one pass through AP. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form x := inv( A )*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Upper storage, back substitution: kk starts at the last packed element, the diagonal entry of column n, and steps back by j to reach the diagonal of column j-1. */ kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { if (x[j] != 0.) { if (nounit) { x[j] /= ap[kk]; } temp = x[j]; k = kk - 1; for (i__ = j - 1; i__ >= 1; --i__) { x[i__] -= temp * ap[k]; --k; /* L10: */ } } kk -= j; /* L20: */ } } else { jx = kx + (*n - 1) * *incx; for (j = *n; j >= 1; --j) { if (x[jx] != 0.) { if (nounit) { x[jx] /= ap[kk]; } temp = x[jx]; ix = jx; i__1 = kk - j + 1; for (k = kk - 1; k >= i__1; --k) { ix -= *incx; x[ix] -= temp * ap[k]; /* L30: */ } } jx -= *incx; kk -= j; /* L40: */ } } } else { /* Lower storage, forward substitution: kk indexes the diagonal entry of column j, and column j holds n-j+1 packed entries. */ kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[j] != 0.) { if (nounit) { x[j] /= ap[kk]; } temp = x[j]; k = kk + 1; i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { x[i__] -= temp * ap[k]; ++k; /* L50: */ } } kk += *n - j + 1; /* L60: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.) 
{ if (nounit) { x[jx] /= ap[kk]; } temp = x[jx]; ix = jx; i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; x[ix] -= temp * ap[k]; /* L70: */ } } jx += *incx; kk += *n - j + 1; /* L80: */ } } } } else { /* Form x := inv( A' )*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Upper storage: kk indexes the first packed entry of column j, whose diagonal entry is ap[kk + j - 1]. */ kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = x[j]; k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { temp -= ap[k] * x[i__]; ++k; /* L90: */ } if (nounit) { temp /= ap[kk + j - 1]; } x[j] = temp; kk += j; /* L100: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = x[jx]; ix = kx; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { temp -= ap[k] * x[ix]; ix += *incx; /* L110: */ } if (nounit) { temp /= ap[kk + j - 1]; } x[jx] = temp; jx += *incx; kk += j; /* L120: */ } } } else { /* Lower storage: kk indexes the last packed entry of column j, whose diagonal entry is ap[kk - *n + j]. */ kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { temp = x[j]; k = kk; i__1 = j + 1; for (i__ = *n; i__ >= i__1; --i__) { temp -= ap[k] * x[i__]; --k; /* L130: */ } if (nounit) { temp /= ap[kk - *n + j]; } x[j] = temp; kk -= *n - j + 1; /* L140: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { temp = x[jx]; ix = kx; i__1 = kk - (*n - (j + 1)); for (k = kk; k >= i__1; --k) { temp -= ap[k] * x[ix]; ix -= *incx; /* L150: */ } if (nounit) { temp /= ap[kk - *n + j]; } x[jx] = temp; jx -= *incx; kk -= *n - j + 1; /* L160: */ } } } } return 0; /* End of DTPSV . */ } /* dtpsv_ */ /* stpsv.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* stpsv: single-precision packed triangular solve. Overwrites x with the solution of A*x = b or A'*x = b, where the triangular matrix A is supplied in packed form in ap. The uplo/trans/diag/n/incx arguments are validated first; the first invalid one is reported through xerbla and the routine returns without touching x. Every return path returns 0. */ /* Subroutine */ int PASTEF77(s,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx) { /* System generated locals */ bla_integer i__1, i__2; /* Local variables */ bla_integer info; bla_real temp; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical nounit; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* STPSV solves one of the systems of equations */ /* A*x = b, or A'*x = b, */ /* where b and x are n element vectors and A is an n by n unit, or */ /* non-unit, upper or lower triangular matrix, supplied in packed form. */ /* No test for singularity or near-singularity is included in this */ /* routine. Such tests must be performed before calling this routine. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the equations to be solved as */ /* follows: */ /* TRANS = 'N' or 'n' A*x = b. */ /* TRANS = 'T' or 't' A'*x = b. */ /* TRANS = 'C' or 'c' A'*x = b. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. */ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. 
*/ /* N must be at least zero. */ /* Unchanged on exit. */ /* AP - REAL array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) */ /* respectively, and so on. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) */ /* respectively, and so on. */ /* Note that when DIAG = 'U' or 'u', the diagonal elements of */ /* A are not referenced, but are assumed to be unity. */ /* Unchanged on exit. */ /* X - REAL array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element right-hand side vector b. On exit, X is overwritten */ /* with the solution vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. */ /* Parameter adjustments */ /* Stepping the local copies of both pointers back by one element lets the loops below keep the 1-based indices of the Fortran original (x[1]..x[n], ap[1]..ap[n*(n+1)/2]). */ --x; --ap; /* Function Body */ /* info receives the 1-based position, in the argument list, of the first invalid argument (incx is argument 7 because ap and x are counted). */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) && ! 
PASTEF770(lsame)(trans, "C", (ftnlen)1, ( ftnlen)1)) { info = 2; } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1)) { info = 3; } else if (*n < 0) { info = 4; } else if (*incx == 0) { info = 7; } if (info != 0) { PASTEF770(xerbla)("STPSV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ /* incx == 0 was rejected above, so the first branch is only taken for negative increments; kx keeps its initial value 0 when incx == 1, where it is not used. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of AP are */ /* accessed sequentially with one pass through AP. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form x := inv( A )*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Upper storage, back substitution: kk starts at the last packed element, the diagonal entry of column n, and steps back by j to reach the diagonal of column j-1. */ kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { if (x[j] != 0.f) { if (nounit) { x[j] /= ap[kk]; } temp = x[j]; k = kk - 1; for (i__ = j - 1; i__ >= 1; --i__) { x[i__] -= temp * ap[k]; --k; /* L10: */ } } kk -= j; /* L20: */ } } else { jx = kx + (*n - 1) * *incx; for (j = *n; j >= 1; --j) { if (x[jx] != 0.f) { if (nounit) { x[jx] /= ap[kk]; } temp = x[jx]; ix = jx; i__1 = kk - j + 1; for (k = kk - 1; k >= i__1; --k) { ix -= *incx; x[ix] -= temp * ap[k]; /* L30: */ } } jx -= *incx; kk -= j; /* L40: */ } } } else { /* Lower storage, forward substitution: kk indexes the diagonal entry of column j, and column j holds n-j+1 packed entries. */ kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[j] != 0.f) { if (nounit) { x[j] /= ap[kk]; } temp = x[j]; k = kk + 1; i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { x[i__] -= temp * ap[k]; ++k; /* L50: */ } } kk += *n - j + 1; /* L60: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.f) { if (nounit) { x[jx] /= ap[kk]; } temp = x[jx]; ix = jx; i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; x[ix] -= temp * ap[k]; /* L70: */ } } jx += *incx; 
kk += *n - j + 1; /* L80: */ } } } } else { /* Form x := inv( A' )*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Upper storage: kk indexes the first packed entry of column j, whose diagonal entry is ap[kk + j - 1]. */ kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = x[j]; k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { temp -= ap[k] * x[i__]; ++k; /* L90: */ } if (nounit) { temp /= ap[kk + j - 1]; } x[j] = temp; kk += j; /* L100: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = x[jx]; ix = kx; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { temp -= ap[k] * x[ix]; ix += *incx; /* L110: */ } if (nounit) { temp /= ap[kk + j - 1]; } x[jx] = temp; jx += *incx; kk += j; /* L120: */ } } } else { /* Lower storage: kk indexes the last packed entry of column j, whose diagonal entry is ap[kk - *n + j]. */ kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { temp = x[j]; k = kk; i__1 = j + 1; for (i__ = *n; i__ >= i__1; --i__) { temp -= ap[k] * x[i__]; --k; /* L130: */ } if (nounit) { temp /= ap[kk - *n + j]; } x[j] = temp; kk -= *n - j + 1; /* L140: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { temp = x[jx]; ix = kx; i__1 = kk - (*n - (j + 1)); for (k = kk; k >= i__1; --k) { temp -= ap[k] * x[ix]; ix -= *incx; /* L150: */ } if (nounit) { temp /= ap[kk - *n + j]; } x[jx] = temp; jx -= *incx; kk -= *n - j + 1; /* L160: */ } } } } return 0; /* End of STPSV . */ } /* stpsv_ */ /* ztpsv.f -- translated by f2c (version 19991025). 
You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* ztpsv: double-precision complex packed triangular solve. Overwrites x with the solution of A*x = b, A'*x = b or conjg( A' )*x = b, where the triangular matrix A is supplied in packed form in ap. The uplo/trans/diag/n/incx arguments are validated first; the first invalid one is reported through xerbla and the routine returns without touching x. Every return path returns 0. Complex products are expanded into real/imaginary parts with bli_zreal/bli_zimag and stored with bli_zsets; divisions go through bla_z_div and conjugation through bla_d_cnjg. */ /* Subroutine */ int PASTEF77(z,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx) { /* System generated locals */ bla_integer i__1, i__2, i__3, i__4, i__5; bla_dcomplex z__1, z__2, z__3; /* Builtin functions */ //void bla_z_div(bla_dcomplex *, bla_dcomplex *, bla_dcomplex *), bla_d_cnjg( // bla_dcomplex *, bla_dcomplex *); /* Local variables */ bla_integer info; bla_dcomplex temp; bla_integer i__, j, k; //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen); bla_integer kk, ix, jx, kx = 0; //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen); bla_logical noconj, nounit; /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* ZTPSV solves one of the systems of equations */ /* A*x = b, or A'*x = b, or conjg( A' )*x = b, */ /* where b and x are n element vectors and A is an n by n unit, or */ /* non-unit, upper or lower triangular matrix, supplied in packed form. */ /* No test for singularity or near-singularity is included in this */ /* routine. Such tests must be performed before calling this routine. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the equations to be solved as */ /* follows: */ /* TRANS = 'N' or 'n' A*x = b. */ /* TRANS = 'T' or 't' A'*x = b. */ /* TRANS = 'C' or 'c' conjg( A' )*x = b. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. 
*/ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* AP - COMPLEX*16 array of DIMENSION at least */ /* ( ( n*( n + 1 ) )/2 ). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) */ /* respectively, and so on. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) */ /* respectively, and so on. */ /* Note that when DIAG = 'U' or 'u', the diagonal elements of */ /* A are not referenced, but are assumed to be unity. */ /* Unchanged on exit. */ /* X - COMPLEX*16 array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element right-hand side vector b. On exit, X is overwritten */ /* with the solution vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* Level 2 Blas routine. */ /* -- Written on 22-October-1986. */ /* Jack Dongarra, Argonne National Lab. */ /* Jeremy Du Croz, Nag Central Office. */ /* Sven Hammarling, Nag Central Office. */ /* Richard Hanson, Sandia National Labs. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* .. Intrinsic Functions .. */ /* .. */ /* .. Executable Statements .. */ /* Test the input parameters. 
*/ /* Parameter adjustments */ /* Stepping the local copies of both pointers back by one element lets the loops below keep the 1-based indices of the Fortran original (x[1]..x[n], ap[1]..ap[n*(n+1)/2]). */ --x; --ap; /* Function Body */ /* info receives the 1-based position, in the argument list, of the first invalid argument (incx is argument 7 because ap and x are counted). */ info = 0; if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", ( ftnlen)1, (ftnlen)1)) { info = 1; } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, ( ftnlen)1)) { info = 2; } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1)) { info = 3; } else if (*n < 0) { info = 4; } else if (*incx == 0) { info = 7; } if (info != 0) { PASTEF770(xerbla)("ZTPSV ", &info, (ftnlen)6); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } /* noconj is true only for TRANS = 'T'; it is consulted only in the transposed branch below, where TRANS = 'C' takes the loops that conjugate each ap entry. */ noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1); nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ /* incx == 0 was rejected above, so the first branch is only taken for negative increments; kx keeps its initial value 0 when incx == 1, where it is not used. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of AP are */ /* accessed sequentially with one pass through AP. */ if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) { /* Form x := inv( A )*x. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { /* Upper storage, back substitution: kk starts at the last packed element, the diagonal entry of column n, and steps back by j to reach the diagonal of column j-1. */ kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { i__1 = j; if (bli_zreal(x[i__1]) != 0. || bli_zimag(x[i__1]) != 0.) 
{ if (nounit) { i__1 = j; bla_z_div(&z__1, &x[j], &ap[kk]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] ); } i__1 = j; bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp ); k = kk - 1; for (i__ = j - 1; i__ >= 1; --i__) { i__1 = i__; i__2 = i__; i__3 = k; bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__3]) - bli_zimag(temp) * bli_zimag(ap[i__3])), (bli_zreal(temp) * bli_zimag(ap[i__3]) + bli_zimag(temp) * bli_zreal(ap[i__3])), z__2 ); bli_zsets( (bli_zreal(x[i__2]) - bli_zreal(z__2)), (bli_zimag(x[i__2]) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] ); --k; /* L10: */ } } kk -= j; /* L20: */ } } else { jx = kx + (*n - 1) * *incx; for (j = *n; j >= 1; --j) { i__1 = jx; if (bli_zreal(x[i__1]) != 0. || bli_zimag(x[i__1]) != 0.) { if (nounit) { i__1 = jx; bla_z_div(&z__1, &x[jx], &ap[kk]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__1] ); } i__1 = jx; bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp ); ix = jx; i__1 = kk - j + 1; for (k = kk - 1; k >= i__1; --k) { ix -= *incx; i__2 = ix; i__3 = ix; i__4 = k; bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__4]) - bli_zimag(temp) * bli_zimag(ap[i__4])), (bli_zreal(temp) * bli_zimag(ap[i__4]) + bli_zimag(temp) * bli_zreal(ap[i__4])), z__2 ); bli_zsets( (bli_zreal(x[i__3]) - bli_zreal(z__2)), (bli_zimag(x[i__3]) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] ); /* L30: */ } } jx -= *incx; kk -= j; /* L40: */ } } } else { /* Lower storage, forward substitution: kk indexes the diagonal entry of column j, and column j holds n-j+1 packed entries. */ kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) 
{ if (nounit) { i__2 = j; bla_z_div(&z__1, &x[j], &ap[kk]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] ); } i__2 = j; bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp ); k = kk + 1; i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { i__3 = i__; i__4 = i__; i__5 = k; bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__5]) - bli_zimag(temp) * bli_zimag(ap[i__5])), (bli_zreal(temp) * bli_zimag(ap[i__5]) + bli_zimag(temp) * bli_zreal(ap[i__5])), z__2 ); bli_zsets( (bli_zreal(x[i__4]) - bli_zreal(z__2)), (bli_zimag(x[i__4]) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] ); ++k; /* L50: */ } } kk += *n - j + 1; /* L60: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; if (bli_zreal(x[i__2]) != 0. || bli_zimag(x[i__2]) != 0.) { if (nounit) { i__2 = jx; bla_z_div(&z__1, &x[jx], &ap[kk]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__2] ); } i__2 = jx; bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp ); ix = jx; i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; i__3 = ix; i__4 = ix; i__5 = k; bli_zsets( (bli_zreal(temp) * bli_zreal(ap[i__5]) - bli_zimag(temp) * bli_zimag(ap[i__5])), (bli_zreal(temp) * bli_zimag(ap[i__5]) + bli_zimag(temp) * bli_zreal(ap[i__5])), z__2 ); bli_zsets( (bli_zreal(x[i__4]) - bli_zreal(z__2)), (bli_zimag(x[i__4]) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), x[i__3] ); /* L70: */ } } jx += *incx; kk += *n - j + 1; /* L80: */ } } } } else { /* Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. 
*/ /* In the upper-storage case kk indexes the first packed entry of column j, whose diagonal entry is ap[kk + j - 1]; in the lower-storage case kk indexes the last packed entry of column j, whose diagonal entry is ap[kk - *n + j]. */ if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) { kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = j; bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp ); k = kk; if (noconj) { i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { i__3 = k; i__4 = i__; bli_zsets( (bli_zreal(ap[i__3]) * bli_zreal(x[i__4]) - bli_zimag(ap[i__3]) * bli_zimag(x[i__4])), (bli_zreal(ap[i__3]) * bli_zimag(x[i__4]) + bli_zimag(ap[i__3]) * bli_zreal(x[i__4])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ++k; /* L90: */ } if (nounit) { bla_z_div(&z__1, &temp, &ap[kk + j - 1]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } else { i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { bla_d_cnjg(&z__3, &ap[k]); i__3 = i__; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ++k; /* L100: */ } if (nounit) { bla_d_cnjg(&z__2, &ap[kk + j - 1]); bla_z_div(&z__1, &temp, &z__2); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } i__2 = j; bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__2] ); kk += j; /* L110: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = jx; bli_zsets( (bli_zreal(x[i__2])), (bli_zimag(x[i__2])), temp ); ix = kx; if (noconj) { i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { i__3 = k; i__4 = ix; bli_zsets( (bli_zreal(ap[i__3]) * bli_zreal(x[i__4]) - bli_zimag(ap[i__3]) * bli_zimag(x[i__4])), (bli_zreal(ap[i__3]) * bli_zimag(x[i__4]) + bli_zimag(ap[i__3]) * bli_zreal(x[i__4])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), 
(bli_zimag(z__1)), temp ); ix += *incx; /* L120: */ } if (nounit) { bla_z_div(&z__1, &temp, &ap[kk + j - 1]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } else { i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { bla_d_cnjg(&z__3, &ap[k]); i__3 = ix; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__3]) - bli_zimag(z__3) * bli_zimag(x[i__3])), (bli_zreal(z__3) * bli_zimag(x[i__3]) + bli_zimag(z__3) * bli_zreal(x[i__3])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ix += *incx; /* L130: */ } if (nounit) { bla_d_cnjg(&z__2, &ap[kk + j - 1]); bla_z_div(&z__1, &temp, &z__2); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } i__2 = jx; bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__2] ); jx += *incx; kk += j; /* L140: */ } } } else { kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { i__1 = j; bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp ); k = kk; if (noconj) { i__1 = j + 1; for (i__ = *n; i__ >= i__1; --i__) { i__2 = k; i__3 = i__; bli_zsets( (bli_zreal(ap[i__2]) * bli_zreal(x[i__3]) - bli_zimag(ap[i__2]) * bli_zimag(x[i__3])), (bli_zreal(ap[i__2]) * bli_zimag(x[i__3]) + bli_zimag(ap[i__2]) * bli_zreal(x[i__3])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); --k; /* L150: */ } if (nounit) { bla_z_div(&z__1, &temp, &ap[kk - *n + j]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } else { i__1 = j + 1; for (i__ = *n; i__ >= i__1; --i__) { bla_d_cnjg(&z__3, &ap[k]); i__2 = i__; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), 
(bli_zimag(z__1)), temp ); --k; /* L160: */ } if (nounit) { bla_d_cnjg(&z__2, &ap[kk - *n + j]); bla_z_div(&z__1, &temp, &z__2); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } i__1 = j; bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__1] ); kk -= *n - j + 1; /* L170: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { i__1 = jx; bli_zsets( (bli_zreal(x[i__1])), (bli_zimag(x[i__1])), temp ); ix = kx; if (noconj) { i__1 = kk - (*n - (j + 1)); for (k = kk; k >= i__1; --k) { i__2 = k; i__3 = ix; bli_zsets( (bli_zreal(ap[i__2]) * bli_zreal(x[i__3]) - bli_zimag(ap[i__2]) * bli_zimag(x[i__3])), (bli_zreal(ap[i__2]) * bli_zimag(x[i__3]) + bli_zimag(ap[i__2]) * bli_zreal(x[i__3])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ix -= *incx; /* L180: */ } if (nounit) { bla_z_div(&z__1, &temp, &ap[kk - *n + j]); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } else { i__1 = kk - (*n - (j + 1)); for (k = kk; k >= i__1; --k) { bla_d_cnjg(&z__3, &ap[k]); i__2 = ix; bli_zsets( (bli_zreal(z__3) * bli_zreal(x[i__2]) - bli_zimag(z__3) * bli_zimag(x[i__2])), (bli_zreal(z__3) * bli_zimag(x[i__2]) + bli_zimag(z__3) * bli_zreal(x[i__2])), z__2 ); bli_zsets( (bli_zreal(temp) - bli_zreal(z__2)), (bli_zimag(temp) - bli_zimag(z__2)), z__1 ); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); ix -= *incx; /* L190: */ } if (nounit) { bla_d_cnjg(&z__2, &ap[kk - *n + j]); bla_z_div(&z__1, &temp, &z__2); bli_zsets( (bli_zreal(z__1)), (bli_zimag(z__1)), temp ); } } i__1 = jx; bli_zsets( (bli_zreal(temp)), (bli_zimag(temp)), x[i__1] ); jx -= *incx; kk -= *n - j + 1; /* L200: */ } } } } return 0; /* End of ZTPSV . 
*/ } /* ztpsv_ */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_tpsv.h000066400000000000000000000047421427272030600226570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* Prototypes of the packed triangular solve (tpsv) BLAS-compatibility routines, one per datatype prefix (c, d, s, z). They are declared only when BLIS_ENABLE_BLAS is defined; all share the same argument order (uplo, trans, diag, n, ap, x, incx), differ only in the element type of ap and x, and return int. */ #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx); BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx); BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx); BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_xerbla.c000066400000000000000000000062171427272030600231320ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS /* xerbla.f -- translated by f2c (version 19991025). You must link the resulting object file with the libraries: -lf2c -lm (in that order) */ /* Table of constant values */ /* Subroutine */ int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len) { /* -- LAPACK auxiliary routine (preliminary version) -- */ /* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., */ /* Courant Institute, Argonne National Lab, and Rice University */ /* February 29, 1992 */ /* .. Scalar Arguments .. */ /* .. */ /* Purpose */ /* ======= */ /* XERBLA is an error handler for the LAPACK routines. */ /* It is called by an LAPACK routine if an input parameter has an */ /* invalid value. A message is printed and execution stops. */ /* Installers may consider modifying the STOP statement in order to */ /* call system-specific exception-handling facilities. */ /* Arguments */ /* ========= */ /* SRNAME (input) CHARACTER*6 */ /* The name of the routine which called XERBLA. */ /* INFO (input) INTEGER */ /* The position of the invalid parameter in the parameter list */ /* of the calling routine. 
*/ //int i; //for ( i = 0; i < srname_len; ++i ) // srname[i] = toupper( srname[i] ); printf("** On entry to %6s, parameter number %2i had an illegal value\n", srname, (int)*info); //bli_abort(); /* End of XERBLA */ return 0; } /* xerbla */ #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_xerbla.h000066400000000000000000000034541427272030600231370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_xerbla_array.c000066400000000000000000000050471427272030600243300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #ifdef BLIS_ENABLE_BLAS #define MAX_NUM_CHARS 32 int PASTEF770(xerbla_array)(const bla_character *srname_array, const bla_integer srname_len, const bla_integer *info) { int i; #if 1 // 01234567890123456789012345678901 char srname[ MAX_NUM_CHARS + 1 ] = " "; #else char srname[ MAX_NUM_CHARS + 1 ]; // Initialize srname to contain blank characters. for ( i = 0; i < MAX_NUM_CHARS; ++i ) srname[i] = ' '; #endif // Compute the number of chars to copy as the minimum of the length of // srname_array and MAX_NUM_CHARS. const int n_copy = bli_min( srname_len, MAX_NUM_CHARS ); // Copy over each element of srname_array. for ( i = 0; i < n_copy; ++i ) { srname[i] = srname_array[i]; } // NULL terminate. srname[i] = '\0'; // Call xerbla_(). PASTEF770(xerbla)( srname, info, ( ftnlen )srname_len ); return 0; } #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/bla_xerbla_array.h000066400000000000000000000034541427272030600243350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/000077500000000000000000000000001427272030600216425ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_c_abs.c000066400000000000000000000034741427272030600237030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS double bla_c_abs(const bla_scomplex *z) { return( bla_f__cabs( bli_creal( *z ), bli_cimag( *z ) ) ); } #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_c_abs.h000066400000000000000000000033241427272030600237020ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS double bla_c_abs(const bla_scomplex *z); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_c_div.c000066400000000000000000000035101427272030600237070ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp) { bli_ccopys( *ap, *cp ); bli_cinvscals( *bp, *cp ); } #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_c_div.h000066400000000000000000000033751427272030600237250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_d_abs.c000066400000000000000000000034251427272030600237000ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS double bla_d_abs(const bla_double *x) { if(*x >= 0.0) return(*x); return(- *x); } #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_d_abs.h000066400000000000000000000033221427272030600237010ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS double bla_d_abs(const bla_double *x); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_d_cnjg.c000066400000000000000000000034341427272030600240540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src) { bli_zcopyjs( *src, *dest ); } #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_d_cnjg.h000066400000000000000000000033511427272030600240570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_d_imag.c000066400000000000000000000034041427272030600240450ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS double bla_d_imag(const bla_dcomplex *z) { return bli_zimag( *z ); } #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_d_imag.h000066400000000000000000000033251427272030600240540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS double bla_d_imag(const bla_dcomplex *z); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_d_sign.c000066400000000000000000000035011427272030600240660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS double bla_d_sign(const bla_double *a, const bla_double *b) { double x = (*a >= 0.0 ? *a : - *a); return(*b >= 0.0 ? x : -x); } #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_d_sign.h000066400000000000000000000033501427272030600240750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS double bla_d_sign(const bla_double *a, const bla_double *b); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_f__cabs.c000066400000000000000000000041041427272030600241770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS double bla_f__cabs(double real_val, double imag_val) { double temp; if(real_val < 0) real_val = -real_val; if(imag_val < 0) imag_val = -imag_val; if(imag_val > real_val) { temp = real_val; real_val = imag_val; imag_val = temp; } if((real_val+imag_val) == real_val) return(real_val); temp = imag_val/real_val; temp = real_val*sqrt(1.0 + temp*temp); return(temp); } #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_f__cabs.h000066400000000000000000000033311427272030600242050ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS double bla_f__cabs(double real, double imag); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_r_abs.c000066400000000000000000000034231427272030600237140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS double bla_r_abs(const bla_real *x) { if(*x >= 0.0) return(*x); return(- *x); } #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_r_abs.h000066400000000000000000000033201427272030600237150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS double bla_r_abs(const bla_real *x); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_r_cnjg.c000066400000000000000000000034341427272030600240720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src) { bli_ccopyjs( *src, *dest ); } #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_r_cnjg.h000066400000000000000000000033511427272030600240750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_r_imag.c000066400000000000000000000034061427272030600240650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS bla_real bla_r_imag(const bla_scomplex *z) { return bli_cimag( *z ); } #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_r_imag.h000066400000000000000000000033271427272030600240740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS bla_real bla_r_imag(const bla_scomplex *z); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_r_sign.c000066400000000000000000000034751427272030600241160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS double bla_r_sign(const bla_real *a, const bla_real *b) { double x = (*a >= 0.0 ? *a : - *a); return(*b >= 0.0 ? x : -x); } #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_r_sign.h000066400000000000000000000033441427272030600241160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS double bla_r_sign(const bla_real *a, const bla_real *b); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_z_abs.c000066400000000000000000000034741427272030600237320ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS double bla_z_abs(const bla_dcomplex *z) { return( bla_f__cabs( bli_zreal( *z ), bli_zimag( *z ) ) ); } #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_z_abs.h000066400000000000000000000033241427272030600237310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS double bla_z_abs(const bla_dcomplex *z); #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_z_div.c000066400000000000000000000035101427272030600237360ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_BLAS void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp) { bli_zcopys( *ap, *cp ); bli_zinvscals( *bp, *cp ); } #endif cython-blis-0.9.1/blis/_src/frame/compat/f2c/util/bla_z_div.h000066400000000000000000000033751427272030600237540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef BLIS_ENABLE_BLAS void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp); #endif cython-blis-0.9.1/blis/_src/frame/include/000077500000000000000000000000001427272030600203535ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/include/bli_arch_config.h000066400000000000000000000171741427272030600236260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_ARCH_CONFIG_H #define BLIS_ARCH_CONFIG_H // // -- Context initialization prototypes ---------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_CONFIG_SKX CNTX_INIT_PROTS( skx ) #endif #ifdef BLIS_CONFIG_KNL CNTX_INIT_PROTS( knl ) #endif #ifdef BLIS_CONFIG_KNC CNTX_INIT_PROTS( knc ) #endif #ifdef BLIS_CONFIG_HASWELL CNTX_INIT_PROTS( haswell ) #endif #ifdef BLIS_CONFIG_SANDYBRIDGE CNTX_INIT_PROTS( sandybridge ) #endif #ifdef BLIS_CONFIG_PENRYN CNTX_INIT_PROTS( penryn ) #endif // -- AMD64 architectures -- #ifdef BLIS_CONFIG_ZEN3 CNTX_INIT_PROTS( zen3 ) #endif #ifdef BLIS_CONFIG_ZEN2 CNTX_INIT_PROTS( zen2 ) #endif #ifdef BLIS_CONFIG_ZEN CNTX_INIT_PROTS( zen ) #endif #ifdef BLIS_CONFIG_EXCAVATOR CNTX_INIT_PROTS( excavator ) #endif #ifdef BLIS_CONFIG_STEAMROLLER CNTX_INIT_PROTS( steamroller ) #endif #ifdef BLIS_CONFIG_PILEDRIVER CNTX_INIT_PROTS( piledriver ) #endif #ifdef BLIS_CONFIG_BULLDOZER CNTX_INIT_PROTS( bulldozer ) #endif // -- ARM architectures -- #ifdef BLIS_CONFIG_ARMSVE 
CNTX_INIT_PROTS( armsve ) #endif #ifdef BLIS_CONFIG_A64FX CNTX_INIT_PROTS( a64fx ) #endif #ifdef BLIS_CONFIG_FIRESTORM CNTX_INIT_PROTS( firestorm ) #endif #ifdef BLIS_CONFIG_THUNDERX2 CNTX_INIT_PROTS( thunderx2 ) #endif #ifdef BLIS_CONFIG_CORTEXA57 CNTX_INIT_PROTS( cortexa57 ) #endif #ifdef BLIS_CONFIG_CORTEXA53 CNTX_INIT_PROTS( cortexa53 ) #endif #ifdef BLIS_CONFIG_CORTEXA15 CNTX_INIT_PROTS( cortexa15 ) #endif #ifdef BLIS_CONFIG_CORTEXA9 CNTX_INIT_PROTS( cortexa9 ) #endif // -- IBM Power -- #ifdef BLIS_CONFIG_POWER10 CNTX_INIT_PROTS( power10 ) #endif #ifdef BLIS_CONFIG_POWER9 CNTX_INIT_PROTS( power9 ) #endif #ifdef BLIS_CONFIG_POWER7 CNTX_INIT_PROTS( power7 ) #endif // -- IBM BG/Q -- #ifdef BLIS_CONFIG_BGQ CNTX_INIT_PROTS( bgq ) #endif // -- Generic -- #ifdef BLIS_CONFIG_GENERIC CNTX_INIT_PROTS( generic ) #endif // // -- Architecture family-specific headers ------------------------------------- // // -- x86_64 families -- #ifdef BLIS_FAMILY_INTEL64 #include "bli_family_intel64.h" #endif #ifdef BLIS_FAMILY_AMD64 #include "bli_family_amd64.h" #endif #ifdef BLIS_FAMILY_AMD64_LEGACY #include "bli_family_amd64_legacy.h" #endif #ifdef BLIS_FAMILY_X86_64 #include "bli_family_x86_64.h" #endif #ifdef BLIS_FAMILY_X86_64_NO_SKX #include "bli_family_x86_64_no_skx.h" #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN2 #include "bli_family_x86_64_no_zen2.h" #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN3 #include "bli_family_x86_64_no_zen3.h" #endif // -- Intel64 architectures -- #ifdef BLIS_FAMILY_SKX #include "bli_family_skx.h" #endif #ifdef BLIS_FAMILY_KNL #include "bli_family_knl.h" #endif #ifdef BLIS_FAMILY_KNC #include "bli_family_knc.h" #endif #ifdef BLIS_FAMILY_HASWELL #include "bli_family_haswell.h" #endif #ifdef BLIS_FAMILY_SANDYBRIDGE #include "bli_family_sandybridge.h" #endif #ifdef BLIS_FAMILY_PENRYN #include "bli_family_penryn.h" #endif // -- AMD64 architectures -- #ifdef BLIS_FAMILY_ZEN3 #include "bli_family_zen3.h" #endif #ifdef BLIS_FAMILY_ZEN2 #include "bli_family_zen2.h" 
#endif #ifdef BLIS_FAMILY_ZEN #include "bli_family_zen.h" #endif #ifdef BLIS_FAMILY_EXCAVATOR #include "bli_family_excavator.h" #endif #ifdef BLIS_FAMILY_STEAMROLLER #include "bli_family_steamroller.h" #endif #ifdef BLIS_FAMILY_PILEDRIVER #include "bli_family_piledriver.h" #endif #ifdef BLIS_FAMILY_BULLDOZER #include "bli_family_bulldozer.h" #endif // -- ARM families -- #ifdef BLIS_FAMILY_ARM64 #include "bli_family_arm64.h" #endif #ifdef BLIS_FAMILY_ARM32 #include "bli_family_arm32.h" #endif // -- ARM architectures -- #ifdef BLIS_FAMILY_ARMSVE #include "bli_family_armsve.h" #endif #ifdef BLIS_FAMILY_A64FX #include "bli_family_a64fx.h" #endif #ifdef BLIS_FAMILY_FIRESTORM #include "bli_family_firestorm.h" #endif #ifdef BLIS_FAMILY_THUNDERX2 #include "bli_family_thunderx2.h" #endif #ifdef BLIS_FAMILY_CORTEXA57 #include "bli_family_cortexa57.h" #endif #ifdef BLIS_FAMILY_CORTEXA53 #include "bli_family_cortexa53.h" #endif #ifdef BLIS_FAMILY_CORTEXA15 #include "bli_family_cortexa15.h" #endif #ifdef BLIS_FAMILY_CORTEXA9 #include "bli_family_cortexa9.h" #endif // -- IBM Power -- #ifdef BLIS_FAMILY_POWER10 #include "bli_family_power10.h" #endif #ifdef BLIS_FAMILY_POWER9 #include "bli_family_power9.h" #endif #ifdef BLIS_FAMILY_POWER7 #include "bli_family_power7.h" #endif // -- IBM BG/Q -- #ifdef BLIS_FAMILY_BGQ #include "bli_family_bgq.h" #endif // -- Generic -- #ifdef BLIS_FAMILY_GENERIC #include "bli_family_generic.h" #endif // // -- kernel set prototypes ---------------------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_KERNELS_SKX #include "bli_kernels_skx.h" #endif #ifdef BLIS_KERNELS_KNL #include "bli_kernels_knl.h" #endif #ifdef BLIS_KERNELS_KNC #include "bli_kernels_knc.h" #endif #ifdef BLIS_KERNELS_HASWELL #include "bli_kernels_haswell.h" #endif #ifdef BLIS_KERNELS_SANDYBRIDGE #include "bli_kernels_sandybridge.h" #endif #ifdef BLIS_KERNELS_PENRYN #include "bli_kernels_penryn.h" #endif // -- AMD64 architectures -- #ifdef 
BLIS_KERNELS_ZEN2 #include "bli_kernels_zen2.h" #endif #ifdef BLIS_KERNELS_ZEN #include "bli_kernels_zen.h" #endif //#ifdef BLIS_KERNELS_EXCAVATOR //#include "bli_kernels_excavator.h" //#endif //#ifdef BLIS_KERNELS_STEAMROLLER //#include "bli_kernels_steamroller.h" //#endif #ifdef BLIS_KERNELS_PILEDRIVER #include "bli_kernels_piledriver.h" #endif #ifdef BLIS_KERNELS_BULLDOZER #include "bli_kernels_bulldozer.h" #endif // -- ARM architectures -- #ifdef BLIS_KERNELS_ARMSVE #include "bli_kernels_armsve.h" #endif #ifdef BLIS_KERNELS_ARMV8A #include "bli_kernels_armv8a.h" #endif #ifdef BLIS_KERNELS_ARMV7A #include "bli_kernels_armv7a.h" #endif // -- IBM Power -- #ifdef BLIS_KERNELS_POWER10 #include "bli_kernels_power10.h" #endif #ifdef BLIS_KERNELS_POWER9 #include "bli_kernels_power9.h" #endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" #endif // -- IBM BG/Q -- #ifdef BLIS_KERNELS_BGQ #include "bli_kernels_bgq.h" #endif #endif cython-blis-0.9.1/blis/_src/frame/include/bli_arch_config_pre.h000066400000000000000000000053701427272030600244670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_ARCH_CONFIG_PRE_H #define BLIS_ARCH_CONFIG_PRE_H // -- Naming-related kernel definitions ---------------------------------------- // The default suffix appended to reference kernels. #define BLIS_REF_SUFFIX _ref // A suffix used for labeling certain induced method aware functions. #define BLIS_IND_SUFFIX _ind // Add an underscore to the BLIS kernel set string, if it was defined. #ifdef BLIS_CNAME #define BLIS_CNAME_INFIX PASTECH(_,BLIS_CNAME) #endif // Combine the CNAME and _ref for convenience to the code that defines // reference kernels. //#define BLIS_CNAME_REF_SUFFIX PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX) // -- Prototype-generating macro definitions ----------------------------------- // Prototype-generating macro for bli_cntx_init_*() functions. 
#define CNTX_INIT_PROTS( archname ) \ \ void PASTEMAC(cntx_init_,archname) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \ ( \ ind_t method, \ cntx_t* cntx \ ); #endif cython-blis-0.9.1/blis/_src/frame/include/bli_blas_macro_defs.h000066400000000000000000000061501427272030600244570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ /* The semantics of negative stride in BLAS are that the vector operand be traversed in reverse order. (Another way to think of this is that negative strides effectively reverse the order of the vector, but without any explicit data movements.) This is also how BLIS interprets negative strides. The differences is that with BLAS, the caller *always* passes in the 0th (i.e., top-most or left-most) element of the vector, even when the stride is negative. By contrast, in BLIS, negative strides are used *relative* to the vector address as it is given. Thus, in BLIS, if this backwards traversal is desired, the caller *must* pass in the address to the (n-1)th (i.e., the bottom-most or right-most) element along with a negative stride. */ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/include/bli_builtin_macro_defs.h000066400000000000000000000040041427272030600252000ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif cython-blis-0.9.1/blis/_src/frame/include/bli_complex_macro_defs.h000066400000000000000000000044411427272030600252060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define bli_zimag( x ) ( cimag(x) ) #endif // BLIS_ENABLE_C99_COMPLEX #endif cython-blis-0.9.1/blis/_src/frame/include/bli_config_macro_defs.h000066400000000000000000000241471427272030600250110ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. 
#ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. #endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value is only relevant when BLIS_DISABLE_AUTO_PRIME_NUM_THREADS is // defined (that is, when BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is not), since the // reduction described above is applied only when prime thread counts are disabled. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support?
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? 
// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specified by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS.
#ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overridden by the main // program simply by providing a new definition. However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif cython-blis-0.9.1/blis/_src/frame/include/bli_edge_case_macro_defs.h000066400000000000000000000170711427272030600254410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2021, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. 
// // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ /* Scenario 1: the ukernel contains assembly-level support only for its IO preference (e.g. only row-oriented or only column-oriented IO). Use a temporary microtile for the other two cases as well as edge cases. */ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ /* Scenario 2: the ukernel contains assembly-level support for its IO preference as well as its opposite via in-register transpose (e.g. both row- and column-oriented IO). Use a temporary microtile for the general stride case as well as edge cases. */ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ /* Scenario 3: Similar to (2) where the assembly region also supports general stride IO. Use a temporary microtile only for edge cases.
*/ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ /* Scenario 4: Similar to (1), but uses temporary microtile to handle cases where the pointer to the C microtile is not aligned. */ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? _rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ /* If we actually used the temporary microtile, accumulate it to the output microtile. */ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ /* Scenario 1: the ukernel contains assembly-level support only for its IO preference (e.g. only row-oriented or only column-oriented IO). Use a temporary microtile for the other two cases as well as edge cases. */ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? 
cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ /* Scenario 2: the ukernel contains assembly-level support for its IO preference as well as its opposite via in-register transpose (e.g. both row- and column-oriented IO). Use a temporary microtile for the general stride case as well as edge cases. */ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ /* Scenario 3: Similar to (2) where the assembly region also supports general stride IO. Use a temporary microtile only for edge cases. */ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ /* Scenario 4: Similar to (1), but uses temporary microtile to handle cases where the pointer to the C microtile is not aligned. */ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? _rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ /* If we actually used the temporary microtile, use it to overwrite the output microtile. Used by trsm. */ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif cython-blis-0.9.1/blis/_src/frame/include/bli_error_macro_defs.h000066400000000000000000000036021427272030600246660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif cython-blis-0.9.1/blis/_src/frame/include/bli_extern_defs.h000066400000000000000000000042611427272030600236630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_EXTERN_DEFS_H #define BLIS_EXTERN_DEFS_H BLIS_EXPORT_BLIS extern obj_t BLIS_TWO; BLIS_EXPORT_BLIS extern obj_t BLIS_ONE; //BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO; //BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO; BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif cython-blis-0.9.1/blis/_src/frame/include/bli_f2c.h000066400000000000000000000043751427272030600220350ustar00rootroot00000000000000// f2c.h -- Standard Fortran to C header file // barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." // - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) #ifndef BLIS_F2C_H #define BLIS_F2C_H typedef f77_int bla_integer; typedef f77_char bla_character; //typedef char *address; //typedef short int shortint; typedef float bla_real; typedef double bla_double; typedef scomplex bla_scomplex; typedef dcomplex bla_dcomplex; typedef f77_int bla_logical; //typedef short int shortlogical; //typedef char logical1; //typedef char integer1; #ifdef INTEGER_STAR_8 // Adjust for integer*8. 
typedef long long longint; // system-dependent typedef unsigned long long ulongint; // system-dependent #define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) #define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) #endif #ifndef TRUE_ #define TRUE_ (1) #endif #ifndef FALSE_ #define FALSE_ (0) #endif // Extern is for use with -E #ifndef Extern #define Extern extern #endif // I/O stuff #ifdef f2c_i2 // for -i2 //typedef short flag; //typedef short ftnlen; typedef bla_integer ftnlen; //typedef short ftnint; #else //typedef long int flag; //typedef long int ftnlen; typedef bla_integer ftnlen; //typedef long int ftnint; #endif #ifndef VOID #define VOID void #endif #ifndef f2c_abs #define f2c_abs(x) ((x) >= 0 ? (x) : -(x)) #endif #ifndef f2c_dabs #define f2c_dabs(x) (doublereal)f2c_abs(x) #endif #ifndef f2c_min #define f2c_min(a,b) ((a) <= (b) ? (a) : (b)) #endif #ifndef f2c_max #define f2c_max(a,b) ((a) >= (b) ? (a) : (b)) #endif #ifndef f2c_dmin #define f2c_dmin(a,b) (doublereal)f2c_min(a,b) #endif #ifndef f2c_dmax #define f2c_dmax(a,b) (doublereal)f2c_max(a,b) #endif #ifndef bit_test #define bit_test(a,b) ((a) >> (b) & 1) #endif #ifndef bit_clear #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) #endif #ifndef bit_set #define bit_set(a,b) ((a) | ((uinteger)1 << (b))) #endif // undef any lower-case symbols that your C compiler predefines, e.g.: #ifndef Skip_f2c_Undefs #undef cray #undef gcos #undef mc68010 #undef mc68020 #undef mips #undef pdp11 #undef sgi #undef sparc #undef sun #undef sun2 #undef sun3 #undef sun4 #undef u370 #undef u3b #undef u3b2 #undef u3b5 #undef unix #undef vax #endif #endif cython-blis-0.9.1/blis/_src/frame/include/bli_genarray_macro_defs.h000066400000000000000000000250241427272030600253470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_GENARRAY_MACRO_DEFS_H
#define BLIS_GENARRAY_MACRO_DEFS_H


// -- Macros to generate function arrays ---------------------------------------

// Every macro in this header expands to an array definition whose initializer
// lists one symbol per datatype character, always in the order s, c, d, z
// (followed by i in the *_I variants). The symbols are built with the
// PASTEMAC*/PASTECH* helpers; those helpers, BLIS_NUM_FP_TYPES and NULL are
// not defined in this header and must already be visible where a macro is
// expanded.
// NOTE(review): the s, c, d, z ordering presumably mirrors the numeric values
// of the datatype ids used to index these arrays -- confirm against the
// datatype enum before reordering any initializer below.

// -- "Smart" one-operand macro --

// Defines a static array of BLIS_NUM_FP_TYPES elements of type tname, named
// PASTECH(opname,_fpa), holding the s, c, d and z variants of opname, each
// explicitly cast to tname.
#define GENARRAY_FPA(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \
{ \
	( tname )PASTEMAC(s,opname), \
	( tname )PASTEMAC(c,opname), \
	( tname )PASTEMAC(d,opname), \
	( tname )PASTEMAC(z,opname)  \
}

// -- "Smart" one-operand macro (with integer support) --

// Same as GENARRAY_FPA, but the array has one extra trailing element
// (BLIS_NUM_FP_TYPES+1 in total) holding the i variant of opname.
#define GENARRAY_FPA_I(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \
{ \
	( tname )PASTEMAC(s,opname), \
	( tname )PASTEMAC(c,opname), \
	( tname )PASTEMAC(d,opname), \
	( tname )PASTEMAC(z,opname), \
	( tname )PASTEMAC(i,opname)  \
}

// -- "Smart" two-operand macro --

// Defines a static two-dimensional array named PASTECH(op,_fpa2). The row is
// selected by the first type character passed to PASTEMAC2 and the column by
// the second; all sixteen combinations are populated and cast to tname.
#define GENARRAY_FPA2(tname,op) \
\
static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \
	{ ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \
	{ ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \
	{ ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) }  \
}

// -- "Smart" two-operand macro --

/*
#define GENARRAY2_VFP(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
	{ PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
	{ PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
	{ PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}
*/

// -- One-operand macro --

// Unlike the *_FPA macros above, the macros from here on emit only
// "arrayname[dims] = { ... }": no storage class, no element type and no
// casts. The element type must be supplied by the caller at the use site.
#define GENARRAY(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
	PASTEMAC(s,op), \
	PASTEMAC(c,op), \
	PASTEMAC(d,op), \
	PASTEMAC(z,op)  \
}

// GENARRAY with one extra trailing element for the i variant of op.
#define GENARRAY_I(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES+1] = \
{ \
	PASTEMAC(s,op), \
	PASTEMAC(c,op), \
	PASTEMAC(d,op), \
	PASTEMAC(z,op), \
	PASTEMAC(i,op)  \
}

/*
#define GENARRAYR(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ PASTEMAC2(s,s,op), NULL, PASTEMAC2(s,d,op), NULL, }, \
	{ PASTEMAC2(c,s,op), NULL, PASTEMAC2(c,d,op), NULL, }, \
	{ PASTEMAC2(d,s,op), NULL, PASTEMAC2(d,d,op), NULL, }, \
	{ PASTEMAC2(z,s,op), NULL, PASTEMAC2(z,d,op), NULL, }  \
}
*/

// -- One-operand macro (with custom prefix) --

// Same layout as GENARRAY, but each entry is formed with
// PASTECH2(prefix,<type char>,op) instead of PASTEMAC(<type char>,op), so the
// caller chooses the leading part of every symbol name.
#define GENARRAY_PREF(arrayname,prefix,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
	PASTECH2(prefix,s,op), \
	PASTECH2(prefix,c,op), \
	PASTECH2(prefix,d,op), \
	PASTECH2(prefix,z,op)  \
}



// -- Two-operand macros --

// GENARRAY2_ALL: every one of the sixteen (first, second) type-character
// pairs is populated.
#define GENARRAY2_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
	{ PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
	{ PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
	{ PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// GENARRAY2_EXT: an entry is populated only when both type characters come
// from {s,c} or both come from {d,z}; every other entry is NULL, so users of
// the table must check for NULL before using an entry.
#define GENARRAY2_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL,              NULL,             }, \
	{ PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL,              NULL,             }, \
	{ NULL,              NULL,              PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
	{ NULL,              NULL,              PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// GENARRAY2_MIN: only the diagonal (both type characters identical) is
// populated; all off-diagonal entries are NULL.
#define GENARRAY2_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ PASTEMAC2(s,s,op), NULL,              NULL,              NULL,             }, \
	{ NULL,              PASTEMAC2(c,c,op), NULL,              NULL,             }, \
	{ NULL,              NULL,              PASTEMAC2(d,d,op), NULL,             }, \
	{ NULL,              NULL,              NULL,              PASTEMAC2(z,z,op) }  \
}


// -- Three-operand macros --

// The three array dimensions correspond, outermost first, to the first,
// second and third type characters passed to PASTEMAC3.

// GENARRAY3_ALL: all sixty-four type-character triples are populated.
#define GENARRAY3_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ \
	{ PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \
	{ PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \
	{ PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \
	{ PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) }  \
	}, \
	{ \
	{ PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \
	{ PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \
	{ PASTEMAC3(c,d,s,op), PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \
	{ PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) }  \
	}, \
	{ \
	{ PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \
	{ PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \
	{ PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
	{ PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
	}, \
	{ \
	{ PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \
	{ PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \
	{ PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
	{ PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
	} \
}

// GENARRAY3_EXT: an entry is populated only when all three type characters
// come from {s,c} or all three come from {d,z}; every other entry is NULL.
#define GENARRAY3_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ \
	{ PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL,                NULL,               }, \
	{ PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL,                NULL,               }, \
	{ PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
	{ NULL,                NULL,                PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
	{ NULL,                NULL,                PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
	} \
}

// GENARRAY3_MIN: only the four entries whose three type characters are all
// identical (sss, ccc, ddd, zzz) are populated; every other entry is NULL.
#define GENARRAY3_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ \
	{ PASTEMAC3(s,s,s,op), NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                PASTEMAC3(d,d,d,op), NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                PASTEMAC3(z,z,z,op) }  \
	} \
}


#endif
cython-blis-0.9.1/blis/_src/frame/include/bli_gentdef_macro_defs.h000066400000000000000000000054671427272030600251640ustar00rootroot00000000000000/*

BLIS
An object-based framework for developing high-performance BLAS-like
libraries.

Copyright (C) 2014, The University of Texas at Austin

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
 - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
 - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
 - Neither the name(s) of the copyright holder(s) nor the names of its
   contributors may be used to endorse or promote products derived
   from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/


#ifndef BLIS_GENTDEF_MACRO_DEFS_H
#define BLIS_GENTDEF_MACRO_DEFS_H

//
// -- MACROS TO INSERT TYPEDEF-GENERATING MACROS -------------------------------
//

// Neither GENTDEF nor GENTDEFR is defined in this header. Each INSERT_ macro
// below only replays a fixed list of invocations, so the generator macro must
// be defined by the time the INSERT_ macro is expanded.


// -- function typedef macro (both typed and void) --

// Expands to nine GENTDEF invocations for opname, in this order:
//   1. four "typed" instances with suffix _ft, pairing a C type with its type
//      character: float/s, double/d, scomplex/c, dcomplex/z;
//   2. four instances with suffix _vft that keep the type character but pass
//      void as the C type;
//   3. one instance with suffix _vft, void as the C type and an EMPTY type
//      character argument (empty macro arguments require C99 or later).
// NOTE(review): the meaning of the _ft / _vft suffixes is decided by the
// GENTDEF definition in effect at the use site -- presumably typed versus
// void function-pointer typedef names; confirm there.
#define INSERT_GENTDEF( opname ) \
\
GENTDEF( float,    s, opname, _ft ) \
GENTDEF( double,   d, opname, _ft ) \
GENTDEF( scomplex, c, opname, _ft ) \
GENTDEF( dcomplex, z, opname, _ft ) \
\
GENTDEF( void,     s, opname, _vft ) \
GENTDEF( void,     d, opname, _vft ) \
GENTDEF( void,     c, opname, _vft ) \
GENTDEF( void,     z, opname, _vft ) \
\
GENTDEF( void,      , opname, _vft )

// -- function typedef macro (both typed and void) with real projection --

// Same nine-invocation pattern as INSERT_GENTDEF, but GENTDEFR additionally
// receives a second C type and a second type character (the "real projection"
// named above). In the typed group these are float/s for both s and c, and
// double/d for both d and z. In the _vft groups both C types are void; the
// final invocation passes both type characters empty.
#define INSERT_GENTDEFR( opname ) \
\
GENTDEFR( float,    float,  s, s, opname, _ft ) \
GENTDEFR( double,   double, d, d, opname, _ft ) \
GENTDEFR( scomplex, float,  c, s, opname, _ft ) \
GENTDEFR( dcomplex, double, z, d, opname, _ft ) \
\
GENTDEFR( void,     void,   s, s, opname, _vft ) \
GENTDEFR( void,     void,   d, d, opname, _vft ) \
GENTDEFR( void,     void,   c, s, opname, _vft ) \
GENTDEFR( void,     void,   z, d, opname, _vft ) \
\
GENTDEFR( void,     void,    ,  , opname, _vft )


#endif
cython-blis-0.9.1/blis/_src/frame/include/bli_gentfunc_macro_defs.h000066400000000000000000001522771427272030600253630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_GENTFUNC_MACRO_DEFS_H #define BLIS_GENTFUNC_MACRO_DEFS_H // // -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------ // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- #define INSERT_GENTFUNC_BLAS( blasname, blisname ) \ \ GENTFUNC( float, s, blasname, blisname ) \ GENTFUNC( double, d, blasname, blisname ) \ GENTFUNC( scomplex, c, blasname, blisname ) \ GENTFUNC( dcomplex, z, blasname, blisname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \ \ GENTFUNCRO( float, s, blasname, blisname ) \ GENTFUNCRO( double, d, blasname, blisname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \ \ GENTFUNCCO( scomplex, float, c, s, blasname, blisname ) \ GENTFUNCCO( dcomplex, double, z, d, blasname, blisname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \ \ GENTFUNCDOT( float, s, , BLIS_NO_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( double, d, , BLIS_NO_CONJUGATE, blasname, blisname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \ \ GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \ \ INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \ INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTFUNCR_BLAS( rblasname, 
cblasname, blisname ) \ \ GENTFUNCR( float, float, s, s, rblasname, blisname ) \ GENTFUNCR( double, double, d, d, rblasname, blisname ) \ GENTFUNCR( scomplex, float, c, s, cblasname, blisname ) \ GENTFUNCR( dcomplex, double, z, d, cblasname, blisname ) // -- Alternate two-operand macro (one char for complex, one for real proj) -- #define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \ \ GENTFUNCR2( float, float, s, , blasname, blisname ) \ GENTFUNCR2( double, double, d, , blasname, blisname ) \ GENTFUNCR2( scomplex, float, c, s, blasname, blisname ) \ GENTFUNCR2( dcomplex, double, z, d, blasname, blisname ) // -- Extended two-operand macro (used only for scal) -- #define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \ \ GENTFUNCSCAL( float, float, s, , blasname, blisname ) \ GENTFUNCSCAL( double, double, d, , blasname, blisname ) \ GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \ GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \ GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \ GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname ) // -- Macros for functions with one operand ------------------------------------ // -- Basic one-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC_BASIC0( tfuncname ) \ \ GENTFUNC( float, s, tfuncname ) \ GENTFUNC( double, d, tfuncname ) \ GENTFUNC( scomplex, c, tfuncname ) \ GENTFUNC( dcomplex, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \ \ GENTFUNC( float, s, tfuncname, varname ) \ GENTFUNC( double, d, tfuncname, varname ) \ GENTFUNC( scomplex, c, tfuncname, varname ) \ GENTFUNC( dcomplex, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC( float, s, tfuncname, varname1, varname2 ) \ GENTFUNC( double, d, tfuncname, varname1, varname2 ) \ GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \ GENTFUNC( dcomplex, z, tfuncname, 
varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTFUNC( float, s, tfuncname, varname1, varname2, varname3 ) \ GENTFUNC( double, d, tfuncname, varname1, varname2, varname3 ) \ GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \ GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTFUNC( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNC( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand with real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCR_BASIC0( tfuncname ) \ \ GENTFUNCR( float, float, s, s, tfuncname ) \ GENTFUNCR( double, double, d, d, tfuncname ) \ GENTFUNCR( scomplex, float, c, s, tfuncname ) \ GENTFUNCR( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \ \ GENTFUNCR( float, float, s, s, tfuncname, varname ) \ GENTFUNCR( double, double, d, d, tfuncname, varname ) \ GENTFUNCR( scomplex, float, c, s, tfuncname, varname ) \ GENTFUNCR( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2 ) \ GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2 ) \ GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTFUNCR( float, float, s, s, tfuncname, varname1, 
varname2, varname3 ) \ GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \ GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \ GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with real domain only -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \ \ GENTFUNCRO( float, s, tfuncname ) \ GENTFUNCRO( double, d, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \ \ GENTFUNCRO( float, s, tfuncname, varname ) \ GENTFUNCRO( double, d, tfuncname, varname ) \ // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, 
varname1, varname2, varname3 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \ \ GENTFUNC( float, s, tfuncname ) \ GENTFUNC( double, d, tfuncname ) \ GENTFUNC( scomplex, c, tfuncname ) \ GENTFUNC( dcomplex, z, tfuncname ) \ GENTFUNC( gint_t, i, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \ \ GENTFUNC( float, s, tfuncname, varname ) \ GENTFUNC( double, d, tfuncname, varname ) \ GENTFUNC( scomplex, c, tfuncname, varname ) \ GENTFUNC( dcomplex, z, tfuncname, varname ) \ GENTFUNC( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCI_BASIC0( tfuncname ) \ \ GENTFUNCI( float, gint_t, s, i, tfuncname ) \ GENTFUNCI( double, gint_t, d, i, tfuncname ) \ GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \ GENTFUNCI( dcomplex, gint_t, z, i, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \ \ GENTFUNCI( float, gint_t, s, i, tfuncname, varname ) \ GENTFUNCI( double, gint_t, d, i, tfuncname, varname ) \ GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \ \ GENTFUNCRI( float, float, gint_t, s, s, i, tfuncname ) \ GENTFUNCRI( double, double, gint_t, d, d, i, tfuncname ) \ GENTFUNCRI( scomplex, float, gint_t, c, s, i, tfuncname ) \ GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_BASIC0( tfuncname ) \ \ GENTFUNC2( float, float, s, s, tfuncname ) \ 
GENTFUNC2( double, double, d, d, tfuncname ) \ GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \ GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \ \ GENTFUNC2( float, float, s, s, tfuncname, varname ) \ GENTFUNC2( double, double, d, d, tfuncname, varname ) \ GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \ \ GENTFUNC2( float, scomplex, s, c, tfuncname ) \ GENTFUNC2( scomplex, float, c, s, tfuncname ) \ \ GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \ \ GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ \ GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) // -- Mixed precision two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \ \ GENTFUNC2( float, double, s, d, tfuncname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ \ GENTFUNC2( double, float, d, s, tfuncname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname ) \ \ GENTFUNC2( scomplex, double, c, d, tfuncname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \ \ GENTFUNC2( float, double, s, d, tfuncname, varname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTFUNC2( double, float, d, s, tfuncname, varname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ \ GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) 
\ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) \ // -- Mixed domain/precision (all) two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_MIXDP0( tfuncname ) \ \ GENTFUNC2( float, double, s, d, tfuncname ) \ GENTFUNC2( float, scomplex, s, c, tfuncname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ \ GENTFUNC2( double, float, d, s, tfuncname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname ) \ GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ \ GENTFUNC2( scomplex, float, c, s, tfuncname ) \ GENTFUNC2( scomplex, double, c, d, tfuncname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \ \ GENTFUNC2( float, double, s, d, tfuncname, varname ) \ GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTFUNC2( double, float, d, s, tfuncname, varname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \ \ GENTFUNC2R( float, float, float, s, s, s, tfuncname ) \ GENTFUNC2R( double, double, double, d, d, d, tfuncname ) \ GENTFUNC2R( scomplex, scomplex, float, c, 
c, s, tfuncname ) \ GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \ \ GENTFUNC2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTFUNC2R( double, double, double, d, d, d, tfuncname, varname ) \ GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \ \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \ \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) // -- Mixed precision two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \ \ GENTFUNC2R( float, double, double, s, d, d, 
tfuncname, varname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) // -- Mixed domain/precision (all) two-operand macro with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ 
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ // -- Macros for functions with three primary operands ------------------------- // -- Basic three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC0( tfuncname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 ) // -- Mixed domain three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname 
) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, 
varname1, varname2 ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname ) \ \ \ 
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname ) \ \ GENTFUNC3( 
float, scomplex, double, s, c, d, tfuncname, varname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, 
tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) 
\ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, 
s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 ) // -- Basic three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname1, 
varname2 ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname1, varname2 ) // -- Mixed domain three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, 
float, float, scomplex, c, s, s, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \ GENTFUNC3U12( float, 
float, dcomplex, float, s, s, z, s, tfuncname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, 
tfuncname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ 
GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, 
d, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, 
float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, dcomplex, 
float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, 
tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 ) #endif cython-blis-0.9.1/blis/_src/frame/include/bli_gentprot_macro_defs.h000066400000000000000000000654261427272030600254130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_GENTPROT_MACRO_DEFS_H #define BLIS_GENTPROT_MACRO_DEFS_H // // -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS ----------------------------- // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- #define INSERT_GENTPROT_BLAS( blasname ) \ \ GENTPROT( float, s, blasname ) \ GENTPROT( double, d, blasname ) \ GENTPROT( scomplex, c, blasname ) \ GENTPROT( dcomplex, z, blasname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTPROTRO_BLAS( blasname ) \ \ GENTPROTRO( float, s, blasname ) \ GENTPROTRO( double, d, blasname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTPROTCO_BLAS( blasname ) \ \ GENTPROTCO( scomplex, float, c, s, blasname ) \ GENTPROTCO( dcomplex, double, z, d, blasname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTR_BLAS( blasname ) \ \ GENTPROTDOT( float, s, , blasname ) \ GENTPROTDOT( double, d, , blasname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTC_BLAS( blasname ) \ \ GENTPROTDOT( scomplex, c, c, blasname ) \ GENTPROTDOT( scomplex, c, u, blasname ) \ GENTPROTDOT( dcomplex, z, c, blasname ) \ GENTPROTDOT( dcomplex, z, u, blasname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTPROTDOT_BLAS( 
blasname ) \ \ INSERT_GENTPROTDOTR_BLAS( blasname ) \ INSERT_GENTPROTDOTC_BLAS( blasname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTPROTR_BLAS( rblasname, cblasname ) \ \ GENTPROTR( float, float, s, s, rblasname ) \ GENTPROTR( double, double, d, d, rblasname ) \ GENTPROTR( scomplex, float, c, s, cblasname ) \ GENTPROTR( dcomplex, double, z, d, cblasname ) // -- Alternate two-operand macro (one char for complex, one for real proj) -- #define INSERT_GENTPROTR2_BLAS( blasname ) \ \ GENTPROTR2( float, float, , s, blasname ) \ GENTPROTR2( double, double, , d, blasname ) \ GENTPROTR2( scomplex, float, c, s, blasname ) \ GENTPROTR2( dcomplex, double, z, d, blasname ) // -- Extended two-operand macro (used only for scal) -- #define INSERT_GENTPROTSCAL_BLAS( blasname ) \ \ GENTPROTSCAL( float, float, , s, blasname ) \ GENTPROTSCAL( double, double, , d, blasname ) \ GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \ GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \ GENTPROTSCAL( float, scomplex, s, c, blasname ) \ GENTPROTSCAL( double, dcomplex, d, z, blasname ) // -- Macros for functions with one operand ------------------------------------ // -- Basic one-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0( tfuncname ) \ \ GENTPROT( float, s, tfuncname ) \ GENTPROT( double, d, tfuncname ) \ GENTPROT( scomplex, c, tfuncname ) \ GENTPROT( dcomplex, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2 ) \ GENTPROT( double, d, tfuncname, varname1, varname2 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \ GENTPROT( dcomplex, z, 
tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand with real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC0( tfuncname ) \ \ GENTPROTR( float, float, s, s, tfuncname ) \ GENTPROTR( double, double, d, d, tfuncname ) \ GENTPROTR( scomplex, float, c, s, tfuncname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname ) \ GENTPROTR( double, double, d, d, tfuncname, varname ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROTR( float, float, s, s, tfuncname, 
varname1, varname2, varname3 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC0( tfuncname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0_I( funcname ) \ \ GENTPROT( float, s, funcname ) \ GENTPROT( double, d, funcname ) \ GENTPROT( scomplex, c, funcname ) \ GENTPROT( dcomplex, z, funcname ) \ GENTPROT( gint_t, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, 
z, tfuncname, varname ) \ GENTPROT( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTI_BASIC0( funcname ) \ \ GENTPROTI( float, gint_t, s, i, funcname ) \ GENTPROTI( double, gint_t, d, i, funcname ) \ GENTPROTI( scomplex, gint_t, c, i, funcname ) \ GENTPROTI( dcomplex, gint_t, z, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \ \ GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \ GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \ GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTRI_BASIC( funcname ) \ \ GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \ GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \ GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \ GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_BASIC0( funcname ) \ \ GENTPROT2( float, float, s, s, funcname ) \ GENTPROT2( double, double, d, d, funcname ) \ GENTPROT2( scomplex, scomplex, c, c, funcname ) \ GENTPROT2( dcomplex, dcomplex, z, z, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \ \ GENTPROT2( float, float, s, s, tfuncname, varname ) \ GENTPROT2( double, double, d, d, tfuncname, varname ) \ GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_D0( funcname ) \ \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( scomplex, float, c, s, funcname ) \ \ 
GENTPROT2( double, dcomplex, d, z, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_D( tfuncname, varname ) \ \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) // -- Mixed precision two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_P0( funcname ) \ \ GENTPROT2( float, double, s, d, funcname ) \ GENTPROT2( float, dcomplex, s, z, funcname ) \ \ GENTPROT2( double, float, d, s, funcname ) \ GENTPROT2( double, scomplex, d, c, funcname ) \ \ GENTPROT2( scomplex, double, c, d, funcname ) \ GENTPROT2( scomplex, dcomplex, c, z, funcname ) \ \ GENTPROT2( dcomplex, float, z, s, funcname ) \ GENTPROT2( dcomplex, scomplex, z, c, funcname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) \ // -- Mixed domain/precision (all) two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIXDP0( funcname ) \ \ GENTPROT2( float, double, s, d, funcname ) \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( float, dcomplex, s, z, funcname ) \ \ GENTPROT2( double, float, d, s, funcname ) \ GENTPROT2( double, scomplex, d, c, funcname ) \ GENTPROT2( double, dcomplex, d, z, funcname ) \ \ GENTPROT2( scomplex, float, c, s, funcname ) \ GENTPROT2( scomplex, double, c, d, funcname ) \ 
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \ \ GENTPROT2( dcomplex, float, z, s, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) \ GENTPROT2( dcomplex, scomplex, z, c, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_BASIC0( funcname ) \ \ GENTPROT2R( float, float, float, s, s, s, funcname ) \ GENTPROT2R( double, double, double, d, d, d, funcname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \ \ GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname 
) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname ) // -- Mixed precision two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \ \ GENTPROT2R( float, double, float, s, d, s, tfuncname ) \ GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \ \ GENTPROT2R( double, float, double, d, s, d, tfuncname ) \ GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \ \ GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \ GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \ \ GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \ GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \ \ GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \ GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \ \ GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \ GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \ \ GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \ GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ \ GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \ GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) // -- Macros for functions with three primary operands ------------------------- // -- Basic three-operand macro -- #define INSERT_GENTPROT3_BASIC( funcname ) \ \ GENTPROT3( float, float, float, s, s, s, funcname ) 
\ GENTPROT3( double, double, double, d, d, d, funcname ) \ GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \ GENTPROT3( dcomplex, dcomplex, dcomplex, z, z, z, funcname ) // -- Mixed domain three-operand macro -- #define INSERT_GENTPROT3_MIX_D( funcname ) \ \ GENTPROT3( float, float, scomplex, s, s, c, funcname ) \ GENTPROT3( float, scomplex, float, s, c, s, funcname ) \ GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \ \ GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \ GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \ GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \ \ GENTPROT3( scomplex, float, float, c, s, s, funcname ) \ GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \ GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \ \ GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \ GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \ GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname ) // -- Mixed precision three-operand macro -- #define INSERT_GENTPROT3_MIX_P( funcname ) \ \ GENTPROT3( float, float, double, s, s, d, funcname ) \ GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \ \ GENTPROT3( float, double, float, s, d, s, funcname ) \ GENTPROT3( float, double, double, s, d, d, funcname ) \ GENTPROT3( float, double, scomplex, s, d, c, funcname ) \ GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \ \ GENTPROT3( float, scomplex, double, s, c, d, funcname ) \ GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \ \ GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \ GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \ GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \ GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \ \ \ GENTPROT3( double, float, float, d, s, s, funcname ) \ GENTPROT3( double, float, double, d, s, d, funcname ) \ GENTPROT3( double, float, scomplex, d, s, c, funcname ) \ GENTPROT3( double, float, dcomplex, d, s, z, 
funcname ) \ \ GENTPROT3( double, double, float, d, d, s, funcname ) \ GENTPROT3( double, double, scomplex, d, d, c, funcname ) \ \ GENTPROT3( double, scomplex, float, d, c, s, funcname ) \ GENTPROT3( double, scomplex, double, d, c, d, funcname ) \ GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \ GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \ \ GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \ GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \ \ \ GENTPROT3( scomplex, float, double, c, s, d, funcname ) \ GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \ \ GENTPROT3( scomplex, double, float, c, d, s, funcname ) \ GENTPROT3( scomplex, double, double, c, d, d, funcname ) \ GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \ GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \ \ GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \ GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \ \ GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \ GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \ GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \ GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \ \ \ GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \ GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \ GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \ GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \ \ GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \ GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \ \ GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \ GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \ GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \ GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \ \ GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \ GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname ) \ // -- Basic three-operand with 
union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_BASIC( funcname ) \ \ GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \ GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \ GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname ) // -- Mixed domain three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_D( funcname ) \ \ GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \ GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \ GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \ \ GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \ GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \ GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \ \ GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \ GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \ \ GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \ GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname ) // -- Mixed precision three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_P( funcname ) \ \ GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \ GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \ \ GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \ GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \ GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \ GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \ \ GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, 
funcname ) \ GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \ \ GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, z, funcname ) \ GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \ GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \ GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \ \ \ GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \ GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \ GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \ GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \ \ GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \ GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \ \ GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \ GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \ GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \ GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \ \ GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \ GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \ \ \ GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \ GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \ \ GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \ GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \ GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \ GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \ \ GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \ \ GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \ GENTPROT3U12( scomplex, 
dcomplex, double, dcomplex, c, z, d, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \ \ \ GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \ GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \ GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \ GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \ GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \ \ GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname ) #endif cython-blis-0.9.1/blis/_src/frame/include/bli_kernel_macro_defs.h000066400000000000000000000203371427272030600250210ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H // -- Define default threading parameters -------------------------------------- // -- Conventional (large code path) values -- // These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n // dimensions for the purposes of factorizing the total number of threads into // ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these // macros are used. #ifndef BLIS_THREAD_RATIO_M #define BLIS_THREAD_RATIO_M 1 #endif #ifndef BLIS_THREAD_RATIO_N #define BLIS_THREAD_RATIO_N 1 #endif // These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of // parallelism allowed when performing automatic factorization. See bli_rntm.c // to see how these macros are used. 
// Default ceilings; each may be overridden by defining the macro before
// this header is processed.
#ifndef BLIS_THREAD_MAX_IR
#define BLIS_THREAD_MAX_IR 1
#endif

#ifndef BLIS_THREAD_MAX_JR
#define BLIS_THREAD_MAX_JR 4
#endif

// NOTE: The sup-specific values below are compiled out by the enclosing
// "#if 0" and therefore define nothing.
#if 0
// -- Skinny/small possibly-unpacked (sup code path) values --

#ifndef BLIS_THREAD_SUP_RATIO_M
#define BLIS_THREAD_SUP_RATIO_M 1
#endif

#ifndef BLIS_THREAD_SUP_RATIO_N
#define BLIS_THREAD_SUP_RATIO_N 2
#endif

#ifndef BLIS_THREAD_SUP_MAX_IR
#define BLIS_THREAD_SUP_MAX_IR 1
#endif

#ifndef BLIS_THREAD_SUP_MAX_JR
#define BLIS_THREAD_SUP_MAX_JR 8
#endif
#endif


// -- Memory allocation --------------------------------------------------------

// hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with
// libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND
// was explicitly defined.
#ifdef BLIS_DISABLE_MEMKIND
  #undef BLIS_ENABLE_MEMKIND
#endif
#ifdef BLIS_ENABLE_MEMKIND
// The header name must be present: a bare "#include" is a preprocessing
// error, and without this header hbw_malloc()/hbw_free() (selected below
// for the memory pools) would be undeclared.
#include <hbwmalloc.h>
#endif

// Memory allocation functions. These macros define the three types of
// malloc()-style functions, and their free() counterparts: one for each
// type of memory to be allocated.
// NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING
// THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc()
// and free():
//
//   void* malloc( size_t size );
//   void  free( void* p );
//

// This allocation function is called to allocate memory for blocks within
// BLIS's internal memory pools.
#ifndef BLIS_MALLOC_POOL
  // If use of libmemkind was enabled at configure-time, the default
  // memory allocation function for memory pools should be hbw_malloc()
  // instead of malloc().
  #ifdef BLIS_ENABLE_MEMKIND
  #define BLIS_MALLOC_POOL hbw_malloc
  #else
  #define BLIS_MALLOC_POOL malloc
  #endif
#endif

#ifndef BLIS_FREE_POOL
  // If use of libmemkind was enabled at configure-time, the default
  // memory deallocation function for memory pools should be hbw_free()
  // instead of free().
  #ifdef BLIS_ENABLE_MEMKIND
  #define BLIS_FREE_POOL hbw_free
  #else
  #define BLIS_FREE_POOL free
  #endif
#endif

// This allocation function is called to allocate memory for internally-
// used objects and structures, such as control tree nodes.
#ifndef BLIS_MALLOC_INTL
#define BLIS_MALLOC_INTL malloc
#endif

#ifndef BLIS_FREE_INTL
#define BLIS_FREE_INTL free
#endif

// This allocation function is called to allocate memory for objects
// created by user-level API functions, such as bli_obj_create().
#ifndef BLIS_MALLOC_USER
#define BLIS_MALLOC_USER malloc
#endif

#ifndef BLIS_FREE_USER
#define BLIS_FREE_USER free
#endif


// -- Other system-related definitions -----------------------------------------

// Size of a virtual memory page. This is used to align blocks within the
// memory pools.
#ifndef BLIS_PAGE_SIZE
#define BLIS_PAGE_SIZE 4096
#endif

// The maximum number of named SIMD vector registers available for use.
// When configuring with umbrella configuration families, this should be
// set to the maximum number of registers across all sub-configurations in
// the family.
#ifndef BLIS_SIMD_MAX_NUM_REGISTERS
#define BLIS_SIMD_MAX_NUM_REGISTERS 32
#endif

// The maximum size (in bytes) of each SIMD vector.
// When configuring with umbrella configuration families, this should be
// set to the maximum SIMD size across all sub-configurations in the family.
#ifndef BLIS_SIMD_MAX_SIZE
#define BLIS_SIMD_MAX_SIZE 64
#endif

// Alignment size (in bytes) needed by the instruction set for aligned
// SIMD/vector instructions.
#ifndef BLIS_SIMD_ALIGN_SIZE
#define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_MAX_SIZE
#endif

// The maximum size in bytes of local stack buffers within macro-kernel
// functions. These buffers are usually used to store a temporary copy
// of a single microtile. The reason we multiply by 2 is to handle induced
// methods, where we use real domain register blocksizes in units of
// complex elements. Specifically, the macro-kernels will need this larger
// micro-tile footprint, even though the virtual micro-kernels will only
// ever be writing to half (real or imaginary part) at a time.
#ifndef BLIS_STACK_BUF_MAX_SIZE
#define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_MAX_NUM_REGISTERS * \
                                  BLIS_SIMD_MAX_SIZE * 2 )
#endif

// Alignment size used to align local stack buffers within macro-kernel
// functions.
#ifndef BLIS_STACK_BUF_ALIGN_SIZE
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when allocating memory via BLIS_MALLOC_USER.
// To disable heap alignment, set this to 1.
#ifndef BLIS_HEAP_ADDR_ALIGN_SIZE
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when sizing leading dimensions of memory allocated
// via BLIS_MALLOC_USER.
#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment sizes used when allocating blocks to the internal memory
// pool, via BLIS_MALLOC_POOL.
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A
#define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B
#define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C
#define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN
#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE
#endif

// Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*.
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A #define BLIS_POOL_ADDR_OFFSET_SIZE_A 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B #define BLIS_POOL_ADDR_OFFSET_SIZE_B 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C #define BLIS_POOL_ADDR_OFFSET_SIZE_C 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif #endif cython-blis-0.9.1/blis/_src/frame/include/bli_lang_defs.h000066400000000000000000000077131427272030600233040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// Language/compiler portability definitions: restrict, typeof, the
// thread-local storage keyword, and constructor/destructor attributes.

#ifndef BLIS_LANG_DEFS_H
#define BLIS_LANG_DEFS_H


// -- Undefine restrict for C++ and C89/90 --

#ifdef __cplusplus
  // Language is C++; define restrict as nothing.
  #ifndef restrict
  #define restrict
  #endif
// NOTE: if __STDC_VERSION__ is not defined, the preprocessor evaluates it
// as 0 here, so pre-C99 compilers fall through to the #else branch.
#elif __STDC_VERSION__ >= 199901L
  // Language is C99 (or later); do nothing since restrict is recognized.
#else
  // Language is pre-C99; define restrict as nothing.
  #ifndef restrict
  #define restrict
  #endif
#endif


// -- Define typeof() operator if using non-GNU compiler --

// NOTE: both branches map typeof to __typeof__. The only difference is
// that when __GNUC__ is defined, an existing typeof macro is left alone.
#ifndef __GNUC__
  #define typeof __typeof__
#else
  #ifndef typeof
  #define typeof __typeof__
  #endif
#endif


// -- BLIS Thread Local Storage Keyword --

// __thread for TLS is supported by GCC, CLANG, ICC, and IBMC.
// There is a small risk here as __GNUC__ can also be defined by some other
// compiler (other than ICC and CLANG which we know define it) that
// doesn't support __thread, as __GNUC__ is not quite unique to GCC.
// But the possibility of someone using such non-main-stream compiler
// for building BLIS is low.
// On any other compiler BLIS_THREAD_LOCAL expands to nothing, i.e. the
// annotated variable is NOT thread-local there.
#if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__)
  #define BLIS_THREAD_LOCAL __thread
#else
  #define BLIS_THREAD_LOCAL
#endif


// -- BLIS constructor/destructor function attribute --

// __attribute__((constructor/destructor)) is supported by GCC only.
// There is a small risk here as __GNUC__ can also be defined by some other
// compiler (other than ICC and CLANG which we know define it) that
// doesn't support this, as __GNUC__ is not quite unique to GCC.
// But the possibility of someone using such non-main-stream compiler
// for building BLIS is low.
// The ICC test must come first: the chain below checks __GNUC__ last, and
// (per the comment in the first branch) ICC defines __GNUC__ too.
#if defined(__ICC) || defined(__INTEL_COMPILER)
  // ICC defines __GNUC__ but doesn't support this
  #define BLIS_ATTRIB_CTOR
  #define BLIS_ATTRIB_DTOR
#elif defined(__clang__)
  // CLANG supports __attribute__, but its documentation doesn't
  // mention support for constructor/destructor. Compiling with
  // clang and testing shows that it does support.
  #define BLIS_ATTRIB_CTOR __attribute__((constructor))
  #define BLIS_ATTRIB_DTOR __attribute__((destructor))
#elif defined(__GNUC__)
  #define BLIS_ATTRIB_CTOR __attribute__((constructor))
  #define BLIS_ATTRIB_DTOR __attribute__((destructor))
#else
  #define BLIS_ATTRIB_CTOR
  #define BLIS_ATTRIB_DTOR
#endif

#endif
cython-blis-0.9.1/blis/_src/frame/include/bli_macro_defs.h000066400000000000000000000106571427272030600234650ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#ifndef BLIS_MACRO_DEFS_H
#define BLIS_MACRO_DEFS_H


// -- Concatenation macros --

// String form of the function-name prefix.
#define BLIS_FUNC_PREFIX_STR       "bli"

// We add an extra layer to the definitions of these string-pasting macros
// because sometimes it is needed if, for example, one of the PASTE
// macros is invoked with an "op" argument that is itself a macro.
// Each public macro FOO(...) forwards to a worker FOO_(...) that performs
// the actual ## pasting. The extra call level lets macro arguments be
// expanded before they are pasted.

// PASTEMACn: build "bli_" followed by n type-character arguments and op.
#define PASTEMAC0_(op)                                  bli_ ## op
#define PASTEMAC0(op)                                   PASTEMAC0_(op)

#define PASTEMAC_(ch,op)                                bli_ ## ch ## op
#define PASTEMAC(ch,op)                                 PASTEMAC_(ch,op)

#define PASTEMAC2_(ch1,ch2,op)                          bli_ ## ch1 ## ch2 ## op
#define PASTEMAC2(ch1,ch2,op)                           PASTEMAC2_(ch1,ch2,op)

#define PASTEMAC3_(ch1,ch2,ch3,op)                      bli_ ## ch1 ## ch2 ## ch3 ## op
#define PASTEMAC3(ch1,ch2,ch3,op)                       PASTEMAC3_(ch1,ch2,ch3,op)

#define PASTEMAC4_(ch1,ch2,ch3,ch4,op)                  bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
#define PASTEMAC4(ch1,ch2,ch3,ch4,op)                   PASTEMAC4_(ch1,ch2,ch3,ch4,op)

#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)              bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op)               PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)

#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)          bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op)           PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)

// PASTEBLACHK: build "bla_<op>_check".
#define PASTEBLACHK_(op)                                bla_ ## op ## _check
#define PASTEBLACHK(op)                                 PASTEBLACHK_(op)

// PASTECHn: paste n leading arguments and op with no fixed prefix.
#define PASTECH0_(op)                                   op
#define PASTECH0(op)                                    PASTECH0_(op)

#define PASTECH_(ch,op)                                 ch ## op
#define PASTECH(ch,op)                                  PASTECH_(ch,op)

#define PASTECH2_(ch1,ch2,op)                           ch1 ## ch2 ## op
#define PASTECH2(ch1,ch2,op)                            PASTECH2_(ch1,ch2,op)

#define PASTECH3_(ch1,ch2,ch3,op)                       ch1 ## ch2 ## ch3 ## op
#define PASTECH3(ch1,ch2,ch3,op)                        PASTECH3_(ch1,ch2,ch3,op)

// MKSTR stringifies its argument as written; STRINGIFY_INT goes through
// MKSTR so that a macro argument is expanded before being stringified.
#define MKSTR(s1)                                       #s1
#define STRINGIFY_INT( s )                              MKSTR( s )

// Fortran-77 name-mangling macros.
// Each macro pastes its leading arguments and name, then appends a
// trailing underscore.
// NOTE: unlike the PASTEMAC/PASTECH families, these paste directly (there
// is no second macro layer), so arguments that are themselves macros are
// not expanded before pasting.
#define PASTEF770(name)                                 name ## _
#define PASTEF77(ch1,name)                              ch1 ## name ## _
#define PASTEF772(ch1,ch2,name)                         ch1 ## ch2 ## name ## _
#define PASTEF773(ch1,ch2,ch3,name)                     ch1 ## ch2 ## ch3 ## name ## _

// -- Include other groups of macros

#include "bli_genarray_macro_defs.h"
#include "bli_gentdef_macro_defs.h"
#include "bli_gentfunc_macro_defs.h"
#include "bli_gentprot_macro_defs.h"

#include "bli_misc_macro_defs.h"
#include "bli_edge_case_macro_defs.h"
#include "bli_param_macro_defs.h"
#include "bli_obj_macro_defs.h"
#include "bli_complex_macro_defs.h"
#include "bli_scalar_macro_defs.h"
#include "bli_error_macro_defs.h"
#include "bli_blas_macro_defs.h"
#include "bli_builtin_macro_defs.h"

#include "bli_oapi_macro_defs.h"
#include "bli_tapi_macro_defs.h"

#endif
cython-blis-0.9.1/blis/_src/frame/include/bli_misc_macro_defs.h000066400000000000000000000105231427272030600244700ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#ifndef BLIS_MISC_MACRO_DEFS_H
#define BLIS_MISC_MACRO_DEFS_H

// -- Miscellaneous macros --

// min, max, abs
// NOTE: These must remain macros since we don't know the types of a and b.
// Because they are macros, each argument may be evaluated more than once;
// do not pass expressions with side effects.
// bli_abs takes the negation branch for a == 0.

#define bli_min( a, b )  ( (a) < (b) ? (a) : (b) )
#define bli_max( a, b )  ( (a) > (b) ? (a) : (b) )
#define bli_abs( a )     ( (a) <= 0 ? -(a) : (a) )

// fmin, fmax, fabs
// NOTE: These must remain macros since we don't know the types of a and b.
// bli_fabs also negates when a == 0.0, so a floating-point +0.0 argument
// yields -0.0.

#define bli_fmin( a, b ) bli_min( a, b )
#define bli_fmax( a, b ) bli_max( a, b )
#define bli_fabs( a )    ( (a) <= 0.0 ? -(a) : (a) )

// fminabs, fmaxabs
// NOTE: These must remain macros since we don't know the types of a and b.

#define bli_fminabs( a, b ) \
\
    bli_fmin( bli_fabs( a ), \
              bli_fabs( b ) )

#define bli_fmaxabs( a, b ) \
\
    bli_fmax( bli_fabs( a ), \
              bli_fabs( b ) )

// round
// Thin wrapper around round() for a double argument.

BLIS_INLINE double bli_round( double a )
{
    return round( a );
}

// round_to_mult
// Returns val rounded to the nearest multiple of mult using integer
// arithmetic: half of mult is added before the truncating division, so
// ties round up. mult must be non-zero (the expression divides by it).

BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult )
{
    return ( guint_t )
           ( ( ( ( guint_t )val +
                 ( guint_t )mult / 2
               ) / mult
             ) * mult
           );
}

// isnan, isinf
// NOTE: These must remain macros, since isinf() and isnan() are macros
// (defined in math.h) that likely depend on the type of the argument 'a'
// below.
#define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. #define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif cython-blis-0.9.1/blis/_src/frame/include/bli_oapi_ba.h000066400000000000000000000047631427272030600227560ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
  contributors may be used to endorse or promote products derived
  from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.
//
// There is no include guard: every macro is #undef'd and then redefined,
// so including this file replaces whatever values were in effect before.
// NOTE(review): BLIS_OAPI_EXPERT is not #undef'd here; if the expert
// variant of this header was included earlier in the same translation
// unit, both BLIS_OAPI_BASIC and BLIS_OAPI_EXPERT remain defined -- confirm
// that includers account for this.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; cython-blis-0.9.1/blis/_src/frame/include/bli_oapi_ex.h000066400000000000000000000046451427272030600230070ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cython-blis-0.9.1/blis/_src/frame/include/bli_oapi_macro_defs.h000066400000000000000000000034201427272030600244630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// Define the suffix to add to object API function names that include
// additional "expert" parameters.
// NOTE: this is a bare token (_ex), not a string literal, so it is meant
// to be pasted onto an identifier.
#define BLIS_OAPI_EX_SUF  _ex
cython-blis-0.9.1/blis/_src/frame/include/bli_obj_macro_defs.h000066400000000000000000001171171427272030600243160ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2016, Hewlett Packard Enterprise Development LP
   Copyright (C) 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( 
obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. 
// Scalar datatype field and its domain/precision components. Unlike the
// other datatype queries, these three read obj->info2 (not obj->info); the
// masked value is shifted down by BLIS_SCALAR_DT_SHIFT.
BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj )
{
    return ( num_t )
           ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT );
}

// NOTE: This function queries info2.
BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj )
{
    return ( dom_t )
           ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT );
}

// NOTE: This function queries info2.
BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj )
{
    return ( prec_t )
           ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT );
}

// Transposition / conjugation status. conjtrans_status returns both bits;
// onlytrans_status and conj_status each isolate one of them.
BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj )
{
    return ( trans_t )
           ( obj->info & BLIS_CONJTRANS_BITS );
}

BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj )
{
    return ( trans_t )
           ( obj->info & BLIS_TRANS_BIT );
}

BLIS_INLINE bool bli_obj_has_trans( obj_t* obj )
{
    return ( bool )
           ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS );
}

BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj )
{
    return ( bool )
           ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS );
}

BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj )
{
    return ( conj_t )
           ( obj->info & BLIS_CONJ_BIT );
}

BLIS_INLINE bool bli_obj_has_conj( obj_t* obj )
{
    return ( bool )
           ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ );
}

BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj )
{
    return ( bool )
           ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ );
}

// Stored-region (uplo) field and predicates on its value.
BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj )
{
    return ( uplo_t )
           ( obj->info & BLIS_UPLO_BITS );
}

BLIS_INLINE bool bli_obj_is_upper( obj_t* obj )
{
    return ( bool )
           ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER );
}

BLIS_INLINE bool bli_obj_is_lower( obj_t* obj )
{
    return ( bool )
           ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER );
}

BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj )
{
    return ( bool )
           ( bli_obj_is_upper( obj ) ||
             bli_obj_is_lower( obj ) );
}

BLIS_INLINE bool bli_obj_is_dense( obj_t* obj )
{
    return ( bool )
           ( bli_obj_uplo( obj ) == BLIS_BITVAL_DENSE );
}

BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj )
{
    return ( bool )
           ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS );
}

// Unit-diagonal field and predicates; has_inverted_diag tests the separate
// BLIS_INVERT_DIAG_BIT.
BLIS_INLINE diag_t bli_obj_diag( obj_t* obj )
{
    return ( diag_t )
           ( obj->info & BLIS_UNIT_DIAG_BIT );
}

BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj )
{
    return ( bool )
           ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG );
}

BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj )
{
    return ( bool )
           ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG );
}

BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj )
{
    return ( bool )
           ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG );
}

// Packing-related queries.
BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj )
{
    return ( bool )
           ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER );
}

BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj )
{
    return ( bool )
           ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER );
}

BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj )
{
    return ( pack_t )
           ( obj->info & BLIS_PACK_SCHEMA_BITS );
}

// The masked bit is converted straight to bool rather than compared.
// NOTE(review): this yields a clean true/false only if bool is a genuine
// boolean type (e.g. C99 _Bool) -- confirm against the bool definition.
BLIS_INLINE bool bli_obj_is_packed( obj_t* obj )
{
    return ( bool )
           ( obj->info & BLIS_PACK_BIT );
}

// Row/column packing is detected by comparing the BLIS_PACK_RC_BIT field
// with the XOR of the "unspecified" schema value and the rows/columns
// schema value.
BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj )
{
    return ( bool )
           ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
                                                   BLIS_BITVAL_PACKED_ROWS ) );
}

BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj )
{
    return ( bool )
           ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
                                                   BLIS_BITVAL_PACKED_COLUMNS ) );
}

BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj )
{
    return ( bool )
           ( obj->info & BLIS_PACK_PANEL_BIT );
}

BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj )
{
    return ( packbuf_t )
           ( obj->info & BLIS_PACK_BUFFER_BITS );
}

// Structure field and predicates on its value.
BLIS_INLINE struc_t bli_obj_struc( obj_t* obj )
{
    return ( struc_t )
           ( obj->info & BLIS_STRUC_BITS );
}

BLIS_INLINE bool bli_obj_is_general( obj_t* obj )
{
    return ( bool )
           ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL );
}

BLIS_INLINE bool bli_obj_is_hermitian( obj_t* obj )
{
    return ( bool )
           ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN );
}
// Structure predicates (continued): compare the structure field against
// one specific value.
BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj )
{
    return ( bool )
           ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC );
}

BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj )
{
    return ( bool )
           ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR );
}

// Info modification
//
// Each setter below rewrites one field of obj->info in place: the field's
// bits are cleared with its mask and the new value is OR'd in. The new
// value is not masked first, so callers must pass a value that fits the
// field.

// apply_trans / apply_conj XOR the given value into info, i.e. they toggle
// (rather than overwrite) the bits set in the argument.
BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( obj->info ^ trans );
}

BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( obj->info ^ conj );
}

BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans );
}

BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_TRANS_BIT ) | trans );
}

BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_CONJ_BIT ) | conj );
}

BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo );
}

BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag );
}

BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag );
}

// Storage datatype setter (no shift: the field is stored unshifted).
BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt );
}

// Target datatype setters: the value is shifted up by BLIS_TARGET_DT_SHIFT
// before being stored.
BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_TARGET_DT_BITS ) |
                  ( dt << BLIS_TARGET_DT_SHIFT ) );
}

BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) |
                  ( dt << BLIS_TARGET_DT_SHIFT ) );
}

BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) |
                  ( dt << BLIS_TARGET_DT_SHIFT ) );
}

// Execution datatype setters (shift: BLIS_EXEC_DT_SHIFT).
BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_EXEC_DT_BITS ) |
                  ( dt << BLIS_EXEC_DT_SHIFT ) );
}

BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) |
                  ( dt << BLIS_EXEC_DT_SHIFT ) );
}

BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) |
                  ( dt << BLIS_EXEC_DT_SHIFT ) );
}

// Computation datatype setters (shift: BLIS_COMP_DT_SHIFT).
BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_COMP_DT_BITS ) |
                  ( dt << BLIS_COMP_DT_SHIFT ) );
}

BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) |
                  ( dt << BLIS_COMP_DT_SHIFT ) );
}

BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_COMP_PREC_BIT ) |
                  ( dt << BLIS_COMP_DT_SHIFT ) );
}

// Scalar datatype setters: these operate on obj->info2 instead of
// obj->info (shift: BLIS_SCALAR_DT_SHIFT).

// NOTE: This function queries and modifies info2.
BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj )
{
    obj->info2 = ( objbits_t )
                 ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) |
                   ( dt << BLIS_SCALAR_DT_SHIFT ) );
}

// NOTE: This function queries and modifies info2.
BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj )
{
    obj->info2 = ( objbits_t )
                 ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) |
                   ( dt << BLIS_SCALAR_DT_SHIFT ) );
}

// NOTE: This function queries and modifies info2.
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); } BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); } BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); } // NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, // packbuf_t is stored/used from the context in order to support various // induced methods. (Though ideally the packbuf_t field would only be // present in the control tree). BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); } BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc ); } BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj ) { bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); } BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj ) { bli_obj_apply_conj( BLIS_CONJUGATE, obj ); } BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; } // Root matrix query BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) { return ( obj_t* )( obj->root ); } BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( 
bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); } // Root matrix modification BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) { obj->root = obj; } // Diagonal offset query BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) ); } // Diagonal offset modification BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off = ( doff_t )offset; } BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj ) { obj->diag_off = -(obj->diag_off); } BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off += ( doff_t )offset; } // Dimension query BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) { return ( obj->dim[ mdim ] ); } BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) : bli_obj_length( obj ) ); } BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) ); } BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 ); } // Stride/increment query BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) { return ( obj->rs ); } BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) { return ( obj->cs ); } BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) { return ( obj->is ); } BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); } // Note: The purpose of these functions is to obtain the length and width // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) ); } BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 ); } // Dimension modification BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj ) { obj->dim[ BLIS_M ] = m; } BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj ) { obj->dim[ BLIS_N ] = n; } BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) { obj->dim[ mdim ] = dim_val; } BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) { if ( bli_does_notrans( trans ) ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } else // if ( bli_does_trans( trans ) ) { bli_obj_set_length( n, obj ); bli_obj_set_width( m, obj ); } } // Stride/increment predicates // // NOTE: The following two macros differ from their non-obj counterparts // in that they do not identify m x 1 and 1 x n objects as row-stored and // column-stored, respectively, which is needed when considering packed // objects. But this is okay, since none of the invocations of these // "obj" macros are used on packed matrices. 
// BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); } // Stride/increment modification BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) { obj->rs = rs; } BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) { obj->cs = cs; } BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_row_stride( rs, obj ); bli_obj_set_col_stride( cs, obj ); } BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) { obj->is = is; } // Offset query BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) { return ( obj->off[ mdim ] ); } // Offset modification BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] = offset; } BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_set_off( BLIS_M, offm, obj ); bli_obj_set_off( BLIS_N, offn, obj ); } BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] += offset; } BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_inc_off( BLIS_M, offm, obj ); bli_obj_inc_off( BLIS_N, offn, obj ); } // Diagonal offset predicates BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) { return ( 
bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); } // Buffer address query BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) { return ( void* ) ( obj->buffer ); } // Buffer address modification BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) { obj->buffer = p; } // Bufferless scalar field query BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); } // Bufferless scalar field modification BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); } // Element size modification BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) { obj->elem_size = size; } // Packed matrix info query BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) { return ( obj->m_padded ); } BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) { return ( obj->n_padded ); } // Packed matrix info modification BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj ) { obj->m_padded = m; } BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj ) { obj->n_padded = n; } BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) { 
bli_obj_set_padded_length( m, obj ); bli_obj_set_padded_width( n, obj ); } // Packed panel info query BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) { return ( obj->m_panel ); } BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) { return ( obj->n_panel ); } BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) { return ( obj->pd ); } BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) { return ( obj->ps ); } // Packed panel info modification BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj ) { obj->m_panel = m; } BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj ) { obj->n_panel = n; } BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_panel_length( m, obj ); bli_obj_set_panel_width( n, obj ); } BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) { obj->pd = pd; } BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) { obj->ps = ps; } // stor3_t-related BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); inc_t rs_a, cs_a; inc_t rs_b, cs_b; if ( bli_obj_has_notrans( a ) ) { rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else { rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else { rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b ); } // -- User-provided information macros -- // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { return obj->pack_fn; } BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker_fn; } BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { return obj->ker_params; } // Function pointer modification 
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) { const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); bli_obj_set_pack_schema( schema_b, a ); bli_obj_set_pack_schema( schema_a, b ); } // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. BLIS_INLINE void bli_obj_induce_trans( obj_t* obj ) { // Induce transposition among basic fields. dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); if ( bli_obj_is_upper_or_lower( obj ) ) bli_obj_toggle_uplo( obj ); // Induce transposition among packed fields. dim_t m_padded = bli_obj_padded_length( obj ); dim_t n_padded = bli_obj_padded_width( obj ); dim_t m_panel = bli_obj_panel_length( obj ); dim_t n_panel = bli_obj_panel_width( obj ); bli_obj_set_padded_dims( n_padded, m_padded, obj ); bli_obj_set_panel_dims( n_panel, m_panel, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj ) { // NOTE: This function is only used in situations where the matrices // are guaranteed to not have structure or be packed. // Induce transposition among basic fields. 
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif cython-blis-0.9.1/blis/_src/frame/include/bli_param_macro_defs.h000066400000000000000000001020271427272030600246360ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { 
*trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); }

// Return whether at least one of the two dimensions equals 1.
BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n )
{
	return ( bool ) ( m == 1 || n == 1 );
}

// Return whether m equals 1 (n is not inspected).
BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n )
{
	return ( bool ) ( m == 1 );
}

// Return whether n equals 1 (m is not inspected).
BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n )
{
	return ( bool ) ( n == 1 );
}

// Store m into *dim when side is "left" (per bli_is_left()); otherwise
// store n.
BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim )
{
	if ( bli_is_left( side ) ) *dim = m;
	else                       *dim = n;
}

// Copy (m, n) into (*mt, *nt), exchanging the two values unless
// bli_does_notrans( trans ) holds.
BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt )
{
	if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; }
	else                             { *mt = n; *nt = m; }
}

// Same as bli_set_dims_with_trans(), but the row/column strides (rs, cs) are
// copied to (*rst, *cst) and exchanged together with the dimensions.
BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans,
                                               dim_t  m,  dim_t  n,  inc_t  rs,  inc_t  cs,
                                               dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst )
{
	if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; }
	else                             { *mt = n; *nt = m; *rst = cs; *cst = rs; }
}

// blocksize-related

// Forward variant: the block size at index i is b_alg, capped by the
// dim - i elements that remain.
BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg )
{
	return ( dim_t ) ( bli_min( b_alg, dim - i ) );
}

// Backward variant: only the call with i == 0 receives the remainder
// dim % b_alg (and only when that remainder is non-zero); every other call
// receives b_alg.
BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg )
{
	return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg
	                                              : b_alg );
}

// stride-related

// Pick the stride to use as a vector increment. Without transposition this
// is cs when m == 1 and rs otherwise; when bli_does_notrans( trans ) is
// false the two choices are reversed.
BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs )
{
	return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs )
	                                             : ( m == 1 ? rs : cs ) );
}

// Row storage test: the column stride has magnitude 1 (rs is not inspected).
BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs )
{
	return ( bool ) ( bli_abs( cs ) == 1 );
}

// Column storage test: the row stride has magnitude 1 (cs is not inspected).
BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs )
{
	return ( bool ) ( bli_abs( rs ) == 1 );
}

// Stricter row storage test: cs must be exactly +1 and, in addition, either
// rs > 1 or n == 1. m is not inspected.
BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs )
{
	return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) );
}

// Stricter column storage test: rs must be exactly +1 and, in addition,
// either cs > 1 or m == 1. n is not inspected.
BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs )
{
	return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) );
}

// General storage test: neither stride has magnitude 1.
BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs )
{
	return ( bool ) ( bli_abs( rs ) != 1 &&
	                  bli_abs( cs ) != 1 );
}

// Return whether |cs| < |rs|. When the two magnitudes are equal the tie is
// broken by the dimensions: the result is then n < m.
BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs )
{
	return ( bool )
	       ( bli_abs( cs ) == bli_abs( rs ) ? n < m
	                                        : bli_abs( cs ) < bli_abs( rs ) );
}

// Return whether |rs| < |cs|. When the two magnitudes are equal the tie is
// broken by the dimensions: the result is then m < n.
BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs )
{
	return ( bool )
	       ( bli_abs( rs ) == bli_abs( cs ) ? m < n
	                                        : bli_abs( rs ) < bli_abs( cs ) );
}

// Return whether the increment differs from 1.
BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 )
{
	return ( bool ) ( s1 != 1 );
}

// Return whether at least one of the two increments differs from 1.
BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 )
{
	return ( bool ) ( s1 != 1 || s2 != 1 );
}

// Return whether at least one of the three increments differs from 1.
BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 )
{
	return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 );
}

// diag offset-related

// Negate the diagonal offset in place.
BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff )
{
	*diagoff = -(*diagoff);
}

// Adjust the diagonal offset in place by one: -1 for upper, +1 for lower.
// Any other uplo value leaves *diagoff untouched.
BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff )
{
	if      ( bli_is_upper( uplo ) ) *diagoff -= 1;
	else if ( bli_is_lower( uplo ) ) *diagoff += 1;
}

// Inverse of bli_shift_diag_offset_to_grow_uplo(): +1 for upper, -1 for
// lower. Any other uplo value leaves *diagoff untouched.
BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff )
{
	if      ( bli_is_upper( uplo ) ) *diagoff += 1;
	else if ( bli_is_lower( uplo ) ) *diagoff -= 1;
}

// Return the diagonal offset negated when bli_does_trans( trans ) holds,
// and unchanged otherwise.
BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff )
{
	return ( doff_t ) ( bli_does_trans( trans ) ? -diagoff : diagoff );
}

// Compare -diagoff against one dimension: m without transposition, n when
// bli_does_trans( trans ) holds. True when that dimension is <= -diagoff.
BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n )
{
	return ( bool )
	       ( bli_does_trans( trans )
	         ? ( ( doff_t )n <= -diagoff )
	         : ( ( doff_t )m <= -diagoff ) );
}

// Compare diagoff against one dimension: n without transposition, m when
// bli_does_trans( trans ) holds. True when that dimension is <= diagoff.
BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n )
{
	return ( bool )
	       ( bli_does_trans( trans )
	         ? ( ( doff_t )m <= diagoff )
	         : ( ( doff_t )n <= diagoff ) );
}

// True when either the strictly-above or the strictly-below test holds.
BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n )
{
	return ( bool )
	       ( bli_is_strictly_above_diag( diagoff, trans, m, n ) ||
	         bli_is_strictly_below_diag( diagoff, trans, m, n ) );
}

// True for (upper AND strictly above the diagonal) or
// (lower AND strictly below the diagonal).
BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n )
{
	return ( bool )
	( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ||
	  ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) );
}

// True for (upper AND strictly below the diagonal) or
// (lower AND strictly above the diagonal).
BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n )
{
	return ( bool )
	( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ||
	  ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) );
}

// The _n variants below take no trans argument; each uses the same
// comparison as the non-transposed branch of its trans-aware counterpart.

// True when m <= -diagoff (n is not inspected).
BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n )
{
	return ( bool )
	       ( ( doff_t )m <= -diagoff );
}

// True when n <= diagoff (m is not inspected).
BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n )
{
	return ( bool )
	       ( ( doff_t )n <= diagoff );
}

// True when neither the strictly-above nor the strictly-below test holds.
BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n )
{
	return ( bool )
	       ( !bli_is_strictly_above_diag_n( diagoff, m, n ) &&
	         !bli_is_strictly_below_diag_n( diagoff, m, n ) );
}

// Logical complement of bli_intersects_diag_n().
BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n )
{
	return ( bool )
	       ( bli_is_strictly_above_diag_n( diagoff, m, n ) ||
	         bli_is_strictly_below_diag_n( diagoff, m, n ) );
}

// True for (upper AND strictly above the diagonal) or
// (lower AND strictly below the diagonal).
BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n )
{
	return ( bool )
	( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ||
	  ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) );
}

// True for (upper AND strictly below the diagonal) or
// (lower AND strictly above the diagonal).
BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n )
{
	return ( bool )
	( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ||
	  ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) );
}

// pruning-related

// Each pruning helper first zeroes its offset-increment output and then
// conditionally shrinks one dimension in place. n is not modified by the
// _top_l variant; m is not modified by the _left_u variant.

BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc )
{
	*offm_inc = 0;

	// If the diagonal intersects the left side of the matrix,
	// ignore the area above that intersection.
	if ( *diagoff < 0 )
	{
		// *diagoff is negative here, so m shrinks by |*diagoff| and the
		// same amount is reported as the row offset increment.
		*m        = *m + *diagoff;
		*offm_inc =    - *diagoff;
		*diagoff  = 0;
	}
}

BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc )
{
	// No column offset is ever reported by this variant; only *n may shrink.
	*offn_inc = 0;

	// If the diagonal intersects the bottom side of the matrix,
	// ignore the area to the right of that intersection.
	if ( *n > *diagoff + *m )
	{
		*n = *diagoff + *m;
	}
}

BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc )
{
	*offn_inc = 0;

	// If the diagonal intersects the top side of the matrix,
	// ignore the area to the left of that intersection.
	if ( *diagoff > 0 )
	{
		// n shrinks by *diagoff and the same amount is reported as the
		// column offset increment.
		*n        = *n - *diagoff;
		*offn_inc =    + *diagoff;
		*diagoff  = 0;
	}
}

BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc )
{
	// No row offset is ever reported by this variant; only *m may shrink.
	*offm_inc = 0;

	// If the diagonal intersects the right side of the matrix,
	// ignore the area below that intersection.
	if ( *m > -(*diagoff) + *n )
	{
		*m = -(*diagoff) + *n;
	}
}

// thread range-related

// Replace *diagoff with *n - *diagoff - *m and toggle *uplo. The dimensions
// are only read, not modified.
BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n )
{
	*diagoff = *n - *diagoff - *m;
	bli_toggle_uplo( uplo );
}

// Swap *m and *n, negate *diagoff, and toggle *uplo.
BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n )
{
	bli_swap_dims( m, n );
	bli_negate_diag_offset( diagoff );
	bli_toggle_uplo( uplo );
}

// Map the pair (*start, *end) to (n - *end, n - *start), in place.
BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end )
{
	// Both new values are computed before either output is overwritten.
	dim_t start2 = n - *start;
	dim_t end2   = n - *end;
	*start = end2;
	*end   = start2;
}

// mdim_t-related

// Return whether mdim is BLIS_M.
BLIS_INLINE bool bli_is_m_dim( mdim_t mdim )
{
	return ( bool ) ( mdim == BLIS_M );
}

// Return whether mdim is BLIS_N.
BLIS_INLINE bool bli_is_n_dim( mdim_t mdim )
{
	return ( bool ) ( mdim == BLIS_N );
}

// Return BLIS_N for BLIS_M, and BLIS_M for any other value.
BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim )
{
	return ( mdim_t ) ( mdim == BLIS_M ? BLIS_N : BLIS_M );
}

// In-place form of bli_dim_toggled().
BLIS_INLINE void bli_toggle_dim( mdim_t* mdim )
{
	*mdim = bli_dim_toggled( *mdim );
}

// stor3_t-related

// Encode the storage of C, A and B as a stor3_t id. Bit weights are
// 4 for C, 2 for A and 1 for B; a bit is set when bli_is_col_stored()
// reports true for that matrix's strides.
BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c,
                                            inc_t rs_a, inc_t cs_a,
                                            inc_t rs_b, inc_t cs_b )
{
	// If any matrix is general-stored, return the stor3_t id for the
	// general-purpose sup microkernel.
	if ( bli_is_gen_stored( rs_c, cs_c ) ||
	     bli_is_gen_stored( rs_a, cs_a ) ||
	     bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX;

	// Otherwise, compute and return the stor3_t id as follows.
	const bool c_is_col = bli_is_col_stored( rs_c, cs_c );
	const bool a_is_col = bli_is_col_stored( rs_a, cs_a );
	const bool b_is_col = bli_is_col_stored( rs_b, cs_b );

	return ( stor3_t )( 4 * c_is_col +
	                    2 * a_is_col +
	                    1 * b_is_col );
}

// Map a stor3_t id to another id via the lookup table below (the "#if 1"
// branch is the one compiled). id is used directly as the table index and
// is not range-checked.
BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id )
{
#if 1
	stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
	=
	{
	  ( stor3_t )7,  // BLIS_RRR = 0  ->  BLIS_CCC = 7
	  ( stor3_t )5,  // BLIS_RRC = 1  ->  BLIS_CRC = 5
	  ( stor3_t )6,  // BLIS_RCR = 2  ->  BLIS_CCR = 6
	  ( stor3_t )4,  // BLIS_RCC = 3  ->  BLIS_CRR = 4
	  ( stor3_t )3,  // BLIS_CRR = 4  ->  BLIS_RCC = 3
	  ( stor3_t )1,  // BLIS_CRC = 5  ->  BLIS_RRC = 1
	  ( stor3_t )2,  // BLIS_CCR = 6  ->  BLIS_RCR = 2
	  ( stor3_t )0,  // BLIS_CCC = 7  ->  BLIS_RRR = 0
	};

	return map[id];
#else
	return   ( ( id & 0x4 ) ^ 0x4 )        | // flip c bit
	       ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position
	       ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 );  // flip a bit and move to b position
#endif
}

// Toggle bit 0x1 of the id (the "#else" branch is the one compiled; the
// table under "#if 0" describes the same mapping).
// NOTE(review): bli_stor3_from_strides() gives B the weight 1 and A the
// weight 2, so 0x1 looks like the B bit even though this function is named
// "transa" -- confirm the naming against the callers.
BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id )
{
#if 0
	stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
	=
	{
	  ( stor3_t )1,  // BLIS_RRR = 0  ->  BLIS_RRC = 1
	  ( stor3_t )0,  // BLIS_RRC = 1  ->  BLIS_RRR = 0
	  ( stor3_t )3,  // BLIS_RCR = 2  ->  BLIS_RCC = 3
	  ( stor3_t )2,  // BLIS_RCC = 3  ->  BLIS_RCR = 2
	  ( stor3_t )5,  // BLIS_CRR = 4  ->  BLIS_CRC = 5
	  ( stor3_t )4,  // BLIS_CRC = 5  ->  BLIS_CRR = 4
	  ( stor3_t )7,  // BLIS_CCR = 6  ->  BLIS_CCC = 7
	  ( stor3_t )6,  // BLIS_CCC = 7  ->  BLIS_CCR = 6
	};

	return map[id];
#else
	return ( stor3_t )( id ^ 0x1 );
#endif
}

// Toggle bit 0x2 of the id (the "#else" branch is the one compiled; the
// table under "#if 0" describes the same mapping).
// NOTE(review): bli_stor3_from_strides() gives A the weight 2, so 0x2 looks
// like the A bit even though this function is named "transb" -- confirm the
// naming against the callers.
BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id )
{
#if 0
	stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
	=
	{
	  ( stor3_t )2,  // BLIS_RRR = 0  ->  BLIS_RCR = 2
	  ( stor3_t )3,  // BLIS_RRC = 1  ->  BLIS_RCC = 3
	  ( stor3_t )0,  // BLIS_RCR = 2  ->  BLIS_RRR = 0
	  ( stor3_t )1,  // BLIS_RCC = 3  ->  BLIS_RRC = 1
	  ( stor3_t )6,  // BLIS_CRR = 4  ->  BLIS_CCR = 6
	  ( stor3_t )7,  // BLIS_CRC = 5  ->  BLIS_CCC = 7
	  ( stor3_t )4,  // BLIS_CCR = 6  ->  BLIS_CRR = 4
	  ( stor3_t )5,  // BLIS_CCC = 7  ->  BLIS_CRC = 5
	};

	return map[id];
#else
	return ( stor3_t )( id ^ 0x2 );
#endif
}

//
index-related BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == n_iter - 1 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != n_iter - 1 || n_left == 0 ); } BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == 0 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != 0 || n_left == 0 ); } BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 ); } BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) ); } BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { #ifdef BLIS_ENABLE_JRIR_SLAB return bli_is_last_iter_sl( i, end_iter, tid, nth ); #else // BLIS_ENABLE_JRIR_RR return bli_is_last_iter_rr( i, end_iter, tid, nth ); #endif } // packbuf_t-related BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type ) { return ( guint_t ) ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT ); } // pack_t-related BLIS_INLINE bool bli_is_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_is_row_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_is_col_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_is_panel_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE bool bli_is_1r_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R ); } BLIS_INLINE bool bli_is_1e_packed( pack_t schema ) { return ( bool ) ( ( schema & 
BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E ); } BLIS_INLINE bool bli_is_1m_packed( pack_t schema ) { return ( bool ) ( bli_is_1r_packed( schema ) || bli_is_1e_packed( schema ) ); } BLIS_INLINE bool bli_is_nat_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 ); } BLIS_INLINE bool bli_is_ind_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 ); } BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema ) { return ( guint_t ) ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT ); } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument. BLIS_INLINE void bli_set_dims_incs_uplo_1m ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument (without column-wise stride optimization). BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. 
if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions and increments for TWO matrix arguments. 
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif cython-blis-0.9.1/blis/_src/frame/include/bli_pragma_macro_defs.h000066400000000000000000000051771427272030600250150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* NOTE: The following code is based on [1]. [1] https://github.com/jeffhammond/nwchem-tce-triples-kernels/blob/master/src/pragma_vendor.h */ #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif cython-blis-0.9.1/blis/_src/frame/include/bli_sbox.h000066400000000000000000000037631427272030600223360ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SBOX_H #define BLIS_SBOX_H // Each sandbox must have a bli_sandbox.h file present somewhere inside. // If a sandbox was enabled at configure-time, we need to #include its // header file here so that it will get pulled into blis.h when it is // flattened into a monolithic header. #ifdef BLIS_ENABLE_SANDBOX #include "bli_sandbox.h" #endif #endif cython-blis-0.9.1/blis/_src/frame/include/bli_scalar_macro_defs.h000066400000000000000000000132021427272030600247770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCALAR_MACRO_DEFS_H #define BLIS_SCALAR_MACRO_DEFS_H // -- Assignment/Accessor macros -- // NOTE: This macro is defined first since some of the other scalar macros // use it to abstract away the method used to assign complex values (ie: // whether fields of a struct are set directly or whether native C99 // assignment is used). 
#include "bli_sets.h" // sets both real and imaginary components // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. #include "bli_setrs.h" // sets real component only #include "bli_setis.h" // sets imaginary component only // NOTE: This macro also needs to be defined early on since it determines // how real and imaginary components are accessed (ie: whether the fields // of a struct are read directly or whether native C99 functions are used.) #include "bli_gets.h" // -- Scalar constant initialization macros -- #include "bli_constants.h" // -- Separated scalar macros (separated real/imaginary values) -- #include "bli_absq2ris.h" #include "bli_abval2ris.h" #include "bli_addris.h" #include "bli_addjris.h" #include "bli_add3ris.h" #include "bli_axpbyris.h" #include "bli_axpbyjris.h" #include "bli_axpyris.h" #include "bli_axpyjris.h" #include "bli_axmyris.h" #include "bli_conjris.h" #include "bli_copyris.h" #include "bli_copyjris.h" #include "bli_copycjris.h" #include "bli_eqris.h" #include "bli_invertris.h" #include "bli_invscalris.h" #include "bli_invscaljris.h" #include "bli_neg2ris.h" #include "bli_scalris.h" #include "bli_scaljris.h" #include "bli_scalcjris.h" #include "bli_scal2ris.h" #include "bli_scal2jris.h" #include "bli_set0ris.h" #include "bli_sqrt2ris.h" #include "bli_subris.h" #include "bli_subjris.h" #include "bli_swapris.h" #include "bli_xpbyris.h" #include "bli_xpbyjris.h" // Inlined scalar macros in loops #include "bli_scal2ris_mxn.h" #include "bli_scalris_mxn_uplo.h" // -- Conventional scalar macros (paired real/imaginary values) -- #include "bli_absq2s.h" #include "bli_abval2s.h" #include "bli_adds.h" #include "bli_addjs.h" #include "bli_add3s.h" #include "bli_axpbys.h" #include "bli_axpbyjs.h" #include "bli_axpys.h" #include "bli_axpyjs.h" #include "bli_axmys.h" #include "bli_conjs.h" #include "bli_copys.h" #include "bli_copyjs.h" #include "bli_copycjs.h" 
#include "bli_copynzs.h" #include "bli_copyjnzs.h" #include "bli_dots.h" #include "bli_dotjs.h" #include "bli_eq.h" #include "bli_fprints.h" #include "bli_inverts.h" #include "bli_invscals.h" #include "bli_invscaljs.h" #include "bli_neg2s.h" #include "bli_rands.h" #include "bli_randnp2s.h" #include "bli_scals.h" #include "bli_scaljs.h" #include "bli_scalcjs.h" #include "bli_scal2s.h" #include "bli_scal2js.h" #include "bli_set0s.h" #include "bli_set1s.h" #include "bli_seti0s.h" #include "bli_sqrt2s.h" #include "bli_subs.h" #include "bli_subjs.h" #include "bli_swaps.h" #include "bli_xpbys.h" #include "bli_xpbyjs.h" // Inlined scalar macros in loops #include "bli_adds_mxn.h" #include "bli_adds_mxn_uplo.h" #include "bli_set0s_mxn.h" #include "bli_copys_mxn.h" #include "bli_scal2s_mxn.h" #include "bli_xpbys_mxn.h" #include "bli_xpbys_mxn_uplo.h" // -- "broadcast B" scalar macros -- #include "bli_bcastbbs_mxn.h" #include "bli_scal2bbs_mxn.h" #include "bli_set0bbs_mxn.h" // -- 1m-specific scalar macros -- // 1e #include "bli_copy1es.h" #include "bli_copyj1es.h" #include "bli_invert1es.h" #include "bli_scal1es.h" #include "bli_scal21es.h" #include "bli_scal2j1es.h" // 1r #include "bli_copy1rs.h" #include "bli_copyj1rs.h" #include "bli_invert1rs.h" #include "bli_scal1rs.h" #include "bli_scal21rs.h" #include "bli_scal2j1rs.h" // 1m (1e or 1r) #include "bli_invert1ms_mxn_diag.h" #include "bli_scal1ms_mxn.h" #include "bli_scal21ms_mxn.h" #include "bli_scal21ms_mxn_diag.h" #include "bli_scal21ms_mxn_uplo.h" #include "bli_set1ms_mxn.h" #include "bli_set1ms_mxn_diag.h" #include "bli_set1ms_mxn_uplo.h" #include "bli_seti01ms_mxn_diag.h" #endif cython-blis-0.9.1/blis/_src/frame/include/bli_system.h000066400000000000000000000100431427272030600226740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include #include #include #include #include #include #include #include // Determine the compiler (hopefully) and define conveniently named macros // accordingly. 
#if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. #if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). #if BLIS_OS_WINDOWS #include #elif BLIS_OS_OSX #include #else //#include #include #endif #endif cython-blis-0.9.1/blis/_src/frame/include/bli_tapi_ba.h000066400000000000000000000047621427272030600227620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. 
#undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; cython-blis-0.9.1/blis/_src/frame/include/bli_tapi_ex.h000066400000000000000000000046441427272030600230130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cython-blis-0.9.1/blis/_src/frame/include/bli_tapi_macro_defs.h000066400000000000000000000034171427272030600244760ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Define the suffix to add to typed API function names that include // additional "expert" parameters. #define BLIS_TAPI_EX_SUF _ex cython-blis-0.9.1/blis/_src/frame/include/bli_type_defs.h000066400000000000000000001253151427272030600233430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include #include #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// NOTE(review): the guard used to test only BLIS_BLAS_INT_SIZE, whereas the
// BLAS integer width is spelled BLIS_BLAS_INT_TYPE_SIZE where f77_int is
// selected further down in this header. Both spellings are accepted here; an
// identifier that is not defined evaluates to 0 inside #if, so builds that
// define only one of the two names (or neither) keep working.
#if ( BLIS_BLAS_INT_SIZE == 64 ) || ( BLIS_BLAS_INT_TYPE_SIZE == 64 )
  #undef  BLIS_INT_TYPE_SIZE
  #define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested.
// Any value other than 32 or 64 (including "not defined") selects the
// platform's long int.
#if   BLIS_INT_TYPE_SIZE == 32
  typedef           int32_t  gint_t;
  typedef          uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
  typedef           int64_t  gint_t;
  typedef          uint64_t guint_t;
#else
  typedef   signed long int  gint_t;
  typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h.
#ifndef TRUE
  #define TRUE  true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
#define BLIS_SIZEOF_S      4  // sizeof(float)
#define BLIS_SIZEOF_D      8  // sizeof(double)
#define BLIS_SIZEOF_C      8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z      16 // sizeof(dcomplex)

// -- Complex types --

// scomplex and dcomplex are either the C99 native complex types (when the
// configuration defines BLIS_ENABLE_C99_COMPLEX) or plain structs with
// 'real' and 'imag' members.
#ifdef BLIS_ENABLE_C99_COMPLEX

  #if __STDC_VERSION__ >= 199901L
    // complex.h provides the 'complex' type specifier macro used below.
    #include <complex.h>

    // Typedef official complex types to BLIS complex type names.
    typedef  float complex scomplex;
    typedef double complex dcomplex;
  #else
    #error "Configuration requested C99 complex types, but C99 does not appear to be supported."
  #endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_SCOMPLEX
  #define _DEFINED_SCOMPLEX
  typedef struct scomplex
  {
      float real;
      float imag;
  } scomplex;
  #endif

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_DCOMPLEX
  #define _DEFINED_DCOMPLEX
  typedef struct dcomplex
  {
      double real;
      double imag;
  } dcomplex;
  #endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any value other than 32 or 64 (including "not defined") selects long int.
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc.
// function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).

//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );


//
// -- BLIS info bit field offsets ----------------------------------------------
//

/*
  info field description

  bit(s)   purpose
  -------  -------
   2 ~ 0   Stored numerical datatype
           - 0: domain    (0 == real, 1 == complex)
           - 1: precision (0 == single, 1 == double)
           - 2: special   (100 = int; 101 = const)
       3   Transposition required [during pack]?
       4   Conjugation required [during pack]?
   7 ~ 5   Part of matrix stored:
           - 5: strictly upper triangular
           - 6: diagonal
           - 7: strictly lower triangular
       8   Implicit unit diagonal?
       9   Invert diagonal required [during pack]?
  12 ~ 10  Target numerical datatype
           - 10: domain    (0 == real, 1 == complex)
           - 11: precision (0 == single, 1 == double)
           - 12: used to encode integer, constant types
  15 ~ 13  Execution numerical datatype
           - 13: domain    (0 == real, 1 == complex)
           - 14: precision (0 == single, 1 == double)
           - 15: used to encode integer, constant types
  22 ~ 16  Packed type/status
           - 0 0000 00: not packed
           - 1 0000 00: packed (unspecified; by rows, columns, or vector)
           - 1 0000 00: packed by rows
           - 1 0000 01: packed by columns
           - 1 0000 10: packed by row panels
           - 1 0000 11: packed by column panels
           - 1 0001 10: packed by 1m expanded row panels
           - 1 0001 11: packed by 1m expanded column panels
           - 1 0010 10: packed by 1m reordered row panels
           - 1 0010 11: packed by 1m reordered column panels
       23  Packed panel order if upper-stored
           - 0 == forward order if upper
           - 1 == reverse order if upper
       24  Packed panel order if lower-stored
           - 0 == forward order if lower
           - 1 == reverse order if lower
  26 ~ 25  Packed buffer type
           - 0 == block of A
           - 1 == panel of B
           - 2 == panel of C
           - 3 == general use
  28 ~ 27  Structure type
           - 0 == general
           - 1 == Hermitian
           - 2 == symmetric
           - 3 == triangular
  31 ~ 29  Computation numerical datatype
           - 29: domain    (0 == real, 1 == complex)
           - 30: precision (0 == single, 1 == double)
           - 31: used to encode integer, constant types

  info2 field description

  bit(s)   purpose
  -------  -------
   2 ~ 0   Scalar storage numerical datatype
           - 0: domain    (0 == real, 1 == complex)
           - 1: precision (0 == single, 1 == double)
           - 2: used to encode integer, constant types
*/

// Each *_SHIFT below is the index of the lowest bit of the corresponding
// field in the table above.

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT               0
#define   BLIS_SCALAR_DOMAIN_SHIFT         0
#define   BLIS_SCALAR_PREC_SHIFT           1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is the field's all-ones pattern moved to the field's offset.

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
// NOTE: 0x7 << 29 sets bit 31, which a signed int cannot represent where int
// is 32 bits wide; left-shifting a signed value out of range is undefined
// behavior in C. An unsigned literal is used for this one mask so that the
// shift is well defined. The bit pattern (0xE0000000) is unchanged.
#define BLIS_COMP_DT_BITS                  ( 0x7u << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )


//
// -- BLIS enumerated type value definitions -----------------------------------
//

// The BLIS_BITVAL_* constants are the in-place (already shifted) values that
// the enumerated types further down take on.

#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )


//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

typedef enum
{
	BLIS_NO_TRANSPOSE      = 0x0,
	BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
	BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
	BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

typedef enum
{
	BLIS_NO_CONJUGATE      = 0x0,
	BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

typedef enum
{
	BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
	BLIS_LOWER             = BLIS_BITVAL_LOWER,
	BLIS_UPPER             = BLIS_BITVAL_UPPER,
	BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

typedef enum
{
	BLIS_LEFT              = 0x0,
	BLIS_RIGHT
} side_t;

typedef enum
{
	BLIS_NONUNIT_DIAG      = 0x0,
	BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

typedef enum
{
	BLIS_NO_INVERT_DIAG    = 0x0,
	BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

typedef enum
{
	BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
	BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
	BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
	BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;


// -- Data type --

typedef enum
{
	BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
	BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
	BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
	BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
	BLIS_INT               = BLIS_BITVAL_INT_TYPE,
	BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
	BLIS_DT_LO             = BLIS_FLOAT,
	BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

typedef enum
{
	BLIS_REAL              = BLIS_BITVAL_REAL,
	BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

typedef enum
{
	BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
	BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;


// -- Pack schema type --

typedef enum
{
	BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
	BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
	BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
	BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
	BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
	BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
	BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
	BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
	BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. 
#define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, 
BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // 
- keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. } bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. 
// Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. #include "bli_pthread.h" // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). 
opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. //num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. 
// Holds one value in each of five representations: float, double,
// scomplex, dcomplex and gint_t. The bli_obj_init_constdata() macro
// further down produces an initializer for this struct.
typedef struct constdata_s
{
    float s;
    double d;
    scomplex c;
    dcomplex z;
    gint_t i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Type of the pack callback stored in obj_t::pack_fn. It returns nothing
// and receives two objects (a, ap) plus the context, runtime, control-tree
// node and thread-info pointers.
typedef void (*obj_pack_fn_t)
     (
       struct obj_s* a,
       struct obj_s* ap,
       struct cntx_s* cntx,
       struct rntm_s* rntm,
       struct cntl_s* cntl,
       struct thrinfo_s* thread
     );

// Type of the kernel callback stored in obj_t::ker_fn. Same trailing
// parameters as obj_pack_fn_t, but it takes three objects (a, b, c).
typedef void (*obj_ker_fn_t)
     (
       struct obj_s* a,
       struct obj_s* b,
       struct obj_s* c,
       struct cntx_s* cntx,
       struct rntm_s* rntm,
       struct cntl_s* cntl,
       struct thrinfo_s* thread
     );

// The BLIS object. NOTE: the initializer macros and the two
// bli_obj_init_*() copy functions below enumerate these fields one by one,
// so they must be updated whenever a field is added, removed or renamed.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t off[2];
    dim_t dim[2];
    doff_t diag_off;

    objbits_t info;
    objbits_t info2;
    siz_t elem_size;

    void* buffer;
    inc_t rs;
    inc_t cs;
    inc_t is;

    // Bufferless scalar storage
    atom_t scalar;

    // Pack-related fields
    dim_t m_padded; // m dimension of matrix, including any padding
    dim_t n_padded; // n dimension of matrix, including any padding
    inc_t ps; // panel stride (distance to next panel)
    inc_t pd; // panel dimension (the "width" of a panel:
              // usually MR or NR)
    dim_t m_panel; // m dimension of a "full" panel
    dim_t n_panel; // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void* pack_params;
    obj_ker_fn_t ker_fn;
    void* ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)
//
// BLIS_OBJECT_INITIALIZER and BLIS_OBJECT_INITIALIZER_1X1 are identical
// except for .dim, which is { 0, 0 } in the former and { 1, 1 } in the
// latter.

#define BLIS_OBJECT_INITIALIZER \
{ \
    .root = NULL, \
\
    .off = { 0, 0 }, \
    .dim = { 0, 0 }, \
    .diag_off = 0, \
\
    .info = 0x0 | BLIS_BITVAL_DENSE | \
    BLIS_BITVAL_GENERAL, \
    .info2 = 0x0, \
    .elem_size = sizeof( float ), /* this is changed later. */ \
\
    .buffer = NULL, \
    .rs = 0, \
    .cs = 0, \
    .is = 1, \
\
    .scalar = { 0.0, 0.0 }, \
\
    .m_padded = 0, \
    .n_padded = 0, \
    .ps = 0, \
    .pd = 0, \
    .m_panel = 0, \
    .n_panel = 0, \
\
    .pack_fn = NULL, \
    .pack_params = NULL, \
    .ker_fn = NULL, \
    .ker_params = NULL \
}

#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root = NULL, \
\
    .off = { 0, 0 }, \
    .dim = { 1, 1 }, \
    .diag_off = 0, \
\
    .info = 0x0 | BLIS_BITVAL_DENSE | \
    BLIS_BITVAL_GENERAL, \
    .info2 = 0x0, \
    .elem_size = sizeof( float ), /* this is changed later. */ \
\
    .buffer = NULL, \
    .rs = 0, \
    .cs = 0, \
    .is = 1, \
\
    .scalar = { 0.0, 0.0 }, \
\
    .m_padded = 0, \
    .n_padded = 0, \
    .ps = 0, \
    .pd = 0, \
    .m_panel = 0, \
    .n_panel = 0, \
\
    .pack_fn = NULL, \
    .pack_params = NULL, \
    .ker_fn = NULL, \
    .ker_params = NULL \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.

// Copies every field of obj_t from a (source) to b (destination), one
// assignment per field. Pointer members (root, buffer, pack_params,
// ker_params and the two callbacks) are copied as pointers; nothing they
// refer to is duplicated.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    b->root = a->root;

    b->off[0] = a->off[0];
    b->off[1] = a->off[1];
    b->dim[0] = a->dim[0];
    b->dim[1] = a->dim[1];
    b->diag_off = a->diag_off;

    b->info = a->info;
    b->info2 = a->info2;
    b->elem_size = a->elem_size;

    b->buffer = a->buffer;
    b->rs = a->rs;
    b->cs = a->cs;
    b->is = a->is;

    b->scalar = a->scalar;

    //b->pack_mem = a->pack_mem;
    b->m_padded = a->m_padded;
    b->n_padded = a->n_padded;
    b->ps = a->ps;
    b->pd = a->pd;
    b->m_panel = a->m_panel;
    b->n_panel = a->n_panel;

    b->pack_fn = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn = a->ker_fn;
    b->ker_params = a->ker_params;
}

// Same field-by-field copy from a (source) to b (destination) as
// bli_obj_init_full_shallow_copy_of(), except that dim[0] and dim[1] are
// deliberately NOT copied: b keeps whatever dimensions it already holds
// until the caller overwrites them.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    b->root = a->root;

    b->off[0] = a->off[0];
    b->off[1] = a->off[1];
    // Avoid copying m and n since they will be overwritten.
    //b->dim[0] = a->dim[0];
    //b->dim[1] = a->dim[1];
    b->diag_off = a->diag_off;

    b->info = a->info;
    b->info2 = a->info2;
    b->elem_size = a->elem_size;

    b->buffer = a->buffer;
    b->rs = a->rs;
    b->cs = a->cs;
    b->is = a->is;

    b->scalar = a->scalar;

    // Avoid copying pack_mem entry.
    // FGVZ: You should probably make sure this is right.
    //b->pack_mem = a->pack_mem;
    b->m_padded = a->m_padded;
    b->n_padded = a->n_padded;
    b->ps = a->ps;
    b->pd = a->pd;
    b->m_panel = a->m_panel;
    b->n_panel = a->n_panel;

    b->pack_fn = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn = a->ker_fn;
    b->ker_params = a->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
//
// bli_obj_init_const( buffer0 ) expands to an obj_t initializer describing
// a 1x1 object (dim = { 1, 1 }, rs = cs = is = 1) whose buffer is buffer0,
// whose element size is sizeof( constdata_t ), and whose info bits include
// BLIS_BITVAL_CONST_TYPE. Members not named here (scalar, the pack-related
// fields and the user-customizable fields) receive the implicit zero
// initialization that C gives to omitted designated-initializer members.

#define bli_obj_init_const( buffer0 ) \
{ \
    .root = NULL, \
\
    .off = { 0, 0 }, \
    .dim = { 1, 1 }, \
    .diag_off = 0, \
\
    .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \
    BLIS_BITVAL_DENSE | \
    BLIS_BITVAL_GENERAL, \
    .info2 = 0x0, \
    .elem_size = sizeof( constdata_t ), \
\
    .buffer = buffer0, \
    .rs = 1, \
    .cs = 1, \
    .is = 1 \
}

// bli_obj_init_constdata( val ) expands to a constdata_t initializer that
// stores val cast to each member type; the imaginary parts of the two
// complex members are set to zero.
// NOTE(review): val is not parenthesized after the casts, so an argument
// such as `a + b` would have only `a` cast. Pass a single literal or
// parenthesize the argument at the call site.

#define bli_obj_init_constdata( val ) \
{ \
    .s = ( float )val, \
    .d = ( double )val, \
    .c = { .real = ( float )val, .imag = 0.0f }, \
    .z = { .real = ( double )val, .imag = 0.0 }, \
    .i = ( gint_t )val, \
}


// -- Context type --

// Every member except `method` is a fixed-length table whose length is one
// of the BLIS_NUM_* constants, so changing any of those constants changes
// the size and layout of this struct.
typedef struct cntx_s
{
    blksz_t blkszs[ BLIS_NUM_BLKSZS ];
    bszid_t bmults[ BLIS_NUM_BLKSZS ];

    func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ];
    func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ];
    mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ];

    blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ];
    void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ];
    blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ];
    func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ];
    mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ];

    func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ];
    func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ];

    func_t packm_kers[ BLIS_NUM_PACKM_KERS ];
    func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ];

    ind_t method;

} cntx_t;


// -- Runtime type --

// NOTE: The order of these fields must be kept consistent with the definition
// of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h.

typedef struct rntm_s
{
    // "External" fields: these may be queried by the end-user.
    bool auto_factor;

    dim_t num_threads;
    dim_t thrloop[ BLIS_NUM_LOOPS ];
    bool pack_a; // enable/disable packing of left-hand matrix A.
    bool pack_b; // enable/disable packing of right-hand matrix B.
    bool l3_sup; // enable/disable small matrix handling in level-3 ops.

    // "Internal" fields: these should not be exposed to the end-user.

    // The small block pool, which is attached in the l3 thread decorator.
    pool_t* sba_pool;

    // The packing block allocator, which is attached in the l3 thread decorator.
    pba_t* pba;

} rntm_t;


// -- Error types --

// Error-checking level: two values, none (0) or full (1).
typedef enum
{
    BLIS_NO_ERROR_CHECKING = 0,
    BLIS_FULL_ERROR_CHECKING
} errlev_t;

// Error codes. Every enumerator, including BLIS_SUCCESS, has an explicit
// negative value, and each category occupies its own block of ten.
// BLIS_ERROR_CODE_MIN (-9) and BLIS_ERROR_CODE_MAX (-170) bracket the
// specific codes (-10 through -165) that lie between them.
typedef enum
{
    // Generic error codes
    BLIS_SUCCESS = ( -1),
    BLIS_FAILURE = ( -2),

    BLIS_ERROR_CODE_MIN = ( -9),

    // General errors
    BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10),
    BLIS_UNDEFINED_ERROR_CODE = ( -11),
    BLIS_NULL_POINTER = ( -12),
    BLIS_NOT_YET_IMPLEMENTED = ( -13),

    // Parameter-specific errors
    BLIS_INVALID_SIDE = ( -20),
    BLIS_INVALID_UPLO = ( -21),
    BLIS_INVALID_TRANS = ( -22),
    BLIS_INVALID_CONJ = ( -23),
    BLIS_INVALID_DIAG = ( -24),
    BLIS_INVALID_MACHVAL = ( -25),
    BLIS_EXPECTED_NONUNIT_DIAG = ( -26),

    // Datatype-specific errors
    BLIS_INVALID_DATATYPE = ( -30),
    BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31),
    BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32),
    BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33),
    BLIS_EXPECTED_REAL_DATATYPE = ( -34),
    BLIS_EXPECTED_INTEGER_DATATYPE = ( -35),
    BLIS_INCONSISTENT_DATATYPES = ( -36),
    BLIS_EXPECTED_REAL_PROJ_OF = ( -37),
    BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38),
    BLIS_INCONSISTENT_PRECISIONS = ( -39),

    // Dimension-specific errors
    BLIS_NONCONFORMAL_DIMENSIONS = ( -40),
    BLIS_EXPECTED_SCALAR_OBJECT = ( -41),
    BLIS_EXPECTED_VECTOR_OBJECT = ( -42),
    BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43),
    BLIS_EXPECTED_SQUARE_OBJECT = ( -44),
    BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45),
    BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46),
    BLIS_UNEXPECTED_VECTOR_DIM = ( -47),
    BLIS_UNEXPECTED_DIAG_OFFSET = ( -48),
    BLIS_NEGATIVE_DIMENSION = ( -49),

    // Stride-specific errors
    BLIS_INVALID_ROW_STRIDE = ( -50),
    BLIS_INVALID_COL_STRIDE = ( -51),
    BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52),

    // Structure-specific errors
    BLIS_EXPECTED_GENERAL_OBJECT = ( -60),
    BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61),
    BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62),
    BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63),

    // Storage-specific errors
    BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70),

    // Partitioning-specific errors
    BLIS_INVALID_3x1_SUBPART = ( -80),
    BLIS_INVALID_1x3_SUBPART = ( -81),
    BLIS_INVALID_3x3_SUBPART = ( -82),

    // Control tree-specific errors
    BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90),

    // Packing-specific errors
    BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100),

    // Buffer-specific errors
    BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110),

    // Memory errors
    BLIS_MALLOC_RETURNED_NULL = (-120),

    // Internal memory pool errors
    BLIS_INVALID_PACKBUF = (-130),
    BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131),
    BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132),
    BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133),
    BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134),

    // Object-related errors
    BLIS_EXPECTED_OBJECT_ALIAS = (-140),

    // Architecture-related errors
    BLIS_INVALID_ARCH_ID = (-150),
    BLIS_UNINITIALIZED_GKS_CNTX = (-151),

    // Blocksize-related errors
    BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160),
    BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161),
    BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162),
    BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163),
    BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164),
    BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165),

    BLIS_ERROR_CODE_MAX = (-170)
} err_t;

#endif
cython-blis-0.9.1/blis/_src/frame/include/bli_x86_asm_macros.h000066400000000000000000001334531427272030600242140ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2018, The University of Texas at Austin
   Copyright (C) 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#ifndef BLIS_X86_ASM_MACROS_H
#define BLIS_X86_ASM_MACROS_H

//
// Assembly macros to make inline x86 with AT&T syntax somewhat less painful
//
// "Private" macros end with _
//

// Default syntax is Intel
// (BLIS_ASM_SYNTAX_INTEL is defined here only when the including file has
// defined neither syntax macro itself.)
#if !defined(BLIS_ASM_SYNTAX_ATT) && !defined(BLIS_ASM_SYNTAX_INTEL)
#define BLIS_ASM_SYNTAX_INTEL
#endif

// STRINGIFY_ turns its whole argument list, commas included, into a single
// string literal.
#define STRINGIFY_(...) #__VA_ARGS__
// GET_MACRO_ expands to its fifth argument. MEM() and INSTR_() further down
// pass their own arguments followed by a list of candidate macro names, so
// the number of leading arguments decides which candidate lands in the
// fifth slot.
#define GET_MACRO_(_1_,_2_,_3_,_4_,NAME,...) NAME

#if (defined(_WIN32) && !defined(__clang__) && !defined(__MINGW32__)) || defined(__MIC__)

// Intel-style assembly blocks
// In this branch every macro expands to bare tokens placed inside an
// `__asm { ... }` block.

// NOTE(review): BEGIN_ASM is object-like here, but begin_asm() below (and
// the other branch) invoke it as BEGIN_ASM(); in this branch that leaves a
// stray `()` after `__asm {`. Confirm whether this branch is still built.
#define BEGIN_ASM __asm {
#define END_ASM(...) }

// Operand order: macros are written in the order of the syntax selected by
// BLIS_ASM_SYNTAX_*. When that is Intel (this back-end's own order) the
// operands are emitted as given; otherwise they are reversed.
#ifdef BLIS_ASM_SYNTAX_INTEL

#define INSTR_4_(name,_0,_1,_2,_3) name _0,_1,_2,_3
#define INSTR_3_(name,_0,_1,_2) name _0,_1,_2
#define INSTR_2_(name,_0,_1) name _0,_1
#define INSTR_1_(name,_0) name _0
#define INSTR_0_(name) name

#else

#define INSTR_4_(name,_0,_1,_2,_3) name _3,_2,_1,_0
#define INSTR_3_(name,_0,_1,_2) name _2,_1,_0
#define INSTR_2_(name,_0,_1) name _1,_0
#define INSTR_1_(name,_0) name _0
#define INSTR_0_(name) name

#endif

#define LABEL(label) label:
#define REGISTER_(r) r
#define IMM(x) x
#define VAR(x) x
#define MASK_(x) {x}
#define JMP_(insn, target) insn target

#define MEM_4_(reg,off,scale,disp) [reg + off*scale + disp]
#define MEM_3_(reg,off,scale) [reg + off*scale]
#define MEM_2_(reg,disp) [reg + disp]
#define MEM_1_(reg) [reg]

#define ALIGN4 align 4
#define ALIGN8 align 8
#define ALIGN16 align 16
#define ALIGN32 align 32

#else

// GCC extended assembly with AT&T syntax
// In this branch every instruction macro expands to a string literal ending
// in "\n\t"; adjacent literals concatenate into the template of a single
// `__asm__ volatile ( ... );` statement, and END_ASM's arguments are pasted
// after the template, before the closing parenthesis.

#define COMMENT_BEGIN "#"
#define COMMENT_END

#define BEGIN_ASM() __asm__ volatile (
#define END_ASM(...) __VA_ARGS__ );

// Operand order: emitted as given when the selected syntax is AT&T (this
// back-end's own order), reversed otherwise.
#ifdef BLIS_ASM_SYNTAX_ATT

#define INSTR_4_(name,_0,_1,_2,_3) STRINGIFY_(name) " " STRINGIFY_(_0,_1,_2,_3) "\n\t"
#define INSTR_3_(name,_0,_1,_2) STRINGIFY_(name) " " STRINGIFY_(_0,_1,_2) "\n\t"
#define INSTR_2_(name,_0,_1) STRINGIFY_(name) " " STRINGIFY_(_0,_1) "\n\t"
#define INSTR_1_(name,_0) STRINGIFY_(name) " " STRINGIFY_(_0) "\n\t"
#define INSTR_0_(name) STRINGIFY_(name) "\n\t"

#else

#define INSTR_4_(name,_0,_1,_2,_3) STRINGIFY_(name) " " STRINGIFY_(_3,_2,_1,_0) "\n\t"
#define INSTR_3_(name,_0,_1,_2) STRINGIFY_(name) " " STRINGIFY_(_2,_1,_0) "\n\t"
#define INSTR_2_(name,_0,_1) STRINGIFY_(name) " " STRINGIFY_(_1,_0) "\n\t"
#define INSTR_1_(name,_0) STRINGIFY_(name) " " STRINGIFY_(_0) "\n\t"
#define INSTR_0_(name) STRINGIFY_(name) "\n\t"

#endif

// Local labels get an "L" prefix when BLIS_OS_OSX is non-zero and ".L"
// otherwise. The trailing "%=" is the GCC extended-asm escape for a number
// unique to each asm statement, so the same label name can be reused in
// different asm blocks.
#if BLIS_OS_OSX

#define LABEL_(label) "L" STRINGIFY_(label) "%="

#else

#define LABEL_(label) ".L" STRINGIFY_(label) "%="

#endif

// Operand spellings for the extended-asm template: "%%" is a literal '%',
// "%[x]" names an asm operand, and "%{" / "%}" are literal braces.
#define REGISTER_(r) %%r
#define IMM(x) $##x
#define VAR(x) %[x]
#define MASK_(x) %{x%}
#define LABEL(target) LABEL_(target) ":\n\t"
#define JMP_(insn, target) STRINGIFY_(insn) " " LABEL_(target) "\n\t"

#define MEM_4_(reg,off,scale,disp) disp(reg,off,scale)
#define MEM_3_(reg,off,scale) (reg,off,scale)
#define MEM_2_(reg,disp) disp(reg)
#define MEM_1_(reg) (reg)

#define ALIGN4 ".p2align 2 \n\t"
#define ALIGN8 ".p2align 3 \n\t"
#define ALIGN16 ".p2align 4 \n\t"
#define ALIGN32 ".p2align 5 \n\t"

#endif

// Lower-case spellings that forward to the upper-case macros above.
#define begin_asm() BEGIN_ASM()
#define end_asm(...) END_ASM(__VA_ARGS__)

#define label(...) LABEL(__VA_ARGS__)
#define imm(...) IMM(__VA_ARGS__)
#define var(...) VAR(__VA_ARGS__)
#define align16 ALIGN16
#define align32 ALIGN32

// General-purpose registers
// Each register is defined once in upper case through REGISTER_() and then
// aliased in lower case, so both spellings go through the same back-end
// formatting.

// 8-bit registers
#define AL REGISTER_(al)
#define AH REGISTER_(ah)
#define BL REGISTER_(bl)
#define BH REGISTER_(bh)
#define CL REGISTER_(cl)
#define CH REGISTER_(ch)
#define DL REGISTER_(dl)
#define DH REGISTER_(dh)
#define R8B REGISTER_(r8b)
#define R9B REGISTER_(r9b)
#define R10B REGISTER_(r10b)
#define R11B REGISTER_(r11b)
#define R12B REGISTER_(r12b)
#define R13B REGISTER_(r13b)
#define R14B REGISTER_(r14b)
#define R15B REGISTER_(r15b)

#define al AL
#define ah AH
#define bl BL
#define bh BH
#define cl CL
#define ch CH
#define dl DL
#define dh DH
#define r8b R8B
#define r9b R9B
#define r10b R10B
#define r11b R11B
#define r12b R12B
#define r13b R13B
#define r14b R14B
#define r15b R15B

// 16-bit registers
#define AX REGISTER_(ax)
#define BX REGISTER_(bx)
#define CX REGISTER_(cx)
#define DX REGISTER_(dx)
#define SI REGISTER_(si)
#define DI REGISTER_(di)
#define BP REGISTER_(bp)
#define SP REGISTER_(sp)
#define R8W REGISTER_(r8w)
#define R9W REGISTER_(r9w)
#define R10W REGISTER_(r10w)
#define R11W REGISTER_(r11w)
#define R12W REGISTER_(r12w)
#define R13W REGISTER_(r13w)
#define R14W REGISTER_(r14w)
#define R15W REGISTER_(r15w)

#define ax AX
#define bx BX
#define cx CX
#define dx DX
#define si SI
#define di DI
#define bp BP
#define sp SP
#define r8w R8W
#define r9w R9W
#define r10w R10W
#define r11w R11W
#define r12w R12W
#define r13w R13W
#define r14w R14W
#define r15w R15W

// 32-bit registers
#define EAX REGISTER_(eax)
#define EBX REGISTER_(ebx)
#define ECX REGISTER_(ecx)
#define EDX REGISTER_(edx)
#define ESP REGISTER_(esp)
#define EBP REGISTER_(ebp)
#define EDI REGISTER_(edi)
#define ESI REGISTER_(esi)
#define R8D REGISTER_(r8d)
#define R9D REGISTER_(r9d)
#define R10D REGISTER_(r10d)
#define R11D REGISTER_(r11d)
#define R12D REGISTER_(r12d)
#define R13D REGISTER_(r13d)
#define R14D REGISTER_(r14d)
#define R15D REGISTER_(r15d)

#define eax EAX
#define ebx EBX
#define ecx ECX
#define edx EDX
#define esp ESP
#define ebp EBP
#define edi EDI #define esi ESI #define r8d R8D #define r9d R9D #define r10d R10D #define r11d R11D #define r12d R12D #define r13d R13D #define r14d R14D #define r15d R15D #define RAX REGISTER_(rax) #define RBX REGISTER_(rbx) #define RCX REGISTER_(rcx) #define RDX REGISTER_(rdx) #define RSP REGISTER_(rsp) #define RBP REGISTER_(rbp) #define RDI REGISTER_(rdi) #define RSI REGISTER_(rsi) #define R8 REGISTER_(r8) #define R9 REGISTER_(r9) #define R10 REGISTER_(r10) #define R11 REGISTER_(r11) #define R12 REGISTER_(r12) #define R13 REGISTER_(r13) #define R14 REGISTER_(r14) #define R15 REGISTER_(r15) #define rax RAX #define rbx RBX #define rcx RCX #define rdx RDX #define rsp RSP #define rbp RBP #define rdi RDI #define rsi RSI #define r8 R8 #define r9 R9 #define r10 R10 #define r11 R11 #define r12 R12 #define r13 R13 #define r14 R14 #define r15 R15 // Vector registers #define XMM(x) REGISTER_(Xmm##x) #define YMM(x) REGISTER_(Ymm##x) #define ZMM(x) REGISTER_(Zmm##x) #define K(x) REGISTER_(k##x) #define MASK_K(n) MASK_(K(n)) #define MASK_KZ(n) MASK_(K(n))MASK_(z) #define xmm(x) XMM(x) #define ymm(x) YMM(x) #define zmm(x) ZMM(x) #define k(x) K(x) #define mask_k(x) MASK_K(x) #define mask_kz(x) MASK_KZ(x) #define XMM0 XMM(0) #define XMM1 XMM(1) #define XMM2 XMM(2) #define XMM3 XMM(3) #define XMM4 XMM(4) #define XMM5 XMM(5) #define XMM6 XMM(6) #define XMM7 XMM(7) #define XMM8 XMM(8) #define XMM9 XMM(9) #define XMM10 XMM(10) #define XMM11 XMM(11) #define XMM12 XMM(12) #define XMM13 XMM(13) #define XMM14 XMM(14) #define XMM15 XMM(15) #define XMM16 XMM(16) #define XMM17 XMM(17) #define XMM18 XMM(18) #define XMM19 XMM(19) #define XMM20 XMM(20) #define XMM21 XMM(21) #define XMM22 XMM(22) #define XMM23 XMM(23) #define XMM24 XMM(24) #define XMM25 XMM(25) #define XMM26 XMM(26) #define XMM27 XMM(27) #define XMM28 XMM(28) #define XMM29 XMM(29) #define XMM30 XMM(30) #define XMM31 XMM(31) #define YMM0 YMM(0) #define YMM1 YMM(1) #define YMM2 YMM(2) #define YMM3 YMM(3) #define YMM4 YMM(4) 
// Fixed-index shorthands for the vector registers (continued): the rest of
// the upper-case YMM list, then ZMM0..ZMM31.
#define YMM5 YMM(5)
#define YMM6 YMM(6)
#define YMM7 YMM(7)
#define YMM8 YMM(8)
#define YMM9 YMM(9)
#define YMM10 YMM(10)
#define YMM11 YMM(11)
#define YMM12 YMM(12)
#define YMM13 YMM(13)
#define YMM14 YMM(14)
#define YMM15 YMM(15)
#define YMM16 YMM(16)
#define YMM17 YMM(17)
#define YMM18 YMM(18)
#define YMM19 YMM(19)
#define YMM20 YMM(20)
#define YMM21 YMM(21)
#define YMM22 YMM(22)
#define YMM23 YMM(23)
#define YMM24 YMM(24)
#define YMM25 YMM(25)
#define YMM26 YMM(26)
#define YMM27 YMM(27)
#define YMM28 YMM(28)
#define YMM29 YMM(29)
#define YMM30 YMM(30)
#define YMM31 YMM(31)

#define ZMM0 ZMM(0)
#define ZMM1 ZMM(1)
#define ZMM2 ZMM(2)
#define ZMM3 ZMM(3)
#define ZMM4 ZMM(4)
#define ZMM5 ZMM(5)
#define ZMM6 ZMM(6)
#define ZMM7 ZMM(7)
#define ZMM8 ZMM(8)
#define ZMM9 ZMM(9)
#define ZMM10 ZMM(10)
#define ZMM11 ZMM(11)
#define ZMM12 ZMM(12)
#define ZMM13 ZMM(13)
#define ZMM14 ZMM(14)
#define ZMM15 ZMM(15)
#define ZMM16 ZMM(16)
#define ZMM17 ZMM(17)
#define ZMM18 ZMM(18)
#define ZMM19 ZMM(19)
#define ZMM20 ZMM(20)
#define ZMM21 ZMM(21)
#define ZMM22 ZMM(22)
#define ZMM23 ZMM(23)
#define ZMM24 ZMM(24)
#define ZMM25 ZMM(25)
#define ZMM26 ZMM(26)
#define ZMM27 ZMM(27)
#define ZMM28 ZMM(28)
#define ZMM29 ZMM(29)
#define ZMM30 ZMM(30)
#define ZMM31 ZMM(31)

// Lower-case fixed-index shorthands; each forwards to the lower-case
// constructor macro (xmm(n), ymm(n), zmm(n)).
#define xmm0 xmm(0)
#define xmm1 xmm(1)
#define xmm2 xmm(2)
#define xmm3 xmm(3)
#define xmm4 xmm(4)
#define xmm5 xmm(5)
#define xmm6 xmm(6)
#define xmm7 xmm(7)
#define xmm8 xmm(8)
#define xmm9 xmm(9)
#define xmm10 xmm(10)
#define xmm11 xmm(11)
#define xmm12 xmm(12)
#define xmm13 xmm(13)
#define xmm14 xmm(14)
#define xmm15 xmm(15)
#define xmm16 xmm(16)
#define xmm17 xmm(17)
#define xmm18 xmm(18)
#define xmm19 xmm(19)
#define xmm20 xmm(20)
#define xmm21 xmm(21)
#define xmm22 xmm(22)
#define xmm23 xmm(23)
#define xmm24 xmm(24)
#define xmm25 xmm(25)
#define xmm26 xmm(26)
#define xmm27 xmm(27)
#define xmm28 xmm(28)
#define xmm29 xmm(29)
#define xmm30 xmm(30)
#define xmm31 xmm(31)

#define ymm0 ymm(0)
#define ymm1 ymm(1)
#define ymm2 ymm(2)
#define ymm3 ymm(3)
#define ymm4 ymm(4)
#define ymm5 ymm(5)
#define ymm6 ymm(6)
#define ymm7 ymm(7)
#define ymm8 ymm(8)
#define ymm9 ymm(9)
#define ymm10 ymm(10)
#define ymm11 ymm(11)
#define ymm12 ymm(12)
#define ymm13 ymm(13)
#define ymm14 ymm(14)
#define ymm15 ymm(15)
#define ymm16 ymm(16)
#define ymm17 ymm(17)
#define ymm18 ymm(18)
#define ymm19 ymm(19)
#define ymm20 ymm(20)
#define ymm21 ymm(21)
#define ymm22 ymm(22)
#define ymm23 ymm(23)
#define ymm24 ymm(24)
#define ymm25 ymm(25)
#define ymm26 ymm(26)
#define ymm27 ymm(27)
#define ymm28 ymm(28)
#define ymm29 ymm(29)
#define ymm30 ymm(30)
#define ymm31 ymm(31)

#define zmm0 zmm(0)
#define zmm1 zmm(1)
#define zmm2 zmm(2)
#define zmm3 zmm(3)
#define zmm4 zmm(4)
#define zmm5 zmm(5)
#define zmm6 zmm(6)
#define zmm7 zmm(7)
#define zmm8 zmm(8)
#define zmm9 zmm(9)
#define zmm10 zmm(10)
#define zmm11 zmm(11)
#define zmm12 zmm(12)
#define zmm13 zmm(13)
#define zmm14 zmm(14)
#define zmm15 zmm(15)
#define zmm16 zmm(16)
#define zmm17 zmm(17)
#define zmm18 zmm(18)
#define zmm19 zmm(19)
#define zmm20 zmm(20)
#define zmm21 zmm(21)
#define zmm22 zmm(22)
#define zmm23 zmm(23)
#define zmm24 zmm(24)
#define zmm25 zmm(25)
#define zmm26 zmm(26)
#define zmm27 zmm(27)
#define zmm28 zmm(28)
#define zmm29 zmm(29)
#define zmm30 zmm(30)
#define zmm31 zmm(31)

// Memory access

// MEM(rax)            ->     (%rax)        or [rax]
// MEM(rax,0x80)       -> 0x80(%rax)        or [rax + 0x80]
// MEM(rax,rsi,4)      ->     (%rax,%rsi,4) or [rax + rsi*4]
// MEM(rax,rsi,4,0x80) -> 0x80(%rax,%rsi,4) or [rax + rsi*4 + 0x80]
//
// MEM() selects MEM_1_..MEM_4_ by argument count: its arguments are
// followed by the four candidate names, and GET_MACRO_ returns whichever
// name ends up in the fifth position.

#define MEM(...) GET_MACRO_(__VA_ARGS__,MEM_4_,MEM_3_,MEM_2_,MEM_1_)(__VA_ARGS__)
// Memory operand followed by a decorated 1to8 / 1to16 / b suffix.
#define MEM_1TO8(...) MEM(__VA_ARGS__) MASK_(1to8)
#define MEM_1TO16(...) MEM(__VA_ARGS__) MASK_(1to16)
#define MEM_BCAST(...) MEM(__VA_ARGS__) MASK_(b)

#define mem(...) MEM(__VA_ARGS__)
#define mem_1to8(...) MEM_1TO8(__VA_ARGS__)
#define mem_1to16(...) MEM_1TO16(__VA_ARGS__)
#define mem_bcast(...) MEM_BCAST(__VA_ARGS__)

// The same three suffixes applied to a VAR() operand instead of a MEM()
// operand.
#define VAR_1TO8(...) VAR(__VA_ARGS__) MASK_(1to8)
#define VAR_1TO16(...) VAR(__VA_ARGS__) MASK_(1to16)
#define VAR_BCAST(...) VAR(__VA_ARGS__) MASK_(b)

#define var_1to8(...) VAR_1TO8(__VA_ARGS__)
#define var_1to16(...) VAR_1TO16(__VA_ARGS__)
#define var_bcast(...) VAR_BCAST(__VA_ARGS__)

// Instructions

// INSTR_(name, operands...) dispatches to INSTR_1_..INSTR_4_ according to
// the number of operands, using the same fifth-argument trick as MEM().
// NOTE(review): an empty operand list still occupies one argument slot, so
// the selection appears to yield INSTR_1_ rather than INSTR_0_ in that
// case -- confirm before relying on a zero-operand form.
#define INSTR_(name,...) GET_MACRO_(__VA_ARGS__,INSTR_4_,INSTR_3_,INSTR_2_, \
                                    INSTR_1_,INSTR_0_)(name,__VA_ARGS__)

// Jumps
// Each jump takes a label name and goes through JMP_(), not INSTR_().
// Mnemonics that are defined in terms of another macro (e.g. JB and JNAE
// via JC) emit that macro's mnemonic.

#define JC(_0) JMP_(jc, _0)
#define JB(_0) JC(_0)
#define JNAE(_0) JC(_0)
#define JNC(_0) JMP_(jnc, _0)
#define JNB(_0) JNC(_0)
#define JAE(_0) JNC(_0)

#define jc(_0) JC(_0)
#define jb(_0) JB(_0)
#define jnae(_0) JNAE(_0)
#define jnc(_0) JNC(_0)
#define jnb(_0) JNB(_0)
#define jae(_0) JAE(_0)

#define JO(_0) JMP_(jo, _0)
#define JNO(_0) JMP_(jno, _0)

#define jo(_0) JO(_0)
#define jno(_0) JNO(_0)

#define JP(_0) JMP_(jp, _0)
#define JPE(_0) JP(_0)
#define JNP(_0) JMP_(jnp, _0)
#define JPO(_0) JNP(_0)

#define jp(_0) JP(_0)
#define jpe(_0) JPE(_0)
#define jnp(_0) JNP(_0)
#define jpo(_0) JPO(_0)

#define JS(_0) JMP_(js, _0)
#define JNS(_0) JMP_(jns, _0)

#define js(_0) JS(_0)
#define jns(_0) JNS(_0)

#define JA(_0) JMP_(ja, _0)
#define JNBE(_0) JA(_0)
#define JNA(_0) JMP_(jna, _0)
#define JBE(_0) JNA(_0)

#define ja(_0) JA(_0)
#define jnbe(_0) JNBE(_0)
#define jna(_0) JNA(_0)
#define jbe(_0) JBE(_0)

#define JL(_0) JMP_(jl, _0)
#define JNGE(_0) JL(_0)
#define JNL(_0) JMP_(jnl, _0)
#define JGE(_0) JNL(_0)

#define jl(_0) JL(_0)
#define jnge(_0) JNGE(_0)
#define jnl(_0) JNL(_0)
#define jge(_0) JGE(_0)

#define JG(_0) JMP_(jg, _0)
#define JNLE(_0) JG(_0)
#define JNG(_0) JMP_(jng, _0)
#define JLE(_0) JNG(_0)

#define jg(_0) JG(_0)
#define jnle(_0) JNLE(_0)
#define jng(_0) JNG(_0)
#define jle(_0) JLE(_0)

#define JE(_0) JMP_(je, _0)
#define JZ(_0) JE(_0)
#define JNE(_0) JMP_(jne, _0)
#define JNZ(_0) JNE(_0)

#define je(_0) JE(_0)
#define jz(_0) JZ(_0)
#define jne(_0) JNE(_0)
#define jnz(_0) JNZ(_0)

#define JMP(_0) JMP_(jmp, _0)

#define jmp(_0) JMP(_0)

// SETE is an ordinary one-operand instruction, so it goes through INSTR_().
#define SETE(_0) INSTR_(sete, _0)
// SETZ is spelled via SETE, so it emits the `sete` mnemonic.
#define SETZ(_0) SETE(_0)

#define sete(_0) SETE(_0)
#define setz(_0) SETZ(_0)

// Comparisons
// From here on, each instruction has an upper-case macro that calls
// INSTR_() with the mnemonic and a lower-case macro that forwards to the
// upper-case one.

#define CMP(_0, _1) INSTR_(cmp, _0, _1)
#define TEST(_0, _1) INSTR_(test, _0, _1)

#define cmp(_0, _1) CMP(_0, _1)
#define test(_0, _1) TEST(_0, _1)

// Integer math

#define AND(_0, _1) INSTR_(and, _0, _1)
#define OR(_0, _1) INSTR_(or, _0, _1)
#define XOR(_0, _1) INSTR_(xor, _0, _1)
#define ADD(_0, _1) INSTR_(add, _0, _1)
#define SUB(_0, _1) INSTR_(sub, _0, _1)
#define IMUL(_0, _1) INSTR_(imul, _0, _1)
// SAL and SAR are variadic, so the number of operands is chosen at the
// call site and INSTR_() dispatches on it.
#define SAL(...) INSTR_(sal, __VA_ARGS__)
#define SAR(...) INSTR_(sar, __VA_ARGS__)
#define SHLX(_0, _1, _2) INSTR_(shlx, _0, _1, _2)
#define SHRX(_0, _1, _2) INSTR_(shrx, _0, _1, _2)
#define RORX(_0, _1, _2) INSTR_(rorx, _0, _1, _2)
#define DEC(_0) INSTR_(dec, _0)
#define INC(_0) INSTR_(inc, _0)

#define and(_0, _1) AND(_0, _1)
#define or(_0, _1) OR(_0, _1)
#define xor(_0, _1) XOR(_0, _1)
#define add(_0, _1) ADD(_0, _1)
#define sub(_0, _1) SUB(_0, _1)
#define imul(_0, _1) IMUL(_0, _1)
#define sal(...) SAL(__VA_ARGS__)
#define sar(...) SAR(__VA_ARGS__)
#define shlx(_0, _1, _2) SHLX(_0, _1, _2)
#define shrx(_0, _1, _2) SHRX(_0, _1, _2)
#define rorx(_0, _1, _2) RORX(_0, _1, _2)
#define dec(_0) DEC(_0)
#define inc(_0) INC(_0)

// Memory access
// Scalar loads/stores and the conditional-move family. Unlike the jump
// macros, every CMOVcc macro passes its own mnemonic to INSTR_() rather
// than forwarding to another CMOVcc macro.

#define LEA(_0, _1) INSTR_(lea, _0, _1)
#define MOV(_0, _1) INSTR_(mov, _0, _1)
#define MOVD(_0, _1) INSTR_(movd, _0, _1)
#define MOVL(_0, _1) INSTR_(movl, _0, _1)
#define MOVQ(_0, _1) INSTR_(movq, _0, _1)
#define CMOVA(_0, _1) INSTR_(cmova, _0, _1)
#define CMOVAE(_0, _1) INSTR_(cmovae, _0, _1)
#define CMOVB(_0, _1) INSTR_(cmovb, _0, _1)
#define CMOVBE(_0, _1) INSTR_(cmovbe, _0, _1)
#define CMOVC(_0, _1) INSTR_(cmovc, _0, _1)
#define CMOVP(_0, _1) INSTR_(cmovp, _0, _1)
#define CMOVO(_0, _1) INSTR_(cmovo, _0, _1)
#define CMOVS(_0, _1) INSTR_(cmovs, _0, _1)
#define CMOVE(_0, _1) INSTR_(cmove, _0, _1)
#define CMOVZ(_0, _1) INSTR_(cmovz, _0, _1)
#define CMOVG(_0, _1) INSTR_(cmovg, _0, _1)
#define CMOVGE(_0, _1) INSTR_(cmovge, _0, _1)
#define CMOVL(_0, _1) INSTR_(cmovl, _0, _1)
#define CMOVLE(_0, _1) INSTR_(cmovle, _0, _1)
#define CMOVNA(_0, _1) INSTR_(cmovna, _0, _1)
#define CMOVNAE(_0, _1) INSTR_(cmovnae, _0, _1)
#define CMOVNB(_0, _1) INSTR_(cmovnb, _0, _1)
#define CMOVNBE(_0, _1) INSTR_(cmovnbe, _0, _1)
#define CMOVNC(_0, _1) INSTR_(cmovnc, _0, _1)
#define CMOVNP(_0, _1) INSTR_(cmovnp, _0, _1)
#define CMOVNO(_0, _1) INSTR_(cmovno, _0, _1)
#define CMOVNS(_0, _1) INSTR_(cmovns, _0, _1)
#define CMOVNE(_0, _1) INSTR_(cmovne, _0, _1)
#define CMOVNZ(_0, _1) INSTR_(cmovnz, _0, _1)
#define CMOVNG(_0, _1) INSTR_(cmovng, _0, _1)
#define CMOVNGE(_0, _1) INSTR_(cmovnge, _0, _1)
#define CMOVNL(_0, _1) INSTR_(cmovnl, _0, _1)
#define CMOVNLE(_0, _1) INSTR_(cmovnle, _0, _1)

#define lea(_0, _1) LEA(_0, _1)
#define mov(_0, _1) MOV(_0, _1)
#define movd(_0, _1) MOVD(_0, _1)
#define movl(_0, _1) MOVL(_0, _1)
#define movq(_0, _1) MOVQ(_0, _1)
#define cmova(_0, _1) CMOVA(_0, _1)
#define cmovae(_0, _1) CMOVAE(_0, _1)
#define cmovb(_0, _1) CMOVB(_0, _1)
#define cmovbe(_0, _1) CMOVBE(_0, _1)
#define cmovc(_0, _1) CMOVC(_0, _1)
#define cmovp(_0, _1) CMOVP(_0, _1)
#define cmovo(_0, _1) CMOVO(_0, _1)
#define cmovs(_0, _1) CMOVS(_0, _1)
#define cmove(_0, _1) CMOVE(_0, _1)
#define cmovz(_0, _1) CMOVZ(_0, _1)
#define cmovg(_0, _1) CMOVG(_0, _1)
#define cmovge(_0, _1) CMOVGE(_0, _1)
#define cmovl(_0, _1) CMOVL(_0, _1)
#define cmovle(_0, _1) CMOVLE(_0, _1)
#define cmovna(_0, _1) CMOVNA(_0, _1)
#define cmovnae(_0, _1) CMOVNAE(_0, _1)
#define cmovnb(_0, _1) CMOVNB(_0, _1)
#define cmovnbe(_0, _1) CMOVNBE(_0, _1)
#define cmovnc(_0, _1) CMOVNC(_0, _1)
#define cmovnp(_0, _1) CMOVNP(_0, _1)
#define cmovno(_0, _1) CMOVNO(_0, _1)
#define cmovns(_0, _1) CMOVNS(_0, _1)
#define cmovne(_0, _1) CMOVNE(_0, _1)
#define cmovnz(_0, _1) CMOVNZ(_0, _1)
#define cmovng(_0, _1) CMOVNG(_0, _1)
#define cmovnge(_0, _1) CMOVNGE(_0, _1)
#define cmovnl(_0, _1) CMOVNL(_0, _1)
#define cmovnle(_0, _1) CMOVNLE(_0, _1)

// Vector moves
// NOTE: MOVAPD deliberately emits `movaps`, not `movapd` (see the trailing
// comment on its definition).

#define MOVSS(_0, _1) INSTR_(movss, _0, _1)
#define MOVSD(_0, _1) INSTR_(movsd, _0, _1)
#define MOVAPS(_0, _1) INSTR_(movaps, _0, _1)
#define MOVAPD(_0, _1) INSTR_(movaps, _0, _1) //use movaps because it is shorter
#define MOVDDUP(_0, _1) INSTR_(movddup, _0, _1)
#define MOVLPS(_0, _1) INSTR_(movlps, _0, _1)
#define MOVHPS(_0, _1) INSTR_(movhps, _0, _1)
#define MOVLPD(_0, _1) INSTR_(movlpd, _0, _1)
#define MOVHPD(_0, _1) INSTR_(movhpd, _0, _1)

#define movss(_0, _1) MOVSS(_0, _1)
#define movsd(_0, _1) MOVSD(_0, _1)
#define movaps(_0, _1) MOVAPS(_0, _1)
#define movapd(_0, _1) MOVAPD(_0, _1)
#define movddup(_0, _1) MOVDDUP(_0, _1)
#define movlps(_0, _1) MOVLPS(_0, _1)
#define movhps(_0, _1) MOVHPS(_0, _1)
#define movlpd(_0, _1) MOVLPD(_0, _1)
#define movhpd(_0, _1) MOVHPD(_0, _1)

// V-prefixed vector moves; this list continues past this point.
#define VMOVDDUP(_0, _1) INSTR_(vmovddup, _0, _1)
#define VMOVSLDUP(_0, _1) INSTR_(vmovsldup, _0, _1)
#define VMOVSHDUP(_0, _1) INSTR_(vmovshdup, _0, _1)
#define VMOVD(_0, _1) INSTR_(vmovd, _0, _1)
#define VMOVQ(_0, _1) INSTR_(vmovq, _0, _1)
#define VMOVSS(_0, _1) INSTR_(vmovss, _0, _1)
// V-prefixed vector moves, broadcasts and gathers/scatters (continued).
//
// Convention used throughout this section: every UPPER-CASE macro forwards
// its operands, together with the instruction mnemonic, to INSTR_ (defined
// outside this section); every lower-case macro is a plain alias that
// forwards to the UPPER-CASE macro of the same name.
#define VMOVSD(_0, _1) INSTR_(vmovsd, _0, _1)
#define VMOVAPS(_0, _1) INSTR_(vmovaps, _0, _1)
#define VMOVUPS(_0, _1) INSTR_(vmovups, _0, _1)
#define VMOVAPD(_0, _1) INSTR_(vmovapd, _0, _1)
#define VMOVUPD(_0, _1) INSTR_(vmovupd, _0, _1)
// Variadic: all operands are passed through to INSTR_ unchanged.
#define VMOVLPS(...) INSTR_(vmovlps, __VA_ARGS__)
#define VMOVHPS(...) INSTR_(vmovhps, __VA_ARGS__)
#define VMOVLPD(...) INSTR_(vmovlpd, __VA_ARGS__)
#define VMOVHPD(...) INSTR_(vmovhpd, __VA_ARGS__)
#define VMOVDQA(_0, _1) INSTR_(vmovdqa, _0, _1)
#define VMOVDQA32(_0, _1) INSTR_(vmovdqa32, _0, _1)
#define VMOVDQA64(_0, _1) INSTR_(vmovdqa64, _0, _1)
#define VBROADCASTSS(_0, _1) INSTR_(vbroadcastss, _0, _1)
#define VBROADCASTSD(_0, _1) INSTR_(vbroadcastsd, _0, _1)
#define VPBROADCASTD(_0, _1) INSTR_(vpbroadcastd, _0, _1)
#define VPBROADCASTQ(_0, _1) INSTR_(vpbroadcastq, _0, _1)
#define VBROADCASTF128(_0, _1) INSTR_(vbroadcastf128, _0, _1)
#define VBROADCASTF64X4(_0, _1) INSTR_(vbroadcastf64x4, _0, _1)
// Gathers are variadic; scatters take exactly two operands.
#define VGATHERDPS(...) INSTR_(vgatherdps, __VA_ARGS__)
#define VSCATTERDPS(_0, _1) INSTR_(vscatterdps, _0, _1)
#define VGATHERDPD(...) INSTR_(vgatherdpd, __VA_ARGS__)
#define VSCATTERDPD(_0, _1) INSTR_(vscatterdpd, _0, _1)
#define VGATHERQPS(...) INSTR_(vgatherqps, __VA_ARGS__)
#define VSCATTERQPS(_0, _1) INSTR_(vscatterqps, _0, _1)
#define VGATHERQPD(...) INSTR_(vgatherqpd, __VA_ARGS__)
#define VSCATTERQPD(_0, _1) INSTR_(vscatterqpd, _0, _1)

// Lower-case aliases for the V-prefixed moves, broadcasts and
// gathers/scatters.
#define vmovddup(_0, _1) VMOVDDUP(_0, _1)
#define vmovsldup(_0, _1) VMOVSLDUP(_0, _1)
#define vmovshdup(_0, _1) VMOVSHDUP(_0, _1)
#define vmovd(_0, _1) VMOVD(_0, _1)
#define vmovq(_0, _1) VMOVQ(_0, _1)
#define vmovss(_0, _1) VMOVSS(_0, _1)
#define vmovsd(_0, _1) VMOVSD(_0, _1)
#define vmovaps(_0, _1) VMOVAPS(_0, _1)
#define vmovups(_0, _1) VMOVUPS(_0, _1)
#define vmovapd(_0, _1) VMOVAPD(_0, _1)
#define vmovupd(_0, _1) VMOVUPD(_0, _1)
#define vmovlps(...) VMOVLPS(__VA_ARGS__)
#define vmovhps(...) VMOVHPS(__VA_ARGS__)
#define vmovlpd(...) VMOVLPD(__VA_ARGS__)
#define vmovhpd(...) VMOVHPD(__VA_ARGS__)
#define vmovdqa(_0, _1) VMOVDQA(_0, _1)
#define vmovdqa32(_0, _1) VMOVDQA32(_0, _1)
#define vmovdqa64(_0, _1) VMOVDQA64(_0, _1)
#define vbroadcastss(_0, _1) VBROADCASTSS(_0, _1)
#define vbroadcastsd(_0, _1) VBROADCASTSD(_0, _1)
#define vpbroadcastd(_0, _1) VPBROADCASTD(_0, _1)
#define vpbroadcastq(_0, _1) VPBROADCASTQ(_0, _1)
#define vbroadcastf128(_0, _1) VBROADCASTF128(_0, _1)
#define vbroadcastf64x4(_0, _1) VBROADCASTF64X4(_0, _1)
#define vgatherdps(...) VGATHERDPS(__VA_ARGS__)
#define vscatterdps(_0, _1) VSCATTERDPS(_0, _1)
#define vgatherdpd(...) VGATHERDPD(__VA_ARGS__)
#define vscatterdpd(_0, _1) VSCATTERDPD(_0, _1)
#define vgatherqps(...) VGATHERQPS(__VA_ARGS__)
#define vscatterqps(_0, _1) VSCATTERQPS(_0, _1)
#define vgatherqpd(...) VGATHERQPD(__VA_ARGS__)
#define vscatterqpd(_0, _1) VSCATTERQPD(_0, _1)

// Vector comparisons

#define VPCMPEQB(_0, _1, _2) INSTR_(vpcmpeqb, _0, _1, _2)
#define VPCMPEQW(_0, _1, _2) INSTR_(vpcmpeqw, _0, _1, _2)
#define VPCMPEQD(_0, _1, _2) INSTR_(vpcmpeqd, _0, _1, _2)

#define vpcmpeqb(_0, _1, _2) VPCMPEQB(_0, _1, _2)
#define vpcmpeqw(_0, _1, _2) VPCMPEQW(_0, _1, _2)
#define vpcmpeqd(_0, _1, _2) VPCMPEQD(_0, _1, _2)

// Vector integer math

#define VPADDB(_0, _1, _2) INSTR_(vpaddb, _0, _1, _2)
#define VPADDW(_0, _1, _2) INSTR_(vpaddw, _0, _1, _2)
#define VPADDD(_0, _1, _2) INSTR_(vpaddd, _0, _1, _2)
#define VPADDQ(_0, _1, _2) INSTR_(vpaddq, _0, _1, _2)

#define vpaddb(_0, _1, _2) VPADDB(_0, _1, _2)
#define vpaddw(_0, _1, _2) VPADDW(_0, _1, _2)
#define vpaddd(_0, _1, _2) VPADDD(_0, _1, _2)
#define vpaddq(_0, _1, _2) VPADDQ(_0, _1, _2)

// Vector math

// Two-operand (non-V-prefixed) forms.
#define ADDPS(_0, _1) INSTR_(addps, _0, _1)
#define ADDPD(_0, _1) INSTR_(addpd, _0, _1)
#define SUBPS(_0, _1) INSTR_(subps, _0, _1)
#define SUBPD(_0, _1) INSTR_(subpd, _0, _1)
#define MULPS(_0, _1) INSTR_(mulps, _0, _1)
#define MULPD(_0, _1) INSTR_(mulpd, _0, _1)
#define DIVPS(_0, _1) INSTR_(divps, _0, _1)
#define DIVPD(_0, _1) INSTR_(divpd, _0, _1)
#define XORPS(_0, _1) INSTR_(xorps, _0, _1)
#define XORPD(_0, _1) INSTR_(xorpd, _0, _1)
#define UCOMISS(_0, _1) INSTR_(ucomiss, _0, _1)
#define UCOMISD(_0, _1) INSTR_(ucomisd, _0, _1)
#define COMISS(_0, _1) INSTR_(comiss, _0, _1)
#define COMISD(_0, _1) INSTR_(comisd, _0, _1)

#define addps(_0, _1) ADDPS(_0, _1)
#define addpd(_0, _1) ADDPD(_0, _1)
#define subps(_0, _1) SUBPS(_0, _1)
#define subpd(_0, _1) SUBPD(_0, _1)
#define mulps(_0, _1) MULPS(_0, _1)
#define mulpd(_0, _1) MULPD(_0, _1)
#define divps(_0, _1) DIVPS(_0, _1)
#define divpd(_0, _1) DIVPD(_0, _1)
#define xorps(_0, _1) XORPS(_0, _1)
#define xorpd(_0, _1) XORPD(_0, _1)
#define ucomiss(_0, _1) UCOMISS(_0, _1)
#define ucomisd(_0, _1) UCOMISD(_0, _1)
#define comiss(_0, _1) COMISS(_0, _1)
// Misspelled name under which the comiss alias was previously exposed;
// retained so that any existing user of it keeps compiling.
#define cmoiss(_0, _1) COMISS(_0, _1)
#define comisd(_0, _1) COMISD(_0, _1)

// Three-operand V-prefixed arithmetic.
#define VADDSUBPS(_0, _1, _2) INSTR_(vaddsubps, _0, _1, _2)
#define VADDSUBPD(_0, _1, _2) INSTR_(vaddsubpd, _0, _1, _2)
#define VHADDPD(_0, _1, _2) INSTR_(vhaddpd, _0, _1, _2)
#define VHADDPS(_0, _1, _2) INSTR_(vhaddps, _0, _1, _2)
#define VHSUBPD(_0, _1, _2) INSTR_(vhsubpd, _0, _1, _2)
#define VHSUBPS(_0, _1, _2) INSTR_(vhsubps, _0, _1, _2)
#define VADDPS(_0, _1, _2) INSTR_(vaddps, _0, _1, _2)
#define VADDPD(_0, _1, _2) INSTR_(vaddpd, _0, _1, _2)
#define VSUBPS(_0, _1, _2) INSTR_(vsubps, _0, _1, _2)
#define VSUBPD(_0, _1, _2) INSTR_(vsubpd, _0, _1, _2)
#define VMULSS(_0, _1, _2) INSTR_(vmulss, _0, _1, _2)
#define VMULSD(_0, _1, _2) INSTR_(vmulsd, _0, _1, _2)
#define VMULPS(_0, _1, _2) INSTR_(vmulps, _0, _1, _2)
#define VMULPD(_0, _1, _2) INSTR_(vmulpd, _0, _1, _2)
#define VDIVSS(_0, _1, _2) INSTR_(vdivss, _0, _1, _2)
#define VDIVSD(_0, _1, _2) INSTR_(vdivsd, _0, _1, _2)
#define VDIVPS(_0, _1, _2) INSTR_(vdivps, _0, _1, _2)
#define VDIVPD(_0, _1, _2) INSTR_(vdivpd, _0, _1, _2)
#define VPMULLD(_0, _1, _2) INSTR_(vpmulld, _0, _1, _2)
#define VPMULLQ(_0, _1, _2) INSTR_(vpmullq, _0, _1, _2)
// VPADDD is already defined with the vector integer math macros above.
#define VPSLLD(_0, _1, _2) INSTR_(vpslld, _0, _1, _2)
#define VXORPS(_0, _1, _2) INSTR_(vxorps, _0, _1, _2)
#define VXORPD(_0, _1, _2) INSTR_(vxorpd, _0, _1, _2)
#define VPXORD(_0, _1, _2) INSTR_(vpxord, _0, _1, _2)
#define VUCOMISS(_0, _1) INSTR_(vucomiss, _0, _1)
#define VUCOMISD(_0, _1) INSTR_(vucomisd, _0, _1)
#define VCOMISS(_0, _1) INSTR_(vcomiss, _0, _1)
#define VCOMISD(_0, _1) INSTR_(vcomisd, _0, _1)

// Three-operand fused multiply-add family. The 132/213/231 infix is part of
// the mnemonic and is passed through verbatim.
#define VFMADD132SS(_0, _1, _2) INSTR_(vfmadd132ss, _0, _1, _2)
#define VFMADD213SS(_0, _1, _2) INSTR_(vfmadd213ss, _0, _1, _2)
#define VFMADD231SS(_0, _1, _2) INSTR_(vfmadd231ss, _0, _1, _2)
#define VFMADD132SD(_0, _1, _2) INSTR_(vfmadd132sd, _0, _1, _2)
#define VFMADD213SD(_0, _1, _2) INSTR_(vfmadd213sd, _0, _1, _2)
#define VFMADD231SD(_0, _1, _2) INSTR_(vfmadd231sd, _0, _1, _2)
#define VFMADD132PS(_0, _1, _2) INSTR_(vfmadd132ps, _0, _1, _2)
#define VFMADD213PS(_0, _1, _2) INSTR_(vfmadd213ps, _0, _1, _2)
#define VFMADD231PS(_0, _1, _2) INSTR_(vfmadd231ps, _0, _1, _2)
#define VFMADD132PD(_0, _1, _2) INSTR_(vfmadd132pd, _0, _1, _2)
#define VFMADD213PD(_0, _1, _2) INSTR_(vfmadd213pd, _0, _1, _2)
#define VFMADD231PD(_0, _1, _2) INSTR_(vfmadd231pd, _0, _1, _2)
#define VFMSUB132SS(_0, _1, _2) INSTR_(vfmsub132ss, _0, _1, _2)
#define VFMSUB213SS(_0, _1, _2) INSTR_(vfmsub213ss, _0, _1, _2)
#define VFMSUB231SS(_0, _1, _2) INSTR_(vfmsub231ss, _0, _1, _2)
#define VFMSUB132SD(_0, _1, _2) INSTR_(vfmsub132sd, _0, _1, _2)
#define VFMSUB213SD(_0, _1, _2) INSTR_(vfmsub213sd, _0, _1, _2)
#define VFMSUB231SD(_0, _1, _2) INSTR_(vfmsub231sd, _0, _1, _2)
#define VFMSUB132PS(_0, _1, _2) INSTR_(vfmsub132ps, _0, _1, _2)
#define VFMSUB213PS(_0, _1, _2) INSTR_(vfmsub213ps, _0, _1, _2)
#define VFMSUB231PS(_0, _1, _2) INSTR_(vfmsub231ps, _0, _1, _2)
#define VFMSUB132PD(_0, _1, _2) INSTR_(vfmsub132pd, _0, _1, _2)
#define VFMSUB213PD(_0, _1, _2) INSTR_(vfmsub213pd, _0, _1, _2)
#define VFMSUB231PD(_0, _1, _2) INSTR_(vfmsub231pd, _0, _1, _2)
#define VFNMADD132SS(_0, _1, _2) INSTR_(vfnmadd132ss, _0, _1, _2)
#define VFNMADD213SS(_0, _1, _2) INSTR_(vfnmadd213ss, _0, _1, _2)
#define VFNMADD231SS(_0, _1, _2) INSTR_(vfnmadd231ss, _0, _1, _2)
#define VFNMADD132SD(_0, _1, _2) INSTR_(vfnmadd132sd, _0, _1, _2)
#define VFNMADD213SD(_0, _1, _2) INSTR_(vfnmadd213sd, _0, _1, _2)
#define VFNMADD231SD(_0, _1, _2) INSTR_(vfnmadd231sd, _0, _1, _2)
#define VFNMADD132PS(_0, _1, _2) INSTR_(vfnmadd132ps, _0, _1, _2)
#define VFNMADD213PS(_0, _1, _2) INSTR_(vfnmadd213ps, _0, _1, _2)
#define VFNMADD231PS(_0, _1, _2) INSTR_(vfnmadd231ps, _0, _1, _2)
#define VFNMADD132PD(_0, _1, _2) INSTR_(vfnmadd132pd, _0, _1, _2)
#define VFNMADD213PD(_0, _1, _2) INSTR_(vfnmadd213pd, _0, _1, _2)
#define VFNMADD231PD(_0, _1, _2) INSTR_(vfnmadd231pd, _0, _1, _2)
#define VFNMSUB132SS(_0, _1, _2) INSTR_(vfnmsub132ss, _0, _1, _2)
#define VFNMSUB213SS(_0, _1, _2) INSTR_(vfnmsub213ss, _0, _1, _2)
#define VFNMSUB231SS(_0, _1, _2) INSTR_(vfnmsub231ss, _0, _1, _2)
#define VFNMSUB132SD(_0, _1, _2) INSTR_(vfnmsub132sd, _0, _1, _2)
#define VFNMSUB213SD(_0, _1, _2) INSTR_(vfnmsub213sd, _0, _1, _2)
#define VFNMSUB231SD(_0, _1, _2) INSTR_(vfnmsub231sd, _0, _1, _2)
#define VFNMSUB132PS(_0, _1, _2) INSTR_(vfnmsub132ps, _0, _1, _2)
#define VFNMSUB213PS(_0, _1, _2) INSTR_(vfnmsub213ps, _0, _1, _2)
#define VFNMSUB231PS(_0, _1, _2) INSTR_(vfnmsub231ps, _0, _1, _2)
#define VFNMSUB132PD(_0, _1, _2) INSTR_(vfnmsub132pd, _0, _1, _2)
#define VFNMSUB213PD(_0, _1, _2) INSTR_(vfnmsub213pd, _0, _1, _2)
#define VFNMSUB231PD(_0, _1, _2) INSTR_(vfnmsub231pd, _0, _1, _2)
#define VFMADDSUB132SS(_0, _1, _2) INSTR_(vfmaddsub132ss, _0, _1, _2)
#define VFMADDSUB213SS(_0, _1, _2) INSTR_(vfmaddsub213ss, _0, _1, _2)
#define VFMADDSUB231SS(_0, _1, _2) INSTR_(vfmaddsub231ss, _0, _1, _2)
#define VFMADDSUB132SD(_0, _1, _2) INSTR_(vfmaddsub132sd, _0, _1, _2)
#define VFMADDSUB213SD(_0, _1, _2) INSTR_(vfmaddsub213sd, _0, _1, _2)
#define VFMADDSUB231SD(_0, _1, _2) INSTR_(vfmaddsub231sd, _0, _1, _2)
#define VFMADDSUB132PS(_0, _1, _2) INSTR_(vfmaddsub132ps, _0, _1, _2)
#define VFMADDSUB213PS(_0, _1, _2) INSTR_(vfmaddsub213ps, _0, _1, _2)
#define VFMADDSUB231PS(_0, _1, _2) INSTR_(vfmaddsub231ps, _0, _1, _2)
#define VFMADDSUB132PD(_0, _1, _2) INSTR_(vfmaddsub132pd, _0, _1, _2)
#define VFMADDSUB213PD(_0, _1, _2) INSTR_(vfmaddsub213pd, _0, _1, _2)
#define VFMADDSUB231PD(_0, _1, _2) INSTR_(vfmaddsub231pd, _0, _1, _2)
#define VFMSUBADD132SS(_0, _1, _2) INSTR_(vfmsubadd132ss, _0, _1, _2)
#define VFMSUBADD213SS(_0, _1, _2) INSTR_(vfmsubadd213ss, _0, _1, _2)
#define VFMSUBADD231SS(_0, _1, _2) INSTR_(vfmsubadd231ss, _0, _1, _2)
#define VFMSUBADD132SD(_0, _1, _2) INSTR_(vfmsubadd132sd, _0, _1, _2)
#define VFMSUBADD213SD(_0, _1, _2) INSTR_(vfmsubadd213sd, _0, _1, _2)
#define VFMSUBADD231SD(_0, _1, _2) INSTR_(vfmsubadd231sd, _0, _1, _2)
#define VFMSUBADD132PS(_0, _1, _2) INSTR_(vfmsubadd132ps, _0, _1, _2)
#define VFMSUBADD213PS(_0, _1, _2) INSTR_(vfmsubadd213ps, _0, _1, _2)
#define VFMSUBADD231PS(_0, _1, _2) INSTR_(vfmsubadd231ps, _0, _1, _2)
#define VFMSUBADD132PD(_0, _1, _2) INSTR_(vfmsubadd132pd, _0, _1, _2)
#define VFMSUBADD213PD(_0, _1, _2) INSTR_(vfmsubadd213pd, _0, _1, _2)
#define VFMSUBADD231PD(_0, _1, _2) INSTR_(vfmsubadd231pd, _0, _1, _2)

// Four-operand fused multiply-add family (no 132/213/231 infix).
#define VFMADDSS(_0, _1, _2, _3) INSTR_(vfmaddss, _0, _1, _2, _3)
#define VFMADDSD(_0, _1, _2, _3) INSTR_(vfmaddsd, _0, _1, _2, _3)
#define VFMADDPS(_0, _1, _2, _3) INSTR_(vfmaddps, _0, _1, _2, _3)
#define VFMADDPD(_0, _1, _2, _3) INSTR_(vfmaddpd, _0, _1, _2, _3)
#define VFMSUBSS(_0, _1, _2, _3) INSTR_(vfmsubss, _0, _1, _2, _3)
#define VFMSUBSD(_0, _1, _2, _3) INSTR_(vfmsubsd, _0, _1, _2, _3)
#define VFMSUBPS(_0, _1, _2, _3) INSTR_(vfmsubps, _0, _1, _2, _3)
#define VFMSUBPD(_0, _1, _2, _3) INSTR_(vfmsubpd, _0, _1, _2, _3)
#define VFNMADDSS(_0, _1, _2, _3) INSTR_(vfnmaddss, _0, _1, _2, _3)
#define VFNMADDSD(_0, _1, _2, _3) INSTR_(vfnmaddsd, _0, _1, _2, _3)
#define VFNMADDPS(_0, _1, _2, _3) INSTR_(vfnmaddps, _0, _1, _2, _3)
#define VFNMADDPD(_0, _1, _2, _3) INSTR_(vfnmaddpd, _0, _1, _2, _3)
#define VFNMSUBSS(_0, _1, _2, _3) INSTR_(vfnmsubss, _0, _1, _2, _3)
#define VFNMSUBSD(_0, _1, _2, _3) INSTR_(vfnmsubsd, _0, _1, _2, _3)
#define VFNMSUBPS(_0, _1, _2, _3) INSTR_(vfnmsubps, _0, _1, _2, _3)
#define VFNMSUBPD(_0, _1, _2, _3) INSTR_(vfnmsubpd, _0, _1, _2, _3)
#define VFMADDSUBSS(_0, _1, _2, _3) INSTR_(vfmaddsubss, _0, _1, _2, _3)
#define VFMADDSUBSD(_0, _1, _2, _3) INSTR_(vfmaddsubsd, _0, _1, _2, _3)
#define VFMADDSUBPS(_0, _1, _2, _3) INSTR_(vfmaddsubps, _0, _1, _2, _3)
#define VFMADDSUBPD(_0, _1, _2, _3) INSTR_(vfmaddsubpd, _0, _1, _2, _3)
#define VFMSUBADDSS(_0, _1, _2, _3) INSTR_(vfmsubaddss, _0, _1, _2, _3)
#define VFMSUBADDSD(_0, _1, _2, _3) INSTR_(vfmsubaddsd, _0, _1, _2, _3)
#define VFMSUBADDPS(_0, _1, _2, _3) INSTR_(vfmsubaddps, _0, _1, _2, _3)
#define VFMSUBADDPD(_0, _1, _2, _3) INSTR_(vfmsubaddpd, _0, _1, _2, _3)
#define V4FMADDSS(_0, _1, _2) INSTR_(v4fmaddss, _0, _1, _2)
#define V4FMADDPS(_0, _1, _2) INSTR_(v4fmaddps, _0, _1, _2)
#define V4FNMADDSS(_0, _1, _2) INSTR_(v4fnmaddss, _0, _1, _2)
#define V4FNMADDPS(_0, _1, _2) INSTR_(v4fnmaddps, _0, _1, _2)

// Lower-case aliases for the V-prefixed arithmetic macros.
#define vaddsubps(_0, _1, _2) VADDSUBPS(_0, _1, _2)
#define vaddsubpd(_0, _1, _2) VADDSUBPD(_0, _1, _2)
#define vhaddpd(_0, _1, _2) VHADDPD(_0, _1, _2)
#define vhaddps(_0, _1, _2) VHADDPS(_0, _1, _2)
#define vhsubpd(_0, _1, _2) VHSUBPD(_0, _1, _2)
#define vhsubps(_0, _1, _2) VHSUBPS(_0, _1, _2)
#define vaddps(_0, _1, _2) VADDPS(_0, _1, _2)
#define vaddpd(_0, _1, _2) VADDPD(_0, _1, _2)
#define vsubps(_0, _1, _2) VSUBPS(_0, _1, _2)
#define vsubpd(_0, _1, _2) VSUBPD(_0, _1, _2)
#define vmulss(_0, _1, _2) VMULSS(_0, _1, _2)
#define vmulps(_0, _1, _2) VMULPS(_0, _1, _2)
#define vmulsd(_0, _1, _2) VMULSD(_0, _1, _2)
#define vmulpd(_0, _1, _2) VMULPD(_0, _1, _2)
#define vdivss(_0, _1, _2) VDIVSS(_0, _1, _2)
#define vdivps(_0, _1, _2) VDIVPS(_0, _1, _2)
#define vdivsd(_0, _1, _2) VDIVSD(_0, _1, _2)
#define vdivpd(_0, _1, _2) VDIVPD(_0, _1, _2)
#define vpmulld(_0, _1, _2) VPMULLD(_0, _1, _2)
#define vpmullq(_0, _1, _2) VPMULLQ(_0, _1, _2)
// vpaddd is already defined with the vector integer math aliases above.
#define vpslld(_0, _1, _2) VPSLLD(_0, _1, _2)
#define vxorps(_0, _1, _2) VXORPS(_0, _1, _2)
#define vxorpd(_0, _1, _2) VXORPD(_0, _1, _2)
#define vpxord(_0, _1, _2) VPXORD(_0, _1, _2)
#define vucomiss(_0, _1) VUCOMISS(_0, _1)
#define vucomisd(_0, _1) VUCOMISD(_0, _1)
#define vcomiss(_0, _1) VCOMISS(_0, _1)
#define vcomisd(_0, _1) VCOMISD(_0, _1)
#define vfmadd132ss(_0, _1, _2) VFMADD132SS(_0, _1, _2)
#define vfmadd213ss(_0, _1, _2) VFMADD213SS(_0, _1, _2)
#define vfmadd231ss(_0, _1, _2) VFMADD231SS(_0, _1, _2)
#define vfmadd132sd(_0, _1, _2) VFMADD132SD(_0, _1, _2)
#define vfmadd213sd(_0, _1, _2) VFMADD213SD(_0, _1, _2)
#define vfmadd231sd(_0, _1, _2) VFMADD231SD(_0, _1, _2)
#define vfmadd132ps(_0, _1, _2) VFMADD132PS(_0, _1, _2)
#define vfmadd213ps(_0, _1, _2) VFMADD213PS(_0, _1, _2)
#define vfmadd231ps(_0, _1, _2) VFMADD231PS(_0, _1, _2)
#define vfmadd132pd(_0, _1, _2) VFMADD132PD(_0, _1, _2)
#define vfmadd213pd(_0, _1, _2) VFMADD213PD(_0, _1, _2)
#define vfmadd231pd(_0, _1, _2) VFMADD231PD(_0, _1, _2)
#define vfmsub132ss(_0, _1, _2) VFMSUB132SS(_0, _1, _2)
#define vfmsub213ss(_0, _1, _2) VFMSUB213SS(_0, _1, _2)
#define vfmsub231ss(_0, _1, _2) VFMSUB231SS(_0, _1, _2)
#define vfmsub132sd(_0, _1, _2) VFMSUB132SD(_0, _1, _2)
#define vfmsub213sd(_0, _1, _2) VFMSUB213SD(_0, _1, _2)
#define vfmsub231sd(_0, _1, _2) VFMSUB231SD(_0, _1, _2)
#define vfmsub132ps(_0, _1, _2) VFMSUB132PS(_0, _1, _2)
#define vfmsub213ps(_0, _1, _2) VFMSUB213PS(_0, _1, _2)
#define vfmsub231ps(_0, _1, _2) VFMSUB231PS(_0, _1, _2)
#define vfmsub132pd(_0, _1, _2) VFMSUB132PD(_0, _1, _2)
#define vfmsub213pd(_0, _1, _2) VFMSUB213PD(_0, _1, _2)
#define vfmsub231pd(_0, _1, _2) VFMSUB231PD(_0, _1, _2)
#define vfnmadd132ss(_0, _1, _2) VFNMADD132SS(_0, _1, _2)
#define vfnmadd213ss(_0, _1, _2) VFNMADD213SS(_0, _1, _2)
#define vfnmadd231ss(_0, _1, _2) VFNMADD231SS(_0, _1, _2)
#define vfnmadd132sd(_0, _1, _2) VFNMADD132SD(_0, _1, _2)
#define vfnmadd213sd(_0, _1, _2) VFNMADD213SD(_0, _1, _2) #define vfnmadd231sd(_0, _1, _2) VFNMADD231SD(_0, _1, _2) #define vfnmadd132ps(_0, _1, _2) VFNMADD132PS(_0, _1, _2) #define vfnmadd213ps(_0, _1, _2) VFNMADD213PS(_0, _1, _2) #define vfnmadd231ps(_0, _1, _2) VFNMADD231PS(_0, _1, _2) #define vfnmadd132pd(_0, _1, _2) VFNMADD132PD(_0, _1, _2) #define vfnmadd213pd(_0, _1, _2) VFNMADD213PD(_0, _1, _2) #define vfnmadd231pd(_0, _1, _2) VFNMADD231PD(_0, _1, _2) #define vfnmadd132ss(_0, _1, _2) VFNMADD132SS(_0, _1, _2) #define vfnmsub213ss(_0, _1, _2) VFNMSUB213SS(_0, _1, _2) #define vfnmsub231ss(_0, _1, _2) VFNMSUB231SS(_0, _1, _2) #define vfnmsub132sd(_0, _1, _2) VFNMSUB132SD(_0, _1, _2) #define vfnmsub213sd(_0, _1, _2) VFNMSUB213SD(_0, _1, _2) #define vfnmsub231sd(_0, _1, _2) VFNMSUB231SD(_0, _1, _2) #define vfnmsub132ps(_0, _1, _2) VFNMSUB132PS(_0, _1, _2) #define vfnmsub213ps(_0, _1, _2) VFNMSUB213PS(_0, _1, _2) #define vfnmsub231ps(_0, _1, _2) VFNMSUB231PS(_0, _1, _2) #define vfnmsub132pd(_0, _1, _2) VFNMSUB132PD(_0, _1, _2) #define vfnmsub213pd(_0, _1, _2) VFNMSUB213PD(_0, _1, _2) #define vfnmsub231pd(_0, _1, _2) VFNMSUB231PD(_0, _1, _2) #define vfmaddsub132ss(_0, _1, _2) VFMADDSUB132SS(_0, _1, _2) #define vfmaddsub213ss(_0, _1, _2) VFMADDSUB213SS(_0, _1, _2) #define vfmaddsub231ss(_0, _1, _2) VFMADDSUB231SS(_0, _1, _2) #define vfmaddsub132sd(_0, _1, _2) VFMADDSUB132SD(_0, _1, _2) #define vfmaddsub213sd(_0, _1, _2) VFMADDSUB213SD(_0, _1, _2) #define vfmaddsub231sd(_0, _1, _2) VFMADDSUB231SD(_0, _1, _2) #define vfmaddsub132ps(_0, _1, _2) VFMADDSUB132PS(_0, _1, _2) #define vfmaddsub213ps(_0, _1, _2) VFMADDSUB213PS(_0, _1, _2) #define vfmaddsub231ps(_0, _1, _2) VFMADDSUB231PS(_0, _1, _2) #define vfmaddsub132pd(_0, _1, _2) VFMADDSUB132PD(_0, _1, _2) #define vfmaddsub213pd(_0, _1, _2) VFMADDSUB213PD(_0, _1, _2) #define vfmaddsub231pd(_0, _1, _2) VFMADDSUB231PD(_0, _1, _2) #define vfmsubadd132ss(_0, _1, _2) VFMSUBADD132SS(_0, _1, _2) #define vfmsubadd213ss(_0, _1, 
_2) VFMSUBADD213SS(_0, _1, _2) #define vfmsubadd231ss(_0, _1, _2) VFMSUBADD231SS(_0, _1, _2) #define vfmsubadd132sd(_0, _1, _2) VFMSUBADD132SD(_0, _1, _2) #define vfmsubadd213sd(_0, _1, _2) VFMSUBADD213SD(_0, _1, _2) #define vfmsubadd231sd(_0, _1, _2) VFMSUBADD231SD(_0, _1, _2) #define vfmsubadd132ps(_0, _1, _2) VFMSUBADD132PS(_0, _1, _2) #define vfmsubadd213ps(_0, _1, _2) VFMSUBADD213PS(_0, _1, _2) #define vfmsubadd231ps(_0, _1, _2) VFMSUBADD231PS(_0, _1, _2) #define vfmsubadd132pd(_0, _1, _2) VFMSUBADD132PD(_0, _1, _2) #define vfmsubadd213pd(_0, _1, _2) VFMSUBADD213PD(_0, _1, _2) #define vfmsubadd231pd(_0, _1, _2) VFMSUBADD231PD(_0, _1, _2) #define vfmaddss(_0, _1, _2, _3) VFMADDSS(_0, _1, _2, _3) #define vfmaddsd(_0, _1, _2, _3) VFMADDSD(_0, _1, _2, _3) #define vfmaddps(_0, _1, _2, _3) VFMADDPS(_0, _1, _2, _3) #define vfmaddpd(_0, _1, _2, _3) VFMADDPD(_0, _1, _2, _3) #define vfmsubss(_0, _1, _2, _3) VFMSUBSS(_0, _1, _2, _3) #define vfmsubsd(_0, _1, _2, _3) VFMSUBSD(_0, _1, _2, _3) #define vfmsubps(_0, _1, _2, _3) VFMSUBPS(_0, _1, _2, _3) #define vfmsubpd(_0, _1, _2, _3) VFMSUBPD(_0, _1, _2, _3) #define vfnmaddss(_0, _1, _2, _3) VFNMADDSS(_0, _1, _2, _3) #define vfnmaddsd(_0, _1, _2, _3) VFNMADDSD(_0, _1, _2, _3) #define vfnmaddps(_0, _1, _2, _3) VFNMADDPS(_0, _1, _2, _3) #define vfnmaddpd(_0, _1, _2, _3) VFNMADDPD(_0, _1, _2, _3) #define vfnmsubss(_0, _1, _2, _3) VFNMSUBSS(_0, _1, _2, _3) #define vfnmsubsd(_0, _1, _2, _3) VFNMSUBSD(_0, _1, _2, _3) #define vfnmsubps(_0, _1, _2, _3) VFNMSUBPS(_0, _1, _2, _3) #define vfnmsubpd(_0, _1, _2, _3) VFNMSUBPD(_0, _1, _2, _3) #define vfmaddsubss(_0, _1, _2, _3) VFMADDSUBSS(_0, _1, _2, _3) #define vfmaddsubsd(_0, _1, _2, _3) VFMADDSUBSD(_0, _1, _2, _3) #define vfmaddsubps(_0, _1, _2, _3) VFMADDSUBPS(_0, _1, _2, _3) #define vfmaddsubpd(_0, _1, _2, _3) VFMADDSUBPD(_0, _1, _2, _3) #define vfmsubaddss(_0, _1, _2, _3) VFMSUBADDSS(_0, _1, _2, _3) #define vfmsubaddsd(_0, _1, _2, _3) VFMSUBADDSD(_0, _1, _2, _3) #define 
vfmsubaddps(_0, _1, _2, _3) VFMSUBADDPS(_0, _1, _2, _3) #define vfmsubaddpd(_0, _1, _2, _3) VFMSUBADDPD(_0, _1, _2, _3) #define v4fmaddss(_0, _1, _2) V4FMADDSS(_0, _1, _2) #define v4fmaddps(_0, _1, _2) V4FMADDPS(_0, _1, _2) #define v4fnmaddss(_0, _1, _2) V4FNMADDSS(_0, _1, _2) #define v4fnmaddps(_0, _1, _2) V4FNMADDPS(_0, _1, _2) // Conversions #define CVTSS2SD(_0, _1) INSTR_(cvtss2sd, _0, _1) #define CVTSD2SS(_0, _1) INSTR_(cvtsd2ss, _0, _1) #define CVTPS2PD(_0, _1) INSTR_(cvtps2pd, _0, _1) #define CVTPD2PS(_0, _1) INSTR_(cvtpd2ps, _0, _1) #define cvtss2sd(_0, _1) CVTSS2SD(_0, _1) #define cvtsd2ss(_0, _1) CVTSD2SS(_0, _1) #define cvtps2pd(_0, _1) CVTPS2PD(_0, _1) #define cvtpd2ps(_0, _1) CVTPD2PS(_0, _1) #define VCVTSS2SD(_0, _1) INSTR_(vcvtss2sd, _0, _1) #define VCVTSD2SS(_0, _1) INSTR_(vcvtsd2ss, _0, _1) #define VCVTPS2PD(_0, _1) INSTR_(vcvtps2pd, _0, _1) #define VCVTPD2PS(_0, _1) INSTR_(vcvtpd2ps, _0, _1) #define vcvtss2sd(_0, _1) VCVTSS2SD(_0, _1) #define vcvtsd2ss(_0, _1) VCVTSD2SS(_0, _1) #define vcvtps2pd(_0, _1) VCVTPS2PD(_0, _1) #define vcvtpd2ps(_0, _1) VCVTPD2PS(_0, _1) // Vector shuffles #define PSHUFD(_0, _1, _2) INSTR_(pshufd, _0, _1, _2) #define SHUFPS(_0, _1, _2) INSTR_(shufps, _0, _1, _2) #define SHUFPD(_0, _1, _2) INSTR_(shufpd, _0, _1, _2) #define UNPCKLPS(_0, _1) INSTR_(unpcklps, _0, _1) #define UNPCKHPS(_0, _1) INSTR_(unpckhps, _0, _1) #define UNPCKLPD(_0, _1) INSTR_(unpcklpd, _0, _1) #define UNPCKHPD(_0, _1) INSTR_(unpckhpd, _0, _1) #define pshufd(_0, _1, _2) PSHUFD(_0, _1, _2) #define shufps(_0, _1, _2) SHUFPS(_0, _1, _2) #define shufpd(_0, _1, _2) SHUFPD(_0, _1, _2) #define unpcklps(_0, _1) UNPCKLPS(_0, _1) #define unpckhps(_0, _1) UNPCKHPS(_0, _1) #define unpcklpd(_0, _1) UNPCKLPD(_0, _1) #define unpckhpd(_0, _1) UNPCKHPD(_0, _1) #define VSHUFPS(_0, _1, _2, _3) INSTR_(vshufps, _0, _1, _2, _3) #define VSHUFPD(_0, _1, _2, _3) INSTR_(vshufpd, _0, _1, _2, _3) #define VPERMILPS(_0, _1, _2) INSTR_(vpermilps, _0, _1, _2) #define VPERMILPD(_0, _1, 
_2) INSTR_(vpermilpd, _0, _1, _2) #define VPERM2F128(_0, _1, _2, _3) INSTR_(vperm2f128, _0, _1, _2, _3) #define VPERMPD(_0, _1, _2) INSTR_(vpermpd, _0, _1, _2) #define VUNPCKLPS(_0, _1, _2) INSTR_(vunpcklps, _0, _1, _2) #define VUNPCKHPS(_0, _1, _2) INSTR_(vunpckhps, _0, _1, _2) #define VUNPCKLPD(_0, _1, _2) INSTR_(vunpcklpd, _0, _1, _2) #define VUNPCKHPD(_0, _1, _2) INSTR_(vunpckhpd, _0, _1, _2) #define VSHUFF32X4(_0, _1, _2, _3) INSTR_(vshuff32x4, _0, _1, _2, _3) #define VSHUFF64X2(_0, _1, _2, _3) INSTR_(vshuff64x2, _0, _1, _2, _3) #define VINSERTF128(_0, _1, _2, _3) INSTR_(vinsertf128, _0, _1, _2, _3) #define VINSERTF32X4(_0, _1, _2, _3) INSTR_(vinsertf32x4, _0, _1, _2, _3) #define VINSERTF32X8(_0, _1, _2, _3) INSTR_(vinsertf32x8, _0, _1, _2, _3) #define VINSERTF64X2(_0, _1, _2, _3) INSTR_(vinsertf64x2, _0, _1, _2, _3) #define VINSERTF64X4(_0, _1, _2, _3) INSTR_(vinsertf64x4, _0, _1, _2, _3) #define VEXTRACTF128(_0, _1, _2) INSTR_(vextractf128, _0, _1, _2) #define VEXTRACTF32X4(_0, _1, _2) INSTR_(vextractf32x4, _0, _1, _2) #define VEXTRACTF32X8(_0, _1, _2) INSTR_(vextractf32x8, _0, _1, _2) #define VEXTRACTF64X2(_0, _1, _2) INSTR_(vextractf64x4, _0, _1, _2) #define VEXTRACTF64X4(_0, _1, _2) INSTR_(vextractf64x4, _0, _1, _2) #define VBLENDPS(_0, _1, _2, _3) INSTR_(vblendps, _0, _1, _2, _3) #define VBLENDPD(_0, _1, _2, _3) INSTR_(vblendpd, _0, _1, _2, _3) #define VBLENDMPS(_0, _1, _2) INSTR_(vblendmps, _0, _1, _2) #define VBLENDMPD(_0, _1, _2) INSTR_(vblendmpd, _0, _1, _2) #define vshufps(_0, _1, _2, _3) VSHUFPS(_0, _1, _2, _3) #define vshufpd(_0, _1, _2, _3) VSHUFPD(_0, _1, _2, _3) #define vpermilps(_0, _1, _2) VPERMILPS(_0, _1, _2) #define vpermilpd(_0, _1, _2) VPERMILPD(_0, _1, _2) #define vperm2f128(_0, _1, _2, _3) VPERM2F128(_0, _1, _2, _3) #define vpermpd(_0, _1, _2) VPERMPD(_0, _1, _2) #define vunpcklps(_0, _1, _2) VUNPCKLPS(_0, _1, _2) #define vunpckhps(_0, _1, _2) VUNPCKHPS(_0, _1, _2) #define vunpcklpd(_0, _1, _2) VUNPCKLPD(_0, _1, _2) #define 
vunpckhpd(_0, _1, _2) VUNPCKHPD(_0, _1, _2) #define vshuff32x4(_0, _1, _2, _3) VSHUFF32x4(_0, _1, _2, _3) #define vshuff64x2(_0, _1, _2, _3) VSHUFF64x2(_0, _1, _2, _3) #define vinsertf128(_0, _1, _2, _3) VINSERTF128(_0, _1, _2, _3) #define vinsertf32x4(_0, _1, _2, _3) VINSERTF32x4(_0, _1, _2, _3) #define vinsertf32x8(_0, _1, _2, _3) VINSERTF32x8(_0, _1, _2, _3) #define vinsertf64x2(_0, _1, _2, _3) VINSERTF64x2(_0, _1, _2, _3) #define vinsertf64x4(_0, _1, _2, _3) VINSERTF64x4(_0, _1, _2, _3) #define vextractf128(_0, _1, _2) VEXTRACTF128(_0, _1, _2) #define vextractf32x4(_0, _1, _2) VEXTRACTF32x4(_0, _1, _2) #define vextractf32x8(_0, _1, _2) VEXTRACTF32x8(_0, _1, _2) #define vextractf64x2(_0, _1, _2) VEXTRACTF64x2(_0, _1, _2) #define vextractf64x4(_0, _1, _2) VEXTRACTF64x4(_0, _1, _2) #define vblendps(_0, _1, _2, _3) VBLENDPS(_0, _1, _2, _3) #define vblendpd(_0, _1, _2, _3) VBLENDPD(_0, _1, _2, _3) #define vblendmps(_0, _1, _2) VBLENDMSD(_0, _1, _2) #define vblendmpd(_0, _1, _2) VBLENDMPD(_0, _1, _2) // Prefetches #define PREFETCH(_0, _1) INSTR_(prefetcht##_0, _1) #define PREFETCHW0(_0) INSTR_(prefetchw, _0) #define PREFETCHW1(_0) INSTR_(prefetchwt1, _0) #define VGATHERPFDPS(_0, _1) INSTR_(vgatherpf##_0##dps, _1) #define VSCATTERPFDPS(_0, _1) INSTR_(vscatterpf##_0##dps, _1) #define VGATHERPFDPD(_0, _1) INSTR_(vgatherpf##_0##dpd, _1) #define VSCATTERPFDPD(_0, _1) INSTR_(vscatterpf##_0##dpd, _1) #define VGATHERPFQPS(_0, _1) INSTR_(vgatherpf##_0##qps, _1) #define VSCATTERPFQPS(_0, _1) INSTR_(vscatterpf##_0##qps, _1) #define VGATHERPFQPD(_0, _1) INSTR_(vgatherpf##_0##qpd, _1) #define VSCATTERPFQPD(_0, _1) INSTR_(vscatterpf##_0##qpd, _1) #define prefetch(_0, _1) PREFETCH(_0, _1) #define prefetchw0(_0) PREFETCHW0(_0) #define prefetchw1(_0) PREFETCHW1(_0) #define vgatherpfdps(_0, _1) VGATHERPFDPS(_0, _1) #define vscatterpfdps(_0, _1) VSCATTERPFDPS(_0, _1) #define vgatherpfdpd(_0, _1) VGATHERPFDPD(_0, _1) #define vscatterpfdpd(_0, _1) VSCATTERPFDPD(_0, _1) #define 
vgatherpfqps(_0, _1) VGATHERPFQPS(_0, _1) #define vscatterpfqps(_0, _1) VSCATTERPFQPS(_0, _1) #define vgatherpfqpd(_0, _1) VGATHERPFQPD(_0, _1) #define vscatterpfqpd(_0, _1) VSCATTERPFQPD(_0, _1) // Mask operations #ifdef __MIC__ #define KMOVW(_0, _1) INSTR_(kmov, _0, _1) #define JKNZD(_0, _1) INSTR_(jknzd, _0, _1) #else #define KMOVW(_0, _1) INSTR_(kmovw, _0, _1) #define JKNZD(_0, _1) INSTR_(kortestw, _0, _0) INSTR_(jnz, _1) #endif #define KXNORW(_0, _1, _2) INSTR_(kxnorw, _0, _1, _2) #define KSHIFTRW(_0, _1, _2) INSTR_(kshiftrw, _0, _1, _2) #define kmovw(_0, _1) KMOVW(_0, _1) #define jknzd(_0, _1) JKNZD(_0, _1) #define kxnorw(_0, _1, _2) KXNORW(_0, _1, _2) #define kshiftrw(_0, _1, _2) KSHIFTRW(_0, _1, _2) // Other #define RDTSC() INSTR_(rdtsc) #define VZEROALL() INSTR_(vzeroall) #define VZEROUPPER() INSTR_(vzeroupper) #define rdtsc() RDTSC() #define vzeroall() VZEROALL() #define vzeroupper() VZEROUPPER() #endif cython-blis-0.9.1/blis/_src/frame/include/bli_xapi_undef.h000066400000000000000000000045711427272030600235030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS cython-blis-0.9.1/blis/_src/frame/include/blis.h000066400000000000000000000125601427272030600214610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. 
// It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). #include "bli_config.h" // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. #include "bli_system.h" #include "bli_lang_defs.h" // -- configure default definitions -- #include "bli_config_macro_defs.h" // -- Common BLIS definitions -- #include "bli_type_defs.h" #include "bli_macro_defs.h" // -- pragma definitions -- #include "bli_pragma_macro_defs.h" // -- Threading definitions -- #include "bli_thread.h" #include "bli_pthread.h" // -- Constant definitions -- #include "bli_extern_defs.h" // -- BLIS architecture/kernel definitions -- #include "bli_l1v_ker_prot.h" #include "bli_l1f_ker_prot.h" #include "bli_l1m_ker_prot.h" #include "bli_l3_ukr_prot.h" #include "bli_l3_sup_ker_prot.h" #include "bli_arch_config_pre.h" #include "bli_arch_config.h" #include "bli_kernel_macro_defs.h" // -- Base operation prototypes -- #include "bli_init.h" #include "bli_malloc.h" #include "bli_const.h" #include "bli_obj.h" #include "bli_obj_scalar.h" #include "bli_blksz.h" #include "bli_func.h" #include "bli_mbool.h" #include "bli_cntx.h" #include "bli_rntm.h" #include "bli_gks.h" #include "bli_ind.h" #include "bli_pba.h" #include "bli_pool.h" #include "bli_array.h" #include "bli_apool.h" #include "bli_sba.h" #include "bli_memsys.h" #include "bli_mem.h" #include "bli_part.h" #include "bli_prune.h" #include "bli_query.h" #include "bli_auxinfo.h" #include "bli_param_map.h" #include "bli_clock.h" #include "bli_check.h" #include "bli_error.h" #include "bli_f2c.h" #include "bli_machval.h" #include "bli_getopt.h" #include "bli_opid.h" #include "bli_cntl.h" #include "bli_env.h" #include "bli_pack.h" #include "bli_info.h" #include "bli_arch.h" #include "bli_cpuid.h" #include "bli_string.h" 
#include "bli_setgetijm.h" #include "bli_setgetijv.h" #include "bli_setri.h" #include "bli_castm.h" #include "bli_castnzm.h" #include "bli_castv.h" #include "bli_projm.h" #include "bli_projv.h" // -- Level-0 operations -- #include "bli_l0.h" // -- Level-1v operations -- #include "bli_l1v.h" // -- Level-1d operations -- #include "bli_l1d.h" // -- Level-1f operations -- #include "bli_l1f.h" // -- Level-1m operations -- #include "bli_l1m.h" // -- Level-2 operations -- #include "bli_l2.h" // -- Level-3 operations -- #include "bli_l3.h" // -- Utility operations -- #include "bli_util.h" // -- addon definitions -- // NOTE: These definitions should not be included much earlier since an addon // may wish to utilize other types and definitions provided by BLIS. #include "bli_addon.h" // -- sandbox implementation -- #include "bli_sbox.h" // -- BLAS compatibility layer -- #include "bli_blas.h" // -- CBLAS compatibility layer -- #include "bli_cblas.h" // -- Windows definitions #include "bli_winsys.h" // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/frame/include/level0/000077500000000000000000000000001427272030600215425ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/include/level0/1e/000077500000000000000000000000001427272030600220475ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/include/level0/1e/bli_copy1es.h000066400000000000000000000063221427272030600244340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_COPY1ES_H #define BLIS_COPY1ES_H // copy1es // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopy1es( a, bri, bir ) {} #define bli_dscopy1es( a, bri, bir ) {} #define bli_cscopy1es( a, bri, bir ) {} #define bli_zscopy1es( a, bri, bir ) {} #define bli_sdcopy1es( a, bri, bir ) {} #define bli_ddcopy1es( a, bri, bir ) {} #define bli_cdcopy1es( a, bri, bir ) {} #define bli_zdcopy1es( a, bri, bir ) {} #define bli_sccopy1es( a, bri, bir ) {} #define bli_dccopy1es( a, bri, bir ) {} #define bli_cccopy1es( a, bri, bir ) \ { \ bli_cccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_zccopy1es( a, bri, bir ) \ { \ bli_zccopyris( bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_szcopy1es( a, bri, bir ) {} #define bli_dzcopy1es( a, bri, bir ) {} #define bli_czcopy1es( a, bri, bir ) \ { \ bli_czcopyris( bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_zzcopy1es( a, bri, bir ) \ { \ bli_zzcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir ) #define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1e/bli_copyj1es.h000066400000000000000000000063511427272030600246100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_COPYJ1ES_H #define BLIS_COPYJ1ES_H // copyj1es // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopyj1es( a, bri, bir ) {} #define bli_dscopyj1es( a, bri, bir ) {} #define bli_cscopyj1es( a, bri, bir ) {} #define bli_zscopyj1es( a, bri, bir ) {} #define bli_sdcopyj1es( a, bri, bir ) {} #define bli_ddcopyj1es( a, bri, bir ) {} #define bli_cdcopyj1es( a, bri, bir ) {} #define bli_zdcopyj1es( a, bri, bir ) {} #define bli_sccopyj1es( a, bri, bir ) {} #define bli_dccopyj1es( a, bri, bir ) {} #define bli_cccopyj1es( a, bri, bir ) \ { \ bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_cccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_zccopyj1es( a, bri, bir ) \ { \ bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_zccopyris( bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_szcopyj1es( a, bri, bir ) {} #define bli_dzcopyj1es( a, bri, bir ) {} #define bli_czcopyj1es( a, bri, bir ) \ { \ bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_czcopyris( bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_zzcopyj1es( a, bri, bir ) \ { \ bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_zzcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir ) #define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1e/bli_invert1es.h000066400000000000000000000040741427272030600247730ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_INVERT1ES_H #define BLIS_INVERT1ES_H // invert1es #define bli_cinvert1es( bri, bir ) \ { \ bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \ bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \ } #define bli_zinvert1es( bri, bir ) \ { \ bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \ bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1e/bli_scal1es.h000066400000000000000000000041661427272030600244100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL1ES_H #define BLIS_SCAL1ES_H // scal1es #define bli_cscal1es( a, yri, yir ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \ bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \ } #define bli_zscal1es( a, yri, yir ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \ bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1e/bli_scal21es.h000066400000000000000000000246431427272030600244740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL21ES_H #define BLIS_SCAL21ES_H // scal21es // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal21es( a, x, yri, yir ) {} #define bli_sdsscal21es( a, x, yri, yir ) {} #define bli_scsscal21es( a, x, yri, yir ) {} #define bli_szsscal21es( a, x, yri, yir ) {} #define bli_dssscal21es( a, x, yri, yir ) {} #define bli_ddsscal21es( a, x, yri, yir ) {} #define bli_dcsscal21es( a, x, yri, yir ) {} #define bli_dzsscal21es( a, x, yri, yir ) {} #define bli_cssscal21es( a, x, yri, yir ) {} #define bli_cdsscal21es( a, x, yri, yir ) {} #define bli_ccsscal21es( a, x, yri, yir ) {} #define bli_czsscal21es( a, x, yri, yir ) {} #define bli_zssscal21es( a, x, yri, yir ) {} #define bli_zdsscal21es( a, x, yri, yir ) {} #define bli_zcsscal21es( a, x, yri, yir ) {} #define bli_zzsscal21es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal21es( a, x, yri, yir ) {} #define bli_sddscal21es( a, x, yri, yir ) {} #define bli_scdscal21es( a, x, yri, yir ) {} #define 
bli_szdscal21es( a, x, yri, yir ) {} #define bli_dsdscal21es( a, x, yri, yir ) {} #define bli_dddscal21es( a, x, yri, yir ) {} #define bli_dcdscal21es( a, x, yri, yir ) {} #define bli_dzdscal21es( a, x, yri, yir ) {} #define bli_csdscal21es( a, x, yri, yir ) {} #define bli_cddscal21es( a, x, yri, yir ) {} #define bli_ccdscal21es( a, x, yri, yir ) {} #define bli_czdscal21es( a, x, yri, yir ) {} #define bli_zsdscal21es( a, x, yri, yir ) {} #define bli_zddscal21es( a, x, yri, yir ) {} #define bli_zcdscal21es( a, x, yri, yir ) {} #define bli_zzdscal21es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal21es( a, x, yri, yir ) {} #define bli_sdcscal21es( a, x, yri, yir ) {} #define bli_sccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal21es( a, x, yri, yir ) {} #define bli_ddcscal21es( a, x, yri, yir ) {} #define bli_dccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), 
bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal21es( a, x, yri, yir ) {} #define bli_sdzscal21es( a, x, yri, yir ) {} #define bli_sczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal21es( a, x, yri, yir ) {} #define bli_ddzscal21es( a, x, yri, yir ) {} #define bli_dczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir ) #define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1e/bli_scal2j1es.h000066400000000000000000000247521427272030600246470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL2J1ES_H #define BLIS_SCAL2J1ES_H // scal2j1es // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

/*
   Mixed-datatype scal2j1es variants.

   In each name bli_<a><x><y>scal2j1es the three type letters select the
   accessor macros (bli_?real / bli_?imag) applied to a, x, and the two y
   operands respectively. Every variant defined below accesses yri and yir
   through bli_zreal / bli_zimag.

   Each macro issues two bli_cxscal2ris() calls that share the same a:
     1. x is passed as ( real(x), -imag(x) ) and the result goes to yri;
     2. x is passed as ( imag(x),  real(x) ) and the result goes to yir.

   NOTE(review): bli_cxscal2ris is defined elsewhere; presumably it forms
   the complex product of its first two operand pairs and stores it in the
   third -- confirm against its definition.
*/
#define bli_dzzscal2j1es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

/* a accessed with the c accessors; x accessed with s, d, c, z in turn. */
#define bli_cszscal2j1es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdzscal2j1es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cczscal2j1es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czzscal2j1es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

/* a accessed with the z accessors; x accessed with s, d, c, z in turn. */
#define bli_zszscal2j1es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdzscal2j1es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zczscal2j1es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzzscal2j1es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

/* Same-datatype shorthands: forward to the all-c and all-z variants. */
#define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir )
#define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/1m/000077500000000000000000000000001427272030600220575ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/include/level0/1m/bli_invert1ms_mxn_diag.h000066400000000000000000000106271427272030600266620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ /* Handle 1e and 1r separately. */ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. 
*/ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ /* Handle 1e and 1r separately. */ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. */ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1m/bli_scal1ms_mxn.h000066400000000000000000000075461427272030600253170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_SCAL1MS_MXN_H
#define BLIS_SCAL1MS_MXN_H

// scal1ms_mxn

/*
   bli_cscal1ms_mxn

   Visits every element (i,j) of the m x n matrix y, column index j in the
   outer loop, and passes *a together with the element's two storage slots
   to bli_cscal1es (when schema is 1e-packed) or bli_cscal1rs (otherwise;
   assumed 1r-packed).

   - 1e: y is addressed as scomplex with strides rs_y / cs_y; the second
     slot lies ld_y/2 scomplex elements past the first.
   - 1r: y is addressed as float; the non-unit stride is doubled and the
     second slot lies ld_y floats past the first.

   NOTE(review): the element macros are defined elsewhere; presumably they
   scale the stored value by *a in place -- confirm.

   Macro parameters used inside expressions are parenthesized so argument
   expressions expand with the intended precedence. The macro declares
   locals named i, j, y_ri, y_ir, rs_y2, cs_y2, y_cast, y_r and y_i, so the
   arguments must not refer to identifiers with those names. Arguments may
   be evaluated more than once and should be free of side effects.
*/
#define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t i, j; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		scomplex* restrict y_ri = (y); \
		scomplex* restrict y_ir = (y) + (ld_y)/2; \
\
		for ( j = 0; j < (n); ++j ) \
		for ( i = 0; i < (m); ++i ) \
		{ \
			bli_cscal1es( *(a), \
			              *(y_ri + i*(rs_y) + j*(cs_y)), \
			              *(y_ir + i*(rs_y) + j*(cs_y)) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = (rs_y); \
		inc_t cs_y2 = (cs_y); \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. */ \
		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		float* restrict y_cast = ( float* )(y); \
		float* restrict y_r    = y_cast; \
		float* restrict y_i    = y_cast + (ld_y); \
\
		for ( j = 0; j < (n); ++j ) \
		for ( i = 0; i < (m); ++i ) \
		{ \
			bli_cscal1rs( *(a), \
			              *(y_r + i*rs_y2 + j*cs_y2), \
			              *(y_i + i*rs_y2 + j*cs_y2) ); \
		} \
	} \
}

/*
   bli_zscal1ms_mxn

   Double-precision counterpart of the macro above: identical control flow,
   using dcomplex / double storage and the bli_zscal1es / bli_zscal1rs
   element macros. The same parenthesization and local-name caveats apply.
*/
#define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t i, j; \
\
	/* Handle 1e and 1r separately. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		dcomplex* restrict y_ri = (y); \
		dcomplex* restrict y_ir = (y) + (ld_y)/2; \
\
		for ( j = 0; j < (n); ++j ) \
		for ( i = 0; i < (m); ++i ) \
		{ \
			bli_zscal1es( *(a), \
			              *(y_ri + i*(rs_y) + j*(cs_y)), \
			              *(y_ir + i*(rs_y) + j*(cs_y)) ); \
		} \
	} \
	else /* if ( bli_is_1r_packed( schema ) ) */ \
	{ \
		inc_t rs_y2 = (rs_y); \
		inc_t cs_y2 = (cs_y); \
\
		/* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. */ \
		if         ( rs_y2 == 1 )    { cs_y2 *= 2; } \
		else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \
\
		double* restrict y_cast = ( double* )(y); \
		double* restrict y_r    = y_cast; \
		double* restrict y_i    = y_cast + (ld_y); \
\
		for ( j = 0; j < (n); ++j ) \
		for ( i = 0; i < (m); ++i ) \
		{ \
			bli_zscal1rs( *(a), \
			              *(y_r + i*rs_y2 + j*cs_y2), \
			              *(y_i + i*rs_y2 + j*cs_y2) ); \
		} \
	} \
}

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/1m/bli_scal21ms_mxn.h000066400000000000000000000134551427272030600253750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; /* Handle 1e and 1r separately. */ if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else /* if ( bli_is_noconj( conjx ) ) */ { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else /* if ( bli_is_1r_packed( schema ) ) */ { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. 
*/ if ( rs_y2 == 1 ) { cs_y2 *= 2; } else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else /* if ( bli_is_noconj( conjx ) ) */ { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; /* Handle 1e and 1r separately. */ if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else /* if ( bli_is_noconj( conjx ) ) */ { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else /* if ( bli_is_1r_packed( schema ) ) */ { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. 
*/ if ( rs_y2 == 1 ) { cs_y2 *= 2; } else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else /* if ( bli_is_noconj( conjx ) ) */ { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1m/bli_scal21ms_mxn_diag.h000066400000000000000000000102701427272030600263510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ /* Handle 1e and 1r separately. */ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. */ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ /* Handle 1e and 1r separately. 
*/ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. */ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1m/bli_scal21ms_mxn_uplo.h000066400000000000000000000210021427272030600264170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ /* Handle 1e and 1r separately. */ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + 
j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. */ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ /* Handle 1e and 1r separately. 
*/ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. 
*/ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1m/bli_set1ms_mxn.h000066400000000000000000000145311427272030600251600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ /* Include real domain version to facilitate macro-izing mixed-datatype components of packm. */ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ /* Include real domain version to facilitate macro-izing mixed-datatype components of packm. */ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; /* Optimization: The loops walk through y with unit stride if y is column-stored. If y is row-stored, swap the dimensions and strides to preserve unit stride movement. 
*/ if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } /* Handle 1e and 1r separately. */ if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else /* if ( bli_is_1r_packed( schema ) ) */ { /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. */ if ( rs_y2 == 1 ) { cs_y2 *= 2; } else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; /* Optimization: The loops walk through y with unit stride if y is column-stored. If y is row-stored, swap the dimensions and strides to preserve unit stride movement. */ if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } /* Handle 1e and 1r separately. 
*/ if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else /* if ( bli_is_1r_packed( schema ) ) */ { /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. */ if ( rs_y2 == 1 ) { cs_y2 *= 2; } else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1m/bli_set1ms_mxn_diag.h000066400000000000000000000107421427272030600261440ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ /* Handle 1e and 1r separately. */ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. 
*/ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ /* Handle 1e and 1r separately. */ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. */ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1m/bli_set1ms_mxn_uplo.h000066400000000000000000000137641427272030600262260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ /* Handle 1e and 1r separately. */ \ if ( bli_is_1e_packed( schema ) ) \ { \ /* Set the off-diagonal increment. 
*/ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. */ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ \ /* Set the off-diagonal increment. */ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ /* Handle 1e and 1r separately. */ \ if ( bli_is_1e_packed( schema ) ) \ { \ /* Set the off-diagonal increment. 
*/ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. */ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ \ /* Set the off-diagonal increment. */ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else /* if ( diagoff < 0 ) */ offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1m/bli_seti01ms_mxn_diag.h000066400000000000000000000071661427272030600264030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ /* Handle 1e and 1r separately. 
*/ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. */ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ /* Handle 1e and 1r separately. */ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ /* Scale the non-unit stride by two for the 1r loop, which steps in units of real (not complex) values. 
*/ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else /* if ( cs_y2 == 1 ) */ { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1r/000077500000000000000000000000001427272030600220645ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/include/level0/1r/bli_copy1rs.h000066400000000000000000000036201427272030600244640ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1r/bli_copyj1rs.h000066400000000000000000000036271427272030600246450ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1r/bli_invert1rs.h000066400000000000000000000035101427272030600250170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1r/bli_scal1rs.h000066400000000000000000000041301427272030600244310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1r/bli_scal21rs.h000066400000000000000000000045651427272030600245270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/1r/bli_scal2j1rs.h000066400000000000000000000046001427272030600246670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bb/000077500000000000000000000000001427272030600221255ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/include/level0/bb/bli_bcastbbs_mxn.h000066400000000000000000000046601427272030600255770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_BCASTBBS_MXN_H #define BLIS_BCASTBBS_MXN_H // bcastbbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ /* Assume that the duplication factor is the column stride of y. */ \ const dim_t d = ldy; \ const dim_t ds_y = 1; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yi = y + i*incy; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yij = yi + j*ldy; \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( bcastbbs_mxn ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bb/bli_scal2bbs_mxn.h000066400000000000000000000143051427272030600255040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL2BBS_MXN_H #define BLIS_SCAL2BBS_MXN_H // scal2bbs_mxn #undef GENTFUNCRO #define GENTFUNCRO( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ /* Assume that the duplication factor is the row stride of y. 
*/ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } \ } INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ /* Assume that the duplication factor is the row stride of y. 
*/ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ const inc_t incx2 = 2 * incx; \ const inc_t ldx2 = 2 * ldx; \ \ const inc_t incy2 = 2 * incy; \ const inc_t ldy2 = 2 * ldy; \ \ ctype_r* restrict alpha_r = ( ctype_r* )alpha; \ ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \ ctype_r* restrict chi_r = ( ctype_r* )x; \ ctype_r* restrict chi_i = ( ctype_r* )x + 1; \ ctype_r* restrict psi_r = ( ctype_r* )y; \ ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; 
\ \ PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ } INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bb/bli_set0bbs_mxn.h000066400000000000000000000046441427272030600253600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_SET0BBS_MXN_H #define BLIS_SET0BBS_MXN_H // set0bbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ /* Assume that the duplication factor is the row stride of y. */ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yij = yj + i*incy; \ \ for ( dim_t p = 0; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,set0s)( *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( set0bbs_mxn ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_absq2s.h000066400000000000000000000115721427272030600237420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

// NOTE(review): the guard below is spelled ABSQR2 although the operation is
// named absq2s; harmless as long as no other header uses the same guard.
#ifndef BLIS_ABSQR2_H
#define BLIS_ABSQR2_H

// absq2s
//
// Mixed-datatype front ends for the "absq2" scalar operation. Every macro
// splits x into real/imaginary components and forwards them, with the
// components of the destination a, to the bli_?absq2ris kernel selected by
// the type of x. In the C99-complex branch the complex-destination variants
// instead compute x*x (real x) or re(x)^2 + im(x)^2 (complex x) inline.

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.

// Real (float) destination. The kernel's fourth argument (which receives
// bli_?imag(a) for complex destinations) is a literal placeholder for real x
// and a scratch local ti for complex x; ( void )ti presumably silences an
// unused-variable warning.
#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F )
#define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F )
#define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; }
#define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; }

// Real (double) destination; same scheme with a double scratch variable.
#define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 )
#define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 )
#define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; }
#define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, component-wise ("ris") implementation: both
// components of a are handed to the kernel.
#define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, C99-complex implementation: the squared value is
// passed as the real argument and 0.0 as the imaginary argument of
// bli_??sets. Note that x is evaluated more than once.
#define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) )
#define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) )
#define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \
 bli_cimag(x) * bli_cimag(x), 0.0, (a) )
#define bli_zcabsq2s( x, a ) bli_zcsets( bli_zreal(x) * bli_zreal(x) + \
 bli_zimag(x) * bli_zimag(x), 0.0, (a) )

#define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) )
#define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) )
#define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \
 bli_cimag(x) * bli_cimag(x), 0.0, (a) )
#define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \
 bli_zimag(x) * bli_zimag(x), 0.0, (a) )

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands for the homogeneous (x and a same type) cases.
#define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a )
#define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a )
#define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a )
#define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_abval2s.h000066400000000000000000000117601427272030600241000ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_ABVAL2S_H
#define BLIS_ABVAL2S_H

// abval2s
//
// Mixed-datatype front ends for the "abval2" scalar operation. Unlike some
// sibling headers, the whole table (real destinations included) is switched
// on BLIS_ENABLE_C99_COMPLEX: one branch forwards components to the
// bli_?abval2ris kernels, the other calls the C library absolute-value
// functions directly.

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Component-wise ("ris") implementation. The kernel is selected by the type
// of x. For a real destination the kernel's fourth argument (which receives
// bli_?imag(a) for complex destinations) is a literal placeholder when x is
// real and a scratch local ti when x is complex; ( void )ti presumably
// silences an unused-variable warning.
#define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F )
#define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F )
#define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; }
#define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; }

#define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 )
#define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 )
#define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; }
#define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; }

#define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99-complex implementation. The absolute value comes from fabsf/fabs for
// real x and cabsf/cabs for complex x (precision following the type of x),
// and is passed with a 0.0 imaginary argument to the bli_??sets macro named
// after the (x, a) type pair.
#define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) )
#define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) )
#define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) )
#define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) )

#define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) )
#define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) )
#define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) )
#define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) )

#define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) )
#define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) )
#define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) )
#define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) )

#define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) )
#define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) )
#define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) )
#define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) )

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands for the homogeneous (x and a same type) cases.
#define bli_sabval2s( x, a ) bli_ssabval2s( x, a )
#define bli_dabval2s( x, a ) bli_ddabval2s( x, a )
#define bli_cabval2s( x, a ) bli_ccabval2s( x, a )
#define bli_zabval2s( x, a ) bli_zzabval2s( x, a )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_add3s.h000066400000000000000000000311431427272030600235410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_ADD3S_H
#define BLIS_ADD3S_H

// add3s
//
// Mixed-datatype front ends for the three-operand add, c = a + b (the
// C99-complex branch below spells it out exactly that way). The other
// variants split a, b and c into real/imaginary components and forward them
// to a bli_?add3ris kernel. Macros are grouped by the type of c.

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of b.
// - The third char encodes the type of c.


// -- (abc) = (??s) ------------------------------------------------------------

// Float destination: every combination uses the bli_sadd3ris kernel.
#define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )
#define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )
#define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )
#define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )

#define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) )
#define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) )
#define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) )
#define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) )

#define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) )
#define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) )
#define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) )
#define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) )

#define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) )
#define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) )
#define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) )
#define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) )

// -- (abc) = (??d) ------------------------------------------------------------

// Double destination: every combination uses the bli_dadd3ris kernel.
#define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) )
#define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) )
#define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) )

#define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )

#define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )

#define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (abc) = (??c) ------------------------------------------------------------

// Single-complex destination. When a and b are both real (ss, ds, sd, dd)
// the real-domain bli_sadd3ris kernel is used; every other combination uses
// bli_cadd3ris.
// NOTE(review): in the real+real case the C99 branch below assigns the whole
// of c, whereas here the effect on imag(c) depends on bli_sadd3ris, which is
// not visible in this header -- confirm the two branches agree.
#define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )

#define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )

#define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )

#define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )

// -- (abc) = (??z) ------------------------------------------------------------

// Double-complex destination; same scheme as above with bli_dadd3ris for
// real+real operands and bli_zadd3ris otherwise.
#define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )

#define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )

#define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )

#define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- (abc) = (??c) ------------------------------------------------------------

// C99-complex implementation: every combination is a plain assignment and
// relies on the language's arithmetic conversions between the operand types.
#define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zscadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); }

// -- (abc) = (??z) ------------------------------------------------------------

#define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands for the homogeneous (all operands same type)
// cases.
#define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c )
#define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c )
#define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c )
#define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_addjs.h000066400000000000000000000100551427272030600236270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_ADDJS_H
#define BLIS_ADDJS_H

// addjs
//
// Mixed-datatype front ends for the conjugating accumulate, y += conj(a)
// (as written out in the C99-complex branch below). The other variants
// split a and y into real/imaginary components and forward them to the
// bli_?addjris kernel selected by the type of y.

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// Real destinations (float, then double).
#define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, component-wise ("ris") implementation.
#define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzaddjs( a, y ) bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, C99-complex implementation: a real a is added
// as-is (conjugation is a no-op), a complex a is conjugated with
// conjf/conj according to its own precision before being accumulated.
#define bli_scaddjs( a, y ) { (y) += (a); }
#define bli_dcaddjs( a, y ) { (y) += (a); }
#define bli_ccaddjs( a, y ) { (y) += conjf(a); }
#define bli_zcaddjs( a, y ) { (y) += conj (a); }

#define bli_szaddjs( a, y ) { (y) += (a); }
#define bli_dzaddjs( a, y ) { (y) += (a); }
#define bli_czaddjs( a, y ) { (y) += conjf(a); }
#define bli_zzaddjs( a, y ) { (y) += conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands for the homogeneous (a and y same type) cases.
#define bli_saddjs( a, y ) bli_ssaddjs( a, y )
#define bli_daddjs( a, y ) bli_ddaddjs( a, y )
#define bli_caddjs( a, y ) bli_ccaddjs( a, y )
#define bli_zaddjs( a, y ) bli_zzaddjs( a, y )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_adds.h000066400000000000000000000077221427272030600234640ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef BLIS_ADDS_H
#define BLIS_ADDS_H

// adds
//
// Mixed-datatype front ends for the scalar accumulate, y += a (as written
// out in the C99-complex branch below). The other variants split a and y
// into real/imaginary components and forward them to the bli_?addris kernel
// selected by the type of y.

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// Real destinations (float, then double).
#define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, component-wise ("ris") implementation.
#define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, C99-complex implementation: a single compound
// assignment that relies on the language's arithmetic conversions.
#define bli_scadds( a, y ) { (y) += (a); }
#define bli_dcadds( a, y ) { (y) += (a); }
#define bli_ccadds( a, y ) { (y) += (a); }
#define bli_zcadds( a, y ) { (y) += (a); }

#define bli_szadds( a, y ) { (y) += (a); }
#define bli_dzadds( a, y ) { (y) += (a); }
#define bli_czadds( a, y ) { (y) += (a); }
#define bli_zzadds( a, y ) { (y) += (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands for the homogeneous (a and y same type) cases.
#define bli_sadds( a, y ) bli_ssadds( a, y )
#define bli_dadds( a, y ) bli_ddadds( a, y )
#define bli_cadds( a, y ) bli_ccadds( a, y )
#define bli_zadds( a, y ) bli_zzadds( a, y )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_adds_mxn.h000066400000000000000000000374051427272030600243470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x 
+ ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 
0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + 
jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < 
m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + 
jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const 
dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_adds_mxn_uplo.h000066400000000000000000000130231427272030600253740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= 
diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, 
rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_axmys.h000066400000000000000000000312231427272030600237030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_AXMYS_H #define BLIS_AXMYS_H // axmys // Notes: // - The first char encodes the type of a. 
// - The second char encodes the type of x.
// - The third char encodes the type of y.
//
// Every bli_<a><x><y>axmys( a, x, y ) macro in the component-wise parts of
// this table splits a, x and y into real and imaginary components, using
// the bli_?real()/bli_?imag() accessors that match each operand's type
// character, and forwards the six components to a bli_*axmyris() macro.
// In the BLIS_ENABLE_C99_COMPLEX branch further down, the variants whose
// destination has type character c or z are written out directly as
// y -= a * x.
// NOTE(review): the bli_*axmyris() macros and the accessors are defined
// outside this header; presumably each bli_*axmyris() performs the same
// y -= a * x update on separated components -- confirm against their
// definitions.

// -- (axy) = (??s) ------------------------------------------------------------

// Destination type character s: every combination uses bli_saxmyris().
#define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

// Destination type character d: every combination uses bli_daxmyris().
#define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// The variants whose destination has type character c or z come in two
// flavours, chosen at preprocessing time by BLIS_ENABLE_C99_COMPLEX.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

// Worker selection in this section follows the type characters of a and x:
//   a in {s,d}, x in {s,d}  ->  bli_saxmyris()
//   a in {s,d}, x in {c,z}  ->  bli_scaxmyris()
//   a in {c,z}, any x       ->  bli_caxmyris()
#define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

// Worker selection in this section follows the type characters of a and x:
//   a in {s,d}, x in {s,d}  ->  bli_daxmyris()
//   a in {s,d}, x in {c,z}  ->  bli_dzaxmyris()
//   a in {c,z}, any x       ->  bli_zaxmyris()
#define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex, each macro expands to a braced compound statement that
// performs y -= a * x directly; y must be a modifiable lvalue.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: all three operands share the same type
// character.
#define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y )
#define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y )
#define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y )
#define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_axpbyjs.h000066400000000000000000001552001427272030600242240ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_AXPBYJS_H #define BLIS_AXPBYJS_H // axpbyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define 
bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), 
bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), 
bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbyjs( a, 
x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), 
bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), 
bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), 
bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), 
bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define 
bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( 
bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) 
* (y); } #define bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_dzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, 
b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { 
(y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + 
(b) * (y); } #define bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) #define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) #define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) #define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_axpbys.h000066400000000000000000001525651427272030600240650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_AXPBYS_H #define BLIS_AXPBYS_H // axpbys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) 
#define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define 
bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define 
bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define 
bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define 
bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), 
bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), 
bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), 
bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), 
bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), 
bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * 
(x) + (b) * (y); } #define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczcaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } #define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } 
#define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { 
(y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y ) #define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y ) #define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y ) #define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_axpyjs.h000066400000000000000000000316061427272030600240650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_AXPYJS_H #define BLIS_AXPYJS_H // axpyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( 
bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), 
bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) #define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpyjs( a, x, y ) 
bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y ) #define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y ) #define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y ) #define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_axpys.h000066400000000000000000000312231427272030600237060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_AXPYS_H #define BLIS_AXPYS_H // axpys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), 
bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpys( a, x, 
y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), 
bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpys( 
a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } 
#define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) #define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) #define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) #define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_conjs.h000066400000000000000000000042401427272030600236550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_CONJS_H #define BLIS_CONJS_H // conjs #define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) ) #define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) ) #define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) { (x) = conjf(x); } #define bli_zconjs( x ) { (x) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_constants.h000066400000000000000000000062051427272030600245600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_CONSTANTS_H #define BLIS_CONSTANTS_H // return pointers to constants // 1 #define bli_s1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ONE ) ) #define bli_d1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ONE ) ) #define bli_c1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) ) #define bli_z1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) ) #define bli_i1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ONE ) ) // 0 #define bli_s0 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ZERO ) ) #define bli_d0 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ZERO ) ) #define bli_c0 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) ) #define bli_z0 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) ) #define bli_i0 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ZERO ) ) // -1 #define bli_sm1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_MINUS_ONE ) ) #define bli_dm1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_MINUS_ONE ) ) #define bli_cm1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_zm1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_im1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_MINUS_ONE ) ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_copycjs.h000066400000000000000000000113171427272030600242160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_COPYCJS_H #define BLIS_COPYCJS_H // copycjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) { (y) = (x); } 
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); } #define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #define bli_szcopycjs( conjx, x, y ) { (y) = (x); } #define bli_dzcopycjs( conjx, x, y ) { (y) = (x); } #define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); } #define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y ) #define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y ) #define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y ) #define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y ) #define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_copyjnzs.h000066400000000000000000000077501427272030600244310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_COPYJNZS_H #define BLIS_COPYJNZS_H // copyjnzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we // don't touch the imaginary part of y. 
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we // don't touch the imaginary part of y. #define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y ) #define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y ) #define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y ) #define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y ) #define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_copyjs.h000066400000000000000000000103041427272030600240460ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_COPYJS_H #define BLIS_COPYJS_H // copyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) { (y) = (x); } #define bli_dccopyjs( x, y ) { (y) = (x); } #define bli_cccopyjs( x, y ) { (y) = conjf(x); } #define bli_zccopyjs( x, y ) { (y) = conj (x); } #define bli_szcopyjs( x, y ) { (y) = (x); } #define bli_dzcopyjs( x, y ) { (y) = (x); } #define bli_czcopyjs( x, y ) { (y) = 
conjf(x); } #define bli_zzcopyjs( x, y ) { (y) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjs( x, y ) bli_sscopyjs( x, y ) #define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y ) #define bli_ccopyjs( x, y ) bli_cccopyjs( x, y ) #define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y ) #define bli_icopyjs( x, y ) bli_iicopyjs( x, y ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_copynzs.h000066400000000000000000000075461427272030600242620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_COPYNZS_H #define BLIS_COPYNZS_H // copynzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopynzs( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopynzs( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopynzs( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopynzs( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of scopyris() is so we don't touch the imaginary part of y. 
#define bli_sccopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopynzs( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopynzs( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of dcopyris() is so we don't touch the imaginary part of y. #define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopynzs( x, y ) bli_sscopynzs( x, y ) #define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y ) #define bli_ccopynzs( x, y ) bli_cccopynzs( x, y ) #define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y ) #define bli_icopynzs( x, y ) bli_iicopynzs( x, y ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_copys.h000066400000000000000000000075101427272030600237010ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_COPYS_H #define BLIS_COPYS_H // copys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopys( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopys( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopys( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopys( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopys( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopys( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopys( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopys( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero. 
#define bli_sccopys( x, y ) bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopys( x, y ) bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopys( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopys( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero. #define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopys( x, y ) bli_sscopys( x, y ) #define bli_dcopys( x, y ) bli_ddcopys( x, y ) #define bli_ccopys( x, y ) bli_cccopys( x, y ) #define bli_zcopys( x, y ) bli_zzcopys( x, y ) #define bli_icopys( x, y ) bli_iicopys( x, y ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_copys_mxn.h000066400000000000000000000376211427272030600245710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; 
++ii ) bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); 
} else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( 
dim_t jj = 0; jj < n; ++jj ) bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } 
else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( 
dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_dotjs.h000066400000000000000000000133561427272030600236740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_DOTJS_H #define BLIS_DOTJS_H // dotjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
// - x is used in conjugated form. #define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a ) #define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a ) #define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a ) #define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a ) #define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a ) #define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a ) #define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a ) #define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a ) #define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a ) #define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a ) #define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a ) #define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a ) #define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a ) #define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a ) #define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a ) #define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a ) #define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a ) #define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a ) #define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a ) #define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a ) #define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a ) #define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a ) #define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a ) #define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a ) #define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a ) #define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a ) #define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a ) #define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a ) #define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a ) #define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a ) #define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a ) #define bli_zzddotjs( x, y, a ) bli_zzdaxpyjs( y, x, a ) #define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a ) #define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a ) #define bli_cscdotjs( x, 
y, a ) bli_sccaxpyjs( y, x, a ) #define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a ) #define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a ) #define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a ) #define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a ) #define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a ) #define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a ) #define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a ) #define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a ) #define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a ) #define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a ) #define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a ) #define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a ) #define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a ) #define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a ) #define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a ) #define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a ) #define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a ) #define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a ) #define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a ) #define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a ) #define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a ) #define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a ) #define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a ) #define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a ) #define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a ) #define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a ) #define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a ) #define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a ) #define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a ) #define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a ) #define bli_ddotjs( x, y, a ) bli_ddddotjs( x, y, a ) #define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a ) #define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a ) #endif 
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_dots.h000066400000000000000000000131011427272030600235060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_DOTS_H #define BLIS_DOTS_H // dots // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a ) #define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a ) #define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a ) #define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a ) #define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a ) #define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a ) #define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a ) #define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a ) #define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a ) #define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a ) #define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a ) #define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a ) #define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a ) #define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a ) #define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a ) #define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a ) #define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a ) #define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a ) #define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a ) #define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a ) #define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a ) #define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a ) #define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a ) #define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a ) #define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a ) #define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a ) #define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a ) #define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a ) #define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a ) #define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a ) #define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a ) #define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a ) #define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a ) #define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a ) #define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a ) #define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a ) #define 
bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a ) #define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a ) #define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a ) #define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a ) #define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a ) #define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a ) #define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a ) #define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a ) #define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a ) #define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a ) #define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a ) #define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a ) #define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a ) #define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a ) #define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a ) #define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a ) #define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a ) #define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a ) #define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a ) #define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a ) #define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a ) #define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a ) #define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a ) #define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a ) #define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a ) #define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a ) #define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a ) #define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a ) #define bli_sdots( x, y, a ) bli_sssdots( x, y, a ) #define bli_ddots( x, y, a ) bli_ddddots( x, y, a ) #define bli_cdots( x, y, a ) bli_cccdots( x, y, a ) #define bli_zdots( x, y, a ) bli_zzzdots( x, y, a ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_eq.h000066400000000000000000000077561427272030600231650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_EQ_H #define BLIS_EQ_H // eq (passed by value) #define bli_seq( a, b ) ( (a) == (b) ) #define bli_deq( a, b ) ( (a) == (b) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) ) #define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ceq( a, b ) ( (a) == (b) ) #define bli_zeq( a, b ) ( (a) == (b) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ieq( a, b ) ( (a) == (b) ) // eqtori (passed by value) #define bli_seqtori( a, br, bi ) ( (a) == (br) ) #define bli_deqtori( a, br, bi ) ( (a) == (br) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) ) #define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) ) #define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) ) #endif // BLIS_ENABLE_C99_COMPLEX // eqa (passed by address) #define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) ) #define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) ) #define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) ) #define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) ) #define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) ) // eq1 #define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F ) #define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 ) #define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F ) #define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 ) #define bli_ieq1( a ) bli_ieq ( (a), 1 ) // eq0 #define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F ) #define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 ) #define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F ) #define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 ) #define bli_ieq0( a ) bli_ieq ( (a), 0 
) // eqm1 #define bli_seqm1( a ) bli_seqtori( (a), -1.0F, 0.0F ) #define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 ) #define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F ) #define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 ) #define bli_ieqm1( a ) bli_ieq ( (a), -1 ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_fprints.h000066400000000000000000000044271427272030600242350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_FPRINTS_H #define BLIS_FPRINTS_H // prints #define bli_sfprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #define bli_dfprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #define bli_cfprints( file, spec, x ) \ { \ fprintf( file, spec, bli_creal(x) ); \ fprintf( file, " + " ); \ fprintf( file, spec, bli_cimag(x) ); \ fprintf( file, " " ); \ } #define bli_zfprints( file, spec, x ) \ { \ fprintf( file, spec, bli_zreal(x) ); \ fprintf( file, " + " ); \ fprintf( file, spec, bli_zimag(x) ); \ fprintf( file, " " ); \ } #define bli_ifprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_gets.h000066400000000000000000000076451427272030600235170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_GETS_H #define BLIS_GETS_H // gets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_ssgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_czgets( x, yr, yi ) { 
(yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; } #define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; } #define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; } #define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; } #define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; } #define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi ) #define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi ) #define bli_cgets( x, yr, yi ) bli_csgets( x, yr, yi ) #define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi ) #define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_inverts.h000066400000000000000000000043641427272030600242420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_INVERTS_H #define BLIS_INVERTS_H // inverts // Notes: // - The first char encodes the type of x. #define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) ) #define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) ) #define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cinverts( x ) { (x) = 1.0F / (x); } #define bli_zinverts( x ) { (x) = 1.0 / (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_invscaljs.h000066400000000000000000000104011427272030600245310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_INVSCALJS_H #define BLIS_INVSCALJS_H // invscaljs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. 
// Mixed-datatype dispatch table for invscaljs. Each macro forwards the
// real/imaginary accessors of a and y to a bli_*invscaljris macro (not
// defined in this header). In the C99 branch further below the operation is
// spelled out directly: y /= conj(a) when a is complex, y /= a when a is real.

// -- y is real, single precision ---------------------------------------------
#define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

// -- y is real, double precision ---------------------------------------------
#define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- y is complex, single precision: real a goes to the "sc" helper, complex
//    a goes to the "c" helper --------------------------------------------------
#define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

// -- y is complex, double precision: real a goes to the "dz" helper, complex
//    a goes to the "z" helper --------------------------------------------------
#define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 complex branch: divide y in place. conjf()/conj() is chosen by the
// type of a (first char): conjf for c, conj for z, none for real a.
// NOTE: these expand to a braced block, not an expression.
#define bli_scinvscaljs( a, y ) { (y) /= (a); }
#define bli_dcinvscaljs( a, y ) { (y) /= (a); }
#define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zcinvscaljs( a, y ) { (y) /= conj (a); }

#define bli_szinvscaljs( a, y ) { (y) /= (a); }
#define bli_dzinvscaljs( a, y ) { (y) /= (a); }
#define bli_czinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zzinvscaljs( a, y ) { (y) /= conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a and y share the same datatype.
#define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y )
#define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y )
#define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y )
#define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_invscals.h000066400000000000000000000102461427272030600243660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_INVSCALS_H
#define BLIS_INVSCALS_H

// invscals
//
// Mixed-datatype dispatch table. Each macro forwards the real/imaginary
// accessors of a and y to a bli_*invscalris macro (not defined in this
// header). In the C99 branch further below the operation is spelled out
// directly as y /= a.

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// -- y is real, single precision ---------------------------------------------
#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

// -- y is real, double precision ---------------------------------------------
#define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- y is complex, single precision: real a goes to the "sc" helper, complex
//    a goes to the "c" helper --------------------------------------------------
#define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscals( a, y ) bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

// -- y is complex, double precision: real a goes to the "dz" helper, complex
//    a goes to the "z" helper --------------------------------------------------
#define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 complex branch: every combination is a plain in-place division.
// NOTE: these expand to a braced block, not an expression.
#define bli_scinvscals( a, y ) { (y) /= (a); }
#define bli_dcinvscals( a, y ) { (y) /= (a); }
#define bli_ccinvscals( a, y ) { (y) /= (a); }
#define bli_zcinvscals( a, y ) { (y) /= (a); }

#define bli_szinvscals( a, y ) { (y) /= (a); }
#define bli_dzinvscals( a, y ) { (y) /= (a); }
#define bli_czinvscals( a, y ) { (y) /= (a); }
#define bli_zzinvscals( a, y ) { (y) /= (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a and y share the same datatype.
#define bli_sinvscals( a, y ) bli_ssinvscals( a, y )
#define bli_dinvscals( a, y ) bli_ddinvscals( a, y )
#define bli_cinvscals( a, y ) bli_ccinvscals( a, y )
#define bli_zinvscals( a, y ) bli_zzinvscals( a, y )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_neg2s.h000066400000000000000000000100051427272030600235530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_NEG2S_H #define BLIS_NEG2S_H // neg2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Mixed-datatype dispatch table for neg2s. Each macro forwards the
// real/imaginary accessors of x and y to the bli_?neg2ris macro selected by
// the type of y (not defined in this header). In the C99 branch further
// below the operation is spelled out directly as y = -x.

// -- y is real, single precision ---------------------------------------------
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- y is real, double precision ---------------------------------------------
#define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- y is complex, single precision ------------------------------------------
#define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- y is complex, double precision ------------------------------------------
#define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 complex branch: every combination is a plain negate-and-assign.
// NOTE: these expand to a braced block, not an expression.
#define bli_scneg2s( x, y ) { (y) = -(x); }
#define bli_dcneg2s( x, y ) { (y) = -(x); }
#define bli_ccneg2s( x, y ) { (y) = -(x); }
#define bli_zcneg2s( x, y ) { (y) = -(x); }

#define bli_szneg2s( x, y ) { (y) = -(x); }
#define bli_dzneg2s( x, y ) { (y) = -(x); }
#define bli_czneg2s( x, y ) { (y) = -(x); }
#define bli_zzneg2s( x, y ) { (y) = -(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: x and y share the same datatype.
#define bli_sneg2s( x, y ) bli_ssneg2s( x, y )
#define bli_dneg2s( x, y ) bli_ddneg2s( x, y )
#define bli_cneg2s( x, y ) bli_ccneg2s( x, y )
#define bli_zneg2s( x, y ) bli_zzneg2s( x, y )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_randnp2s.h000066400000000000000000000124031427272030600242700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #ifndef BLIS_RANDNP2S_H #define BLIS_RANDNP2S_H // randnp2s #define bli_srandnp2s( a ) \ { \ bli_drandnp2s( a ); \ } #if 0 #define bli_drandnp2s_prev( a ) \ { \ const double m_max = 3.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ /* Compute a narrow-range power of two. For the purposes of commentary, we'll assume that m_max = 4. This represents the largest power of two we will use to generate the random numbers. */ \ \ /* Generate a random real number t on the interval: [0.0, 6.0]. */ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ /* Modify t to guarantee that is never equal to the upper bound of the interval (in this case, 6.0). */ \ if ( t == m_max2 ) t = t - 1.0; \ \ /* Transform the interval into the set of integers, {0,1,2,3,4,5}. */ \ t = floor( t ); \ \ /* Map values of t == 0 to a final value of 0. */ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ /* This case handles values of t = {1,2,3,4,5}. */ \ \ double s_exp, s_val; \ \ /* Compute two random numbers to determine the signs of the exponent and the end result. */ \ PASTEMAC(d,rands)( s_exp ); \ PASTEMAC(d,rands)( s_val ); \ \ /* Compute r_val = 2^s where s = +/-(t-1) = {-4,-3,-2,-1,0,1,2,3,4}. */ \ if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \ else r_val = pow( 2.0, t - 1.0 ); \ \ /* If our sign value is negative, our random power of two will be negative. */ \ if ( s_val < 0.0 ) r_val = -r_val; \ } \ \ /* Normalize by the largest possible positive value. */ \ r_val = r_val / pow( 2.0, m_max ); \ \ /* r_val = 0, or +/-{2^-4, 2^-3, 2^-2, 2^-1, 2^0, 2^1, 2^2, 2^3, 2^4}. */ \ /* NOTE: For single-precision macros, this assignment results in typecast down to float. */ \ a = r_val; \ } #endif #define bli_drandnp2s( a ) \ { \ const double m_max = 6.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ /* Compute a narrow-range power of two. For the purposes of commentary, we'll assume that m_max = 4. 
This represents the largest power of two we will use to generate the random numbers. */ \ \ do \ { \ /* Generate a random real number t on the interval: [0.0, 6.0]. */ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ /* Transform the interval into the set of integers, {0,1,2,3,4,5}. Note that 6 is prohibited by the loop guard below. */ \ t = floor( t ); \ } \ /* If t is ever equal to m_max2, we re-randomize. The guard against m_max2 < t is for sanity and shouldn't happen, unless perhaps there is weirdness in the typecasting to double when computing t above. */ \ while ( m_max2 <= t ); \ \ /* Map values of t == 0 to a final value of 0. */ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ /* This case handles values of t = {1,2,3,4,5}. */ \ \ double s_val; \ \ /* Compute r_val = 2^s where s = -(t-1) = {-4,-3,-2,-1,0}. */ \ r_val = pow( 2.0, -(t - 1.0) ); \ \ /* Compute a random number to determine the sign of the final result. */ \ PASTEMAC(d,rands)( s_val ); \ \ /* If our sign value is negative, our random power of two will be negative. */ \ if ( s_val < 0.0 ) r_val = -r_val; \ } \ \ /* r_val = 0, or +/-{2^0, 2^-1, 2^-2, 2^-3, 2^-4}. */ \ /* NOTE: For single-precision macros, this assignment results in typecast down to float. */ \ a = r_val; \ } #define bli_crandnp2s( a ) \ { \ float ar, ai; \ \ bli_srandnp2s( ar ); \ bli_srandnp2s( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrandnp2s( a ) \ { \ double ar, ai; \ \ bli_drandnp2s( ar ); \ bli_drandnp2s( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_rands.h000066400000000000000000000043761427272030600236620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_RANDS_H
#define BLIS_RANDS_H

// rands
//
// Assign a pseudo-random value derived from rand() to the lvalue a.
// rand() is divided by RAND_MAX/2 and then shifted down by one, so each
// real component lies in [-1, 1]. All variants expand to a braced block,
// not an expression.

// Single precision real: computed in double, then narrowed to float.
#define bli_srands( a ) \
{ \
    (a) = ( float ) ( ( double ) rand() / \
                      ( ( double ) RAND_MAX / 2.0 ) \
                    ) - 1.0F; \
}

// Double precision real.
#define bli_drands( a ) \
{ \
    (a) = ( double ) ( ( double ) rand() / \
                       ( ( double ) RAND_MAX / 2.0 ) \
                     ) - 1.0; \
}

// Single precision complex: real and imaginary parts are drawn
// independently and stored with bli_csets().
#define bli_crands( a ) \
{ \
    float ar, ai; \
\
    bli_srands( ar ); \
    bli_srands( ai ); \
\
    bli_csets( ar, ai, (a) ); \
}

// Double precision complex: real and imaginary parts are drawn
// independently and stored with bli_zsets().
#define bli_zrands( a ) \
{ \
    double ar, ai; \
\
    bli_drands( ar ); \
    bli_drands( ai ); \
\
    bli_zsets( ar, ai, (a) ); \
}

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_scal2js.h000066400000000000000000000321011427272030600240770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_SCAL2JS_H
#define BLIS_SCAL2JS_H

// scal2js
//
// Mixed-datatype dispatch table. Each macro forwards the real/imaginary
// accessors of a, x and y to one of the bli_??scal2jris macros (not defined
// in this header). In the C99 branch further below the operation is spelled
// out directly: y = a * conj(x) when x is complex, y = a * x when x is real.
//
// Helper selection as visible in the table below:
// - y real:    "ro" when both a and x are complex, "rx" otherwise.
// - y complex: "rx" (a real, x real), "rc" (a complex, x real),
//              "cr" (a real, x complex), "cx" (a complex, x complex).

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 complex branch: conjf()/conj() is chosen by the type of x (second
// char): conjf for c, conj for z, none for real x.
// NOTE: these expand to a braced block, not an expression.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); }

#define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); }

#define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a, x and y share the same datatype.
#define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y )
#define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y )
#define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y )
#define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_scal2s.h000066400000000000000000000315151427272030600237350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_SCAL2S_H
#define BLIS_SCAL2S_H

// scal2s

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) 
#define bli_czsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdscal2s( a, x, y ) 
bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccscal2s( a, x, y ) 
bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), 
bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_czcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); } // -- 
(axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y ) #define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y ) #define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y ) #define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_scal2s_mxn.h000066400000000000000000000055131427272030600246160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#ifndef BLIS_SCAL2S_MXN_H
#define BLIS_SCAL2S_MXN_H

// scal2s_mxn
//
// GENTFUNC is a function template: given an element type (ctype), a type
// character (ch) and an operation name (opname), it expands to one inline
// function, named via PASTEMAC(ch,opname), that applies an element-wise
// scal2 operation over an m x n strided matrix.
//
// For every j in [0,n) and i in [0,m) the generated function passes
//   *alpha, the x element at x + j*cs_x + i*rs_x, and
//           the y element at y + j*cs_y + i*rs_y
// to the type-specific scal2js macro when bli_is_conj( conjx ) is true,
// and to the scal2s macro otherwise. Columns are the outer loop, rows the
// inner loop. If m or n is zero no element is visited.
//
// Parameters of the generated function:
//   conjx       - selects the scal2js (conjugating) or scal2s element macro.
//   m, n        - number of rows / columns to process.
//   alpha       - pointer to the scalar; dereferenced for every element.
//   x,rs_x,cs_x - source matrix base address and its row / column strides.
//   y,rs_y,cs_y - destination matrix base address and its strides.
//
// NOTE(review): PASTEMAC, bli_is_conj and INSERT_GENTFUNC_BASIC0 are
// defined elsewhere. Presumably PASTEMAC(ch,op) pastes to bli_<ch><op> and
// INSERT_GENTFUNC_BASIC0 instantiates the template once per datatype
// (s, d, c, z) -- confirm against the macro definitions.

#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t conjx, \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \
       ctype* restrict y, const inc_t rs_y, const inc_t cs_y \
     ) \
{ \
    if ( bli_is_conj( conjx ) ) \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict xj = x + j*cs_x; \
            ctype* restrict yj = y + j*cs_y; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype* restrict xij = xj + i*rs_x; \
                ctype* restrict yij = yj + i*rs_y; \
\
                PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \
            } \
        } \
    } \
    else /* if ( bli_is_noconj( conjx ) ) */ \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict xj = x + j*cs_x; \
            ctype* restrict yj = y + j*cs_y; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype* restrict xij = xj + i*rs_x; \
                ctype* restrict yij = yj + i*rs_y; \
\
                PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \
            } \
        } \
    } \
}

INSERT_GENTFUNC_BASIC0( scal2s_mxn )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_scalcjs.h000066400000000000000000000111331427272030600241620ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#ifndef BLIS_SCALCJS_H
#define BLIS_SCALCJS_H

// scalcjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Mixed-datatype scalcjs macros, named bli_<x-type><y-type>scalcjs, where
// each type char is one of s, d, c, z. Every macro takes a conjugation
// selector (conjx), a source operand x and a destination operand y.
//
// The macros in this part split x and y into real/imaginary parts with the
// bli_?real / bli_?imag accessors and forward them, together with conjx,
// to a "ris" helper chosen by the destination type.
// NOTE(review): the *scalcjris helpers are defined elsewhere; presumably
// they scale y in place by x (conjugated when conjx requests it) -- confirm.

// -- y is float: every x type forwards to bli_sscalcjris ----------------------
#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- y is double: every x type forwards to bli_dscalcjris ---------------------
#define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Complex destinations have two implementations, selected by whether
// BLIS_ENABLE_C99_COMPLEX is defined.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- y is scomplex: real x uses bli_scscalcjris, complex x bli_cscalcjris -----
#define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- y is dcomplex: real x uses bli_dzscalcjris, complex x bli_zscalcjris -----
#define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 branch: y is multiplied in place with the *= operator.
#define bli_scscalcjs( conjx, x, y ) { (y) *= (x); }
// C99-complex scalcjs variants (BLIS_ENABLE_C99_COMPLEX defined), named
// bli_<x-type><y-type>scalcjs. y is multiplied in place by x.
// - Real x (s, d): conjx is accepted but not used by the expansion.
// - Complex x (c, z): x is conjugated first (conjf for c, conj for z) when
//   bli_is_conj( conjx ) is true, and used unchanged otherwise.
// Each expansion is a brace-enclosed compound statement, not an expression.
#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#define bli_szscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: a single type char means x and y share that type.
#define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y )
#define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y )
#define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y )
#define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_scaljs.h000066400000000000000000000101501427272030600240150ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#ifndef BLIS_SCALJS_H
#define BLIS_SCALJS_H

// scaljs
//
// Mixed-datatype "scale by conjugate" macros, named
// bli_<a-type><y-type>scaljs. In the C99 branch below the operation is
// y *= conj(a) for complex a and y *= a for real a. The other variants
// split a and y into real/imaginary parts with the bli_?real / bli_?imag
// accessors and forward them to a *scaljris helper chosen by y's type.
// NOTE(review): the *scaljris helpers are defined elsewhere; presumably
// they implement the same operation on separate components -- confirm.

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// -- y is float: every a type forwards to bli_sscaljris -----------------------
#define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

// -- y is double: every a type forwards to bli_dscaljris ----------------------
#define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

// Complex destinations have two implementations, selected by whether
// BLIS_ENABLE_C99_COMPLEX is defined.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- y is scomplex: real a uses bli_scscaljris, complex a bli_cscaljris -------
#define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccscaljs( a, y ) bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

// -- y is dcomplex: real a uses bli_dzscaljris, complex a bli_zscaljris -------
#define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 branch: y is multiplied in place. Real a is used as-is; complex a is
// conjugated with conjf (a is scomplex) or conj (a is dcomplex).
#define bli_scscaljs( a, y ) { (y) *= (a); }
#define bli_dcscaljs( a, y ) { (y) *= (a); }
#define bli_ccscaljs( a, y ) { (y) *= conjf(a); }
#define bli_zcscaljs( a, y ) { (y) *= conj (a); }

#define bli_szscaljs( a, y ) { (y) *= (a); }
#define bli_dzscaljs( a, y ) { (y) *= (a); }
#define bli_czscaljs( a, y ) { (y) *= conjf(a); }
#define bli_zzscaljs( a, y ) { (y) *= conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: a single type char means a and y share that type.
#define bli_sscaljs( a, y ) bli_ssscaljs( a, y )
#define bli_dscaljs( a, y ) bli_ddscaljs( a, y )
#define bli_cscaljs( a, y ) bli_ccscaljs( a, y )
#define bli_zscaljs( a, y ) bli_zzscaljs( a, y )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_scals.h000066400000000000000000000100151427272030600236430ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#ifndef BLIS_SCALS_H
#define BLIS_SCALS_H

// scals
//
// Mixed-datatype in-place scaling macros, named bli_<a-type><y-type>scals.
// In the C99 branch below the operation is y *= a for every type pair.
// The other variants split a and y into real/imaginary parts with the
// bli_?real / bli_?imag accessors and forward them to a *scalris helper
// chosen by y's type.
// NOTE(review): the *scalris helpers are defined elsewhere; presumably they
// implement the same product on separate components -- confirm.

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// -- y is float: every a type forwards to bli_sscalris ------------------------
#define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

// -- y is double: every a type forwards to bli_dscalris -----------------------
#define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

// Complex destinations have two implementations, selected by whether
// BLIS_ENABLE_C99_COMPLEX is defined.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- y is scomplex: real a uses bli_scscalris, complex a bli_cscalris ---------
#define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

// -- y is dcomplex: real a uses bli_dzscalris, complex a bli_zscalris ---------
#define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 branch: y is multiplied in place by a with the *= operator.
#define bli_scscals( a, y ) { (y) *= (a); }
#define bli_dcscals( a, y ) { (y) *= (a); }
#define bli_ccscals( a, y ) { (y) *= (a); }
#define bli_zcscals( a, y ) { (y) *= (a); }

#define bli_szscals( a, y ) { (y) *= (a); }
#define bli_dzscals( a, y ) { (y) *= (a); }
#define bli_czscals( a, y ) { (y) *= (a); }
#define bli_zzscals( a, y ) { (y) *= (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: a single type char means a and y share that type.
#define bli_sscals( a, y ) bli_ssscals( a, y )
#define bli_dscals( a, y ) bli_ddscals( a, y )
#define bli_cscals( a, y ) bli_ccscals( a, y )
#define bli_zscals( a, y ) bli_zzscals( a, y )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_set0s.h000066400000000000000000000036251427272030600236050ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef BLIS_SET0S_H
#define BLIS_SET0S_H

// set0s
//
// One macro per datatype (s, d, c, z). Each forwards its operand a to the
// matching bli_?sets macro with two zero literals: single-precision (0.0F)
// for s and c, double-precision (0.0) for d and z.
// NOTE(review): bli_?sets is defined elsewhere; presumably its arguments
// are (real part, imaginary part, destination), so these store zero into
// a -- confirm against bli_sets.h.

#define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) )
#define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) )
#define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) )
#define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_set0s_mxn.h000066400000000000000000000055031427272030600244640ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_set1s.h000066400000000000000000000036251427272030600236060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SET1S_H #define BLIS_SET1S_H #define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) ) #define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) ) #define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) ) #define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_seti0s.h000066400000000000000000000036071427272030600237560ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SETI0S_H #define BLIS_SETI0S_H #define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) ) #define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) ) #define bli_cseti0s( a ) bli_csetis( 0.0F, (a) ) #define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_setis.h000066400000000000000000000053351427272030600236760ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis
//
// Notes:
// - The first char encodes the type of the incoming imaginary value (im).
// - The second char encodes the type of y, the operand being updated.
// - im is the value stored as the imaginary component of y.

// -- y is real (s, d): there is no imaginary component, so nothing happens.

#define bli_sssetis( im, y )  { ; }
#define bli_dssetis( im, y )  { ; }

#define bli_sdsetis( im, y )  { ; }
#define bli_ddsetis( im, y )  { ; }

// -- y is complex (c, z): only the imaginary component is replaced.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Assign straight through the imaginary-component accessor.
#define bli_scsetis( im, y )  { bli_cimag(y) = (im); }
#define bli_dcsetis( im, y )  { bli_cimag(y) = (im); }

#define bli_szsetis( im, y )  { bli_zimag(y) = (im); }
#define bli_dzsetis( im, y )  { bli_zimag(y) = (im); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Rebuild y from its current real component and the new imaginary value.
#define bli_scsetis( im, y )  { (y) = bli_creal(y) + (im) * (I); }
#define bli_dcsetis( im, y )  { (y) = bli_creal(y) + (im) * (I); }

#define bli_szsetis( im, y )  { (y) = bli_zreal(y) + (im) * (I); }
#define bli_dzsetis( im, y )  { (y) = bli_zreal(y) + (im) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Single-char shorthands.

#define bli_ssetis( im, y )  bli_sssetis( im, y )
#define bli_dsetis( im, y )  bli_ddsetis( im, y )
#define bli_csetis( im, y )  bli_scsetis( im, y )
#define bli_zsetis( im, y )  bli_dzsetis( im, y )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_setrs.h000066400000000000000000000054051427272030600237050ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sssetrs( xr, y ) { (y) = (xr); } #define bli_dssetrs( xr, y ) { (y) = (xr); } #define bli_sdsetrs( xr, y ) { (y) = (xr); } #define bli_ddsetrs( xr, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsetrs( xr, y ) { bli_creal(y) = (xr); } #define bli_dcsetrs( xr, y ) { bli_creal(y) = (xr); } #define bli_szsetrs( xr, y ) { bli_zreal(y) = (xr); } #define bli_dzsetrs( xr, y ) { bli_zreal(y) = (xr); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); } #define bli_dcsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); } #define bli_szsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); } #define bli_dzsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssetrs( xr, y ) bli_sssetrs( xr, y ) #define bli_dsetrs( xr, y ) bli_ddsetrs( xr, y ) #define bli_csetrs( xr, y ) bli_scsetrs( xr, y ) #define bli_zsetrs( xr, y ) bli_dzsetrs( xr, y ) #endif 
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_sets.h000066400000000000000000000102671427272030600235250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SETS_H #define BLIS_SETS_H // sets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sssets( xr, xi, y ) { (y) = (xr); } #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } #define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } #define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) 
{ (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_sqrt2s.h000066400000000000000000000117211427272030600240010ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_SQRT2S_H
#define BLIS_SQRT2S_H

// sqrt2s
//
// Stores a square root of x into a.
//
// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.
// - Unlike some sibling headers, all sixteen type combinations (including
//   the real-to-real ones) live inside the BLIS_ENABLE_C99_COMPLEX switch.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex support every variant splits x and a into real and
// imaginary components and forwards them to a bli_?sqrt2ris macro. The
// prefix of the ris macro follows the type of a, except for the
// real-to-complex cases below.

// -- a is float.
#define bli_sssqrt2s( x, a )  bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) )
#define bli_dssqrt2s( x, a )  bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) )
#define bli_cssqrt2s( x, a )  bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) )
#define bli_zssqrt2s( x, a )  bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) )

// -- a is double.
#define bli_sdsqrt2s( x, a )  bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) )
#define bli_ddsqrt2s( x, a )  bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_cdsqrt2s( x, a )  bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_zdsqrt2s( x, a )  bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) )

// -- a is scomplex. Real x (s, d) uses bli_scsqrt2ris; complex x (c, z)
//    uses bli_csqrt2ris.
#define bli_scsqrt2s( x, a )  bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcsqrt2s( x, a )  bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccsqrt2s( x, a )  bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcsqrt2s( x, a )  bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

// -- a is dcomplex. Real x (s, d) uses bli_dzsqrt2ris; complex x (c, z)
//    uses bli_zsqrt2ris.
#define bli_szsqrt2s( x, a )  bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzsqrt2s( x, a )  bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czsqrt2s( x, a )  bli_zsqrt2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzsqrt2s( x, a )  bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex support the root is computed by the libm routine that
// matches the type of x (sqrtf, sqrt, csqrtf, csqrt) and the result is then
// cast to the type of a. The precision of the computation therefore
// follows x, not a.

// -- a is float. For complex x only the real component of the root is kept.
#define bli_sssqrt2s( x, a )  { (a) = ( float )           sqrtf( (x) )  ; }
#define bli_dssqrt2s( x, a )  { (a) = ( float )           sqrt ( (x) )  ; }
#define bli_cssqrt2s( x, a )  { (a) = ( float )bli_creal( csqrtf( (x) ) ); }
#define bli_zssqrt2s( x, a )  { (a) = ( float )bli_zreal( csqrt ( (x) ) ); }

// -- a is double. For complex x only the real component of the root is kept.
#define bli_sdsqrt2s( x, a )  { (a) = ( double )           sqrtf( (x) )  ; }
#define bli_ddsqrt2s( x, a )  { (a) = ( double )           sqrt ( (x) )  ; }
#define bli_cdsqrt2s( x, a )  { (a) = ( double )bli_creal( csqrtf( (x) ) ); }
#define bli_zdsqrt2s( x, a )  { (a) = ( double )bli_zreal( csqrt ( (x) ) ); }

// -- a is scomplex.
#define bli_scsqrt2s( x, a )  { (a) = ( scomplex )         sqrtf( (x) )  ; }
#define bli_dcsqrt2s( x, a )  { (a) = ( scomplex )         sqrt ( (x) )  ; }
#define bli_ccsqrt2s( x, a )  { (a) = ( scomplex )        csqrtf( (x) )  ; }
#define bli_zcsqrt2s( x, a )  { (a) = ( scomplex )        csqrt ( (x) )  ; }

// -- a is dcomplex.
#define bli_szsqrt2s( x, a )  { (a) = ( dcomplex )         sqrtf( (x) )  ; }
#define bli_dzsqrt2s( x, a )  { (a) = ( dcomplex )         sqrt ( (x) )  ; }
#define bli_czsqrt2s( x, a )  { (a) = ( dcomplex )        csqrtf( (x) )  ; }
#define bli_zzsqrt2s( x, a )  { (a) = ( dcomplex )        csqrt ( (x) )  ; }

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Single-char shorthands: x and a share one type.

#define bli_ssqrt2s( x, a )  bli_sssqrt2s( x, a )
#define bli_dsqrt2s( x, a )  bli_ddsqrt2s( x, a )
#define bli_csqrt2s( x, a )  bli_ccsqrt2s( x, a )
#define bli_zsqrt2s( x, a )  bli_zzsqrt2s( x, a )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_subjs.h000066400000000000000000000100551427272030600236700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_SUBJS_H
#define BLIS_SUBJS_H

// subjs
//
// Notes:
// - The first char encodes the type of alpha, the value being subtracted.
// - The second char encodes the type of y, the operand being updated.
// - In the C99 complex branch a complex alpha is conjugated before it is
//   subtracted from y, while a real alpha is subtracted as is.

// -- y is real (s, d): forward the components to bli_?subjris.

#define bli_sssubjs( alpha, y )  bli_ssubjris( bli_sreal(alpha), bli_simag(alpha), bli_sreal(y), bli_simag(y) )
#define bli_dssubjs( alpha, y )  bli_ssubjris( bli_dreal(alpha), bli_dimag(alpha), bli_sreal(y), bli_simag(y) )
#define bli_cssubjs( alpha, y )  bli_ssubjris( bli_creal(alpha), bli_cimag(alpha), bli_sreal(y), bli_simag(y) )
#define bli_zssubjs( alpha, y )  bli_ssubjris( bli_zreal(alpha), bli_zimag(alpha), bli_sreal(y), bli_simag(y) )

#define bli_sdsubjs( alpha, y )  bli_dsubjris( bli_sreal(alpha), bli_simag(alpha), bli_dreal(y), bli_dimag(y) )
#define bli_ddsubjs( alpha, y )  bli_dsubjris( bli_dreal(alpha), bli_dimag(alpha), bli_dreal(y), bli_dimag(y) )
#define bli_cdsubjs( alpha, y )  bli_dsubjris( bli_creal(alpha), bli_cimag(alpha), bli_dreal(y), bli_dimag(y) )
#define bli_zdsubjs( alpha, y )  bli_dsubjris( bli_zreal(alpha), bli_zimag(alpha), bli_dreal(y), bli_dimag(y) )

// -- y is complex (c, z).

#ifndef BLIS_ENABLE_C99_COMPLEX

// Forward the components to bli_?subjris, as for real y.
#define bli_scsubjs( alpha, y )  bli_csubjris( bli_sreal(alpha), bli_simag(alpha), bli_creal(y), bli_cimag(y) )
#define bli_dcsubjs( alpha, y )  bli_csubjris( bli_dreal(alpha), bli_dimag(alpha), bli_creal(y), bli_cimag(y) )
#define bli_ccsubjs( alpha, y )  bli_csubjris( bli_creal(alpha), bli_cimag(alpha), bli_creal(y), bli_cimag(y) )
#define bli_zcsubjs( alpha, y )  bli_csubjris( bli_zreal(alpha), bli_zimag(alpha), bli_creal(y), bli_cimag(y) )

#define bli_szsubjs( alpha, y )  bli_zsubjris( bli_sreal(alpha), bli_simag(alpha), bli_zreal(y), bli_zimag(y) )
#define bli_dzsubjs( alpha, y )  bli_zsubjris( bli_dreal(alpha), bli_dimag(alpha), bli_zreal(y), bli_zimag(y) )
#define bli_czsubjs( alpha, y )  bli_zsubjris( bli_creal(alpha), bli_cimag(alpha), bli_zreal(y), bli_zimag(y) )
#define bli_zzsubjs( alpha, y )  bli_zsubjris( bli_zreal(alpha), bli_zimag(alpha), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Use native complex arithmetic: conjf for scomplex alpha, conj for
// dcomplex alpha, and no conjugation for real alpha.
#define bli_scsubjs( alpha, y )  { (y) -=      (alpha); }
#define bli_dcsubjs( alpha, y )  { (y) -=      (alpha); }
#define bli_ccsubjs( alpha, y )  { (y) -= conjf(alpha); }
#define bli_zcsubjs( alpha, y )  { (y) -= conj (alpha); }

#define bli_szsubjs( alpha, y )  { (y) -=      (alpha); }
#define bli_dzsubjs( alpha, y )  { (y) -=      (alpha); }
#define bli_czsubjs( alpha, y )  { (y) -= conjf(alpha); }
#define bli_zzsubjs( alpha, y )  { (y) -= conj (alpha); }

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Single-char shorthands: alpha and y share one type.

#define bli_ssubjs( alpha, y )  bli_sssubjs( alpha, y )
#define bli_dsubjs( alpha, y )  bli_ddsubjs( alpha, y )
#define bli_csubjs( alpha, y )  bli_ccsubjs( alpha, y )
#define bli_zsubjs( alpha, y )  bli_zzsubjs( alpha, y )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_subs.h000066400000000000000000000077221427272030600235250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #ifndef BLIS_SUBS_H #define BLIS_SUBS_H // subs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) { (y) -= (a); } #define bli_dcsubs( a, y ) { (y) -= (a); } #define bli_ccsubs( a, y ) { (y) -= (a); } #define bli_zcsubs( a, y ) { (y) -= (a); } #define bli_szsubs( a, y ) { (y) -= (a); } #define 
bli_dzsubs( a, y ) { (y) -= (a); } #define bli_czsubs( a, y ) { (y) -= (a); } #define bli_zzsubs( a, y ) { (y) -= (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssubs( a, y ) bli_sssubs( a, y ) #define bli_dsubs( a, y ) bli_ddsubs( a, y ) #define bli_csubs( a, y ) bli_ccsubs( a, y ) #define bli_zsubs( a, y ) bli_zzsubs( a, y ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_swaps.h000066400000000000000000000101421427272030600236740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_SWAPS_H
#define BLIS_SWAPS_H

// swaps
//
// Exchanges the values of x and y through a block-local temporary.
//
// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - Every variant follows the same three steps using bli_??copys:
//     1. y is copied into the temporary w, which has the type of x;
//     2. x is copied into y;
//     3. w is copied into x.
//   The two chars of each bli_??copys name are chosen to match the types of
//   its two arguments, in order (presumably source first, then
//   destination -- confirm against bli_copys.h).
// - NOTE(review): the temporary is always named w. An argument expression
//   that itself mentions an identifier called w would bind to the macro's
//   local instead of the caller's variable.

// -- y is float.

// x: float, y: float.
#define bli_ssswaps( x, y ) \
{ \
	float    w; \
	bli_sscopys( (y), (w) ); \
	bli_sscopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
// x: double, y: float.
#define bli_dsswaps( x, y ) \
{ \
	double   w; \
	bli_sdcopys( (y), (w) ); \
	bli_dscopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
// x: scomplex, y: float.
#define bli_csswaps( x, y ) \
{ \
	scomplex w; \
	bli_sccopys( (y), (w) ); \
	bli_cscopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
// x: dcomplex, y: float.
#define bli_zsswaps( x, y ) \
{ \
	dcomplex w; \
	bli_szcopys( (y), (w) ); \
	bli_zscopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

// -- y is double.

// x: float, y: double.
#define bli_sdswaps( x, y ) \
{ \
	float    w; \
	bli_dscopys( (y), (w) ); \
	bli_sdcopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
// x: double, y: double.
#define bli_ddswaps( x, y ) \
{ \
	double   w; \
	bli_ddcopys( (y), (w) ); \
	bli_ddcopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
// x: scomplex, y: double.
#define bli_cdswaps( x, y ) \
{ \
	scomplex w; \
	bli_dccopys( (y), (w) ); \
	bli_cdcopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
// x: dcomplex, y: double.
#define bli_zdswaps( x, y ) \
{ \
	dcomplex w; \
	bli_dzcopys( (y), (w) ); \
	bli_zdcopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

// -- y is scomplex.

// x: float, y: scomplex.
#define bli_scswaps( x, y ) \
{ \
	float    w; \
	bli_cscopys( (y), (w) ); \
	bli_sccopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
// x: double, y: scomplex.
#define bli_dcswaps( x, y ) \
{ \
	double   w; \
	bli_cdcopys( (y), (w) ); \
	bli_dccopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
// x: scomplex, y: scomplex.
#define bli_ccswaps( x, y ) \
{ \
	scomplex w; \
	bli_cccopys( (y), (w) ); \
	bli_cccopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
// x: dcomplex, y: scomplex.
#define bli_zcswaps( x, y ) \
{ \
	dcomplex w; \
	bli_czcopys( (y), (w) ); \
	bli_zccopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

// -- y is dcomplex.

// x: float, y: dcomplex.
#define bli_szswaps( x, y ) \
{ \
	float    w; \
	bli_zscopys( (y), (w) ); \
	bli_szcopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
// x: double, y: dcomplex.
#define bli_dzswaps( x, y ) \
{ \
	double   w; \
	bli_zdcopys( (y), (w) ); \
	bli_dzcopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
// x: scomplex, y: dcomplex.
#define bli_czswaps( x, y ) \
{ \
	scomplex w; \
	bli_zccopys( (y), (w) ); \
	bli_czcopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
// x: dcomplex, y: dcomplex.
#define bli_zzswaps( x, y ) \
{ \
	dcomplex w; \
	bli_zzcopys( (y), (w) ); \
	bli_zzcopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

// -- Single-char shorthands: x and y share one type.

#define bli_sswaps( x, y )  bli_ssswaps( x, y )
#define bli_dswaps( x, y )  bli_ddswaps( x, y )
#define bli_cswaps( x, y )  bli_ccswaps( x, y )
#define bli_zswaps( x, y )  bli_zzswaps( x, y )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_xpbyjs.h000066400000000000000000000320151427272030600240610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_XPBYJS_H #define BLIS_XPBYJS_H // xpbyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, 
b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), 
bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) 
------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); 
} #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_xpbys.h000066400000000000000000000315421427272030600237130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_XPBYS_H #define BLIS_XPBYS_H // xpbys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbys( 
x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), 
bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( 
bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbys( x, b, y ) { (y) = (x) + 
(b) * (y); } #define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y ) #define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y ) #define bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y ) #define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y ) #endif 
cython-blis-0.9.1/blis/_src/frame/include/level0/bli_xpbys_mxn.h000066400000000000000000000517771427272030600246110ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_XPBYS_MXN_H #define BLIS_XPBYS_MXN_H // xpbys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// -- (xby) = (?ss) ------------------------------------------------------------ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?dd) ------------------------------------------------------------ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?cc) ------------------------------------------------------------ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?zz) ------------------------------------------------------------ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/bli_xpbys_mxn_uplo.h000066400000000000000000000206771427272030600256430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_XPBYS_MXN_UPLO_H #define BLIS_XPBYS_MXN_UPLO_H // xpbys_mxn_u #define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ /* If beta is zero, overwrite y with x (in case y has infs or NaNs). 
*/ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ /* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ /* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ /* If beta is zero, overwrite y with x (in case y has infs or NaNs). 
*/ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } // xpbys_mxn_l #define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ /* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ /* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ /* If beta is zero, overwrite y with x (in case y has infs or NaNs). 
*/ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ /* If beta is zero, overwrite y with x (in case y has infs or NaNs). */ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, 
cs_y ); \ } #define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/000077500000000000000000000000001427272030600223205ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/include/level0/old/bli_cast.h000066400000000000000000000110021427272030600242430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#ifndef BLIS_CAST_H
#define BLIS_CAST_H

// cast

// Notes:
// - The first char encodes the type of *ap.
// - The second char encodes the type of b.
// - ap is an untyped address that is reinterpreted as a pointer to the
//   source type, dereferenced once, and the value is stored into the lvalue b.

// -- Real destinations (float / double) --------------------------------------
// A complex source contributes only its real part (bli_creal / bli_zreal);
// the imaginary part is dropped.

#define bli_sscast( ap, b ) \
{ \
    (b) = ( float  )             *(( float*    )(ap)); \
}
#define bli_dscast( ap, b ) \
{ \
    (b) = ( float  )             *(( double*   )(ap)); \
}
#define bli_cscast( ap, b ) \
{ \
    (b) = ( float  )  bli_creal( *(( scomplex* )(ap)) ); \
}
#define bli_zscast( ap, b ) \
{ \
    (b) = ( float  )  bli_zreal( *(( dcomplex* )(ap)) ); \
}

#define bli_sdcast( ap, b ) \
{ \
    (b) = ( double )             *(( float*    )(ap)); \
}
#define bli_ddcast( ap, b ) \
{ \
    (b) = ( double )             *(( double*   )(ap)); \
}
#define bli_cdcast( ap, b ) \
{ \
    (b) = ( double )  bli_creal( *(( scomplex* )(ap)) ); \
}
#define bli_zdcast( ap, b ) \
{ \
    (b) = ( double )  bli_zreal( *(( dcomplex* )(ap)) ); \
}

// -- Complex destinations (scomplex / dcomplex) ------------------------------
// Two implementations are provided, selected by BLIS_ENABLE_C99_COMPLEX.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex support the destination is filled through the
// bli_??sets() macros (defined elsewhere), which are given a real value, an
// imaginary value and the destination. Real sources pass 0.0 as the imaginary
// value; complex sources pass both of their components.

#define bli_sccast( ap, b ) \
{ \
    bli_scsets( bli_sreal( *(( float*    )(ap)) ), \
                                             0.0, (b) ); \
}
#define bli_dccast( ap, b ) \
{ \
    bli_dcsets( bli_dreal( *(( double*   )(ap)) ), \
                                             0.0, (b) ); \
}
#define bli_cccast( ap, b ) \
{ \
    bli_ccsets( bli_creal( *(( scomplex* )(ap)) ), \
                bli_cimag( *(( scomplex* )(ap)) ), (b) ); \
}
#define bli_zccast( ap, b ) \
{ \
    bli_zcsets( bli_zreal( *(( dcomplex* )(ap)) ), \
                bli_zimag( *(( dcomplex* )(ap)) ), (b) ); \
}

#define bli_szcast( ap, b ) \
{ \
    bli_szsets( bli_sreal( *(( float*    )(ap)) ), \
                                             0.0, (b) ); \
}
#define bli_dzcast( ap, b ) \
{ \
    bli_dzsets( bli_dreal( *(( double*   )(ap)) ), \
                                             0.0, (b) ); \
}
#define bli_czcast( ap, b ) \
{ \
    bli_czsets( bli_creal( *(( scomplex* )(ap)) ), \
                bli_cimag( *(( scomplex* )(ap)) ), (b) ); \
}
#define bli_zzcast( ap, b ) \
{ \
    bli_zzsets( bli_zreal( *(( dcomplex* )(ap)) ), \
                bli_zimag( *(( dcomplex* )(ap)) ), (b) ); \
}

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex support a plain C cast to the destination type is used.

#define bli_sccast( ap, b )  { (b) = ( scomplex ) *(( float*    )(ap)); }
#define bli_dccast( ap, b )  { (b) = ( scomplex ) *(( double*   )(ap)); }
#define bli_cccast( ap, b )  { (b) = ( scomplex ) *(( scomplex* )(ap)); }
#define bli_zccast( ap, b )  { (b) = ( scomplex ) *(( dcomplex* )(ap)); }

#define bli_szcast( ap, b )  { (b) = ( dcomplex ) *(( float*    )(ap)); }
#define bli_dzcast( ap, b )  { (b) = ( dcomplex ) *(( double*   )(ap)); }
#define bli_czcast( ap, b )  { (b) = ( dcomplex ) *(( scomplex* )(ap)); }
#define bli_zzcast( ap, b )  { (b) = ( dcomplex ) *(( dcomplex* )(ap)); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter names: source and destination share the same datatype.

#define bli_scast( ap, b ) bli_sscast( ap, b )
#define bli_dcast( ap, b ) bli_ddcast( ap, b )
#define bli_ccast( ap, b ) bli_cccast( ap, b )
#define bli_zcast( ap, b ) bli_zzcast( ap, b )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/old/bli_castfrom.h000066400000000000000000000032101427272030600251310ustar00rootroot00000000000000/* BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ cython-blis-0.9.1/blis/_src/frame/include/level0/old/bli_castto.h000066400000000000000000000032101427272030600246100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ cython-blis-0.9.1/blis/_src/frame/include/level0/old/bli_copynzjs.h000066400000000000000000000070061427272030600252010ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#ifndef BLIS_COPYNZJS_H
#define BLIS_COPYNZJS_H

// copynzjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - x is copied in conjugated form.
// - "nz": when x is real and y is complex, only (y).real is written; the
//   imaginary part of y is deliberately left untouched rather than zeroed.
// - These macros access complex operands through .real/.imag members, so they
//   require struct-style complex types.

// -- Real destinations: a complex x contributes only its real part, so the
//    conjugation has no visible effect. -----------------------------------

#define bli_sscopynzjs( x, y ) \
{ \
    (y) = ( float  ) (x); \
}
#define bli_dscopynzjs( x, y ) \
{ \
    (y) = ( float  ) (x); \
}
#define bli_cscopynzjs( x, y ) \
{ \
    (y) = ( float  ) (x).real; \
}
#define bli_zscopynzjs( x, y ) \
{ \
    (y) = ( float  ) (x).real; \
}

#define bli_sdcopynzjs( x, y ) \
{ \
    (y) = ( double ) (x); \
}
#define bli_ddcopynzjs( x, y ) \
{ \
    (y) = ( double ) (x); \
}
#define bli_cdcopynzjs( x, y ) \
{ \
    (y) = ( double ) (x).real; \
}
#define bli_zdcopynzjs( x, y ) \
{ \
    (y) = ( double ) (x).real; \
}

// -- Complex destinations: real x updates only (y).real; complex x is copied
//    with its imaginary part negated. ---------------------------------------

#define bli_sccopynzjs( x, y ) \
{ \
    (y).real = ( float  ) (x); \
    /* (y).imag = 0.0F; (SKIP COPYING OF ZERO) */ \
}
#define bli_dccopynzjs( x, y ) \
{ \
    (y).real = ( float  ) (x); \
    /* (y).imag = 0.0F; (SKIP COPYING OF ZERO) */ \
}
#define bli_cccopynzjs( x, y ) \
{ \
    (y).real = ( float  )  (x).real; \
    (y).imag = ( float  ) -(x).imag; \
}
#define bli_zccopynzjs( x, y ) \
{ \
    (y).real = ( float  )  (x).real; \
    (y).imag = ( float  ) -(x).imag; \
}

#define bli_szcopynzjs( x, y ) \
{ \
    (y).real = ( double ) (x); \
    /* (y).imag = 0.0; (SKIP COPYING OF ZERO) */ \
}
#define bli_dzcopynzjs( x, y ) \
{ \
    (y).real = ( double ) (x); \
    /* (y).imag = 0.0; (SKIP COPYING OF ZERO) */ \
}
#define bli_czcopynzjs( x, y ) \
{ \
    (y).real = ( double )  (x).real; \
    (y).imag = ( double ) -(x).imag; \
}
#define bli_zzcopynzjs( x, y ) \
{ \
    (y).real = ( double )  (x).real; \
    (y).imag = ( double ) -(x).imag; \
}

// Single-letter names: x and y share the same datatype.

#define bli_scopynzjs( x, y ) \
{ \
    bli_sscopynzjs( x, y ); \
}
#define bli_dcopynzjs( x, y ) \
{ \
    bli_ddcopynzjs( x, y ); \
}
#define bli_ccopynzjs( x, y ) \
{ \
    bli_cccopynzjs( x, y ); \
}
#define bli_zcopynzjs( x, y ) \
{ \
    bli_zzcopynzjs( x, y ); \
}

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/old/bli_copynzs.h000066400000000000000000000066761427272030600250430ustar00rootroot00000000000000/* BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_COPYNZS_H #define BLIS_COPYNZS_H // copynzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopynzs( x, y ) \ { \ (y) = ( float ) (x); \ } #define bli_dscopynzs( x, y ) \ { \ (y) = ( float ) (x); \ } #define bli_cscopynzs( x, y ) \ { \ (y) = ( float ) (x).real; \ } #define bli_zscopynzs( x, y ) \ { \ (y) = ( float ) (x).real; \ } #define bli_sdcopynzs( x, y ) \ { \ (y) = ( double ) (x); \ } #define bli_ddcopynzs( x, y ) \ { \ (y) = ( double ) (x); \ } #define bli_cdcopynzs( x, y ) \ { \ (y) = ( double ) (x).real; \ } #define bli_zdcopynzs( x, y ) \ { \ (y) = ( double ) (x).real; \ } #define bli_sccopynzs( x, y ) \ { \ (y).real = ( float ) (x); \ /* (y).imag = 0.0F; (SKIP COPYING OF ZERO) */ \ } #define bli_dccopynzs( x, y ) \ { \ (y).real = ( float ) (x); \ /* (y).imag = 0.0F (SKIP COPYING OF ZERO) */; \ } #define bli_cccopynzs( x, y ) \ { \ (y).real = ( float ) (x).real; \ (y).imag = ( float ) (x).imag; \ } #define bli_zccopynzs( x, y ) \ { \ (y).real = ( float ) (x).real; \ (y).imag = ( float ) (x).imag; \ } #define bli_szcopynzs( x, y ) \ { \ (y).real = ( double ) (x); \ /* (y).imag = 0.0; (SKIP COPYING OF ZERO) */ \ } #define bli_dzcopynzs( x, y ) \ { \ (y).real = ( double ) (x); \ /* (y).imag = 0.0; (SKIP COPYING OF ZERO) */ \ } #define bli_czcopynzs( x, y ) \ { \ (y).real = ( double ) (x).real; \ (y).imag = ( double ) (x).imag; \ } #define bli_zzcopynzs( x, y ) \ { \ (y).real 
= ( double ) (x).real; \ (y).imag = ( double ) (x).imag; \ } #define bli_scopynzs( x, y ) \ { \ bli_sscopynzs( x, y ); \ } #define bli_dcopynzs( x, y ) \ { \ bli_ddcopynzs( x, y ); \ } #define bli_ccopynzs( x, y ) \ { \ bli_cccopynzs( x, y ); \ } #define bli_zcopynzs( x, y ) \ { \ bli_zzcopynzs( x, y ); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/bli_invscalcjs.h000066400000000000000000000122311427272030600254550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_INVSCALCJS_H #define BLIS_INVSCALCJS_H // invscalcjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. #define bli_ssinvscalcjs( conj, a, x ) \ { \ (x) /= ( float ) (a); \ } #define bli_dsinvscalcjs( conj, a, x ) \ { \ (x) /= ( float ) (a); \ } #define bli_csinvscalcjs( conj, a, x ) \ { \ (x) /= ( float ) (a).real; \ } #define bli_zsinvscalcjs( conj, a, x ) \ { \ (x) /= ( float ) (a).real; \ } #define bli_sdinvscalcjs( conj, a, x ) \ { \ (x) /= ( double ) (a); \ } #define bli_ddinvscalcjs( conj, a, x ) \ { \ (x) /= ( double ) (a); \ } #define bli_cdinvscalcjs( conj, a, x ) \ { \ (x) /= ( double ) (a).real; \ } #define bli_zdinvscalcjs( conj, a, x ) \ { \ (x) /= ( double ) (a).real; \ } #define bli_scinvscalcjs( conj, a, x ) \ { \ (x).real /= ( float ) (a); \ (x).imag /= ( float ) (a); \ } #define bli_dcinvscalcjs( conj, a, x ) \ { \ (x).real /= ( float ) (a); \ (x).imag /= ( float ) (a); \ } #define bli_ccinvscalcjs( conj, a, x ) \ { \ float aimag = ( bli_is_conj( conj ) ? ( float ) -(a).imag : \ ( float ) (a).imag ); \ float temp = ( float ) (a).real * (a).real + ( float ) aimag * (a).imag; \ float xr = ( float ) ( ( float ) (a).real * (x).real + ( float ) aimag * (x).imag ) / temp; \ float xi = ( float ) ( ( float ) (a).real * (x).imag - ( float ) aimag * (x).real ) / temp; \ (x).real = xr; \ (x).imag = xi; \ } #define bli_zcinvscalcjs( conj, a, x ) \ { \ float aimag = ( bli_is_conj( conj ) ? 
( float ) -(a).imag : \ ( float ) (a).imag ); \ float temp = ( float ) (a).real * (a).real + ( float ) aimag * (a).imag; \ float xr = ( float ) ( ( float ) (a).real * (x).real + ( float ) aimag * (x).imag ) / temp; \ float xi = ( float ) ( ( float ) (a).real * (x).imag - ( float ) aimag * (x).real ) / temp; \ (x).real = xr; \ (x).imag = xi; \ } #define bli_szinvscalcjs( conj, a, x ) \ { \ (x).real /= ( double ) (a); \ (x).imag /= ( double ) (a); \ } #define bli_dzinvscalcjs( conj, a, x ) \ { \ (x).real /= ( double ) (a); \ (x).imag /= ( double ) (a); \ } #define bli_czinvscalcjs( conj, a, x ) \ { \ double aimag = ( bli_is_conj( conj ) ? ( double ) -(a).imag : \ ( double ) (a).imag ); \ double temp = ( double ) (a).real * (a).real + ( double ) aimag * (a).imag; \ double xr = ( double ) ( ( double ) (a).real * (x).real + ( double ) aimag * (x).imag ) / temp; \ double xi = ( double ) ( ( double ) (a).real * (x).imag - ( double ) aimag * (x).real ) / temp; \ (x).real = xr; \ (x).imag = xi; \ } #define bli_zzinvscalcjs( conj, a, x ) \ { \ double aimag = ( bli_is_conj( conj ) ? 
( double ) -(a).imag : \ ( double ) (a).imag ); \ double temp = ( double ) (a).real * (a).real + ( double ) aimag * (a).imag; \ double xr = ( double ) ( ( double ) (a).real * (x).real + ( double ) aimag * (x).imag ) / temp; \ double xi = ( double ) ( ( double ) (a).real * (x).imag - ( double ) aimag * (x).real ) / temp; \ (x).real = xr; \ (x).imag = xi; \ } #define bli_sinvscalcjs( conj, a, x ) \ { \ bli_ssinvscalcjs( conj, a, x ); \ } #define bli_dinvscalcjs( conj, a, x ) \ { \ bli_ddinvscalcjs( conj, a, x ); \ } #define bli_cinvscalcjs( conj, a, x ) \ { \ bli_ccinvscalcjs( conj, a, x ); \ } #define bli_zinvscalcjs( conj, a, x ) \ { \ bli_zzinvscalcjs( conj, a, x ); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/bli_scalcjs.h000066400000000000000000000111431427272030600247410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCALCJS_H #define BLIS_SCALCJS_H // scalcjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - a is (conditionally) used in conjugated form. #define bli_ssscalcjs( conj, a, x ) \ { \ (x) *= ( float ) (a); \ } #define bli_dsscalcjs( conj, a, x ) \ { \ (x) *= ( float ) (a); \ } #define bli_csscalcjs( conj, a, x ) \ { \ (x) *= ( float ) (a).real; \ } #define bli_zsscalcjs( conj, a, x ) \ { \ (x) *= ( float ) (a).real; \ } #define bli_sdscalcjs( conj, a, x ) \ { \ (x) *= ( double ) (a); \ } #define bli_ddscalcjs( conj, a, x ) \ { \ (x) *= ( double ) (a); \ } #define bli_cdscalcjs( conj, a, x ) \ { \ (x) *= ( double ) (a).real; \ } #define bli_zdscalcjs( conj, a, x ) \ { \ (x) *= ( double ) (a).real; \ } #define bli_scscalcjs( conj, a, x ) \ { \ (x).real *= ( float ) (a); \ (x).imag *= ( float ) (a); \ } #define bli_dcscalcjs( conj, a, x ) \ { \ (x).real *= ( float ) (a); \ (x).imag *= ( float ) (a); \ } #define bli_ccscalcjs( conj, a, x ) \ { \ float aimag = ( bli_is_conj( conj ) ? ( float ) -(a).imag : \ ( float ) (a).imag ); \ float tempr = ( float ) (a).real * (x).real - ( float ) aimag * (x).imag; \ float tempi = ( float ) (a).real * (x).imag + ( float ) aimag * (x).real; \ (x).real = tempr; \ (x).imag = tempi; \ } #define bli_zcscalcjs( conj, a, x ) \ { \ float aimag = ( bli_is_conj( conj ) ? 
( float ) -(a).imag : \ ( float ) (a).imag ); \ float tempr = ( float ) (a).real * (x).real - ( float ) aimag * (x).imag; \ float tempi = ( float ) (a).real * (x).imag + ( float ) aimag * (x).real; \ (x).real = tempr; \ (x).imag = tempi; \ } #define bli_szscalcjs( conj, a, x ) \ { \ (x).real *= ( double ) (a); \ (x).imag *= ( double ) (a); \ } #define bli_dzscalcjs( conj, a, x ) \ { \ (x).real *= ( double ) (a); \ (x).imag *= ( double ) (a); \ } #define bli_czscalcjs( conj, a, x ) \ { \ double aimag = ( bli_is_conj( conj ) ? ( double ) -(a).imag : \ ( double ) (a).imag ); \ double tempr = ( double ) (a).real * (x).real - ( double ) aimag * (x).imag; \ double tempi = ( double ) (a).real * (x).imag + ( double ) aimag * (x).real; \ (x).real = tempr; \ (x).imag = tempi; \ } #define bli_zzscalcjs( conj, a, x ) \ { \ double aimag = ( bli_is_conj( conj ) ? ( double ) -(a).imag : \ ( double ) (a).imag ); \ double tempr = ( double ) (a).real * (x).real - ( double ) aimag * (x).imag; \ double tempi = ( double ) (a).real * (x).imag + ( double ) aimag * (x).real; \ (x).real = tempr; \ (x).imag = tempi; \ } #define bli_sscalcjs( conj, a, x ) \ { \ bli_ssscalcjs( conj, a, x ); \ } #define bli_dscalcjs( conj, a, x ) \ { \ bli_ddscalcjs( conj, a, x ); \ } #define bli_cscalcjs( conj, a, x ) \ { \ bli_ccscalcjs( conj, a, x ); \ } #define bli_zscalcjs( conj, a, x ) \ { \ bli_zzscalcjs( conj, a, x ); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/bli_set0ris_mxn.h000066400000000000000000000052041427272030600255730ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_SET0RIS_MXN_H #define BLIS_SET0RIS_MXN_H // set0ris_mxn #define bli_sset0ris_mxn( m, n, ar, ai, rs_a, cs_a ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ bli_sset0ris( *(ar + _i*rs_a + _j*cs_a), \ *(ai + _i*rs_a + _j*cs_a) ); \ } #define bli_dset0ris_mxn( m, n, ar, ai, rs_a, cs_a ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ bli_dset0ris( *(ar + _i*rs_a + _j*cs_a), \ *(ai + _i*rs_a + _j*cs_a) ); \ } #define bli_cset0ris_mxn( m, n, ar, ai, rs_a, cs_a ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ bli_cset0ris( *(ar + _i*rs_a + _j*cs_a), \ *(ai + _i*rs_a + _j*cs_a) ); \ } #define bli_zset0ris_mxn( m, n, ar, ai, rs_a, cs_a ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ bli_zset0ris( *(ar + _i*rs_a + _j*cs_a), \ *(ai + _i*rs_a + _j*cs_a) ); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/io/000077500000000000000000000000001427272030600227275ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/include/level0/old/io/bli_scal2ios.h000066400000000000000000000041251427272030600254470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyiight notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyiight notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL2IOS_H #define BLIS_SCAL2IOS_H // scal2ios #define bli_cscal2ios( a, x, yi ) \ { \ (yi) = bli_cimag(a) * bli_creal(x) + bli_creal(a) * bli_cimag(x); \ } #define bli_zscal2ios( a, x, yi ) \ { \ (yi) = bli_zimag(a) * bli_zreal(x) + bli_zreal(a) * bli_zimag(x); \ } #define bli_scscal2ios( a, x, yi ) \ { \ (yi) = bli_creal(a) * bli_cimag(x); \ } #define bli_dzscal2ios( a, x, yi ) \ { \ (yi) = bli_zreal(a) * bli_zimag(x); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/io/bli_scal2jios.h000066400000000000000000000036651427272030600256310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyiight notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyiight notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL2JIOS_H #define BLIS_SCAL2JIOS_H // scal2jios #define bli_cscal2jios( a, x, yi ) \ { \ (yi) = bli_cimag(a) * bli_creal(x) - bli_creal(a) * bli_cimag(x); \ } #define bli_zscal2jios( a, x, yi ) \ { \ (yi) = bli_zimag(a) * bli_zreal(x) - bli_zreal(a) * bli_zimag(x); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/ri3/000077500000000000000000000000001427272030600230155ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/include/level0/old/ri3/bli_copyjri3s.h000066400000000000000000000041171427272030600257440ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_COPYJRI3S_H #define BLIS_COPYJRI3S_H // copyjri3s #define bli_scopyjri3s( ar, ai, br, bi, bri ) bli_scopyri3s( (ar), -(ai), (br), (bi), (bri) ) #define bli_dcopyjri3s( ar, ai, br, bi, bri ) bli_dcopyri3s( (ar), -(ai), (br), (bi), (bri) ) #define bli_ccopyjri3s( ar, ai, br, bi, bri ) bli_ccopyri3s( (ar), -(ai), (br), (bi), (bri) ) #define bli_zcopyjri3s( ar, ai, br, bi, bri ) bli_zcopyri3s( (ar), -(ai), (br), (bi), (bri) ) #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/ri3/bli_copyri3s.h000066400000000000000000000040731427272030600255730ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_COPYRI3S_H #define BLIS_COPYRI3S_H // copyri3s #define bli_scopyri3s( ar, ai, br, bi, bri ) \ { \ (br) = (ar); \ } #define bli_dcopyri3s( ar, ai, br, bi, bri ) \ { \ (br) = (ar); \ } #define bli_ccopyri3s( ar, ai, br, bi, bri ) \ { \ (br) = (ar); \ (bi) = (ai); \ (bri) = (ar) + (ai); \ } #define bli_zcopyri3s( ar, ai, br, bi, bri ) \ { \ (br) = (ar); \ (bi) = (ai); \ (bri) = (ar) + (ai); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/ri3/bli_scal2jri3s.h000066400000000000000000000047311427272030600260000ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL2JRI3S_H #define BLIS_SCAL2JRI3S_H // scal2jri3s #define bli_sscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ { \ (yr) = (ar) * (xr); \ } #define bli_dscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ (yri) = (yr) + (yi); \ } #define bli_zscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ (yri) = (yr) + (yi); \ } #define bli_scscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ (yri) = (yr) + (yi); \ } #define bli_dzscal2jri3s( ar, ai, xr, xi, yr, yi, yri ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ (yri) = (yr) + (yi); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/ri3/bli_scal2ri3s.h000066400000000000000000000047141427272030600256270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_SCAL2RI3S_H #define BLIS_SCAL2RI3S_H // scal2ri3s #define bli_sscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ { \ (yr) = (ar) * (xr); \ } #define bli_dscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ (yri) = (yr) + (yi); \ } #define bli_zscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ (yri) = (yr) + (yi); \ } #define bli_scscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ (yri) = (yr) + (yi); \ } #define bli_dzscal2ri3s( ar, ai, xr, xi, yr, yi, yri ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ (yri) = (yr) + (yi); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/ri3/bli_scal2ri3s_mxn.h000066400000000000000000000132241427272030600265050ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL2RI3S_MXN_H #define BLIS_SCAL2RI3S_MXN_H // scal2ri3s_mxn BLIS_INLINE void bli_cscal2ri3s_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ float* restrict y_rpi = ( float* )y + 2*is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ /* Treat the micro-panel as panel_dim x panel_len and column-stored (unit row stride). 
*/ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; float* restrict psi11_rpi = y_rpi + (i )*1 + (j )*cs_y; bli_cscal2jri3s ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i, *psi11_rpi ); } } else /* if ( bli_is_noconj( conjx ) ) */ { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; float* restrict psi11_rpi = y_rpi + (i )*1 + (j )*cs_y; bli_cscal2ri3s ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i, *psi11_rpi ); } } } BLIS_INLINE void bli_zscal2ri3s_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ double* restrict y_rpi = ( double* )y + 2*is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ /* Treat the micro-panel as panel_dim x panel_len and column-stored (unit row stride). 
*/ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; double* restrict psi11_rpi = y_rpi + (i )*1 + (j )*cs_y; bli_zscal2jri3s ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i, *psi11_rpi ); } } else /* if ( bli_is_noconj( conjx ) ) */ { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; double* restrict psi11_rpi = y_rpi + (i )*1 + (j )*cs_y; bli_zscal2ri3s ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i, *psi11_rpi ); } } } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/rih/000077500000000000000000000000001427272030600231025ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/include/level0/old/rih/bli_scal2rihs_mxn.h000066400000000000000000000152641427272030600266650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL2RIHS_MXN_H #define BLIS_SCAL2RIHS_MXN_H // scal2rihs_mxn BLIS_INLINE void bli_cscal2rihs_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { scomplex* restrict x_r = x; float* restrict y_r = ( float* )y; if ( bli_is_ro_packed( schema ) ) { if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { scomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; float* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; bli_cscal2jros ( *alpha, *chi11, *psi11_r ); } } else /* if ( bli_is_noconj( conjx ) ) */ { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { scomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; float* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; bli_cscal2ros ( *alpha, *chi11, *psi11_r ); } } } else if ( bli_is_io_packed( schema ) ) { if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; 
j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { scomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; float* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; bli_cscal2jios ( *alpha, *chi11, *psi11_r ); } } else /* if ( bli_is_noconj( conjx ) ) */ { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { scomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; float* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; bli_cscal2ios ( *alpha, *chi11, *psi11_r ); } } } else /* if ( bli_is_rpi_packed( schema ) ) */ { if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { scomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; float* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; bli_cscal2jrpis ( *alpha, *chi11, *psi11_r ); } } else /* if ( bli_is_noconj( conjx ) ) */ { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { scomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; float* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; bli_cscal2rpis ( *alpha, *chi11, *psi11_r ); } } } } BLIS_INLINE void bli_zscal2rihs_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { dcomplex* restrict x_r = x; double* restrict y_r = ( double* )y; if ( bli_is_ro_packed( schema ) ) { if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { dcomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; double* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; bli_zscal2jros ( *alpha, *chi11, *psi11_r ); } } else /* if ( bli_is_noconj( conjx ) ) */ { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { dcomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; double* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; bli_zscal2ros ( *alpha, *chi11, *psi11_r ); } } } else if ( bli_is_io_packed( schema ) ) { if ( bli_is_conj( 
conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { dcomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; double* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; bli_zscal2jios ( *alpha, *chi11, *psi11_r ); } } else /* if ( bli_is_noconj( conjx ) ) */ { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { dcomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; double* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; bli_zscal2ios ( *alpha, *chi11, *psi11_r ); } } } else /* if ( bli_is_rpi_packed( schema ) ) */ { if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { dcomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; double* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; bli_zscal2jrpis ( *alpha, *chi11, *psi11_r ); } } else /* if ( bli_is_noconj( conjx ) ) */ { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { dcomplex* restrict chi11 = x_r + (i )*rs_x + (j )*cs_x; double* restrict psi11_r = y_r + (i )*rs_y + (j )*cs_y; bli_zscal2rpis ( *alpha, *chi11, *psi11_r ); } } } } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/rih/bli_scal2rihs_mxn_diag.h000066400000000000000000000067241427272030600276520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL2RIHS_MXN_DIAG_H #define BLIS_SCAL2RIHS_MXN_DIAG_H // scal2rihs_mxn_diag #define bli_cscscal2rihs_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t _i; \ \ /* Handle ro, io, and rpi separately. */ \ if ( bli_is_ro_packed( schema ) ) \ { \ for ( _i = 0; _i < min_m_n; ++_i ) \ { \ bli_scscal2ros( *(x + _i*rs_x + _i*cs_x), \ *(a), \ *(y_r + _i*rs_y + _i*cs_y) ); \ } \ } \ else if ( bli_is_io_packed( schema ) ) \ { \ for ( _i = 0; _i < min_m_n; ++_i ) \ { \ bli_scscal2ios( *(x + _i*rs_x + _i*cs_x), \ *(a), \ *(y_r + _i*rs_y + _i*cs_y) ); \ } \ } \ else /* if ( bli_is_rpi_packed( schema ) ) */ \ { \ for ( _i = 0; _i < min_m_n; ++_i ) \ { \ bli_scscal2rpis( *(x + _i*rs_x + _i*cs_x), \ *(a), \ *(y_r + _i*rs_y + _i*cs_y) ); \ } \ } \ } #define bli_zdzscal2rihs_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t _i; \ \ /* Handle ro, io, and rpi separately. 
*/ \ if ( bli_is_ro_packed( schema ) ) \ { \ for ( _i = 0; _i < min_m_n; ++_i ) \ { \ bli_dzscal2ros( *(x + _i*rs_x + _i*cs_x), \ *(a), \ *(y_r + _i*rs_y + _i*cs_y) ); \ } \ } \ else if ( bli_is_io_packed( schema ) ) \ { \ for ( _i = 0; _i < min_m_n; ++_i ) \ { \ bli_dzscal2ios( *(x + _i*rs_x + _i*cs_x), \ *(a), \ *(y_r + _i*rs_y + _i*cs_y) ); \ } \ } \ else /* if ( bli_is_rpi_packed( schema ) ) */ \ { \ for ( _i = 0; _i < min_m_n; ++_i ) \ { \ bli_dzscal2rpis( *(x + _i*rs_x + _i*cs_x), \ *(a), \ *(y_r + _i*rs_y + _i*cs_y) ); \ } \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/rih/bli_scal2rihs_mxn_uplo.h000066400000000000000000000226141427272030600277210ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL2RIHS_MXN_UPLO_H #define BLIS_SCAL2RIHS_MXN_UPLO_H // scal2rihs_mxn_uplo #define bli_cscal2rihs_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ /* Handle ro, io, and rpi separately. */ \ if ( bli_is_ro_packed( schema ) ) \ { \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = _j; _i < m; ++_i ) \ { \ bli_cscal2jros( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = _j; _i < m; ++_i ) \ { \ bli_cscal2ros( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = 0; _i < _j + 1; ++_i ) \ { \ bli_cscal2jros( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = 0; _i < _j + 1; ++_i ) \ { \ bli_cscal2ros( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } \ else if ( bli_is_io_packed( schema ) ) \ { \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = _j; _i < m; ++_i ) \ { \ bli_cscal2jios( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ else /* if ( 
bli_is_noconj( conjx ) ) */ \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = _j; _i < m; ++_i ) \ { \ bli_cscal2ios( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = 0; _i < _j + 1; ++_i ) \ { \ bli_cscal2jios( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = 0; _i < _j + 1; ++_i ) \ { \ bli_cscal2ios( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } \ else /* if ( bli_is_rpi_packed( schema ) ) */ \ { \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = _j; _i < m; ++_i ) \ { \ bli_cscal2jrpis( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = _j; _i < m; ++_i ) \ { \ bli_cscal2rpis( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = 0; _i < _j + 1; ++_i ) \ { \ bli_cscal2jrpis( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = 0; _i < _j + 1; ++_i ) \ { \ bli_cscal2rpis( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } \ } #define bli_zscal2rihs_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y_r, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ /* Handle ro, io, and rpi separately. 
*/ \ if ( bli_is_ro_packed( schema ) ) \ { \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = _j; _i < m; ++_i ) \ { \ bli_zscal2jros( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = _j; _i < m; ++_i ) \ { \ bli_zscal2ros( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = 0; _i < _j + 1; ++_i ) \ { \ bli_zscal2jros( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = 0; _i < _j + 1; ++_i ) \ { \ bli_zscal2ros( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } \ else if ( bli_is_io_packed( schema ) ) \ { \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = _j; _i < m; ++_i ) \ { \ bli_zscal2jios( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = _j; _i < m; ++_i ) \ { \ bli_zscal2ios( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = 0; _i < _j + 1; ++_i ) \ { \ bli_zscal2jios( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = 0; _i < _j + 1; ++_i ) \ { \ bli_zscal2ios( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } \ else /* if ( bli_is_rpi_packed( schema ) ) */ \ { \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ 
for ( _j = 0; _j < m; ++_j ) \ for ( _i = _j; _i < m; ++_i ) \ { \ bli_zscal2jrpis( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = _j; _i < m; ++_i ) \ { \ bli_zscal2rpis( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ else /* if ( bli_is_upper( uplo ) ) */ \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = 0; _i < _j + 1; ++_i ) \ { \ bli_zscal2jrpis( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ else /* if ( bli_is_noconj( conjx ) ) */ \ { \ for ( _j = 0; _j < m; ++_j ) \ for ( _i = 0; _i < _j + 1; ++_i ) \ { \ bli_zscal2rpis( *(a), \ *(x + _i*rs_x + _j*cs_x), \ *(y_r + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/rih/bli_setrihs_mxn_diag.h000066400000000000000000000065221427272030600274350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SETRIHS_MXN_DIAG_H #define BLIS_SETRIHS_MXN_DIAG_H // setrihs_mxn_diag #define bli_csetrihs_mxn_diag( schema, m, n, a, y_r, rs_y, cs_y ) \ { \ const float a_r = bli_zreal( *a ); \ const float a_i = bli_zimag( *a ); \ dim_t min_m_n = bli_min( m, n ); \ dim_t _i; \ \ /* Handle ro, io, and rpi separately. */ \ if ( bli_is_ro_packed( schema ) ) \ { \ for ( _i = 0; _i < min_m_n; ++_i ) \ { \ bli_scopys( (a_r), \ *(y_r + _i*rs_y + _i*cs_y) ); \ } \ } \ else if ( bli_is_io_packed( schema ) ) \ { \ for ( _i = 0; _i < min_m_n; ++_i ) \ { \ bli_scopys( (a_i), \ *(y_r + _i*rs_y + _i*cs_y) ); \ } \ } \ else /* if ( bli_is_rpi_packed( schema ) ) */ \ { \ for ( _i = 0; _i < min_m_n; ++_i ) \ { \ bli_sadd3s( (a_r), \ (a_i), \ *(y_r + _i*rs_y + _i*cs_y) ); \ } \ } \ } #define bli_zsetrihs_mxn_diag( schema, m, n, a, y_r, rs_y, cs_y ) \ { \ const double a_r = bli_zreal( *a ); \ const double a_i = bli_zimag( *a ); \ dim_t min_m_n = bli_min( m, n ); \ dim_t _i; \ \ /* Handle ro, io, and rpi separately. 
*/ \ if ( bli_is_ro_packed( schema ) ) \ { \ for ( _i = 0; _i < min_m_n; ++_i ) \ { \ bli_dcopys( (a_r), \ *(y_r + _i*rs_y + _i*cs_y) ); \ } \ } \ else if ( bli_is_io_packed( schema ) ) \ { \ for ( _i = 0; _i < min_m_n; ++_i ) \ { \ bli_dcopys( (a_i), \ *(y_r + _i*rs_y + _i*cs_y) ); \ } \ } \ else /* if ( bli_is_rpi_packed( schema ) ) */ \ { \ for ( _i = 0; _i < min_m_n; ++_i ) \ { \ bli_dadd3s( (a_r), \ (a_i), \ *(y_r + _i*rs_y + _i*cs_y) ); \ } \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/old/ro/000077500000000000000000000000001427272030600227405ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/include/level0/old/ro/bli_scal2jros.h000066400000000000000000000036641427272030600256520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef BLIS_SCAL2JROS_H
#define BLIS_SCAL2JROS_H

// scal2jros

// Statement macros that overwrite (yr) with
//     bli_creal(a)*bli_creal(x) + bli_cimag(a)*bli_cimag(x)
// With ar/ai and xr/xi standing for the real/imaginary parts of a and x,
// that is ar*xr + ai*xi, i.e. the real part of a * conj(x).
// The accessors bli_creal/bli_cimag (and the z versions) are defined outside
// this header. The operand order is significant for floating-point rounding
// and is left exactly as written.

// Single-precision form.
#define bli_cscal2jros( a, x, yr ) \
{ \
	(yr) = bli_creal(a) * bli_creal(x) + bli_cimag(a) * bli_cimag(x); \
}

// Double-precision form.
#define bli_zscal2jros( a, x, yr ) \
{ \
	(yr) = bli_zreal(a) * bli_zreal(x) + bli_zimag(a) * bli_zimag(x); \
}

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/old/ro/bli_scal2ros.h000066400000000000000000000041261427272030600254720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef BLIS_SCAL2ROS_H
#define BLIS_SCAL2ROS_H

// scal2ros

// Statement macros that overwrite (yr) with a real-valued product term.
// Below, ar/ai and xr/xi stand for what the bli_?real/bli_?imag accessors
// (defined outside this header) return for a and x. The operand order is
// significant for floating-point rounding and is left exactly as written.

// (yr) = ar*xr - ai*xi, i.e. the real part of a * x (single precision).
#define bli_cscal2ros( a, x, yr ) \
{ \
	(yr) = bli_creal(a) * bli_creal(x) - bli_cimag(a) * bli_cimag(x); \
}

// (yr) = ar*xr - ai*xi, i.e. the real part of a * x (double precision).
#define bli_zscal2ros( a, x, yr ) \
{ \
	(yr) = bli_zreal(a) * bli_zreal(x) - bli_zimag(a) * bli_zimag(x); \
}

// (yr) = ar*xr only; no imaginary part of either operand is read.
#define bli_scscal2ros( a, x, yr ) \
{ \
	(yr) = bli_creal(a) * bli_creal(x); \
}

// Double-precision counterpart of bli_scscal2ros: (yr) = ar*xr.
#define bli_dzscal2ros( a, x, yr ) \
{ \
	(yr) = bli_zreal(a) * bli_zreal(x); \
}

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/old/rpi/000077500000000000000000000000001427272030600231125ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/include/level0/old/rpi/bli_scal2jrpis.h000066400000000000000000000040311427272030600261630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef BLIS_SCAL2JRPIS_H
#define BLIS_SCAL2JRPIS_H

// scal2jrpis

// Statement macros that overwrite (yrpi) with
//     (ar + ai)*xr + (ai - ar)*xi
// where ar/ai and xr/xi stand for what the bli_?real/bli_?imag accessors
// (defined outside this header) return for a and x. Expanding the product
// shows this equals Re(a*conj(x)) + Im(a*conj(x)):
//     a*conj(x) = (ar*xr + ai*xi) + i*(ai*xr - ar*xi)
// The grouping of terms is significant for floating-point rounding and is
// left exactly as written.

// Single-precision form.
#define bli_cscal2jrpis( a, x, yrpi ) \
{ \
	(yrpi) = (bli_creal(a)+bli_cimag(a)) * bli_creal(x) + \
	         (bli_cimag(a)-bli_creal(a)) * bli_cimag(x); \
}

// Double-precision form.
#define bli_zscal2jrpis( a, x, yrpi ) \
{ \
	(yrpi) = (bli_zreal(a)+bli_zimag(a)) * bli_zreal(x) + \
	         (bli_zimag(a)-bli_zreal(a)) * bli_zimag(x); \
}

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/old/rpi/bli_scal2rpis.h000066400000000000000000000044251427272030600260200ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_SCAL2RPIS_H
#define BLIS_SCAL2RPIS_H

// scal2rpis

// Statement macros that overwrite (yrpi) with a "real plus imaginary" term.
// Below, ar/ai and xr/xi stand for what the bli_?real/bli_?imag accessors
// (defined outside this header) return for a and x. The grouping of terms is
// significant for floating-point rounding and is left exactly as written.

// (yrpi) = (ar + ai)*xr + (ar - ai)*xi, which equals Re(a*x) + Im(a*x)
// since a*x = (ar*xr - ai*xi) + i*(ar*xi + ai*xr). Single precision.
#define bli_cscal2rpis( a, x, yrpi ) \
{ \
	(yrpi) = (bli_creal(a)+bli_cimag(a)) * bli_creal(x) + \
	         (bli_creal(a)-bli_cimag(a)) * bli_cimag(x); \
}

// Double-precision form of bli_cscal2rpis.
#define bli_zscal2rpis( a, x, yrpi ) \
{ \
	(yrpi) = (bli_zreal(a)+bli_zimag(a)) * bli_zreal(x) + \
	         (bli_zreal(a)-bli_zimag(a)) * bli_zimag(x); \
}

// (yrpi) = ar*xr + ar*xi; the imaginary part of a is never read.
#define bli_scscal2rpis( a, x, yrpi ) \
{ \
	(yrpi) = bli_creal(a) * bli_creal(x) + \
	         bli_creal(a) * bli_cimag(x); \
}

// Double-precision form of bli_scscal2rpis.
#define bli_dzscal2rpis( a, x, yrpi ) \
{ \
	(yrpi) = bli_zreal(a) * bli_zreal(x) + \
	         bli_zreal(a) * bli_zimag(x); \
}

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/000077500000000000000000000000001427272030600221545ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_absq2ris.h000066400000000000000000000040521427272030600247020ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_ABSQ2RIS_H #define BLIS_ABSQ2RIS_H // absq2ris #define bli_sabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define bli_dabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define bli_cabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0F; \ } #define bli_zabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0; \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_abval2ris.h000066400000000000000000000047021427272030600250430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_ABVAL2RIS_H #define BLIS_ABVAL2RIS_H // abval2ris #define bli_sabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabsf(xr); \ } #define bli_dabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabs(xr); \ } #define bli_cabval2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0F; \ } #define bli_zabval2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0; \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_add3ris.h000066400000000000000000000040661427272030600245120ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_ADD3RIS_H #define BLIS_ADD3RIS_H // add3ris #define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_dadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_addjris.h000066400000000000000000000040111427272030600245670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef BLIS_ADDJRIS_H
#define BLIS_ADDJRIS_H

// addjris
//
// Conjugating add on split real/imaginary operands: x := x + conj(a).
// Each macro simply forwards to the bli_?addris macro with the same type
// prefix, passing -(ai) in place of (ai). This header includes nothing
// itself, so the bli_?addris macros must be visible wherever these are
// expanded.

#define bli_saddjris( ar, ai, xr, xi ) bli_saddris( (ar), -(ai), (xr), (xi) )
#define bli_daddjris( ar, ai, xr, xi ) bli_daddris( (ar), -(ai), (xr), (xi) )
#define bli_caddjris( ar, ai, xr, xi ) bli_caddris( (ar), -(ai), (xr), (xi) )
#define bli_zaddjris( ar, ai, xr, xi ) bli_zaddris( (ar), -(ai), (xr), (xi) )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_addris.h000066400000000000000000000040171427272030600244230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef BLIS_ADDRIS_H
#define BLIS_ADDRIS_H

// addris
//
// In-place add on split real/imaginary operands: x := x + a.
// (xr) and (xi) are both read and assigned, so they must be lvalues; note
// that each is expanded twice inside the macro body, so arguments with side
// effects are evaluated twice.

// s/d variants: only the real part is accumulated; ai and xi are never
// expanded.
#define bli_saddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
}

#define bli_daddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
}

// c/z variants: real and imaginary parts are accumulated independently.
#define bli_caddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
	(xi) = (xi) + (ai); \
}

#define bli_zaddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
	(xi) = (xi) + (ai); \
}

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_axmyris.h000066400000000000000000000045061427272030600246540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef BLIS_AXMYRIS_H
#define BLIS_AXMYRIS_H

// axmyris
//
// Scaled subtraction on split real/imaginary operands: y := y - a * x.
// (yr) and (yi) are updated in place with -= and must be lvalues. Inputs may
// be expanded more than once (e.g. (ar) and (xr) in the c/z variants).

// s/d variants: real-only update; ai, xi and yi are never expanded.
#define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
}

#define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
}

// c/z variants: full complex product a * x is subtracted from y.
#define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr) - (ai) * (xi); \
	(yi) -= (ai) * (xr) + (ar) * (xi); \
}

#define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr) - (ai) * (xi); \
	(yi) -= (ai) * (xr) + (ar) * (xi); \
}

// sc/dz variants: only the real part of a is used (ai is never expanded),
// scaling both parts of x before subtracting them from y.
#define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
	(yi) -= (ar) * (xi); \
}

#define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
	(yi) -= (ar) * (xi); \
}

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_axpbyjris.h000066400000000000000000000065041427272030600251730ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. 
// - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. #define bli_saxpbyjris bli_ssssaxpbyjris #define bli_daxpbyjris bli_ddddaxpbyjris #define bli_caxpbyjris bli_ccccaxpbyjris #define bli_zaxpbyjris bli_zzzzaxpbyjris #endif cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_axpbyris.h000066400000000000000000000064261427272030600250240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_AXPBYRIS_H #define BLIS_AXPBYRIS_H // axpbyris #define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. 
// -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyris bli_rxxpbyris #define bli_dsssxpbyris bli_rxxpbyris #define bli_csssxpbyris bli_rxxpbyris #define bli_zsssxpbyris bli_rxxpbyris #define bli_sdssxpbyris bli_rxxpbyris #define bli_ddssxpbyris bli_rxxpbyris #define bli_cdssxpbyris bli_rxxpbyris #define bli_zdssxpbyris bli_rxxpbyris #define bli_scssxpbyris bli_rxxpbyris #define bli_dcssxpbyris bli_rxxpbyris #define bli_ccssxpbyris bli_rxxpbyris #define bli_zcssxpbyris bli_rxxpbyris #define bli_szssxpbyris bli_rxxpbyris #define bli_dzssxpbyris bli_rxxpbyris #define bli_czssxpbyris bli_rxxpbyris #define bli_zzssxpbyris bli_rxxpbyris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyris. #define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_axpyjris.h000066400000000000000000000125511427272030600250300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef BLIS_AXPYJRIS_H
#define BLIS_AXPYJRIS_H

// axpyjris
//
// Conjugating scaled accumulation on split real/imaginary operands:
//   y := y + a * conj(x)
// Five kernels cover the different combinations of which operands carry an
// imaginary part; the alias tables below select one per (a, x, y) triple.
// (yr)/(yi) are updated with += and must be lvalues.

// Real-only kernel: accumulates (ar) * (xr) into (yr); ai, xi and yi are
// never expanded.
#define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
}

// Full complex kernel: both parts of a * conj(x) are accumulated.
#define bli_cxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) + (ai) * (xi); \
	(yi) += (ai) * (xr) - (ar) * (xi); \
}

// Complex inputs, real output: only the real part of a * conj(x) is
// accumulated; yi is never expanded.
#define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) + (ai) * (xi); \
}

// Only the real part of a is used (ai is never expanded); x is conjugated by
// negating (xi).
#define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ar) * -(xi); \
}

// Only the real part of x is used (xi is never expanded), so conjugation has
// no effect; both parts of a scale (xr).
#define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

// y is s: every entry is real-only; the "ro" kernel is used where both a and
// x are c/z, otherwise "rx".
#define bli_sssaxpyjris bli_rxaxpyjris
#define bli_dssaxpyjris bli_rxaxpyjris
#define bli_cssaxpyjris bli_rxaxpyjris
#define bli_zssaxpyjris bli_rxaxpyjris

#define bli_sdsaxpyjris bli_rxaxpyjris
#define bli_ddsaxpyjris bli_rxaxpyjris
#define bli_cdsaxpyjris bli_rxaxpyjris
#define bli_zdsaxpyjris bli_rxaxpyjris

#define bli_scsaxpyjris bli_rxaxpyjris
#define bli_dcsaxpyjris bli_rxaxpyjris
#define bli_ccsaxpyjris bli_roaxpyjris
#define bli_zcsaxpyjris bli_roaxpyjris

#define bli_szsaxpyjris bli_rxaxpyjris
#define bli_dzsaxpyjris bli_rxaxpyjris
#define bli_czsaxpyjris bli_roaxpyjris
#define bli_zzsaxpyjris bli_roaxpyjris

// -- (axy) = (??d) ------------------------------------------------------------

// y is d: same kernel selection as the (??s) table.
#define bli_ssdaxpyjris bli_rxaxpyjris
#define bli_dsdaxpyjris bli_rxaxpyjris
#define bli_csdaxpyjris bli_rxaxpyjris
#define bli_zsdaxpyjris bli_rxaxpyjris

#define bli_sddaxpyjris bli_rxaxpyjris
#define bli_dddaxpyjris bli_rxaxpyjris
#define bli_cddaxpyjris bli_rxaxpyjris
#define bli_zddaxpyjris bli_rxaxpyjris

#define bli_scdaxpyjris bli_rxaxpyjris
#define bli_dcdaxpyjris bli_rxaxpyjris
#define bli_ccdaxpyjris bli_roaxpyjris
#define bli_zcdaxpyjris bli_roaxpyjris

#define bli_szdaxpyjris bli_rxaxpyjris
#define bli_dzdaxpyjris bli_rxaxpyjris
#define bli_czdaxpyjris bli_roaxpyjris
#define bli_zzdaxpyjris bli_roaxpyjris

// -- (axy) = (??c) ------------------------------------------------------------

// y is c: "rx" when both a and x are s/d, "rc" when only a is c/z, "cr" when
// only x is c/z, "cx" when both are c/z.
#define bli_sscaxpyjris bli_rxaxpyjris
#define bli_dscaxpyjris bli_rxaxpyjris
#define bli_cscaxpyjris bli_rcaxpyjris
#define bli_zscaxpyjris bli_rcaxpyjris

#define bli_sdcaxpyjris bli_rxaxpyjris
#define bli_ddcaxpyjris bli_rxaxpyjris
#define bli_cdcaxpyjris bli_rcaxpyjris
#define bli_zdcaxpyjris bli_rcaxpyjris

#define bli_sccaxpyjris bli_craxpyjris
#define bli_dccaxpyjris bli_craxpyjris
#define bli_cccaxpyjris bli_cxaxpyjris
#define bli_zccaxpyjris bli_cxaxpyjris

#define bli_szcaxpyjris bli_craxpyjris
#define bli_dzcaxpyjris bli_craxpyjris
#define bli_czcaxpyjris bli_cxaxpyjris
#define bli_zzcaxpyjris bli_cxaxpyjris

// -- (axy) = (??z) ------------------------------------------------------------

// y is z: same kernel selection as the (??c) table.
#define bli_sszaxpyjris bli_rxaxpyjris
#define bli_dszaxpyjris bli_rxaxpyjris
#define bli_cszaxpyjris bli_rcaxpyjris
#define bli_zszaxpyjris bli_rcaxpyjris

#define bli_sdzaxpyjris bli_rxaxpyjris
#define bli_ddzaxpyjris bli_rxaxpyjris
#define bli_cdzaxpyjris bli_rcaxpyjris
#define bli_zdzaxpyjris bli_rcaxpyjris

#define bli_sczaxpyjris bli_craxpyjris
#define bli_dczaxpyjris bli_craxpyjris
#define bli_cczaxpyjris bli_cxaxpyjris
#define bli_zczaxpyjris bli_cxaxpyjris

#define bli_szzaxpyjris bli_craxpyjris
#define bli_dzzaxpyjris bli_craxpyjris
#define bli_czzaxpyjris bli_cxaxpyjris
#define bli_zzzaxpyjris bli_cxaxpyjris

// Single-letter entry points: a, x and y all share one datatype.
#define bli_saxpyjris bli_sssaxpyjris
#define bli_daxpyjris bli_dddaxpyjris
#define bli_caxpyjris bli_cccaxpyjris
#define bli_zaxpyjris bli_zzzaxpyjris

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_axpyris.h000066400000000000000000000123271427272030600246570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef BLIS_AXPYRIS_H
#define BLIS_AXPYRIS_H

// axpyris
//
// Scaled accumulation on split real/imaginary operands:
//   y := y + a * x
// Five kernels cover the different combinations of which operands carry an
// imaginary part; the alias tables below select one per (a, x, y) triple.
// (yr)/(yi) are updated with += and must be lvalues.

// Real-only kernel: accumulates (ar) * (xr) into (yr); ai, xi and yi are
// never expanded.
#define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
}

// Full complex kernel: both parts of a * x are accumulated.
#define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) - (ai) * (xi); \
	(yi) += (ai) * (xr) + (ar) * (xi); \
}

// Complex inputs, real output: only the real part of a * x is accumulated;
// yi is never expanded.
#define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) - (ai) * (xi); \
}

// Only the real part of a is used (ai is never expanded); it scales both
// parts of x.
#define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ar) * (xi); \
}

// Only the real part of x is used (xi is never expanded); both parts of a
// scale (xr).
#define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

// y is s: every entry is real-only; the "ro" kernel is used where both a and
// x are c/z, otherwise "rx".
#define bli_sssaxpyris bli_rxaxpyris
#define bli_dssaxpyris bli_rxaxpyris
#define bli_cssaxpyris bli_rxaxpyris
#define bli_zssaxpyris bli_rxaxpyris

#define bli_sdsaxpyris bli_rxaxpyris
#define bli_ddsaxpyris bli_rxaxpyris
#define bli_cdsaxpyris bli_rxaxpyris
#define bli_zdsaxpyris bli_rxaxpyris

#define bli_scsaxpyris bli_rxaxpyris
#define bli_dcsaxpyris bli_rxaxpyris
#define bli_ccsaxpyris bli_roaxpyris
#define bli_zcsaxpyris bli_roaxpyris

#define bli_szsaxpyris bli_rxaxpyris
#define bli_dzsaxpyris bli_rxaxpyris
#define bli_czsaxpyris bli_roaxpyris
#define bli_zzsaxpyris bli_roaxpyris

// -- (axy) = (??d) ------------------------------------------------------------

// y is d: same kernel selection as the (??s) table.
#define bli_ssdaxpyris bli_rxaxpyris
#define bli_dsdaxpyris bli_rxaxpyris
#define bli_csdaxpyris bli_rxaxpyris
#define bli_zsdaxpyris bli_rxaxpyris

#define bli_sddaxpyris bli_rxaxpyris
#define bli_dddaxpyris bli_rxaxpyris
#define bli_cddaxpyris bli_rxaxpyris
#define bli_zddaxpyris bli_rxaxpyris

#define bli_scdaxpyris bli_rxaxpyris
#define bli_dcdaxpyris bli_rxaxpyris
#define bli_ccdaxpyris bli_roaxpyris
#define bli_zcdaxpyris bli_roaxpyris

#define bli_szdaxpyris bli_rxaxpyris
#define bli_dzdaxpyris bli_rxaxpyris
#define bli_czdaxpyris bli_roaxpyris
#define bli_zzdaxpyris bli_roaxpyris

// -- (axy) = (??c) ------------------------------------------------------------

// y is c: "rx" when both a and x are s/d, "rc" when only a is c/z, "cr" when
// only x is c/z, "cx" when both are c/z.
#define bli_sscaxpyris bli_rxaxpyris
#define bli_dscaxpyris bli_rxaxpyris
#define bli_cscaxpyris bli_rcaxpyris
#define bli_zscaxpyris bli_rcaxpyris

#define bli_sdcaxpyris bli_rxaxpyris
#define bli_ddcaxpyris bli_rxaxpyris
#define bli_cdcaxpyris bli_rcaxpyris
#define bli_zdcaxpyris bli_rcaxpyris

#define bli_sccaxpyris bli_craxpyris
#define bli_dccaxpyris bli_craxpyris
#define bli_cccaxpyris bli_cxaxpyris
#define bli_zccaxpyris bli_cxaxpyris

#define bli_szcaxpyris bli_craxpyris
#define bli_dzcaxpyris bli_craxpyris
#define bli_czcaxpyris bli_cxaxpyris
#define bli_zzcaxpyris bli_cxaxpyris

// -- (axy) = (??z) ------------------------------------------------------------

// y is z: same kernel selection as the (??c) table.
#define bli_sszaxpyris bli_rxaxpyris
#define bli_dszaxpyris bli_rxaxpyris
#define bli_cszaxpyris bli_rcaxpyris
#define bli_zszaxpyris bli_rcaxpyris

#define bli_sdzaxpyris bli_rxaxpyris
#define bli_ddzaxpyris bli_rxaxpyris
#define bli_cdzaxpyris bli_rcaxpyris
#define bli_zdzaxpyris bli_rcaxpyris

#define bli_sczaxpyris bli_craxpyris
#define bli_dczaxpyris bli_craxpyris
#define bli_cczaxpyris bli_cxaxpyris
#define bli_zczaxpyris bli_cxaxpyris

#define bli_szzaxpyris bli_craxpyris
#define bli_dzzaxpyris bli_craxpyris
#define bli_czzaxpyris bli_cxaxpyris
#define bli_zzzaxpyris bli_cxaxpyris

// Single-letter entry points: a, x and y all share one datatype.
#define bli_saxpyris bli_sssaxpyris
#define bli_daxpyris bli_dddaxpyris
#define bli_caxpyris bli_cccaxpyris
#define bli_zaxpyris bli_zzzaxpyris

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_conjris.h000066400000000000000000000036301427272030600246240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef BLIS_CONJRIS_H
#define BLIS_CONJRIS_H

// conjris
//
// In-place conjugation of a value given as separate real and imaginary
// parts: x := conj(x).

// s/d variants: nothing to do. The body is a single empty statement and
// neither argument is expanded; the macros exist so that all four type
// prefixes can be used uniformly.
#define bli_sconjris( xr, xi ) \
{ \
	; \
}

#define bli_dconjris( xr, xi ) \
{ \
	; \
}

// c/z variants: negate the imaginary part; (xi) must be an lvalue. xr is
// never expanded.
#define bli_cconjris( xr, xi ) \
{ \
	(xi) = -(xi); \
}

#define bli_zconjris( xr, xi ) \
{ \
	(xi) = -(xi); \
}

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_copycjris.h000066400000000000000000000045421427272030600251650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef BLIS_COPYCJRIS_H
#define BLIS_COPYCJRIS_H

// copycjris
//
// Copy with run-time optional conjugation on split real/imaginary operands:
//   y := conj?(x)
// The extra leading argument `conj` selects whether the imaginary part is
// negated. It is tested with bli_is_conj(), which is not defined in this
// header and must be provided by the including translation unit.

// s/d variants: `conj` is never expanded; the call is forwarded unchanged to
// bli_?copyris (also expected to be defined elsewhere).
#define bli_scopycjris( conj, xr, xi, yr, yi ) \
{ \
	bli_scopyris( (xr), (xi), (yr), (yi) ); \
}

#define bli_dcopycjris( conj, xr, xi, yr, yi ) \
{ \
	bli_dcopyris( (xr), (xi), (yr), (yi) ); \
}

// c/z variants: the real part is copied as-is; the imaginary part is
// negated when bli_is_conj( conj ) is true.
#define bli_ccopycjris( conj, xr, xi, yr, yi ) \
{ \
	(yr) = (xr); \
	(yi) = ( bli_is_conj( conj ) ? -(xi) \
	                             :  (xi) ); \
}

#define bli_zcopycjris( conj, xr, xi, yr, yi ) \
{ \
	(yr) = (xr); \
	(yi) = ( bli_is_conj( conj ) ? -(xi) \
	                             :  (xi) ); \
}

// i variant: `conj` is never expanded; forwards to bli_icopyris.
// NOTE(review): bli_icopyris is not among the macros defined in
// bli_copyris.h (which provides only s/d/c/z and their two-letter mixes);
// confirm it is defined elsewhere before using this macro.
#define bli_icopycjris( conj, xr, xi, yr, yi ) \
{ \
	bli_icopyris( (xr), (xi), (yr), (yi) ); \
}

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_copyjris.h000066400000000000000000000063301427272030600250170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_COPYJRIS_H
#define BLIS_COPYJRIS_H

// copyjris
//
// Conjugating copy on split real/imaginary operands: b := conj(a).
// Implemented by forwarding to the matching bli_?copyris macro with -(ai)
// in place of (ai). This header includes nothing itself, so the
// bli_?copyris macros must be visible wherever these are expanded.

#define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) )
#define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) )
#define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) )
#define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) )

// Two-letter (mixed) variants. The first letter describes the source a, the
// second selects which single-letter macro above performs the copy:
// - first letter s: the caller's ai is discarded and 0.0F is passed instead;
// - first letter d: the caller's ai is discarded and 0.0 is passed instead;
// - first letter c or z: ai is passed through unchanged.
// NOTE: for sc, dc, sz and dz the literal zero goes through the -(ai)
// negation above, so the imaginary part actually stored is -(0.0F) or
// -(0.0) (negative zero on IEEE-754 targets), which compares equal to zero.

#define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi )
#define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi )
#define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )
#define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )

#define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi )
#define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi )
#define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )
#define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )

#define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi )
#define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi )
#define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )
#define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )

#define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi )
#define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi )
#define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )
#define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_copyris.h000066400000000000000000000062171427272030600246510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_COPYRIS_H
#define BLIS_COPYRIS_H

// copyris
//
// Copy a scalar that is passed as separate real and imaginary parts:
// (br, bi) := (ar, ai). Every macro expands to a brace-enclosed block of
// assignment statements, so br (and bi, where written) must be lvalues.

// Real variants: only br is assigned; ai and bi are accepted but not
// referenced in the expansion.
#define bli_scopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
}

#define bli_dcopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
}

// Complex variants: both the real and the imaginary part are assigned.
#define bli_ccopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
	(bi) = (ai); \
}

#define bli_zcopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
	(bi) = (ai); \
}

// Two-letter variants. The second letter selects the bli_?copyris macro that
// does the work. When the first letter is s or d, the caller's ai argument is
// discarded and a literal zero (0.0F for s, 0.0 for d) is passed in its place;
// when it is c or z, ai is forwarded unchanged.
// NOTE(review): presumably the first letter is the type of a and the second
// the type of b -- confirm against the callers.

// Second letter s: forwards to bli_scopyris (imaginary part never stored).
#define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi )
#define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi )
#define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )
#define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )

// Second letter d: forwards to bli_dcopyris (imaginary part never stored).
#define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi )
#define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi )
#define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )
#define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )

// Second letter c: forwards to bli_ccopyris; bi receives the literal zero
// when the first letter is s or d.
#define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi )
#define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi )
#define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )
#define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )

// Second letter z: forwards to bli_zcopyris; bi receives the literal zero
// when the first letter is s or d.
#define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi )
#define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi )
#define bli_czcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )
#define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_eqris.h000066400000000000000000000060741427272030600243050ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_EQRIS_H
#define BLIS_EQRIS_H

// eqris (passed by value)
//
// Equality tests on scalars that are passed as separate real and imaginary
// parts. Each macro expands to a parenthesized expression. The s, d and i
// variants compare only the real parts (ai and bi are not referenced). The c
// and z variants require both parts to be equal; because of the && the
// imaginary operands are only evaluated when the real parts already match.

#define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) )

// eq1ris
//
// Test (ar, ai) against the constant (1, 0). The s and c variants use float
// literals, d and z use double literals, and i uses integer literals.

#define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F )
#define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 )
#define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F )
#define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 )
#define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 )

// eq0ris
//
// Test (ar, ai) against the constant (0, 0).

#define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F )
#define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 )
#define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F )
#define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 )
#define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 )

// eqm1ris
//
// Test (ar, ai) against the constant (-1, 0).

#define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F )
#define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 )
#define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F )
#define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 )
#define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_invertris.h000066400000000000000000000045071427272030600252060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_INVERTRIS_H
#define BLIS_INVERTRIS_H

// invertris
//
// In-place reciprocal of a scalar that is passed as separate real and
// imaginary parts: (xr, xi) := 1 / (xr + i*xi). Each macro expands to a
// brace-enclosed block, and xr (and xi for c/z) must be lvalues.
// No check for a zero operand is performed here.

// Real variants: xr := 1 / xr; xi is accepted but not referenced.
#define bli_sinvertris( xr, xi ) \
{ \
	(xr) = 1.0F / (xr); \
}

#define bli_dinvertris( xr, xi ) \
{ \
	(xr) = 1.0 / (xr); \
}

// Complex variants. With s = bli_fmaxabs( xr, xi ):
//   temp = ( xr*xr + xi*xi ) / s
//   xr  := ( xr / s ) / temp =  xr / ( xr*xr + xi*xi )
//   xi  := -( xi / s ) / temp = -xi / ( xr*xr + xi*xi )
// bli_fmaxabs is defined elsewhere; presumably it returns the larger of
// |xr| and |xi| and the pre-scaling is there to limit overflow/underflow in
// the denominator -- TODO confirm.
// The block declares locals named s, xr_s, xi_s and temp, so an argument
// expression that itself uses one of those identifiers would refer to the
// macro's local instead of the caller's variable.
#define bli_cinvertris( xr, xi ) \
{ \
	float s = bli_fmaxabs( (xr), (xi) ); \
	float xr_s = (xr) / s; \
	float xi_s = (xi) / s; \
	float temp = ( xr_s * (xr) + xi_s * (xi) ); \
	/* Both results are built from the saved xr_s / xi_s copies, so */ \
	/* overwriting xr first does not disturb the value stored to xi. */ \
	(xr) = xr_s / temp; \
	(xi) = -xi_s / temp; \
}

#define bli_zinvertris( xr, xi ) \
{ \
	double s = bli_fmaxabs( (xr), (xi) ); \
	double xr_s = (xr) / s; \
	double xi_s = (xi) / s; \
	double temp = ( xr_s * (xr) + xi_s * (xi) ); \
	/* Same scheme as bli_cinvertris, with double temporaries. */ \
	(xr) = xr_s / temp; \
	(xi) = -xi_s / temp; \
}

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_invscaljris.h000066400000000000000000000043501427272030600255040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_INVSCALJRIS_H
#define BLIS_INVSCALJRIS_H

// invscaljris
//
// Inverse-scale by the conjugate of a, on scalars passed as separate real and
// imaginary parts. Each macro forwards to the same-prefix bli_?invscalris
// macro with the imaginary part of a negated, i.e. x := x / conj(a) for the
// complex variants. The s, d, sc and dz forms of bli_?invscalris use only ar,
// so for those prefixes the negated ai has no effect on the result.

#define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) )
#define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) )
#define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) )
#define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), (xi) )

// Mixed forms (sc, dz): forwarded to the macros that divide both parts of x
// by ar.
#define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) )
#define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_invscalris.h000066400000000000000000000052621427272030600253350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_INVSCALRIS_H
#define BLIS_INVSCALRIS_H

// invscalris
//
// In-place inverse scaling on scalars passed as separate real and imaginary
// parts: x := x / a. Each macro expands to a brace-enclosed block; xr (and xi
// where written) must be lvalues. No check for a zero divisor is performed.

// Real variants: xr := xr / ar; ai and xi are accepted but not referenced.
#define bli_sinvscalris( ar, ai, xr, xi ) \
{ \
	(xr) /= (ar); \
}

#define bli_dinvscalris( ar, ai, xr, xi ) \
{ \
	(xr) /= (ar); \
}

// Complex variants. With s = bli_fmaxabs( ar, ai ):
//   temp = ( ar*ar + ai*ai ) / s
//   xr  := ( xr*ar + xi*ai ) / ( ar*ar + ai*ai )
//   xi  := ( xi*ar - xr*ai ) / ( ar*ar + ai*ai )
// bli_fmaxabs is defined elsewhere; presumably it returns the larger of
// |ar| and |ai| and the pre-scaling is there to limit overflow/underflow in
// the denominator -- TODO confirm.
// The block declares locals named s, ar_s, ai_s, xrt and temp, so an argument
// expression that itself uses one of those identifiers would refer to the
// macro's local instead of the caller's variable.
#define bli_cinvscalris( ar, ai, xr, xi ) \
{ \
	float s = bli_fmaxabs( (ar), (ai) ); \
	float ar_s = (ar) / s; \
	float ai_s = (ai) / s; \
	/* Save the original xr: it is overwritten before xi is computed. */ \
	float xrt = (xr); \
	float temp = ( ar_s * (ar) + ai_s * (ai) ); \
	(xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \
	(xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \
}

#define bli_zinvscalris( ar, ai, xr, xi ) \
{ \
	double s = bli_fmaxabs( (ar), (ai) ); \
	double ar_s = (ar) / s; \
	double ai_s = (ai) / s; \
	/* Save the original xr: it is overwritten before xi is computed. */ \
	double xrt = (xr); \
	double temp = ( ar_s * (ar) + ai_s * (ai) ); \
	(xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \
	(xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \
}

// Mixed forms (sc, dz): both parts of x are divided by ar alone; ai is
// accepted but not referenced.
#define bli_scinvscalris( ar, ai, xr, xi ) \
{ \
	(xr) /= (ar); \
	(xi) /= (ar); \
}

#define bli_dzinvscalris( ar, ai, xr, xi ) \
{ \
	(xr) /= (ar); \
	(xi) /= (ar); \
}

#endif 
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_neg2ris.h000066400000000000000000000037621427272030600245340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_NEG2RIS_H
#define BLIS_NEG2RIS_H

// neg2ris
//
// Negating copy on scalars passed as separate real and imaginary parts:
// (br, bi) := -(ar, ai). Each macro expands to a brace-enclosed block; br
// (and bi for c/z) must be lvalues.

// Real variants: only br is assigned; ai and bi are not referenced.
#define bli_sneg2ris( ar, ai, br, bi ) \
{ \
	(br) = -(ar); \
}

#define bli_dneg2ris( ar, ai, br, bi ) \
{ \
	(br) = -(ar); \
}

// Complex variants: both parts are negated and stored.
#define bli_cneg2ris( ar, ai, br, bi ) \
{ \
	(br) = -(ar); \
	(bi) = -(ai); \
}

#define bli_zneg2ris( ar, ai, br, bi ) \
{ \
	(br) = -(ar); \
	(bi) = -(ai); \
}

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_scal2jris.h000066400000000000000000000215651427272030600250600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_SCAL2JRIS_H
#define BLIS_SCAL2JRIS_H

// scal2jris
//
// y := a * conj(x) on scalars passed as separate real and imaginary parts.
// The five kernels below differ only in which components they read and
// write; the typed bli_???scal2jris names defined after them pick one kernel
// per (a, x, y) type combination. Each kernel expands to a brace-enclosed
// block, and yr (and yi where written) must be lvalues.

// Real product only: yr = ar * xr. ai, xi and yi are not referenced.
#define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) = (ar) * (xr); \
}

// Full complex product with conjugated x:
//   yr = ar*xr + ai*xi
//   yi = ai*xr - ar*xi
#define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) = (ar) * (xr) + (ai) * (xi); \
	(yi) = (ai) * (xr) - (ar) * (xi); \
}

// Real part of the complex product only: yr = ar*xr + ai*xi.
// yi is not referenced.
#define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) = (ar) * (xr) + (ai) * (xi); \
}

// Only the real part of a is used: yr = ar*xr, yi = ar * -xi.
// ai is not referenced.
#define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) = (ar) * (xr); \
	(yi) = (ar) * -(xi); \
}

// Only the real part of x is used: yr = ar*xr, yi = ai*xr.
// xi is not referenced, so the conjugation has no effect here.
#define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) = (ar) * (xr); \
	(yi) = (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// Typed entry points for scal2jris. Each three-letter prefix (a, x, y) maps
// onto one of the rx/cx/ro/cr/rc kernels. Only the real-versus-complex
// domain of each operand affects the choice: s and d always select the same
// kernel as each other, as do c and z.

// -- (axy) = (??s) ------------------------------------------------------------
// y is real: the ro kernel is used when both a and x are complex (c or z),
// the rx kernel otherwise.

#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??d) ------------------------------------------------------------
// y is real: same kernel selection as the (??s) group.

#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??c) ------------------------------------------------------------
// y is complex: rx when a and x are both real (only yr is written, yi is left
// as it was), rc when only a is complex, cr when only x is complex, and cx
// when both are complex.

#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??z) ------------------------------------------------------------
// y is complex: same kernel selection as the (??c) group.

#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

// Single-letter forms: a, x and y all share the one type named by the prefix.

#define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_scal2ris.h000066400000000000000000000125371427272030600247050ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_SCAL2RIS_H
#define BLIS_SCAL2RIS_H

// scal2ris
//
// y := a * x on scalars passed as separate real and imaginary parts. The five
// kernels below differ only in which components they read and write; the
// typed bli_???scal2ris names defined after them pick one kernel per
// (a, x, y) type combination. Each kernel expands to a brace-enclosed block,
// and yr (and yi where written) must be lvalues.

// Real product only: yr = ar * xr. ai, xi and yi are not referenced.
#define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) = (ar) * (xr); \
}

// Full complex product:
//   yr = ar*xr - ai*xi
//   yi = ai*xr + ar*xi
#define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) = (ar) * (xr) - (ai) * (xi); \
	(yi) = (ai) * (xr) + (ar) * (xi); \
}

// Real part of the complex product only: yr = ar*xr - ai*xi.
// yi is not referenced.
#define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) = (ar) * (xr) - (ai) * (xi); \
}

// Only the real part of a is used: yr = ar*xr, yi = ar*xi.
// ai is not referenced.
#define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) = (ar) * (xr); \
	(yi) = (ar) * (xi); \
}

// Only the real part of x is used: yr = ar*xr, yi = ai*xr.
// xi is not referenced.
#define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) = (ar) * (xr); \
	(yi) = (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2ris bli_rxscal2ris #define bli_dssscal2ris bli_rxscal2ris #define bli_cssscal2ris bli_rxscal2ris #define bli_zssscal2ris bli_rxscal2ris #define bli_sdsscal2ris bli_rxscal2ris #define bli_ddsscal2ris bli_rxscal2ris #define bli_cdsscal2ris bli_rxscal2ris #define bli_zdsscal2ris bli_rxscal2ris #define bli_scsscal2ris bli_rxscal2ris #define bli_dcsscal2ris bli_rxscal2ris #define bli_ccsscal2ris bli_roscal2ris #define bli_zcsscal2ris bli_roscal2ris #define bli_szsscal2ris bli_rxscal2ris #define bli_dzsscal2ris bli_rxscal2ris #define bli_czsscal2ris bli_roscal2ris #define bli_zzsscal2ris bli_roscal2ris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2ris bli_rxscal2ris #define bli_dsdscal2ris bli_rxscal2ris #define bli_csdscal2ris bli_rxscal2ris #define bli_zsdscal2ris bli_rxscal2ris #define bli_sddscal2ris bli_rxscal2ris #define bli_dddscal2ris bli_rxscal2ris #define bli_cddscal2ris bli_rxscal2ris #define bli_zddscal2ris bli_rxscal2ris #define bli_scdscal2ris bli_rxscal2ris #define bli_dcdscal2ris bli_rxscal2ris #define bli_ccdscal2ris bli_roscal2ris #define bli_zcdscal2ris bli_roscal2ris #define bli_szdscal2ris bli_rxscal2ris #define bli_dzdscal2ris bli_rxscal2ris #define bli_czdscal2ris bli_roscal2ris #define bli_zzdscal2ris bli_roscal2ris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2ris bli_rxscal2ris #define bli_dscscal2ris bli_rxscal2ris #define bli_cscscal2ris bli_rcscal2ris #define bli_zscscal2ris bli_rcscal2ris #define bli_sdcscal2ris bli_rxscal2ris #define bli_ddcscal2ris bli_rxscal2ris #define bli_cdcscal2ris bli_rcscal2ris #define bli_zdcscal2ris bli_rcscal2ris #define bli_sccscal2ris bli_crscal2ris #define bli_dccscal2ris bli_crscal2ris #define bli_cccscal2ris bli_cxscal2ris #define bli_zccscal2ris bli_cxscal2ris #define bli_szcscal2ris bli_crscal2ris 
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_scal2ris_mxn.h000066400000000000000000000122211427272030600255550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ /* Treat the micro-panel as panel_dim x panel_len and column-stored (unit row stride). 
*/ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else /* if ( bli_is_noconj( conjx ) ) */ { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ /* Treat the micro-panel as panel_dim x panel_len and column-stored (unit row stride). 
*/ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else /* if ( bli_is_noconj( conjx ) ) */ { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_scalcjris.h000066400000000000000000000051401427272030600251300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCALCJRIS_H #define BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_scaljris.h000066400000000000000000000042731427272030600247730ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_SCALJRIS_H
#define BLIS_SCALJRIS_H

// scaljris
//
// Conjugating in-place scale: each macro forwards to the scalris macro of
// the same type prefix with the imaginary part of a negated, i.e. it scales
// x by conj(a). The arguments are otherwise passed through unchanged.

#define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) )
#define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) )
#define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) )
#define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) )

// Mixed variants (sc, dz) forward to the corresponding mixed scalris macro.
#define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) )
#define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_scalris.h000066400000000000000000000045661427272030600246260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_scalris_mxn_uplo.h000066400000000000000000000063551427272030600265450ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ 
} #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_set0ris.h000066400000000000000000000037251427272030600245530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_SET0RIS_H
#define BLIS_SET0RIS_H

// set0ris
//
// Each macro forwards to the copyris macro of the same type prefix with a
// zero literal for both source components and (xr, xi) as the destination.
// The s and c variants pass float literals (0.0F); d and z pass double
// literals (0.0).

#define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi )
#define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi )
#define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi )
#define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_sqrt2ris.h000066400000000000000000000053731427272030600247540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_subjris.h000066400000000000000000000040111427272030600246300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_SUBJRIS_H
#define BLIS_SUBJRIS_H

// subjris
//
// Conjugating subtraction: each macro forwards to the subris macro of the
// same type prefix with the imaginary part of a negated, i.e. it subtracts
// conj(a) from x. The other arguments are passed through unchanged.

#define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) )
#define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) )
#define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) )
#define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) )

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_subris.h000066400000000000000000000040171427272030600244640ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_SUBRIS_H
#define BLIS_SUBRIS_H

// subris
//
// In-place subtraction x := x - a on split real/imaginary components.

// s: xr := xr - ar. ai and xi are not referenced.
#define bli_ssubris( ar, ai, xr, xi ) \
{ \
    (xr) = (xr) - (ar); \
}

// d: xr := xr - ar. ai and xi are not referenced.
#define bli_dsubris( ar, ai, xr, xi ) \
{ \
    (xr) = (xr) - (ar); \
}

// c: subtracts component-wise, updating both xr and xi.
#define bli_csubris( ar, ai, xr, xi ) \
{ \
    (xr) = (xr) - (ar); \
    (xi) = (xi) - (ai); \
}

// z: subtracts component-wise, updating both xr and xi.
#define bli_zsubris( ar, ai, xr, xi ) \
{ \
    (xr) = (xr) - (ar); \
    (xi) = (xi) - (ai); \
}

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_swapris.h000066400000000000000000000047431427272030600246530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_xpbyjris.h000066400000000000000000000125231427272030600250300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_XPBYJRIS_H
#define BLIS_XPBYJRIS_H

// xpbyjris
//
// Scalar "y := conj(x) + b * y" kernels on split real/imaginary components.
// Three kernels are defined; the type-specific names further down are plain
// aliases that select one of them.

// Real-only update: yr := xr + br * yr. xi, bi and yi are not referenced.
#define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \
{ \
    (yr) = (xr) + (br) * (yr); \
}

// Full complex update with x conjugated (xi enters negated). Both results
// are computed into temporaries first because each reads the old yr and yi.
// __typeof__ gives the temporaries the type of the destination operands.
#define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \
{ \
    const __typeof__(yr) yt_r =  (xr) + (br) * (yr) - (bi) * (yi); \
    const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \
    (yr) = yt_r; \
    (yi) = yt_i; \
}

// Complex update with b treated as real (bi is not referenced) and x
// conjugated.
#define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \
{ \
    const __typeof__(yr) yt_r =  (xr) + (br) * (yr); \
    const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \
    (yr) = yt_r; \
    (yi) = yt_i; \
}

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.

// -- (xby) = (??s) ------------------------------------------------------------

// y is real here, so every combination uses the real-only kernel.
#define bli_sssxpbyjris bli_rxxpbyjris
#define bli_dssxpbyjris bli_rxxpbyjris
#define bli_cssxpbyjris bli_rxxpbyjris
#define bli_zssxpbyjris bli_rxxpbyjris
#define bli_sdsxpbyjris bli_rxxpbyjris
#define bli_ddsxpbyjris bli_rxxpbyjris
#define bli_cdsxpbyjris bli_rxxpbyjris
#define bli_zdsxpbyjris bli_rxxpbyjris
#define bli_scsxpbyjris bli_rxxpbyjris
#define bli_dcsxpbyjris bli_rxxpbyjris
#define bli_ccsxpbyjris bli_rxxpbyjris
#define bli_zcsxpbyjris bli_rxxpbyjris
#define bli_szsxpbyjris bli_rxxpbyjris
#define bli_dzsxpbyjris bli_rxxpbyjris
#define bli_czsxpbyjris bli_rxxpbyjris
#define bli_zzsxpbyjris bli_rxxpbyjris

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbyjris bli_rxxpbyjris
#define bli_dsdxpbyjris bli_rxxpbyjris
#define bli_csdxpbyjris bli_rxxpbyjris
#define bli_zsdxpbyjris bli_rxxpbyjris
#define bli_sddxpbyjris bli_rxxpbyjris
#define bli_dddxpbyjris bli_rxxpbyjris
#define bli_cddxpbyjris bli_rxxpbyjris
#define bli_zddxpbyjris bli_rxxpbyjris
#define bli_scdxpbyjris bli_rxxpbyjris
#define bli_dcdxpbyjris bli_rxxpbyjris
#define bli_ccdxpbyjris bli_rxxpbyjris
#define bli_zcdxpbyjris bli_rxxpbyjris
#define bli_szdxpbyjris bli_rxxpbyjris
#define bli_dzdxpbyjris bli_rxxpbyjris
#define bli_czdxpbyjris bli_rxxpbyjris
#define bli_zzdxpbyjris bli_rxxpbyjris

// -- (xby) = (??c) ------------------------------------------------------------

// NOTE(review): with x and b both real (ss?, ds?, sd?, dd?) the alias is the
// real-only kernel, which leaves yi unmodified (it is not scaled by br).
// With x real and b complex the full kernel is used and still reads xi --
// presumably callers pass a zero there; verify against the call sites.
#define bli_sscxpbyjris bli_rxxpbyjris
#define bli_dscxpbyjris bli_rxxpbyjris
#define bli_cscxpbyjris bli_crxpbyjris
#define bli_zscxpbyjris bli_crxpbyjris
#define bli_sdcxpbyjris bli_rxxpbyjris
#define bli_ddcxpbyjris bli_rxxpbyjris
#define bli_cdcxpbyjris bli_crxpbyjris
#define bli_zdcxpbyjris bli_crxpbyjris
#define bli_sccxpbyjris bli_cxxpbyjris
#define bli_dccxpbyjris bli_cxxpbyjris
#define bli_cccxpbyjris bli_cxxpbyjris
#define bli_zccxpbyjris bli_cxxpbyjris
#define bli_szcxpbyjris bli_cxxpbyjris
#define bli_dzcxpbyjris bli_cxxpbyjris
#define bli_czcxpbyjris bli_cxxpbyjris
#define bli_zzcxpbyjris bli_cxxpbyjris

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbyjris bli_rxxpbyjris
#define bli_dszxpbyjris bli_rxxpbyjris
#define bli_cszxpbyjris bli_crxpbyjris
#define bli_zszxpbyjris bli_crxpbyjris
#define bli_sdzxpbyjris bli_rxxpbyjris
#define bli_ddzxpbyjris bli_rxxpbyjris
#define bli_cdzxpbyjris bli_crxpbyjris
#define bli_zdzxpbyjris bli_crxpbyjris
#define bli_sczxpbyjris bli_cxxpbyjris
#define bli_dczxpbyjris bli_cxxpbyjris
#define bli_cczxpbyjris bli_cxxpbyjris
#define bli_zczxpbyjris bli_cxxpbyjris
#define bli_szzxpbyjris bli_cxxpbyjris
#define bli_dzzxpbyjris bli_cxxpbyjris
#define bli_czzxpbyjris bli_cxxpbyjris
#define bli_zzzxpbyjris bli_cxxpbyjris

// Single-letter shorthands for the homogeneous-type variants.
#define bli_sxpbyjris bli_sssxpbyjris
#define bli_dxpbyjris bli_dddxpbyjris
#define bli_cxpbyjris bli_cccxpbyjris
#define bli_zxpbyjris bli_zzzxpbyjris

#endif
cython-blis-0.9.1/blis/_src/frame/include/level0/ri/bli_xpbyris.h000066400000000000000000000123011427272030600246500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef BLIS_XPBYRIS_H
#define BLIS_XPBYRIS_H

// xpbyris
//
// Scalar macros that overwrite y with x + b * y. Every operand is passed as
// two separate arguments: its real part (?r) and its imaginary part (?i).
//
// Usage notes that follow from the definitions below:
// - Each macro expands to a brace-enclosed block, not an expression, so it
//   cannot be used where a value is required.
// - Arguments may appear more than once in the expansion, so arguments with
//   side effects are evaluated more than once.
// - The complex variants declare temporaries named yt_r and yt_i; arguments
//   must not themselves be spelled yt_r or yt_i.
// - __typeof__ is a compiler extension rather than standard C.

// Real update. Only the real parts are referenced:
//   yr := xr + br * yr
// xi, bi and yi are accepted so that all variants share one signature, but
// they do not appear in the expansion.
#define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \
{ \
	(yr) = (xr) + (br) * (yr); \
}

// Complex update with a complex b:
//   yr := xr + ( br * yr - bi * yi )
//   yi := xi + ( bi * yr + br * yi )
// Both results are computed into temporaries before either is stored, so
// the imaginary part is formed from the original (not the updated) yr.
#define bli_cxxpbyris( xr, xi, br, bi, yr, yi ) \
{ \
	const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \
	const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \
	(yr) = yt_r; \
	(yi) = yt_i; \
}

// Complex update with a real b (bi is not referenced):
//   yr := xr + br * yr
//   yi := xi + br * yi
#define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \
{ \
	const __typeof__(yr) yt_r = (xr) + (br) * (yr); \
	const __typeof__(yi) yt_i = (xi) + (br) * (yi); \
	(yr) = yt_r; \
	(yi) = yt_i; \
}

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
//
// Dispatch rule visible in the tables below:
// - y is s or d:            always the real variant (rxx).
// - y is c or z, b is c/z:  the complex-b variant (cxx).
// - y is c or z, b is s/d:  crx if x is c/z, otherwise rxx.

// -- (xby) = (??s) ------------------------------------------------------------

#define bli_sssxpbyris  bli_rxxpbyris
#define bli_dssxpbyris  bli_rxxpbyris
#define bli_cssxpbyris  bli_rxxpbyris
#define bli_zssxpbyris  bli_rxxpbyris

#define bli_sdsxpbyris  bli_rxxpbyris
#define bli_ddsxpbyris  bli_rxxpbyris
#define bli_cdsxpbyris  bli_rxxpbyris
#define bli_zdsxpbyris  bli_rxxpbyris

#define bli_scsxpbyris  bli_rxxpbyris
#define bli_dcsxpbyris  bli_rxxpbyris
#define bli_ccsxpbyris  bli_rxxpbyris
#define bli_zcsxpbyris  bli_rxxpbyris

#define bli_szsxpbyris  bli_rxxpbyris
#define bli_dzsxpbyris  bli_rxxpbyris
#define bli_czsxpbyris  bli_rxxpbyris
#define bli_zzsxpbyris  bli_rxxpbyris

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbyris  bli_rxxpbyris
#define bli_dsdxpbyris  bli_rxxpbyris
#define bli_csdxpbyris  bli_rxxpbyris
#define bli_zsdxpbyris  bli_rxxpbyris

#define bli_sddxpbyris  bli_rxxpbyris
#define bli_dddxpbyris  bli_rxxpbyris
#define bli_cddxpbyris  bli_rxxpbyris
#define bli_zddxpbyris  bli_rxxpbyris

#define bli_scdxpbyris  bli_rxxpbyris
#define bli_dcdxpbyris  bli_rxxpbyris
#define bli_ccdxpbyris  bli_rxxpbyris
#define bli_zcdxpbyris  bli_rxxpbyris

#define bli_szdxpbyris  bli_rxxpbyris
#define bli_dzdxpbyris  bli_rxxpbyris
#define bli_czdxpbyris  bli_rxxpbyris
#define bli_zzdxpbyris  bli_rxxpbyris

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbyris  bli_rxxpbyris
#define bli_dscxpbyris  bli_rxxpbyris
#define bli_cscxpbyris  bli_crxpbyris
#define bli_zscxpbyris  bli_crxpbyris

#define bli_sdcxpbyris  bli_rxxpbyris
#define bli_ddcxpbyris  bli_rxxpbyris
#define bli_cdcxpbyris  bli_crxpbyris
#define bli_zdcxpbyris  bli_crxpbyris

#define bli_sccxpbyris  bli_cxxpbyris
#define bli_dccxpbyris  bli_cxxpbyris
#define bli_cccxpbyris  bli_cxxpbyris
#define bli_zccxpbyris  bli_cxxpbyris

#define bli_szcxpbyris  bli_cxxpbyris
#define bli_dzcxpbyris  bli_cxxpbyris
#define bli_czcxpbyris  bli_cxxpbyris
#define bli_zzcxpbyris  bli_cxxpbyris

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbyris  bli_rxxpbyris
#define bli_dszxpbyris  bli_rxxpbyris
#define bli_cszxpbyris  bli_crxpbyris
#define bli_zszxpbyris  bli_crxpbyris

#define bli_sdzxpbyris  bli_rxxpbyris
#define bli_ddzxpbyris  bli_rxxpbyris
#define bli_cdzxpbyris  bli_crxpbyris
#define bli_zdzxpbyris  bli_crxpbyris

#define bli_sczxpbyris  bli_cxxpbyris
#define bli_dczxpbyris  bli_cxxpbyris
#define bli_cczxpbyris  bli_cxxpbyris
#define bli_zczxpbyris  bli_cxxpbyris

#define bli_szzxpbyris  bli_cxxpbyris
#define bli_dzzxpbyris  bli_cxxpbyris
#define bli_czzxpbyris  bli_cxxpbyris
#define bli_zzzxpbyris  bli_cxxpbyris

// Single-letter shorthands for the cases where x, b and y share one type.
#define bli_sxpbyris    bli_sssxpbyris
#define bli_dxpbyris    bli_dddxpbyris
#define bli_cxpbyris    bli_cccxpbyris
#define bli_zxpbyris    bli_zzzxpbyris

#endif
cython-blis-0.9.1/blis/_src/frame/thread/000077500000000000000000000000001427272030600201775ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/thread/bli_l3_decor.h000066400000000000000000000051141427272030600226710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_L3_DECOR_H
#define BLIS_L3_DECOR_H

// -- conventional definitions -------------------------------------------------

// Level-3 internal function type.
// This is the signature of the `func` argument of bli_l3_thread_decorator():
// a function returning void that receives the scalars (alpha, beta), the
// three operand objects (a, b, c), the context, the runtime object, a control
// tree and the calling thread's thrinfo_t.
typedef void (*l3int_t)
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

// Level-3 thread decorator prototype.
// Exactly one definition is expected to be compiled in, selected by the
// threading configuration: the _single.c, _openmp.c and _pthreads.c variants
// each define this function under mutually distinct preprocessor guards.
// Compared with l3int_t, the decorator additionally takes the operation
// family and does not take a thrinfo_t (each variant creates its own).
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// Include definitions specific to the method of multithreading for the
// conventional code path.
#include "bli_l3_decor_single.h"
#include "bli_l3_decor_openmp.h"
#include "bli_l3_decor_pthreads.h"

#endif
cython-blis-0.9.1/blis/_src/frame/thread/bli_l3_decor_openmp.c000066400000000000000000000211261427272030600242430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

#ifdef BLIS_ENABLE_OPENMP

// Define a dummy function bli_l3_thread_entry(), which is needed in the
// pthreads version, so that when building Windows DLLs (with OpenMP enabled
// or no multithreading) we don't risk having an unresolved symbol.
// The stub ignores its argument and always returns NULL.
void* bli_l3_thread_entry( void* data_void ) { return NULL; }

// Uncomment to collect every thread's thrinfo_t, print the thread paths after
// the parallel region, and then terminate the process via exit(1).
//#define PRINT_THRINFO

// OpenMP variant of the level-3 thread decorator.
//
// Opens an OpenMP parallel region that requests the number of threads stored
// in rntm, and has every thread of that region invoke func once on
// thread-local aliases of a, b and c.
//
// Parameters:
// - func:   function each thread calls to perform the operation.
// - family: operation family; forwarded to bli_l3_cntl_create_if() and, when
//           PRINT_THRINFO is defined, used to choose the print routine.
// - alpha, beta, cntx: forwarded unchanged to func.
// - a, b:   operands. Their pack schema fields are read and then reset to
//           BLIS_NOT_PACKED, so the caller's objects are modified.
// - c:      operand; only aliased, not modified directly by this function.
// - rntm:   supplies the thread count; its sba pool (for thread 0) and pba
//           fields are set here before each thread copies it.
// - cntl:   forwarded to bli_l3_cntl_create_if(), which yields the control
//           tree (cntl_use) actually passed to func.
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
	// This is part of a hack to support mixed domain in bli_gemm_front().
	// Sometimes we need to specify a non-standard schema for A and B, and
	// we decided to transmit them via the schema field in the obj_t's
	// rather than pass them in as function parameters. Once the values
	// have been read, we immediately reset them back to their expected
	// values for unpacked objects.
	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );
	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );

	// Query the total number of threads from the rntm_t object.
	const dim_t n_threads = bli_rntm_num_threads( rntm );

	#ifdef PRINT_THRINFO
	// Debug only: one thrinfo_t* slot per thread, filled inside the parallel
	// region and printed afterwards. This array is not freed because the
	// debug path ends in exit(1).
	err_t r_val;
	thrinfo_t** threads = bli_malloc_intl( n_threads * sizeof( thrinfo_t* ), &r_val );
	#endif

	// NOTE: The sba was initialized in bli_init().

	// Check out an array_t from the small block allocator. This is done
	// with an internal lock to ensure only one application thread accesses
	// the sba at a time. bli_sba_checkout_array() will also automatically
	// resize the array_t, if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Access the pool_t* for thread 0 and embed it into the rntm. We do
	// this up-front only so that we have the rntm_t.sba_pool field
	// initialized and ready for the global communicator creation below.
	bli_sba_rntm_set_pool( 0, array, rntm );

	// Set the packing block allocator field of the rntm. This will be
	// inherited by all of the child threads when they make local copies of
	// the rntm below.
	bli_pba_rntm_set_pba( rntm );

	// Allocate a global communicator for the root thrinfo_t structures.
	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );

	_Pragma( "omp parallel num_threads(n_threads)" )
	{
		// Create a thread-local copy of the master thread's rntm_t. This is
		// necessary since we want each thread to be able to track its own
		// small block pool_t as it executes down the function stack.
		rntm_t           rntm_l = *rntm;
		rntm_t* restrict rntm_p = &rntm_l;

		// Query the thread's id from OpenMP.
		const dim_t tid = omp_get_thread_num();

		// Check for a somewhat obscure OpenMP thread-mismatch issue.
		// This may rewrite gl_comm and rntm_p for single-threaded execution,
		// so it must run before they are used below.
		bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p );

		// Use the thread id to access the appropriate pool_t* within the
		// array_t, and use it to set the sba_pool field within the rntm_t.
		// If the pool_t* element within the array_t is NULL, it will first
		// be allocated/initialized.
		bli_sba_rntm_set_pool( tid, array, rntm_p );

		obj_t           a_t, b_t, c_t;
		cntl_t*         cntl_use;
		thrinfo_t*      thread;

		// Alias thread-local copies of A, B, and C. These will be the objects
		// we pass down the algorithmic function stack. Making thread-local
		// aliases is highly recommended in case a thread needs to change any
		// of the properties of an object without affecting other threads'
		// objects.
		bli_obj_alias_to( a, &a_t );
		bli_obj_alias_to( b, &b_t );
		bli_obj_alias_to( c, &c_t );

		// Create a default control tree for the operation, if needed.
		bli_l3_cntl_create_if( family, schema_a, schema_b,
		                       &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );

		// Create the root node of the current thread's thrinfo_t structure.
		bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );

		// The #else branch is compiled out; it only grows the thrinfo_t tree
		// without invoking func.
#if 1
		func
		(
		  alpha,
		  &a_t,
		  &b_t,
		  beta,
		  &c_t,
		  cntx,
		  rntm_p,
		  cntl_use,
		  thread
		);
#else
		bli_thrinfo_grow_tree
		(
		  rntm_p,
		  cntl_use,
		  thread
		);
#endif

		// Free the thread's local control tree.
		bli_l3_cntl_free( rntm_p, cntl_use, thread );

		#ifdef PRINT_THRINFO
		// Debug only: keep the thrinfo_t alive so it can be printed below.
		threads[tid] = thread;
		#else
		// Free the current thread's thrinfo_t structure.
		bli_l3_thrinfo_free( rntm_p, thread );
		#endif
	}

	// We shouldn't free the global communicator since it was already freed
	// by the global communicator's chief thread in bli_l3_thrinfo_free()
	// (called above).

	#ifdef PRINT_THRINFO
	if ( family != BLIS_TRSM ) bli_l3_thrinfo_print_gemm_paths( threads );
	else                       bli_l3_thrinfo_print_trsm_paths( threads );
	exit(1);
	#endif

	// Check the array_t back into the small block allocator. Similar to the
	// check-out, this is done using a lock embedded within the sba to ensure
	// mutual exclusion.
	bli_sba_checkin_array( array );
}

// -----------------------------------------------------------------------------

// Verifies, from inside the decorator's parallel region, that OpenMP created
// as many threads as were requested.
//
// Parameters:
// - n_threads: number of threads that were requested.
// - tid:       calling thread's id. Currently unused; it is referenced only
//              by the commented-out code below.
// - gl_comm:   global communicator; re-initialized for one thread if the
//              region turned out to be single-threaded.
// - rntm:      the caller's rntm_t; updated to one thread and all ways set
//              to 1 in that same case.
//
// Outcomes:
// - counts match:                  nothing is changed.
// - region has exactly one thread: fall back to sequential settings.
// - any other mismatch:            print a message and call bli_abort().
void bli_l3_thread_decorator_thread_check
     (
       dim_t      n_threads,
       dim_t      tid,
       thrcomm_t* gl_comm,
       rntm_t*    rntm
     )
{
	dim_t n_threads_real = omp_get_num_threads();

	// Check if the number of OpenMP threads created within this parallel
	// region is different from the number of threads that were requested
	// of BLIS. This inequality may trigger when, for example, the
	// following conditions are satisfied:
	// - an application is executing an OpenMP parallel region in which
	//   BLIS is invoked,
	// - BLIS is configured for multithreading via OpenMP,
	// - OMP_NUM_THREADS = t > 1,
	// - the number of threads requested of BLIS (regardless of method)
	//   is p <= t,
	// - OpenMP nesting is disabled.
	// In this situation, the application spawns t threads. Each application
	// thread calls gemm (for example). Each gemm will attempt to spawn p
	// threads via OpenMP. However, since nesting is disabled, the OpenMP
	// implementation finds that t >= p threads are already spawned, and
	// thus it doesn't spawn *any* additional threads for each gemm.
	if ( n_threads_real != n_threads )
	{
		// If the number of threads active in the current region is not
		// equal to the number requested of BLIS, we then only continue
		// if the number of threads in the current region is 1. If, for
		// example, BLIS requested 4 threads but only got 3, then we
		// abort().
		//if ( tid == 0 )
		//{
			if ( n_threads_real != 1 )
			{
				bli_print_msg( "A different number of threads was "
				               "created than was requested.",
				               __FILE__, __LINE__ );
				bli_abort();
			}

			//n_threads = 1; // not needed since it has no effect?
			bli_thrcomm_init( 1, gl_comm );
			bli_rntm_set_num_threads_only( 1, rntm );
			bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );
		//}

		// Synchronize all threads and continue.
		// NOTE(review): this point is reached only when n_threads_real == 1,
		// presuming bli_abort() does not return -- confirm.
		_Pragma( "omp barrier" )
	}
}

#endif
cython-blis-0.9.1/blis/_src/frame/thread/bli_l3_decor_openmp.h000066400000000000000000000040211427272030600242430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_L3_DECOR_OPENMP_H
#define BLIS_L3_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

// Prototype of the thread-count sanity check that the OpenMP decorator calls
// from inside its parallel region. n_threads is the number of threads that
// were requested, tid is the calling thread's id, and gl_comm / rntm are the
// communicator and runtime object the check may reset to single-threaded
// settings.
void bli_l3_thread_decorator_thread_check
     (
       dim_t      n_threads,
       dim_t      tid,
       thrcomm_t* gl_comm,
       rntm_t*    rntm
     );

#endif

#endif
cython-blis-0.9.1/blis/_src/frame/thread/bli_l3_decor_pthreads.c000066400000000000000000000207121427272030600245570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

#ifdef BLIS_ENABLE_PTHREADS

// A data structure to assist in passing operands to additional threads.
// One instance is filled per thread id by bli_l3_thread_decorator() and read
// back by bli_l3_thread_entry(). Apart from tid, every field holds the same
// value in all instances.
typedef struct thread_data
{
	l3int_t    func;      // function each thread invokes
	opid_t     family;    // operation family, for control tree creation
	pack_t     schema_a;  // pack schema read from a before it was reset
	pack_t     schema_b;  // pack schema read from b before it was reset
	obj_t*     alpha;
	obj_t*     a;
	obj_t*     b;
	obj_t*     beta;
	obj_t*     c;
	cntx_t*    cntx;
	rntm_t*    rntm;      // shared rntm_t; each thread copies it locally
	cntl_t*    cntl;
	dim_t      tid;       // this thread's id
	thrcomm_t* gl_comm;   // global communicator shared by all threads
	array_t*   array;     // array_t checked out from the sba
} thread_data_t;

// Entry point for additional threads
//
// Also called directly (not through bli_pthread_create()) by the decorator
// for thread id 0. data_void must point to a thread_data_t. The function
// unpacks it, builds thread-local state (rntm copy, object aliases, control
// tree, thrinfo_t), invokes func, releases that state, and returns NULL.
void* bli_l3_thread_entry( void* data_void )
{
	thread_data_t* data     = data_void;

	l3int_t        func     = data->func;
	opid_t         family   = data->family;
	pack_t         schema_a = data->schema_a;
	pack_t         schema_b = data->schema_b;
	obj_t*         alpha    = data->alpha;
	obj_t*         a        = data->a;
	obj_t*         b        = data->b;
	obj_t*         beta     = data->beta;
	obj_t*         c        = data->c;
	cntx_t*        cntx     = data->cntx;
	rntm_t*        rntm     = data->rntm;
	cntl_t*        cntl     = data->cntl;
	dim_t          tid      = data->tid;
	array_t*       array    = data->array;
	thrcomm_t*     gl_comm  = data->gl_comm;

	// Create a thread-local copy of the master thread's rntm_t. This is
	// necessary since we want each thread to be able to track its own
	// small block pool_t as it executes down the function stack.
	rntm_t           rntm_l = *rntm;
	rntm_t* restrict rntm_p = &rntm_l;

	// Use the thread id to access the appropriate pool_t* within the
	// array_t, and use it to set the sba_pool field within the rntm_t.
	// If the pool_t* element within the array_t is NULL, it will first
	// be allocated/initialized.
	bli_sba_rntm_set_pool( tid, array, rntm_p );

	obj_t           a_t, b_t, c_t;
	cntl_t*         cntl_use;
	thrinfo_t*      thread;

	// Alias thread-local copies of A, B, and C. These will be the objects
	// we pass down the algorithmic function stack. Making thread-local
	// aliases is highly recommended in case a thread needs to change any
	// of the properties of an object without affecting other threads'
	// objects.
	bli_obj_alias_to( a, &a_t );
	bli_obj_alias_to( b, &b_t );
	bli_obj_alias_to( c, &c_t );

	// Create a default control tree for the operation, if needed.
	bli_l3_cntl_create_if( family, schema_a, schema_b,
	                       &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use );

	// Create the root node of the current thread's thrinfo_t structure.
	bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );

	func
	(
	  alpha,
	  &a_t,
	  &b_t,
	  beta,
	  &c_t,
	  cntx,
	  rntm_p,
	  cntl_use,
	  thread
	);

	// Free the thread's local control tree.
	bli_l3_cntl_free( rntm_p, cntl_use, thread );

	// Free the current thread's thrinfo_t structure.
	bli_l3_thrinfo_free( rntm_p, thread );

	return NULL;
}

// POSIX-threads variant of the level-3 thread decorator.
//
// Spawns one thread for every id other than 0 (n_threads - 1 in total, where
// n_threads comes from rntm), runs id 0 on the calling thread, joins the
// spawned threads, and releases the resources it allocated.
//
// Parameters:
// - func:   function each thread calls to perform the operation.
// - family: operation family; forwarded to bli_l3_cntl_create_if().
// - alpha, beta, cntx: forwarded unchanged to func.
// - a, b:   operands. Their pack schema fields are read and then reset to
//           BLIS_NOT_PACKED, so the caller's objects are modified.
// - c:      operand; aliased per thread in bli_l3_thread_entry().
// - rntm:   supplies the thread count; its sba pool (for thread 0) and pba
//           fields are set here before each thread copies it.
// - cntl:   forwarded to bli_l3_cntl_create_if().
//
// NOTE(review): the value returned by bli_pthread_create() is not inspected,
// and r_val is written by bli_malloc_intl() but never read here.
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
	err_t r_val;

	// This is part of a hack to support mixed domain in bli_gemm_front().
	// Sometimes we need to specify a non-standard schema for A and B, and
	// we decided to transmit them via the schema field in the obj_t's
	// rather than pass them in as function parameters. Once the values
	// have been read, we immediately reset them back to their expected
	// values for unpacked objects.
	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );
	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );

	// Query the total number of threads from the rntm_t object.
	const dim_t n_threads = bli_rntm_num_threads( rntm );

	// NOTE: The sba was initialized in bli_init().

	// Check out an array_t from the small block allocator. This is done
	// with an internal lock to ensure only one application thread accesses
	// the sba at a time. bli_sba_checkout_array() will also automatically
	// resize the array_t, if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Access the pool_t* for thread 0 and embed it into the rntm. We do
	// this up-front only so that we have the rntm_t.sba_pool field
	// initialized and ready for the global communicator creation below.
	bli_sba_rntm_set_pool( 0, array, rntm );

	// Set the packing block allocator field of the rntm. This will be
	// inherited by all of the child threads when they make local copies of
	// the rntm below.
	bli_pba_rntm_set_pba( rntm );

	// Allocate a global communicator for the root thrinfo_t structures.
	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );

	// Allocate an array of pthread objects and auxiliary data structs to pass
	// to the thread entry functions.
	// (pthreads[0] is allocated but never used: thread 0 is the caller.)

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_l3_thread_decorator().pth: " );
	#endif
	bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_l3_thread_decorator().pth: " );
	#endif
	thread_data_t* datas    = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val );

	// NOTE: We must iterate backwards so that the chief thread (thread id 0)
	// can spawn all other threads before proceeding with its own computation.
	for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- )
	{
		// Set up thread data for this thread id (every id, including 0).
		datas[tid].func     = func;
		datas[tid].family   = family;
		datas[tid].schema_a = schema_a;
		datas[tid].schema_b = schema_b;
		datas[tid].alpha    = alpha;
		datas[tid].a        = a;
		datas[tid].b        = b;
		datas[tid].beta     = beta;
		datas[tid].c        = c;
		datas[tid].cntx     = cntx;
		datas[tid].rntm     = rntm;
		datas[tid].cntl     = cntl;
		datas[tid].tid      = tid;
		datas[tid].gl_comm  = gl_comm;
		datas[tid].array    = array;

		// Spawn additional threads for every id other than 0; id 0 runs the
		// entry function directly on the calling thread.
		if ( tid != 0 )
			bli_pthread_create( &pthreads[tid], NULL, &bli_l3_thread_entry, &datas[tid] );
		else
			bli_l3_thread_entry( ( void* )(&datas[0]) );
	}

	// We shouldn't free the global communicator since it was already freed
	// by the global communicator's chief thread in bli_l3_thrinfo_free()
	// (called from the thread entry function).

	// Thread 0 waits for additional threads to finish.
	for ( dim_t tid = 1; tid < n_threads; tid++ )
	{
		bli_pthread_join( pthreads[tid], NULL );
	}

	// Check the array_t back into the small block allocator. Similar to the
	// check-out, this is done using a lock embedded within the sba to ensure
	// mutual exclusion.
	bli_sba_checkin_array( array );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_l3_thread_decorator().pth: " );
	#endif
	bli_free_intl( pthreads );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_l3_thread_decorator().pth: " );
	#endif
	bli_free_intl( datas );
}

#endif
cython-blis-0.9.1/blis/_src/frame/thread/bli_l3_decor_pthreads.h000066400000000000000000000036251427272030600245700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifndef BLIS_L3_DECOR_PTHREADS_H
#define BLIS_L3_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
// The signature (void* argument, void* result) matches the start routine
// that bli_l3_decor_pthreads.c hands to bli_pthread_create().
// NOTE(review): bli_l3_decor_openmp.c also defines a stub with this name
// under BLIS_ENABLE_OPENMP, which this guard does not declare -- confirm
// whether a prototype for that configuration is provided elsewhere.
void* bli_l3_thread_entry( void* data_void );

#endif

#endif
cython-blis-0.9.1/blis/_src/frame/thread/bli_l3_decor_single.c000066400000000000000000000126351427272030600242330ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

#ifndef BLIS_ENABLE_MULTITHREADING

// Sequential variant of the level-3 thread decorator.
//
// Performs the same setup and teardown as the multithreaded variants but
// with a fixed thread count of 1, invoking func exactly once on the calling
// thread.
//
// Parameters:
// - func:   function called to perform the operation.
// - family: operation family; forwarded to bli_l3_cntl_create_if().
// - alpha, beta, cntx: forwarded unchanged to func.
// - a, b:   operands. Their pack schema fields are read and then reset to
//           BLIS_NOT_PACKED, so the caller's objects are modified. They are
//           passed to func directly (no local aliases are made here).
// - c:      operand; passed to func directly.
// - rntm:   used in place (no local copy); its sba pool and pba fields are
//           set here.
// - cntl:   forwarded to bli_l3_cntl_create_if(), which yields the control
//           tree (cntl_use) actually passed to func.
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     )
{
	// This is part of a hack to support mixed domain in bli_gemm_front().
	// Sometimes we need to specify a non-standard schema for A and B, and
	// we decided to transmit them via the schema field in the obj_t's
	// rather than pass them in as function parameters. Once the values
	// have been read, we immediately reset them back to their expected
	// values for unpacked objects.
	pack_t schema_a = bli_obj_pack_schema( a );
	pack_t schema_b = bli_obj_pack_schema( b );
	bli_obj_set_pack_schema( BLIS_NOT_PACKED, a );
	bli_obj_set_pack_schema( BLIS_NOT_PACKED, b );

	// For sequential execution, we use only one thread.
	const dim_t n_threads = 1;

	// NOTE: The sba was initialized in bli_init().

	// Check out an array_t from the small block allocator. This is done
	// with an internal lock to ensure only one application thread accesses
	// the sba at a time. bli_sba_checkout_array() will also automatically
	// resize the array_t, if necessary.
	array_t* restrict array = bli_sba_checkout_array( n_threads );

	// Access the pool_t* for thread 0 and embed it into the rntm. We do
	// this up-front only so that we can create the global comm below.
	bli_sba_rntm_set_pool( 0, array, rntm );

	// Set the packing block allocator field of the rntm.
	bli_pba_rntm_set_pba( rntm );

	// Allocate a global communicator for the root thrinfo_t structures.
	thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads );

	// The braces below mirror the parallel region of the multithreaded
	// variants; here the enclosed code runs once, as thread id 0.
	{
		// NOTE: We don't need to create another copy of the rntm_t since
		// it was already copied in one of the high-level oapi functions.
		rntm_t* restrict rntm_p = rntm;

		cntl_t*    cntl_use;
		thrinfo_t* thread;

		const dim_t tid = 0;

		// Use the thread id to access the appropriate pool_t* within the
		// array_t, and use it to set the sba_pool field within the rntm_t.
		// If the pool_t* element within the array_t is NULL, it will first
		// be allocated/initialized.
		// NOTE: This is commented out because, in the single-threaded case,
		// this is redundant since it's already been done above.
		//bli_sba_rntm_set_pool( tid, array, rntm_p );

		// NOTE: Unlike with the _openmp.c and _pthreads.c variants, we don't
		// need to alias objects for A, B, and C since they were already aliased
		// in bli_*_front(). However, we may add aliasing here in the future so
		// that, with all three (_single.c, _openmp.c, _pthreads.c) implementations
		// consistently providing local aliases, we can then eliminate aliasing
		// elsewhere.

		// Create a default control tree for the operation, if needed.
		bli_l3_cntl_create_if( family, schema_a, schema_b,
		                       a, b, c, rntm_p, cntl, &cntl_use );

		// Create the root node of the thread's thrinfo_t structure.
		bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread );

		func
		(
		  alpha,
		  a,
		  b,
		  beta,
		  c,
		  cntx,
		  rntm_p,
		  cntl_use,
		  thread
		);

		// Free the thread's local control tree.
		bli_l3_cntl_free( rntm_p, cntl_use, thread );

		// Free the current thread's thrinfo_t structure.
		bli_l3_thrinfo_free( rntm_p, thread );
	}

	// We shouldn't free the global communicator since it was already freed
	// by the global communicator's chief thread in bli_l3_thrinfo_free()
	// (called above).

	// Check the array_t back into the small block allocator. Similar to the
	// check-out, this is done using a lock embedded within the sba to ensure
	// mutual exclusion.
	bli_sba_checkin_array( array );
}

#endif
cython-blis-0.9.1/blis/_src/frame/thread/bli_l3_decor_single.h000066400000000000000000000035031427272030600242320ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_L3_DECOR_SINGLE_H
#define BLIS_L3_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
#ifndef BLIS_ENABLE_MULTITHREADING #endif #endif cython-blis-0.9.1/blis/_src/frame/thread/bli_l3_sup_decor.h000066400000000000000000000051261427272030600235630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_L3_SUP_DECOR_H #define BLIS_L3_SUP_DECOR_H // -- sup definitions ---------------------------------------------------------- // Level-3 sup internal function type. 
// Pointer to a level-3 sup implementation function. The sup thread decorators
// invoke it once per participating thread as
//   func( alpha, a, b, beta, c, cntx, rntm, thread )
// where rntm is the rntm_t used by the calling thread and thread is that
// thread's root thrinfo_t. The type returns an err_t; the decorators do not
// inspect that value.
typedef err_t (*l3supint_t)
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// Level-3 sup thread decorator prototype.
// Exactly one definition is compiled, selected by the multithreading
// configuration (single-threaded, OpenMP, or pthreads). Each definition runs
// func with the operands given here and returns BLIS_SUCCESS.
//   func    - the sup implementation to execute on every thread.
//   family  - operation family id; the OpenMP and single-threaded definitions
//             do not read it, and the pthreads definition only forwards it.
//   alpha, a, b, beta, c, cntx - passed through to func unchanged.
//   rntm    - supplies the thread count (multithreaded variants) and has its
//             sba pool and pba fields set by the decorator before func runs.
err_t bli_l3_sup_thread_decorator
     (
       l3supint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     );

// Include definitions specific to the method of multithreading for the
// sup code path. All three headers are included unconditionally; each one
// wraps its own contents in the matching BLIS_ENABLE_* preprocessor test.
#include "bli_l3_sup_decor_single.h"
#include "bli_l3_sup_decor_openmp.h"
#include "bli_l3_sup_decor_pthreads.h"

#endif

cython-blis-0.9.1/blis/_src/frame/thread/bli_l3_sup_decor_openmp.c000066400000000000000000000120031427272030600251240ustar00rootroot00000000000000
/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_OPENMP // Define a dummy function bli_l3_sup_thread_entry(), which is needed in the // pthreads version, so that when building Windows DLLs (with OpenMP enabled // or no multithreading) we don't risk having an unresolved symbol. void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; } //#define PRINT_THRINFO err_t bli_l3_sup_thread_decorator ( l3supint_t func, opid_t family, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { // Query the total number of threads from the rntm_t object. const dim_t n_threads = bli_rntm_num_threads( rntm ); // NOTE: The sba was initialized in bli_init(). // Check out an array_t from the small block allocator. This is done // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. array_t* restrict array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. We do // this up-front only so that we have the rntm_t.sba_pool field // initialized and ready for the global communicator creation below. bli_sba_rntm_set_pool( 0, array, rntm ); // Set the packing block allocator field of the rntm. This will be // inherited by all of the child threads when they make local copies of // the rntm below. 
bli_pba_rntm_set_pba( rntm ); // Allcoate a global communicator for the root thrinfo_t structures. thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); _Pragma( "omp parallel num_threads(n_threads)" ) { // Create a thread-local copy of the master thread's rntm_t. This is // necessary since we want each thread to be able to track its own // small block pool_t as it executes down the function stack. rntm_t rntm_l = *rntm; rntm_t* restrict rntm_p = &rntm_l; // Query the thread's id from OpenMP. const dim_t tid = omp_get_thread_num(); // Check for a somewhat obscure OpenMP thread-mistmatch issue. // NOTE: This calls the same function used for the conventional/large // code path. bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); // Use the thread id to access the appropriate pool_t* within the // array_t, and use it to set the sba_pool field within the rntm_t. // If the pool_t* element within the array_t is NULL, it will first // be allocated/initialized. bli_sba_rntm_set_pool( tid, array, rntm_p ); thrinfo_t* thread = NULL; // Create the root node of the thread's thrinfo_t structure. bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); func ( alpha, a, b, beta, c, cntx, rntm_p, thread ); // Free the current thread's thrinfo_t structure. bli_l3_sup_thrinfo_free( rntm_p, thread ); } // We shouldn't free the global communicator since it was already freed // by the global communicator's chief thread in bli_l3_thrinfo_free() // (called from the thread entry function). // Check the array_t back into the small block allocator. Similar to the // check-out, this is done using a lock embedded within the sba to ensure // mutual exclusion. bli_sba_checkin_array( array ); return BLIS_SUCCESS; } #endif cython-blis-0.9.1/blis/_src/frame/thread/bli_l3_sup_decor_openmp.h000066400000000000000000000035101427272030600251340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_L3_SUP_DECOR_OPENMP_H #define BLIS_L3_SUP_DECOR_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #endif #endif cython-blis-0.9.1/blis/_src/frame/thread/bli_l3_sup_decor_pthreads.c000066400000000000000000000156631427272030600254570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifdef BLIS_ENABLE_PTHREADS // A data structure to assist in passing operands to additional threads. 
typedef struct thread_data { l3supint_t func; opid_t family; obj_t* alpha; obj_t* a; obj_t* b; obj_t* beta; obj_t* c; cntx_t* cntx; rntm_t* rntm; dim_t tid; thrcomm_t* gl_comm; array_t* array; } thread_data_t; // Entry point for additional threads void* bli_l3_sup_thread_entry( void* data_void ) { thread_data_t* data = data_void; l3supint_t func = data->func; opid_t family = data->family; obj_t* alpha = data->alpha; obj_t* a = data->a; obj_t* b = data->b; obj_t* beta = data->beta; obj_t* c = data->c; cntx_t* cntx = data->cntx; rntm_t* rntm = data->rntm; dim_t tid = data->tid; array_t* array = data->array; thrcomm_t* gl_comm = data->gl_comm; ( void )family; // Create a thread-local copy of the master thread's rntm_t. This is // necessary since we want each thread to be able to track its own // small block pool_t as it executes down the function stack. rntm_t rntm_l = *rntm; rntm_t* restrict rntm_p = &rntm_l; // Use the thread id to access the appropriate pool_t* within the // array_t, and use it to set the sba_pool field within the rntm_t. // If the pool_t* element within the array_t is NULL, it will first // be allocated/initialized. bli_sba_rntm_set_pool( tid, array, rntm_p ); thrinfo_t* thread = NULL; // Create the root node of the current thread's thrinfo_t structure. bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); func ( alpha, a, b, beta, c, cntx, rntm_p, thread ); // Free the current thread's thrinfo_t structure. bli_l3_sup_thrinfo_free( rntm_p, thread ); return NULL; } err_t bli_l3_sup_thread_decorator ( l3supint_t func, opid_t family, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { err_t r_val; // Query the total number of threads from the context. const dim_t n_threads = bli_rntm_num_threads( rntm ); // NOTE: The sba was initialized in bli_init(). // Check out an array_t from the small block allocator. 
This is done // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. array_t* restrict array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. We do // this up-front only so that we have the rntm_t.sba_pool field // initialized and ready for the global communicator creation below. bli_sba_rntm_set_pool( 0, array, rntm ); // Set the packing block allocator field of the rntm. This will be // inherited by all of the child threads when they make local copies of // the rntm below. bli_pba_rntm_set_pba( rntm ); // Allocate a global communicator for the root thrinfo_t structures. thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); // Allocate an array of pthread objects and auxiliary data structs to pass // to the thread entry functions. #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_l3_thread_decorator().pth: " ); #endif bli_pthread_t* pthreads = bli_malloc_intl( sizeof( bli_pthread_t ) * n_threads, &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_l3_thread_decorator().pth: " ); #endif thread_data_t* datas = bli_malloc_intl( sizeof( thread_data_t ) * n_threads, &r_val ); // NOTE: We must iterate backwards so that the chief thread (thread id 0) // can spawn all other threads before proceeding with its own computation. for ( dim_t tid = n_threads - 1; 0 <= tid; tid-- ) { // Set up thread data for additional threads (beyond thread 0). datas[tid].func = func; datas[tid].family = family; datas[tid].alpha = alpha; datas[tid].a = a; datas[tid].b = b; datas[tid].beta = beta; datas[tid].c = c; datas[tid].cntx = cntx; datas[tid].rntm = rntm; datas[tid].tid = tid; datas[tid].gl_comm = gl_comm; datas[tid].array = array; // Spawn additional threads for ids greater than 1. 
if ( tid != 0 ) bli_pthread_create( &pthreads[tid], NULL, &bli_l3_sup_thread_entry, &datas[tid] ); else bli_l3_sup_thread_entry( ( void* )(&datas[0]) ); } // We shouldn't free the global communicator since it was already freed // by the global communicator's chief thread in bli_l3_thrinfo_free() // (called from the thread entry function). // Thread 0 waits for additional threads to finish. for ( dim_t tid = 1; tid < n_threads; tid++ ) { bli_pthread_join( pthreads[tid], NULL ); } // Check the array_t back into the small block allocator. Similar to the // check-out, this is done using a lock embedded within the sba to ensure // mutual exclusion. bli_sba_checkin_array( array ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_l3_thread_decorator().pth: " ); #endif bli_free_intl( pthreads ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_l3_thread_decorator().pth: " ); #endif bli_free_intl( datas ); return BLIS_SUCCESS; } #endif cython-blis-0.9.1/blis/_src/frame/thread/bli_l3_sup_decor_pthreads.h000066400000000000000000000036411427272030600254550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_L3_SUP_DECOR_PTHREADS_H #define BLIS_L3_SUP_DECOR_PTHREADS_H // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. void* bli_l3_sup_thread_entry( void* data_void ); #endif #endif cython-blis-0.9.1/blis/_src/frame/thread/bli_l3_sup_decor_single.c000066400000000000000000000114511427272030600251150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #ifndef BLIS_ENABLE_MULTITHREADING #define SKIP_THRINFO_TREE err_t bli_l3_sup_thread_decorator ( l3supint_t func, opid_t family, //pack_t schema_a, //pack_t schema_b, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ) { // For sequential execution, we use only one thread. const dim_t n_threads = 1; // NOTE: The sba was initialized in bli_init(). // Check out an array_t from the small block allocator. This is done // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. array_t* restrict array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. bli_sba_rntm_set_pool( 0, array, rntm ); // Set the packing block allocator field of the rntm. bli_pba_rntm_set_pba( rntm ); #ifndef SKIP_THRINFO_TREE // Allcoate a global communicator for the root thrinfo_t structures. 
thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); #endif { // NOTE: We don't need to create another copy of the rntm_t since // it was already copied in one of the high-level oapi functions. rntm_t* restrict rntm_p = rntm; // There is only one thread id (for the thief thread). const dim_t tid = 0; // Use the thread id to access the appropriate pool_t* within the // array_t, and use it to set the sba_pool field within the rntm_t. // If the pool_t* element within the array_t is NULL, it will first // be allocated/initialized. // NOTE: This is commented out because, in the single-threaded case, // this is redundant since it's already been done above. //bli_sba_rntm_set_pool( tid, array, rntm_p ); #ifndef SKIP_THRINFO_TREE thrinfo_t* thread = NULL; // Create the root node of the thread's thrinfo_t structure. bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); #else // This optimization allows us to use one of the global thrinfo_t // objects for single-threaded execution rather than grow one from // scratch. The key is that bli_thrinfo_sup_grow(), which is called // from within the variants, will immediately return if it detects // that the thrinfo_t* passed into it is either // &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED. thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED; ( void )tid; #endif func ( alpha, a, b, beta, c, cntx, rntm_p, thread ); #ifndef SKIP_THRINFO_TREE // Free the current thread's thrinfo_t structure. bli_l3_sup_thrinfo_free( rntm_p, thread ); #endif } // We shouldn't free the global communicator since it was already freed // by the global communicator's chief thread in bli_l3_thrinfo_free() // (called above). // Check the array_t back into the small block allocator. Similar to the // check-out, this is done using a lock embedded within the sba to ensure // mutual exclusion. 
bli_sba_checkin_array( array ); return BLIS_SUCCESS; } #endif cython-blis-0.9.1/blis/_src/frame/thread/bli_l3_sup_decor_single.h000066400000000000000000000035131427272030600251220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_L3_SUP_DECOR_SINGLE_H #define BLIS_L3_SUP_DECOR_SINGLE_H // Definitions specific to situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING #endif #endif cython-blis-0.9.1/blis/_src/frame/thread/bli_pthread.c000066400000000000000000000353761427272030600226360ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2018, Southern Methodist University Copyright (C) 2018, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread_create(), pthread_join() -- int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ) { //return pthread_create( thread, attr, start_routine, arg ); start_routine( arg ); return 0; } int bli_pthread_join ( bli_pthread_t thread, void** retval ) { //return pthread_join( thread, retval ); return 0; } // -- pthread_mutex_*() -- int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ) { //return pthread_mutex_init( mutex, attr ); return 0; } int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ) { //return pthread_mutex_destroy( mutex ); return 0; } int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ) { //return pthread_mutex_lock( mutex ); return 0; } int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ) { //return pthread_mutex_trylock( mutex ); return 0; } int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ) { //return pthread_mutex_unlock( mutex ); return 0; } // -- pthread_cond_*() -- int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ) { //return pthread_cond_init( cond, attr ); return 0; } int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ) { //return pthread_cond_destroy( cond ); return 0; } int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ) { //return pthread_cond_wait( cond, mutex ); return 0; } int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ) { //return pthread_cond_broadcast( cond ); return 0; } // -- pthread_once() -- void bli_pthread_once ( bli_pthread_once_t* once, 
void (*init)(void) ) { //pthread_once( once, init ); init(); } #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function bli_pthread_equal // -- pthread_self() -- bli_pthread_t bli_pthread_self ( void ) { return 0; } // -- pthread_equal() -- int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ) { // We don't bother comparing t1 and t2 since we must, by definition, be // executing the same thread if there is not threading mechanism on the // system. return 1; } #endif #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) #include // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread_create(), pthread_join() -- typedef struct { void* (*start_routine)( void* ); void* param; void** retval; } bli_thread_param; static DWORD bli_thread_func ( void* param_ ) { bli_thread_param* param = param_; *param->retval = param->start_routine( param->param ); return 0; } int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ) { if ( attr ) return EINVAL; bli_thread_param param = { start_routine, arg, &thread->retval }; thread->handle = CreateThread( NULL, 0, bli_thread_func, ¶m, 0, NULL ); if ( !thread->handle ) return EAGAIN; return 0; } int bli_pthread_join ( bli_pthread_t thread, void** retval ) { if ( !WaitForSingleObject( thread.handle, INFINITE ) ) return EAGAIN; if ( retval ) *retval = thread.retval; return 0; } // -- pthread_mutex_*() -- int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ) { if ( attr ) return EINVAL; InitializeSRWLock( mutex ); return 0; } int 
bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ) { return 0; } int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ) { AcquireSRWLockExclusive( mutex ); return 0; } int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ) { return TryAcquireSRWLockExclusive( mutex ) ? 0 : EBUSY; } int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ) { ReleaseSRWLockExclusive( mutex ); return 0; } // -- pthread_cond_*() -- int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ) { if ( attr ) return EINVAL; InitializeConditionVariable( cond ); return 0; } int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ) { ( void )cond; return 0; } int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ) { if ( !SleepConditionVariableSRW( cond, mutex, INFINITE, 0 ) ) return EAGAIN; return 0; } int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ) { WakeAllConditionVariable( cond ); return 0; } // -- pthread_once() -- static BOOL bli_init_once_wrapper ( bli_pthread_once_t* once, void* param, void** context ) { ( void )once; ( void )context; typedef void (*callback)( void ); ((callback)param)(); return TRUE; } void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ) { InitOnceExecuteOnce( once, bli_init_once_wrapper, init, NULL ); } #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function bli_pthread_equal // -- pthread_self() -- bli_pthread_t bli_pthread_self ( void ) { bli_pthread_t t; // Note: BLIS will only ever use bli_pthread_self() in conjunction with // bli_pthread_equal(), and thus setting the .retval field is unnecessary. // Despite this, we set it to NULL anyway. 
t.handle = GetCurrentThread(); t.retval = NULL; return t; } // -- pthread_equal() -- int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ) { return ( int )CompareObjectHandles( t1.handle, t2.handle ); } #endif #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // This branch is compiled for Linux and other non-Windows environments where // we assume that *some* implementation of pthreads is provided (although it // may lack barriers--see below). // -- pthread_create(), pthread_join() -- int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ) { return pthread_create( thread, attr, start_routine, arg ); } int bli_pthread_join ( bli_pthread_t thread, void** retval ) { return pthread_join( thread, retval ); } // -- pthread_mutex_*() -- int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ) { return pthread_mutex_init( mutex, attr ); } int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ) { return pthread_mutex_destroy( mutex ); } int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ) { return pthread_mutex_lock( mutex ); } int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ) { return pthread_mutex_trylock( mutex ); } int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ) { return pthread_mutex_unlock( mutex ); } // -- pthread_cond_*() -- int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ) { return pthread_cond_init( cond, attr ); } int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ) { return pthread_cond_destroy( cond ); } int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ) { return pthread_cond_wait( cond, mutex ); } int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ) { return 
pthread_cond_broadcast( cond ); } // -- pthread_once() -- void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ) { pthread_once( once, init ); } #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function bli_pthread_equal // -- pthread_self() -- bli_pthread_t bli_pthread_self ( void ) { return pthread_self(); } // -- pthread_equal() -- int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ) { return pthread_equal( t1, t2 ); } #endif #endif // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) // -- pthread_barrier_*() -- #if defined(BLIS_DISABLE_SYSTEM) int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ) { //return pthread_barrier_init( barrier, attr, count ); return 0; } int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ) { //return pthread_barrier_destroy( barrier ); return 0; } int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ) { //return pthread_barrier_wait( barrier ); return 0; } #elif defined(__APPLE__) || defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) #include // For OS X and Windows, we define barriers ourselves in terms of the rest // of the API, though for slightly different reasons: For Windows, we must // define barriers because we are defining *everything* from scratch. For // OS X, we must define barriers because Apple chose to omit barriers from // their implementation of POSIX threads (since barriers are actually // optional to the POSIX standard). 
int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ) { if ( attr ) return EINVAL; if ( count == 0 ) return EINVAL; int err; if ( (err = bli_pthread_mutex_init( &barrier->mutex, 0 )) != 0 ) return err; if ( (err = bli_pthread_cond_init( &barrier->cond, 0 )) != 0 ) { bli_pthread_mutex_destroy( &barrier->mutex ); return err; } barrier->tripCount = count; barrier->count = 0; return 0; } int bli_pthread_barrier_destroy ( bli_pthread_barrier_t *barrier ) { bli_pthread_cond_destroy( &barrier->cond ); bli_pthread_mutex_destroy( &barrier->mutex ); return 0; } int bli_pthread_barrier_wait ( bli_pthread_barrier_t *barrier ) { bli_pthread_mutex_lock( &barrier->mutex ); ++(barrier->count); if ( barrier->count >= barrier->tripCount ) { barrier->count = 0; bli_pthread_cond_broadcast( &barrier->cond ); bli_pthread_mutex_unlock( &barrier->mutex ); return 1; } else { bli_pthread_cond_wait( &barrier->cond, &(barrier->mutex) ); bli_pthread_mutex_unlock( &barrier->mutex ); return 0; } } #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(__APPLE__) && !defined(_MSC_VER) // Linux environments implement the pthread_barrier* sub-API. So, if we're // on Linux, we can simply call those functions, just as we did before for // the other functions. int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ) { return pthread_barrier_init( barrier, attr, count ); } int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ) { return pthread_barrier_destroy( barrier ); } int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ) { return pthread_barrier_wait( barrier ); } #endif cython-blis-0.9.1/blis/_src/frame/thread/bli_pthread.h000066400000000000000000000173031427272030600226310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2018, Southern Methodist University Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. 
// NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H cython-blis-0.9.1/blis/_src/frame/thread/bli_thrcomm.c000066400000000000000000000111711427272030600226430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

// Broadcast a pointer from thread 0 to every thread of a communicator.
//
// id is the caller's thread id within comm, to_send is the pointer to
// publish (only the value passed by thread 0 is used), and comm is the
// communicator. Returns to_send unchanged when comm is NULL or has exactly
// one thread; otherwise returns the pointer stored in comm->sent_object.
void* bli_thrcomm_bcast
     (
       dim_t      id,
       void*      to_send,
       thrcomm_t* comm
     )
{
	// No communicator, or nobody else to talk to: hand back our own pointer.
	if ( comm == NULL ) return to_send;
	if ( comm->n_threads == 1 ) return to_send;

	// Only thread 0 writes the shared slot.
	if ( id == 0 )
	{
		comm->sent_object = to_send;
	}

	// The slot is read between two barriers: the first separates the write
	// above from the reads, the second separates the reads from whatever
	// the threads do next.
	bli_thrcomm_barrier( id, comm );

	void* received = comm->sent_object;

	bli_thrcomm_barrier( id, comm );

	return received;
}

// Use __sync_* builtins (assumed available) if __atomic_* ones are not present.
#ifndef __ATOMIC_RELAXED

// The memory-order constants are defined as empty because the __sync_*
// replacements below do not take a memory-order argument.
#define __ATOMIC_RELAXED
#define __ATOMIC_ACQUIRE
#define __ATOMIC_RELEASE
#define __ATOMIC_ACQ_REL

// An atomic load is emulated by atomically adding zero and taking the
// fetched value.
#define __atomic_load_n(ptr, constraint) \
	__sync_fetch_and_add(ptr, 0)
#define __atomic_add_fetch(ptr, value, constraint) \
	__sync_add_and_fetch(ptr, value)
#define __atomic_fetch_add(ptr, value, constraint) \
	__sync_fetch_and_add(ptr, value)
#define __atomic_fetch_xor(ptr, value, constraint) \
	__sync_fetch_and_xor(ptr, value)

#endif

// Spin-wait barrier for the threads of a communicator, built on the atomic
// built-ins above. t_id is not used by this implementation; comm supplies
// the thread count (n_threads) and the two barrier fields (barrier_sense and
// barrier_threads_arrived).
void bli_thrcomm_barrier_atomic( dim_t t_id, thrcomm_t* comm )
{
	// Return early if the comm is NULL or if there is only one
	// thread participating.
	if ( comm == NULL || comm->n_threads == 1 ) return;

	// Read the "sense" variable. This variable is akin to a unique ID for
	// the current barrier. The first n-1 threads will spin on this variable
	// until it changes. The sense variable gets incremented by the last
	// thread to enter the barrier, just before it exits. But it turns out
	// that you don't need many unique IDs before you can wrap around. In
	// fact, if everything else is working, a binary variable is sufficient,
	// which is what we do here (i.e., 0 is incremented to 1, which is then
	// decremented back to 0, and so forth).
	gint_t orig_sense = __atomic_load_n( &comm->barrier_sense, __ATOMIC_RELAXED );

	// Register ourselves (the current thread) as having arrived by
	// incrementing the barrier_threads_arrived variable. We must perform
	// this increment (and a subsequent read) atomically.
	dim_t my_threads_arrived =
	__atomic_add_fetch( &comm->barrier_threads_arrived, 1, __ATOMIC_ACQ_REL );

	// If the current thread was the last thread to have arrived, then
	// it will take actions that effectively end and reset the barrier.
	if ( my_threads_arrived == comm->n_threads )
	{
		// Reset the variable tracking the number of threads that have arrived
		// to zero (which returns the barrier to the "empty" state). Then
		// atomically toggle the barrier sense variable. This will signal to
		// the other threads (which are spinning in the branch below) that it
		// is now safe to exit the barrier. The reset is done before the
		// toggle, i.e., before any spinning thread can be released.
		comm->barrier_threads_arrived = 0;
		__atomic_fetch_xor( &comm->barrier_sense, 1, __ATOMIC_RELEASE );
	}
	else
	{
		// If the current thread is NOT the last thread to have arrived, then
		// it spins on the sense variable until that sense variable changes, at
		// which time these threads will exit the barrier.
		while ( __atomic_load_n( &comm->barrier_sense, __ATOMIC_ACQUIRE ) == orig_sense )
			; // Empty loop body.
	}
}
cython-blis-0.9.1/blis/_src/frame/thread/bli_thrcomm.h000066400000000000000000000051031427272030600226460ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. #include "bli_thrcomm_single.h" #include "bli_thrcomm_openmp.h" #include "bli_thrcomm_pthreads.h" // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ); void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ); void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ); void bli_thrcomm_cleanup( thrcomm_t* comm ); BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm ); BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm ); void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm ); #endif cython-blis-0.9.1/blis/_src/frame/thread/bli_thrcomm_openmp.c000066400000000000000000000127341427272030600242270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"

#ifdef BLIS_ENABLE_OPENMP

// Obtain storage for a thrcomm_t via bli_sba_acquire() and initialize it for
// n_threads threads. Returns the new communicator.
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads )
{
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_thrcomm_create(): " );
	#endif

	thrcomm_t* comm = bli_sba_acquire( rntm, sizeof(thrcomm_t) );

	bli_thrcomm_init( n_threads, comm );

	return comm;
}

// Clean up a communicator and hand its storage back via bli_sba_release().
// A NULL comm is ignored.
void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm )
{
	if ( comm == NULL ) return;

	bli_thrcomm_cleanup( comm );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_thrcomm_free(): " );
	#endif

	bli_sba_release( rntm, comm );
}

#ifndef BLIS_TREE_BARRIER

// Non-tree variant: reset the broadcast slot and both barrier fields and
// record the thread count. A NULL comm is ignored.
void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm )
{
	if ( comm == NULL ) return;
	comm->sent_object = NULL;
	comm->n_threads = n_threads;
	comm->barrier_sense = 0;
	comm->barrier_threads_arrived = 0;
}

// Non-tree variant: the communicator owns no extra resources, so there is
// nothing to release.
void bli_thrcomm_cleanup( thrcomm_t* comm )
{
	if ( comm == NULL ) return;
}

//'Normal' barrier for openmp
//barrier routine taken from art of multicore programming
// The original OpenMP-pragma implementation is kept under "#if 0" for
// reference only; the live code simply forwards to
// bli_thrcomm_barrier_atomic().
void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
{
#if 0
	if ( comm == NULL || comm->n_threads == 1 )
		return;
	gint_t my_sense = comm->barrier_sense;
	dim_t my_threads_arrived;

	_Pragma( "omp atomic capture" )
		my_threads_arrived = ++(comm->barrier_threads_arrived);

	if ( my_threads_arrived == comm->n_threads )
	{
		comm->barrier_threads_arrived = 0;
		comm->barrier_sense = !comm->barrier_sense;
	}
	else
	{
		volatile gint_t* listener = &comm->barrier_sense;
		while ( *listener == my_sense ) {}
	}
#endif
	bli_thrcomm_barrier_atomic( t_id, comm );
}

#else

// Tree variant: allocate one leaf-pointer slot per thread and build the
// barrier tree, which fills those slots in.
// NOTE(review): r_val is not inspected here; presumably bli_malloc_intl()
// handles allocation failure itself -- confirm.
void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm )
{
	err_t r_val;

	if ( comm == NULL ) return;
	comm->sent_object = NULL;
	comm->n_threads = n_threads;
	comm->barriers = bli_malloc_intl( sizeof( barrier_t* ) * n_threads, &r_val );
	bli_thrcomm_tree_barrier_create( n_threads, BLIS_TREE_BARRIER_ARITY, comm->barriers, 0 );
}

//Tree barrier used for Intel Xeon Phi
// Recursively build a barrier (sub)tree for num_threads threads in which no
// node has more than arity participants. leaves[] receives, for each thread
// index starting at leaf_index, a pointer to the leaf node that thread
// belongs to. Returns the root of the (sub)tree that was built; the caller
// is responsible for setting its dad pointer.
barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index )
{
	err_t r_val;

	barrier_t* me = bli_malloc_intl( sizeof( barrier_t ), &r_val );

	me->dad = NULL;
	me->signal = 0;

	// Base Case
	if ( num_threads <= arity )
	{
		//Now must be registered as a leaf
		for ( int i = 0; i < num_threads; i++ )
		{
			leaves[ leaf_index + i ] = me;
		}
		me->count = num_threads;
		me->arity = num_threads;
	}
	else
	{
		// Otherwise this node has children
		// Split the threads as evenly as possible across arity children;
		// the first "defecit" children each take one extra thread.
		int threads_per_kid = num_threads / arity;
		int defecit = num_threads - threads_per_kid * arity;

		for ( int i = 0; i < arity; i++ )
		{
			int threads_this_kid = threads_per_kid;
			if ( i < defecit ) threads_this_kid++;

			barrier_t* kid = bli_thrcomm_tree_barrier_create( threads_this_kid, arity, leaves, leaf_index );
			kid->dad = me;

			leaf_index += threads_this_kid;
		}
		me->count = arity;
		me->arity = arity;
	}

	return me;
}

// Tree variant: release the barrier tree and then the leaf-pointer array.
// bli_thrcomm_tree_barrier_free() is called once per thread slot, so a leaf
// shared by several threads is visited several times.
void bli_thrcomm_cleanup( thrcomm_t* comm )
{
	if ( comm == NULL ) return;
	for ( dim_t i = 0; i < comm->n_threads; i++ )
	{
		bli_thrcomm_tree_barrier_free( comm->barriers[i] );
	}
	bli_free_intl( comm->barriers );
}

// Drop one reference from a tree node, reusing its count field as the
// reference counter. When the count reaches zero the node releases its
// reference on its parent and is then freed. A NULL barrier is ignored.
void bli_thrcomm_tree_barrier_free( barrier_t* barrier )
{
	if ( barrier == NULL )
		return;
	barrier->count--;
	if ( barrier->count == 0 )
	{
		bli_thrcomm_tree_barrier_free( barrier->dad );
		bli_free_intl( barrier );
	}
	return;
}

// Tree variant: enter the barrier through the leaf node recorded for this
// thread id.
void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
{
	bli_thrcomm_tree_barrier( comm->barriers[t_id] );
}

// Wait at one node of the barrier tree. Each arrival decrements the node's
// count under an OpenMP atomic capture. The arrival that observes the value 1
// (the last one) first waits at the parent node, if any, then resets the
// count and flips the node's signal; every other arrival spins until the
// signal differs from the value it read on entry.
void bli_thrcomm_tree_barrier( barrier_t* barack )
{
	int my_signal = barack->signal;
	int my_count;

	_Pragma( "omp atomic capture" )
		my_count = barack->count--;

	if ( my_count == 1 )
	{
		if ( barack->dad != NULL )
		{
			bli_thrcomm_tree_barrier( barack->dad );
		}
		// Re-arm the node before releasing the spinners.
		barack->count = barack->arity;
		barack->signal = !barack->signal;
	}
	else
	{
		// Read through a volatile pointer so each loop iteration reloads
		// the signal.
		volatile int* listener = &barack->signal;
		while ( *listener == my_signal ) {}
	}
}

#endif

#endif
cython-blis-0.9.1/blis/_src/frame/thread/bli_thrcomm_openmp.h000066400000000000000000000062031427272030600242260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#ifndef BLIS_THRCOMM_OPENMP_H
#define BLIS_THRCOMM_OPENMP_H

// Define thrcomm_t for situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

#include <omp.h>

// Define thrcomm_t for tree barriers and non-tree barriers.
#ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif cython-blis-0.9.1/blis/_src/frame/thread/bli_thrcomm_pthreads.c000066400000000000000000000067171427272030600245470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
#ifdef BLIS_ENABLE_PTHREADS

// Acquire storage for a communicator through bli_sba_acquire() and
// initialize it for n_threads participants. The pointer returned by
// bli_sba_acquire() is handed back unchanged; bli_thrcomm_init() tolerates
// a NULL pointer.
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads )
{
	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_thrcomm_create(): " );
	#endif

	thrcomm_t* comm = bli_sba_acquire( rntm, sizeof(thrcomm_t) );

	bli_thrcomm_init( n_threads, comm );

	return comm;
}

// Clean up a communicator and hand its storage back through
// bli_sba_release(). A NULL communicator is ignored.
void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm )
{
	if ( comm == NULL ) return;

	bli_thrcomm_cleanup( comm );

	#ifdef BLIS_ENABLE_MEM_TRACING
	printf( "bli_thrcomm_free(): " );
	#endif

	bli_sba_release( rntm, comm );
}

#ifdef BLIS_USE_PTHREAD_BARRIER

// Initialize a communicator whose barrier is a bli_pthread_barrier_t sized
// for n_threads waiters. A NULL communicator is ignored.
void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm )
{
	if ( comm == NULL ) return;

	comm->sent_object = NULL;
	comm->n_threads   = n_threads;
	bli_pthread_barrier_init( &comm->barrier, NULL, n_threads );
}

// Destroy the barrier embedded in the communicator. A NULL communicator is
// ignored.
void bli_thrcomm_cleanup( thrcomm_t* comm )
{
	if ( comm == NULL ) return;

	bli_pthread_barrier_destroy( &comm->barrier );
}

// Block the calling thread on the communicator's barrier.
void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
{
	// The barrier object does not need the caller's thread id.
	( void )t_id;

	bli_pthread_barrier_wait( &comm->barrier );
}

#else

// Initialize a communicator that uses the barrier_sense /
// barrier_threads_arrived pair; both start at zero. A NULL communicator is
// ignored.
void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm )
{
	if ( comm == NULL ) return;

	comm->sent_object             = NULL;
	comm->n_threads               = n_threads;
	comm->barrier_sense           = 0;
	comm->barrier_threads_arrived = 0;
}

// Nothing to release for this barrier implementation.
void bli_thrcomm_cleanup( thrcomm_t* comm )
{
	( void )comm;
}

// Wait at the communicator's barrier by delegating to
// bli_thrcomm_barrier_atomic(), which is defined elsewhere.
void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm )
{
	bli_thrcomm_barrier_atomic( t_id, comm );
}

#endif

#endif
#ifndef BLIS_THRCOMM_PTHREADS_H
#define BLIS_THRCOMM_PTHREADS_H

// Define thrcomm_t for situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

#ifdef BLIS_USE_PTHREAD_BARRIER

// Communicator variant whose barrier is a bli_pthread_barrier_t.
struct thrcomm_s
{
	// Set to NULL by bli_thrcomm_init().
	// NOTE(review): presumably the pointer exchanged between the threads of
	// the communicator -- its use is not visible in this header.
	void*  sent_object;

	// Number of participating threads; also the waiter count handed to
	// bli_pthread_barrier_init().
	dim_t  n_threads;

	// Barrier object initialized, waited on and destroyed by the
	// bli_thrcomm_*() routines.
	bli_pthread_barrier_t barrier;
};

#else

// Communicator variant that carries its own barrier state.
struct thrcomm_s
{
	// Set to NULL by bli_thrcomm_init().
	// NOTE(review): presumably the pointer exchanged between the threads of
	// the communicator -- its use is not visible in this header.
	void*  sent_object;

	// Number of participating threads.
	dim_t  n_threads;

	// NOTE: barrier_sense was originally a gint_t-based bool_t, but upon
	// redefining bool_t as bool we discovered that some gcc __atomic built-ins
	// don't allow the use of bool for the variables being operated upon.
	// (Specifically, this was observed of __atomic_fetch_xor(), but it likely
	// applies to all other related built-ins.) Thus, we get around this by
	// redefining barrier_sense as a gint_t.
	//volatile gint_t  barrier_sense;
	// Both fields below are zeroed by bli_thrcomm_init().
	gint_t barrier_sense;
	dim_t  barrier_threads_arrived;
};

#endif

typedef struct thrcomm_s thrcomm_t;

#endif

#endif
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #ifndef BLIS_ENABLE_MULTITHREADING //Constructors and destructors for constructors thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrcomm_create(): " ); #endif thrcomm_t* comm = bli_sba_acquire( rntm, sizeof( thrcomm_t ) ); bli_thrcomm_init( n_threads, comm ); return comm; } void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) { if ( comm == NULL ) return; bli_thrcomm_cleanup( comm ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrcomm_free(): " ); #endif bli_sba_release( rntm, comm ); } void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) { if ( comm == NULL ) return; comm->sent_object = NULL; comm->n_threads = n_threads; comm->barrier_sense = 0; comm->barrier_threads_arrived = 0; } void bli_thrcomm_cleanup( thrcomm_t* comm ) { if ( comm == NULL ) return; } void bli_thrcomm_barrier( dim_t t_id, thrcomm_t* comm ) { return; } #endif cython-blis-0.9.1/blis/_src/frame/thread/bli_thrcomm_single.h000066400000000000000000000053411427272030600242130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
#ifndef BLIS_THRCOMM_SINGLE_H
#define BLIS_THRCOMM_SINGLE_H

// Define thrcomm_t for situations when multithreading is disabled.
#ifndef BLIS_ENABLE_MULTITHREADING

// Thread communicators may be implementation dependent.
#ifdef BLIS_TREE_BARRIER

// One node of a barrier tree: arity and count are integer counters, dad
// points at another node of the same type, and signal is an integer flag.
// NOTE(review): no code in the single-threaded sources visible here reads
// or writes these members -- confirm whether this variant is still needed.
struct barrier_s
{
	int arity;
	int count;
	struct barrier_s* dad;
	int signal;
};
typedef struct barrier_s barrier_t;

struct thrcomm_s
{
	// NOTE(review): presumably the pointer exchanged between the threads of
	// the communicator -- its use is not visible in this header.
	void*       sent_object;

	// Number of threads that participate in the communicator.
	dim_t       n_threads;

	// Array of pointers to barrier nodes.
	barrier_t** barriers;
};

#else

struct thrcomm_s
{
	// Set to NULL by bli_thrcomm_init().
	// NOTE(review): presumably the pointer exchanged between the threads of
	// the communicator -- its use is not visible in this header.
	void*  sent_object;

	// Number of threads that participate in the communicator.
	dim_t  n_threads;

	// NOTE: barrier_sense was originally a gint_t-based bool_t, but upon
	// redefining bool_t as bool we discovered that some gcc __atomic built-ins
	// don't allow the use of bool for the variables being operated upon.
	// (Specifically, this was observed of __atomic_fetch_xor(), but it likely
	// applies to all other related built-ins.) Thus, we get around this by
	// redefining barrier_sense as a gint_t.
	// Both fields below are zeroed by bli_thrcomm_init().
	gint_t barrier_sense;
	dim_t  barrier_threads_arrived;
};

#endif

typedef struct thrcomm_s thrcomm_t;

#endif

#endif
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" thrinfo_t BLIS_PACKM_SINGLE_THREADED = {}; thrinfo_t BLIS_GEMM_SINGLE_THREADED = {}; thrcomm_t BLIS_SINGLE_COMM = {}; // The global rntm_t structure. (The definition resides in bli_rntm.c.) extern rntm_t global_rntm; // A mutex to allow synchronous access to global_rntm. (The definition // resides in bli_rntm.c.) 
extern bli_pthread_mutex_t global_rntm_mutex; // ----------------------------------------------------------------------------- void bli_thread_init( void ) { bli_thrcomm_init( 1, &BLIS_SINGLE_COMM ); bli_packm_thrinfo_init_single( &BLIS_PACKM_SINGLE_THREADED ); bli_l3_thrinfo_init_single( &BLIS_GEMM_SINGLE_THREADED ); // Read the environment variables and use them to initialize the // global runtime object. bli_thread_init_rntm_from_env( &global_rntm ); } void bli_thread_finalize( void ) { } // ----------------------------------------------------------------------------- void bli_thread_range_sub ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end ) { dim_t n_way = bli_thread_n_way( thread ); if ( n_way == 1 ) { *start = 0; *end = n; return; } dim_t work_id = bli_thread_work_id( thread ); dim_t all_start = 0; dim_t all_end = n; dim_t size = all_end - all_start; dim_t n_bf_whole = size / bf; dim_t n_bf_left = size % bf; dim_t n_bf_lo = n_bf_whole / n_way; dim_t n_bf_hi = n_bf_whole / n_way; // In this function, we partition the space between all_start and // all_end into n_way partitions, each a multiple of block_factor // with the exception of the one partition that recieves the // "edge" case (if applicable). // // Here are examples of various thread partitionings, in units of // the block_factor, when n_way = 4. (A '+' indicates the thread // that receives the leftover edge case (ie: n_bf_left extra // rows/columns in its sub-range). // (all_start ... all_end) // n_bf_whole _left hel n_th_lo _hi thr0 thr1 thr2 thr3 // 12 =0 f 0 4 3 3 3 3 // 12 >0 f 0 4 3 3 3 3+ // 13 >0 f 1 3 4 3 3 3+ // 14 >0 f 2 2 4 4 3 3+ // 15 >0 f 3 1 4 4 4 3+ // 15 =0 f 3 1 4 4 4 3 // // 12 =0 t 4 0 3 3 3 3 // 12 >0 t 4 0 3+ 3 3 3 // 13 >0 t 3 1 3+ 3 3 4 // 14 >0 t 2 2 3+ 3 4 4 // 15 >0 t 1 3 3+ 4 4 4 // 15 =0 t 1 3 3 4 4 4 // As indicated by the table above, load is balanced as equally // as possible, even in the presence of an edge case. 
// First, we must differentiate between cases where the leftover // "edge" case (n_bf_left) should be allocated to a thread partition // at the low end of the index range or the high end. if ( handle_edge_low == FALSE ) { // Notice that if all threads receive the same number of // block_factors, those threads are considered "high" and // the "low" thread group is empty. dim_t n_th_lo = n_bf_whole % n_way; //dim_t n_th_hi = n_way - n_th_lo; // If some partitions must have more block_factors than others // assign the slightly larger partitions to lower index threads. if ( n_th_lo != 0 ) n_bf_lo += 1; // Compute the actual widths (in units of rows/columns) of // individual threads in the low and high groups. dim_t size_lo = n_bf_lo * bf; dim_t size_hi = n_bf_hi * bf; // Precompute the starting indices of the low and high groups. dim_t lo_start = all_start; dim_t hi_start = all_start + n_th_lo * size_lo; // Compute the start and end of individual threads' ranges // as a function of their work_ids and also the group to which // they belong (low or high). if ( work_id < n_th_lo ) { *start = lo_start + (work_id ) * size_lo; *end = lo_start + (work_id+1) * size_lo; } else // if ( n_th_lo <= work_id ) { *start = hi_start + (work_id-n_th_lo ) * size_hi; *end = hi_start + (work_id-n_th_lo+1) * size_hi; // Since the edge case is being allocated to the high // end of the index range, we have to advance the last // thread's end. if ( work_id == n_way - 1 ) *end += n_bf_left; } } else // if ( handle_edge_low == TRUE ) { // Notice that if all threads receive the same number of // block_factors, those threads are considered "low" and // the "high" thread group is empty. dim_t n_th_hi = n_bf_whole % n_way; dim_t n_th_lo = n_way - n_th_hi; // If some partitions must have more block_factors than others // assign the slightly larger partitions to higher index threads. 
if ( n_th_hi != 0 ) n_bf_hi += 1; // Compute the actual widths (in units of rows/columns) of // individual threads in the low and high groups. dim_t size_lo = n_bf_lo * bf; dim_t size_hi = n_bf_hi * bf; // Precompute the starting indices of the low and high groups. dim_t lo_start = all_start; dim_t hi_start = all_start + n_th_lo * size_lo + n_bf_left; // Compute the start and end of individual threads' ranges // as a function of their work_ids and also the group to which // they belong (low or high). if ( work_id < n_th_lo ) { *start = lo_start + (work_id ) * size_lo; *end = lo_start + (work_id+1) * size_lo; // Since the edge case is being allocated to the low // end of the index range, we have to advance the // starts/ends accordingly. if ( work_id == 0 ) *end += n_bf_left; else { *start += n_bf_left; *end += n_bf_left; } } else // if ( n_th_lo <= work_id ) { *start = hi_start + (work_id-n_th_lo ) * size_hi; *end = hi_start + (work_id-n_th_lo+1) * size_hi; } } } siz_t bli_thread_range_l2r ( thrinfo_t* thr, obj_t* a, blksz_t* bmult, dim_t* start, dim_t* end ) { num_t dt = bli_obj_dt( a ); dim_t m = bli_obj_length_after_trans( a ); dim_t n = bli_obj_width_after_trans( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); bli_thread_range_sub( thr, n, bf, FALSE, start, end ); return m * ( *end - *start ); } siz_t bli_thread_range_r2l ( thrinfo_t* thr, obj_t* a, blksz_t* bmult, dim_t* start, dim_t* end ) { num_t dt = bli_obj_dt( a ); dim_t m = bli_obj_length_after_trans( a ); dim_t n = bli_obj_width_after_trans( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); bli_thread_range_sub( thr, n, bf, TRUE, start, end ); return m * ( *end - *start ); } siz_t bli_thread_range_t2b ( thrinfo_t* thr, obj_t* a, blksz_t* bmult, dim_t* start, dim_t* end ) { num_t dt = bli_obj_dt( a ); dim_t m = bli_obj_length_after_trans( a ); dim_t n = bli_obj_width_after_trans( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); bli_thread_range_sub( thr, m, bf, FALSE, start, end ); return n * ( *end - 
// Compute the width (number of columns) of the j-th column panel when an
// m x n lower-stored region is split n_way ways along the n dimension so
// that every panel covers roughly area_per_thr stored elements.
//
//   diagoff_j        diagonal offset of the region that is still to be
//                    partitioned.
//   m                number of rows of the region.
//   n_j              number of columns remaining in the region.
//   j                left-to-right index (0 .. n_way-1) of the subpartition
//                    whose width is requested.
//   n_way            total number of subpartitions.
//   bf               blocking factor; widths are multiples of bf except for
//                    the subpartition that receives the edge case.
//   bf_left          size of the edge case, as computed by the caller.
//   area_per_thr     target area per subpartition, chosen by the caller.
//   handle_edge_low  whether the edge case is assigned to the first
//                    subpartition rather than the last.
//
// Returns the width, which never exceeds n_j.
dim_t bli_thread_range_width_l
     (
       doff_t diagoff_j,
       dim_t  m,
       dim_t  n_j,
       dim_t  j,
       dim_t  n_way,
       dim_t  bf,
       dim_t  bf_left,
       double area_per_thr,
       bool   handle_edge_low
     )
{
	dim_t width;

	// In this function, we assume that we are somewhere in the process of
	// partitioning an m x n lower-stored region (with arbitrary diagonal
	// offset) n_ways along the n dimension (into column panels). The value
	// j identifies the left-to-right subpartition index (from 0 to n_way-1)
	// of the subpartition whose width we are about to compute using the
	// area per thread determined by the caller. n_j is the number of
	// columns in the remaining region of the matrix being partitioned,
	// and diagoff_j is that region's diagonal offset.

	// If this is the last subpartition, the width is simply equal to n_j.
	// Note that this statement handles cases where the "edge case" (if
	// one exists) is assigned to the high end of the index range (ie:
	// handle_edge_low == FALSE).
	if ( j == n_way - 1 ) return n_j;

	// At this point, we know there are at least two subpartitions left.
	// We also know that IF the submatrix contains a completely dense
	// rectangular submatrix, it will occur BEFORE the triangular (or
	// trapezoidal) part.

	// Here, we implement a somewhat minor load balancing optimization
	// that ends up getting employed only for relatively small matrices.
	// First, recall that all subpartition widths will be some multiple
	// of the blocking factor bf, except perhaps either the first or last
	// subpartition, which will receive the edge case, if it exists.
	// Also recall that j represents the current thread (or thread group,
	// or "caucus") for which we are computing a subpartition width.
	// If n_j is sufficiently small that we can only allocate bf columns
	// to each of the remaining threads, then we set the width to bf. We
	// do not allow the subpartition width to be less than bf, so, under
	// some conditions, if n_j is small enough, some of the remaining
	// threads may not get any work. For the purposes of this lower bound
	// on work (ie: width >= bf), we allow the edge case to count as a
	// "full" set of bf columns.
	{
		// Number of bf-sized column blocks left, counting the edge case
		// (if any) as one block.
		dim_t n_j_bf = n_j / bf + ( bf_left > 0 ? 1 : 0 );

		if ( n_j_bf <= n_way - j )
		{
			if ( j == 0 && handle_edge_low )
				width = ( bf_left > 0 ? bf_left : bf );
			else
				width = bf;

			// Make sure that the width does not exceed n_j. This would
			// occur if and when n_j_bf < n_way - j; that is, when the
			// matrix being partitioned is sufficiently small relative to
			// n_way such that there is not even enough work for every
			// (remaining) thread to get bf (or bf_left) columns. The
			// net effect of this safeguard is that some threads may get
			// assigned empty ranges (ie: no work), which of course must
			// happen in some situations.
			if ( width > n_j ) width = n_j;

			return width;
		}
	}

	// This block computes the width assuming that we are entirely within
	// a dense rectangle that precedes the triangular (or trapezoidal)
	// part.
	{
		// First compute the width of the current panel under the
		// assumption that the diagonal offset would not intersect.
		width = ( dim_t )bli_round( ( double )area_per_thr / ( double )m );

		// Adjust the width, if necessary. Specifically, we may need
		// to allocate the edge case to the first subpartition, if
		// requested; otherwise, we just need to ensure that the
		// subpartition is a multiple of the blocking factor.
		if ( j == 0 && handle_edge_low )
		{
			if ( width % bf != bf_left ) width += bf_left - ( width % bf );
		}
		else // if interior case
		{
			// Round up to the next multiple of the blocking factor.
			//if ( width % bf != 0 ) width += bf - ( width % bf );
			// Round to the nearest multiple of the blocking factor.
			if ( width % bf != 0 ) width = bli_round_to_mult( width, bf );
		}
	}

	// We need to recompute width if the panel, according to the width
	// as currently computed, would intersect the diagonal.
	if ( diagoff_j < width )
	{
		dim_t offm_inc, offn_inc;

		// Prune away the unstored region above the diagonal, if it exists.
		// Note that the entire region was pruned initially, so we know that
		// we don't need to try to prune the right side. (Also, we discard
		// the offset deltas since we don't need to actually index into the
		// subpartition.)
		bli_prune_unstored_region_top_l( &diagoff_j, &m, &n_j, &offm_inc );
		//bli_prune_unstored_region_right_l( &diagoff_j, &m, &n_j, &offn_inc );

		// We don't need offm_inc, offn_inc here. These statements should
		// prevent compiler warnings.
		( void )offm_inc;
		( void )offn_inc;

		// Prepare to solve a quadratic equation to find the width of the
		// current (jth) subpartition given the m dimension, diagonal offset,
		// and area.
		// NOTE: We know that the +/- in the quadratic formula must be a +
		// here because we know that the desired solution (the subpartition
		// width) will be smaller than (m + diagoff), not larger. If you
		// don't believe me, draw a picture!
		const double a = -0.5;
		const double b = ( double )m + ( double )diagoff_j + 0.5;
		const double c = -0.5 * (   ( double )diagoff_j *
		                          ( ( double )diagoff_j + 1.0 )
		                        ) - area_per_thr;
		// Discriminant of a*x^2 + b*x + c = 0.
		const double r = b * b - 4.0 * a * c;

		// If the quadratic solution is not imaginary, round it and use that
		// as our width, but make sure it didn't round to zero. Otherwise,
		// discard the quadratic solution and leave width, as previously
		// computed, unchanged.
		if ( r >= 0.0 )
		{
			const double x = ( -b + sqrt( r ) ) / ( 2.0 * a );

			width = ( dim_t )bli_round( x );
			if ( width == 0 ) width = 1;
		}

		// Adjust the width, if necessary, using the same rules as for the
		// dense-rectangle estimate above.
		if ( j == 0 && handle_edge_low )
		{
			if ( width % bf != bf_left ) width += bf_left - ( width % bf );
		}
		else // if interior case
		{
			// Round up to the next multiple of the blocking factor.
			//if ( width % bf != 0 ) width += bf - ( width % bf );
			// Round to the nearest multiple of the blocking factor.
			if ( width % bf != 0 ) width = bli_round_to_mult( width, bf );
		}
	}

	// Make sure that the width, after being adjusted, does not cause the
	// subpartition to exceed n_j.
	if ( width > n_j ) width = n_j;

	return width;
}
tri_area = 0.0; } area = ( double )m * ( double )n - tri_area; return ( siz_t )area; } // ----------------------------------------------------------------------------- siz_t bli_thread_range_weighted_sub ( thrinfo_t* restrict thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* restrict j_start_thr, dim_t* restrict j_end_thr ) { dim_t n_way = bli_thread_n_way( thread ); dim_t my_id = bli_thread_work_id( thread ); dim_t bf_left = n % bf; dim_t j; dim_t off_j; doff_t diagoff_j; dim_t n_left; dim_t width_j; dim_t offm_inc, offn_inc; double tri_dim, tri_area; double area_total, area_per_thr; siz_t area = 0; // In this function, we assume that the caller has already determined // that (a) the diagonal intersects the submatrix, and (b) the submatrix // is either lower- or upper-stored. if ( bli_is_lower( uplo ) ) { // Prune away the unstored region above the diagonal, if it exists, // and then to the right of where the diagonal intersects the bottom, // if it exists. (Also, we discard the offset deltas since we don't // need to actually index into the subpartition.) bli_prune_unstored_region_top_l( &diagoff, &m, &n, &offm_inc ); bli_prune_unstored_region_right_l( &diagoff, &m, &n, &offn_inc ); // We don't need offm_inc, offn_inc here. These statements should // prevent compiler warnings. ( void )offm_inc; ( void )offn_inc; // Now that pruning has taken place, we know that diagoff >= 0. // Compute the total area of the submatrix, accounting for the // location of the diagonal, and divide it by the number of ways // of parallelism. tri_dim = ( double )( n - diagoff - 1 ); tri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0; area_total = ( double )m * ( double )n - tri_area; area_per_thr = area_total / ( double )n_way; // Initialize some variables prior to the loop: the offset to the // current subpartition, the remainder of the n dimension, and // the diagonal offset of the current subpartition. 
off_j = 0; diagoff_j = diagoff; n_left = n; // Iterate over the subpartition indices corresponding to each // thread/caucus participating in the n_way parallelism. for ( j = 0; j < n_way; ++j ) { // Compute the width of the jth subpartition, taking the // current diagonal offset into account, if needed. width_j = bli_thread_range_width_l ( diagoff_j, m, n_left, j, n_way, bf, bf_left, area_per_thr, handle_edge_low ); // If the current thread belongs to caucus j, this is his // subpartition. So we compute the implied index range and // end our search. if ( j == my_id ) { *j_start_thr = off_j; *j_end_thr = off_j + width_j; area = bli_find_area_trap_l( m, width_j, diagoff_j ); break; } // Shift the current subpartition's starting and diagonal offsets, // as well as the remainder of the n dimension, according to the // computed width, and then iterate to the next subpartition. off_j += width_j; diagoff_j -= width_j; n_left -= width_j; } } else // if ( bli_is_upper( uplo ) ) { // Express the upper-stored case in terms of the lower-stored case. // First, we convert the upper-stored trapezoid to an equivalent // lower-stored trapezoid by rotating it 180 degrees. bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); // Now that the trapezoid is "flipped" in the n dimension, negate // the bool that encodes whether to handle the edge case at the // low (or high) end of the index range. bli_toggle_bool( &handle_edge_low ); // Compute the appropriate range for the rotated trapezoid. area = bli_thread_range_weighted_sub ( thread, diagoff, uplo, m, n, bf, handle_edge_low, j_start_thr, j_end_thr ); // Reverse the indexing basis for the subpartition ranges so that // the indices, relative to left-to-right iteration through the // unrotated upper-stored trapezoid, map to the correct columns // (relative to the diagonal). This amounts to subtracting the // range from n. 
bli_reverse_index_direction( n, j_start_thr, j_end_thr ); } return area; } siz_t bli_thread_range_mdim ( dir_t direct, thrinfo_t* thr, obj_t* a, obj_t* b, obj_t* c, cntl_t* cntl, cntx_t* cntx, dim_t* start, dim_t* end ) { bszid_t bszid = bli_cntl_bszid( cntl ); opid_t family = bli_cntl_family( cntl ); // This is part of trsm's current implementation, whereby right side // cases are implemented in left-side micro-kernels, which requires // we swap the usage of the register blocksizes for the purposes of // packing A and B. if ( family == BLIS_TRSM ) { if ( bli_obj_root_is_triangular( a ) ) bszid = BLIS_MR; else bszid = BLIS_NR; } blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); obj_t* x; bool use_weighted; // Use the operation family to choose the one of the two matrices // being partitioned that potentially has structure, and also to // decide whether or not we need to use weighted range partitioning. // NOTE: It's important that we use non-weighted range partitioning // for hemm and symm (ie: the gemm family) because the weighted // function will mistakenly skip over unstored regions of the // structured matrix, even though they represent part of that matrix // that will be dense and full (after packing). 
if ( family == BLIS_GEMM ) { x = a; use_weighted = FALSE; } else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } else if ( family == BLIS_TRMM ) { x = a; use_weighted = TRUE; } else /*family == BLIS_TRSM*/ { x = a; use_weighted = FALSE; } if ( use_weighted ) { if ( direct == BLIS_FWD ) return bli_thread_range_weighted_t2b( thr, x, bmult, start, end ); else return bli_thread_range_weighted_b2t( thr, x, bmult, start, end ); } else { if ( direct == BLIS_FWD ) return bli_thread_range_t2b( thr, x, bmult, start, end ); else return bli_thread_range_b2t( thr, x, bmult, start, end ); } } siz_t bli_thread_range_ndim ( dir_t direct, thrinfo_t* thr, obj_t* a, obj_t* b, obj_t* c, cntl_t* cntl, cntx_t* cntx, dim_t* start, dim_t* end ) { bszid_t bszid = bli_cntl_bszid( cntl ); opid_t family = bli_cntl_family( cntl ); // This is part of trsm's current implementation, whereby right side // cases are implemented in left-side micro-kernels, which requires // we swap the usage of the register blocksizes for the purposes of // packing A and B. if ( family == BLIS_TRSM ) { if ( bli_obj_root_is_triangular( b ) ) bszid = BLIS_MR; else bszid = BLIS_NR; } blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); obj_t* x; bool use_weighted; // Use the operation family to choose the one of the two matrices // being partitioned that potentially has structure, and also to // decide whether or not we need to use weighted range partitioning. // NOTE: It's important that we use non-weighted range partitioning // for hemm and symm (ie: the gemm family) because the weighted // function will mistakenly skip over unstored regions of the // structured matrix, even though they represent part of that matrix // that will be dense and full (after packing). 
if ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; } else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE; } else /*family == BLIS_TRSM*/ { x = b; use_weighted = FALSE; } if ( use_weighted ) { if ( direct == BLIS_FWD ) return bli_thread_range_weighted_l2r( thr, x, bmult, start, end ); else return bli_thread_range_weighted_r2l( thr, x, bmult, start, end ); } else { if ( direct == BLIS_FWD ) return bli_thread_range_l2r( thr, x, bmult, start, end ); else return bli_thread_range_r2l( thr, x, bmult, start, end ); } } siz_t bli_thread_range_weighted_l2r ( thrinfo_t* thr, obj_t* a, blksz_t* bmult, dim_t* start, dim_t* end ) { siz_t area; // This function assigns area-weighted ranges in the n dimension // where the total range spans 0 to n-1 with 0 at the left end and // n-1 at the right end. if ( bli_obj_intersects_diag( a ) && bli_obj_is_upper_or_lower( a ) ) { num_t dt = bli_obj_dt( a ); doff_t diagoff = bli_obj_diag_offset( a ); uplo_t uplo = bli_obj_uplo( a ); dim_t m = bli_obj_length( a ); dim_t n = bli_obj_width( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. if ( bli_obj_has_trans( a ) ) { bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); } area = bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, FALSE, start, end ); } else // if dense or zeros { area = bli_thread_range_l2r ( thr, a, bmult, start, end ); } return area; } siz_t bli_thread_range_weighted_r2l ( thrinfo_t* thr, obj_t* a, blksz_t* bmult, dim_t* start, dim_t* end ) { siz_t area; // This function assigns area-weighted ranges in the n dimension // where the total range spans 0 to n-1 with 0 at the right end and // n-1 at the left end. 
if ( bli_obj_intersects_diag( a ) && bli_obj_is_upper_or_lower( a ) ) { num_t dt = bli_obj_dt( a ); doff_t diagoff = bli_obj_diag_offset( a ); uplo_t uplo = bli_obj_uplo( a ); dim_t m = bli_obj_length( a ); dim_t n = bli_obj_width( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. if ( bli_obj_has_trans( a ) ) { bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); } bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); area = bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, TRUE, start, end ); } else // if dense or zeros { area = bli_thread_range_r2l ( thr, a, bmult, start, end ); } return area; } siz_t bli_thread_range_weighted_t2b ( thrinfo_t* thr, obj_t* a, blksz_t* bmult, dim_t* start, dim_t* end ) { siz_t area; // This function assigns area-weighted ranges in the m dimension // where the total range spans 0 to m-1 with 0 at the top end and // m-1 at the bottom end. if ( bli_obj_intersects_diag( a ) && bli_obj_is_upper_or_lower( a ) ) { num_t dt = bli_obj_dt( a ); doff_t diagoff = bli_obj_diag_offset( a ); uplo_t uplo = bli_obj_uplo( a ); dim_t m = bli_obj_length( a ); dim_t n = bli_obj_width( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. if ( bli_obj_has_trans( a ) ) { bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); } bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); area = bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, FALSE, start, end ); } else // if dense or zeros { area = bli_thread_range_t2b ( thr, a, bmult, start, end ); } return area; } siz_t bli_thread_range_weighted_b2t ( thrinfo_t* thr, obj_t* a, blksz_t* bmult, dim_t* start, dim_t* end ) { siz_t area; // This function assigns area-weighted ranges in the m dimension // where the total range spans 0 to m-1 with 0 at the bottom end and // m-1 at the top end. 
if ( bli_obj_intersects_diag( a ) && bli_obj_is_upper_or_lower( a ) ) { num_t dt = bli_obj_dt( a ); doff_t diagoff = bli_obj_diag_offset( a ); uplo_t uplo = bli_obj_uplo( a ); dim_t m = bli_obj_length( a ); dim_t n = bli_obj_width( a ); dim_t bf = bli_blksz_get_def( dt, bmult ); // Support implicit transposition. if ( bli_obj_has_trans( a ) ) { bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); } bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); area = bli_thread_range_weighted_sub ( thr, diagoff, uplo, m, n, bf, TRUE, start, end ); } else // if dense or zeros { area = bli_thread_range_b2t ( thr, a, bmult, start, end ); } return area; } // ----------------------------------------------------------------------------- void bli_prime_factorization( dim_t n, bli_prime_factors_t* factors ) { factors->n = n; factors->sqrt_n = ( dim_t )sqrt( ( double )n ); factors->f = 2; } dim_t bli_next_prime_factor( bli_prime_factors_t* factors ) { // Return the prime factorization of the original number n one-by-one. // Return 1 after all factors have been exhausted. // Looping over possible factors in increasing order assures we will // only return prime factors (a la the Sieve of Eratosthenes). while ( factors->f <= factors->sqrt_n ) { // Special cases for factors 2-7 handle all numbers not divisible by 11 // or another larger prime. The slower loop version is used after that. // If you use a number of threads with large prime factors you get // what you deserve. 
if ( factors->f == 2 ) { if ( factors->n % 2 == 0 ) { factors->n /= 2; return 2; } factors->f = 3; } else if ( factors->f == 3 ) { if ( factors->n % 3 == 0 ) { factors->n /= 3; return 3; } factors->f = 5; } else if ( factors->f == 5 ) { if ( factors->n % 5 == 0 ) { factors->n /= 5; return 5; } factors->f = 7; } else if ( factors->f == 7 ) { if ( factors->n % 7 == 0 ) { factors->n /= 7; return 7; } factors->f = 11; } else { if ( factors->n % factors->f == 0 ) { factors->n /= factors->f; return factors->f; } factors->f++; } } // To get here we must be out of prime factors, leaving only n (if it is // prime) or an endless string of 1s. dim_t tmp = factors->n; factors->n = 1; return tmp; } bool bli_is_prime( dim_t n ) { bli_prime_factors_t factors; bli_prime_factorization( n, &factors ); dim_t f = bli_next_prime_factor( &factors ); if ( f == n ) return TRUE; else return FALSE; } void bli_thread_partition_2x2 ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ) { // Partition a number of threads into two factors nt1 and nt2 such that // nt1/nt2 ~= work1/work2. There is a fast heuristic algorithm and a // slower optimal algorithm (which minimizes |nt1*work2 - nt2*work1|). // Return early small prime numbers of threads. if ( n_thread < 4 ) { *nt1 = ( work1 >= work2 ? n_thread : 1 ); *nt2 = ( work1 < work2 ? n_thread : 1 ); return; } #if 1 bli_thread_partition_2x2_fast( n_thread, work1, work2, nt1, nt2 ); #else bli_thread_partition_2x2_slow( n_thread, work1, work2, nt1, nt2 ); #endif } //#define PRINT_FACTORS void bli_thread_partition_2x2_fast ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ) { // Compute with these local variables until the end of the function, at // which time we will save the values back to nt1 and nt2. dim_t tn1 = 1; dim_t tn2 = 1; // Both algorithms need the prime factorization of n_thread. 
bli_prime_factors_t factors; bli_prime_factorization( n_thread, &factors ); // Fast algorithm: assign prime factors in increasing order to whichever // partition has more work to do. The work is divided by the number of // threads assigned at each iteration. This algorithm is sub-optimal in // some cases. We attempt to mitigate the cases that involve at least one // factor of 2. For example, in the partitioning of 12 with equal work // this algorithm tentatively finds 6x2. This factorization involves a // factor of 2 that can be reallocated, allowing us to convert it to the // optimal solution of 4x3. But some cases cannot be corrected this way // because they do not contain a factor of 2. For example, this algorithm // factors 105 (with equal work) into 21x5 whereas 7x15 would be optimal. #ifdef PRINT_FACTORS printf( "w1 w2 = %d %d (initial)\n", (int)work1, (int)work2 ); #endif dim_t f; while ( ( f = bli_next_prime_factor( &factors ) ) > 1 ) { #ifdef PRINT_FACTORS printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d ... f = %d\n", (int)work1, (int)work2, (int)tn1, (int)tn2, (int)f ); #endif if ( work1 > work2 ) { work1 /= f; tn1 *= f; } else { work2 /= f; tn2 *= f; } } #ifdef PRINT_FACTORS printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d\n", (int)work1, (int)work2, (int)tn1, (int)tn2 ); #endif // Sometimes the last factor applied is prime. For example, on a square // matrix, we tentatively arrive (from the logic above) at: // - a 2x6 factorization when given 12 ways of parallelism // - a 2x10 factorization when given 20 ways of parallelism // - a 2x14 factorization when given 28 ways of parallelism // These factorizations are suboptimal under the assumption that we want // the parallelism to be as balanced as possible. Below, we make a final // attempt at rebalancing nt1 and nt2 by checking to see if the gap between // work1 and work2 is narrower if we reallocate a factor of 2. 
if ( work1 > work2 ) { // Example: nt = 12 // w1 w2 (initial) = 3600 3600; nt1 nt2 = 1 1 // w1 w2 (tentative) = 1800 600; nt1 nt2 = 2 6 // w1 w2 (ideal) = 900 1200; nt1 nt2 = 4 3 if ( tn2 % 2 == 0 ) { dim_t diff = work1 - work2; dim_t diff_mod = bli_abs( work1/2 - work2*2 ); if ( diff_mod < diff ) { tn1 *= 2; tn2 /= 2; } } } else if ( work1 < work2 ) { // Example: nt = 40 // w1 w2 (initial) = 3600 3600; nt1 nt2 = 1 1 // w1 w2 (tentative) = 360 900; nt1 nt2 = 10 4 // w1 w2 (ideal) = 720 450; nt1 nt2 = 5 8 if ( tn1 % 2 == 0 ) { dim_t diff = work2 - work1; dim_t diff_mod = bli_abs( work2/2 - work1*2 ); if ( diff_mod < diff ) { tn1 /= 2; tn2 *= 2; } } } #ifdef PRINT_FACTORS printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d (final)\n", (int)work1, (int)work2, (int)tn1, (int)tn2 ); #endif // Save the final result. *nt1 = tn1; *nt2 = tn2; } #include "limits.h" void bli_thread_partition_2x2_slow ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ) { // Slow algorithm: exhaustively constructs all factor pairs of n_thread and // chooses the best one. // Compute with these local variables until the end of the function, at // which time we will save the values back to nt1 and nt2. dim_t tn1 = 1; dim_t tn2 = 1; // Both algorithms need the prime factorization of n_thread. bli_prime_factors_t factors; bli_prime_factorization( n_thread, &factors ); // Eight prime factors handles n_thread up to 223092870. dim_t fact[8]; dim_t mult[8]; // There is always at least one prime factor, so use if for initialization. dim_t nfact = 1; fact[0] = bli_next_prime_factor( &factors ); mult[0] = 1; // Collect the remaining prime factors, accounting for multiplicity of // repeated factors. dim_t f; while ( ( f = bli_next_prime_factor( &factors ) ) > 1 ) { if ( f == fact[nfact-1] ) { mult[nfact-1]++; } else { nfact++; fact[nfact-1] = f; mult[nfact-1] = 1; } } // Now loop over all factor pairs. 
A single factor pair is denoted by how // many of each prime factor are included in the first factor (ntaken). dim_t ntake[8] = {0}; dim_t min_diff = INT_MAX; // Loop over how many prime factors to assign to the first factor in the // pair, for each prime factor. The total number of iterations is // \Prod_{i=0}^{nfact-1} mult[i]. bool done = FALSE; while ( !done ) { dim_t x = 1; dim_t y = 1; // Form the factors by integer exponentiation and accumulation. for ( dim_t i = 0 ; i < nfact ; i++ ) { x *= bli_ipow( fact[i], ntake[i] ); y *= bli_ipow( fact[i], mult[i]-ntake[i] ); } // Check if this factor pair is optimal by checking // |nt1*work2 - nt2*work1|. dim_t diff = llabs( x*work2 - y*work1 ); if ( diff < min_diff ) { min_diff = diff; tn1 = x; tn2 = y; } // Go to the next factor pair by doing an "odometer loop". for ( dim_t i = 0 ; i < nfact ; i++ ) { if ( ++ntake[i] > mult[i] ) { ntake[i] = 0; if ( i == nfact-1 ) done = TRUE; else continue; } break; } } // Save the final result. *nt1 = tn1; *nt2 = tn2; } #if 0 void bli_thread_partition_2x2_orig ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ) { // Copy nt1 and nt2 to local variables and then compute with those local // variables until the end of the function, at which time we will save the // values back to nt1 and nt2. dim_t tn1; // = *nt1; dim_t tn2; // = *nt2; // Partition a number of threads into two factors nt1 and nt2 such that // nt1/nt2 ~= work1/work2. There is a fast heuristic algorithm and a // slower optimal algorithm (which minimizes |nt1*work2 - nt2*work1|). // Return early small prime numbers of threads. if ( n_thread < 4 ) { tn1 = ( work1 >= work2 ? n_thread : 1 ); tn2 = ( work1 < work2 ? n_thread : 1 ); return; } tn1 = 1; tn2 = 1; // Both algorithms need the prime factorization of n_thread. 
bli_prime_factors_t factors; bli_prime_factorization( n_thread, &factors ); #if 1 // Fast algorithm: assign prime factors in increasing order to whichever // partition has more work to do. The work is divided by the number of // threads assigned at each iteration. This algorithm is sub-optimal in // some cases. We attempt to mitigate the cases that involve at least one // factor of 2. For example, in the partitioning of 12 with equal work // this algorithm tentatively finds 6x2. This factorization involves a // factor of 2 that can be reallocated, allowing us to convert it to the // optimal solution of 4x3. But some cases cannot be corrected this way // because they do not contain a factor of 2. For example, this algorithm // factors 105 (with equal work) into 21x5 whereas 7x15 would be optimal. //printf( "w1 w2 = %d %d (initial)\n", (int)work1, (int)work2 ); dim_t f; while ( ( f = bli_next_prime_factor( &factors ) ) > 1 ) { //printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d ... f = %d\n", (int)work1, (int)work2, (int)tn1, (int)tn2, (int)f ); if ( work1 > work2 ) { work1 /= f; tn1 *= f; } else { work2 /= f; tn2 *= f; } } //printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d\n", (int)work1, (int)work2, (int)tn1, (int)tn2 ); // Sometimes the last factor applied is prime. For example, on a square // matrix, we tentatively arrive (from the logic above) at: // - a 2x6 factorization when given 12 ways of parallelism // - a 2x10 factorization when given 20 ways of parallelism // - a 2x14 factorization when given 28 ways of parallelism // These factorizations are suboptimal under the assumption that we want // the parallelism to be as balanced as possible. Below, we make a final // attempt at rebalancing nt1 and nt2 by checking to see if the gap between // work1 and work2 is narrower if we reallocate a factor of 2. 
if ( work1 > work2 ) { // Example: nt = 12 // w1 w2 (initial) = 3600 3600; nt1 nt2 = 1 1 // w1 w2 (tentative) = 1800 600; nt1 nt2 = 2 6 // w1 w2 (ideal) = 900 1200; nt1 nt2 = 4 3 if ( tn2 % 2 == 0 ) { dim_t diff = work1 - work2; dim_t diff_mod = bli_abs( work1/2 - work2*2 ); if ( diff_mod < diff ) { tn1 *= 2; tn2 /= 2; } } } else if ( work1 < work2 ) { // Example: nt = 40 // w1 w2 (initial) = 3600 3600; nt1 nt2 = 1 1 // w1 w2 (tentative) = 360 900; nt1 nt2 = 10 4 // w1 w2 (ideal) = 720 450; nt1 nt2 = 5 8 if ( tn1 % 2 == 0 ) { dim_t diff = work2 - work1; dim_t diff_mod = bli_abs( work2/2 - work1*2 ); if ( diff_mod < diff ) { tn1 /= 2; tn2 *= 2; } } } //printf( "w1 w2 = %4d %4d nt1 nt2 = %d %d (final)\n", (int)work1, (int)work2, (int)tn1, (int)tn2 ); #else // Slow algorithm: exhaustively constructs all factor pairs of n_thread and // chooses the best one. // Eight prime factors handles n_thread up to 223092870. dim_t fact[8]; dim_t mult[8]; // There is always at least one prime factor, so use if for initialization. dim_t nfact = 1; fact[0] = bli_next_prime_factor( &factors ); mult[0] = 1; // Collect the remaining prime factors, accounting for multiplicity of // repeated factors. dim_t f; while ( ( f = bli_next_prime_factor( &factors ) ) > 1 ) { if ( f == fact[nfact-1] ) { mult[nfact-1]++; } else { nfact++; fact[nfact-1] = f; mult[nfact-1] = 1; } } // Now loop over all factor pairs. A single factor pair is denoted by how // many of each prime factor are included in the first factor (ntaken). dim_t ntake[8] = {0}; dim_t min_diff = INT_MAX; // Loop over how many prime factors to assign to the first factor in the // pair, for each prime factor. The total number of iterations is // \Prod_{i=0}^{nfact-1} mult[i]. bool done = FALSE; while ( !done ) { dim_t x = 1; dim_t y = 1; // Form the factors by integer exponentiation and accumulation. 
for (dim_t i = 0 ; i < nfact ; i++ ) { x *= bli_ipow( fact[i], ntake[i] ); y *= bli_ipow( fact[i], mult[i]-ntake[i] ); } // Check if this factor pair is optimal by checking // |nt1*work2 - nt2*work1|. dim_t diff = llabs( x*work2 - y*work1 ); if ( diff < min_diff ) { min_diff = diff; tn1 = x; tn2 = y; } // Go to the next factor pair by doing an "odometer loop". for ( dim_t i = 0 ; i < nfact ; i++ ) { if ( ++ntake[i] > mult[i] ) { ntake[i] = 0; if ( i == nfact-1 ) done = TRUE; else continue; } break; } } #endif // Save the final result. *nt1 = tn1; *nt2 = tn2; } #endif // ----------------------------------------------------------------------------- dim_t bli_gcd( dim_t x, dim_t y ) { while ( y != 0 ) { dim_t t = y; y = x % y; x = t; } return x; } dim_t bli_lcm( dim_t x, dim_t y) { return x * y / bli_gcd( x, y ); } dim_t bli_ipow( dim_t base, dim_t power ) { dim_t p = 1; for ( dim_t mask = 0x1 ; mask <= power ; mask <<= 1 ) { if ( power & mask ) p *= base; base *= base; } return p; } // ----------------------------------------------------------------------------- dim_t bli_thread_get_jc_nt( void ) { // We must ensure that global_rntm has been initialized. bli_init_once(); return bli_rntm_jc_ways( &global_rntm ); } dim_t bli_thread_get_pc_nt( void ) { // We must ensure that global_rntm has been initialized. bli_init_once(); return bli_rntm_pc_ways( &global_rntm ); } dim_t bli_thread_get_ic_nt( void ) { // We must ensure that global_rntm has been initialized. bli_init_once(); return bli_rntm_ic_ways( &global_rntm ); } dim_t bli_thread_get_jr_nt( void ) { // We must ensure that global_rntm has been initialized. bli_init_once(); return bli_rntm_jr_ways( &global_rntm ); } dim_t bli_thread_get_ir_nt( void ) { // We must ensure that global_rntm has been initialized. bli_init_once(); return bli_rntm_ir_ways( &global_rntm ); } dim_t bli_thread_get_num_threads( void ) { // We must ensure that global_rntm has been initialized. 
bli_init_once(); return bli_rntm_num_threads( &global_rntm ); } // ---------------------------------------------------------------------------- void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ) { // We must ensure that global_rntm has been initialized. bli_init_once(); // Acquire the mutex protecting global_rntm. bli_pthread_mutex_lock( &global_rntm_mutex ); bli_rntm_set_ways_only( jc, pc, ic, jr, ir, &global_rntm ); // Release the mutex protecting global_rntm. bli_pthread_mutex_unlock( &global_rntm_mutex ); } void bli_thread_set_num_threads( dim_t n_threads ) { // We must ensure that global_rntm has been initialized. bli_init_once(); // Acquire the mutex protecting global_rntm. bli_pthread_mutex_lock( &global_rntm_mutex ); bli_rntm_set_num_threads_only( n_threads, &global_rntm ); // Release the mutex protecting global_rntm. bli_pthread_mutex_unlock( &global_rntm_mutex ); } // ---------------------------------------------------------------------------- void bli_thread_init_rntm_from_env ( rntm_t* rntm ) { // NOTE: We don't need to acquire the global_rntm_mutex here because this // function is only called from bli_thread_init(), which is only called // by bli_init_once(). bool auto_factor = FALSE; dim_t nt; dim_t jc, pc, ic, jr, ir; #ifdef BLIS_ENABLE_MULTITHREADING // Try to read BLIS_NUM_THREADS first. nt = bli_env_get_var( "BLIS_NUM_THREADS", -1 ); // If BLIS_NUM_THREADS was not set, try to read OMP_NUM_THREADS. if ( nt == -1 ) nt = bli_env_get_var( "OMP_NUM_THREADS", -1 ); // Read the environment variables for the number of threads (ways // of parallelism) for each individual loop. 
jc = bli_env_get_var( "BLIS_JC_NT", -1 ); pc = bli_env_get_var( "BLIS_PC_NT", -1 ); ic = bli_env_get_var( "BLIS_IC_NT", -1 ); jr = bli_env_get_var( "BLIS_JR_NT", -1 ); ir = bli_env_get_var( "BLIS_IR_NT", -1 ); // If any BLIS_*_NT environment variable was set, then we ignore the // value of BLIS_NUM_THREADS or OMP_NUM_THREADS and use the // BLIS_*_NT values instead (with unset variables being treated as if // they contained 1). if ( jc != -1 || pc != -1 || ic != -1 || jr != -1 || ir != -1 ) { if ( jc == -1 ) jc = 1; if ( pc == -1 ) pc = 1; if ( ic == -1 ) ic = 1; if ( jr == -1 ) jr = 1; if ( ir == -1 ) ir = 1; // Unset the value for nt. nt = -1; } // By this time, one of the following conditions holds: // - nt is -1 and the ways for each loop are -1. // - nt is -1 and the ways for each loop are all set. // - nt is set and the ways for each loop are -1. // If nt is set (ie: not -1), then we know we will perform an automatic // thread factorization (later, in bli_rntm.c). if ( nt != -1 ) auto_factor = TRUE; #else // When multithreading is disabled, always set the rntm_t ways // values to 1. nt = -1; jc = pc = ic = jr = ir = 1; #endif // Save the results back in the runtime object. bli_rntm_set_auto_factor_only( auto_factor, rntm ); bli_rntm_set_num_threads_only( nt, rntm ); bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm ); #if 0 printf( "bli_thread_init_rntm_from_env()\n" ); bli_rntm_print( rntm ); #endif } cython-blis-0.9.1/blis/_src/frame/thread/bli_thread.h000066400000000000000000000204421427272030600224470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. #include "bli_thrcomm.h" // Include thread info (thrinfo_t) object definitions and prototypes. #include "bli_thrinfo.h" #include "bli_thrinfo_sup.h" // Include some operation-specific thrinfo_t prototypes. // Note that the bli_packm_thrinfo.h must be included before the others! 
// Iterator state for enumerating the prime factors of an integer one at a
// time. It is initialized by bli_prime_factorization() and advanced by
// bli_next_prime_factor().
typedef struct
{
	// The part of the original number not yet returned as factors. It
	// starts as the number itself, is divided by each factor returned, and
	// is set to 1 once the final factor has been handed out.
	dim_t n;

	// Truncated square root of the original number, computed once at
	// initialization; trial division stops when f exceeds it.
	dim_t sqrt_n;

	// The next candidate divisor to try; starts at 2.
	dim_t f;
} bli_prime_factors_t;
bli_next_prime_factor(bli_prime_factors_t* factors); bool bli_is_prime( dim_t n ); void bli_thread_partition_2x2 ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_slow ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_fast ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); // ----------------------------------------------------------------------------- dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); // ----------------------------------------------------------------------------- BLIS_INLINE void bli_thread_range_jrir_rr ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; } BLIS_INLINE void bli_thread_range_jrir_sl ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use contiguous slab partitioning of jr/ir loops. 
bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); *inc = 1; } BLIS_INLINE void bli_thread_range_jrir ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Define a general-purpose version of bli_thread_range_jrir() whose // definition depends on whether slab or round-robin partitioning was // requested at configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); #else bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); #endif } #if 0 BLIS_INLINE void bli_thread_range_weighted_jrir ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { #ifdef BLIS_ENABLE_JRIR_SLAB // Use contiguous slab partitioning for jr/ir loops. bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, handle_edge_low, start, end ); *start = *start / bf; *inc = 1; if ( *end % bf ) *end = *end / bf + 1; else *end = *end / bf; #else // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; #endif } #endif #endif cython-blis-0.9.1/blis/_src/frame/thread/bli_thrinfo.c000066400000000000000000000475011427272030600226510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" thrinfo_t* bli_thrinfo_create ( rntm_t* rntm, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrinfo_create(): " ); #endif thrinfo_t* thread = bli_sba_acquire( rntm, sizeof( thrinfo_t ) ); bli_thrinfo_init ( thread, ocomm, ocomm_id, n_way, work_id, free_comm, bszid, sub_node ); return thread; } void bli_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node ) { bli_thrinfo_set_ocomm( ocomm, thread ); bli_thrinfo_set_ocomm_id( ocomm_id, thread ); bli_thrinfo_set_n_way( n_way, thread ); bli_thrinfo_set_work_id( work_id, thread ); bli_thrinfo_set_free_comm( free_comm, thread ); bli_thrinfo_set_bszid( bszid, thread ); bli_thrinfo_set_sub_node( sub_node, thread ); bli_thrinfo_set_sub_prenode( NULL, thread ); } void bli_thrinfo_init_single ( thrinfo_t* thread ) { bli_thrinfo_init ( thread, &BLIS_SINGLE_COMM, 0, 1, 0, FALSE, BLIS_NO_PART, thread ); } void bli_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ) { if ( thread == NULL || thread == &BLIS_PACKM_SINGLE_THREADED || thread == &BLIS_GEMM_SINGLE_THREADED ) return; thrinfo_t* thrinfo_sub_prenode = bli_thrinfo_sub_prenode( thread ); thrinfo_t* thrinfo_sub_node = bli_thrinfo_sub_node( thread ); // Recursively free all children of the current thrinfo_t. if ( thrinfo_sub_prenode != NULL ) { bli_thrinfo_free( rntm, thrinfo_sub_prenode ); } // Recursively free all children of the current thrinfo_t. if ( thrinfo_sub_node != NULL ) { bli_thrinfo_free( rntm, thrinfo_sub_node ); } // Free the communicators, but only if the current thrinfo_t struct // is marked as needing them to be freed. The most common example of // thrinfo_t nodes NOT marked as needing their comms freed are those // associated with packm thrinfo_t nodes. 
if ( bli_thrinfo_needs_free_comm( thread ) ) { // The ochief always frees his communicator. if ( bli_thread_am_ochief( thread ) ) bli_thrcomm_free( rntm, bli_thrinfo_ocomm( thread ) ); } #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrinfo_free(): " ); #endif // Free the thrinfo_t struct. bli_sba_release( rntm, thread ); } // ----------------------------------------------------------------------------- void bli_thrinfo_grow ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { // First, consider the prenode branch of the thrinfo_t tree, which should be // expanded only if there exists a prenode branch in the cntl_t tree. if ( bli_cntl_sub_prenode( cntl ) != NULL ) { // We only need to take action if the thrinfo_t sub-node is NULL; if it // is non-NULL, then it has already been created and we'll use it as-is. if ( bli_thrinfo_sub_prenode( thread ) == NULL ) { // Assertion / sanity check. if ( bli_cntl_bszid( cntl ) != BLIS_MC ) { printf( "Assertion failed: Expanding prenode for non-IC loop?\n" ); bli_abort(); } // Now we must create the packa, jr, and ir nodes that make up // the prenode branch of current cntl_t node. // Create a new node (or, if needed, multiple nodes) along the // prenode branch of the tree and return the pointer to the // (highest) child. thrinfo_t* thread_prenode = bli_thrinfo_rgrow_prenode ( rntm, cntl, bli_cntl_sub_prenode( cntl ), thread ); // Attach the child thrinfo_t node for the secondary branch to its // parent structure. bli_thrinfo_set_sub_prenode( thread_prenode, thread ); } } // Now, grow the primary branch of the thrinfo_t tree. // NOTE: If bli_thrinfo_rgrow() is being called, the sub_node field will // always be non-NULL, and so there's no need to check it. //if ( bli_cntl_sub_node( cntl ) != NULL ) { // We only need to take action if the thrinfo_t sub-node is NULL; if it // is non-NULL, then it has already been created and we'll use it as-is. 
if ( bli_thrinfo_sub_node( thread ) == NULL ) { // Create a new node (or, if needed, multiple nodes) along the // main sub-node branch of the tree and return the pointer to the // (highest) child. thrinfo_t* thread_child = bli_thrinfo_rgrow ( rntm, cntl, bli_cntl_sub_node( cntl ), thread ); // Attach the child thrinfo_t node for the primary branch to its // parent structure. bli_thrinfo_set_sub_node( thread_child, thread ); } } } // ----------------------------------------------------------------------------- thrinfo_t* bli_thrinfo_rgrow ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_cur, thrinfo_t* thread_par ) { thrinfo_t* thread_cur; // We must handle two cases: those where the next node in the // control tree is a partitioning node, and those where it is // a non-partitioning (ie: packing) node. if ( bli_cntl_bszid( cntl_cur ) != BLIS_NO_PART ) { // Create the child thrinfo_t node corresponding to cntl_cur, // with cntl_par being the parent. thread_cur = bli_thrinfo_create_for_cntl ( rntm, cntl_par, cntl_cur, thread_par ); } else // if ( bli_cntl_bszid( cntl_cur ) == BLIS_NO_PART ) { // Recursively grow the thread structure and return the top-most // thrinfo_t node of that segment. thrinfo_t* thread_seg = bli_thrinfo_rgrow ( rntm, cntl_par, bli_cntl_sub_node( cntl_cur ), thread_par ); // Create a thrinfo_t node corresponding to cntl_cur. Since the // corresponding cntl node, cntl_cur, is a non-partitioning node // (bszid = BLIS_NO_PART), this means it's a packing node. Packing // thrinfo_t nodes are formed differently than those corresponding to // partitioning nodes; specifically, their work_id's are set equal to // the their comm_id's. Also, notice that the free_comm field is set // to FALSE since cntl_cur is a non-partitioning node. The reason: // the communicator used here will be freed when thread_seg, or one // of its descendents, is freed. 
thread_cur = bli_thrinfo_create ( rntm, // rntm bli_thrinfo_ocomm( thread_seg ), // ocomm bli_thread_ocomm_id( thread_seg ), // ocomm_id bli_cntl_calc_num_threads_in( rntm, cntl_cur ), // n_way bli_thread_ocomm_id( thread_seg ), // work_id FALSE, // free_comm BLIS_NO_PART, // bszid thread_seg // sub_node ); } return thread_cur; } #define BLIS_NUM_STATIC_COMMS 80 thrinfo_t* bli_thrinfo_create_for_cntl ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_chl, thrinfo_t* thread_par ) { // If we are running with a single thread, all of the code can be reduced // and simplified to this. if ( bli_rntm_calc_num_threads( rntm ) == 1 ) { thrinfo_t* thread_chl = bli_thrinfo_create ( rntm, // rntm &BLIS_SINGLE_COMM, // ocomm 0, // ocomm_id 1, // n_way 0, // work_id FALSE, // free_comm BLIS_NO_PART, // bszid NULL // sub_node ); return thread_chl; } thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; thrcomm_t** new_comms = NULL; const bszid_t bszid_chl = bli_cntl_bszid( cntl_chl ); const dim_t parent_nt_in = bli_thread_num_threads( thread_par ); const dim_t parent_n_way = bli_thread_n_way( thread_par ); const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); const dim_t parent_work_id = bli_thread_work_id( thread_par ); // Sanity check: make sure the number of threads in the parent's // communicator is divisible by the number of new sub-groups. if ( parent_nt_in % parent_n_way != 0 ) { printf( "Assertion failed: parent_nt_in parent_n_way != 0\n" ); bli_abort(); } // Compute: // - the number of threads inside the new child comm, // - the current thread's id within the new communicator, // - the current thread's work id, given the ways of parallelism // to be obtained within the next loop. 
const dim_t child_nt_in = bli_cntl_calc_num_threads_in( rntm, cntl_chl ); const dim_t child_n_way = bli_rntm_ways_for( bszid_chl, rntm ); const dim_t child_comm_id = parent_comm_id % child_nt_in; const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); //printf( "thread %d: child_n_way = %d child_nt_in = %d parent_n_way = %d (bszid = %d->%d)\n", (int)child_comm_id, (int)child_nt_in, (int)child_n_way, (int)parent_n_way, (int)bli_cntl_bszid( cntl_par ), (int)bszid_chl ); // The parent's chief thread creates a temporary array of thrcomm_t // pointers. if ( bli_thread_am_ochief( thread_par ) ) { err_t r_val; if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val ); else new_comms = static_comms; } // Broadcast the temporary array to all threads in the parent's // communicator. new_comms = bli_thread_broadcast( thread_par, new_comms ); // Chiefs in the child communicator allocate the communicator // object and store it in the array element corresponding to the // parent's work id. if ( child_comm_id == 0 ) new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in ); bli_thread_barrier( thread_par ); // All threads create a new thrinfo_t node using the communicator // that was created by their chief, as identified by parent_work_id. thrinfo_t* thread_chl = bli_thrinfo_create ( rntm, // rntm new_comms[ parent_work_id ], // ocomm child_comm_id, // ocomm_id child_n_way, // n_way child_work_id, // work_id TRUE, // free_comm bszid_chl, // bszid NULL // sub_node ); bli_thread_barrier( thread_par ); // The parent's chief thread frees the temporary array of thrcomm_t // pointers. 
if ( bli_thread_am_ochief( thread_par ) ) { if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) bli_free_intl( new_comms ); } return thread_chl; } // ----------------------------------------------------------------------------- thrinfo_t* bli_thrinfo_rgrow_prenode ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_cur, thrinfo_t* thread_par ) { thrinfo_t* thread_cur; // We must handle two cases: those where the next node in the // control tree is a partitioning node, and those where it is // a non-partitioning (ie: packing) node. if ( bli_cntl_bszid( cntl_cur ) != BLIS_NO_PART ) { // Create the child thrinfo_t node corresponding to cntl_cur, // with cntl_par being the parent. thread_cur = bli_thrinfo_create_for_cntl_prenode ( rntm, cntl_par, cntl_cur, thread_par ); } else // if ( bli_cntl_bszid( cntl_cur ) == BLIS_NO_PART ) { // Recursively grow the thread structure and return the top-most // thrinfo_t node of that segment. thrinfo_t* thread_seg = bli_thrinfo_rgrow_prenode ( rntm, cntl_par, bli_cntl_sub_node( cntl_cur ), thread_par ); // Create a thrinfo_t node corresponding to cntl_cur. Since the // corresponding cntl node, cntl_cur, is a non-partitioning node // (bszid = BLIS_NO_PART), this means it's a packing node. Packing // thrinfo_t nodes are formed differently than those corresponding to // partitioning nodes; specifically, their work_id's are set equal to // the their comm_id's. Also, notice that the free_comm field is set // to FALSE since cntl_cur is a non-partitioning node. The reason: // the communicator used here will be freed when thread_seg, or one // of its descendents, is freed. 
thread_cur = bli_thrinfo_create ( rntm, // rntm bli_thrinfo_ocomm( thread_seg ), // ocomm bli_thread_ocomm_id( thread_seg ), // ocomm_id bli_cntl_calc_num_threads_in( rntm, cntl_par ), // n_way bli_thread_ocomm_id( thread_seg ), // work_id FALSE, // free_comm BLIS_NO_PART, // bszid thread_seg // sub_node ); } return thread_cur; } thrinfo_t* bli_thrinfo_create_for_cntl_prenode ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_chl, thrinfo_t* thread_par ) { // NOTE: This function only has to work for the ic -> (pa -> jr) // thrinfo_t tree branch extension. After that, the function // bli_thrinfo_create_for_cntl() will be called for the last jr->ir // branch extension. const bszid_t bszid_chl = bli_cntl_bszid( cntl_chl ); const dim_t parent_nt_in = bli_thread_num_threads( thread_par ); const dim_t parent_n_way = bli_thread_n_way( thread_par ); const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); //const dim_t parent_work_id = bli_thread_work_id( thread_par ); // Sanity check: make sure the number of threads in the parent's // communicator is divisible by the number of new sub-groups. if ( parent_nt_in % parent_n_way != 0 ) { printf( "Assertion failed: parent_nt_in (%d) parent_n_way (%d) != 0\n", ( int )parent_nt_in, ( int )parent_n_way ); bli_abort(); } //dim_t child_nt_in = bli_cntl_calc_num_threads_in( rntm, cntl_chl ); //dim_t child_n_way = bli_rntm_ways_for( bszid_chl, rntm ); const dim_t child_nt_in = parent_nt_in; const dim_t child_n_way = parent_nt_in; const dim_t child_comm_id = parent_comm_id % child_nt_in; const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); bli_thread_barrier( thread_par ); // NOTE: Recall that parent_comm_id == child_comm_id, so checking for the // parent's chief-ness is equivalent to checking for chief-ness in the new // about-to-be-created communicator group. 
thrcomm_t* new_comm = NULL; if ( bli_thread_am_ochief( thread_par ) ) new_comm = bli_thrcomm_create( rntm, child_nt_in ); // Broadcast the new thrcomm_t address to the other threads in the // parent's group. new_comm = bli_thread_broadcast( thread_par, new_comm ); // All threads create a new thrinfo_t node using the communicator // that was created by their chief, as identified by parent_work_id. thrinfo_t* thread_chl = bli_thrinfo_create ( rntm, // rntm new_comm, // ocomm child_comm_id, // ocomm_id child_n_way, // n_way child_work_id, // work_id TRUE, // free_comm bszid_chl, // bszid NULL // sub_node ); bli_thread_barrier( thread_par ); return thread_chl; } // ----------------------------------------------------------------------------- #if 0 void bli_thrinfo_grow_tree ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { cntl_t* cntl_jc = cntl; thrinfo_t* thrinfo_jc = thread; bli_thrinfo_grow( rntm, cntl_jc, thrinfo_jc ); // inside jc loop: cntl_t* cntl_pc = bli_cntl_sub_node( cntl_jc ); thrinfo_t* thrinfo_pc = bli_thrinfo_sub_node( thrinfo_jc ); bli_thrinfo_grow( rntm, cntl_pc, thrinfo_pc ); // inside pc loop: cntl_t* cntl_pb = bli_cntl_sub_node( cntl_pc ); thrinfo_t* thrinfo_pb = bli_thrinfo_sub_node( thrinfo_pc ); bli_thrinfo_grow( rntm, cntl_pb, thrinfo_pb ); // after pb packing: cntl_t* cntl_ic = bli_cntl_sub_node( cntl_pb ); thrinfo_t* thrinfo_ic = bli_thrinfo_sub_node( thrinfo_pb ); bli_thrinfo_grow( rntm, cntl_ic, thrinfo_ic ); // -- main branch -- // inside ic loop: cntl_t* cntl_pa = bli_cntl_sub_node( cntl_ic ); thrinfo_t* thrinfo_pa = bli_thrinfo_sub_node( thrinfo_ic ); bli_thrinfo_grow( rntm, cntl_pa, thrinfo_pa ); // after pa packing: cntl_t* cntl_jr = bli_cntl_sub_node( cntl_pa ); thrinfo_t* thrinfo_jr = bli_thrinfo_sub_node( thrinfo_pa ); bli_thrinfo_grow( rntm, cntl_jr, thrinfo_jr ); // inside jr loop: //cntl_t* cntl_ir = bli_cntl_sub_node( cntl_jr ); //thrinfo_t* thrinfo_ir = bli_thrinfo_sub_node( thrinfo_jr ); // -- trsm branch -- // inside ic 
loop: cntl_t* cntl_pa0 = bli_cntl_sub_prenode( cntl_ic ); thrinfo_t* thrinfo_pa0 = bli_thrinfo_sub_prenode( thrinfo_ic ); bli_thrinfo_grow( rntm, cntl_pa0, thrinfo_pa0 ); // after pa packing: cntl_t* cntl_jr0 = bli_cntl_sub_node( cntl_pa0 ); thrinfo_t* thrinfo_jr0 = bli_thrinfo_sub_node( thrinfo_pa0 ); bli_thrinfo_grow( rntm, cntl_jr0, thrinfo_jr0 ); // inside jr loop: //cntl_t* cntl_ir0 = bli_cntl_sub_node( cntl_jr0 ); //thrinfo_t* thrinfo_ir0= bli_thrinfo_sub_node( thrinfo_jr0 ); } void bli_thrinfo_grow_tree_ic ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ) { cntl_t* cntl_ic = cntl; thrinfo_t* thrinfo_ic = thread; bli_thrinfo_grow( rntm, cntl_ic, thrinfo_ic ); // -- main branch -- // inside ic loop: cntl_t* cntl_pa = bli_cntl_sub_node( cntl_ic ); thrinfo_t* thrinfo_pa = bli_thrinfo_sub_node( thrinfo_ic ); bli_thrinfo_grow( rntm, cntl_pa, thrinfo_pa ); // after pa packing: cntl_t* cntl_jr = bli_cntl_sub_node( cntl_pa ); thrinfo_t* thrinfo_jr = bli_thrinfo_sub_node( thrinfo_pa ); bli_thrinfo_grow( rntm, cntl_jr, thrinfo_jr ); // inside jr loop: //cntl_t* cntl_ir = bli_cntl_sub_node( cntl_jr ); //thrinfo_t* thrinfo_ir = bli_thrinfo_sub_node( thrinfo_jr ); // -- trsm branch -- // inside ic loop: cntl_t* cntl_pa0 = bli_cntl_sub_prenode( cntl_ic ); thrinfo_t* thrinfo_pa0 = bli_thrinfo_sub_prenode( thrinfo_ic ); bli_thrinfo_grow( rntm, cntl_pa0, thrinfo_pa0 ); // after pa packing: cntl_t* cntl_jr0 = bli_cntl_sub_node( cntl_pa0 ); thrinfo_t* thrinfo_jr0 = bli_thrinfo_sub_node( thrinfo_pa0 ); bli_thrinfo_grow( rntm, cntl_jr0, thrinfo_jr0 ); // inside jr loop: //cntl_t* cntl_ir0 = bli_cntl_sub_node( cntl_jr0 ); //thrinfo_t* thrinfo_ir0= bli_thrinfo_sub_node( thrinfo_jr0 ); } #endif cython-blis-0.9.1/blis/_src/frame/thread/bli_thrinfo.h000066400000000000000000000152361427272030600226560ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_THRINFO_H #define BLIS_THRINFO_H // Thread info structure definition struct thrinfo_s { // The thread communicator for the other threads sharing the same work // at this level. thrcomm_t* ocomm; // Our thread id within the ocomm thread communicator. dim_t ocomm_id; // The number of distinct threads used to parallelize the loop. dim_t n_way; // What we're working on. 
dim_t work_id; // When freeing, should the communicators in this node be freed? Usually, // this is field is true, but when nodes are created that share the same // communicators as other nodes (such as with packm nodes), this is set // to false. bool free_comm; // The bszid_t to help identify the node. This is mostly only useful when // debugging or tracing the allocation and release of thrinfo_t nodes. bszid_t bszid; struct thrinfo_s* sub_prenode; struct thrinfo_s* sub_node; }; typedef struct thrinfo_s thrinfo_t; // // thrinfo_t functions // NOTE: The naming of these should be made consistent at some point. // (ie: bli_thrinfo_ vs. bli_thread_) // // thrinfo_t query (field only) BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) { return (t->ocomm)->n_threads; } BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) { return t->ocomm_id; } BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) { return t->n_way; } BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t ) { return t->work_id; } BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) { return t->ocomm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) { return t->free_comm; } BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) { return t->bszid; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) { return t->sub_node; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) { return t->ocomm_id == 0; } // thrinfo_t modification BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) { t->ocomm = ocomm; } BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) { t->ocomm_id = ocomm_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) { t->n_way = n_way; } BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t ) { t->work_id = work_id; } BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, 
thrinfo_t* t ) { t->free_comm = free_comm; } BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) { t->bszid = bszid; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) { t->sub_node = sub_node; } BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) { t->sub_prenode = sub_prenode; } // other thrinfo_t-related functions BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } // // Prototypes for level-3 thrinfo functions not specific to any operation. // thrinfo_t* bli_thrinfo_create ( rntm_t* rntm, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node ); void bli_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node ); void bli_thrinfo_init_single ( thrinfo_t* thread ); void bli_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_thrinfo_grow ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); thrinfo_t* bli_thrinfo_rgrow ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_cur, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_create_for_cntl ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_chl, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_rgrow_prenode ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_cur, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_create_for_cntl_prenode ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_chl, thrinfo_t* thread_par ); // ----------------------------------------------------------------------------- #if 0 void bli_thrinfo_grow_tree ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); void bli_thrinfo_grow_tree_ic ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); 
#endif #endif cython-blis-0.9.1/blis/_src/frame/thread/bli_thrinfo_sup.c000066400000000000000000000242351427272030600235370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_thrinfo_sup_grow ( rntm_t* rntm, bszid_t* bszid_par, thrinfo_t* thread ) { if ( thread == &BLIS_GEMM_SINGLE_THREADED || thread == &BLIS_PACKM_SINGLE_THREADED ) return; // NOTE: If bli_thrinfo_sup_rgrow() is being called, the sub_node field will // always be non-NULL, and so there's no need to check it. //if ( bli_cntl_sub_node( cntl ) != NULL ) { // We only need to take action if the thrinfo_t sub-node is NULL; if it // is non-NULL, then it has already been created and we'll use it as-is. if ( bli_thrinfo_sub_node( thread ) == NULL ) { // Create a new node (or, if needed, multiple nodes) along the // main sub-node branch of the tree and return the pointer to the // (highest) child. thrinfo_t* thread_child = bli_thrinfo_sup_rgrow ( rntm, bszid_par, &bszid_par[1], thread ); // Attach the child thrinfo_t node for the primary branch to its // parent structure. bli_thrinfo_set_sub_node( thread_child, thread ); } } } // ----------------------------------------------------------------------------- thrinfo_t* bli_thrinfo_sup_rgrow ( rntm_t* rntm, bszid_t* bszid_par, bszid_t* bszid_cur, thrinfo_t* thread_par ) { thrinfo_t* thread_cur; // We must handle two cases: those where the next node in the // control tree is a partitioning node, and those where it is // a non-partitioning (ie: packing) node. if ( *bszid_cur != BLIS_NO_PART ) { // Create the child thrinfo_t node corresponding to cntl_cur, // with cntl_par being the parent. thread_cur = bli_thrinfo_sup_create_for_cntl ( rntm, bszid_par, bszid_cur, thread_par ); } else // if ( *bszid_cur == BLIS_NO_PART ) { // Recursively grow the thread structure and return the top-most // thrinfo_t node of that segment. thrinfo_t* thread_seg = bli_thrinfo_sup_rgrow ( rntm, bszid_par, &bszid_cur[1], thread_par ); // Create a thrinfo_t node corresponding to cntl_cur. Since the // corresponding cntl node, cntl_cur, is a non-partitioning node // (bszid = BLIS_NO_PART), this means it's a packing node. 
Packing // thrinfo_t nodes are formed differently than those corresponding to // partitioning nodes; specifically, their work_id's are set equal to // the their comm_id's. Also, notice that the free_comm field is set // to FALSE since cntl_cur is a non-partitioning node. The reason: // the communicator used here will be freed when thread_seg, or one // of its descendents, is freed. thread_cur = bli_thrinfo_create ( rntm, // rntm bli_thrinfo_ocomm( thread_seg ), // ocomm bli_thread_ocomm_id( thread_seg ), // ocomm_id bli_rntm_calc_num_threads_in( bszid_cur, rntm ), // n_way bli_thread_ocomm_id( thread_seg ), // work_id FALSE, // free_comm BLIS_NO_PART, // bszid thread_seg // sub_node ); } return thread_cur; } #define BLIS_NUM_STATIC_COMMS 80 thrinfo_t* bli_thrinfo_sup_create_for_cntl ( rntm_t* rntm, bszid_t* bszid_par, bszid_t* bszid_chl, thrinfo_t* thread_par ) { // If we are running with a single thread, all of the code can be reduced // and simplified to this. if ( bli_rntm_calc_num_threads( rntm ) == 1 ) { thrinfo_t* thread_chl = bli_thrinfo_create ( rntm, // rntm &BLIS_SINGLE_COMM, // ocomm 0, // ocomm_id 1, // n_way 0, // work_id FALSE, // free_comm BLIS_NO_PART, // bszid NULL // sub_node ); return thread_chl; } // The remainder of this function handles the cases involving the use of // multiple BLIS threads. if ( bli_rntm_pack_a( rntm ) == FALSE && bli_rntm_pack_b( rntm ) == FALSE ) { // If we are packing neither A nor B, there are no broadcasts or barriers // needed to synchronize threads (since all threads can work completely // independently). In this special case situation, the thrinfo_t can be // created with much simpler logic. const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); // Compute: // - the number of threads inside the new child comm, // - the current thread's id within the new communicator, // - the current thread's work id, given the ways of parallelism // to be obtained within the next loop. 
const dim_t child_nt_in = bli_rntm_calc_num_threads_in( bszid_chl, rntm ); const dim_t child_n_way = bli_rntm_ways_for( *bszid_chl, rntm ); const dim_t child_comm_id = parent_comm_id % child_nt_in; const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); // All threads create a new thrinfo_t node using the communicator // that was created by their chief, as identified by parent_work_id. thrinfo_t* thread_chl = bli_thrinfo_create ( rntm, // rntm NULL, // ocomm child_comm_id, // ocomm_id child_n_way, // n_way child_work_id, // work_id TRUE, // free_comm *bszid_chl, // bszid NULL // sub_node ); return thread_chl; } else { // If we are packing at least one of A or B, then we use the general // approach that employs broadcasts and barriers. thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; thrcomm_t** new_comms = NULL; const dim_t parent_nt_in = bli_thread_num_threads( thread_par ); const dim_t parent_n_way = bli_thread_n_way( thread_par ); const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); const dim_t parent_work_id = bli_thread_work_id( thread_par ); // Sanity check: make sure the number of threads in the parent's // communicator is divisible by the number of new sub-groups. if ( parent_nt_in % parent_n_way != 0 ) { printf( "Assertion failed: parent_nt_in parent_n_way != 0\n" ); bli_abort(); } // Compute: // - the number of threads inside the new child comm, // - the current thread's id within the new communicator, // - the current thread's work id, given the ways of parallelism // to be obtained within the next loop. 
const dim_t child_nt_in = bli_rntm_calc_num_threads_in( bszid_chl, rntm ); const dim_t child_n_way = bli_rntm_ways_for( *bszid_chl, rntm ); const dim_t child_comm_id = parent_comm_id % child_nt_in; const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); //printf( "thread %d: child_n_way = %d child_nt_in = %d parent_n_way = %d (bszid = %d->%d)\n", (int)child_comm_id, (int)child_nt_in, (int)child_n_way, (int)parent_n_way, (int)bli_cntl_bszid( cntl_par ), (int)bszid_chl ); // The parent's chief thread creates a temporary array of thrcomm_t // pointers. if ( bli_thread_am_ochief( thread_par ) ) { err_t r_val; if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val ); else new_comms = static_comms; } // Broadcast the temporary array to all threads in the parent's // communicator. new_comms = bli_thread_broadcast( thread_par, new_comms ); // Chiefs in the child communicator allocate the communicator // object and store it in the array element corresponding to the // parent's work id. if ( child_comm_id == 0 ) new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in ); bli_thread_barrier( thread_par ); // All threads create a new thrinfo_t node using the communicator // that was created by their chief, as identified by parent_work_id. thrinfo_t* thread_chl = bli_thrinfo_create ( rntm, // rntm new_comms[ parent_work_id ], // ocomm child_comm_id, // ocomm_id child_n_way, // n_way child_work_id, // work_id TRUE, // free_comm *bszid_chl, // bszid NULL // sub_node ); bli_thread_barrier( thread_par ); // The parent's chief thread frees the temporary array of thrcomm_t // pointers. 
if ( bli_thread_am_ochief( thread_par ) ) { if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) bli_free_intl( new_comms ); } return thread_chl; } } cython-blis-0.9.1/blis/_src/frame/thread/bli_thrinfo_sup.h000066400000000000000000000043741427272030600235460ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef BLIS_THRINFO_SUP_H #define BLIS_THRINFO_SUP_H // // Prototypes for level-3 thrinfo sup functions. // void bli_thrinfo_sup_grow ( rntm_t* rntm, bszid_t* bszid_par, thrinfo_t* thread ); thrinfo_t* bli_thrinfo_sup_rgrow ( rntm_t* rntm, bszid_t* bszid_par, bszid_t* bszid_cur, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_sup_create_for_cntl ( rntm_t* rntm, bszid_t* bszid_par, bszid_t* bszid_chl, thrinfo_t* thread_par ); #endif cython-blis-0.9.1/blis/_src/frame/thread/old/000077500000000000000000000000001427272030600207555ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/frame/thread/old/bli_mutex.h000066400000000000000000000040061427272030600231160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef BLIS_MUTEX_H #define BLIS_MUTEX_H // Include definitions (mostly mtx_t) specific to the method of // multithreading. #include "bli_mutex_single.h" #include "bli_mutex_openmp.h" #include "bli_mutex_pthreads.h" // Thread mutex prototypes. #endif cython-blis-0.9.1/blis/_src/frame/thread/old/bli_mutex_openmp.h000066400000000000000000000045651427272030600245060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2016, Hewlett Packard Enterprise Development LP Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
#ifndef BLIS_MUTEX_OPENMP_H
#define BLIS_MUTEX_OPENMP_H

// Define mtx_t for situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

// Declares omp_lock_t and the omp_*_lock() routines used below.
#include <omp.h>

// Define mtx_t as a wrapper around a single OpenMP lock.
typedef struct mtx_s
{
	omp_lock_t mutex;
} mtx_t;

// Define functions to operate on OpenMP-based mtx_t. Each one forwards to
// the corresponding OpenMP lock routine on m->mutex; m must be non-NULL
// since it is dereferenced unconditionally.

// Initialize the lock held by m.
static void bli_mutex_init( mtx_t* m )
{
	omp_init_lock( &(m->mutex) );
}

// Destroy the lock held by m.
static void bli_mutex_finalize( mtx_t* m )
{
	omp_destroy_lock( &(m->mutex) );
}

// Acquire the lock held by m.
static void bli_mutex_lock( mtx_t* m )
{
	omp_set_lock( &(m->mutex) );
}

// Release the lock held by m.
static void bli_mutex_unlock( mtx_t* m )
{
	omp_unset_lock( &(m->mutex) );
}

#endif

#endif
#ifndef BLIS_MUTEX_PTHREADS_H
#define BLIS_MUTEX_PTHREADS_H

// Define mtx_t for situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Declares pthread_mutex_t and the pthread_mutex_*() routines used below.
#include <pthread.h>

// Define mtx_t as a wrapper around a single pthread mutex.
typedef struct mtx_s
{
	pthread_mutex_t mutex;
} mtx_t;

// Define functions to operate on pthread-based mtx_t. Each one forwards to
// the corresponding pthread_mutex_*() routine on m->mutex and discards its
// return value; m must be non-NULL since it is dereferenced unconditionally.

// Initialize the mutex held by m with default attributes (NULL attr).
static void bli_mutex_init( mtx_t* m )
{
	pthread_mutex_init( &(m->mutex), NULL );
}

// Destroy the mutex held by m.
static void bli_mutex_finalize( mtx_t* m )
{
	pthread_mutex_destroy( &(m->mutex) );
}

// Acquire the mutex held by m.
static void bli_mutex_lock( mtx_t* m )
{
	pthread_mutex_lock( &(m->mutex) );
}

// Release the mutex held by m.
static void bli_mutex_unlock( mtx_t* m )
{
	pthread_mutex_unlock( &(m->mutex) );
}

#endif

#endif
#ifndef BLIS_MUTEX_SINGLE_H
#define BLIS_MUTEX_SINGLE_H

// Single-threaded fallback: only compiled when multithreading is disabled.
#ifndef BLIS_ENABLE_MULTITHREADING

// With no threads there is nothing to protect, so mtx_t carries no state.
typedef struct mtx_s
{
} mtx_t;

// No-op counterparts of the mutex functions: each accepts the mtx_t and
// does nothing with it.

static void bli_mutex_init( mtx_t* m )
{
	(void)m;
}

static void bli_mutex_finalize( mtx_t* m )
{
	(void)m;
}

static void bli_mutex_lock( mtx_t* m )
{
	(void)m;
}

static void bli_mutex_unlock( mtx_t* m )
{
	(void)m;
}

#endif

#endif
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "bli_util_check.h" // Prototype object APIs (expert and non-expert). #include "bli_oapi_ex.h" #include "bli_util_oapi.h" #include "bli_xapi_undef.h" #include "bli_oapi_ba.h" #include "bli_util_oapi.h" #include "bli_xapi_undef.h" // Prototype typed APIs (expert and non-expert). #include "bli_tapi_ex.h" #include "bli_util_tapi.h" #include "bli_util_ft.h" #include "bli_xapi_undef.h" #include "bli_tapi_ba.h" #include "bli_util_tapi.h" #include "bli_util_ft.h" #include "bli_xapi_undef.h" // Generate function pointer arrays for tapi functions (expert only). #include "bli_util_fpa.h" // Prototype level-1m implementations. #include "bli_util_unb_var1.h" cython-blis-0.9.1/blis/_src/frame/util/bli_util_check.c000066400000000000000000000215541427272030600230200ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define object-based check functions. 
// #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* asum \ ) \ { \ bli_utilv_xa_check( x, asum ); \ } GENFRONT( asumv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ) \ { \ bli_utilm_mkhst_check( x ); \ } GENFRONT( mkherm ) GENFRONT( mksymm ) GENFRONT( mktrim ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* norm \ ) \ { \ bli_utilv_norm_check( x, norm ); \ } GENFRONT( norm1v ) GENFRONT( normfv ) GENFRONT( normiv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* norm \ ) \ { \ bli_utilm_norm_check( x, norm ); \ } GENFRONT( norm1m ) GENFRONT( normfm ) GENFRONT( normim ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ) \ { \ bli_utilm_rand_check( x ); \ } GENFRONT( randv ) GENFRONT( randnv ) GENFRONT( randm ) GENFRONT( randnm ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ ) \ { \ bli_utilv_sumsqv_check( x, scale, sumsq ); \ } GENFRONT( sumsqv ) // ----------------------------------------------------------------------------- #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi, \ bool* is_eq \ ) \ { \ bli_l0_xxbsc_check( chi, psi, is_eq ); \ } GENFRONT( eqsc ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ bool* is_eq \ ) \ { \ bli_l1v_xy_check( x, y ); \ } GENFRONT( eqv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ bool* is_eq \ ) \ { \ bli_l1m_xy_check( x, y ); \ } GENFRONT( eqm ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ FILE* file, \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ) \ { \ bli_utilm_fprint_check( file, s1, x, 
format, s2 ); \ } GENFRONT( fprintv ) GENFRONT( fprintm ) // ----------------------------------------------------------------------------- void bli_utilv_xa_check ( obj_t* x, obj_t* asum ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( asum ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( asum ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( asum ); bli_check_error_code( e_val ); } void bli_utilm_mkhst_check ( obj_t* a ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( a ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( a ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_matrix_object( a ); bli_check_error_code( e_val ); e_val = bli_check_square_object( a ); bli_check_error_code( e_val ); e_val = bli_check_object_diag_offset_equals( a, 0 ); bli_check_error_code( e_val ); // Check matrix storage. e_val = bli_check_upper_or_lower_object( a ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( a ); bli_check_error_code( e_val ); } void bli_utilv_norm_check ( obj_t* x, obj_t* norm ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_noninteger_object( norm ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( norm ); bli_check_error_code( e_val ); e_val = bli_check_object_real_proj_of( x, norm ); bli_check_error_code( e_val ); // Check object dimensions. 
e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( norm ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( norm ); bli_check_error_code( e_val ); } void bli_utilm_norm_check ( obj_t* x, obj_t* norm ) { err_t e_val; // Check object datatypes. e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_noninteger_object( norm ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( norm ); bli_check_error_code( e_val ); e_val = bli_check_object_real_proj_of( x, norm ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_matrix_object( x ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( norm ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( norm ); bli_check_error_code( e_val ); } void bli_utilm_fprint_check ( FILE* file, char* s1, obj_t* x, char* format, char* s2 ) { err_t e_val; // Check argument pointers. e_val = bli_check_null_pointer( file ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( s1 ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( s2 ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); } void bli_utilm_rand_check ( obj_t* x ) { err_t e_val; // Check object datatypes. e_val = bli_check_noninteger_object( x ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( x ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); } void bli_utilv_sumsqv_check ( obj_t* x, obj_t* scale, obj_t* sumsq ) { err_t e_val; // Check object datatypes. 
e_val = bli_check_floating_object( x ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( scale ); bli_check_error_code( e_val ); e_val = bli_check_nonconstant_object( sumsq ); bli_check_error_code( e_val ); // Check object dimensions. e_val = bli_check_vector_object( x ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( scale ); bli_check_error_code( e_val ); e_val = bli_check_scalar_object( sumsq ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( scale ); bli_check_error_code( e_val ); e_val = bli_check_object_buffer( sumsq ); bli_check_error_code( e_val ); } cython-blis-0.9.1/blis/_src/frame/util/bli_util_check.h000066400000000000000000000102501427272030600230140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based check functions. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* asum \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* norm \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* norm \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENPROT( randv ) GENPROT( randnv ) GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi, \ bool* is_eq \ ); GENTPROT( eqsc ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ bool* is_eq \ ); GENPROT( eqv ) GENPROT( eqm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ FILE* file, \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ); 
GENPROT( fprintv ) GENPROT( fprintm ) // ----------------------------------------------------------------------------- void bli_utilv_xi_check ( obj_t* x, obj_t* index ); void bli_utilv_xa_check ( obj_t* x, obj_t* asum ); void bli_utilm_mkhst_check ( obj_t* a ); void bli_utilv_norm_check ( obj_t* x, obj_t* norm ); void bli_utilm_norm_check ( obj_t* x, obj_t* norm ); void bli_utilm_fprint_check ( FILE* file, char* s1, obj_t* x, char* format, char* s2 ); void bli_utilm_rand_check ( obj_t* x ); void bli_utilv_sumsqv_check ( obj_t* x, obj_t* scale, obj_t* sumsq ); cython-blis-0.9.1/blis/_src/frame/util/bli_util_fpa.c000066400000000000000000000054711427272030600225110ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define function pointer query interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ GENARRAY_FPA( PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft), \ PASTECH(opname,BLIS_TAPI_EX_SUF) ); \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ) \ { \ return PASTECH2(opname,BLIS_TAPI_EX_SUF,_fpa)[ dt ]; \ } GENFRONT( asumv ) GENFRONT( mkherm ) GENFRONT( mksymm ) GENFRONT( mktrim ) GENFRONT( norm1v ) GENFRONT( normfv ) GENFRONT( normiv ) GENFRONT( norm1m ) GENFRONT( normfm ) GENFRONT( normim ) GENFRONT( randv ) GENFRONT( randnv ) GENFRONT( randm ) GENFRONT( randnm ) GENFRONT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with only basic interfaces. #undef GENFRONT #define GENFRONT( opname ) \ \ /* GENARRAY_FPA( void_fp, opname ); \ */ \ \ GENARRAY_FPA( PASTECH(opname,_vft), \ PASTECH0(opname) ); \ \ PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ) \ { \ return PASTECH(opname,_fpa)[ dt ]; \ } GENFRONT( eqsc ) GENFRONT( eqv ) GENFRONT( eqm ) GENFRONT( fprintv ) GENFRONT( fprintm ) //GENFRONT( printv ) //GENFRONT( printm ) cython-blis-0.9.1/blis/_src/frame/util/bli_util_fpa.h000066400000000000000000000046171427272030600225170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype function pointer query interface. 
// #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( asumv ) GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) GENPROT( randv ) GENPROT( randnv ) GENPROT( randm ) GENPROT( randnm ) GENPROT( sumsqv ) // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( eqsc ) GENPROT( eqv ) GENPROT( eqm ) GENPROT( fprintv ) GENPROT( fprintm ) //GENPROT( printv ) //GENPROT( printm ) cython-blis-0.9.1/blis/_src/frame/util/bli_util_ft.h000066400000000000000000000135251427272030600223600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // -- Utility function types --------------------------------------------------- // // asumv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( asumv ) // mkherm, mksymm, mktrim #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( mkherm ) INSERT_GENTDEF( mksymm ) INSERT_GENTDEF( mktrim ) // norm1v, normfv, normiv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( norm1v ) INSERT_GENTDEFR( normfv ) INSERT_GENTDEFR( normiv ) // norm1m, normfm, normim #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( norm1m ) INSERT_GENTDEFR( normfm ) INSERT_GENTDEFR( normim ) // fprintv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ FILE* file, \ char* s1, 
\ dim_t n, \ ctype* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTDEF( fprintv ) // fprintm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ FILE* file, \ char* s1, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTDEF( fprintm ) // randv, randnv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( randv ) INSERT_GENTDEF( randnv ) // randm, randnm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( randm ) INSERT_GENTDEF( randnm ) // sumsqv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( sumsqv ) // ----------------------------------------------------------------------------- // Operations with only basic interfaces. 
#ifdef BLIS_TAPI_BASIC // eqsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi, \ bool* is_eq \ ); INSERT_GENTDEF( eqsc ) // eqv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ bool* is_eq \ ); INSERT_GENTDEF( eqv ) // eqm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ bool* is_eq \ ); INSERT_GENTDEF( eqm ) #endif // #ifdef BLIS_TAPI_BASIC cython-blis-0.9.1/blis/_src/frame/util/bli_util_oapi.c000066400000000000000000000376661427272030600227060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Guard the function definitions so that they are only compiled when // #included from files that define the object API macros. #ifdef BLIS_ENABLE_OAPI // // Define object-based interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ \ void* buf_asum = bli_obj_buffer_at_off( asum ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x, asum ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ n, \ buf_x, incx, \ buf_asum, \ cntx, \ rntm \ ); \ } GENFRONT( asumv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( a ); \ \ uplo_t uploa = bli_obj_uplo( a ); \ dim_t m = bli_obj_length( a ); \ void* buf_a = bli_obj_buffer_at_off( a ); \ inc_t rs_a = bli_obj_row_stride( a ); \ inc_t cs_a = bli_obj_col_stride( a ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( a ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ uploa, \ m, \ buf_a, rs_a, cs_a, \ cntx, \ rntm \ ); \ } GENFRONT( mkherm ) GENFRONT( mksymm ) GENFRONT( mktrim ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ void* buf_norm = bli_obj_buffer_at_off( norm ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x, norm ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ n, \ buf_x, incx, \ buf_norm, \ cntx, \ rntm \ ); \ } GENFRONT( norm1v ) GENFRONT( normfv ) GENFRONT( normiv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ diag_t diagx = bli_obj_diag( x ); \ uplo_t uplox = bli_obj_uplo( x ); \ dim_t m = bli_obj_length( x ); \ dim_t n = bli_obj_width( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ void* buf_norm = bli_obj_buffer_at_off( norm ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x, norm ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ diagoffx, \ diagx, \ uplox, \ m, \ n, \ buf_x, rs_x, cs_x, \ buf_norm, \ cntx, \ rntm \ ); \ } GENFRONT( norm1m ) GENFRONT( normfm ) GENFRONT( normim ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ n, \ buf_x, incx, \ cntx, \ rntm \ ); \ } GENFRONT( randv ) GENFRONT( randnv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ uplo_t uplox = bli_obj_uplo( x ); \ dim_t m = bli_obj_length( x ); \ dim_t n = bli_obj_width( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ diagoffx, \ uplox, \ m, \ n, \ buf_x, rs_x, cs_x, \ cntx, \ rntm \ ); \ } GENFRONT( randm ) GENFRONT( randnm ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_OAPI_EX_DECLS \ \ num_t dt = bli_obj_dt( x ); \ \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ void* buf_scale = bli_obj_buffer_at_off( scale ); \ void* buf_sumsq = bli_obj_buffer_at_off( sumsq ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x, scale, sumsq ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) f = \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( dt ); \ \ f \ ( \ n, \ buf_x, incx, \ buf_scale, \ buf_sumsq, \ cntx, \ rntm \ ); \ } GENFRONT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with only basic interfaces. #ifdef BLIS_OAPI_BASIC #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi, \ bool* is_eq \ ) \ { \ bli_init_once(); \ \ num_t dt_chi = bli_obj_dt( chi ); \ num_t dt_psi = bli_obj_dt( psi ); \ num_t dt; \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( chi, psi, is_eq ); \ \ /* Decide which datatype will be used to query the buffer from the constant object (if there is one). */ \ if ( bli_is_constant( dt_psi ) ) dt = dt_chi; \ else dt = dt_psi; \ \ /* If chi and psi are both constants, then we compare only the dcomplex fields. */ \ if ( bli_is_constant( dt ) ) dt = BLIS_DCOMPLEX; \ \ void* buf_chi = bli_obj_buffer_for_1x1( dt, chi ); \ void* buf_psi = bli_obj_buffer_for_1x1( dt, psi ); \ \ /* Integer objects are handled separately. */ \ if ( bli_is_int( dt ) ) \ { \ *is_eq = bli_ieqa( buf_chi, buf_psi ); \ return; \ } \ \ /* Query the conj status of each object and use the two to come up with a single "net" conj_t value. */ \ conj_t conjchi = bli_obj_conj_status( chi ); \ conj_t conjpsi = bli_obj_conj_status( psi ); \ conj_t conj = bli_apply_conj( conjchi, conjpsi ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. 
*/ \ PASTECH(opname,_vft) f = \ PASTEMAC(opname,_qfp)( dt ); \ \ f \ ( \ conj, \ buf_chi, \ buf_psi, \ is_eq \ ); \ } GENFRONT( eqsc ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* x, \ obj_t* y, \ bool* is_eq \ ) \ { \ bli_init_once(); \ \ num_t dt = bli_obj_dt( x ); \ \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t inc_x = bli_obj_vector_inc( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t inc_y = bli_obj_vector_inc( y ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x, y, is_eq ); \ \ /* Query the conj status of each object and use the two to come up with a single "net" conj_t value. */ \ conj_t conjx = bli_obj_conj_status( x ); \ conj_t conjy = bli_obj_conj_status( y ); \ conj_t conj = bli_apply_conj( conjx, conjy ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH(opname,_vft) f = \ PASTEMAC(opname,_qfp)( dt ); \ \ f \ ( \ conj, \ n, \ buf_x, inc_x, \ buf_y, inc_y, \ is_eq \ ); \ } GENFRONT( eqv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* x, \ obj_t* y, \ bool* is_eq \ ) \ { \ bli_init_once(); \ \ num_t dt = bli_obj_dt( x ); \ \ doff_t diagoffx = bli_obj_diag_offset( x ); \ diag_t diagx = bli_obj_diag( x ); \ uplo_t uplox = bli_obj_uplo( x ); \ dim_t m = bli_obj_length( y ); \ dim_t n = bli_obj_width( y ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ void* buf_y = bli_obj_buffer_at_off( y ); \ inc_t rs_y = bli_obj_row_stride( y ); \ inc_t cs_y = bli_obj_col_stride( y ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( x, y, is_eq ); \ \ /* Query the combined trans and conj status of each object and use the two to come up with a single "net" trans_t value. 
*/ \ trans_t transx = bli_obj_conjtrans_status( x ); \ trans_t transy = bli_obj_conjtrans_status( y ); \ trans_t trans = bli_apply_trans( transy, transx ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH(opname,_vft) f = \ PASTEMAC(opname,_qfp)( dt ); \ \ f \ ( \ diagoffx, \ diagx, \ uplox, \ trans, \ m, \ n, \ buf_x, rs_x, cs_x, \ buf_y, rs_y, cs_y, \ is_eq \ ); \ } GENFRONT( eqm ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ FILE* file, \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ) \ { \ bli_init_once(); \ \ num_t dt = bli_obj_dt( x ); \ \ dim_t n = bli_obj_vector_dim( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t incx = bli_obj_vector_inc( x ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( file, s1, x, format, s2 ); \ \ /* Handle constants up front. */ \ if ( dt == BLIS_CONSTANT ) \ { \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ } \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH(opname,_vft) f = \ PASTEMAC(opname,_qfp)( dt ); \ \ f \ ( \ file, \ s1, \ n, \ buf_x, incx, \ format, \ s2 \ ); \ } GENFRONT( fprintv ) #undef GENFRONT #define GENFRONT( opname ) \ \ void PASTEMAC0(opname) \ ( \ FILE* file, \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ) \ { \ bli_init_once(); \ \ num_t dt = bli_obj_dt( x ); \ \ dim_t m = bli_obj_length( x ); \ dim_t n = bli_obj_width( x ); \ void* buf_x = bli_obj_buffer_at_off( x ); \ inc_t rs_x = bli_obj_row_stride( x ); \ inc_t cs_x = bli_obj_col_stride( x ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( file, s1, x, format, s2 ); \ \ /* Handle constants up front. 
*/ \ if ( dt == BLIS_CONSTANT ) \ { \ float* sp = bli_obj_buffer_for_const( BLIS_FLOAT, x ); \ double* dp = bli_obj_buffer_for_const( BLIS_DOUBLE, x ); \ scomplex* cp = bli_obj_buffer_for_const( BLIS_SCOMPLEX, x ); \ dcomplex* zp = bli_obj_buffer_for_const( BLIS_DCOMPLEX, x ); \ gint_t* ip = bli_obj_buffer_for_const( BLIS_INT, x ); \ \ fprintf( file, "%s\n", s1 ); \ fprintf( file, " float: %9.2e\n", bli_sreal( *sp ) ); \ fprintf( file, " double: %9.2e\n", bli_dreal( *dp ) ); \ fprintf( file, " scomplex: %9.2e + %9.2e\n", bli_creal( *cp ), \ bli_cimag( *cp ) ); \ fprintf( file, " dcomplex: %9.2e + %9.2e\n", bli_zreal( *zp ), \ bli_zimag( *zp ) ); \ fprintf( file, " int: %ld\n", ( long )(*ip) ); \ fprintf( file, "\n" ); \ return; \ } \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ PASTECH(opname,_vft) f = \ PASTEMAC(opname,_qfp)( dt ); \ \ f \ ( \ file, \ s1, \ m, \ n, \ buf_x, rs_x, cs_x, \ format, \ s2 \ ); \ } GENFRONT( fprintm ) #undef GENFRONT #define GENFRONT( opname, varname ) \ \ void PASTEMAC0(opname) \ ( \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ) \ { \ bli_init_once(); \ \ /* Invoke the typed function. */ \ PASTEMAC0(varname) \ ( \ stdout, \ s1, \ x, \ format, \ s2 \ ); \ } GENFRONT( printv, fprintv ) GENFRONT( printm, fprintm ) #endif // #ifdef BLIS_OAPI_BASIC #endif cython-blis-0.9.1/blis/_src/frame/util/bli_util_oapi.h000066400000000000000000000106521427272030600226750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
#ifdef BLIS_OAPI_BASIC /* #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi, \ bool* is_eq \ ); GENPROT( eqsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* x, \ obj_t* y, \ bool* is_eq \ ); GENPROT( eqv ) */ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* x, \ obj_t* y, \ bool* is_eq \ ); GENPROT( eqsc ) GENPROT( eqv ) GENPROT( eqm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ FILE* file, \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ); GENPROT( fprintv ) GENPROT( fprintm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ); GENPROT( printv ) GENPROT( printm ) #endif // #ifdef BLIS_OAPI_BASIC cython-blis-0.9.1/blis/_src/frame/util/bli_util_oapi_ba.c000066400000000000000000000036711427272030600233350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // omitting expert parameters. #include "bli_oapi_ba.h" // Define the macro protecting the object API definitions. #define BLIS_ENABLE_OAPI // Include the object API definitions here. #include "bli_util_oapi.c" cython-blis-0.9.1/blis/_src/frame/util/bli_util_oapi_ex.c000066400000000000000000000036671427272030600233740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // having expert parameters. #include "bli_oapi_ex.h" // Define the macro protecting the object API definitions. #define BLIS_ENABLE_OAPI // Include the object API definitions here. #include "bli_util_oapi.c" cython-blis-0.9.1/blis/_src/frame/util/bli_util_tapi.c000066400000000000000000000275011427272030600226760ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Guard the function definitions so that they are only compiled when // #included from files that define the typed API macros. #ifdef BLIS_ENABLE_TAPI // // Define BLAS-like interfaces with typed operands. // #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ /* If the vector length is zero, set the absolute sum return value to zero and return early. */ \ if ( bli_zero_dim1( n ) ) \ { \ PASTEMAC(chr,set0s)( *asum ); \ return; \ } \ \ /* Obtain a valid context from the gks if necessary. 
*/ \ /*if ( cntx == NULL ) cntx = bli_gks_query_cntx();*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ n, \ x, incx, \ asum, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNCR_BASIC0( asumv ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ /* If either dimension is zero, return early. */ \ if ( bli_zero_dim2( m, m ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ uploa, \ m, \ a, rs_a, cs_a, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNC_BASIC0( mkherm ) INSERT_GENTFUNC_BASIC0( mksymm ) INSERT_GENTFUNC_BASIC0( mktrim ) #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ /* If the vector length is zero, set the norm to zero and return early. */ \ if ( bli_zero_dim1( n ) ) \ { \ PASTEMAC(chr,set0s)( *norm ); \ return; \ } \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. 
*/ \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ n, \ x, incx, \ norm, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNCR_BASIC0( norm1v ) INSERT_GENTFUNCR_BASIC0( normfv ) INSERT_GENTFUNCR_BASIC0( normiv ) #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ /* If either dimension is zero, set the norm to zero and return early. */ \ if ( bli_zero_dim2( m, n ) ) \ { \ PASTEMAC(chr,set0s)( *norm ); \ return; \ } \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ diagoffx, \ diagx, \ uplox, \ m, \ n, \ x, rs_x, cs_x, \ norm, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNCR_BASIC0( norm1m ) INSERT_GENTFUNCR_BASIC0( normfm ) INSERT_GENTFUNCR_BASIC0( normim ) #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ /* If the vector length is zero, return early. */ \ if ( bli_zero_dim1( n ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ /*if ( cntx == NULL ) cntx = bli_gks_query_cntx();*/ \ \ ctype_r norm; \ \ /* Set the norm to zero. */ \ PASTEMAC(chr,set0s)( norm ); \ \ /* Iterate at least once, but continue iterating until the norm is not zero. */ \ while ( PASTEMAC(chr,eq0)( norm ) ) \ { \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ n, \ x, incx, \ cntx, \ rntm \ ); \ \ /* Check the 1-norm of the randomzied vector. 
In the unlikely event that the 1-norm is zero, it means that *all* elements are zero, in which case we want to re-randomize until the 1-norm is not zero. */ \ PASTEMAC2(ch,norm1v,BLIS_TAPI_EX_SUF) \ ( \ n, \ x, incx, \ &norm, \ cntx, \ rntm \ ); \ } \ } INSERT_GENTFUNCR_BASIC0( randv ) INSERT_GENTFUNCR_BASIC0( randnv ) #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ /* If either dimension is zero, return early. */ \ if ( bli_zero_dim2( m, n ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ /*if ( cntx == NULL ) cntx = bli_gks_query_cntx();*/ \ \ ctype_r norm; \ \ /* Set the norm to zero. */ \ PASTEMAC(chr,set0s)( norm ); \ \ /* Iterate at least once, but continue iterating until the norm is not zero. */ \ while ( PASTEMAC(chr,eq0)( norm ) ) \ { \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ diagoffx, \ uplox, \ m, \ n, \ x, rs_x, cs_x, \ cntx, \ rntm \ ); \ \ /* Check the 1-norm of the randomzied matrix. In the unlikely event that the 1-norm is zero, it means that *all* elements are zero, in which case we want to re-randomize until the 1-norm is not zero. */ \ PASTEMAC2(ch,norm1m,BLIS_TAPI_EX_SUF) \ ( \ diagoffx, \ BLIS_NONUNIT_DIAG, \ uplox, \ m, \ n, \ x, rs_x, cs_x, \ &norm, \ cntx, \ rntm \ ); \ } \ } INSERT_GENTFUNCR_BASIC0( randm ) INSERT_GENTFUNCR_BASIC0( randnm ) #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname ) \ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ) \ { \ bli_init_once(); \ \ BLIS_TAPI_EX_DECLS \ \ /* If x is zero length, return with scale and sumsq unchanged. 
*/ \ if ( bli_zero_dim1( n ) ) return; \ \ /* Obtain a valid context from the gks if necessary. */ \ /*if ( cntx == NULL ) cntx = bli_gks_query_cntx();*/ \ \ /* Invoke the helper variant, which loops over the appropriate kernel to implement the current operation. */ \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ n, \ x, incx, \ scale, \ sumsq, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNCR_BASIC0( sumsqv ) // ----------------------------------------------------------------------------- // Operations with only basic interfaces. #ifdef BLIS_TAPI_BASIC #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi, \ bool* is_eq \ ) \ { \ bli_init_once(); \ \ ctype chi_conj; \ \ PASTEMAC(ch,copycjs)( conjchi, *chi, chi_conj ); \ \ *is_eq = PASTEMAC(ch,eq)( chi_conj, *psi ); \ } INSERT_GENTFUNC_BASIC0( eqsc ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ bool* is_eq \ ) \ { \ bli_init_once(); \ \ /* If x is zero length, return with a result of TRUE. */ \ if ( bli_zero_dim1( n ) ) { *is_eq = TRUE; return; } \ \ /* Obtain a valid context from the gks if necessary. */ \ /*if ( cntx == NULL ) cntx = bli_gks_query_cntx();*/ \ \ *is_eq = PASTEMAC2(ch,opname,_unb_var1) \ ( \ conjx, \ n, \ x, incx, \ y, incy \ ); \ } INSERT_GENTFUNC_BASIC0( eqv ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ bool* is_eq \ ) \ { \ bli_init_once(); \ \ /* If x has a zero dimension, return with a result of TRUE. See the _unb_var() variant for why we return TRUE in this scenario. */ \ if ( bli_zero_dim2( m, n ) ) { *is_eq = TRUE; return; } \ \ /* Obtain a valid context from the gks if necessary. 
*/ \ /*if ( cntx == NULL ) cntx = bli_gks_query_cntx();*/ \ \ /* Invoke the helper variant. */ \ *is_eq = PASTEMAC2(ch,opname,_unb_var1) \ ( \ diagoffx, \ diagx, \ uplox, \ transx, \ m, \ n, \ x, rs_x, cs_x, \ y, rs_y, cs_y \ ); \ } INSERT_GENTFUNC_BASIC0( eqm ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, varname ) \ \ void PASTEMAC(ch,opname) \ ( \ char* s1, \ dim_t n, \ void* x, inc_t incx, \ char* format, \ char* s2 \ ) \ { \ bli_init_once(); \ \ PASTEMAC(ch,varname) \ ( \ stdout, \ s1, \ n, \ x, incx, \ format, \ s2 \ ); \ } INSERT_GENTFUNC_BASIC_I( printv, fprintv ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, varname ) \ \ void PASTEMAC(ch,opname) \ ( \ char* s1, \ dim_t m, \ dim_t n, \ void* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ) \ { \ bli_init_once(); \ \ PASTEMAC(ch,varname) \ ( \ stdout, \ s1, \ m, \ n, \ x, rs_x, cs_x, \ format, \ s2 \ ); \ } INSERT_GENTFUNC_BASIC_I( printm, fprintm ) #endif // #ifdef BLIS_TAPI_BASIC #endif cython-blis-0.9.1/blis/_src/frame/util/bli_util_tapi.h000066400000000000000000000133361427272030600227040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) 
INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. #ifdef BLIS_TAPI_BASIC #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ char* s1, \ dim_t n, \ void* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printv ) #undef GENTPROT #define 
GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ char* s1, \ dim_t m, \ dim_t n, \ void* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printm ) #endif // #ifdef BLIS_TAPI_BASIC cython-blis-0.9.1/blis/_src/frame/util/bli_util_tapi_ba.c000066400000000000000000000036671427272030600233470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // omitting expert parameters. #include "bli_tapi_ba.h" // Define the macro protecting the typed API definitions. #define BLIS_ENABLE_TAPI // Include the typed API definitions here. #include "bli_util_tapi.c" cython-blis-0.9.1/blis/_src/frame/util/bli_util_tapi_ex.c000066400000000000000000000036651427272030600233770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Include cpp macros that instantiate the API definition templates as // having expert parameters. #include "bli_tapi_ex.h" // Define the macro protecting the typed API definitions. #define BLIS_ENABLE_TAPI // Include the typed API definitions here. #include "bli_util_tapi.c" cython-blis-0.9.1/blis/_src/frame/util/bli_util_unb_var1.c000066400000000000000000001010061427272030600234470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include // // Define BLAS-like interfaces with typed operands. // #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ ctype* chi1; \ ctype_r chi1_r; \ ctype_r chi1_i; \ ctype_r absum; \ dim_t i; \ \ /* Initialize the absolute sum accumulator to zero. */ \ PASTEMAC(chr,set0s)( absum ); \ \ for ( i = 0; i < n; ++i ) \ { \ chi1 = x + (i )*incx; \ \ /* Get the real and imaginary components of chi1. */ \ PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \ \ /* Replace chi1_r and chi1_i with their absolute values. */ \ chi1_r = bli_fabs( chi1_r ); \ chi1_i = bli_fabs( chi1_i ); \ \ /* Accumulate the real and imaginary components into absum. */ \ PASTEMAC(chr,adds)( chi1_r, absum ); \ PASTEMAC(chr,adds)( chi1_i, absum ); \ } \ \ /* Store the final value of absum to the output variable. 
*/ \ PASTEMAC(chr,copys)( absum, *asum ); \ } INSERT_GENTFUNCR_BASIC0( asumv_unb_var1 ) #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ ctype_r* zeror = PASTEMAC(chr,0); \ doff_t diagoffa; \ \ /* If the dimension is zero, return early. */ \ if ( bli_zero_dim1( m ) ) return; \ \ /* In order to avoid the main diagonal, we must nudge the diagonal either up or down by one, depending on which triangle is currently stored. */ \ if ( bli_is_upper( uploa ) ) diagoffa = 1; \ else /*if ( bli_is_lower( uploa ) )*/ diagoffa = -1; \ \ /* We will be reflecting the stored region over the diagonal into the unstored region, so a transposition is necessary. Furthermore, since we are creating a Hermitian matrix, we must also conjugate. */ \ PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \ ( \ diagoffa, \ BLIS_NONUNIT_DIAG, \ uploa, \ BLIS_CONJ_TRANSPOSE, \ m, \ m, \ a, rs_a, cs_a, \ a, rs_a, cs_a, \ cntx, \ rntm \ ); \ \ /* Set the imaginary parts of the diagonal elements to zero. */ \ PASTEMAC2(ch,setid,BLIS_TAPI_EX_SUF) \ ( \ 0, \ m, \ m, \ zeror, \ a, rs_a, cs_a, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNCR_BASIC0( mkherm_unb_var1 ) #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ doff_t diagoffa; \ \ /* If the dimension is zero, return early. */ \ if ( bli_zero_dim1( m ) ) return; \ \ /* In order to avoid the main diagonal, we must nudge the diagonal either up or down by one, depending on which triangle is currently stored. */ \ if ( bli_is_upper( uploa ) ) diagoffa = 1; \ else /*if ( bli_is_lower( uploa ) )*/ diagoffa = -1; \ \ /* We will be reflecting the stored region over the diagonal into the unstored region, so a transposition is necessary. 
*/ \ PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \ ( \ diagoffa, \ BLIS_NONUNIT_DIAG, \ uploa, \ BLIS_TRANSPOSE, \ m, \ m, \ a, rs_a, cs_a, \ a, rs_a, cs_a, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNC_BASIC0( mksymm_unb_var1 ) #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ ctype* zero = PASTEMAC(ch,0); \ doff_t diagoffa; \ \ /* If the dimension is zero, return early. */ \ if ( bli_zero_dim1( m ) ) return; \ \ /* Toggle uplo so that it refers to the unstored triangle. */ \ bli_toggle_uplo( &uploa ); \ \ /* In order to avoid the main diagonal, we must nudge the diagonal either up or down by one, depending on which triangle is to be zeroed. */ \ if ( bli_is_upper( uploa ) ) diagoffa = 1; \ else /*if ( bli_is_lower( uploa ) )*/ diagoffa = -1; \ \ /* Set the unstored triangle to zero. */ \ PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ diagoffa, \ BLIS_NONUNIT_DIAG, \ uploa, \ m, \ m, \ zero, \ a, rs_a, cs_a, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNC_BASIC0( mktrim_unb_var1 ) #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ ctype* chi1; \ ctype_r abs_chi1; \ ctype_r absum; \ dim_t i; \ \ /* Initialize the absolute sum accumulator to zero. */ \ PASTEMAC(chr,set0s)( absum ); \ \ for ( i = 0; i < n; ++i ) \ { \ chi1 = x + (i )*incx; \ \ /* Compute the absolute value (or complex magnitude) of chi1. */ \ PASTEMAC2(ch,chr,abval2s)( *chi1, abs_chi1 ); \ \ /* Accumulate the absolute value of chi1 into absum. */ \ PASTEMAC(chr,adds)( abs_chi1, absum ); \ } \ \ /* Store final value of absum to the output variable. 
*/ \ PASTEMAC(chr,copys)( absum, *norm ); \ } INSERT_GENTFUNCR_BASIC0( norm1v_unb_var1 ) #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ ctype_r* zero = PASTEMAC(chr,0); \ ctype_r* one = PASTEMAC(chr,1); \ ctype_r scale; \ ctype_r sumsq; \ ctype_r sqrt_sumsq; \ \ /* Initialize scale and sumsq to begin the summation. */ \ PASTEMAC(chr,copys)( *zero, scale ); \ PASTEMAC(chr,copys)( *one, sumsq ); \ \ /* Compute the sum of the squares of the vector. */ \ PASTEMAC(ch,kername) \ ( \ n, \ x, incx, \ &scale, \ &sumsq, \ cntx, \ rntm \ ); \ \ /* Compute: norm = scale * sqrt( sumsq ) */ \ PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \ PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \ \ /* Store the final value to the output variable. */ \ PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \ } //INSERT_GENTFUNCR_BASIC( normfv_unb_var1, sumsqv_unb_var1 ) GENTFUNCR( scomplex, float, c, s, normfv_unb_var1, sumsqv_unb_var1 ) GENTFUNCR( dcomplex, double, z, d, normfv_unb_var1, sumsqv_unb_var1 ) #undef GENTFUNCR // We've disabled the dotv-based implementation because that method of // computing the sum of the squares of x inherently does not check for // overflow. Instead, we use the fallback method based on sumsqv, which // takes care to not overflow unnecessarily (ie: takes care for the // sqrt( sum of the squares of x ) to not overflow if the sum of the // squares of x would normally overflow. See GitHub issue #332 for // discussion. 
#if 0 //defined(FE_OVERFLOW) && !defined(__APPLE__) #define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ ctype_r* zero = PASTEMAC(chr,0); \ ctype_r* one = PASTEMAC(chr,1); \ ctype_r scale; \ ctype_r sumsq; \ ctype_r sqrt_sumsq; \ \ /* Initialize scale and sumsq to begin the summation. */ \ PASTEMAC(chr,copys)( *zero, scale ); \ PASTEMAC(chr,copys)( *one, sumsq ); \ \ /* An optimization: first try to use dotv to compute the sum of the squares of the vector. If no floating-point exceptions (specifically, overflow and invalid exceptions) were produced, then we accept the computed value and returne early. The cost of this optimization is the "sunk" cost of the initial dotv when sumsqv must be used instead. However, we expect that the vast majority of use cases will not produce exceptions, and therefore only one pass through the data, via dotv, will be required. */ \ if ( TRUE ) \ { \ int f_exp_raised;\ ctype sumsqc; \ \ feclearexcept( FE_ALL_EXCEPT );\ \ PASTEMAC2(ch,dotv,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ BLIS_NO_CONJUGATE, \ n,\ x, incx, \ x, incx, \ &sumsqc, \ cntx, \ rntm \ ); \ \ PASTEMAC2(ch,chr,copys)( sumsqc, sumsq ); \ \ f_exp_raised = fetestexcept( FE_OVERFLOW | FE_INVALID );\ \ if ( !f_exp_raised ) \ { \ PASTEMAC(chr,sqrt2s)( sumsq, *norm ); \ return; \ } \ } \ \ /* Compute the sum of the squares of the vector. */ \ PASTEMAC(ch,kername) \ ( \ n, \ x, incx, \ &scale, \ &sumsq, \ cntx, \ rntm \ ); \ \ /* Compute: norm = scale * sqrt( sumsq ) */ \ PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \ PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \ \ /* Store the final value to the output variable. 
*/ \ PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \ } #else #define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ ctype_r* zero = PASTEMAC(chr,0); \ ctype_r* one = PASTEMAC(chr,1); \ ctype_r scale; \ ctype_r sumsq; \ ctype_r sqrt_sumsq; \ \ /* Initialize scale and sumsq to begin the summation. */ \ PASTEMAC(chr,copys)( *zero, scale ); \ PASTEMAC(chr,copys)( *one, sumsq ); \ \ /* Compute the sum of the squares of the vector. */ \ \ PASTEMAC(ch,kername) \ ( \ n, \ x, incx, \ &scale, \ &sumsq, \ cntx, \ rntm \ ); \ \ /* Compute: norm = scale * sqrt( sumsq ) */ \ PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \ PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \ \ /* Store the final value to the output variable. */ \ PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \ } #endif GENTFUNCR( float, float, s, s, normfv_unb_var1, sumsqv_unb_var1 ) GENTFUNCR( double, double, d, d, normfv_unb_var1, sumsqv_unb_var1 ) #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ ctype* chi1; \ ctype_r abs_chi1; \ ctype_r abs_chi1_max; \ dim_t i; \ \ /* Initialize the maximum absolute value to zero. */ \ PASTEMAC(chr,set0s)( abs_chi1_max ); \ \ for ( i = 0; i < n; ++i ) \ { \ chi1 = x + (i )*incx; \ \ /* Compute the absolute value (or complex magnitude) of chi1. */ \ PASTEMAC2(ch,chr,abval2s)( *chi1, abs_chi1 ); \ \ /* If the absolute value of the current element exceeds that of the previous largest, save it and its index. If NaN is encountered, then treat it the same as if it were a valid value that was larger than any previously seen. This behavior mimics that of LAPACK's ?lange(). 
*/ \ if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \ { \ PASTEMAC(chr,copys)( abs_chi1, abs_chi1_max ); \ } \ } \ \ /* Store the final value to the output variable. */ \ PASTEMAC(chr,copys)( abs_chi1_max, *norm ); \ } INSERT_GENTFUNCR_BASIC0( normiv_unb_var1 ) #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ ctype* one = PASTEMAC(ch,1); \ ctype* x0; \ ctype* chi1; \ ctype* x2; \ ctype_r absum_max; \ ctype_r absum_j; \ ctype_r abval_chi1; \ uplo_t uplox_eff; \ dim_t n_iter; \ dim_t n_elem, n_elem_max; \ inc_t ldx, incx; \ dim_t j, i; \ dim_t ij0, n_shift; \ \ /* Initialize the maximum absolute column sum to zero. */ \ PASTEMAC(chr,set0s)( absum_max ); \ \ /* If either dimension is zero, return with absum_max equal to zero. */ \ if ( bli_zero_dim2( m, n ) ) \ { \ PASTEMAC(chr,copys)( absum_max, *norm ); \ return; \ } \ \ /* Set various loop parameters. */ \ bli_set_dims_incs_uplo_1m_noswap \ ( \ diagoffx, BLIS_NONUNIT_DIAG, \ uplox, m, n, rs_x, cs_x, \ &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, \ &ij0, &n_shift \ ); \ \ /* If the matrix is zeros, return with absum_max equal to zero. */ \ if ( bli_is_zeros( uplox_eff ) ) \ { \ PASTEMAC(chr,copys)( absum_max, *norm ); \ return; \ } \ \ \ /* Handle dense and upper/lower storage cases separately. */ \ if ( bli_is_dense( uplox_eff ) ) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ n_elem = n_elem_max; \ \ x0 = x + (j )*ldx + (0 )*incx; \ \ /* Compute the norm of the current column. */ \ PASTEMAC(ch,kername) \ ( \ n_elem, \ x0, incx, \ &absum_j, \ cntx, \ rntm \ ); \ \ /* If absum_j is greater than the previous maximum value, then save it. 
*/ \ if ( absum_max < absum_j || bli_isnan( absum_j ) ) \ { \ PASTEMAC(chr,copys)( absum_j, absum_max ); \ } \ } \ } \ else \ { \ if ( bli_is_upper( uplox_eff ) ) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ \ x0 = x + (ij0+j )*ldx + (0 )*incx; \ chi1 = x + (ij0+j )*ldx + (n_elem-1)*incx; \ \ /* Compute the norm of the super-diagonal elements. */ \ PASTEMAC(ch,kername) \ ( \ n_elem - 1, \ x0, incx, \ &absum_j, \ cntx, \ rntm \ ); \ \ if ( bli_is_unit_diag( diagx ) ) chi1 = one; \ \ /* Handle the diagonal element separately in case it's unit. */ \ PASTEMAC2(ch,chr,abval2s)( *chi1, abval_chi1 ); \ PASTEMAC(chr,adds)( abval_chi1, absum_j ); \ \ /* If absum_j is greater than the previous maximum value, then save it. */ \ if ( absum_max < absum_j || bli_isnan( absum_j ) ) \ { \ PASTEMAC(chr,copys)( absum_j, absum_max ); \ } \ } \ } \ else if ( bli_is_lower( uplox_eff ) ) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ i = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ n_elem = n_elem_max - i; \ \ chi1 = x + (j )*ldx + (ij0+i )*incx; \ x2 = x + (j )*ldx + (ij0+i+1)*incx; \ \ /* Compute the norm of the sub-diagonal elements. */ \ PASTEMAC(ch,kername) \ ( \ n_elem - 1, \ x2, incx, \ &absum_j, \ cntx, \ rntm \ ); \ \ if ( bli_is_unit_diag( diagx ) ) chi1 = one; \ \ /* Handle the diagonal element separately in case it's unit. */ \ PASTEMAC2(ch,chr,abval2s)( *chi1, abval_chi1 ); \ PASTEMAC(chr,adds)( abval_chi1, absum_j ); \ \ /* If absum_j is greater than the previous maximum value, then save it. */ \ if ( absum_max < absum_j || bli_isnan( absum_j ) ) \ { \ PASTEMAC(chr,copys)( absum_j, absum_max ); \ } \ } \ } \ } \ \ /* Store final value of absum_max to the output variable. 
*/ \ PASTEMAC(chr,copys)( absum_max, *norm ); \ } INSERT_GENTFUNCR_BASIC( norm1m_unb_var1, norm1v_unb_var1 ) #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ ctype* one = PASTEMAC(ch,1); \ ctype_r* one_r = PASTEMAC(chr,1); \ ctype_r* zero_r = PASTEMAC(chr,0); \ ctype* x0; \ ctype* chi1; \ ctype* x2; \ ctype_r scale; \ ctype_r sumsq; \ ctype_r sqrt_sumsq; \ uplo_t uplox_eff; \ dim_t n_iter; \ dim_t n_elem, n_elem_max; \ inc_t ldx, incx; \ dim_t j, i; \ dim_t ij0, n_shift; \ \ /* Return a norm of zero if either dimension is zero. */ \ if ( bli_zero_dim2( m, n ) ) \ { \ PASTEMAC(chr,set0s)( *norm ); \ return; \ } \ \ /* Set various loop parameters. Here, we pretend that diagx is equal to BLIS_NONUNIT_DIAG because we handle the unit diagonal case manually. */ \ bli_set_dims_incs_uplo_1m \ ( \ diagoffx, BLIS_NONUNIT_DIAG, \ uplox, m, n, rs_x, cs_x, \ &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, \ &ij0, &n_shift \ ); \ \ /* Check the effective uplo; if it's zeros, then our norm is zero. */ \ if ( bli_is_zeros( uplox_eff ) ) \ { \ PASTEMAC(chr,set0s)( *norm ); \ return; \ } \ \ /* Initialize scale and sumsq to begin the summation. */ \ PASTEMAC(chr,copys)( *zero_r, scale ); \ PASTEMAC(chr,copys)( *one_r, sumsq ); \ \ /* Handle dense and upper/lower storage cases separately. */ \ if ( bli_is_dense( uplox_eff ) ) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ n_elem = n_elem_max; \ \ x0 = x + (j )*ldx + (0 )*incx; \ \ /* Compute the norm of the current column. 
*/ \ PASTEMAC(ch,kername) \ ( \ n_elem, \ x0, incx, \ &scale, \ &sumsq, \ cntx, \ rntm \ ); \ } \ } \ else \ { \ if ( bli_is_upper( uplox_eff ) ) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ n_elem = bli_min( n_shift + j + 1, n_elem_max ); /* length of this column's stored part, diagonal included */ \ \ x0 = x + (ij0+j )*ldx + (0 )*incx; \ chi1 = x + (ij0+j )*ldx + (n_elem-1)*incx; \ \ /* Sum the squares of the super-diagonal elements. */ \ PASTEMAC(ch,kername) \ ( \ n_elem - 1, \ x0, incx, \ &scale, \ &sumsq, \ cntx, \ rntm \ ); \ \ if ( bli_is_unit_diag( diagx ) ) chi1 = one; \ \ /* Handle the diagonal element separately in case it's unit; chi1 then points at the constant one rather than into x. */ \ PASTEMAC(ch,kername) \ ( \ 1, \ chi1, incx, \ &scale, \ &sumsq, \ cntx, \ rntm \ ); \ } \ } \ else if ( bli_is_lower( uplox_eff ) ) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ i = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); /* offset of the diagonal element within this column */ \ n_elem = n_elem_max - i; \ \ chi1 = x + (j )*ldx + (ij0+i )*incx; \ x2 = x + (j )*ldx + (ij0+i+1)*incx; \ \ /* Sum the squares of the sub-diagonal elements. */ \ PASTEMAC(ch,kername) \ ( \ n_elem - 1, \ x2, incx, \ &scale, \ &sumsq, \ cntx, \ rntm \ ); \ \ if ( bli_is_unit_diag( diagx ) ) chi1 = one; \ \ /* Handle the diagonal element separately in case it's unit; chi1 then points at the constant one rather than into x. */ \ PASTEMAC(ch,kername) \ ( \ 1, \ chi1, incx, \ &scale, \ &sumsq, \ cntx, \ rntm \ ); \ } \ } \ } \ \ /* Compute: norm = scale * sqrt( sumsq ) */ \ PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \ PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \ \ /* Store the final value to the output variable. */ \ PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \ } INSERT_GENTFUNCR_BASIC( normfm_unb_var1, sumsqv_unb_var1 ) /* normim: infinity-norm of x, obtained by swapping the roles of rows and columns in the arguments and calling the kername 1-norm routine on the result. */ #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ /* Induce a transposition so that rows become columns.
*/ \ bli_swap_dims( &m, &n ); \ bli_swap_incs( &rs_x, &cs_x ); \ bli_toggle_uplo( &uplox ); \ bli_negate_diag_offset( &diagoffx ); \ \ /* Now we can simply compute the 1-norm of this transposed matrix, which will be equivalent to the infinity-norm of the original matrix. */ \ PASTEMAC(ch,kername) \ ( \ diagoffx, \ diagx, \ uplox, \ m, \ n, \ x, rs_x, cs_x, \ norm, \ cntx, \ rntm \ ); \ } INSERT_GENTFUNCR_BASIC( normim_unb_var1, norm1m_unb_var1 ) /* randv / randnv: applies the randmac scalar macro to each of the n elements of x, stepping by incx. cntx and rntm are accepted but not referenced. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname, randmac ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ ctype* chi1; \ dim_t i; \ \ chi1 = x; \ \ for ( i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,randmac)( *chi1 ); \ \ chi1 += incx; \ } \ } INSERT_GENTFUNC_BASIC( randv_unb_var1, rands ) INSERT_GENTFUNC_BASIC( randnv_unb_var1, randnp2s ) /* randm / randnm: randomizes the stored region of the m x n matrix x by calling the kername vector routine once per iteration of the outer loop. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, varname, kername ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ ctype* one = PASTEMAC(ch,1); \ ctype* x0; \ ctype* x1; \ ctype* x2; \ ctype* chi1; \ ctype beta; \ ctype omega; \ double max_m_n; \ uplo_t uplox_eff; \ dim_t n_iter; \ dim_t n_elem, n_elem_max; \ inc_t ldx, incx; \ dim_t j, i; \ dim_t ij0, n_shift; \ \ /* Set various loop parameters. This operation takes no diagx argument, so BLIS_NONUNIT_DIAG is passed unconditionally. */ \ bli_set_dims_incs_uplo_1m \ ( \ diagoffx, BLIS_NONUNIT_DIAG, \ uplox, m, n, rs_x, cs_x, \ &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, \ &ij0, &n_shift \ ); \ \ if ( bli_is_zeros( uplox_eff ) ) return; /* no stored region, so nothing to randomize */ \ \ /* Handle dense and upper/lower storage cases separately.
*/ \ if ( bli_is_dense( uplox_eff ) ) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ n_elem = n_elem_max; \ \ x1 = x + (j )*ldx + (0 )*incx; \ \ /*PASTEMAC2(ch,kername,BLIS_TAPI_EX_SUF)*/ \ PASTEMAC(ch,kername) \ ( \ n_elem, \ x1, incx, \ cntx, \ rntm \ ); \ } \ } \ else \ { \ max_m_n = bli_max( m, n ); \ \ PASTEMAC2(d,ch,sets)( max_m_n, 0.0, omega ); \ PASTEMAC(ch,copys)( *one, beta ); \ PASTEMAC(ch,invscals)( omega, beta ); /* beta is presumably 1/max(m,n); it is referenced only by the commented-out scalv calls below */ \ \ if ( bli_is_upper( uplox_eff ) ) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ \ x1 = x + (ij0+j )*ldx + (0 )*incx; \ x0 = x1; \ chi1 = x1 + (n_elem-1)*incx; \ \ /*PASTEMAC2(ch,kername,BLIS_TAPI_EX_SUF)*/ \ PASTEMAC(ch,kername) \ ( \ n_elem, \ x1, incx, \ cntx, \ rntm \ ); \ \ ( void )x0; \ ( void )chi1; /* x0 and chi1 are otherwise used only by the disabled code below */ \ /* (Currently disabled.) We want positive diagonal elements between 1 and 2. */ \ /* PASTEMAC(ch,abval2s)( *chi1, *chi1 ); \ PASTEMAC(ch,adds)( *one, *chi1 ); \ */ \ \ /* (Currently disabled.) Scale the super-diagonal elements by 1/max(m,n). */ \ /* PASTEMAC(ch,scalv) \ ( \ BLIS_NO_CONJUGATE, \ n_elem - 1, \ &beta, \ x0, incx, \ cntx \ ); \ */ \ } \ } \ else if ( bli_is_lower( uplox_eff ) ) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ i = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ n_elem = n_elem_max - i; \ \ x1 = x + (j )*ldx + (ij0+i )*incx; \ x2 = x1 + incx; \ chi1 = x1; \ \ /*PASTEMAC2(ch,kername,BLIS_TAPI_EX_SUF)*/ \ PASTEMAC(ch,kername) \ ( \ n_elem, \ x1, incx, \ cntx, \ rntm \ ); \ \ ( void )x2; \ ( void )chi1; /* x2 and chi1 are otherwise used only by the disabled code below */ \ /* (Currently disabled.) We want positive diagonal elements between 1 and 2. */ \ /* PASTEMAC(ch,abval2s)( *chi1, *chi1 ); \ PASTEMAC(ch,adds)( *one, *chi1 ); \ */ \ \ /* Scale the sub-diagonal elements by 1/max(m,n).
*/ \ /* PASTEMAC(ch,scalv) \ ( \ BLIS_NO_CONJUGATE, \ n_elem - 1, \ &beta, \ x2, incx, \ cntx \ ); \ */ \ } \ } \ } \ } INSERT_GENTFUNC_BASIC( randm_unb_var1, randv_unb_var1 ) INSERT_GENTFUNC_BASIC( randnm_unb_var1, randnv_unb_var1 ) /* sumsqv: folds the real and imaginary parts of the n elements of x (stride incx) into the running pair ( *scale, *sumsq ). Both are read on entry and overwritten on exit. cntx and rntm are accepted but not referenced. */ #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq, \ cntx_t* cntx, \ rntm_t* rntm \ ) \ { \ const ctype_r zero_r = *PASTEMAC(chr,0); \ const ctype_r one_r = *PASTEMAC(chr,1); \ \ ctype* chi1; \ ctype_r chi1_r; \ ctype_r chi1_i; \ ctype_r scale_r; \ ctype_r sumsq_r; \ ctype_r abs_chi1_r; \ dim_t i; \ \ /* NOTE: This function attempts to mimic the algorithm for computing the Frobenius norm in netlib LAPACK's ?lassq(). */ \ \ /* Copy scale and sumsq to local variables. */ \ PASTEMAC(chr,copys)( *scale, scale_r ); \ PASTEMAC(chr,copys)( *sumsq, sumsq_r ); \ \ chi1 = x; \ \ for ( i = 0; i < n; ++i ) \ { \ /* Get the real and imaginary components of chi1. */ \ PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \ \ abs_chi1_r = bli_fabs( chi1_r ); \ \ /* Accumulate real component into sumsq, adjusting scale if needed. Exact zeros are skipped; NaN components are not. */ \ if ( abs_chi1_r > zero_r || bli_isnan( abs_chi1_r) ) \ { \ if ( scale_r < abs_chi1_r ) /* new largest magnitude: re-express sumsq relative to it */ \ { \ sumsq_r = one_r + \ sumsq_r * ( scale_r / abs_chi1_r ) * \ ( scale_r / abs_chi1_r ); \ \ PASTEMAC(chr,copys)( abs_chi1_r, scale_r ); \ } \ else \ { \ sumsq_r = sumsq_r + ( abs_chi1_r / scale_r ) * \ ( abs_chi1_r / scale_r ); \ } \ } \ \ abs_chi1_r = bli_fabs( chi1_i ); /* abs_chi1_r is reused for the imaginary component */ \ \ /* Accumulate imaginary component into sumsq, adjusting scale if needed.
*/ \ if ( abs_chi1_r > zero_r || bli_isnan( abs_chi1_r) ) \ { \ if ( scale_r < abs_chi1_r ) /* new largest magnitude: re-express sumsq relative to it */ \ { \ sumsq_r = one_r + \ sumsq_r * ( scale_r / abs_chi1_r ) * \ ( scale_r / abs_chi1_r ); \ \ PASTEMAC(chr,copys)( abs_chi1_r, scale_r ); \ } \ else \ { \ sumsq_r = sumsq_r + ( abs_chi1_r / scale_r ) * \ ( abs_chi1_r / scale_r ); \ } \ } \ \ chi1 += incx; \ } \ \ /* Store final values of scale and sumsq to output variables. */ \ PASTEMAC(chr,copys)( scale_r, *scale ); \ PASTEMAC(chr,copys)( sumsq_r, *sumsq ); \ } INSERT_GENTFUNCR_BASIC0( sumsqv_unb_var1 ) // ----------------------------------------------------------------------------- /* eqv: returns FALSE at the first index i in [0,n) where x[i] (conjugated when conjx requests it) differs from y[i], and TRUE otherwise, including when the loop runs zero times. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ bool PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ ) \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ ctype* chi1 = x + (i )*incx; \ ctype* psi1 = y + (i )*incy; \ \ ctype chi1c; /* *chi1 after the optional conjugation */ \ \ if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *chi1, chi1c ); } \ else { PASTEMAC(ch,copys)( *chi1, chi1c ); } \ \ if ( !PASTEMAC(ch,eq)( chi1c, *psi1 ) ) \ return FALSE; \ } \ \ return TRUE; \ } INSERT_GENTFUNC_BASIC0( eqv_unb_var1 ) /* eqm: returns TRUE unless some stored element of x (conjugated when transx carries a conjugation) differs from the corresponding element of y. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ bool PASTEMAC(ch,opname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ ) \ { \ uplo_t uplox_eff; \ conj_t conjx; \ dim_t n_iter; \ dim_t n_elem_max; \ inc_t ldx, incx; \ inc_t ldy, incy; \ dim_t ij0, n_shift; \ \ /* Set various loop parameters. */ \ bli_set_dims_incs_uplo_2m \ ( \ diagoffx, diagx, transx, \ uplox, m, n, rs_x, cs_x, rs_y, cs_y, \ &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \ &ij0, &n_shift \ ); \ \ /* In the odd case where we are comparing against a complete unstored matrix, we assert equality. Why? We assume the matrices are equal unless we can find two corresponding elements that are unequal.
So if there are no elements, there is no inequality. Granted, this logic is strange to think about no matter what, and thankfully it should never be used under normal usage. */ \ if ( bli_is_zeros( uplox_eff ) ) return TRUE; \ \ /* Extract the conjugation component from the transx parameter. */ \ conjx = bli_extract_conj( transx ); \ \ /* Handle dense and upper/lower storage cases separately. Each case walks n_iter vectors and returns FALSE at the first unequal pair of elements. */ \ if ( bli_is_dense( uplox_eff ) ) \ { \ for ( dim_t j = 0; j < n_iter; ++j ) \ { \ const dim_t n_elem = n_elem_max; \ \ ctype* x1 = x + (j )*ldx + (0 )*incx; \ ctype* y1 = y + (j )*ldy + (0 )*incy; \ \ for ( dim_t i = 0; i < n_elem; ++i ) \ { \ ctype* x11 = x1 + (i )*incx; \ ctype* y11 = y1 + (i )*incy; \ ctype x11c; /* *x11 after the optional conjugation */ \ \ if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \ else { PASTEMAC(ch,copys)( *x11, x11c ); } \ \ if ( !PASTEMAC(ch,eq)( x11c, *y11 ) ) \ return FALSE; \ } \ } \ } \ else \ { \ if ( bli_is_upper( uplox_eff ) ) \ { \ for ( dim_t j = 0; j < n_iter; ++j ) \ { \ const dim_t n_elem = bli_min( n_shift + j + 1, n_elem_max ); /* number of elements compared in this iteration */ \ \ ctype* x1 = x + (ij0+j )*ldx + (0 )*incx; \ ctype* y1 = y + (ij0+j )*ldy + (0 )*incy; \ \ for ( dim_t i = 0; i < n_elem; ++i ) \ { \ ctype* x11 = x1 + (i )*incx; \ ctype* y11 = y1 + (i )*incy; \ ctype x11c; /* *x11 after the optional conjugation */ \ \ if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \ else { PASTEMAC(ch,copys)( *x11, x11c ); } \ \ if ( !PASTEMAC(ch,eq)( x11c, *y11 ) ) \ return FALSE; \ } \ } \ } \ else if ( bli_is_lower( uplox_eff ) ) \ { \ for ( dim_t j = 0; j < n_iter; ++j ) \ { \ const dim_t offi = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); /* leading elements skipped in this iteration */ \ const dim_t n_elem = n_elem_max - offi; \ \ ctype* x1 = x + (j )*ldx + (ij0+offi )*incx; \ ctype* y1 = y + (j )*ldy + (ij0+offi )*incy; \ \ for ( dim_t i = 0; i < n_elem; ++i ) \ { \ ctype* x11 = x1 + (i )*incx; \ ctype* y11 = y1 + (i )*incy; \ ctype x11c; /* *x11 after the optional conjugation */ \ \ if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \ else { PASTEMAC(ch,copys)( *x11, x11c ); } \ \ if (
!PASTEMAC(ch,eq)( x11c, *y11 ) ) \ return FALSE; \ } \ } \ } \ } \ \ return TRUE; \ } INSERT_GENTFUNC_BASIC0( eqm_unb_var1 ) /* fprintv: writes s1 on its own line, then each of the n elements of x (stride incx) on its own line using format, then s2 on its own line. A NULL format selects the per-type default format spec. Unlike fprintm, this routine does not flush file. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ FILE* file, \ char* s1, \ dim_t n, \ ctype* x, inc_t incx, \ char* format, \ char* s2 \ ) \ { \ dim_t i; \ ctype* chi1; \ char default_spec[32] = PASTEMAC(ch,formatspec)(); \ \ if ( format == NULL ) format = default_spec; /* fall back to the per-type default */ \ \ chi1 = x; \ \ fprintf( file, "%s\n", s1 ); \ \ for ( i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,fprints)( file, format, *chi1 ); \ fprintf( file, "\n" ); \ \ chi1 += incx; \ } \ \ fprintf( file, "%s\n", s2 ); \ } INSERT_GENTFUNC_BASIC0_I( fprintv ) /* fprintm: writes s1 on its own line, then the m x n matrix x one row per line with a space after every element, then s2 on its own line, and finally flushes file. A NULL format selects the per-type default format spec. */ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ FILE* file, \ char* s1, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ) \ { \ dim_t i, j; \ ctype* chi1; \ char default_spec[32] = PASTEMAC(ch,formatspec)(); \ \ if ( format == NULL ) format = default_spec; /* fall back to the per-type default */ \ \ fprintf( file, "%s\n", s1 ); \ \ for ( i = 0; i < m; ++i ) \ { \ for ( j = 0; j < n; ++j ) \ { \ chi1 = (( ctype* ) x) + i*rs_x + j*cs_x; \ \ PASTEMAC(ch,fprints)( file, format, *chi1 ); \ fprintf( file, " " ); \ } \ \ fprintf( file, "\n" ); \ } \ \ fprintf( file, "%s\n", s2 ); \ fflush( file ); \ } INSERT_GENTFUNC_BASIC0_I( fprintm ) cython-blis-0.9.1/blis/_src/frame/util/bli_util_unb_var1.h000066400000000000000000000127111427272030600234600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( asumv_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( mkherm_unb_var1 ) INSERT_GENTPROT_BASIC0( mksymm_unb_var1 ) INSERT_GENTPROT_BASIC0( mktrim_unb_var1 ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 ) INSERT_GENTPROTR_BASIC0( normfv_unb_var1 ) INSERT_GENTPROTR_BASIC0( normiv_unb_var1 ) /* Matrix norm variants (norm1m, normfm, normim). Each writes its real-valued result to *norm. */ #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 ) INSERT_GENTPROTR_BASIC0( normfm_unb_var1 ) INSERT_GENTPROTR_BASIC0( normim_unb_var1 ) /* Element-wise randomization of the n elements of the vector x. */ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( randv_unb_var1 ) INSERT_GENTPROT_BASIC0( randnv_unb_var1 ) /* Randomization of the stored region of the matrix x, one vector at a time. */ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( randm_unb_var1 ) INSERT_GENTPROT_BASIC0( randnm_unb_var1 ) /* Scaled sum-of-squares accumulation; *scale and *sumsq are both read and updated. */ #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq, \ cntx_t*
cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 ) // ----------------------------------------------------------------------------- /* Vector equality test; returns TRUE when no pair of corresponding elements differs. */ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ bool PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ ); INSERT_GENTPROT_BASIC0( eqv_unb_var1 ) /* Matrix equality test over the stored region; returns TRUE when no pair of corresponding elements differs. */ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ bool PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ ); INSERT_GENTPROT_BASIC0( eqm_unb_var1 ) /* Exported vector printing routine; format may be NULL to use the per-type default. */ #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ FILE* file, \ char* s1, \ dim_t n, \ ctype* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( fprintv ) /* Exported matrix printing routine; format may be NULL to use the per-type default. */ #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ FILE* file, \ char* s1, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( fprintm ) cython-blis-0.9.1/blis/_src/include/000077500000000000000000000000001427272030600172615ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/darwin-firestorm/000077500000000000000000000000001427272030600225555ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/darwin-firestorm/blis.h000066400000000000000000045710671427272030600237030ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES.
// -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). // begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_FIRESTORM // Enabled sub-configurations (config_list) #define BLIS_CONFIG_FIRESTORM // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_ARMV8A #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 
#define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 0 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. 
#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). #if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_lang_defs.h #ifndef BLIS_LANG_DEFS_H #define BLIS_LANG_DEFS_H // -- Undefine restrict for C++ and C89/90 -- #ifdef __cplusplus // Language is C++; define restrict as nothing. #ifndef restrict #define restrict #endif #elif __STDC_VERSION__ >= 199901L // Language is C99 (or later); do nothing since restrict is recognized. #else // Language is pre-C99; define restrict as nothing. 
#ifndef restrict #define restrict #endif #endif // -- Define typeof() operator if using non-GNU compiler -- #ifndef __GNUC__ #define typeof __typeof__ #else #ifndef typeof #define typeof __typeof__ #endif #endif // -- BLIS Thread Local Storage Keyword -- // __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support __thread, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) #define BLIS_THREAD_LOCAL __thread #else #define BLIS_THREAD_LOCAL #endif // -- BLIS constructor/destructor function attribute -- // __attribute__((constructor/destructor)) is supported by GCC only. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support this, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__ICC) || defined(__INTEL_COMPILER) // ICC defines __GNUC__ but doesn't support this #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #elif defined(__clang__) // CLANG supports __attribute__, but its documentation doesn't // mention support for constructor/destructor. Compiling with // clang and testing shows that it does support. 
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. 
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. 
Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specified by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. #ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overridden by the main // program simply by providing a new definition.
However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // -- Common BLIS definitions -- // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. #ifndef TRUE #define TRUE true #endif #ifndef FALSE #define FALSE false #endif // -- Special-purpose integers -- // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DIM_T #define _DEFINED_DIM_T typedef gint_t dim_t; // dimension type #endif typedef gint_t inc_t; // increment/stride type typedef gint_t doff_t; // diagonal offset type typedef guint_t siz_t; // byte size type typedef uint32_t objbits_t; // object information bit field // -- Real types -- // Define the number of floating-point types supported, and the size of the // largest type. #define BLIS_NUM_FP_TYPES 4 #define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) // There are some places where we need to use sizeof() inside of a C // preprocessor #if conditional, and so here we define the various sizes // for those purposes. 
#define BLIS_SIZEOF_S 4  // sizeof(float)
#define BLIS_SIZEOF_D 8  // sizeof(double)
#define BLIS_SIZEOF_C 8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z 16 // sizeof(dcomplex)

// -- Complex types --

#ifdef BLIS_ENABLE_C99_COMPLEX

#if __STDC_VERSION__ >= 199901L
#include <complex.h> // skipped

// Typedef official complex types to BLIS complex type names.
typedef float complex scomplex;
typedef double complex dcomplex;
#else
#error "Configuration requested C99 complex types, but C99 does not appear to be supported."
#endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_SCOMPLEX
#define _DEFINED_SCOMPLEX
typedef struct scomplex
{
    float real;
    float imag;
} scomplex;
#endif

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DCOMPLEX
#define _DEFINED_DCOMPLEX
typedef struct dcomplex
{
    double real;
    double imag;
} dcomplex;
#endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
#if BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t f77_int;
#else
typedef long int f77_int;
#endif

typedef char     f77_char;
typedef float    f77_float;
typedef double   f77_double;
typedef scomplex f77_scomplex;
typedef dcomplex f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void* p );

//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT value is the position of the least-significant bit of the
// corresponding field within the 32-bit info (or info2) bit field. Fields
// that share a shift value (e.g. DATATYPE/DOMAIN) overlap by design: the
// narrower field is the low bit(s) of the wider one.

// info
#define BLIS_DATATYPE_SHIFT           0
#define   BLIS_DOMAIN_SHIFT           0
#define   BLIS_PRECISION_SHIFT        1
#define BLIS_CONJTRANS_SHIFT          3
#define   BLIS_TRANS_SHIFT            3
#define   BLIS_CONJ_SHIFT             4
#define BLIS_UPLO_SHIFT               5
#define   BLIS_UPPER_SHIFT            5
#define   BLIS_DIAG_SHIFT             6
#define   BLIS_LOWER_SHIFT            7
#define BLIS_UNIT_DIAG_SHIFT          8
#define BLIS_INVERT_DIAG_SHIFT        9
#define BLIS_TARGET_DT_SHIFT          10
#define   BLIS_TARGET_DOMAIN_SHIFT    10
#define   BLIS_TARGET_PREC_SHIFT      11
#define BLIS_EXEC_DT_SHIFT            13
#define   BLIS_EXEC_DOMAIN_SHIFT      13
#define   BLIS_EXEC_PREC_SHIFT        14
#define BLIS_PACK_SCHEMA_SHIFT        16
#define   BLIS_PACK_RC_SHIFT          16
#define   BLIS_PACK_PANEL_SHIFT       17
#define   BLIS_PACK_FORMAT_SHIFT      18
#define   BLIS_PACK_SHIFT             22
#define BLIS_PACK_REV_IF_UPPER_SHIFT  23
#define BLIS_PACK_REV_IF_LOWER_SHIFT  24
#define BLIS_PACK_BUFFER_SHIFT        25
#define BLIS_STRUC_SHIFT              27
#define BLIS_COMP_DT_SHIFT            29
#define   BLIS_COMP_DOMAIN_SHIFT      29
#define   BLIS_COMP_PREC_SHIFT        30

// info2
#define BLIS_SCALAR_DT_SHIFT          0
#define   BLIS_SCALAR_DOMAIN_SHIFT    0
#define   BLIS_SCALAR_PREC_SHIFT      1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// info
#define BLIS_DATATYPE_BITS            ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT             ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT          ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS           ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define 
BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 
0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = 
BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } 
machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. #define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, 
BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, 
BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // - keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
} bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. // Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. 
// begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. 
mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. //num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. 
typedef struct constdata_s { float s; double d; scomplex c; dcomplex z; gint_t i; } constdata_t; // // -- BLIS object type definitions --------------------------------------------- // // Forward declarations for function pointer types struct obj_s; struct cntx_s; struct rntm_s; struct thrinfo_s; typedef void (*obj_pack_fn_t) ( struct obj_s* a, struct obj_s* ap, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) ( struct obj_s* a, struct obj_s* b, struct obj_s* c, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef struct obj_s { // Basic fields struct obj_s* root; dim_t off[2]; dim_t dim[2]; doff_t diag_off; objbits_t info; objbits_t info2; siz_t elem_size; void* buffer; inc_t rs; inc_t cs; inc_t is; // Bufferless scalar storage atom_t scalar; // Pack-related fields dim_t m_padded; // m dimension of matrix, including any padding dim_t n_padded; // n dimension of matrix, including any padding inc_t ps; // panel stride (distance to next panel) inc_t pd; // panel dimension (the "width" of a panel: // usually MR or NR) dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel // User-customizable fields obj_pack_fn_t pack_fn; void* pack_params; obj_ker_fn_t ker_fn; void* ker_params; } obj_t; // Pre-initializors. 
Things that must be set afterwards: // - root object pointer // - info bitfields: dt, target_dt, exec_dt, comp_dt // - info2 bitfields: scalar_dt // - elem_size // - dims, strides // - buffer // - internal scalar buffer (must always set imaginary component) #define BLIS_OBJECT_INITIALIZER \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 0, 0 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( float ), \ \ .buffer = NULL, \ .rs = 0, \ .cs = 0, \ .is = 1, \ \ .scalar = { 0.0, 0.0 }, \ \ .m_padded = 0, \ .n_padded = 0, \ .ps = 0, \ .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ \ .pack_fn = NULL, \ .pack_params = NULL, \ .ker_fn = NULL, \ .ker_params = NULL \ } #define BLIS_OBJECT_INITIALIZER_1X1 \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( float ), \ \ .buffer = NULL, \ .rs = 0, \ .cs = 0, \ .is = 1, \ \ .scalar = { 0.0, 0.0 }, \ \ .m_padded = 0, \ .n_padded = 0, \ .ps = 0, \ .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ \ .pack_fn = NULL, \ .pack_params = NULL, \ .ker_fn = NULL, \ .ker_params = NULL \ } // Define these macros here since they must be updated if contents of // obj_t changes. 
// Initialize object b as a full shallow copy of object a: every field of
// obj_t is copied member by member, so b refers to the same buffer as a.
// This function must be updated whenever a field is added to obj_t.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    b->root        = a->root;

    b->off[0]      = a->off[0];
    b->off[1]      = a->off[1];
    b->dim[0]      = a->dim[0];
    b->dim[1]      = a->dim[1];
    b->diag_off    = a->diag_off;

    b->info        = a->info;
    b->info2       = a->info2;
    b->elem_size   = a->elem_size;

    b->buffer      = a->buffer;
    b->rs          = a->rs;
    b->cs          = a->cs;
    b->is          = a->is;

    b->scalar      = a->scalar;

    //b->pack_mem    = a->pack_mem;
    b->m_padded    = a->m_padded;
    b->n_padded    = a->n_padded;
    b->ps          = a->ps;
    b->pd          = a->pd;
    b->m_panel     = a->m_panel;
    b->n_panel     = a->n_panel;

    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;
}

// Initialize object b from object a in preparation for b becoming a
// subpartition of a: every field except dim[] is copied. The dimensions are
// left untouched because they will be overwritten afterwards.
// This function must be updated whenever a field is added to obj_t.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    b->root        = a->root;

    b->off[0]      = a->off[0];
    b->off[1]      = a->off[1];
    // Avoid copying m and n since they will be overwritten.
    //b->dim[0]      = a->dim[0];
    //b->dim[1]      = a->dim[1];
    b->diag_off    = a->diag_off;

    b->info        = a->info;
    b->info2       = a->info2;
    b->elem_size   = a->elem_size;

    b->buffer      = a->buffer;
    b->rs          = a->rs;
    b->cs          = a->cs;
    b->is          = a->is;

    b->scalar      = a->scalar;

    // Avoid copying pack_mem entry.
    // FGVZ: You should probably make sure this is right.
    //b->pack_mem    = a->pack_mem;
    b->m_padded    = a->m_padded;
    b->n_padded    = a->n_padded;
    b->ps          = a->ps;
    b->pd          = a->pd;
    b->m_panel     = a->m_panel;
    b->n_panel     = a->n_panel;

    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// Each pasting macro below comes as a pair: NAME_() performs the actual
// `##` concatenation, and NAME() forwards to it. Going through NAME()
// lets the preprocessor macro-expand the arguments first, which `##`
// applied directly would suppress.

// PASTEMAC family: build an identifier of the form
// bli_<ch1>...<chN><op>, with zero to six characters pasted between
// the "bli_" prefix and op.
#define PASTEMAC0_(op)                                          bli_ ## op
#define PASTEMAC0(op)                                           PASTEMAC0_(op)

#define PASTEMAC_(ch,op)                                        bli_ ## ch ## op
#define PASTEMAC(ch,op)                                         PASTEMAC_(ch,op)

#define PASTEMAC2_(ch1,ch2,op)                                  bli_ ## ch1 ## ch2 ## op
#define PASTEMAC2(ch1,ch2,op)                                   PASTEMAC2_(ch1,ch2,op)

#define PASTEMAC3_(ch1,ch2,ch3,op)                              bli_ ## ch1 ## ch2 ## ch3 ## op
#define PASTEMAC3(ch1,ch2,ch3,op)                               PASTEMAC3_(ch1,ch2,ch3,op)

#define PASTEMAC4_(ch1,ch2,ch3,ch4,op)                          bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
#define PASTEMAC4(ch1,ch2,ch3,ch4,op)                           PASTEMAC4_(ch1,ch2,ch3,ch4,op)

#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)                      bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op)                       PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)

#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)                  bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op)                   PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)

// PASTEBLACHK: build an identifier of the form bla_<op>_check.
#define PASTEBLACHK_(op)                                        bla_ ## op ## _check
#define PASTEBLACHK(op)                                         PASTEBLACHK_(op)

// PASTECH family: like PASTEMAC but without any fixed prefix; the
// result is simply <ch1>...<chN><op>. PASTECH0 yields op itself.
#define PASTECH0_(op)                                           op
#define PASTECH0(op)                                            PASTECH0_(op)

#define PASTECH_(ch,op)                                         ch ## op
#define PASTECH(ch,op)                                          PASTECH_(ch,op)

#define PASTECH2_(ch1,ch2,op)                                   ch1 ## ch2 ## op
#define PASTECH2(ch1,ch2,op)                                    PASTECH2_(ch1,ch2,op)

#define PASTECH3_(ch1,ch2,ch3,op)                               ch1 ## ch2 ## ch3 ## op
#define PASTECH3(ch1,ch2,ch3,op)                                PASTECH3_(ch1,ch2,ch3,op)

// MKSTR turns its argument into a string literal exactly as written.
// STRINGIFY_INT goes through MKSTR so that an argument which is itself
// a macro is expanded before being turned into a string.
#define MKSTR(s1)                                               #s1
#define STRINGIFY_INT( s )                                      MKSTR( s )

// Fortran-77 name-mangling macros.
#define PASTEF770(name) name ## _ #define PASTEF77(ch1,name) ch1 ## name ## _ #define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ #define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ // -- Include other groups of macros // begin bli_genarray_macro_defs.h #ifndef BLIS_GENARRAY_MACRO_DEFS_H #define BLIS_GENARRAY_MACRO_DEFS_H // -- Macros to generate function arrays --------------------------------------- // -- "Smart" one-operand macro -- #define GENARRAY_FPA(tname,opname) \ \ static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \ { \ ( tname )PASTEMAC(s,opname), \ ( tname )PASTEMAC(c,opname), \ ( tname )PASTEMAC(d,opname), \ ( tname )PASTEMAC(z,opname) \ } // -- "Smart" one-operand macro (with integer support) -- #define GENARRAY_FPA_I(tname,opname) \ \ static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \ { \ ( tname )PASTEMAC(s,opname), \ ( tname )PASTEMAC(c,opname), \ ( tname )PASTEMAC(d,opname), \ ( tname )PASTEMAC(z,opname), \ ( tname )PASTEMAC(i,opname) \ } // -- "Smart" two-operand macro -- #define GENARRAY_FPA2(tname,op) \ \ static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \ { ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \ { ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \ { ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) } \ } // -- "Smart" two-operand macro -- // -- One-operand macro -- #define GENARRAY(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES] = \ { \ PASTEMAC(s,op), \ PASTEMAC(c,op), \ PASTEMAC(d,op), \ PASTEMAC(z,op) \ } #define GENARRAY_I(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES+1] = \ { \ PASTEMAC(s,op), \ PASTEMAC(c,op), \ PASTEMAC(d,op), \ PASTEMAC(z,op), \ PASTEMAC(i,op) \ } // 
-- One-operand macro (with custom prefix) -- #define GENARRAY_PREF(arrayname,prefix,op) \ \ arrayname[BLIS_NUM_FP_TYPES] = \ { \ PASTECH2(prefix,s,op), \ PASTECH2(prefix,c,op), \ PASTECH2(prefix,d,op), \ PASTECH2(prefix,z,op) \ } // -- Two-operand macros -- #define GENARRAY2_ALL(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \ { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \ { PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \ { PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) } \ } #define GENARRAY2_EXT(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL, NULL, }, \ { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL, NULL, }, \ { NULL, NULL, PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \ { NULL, NULL, PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) } \ } #define GENARRAY2_MIN(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { PASTEMAC2(s,s,op), NULL, NULL, NULL, }, \ { NULL, PASTEMAC2(c,c,op), NULL, NULL, }, \ { NULL, NULL, PASTEMAC2(d,d,op), NULL, }, \ { NULL, NULL, NULL, PASTEMAC2(z,z,op) } \ } // -- Three-operand macros -- #define GENARRAY3_ALL(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { \ { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \ { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \ { PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \ { PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) } \ }, \ { \ { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \ { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \ { PASTEMAC3(c,d,s,op), 
PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \ { PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) } \ }, \ { \ { PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \ { PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \ { PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \ { PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) } \ }, \ { \ { PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \ { PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \ { PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \ { PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) } \ } \ } #define GENARRAY3_EXT(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { \ { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL, NULL, }, \ { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL, NULL, }, \ { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \ { NULL, NULL, PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \ { NULL, NULL, PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) } \ } \ } #define GENARRAY3_MIN(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { \ { PASTEMAC3(s,s,s,op), NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, 
}, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, PASTEMAC3(c,c,c,op), NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, PASTEMAC3(d,d,d,op), NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, PASTEMAC3(z,z,z,op) } \ } \ } #endif // end bli_genarray_macro_defs.h // begin bli_gentdef_macro_defs.h #ifndef BLIS_GENTDEF_MACRO_DEFS_H #define BLIS_GENTDEF_MACRO_DEFS_H // // -- MACROS TO INSERT TYPEDEF-GENERATING MACROS ------------------------------- // // -- function typedef macro (both typed and void) -- #define INSERT_GENTDEF( opname ) \ \ GENTDEF( float, s, opname, _ft ) \ GENTDEF( double, d, opname, _ft ) \ GENTDEF( scomplex, c, opname, _ft ) \ GENTDEF( dcomplex, z, opname, _ft ) \ \ GENTDEF( void, s, opname, _vft ) \ GENTDEF( void, d, opname, _vft ) \ GENTDEF( void, c, opname, _vft ) \ GENTDEF( void, z, opname, _vft ) \ \ GENTDEF( void, , opname, _vft ) // -- function typedef macro (both typed and void) with real projection -- #define INSERT_GENTDEFR( opname ) \ \ GENTDEFR( float, float, s, s, opname, _ft ) \ GENTDEFR( double, double, d, d, opname, _ft ) \ GENTDEFR( scomplex, float, c, s, opname, _ft ) \ GENTDEFR( dcomplex, double, z, d, opname, _ft ) \ \ GENTDEFR( void, void, s, s, opname, _vft ) \ GENTDEFR( void, void, d, d, opname, _vft ) \ GENTDEFR( void, void, c, s, opname, _vft ) \ GENTDEFR( void, void, z, d, opname, _vft ) \ \ GENTDEFR( void, void, , , opname, _vft ) #endif // end bli_gentdef_macro_defs.h // begin bli_gentfunc_macro_defs.h #ifndef BLIS_GENTFUNC_MACRO_DEFS_H #define BLIS_GENTFUNC_MACRO_DEFS_H // // -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------ // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- 
// The INSERT_* macros in this section each expand to a fixed sequence of
// GENTFUNC-style invocations, one per datatype listed, forwarding their
// own arguments unchanged. The GENTFUNC* macros themselves are not
// defined in this section; presumably they are supplied at the point of
// use -- confirm against the including source.

// Invokes GENTFUNC for float (s), double (d), scomplex (c) and
// dcomplex (z), passing blasname and blisname through.
#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \
\
GENTFUNC( float,    s, blasname, blisname ) \
GENTFUNC( double,   d, blasname, blisname ) \
GENTFUNC( scomplex, c, blasname, blisname ) \
GENTFUNC( dcomplex, z, blasname, blisname )


// -- Basic one-operand macro with real domain only --

// Invokes GENTFUNCRO for the two real types only.
#define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \
\
GENTFUNCRO( float,    s, blasname, blisname ) \
GENTFUNCRO( double,   d, blasname, blisname )


// -- Basic one-operand macro with complex domain only and real projection --

// Invokes GENTFUNCCO for the two complex types only, each paired with
// its real projection type and character.
#define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \
\
GENTFUNCCO( scomplex, float,  c, s, blasname, blisname ) \
GENTFUNCCO( dcomplex, double, z, d, blasname, blisname )


// -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) --

// Invokes GENTFUNCDOT for the real types with an empty third argument
// and BLIS_NO_CONJUGATE.
#define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
\
GENTFUNCDOT( float,    s,  , BLIS_NO_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( double,   d,  , BLIS_NO_CONJUGATE, blasname, blisname )


// -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) --

// Invokes GENTFUNCDOT twice per complex type: once with third argument
// c and BLIS_CONJUGATE, once with third argument u and
// BLIS_NO_CONJUGATE.
#define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \
\
GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE,    blasname, blisname ) \
GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE,    blasname, blisname ) \
GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname )


// -- Basic one-operand macro with conjugation (used only for dot, ger) --

// The real-only list followed by the complex-only list above.
#define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \
\
INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
INSERT_GENTFUNCDOTC_BLAS( blasname, blisname )


// -- Basic one-operand macro with real projection --

// Invokes GENTFUNCR for all four types with their real projections.
// The real types receive rblasname and the complex types receive
// cblasname.
#define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \
\
GENTFUNCR( float,    float,  s, s, rblasname, blisname ) \
GENTFUNCR( double,   double, d, d, rblasname, blisname ) \
GENTFUNCR( scomplex, float,  c, s, cblasname, blisname ) \
GENTFUNCR( dcomplex, double, z, d, cblasname, blisname )


// -- Alternate two-operand macro (one char for complex, one for real proj) --

// Invokes GENTFUNCR2 for all four types; the second character argument
// is empty for the real types and s/d for the complex types.
#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
\
GENTFUNCR2( float,    float,  s,  , blasname, blisname ) \
GENTFUNCR2( double,   double, d,  , blasname, blisname ) \
GENTFUNCR2( scomplex, float,  c, s, blasname, blisname ) \
GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )


// -- Extended two-operand macro (used only for scal) --

// Invokes GENTFUNCSCAL six times: the four same-type combinations with
// an empty second character, then each complex type paired with its
// real type.
#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
\
GENTFUNCSCAL( float,    float,    s,  , blasname, blisname ) \
GENTFUNCSCAL( double,   double,   d,  , blasname, blisname ) \
GENTFUNCSCAL( scomplex, scomplex, c,  , blasname, blisname ) \
GENTFUNCSCAL( dcomplex, dcomplex, z,  , blasname, blisname ) \
GENTFUNCSCAL( scomplex, float,    c, s, blasname, blisname ) \
GENTFUNCSCAL( dcomplex, double,   z, d, blasname, blisname )




// -- Macros for functions with one operand ------------------------------------


// -- Basic one-operand macro --

// The INSERT_GENTFUNC_BASIC* macros all invoke GENTFUNC for s, d, c and
// z; they differ only in how many extra arguments are forwarded after
// tfuncname.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
\
GENTFUNC( float,    s, tfuncname ) \
GENTFUNC( double,   d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
\
GENTFUNC( float,    s, tfuncname, varname ) \
GENTFUNC( double,   d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC( float,    s, tfuncname, varname1, varname2 ) \
GENTFUNC( double,   d, tfuncname, varname1, varname2 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNC( float,    s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( double,   d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNC( float,    s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( double,   d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )



// -- Basic one-operand with real projection --

// The INSERT_GENTFUNCR_BASIC* macros invoke GENTFUNCR for s, d, c and z,
// each paired with its real projection type and character (float/s for
// s and c, double/d for d and z), again differing only in the number
// of extra arguments forwarded.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
\
GENTFUNCR( float,    float,  s, s, tfuncname ) \
GENTFUNCR( double,   double, d, d, tfuncname ) \
GENTFUNCR( scomplex, float,  c, s, tfuncname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
\
GENTFUNCR( float,    float,  s, s, tfuncname, varname ) \
GENTFUNCR( double,   double, d, d, tfuncname, varname ) \
GENTFUNCR( scomplex, float,  c, s, tfuncname, varname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2 ) \
GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3  ) \
\
GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
arguments) -- #define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with real domain only -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \ \ GENTFUNCRO( float, s, tfuncname ) \ GENTFUNCRO( double, d, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \ \ GENTFUNCRO( float, s, tfuncname, varname ) \ GENTFUNCRO( double, d, tfuncname, varname ) \ // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \ \ GENTFUNC( float, s, tfuncname ) \ 
GENTFUNC( double, d, tfuncname ) \ GENTFUNC( scomplex, c, tfuncname ) \ GENTFUNC( dcomplex, z, tfuncname ) \ GENTFUNC( gint_t, i, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \ \ GENTFUNC( float, s, tfuncname, varname ) \ GENTFUNC( double, d, tfuncname, varname ) \ GENTFUNC( scomplex, c, tfuncname, varname ) \ GENTFUNC( dcomplex, z, tfuncname, varname ) \ GENTFUNC( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCI_BASIC0( tfuncname ) \ \ GENTFUNCI( float, gint_t, s, i, tfuncname ) \ GENTFUNCI( double, gint_t, d, i, tfuncname ) \ GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \ GENTFUNCI( dcomplex, gint_t, z, i, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \ \ GENTFUNCI( float, gint_t, s, i, tfuncname, varname ) \ GENTFUNCI( double, gint_t, d, i, tfuncname, varname ) \ GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \ \ GENTFUNCRI( float, float, gint_t, s, s, i, tfuncname ) \ GENTFUNCRI( double, double, gint_t, d, d, i, tfuncname ) \ GENTFUNCRI( scomplex, float, gint_t, c, s, i, tfuncname ) \ GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_BASIC0( tfuncname ) \ \ GENTFUNC2( float, float, s, s, tfuncname ) \ GENTFUNC2( double, double, d, d, tfuncname ) \ GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \ GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \ \ GENTFUNC2( float, float, s, s, tfuncname, 
varname ) \ GENTFUNC2( double, double, d, d, tfuncname, varname ) \ GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \ \ GENTFUNC2( float, scomplex, s, c, tfuncname ) \ GENTFUNC2( scomplex, float, c, s, tfuncname ) \ \ GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \ \ GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ \ GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) // -- Mixed precision two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \ \ GENTFUNC2( float, double, s, d, tfuncname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ \ GENTFUNC2( double, float, d, s, tfuncname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname ) \ \ GENTFUNC2( scomplex, double, c, d, tfuncname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \ \ GENTFUNC2( float, double, s, d, tfuncname, varname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTFUNC2( double, float, d, s, tfuncname, varname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ \ GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) \ // -- Mixed domain/precision (all) two-operand macro -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTFUNC2_MIXDP0( tfuncname ) \ \ GENTFUNC2( float, double, s, d, tfuncname ) \ GENTFUNC2( float, scomplex, s, c, tfuncname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ \ GENTFUNC2( double, float, d, s, tfuncname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname ) \ GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ \ GENTFUNC2( scomplex, float, c, s, tfuncname ) \ GENTFUNC2( scomplex, double, c, d, tfuncname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \ \ GENTFUNC2( float, double, s, d, tfuncname, varname ) \ GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTFUNC2( double, float, d, s, tfuncname, varname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \ \ GENTFUNC2R( float, float, float, s, s, s, tfuncname ) \ GENTFUNC2R( double, double, double, d, d, d, tfuncname ) \ GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname ) \ GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \ \ GENTFUNC2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTFUNC2R( double, double, double, d, d, 
d, tfuncname, varname ) \ GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \ \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \ \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) // -- Mixed precision two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ \ GENTFUNC2R( scomplex, double, double, c, d, d, 
tfuncname, varname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) // -- Mixed domain/precision (all) two-operand macro with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) \ 
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ // -- Macros for functions with three primary operands ------------------------- // -- Basic three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC0( tfuncname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 ) // -- Mixed domain three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname ) \ \ GENTFUNC3( dcomplex, double, 
double, z, d, d, tfuncname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( 
dcomplex, double, dcomplex, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC3( scomplex, double, scomplex, c, 
d, c, tfuncname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname ) \ GENTFUNC3( float, 
dcomplex, scomplex, s, z, c, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, 
tfuncname, varname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, dcomplex, d, s, 
z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname1, varname2 ) \ \ 
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 ) // -- Basic three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, 
varname1, varname2 ) // -- Mixed domain three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, 
d, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \ GENTFUNC3U12( float, 
double, dcomplex, double, s, d, z, d, tfuncname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, 
dcomplex, scomplex, c, c, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, 
dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, 
tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, 
double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( 
scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, 
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 ) #endif // end bli_gentfunc_macro_defs.h // begin bli_gentprot_macro_defs.h #ifndef BLIS_GENTPROT_MACRO_DEFS_H #define BLIS_GENTPROT_MACRO_DEFS_H // // -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS ----------------------------- // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- #define INSERT_GENTPROT_BLAS( blasname ) \ \ GENTPROT( float, s, blasname ) \ GENTPROT( double, d, blasname ) \ GENTPROT( scomplex, c, blasname ) \ GENTPROT( dcomplex, z, blasname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTPROTRO_BLAS( blasname ) \ \ GENTPROTRO( float, s, blasname ) \ GENTPROTRO( double, d, blasname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTPROTCO_BLAS( blasname ) \ \ GENTPROTCO( scomplex, float, c, s, blasname ) \ GENTPROTCO( dcomplex, double, z, d, blasname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTR_BLAS( blasname ) \ \ GENTPROTDOT( float, s, , blasname ) \ GENTPROTDOT( double, d, , blasname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTC_BLAS( blasname ) \ \ GENTPROTDOT( scomplex, c, c, blasname ) \ GENTPROTDOT( scomplex, c, u, blasname ) \ GENTPROTDOT( dcomplex, z, c, blasname ) \ GENTPROTDOT( dcomplex, z, u, blasname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTPROTDOT_BLAS( blasname ) \ \ INSERT_GENTPROTDOTR_BLAS( blasname ) \ INSERT_GENTPROTDOTC_BLAS( blasname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTPROTR_BLAS( rblasname, 
cblasname ) \ \ GENTPROTR( float, float, s, s, rblasname ) \ GENTPROTR( double, double, d, d, rblasname ) \ GENTPROTR( scomplex, float, c, s, cblasname ) \ GENTPROTR( dcomplex, double, z, d, cblasname ) // -- Alternate two-operand macro (one char for complex, one for real proj) -- #define INSERT_GENTPROTR2_BLAS( blasname ) \ \ GENTPROTR2( float, float, , s, blasname ) \ GENTPROTR2( double, double, , d, blasname ) \ GENTPROTR2( scomplex, float, c, s, blasname ) \ GENTPROTR2( dcomplex, double, z, d, blasname ) // -- Extended two-operand macro (used only for scal) -- #define INSERT_GENTPROTSCAL_BLAS( blasname ) \ \ GENTPROTSCAL( float, float, , s, blasname ) \ GENTPROTSCAL( double, double, , d, blasname ) \ GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \ GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \ GENTPROTSCAL( float, scomplex, s, c, blasname ) \ GENTPROTSCAL( double, dcomplex, d, z, blasname ) // -- Macros for functions with one operand ------------------------------------ // -- Basic one-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0( tfuncname ) \ \ GENTPROT( float, s, tfuncname ) \ GENTPROT( double, d, tfuncname ) \ GENTPROT( scomplex, c, tfuncname ) \ GENTPROT( dcomplex, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2 ) \ GENTPROT( double, d, tfuncname, varname1, varname2 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROT( float, s, tfuncname, 
varname1, varname2, varname3 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand with real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC0( tfuncname ) \ \ GENTPROTR( float, float, s, s, tfuncname ) \ GENTPROTR( double, double, d, d, tfuncname ) \ GENTPROTR( scomplex, float, c, s, tfuncname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname ) \ GENTPROTR( double, double, d, d, tfuncname, varname ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 
) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC0( tfuncname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0_I( funcname ) \ \ GENTPROT( float, s, funcname ) \ GENTPROT( double, d, funcname ) \ GENTPROT( scomplex, c, funcname ) \ GENTPROT( dcomplex, z, funcname ) \ GENTPROT( gint_t, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) \ GENTPROT( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTPROTI_BASIC0( funcname ) \ \ GENTPROTI( float, gint_t, s, i, funcname ) \ GENTPROTI( double, gint_t, d, i, funcname ) \ GENTPROTI( scomplex, gint_t, c, i, funcname ) \ GENTPROTI( dcomplex, gint_t, z, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \ \ GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \ GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \ GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTRI_BASIC( funcname ) \ \ GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \ GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \ GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \ GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_BASIC0( funcname ) \ \ GENTPROT2( float, float, s, s, funcname ) \ GENTPROT2( double, double, d, d, funcname ) \ GENTPROT2( scomplex, scomplex, c, c, funcname ) \ GENTPROT2( dcomplex, dcomplex, z, z, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \ \ GENTPROT2( float, float, s, s, tfuncname, varname ) \ GENTPROT2( double, double, d, d, tfuncname, varname ) \ GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_D0( funcname ) \ \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( scomplex, float, c, s, funcname ) \ \ GENTPROT2( double, dcomplex, d, z, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_D( 
tfuncname, varname ) \ \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) // -- Mixed precision two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_P0( funcname ) \ \ GENTPROT2( float, double, s, d, funcname ) \ GENTPROT2( float, dcomplex, s, z, funcname ) \ \ GENTPROT2( double, float, d, s, funcname ) \ GENTPROT2( double, scomplex, d, c, funcname ) \ \ GENTPROT2( scomplex, double, c, d, funcname ) \ GENTPROT2( scomplex, dcomplex, c, z, funcname ) \ \ GENTPROT2( dcomplex, float, z, s, funcname ) \ GENTPROT2( dcomplex, scomplex, z, c, funcname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) \ // -- Mixed domain/precision (all) two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIXDP0( funcname ) \ \ GENTPROT2( float, double, s, d, funcname ) \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( float, dcomplex, s, z, funcname ) \ \ GENTPROT2( double, float, d, s, funcname ) \ GENTPROT2( double, scomplex, d, c, funcname ) \ GENTPROT2( double, dcomplex, d, z, funcname ) \ \ GENTPROT2( scomplex, float, c, s, funcname ) \ GENTPROT2( scomplex, double, c, d, funcname ) \ GENTPROT2( scomplex, dcomplex, c, z, funcname ) \ \ GENTPROT2( dcomplex, float, z, s, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) \ GENTPROT2( 
dcomplex, scomplex, z, c, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_BASIC0( funcname ) \ \ GENTPROT2R( float, float, float, s, s, s, funcname ) \ GENTPROT2R( double, double, double, d, d, d, funcname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \ \ GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) 
-- #define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname ) // -- Mixed precision two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \ \ GENTPROT2R( float, double, float, s, d, s, tfuncname ) \ GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \ \ GENTPROT2R( double, float, double, d, s, d, tfuncname ) \ GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \ \ GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \ GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \ \ GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \ GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \ \ GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \ GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \ \ GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \ GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \ \ GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \ GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ \ GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \ GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) // -- Macros for functions with three primary operands ------------------------- // -- Basic three-operand macro -- #define INSERT_GENTPROT3_BASIC( funcname ) \ \ GENTPROT3( float, float, float, s, s, s, funcname ) \ GENTPROT3( double, double, double, d, d, d, funcname ) \ GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \ GENTPROT3( dcomplex, dcomplex, 
dcomplex, z, z, z, funcname ) // -- Mixed domain three-operand macro -- #define INSERT_GENTPROT3_MIX_D( funcname ) \ \ GENTPROT3( float, float, scomplex, s, s, c, funcname ) \ GENTPROT3( float, scomplex, float, s, c, s, funcname ) \ GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \ \ GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \ GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \ GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \ \ GENTPROT3( scomplex, float, float, c, s, s, funcname ) \ GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \ GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \ \ GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \ GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \ GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname ) // -- Mixed precision three-operand macro -- #define INSERT_GENTPROT3_MIX_P( funcname ) \ \ GENTPROT3( float, float, double, s, s, d, funcname ) \ GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \ \ GENTPROT3( float, double, float, s, d, s, funcname ) \ GENTPROT3( float, double, double, s, d, d, funcname ) \ GENTPROT3( float, double, scomplex, s, d, c, funcname ) \ GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \ \ GENTPROT3( float, scomplex, double, s, c, d, funcname ) \ GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \ \ GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \ GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \ GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \ GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \ \ \ GENTPROT3( double, float, float, d, s, s, funcname ) \ GENTPROT3( double, float, double, d, s, d, funcname ) \ GENTPROT3( double, float, scomplex, d, s, c, funcname ) \ GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \ \ GENTPROT3( double, double, float, d, d, s, funcname ) \ GENTPROT3( double, double, scomplex, d, d, c, funcname ) \ \ GENTPROT3( double, 
scomplex, float, d, c, s, funcname ) \ GENTPROT3( double, scomplex, double, d, c, d, funcname ) \ GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \ GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \ \ GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \ GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \ \ \ GENTPROT3( scomplex, float, double, c, s, d, funcname ) \ GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \ \ GENTPROT3( scomplex, double, float, c, d, s, funcname ) \ GENTPROT3( scomplex, double, double, c, d, d, funcname ) \ GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \ GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \ \ GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \ GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \ \ GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \ GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \ GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \ GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \ \ \ GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \ GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \ GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \ GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \ \ GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \ GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \ \ GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \ GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \ GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \ GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \ \ GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \ GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname ) \ // -- Basic three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_BASIC( funcname ) \ \ GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \ 
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \ GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname ) // -- Mixed domain three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_D( funcname ) \ \ GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \ GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \ GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \ \ GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \ GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \ GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \ \ GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \ GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \ \ GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \ GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname ) // -- Mixed precision three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_P( funcname ) \ \ GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \ GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \ \ GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \ GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \ GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \ GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \ \ GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \ GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \ \ GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, 
z, funcname ) \ GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \ GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \ GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \ \ \ GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \ GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \ GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \ GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \ \ GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \ GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \ \ GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \ GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \ GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \ GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \ \ GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \ GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \ \ \ GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \ GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \ \ GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \ GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \ GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \ GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \ \ GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \ \ GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \ GENTPROT3U12( 
scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \ \ \ GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \ GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \ GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \ GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \ GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \ \ GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname ) #endif // end bli_gentprot_macro_defs.h // begin bli_misc_macro_defs.h #ifndef BLIS_MISC_MACRO_DEFS_H #define BLIS_MISC_MACRO_DEFS_H // -- Miscellaneous macros -- // min, max, abs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) #define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) #define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) ) // fmin, fmax, fabs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_fmin( a, b ) bli_min( a, b ) #define bli_fmax( a, b ) bli_max( a, b ) #define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) ) // fminabs, fmaxabs // NOTE: These must remain macros since we don't know the types of a and b. 
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif // end bli_edge_case_macro_defs.h // begin bli_param_macro_defs.h #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ); } // pruning-related BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the left side of the matrix, // ignore the area above that intersection. if ( *diagoff < 0 ) { *m = *m + *diagoff; *offm_inc = - *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the bottom side of the matrix, // ignore the area to the right of that intersection. if ( *n > *diagoff + *m ) { *n = *diagoff + *m; } } BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the top side of the matrix, // ignore the area to the left of that intersection. if ( *diagoff > 0 ) { *n = *n - *diagoff; *offn_inc = + *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the right side of the matrix, // ignore the area below that intersection. 
if ( *m > -(*diagoff) + *n ) { *m = -(*diagoff) + *n; } } // thread range-related BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { *diagoff = *n - *diagoff - *m; bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { bli_swap_dims( m, n ); bli_negate_diag_offset( diagoff ); bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end ) { dim_t start2 = n - *start; dim_t end2 = n - *end; *start = end2; *end = start2; } // mdim_t-related BLIS_INLINE bool bli_is_m_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_M ); } BLIS_INLINE bool bli_is_n_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_N ); } BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim ) { return ( mdim_t ) ( mdim == BLIS_M ? BLIS_N : BLIS_M ); } BLIS_INLINE void bli_toggle_dim( mdim_t* mdim ) { *mdim = bli_dim_toggled( *mdim ); } // stor3_t-related BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b ) { // If any matrix is general-stored, return the stor3_t id for the // general-purpose sup microkernel. if ( bli_is_gen_stored( rs_c, cs_c ) || bli_is_gen_stored( rs_a, cs_a ) || bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX; // Otherwise, compute and return the stor3_t id as follows. 
const bool c_is_col = bli_is_col_stored( rs_c, cs_c ); const bool a_is_col = bli_is_col_stored( rs_a, cs_a ); const bool b_is_col = bli_is_col_stored( rs_b, cs_b ); return ( stor3_t )( 4 * c_is_col + 2 * a_is_col + 1 * b_is_col ); } BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id ) { #if 1 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )7, // BLIS_RRR = 0 -> BLIS_CCC = 7 ( stor3_t )5, // BLIS_RRC = 1 -> BLIS_CRC = 5 ( stor3_t )6, // BLIS_RCR = 2 -> BLIS_CCR = 6 ( stor3_t )4, // BLIS_RCC = 3 -> BLIS_CRR = 4 ( stor3_t )3, // BLIS_CRR = 4 -> BLIS_RCC = 3 ( stor3_t )1, // BLIS_CRC = 5 -> BLIS_RRC = 1 ( stor3_t )2, // BLIS_CCR = 6 -> BLIS_RCR = 2 ( stor3_t )0, // BLIS_CCC = 7 -> BLIS_RRR = 0 }; return map[id]; #else return ( ( id & 0x4 ) ^ 0x4 ) | // flip c bit ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 ); // flip a bit and move to b position #endif } BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )1, // BLIS_RRR = 0 -> BLIS_RRC = 1 ( stor3_t )0, // BLIS_RRC = 1 -> BLIS_RRR = 0 ( stor3_t )3, // BLIS_RCR = 2 -> BLIS_RCC = 3 ( stor3_t )2, // BLIS_RCC = 3 -> BLIS_RCR = 2 ( stor3_t )5, // BLIS_CRR = 4 -> BLIS_CRC = 5 ( stor3_t )4, // BLIS_CRC = 5 -> BLIS_CRR = 4 ( stor3_t )7, // BLIS_CCR = 6 -> BLIS_CCC = 7 ( stor3_t )6, // BLIS_CCC = 7 -> BLIS_CCR = 6 }; return map[id]; #else return ( stor3_t )( id ^ 0x1 ); #endif } BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )2, // BLIS_RRR = 0 -> BLIS_RCR = 2 ( stor3_t )3, // BLIS_RRC = 1 -> BLIS_RCC = 3 ( stor3_t )0, // BLIS_RCR = 2 -> BLIS_RRR = 0 ( stor3_t )1, // BLIS_RCC = 3 -> BLIS_RRC = 1 ( stor3_t )6, // BLIS_CRR = 4 -> BLIS_CCR = 6 ( stor3_t )7, // BLIS_CRC = 5 -> BLIS_CCC = 7 ( stor3_t )4, // BLIS_CCR = 6 -> BLIS_CRR = 4 ( stor3_t )5, // BLIS_CCC = 7 -> BLIS_CRC = 5 }; return map[id]; #else return ( stor3_t )( id ^ 0x2 ); #endif } // 
index-related BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == n_iter - 1 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != n_iter - 1 || n_left == 0 ); } BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == 0 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != 0 || n_left == 0 ); } BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 ); } BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) ); } BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { #ifdef BLIS_ENABLE_JRIR_SLAB return bli_is_last_iter_sl( i, end_iter, tid, nth ); #else // BLIS_ENABLE_JRIR_RR return bli_is_last_iter_rr( i, end_iter, tid, nth ); #endif } // packbuf_t-related BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type ) { return ( guint_t ) ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT ); } // pack_t-related BLIS_INLINE bool bli_is_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_is_row_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_is_col_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_is_panel_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE bool bli_is_1r_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R ); } BLIS_INLINE bool bli_is_1e_packed( pack_t schema ) { return ( bool ) ( ( schema & 
BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E ); } BLIS_INLINE bool bli_is_1m_packed( pack_t schema ) { return ( bool ) ( bli_is_1r_packed( schema ) || bli_is_1e_packed( schema ) ); } BLIS_INLINE bool bli_is_nat_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 ); } BLIS_INLINE bool bli_is_ind_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 ); } BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema ) { return ( guint_t ) ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT ); } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument. BLIS_INLINE void bli_set_dims_incs_uplo_1m ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument (without column-wise stride optimization). BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. 
if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions and increments for TWO matrix arguments. 
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); } BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); } BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); } // NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, // packbuf_t is stored/used from the context in order to support various // induced methods. (Though ideally the packbuf_t field would only be // present in the control tree). BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); } BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc ); } BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj ) { bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); } BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj ) { bli_obj_apply_conj( BLIS_CONJUGATE, obj ); } BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; } // Root matrix query BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) { return ( obj_t* )( obj->root ); } BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( 
bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); } // Root matrix modification BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) { obj->root = obj; } // Diagonal offset query BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) ); } // Diagonal offset modification BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off = ( doff_t )offset; } BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj ) { obj->diag_off = -(obj->diag_off); } BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off += ( doff_t )offset; } // Dimension query BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) { return ( obj->dim[ mdim ] ); } BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) : bli_obj_length( obj ) ); } BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) ); } BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 ); } // Stride/increment query BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) { return ( obj->rs ); } BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) { return ( obj->cs ); } BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) { return ( obj->is ); } BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); } // Note: The purpose of these functions is to obtain the length and width // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) ); } BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 ); } // Dimension modification BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj ) { obj->dim[ BLIS_M ] = m; } BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj ) { obj->dim[ BLIS_N ] = n; } BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) { obj->dim[ mdim ] = dim_val; } BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) { if ( bli_does_notrans( trans ) ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } else // if ( bli_does_trans( trans ) ) { bli_obj_set_length( n, obj ); bli_obj_set_width( m, obj ); } } // Stride/increment predicates // // NOTE: The following two macros differ from their non-obj counterparts // in that they do not identify m x 1 and 1 x n objects as row-stored and // column-stored, respectively, which is needed when considering packed // objects. But this is okay, since none of the invocations of these // "obj" macros are used on packed matrices. 
// BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); } // Stride/increment modification BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) { obj->rs = rs; } BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) { obj->cs = cs; } BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_row_stride( rs, obj ); bli_obj_set_col_stride( cs, obj ); } BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) { obj->is = is; } // Offset query BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) { return ( obj->off[ mdim ] ); } // Offset modification BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] = offset; } BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_set_off( BLIS_M, offm, obj ); bli_obj_set_off( BLIS_N, offn, obj ); } BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] += offset; } BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_inc_off( BLIS_M, offm, obj ); bli_obj_inc_off( BLIS_N, offn, obj ); } // Diagonal offset predicates BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) { return ( 
bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); } // Buffer address query BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) { return ( void* ) ( obj->buffer ); } // Buffer address modification BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) { obj->buffer = p; } // Bufferless scalar field query BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); } // Bufferless scalar field modification BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); } // Element size modification BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) { obj->elem_size = size; } // Packed matrix info query BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) { return ( obj->m_padded ); } BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) { return ( obj->n_padded ); } // Packed matrix info modification BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj ) { obj->m_padded = m; } BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj ) { obj->n_padded = n; } BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) { 
bli_obj_set_padded_length( m, obj ); bli_obj_set_padded_width( n, obj ); } // Packed panel info query BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) { return ( obj->m_panel ); } BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) { return ( obj->n_panel ); } BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) { return ( obj->pd ); } BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) { return ( obj->ps ); } // Packed panel info modification BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj ) { obj->m_panel = m; } BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj ) { obj->n_panel = n; } BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_panel_length( m, obj ); bli_obj_set_panel_width( n, obj ); } BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) { obj->pd = pd; } BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) { obj->ps = ps; } // stor3_t-related BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); inc_t rs_a, cs_a; inc_t rs_b, cs_b; if ( bli_obj_has_notrans( a ) ) { rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else { rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else { rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b ); } // -- User-provided information macros -- // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { return obj->pack_fn; } BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker_fn; } BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { return obj->ker_params; } // Function pointer modification 
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) { const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); bli_obj_set_pack_schema( schema_b, a ); bli_obj_set_pack_schema( schema_a, b ); } // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. BLIS_INLINE void bli_obj_induce_trans( obj_t* obj ) { // Induce transposition among basic fields. dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); if ( bli_obj_is_upper_or_lower( obj ) ) bli_obj_toggle_uplo( obj ); // Induce transposition among packed fields. dim_t m_padded = bli_obj_padded_length( obj ); dim_t n_padded = bli_obj_padded_width( obj ); dim_t m_panel = bli_obj_panel_length( obj ); dim_t n_panel = bli_obj_panel_width( obj ); bli_obj_set_padded_dims( n_padded, m_padded, obj ); bli_obj_set_panel_dims( n_panel, m_panel, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj ) { // NOTE: This function is only used in situations where the matrices // are guaranteed to not have structure or be packed. // Induce transposition among basic fields. 
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif // end bli_obj_macro_defs.h // begin bli_complex_macro_defs.h #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define 
bli_zimag( x ) ( cimag(x) ) #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_complex_macro_defs.h // begin bli_scalar_macro_defs.h #ifndef BLIS_SCALAR_MACRO_DEFS_H #define BLIS_SCALAR_MACRO_DEFS_H // -- Assignment/Accessor macros -- // NOTE: This macro is defined first since some of the other scalar macros // use it to abstract away the method used to assign complex values (ie: // whether fields of a struct are set directly or whether native C99 // assignment is used). // begin bli_sets.h #ifndef BLIS_SETS_H #define BLIS_SETS_H // sets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sssets( xr, xi, y ) { (y) = (xr); } #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } #define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } #define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, 
xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif // end bli_sets.h // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. // begin bli_setrs.h #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// bli_??setrs( xr, y ) overwrites only the real component of y with xr.

// Real destinations: y itself is the real component.
#define bli_sssetrs( xr, y ) { (y) = (xr); }
#define bli_dssetrs( xr, y ) { (y) = (xr); }

#define bli_sdsetrs( xr, y ) { (y) = (xr); }
#define bli_ddsetrs( xr, y ) { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex the real-part accessor is assigned to directly.
#define bli_scsetrs( xr, y ) { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y ) { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y ) { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y ) { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex y is rebuilt from xr and y's current imaginary part,
// which is read through bli_cimag / bli_zimag.
#define bli_scsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char short names, each forwarding to one two-char variant.
#define bli_ssetrs( xr, y ) bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y ) bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y ) bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y ) bli_dzsetrs( xr, y )

#endif
// end bli_setrs.h
// begin bli_setis.h

#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// bli_??setis( xi, y ) overwrites only the imaginary component of y with xi.

// Real destinations: the expansion is an empty statement block, so neither
// argument is referenced.
#define bli_sssetis( xi, y ) { ; }
#define bli_dssetis( xi, y ) { ; }

#define bli_sdsetis( xi, y ) { ; }
#define bli_ddsetis( xi, y ) { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex the imaginary-part accessor is assigned to directly.
#define bli_scsetis( xi, y ) { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y ) { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y ) { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y ) { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex y is rebuilt from y's current real part (read through
// bli_creal / bli_zreal) and xi.
#define bli_scsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char short names, each forwarding to one two-char variant.
#define bli_ssetis( xi, y ) bli_sssetis( xi, y )
#define bli_dsetis( xi, y ) bli_ddsetis( xi, y )
#define bli_csetis( xi, y ) bli_scsetis( xi, y )
#define bli_zsetis( xi, y ) bli_dzsetis( xi, y )

#endif
// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h

#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_ssgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; } #define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; } #define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; } #define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; } #define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; } #define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi ) #define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi ) #define bli_cgets( x, yr, yi ) 
bli_csgets( x, yr, yi ) #define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi ) #define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi ) #endif // end bli_gets.h // -- Scalar constant initialization macros -- // begin bli_constants.h #ifndef BLIS_CONSTANTS_H #define BLIS_CONSTANTS_H // return pointers to constants // 1 #define bli_s1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ONE ) ) #define bli_d1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ONE ) ) #define bli_c1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) ) #define bli_z1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) ) #define bli_i1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ONE ) ) // 0 #define bli_s0 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ZERO ) ) #define bli_d0 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ZERO ) ) #define bli_c0 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) ) #define bli_z0 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) ) #define bli_i0 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ZERO ) ) // -1 #define bli_sm1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_MINUS_ONE ) ) #define bli_dm1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_MINUS_ONE ) ) #define bli_cm1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_zm1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_im1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_MINUS_ONE ) ) #endif // end bli_constants.h // -- Separated scalar macros (separated real/imaginary values) -- // begin bli_absq2ris.h #ifndef BLIS_ABSQ2RIS_H #define BLIS_ABSQ2RIS_H // absq2ris #define bli_sabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define bli_dabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define 
bli_cabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0F; \ } #define bli_zabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0; \ } #endif // end bli_absq2ris.h // begin bli_abval2ris.h #ifndef BLIS_ABVAL2RIS_H #define BLIS_ABVAL2RIS_H // abval2ris #define bli_sabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabsf(xr); \ } #define bli_dabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabs(xr); \ } #define bli_cabval2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0F; \ } #define bli_zabval2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0; \ } #endif // end bli_abval2ris.h // begin bli_addris.h #ifndef BLIS_ADDRIS_H #define BLIS_ADDRIS_H // addris #define bli_saddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_daddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_caddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #define bli_zaddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #endif // end bli_addris.h // begin bli_addjris.h #ifndef BLIS_ADDJRIS_H #define BLIS_ADDJRIS_H // addjris #define bli_saddjris( ar, ai, xr, xi ) bli_saddris( (ar), -(ai), (xr), (xi) ) #define bli_daddjris( ar, ai, xr, xi ) bli_daddris( (ar), -(ai), (xr), (xi) ) #define bli_caddjris( ar, ai, xr, xi ) bli_caddris( (ar), -(ai), (xr), (xi) ) #define bli_zaddjris( ar, ai, xr, xi ) bli_zaddris( (ar), -(ai), (xr), (xi) ) #endif // end bli_addjris.h // begin bli_add3ris.h #ifndef BLIS_ADD3RIS_H #define BLIS_ADD3RIS_H // add3ris #define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_dadd3ris( ar, 
ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #endif // end bli_add3ris.h // begin bli_axpbyris.h #ifndef BLIS_AXPBYRIS_H #define BLIS_AXPBYRIS_H // axpbyris #define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyris bli_rxxpbyris #define bli_dsssxpbyris bli_rxxpbyris #define bli_csssxpbyris bli_rxxpbyris #define bli_zsssxpbyris bli_rxxpbyris #define bli_sdssxpbyris bli_rxxpbyris #define bli_ddssxpbyris bli_rxxpbyris #define bli_cdssxpbyris bli_rxxpbyris #define bli_zdssxpbyris bli_rxxpbyris #define bli_scssxpbyris bli_rxxpbyris #define bli_dcssxpbyris bli_rxxpbyris #define bli_ccssxpbyris bli_rxxpbyris #define bli_zcssxpbyris bli_rxxpbyris #define bli_szssxpbyris bli_rxxpbyris #define bli_dzssxpbyris bli_rxxpbyris #define bli_czssxpbyris bli_rxxpbyris #define bli_zzssxpbyris bli_rxxpbyris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyris. 
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
// Single-char short names; each expands to the four-char identifier for
// operands that all share one datatype.
#define bli_saxpbyjris bli_ssssaxpbyjris
#define bli_daxpbyjris bli_ddddaxpbyjris
#define bli_caxpbyjris bli_ccccaxpbyjris
#define bli_zaxpbyjris bli_zzzzaxpbyjris

#endif
// end bli_axpbyjris.h
// begin bli_axpyris.h

#ifndef BLIS_AXPYRIS_H
#define BLIS_AXPYRIS_H

// axpyris
//
// y := y + a * x on separated real/imaginary values. Five helper shapes:
// - rx: accumulates ar * xr into yr only.
// - cx: full complex product accumulated into yr and yi.
// - ro: real part of the complex product accumulated into yr only.
// - cr: ar scales both xr and xi (ai is not referenced).
// - rc: xr scales both ar and ai (xi is not referenced).

#define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}

#define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
    (yi) += (ai) * (xr) + (ar) * (xi); \
}

#define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
}

#define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * (xi); \
}

#define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyris bli_rxaxpyris
#define bli_dssaxpyris bli_rxaxpyris
#define bli_cssaxpyris bli_rxaxpyris
#define bli_zssaxpyris bli_rxaxpyris

#define bli_sdsaxpyris bli_rxaxpyris
#define bli_ddsaxpyris bli_rxaxpyris
#define bli_cdsaxpyris bli_rxaxpyris
#define bli_zdsaxpyris bli_rxaxpyris

#define bli_scsaxpyris bli_rxaxpyris
#define bli_dcsaxpyris bli_rxaxpyris
#define bli_ccsaxpyris bli_roaxpyris
#define bli_zcsaxpyris bli_roaxpyris

#define bli_szsaxpyris bli_rxaxpyris
#define bli_dzsaxpyris bli_rxaxpyris
#define bli_czsaxpyris bli_roaxpyris
#define bli_zzsaxpyris bli_roaxpyris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyris bli_rxaxpyris
#define bli_dsdaxpyris bli_rxaxpyris
#define bli_csdaxpyris bli_rxaxpyris
#define bli_zsdaxpyris bli_rxaxpyris

#define bli_sddaxpyris bli_rxaxpyris
#define bli_dddaxpyris bli_rxaxpyris
#define bli_cddaxpyris bli_rxaxpyris
#define bli_zddaxpyris bli_rxaxpyris

#define bli_scdaxpyris bli_rxaxpyris
#define bli_dcdaxpyris bli_rxaxpyris
#define bli_ccdaxpyris bli_roaxpyris
#define bli_zcdaxpyris bli_roaxpyris

#define bli_szdaxpyris bli_rxaxpyris
#define bli_dzdaxpyris bli_rxaxpyris
#define bli_czdaxpyris bli_roaxpyris
#define bli_zzdaxpyris bli_roaxpyris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyris bli_rxaxpyris
#define bli_dscaxpyris bli_rxaxpyris
#define bli_cscaxpyris bli_rcaxpyris
#define bli_zscaxpyris bli_rcaxpyris

#define bli_sdcaxpyris bli_rxaxpyris
#define bli_ddcaxpyris bli_rxaxpyris
#define bli_cdcaxpyris bli_rcaxpyris
#define bli_zdcaxpyris bli_rcaxpyris

#define bli_sccaxpyris bli_craxpyris
#define bli_dccaxpyris bli_craxpyris
#define bli_cccaxpyris bli_cxaxpyris
#define bli_zccaxpyris bli_cxaxpyris

#define bli_szcaxpyris bli_craxpyris
#define bli_dzcaxpyris bli_craxpyris
#define bli_czcaxpyris bli_cxaxpyris
#define bli_zzcaxpyris bli_cxaxpyris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyris bli_rxaxpyris
#define bli_dszaxpyris bli_rxaxpyris
#define bli_cszaxpyris bli_rcaxpyris
#define bli_zszaxpyris bli_rcaxpyris

#define bli_sdzaxpyris bli_rxaxpyris
#define bli_ddzaxpyris bli_rxaxpyris
#define bli_cdzaxpyris bli_rcaxpyris
#define bli_zdzaxpyris bli_rcaxpyris

#define bli_sczaxpyris bli_craxpyris
#define bli_dczaxpyris bli_craxpyris
#define bli_cczaxpyris bli_cxaxpyris
#define bli_zczaxpyris bli_cxaxpyris

#define bli_szzaxpyris bli_craxpyris
#define bli_dzzaxpyris bli_craxpyris
#define bli_czzaxpyris bli_cxaxpyris
#define bli_zzzaxpyris bli_cxaxpyris

// Single-char short names for operands that all share one datatype.
#define bli_saxpyris bli_sssaxpyris
#define bli_daxpyris bli_dddaxpyris
#define bli_caxpyris bli_cccaxpyris
#define bli_zaxpyris bli_zzzaxpyris

#endif
// end bli_axpyris.h
// begin bli_axpyjris.h

#ifndef BLIS_AXPYJRIS_H
#define BLIS_AXPYJRIS_H

// axpyjris
//
// y := y + a * conj(x) on separated real/imaginary values. The helper
// shapes mirror those of axpyris; every term containing xi has the opposite
// sign. The rc helper does not reference xi, so its body is the same as in
// the non-conjugating form.

#define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}

#define bli_cxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
    (yi) += (ai) * (xr) - (ar) * (xi); \
}

#define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
}

#define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * -(xi); \
}

#define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyjris bli_rxaxpyjris
#define bli_dssaxpyjris bli_rxaxpyjris
#define bli_cssaxpyjris bli_rxaxpyjris
#define bli_zssaxpyjris bli_rxaxpyjris

#define bli_sdsaxpyjris bli_rxaxpyjris
#define bli_ddsaxpyjris bli_rxaxpyjris
#define bli_cdsaxpyjris bli_rxaxpyjris
#define bli_zdsaxpyjris bli_rxaxpyjris

#define bli_scsaxpyjris bli_rxaxpyjris
#define bli_dcsaxpyjris bli_rxaxpyjris
#define bli_ccsaxpyjris bli_roaxpyjris
#define bli_zcsaxpyjris bli_roaxpyjris

#define bli_szsaxpyjris bli_rxaxpyjris
#define bli_dzsaxpyjris bli_rxaxpyjris
#define bli_czsaxpyjris bli_roaxpyjris
#define bli_zzsaxpyjris bli_roaxpyjris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyjris bli_rxaxpyjris
#define bli_dsdaxpyjris bli_rxaxpyjris
#define bli_csdaxpyjris bli_rxaxpyjris
#define bli_zsdaxpyjris bli_rxaxpyjris

#define bli_sddaxpyjris bli_rxaxpyjris
#define bli_dddaxpyjris bli_rxaxpyjris
#define bli_cddaxpyjris bli_rxaxpyjris
#define bli_zddaxpyjris bli_rxaxpyjris

#define bli_scdaxpyjris bli_rxaxpyjris
#define bli_dcdaxpyjris bli_rxaxpyjris
#define bli_ccdaxpyjris bli_roaxpyjris
#define bli_zcdaxpyjris bli_roaxpyjris

#define bli_szdaxpyjris bli_rxaxpyjris
#define bli_dzdaxpyjris bli_rxaxpyjris
#define bli_czdaxpyjris bli_roaxpyjris
#define bli_zzdaxpyjris bli_roaxpyjris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjris bli_rxaxpyjris
#define bli_dscaxpyjris bli_rxaxpyjris
#define bli_cscaxpyjris bli_rcaxpyjris
#define bli_zscaxpyjris bli_rcaxpyjris

#define bli_sdcaxpyjris bli_rxaxpyjris
#define bli_ddcaxpyjris bli_rxaxpyjris
#define bli_cdcaxpyjris bli_rcaxpyjris
#define bli_zdcaxpyjris bli_rcaxpyjris

#define bli_sccaxpyjris bli_craxpyjris
#define bli_dccaxpyjris bli_craxpyjris
#define bli_cccaxpyjris bli_cxaxpyjris
#define bli_zccaxpyjris bli_cxaxpyjris

#define bli_szcaxpyjris bli_craxpyjris
#define bli_dzcaxpyjris bli_craxpyjris
#define bli_czcaxpyjris bli_cxaxpyjris
#define bli_zzcaxpyjris bli_cxaxpyjris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjris bli_rxaxpyjris
#define bli_dszaxpyjris bli_rxaxpyjris
#define bli_cszaxpyjris bli_rcaxpyjris
#define bli_zszaxpyjris bli_rcaxpyjris

#define bli_sdzaxpyjris bli_rxaxpyjris
#define bli_ddzaxpyjris bli_rxaxpyjris
#define bli_cdzaxpyjris bli_rcaxpyjris
#define bli_zdzaxpyjris bli_rcaxpyjris

#define bli_sczaxpyjris bli_craxpyjris
#define bli_dczaxpyjris bli_craxpyjris
#define bli_cczaxpyjris bli_cxaxpyjris
#define bli_zczaxpyjris bli_cxaxpyjris

#define bli_szzaxpyjris bli_craxpyjris
#define bli_dzzaxpyjris bli_craxpyjris
#define bli_czzaxpyjris bli_cxaxpyjris
#define bli_zzzaxpyjris bli_cxaxpyjris

// Single-char short names for operands that all share one datatype.
#define bli_saxpyjris bli_sssaxpyjris
#define bli_daxpyjris bli_dddaxpyjris
#define bli_caxpyjris bli_cccaxpyjris
#define bli_zaxpyjris bli_zzzaxpyjris

#endif
// end bli_axpyjris.h
// begin bli_axmyris.h

#ifndef BLIS_AXMYRIS_H
#define BLIS_AXMYRIS_H

// axmyris
//
// y := y - a * x on separated real/imaginary values. The s/d variants
// update yr only; c/z subtract the full complex product; sc/dz scale both
// components of x by ar alone (ai is not referenced).

#define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}

#define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}

#define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}

#define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}

#define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}

#define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}

#endif
// end bli_axmyris.h
// begin bli_conjris.h

#ifndef BLIS_CONJRIS_H
#define BLIS_CONJRIS_H

// conjris
//
// In-place conjugation: the complex variants negate xi; the real variants
// expand to an empty statement block.

#define bli_sconjris( xr, xi ) \
{ \
    ; \
}

#define bli_dconjris( xr, xi ) \
{ \
    ; \
}

#define bli_cconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}

#define bli_zconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}

#endif
// end bli_conjris.h
// begin bli_copyris.h

#ifndef BLIS_COPYRIS_H
#define BLIS_COPYRIS_H

// copyris
//
// (br, bi) := (ar, ai). The s/d variants copy the real part only.

#define bli_scopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}

#define bli_dcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}

#define bli_ccopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}

#define bli_zcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}

// Two-char variants: the second char selects the single-char macro that is
// called. When the first char is s or d, the ai argument is replaced by a
// zero literal (0.0F for s, 0.0 for d) instead of being forwarded.
#define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi )
#define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi )
#define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )
#define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )

#define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi )
#define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi )
#define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )
#define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )

#define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi )
#define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi )
#define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )
#define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )

#define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi )
#define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi )
#define bli_czcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )
#define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )

#endif
// end bli_copyris.h
// begin bli_copyjris.h

#ifndef BLIS_COPYJRIS_H
#define BLIS_COPYJRIS_H

// copyjris
//
// (br, bi) := conj( (ar, ai) ), implemented by calling the matching copyris
// macro with -(ai).

#define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) )
#define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) )
#define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) )
#define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) )

// Two-char variants, following the same pattern as the two-char copyris
// macros: zero literal in place of ai when the first char is s or d.
#define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi )
#define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi )
#define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )
#define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )

#define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi )
#define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi )
#define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )
#define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )

#define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi )
#define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi )
#define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )
#define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )

#define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi )
#define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi )
#define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )
#define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )

#endif
// end bli_copyjris.h
// begin bli_copycjris.h

#ifndef BLIS_COPYCJRIS_H
#define BLIS_COPYCJRIS_H

// copycjris
//
// Copy with optional conjugation. The complex variants negate xi when
// bli_is_conj( conj ) is true; the s, d and i variants do not reference
// conj and forward to the matching copyris macro.
// NOTE(review): bli_icopycjris calls bli_icopyris, which is not defined in
// the copyris section above; confirm it is provided elsewhere.

#define bli_scopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_scopyris( (xr), (xi), (yr), (yi) ); \
}

#define bli_dcopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_dcopyris( (xr), (xi), (yr), (yi) ); \
}

#define bli_ccopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                 : (xi) ); \
}

#define bli_zcopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                 : (xi) ); \
}

#define bli_icopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_icopyris( (xr), (xi), (yr), (yi) ); \
}

#endif
// end bli_copycjris.h
// begin bli_eqris.h

#ifndef BLIS_EQRIS_H
#define BLIS_EQRIS_H

// eqris (passed by value)
//
// Unlike most macros in this group these expand to parenthesized
// expressions, not statement blocks. The s, d and i variants compare real
// parts only; c and z also compare imaginary parts.

#define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) )

// eq1ris

#define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F )
#define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 )
#define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F )
#define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 )
#define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 )

// eq0ris

#define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F )
#define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 )
#define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F )
#define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 )
#define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 )

// eqm1ris

#define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F )
#define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 )
#define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F )
#define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 )
#define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 )

#endif
// end bli_eqris.h
// begin bli_invertris.h

#ifndef BLIS_INVERTRIS_H
#define BLIS_INVERTRIS_H

// invertris

// Real reciprocal in place; xi is not referenced.
#define bli_sinvertris( xr, xi ) \
{ \
    (xr) = 1.0F / (xr); \
}
#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2ris bli_rxscal2ris #define bli_dssscal2ris bli_rxscal2ris #define bli_cssscal2ris bli_rxscal2ris #define bli_zssscal2ris bli_rxscal2ris #define bli_sdsscal2ris bli_rxscal2ris #define bli_ddsscal2ris bli_rxscal2ris #define bli_cdsscal2ris bli_rxscal2ris #define bli_zdsscal2ris bli_rxscal2ris #define bli_scsscal2ris bli_rxscal2ris #define bli_dcsscal2ris bli_rxscal2ris #define bli_ccsscal2ris bli_roscal2ris #define bli_zcsscal2ris bli_roscal2ris #define bli_szsscal2ris bli_rxscal2ris #define bli_dzsscal2ris bli_rxscal2ris #define bli_czsscal2ris bli_roscal2ris #define bli_zzsscal2ris bli_roscal2ris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2ris bli_rxscal2ris #define bli_dsdscal2ris bli_rxscal2ris #define bli_csdscal2ris bli_rxscal2ris #define bli_zsdscal2ris bli_rxscal2ris #define bli_sddscal2ris bli_rxscal2ris #define bli_dddscal2ris bli_rxscal2ris #define bli_cddscal2ris bli_rxscal2ris #define bli_zddscal2ris bli_rxscal2ris #define bli_scdscal2ris bli_rxscal2ris #define bli_dcdscal2ris bli_rxscal2ris #define bli_ccdscal2ris bli_roscal2ris #define bli_zcdscal2ris bli_roscal2ris #define bli_szdscal2ris bli_rxscal2ris #define bli_dzdscal2ris bli_rxscal2ris #define bli_czdscal2ris bli_roscal2ris #define bli_zzdscal2ris bli_roscal2ris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2ris bli_rxscal2ris #define bli_dscscal2ris bli_rxscal2ris #define bli_cscscal2ris bli_rcscal2ris #define bli_zscscal2ris bli_rcscal2ris #define bli_sdcscal2ris bli_rxscal2ris #define bli_ddcscal2ris bli_rxscal2ris #define bli_cdcscal2ris bli_rcscal2ris #define bli_zdcscal2ris bli_rcscal2ris #define bli_sccscal2ris bli_crscal2ris #define bli_dccscal2ris bli_crscal2ris #define bli_cccscal2ris bli_cxscal2ris #define bli_zccscal2ris bli_cxscal2ris #define bli_szcscal2ris bli_crscal2ris 
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Typed wrappers for scal2jris. Unlike the scal2ris table these are
// function-like macros that forward all six arguments unchanged to one of
// the generic bli_{rx,cx,ro,cr,rc}scal2jris macros.

// -- (axy) = (??s) ------------------------------------------------------------

// y is real: only yr is ever written.

#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??d) ------------------------------------------------------------

// Same mapping as the (??s) table, for double-precision real y.

#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??c) ------------------------------------------------------------

// y is complex.
// NOTE(review): when both a and x are real the wrapper forwards to
// bli_rxscal2jris, which writes yr only and leaves yi untouched -- confirm
// that callers expect this.

#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??z) ------------------------------------------------------------

// Same mapping as the (??c) table, for double-precision complex y.

#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = (xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyris bli_rxxpbyris #define bli_dssxpbyris bli_rxxpbyris #define bli_cssxpbyris bli_rxxpbyris #define bli_zssxpbyris bli_rxxpbyris #define bli_sdsxpbyris bli_rxxpbyris #define bli_ddsxpbyris bli_rxxpbyris #define bli_cdsxpbyris bli_rxxpbyris #define bli_zdsxpbyris bli_rxxpbyris #define bli_scsxpbyris bli_rxxpbyris #define bli_dcsxpbyris bli_rxxpbyris #define bli_ccsxpbyris bli_rxxpbyris #define bli_zcsxpbyris bli_rxxpbyris #define bli_szsxpbyris bli_rxxpbyris #define bli_dzsxpbyris bli_rxxpbyris #define bli_czsxpbyris bli_rxxpbyris #define bli_zzsxpbyris bli_rxxpbyris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyris bli_rxxpbyris #define bli_dsdxpbyris bli_rxxpbyris #define bli_csdxpbyris bli_rxxpbyris #define bli_zsdxpbyris bli_rxxpbyris #define bli_sddxpbyris bli_rxxpbyris #define bli_dddxpbyris bli_rxxpbyris #define bli_cddxpbyris bli_rxxpbyris #define bli_zddxpbyris bli_rxxpbyris #define bli_scdxpbyris bli_rxxpbyris #define bli_dcdxpbyris bli_rxxpbyris #define bli_ccdxpbyris bli_rxxpbyris #define bli_zcdxpbyris bli_rxxpbyris #define bli_szdxpbyris bli_rxxpbyris #define bli_dzdxpbyris bli_rxxpbyris #define bli_czdxpbyris bli_rxxpbyris #define bli_zzdxpbyris bli_rxxpbyris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyris bli_rxxpbyris #define bli_dscxpbyris 
bli_rxxpbyris #define bli_cscxpbyris bli_crxpbyris #define bli_zscxpbyris bli_crxpbyris #define bli_sdcxpbyris bli_rxxpbyris #define bli_ddcxpbyris bli_rxxpbyris #define bli_cdcxpbyris bli_crxpbyris #define bli_zdcxpbyris bli_crxpbyris #define bli_sccxpbyris bli_cxxpbyris #define bli_dccxpbyris bli_cxxpbyris #define bli_cccxpbyris bli_cxxpbyris #define bli_zccxpbyris bli_cxxpbyris #define bli_szcxpbyris bli_cxxpbyris #define bli_dzcxpbyris bli_cxxpbyris #define bli_czcxpbyris bli_cxxpbyris #define bli_zzcxpbyris bli_cxxpbyris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyris bli_rxxpbyris #define bli_dszxpbyris bli_rxxpbyris #define bli_cszxpbyris bli_crxpbyris #define bli_zszxpbyris bli_crxpbyris #define bli_sdzxpbyris bli_rxxpbyris #define bli_ddzxpbyris bli_rxxpbyris #define bli_cdzxpbyris bli_crxpbyris #define bli_zdzxpbyris bli_crxpbyris #define bli_sczxpbyris bli_cxxpbyris #define bli_dczxpbyris bli_cxxpbyris #define bli_cczxpbyris bli_cxxpbyris #define bli_zczxpbyris bli_cxxpbyris #define bli_szzxpbyris bli_cxxpbyris #define bli_dzzxpbyris bli_cxxpbyris #define bli_czzxpbyris bli_cxxpbyris #define bli_zzzxpbyris bli_cxxpbyris #define bli_sxpbyris bli_sssxpbyris #define bli_dxpbyris bli_dddxpbyris #define bli_cxpbyris bli_cccxpbyris #define bli_zxpbyris bli_zzzxpbyris #endif // end bli_xpbyris.h // begin bli_xpbyjris.h #ifndef BLIS_XPBYJRIS_H #define BLIS_XPBYJRIS_H // xpbyjris #define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type 
of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjris bli_rxxpbyjris #define bli_dssxpbyjris bli_rxxpbyjris #define bli_cssxpbyjris bli_rxxpbyjris #define bli_zssxpbyjris bli_rxxpbyjris #define bli_sdsxpbyjris bli_rxxpbyjris #define bli_ddsxpbyjris bli_rxxpbyjris #define bli_cdsxpbyjris bli_rxxpbyjris #define bli_zdsxpbyjris bli_rxxpbyjris #define bli_scsxpbyjris bli_rxxpbyjris #define bli_dcsxpbyjris bli_rxxpbyjris #define bli_ccsxpbyjris bli_rxxpbyjris #define bli_zcsxpbyjris bli_rxxpbyjris #define bli_szsxpbyjris bli_rxxpbyjris #define bli_dzsxpbyjris bli_rxxpbyjris #define bli_czsxpbyjris bli_rxxpbyjris #define bli_zzsxpbyjris bli_rxxpbyjris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjris bli_rxxpbyjris #define bli_dsdxpbyjris bli_rxxpbyjris #define bli_csdxpbyjris bli_rxxpbyjris #define bli_zsdxpbyjris bli_rxxpbyjris #define bli_sddxpbyjris bli_rxxpbyjris #define bli_dddxpbyjris bli_rxxpbyjris #define bli_cddxpbyjris bli_rxxpbyjris #define bli_zddxpbyjris bli_rxxpbyjris #define bli_scdxpbyjris bli_rxxpbyjris #define bli_dcdxpbyjris bli_rxxpbyjris #define bli_ccdxpbyjris bli_rxxpbyjris #define bli_zcdxpbyjris bli_rxxpbyjris #define bli_szdxpbyjris bli_rxxpbyjris #define bli_dzdxpbyjris bli_rxxpbyjris #define bli_czdxpbyjris bli_rxxpbyjris #define bli_zzdxpbyjris bli_rxxpbyjris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjris bli_rxxpbyjris #define bli_dscxpbyjris bli_rxxpbyjris #define bli_cscxpbyjris bli_crxpbyjris #define bli_zscxpbyjris bli_crxpbyjris #define bli_sdcxpbyjris bli_rxxpbyjris #define bli_ddcxpbyjris bli_rxxpbyjris #define bli_cdcxpbyjris bli_crxpbyjris #define bli_zdcxpbyjris bli_crxpbyjris #define bli_sccxpbyjris bli_cxxpbyjris #define bli_dccxpbyjris bli_cxxpbyjris #define bli_cccxpbyjris 
bli_cxxpbyjris #define bli_zccxpbyjris bli_cxxpbyjris #define bli_szcxpbyjris bli_cxxpbyjris #define bli_dzcxpbyjris bli_cxxpbyjris #define bli_czcxpbyjris bli_cxxpbyjris #define bli_zzcxpbyjris bli_cxxpbyjris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjris bli_rxxpbyjris #define bli_dszxpbyjris bli_rxxpbyjris #define bli_cszxpbyjris bli_crxpbyjris #define bli_zszxpbyjris bli_crxpbyjris #define bli_sdzxpbyjris bli_rxxpbyjris #define bli_ddzxpbyjris bli_rxxpbyjris #define bli_cdzxpbyjris bli_crxpbyjris #define bli_zdzxpbyjris bli_crxpbyjris #define bli_sczxpbyjris bli_cxxpbyjris #define bli_dczxpbyjris bli_cxxpbyjris #define bli_cczxpbyjris bli_cxxpbyjris #define bli_zczxpbyjris bli_cxxpbyjris #define bli_szzxpbyjris bli_cxxpbyjris #define bli_dzzxpbyjris bli_cxxpbyjris #define bli_czzxpbyjris bli_cxxpbyjris #define bli_zzzxpbyjris bli_cxxpbyjris #define bli_sxpbyjris bli_sssxpbyjris #define bli_dxpbyjris bli_dddxpbyjris #define bli_cxpbyjris bli_cccxpbyjris #define bli_zxpbyjris bli_zzzxpbyjris #endif // end bli_xpbyjris.h // Inlined scalar macros in loops // begin bli_scal2ris_mxn.h #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j 
)*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif // end bli_scal2ris_mxn.h // begin bli_scalris_mxn_uplo.h #ifndef 
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) ) #define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) ) #define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zcabsq2s( x, a ) 
bli_zcsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) ) #define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) ) #define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a ) #define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a ) #define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a ) #define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a ) #endif // end bli_absq2s.h // begin bli_abval2s.h #ifndef BLIS_ABVAL2S_H #define BLIS_ABVAL2S_H // abval2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabval2s( x, a ) 
bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) ) #define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) ) #define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) ) #define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) ) #define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) ) #define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) ) #define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) ) #define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) ) #define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) ) #define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) ) #define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) ) #define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) ) #define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) ) #define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) ) #define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) ) #define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabval2s( x, a ) bli_ssabval2s( x, a ) #define bli_dabval2s( x, a ) bli_ddabval2s( x, a ) #define bli_cabval2s( x, a ) bli_ccabval2s( x, a ) #define bli_zabval2s( x, a ) bli_zzabval2s( x, a ) #endif // end bli_abval2s.h // begin bli_adds.h #ifndef BLIS_ADDS_H #define BLIS_ADDS_H // adds // Notes: // - The first char 
encodes the type of a. // - The second char encodes the type of y. #define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) { (y) += (a); } #define bli_dcadds( a, y ) { (y) += (a); } #define bli_ccadds( a, y ) { (y) += (a); } #define bli_zcadds( a, y ) { (y) += (a); } #define bli_szadds( a, y ) { (y) += (a); } #define bli_dzadds( a, y ) { (y) += (a); } #define bli_czadds( a, y ) { (y) += (a); } #define 
bli_zzadds( a, y ) { (y) += (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: each one-letter variant forwards to the two-letter
// macro in which a and y share that type character.
#define bli_sadds( a, y ) bli_ssadds( a, y )
#define bli_dadds( a, y ) bli_ddadds( a, y )
#define bli_cadds( a, y ) bli_ccadds( a, y )
#define bli_zadds( a, y ) bli_zzadds( a, y )

#endif
// end bli_adds.h
// begin bli_addjs.h

#ifndef BLIS_ADDJS_H
#define BLIS_ADDJS_H

// addjs
//
// Accumulates a into y in place, conjugating a first when a is complex. This
// is spelled out in the C99-complex branch below: (y) += conjf(a) / conj(a)
// for a of type c / z, and a plain (y) += (a) for a of type s / d. Every other
// variant hands the real and imaginary parts of a and y to a bli_?addjris
// helper that is defined outside this section (presumably with the same
// semantics -- confirm against its definition).
//
// The macros that expand to a braced block are statements, not expressions.

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// y is of type s: delegate to bli_saddjris.
#define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

// y is of type d: delegate to bli_daddjris.
#define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// y is of type c / z, without C99 complex support: operate on the separate
// real and imaginary components through bli_caddjris / bli_zaddjris.
#define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzaddjs( a, y ) bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// y is of type c / z, with C99 complex support: use the native += operator.
// conjf() is applied to a of type c and conj() to a of type z; a of type
// s / d is added unchanged.
#define bli_scaddjs( a, y ) { (y) += (a); }
#define bli_dcaddjs( a, y ) { (y) += (a); }
#define bli_ccaddjs( a, y ) { (y) += conjf(a); }
#define bli_zcaddjs( a, y ) { (y) += conj (a); }

#define bli_szaddjs( a, y ) { (y) += (a); }
#define bli_dzaddjs( a, y ) { (y) += (a); }
#define bli_czaddjs( a, y ) { (y) += conjf(a); }
#define bli_zzaddjs( a, y ) { (y) += conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: each one-letter variant forwards to the two-letter
// macro in which a and y share that type character.
#define bli_saddjs( a, y ) bli_ssaddjs( a, y )
#define bli_daddjs( a, y ) bli_ddaddjs( a, y )
#define bli_caddjs( a, y ) bli_ccaddjs( a, y )
#define bli_zaddjs( a, y ) bli_zzaddjs( a, y )

#endif
// end bli_addjs.h
// begin bli_add3s.h

#ifndef BLIS_ADD3S_H
#define BLIS_ADD3S_H

// add3s
//
// Three-operand add that stores a + b into c; the C99-complex variants
// further down in this header are literally (c) = (a) + (b). The variants
// here pass the real and imaginary parts of a, b and c to a bli_?add3ris
// helper defined outside this section.

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of b.
// - The third char encodes the type of c.

// -- (abc) = (??s) ------------------------------------------------------------

#define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )
#define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )
#define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )
#define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )

#define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) )
#define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) )
#define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) )
#define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a),
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), 
bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define 
bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zscadd3s( a, 
b, c ) { (c) = (a) + (b); } #define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c ) #define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c ) #define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c ) #define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c ) #endif // end bli_add3s.h // begin bli_axpbys.h #ifndef BLIS_AXPBYS_H #define BLIS_AXPBYS_H // axpbys // Notes: // - The first char encodes the type of a. 
// - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), 
bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), 
bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), 
bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccscaxpbys( 
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y ) #define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y ) #define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y ) #define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y ) #endif // end bli_axpbys.h // begin bli_axpbyjs.h #ifndef BLIS_AXPBYJS_H #define BLIS_AXPBYJS_H // axpbyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), 
bli_simag(y) ) #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) #define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) #define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) #define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) #endif // end bli_axpbyjs.h // begin bli_axpys.h #ifndef BLIS_AXPYS_H #define BLIS_AXPYS_H // axpys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), 
bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } #define 
bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) #define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) #define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) #define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) #endif // end bli_axpys.h // begin bli_axpyjs.h #ifndef BLIS_AXPYJS_H #define BLIS_AXPYJS_H // axpyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Each bli_<a><x><y>axpyjs( a, x, y ) macro below splits its three operands
// into real and imaginary parts, using the accessor that matches each
// operand's type character, and forwards them to a bli_?axpyjris macro.
// NOTE(review): the bli_?axpyjris macros are defined elsewhere; judging by the
// C99 branch further down, the operation is presumably y += a * conj(x).

// -- (axy) = (??s) ------------------------------------------------------------

// y is real single precision: every (a,x) combination uses bli_saxpyjris.
#define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

// y is real double precision: every (a,x) combination uses bli_daxpyjris.
#define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Complex y is handled in one of two ways, depending on whether C99 complex
// support is enabled.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

// The kernel is chosen by the domains of a and x:
// - a real,    x real:    bli_saxpyjris
// - a complex, x real:    bli_caxpyjris
// - a real,    x complex: bli_scaxpyjris
// - a complex, x complex: bli_caxpyjris
#define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

// Same selection rule as the (??c) group, with the double-precision kernels:
// bli_daxpyjris, bli_zaxpyjris, bli_dzaxpyjris and bli_zaxpyjris respectively.
#define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex support the update is written directly on the operands.
// x is conjugated only when its type character is complex: conjf() when the
// second char is 'c', conj() when it is 'z'. Real x is used unchanged.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: one type character stands for all three operands.
#define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y )
#define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y )
#define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y )
#define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y )

#endif
// end bli_axpyjs.h
// begin bli_axmys.h


#ifndef BLIS_AXMYS_H
#define BLIS_AXMYS_H

// axmys

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// Each bli_<a><x><y>axmys( a, x, y ) macro below splits its three operands
// into real and imaginary parts, using the accessor that matches each
// operand's type character, and forwards them to a bli_?axmyris macro.
// NOTE(review): the bli_?axmyris macros are defined elsewhere; judging by the
// C99 branch further down, the operation is presumably y -= a * x.

// -- (axy) = (??s) ------------------------------------------------------------

// y is real single precision: every (a,x) combination uses bli_saxmyris.
#define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

// y is real double precision: every (a,x) combination uses bli_daxmyris.
#define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Complex y is handled in one of two ways, depending on whether C99 complex
// support is enabled.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

// The kernel is chosen by the domains of a and x:
// - a real,    x real:    bli_saxmyris
// - a complex, x real:    bli_caxmyris
// - a real,    x complex: bli_scaxmyris
// - a complex, x complex: bli_caxmyris
#define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

// Same selection rule as the (??c) group, with the double-precision kernels:
// bli_daxmyris, bli_zaxmyris, bli_dzaxmyris and bli_zaxmyris respectively.
#define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex support every combination is the same expression,
// y -= a * x, written directly on the operands.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); }
// Remaining C99-complex (??z) axmys macros: each expands to y -= a * x.
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: one type character stands for all three operands.
#define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y )
#define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y )
#define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y )
#define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y )

#endif
// end bli_axmys.h
// begin bli_conjs.h


#ifndef BLIS_CONJS_H
#define BLIS_CONJS_H

// conjs

// In-place conjugation of x. The real types forward their real/imaginary
// accessors to bli_sconjris / bli_dconjris (defined elsewhere).
#define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) )
#define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex support, the complex types also go through the
// bli_?conjris macros.
#define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) )
#define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex support, x is overwritten with conjf(x) / conj(x).
#define bli_cconjs( x ) { (x) = conjf(x); }
#define bli_zconjs( x ) { (x) = conj (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

#endif
// end bli_conjs.h
// begin bli_copys.h


#ifndef BLIS_COPYS_H
#define BLIS_COPYS_H

// copys

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Each bli_<x><y>copys( x, y ) macro forwards the real and imaginary parts of
// x and y to the bli_?copyris macro (defined elsewhere) that matches the type
// of the destination y.

// y is real single precision: bli_scopyris.
#define bli_sscopys( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopys( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopys( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopys( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is real double precision: bli_dcopyris.
#define bli_sdcopys( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopys( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopys( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopys( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero.
#define bli_sccopys( x, y ) bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopys( x, y ) bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopys( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopys( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
// y is double-precision complex: bli_zcopyris receives both parts of x and y.
#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer copy: x is cast to gint_t and assigned to y.
#define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); }

// Same-type shorthands: one type character stands for both operands.
#define bli_scopys( x, y ) bli_sscopys( x, y )
#define bli_dcopys( x, y ) bli_ddcopys( x, y )
#define bli_ccopys( x, y ) bli_cccopys( x, y )
#define bli_zcopys( x, y ) bli_zzcopys( x, y )
#define bli_icopys( x, y ) bli_iicopys( x, y )

#endif
// end bli_copys.h
// begin bli_copyjs.h


#ifndef BLIS_COPYJS_H
#define BLIS_COPYJS_H

// copyjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

// Real destinations: every source type goes through bli_scopyjris (float y)
// or bli_dcopyjris (double y), both defined elsewhere.
#define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex support, complex destinations use bli_ccopyjris /
// bli_zcopyjris on the separate real and imaginary parts.
#define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex support, a real x (first char 's' or 'd') is assigned
// as-is; a complex x is conjugated with conjf() ('c') or conj() ('z') before
// the assignment.
#define bli_sccopyjs( x, y ) { (y) = (x); }
#define bli_dccopyjs( x, y ) { (y) = (x); }
#define bli_cccopyjs( x, y ) { (y) = conjf(x); }
#define bli_zccopyjs( x, y ) { (y) = conj (x); }

#define bli_szcopyjs( x, y ) { (y) = (x); }
#define bli_dzcopyjs( x, y ) { (y) = (x); }
#define bli_czcopyjs( x, y ) { (y) = conjf(x); }
#define bli_zzcopyjs( x, y ) { (y) = conj (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Integer copy: x is cast to gint_t and assigned to y.
#define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); }

// Same-type shorthands: one type character stands for both operands.
#define bli_scopyjs( x, y ) bli_sscopyjs( x, y )
#define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y )
#define bli_ccopyjs( x, y ) bli_cccopyjs( x, y )
#define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y )
#define bli_icopyjs( x, y ) bli_iicopyjs( x, y )

#endif
// end bli_copyjs.h
// begin bli_copycjs.h


#ifndef BLIS_COPYCJS_H
#define BLIS_COPYCJS_H

// copycjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Each bli_<x><y>copycjs( conjx, x, y ) macro copies x to y, with conjx
// selecting whether x is conjugated first (see the C99 branch below, which
// tests it with bli_is_conj()). The non-C99 forms pass conjx together with
// the real/imaginary parts of x and y to a bli_?copycjris macro defined
// elsewhere.

// Real destinations: bli_scopycjris (float y) or bli_dcopycjris (double y).
#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex support, complex destinations use bli_ccopycjris /
// bli_zcopycjris on the separate real and imaginary parts.
#define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex support, a real x (first char 's' or 'd') is assigned
// as-is and conjx is ignored. A complex x is conjugated, with conjf() for 'c'
// or conj() for 'z', only when bli_is_conj( conjx ) is true.
#define bli_sccopycjs( conjx, x, y ) { (y) = (x); }
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); }
#define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#define bli_szcopycjs( conjx, x, y ) { (y) = (x); }
#define bli_dzcopycjs( conjx, x, y ) { (y) = (x); }
#define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Integer copy: conjx is ignored; x is cast to gint_t and assigned to y.
#define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); }

// Same-type shorthands: one type character stands for both operands.
#define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y )
#define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y )
#define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y )
#define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y )
#define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y )

#endif
// end bli_copycjs.h
// begin bli_copynzs.h


#ifndef BLIS_COPYNZS_H
#define BLIS_COPYNZS_H

// copynzs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Each bli_<x><y>copynzs( x, y ) macro forwards the real and imaginary parts
// of x and y to a bli_?copyris macro (defined elsewhere). As the NOTEs below
// explain, a real x copied into a complex y uses the real-domain kernel so
// that the imaginary part of y is left untouched.

// y is real single precision: bli_scopyris.
#define bli_sscopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopynzs( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopynzs( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is real double precision: bli_dcopyris.
#define bli_sdcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopynzs( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopynzs( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of scopyris() is so we don't touch the imaginary part of y.
#define bli_sccopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopynzs( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopynzs( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
// copynzs, destination y is double-precision complex: real sources (s, d) use
// the real-domain bli_dcopyris, complex sources (c, z) use bli_zcopyris.
#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
// Integer variant: plain assignment with a cast to gint_t.
#define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); }
// Single-character shorthands: x and y share the same type.
#define bli_scopynzs( x, y ) bli_sscopynzs( x, y )
#define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y )
#define bli_ccopynzs( x, y ) bli_cccopynzs( x, y )
#define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y )
#define bli_icopynzs( x, y ) bli_iicopynzs( x, y )
#endif
// end bli_copynzs.h
// begin bli_copyjnzs.h
#ifndef BLIS_COPYJNZS_H
#define BLIS_COPYJNZS_H
// copyjnzs
// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Same dispatch layout as copynzs, but every macro forwards to the
// corresponding bli_?copyjris macro (defined outside this section).
// Destination y is single-precision real.
#define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
// Destination y is double-precision real.
#define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
// NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we
// don't touch the imaginary part of y.
// copyjnzs, destination y is single-precision complex: real sources (s, d)
// use the real-domain bli_scopyjris, complex sources (c, z) use bli_ccopyjris.
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
// NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we
// don't touch the imaginary part of y.
// Destination y is double-precision complex: real sources (s, d) use
// bli_dcopyjris, complex sources (c, z) use bli_zcopyjris.
#define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
// Integer variant: plain assignment with a cast to gint_t.
#define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); }
// Single-character shorthands: x and y share the same type.
#define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y )
#define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y )
#define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y )
#define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y )
#define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y )
#endif
// end bli_copyjnzs.h
// begin bli_dots.h
#ifndef BLIS_DOTS_H
#define BLIS_DOTS_H
// dots
// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - The third char encodes the type of rho.
// Every dots macro is a pure alias: it expands to the axpys macro carrying the
// same three type characters, with the arguments passed through in the same
// order ( x, y, a ). The axpys macros are defined outside this section.
// -- rho is single-precision real (??s) --
#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a )
#define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a )
#define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a )
#define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a )
#define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a )
#define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a )
#define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a )
#define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a )
#define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a )
#define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a )
#define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a )
#define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a )
#define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a )
#define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a )
#define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a )
#define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a )
// -- rho is double-precision real (??d) --
#define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a )
#define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a )
#define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a )
#define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a )
#define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a )
#define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a )
#define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a )
#define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a )
#define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a )
#define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a )
#define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a )
#define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a )
#define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a )
#define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a )
#define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a )
#define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a )
// -- rho is single-precision complex (??c) --
#define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a )
#define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a )
#define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a )
#define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a )
#define bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a )
#define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a )
#define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a )
#define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a )
#define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a )
#define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a )
#define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a )
#define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a )
#define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a )
#define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a )
#define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a )
#define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a )
// -- rho is double-precision complex (??z) --
#define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a )
#define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a )
#define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a )
#define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a )
#define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a )
#define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a )
#define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a )
#define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a )
#define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a )
#define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a )
#define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a )
#define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a )
#define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a )
#define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a )
#define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a )
#define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a )
// Single-character shorthands: x, y and rho all share the same type.
#define bli_sdots( x, y, a ) bli_sssdots( x, y, a )
#define bli_ddots( x, y, a ) bli_ddddots( x, y, a )
#define bli_cdots( x, y, a ) bli_cccdots( x, y, a )
#define bli_zdots( x, y, a ) bli_zzzdots( x, y, a )
#endif
// end bli_dots.h
// begin bli_dotjs.h
#ifndef BLIS_DOTJS_H
#define BLIS_DOTJS_H
// dotjs
// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - The third char encodes the type of rho.
// - x is used in conjugated form.
// Every dotjs macro expands to an axpyjs macro with x and y exchanged: the
// call is bli_???axpyjs( y, x, a ) and the first two type characters of the
// target name are swapped to match (e.g. dss -> sds). The axpyjs macros are
// defined outside this section.
// -- rho is single-precision real (??s) --
#define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a )
#define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a )
#define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a )
#define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a )
#define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a )
#define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a )
#define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a )
#define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a )
#define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a )
#define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a )
#define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a )
#define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a )
#define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a )
#define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a )
#define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a )
#define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a )
// -- rho is double-precision real (??d) --
#define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a )
#define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a )
#define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a )
#define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a )
#define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a )
#define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a )
#define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a )
#define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a )
#define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a )
#define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a )
#define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a )
#define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a )
#define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a )
#define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a )
#define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a )
#define bli_zzddotjs( x, y, a ) bli_zzdaxpyjs( y, x, a )
// -- rho is single-precision complex (??c) --
#define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a )
#define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a )
#define bli_cscdotjs( x, y, a ) bli_sccaxpyjs( y, x, a )
#define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a )
#define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a )
#define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a )
#define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a )
#define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a )
#define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a )
#define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a )
#define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a )
#define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a )
#define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a )
#define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a )
#define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a )
#define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a )
// -- rho is double-precision complex (??z) --
#define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a )
#define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a )
#define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a )
#define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a )
#define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a )
#define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a )
#define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a )
#define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a )
#define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a )
#define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a )
#define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a )
#define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a )
#define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a )
#define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a )
#define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a )
#define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a )
// Single-character shorthands: x, y and rho all share the same type.
#define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a )
#define bli_ddotjs( x, y, a ) bli_ddddotjs( x, y, a )
#define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a )
#define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a )
#endif
// end bli_dotjs.h
// begin bli_eq.h
#ifndef BLIS_EQ_H
#define BLIS_EQ_H
// Equality predicates. All of them are expressions that use the exact ==
// operator (no tolerance is applied).
// eq (passed by value)
#define bli_seq( a, b ) ( (a) == (b) )
#define bli_deq( a, b ) ( (a) == (b) )
#ifndef BLIS_ENABLE_C99_COMPLEX
// Without C99 complex: compare real and imaginary parts separately.
#define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) )
#define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_ceq( a, b ) ( (a) == (b) )
#define bli_zeq( a, b ) ( (a) == (b) )
#endif // BLIS_ENABLE_C99_COMPLEX
#define bli_ieq( a, b ) ( (a) == (b) )
// eqtori (passed by value)
// Compares a against the value given as separate real (br) and imaginary (bi)
// parts. The real-typed variants ignore bi.
#define bli_seqtori( a, br, bi ) ( (a) == (br) )
#define bli_deqtori( a, br, bi ) ( (a) == (br) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) )
#define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )
#define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )
#endif // BLIS_ENABLE_C99_COMPLEX
// eqa (passed by address)
// Casts both pointers to the element type, dereferences them, and defers to
// the by-value predicate.
#define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) )
#define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) )
#define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) )
#define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) )
#define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) )
// eq1
#define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F )
#define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 )
#define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F )
#define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 )
#define bli_ieq1( a ) bli_ieq ( (a), 1 )
// eq0
#define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F )
#define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 )
#define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F )
#define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 )
#define bli_ieq0( a ) bli_ieq ( (a), 0 )
// eqm1
#define bli_seqm1( a ) bli_seqtori( (a), -1.0F, 0.0F )
#define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 )
#define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F )
#define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 )
#define bli_ieqm1( a ) bli_ieq ( (a), -1 )
#endif
// end bli_eq.h
// begin bli_fprints.h
#ifndef BLIS_FPRINTS_H
#define BLIS_FPRINTS_H
// prints
// Each macro writes x to the stream `file` with fprintf(), using the
// caller-supplied format string `spec` for every numeric component. The
// complex variants print the real part, the literal " + ", the imaginary
// part, and a trailing space.
#define bli_sfprints( file, spec, x ) \
{ \
	fprintf( file, spec, (x) ); \
}
#define bli_dfprints( file, spec, x ) \
{ \
	fprintf( file, spec, (x) ); \
}
#define bli_cfprints( file, spec, x ) \
{ \
	fprintf( file, spec, bli_creal(x) ); \
	fprintf( file, " + " ); \
	fprintf( file, spec, bli_cimag(x) ); \
	fprintf( file, " " ); \
}
#define bli_zfprints( file, spec, x ) \
{ \
	fprintf( file, spec, bli_zreal(x) ); \
	fprintf( file, " + " ); \
	fprintf( file, spec, bli_zimag(x) ); \
	fprintf( file, " " ); \
}
#define bli_ifprints( file, spec, x ) \
{ \
	fprintf( file, spec, (x) ); \
}
#endif
// end bli_fprints.h
// begin bli_inverts.h
#ifndef BLIS_INVERTS_H
#define BLIS_INVERTS_H
// inverts
// Notes:
// - The first char encodes the type of x.
// The real variants always forward to bli_?invertris (defined outside this
// section); with C99 complex enabled the complex variants instead overwrite x
// with 1 / x directly.
#define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) )
#define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) )
#define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_cinverts( x ) { (x) = 1.0F / (x); }
#define bli_zinverts( x ) { (x) = 1.0 / (x); }
#endif // BLIS_ENABLE_C99_COMPLEX
#endif
// end bli_inverts.h
// begin bli_invscals.h
#ifndef BLIS_INVSCALS_H
#define BLIS_INVSCALS_H
// invscals
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Real destinations (y is s or d) always forward the real/imaginary components
// of a and y to bli_sinvscalris / bli_dinvscalris (defined outside this section).
#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
// Complex destinations without C99 complex: a real a (s, d) uses the
// mixed-domain bli_scinvscalris / bli_dzinvscalris, a complex a (c, z) uses
// bli_cinvscalris / bli_zinvscalris.
#define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscals( a, y ) bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// Complex destinations with C99 complex: divide y by a in place.
#define bli_scinvscals( a, y ) { (y) /= (a); }
#define bli_dcinvscals( a, y ) { (y) /= (a); }
#define bli_ccinvscals( a, y ) { (y) /= (a); }
#define bli_zcinvscals( a, y ) { (y) /= (a); }
#define bli_szinvscals( a, y ) { (y) /= (a); }
#define bli_dzinvscals( a, y ) { (y) /= (a); }
#define bli_czinvscals( a, y ) { (y) /= (a); }
#define bli_zzinvscals( a, y ) { (y) /= (a); }
#endif // BLIS_ENABLE_C99_COMPLEX
// Single-character shorthands: a and y share the same type.
#define bli_sinvscals( a, y ) bli_ssinvscals( a, y )
#define bli_dinvscals( a, y ) bli_ddinvscals( a, y )
#define bli_cinvscals( a, y ) bli_ccinvscals( a, y )
#define bli_zinvscals( a, y ) bli_zzinvscals( a, y )
#endif
// end bli_invscals.h
// begin bli_invscaljs.h
#ifndef BLIS_INVSCALJS_H
#define BLIS_INVSCALJS_H
// invscaljs
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Same dispatch layout as invscals, forwarding to the bli_?invscaljris macros
// (defined outside this section).
#define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex: divide y in place by a (real a) or by the conjugate of a
// (complex a, via conjf()/conj()).
#define bli_scinvscaljs( a, y ) { (y) /= (a); }
#define bli_dcinvscaljs( a, y ) { (y) /= (a); }
#define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zcinvscaljs( a, y ) { (y) /= conj (a); }
#define bli_szinvscaljs( a, y ) { (y) /= (a); }
#define bli_dzinvscaljs( a, y ) { (y) /= (a); }
#define bli_czinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zzinvscaljs( a, y ) { (y) /= conj (a); }
#endif // BLIS_ENABLE_C99_COMPLEX
// Single-character shorthands: a and y share the same type.
#define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y )
#define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y )
#define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y )
#define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y )
#endif
// end bli_invscaljs.h
// begin bli_neg2s.h
#ifndef BLIS_NEG2S_H
#define BLIS_NEG2S_H
// neg2s
// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Real destinations (y is s or d) always forward the real/imaginary components
// of x and y to bli_sneg2ris / bli_dneg2ris (defined outside this section).
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex: y is overwritten with the negation of x.
#define bli_scneg2s( x, y ) { (y) = -(x); }
#define bli_dcneg2s( x, y ) { (y) = -(x); }
#define bli_ccneg2s( x, y ) { (y) = -(x); }
#define bli_zcneg2s( x, y ) { (y) = -(x); }
#define bli_szneg2s( x, y ) { (y) = -(x); }
#define bli_dzneg2s( x, y ) { (y) = -(x); }
#define bli_czneg2s( x, y ) { (y) = -(x); }
#define bli_zzneg2s( x, y ) { (y) = -(x); }
#endif // BLIS_ENABLE_C99_COMPLEX
// Single-character shorthands: x and y share the same type.
#define bli_sneg2s( x, y ) bli_ssneg2s( x, y )
#define bli_dneg2s( x, y ) bli_ddneg2s( x, y )
#define bli_cneg2s( x, y ) bli_ccneg2s( x, y )
#define bli_zneg2s( x, y ) bli_zzneg2s( x, y )
#endif
// end bli_neg2s.h
// begin bli_rands.h
#ifndef BLIS_RANDS_H
#define BLIS_RANDS_H
// rands
// The real variants store rand() / ( RAND_MAX / 2.0 ) - 1 into a, i.e. a value
// in the closed interval [-1, 1]. The complex variants draw the real and
// imaginary parts independently and combine them with bli_csets / bli_zsets
// (defined outside this section).
#define bli_srands( a ) \
{ \
	(a) = ( float ) ( ( double ) rand() / \
	                  ( ( double ) RAND_MAX / 2.0 ) \
	                ) - 1.0F; \
}
#define bli_drands( a ) \
{ \
	(a) = ( double ) ( ( double ) rand() / \
	                   ( ( double ) RAND_MAX / 2.0 ) \
	                 ) - 1.0; \
}
#define bli_crands( a ) \
{ \
	float ar, ai; \
\
	bli_srands( ar ); \
	bli_srands( ai ); \
\
	bli_csets( ar, ai, (a) ); \
}
#define bli_zrands( a ) \
{ \
	double ar, ai; \
\
	bli_drands( ar ); \
	bli_drands( ai ); \
\
	bli_zsets( ar, ai, (a) ); \
}
#endif
// end bli_rands.h
// begin bli_randnp2s.h
#ifndef BLIS_RANDNP2S_H
#define BLIS_RANDNP2S_H
// randnp2s
// The single-precision variant reuses the double-precision macro; the result
// is narrowed by the final assignment to a.
#define bli_srandnp2s( a ) \
{ \
	bli_drandnp2s( a ); \
}
// Disabled earlier implementation, kept for reference only (never compiled).
#if 0
#define bli_drandnp2s_prev( a ) \
{ \
	const double m_max  = 3.0; \
	const double m_max2 = m_max + 2.0; \
	double       t; \
	double       r_val; \
\
\
\
\
	t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \
\
\
	if ( t == m_max2 ) t = t - 1.0; \
\
\
	t = floor( t ); \
\
\
	if ( t == 0.0 ) r_val = 0.0; \
	else \
	{ \
\
\
		double s_exp, s_val; \
\
\
		PASTEMAC(d,rands)( s_exp ); \
		PASTEMAC(d,rands)( s_val ); \
\
\
		if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \
		else               r_val = pow( 2.0,   t - 1.0  ); \
\
\
		if ( s_val < 0.0 ) r_val = -r_val; \
	} \
\
\
	r_val = r_val / pow( 2.0, m_max ); \
\
\
\
	a = r_val; \
}
#endif
// Stores into a either 0 or a signed power of two. t is drawn as
// floor( rand() / RAND_MAX * 8 ) and redrawn while it is >= 8, so t ends up in
// 0..7. t == 0 yields 0.0; otherwise the magnitude is 2^-(t-1) (1, 1/2, ...,
// 1/64) and the sign is taken from a second random draw.
// PASTEMAC is not defined in this section; PASTEMAC(d,rands) presumably
// resolves to bli_drands -- confirm.
// NOTE(review): unlike bli_srands/bli_drands, the macro argument is used
// unparenthesized in the final `a = r_val` assignment.
#define bli_drandnp2s( a ) \
{ \
	const double m_max  = 6.0; \
	const double m_max2 = m_max + 2.0; \
	double       t; \
	double       r_val; \
\
\
\
	do \
	{ \
\
		t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \
\
\
		t = floor( t ); \
	} \
\
	while ( m_max2 <= t ); \
\
\
	if ( t == 0.0 ) r_val = 0.0; \
	else \
	{ \
\
\
		double s_val; \
\
\
		r_val = pow( 2.0, -(t - 1.0) ); \
\
\
		PASTEMAC(d,rands)( s_val ); \
\
\
		if ( s_val < 0.0 ) r_val = -r_val; \
	} \
\
\
\
	a = r_val; \
}
#define bli_crandnp2s( a ) \
{ \
	float ar, ai; \
\
	bli_srandnp2s( ar ); \
	bli_srandnp2s( ai ); \
\
	bli_csets( ar, ai, (a) ); \
}
#define bli_zrandnp2s( a ) \
{ \
	double ar, ai; \
\
	bli_drandnp2s( ar ); \
	bli_drandnp2s( ai ); \
\
	bli_zsets( ar, ai, (a) ); \
}
#endif
// end bli_randnp2s.h
// begin bli_scals.h
#ifndef BLIS_SCALS_H
#define BLIS_SCALS_H
// scals
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Real destinations (y is s or d) always forward to bli_sscalris /
// bli_dscalris (defined outside this section).
#define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex: multiply y by a in place.
#define bli_scscals( a, y ) { (y) *= (a); }
#define bli_dcscals( a, y ) { (y) *= (a); }
#define bli_ccscals( a, y ) { (y) *= (a); }
#define bli_zcscals( a, y ) { (y) *= (a); }
#define bli_szscals( a, y ) { (y) *= (a); }
#define bli_dzscals( a, y ) { (y) *= (a); }
#define bli_czscals( a, y ) { (y) *= (a); }
#define bli_zzscals( a, y ) { (y) *= (a); }
#endif // BLIS_ENABLE_C99_COMPLEX
// Single-character shorthands: a and y share the same type.
#define bli_sscals( a, y ) bli_ssscals( a, y )
#define bli_dscals( a, y ) bli_ddscals( a, y )
#define bli_cscals( a, y ) bli_ccscals( a, y )
#define bli_zscals( a, y ) bli_zzscals( a, y )
#endif
// end bli_scals.h
// begin bli_scaljs.h
#ifndef BLIS_SCALJS_H
#define BLIS_SCALJS_H
// scaljs
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Same dispatch layout as scals, forwarding to the bli_?scaljris macros
// (defined outside this section).
#define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccscaljs( a, y ) bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex: multiply y in place by a (real a) or by the conjugate of
// a (complex a, via conjf()/conj()).
#define bli_scscaljs( a, y ) { (y) *= (a); }
#define bli_dcscaljs( a, y ) { (y) *= (a); }
#define bli_ccscaljs( a, y ) { (y) *= conjf(a); }
#define bli_zcscaljs( a, y ) { (y) *= conj (a); }
#define bli_szscaljs( a, y ) { (y) *= (a); }
#define bli_dzscaljs( a, y ) { (y) *= (a); }
#define bli_czscaljs( a, y ) { (y) *= conjf(a); }
#define bli_zzscaljs( a, y ) { (y) *= conj (a); }
#endif // BLIS_ENABLE_C99_COMPLEX
// Single-character shorthands: a and y share the same type.
#define bli_sscaljs( a, y ) bli_ssscaljs( a, y )
#define bli_dscaljs( a, y ) bli_ddscaljs( a, y )
#define bli_cscaljs( a, y ) bli_ccscaljs( a, y )
#define bli_zscaljs( a, y ) bli_zzscaljs( a, y )
#endif
// end bli_scaljs.h
// begin bli_scalcjs.h
#ifndef BLIS_SCALCJS_H
#define BLIS_SCALCJS_H
// scalcjs
// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Real destinations (y is s or d) always forward conjx plus the real/imaginary
// components of x and y to bli_sscalcjris / bli_dscalcjris (defined outside
// this section).
#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
// Complex destinations without C99 complex: a real x (s, d) uses the
// mixed-domain bli_scscalcjris / bli_dzscalcjris, a complex x (c, z) uses
// bli_cscalcjris / bli_zscalcjris.
#define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// Complex destinations with C99 complex: multiply y in place by x. A complex x
// is conjugated first when bli_is_conj( conjx ) is true; for a real x the
// conjx argument is ignored.
#define bli_scscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }
#define bli_szscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }
#endif // BLIS_ENABLE_C99_COMPLEX
// Single-character shorthands: x and y share the same type.
#define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y )
#define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y )
#define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y )
#define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y )
#endif
// end bli_scalcjs.h
// begin bli_scal2s.h
#ifndef BLIS_SCAL2S_H
#define BLIS_SCAL2S_H
// scal2s
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// Mixed-datatype scal2s: y := a * x. The C99 branch further down spells the
// operation out as "{ (y) = (a) * (x); }". All other variants pass the real
// and imaginary components of a, x and y (extracted with bli_?real() and
// bli_?imag()) to one of the bli_??scal2ris helpers, which are not defined in
// this part of the file.
// NOTE(review): the two-letter helper prefix appears to select the arithmetic
// domain of the product -- confirm against the bli_??scal2ris definitions.

// -- (axy) = (??s) ------------------------------------------------------------
// y is single-precision real. The "ro" helper is used when both a and x are
// complex (c or z); every other combination uses "rx".
#define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------
// y is double-precision real. Helper selection follows the same pattern as
// the (??s) table above.
#define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// The complex-y variants have two implementations. Without C99 complex
// support they are built from the component-wise helpers; with it they use
// the language's native complex arithmetic.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------
// y is single-precision complex. Helper by operand domains:
//   a real,    x real    -> rx
//   a complex, x real    -> rc
//   a real,    x complex -> cr
//   a complex, x complex -> cx
#define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------
// y is double-precision complex. Helper selection is the same as for (??c).
#define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sczscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------
// Native complex arithmetic: every combination is the plain assignment
// y = a * x, so all sixteen bodies are identical.
#define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter aliases for the homogeneous case, where a, x and y all share
// one datatype.
#define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y )
#define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y )
#define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y )
#define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y )

#endif
// end bli_scal2s.h
// begin bli_scal2js.h

#ifndef BLIS_SCAL2JS_H
#define BLIS_SCAL2JS_H

// scal2js
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// Mixed-datatype scal2js: the conjugating form of scal2s. The C99 branch of
// this header writes the complex-x cases as "(y) = (a) * conjf(x)" or
// "(y) = (a) * conj(x)", i.e. y := a * conj(x). The variants below pass the
// real and imaginary components of a, x and y to the bli_??scal2jris helpers,
// which are not defined in this part of the file.
// NOTE(review): the meaning of the two-letter helper prefix is inferred from
// the selection pattern only -- confirm against the bli_??scal2jris
// definitions.

// -- (axy) = (??s) ------------------------------------------------------------
// y is single-precision real. "ro" is used when both a and x are complex
// (c or z); every other combination uses "rx".
#define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------
// y is double-precision real. Helper selection follows the same pattern as
// the (??s) table above.
#define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Complex-y variants for builds without C99 complex support. The conditional
// opened here continues past the end of this table; its #else/#endif follow
// the remaining (??z) entries.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------
// y is single-precision complex. Helper by operand domains:
//   a real,    x real    -> rx
//   a complex, x real    -> rc
//   a real,    x complex -> cr
//   a complex, x complex -> cx
#define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------
// y is double-precision complex. Helper selection is the same as for (??c).
#define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
// Remaining (??z) scal2js variants for builds without C99 complex support:
// the complex-x cases. A real a selects the "cr" helper, a complex a selects
// "cx".
#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Native complex arithmetic: y = a * conj(x). The conjugate function is
// chosen by the type of x (second char): conjf() for c, conj() for z. When x
// is real (s or d) no conjugate is applied and the body is a plain product.

// -- (axy) = (??c) ------------------------------------------------------------
#define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter aliases for the homogeneous case, where a, x and y all share
// one datatype.
#define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y )
#define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y )
#define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y )
#define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y )

#endif
// end bli_scal2js.h
// begin bli_set0s.h

#ifndef BLIS_SET0S_H
#define BLIS_SET0S_H

// set0s: forwards to bli_?sets() with 0 for both leading arguments, using
// float literals for the s/c types and double literals for d/z.
// NOTE(review): presumably the two constants are the real and imaginary
// parts assigned to a -- confirm against the bli_?sets definitions.
#define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) )
#define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) )
#define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) )
#define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) )

#endif
// end bli_set0s.h
// begin bli_set1s.h

#ifndef BLIS_SET1S_H
#define BLIS_SET1S_H

// set1s: same shape as set0s, but the first constant passed to bli_?sets()
// is 1 and the second is 0.
#define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) )
#define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) )
#define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) )
#define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) )
#endif
// end bli_set1s.h
// begin bli_seti0s.h

#ifndef BLIS_SETI0S_H
#define BLIS_SETI0S_H

// seti0s: forwards to bli_?setis() with a single 0 constant (float literal
// for s/c, double literal for d/z).
// NOTE(review): presumably this zeroes only the imaginary part of a --
// confirm against the bli_?setis definitions.
#define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) )
#define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) )
#define bli_cseti0s( a ) bli_csetis( 0.0F, (a) )
#define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) )

#endif
// end bli_seti0s.h
// begin bli_sqrt2s.h

#ifndef BLIS_SQRT2S_H
#define BLIS_SQRT2S_H

// sqrt2s
// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.

// sqrt2s stores the square root of x into a (the C99 branch below writes it
// as "(a) = ... sqrt( (x) )"). Unlike the neighbouring headers, every variant
// here, including the real-to-real ones, sits inside the C99 conditional.
#ifndef BLIS_ENABLE_C99_COMPLEX

// Component-wise implementation. The helper is selected by the datatype pair:
//   a real:               bli_ssqrt2ris / bli_dsqrt2ris (by type of a)
//   x real,    a complex: bli_scsqrt2ris / bli_dzsqrt2ris
//   x complex, a complex: bli_csqrt2ris / bli_zsqrt2ris
// The helpers themselves are not defined in this part of the file.
#define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) )
#define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) )
#define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) )
#define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) )
#define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) )
#define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )
#define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Native implementation. The root function follows the type of x (sqrtf,
// sqrt, csqrtf, csqrt) and the result is cast to the type of a. When x is
// complex and a is real, only the real part of the complex root is kept
// (bli_creal / bli_zreal).
#define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; }
#define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; }
#define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); }
#define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); }
#define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; }
#define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; }
#define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); }
#define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); }
#define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; }
#define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; }
#define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; }
#define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; }
#define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; }
#define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; }
#define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; }
#define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter aliases for the homogeneous case, where x and a share one
// datatype.
#define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a )
#define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a )
#define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a )
#define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a )

#endif
// end bli_sqrt2s.h
// begin bli_subs.h

#ifndef BLIS_SUBS_H
#define BLIS_SUBS_H

// subs
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) { (y) -= (a); } #define bli_dcsubs( a, y ) { (y) -= (a); } #define bli_ccsubs( a, y ) { (y) -= (a); } #define bli_zcsubs( a, y ) { (y) -= (a); } #define bli_szsubs( a, y ) { (y) -= (a); } #define bli_dzsubs( a, y ) { (y) -= (a); } #define bli_czsubs( a, y ) { (y) -= (a); } #define bli_zzsubs( a, y ) { (y) -= (a); } #endif // 
BLIS_ENABLE_C99_COMPLEX #define bli_ssubs( a, y ) bli_sssubs( a, y ) #define bli_dsubs( a, y ) bli_ddsubs( a, y ) #define bli_csubs( a, y ) bli_ccsubs( a, y ) #define bli_zsubs( a, y ) bli_zzsubs( a, y ) #endif // end bli_subs.h // begin bli_subjs.h #ifndef BLIS_SUBJS_H #define BLIS_SUBJS_H // subjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubjs( a, y ) bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), 
bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) { (y) -= (a); } #define bli_dcsubjs( a, y ) { (y) -= (a); } #define bli_ccsubjs( a, y ) { (y) -= conjf(a); } #define bli_zcsubjs( a, y ) { (y) -= conj (a); } #define bli_szsubjs( a, y ) { (y) -= (a); } #define bli_dzsubjs( a, y ) { (y) -= (a); } #define bli_czsubjs( a, y ) { (y) -= conjf(a); } #define bli_zzsubjs( a, y ) { (y) -= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssubjs( a, y ) bli_sssubjs( a, y ) #define bli_dsubjs( a, y ) bli_ddsubjs( a, y ) #define bli_csubjs( a, y ) bli_ccsubjs( a, y ) #define bli_zsubjs( a, y ) bli_zzsubjs( a, y ) #endif // end bli_subjs.h // begin bli_swaps.h #ifndef BLIS_SWAPS_H #define BLIS_SWAPS_H // swaps // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_ssswaps( x, y ) \ { \ float w; \ bli_sscopys( (y), (w) ); \ bli_sscopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dsswaps( x, y ) \ { \ double w; \ bli_sdcopys( (y), (w) ); \ bli_dscopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_csswaps( x, y ) \ { \ scomplex w; \ bli_sccopys( (y), (w) ); \ bli_cscopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zsswaps( x, y ) \ { \ dcomplex w; \ bli_szcopys( (y), (w) ); \ bli_zscopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sdswaps( x, y ) \ { \ float w; \ bli_dscopys( (y), (w) ); \ bli_sdcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_ddswaps( x, y ) \ { \ double w; \ bli_ddcopys( (y), (w) ); \ bli_ddcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_cdswaps( x, y ) \ { \ scomplex w; \ bli_dccopys( (y), (w) ); \ bli_cdcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zdswaps( x, y ) \ { \ dcomplex w; \ bli_dzcopys( (y), (w) ); \ bli_zdcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_scswaps( x, y ) \ { \ float w; \ bli_cscopys( (y), (w) ); \ bli_sccopys( (x), (y) ); \ bli_sscopys( (w), 
(x) ); \ } #define bli_dcswaps( x, y ) \ { \ double w; \ bli_cdcopys( (y), (w) ); \ bli_dccopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_ccswaps( x, y ) \ { \ scomplex w; \ bli_cccopys( (y), (w) ); \ bli_cccopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zcswaps( x, y ) \ { \ dcomplex w; \ bli_czcopys( (y), (w) ); \ bli_zccopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_szswaps( x, y ) \ { \ float w; \ bli_zscopys( (y), (w) ); \ bli_szcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dzswaps( x, y ) \ { \ double w; \ bli_zdcopys( (y), (w) ); \ bli_dzcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_czswaps( x, y ) \ { \ scomplex w; \ bli_zccopys( (y), (w) ); \ bli_czcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zzswaps( x, y ) \ { \ dcomplex w; \ bli_zzcopys( (y), (w) ); \ bli_zzcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sswaps( x, y ) bli_ssswaps( x, y ) #define bli_dswaps( x, y ) bli_ddswaps( x, y ) #define bli_cswaps( x, y ) bli_ccswaps( x, y ) #define bli_zswaps( x, y ) bli_zzswaps( x, y ) #endif // end bli_swaps.h // begin bli_xpbys.h #ifndef BLIS_XPBYS_H #define BLIS_XPBYS_H // xpbys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// -- (xby) = (??s) ------------------------------------------------------------

// y is float. Every combination of x and b types is forwarded to
// bli_rxxpbyris with six arguments: the real and imaginary parts of x, of b
// and of y, each extracted with the bli_?real / bli_?imag accessor that
// matches the operand's type char. bli_rxxpbyris is defined elsewhere.

#define bli_sssxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dssxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cssxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zssxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )

#define bli_sdsxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddsxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdsxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdsxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )

#define bli_scsxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcsxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccsxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcsxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )

#define bli_szsxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzsxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czsxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzsxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (xby) = (??d) ------------------------------------------------------------

// y is double. Same dispatch as the ??s table above; only the accessors
// applied to y change (bli_dreal / bli_dimag).

#define bli_ssdxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsdxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csdxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsdxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )

#define bli_sddxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dddxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cddxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zddxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_scdxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcdxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccdxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcdxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_szdxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzdxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czdxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzdxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex, a complex y is updated through one of three helpers
// (all defined elsewhere), selected by the types of x and b:
//   - x real,    b real    -> bli_rxxpbyris
//   - x complex, b real    -> bli_crxpbyris
//   - b complex (any x)    -> bli_cxxpbyris
// Each helper receives the real/imag parts of x, b and y, extracted with the
// accessor matching each operand's type char.

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dscxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_cscxpbys( x, b, y )  bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zscxpbys( x, b, y )  bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )

#define bli_sdcxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddcxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdcxpbys( x, b, y )  bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdcxpbys( x, b, y )  bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )

#define bli_sccxpbys( x, b, y )  bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dccxpbys( x, b, y )  bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cccxpbys( x, b, y )  bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zccxpbys( x, b, y )  bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )

#define bli_szcxpbys( x, b, y )  bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzcxpbys( x, b, y )  bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czcxpbys( x, b, y )  bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzcxpbys( x, b, y )  bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dszxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cszxpbys( x, b, y )  bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zszxpbys( x, b, y )  bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sdzxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddzxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdzxpbys( x, b, y )  bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdzxpbys( x, b, y )  bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sczxpbys( x, b, y )  bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dczxpbys( x, b, y )  bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cczxpbys( x, b, y )  bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zczxpbys( x, b, y )  bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_szzxpbys( x, b, y )  bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dzzxpbys( x, b, y )  bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_czzxpbys( x, b, y )  bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zzzxpbys( x, b, y )  bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex, every variant with a complex y is the same expression,
// y = x + b * y, written with the built-in arithmetic operators; the C usual
// arithmetic conversions handle any mix of operand types.

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_dscxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_cscxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zscxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

#define bli_sdcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_ddcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_cdcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zdcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

#define bli_sccxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_dccxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_cccxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zccxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

#define bli_szcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_dzcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_czcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zzcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_dszxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_cszxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zszxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

#define bli_sdzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_ddzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_cdzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zdzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

#define bli_sczxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_dczxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_cczxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zczxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

#define bli_szzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_dzzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_czzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zzzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: all three operands share one type.

#define bli_sxpbys( x, b, y )  bli_sssxpbys( x, b, y )
#define bli_dxpbys( x, b, y )  bli_dddxpbys( x, b, y )
#define bli_cxpbys( x, b, y )  bli_cccxpbys( x, b, y )
#define bli_zxpbys( x, b, y )  bli_zzzxpbys( x, b, y )

#endif
// end bli_xpbys.h
// begin bli_xpbyjs.h

#ifndef BLIS_XPBYJS_H
#define BLIS_XPBYJS_H

// xpbyjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; 
++ii ) bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); 
} else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( 
dim_t jj = 0; jj < n; ++jj ) bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } 
else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( 
dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end bli_copys_mxn.h // begin bli_scal2s_mxn.h #ifndef BLIS_SCAL2S_MXN_H #define BLIS_SCAL2S_MXN_H // scal2s_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \ ctype* restrict y, const inc_t rs_y, const inc_t cs_y \ ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( scal2s_mxn ) #endif // end bli_scal2s_mxn.h // begin bli_xpbys_mxn.h #ifndef BLIS_XPBYS_MXN_H #define BLIS_XPBYS_MXN_H // xpbys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (?ss) ------------------------------------------------------------ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?dd) ------------------------------------------------------------ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?cc) ------------------------------------------------------------ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?zz) ------------------------------------------------------------ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } #endif // end bli_xpbys_mxn.h // begin bli_xpbys_mxn_uplo.h #ifndef BLIS_XPBYS_MXN_UPLO_H #define BLIS_XPBYS_MXN_UPLO_H // xpbys_mxn_u #define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 
0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } // xpbys_mxn_l #define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } 
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_l( diagoff, m, n, 
x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #endif // end bli_xpbys_mxn_uplo.h // -- "broadcast B" scalar macros -- // begin bli_bcastbbs_mxn.h #ifndef BLIS_BCASTBBS_MXN_H #define BLIS_BCASTBBS_MXN_H // bcastbbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = ldy; \ const dim_t ds_y = 1; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yi = y + i*incy; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yij = yi + j*ldy; \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( bcastbbs_mxn ) #endif // end bli_bcastbbs_mxn.h // begin bli_scal2bbs_mxn.h #ifndef BLIS_SCAL2BBS_MXN_H #define BLIS_SCAL2BBS_MXN_H // scal2bbs_mxn #undef GENTFUNCRO #define GENTFUNCRO( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( 
*yij, *yijd ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } \ } INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ const inc_t incx2 = 2 * incx; \ const inc_t ldx2 = 2 * ldx; \ \ const inc_t incy2 = 2 * incy; \ const inc_t ldy2 = 2 * ldy; \ \ ctype_r* restrict alpha_r = ( ctype_r* )alpha; \ ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \ ctype_r* restrict chi_r = ( ctype_r* )x; \ ctype_r* restrict chi_i = ( ctype_r* )x + 1; \ ctype_r* restrict psi_r = ( ctype_r* )y; \ ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ 
PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ } INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn ) #endif // end bli_scal2bbs_mxn.h // begin bli_set0bbs_mxn.h #ifndef BLIS_SET0BBS_MXN_H #define BLIS_SET0BBS_MXN_H // set0bbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yij = yj + i*incy; \ \ for ( dim_t p = 0; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,set0s)( *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( set0bbs_mxn ) #endif // end bli_set0bbs_mxn.h // -- 1m-specific scalar macros -- // 1e // begin bli_copy1es.h #ifndef BLIS_COPY1ES_H #define BLIS_COPY1ES_H // copy1es // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Every variant whose x or y type is real (s or d) expands to an empty
// block: only the complex-to-complex combinations do any work.

#define bli_sscopy1es( a, bri, bir ) {}
#define bli_dscopy1es( a, bri, bir ) {}
#define bli_cscopy1es( a, bri, bir ) {}
#define bli_zscopy1es( a, bri, bir ) {}

#define bli_sdcopy1es( a, bri, bir ) {}
#define bli_ddcopy1es( a, bri, bir ) {}
#define bli_cdcopy1es( a, bri, bir ) {}
#define bli_zdcopy1es( a, bri, bir ) {}

#define bli_sccopy1es( a, bri, bir ) {}
#define bli_dccopy1es( a, bri, bir ) {}

// Complex variants: two copyris calls are issued from the one input a.
//   - bri is passed the pair (  real(a), imag(a) ).
//   - bir is passed the pair ( -imag(a), real(a) ).
// Presumably copyris( xr, xi, yr, yi ) stores (xr, xi) into (yr, yi) --
// confirm against the copyris definitions.
// NOTE: a is expanded four times, so it must be free of side effects.
#define bli_cccopy1es( a, bri, bir ) \
{ \
	bli_cccopyris(  bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopy1es( a, bri, bir ) \
{ \
	bli_zccopyris(  bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopy1es( a, bri, bir ) {}
#define bli_dzcopy1es( a, bri, bir ) {}
#define bli_czcopy1es( a, bri, bir ) \
{ \
	bli_czcopyris(  bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopy1es( a, bri, bir ) \
{ \
	bli_zzcopyris(  bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-character aliases for the same-type complex variants.
#define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir )
#define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir )

#endif

// end bli_copy1es.h
// begin bli_copyj1es.h


#ifndef BLIS_COPYJ1ES_H
#define BLIS_COPYJ1ES_H

// copyj1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Every variant whose x or y type is real (s or d) expands to an empty
// block: only the complex-to-complex combinations do any work.

#define bli_sscopyj1es( a, bri, bir ) {}
#define bli_dscopyj1es( a, bri, bir ) {}
#define bli_cscopyj1es( a, bri, bir ) {}
#define bli_zscopyj1es( a, bri, bir ) {}

#define bli_sdcopyj1es( a, bri, bir ) {}
#define bli_ddcopyj1es( a, bri, bir ) {}
#define bli_cdcopyj1es( a, bri, bir ) {}
#define bli_zdcopyj1es( a, bri, bir ) {}

#define bli_sccopyj1es( a, bri, bir ) {}
#define bli_dccopyj1es( a, bri, bir ) {}

// Complex variants: same shape as copy1es, but with the sign of imag(a)
// flipped in both calls.
//   - bri is passed the pair ( real(a), -imag(a) ).
//   - bir is passed the pair ( imag(a),  real(a) ).
// NOTE: a is expanded four times, so it must be free of side effects.
#define bli_cccopyj1es( a, bri, bir ) \
{ \
	bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_cccopyris( bli_cimag(a),  bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopyj1es( a, bri, bir ) \
{ \
	bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_zccopyris( bli_zimag(a),  bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopyj1es( a, bri, bir ) {}
#define bli_dzcopyj1es( a, bri, bir ) {}
#define bli_czcopyj1es( a, bri, bir ) \
{ \
	bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_czcopyris( bli_cimag(a),  bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopyj1es( a, bri, bir ) \
{ \
	bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_zzcopyris( bli_zimag(a),  bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-character aliases for the same-type complex variants.
#define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir )
#define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir )

#endif

// end bli_copyj1es.h
// begin bli_invert1es.h


#ifndef BLIS_INVERT1ES_H
#define BLIS_INVERT1ES_H

// invert1es

// Apply invertris to bri in place, then rebuild bir from the updated bri.
// The copyris destination arguments are deliberately given in (imag, real)
// order, so the call passes real(bri) toward imag(bir) and -imag(bri) toward
// real(bir). The second statement reads the result of the first; the order
// must not be swapped.
#define bli_cinvert1es( bri, bir ) \
{ \
	bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \
	bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \
}

#define bli_zinvert1es( bri, bir ) \
{ \
	bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \
	bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \
}

#endif

// end bli_invert1es.h
// begin bli_scal1es.h


#ifndef BLIS_SCAL1ES_H
#define BLIS_SCAL1ES_H

// scal1es

// Apply scalris to yri in place using (real(a), imag(a)), then rebuild yir
// from the updated yri by passing the pair ( -imag(yri), real(yri) ) to
// copyris. The second statement reads the result of the first; the order
// must not be swapped.
#define bli_cscal1es( a, yri, yir ) \
{ \
	bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \
	bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \
}

#define bli_zscal1es( a, yri, yir ) \
{ \
	bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \
	bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \
}

#endif

// end bli_scal1es.h
// begin bli_scal21es.h


#ifndef BLIS_SCAL21ES_H
#define BLIS_SCAL21ES_H

// scal21es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// Every variant with a real y (third char s or d), and every variant in
// which both a and x are real, expands to an empty block.
//
// The remaining variants issue two bli_cxscal2ris calls that share the same
// (real(a), imag(a)) scalar:
//   - yri is computed from the pair (  real(x), imag(x) ).
//   - yir is computed from the pair ( -imag(x), real(x) ).
// NOTE: a and x are each expanded several times, so both must be free of
// side effects.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal21es( a, x, yri, yir ) {}
#define bli_sdsscal21es( a, x, yri, yir ) {}
#define bli_scsscal21es( a, x, yri, yir ) {}
#define bli_szsscal21es( a, x, yri, yir ) {}

#define bli_dssscal21es( a, x, yri, yir ) {}
#define bli_ddsscal21es( a, x, yri, yir ) {}
#define bli_dcsscal21es( a, x, yri, yir ) {}
#define bli_dzsscal21es( a, x, yri, yir ) {}

#define bli_cssscal21es( a, x, yri, yir ) {}
#define bli_cdsscal21es( a, x, yri, yir ) {}
#define bli_ccsscal21es( a, x, yri, yir ) {}
#define bli_czsscal21es( a, x, yri, yir ) {}

#define bli_zssscal21es( a, x, yri, yir ) {}
#define bli_zdsscal21es( a, x, yri, yir ) {}
#define bli_zcsscal21es( a, x, yri, yir ) {}
#define bli_zzsscal21es( a, x, yri, yir ) {}

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal21es( a, x, yri, yir ) {}
#define bli_sddscal21es( a, x, yri, yir ) {}
#define bli_scdscal21es( a, x, yri, yir ) {}
#define bli_szdscal21es( a, x, yri, yir ) {}

#define bli_dsdscal21es( a, x, yri, yir ) {}
#define bli_dddscal21es( a, x, yri, yir ) {}
#define bli_dcdscal21es( a, x, yri, yir ) {}
#define bli_dzdscal21es( a, x, yri, yir ) {}

#define bli_csdscal21es( a, x, yri, yir ) {}
#define bli_cddscal21es( a, x, yri, yir ) {}
#define bli_ccdscal21es( a, x, yri, yir ) {}
#define bli_czdscal21es( a, x, yri, yir ) {}

#define bli_zsdscal21es( a, x, yri, yir ) {}
#define bli_zddscal21es( a, x, yri, yir ) {}
#define bli_zcdscal21es( a, x, yri, yir ) {}
#define bli_zzdscal21es( a, x, yri, yir ) {}

// -- (axy) = (??c) ------------------------------------------------------------

// NOTE(review): although y is scomplex in this group, yri and yir are
// accessed through bli_zreal/bli_zimag rather than bli_creal/bli_cimag.
// Presumably the c and z accessors expand identically -- confirm.

#define bli_sscscal21es( a, x, yri, yir ) {}
#define bli_sdcscal21es( a, x, yri, yir ) {}
#define bli_sccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dscscal21es( a, x, yri, yir ) {}
#define bli_ddcscal21es( a, x, yri, yir ) {}
#define bli_dccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cscscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zscscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal21es( a, x, yri, yir ) {}
#define bli_sdzscal21es( a, x, yri, yir ) {}
#define bli_sczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dszscal21es( a, x, yri, yir ) {}
#define bli_ddzscal21es( a, x, yri, yir ) {}
#define bli_dczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cszscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zszscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// Single-character aliases for the same-type complex variants.
#define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir )
#define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir )

#endif

// end bli_scal21es.h
// begin bli_scal2j1es.h


#ifndef BLIS_SCAL2J1ES_H
#define BLIS_SCAL2J1ES_H

// scal2j1es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) #define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) #endif // end bli_scal2j1es.h // 1r // begin bli_copy1rs.h #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copy1rs.h // begin bli_copyj1rs.h #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copyj1rs.h // begin bli_invert1rs.h #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif // end bli_invert1rs.h // begin bli_scal1rs.h #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), 
bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif // end bli_scal1rs.h // begin bli_scal21rs.h #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif // end bli_scal21rs.h // begin bli_scal2j1rs.h #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif // end bli_scal2j1rs.h // 1m (1e or 1r) // begin bli_invert1ms_mxn_diag.h #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t 
i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_invert1ms_mxn_diag.h // begin bli_scal1ms_mxn.h #ifndef BLIS_SCAL1MS_MXN_H #define BLIS_SCAL1MS_MXN_H // scal1ms_mxn #define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; 
\ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #endif // end bli_scal1ms_mxn.h // begin bli_scal21ms_mxn.h #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } 
else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), 
*(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif // end bli_scal21ms_mxn.h // begin bli_scal21ms_mxn_diag.h #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_scal21ms_mxn_diag.h // begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING //thread communicators may be implementation dependent #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_single.h // begin bli_thrcomm_openmp.h #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H // Define thrcomm_t for situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #include // skipped // Define thrcomm_t for tree barriers and non-tree barriers. #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. 
//volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif // end bli_thrcomm_openmp.h // begin bli_thrcomm_pthreads.h #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H // Define thrcomm_t for situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS #ifdef BLIS_USE_PTHREAD_BARRIER struct thrcomm_s { void* sent_object; dim_t n_threads; bli_pthread_barrier_t barrier; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_pthreads.h // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. 
// Lifecycle of a thread communicator. The create/free variants take a
// rntm_t; the init/cleanup variants operate on a caller-provided thrcomm_t.
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads );
void       bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
void       bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm );
void       bli_thrcomm_cleanup( thrcomm_t* comm );

// Exported barrier and broadcast entry points (these are what
// bli_thread_barrier() and bli_thread_broadcast() call).
BLIS_EXPORT_BLIS void  bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );

void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );

#endif
// end bli_thrcomm.h

// Include thread info (thrinfo_t) object definitions and prototypes.
// begin bli_thrinfo.h

#ifndef BLIS_THRINFO_H
#define BLIS_THRINFO_H

// Thread info structure definition
struct thrinfo_s
{
    // The thread communicator for the other threads sharing the same work
    // at this level.
    thrcomm_t*         ocomm;

    // Our thread id within the ocomm thread communicator.
    dim_t              ocomm_id;

    // The number of distinct threads used to parallelize the loop.
    dim_t              n_way;

    // What we're working on.
    dim_t              work_id;

    // When freeing, should the communicators in this node be freed? Usually,
    // this field is true, but when nodes are created that share the same
    // communicators as other nodes (such as with packm nodes), this is set
    // to false.
    bool               free_comm;

    // The bszid_t to help identify the node. This is mostly only useful when
    // debugging or tracing the allocation and release of thrinfo_t nodes.
    bszid_t            bszid;

    // Links to further thrinfo_t nodes. NOTE(review): presumably the child
    // node(s) for the next level of the thread tree -- confirm against the
    // code that builds these nodes.
    struct thrinfo_s*  sub_prenode;
    struct thrinfo_s*  sub_node;
};
typedef struct thrinfo_s thrinfo_t;

//
// thrinfo_t functions
// NOTE: The naming of these should be made consistent at some point.
// (ie: bli_thrinfo_ vs.
bli_thread_) // // thrinfo_t query (field only) BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) { return (t->ocomm)->n_threads; } BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) { return t->ocomm_id; } BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) { return t->n_way; } BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t ) { return t->work_id; } BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) { return t->ocomm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) { return t->free_comm; } BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) { return t->bszid; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) { return t->sub_node; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) { return t->ocomm_id == 0; } // thrinfo_t modification BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) { t->ocomm = ocomm; } BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) { t->ocomm_id = ocomm_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) { t->n_way = n_way; } BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t ) { t->work_id = work_id; } BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) { t->free_comm = free_comm; } BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) { t->bszid = bszid; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) { t->sub_node = sub_node; } BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) { t->sub_prenode = sub_prenode; } // other thrinfo_t-related functions BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } // // Prototypes 
// for level-3 thrinfo functions not specific to any operation.
//

// The create and init prototypes take the same per-node arguments, which
// match the fields of thrinfo_t one for one; create additionally takes a
// rntm_t and returns a thrinfo_t*, init fills a caller-provided node.
thrinfo_t* bli_thrinfo_create
     (
       rntm_t*    rntm,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init
     (
       thrinfo_t* thread,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init_single
     (
       thrinfo_t* thread
     );

void bli_thrinfo_free
     (
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// -----------------------------------------------------------------------------

// Prototypes that grow a thrinfo_t structure alongside a cntl_t control tree.
// The *_prenode variants mirror the plain ones (presumably operating on the
// sub_prenode link -- confirm against the implementation).
void bli_thrinfo_grow
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_rgrow
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_rgrow_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

// -----------------------------------------------------------------------------

// Disabled prototypes, kept for reference.
#if 0
void bli_thrinfo_grow_tree
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

void bli_thrinfo_grow_tree_ic
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );
#endif

#endif
// end bli_thrinfo.h
// begin bli_thrinfo_sup.h

#ifndef BLIS_THRINFO_SUP_H
#define BLIS_THRINFO_SUP_H

//
// Prototypes for level-3 thrinfo sup functions.
//

// These parallel bli_thrinfo_grow/rgrow/create_for_cntl above, but take
// bszid_t pointers where the conventional versions take cntl_t pointers.
void bli_thrinfo_sup_grow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_sup_rgrow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_sup_create_for_cntl
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_chl,
       thrinfo_t* thread_par
     );

#endif
// end bli_thrinfo_sup.h

// Include some operation-specific thrinfo_t prototypes.
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
// Takes the level-3 function to run (func) plus the operand, context,
// runtime and control-tree arguments; these mirror the l3int_t signature
// except that no thrinfo_t is supplied by the caller.
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// Include definitions specific to the method of multithreading for the
// conventional code path.
// begin bli_l3_decor_single.h

#ifndef BLIS_L3_DECOR_SINGLE_H
#define BLIS_L3_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (Currently empty.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif
// end bli_l3_decor_single.h
// begin bli_l3_decor_openmp.h

#ifndef BLIS_L3_DECOR_OPENMP_H
#define BLIS_L3_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

void bli_l3_thread_decorator_thread_check
     (
       dim_t      n_threads,
       dim_t      tid,
       thrcomm_t* gl_comm,
       rntm_t*    rntm
     );

#endif

#endif
// end bli_l3_decor_openmp.h
// begin bli_l3_decor_pthreads.h

#ifndef BLIS_L3_DECOR_PTHREADS_H
#define BLIS_L3_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
void* bli_l3_thread_entry( void* data_void );

#endif

#endif
// end bli_l3_decor_pthreads.h

#endif
// end bli_l3_decor.h

// Include the level-3 thread decorator and related definitions and prototypes
// for the sup code path.
// begin bli_l3_sup_decor.h

#ifndef BLIS_L3_SUP_DECOR_H
#define BLIS_L3_SUP_DECOR_H

// -- sup definitions ----------------------------------------------------------

// Level-3 sup internal function type.
// Unlike l3int_t, this type returns an err_t and takes no cntl_t argument.
typedef err_t (*l3supint_t)
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// Level-3 sup thread decorator prototype.
err_t bli_l3_sup_thread_decorator
     (
       l3supint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     );

// Include definitions specific to the method of multithreading for the
// sup code path.
// begin bli_l3_sup_decor_single.h #ifndef BLIS_L3_SUP_DECOR_SINGLE_H #define BLIS_L3_SUP_DECOR_SINGLE_H // Definitions specific to situations when multithreading is disabled. #ifndef BLIS_ENABLE_MULTITHREADING #endif #endif // end bli_l3_sup_decor_single.h // begin bli_l3_sup_decor_openmp.h #ifndef BLIS_L3_SUP_DECOR_OPENMP_H #define BLIS_L3_SUP_DECOR_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #endif #endif // end bli_l3_sup_decor_openmp.h // begin bli_l3_sup_decor_pthreads.h #ifndef BLIS_L3_SUP_DECOR_PTHREADS_H #define BLIS_L3_SUP_DECOR_PTHREADS_H // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. void* bli_l3_sup_thread_entry( void* data_void ); #endif #endif // end bli_l3_sup_decor_pthreads.h #endif // end bli_l3_sup_decor.h // Initialization-related prototypes. void bli_thread_init( void ); void bli_thread_finalize( void ); // Thread range-related prototypes. 
// Writes a sub-range through start and end. (This is the function that
// bli_thread_range_jrir_sl() calls for slab partitioning.)
BLIS_EXPORT_BLIS
void bli_thread_range_sub
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end
     );

// GENPROT stamps out one prototype per operation name; PASTEMAC0 (defined
// elsewhere) builds the function name from opname.
#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       dir_t      direct, \
       thrinfo_t* thr, \
       obj_t*     a, \
       obj_t*     b, \
       obj_t*     c, \
       cntl_t*    cntl, \
       cntx_t*    cntx, \
       dim_t*     start, \
       dim_t*     end  \
     );

GENPROT( thread_range_mdim )
GENPROT( thread_range_ndim )

// GENPROT is redefined here with a different parameter list for the
// directional and weighted range functions.
#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       thrinfo_t* thr, \
       obj_t*     a, \
       blksz_t*   bmult, \
       dim_t*     start, \
       dim_t*     end  \
     );

GENPROT( thread_range_l2r )
GENPROT( thread_range_r2l )
GENPROT( thread_range_t2b )
GENPROT( thread_range_b2t )

GENPROT( thread_range_weighted_l2r )
GENPROT( thread_range_weighted_r2l )
GENPROT( thread_range_weighted_t2b )
GENPROT( thread_range_weighted_b2t )


dim_t bli_thread_range_width_l
     (
       doff_t diagoff_j,
       dim_t  m,
       dim_t  n_j,
       dim_t  j,
       dim_t  n_way,
       dim_t  bf,
       dim_t  bf_left,
       double area_per_thr,
       bool   handle_edge_low
     );

siz_t bli_find_area_trap_l
     (
       dim_t  m,
       dim_t  n,
       doff_t diagoff
     );

siz_t bli_thread_range_weighted_sub
     (
       thrinfo_t* restrict thread,
       doff_t              diagoff,
       uplo_t              uplo,
       dim_t               m,
       dim_t               n,
       dim_t               bf,
       bool                handle_edge_low,
       dim_t*     restrict j_start_thr,
       dim_t*     restrict j_end_thr
     );

// -----------------------------------------------------------------------------

// Factorization and partitioning prototypes

// State passed between bli_prime_factorization() and
// bli_next_prime_factor().
typedef struct
{
    dim_t n;
    dim_t sqrt_n;
    dim_t f;
} bli_prime_factors_t;

void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors);

dim_t bli_next_prime_factor(bli_prime_factors_t* factors);

bool bli_is_prime( dim_t n );

// The three partition_2x2 prototypes share one signature; the results are
// written through nt1 and nt2.
void bli_thread_partition_2x2
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

void bli_thread_partition_2x2_slow
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

void bli_thread_partition_2x2_fast
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

//
----------------------------------------------------------------------------- dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); // ----------------------------------------------------------------------------- BLIS_INLINE void bli_thread_range_jrir_rr ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; } BLIS_INLINE void bli_thread_range_jrir_sl ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use contiguous slab partitioning of jr/ir loops. bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); *inc = 1; } BLIS_INLINE void bli_thread_range_jrir ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Define a general-purpose version of bli_thread_range_jrir() whose // definition depends on whether slab or round-robin partitioning was // requested at configure-time. 
#ifdef BLIS_ENABLE_JRIR_SLAB bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); #else bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); #endif } #if 0 BLIS_INLINE void bli_thread_range_weighted_jrir ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { #ifdef BLIS_ENABLE_JRIR_SLAB // Use contiguous slab partitioning for jr/ir loops. bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, handle_edge_low, start, end ); *start = *start / bf; *inc = 1; if ( *end % bf ) *end = *end / bf + 1; else *end = *end / bf; #else // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; #endif } #endif #endif // end bli_thread.h // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t     bli_pthread_barrier_t;
typedef pthread_barrierattr_t bli_pthread_barrierattr_t;

#endif

// -- pthreads macros --

#define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#define BLIS_PTHREAD_COND_INITIALIZER  PTHREAD_COND_INITIALIZER
#define BLIS_PTHREAD_ONCE_INIT         PTHREAD_ONCE_INIT

#endif

// -- Function definitions -----------------------------------------------------

// The prototypes below are declared once for every branch above; their
// parameter lists parallel the pthread functions named in each heading.

// -- pthread_create(), pthread_join() --

BLIS_EXPORT_BLIS int bli_pthread_create
     (
       bli_pthread_t*            thread,
       const bli_pthread_attr_t* attr,
       void*                   (*start_routine)(void*),
       void*                     arg
     );

BLIS_EXPORT_BLIS int bli_pthread_join
     (
       bli_pthread_t thread,
       void**        retval
     );

// -- pthread_mutex_*() --

BLIS_EXPORT_BLIS int bli_pthread_mutex_init
     (
       bli_pthread_mutex_t*           mutex,
       const bli_pthread_mutexattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_lock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock
     (
       bli_pthread_mutex_t* mutex
     );

// -- pthread_cond_*() --

BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t*           cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t*  cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void              (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//     __imp_CompareObjectHandles referenced in function
//     bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t*           barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int                     count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h

// -- Constant definitions --

// begin bli_extern_defs.h

#ifndef BLIS_EXTERN_DEFS_H
#define BLIS_EXTERN_DEFS_H

// Exported global objects. NOTE(review): the obj_t names suggest scalar
// constants (2, 1, 0, -1, -2) -- confirm against their definitions.
BLIS_EXPORT_BLIS extern obj_t BLIS_TWO;
BLIS_EXPORT_BLIS extern obj_t BLIS_ONE;
//BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF;
BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO;
//BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF;
BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE;
BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO;

// Exported global communicator and thread-info objects.
BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM;
BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED;
BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED;

#endif
// end bli_extern_defs.h

// -- BLIS architecture/kernel definitions --

// begin bli_l1v_ker_prot.h

//
// Define template prototypes for level-1v kernels.
//

// Each macro below expands to one kernel prototype. PASTEMAC(ch,opname)
// (defined elsewhere) builds the function name from the type character and
// operation name; ctype is the element type of the vector operands.
//
// NOTE: No macro ends with a trailing line continuation after its closing
// ");". A dangling backslash there would splice whatever line follows
// (e.g. the next #define) into the macro body if the separating blank line
// were ever lost.

#define ADDV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define AMAXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t* restrict cntx  \
     );


#define AXPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define AXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define COPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define DOTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );


#define DOTXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );


#define INVERTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );


#define SCALV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );


#define SCAL2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define SETV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );


#define SUBV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define SWAPV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define XPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// end bli_l1v_ker_prot.h
// begin bli_l1f_ker_prot.h

//
// Define template prototypes for level-1f kernels.
// #define AXPY2V_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alphax, \ ctype* restrict alphay, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define AXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define DOTAXPYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); // end bli_l1f_ker_prot.h // begin bli_l1m_ker_prot.h // // Define template prototypes for level-1m kernels. 
//

// native packm kernels

// PACKM_KER_PROT( ctype, ch, varname )
// Expands to the prototype of the void function named PASTEMAC(ch,varname).
// The generated function takes a conjugation flag, a pack schema, dimensions
// cdim, n and n_max, a scalar kappa, a source operand a (increments inca and
// lda), a destination operand p (increment ldp), and a context pointer.
#define PACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx \
     );

// native unpackm kernels

// UNPACKM_KER_PROT( ctype, ch, varname )
// Expands to the prototype of the void function named PASTEMAC(ch,varname).
// The generated function takes a conjugation flag, a dimension n, a scalar
// kappa, an operand p (increment ldp), an operand a (increments inca and
// lda), and a context pointer.
#define UNPACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       dim_t            n, \
       ctype*  restrict kappa, \
       ctype*  restrict p,             inc_t ldp, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       cntx_t* restrict cntx \
     );

// 1e/1r packm kernels

// PACKM_1ER_KER_PROT( ctype, ch, varname )
// The 1e/1r packm prototype has exactly the same parameter list as the native
// packm prototype, so it is expressed in terms of PACKM_KER_PROT (the same
// delegation pattern GEMM_UKR_PROT uses with GEMM_UKR_PROT2 below).
#define PACKM_1ER_KER_PROT( ctype, ch, varname ) PACKM_KER_PROT( ctype, ch, varname )

// end bli_l1m_ker_prot.h
// begin bli_l3_ukr_prot.h

//
// Define template prototypes for level-3 micro-kernels.
//

// GEMM_UKR_PROT( ctype, ch, opname )
// Convenience form of GEMM_UKR_PROT2 in which the input and output element
// types are the same.
#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)

// GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname )
// Expands to the prototype of the void function named PASTEMAC(ch,opname).
// a and b use ctype_in; alpha, beta and c use ctype_out. The generated
// function also takes dimensions m, n and k, the two increments of c
// (rs_c, cs_c), an auxinfo_t pointer, and a context pointer.
#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype_out* restrict alpha, \
       ctype_in*  restrict a, \
       ctype_in*  restrict b, \
       ctype_out* restrict beta, \
       ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     );

// GEMMTRSM_UKR_PROT( ctype, ch, opname )
// Expands to the prototype of the void function named PASTEMAC(ch,opname).
// The generated function takes dimensions m, n and k, a scalar alpha, the
// operands a1x, a11, bx1 and b11, an operand c11 with increments rs_c and
// cs_c, an auxinfo_t pointer, and a context pointer.
#define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a1x, \
       ctype*     restrict a11, \
       ctype*     restrict bx1, \
       ctype*     restrict b11, \
       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     );

// TRSM_UKR_PROT( ctype, ch, opname )
// Expands to the prototype of the void function named PASTEMAC(ch,opname).
// The generated function takes operands a and b, an operand c with
// increments rs_c and cs_c, an auxinfo_t pointer, and a context pointer.
#define TRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     );

// end bli_l3_ukr_prot.h
//
begin bli_l3_sup_ker_prot.h // // Define template prototypes for level-3 kernels on small/unpacked matrices. // #define GEMMSUP_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); // end bli_l3_sup_ker_prot.h // begin bli_arch_config_pre.h #ifndef BLIS_ARCH_CONFIG_PRE_H #define BLIS_ARCH_CONFIG_PRE_H // -- Naming-related kernel definitions ---------------------------------------- // The default suffix appended to reference kernels. #define BLIS_REF_SUFFIX _ref // A suffix used for labeling certain induced method aware functions. #define BLIS_IND_SUFFIX _ind // Add an underscore to the BLIS kernel set string, if it was defined. #ifdef BLIS_CNAME #define BLIS_CNAME_INFIX PASTECH(_,BLIS_CNAME) #endif // Combine the CNAME and _ref for convenience to the code that defines // reference kernels. //#define BLIS_CNAME_REF_SUFFIX PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX) // -- Prototype-generating macro definitions ----------------------------------- // Prototype-generating macro for bli_cntx_init_*() functions. 
#define CNTX_INIT_PROTS( archname ) \ \ void PASTEMAC(cntx_init_,archname) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \ ( \ ind_t method, \ cntx_t* cntx \ ); #endif // end bli_arch_config_pre.h // begin bli_arch_config.h #ifndef BLIS_ARCH_CONFIG_H #define BLIS_ARCH_CONFIG_H // // -- Context initialization prototypes ---------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_CONFIG_SKX CNTX_INIT_PROTS( skx ) #endif #ifdef BLIS_CONFIG_KNL CNTX_INIT_PROTS( knl ) #endif #ifdef BLIS_CONFIG_KNC CNTX_INIT_PROTS( knc ) #endif #ifdef BLIS_CONFIG_HASWELL CNTX_INIT_PROTS( haswell ) #endif #ifdef BLIS_CONFIG_SANDYBRIDGE CNTX_INIT_PROTS( sandybridge ) #endif #ifdef BLIS_CONFIG_PENRYN CNTX_INIT_PROTS( penryn ) #endif // -- AMD64 architectures -- #ifdef BLIS_CONFIG_ZEN3 CNTX_INIT_PROTS( zen3 ) #endif #ifdef BLIS_CONFIG_ZEN2 CNTX_INIT_PROTS( zen2 ) #endif #ifdef BLIS_CONFIG_ZEN CNTX_INIT_PROTS( zen ) #endif #ifdef BLIS_CONFIG_EXCAVATOR CNTX_INIT_PROTS( excavator ) #endif #ifdef BLIS_CONFIG_STEAMROLLER CNTX_INIT_PROTS( steamroller ) #endif #ifdef BLIS_CONFIG_PILEDRIVER CNTX_INIT_PROTS( piledriver ) #endif #ifdef BLIS_CONFIG_BULLDOZER CNTX_INIT_PROTS( bulldozer ) #endif // -- ARM architectures -- #ifdef BLIS_CONFIG_ARMSVE CNTX_INIT_PROTS( armsve ) #endif #ifdef BLIS_CONFIG_A64FX CNTX_INIT_PROTS( a64fx ) #endif #ifdef BLIS_CONFIG_FIRESTORM CNTX_INIT_PROTS( firestorm ) #endif #ifdef BLIS_CONFIG_THUNDERX2 CNTX_INIT_PROTS( thunderx2 ) #endif #ifdef BLIS_CONFIG_CORTEXA57 CNTX_INIT_PROTS( cortexa57 ) #endif #ifdef BLIS_CONFIG_CORTEXA53 CNTX_INIT_PROTS( cortexa53 ) #endif #ifdef BLIS_CONFIG_CORTEXA15 CNTX_INIT_PROTS( cortexa15 ) #endif #ifdef BLIS_CONFIG_CORTEXA9 CNTX_INIT_PROTS( cortexa9 ) #endif // -- IBM Power -- #ifdef BLIS_CONFIG_POWER10 CNTX_INIT_PROTS( power10 ) #endif #ifdef BLIS_CONFIG_POWER9 CNTX_INIT_PROTS( power9 ) #endif #ifdef 
BLIS_CONFIG_POWER7 CNTX_INIT_PROTS( power7 ) #endif // -- IBM BG/Q -- #ifdef BLIS_CONFIG_BGQ CNTX_INIT_PROTS( bgq ) #endif // -- Generic -- #ifdef BLIS_CONFIG_GENERIC CNTX_INIT_PROTS( generic ) #endif // // -- Architecture family-specific headers ------------------------------------- // // -- x86_64 families -- #ifdef BLIS_FAMILY_INTEL64 #include "bli_family_intel64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64 #include "bli_family_amd64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64_LEGACY #include "bli_family_amd64_legacy.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64 #include "bli_family_x86_64.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_SKX #include "bli_family_x86_64_no_skx.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN2 #include "bli_family_x86_64_no_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN3 #include "bli_family_x86_64_no_zen3.h" // skipped #endif // -- Intel64 architectures -- #ifdef BLIS_FAMILY_SKX #include "bli_family_skx.h" // skipped #endif #ifdef BLIS_FAMILY_KNL #include "bli_family_knl.h" // skipped #endif #ifdef BLIS_FAMILY_KNC #include "bli_family_knc.h" // skipped #endif #ifdef BLIS_FAMILY_HASWELL #include "bli_family_haswell.h" // skipped #endif #ifdef BLIS_FAMILY_SANDYBRIDGE #include "bli_family_sandybridge.h" // skipped #endif #ifdef BLIS_FAMILY_PENRYN #include "bli_family_penryn.h" // skipped #endif // -- AMD64 architectures -- #ifdef BLIS_FAMILY_ZEN3 #include "bli_family_zen3.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN2 #include "bli_family_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN #include "bli_family_zen.h" // skipped #endif #ifdef BLIS_FAMILY_EXCAVATOR #include "bli_family_excavator.h" // skipped #endif #ifdef BLIS_FAMILY_STEAMROLLER #include "bli_family_steamroller.h" // skipped #endif #ifdef BLIS_FAMILY_PILEDRIVER #include "bli_family_piledriver.h" // skipped #endif #ifdef BLIS_FAMILY_BULLDOZER #include "bli_family_bulldozer.h" // skipped #endif // -- ARM families -- #ifdef BLIS_FAMILY_ARM64 #include 
"bli_family_arm64.h" // skipped #endif #ifdef BLIS_FAMILY_ARM32 #include "bli_family_arm32.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_FAMILY_ARMSVE #include "bli_family_armsve.h" // skipped #endif #ifdef BLIS_FAMILY_A64FX #include "bli_family_a64fx.h" // skipped #endif #ifdef BLIS_FAMILY_FIRESTORM // begin bli_family_firestorm.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x12 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 12 #define BLIS_DEFAULT_MC_S 120 //1536 //336 //416 // 1280 //160 // 160 // 160 //2048 //336 #define BLIS_DEFAULT_KC_S 640 //1536 //336 //704 //1280 //672 //528 // 856 //2048 //528 #define BLIS_DEFAULT_NC_S 3072 #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_6x8 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DEFAULT_MC_D 120 //1536 //160 //80 //176 #define BLIS_DEFAULT_KC_D 240 //1536 //304 //336 //368 #define BLIS_DEFAULT_NC_D 3072 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_DEFAULT_MC_C 64 #define BLIS_DEFAULT_KC_C 128 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_Z 8 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 128 #define BLIS_DEFAULT_NC_Z 4096 #endif //#endif // end bli_family_firestorm.h #endif #ifdef BLIS_FAMILY_THUNDERX2 #include "bli_family_thunderx2.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA57 #include "bli_family_cortexa57.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA53 #include "bli_family_cortexa53.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA15 #include "bli_family_cortexa15.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA9 #include "bli_family_cortexa9.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_FAMILY_POWER10 #include "bli_family_power10.h" // skipped #endif #ifdef 
BLIS_FAMILY_POWER9 #include "bli_family_power9.h" // skipped #endif #ifdef BLIS_FAMILY_POWER7 #include "bli_family_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_FAMILY_BGQ #include "bli_family_bgq.h" // skipped #endif // -- Generic -- #ifdef BLIS_FAMILY_GENERIC #include "bli_family_generic.h" // skipped #endif // // -- kernel set prototypes ---------------------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_KERNELS_SKX #include "bli_kernels_skx.h" // skipped #endif #ifdef BLIS_KERNELS_KNL #include "bli_kernels_knl.h" // skipped #endif #ifdef BLIS_KERNELS_KNC #include "bli_kernels_knc.h" // skipped #endif #ifdef BLIS_KERNELS_HASWELL #include "bli_kernels_haswell.h" // skipped #endif #ifdef BLIS_KERNELS_SANDYBRIDGE #include "bli_kernels_sandybridge.h" // skipped #endif #ifdef BLIS_KERNELS_PENRYN #include "bli_kernels_penryn.h" // skipped #endif // -- AMD64 architectures -- #ifdef BLIS_KERNELS_ZEN2 #include "bli_kernels_zen2.h" // skipped #endif #ifdef BLIS_KERNELS_ZEN #include "bli_kernels_zen.h" // skipped #endif //#ifdef BLIS_KERNELS_EXCAVATOR //#include "bli_kernels_excavator.h" //#endif //#ifdef BLIS_KERNELS_STEAMROLLER //#include "bli_kernels_steamroller.h" //#endif #ifdef BLIS_KERNELS_PILEDRIVER #include "bli_kernels_piledriver.h" // skipped #endif #ifdef BLIS_KERNELS_BULLDOZER #include "bli_kernels_bulldozer.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_KERNELS_ARMSVE #include "bli_kernels_armsve.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV8A // begin bli_kernels_armv8a.h PACKM_KER_PROT( float, s, packm_armv8a_int_8xk ) PACKM_KER_PROT( float, s, packm_armv8a_int_12xk ) PACKM_KER_PROT( double, d, packm_armv8a_int_6xk ) PACKM_KER_PROT( double, d, packm_armv8a_int_8xk ) GEMM_UKR_PROT( float, s, gemm_armv8a_asm_8x12 ) GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8 ) // GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8r ) // GEMM_UKR_PROT( double, d, gemm_armv8a_asm_8x4 ) // GEMM_UKR_PROT( double, d, 
gemm_armv8a_asm_4x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_4x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_4x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_8x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_int_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_int_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x3 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_int_6x4mn ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_int_3x8mn ) // end bli_kernels_armv8a.h #endif #ifdef BLIS_KERNELS_ARMV7A #include "bli_kernels_armv7a.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_KERNELS_POWER10 #include "bli_kernels_power10.h" // skipped #endif #ifdef BLIS_KERNELS_POWER9 #include "bli_kernels_power9.h" // skipped #endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_KERNELS_BGQ #include "bli_kernels_bgq.h" // skipped #endif #endif // end bli_arch_config.h // begin bli_kernel_macro_defs.h #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H // -- Define default threading parameters -------------------------------------- // -- Conventional (large code path) values -- // These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n // dimensions for the purposes of factorizing the total number of threads into // ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these // macros are used. 
// Each default below is only applied when the macro has not already been
// defined (e.g. by a family header or on the compiler command line).
#ifndef BLIS_THREAD_RATIO_M
#define BLIS_THREAD_RATIO_M     1
#endif

#ifndef BLIS_THREAD_RATIO_N
#define BLIS_THREAD_RATIO_N     1
#endif

// These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of
// parallelism allowed when performing automatic factorization. See bli_rntm.c
// to see how these macros are used.
#ifndef BLIS_THREAD_MAX_IR
#define BLIS_THREAD_MAX_IR      1
#endif

#ifndef BLIS_THREAD_MAX_JR
#define BLIS_THREAD_MAX_JR      4
#endif

// The sup-path threading defaults below are compiled out.
#if 0
// -- Skinny/small possibly-unpacked (sup code path) values --

#ifndef BLIS_THREAD_SUP_RATIO_M
#define BLIS_THREAD_SUP_RATIO_M   1
#endif

#ifndef BLIS_THREAD_SUP_RATIO_N
#define BLIS_THREAD_SUP_RATIO_N   2
#endif

#ifndef BLIS_THREAD_SUP_MAX_IR
#define BLIS_THREAD_SUP_MAX_IR    1
#endif

#ifndef BLIS_THREAD_SUP_MAX_JR
#define BLIS_THREAD_SUP_MAX_JR    8
#endif
#endif

// -- Memory allocation --------------------------------------------------------

// hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with
// libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND
// was explicitly defined.
#ifdef BLIS_DISABLE_MEMKIND
  #undef BLIS_ENABLE_MEMKIND
#endif
#ifdef BLIS_ENABLE_MEMKIND
// The header name had been dropped from this directive, leaving a bare
// "#include", which is a preprocessing error whenever memkind is enabled.
// hbw_malloc()/hbw_free(), selected below for the pool allocator, are
// declared by hbwmalloc.h.
#include <hbwmalloc.h> // skipped
#endif

// Memory allocation functions. These macros define the three types of
// malloc()-style functions, and their free() counterparts: one for each
// type of memory to be allocated.
// NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING
// THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc()
// and free():
//
//   void* malloc( size_t size );
//   void  free( void* p );
//

// This allocation function is called to allocate memory for blocks within
// BLIS's internal memory pools.
#ifndef BLIS_MALLOC_POOL
  // If use of libmemkind was enabled at configure-time, the default
  // memory allocation function for memory pools should be hbw_malloc()
  // instead of malloc().
  #ifdef  BLIS_ENABLE_MEMKIND
  #define BLIS_MALLOC_POOL               hbw_malloc
  #else
  #define BLIS_MALLOC_POOL               malloc
  #endif
#endif

#ifndef BLIS_FREE_POOL
  // If use of libmemkind was enabled at configure-time, the default
  // memory deallocation function for memory pools should be hbw_free()
  // instead of free().
  #ifdef  BLIS_ENABLE_MEMKIND
  #define BLIS_FREE_POOL                 hbw_free
  #else
  #define BLIS_FREE_POOL                 free
  #endif
#endif

// This allocation function is called to allocate memory for internally-
// used objects and structures, such as control tree nodes.
#ifndef BLIS_MALLOC_INTL
#define BLIS_MALLOC_INTL                 malloc
#endif

#ifndef BLIS_FREE_INTL
#define BLIS_FREE_INTL                   free
#endif

// This allocation function is called to allocate memory for objects
// created by user-level API functions, such as bli_obj_create().
#ifndef BLIS_MALLOC_USER
#define BLIS_MALLOC_USER                 malloc
#endif

#ifndef BLIS_FREE_USER
#define BLIS_FREE_USER                   free
#endif

// -- Other system-related definitions -----------------------------------------

// Size of a virtual memory page. This is used to align blocks within the
// memory pools.
#ifndef BLIS_PAGE_SIZE
#define BLIS_PAGE_SIZE                   4096
#endif

// The maximum number of named SIMD vector registers available for use.
// When configuring with umbrella configuration families, this should be
// set to the maximum number of registers across all sub-configurations in
// the family.
#ifndef BLIS_SIMD_MAX_NUM_REGISTERS
#define BLIS_SIMD_MAX_NUM_REGISTERS      32
#endif

// The maximum size (in bytes) of each SIMD vector.
// When configuring with umbrella configuration families, this should be
// set to the maximum SIMD size across all sub-configurations in the family.
#ifndef BLIS_SIMD_MAX_SIZE
#define BLIS_SIMD_MAX_SIZE               64
#endif

// Alignment size (in bytes) needed by the instruction set for aligned
// SIMD/vector instructions.
#ifndef BLIS_SIMD_ALIGN_SIZE
#define BLIS_SIMD_ALIGN_SIZE             BLIS_SIMD_MAX_SIZE
#endif

// The maximum size in bytes of local stack buffers within macro-kernel
// functions.
// These buffers are usually used to store a temporary copy
// of a single microtile. The reason we multiply by 2 is to handle induced
// methods, where we use real domain register blocksizes in units of
// complex elements. Specifically, the macro-kernels will need this larger
// micro-tile footprint, even though the virtual micro-kernels will only
// ever be writing to half (real or imaginary part) at a time.
#ifndef BLIS_STACK_BUF_MAX_SIZE
#define BLIS_STACK_BUF_MAX_SIZE          ( BLIS_SIMD_MAX_NUM_REGISTERS * BLIS_SIMD_MAX_SIZE * 2 )
#endif

// Alignment size used to align local stack buffers within macro-kernel
// functions.
#ifndef BLIS_STACK_BUF_ALIGN_SIZE
#define BLIS_STACK_BUF_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when allocating memory via BLIS_MALLOC_USER.
// To disable heap alignment, set this to 1.
#ifndef BLIS_HEAP_ADDR_ALIGN_SIZE
#define BLIS_HEAP_ADDR_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when sizing leading dimensions of memory allocated
// via BLIS_MALLOC_USER.
#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE
#define BLIS_HEAP_STRIDE_ALIGN_SIZE      BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment sizes used when allocating blocks to the internal memory
// pool, via BLIS_MALLOC_POOL. All four default to the page size.
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A
#define BLIS_POOL_ADDR_ALIGN_SIZE_A      BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B
#define BLIS_POOL_ADDR_ALIGN_SIZE_B      BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C
#define BLIS_POOL_ADDR_ALIGN_SIZE_C      BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN
#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN    BLIS_PAGE_SIZE
#endif

// Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*.
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A #define BLIS_POOL_ADDR_OFFSET_SIZE_A 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B #define BLIS_POOL_ADDR_OFFSET_SIZE_B 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C #define BLIS_POOL_ADDR_OFFSET_SIZE_C 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif #endif // end bli_kernel_macro_defs.h // -- Base operation prototypes -- // begin bli_init.h BLIS_EXPORT_BLIS void bli_init( void ); BLIS_EXPORT_BLIS void bli_finalize( void ); void bli_init_auto( void ); void bli_finalize_auto( void ); void bli_init_apis( void ); void bli_finalize_apis( void ); void bli_init_once( void ); void bli_finalize_once( void ); // end bli_init.h // begin bli_malloc.h // Typedef function pointer types for malloc() and free() substitutes. //typedef void* (*malloc_ft) ( size_t size ); //typedef void (*free_ft) ( void* p ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size ); BLIS_EXPORT_BLIS void bli_free_pool( void* p ); #endif void* bli_malloc_intl( size_t size, err_t* r_val ); void* bli_calloc_intl( size_t size, err_t* r_val ); void bli_free_intl( void* p ); BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val ); BLIS_EXPORT_BLIS void bli_free_user( void* p ); // ----------------------------------------------------------------------------- void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val ); void bli_ffree_align( free_ft f, void* p ); void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val ); void bli_ffree_noalign( free_ft f, void* p ); void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size ); void bli_fmalloc_post_check( void* p ); // end bli_malloc.h // begin bli_const.h void bli_const_init( void ); void bli_const_finalize( void ); // end bli_const.h // begin bli_obj.h // begin bli_obj_check.h void bli_obj_create_check( num_t dt, dim_t m, dim_t 
n, inc_t rs, inc_t cs, obj_t* obj ); void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj ); void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_create_scalar_check( num_t dt, obj_t* obj ); void bli_obj_free_check( obj_t* obj ); void bli_obj_create_const_check( double value, obj_t* obj ); void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b ); void bli_dt_size_check( num_t dt ); void bli_dt_string_check( num_t dt ); void bli_dt_union_check( num_t dt1, num_t dt2 ); void bli_obj_print_check( char* label, obj_t* obj ); // end bli_obj_check.h BLIS_EXPORT_BLIS void bli_obj_create ( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer ( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_without_buffer ( num_t dt, dim_t m, dim_t n, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_alloc_buffer ( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_attach_buffer ( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1 ( num_t dt, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer ( num_t dt, void* p, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_conf_to ( obj_t* s, obj_t* d ); BLIS_EXPORT_BLIS void bli_obj_free ( obj_t* obj ); void bli_adjust_strides ( dim_t m, dim_t n, siz_t elem_size, inc_t* rs, inc_t* cs, inc_t* is ); BLIS_EXPORT_BLIS siz_t bli_dt_size ( num_t dt ); BLIS_EXPORT_BLIS char* bli_dt_string ( num_t dt ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult ( dim_t dim, dim_t dim_mult ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size ( dim_t dim, siz_t elem_size, siz_t align_size ); BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size ( void* p, size_t align_size ); BLIS_EXPORT_BLIS void bli_obj_print ( char* label, 
obj_t* obj ); // end bli_obj.h // begin bli_obj_scalar.h BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached ( num_t dt, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of ( num_t dt, conj_t conj, obj_t* alpha, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_detach ( obj_t* a, obj_t* alpha ); BLIS_EXPORT_BLIS void bli_obj_scalar_attach ( conj_t conj, obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to ( num_t dt, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar ( obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_reset ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_equals ( obj_t* a, obj_t* beta ); // end bli_obj_scalar.h // begin bli_blksz.h // blksz_t query BLIS_INLINE dim_t bli_blksz_get_def ( num_t dt, blksz_t* b ) { return b->v[ dt ]; } BLIS_INLINE dim_t bli_blksz_get_max ( num_t dt, blksz_t* b ) { return b->e[ dt ]; } // blksz_t modification BLIS_INLINE void bli_blksz_set_def ( dim_t val, num_t dt, blksz_t* b ) { b->v[ dt ] = val; } BLIS_INLINE void bli_blksz_set_max ( dim_t val, num_t dt, blksz_t* b ) { b->e[ dt ] = val; } BLIS_INLINE void bli_blksz_copy ( blksz_t* b_src, blksz_t* b_dst ) { *b_dst = *b_src; } BLIS_INLINE void bli_blksz_copy_if_pos ( blksz_t* b_src, blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that // we can skip the ones that are non-positive. 
const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); } BLIS_INLINE void bli_blksz_copy_def_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); bli_blksz_set_def( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_max_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); bli_blksz_set_max( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_scale_def ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_def( dt, b ); bli_blksz_set_def( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_max( dt, b ); bli_blksz_set_max( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_def_max ( 
dim_t num, dim_t den, num_t dt, blksz_t* b ) { bli_blksz_scale_def( num, den, dt, b ); bli_blksz_scale_max( num, den, dt, b ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS blksz_t* bli_blksz_create ( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_easy ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z ); BLIS_EXPORT_BLIS void bli_blksz_free ( blksz_t* b ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); #endif void bli_blksz_reduce_def_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); void bli_blksz_reduce_max_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); // ----------------------------------------------------------------------------- dim_t bli_determine_blocksize ( dir_t direct, dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_b ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); dim_t bli_determine_blocksize_b_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); // end bli_blksz.h // begin bli_func.h // ----------------------------------------------------------------------------- 
// func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); // end bli_func.h // begin bli_mbool.h // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // ----------------------------------------------------------------------------- mbool_t* bli_mbool_create ( bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_init ( mbool_t* b, bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_free( mbool_t* b ); // end bli_mbool.h // begin bli_cntx.h #ifndef BLIS_CNTX_H #define BLIS_CNTX_H // Context object type (defined in bli_type_defs.h) // ----------------------------------------------------------------------------- // // -- cntx_t query (fields only) ----------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx ) { return cntx->blkszs; } BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) { return cntx->bmults; } 
BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) { return cntx->l3_vir_ukrs; } BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs; } BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs_prefs; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) { return cntx->l3_sup_thresh; } BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) { return cntx->l3_sup_blkszs; } BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) { return cntx->l3_sup_kers; } BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) { return cntx->l3_sup_kers_prefs; } BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) { return cntx->l1f_kers; } BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) { return cntx->l1v_kers; } BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) { return cntx->packm_kers; } BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) { return cntx->unpackm_kers; } BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) { return cntx->method; } // ----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. 
return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // 
----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. 
return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only 
index to the requested packm func_t if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* funcs = bli_cntx_packm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the packm func_t (and then extract the // datatype-specific function pointer) if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested unpackm func_t if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the unpackm func_t (and then extract the // datatype-specific function pointer) if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. 
return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. // NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. 
// NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } #if 0 // NOTE: These static functions aren't needed yet. 
BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { const num_t dt = bli_obj_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); } #endif // ----------------------------------------------------------------------------- // // -- cntx_t modification (complex) -------------------------------------------- // // NOTE: The framework does not use any of the following functions. We provide // them in order to facilitate creating/modifying custom contexts. 
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif // end bli_rntm.h // begin bli_gks.h #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* 
bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif // end bli_gks.h // begin bli_ind.h #ifndef BLIS_IND_H #define BLIS_IND_H // level-3 induced method management // begin bli_l3_ind.h #ifndef BLIS_L3_IND_H #define BLIS_L3_IND_H // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- //bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ); void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ); void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ); bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ); #endif // end bli_l3_ind.h void bli_ind_init( void ); void bli_ind_finalize( void ); BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t 
method, num_t dt );
BLIS_EXPORT_BLIS void   bli_ind_disable_dt( ind_t method, num_t dt );
BLIS_EXPORT_BLIS void   bli_ind_disable_all_dt( num_t dt );

BLIS_EXPORT_BLIS void   bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt );

BLIS_EXPORT_BLIS bool   bli_ind_oper_is_impl( opid_t oper, ind_t method );
BLIS_EXPORT_BLIS ind_t  bli_ind_oper_find_avail( opid_t oper, num_t dt );
BLIS_EXPORT_BLIS char*  bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt );

// Not exported (no BLIS_EXPORT_BLIS on these two declarations).
char*  bli_ind_get_impl_string( ind_t method );
num_t  bli_ind_map_cdt_to_index( num_t dt );

#endif
// end bli_ind.h
// begin bli_pba.h

#ifndef BLIS_MEMBRK_H
#define BLIS_MEMBRK_H

// Packing block allocator (formerly memory broker)
//
// Inline accessors for the fields of pba_t (pools[], align_size, malloc_fp,
// free_fp, mutex) followed by prototypes of the out-of-line pba routines.

// pba init

//BLIS_INLINE void bli_pba_init_mutex( pba_t* pba )
//{
//	bli_pthread_mutex_init( &(pba->mutex), NULL );
//}

//BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba )
//{
//	bli_pthread_mutex_destroy( &(pba->mutex) );
//}

// pba query

// Return the address of element pool_index of pba->pools. No bounds check is
// performed here.
BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba )
{
	return &(pba->pools[ pool_index ]);
}

// Return pba->align_size.
BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba )
{
	return pba->align_size;
}

// Return the malloc-style function pointer stored in the pba.
BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba )
{
	return pba->malloc_fp;
}

// Return the free-style function pointer stored in the pba.
BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba )
{
	return pba->free_fp;
}

// pba modification

// Store align_size into pba->align_size.
BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba )
{
	pba->align_size = align_size;
}

// Store malloc_fp into pba->malloc_fp.
BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba )
{
	pba->malloc_fp = malloc_fp;
}

// Store free_fp into pba->free_fp.
BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba )
{
	pba->free_fp = free_fp;
}

// pba action

// Lock pba->mutex via bli_pthread_mutex_lock(). The return value of the
// lock call is not inspected.
BLIS_INLINE void bli_pba_lock( pba_t* pba )
{
	bli_pthread_mutex_lock( &(pba->mutex) );
}

// Unlock pba->mutex via bli_pthread_mutex_unlock(). The return value of the
// unlock call is not inspected.
BLIS_INLINE void bli_pba_unlock( pba_t* pba )
{
	bli_pthread_mutex_unlock( &(pba->mutex) );
}

// -----------------------------------------------------------------------------

// Prototypes only; the implementations are not part of this header section.

BLIS_EXPORT_BLIS pba_t* bli_pba_query( void );

void bli_pba_init
     (
       cntx_t* cntx
     );
void bli_pba_finalize
     (
       void
     );

void bli_pba_acquire_m
     (
       rntm_t*   rntm,
       siz_t     req_size,
       packbuf_t buf_type,
       mem_t*    mem
     );

void bli_pba_release
     (
       rntm_t* rntm,
       mem_t*  mem
     );

// Fetch the pba returned by bli_pba_query() and attach it to rntm through
// bli_rntm_set_pba().
BLIS_INLINE void bli_pba_rntm_set_pba
     (
       rntm_t* rntm
     )
{
	pba_t* pba = bli_pba_query();

	bli_rntm_set_pba( pba, rntm );
}

siz_t bli_pba_pool_size
     (
       pba_t*    pba,
       packbuf_t buf_type
     );

// ----------------------------------------------------------------------------

void bli_pba_init_pools
     (
       cntx_t* cntx,
       pba_t*  pba
     );
void bli_pba_finalize_pools
     (
       pba_t* pba
     );

// NOTE(review): bs_a/bs_b/bs_c are pointer parameters and so look like
// outputs -- confirm against the implementation.
void bli_pba_compute_pool_block_sizes
     (
       siz_t*  bs_a,
       siz_t*  bs_b,
       siz_t*  bs_c,
       cntx_t* cntx
     );
void bli_pba_compute_pool_block_sizes_dt
     (
       num_t   dt,
       siz_t*  bs_a,
       siz_t*  bs_b,
       siz_t*  bs_c,
       cntx_t* cntx
     );

#endif

// end bli_pba.h
// begin bli_pool.h

#ifndef BLIS_POOL_H
#define BLIS_POOL_H

// -- Pool block type --

// -- Pool type --

// Pool block query

// Return pblk->buf.
BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk )
{
	return pblk->buf;
}

// Return pblk->block_size.
BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk )
{
	return pblk->block_size;
}

// Pool block modification

// Store buf into pblk->buf.
BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk )
{
	pblk->buf = buf;
}

// Store block_size into pblk->block_size.
BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk )
{
	pblk->block_size = block_size;
}

//
// -- pool block initialization ------------------------------------------------
//

// NOTE: This initializer macro must be updated whenever fields are added or
// removed from the pblk_t type definition. An alternative to the initializer is
// calling bli_pblk_clear() at runtime.
#define BLIS_PBLK_INITIALIZER \ { \ .buf = NULL, \ .block_size = 0, \ } \ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); bli_pblk_set_block_size( 0, pblk ); } // Pool entry query BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) { return pool->block_size; } BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) { return pool->align_size; } BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) { return pool->offset_size; } BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) { return pool->malloc_fp; } BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) { return pool->free_fp; } BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); } // Pool entry modification BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ { pool->block_size = block_size; } BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ { pool->align_size = align_size; } BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ { pool->offset_size = offset_size; } BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ { pool->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* 
pool ) \ { pool->free_fp = free_fp; } BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } // ----------------------------------------------------------------------------- void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ); void bli_pool_finalize ( pool_t* restrict pool ); void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ); void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ); void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ); void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ); void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ); void bli_pool_print ( pool_t* restrict pool ); void bli_pblk_print ( pblk_t* restrict pblk ); #endif // end bli_pool.h // begin bli_array.h #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // 
----------------------------------------------------------------------------- void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ); void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ); void bli_array_finalize ( array_t* restrict array ); void* bli_array_elem ( const siz_t index, array_t* restrict array ); void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ); #endif // end bli_array.h // begin bli_apool.h #ifndef BLIS_APOOL_H #define BLIS_APOOL_H // -- Locked pool-of-arrays type -- // apool entry query BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool ) { return &(apool->pool); } BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) { return &(apool->mutex); } BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) { return pool->def_array_len; } BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) { pool_t* restrict pool = bli_apool_pool( apool ); return bli_pool_is_exhausted( pool ); } // apool action BLIS_INLINE void bli_apool_lock( apool_t* apool ) { bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); } BLIS_INLINE void bli_apool_unlock( apool_t* apool ) { bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); } // apool entry modification BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ { pool->def_array_len = def_array_len; } // ----------------------------------------------------------------------------- void bli_apool_init ( apool_t* restrict apool ); void bli_apool_finalize ( apool_t* restrict apool ); array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ); void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ); pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ); void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool ); void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ); void bli_apool_free_block 
( array_t* restrict array ); #endif // end bli_apool.h // begin bli_sba.h #ifndef BLIS_SBA_H #define BLIS_SBA_H apool_t* bli_sba_query( void ); // ----------------------------------------------------------------------------- void bli_sba_init( void ); void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( const siz_t n_threads ); void bli_sba_checkin_array ( array_t* restrict array ); void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm ); void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size ); void bli_sba_release ( rntm_t* restrict rntm, void* restrict block ); #endif // end bli_sba.h // begin bli_memsys.h #ifndef BLIS_MEMSYS_H #define BLIS_MEMSYS_H // ----------------------------------------------------------------------------- void bli_memsys_init( void ); void bli_memsys_finalize( void ); #endif // end bli_memsys.h // begin bli_mem.h #ifndef BLIS_MEM_H #define BLIS_MEM_H // mem_t object type (defined in bli_type_defs.h) // // -- mem_t query -------------------------------------------------------------- // BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) { return &(mem->pblk); } BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) { return bli_pblk_buf( bli_mem_pblk( mem ) ); } BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) { return mem->buf_type; } BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) { return mem->pool; } BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) { return mem->size; } BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); } // // -- mem_t modification ------------------------------------------------------- // BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem ) { mem->pblk = *pblk; } BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem ) { bli_pblk_set_buf( buf, &(mem->pblk) ); } BLIS_INLINE void bli_mem_set_buf_type( packbuf_t 
buf_type, mem_t* mem ) { mem->buf_type = buf_type; } BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem ) { mem->pool = pool; } BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; } // // -- mem_t initialization ----------------------------------------------------- // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the mem_t type definition. An alternative to the initializer is // calling bli_mem_clear() at runtime. #define BLIS_MEM_INITIALIZER \ { \ .pblk = BLIS_PBLK_INITIALIZER, \ .buf_type = -1, \ .pool = NULL, \ .size = 0, \ } \ BLIS_INLINE void bli_mem_clear( mem_t* mem ) { bli_mem_set_buffer( NULL, mem ); #ifdef __cplusplus const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE; // When using C++, which is strongly typed, we avoid use of -1 as a // packbuf_t value since it will result in a compile-time error. bli_mem_set_buf_type( pb, mem ); #else bli_mem_set_buf_type( ( packbuf_t )-1, mem ); #endif bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); } #endif // end bli_mem.h // begin bli_part.h // begin bli_part_check.h void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); // end bli_part_check.h // -- Matrix partitioning ------------------------------------------------------ BLIS_EXPORT_BLIS void bli_acquire_mpart ( dim_t i, dim_t j, dim_t m, dim_t n, obj_t* obj, obj_t* sub_obj ); #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) GENPROT( acquire_mpart_b2t ) GENPROT( acquire_mpart_l2r ) GENPROT( acquire_mpart_r2l ) GENPROT( acquire_mpart_tl2br ) GENPROT( 
acquire_mpart_br2tl ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ dir_t direct, \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_mdim ) GENPROT( acquire_mpart_ndim ) GENPROT( acquire_mpart_mndim ) // -- Vector partitioning ------------------------------------------------------ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_vpart_f2b ) GENPROT( acquire_vpart_b2f ) // -- Scalar acquisition ------------------------------------------------------- BLIS_EXPORT_BLIS void bli_acquire_mij ( dim_t i, dim_t j, obj_t* obj, obj_t* sub_obj ); BLIS_EXPORT_BLIS void bli_acquire_vi ( dim_t i, obj_t* obj, obj_t* sub_obj ); // end bli_part.h // begin bli_prune.h void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ); // end bli_prune.h // begin bli_query.h BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a ); // end bli_query.h // begin bli_auxinfo.h #ifndef BLIS_AUXINFO_MACRO_DEFS_H #define BLIS_AUXINFO_MACRO_DEFS_H // auxinfo_t field query BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) { return ai->schema_a; } BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) { return ai->schema_b; } BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) { return ai->a_next; } BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) { return ai->b_next; } BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) { return ai->is_a; } BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) { return ai->is_b; } BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) { return ai->ps_a; } BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) { return ai->ps_b; } BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) { 
return ai->ukr; } BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) { return ai->params; } // auxinfo_t field modification BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai ) { ai->schema_a = schema; } BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) { ai->schema_b = schema; } BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) { ai->a_next = p; } BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) { ai->b_next = p; } BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; } BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai ) { ai->is_a = is; } BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) { ai->is_b = is; } BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai ) { ai->ps_a = ps; } BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) { ai->ps_b = ps; } BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { ai->ukr = ukr; } BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) { ai->params = params; } #endif // end bli_auxinfo.h // begin bli_param_map.h // --- BLIS to BLAS/LAPACK mappings -------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval ); // --- BLAS/LAPACK to BLIS mappings -------------------------------------------- // NOTE: These static functions were converted from regular functions in order // to reduce function call overhead within the BLAS compatibility layer. 
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( 
subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); err_t bli_check_valid_cntl( void* cntl ); err_t bli_check_packm_schema_on_unpack( obj_t* a ); err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); // end bli_check.h // begin bli_error.h BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); void bli_print_msg( char* str, char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); char* bli_error_string_for_code( gint_t code ); // end bli_error.h // begin bli_f2c.h // f2c.h -- Standard Fortran to C header file // barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." // - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) #ifndef BLIS_F2C_H #define BLIS_F2C_H typedef f77_int bla_integer; typedef f77_char bla_character; //typedef char *address; //typedef short int shortint; typedef float bla_real; typedef double bla_double; typedef scomplex bla_scomplex; typedef dcomplex bla_dcomplex; typedef f77_int bla_logical; //typedef short int shortlogical; //typedef char logical1; //typedef char integer1; #ifdef INTEGER_STAR_8 // Adjust for integer*8. 
typedef long long longint; // system-dependent typedef unsigned long long ulongint; // system-dependent #define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) #define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) #endif #ifndef TRUE_ #define TRUE_ (1) #endif #ifndef FALSE_ #define FALSE_ (0) #endif // Extern is for use with -E #ifndef Extern #define Extern extern #endif // I/O stuff #ifdef f2c_i2 // for -i2 //typedef short flag; //typedef short ftnlen; typedef bla_integer ftnlen; //typedef short ftnint; #else //typedef long int flag; //typedef long int ftnlen; typedef bla_integer ftnlen; //typedef long int ftnint; #endif #ifndef VOID #define VOID void #endif #ifndef f2c_abs #define f2c_abs(x) ((x) >= 0 ? (x) : -(x)) #endif #ifndef f2c_dabs #define f2c_dabs(x) (doublereal)f2c_abs(x) #endif #ifndef f2c_min #define f2c_min(a,b) ((a) <= (b) ? (a) : (b)) #endif #ifndef f2c_max #define f2c_max(a,b) ((a) >= (b) ? (a) : (b)) #endif #ifndef f2c_dmin #define f2c_dmin(a,b) (doublereal)f2c_min(a,b) #endif #ifndef f2c_dmax #define f2c_dmax(a,b) (doublereal)f2c_max(a,b) #endif #ifndef bit_test #define bit_test(a,b) ((a) >> (b) & 1) #endif #ifndef bit_clear #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) #endif #ifndef bit_set #define bit_set(a,b) ((a) | ((uinteger)1 << (b))) #endif // undef any lower-case symbols that your C compiler predefines, e.g.: #ifndef Skip_f2c_Undefs #undef cray #undef gcos #undef mc68010 #undef mc68020 #undef mips #undef pdp11 #undef sgi #undef sparc #undef sun #undef sun2 #undef sun3 #undef sun4 #undef u370 #undef u3b #undef u3b2 #undef u3b5 #undef unix #undef vax #endif #endif // end bli_f2c.h // begin bli_machval.h // begin bli_lsame.h bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len ); // end bli_lsame.h // begin bli_slamch.h bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len ); // end bli_slamch.h // begin bli_dlamch.h bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len ); // end 
bli_dlamch.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v ); // // Prototype BLAS-like interfaces. // #undef GENTPROTR #define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \ ( \ machval_t mval, \ void* v \ ); INSERT_GENTPROTR_BASIC0( machval ) // end bli_machval.h // begin bli_getopt.h typedef struct getopt_s { char* optarg; int optind; int opterr; int optopt; } getopt_t; BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state ); BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ); // end bli_getopt.h // begin bli_opid.h BLIS_INLINE bool bli_opid_is_level3( opid_t opid ) { return ( bool ) ( BLIS_GEMM <= opid && opid <= BLIS_TRSM ); } // end bli_opid.h // begin bli_cntl.h // -- Control tree prototypes -- BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, void* params, cntl_t* sub_node ); BLIS_EXPORT_BLIS void bli_cntl_free_node ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_clear_node ( cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_mark_family ( opid_t family, cntl_t* cntl ); // ----------------------------------------------------------------------------- dim_t bli_cntl_calc_num_threads_in ( rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- // cntl_t query (fields only) BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl ) { return cntl->family; } 
BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl ) { return cntl->bszid; } BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl ) { return cntl->var_func; } BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl ) { return cntl->sub_prenode; } BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl ) { return cntl->sub_node; } BLIS_INLINE void* bli_cntl_params( cntl_t* cntl ) { return cntl->params; } BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl ) { // The first 64 bytes is always the size of the params structure. return *( ( uint64_t* )(cntl->params) ); } BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) { return &(cntl->pack_mem); } // cntl_t query (complex) BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl ) { return ( bool ) ( cntl == NULL ); } BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl ) { return ( bool ) ( bli_cntl_sub_node( cntl ) == NULL ); } BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl ) { return ( bool ) ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); } // cntl_t modification BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl ) { cntl->family = family; } BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl ) { cntl->bszid = bszid; } BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl ) { cntl->var_func = var_func; } BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl ) { cntl->sub_prenode = sub_prenode; } BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl ) { cntl->sub_node = sub_node; } BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl ) { cntl->params = params; } BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) { cntl->pack_mem = *pack_mem; } // end bli_cntl.h // begin bli_env.h #ifndef BLIS_ENV_H #define BLIS_ENV_H gint_t bli_env_get_var( const char* env, gint_t fallback ); //void bli_env_set_var( const char* env, dim_t value ); #endif // end bli_env.h // begin bli_pack.h #ifndef BLIS_PACK_H #define BLIS_PACK_H 
// Remainder of bli_pack.h. The closing #endif below pairs with the
// BLIS_PACK_H include guard opened just before this point.

void bli_pack_init( void );
void bli_pack_finalize( void );

// Getters write the current setting through the pointer argument; setters
// take the new setting by value.
BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a );
BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b );
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a );
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b );

void bli_pack_init_rntm_from_env( rntm_t* rntm );

#endif

// end bli_pack.h
// begin bli_info.h

// -- General library information ----------------------------------------------

BLIS_EXPORT_BLIS char* bli_info_get_version_str( void );
BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void );

// -- General configuration-related --------------------------------------------

// Each query below takes no arguments and returns a gint_t.
BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void );
// bli_info.h, continued: more argument-less gint_t configuration queries.
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void );

// -- Kernel implementation-related --------------------------------------------

// -- Level-3 kernel definitions --

// Each returns a string, selected by induced-method id and datatype.
BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt );

// -- BLIS implementation query (level-3) --------------------------------------

BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm3_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt );

// end bli_info.h
// begin bli_arch.h

#ifndef BLIS_ARCH_H
#define BLIS_ARCH_H

BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void );

void bli_arch_set_id_once( void );
void bli_arch_set_id( void );

BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id );

void bli_arch_set_logging( bool dolog );
bool bli_arch_get_logging( void );
// Variadic; NOTE(review): presumably printf-style formatting -- confirm in
// bli_arch.c.
void bli_arch_log( char*, ... );

#endif

// end bli_arch.h
// begin bli_cpuid.h

// Disabled block: stand-in definitions so this header can be compiled on its
// own, without the rest of BLIS.
#if 0
  // Used only during standalone testing of ARM support.
  #define FALSE 0
  #define TRUE 1
  typedef enum
  {
	BLIS_ARCH_CORTEXA57 = 10,
	BLIS_ARCH_CORTEXA15 = 11,
	BLIS_ARCH_CORTEXA9 = 12,
	BLIS_ARCH_GENERIC = 13
  } arch_t;
  typedef uint64_t bool;
  #define bli_abort abort
#endif

#ifndef BLIS_CPUID_H
#define BLIS_CPUID_H

arch_t bli_cpuid_query_id( void );

// Intel
bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features );

// AMD
bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features );

// ARM
// Note: the ARM predicates take (model, part, features) rather than the
// (family, model, features) triple used by the x86 predicates above.
bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );

uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features );

// -----------------------------------------------------------------------------

//
// This section of the file was based off of cpuid.hpp from TBLIS [1].
//
// [1] https://github.com/devinamatthews/tblis
//

// Returns true only when every bit set in `want` is also set in `have`.
BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want )
{
	return ( have & want ) == want;
}

// -----------------------------------------------------------------------------

#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)

// cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393
// for more information why this move was made.
//#include "cpuid.h"

// x86 branch of the architecture #if that opens just before this point.
void get_cpu_name( char *cpu_name );
int vpu_count( void );

enum
{
	VENDOR_INTEL = 0,
	VENDOR_AMD,
	VENDOR_UNKNOWN
};
// Every FEATURE_* value below is a distinct single bit.
enum
{
	FEATURE_SSE3 = 0x0001,
	FEATURE_SSSE3 = 0x0002,
	FEATURE_SSE41 = 0x0004,
	FEATURE_SSE42 = 0x0008,
	FEATURE_AVX = 0x0010,
	FEATURE_AVX2 = 0x0020,
	FEATURE_FMA3 = 0x0040,
	FEATURE_FMA4 = 0x0080,
	FEATURE_AVX512F = 0x0100,
	FEATURE_AVX512DQ = 0x0200,
	FEATURE_AVX512PF = 0x0400,
	FEATURE_AVX512ER = 0x0800,
	FEATURE_AVX512CD = 0x1000,
	FEATURE_AVX512BW = 0x2000,
	FEATURE_AVX512VL = 0x4000
};

#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)

// ARM branch: VENDOR_UNKNOWN has a different numeric value here (1) than in
// the x86 branch (2), since the two enums are defined independently.
char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath );

enum
{
	VENDOR_ARM = 0,
	VENDOR_UNKNOWN
};
enum
{
	MODEL_ARMV7 = 0,
	MODEL_ARMV8,
	MODEL_UNKNOWN
};
enum
{
	FEATURE_NEON = 0x01,
	FEATURE_SVE = 0x02
};

#endif

#endif

// end bli_cpuid.h
// begin bli_string.h

void bli_string_mkupper( char* s );

// end bli_string.h
// begin bli_setgetijm.h

// Object-based element setter: (ar, ai) are passed as doubles regardless of
// the object's datatype; (i, j) select the element of b.
BLIS_EXPORT_BLIS err_t bli_setijm
     (
       double ar,
       double ai,
       dim_t i,
       dim_t j,
       obj_t* b
     );

// Typed counterpart: the matrix is given as an untyped buffer plus row and
// column strides.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double ar, \
       double ai, \
       dim_t i, \
       dim_t j, \
       void* restrict b, inc_t rs, inc_t cs \
     );

INSERT_GENTPROT_BASIC0( setijm )

// -----------------------------------------------------------------------------

BLIS_EXPORT_BLIS err_t bli_getijm
      (
        dim_t i,
        dim_t j,
        obj_t* b,
        double* ar,
        double* ai
      );

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       dim_t i, \
       dim_t j, \
       void* restrict b, inc_t rs, inc_t cs, \
       double* ar, \
       double* ai \
     );

INSERT_GENTPROT_BASIC0( getijm )

// end bli_setgetijm.h
// begin bli_setgetijv.h

BLIS_EXPORT_BLIS err_t bli_setijv
     (
       double ar,
       double ai,
       dim_t i,
       obj_t* x
     );

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double ar, \
       double ai, \
       dim_t i, \
       void* restrict x, inc_t incx \
     );

INSERT_GENTPROT_BASIC0( setijv )

// -----------------------------------------------------------------------------

BLIS_EXPORT_BLIS err_t bli_getijv
      (
        dim_t i,
        obj_t* x,
        double* ar,
        double* ai
      );

// NOTE(review): the vector buffer parameter is named `b` here but `x` in the
// matching setijv prototype above; harmless in a prototype, but inconsistent.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       dim_t i, \
       void* restrict b, inc_t incx, \
       double* ar, \
       double* ai \
     );

INSERT_GENTPROT_BASIC0( getijv )

// end bli_setgetijv.h
// begin bli_setri.h

// -- setr ---------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_setrm
     (
       obj_t* alpha,
       obj_t* b
     );

BLIS_EXPORT_BLIS void bli_setrv
     (
       obj_t* alpha,
       obj_t* x
     );

// -- seti ---------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_setim
     (
       obj_t* alpha,
       obj_t* b
     );

BLIS_EXPORT_BLIS void bli_setiv
     (
       obj_t* alpha,
       obj_t* x
     );

// end bli_setri.h
// begin bli_castm.h

//
// Prototype object-based interface.
//

BLIS_EXPORT_BLIS void bli_castm
     (
       obj_t* a,
       obj_t* b
     );

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//

// Two type characters (cha, chb) are pasted into the name, one per operand.
#undef  GENTPROT2
#define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \
     ( \
       trans_t transa, \
       dim_t m, \
       dim_t n, \
       void* a, inc_t rs_a, inc_t cs_a, \
       void* b, inc_t rs_b, inc_t cs_b \
     );

INSERT_GENTPROT2_BASIC0( castm )
INSERT_GENTPROT2_MIXDP0( castm )

//
// Prototype object-based _check() function.
//

void bli_castm_check
     (
       obj_t* a,
       obj_t* b
     );

// end bli_castm.h
// begin bli_castnzm.h

//
// Prototype object-based interface.
//

BLIS_EXPORT_BLIS void bli_castnzm
     (
       obj_t* a,
       obj_t* b
     );

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// Typed level-0 prototypes. GENTPROT templates take one type (ctype);
// GENTPROTR templates additionally take a second type (ctype_r), used below
// for the absq/zeta operands.

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTPROT_BASIC0( addsc )
INSERT_GENTPROT_BASIC0( divsc )
INSERT_GENTPROT_BASIC0( mulsc )
INSERT_GENTPROT_BASIC0( subsc )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi \
     );

INSERT_GENTPROT_BASIC0( invertsc )

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype* chi, \
       ctype_r* absq \
     );

INSERT_GENTPROTR_BASIC0( absqsc )
INSERT_GENTPROTR_BASIC0( normfsc )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTPROT_BASIC0( sqrtsc )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype* chi, \
       double* zeta_r, \
       double* zeta_i \
     );

INSERT_GENTPROT_BASIC0( getsc )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double zeta_r, \
       double zeta_i, \
       ctype* chi \
     );

INSERT_GENTPROT_BASIC0( setsc )

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype* chi, \
       ctype_r* zeta_r, \
       ctype_r* zeta_i \
     );

INSERT_GENTPROTR_BASIC0( unzipsc )

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype_r* zeta_r, \
       ctype_r* zeta_i, \
       ctype* chi \
     );

INSERT_GENTPROTR_BASIC0( zipsc )

// -----------------------------------------------------------------------------

// Hand-written getsc/setsc variants whose chi operand is a dim_t.

BLIS_EXPORT_BLIS void bli_igetsc
     (
       dim_t* chi,
       double* zeta_r,
       double* zeta_i
     );

BLIS_EXPORT_BLIS void bli_isetsc
     (
       double zeta_r,
       double zeta_i,
       dim_t* chi
     );

// end bli_l0_tapi.h
// begin bli_l0_ft.h

//
// -- Level-0 function types ---------------------------------------------------
//

// Each GENTDEF/GENTDEFR template below typedefs a function-pointer type whose
// name is pasted together from the type character, the operation name and tsuf.

// addsc, divsc, subsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTDEF( addsc )
INSERT_GENTDEF( divsc )
INSERT_GENTDEF( subsc )

// invertsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi \
     );

INSERT_GENTDEF( invertsc )

// mulsc
// (same parameter list as the addsc/divsc/subsc template above)

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTDEF( mulsc )

// absqsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype_r* absq \
     );

INSERT_GENTDEFR( absqsc )

// normfsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype_r* norm \
     );

INSERT_GENTDEFR( normfsc )

// sqrtsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTDEF( sqrtsc )

// getsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       double* zeta_r, \
       double* zeta_i \
     );

INSERT_GENTDEF( getsc )

// setsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       double zeta_r, \
       double zeta_i, \
       ctype* chi \
     );

INSERT_GENTDEF( setsc )

// unzipsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype_r* zeta_r, \
       ctype_r* zeta_i \
     );

INSERT_GENTDEFR( unzipsc )

// zipsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype_r* zeta_r, \
       ctype_r* zeta_i, \
       ctype* chi \
     );

INSERT_GENTDEFR( zipsc )

// end bli_l0_ft.h

// Generate function pointer arrays for tapi functions.
// begin bli_l0_fpa.h

//
// Prototype function pointer query interface.
//

// Declares <opname>_qfp( dt ), whose return type is the <opname>_vft type.
#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( absqsc )
GENPROT( normfsc )
GENPROT( addsc )
GENPROT( divsc )
GENPROT( mulsc )
GENPROT( subsc )
GENPROT( invertsc )
GENPROT( sqrtsc )
GENPROT( unzipsc )
GENPROT( zipsc )

GENPROT( getsc )
GENPROT( setsc )

// end bli_l0_fpa.h

// copysc
// begin bli_copysc.h

//
// Prototype object-based interfaces.
//

#undef  GENFRONT
#define GENFRONT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* chi, \
       obj_t* psi \
     );

GENFRONT( copysc )

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//

#undef  GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \
     ( \
       conj_t conjchi, \
       void* chi, \
       void* psi \
     );

INSERT_GENTPROT2_BASIC0( copysc )
INSERT_GENTPROT2_MIX_D0( copysc )
INSERT_GENTPROT2_MIX_P0( copysc )

// end bli_copysc.h
// end bli_l0.h

// -- Level-1v operations --

// begin bli_l1v.h
// begin bli_l1v_check.h

//
// Prototype object-based check functions.
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h" // begin bli_l1v_ft_ker.h #ifndef BLIS_L1V_FT_KER_H #define BLIS_L1V_FT_KER_H // // -- Level-1v kernel function types ------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict index, \ cntx_t* cntx \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ cntx_t* cntx \ ); 
// Instantiates the dotxv kernel type from the GENTDEF template defined
// immediately before this point.
INSERT_GENTDEF( dotxv )

// invertv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( invertv )

// scalv, setv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjalpha, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( scalv )
INSERT_GENTDEF( setv )

// swapv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( swapv )

// xpbyv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( xpbyv )

#endif

// end bli_l1v_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// (The expansion begins with a comma so it can be appended directly after the
// last ordinary parameter.)
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_l1v_oapi.h

//
// Prototype object-based interfaces.
//

// First pass over bli_l1v_oapi.h, with the expert macros from bli_oapi_ex.h
// in force: names get the EX_SUF suffix and BLIS_OAPI_EX_PARAMS appends the
// cntx/rntm parameters.

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* index \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( amaxv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpbyv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y, \
       obj_t* beta, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotxv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( invertv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( scalv )
GENTPROT( setv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( swapv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( xpbyv )

// end bli_l1v_oapi.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS   cntx_t* cntx = NULL; ( void )cntx; \
                             rntm_t* rntm = NULL; ( void )rntm;

// end bli_oapi_ba.h
// begin bli_l1v_oapi.h

//
// Prototype object-based interfaces.
//

// Second pass over bli_l1v_oapi.h, with the basic macros from bli_oapi_ba.h
// in force: EX_SUF and BLIS_OAPI_EX_PARAMS both expand to nothing, so these
// are the unsuffixed prototypes without cntx/rntm parameters.

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* index \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( amaxv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpbyv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y, \
       obj_t* beta, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotxv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( invertv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( scalv )
GENTPROT( setv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( swapv )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( xpbyv )

// end bli_l1v_oapi.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// (The expansion begins with a comma so it can be appended directly after the
// last ordinary parameter.)
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1v_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion starts with a comma because the prototypes place this
// macro directly after their last regular argument, with no comma of their own.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1d_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// NOTE: GENTPROT (and GENTPROTR for setid, whose alpha uses ctype_r instead
// of ctype) is redefined once per distinct argument list; the
// INSERT_GENTPROT*_BASIC0( op ) lines (macros defined elsewhere) expand the
// definition current at that point.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//

// Each GENTDEF / GENTDEFR below typedefs a function-pointer type whose
// argument list mirrors the prototype of the operation(s) named in the
// comment above it. INSERT_GENTDEF / INSERT_GENTDEFR are defined elsewhere.

// addd, copyd, subd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// NOTE: GENTPROT (and GENTPROTR for setid, whose alpha uses ctype_r instead
// of ctype) is redefined once per distinct argument list; the
// INSERT_GENTPROT*_BASIC0( op ) lines (macros defined elsewhere) expand the
// definition current at that point.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//

// Each GENTDEF / GENTDEFR below typedefs a function-pointer type whose
// argument list mirrors the prototype of the operation(s) named in the
// comment above it. INSERT_GENTDEF / INSERT_GENTDEFR are defined elsewhere.

// addd, copyd, subd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1d_fpa.h

//
// Prototype function pointer query interface.
//

// GENPROT( op ) declares one function taking a num_t. Both its return type
// name and its function name are assembled from the operation name,
// BLIS_TAPI_EX_SUF, and the _vft / _qfp suffixes by PASTECH2 / PASTEMAC2
// (paste macros defined elsewhere).
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addd )
GENPROT( copyd )
GENPROT( subd )
GENPROT( axpyd )
GENPROT( scal2d )
GENPROT( invertd )
GENPROT( scald )
GENPROT( setd )
GENPROT( setid )
GENPROT( shiftd )
GENPROT( xpbyd )

// end bli_l1d_fpa.h

// end bli_l1d.h

// -- Level-1f operations --

// begin bli_l1f.h

// begin bli_l1f_check.h

//
// Prototype object-based check functions.
//

// Each GENTPROT( op ) below emits one check-function declaration whose name
// is built by PASTEMAC(op,_check). GENTPROT is re-defined before every
// instantiation because each operation takes a different operand list.

// axpy2v
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alphax, \
       obj_t* alphay, \
       obj_t* x, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* y \
     );

GENTPROT( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* xt, \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho, \
       obj_t* z \
     );

GENTPROT( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* at, \
       obj_t* a, \
       obj_t* w, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
     );

GENTPROT( dotxf )

// end bli_l1f_check.h

// Define kernel function types.
// begin bli_l1f_ft_ker.h

#ifndef BLIS_L1F_FT_KER_H
#define BLIS_L1F_FT_KER_H

//
// -- Level-1f kernel function types -------------------------------------------
//
// Each GENTDEF below describes a function-pointer typedef whose name is built
// by PASTECH3(ch,opname,_ker,tsuf); INSERT_GENTDEF( op ) instantiates it.
// NOTE(review): INSERT_GENTDEF is defined elsewhere; presumably it expands
// GENTDEF once per datatype -- confirm against its definition.
// All kernel types take restrict-qualified operand pointers and a trailing
// cntx_t* argument.

// axpy2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha1, \
       ctype* restrict alpha2, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict w, inc_t incw, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxaxpyf )

#endif

// end bli_l1f_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets the macro be appended directly
// after the last regular parameter.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). 
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets the macro be appended directly
// after the last regular parameter.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h

// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Each GENTPROT below describes one exported prototype whose name is built
// by PASTEMAC2(ch,opname,EX_SUF); INSERT_GENTPROT_BASIC0( op ) instantiates
// it.
// NOTE(review): INSERT_GENTPROT_BASIC0 is defined elsewhere; presumably it
// expands GENTPROT once per datatype -- confirm against its definition.

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// Function-pointer typedefs mirroring the typed prototypes above; the type
// name is built by PASTECH3(ch,opname,EX_SUF,tsuf).
// NOTE(review): some parameter names differ from the prototypes above
// (alpha1/alpha2 vs alphax/alphay for axpy2v, m vs n for dotaxpyv). Names in
// a function-pointer typedef do not affect the type, so this is cosmetic.

// axpy2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
// Basic interface: the expert-parameter macro is defined as empty.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;

// end bli_tapi_ba.h

// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Each GENTPROT below describes one exported prototype whose name is built
// by PASTEMAC2(ch,opname,EX_SUF); INSERT_GENTPROT_BASIC0( op ) instantiates
// it.
// NOTE(review): INSERT_GENTPROT_BASIC0 is defined elsewhere; presumably it
// expands GENTPROT once per datatype -- confirm against its definition.

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// Function-pointer typedefs mirroring the typed prototypes above; the type
// name is built by PASTECH3(ch,opname,EX_SUF,tsuf).
// NOTE(review): some parameter names differ from the prototypes above
// (alpha1/alpha2 vs alphax/alphay for axpy2v, m vs n for dotaxpyv). Names in
// a function-pointer typedef do not affect the type, so this is cosmetic.

// axpy2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1f_fpa.h

//
// Prototype function pointer query interface.
//
// Each GENPROT( op ) declares a function taking a num_t and returning the
// type named by PASTECH2(op,BLIS_TAPI_EX_SUF,_vft); the function name is
// built by PASTEMAC2(op,BLIS_TAPI_EX_SUF,_qfp). BLIS_TAPI_EX_SUF is used
// directly because EX_SUF has been un-defined at this point.

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( axpy2v )
GENPROT( axpyf )
GENPROT( dotaxpyv )
GENPROT( dotxaxpyf )
GENPROT( dotxf )

// end bli_l1f_fpa.h

// end bli_l1f.h

// -- Level-1m operations --

// begin bli_l1m.h

// begin bli_l1m_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The expansion starts with a comma so that it can be
// appended directly after the last regular parameter.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h
// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// This is the expert pass: EX_SUF and BLIS_TAPI_EX_PARAMS are the non-empty
// definitions made just above.
//

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: x has its own element type (ctype_x); beta and y share ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
// end bli_l1m_tapi.h
// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//
// Each GENTDEF below declares a function-pointer typedef whose parameter list
// mirrors the corresponding typed prototype above.
//

// addm, subm, copym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym, xpbym_md
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )
// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h
// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// This is the basic pass: EX_SUF and BLIS_TAPI_EX_PARAMS are now empty, so the
// same macro bodies yield unsuffixed names without cntx/rntm parameters.
//

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: x has its own element type (ctype_x); beta and y share ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
// end bli_l1m_tapi.h
// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//

// addm, subm, copym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym, xpbym_md
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )
// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1m_fpa.h

//
// Prototype function pointer query interface.
//

// Query functions taking a single datatype argument. The name uses
// BLIS_TAPI_EX_SUF directly rather than EX_SUF, which is undefined at this
// point.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
GENPROT( axpym )
GENPROT( scal2m )
GENPROT( scalm )
GENPROT( setm )
GENPROT( xpbym )

// Query function taking two datatype arguments (dtx, dty).
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );

GENPROT( xpbym_md )
// end bli_l1m_fpa.h

// Prototype level-1m implementations.
// begin bli_l1m_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Unlike the typed API prototypes, the _unb_var1 prototypes below always
// list cntx and rntm explicitly and are not marked BLIS_EXPORT_BLIS.

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: x has its own element type (ctype_x); beta and y share ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
void PASTEMAC3(chx,chy,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
// end bli_l1m_unb_var1.h

// Pack-related
// begin bli_packm.h
// begin bli_packm_alloc.h

BLIS_EXPORT_BLIS void* bli_packm_alloc
     (
       siz_t size_needed,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// Same as bli_packm_alloc() but with an explicit pack_buf_type argument.
BLIS_EXPORT_BLIS void* bli_packm_alloc_ex
     (
       siz_t size_needed,
       packbuf_t pack_buf_type,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_alloc.h
// begin bli_packm_cntl.h

// Parameter block read by the bli_cntl_packm_params_*() accessors below via
// the params field of a cntl_t.
struct packm_params_s
{
	uint64_t size; // size field must be present and come first.
	bszid_t bmid_m;
	bszid_t bmid_n;
	bool does_invert_diag;
	bool rev_iter_if_upper;
	bool rev_iter_if_lower;
	pack_t pack_schema;
	packbuf_t pack_buf_type;
};
typedef struct packm_params_s packm_params_t;

// Each accessor casts cntl->params to packm_params_t* and returns one field.
// No check is made that cntl or cntl->params is non-NULL.

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m;
}

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n;
}

BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower;
}

BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema;
}

BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type;
}

// -----------------------------------------------------------------------------

// The arguments after var_func correspond one-to-one, in order, to the fields
// of packm_params_t that follow its size field.
cntl_t* bli_packm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       bszid_t bmid_m,
       bszid_t bmid_n,
       bool does_invert_diag,
       bool rev_iter_if_upper,
       bool rev_iter_if_lower,
       pack_t pack_schema,
       packbuf_t pack_buf_type,
       cntl_t* sub_node
     );
// end bli_packm_cntl.h
// begin bli_packm_check.h

void bli_packm_init_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

void bli_packm_int_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );
// end bli_packm_check.h
// begin bli_packm_init.h

BLIS_EXPORT_BLIS bool bli_packm_init
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_init.h
// begin bli_packm_int.h

void bli_packm_int
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_int.h
// begin bli_packm_scalar.h

BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p );
// end bli_packm_scalar.h
// begin bli_packm_part.h

// -- Matrix partitioning ------------------------------------------------------

void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
                                  dim_t i,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
                                  dim_t j,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_tl2br( subpart_t requested_part,
                                    dim_t ij,
                                    dim_t b,
                                    obj_t* obj,
                                    obj_t* sub_obj );

dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p );
// end bli_packm_part.h
// begin bli_packm_struc_cxk.h

// packm_struc_cxk, packm_herm_cxk, packm_tri_cxk. The kappa, c and p pointers
// are restrict-qualified.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_struc_cxk )
INSERT_GENTPROT_BASIC0( packm_herm_cxk )
INSERT_GENTPROT_BASIC0( packm_tri_cxk )
// end bli_packm_struc_cxk.h
// begin bli_packm_struc_cxk_1er.h

// The _1er prototypes take the same parameters as the packm_struc_cxk
// prototypes plus a trailing void* params argument.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er )
// end bli_packm_struc_cxk_1er.h
// begin bli_packm_cxk.h

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_cxk )
// end bli_packm_cxk.h
// begin bli_packm_cxk_1er.h

// Same parameter list as packm_cxk; ctype_r and chr are accepted by the macro
// but not used in the prototype.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROTCO_BASIC0( packm_cxk_1er )
// end bli_packm_cxk_1er.h

// Mixed datatype support.
#ifdef BLIS_ENABLE_GEMM_MD
// begin bli_packm_struc_cxk_md.h

// Mixed-datatype packm prototypes; only declared when BLIS_ENABLE_GEMM_MD is
// defined. kappa and p use the type of P (ctype_p); c uses the type of C
// (ctype_c).
#undef GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype_p* restrict kappa, \
       ctype_c* restrict c, inc_t incc, inc_t ldc, \
       ctype_p* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )

// packm_cxk_1e_md, packm_cxk_1r_md: kappa and p use ctype_p; a uses ctype_a.
#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t conja, \
       dim_t m, \
       dim_t n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p, inc_t ldp \
     );

INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )

INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )
// end bli_packm_struc_cxk_md.h
#endif
// begin bli_packm_blk_var1.h

//
// packm params types.
//

// Two-dimensional table of packm kernel function pointers; the first index
// is the type of C and the second is the type of P.
typedef struct
{
	// Type of C Type of P
	packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;

//
// Prototype object-based interfaces.
//

BLIS_EXPORT_BLIS void bli_packm_blk_var1
     (
       obj_t* c,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* t
     );
// end bli_packm_blk_var1.h
// end bli_packm.h
// begin bli_unpackm.h
// begin bli_unpackm_cntl.h

struct unpackm_params_s
{
	uint64_t size; // size field must be present and come first.
	unpackm_var_oft var_func;
};
typedef struct unpackm_params_s unpackm_params_t;

// Accessor macro: the cast applies to (cntl)->params (-> binds tighter than
// the cast), after which the var_func field is read.
#define bli_cntl_unpackm_params_var_func( cntl ) \
\
	( ( (unpackm_params_t*)(cntl)->params )->var_func )

// -----------------------------------------------------------------------------

cntl_t* bli_unpackm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       void_fp unpackm_var_func,
       cntl_t* sub_node
     );
// end bli_unpackm_cntl.h
// begin bli_unpackm_check.h

void bli_unpackm_int_check
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx
     );
// end bli_unpackm_check.h
// begin bli_unpackm_int.h

void bli_unpackm_int
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_unpackm_int.h
// begin bli_unpackm_blk_var1.h

// Object-based variant.
void bli_unpackm_blk_var1
     (
       obj_t* p,
       obj_t* c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// Typed variants. Note that p and c are passed as void* here, so ctype does
// not appear in the generated parameter list.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       doff_t diagoffc, \
       diag_t diagc, \
       uplo_t uploc, \
       trans_t transc, \
       dim_t m, \
       dim_t n, \
       dim_t m_panel, \
       dim_t n_panel, \
       void* p, inc_t rs_p, inc_t cs_p, \
       dim_t pd_p, inc_t ps_p, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )
// end bli_unpackm_blk_var1.h
// begin bli_unpackm_cxk.h

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conjp, \
       dim_t panel_dim, \
       dim_t panel_len, \
       ctype* kappa, \
       ctype* p, inc_t ldp, \
       ctype* a, inc_t inca, inc_t lda, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_cxk )
// end bli_unpackm_cxk.h
// end bli_unpackm.h
// end bli_l1m.h

// -- Level-2 operations --
// begin bli_l2.h
// begin bli_l2_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// Each GENTPROT/GENTPROTR definition below is a prototype template for a
// BLIS_EXPORT_BLIS void function named by PASTEMAC2(ch,opname,EX_SUF). The
// parameter list ends with BLIS_TAPI_EX_PARAMS, so the final signature
// depends on how EX_SUF and BLIS_TAPI_EX_PARAMS are defined at this point.
// NOTE(review): the INSERT_GENTPROT*_BASIC0 and INSERT_GENTDEF* macros that
// instantiate the templates are defined outside this section; presumably
// they expand the template once per supported datatype -- confirm.

// gemv: transa, conjx, m, n, alpha, a, x, beta, y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( gemv )

// ger: conjx, conjy, m, n, alpha, x, y, a.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( ger )

// hemv, symv: uploa, conja, conjx, m, alpha, a, x, beta, y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )

// her: alpha is declared as ctype_r*, while x and a use ctype*. The chr
// template parameter is not used in the template body.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( her )

// syr: same operand list as her, but alpha is declared as ctype*.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( syr )

// her2, syr2: uploa, conjx, conjy, m, alpha, x, y, a.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )

// trmv, trsv: uploa, transa, diaga, m, alpha, a, x.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )

// end bli_l2_tapi.h
// begin bli_l2_ft.h

//
// -- Level-2 function types ---------------------------------------------------
//

// Each GENTDEF/GENTDEFR template below declares a function-pointer typedef
// named by PASTECH3(ch,opname,EX_SUF,tsuf). The parameter lists repeat the
// typed prototypes of the same operations and end with BLIS_TAPI_EX_PARAMS.

// gemv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( gemv )

// ger

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( ger )

// hemv, symv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( her )

// syr

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( syr )

// her2, syr2

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )

// end bli_l2_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;

// end bli_tapi_ba.h
// begin bli_l2_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Second pass over the typed level-2 prototype templates. EX_SUF and
// BLIS_TAPI_EX_PARAMS are both defined as empty just above, so the names
// built by PASTEMAC2(ch,opname,EX_SUF) carry no extra suffix and the
// parameter lists end at the last regular operand.
// NOTE(review): the INSERT_GENTPROT*_BASIC0 and INSERT_GENTDEF* macros that
// instantiate the templates are defined outside this section; presumably
// they expand the template once per supported datatype -- confirm.

// gemv: transa, conjx, m, n, alpha, a, x, beta, y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( gemv )

// ger: conjx, conjy, m, n, alpha, x, y, a.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( ger )

// hemv, symv: uploa, conja, conjx, m, alpha, a, x, beta, y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )

// her: alpha is declared as ctype_r*, while x and a use ctype*. The chr
// template parameter is not used in the template body.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( her )

// syr: same operand list as her, but alpha is declared as ctype*.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( syr )

// her2, syr2: uploa, conjx, conjy, m, alpha, x, y, a.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )

// trmv, trsv: uploa, transa, diaga, m, alpha, a, x.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )

// end bli_l2_tapi.h
// begin bli_l2_ft.h

//
// -- Level-2 function types ---------------------------------------------------
//

// Each GENTDEF/GENTDEFR template below declares a function-pointer typedef
// named by PASTECH3(ch,opname,EX_SUF,tsuf). The parameter lists repeat the
// typed prototypes of the same operations and end with BLIS_TAPI_EX_PARAMS.

// gemv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( gemv )

// ger

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( ger )

// hemv, symv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( her )

// syr

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( syr )

// her2, syr2

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )

// end bli_l2_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
//

// Takes the original control tree (cntl_orig) and an output pointer
// (cntl_use).
// NOTE(review): judging by the "conditional ... creation" banner, cntl_use
// presumably receives either cntl_orig or a newly created tree -- confirm
// against the definition.
void bli_l3_cntl_create_if
     (
       opid_t family,
       pack_t schema_a,
       pack_t schema_b,
       obj_t* a,
       obj_t* b,
       obj_t* c,
       rntm_t* rntm,
       cntl_t* cntl_orig,
       cntl_t** cntl_use
     );

void bli_l3_cntl_free
     (
       rntm_t* rntm,
       cntl_t* cntl_use,
       thrinfo_t* thread
     );

// end bli_l3_cntl.h
// begin bli_l3_check.h

//
// Prototype object-based check functions.
//

// Each GENPROT( op ) below declares a void function named by
// PASTEMAC(op,_check). The operand list mirrors the corresponding level-3
// operation and ends with a cntx_t*.

// gemm, gemmt, her2k, syr2k: alpha, a, b, beta, c.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )

// hemm, symm, trmm3: side, alpha, a, b, beta, c.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )

// herk, syrk: alpha, a, beta, c.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( herk )
GENPROT( syrk )

// trmm, trsm: side, alpha, a, b.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       cntx_t* cntx \
     );

GENPROT( trmm )
GENPROT( trsm )

// -----------------------------------------------------------------------------

// The *_basic_check prototypes are written out explicitly rather than via a
// template. Note that the herk and her2k versions take additional operands
// (ah; bh and ah) that the _check prototypes above do not.

void bli_gemm_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_gemmt_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_hemm_basic_check
     (
       side_t side,
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_herk_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_her2k_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* bh,
       obj_t* b,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_l3_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

// end bli_l3_check.h
// begin bli_l3_int.h

void bli_l3_int
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_int.h
// begin bli_l3_packab.h

// bli_l3_packa and bli_l3_packb share one parameter list: a, b, c, cntx,
// rntm, cntl, thread.

void bli_l3_packa
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

void bli_l3_packb
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_packab.h

// Define function types.
//#include "bli_l3_ft_ex.h"
// begin bli_l3_ft_ukr.h

// Unlike most sections in this part of the file, this one (and the two
// function-type sections that follow) is wrapped in an include guard.
#ifndef BLIS_L3_FT_UKR_H
#define BLIS_L3_FT_UKR_H

//
// -- Level-3 micro-kernel function types --------------------------------------
//

// Each GENTDEF template below declares a function-pointer typedef named by
// PASTECH3(ch,opname,_ukr,tsuf). All pointer parameters are
// restrict-qualified.

// gemm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict beta, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemm )

// gemmtrsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a1x, \
       ctype* restrict a11, \
       ctype* restrict bx1, \
       ctype* restrict b11, \
       ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemmtrsm )

// trsm_[lu]

// Unlike the two typedefs above, this one takes no m, n, k or alpha.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( trsm )

#endif

// end bli_l3_ft_ukr.h
// begin bli_l3_oft.h

#ifndef BLIS_L3_OFT_H
#define BLIS_L3_OFT_H

//
// -- Level-3 object function types --------------------------------------------
//

// Here GENTDEF takes a single argument and declares a function-pointer
// typedef named by PASTECH(opname,_oft); every parameter list ends with
// cntx_t* cntx, rntm_t* rntm.

// gemm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( gemm )
GENTDEF( gemmt )
GENTDEF( her2k )
GENTDEF( syr2k )

// hemm, symm, trmm3

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( hemm )
GENTDEF( symm )
GENTDEF( trmm3 )

// herk, syrk

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( herk )
GENTDEF( syrk )

// trmm, trsm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( trmm )
GENTDEF( trsm )

#endif

// end bli_l3_oft.h
// begin bli_l3_oft_var.h

#ifndef BLIS_L3_OFT_VAR_H
#define BLIS_L3_OFT_VAR_H

//
// -- Level-3 variant function types -------------------------------------------
//

// Only one typedef is generated here, via GENTDEF( l3 ); its name is built
// by PASTECH(l3,_var_oft). Its parameter list matches bli_l3_packa and
// bli_l3_packb (a, b, c, cntx, rntm, cntl, thread).
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       cntl_t* cntl, \
       thrinfo_t* thread \
     );

GENTDEF( l3 )

#endif

// end bli_l3_oft_var.h

// begin bli_l3_blocksize.h

// The generic entry point takes a cntl_t*; the per-operation versions
// declared below omit that parameter.
dim_t bli_l3_determine_kc
      (
        dir_t direct,
        dim_t i,
        dim_t dim,
        obj_t* a,
        obj_t* b,
        bszid_t bszid,
        cntx_t* cntx,
        cntl_t* cntl
      );

// Per-operation versions that take a dir_t direct argument.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dir_t direct, \
       dim_t i, \
       dim_t dim, \
       obj_t* a, \
       obj_t* b, \
       bszid_t bszid, \
       cntx_t* cntx \
     );

GENPROT( gemm_determine_kc )
GENPROT( gemmt_determine_kc )
GENPROT( trmm_determine_kc )
GENPROT( trsm_determine_kc )

// The _f and _b versions take no dir_t argument.
// NOTE(review): presumably _f/_b stand for forward/backward -- confirm.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t i, \
       dim_t dim, \
       obj_t* a, \
       obj_t* b, \
       bszid_t bszid, \
       cntx_t* cntx \
     );

GENPROT( gemm_determine_kc_f )
GENPROT( gemm_determine_kc_b )

GENPROT( gemmt_determine_kc_f )
GENPROT( gemmt_determine_kc_b )

GENPROT( trmm_determine_kc_f )
GENPROT( trmm_determine_kc_b )

GENPROT( trsm_determine_kc_f )
GENPROT( trsm_determine_kc_b )

// end bli_l3_blocksize.h
// begin bli_l3_direct.h

// The generic entry point takes a cntl_t*; the per-operation versions
// declared below take only a, b, c.
dir_t bli_l3_direct
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

#undef GENPROT
#define GENPROT( opname ) \
\
dir_t PASTEMAC0(opname) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm_direct )
GENPROT( gemmt_direct )
GENPROT( trmm_direct )
GENPROT( trsm_direct )

// end bli_l3_direct.h
// begin bli_l3_prune.h

// Generic prune prototypes: one per dimension (m, n, k), each taking a
// cntl_t*. The name is built by PASTEMAC(l3_prune_unref_mparts_,dim).
#undef GENPROT
#define GENPROT( dim ) \
\
void PASTEMAC(l3_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c, \
       cntl_t* cntl \
     );

GENPROT( m )
GENPROT( n )
GENPROT( k )

// -----------------------------------------------------------------------------

// Per-operation prune prototypes: one per (operation, dimension) pair, with
// no cntl_t* parameter. The name is built by
// PASTEMAC2(opname,_prune_unref_mparts_,dim).
#undef GENPROT
#define GENPROT( opname, dim ) \
\
void PASTEMAC2(opname,_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm, m )
GENPROT( gemm, n )
GENPROT( gemm, k )

GENPROT( gemmt, m )
GENPROT( gemmt, n )
GENPROT( gemmt, k )

GENPROT( trmm, m )
GENPROT( trmm, n )
GENPROT( trmm, k )

GENPROT( trsm, m )
GENPROT( trsm, n )
GENPROT( trsm, k )

// end bli_l3_prune.h
// begin bli_l3_schema.h

void bli_l3_set_schemas
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx
     );

// end bli_l3_schema.h

// Prototype object APIs (basic and expert).
// begin bli_l3_oapi.h

//
// Prototype object-based interfaces (basic).
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h

//
// Prototype BLAS-like interfaces with typed operands (basic).
//

// In every template below each matrix operand is passed as a typed pointer
// followed by its row stride (rs_*) and column stride (cs_*).

// gemm: two trans_t flags and the three dimensions m, n, k.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROT_BASIC0( gemm )

// hemm / symm: side, uplo and conj flags for a, trans flag for b.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       conj_t  conja, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )

// herk: note that both alpha and beta use the ctype_r parameter rather than
// ctype.
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       dim_t    m, \
       dim_t    k, \
       ctype_r* alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROTR_BASIC0( herk )

// her2k: alpha is ctype, only beta uses ctype_r.
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       trans_t  transb, \
       dim_t    m, \
       dim_t    k, \
       ctype*   alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype*   b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROTR_BASIC0( her2k )

// syrk: same shape as herk but with ctype scalars.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROT_BASIC0( syrk )

// gemmt / syr2k: same shape as her2k but with ctype scalars.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )

// trmm3: side/uplo/trans/diag flags for a, a trans flag for b, and separate
// beta and c operands.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROT_BASIC0( trmm3 )

// trmm / trsm: no beta or c operand.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b  \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi.h
// begin bli_l3_tapi_ex.h

//
// Prototype BLAS-like interfaces with typed operands (expert).
//

// The expert templates mirror the basic typed templates, with two differences:
// the function name carries the BLIS_TAPI_EX_SUF suffix, and each signature
// gains trailing cntx_t* and rntm_t* arguments.

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( gemm )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       conj_t  conja, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )

// herk (expert): alpha and beta use ctype_r.
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       dim_t    m, \
       dim_t    k, \
       ctype_r* alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( herk )

// her2k (expert): only beta uses ctype_r.
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       trans_t  transb, \
       dim_t    m, \
       dim_t    k, \
       ctype*   alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype*   b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( her2k )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( syrk )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( trmm3 )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi_ex.h

// Define function types for small/unpacked handlers/kernels.
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
#undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_gemm_md_c2r_ref.h // Define a local struct type that makes returning two values easier. typedef struct mddm_s { dom_t comp; dom_t exec; } mddm_t; void bli_gemm_md ( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_local, cntx_t** cntx ); mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); // ----------------------------------------------------------------------------- void bli_gemm_md_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); void bli_gemm_md_zgemm ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary 
if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crr is already unconditionally associated with an // execution domain of BLIS_REAL.) if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_REAL ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since ccr is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) if ( bli_obj_is_complex( c ) && bli_obj_is_complex( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crc is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) 
if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_complex( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemm_md_ker_var2_recast ( num_t* dt_comp, num_t dt_a, num_t dt_b, num_t* dt_c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, obj_t* c, inc_t* rs_c, inc_t* cs_c ) { if ( bli_is_real( *dt_c ) && bli_is_complex( dt_a ) && bli_is_complex( dt_b ) ) { // The rcc case is executed with a real macrokernel, so we need to // double the k dimension (because both A and B are packed to the 1r // schema), and also the panel strides of A and B since they were // packed as complex matrices and we now need to convert them to // units of real elements. *k *= 2; *ps_a *= 2; *ps_b *= 2; } else if ( bli_is_complex( *dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_row_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *n *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; } else #endif { // Generally speaking, the crc case is executed with a complex // macrokernel, so we need to halve the panel stride of A (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. 
*ps_a /= 2; } } else if ( bli_is_complex( *dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_col_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *m *= 2; *pd_a *= 2; *ps_a *= 2; *cs_c *= 2; } else #endif { // Generally speaking, the ccr case is executed with a complex // macrokernel, so we need to halve the panel stride of B (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. *ps_b /= 2; } } #if 0 else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. //printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k ); } else if ( bli_is_complex( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { // No action needed. 
} #endif } // end bli_gemm_md.h #endif // end bli_gemm.h // begin bli_hemm.h // begin bli_hemm_front.h void bli_hemm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_hemm_front.h // end bli_hemm.h // begin bli_symm.h // begin bli_symm_front.h void bli_symm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_symm_front.h // end bli_symm.h // begin bli_trmm.h // begin bli_trmm_front.h void bli_trmm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm_front.h // begin bli_trmm_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); //GENPROT( trmm_blk_var1 ) //GENPROT( trmm_blk_var2 ) //GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) GENPROT( trmm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffc, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, inc_t is_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, inc_t is_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 ) INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 ) // end bli_gemmt_var.h // end bli_gemmt.h // end bli_l3.h // -- Utility operations -- // begin bli_util.h // begin bli_util_check.h // // Prototype object-based check functions. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* asum \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* norm \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* norm \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENPROT( randv ) GENPROT( randnv ) GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi, \ bool* is_eq \ ); GENTPROT( eqsc ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ bool* is_eq \ ); GENPROT( eqv ) GENPROT( eqm ) #undef GENPROT #define 
GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ FILE* file, \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ); GENPROT( fprintv ) GENPROT( fprintm ) // ----------------------------------------------------------------------------- void bli_utilv_xi_check ( obj_t* x, obj_t* index ); void bli_utilv_xa_check ( obj_t* x, obj_t* asum ); void bli_utilm_mkhst_check ( obj_t* a ); void bli_utilv_norm_check ( obj_t* x, obj_t* norm ); void bli_utilm_norm_check ( obj_t* x, obj_t* norm ); void bli_utilm_fprint_check ( FILE* file, char* s1, obj_t* x, char* format, char* s2 ); void bli_utilm_rand_check ( obj_t* x ); void bli_utilv_sumsqv_check ( obj_t* x, obj_t* scale, obj_t* sumsq ); // end bli_util_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_util_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
#ifdef BLIS_OAPI_BASIC #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* x, \ obj_t* y, \ bool* is_eq \ ); GENPROT( eqsc ) GENPROT( eqv ) GENPROT( eqm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ FILE* file, \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ); GENPROT( fprintv ) GENPROT( fprintm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ); GENPROT( printv ) GENPROT( printm ) #endif // #ifdef BLIS_OAPI_BASIC // end bli_util_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. 
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
#ifdef BLIS_OAPI_BASIC #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* x, \ obj_t* y, \ bool* is_eq \ ); GENPROT( eqsc ) GENPROT( eqv ) GENPROT( eqm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ FILE* file, \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ); GENPROT( fprintv ) GENPROT( fprintm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ); GENPROT( printv ) GENPROT( printm ) #endif // #ifdef BLIS_OAPI_BASIC // end bli_util_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. 
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_util_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Each GENTPROT/GENTPROTR template below is redefined right before the
// INSERT_* lines that consume it, so the order of these statements matters.
// Function names take their suffix from EX_SUF and every signature ends with
// BLIS_TAPI_EX_PARAMS; in this pass that macro expands to the trailing
// "cntx_t* cntx, rntm_t* rntm" arguments defined above.
// NOTE(review): the INSERT_* macros are defined elsewhere; presumably they
// instantiate the template once per datatype -- confirm against their
// definition.
//

// asumv
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( asumv )

// mkherm, mksymm, mktrim
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( mkherm )
INSERT_GENTPROT_BASIC0( mksymm )
INSERT_GENTPROT_BASIC0( mktrim )

// norm1v, normfv, normiv
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1v )
INSERT_GENTPROTR_BASIC0( normfv )
INSERT_GENTPROTR_BASIC0( normiv )

// norm1m, normfm, normim
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1m )
INSERT_GENTPROTR_BASIC0( normfm )
INSERT_GENTPROTR_BASIC0( normim )

// randv, randnv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randv )
INSERT_GENTPROT_BASIC0( randnv )

// randm, randnm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randm )
INSERT_GENTPROT_BASIC0( randnm )

// sumsqv
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.
// These prototypes use PASTEMAC(ch,opname) (no EX_SUF) and take no
// BLIS_TAPI_EX_PARAMS. BLIS_TAPI_BASIC is #undef'd by the bli_xapi_undef.h
// section that precedes bli_tapi_ex.h and is not redefined before this point,
// so this guarded section is skipped in the expert pass.

#ifdef BLIS_TAPI_BASIC

// eqsc
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )

// eqv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )

// eqm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )

// printv (note: x is passed as void*, and the _I insertion macro is used)
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t n, \
       void* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )

// printm (note: x is passed as void*, and the _I insertion macro is used)
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t m, \
       dim_t n, \
       void* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//
// Each typedef below declares a function-pointer type whose parameter list
// mirrors the corresponding typed prototype above. The type name is pasted
// together from ch, opname, EX_SUF and tsuf.
//

// asumv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// Unlike its neighbors, this type does not append BLIS_TAPI_EX_PARAMS, although
// its name still includes EX_SUF.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm
// As with fprintv, no BLIS_TAPI_EX_PARAMS are appended.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.
// These types are named with PASTECH2(ch,opname,tsuf), i.e. without EX_SUF.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 
INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.
// BLIS_TAPI_BASIC was defined by the preceding bli_tapi_ba.h section, so the
// guarded prototypes below are emitted in this pass. They use
// PASTEMAC(ch,opname) directly (no EX_SUF) and take no BLIS_TAPI_EX_PARAMS.

#ifdef BLIS_TAPI_BASIC

// eqsc
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )

// eqv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )

// eqm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )

// printv (note: x is passed as void*, and the _I insertion macro is used)
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t n, \
       void* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )

// printm (note: x is passed as void*, and the _I insertion macro is used)
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t m, \
       dim_t n, \
       void* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//
// Each typedef below declares a function-pointer type whose parameter list
// mirrors the corresponding typed prototype. The type name is pasted together
// from ch, opname, EX_SUF and tsuf; EX_SUF and BLIS_TAPI_EX_PARAMS are empty in
// this (basic) pass.
//

// asumv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// Unlike its neighbors, this type does not append BLIS_TAPI_EX_PARAMS.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm
// As with fprintv, no BLIS_TAPI_EX_PARAMS are appended.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.
// These types are named with PASTECH2(ch,opname,tsuf), i.e. without EX_SUF.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_util_fpa.h

//
// Prototype function pointer query interface.
//
// Each GENPROT( opname ) below declares a query function that takes a num_t
// datatype and returns the opname's "_vft" function-pointer type. The first
// group pastes BLIS_TAPI_EX_SUF into both the return type name and the
// function name.
// NOTE(review): the "_vft" types are not declared in this section; presumably
// they come from the INSERT_GENTDEF* expansions in bli_util_ft.h -- confirm.

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( asumv )
GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )
GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )
GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )
GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )
GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// Query functions for the operations whose names carry no expert suffix: the
// return type and function name are pasted without BLIS_TAPI_EX_SUF.

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )
GENPROT( fprintv )
GENPROT( fprintm )
//GENPROT( printv )
//GENPROT( printm )

// end bli_util_fpa.h

// Prototype the *_unb_var1 implementations of the utility operations.
// begin bli_util_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Unlike the exported typed API prototypes, the *_unb_var1 prototypes in the
// first group always take explicit "cntx_t* cntx, rntm_t* rntm" arguments and
// are not marked BLIS_EXPORT_BLIS.
//

// asumv_unb_var1
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( asumv_unb_var1 )

// mkherm_unb_var1, mksymm_unb_var1, mktrim_unb_var1
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( mkherm_unb_var1 )
INSERT_GENTPROT_BASIC0( mksymm_unb_var1 )
INSERT_GENTPROT_BASIC0( mktrim_unb_var1 )

// norm1v_unb_var1, normfv_unb_var1, normiv_unb_var1
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfv_unb_var1 )
INSERT_GENTPROTR_BASIC0( normiv_unb_var1 )

// norm1m_unb_var1, normfm_unb_var1, normim_unb_var1
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfm_unb_var1 )
INSERT_GENTPROTR_BASIC0( normim_unb_var1 )

// randv_unb_var1, randnv_unb_var1
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randv_unb_var1 )
INSERT_GENTPROT_BASIC0( randnv_unb_var1 )

// randm_unb_var1, randnm_unb_var1
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randm_unb_var1 )
INSERT_GENTPROT_BASIC0( randnm_unb_var1 )

// sumsqv_unb_var1
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 )

// -----------------------------------------------------------------------------

// The equality variants return bool directly (the typed API prototypes report
// the result through a "bool* is_eq" argument instead) and take no cntx/rntm.

// eqv_unb_var1
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
     );

INSERT_GENTPROT_BASIC0( eqv_unb_var1 )

// eqm_unb_var1
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
     );

INSERT_GENTPROT_BASIC0( eqm_unb_var1 )

// fprintv: exported (BLIS_EXPORT_BLIS) and instantiated with the _I insertion
// macro, unlike the *_unb_var1 prototypes above.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintv )

// fprintm: exported and instantiated with the _I insertion macro.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintm )

// end bli_util_unb_var1.h

// end bli_util.h


// -- addon definitions --

// NOTE: These definitions should not be included much earlier since an addon
// may wish to utilize other types and definitions provided by BLIS.
// begin bli_addon.h


#ifndef BLIS_ADDON_H
#define BLIS_ADDON_H

// The "#if 0" below selects the #else branch, so this copy of the header
// always defines BLIS_DISABLE_ADDONS and never BLIS_ENABLE_ADDONS.
#if 0
#define BLIS_ENABLE_ADDONS
#else
#define BLIS_DISABLE_ADDONS
#endif

// Enabled addons
// (no addon headers are listed here in this copy)


#endif
// end bli_addon.h

// -- sandbox implementation --

// begin bli_sbox.h


#ifndef BLIS_SBOX_H
#define BLIS_SBOX_H

// Each sandbox must have a bli_sandbox.h file present somewhere inside.
// If a sandbox was enabled at configure-time, we need to #include its
// header file here so that it will get pulled into blis.h when it is
// flattened into a monolithic header.
#ifdef BLIS_ENABLE_SANDBOX
#include "bli_sandbox.h" // skipped
#endif

#endif
// end bli_sbox.h


// -- BLAS compatibility layer --

// begin bli_blas.h


// If the CBLAS compatibility layer was enabled while the BLAS layer
// was not enabled, we must enable it here.
#ifdef BLIS_ENABLE_CBLAS
#ifndef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS
#endif
#endif // BLIS_ENABLE_CBLAS

// By default, if the BLAS compatibility layer is enabled, we define
// (include) all of the BLAS prototypes. However, if the user is
// #including "blis.h" and also #including another header that also
// declares the BLAS functions, then we provide an opportunity to
// #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below).
#ifdef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS_DEFS
#else
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the BLAS test drivers are being
// compiled.
#ifdef BLIS_VIA_BLASTEST
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the environment has defined the
// macro BLIS_DISABLE_BLAS_DEFS.
#ifdef BLIS_DISABLE_BLAS_DEFS
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Begin including all BLAS prototypes.
// Everything below is declared only when BLIS_ENABLE_BLAS_DEFS is defined.
// NOTE(review): the #endif that closes this conditional is not in this part
// of the file.
#ifdef BLIS_ENABLE_BLAS_DEFS

// -- System headers needed by BLAS compatibility layer --

// NOTE(review): no header name follows this #include. Together with the
// "skipped" marker this looks like the name was dropped when the header was
// flattened or extracted -- confirm against the upstream header.
#include // skipped

// -- Constants --

// Size of the character buffer the bla_*_check macros use to build a routine
// name: 7 characters plus 1 (presumably the terminating NUL -- confirm).
#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)

// -- Utility macros --

// The bla_* helpers below are declared without BLIS_EXPORT_BLAS and without
// PASTEF77 name mangling. Their definitions are not visible here.
// NOTE(review): the names suggest f2c-style intrinsics (sign, conjugate,
// imaginary part, complex divide, absolute value) -- verify at the definitions.

// begin bla_r_sign.h

#ifdef BLIS_ENABLE_BLAS

// Takes bla_real operands but returns double.
double bla_r_sign(const bla_real *a, const bla_real *b);

#endif
// end bla_r_sign.h
// begin bla_d_sign.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_sign(const bla_double *a, const bla_double *b);

#endif
// end bla_d_sign.h

// begin bla_r_cnjg.h

#ifdef BLIS_ENABLE_BLAS

// Result is written through dest; src is read-only.
void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src);

#endif
// end bla_r_cnjg.h
// begin bla_d_cnjg.h

#ifdef BLIS_ENABLE_BLAS

// Result is written through dest; src is read-only.
void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src);

#endif
// end bla_d_cnjg.h

// begin bla_r_imag.h

#ifdef BLIS_ENABLE_BLAS

bla_real bla_r_imag(const bla_scomplex *z);

#endif
// end bla_r_imag.h
// begin bla_d_imag.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_imag(const bla_dcomplex *z);

#endif
// end bla_d_imag.h

// begin bla_c_div.h

#ifdef BLIS_ENABLE_BLAS

// Result is written through cp; ap and bp are read-only.
void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp);

#endif
// end bla_c_div.h
// begin bla_z_div.h

#ifdef BLIS_ENABLE_BLAS

// Result is written through cp; ap and bp are read-only.
void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp);

#endif
// end bla_z_div.h

// begin bla_f__cabs.h

#ifdef BLIS_ENABLE_BLAS

// Takes the two components by value rather than a complex struct.
double bla_f__cabs(double real, double imag);

#endif
// end bla_f__cabs.h
// begin bla_r_abs.h

#ifdef BLIS_ENABLE_BLAS

// All four *_abs helpers return double regardless of the operand type.
double bla_r_abs(const bla_real *x);

#endif
// end bla_r_abs.h
// begin bla_d_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_abs(const bla_double *x);

#endif
// end bla_d_abs.h
// begin bla_c_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_c_abs(const bla_scomplex *z);

#endif
// end bla_c_abs.h
// begin bla_z_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_z_abs(const bla_dcomplex *z);

#endif
// end bla_z_abs.h

// begin bla_lsame.h

#ifdef BLIS_ENABLE_BLAS

// lsame is declared with long arguments/return when LAPACK_ILP64 is defined
// and with int otherwise.
// NOTE(review): only the int variant carries BLIS_EXPORT_BLAS; the
// LAPACK_ILP64 variant is declared without it. Confirm that is intentional.
#ifdef LAPACK_ILP64
long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
#else
BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
#endif

#endif
// end bla_lsame.h
// begin bla_xerbla.h

#ifdef BLIS_ENABLE_BLAS

// xerbla is the routine the bla_*_check macros call to report a bad argument;
// it receives the routine name, a pointer to the info code, and the name length.
// It is additionally tagged BLIS_OVERRIDABLE (macro defined elsewhere).
BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);

#endif
// end bla_xerbla.h
// begin bla_xerbla_array.h

#ifdef BLIS_ENABLE_BLAS

// Unlike xerbla, the name length is passed by value as the second argument
// and info comes last.
BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);

#endif
// end bla_xerbla_array.h


// -- Level-0 BLAS prototypes --

// begin bla_cabs1.h

#ifdef BLIS_ENABLE_BLAS

// One real-valued result per complex precision (single, double).
BLIS_EXPORT_BLAS bla_real   PASTEF77(s,cabs1)(bla_scomplex *z);
BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);

#endif
// end bla_cabs1.h


// -- Level-1 BLAS prototypes --

// begin bla_amax.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT is (re)defined as a prototype template and then instantiated by
// INSERT_GENTPROT_BLAS, which is defined elsewhere -- presumably once per
// supported datatype; confirm. The generated name is built from the prefix
// "i", the type character and the routine name; the result is an f77_int.
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end bla_amax.h
// begin bla_asum.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// Two-type template: the operand has type ftype_x while the return value has
// type ftype_r; both type characters (chr first) are pasted into the name.
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end bla_asum.h
// begin bla_axpy.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// x is read-only; y is the only non-const operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
             ftype*   y, const f77_int* incy  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif

// end bla_axpy.h
// begin bla_copy.h


//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( copy ) #endif // end bla_copy.h // begin bla_dot.h #ifdef BLIS_ENABLE_BLAS // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTR_BLAS( dot ) #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL INSERT_GENTPROTDOTC_BLAS( dot ) #else // For the "intel" complex return type, we use a hidden parameter (passed by // address) to return the result. #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \ ( \ ftype* rhop, \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTC_BLAS( dot ) #endif // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS float PASTEF77(sd,sdot) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); BLIS_EXPORT_BLAS double PASTEF77(d,sdot) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); #endif // end bla_dot.h // begin bla_nrm2.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end bla_nrm2.h // begin bla_rot.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s); #endif // end bla_rot.h // begin bla_rotg.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s); BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s); #endif // end bla_rotg.h // begin bla_rotm.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam); BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam); #endif // end bla_rotm.h // begin bla_rotmg.h 

#ifdef BLIS_ENABLE_BLAS

// Real types only. Every argument is non-const except the fourth (sy1/dy1);
// in contrast to rotm, the parameter array is writable here.
BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam);
BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);

#endif
// end bla_rotmg.h

// begin bla_scal.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// The scalar (ftype_a) and the vector (ftype_x) may have different types;
// the name is pasted as <chx><cha><blasname>. x is modified in place
// (non-const), alpha is read-only.
#undef  GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
     ( \
       const f77_int* n, \
       const ftype_a* alpha, \
       ftype_x* x, const f77_int* incx  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif

// end bla_scal.h
// begin bla_swap.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// Both vectors are non-const.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       ftype*   x, const f77_int* incx, \
       ftype*   y, const f77_int* incy  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif

// end bla_swap.h

// begin f77_amax_sub.h


//
// Prototype CBLAS subroutine wrapper interfaces.
//
// The *sub wrappers return void and deliver the result through the trailing
// rval pointer. Note that they are instantiated under BLIS_ENABLE_CBLAS, not
// BLIS_ENABLE_BLAS.
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       f77_int* rval  \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROT_BLAS( amax )
#endif
// end f77_amax_sub.h
// begin f77_asum_sub.h


//
// Prototype CBLAS subroutine wrapper interfaces.
//
// Result type is ftype_r, written through rval.
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       ftype_r* rval  \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif
// end f77_asum_sub.h
// begin f77_dot_sub.h


//
// Prototype CBLAS subroutine wrapper interfaces.
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTDOT_BLAS( dot ) // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval ); BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* rval ); #endif // end f77_dot_sub.h // begin f77_nrm2_sub.h // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end f77_nrm2_sub.h // -- Level-2 BLAS prototypes -- // dense // begin bla_gemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemv ) #endif // end bla_gemv.h // begin bla_ger.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, chxy, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \ ( \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTDOT_BLAS( ger ) #endif // end bla_ger.h // begin bla_hemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemv ) #endif // end bla_hemv.h // begin bla_her.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype_r* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her ) #endif // end bla_her.h // begin bla_her2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2 ) #endif // end bla_her2.h // begin bla_symv.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( symv ) #endif // end bla_symv.h // begin bla_syr.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr ) #endif // end bla_syr.h // begin bla_syr2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr2 ) #endif // end bla_syr2.h // begin bla_trmv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmv ) #endif // end bla_trmv.h // begin bla_trsv.h // // Prototype BLAS-to-BLIS interfaces. 
//
// trsv: identical parameter list to the trmv template; a is read-only and x
// is the only non-const operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const ftype*    a, const f77_int* lda, \
             ftype*    x, const f77_int* incx  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif

// end bla_trsv.h

// -- Level-2 argument-checking macros --
//
// Each bla_*_check macro below expands to a brace-enclosed statement block
// that validates the (pointer) arguments of one BLAS routine:
//
//   - info is set by the first failing test in the if/else-if chain; the
//     values used correspond to the position of the offending argument in
//     the routine's parameter list (see the matching prototype template).
//   - If info is nonzero, the routine name is formatted from dt_str/op_str
//     (and conj_str for ger) into func_str, upper-cased with
//     bli_string_mkupper(), and reported via xerbla with a fixed name length
//     of 6.
//   - The block then executes a bare "return;", so these macros can only be
//     expanded inside functions returning void, and they end the caller.
//
// func_str holds BLIS_MAX_BLAS_FUNC_STR_LENGTH bytes. The name is written
// with snprintf() bounded by sizeof( func_str ), so a dt_str/op_str longer
// than the 1+5 characters the format strings are designed for is truncated
// rather than written past the end of the buffer. For names that fit, the
// formatted result is the same as with an unbounded sprintf().
//
// Comments are kept outside the macro bodies on purpose: a "//" comment on a
// backslash-continued line would swallow the continuation.

// begin bla_gemv_check.h

#ifdef BLIS_ENABLE_BLAS

// info codes: 1 transa, 2 m, 3 n, 6 lda, 8 incx, 11 incy.
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
	f77_int info = 0; \
	f77_int nota, ta, conja; \
\
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
	if      ( !nota && !ta && !conja ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *n < 0 ) \
		info = 3; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 6; \
	else if ( *incx == 0 ) \
		info = 8; \
	else if ( *incy == 0 ) \
		info = 11; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_gemv_check.h
// begin bla_ger_check.h

#ifdef BLIS_ENABLE_BLAS

// info codes: 1 m, 2 n, 5 incx, 7 incy, 9 lda.
// The name is built from three pieces: dt_str, op_str and conj_str (the
// latter left-justified in a field of 2).
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
	f77_int info = 0; \
\
	if      ( *m < 0 ) \
		info = 1; \
	else if ( *n < 0 ) \
		info = 2; \
	else if ( *incx == 0 ) \
		info = 5; \
	else if ( *incy == 0 ) \
		info = 7; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 9; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%s%-2s", dt_str, op_str, conj_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_ger_check.h
// begin bla_hemv_check.h

#ifdef BLIS_ENABLE_BLAS

// info codes: 1 uploa, 2 m, 5 lda, 7 incx, 10 incy.
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
\
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 5; \
	else if ( *incx == 0 ) \
		info = 7; \
	else if ( *incy == 0 ) \
		info = 10; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_hemv_check.h
// begin bla_her_check.h

#ifdef BLIS_ENABLE_BLAS

// info codes: 1 uploa, 2 m, 5 incx, 7 lda.
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
\
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *incx == 0 ) \
		info = 5; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 7; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_her_check.h
// begin bla_her2_check.h

#ifdef BLIS_ENABLE_BLAS

// info codes: 1 uploa, 2 m, 5 incx, 7 incy, 9 lda.
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
\
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *incx == 0 ) \
		info = 5; \
	else if ( *incy == 0 ) \
		info = 7; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 9; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_her2_check.h
// begin bla_symv_check.h

#ifdef BLIS_ENABLE_BLAS

// The symmetric checks are plain aliases of the Hermitian ones.
#define bla_symv_check bla_hemv_check

#endif
// end bla_symv_check.h
// begin bla_syr_check.h

#ifdef BLIS_ENABLE_BLAS

#define bla_syr_check bla_her_check

#endif
// end bla_syr_check.h
// begin bla_syr2_check.h

#ifdef BLIS_ENABLE_BLAS

#define bla_syr2_check bla_her2_check

#endif
// end bla_syr2_check.h
// begin bla_trmv_check.h

#ifdef BLIS_ENABLE_BLAS

// info codes: 1 uploa, 2 transa, 3 diaga, 4 m, 6 lda, 8 incx.
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
	f77_int nota, ta, conja; \
	f77_int unita, nonua; \
\
	lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
	nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !ta && !conja ) \
		info = 2; \
	else if ( !unita && !nonua ) \
		info = 3; \
	else if ( *m < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 6; \
	else if ( *incx == 0 ) \
		info = 8; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_trmv_check.h
// begin bla_trsv_check.h

#ifdef BLIS_ENABLE_BLAS

// trsv takes the same arguments as trmv, so it shares the check.
#define bla_trsv_check bla_trmv_check

#endif
// end bla_trsv_check.h

// packed

// The packed and banded routines below are declared explicitly (no GENTPROT
// template), use the bla_* types, and return int. In the packed routines the
// matrix is passed as a single pointer ap with no leading dimension.

// begin bla_hpmv.h

#ifdef BLIS_ENABLE_BLAS

// Complex types only; ap and x are read-only, y is updated.
BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *ap, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hpmv.h
// begin bla_hpr.h

#ifdef BLIS_ENABLE_BLAS

// alpha is real (bla_real / bla_double); ap is updated.
BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_scomplex *x, const bla_integer *incx, bla_scomplex *ap);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap);

#endif
// end bla_hpr.h
// begin bla_hpr2.h

#ifdef BLIS_ENABLE_BLAS

// alpha is complex here (unlike hpr); x and y are read-only, ap is updated.
BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *y, const bla_integer *incy, bla_scomplex *ap);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap);

#endif
// end bla_hpr2.h
// begin bla_spmv.h

#ifdef BLIS_ENABLE_BLAS

// Real types only; ap and x are read-only, y is updated.
BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *ap, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_spmv.h
// begin bla_spr.h

#ifdef BLIS_ENABLE_BLAS

// x is read-only, ap is updated.
BLIS_EXPORT_BLAS int PASTEF77(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, bla_double *ap);
BLIS_EXPORT_BLAS int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap);

#endif
// end bla_spr.h
// begin bla_spr2.h

#ifdef BLIS_ENABLE_BLAS

// x and y are read-only, ap is updated.
BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, const bla_double *y, const bla_integer *incy, bla_double *ap);
BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, const bla_real *y, const bla_integer *incy, bla_real *ap);

#endif
// end bla_spr2.h
// begin bla_tpmv.h

#ifdef BLIS_ENABLE_BLAS

// ap is read-only; x is the only non-const operand.
BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpmv.h
// begin bla_tpsv.h

#ifdef BLIS_ENABLE_BLAS

// Same parameter list as tpmv.
BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpsv.h

// banded

// begin bla_gbmv.h

#ifdef BLIS_ENABLE_BLAS

// Takes two extra integer arguments kl and ku in addition to m and n;
// a and x are read-only, y is updated.
BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer * incx, const bla_real *beta, bla_real *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex * y, const bla_integer *incy);

#endif
// end bla_gbmv.h
// begin bla_hbmv.h

#ifdef BLIS_ENABLE_BLAS

// The banded routines in this section are declared explicitly (no GENTPROT
// template), use the bla_* types, and return int. Each takes an extra integer
// argument k and a leading dimension lda for the matrix a.

// Complex types only; a and x are read-only, y is updated.
BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hbmv.h
// begin bla_sbmv.h

#ifdef BLIS_ENABLE_BLAS

// Real types only; same parameter list as hbmv.
BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_sbmv.h
// begin bla_tbmv.h

#ifdef BLIS_ENABLE_BLAS

// a is read-only; x is the only non-const operand.
BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbmv.h
// begin bla_tbsv.h

#ifdef BLIS_ENABLE_BLAS

// Same parameter list as tbmv.
BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbsv.h


// -- Level-3 BLAS prototypes --

// begin bla_gemm.h


//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemm ) #endif // end bla_gemm.h // begin bla_hemm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemm ) #endif // end bla_hemm.h // begin bla_herk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype_r* alpha, \ const ftype* a, const f77_int* lda, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( herk ) #endif // end bla_herk.h // begin bla_her2k.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2k ) #endif // end bla_her2k.h // begin bla_symm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( symm ) #endif // end bla_symm.h // begin bla_syrk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syrk ) #endif // end bla_syrk.h // begin bla_syr2k.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syr2k ) #endif // end bla_syr2k.h // begin bla_trmm.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ ftype* b, const f77_int* ldb \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmm ) #endif // end bla_trmm.h // begin bla_trsm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ ftype* b, const f77_int* ldb \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trsm ) #endif // end bla_trsm.h // begin bla_gemm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, notb; \ f77_int conja, conjb; \ f77_int ta, tb; \ f77_int nrowa, nrowb; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ if ( notb ) { nrowb = *k; } \ else { nrowb = *n; } \ \ if ( !nota && !conja && !ta ) \ info = 1; \ else if ( !notb && !conjb && !tb ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *k < 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 8; \ else if ( *ldb < bli_max( 1, nrowb ) ) \ info = 10; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 13; \ 
\ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_gemm_check.h // begin bla_hemm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int left, right; \ f77_int lower, upper; \ f77_int nrowa; \ \ left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \ right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( left ) { nrowa = *m; } \ else { nrowa = *n; } \ \ if ( !left && !right ) \ info = 1; \ else if ( !lower && !upper ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldb < bli_max( 1, *m ) ) \ info = 9; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 12; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_hemm_check.h // begin bla_herk_check.h #ifdef BLIS_ENABLE_BLAS #define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \ { \ f77_int info = 0; \ f77_int nota, conja; \ f77_int lower, upper; \ f77_int nrowa; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !conja ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ 
else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 10; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_herk_check.h // begin bla_her2k_check.h #ifdef BLIS_ENABLE_BLAS #define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, conja; \ f77_int lower, upper; \ f77_int nrowa; \ \ nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !conja ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldb < bli_max( 1, nrowa ) ) \ info = 9; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 12; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_her2k_check.h // begin bla_symm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_symm_check bla_hemm_check #endif // end bla_symm_check.h // begin bla_syrk_check.h #ifdef BLIS_ENABLE_BLAS #define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \ { \ f77_int info = 0; \ f77_int is_r; \ f77_int nota, ta, cta; \ f77_int lower, upper; \ f77_int nrowa; \ \ static char* dt_cst = dt_str; \ \ is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); 
\ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ cta = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !ta && (is_r ? !cta : 1) ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 10; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_syrk_check.h // begin bla_syr2k_check.h #ifdef BLIS_ENABLE_BLAS #define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int is_r; \ f77_int nota, ta, cta; \ f77_int lower, upper; \ f77_int nrowa; \ \ static char* dt_cst = dt_str; \ \ is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \ nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \ cta = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !ta && (is_r ? 
!cta : 1) ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldb < bli_max( 1, nrowa ) ) \ info = 9; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 12; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_syr2k_check.h // begin bla_trmm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \ { \ f77_int info = 0; \ f77_int left, right; \ f77_int lower, upper; \ f77_int nota, ta, conja; \ f77_int unita, nonua; \ f77_int nrowa; \ \ left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \ right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ unita = PASTEF770(lsame)( diaga, "U", (ftnlen)1, (ftnlen)1 ); \ nonua = PASTEF770(lsame)( diaga, "N", (ftnlen)1, (ftnlen)1 ); \ \ if ( left ) { nrowa = *m; } \ else { nrowa = *n; } \ \ if ( !left && !right ) \ info = 1; \ else if ( !lower && !upper ) \ info = 2; \ else if ( !nota && !ta && !conja ) \ info = 3; \ else if ( !unita && !nonua ) \ info = 4; \ else if ( *m < 0 ) \ info = 5; \ else if ( *n < 0 ) \ info = 6; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 9; \ else if ( *ldb < bli_max( 1, *m ) ) \ info = 11; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } 
#endif
// end bla_trmm_check.h
// begin bla_trsm_check.h


#ifdef BLIS_ENABLE_BLAS

// trsm takes the same arguments as trmm, so it reuses the trmm check.
#define bla_trsm_check bla_trmm_check

#endif
// end bla_trsm_check.h


// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype template for the axpby entry points: vector operands x (const)
// and y (written), each with its own increment, and scalars alpha and beta.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       const ftype* beta, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif
// end bla_axpby.h

// level-3
// begin bla_gemmt.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype template for the gemmt entry points. Compared with gemm it takes
// an extra leading uploc argument and has no n dimension.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int* m, \
       const f77_int* k, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* b, const f77_int* ldb, \
       const ftype* beta, \
       ftype* c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif
// end bla_gemmt.h
// begin bla_gemmt_check.h


#ifdef BLIS_ENABLE_BLAS

// Parameter check for gemmt. info is set to the 1-based position of the
// first invalid argument in the gemmt argument list; if it is non-zero the
// upper-cased routine name and info are passed to xerbla and the macro
// executes `return;` in the expanding function.
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int nota, notb; \
	f77_int conja, conjb; \
	f77_int ta, tb; \
	f77_int lower, upper; \
	f77_int nrowa, nrowb; \
\
	nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
	ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
	lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else { nrowa = *k; } \
	if ( notb ) { nrowb = *k; } \
	else { nrowb = *m; } \
\
	if ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !conja && !ta ) \
		info = 2; \
	else if ( !notb && !conjb && !tb ) \
		info = 3; \
	else if ( *m < 0 ) \
		info = 4; \
	else if ( *k < 0 ) \
		info = 5; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 8; \
	else if ( *ldb < bli_max( 1, nrowb ) ) \
		info = 10; \
	else if ( *ldc < bli_max( 1, *m ) ) \
		info = 13; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype template for the gemm_batch entry points. Each gemm argument is
// replaced by an array; the matrix operands are arrays of pointers
// (ftype**). group_count and group_size are passed last.
// NOTE(review): presumably the *_array arguments hold one entry per group
// and group_size holds the number of problems in each group -- confirm
// against the implementation.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa_array, \
       const f77_char* transb_array, \
       const f77_int* m_array, \
       const f77_int* n_array, \
       const f77_int* k_array, \
       const ftype* alpha_array, \
       const ftype** a_array, const f77_int* lda_array, \
       const ftype** b_array, const f77_int* ldb_array, \
       const ftype* beta_array, \
       ftype** c_array, const f77_int* ldc_array, \
       const f77_int* group_count, \
       const f77_int* group_size \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif
// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype template for the gemm3m entry points; same argument list as
// gemm, but instantiated through the GENTPROTCO/INSERT_GENTPROTCO_BLAS pair.
#undef GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int* m, \
       const f77_int* n, \
       const f77_int* k, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* b, const f77_int* ldb, \
       const ftype* beta, \
       ftype* c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( gemm3m )
#endif
// end bla_gemm3m.h
// begin bla_gemm3m_check.h


#ifdef BLIS_ENABLE_BLAS

// Parameter check for gemm3m; performs the same validation steps, with the
// same info codes, as bla_gemm_check. On failure it reports through xerbla
// and executes `return;` in the expanding function.
#define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int nota, notb; \
	f77_int conja, conjb; \
	f77_int ta, tb; \
	f77_int nrowa, nrowb; \
\
	nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
	ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else { nrowa = *k; } \
	if ( notb ) { nrowb = *k; } \
	else { nrowb = *n; } \
\
	if ( !nota && !conja && !ta ) \
		info = 1; \
	else if ( !notb && !conjb && !tb ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *n < 0 ) \
		info = 4; \
	else if ( *k < 0 ) \
		info = 5; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 8; \
	else if ( *ldb < bli_max( 1, nrowb ) ) \
		info = 10; \
	else if ( *ldc < bli_max( 1, *m ) ) \
		info = 13; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_gemm3m_check.h

// -- Fortran-compatible APIs to BLIS functions --

// begin b77_thread.h


//
// Prototype Fortran-compatible BLIS interfaces.
//

// Fortran-callable wrapper prototype: takes the five loop parallelism
// factors (jc, pc, ic, jr, ir) by pointer.
BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways)
     (
       const f77_int* jc,
       const f77_int* pc,
       const f77_int* ic,
       const f77_int* jr,
       const f77_int* ir
     );

// Fortran-callable wrapper prototype: takes the thread count by pointer.
BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads)
     (
       const f77_int* nt
     );

// end b77_thread.h


#endif // BLIS_ENABLE_BLAS

// end bli_blas.h

// -- CBLAS compatibility layer --

// begin bli_cblas.h


#ifndef BLIS_CBLAS_H
#define BLIS_CBLAS_H

#ifdef BLIS_ENABLE_CBLAS


// Undefine these macros so that no internal conversion is done by CBLAS.
// The function signatures have been modified to use the proper integer types
// directly.
#undef F77_INT
#undef F77_CHAR

// Include the main CBLAS header so that including this header file
// (probably via blis.h) allows applications to access CBLAS
// prototypes and definitions.
// begin cblas.h

#ifndef CBLAS_H
#define CBLAS_H
#include // skipped

// We need to #include "bli_type_defs.h" in order to pull in the
// definition of f77_int. But in order to #include that header, we
// also need to pull in the headers that precede it in blis.h.
// begin bli_system.h


#ifndef BLIS_SYSTEM_H
#define BLIS_SYSTEM_H

// NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that
// various parts of POSIX are defined and made available.
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

#include // skipped
#include // skipped
#include // skipped
#include // skipped
#include // skipped
#include // skipped
#include // skipped
#include // skipped

// Determine the compiler (hopefully) and define conveniently named macros
// accordingly.
// The Intel check comes first; the order of this chain decides which single
// BLIS_* compiler macro is defined when more than one test would match.
#if defined(__ICC) || defined(__INTEL_COMPILER)
#define BLIS_ICC
#elif defined(__clang__)
#define BLIS_CLANG
#elif defined(__GNUC__)
#define BLIS_GCC
#endif

// Determine if we are on a 64-bit or 32-bit architecture.
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
    defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64)
#define BLIS_ARCH_64
#else
#define BLIS_ARCH_32
#endif

// Determine the target operating system.
// Exactly one BLIS_OS_* macro is defined by the chain below. Most are
// defined with the value 1 so that they can be tested with plain #if;
// BLIS_OS_EMSCRIPTEN and BLIS_OS_NONE are defined without a value.
#if defined(BLIS_ENABLE_SYSTEM)
#if defined(_WIN32) || defined(__CYGWIN__)
#define BLIS_OS_WINDOWS 1
#elif defined(__gnu_hurd__)
#define BLIS_OS_GNU 1
#elif defined(__APPLE__) || defined(__MACH__)
#define BLIS_OS_OSX 1
#elif defined(__ANDROID__)
#define BLIS_OS_ANDROID 1
#elif defined(__linux__)
#define BLIS_OS_LINUX 1
#elif defined(__bgq__)
#define BLIS_OS_BGQ 1
#elif defined(__bg__)
#define BLIS_OS_BGP 1
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
      defined(__bsdi__) || defined(__DragonFly__) || \
      defined(__FreeBSD_kernel__) || defined(__HAIKU__)
#define BLIS_OS_BSD 1
#elif defined(EMSCRIPTEN)
#define BLIS_OS_EMSCRIPTEN
#else
#error "Cannot determine operating system"
#endif
#else // #if defined(BLIS_DISABLE_SYSTEM)
#define BLIS_OS_NONE
#endif

// A few changes that may be necessary in Windows environments.
#if BLIS_OS_WINDOWS

// Include Windows header file.
#define WIN32_LEAN_AND_MEAN
#define VC_EXTRALEAN
#include // skipped

#if !defined(__clang__) && !defined(__GNUC__)
// Undefine attribute specifiers in Windows.
#define __attribute__(x)

// Undefine restrict.
#define restrict
#endif

#endif

// time.h provides clock_gettime().
#if BLIS_OS_WINDOWS
#include // skipped
#elif BLIS_OS_OSX
#include // skipped
#else
//#include
#include // skipped
#endif


#endif
// end bli_system.h
// begin bli_config.h


#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H

// Enabled configuration "family" (config_name)
#define BLIS_FAMILY_FIRESTORM


// Enabled sub-configurations (config_list)
#define BLIS_CONFIG_FIRESTORM


// Enabled kernel sets (kernel_list)
#define BLIS_KERNELS_ARMV8A


// The "#if 0" / "#if 1" selectors below hard-code one choice per option;
// the literal 0/1 decides which branch is compiled.
#if 1
#define BLIS_ENABLE_SYSTEM
#else
#define BLIS_DISABLE_SYSTEM
#endif

#if 0
#define BLIS_ENABLE_OPENMP
#endif

#if 0
#define BLIS_ENABLE_PTHREADS
#endif

#if 1
#define BLIS_ENABLE_JRIR_SLAB
#endif

#if 0
#define BLIS_ENABLE_JRIR_RR
#endif

#if 1
#define BLIS_ENABLE_PBA_POOLS
#else
#define BLIS_DISABLE_PBA_POOLS
#endif

#if 1
#define BLIS_ENABLE_SBA_POOLS
#else
#define BLIS_DISABLE_SBA_POOLS
#endif

#if 0
#define BLIS_ENABLE_MEM_TRACING
#else
#define BLIS_DISABLE_MEM_TRACING
#endif

// As written, this selects a 64-bit BLIS integer size.
#if 64 == 64
#define BLIS_INT_TYPE_SIZE 64
#elif 64 == 32
#define BLIS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// As written, this selects a 32-bit BLAS integer size.
#if 32 == 64
#define BLIS_BLAS_INT_TYPE_SIZE 64
#elif 32 == 32
#define BLIS_BLAS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// The nested #ifndef guards below only apply the hard-coded choice when the
// includer has defined neither the ENABLE nor the DISABLE macro beforehand.
#ifndef BLIS_ENABLE_BLAS
#ifndef BLIS_DISABLE_BLAS
#if 0
#define BLIS_ENABLE_BLAS
#else
#define BLIS_DISABLE_BLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_CBLAS
#ifndef BLIS_DISABLE_CBLAS
#if 0
#define BLIS_ENABLE_CBLAS
#else
#define BLIS_DISABLE_CBLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_MIXED_DT
#ifndef BLIS_DISABLE_MIXED_DT
#if 1
#define BLIS_ENABLE_MIXED_DT
#else
#define BLIS_DISABLE_MIXED_DT
#endif
#endif
#endif

#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#if 1
#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#else
#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#endif
#endif
#endif

#if 1
#define BLIS_ENABLE_SUP_HANDLING
#else
#define BLIS_DISABLE_SUP_HANDLING
#endif

#if 0
#define BLIS_ENABLE_MEMKIND
#else
#define BLIS_DISABLE_MEMKIND
#endif

#if 1
#define BLIS_ENABLE_TRSM_PREINVERSION
#else
#define BLIS_DISABLE_TRSM_PREINVERSION
#endif

#if 0
#define BLIS_ENABLE_PRAGMA_OMP_SIMD
#else
#define BLIS_DISABLE_PRAGMA_OMP_SIMD
#endif

#if 0
#define BLIS_ENABLE_SANDBOX
#else
#define BLIS_DISABLE_SANDBOX
#endif

#if 0
#define BLIS_ENABLE_SHARED
#else
#define BLIS_DISABLE_SHARED
#endif

#if 0
#define BLIS_ENABLE_COMPLEX_RETURN_INTEL
#else
#define BLIS_DISABLE_COMPLEX_RETURN_INTEL
#endif


#endif
// end bli_config.h
// begin bli_config_macro_defs.h


#ifndef BLIS_CONFIG_MACRO_DEFS_H
#define BLIS_CONFIG_MACRO_DEFS_H


// -- INTEGER PROPERTIES -------------------------------------------------------

// The bit size of the integer type used to track values such as dimensions,
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
// integers while 64 results in 64-bit integers. Any other value results in use
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
#ifndef BLIS_INT_TYPE_SIZE
#ifdef BLIS_ARCH_64
#define BLIS_INT_TYPE_SIZE 64
#else
#define BLIS_INT_TYPE_SIZE 32
#endif
#endif


// -- FLOATING-POINT PROPERTIES ------------------------------------------------

// Enable use of built-in C99 "float complex" and "double complex" types and
// associated overloaded operations and functions? Disabling results in
// scomplex and dcomplex being defined in terms of simple structs.
// NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN.
#ifdef BLIS_ENABLE_C99_COMPLEX
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif


// -- MULTITHREADING -----------------------------------------------------------

// Enable multithreading via POSIX threads.
#ifdef BLIS_ENABLE_PTHREADS
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif

// Enable multithreading via OpenMP.
#ifdef BLIS_ENABLE_OPENMP
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif

// Perform a sanity check to make sure the user doesn't try to enable
// both OpenMP and pthreads.
#if defined ( BLIS_ENABLE_OPENMP ) && \
    defined ( BLIS_ENABLE_PTHREADS )
#error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined."
#endif

// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP
// or pthreads are enabled. This macro is useful in situations when
// we want to detect use of either OpenMP or pthreads (as opposed
// to neither being used).
#if defined ( BLIS_ENABLE_OPENMP ) || \
    defined ( BLIS_ENABLE_PTHREADS )
#define BLIS_ENABLE_MULTITHREADING
#endif

// Enable the use of prime numbers of threads when requesting automatic thread
// factorization. When disabled, requesting a prime number of threads will
// result in a reduction (by one) of the number of threads, provided that the
// prime number exceeds a minimum threshold (see below).
#ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS
#undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#else
// Default behavior is disabled.
#undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled.
#define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#endif

// Set the maximum requested number of threads that BLIS will accept from the
// user that may be prime. If a larger prime number of threads is requested,
// it will be reduced by one to allow for more efficient thread factorizations.
// This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined.
#ifndef BLIS_NT_MAX_PRIME
#define BLIS_NT_MAX_PRIME 11
#endif


// -- MIXED DATATYPE SUPPORT ---------------------------------------------------

// Enable mixed datatype support?
#ifdef BLIS_DISABLE_MIXED_DT
#undef BLIS_ENABLE_GEMM_MD
#else
// Default behavior is enabled.
#define BLIS_ENABLE_GEMM_MD
#endif

// Enable memory-intensive optimizations for mixed datatype support?
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#else
// Default behavior is enabled.
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#endif


// -- MISCELLANEOUS OPTIONS ----------------------------------------------------

// Do NOT require the cross-blocksize constraints. That is, do not enforce
// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY
// needed when implementing trsm_r by allowing the right-hand matrix B to
// be triangular.
#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#endif


// -- BLAS COMPATIBILITY LAYER -------------------------------------------------

// Enable the BLAS compatibility layer?
// Note that BLIS_DISABLE_BLAS takes precedence: when it is defined,
// BLIS_ENABLE_BLAS is undefined even if it had been defined earlier.
#ifdef BLIS_DISABLE_BLAS
#undef BLIS_ENABLE_BLAS
#else
// Default behavior is enabled.
#undef BLIS_ENABLE_BLAS // In case user explicitly enabled.
#define BLIS_ENABLE_BLAS
#endif

// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (i.e., column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#ifndef BLIS_BLAS_INT_TYPE_SIZE
#define BLIS_BLAS_INT_TYPE_SIZE 32
#endif

// By default, the level-3 BLAS routines are implemented by directly calling
// the BLIS object API. Alternatively, they may first call the typed BLIS
// API, which will then call the object API.
//#define BLIS_BLAS3_CALLS_TAPI
#ifdef BLIS_BLAS3_CALLS_TAPI
#undef BLIS_BLAS3_CALLS_OAPI
#else
// Default behavior is to call object API directly.
#undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled.
#define BLIS_BLAS3_CALLS_OAPI
#endif


// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------

// Enable the CBLAS compatibility layer?
// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer
// regardless of whether or not it was explicitly enabled above. Furthermore,
// the CBLAS compatibility layer will use the integer type size definition
// specified above when defining the size of its own integers (regardless of
// whether the BLAS layer was enabled directly or indirectly).
#ifdef BLIS_ENABLE_CBLAS
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif


// -- SHARED LIBRARY SYMBOL EXPORT ---------------------------------------------

// When building shared libraries, we can control which symbols are exported for
// linking by external applications. BLIS annotates all function prototypes that
// are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing
// a similar role for BLAS compatibility routines). Which symbols are exported
// is controlled by the default symbol visibility, as specified by the gcc option
// -fvisibility=[default|hidden]. The default for this option is 'default', or,
// "public", which, if allowed to stand, causes all symbols in BLIS to be
// linkable from the outside. But when compiling with -fvisibility=hidden, all
// symbols start out hidden (that is, restricted only for internal use by BLIS),
// with that setting overridden only for function prototypes or variable
// declarations that are annotated with BLIS_EXPORT_BLIS.
#ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overriden by the main // program simply by providing a new definition. However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. 
#include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. // NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. #ifndef TRUE #define TRUE true #endif #ifndef FALSE #define FALSE false #endif // -- Special-purpose integers -- // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. 
#ifndef _DEFINED_DIM_T #define _DEFINED_DIM_T typedef gint_t dim_t; // dimension type #endif typedef gint_t inc_t; // increment/stride type typedef gint_t doff_t; // diagonal offset type typedef guint_t siz_t; // byte size type typedef uint32_t objbits_t; // object information bit field // -- Real types -- // Define the number of floating-point types supported, and the size of the // largest type. #define BLIS_NUM_FP_TYPES 4 #define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) // There are some places where we need to use sizeof() inside of a C // preprocessor #if conditional, and so here we define the various sizes // for those purposes. #define BLIS_SIZEOF_S 4 // sizeof(float) #define BLIS_SIZEOF_D 8 // sizeof(double) #define BLIS_SIZEOF_C 8 // sizeof(scomplex) #define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) // -- Complex types -- #ifdef BLIS_ENABLE_C99_COMPLEX #if __STDC_VERSION__ >= 199901L #include // skipped // Typedef official complex types to BLIS complex type names. typedef float complex scomplex; typedef double complex dcomplex; #else #error "Configuration requested C99 complex types, but C99 does not appear to be supported." #endif #else // ifndef BLIS_ENABLE_C99_COMPLEX // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_SCOMPLEX #define _DEFINED_SCOMPLEX typedef struct scomplex { float real; float imag; } scomplex; #endif // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DCOMPLEX #define _DEFINED_DCOMPLEX typedef struct dcomplex { double real; double imag; } dcomplex; #endif #endif // BLIS_ENABLE_C99_COMPLEX // -- Atom type -- // Note: atom types are used to hold "bufferless" scalar object values. Note // that it needs to be as large as the largest possible scalar value we might // want to hold. Thus, for now, it is a dcomplex. 
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any BLIS_BLAS_INT_TYPE_SIZE other than 32 or 64 selects long int.
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
// NOTE(review): void_fp is currently an object pointer (void*), not a
// function pointer type; ISO C does not guarantee that function addresses
// round-trip through void*, although POSIX-conforming platforms allow it --
// confirm this is acceptable for every supported target.
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
// malloc_ft takes a byte count and returns a pointer; free_ft takes a pointer
// and returns nothing (the same shapes as malloc() and free()).
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );

//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT value is the index of the least-significant bit of the
// corresponding field. Several names intentionally share an offset: a
// multi-bit field (e.g. DATATYPE) and the first single-bit flag inside it
// (e.g. DOMAIN) start at the same bit.

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT               0
#define   BLIS_SCALAR_DOMAIN_SHIFT         0
#define   BLIS_SCALAR_PREC_SHIFT           1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is a field-width constant shifted to the field's offset. For
// example, BLIS_UPLO_BITS (0x7 << 5) covers the UPPER, DIAG and LOWER bits
// (5..7), and BLIS_PACK_SCHEMA_BITS (0x7F << 16) covers bits 16..22, i.e. the
// RC, PANEL, FORMAT and PACK sub-fields.
// NOTE(review): 0x7 is an int constant, so BLIS_COMP_DT_BITS (0x7 << 29)
// produces a value that does not fit in a 32-bit signed int; confirm whether
// an unsigned constant is wanted here.

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )

//
// -- BLIS enumerated type value definitions -----------------------------------
//

// These are the encoded values that the enums defined after this section
// take. A value of 0x0 denotes the field's default state.

// Domain, precision and datatype. The four floating-point datatypes are the
// combinations of the domain and precision bits; INT and CONST use values
// (0x04, 0x05) that set the third bit of the datatype field.
#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05

// Transposition and conjugation.
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )

// Stored region. UPPER and LOWER each include the diagonal bit; DENSE sets
// all three uplo bits.
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS

// Diagonal handling.
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT

// Pack schemas. Every packed schema sets BLIS_PACK_BIT; PACKED_UNSPEC and
// PACKED_ROWS have the same value.
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )

// Pack ordering.
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT

// Pack buffer kinds.
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )

// Matrix structure.
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )

//
// -- BLIS enumerated type
// definitions -----------------------------------------
//

// The enumerators in this section whose values are BLIS_BITVAL_* constants
// are encoded for direct use in the obj_t info bit field.

// -- Operational parameter types --

// Transposition and/or conjugation to apply to an operand.
typedef enum
{
	BLIS_NO_TRANSPOSE      = 0x0,
	BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
	BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
	BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

// Conjugation only.
typedef enum
{
	BLIS_NO_CONJUGATE      = 0x0,
	BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

// Which region of a matrix is stored.
typedef enum
{
	BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
	BLIS_LOWER             = BLIS_BITVAL_LOWER,
	BLIS_UPPER             = BLIS_BITVAL_UPPER,
	BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

// Operand side. These are plain sequential values (0, 1), not bit-field
// encodings.
typedef enum
{
	BLIS_LEFT              = 0x0,
	BLIS_RIGHT
} side_t;

typedef enum
{
	BLIS_NONUNIT_DIAG      = 0x0,
	BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

typedef enum
{
	BLIS_NO_INVERT_DIAG    = 0x0,
	BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

typedef enum
{
	BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
	BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
	BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
	BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;

// -- Data type --

// BLIS_DT_LO and BLIS_DT_HI alias the smallest and largest floating-point
// datatype values (BLIS_FLOAT and BLIS_DCOMPLEX); BLIS_INT and BLIS_CONSTANT
// lie outside that range.
typedef enum
{
	BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
	BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
	BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
	BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
	BLIS_INT               = BLIS_BITVAL_INT_TYPE,
	BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
	BLIS_DT_LO             = BLIS_FLOAT,
	BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

typedef enum
{
	BLIS_REAL              = BLIS_BITVAL_REAL,
	BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

typedef enum
{
	BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
	BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;

// -- Pack schema type --

// NOTE: BLIS_PACKED_UNSPEC, BLIS_PACKED_VECTOR and BLIS_PACKED_ROWS all have
// the same value, so they cannot be told apart at run time.
typedef enum
{
	BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
	BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
	BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
	BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
	BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
	BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
	BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
	BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
	BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3

// -- Pack order type --

// NOTE: both BLIS_PACK_FWD_* enumerators have the value 0x0.
typedef enum
{
	BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
	BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,

	BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
	BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;

// -- Pack buffer type --

typedef enum
{
	BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
	BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
	BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
	BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;

// -- Partitioning direction --

typedef enum
{
	BLIS_FWD,
	BLIS_BWD
} dir_t;

// -- Subpartition type --

typedef enum
{
	BLIS_SUBPART0,
	BLIS_SUBPART1,
	BLIS_SUBPART2,
	BLIS_SUBPART1AND0,
	BLIS_SUBPART1AND2,
	BLIS_SUBPART1A,
	BLIS_SUBPART1B,
	BLIS_SUBPART00,
	BLIS_SUBPART10,
	BLIS_SUBPART20,
	BLIS_SUBPART01,
	BLIS_SUBPART11,
	BLIS_SUBPART21,
	BLIS_SUBPART02,
	BLIS_SUBPART12,
	BLIS_SUBPART22
} subpart_t;

// -- Matrix dimension type --

typedef enum
{
	BLIS_M = 0,
	BLIS_N = 1
} mdim_t;

// -- Machine parameter types --

// Sequential values starting at 0; the three macros that follow must be kept
// in step with this list.
typedef enum
{
	BLIS_MACH_EPS = 0,
	BLIS_MACH_SFMIN,
	BLIS_MACH_BASE,
	BLIS_MACH_PREC,
	BLIS_MACH_NDIGMANT,
	BLIS_MACH_RND,
	BLIS_MACH_EMIN,
	BLIS_MACH_RMIN,
	BLIS_MACH_EMAX,
	BLIS_MACH_RMAX,
	BLIS_MACH_EPS2
} machval_t;

#define BLIS_NUM_MACH_PARAMS   11
#define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2

// -- Induced method types --

// BLIS_IND_FIRST aliases BLIS_1M (0) and BLIS_IND_LAST aliases BLIS_NAT (1).
typedef enum
{
	BLIS_1M        = 0,
	BLIS_NAT,
	BLIS_IND_FIRST = 0,
	BLIS_IND_LAST  = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_1m   BLIS_1M
#define bli_nat  BLIS_NAT

// -- Kernel ID types --

// Each kernel ID enum is sequential from 0 and is paired with a BLIS_NUM_*
// macro holding its enumerator count; the two must be updated together.

typedef enum
{
	BLIS_ADDV_KER  = 0,
	BLIS_AMAXV_KER,
	BLIS_AXPBYV_KER,
	BLIS_AXPYV_KER,
	BLIS_COPYV_KER,
	BLIS_DOTV_KER,
	BLIS_DOTXV_KER,
	BLIS_INVERTV_KER,
	BLIS_SCALV_KER,
	BLIS_SCAL2V_KER,
	BLIS_SETV_KER,
	BLIS_SUBV_KER,
	BLIS_SWAPV_KER,
	BLIS_XPBYV_KER
} l1vkr_t;

#define BLIS_NUM_LEVEL1V_KERS 14

typedef enum
{
	BLIS_AXPY2V_KER = 0,
	BLIS_DOTAXPYV_KER,
	BLIS_AXPYF_KER,
	BLIS_DOTXF_KER,
	BLIS_DOTXAXPYF_KER
} l1fkr_t;

#define BLIS_NUM_LEVEL1F_KERS 5

// NOTE: the PACKM and UNPACKM enumerators deliberately reuse the values
// 0..31; each family has its own count macro after the enum.
typedef enum
{
	BLIS_PACKM_0XK_KER  = 0,
	BLIS_PACKM_1XK_KER  = 1,
	BLIS_PACKM_2XK_KER  = 2,
	BLIS_PACKM_3XK_KER  = 3,
	BLIS_PACKM_4XK_KER  = 4,
	BLIS_PACKM_5XK_KER  = 5,
	BLIS_PACKM_6XK_KER  = 6,
	BLIS_PACKM_7XK_KER  = 7,
	BLIS_PACKM_8XK_KER  = 8,
	BLIS_PACKM_9XK_KER  = 9,
	BLIS_PACKM_10XK_KER = 10,
	BLIS_PACKM_11XK_KER = 11,
	BLIS_PACKM_12XK_KER = 12,
	BLIS_PACKM_13XK_KER = 13,
	BLIS_PACKM_14XK_KER = 14,
	BLIS_PACKM_15XK_KER = 15,
	BLIS_PACKM_16XK_KER = 16,
	BLIS_PACKM_17XK_KER = 17,
	BLIS_PACKM_18XK_KER = 18,
	BLIS_PACKM_19XK_KER = 19,
	BLIS_PACKM_20XK_KER = 20,
	BLIS_PACKM_21XK_KER = 21,
	BLIS_PACKM_22XK_KER = 22,
	BLIS_PACKM_23XK_KER = 23,
	BLIS_PACKM_24XK_KER = 24,
	BLIS_PACKM_25XK_KER = 25,
	BLIS_PACKM_26XK_KER = 26,
	BLIS_PACKM_27XK_KER = 27,
	BLIS_PACKM_28XK_KER = 28,
	BLIS_PACKM_29XK_KER = 29,
	BLIS_PACKM_30XK_KER = 30,
	BLIS_PACKM_31XK_KER = 31,

	BLIS_UNPACKM_0XK_KER  = 0,
	BLIS_UNPACKM_1XK_KER  = 1,
	BLIS_UNPACKM_2XK_KER  = 2,
	BLIS_UNPACKM_3XK_KER  = 3,
	BLIS_UNPACKM_4XK_KER  = 4,
	BLIS_UNPACKM_5XK_KER  = 5,
	BLIS_UNPACKM_6XK_KER  = 6,
	BLIS_UNPACKM_7XK_KER  = 7,
	BLIS_UNPACKM_8XK_KER  = 8,
	BLIS_UNPACKM_9XK_KER  = 9,
	BLIS_UNPACKM_10XK_KER = 10,
	BLIS_UNPACKM_11XK_KER = 11,
	BLIS_UNPACKM_12XK_KER = 12,
	BLIS_UNPACKM_13XK_KER = 13,
	BLIS_UNPACKM_14XK_KER = 14,
	BLIS_UNPACKM_15XK_KER = 15,
	BLIS_UNPACKM_16XK_KER = 16,
	BLIS_UNPACKM_17XK_KER = 17,
	BLIS_UNPACKM_18XK_KER = 18,
	BLIS_UNPACKM_19XK_KER = 19,
	BLIS_UNPACKM_20XK_KER = 20,
	BLIS_UNPACKM_21XK_KER = 21,
	BLIS_UNPACKM_22XK_KER = 22,
	BLIS_UNPACKM_23XK_KER = 23,
	BLIS_UNPACKM_24XK_KER = 24,
	BLIS_UNPACKM_25XK_KER = 25,
	BLIS_UNPACKM_26XK_KER = 26,
	BLIS_UNPACKM_27XK_KER = 27,
	BLIS_UNPACKM_28XK_KER = 28,
	BLIS_UNPACKM_29XK_KER = 29,
	BLIS_UNPACKM_30XK_KER = 30,
	BLIS_UNPACKM_31XK_KER = 31
} l1mkr_t;

#define BLIS_NUM_PACKM_KERS   32
#define BLIS_NUM_UNPACKM_KERS 32

typedef enum
{
	BLIS_GEMM_UKR = 0,
	BLIS_GEMMTRSM_L_UKR,
	BLIS_GEMMTRSM_U_UKR,
	BLIS_TRSM_L_UKR,
	BLIS_TRSM_U_UKR
} l3ukr_t;

#define BLIS_NUM_LEVEL3_UKRS 5

typedef enum
{
	BLIS_REFERENCE_UKERNEL = 0,
	BLIS_VIRTUAL_UKERNEL,
	BLIS_OPTIMIZED_UKERNEL,
	BLIS_NOTAPPLIC_UKERNEL
} kimpl_t;

#define BLIS_NUM_UKR_IMPL_TYPES 4

// Disabled: l3sup_t is compiled out.
#if 0
typedef enum
{
	// RV = row-stored, contiguous vector-loading
	// RG = row-stored, non-contiguous gather-loading
	// CV = column-stored, contiguous vector-loading
	// CG = column-stored, non-contiguous gather-loading

	// RD = row-stored, dot-based
	// CD = col-stored, dot-based

	// RC = row-stored, column-times-column
	// CR = column-stored, row-times-row

	// GX = general-stored generic implementation

	BLIS_GEMMSUP_RV_UKR = 0,
	BLIS_GEMMSUP_RG_UKR,
	BLIS_GEMMSUP_CV_UKR,
	BLIS_GEMMSUP_CG_UKR,

	BLIS_GEMMSUP_RD_UKR,
	BLIS_GEMMSUP_CD_UKR,

	BLIS_GEMMSUP_RC_UKR,
	BLIS_GEMMSUP_CR_UKR,

	BLIS_GEMMSUP_GX_UKR,
} l3sup_t;

#define BLIS_NUM_LEVEL3_SUP_UKRS 9
#endif

// Only the nine enumerators before the "#if 0" section are active, which is
// what BLIS_NUM_3OP_RC_COMBOS counts.
typedef enum
{
	// 3-operand storage combinations
	BLIS_RRR = 0,
	BLIS_RRC, // 1
	BLIS_RCR, // 2
	BLIS_RCC, // 3
	BLIS_CRR, // 4
	BLIS_CRC, // 5
	BLIS_CCR, // 6
	BLIS_CCC, // 7
	BLIS_XXX, // 8

#if 0
	BLIS_RRG,
	BLIS_RCG,
	BLIS_RGR,
	BLIS_RGC,
	BLIS_RGG,
	BLIS_CRG,
	BLIS_CCG,
	BLIS_CGR,
	BLIS_CGC,
	BLIS_CGG,
	BLIS_GRR,
	BLIS_GRC,
	BLIS_GRG,
	BLIS_GCR,
	BLIS_GCC,
	BLIS_GCG,
	BLIS_GGR,
	BLIS_GGC,
	BLIS_GGG,
#endif
} stor3_t;

#define BLIS_NUM_3OP_RC_COMBOS 9
//#define BLIS_NUM_3OP_RCG_COMBOS 27

// Disabled: thridx_t is compiled out, but BLIS_NUM_LOOPS below remains
// defined.
#if 0
typedef enum
{
	BLIS_JC_IDX = 0,
	BLIS_PC_IDX,
	BLIS_IC_IDX,
	BLIS_JR_IDX,
	BLIS_IR_IDX,
	BLIS_PR_IDX
} thridx_t;
#endif

#define BLIS_NUM_LOOPS 6

// -- Operation ID type --

typedef enum
{
//
// NOTE: If/when additional type values are added to this enum,
// you must either:
// - keep the level-3 values (starting with _GEMM) beginning at
//   index 0; or
// - if the value range is moved such that it does not begin at
//   index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START
//   value that can be subtracted from the opid_t value to map it
//   to a zero-based range.
// This is needed because these level-3 opid_t values are used in
// bli_l3_ind.c to index into arrays.
//
	BLIS_GEMM = 0,
	BLIS_GEMMT,
	BLIS_HEMM,
	BLIS_HERK,
	BLIS_HER2K,
	BLIS_SYMM,
	BLIS_SYRK,
	BLIS_SYR2K,
	BLIS_TRMM3,
	BLIS_TRMM,
	BLIS_TRSM,

	BLIS_NOID
} opid_t;

// Counts BLIS_GEMM through BLIS_TRSM; BLIS_NOID is excluded.
#define BLIS_NUM_LEVEL3_OPS 11

// -- Blocksize ID type --

typedef enum
{
	// NOTE: the level-3 blocksizes MUST be indexed starting at zero.
	// At one point, we made this assumption in bli_cntx_set_blkszs()
	// and friends.

	BLIS_KR = 0,
	BLIS_MR,
	BLIS_NR,
	BLIS_MC,
	BLIS_KC,
	BLIS_NC,

	BLIS_M2, // level-2 blocksize in m dimension
	BLIS_N2, // level-2 blocksize in n dimension

	BLIS_AF, // level-1f axpyf fusing factor
	BLIS_DF, // level-1f dotxf fusing factor
	BLIS_XF, // level-1f dotxaxpyf fusing factor

	BLIS_NO_PART  // used as a placeholder when blocksizes are not applicable.
} bszid_t;

// Counts BLIS_KR through BLIS_XF; BLIS_NO_PART is excluded.
#define BLIS_NUM_BLKSZS 11

// -- Threshold ID type --

typedef enum
{
	BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension
	BLIS_NT,     // level-3 small/unpacked matrix threshold in n dimension
	BLIS_KT      // level-3 small/unpacked matrix threshold in k dimension

} threshid_t;

#define BLIS_NUM_THRESH 3

// -- Architecture ID type --

// NOTE: This typedef enum must be kept up-to-date with the arch_t
// string array in bli_arch.c. Whenever values are added/inserted
// OR if values are rearranged, be sure to update the string array
// in bli_arch.c.

typedef enum
{
	// NOTE: The C language standard guarantees that the first enum value
	// starts at 0.

	// Intel
	BLIS_ARCH_SKX,
	BLIS_ARCH_KNL,
	BLIS_ARCH_KNC,
	BLIS_ARCH_HASWELL,
	BLIS_ARCH_SANDYBRIDGE,
	BLIS_ARCH_PENRYN,

	// AMD
	BLIS_ARCH_ZEN3,
	BLIS_ARCH_ZEN2,
	BLIS_ARCH_ZEN,
	BLIS_ARCH_EXCAVATOR,
	BLIS_ARCH_STEAMROLLER,
	BLIS_ARCH_PILEDRIVER,
	BLIS_ARCH_BULLDOZER,

	// ARM
	BLIS_ARCH_ARMSVE,
	BLIS_ARCH_A64FX,
	BLIS_ARCH_FIRESTORM,
	BLIS_ARCH_THUNDERX2,
	BLIS_ARCH_CORTEXA57,
	BLIS_ARCH_CORTEXA53,
	BLIS_ARCH_CORTEXA15,
	BLIS_ARCH_CORTEXA9,

	// IBM/Power
	BLIS_ARCH_POWER10,
	BLIS_ARCH_POWER9,
	BLIS_ARCH_POWER7,
	BLIS_ARCH_BGQ,

	// Generic architecture/configuration
	BLIS_ARCH_GENERIC,

	// The total number of defined architectures. This must be last in the
	// list of enums since its definition assumes that the previous enum
	// value (BLIS_ARCH_GENERIC) is given index num_archs-1.
	BLIS_NUM_ARCHS

} arch_t;

//
// -- BLIS misc. structure types -----------------------------------------------
//

// This header must be included here (or earlier) because definitions it
// provides are needed in the pool_t and related structs.
// begin bli_pthread.h

#ifndef BLIS_PTHREAD_H
#define BLIS_PTHREAD_H

// -- Type and macro definitions -----------------------------------------------

#if defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of "dummy" code that doesn't depend on POSIX threads or any other
// threading mechanism. See issue #454 to see the use case that prompted this
// feature.
// NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE!
// -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: 
// Locked set of pools type --

// Three pools guarded by one mutex, plus the alignment and the
// malloc()/free() substitutes used for general-purpose allocation.
// NOTE(review): the array length 3 is a literal here; presumably it is meant
// to track BLIS_NUM_PACK_SCHEMA_TYPES -- confirm.
typedef struct pba_s
{
	pool_t              pools[3];
	bli_pthread_mutex_t mutex;

	// These fields are used for general-purpose allocation.
	siz_t               align_size;
	malloc_ft           malloc_fp;
	free_ft             free_fp;

} pba_t;

// -- Memory object type --

// A pool block together with its buffer kind, the pool recorded for it, and
// a size.
typedef struct mem_s
{
	pblk_t    pblk;
	packbuf_t buf_type;
	pool_t*   pool;
	siz_t     size;
} mem_t;

// -- Control tree node type --

// Nodes link to other nodes through sub_prenode and sub_node.
struct cntl_s
{
	// Basic fields (usually required).
	opid_t         family;
	bszid_t        bszid;
	void_fp        var_func;
	struct cntl_s* sub_prenode;
	struct cntl_s* sub_node;

	// Optional fields (needed only by some operations such as packm).
	// NOTE: first field of params must be a uint64_t containing the size
	// of the struct.
	void*          params;

	// Internal fields that track "cached" data.
	mem_t          pack_mem;
};
typedef struct cntl_s cntl_t;

// -- Blocksize object type --

// One primary value and one extension value per floating-point type.
typedef struct blksz_s
{
	// Primary blocksize values.
	dim_t  v[BLIS_NUM_FP_TYPES];

	// Blocksize extensions.
	dim_t  e[BLIS_NUM_FP_TYPES];

} blksz_t;

// -- Function pointer object type --

// One function address per floating-point type.
typedef struct func_s
{
	// Kernel function address.
	void_fp ptr[BLIS_NUM_FP_TYPES];

} func_t;

// -- Multi-boolean object type --

// One boolean per floating-point type.
typedef struct mbool_s
{
	bool v[BLIS_NUM_FP_TYPES];

} mbool_t;

// -- Auxiliary kernel info type --

// Note: This struct is used by macro-kernels to package together extra
// parameter values that may be of use to the micro-kernel without
// cluttering up the micro-kernel interface itself.

typedef struct
{
	// The pack schemas of A and B.
	pack_t schema_a;
	pack_t schema_b;

	// Pointers to the micro-panels of A and B which will be used by the
	// next call to the micro-kernel.
	void*  a_next;
	void*  b_next;

	// The imaginary strides of A and B.
	inc_t  is_a;
	inc_t  is_b;

	// The panel strides of A and B.
	// NOTE: These are only used in situations where iteration over the
	// micropanels takes place in part within the kernel code (e.g. sup
	// millikernels).
	inc_t  ps_a;
	inc_t  ps_b;

	// The type to convert to on output.
	//num_t  dt_on_output;

	// (Virtual) microkernel address and additional parameters.
	void_fp ukr;
	void*   params;

} auxinfo_t;

// -- Global scalar constant data struct --

// Note: This struct is used only when statically initializing the
// global scalar constants in bli_const.c.
typedef struct constdata_s
{
	float    s;
	double   d;
	scomplex c;
	dcomplex z;
	gint_t   i;

} constdata_t;

//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the pack function that can be attached to an object
// (obj_t.pack_fn).
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of the kernel function that can be attached to an object
// (obj_t.ker_fn).
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// NOTE: the field-by-field initializer macros and copy functions that appear
// later in this header list these members explicitly, so they must be
// revisited whenever a member is added, removed or renamed here.
typedef struct obj_s
{
	// Basic fields
	struct obj_s* root;

	dim_t         off[2];
	dim_t         dim[2];
	doff_t        diag_off;

	objbits_t     info;
	objbits_t     info2;
	siz_t         elem_size;

	void*         buffer;
	inc_t         rs;
	inc_t         cs;
	inc_t         is;

	// Bufferless scalar storage
	atom_t        scalar;

	// Pack-related fields
	dim_t         m_padded; // m dimension of matrix, including any padding
	dim_t         n_padded; // n dimension of matrix, including any padding
	inc_t         ps;       // panel stride (distance to next panel)
	inc_t         pd;       // panel dimension (the "width" of a panel:
	                        // usually MR or NR)
	dim_t         m_panel;  // m dimension of a "full" panel
	dim_t         n_panel;  // n dimension of a "full" panel

	// User-customizable fields
	obj_pack_fn_t pack_fn;
	void*         pack_params;
	obj_ker_fn_t  ker_fn;
	void*         ker_params;

} obj_t;

// Pre-initializers.
// Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Brace-enclosed designated initializer for an obj_t: dense, general
// structure, 0x0 dimensions, a NULL buffer, and no pack/kernel callbacks
// attached.
#define BLIS_OBJECT_INITIALIZER \
{ \
	.root      = NULL, \
\
	.off       = { 0, 0 }, \
	.dim       = { 0, 0 }, \
	.diag_off  = 0, \
\
	.info      = 0x0 | BLIS_BITVAL_DENSE      | \
	                   BLIS_BITVAL_GENERAL, \
	.info2     = 0x0, \
	.elem_size = sizeof( float ), \
\
	.buffer    = NULL, \
	.rs        = 0, \
	.cs        = 0, \
	.is        = 1, \
\
	.scalar    = { 0.0, 0.0 }, \
\
	.m_padded  = 0, \
	.n_padded  = 0, \
	.ps        = 0, \
	.pd        = 0, \
	.m_panel   = 0, \
	.n_panel   = 0, \
\
	.pack_fn     = NULL, \
	.pack_params = NULL, \
	.ker_fn      = NULL, \
	.ker_params  = NULL  \
}

// Same as BLIS_OBJECT_INITIALIZER except that the dimensions are 1x1.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
	.root      = NULL, \
\
	.off       = { 0, 0 }, \
	.dim       = { 1, 1 }, \
	.diag_off  = 0, \
\
	.info      = 0x0 | BLIS_BITVAL_DENSE      | \
	                   BLIS_BITVAL_GENERAL, \
	.info2     = 0x0, \
	.elem_size = sizeof( float ), \
\
	.buffer    = NULL, \
	.rs        = 0, \
	.cs        = 0, \
	.is        = 1, \
\
	.scalar    = { 0.0, 0.0 }, \
\
	.m_padded  = 0, \
	.n_padded  = 0, \
	.ps        = 0, \
	.pd        = 0, \
	.m_panel   = 0, \
	.n_panel   = 0, \
\
	.pack_fn     = NULL, \
	.pack_params = NULL, \
	.ker_fn      = NULL, \
	.ker_params  = NULL  \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
// Make b a shallow copy of a: every member of obj_t is duplicated, and the
// buffer pointer itself is copied rather than the data it points to.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
	// The member list of obj_t is exactly root, off, dim, diag_off, info,
	// info2, elem_size, buffer, rs, cs, is, scalar, m_padded, n_padded, ps,
	// pd, m_panel, n_panel, pack_fn, pack_params, ker_fn and ker_params, so
	// a whole-struct assignment copies precisely those members.
	*b = *a;
}

// Initialize b as a subpartition of a: copy every member except dim[], which
// the caller is expected to overwrite with the subpartition's own m and n.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
	// User-customizable callbacks and their parameters.
	b->ker_params  = a->ker_params;
	b->ker_fn      = a->ker_fn;
	b->pack_params = a->pack_params;
	b->pack_fn     = a->pack_fn;

	// Pack-related geometry.
	b->n_panel     = a->n_panel;
	b->m_panel     = a->m_panel;
	b->pd          = a->pd;
	b->ps          = a->ps;
	b->n_padded    = a->n_padded;
	b->m_padded    = a->m_padded;

	// Bufferless scalar storage.
	b->scalar      = a->scalar;

	// Buffer address and strides.
	b->is          = a->is;
	b->cs          = a->cs;
	b->rs          = a->rs;
	b->buffer      = a->buffer;

	// Element size and info bit fields.
	b->elem_size   = a->elem_size;
	b->info2       = a->info2;
	b->info        = a->info;

	// Root pointer, offsets and diagonal offset. dim[0] and dim[1] are left
	// untouched on purpose (see the function comment).
	b->diag_off    = a->diag_off;
	b->off[1]      = a->off[1];
	b->off[0]      = a->off[0];
	b->root        = a->root;
}

// Initializors for global scalar constants.
// NOTE: These must remain cpp macros since they are initializor
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/include/darwin-generic/000077500000000000000000000000001427272030600221575ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/darwin-generic/blis.h000066400000000000000000045637541427272030600233110ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). 
// begin bli_config.h
// Build-time configuration snapshot. The literal `#if 0` / `#if 1` and
// `#if 64 == 64` forms below look like template placeholders that were
// substituted at configure time -- NOTE(review): confirm; do not hand-edit
// if this file is regenerated.
#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H

// Enabled configuration "family" (config_name)
#define BLIS_FAMILY_GENERIC

// Enabled sub-configurations (config_list)
#define BLIS_CONFIG_GENERIC

// Enabled kernel sets (kernel_list)
#define BLIS_KERNELS_GENERIC

// System (OS) support: enabled in this configuration.
#if 1
#define BLIS_ENABLE_SYSTEM
#else
#define BLIS_DISABLE_SYSTEM
#endif

// Threading backends: neither OpenMP nor pthreads is enabled here.
#if 0
#define BLIS_ENABLE_OPENMP
#endif

#if 0
#define BLIS_ENABLE_PTHREADS
#endif

// JR/IR loop partitioning: "slab" is selected, "rr" is not.
#if 1
#define BLIS_ENABLE_JRIR_SLAB
#endif

#if 0
#define BLIS_ENABLE_JRIR_RR
#endif

// PBA and SBA pools: both enabled.
#if 1
#define BLIS_ENABLE_PBA_POOLS
#else
#define BLIS_DISABLE_PBA_POOLS
#endif

#if 1
#define BLIS_ENABLE_SBA_POOLS
#else
#define BLIS_DISABLE_SBA_POOLS
#endif

// Memory tracing: disabled.
#if 0
#define BLIS_ENABLE_MEM_TRACING
#else
#define BLIS_DISABLE_MEM_TRACING
#endif

// BLIS-internal integer size: resolves to 64 in this configuration.
#if 64 == 64
#define BLIS_INT_TYPE_SIZE 64
#elif 64 == 32
#define BLIS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// BLAS-layer integer size: resolves to 32 in this configuration.
#if 32 == 64
#define BLIS_BLAS_INT_TYPE_SIZE 64
#elif 32 == 32
#define BLIS_BLAS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// The next four groups only choose a value when the includer has defined
// neither the ENABLE nor the DISABLE form beforehand.

// BLAS compatibility layer: defaults to disabled here.
#ifndef BLIS_ENABLE_BLAS
#ifndef BLIS_DISABLE_BLAS
#if 0
#define BLIS_ENABLE_BLAS
#else
#define BLIS_DISABLE_BLAS
#endif
#endif
#endif

// CBLAS compatibility layer: defaults to disabled here.
#ifndef BLIS_ENABLE_CBLAS
#ifndef BLIS_DISABLE_CBLAS
#if 0
#define BLIS_ENABLE_CBLAS
#else
#define BLIS_DISABLE_CBLAS
#endif
#endif
#endif

// Mixed-datatype support: defaults to enabled here.
#ifndef BLIS_ENABLE_MIXED_DT
#ifndef BLIS_DISABLE_MIXED_DT
#if 1
#define BLIS_ENABLE_MIXED_DT
#else
#define BLIS_DISABLE_MIXED_DT
#endif
#endif
#endif

// Extra-memory mixed-datatype optimizations: default to enabled here.
#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#if 1
#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#else
#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#endif
#endif
#endif

// sup handling: enabled.
#if 1
#define BLIS_ENABLE_SUP_HANDLING
#else
#define BLIS_DISABLE_SUP_HANDLING
#endif

// memkind: disabled.
#if 0
#define BLIS_ENABLE_MEMKIND
#else
#define BLIS_DISABLE_MEMKIND
#endif

// trsm pre-inversion: enabled.
#if 1
#define BLIS_ENABLE_TRSM_PREINVERSION
#else
#define BLIS_DISABLE_TRSM_PREINVERSION
#endif

// "#pragma omp simd": disabled.
#if 0
#define BLIS_ENABLE_PRAGMA_OMP_SIMD
#else
#define BLIS_DISABLE_PRAGMA_OMP_SIMD
#endif

// Sandbox: disabled.
#if 0
#define BLIS_ENABLE_SANDBOX
#else
#define BLIS_DISABLE_SANDBOX
#endif

// Shared-library build: disabled (BLIS_EXPORT handling keys off this macro).
#if 0
#define BLIS_ENABLE_SHARED
#else
#define BLIS_DISABLE_SHARED
#endif

// Intel-style complex return convention: disabled.
#if 0
#define BLIS_ENABLE_COMPLEX_RETURN_INTEL
#else
#define BLIS_DISABLE_COMPLEX_RETURN_INTEL
#endif

#endif
// end bli_config.h

// -- System and language-related headers --

// NOTE: bli_system.h header must be included before bli_config_macro_defs.h.
// begin bli_system.h
#ifndef BLIS_SYSTEM_H
#define BLIS_SYSTEM_H

// NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that
// various parts of POSIX are defined and made available.
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

// NOTE(review): the header names of the following #include directives are
// missing in this copy (only the "// skipped" marker remains) -- restore
// them from the upstream header before compiling.
#include // skipped
#include // skipped
#include // skipped
#include // skipped
#include // skipped
#include // skipped
#include // skipped
#include // skipped

// Determine the compiler (hopefully) and define conveniently named macros
// accordingly.
// ICC is tested first, then clang, then GCC; only the first match is defined.
#if defined(__ICC) || defined(__INTEL_COMPILER)
#define BLIS_ICC
#elif defined(__clang__)
#define BLIS_CLANG
#elif defined(__GNUC__)
#define BLIS_GCC
#endif

// Determine if we are on a 64-bit or 32-bit architecture.
// Anything not matched by the list below is treated as 32-bit.
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
    defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64)
#define BLIS_ARCH_64
#else
#define BLIS_ARCH_32
#endif

// Determine the target operating system.
// Exactly one BLIS_OS_* macro is defined below; the first matching test
// wins, so the order of the #elif chain matters (e.g. Android is tested
// before generic Linux).
#if defined(BLIS_ENABLE_SYSTEM)
#if defined(_WIN32) || defined(__CYGWIN__)
#define BLIS_OS_WINDOWS 1
#elif defined(__gnu_hurd__)
#define BLIS_OS_GNU 1
#elif defined(__APPLE__) || defined(__MACH__)
#define BLIS_OS_OSX 1
#elif defined(__ANDROID__)
#define BLIS_OS_ANDROID 1
#elif defined(__linux__)
#define BLIS_OS_LINUX 1
#elif defined(__bgq__)
#define BLIS_OS_BGQ 1
#elif defined(__bg__)
#define BLIS_OS_BGP 1
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
      defined(__bsdi__) || defined(__DragonFly__) || \
      defined(__FreeBSD_kernel__) || defined(__HAIKU__)
#define BLIS_OS_BSD 1
#elif defined(EMSCRIPTEN)
// NOTE(review): unlike the macros above, this one (and BLIS_OS_NONE below)
// is defined with no value, so it can only be tested with #ifdef/defined(),
// not with a bare "#if BLIS_OS_EMSCRIPTEN".
#define BLIS_OS_EMSCRIPTEN
#else
#error "Cannot determine operating system"
#endif
#else // #if defined(BLIS_DISABLE_SYSTEM)
#define BLIS_OS_NONE
#endif

// A few changes that may be necessary in Windows environments.
// "#if BLIS_OS_WINDOWS" relies on an undefined macro evaluating to 0 on
// every other platform.
#if BLIS_OS_WINDOWS

// Include Windows header file.
#define WIN32_LEAN_AND_MEAN
#define VC_EXTRALEAN
// NOTE(review): header name missing in this copy -- restore from upstream.
#include // skipped

#if !defined(__clang__) && !defined(__GNUC__)
// Undefine attribute specifiers in Windows.
// (Defines __attribute__(x) to expand to nothing for compilers that are
// neither clang nor GNU-compatible.)
#define __attribute__(x)

// Undefine restrict.
#define restrict
#endif

#endif

// time.h provides clock_gettime().
// NOTE(review): the header names in the three branches below are missing in
// this copy -- restore from upstream.
#if BLIS_OS_WINDOWS
#include // skipped
#elif BLIS_OS_OSX
#include // skipped
#else
//#include
#include // skipped
#endif

#endif
// end bli_system.h

// begin bli_lang_defs.h
#ifndef BLIS_LANG_DEFS_H
#define BLIS_LANG_DEFS_H

// -- Undefine restrict for C++ and C89/90 --
// The C++ and pre-C99 branches make "restrict" expand to nothing unless a
// "restrict" macro already exists.
#ifdef __cplusplus
// Language is C++; define restrict as nothing.
#ifndef restrict
#define restrict
#endif
#elif __STDC_VERSION__ >= 199901L
// Language is C99 (or later); do nothing since restrict is recognized.
#else
// Language is pre-C99; define restrict as nothing.
#ifndef restrict
#define restrict
#endif
#endif

// -- Define typeof() operator if using non-GNU compiler --
// Both branches end with typeof mapped to __typeof__; the GNU branch only
// adds the mapping when no typeof macro exists yet.
#ifndef __GNUC__
#define typeof __typeof__
#else
#ifndef typeof
#define typeof __typeof__
#endif
#endif

// -- BLIS Thread Local Storage Keyword --
// __thread for TLS is supported by GCC, CLANG, ICC, and IBMC.
// There is a small risk here as __GNUC__ can also be defined by some other
// compiler (other than ICC and CLANG which we know define it) that
// doesn't support __thread, as __GNUC__ is not quite unique to GCC.
// But the possibility of someone using such non-main-stream compiler
// for building BLIS is low.
// On any other compiler BLIS_THREAD_LOCAL expands to nothing, i.e. variables
// declared with it are NOT thread-local there.
#if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__)
#define BLIS_THREAD_LOCAL __thread
#else
#define BLIS_THREAD_LOCAL
#endif

// -- BLIS constructor/destructor function attribute --
// __attribute__((constructor/destructor)) is supported by GCC only.
// There is a small risk here as __GNUC__ can also be defined by some other
// compiler (other than ICC and CLANG which we know define it) that
// doesn't support this, as __GNUC__ is not quite unique to GCC.
// But the possibility of someone using such non-main-stream compiler
// for building BLIS is low.
// ICC must be tested before __GNUC__ because it also defines __GNUC__.
#if defined(__ICC) || defined(__INTEL_COMPILER)
// ICC defines __GNUC__ but doesn't support this
#define BLIS_ATTRIB_CTOR
#define BLIS_ATTRIB_DTOR
#elif defined(__clang__)
// CLANG supports __attribute__, but its documentation doesn't
// mention support for constructor/destructor. Compiling with
// clang and testing shows that it does support.
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif

// Perform a sanity check to make sure the user doesn't try to enable
// both OpenMP and pthreads.
#if defined ( BLIS_ENABLE_OPENMP ) && \
    defined ( BLIS_ENABLE_PTHREADS )
#error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined."
#endif

// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP
// or pthreads are enabled. This macro is useful in situations when
// we want to detect use of either OpenMP or pthreads (as opposed
// to neither being used).
#if defined ( BLIS_ENABLE_OPENMP ) || \
    defined ( BLIS_ENABLE_PTHREADS )
#define BLIS_ENABLE_MULTITHREADING
#endif

// Enable the use of prime numbers of threads when requesting automatic thread
// factorization. When disabled, requesting a prime number of threads will
// result in a reduction (by one) of the number of threads, provided that the
// prime number exceeds a minimum threshold (see below).
// After this block exactly one of the ENABLE/DISABLE forms is in effect:
// an explicit ENABLE removes any DISABLE; otherwise DISABLE is (re)defined.
#ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS
#undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#else
// Default behavior is disabled.
#undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled.
#define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#endif

// Set the maximum requested number of threads that BLIS will accept from the
// user that may be prime. If a larger prime number of threads is requested,
// it will be reduced by one to allow for more efficient thread factorizations.
// This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined.
// NOTE(review): the paragraph above says the reduction happens when the
// feature is *disabled*, while the sentence above ties this threshold to the
// ENABLE macro -- confirm against the code that consumes BLIS_NT_MAX_PRIME.
#ifndef BLIS_NT_MAX_PRIME
#define BLIS_NT_MAX_PRIME 11
#endif

// -- MIXED DATATYPE SUPPORT ---------------------------------------------------

// Enable mixed datatype support?
// BLIS_ENABLE_GEMM_MD is defined unless BLIS_DISABLE_MIXED_DT is set.
#ifdef BLIS_DISABLE_MIXED_DT
#undef BLIS_ENABLE_GEMM_MD
#else
// Default behavior is enabled.
#define BLIS_ENABLE_GEMM_MD
#endif

// Enable memory-intensive optimizations for mixed datatype support?
// BLIS_ENABLE_GEMM_MD_EXTRA_MEM is defined unless
// BLIS_DISABLE_MIXED_DT_EXTRA_MEM is set.
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#else
// Default behavior is enabled.
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#endif

// -- MISCELLANEOUS OPTIONS ----------------------------------------------------

// Do NOT require the cross-blocksize constraints. That is, do not enforce
// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY
// needed when implementing trsm_r by allowing the right-hand matrix B to
// be triangular.
#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#endif

// -- BLAS COMPATIBILITY LAYER -------------------------------------------------

// Enable the BLAS compatibility layer?
// BLIS_DISABLE_BLAS takes precedence: when it is defined, BLIS_ENABLE_BLAS is
// removed even if it was set explicitly.
#ifdef BLIS_DISABLE_BLAS
#undef BLIS_ENABLE_BLAS
#else
// Default behavior is enabled.
#undef BLIS_ENABLE_BLAS // In case user explicitly enabled.
#define BLIS_ENABLE_BLAS
#endif

// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#ifndef BLIS_BLAS_INT_TYPE_SIZE
#define BLIS_BLAS_INT_TYPE_SIZE 32
#endif

// By default, the level-3 BLAS routines are implemented by directly calling
// the BLIS object API. Alternatively, they may first call the typed BLIS
// API, which will then call the object API.
// The two macros are mutually exclusive: defining BLIS_BLAS3_CALLS_TAPI
// removes BLIS_BLAS3_CALLS_OAPI; otherwise BLIS_BLAS3_CALLS_OAPI is defined.
//#define BLIS_BLAS3_CALLS_TAPI
#ifdef BLIS_BLAS3_CALLS_TAPI
#undef BLIS_BLAS3_CALLS_OAPI
#else
// Default behavior is to call object API directly.
#undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled.
#define BLIS_BLAS3_CALLS_OAPI
#endif

// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------

// Enable the CBLAS compatibility layer?
// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer
// regardless of whether or not it was explicitly enabled above. Furthermore,
// the CBLAS compatibility layer will use the integer type size definition
// specified above when defining the size of its own integers (regardless of
// whether the BLAS layer was enabled directly or indirectly).
// NOTE(review): nothing in this section defines BLIS_ENABLE_BLAS when
// BLIS_ENABLE_CBLAS is set; the automatic enabling described above is
// presumably implemented elsewhere -- confirm.
#ifdef BLIS_ENABLE_CBLAS
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif

// -- SHARED LIBRARY SYMBOL EXPORT ---------------------------------------------

// When building shared libraries, we can control which symbols are exported for
// linking by external applications. BLIS annotates all function prototypes that
// are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing
// a similar role for BLAS compatibility routines). Which symbols are exported
// is controlled by the default symbol visibility, as specified by the gcc option
// -fvisibility=[default|hidden]. The default for this option is 'default', or,
// "public", which, if allowed to stand, causes all symbols in BLIS to be
// linkable from the outside. But when compiling with -fvisibility=hidden, all
// symbols start out hidden (that is, restricted only for internal use by BLIS),
// with that setting overridden only for function prototypes or variable
// declarations that are annotated with BLIS_EXPORT_BLIS.
// Resolution order when BLIS_EXPORT is not predefined:
//   - static build (BLIS_ENABLE_SHARED undefined): empty;
//   - Windows/Cygwin shared build: dllexport while building the library
//     (BLIS_IS_BUILDING_LIBRARY), dllimport for consumers;
//   - GNU-compatible compilers, version 4 or later: default visibility;
//   - anything else: empty.
#ifndef BLIS_EXPORT
#if !defined(BLIS_ENABLE_SHARED)
#define BLIS_EXPORT
#else
#if defined(_WIN32) || defined(__CYGWIN__)
#ifdef BLIS_IS_BUILDING_LIBRARY
#define BLIS_EXPORT __declspec(dllexport)
#else
#define BLIS_EXPORT __declspec(dllimport)
#endif
#elif defined(__GNUC__) && __GNUC__ >= 4
#define BLIS_EXPORT __attribute__ ((visibility ("default")))
#else
#define BLIS_EXPORT
#endif
#endif
#endif

// All three annotation macros currently share the same expansion.
#define BLIS_EXPORT_BLIS BLIS_EXPORT
#define BLIS_EXPORT_BLAS BLIS_EXPORT
#define BLIS_EXPORT_ADDON BLIS_EXPORT

// -- OVERRIDABLE (WEAK) SYMBOLS -----------------------------------------------

// On Linux, functions called from a shared library can be overridden by the main
// program simply by providing a new definition. However, macOS uses a "two-level
// namespace" which causes calls to shared library functions to be tied to the
// library and not overridable. As a workaround, certain symbols can be defined
// as "weak" and are given lower preference during linking.
// "#if BLIS_OS_OSX" relies on the macro being either defined to 1 or
// undefined (which evaluates to 0).
#ifndef BLIS_OVERRIDABLE
#if BLIS_OS_OSX
#define BLIS_OVERRIDABLE __attribute__((weak))
#else
#define BLIS_OVERRIDABLE
#endif
#endif

// -- STATIC INLINE FUNCTIONS --------------------------------------------------

// C and C++ have different semantics for defining "inline" functions. In C,
// the keyword phrase "static inline" accomplishes this, though the "inline"
// is optional. In C++, the "inline" keyword is required and obviates "static"
// altogether. Why does this matter? While BLIS is compiled in C99, blis.h may
// be #included by a source file that is compiled with C++.
#ifdef __cplusplus
#define BLIS_INLINE inline
#else
//#define BLIS_INLINE static inline
#define BLIS_INLINE static
#endif

#endif
// end bli_config_macro_defs.h

// -- Common BLIS definitions --

// begin bli_type_defs.h
#ifndef BLIS_TYPE_DEFS_H
#define BLIS_TYPE_DEFS_H

//
// -- BLIS basic types ---------------------------------------------------------
//

// NOTE(review): the header names of the #include directives below are missing
// in this copy (only the "// skipped" marker remains) -- restore them from
// the upstream header before compiling.
#ifdef __cplusplus
// For C++, include stdint.h.
#include // skipped
#elif __STDC_VERSION__ >= 199901L
// For C99 (or later), include stdint.h.
#include // skipped
#include // skipped
#else
// When stdint.h is not available, manually typedef the types we will use.
#ifdef _WIN32
typedef __int32 int32_t;
typedef unsigned __int32 uint32_t;
typedef __int64 int64_t;
typedef unsigned __int64 uint64_t;
#else
#error "Attempting to compile on pre-C99 system without stdint.h."
#endif
#endif

// -- General-purpose integers --

// If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits.
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// The macro tested here must be BLIS_BLAS_INT_TYPE_SIZE, the name used
// everywhere else in this header. The previous spelling, BLIS_BLAS_INT_SIZE,
// is never defined; an undefined identifier evaluates to 0 inside #if, so
// the guard could never fire and a 64-bit BLAS integer could be paired with
// a 32-bit BLIS integer.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
#undef BLIS_INT_TYPE_SIZE
#define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested.
// gint_t is the signed and guint_t the unsigned general-purpose integer;
// any size other than 32 or 64 falls back to "long int".
#if BLIS_INT_TYPE_SIZE == 32
typedef int32_t gint_t;
typedef uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
typedef int64_t gint_t;
typedef uint64_t guint_t;
#else
typedef signed long int gint_t;
typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h.
// Pre-existing TRUE/FALSE macros (e.g. from platform headers) are kept.
#ifndef TRUE
#define TRUE true
#endif

#ifndef FALSE
#define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef gint_t dim_t; // dimension type
#endif
typedef gint_t inc_t; // increment/stride type
typedef gint_t doff_t; // diagonal offset type
typedef guint_t siz_t; // byte size type
typedef uint32_t objbits_t; // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
// BLIS_MAX_TYPE_SIZE expands to a sizeof expression, so it cannot be used in
// preprocessor conditionals and needs dcomplex to be declared at the point
// of use.
#define BLIS_NUM_FP_TYPES 4
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
// Byte sizes of the four floating-point types as plain integer literals,
// usable inside #if (unlike sizeof).
#define BLIS_SIZEOF_S 4 // sizeof(float)
#define BLIS_SIZEOF_D 8 // sizeof(double)
#define BLIS_SIZEOF_C 8 // sizeof(scomplex)
#define BLIS_SIZEOF_Z 16 // sizeof(dcomplex)

// -- Complex types --

#ifdef BLIS_ENABLE_C99_COMPLEX

#if __STDC_VERSION__ >= 199901L
// NOTE(review): header name missing in this copy (only the "// skipped"
// marker remains) -- restore from upstream.
#include // skipped

// Typedef official complex types to BLIS complex type names.
typedef float complex scomplex;
typedef double complex dcomplex;
#else
#error "Configuration requested C99 complex types, but C99 does not appear to be supported."
#endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

// Default representation: plain structs with the real part first and the
// imaginary part second.

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_SCOMPLEX
#define _DEFINED_SCOMPLEX
typedef struct scomplex
{
	float real;
	float imag;
} scomplex;
#endif

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DCOMPLEX
#define _DEFINED_DCOMPLEX
typedef struct dcomplex
{
	double real;
	double imag;
} dcomplex;
#endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any BLIS_BLAS_INT_TYPE_SIZE other than 32 or 64 falls back to "long int".
#if BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t f77_int;
#else
typedef long int f77_int;
#endif

typedef char f77_char;
typedef float f77_float;
typedef double f77_double;
typedef scomplex f77_scomplex;
typedef dcomplex f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
// NOTE(review): the active typedef is a data pointer (void*), with the true
// function-pointer form left commented out. ISO C does not guarantee that
// function pointers round-trip through void* -- confirm this is intentional.
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void (*free_ft) ( void* p );

//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT is the position of a field's lowest bit. Several names alias
// the same position because a wider field overlays its sub-fields (e.g.
// DATATYPE starts at the DOMAIN bit, with the PRECISION bit above it).

// info
#define BLIS_DATATYPE_SHIFT 0
#define BLIS_DOMAIN_SHIFT 0
#define BLIS_PRECISION_SHIFT 1
#define BLIS_CONJTRANS_SHIFT 3
#define BLIS_TRANS_SHIFT 3
#define BLIS_CONJ_SHIFT 4
#define BLIS_UPLO_SHIFT 5
#define BLIS_UPPER_SHIFT 5
#define BLIS_DIAG_SHIFT 6
#define BLIS_LOWER_SHIFT 7
#define BLIS_UNIT_DIAG_SHIFT 8
#define BLIS_INVERT_DIAG_SHIFT 9
#define BLIS_TARGET_DT_SHIFT 10
#define BLIS_TARGET_DOMAIN_SHIFT 10
#define BLIS_TARGET_PREC_SHIFT 11
#define BLIS_EXEC_DT_SHIFT 13
#define BLIS_EXEC_DOMAIN_SHIFT 13
#define BLIS_EXEC_PREC_SHIFT 14
#define BLIS_PACK_SCHEMA_SHIFT 16
#define BLIS_PACK_RC_SHIFT 16
#define BLIS_PACK_PANEL_SHIFT 17
#define BLIS_PACK_FORMAT_SHIFT 18
#define BLIS_PACK_SHIFT 22
#define BLIS_PACK_REV_IF_UPPER_SHIFT 23
#define BLIS_PACK_REV_IF_LOWER_SHIFT 24
#define BLIS_PACK_BUFFER_SHIFT 25
#define BLIS_STRUC_SHIFT 27
#define BLIS_COMP_DT_SHIFT 29
#define BLIS_COMP_DOMAIN_SHIFT 29
#define BLIS_COMP_PREC_SHIFT 30

// info2
#define BLIS_SCALAR_DT_SHIFT 0
#define BLIS_SCALAR_DOMAIN_SHIFT 0
#define BLIS_SCALAR_PREC_SHIFT 1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is the field's width (as a run of 1 bits) shifted to the field's
// position.

// info
#define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT )
#define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT )
#define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT )
#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define 
BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 
0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = 
BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } 
machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. #define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, 
BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, 
BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // - keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
} bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. // Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. 
// begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. 
mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. //num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. 
// One value of each supported representation (float, double, scomplex,
// dcomplex, gint_t), bundled so that a single constant can be stored once per
// type. See the note preceding this typedef regarding bli_const.c.
typedef struct constdata_s
{
	float    s;
	double   d;
	scomplex c;
	dcomplex z;
	gint_t   i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the user-customizable packing function stored in
// obj_t.pack_fn. It receives two objects (a, ap) plus the context, runtime,
// control tree node, and thread info.
// NOTE(review): presumably 'a' is the source and 'ap' the packed destination;
// confirm against the packm implementation.
typedef void (*obj_pack_fn_t)
     (
       struct obj_s*     a,
       struct obj_s*     ap,
       struct cntx_s*    cntx,
       struct rntm_s*    rntm,
       struct cntl_s*    cntl,
       struct thrinfo_s* thread
     );

// Signature of the user-customizable kernel function stored in
// obj_t.ker_fn. It receives three objects (a, b, c) plus the context,
// runtime, control tree node, and thread info.
typedef void (*obj_ker_fn_t)
     (
       struct obj_s*     a,
       struct obj_s*     b,
       struct obj_s*     c,
       struct cntx_s*    cntx,
       struct rntm_s*    rntm,
       struct cntl_s*    cntl,
       struct thrinfo_s* thread
     );

// The BLIS object type. Any change to the set of fields below must be
// mirrored in the initializer macros and the field-wise copy functions that
// follow this definition.
typedef struct obj_s
{
	// Basic fields
	struct obj_s* root;

	dim_t         off[2];
	dim_t         dim[2];
	doff_t        diag_off;

	objbits_t     info;
	objbits_t     info2;
	siz_t         elem_size;

	void*         buffer;
	inc_t         rs;
	inc_t         cs;
	inc_t         is;

	// Bufferless scalar storage
	atom_t        scalar;

	// Pack-related fields
	dim_t         m_padded; // m dimension of matrix, including any padding
	dim_t         n_padded; // n dimension of matrix, including any padding
	inc_t         ps;       // panel stride (distance to next panel)
	inc_t         pd;       // panel dimension (the "width" of a panel:
	                        // usually MR or NR)
	dim_t         m_panel;  // m dimension of a "full" panel
	dim_t         n_panel;  // n dimension of a "full" panel

	// User-customizable fields
	obj_pack_fn_t pack_fn;
	void*         pack_params;
	obj_ker_fn_t  ker_fn;
	void*         ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Initializer for an obj_t with zero dimensions: info is set to
// BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL, elem_size to sizeof( float ),
// is to 1, and every other field to 0 / NULL.
// NOTE: Do not place comments inside the macro body; with line continuations
// a '//' comment would swallow the remainder of the macro.
#define BLIS_OBJECT_INITIALIZER \
{ \
	.root      = NULL, \
\
	.off       = { 0, 0 }, \
	.dim       = { 0, 0 }, \
	.diag_off  = 0, \
\
	.info      = 0x0 | BLIS_BITVAL_DENSE      | \
	                   BLIS_BITVAL_GENERAL, \
	.info2     = 0x0, \
	.elem_size = sizeof( float ), \
\
	.buffer    = NULL, \
	.rs        = 0, \
	.cs        = 0, \
	.is        = 1,  \
\
	.scalar    = { 0.0, 0.0 }, \
\
	.m_padded  = 0, \
	.n_padded  = 0, \
	.ps        = 0, \
	.pd        = 0, \
	.m_panel   = 0, \
	.n_panel   = 0, \
\
	.pack_fn     = NULL, \
	.pack_params = NULL, \
	.ker_fn      = NULL, \
	.ker_params  = NULL  \
}

// Same as BLIS_OBJECT_INITIALIZER except that .dim is { 1, 1 } instead of
// { 0, 0 }.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
	.root      = NULL, \
\
	.off       = { 0, 0 }, \
	.dim       = { 1, 1 }, \
	.diag_off  = 0, \
\
	.info      = 0x0 | BLIS_BITVAL_DENSE      | \
	                   BLIS_BITVAL_GENERAL, \
	.info2     = 0x0, \
	.elem_size = sizeof( float ), \
\
	.buffer    = NULL, \
	.rs        = 0, \
	.cs        = 0, \
	.is        = 1,  \
\
	.scalar    = { 0.0, 0.0 }, \
\
	.m_padded  = 0, \
	.n_padded  = 0, \
	.ps        = 0, \
	.pd        = 0, \
	.m_panel   = 0, \
	.n_panel   = 0, \
\
	.pack_fn     = NULL, \
	.pack_params = NULL, \
	.ker_fn      = NULL, \
	.ker_params  = NULL  \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
// Private helper: copy every obj_t field EXCEPT dim[0] and dim[1] from *src
// into *dst. Both public initializers below are built on top of it.
// NOTE: The assignments must cover every non-dimension field of obj_t, so
// this helper has to be revisited whenever the struct definition changes.
BLIS_INLINE void bli_obj_copy_fields_sans_dims( obj_t* src, obj_t* dst )
{
	// root, offsets, diagonal offset.
	dst->root        = src->root;
	dst->off[0]      = src->off[0];
	dst->off[1]      = src->off[1];
	dst->diag_off    = src->diag_off;

	// info bitfields and element size.
	dst->info        = src->info;
	dst->info2       = src->info2;
	dst->elem_size   = src->elem_size;

	// buffer pointer and the rs/cs/is increments.
	dst->buffer      = src->buffer;
	dst->rs          = src->rs;
	dst->cs          = src->cs;
	dst->is          = src->is;

	// bufferless scalar storage.
	dst->scalar      = src->scalar;

	// pack-related fields.
	dst->m_padded    = src->m_padded;
	dst->n_padded    = src->n_padded;
	dst->ps          = src->ps;
	dst->pd          = src->pd;
	dst->m_panel     = src->m_panel;
	dst->n_panel     = src->n_panel;

	// user-customizable function pointers and their parameters.
	dst->pack_fn     = src->pack_fn;
	dst->pack_params = src->pack_params;
	dst->ker_fn      = src->ker_fn;
	dst->ker_params  = src->ker_params;
}

// Initialize b as a field-by-field (shallow) copy of a: every field of a,
// including both dimensions, is assigned to the corresponding field of b.
// Pointer fields are copied as pointers; nothing they refer to is duplicated.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
	bli_obj_copy_fields_sans_dims( a, b );

	b->dim[0] = a->dim[0];
	b->dim[1] = a->dim[1];
}

// Initialize b from a in preparation for b becoming a subpartition of a:
// identical to the full shallow copy except that b->dim[0] and b->dim[1] are
// left untouched, since the m and n dimensions will be overwritten afterwards.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
	bli_obj_copy_fields_sans_dims( a, b );
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// Each public macro below forwards to an underscore-suffixed twin that does
// the actual '##' pasting. The extra level makes the preprocessor fully
// expand the arguments before they are pasted.

// PASTEMAC family: build a BLIS identifier of the form
//   bli_ <ch1> <ch2> ... <op>
// from zero to six prefix fragments followed by the operation name.
#define PASTEMAC0_(op)                             bli_ ## op
#define PASTEMAC0(op)                              PASTEMAC0_(op)

#define PASTEMAC_(ch,op)                           bli_ ## ch  ## op
#define PASTEMAC(ch,op)                            PASTEMAC_(ch,op)

#define PASTEMAC2_(ch1,ch2,op)                     bli_ ## ch1 ## ch2 ## op
#define PASTEMAC2(ch1,ch2,op)                      PASTEMAC2_(ch1,ch2,op)

#define PASTEMAC3_(ch1,ch2,ch3,op)                 bli_ ## ch1 ## ch2 ## ch3 ## op
#define PASTEMAC3(ch1,ch2,ch3,op)                  PASTEMAC3_(ch1,ch2,ch3,op)

#define PASTEMAC4_(ch1,ch2,ch3,ch4,op)             bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
#define PASTEMAC4(ch1,ch2,ch3,ch4,op)              PASTEMAC4_(ch1,ch2,ch3,ch4,op)

#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)         bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op)          PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)

#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)     bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op)      PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)

// PASTEBLACHK: build the identifier bla_<op>_check.
#define PASTEBLACHK_(op)                           bla_ ## op ## _check
#define PASTEBLACHK(op)                            PASTEBLACHK_(op)

// PASTECH family: like PASTEMAC but without the bli_ prefix; the fragments
// are simply concatenated in the order given.
#define PASTECH0_(op)                              op
#define PASTECH0(op)                               PASTECH0_(op)

#define PASTECH_(ch,op)                            ch ## op
#define PASTECH(ch,op)                             PASTECH_(ch,op)

#define PASTECH2_(ch1,ch2,op)                      ch1 ## ch2 ## op
#define PASTECH2(ch1,ch2,op)                       PASTECH2_(ch1,ch2,op)

#define PASTECH3_(ch1,ch2,ch3,op)                  ch1 ## ch2 ## ch3 ## op
#define PASTECH3(ch1,ch2,ch3,op)                   PASTECH3_(ch1,ch2,ch3,op)

// MKSTR turns its argument into a string literal exactly as written.
// STRINGIFY_INT goes through MKSTR via one extra level, so a macro argument
// is expanded first and its value, not its name, is stringified.
#define MKSTR(s1)                                  #s1
#define STRINGIFY_INT( s )                         MKSTR( s )

// Fortran-77 name-mangling macros.
#define PASTEF770(name) name ## _ #define PASTEF77(ch1,name) ch1 ## name ## _ #define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ #define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ // -- Include other groups of macros // begin bli_genarray_macro_defs.h #ifndef BLIS_GENARRAY_MACRO_DEFS_H #define BLIS_GENARRAY_MACRO_DEFS_H // -- Macros to generate function arrays --------------------------------------- // -- "Smart" one-operand macro -- #define GENARRAY_FPA(tname,opname) \ \ static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \ { \ ( tname )PASTEMAC(s,opname), \ ( tname )PASTEMAC(c,opname), \ ( tname )PASTEMAC(d,opname), \ ( tname )PASTEMAC(z,opname) \ } // -- "Smart" one-operand macro (with integer support) -- #define GENARRAY_FPA_I(tname,opname) \ \ static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \ { \ ( tname )PASTEMAC(s,opname), \ ( tname )PASTEMAC(c,opname), \ ( tname )PASTEMAC(d,opname), \ ( tname )PASTEMAC(z,opname), \ ( tname )PASTEMAC(i,opname) \ } // -- "Smart" two-operand macro -- #define GENARRAY_FPA2(tname,op) \ \ static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \ { ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \ { ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \ { ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) } \ } // -- "Smart" two-operand macro -- // -- One-operand macro -- #define GENARRAY(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES] = \ { \ PASTEMAC(s,op), \ PASTEMAC(c,op), \ PASTEMAC(d,op), \ PASTEMAC(z,op) \ } #define GENARRAY_I(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES+1] = \ { \ PASTEMAC(s,op), \ PASTEMAC(c,op), \ PASTEMAC(d,op), \ PASTEMAC(z,op), \ PASTEMAC(i,op) \ } // 
-- One-operand macro (with custom prefix) -- #define GENARRAY_PREF(arrayname,prefix,op) \ \ arrayname[BLIS_NUM_FP_TYPES] = \ { \ PASTECH2(prefix,s,op), \ PASTECH2(prefix,c,op), \ PASTECH2(prefix,d,op), \ PASTECH2(prefix,z,op) \ } // -- Two-operand macros -- #define GENARRAY2_ALL(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \ { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \ { PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \ { PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) } \ } #define GENARRAY2_EXT(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL, NULL, }, \ { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL, NULL, }, \ { NULL, NULL, PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \ { NULL, NULL, PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) } \ } #define GENARRAY2_MIN(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { PASTEMAC2(s,s,op), NULL, NULL, NULL, }, \ { NULL, PASTEMAC2(c,c,op), NULL, NULL, }, \ { NULL, NULL, PASTEMAC2(d,d,op), NULL, }, \ { NULL, NULL, NULL, PASTEMAC2(z,z,op) } \ } // -- Three-operand macros -- #define GENARRAY3_ALL(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { \ { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \ { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \ { PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \ { PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) } \ }, \ { \ { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \ { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \ { PASTEMAC3(c,d,s,op), 
PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \ { PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) } \ }, \ { \ { PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \ { PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \ { PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \ { PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) } \ }, \ { \ { PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \ { PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \ { PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \ { PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) } \ } \ } #define GENARRAY3_EXT(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { \ { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL, NULL, }, \ { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL, NULL, }, \ { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \ { NULL, NULL, PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \ { NULL, NULL, PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) } \ } \ } #define GENARRAY3_MIN(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { \ { PASTEMAC3(s,s,s,op), NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, 
}, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, PASTEMAC3(c,c,c,op), NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, PASTEMAC3(d,d,d,op), NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, PASTEMAC3(z,z,z,op) } \ } \ } #endif // end bli_genarray_macro_defs.h // begin bli_gentdef_macro_defs.h #ifndef BLIS_GENTDEF_MACRO_DEFS_H #define BLIS_GENTDEF_MACRO_DEFS_H // // -- MACROS TO INSERT TYPEDEF-GENERATING MACROS ------------------------------- // // -- function typedef macro (both typed and void) -- #define INSERT_GENTDEF( opname ) \ \ GENTDEF( float, s, opname, _ft ) \ GENTDEF( double, d, opname, _ft ) \ GENTDEF( scomplex, c, opname, _ft ) \ GENTDEF( dcomplex, z, opname, _ft ) \ \ GENTDEF( void, s, opname, _vft ) \ GENTDEF( void, d, opname, _vft ) \ GENTDEF( void, c, opname, _vft ) \ GENTDEF( void, z, opname, _vft ) \ \ GENTDEF( void, , opname, _vft ) // -- function typedef macro (both typed and void) with real projection -- #define INSERT_GENTDEFR( opname ) \ \ GENTDEFR( float, float, s, s, opname, _ft ) \ GENTDEFR( double, double, d, d, opname, _ft ) \ GENTDEFR( scomplex, float, c, s, opname, _ft ) \ GENTDEFR( dcomplex, double, z, d, opname, _ft ) \ \ GENTDEFR( void, void, s, s, opname, _vft ) \ GENTDEFR( void, void, d, d, opname, _vft ) \ GENTDEFR( void, void, c, s, opname, _vft ) \ GENTDEFR( void, void, z, d, opname, _vft ) \ \ GENTDEFR( void, void, , , opname, _vft ) #endif // end bli_gentdef_macro_defs.h // begin bli_gentfunc_macro_defs.h #ifndef BLIS_GENTFUNC_MACRO_DEFS_H #define BLIS_GENTFUNC_MACRO_DEFS_H // // -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------ // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- 
// Each INSERT_GENTFUNC*_BLAS macro in this group expands to one invocation of
// the GENTFUNC* macro named in its body per datatype row. The GENTFUNC* macros
// are not defined here; the including code must define the one it needs before
// using an INSERT_* macro. In every row the type character is paired with the
// C type written next to it (s/float, d/double, c/scomplex, z/dcomplex).
// Each directive sits on its own physical line and '\' is the last character
// of every continued line, so a '//' comment cannot swallow a later #define.

#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \
\
GENTFUNC( float, s, blasname, blisname ) \
GENTFUNC( double, d, blasname, blisname ) \
GENTFUNC( scomplex, c, blasname, blisname ) \
GENTFUNC( dcomplex, z, blasname, blisname )


// -- Basic one-operand macro with real domain only --

#define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \
\
GENTFUNCRO( float, s, blasname, blisname ) \
GENTFUNCRO( double, d, blasname, blisname )


// -- Basic one-operand macro with complex domain only and real projection --

#define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \
\
GENTFUNCCO( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCCO( dcomplex, double, z, d, blasname, blisname )


// -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) --

// The third argument is intentionally empty for the real instances.
#define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
\
GENTFUNCDOT( float, s, , BLIS_NO_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( double, d, , BLIS_NO_CONJUGATE, blasname, blisname )


// -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) --

// Each complex type gets two rows: 'c' with BLIS_CONJUGATE and 'u' with
// BLIS_NO_CONJUGATE.
#define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \
\
GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname )


// -- Basic one-operand macro with conjugation (used only for dot, ger) --

// Real rows followed by complex rows.
#define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \
\
INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
INSERT_GENTFUNCDOTC_BLAS( blasname, blisname )


// -- Basic one-operand macro with real projection --

// Real rows use rblasname; complex rows use cblasname.
#define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \
\
GENTFUNCR( float, float, s, s, rblasname, blisname ) \
GENTFUNCR( double, double, d, d, rblasname, blisname ) \
GENTFUNCR( scomplex, float, c, s, cblasname, blisname ) \
GENTFUNCR( dcomplex, double, z, d, cblasname, blisname )


// -- Alternate
// two-operand macro (one char for complex, one for real proj) --

// Each INSERT_* macro in this group expands to one invocation of the GENTFUNC*
// macro named in its body per datatype row, forwarding its own arguments
// unchanged after the type and type-character columns. The GENTFUNC* macros
// are not defined here; the including code must define them before use.
// Each directive sits on its own physical line and '\' is the last character
// of every continued line.

// The fourth argument is intentionally empty for the real instances.
#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
\
GENTFUNCR2( float, float, s, , blasname, blisname ) \
GENTFUNCR2( double, double, d, , blasname, blisname ) \
GENTFUNCR2( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )


// -- Extended two-operand macro (used only for scal) --

// Four same-type rows (empty fourth argument) followed by two rows that pair
// a complex type with its real counterpart.
#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
\
GENTFUNCSCAL( float, float, s, , blasname, blisname ) \
GENTFUNCSCAL( double, double, d, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \
GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname )


// -- Macros for functions with one operand ------------------------------------


// -- Basic one-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
\
GENTFUNC( float, s, tfuncname ) \
GENTFUNC( double, d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
\
GENTFUNC( float, s, tfuncname, varname ) \
GENTFUNC( double, d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand with real projection --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
\
GENTFUNCR( float, float, s, s, tfuncname ) \
GENTFUNCR( double, double, d, d, tfuncname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname ) \
GENTFUNCR( double, double, d, d, tfuncname, varname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
// arguments) --

// Each INSERT_* macro in this group expands to one invocation of the GENTFUNC*
// macro named in its body per datatype row, forwarding its own arguments
// unchanged after the type and type-character columns. The GENTFUNC* macros
// are not defined here; the including code must define them before use.
// Each directive sits on its own physical line, '\' is the last character of
// every continued line, and the final row of each macro carries no '\' so
// that nothing following a definition can be spliced into it.

#define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand macro with real domain only --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \
\
GENTFUNCRO( float, s, tfuncname ) \
GENTFUNCRO( double, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \
\
GENTFUNCRO( float, s, tfuncname, varname ) \
GENTFUNCRO( double, d, tfuncname, varname )


// -- Basic one-operand macro with complex domain only and real projection --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )


// -- Basic one-operand macro with integer instance --

// -- (no auxiliary arguments) --

// Same four rows as the basic macro plus a fifth row for gint_t / i.
#define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \
\
GENTFUNC( float, s, tfuncname ) \
GENTFUNC( double, d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname ) \
GENTFUNC( gint_t, i, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \
\
GENTFUNC( float, s, tfuncname, varname ) \
GENTFUNC( double, d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname ) \
GENTFUNC( gint_t, i, tfuncname, varname )


// -- Basic one-operand with integer projection --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCI_BASIC0( tfuncname ) \
\
GENTFUNCI( float, gint_t, s, i, tfuncname ) \
GENTFUNCI( double, gint_t, d, i, tfuncname ) \
GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \
GENTFUNCI( dcomplex, gint_t, z, i, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \
\
GENTFUNCI( float, gint_t, s, i, tfuncname, varname ) \
GENTFUNCI( double, gint_t, d, i, tfuncname, varname ) \
GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \
GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname )


// -- Basic one-operand with real and integer projections --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \
\
GENTFUNCRI( float, float, gint_t, s, s, i, tfuncname ) \
GENTFUNCRI( double, double, gint_t, d, d, i, tfuncname ) \
GENTFUNCRI( scomplex, float, gint_t, c, s, i, tfuncname ) \
GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname )


// -- Macros for functions with two primary operands ---------------------------


// -- Basic two-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_BASIC0( tfuncname ) \
\
GENTFUNC2( float, float, s, s, tfuncname ) \
GENTFUNC2( double, double, d, d, tfuncname ) \
GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \
GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \
\
GENTFUNC2( float, float, s, s, tfuncname, varname ) \
GENTFUNC2( double, double, d, d, tfuncname, varname ) \
GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \
GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname )


// -- Mixed domain two-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \
\
GENTFUNC2( float, scomplex, s, c, tfuncname ) \
GENTFUNC2( scomplex, float, c, s, tfuncname ) \
\
GENTFUNC2( double, dcomplex, d, z, tfuncname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \
\
GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \
GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \
\
GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname, varname )


// -- Mixed precision two-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \
\
GENTFUNC2( float, double, s, d, tfuncname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname ) \
\
GENTFUNC2( double, float, d, s, tfuncname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname ) \
\
GENTFUNC2( scomplex, double, c, d, tfuncname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \
\
GENTFUNC2( float, double, s, d, tfuncname, varname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTFUNC2( double, float, d, s, tfuncname, varname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \
\
GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro --

// -- (no auxiliary arguments) -- #define
// Each INSERT_* macro in this group expands to one invocation of the GENTFUNC*
// macro named in its body per datatype-combination row, forwarding its own
// arguments unchanged after the type and type-character columns. The GENTFUNC*
// macros are not defined here; the including code must define them before use.
// Each directive sits on its own physical line, '\' is the last character of
// every continued line, and the final row of each macro carries no '\' so
// that nothing following a definition can be spliced into it.

#define INSERT_GENTFUNC2_MIXDP0( tfuncname ) \
\
GENTFUNC2( float, double, s, d, tfuncname ) \
GENTFUNC2( float, scomplex, s, c, tfuncname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname ) \
\
GENTFUNC2( double, float, d, s, tfuncname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname ) \
GENTFUNC2( double, dcomplex, d, z, tfuncname ) \
\
GENTFUNC2( scomplex, float, c, s, tfuncname ) \
GENTFUNC2( scomplex, double, c, d, tfuncname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \
\
GENTFUNC2( float, double, s, d, tfuncname, varname ) \
GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTFUNC2( double, float, d, s, tfuncname, varname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \
GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \
\
GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Basic two-operand with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \
\
GENTFUNC2R( float, float, float, s, s, s, tfuncname ) \
GENTFUNC2R( double, double, double, d, d, d, tfuncname ) \
GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname ) \
GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \
\
GENTFUNC2R( float, float, float, s, s, s, tfuncname, varname ) \
GENTFUNC2R( double, double, double, d, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )


// -- Mixed domain two-operand with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \
\
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \
\
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \
\
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname )


// -- Mixed precision two-operand with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \
\
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \
\
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \
\
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
\
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname )


// -- Macros for functions with three primary operands -------------------------


// -- Basic three-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_BASIC0( tfuncname ) \
\
GENTFUNC3( float, float, float, s, s, s, tfuncname ) \
GENTFUNC3( double, double, double, d, d, d, tfuncname ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \
\
GENTFUNC3( float, float, float, s, s, s, tfuncname, varname ) \
GENTFUNC3( double, double, double, d, d, d, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, float, float, s, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, double, double, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 )


// -- Mixed domain three-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \
\
GENTFUNC3( float, float, scomplex, s, s, c, tfuncname ) \
GENTFUNC3( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname ) \
\
GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname ) \
GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname ) \
GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname ) \
\
GENTFUNC3( scomplex, float, float, c, s, s, tfuncname ) \
GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname ) \
GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname ) \
\
GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname ) \
GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \
\
GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname ) \
GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname ) \
\
GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname ) \
GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname ) \
GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname1, varname2 )


// -- Mixed precision three-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \
\
GENTFUNC3( float, float, double, s, s, d, tfuncname ) \
GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname ) \
\
GENTFUNC3( float, double, float, s, d, s, tfuncname ) \
GENTFUNC3( float, double, double, s, d, d, tfuncname ) \
GENTFUNC3( float, double, scomplex, s, d, c, tfuncname ) \
GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname ) \
\
GENTFUNC3( float, scomplex, double, s, c, d, tfuncname ) \
GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname ) \
\
GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname ) \
GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname ) \
GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname ) \
GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname ) \
\
\
GENTFUNC3( double, float, float, d, s, s, tfuncname ) \
GENTFUNC3( double, float, double, d, s, d, tfuncname ) \
GENTFUNC3( double, float, scomplex, d, s, c, tfuncname ) \
GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname ) \
\
GENTFUNC3( double, double, float, d, d, s, tfuncname ) \
GENTFUNC3( double, double, scomplex, d, d, c, tfuncname ) \
\
GENTFUNC3( double, scomplex, float, d, c, s, tfuncname ) \
GENTFUNC3( double, scomplex, double, d, c, d, tfuncname ) \
GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname ) \
GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname ) \
\
GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname ) \
GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname ) \
\
\
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname ) \
GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname ) \
\
GENTFUNC3( scomplex, double, float, c, d, s, tfuncname ) \
GENTFUNC3( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname ) \
GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname ) \
\
GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \
\
GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \
\
\
GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname ) \
GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname ) \
GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname ) \
\
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \
\
GENTFUNC3( float, float, double, s, s, d, tfuncname, varname ) \
GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname ) \
\
GENTFUNC3( float, double, float, s, d, s, tfuncname, varname ) \
GENTFUNC3( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname ) \
GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname ) \
\
GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname ) \
GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname ) \
\
GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname ) \
GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname ) \
GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname ) \
GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname ) \
\
\
GENTFUNC3( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC3( double, float, double, d, s, d, tfuncname, varname ) \
GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname ) \
GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname ) \
\
GENTFUNC3( double, double, float, d, d, s, tfuncname, varname ) \
GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname ) \
\
GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname ) \
GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname ) \
GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname ) \
GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname ) \
\
GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname ) \
GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname ) \
\
\
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname ) \
GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname ) \
GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname ) \
GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \
\
\
GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, float, double, s, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, double, float, s, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, double, double, s, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( double, float, float, d, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, float, double, d, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, double, float, d, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 )


// -- Basic three-operand with union of operands 1 and 2 --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname1, varname2 )


// -- Mixed domain three-operand with union of operands 1 and 2 --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname )

// NOTE(review): the definition on the next line continues past the end of this span and is left untouched.
// -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z,
d, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \ GENTFUNC3U12( float, 
double, dcomplex, double, s, d, z, d, tfuncname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, 
dcomplex, scomplex, c, c, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, 
dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, 
tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, 
double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( 
scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, 
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 )


#endif
// end bli_gentfunc_macro_defs.h
// begin bli_gentprot_macro_defs.h

#ifndef BLIS_GENTPROT_MACRO_DEFS_H
#define BLIS_GENTPROT_MACRO_DEFS_H

//
// -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS -----------------------------
//
// Every INSERT_GENTPROT* macro in this header expands to a fixed list of
// GENTPROT*( ... ) invocations, one per row of its table. Each row passes a
// C type name (float, double, scomplex, dcomplex, gint_t) together with its
// one-letter tag (s, d, c, z, i), followed by the caller's arguments
// unchanged. The GENTPROT* macros are not defined in this part of the file;
// presumably the code that expands an INSERT_* macro defines the matching
// GENTPROT* macro first (TODO confirm against the callers).
//



// -- Macros for generating BLAS routines --------------------------------------


// -- Basic one-operand macro --

// One GENTPROT row for each of s, d, c, z.
#define INSERT_GENTPROT_BLAS( blasname ) \
\
GENTPROT( float, s, blasname ) \
GENTPROT( double, d, blasname ) \
GENTPROT( scomplex, c, blasname ) \
GENTPROT( dcomplex, z, blasname )


// -- Basic one-operand macro with real domain only --

// Only the float/s and double/d rows are emitted.
#define INSERT_GENTPROTRO_BLAS( blasname ) \
\
GENTPROTRO( float, s, blasname ) \
GENTPROTRO( double, d, blasname )


// -- Basic one-operand macro with complex domain only and real projection --

// Only the complex rows are emitted; each also passes the real type of the
// same precision (scomplex -> float, dcomplex -> double).
#define INSERT_GENTPROTCO_BLAS( blasname ) \
\
GENTPROTCO( scomplex, float, c, s, blasname ) \
GENTPROTCO( dcomplex, double, z, d, blasname )


// -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) --

// The third GENTPROTDOT argument is intentionally left empty for the real
// types.
#define INSERT_GENTPROTDOTR_BLAS( blasname ) \
\
GENTPROTDOT( float, s, , blasname ) \
GENTPROTDOT( double, d, , blasname )


// -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) --

// Each complex type is emitted twice: once with the tag c and once with the
// tag u as third argument.
#define INSERT_GENTPROTDOTC_BLAS( blasname ) \
\
GENTPROTDOT( scomplex, c, c, blasname ) \
GENTPROTDOT( scomplex, c, u, blasname ) \
GENTPROTDOT( dcomplex, z, c, blasname ) \
GENTPROTDOT( dcomplex, z, u, blasname )


// -- Basic one-operand macro with conjugation (used only for dot, ger) --

// Real rows followed by complex rows, via the two macros defined above.
#define INSERT_GENTPROTDOT_BLAS( blasname ) \
\
INSERT_GENTPROTDOTR_BLAS( blasname ) \
INSERT_GENTPROTDOTC_BLAS( blasname )


// -- Basic one-operand macro with real projection --

// The real rows receive rblasname, the complex rows receive cblasname.
#define INSERT_GENTPROTR_BLAS( rblasname, cblasname ) \
\
GENTPROTR( float, float, s, s, rblasname ) \
GENTPROTR( double, double, d, d, rblasname ) \
GENTPROTR( scomplex, float, c, s, cblasname ) \
GENTPROTR( dcomplex, double, z, d, cblasname )


// -- Alternate two-operand macro (one char for complex, one for real proj) --

// The first tag argument is left empty on the two real rows.
#define INSERT_GENTPROTR2_BLAS( blasname ) \
\
GENTPROTR2( float, float, , s, blasname ) \
GENTPROTR2( double, double, , d, blasname ) \
GENTPROTR2( scomplex, float, c, s, blasname ) \
GENTPROTR2( dcomplex, double, z, d, blasname )


// -- Extended two-operand macro (used only for scal) --

// Four same-type rows (first tag empty) plus two mixed rows that pair a real
// first type with the complex second type of the same precision.
#define INSERT_GENTPROTSCAL_BLAS( blasname ) \
\
GENTPROTSCAL( float, float, , s, blasname ) \
GENTPROTSCAL( double, double, , d, blasname ) \
GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \
GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \
GENTPROTSCAL( float, scomplex, s, c, blasname ) \
GENTPROTSCAL( double, dcomplex, d, z, blasname )




// -- Macros for functions with one operand ------------------------------------


// -- Basic one-operand macro --

// The BASIC0 / BASIC / BASIC2 / BASIC3 / BASIC4 variants differ only in how
// many extra arguments are forwarded after tfuncname (zero to four).

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT_BASIC0( tfuncname ) \
\
GENTPROT( float, s, tfuncname ) \
GENTPROT( double, d, tfuncname ) \
GENTPROT( scomplex, c, tfuncname ) \
GENTPROT( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT_BASIC( tfuncname, varname ) \
\
GENTPROT( float, s, tfuncname, varname ) \
GENTPROT( double, d, tfuncname, varname ) \
GENTPROT( scomplex, c, tfuncname, varname ) \
GENTPROT( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTPROT( float, s, tfuncname, varname1, varname2 ) \
GENTPROT( double, d, tfuncname, varname1, varname2 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTPROT( float, s, tfuncname, 
varname1, varname2, varname3 ) \
GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

// One GENTPROT row for each of s, d, c, z; the four extra arguments are
// forwarded unchanged.
#define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand with real projection --

// Each row passes the operand type plus the real type of the same precision
// (float for s and c, double for d and z) and both one-letter tags. The
// variants below differ only in the number of forwarded extra arguments.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROTR_BASIC0( tfuncname ) \
\
GENTPROTR( float, float, s, s, tfuncname ) \
GENTPROTR( double, double, d, d, tfuncname ) \
GENTPROTR( scomplex, float, c, s, tfuncname ) \
GENTPROTR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname ) \
GENTPROTR( double, double, d, d, tfuncname, varname ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand macro with complex domain only and real projection --

// Only the two complex rows are emitted, each paired with its real type.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROTCO_BASIC0( tfuncname ) \
\
GENTPROTCO( scomplex, float, c, s, tfuncname ) \
GENTPROTCO( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \
\
GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \
GENTPROTCO( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )


// -- Basic one-operand macro with integer instance --

// Same four rows as the basic one-operand macros plus a fifth row for
// gint_t with the tag i.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT_BASIC0_I( funcname ) \
\
GENTPROT( float, s, funcname ) \
GENTPROT( double, d, funcname ) \
GENTPROT( scomplex, c, funcname ) \
GENTPROT( dcomplex, z, funcname ) \
GENTPROT( gint_t, i, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \
\
GENTPROT( float, s, tfuncname, varname ) \
GENTPROT( double, d, tfuncname, varname ) \
GENTPROT( scomplex, c, tfuncname, varname ) \
GENTPROT( dcomplex, z, tfuncname, varname ) \
GENTPROT( gint_t, i, tfuncname, varname )


// -- Basic one-operand with integer projection --

// Each of the four floating-point rows is paired with gint_t / i.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROTI_BASIC0( funcname ) \
\
GENTPROTI( float, gint_t, s, i, funcname ) \
GENTPROTI( double, gint_t, d, i, funcname ) \
GENTPROTI( scomplex, gint_t, c, i, funcname ) \
GENTPROTI( dcomplex, gint_t, z, i, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \
\
GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \
GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \
GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \
GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname )


// -- Basic one-operand with real and integer projections --

// -- (no auxiliary arguments) --

// NOTE(review): this macro forwards no auxiliary argument yet carries no
// "0" suffix, unlike the other no-argument inserters in this header.
#define INSERT_GENTPROTRI_BASIC( funcname ) \
\
GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \
GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \
GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \
GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname )




// -- Macros for functions with two primary operands ---------------------------


// -- Basic two-operand macro --

// Both operands have the same type on every row.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_BASIC0( funcname ) \
\
GENTPROT2( float, float, s, s, funcname ) \
GENTPROT2( double, double, d, d, funcname ) \
GENTPROT2( scomplex, scomplex, c, c, funcname ) \
GENTPROT2( dcomplex, dcomplex, z, z, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \
\
GENTPROT2( float, float, s, s, tfuncname, varname ) \
GENTPROT2( double, double, d, d, tfuncname, varname ) \
GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \
GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname )


// -- Mixed domain two-operand macro --

// Rows pair a real and a complex type of the same precision, in both orders.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_MIX_D0( funcname ) \
\
GENTPROT2( float, scomplex, s, c, funcname ) \
GENTPROT2( scomplex, float, c, s, funcname ) \
\
GENTPROT2( double, dcomplex, d, z, funcname ) \
GENTPROT2( dcomplex, double, z, d, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2_MIX_D( 
tfuncname, varname ) \
\
GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
\
GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
GENTPROT2( dcomplex, double, z, d, tfuncname, varname )


// -- Mixed precision two-operand macro --

// Expands to one GENTPROT2 invocation for every ordered pair of types whose
// precisions differ (single vs. double), eight rows in total. GENTPROT2 is
// not defined in this part of the file; presumably the expanding code defines
// it first (TODO confirm).
//
// The last row of each macro below deliberately carries NO trailing
// line-continuation backslash: a backslash there would splice whatever line
// follows into the macro, so the macro's extent would depend on the next
// line staying blank.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_MIX_P0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
\
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --

// Same eight rows as INSERT_GENTPROT2_MIX_P0, with varname forwarded as an
// extra trailing argument.
#define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \
\
GENTPROT2( float, double, s, d, tfuncname, varname ) \
GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double, float, d, s, tfuncname, varname ) \
GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
\
GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro --

// All twelve ordered pairs of two different types.
// NOTE(review): the name is spelled MIXDP0 (no underscore between MIX and
// DP), unlike the _MIX_D0 / _MIX_P0 siblings; it is kept as-is because
// renaming would break any code that already expands it.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_MIXDP0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, scomplex, s, c, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
GENTPROT2( double, dcomplex, d, z, funcname ) \
\
GENTPROT2( scomplex, float, c, s, funcname ) \
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, double, z, d, funcname ) \
GENTPROT2( 
dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --

// All twelve ordered pairs of two different types, with varname forwarded as
// an extra trailing argument.
// NOTE(review): the no-argument sibling is spelled INSERT_GENTPROT2_MIXDP0
// (no underscore between MIX and DP), whereas this one is _MIX_DP.
#define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \
\
GENTPROT2( float, double, s, d, tfuncname, varname ) \
GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double, float, d, s, tfuncname, varname ) \
GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
\
GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Basic two-operand with real projection of first operand --

// Every GENTPROT2R row passes three types and three tags: the two operand
// types, then the real type with the same precision as the FIRST operand
// (float for s/c, double for d/z).

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2R_BASIC0( funcname ) \
\
GENTPROT2R( float, float, float, s, s, s, funcname ) \
GENTPROT2R( double, double, double, d, d, d, funcname ) \
GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \
GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \
\
GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \
GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \
GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )


// -- Mixed domain two-operand with real projection of first operand --

// Real/complex pairs of the same precision, in both orders.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \
\
GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \
\
GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \
GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \
\
GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname )


// -- Mixed precision two-operand with real projection of first operand --

// The eight ordered pairs whose precisions differ.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname )




// -- Macros for functions with three primary operands -------------------------


// -- Basic three-operand macro --

// All three operands have the same type on every row.
#define INSERT_GENTPROT3_BASIC( funcname ) \
\
GENTPROT3( float, float, float, s, s, s, funcname ) \
GENTPROT3( double, double, double, d, d, d, funcname ) \
GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \
GENTPROT3( dcomplex, dcomplex, 
dcomplex, z, z, z, funcname ) // -- Mixed domain three-operand macro -- #define INSERT_GENTPROT3_MIX_D( funcname ) \ \ GENTPROT3( float, float, scomplex, s, s, c, funcname ) \ GENTPROT3( float, scomplex, float, s, c, s, funcname ) \ GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \ \ GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \ GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \ GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \ \ GENTPROT3( scomplex, float, float, c, s, s, funcname ) \ GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \ GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \ \ GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \ GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \ GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname ) // -- Mixed precision three-operand macro -- #define INSERT_GENTPROT3_MIX_P( funcname ) \ \ GENTPROT3( float, float, double, s, s, d, funcname ) \ GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \ \ GENTPROT3( float, double, float, s, d, s, funcname ) \ GENTPROT3( float, double, double, s, d, d, funcname ) \ GENTPROT3( float, double, scomplex, s, d, c, funcname ) \ GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \ \ GENTPROT3( float, scomplex, double, s, c, d, funcname ) \ GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \ \ GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \ GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \ GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \ GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \ \ \ GENTPROT3( double, float, float, d, s, s, funcname ) \ GENTPROT3( double, float, double, d, s, d, funcname ) \ GENTPROT3( double, float, scomplex, d, s, c, funcname ) \ GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \ \ GENTPROT3( double, double, float, d, d, s, funcname ) \ GENTPROT3( double, double, scomplex, d, d, c, funcname ) \ \ GENTPROT3( double, 
scomplex, float, d, c, s, funcname ) \ GENTPROT3( double, scomplex, double, d, c, d, funcname ) \ GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \ GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \ \ GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \ GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \ \ \ GENTPROT3( scomplex, float, double, c, s, d, funcname ) \ GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \ \ GENTPROT3( scomplex, double, float, c, d, s, funcname ) \ GENTPROT3( scomplex, double, double, c, d, d, funcname ) \ GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \ GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \ \ GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \ GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \ \ GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \ GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \ GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \ GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \ \ \ GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \ GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \ GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \ GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \ \ GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \ GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \ \ GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \ GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \ GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \ GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \ \ GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \ GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname ) \ // -- Basic three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_BASIC( funcname ) \ \ GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \ 
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \ GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname ) // -- Mixed domain three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_D( funcname ) \ \ GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \ GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \ GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \ \ GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \ GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \ GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \ \ GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \ GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \ \ GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \ GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname ) // -- Mixed precision three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_P( funcname ) \ \ GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \ GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \ \ GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \ GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \ GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \ GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \ \ GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \ GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \ \ GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, 
z, funcname ) \ GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \ GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \ GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \ \ \ GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \ GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \ GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \ GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \ \ GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \ GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \ \ GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \ GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \ GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \ GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \ \ GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \ GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \ \ \ GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \ GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \ \ GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \ GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \ GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \ GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \ \ GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \ \ GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \ GENTPROT3U12( 
scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \ \ \ GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \ GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \ GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \ GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \ GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \ \ GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname ) #endif // end bli_gentprot_macro_defs.h // begin bli_misc_macro_defs.h #ifndef BLIS_MISC_MACRO_DEFS_H #define BLIS_MISC_MACRO_DEFS_H // -- Miscellaneous macros -- // min, max, abs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) #define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) #define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) ) // fmin, fmax, fabs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_fmin( a, b ) bli_min( a, b ) #define bli_fmax( a, b ) bli_max( a, b ) #define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) ) // fminabs, fmaxabs // NOTE: These must remain macros since we don't know the types of a and b. 
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif // end bli_edge_case_macro_defs.h // begin bli_param_macro_defs.h #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ); } // pruning-related BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the left side of the matrix, // ignore the area above that intersection. if ( *diagoff < 0 ) { *m = *m + *diagoff; *offm_inc = - *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the bottom side of the matrix, // ignore the area to the right of that intersection. if ( *n > *diagoff + *m ) { *n = *diagoff + *m; } } BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the top side of the matrix, // ignore the area to the left of that intersection. if ( *diagoff > 0 ) { *n = *n - *diagoff; *offn_inc = + *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the right side of the matrix, // ignore the area below that intersection. 
if ( *m > -(*diagoff) + *n ) { *m = -(*diagoff) + *n; } } // thread range-related BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { *diagoff = *n - *diagoff - *m; bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { bli_swap_dims( m, n ); bli_negate_diag_offset( diagoff ); bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end ) { dim_t start2 = n - *start; dim_t end2 = n - *end; *start = end2; *end = start2; } // mdim_t-related BLIS_INLINE bool bli_is_m_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_M ); } BLIS_INLINE bool bli_is_n_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_N ); } BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim ) { return ( mdim_t ) ( mdim == BLIS_M ? BLIS_N : BLIS_M ); } BLIS_INLINE void bli_toggle_dim( mdim_t* mdim ) { *mdim = bli_dim_toggled( *mdim ); } // stor3_t-related BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b ) { // If any matrix is general-stored, return the stor3_t id for the // general-purpose sup microkernel. if ( bli_is_gen_stored( rs_c, cs_c ) || bli_is_gen_stored( rs_a, cs_a ) || bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX; // Otherwise, compute and return the stor3_t id as follows. 
const bool c_is_col = bli_is_col_stored( rs_c, cs_c ); const bool a_is_col = bli_is_col_stored( rs_a, cs_a ); const bool b_is_col = bli_is_col_stored( rs_b, cs_b ); return ( stor3_t )( 4 * c_is_col + 2 * a_is_col + 1 * b_is_col ); } BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id ) { #if 1 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )7, // BLIS_RRR = 0 -> BLIS_CCC = 7 ( stor3_t )5, // BLIS_RRC = 1 -> BLIS_CRC = 5 ( stor3_t )6, // BLIS_RCR = 2 -> BLIS_CCR = 6 ( stor3_t )4, // BLIS_RCC = 3 -> BLIS_CRR = 4 ( stor3_t )3, // BLIS_CRR = 4 -> BLIS_RCC = 3 ( stor3_t )1, // BLIS_CRC = 5 -> BLIS_RRC = 1 ( stor3_t )2, // BLIS_CCR = 6 -> BLIS_RCR = 2 ( stor3_t )0, // BLIS_CCC = 7 -> BLIS_RRR = 0 }; return map[id]; #else return ( ( id & 0x4 ) ^ 0x4 ) | // flip c bit ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 ); // flip a bit and move to b position #endif } BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )1, // BLIS_RRR = 0 -> BLIS_RRC = 1 ( stor3_t )0, // BLIS_RRC = 1 -> BLIS_RRR = 0 ( stor3_t )3, // BLIS_RCR = 2 -> BLIS_RCC = 3 ( stor3_t )2, // BLIS_RCC = 3 -> BLIS_RCR = 2 ( stor3_t )5, // BLIS_CRR = 4 -> BLIS_CRC = 5 ( stor3_t )4, // BLIS_CRC = 5 -> BLIS_CRR = 4 ( stor3_t )7, // BLIS_CCR = 6 -> BLIS_CCC = 7 ( stor3_t )6, // BLIS_CCC = 7 -> BLIS_CCR = 6 }; return map[id]; #else return ( stor3_t )( id ^ 0x1 ); #endif } BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )2, // BLIS_RRR = 0 -> BLIS_RCR = 2 ( stor3_t )3, // BLIS_RRC = 1 -> BLIS_RCC = 3 ( stor3_t )0, // BLIS_RCR = 2 -> BLIS_RRR = 0 ( stor3_t )1, // BLIS_RCC = 3 -> BLIS_RRC = 1 ( stor3_t )6, // BLIS_CRR = 4 -> BLIS_CCR = 6 ( stor3_t )7, // BLIS_CRC = 5 -> BLIS_CCC = 7 ( stor3_t )4, // BLIS_CCR = 6 -> BLIS_CRR = 4 ( stor3_t )5, // BLIS_CCC = 7 -> BLIS_CRC = 5 }; return map[id]; #else return ( stor3_t )( id ^ 0x2 ); #endif } // 
// index-related

// TRUE when i is the final forward iteration ( n_iter - 1 ) and there is a
// nonzero leftover ( n_left ).
BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i == n_iter - 1 && n_left != 0 );
}

// Logical complement of bli_is_edge_f().
BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i != n_iter - 1 || n_left == 0 );
}

// TRUE when i is iteration 0 and there is a nonzero leftover ( n_left ).
// n_iter is not consulted.
BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i == 0 && n_left != 0 );
}

// Logical complement of bli_is_edge_b().
BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i != 0 || n_left == 0 );
}

// "sl" variant: the last iteration is simply end_iter - 1. tid and nth are
// accepted for signature parity with the "rr" variant but are not used.
BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
    return ( bool )
           ( i == end_iter - 1 );
}

// "rr" variant: the last iteration is end_iter - 1 moved back by
// ( ( end_iter - tid - 1 ) % nth ).
BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
    return ( bool )
           ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
}

// Dispatch to the "sl" or "rr" variant depending on whether
// BLIS_ENABLE_JRIR_SLAB is defined at compile time.
BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
#ifdef BLIS_ENABLE_JRIR_SLAB
    return bli_is_last_iter_sl( i, end_iter, tid, nth );
#else // BLIS_ENABLE_JRIR_RR
    return bli_is_last_iter_rr( i, end_iter, tid, nth );
#endif
}

// packbuf_t-related

// Extract the pack buffer bitfield from buf_type and shift it down to
// bit 0.
BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type )
{
    return ( guint_t )
           ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT );
}

// pack_t-related

// TRUE when the BLIS_PACK_BIT of the schema is set.
BLIS_INLINE bool bli_is_packed( pack_t schema )
{
    return ( bool )
           ( schema & BLIS_PACK_BIT );
}

// TRUE when the row/column bit of the schema matches the value that
// distinguishes BLIS_BITVAL_PACKED_ROWS from BLIS_BITVAL_PACKED_UNSPEC.
BLIS_INLINE bool bli_is_row_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
                                                BLIS_BITVAL_PACKED_ROWS ) );
}

// TRUE when the row/column bit of the schema matches the value that
// distinguishes BLIS_BITVAL_PACKED_COLUMNS from BLIS_BITVAL_PACKED_UNSPEC.
BLIS_INLINE bool bli_is_col_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
                                                BLIS_BITVAL_PACKED_COLUMNS ) );
}

// TRUE when the BLIS_PACK_PANEL_BIT of the schema is set.
BLIS_INLINE bool bli_is_panel_packed( pack_t schema )
{
    return ( bool )
           ( schema & BLIS_PACK_PANEL_BIT );
}

// TRUE when the format bits of the schema equal BLIS_BITVAL_1R.
BLIS_INLINE bool bli_is_1r_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R );
}

// TRUE when the format bits of the schema equal BLIS_BITVAL_1E.
BLIS_INLINE bool bli_is_1e_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E );
}

// TRUE when the schema is either 1r- or 1e-packed.
BLIS_INLINE bool bli_is_1m_packed( pack_t schema )
{
    return ( bool )
           ( bli_is_1r_packed( schema ) ||
             bli_is_1e_packed( schema ) );
}

// TRUE when all format bits of the schema are zero.
BLIS_INLINE bool bli_is_nat_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 );
}

// TRUE when any format bit of the schema is set (complement of
// bli_is_nat_packed()).
BLIS_INLINE bool bli_is_ind_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 );
}

// Extract the format bitfield from the schema and shift it down to bit 0.
BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema )
{
    return ( guint_t )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT );
}


// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument.
//
// Inputs:  diagoffa, diaga, uploa, m, n, rs_a, cs_a.
// Outputs: *uplo_eff is always written; *ij0 and *n_shift are always
//          written (zero by default). When the matrix is found to be
//          entirely unstored, *uplo_eff is set to BLIS_ZEROS and the
//          remaining outputs (*n_elem_max, *n_iter, *inca, *lda) are left
//          untouched.
// The element and iteration dimensions (and the two increments) are swapped
// when bli_is_row_tilted() reports a row tilt, in which case the effective
// uplo is toggled and the effective diagonal offset is negated to match.

BLIS_INLINE void bli_set_dims_incs_uplo_1m
     (
       doff_t  diagoffa, diag_t diaga,
       uplo_t  uploa,    dim_t  m,          dim_t  n,      inc_t  rs_a, inc_t  cs_a,
       uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t*  ij0,      dim_t* n_shift
     )
{
    // This is to prevent the compiler from warning about uninitialized
    // variables.
    *ij0     = 0;
    *n_shift = 0;

    // If matrix A is entirely "unstored", that is, if either:
    // - A is lower-stored and entirely above the diagonal, or
    // - A is upper-stored and entirely below the diagonal
    // then we mark the storage as implicitly zero.
    if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
    {
        *uplo_eff = BLIS_ZEROS;
    }
    else
    {
        doff_t diagoffa_use_ = diagoffa;
        doff_t diagoff_eff_;
        dim_t  n_iter_max_;

        // For a unit diagonal, adjust the working diagonal offset via
        // bli_shift_diag_offset_to_shrink_uplo() before any further tests.
        if ( bli_is_unit_diag( diaga ) )
            bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

        // If matrix A is entirely "stored", that is, if either:
        // - A is upper-stored and entirely above the diagonal, or
        // - A is lower-stored and entirely below the diagonal
        // then we mark the storage as dense.
        if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
            uploa = BLIS_DENSE;

        // Start from the untransposed view: iterate over n columns of up
        // to m elements each.
        n_iter_max_  = n;
        *n_elem_max  = m;
        *inca        = rs_a;
        *lda         = cs_a;
        *uplo_eff    = uploa;
        diagoff_eff_ = diagoffa_use_;

        // On a row tilt, swap the roles of the two dimensions and keep
        // uplo and the diagonal offset consistent with the swapped view.
        if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) )
        {
            bli_swap_dims( &n_iter_max_, n_elem_max );
            bli_swap_incs( inca, lda );
            bli_toggle_uplo( uplo_eff );
            bli_negate_diag_offset( &diagoff_eff_ );
        }

        if ( bli_is_dense( *uplo_eff ) )
        {
            *n_iter = n_iter_max_;
        }
        else if ( bli_is_upper( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = 0;
                *n_shift    = -diagoff_eff_;
                *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
                *n_iter     = n_iter_max_;
            }
            else
            {
                *ij0        = diagoff_eff_;
                *n_shift    = 0;
                *n_iter     = n_iter_max_ - diagoff_eff_;
            }
        }
        else // if ( bli_is_lower( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = -diagoff_eff_;
                *n_shift    = 0;
                // NOTE: n_elem_max is reduced first; the n_iter computation
                // below reads the updated value.
                *n_elem_max = *n_elem_max + diagoff_eff_;
                *n_iter     = bli_min( *n_elem_max, bli_min( m, n ) );
            }
            else
            {
                *ij0        = 0;
                *n_shift    = diagoff_eff_;
                *n_iter     = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
            }
        }
    }
}

// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument (without column-wise stride optimization).
//
// Identical to bli_set_dims_incs_uplo_1m() except that the row-tilt check
// is omitted, so the dimensions and increments are never swapped: n is
// always the iteration dimension and m the element dimension.

BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap
     (
       doff_t  diagoffa, diag_t diaga,
       uplo_t  uploa,    dim_t  m,          dim_t  n,      inc_t  rs_a, inc_t  cs_a,
       uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t*  ij0,      dim_t* n_shift
     )
{
    // This is to prevent the compiler from warning about uninitialized
    // variables.
    *ij0     = 0;
    *n_shift = 0;

    // If matrix A is entirely "unstored", that is, if either:
    // - A is lower-stored and entirely above the diagonal, or
    // - A is upper-stored and entirely below the diagonal
    // then we mark the storage as implicitly zero.
    if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
    {
        *uplo_eff = BLIS_ZEROS;
    }
    else
    {
        doff_t diagoffa_use_ = diagoffa;
        doff_t diagoff_eff_;
        dim_t  n_iter_max_;

        // For a unit diagonal, adjust the working diagonal offset via
        // bli_shift_diag_offset_to_shrink_uplo() before any further tests.
        if ( bli_is_unit_diag( diaga ) )
            bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

        // If matrix A is entirely "stored", that is, if either:
        // - A is upper-stored and entirely above the diagonal, or
        // - A is lower-stored and entirely below the diagonal
        // then we mark the storage as dense.
        if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
            uploa = BLIS_DENSE;

        n_iter_max_  = n;
        *n_elem_max  = m;
        *inca        = rs_a;
        *lda         = cs_a;
        *uplo_eff    = uploa;
        diagoff_eff_ = diagoffa_use_;

        if ( bli_is_dense( *uplo_eff ) )
        {
            *n_iter = n_iter_max_;
        }
        else if ( bli_is_upper( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = 0;
                *n_shift    = -diagoff_eff_;
                *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
                *n_iter     = n_iter_max_;
            }
            else
            {
                *ij0        = diagoff_eff_;
                *n_shift    = 0;
                *n_iter     = n_iter_max_ - diagoff_eff_;
            }
        }
        else // if ( bli_is_lower( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = -diagoff_eff_;
                *n_shift    = 0;
                // NOTE: n_elem_max is reduced first; the n_iter computation
                // below reads the updated value.
                *n_elem_max = *n_elem_max + diagoff_eff_;
                *n_iter     = bli_min( *n_elem_max, bli_min( m, n ) );
            }
            else
            {
                *ij0        = 0;
                *n_shift    = diagoff_eff_;
                *n_iter     = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
            }
        }
    }
}

// Set dimensions and increments for TWO matrix arguments.
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); } BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); } BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); } // NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, // packbuf_t is stored/used from the context in order to support various // induced methods. (Though ideally the packbuf_t field would only be // present in the control tree). BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); } BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc ); } BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj ) { bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); } BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj ) { bli_obj_apply_conj( BLIS_CONJUGATE, obj ); } BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; } // Root matrix query BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) { return ( obj_t* )( obj->root ); } BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( 
bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); } // Root matrix modification BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) { obj->root = obj; } // Diagonal offset query BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) ); } // Diagonal offset modification BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off = ( doff_t )offset; } BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj ) { obj->diag_off = -(obj->diag_off); } BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off += ( doff_t )offset; } // Dimension query BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) { return ( obj->dim[ mdim ] ); } BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) : bli_obj_length( obj ) ); } BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) ); } BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 ); } // Stride/increment query BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) { return ( obj->rs ); } BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) { return ( obj->cs ); } BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) { return ( obj->is ); } BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); } // Note: The purpose of these functions is to obtain the length and width // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) ); } BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 ); } // Dimension modification BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj ) { obj->dim[ BLIS_M ] = m; } BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj ) { obj->dim[ BLIS_N ] = n; } BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) { obj->dim[ mdim ] = dim_val; } BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) { if ( bli_does_notrans( trans ) ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } else // if ( bli_does_trans( trans ) ) { bli_obj_set_length( n, obj ); bli_obj_set_width( m, obj ); } } // Stride/increment predicates // // NOTE: The following two macros differ from their non-obj counterparts // in that they do not identify m x 1 and 1 x n objects as row-stored and // column-stored, respectively, which is needed when considering packed // objects. But this is okay, since none of the invocations of these // "obj" macros are used on packed matrices. 
// BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); } // Stride/increment modification BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) { obj->rs = rs; } BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) { obj->cs = cs; } BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_row_stride( rs, obj ); bli_obj_set_col_stride( cs, obj ); } BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) { obj->is = is; } // Offset query BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) { return ( obj->off[ mdim ] ); } // Offset modification BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] = offset; } BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_set_off( BLIS_M, offm, obj ); bli_obj_set_off( BLIS_N, offn, obj ); } BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] += offset; } BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_inc_off( BLIS_M, offm, obj ); bli_obj_inc_off( BLIS_N, offn, obj ); } // Diagonal offset predicates BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) { return ( 
bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); } // Buffer address query BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) { return ( void* ) ( obj->buffer ); } // Buffer address modification BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) { obj->buffer = p; } // Bufferless scalar field query BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); } // Bufferless scalar field modification BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); } // Element size modification BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) { obj->elem_size = size; } // Packed matrix info query BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) { return ( obj->m_padded ); } BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) { return ( obj->n_padded ); } // Packed matrix info modification BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj ) { obj->m_padded = m; } BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj ) { obj->n_padded = n; } BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) { 
bli_obj_set_padded_length( m, obj ); bli_obj_set_padded_width( n, obj ); } // Packed panel info query BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) { return ( obj->m_panel ); } BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) { return ( obj->n_panel ); } BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) { return ( obj->pd ); } BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) { return ( obj->ps ); } // Packed panel info modification BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj ) { obj->m_panel = m; } BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj ) { obj->n_panel = n; } BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_panel_length( m, obj ); bli_obj_set_panel_width( n, obj ); } BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) { obj->pd = pd; } BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) { obj->ps = ps; } // stor3_t-related BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); inc_t rs_a, cs_a; inc_t rs_b, cs_b; if ( bli_obj_has_notrans( a ) ) { rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else { rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else { rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b ); } // -- User-provided information macros -- // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { return obj->pack_fn; } BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker_fn; } BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { return obj->ker_params; } // Function pointer modification 
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) { const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); bli_obj_set_pack_schema( schema_b, a ); bli_obj_set_pack_schema( schema_a, b ); } // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. BLIS_INLINE void bli_obj_induce_trans( obj_t* obj ) { // Induce transposition among basic fields. dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); if ( bli_obj_is_upper_or_lower( obj ) ) bli_obj_toggle_uplo( obj ); // Induce transposition among packed fields. dim_t m_padded = bli_obj_padded_length( obj ); dim_t n_padded = bli_obj_padded_width( obj ); dim_t m_panel = bli_obj_panel_length( obj ); dim_t n_panel = bli_obj_panel_width( obj ); bli_obj_set_padded_dims( n_padded, m_padded, obj ); bli_obj_set_panel_dims( n_panel, m_panel, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj ) { // NOTE: This function is only used in situations where the matrices // are guaranteed to not have structure or be packed. // Induce transposition among basic fields. 
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif // end bli_obj_macro_defs.h // begin bli_complex_macro_defs.h #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define 
bli_zimag( x ) ( cimag(x) ) #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_complex_macro_defs.h // begin bli_scalar_macro_defs.h #ifndef BLIS_SCALAR_MACRO_DEFS_H #define BLIS_SCALAR_MACRO_DEFS_H // -- Assignment/Accessor macros -- // NOTE: This macro is defined first since some of the other scalar macros // use it to abstract away the method used to assign complex values (ie: // whether fields of a struct are set directly or whether native C99 // assignment is used). // begin bli_sets.h #ifndef BLIS_SETS_H #define BLIS_SETS_H // sets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sssets( xr, xi, y ) { (y) = (xr); } #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } #define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } #define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, 
xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif // end bli_sets.h // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. // begin bli_setrs.h #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Set only the real component of y to xr.

// Real destinations: y is overwritten with xr.
#define bli_sssetrs( xr, y ) { (y) = (xr); }
#define bli_dssetrs( xr, y ) { (y) = (xr); }

#define bli_sdsetrs( xr, y ) { (y) = (xr); }
#define bli_ddsetrs( xr, y ) { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, struct-based: write through the real accessor; the
// imaginary member is not touched.
#define bli_scsetrs( xr, y ) { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y ) { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y ) { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y ) { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, C99 complex: rebuild y from xr and the current
// imaginary part of y.
#define bli_scsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands.
#define bli_ssetrs( xr, y )  bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y )  bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y )  bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y )  bli_dzsetrs( xr, y )

#endif
// end bli_setrs.h
// begin bli_setis.h


#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Set only the imaginary component of y to xi.

// Real destinations have no imaginary component: these expand to an empty
// statement and neither argument is evaluated.
#define bli_sssetis( xi, y ) { ; }
#define bli_dssetis( xi, y ) { ; }

#define bli_sdsetis( xi, y ) { ; }
#define bli_ddsetis( xi, y ) { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, struct-based: write through the imaginary accessor;
// the real member is not touched.
#define bli_scsetis( xi, y ) { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y ) { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y ) { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y ) { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, C99 complex: rebuild y from its current real part
// and xi.
#define bli_scsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands.
#define bli_ssetis( xi, y )  bli_sssetis( xi, y )
#define bli_dsetis( xi, y )  bli_ddsetis( xi, y )
#define bli_csetis( xi, y )  bli_scsetis( xi, y )
#define bli_zsetis( xi, y )  bli_dzsetis( xi, y )

#endif
// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h


#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Split x into its real part (stored in yr) and imaginary part (stored in
// yi). Real and integer sources yield a zero imaginary part.

// Destination components of type s.
#define bli_ssgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; }

// Destination components of type d.
#define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; }

// Destination type c.
#define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; }

// Destination type z.
#define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; }

// Destination type i: the imaginary part is always set to 0, even for
// complex sources.
#define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; }
#define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; }
#define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; }
#define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; }
#define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; }

// Single-character shorthands.
#define bli_sgets( x, yr, yi )  bli_ssgets( x, yr, yi )
#define bli_dgets( x, yr, yi )  bli_ddgets( x, yr, yi )
#define bli_cgets( x, yr, yi )  bli_csgets( x, yr, yi )
#define bli_zgets( x, yr, yi )  bli_zdgets( x, yr, yi )
#define bli_igets( x, yr, yi )  bli_idgets( x, yr, yi )

#endif
// end bli_gets.h

// -- Scalar constant initialization macros --

// begin bli_constants.h


#ifndef BLIS_CONSTANTS_H
#define BLIS_CONSTANTS_H

// return pointers to constants
// Each macro below casts the result of bli_obj_buffer_for_const(), called
// with the named datatype and the address of BLIS_ONE, BLIS_ZERO or
// BLIS_MINUS_ONE, to a pointer of the matching C type.

// 1

#define bli_s1 \
\
	( ( float*    ) bli_obj_buffer_for_const( BLIS_FLOAT,    &BLIS_ONE ) )

#define bli_d1 \
\
	( ( double*   ) bli_obj_buffer_for_const( BLIS_DOUBLE,   &BLIS_ONE ) )

#define bli_c1 \
\
	( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) )

#define bli_z1 \
\
	( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) )

#define bli_i1 \
\
	( ( gint_t*   ) bli_obj_buffer_for_const( BLIS_INT,      &BLIS_ONE ) )

// 0

#define bli_s0 \
\
	( ( float*    ) bli_obj_buffer_for_const( BLIS_FLOAT,    &BLIS_ZERO ) )

#define bli_d0 \
\
	( ( double*   ) bli_obj_buffer_for_const( BLIS_DOUBLE,   &BLIS_ZERO ) )

#define bli_c0 \
\
	( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) )

#define bli_z0 \
\
	( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) )

#define bli_i0 \
\
	( ( gint_t*   ) bli_obj_buffer_for_const( BLIS_INT,      &BLIS_ZERO ) )

// -1

#define bli_sm1 \
\
	( ( float*    ) bli_obj_buffer_for_const( BLIS_FLOAT,    &BLIS_MINUS_ONE ) )

#define bli_dm1 \
\
	( ( double*   ) bli_obj_buffer_for_const( BLIS_DOUBLE,   &BLIS_MINUS_ONE ) )

#define bli_cm1 \
\
	( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) )

#define bli_zm1 \
\
	( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) )

#define bli_im1 \
\
	( ( gint_t*   ) bli_obj_buffer_for_const( BLIS_INT,      &BLIS_MINUS_ONE ) )


#endif
// end bli_constants.h

// -- Separated scalar macros (separated real/imaginary values) --

// begin bli_absq2ris.h


#ifndef BLIS_ABSQ2RIS_H
#define BLIS_ABSQ2RIS_H

// absq2ris
// Store the squared magnitude of (ar, ai) in br. The real variants use only
// ar and leave bi untouched; the complex variants also zero bi.

#define bli_sabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar); \
}

#define bli_dabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar); \
}

#define bli_cabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar) + (ai) * (ai); \
	(bi) = 0.0F; \
}

#define bli_zabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar) + (ai) * (ai); \
	(bi) = 0.0; \
}

#endif

// end bli_absq2ris.h
// begin bli_abval2ris.h


#ifndef BLIS_ABVAL2RIS_H
#define BLIS_ABVAL2RIS_H

// abval2ris
// Store the absolute value of (xr, xi) in ar. The real variants use
// fabsf()/fabs() on xr and leave ai untouched. The complex variants divide
// each component by s = bli_fmaxabs( xr, xi ) before multiplying, take the
// result to be zero when s is zero, and also zero ai.
// NOTE(review): bli_fmaxabs() is defined elsewhere; presumably it returns
// the larger of |xr| and |xi| -- confirm.
// NOTE(review): the complex variants declare locals named `s` and `mag`, so
// an argument expression that mentions a caller variable of either name
// would bind to the macro's local instead.

#define bli_sabval2ris( xr, xi, ar, ai ) \
{ \
	(ar) = fabsf(xr); \
}

#define bli_dabval2ris( xr, xi, ar, ai ) \
{ \
	(ar) = fabs(xr); \
}

#define bli_cabval2ris( xr, xi, ar, ai ) \
{ \
	float  s   = bli_fmaxabs( (xr), (xi) ); \
	float  mag; \
	if ( s == 0.0F ) mag = 0.0F; \
	else \
	{ \
		mag = sqrtf( s ) * \
		      sqrtf( ( (xr) / s ) * (xr) + \
		             ( (xi) / s ) * (xi) ); \
	} \
	(ar)       = mag; \
	(ai)       = 0.0F; \
}

#define bli_zabval2ris( xr, xi, ar, ai ) \
{ \
	double s   = bli_fmaxabs( (xr), (xi) ); \
	double mag; \
	if ( s == 0.0 ) mag = 0.0; \
	else \
	{ \
		mag = sqrt( s ) * \
		      sqrt( ( (xr) / s ) * (xr) + \
		            ( (xi) / s ) * (xi) ); \
	} \
	(ar)       = mag; \
	(ai)       = 0.0; \
}

#endif
// end bli_abval2ris.h

// begin bli_addris.h


#ifndef BLIS_ADDRIS_H
#define BLIS_ADDRIS_H

// addris
// x := x + a on separated components. The real variants update only xr.

#define bli_saddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
}

#define bli_daddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
}

#define bli_caddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
	(xi) = (xi) + (ai); \
}

#define bli_zaddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
	(xi) = (xi) + (ai); \
}

#endif

// end bli_addris.h
// begin bli_addjris.h


#ifndef BLIS_ADDJRIS_H
#define BLIS_ADDJRIS_H

// addjris
// x := x + conj(a): forwards to the addris macro of the same type with the
// imaginary component of a negated.

#define bli_saddjris( ar, ai, xr, xi )  bli_saddris( (ar), -(ai), (xr), (xi) )
#define bli_daddjris( ar, ai, xr, xi )  bli_daddris( (ar), -(ai), (xr), (xi) )
#define bli_caddjris( ar, ai, xr, xi )  bli_caddris( (ar), -(ai), (xr), (xi) )
#define bli_zaddjris( ar, ai, xr, xi )  bli_zaddris( (ar), -(ai), (xr), (xi) )

#endif

// end bli_addjris.h
// begin bli_add3ris.h


#ifndef BLIS_ADD3RIS_H
#define BLIS_ADD3RIS_H

// add3ris
// c := a + b on separated components. The real variants write only cr.

#define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
}

#define bli_dadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
}

#define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
	(ci) = (ai) + (bi); \
}

#define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
	(ci) = (ai) + (bi); \
}

#endif

// end bli_add3ris.h
// begin bli_axpbyris.h


#ifndef BLIS_AXPBYRIS_H
#define BLIS_AXPBYRIS_H

// axpbyris
// y := a * x + b * y on separated components.

// Real kernel: uses only the real components.
#define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \
{ \
	(yr) = (ar) * (xr) + (br) * (yr); \
}

// Complex kernel: both results are computed into temporaries first, so the
// old yr and yi are read before either is overwritten.
#define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \
{ \
	const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \
	const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \
	(yr) = yt_r; \
	(yi) = yt_i; \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of b.
// - The fourth char encodes the type of y.

// -- (axby) = (??ss) ----------------------------------------------------------

// NOTE(review): the names in this table are spelled `...xpbyris` (without
// the `a` of `axpbyris`) and expand to `bli_rxxpbyris`, which is not one of
// the kernels defined above (`bli_rxaxpbyris`, `bli_cxaxpbyris`). As written,
// none of these names resolves to a defined macro -- confirm the intended
// spelling.

#define bli_ssssxpbyris  bli_rxxpbyris
#define bli_dsssxpbyris  bli_rxxpbyris
#define bli_csssxpbyris  bli_rxxpbyris
#define bli_zsssxpbyris  bli_rxxpbyris

#define bli_sdssxpbyris  bli_rxxpbyris
#define bli_ddssxpbyris  bli_rxxpbyris
#define bli_cdssxpbyris  bli_rxxpbyris
#define bli_zdssxpbyris  bli_rxxpbyris

#define bli_scssxpbyris  bli_rxxpbyris
#define bli_dcssxpbyris  bli_rxxpbyris
#define bli_ccssxpbyris  bli_rxxpbyris
#define bli_zcssxpbyris  bli_rxxpbyris

#define bli_szssxpbyris  bli_rxxpbyris
#define bli_dzssxpbyris  bli_rxxpbyris
#define bli_czssxpbyris  bli_rxxpbyris
#define bli_zzssxpbyris  bli_rxxpbyris

// NOTE: This series needs to be finished for all other char values for (by), but
// not until something in BLIS actually needs mixed-datatype axpbyris.
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
// Type-prefixed axpbyjris entry points; each forwards to a four-character
// same-datatype name.
// NOTE(review): verify that the four-character targets named here are
// actually defined earlier in this header.
#define bli_saxpbyjris bli_ssssaxpbyjris
#define bli_daxpbyjris bli_ddddaxpbyjris
#define bli_caxpbyjris bli_ccccaxpbyjris
#define bli_zaxpbyjris bli_zzzzaxpbyjris

#endif
// end bli_axpbyjris.h
// begin bli_axpyris.h
#ifndef BLIS_AXPYRIS_H
#define BLIS_AXPYRIS_H

// axpyris: y += a * x, on separate real (r) and imaginary (i) parts.
//
// Kernel naming used by the alias tables below:
// - rx: real update only; all imaginary arguments are ignored.
// - cx: full complex multiply-accumulate.
// - ro: a and x both contribute real and imaginary parts, but only the real
//       part of the product is accumulated (yi is untouched).
// - cr: real a (ai ignored) times complex x, accumulated into complex y.
// - rc: complex a times real x (xi ignored), accumulated into complex y.

#define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}

#define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
    (yi) += (ai) * (xr) + (ar) * (xi); \
}

#define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
}

#define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * (xi); \
}

#define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyris bli_rxaxpyris
#define bli_dssaxpyris bli_rxaxpyris
#define bli_cssaxpyris bli_rxaxpyris
#define bli_zssaxpyris bli_rxaxpyris

#define bli_sdsaxpyris bli_rxaxpyris
#define bli_ddsaxpyris bli_rxaxpyris
#define bli_cdsaxpyris bli_rxaxpyris
#define bli_zdsaxpyris bli_rxaxpyris

#define bli_scsaxpyris bli_rxaxpyris
#define bli_dcsaxpyris bli_rxaxpyris
#define bli_ccsaxpyris bli_roaxpyris
#define bli_zcsaxpyris bli_roaxpyris

#define bli_szsaxpyris bli_rxaxpyris
#define bli_dzsaxpyris bli_rxaxpyris
#define bli_czsaxpyris bli_roaxpyris
#define bli_zzsaxpyris bli_roaxpyris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyris bli_rxaxpyris
#define bli_dsdaxpyris bli_rxaxpyris
#define bli_csdaxpyris bli_rxaxpyris
#define bli_zsdaxpyris bli_rxaxpyris

#define bli_sddaxpyris bli_rxaxpyris
#define bli_dddaxpyris bli_rxaxpyris
#define bli_cddaxpyris bli_rxaxpyris
#define bli_zddaxpyris bli_rxaxpyris

#define bli_scdaxpyris bli_rxaxpyris
#define bli_dcdaxpyris bli_rxaxpyris
#define bli_ccdaxpyris bli_roaxpyris
#define bli_zcdaxpyris bli_roaxpyris

#define bli_szdaxpyris bli_rxaxpyris
#define bli_dzdaxpyris bli_rxaxpyris
#define bli_czdaxpyris bli_roaxpyris
#define bli_zzdaxpyris bli_roaxpyris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyris bli_rxaxpyris
#define bli_dscaxpyris bli_rxaxpyris
#define bli_cscaxpyris bli_rcaxpyris
#define bli_zscaxpyris bli_rcaxpyris

#define bli_sdcaxpyris bli_rxaxpyris
#define bli_ddcaxpyris bli_rxaxpyris
#define bli_cdcaxpyris bli_rcaxpyris
#define bli_zdcaxpyris bli_rcaxpyris

#define bli_sccaxpyris bli_craxpyris
#define bli_dccaxpyris bli_craxpyris
#define bli_cccaxpyris bli_cxaxpyris
#define bli_zccaxpyris bli_cxaxpyris

#define bli_szcaxpyris bli_craxpyris
#define bli_dzcaxpyris bli_craxpyris
#define bli_czcaxpyris bli_cxaxpyris
#define bli_zzcaxpyris bli_cxaxpyris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyris bli_rxaxpyris
#define bli_dszaxpyris bli_rxaxpyris
#define bli_cszaxpyris bli_rcaxpyris
#define bli_zszaxpyris bli_rcaxpyris

#define bli_sdzaxpyris bli_rxaxpyris
#define bli_ddzaxpyris bli_rxaxpyris
#define bli_cdzaxpyris bli_rcaxpyris
#define bli_zdzaxpyris bli_rcaxpyris

#define bli_sczaxpyris bli_craxpyris
#define bli_dczaxpyris bli_craxpyris
#define bli_cczaxpyris bli_cxaxpyris
#define bli_zczaxpyris bli_cxaxpyris

#define bli_szzaxpyris bli_craxpyris
#define bli_dzzaxpyris bli_craxpyris
#define bli_czzaxpyris bli_cxaxpyris
#define bli_zzzaxpyris bli_cxaxpyris

// Same-datatype entry points.
#define bli_saxpyris bli_sssaxpyris
#define bli_daxpyris bli_dddaxpyris
#define bli_caxpyris bli_cccaxpyris
#define bli_zaxpyris bli_zzzaxpyris

#endif
// end bli_axpyris.h
// begin bli_axpyjris.h
#ifndef BLIS_AXPYJRIS_H
#define BLIS_AXPYJRIS_H

// axpyjris: y += a * conj(x), on separate real (r) and imaginary (i) parts.
// Kernel prefixes (rx, cx, ro, cr, rc) select which components participate,
// exactly as written in each body below.

#define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}

#define bli_cxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
    (yi) += (ai) * (xr) - (ar) * (xi); \
}

#define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
}

#define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * -(xi); \
}

// x contributes only its real part here, so conjugation has no effect.
#define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyjris bli_rxaxpyjris
#define bli_dssaxpyjris bli_rxaxpyjris
#define bli_cssaxpyjris bli_rxaxpyjris
#define bli_zssaxpyjris bli_rxaxpyjris

#define bli_sdsaxpyjris bli_rxaxpyjris
#define bli_ddsaxpyjris bli_rxaxpyjris
#define bli_cdsaxpyjris bli_rxaxpyjris
#define bli_zdsaxpyjris bli_rxaxpyjris

#define bli_scsaxpyjris bli_rxaxpyjris
#define bli_dcsaxpyjris bli_rxaxpyjris
#define bli_ccsaxpyjris bli_roaxpyjris
#define bli_zcsaxpyjris bli_roaxpyjris

#define bli_szsaxpyjris bli_rxaxpyjris
#define bli_dzsaxpyjris bli_rxaxpyjris
#define bli_czsaxpyjris bli_roaxpyjris
#define bli_zzsaxpyjris bli_roaxpyjris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyjris bli_rxaxpyjris
#define bli_dsdaxpyjris bli_rxaxpyjris
#define bli_csdaxpyjris bli_rxaxpyjris
#define bli_zsdaxpyjris bli_rxaxpyjris

#define bli_sddaxpyjris bli_rxaxpyjris
#define bli_dddaxpyjris bli_rxaxpyjris
#define bli_cddaxpyjris bli_rxaxpyjris
#define bli_zddaxpyjris bli_rxaxpyjris

#define bli_scdaxpyjris bli_rxaxpyjris
#define bli_dcdaxpyjris bli_rxaxpyjris
#define bli_ccdaxpyjris bli_roaxpyjris
#define bli_zcdaxpyjris bli_roaxpyjris

#define bli_szdaxpyjris bli_rxaxpyjris
#define bli_dzdaxpyjris bli_rxaxpyjris
#define bli_czdaxpyjris bli_roaxpyjris
#define bli_zzdaxpyjris bli_roaxpyjris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjris bli_rxaxpyjris
#define bli_dscaxpyjris bli_rxaxpyjris
#define bli_cscaxpyjris bli_rcaxpyjris
#define bli_zscaxpyjris bli_rcaxpyjris

#define bli_sdcaxpyjris bli_rxaxpyjris
#define bli_ddcaxpyjris bli_rxaxpyjris
#define bli_cdcaxpyjris bli_rcaxpyjris
#define bli_zdcaxpyjris bli_rcaxpyjris

#define bli_sccaxpyjris bli_craxpyjris
#define bli_dccaxpyjris bli_craxpyjris
#define bli_cccaxpyjris bli_cxaxpyjris
#define bli_zccaxpyjris bli_cxaxpyjris

#define bli_szcaxpyjris bli_craxpyjris
#define bli_dzcaxpyjris bli_craxpyjris
#define bli_czcaxpyjris bli_cxaxpyjris
#define bli_zzcaxpyjris bli_cxaxpyjris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjris bli_rxaxpyjris
#define bli_dszaxpyjris bli_rxaxpyjris
#define bli_cszaxpyjris bli_rcaxpyjris
#define bli_zszaxpyjris bli_rcaxpyjris

#define bli_sdzaxpyjris bli_rxaxpyjris
#define bli_ddzaxpyjris bli_rxaxpyjris
#define bli_cdzaxpyjris bli_rcaxpyjris
#define bli_zdzaxpyjris bli_rcaxpyjris

#define bli_sczaxpyjris bli_craxpyjris
#define bli_dczaxpyjris bli_craxpyjris
#define bli_cczaxpyjris bli_cxaxpyjris
#define bli_zczaxpyjris bli_cxaxpyjris

#define bli_szzaxpyjris bli_craxpyjris
#define bli_dzzaxpyjris bli_craxpyjris
#define bli_czzaxpyjris bli_cxaxpyjris
#define bli_zzzaxpyjris bli_cxaxpyjris

// Same-datatype entry points.
#define bli_saxpyjris bli_sssaxpyjris
#define bli_daxpyjris bli_dddaxpyjris
#define bli_caxpyjris bli_cccaxpyjris
#define bli_zaxpyjris bli_zzzaxpyjris

#endif
// end bli_axpyjris.h
// begin bli_axmyris.h
#ifndef BLIS_AXMYRIS_H
#define BLIS_AXMYRIS_H

// axmyris: y -= a * x, on separate real (r) and imaginary (i) parts.
// The s/d forms touch only the real part; c/z perform the full complex
// product; sc/dz scale a complex x by the real part of a only.

#define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}

#define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}

#define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}

#define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}

#define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}

#define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}

#endif
// end bli_axmyris.h
// begin bli_conjris.h
#ifndef BLIS_CONJRIS_H
#define BLIS_CONJRIS_H

// conjris: conjugate x in place. The real (s/d) forms are empty statements;
// the complex (c/z) forms negate the imaginary part.

#define bli_sconjris( xr, xi ) \
{ \
    ; \
}

#define bli_dconjris( xr, xi ) \
{ \
    ; \
}

#define bli_cconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}

#define bli_zconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}

#endif
// end bli_conjris.h
// begin bli_copyris.h
#ifndef BLIS_COPYRIS_H
#define BLIS_COPYRIS_H

// copyris: b := a. The s/d forms copy only the real part; c/z copy both.

#define bli_scopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}

#define bli_dcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}

#define bli_ccopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}

#define bli_zcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}

// Two-character forms: the first char is the type of a, the second the type
// of b. When a is real, a literal zero is substituted for its imaginary part.
#define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi )
#define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi )
#define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )
#define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )

#define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi )
#define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi )
#define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )
#define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )

#define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi )
#define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi )
#define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )
#define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )

#define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi )
#define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi )
#define bli_czcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )
#define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )

#endif
// end bli_copyris.h
// begin bli_copyjris.h
#ifndef BLIS_COPYJRIS_H
#define BLIS_COPYJRIS_H

// copyjris: b := conj(a), implemented by forwarding to copyris with the
// imaginary part negated.

#define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) )
#define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) )
#define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) )
#define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) )

// Two-character forms: the first char is the type of a, the second the type
// of b. When a is real, a literal zero is substituted for its imaginary part.
#define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi )
#define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi )
#define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )
#define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )

#define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi )
#define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi )
#define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )
#define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )

#define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi )
#define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi )
#define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )
#define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )

#define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi )
#define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi )
#define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )
#define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )

#endif
// end bli_copyjris.h
// begin bli_copycjris.h
#ifndef BLIS_COPYCJRIS_H
#define BLIS_COPYCJRIS_H

// copycjris: y := x, or y := conj(x) when bli_is_conj( conj ) holds.
// The s/d/i forms ignore conj and forward to the plain copy.

#define bli_scopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_scopyris( (xr), (xi), (yr), (yi) ); \
}

#define bli_dcopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_dcopyris( (xr), (xi), (yr), (yi) ); \
}

#define bli_ccopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                 : (xi) ); \
}

#define bli_zcopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                 : (xi) ); \
}

// NOTE(review): bli_icopyris is not defined in this part of the file;
// presumably it is provided elsewhere -- confirm.
#define bli_icopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_icopyris( (xr), (xi), (yr), (yi) ); \
}

#endif
// end bli_copycjris.h
// begin bli_eqris.h
#ifndef BLIS_EQRIS_H
#define BLIS_EQRIS_H

// eqris (passed by value): expression macros that compare a and b. The
// s/d/i forms compare real parts only; c/z compare both parts.

#define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) )

// eq1ris: compare against the constant 1.

#define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F )
#define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 )
#define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F )
#define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 )
#define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 )

// eq0ris: compare against the constant 0.

#define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F )
#define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 )
#define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F )
#define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 )
#define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 )

// eqm1ris: compare against the constant -1.

#define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F )
#define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 )
#define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F )
#define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 )
#define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 )

#endif
// end bli_eqris.h
// begin bli_invertris.h
#ifndef BLIS_INVERTRIS_H
#define BLIS_INVERTRIS_H

// invertris: x := 1 / x, in place.

// Single-precision real form; xi is unused.
#define bli_sinvertris( xr, xi ) \
{ \
    (xr) = 1.0F / (xr); \
}
#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2ris bli_rxscal2ris #define bli_dssscal2ris bli_rxscal2ris #define bli_cssscal2ris bli_rxscal2ris #define bli_zssscal2ris bli_rxscal2ris #define bli_sdsscal2ris bli_rxscal2ris #define bli_ddsscal2ris bli_rxscal2ris #define bli_cdsscal2ris bli_rxscal2ris #define bli_zdsscal2ris bli_rxscal2ris #define bli_scsscal2ris bli_rxscal2ris #define bli_dcsscal2ris bli_rxscal2ris #define bli_ccsscal2ris bli_roscal2ris #define bli_zcsscal2ris bli_roscal2ris #define bli_szsscal2ris bli_rxscal2ris #define bli_dzsscal2ris bli_rxscal2ris #define bli_czsscal2ris bli_roscal2ris #define bli_zzsscal2ris bli_roscal2ris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2ris bli_rxscal2ris #define bli_dsdscal2ris bli_rxscal2ris #define bli_csdscal2ris bli_rxscal2ris #define bli_zsdscal2ris bli_rxscal2ris #define bli_sddscal2ris bli_rxscal2ris #define bli_dddscal2ris bli_rxscal2ris #define bli_cddscal2ris bli_rxscal2ris #define bli_zddscal2ris bli_rxscal2ris #define bli_scdscal2ris bli_rxscal2ris #define bli_dcdscal2ris bli_rxscal2ris #define bli_ccdscal2ris bli_roscal2ris #define bli_zcdscal2ris bli_roscal2ris #define bli_szdscal2ris bli_rxscal2ris #define bli_dzdscal2ris bli_rxscal2ris #define bli_czdscal2ris bli_roscal2ris #define bli_zzdscal2ris bli_roscal2ris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2ris bli_rxscal2ris #define bli_dscscal2ris bli_rxscal2ris #define bli_cscscal2ris bli_rcscal2ris #define bli_zscscal2ris bli_rcscal2ris #define bli_sdcscal2ris bli_rxscal2ris #define bli_ddcscal2ris bli_rxscal2ris #define bli_cdcscal2ris bli_rcscal2ris #define bli_zdcscal2ris bli_rcscal2ris #define bli_sccscal2ris bli_crscal2ris #define bli_dccscal2ris bli_crscal2ris #define bli_cccscal2ris bli_cxscal2ris #define bli_zccscal2ris bli_cxscal2ris #define bli_szcscal2ris bli_crscal2ris 
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Mixed-datatype scal2jris alias table. Each three-character prefix (types of
// a, x, y) forwards its six arguments unchanged to one of the
// rx/cx/ro/cr/rc scal2jris kernels. These aliases are function-like macros,
// so they only expand when followed by an argument list.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = (xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyris bli_rxxpbyris #define bli_dssxpbyris bli_rxxpbyris #define bli_cssxpbyris bli_rxxpbyris #define bli_zssxpbyris bli_rxxpbyris #define bli_sdsxpbyris bli_rxxpbyris #define bli_ddsxpbyris bli_rxxpbyris #define bli_cdsxpbyris bli_rxxpbyris #define bli_zdsxpbyris bli_rxxpbyris #define bli_scsxpbyris bli_rxxpbyris #define bli_dcsxpbyris bli_rxxpbyris #define bli_ccsxpbyris bli_rxxpbyris #define bli_zcsxpbyris bli_rxxpbyris #define bli_szsxpbyris bli_rxxpbyris #define bli_dzsxpbyris bli_rxxpbyris #define bli_czsxpbyris bli_rxxpbyris #define bli_zzsxpbyris bli_rxxpbyris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyris bli_rxxpbyris #define bli_dsdxpbyris bli_rxxpbyris #define bli_csdxpbyris bli_rxxpbyris #define bli_zsdxpbyris bli_rxxpbyris #define bli_sddxpbyris bli_rxxpbyris #define bli_dddxpbyris bli_rxxpbyris #define bli_cddxpbyris bli_rxxpbyris #define bli_zddxpbyris bli_rxxpbyris #define bli_scdxpbyris bli_rxxpbyris #define bli_dcdxpbyris bli_rxxpbyris #define bli_ccdxpbyris bli_rxxpbyris #define bli_zcdxpbyris bli_rxxpbyris #define bli_szdxpbyris bli_rxxpbyris #define bli_dzdxpbyris bli_rxxpbyris #define bli_czdxpbyris bli_rxxpbyris #define bli_zzdxpbyris bli_rxxpbyris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyris bli_rxxpbyris #define bli_dscxpbyris 
bli_rxxpbyris #define bli_cscxpbyris bli_crxpbyris #define bli_zscxpbyris bli_crxpbyris #define bli_sdcxpbyris bli_rxxpbyris #define bli_ddcxpbyris bli_rxxpbyris #define bli_cdcxpbyris bli_crxpbyris #define bli_zdcxpbyris bli_crxpbyris #define bli_sccxpbyris bli_cxxpbyris #define bli_dccxpbyris bli_cxxpbyris #define bli_cccxpbyris bli_cxxpbyris #define bli_zccxpbyris bli_cxxpbyris #define bli_szcxpbyris bli_cxxpbyris #define bli_dzcxpbyris bli_cxxpbyris #define bli_czcxpbyris bli_cxxpbyris #define bli_zzcxpbyris bli_cxxpbyris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyris bli_rxxpbyris #define bli_dszxpbyris bli_rxxpbyris #define bli_cszxpbyris bli_crxpbyris #define bli_zszxpbyris bli_crxpbyris #define bli_sdzxpbyris bli_rxxpbyris #define bli_ddzxpbyris bli_rxxpbyris #define bli_cdzxpbyris bli_crxpbyris #define bli_zdzxpbyris bli_crxpbyris #define bli_sczxpbyris bli_cxxpbyris #define bli_dczxpbyris bli_cxxpbyris #define bli_cczxpbyris bli_cxxpbyris #define bli_zczxpbyris bli_cxxpbyris #define bli_szzxpbyris bli_cxxpbyris #define bli_dzzxpbyris bli_cxxpbyris #define bli_czzxpbyris bli_cxxpbyris #define bli_zzzxpbyris bli_cxxpbyris #define bli_sxpbyris bli_sssxpbyris #define bli_dxpbyris bli_dddxpbyris #define bli_cxpbyris bli_cccxpbyris #define bli_zxpbyris bli_zzzxpbyris #endif // end bli_xpbyris.h // begin bli_xpbyjris.h #ifndef BLIS_XPBYJRIS_H #define BLIS_XPBYJRIS_H // xpbyjris #define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type 
of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjris bli_rxxpbyjris #define bli_dssxpbyjris bli_rxxpbyjris #define bli_cssxpbyjris bli_rxxpbyjris #define bli_zssxpbyjris bli_rxxpbyjris #define bli_sdsxpbyjris bli_rxxpbyjris #define bli_ddsxpbyjris bli_rxxpbyjris #define bli_cdsxpbyjris bli_rxxpbyjris #define bli_zdsxpbyjris bli_rxxpbyjris #define bli_scsxpbyjris bli_rxxpbyjris #define bli_dcsxpbyjris bli_rxxpbyjris #define bli_ccsxpbyjris bli_rxxpbyjris #define bli_zcsxpbyjris bli_rxxpbyjris #define bli_szsxpbyjris bli_rxxpbyjris #define bli_dzsxpbyjris bli_rxxpbyjris #define bli_czsxpbyjris bli_rxxpbyjris #define bli_zzsxpbyjris bli_rxxpbyjris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjris bli_rxxpbyjris #define bli_dsdxpbyjris bli_rxxpbyjris #define bli_csdxpbyjris bli_rxxpbyjris #define bli_zsdxpbyjris bli_rxxpbyjris #define bli_sddxpbyjris bli_rxxpbyjris #define bli_dddxpbyjris bli_rxxpbyjris #define bli_cddxpbyjris bli_rxxpbyjris #define bli_zddxpbyjris bli_rxxpbyjris #define bli_scdxpbyjris bli_rxxpbyjris #define bli_dcdxpbyjris bli_rxxpbyjris #define bli_ccdxpbyjris bli_rxxpbyjris #define bli_zcdxpbyjris bli_rxxpbyjris #define bli_szdxpbyjris bli_rxxpbyjris #define bli_dzdxpbyjris bli_rxxpbyjris #define bli_czdxpbyjris bli_rxxpbyjris #define bli_zzdxpbyjris bli_rxxpbyjris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjris bli_rxxpbyjris #define bli_dscxpbyjris bli_rxxpbyjris #define bli_cscxpbyjris bli_crxpbyjris #define bli_zscxpbyjris bli_crxpbyjris #define bli_sdcxpbyjris bli_rxxpbyjris #define bli_ddcxpbyjris bli_rxxpbyjris #define bli_cdcxpbyjris bli_crxpbyjris #define bli_zdcxpbyjris bli_crxpbyjris #define bli_sccxpbyjris bli_cxxpbyjris #define bli_dccxpbyjris bli_cxxpbyjris #define bli_cccxpbyjris 
bli_cxxpbyjris
#define bli_zccxpbyjris bli_cxxpbyjris
#define bli_szcxpbyjris bli_cxxpbyjris
#define bli_dzcxpbyjris bli_cxxpbyjris
#define bli_czcxpbyjris bli_cxxpbyjris
#define bli_zzcxpbyjris bli_cxxpbyjris

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbyjris bli_rxxpbyjris
#define bli_dszxpbyjris bli_rxxpbyjris
#define bli_cszxpbyjris bli_crxpbyjris
#define bli_zszxpbyjris bli_crxpbyjris
#define bli_sdzxpbyjris bli_rxxpbyjris
#define bli_ddzxpbyjris bli_rxxpbyjris
#define bli_cdzxpbyjris bli_crxpbyjris
#define bli_zdzxpbyjris bli_crxpbyjris
#define bli_sczxpbyjris bli_cxxpbyjris
#define bli_dczxpbyjris bli_cxxpbyjris
#define bli_cczxpbyjris bli_cxxpbyjris
#define bli_zczxpbyjris bli_cxxpbyjris
#define bli_szzxpbyjris bli_cxxpbyjris
#define bli_dzzxpbyjris bli_cxxpbyjris
#define bli_czzxpbyjris bli_cxxpbyjris
#define bli_zzzxpbyjris bli_cxxpbyjris

// Homogeneous-datatype shorthands: one type char stands for all three operands.
#define bli_sxpbyjris bli_sssxpbyjris
#define bli_dxpbyjris bli_dddxpbyjris
#define bli_cxpbyjris bli_cccxpbyjris
#define bli_zxpbyjris bli_zzzxpbyjris

#endif
// end bli_xpbyjris.h

// Inlined scalar macros in loops
// begin bli_scal2ris_mxn.h

#ifndef BLIS_SCAL2RIS_MXN_H
#define BLIS_SCAL2RIS_MXN_H

// scal2ris_mxn
//
// Apply bli_?scal2ris (or bli_?scal2jris when conjx requests conjugation) to
// every element of an m x n matrix x, writing into y.
// NOTE(review): presumably this computes y := alpha * conjx(x); the scalar
// macros are defined elsewhere -- confirm against their definitions.
//
// Storage, as established by the pointer arithmetic below:
// - x is read as interleaved complex: the imaginary part sits one real
//   element after the real part, and rs_x/cs_x are doubled to step over
//   real elements.
// - y is written with its real and imaginary parts in separate locations:
//   the imaginary part of an element lives is_y real elements after its
//   real part.
// - rs_y is accepted but never read; rows of y are stepped with a fixed
//   stride of 1 real element.
// - alpha points to a single interleaved complex scalar.

BLIS_INLINE void bli_cscal2ris_mxn
     (
       const conj_t conjx,
       const dim_t m,
       const dim_t n,
       scomplex* restrict alpha,
       scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
       scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y
     )
{
	// The trailing backslashes on the declarations below are plain line
	// splices; they have no effect on the code.
	float* restrict alpha_r = ( float* )alpha; \
	float* restrict alpha_i = ( float* )alpha + 1; \
	float* restrict x_r = ( float* )x; \
	float* restrict x_i = ( float* )x + 1; \
	float* restrict y_r = ( float* )y; \
	float* restrict y_i = ( float* )y + is_y; \
	const dim_t incx2 = 2*rs_x; \
	const dim_t ldx2 = 2*cs_x; \
\
	if ( bli_is_conj( conjx ) )
	{
		// Conjugating variant: identical traversal, bli_cscal2jris per element.
		for ( dim_t j = 0; j < n; ++j )
		for ( dim_t i = 0; i < m; ++i )
		{
			float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2;
			float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2;
			float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y;
			float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y;

			bli_cscal2jris
			(
			  *alpha_r,
			  *alpha_i,
			  *chi11_r,
			  *chi11_i,
			  *psi11_r,
			  *psi11_i
			);
		}
	}
	else
	{
		// Non-conjugating variant: bli_cscal2ris per element.
		for ( dim_t j = 0; j < n; ++j )
		for ( dim_t i = 0; i < m; ++i )
		{
			float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2;
			float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2;
			float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y;
			float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y;

			bli_cscal2ris
			(
			  *alpha_r,
			  *alpha_i,
			  *chi11_r,
			  *chi11_i,
			  *psi11_r,
			  *psi11_i
			);
		}
	}
}

// Double-precision counterpart of bli_cscal2ris_mxn: same traversal and
// storage conventions, with double components and the bli_zscal2ris /
// bli_zscal2jris scalar macros. rs_y is likewise never read.
BLIS_INLINE void bli_zscal2ris_mxn
     (
       const conj_t conjx,
       const dim_t m,
       const dim_t n,
       dcomplex* restrict alpha,
       dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
       dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y
     )
{
	// The trailing backslashes on the declarations below are plain line
	// splices; they have no effect on the code.
	double* restrict alpha_r = ( double* )alpha; \
	double* restrict alpha_i = ( double* )alpha + 1; \
	double* restrict x_r = ( double* )x; \
	double* restrict x_i = ( double* )x + 1; \
	double* restrict y_r = ( double* )y; \
	double* restrict y_i = ( double* )y + is_y; \
	const dim_t incx2 = 2*rs_x; \
	const dim_t ldx2 = 2*cs_x; \
\
	if ( bli_is_conj( conjx ) )
	{
		// Conjugating variant: identical traversal, bli_zscal2jris per element.
		for ( dim_t j = 0; j < n; ++j )
		for ( dim_t i = 0; i < m; ++i )
		{
			double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2;
			double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2;
			double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y;
			double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y;

			bli_zscal2jris
			(
			  *alpha_r,
			  *alpha_i,
			  *chi11_r,
			  *chi11_i,
			  *psi11_r,
			  *psi11_i
			);
		}
	}
	else
	{
		// Non-conjugating variant: bli_zscal2ris per element.
		for ( dim_t j = 0; j < n; ++j )
		for ( dim_t i = 0; i < m; ++i )
		{
			double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2;
			double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2;
			double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y;
			double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y;

			bli_zscal2ris
			(
			  *alpha_r,
			  *alpha_i,
			  *chi11_r,
			  *chi11_i,
			  *psi11_r,
			  *psi11_i
			);
		}
	}
}

#endif
// end bli_scal2ris_mxn.h
// begin bli_scalris_mxn_uplo.h

#ifndef
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) ) #define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) ) #define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zcabsq2s( x, a ) 
bli_zcsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) ) #define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) ) #define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a ) #define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a ) #define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a ) #define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a ) #endif // end bli_absq2s.h // begin bli_abval2s.h #ifndef BLIS_ABVAL2S_H #define BLIS_ABVAL2S_H // abval2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabval2s( x, a ) 
bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) ) #define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) ) #define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) ) #define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) ) #define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) ) #define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) ) #define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) ) #define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) ) #define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) ) #define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) ) #define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) ) #define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) ) #define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) ) #define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) ) #define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) ) #define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabval2s( x, a ) bli_ssabval2s( x, a ) #define bli_dabval2s( x, a ) bli_ddabval2s( x, a ) #define bli_cabval2s( x, a ) bli_ccabval2s( x, a ) #define bli_zabval2s( x, a ) bli_zzabval2s( x, a ) #endif // end bli_abval2s.h // begin bli_adds.h #ifndef BLIS_ADDS_H #define BLIS_ADDS_H // adds // Notes: // - The first char 
encodes the type of a. // - The second char encodes the type of y. #define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) { (y) += (a); } #define bli_dcadds( a, y ) { (y) += (a); } #define bli_ccadds( a, y ) { (y) += (a); } #define bli_zcadds( a, y ) { (y) += (a); } #define bli_szadds( a, y ) { (y) += (a); } #define bli_dzadds( a, y ) { (y) += (a); } #define bli_czadds( a, y ) { (y) += (a); } #define 
bli_zzadds( a, y ) { (y) += (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadds( a, y ) bli_ssadds( a, y ) #define bli_dadds( a, y ) bli_ddadds( a, y ) #define bli_cadds( a, y ) bli_ccadds( a, y ) #define bli_zadds( a, y ) bli_zzadds( a, y ) #endif // end bli_adds.h // begin bli_addjs.h #ifndef BLIS_ADDJS_H #define BLIS_ADDJS_H // addjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzaddjs( a, y ) 
bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) { (y) += (a); } #define bli_dcaddjs( a, y ) { (y) += (a); } #define bli_ccaddjs( a, y ) { (y) += conjf(a); } #define bli_zcaddjs( a, y ) { (y) += conj (a); } #define bli_szaddjs( a, y ) { (y) += (a); } #define bli_dzaddjs( a, y ) { (y) += (a); } #define bli_czaddjs( a, y ) { (y) += conjf(a); } #define bli_zzaddjs( a, y ) { (y) += conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saddjs( a, y ) bli_ssaddjs( a, y ) #define bli_daddjs( a, y ) bli_ddaddjs( a, y ) #define bli_caddjs( a, y ) bli_ccaddjs( a, y ) #define bli_zaddjs( a, y ) bli_zzaddjs( a, y ) #endif // end bli_addjs.h // begin bli_add3s.h #ifndef BLIS_ADD3S_H #define BLIS_ADD3S_H // add3s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of b. // - The third char encodes the type of c. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), 
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )
// (??d) variants, continued: c is read through the d accessors and every
// macro in this group forwards to bli_dadd3ris (defined elsewhere).
// Naming used throughout: in bli_XYZadd3s the letters X, Y and Z select the
// accessor family (s, d, c or z) applied to a, b and c respectively.
#define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )

// Complex-destination variants. The implementation is chosen at
// preprocessing time by BLIS_ENABLE_C99_COMPLEX:
//   - not defined: each macro passes the real/imag accessors of a, b and c
//     to a bli_?add3ris helper (defined elsewhere);
//   - defined:     each macro expands to the block { (c) = (a) + (b); }.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (abc) = (??c) ------------------------------------------------------------
// Helper selection in this group: bli_sadd3ris when both a and b use the
// s/d accessors, bli_cadd3ris as soon as either one uses the c/z accessors.
// NOTE(review): presumably bli_sadd3ris only writes the real part of c, which
// would differ from the C99 branch below (it assigns all of c) -- confirm
// against the bli_?add3ris definitions.
#define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )

// -- (abc) = (??z) ------------------------------------------------------------
// Same selection rule with the double-precision helpers: bli_dadd3ris when
// both a and b use the s/d accessors, bli_zadd3ris otherwise.
#define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// In this branch every variant has the same expansion: a brace-enclosed
// block that assigns (a) + (b) to (c); any type conversion is left to the
// compiler. Because the expansion is a block rather than an expression,
// "macro(...);" is a block followed by an empty statement, so it cannot be
// used as the unbraced body of an if that has an else.

// -- (abc) = (??c) ------------------------------------------------------------
#define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); }

// -- (abc) = (??z) ------------------------------------------------------------
#define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter shorthands: each forwards to the variant in which a, b and c
// all carry that same type letter.
#define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c )
#define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c )
#define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c )
#define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c )

#endif
// end bli_add3s.h
// begin bli_axpbys.h

#ifndef BLIS_AXPBYS_H
#define BLIS_AXPBYS_H

// axpbys

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), 
bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), 
bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), 
bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )

// -- (axby) = (???z) ----------------------------------------------------------
// Every macro in this group reads y through bli_zreal()/bli_zimag() and
// forwards the real and imaginary parts of a, x, b and y, in that order, to
// bli_cxaxpbyris() (defined outside this section). The four leading letters of
// each macro name select the accessor family (bli_s*, bli_d*, bli_c*, bli_z*)
// applied to a, x, b and y respectively; nothing else differs between entries.
// NOTE(review): s/d/c/z are presumably the BLAS-style type characters (single
// real, double real, single complex, double complex) -- confirm against the
// accessor definitions.

#define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Alternate definitions of the same macro names. Going by the comment on the
// #else above, this branch is the one used when BLIS_ENABLE_C99_COMPLEX is
// defined (the opening conditional is further up in the file -- confirm there).
//
// In this branch every type combination expands to the identical statement
// block, y = a*x + b*y, written directly with the * and + operators instead
// of passing separate real/imaginary parts to bli_cxaxpbyris().
//
// NOTE: each macro expands to a braced block rather than an expression, and y
// appears twice in the expansion, so an argument with side effects is
// evaluated more than once.

// -- (axby) = (???c) ----------------------------------------------------------

#define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ccscaxpbys(
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); }
#define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }

// -- (axby) = (???z) ----------------------------------------------------------
// Same expansion as the (???c) group above: the statement block
// y = a*x + b*y, with y evaluated twice. Only the fourth letter of the macro
// name (the type letter for y) changes from c to z; the macro bodies
// themselves carry no type information.

#define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Short aliases for the homogeneous case, where a, x, b and y all share one
// type letter: bli_?axpbys forwards its arguments unchanged to bli_????axpbys.
// The s and d targets (bli_ssssaxpbys, bli_ddddaxpbys) are not defined in the
// complex-only groups above; presumably they come from the (???s)/(???d)
// groups earlier in this header.

#define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y )
#define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y )
#define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y )
#define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y )

#endif
// end bli_axpbys.h
// begin bli_axpbyjs.h

#ifndef BLIS_AXPBYJS_H
#define BLIS_AXPBYJS_H

// axpbyjs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of b.
// - The fourth char encodes the type of y.
//
// Each macro below forwards the real and imaginary parts of a, x, b and y, in
// that order, to bli_rxaxpbyjris() (defined elsewhere). The four leading
// letters of the macro name pick the bli_{s,d,c,z}real / bli_{s,d,c,z}imag
// accessors applied to a, x, b and y respectively.
// NOTE(review): the "j" presumably marks the conjugated-x variant of axpbys --
// confirm against the definition of bli_rxaxpbyjris.

// -- (axby) = (???s) ----------------------------------------------------------

#define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y),
bli_simag(y) ) #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (axby) = (???d) ----------------------------------------------------------
// Same forwarding pattern as the (???s) group that ends just above: the real
// and imaginary parts of a, x, b and y go to bli_rxaxpbyjris(), but y is now
// read through bli_dreal()/bli_dimag() instead of bli_sreal()/bli_simag().

#define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x),
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) /* Variants whose y letter is c: when BLIS_ENABLE_C99_COMPLEX is NOT defined they expand to bli_cxaxpbyjris (not bli_rxaxpbyjris, which the d-valued-y variants just before this point use), again passing the bli_?real / bli_?imag parts of a, x, b and y selected by the four type letters. */ #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) /* Variants whose y letter is z: same shape as the c-valued-y macros immediately before this point (they also expand to bli_cxaxpbyjris), except that y is read through bli_zreal / bli_zimag instead of bli_creal / bli_cimag. */ // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) /* C99-complex branch (BLIS_ENABLE_C99_COMPLEX defined): the same macro names are redefined as a direct in-place expression, y = a * x + b * y, with every argument parenthesized. x is used as-is when its type letter is s or d, wrapped in conjf() when it is c, and wrapped in conj() when it is z. NOTE(review): each body is a bare brace block, so a call followed by ';' in an unbraced if/else arm ("if (t) bli_...( a, x, b, y ); else ...") would not parse - presumably callers never use that form; confirm before changing call sites. */ #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) #define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) #define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) #define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) #endif // end bli_axpbyjs.h // begin bli_axpys.h #ifndef BLIS_AXPYS_H #define BLIS_AXPYS_H // axpys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), 
bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } #define 
bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) #define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) #define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) #define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) #endif // end bli_axpys.h // begin bli_axpyjs.h #ifndef BLIS_AXPYJS_H #define BLIS_AXPYJS_H // axpyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( 
bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), 
bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) #define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpyjs( a, x, y ) 
bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y ) #define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y ) #define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y ) #define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y ) #endif // end bli_axpyjs.h // begin bli_axmys.h #ifndef BLIS_AXMYS_H #define BLIS_AXMYS_H // axmys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// bli_<a><x><y>axmys( a, x, y ): one macro per combination of operand types.
// In the C99-complex branch further down, the macros for y of type c or z
// expand to { (y) -= (a) * (x); }. Everywhere else each operand is split with
// bli_?real()/bli_?imag() and the six components are forwarded to a
// bli_*axmyris macro.
// NOTE(review): the bli_*axmyris macros are defined outside this section;
// presumably they perform the same y -= a * x update on separate real and
// imaginary components -- confirm against their definitions.

// -- (axy) = (??s) ------------------------------------------------------------
// y has type s: every (a,x) combination forwards to bli_saxmyris.

#define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------
// y has type d: every (a,x) combination forwards to bli_daxmyris.

#define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------
// y has type c. The helper depends on the types of a and x:
//   - a and x both s or d     -> bli_saxmyris
//   - a is s or d, x is c or z -> bli_scaxmyris
//   - a is c or z              -> bli_caxmyris

#define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------
// y has type z. Same selection rule as the (??c) group, with the helpers
// bli_daxmyris / bli_dzaxmyris / bli_zaxmyris.

#define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex enabled, the update is written directly as an expression
// on the operands, so all type combinations share one body.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); }
// Remaining (??z) axmys definitions of the C99-complex branch: y -= a * x.
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a, x and y all have the same type.
#define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y )
#define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y )
#define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y )
#define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y )

#endif
// end bli_axmys.h
// begin bli_conjs.h


#ifndef BLIS_CONJS_H
#define BLIS_CONJS_H

// conjs
// Notes:
// - bli_?conjs( x ) takes a single operand x; the leading char is its type.
// - In the C99-complex branch the c and z versions overwrite x with its
//   complex conjugate (conjf() for c, conj() for z).
// - All other versions pass the real and imaginary parts of x to
//   bli_?conjris. NOTE(review): bli_?conjris is defined elsewhere; presumably
//   it conjugates in place on the separate components -- confirm.

#define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) )
#define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) )
#define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_cconjs( x ) { (x) = conjf(x); }
#define bli_zconjs( x ) { (x) = conj (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

#endif
// end bli_conjs.h
// begin bli_copys.h


#ifndef BLIS_COPYS_H
#define BLIS_COPYS_H

// copys

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// bli_<x><y>copys( x, y ): each macro passes the real and imaginary parts of
// x and y to the bli_?copyris macro selected by the type of y (the second
// char), so the destination type alone decides which helper is used.
// NOTE(review): bli_?copyris is defined elsewhere; presumably it assigns the
// components of x to those of y -- confirm.

// y has type s.
#define bli_sscopys( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopys( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopys( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopys( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y has type d.
#define bli_sdcopys( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopys( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopys( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopys( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// y has type c.
// NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero.
#define bli_sccopys( x, y ) bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopys( x, y ) bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopys( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopys( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
// copys with y of type z: all four source types forward to bli_zcopyris.
#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer version: plain assignment after a cast to gint_t.
#define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); }

// Single-character shorthands: x and y have the same type.
#define bli_scopys( x, y ) bli_sscopys( x, y )
#define bli_dcopys( x, y ) bli_ddcopys( x, y )
#define bli_ccopys( x, y ) bli_cccopys( x, y )
#define bli_zcopys( x, y ) bli_zzcopys( x, y )
#define bli_icopys( x, y ) bli_iicopys( x, y )

#endif
// end bli_copys.h
// begin bli_copyjs.h


#ifndef BLIS_COPYJS_H
#define BLIS_COPYJS_H

// copyjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - In the C99-complex branch below, a c or z source is copied as its complex
//   conjugate (conjf() for c, conj() for z) and an s or d source is copied
//   unchanged. The other versions forward components to bli_?copyjris, which
//   is chosen by the type of y.
//   NOTE(review): bli_?copyjris is defined elsewhere; presumably it performs
//   the same conjugating copy on separate components -- confirm.

// y has type s.
#define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y has type d.
#define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// y has type c.
#define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// y has type z.
#define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// y has type c. The conjugation function follows the type of x, not of y.
#define bli_sccopyjs( x, y ) { (y) = (x); }
#define bli_dccopyjs( x, y ) { (y) = (x); }
#define bli_cccopyjs( x, y ) { (y) = conjf(x); }
#define bli_zccopyjs( x, y ) { (y) = conj (x); }

// y has type z.
#define bli_szcopyjs( x, y ) { (y) = (x); }
#define bli_dzcopyjs( x, y ) { (y) = (x); }
#define bli_czcopyjs( x, y ) { (y) = conjf(x); }
#define bli_zzcopyjs( x, y ) { (y) = conj (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Integer version: plain assignment after a cast to gint_t (no conjugation).
#define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); }

// Single-character shorthands: x and y have the same type.
#define bli_scopyjs( x, y ) bli_sscopyjs( x, y )
#define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y )
#define bli_ccopyjs( x, y ) bli_cccopyjs( x, y )
#define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y )
#define bli_icopyjs( x, y ) bli_iicopyjs( x, y )

#endif
// end bli_copyjs.h
// begin bli_copycjs.h


#ifndef BLIS_COPYCJS_H
#define BLIS_COPYCJS_H

// copycjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// bli_<x><y>copycjs( conjx, x, y ): copy with a caller-supplied conjugation
// flag. Outside the C99-complex branch, each macro passes conjx plus the real
// and imaginary parts of x and y to the bli_?copycjris macro selected by the
// type of y.
// NOTE(review): bli_?copycjris is defined elsewhere; presumably it copies x to
// y, conjugating when conjx requests it -- confirm.

// y has type s.
#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y has type d.
#define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// y has type c.
#define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// y has type z.
#define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// For an s source, conjx is ignored and x is assigned unchanged.
#define bli_sccopycjs( conjx, x, y ) { (y) = (x); }
// C99-complex copycjs definitions. For an s or d source, conjx is ignored
// and x is assigned unchanged. For a c or z source, x is conjugated (conjf()
// for c, conj() for z) only when bli_is_conj( conjx ) is true.
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); }
#define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#define bli_szcopycjs( conjx, x, y ) { (y) = (x); }
#define bli_dzcopycjs( conjx, x, y ) { (y) = (x); }
#define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Integer version: conjx is ignored; plain assignment after a cast to gint_t.
#define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); }

// Single-character shorthands: x and y have the same type.
#define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y )
#define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y )
#define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y )
#define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y )
#define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y )

#endif
// end bli_copycjs.h
// begin bli_copynzs.h


#ifndef BLIS_COPYNZS_H
#define BLIS_COPYNZS_H

// copynzs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// bli_<x><y>copynzs( x, y ): these forward to the same bli_?copyris macros
// as the copys family. The one difference is the s/d-source, c/z-destination
// cases, which use the s/d helper instead of the c/z one (see the NOTEs
// below).

// y has type s.
#define bli_sscopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopynzs( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopynzs( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y has type d.
#define bli_sdcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopynzs( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopynzs( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// y has type c.
// NOTE: Use of scopyris() is so we don't touch the imaginary part of y.
#define bli_sccopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopynzs( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopynzs( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
// copynzs with y of type z: s and d sources use bli_dcopyris, c and z sources
// use bli_zcopyris.
#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer version: plain assignment after a cast to gint_t.
#define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); }

// Single-character shorthands: x and y have the same type.
#define bli_scopynzs( x, y ) bli_sscopynzs( x, y )
#define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y )
#define bli_ccopynzs( x, y ) bli_cccopynzs( x, y )
#define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y )
#define bli_icopynzs( x, y ) bli_iicopynzs( x, y )

#endif
// end bli_copynzs.h
// begin bli_copyjnzs.h


#ifndef BLIS_COPYJNZS_H
#define BLIS_COPYJNZS_H

// copyjnzs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - Each macro passes the real and imaginary parts of x and y to a
//   bli_?copyjris macro.

// y has type s.
#define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y has type d.
#define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we
// don't touch the imaginary part of y.
// copyjnzs with y of type c: s and d sources use bli_scopyjris, c and z
// sources use bli_ccopyjris.
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// y has type z: s and d sources use bli_dcopyjris, c and z sources use
// bli_zcopyjris.
// NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we
// don't touch the imaginary part of y.
#define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer version: plain assignment after a cast to gint_t.
#define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); }

// Single-character shorthands: x and y have the same type.
#define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y )
#define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y )
#define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y )
#define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y )
#define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y )

#endif
// end bli_copyjnzs.h
// begin bli_dots.h


#ifndef BLIS_DOTS_H
#define BLIS_DOTS_H

// dots

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - The third char encodes the type of rho.
// Every bli_<x><y><rho>dots( x, y, a ) macro forwards to the axpys macro with
// the same three type characters and the same argument order, so x lands in
// the first (scalar) slot of axpys, y in the second, and the accumulator a
// (rho in the notes) in the third.
// NOTE(review): the axpys macros are defined elsewhere; presumably the net
// effect is a += x * y -- confirm against bli_axpys.h.

// -- rho has type s --

#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a )
#define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a )
#define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a )
#define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a )

#define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a )
#define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a )
#define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a )
#define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a )

#define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a )
#define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a )
#define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a )
#define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a )

#define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a )
#define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a )
#define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a )
#define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a )

// -- rho has type d --

#define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a )
#define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a )
#define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a )
#define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a )

#define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a )
#define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a )
#define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a )
#define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a )

#define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a )
#define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a )
#define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a )
#define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a )

#define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a )
#define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a )
#define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a )
#define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a )

// -- rho has type c --

#define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a )
#define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a )
#define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a )
#define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a )

#define bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a )
#define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a )
#define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a )
#define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a )

#define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a )
#define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a )
#define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a )
#define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a )

#define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a )
#define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a )
#define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a )
#define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a )

// -- rho has type z --

#define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a )
#define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a )
#define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a )
#define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a )

#define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a )
#define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a )
#define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a )
#define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a )

#define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a )
#define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a )
#define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a )
#define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a )

#define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a )
#define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a )
#define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a )
#define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a )

// Single-character shorthands: x, y and rho all have the same type.
#define bli_sdots( x, y, a ) bli_sssdots( x, y, a )
#define bli_ddots( x, y, a ) bli_ddddots( x, y, a )
#define bli_cdots( x, y, a ) bli_cccdots( x, y, a )
#define bli_zdots( x, y, a ) bli_zzzdots( x, y, a )

#endif
// end bli_dots.h
// begin bli_dotjs.h


#ifndef BLIS_DOTJS_H
#define BLIS_DOTJS_H

// dotjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - The third char encodes the type of rho.
// - x is used in conjugated form.
//
// Each bli_<x><y><r>dotjs( x, y, a ) macro expands to
// bli_<y><x><r>axpyjs( y, x, a ): the first two arguments are forwarded in
// swapped order and the first two type characters are swapped to match, so
// each operand keeps its own type character.
// NOTE(review): the axpyjs macros are defined outside this section.

// -- third type character = s --
#define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a )
#define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a )
#define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a )
#define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a )
#define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a )
#define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a )
#define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a )
#define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a )
#define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a )
#define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a )
#define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a )
#define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a )
#define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a )
#define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a )
#define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a )
#define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a )

// -- third type character = d --
#define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a )
#define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a )
#define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a )
#define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a )
#define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a )
#define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a )
#define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a )
#define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a )
#define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a )
#define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a )
#define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a )
#define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a )
#define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a )
#define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a )
#define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a )
#define bli_zzddotjs( x, y, a ) bli_zzdaxpyjs( y, x, a )

// -- third type character = c --
#define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a )
#define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a )
#define bli_cscdotjs( x, y, a ) bli_sccaxpyjs( y, x, a )
#define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a )
#define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a )
#define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a )
#define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a )
#define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a )
#define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a )
#define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a )
#define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a )
#define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a )
#define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a )
#define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a )
#define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a )
#define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a )

// -- third type character = z --
#define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a )
#define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a )
#define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a )
#define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a )
#define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a )
#define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a )
#define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a )
#define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a )
#define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a )
#define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a )
#define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a )
#define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a )
#define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a )
#define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a )
#define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a )
#define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a )

// Single-character shorthands: all three operands share one type.
#define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a )
#define bli_ddotjs( x, y, a ) bli_ddddotjs( x, y, a )
#define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a )
#define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a )

#endif
// end bli_dotjs.h

// begin bli_eq.h
#ifndef BLIS_EQ_H
#define BLIS_EQ_H

// eq (passed by value)
// Expression macros that yield the result of comparing a and b with ==.
// Without BLIS_ENABLE_C99_COMPLEX the complex variants compare the real and
// imaginary parts separately through the bli_?real / bli_?imag accessors
// (defined elsewhere); with it, the operands are compared directly.
#define bli_seq( a, b ) ( (a) == (b) )
#define bli_deq( a, b ) ( (a) == (b) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) )
#define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_ceq( a, b ) ( (a) == (b) )
#define bli_zeq( a, b ) ( (a) == (b) )
#endif // BLIS_ENABLE_C99_COMPLEX
#define bli_ieq( a, b ) ( (a) == (b) )

// eqtori (passed by value)
// Compare a against a value given as a separate real part (br) and
// imaginary part (bi). The real-typed variants (s, d) compare against br
// only; bi does not appear in their expansion.
#define bli_seqtori( a, br, bi ) ( (a) == (br) )
#define bli_deqtori( a, br, bi ) ( (a) == (br) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) )
#define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )
#define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )
#endif // BLIS_ENABLE_C99_COMPLEX

// eqa (passed by address)
// Cast both pointers to the named element type, dereference them, and
// forward the values to the matching by-value comparison above.
#define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) )
#define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) )
#define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) )
#define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) )
#define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) )

// eq1
// Compare a against one (real part 1, imaginary part 0).
#define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F )
#define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 )
#define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F )
#define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 )
#define bli_ieq1( a ) bli_ieq ( (a), 1 )

// eq0
// Compare a against zero (real part 0, imaginary part 0).
#define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F )
#define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 )
#define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F )
#define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 )
#define bli_ieq0( a ) bli_ieq ( (a), 0 )

// eqm1
// Compare a against minus one (real part -1, imaginary part 0).
#define bli_seqm1( a ) bli_seqtori( (a), -1.0F, 0.0F )
#define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 )
#define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F )
#define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 )
#define bli_ieqm1( a ) bli_ieq ( (a), -1 )

#endif
// end bli_eq.h

// begin bli_fprints.h
#ifndef BLIS_FPRINTS_H
#define BLIS_FPRINTS_H

// prints
// Write the scalar x to the stream `file` with fprintf, using the caller's
// format string `spec`. The real and integer variants make one fprintf call.
// The complex variants apply `spec` to the real part, print " + ", apply
// `spec` to the imaginary part, then print a single trailing space.
// Each macro expands to a brace-enclosed block, not an expression.
#define bli_sfprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}
#define bli_dfprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}
#define bli_cfprints( file, spec, x ) \
{ \
    fprintf( file, spec, bli_creal(x) ); \
    fprintf( file, " + " ); \
    fprintf( file, spec, bli_cimag(x) ); \
    fprintf( file, " " ); \
}
#define bli_zfprints( file, spec, x ) \
{ \
    fprintf( file, spec, bli_zreal(x) ); \
    fprintf( file, " + " ); \
    fprintf( file, spec, bli_zimag(x) ); \
    fprintf( file, " " ); \
}
#define bli_ifprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}

#endif
// end bli_fprints.h

// begin bli_inverts.h
#ifndef BLIS_INVERTS_H
#define BLIS_INVERTS_H

// inverts

// Notes:
// - The first char encodes the type of x.
// - The s/d variants, and the c/z variants when BLIS_ENABLE_C99_COMPLEX is
//   not defined, pass the real and imaginary accessors of x to the
//   bli_?invertris macros (defined elsewhere).
// - With BLIS_ENABLE_C99_COMPLEX, the c/z variants overwrite x with 1 / x.

#define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) )
#define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) )
#define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_cinverts( x ) { (x) = 1.0F / (x); }
#define bli_zinverts( x ) { (x) = 1.0 / (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

#endif
// end bli_inverts.h

// begin bli_invscals.h
#ifndef BLIS_INVSCALS_H
#define BLIS_INVSCALS_H

// invscals

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Each bli_<a><y>invscals( a, y ) macro passes the real and imaginary
// accessors of a and y to a bli_*invscalris helper (defined elsewhere).
// The helper is chosen from the type of y; for complex y with a real a
// (s or d), the mixed helpers bli_scinvscalris / bli_dzinvscalris are used.

// -- y is real (s or d) --
#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- y is complex (c or z), component-wise helpers --
#define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscals( a, y ) bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- y is complex (c or z): divide y by a in place --
#define bli_scinvscals( a, y ) { (y) /= (a); }
#define bli_dcinvscals( a, y ) { (y) /= (a); }
#define bli_ccinvscals( a, y ) { (y) /= (a); }
#define bli_zcinvscals( a, y ) { (y) /= (a); }

#define bli_szinvscals( a, y ) { (y) /= (a); }
#define bli_dzinvscals( a, y ) { (y) /= (a); }
#define bli_czinvscals( a, y ) { (y) /= (a); }
#define bli_zzinvscals( a, y ) { (y) /= (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a and y share one type.
#define bli_sinvscals( a, y ) bli_ssinvscals( a, y )
#define bli_dinvscals( a, y ) bli_ddinvscals( a, y )
#define bli_cinvscals( a, y ) bli_ccinvscals( a, y )
#define bli_zinvscals( a, y ) bli_zzinvscals( a, y )

#endif
// end bli_invscals.h

// begin bli_invscaljs.h
#ifndef BLIS_INVSCALJS_H
#define BLIS_INVSCALJS_H

// invscaljs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - Same layout as the invscals table, but forwarding to the
//   bli_*invscaljris helpers (defined elsewhere).
// - With BLIS_ENABLE_C99_COMPLEX, a complex a (c or z) is conjugated with
//   conjf / conj before y is divided by it; a real a (s or d) is used as is.

// -- y is real (s or d) --
#define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- y is complex (c or z), component-wise helpers --
#define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- y is complex (c or z): divide y in place --
#define bli_scinvscaljs( a, y ) { (y) /= (a); }
#define bli_dcinvscaljs( a, y ) { (y) /= (a); }
#define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zcinvscaljs( a, y ) { (y) /= conj (a); }

#define bli_szinvscaljs( a, y ) { (y) /= (a); }
#define bli_dzinvscaljs( a, y ) { (y) /= (a); }
#define bli_czinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zzinvscaljs( a, y ) { (y) /= conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a and y share one type.
#define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y )
#define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y )
#define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y )
#define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y )

#endif
// end bli_invscaljs.h

// begin bli_neg2s.h
#ifndef BLIS_NEG2S_H
#define BLIS_NEG2S_H

// neg2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) { (y) = -(x); } #define bli_dcneg2s( x, y ) { (y) = -(x); } #define bli_ccneg2s( x, y ) { (y) = -(x); } #define bli_zcneg2s( x, y ) { (y) = -(x); } #define bli_szneg2s( x, y ) { (y) = -(x); } #define bli_dzneg2s( x, y ) { (y) = -(x); } #define bli_czneg2s( x, y ) { (y) = -(x); } #define bli_zzneg2s( x, y ) { (y) = 
-(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sneg2s( x, y ) bli_ssneg2s( x, y ) #define bli_dneg2s( x, y ) bli_ddneg2s( x, y ) #define bli_cneg2s( x, y ) bli_ccneg2s( x, y ) #define bli_zneg2s( x, y ) bli_zzneg2s( x, y ) #endif // end bli_neg2s.h // begin bli_rands.h #ifndef BLIS_RANDS_H #define BLIS_RANDS_H // rands #define bli_srands( a ) \ { \ (a) = ( float ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0F; \ } #define bli_drands( a ) \ { \ (a) = ( double ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0; \ } #define bli_crands( a ) \ { \ float ar, ai; \ \ bli_srands( ar ); \ bli_srands( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrands( a ) \ { \ double ar, ai; \ \ bli_drands( ar ); \ bli_drands( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_rands.h // begin bli_randnp2s.h #ifndef BLIS_RANDNP2S_H #define BLIS_RANDNP2S_H // randnp2s #define bli_srandnp2s( a ) \ { \ bli_drandnp2s( a ); \ } #if 0 #define bli_drandnp2s_prev( a ) \ { \ const double m_max = 3.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ if ( t == m_max2 ) t = t - 1.0; \ \ \ t = floor( t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_exp, s_val; \ \ \ PASTEMAC(d,rands)( s_exp ); \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \ else r_val = pow( 2.0, t - 1.0 ); \ \ \ if ( s_val < 0.0 ) r_val = -r_val; \ } \ \ \ r_val = r_val / pow( 2.0, m_max ); \ \ \ \ a = r_val; \ } #endif #define bli_drandnp2s( a ) \ { \ const double m_max = 6.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ do \ { \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ t = floor( t ); \ } \ \ while ( m_max2 <= t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_val; \ \ \ r_val = pow( 2.0, -(t - 1.0) ); \ \ \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_val < 0.0 ) r_val 
= -r_val; \ } \ \ \ \ a = r_val; \ } #define bli_crandnp2s( a ) \ { \ float ar, ai; \ \ bli_srandnp2s( ar ); \ bli_srandnp2s( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrandnp2s( a ) \ { \ double ar, ai; \ \ bli_drandnp2s( ar ); \ bli_drandnp2s( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_randnp2s.h // begin bli_scals.h #ifndef BLIS_SCALS_H #define BLIS_SCALS_H // scals // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), 
bli_zimag(y) ) #define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) { (y) *= (a); } #define bli_dcscals( a, y ) { (y) *= (a); } #define bli_ccscals( a, y ) { (y) *= (a); } #define bli_zcscals( a, y ) { (y) *= (a); } #define bli_szscals( a, y ) { (y) *= (a); } #define bli_dzscals( a, y ) { (y) *= (a); } #define bli_czscals( a, y ) { (y) *= (a); } #define bli_zzscals( a, y ) { (y) *= (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscals( a, y ) bli_ssscals( a, y ) #define bli_dscals( a, y ) bli_ddscals( a, y ) #define bli_cscals( a, y ) bli_ccscals( a, y ) #define bli_zscals( a, y ) bli_zzscals( a, y ) #endif // end bli_scals.h // begin bli_scaljs.h #ifndef BLIS_SCALJS_H #define BLIS_SCALJS_H // scaljs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscaljs( a, y ) 
bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) { (y) *= (a); } #define bli_dcscaljs( a, y ) { (y) *= (a); } #define bli_ccscaljs( a, y ) { (y) *= conjf(a); } #define bli_zcscaljs( a, y ) { (y) *= conj (a); } #define bli_szscaljs( a, y ) { (y) *= (a); } #define bli_dzscaljs( a, y ) { (y) *= (a); } #define bli_czscaljs( a, y ) { (y) *= conjf(a); } #define bli_zzscaljs( a, y ) { (y) *= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscaljs( a, y ) bli_ssscaljs( a, y ) #define bli_dscaljs( a, y ) bli_ddscaljs( a, y ) #define bli_cscaljs( a, y ) bli_ccscaljs( a, y ) #define bli_zscaljs( a, y ) bli_zzscaljs( a, y ) #endif // end bli_scaljs.h // begin bli_scalcjs.h #ifndef BLIS_SCALCJS_H #define BLIS_SCALCJS_H // scalcjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Each bli_<x><y>scalcjs( conjx, x, y ) macro passes conjx plus the real and
// imaginary accessors of x and y to a bli_*scalcjris helper (defined
// elsewhere) chosen from the type of y; for complex y with a real x, the
// mixed helpers bli_scscalcjris / bli_dzscalcjris are used.
// With BLIS_ENABLE_C99_COMPLEX, the complex-y variants multiply y in place
// by x, or by conjf(x) / conj(x) when x is complex and bli_is_conj( conjx )
// is true (bli_is_conj is defined elsewhere).

// -- y is real (s or d) --
#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- y is complex (c or z), component-wise helpers --
#define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- y is complex (c or z): multiply y in place --
// conjx does not appear in the expansion of the real-x variants.
#define bli_scscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#define bli_szscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: x and y share one type.
#define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y )
#define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y )
#define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y )
#define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y )

#endif
// end bli_scalcjs.h

// begin bli_scal2s.h
#ifndef BLIS_SCAL2S_H
#define BLIS_SCAL2S_H

// scal2s

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// Each bli_<a><x><y>scal2s( a, x, y ) macro passes the real and imaginary
// accessors of a, x and y to one of the bli_??scal2ris helpers (defined
// elsewhere). The helper chosen in the tables below is:
//   y real:     bli_roscal2ris when a and x are both complex,
//               bli_rxscal2ris otherwise.
//   y complex:  bli_rxscal2ris (a real,    x real),
//               bli_rcscal2ris (a complex, x real),
//               bli_crscal2ris (a real,    x complex),
//               bli_cxscal2ris (a complex, x complex).
// With BLIS_ENABLE_C99_COMPLEX, the complex-y variants assign (a) * (x)
// to y directly.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sczscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a, x and y share one type.
#define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y )
#define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y )
#define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y )
#define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y )

#endif
// end bli_scal2s.h

// begin bli_scal2js.h
#ifndef BLIS_SCAL2JS_H
#define BLIS_SCAL2JS_H

// scal2js

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// Mixed-datatype dispatch table for scal2js.
//
// Every bli_<a><x><y>scal2js( a, x, y ) macro below only forwards the real
// and imaginary accessors (bli_?real / bli_?imag) of its three operands to
// one of the bli_??scal2jris helpers, which are defined elsewhere. The
// helper chosen depends on which operands are complex:
//   - y real (s or d):  bli_rxscal2jris, except bli_roscal2jris when both
//                       a and x are complex (c or z).
//   - y complex (c/z):  bli_rxscal2jris (a real,    x real),
//                       bli_rcscal2jris (a complex, x real),
//                       bli_crscal2jris (a real,    x complex),
//                       bli_cxscal2jris (a complex, x complex).
// NOTE(review): the arithmetic itself lives in the *ris helpers; presumably
// y := a * conj(x) -- confirm against their definitions.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// The complex-output entries exist in two flavours: this part-wise one, used
// when BLIS_ENABLE_C99_COMPLEX is not defined, and a native-arithmetic one in
// the matching #else branch.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
// Remaining (??z) entries of the part-wise (non-C99-complex) scal2js table:
// x is complex here, so a real a selects bli_crscal2jris and a complex a
// selects bli_cxscal2jris.
#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex enabled the complex-output entries are written with
// native arithmetic: y = a * x when x is real (second char s or d), and
// y = a * conjugate(x) when x is complex, using conjf() for a
// single-precision complex x (second char c) and conj() for a
// double-precision complex x (second char z).

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); }

#define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); }

#define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter shorthands: all three operands share one datatype.
#define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y )
#define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y )
#define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y )
#define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y )

#endif
// end bli_scal2js.h
// begin bli_set0s.h

#ifndef BLIS_SET0S_H
#define BLIS_SET0S_H

// set0s: pass (0, 0, a) to the per-type bli_?sets macro (defined elsewhere).
// The zeros are float literals for s/c and double literals for d/z.
// NOTE(review): presumably this assigns real = 0 and imag = 0 to a -- confirm
// against bli_?sets.
#define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) )
#define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) )
#define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) )
#define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) )

#endif
// end bli_set0s.h
// begin bli_set1s.h

#ifndef BLIS_SET1S_H
#define BLIS_SET1S_H

// set1s: pass (1, 0, a) to the per-type bli_?sets macro (defined elsewhere).
// NOTE(review): presumably this assigns real = 1 and imag = 0 to a -- confirm
// against bli_?sets.
#define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) )
#define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) )
#define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) )
#define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) )
#endif
// end bli_set1s.h
// begin bli_seti0s.h

#ifndef BLIS_SETI0S_H
#define BLIS_SETI0S_H

// seti0s: pass (0, a) to the per-type bli_?setis macro (defined elsewhere).
// The zero is a float literal for s/c and a double literal for d/z.
// NOTE(review): presumably this zeroes only the imaginary part of a --
// confirm against bli_?setis.
#define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) )
#define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) )
#define bli_cseti0s( a ) bli_csetis( 0.0F, (a) )
#define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) )

#endif
// end bli_seti0s.h
// begin bli_sqrt2s.h

#ifndef BLIS_SQRT2S_H
#define BLIS_SQRT2S_H

// sqrt2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Part-wise variant: forward the real/imaginary accessors of x and a to a
// bli_*sqrt2ris helper (defined elsewhere). The helper is picked from the
// type of the output a, and for a complex a also from whether x is real:
//   a = s -> bli_ssqrt2ris          a = d -> bli_dsqrt2ris
//   a = c -> bli_scsqrt2ris (x real) or bli_csqrt2ris (x complex)
//   a = z -> bli_dzsqrt2ris (x real) or bli_zsqrt2ris (x complex)

#define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) )
#define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) )
#define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) )
#define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) )

#define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) )
#define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) )

#define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99-complex variant: call the libm routine matching the type of x
// (sqrtf / sqrt for real x, csqrtf / csqrt for complex x) and cast the
// result to the type of a. When x is complex and a is real, only the real
// part of the complex root (via bli_creal / bli_zreal) is stored.

#define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; }
#define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; }
#define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); }
#define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); }

#define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; }
#define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; }
#define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); }
#define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); }

#define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; }
#define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; }
#define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; }
#define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; }

#define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; }
#define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; }
#define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; }
#define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter shorthands: x and a share one datatype.
#define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a )
#define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a )
#define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a )
#define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a )

#endif
// end bli_sqrt2s.h
// begin bli_subs.h

#ifndef BLIS_SUBS_H
#define BLIS_SUBS_H

// subs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) { (y) -= (a); } #define bli_dcsubs( a, y ) { (y) -= (a); } #define bli_ccsubs( a, y ) { (y) -= (a); } #define bli_zcsubs( a, y ) { (y) -= (a); } #define bli_szsubs( a, y ) { (y) -= (a); } #define bli_dzsubs( a, y ) { (y) -= (a); } #define bli_czsubs( a, y ) { (y) -= (a); } #define bli_zzsubs( a, y ) { (y) -= (a); } #endif // 
BLIS_ENABLE_C99_COMPLEX #define bli_ssubs( a, y ) bli_sssubs( a, y ) #define bli_dsubs( a, y ) bli_ddsubs( a, y ) #define bli_csubs( a, y ) bli_ccsubs( a, y ) #define bli_zsubs( a, y ) bli_zzsubs( a, y ) #endif // end bli_subs.h // begin bli_subjs.h #ifndef BLIS_SUBJS_H #define BLIS_SUBJS_H // subjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubjs( a, y ) bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), 
bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) { (y) -= (a); } #define bli_dcsubjs( a, y ) { (y) -= (a); } #define bli_ccsubjs( a, y ) { (y) -= conjf(a); } #define bli_zcsubjs( a, y ) { (y) -= conj (a); } #define bli_szsubjs( a, y ) { (y) -= (a); } #define bli_dzsubjs( a, y ) { (y) -= (a); } #define bli_czsubjs( a, y ) { (y) -= conjf(a); } #define bli_zzsubjs( a, y ) { (y) -= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssubjs( a, y ) bli_sssubjs( a, y ) #define bli_dsubjs( a, y ) bli_ddsubjs( a, y ) #define bli_csubjs( a, y ) bli_ccsubjs( a, y ) #define bli_zsubjs( a, y ) bli_zzsubjs( a, y ) #endif // end bli_subjs.h // begin bli_swaps.h #ifndef BLIS_SWAPS_H #define BLIS_SWAPS_H // swaps // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_ssswaps( x, y ) \ { \ float w; \ bli_sscopys( (y), (w) ); \ bli_sscopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dsswaps( x, y ) \ { \ double w; \ bli_sdcopys( (y), (w) ); \ bli_dscopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_csswaps( x, y ) \ { \ scomplex w; \ bli_sccopys( (y), (w) ); \ bli_cscopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zsswaps( x, y ) \ { \ dcomplex w; \ bli_szcopys( (y), (w) ); \ bli_zscopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sdswaps( x, y ) \ { \ float w; \ bli_dscopys( (y), (w) ); \ bli_sdcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_ddswaps( x, y ) \ { \ double w; \ bli_ddcopys( (y), (w) ); \ bli_ddcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_cdswaps( x, y ) \ { \ scomplex w; \ bli_dccopys( (y), (w) ); \ bli_cdcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zdswaps( x, y ) \ { \ dcomplex w; \ bli_dzcopys( (y), (w) ); \ bli_zdcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_scswaps( x, y ) \ { \ float w; \ bli_cscopys( (y), (w) ); \ bli_sccopys( (x), (y) ); \ bli_sscopys( (w), 
(x) ); \ } #define bli_dcswaps( x, y ) \ { \ double w; \ bli_cdcopys( (y), (w) ); \ bli_dccopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_ccswaps( x, y ) \ { \ scomplex w; \ bli_cccopys( (y), (w) ); \ bli_cccopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zcswaps( x, y ) \ { \ dcomplex w; \ bli_czcopys( (y), (w) ); \ bli_zccopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_szswaps( x, y ) \ { \ float w; \ bli_zscopys( (y), (w) ); \ bli_szcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dzswaps( x, y ) \ { \ double w; \ bli_zdcopys( (y), (w) ); \ bli_dzcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_czswaps( x, y ) \ { \ scomplex w; \ bli_zccopys( (y), (w) ); \ bli_czcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zzswaps( x, y ) \ { \ dcomplex w; \ bli_zzcopys( (y), (w) ); \ bli_zzcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sswaps( x, y ) bli_ssswaps( x, y ) #define bli_dswaps( x, y ) bli_ddswaps( x, y ) #define bli_cswaps( x, y ) bli_ccswaps( x, y ) #define bli_zswaps( x, y ) bli_zzswaps( x, y ) #endif // end bli_swaps.h // begin bli_xpbys.h #ifndef BLIS_XPBYS_H #define BLIS_XPBYS_H // xpbys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// Mixed-datatype dispatch table for xpbys, real-output part.
//
// Every bli_<x><b><y>xpbys( x, b, y ) macro in the (??s) and (??d) sections
// forwards the real and imaginary accessors (bli_?real / bli_?imag) of x, b
// and y, in that order, to bli_rxxpbyris, which is defined elsewhere. The
// same helper is used for every x/b type combination when y is real.
// NOTE(review): presumably y := x + b * y (the "xpby" naming) -- confirm
// against bli_rxxpbyris.

// -- (xby) = (??s) ------------------------------------------------------------

#define bli_sssxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dssxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cssxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zssxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )

#define bli_sdsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )

#define bli_scsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )

#define bli_szsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )

#define bli_sddxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dddxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cddxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zddxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_scdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_szdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
// Tail of the (xby) = (??d) group: b is dcomplex, y is double.
#define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )

// Complex-y variants. Without C99 complex support every macro forwards the
// real/imaginary parts of x, b and y to one of three helpers (defined
// elsewhere). The helper is picked from the operand types as follows:
//   - b complex          -> bli_cxxpbyris
//   - b real, x complex  -> bli_crxpbyris
//   - b real, x real     -> bli_rxxpbyris
// With C99 complex support (the #else branch) the operation is spelled out
// directly as y = x + b * y.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )

#define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )

#define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zccxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )

#define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zszxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With native complex arithmetic every type combination is the same
// expression; the compiler's usual arithmetic conversions handle the mixing.

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: x, b and y all share one datatype.
#define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y )
#define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y )
#define bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y )
#define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y )

#endif
// end bli_xpbys.h
// begin bli_xpbyjs.h


#ifndef BLIS_XPBYJS_H
#define BLIS_XPBYJS_H

// xpbyjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; 
++ii ) bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); 
} else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( 
dim_t jj = 0; jj < n; ++jj ) bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } 
else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( 
dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end bli_copys_mxn.h // begin bli_scal2s_mxn.h #ifndef BLIS_SCAL2S_MXN_H #define BLIS_SCAL2S_MXN_H // scal2s_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \ ctype* restrict y, const inc_t rs_y, const inc_t cs_y \ ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( scal2s_mxn ) #endif // end bli_scal2s_mxn.h // begin bli_xpbys_mxn.h #ifndef BLIS_XPBYS_MXN_H #define BLIS_XPBYS_MXN_H // xpbys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (?ss) ------------------------------------------------------------ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?dd) ------------------------------------------------------------ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?cc) ------------------------------------------------------------ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?zz) ------------------------------------------------------------ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } #endif // end bli_xpbys_mxn.h // begin bli_xpbys_mxn_uplo.h #ifndef BLIS_XPBYS_MXN_UPLO_H #define BLIS_XPBYS_MXN_UPLO_H // xpbys_mxn_u #define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 
0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } // xpbys_mxn_l #define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } 
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_l( diagoff, m, n, 
x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #endif // end bli_xpbys_mxn_uplo.h // -- "broadcast B" scalar macros -- // begin bli_bcastbbs_mxn.h #ifndef BLIS_BCASTBBS_MXN_H #define BLIS_BCASTBBS_MXN_H // bcastbbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = ldy; \ const dim_t ds_y = 1; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yi = y + i*incy; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yij = yi + j*ldy; \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( bcastbbs_mxn ) #endif // end bli_bcastbbs_mxn.h // begin bli_scal2bbs_mxn.h #ifndef BLIS_SCAL2BBS_MXN_H #define BLIS_SCAL2BBS_MXN_H // scal2bbs_mxn #undef GENTFUNCRO #define GENTFUNCRO( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( 
*yij, *yijd ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } \ } INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ const inc_t incx2 = 2 * incx; \ const inc_t ldx2 = 2 * ldx; \ \ const inc_t incy2 = 2 * incy; \ const inc_t ldy2 = 2 * ldy; \ \ ctype_r* restrict alpha_r = ( ctype_r* )alpha; \ ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \ ctype_r* restrict chi_r = ( ctype_r* )x; \ ctype_r* restrict chi_i = ( ctype_r* )x + 1; \ ctype_r* restrict psi_r = ( ctype_r* )y; \ ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ 
PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ } INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn ) #endif // end bli_scal2bbs_mxn.h // begin bli_set0bbs_mxn.h #ifndef BLIS_SET0BBS_MXN_H #define BLIS_SET0BBS_MXN_H // set0bbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yij = yj + i*incy; \ \ for ( dim_t p = 0; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,set0s)( *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( set0bbs_mxn ) #endif // end bli_set0bbs_mxn.h // -- 1m-specific scalar macros -- // 1e // begin bli_copy1es.h #ifndef BLIS_COPY1ES_H #define BLIS_COPY1ES_H // copy1es // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Every variant in which the source (first char) or the destination (second
// char) is a real type expands to an empty block.

#define bli_sscopy1es( a, bri, bir ) {}
#define bli_dscopy1es( a, bri, bir ) {}
#define bli_cscopy1es( a, bri, bir ) {}
#define bli_zscopy1es( a, bri, bir ) {}

#define bli_sdcopy1es( a, bri, bir ) {}
#define bli_ddcopy1es( a, bri, bir ) {}
#define bli_cdcopy1es( a, bri, bir ) {}
#define bli_zdcopy1es( a, bri, bir ) {}

#define bli_sccopy1es( a, bri, bir ) {}
#define bli_dccopy1es( a, bri, bir ) {}
// Complex-to-complex variants write two destinations from one source a:
//   bri is given (  real(a), imag(a) )
//   bir is given ( -imag(a), real(a) )   -- the components of i * a
// NOTE(review): assumes bli_??copyris( ar, ai, br, bi ) assigns br = ar and
// bi = ai; that macro is defined outside this section -- confirm.
#define bli_cccopy1es( a, bri, bir ) \
{ \
    bli_cccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopy1es( a, bri, bir ) \
{ \
    bli_zccopyris( bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopy1es( a, bri, bir ) {}
#define bli_dzcopy1es( a, bri, bir ) {}
#define bli_czcopy1es( a, bri, bir ) \
{ \
    bli_czcopyris( bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopy1es( a, bri, bir ) \
{ \
    bli_zzcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-type shorthands for the homogeneous variants.
#define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir )
#define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir )

#endif
// end bli_copy1es.h
// begin bli_copyj1es.h


#ifndef BLIS_COPYJ1ES_H
#define BLIS_COPYJ1ES_H

// copyj1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Every variant in which the source (first char) or the destination (second
// char) is a real type expands to an empty block.

#define bli_sscopyj1es( a, bri, bir ) {}
#define bli_dscopyj1es( a, bri, bir ) {}
#define bli_cscopyj1es( a, bri, bir ) {}
#define bli_zscopyj1es( a, bri, bir ) {}

#define bli_sdcopyj1es( a, bri, bir ) {}
#define bli_ddcopyj1es( a, bri, bir ) {}
#define bli_cdcopyj1es( a, bri, bir ) {}
#define bli_zdcopyj1es( a, bri, bir ) {}

#define bli_sccopyj1es( a, bri, bir ) {}
#define bli_dccopyj1es( a, bri, bir ) {}
// Conjugating counterpart of copy1es; the imaginary part of a is negated
// before it is stored:
//   bri is given ( real(a), -imag(a) )   -- conj(a)
//   bir is given ( imag(a),  real(a) )   -- the components of i * conj(a)
// NOTE(review): assumes bli_??copyris( ar, ai, br, bi ) assigns br = ar and
// bi = ai; that macro is defined outside this section -- confirm.
#define bli_cccopyj1es( a, bri, bir ) \
{ \
    bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_cccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopyj1es( a, bri, bir ) \
{ \
    bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_zccopyris( bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopyj1es( a, bri, bir ) {}
#define bli_dzcopyj1es( a, bri, bir ) {}
#define bli_czcopyj1es( a, bri, bir ) \
{ \
    bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_czcopyris( bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopyj1es( a, bri, bir ) \
{ \
    bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_zzcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-type shorthands for the homogeneous variants.
#define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir )
#define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir )

#endif
// end bli_copyj1es.h
// begin bli_invert1es.h


#ifndef BLIS_INVERT1ES_H
#define BLIS_INVERT1ES_H

// invert1es
//
// Inverts bri in place, then refreshes bir from the new bri. The destination
// arguments of the copy are passed imaginary-first, so (with positional
// assignment in bli_?copyris -- confirm) imag(bir) receives real(bri) and
// real(bir) receives -imag(bri).

#define bli_cinvert1es( bri, bir ) \
{ \
    bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \
    bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \
}

#define bli_zinvert1es( bri, bir ) \
{ \
    bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \
    bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \
}

#endif
// end bli_invert1es.h
// begin bli_scal1es.h


#ifndef BLIS_SCAL1ES_H
#define BLIS_SCAL1ES_H

// scal1es
//
// Scales yri in place by a (through bli_?scalris), then rebuilds yir from the
// updated yri as ( -imag(yri), real(yri) ).
// NOTE(review): assumes bli_?copyris( ar, ai, br, bi ) assigns br = ar and
// bi = ai; that macro is defined outside this section -- confirm.

#define bli_cscal1es( a, yri, yir ) \
{ \
    bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \
    bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \
}

#define bli_zscal1es( a, yri, yir ) \
{ \
    bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \
    bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \
}

#endif
// end bli_scal1es.h
// begin bli_scal21es.h


#ifndef BLIS_SCAL21ES_H
#define BLIS_SCAL21ES_H

// scal21es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
//
// Every non-empty variant issues two bli_cxscal2ris calls with the same a:
//   yri is computed from a and (  real(x), imag(x) )
//   yir is computed from a and ( -imag(x), real(x) )   -- the components of i * x
// Variants with a real destination, or with both a and x real, are empty.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal21es( a, x, yri, yir ) {}
#define bli_sdsscal21es( a, x, yri, yir ) {}
#define bli_scsscal21es( a, x, yri, yir ) {}
#define bli_szsscal21es( a, x, yri, yir ) {}

#define bli_dssscal21es( a, x, yri, yir ) {}
#define bli_ddsscal21es( a, x, yri, yir ) {}
#define bli_dcsscal21es( a, x, yri, yir ) {}
#define bli_dzsscal21es( a, x, yri, yir ) {}

#define bli_cssscal21es( a, x, yri, yir ) {}
#define bli_cdsscal21es( a, x, yri, yir ) {}
#define bli_ccsscal21es( a, x, yri, yir ) {}
#define bli_czsscal21es( a, x, yri, yir ) {}

#define bli_zssscal21es( a, x, yri, yir ) {}
#define bli_zdsscal21es( a, x, yri, yir ) {}
#define bli_zcsscal21es( a, x, yri, yir ) {}
#define bli_zzsscal21es( a, x, yri, yir ) {}

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal21es( a, x, yri, yir ) {}
#define bli_sddscal21es( a, x, yri, yir ) {}
#define bli_scdscal21es( a, x, yri, yir ) {}
#define bli_szdscal21es( a, x, yri, yir ) {}

#define bli_dsdscal21es( a, x, yri, yir ) {}
#define bli_dddscal21es( a, x, yri, yir ) {}
#define bli_dcdscal21es( a, x, yri, yir ) {}
#define bli_dzdscal21es( a, x, yri, yir ) {}

#define bli_csdscal21es( a, x, yri, yir ) {}
#define bli_cddscal21es( a, x, yri, yir ) {}
#define bli_ccdscal21es( a, x, yri, yir ) {}
#define bli_czdscal21es( a, x, yri, yir ) {}

#define bli_zsdscal21es( a, x, yri, yir ) {}
#define bli_zddscal21es( a, x, yri, yir ) {}
#define bli_zcdscal21es( a, x, yri, yir ) {}
#define bli_zzdscal21es( a, x, yri, yir ) {}

// -- (axy) = (??c) ------------------------------------------------------------

// NOTE(review): the variants in this section access their scomplex
// destinations through bli_zreal/bli_zimag rather than bli_creal/bli_cimag.
// That is harmless only if the c and z accessors expand identically -- confirm.

#define bli_sscscal21es( a, x, yri, yir ) {}
#define bli_sdcscal21es( a, x, yri, yir ) {}
#define bli_sccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dscscal21es( a, x, yri, yir ) {}
#define bli_ddcscal21es( a, x, yri, yir ) {}
#define bli_dccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cscscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zscscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal21es( a, x, yri, yir ) {}
#define bli_sdzscal21es( a, x, yri, yir ) {}
#define bli_sczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dszscal21es( a, x, yri, yir ) {}
#define bli_ddzscal21es( a, x, yri, yir ) {}
#define bli_dczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cszscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zszscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// Single-type shorthands for the homogeneous variants.
#define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir )
#define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir )

#endif
// end bli_scal21es.h
// begin bli_scal2j1es.h


#ifndef BLIS_SCAL2J1ES_H
#define BLIS_SCAL2J1ES_H

// scal2j1es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) #define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) #endif // end bli_scal2j1es.h // 1r // begin bli_copy1rs.h #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copy1rs.h // begin bli_copyj1rs.h #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copyj1rs.h // begin bli_invert1rs.h #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif // end bli_invert1rs.h // begin bli_scal1rs.h #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), 
bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif // end bli_scal1rs.h // begin bli_scal21rs.h #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif // end bli_scal21rs.h // begin bli_scal2j1rs.h #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif // end bli_scal2j1rs.h // 1m (1e or 1r) // begin bli_invert1ms_mxn_diag.h #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t 
i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_invert1ms_mxn_diag.h // begin bli_scal1ms_mxn.h #ifndef BLIS_SCAL1MS_MXN_H #define BLIS_SCAL1MS_MXN_H // scal1ms_mxn #define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; 
\ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #endif // end bli_scal1ms_mxn.h // begin bli_scal21ms_mxn.h #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } 
else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), 
*(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif // end bli_scal21ms_mxn.h // begin bli_scal21ms_mxn_diag.h #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_scal21ms_mxn_diag.h // begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING //thread communicators may be implementation dependent #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_single.h // begin bli_thrcomm_openmp.h #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H // Define thrcomm_t for situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #include // skipped // Define thrcomm_t for tree barriers and non-tree barriers. #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. 
//volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif // end bli_thrcomm_openmp.h // begin bli_thrcomm_pthreads.h #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H // Define thrcomm_t for situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS #ifdef BLIS_USE_PTHREAD_BARRIER struct thrcomm_s { void* sent_object; dim_t n_threads; bli_pthread_barrier_t barrier; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_pthreads.h // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. 
// NOTE(review): the create/free pair takes a rntm_t while init/cleanup
// operate on a caller-provided thrcomm_t -- presumably create/free also
// manage the object's storage; confirm against the implementation.
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads );
void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm );
void bli_thrcomm_cleanup( thrcomm_t* comm );

// Exported barrier and broadcast operations on a communicator.
BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );

void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );

#endif
// end bli_thrcomm.h

// Include thread info (thrinfo_t) object definitions and prototypes.
// begin bli_thrinfo.h

#ifndef BLIS_THRINFO_H
#define BLIS_THRINFO_H

// Thread info structure definition
struct thrinfo_s
{
	// The thread communicator for the other threads sharing the same work
	// at this level.
	thrcomm_t* ocomm;

	// Our thread id within the ocomm thread communicator.
	dim_t ocomm_id;

	// The number of distinct threads used to parallelize the loop.
	dim_t n_way;

	// What we're working on.
	dim_t work_id;

	// When freeing, should the communicators in this node be freed? Usually,
	// this field is true, but when nodes are created that share the same
	// communicators as other nodes (such as with packm nodes), this is set
	// to false.
	bool free_comm;

	// The bszid_t to help identify the node. This is mostly only useful when
	// debugging or tracing the allocation and release of thrinfo_t nodes.
	bszid_t bszid;

	// Links to further thrinfo_t nodes; each has its own getter/setter pair
	// among the thrinfo_t functions declared after this struct.
	struct thrinfo_s* sub_prenode;
	struct thrinfo_s* sub_node;
};
typedef struct thrinfo_s thrinfo_t;

//
// thrinfo_t functions
// NOTE: The naming of these should be made consistent at some point.
// (ie: bli_thrinfo_ vs.
bli_thread_) // // thrinfo_t query (field only) BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) { return (t->ocomm)->n_threads; } BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) { return t->ocomm_id; } BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) { return t->n_way; } BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t ) { return t->work_id; } BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) { return t->ocomm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) { return t->free_comm; } BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) { return t->bszid; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) { return t->sub_node; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) { return t->ocomm_id == 0; } // thrinfo_t modification BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) { t->ocomm = ocomm; } BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) { t->ocomm_id = ocomm_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) { t->n_way = n_way; } BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t ) { t->work_id = work_id; } BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) { t->free_comm = free_comm; } BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) { t->bszid = bszid; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) { t->sub_node = sub_node; } BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) { t->sub_prenode = sub_prenode; } // other thrinfo_t-related functions BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } // // Prototypes 
// for level-3 thrinfo functions not specific to any operation.
//

// NOTE(review): the parameters of create/init mirror the fields of
// struct thrinfo_s (ocomm, ocomm_id, n_way, work_id, free_comm, bszid,
// sub_node); create additionally takes a rntm_t and returns a thrinfo_t*,
// while init fills a caller-provided node. Presumably create allocates --
// confirm against the implementation.
thrinfo_t* bli_thrinfo_create
     (
       rntm_t*    rntm,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init
     (
       thrinfo_t* thread,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init_single
     (
       thrinfo_t* thread
     );

void bli_thrinfo_free
     (
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// -----------------------------------------------------------------------------

// Functions that take a control-tree node (cntl_t) alongside a thrinfo_t.
void bli_thrinfo_grow
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_rgrow
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_rgrow_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

// -----------------------------------------------------------------------------

// Disabled prototypes (compiled out).
#if 0
void bli_thrinfo_grow_tree
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

void bli_thrinfo_grow_tree_ic
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );
#endif

#endif
// end bli_thrinfo.h
// begin bli_thrinfo_sup.h

#ifndef BLIS_THRINFO_SUP_H
#define BLIS_THRINFO_SUP_H

//
// Prototypes for level-3 thrinfo sup functions.
//

// These parallel bli_thrinfo_grow / _rgrow / _create_for_cntl, but take
// bszid_t pointers where those take cntl_t pointers.
void bli_thrinfo_sup_grow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_sup_rgrow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_sup_create_for_cntl
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_chl,
       thrinfo_t* thread_par
     );

#endif
// end bli_thrinfo_sup.h

// Include some operation-specific thrinfo_t prototypes.
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
// Takes the level-3 internal function to run (func, of type l3int_t), an
// operation family id, the operands alpha/a/b/beta/c, and the context,
// runtime and control-tree objects.
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// Include definitions specific to the method of multithreading for the
// conventional code path.
// begin bli_l3_decor_single.h


#ifndef BLIS_L3_DECOR_SINGLE_H
#define BLIS_L3_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (This section is currently empty.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif

// end bli_l3_decor_single.h
// begin bli_l3_decor_openmp.h


#ifndef BLIS_L3_DECOR_OPENMP_H
#define BLIS_L3_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

// Declared only when BLIS_ENABLE_OPENMP is defined.
void bli_l3_thread_decorator_thread_check
     (
       dim_t      n_threads,
       dim_t      tid,
       thrcomm_t* gl_comm,
       rntm_t*    rntm
     );

#endif

#endif

// end bli_l3_decor_openmp.h
// begin bli_l3_decor_pthreads.h


#ifndef BLIS_L3_DECOR_PTHREADS_H
#define BLIS_L3_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
void* bli_l3_thread_entry( void* data_void );

#endif

#endif

// end bli_l3_decor_pthreads.h

#endif

// end bli_l3_decor.h

// Include the level-3 thread decorator and related definitions and prototypes
// for the sup code path.
// begin bli_l3_sup_decor.h


#ifndef BLIS_L3_SUP_DECOR_H
#define BLIS_L3_SUP_DECOR_H

// -- sup definitions ----------------------------------------------------------

// Level-3 sup internal function type.
// Unlike l3int_t, this type returns err_t and takes no cntl_t* argument.
typedef err_t (*l3supint_t)
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// Level-3 sup thread decorator prototype.
err_t bli_l3_sup_thread_decorator
     (
       l3supint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     );

// Include definitions specific to the method of multithreading for the
// sup code path.
// begin bli_l3_sup_decor_single.h


#ifndef BLIS_L3_SUP_DECOR_SINGLE_H
#define BLIS_L3_SUP_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (This section is currently empty.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif

// end bli_l3_sup_decor_single.h
// begin bli_l3_sup_decor_openmp.h


#ifndef BLIS_L3_SUP_DECOR_OPENMP_H
#define BLIS_L3_SUP_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
// (This section is currently empty.)
#ifdef BLIS_ENABLE_OPENMP

#endif

#endif

// end bli_l3_sup_decor_openmp.h
// begin bli_l3_sup_decor_pthreads.h


#ifndef BLIS_L3_SUP_DECOR_PTHREADS_H
#define BLIS_L3_SUP_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
void* bli_l3_sup_thread_entry( void* data_void );

#endif

#endif

// end bli_l3_sup_decor_pthreads.h

#endif

// end bli_l3_sup_decor.h

// Initialization-related prototypes.
void bli_thread_init( void );
void bli_thread_finalize( void );

// Thread range-related prototypes.
BLIS_EXPORT_BLIS void bli_thread_range_sub ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end ); #undef GENPROT #define GENPROT( opname ) \ \ siz_t PASTEMAC0( opname ) \ ( \ dir_t direct, \ thrinfo_t* thr, \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntl_t* cntl, \ cntx_t* cntx, \ dim_t* start, \ dim_t* end \ ); GENPROT( thread_range_mdim ) GENPROT( thread_range_ndim ) #undef GENPROT #define GENPROT( opname ) \ \ siz_t PASTEMAC0( opname ) \ ( \ thrinfo_t* thr, \ obj_t* a, \ blksz_t* bmult, \ dim_t* start, \ dim_t* end \ ); GENPROT( thread_range_l2r ) GENPROT( thread_range_r2l ) GENPROT( thread_range_t2b ) GENPROT( thread_range_b2t ) GENPROT( thread_range_weighted_l2r ) GENPROT( thread_range_weighted_r2l ) GENPROT( thread_range_weighted_t2b ) GENPROT( thread_range_weighted_b2t ) dim_t bli_thread_range_width_l ( doff_t diagoff_j, dim_t m, dim_t n_j, dim_t j, dim_t n_way, dim_t bf, dim_t bf_left, double area_per_thr, bool handle_edge_low ); siz_t bli_find_area_trap_l ( dim_t m, dim_t n, doff_t diagoff ); siz_t bli_thread_range_weighted_sub ( thrinfo_t* restrict thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* restrict j_start_thr, dim_t* restrict j_end_thr ); // ----------------------------------------------------------------------------- // Factorization and partitioning prototypes typedef struct { dim_t n; dim_t sqrt_n; dim_t f; } bli_prime_factors_t; void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors); dim_t bli_next_prime_factor(bli_prime_factors_t* factors); bool bli_is_prime( dim_t n ); void bli_thread_partition_2x2 ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_slow ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_fast ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); // 
----------------------------------------------------------------------------- dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); // ----------------------------------------------------------------------------- BLIS_INLINE void bli_thread_range_jrir_rr ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; } BLIS_INLINE void bli_thread_range_jrir_sl ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use contiguous slab partitioning of jr/ir loops. bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); *inc = 1; } BLIS_INLINE void bli_thread_range_jrir ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Define a general-purpose version of bli_thread_range_jrir() whose // definition depends on whether slab or round-robin partitioning was // requested at configure-time. 
#ifdef BLIS_ENABLE_JRIR_SLAB bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); #else bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); #endif } #if 0 BLIS_INLINE void bli_thread_range_weighted_jrir ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { #ifdef BLIS_ENABLE_JRIR_SLAB // Use contiguous slab partitioning for jr/ir loops. bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, handle_edge_low, start, end ); *start = *start / bf; *inc = 1; if ( *end % bf ) *end = *end / bf + 1; else *end = *end / bf; #else // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; #endif } #endif #endif // end bli_thread.h // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Constant definitions -- // begin bli_extern_defs.h #ifndef BLIS_EXTERN_DEFS_H #define BLIS_EXTERN_DEFS_H BLIS_EXPORT_BLIS extern obj_t BLIS_TWO; BLIS_EXPORT_BLIS extern obj_t BLIS_ONE; //BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO; //BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO; BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif // end bli_extern_defs.h // -- BLIS architecture/kernel definitions -- // begin bli_l1v_ker_prot.h // // Define template prototypes for level-1v kernels. 
//

// Each *_KER_PROT( ctype, ch, opname ) macro below expands to one complete
// function prototype, terminating semicolon included. ctype is substituted
// for the operand element type; ch and opname are handed to PASTEMAC
// (defined elsewhere) to form the function name.
//
// NOTE: No macro ends with a trailing line continuation after its closing
// ");", so none of them depends on being followed by a blank line.

#define ADDV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define AMAXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t* restrict cntx  \
     );


#define AXPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define AXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define COPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define DOTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );


#define DOTXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );


#define INVERTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );


#define SCALV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );


#define SCAL2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define SETV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );


#define SUBV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define SWAPV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define XPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// end bli_l1v_ker_prot.h
// begin bli_l1f_ker_prot.h


//
// Define template prototypes for level-1f kernels.
// #define AXPY2V_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alphax, \ ctype* restrict alphay, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define AXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define DOTAXPYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); // end bli_l1f_ker_prot.h // begin bli_l1m_ker_prot.h // // Define template prototypes for level-1m kernels. 
//

// Each *_KER_PROT / *_UKR_PROT macro in this section expands to one complete
// function prototype, terminating semicolon included. The ctype argument(s)
// are substituted for the operand element type(s); ch and the name argument
// are handed to PASTEMAC (defined elsewhere) to form the function name.

// native packm kernels

#define PACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     );


// native unpackm kernels

#define UNPACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       dim_t            n, \
       ctype*  restrict kappa, \
       ctype*  restrict p,             inc_t ldp, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       cntx_t* restrict cntx  \
     );


// 1e/1r packm kernels

// Same parameter list as PACKM_KER_PROT.
#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     );

// end bli_l1m_ker_prot.h
// begin bli_l3_ukr_prot.h


//
// Define template prototypes for level-3 micro-kernels.
//

// GEMM_UKR_PROT is the single-type form: it forwards to GEMM_UKR_PROT2 with
// ctype used as both the input and the output type.
#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)

// ctype_in is used for a and b; ctype_out is used for alpha, beta and c.
#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype_out* restrict alpha, \
       ctype_in*  restrict a, \
       ctype_in*  restrict b, \
       ctype_out* restrict beta, \
       ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );


#define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a1x, \
       ctype*     restrict a11, \
       ctype*     restrict bx1, \
       ctype*     restrict b11, \
       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );


#define TRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

// end bli_l3_ukr_prot.h
// begin bli_l3_sup_ker_prot.h


//
// Define template prototypes for level-3 kernels on small/unpacked matrices.
//

// Unlike the micro-kernel prototypes above, a and b each carry their own
// row and column stride arguments here.
#define GEMMSUP_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

// end bli_l3_sup_ker_prot.h

// begin bli_arch_config_pre.h


#ifndef BLIS_ARCH_CONFIG_PRE_H
#define BLIS_ARCH_CONFIG_PRE_H


// -- Naming-related kernel definitions ----------------------------------------

// The default suffix appended to reference kernels.
#define BLIS_REF_SUFFIX  _ref

// A suffix used for labeling certain induced method aware functions.
#define BLIS_IND_SUFFIX  _ind

// Add an underscore to the BLIS kernel set string, if it was defined.
// (BLIS_CNAME_INFIX is left undefined when BLIS_CNAME is not defined.)
#ifdef  BLIS_CNAME
#define BLIS_CNAME_INFIX  PASTECH(_,BLIS_CNAME)
#endif

// Combine the CNAME and _ref for convenience to the code that defines
// reference kernels.
//#define BLIS_CNAME_REF_SUFFIX  PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX)

// -- Prototype-generating macro definitions -----------------------------------

// Prototype-generating macro for bli_cntx_init_*() functions.
// One use of CNTX_INIT_PROTS( archname ) emits three prototypes: a plain
// variant, a variant suffixed with BLIS_REF_SUFFIX, and a variant suffixed
// with BLIS_IND_SUFFIX that additionally takes an ind_t method argument.
#define CNTX_INIT_PROTS( archname ) \
\
void PASTEMAC(cntx_init_,archname) \
     ( \
       cntx_t* cntx \
     ); \
void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \
     ( \
       cntx_t* cntx \
     ); \
void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \
     ( \
       ind_t   method, \
       cntx_t* cntx \
     );


#endif

// end bli_arch_config_pre.h
// begin bli_arch_config.h


#ifndef BLIS_ARCH_CONFIG_H
#define BLIS_ARCH_CONFIG_H

//
// -- Context initialization prototypes ----------------------------------------
//

// Each prototype set below is emitted only when the matching BLIS_CONFIG_*
// macro is defined.

// -- Intel64 architectures --

#ifdef BLIS_CONFIG_SKX
CNTX_INIT_PROTS( skx )
#endif
#ifdef BLIS_CONFIG_KNL
CNTX_INIT_PROTS( knl )
#endif
#ifdef BLIS_CONFIG_KNC
CNTX_INIT_PROTS( knc )
#endif
#ifdef BLIS_CONFIG_HASWELL
CNTX_INIT_PROTS( haswell )
#endif
#ifdef BLIS_CONFIG_SANDYBRIDGE
CNTX_INIT_PROTS( sandybridge )
#endif
#ifdef BLIS_CONFIG_PENRYN
CNTX_INIT_PROTS( penryn )
#endif

// -- AMD64 architectures --

#ifdef BLIS_CONFIG_ZEN3
CNTX_INIT_PROTS( zen3 )
#endif
#ifdef BLIS_CONFIG_ZEN2
CNTX_INIT_PROTS( zen2 )
#endif
#ifdef BLIS_CONFIG_ZEN
CNTX_INIT_PROTS( zen )
#endif
#ifdef BLIS_CONFIG_EXCAVATOR
CNTX_INIT_PROTS( excavator )
#endif
#ifdef BLIS_CONFIG_STEAMROLLER
CNTX_INIT_PROTS( steamroller )
#endif
#ifdef BLIS_CONFIG_PILEDRIVER
CNTX_INIT_PROTS( piledriver )
#endif
#ifdef BLIS_CONFIG_BULLDOZER
CNTX_INIT_PROTS( bulldozer )
#endif

// -- ARM architectures --

#ifdef BLIS_CONFIG_ARMSVE
CNTX_INIT_PROTS( armsve )
#endif
#ifdef BLIS_CONFIG_A64FX
CNTX_INIT_PROTS( a64fx )
#endif
#ifdef BLIS_CONFIG_FIRESTORM
CNTX_INIT_PROTS( firestorm )
#endif
#ifdef BLIS_CONFIG_THUNDERX2
CNTX_INIT_PROTS( thunderx2 )
#endif
#ifdef BLIS_CONFIG_CORTEXA57
CNTX_INIT_PROTS( cortexa57 )
#endif
#ifdef BLIS_CONFIG_CORTEXA53
CNTX_INIT_PROTS( cortexa53 )
#endif
#ifdef BLIS_CONFIG_CORTEXA15
CNTX_INIT_PROTS( cortexa15 )
#endif
#ifdef BLIS_CONFIG_CORTEXA9
CNTX_INIT_PROTS( cortexa9 )
#endif

// -- IBM Power --

#ifdef BLIS_CONFIG_POWER10
CNTX_INIT_PROTS( power10 )
#endif
#ifdef BLIS_CONFIG_POWER9
CNTX_INIT_PROTS( power9 )
#endif
#ifdef BLIS_CONFIG_POWER7
CNTX_INIT_PROTS( power7 )
#endif

// -- IBM BG/Q --

#ifdef BLIS_CONFIG_BGQ
CNTX_INIT_PROTS( bgq )
#endif

// -- Generic --

#ifdef BLIS_CONFIG_GENERIC
CNTX_INIT_PROTS( generic )
#endif


//
// -- Architecture family-specific headers -------------------------------------
//

// NOTE(review): the "// skipped" marker on the #include lines below
// presumably means the header was left as an #include rather than expanded
// inline when this monolithic header was generated -- confirm against the
// generator script.

// -- x86_64 families --

#ifdef BLIS_FAMILY_INTEL64
#include "bli_family_intel64.h" // skipped
#endif
#ifdef BLIS_FAMILY_AMD64
#include "bli_family_amd64.h" // skipped
#endif
#ifdef BLIS_FAMILY_AMD64_LEGACY
#include "bli_family_amd64_legacy.h" // skipped
#endif
#ifdef BLIS_FAMILY_X86_64
#include "bli_family_x86_64.h" // skipped
#endif

#ifdef BLIS_FAMILY_X86_64_NO_SKX
#include "bli_family_x86_64_no_skx.h" // skipped
#endif

#ifdef BLIS_FAMILY_X86_64_NO_ZEN2
#include "bli_family_x86_64_no_zen2.h" // skipped
#endif

#ifdef BLIS_FAMILY_X86_64_NO_ZEN3
#include "bli_family_x86_64_no_zen3.h" // skipped
#endif

// -- Intel64 architectures --
#ifdef BLIS_FAMILY_SKX
#include "bli_family_skx.h" // skipped
#endif
#ifdef BLIS_FAMILY_KNL
#include "bli_family_knl.h" // skipped
#endif
#ifdef BLIS_FAMILY_KNC
#include "bli_family_knc.h" // skipped
#endif
#ifdef BLIS_FAMILY_HASWELL
#include "bli_family_haswell.h" // skipped
#endif
#ifdef BLIS_FAMILY_SANDYBRIDGE
#include "bli_family_sandybridge.h" // skipped
#endif
#ifdef BLIS_FAMILY_PENRYN
#include "bli_family_penryn.h" // skipped
#endif

// -- AMD64 architectures --

#ifdef BLIS_FAMILY_ZEN3
#include "bli_family_zen3.h" // skipped
#endif
#ifdef BLIS_FAMILY_ZEN2
#include "bli_family_zen2.h" // skipped
#endif
#ifdef BLIS_FAMILY_ZEN
#include "bli_family_zen.h" // skipped
#endif
#ifdef BLIS_FAMILY_EXCAVATOR
#include "bli_family_excavator.h" // skipped
#endif
#ifdef BLIS_FAMILY_STEAMROLLER
#include "bli_family_steamroller.h" // skipped
#endif
#ifdef BLIS_FAMILY_PILEDRIVER
#include "bli_family_piledriver.h" // skipped
#endif
#ifdef BLIS_FAMILY_BULLDOZER
#include "bli_family_bulldozer.h" // skipped
#endif

// -- ARM families --
#ifdef BLIS_FAMILY_ARM64
#include "bli_family_arm64.h" // skipped
#endif
#ifdef BLIS_FAMILY_ARM32
#include "bli_family_arm32.h" // skipped
#endif

// -- ARM architectures --

#ifdef BLIS_FAMILY_ARMSVE
#include "bli_family_armsve.h" // skipped
#endif
#ifdef BLIS_FAMILY_A64FX
#include "bli_family_a64fx.h" // skipped
#endif
#ifdef BLIS_FAMILY_FIRESTORM
#include "bli_family_firestorm.h" // skipped
#endif
#ifdef BLIS_FAMILY_THUNDERX2
#include "bli_family_thunderx2.h" // skipped
#endif
#ifdef BLIS_FAMILY_CORTEXA57
#include "bli_family_cortexa57.h" // skipped
#endif
#ifdef BLIS_FAMILY_CORTEXA53
#include "bli_family_cortexa53.h" // skipped
#endif
#ifdef BLIS_FAMILY_CORTEXA15
#include "bli_family_cortexa15.h" // skipped
#endif
#ifdef BLIS_FAMILY_CORTEXA9
#include "bli_family_cortexa9.h" // skipped
#endif

// -- IBM Power --

#ifdef BLIS_FAMILY_POWER10
#include "bli_family_power10.h" // skipped
#endif
#ifdef BLIS_FAMILY_POWER9
#include "bli_family_power9.h" // skipped
#endif
#ifdef BLIS_FAMILY_POWER7
#include "bli_family_power7.h" // skipped
#endif

// -- IBM BG/Q --

#ifdef BLIS_FAMILY_BGQ
#include "bli_family_bgq.h" // skipped
#endif

// -- Generic --

// The generic family header is inlined here; every line of it is commented
// out, so this branch contributes no definitions.
#ifdef BLIS_FAMILY_GENERIC
// begin bli_family_generic.h


//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H




//#endif

// end bli_family_generic.h
#endif


//
// -- kernel set prototypes ----------------------------------------------------
//

// -- Intel64 architectures --
#ifdef BLIS_KERNELS_SKX
#include "bli_kernels_skx.h" // skipped
#endif
#ifdef BLIS_KERNELS_KNL
#include "bli_kernels_knl.h" // skipped
#endif
#ifdef BLIS_KERNELS_KNC
#include "bli_kernels_knc.h" // skipped
#endif
#ifdef BLIS_KERNELS_HASWELL
#include "bli_kernels_haswell.h" // skipped
#endif
#ifdef BLIS_KERNELS_SANDYBRIDGE
#include "bli_kernels_sandybridge.h" // skipped
#endif
#ifdef BLIS_KERNELS_PENRYN
#include "bli_kernels_penryn.h" // skipped
#endif

// -- AMD64 architectures --

#ifdef BLIS_KERNELS_ZEN2
#include "bli_kernels_zen2.h" // skipped
#endif
#ifdef BLIS_KERNELS_ZEN
#include "bli_kernels_zen.h" // skipped
#endif
//#ifdef BLIS_KERNELS_EXCAVATOR
//#include "bli_kernels_excavator.h"
//#endif
//#ifdef BLIS_KERNELS_STEAMROLLER
//#include "bli_kernels_steamroller.h"
//#endif
#ifdef BLIS_KERNELS_PILEDRIVER
#include "bli_kernels_piledriver.h" // skipped
#endif
#ifdef BLIS_KERNELS_BULLDOZER
#include "bli_kernels_bulldozer.h" // skipped
#endif

// -- ARM architectures --

#ifdef BLIS_KERNELS_ARMSVE
#include "bli_kernels_armsve.h" // skipped
#endif
#ifdef BLIS_KERNELS_ARMV8A
#include "bli_kernels_armv8a.h" // skipped
#endif
#ifdef BLIS_KERNELS_ARMV7A
#include "bli_kernels_armv7a.h" // skipped
#endif

// -- IBM Power --

#ifdef BLIS_KERNELS_POWER10
#include "bli_kernels_power10.h" // skipped
#endif
#ifdef BLIS_KERNELS_POWER9
#include "bli_kernels_power9.h" // skipped
#endif
#ifdef BLIS_KERNELS_POWER7
#include "bli_kernels_power7.h" // skipped
#endif

// -- IBM BG/Q --

#ifdef BLIS_KERNELS_BGQ
#include "bli_kernels_bgq.h" // skipped
#endif



#endif

// end bli_arch_config.h
// begin bli_kernel_macro_defs.h


#ifndef BLIS_KERNEL_MACRO_DEFS_H
#define BLIS_KERNEL_MACRO_DEFS_H


// -- Define default threading parameters --------------------------------------

// -- Conventional (large code path) values --

// These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n
// dimensions for the purposes of factorizing the total number of threads into
// ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these
// macros are used.
// (Both default to 1 unless defined before this point.)
#ifndef BLIS_THREAD_RATIO_M
#define BLIS_THREAD_RATIO_M     1
#endif

#ifndef BLIS_THREAD_RATIO_N
#define BLIS_THREAD_RATIO_N     1
#endif

// These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of
// parallelism allowed when performing automatic factorization. See bli_rntm.c
// to see how these macros are used.
#ifndef BLIS_THREAD_MAX_IR
#define BLIS_THREAD_MAX_IR 1
#endif

#ifndef BLIS_THREAD_MAX_JR
#define BLIS_THREAD_MAX_JR 4
#endif

#if 0
// -- Skinny/small possibly-unpacked (sup code path) values --

#ifndef BLIS_THREAD_SUP_RATIO_M
#define BLIS_THREAD_SUP_RATIO_M 1
#endif

#ifndef BLIS_THREAD_SUP_RATIO_N
#define BLIS_THREAD_SUP_RATIO_N 2
#endif

#ifndef BLIS_THREAD_SUP_MAX_IR
#define BLIS_THREAD_SUP_MAX_IR 1
#endif

#ifndef BLIS_THREAD_SUP_MAX_JR
#define BLIS_THREAD_SUP_MAX_JR 8
#endif
#endif


// -- Memory allocation --------------------------------------------------------

// hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with
// libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND
// was explicitly defined.
#ifdef BLIS_DISABLE_MEMKIND
#undef BLIS_ENABLE_MEMKIND
#endif

// NOTE(review): the header name was missing from this directive; it is
// restored here from the comment above, which names hbwmalloc.h as the
// provider of hbw_malloc()/hbw_free(). The directive is only evaluated when
// BLIS_ENABLE_MEMKIND is defined.
#ifdef BLIS_ENABLE_MEMKIND
#include <hbwmalloc.h> // skipped
#endif

// Memory allocation functions. These macros define the three types of
// malloc()-style functions, and their free() counterparts: one for each
// type of memory to be allocated.
// NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING
// THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc()
// and free():
//
//   void* malloc( size_t size );
//   void  free( void* p );
//

// This allocation function is called to allocate memory for blocks within
// BLIS's internal memory pools.
#ifndef BLIS_MALLOC_POOL
// If use of libmemkind was enabled at configure-time, the default
// memory allocation function for memory pools should be hbw_malloc()
// instead of malloc().
#ifdef BLIS_ENABLE_MEMKIND
#define BLIS_MALLOC_POOL hbw_malloc
#else
#define BLIS_MALLOC_POOL malloc
#endif
#endif

#ifndef BLIS_FREE_POOL
// If use of libmemkind was enabled at configure-time, the default
// memory deallocation function for memory pools should be hbw_free()
// instead of free().
#ifdef BLIS_ENABLE_MEMKIND
#define BLIS_FREE_POOL hbw_free
#else
#define BLIS_FREE_POOL free
#endif
#endif

// This allocation function is called to allocate memory for internally-
// used objects and structures, such as control tree nodes.
#ifndef BLIS_MALLOC_INTL
#define BLIS_MALLOC_INTL malloc
#endif

#ifndef BLIS_FREE_INTL
#define BLIS_FREE_INTL free
#endif

// This allocation function is called to allocate memory for objects
// created by user-level API functions, such as bli_obj_create().
#ifndef BLIS_MALLOC_USER
#define BLIS_MALLOC_USER malloc
#endif

#ifndef BLIS_FREE_USER
#define BLIS_FREE_USER free
#endif


// -- Other system-related definitions -----------------------------------------

// Size of a virtual memory page. This is used to align blocks within the
// memory pools.
#ifndef BLIS_PAGE_SIZE
#define BLIS_PAGE_SIZE 4096
#endif

// The maximum number of named SIMD vector registers available for use.
// When configuring with umbrella configuration families, this should be
// set to the maximum number of registers across all sub-configurations in
// the family.
#ifndef BLIS_SIMD_MAX_NUM_REGISTERS
#define BLIS_SIMD_MAX_NUM_REGISTERS 32
#endif

// The maximum size (in bytes) of each SIMD vector.
// When configuring with umbrella configuration families, this should be
// set to the maximum SIMD size across all sub-configurations in the family.
#ifndef BLIS_SIMD_MAX_SIZE
#define BLIS_SIMD_MAX_SIZE 64
#endif

// Alignment size (in bytes) needed by the instruction set for aligned
// SIMD/vector instructions.
#ifndef BLIS_SIMD_ALIGN_SIZE
#define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_MAX_SIZE
#endif

// The maximum size in bytes of local stack buffers within macro-kernel
// functions. These buffers are usually used to store a temporary copy
// of a single microtile. The reason we multiply by 2 is to handle induced
// methods, where we use real domain register blocksizes in units of
// complex elements. Specifically, the macro-kernels will need this larger
// micro-tile footprint, even though the virtual micro-kernels will only
// ever be writing to half (real or imaginary part) at a time.
#ifndef BLIS_STACK_BUF_MAX_SIZE
#define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_MAX_NUM_REGISTERS * \
                                  BLIS_SIMD_MAX_SIZE * 2 )
#endif

// Alignment size used to align local stack buffers within macro-kernel
// functions.
#ifndef BLIS_STACK_BUF_ALIGN_SIZE
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when allocating memory via BLIS_MALLOC_USER.
// To disable heap alignment, set this to 1.
#ifndef BLIS_HEAP_ADDR_ALIGN_SIZE
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when sizing leading dimensions of memory allocated
// via BLIS_MALLOC_USER.
#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment sizes used when allocating blocks to the internal memory
// pool, via BLIS_MALLOC_POOL.
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A
#define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B
#define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C
#define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN
#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE
#endif

// Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*.
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A #define BLIS_POOL_ADDR_OFFSET_SIZE_A 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B #define BLIS_POOL_ADDR_OFFSET_SIZE_B 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C #define BLIS_POOL_ADDR_OFFSET_SIZE_C 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif #endif // end bli_kernel_macro_defs.h // -- Base operation prototypes -- // begin bli_init.h BLIS_EXPORT_BLIS void bli_init( void ); BLIS_EXPORT_BLIS void bli_finalize( void ); void bli_init_auto( void ); void bli_finalize_auto( void ); void bli_init_apis( void ); void bli_finalize_apis( void ); void bli_init_once( void ); void bli_finalize_once( void ); // end bli_init.h // begin bli_malloc.h // Typedef function pointer types for malloc() and free() substitutes. //typedef void* (*malloc_ft) ( size_t size ); //typedef void (*free_ft) ( void* p ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size ); BLIS_EXPORT_BLIS void bli_free_pool( void* p ); #endif void* bli_malloc_intl( size_t size, err_t* r_val ); void* bli_calloc_intl( size_t size, err_t* r_val ); void bli_free_intl( void* p ); BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val ); BLIS_EXPORT_BLIS void bli_free_user( void* p ); // ----------------------------------------------------------------------------- void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val ); void bli_ffree_align( free_ft f, void* p ); void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val ); void bli_ffree_noalign( free_ft f, void* p ); void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size ); void bli_fmalloc_post_check( void* p ); // end bli_malloc.h // begin bli_const.h void bli_const_init( void ); void bli_const_finalize( void ); // end bli_const.h // begin bli_obj.h // begin bli_obj_check.h void bli_obj_create_check( num_t dt, dim_t m, dim_t 
n, inc_t rs, inc_t cs, obj_t* obj ); void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj ); void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_create_scalar_check( num_t dt, obj_t* obj ); void bli_obj_free_check( obj_t* obj ); void bli_obj_create_const_check( double value, obj_t* obj ); void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b ); void bli_dt_size_check( num_t dt ); void bli_dt_string_check( num_t dt ); void bli_dt_union_check( num_t dt1, num_t dt2 ); void bli_obj_print_check( char* label, obj_t* obj ); // end bli_obj_check.h BLIS_EXPORT_BLIS void bli_obj_create ( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer ( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_without_buffer ( num_t dt, dim_t m, dim_t n, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_alloc_buffer ( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_attach_buffer ( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1 ( num_t dt, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer ( num_t dt, void* p, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_conf_to ( obj_t* s, obj_t* d ); BLIS_EXPORT_BLIS void bli_obj_free ( obj_t* obj ); void bli_adjust_strides ( dim_t m, dim_t n, siz_t elem_size, inc_t* rs, inc_t* cs, inc_t* is ); BLIS_EXPORT_BLIS siz_t bli_dt_size ( num_t dt ); BLIS_EXPORT_BLIS char* bli_dt_string ( num_t dt ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult ( dim_t dim, dim_t dim_mult ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size ( dim_t dim, siz_t elem_size, siz_t align_size ); BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size ( void* p, size_t align_size ); BLIS_EXPORT_BLIS void bli_obj_print ( char* label, 
obj_t* obj ); // end bli_obj.h // begin bli_obj_scalar.h BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached ( num_t dt, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of ( num_t dt, conj_t conj, obj_t* alpha, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_detach ( obj_t* a, obj_t* alpha ); BLIS_EXPORT_BLIS void bli_obj_scalar_attach ( conj_t conj, obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to ( num_t dt, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar ( obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_reset ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_equals ( obj_t* a, obj_t* beta ); // end bli_obj_scalar.h // begin bli_blksz.h // blksz_t query BLIS_INLINE dim_t bli_blksz_get_def ( num_t dt, blksz_t* b ) { return b->v[ dt ]; } BLIS_INLINE dim_t bli_blksz_get_max ( num_t dt, blksz_t* b ) { return b->e[ dt ]; } // blksz_t modification BLIS_INLINE void bli_blksz_set_def ( dim_t val, num_t dt, blksz_t* b ) { b->v[ dt ] = val; } BLIS_INLINE void bli_blksz_set_max ( dim_t val, num_t dt, blksz_t* b ) { b->e[ dt ] = val; } BLIS_INLINE void bli_blksz_copy ( blksz_t* b_src, blksz_t* b_dst ) { *b_dst = *b_src; } BLIS_INLINE void bli_blksz_copy_if_pos ( blksz_t* b_src, blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that // we can skip the ones that are non-positive. 
const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); } BLIS_INLINE void bli_blksz_copy_def_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); bli_blksz_set_def( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_max_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); bli_blksz_set_max( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_scale_def ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_def( dt, b ); bli_blksz_set_def( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_max( dt, b ); bli_blksz_set_max( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_def_max ( 
dim_t num, dim_t den, num_t dt, blksz_t* b ) { bli_blksz_scale_def( num, den, dt, b ); bli_blksz_scale_max( num, den, dt, b ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS blksz_t* bli_blksz_create ( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_easy ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z ); BLIS_EXPORT_BLIS void bli_blksz_free ( blksz_t* b ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); #endif void bli_blksz_reduce_def_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); void bli_blksz_reduce_max_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); // ----------------------------------------------------------------------------- dim_t bli_determine_blocksize ( dir_t direct, dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_b ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); dim_t bli_determine_blocksize_b_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); // end bli_blksz.h // begin bli_func.h // ----------------------------------------------------------------------------- 
// func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); // end bli_func.h // begin bli_mbool.h // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // ----------------------------------------------------------------------------- mbool_t* bli_mbool_create ( bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_init ( mbool_t* b, bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_free( mbool_t* b ); // end bli_mbool.h // begin bli_cntx.h #ifndef BLIS_CNTX_H #define BLIS_CNTX_H // Context object type (defined in bli_type_defs.h) // ----------------------------------------------------------------------------- // // -- cntx_t query (fields only) ----------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx ) { return cntx->blkszs; } BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) { return cntx->bmults; } 
BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) { return cntx->l3_vir_ukrs; } BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs; } BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs_prefs; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) { return cntx->l3_sup_thresh; } BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) { return cntx->l3_sup_blkszs; } BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) { return cntx->l3_sup_kers; } BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) { return cntx->l3_sup_kers_prefs; } BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) { return cntx->l1f_kers; } BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) { return cntx->l1v_kers; } BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) { return cntx->packm_kers; } BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) { return cntx->unpackm_kers; } BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) { return cntx->method; } // ----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. 
return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // 
----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. 
return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only 
index to the requested packm func_t if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* funcs = bli_cntx_packm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the packm func_t (and then extract the // datatype-specific function pointer) if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested unpackm func_t if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the unpackm func_t (and then extract the // datatype-specific function pointer) if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. 
return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. // NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. 
// NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } #if 0 // NOTE: These static functions aren't needed yet. 
// The two functions below sit inside a preprocessor-disabled (#if 0) region
// that is closed by the #endif following them, so they are not compiled.

// Return TRUE when obj is row-stored and the sup kernel for stor_id prefers
// rows, or obj is column-stored and the kernel prefers columns; FALSE
// otherwise. The datatype is taken from bli_obj_dt( obj ).
BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx )
{
	const num_t dt = bli_obj_dt( obj );
	const bool ukr_prefers_rows = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
	const bool ukr_prefers_cols = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx );
	bool r_val = FALSE;

	if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE;
	else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE;

	return r_val;
}

// Logical negation of bli_cntx_l3_sup_ker_prefers_storage_of().
BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx )
{
	return ( bool ) !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx );
}
#endif

// -----------------------------------------------------------------------------

//
// -- cntx_t modification (complex) --------------------------------------------
//

// NOTE: The framework does not use any of the following functions. We provide
// them in order to facilitate creating/modifying custom contexts.
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif // end bli_rntm.h // begin bli_gks.h #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* 
bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif // end bli_gks.h // begin bli_ind.h #ifndef BLIS_IND_H #define BLIS_IND_H // level-3 induced method management // begin bli_l3_ind.h #ifndef BLIS_L3_IND_H #define BLIS_L3_IND_H // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- //bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ); void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ); void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ); bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ); #endif // end bli_l3_ind.h void bli_ind_init( void ); void bli_ind_finalize( void ); BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t 
method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); char* bli_ind_get_impl_string( ind_t method ); num_t bli_ind_map_cdt_to_index( num_t dt ); #endif // end bli_ind.h // begin bli_pba.h #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H // Packing block allocator (formerly memory broker) // pba init //BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ // bli_pthread_mutex_init( &(pba->mutex), NULL ); //} //BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ // bli_pthread_mutex_destroy( &(pba->mutex) ); //} // pba query BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { return &(pba->pools[ pool_index ]); } BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { return pba->align_size; } BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { return pba->malloc_fp; } BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { return pba->free_fp; } // pba modification BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { pba->align_size = align_size; } BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { pba->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { pba->free_fp = free_fp; } // pba action BLIS_INLINE void bli_pba_lock( pba_t* pba ) { bli_pthread_mutex_lock( &(pba->mutex) ); } BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( cntx_t* cntx ); void bli_pba_finalize ( void ); 
void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif // end bli_pba.h // begin bli_pool.h #ifndef BLIS_POOL_H #define BLIS_POOL_H // -- Pool block type -- // -- Pool type -- // Pool block query BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) { return pblk->buf; } BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) { return pblk->block_size; } // Pool block modification BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk ) { pblk->buf = buf; } BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) { pblk->block_size = block_size; } // // -- pool block initialization ------------------------------------------------ // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the pblk_t type definition. An alternative to the initializer is // calling bli_pblk_clear() at runtime. 
#define BLIS_PBLK_INITIALIZER \ { \ .buf = NULL, \ .block_size = 0, \ } \ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); bli_pblk_set_block_size( 0, pblk ); } // Pool entry query BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) { return pool->block_size; } BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) { return pool->align_size; } BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) { return pool->offset_size; } BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) { return pool->malloc_fp; } BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) { return pool->free_fp; } BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); } // Pool entry modification BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ { pool->block_size = block_size; } BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ { pool->align_size = align_size; } BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ { pool->offset_size = offset_size; } BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ { pool->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* 
pool ) \ { pool->free_fp = free_fp; } BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } // ----------------------------------------------------------------------------- void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ); void bli_pool_finalize ( pool_t* restrict pool ); void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ); void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ); void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ); void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ); void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ); void bli_pool_print ( pool_t* restrict pool ); void bli_pblk_print ( pblk_t* restrict pblk ); #endif // end bli_pool.h // begin bli_array.h #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // 
----------------------------------------------------------------------------- void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ); void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ); void bli_array_finalize ( array_t* restrict array ); void* bli_array_elem ( const siz_t index, array_t* restrict array ); void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ); #endif // end bli_array.h // begin bli_apool.h #ifndef BLIS_APOOL_H #define BLIS_APOOL_H // -- Locked pool-of-arrays type -- // apool entry query BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool ) { return &(apool->pool); } BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) { return &(apool->mutex); } BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) { return pool->def_array_len; } BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) { pool_t* restrict pool = bli_apool_pool( apool ); return bli_pool_is_exhausted( pool ); } // apool action BLIS_INLINE void bli_apool_lock( apool_t* apool ) { bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); } BLIS_INLINE void bli_apool_unlock( apool_t* apool ) { bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); } // apool entry modification BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ { pool->def_array_len = def_array_len; } // ----------------------------------------------------------------------------- void bli_apool_init ( apool_t* restrict apool ); void bli_apool_finalize ( apool_t* restrict apool ); array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ); void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ); pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ); void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool ); void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ); void bli_apool_free_block 
( array_t* restrict array ); #endif // end bli_apool.h // begin bli_sba.h #ifndef BLIS_SBA_H #define BLIS_SBA_H apool_t* bli_sba_query( void ); // ----------------------------------------------------------------------------- void bli_sba_init( void ); void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( const siz_t n_threads ); void bli_sba_checkin_array ( array_t* restrict array ); void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm ); void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size ); void bli_sba_release ( rntm_t* restrict rntm, void* restrict block ); #endif // end bli_sba.h // begin bli_memsys.h #ifndef BLIS_MEMSYS_H #define BLIS_MEMSYS_H // ----------------------------------------------------------------------------- void bli_memsys_init( void ); void bli_memsys_finalize( void ); #endif // end bli_memsys.h // begin bli_mem.h #ifndef BLIS_MEM_H #define BLIS_MEM_H // mem_t object type (defined in bli_type_defs.h) // // -- mem_t query -------------------------------------------------------------- // BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) { return &(mem->pblk); } BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) { return bli_pblk_buf( bli_mem_pblk( mem ) ); } BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) { return mem->buf_type; } BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) { return mem->pool; } BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) { return mem->size; } BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); } // // -- mem_t modification ------------------------------------------------------- // BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem ) { mem->pblk = *pblk; } BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem ) { bli_pblk_set_buf( buf, &(mem->pblk) ); } BLIS_INLINE void bli_mem_set_buf_type( packbuf_t 
buf_type, mem_t* mem ) { mem->buf_type = buf_type; } BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem ) { mem->pool = pool; } BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; } // // -- mem_t initialization ----------------------------------------------------- // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the mem_t type definition. An alternative to the initializer is // calling bli_mem_clear() at runtime. #define BLIS_MEM_INITIALIZER \ { \ .pblk = BLIS_PBLK_INITIALIZER, \ .buf_type = -1, \ .pool = NULL, \ .size = 0, \ } \ BLIS_INLINE void bli_mem_clear( mem_t* mem ) { bli_mem_set_buffer( NULL, mem ); #ifdef __cplusplus const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE; // When using C++, which is strongly typed, we avoid use of -1 as a // packbuf_t value since it will result in a compile-time error. bli_mem_set_buf_type( pb, mem ); #else bli_mem_set_buf_type( ( packbuf_t )-1, mem ); #endif bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); } #endif // end bli_mem.h // begin bli_part.h // begin bli_part_check.h void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); // end bli_part_check.h // -- Matrix partitioning ------------------------------------------------------ BLIS_EXPORT_BLIS void bli_acquire_mpart ( dim_t i, dim_t j, dim_t m, dim_t n, obj_t* obj, obj_t* sub_obj ); #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) GENPROT( acquire_mpart_b2t ) GENPROT( acquire_mpart_l2r ) GENPROT( acquire_mpart_r2l ) GENPROT( acquire_mpart_tl2br ) GENPROT( 
acquire_mpart_br2tl ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ dir_t direct, \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_mdim ) GENPROT( acquire_mpart_ndim ) GENPROT( acquire_mpart_mndim ) // -- Vector partitioning ------------------------------------------------------ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_vpart_f2b ) GENPROT( acquire_vpart_b2f ) // -- Scalar acquisition ------------------------------------------------------- BLIS_EXPORT_BLIS void bli_acquire_mij ( dim_t i, dim_t j, obj_t* obj, obj_t* sub_obj ); BLIS_EXPORT_BLIS void bli_acquire_vi ( dim_t i, obj_t* obj, obj_t* sub_obj ); // end bli_part.h // begin bli_prune.h void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ); // end bli_prune.h // begin bli_query.h BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a ); // end bli_query.h // begin bli_auxinfo.h #ifndef BLIS_AUXINFO_MACRO_DEFS_H #define BLIS_AUXINFO_MACRO_DEFS_H // auxinfo_t field query BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) { return ai->schema_a; } BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) { return ai->schema_b; } BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) { return ai->a_next; } BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) { return ai->b_next; } BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) { return ai->is_a; } BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) { return ai->is_b; } BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) { return ai->ps_a; } BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) { return ai->ps_b; } BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) { 
return ai->ukr; } BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) { return ai->params; } // auxinfo_t field modification BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai ) { ai->schema_a = schema; } BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) { ai->schema_b = schema; } BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) { ai->a_next = p; } BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) { ai->b_next = p; } BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; } BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai ) { ai->is_a = is; } BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) { ai->is_b = is; } BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai ) { ai->ps_a = ps; } BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) { ai->ps_b = ps; } BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { ai->ukr = ukr; } BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) { ai->params = params; } #endif // end bli_auxinfo.h // begin bli_param_map.h // --- BLIS to BLAS/LAPACK mappings -------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval ); // --- BLAS/LAPACK to BLIS mappings -------------------------------------------- // NOTE: These static functions were converted from regular functions in order // to reduce function call overhead within the BLAS compatibility layer. 
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( 
subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); err_t bli_check_valid_cntl( void* cntl ); err_t bli_check_packm_schema_on_unpack( obj_t* a ); err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); // end bli_check.h // begin bli_error.h BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); void bli_print_msg( char* str, char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); char* bli_error_string_for_code( gint_t code ); // end bli_error.h // begin bli_f2c.h // f2c.h -- Standard Fortran to C header file // barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." // - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) #ifndef BLIS_F2C_H #define BLIS_F2C_H typedef f77_int bla_integer; typedef f77_char bla_character; //typedef char *address; //typedef short int shortint; typedef float bla_real; typedef double bla_double; typedef scomplex bla_scomplex; typedef dcomplex bla_dcomplex; typedef f77_int bla_logical; //typedef short int shortlogical; //typedef char logical1; //typedef char integer1; #ifdef INTEGER_STAR_8 // Adjust for integer*8. 
typedef long long longint; // system-dependent typedef unsigned long long ulongint; // system-dependent #define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) #define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) #endif #ifndef TRUE_ #define TRUE_ (1) #endif #ifndef FALSE_ #define FALSE_ (0) #endif // Extern is for use with -E #ifndef Extern #define Extern extern #endif // I/O stuff #ifdef f2c_i2 // for -i2 //typedef short flag; //typedef short ftnlen; typedef bla_integer ftnlen; //typedef short ftnint; #else //typedef long int flag; //typedef long int ftnlen; typedef bla_integer ftnlen; //typedef long int ftnint; #endif #ifndef VOID #define VOID void #endif #ifndef f2c_abs #define f2c_abs(x) ((x) >= 0 ? (x) : -(x)) #endif #ifndef f2c_dabs #define f2c_dabs(x) (doublereal)f2c_abs(x) #endif #ifndef f2c_min #define f2c_min(a,b) ((a) <= (b) ? (a) : (b)) #endif #ifndef f2c_max #define f2c_max(a,b) ((a) >= (b) ? (a) : (b)) #endif #ifndef f2c_dmin #define f2c_dmin(a,b) (doublereal)f2c_min(a,b) #endif #ifndef f2c_dmax #define f2c_dmax(a,b) (doublereal)f2c_max(a,b) #endif #ifndef bit_test #define bit_test(a,b) ((a) >> (b) & 1) #endif #ifndef bit_clear #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) #endif #ifndef bit_set #define bit_set(a,b) ((a) | ((uinteger)1 << (b))) #endif // undef any lower-case symbols that your C compiler predefines, e.g.: #ifndef Skip_f2c_Undefs #undef cray #undef gcos #undef mc68010 #undef mc68020 #undef mips #undef pdp11 #undef sgi #undef sparc #undef sun #undef sun2 #undef sun3 #undef sun4 #undef u370 #undef u3b #undef u3b2 #undef u3b5 #undef unix #undef vax #endif #endif // end bli_f2c.h // begin bli_machval.h // begin bli_lsame.h bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len ); // end bli_lsame.h // begin bli_slamch.h bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len ); // end bli_slamch.h // begin bli_dlamch.h bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len ); // end 
bli_dlamch.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v ); // // Prototype BLAS-like interfaces. // #undef GENTPROTR #define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \ ( \ machval_t mval, \ void* v \ ); INSERT_GENTPROTR_BASIC0( machval ) // end bli_machval.h // begin bli_getopt.h typedef struct getopt_s { char* optarg; int optind; int opterr; int optopt; } getopt_t; BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state ); BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ); // end bli_getopt.h // begin bli_opid.h BLIS_INLINE bool bli_opid_is_level3( opid_t opid ) { return ( bool ) ( BLIS_GEMM <= opid && opid <= BLIS_TRSM ); } // end bli_opid.h // begin bli_cntl.h // -- Control tree prototypes -- BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, void* params, cntl_t* sub_node ); BLIS_EXPORT_BLIS void bli_cntl_free_node ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_clear_node ( cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_mark_family ( opid_t family, cntl_t* cntl ); // ----------------------------------------------------------------------------- dim_t bli_cntl_calc_num_threads_in ( rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- // cntl_t query (fields only) BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl ) { return cntl->family; } 
BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl ) { return cntl->bszid; } BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl ) { return cntl->var_func; } BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl ) { return cntl->sub_prenode; } BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl ) { return cntl->sub_node; } BLIS_INLINE void* bli_cntl_params( cntl_t* cntl ) { return cntl->params; } BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl ) { // The first 64 bytes is always the size of the params structure. return *( ( uint64_t* )(cntl->params) ); } BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) { return &(cntl->pack_mem); } // cntl_t query (complex) BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl ) { return ( bool ) ( cntl == NULL ); } BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl ) { return ( bool ) ( bli_cntl_sub_node( cntl ) == NULL ); } BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl ) { return ( bool ) ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); } // cntl_t modification BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl ) { cntl->family = family; } BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl ) { cntl->bszid = bszid; } BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl ) { cntl->var_func = var_func; } BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl ) { cntl->sub_prenode = sub_prenode; } BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl ) { cntl->sub_node = sub_node; } BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl ) { cntl->params = params; } BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) { cntl->pack_mem = *pack_mem; } // end bli_cntl.h // begin bli_env.h #ifndef BLIS_ENV_H #define BLIS_ENV_H gint_t bli_env_get_var( const char* env, gint_t fallback ); //void bli_env_set_var( const char* env, dim_t value ); #endif // end bli_env.h // begin bli_pack.h #ifndef BLIS_PACK_H #define BLIS_PACK_H 
void bli_pack_init( void ); void bli_pack_finalize( void ); BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a ); BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b ); BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a ); BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b ); void bli_pack_init_rntm_from_env( rntm_t* rntm ); #endif // end bli_pack.h // begin bli_info.h // -- General library information ---------------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_version_str( void ); BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void ); // -- General configuration-related -------------------------------------------- BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void ); 
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void ); // -- Kernel implementation-related -------------------------------------------- // -- Level-3 kernel definitions -- BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ); // -- BLIS implementation query (level-3) -------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt ); BLIS_EXPORT_BLIS 
char* bli_info_get_trmm3_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt ); // end bli_info.h // begin bli_arch.h #ifndef BLIS_ARCH_H #define BLIS_ARCH_H BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void ); void bli_arch_set_id_once( void ); void bli_arch_set_id( void ); BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id ); void bli_arch_set_logging( bool dolog ); bool bli_arch_get_logging( void ); void bli_arch_log( char*, ... ); #endif // end bli_arch.h // begin bli_cpuid.h #if 0 // Used only during standalone testing of ARM support. #define FALSE 0 #define TRUE 1 typedef enum { BLIS_ARCH_CORTEXA57 = 10, BLIS_ARCH_CORTEXA15 = 11, BLIS_ARCH_CORTEXA9 = 12, BLIS_ARCH_GENERIC = 13 } arch_t; typedef uint64_t bool; #define bli_abort abort #endif #ifndef BLIS_CPUID_H #define BLIS_CPUID_H arch_t bli_cpuid_query_id( void ); // Intel bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features ); // AMD bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features ); // ARM bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, 
uint32_t features ); bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features ); uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features ); // ----------------------------------------------------------------------------- // // This section of the file was based off of cpuid.hpp from TBLIS [1]. // // [1] https://github.com/devinamatthews/tblis // BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want ) { return ( have & want ) == want; } // ----------------------------------------------------------------------------- #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) // cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393 // for more information why this move was made. 
//#include "cpuid.h" void get_cpu_name( char *cpu_name ); int vpu_count( void ); enum { VENDOR_INTEL = 0, VENDOR_AMD, VENDOR_UNKNOWN }; enum { FEATURE_SSE3 = 0x0001, FEATURE_SSSE3 = 0x0002, FEATURE_SSE41 = 0x0004, FEATURE_SSE42 = 0x0008, FEATURE_AVX = 0x0010, FEATURE_AVX2 = 0x0020, FEATURE_FMA3 = 0x0040, FEATURE_FMA4 = 0x0080, FEATURE_AVX512F = 0x0100, FEATURE_AVX512DQ = 0x0200, FEATURE_AVX512PF = 0x0400, FEATURE_AVX512ER = 0x0800, FEATURE_AVX512CD = 0x1000, FEATURE_AVX512BW = 0x2000, FEATURE_AVX512VL = 0x4000 }; #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ); enum { VENDOR_ARM = 0, VENDOR_UNKNOWN }; enum { MODEL_ARMV7 = 0, MODEL_ARMV8, MODEL_UNKNOWN }; enum { FEATURE_NEON = 0x01, FEATURE_SVE = 0x02 }; #endif #endif // end bli_cpuid.h // begin bli_string.h void bli_string_mkupper( char* s ); // end bli_string.h // begin bli_setgetijm.h BLIS_EXPORT_BLIS err_t bli_setijm ( double ar, double ai, dim_t i, dim_t j, obj_t* b ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs \ ); INSERT_GENTPROT_BASIC0( setijm ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijm ( dim_t i, dim_t j, obj_t* b, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijm ) // end bli_setgetijm.h // begin bli_setgetijv.h BLIS_EXPORT_BLIS err_t bli_setijv ( double ar, double ai, dim_t i, obj_t* x ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ void* restrict x, inc_t incx \ ); INSERT_GENTPROT_BASIC0( 
setijv ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijv ( dim_t i, obj_t* x, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ void* restrict b, inc_t incx, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijv ) // end bli_setgetijv.h // begin bli_setri.h // -- setr --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setrm ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setrv ( obj_t* alpha, obj_t* x ); // -- seti --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setim ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setiv ( obj_t* alpha, obj_t* x ); // end bli_setri.h // begin bli_castm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castm ) INSERT_GENTPROT2_MIXDP0( castm ) // // Prototype object-based _check() function. // void bli_castm_check ( obj_t* a, obj_t* b ); // end bli_castm.h // begin bli_castnzm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castnzm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. 
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( addsc ) INSERT_GENTPROT_BASIC0( divsc ) INSERT_GENTPROT_BASIC0( mulsc ) INSERT_GENTPROT_BASIC0( subsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( invertsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTPROTR_BASIC0( absqsc ) INSERT_GENTPROTR_BASIC0( normfsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( sqrtsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTPROT_BASIC0( getsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( setsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTPROTR_BASIC0( unzipsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype_r* zeta_r, \ ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTPROTR_BASIC0( zipsc ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_igetsc ( dim_t* chi, double* zeta_r, double* zeta_i ); BLIS_EXPORT_BLIS void bli_isetsc ( double zeta_r, double zeta_i, dim_t* chi ); // end bli_l0_tapi.h // begin bli_l0_ft.h // // -- Level-0 function types 
--------------------------------------------------- // // addsc, divsc, subsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( addsc ) INSERT_GENTDEF( divsc ) INSERT_GENTDEF( subsc ) // invertsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTDEF( invertsc ) // mulsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( mulsc ) // absqsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTDEFR( absqsc ) // normfsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* norm \ ); INSERT_GENTDEFR( normfsc ) // sqrtsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( sqrtsc ) // getsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTDEF( getsc ) // setsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTDEF( setsc ) // unzipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTDEFR( unzipsc ) // zipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype_r* zeta_r, \ 
ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTDEFR( zipsc ) // end bli_l0_ft.h // Generate function pointer arrays for tapi functions. // begin bli_l0_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( absqsc ) GENPROT( normfsc ) GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( subsc ) GENPROT( invertsc ) GENPROT( sqrtsc ) GENPROT( unzipsc ) GENPROT( zipsc ) GENPROT( getsc ) GENPROT( setsc ) // end bli_l0_fpa.h // copysc // begin bli_copysc.h // // Prototype object-based interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENFRONT( copysc ) // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \ ( \ conj_t conjchi, \ void* chi, \ void* psi \ ); INSERT_GENTPROT2_BASIC0( copysc ) INSERT_GENTPROT2_MIX_D0( copysc ) INSERT_GENTPROT2_MIX_P0( copysc ) // end bli_copysc.h // end bli_l0.h // -- Level-1v operations -- // begin bli_l1v.h // begin bli_l1v_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h"
// begin bli_l1v_ft_ker.h

#ifndef BLIS_L1V_FT_KER_H
#define BLIS_L1V_FT_KER_H

//
// -- Level-1v kernel function types -------------------------------------------
//

// Each GENTDEF below declares a function-pointer typedef whose name is built
// by PASTECH3(ch,opname,_ker,tsuf). Unlike the typed-API typedefs, every
// pointer operand here is restrict-qualified and every signature ends with
// an explicit cntx_t* cntx parameter. The whole section is wrapped in the
// BLIS_L1V_FT_KER_H include guard.

// addv, copyv, subv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( addv )
INSERT_GENTDEF( copyv )
INSERT_GENTDEF( subv )

// amaxv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       dim_t* restrict index, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( amaxv )

// axpbyv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpbyv )

// axpyv, scal2v

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyv )
INSERT_GENTDEF( scal2v )

// dotv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotv )

// dotxv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict beta, \
       ctype* restrict rho, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxv )

// invertv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( invertv )

// scalv, setv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjalpha, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( scalv )
INSERT_GENTDEF( setv )

// swapv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( swapv )

// xpbyv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( xpbyv )

#endif

// end bli_l1v_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The expansion begins with a comma so that it can be placed directly after
// the last regular parameter of a prototype without an extra separator.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_l1v_oapi.h

//
// Prototype object-based interfaces.
//

// Each GENTPROT( op ) below declares an exported function named
// PASTEMAC(op,EX_SUF) with obj_t* operands followed by BLIS_OAPI_EX_PARAMS.
// Both EX_SUF and BLIS_OAPI_EX_PARAMS are whatever the most recent
// bli_oapi_ex.h / bli_oapi_ba.h section defined, so the same template text
// yields different prototypes depending on which section preceded it.

// Operands: ( x, y )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )

// Operands: ( x, index )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* index \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( amaxv )

// Operands: ( alpha, x, beta, y )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpbyv )

// Operands: ( alpha, x, y )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )

// Operands: ( x, y, rho )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotv )

// Operands: ( alpha, x, y, beta, rho )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y, \
       obj_t* beta, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotxv )

// Operands: ( x )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( invertv )

// Operands: ( alpha, x )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( scalv )
GENTPROT( setv )

// Operands: ( x, y )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( swapv )

// Operands: ( x, beta, y )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( xpbyv )

// end bli_l1v_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;

// end bli_oapi_ba.h
// begin bli_l1v_oapi.h

//
// Prototype object-based interfaces.
//

// Second pass over the bli_l1v_oapi.h templates. The template text is the
// same as in the earlier pass; the generated names and parameter lists depend
// on the EX_SUF and BLIS_OAPI_EX_PARAMS values defined by the section that
// immediately precedes this one.

// Operands: ( x, y )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )

// Operands: ( x, index )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* index \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( amaxv )

// Operands: ( alpha, x, beta, y )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpbyv )

// Operands: ( alpha, x, y )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )

// Operands: ( x, y, rho )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotv )

// Operands: ( alpha, x, y, beta, rho )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y, \
       obj_t* beta, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotxv )

// Operands: ( x )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( invertv )

// Operands: ( alpha, x )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( scalv )
GENTPROT( setv )

// Operands: ( x, y )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( swapv )

// Operands: ( x, beta, y )

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( xpbyv )

// end bli_l1v_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The expansion begins with a comma so that it can be placed directly after
// the last regular parameter of a prototype without an extra separator.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1v_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion begins with a comma because the templates below place
// BLIS_TAPI_EX_PARAMS directly after the last regular parameter, with no
// separator of their own.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1d_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Expert pass: EX_SUF is BLIS_TAPI_EX_SUF and every prototype ends with the
// cntx/rntm parameters supplied by BLIS_TAPI_EX_PARAMS.
// INSERT_GENTPROT_BASIC0 / INSERT_GENTPROTR_BASIC0 are defined elsewhere;
// presumably they instantiate GENTPROT / GENTPROTR once per supported
// datatype -- TODO confirm against their definitions.
//

// addd, copyd, subd
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjalpha, \
       doff_t  diagoffx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid
// Uses the GENTPROTR form so that alpha can be typed ctype_r* while x is
// ctype*.
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t   diagoffx, \
       dim_t    m, \
       dim_t    n, \
       ctype_r* alpha, \
       ctype*   x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  beta, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//
// Each GENTDEF below is a function-pointer typedef template whose parameter
// list mirrors the prototype template for the same operations in
// bli_l1d_tapi.h above. The typedef name is pasted from ch, opname, EX_SUF
// and tsuf, so this (expert) pass yields the expert-suffixed type names.
//

// addd, copyd, subd

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjalpha, \
       doff_t  diagoffx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       dim_t    m, \
       dim_t    n, \
       ctype_r* alpha, \
       ctype*   x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  beta, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//
// Basic pass: this copy of bli_l1d_tapi.h directly follows bli_tapi_ba.h,
// where EX_SUF and BLIS_TAPI_EX_PARAMS are defined as empty.
// INSERT_GENTPROT_BASIC0 / INSERT_GENTPROTR_BASIC0 are defined elsewhere;
// presumably they instantiate GENTPROT / GENTPROTR once per supported
// datatype -- TODO confirm against their definitions.
//

// addd, copyd, subd
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjalpha, \
       doff_t  diagoffx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid
// Uses the GENTPROTR form so that alpha can be typed ctype_r* while x is
// ctype*.
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t   diagoffx, \
       dim_t    m, \
       dim_t    n, \
       ctype_r* alpha, \
       ctype*   x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  beta, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//
// Each GENTDEF below is a function-pointer typedef template whose parameter
// list mirrors the prototype template for the same operations in
// bli_l1d_tapi.h above. The typedef name is pasted from ch, opname, EX_SUF
// and tsuf; EX_SUF is empty in this (basic) pass.
//

// addd, copyd, subd

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjalpha, \
       doff_t  diagoffx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       dim_t    m, \
       dim_t    n, \
       ctype_r* alpha, \
       ctype*   x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  beta, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1d_fpa.h

//
// Prototype function pointer query interface.
//
// For each operation this declares one function taking a num_t datatype
// argument. Both the return type name and the function name are built from
// the operation name and BLIS_TAPI_EX_SUF (with _vft and _qfp appended,
// respectively), so only expert-suffixed names are declared here.
//

#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addd )
GENPROT( copyd )
GENPROT( subd )
GENPROT( axpyd )
GENPROT( scal2d )
GENPROT( invertd )
GENPROT( scald )
GENPROT( setd )
GENPROT( setid )
GENPROT( shiftd )
GENPROT( xpbyd )

// end bli_l1d_fpa.h

// end bli_l1d.h

// -- Level-1f operations --

// begin bli_l1f.h

// begin bli_l1f_check.h

//
// Prototype object-based check functions.
//
// Each GENTPROT below declares <opname>_check with the same object operands
// as the corresponding object API, but with no expert parameters. These
// prototypes carry no BLIS_EXPORT_BLIS decoration.
//

// axpy2v
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  alphax, \
       obj_t*  alphay, \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  z  \
    );

GENTPROT( axpy2v )

// axpyf
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  x, \
       obj_t*  y  \
    );

GENTPROT( axpyf )

// dotaxpyv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  alpha, \
       obj_t*  xt, \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  rho, \
       obj_t*  z  \
    );

GENTPROT( dotaxpyv )

// dotxaxpyf
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  alpha, \
       obj_t*  at, \
       obj_t*  a, \
       obj_t*  w, \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y, \
       obj_t*  z  \
    );

GENTPROT( dotxaxpyf )

// dotxf
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y  \
    );

GENTPROT( dotxf )

// end bli_l1f_check.h

// Define kernel function types.
// begin bli_l1f_ft_ker.h

#ifndef BLIS_L1F_FT_KER_H
#define BLIS_L1F_FT_KER_H

//
// -- Level-1f kernel function types -------------------------------------------
//
// Unlike the typed-API function types, these kernel types qualify every
// data pointer with restrict and always end with a cntx_t* parameter; they
// take no rntm_t* and do not depend on EX_SUF or BLIS_TAPI_EX_PARAMS.
// The include guard keeps them from being generated more than once.
//

// axpy2v

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha1, \
       ctype*  restrict alpha2, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict z, inc_t incz, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conja, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjxt, \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            m, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       ctype*  restrict z, inc_t incz, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjat, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjat, \
       conj_t           conja, \
       conj_t           conjw, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict w, inc_t incw, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict z, inc_t incz, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( dotxaxpyf )

#endif

// end bli_l1f_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion begins with a comma because prototype templates place
// BLIS_OAPI_EX_PARAMS directly after the last regular parameter, with no
// separator of their own.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). 
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion begins with a comma because the templates below place
// BLIS_TAPI_EX_PARAMS directly after the last regular parameter, with no
// separator of their own.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Expert pass: EX_SUF is BLIS_TAPI_EX_SUF and every prototype ends with the
// cntx/rntm parameters supplied by BLIS_TAPI_EX_PARAMS.
// INSERT_GENTPROT_BASIC0 is defined elsewhere; presumably it instantiates
// GENTPROT once per supported datatype -- TODO confirm against its
// definition.
//

// axpy2v
// NOTE(review): the two scalars are named alphax/alphay here but
// alpha1/alpha2 in the matching function type in bli_l1f_ft.h below; only
// the parameter names differ.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   n, \
       ctype*  alphax, \
       ctype*  alphay, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  z, inc_t incz  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conja, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   b_n, \
       ctype*  alpha, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
// NOTE(review): the length parameter is named n here but m in the matching
// function type in bli_l1f_ft.h below; only the parameter name differs.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjxt, \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  rho, \
       ctype*  z, inc_t incz  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjat, \
       conj_t  conja, \
       conj_t  conjw, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   b_n, \
       ctype*  alpha, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  w, inc_t incw, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       ctype*  z, inc_t incz  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjat, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   b_n, \
       ctype*  alpha, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// Each GENTDEF below is a function-pointer typedef template with the same
// parameter types, in the same order, as the prototype template for that
// operation in bli_l1f_tapi.h above. The typedef name is pasted from ch,
// opname, EX_SUF and tsuf, so this (expert) pass yields the expert-suffixed
// type names.
//

// axpy2v

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   n, \
       ctype*  alpha1, \
       ctype*  alpha2, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  z, inc_t incz  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conja, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   b_n, \
       ctype*  alpha, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjxt, \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  rho, \
       ctype*  z, inc_t incz  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjat, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   b_n, \
       ctype*  alpha, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjat, \
       conj_t  conja, \
       conj_t  conjw, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   b_n, \
       ctype*  alpha, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  w, inc_t incw, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       ctype*  z, inc_t incz  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS   cntx_t* cntx = NULL; ( void )cntx; \
                             rntm_t* rntm = NULL; ( void )rntm;

// end bli_tapi_ba.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Basic pass: BLIS_TAPI_EX_PARAMS is defined as empty just above, so these
// prototypes carry no trailing cntx/rntm parameters.
// INSERT_GENTPROT_BASIC0 is defined elsewhere; presumably it instantiates
// GENTPROT once per supported datatype -- TODO confirm against its
// definition.
//

// axpy2v
// NOTE(review): the two scalars are named alphax/alphay here but
// alpha1/alpha2 in the matching function type in bli_l1f_ft.h below; only
// the parameter names differ.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   n, \
       ctype*  alphax, \
       ctype*  alphay, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  z, inc_t incz  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conja, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   b_n, \
       ctype*  alpha, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
// NOTE(review): the length parameter is named n here but m in the matching
// function type in bli_l1f_ft.h below; only the parameter name differs.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjxt, \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  rho, \
       ctype*  z, inc_t incz  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjat, \
       conj_t  conja, \
       conj_t  conjw, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   b_n, \
       ctype*  alpha, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  w, inc_t incw, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       ctype*  z, inc_t incz  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjat, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   b_n, \
       ctype*  alpha, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// Each GENTDEF below is a function-pointer typedef template with the same
// parameter types, in the same order, as the prototype template for that
// operation in bli_l1f_tapi.h above. The typedef name is pasted from ch,
// opname, EX_SUF and tsuf.
//

// axpy2v

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   n, \
       ctype*  alpha1, \
       ctype*  alpha2, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  z, inc_t incz  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conja, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   b_n, \
       ctype*  alpha, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjxt, \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  rho, \
       ctype*  z, inc_t incz  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjat, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   b_n, \
       ctype*  alpha, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjat, \
       conj_t  conja, \
       conj_t  conjw, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   b_n, \
       ctype*  alpha, \
       ctype*  a, inc_t inca, inc_t lda, \
       ctype*  w, inc_t incw, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy, \
       ctype*  z, inc_t incz  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1f_fpa.h

//
// Prototype function pointer query interface.
//
// For each operation this declares one function taking a num_t datatype
// argument. Both the return type name and the function name are built from
// the operation name and BLIS_TAPI_EX_SUF (with _vft and _qfp appended,
// respectively), so only expert-suffixed names are declared here.
//

#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( axpy2v )
GENPROT( axpyf )
GENPROT( dotaxpyv )
GENPROT( dotxaxpyf )
GENPROT( dotxf )

// end bli_l1f_fpa.h

// end bli_l1f.h

// -- Level-1m operations --

// begin bli_l1m.h

// begin bli_l1m_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1m_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addm ) INSERT_GENTPROT_BASIC0( copym ) INSERT_GENTPROT_BASIC0( subm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpym ) INSERT_GENTPROT_BASIC0( scal2m ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, 
inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( xpbym ) #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT2_BASIC0( xpbym_md ) INSERT_GENTPROT2_MIXDP0( xpbym_md ) // end bli_l1m_tapi.h // begin bli_l1m_ft.h // // -- Level-1v function types -------------------------------------------------- // // addm, subm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addm ) INSERT_GENTDEF( subm ) INSERT_GENTDEF( copym ) // axpym #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpym ) // scal2m #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scal2m ) // scalm, setm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ 
ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalm ) INSERT_GENTDEF( setm ) // xpbym #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbym ) INSERT_GENTDEF( xpbym_md ) // end bli_l1m_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. 
The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1m_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addm ) INSERT_GENTPROT_BASIC0( copym ) INSERT_GENTPROT_BASIC0( subm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpym ) INSERT_GENTPROT_BASIC0( scal2m ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( xpbym ) #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ 
dim_t m, \ dim_t n, \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT2_BASIC0( xpbym_md ) INSERT_GENTPROT2_MIXDP0( xpbym_md ) // end bli_l1m_tapi.h // begin bli_l1m_ft.h // // -- Level-1v function types -------------------------------------------------- // // addm, subm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addm ) INSERT_GENTDEF( subm ) INSERT_GENTDEF( copym ) // axpym #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpym ) // scal2m #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scal2m ) // scalm, setm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalm ) INSERT_GENTDEF( setm ) // xpbym #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ 
ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbym ) INSERT_GENTDEF( xpbym_md ) // end bli_l1m_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1m_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) GENPROT( axpym ) GENPROT( scal2m ) GENPROT( scalm ) GENPROT( setm ) GENPROT( xpbym ) #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty ); GENPROT( xpbym_md ) // end bli_l1m_fpa.h // Prototype level-1m implementations. // begin bli_l1m_unb_var1.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( addm ) INSERT_GENTPROT_BASIC0( copym ) INSERT_GENTPROT_BASIC0( subm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( axpym ) INSERT_GENTPROT_BASIC0( scal2m ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( xpbym ) #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ void PASTEMAC3(chx,chy,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT2_BASIC0( xpbym_md ) INSERT_GENTPROT2_MIXDP0( xpbym_md ) // end bli_l1m_unb_var1.h // Pack-related // begin bli_packm.h // begin bli_packm_alloc.h BLIS_EXPORT_BLIS void* 
bli_packm_alloc ( siz_t size_needed, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void* bli_packm_alloc_ex ( siz_t size_needed, packbuf_t pack_buf_type, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_alloc.h // begin bli_packm_cntl.h struct packm_params_s { uint64_t size; // size field must be present and come first. bszid_t bmid_m; bszid_t bmid_n; bool does_invert_diag; bool rev_iter_if_upper; bool rev_iter_if_lower; pack_t pack_schema; packbuf_t pack_buf_type; }; typedef struct packm_params_s packm_params_t; BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m; } BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n; } BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower; } BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema; } BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type; } // ----------------------------------------------------------------------------- cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, bszid_t bmid_m, bszid_t bmid_n, bool does_invert_diag, bool rev_iter_if_upper, bool rev_iter_if_lower, pack_t pack_schema, packbuf_t pack_buf_type, cntl_t* sub_node ); // end bli_packm_cntl.h // begin 
bli_packm_check.h void bli_packm_init_check ( obj_t* a, obj_t* p, cntx_t* cntx ); void bli_packm_int_check ( obj_t* a, obj_t* p, cntx_t* cntx ); // end bli_packm_check.h // begin bli_packm_init.h BLIS_EXPORT_BLIS bool bli_packm_init ( obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_init.h // begin bli_packm_int.h void bli_packm_int ( obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_int.h // begin bli_packm_scalar.h BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p ); // end bli_packm_scalar.h // begin bli_packm_part.h // -- Matrix partitioning ------------------------------------------------------ void bli_packm_acquire_mpart_t2b( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_packm_acquire_mpart_l2r( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p ); // end bli_packm_part.h // begin bli_packm_struc_cxk.h #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_struc_cxk ) INSERT_GENTPROT_BASIC0( packm_herm_cxk ) INSERT_GENTPROT_BASIC0( packm_tri_cxk ) // end bli_packm_struc_cxk.h // begin bli_packm_struc_cxk_1er.h #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ 
bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er ) INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er ) INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er ) // end bli_packm_struc_cxk_1er.h // begin bli_packm_cxk.h #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t panel_dim, \ dim_t panel_dim_max, \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_cxk ) // end bli_packm_cxk.h // begin bli_packm_cxk_1er.h #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t panel_dim, \ dim_t panel_dim_max, \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ ); INSERT_GENTPROTCO_BASIC0( packm_cxk_1er ) // end bli_packm_cxk_1er.h // Mixed datatype support. 
#ifdef BLIS_ENABLE_GEMM_MD // begin bli_packm_struc_cxk_md.h #undef GENTPROT2 #define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \ \ void PASTEMAC2(chc,chp,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype_p* restrict kappa, \ ctype_c* restrict c, inc_t incc, inc_t ldc, \ ctype_p* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md ) INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md ) #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \ \ void PASTEMAC2(cha,chp,opname) \ ( \ conj_t conja, \ dim_t m, \ dim_t n, \ ctype_p* restrict kappa, \ ctype_a* restrict a, inc_t inca, inc_t lda, \ ctype_p* restrict p, inc_t ldp \ ); INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md ) INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md ) INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md ) INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md ) // end bli_packm_struc_cxk_md.h #endif // begin bli_packm_blk_var1.h // // packm params types. // typedef struct { // Type of C Type of P packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES]; } packm_blk_var1_params_t; // // Prototype object-based interfaces. // BLIS_EXPORT_BLIS void bli_packm_blk_var1 ( obj_t* c, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* t ); // end bli_packm_blk_var1.h // end bli_packm.h // begin bli_unpackm.h // begin bli_unpackm_cntl.h struct unpackm_params_s { uint64_t size; // size field must be present and come first. 
unpackm_var_oft var_func; }; typedef struct unpackm_params_s unpackm_params_t; #define bli_cntl_unpackm_params_var_func( cntl ) \ \ ( ( (unpackm_params_t*)(cntl)->params )->var_func ) // ----------------------------------------------------------------------------- cntl_t* bli_unpackm_cntl_create_node ( rntm_t* rntm, void_fp var_func, void_fp unpackm_var_func, cntl_t* sub_node ); // end bli_unpackm_cntl.h // begin bli_unpackm_check.h void bli_unpackm_int_check ( obj_t* p, obj_t* a, cntx_t* cntx ); // end bli_unpackm_check.h // begin bli_unpackm_int.h void bli_unpackm_int ( obj_t* p, obj_t* a, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ); // end bli_unpackm_int.h // begin bli_unpackm_blk_var1.h void bli_unpackm_blk_var1 ( obj_t* p, obj_t* c, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ); #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ trans_t transc, \ dim_t m, \ dim_t n, \ dim_t m_panel, \ dim_t n_panel, \ void* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( unpackm_blk_var1 ) // end bli_unpackm_blk_var1.h // begin bli_unpackm_cxk.h #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjp, \ dim_t panel_dim, \ dim_t panel_len, \ ctype* kappa, \ ctype* p, inc_t ldp, \ ctype* a, inc_t inca, inc_t lda, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( unpackm_cxk ) // end bli_unpackm_cxk.h // end bli_unpackm.h // end bli_l1m.h // -- Level-2 operations -- // begin bli_l2.h // begin bli_l2_check.h // // Prototype object-based check functions. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Function name suffix (in function definitions).
#undef EX_SUF

// Expert arguments in function signatures and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Marker that lets the source code determine which interface (basic or
// expert) is being compiled.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Suffix added to function names (in function definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Expert arguments appended to function signatures and prototypes.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// The expert variable declaration block is left empty, since it is not
// needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l2_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( gemv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( ger ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( hemv ) INSERT_GENTPROT_BASIC0( symv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( her ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( syr ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( her2 ) INSERT_GENTPROT_BASIC0( syr2 ) #undef 
GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( trmv ) INSERT_GENTPROT_BASIC0( trsv ) // end bli_l2_tapi.h // begin bli_l2_ft.h // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( ger ) // hemv, symv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) // her #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( her ) // syr #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, 
inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( syr ) // her2, syr2 #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) // trmv, trsv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) // end bli_l2_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). 
#undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( gemv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( ger ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( hemv ) INSERT_GENTPROT_BASIC0( symv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( her ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( syr ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( her2 ) INSERT_GENTPROT_BASIC0( syr2 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( trmv ) INSERT_GENTPROT_BASIC0( trsv ) // end bli_l2_tapi.h // begin bli_l2_ft.h // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( ger ) // hemv, symv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ 
BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) // her #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( her ) // syr #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( syr ) // her2, syr2 #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) // trmv, trsv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) // end bli_l2_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. 
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
// void bli_l3_cntl_create_if ( opid_t family, pack_t schema_a, pack_t schema_b, obj_t* a, obj_t* b, obj_t* c, rntm_t* rntm, cntl_t* cntl_orig, cntl_t** cntl_use ); void bli_l3_cntl_free ( rntm_t* rntm, cntl_t* cntl_use, thrinfo_t* thread ); // end bli_l3_cntl.h // begin bli_l3_check.h // // Prototype object-based check functions. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx \ ); GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- void bli_gemm_basic_check ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_gemmt_basic_check ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_hemm_basic_check ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_herk_basic_check ( obj_t* alpha, obj_t* a, obj_t* ah, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_her2k_basic_check ( obj_t* alpha, obj_t* a, obj_t* bh, obj_t* b, obj_t* ah, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_l3_basic_check ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); // end bli_l3_check.h // begin bli_l3_int.h void bli_l3_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, 
obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_l3_int.h // begin bli_l3_packab.h void bli_l3_packa ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); void bli_l3_packb ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_l3_packab.h // Define function types. //#include "bli_l3_ft_ex.h" // begin bli_l3_ft_ukr.h #ifndef BLIS_L3_FT_UKR_H #define BLIS_L3_FT_UKR_H // // -- Level-3 micro-kernel function types -------------------------------------- // // gemm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemm ) // gemmtrsm_[lu] #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ ctype* restrict a11, \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmtrsm ) // trsm_[lu] #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( trsm ) #endif // end bli_l3_ft_ukr.h // begin bli_l3_oft.h #ifndef BLIS_L3_OFT_H #define BLIS_L3_OFT_H // // -- Level-3 object function types -------------------------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ 
cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemm ) GENTDEF( gemmt ) GENTDEF( her2k ) GENTDEF( syr2k ) // hemm, symm, trmm3 #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( hemm ) GENTDEF( symm ) GENTDEF( trmm3 ) // herk, syrk #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( herk ) GENTDEF( syrk ) // trmm, trsm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( trmm ) GENTDEF( trsm ) #endif // end bli_l3_oft.h // begin bli_l3_oft_var.h #ifndef BLIS_L3_OFT_VAR_H #define BLIS_L3_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( l3 ) #endif // end bli_l3_oft_var.h // begin bli_l3_blocksize.h dim_t bli_l3_determine_kc ( dir_t direct, dim_t i, dim_t dim, obj_t* a, obj_t* b, bszid_t bszid, cntx_t* cntx, cntl_t* cntl ); #undef GENPROT #define GENPROT( opname ) \ \ dim_t PASTEMAC0(opname) \ ( \ dir_t direct, \ dim_t i, \ dim_t dim, \ obj_t* a, \ obj_t* b, \ bszid_t bszid, \ cntx_t* cntx \ ); GENPROT( gemm_determine_kc ) GENPROT( gemmt_determine_kc ) GENPROT( trmm_determine_kc ) GENPROT( trsm_determine_kc ) #undef GENPROT #define GENPROT( opname ) \ \ dim_t PASTEMAC0(opname) \ ( \ dim_t i, \ dim_t dim, \ obj_t* a, \ obj_t* b, \ bszid_t bszid, \ cntx_t* cntx \ ); GENPROT( gemm_determine_kc_f ) GENPROT( gemm_determine_kc_b ) GENPROT( gemmt_determine_kc_f ) GENPROT( gemmt_determine_kc_b ) GENPROT( 
trmm_determine_kc_f ) GENPROT( trmm_determine_kc_b ) GENPROT( trsm_determine_kc_f ) GENPROT( trsm_determine_kc_b ) // end bli_l3_blocksize.h // begin bli_l3_direct.h dir_t bli_l3_direct ( obj_t* a, obj_t* b, obj_t* c, cntl_t* cntl ); // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ dir_t PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c \ ); GENPROT( gemm_direct ) GENPROT( gemmt_direct ) GENPROT( trmm_direct ) GENPROT( trsm_direct ) // end bli_l3_direct.h // begin bli_l3_prune.h #undef GENPROT #define GENPROT( dim ) \ \ void PASTEMAC(l3_prune_unref_mparts_,dim) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntl_t* cntl \ ); GENPROT( m ) GENPROT( n ) GENPROT( k ) // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname, dim ) \ \ void PASTEMAC2(opname,_prune_unref_mparts_,dim) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c \ ); GENPROT( gemm, m ) GENPROT( gemm, n ) GENPROT( gemm, k ) GENPROT( gemmt, m ) GENPROT( gemmt, n ) GENPROT( gemmt, k ) GENPROT( trmm, m ) GENPROT( trmm, n ) GENPROT( trmm, k ) GENPROT( trsm, m ) GENPROT( trsm, n ) GENPROT( trsm, k ) // end bli_l3_prune.h // begin bli_l3_schema.h void bli_l3_set_schemas ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx ); // end bli_l3_schema.h // Prototype object APIs (basic and expert). // begin bli_l3_oapi.h // // Prototype object-based interfaces (basic). 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h // // Prototype BLAS-like interfaces with typed operands (basic). // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( gemm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ side_t side, \ uplo_t uploa, \ conj_t conja, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( hemm ) INSERT_GENTPROT_BASIC0( symm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype_r* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROTR_BASIC0( herk ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROTR_BASIC0( her2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( syrk ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t 
k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( gemmt ) INSERT_GENTPROT_BASIC0( syr2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( trmm3 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT_BASIC0( trmm ) INSERT_GENTPROT_BASIC0( trsm ) // end bli_l3_tapi.h // begin bli_l3_tapi_ex.h // // Prototype BLAS-like interfaces with typed operands (expert). 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ conj_t conja, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( hemm ) INSERT_GENTPROT_BASIC0( symm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype_r* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( herk ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( her2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( syrk ) #undef 
GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemmt ) INSERT_GENTPROT_BASIC0( syr2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm3 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm ) INSERT_GENTPROT_BASIC0( trsm ) // end bli_l3_tapi_ex.h // Define function types for small/unpacked handlers/kernels. 
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
#undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_gemm_md_c2r_ref.h // Define a local struct type that makes returning two values easier. typedef struct mddm_s { dom_t comp; dom_t exec; } mddm_t; void bli_gemm_md ( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_local, cntx_t** cntx ); mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); // ----------------------------------------------------------------------------- void bli_gemm_md_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); void bli_gemm_md_zgemm ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary 
if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crr is already unconditionally associated with an // execution domain of BLIS_REAL.) if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_REAL ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since ccr is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) if ( bli_obj_is_complex( c ) && bli_obj_is_complex( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crc is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) 
if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_complex( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemm_md_ker_var2_recast ( num_t* dt_comp, num_t dt_a, num_t dt_b, num_t* dt_c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, obj_t* c, inc_t* rs_c, inc_t* cs_c ) { if ( bli_is_real( *dt_c ) && bli_is_complex( dt_a ) && bli_is_complex( dt_b ) ) { // The rcc case is executed with a real macrokernel, so we need to // double the k dimension (because both A and B are packed to the 1r // schema), and also the panel strides of A and B since they were // packed as complex matrices and we now need to convert them to // units of real elements. *k *= 2; *ps_a *= 2; *ps_b *= 2; } else if ( bli_is_complex( *dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_row_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *n *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; } else #endif { // Generally speaking, the crc case is executed with a complex // macrokernel, so we need to halve the panel stride of A (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. 
*ps_a /= 2; } } else if ( bli_is_complex( *dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_col_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *m *= 2; *pd_a *= 2; *ps_a *= 2; *cs_c *= 2; } else #endif { // Generally speaking, the ccr case is executed with a complex // macrokernel, so we need to halve the panel stride of B (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. *ps_b /= 2; } } #if 0 else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. //printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k ); } else if ( bli_is_complex( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { // No action needed. 
} #endif } // end bli_gemm_md.h #endif // end bli_gemm.h // begin bli_hemm.h // begin bli_hemm_front.h void bli_hemm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_hemm_front.h // end bli_hemm.h // begin bli_symm.h // begin bli_symm_front.h void bli_symm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_symm_front.h // end bli_symm.h // begin bli_trmm.h // begin bli_trmm_front.h void bli_trmm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm_front.h // begin bli_trmm_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); //GENPROT( trmm_blk_var1 ) //GENPROT( trmm_blk_var2 ) //GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) GENPROT( trmm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
//

// Typed gemmt macrokernel prototype template. Compared with the trmm/trsm
// templates, the diagonal offset is named diagoffc and the a and b operands
// each carry an additional inc_t argument (is_a, is_b).
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffc, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

// end bli_gemmt_var.h
// end bli_gemmt.h
// end bli_l3.h

// -- Utility operations --

// begin bli_util.h
// begin bli_util_check.h

//
// Prototype object-based check functions.
//

// Each template below is redefined for a new parameter list and then
// instantiated once per operation; the generated function name is formed by
// PASTEMAC(opname,_check).

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  asum  \
     );

GENPROT( asumv )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x  \
     );

GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm  \
     );

GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )

// Same parameter list as the vector norms above, instantiated for the
// matrix norms.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm  \
     );

GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x  \
     );

GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  scale, \
       obj_t*  sumsq  \
     );

GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// NOTE: This one template is named GENTPROT (with a single opname argument)
// rather than GENPROT like its neighbours.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  chi, \
       obj_t*  psi, \
       bool*   is_eq  \
     );

GENTPROT( eqsc )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// -----------------------------------------------------------------------------

// Explicitly named check-function prototypes (declarations only).

void bli_utilv_xi_check
     (
       obj_t*  x,
       obj_t*  index
     );

void bli_utilv_xa_check
     (
       obj_t*  x,
       obj_t*  asum
     );

void bli_utilm_mkhst_check
     (
       obj_t*  a
     );

void bli_utilv_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_fprint_check
     (
       FILE*  file,
       char*  s1,
       obj_t* x,
       char*  format,
       char*  s2
     );

void bli_utilm_rand_check
     (
       obj_t*  x
     );

void bli_utilv_sumsqv_check
     (
       obj_t*  x,
       obj_t*  scale,
       obj_t*  sumsq
     );

// end bli_util_check.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets the macro be appended directly
// after the last regular parameter.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_util_oapi.h

//
// Prototype object-based interfaces.
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes in this conditional are generated only when BLIS_OAPI_BASIC
// is defined. They use PASTEMAC0(opname), i.e. no EX_SUF suffix, and take no
// BLIS_OAPI_EX_PARAMS.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*   file, \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// Same as the fprint prototypes above but without the FILE* argument.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes in this conditional are generated only when BLIS_OAPI_BASIC
// is defined. They use PASTEMAC0(opname), i.e. no EX_SUF suffix, and take no
// BLIS_OAPI_EX_PARAMS.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*   file, \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// Same as the fprint prototypes above but without the FILE* argument.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The leading comma lets the macro be appended directly after the last
// regular parameter of a prototype.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_util_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Each template below declares an exported void function named by
// PASTEMAC2(ch,opname,EX_SUF), with BLIS_TAPI_EX_PARAMS placed after the last
// regular parameter. The GENTPROTR templates additionally use a second type
// argument, ctype_r, for some of their pointer parameters.

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( asumv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( mkherm )
INSERT_GENTPROT_BASIC0( mksymm )
INSERT_GENTPROT_BASIC0( mktrim )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( norm1v )
INSERT_GENTPROTR_BASIC0( normfv )
INSERT_GENTPROTR_BASIC0( normiv )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( norm1m )
INSERT_GENTPROTR_BASIC0( normfm )
INSERT_GENTPROTR_BASIC0( normim )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( randv )
INSERT_GENTPROT_BASIC0( randnv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( randm )
INSERT_GENTPROT_BASIC0( randnm )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.

// The prototypes in this conditional are generated only when BLIS_TAPI_BASIC
// is defined. They use PASTEMAC(ch,opname), i.e. no EX_SUF suffix, and take
// no BLIS_TAPI_EX_PARAMS.
#ifdef BLIS_TAPI_BASIC

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqsc )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqm )

// The print prototypes take their operand as void* rather than ctype*, and
// are instantiated with INSERT_GENTPROT_BASIC0_I instead of
// INSERT_GENTPROT_BASIC0.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  n, \
       void*  x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       void*  x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//

// Each template below typedefs a pointer-to-function type whose name is
// formed by PASTECH3(ch,opname,EX_SUF,tsuf) (PASTECH2(ch,opname,tsuf) for the
// basic-only operations at the end).

// asumv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// NOTE: The fprintv and fprintm types include EX_SUF in their names but do
// not append BLIS_TAPI_EX_PARAMS to their parameter lists.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.
// These prototypes use PASTEMAC(ch,opname) directly: no EX_SUF is pasted
// into the name and no BLIS_TAPI_EX_PARAMS is appended, and the whole group
// is emitted only while BLIS_TAPI_BASIC is defined.

#ifdef BLIS_TAPI_BASIC

// eqsc (result is written through is_eq)
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )

// eqv
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )

// eqm
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )

// printv (note: x is declared void*, not ctype*, and a different INSERT_
// macro, INSERT_GENTPROT_BASIC0_I, is used)
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t n, \
       void* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )

// printm (x is likewise void*)
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t m, \
       dim_t n, \
       void* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h


//
// -- Utility function types ---------------------------------------------------
//
// Each template below declares a function-pointer typedef whose name is
// pasted from ch, opname, EX_SUF and tsuf. The parameter lists mirror the
// typed-API prototypes of the same operations.

// asumv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// NOTE: EX_SUF is still pasted into the typedef name here, but no
// BLIS_TAPI_EX_PARAMS is appended to the parameter list.

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm (same naming/parameter note as fprintv)

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.
// These typedef names are pasted with PASTECH2(ch,opname,tsuf), i.e. without
// EX_SUF, and take no expert parameters.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_util_fpa.h


//
// Prototype function pointer query interface.
//

// For each operation this declares a query function named from
// opname + BLIS_TAPI_EX_SUF + _qfp that takes a num_t and returns the type
// named from opname + BLIS_TAPI_EX_SUF + _vft.
#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( asumv )
GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )
GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )
GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )
GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )
GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// Same query interface for the operations whose names carry no
// BLIS_TAPI_EX_SUF component.
#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )
GENPROT( fprintv )
GENPROT( fprintm )
//GENPROT( printv )
//GENPROT( printm )
// end bli_util_fpa.h

// Prototype the utility-operation implementations (the *_unb_var1 variants).
// begin bli_util_unb_var1.h


//
// Prototype BLAS-like interfaces with typed operands.
//
// Unlike the typed-API prototypes, most of these variants take cntx_t* and
// rntm_t* as explicit trailing parameters and are declared without
// BLIS_EXPORT_BLIS.

// asumv
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( asumv_unb_var1 )

// mkherm, mksymm, mktrim
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( mkherm_unb_var1 )
INSERT_GENTPROT_BASIC0( mksymm_unb_var1 )
INSERT_GENTPROT_BASIC0( mktrim_unb_var1 )

// norm1v, normfv, normiv
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfv_unb_var1 )
INSERT_GENTPROTR_BASIC0( normiv_unb_var1 )

// norm1m, normfm, normim
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfm_unb_var1 )
INSERT_GENTPROTR_BASIC0( normim_unb_var1 )

// randv, randnv
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randv_unb_var1 )
INSERT_GENTPROT_BASIC0( randnv_unb_var1 )

// randm, randnm
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randm_unb_var1 )
INSERT_GENTPROT_BASIC0( randnm_unb_var1 )

// sumsqv
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 )

// -----------------------------------------------------------------------------

// eqv: this variant returns bool directly and takes neither cntx nor rntm.
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
     );

INSERT_GENTPROT_BASIC0( eqv_unb_var1 )

// eqm: likewise returns bool and takes neither cntx nor rntm.
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
     );

INSERT_GENTPROT_BASIC0( eqm_unb_var1 )

// fprintv: exported (BLIS_EXPORT_BLIS) and instantiated through
// INSERT_GENTPROT_BASIC0_I rather than INSERT_GENTPROT_BASIC0.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintv )

// fprintm: exported and instantiated the same way as fprintv.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintm )

// end bli_util_unb_var1.h

// end bli_util.h

// -- addon definitions --

// NOTE: These definitions should not be included much earlier since an addon
// may wish to utilize other types and definitions provided by BLIS.
// begin bli_addon.h


#ifndef BLIS_ADDON_H
#define BLIS_ADDON_H

// The "#if 0" selects the #else branch unconditionally, so this header
// always defines BLIS_DISABLE_ADDONS and never BLIS_ENABLE_ADDONS.
#if 0
#define BLIS_ENABLE_ADDONS
#else
#define BLIS_DISABLE_ADDONS
#endif

// Enabled addons
// (none are listed in this header)


#endif
// end bli_addon.h

// -- sandbox implementation --
// begin bli_sbox.h


#ifndef BLIS_SBOX_H
#define BLIS_SBOX_H

// Each sandbox must have a bli_sandbox.h file present somewhere inside.
// If a sandbox was enabled at configure-time, we need to #include its
// header file here so that it will get pulled into blis.h when it is
// flattened into a monolithic header.
#ifdef BLIS_ENABLE_SANDBOX
#include "bli_sandbox.h" // skipped
#endif

#endif
// end bli_sbox.h


// -- BLAS compatibility layer --
// begin bli_blas.h


// If the CBLAS compatibility layer was enabled while the BLAS layer
// was not enabled, we must enable it here.
#ifdef BLIS_ENABLE_CBLAS
#ifndef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS
#endif
#endif // BLIS_ENABLE_CBLAS

// By default, if the BLAS compatibility layer is enabled, we define
// (include) all of the BLAS prototypes. However, if the user is
// #including "blis.h" and also #including another header that also
// declares the BLAS functions, then we provide an opportunity to
// #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below).
#ifdef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS_DEFS
#else
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the BLAS test drivers are being
// compiled.
#ifdef BLIS_VIA_BLASTEST
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the environment has defined the
// macro BLIS_DISABLE_BLAS_DEFS.
#ifdef BLIS_DISABLE_BLAS_DEFS
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Begin including all BLAS prototypes.
// NOTE: this #ifdef is not closed within this section; every BLAS prototype
// that follows is nested inside it.
#ifdef BLIS_ENABLE_BLAS_DEFS


// -- System headers needed by BLAS compatibility layer --

// NOTE(review): this directive carried no header name, which is a hard
// preprocessing error whenever BLIS_ENABLE_BLAS_DEFS is defined. <ctype.h>
// is believed to be the header upstream includes here (for toupper) --
// confirm against the BLIS sources. Including it is harmless either way.
#include <ctype.h> // skipped


// -- Constants --

// Size of the buffer used to build a BLAS routine name: seven characters
// plus the terminating NUL.
#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)


// -- Utility macros --

// The helpers below are plain (non-exported) prototypes; each one is guarded
// by BLIS_ENABLE_BLAS because it originates from its own flattened header.

// begin bla_r_sign.h


#ifdef BLIS_ENABLE_BLAS

double bla_r_sign(const bla_real *a, const bla_real *b);

#endif
// end bla_r_sign.h
// begin bla_d_sign.h


#ifdef BLIS_ENABLE_BLAS

double bla_d_sign(const bla_double *a, const bla_double *b);

#endif
// end bla_d_sign.h

// begin bla_r_cnjg.h


#ifdef BLIS_ENABLE_BLAS

void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src);

#endif
// end bla_r_cnjg.h
// begin bla_d_cnjg.h


#ifdef BLIS_ENABLE_BLAS

void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src);

#endif
// end bla_d_cnjg.h

// begin bla_r_imag.h


#ifdef BLIS_ENABLE_BLAS

bla_real bla_r_imag(const bla_scomplex *z);

#endif
// end bla_r_imag.h
// begin bla_d_imag.h


#ifdef BLIS_ENABLE_BLAS

double bla_d_imag(const bla_dcomplex *z);

#endif
// end bla_d_imag.h

// begin bla_c_div.h


#ifdef BLIS_ENABLE_BLAS

void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp);

#endif
// end bla_c_div.h
// begin bla_z_div.h


#ifdef BLIS_ENABLE_BLAS

void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp);

#endif
// end bla_z_div.h

// begin bla_f__cabs.h


#ifdef BLIS_ENABLE_BLAS

double bla_f__cabs(double real, double imag);

#endif
// end bla_f__cabs.h
// begin bla_r_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_r_abs(const bla_real *x);

#endif
// end bla_r_abs.h
// begin bla_d_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_d_abs(const bla_double *x);

#endif
// end bla_d_abs.h
// begin bla_c_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_c_abs(const bla_scomplex *z);

#endif
// end bla_c_abs.h
// begin bla_z_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_z_abs(const bla_dcomplex *z);

#endif
// end bla_z_abs.h

// begin bla_lsame.h


#ifdef BLIS_ENABLE_BLAS

// The integer width of the return value and of the two length arguments
// follows LAPACK_ILP64.
// NOTE(review): the ILP64 prototype is not marked BLIS_EXPORT_BLAS while the
// default one is -- confirm whether that asymmetry is intended.
#ifdef LAPACK_ILP64
long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
#else
BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
#endif

#endif
// end bla_lsame.h
// begin bla_xerbla.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);

#endif
// end bla_xerbla.h
// begin bla_xerbla_array.h


#ifdef BLIS_ENABLE_BLAS

// Note the argument order differs from xerbla: the name length comes second
// and is passed by value.
BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);

#endif
// end bla_xerbla_array.h


// -- Level-0 BLAS prototypes --

// begin bla_cabs1.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS bla_real   PASTEF77(s,cabs1)(bla_scomplex *z);
BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);

#endif
// end bla_cabs1.h


// -- Level-1 BLAS prototypes --

// In the sections below the prototype template is (re)defined
// unconditionally; only its instantiation through an INSERT_* macro is
// guarded by BLIS_ENABLE_BLAS.

// begin bla_amax.h



//
// Prototype BLAS-to-BLIS interfaces.
//
// amax: the function name is pasted from "i", chx and blasname, and the
// result is returned as an f77_int.
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end bla_amax.h
// begin bla_asum.h



//
// Prototype BLAS-to-BLIS interfaces.
//
// asum: returns ftype_r; the name is pasted in the order chr, chx, blasname.
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end bla_asum.h
// begin bla_axpy.h



//
// Prototype BLAS-to-BLIS interfaces.
//
// axpy: y is the only non-const operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif

// end bla_axpy.h
// begin bla_copy.h



//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( copy ) #endif // end bla_copy.h // begin bla_dot.h #ifdef BLIS_ENABLE_BLAS // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTR_BLAS( dot ) #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL INSERT_GENTPROTDOTC_BLAS( dot ) #else // For the "intel" complex return type, we use a hidden parameter (passed by // address) to return the result. #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \ ( \ ftype* rhop, \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTC_BLAS( dot ) #endif // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS float PASTEF77(sd,sdot) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); BLIS_EXPORT_BLAS double PASTEF77(d,sdot) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); #endif // end bla_dot.h // begin bla_nrm2.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end bla_nrm2.h // begin bla_rot.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s); #endif // end bla_rot.h // begin bla_rotg.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s); BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s); #endif // end bla_rotg.h // begin bla_rotm.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam); BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam); #endif // end bla_rotm.h // begin bla_rotmg.h 


#ifdef BLIS_ENABLE_BLAS

// Real datatypes only. Only the fourth argument (sy1/dy1) is const; the
// parameter block (sparam/dparam) is writable here.
BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam);
BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);

#endif
// end bla_rotmg.h

// In the template sections below the template is (re)defined
// unconditionally; only its instantiation is guarded.

// begin bla_scal.h



//
// Prototype BLAS-to-BLIS interfaces.
//
// scal: alpha (ftype_a) and x (ftype_x) may have different types; the name
// is pasted in the order chx, cha, blasname.
#undef  GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
     ( \
       const f77_int* n, \
       const ftype_a* alpha, \
       ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif

// end bla_scal.h
// begin bla_swap.h



//
// Prototype BLAS-to-BLIS interfaces.
//
// swap: both x and y are writable.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       ftype* x, const f77_int* incx, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif

// end bla_swap.h

// The *_sub wrappers below are guarded by BLIS_ENABLE_CBLAS (not
// BLIS_ENABLE_BLAS). They return void and deliver the result through a
// trailing rval pointer; "sub" is pasted onto the end of the name.

// begin f77_amax_sub.h



//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       f77_int* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROT_BLAS( amax )
#endif
// end f77_amax_sub.h
// begin f77_asum_sub.h



//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       ftype_r* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif
// end f77_asum_sub.h
// begin f77_dot_sub.h



//
// Prototype CBLAS subroutine wrapper interfaces.
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTDOT_BLAS( dot ) // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval ); BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* rval ); #endif // end f77_dot_sub.h // begin f77_nrm2_sub.h // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end f77_nrm2_sub.h // -- Level-2 BLAS prototypes -- // dense // begin bla_gemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemv ) #endif // end bla_gemv.h // begin bla_ger.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, chxy, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \ ( \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTDOT_BLAS( ger ) #endif // end bla_ger.h // begin bla_hemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemv ) #endif // end bla_hemv.h // begin bla_her.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype_r* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her ) #endif // end bla_her.h // begin bla_her2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2 ) #endif // end bla_her2.h // begin bla_symv.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( symv ) #endif // end bla_symv.h // begin bla_syr.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr ) #endif // end bla_syr.h // begin bla_syr2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr2 ) #endif // end bla_syr2.h // begin bla_trmv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmv ) #endif // end bla_trmv.h // begin bla_trsv.h // // Prototype BLAS-to-BLIS interfaces. 
//

// trsv template: the matrix `a` is const and the vector `x` is non-const.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int* m, \
       const ftype* a, const f77_int* lda, \
       ftype* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif
// end bla_trsv.h

// begin bla_gemv_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument-check macros.
//
// Each *_check macro expands to a brace-enclosed statement block meant to be
// pasted into a void function. The tests run in order and `info` receives the
// code of the first one that fails (0 if none fail). On failure the routine
// name is assembled from dt_str/op_str, upper-cased, handed to xerbla together
// with &info, and the ENCLOSING function returns.
// NOTE(review): the codes look like 1-based argument positions of the
// corresponding BLAS routine -- confirm against the BLAS reference.

// gemv: transa must match "N", "T" or "C"; m, n >= 0; lda >= max(1,m);
// incx and incy must be nonzero.
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
    f77_int info = ( !nota && !ta && !conja  ) ?  1 : \
                   ( *m < 0                  ) ?  2 : \
                   ( *n < 0                  ) ?  3 : \
                   ( *lda < bli_max( 1, *m ) ) ?  6 : \
                   ( *incx == 0              ) ?  8 : \
                   ( *incy == 0              ) ? 11 : 0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_gemv_check.h
// begin bla_ger_check.h


#ifdef BLIS_ENABLE_BLAS

// ger: m, n >= 0; incx and incy nonzero; lda >= max(1,m). The routine name
// passed to xerbla is dt_str + op_str + conj_str (the latter padded to two
// characters).
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
    f77_int info = ( *m < 0                  ) ? 1 : \
                   ( *n < 0                  ) ? 2 : \
                   ( *incx == 0              ) ? 5 : \
                   ( *incy == 0              ) ? 7 : \
                   ( *lda < bli_max( 1, *m ) ) ? 9 : 0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_ger_check.h
// begin bla_hemv_check.h


#ifdef BLIS_ENABLE_BLAS

// hemv: uploa must match "L" or "U"; m >= 0; lda >= max(1,m); incx and incy
// nonzero.
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    f77_int info = ( !lower && !upper        ) ?  1 : \
                   ( *m < 0                  ) ?  2 : \
                   ( *lda < bli_max( 1, *m ) ) ?  5 : \
                   ( *incx == 0              ) ?  7 : \
                   ( *incy == 0              ) ? 10 : 0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_hemv_check.h
// begin bla_her_check.h


#ifdef BLIS_ENABLE_BLAS

// her: uploa must match "L" or "U"; m >= 0; incx nonzero; lda >= max(1,m).
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    f77_int info = ( !lower && !upper        ) ? 1 : \
                   ( *m < 0                  ) ? 2 : \
                   ( *incx == 0              ) ? 5 : \
                   ( *lda < bli_max( 1, *m ) ) ? 7 : 0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_her_check.h
// begin bla_her2_check.h


#ifdef BLIS_ENABLE_BLAS

// her2: uploa must match "L" or "U"; m >= 0; incx and incy nonzero;
// lda >= max(1,m).
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    f77_int info = ( !lower && !upper        ) ? 1 : \
                   ( *m < 0                  ) ? 2 : \
                   ( *incx == 0              ) ? 5 : \
                   ( *incy == 0              ) ? 7 : \
                   ( *lda < bli_max( 1, *m ) ) ? 9 : 0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_her2_check.h
// begin bla_symv_check.h


#ifdef BLIS_ENABLE_BLAS

// symv reuses the hemv argument checks unchanged.
#define bla_symv_check bla_hemv_check

#endif
// end bla_symv_check.h
// begin bla_syr_check.h


#ifdef BLIS_ENABLE_BLAS

// syr reuses the her argument checks unchanged.
#define bla_syr_check bla_her_check

#endif
// end bla_syr_check.h
// begin bla_syr2_check.h


#ifdef BLIS_ENABLE_BLAS

// syr2 reuses the her2 argument checks unchanged.
#define bla_syr2_check bla_her2_check

#endif
// end bla_syr2_check.h
// begin bla_trmv_check.h


#ifdef BLIS_ENABLE_BLAS

// trmv: uploa in {"L","U"}; transa in {"N","T","C"}; diaga in {"U","N"};
// m >= 0; lda >= max(1,m); incx nonzero.
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
    f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
    f77_int info = ( !lower && !upper        ) ? 1 : \
                   ( !nota && !ta && !conja  ) ? 2 : \
                   ( !unita && !nonua        ) ? 3 : \
                   ( *m < 0                  ) ? 4 : \
                   ( *lda < bli_max( 1, *m ) ) ? 6 : \
                   ( *incx == 0              ) ? 8 : 0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_trmv_check.h
// begin bla_trsv_check.h


#ifdef BLIS_ENABLE_BLAS

// trsv reuses the trmv argument checks unchanged.
#define bla_trsv_check bla_trmv_check

#endif
// end bla_trsv_check.h

// packed
// begin bla_hpmv.h


// The packed and banded routines below are declared directly, one prototype
// per datatype prefix, and return int (the templated routines above are void).

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_scomplex* alpha,
       const bla_scomplex* ap,
       const bla_scomplex* x, const bla_integer* incx,
       const bla_scomplex* beta,
       bla_scomplex* y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_dcomplex* alpha,
       const bla_dcomplex* ap,
       const bla_dcomplex* x, const bla_integer* incx,
       const bla_dcomplex* beta,
       bla_dcomplex* y, const bla_integer* incy
     );

#endif
// end bla_hpmv.h
// begin bla_hpr.h


#ifdef BLIS_ENABLE_BLAS

// Note: alpha is bla_real / bla_double here, while x and ap are complex.
BLIS_EXPORT_BLAS int PASTEF77(c,hpr)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_real* alpha,
       const bla_scomplex* x, const bla_integer* incx,
       bla_scomplex* ap
     );

BLIS_EXPORT_BLAS int PASTEF77(z,hpr)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_double* alpha,
       const bla_dcomplex* x, const bla_integer* incx,
       bla_dcomplex* ap
     );

#endif
// end bla_hpr.h
// begin bla_hpr2.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_scomplex* alpha,
       const bla_scomplex* x, const bla_integer* incx,
       const bla_scomplex* y, const bla_integer* incy,
       bla_scomplex* ap
     );

BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_dcomplex* alpha,
       const bla_dcomplex* x, const bla_integer* incx,
       const bla_dcomplex* y, const bla_integer* incy,
       bla_dcomplex* ap
     );

#endif
// end bla_hpr2.h
// begin bla_spmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spmv)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_double* alpha,
       const bla_double* ap,
       const bla_double* x, const bla_integer* incx,
       const bla_double* beta,
       bla_double* y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(s,spmv)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_real* alpha,
       const bla_real* ap,
       const bla_real* x, const bla_integer* incx,
       const bla_real* beta,
       bla_real* y, const bla_integer* incy
     );

#endif
// end bla_spmv.h
// begin bla_spr.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_double* alpha,
       const bla_double* x, const bla_integer* incx,
       bla_double* ap
     );

BLIS_EXPORT_BLAS int PASTEF77(s,spr)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_real* alpha,
       const bla_real* x, const bla_integer* incx,
       bla_real* ap
     );

#endif
// end bla_spr.h
// begin bla_spr2.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr2)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_double* alpha,
       const bla_double* x, const bla_integer* incx,
       const bla_double* y, const bla_integer* incy,
       bla_double* ap
     );

BLIS_EXPORT_BLAS int PASTEF77(s,spr2)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_real* alpha,
       const bla_real* x, const bla_integer* incx,
       const bla_real* y, const bla_integer* incy,
       bla_real* ap
     );

#endif
// end bla_spr2.h
// begin bla_tpmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_scomplex* ap,
       bla_scomplex* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_double* ap,
       bla_double* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_real* ap,
       bla_real* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_dcomplex* ap,
       bla_dcomplex* x, const bla_integer* incx
     );

#endif
// end bla_tpmv.h
// begin bla_tpsv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_scomplex* ap,
       bla_scomplex* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_double* ap,
       bla_double* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_real* ap,
       bla_real* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_dcomplex* ap,
       bla_dcomplex* x, const bla_integer* incx
     );

#endif
// end bla_tpsv.h

// banded
// begin bla_gbmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)
     (
       const bla_character* trans,
       const bla_integer* m, const bla_integer* n,
       const bla_integer* kl, const bla_integer* ku,
       const bla_scomplex* alpha,
       const bla_scomplex* a, const bla_integer* lda,
       const bla_scomplex* x, const bla_integer* incx,
       const bla_scomplex* beta,
       bla_scomplex* y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)
     (
       const bla_character* trans,
       const bla_integer* m, const bla_integer* n,
       const bla_integer* kl, const bla_integer* ku,
       const bla_double* alpha,
       const bla_double* a, const bla_integer* lda,
       const bla_double* x, const bla_integer* incx,
       const bla_double* beta,
       bla_double* y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)
     (
       const bla_character* trans,
       const bla_integer* m, const bla_integer* n,
       const bla_integer* kl, const bla_integer* ku,
       const bla_real* alpha,
       const bla_real* a, const bla_integer* lda,
       const bla_real* x, const bla_integer* incx,
       const bla_real* beta,
       bla_real* y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)
     (
       const bla_character* trans,
       const bla_integer* m, const bla_integer* n,
       const bla_integer* kl, const bla_integer* ku,
       const bla_dcomplex* alpha,
       const bla_dcomplex* a, const bla_integer* lda,
       const bla_dcomplex* x, const bla_integer* incx,
       const bla_dcomplex* beta,
       bla_dcomplex* y, const bla_integer* incy
     );

#endif
// end bla_gbmv.h
// begin bla_hbmv.h
// Banded routines: declared directly, one prototype per datatype prefix,
// each returning int. Every argument is passed by pointer.

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)
     (
       const bla_character* uplo,
       const bla_integer* n, const bla_integer* k,
       const bla_scomplex* alpha,
       const bla_scomplex* a, const bla_integer* lda,
       const bla_scomplex* x, const bla_integer* incx,
       const bla_scomplex* beta,
       bla_scomplex* y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)
     (
       const bla_character* uplo,
       const bla_integer* n, const bla_integer* k,
       const bla_dcomplex* alpha,
       const bla_dcomplex* a, const bla_integer* lda,
       const bla_dcomplex* x, const bla_integer* incx,
       const bla_dcomplex* beta,
       bla_dcomplex* y, const bla_integer* incy
     );

#endif
// end bla_hbmv.h
// begin bla_sbmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)
     (
       const bla_character* uplo,
       const bla_integer* n, const bla_integer* k,
       const bla_double* alpha,
       const bla_double* a, const bla_integer* lda,
       const bla_double* x, const bla_integer* incx,
       const bla_double* beta,
       bla_double* y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)
     (
       const bla_character* uplo,
       const bla_integer* n, const bla_integer* k,
       const bla_real* alpha,
       const bla_real* a, const bla_integer* lda,
       const bla_real* x, const bla_integer* incx,
       const bla_real* beta,
       bla_real* y, const bla_integer* incy
     );

#endif
// end bla_sbmv.h
// begin bla_tbmv.h


#ifdef BLIS_ENABLE_BLAS

// tbmv: the matrix `a` is const and the vector `x` is non-const.
BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_scomplex* a, const bla_integer* lda,
       bla_scomplex* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_double* a, const bla_integer* lda,
       bla_double* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_real* a, const bla_integer* lda,
       bla_real* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_dcomplex* a, const bla_integer* lda,
       bla_dcomplex* x, const bla_integer* incx
     );

#endif
// end bla_tbmv.h
// begin bla_tbsv.h


#ifdef BLIS_ENABLE_BLAS

// tbsv: same argument list as tbmv.
BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_scomplex* a, const bla_integer* lda,
       bla_scomplex* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_double* a, const bla_integer* lda,
       bla_double* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_real* a, const bla_integer* lda,
       bla_real* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_dcomplex* a, const bla_integer* lda,
       bla_dcomplex* x, const bla_integer* incx
     );

#endif
// end bla_tbsv.h


// -- Level-3 BLAS prototypes --

// begin bla_gemm.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// gemm template: declares the exported void routine PASTEF77(ch,blasname).
// Every argument is passed by pointer; `c` is the only non-const operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, const f77_char* transb, \
       const f77_int* m, const f77_int* n, const f77_int* k, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* b, const f77_int* ldb, \
       const ftype* beta, \
       ftype* c, const f77_int* ldc \
     );

// Prototypes are only emitted when the BLAS layer is enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm )
#endif
// end bla_gemm.h
// begin bla_hemm.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// hemm template: `ftype_r` and `chr` are accepted by the template but not
// used in this signature. `c` is the only non-const operand.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, const f77_char* uploa, \
       const f77_int* m, const f77_int* n, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* b, const f77_int* ldb, \
       const ftype* beta, \
       ftype* c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hemm )
#endif
// end bla_hemm.h
// begin bla_herk.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// herk template: both scalars (`alpha`, `beta`) use `ftype_r`, while the
// matrices use `ftype`.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, const f77_char* transa, \
       const f77_int* m, const f77_int* k, \
       const ftype_r* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype_r* beta, \
       ftype* c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( herk )
#endif
// end bla_herk.h
// begin bla_her2k.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// her2k template: `alpha` uses `ftype` while `beta` uses `ftype_r`.
// Every argument is passed by pointer; `c` is the only non-const operand.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, const f77_char* transa, \
       const f77_int* m, const f77_int* k, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* b, const f77_int* ldb, \
       const ftype_r* beta, \
       ftype* c, const f77_int* ldc \
     );

// Prototypes are only emitted when the BLAS layer is enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( her2k )
#endif
// end bla_her2k.h
// begin bla_symm.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// symm template: `c` is the only non-const operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, const f77_char* uploa, \
       const f77_int* m, const f77_int* n, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* b, const f77_int* ldb, \
       const ftype* beta, \
       ftype* c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( symm )
#endif
// end bla_symm.h
// begin bla_syrk.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// syrk template: one input matrix `a`; `c` is the only non-const operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, const f77_char* transa, \
       const f77_int* m, const f77_int* k, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* beta, \
       ftype* c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( syrk )
#endif
// end bla_syrk.h
// begin bla_syr2k.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// syr2k template: two input matrices (a, b); `c` is the only non-const
// operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, const f77_char* transa, \
       const f77_int* m, const f77_int* k, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* b, const f77_int* ldb, \
       const ftype* beta, \
       ftype* c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( syr2k )
#endif
// end bla_syr2k.h
// begin bla_trmm.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// trmm template: declares the exported void routine PASTEF77(ch,blasname).
// Every argument is passed by pointer; `a` is const and `b` is non-const.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, const f77_char* uploa, \
       const f77_char* transa, const f77_char* diaga, \
       const f77_int* m, const f77_int* n, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       ftype* b, const f77_int* ldb \
     );

// Prototypes are only emitted when the BLAS layer is enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmm )
#endif
// end bla_trmm.h
// begin bla_trsm.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// trsm template: same argument list as trmm.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, const f77_char* uploa, \
       const f77_char* transa, const f77_char* diaga, \
       const f77_int* m, const f77_int* n, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       ftype* b, const f77_int* ldb \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsm )
#endif
// end bla_trsm.h

// begin bla_gemm_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument-check macros.
//
// Each *_check macro expands to a brace-enclosed statement block meant to be
// pasted into a void function. The tests run in order and `info` receives the
// code of the first one that fails (0 if none fail). On failure the routine
// name is assembled from dt_str/op_str, upper-cased, handed to xerbla together
// with &info, and the ENCLOSING function returns.
// NOTE(review): the codes look like 1-based argument positions of the
// corresponding BLAS routine -- confirm against the BLAS reference.

// gemm: transa/transb in {"N","C","T"}; m, n, k >= 0; lda/ldb are checked
// against the row count implied by the matching trans flag; ldc >= max(1,m).
#define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    f77_int nrowa = nota ? *m : *k; \
    f77_int nrowb = notb ? *k : *n; \
\
    f77_int info = ( !nota && !conja && !ta     ) ?  1 : \
                   ( !notb && !conjb && !tb     ) ?  2 : \
                   ( *m < 0                     ) ?  3 : \
                   ( *n < 0                     ) ?  4 : \
                   ( *k < 0                     ) ?  5 : \
                   ( *lda < bli_max( 1, nrowa ) ) ?  8 : \
                   ( *ldb < bli_max( 1, nrowb ) ) ? 10 : \
                   ( *ldc < bli_max( 1, *m    ) ) ? 13 : 0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_gemm_check.h
// begin bla_hemm_check.h


#ifdef BLIS_ENABLE_BLAS

// hemm: sidea in {"L","R"}; uploa in {"L","U"}; m, n >= 0; lda is checked
// against m (left) or n (right); ldb and ldc >= max(1,m).
#define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \
{ \
    f77_int left  = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    f77_int nrowa = left ? *m : *n; \
\
    f77_int info = ( !left && !right            ) ?  1 : \
                   ( !lower && !upper           ) ?  2 : \
                   ( *m < 0                     ) ?  3 : \
                   ( *n < 0                     ) ?  4 : \
                   ( *lda < bli_max( 1, nrowa ) ) ?  7 : \
                   ( *ldb < bli_max( 1, *m    ) ) ?  9 : \
                   ( *ldc < bli_max( 1, *m    ) ) ? 12 : 0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_hemm_check.h
// begin bla_herk_check.h


#ifdef BLIS_ENABLE_BLAS

// herk: uplo in {"L","U"}; transa in {"N","C"}; m, k >= 0; lda is checked
// against m ("N") or k (otherwise); ldc >= max(1,m).
// The uplo flag is read through the macro's own `uploa` parameter. The body
// previously named `uploc`, which ignored the argument and only compiled when
// the expansion site happened to have a variable with that name.
#define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
\
    f77_int nrowa = nota ? *m : *k; \
\
    f77_int info = ( !lower && !upper           ) ?  1 : \
                   ( !nota && !conja            ) ?  2 : \
                   ( *m < 0                     ) ?  3 : \
                   ( *k < 0                     ) ?  4 : \
                   ( *lda < bli_max( 1, nrowa ) ) ?  7 : \
                   ( *ldc < bli_max( 1, *m    ) ) ? 10 : 0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_herk_check.h
// begin bla_her2k_check.h


#ifdef BLIS_ENABLE_BLAS

// her2k: uploa in {"L","U"}; trans in {"N","C"}; m, k >= 0; lda and ldb are
// checked against m ("N") or k (otherwise); ldc >= max(1,m).
#define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    f77_int nrowa = nota ? *m : *k; \
\
    f77_int info = ( !lower && !upper           ) ?  1 : \
                   ( !nota && !conja            ) ?  2 : \
                   ( *m < 0                     ) ?  3 : \
                   ( *k < 0                     ) ?  4 : \
                   ( *lda < bli_max( 1, nrowa ) ) ?  7 : \
                   ( *ldb < bli_max( 1, nrowa ) ) ?  9 : \
                   ( *ldc < bli_max( 1, *m    ) ) ? 12 : 0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_her2k_check.h
// begin bla_symm_check.h


#ifdef BLIS_ENABLE_BLAS

// symm reuses the hemm argument checks unchanged.
#define bla_symm_check bla_hemm_check

#endif
// end bla_symm_check.h
// begin bla_syrk_check.h


#ifdef BLIS_ENABLE_BLAS

// syrk: uplo in {"L","U"}; transa must be "N" or "T", and "C" is additionally
// accepted only when dt_str starts with 's' or 'd'; m, k >= 0; lda is checked
// against m ("N") or k (otherwise); ldc >= max(1,m).
// The uplo flag is read through the macro's own `uploa` parameter. The body
// previously named `uploc`, which ignored the argument and only compiled when
// the expansion site happened to have a variable with that name.
#define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    static char* dt_cst = dt_str; \
\
    f77_int is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int cta   = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
\
    f77_int nrowa = nota ? *m : *k; \
\
    f77_int info = ( !lower && !upper                    ) ?  1 : \
                   ( !nota && !ta && (is_r ? !cta : 1)   ) ?  2 : \
                   ( *m < 0                              ) ?  3 : \
                   ( *k < 0                              ) ?  4 : \
                   ( *lda < bli_max( 1, nrowa )          ) ?  7 : \
                   ( *ldc < bli_max( 1, *m    )          ) ? 10 : 0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_syrk_check.h
// begin bla_syr2k_check.h


#ifdef BLIS_ENABLE_BLAS

// syr2k: uploa in {"L","U"}; trans must be "N" or "T", and "C" is
// additionally accepted only when dt_str starts with 's' or 'd'; m, k >= 0;
// lda and ldb are checked against m ("N") or k (otherwise); ldc >= max(1,m).
#define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    static char* dt_cst = dt_str; \
\
    f77_int is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    f77_int nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int cta   = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    f77_int nrowa = nota ? *m : *k; \
\
    f77_int info = ( !lower && !upper                    ) ?  1 : \
                   ( !nota && !ta && (is_r ? !cta : 1)   ) ?  2 : \
                   ( *m < 0                              ) ?  3 : \
                   ( *k < 0                              ) ?  4 : \
                   ( *lda < bli_max( 1, nrowa )          ) ?  7 : \
                   ( *ldb < bli_max( 1, nrowa )          ) ?  9 : \
                   ( *ldc < bli_max( 1, *m    )          ) ? 12 : 0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_syr2k_check.h
// begin bla_trmm_check.h


#ifdef BLIS_ENABLE_BLAS

// trmm: sidea in {"L","R"}; uploa in {"L","U"}; transa in {"N","T","C"};
// diaga in {"U","N"}; m, n >= 0; lda is checked against m (left) or n
// (right); ldb >= max(1,m).
#define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \
{ \
    f77_int left  = PASTEF770(lsame)( sidea,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int right = PASTEF770(lsame)( sidea,  "R", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
    f77_int nrowa = left ? *m : *n; \
\
    f77_int info = ( !left && !right            ) ?  1 : \
                   ( !lower && !upper           ) ?  2 : \
                   ( !nota && !ta && !conja     ) ?  3 : \
                   ( !unita && !nonua           ) ?  4 : \
                   ( *m < 0                     ) ?  5 : \
                   ( *n < 0                     ) ?  6 : \
                   ( *lda < bli_max( 1, nrowa ) ) ?  9 : \
                   ( *ldb < bli_max( 1, *m    ) ) ? 11 : 0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_trmm_check.h
// begin bla_trsm_check.h


#ifdef BLIS_ENABLE_BLAS

// trsm reuses the trmm argument checks unchanged.
#define bla_trsm_check bla_trmm_check

#endif
// end bla_trsm_check.h

// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// axpby template: `y` is the only non-const operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       const ftype* beta, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif
// end bla_axpby.h

// level-3
// begin bla_gemmt.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// gemmt template: like gemm but with a leading `uploc` flag and no `n`
// dimension in the argument list.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, const f77_char* transb, \
       const f77_int* m, const f77_int* k, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* b, const f77_int* ldb, \
       const ftype* beta, \
       ftype* c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif
// end bla_gemmt.h
// begin bla_gemmt_check.h


#ifdef BLIS_ENABLE_BLAS

// gemmt: uploc in {"L","U"}; transa/transb in {"N","C","T"}; m, k >= 0;
// lda is checked against m ("N") or k; ldb against k ("N") or m;
// ldc >= max(1,m).
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
\
    f77_int nrowa = nota ? *m : *k; \
    f77_int nrowb = notb ? *k : *m; \
\
    f77_int info = ( !lower && !upper           ) ?  1 : \
                   ( !nota && !conja && !ta     ) ?  2 : \
                   ( !notb && !conjb && !tb     ) ?  3 : \
                   ( *m < 0                     ) ?  4 : \
                   ( *k < 0                     ) ?  5 : \
                   ( *lda < bli_max( 1, nrowa ) ) ?  8 : \
                   ( *ldb < bli_max( 1, nrowb ) ) ? 10 : \
                   ( *ldc < bli_max( 1, *m    ) ) ? 13 : 0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// gemm_batch template: every gemm argument becomes an array (the matrices
// are arrays of pointers), followed by `group_count` and `group_size`.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa_array, const f77_char* transb_array, \
       const f77_int* m_array, const f77_int* n_array, const f77_int* k_array, \
       const ftype* alpha_array, \
       const ftype** a_array, const f77_int* lda_array, \
       const ftype** b_array, const f77_int* ldb_array, \
       const ftype* beta_array, \
       ftype** c_array, const f77_int* ldc_array, \
       const f77_int* group_count, \
       const f77_int* group_size \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif
// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h


//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( gemm3m ) #endif // end bla_gemm3m.h // begin bla_gemm3m_check.h #ifdef BLIS_ENABLE_BLAS #define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, notb; \ f77_int conja, conjb; \ f77_int ta, tb; \ f77_int nrowa, nrowb; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ if ( notb ) { nrowb = *k; } \ else { nrowb = *n; } \ \ if ( !nota && !conja && !ta ) \ info = 1; \ else if ( !notb && !conjb && !tb ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *k < 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 8; \ else if ( *ldb < bli_max( 1, nrowb ) ) \ info = 10; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 13; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_gemm3m_check.h // -- Fortran-compatible APIs to BLIS functions -- // begin b77_thread.h // // Prototype Fortran-compatible BLIS interfaces. 
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); // end b77_thread.h #endif // BLIS_ENABLE_BLAS // end bli_blas.h // -- CBLAS compatibility layer -- // begin bli_cblas.h #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. // begin cblas.h #ifndef CBLAS_H #define CBLAS_H #include // skipped // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. 
// BLIS_ARCH_64 is defined when any of the listed 64-bit target macros is
// present; otherwise BLIS_ARCH_32 is defined.
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
    defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64)
  #define BLIS_ARCH_64
#else
  #define BLIS_ARCH_32
#endif

// Determine the target operating system.
// NOTE(review): most BLIS_OS_* macros are defined to 1 so they can be tested
// with `#if`, but BLIS_OS_EMSCRIPTEN and BLIS_OS_NONE are defined empty and
// can therefore only be tested with `#ifdef` / `defined()` -- confirm whether
// that asymmetry is intended.
#if defined(BLIS_ENABLE_SYSTEM)
  #if defined(_WIN32) || defined(__CYGWIN__)
    #define BLIS_OS_WINDOWS 1
  #elif defined(__gnu_hurd__)
    #define BLIS_OS_GNU 1
  #elif defined(__APPLE__) || defined(__MACH__)
    #define BLIS_OS_OSX 1
  #elif defined(__ANDROID__)
    #define BLIS_OS_ANDROID 1
  #elif defined(__linux__)
    #define BLIS_OS_LINUX 1
  #elif defined(__bgq__)
    #define BLIS_OS_BGQ 1
  #elif defined(__bg__)
    #define BLIS_OS_BGP 1
  #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
        defined(__bsdi__) || defined(__DragonFly__) || \
        defined(__FreeBSD_kernel__) || defined(__HAIKU__)
    #define BLIS_OS_BSD 1
  #elif defined(EMSCRIPTEN)
    #define BLIS_OS_EMSCRIPTEN
  #else
    #error "Cannot determine operating system"
  #endif
#else // #if defined(BLIS_DISABLE_SYSTEM)
  #define BLIS_OS_NONE
#endif

// A few changes that may be necessary in Windows environments.
#if BLIS_OS_WINDOWS

  // Include Windows header file.
  // (The header name was missing from this #include line; restored to
  // <windows.h>, which is what the comment above and the two *LEAN* macros
  // refer to.)
  #define WIN32_LEAN_AND_MEAN
  #define VC_EXTRALEAN
  #include <windows.h> // skipped

  #if !defined(__clang__) && !defined(__GNUC__)
    // Undefine attribute specifiers in Windows.
    #define __attribute__(x)

    // Undefine restrict.
    #define restrict
  #endif

#endif

// time.h provides clock_gettime().
#if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_GENERIC // Enabled sub-configurations (config_list) #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define 
BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 0 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. 
#else
  // Default behavior is disabled.
#endif

// Perform a sanity check to make sure the user doesn't try to enable
// both OpenMP and pthreads.
#if defined ( BLIS_ENABLE_OPENMP ) && \
    defined ( BLIS_ENABLE_PTHREADS )
  #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined."
#endif

// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP
// or pthreads are enabled. This macro is useful in situations when
// we want to detect use of either OpenMP or pthreads (as opposed
// to neither being used).
#if defined ( BLIS_ENABLE_OPENMP ) || \
    defined ( BLIS_ENABLE_PTHREADS )
  #define BLIS_ENABLE_MULTITHREADING
#endif

// Enable the use of prime numbers of threads when requesting automatic thread
// factorization. When disabled, requesting a prime number of threads will
// result in a reduction (by one) of the number of threads, provided that the
// prime number exceeds a minimum threshold (see below).
// After this conditional, BLIS_DISABLE_AUTO_PRIME_NUM_THREADS is defined if
// and only if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is not, so the two macros
// can never both be set.
#ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS
  #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#else
  // Default behavior is disabled.
  #undef  BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled.
  #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#endif

// Set the maximum requested number of threads that BLIS will accept from the
// user that may be prime. If a larger prime number of threads is requested,
// it will be reduced by one to allow for more efficient thread factorizations.
// This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined.
#ifndef BLIS_NT_MAX_PRIME
  #define BLIS_NT_MAX_PRIME 11
#endif


// -- MIXED DATATYPE SUPPORT ---------------------------------------------------

// Enable mixed datatype support?
// BLIS_ENABLE_GEMM_MD ends up defined unless BLIS_DISABLE_MIXED_DT is defined.
#ifdef BLIS_DISABLE_MIXED_DT
  #undef BLIS_ENABLE_GEMM_MD
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD
#endif

// Enable memory-intensive optimizations for mixed datatype support?
// BLIS_ENABLE_GEMM_MD_EXTRA_MEM ends up defined unless
// BLIS_DISABLE_MIXED_DT_EXTRA_MEM is defined.
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
  #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#endif


// -- MISCELLANEOUS OPTIONS ----------------------------------------------------

// Do NOT require the cross-blocksize constraints. That is, do not enforce
// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY
// needed when implementing trsm_r by allowing the right-hand matrix B to
// be triangular.
#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
  #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#endif


// -- BLAS COMPATIBILITY LAYER -------------------------------------------------

// Enable the BLAS compatibility layer?
// The DISABLE macro wins: BLIS_ENABLE_BLAS is undefined when
// BLIS_DISABLE_BLAS is set and (re)defined otherwise.
#ifdef BLIS_DISABLE_BLAS
  #undef BLIS_ENABLE_BLAS
#else
  // Default behavior is enabled.
  #undef  BLIS_ENABLE_BLAS // In case user explicitly enabled.
  #define BLIS_ENABLE_BLAS
#endif

// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#ifndef BLIS_BLAS_INT_TYPE_SIZE
  #define BLIS_BLAS_INT_TYPE_SIZE 32
#endif

// By default, the level-3 BLAS routines are implemented by directly calling
// the BLIS object API. Alternatively, they may first call the typed BLIS
// API, which will then call the object API.
//#define BLIS_BLAS3_CALLS_TAPI
// BLIS_BLAS3_CALLS_OAPI ends up defined exactly when BLIS_BLAS3_CALLS_TAPI
// is not.
#ifdef BLIS_BLAS3_CALLS_TAPI
  #undef  BLIS_BLAS3_CALLS_OAPI
#else
  // Default behavior is to call object API directly.
  #undef  BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled.
  #define BLIS_BLAS3_CALLS_OAPI
#endif


// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------

// Enable the CBLAS compatibility layer?
// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specifed by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. 
// BLIS_EXPORT resolves as follows (unless it was already defined):
//   - static build (BLIS_ENABLE_SHARED not defined): empty;
//   - Windows/Cygwin shared build: dllexport while building the library
//     (BLIS_IS_BUILDING_LIBRARY defined), dllimport otherwise;
//   - GCC-compatible compilers reporting __GNUC__ >= 4: default visibility;
//   - anything else: empty.
#ifndef BLIS_EXPORT
  #if !defined(BLIS_ENABLE_SHARED)
    #define BLIS_EXPORT
  #else
    #if defined(_WIN32) || defined(__CYGWIN__)
      #ifdef BLIS_IS_BUILDING_LIBRARY
        #define BLIS_EXPORT __declspec(dllexport)
      #else
        #define BLIS_EXPORT __declspec(dllimport)
      #endif
    #elif defined(__GNUC__) && __GNUC__ >= 4
      #define BLIS_EXPORT __attribute__ ((visibility ("default")))
    #else
      #define BLIS_EXPORT
    #endif
  #endif
#endif

// All three export annotations currently expand to the same BLIS_EXPORT.
#define BLIS_EXPORT_BLIS  BLIS_EXPORT
#define BLIS_EXPORT_BLAS  BLIS_EXPORT
#define BLIS_EXPORT_ADDON BLIS_EXPORT


// -- OVERRIDABLE (WEAK) SYMBOLS -----------------------------------------------

// On Linux, functions called from a shared library can be overridden by the main
// program simply by providing a new definition. However, macOS uses a "two-level
// namespace" which causes calls to shared library functions to be tied to the
// library and not overridable. As a workaround, certain symbols can be defined
// as "weak" and are given lower preference during linking.
#ifndef BLIS_OVERRIDABLE
#if BLIS_OS_OSX
#define BLIS_OVERRIDABLE __attribute__((weak))
#else
#define BLIS_OVERRIDABLE
#endif
#endif


// -- STATIC INLINE FUNCTIONS --------------------------------------------------

// C and C++ have different semantics for defining "inline" functions. In C,
// the keyword phrase "static inline" accomplishes this, though the "inline"
// is optional. In C++, the "inline" keyword is required and obviates "static"
// altogether. Why does this matter? While BLIS is compiled in C99, blis.h may
// be #included by a source file that is compiled with C++.
#ifdef __cplusplus
  #define BLIS_INLINE inline
#else
  //#define BLIS_INLINE static inline
  #define BLIS_INLINE static
#endif


#endif

// end bli_config_macro_defs.h
// begin bli_type_defs.h

#ifndef BLIS_TYPE_DEFS_H
#define BLIS_TYPE_DEFS_H


//
// -- BLIS basic types ---------------------------------------------------------
//

#ifdef __cplusplus
  // For C++, include stdint.h.
#include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. // NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. #ifndef TRUE #define TRUE true #endif #ifndef FALSE #define FALSE false #endif // -- Special-purpose integers -- // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. 
#ifndef _DEFINED_DIM_T #define _DEFINED_DIM_T typedef gint_t dim_t; // dimension type #endif typedef gint_t inc_t; // increment/stride type typedef gint_t doff_t; // diagonal offset type typedef guint_t siz_t; // byte size type typedef uint32_t objbits_t; // object information bit field // -- Real types -- // Define the number of floating-point types supported, and the size of the // largest type. #define BLIS_NUM_FP_TYPES 4 #define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) // There are some places where we need to use sizeof() inside of a C // preprocessor #if conditional, and so here we define the various sizes // for those purposes. #define BLIS_SIZEOF_S 4 // sizeof(float) #define BLIS_SIZEOF_D 8 // sizeof(double) #define BLIS_SIZEOF_C 8 // sizeof(scomplex) #define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) // -- Complex types -- #ifdef BLIS_ENABLE_C99_COMPLEX #if __STDC_VERSION__ >= 199901L #include // skipped // Typedef official complex types to BLIS complex type names. typedef float complex scomplex; typedef double complex dcomplex; #else #error "Configuration requested C99 complex types, but C99 does not appear to be supported." #endif #else // ifndef BLIS_ENABLE_C99_COMPLEX // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_SCOMPLEX #define _DEFINED_SCOMPLEX typedef struct scomplex { float real; float imag; } scomplex; #endif // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DCOMPLEX #define _DEFINED_DCOMPLEX typedef struct dcomplex { double real; double imag; } dcomplex; #endif #endif // BLIS_ENABLE_C99_COMPLEX // -- Atom type -- // Note: atom types are used to hold "bufferless" scalar object values. Note // that it needs to be as large as the largest possible scalar value we might // want to hold. Thus, for now, it is a dcomplex. 
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any BLIS_BLAS_INT_TYPE_SIZE other than 32 or 64 falls back to long int.
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

// The remaining Fortran-77 aliases map one-to-one onto the C/BLIS types.
typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
// NOTE(review): the active definition is a data pointer (void*), with the
// true function-pointer form left commented out -- confirm this is intended.

//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );


//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT macro is the position of a field's lowest bit. Some shifts
// coincide on purpose: a multi-bit field (e.g. DATATYPE, masked with 0x7
// below) starts at the same bit as the first of the narrower fields it
// contains (DOMAIN at bit 0, PRECISION at bit 1).

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT               0
#define   BLIS_SCALAR_DOMAIN_SHIFT         0
#define   BLIS_SCALAR_PREC_SHIFT           1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is the field's width pattern (0x1, 0x3, 0x7, 0xF, 0x7F) shifted
// to the field's offset defined above.
// NOTE(review): the constants are plain (signed) int literals. For
// BLIS_COMP_DT_BITS, 0x7 << 29 is not representable in a 32-bit int, which
// the C standard leaves undefined for signed left shifts -- confirm whether
// an unsigned literal (0x7u) is wanted there.

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )


//
// -- BLIS enumerated type value definitions -----------------------------------
//

// The BLIS_BITVAL_* macros are the concrete field values, expressed in terms
// of the masks above; the enum typedefs further down take their enumerator
// values from them.

#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT                                         | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )


//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

typedef enum
{
	BLIS_NO_TRANSPOSE      = 0x0,
	BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
	BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
	BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

typedef enum
{
	BLIS_NO_CONJUGATE      = 0x0,
	BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

typedef enum
{
	BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
	BLIS_LOWER             = BLIS_BITVAL_LOWER,
	BLIS_UPPER             = BLIS_BITVAL_UPPER,
	BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

// Unlike the bit-field enums above, side_t uses plain sequential values.
typedef enum
{
	BLIS_LEFT              = 0x0,
	BLIS_RIGHT
} side_t;

typedef enum
{
	BLIS_NONUNIT_DIAG      = 0x0,
	BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

typedef enum
{
	BLIS_NO_INVERT_DIAG    = 0x0,
	BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

typedef enum
{
	BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
	BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
	BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
	BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;


// -- Data type --

// BLIS_DT_LO and BLIS_DT_HI alias the smallest and largest of the four
// floating-point datatype values (BLIS_FLOAT and BLIS_DCOMPLEX).
typedef enum
{
	BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
	BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
	BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
	BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
	BLIS_INT               = BLIS_BITVAL_INT_TYPE,
	BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
	BLIS_DT_LO             = BLIS_FLOAT,
	BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

typedef enum
{
	BLIS_REAL              = BLIS_BITVAL_REAL,
	BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

typedef enum
{
	BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
	BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;


// -- Pack schema type --

// Note that BLIS_PACKED_UNSPEC, BLIS_PACKED_VECTOR and BLIS_PACKED_ROWS all
// share the same numeric value (the corresponding BITVAL macros are
// identical), so they cannot be told apart at run time.
typedef enum
{
	BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
	BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
	BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
	BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
	BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
	BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
	BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
	BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
	BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3


// -- Pack order type --

typedef enum
{
	BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
	BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,

	BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
	BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;


// -- Pack buffer type --

typedef enum
{
	BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
	BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
	BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
	BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;


// -- Partitioning direction --

typedef enum
{
	BLIS_FWD,
	BLIS_BWD
} dir_t;


// -- Subpartition type --

typedef enum
{
	BLIS_SUBPART0,
	BLIS_SUBPART1,
	BLIS_SUBPART2,
	BLIS_SUBPART1AND0,
	BLIS_SUBPART1AND2,
	BLIS_SUBPART1A,
	BLIS_SUBPART1B,
	BLIS_SUBPART00,
	BLIS_SUBPART10,
	BLIS_SUBPART20,
	BLIS_SUBPART01,
	BLIS_SUBPART11,
	BLIS_SUBPART21,
	BLIS_SUBPART02,
	BLIS_SUBPART12,
	BLIS_SUBPART22
} subpart_t;


// -- Matrix dimension type --

typedef enum
{
	BLIS_M = 0,
	BLIS_N = 1
} mdim_t;


// -- Machine parameter types --

// The three macros after this enum must be kept in step with it:
// BLIS_NUM_MACH_PARAMS is the enumerator count, and the FIRST/LAST aliases
// name the first and last enumerators.
typedef enum
{
	BLIS_MACH_EPS = 0,
	BLIS_MACH_SFMIN,
	BLIS_MACH_BASE,
	BLIS_MACH_PREC,
	BLIS_MACH_NDIGMANT,
	BLIS_MACH_RND,
	BLIS_MACH_EMIN,
	BLIS_MACH_RMIN,
	BLIS_MACH_EMAX,
	BLIS_MACH_RMAX,
	BLIS_MACH_EPS2
} machval_t;

#define BLIS_NUM_MACH_PARAMS   11
#define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2


// -- Induced method types --

// BLIS_IND_FIRST and BLIS_IND_LAST alias the first (BLIS_1M) and last
// (BLIS_NAT) method values rather than introducing new ones.
typedef enum
{
	BLIS_1M        = 0,
	BLIS_NAT,
	BLIS_IND_FIRST = 0,
	BLIS_IND_LAST  = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_1m  BLIS_1M
#define bli_nat BLIS_NAT


// -- Kernel ID types --

// Level-1v kernel IDs, values 0 through 13. BLIS_NUM_LEVEL1V_KERS must equal
// the number of enumerators.
typedef enum
{
    BLIS_ADDV_KER  = 0,
    BLIS_AMAXV_KER,
    BLIS_AXPBYV_KER,
    BLIS_AXPYV_KER,
    BLIS_COPYV_KER,
    BLIS_DOTV_KER,
    BLIS_DOTXV_KER,
    BLIS_INVERTV_KER,
    BLIS_SCALV_KER,
    BLIS_SCAL2V_KER,
    BLIS_SETV_KER,
    BLIS_SUBV_KER,
    BLIS_SWAPV_KER,
    BLIS_XPBYV_KER
} l1vkr_t;

#define BLIS_NUM_LEVEL1V_KERS 14


// Level-1f kernel IDs, values 0 through 4.
typedef enum
{
    BLIS_AXPY2V_KER = 0,
    BLIS_DOTAXPYV_KER,
    BLIS_AXPYF_KER,
    BLIS_DOTXF_KER,
    BLIS_DOTXAXPYF_KER
} l1fkr_t;

#define BLIS_NUM_LEVEL1F_KERS 5


// Level-1m (packm/unpackm) kernel IDs. The PACKM and UNPACKM enumerators
// both run from 0 through 31, so BLIS_PACKM_nXK_KER and BLIS_UNPACKM_nXK_KER
// have the same numeric value for a given n.
typedef enum
{
    BLIS_PACKM_0XK_KER  = 0,
    BLIS_PACKM_1XK_KER  = 1,
    BLIS_PACKM_2XK_KER  = 2,
    BLIS_PACKM_3XK_KER  = 3,
    BLIS_PACKM_4XK_KER  = 4,
    BLIS_PACKM_5XK_KER  = 5,
    BLIS_PACKM_6XK_KER  = 6,
    BLIS_PACKM_7XK_KER  = 7,
    BLIS_PACKM_8XK_KER  = 8,
    BLIS_PACKM_9XK_KER  = 9,
    BLIS_PACKM_10XK_KER = 10,
    BLIS_PACKM_11XK_KER = 11,
    BLIS_PACKM_12XK_KER = 12,
    BLIS_PACKM_13XK_KER = 13,
    BLIS_PACKM_14XK_KER = 14,
    BLIS_PACKM_15XK_KER = 15,
    BLIS_PACKM_16XK_KER = 16,
    BLIS_PACKM_17XK_KER = 17,
    BLIS_PACKM_18XK_KER = 18,
    BLIS_PACKM_19XK_KER = 19,
    BLIS_PACKM_20XK_KER = 20,
    BLIS_PACKM_21XK_KER = 21,
    BLIS_PACKM_22XK_KER = 22,
    BLIS_PACKM_23XK_KER = 23,
    BLIS_PACKM_24XK_KER = 24,
    BLIS_PACKM_25XK_KER = 25,
    BLIS_PACKM_26XK_KER = 26,
    BLIS_PACKM_27XK_KER = 27,
    BLIS_PACKM_28XK_KER = 28,
    BLIS_PACKM_29XK_KER = 29,
    BLIS_PACKM_30XK_KER = 30,
    BLIS_PACKM_31XK_KER = 31,

    BLIS_UNPACKM_0XK_KER  = 0,
    BLIS_UNPACKM_1XK_KER  = 1,
    BLIS_UNPACKM_2XK_KER  = 2,
    BLIS_UNPACKM_3XK_KER  = 3,
    BLIS_UNPACKM_4XK_KER  = 4,
    BLIS_UNPACKM_5XK_KER  = 5,
    BLIS_UNPACKM_6XK_KER  = 6,
    BLIS_UNPACKM_7XK_KER  = 7,
    BLIS_UNPACKM_8XK_KER  = 8,
    BLIS_UNPACKM_9XK_KER  = 9,
    BLIS_UNPACKM_10XK_KER = 10,
    BLIS_UNPACKM_11XK_KER = 11,
    BLIS_UNPACKM_12XK_KER = 12,
    BLIS_UNPACKM_13XK_KER = 13,
    BLIS_UNPACKM_14XK_KER = 14,
    BLIS_UNPACKM_15XK_KER = 15,
    BLIS_UNPACKM_16XK_KER = 16,
    BLIS_UNPACKM_17XK_KER = 17,
    BLIS_UNPACKM_18XK_KER = 18,
    BLIS_UNPACKM_19XK_KER = 19,
    BLIS_UNPACKM_20XK_KER = 20,
    BLIS_UNPACKM_21XK_KER = 21,
    BLIS_UNPACKM_22XK_KER = 22,
    BLIS_UNPACKM_23XK_KER = 23,
    BLIS_UNPACKM_24XK_KER = 24,
    BLIS_UNPACKM_25XK_KER = 25,
    BLIS_UNPACKM_26XK_KER = 26,
    BLIS_UNPACKM_27XK_KER = 27,
    BLIS_UNPACKM_28XK_KER = 28,
    BLIS_UNPACKM_29XK_KER = 29,
    BLIS_UNPACKM_30XK_KER = 30,
    BLIS_UNPACKM_31XK_KER = 31
} l1mkr_t;

#define BLIS_NUM_PACKM_KERS   32
#define BLIS_NUM_UNPACKM_KERS 32


// Level-3 microkernel IDs, values 0 through 4.
typedef enum
{
    BLIS_GEMM_UKR = 0,
    BLIS_GEMMTRSM_L_UKR,
    BLIS_GEMMTRSM_U_UKR,
    BLIS_TRSM_L_UKR,
    BLIS_TRSM_U_UKR
} l3ukr_t;

#define BLIS_NUM_LEVEL3_UKRS 5


// Microkernel implementation kinds, values 0 through 3.
typedef enum
{
    BLIS_REFERENCE_UKERNEL = 0,
    BLIS_VIRTUAL_UKERNEL,
    BLIS_OPTIMIZED_UKERNEL,
    BLIS_NOTAPPLIC_UKERNEL
} kimpl_t;

#define BLIS_NUM_UKR_IMPL_TYPES 4


// The l3sup_t definition below is compiled out.
#if 0
typedef enum
{
    // RV = row-stored, contiguous vector-loading
    // RG = row-stored, non-contiguous gather-loading
    // CV = column-stored, contiguous vector-loading
    // CG = column-stored, non-contiguous gather-loading

    // RD = row-stored, dot-based
    // CD = col-stored, dot-based

    // RC = row-stored, column-times-column
    // CR = column-stored, row-times-row

    // GX = general-stored generic implementation

    BLIS_GEMMSUP_RV_UKR = 0,
    BLIS_GEMMSUP_RG_UKR,
    BLIS_GEMMSUP_CV_UKR,
    BLIS_GEMMSUP_CG_UKR,

    BLIS_GEMMSUP_RD_UKR,
    BLIS_GEMMSUP_CD_UKR,

    BLIS_GEMMSUP_RC_UKR,
    BLIS_GEMMSUP_CR_UKR,

    BLIS_GEMMSUP_GX_UKR,
} l3sup_t;

#define BLIS_NUM_LEVEL3_SUP_UKRS 9
#endif


// Only the nine row/column combinations (plus BLIS_XXX) are active; the
// combinations involving 'G' are compiled out by the inner #if 0.
typedef enum
{
    // 3-operand storage combinations
    BLIS_RRR = 0,
    BLIS_RRC, // 1
    BLIS_RCR, // 2
    BLIS_RCC, // 3
    BLIS_CRR, // 4
    BLIS_CRC, // 5
    BLIS_CCR, // 6
    BLIS_CCC, // 7
    BLIS_XXX, // 8

#if 0
    BLIS_RRG,
    BLIS_RCG,
    BLIS_RGR,
    BLIS_RGC,
    BLIS_RGG,
    BLIS_CRG,
    BLIS_CCG,
    BLIS_CGR,
    BLIS_CGC,
    BLIS_CGG,
    BLIS_GRR,
    BLIS_GRC,
    BLIS_GRG,
    BLIS_GCR,
    BLIS_GCC,
    BLIS_GCG,
    BLIS_GGR,
    BLIS_GGC,
    BLIS_GGG,
#endif
} stor3_t;

#define BLIS_NUM_3OP_RC_COMBOS 9
//#define BLIS_NUM_3OP_RCG_COMBOS 27


// The thridx_t definition below is compiled out; only BLIS_NUM_LOOPS remains
// active.
#if 0
typedef enum
{
    BLIS_JC_IDX = 0,
    BLIS_PC_IDX,
    BLIS_IC_IDX,
    BLIS_JR_IDX,
    BLIS_IR_IDX,
    BLIS_PR_IDX
} thridx_t;
#endif

#define BLIS_NUM_LOOPS 6


// -- Operation ID type --

typedef enum
{
//
// NOTE: If/when additional type values are added to this enum,
// you must either:
// - keep the level-3 values (starting with _GEMM) beginning at
//   index 0; or
// - if the value range is moved such that it does not begin at
//   index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START
//   value that can be subtracted from the opid_t value to map it
//   to a zero-based range.
// This is needed because these level-3 opid_t values are used in
// bli_l3_ind.c to index into arrays.
//
    BLIS_GEMM = 0,
    BLIS_GEMMT,
    BLIS_HEMM,
    BLIS_HERK,
    BLIS_HER2K,
    BLIS_SYMM,
    BLIS_SYRK,
    BLIS_SYR2K,
    BLIS_TRMM3,
    BLIS_TRMM,
    BLIS_TRSM,

    // Evaluates to 11, i.e. the same value as BLIS_NUM_LEVEL3_OPS.
    BLIS_NOID
} opid_t;

#define BLIS_NUM_LEVEL3_OPS 11


// -- Blocksize ID type --

typedef enum
{
    // NOTE: the level-3 blocksizes MUST be indexed starting at zero.
    // At one point, we made this assumption in bli_cntx_set_blkszs()
    // and friends.

    BLIS_KR = 0,
    BLIS_MR,
    BLIS_NR,
    BLIS_MC,
    BLIS_KC,
    BLIS_NC,

    BLIS_M2, // level-2 blocksize in m dimension
    BLIS_N2, // level-2 blocksize in n dimension

    BLIS_AF, // level-1f axpyf fusing factor
    BLIS_DF, // level-1f dotxf fusing factor
    BLIS_XF, // level-1f dotxaxpyf fusing factor

    // Evaluates to 11, i.e. the same value as BLIS_NUM_BLKSZS.
    BLIS_NO_PART  // used as a placeholder when blocksizes are not applicable.
} bszid_t;

#define BLIS_NUM_BLKSZS 11


// -- Threshold ID type --

typedef enum
{
    BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension
    BLIS_NT,     // level-3 small/unpacked matrix threshold in n dimension
    BLIS_KT      // level-3 small/unpacked matrix threshold in k dimension

} threshid_t;

#define BLIS_NUM_THRESH 3


// -- Architecture ID type --

// NOTE: This typedef enum must be kept up-to-date with the arch_t
// string array in bli_arch.c. Whenever values are added/inserted
// OR if values are rearranged, be sure to update the string array
// in bli_arch.c.

typedef enum
{
    // NOTE: The C language standard guarantees that the first enum value
    // starts at 0.

    // Intel
    BLIS_ARCH_SKX,
    BLIS_ARCH_KNL,
    BLIS_ARCH_KNC,
    BLIS_ARCH_HASWELL,
    BLIS_ARCH_SANDYBRIDGE,
    BLIS_ARCH_PENRYN,

    // AMD
    BLIS_ARCH_ZEN3,
    BLIS_ARCH_ZEN2,
    BLIS_ARCH_ZEN,
    BLIS_ARCH_EXCAVATOR,
    BLIS_ARCH_STEAMROLLER,
    BLIS_ARCH_PILEDRIVER,
    BLIS_ARCH_BULLDOZER,

    // ARM
    BLIS_ARCH_ARMSVE,
    BLIS_ARCH_A64FX,
    BLIS_ARCH_FIRESTORM,
    BLIS_ARCH_THUNDERX2,
    BLIS_ARCH_CORTEXA57,
    BLIS_ARCH_CORTEXA53,
    BLIS_ARCH_CORTEXA15,
    BLIS_ARCH_CORTEXA9,

    // IBM/Power
    BLIS_ARCH_POWER10,
    BLIS_ARCH_POWER9,
    BLIS_ARCH_POWER7,
    BLIS_ARCH_BGQ,

    // Generic architecture/configuration
    BLIS_ARCH_GENERIC,

    // The total number of defined architectures. This must be last in the
    // list of enums since its definition assumes that the previous enum
    // value (BLIS_ARCH_GENERIC) is given index num_archs-1.
    BLIS_NUM_ARCHS

} arch_t;


//
// -- BLIS misc. structure types -----------------------------------------------
//

// This header must be included here (or earlier) because definitions it
// provides are needed in the pool_t and related structs.
// begin bli_pthread.h

#ifndef BLIS_PTHREAD_H
#define BLIS_PTHREAD_H

// -- Type and macro definitions -----------------------------------------------

// Exactly one of three branches is selected: BLIS_DISABLE_SYSTEM (dummy
// types), _MSC_VER (Windows API types), or the default (POSIX pthreads types).
#if defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of "dummy" code that doesn't depend on POSIX threads or any other
// threading mechanism. See issue #454 to see the use case that prompted this
// feature.
// NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE!
// -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t*           cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t*  cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

// NOTE: unlike the other bli_pthread_*() prototypes in this header, this one
// is declared to return void rather than int.
BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void              (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//     __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t*           barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int                     count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h

// -- Pool block type --

// A buffer pointer paired with a block size.
typedef struct
{
    void*     buf;
    siz_t     block_size;

} pblk_t;


// -- Pool type --

// Bookkeeping for a pool of blocks. The malloc_fp/free_fp members hold the
// allocation and deallocation function pointers stored with the pool.
typedef struct
{
    void*     block_ptrs;
    dim_t     block_ptrs_len;

    dim_t     top_index;
    dim_t     num_blocks;

    siz_t     block_size;
    siz_t     align_size;
    siz_t     offset_size;

    malloc_ft malloc_fp;
    free_ft   free_fp;

} pool_t;


// -- Array type --

// A buffer pointer with an element count and a per-element size.
typedef struct
{
    void*     buf;

    siz_t     num_elem;
    siz_t     elem_size;

} array_t;


// -- Locked pool-of-arrays-of-pools type --

// A pool_t stored together with a bli_pthread_mutex_t and a default array
// length.
typedef struct
{
    bli_pthread_mutex_t mutex;
    pool_t              pool;

    siz_t               def_array_len;

} apool_t;


// -- packing block allocator:
// Locked set of pools type --

// Three pool_t objects stored together with one mutex, plus the alignment
// and malloc/free function pointers used for general-purpose allocation.
typedef struct pba_s
{
    pool_t              pools[3];
    bli_pthread_mutex_t mutex;

    // These fields are used for general-purpose allocation.
    siz_t               align_size;
    malloc_ft           malloc_fp;
    free_ft             free_fp;

} pba_t;


// -- Memory object type --

// A pool block (pblk) together with its buffer type, a pointer to a pool_t,
// and a size.
typedef struct mem_s
{
    pblk_t    pblk;
    packbuf_t buf_type;
    pool_t*   pool;
    siz_t     size;
} mem_t;


// -- Control tree node type --

// Nodes link to other nodes through sub_prenode and sub_node. The type is
// declared as a named struct first and typedef'd afterwards because it
// refers to itself.
struct cntl_s
{
    // Basic fields (usually required).
    opid_t         family;
    bszid_t        bszid;
    void_fp        var_func;
    struct cntl_s* sub_prenode;
    struct cntl_s* sub_node;

    // Optional fields (needed only by some operations such as packm).
    // NOTE: first field of params must be a uint64_t containing the size
    // of the struct.
    void*          params;

    // Internal fields that track "cached" data.
    mem_t          pack_mem;
};
typedef struct cntl_s cntl_t;


// -- Blocksize object type --

// Both arrays have BLIS_NUM_FP_TYPES entries.
typedef struct blksz_s
{
    // Primary blocksize values.
    dim_t  v[BLIS_NUM_FP_TYPES];

    // Blocksize extensions.
    dim_t  e[BLIS_NUM_FP_TYPES];

} blksz_t;


// -- Function pointer object type --

typedef struct func_s
{
    // Kernel function address.
    void_fp ptr[BLIS_NUM_FP_TYPES];

} func_t;


// -- Multi-boolean object type --

// One bool per entry, BLIS_NUM_FP_TYPES entries.
typedef struct mbool_s
{
    bool v[BLIS_NUM_FP_TYPES];

} mbool_t;


// -- Auxiliary kernel info type --

// Note: This struct is used by macro-kernels to package together extra
// parameter values that may be of use to the micro-kernel without
// cluttering up the micro-kernel interface itself.

typedef struct
{
    // The pack schemas of A and B.
    pack_t schema_a;
    pack_t schema_b;

    // Pointers to the micro-panels of A and B which will be used by the
    // next call to the micro-kernel.
    void*  a_next;
    void*  b_next;

    // The imaginary strides of A and B.
    inc_t  is_a;
    inc_t  is_b;

    // The panel strides of A and B.
    // NOTE: These are only used in situations where iteration over the
    // micropanels takes place in part within the kernel code (e.g. sup
    // millikernels).
    inc_t  ps_a;
    inc_t  ps_b;

    // The type to convert to on output.
    //num_t  dt_on_output;

    // (Virtual) microkernel address and additional parameters.
    void_fp ukr;
    void*   params;

} auxinfo_t;


// -- Global scalar constant data struct --

// Note: This struct is used only when statically initializing the
// global scalar constants in bli_const.c.
// It has one member per representation: float, double, scomplex, dcomplex,
// and gint_t.
typedef struct constdata_s
{
    float    s;
    double   d;
    scomplex c;
    dcomplex z;
    gint_t   i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the pack function stored in obj_t.pack_fn.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of the kernel function stored in obj_t.ker_fn.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// NOTE: the static initializer macros and the inline copy helpers for obj_t
// name its members one by one, so they must be revisited whenever a member
// is added, removed or renamed here.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t         off[2];
    dim_t         dim[2];
    doff_t        diag_off;

    objbits_t     info;
    objbits_t     info2;
    siz_t         elem_size;

    void*         buffer;
    inc_t         rs;
    inc_t         cs;
    inc_t         is;

    // Bufferless scalar storage
    atom_t        scalar;

    // Pack-related fields
    dim_t         m_padded; // m dimension of matrix, including any padding
    dim_t         n_padded; // n dimension of matrix, including any padding
    inc_t         ps;       // panel stride (distance to next panel)
    inc_t         pd;       // panel dimension (the "width" of a panel:
                            // usually MR or NR)
    dim_t         m_panel;  // m dimension of a "full" panel
    dim_t         n_panel;  // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void*         pack_params;
    obj_ker_fn_t  ker_fn;
    void*         ker_params;

} obj_t;

// Pre-initializors.
// Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Brace-enclosed designated initializer for an obj_t: NULL pointers, zero
// offsets and dimensions, info = DENSE | GENERAL, elem_size = sizeof( float ),
// is = 1 and a { 0.0, 0.0 } scalar.
// NOTE: comments cannot be placed inside the macro bodies because every line
// ends in a backslash continuation.
#define BLIS_OBJECT_INITIALIZER \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 0, 0 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1,  \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL  \
}

// Same initializer as BLIS_OBJECT_INITIALIZER except that .dim is { 1, 1 }
// instead of { 0, 0 }.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 1, 1 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1,  \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL  \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
// Make b a member-for-member duplicate of a. The copy is shallow: pointer
// members (root, buffer, pack_params, ker_params) and the pack_fn/ker_fn
// function pointers in b refer to the same targets as those in a.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    // Every member of obj_t is copied, so a whole-struct assignment is
    // equivalent to assigning the members one at a time.
    *b = *a;
}

// Prepare b as a subpartition of a: every member of a is copied into b
// except dim[0] and dim[1], which are left untouched in b.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    const obj_t* src = a;
    obj_t*       dst = b;

    // Pointer-valued members.
    dst->root        = src->root;
    dst->buffer      = src->buffer;

    // User-customizable hooks and their parameters.
    dst->pack_fn     = src->pack_fn;
    dst->pack_params = src->pack_params;
    dst->ker_fn      = src->ker_fn;
    dst->ker_params  = src->ker_params;

    // Offsets and diagonal offset. dim[] is skipped on purpose because the
    // m and n dimensions will be overwritten.
    dst->off[0]      = src->off[0];
    dst->off[1]      = src->off[1];
    dst->diag_off    = src->diag_off;

    // Info bitfields and element size.
    dst->info        = src->info;
    dst->info2       = src->info2;
    dst->elem_size   = src->elem_size;

    // Strides.
    dst->rs          = src->rs;
    dst->cs          = src->cs;
    dst->is          = src->is;

    // Bufferless scalar storage.
    dst->scalar      = src->scalar;

    // Pack-related members.
    dst->m_padded    = src->m_padded;
    dst->n_padded    = src->n_padded;
    dst->m_panel     = src->m_panel;
    dst->n_panel     = src->n_panel;
    dst->ps          = src->ps;
    dst->pd          = src->pd;
}

// Initializors for global scalar constants.
// NOTE: These must remain cpp macros since they are initializor
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/include/darwin-x86_64/000077500000000000000000000000001427272030600215015ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/darwin-x86_64/blis.h000066400000000000000000047110661427272030600226220ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). 
// begin bli_config.h
// Build-time configuration for this copy of BLIS. Every setting below is a
// plain object-like macro; the literal `#if 1` / `#if 0` / `#if 64 == 64`
// conditions look like values substituted when the header was generated
// (presumably by ./configure -- confirm against the build scripts).
#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H

// Enabled configuration "family" (config_name)
#define BLIS_FAMILY_X86_64

// Enabled sub-configurations (config_list)
#define BLIS_CONFIG_SKX
#define BLIS_CONFIG_KNL
#define BLIS_CONFIG_HASWELL
#define BLIS_CONFIG_SANDYBRIDGE
#define BLIS_CONFIG_PENRYN
#define BLIS_CONFIG_ZEN3
#define BLIS_CONFIG_ZEN2
#define BLIS_CONFIG_ZEN
#define BLIS_CONFIG_EXCAVATOR
#define BLIS_CONFIG_STEAMROLLER
#define BLIS_CONFIG_PILEDRIVER
#define BLIS_CONFIG_BULLDOZER
#define BLIS_CONFIG_GENERIC

// Enabled kernel sets (kernel_list)
// NOTE: this list is shorter than the sub-configuration list above; no
// BLIS_KERNELS_* macro is defined here for EXCAVATOR or STEAMROLLER.
#define BLIS_KERNELS_SKX
#define BLIS_KERNELS_KNL
#define BLIS_KERNELS_SANDYBRIDGE
#define BLIS_KERNELS_PENRYN
#define BLIS_KERNELS_ZEN3
#define BLIS_KERNELS_ZEN2
#define BLIS_KERNELS_HASWELL
#define BLIS_KERNELS_ZEN
#define BLIS_KERNELS_PILEDRIVER
#define BLIS_KERNELS_BULLDOZER
#define BLIS_KERNELS_GENERIC

// System (OS) support: enabled in this build. bli_system.h keys its
// operating-system detection on BLIS_ENABLE_SYSTEM.
#if 1
#define BLIS_ENABLE_SYSTEM
#else
#define BLIS_DISABLE_SYSTEM
#endif

// Threading back ends: neither OpenMP nor pthreads is enabled in this build.
#if 0
#define BLIS_ENABLE_OPENMP
#endif

#if 0
#define BLIS_ENABLE_PTHREADS
#endif

// JR/IR loop partitioning: the "slab" variant is selected, "rr" is not.
#if 1
#define BLIS_ENABLE_JRIR_SLAB
#endif

#if 0
#define BLIS_ENABLE_JRIR_RR
#endif

// Memory pools for the PBA and SBA allocators: both enabled.
#if 1
#define BLIS_ENABLE_PBA_POOLS
#else
#define BLIS_DISABLE_PBA_POOLS
#endif

#if 1
#define BLIS_ENABLE_SBA_POOLS
#else
#define BLIS_DISABLE_SBA_POOLS
#endif

// Memory tracing: disabled.
#if 0
#define BLIS_ENABLE_MEM_TRACING
#else
#define BLIS_DISABLE_MEM_TRACING
#endif

// Internal BLIS integer size. The left operand (64) is the configured value,
// so the first branch is taken and BLIS_INT_TYPE_SIZE becomes 64. If neither
// comparison matched, the macro would be left undefined here and
// bli_config_macro_defs.h would pick a default.
#if 64 == 64
#define BLIS_INT_TYPE_SIZE 64
#elif 64 == 32
#define BLIS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// BLAS-compatibility integer size. The configured value is 32, so the #elif
// branch is taken and BLIS_BLAS_INT_TYPE_SIZE becomes 32.
#if 32 == 64
#define BLIS_BLAS_INT_TYPE_SIZE 64
#elif 32 == 32
#define BLIS_BLAS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// The next four settings follow one pattern: the configured default applies
// only if the including translation unit has defined neither the ENABLE nor
// the DISABLE form beforehand (e.g. via -D on the compiler command line).

// BLAS compatibility layer: configured default is DISABLED.
#ifndef BLIS_ENABLE_BLAS
#ifndef BLIS_DISABLE_BLAS
#if 0
#define BLIS_ENABLE_BLAS
#else
#define BLIS_DISABLE_BLAS
#endif
#endif
#endif

// CBLAS compatibility layer: configured default is DISABLED.
#ifndef BLIS_ENABLE_CBLAS
#ifndef BLIS_DISABLE_CBLAS
#if 0
#define BLIS_ENABLE_CBLAS
#else
#define BLIS_DISABLE_CBLAS
#endif
#endif
#endif

// Mixed-datatype support: configured default is ENABLED.
#ifndef BLIS_ENABLE_MIXED_DT
#ifndef BLIS_DISABLE_MIXED_DT
#if 1
#define BLIS_ENABLE_MIXED_DT
#else
#define BLIS_DISABLE_MIXED_DT
#endif
#endif
#endif

// Extra-memory optimizations for mixed-datatype support: configured default
// is ENABLED.
#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#if 1
#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#else
#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#endif
#endif
#endif

// Remaining on/off switches. Exactly one of the ENABLE/DISABLE pair is
// defined for each feature.

// SUP handling: enabled.
#if 1
#define BLIS_ENABLE_SUP_HANDLING
#else
#define BLIS_DISABLE_SUP_HANDLING
#endif

// memkind: disabled.
#if 0
#define BLIS_ENABLE_MEMKIND
#else
#define BLIS_DISABLE_MEMKIND
#endif

// trsm pre-inversion: enabled.
#if 1
#define BLIS_ENABLE_TRSM_PREINVERSION
#else
#define BLIS_DISABLE_TRSM_PREINVERSION
#endif

// "#pragma omp simd": enabled.
#if 1
#define BLIS_ENABLE_PRAGMA_OMP_SIMD
#else
#define BLIS_DISABLE_PRAGMA_OMP_SIMD
#endif

// Sandbox: disabled.
#if 0
#define BLIS_ENABLE_SANDBOX
#else
#define BLIS_DISABLE_SANDBOX
#endif

// Shared-library build: disabled.
#if 0
#define BLIS_ENABLE_SHARED
#else
#define BLIS_DISABLE_SHARED
#endif

// Intel-style complex return convention: disabled.
#if 0
#define BLIS_ENABLE_COMPLEX_RETURN_INTEL
#else
#define BLIS_DISABLE_COMPLEX_RETURN_INTEL
#endif

#endif
// end bli_config.h

// -- System and language-related headers --

// NOTE: bli_system.h header must be included before bli_config_macro_defs.h.

// begin bli_system.h
#ifndef BLIS_SYSTEM_H
#define BLIS_SYSTEM_H

// NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that
// various parts of POSIX are defined and made available.
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

// NOTE(review): the operand of each #include below is missing in this copy;
// only the "// skipped" marker remains. A bare #include does not compile, so
// the header names must be restored from the original bli_system.h -- confirm
// the exact list there rather than guessing.
#include // skipped
#include // skipped
#include // skipped
#include // skipped
#include // skipped
#include // skipped
#include // skipped
#include // skipped

// Determine the compiler (hopefully) and define conveniently named macros
// accordingly. ICC is tested first, then clang, then GCC; at most one of
// BLIS_ICC / BLIS_CLANG / BLIS_GCC is defined.
#if defined(__ICC) || defined(__INTEL_COMPILER)
#define BLIS_ICC
#elif defined(__clang__)
#define BLIS_CLANG
#elif defined(__GNUC__)
#define BLIS_GCC
#endif

// Determine if we are on a 64-bit or 32-bit architecture. Anything that does
// not advertise one of the 64-bit macros below is treated as 32-bit.
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
    defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64)
#define BLIS_ARCH_64
#else
#define BLIS_ARCH_32
#endif

// Determine the target operating system.
// Operating-system detection. When BLIS_ENABLE_SYSTEM is defined, exactly one
// BLIS_OS_* macro is defined from the compiler's predefined macros, and an
// unrecognized platform is a hard error. Otherwise BLIS_OS_NONE is defined.
// Most BLIS_OS_* macros are defined as 1 so they can be tested with plain
// `#if`; BLIS_OS_EMSCRIPTEN and BLIS_OS_NONE are defined without a value and
// can only be tested with `defined()`.
#if defined(BLIS_ENABLE_SYSTEM)
#if defined(_WIN32) || defined(__CYGWIN__)
#define BLIS_OS_WINDOWS 1
#elif defined(__gnu_hurd__)
#define BLIS_OS_GNU 1
#elif defined(__APPLE__) || defined(__MACH__)
#define BLIS_OS_OSX 1
#elif defined(__ANDROID__)
// Android is tested before __linux__, so it is reported as Android rather
// than as generic Linux.
#define BLIS_OS_ANDROID 1
#elif defined(__linux__)
#define BLIS_OS_LINUX 1
#elif defined(__bgq__)
#define BLIS_OS_BGQ 1
#elif defined(__bg__)
#define BLIS_OS_BGP 1
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
      defined(__bsdi__) || defined(__DragonFly__) || \
      defined(__FreeBSD_kernel__) || defined(__HAIKU__)
#define BLIS_OS_BSD 1
#elif defined(EMSCRIPTEN)
#define BLIS_OS_EMSCRIPTEN
#else
#error "Cannot determine operating system"
#endif
#else // #if defined(BLIS_DISABLE_SYSTEM)
#define BLIS_OS_NONE
#endif

// A few changes that may be necessary in Windows environments.
// (BLIS_OS_WINDOWS evaluates to 0 in `#if` when it is not defined.)
#if BLIS_OS_WINDOWS

// Include Windows header file.
#define WIN32_LEAN_AND_MEAN
#define VC_EXTRALEAN
// NOTE(review): the #include operand is missing in this copy; only the
// "// skipped" marker remains. Restore it from the original bli_system.h.
#include // skipped

#if !defined(__clang__) && !defined(__GNUC__)
// Neutralize attribute specifiers: __attribute__(x) expands to nothing for
// compilers that are neither clang nor GNU-compatible.
#define __attribute__(x)

// Neutralize restrict: it expands to nothing for those same compilers.
#define restrict

#endif
#endif

// time.h provides clock_gettime().
// One timing header is selected per platform (Windows / OS X / everything
// else).
// NOTE(review): all #include operands in this group are missing in this
// copy, including the one on the commented-out line; restore them from the
// original bli_system.h.
#if BLIS_OS_WINDOWS
#include // skipped
#elif BLIS_OS_OSX
#include // skipped
#else
//#include
#include // skipped
#endif

#endif
// end bli_system.h

// begin bli_lang_defs.h
#ifndef BLIS_LANG_DEFS_H
#define BLIS_LANG_DEFS_H

// -- Undefine restrict for C++ and C89/90 --
// Three cases: C++ (no restrict keyword), C99 or later (keyword exists, leave
// it alone), and pre-C99 C (no keyword). If __STDC_VERSION__ is not defined
// it evaluates to 0 in the #elif, which selects the pre-C99 branch.
#ifdef __cplusplus
// Language is C++; define restrict as nothing.
#ifndef restrict
#define restrict
#endif
#elif __STDC_VERSION__ >= 199901L
// Language is C99 (or later); do nothing since restrict is recognized.
#else
// Language is pre-C99; define restrict as nothing.
// (Continues the pre-C99 branch of the restrict conditional opened on the
// preceding line: define restrict as nothing unless it is already a macro.)
#ifndef restrict
#define restrict
#endif
#endif

// -- Define typeof() operator if using non-GNU compiler --
// Non-GNU compilers get typeof mapped to __typeof__ unconditionally. Compilers
// that define __GNUC__ get the same mapping unless typeof is already a macro.
#ifndef __GNUC__
#define typeof __typeof__
#else
#ifndef typeof
#define typeof __typeof__
#endif
#endif

// -- BLIS Thread Local Storage Keyword --
// __thread for TLS is supported by GCC, CLANG, ICC, and IBMC.
// There is a small risk here, as __GNUC__ can also be defined by some other
// compiler (other than ICC and CLANG, which we know define it) that
// doesn't support __thread; __GNUC__ is not quite unique to GCC.
// But the possibility of someone using such a non-mainstream compiler
// for building BLIS is low.
// On any other compiler BLIS_THREAD_LOCAL expands to nothing, so variables
// declared with it are ordinary (non-thread-local) variables.
#if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__)
#define BLIS_THREAD_LOCAL __thread
#else
#define BLIS_THREAD_LOCAL
#endif

// -- BLIS constructor/destructor function attribute --
// __attribute__((constructor/destructor)) is a GCC extension. The chain
// below leaves both macros empty for ICC, and enables the attribute for
// clang and for other compilers that define __GNUC__.
// There is a small risk here, as __GNUC__ can also be defined by some other
// compiler (other than ICC and CLANG, which we know define it) that
// doesn't support this; __GNUC__ is not quite unique to GCC.
// But the possibility of someone using such a non-mainstream compiler
// for building BLIS is low.
// ICC must be tested first because it also defines __GNUC__.
#if defined(__ICC) || defined(__INTEL_COMPILER)
// ICC defines __GNUC__ but doesn't support this
#define BLIS_ATTRIB_CTOR
#define BLIS_ATTRIB_DTOR
#elif defined(__clang__)
// CLANG supports __attribute__, but its documentation doesn't
// mention support for constructor/destructor. Compiling with
// clang and testing shows that it does support them.
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. 
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. 
// Furthermore, the CBLAS compatibility layer will use the integer type size
// definition specified above when defining the size of its own integers
// (regardless of whether the BLAS layer was enabled directly or indirectly).
#ifndef BLIS_ENABLE_CBLAS
  // Default behavior is disabled.
#else
  // No additional definitions needed.
#endif

// -- SHARED LIBRARY SYMBOL EXPORT ---------------------------------------------

// When building shared libraries, we can control which symbols are exported for
// linking by external applications. BLIS annotates all function prototypes that
// are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing
// a similar role for BLAS compatibility routines). Which symbols are exported
// is controlled by the default symbol visibility, as specified by the gcc option
// -fvisibility=[default|hidden]. The default for this option is 'default', or,
// "public", which, if allowed to stand, causes all symbols in BLIS to be
// linkable from the outside. But when compiling with -fvisibility=hidden, all
// symbols start out hidden (that is, restricted only for internal use by BLIS),
// with that setting overridden only for function prototypes or variable
// declarations that are annotated with BLIS_EXPORT_BLIS.
//
// Selection order when BLIS_EXPORT was not preset:
//   1. static build (BLIS_ENABLE_SHARED undefined)  -> empty
//   2. Windows/Cygwin while building the library     -> dllexport
//   3. Windows/Cygwin while consuming the library    -> dllimport
//   4. GNU-compatible compiler, major version >= 4   -> default visibility
//   5. anything else                                 -> empty
#ifndef BLIS_EXPORT
  #if !defined(BLIS_ENABLE_SHARED)
    #define BLIS_EXPORT
  #elif ( defined(_WIN32) || defined(__CYGWIN__) ) && defined(BLIS_IS_BUILDING_LIBRARY)
    #define BLIS_EXPORT __declspec(dllexport)
  #elif defined(_WIN32) || defined(__CYGWIN__)
    #define BLIS_EXPORT __declspec(dllimport)
  #elif defined(__GNUC__) && __GNUC__ >= 4
    #define BLIS_EXPORT __attribute__ ((visibility ("default")))
  #else
    #define BLIS_EXPORT
  #endif
#endif

// All three annotation macros currently share the same expansion.
#define BLIS_EXPORT_BLIS  BLIS_EXPORT
#define BLIS_EXPORT_BLAS  BLIS_EXPORT
#define BLIS_EXPORT_ADDON BLIS_EXPORT

// -- OVERRIDABLE (WEAK) SYMBOLS -----------------------------------------------

// On Linux, functions called from a shared library can be overridden by the main
// program simply by providing a new definition.
However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // -- Common BLIS definitions -- // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// The test must name BLIS_BLAS_INT_TYPE_SIZE, the macro that actually carries
// the BLAS integer width. It previously named BLIS_BLAS_INT_SIZE, which is
// never defined, so the preprocessor treated it as 0 and the guard could
// never fire.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
  #undef  BLIS_INT_TYPE_SIZE
  #define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested.
// gint_t is the signed general-purpose integer and guint_t its unsigned
// counterpart; any size other than 32 or 64 falls back to "long int".
#if BLIS_INT_TYPE_SIZE == 32
typedef           int32_t  gint_t;
typedef          uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
typedef           int64_t  gint_t;
typedef          uint64_t guint_t;
#else
typedef   signed long int  gint_t;
typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h.
#ifndef TRUE
  #define TRUE  true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
// BLIS_MAX_TYPE_SIZE expands to a sizeof expression, so it can be used in C
// code but not inside a preprocessor #if.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
// Byte sizes of the four floating-point datatypes as plain integer literals,
// so they can be tested in preprocessor conditionals.
#define BLIS_SIZEOF_S 4  // sizeof(float)
#define BLIS_SIZEOF_D 8  // sizeof(double)
#define BLIS_SIZEOF_C 8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z 16 // sizeof(dcomplex)

// -- Complex types --

#ifdef BLIS_ENABLE_C99_COMPLEX

  #if __STDC_VERSION__ >= 199901L
    // The header name was missing from this #include, which is not valid C.
    // The "complex" keyword used in the typedefs below is provided by
    // complex.h, so that header is restored here.
    #include <complex.h> // skipped

    // Typedef official complex types to BLIS complex type names.
    typedef  float complex scomplex;
    typedef double complex dcomplex;
  #else
    #error "Configuration requested C99 complex types, but C99 does not appear to be supported."
  #endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_SCOMPLEX
  #define _DEFINED_SCOMPLEX
  // Single-precision complex number stored as a (real, imag) pair.
  typedef struct scomplex
  {
	float  real;
	float  imag;
  } scomplex;
  #endif

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_DCOMPLEX
  #define _DEFINED_DCOMPLEX
  // Double-precision complex number stored as a (real, imag) pair.
  typedef struct dcomplex
  {
	double real;
	double imag;
  } dcomplex;
  #endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any size other than 32 or 64 falls back to "long int".
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc.
// function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
// NOTE: the active definition is a plain void*; the true function-pointer
// form is kept only as the commented-out line below.
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );

//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT gives the position of the lowest bit of a field. Fields that
// share a shift value overlap on purpose: a *_DT field spans three bits whose
// lowest bit is the domain bit and whose next bit is the precision bit (see
// the masks defined from these shifts).

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT               0
#define   BLIS_SCALAR_DOMAIN_SHIFT         0
#define   BLIS_SCALAR_PREC_SHIFT           1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is a field-width pattern (0x1, 0x3, 0x7, ...) shifted to the
// field's offset.

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
// The 7-bit schema mask spans the RC, PANEL, FORMAT (4 bits) and PACK bits.
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
// NOTE(review): 0x7 is a signed int literal and a shift of 29 reaches bit 31;
// with a 32-bit int that result is not representable in int -- confirm this
// is acceptable on all supported compilers.
#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )

//
// -- BLIS enumerated type value definitions -----------------------------------
//

// The BLIS_BITVAL_* macros are the raw bit patterns that the enums further
// down use as enumerator values.

#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
// Upper and lower each also set the diagonal bit; dense sets all three uplo
// bits.
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
// NOTE: PACKED_UNSPEC and PACKED_ROWS expand to the same value.
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT                                         | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )

//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

// Enumerators take their values from the BLIS_BITVAL_* patterns above.

typedef enum
{
	BLIS_NO_TRANSPOSE      = 0x0,
	BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
	BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
	BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

typedef enum
{
	BLIS_NO_CONJUGATE      = 0x0,
	BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

typedef enum
{
	BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
	BLIS_LOWER             = BLIS_BITVAL_LOWER,
	BLIS_UPPER             = BLIS_BITVAL_UPPER,
	BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

// side_t is a plain 0/1 enumeration, not a bit pattern.
typedef enum
{
	BLIS_LEFT              = 0x0,
	BLIS_RIGHT
} side_t;

typedef enum
{
	BLIS_NONUNIT_DIAG      = 0x0,
	BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

typedef enum
{
	BLIS_NO_INVERT_DIAG    = 0x0,
	BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

typedef enum
{
	BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
	BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
	BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
	BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;

// -- Data type --

// BLIS_DT_LO and BLIS_DT_HI alias the first and last of the four
// floating-point datatypes; BLIS_INT and BLIS_CONSTANT lie outside that range.
typedef enum
{
	BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
	BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
	BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
	BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
	BLIS_INT               = BLIS_BITVAL_INT_TYPE,
	BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
	BLIS_DT_LO             = BLIS_FLOAT,
	BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

typedef enum
{
	BLIS_REAL              = BLIS_BITVAL_REAL,
	BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

typedef enum
{
	BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
	BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;

// -- Pack schema type --

// NOTE: BLIS_PACKED_UNSPEC, BLIS_PACKED_VECTOR and BLIS_PACKED_ROWS all have
// the same value, so they cannot be told apart at run time.
typedef enum
{
	BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
	BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
	BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
	BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
	BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
	BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
	BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
	BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
	BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3

// -- Pack order type --

// NOTE: both FWD enumerators have the value 0x0.
typedef enum
{
	BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
	BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,

	BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
	BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;

// -- Pack buffer type --

typedef enum
{
	BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
	BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
	BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
	BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;

// -- Partitioning direction --

typedef enum
{
	BLIS_FWD,
	BLIS_BWD
} dir_t;

// -- Subpartition type --

typedef enum
{
	BLIS_SUBPART0,
	BLIS_SUBPART1,
	BLIS_SUBPART2,
	BLIS_SUBPART1AND0,
	BLIS_SUBPART1AND2,
	BLIS_SUBPART1A,
	BLIS_SUBPART1B,
	BLIS_SUBPART00,
	BLIS_SUBPART10,
	BLIS_SUBPART20,
	BLIS_SUBPART01,
	BLIS_SUBPART11,
	BLIS_SUBPART21,
	BLIS_SUBPART02,
	BLIS_SUBPART12,
	BLIS_SUBPART22
} subpart_t;

// -- Matrix dimension type --

typedef enum
{
	BLIS_M = 0,
	BLIS_N = 1
} mdim_t;

// -- Machine parameter types --

// BLIS_NUM_MACH_PARAMS must equal the number of enumerators below (11).
typedef enum
{
	BLIS_MACH_EPS = 0,
	BLIS_MACH_SFMIN,
	BLIS_MACH_BASE,
	BLIS_MACH_PREC,
	BLIS_MACH_NDIGMANT,
	BLIS_MACH_RND,
	BLIS_MACH_EMIN,
	BLIS_MACH_RMIN,
	BLIS_MACH_EMAX,
	BLIS_MACH_RMAX,
	BLIS_MACH_EPS2
} machval_t;

#define BLIS_NUM_MACH_PARAMS   11
#define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2

// -- Induced method types --

// BLIS_IND_FIRST aliases BLIS_1M (0) and BLIS_IND_LAST aliases BLIS_NAT (1).
typedef enum
{
	BLIS_1M        = 0,
	BLIS_NAT,
	BLIS_IND_FIRST = 0,
	BLIS_IND_LAST  = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_1m   BLIS_1M
#define bli_nat  BLIS_NAT

// -- Kernel ID types --

// Each BLIS_NUM_* count below must equal the number of enumerators in the
// enum it follows.

typedef enum
{
	BLIS_ADDV_KER  = 0,
	BLIS_AMAXV_KER,
	BLIS_AXPBYV_KER,
	BLIS_AXPYV_KER,
	BLIS_COPYV_KER,
	BLIS_DOTV_KER,
	BLIS_DOTXV_KER,
	BLIS_INVERTV_KER,
	BLIS_SCALV_KER,
	BLIS_SCAL2V_KER,
	BLIS_SETV_KER,
	BLIS_SUBV_KER,
	BLIS_SWAPV_KER,
	BLIS_XPBYV_KER
} l1vkr_t;

#define BLIS_NUM_LEVEL1V_KERS 14

typedef enum
{
	BLIS_AXPY2V_KER = 0,
	BLIS_DOTAXPYV_KER,
	BLIS_AXPYF_KER,
	BLIS_DOTXF_KER,
	BLIS_DOTXAXPYF_KER
} l1fkr_t;

#define BLIS_NUM_LEVEL1F_KERS 5

// NOTE: the PACKM and UNPACKM enumerators share the value range 0..31, so an
// l1mkr_t value alone does not say which of the two families it names.
typedef enum
{
	BLIS_PACKM_0XK_KER  = 0,
	BLIS_PACKM_1XK_KER  = 1,
	BLIS_PACKM_2XK_KER  = 2,
	BLIS_PACKM_3XK_KER  = 3,
	BLIS_PACKM_4XK_KER  = 4,
	BLIS_PACKM_5XK_KER  = 5,
	BLIS_PACKM_6XK_KER  = 6,
	BLIS_PACKM_7XK_KER  = 7,
	BLIS_PACKM_8XK_KER  = 8,
	BLIS_PACKM_9XK_KER  = 9,
	BLIS_PACKM_10XK_KER = 10,
	BLIS_PACKM_11XK_KER = 11,
	BLIS_PACKM_12XK_KER = 12,
	BLIS_PACKM_13XK_KER = 13,
	BLIS_PACKM_14XK_KER = 14,
	BLIS_PACKM_15XK_KER = 15,
	BLIS_PACKM_16XK_KER = 16,
	BLIS_PACKM_17XK_KER = 17,
	BLIS_PACKM_18XK_KER = 18,
	BLIS_PACKM_19XK_KER = 19,
	BLIS_PACKM_20XK_KER = 20,
	BLIS_PACKM_21XK_KER = 21,
	BLIS_PACKM_22XK_KER = 22,
	BLIS_PACKM_23XK_KER = 23,
	BLIS_PACKM_24XK_KER = 24,
	BLIS_PACKM_25XK_KER = 25,
	BLIS_PACKM_26XK_KER = 26,
	BLIS_PACKM_27XK_KER = 27,
	BLIS_PACKM_28XK_KER = 28,
	BLIS_PACKM_29XK_KER = 29,
	BLIS_PACKM_30XK_KER = 30,
	BLIS_PACKM_31XK_KER = 31,

	BLIS_UNPACKM_0XK_KER  = 0,
	BLIS_UNPACKM_1XK_KER  = 1,
	BLIS_UNPACKM_2XK_KER  = 2,
	BLIS_UNPACKM_3XK_KER  = 3,
	BLIS_UNPACKM_4XK_KER  = 4,
	BLIS_UNPACKM_5XK_KER  = 5,
	BLIS_UNPACKM_6XK_KER  = 6,
	BLIS_UNPACKM_7XK_KER  = 7,
	BLIS_UNPACKM_8XK_KER  = 8,
	BLIS_UNPACKM_9XK_KER  = 9,
	BLIS_UNPACKM_10XK_KER = 10,
	BLIS_UNPACKM_11XK_KER = 11,
	BLIS_UNPACKM_12XK_KER = 12,
	BLIS_UNPACKM_13XK_KER = 13,
	BLIS_UNPACKM_14XK_KER = 14,
	BLIS_UNPACKM_15XK_KER = 15,
	BLIS_UNPACKM_16XK_KER = 16,
	BLIS_UNPACKM_17XK_KER = 17,
	BLIS_UNPACKM_18XK_KER = 18,
	BLIS_UNPACKM_19XK_KER = 19,
	BLIS_UNPACKM_20XK_KER = 20,
	BLIS_UNPACKM_21XK_KER = 21,
	BLIS_UNPACKM_22XK_KER = 22,
	BLIS_UNPACKM_23XK_KER = 23,
	BLIS_UNPACKM_24XK_KER = 24,
	BLIS_UNPACKM_25XK_KER = 25,
	BLIS_UNPACKM_26XK_KER = 26,
	BLIS_UNPACKM_27XK_KER = 27,
	BLIS_UNPACKM_28XK_KER = 28,
	BLIS_UNPACKM_29XK_KER = 29,
	BLIS_UNPACKM_30XK_KER = 30,
	BLIS_UNPACKM_31XK_KER = 31
} l1mkr_t;

#define BLIS_NUM_PACKM_KERS   32
#define BLIS_NUM_UNPACKM_KERS 32

typedef enum
{
	BLIS_GEMM_UKR = 0,
	BLIS_GEMMTRSM_L_UKR,
	BLIS_GEMMTRSM_U_UKR,
	BLIS_TRSM_L_UKR,
	BLIS_TRSM_U_UKR
} l3ukr_t;

#define BLIS_NUM_LEVEL3_UKRS 5

typedef enum
{
	BLIS_REFERENCE_UKERNEL = 0,
	BLIS_VIRTUAL_UKERNEL,
	BLIS_OPTIMIZED_UKERNEL,
	BLIS_NOTAPPLIC_UKERNEL
} kimpl_t;

#define BLIS_NUM_UKR_IMPL_TYPES 4

// The l3sup_t enum below is compiled out.
#if 0
typedef enum
{
	// RV = row-stored, contiguous vector-loading
	// RG = row-stored, non-contiguous gather-loading
	// CV = column-stored, contiguous vector-loading
	// CG = column-stored, non-contiguous gather-loading

	// RD = row-stored, dot-based
	// CD = col-stored, dot-based

	// RC = row-stored, column-times-column
	// CR = column-stored, row-times-row

	// GX = general-stored generic implementation

	BLIS_GEMMSUP_RV_UKR = 0,
	BLIS_GEMMSUP_RG_UKR,
	BLIS_GEMMSUP_CV_UKR,
	BLIS_GEMMSUP_CG_UKR,

	BLIS_GEMMSUP_RD_UKR,
	BLIS_GEMMSUP_CD_UKR,

	BLIS_GEMMSUP_RC_UKR,
	BLIS_GEMMSUP_CR_UKR,

	BLIS_GEMMSUP_GX_UKR,
} l3sup_t;

#define BLIS_NUM_LEVEL3_SUP_UKRS 9
#endif

// Only the nine row/column combinations are active; the combinations that
// involve general storage ("G") are compiled out.
typedef enum
{
	// 3-operand storage combinations
	BLIS_RRR = 0,
	BLIS_RRC, // 1
	BLIS_RCR, // 2
	BLIS_RCC, // 3
	BLIS_CRR, // 4
	BLIS_CRC, // 5
	BLIS_CCR, // 6
	BLIS_CCC, // 7
	BLIS_XXX, // 8

#if 0
	BLIS_RRG,
	BLIS_RCG,
	BLIS_RGR,
	BLIS_RGC,
	BLIS_RGG,
	BLIS_CRG,
	BLIS_CCG,
	BLIS_CGR,
	BLIS_CGC,
	BLIS_CGG,
	BLIS_GRR,
	BLIS_GRC,
	BLIS_GRG,
	BLIS_GCR,
	BLIS_GCC,
	BLIS_GCG,
	BLIS_GGR,
	BLIS_GGC,
	BLIS_GGG,
#endif
} stor3_t;

#define BLIS_NUM_3OP_RC_COMBOS 9
//#define BLIS_NUM_3OP_RCG_COMBOS 27

// The thridx_t enum is compiled out; BLIS_NUM_LOOPS matches its six entries.
#if 0
typedef enum
{
	BLIS_JC_IDX = 0,
	BLIS_PC_IDX,
	BLIS_IC_IDX,
	BLIS_JR_IDX,
	BLIS_IR_IDX,
	BLIS_PR_IDX
} thridx_t;
#endif

#define BLIS_NUM_LOOPS 6

// -- Operation ID type --

typedef enum
{
//
// NOTE: If/when additional type values are added to this enum,
// you must either:
// - keep the level-3 values (starting with _GEMM) beginning at
//   index 0; or
// - if the value range is moved such that it does not begin at
//   index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START
//   value that can be subtracted from the opid_t value to map it
//   to a zero-based range.
// This is needed because these level-3 opid_t values are used in
// bli_l3_ind.c to index into arrays.
//
	BLIS_GEMM = 0,
	BLIS_GEMMT,
	BLIS_HEMM,
	BLIS_HERK,
	BLIS_HER2K,
	BLIS_SYMM,
	BLIS_SYRK,
	BLIS_SYR2K,
	BLIS_TRMM3,
	BLIS_TRMM,
	BLIS_TRSM,

	BLIS_NOID
} opid_t;

// Counts BLIS_GEMM through BLIS_TRSM; BLIS_NOID is not included.
#define BLIS_NUM_LEVEL3_OPS 11

// -- Blocksize ID type --

typedef enum
{
	// NOTE: the level-3 blocksizes MUST be indexed starting at zero.
	// At one point, we made this assumption in bli_cntx_set_blkszs()
	// and friends.

	BLIS_KR = 0,
	BLIS_MR,
	BLIS_NR,
	BLIS_MC,
	BLIS_KC,
	BLIS_NC,

	BLIS_M2, // level-2 blocksize in m dimension
	BLIS_N2, // level-2 blocksize in n dimension

	BLIS_AF, // level-1f axpyf fusing factor
	BLIS_DF, // level-1f dotxf fusing factor
	BLIS_XF, // level-1f dotxaxpyf fusing factor

	BLIS_NO_PART  // used as a placeholder when blocksizes are not applicable.
} bszid_t;

// Counts BLIS_KR through BLIS_XF; BLIS_NO_PART is not included.
#define BLIS_NUM_BLKSZS 11

// -- Threshold ID type --

typedef enum
{
	BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension
	BLIS_NT,     // level-3 small/unpacked matrix threshold in n dimension
	BLIS_KT      // level-3 small/unpacked matrix threshold in k dimension

} threshid_t;

#define BLIS_NUM_THRESH 3

// -- Architecture ID type --

// NOTE: This typedef enum must be kept up-to-date with the arch_t
// string array in bli_arch.c. Whenever values are added/inserted
// OR if values are rearranged, be sure to update the string array
// in bli_arch.c.

typedef enum
{
	// NOTE: The C language standard guarantees that the first enum value
	// starts at 0.

	// Intel
	BLIS_ARCH_SKX,
	BLIS_ARCH_KNL,
	BLIS_ARCH_KNC,
	BLIS_ARCH_HASWELL,
	BLIS_ARCH_SANDYBRIDGE,
	BLIS_ARCH_PENRYN,

	// AMD
	BLIS_ARCH_ZEN3,
	BLIS_ARCH_ZEN2,
	BLIS_ARCH_ZEN,
	BLIS_ARCH_EXCAVATOR,
	BLIS_ARCH_STEAMROLLER,
	BLIS_ARCH_PILEDRIVER,
	BLIS_ARCH_BULLDOZER,

	// ARM
	BLIS_ARCH_ARMSVE,
	BLIS_ARCH_A64FX,
	BLIS_ARCH_FIRESTORM,
	BLIS_ARCH_THUNDERX2,
	BLIS_ARCH_CORTEXA57,
	BLIS_ARCH_CORTEXA53,
	BLIS_ARCH_CORTEXA15,
	BLIS_ARCH_CORTEXA9,

	// IBM/Power
	BLIS_ARCH_POWER10,
	BLIS_ARCH_POWER9,
	BLIS_ARCH_POWER7,
	BLIS_ARCH_BGQ,

	// Generic architecture/configuration
	BLIS_ARCH_GENERIC,

	// The total number of defined architectures. This must be last in the
	// list of enums since its definition assumes that the previous enum
	// value (BLIS_ARCH_GENERIC) is given index num_archs-1.
	BLIS_NUM_ARCHS

} arch_t;

//
// -- BLIS misc. structure types -----------------------------------------------
//

// This header must be included here (or earlier) because definitions it
// provides are needed in the pool_t and related structs.
// begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. 
mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. //num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. 
// One value of each supported datatype (float, double, scomplex, dcomplex,
// and integer), stored side by side.
typedef struct constdata_s
{
    float    s;
    double   d;
    scomplex c;
    dcomplex z;
    gint_t   i;

} constdata_t;

//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the packing function stored in obj_t.pack_fn.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of the kernel function stored in obj_t.ker_fn.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// NOTE: The two object initializer macros and the two inline copy functions
// that follow this struct name every member explicitly; keep them in sync
// when members are added or removed here.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t         off[2];
    dim_t         dim[2];
    doff_t        diag_off;

    objbits_t     info;
    objbits_t     info2;
    siz_t         elem_size;

    void*         buffer;
    inc_t         rs;
    inc_t         cs;
    inc_t         is;

    // Bufferless scalar storage
    atom_t        scalar;

    // Pack-related fields
    dim_t         m_padded; // m dimension of matrix, including any padding
    dim_t         n_padded; // n dimension of matrix, including any padding
    inc_t         ps;       // panel stride (distance to next panel)
    inc_t         pd;       // panel dimension (the "width" of a panel:
                            // usually MR or NR)
    dim_t         m_panel;  // m dimension of a "full" panel
    dim_t         n_panel;  // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void*         pack_params;
    obj_ker_fn_t  ker_fn;
    void*         ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Designated-initializer list for an obj_t with 0 x 0 dimensions.
#define BLIS_OBJECT_INITIALIZER \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 0, 0 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

// Same as BLIS_OBJECT_INITIALIZER except that the dimensions are 1 x 1.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 1, 1 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

// Define these inline functions here since they must be updated if the
// contents of obj_t change.
// Copy every member of a into b EXCEPT the dimensions (dim[0], dim[1]),
// which are left untouched because the caller overwrites them when it
// defines the subpartition. Only the member values are copied; nothing that
// a pointer member refers to is duplicated.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    dim_t idx;

    // Identity and location.
    b->root = a->root;
    for ( idx = 0; idx < 2; ++idx )
    {
        b->off[ idx ] = a->off[ idx ];
    }
    b->diag_off = a->diag_off;

    // Property bits and element size.
    b->info      = a->info;
    b->info2     = a->info2;
    b->elem_size = a->elem_size;

    // Buffer address and strides.
    b->buffer = a->buffer;
    b->rs     = a->rs;
    b->cs     = a->cs;
    b->is     = a->is;

    // Bufferless scalar storage.
    b->scalar = a->scalar;

    // Pack-related members.
    b->m_padded = a->m_padded;
    b->n_padded = a->n_padded;
    b->ps       = a->ps;
    b->pd       = a->pd;
    b->m_panel  = a->m_panel;
    b->n_panel  = a->n_panel;

    // User-customizable members.
    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;
}

// Make b a full shallow copy of a: every member handled by
// bli_obj_init_subpart_from() plus the two dimensions.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    dim_t idx;

    bli_obj_init_subpart_from( a, b );

    for ( idx = 0; idx < 2; ++idx )
    {
        b->dim[ idx ] = a->dim[ idx ];
    }
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// Each public macro below forwards to an underscore-suffixed helper that
// performs the actual ## concatenation. The forwarding step gives the
// preprocessor a chance to expand arguments that are themselves macros
// before they are pasted.

// PASTEMACn(ch1,...,chn,op) expands to bli_<ch1>...<chn><op>.
#define PASTEMAC0_(op)                             bli_ ## op
#define PASTEMAC0(op)                              PASTEMAC0_(op)

#define PASTEMAC_(ch,op)                           bli_ ## ch  ## op
#define PASTEMAC(ch,op)                            PASTEMAC_(ch,op)

#define PASTEMAC2_(ch1,ch2,op)                     bli_ ## ch1 ## ch2 ## op
#define PASTEMAC2(ch1,ch2,op)                      PASTEMAC2_(ch1,ch2,op)

#define PASTEMAC3_(ch1,ch2,ch3,op)                 bli_ ## ch1 ## ch2 ## ch3 ## op
#define PASTEMAC3(ch1,ch2,ch3,op)                  PASTEMAC3_(ch1,ch2,ch3,op)

#define PASTEMAC4_(ch1,ch2,ch3,ch4,op)             bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
#define PASTEMAC4(ch1,ch2,ch3,ch4,op)              PASTEMAC4_(ch1,ch2,ch3,ch4,op)

#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)         bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op)          PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)

#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)     bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op)      PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)

// PASTEBLACHK(op) expands to bla_<op>_check.
#define PASTEBLACHK_(op)                           bla_ ## op ## _check
#define PASTEBLACHK(op)                            PASTEBLACHK_(op)

// PASTECHn(ch1,...,chn,op) expands to <ch1>...<chn><op>, with no prefix.
#define PASTECH0_(op)                              op
#define PASTECH0(op)                               PASTECH0_(op)

#define PASTECH_(ch,op)                            ch ## op
#define PASTECH(ch,op)                             PASTECH_(ch,op)

#define PASTECH2_(ch1,ch2,op)                      ch1 ## ch2 ## op
#define PASTECH2(ch1,ch2,op)                       PASTECH2_(ch1,ch2,op)

#define PASTECH3_(ch1,ch2,ch3,op)                  ch1 ## ch2 ## ch3 ## op
#define PASTECH3(ch1,ch2,ch3,op)                   PASTECH3_(ch1,ch2,ch3,op)

// MKSTR stringifies its argument exactly as written; STRINGIFY_INT goes
// through MKSTR so that a macro argument is expanded before stringification.
#define MKSTR(s1)                                  #s1
#define STRINGIFY_INT( s )                         MKSTR( s )

// Fortran-77 name-mangling macros.
// Each PASTEF77* macro concatenates its arguments and appends a trailing
// underscore.
#define PASTEF770(name)                                      name ## _
#define PASTEF77(ch1,name)                     ch1        ## name ## _
#define PASTEF772(ch1,ch2,name)                ch1 ## ch2 ## name ## _
#define PASTEF773(ch1,ch2,ch3,name)     ch1 ## ch2 ## ch3 ## name ## _

// -- Include other groups of macros

// begin bli_genarray_macro_defs.h

#ifndef BLIS_GENARRAY_MACRO_DEFS_H
#define BLIS_GENARRAY_MACRO_DEFS_H

// -- Macros to generate function arrays ---------------------------------------

// NOTE: Every table below lists its entries in the order s, c, d, z
// (presumably matching the ordering of the datatype enumeration defined
// elsewhere; confirm before reordering anything here).

// -- "Smart" one-operand macro --

// Defines a static array named <opname>_fpa whose entries are the
// bli_<ch><opname> symbols, each cast to tname.
#define GENARRAY_FPA(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname)  \
}

// -- "Smart" one-operand macro (with integer support) --

// Same as GENARRAY_FPA with one extra trailing entry for the i variant.
#define GENARRAY_FPA_I(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname), \
    ( tname )PASTEMAC(i,opname)  \
}

// -- "Smart" two-operand macro --

// Defines a static 2D array named <op>_fpa2; the first type character
// selects the row and the second selects the column.
#define GENARRAY_FPA2(tname,op) \
\
static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \
    { ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \
    { ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \
    { ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) }  \
}

// -- One-operand macro --

// The remaining GENARRAY* macros expand only to the declarator and
// initializer; the invocation site supplies the storage class and element
// type in front of the macro.
#define GENARRAY(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op)  \
}

#define GENARRAY_I(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES+1] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op), \
    PASTEMAC(i,op)  \
}

// -- One-operand macro (with custom prefix) --

#define GENARRAY_PREF(arrayname,prefix,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTECH2(prefix,s,op), \
    PASTECH2(prefix,c,op), \
    PASTECH2(prefix,d,op), \
    PASTECH2(prefix,z,op)  \
}

// -- Two-operand macros --

// _ALL fills every type combination. _EXT fills only combinations whose
// type characters come from the same pair ({s,c} or {d,z}). _MIN fills only
// combinations where all type characters are equal. Unfilled slots are NULL.
#define GENARRAY2_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
    { PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

#define GENARRAY2_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL,              NULL,             }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { NULL,              NULL,              PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

#define GENARRAY2_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), NULL,              NULL,              NULL,             }, \
    { NULL,              PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), NULL,             }, \
    { NULL,              NULL,              NULL,              PASTEMAC2(z,z,op) }  \
}

// -- Three-operand macros --

#define GENARRAY3_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \
    { PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \
    { PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \
    { PASTEMAC3(c,d,s,op), PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \
    { PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \
    { PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \
    { PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \
    { PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \
    { PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

#define GENARRAY3_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

#define GENARRAY3_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                PASTEMAC3(z,z,z,op) }  \
    } \
}

#endif
// end bli_genarray_macro_defs.h
// begin bli_gentdef_macro_defs.h

#ifndef BLIS_GENTDEF_MACRO_DEFS_H
#define BLIS_GENTDEF_MACRO_DEFS_H

//
// -- MACROS TO INSERT TYPEDEF-GENERATING MACROS -------------------------------
//

// NOTE: GENTDEF and GENTDEFR are not defined in this section; each expansion
// site is expected to define them before invoking the INSERT_ macros below.

// -- function typedef macro (both typed and void) --

// Invokes GENTDEF once per typed variant (_ft), once per void variant
// (_vft), and once more with an empty type character.
#define INSERT_GENTDEF( opname ) \
\
GENTDEF( float,    s, opname, _ft ) \
GENTDEF( double,   d, opname, _ft ) \
GENTDEF( scomplex, c, opname, _ft ) \
GENTDEF( dcomplex, z, opname, _ft ) \
\
GENTDEF( void,     s, opname, _vft ) \
GENTDEF( void,     d, opname, _vft ) \
GENTDEF( void,     c, opname, _vft ) \
GENTDEF( void,     z, opname, _vft ) \
\
GENTDEF( void,      , opname, _vft )

// -- function typedef macro (both typed and void) with real projection --

#define INSERT_GENTDEFR( opname ) \
\
GENTDEFR( float,    float,  s, s, opname, _ft ) \
GENTDEFR( double,   double, d, d, opname, _ft ) \
GENTDEFR( scomplex, float,  c, s, opname, _ft ) \
GENTDEFR( dcomplex, double, z, d, opname, _ft ) \
\
GENTDEFR( void,     void,   s, s, opname, _vft ) \
GENTDEFR( void,     void,   d, d, opname, _vft ) \
GENTDEFR( void,     void,   c, s, opname, _vft ) \
GENTDEFR( void,     void,   z, d, opname, _vft ) \
\
GENTDEFR( void,     void,    ,  , opname, _vft )

#endif
// end bli_gentdef_macro_defs.h
// begin bli_gentfunc_macro_defs.h

#ifndef BLIS_GENTFUNC_MACRO_DEFS_H
#define BLIS_GENTFUNC_MACRO_DEFS_H

//
// -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------
//

// -- Macros for generating BLAS routines --------------------------------------

// -- Basic one-operand macro --
#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \ \ GENTFUNC( float, s, blasname, blisname ) \ GENTFUNC( double, d, blasname, blisname ) \ GENTFUNC( scomplex, c, blasname, blisname ) \ GENTFUNC( dcomplex, z, blasname, blisname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \ \ GENTFUNCRO( float, s, blasname, blisname ) \ GENTFUNCRO( double, d, blasname, blisname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \ \ GENTFUNCCO( scomplex, float, c, s, blasname, blisname ) \ GENTFUNCCO( dcomplex, double, z, d, blasname, blisname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \ \ GENTFUNCDOT( float, s, , BLIS_NO_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( double, d, , BLIS_NO_CONJUGATE, blasname, blisname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \ \ GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \ \ INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \ INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \ \ GENTFUNCR( float, float, s, s, rblasname, blisname ) \ GENTFUNCR( double, double, d, d, rblasname, blisname ) \ GENTFUNCR( scomplex, float, c, s, cblasname, blisname ) \ GENTFUNCR( dcomplex, double, z, d, cblasname, blisname ) // -- Alternate 
// two-operand macro (one char for complex, one for real proj) --
// The real rows leave the fourth argument (the real-projection char) empty;
// the complex rows pass 's' / 'd'.

#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
\
GENTFUNCR2( float, float, s, , blasname, blisname ) \
GENTFUNCR2( double, double, d, , blasname, blisname ) \
GENTFUNCR2( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )

// -- Extended two-operand macro (used only for scal) --
// Six rows: four where both types are the same (fourth argument empty), then
// two pairing a complex type with its real type (fourth argument 's' / 'd').

#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
\
GENTFUNCSCAL( float, float, s, , blasname, blisname ) \
GENTFUNCSCAL( double, double, d, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \
GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname )

// -- Macros for functions with one operand ------------------------------------

// The INSERT_GENTFUNC_BASIC* family differs only in how many auxiliary
// arguments are forwarded after tfuncname (0 through 4). Every variant emits
// the same four rows: (float, s), (double, d), (scomplex, c), (dcomplex, z).

// -- Basic one-operand macro --
// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
\
GENTFUNC( float, s, tfuncname ) \
GENTFUNC( double, d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
\
GENTFUNC( float, s, tfuncname, varname ) \
GENTFUNC( double, d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )

// The INSERT_GENTFUNCR_BASIC* family mirrors the one above, but each row also
// passes the real projection type and its char: float/s for float and
// scomplex, double/d for double and dcomplex.

// -- Basic one-operand with real projection --
// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
\
GENTFUNCR( float, float, s, s, tfuncname ) \
GENTFUNCR( double, double, d, d, tfuncname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname ) \
GENTFUNCR( double, double, d, d, tfuncname, varname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
// arguments) --
// Same four rows as the other real-projection inserters (type, real
// projection type, type char, projection char), forwarding four auxiliary
// arguments after tfuncname.

#define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )

// -- Basic one-operand macro with real domain only --
// Only the float and double rows are generated. The last row of each macro
// carries no line-continuation, so the definition ends on that row instead of
// depending on the following line being blank.
// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \
\
GENTFUNCRO( float, s, tfuncname ) \
GENTFUNCRO( double, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \
\
GENTFUNCRO( float, s, tfuncname, varname ) \
GENTFUNCRO( double, d, tfuncname, varname )

// -- Basic one-operand macro with complex domain only and real projection --
// Only the scomplex and dcomplex rows are generated; each passes the complex
// type, its real type, and both type chars, followed by 0-3 auxiliary
// arguments.
// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- Basic one-operand macro with integer instance --
// -- (no auxiliary arguments) --
// (The remaining rows of this definition follow directly after this span.)

#define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \
\
GENTFUNC( float, s, tfuncname ) \
GENTFUNC( double, d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname ) \
GENTFUNC( gint_t, i, tfuncname )

// -- (one auxiliary argument) --
// The four floating-point rows plus a fifth (gint_t, i) row.

#define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \
\
GENTFUNC( float, s, tfuncname, varname ) \
GENTFUNC( double, d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname ) \
GENTFUNC( gint_t, i, tfuncname, varname )

// -- Basic one-operand with integer projection --
// Every floating-point type is paired with gint_t / 'i' as the second type.
// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCI_BASIC0( tfuncname ) \
\
GENTFUNCI( float, gint_t, s, i, tfuncname ) \
GENTFUNCI( double, gint_t, d, i, tfuncname ) \
GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \
GENTFUNCI( dcomplex, gint_t, z, i, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \
\
GENTFUNCI( float, gint_t, s, i, tfuncname, varname ) \
GENTFUNCI( double, gint_t, d, i, tfuncname, varname ) \
GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \
GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname )

// -- Basic one-operand with real and integer projections --
// Each row passes (type, real type, gint_t) followed by the three chars.
// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \
\
GENTFUNCRI( float, float, gint_t, s, s, i, tfuncname ) \
GENTFUNCRI( double, double, gint_t, d, d, i, tfuncname ) \
GENTFUNCRI( scomplex, float, gint_t, c, s, i, tfuncname ) \
GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname )

// -- Macros for functions with two primary operands ---------------------------

// Row layout for GENTFUNC2: (type1, type2, char1, char2, tfuncname[, varname]).
// "BASIC" covers the four same-type pairs, "MIX_D" the pairs that differ only
// in domain (real vs. complex), "MIX_P" the pairs that differ in precision,
// and "MIXDP0" / "MIX_DP" all twelve pairs of two different types.

// -- Basic two-operand macro --
// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_BASIC0( tfuncname ) \
\
GENTFUNC2( float, float, s, s, tfuncname ) \
GENTFUNC2( double, double, d, d, tfuncname ) \
GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \
GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \
\
GENTFUNC2( float, float, s, s, tfuncname, varname ) \
GENTFUNC2( double, double, d, d, tfuncname, varname ) \
GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \
GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname )

// -- Mixed domain two-operand macro --
// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \
\
GENTFUNC2( float, scomplex, s, c, tfuncname ) \
GENTFUNC2( scomplex, float, c, s, tfuncname ) \
\
GENTFUNC2( double, dcomplex, d, z, tfuncname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \
\
GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \
GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \
\
GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname, varname )

// -- Mixed precision two-operand macro --
// The last row of each macro carries no line-continuation, so the definition
// ends on that row instead of depending on the following line being blank.
// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \
\
GENTFUNC2( float, double, s, d, tfuncname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname ) \
\
GENTFUNC2( double, float, d, s, tfuncname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname ) \
\
GENTFUNC2( scomplex, double, c, d, tfuncname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \
\
GENTFUNC2( float, double, s, d, tfuncname, varname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTFUNC2( double, float, d, s, tfuncname, varname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \
\
GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )

// -- Mixed domain/precision (all) two-operand macro --
// NOTE(review): the no-argument variant is spelled MIXDP0 (no underscore
// before DP), unlike its siblings MIX_D0 / MIX_P0 and the one-argument
// MIX_DP. The name is kept as-is to preserve the existing interface.
// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIXDP0( tfuncname ) \
\
GENTFUNC2( float, double, s, d, tfuncname ) \
GENTFUNC2( float, scomplex, s, c, tfuncname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname ) \
\
GENTFUNC2( double, float, d, s, tfuncname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname ) \
GENTFUNC2( double, dcomplex, d, z, tfuncname ) \
\
GENTFUNC2( scomplex, float, c, s, tfuncname ) \
GENTFUNC2( scomplex, double, c, d, tfuncname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \
\
GENTFUNC2( float, double, s, d, tfuncname, varname ) \
GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTFUNC2( double, float, d, s, tfuncname, varname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \
GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \
\
GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )

// Row layout for GENTFUNC2R: (type1, type2, real projection of type2, char1,
// char2, projection char, tfuncname[, varname]). The pair sets match the
// GENTFUNC2 families above.

// -- Basic two-operand with real projection of second operand --
// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \
\
GENTFUNC2R( float, float, float, s, s, s, tfuncname ) \
GENTFUNC2R( double, double, double, d, d, d, tfuncname ) \
GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname ) \
GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \
\
GENTFUNC2R( float, float, float, s, s, s, tfuncname, varname ) \
GENTFUNC2R( double, double, double, d, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )

// -- Mixed domain two-operand with real projection of second operand --
// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \
\
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \
\
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \
\
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname )

// -- Mixed precision two-operand with real projection of second operand --
// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \
\
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \
\
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname )

// -- Mixed domain/precision (all) two-operand macro with real projection of second operand --
// The last row carries no line-continuation, so the definition ends on that
// row instead of depending on the following line being blank.
// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \
\
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname )

// -- (one auxiliary argument) --
// (The final row of this definition follows directly after this span.)

#define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
\
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ // -- Macros for functions with three primary operands ------------------------- // -- Basic three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC0( tfuncname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 ) // -- Mixed domain three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname ) \ \ GENTFUNC3( dcomplex, double, 
double, z, d, d, tfuncname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( 
dcomplex, double, dcomplex, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC3( scomplex, double, scomplex, c, 
d, c, tfuncname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname ) \ GENTFUNC3( float, 
dcomplex, scomplex, s, z, c, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, 
tfuncname, varname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, dcomplex, d, s, 
z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname1, varname2 ) \ \ 
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 ) // -- Basic three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, 
varname1, varname2 )


// -- Mixed domain three-operand with union of operands 1 and 2 --

// -- (no auxiliary arguments) --

// Mixed real/complex combinations within a single precision (s with c,
// d with z). The fourth type/char pair is the "union" of operands 1 and 2
// named in the section title; only tfuncname is forwarded.
#define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname )

// -- (one auxiliary argument) --

// Mixed-domain table; forwards tfuncname and varname.
#define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z,
d, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \ GENTFUNC3U12( float, 
double, dcomplex, double, s, d, z, d, tfuncname ) \
\
GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \
GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \
\
GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \
GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \
GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \
GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \
\
\
GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \
GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \
GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \
GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \
\
GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \
GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \
\
GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \
GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \
GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \
GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \
\
GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \
GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \
\
\
GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \
GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \
\
GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \
GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \
GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \
GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \
\
GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex,
dcomplex, scomplex, c, c, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, 
dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, 
tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \
\
\
GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname )

// -- (two auxiliary arguments) --

// Mixed-precision table; forwards tfuncname, varname1 and varname2.
#define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float,
double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12(
scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, scomplex, dcomplex,
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 ) #endif // end bli_gentfunc_macro_defs.h // begin bli_gentprot_macro_defs.h #ifndef BLIS_GENTPROT_MACRO_DEFS_H #define BLIS_GENTPROT_MACRO_DEFS_H // // -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS ----------------------------- // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- #define INSERT_GENTPROT_BLAS( blasname ) \ \ GENTPROT( float, s, blasname ) \ GENTPROT( double, d, blasname ) \ GENTPROT( scomplex, c, blasname ) \ GENTPROT( dcomplex, z, blasname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTPROTRO_BLAS( blasname ) \ \ GENTPROTRO( float, s, blasname ) \ GENTPROTRO( double, d, blasname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTPROTCO_BLAS( blasname ) \ \ GENTPROTCO( scomplex, float, c, s, blasname ) \ GENTPROTCO( dcomplex, double, z, d, blasname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTR_BLAS( blasname ) \ \ GENTPROTDOT( float, s, , blasname ) \ GENTPROTDOT( double, d, , blasname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTC_BLAS( blasname ) \ \ GENTPROTDOT( scomplex, c, c, blasname ) \ GENTPROTDOT( scomplex, c, u, blasname ) \ GENTPROTDOT( dcomplex, z, c, blasname ) \ GENTPROTDOT( dcomplex, z, u, blasname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTPROTDOT_BLAS( blasname ) \ \ INSERT_GENTPROTDOTR_BLAS( blasname ) \ INSERT_GENTPROTDOTC_BLAS( blasname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTPROTR_BLAS( rblasname, 
cblasname ) \
\
GENTPROTR( float, float, s, s, rblasname ) \
GENTPROTR( double, double, d, d, rblasname ) \
GENTPROTR( scomplex, float, c, s, cblasname ) \
GENTPROTR( dcomplex, double, z, d, cblasname )


// -- Alternate two-operand macro (one char for complex, one for real proj) --

// For the real datatypes the first character argument is left empty.
#define INSERT_GENTPROTR2_BLAS( blasname ) \
\
GENTPROTR2( float, float, , s, blasname ) \
GENTPROTR2( double, double, , d, blasname ) \
GENTPROTR2( scomplex, float, c, s, blasname ) \
GENTPROTR2( dcomplex, double, z, d, blasname )


// -- Extended two-operand macro (used only for scal) --

// Four same-type instances (empty first character argument) followed by
// two mixed instances: float with scomplex, double with dcomplex.
#define INSERT_GENTPROTSCAL_BLAS( blasname ) \
\
GENTPROTSCAL( float, float, , s, blasname ) \
GENTPROTSCAL( double, double, , d, blasname ) \
GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \
GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \
GENTPROTSCAL( float, scomplex, s, c, blasname ) \
GENTPROTSCAL( double, dcomplex, d, z, blasname )


// -- Macros for functions with one operand ------------------------------------


// -- Basic one-operand macro --

// -- (no auxiliary arguments) --

// One GENTPROT invocation per datatype (s, d, c, z); forwards tfuncname.
#define INSERT_GENTPROT_BASIC0( tfuncname ) \
\
GENTPROT( float, s, tfuncname ) \
GENTPROT( double, d, tfuncname ) \
GENTPROT( scomplex, c, tfuncname ) \
GENTPROT( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

// As above, additionally forwarding varname.
#define INSERT_GENTPROT_BASIC( tfuncname, varname ) \
\
GENTPROT( float, s, tfuncname, varname ) \
GENTPROT( double, d, tfuncname, varname ) \
GENTPROT( scomplex, c, tfuncname, varname ) \
GENTPROT( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

// Forwards tfuncname, varname1 and varname2.
#define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTPROT( float, s, tfuncname, varname1, varname2 ) \
GENTPROT( double, d, tfuncname, varname1, varname2 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

// Forwards tfuncname and varname1 through varname3.
#define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTPROT( float, s, tfuncname,
varname1, varname2, varname3 ) \
GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

// One GENTPROT invocation per datatype (s, d, c, z); forwards tfuncname
// and varname1 through varname4.
#define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand with real projection --

// -- (no auxiliary arguments) --

// Each datatype is paired with its real type: float/float, double/double,
// scomplex/float, dcomplex/double.
#define INSERT_GENTPROTR_BASIC0( tfuncname ) \
\
GENTPROTR( float, float, s, s, tfuncname ) \
GENTPROTR( double, double, d, d, tfuncname ) \
GENTPROTR( scomplex, float, c, s, tfuncname ) \
GENTPROTR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

// Real-projection table; forwards tfuncname and varname.
#define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname ) \
GENTPROTR( double, double, d, d, tfuncname, varname ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

// Real-projection table; forwards tfuncname, varname1 and varname2.
#define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

// Real-projection table; forwards tfuncname and varname1 through varname3.
#define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3
) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC0( tfuncname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0_I( funcname ) \ \ GENTPROT( float, s, funcname ) \ GENTPROT( double, d, funcname ) \ GENTPROT( scomplex, c, funcname ) \ GENTPROT( dcomplex, z, funcname ) \ GENTPROT( gint_t, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) \ GENTPROT( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTPROTI_BASIC0( funcname ) \ \ GENTPROTI( float, gint_t, s, i, funcname ) \ GENTPROTI( double, gint_t, d, i, funcname ) \ GENTPROTI( scomplex, gint_t, c, i, funcname ) \ GENTPROTI( dcomplex, gint_t, z, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \ \ GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \ GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \ GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTRI_BASIC( funcname ) \ \ GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \ GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \ GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \ GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_BASIC0( funcname ) \ \ GENTPROT2( float, float, s, s, funcname ) \ GENTPROT2( double, double, d, d, funcname ) \ GENTPROT2( scomplex, scomplex, c, c, funcname ) \ GENTPROT2( dcomplex, dcomplex, z, z, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \ \ GENTPROT2( float, float, s, s, tfuncname, varname ) \ GENTPROT2( double, double, d, d, tfuncname, varname ) \ GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_D0( funcname ) \ \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( scomplex, float, c, s, funcname ) \ \ GENTPROT2( double, dcomplex, d, z, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_D( 
tfuncname, varname ) \
\
GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
\
GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
GENTPROT2( dcomplex, double, z, d, tfuncname, varname )


// -- Mixed precision two-operand macro --

// -- (no auxiliary arguments) --

// Pairs whose operands differ in precision (single with double), in either
// domain. The last entry deliberately carries no line continuation, so the
// definition ends here regardless of what follows it.
#define INSERT_GENTPROT2_MIX_P0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
\
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --

// Mixed-precision pairs; forwards tfuncname and varname. As above, the last
// entry carries no line continuation.
#define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \
\
GENTPROT2( float, double, s, d, tfuncname, varname ) \
GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double, float, d, s, tfuncname, varname ) \
GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
\
GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro --

// -- (no auxiliary arguments) --

// Every ordered pair of two different datatypes.
// NOTE(review): spelled MIXDP0, whereas the one-argument variant is spelled
// MIX_DP; the existing name is kept so current users keep working.
#define INSERT_GENTPROT2_MIXDP0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, scomplex, s, c, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
GENTPROT2( double, dcomplex, d, z, funcname ) \
\
GENTPROT2( scomplex, float, c, s, funcname ) \
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, double, z, d, funcname ) \
GENTPROT2(
dcomplex, scomplex, z, c, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_BASIC0( funcname ) \ \ GENTPROT2R( float, float, float, s, s, s, funcname ) \ GENTPROT2R( double, double, double, d, d, d, funcname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \ \ GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) 
--

// Real/complex pairs within one precision, plus the real type of the first
// operand; forwards tfuncname and varname.
#define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \
\
GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname )


// -- Mixed precision two-operand with real projection of first operand --

// -- (no auxiliary arguments) --

// Pairs whose operands differ in precision; the third type is the real type
// of the first operand.
#define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname )

// -- (one auxiliary argument) --

// Same table; forwards tfuncname and varname.
#define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname )


// -- Macros for functions with three primary operands -------------------------


// -- Basic three-operand macro --

// All three operands share one datatype (s, d, c, z).
#define INSERT_GENTPROT3_BASIC( funcname ) \
\
GENTPROT3( float, float, float, s, s, s, funcname ) \
GENTPROT3( double, double, double, d, d, d, funcname ) \
GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \
GENTPROT3( dcomplex, dcomplex,
dcomplex, z, z, z, funcname ) // -- Mixed domain three-operand macro -- #define INSERT_GENTPROT3_MIX_D( funcname ) \ \ GENTPROT3( float, float, scomplex, s, s, c, funcname ) \ GENTPROT3( float, scomplex, float, s, c, s, funcname ) \ GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \ \ GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \ GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \ GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \ \ GENTPROT3( scomplex, float, float, c, s, s, funcname ) \ GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \ GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \ \ GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \ GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \ GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname ) // -- Mixed precision three-operand macro -- #define INSERT_GENTPROT3_MIX_P( funcname ) \ \ GENTPROT3( float, float, double, s, s, d, funcname ) \ GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \ \ GENTPROT3( float, double, float, s, d, s, funcname ) \ GENTPROT3( float, double, double, s, d, d, funcname ) \ GENTPROT3( float, double, scomplex, s, d, c, funcname ) \ GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \ \ GENTPROT3( float, scomplex, double, s, c, d, funcname ) \ GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \ \ GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \ GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \ GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \ GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \ \ \ GENTPROT3( double, float, float, d, s, s, funcname ) \ GENTPROT3( double, float, double, d, s, d, funcname ) \ GENTPROT3( double, float, scomplex, d, s, c, funcname ) \ GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \ \ GENTPROT3( double, double, float, d, d, s, funcname ) \ GENTPROT3( double, double, scomplex, d, d, c, funcname ) \ \ GENTPROT3( double, 
scomplex, float, d, c, s, funcname ) \ GENTPROT3( double, scomplex, double, d, c, d, funcname ) \ GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \ GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \ \ GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \ GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \ \ \ GENTPROT3( scomplex, float, double, c, s, d, funcname ) \ GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \ \ GENTPROT3( scomplex, double, float, c, d, s, funcname ) \ GENTPROT3( scomplex, double, double, c, d, d, funcname ) \ GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \ GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \ \ GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \ GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \ \ GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \ GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \ GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \ GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \ \ \ GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \ GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \ GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \ GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \ \ GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \ GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \ \ GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \ GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \ GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \ GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \ \ GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \ GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname ) \ // -- Basic three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_BASIC( funcname ) \ \ GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \ 
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \ GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname ) // -- Mixed domain three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_D( funcname ) \ \ GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \ GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \ GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \ \ GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \ GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \ GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \ \ GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \ GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \ \ GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \ GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname ) // -- Mixed precision three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_P( funcname ) \ \ GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \ GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \ \ GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \ GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \ GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \ GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \ \ GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \ GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \ \ GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, 
z, funcname ) \ GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \ GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \ GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \ \ \ GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \ GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \ GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \ GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \ \ GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \ GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \ \ GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \ GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \ GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \ GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \ \ GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \ GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \ \ \ GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \ GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \ \ GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \ GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \ GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \ GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \ \ GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \ \ GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \ GENTPROT3U12( 
scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \ \ \ GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \ GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \ GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \ GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \ GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \ \ GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname ) #endif // end bli_gentprot_macro_defs.h // begin bli_misc_macro_defs.h #ifndef BLIS_MISC_MACRO_DEFS_H #define BLIS_MISC_MACRO_DEFS_H // -- Miscellaneous macros -- // min, max, abs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) #define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) #define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) ) // fmin, fmax, fabs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_fmin( a, b ) bli_min( a, b ) #define bli_fmax( a, b ) bli_max( a, b ) #define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) ) // fminabs, fmaxabs // NOTE: These must remain macros since we don't know the types of a and b. 
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif // end bli_edge_case_macro_defs.h // begin bli_param_macro_defs.h #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ); } // pruning-related BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the left side of the matrix, // ignore the area above that intersection. if ( *diagoff < 0 ) { *m = *m + *diagoff; *offm_inc = - *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the bottom side of the matrix, // ignore the area to the right of that intersection. if ( *n > *diagoff + *m ) { *n = *diagoff + *m; } } BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the top side of the matrix, // ignore the area to the left of that intersection. if ( *diagoff > 0 ) { *n = *n - *diagoff; *offn_inc = + *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the right side of the matrix, // ignore the area below that intersection. 
if ( *m > -(*diagoff) + *n ) { *m = -(*diagoff) + *n; } } // thread range-related BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { *diagoff = *n - *diagoff - *m; bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { bli_swap_dims( m, n ); bli_negate_diag_offset( diagoff ); bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end ) { dim_t start2 = n - *start; dim_t end2 = n - *end; *start = end2; *end = start2; } // mdim_t-related BLIS_INLINE bool bli_is_m_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_M ); } BLIS_INLINE bool bli_is_n_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_N ); } BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim ) { return ( mdim_t ) ( mdim == BLIS_M ? BLIS_N : BLIS_M ); } BLIS_INLINE void bli_toggle_dim( mdim_t* mdim ) { *mdim = bli_dim_toggled( *mdim ); } // stor3_t-related BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b ) { // If any matrix is general-stored, return the stor3_t id for the // general-purpose sup microkernel. if ( bli_is_gen_stored( rs_c, cs_c ) || bli_is_gen_stored( rs_a, cs_a ) || bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX; // Otherwise, compute and return the stor3_t id as follows. 
const bool c_is_col = bli_is_col_stored( rs_c, cs_c ); const bool a_is_col = bli_is_col_stored( rs_a, cs_a ); const bool b_is_col = bli_is_col_stored( rs_b, cs_b ); return ( stor3_t )( 4 * c_is_col + 2 * a_is_col + 1 * b_is_col ); } BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id ) { #if 1 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )7, // BLIS_RRR = 0 -> BLIS_CCC = 7 ( stor3_t )5, // BLIS_RRC = 1 -> BLIS_CRC = 5 ( stor3_t )6, // BLIS_RCR = 2 -> BLIS_CCR = 6 ( stor3_t )4, // BLIS_RCC = 3 -> BLIS_CRR = 4 ( stor3_t )3, // BLIS_CRR = 4 -> BLIS_RCC = 3 ( stor3_t )1, // BLIS_CRC = 5 -> BLIS_RRC = 1 ( stor3_t )2, // BLIS_CCR = 6 -> BLIS_RCR = 2 ( stor3_t )0, // BLIS_CCC = 7 -> BLIS_RRR = 0 }; return map[id]; #else return ( ( id & 0x4 ) ^ 0x4 ) | // flip c bit ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 ); // flip a bit and move to b position #endif } BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )1, // BLIS_RRR = 0 -> BLIS_RRC = 1 ( stor3_t )0, // BLIS_RRC = 1 -> BLIS_RRR = 0 ( stor3_t )3, // BLIS_RCR = 2 -> BLIS_RCC = 3 ( stor3_t )2, // BLIS_RCC = 3 -> BLIS_RCR = 2 ( stor3_t )5, // BLIS_CRR = 4 -> BLIS_CRC = 5 ( stor3_t )4, // BLIS_CRC = 5 -> BLIS_CRR = 4 ( stor3_t )7, // BLIS_CCR = 6 -> BLIS_CCC = 7 ( stor3_t )6, // BLIS_CCC = 7 -> BLIS_CCR = 6 }; return map[id]; #else return ( stor3_t )( id ^ 0x1 ); #endif } BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )2, // BLIS_RRR = 0 -> BLIS_RCR = 2 ( stor3_t )3, // BLIS_RRC = 1 -> BLIS_RCC = 3 ( stor3_t )0, // BLIS_RCR = 2 -> BLIS_RRR = 0 ( stor3_t )1, // BLIS_RCC = 3 -> BLIS_RRC = 1 ( stor3_t )6, // BLIS_CRR = 4 -> BLIS_CCR = 6 ( stor3_t )7, // BLIS_CRC = 5 -> BLIS_CCC = 7 ( stor3_t )4, // BLIS_CCR = 6 -> BLIS_CRR = 4 ( stor3_t )5, // BLIS_CCC = 7 -> BLIS_CRC = 5 }; return map[id]; #else return ( stor3_t )( id ^ 0x2 ); #endif } // 
// index-related

// Returns true when i is the final forward iteration (i == n_iter - 1)
// and the leftover count n_left is nonzero.
BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
	return ( bool )
	       ( i == n_iter - 1 && n_left != 0 );
}

// Logical complement of bli_is_edge_f(): true when i is not the final
// forward iteration or when n_left is zero.
BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
	return ( bool )
	       ( i != n_iter - 1 || n_left == 0 );
}

// Returns true when i is iteration 0 and n_left is nonzero. The n_iter
// argument is accepted but not read.
BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
	return ( bool )
	       ( i == 0 && n_left != 0 );
}

// Logical complement of bli_is_edge_b(): true when i is nonzero or n_left
// is zero. The n_iter argument is accepted but not read.
BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
	return ( bool )
	       ( i != 0 || n_left == 0 );
}

// Returns true when i == end_iter - 1. The tid and nth arguments are
// accepted (so the signature matches bli_is_last_iter_rr()) but not read.
BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
	return ( bool )
	       ( i == end_iter - 1 );
}

// Returns true when i equals end_iter - 1 minus the remainder of
// ( end_iter - tid - 1 ) divided by nth.
// NOTE(review): presumably the last index below end_iter visited by thread
// tid under round-robin assignment among nth threads -- confirm with callers.
BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
	return ( bool )
	       ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
}

// Dispatches to bli_is_last_iter_sl() when BLIS_ENABLE_JRIR_SLAB is defined
// at compile time, and to bli_is_last_iter_rr() otherwise.
BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
#ifdef BLIS_ENABLE_JRIR_SLAB
	return bli_is_last_iter_sl( i, end_iter, tid, nth );
#else // BLIS_ENABLE_JRIR_RR
	return bli_is_last_iter_rr( i, end_iter, tid, nth );
#endif
}

// packbuf_t-related

// Extracts the BLIS_PACK_BUFFER_BITS field of buf_type and shifts it down
// by BLIS_PACK_BUFFER_SHIFT, yielding the field's value as a guint_t.
BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type )
{
	return ( guint_t )
	       ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT );
}

// pack_t-related

// Returns true when BLIS_PACK_BIT is set in schema.
BLIS_INLINE bool bli_is_packed( pack_t schema )
{
	return ( bool )
	       ( schema & BLIS_PACK_BIT );
}

// Returns true when the BLIS_PACK_RC_BIT field of schema equals
// BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS.
BLIS_INLINE bool bli_is_row_packed( pack_t schema )
{
	return ( bool )
	       ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
	                                            BLIS_BITVAL_PACKED_ROWS ) );
}

// Returns true when the BLIS_PACK_RC_BIT field of schema equals
// BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS.
BLIS_INLINE bool bli_is_col_packed( pack_t schema )
{
	return ( bool )
	       ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
	                                            BLIS_BITVAL_PACKED_COLUMNS ) );
}

// Returns true when BLIS_PACK_PANEL_BIT is set in schema.
BLIS_INLINE bool bli_is_panel_packed( pack_t schema )
{
	return ( bool )
	       ( schema & BLIS_PACK_PANEL_BIT );
}

// Returns true when the BLIS_PACK_FORMAT_BITS field equals BLIS_BITVAL_1R.
BLIS_INLINE bool bli_is_1r_packed( pack_t schema )
{
	return ( bool )
	       ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R );
}

// Returns true when the BLIS_PACK_FORMAT_BITS field equals BLIS_BITVAL_1E.
BLIS_INLINE bool bli_is_1e_packed( pack_t schema )
{
	return ( bool )
	       ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E );
}

// Returns true when schema is either 1r- or 1e-packed.
BLIS_INLINE bool bli_is_1m_packed( pack_t schema )
{
	return ( bool )
	       ( bli_is_1r_packed( schema ) ||
	         bli_is_1e_packed( schema ) );
}

// Returns true when the BLIS_PACK_FORMAT_BITS field of schema is zero.
BLIS_INLINE bool bli_is_nat_packed( pack_t schema )
{
	return ( bool )
	       ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 );
}

// Returns true when the BLIS_PACK_FORMAT_BITS field of schema is nonzero
// (the complement of bli_is_nat_packed()).
BLIS_INLINE bool bli_is_ind_packed( pack_t schema )
{
	return ( bool )
	       ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 );
}

// Extracts the BLIS_PACK_FORMAT_BITS field of schema and shifts it down by
// BLIS_PACK_FORMAT_SHIFT, yielding the field's value as a guint_t.
BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema )
{
	return ( guint_t )
	       ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT );
}

// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument.
//
// Inputs:  diagoffa, diaga, uploa, m, n, rs_a, cs_a.
// Outputs: *uplo_eff, *n_elem_max, *n_iter, *inca, *lda, *ij0, *n_shift.
//
// NOTE: When A is found to be entirely unstored, only *uplo_eff (set to
// BLIS_ZEROS), *ij0 and *n_shift (both set to 0) are written; the remaining
// outputs are left untouched.
BLIS_INLINE void bli_set_dims_incs_uplo_1m
     (
       doff_t  diagoffa, diag_t diaga, uplo_t uploa,
       dim_t   m,          dim_t  n,      inc_t  rs_a, inc_t  cs_a,
       uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t*  ij0,        dim_t* n_shift
     )
{
	// This is to prevent the compiler from warning about uninitialized
	// variables.
	*ij0     = 0;
	*n_shift = 0;

	// If matrix A is entirely "unstored", that is, if either:
	// - A is lower-stored and entirely above the diagonal, or
	// - A is upper-stored and entirely below the diagonal
	// then we mark the storage as implicitly zero.
	if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
	{
		*uplo_eff = BLIS_ZEROS;
	}
	else
	{
		doff_t diagoffa_use_ = diagoffa;
		doff_t diagoff_eff_;
		dim_t  n_iter_max_;

		// For a unit diagonal, adjust the working diagonal offset via
		// bli_shift_diag_offset_to_shrink_uplo() before any further tests.
		if ( bli_is_unit_diag( diaga ) )
			bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

		// If matrix A is entirely "stored", that is, if either:
		// - A is upper-stored and entirely above the diagonal, or
		// - A is lower-stored and entirely below the diagonal
		// then we mark the storage as dense.
		if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
			uploa = BLIS_DENSE;

		// Start from the untransformed view: iterate over n columns of
		// up to m elements each.
		n_iter_max_  = n;
		*n_elem_max   = m;
		*inca         = rs_a;
		*lda          = cs_a;
		*uplo_eff     = uploa;
		diagoff_eff_ = diagoffa_use_;

		// If the strides are row-tilted, swap the roles of the two
		// dimensions and increments; uplo and the diagonal offset are
		// toggled/negated to match the swapped view.
		if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) )
		{
			bli_swap_dims( &n_iter_max_, n_elem_max );
			bli_swap_incs( inca, lda );
			bli_toggle_uplo( uplo_eff );
			bli_negate_diag_offset( &diagoff_eff_ );
		}

		if ( bli_is_dense( *uplo_eff ) )
		{
			*n_iter = n_iter_max_;
		}
		else if ( bli_is_upper( *uplo_eff ) )
		{
			if ( diagoff_eff_ < 0 )
			{
				*ij0        = 0;
				*n_shift    = -diagoff_eff_;
				*n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
				*n_iter     = n_iter_max_;
			}
			else
			{
				*ij0        = diagoff_eff_;
				*n_shift    = 0;
				*n_iter     = n_iter_max_ - diagoff_eff_;
			}
		}
		else // if ( bli_is_lower( *uplo_eff ) )
		{
			if ( diagoff_eff_ < 0 )
			{
				*ij0        = -diagoff_eff_;
				*n_shift    = 0;
				*n_elem_max = *n_elem_max + diagoff_eff_;
				*n_iter     = bli_min( *n_elem_max, bli_min( m, n ) );
			}
			else
			{
				*ij0        = 0;
				*n_shift    = diagoff_eff_;
				*n_iter     = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
			}
		}
	}
}

// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument (without column-wise stride optimization).
//
// Identical to bli_set_dims_incs_uplo_1m() except that the row-tilted
// swap of dimensions/increments is never performed, so *inca is always
// rs_a and *lda is always cs_a when they are written.
//
// NOTE: When A is found to be entirely unstored, only *uplo_eff (set to
// BLIS_ZEROS), *ij0 and *n_shift (both set to 0) are written.
BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap
     (
       doff_t  diagoffa, diag_t diaga, uplo_t uploa,
       dim_t   m,          dim_t  n,      inc_t  rs_a, inc_t  cs_a,
       uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t*  ij0,        dim_t* n_shift
     )
{
	// This is to prevent the compiler from warning about uninitialized
	// variables.
	*ij0     = 0;
	*n_shift = 0;

	// If matrix A is entirely "unstored", that is, if either:
	// - A is lower-stored and entirely above the diagonal, or
	// - A is upper-stored and entirely below the diagonal
	// then we mark the storage as implicitly zero.
	if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
	{
		*uplo_eff = BLIS_ZEROS;
	}
	else
	{
		doff_t diagoffa_use_ = diagoffa;
		doff_t diagoff_eff_;
		dim_t  n_iter_max_;

		// For a unit diagonal, adjust the working diagonal offset via
		// bli_shift_diag_offset_to_shrink_uplo() before any further tests.
		if ( bli_is_unit_diag( diaga ) )
			bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

		// If matrix A is entirely "stored", that is, if either:
		// - A is upper-stored and entirely above the diagonal, or
		// - A is lower-stored and entirely below the diagonal
		// then we mark the storage as dense.
		if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
			uploa = BLIS_DENSE;

		// Iterate over n columns of up to m elements each; no swap is
		// applied in this variant.
		n_iter_max_  = n;
		*n_elem_max   = m;
		*inca         = rs_a;
		*lda          = cs_a;
		*uplo_eff     = uploa;
		diagoff_eff_ = diagoffa_use_;

		if ( bli_is_dense( *uplo_eff ) )
		{
			*n_iter = n_iter_max_;
		}
		else if ( bli_is_upper( *uplo_eff ) )
		{
			if ( diagoff_eff_ < 0 )
			{
				*ij0        = 0;
				*n_shift    = -diagoff_eff_;
				*n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
				*n_iter     = n_iter_max_;
			}
			else
			{
				*ij0        = diagoff_eff_;
				*n_shift    = 0;
				*n_iter     = n_iter_max_ - diagoff_eff_;
			}
		}
		else // if ( bli_is_lower( *uplo_eff ) )
		{
			if ( diagoff_eff_ < 0 )
			{
				*ij0        = -diagoff_eff_;
				*n_shift    = 0;
				*n_elem_max = *n_elem_max + diagoff_eff_;
				*n_iter     = bli_min( *n_elem_max, bli_min( m, n ) );
			}
			else
			{
				*ij0        = 0;
				*n_shift    = diagoff_eff_;
				*n_iter     = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
			}
		}
	}
}

// Set dimensions and increments for TWO matrix arguments.
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); } BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); } BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); } // NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, // packbuf_t is stored/used from the context in order to support various // induced methods. (Though ideally the packbuf_t field would only be // present in the control tree). BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); } BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc ); } BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj ) { bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); } BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj ) { bli_obj_apply_conj( BLIS_CONJUGATE, obj ); } BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; } // Root matrix query BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) { return ( obj_t* )( obj->root ); } BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( 
bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); } // Root matrix modification BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) { obj->root = obj; } // Diagonal offset query BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) ); } // Diagonal offset modification BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off = ( doff_t )offset; } BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj ) { obj->diag_off = -(obj->diag_off); } BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off += ( doff_t )offset; } // Dimension query BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) { return ( obj->dim[ mdim ] ); } BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) : bli_obj_length( obj ) ); } BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) ); } BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 ); } // Stride/increment query BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) { return ( obj->rs ); } BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) { return ( obj->cs ); } BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) { return ( obj->is ); } BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); } // Note: The purpose of these functions is to obtain the length and width // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) ); } BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 ); } // Dimension modification BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj ) { obj->dim[ BLIS_M ] = m; } BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj ) { obj->dim[ BLIS_N ] = n; } BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) { obj->dim[ mdim ] = dim_val; } BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) { if ( bli_does_notrans( trans ) ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } else // if ( bli_does_trans( trans ) ) { bli_obj_set_length( n, obj ); bli_obj_set_width( m, obj ); } } // Stride/increment predicates // // NOTE: The following two macros differ from their non-obj counterparts // in that they do not identify m x 1 and 1 x n objects as row-stored and // column-stored, respectively, which is needed when considering packed // objects. But this is okay, since none of the invocations of these // "obj" macros are used on packed matrices. 
// BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); } // Stride/increment modification BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) { obj->rs = rs; } BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) { obj->cs = cs; } BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_row_stride( rs, obj ); bli_obj_set_col_stride( cs, obj ); } BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) { obj->is = is; } // Offset query BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) { return ( obj->off[ mdim ] ); } // Offset modification BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] = offset; } BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_set_off( BLIS_M, offm, obj ); bli_obj_set_off( BLIS_N, offn, obj ); } BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] += offset; } BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_inc_off( BLIS_M, offm, obj ); bli_obj_inc_off( BLIS_N, offn, obj ); } // Diagonal offset predicates BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) { return ( 
bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); } // Buffer address query BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) { return ( void* ) ( obj->buffer ); } // Buffer address modification BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) { obj->buffer = p; } // Bufferless scalar field query BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); } // Bufferless scalar field modification BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); } // Element size modification BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) { obj->elem_size = size; } // Packed matrix info query BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) { return ( obj->m_padded ); } BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) { return ( obj->n_padded ); } // Packed matrix info modification BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj ) { obj->m_padded = m; } BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj ) { obj->n_padded = n; } BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) { 
bli_obj_set_padded_length( m, obj ); bli_obj_set_padded_width( n, obj ); } // Packed panel info query BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) { return ( obj->m_panel ); } BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) { return ( obj->n_panel ); } BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) { return ( obj->pd ); } BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) { return ( obj->ps ); } // Packed panel info modification BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj ) { obj->m_panel = m; } BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj ) { obj->n_panel = n; } BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_panel_length( m, obj ); bli_obj_set_panel_width( n, obj ); } BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) { obj->pd = pd; } BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) { obj->ps = ps; } // stor3_t-related BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); inc_t rs_a, cs_a; inc_t rs_b, cs_b; if ( bli_obj_has_notrans( a ) ) { rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else { rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else { rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b ); } // -- User-provided information macros -- // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { return obj->pack_fn; } BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker_fn; } BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { return obj->ker_params; } // Function pointer modification 
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) { const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); bli_obj_set_pack_schema( schema_b, a ); bli_obj_set_pack_schema( schema_a, b ); } // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. BLIS_INLINE void bli_obj_induce_trans( obj_t* obj ) { // Induce transposition among basic fields. dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); if ( bli_obj_is_upper_or_lower( obj ) ) bli_obj_toggle_uplo( obj ); // Induce transposition among packed fields. dim_t m_padded = bli_obj_padded_length( obj ); dim_t n_padded = bli_obj_padded_width( obj ); dim_t m_panel = bli_obj_panel_length( obj ); dim_t n_panel = bli_obj_panel_width( obj ); bli_obj_set_padded_dims( n_padded, m_padded, obj ); bli_obj_set_panel_dims( n_panel, m_panel, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj ) { // NOTE: This function is only used in situations where the matrices // are guaranteed to not have structure or be packed. // Induce transposition among basic fields. 
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif // end bli_obj_macro_defs.h // begin bli_complex_macro_defs.h #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define 
bli_zimag( x ) ( cimag(x) ) #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_complex_macro_defs.h // begin bli_scalar_macro_defs.h #ifndef BLIS_SCALAR_MACRO_DEFS_H #define BLIS_SCALAR_MACRO_DEFS_H // -- Assignment/Accessor macros -- // NOTE: This macro is defined first since some of the other scalar macros // use it to abstract away the method used to assign complex values (ie: // whether fields of a struct are set directly or whether native C99 // assignment is used). // begin bli_sets.h #ifndef BLIS_SETS_H #define BLIS_SETS_H // sets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sssets( xr, xi, y ) { (y) = (xr); } #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } #define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } #define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, 
xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif // end bli_sets.h // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. // begin bli_setrs.h #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Each setrs macro overwrites only the real component of y with xr.

// Real destinations: y itself is the real component.

#define bli_sssetrs( xr, y )  { (y) = (xr); }
#define bli_dssetrs( xr, y )  { (y) = (xr); }

#define bli_sdsetrs( xr, y )  { (y) = (xr); }
#define bli_ddsetrs( xr, y )  { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, struct representation: only the real field is
// written, so the imaginary field keeps its value.

#define bli_scsetrs( xr, y )  { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y )  { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y )  { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y )  { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, native C99 representation: y is rebuilt from xr and
// its own current imaginary part.

#define bli_scsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX


// Single-character shorthands, named after the destination type.

#define bli_ssetrs( xr, y )  bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y )  bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y )  bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y )  bli_dzsetrs( xr, y )


#endif

// end bli_setrs.h
// begin bli_setis.h


#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Each setis macro overwrites only the imaginary component of y with xi.

// Real destinations have no imaginary component, so these expand to an
// empty statement and neither xi nor y is evaluated.

#define bli_sssetis( xi, y )  { ; }
#define bli_dssetis( xi, y )  { ; }

#define bli_sdsetis( xi, y )  { ; }
#define bli_ddsetis( xi, y )  { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, struct representation: only the imaginary field is
// written, so the real field keeps its value.

#define bli_scsetis( xi, y )  { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y )  { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y )  { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y )  { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, native C99 representation: y is rebuilt from its
// own current real part and xi.

#define bli_scsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX


// Single-character shorthands, named after the destination type.

#define bli_ssetis( xi, y )  bli_sssetis( xi, y )
#define bli_dsetis( xi, y )  bli_ddsetis( xi, y )
#define bli_csetis( xi, y )  bli_scsetis( xi, y )
#define bli_zsetis( xi, y )  bli_dzsetis( xi, y )


#endif

// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h


#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_ssgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; } #define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; } #define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; } #define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; } #define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; } #define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi ) #define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi ) #define bli_cgets( x, yr, yi ) 
bli_csgets( x, yr, yi ) #define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi ) #define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi ) #endif // end bli_gets.h // -- Scalar constant initialization macros -- // begin bli_constants.h #ifndef BLIS_CONSTANTS_H #define BLIS_CONSTANTS_H // return pointers to constants // 1 #define bli_s1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ONE ) ) #define bli_d1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ONE ) ) #define bli_c1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) ) #define bli_z1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) ) #define bli_i1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ONE ) ) // 0 #define bli_s0 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ZERO ) ) #define bli_d0 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ZERO ) ) #define bli_c0 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) ) #define bli_z0 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) ) #define bli_i0 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ZERO ) ) // -1 #define bli_sm1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_MINUS_ONE ) ) #define bli_dm1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_MINUS_ONE ) ) #define bli_cm1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_zm1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_im1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_MINUS_ONE ) ) #endif // end bli_constants.h // -- Separated scalar macros (separated real/imaginary values) -- // begin bli_absq2ris.h #ifndef BLIS_ABSQ2RIS_H #define BLIS_ABSQ2RIS_H // absq2ris #define bli_sabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define bli_dabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define 
bli_cabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0F; \ } #define bli_zabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0; \ } #endif // end bli_absq2ris.h // begin bli_abval2ris.h #ifndef BLIS_ABVAL2RIS_H #define BLIS_ABVAL2RIS_H // abval2ris #define bli_sabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabsf(xr); \ } #define bli_dabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabs(xr); \ } #define bli_cabval2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0F; \ } #define bli_zabval2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0; \ } #endif // end bli_abval2ris.h // begin bli_addris.h #ifndef BLIS_ADDRIS_H #define BLIS_ADDRIS_H // addris #define bli_saddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_daddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_caddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #define bli_zaddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #endif // end bli_addris.h // begin bli_addjris.h #ifndef BLIS_ADDJRIS_H #define BLIS_ADDJRIS_H // addjris #define bli_saddjris( ar, ai, xr, xi ) bli_saddris( (ar), -(ai), (xr), (xi) ) #define bli_daddjris( ar, ai, xr, xi ) bli_daddris( (ar), -(ai), (xr), (xi) ) #define bli_caddjris( ar, ai, xr, xi ) bli_caddris( (ar), -(ai), (xr), (xi) ) #define bli_zaddjris( ar, ai, xr, xi ) bli_zaddris( (ar), -(ai), (xr), (xi) ) #endif // end bli_addjris.h // begin bli_add3ris.h #ifndef BLIS_ADD3RIS_H #define BLIS_ADD3RIS_H // add3ris #define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_dadd3ris( ar, 
ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #endif // end bli_add3ris.h // begin bli_axpbyris.h #ifndef BLIS_AXPBYRIS_H #define BLIS_AXPBYRIS_H // axpbyris #define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyris bli_rxxpbyris #define bli_dsssxpbyris bli_rxxpbyris #define bli_csssxpbyris bli_rxxpbyris #define bli_zsssxpbyris bli_rxxpbyris #define bli_sdssxpbyris bli_rxxpbyris #define bli_ddssxpbyris bli_rxxpbyris #define bli_cdssxpbyris bli_rxxpbyris #define bli_zdssxpbyris bli_rxxpbyris #define bli_scssxpbyris bli_rxxpbyris #define bli_dcssxpbyris bli_rxxpbyris #define bli_ccssxpbyris bli_rxxpbyris #define bli_zcssxpbyris bli_rxxpbyris #define bli_szssxpbyris bli_rxxpbyris #define bli_dzssxpbyris bli_rxxpbyris #define bli_czssxpbyris bli_rxxpbyris #define bli_zzssxpbyris bli_rxxpbyris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyris. 
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
// Homogeneous-type entry points for axpbyjris.
#define bli_saxpbyjris bli_ssssaxpbyjris
#define bli_daxpbyjris bli_ddddaxpbyjris
#define bli_caxpbyjris bli_ccccaxpbyjris
#define bli_zaxpbyjris bli_zzzzaxpbyjris

#endif

// end bli_axpbyjris.h
// begin bli_axpyris.h


#ifndef BLIS_AXPYRIS_H
#define BLIS_AXPYRIS_H

// axpyris
//
// y := y + a * x on separate real (?r) and imaginary (?i) components. All
// variants take the same six arguments; each variant references only the
// components listed in its body.

// rx: accumulates (ar) * (xr) into (yr) only.
#define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
}

// cx: full complex product accumulated into both (yr) and (yi).
#define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) - (ai) * (xi); \
	(yi) += (ai) * (xr) + (ar) * (xi); \
}

// ro: real part of the complex product accumulated into (yr); (yi) untouched.
#define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) - (ai) * (xi); \
}

// cr: both components of x scaled by (ar) only; (ai) is not referenced.
#define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ar) * (xi); \
}

// rc: both components of a scaled by (xr) only; (xi) is not referenced.
#define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyris bli_rxaxpyris
#define bli_dssaxpyris bli_rxaxpyris
#define bli_cssaxpyris bli_rxaxpyris
#define bli_zssaxpyris bli_rxaxpyris

#define bli_sdsaxpyris bli_rxaxpyris
#define bli_ddsaxpyris bli_rxaxpyris
#define bli_cdsaxpyris bli_rxaxpyris
#define bli_zdsaxpyris bli_rxaxpyris

#define bli_scsaxpyris bli_rxaxpyris
#define bli_dcsaxpyris bli_rxaxpyris
#define bli_ccsaxpyris bli_roaxpyris
#define bli_zcsaxpyris bli_roaxpyris

#define bli_szsaxpyris bli_rxaxpyris
#define bli_dzsaxpyris bli_rxaxpyris
#define bli_czsaxpyris bli_roaxpyris
#define bli_zzsaxpyris bli_roaxpyris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyris bli_rxaxpyris
#define bli_dsdaxpyris bli_rxaxpyris
#define bli_csdaxpyris bli_rxaxpyris
#define bli_zsdaxpyris bli_rxaxpyris

#define bli_sddaxpyris bli_rxaxpyris
#define bli_dddaxpyris bli_rxaxpyris
#define bli_cddaxpyris bli_rxaxpyris
#define bli_zddaxpyris bli_rxaxpyris

#define bli_scdaxpyris bli_rxaxpyris
#define bli_dcdaxpyris bli_rxaxpyris
#define bli_ccdaxpyris bli_roaxpyris
#define bli_zcdaxpyris bli_roaxpyris

#define bli_szdaxpyris bli_rxaxpyris
#define bli_dzdaxpyris bli_rxaxpyris
#define bli_czdaxpyris bli_roaxpyris
#define bli_zzdaxpyris bli_roaxpyris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyris bli_rxaxpyris
#define bli_dscaxpyris bli_rxaxpyris
#define bli_cscaxpyris bli_rcaxpyris
#define bli_zscaxpyris bli_rcaxpyris

#define bli_sdcaxpyris bli_rxaxpyris
#define bli_ddcaxpyris bli_rxaxpyris
#define bli_cdcaxpyris bli_rcaxpyris
#define bli_zdcaxpyris bli_rcaxpyris

#define bli_sccaxpyris bli_craxpyris
#define bli_dccaxpyris bli_craxpyris
#define bli_cccaxpyris bli_cxaxpyris
#define bli_zccaxpyris bli_cxaxpyris

#define bli_szcaxpyris bli_craxpyris
#define bli_dzcaxpyris bli_craxpyris
#define bli_czcaxpyris bli_cxaxpyris
#define bli_zzcaxpyris bli_cxaxpyris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyris bli_rxaxpyris
#define bli_dszaxpyris bli_rxaxpyris
#define bli_cszaxpyris bli_rcaxpyris
#define bli_zszaxpyris bli_rcaxpyris

#define bli_sdzaxpyris bli_rxaxpyris
#define bli_ddzaxpyris bli_rxaxpyris
#define bli_cdzaxpyris bli_rcaxpyris
#define bli_zdzaxpyris bli_rcaxpyris

#define bli_sczaxpyris bli_craxpyris
#define bli_dczaxpyris bli_craxpyris
#define bli_cczaxpyris bli_cxaxpyris
#define bli_zczaxpyris bli_cxaxpyris

#define bli_szzaxpyris bli_craxpyris
#define bli_dzzaxpyris bli_craxpyris
#define bli_czzaxpyris bli_cxaxpyris
#define bli_zzzaxpyris bli_cxaxpyris


// Homogeneous-type entry points.
#define bli_saxpyris bli_sssaxpyris
#define bli_daxpyris bli_dddaxpyris
#define bli_caxpyris bli_cccaxpyris
#define bli_zaxpyris bli_zzzaxpyris

#endif

// end bli_axpyris.h
// begin bli_axpyjris.h


#ifndef BLIS_AXPYJRIS_H
#define BLIS_AXPYJRIS_H

// axpyjris
//
// y := y + a * conj(x) on separate real/imaginary components. Relative to the
// axpyris formulas, every term containing (xi) has its sign flipped.

// rx: accumulates (ar) * (xr) into (yr) only.
#define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
}

// cx: full complex product with conjugated x.
#define bli_cxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) + (ai) * (xi); \
	(yi) += (ai) * (xr) - (ar) * (xi); \
}

// ro: real part of the conjugated product accumulated into (yr) only.
#define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) + (ai) * (xi); \
}

// cr: x scaled by (ar) only, with the imaginary part negated.
#define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ar) * -(xi); \
}

// rc: only (xr) is referenced, so the body matches the non-conjugating form.
#define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyjris bli_rxaxpyjris
#define bli_dssaxpyjris bli_rxaxpyjris
#define bli_cssaxpyjris bli_rxaxpyjris
#define bli_zssaxpyjris bli_rxaxpyjris

#define bli_sdsaxpyjris bli_rxaxpyjris
#define bli_ddsaxpyjris bli_rxaxpyjris
#define bli_cdsaxpyjris bli_rxaxpyjris
#define bli_zdsaxpyjris bli_rxaxpyjris

#define bli_scsaxpyjris bli_rxaxpyjris
#define bli_dcsaxpyjris bli_rxaxpyjris
#define bli_ccsaxpyjris bli_roaxpyjris
#define bli_zcsaxpyjris bli_roaxpyjris

#define bli_szsaxpyjris bli_rxaxpyjris
#define bli_dzsaxpyjris bli_rxaxpyjris
#define bli_czsaxpyjris bli_roaxpyjris
#define bli_zzsaxpyjris bli_roaxpyjris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyjris bli_rxaxpyjris
#define bli_dsdaxpyjris bli_rxaxpyjris
#define bli_csdaxpyjris bli_rxaxpyjris
#define bli_zsdaxpyjris bli_rxaxpyjris

#define bli_sddaxpyjris bli_rxaxpyjris
#define bli_dddaxpyjris bli_rxaxpyjris
#define bli_cddaxpyjris bli_rxaxpyjris
#define bli_zddaxpyjris bli_rxaxpyjris

#define bli_scdaxpyjris bli_rxaxpyjris
#define bli_dcdaxpyjris bli_rxaxpyjris
#define bli_ccdaxpyjris bli_roaxpyjris
#define bli_zcdaxpyjris bli_roaxpyjris

#define bli_szdaxpyjris bli_rxaxpyjris
#define bli_dzdaxpyjris bli_rxaxpyjris
#define bli_czdaxpyjris bli_roaxpyjris
#define bli_zzdaxpyjris bli_roaxpyjris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjris bli_rxaxpyjris
#define bli_dscaxpyjris bli_rxaxpyjris
#define bli_cscaxpyjris bli_rcaxpyjris
#define bli_zscaxpyjris bli_rcaxpyjris

#define bli_sdcaxpyjris bli_rxaxpyjris
#define bli_ddcaxpyjris bli_rxaxpyjris
#define bli_cdcaxpyjris bli_rcaxpyjris
#define bli_zdcaxpyjris bli_rcaxpyjris

#define bli_sccaxpyjris bli_craxpyjris
#define bli_dccaxpyjris bli_craxpyjris
#define bli_cccaxpyjris bli_cxaxpyjris
#define bli_zccaxpyjris bli_cxaxpyjris

#define bli_szcaxpyjris bli_craxpyjris
#define bli_dzcaxpyjris bli_craxpyjris
#define bli_czcaxpyjris bli_cxaxpyjris
#define bli_zzcaxpyjris bli_cxaxpyjris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjris bli_rxaxpyjris
#define bli_dszaxpyjris bli_rxaxpyjris
#define bli_cszaxpyjris bli_rcaxpyjris
#define bli_zszaxpyjris bli_rcaxpyjris

#define bli_sdzaxpyjris bli_rxaxpyjris
#define bli_ddzaxpyjris bli_rxaxpyjris
#define bli_cdzaxpyjris bli_rcaxpyjris
#define bli_zdzaxpyjris bli_rcaxpyjris

#define bli_sczaxpyjris bli_craxpyjris
#define bli_dczaxpyjris bli_craxpyjris
#define bli_cczaxpyjris bli_cxaxpyjris
#define bli_zczaxpyjris bli_cxaxpyjris

#define bli_szzaxpyjris bli_craxpyjris
#define bli_dzzaxpyjris bli_craxpyjris
#define bli_czzaxpyjris bli_cxaxpyjris
#define bli_zzzaxpyjris bli_cxaxpyjris


// Homogeneous-type entry points.
#define bli_saxpyjris bli_sssaxpyjris
#define bli_daxpyjris bli_dddaxpyjris
#define bli_caxpyjris bli_cccaxpyjris
#define bli_zaxpyjris bli_zzzaxpyjris

#endif

// end bli_axpyjris.h
// begin bli_axmyris.h


#ifndef BLIS_AXMYRIS_H
#define BLIS_AXMYRIS_H

// axmyris
//
// y := y - a * x on separate real/imaginary components.

// Real forms: only (yr) is updated.
#define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
}

#define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
}

// Complex forms: the full complex product is subtracted.
#define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr) - (ai) * (xi); \
	(yi) -= (ai) * (xr) + (ar) * (xi); \
}

#define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr) - (ai) * (xi); \
	(yi) -= (ai) * (xr) + (ar) * (xi); \
}

// Mixed forms: both components of x are scaled by (ar) only; (ai) is not
// referenced.
#define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
	(yi) -= (ar) * (xi); \
}

#define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
	(yi) -= (ar) * (xi); \
}

#endif
// end bli_axmyris.h
// begin bli_conjris.h


#ifndef BLIS_CONJRIS_H
#define BLIS_CONJRIS_H

// conjris
//
// In-place conjugation. The s/d forms expand to an empty statement; the c/z
// forms negate (xi).

#define bli_sconjris( xr, xi ) \
{ \
	; \
}

#define bli_dconjris( xr, xi ) \
{ \
	; \
}

#define bli_cconjris( xr, xi ) \
{ \
	(xi) = -(xi); \
}

#define bli_zconjris( xr, xi ) \
{ \
	(xi) = -(xi); \
}

#endif
// end bli_conjris.h
// begin bli_copyris.h


#ifndef BLIS_COPYRIS_H
#define BLIS_COPYRIS_H

// copyris
//
// b := a on separate real/imaginary components. The s/d forms assign only
// (br); the c/z forms assign both (br) and (bi).

#define bli_scopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
}

#define bli_dcopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
}

#define bli_ccopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
	(bi) = (ai); \
}

#define bli_zcopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
	(bi) = (ai); \
}

// Two-letter forms: the second letter selects the base macro that is invoked.
// When the first letter is s or d, the caller's ai argument is discarded and a
// literal zero is forwarded in its place.
#define bli_sscopyris( ar, ai, br, bi )  bli_scopyris( ar, 0.0F, br, bi )
#define bli_dscopyris( ar, ai, br, bi )  bli_scopyris( ar, 0.0,  br, bi )
#define bli_cscopyris( ar, ai, br, bi )  bli_scopyris( ar, ai,   br, bi )
#define bli_zscopyris( ar, ai, br, bi )  bli_scopyris( ar, ai,   br, bi )

#define bli_sdcopyris( ar, ai, br, bi )  bli_dcopyris( ar, 0.0F, br, bi )
#define bli_ddcopyris( ar, ai, br, bi )  bli_dcopyris( ar, 0.0,  br, bi )
#define bli_cdcopyris( ar, ai, br, bi )  bli_dcopyris( ar, ai,   br, bi )
#define bli_zdcopyris( ar, ai, br, bi )  bli_dcopyris( ar, ai,   br, bi )

#define bli_sccopyris( ar, ai, br, bi )  bli_ccopyris( ar, 0.0F, br, bi )
#define bli_dccopyris( ar, ai, br, bi )  bli_ccopyris( ar, 0.0,  br, bi )
#define bli_cccopyris( ar, ai, br, bi )  bli_ccopyris( ar, ai,   br, bi )
#define bli_zccopyris( ar, ai, br, bi )  bli_ccopyris( ar, ai,   br, bi )

#define bli_szcopyris( ar, ai, br, bi )  bli_zcopyris( ar, 0.0F, br, bi )
#define bli_dzcopyris( ar, ai, br, bi )  bli_zcopyris( ar, 0.0,  br, bi )
#define bli_czcopyris( ar, ai, br, bi )  bli_zcopyris( ar, ai,   br, bi )
#define bli_zzcopyris( ar, ai, br, bi )  bli_zcopyris( ar, ai,   br, bi )

#endif
// end bli_copyris.h
// begin bli_copyjris.h


#ifndef BLIS_COPYJRIS_H
#define BLIS_COPYJRIS_H

// copyjris
//
// b := conj(a): forwards to the matching copyris macro with the imaginary
// argument negated.

#define bli_scopyjris( ar, ai, br, bi )  bli_scopyris( (ar), -(ai), (br), (bi) )
#define bli_dcopyjris( ar, ai, br, bi )  bli_dcopyris( (ar), -(ai), (br), (bi) )
#define bli_ccopyjris( ar, ai, br, bi )  bli_ccopyris( (ar), -(ai), (br), (bi) )
#define bli_zcopyjris( ar, ai, br, bi )  bli_zcopyris( (ar), -(ai), (br), (bi) )

// Two-letter forms: same letter scheme and zero substitution as the two-letter
// copyris macros.
#define bli_sscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, 0.0F, br, bi )
#define bli_dscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, 0.0,  br, bi )
#define bli_cscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, ai,   br, bi )
#define bli_zscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, ai,   br, bi )

#define bli_sdcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, 0.0F, br, bi )
#define bli_ddcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, 0.0,  br, bi )
#define bli_cdcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, ai,   br, bi )
#define bli_zdcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, ai,   br, bi )

#define bli_sccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, 0.0F, br, bi )
#define bli_dccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, 0.0,  br, bi )
#define bli_cccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, ai,   br, bi )
#define bli_zccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, ai,   br, bi )

#define bli_szcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, 0.0F, br, bi )
#define bli_dzcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, 0.0,  br, bi )
#define bli_czcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, ai,   br, bi )
#define bli_zzcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, ai,   br, bi )

#endif
// end bli_copyjris.h
// begin bli_copycjris.h


#ifndef BLIS_COPYCJRIS_H
#define BLIS_COPYCJRIS_H

// copycjris
//
// y := x, or y := conj(x) when bli_is_conj( conj ) is true. The s/d/i forms do
// not reference conj and always forward to the plain copy.

#define bli_scopycjris( conj, xr, xi, yr, yi ) \
{ \
	bli_scopyris( (xr), (xi), (yr), (yi) ); \
}

#define bli_dcopycjris( conj, xr, xi, yr, yi ) \
{ \
	bli_dcopyris( (xr), (xi), (yr), (yi) ); \
}

#define bli_ccopycjris( conj, xr, xi, yr, yi ) \
{ \
	(yr) = (xr); \
	(yi) = ( bli_is_conj( conj ) ? -(xi) \
	                             :  (xi) ); \
}

#define bli_zcopycjris( conj, xr, xi, yr, yi ) \
{ \
	(yr) = (xr); \
	(yi) = ( bli_is_conj( conj ) ? -(xi) \
	                             :  (xi) ); \
}

// NOTE(review): bli_icopyris is not defined alongside the s/d/c/z copyris
// macros; confirm it is provided elsewhere before using this form.
#define bli_icopycjris( conj, xr, xi, yr, yi ) \
{ \
	bli_icopyris( (xr), (xi), (yr), (yi) ); \
}

#endif
// end bli_copycjris.h
// begin bli_eqris.h


#ifndef BLIS_EQRIS_H
#define BLIS_EQRIS_H


// eqris (passed by value)
//
// Expression macros (no braces) that compare with exact ==. The s/d/i forms
// compare only the real parts; the c/z forms compare both parts.

#define bli_seqris( ar, ai, br, bi )  ( (ar) == (br) )
#define bli_deqris( ar, ai, br, bi )  ( (ar) == (br) )
#define bli_ceqris( ar, ai, br, bi )  ( (ar) == (br) && (ai) == (bi) )
#define bli_zeqris( ar, ai, br, bi )  ( (ar) == (br) && (ai) == (bi) )
#define bli_ieqris( ar, ai, br, bi )  ( (ar) == (br) )


// eq1ris

#define bli_seq1ris( ar, ai )  bli_seqris( (ar), (ai), 1.0F, 0.0F )
#define bli_deq1ris( ar, ai )  bli_deqris( (ar), (ai), 1.0,  0.0  )
#define bli_ceq1ris( ar, ai )  bli_ceqris( (ar), (ai), 1.0F, 0.0F )
#define bli_zeq1ris( ar, ai )  bli_zeqris( (ar), (ai), 1.0,  0.0  )
#define bli_ieq1ris( ar, ai )  bli_ieqris( (ar), (ai), 1,    0    )


// eq0ris

#define bli_seq0ris( ar, ai )  bli_seqris( (ar), (ai), 0.0F, 0.0F )
#define bli_deq0ris( ar, ai )  bli_deqris( (ar), (ai), 0.0,  0.0  )
#define bli_ceq0ris( ar, ai )  bli_ceqris( (ar), (ai), 0.0F, 0.0F )
#define bli_zeq0ris( ar, ai )  bli_zeqris( (ar), (ai), 0.0,  0.0  )
#define bli_ieq0ris( ar, ai )  bli_ieqris( (ar), (ai), 0,    0    )


// eqm1ris

#define bli_seqm1ris( ar, ai )  bli_seqris( (ar), (ai), -1.0F, 0.0F )
#define bli_deqm1ris( ar, ai )  bli_deqris( (ar), (ai), -1.0,  0.0  )
#define bli_ceqm1ris( ar, ai )  bli_ceqris( (ar), (ai), -1.0F, 0.0F )
#define bli_zeqm1ris( ar, ai )  bli_zeqris( (ar), (ai), -1.0,  0.0  )
#define bli_ieqm1ris( ar, ai )  bli_ieqris( (ar), (ai), -1,    0    )


#endif
// end bli_eqris.h
// begin bli_invertris.h


#ifndef BLIS_INVERTRIS_H
#define BLIS_INVERTRIS_H

// invertris
//
// In-place reciprocal of the real part; (xi) is not referenced by this form.

#define bli_sinvertris( xr, xi ) \
{ \
	(xr) = 1.0F / (xr); \
}
#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2ris bli_rxscal2ris #define bli_dssscal2ris bli_rxscal2ris #define bli_cssscal2ris bli_rxscal2ris #define bli_zssscal2ris bli_rxscal2ris #define bli_sdsscal2ris bli_rxscal2ris #define bli_ddsscal2ris bli_rxscal2ris #define bli_cdsscal2ris bli_rxscal2ris #define bli_zdsscal2ris bli_rxscal2ris #define bli_scsscal2ris bli_rxscal2ris #define bli_dcsscal2ris bli_rxscal2ris #define bli_ccsscal2ris bli_roscal2ris #define bli_zcsscal2ris bli_roscal2ris #define bli_szsscal2ris bli_rxscal2ris #define bli_dzsscal2ris bli_rxscal2ris #define bli_czsscal2ris bli_roscal2ris #define bli_zzsscal2ris bli_roscal2ris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2ris bli_rxscal2ris #define bli_dsdscal2ris bli_rxscal2ris #define bli_csdscal2ris bli_rxscal2ris #define bli_zsdscal2ris bli_rxscal2ris #define bli_sddscal2ris bli_rxscal2ris #define bli_dddscal2ris bli_rxscal2ris #define bli_cddscal2ris bli_rxscal2ris #define bli_zddscal2ris bli_rxscal2ris #define bli_scdscal2ris bli_rxscal2ris #define bli_dcdscal2ris bli_rxscal2ris #define bli_ccdscal2ris bli_roscal2ris #define bli_zcdscal2ris bli_roscal2ris #define bli_szdscal2ris bli_rxscal2ris #define bli_dzdscal2ris bli_rxscal2ris #define bli_czdscal2ris bli_roscal2ris #define bli_zzdscal2ris bli_roscal2ris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2ris bli_rxscal2ris #define bli_dscscal2ris bli_rxscal2ris #define bli_cscscal2ris bli_rcscal2ris #define bli_zscscal2ris bli_rcscal2ris #define bli_sdcscal2ris bli_rxscal2ris #define bli_ddcscal2ris bli_rxscal2ris #define bli_cdcscal2ris bli_rcscal2ris #define bli_zdcscal2ris bli_rcscal2ris #define bli_sccscal2ris bli_crscal2ris #define bli_dccscal2ris bli_crscal2ris #define bli_cccscal2ris bli_cxscal2ris #define bli_zccscal2ris bli_cxscal2ris #define bli_szcscal2ris bli_crscal2ris 
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Type-combination dispatch for scal2jris. Each entry is a function-like
// macro that forwards all six arguments, unchanged and in order, to one of
// the rx/ro/cr/rc/cx scal2jris kernels.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = (xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyris bli_rxxpbyris #define bli_dssxpbyris bli_rxxpbyris #define bli_cssxpbyris bli_rxxpbyris #define bli_zssxpbyris bli_rxxpbyris #define bli_sdsxpbyris bli_rxxpbyris #define bli_ddsxpbyris bli_rxxpbyris #define bli_cdsxpbyris bli_rxxpbyris #define bli_zdsxpbyris bli_rxxpbyris #define bli_scsxpbyris bli_rxxpbyris #define bli_dcsxpbyris bli_rxxpbyris #define bli_ccsxpbyris bli_rxxpbyris #define bli_zcsxpbyris bli_rxxpbyris #define bli_szsxpbyris bli_rxxpbyris #define bli_dzsxpbyris bli_rxxpbyris #define bli_czsxpbyris bli_rxxpbyris #define bli_zzsxpbyris bli_rxxpbyris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyris bli_rxxpbyris #define bli_dsdxpbyris bli_rxxpbyris #define bli_csdxpbyris bli_rxxpbyris #define bli_zsdxpbyris bli_rxxpbyris #define bli_sddxpbyris bli_rxxpbyris #define bli_dddxpbyris bli_rxxpbyris #define bli_cddxpbyris bli_rxxpbyris #define bli_zddxpbyris bli_rxxpbyris #define bli_scdxpbyris bli_rxxpbyris #define bli_dcdxpbyris bli_rxxpbyris #define bli_ccdxpbyris bli_rxxpbyris #define bli_zcdxpbyris bli_rxxpbyris #define bli_szdxpbyris bli_rxxpbyris #define bli_dzdxpbyris bli_rxxpbyris #define bli_czdxpbyris bli_rxxpbyris #define bli_zzdxpbyris bli_rxxpbyris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyris bli_rxxpbyris #define bli_dscxpbyris 
bli_rxxpbyris #define bli_cscxpbyris bli_crxpbyris #define bli_zscxpbyris bli_crxpbyris #define bli_sdcxpbyris bli_rxxpbyris #define bli_ddcxpbyris bli_rxxpbyris #define bli_cdcxpbyris bli_crxpbyris #define bli_zdcxpbyris bli_crxpbyris #define bli_sccxpbyris bli_cxxpbyris #define bli_dccxpbyris bli_cxxpbyris #define bli_cccxpbyris bli_cxxpbyris #define bli_zccxpbyris bli_cxxpbyris #define bli_szcxpbyris bli_cxxpbyris #define bli_dzcxpbyris bli_cxxpbyris #define bli_czcxpbyris bli_cxxpbyris #define bli_zzcxpbyris bli_cxxpbyris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyris bli_rxxpbyris #define bli_dszxpbyris bli_rxxpbyris #define bli_cszxpbyris bli_crxpbyris #define bli_zszxpbyris bli_crxpbyris #define bli_sdzxpbyris bli_rxxpbyris #define bli_ddzxpbyris bli_rxxpbyris #define bli_cdzxpbyris bli_crxpbyris #define bli_zdzxpbyris bli_crxpbyris #define bli_sczxpbyris bli_cxxpbyris #define bli_dczxpbyris bli_cxxpbyris #define bli_cczxpbyris bli_cxxpbyris #define bli_zczxpbyris bli_cxxpbyris #define bli_szzxpbyris bli_cxxpbyris #define bli_dzzxpbyris bli_cxxpbyris #define bli_czzxpbyris bli_cxxpbyris #define bli_zzzxpbyris bli_cxxpbyris #define bli_sxpbyris bli_sssxpbyris #define bli_dxpbyris bli_dddxpbyris #define bli_cxpbyris bli_cccxpbyris #define bli_zxpbyris bli_zzzxpbyris #endif // end bli_xpbyris.h // begin bli_xpbyjris.h #ifndef BLIS_XPBYJRIS_H #define BLIS_XPBYJRIS_H // xpbyjris #define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type 
of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjris bli_rxxpbyjris #define bli_dssxpbyjris bli_rxxpbyjris #define bli_cssxpbyjris bli_rxxpbyjris #define bli_zssxpbyjris bli_rxxpbyjris #define bli_sdsxpbyjris bli_rxxpbyjris #define bli_ddsxpbyjris bli_rxxpbyjris #define bli_cdsxpbyjris bli_rxxpbyjris #define bli_zdsxpbyjris bli_rxxpbyjris #define bli_scsxpbyjris bli_rxxpbyjris #define bli_dcsxpbyjris bli_rxxpbyjris #define bli_ccsxpbyjris bli_rxxpbyjris #define bli_zcsxpbyjris bli_rxxpbyjris #define bli_szsxpbyjris bli_rxxpbyjris #define bli_dzsxpbyjris bli_rxxpbyjris #define bli_czsxpbyjris bli_rxxpbyjris #define bli_zzsxpbyjris bli_rxxpbyjris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjris bli_rxxpbyjris #define bli_dsdxpbyjris bli_rxxpbyjris #define bli_csdxpbyjris bli_rxxpbyjris #define bli_zsdxpbyjris bli_rxxpbyjris #define bli_sddxpbyjris bli_rxxpbyjris #define bli_dddxpbyjris bli_rxxpbyjris #define bli_cddxpbyjris bli_rxxpbyjris #define bli_zddxpbyjris bli_rxxpbyjris #define bli_scdxpbyjris bli_rxxpbyjris #define bli_dcdxpbyjris bli_rxxpbyjris #define bli_ccdxpbyjris bli_rxxpbyjris #define bli_zcdxpbyjris bli_rxxpbyjris #define bli_szdxpbyjris bli_rxxpbyjris #define bli_dzdxpbyjris bli_rxxpbyjris #define bli_czdxpbyjris bli_rxxpbyjris #define bli_zzdxpbyjris bli_rxxpbyjris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjris bli_rxxpbyjris #define bli_dscxpbyjris bli_rxxpbyjris #define bli_cscxpbyjris bli_crxpbyjris #define bli_zscxpbyjris bli_crxpbyjris #define bli_sdcxpbyjris bli_rxxpbyjris #define bli_ddcxpbyjris bli_rxxpbyjris #define bli_cdcxpbyjris bli_crxpbyjris #define bli_zdcxpbyjris bli_crxpbyjris #define bli_sccxpbyjris bli_cxxpbyjris #define bli_dccxpbyjris bli_cxxpbyjris #define bli_cccxpbyjris 
bli_cxxpbyjris #define bli_zccxpbyjris bli_cxxpbyjris #define bli_szcxpbyjris bli_cxxpbyjris #define bli_dzcxpbyjris bli_cxxpbyjris #define bli_czcxpbyjris bli_cxxpbyjris #define bli_zzcxpbyjris bli_cxxpbyjris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjris bli_rxxpbyjris #define bli_dszxpbyjris bli_rxxpbyjris #define bli_cszxpbyjris bli_crxpbyjris #define bli_zszxpbyjris bli_crxpbyjris #define bli_sdzxpbyjris bli_rxxpbyjris #define bli_ddzxpbyjris bli_rxxpbyjris #define bli_cdzxpbyjris bli_crxpbyjris #define bli_zdzxpbyjris bli_crxpbyjris #define bli_sczxpbyjris bli_cxxpbyjris #define bli_dczxpbyjris bli_cxxpbyjris #define bli_cczxpbyjris bli_cxxpbyjris #define bli_zczxpbyjris bli_cxxpbyjris #define bli_szzxpbyjris bli_cxxpbyjris #define bli_dzzxpbyjris bli_cxxpbyjris #define bli_czzxpbyjris bli_cxxpbyjris #define bli_zzzxpbyjris bli_cxxpbyjris #define bli_sxpbyjris bli_sssxpbyjris #define bli_dxpbyjris bli_dddxpbyjris #define bli_cxpbyjris bli_cccxpbyjris #define bli_zxpbyjris bli_zzzxpbyjris #endif // end bli_xpbyjris.h // Inlined scalar macros in loops // begin bli_scal2ris_mxn.h #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j 
)*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif // end bli_scal2ris_mxn.h // begin bli_scalris_mxn_uplo.h #ifndef 
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
// Real destination (a is s or d): the result goes to the real part of a.
// For real x a literal zero is passed in the imaginary-output position; for
// complex x a scratch variable ti is passed there and then discarded.
#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F )
#define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F )
#define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; }
#define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; }

#define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 )
#define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 )
#define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; }
#define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destination, struct-based complex types: forward the real and
// imaginary parts of a to the absq2ris kernel selected by the type of x.
#define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destination, C99 complex types: compute x*x (real x) or
// real(x)^2 + imag(x)^2 (complex x) inline and store it with a zero
// imaginary part through the matching sets macro.
#define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) )
#define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) )
#define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \
                                         bli_cimag(x) * bli_cimag(x), 0.0, (a) )
#define bli_zcabsq2s( x, a ) bli_zcsets( bli_zreal(x) * bli_zreal(x) + \
                                         bli_zimag(x) * bli_zimag(x), 0.0, (a) )

#define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) )
#define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) )
#define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \
                                         bli_cimag(x) * bli_cimag(x), 0.0, (a) )
#define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \
                                         bli_zimag(x) * bli_zimag(x), 0.0, (a) )

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands.
#define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a )
#define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a )
#define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a )
#define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a )

#endif
// end bli_absq2s.h
// begin bli_abval2s.h


#ifndef BLIS_ABVAL2S_H
#define BLIS_ABVAL2S_H

// abval2s
//
// Absolute value of x stored into a. Unlike absq2s above, every type
// combination (including the real-destination ones) is switched on
// BLIS_ENABLE_C99_COMPLEX.

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct-based complex types: forward to the abval2ris kernel selected by
// the type of x. Real destinations pass a literal zero (real x) or a
// discarded scratch variable ti (complex x) as the imaginary output.
#define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F )
#define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F )
#define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; }
#define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; }

#define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 )
#define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 )
#define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; }
#define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; }

#define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 complex types: take the absolute value with the libm routine that
// matches the type of x (fabsf / fabs / cabsf / cabs) and store it with a
// zero imaginary part through the matching sets macro.
#define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) )
#define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) )
#define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) )
#define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) )

#define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) )
#define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) )
#define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) )
#define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) )

#define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) )
#define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) )
#define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) )
#define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) )

#define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) )
#define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) )
#define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) )
#define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) )

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands.
#define bli_sabval2s( x, a ) bli_ssabval2s( x, a )
#define bli_dabval2s( x, a ) bli_ddabval2s( x, a )
#define bli_cabval2s( x, a ) bli_ccabval2s( x, a )
#define bli_zabval2s( x, a ) bli_zzabval2s( x, a )

#endif
// end bli_abval2s.h
// begin bli_adds.h


#ifndef BLIS_ADDS_H
#define BLIS_ADDS_H

// adds
//
// Add a into y by forwarding the real/imaginary parts of both operands to
// the addris kernel selected by the type of y.

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// Real destination (y is s or d).
#define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destination, struct-based complex types.
#define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destination, C99 complex types: use the native += operator.
#define bli_scadds( a, y ) { (y) += (a); }
#define bli_dcadds( a, y ) { (y) += (a); }
#define bli_ccadds( a, y ) { (y) += (a); }
#define bli_zcadds( a, y ) { (y) += (a); }

#define bli_szadds( a, y ) { (y) += (a); }
#define bli_dzadds( a, y ) { (y) += (a); }
#define bli_czadds( a, y ) { (y) += (a); }
#define bli_zzadds( a, y ) { (y) += (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands.
#define bli_sadds( a, y ) bli_ssadds( a, y )
#define bli_dadds( a, y ) bli_ddadds( a, y )
#define bli_cadds( a, y ) bli_ccadds( a, y )
#define bli_zadds( a, y ) bli_zzadds( a, y )

#endif
// end bli_adds.h
// begin bli_addjs.h


#ifndef BLIS_ADDJS_H
#define BLIS_ADDJS_H

// addjs
//
// Conjugating counterpart of adds: forwards to the addjris kernel selected
// by the type of y.

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// Real destination (y is s or d).
#define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destination, struct-based complex types.
#define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzaddjs( a, y ) bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destination, C99 complex types: real a is added as-is; complex a
// is conjugated with conjf (c) or conj (z) before the native +=.
#define bli_scaddjs( a, y ) { (y) += (a); }
#define bli_dcaddjs( a, y ) { (y) += (a); }
#define bli_ccaddjs( a, y ) { (y) += conjf(a); }
#define bli_zcaddjs( a, y ) { (y) += conj (a); }

#define bli_szaddjs( a, y ) { (y) += (a); }
#define bli_dzaddjs( a, y ) { (y) += (a); }
#define bli_czaddjs( a, y ) { (y) += conjf(a); }
#define bli_zzaddjs( a, y ) { (y) += conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands.
#define bli_saddjs( a, y ) bli_ssaddjs( a, y )
#define bli_daddjs( a, y ) bli_ddaddjs( a, y )
#define bli_caddjs( a, y ) bli_ccaddjs( a, y )
#define bli_zaddjs( a, y ) bli_zzaddjs( a, y )

#endif
// end bli_addjs.h
// begin bli_add3s.h


#ifndef BLIS_ADD3S_H
#define BLIS_ADD3S_H

// add3s

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of b.
// - The third char encodes the type of c.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )
#define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )
#define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )
#define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) )

#define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) )
#define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) )
#define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) )
#define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), 
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )
// The remaining ??d variants below access c through bli_dreal()/bli_dimag()
// and all forward to bli_dadd3ris().
#define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )

// The complex-target (??c, ??z) variants come in two flavours, selected by
// BLIS_ENABLE_C99_COMPLEX:
// - macro undefined: each variant splits a, b and c into real/imaginary parts
//   with the bli_?real()/bli_?imag() accessors matching its three type
//   characters and forwards the six components to a bli_?add3ris() kernel;
// - macro defined: each variant is the direct assignment (c) = (a) + (b).
// NOTE(review): the bli_?add3ris() kernels are defined elsewhere; their exact
// effect on the imaginary part of c is not visible here -- confirm.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (abc) = (??c) ------------------------------------------------------------
// Target c is accessed through bli_creal()/bli_cimag(). When both a and b are
// real types (s or d) the variant forwards to bli_sadd3ris(); as soon as either
// operand is complex (c or z) it forwards to bli_cadd3ris().

#define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )

// -- (abc) = (??z) ------------------------------------------------------------
// Target c is accessed through bli_zreal()/bli_zimag(). When both a and b are
// real types (s or d) the variant forwards to bli_dadd3ris(); as soon as either
// operand is complex (c or z) it forwards to bli_zadd3ris().

#define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- (abc) = (??c) ------------------------------------------------------------
// Every variant has the same body: the sum (a) + (b) is assigned to (c), with
// each argument parenthesized in the expansion. The expansion is a
// brace-enclosed block rather than an expression, so these macros can only be
// used where a statement is allowed.

#define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); }

// -- (abc) = (??z) ------------------------------------------------------------
// Same body as the (??c) group: (c) = (a) + (b) inside a brace-enclosed block.

#define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Homogeneous shorthands: bli_?add3s() is the variant in which a, b and c all
// share the same type character.
#define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c )
#define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c )
#define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c )
#define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c )

#endif
// end bli_add3s.h
// begin bli_axpbys.h

// Include guard for the axpbys scalar macros.
#ifndef BLIS_AXPBYS_H
#define BLIS_AXPBYS_H

// axpbys

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of b.
// - The fourth char encodes the type of y.
// - Each macro splits its four operands into real/imaginary parts with the
//   bli_?real()/bli_?imag() accessors matching those type characters and
//   forwards all eight components to a shared kernel macro: bli_rxaxpbyris()
//   when y is a real type (s or d), bli_cxaxpbyris() when y is complex (c or z).
// NOTE(review): both kernels are defined elsewhere; presumably they compute
// y := a * x + b * y -- confirm against their definitions.

// -- (axby) = (???s) ----------------------------------------------------------
// Target y is accessed through bli_sreal()/bli_simag(); every combination
// forwards to bli_rxaxpbyris().

#define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )

#define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )

#define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )

#define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (axby) = (???d) ----------------------------------------------------------
// Target y is accessed through bli_dreal()/bli_dimag(); every combination
// forwards to bli_rxaxpbyris().

#define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )

#define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )

// The complex-target variants below are compiled only when
// BLIS_ENABLE_C99_COMPLEX is not defined.
// NOTE(review): the matching #else/#endif lies further down the file;
// presumably it supplies native-complex versions of the same names -- confirm.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axby) = (???c) ----------------------------------------------------------
// Target y is accessed through bli_creal()/bli_cimag(); every combination
// forwards to bli_cxaxpbyris().

#define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )

#define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )

#define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )

#define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )

// -- (axby) = (???z) ----------------------------------------------------------
// Target y is accessed through bli_zreal()/bli_zimag(); the variants here
// forward to bli_cxaxpbyris(), the same kernel the (???c) group uses.

#define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccscaxpbys( 
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y ) #define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y ) #define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y ) #define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y ) #endif // end bli_axpbys.h // begin bli_axpbyjs.h #ifndef BLIS_AXPBYJS_H #define BLIS_AXPBYJS_H // axpbyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), 
bli_simag(y) ) #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) #define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) #define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) #define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) #endif // end bli_axpbyjs.h // begin bli_axpys.h #ifndef BLIS_AXPYS_H #define BLIS_AXPYS_H // axpys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), 
bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } #define 
bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) #define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) #define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) #define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) #endif // end bli_axpys.h // begin bli_axpyjs.h #ifndef BLIS_AXPYJS_H #define BLIS_AXPYJS_H // axpyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Overview of the axpyjs macro table:
// - Each macro is named bli_<a><x><y>axpyjs, where the three type chars give
//   the types of a, x and y respectively (s, d, c or z).
//   NOTE(review): presumably s/d are single/double precision real and c/z are
//   single/double precision complex; this is consistent with the use of
//   conjf() for c-typed x and conj() for z-typed x in the C99 branch below,
//   but the type definitions are not visible here -- confirm.
// - When y is real (s or d), and when y is complex but
//   BLIS_ENABLE_C99_COMPLEX is not defined, each macro splits a, x and y into
//   real/imaginary components via the bli_?real()/bli_?imag() accessors and
//   forwards all six components to a bli_*axpyjris helper defined elsewhere.
//   NOTE(review): those helpers presumably compute the same update as the C99
//   branch, y += a * conj(x), on separated components -- verify against
//   their definitions.
// - When BLIS_ENABLE_C99_COMPLEX is defined, the complex-y macros operate on
//   the operands directly: { (y) += (a) * (x); }, with complex-typed x
//   wrapped in conjf()/conj().

// -- (axy) = (??s) ------------------------------------------------------------

// y is s-typed: every (a,x) combination forwards to bli_saxpyjris.

#define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

// y is d-typed: every (a,x) combination forwards to bli_daxpyjris.

#define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

// y is c-typed, component-wise branch. The helper is selected by the types of
// a and x:
// - a in {s,d}, x in {s,d}: bli_saxpyjris
// - a in {s,d}, x in {c,z}: bli_scaxpyjris
// - a in {c,z}, any x:      bli_caxpyjris

#define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

// y is z-typed, component-wise branch. The helper is selected by the types of
// a and x:
// - a in {s,d}, x in {s,d}: bli_daxpyjris
// - a in {s,d}, x in {c,z}: bli_dzaxpyjris
// - a in {c,z}, any x:      bli_zaxpyjris

#define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 branch: the operands are used directly in a single compound assignment.
// The conjugation function depends only on the type of x: none for s/d-typed
// x, conjf() for c-typed x, conj() for z-typed x -- regardless of the types
// of a and y.
// Each macro expands to a braced block rather than an expression, so a call
// site written as "bli_cccaxpyjs( a, x, y );" produces the block followed by
// an empty statement; take care when using one as the unbraced body of an
// "if" that has an "else".

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); }

#define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); }

#define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }

#define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); }

#define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); }

#define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }

#define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type shorthands: bli_?axpyjs uses the same type char for a, x and y.

#define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y )
#define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y )
#define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y )
#define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y )

// Closes the BLIS_AXPYJS_H include guard opened before the axpyjs notes.
#endif
// end bli_axpyjs.h

// begin bli_axmys.h

#ifndef BLIS_AXMYS_H
#define BLIS_AXMYS_H

// axmys

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), 
bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxmys( a, x, 
y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), 
bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxmys( 
a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); } 
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y ) #define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y ) #define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y ) #define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y ) #endif // end bli_axmys.h // begin bli_conjs.h #ifndef BLIS_CONJS_H #define BLIS_CONJS_H // conjs #define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) ) #define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) ) #define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) { (x) = conjf(x); } #define bli_zconjs( x ) { (x) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_conjs.h // begin bli_copys.h #ifndef BLIS_COPYS_H #define BLIS_COPYS_H // copys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Destination y is single-precision real: forward to bli_scopyris().
#define bli_sscopys( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopys( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopys( x, y )  bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopys( x, y )  bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// Destination y is double-precision real: forward to bli_dcopyris().
#define bli_sdcopys( x, y )  bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopys( x, y )  bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopys( x, y )  bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopys( x, y )  bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Destination y is single-precision complex.
// NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero.
#define bli_sccopys( x, y )  bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopys( x, y )  bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopys( x, y )  bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopys( x, y )  bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopys( x, y ) bli_sscopys( x, y ) #define bli_dcopys( x, y ) bli_ddcopys( x, y ) #define bli_ccopys( x, y ) bli_cccopys( x, y ) #define bli_zcopys( x, y ) bli_zzcopys( x, y ) #define bli_icopys( x, y ) bli_iicopys( x, y ) #endif // end bli_copys.h // begin bli_copyjs.h #ifndef BLIS_COPYJS_H #define BLIS_COPYJS_H // copyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), 
bli_creal(y), bli_cimag(y) ) #define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) { (y) = (x); } #define bli_dccopyjs( x, y ) { (y) = (x); } #define bli_cccopyjs( x, y ) { (y) = conjf(x); } #define bli_zccopyjs( x, y ) { (y) = conj (x); } #define bli_szcopyjs( x, y ) { (y) = (x); } #define bli_dzcopyjs( x, y ) { (y) = (x); } #define bli_czcopyjs( x, y ) { (y) = conjf(x); } #define bli_zzcopyjs( x, y ) { (y) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjs( x, y ) bli_sscopyjs( x, y ) #define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y ) #define bli_ccopyjs( x, y ) bli_cccopyjs( x, y ) #define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y ) #define bli_icopyjs( x, y ) bli_iicopyjs( x, y ) #endif // end bli_copyjs.h // begin bli_copycjs.h #ifndef BLIS_COPYCJS_H #define BLIS_COPYCJS_H // copycjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) { (y) = (x); } 
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); } #define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #define bli_szcopycjs( conjx, x, y ) { (y) = (x); } #define bli_dzcopycjs( conjx, x, y ) { (y) = (x); } #define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); } #define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y ) #define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y ) #define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y ) #define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y ) #define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y ) #endif // end bli_copycjs.h // begin bli_copynzs.h #ifndef BLIS_COPYNZS_H #define BLIS_COPYNZS_H // copynzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Destination y is single-precision real: forward to bli_scopyris().
#define bli_sscopynzs( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopynzs( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopynzs( x, y )  bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopynzs( x, y )  bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// Destination y is double-precision real: forward to bli_dcopyris().
#define bli_sdcopynzs( x, y )  bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopynzs( x, y )  bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopynzs( x, y )  bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopynzs( x, y )  bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Destination y is single-precision complex. A real x goes through the real
// copy macro; a complex x goes through bli_ccopyris().
// NOTE: Use of scopyris() is so we don't touch the imaginary part of y.
#define bli_sccopynzs( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopynzs( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopynzs( x, y )  bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopynzs( x, y )  bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopynzs( x, y ) bli_sscopynzs( x, y ) #define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y ) #define bli_ccopynzs( x, y ) bli_cccopynzs( x, y ) #define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y ) #define bli_icopynzs( x, y ) bli_iicopynzs( x, y ) #endif // end bli_copynzs.h // begin bli_copyjnzs.h #ifndef BLIS_COPYJNZS_H #define BLIS_COPYJNZS_H // copyjnzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we // don't touch the imaginary part of y. 
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we // don't touch the imaginary part of y. #define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y ) #define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y ) #define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y ) #define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y ) #define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y ) #endif // end bli_copyjnzs.h // begin bli_dots.h #ifndef BLIS_DOTS_H #define BLIS_DOTS_H // dots // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a ) #define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a ) #define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a ) #define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a ) #define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a ) #define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a ) #define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a ) #define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a ) #define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a ) #define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a ) #define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a ) #define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a ) #define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a ) #define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a ) #define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a ) #define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a ) #define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a ) #define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a ) #define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a ) #define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a ) #define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a ) #define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a ) #define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a ) #define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a ) #define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a ) #define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a ) #define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a ) #define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a ) #define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a ) #define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a ) #define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a ) #define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a ) #define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a ) #define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a ) #define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a ) #define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a ) #define 
bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a ) #define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a ) #define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a ) #define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a ) #define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a ) #define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a ) #define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a ) #define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a ) #define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a ) #define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a ) #define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a ) #define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a ) #define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a ) #define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a ) #define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a ) #define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a ) #define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a ) #define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a ) #define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a ) #define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a ) #define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a ) #define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a ) #define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a ) #define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a ) #define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a ) #define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a ) #define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a ) #define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a ) #define bli_sdots( x, y, a ) bli_sssdots( x, y, a ) #define bli_ddots( x, y, a ) bli_ddddots( x, y, a ) #define bli_cdots( x, y, a ) bli_cccdots( x, y, a ) #define bli_zdots( x, y, a ) bli_zzzdots( x, y, a ) #endif // end bli_dots.h // begin bli_dotjs.h #ifndef BLIS_DOTJS_H #define BLIS_DOTJS_H // dotjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
// - x is used in conjugated form.
// - Each bli_<c1><c2><c3>dotjs( x, y, a ) expands to
//   bli_<c2><c1><c3>axpyjs( y, x, a ): the first two type characters and the
//   first two arguments are swapped. The axpyjs macros are defined outside
//   this section.

// -- third type character: s --
#define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a )
#define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a )
#define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a )
#define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a )
#define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a )
#define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a )
#define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a )
#define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a )
#define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a )
#define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a )
#define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a )
#define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a )
#define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a )
#define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a )
#define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a )
#define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a )

// -- third type character: d --
#define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a )
#define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a )
#define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a )
#define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a )
#define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a )
#define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a )
#define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a )
#define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a )
#define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a )
#define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a )
#define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a )
#define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a )
#define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a )
#define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a )
#define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a )
#define bli_zzddotjs( x, y, a ) bli_zzdaxpyjs( y, x, a )

// -- third type character: c --
#define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a )
#define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a )
#define bli_cscdotjs( x, y, a ) bli_sccaxpyjs( y, x, a )
#define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a )
#define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a )
#define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a )
#define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a )
#define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a )
#define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a )
#define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a )
#define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a )
#define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a )
#define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a )
#define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a )
#define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a )
#define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a )

// -- third type character: z --
#define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a )
#define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a )
#define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a )
#define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a )
#define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a )
#define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a )
#define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a )
#define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a )
#define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a )
#define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a )
#define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a )
#define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a )
#define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a )
#define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a )
#define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a )
#define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a )

// Single-character shorthands: select the variant in which all three type
// characters are the same.
#define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a )
#define bli_ddotjs( x, y, a ) bli_ddddotjs( x, y, a )
#define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a )
#define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a )

#endif
// end bli_dotjs.h
// begin bli_eq.h

#ifndef BLIS_EQ_H
#define BLIS_EQ_H

// eq (passed by value)
// Expression macros that compare two scalars with ==. Without
// BLIS_ENABLE_C99_COMPLEX the complex variants compare the real and imaginary
// parts separately via the bli_?real / bli_?imag accessors (defined outside
// this section); with it they compare the values directly.

#define bli_seq( a, b ) ( (a) == (b) )
#define bli_deq( a, b ) ( (a) == (b) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) )
#define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_ceq( a, b ) ( (a) == (b) )
#define bli_zeq( a, b ) ( (a) == (b) )

#endif // BLIS_ENABLE_C99_COMPLEX

#define bli_ieq( a, b ) ( (a) == (b) )

// eqtori (passed by value)
// Compare a against a value given as separate real (br) and imaginary (bi)
// parts. The real variants compare only against br; bi is not used in their
// expansion.

#define bli_seqtori( a, br, bi ) ( (a) == (br) )
#define bli_deqtori( a, br, bi ) ( (a) == (br) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) )
#define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )
#define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )

#endif // BLIS_ENABLE_C99_COMPLEX

// eqa (passed by address)
// Cast both pointer arguments to the type named by the prefix, dereference
// them, and compare the pointed-to values with the matching by-value macro.

#define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) )
#define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) )
#define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) )
#define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) )
#define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) )

// eq1
// True when a equals 1 (real part 1, imaginary part 0 for complex types).

#define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F )
#define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 )
#define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F )
#define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 )
#define bli_ieq1( a ) bli_ieq ( (a), 1 )

// eq0
// True when a equals 0 (both parts 0 for complex types).

#define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F )
#define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 )
#define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F )
#define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 )
#define bli_ieq0( a ) bli_ieq ( (a), 0 )

// eqm1
// True when a equals -1 (real part -1, imaginary part 0 for complex types).

#define bli_seqm1( a ) bli_seqtori( (a), -1.0F, 0.0F )
#define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 )
#define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F )
#define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 )
#define bli_ieqm1( a ) bli_ieq ( (a), -1 )

#endif
// end bli_eq.h
// begin bli_fprints.h

#ifndef BLIS_FPRINTS_H
#define BLIS_FPRINTS_H

// prints
// Statement macros that write x to the stream `file` with fprintf, using
// `spec` as the format string. The s/d/i variants issue one fprintf call. The
// c/z variants print the real part with `spec`, the literal " + ", the
// imaginary part with `spec`, and then a single space.

#define bli_sfprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}
#define bli_dfprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}
#define bli_cfprints( file, spec, x ) \
{ \
    fprintf( file, spec, bli_creal(x) ); \
    fprintf( file, " + " ); \
    fprintf( file, spec, bli_cimag(x) ); \
    fprintf( file, " " ); \
}
#define bli_zfprints( file, spec, x ) \
{ \
    fprintf( file, spec, bli_zreal(x) ); \
    fprintf( file, " + " ); \
    fprintf( file, spec, bli_zimag(x) ); \
    fprintf( file, " " ); \
}
#define bli_ifprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}

#endif
// end bli_fprints.h
// begin bli_inverts.h

#ifndef BLIS_INVERTS_H
#define BLIS_INVERTS_H

// inverts

// Notes:
// - The first char encodes the type of x.
// - The real variants, and the complex variants when BLIS_ENABLE_C99_COMPLEX
//   is not defined, pass the real and imaginary parts of x to
//   bli_?invertris (defined outside this section).
// - With BLIS_ENABLE_C99_COMPLEX the complex variants overwrite x with 1 / x.

#define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) )
#define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) )
#define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_cinverts( x ) { (x) = 1.0F / (x); }
#define bli_zinverts( x ) { (x) = 1.0 / (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

#endif
// end bli_inverts.h
// begin bli_invscals.h

#ifndef BLIS_INVSCALS_H
#define BLIS_INVSCALS_H

// invscals

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - Variants with a real y (second char s or d) pass the real and imaginary
//   parts of a and y to bli_sinvscalris / bli_dinvscalris.
// - Variants with a complex y depend on BLIS_ENABLE_C99_COMPLEX: without it
//   they pass the parts to a bli_*invscalris macro; with it they expand to
//   the in-place division (y) /= (a).
// - The bli_*invscalris macros and the bli_?real / bli_?imag accessors are
//   defined outside this section.

#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// A real a (first char s or d) selects the bli_scinvscalris / bli_dzinvscalris
// forms; a complex a selects bli_cinvscalris / bli_zinvscalris.
#define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscals( a, y ) bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_scinvscals( a, y ) { (y) /= (a); }
#define bli_dcinvscals( a, y ) { (y) /= (a); }
#define bli_ccinvscals( a, y ) { (y) /= (a); }
#define bli_zcinvscals( a, y ) { (y) /= (a); }

#define bli_szinvscals( a, y ) { (y) /= (a); }
#define bli_dzinvscals( a, y ) { (y) /= (a); }
#define bli_czinvscals( a, y ) { (y) /= (a); }
#define bli_zzinvscals( a, y ) { (y) /= (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a and y have the same type.
#define bli_sinvscals( a, y ) bli_ssinvscals( a, y )
#define bli_dinvscals( a, y ) bli_ddinvscals( a, y )
#define bli_cinvscals( a, y ) bli_ccinvscals( a, y )
#define bli_zinvscals( a, y ) bli_zzinvscals( a, y )

#endif
// end bli_invscals.h
// begin bli_invscaljs.h

#ifndef BLIS_INVSCALJS_H
#define BLIS_INVSCALJS_H

// invscaljs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - Same layout as the invscals table, but targeting the bli_*invscaljris
//   macros (defined outside this section). In the C99 branch the variants
//   with a complex a divide y by conjf(a) / conj(a); the variants with a real
//   a divide by a itself.

#define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_scinvscaljs( a, y ) { (y) /= (a); }
#define bli_dcinvscaljs( a, y ) { (y) /= (a); }
#define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zcinvscaljs( a, y ) { (y) /= conj (a); }

#define bli_szinvscaljs( a, y ) { (y) /= (a); }
#define bli_dzinvscaljs( a, y ) { (y) /= (a); }
#define bli_czinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zzinvscaljs( a, y ) { (y) /= conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a and y have the same type.
#define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y )
#define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y )
#define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y )
#define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y )

#endif
// end bli_invscaljs.h
// begin bli_neg2s.h

#ifndef BLIS_NEG2S_H
#define BLIS_NEG2S_H

// neg2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) { (y) = -(x); } #define bli_dcneg2s( x, y ) { (y) = -(x); } #define bli_ccneg2s( x, y ) { (y) = -(x); } #define bli_zcneg2s( x, y ) { (y) = -(x); } #define bli_szneg2s( x, y ) { (y) = -(x); } #define bli_dzneg2s( x, y ) { (y) = -(x); } #define bli_czneg2s( x, y ) { (y) = -(x); } #define bli_zzneg2s( x, y ) { (y) = 
-(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sneg2s( x, y ) bli_ssneg2s( x, y ) #define bli_dneg2s( x, y ) bli_ddneg2s( x, y ) #define bli_cneg2s( x, y ) bli_ccneg2s( x, y ) #define bli_zneg2s( x, y ) bli_zzneg2s( x, y ) #endif // end bli_neg2s.h // begin bli_rands.h #ifndef BLIS_RANDS_H #define BLIS_RANDS_H // rands #define bli_srands( a ) \ { \ (a) = ( float ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0F; \ } #define bli_drands( a ) \ { \ (a) = ( double ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0; \ } #define bli_crands( a ) \ { \ float ar, ai; \ \ bli_srands( ar ); \ bli_srands( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrands( a ) \ { \ double ar, ai; \ \ bli_drands( ar ); \ bli_drands( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_rands.h // begin bli_randnp2s.h #ifndef BLIS_RANDNP2S_H #define BLIS_RANDNP2S_H // randnp2s #define bli_srandnp2s( a ) \ { \ bli_drandnp2s( a ); \ } #if 0 #define bli_drandnp2s_prev( a ) \ { \ const double m_max = 3.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ if ( t == m_max2 ) t = t - 1.0; \ \ \ t = floor( t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_exp, s_val; \ \ \ PASTEMAC(d,rands)( s_exp ); \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \ else r_val = pow( 2.0, t - 1.0 ); \ \ \ if ( s_val < 0.0 ) r_val = -r_val; \ } \ \ \ r_val = r_val / pow( 2.0, m_max ); \ \ \ \ a = r_val; \ } #endif #define bli_drandnp2s( a ) \ { \ const double m_max = 6.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ do \ { \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ t = floor( t ); \ } \ \ while ( m_max2 <= t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_val; \ \ \ r_val = pow( 2.0, -(t - 1.0) ); \ \ \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_val < 0.0 ) r_val 
= -r_val; \ } \ \ \ \ a = r_val; \ } #define bli_crandnp2s( a ) \ { \ float ar, ai; \ \ bli_srandnp2s( ar ); \ bli_srandnp2s( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrandnp2s( a ) \ { \ double ar, ai; \ \ bli_drandnp2s( ar ); \ bli_drandnp2s( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_randnp2s.h // begin bli_scals.h #ifndef BLIS_SCALS_H #define BLIS_SCALS_H // scals // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), 
bli_zimag(y) ) #define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) { (y) *= (a); } #define bli_dcscals( a, y ) { (y) *= (a); } #define bli_ccscals( a, y ) { (y) *= (a); } #define bli_zcscals( a, y ) { (y) *= (a); } #define bli_szscals( a, y ) { (y) *= (a); } #define bli_dzscals( a, y ) { (y) *= (a); } #define bli_czscals( a, y ) { (y) *= (a); } #define bli_zzscals( a, y ) { (y) *= (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscals( a, y ) bli_ssscals( a, y ) #define bli_dscals( a, y ) bli_ddscals( a, y ) #define bli_cscals( a, y ) bli_ccscals( a, y ) #define bli_zscals( a, y ) bli_zzscals( a, y ) #endif // end bli_scals.h // begin bli_scaljs.h #ifndef BLIS_SCALJS_H #define BLIS_SCALJS_H // scaljs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscaljs( a, y ) 
bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) { (y) *= (a); } #define bli_dcscaljs( a, y ) { (y) *= (a); } #define bli_ccscaljs( a, y ) { (y) *= conjf(a); } #define bli_zcscaljs( a, y ) { (y) *= conj (a); } #define bli_szscaljs( a, y ) { (y) *= (a); } #define bli_dzscaljs( a, y ) { (y) *= (a); } #define bli_czscaljs( a, y ) { (y) *= conjf(a); } #define bli_zzscaljs( a, y ) { (y) *= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscaljs( a, y ) bli_ssscaljs( a, y ) #define bli_dscaljs( a, y ) bli_ddscaljs( a, y ) #define bli_cscaljs( a, y ) bli_ccscaljs( a, y ) #define bli_zscaljs( a, y ) bli_zzscaljs( a, y ) #endif // end bli_scaljs.h // begin bli_scalcjs.h #ifndef BLIS_SCALCJS_H #define BLIS_SCALCJS_H // scalcjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// - conjx is forwarded as the first argument of the bli_*scalcjris macros
//   (defined outside this section).
// - In the C99 branch conjx is consulted, through bli_is_conj(), only by the
//   variants with a complex x: they multiply y by conjf(x) / conj(x) when
//   bli_is_conj( conjx ) is true and by x otherwise. The variants with a real
//   x multiply by x and do not use conjx in their expansion.

#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_scscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#define bli_szscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: x and y have the same type.
#define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y )
#define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y )
#define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y )
#define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y )

#endif
// end bli_scalcjs.h
// begin bli_scal2s.h

#ifndef BLIS_SCAL2S_H
#define BLIS_SCAL2S_H

// scal2s

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// - Every non-C99 entry passes six arguments to one of five *scal2ris macros
//   (defined outside this section): the real and imaginary parts of a, of x
//   and of y, in that order, each read with the accessor for its own type
//   character.
// - Which *scal2ris macro is used, by type combination:
//     real y:    ro when a and x are both complex, rx otherwise;
//     complex y: rx when a and x are both real,
//                rc when a is complex and x is real,
//                cr when a is real and x is complex,
//                cx when a and x are both complex.
// - With BLIS_ENABLE_C99_COMPLEX the complex-y entries expand to the
//   assignment (y) = (a) * (x) instead.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sczscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a, x and y all have the same type.
#define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y )
#define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y )
#define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y )
#define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y )

#endif
// end bli_scal2s.h
// begin bli_scal2js.h

#ifndef BLIS_SCAL2JS_H
#define BLIS_SCAL2JS_H

// scal2js

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define 
bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) #define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) 
#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_dzcscal2js( a, x, y ) { (y) = (a) * 
conj(x); } #define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y ) #define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y ) #define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y ) #define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y ) #endif // end bli_scal2js.h // begin bli_set0s.h #ifndef BLIS_SET0S_H #define BLIS_SET0S_H #define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) ) #define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) ) #define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) ) #define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) ) #endif // end bli_set0s.h // begin bli_set1s.h #ifndef BLIS_SET1S_H #define BLIS_SET1S_H #define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) ) #define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) ) #define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) ) #define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) ) 
#endif // end bli_set1s.h // begin bli_seti0s.h #ifndef BLIS_SETI0S_H #define BLIS_SETI0S_H #define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) ) #define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) ) #define bli_cseti0s( a ) bli_csetis( 0.0F, (a) ) #define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) ) #endif // end bli_seti0s.h // begin bli_sqrt2s.h #ifndef BLIS_SQRT2S_H #define BLIS_SQRT2S_H // sqrt2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) ) #define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) ) #define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) ) #define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) ) #define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) ) #define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) ) #define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) ) #define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) ) #define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), 
bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; } #define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; } #define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); } #define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); } #define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; } #define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; } #define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); } #define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); } #define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; } #define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; } #define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; } #define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; } #define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; } #define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; } #define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; } #define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a ) #define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a ) #define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a ) #define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a ) #endif // end bli_sqrt2s.h // begin bli_subs.h #ifndef BLIS_SUBS_H #define BLIS_SUBS_H // subs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. 
#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) { (y) -= (a); } #define bli_dcsubs( a, y ) { (y) -= (a); } #define bli_ccsubs( a, y ) { (y) -= (a); } #define bli_zcsubs( a, y ) { (y) -= (a); } #define bli_szsubs( a, y ) { (y) -= (a); } #define bli_dzsubs( a, y ) { (y) -= (a); } #define bli_czsubs( a, y ) { (y) -= (a); } #define bli_zzsubs( a, y ) { (y) -= (a); } #endif // 
BLIS_ENABLE_C99_COMPLEX #define bli_ssubs( a, y ) bli_sssubs( a, y ) #define bli_dsubs( a, y ) bli_ddsubs( a, y ) #define bli_csubs( a, y ) bli_ccsubs( a, y ) #define bli_zsubs( a, y ) bli_zzsubs( a, y ) #endif // end bli_subs.h // begin bli_subjs.h #ifndef BLIS_SUBJS_H #define BLIS_SUBJS_H // subjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubjs( a, y ) bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), 
bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) { (y) -= (a); } #define bli_dcsubjs( a, y ) { (y) -= (a); } #define bli_ccsubjs( a, y ) { (y) -= conjf(a); } #define bli_zcsubjs( a, y ) { (y) -= conj (a); } #define bli_szsubjs( a, y ) { (y) -= (a); } #define bli_dzsubjs( a, y ) { (y) -= (a); } #define bli_czsubjs( a, y ) { (y) -= conjf(a); } #define bli_zzsubjs( a, y ) { (y) -= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssubjs( a, y ) bli_sssubjs( a, y ) #define bli_dsubjs( a, y ) bli_ddsubjs( a, y ) #define bli_csubjs( a, y ) bli_ccsubjs( a, y ) #define bli_zsubjs( a, y ) bli_zzsubjs( a, y ) #endif // end bli_subjs.h // begin bli_swaps.h #ifndef BLIS_SWAPS_H #define BLIS_SWAPS_H // swaps // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_ssswaps( x, y ) \ { \ float w; \ bli_sscopys( (y), (w) ); \ bli_sscopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dsswaps( x, y ) \ { \ double w; \ bli_sdcopys( (y), (w) ); \ bli_dscopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_csswaps( x, y ) \ { \ scomplex w; \ bli_sccopys( (y), (w) ); \ bli_cscopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zsswaps( x, y ) \ { \ dcomplex w; \ bli_szcopys( (y), (w) ); \ bli_zscopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sdswaps( x, y ) \ { \ float w; \ bli_dscopys( (y), (w) ); \ bli_sdcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_ddswaps( x, y ) \ { \ double w; \ bli_ddcopys( (y), (w) ); \ bli_ddcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_cdswaps( x, y ) \ { \ scomplex w; \ bli_dccopys( (y), (w) ); \ bli_cdcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zdswaps( x, y ) \ { \ dcomplex w; \ bli_dzcopys( (y), (w) ); \ bli_zdcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_scswaps( x, y ) \ { \ float w; \ bli_cscopys( (y), (w) ); \ bli_sccopys( (x), (y) ); \ bli_sscopys( (w), 
(x) ); \ } #define bli_dcswaps( x, y ) \ { \ double w; \ bli_cdcopys( (y), (w) ); \ bli_dccopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_ccswaps( x, y ) \ { \ scomplex w; \ bli_cccopys( (y), (w) ); \ bli_cccopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zcswaps( x, y ) \ { \ dcomplex w; \ bli_czcopys( (y), (w) ); \ bli_zccopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_szswaps( x, y ) \ { \ float w; \ bli_zscopys( (y), (w) ); \ bli_szcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dzswaps( x, y ) \ { \ double w; \ bli_zdcopys( (y), (w) ); \ bli_dzcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_czswaps( x, y ) \ { \ scomplex w; \ bli_zccopys( (y), (w) ); \ bli_czcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zzswaps( x, y ) \ { \ dcomplex w; \ bli_zzcopys( (y), (w) ); \ bli_zzcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sswaps( x, y ) bli_ssswaps( x, y ) #define bli_dswaps( x, y ) bli_ddswaps( x, y ) #define bli_cswaps( x, y ) bli_ccswaps( x, y ) #define bli_zswaps( x, y ) bli_zzswaps( x, y ) #endif // end bli_swaps.h // begin bli_xpbys.h #ifndef BLIS_XPBYS_H #define BLIS_XPBYS_H // xpbys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// xpbys with a real output y: every (x, b) type combination below forwards the
// real/imaginary parts of x, b and y to the same helper, bli_rxxpbyris, which
// is defined elsewhere in this header.
// NOTE(review): the name suggests y := x + b * y -- confirm against the
// bli_rxxpbyris definition.

// -- (xby) = (??s) ------------------------------------------------------------

#define bli_sssxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dssxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cssxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zssxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )

#define bli_sdsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )

#define bli_scsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )

#define bli_szsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )

#define bli_sddxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dddxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cddxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zddxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_scdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_szdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
// (Remaining entries of the (??d) group: y is a double-precision real.)
#define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex support, each macro below splits x, b and y into their
// real/imaginary components and forwards them to one of three component-wise
// helpers, chosen by the types of x and b:
//   - real x,    real b    -> bli_rxxpbyris
//   - complex x, real b    -> bli_crxpbyris
//   - any x,     complex b -> bli_cxxpbyris
// NOTE(review): the real-x/real-b variants forward to bli_rxxpbyris even
// though y is complex here. bli_rxxpbyris is defined outside this section;
// confirm that it also scales the imaginary part of y, as the C99 branch's
// "(y) = (x) + (b) * (y)" does.

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zccxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zszxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex support, every complex-y variant is the same expression,
// y = x + b * y, written directly on the operands. Each macro expands to a
// brace-enclosed statement and evaluates y twice.

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter shorthands for the case where x, b and y share one type.
#define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y )
#define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y )
#define bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y )
#define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y )

#endif
// end bli_xpbys.h
// begin bli_xpbyjs.h

#ifndef BLIS_XPBYJS_H
#define BLIS_XPBYJS_H

// xpbyjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; 
++ii ) bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); 
} else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( 
dim_t jj = 0; jj < n; ++jj ) bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } 
else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( 
dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end bli_copys_mxn.h // begin bli_scal2s_mxn.h #ifndef BLIS_SCAL2S_MXN_H #define BLIS_SCAL2S_MXN_H // scal2s_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \ ctype* restrict y, const inc_t rs_y, const inc_t cs_y \ ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( scal2s_mxn ) #endif // end bli_scal2s_mxn.h // begin bli_xpbys_mxn.h #ifndef BLIS_XPBYS_MXN_H #define BLIS_XPBYS_MXN_H // xpbys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (?ss) ------------------------------------------------------------ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?dd) ------------------------------------------------------------ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?cc) ------------------------------------------------------------ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?zz) ------------------------------------------------------------ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } #endif // end bli_xpbys_mxn.h // begin bli_xpbys_mxn_uplo.h #ifndef BLIS_XPBYS_MXN_UPLO_H #define BLIS_XPBYS_MXN_UPLO_H // xpbys_mxn_u /* The _u macros walk every (_i,_j) of an m x n tile (columns outer, rows inner) and touch only the elements with _j - _i >= diagoff. When *beta tests as zero via bli_?eq0, the x element is handed to bli_??copys with the y element as destination; otherwise x, *beta and y are handed to bli_???xpbys (presumably y := x + beta*y; that macro is defined elsewhere, confirm). x, y and beta must be pointers; the strides are applied as _i*rs + _j*cs. */ #define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 
0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } // xpbys_mxn_l /* The _l macros mirror the _u macros; the only difference is the element test, which is _j - _i <= diagoff. */ #define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } 
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } /* The one-letter names below forward their arguments unchanged to the matching sss/ddd/ccc/zzz macro. */ #define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_l( diagoff, m, n, 
x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #endif // end bli_xpbys_mxn_uplo.h // -- "broadcast B" scalar macros -- // begin bli_bcastbbs_mxn.h #ifndef BLIS_BCASTBBS_MXN_H #define BLIS_BCASTBBS_MXN_H // bcastbbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = ldy; \ const dim_t ds_y = 1; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yi = y + i*incy; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yij = yi + j*ldy; \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( bcastbbs_mxn ) #endif // end bli_bcastbbs_mxn.h // begin bli_scal2bbs_mxn.h #ifndef BLIS_SCAL2BBS_MXN_H #define BLIS_SCAL2BBS_MXN_H // scal2bbs_mxn #undef GENTFUNCRO #define GENTFUNCRO( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( 
*yij, *yijd ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } \ } INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ const inc_t incx2 = 2 * incx; \ const inc_t ldx2 = 2 * ldx; \ \ const inc_t incy2 = 2 * incy; \ const inc_t ldy2 = 2 * ldy; \ \ ctype_r* restrict alpha_r = ( ctype_r* )alpha; \ ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \ ctype_r* restrict chi_r = ( ctype_r* )x; \ ctype_r* restrict chi_i = ( ctype_r* )x + 1; \ ctype_r* restrict psi_r = ( ctype_r* )y; \ ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ 
PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ } INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn ) #endif // end bli_scal2bbs_mxn.h // begin bli_set0bbs_mxn.h #ifndef BLIS_SET0BBS_MXN_H #define BLIS_SET0BBS_MXN_H // set0bbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yij = yj + i*incy; \ \ for ( dim_t p = 0; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,set0s)( *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( set0bbs_mxn ) #endif // end bli_set0bbs_mxn.h // -- 1m-specific scalar macros -- // 1e // begin bli_copy1es.h #ifndef BLIS_COPY1ES_H #define BLIS_COPY1ES_H // copy1es // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
/* copy1es family: bli_<x><y>copy1es( a, bri, bir ). When both type chars are complex (c or z), the macro passes ( real(a), imag(a) ) to bli_<x><y>copyris with the real/imag parts of bri as targets, then passes ( -imag(a), real(a) ) with the real/imag parts of bir as targets. Every variant in which either type char is real (s or d) expands to an empty block. bli_ccopy1es and bli_zcopy1es forward to the cc and zz variants. */
#define bli_sscopy1es( a, bri, bir ) {} #define bli_dscopy1es( a, bri, bir ) {} #define bli_cscopy1es( a, bri, bir ) {} #define bli_zscopy1es( a, bri, bir ) {} #define bli_sdcopy1es( a, bri, bir ) {} #define bli_ddcopy1es( a, bri, bir ) {} #define bli_cdcopy1es( a, bri, bir ) {} #define bli_zdcopy1es( a, bri, bir ) {} #define bli_sccopy1es( a, bri, bir ) {} #define bli_dccopy1es( a, bri, bir ) {} #define bli_cccopy1es( a, bri, bir ) \ { \ bli_cccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_zccopy1es( a, bri, bir ) \ { \ bli_zccopyris( bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_szcopy1es( a, bri, bir ) {} #define bli_dzcopy1es( a, bri, bir ) {} #define bli_czcopy1es( a, bri, bir ) \ { \ bli_czcopyris( bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_zzcopy1es( a, bri, bir ) \ { \ bli_zzcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir ) #define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir ) #endif // end bli_copy1es.h // begin bli_copyj1es.h #ifndef BLIS_COPYJ1ES_H #define BLIS_COPYJ1ES_H // copyj1es // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
/* copyj1es family: same layout as copy1es but with the imaginary part of a negated at the source. The complex/complex variants pass ( real(a), -imag(a) ) to bli_<x><y>copyris targeting bri, then ( imag(a), real(a) ) targeting bir; variants with a real type char (s or d) expand to an empty block. The invert1es macros that follow first call bli_?invertris on the real/imag parts of bri, then call bli_?copyris with sources ( real(bri), -imag(bri) ) and targets listed imag-first: ( imag(bir), real(bir) ). NOTE(review): assuming copyris stores its 1st/2nd arguments into its 3rd/4th, this leaves bir = ( -imag(bri), real(bri) ) -- confirm against the copyris definition. */
#define bli_sscopyj1es( a, bri, bir ) {} #define bli_dscopyj1es( a, bri, bir ) {} #define bli_cscopyj1es( a, bri, bir ) {} #define bli_zscopyj1es( a, bri, bir ) {} #define bli_sdcopyj1es( a, bri, bir ) {} #define bli_ddcopyj1es( a, bri, bir ) {} #define bli_cdcopyj1es( a, bri, bir ) {} #define bli_zdcopyj1es( a, bri, bir ) {} #define bli_sccopyj1es( a, bri, bir ) {} #define bli_dccopyj1es( a, bri, bir ) {} #define bli_cccopyj1es( a, bri, bir ) \ { \ bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_cccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_zccopyj1es( a, bri, bir ) \ { \ bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_zccopyris( bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_szcopyj1es( a, bri, bir ) {} #define bli_dzcopyj1es( a, bri, bir ) {} #define bli_czcopyj1es( a, bri, bir ) \ { \ bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_czcopyris( bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_zzcopyj1es( a, bri, bir ) \ { \ bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_zzcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir ) #define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir ) #endif // end bli_copyj1es.h // begin bli_invert1es.h #ifndef BLIS_INVERT1ES_H #define BLIS_INVERT1ES_H // invert1es #define bli_cinvert1es( bri, bir ) \ { \ bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \ bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \ } #define bli_zinvert1es( bri, bir ) \ { \ bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \ bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \ } #endif // end bli_invert1es.h // begin bli_scal1es.h #ifndef BLIS_SCAL1ES_H 
#define BLIS_SCAL1ES_H // scal1es #define bli_cscal1es( a, yri, yir ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \ bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \ } #define bli_zscal1es( a, yri, yir ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \ bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \ } #endif // end bli_scal1es.h // begin bli_scal21es.h #ifndef BLIS_SCAL21ES_H #define BLIS_SCAL21ES_H // scal21es // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal21es( a, x, yri, yir ) {} #define bli_sdsscal21es( a, x, yri, yir ) {} #define bli_scsscal21es( a, x, yri, yir ) {} #define bli_szsscal21es( a, x, yri, yir ) {} #define bli_dssscal21es( a, x, yri, yir ) {} #define bli_ddsscal21es( a, x, yri, yir ) {} #define bli_dcsscal21es( a, x, yri, yir ) {} #define bli_dzsscal21es( a, x, yri, yir ) {} #define bli_cssscal21es( a, x, yri, yir ) {} #define bli_cdsscal21es( a, x, yri, yir ) {} #define bli_ccsscal21es( a, x, yri, yir ) {} #define bli_czsscal21es( a, x, yri, yir ) {} #define bli_zssscal21es( a, x, yri, yir ) {} #define bli_zdsscal21es( a, x, yri, yir ) {} #define bli_zcsscal21es( a, x, yri, yir ) {} #define bli_zzsscal21es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal21es( a, x, yri, yir ) {} #define bli_sddscal21es( a, x, yri, yir ) {} #define bli_scdscal21es( a, x, yri, yir ) {} #define bli_szdscal21es( a, x, yri, yir ) {} #define bli_dsdscal21es( a, x, yri, yir ) {} #define bli_dddscal21es( a, x, yri, yir ) {} #define bli_dcdscal21es( a, x, yri, yir ) {} #define bli_dzdscal21es( a, x, yri, yir ) {} #define bli_csdscal21es( a, x, yri, yir ) {} #define 
bli_cddscal21es( a, x, yri, yir ) {} #define bli_ccdscal21es( a, x, yri, yir ) {} #define bli_czdscal21es( a, x, yri, yir ) {} #define bli_zsdscal21es( a, x, yri, yir ) {} #define bli_zddscal21es( a, x, yri, yir ) {} #define bli_zcdscal21es( a, x, yri, yir ) {} #define bli_zzdscal21es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal21es( a, x, yri, yir ) {} #define bli_sdcscal21es( a, x, yri, yir ) {} #define bli_sccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal21es( a, x, yri, yir ) {} #define bli_ddcscal21es( a, x, yri, yir ) {} #define bli_dccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), 
bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal21es( a, x, yri, yir ) {} #define bli_sdzscal21es( a, x, yri, yir ) {} #define bli_sczscal21es( a, x, yri, yir ) \ { \ 
bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal21es( a, x, yri, yir ) {} #define bli_ddzscal21es( a, x, yri, yir ) {} #define bli_dczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), 
bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir ) #define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir ) #endif // end bli_scal21es.h // begin bli_scal2j1es.h #ifndef BLIS_SCAL2J1ES_H #define BLIS_SCAL2J1ES_H // scal2j1es // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) #define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) #endif // end bli_scal2j1es.h // 1r // begin bli_copy1rs.h #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copy1rs.h // begin bli_copyj1rs.h #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copyj1rs.h // begin bli_invert1rs.h #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif // end bli_invert1rs.h // begin bli_scal1rs.h #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), 
bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif // end bli_scal1rs.h // begin bli_scal21rs.h #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif // end bli_scal21rs.h // begin bli_scal2j1rs.h #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif // end bli_scal2j1rs.h // 1m (1e or 1r) // begin bli_invert1ms_mxn_diag.h #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t 
i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_invert1ms_mxn_diag.h // begin bli_scal1ms_mxn.h #ifndef BLIS_SCAL1MS_MXN_H #define BLIS_SCAL1MS_MXN_H // scal1ms_mxn #define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; 
\ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #endif // end bli_scal1ms_mxn.h // begin bli_scal21ms_mxn.h #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } 
else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), 
*(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif // end bli_scal21ms_mxn.h // begin bli_scal21ms_mxn_diag.h #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_scal21ms_mxn_diag.h // begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING

// Thread communicators may be implementation dependent. Exactly one
// definition of struct thrcomm_s is selected by the BLIS_ENABLE_* /
// BLIS_TREE_BARRIER macros; this one is used when BLIS_ENABLE_MULTITHREADING
// is not defined.
#ifdef BLIS_TREE_BARRIER
// Node of a tree barrier: `dad` points to another barrier_s (presumably the
// parent node -- confirm against the barrier implementation).
struct barrier_s
{
	int arity;
	int count;
	struct barrier_s* dad;
	int signal;
};
typedef struct barrier_s barrier_t;

// Communicator variant that holds an array of barrier_t pointers.
struct thrcomm_s
{
	void*  sent_object;
	dim_t  n_threads;
	barrier_t** barriers;
};
#else
// Communicator variant with a sense flag and an arrival counter instead of
// a barrier tree.
struct thrcomm_s
{
	void*  sent_object;
	dim_t  n_threads;

	// NOTE: barrier_sense was originally a gint_t-based bool_t, but upon
	// redefining bool_t as bool we discovered that some gcc __atomic built-ins
	// don't allow the use of bool for the variables being operated upon.
	// (Specifically, this was observed of __atomic_fetch_xor(), but it likely
	// applies to all other related built-ins.) Thus, we get around this by
	// redefining barrier_sense as a gint_t.
	gint_t barrier_sense;
	dim_t  barrier_threads_arrived;
};
#endif
typedef struct thrcomm_s thrcomm_t;

#endif

#endif
// end bli_thrcomm_single.h
// begin bli_thrcomm_openmp.h
#ifndef BLIS_THRCOMM_OPENMP_H
#define BLIS_THRCOMM_OPENMP_H

// Define thrcomm_t for situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

// NOTE(review): no header name follows this directive in the text as seen
// here; it presumably named the OpenMP header before flattening -- confirm
// against the original file before building with BLIS_ENABLE_OPENMP.
#include // skipped

// Define thrcomm_t for tree barriers and non-tree barriers.
#ifdef BLIS_TREE_BARRIER
// Same layout as the single-threaded tree-barrier node, except that `signal`
// is declared volatile here.
struct barrier_s
{
	int arity;
	int count;
	struct barrier_s* dad;
	volatile int signal;
};
typedef struct barrier_s barrier_t;

struct thrcomm_s
{
	void*  sent_object;
	dim_t  n_threads;
	barrier_t** barriers;
};
#else
struct thrcomm_s
{
	void*  sent_object;
	dim_t  n_threads;

	// NOTE: barrier_sense was originally a gint_t-based bool_t, but upon
	// redefining bool_t as bool we discovered that some gcc __atomic built-ins
	// don't allow the use of bool for the variables being operated upon.
	// (Specifically, this was observed of __atomic_fetch_xor(), but it likely
	// applies to all other related built-ins.) Thus, we get around this by
	// redefining barrier_sense as a gint_t.
	//volatile gint_t barrier_sense;
	gint_t barrier_sense;
	dim_t  barrier_threads_arrived;
};
#endif

typedef struct thrcomm_s thrcomm_t;

// Prototypes specific to tree barriers.
#ifdef BLIS_TREE_BARRIER
barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index );
void        bli_thrcomm_tree_barrier_free( barrier_t* barrier );
void        bli_thrcomm_tree_barrier( barrier_t* barack );
#endif

#endif

#endif
// end bli_thrcomm_openmp.h
// begin bli_thrcomm_pthreads.h
#ifndef BLIS_THRCOMM_PTHREADS_H
#define BLIS_THRCOMM_PTHREADS_H

// Define thrcomm_t for situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

#ifdef BLIS_USE_PTHREAD_BARRIER
// Communicator variant that embeds a bli_pthread_barrier_t.
struct thrcomm_s
{
	void*  sent_object;
	dim_t  n_threads;

	bli_pthread_barrier_t barrier;
};
#else
struct thrcomm_s
{
	void*  sent_object;
	dim_t  n_threads;

	// NOTE: barrier_sense was originally a gint_t-based bool_t, but upon
	// redefining bool_t as bool we discovered that some gcc __atomic built-ins
	// don't allow the use of bool for the variables being operated upon.
	// (Specifically, this was observed of __atomic_fetch_xor(), but it likely
	// applies to all other related built-ins.) Thus, we get around this by
	// redefining barrier_sense as a gint_t.
	//volatile gint_t barrier_sense;
	gint_t barrier_sense;
	dim_t  barrier_threads_arrived;
};
#endif

typedef struct thrcomm_s thrcomm_t;

#endif

#endif
// end bli_thrcomm_pthreads.h

// thrcomm_t query (field only)

// Return the n_threads field of the communicator. `comm` is dereferenced
// without a NULL check.
BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm )
{
	return comm->n_threads;
}

// Thread communicator prototypes.
// Creation/destruction and initialization/cleanup of thrcomm_t objects.
// (Implementations are not visible here; the create/free pair takes an
// rntm_t*, the init/cleanup pair operates on a caller-provided thrcomm_t.)
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads );
void       bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
void       bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm );
void       bli_thrcomm_cleanup( thrcomm_t* comm );

// Barrier and broadcast over a communicator; these two are the only
// thrcomm functions in this group marked BLIS_EXPORT_BLIS.
BLIS_EXPORT_BLIS void  bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );

void       bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );

#endif
// end bli_thrcomm.h

// Include thread info (thrinfo_t) object definitions and prototypes.
// begin bli_thrinfo.h
#ifndef BLIS_THRINFO_H
#define BLIS_THRINFO_H

// Thread info structure definition. Nodes link to further thrinfo_s nodes
// through sub_prenode and sub_node.
struct thrinfo_s
{
	// The thread communicator for the other threads sharing the same work
	// at this level.
	thrcomm_t*         ocomm;

	// Our thread id within the ocomm thread communicator.
	dim_t              ocomm_id;

	// The number of distinct threads used to parallelize the loop.
	dim_t              n_way;

	// What we're working on.
	dim_t              work_id;

	// When freeing, should the communicators in this node be freed? Usually,
	// this field is true, but when nodes are created that share the same
	// communicators as other nodes (such as with packm nodes), this is set
	// to false.
	bool               free_comm;

	// The bszid_t to help identify the node. This is mostly only useful when
	// debugging or tracing the allocation and release of thrinfo_t nodes.
	bszid_t            bszid;

	// Links to other thrinfo_s nodes.
	struct thrinfo_s*  sub_prenode;
	struct thrinfo_s*  sub_node;
};
typedef struct thrinfo_s thrinfo_t;

//
// thrinfo_t functions
// NOTE: The naming of these should be made consistent at some point.
// (ie: bli_thrinfo_ vs.
bli_thread_) // // thrinfo_t query (field only) BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) { return (t->ocomm)->n_threads; } BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) { return t->ocomm_id; } BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) { return t->n_way; } BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t ) { return t->work_id; } BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) { return t->ocomm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) { return t->free_comm; } BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) { return t->bszid; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) { return t->sub_node; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) { return t->ocomm_id == 0; } // thrinfo_t modification BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) { t->ocomm = ocomm; } BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) { t->ocomm_id = ocomm_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) { t->n_way = n_way; } BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t ) { t->work_id = work_id; } BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) { t->free_comm = free_comm; } BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) { t->bszid = bszid; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) { t->sub_node = sub_node; } BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) { t->sub_prenode = sub_prenode; } // other thrinfo_t-related functions BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } // // Prototypes 
for level-3 thrinfo functions not specific to any operation.
//

// Create a thrinfo_t. The parameters after rntm carry the same names as the
// fields of struct thrinfo_s (ocomm, ocomm_id, n_way, work_id, free_comm,
// bszid, sub_node); presumably each initializes the like-named field --
// confirm against the implementation.
thrinfo_t* bli_thrinfo_create
     (
       rntm_t*    rntm,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

// Same parameter list as bli_thrinfo_create(), but takes the thrinfo_t to
// fill in (`thread`) instead of an rntm_t and returns nothing.
void bli_thrinfo_init
     (
       thrinfo_t* thread,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

// Initialize `thread` without any caller-supplied field values.
void bli_thrinfo_init_single
     (
       thrinfo_t* thread
     );

// Release `thread`; takes the rntm_t as bli_thrinfo_create() does.
void bli_thrinfo_free
     (
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// -----------------------------------------------------------------------------

// The functions below take control-tree nodes (cntl_t) alongside thrinfo_t
// nodes. The *_prenode variants have the same signatures as their
// non-prenode counterparts.

void bli_thrinfo_grow
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_rgrow
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_rgrow_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

// -----------------------------------------------------------------------------

// Disabled prototypes (compiled out by the #if 0).
#if 0
void bli_thrinfo_grow_tree
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

void bli_thrinfo_grow_tree_ic
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );
#endif

#endif
// end bli_thrinfo.h
// begin bli_thrinfo_sup.h
#ifndef BLIS_THRINFO_SUP_H
#define BLIS_THRINFO_SUP_H

//
// Prototypes for level-3 thrinfo sup functions.
//
// These take bszid_t pointers where the functions of the same base name
// without "_sup" take cntl_t pointers.
//

void bli_thrinfo_sup_grow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_sup_rgrow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_sup_create_for_cntl
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_chl,
       thrinfo_t* thread_par
     );

#endif
// end bli_thrinfo_sup.h

// Include some operation-specific thrinfo_t prototypes.
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
void bli_l3_thread_decorator ( l3int_t func, opid_t family, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // Include definitions specific to the method of multithreading for the // conventional code path. // begin bli_l3_decor_single.h #ifndef BLIS_L3_DECOR_SINGLE_H #define BLIS_L3_DECOR_SINGLE_H // Definitions specific to situations when multithreading is disabled. #ifndef BLIS_ENABLE_MULTITHREADING #endif #endif // end bli_l3_decor_single.h // begin bli_l3_decor_openmp.h #ifndef BLIS_L3_DECOR_OPENMP_H #define BLIS_L3_DECOR_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP void bli_l3_thread_decorator_thread_check ( dim_t n_threads, dim_t tid, thrcomm_t* gl_comm, rntm_t* rntm ); #endif #endif // end bli_l3_decor_openmp.h // begin bli_l3_decor_pthreads.h #ifndef BLIS_L3_DECOR_PTHREADS_H #define BLIS_L3_DECOR_PTHREADS_H // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. void* bli_l3_thread_entry( void* data_void ); #endif #endif // end bli_l3_decor_pthreads.h #endif // end bli_l3_decor.h // Include the level-3 thread decorator and related definitions and prototypes // for the sup code path. // begin bli_l3_sup_decor.h #ifndef BLIS_L3_SUP_DECOR_H #define BLIS_L3_SUP_DECOR_H // -- sup definitions ---------------------------------------------------------- // Level-3 sup internal function type. typedef err_t (*l3supint_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // Level-3 sup thread decorator prototype. err_t bli_l3_sup_thread_decorator ( l3supint_t func, opid_t family, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // Include definitions specific to the method of multithreading for the // sup code path. 
// begin bli_l3_sup_decor_single.h #ifndef BLIS_L3_SUP_DECOR_SINGLE_H #define BLIS_L3_SUP_DECOR_SINGLE_H // Definitions specific to situations when multithreading is disabled. #ifndef BLIS_ENABLE_MULTITHREADING #endif #endif // end bli_l3_sup_decor_single.h // begin bli_l3_sup_decor_openmp.h #ifndef BLIS_L3_SUP_DECOR_OPENMP_H #define BLIS_L3_SUP_DECOR_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #endif #endif // end bli_l3_sup_decor_openmp.h // begin bli_l3_sup_decor_pthreads.h #ifndef BLIS_L3_SUP_DECOR_PTHREADS_H #define BLIS_L3_SUP_DECOR_PTHREADS_H // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. void* bli_l3_sup_thread_entry( void* data_void ); #endif #endif // end bli_l3_sup_decor_pthreads.h #endif // end bli_l3_sup_decor.h // Initialization-related prototypes. void bli_thread_init( void ); void bli_thread_finalize( void ); // Thread range-related prototypes. 
BLIS_EXPORT_BLIS void bli_thread_range_sub ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end ); #undef GENPROT #define GENPROT( opname ) \ \ siz_t PASTEMAC0( opname ) \ ( \ dir_t direct, \ thrinfo_t* thr, \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntl_t* cntl, \ cntx_t* cntx, \ dim_t* start, \ dim_t* end \ ); GENPROT( thread_range_mdim ) GENPROT( thread_range_ndim ) #undef GENPROT #define GENPROT( opname ) \ \ siz_t PASTEMAC0( opname ) \ ( \ thrinfo_t* thr, \ obj_t* a, \ blksz_t* bmult, \ dim_t* start, \ dim_t* end \ ); GENPROT( thread_range_l2r ) GENPROT( thread_range_r2l ) GENPROT( thread_range_t2b ) GENPROT( thread_range_b2t ) GENPROT( thread_range_weighted_l2r ) GENPROT( thread_range_weighted_r2l ) GENPROT( thread_range_weighted_t2b ) GENPROT( thread_range_weighted_b2t ) dim_t bli_thread_range_width_l ( doff_t diagoff_j, dim_t m, dim_t n_j, dim_t j, dim_t n_way, dim_t bf, dim_t bf_left, double area_per_thr, bool handle_edge_low ); siz_t bli_find_area_trap_l ( dim_t m, dim_t n, doff_t diagoff ); siz_t bli_thread_range_weighted_sub ( thrinfo_t* restrict thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* restrict j_start_thr, dim_t* restrict j_end_thr ); // ----------------------------------------------------------------------------- // Factorization and partitioning prototypes typedef struct { dim_t n; dim_t sqrt_n; dim_t f; } bli_prime_factors_t; void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors); dim_t bli_next_prime_factor(bli_prime_factors_t* factors); bool bli_is_prime( dim_t n ); void bli_thread_partition_2x2 ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_slow ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_fast ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); // 
----------------------------------------------------------------------------- dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); // ----------------------------------------------------------------------------- BLIS_INLINE void bli_thread_range_jrir_rr ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; } BLIS_INLINE void bli_thread_range_jrir_sl ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use contiguous slab partitioning of jr/ir loops. bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); *inc = 1; } BLIS_INLINE void bli_thread_range_jrir ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Define a general-purpose version of bli_thread_range_jrir() whose // definition depends on whether slab or round-robin partitioning was // requested at configure-time. 
#ifdef BLIS_ENABLE_JRIR_SLAB bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); #else bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); #endif } #if 0 BLIS_INLINE void bli_thread_range_weighted_jrir ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { #ifdef BLIS_ENABLE_JRIR_SLAB // Use contiguous slab partitioning for jr/ir loops. bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, handle_edge_low, start, end ); *start = *start / bf; *inc = 1; if ( *end % bf ) *end = *end / bf + 1; else *end = *end / bf; #else // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; #endif } #endif #endif // end bli_thread.h // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Constant definitions -- // begin bli_extern_defs.h #ifndef BLIS_EXTERN_DEFS_H #define BLIS_EXTERN_DEFS_H BLIS_EXPORT_BLIS extern obj_t BLIS_TWO; BLIS_EXPORT_BLIS extern obj_t BLIS_ONE; //BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO; //BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO; BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif // end bli_extern_defs.h // -- BLIS architecture/kernel definitions -- // begin bli_l1v_ker_prot.h // // Define template prototypes for level-1v kernels. 
//

// Each macro below expands to one function prototype. The function name is
// produced by PASTEMAC(ch,opname) (defined elsewhere; presumably it pastes
// the type character and operation name into the kernel symbol), and ctype
// is the element type of the vector operands.
//
// Every definition ends at its closing ");" with no trailing backslash, so
// a macro can never absorb the line that follows it.

#define ADDV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define AMAXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t* restrict cntx \
     );

#define AXPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define AXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define COPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define DOTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx \
     );

#define DOTXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx \
     );

#define INVERTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx \
     );

#define SCALV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx \
     );

#define SCAL2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define SETV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx \
     );

#define SUBV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define SWAPV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define XPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

// end bli_l1v_ker_prot.h
// begin bli_l1f_ker_prot.h

//
// Define template prototypes for level-1f kernels.
//

// Each macro below expands to one function prototype whose name is produced
// by PASTEMAC(ch,opname); ctype is the element type of the operands.

#define AXPY2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alphax, \
       ctype*  restrict alphay, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict z, inc_t incz, \
       cntx_t* restrict cntx \
     );

#define AXPYF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conja, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define DOTAXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjxt, \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       ctype*  restrict z, inc_t incz, \
       cntx_t* restrict cntx \
     );

#define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjat, \
       conj_t           conja, \
       conj_t           conjw, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict w, inc_t incw, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict z, inc_t incz, \
       cntx_t* restrict cntx \
     );

#define DOTXF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjat, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

// end bli_l1f_ker_prot.h
// begin bli_l1m_ker_prot.h

//
// Define template prototypes for level-1m kernels.
//

// native packm kernels

#define PACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx \
     );

// native unpackm kernels

#define UNPACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       dim_t            n, \
       ctype*  restrict kappa, \
       ctype*  restrict p,             inc_t ldp, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       cntx_t* restrict cntx \
     );

// 1e/1r packm kernels
// (The parameter list is the same as that of PACKM_KER_PROT.)

#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx \
     );

// end bli_l1m_ker_prot.h
// begin bli_l3_ukr_prot.h

//
// Define template prototypes for level-3 micro-kernels.
//

// GEMM_UKR_PROT is the single-type form; it forwards to GEMM_UKR_PROT2 with
// the same type used for both the input (a, b) and output (alpha, beta, c)
// operands.
#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)

#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype_out* restrict alpha, \
       ctype_in*  restrict a, \
       ctype_in*  restrict b, \
       ctype_out* restrict beta, \
       ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     );

#define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a1x, \
       ctype*     restrict a11, \
       ctype*     restrict bx1, \
       ctype*     restrict b11, \
       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     );

#define TRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     );

// end bli_l3_ukr_prot.h
//
begin bli_l3_sup_ker_prot.h // // Define template prototypes for level-3 kernels on small/unpacked matrices. // #define GEMMSUP_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); // end bli_l3_sup_ker_prot.h // begin bli_arch_config_pre.h #ifndef BLIS_ARCH_CONFIG_PRE_H #define BLIS_ARCH_CONFIG_PRE_H // -- Naming-related kernel definitions ---------------------------------------- // The default suffix appended to reference kernels. #define BLIS_REF_SUFFIX _ref // A suffix used for labeling certain induced method aware functions. #define BLIS_IND_SUFFIX _ind // Add an underscore to the BLIS kernel set string, if it was defined. #ifdef BLIS_CNAME #define BLIS_CNAME_INFIX PASTECH(_,BLIS_CNAME) #endif // Combine the CNAME and _ref for convenience to the code that defines // reference kernels. //#define BLIS_CNAME_REF_SUFFIX PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX) // -- Prototype-generating macro definitions ----------------------------------- // Prototype-generating macro for bli_cntx_init_*() functions. 
#define CNTX_INIT_PROTS( archname ) \ \ void PASTEMAC(cntx_init_,archname) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \ ( \ ind_t method, \ cntx_t* cntx \ ); #endif // end bli_arch_config_pre.h // begin bli_arch_config.h #ifndef BLIS_ARCH_CONFIG_H #define BLIS_ARCH_CONFIG_H // // -- Context initialization prototypes ---------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_CONFIG_SKX CNTX_INIT_PROTS( skx ) #endif #ifdef BLIS_CONFIG_KNL CNTX_INIT_PROTS( knl ) #endif #ifdef BLIS_CONFIG_KNC CNTX_INIT_PROTS( knc ) #endif #ifdef BLIS_CONFIG_HASWELL CNTX_INIT_PROTS( haswell ) #endif #ifdef BLIS_CONFIG_SANDYBRIDGE CNTX_INIT_PROTS( sandybridge ) #endif #ifdef BLIS_CONFIG_PENRYN CNTX_INIT_PROTS( penryn ) #endif // -- AMD64 architectures -- #ifdef BLIS_CONFIG_ZEN3 CNTX_INIT_PROTS( zen3 ) #endif #ifdef BLIS_CONFIG_ZEN2 CNTX_INIT_PROTS( zen2 ) #endif #ifdef BLIS_CONFIG_ZEN CNTX_INIT_PROTS( zen ) #endif #ifdef BLIS_CONFIG_EXCAVATOR CNTX_INIT_PROTS( excavator ) #endif #ifdef BLIS_CONFIG_STEAMROLLER CNTX_INIT_PROTS( steamroller ) #endif #ifdef BLIS_CONFIG_PILEDRIVER CNTX_INIT_PROTS( piledriver ) #endif #ifdef BLIS_CONFIG_BULLDOZER CNTX_INIT_PROTS( bulldozer ) #endif // -- ARM architectures -- #ifdef BLIS_CONFIG_ARMSVE CNTX_INIT_PROTS( armsve ) #endif #ifdef BLIS_CONFIG_A64FX CNTX_INIT_PROTS( a64fx ) #endif #ifdef BLIS_CONFIG_FIRESTORM CNTX_INIT_PROTS( firestorm ) #endif #ifdef BLIS_CONFIG_THUNDERX2 CNTX_INIT_PROTS( thunderx2 ) #endif #ifdef BLIS_CONFIG_CORTEXA57 CNTX_INIT_PROTS( cortexa57 ) #endif #ifdef BLIS_CONFIG_CORTEXA53 CNTX_INIT_PROTS( cortexa53 ) #endif #ifdef BLIS_CONFIG_CORTEXA15 CNTX_INIT_PROTS( cortexa15 ) #endif #ifdef BLIS_CONFIG_CORTEXA9 CNTX_INIT_PROTS( cortexa9 ) #endif // -- IBM Power -- #ifdef BLIS_CONFIG_POWER10 CNTX_INIT_PROTS( power10 ) #endif #ifdef BLIS_CONFIG_POWER9 CNTX_INIT_PROTS( power9 ) #endif #ifdef 
BLIS_CONFIG_POWER7 CNTX_INIT_PROTS( power7 ) #endif // -- IBM BG/Q -- #ifdef BLIS_CONFIG_BGQ CNTX_INIT_PROTS( bgq ) #endif // -- Generic -- #ifdef BLIS_CONFIG_GENERIC CNTX_INIT_PROTS( generic ) #endif // // -- Architecture family-specific headers ------------------------------------- // // -- x86_64 families -- #ifdef BLIS_FAMILY_INTEL64 #include "bli_family_intel64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64 #include "bli_family_amd64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64_LEGACY #include "bli_family_amd64_legacy.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64 // begin bli_family_x86_64.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_x86_64.h #endif #ifdef BLIS_FAMILY_X86_64_NO_SKX #include "bli_family_x86_64_no_skx.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN2 #include "bli_family_x86_64_no_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN3 #include "bli_family_x86_64_no_zen3.h" // skipped #endif // -- Intel64 architectures -- #ifdef BLIS_FAMILY_SKX // begin bli_family_skx.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- THREADING PARAMETERS ----------------------------------------------------- #define BLIS_THREAD_RATIO_M 3 #define BLIS_THREAD_RATIO_N 2 #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 4 // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 64 #define BLIS_SIMD_MAX_SIZE 64 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 //#include //#define BLIS_MALLOC_POOL malloc //#define BLIS_FREE_POOL free #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- // -- Cache and register blocksizes -- // // Constraints: // // (1) MC must be a multiple of: // (a) MR (for zero-padding purposes) // (b) NR (for zero-padding purposes when MR and NR are "swapped") // (2) NC must be a multiple of // (a) NR (for zero-padding purposes) // (b) MR (for zero-padding purposes when MR and NR are "swapped") // #define 
BLIS_DGEMM_UKERNEL bli_dgemm_opt_16x12_l2 #define BLIS_DEFAULT_MC_D 144 #define BLIS_DEFAULT_KC_D 336 #define BLIS_DEFAULT_NC_D 5760 #define BLIS_DEFAULT_MR_D 16 #define BLIS_DEFAULT_NR_D 12 #define BLIS_PACKDIM_MR_D 16 #define BLIS_PACKDIM_NR_D 12 // NOTE: If the micro-kernel, which is typically unrolled to a factor // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. //#define BLIS_DEFAULT_KR_S 1 //#define BLIS_DEFAULT_KR_D 1 //#define BLIS_DEFAULT_KR_C 1 //#define BLIS_DEFAULT_KR_Z 1 // -- Maximum cache blocksizes (for optimizing edge cases) -- // NOTE: These cache blocksize "extensions" have the same constraints as // the corresponding default blocksizes above. When these values are // larger than the default blocksizes, blocksizes used at edge cases are // enlarged if such an extension would encompass the remaining portion of // the matrix dimension. #define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) #define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) #define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0) #define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) #define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) #define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0) //#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4) //#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4) //#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4) //#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4) //#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4) //#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4) #endif //#endif // end bli_family_skx.h #endif #ifdef BLIS_FAMILY_KNL // begin bli_family_knl.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- THREADING PARAMETERS ----------------------------------------------------- #define BLIS_THREAD_RATIO_M 4 
#define BLIS_THREAD_RATIO_N 1 #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 // -- MEMORY ALLOCATION -------------------------------------------------------- //#define BLIS_TREE_BARRIER //#define BLIS_TREE_BARRIER_ARITY 4 #define BLIS_SIMD_ALIGN_SIZE 64 #define BLIS_SIMD_MAX_SIZE 64 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 //#define BLIS_MALLOC_INTL hbw_malloc //#define BLIS_FREE_INTL hbw_free #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_SGEMM_UKERNEL bli_sgemm_opt_30x16_knc #define BLIS_DEFAULT_MC_S 240 #define BLIS_DEFAULT_KC_S 240 #define BLIS_DEFAULT_NC_S 14400 #define BLIS_DEFAULT_MR_S 30 #define BLIS_DEFAULT_NR_S 16 #define BLIS_PACKDIM_MR_S 32 #define BLIS_PACKDIM_NR_S 16 #if 0 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8_knc #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 240 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DEFAULT_MR_D 30 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 32 #define BLIS_PACKDIM_NR_D 8 #elif 0 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8 #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 240 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DEFAULT_MR_D 30 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 32 #define BLIS_PACKDIM_NR_D 8 #define BLIS_DPACKM_8XK_KERNEL bli_dpackm_8xk_opt #define BLIS_DPACKM_30XK_KERNEL bli_dpackm_30xk_opt #else #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_24x8 #define BLIS_DEFAULT_MR_D 24 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 24 #define BLIS_PACKDIM_NR_D 8 #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 336 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DPACKM_8XK_KERNEL bli_dpackm_8xk_opt #define BLIS_DPACKM_24XK_KERNEL bli_dpackm_24xk_opt #endif #define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) 
#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) #define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0) #define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) #define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) #define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0) #endif //#endif // end bli_family_knl.h #endif #ifdef BLIS_FAMILY_KNC #include "bli_family_knc.h" // skipped #endif #ifdef BLIS_FAMILY_HASWELL // begin bli_family_haswell.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- // -- sgemm micro-kernel -- #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24 #define BLIS_DEFAULT_MC_S 256 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 24 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 6 #define BLIS_DEFAULT_NR_S 16 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 6 #endif // -- dgemm micro-kernel -- #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12 #define BLIS_DEFAULT_MC_D 152 #define BLIS_DEFAULT_KC_D 160 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 12 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 
#define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 6 #endif // -- cgemm micro-kernel -- #if 1 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 3 #define BLIS_DEFAULT_NR_C 8 #define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x3 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 3 #endif // -- zgemm micro-kernel -- #if 1 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 3 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x3 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 3 #endif #endif //#endif // end bli_family_haswell.h #endif #ifdef BLIS_FAMILY_SANDYBRIDGE // begin bli_family_sandybridge.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x4 #define BLIS_DEFAULT_MC_D 96 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 4 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define 
BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_sandybridge.h #endif #ifdef BLIS_FAMILY_PENRYN // begin bli_family_penryn.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x4 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MC_S 768 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x4 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 4 #define BLIS_DEFAULT_MC_D 384 #define BLIS_DEFAULT_KC_D 384 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_asm_4x4 #define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_asm_4x4 // -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPY2V_KERNEL bli_daxpy2v_int_var1 #define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_int_var1 #define BLIS_DAXPYF_KERNEL bli_daxpyf_int_var1 #define BLIS_DDOTXF_KERNEL bli_ddotxf_int_var1 #define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_int_var1 // -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 #define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 #endif //#endif // end bli_family_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_FAMILY_ZEN3 // begin bli_family_zen3.h #ifndef BLI_FAMILY_ZEN3_ #define BLI_FAMILY_ZEN3_ // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops // to be not parallelized. 
// #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 // To enable framework optimizations for zen3 platform // All zen3 specific code should be included in this macro #define BLIS_CONFIG_ZEN3 // To enable framework optimizations for zen3 platform // All zen3 specific code should be included in this macro #define BLIS_CONFIG_ZEN3 #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. #define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 #define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 #define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 #define BLIS_ENABLE_SMALL_MATRIX_ROME #define BLIS_SMALL_MATRIX_THRES_ROME 400 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50 #endif // end bli_family_zen3.h #endif #ifdef 
BLIS_FAMILY_ZEN2 // begin bli_family_zen2.h // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops // to be not parallelized. #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 // Vanilla BLIS disables AMD's small matrix handling by default. #if 0 #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. #define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 #define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 #define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 #define BLIS_ENABLE_SMALL_MATRIX_ROME #define BLIS_SMALL_MATRIX_THRES_ROME 400 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50 // When running HPL with pure MPI without DGEMM threading 
(Single-threaded // BLIS), defining this macro as 1 yields better performance. #define AOCL_BLIS_MULTIINSTANCE 0 #endif // end bli_family_zen2.h #endif #ifdef BLIS_FAMILY_ZEN // begin bli_family_zen.h // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops // to be not parallelized. #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 #define BLIS_ENABLE_ZEN_BLOCK_SIZES // Vanilla BLIS disables AMD's small matrix handling by default. #if 0 #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. #define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 #define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 #define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 //This macro will enable BLIS DGEMM to choose block sizes for a single instance mode #define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 0 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES 250 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_NAPLES 90 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22 #endif #if 0 // Allow the sup implementation to combine some small edge case iterations in // the 2nd loop of the panel-block algorithm (MR) and/or the 2nd loop of the // block-panel algorithm (NR) with the last full iteration that precedes it. // NOTE: These cpp macros need to be explicitly set to an integer since they // are used at compile-time to create unconditional branches or dead code // regions. 
#define BLIS_ENABLE_SUP_MR_EXT 1 #define BLIS_ENABLE_SUP_NR_EXT 0 #endif // end bli_family_zen.h #endif #ifdef BLIS_FAMILY_EXCAVATOR // begin bli_family_excavator.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 3 #define BLIS_DEFAULT_MC_S 528 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_DEFAULT_MC_D 264 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_DEFAULT_MC_C 264 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #define BLIS_DEFAULT_MC_Z 100 #define BLIS_DEFAULT_KC_Z 320 #define BLIS_DEFAULT_NC_Z 8400 #endif //#endif // end bli_family_excavator.h #endif #ifdef BLIS_FAMILY_STEAMROLLER // begin bli_family_steamroller.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 //#endif // end bli_family_steamroller.h #endif #ifdef BLIS_FAMILY_PILEDRIVER // begin bli_family_piledriver.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MC_S 2016 #define BLIS_DEFAULT_KC_S 128 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DEFAULT_MR_S 16 #define 
BLIS_DEFAULT_NR_S 3 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MC_D 1008 #define BLIS_DEFAULT_KC_D 128 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MC_C 512 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MC_Z 400 #define BLIS_DEFAULT_KC_Z 160 #define BLIS_DEFAULT_NC_Z 8400 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #endif //#endif // end bli_family_piledriver.h #endif #ifdef BLIS_FAMILY_BULLDOZER // begin bli_family_bulldozer.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8_fma4 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x6_fma4 #define BLIS_DEFAULT_MC_D 1080 #define BLIS_DEFAULT_KC_D 120 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 6 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4_fma4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4_fma4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_bulldozer.h #endif // -- ARM families -- #ifdef BLIS_FAMILY_ARM64 #include "bli_family_arm64.h" // skipped #endif #ifdef BLIS_FAMILY_ARM32 #include "bli_family_arm32.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_FAMILY_ARMSVE #include "bli_family_armsve.h" // skipped #endif #ifdef BLIS_FAMILY_A64FX #include 
"bli_family_a64fx.h" // skipped #endif #ifdef BLIS_FAMILY_FIRESTORM #include "bli_family_firestorm.h" // skipped #endif #ifdef BLIS_FAMILY_THUNDERX2 #include "bli_family_thunderx2.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA57 #include "bli_family_cortexa57.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA53 #include "bli_family_cortexa53.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA15 #include "bli_family_cortexa15.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA9 #include "bli_family_cortexa9.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_FAMILY_POWER10 #include "bli_family_power10.h" // skipped #endif #ifdef BLIS_FAMILY_POWER9 #include "bli_family_power9.h" // skipped #endif #ifdef BLIS_FAMILY_POWER7 #include "bli_family_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_FAMILY_BGQ #include "bli_family_bgq.h" // skipped #endif // -- Generic -- #ifdef BLIS_FAMILY_GENERIC // begin bli_family_generic.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_generic.h #endif // // -- kernel set prototypes ---------------------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_KERNELS_SKX // begin bli_kernels_skx.h GEMM_UKR_PROT( float , s, gemm_skx_asm_32x12_l2 ) GEMM_UKR_PROT( float , s, gemm_skx_asm_12x32_l2 ) GEMM_UKR_PROT( double, d, gemm_skx_asm_16x12_l2 ) GEMM_UKR_PROT( double, d, gemm_skx_asm_16x14 ) // end bli_kernels_skx.h #endif #ifdef BLIS_KERNELS_KNL // begin bli_kernels_knl.h GEMM_UKR_PROT( float, s, gemm_knl_asm_24x16 ) GEMM_UKR_PROT( double, d, gemm_knl_asm_24x8 ) PACKM_KER_PROT( float, s, packm_knl_asm_24xk ) PACKM_KER_PROT( float, s, packm_knl_asm_16xk ) PACKM_KER_PROT( double, d, packm_knl_asm_24xk ) PACKM_KER_PROT( double, d, packm_knl_asm_8xk ) // unused: GEMM_UKR_PROT( double, d, gemm_knl_asm_12x16 ) GEMM_UKR_PROT( double, d, gemm_knl_asm_30x8 ) GEMM_UKR_PROT( double, d, gemm_knl_asm_8x24 ) PACKM_KER_PROT( double, d, packm_knl_asm_30xk ) // end bli_kernels_knl.h #endif #ifdef 
BLIS_KERNELS_KNC #include "bli_kernels_knc.h" // skipped #endif #ifdef BLIS_KERNELS_HASWELL // begin bli_kernels_haswell.h // -- level-1m ----------------------------------------------------------------- // packm (asm) PACKM_KER_PROT( float, s, packm_haswell_asm_6xk ) PACKM_KER_PROT( float, s, packm_haswell_asm_16xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_6xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_8xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_3xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_8xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_3xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_4xk ) // -- level-3 ------------------------------------------------------------------ // gemm (asm d6x8) GEMM_UKR_PROT( float, s, gemm_haswell_asm_6x16 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_6x8 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_3x8 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_3x4 ) // gemm (asm d8x6) GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // gemmtrsm_l (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_haswell_asm_6x8 ) // gemmtrsm_u (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 ) // gemm (asm d8x6) //GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) //GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) //GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) //GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // -- level-3 sup -------------------------------------------------------------- // -- single real -- // gemmsup_r GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( float, s, 
gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x2 ) 
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x1 ) 
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16n ) // -- double real -- // gemmsup_r GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( double, d, 
gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8n ) // gemmsup_rd GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( 
double, d, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8n ) // end bli_kernels_haswell.h #endif #ifdef BLIS_KERNELS_SANDYBRIDGE // begin bli_kernels_sandybridge.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_sandybridge_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_asm_4x4 ) // d8x4 (intrinsics) GEMM_UKR_PROT( float, s, gemm_sandybridge_int_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_int_4x4 ) // end bli_kernels_sandybridge.h #endif #ifdef BLIS_KERNELS_PENRYN // begin bli_kernels_penryn.h GEMM_UKR_PROT( float, s, gemm_penryn_asm_8x4 ) GEMM_UKR_PROT( double, d, gemm_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_l_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_u_penryn_asm_4x4 ) // end bli_kernels_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_KERNELS_ZEN2 // begin bli_kernels_zen2.h // -- level-1f -- AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) // end bli_kernels_zen2.h #endif #ifdef BLIS_KERNELS_ZEN // begin bli_kernels_zen.h // -- level-1m -- PACKM_KER_PROT(double, d, packm_8xk_gen_zen) PACKM_KER_PROT(double, d, packm_6xk_gen_zen) PACKM_KER_PROT(double, d, packm_8xk_nn_zen) PACKM_KER_PROT(double, d, 
packm_6xk_nn_zen) // -- level-1v -- // amaxv (intrinsics) AMAXV_KER_PROT( float, s, amaxv_zen_int ) AMAXV_KER_PROT( double, d, amaxv_zen_int ) // axpyv (intrinsics) AXPYV_KER_PROT( float, s, axpyv_zen_int ) AXPYV_KER_PROT( double, d, axpyv_zen_int ) // axpyv (intrinsics unrolled x10) AXPYV_KER_PROT( float, s, axpyv_zen_int10 ) AXPYV_KER_PROT( double, d, axpyv_zen_int10 ) // dotv (intrinsics) DOTV_KER_PROT( float, s, dotv_zen_int ) DOTV_KER_PROT( double, d, dotv_zen_int ) // dotv (intrinsics, unrolled x10) DOTV_KER_PROT( float, s, dotv_zen_int10 ) DOTV_KER_PROT( double, d, dotv_zen_int10 ) // dotxv (intrinsics) DOTXV_KER_PROT( float, s, dotxv_zen_int ) DOTXV_KER_PROT( double, d, dotxv_zen_int ) // scalv (intrinsics) SCALV_KER_PROT( float, s, scalv_zen_int ) SCALV_KER_PROT( double, d, scalv_zen_int ) // scalv (intrinsics unrolled x10) SCALV_KER_PROT( float, s, scalv_zen_int10 ) SCALV_KER_PROT( double, d, scalv_zen_int10 ) SCALV_KER_PROT( scomplex, c, scalv_zen_int10 ) // swapv (intrinsics) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // copyv (intrinsics) COPYV_KER_PROT( float, s, copyv_zen_int ) COPYV_KER_PROT( double, d, copyv_zen_int ) // setv (intrinsics) SETV_KER_PROT(float, s, setv_zen_int) SETV_KER_PROT(double, d, setv_zen_int) // swapv (intrinsics) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // -- level-1f -- // axpyf (intrinsics) AXPYF_KER_PROT( float, s, axpyf_zen_int_8 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_8 ) AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_16x4 ) AXPYF_KER_PROT( scomplex, c, axpyf_zen_int_4 ) // dotxf (intrinsics) DOTXF_KER_PROT( float, s, dotxf_zen_int_8 ) DOTXF_KER_PROT( double, d, dotxf_zen_int_8 ) // -- level-3 sup -------------------------------------------------------------- // gemmsup_rv //GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16 ) GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_zen_asm_5x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_4x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_1x1 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16n ) GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_zen_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x8m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16n) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x2 ) 
// gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x1 ) // end bli_kernels_zen.h #endif //#ifdef BLIS_KERNELS_EXCAVATOR //#include "bli_kernels_excavator.h" //#endif //#ifdef BLIS_KERNELS_STEAMROLLER //#include "bli_kernels_steamroller.h" //#endif #ifdef BLIS_KERNELS_PILEDRIVER // begin bli_kernels_piledriver.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_piledriver_asm_16x3 ) GEMM_UKR_PROT( double, d, gemm_piledriver_asm_8x3 ) GEMM_UKR_PROT( scomplex, c, gemm_piledriver_asm_4x2 ) GEMM_UKR_PROT( dcomplex, z, gemm_piledriver_asm_2x2 ) // end bli_kernels_piledriver.h #endif #ifdef BLIS_KERNELS_BULLDOZER // begin bli_kernels_bulldozer.h GEMM_UKR_PROT( float, s, gemm_bulldozer_asm_8x8_fma4 ) GEMM_UKR_PROT( double, d, gemm_bulldozer_asm_4x6_fma4 ) GEMM_UKR_PROT( scomplex, c, gemm_bulldozer_asm_8x4_fma4 ) GEMM_UKR_PROT( dcomplex, z, gemm_bulldozer_asm_4x4_fma4 ) // end bli_kernels_bulldozer.h #endif // -- ARM architectures -- #ifdef BLIS_KERNELS_ARMSVE #include "bli_kernels_armsve.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV8A #include "bli_kernels_armv8a.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV7A #include "bli_kernels_armv7a.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_KERNELS_POWER10 #include "bli_kernels_power10.h" // skipped #endif #ifdef BLIS_KERNELS_POWER9 #include "bli_kernels_power9.h" // skipped #endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef 
BLIS_KERNELS_BGQ #include "bli_kernels_bgq.h" // skipped #endif #endif // end bli_arch_config.h // begin bli_kernel_macro_defs.h #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H // -- Define default threading parameters -------------------------------------- // -- Conventional (large code path) values -- // These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n // dimensions for the purposes of factorizing the total number of threads into // ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these // macros are used. #ifndef BLIS_THREAD_RATIO_M #define BLIS_THREAD_RATIO_M 1 #endif #ifndef BLIS_THREAD_RATIO_N #define BLIS_THREAD_RATIO_N 1 #endif // These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of // parallelism allowed when performing automatic factorization. See bli_rntm.c // to see how these macros are used. #ifndef BLIS_THREAD_MAX_IR #define BLIS_THREAD_MAX_IR 1 #endif #ifndef BLIS_THREAD_MAX_JR #define BLIS_THREAD_MAX_JR 4 #endif #if 0 // -- Skinny/small possibly-unpacked (sup code path) values -- #ifndef BLIS_THREAD_SUP_RATIO_M #define BLIS_THREAD_SUP_RATIO_M 1 #endif #ifndef BLIS_THREAD_SUP_RATIO_N #define BLIS_THREAD_SUP_RATIO_N 2 #endif #ifndef BLIS_THREAD_SUP_MAX_IR #define BLIS_THREAD_SUP_MAX_IR 1 #endif #ifndef BLIS_THREAD_SUP_MAX_JR #define BLIS_THREAD_SUP_MAX_JR 8 #endif #endif // -- Memory allocation -------------------------------------------------------- // hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with // libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND // was explicitly defined. #ifdef BLIS_DISABLE_MEMKIND #undef BLIS_ENABLE_MEMKIND #endif #ifdef BLIS_ENABLE_MEMKIND #include // skipped #endif // Memory allocation functions. These macros define the three types of // malloc()-style functions, and their free() counterparts: one for each // type of memory to be allocated. 
// NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING // THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc() // and free(): // // void* malloc( size_t size ); // void free( void* p ); // // This allocation function is called to allocate memory for blocks within // BLIS's internal memory pools. #ifndef BLIS_MALLOC_POOL // If use of libmemkind was enabled at configure-time, the default // memory allocation function for memory pools should be hbw_malloc() // instead of malloc(). #ifdef BLIS_ENABLE_MEMKIND #define BLIS_MALLOC_POOL hbw_malloc #else #define BLIS_MALLOC_POOL malloc #endif #endif #ifndef BLIS_FREE_POOL // If use of libmemkind was enabled at configure-time, the default // memory deallocation function for memory pools should be hbw_free() // instead of free(). #ifdef BLIS_ENABLE_MEMKIND #define BLIS_FREE_POOL hbw_free #else #define BLIS_FREE_POOL free #endif #endif // This allocation function is called to allocate memory for internally- // used objects and structures, such as control tree nodes. #ifndef BLIS_MALLOC_INTL #define BLIS_MALLOC_INTL malloc #endif #ifndef BLIS_FREE_INTL #define BLIS_FREE_INTL free #endif // This allocation function is called to allocate memory for objects // created by user-level API functions, such as bli_obj_create(). #ifndef BLIS_MALLOC_USER #define BLIS_MALLOC_USER malloc #endif #ifndef BLIS_FREE_USER #define BLIS_FREE_USER free #endif // -- Other system-related definitions ----------------------------------------- // Size of a virtual memory page. This is used to align blocks within the // memory pools. #ifndef BLIS_PAGE_SIZE #define BLIS_PAGE_SIZE 4096 #endif // The maximum number of named SIMD vector registers available for use. // When configuring with umbrella configuration families, this should be // set to the maximum number of registers across all sub-configurations in // the family. 
#ifndef BLIS_SIMD_MAX_NUM_REGISTERS #define BLIS_SIMD_MAX_NUM_REGISTERS 32 #endif // The maximum size (in bytes) of each SIMD vector. // When configuring with umbrella configuration families, this should be // set to the maximum SIMD size across all sub-configurations in the family. #ifndef BLIS_SIMD_MAX_SIZE #define BLIS_SIMD_MAX_SIZE 64 #endif // Alignment size (in bytes) needed by the instruction set for aligned // SIMD/vector instructions. #ifndef BLIS_SIMD_ALIGN_SIZE #define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_MAX_SIZE #endif // The maximum size in bytes of local stack buffers within macro-kernel // functions. These buffers are usually used to store a temporary copy // of a single microtile. The reason we multiply by 2 is to handle induced // methods, where we use real domain register blocksizes in units of // complex elements. Specifically, the macro-kernels will need this larger // micro-tile footprint, even though the virtual micro-kernels will only // ever be writing to half (real or imaginary part) at a time. #ifndef BLIS_STACK_BUF_MAX_SIZE #define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_MAX_NUM_REGISTERS * \ BLIS_SIMD_MAX_SIZE * 2 ) #endif // Alignment size used to align local stack buffers within macro-kernel // functions. #ifndef BLIS_STACK_BUF_ALIGN_SIZE #define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE #endif // Alignment size used when allocating memory via BLIS_MALLOC_USER. // To disable heap alignment, set this to 1. #ifndef BLIS_HEAP_ADDR_ALIGN_SIZE #define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE #endif // Alignment size used when sizing leading dimensions of memory allocated // via BLIS_MALLOC_USER. #ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE #define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE #endif // Alignment sizes used when allocating blocks to the internal memory // pool, via BLIS_MALLOC_POOL. 
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A #define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B #define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C #define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN #define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE #endif // Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*. #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A #define BLIS_POOL_ADDR_OFFSET_SIZE_A 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B #define BLIS_POOL_ADDR_OFFSET_SIZE_B 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C #define BLIS_POOL_ADDR_OFFSET_SIZE_C 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif #endif // end bli_kernel_macro_defs.h // -- Base operation prototypes -- // begin bli_init.h BLIS_EXPORT_BLIS void bli_init( void ); BLIS_EXPORT_BLIS void bli_finalize( void ); void bli_init_auto( void ); void bli_finalize_auto( void ); void bli_init_apis( void ); void bli_finalize_apis( void ); void bli_init_once( void ); void bli_finalize_once( void ); // end bli_init.h // begin bli_malloc.h // Typedef function pointer types for malloc() and free() substitutes. 
//typedef void* (*malloc_ft) ( size_t size ); //typedef void (*free_ft) ( void* p ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size ); BLIS_EXPORT_BLIS void bli_free_pool( void* p ); #endif void* bli_malloc_intl( size_t size, err_t* r_val ); void* bli_calloc_intl( size_t size, err_t* r_val ); void bli_free_intl( void* p ); BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val ); BLIS_EXPORT_BLIS void bli_free_user( void* p ); // ----------------------------------------------------------------------------- void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val ); void bli_ffree_align( free_ft f, void* p ); void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val ); void bli_ffree_noalign( free_ft f, void* p ); void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size ); void bli_fmalloc_post_check( void* p ); // end bli_malloc.h // begin bli_const.h void bli_const_init( void ); void bli_const_finalize( void ); // end bli_const.h // begin bli_obj.h // begin bli_obj_check.h void bli_obj_create_check( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj ); void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj ); void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_create_scalar_check( num_t dt, obj_t* obj ); void bli_obj_free_check( obj_t* obj ); void bli_obj_create_const_check( double value, obj_t* obj ); void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b ); void bli_dt_size_check( num_t dt ); void bli_dt_string_check( num_t dt ); void bli_dt_union_check( num_t dt1, num_t dt2 ); void bli_obj_print_check( char* label, obj_t* obj ); // end bli_obj_check.h BLIS_EXPORT_BLIS void bli_obj_create ( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj 
); BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer ( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_without_buffer ( num_t dt, dim_t m, dim_t n, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_alloc_buffer ( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_attach_buffer ( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1 ( num_t dt, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer ( num_t dt, void* p, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_conf_to ( obj_t* s, obj_t* d ); BLIS_EXPORT_BLIS void bli_obj_free ( obj_t* obj ); void bli_adjust_strides ( dim_t m, dim_t n, siz_t elem_size, inc_t* rs, inc_t* cs, inc_t* is ); BLIS_EXPORT_BLIS siz_t bli_dt_size ( num_t dt ); BLIS_EXPORT_BLIS char* bli_dt_string ( num_t dt ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult ( dim_t dim, dim_t dim_mult ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size ( dim_t dim, siz_t elem_size, siz_t align_size ); BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size ( void* p, size_t align_size ); BLIS_EXPORT_BLIS void bli_obj_print ( char* label, obj_t* obj ); // end bli_obj.h // begin bli_obj_scalar.h BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached ( num_t dt, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of ( num_t dt, conj_t conj, obj_t* alpha, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_detach ( obj_t* a, obj_t* alpha ); BLIS_EXPORT_BLIS void bli_obj_scalar_attach ( conj_t conj, obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to ( num_t dt, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar ( obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_reset ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_equals ( obj_t* a, obj_t* beta ); // end bli_obj_scalar.h // begin bli_blksz.h // 
blksz_t query BLIS_INLINE dim_t bli_blksz_get_def ( num_t dt, blksz_t* b ) { return b->v[ dt ]; } BLIS_INLINE dim_t bli_blksz_get_max ( num_t dt, blksz_t* b ) { return b->e[ dt ]; } // blksz_t modification BLIS_INLINE void bli_blksz_set_def ( dim_t val, num_t dt, blksz_t* b ) { b->v[ dt ] = val; } BLIS_INLINE void bli_blksz_set_max ( dim_t val, num_t dt, blksz_t* b ) { b->e[ dt ] = val; } BLIS_INLINE void bli_blksz_copy ( blksz_t* b_src, blksz_t* b_dst ) { *b_dst = *b_src; } BLIS_INLINE void bli_blksz_copy_if_pos ( blksz_t* b_src, blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that // we can skip the ones that are non-positive. const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); } BLIS_INLINE void bli_blksz_copy_def_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); bli_blksz_set_def( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_max_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); 
bli_blksz_set_max( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_scale_def ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_def( dt, b ); bli_blksz_set_def( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_max( dt, b ); bli_blksz_set_max( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_def_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { bli_blksz_scale_def( num, den, dt, b ); bli_blksz_scale_max( num, den, dt, b ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS blksz_t* bli_blksz_create ( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_easy ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z ); BLIS_EXPORT_BLIS void bli_blksz_free ( blksz_t* b ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); #endif void bli_blksz_reduce_def_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); void bli_blksz_reduce_max_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); // 
----------------------------------------------------------------------------- dim_t bli_determine_blocksize ( dir_t direct, dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_b ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); dim_t bli_determine_blocksize_b_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); // end bli_blksz.h // begin bli_func.h // ----------------------------------------------------------------------------- // func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); // end bli_func.h // begin bli_mbool.h // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // 
----------------------------------------------------------------------------- mbool_t* bli_mbool_create ( bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_init ( mbool_t* b, bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_free( mbool_t* b ); // end bli_mbool.h // begin bli_cntx.h #ifndef BLIS_CNTX_H #define BLIS_CNTX_H // Context object type (defined in bli_type_defs.h) // ----------------------------------------------------------------------------- // // -- cntx_t query (fields only) ----------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx ) { return cntx->blkszs; } BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) { return cntx->bmults; } BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) { return cntx->l3_vir_ukrs; } BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs; } BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs_prefs; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) { return cntx->l3_sup_thresh; } BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) { return cntx->l3_sup_blkszs; } BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) { return cntx->l3_sup_kers; } BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) { return cntx->l3_sup_kers_prefs; } BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) { return cntx->l1f_kers; } BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) { return cntx->l1v_kers; } BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) { return cntx->packm_kers; } BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) { return cntx->unpackm_kers; } BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) { return cntx->method; } // 
----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* 
bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* 
cntx ) { func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested packm func_t if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* funcs = bli_cntx_packm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the packm func_t (and then extract the // datatype-specific function pointer) if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested unpackm func_t if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the unpackm func_t (and then extract the // datatype-specific function pointer) if the unpackm kernel being // requested is one that is explicitly supported. 
if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. 
// NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. // NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. 
return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } #if 0 // NOTE: These static functions aren't needed yet. BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { const num_t dt = bli_obj_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); } #endif // ----------------------------------------------------------------------------- // // -- cntx_t modification (complex) -------------------------------------------- // // NOTE: The framework does not use any of the following functions. We provide // them in order to facilitate creating/modifying custom contexts. 
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif // end bli_rntm.h // begin bli_gks.h #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* 
bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif // end bli_gks.h // begin bli_ind.h #ifndef BLIS_IND_H #define BLIS_IND_H // level-3 induced method management // begin bli_l3_ind.h #ifndef BLIS_L3_IND_H #define BLIS_L3_IND_H // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- //bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ); void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ); void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ); bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ); #endif // end bli_l3_ind.h void bli_ind_init( void ); void bli_ind_finalize( void ); BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t 
method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); char* bli_ind_get_impl_string( ind_t method ); num_t bli_ind_map_cdt_to_index( num_t dt ); #endif // end bli_ind.h // begin bli_pba.h #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H // Packing block allocator (formerly memory broker) // pba init //BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ // bli_pthread_mutex_init( &(pba->mutex), NULL ); //} //BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ // bli_pthread_mutex_destroy( &(pba->mutex) ); //} // pba query BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { return &(pba->pools[ pool_index ]); } BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { return pba->align_size; } BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { return pba->malloc_fp; } BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { return pba->free_fp; } // pba modification BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { pba->align_size = align_size; } BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { pba->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { pba->free_fp = free_fp; } // pba action BLIS_INLINE void bli_pba_lock( pba_t* pba ) { bli_pthread_mutex_lock( &(pba->mutex) ); } BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( cntx_t* cntx ); void bli_pba_finalize ( void ); 
void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif // end bli_pba.h // begin bli_pool.h #ifndef BLIS_POOL_H #define BLIS_POOL_H // -- Pool block type -- // -- Pool type -- // Pool block query BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) { return pblk->buf; } BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) { return pblk->block_size; } // Pool block modification BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk ) { pblk->buf = buf; } BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) { pblk->block_size = block_size; } // // -- pool block initialization ------------------------------------------------ // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the pblk_t type definition. An alternative to the initializer is // calling bli_pblk_clear() at runtime. 
#define BLIS_PBLK_INITIALIZER \ { \ .buf = NULL, \ .block_size = 0, \ } \ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); bli_pblk_set_block_size( 0, pblk ); } // Pool entry query BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) { return pool->block_size; } BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) { return pool->align_size; } BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) { return pool->offset_size; } BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) { return pool->malloc_fp; } BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) { return pool->free_fp; } BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); } // Pool entry modification BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ { pool->block_size = block_size; } BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ { pool->align_size = align_size; } BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ { pool->offset_size = offset_size; } BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ { pool->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* 
pool ) \ { pool->free_fp = free_fp; } BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } // ----------------------------------------------------------------------------- void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ); void bli_pool_finalize ( pool_t* restrict pool ); void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ); void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ); void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ); void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ); void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ); void bli_pool_print ( pool_t* restrict pool ); void bli_pblk_print ( pblk_t* restrict pblk ); #endif // end bli_pool.h // begin bli_array.h #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // 
----------------------------------------------------------------------------- void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ); void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ); void bli_array_finalize ( array_t* restrict array ); void* bli_array_elem ( const siz_t index, array_t* restrict array ); void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ); #endif // end bli_array.h // begin bli_apool.h #ifndef BLIS_APOOL_H #define BLIS_APOOL_H // -- Locked pool-of-arrays type -- // apool entry query BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool ) { return &(apool->pool); } BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) { return &(apool->mutex); } BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) { return pool->def_array_len; } BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) { pool_t* restrict pool = bli_apool_pool( apool ); return bli_pool_is_exhausted( pool ); } // apool action BLIS_INLINE void bli_apool_lock( apool_t* apool ) { bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); } BLIS_INLINE void bli_apool_unlock( apool_t* apool ) { bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); } // apool entry modification BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ { pool->def_array_len = def_array_len; } // ----------------------------------------------------------------------------- void bli_apool_init ( apool_t* restrict apool ); void bli_apool_finalize ( apool_t* restrict apool ); array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ); void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ); pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ); void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool ); void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ); void bli_apool_free_block 
( array_t* restrict array ); #endif // end bli_apool.h // begin bli_sba.h #ifndef BLIS_SBA_H #define BLIS_SBA_H apool_t* bli_sba_query( void ); // ----------------------------------------------------------------------------- void bli_sba_init( void ); void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( const siz_t n_threads ); void bli_sba_checkin_array ( array_t* restrict array ); void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm ); void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size ); void bli_sba_release ( rntm_t* restrict rntm, void* restrict block ); #endif // end bli_sba.h // begin bli_memsys.h #ifndef BLIS_MEMSYS_H #define BLIS_MEMSYS_H // ----------------------------------------------------------------------------- void bli_memsys_init( void ); void bli_memsys_finalize( void ); #endif // end bli_memsys.h // begin bli_mem.h #ifndef BLIS_MEM_H #define BLIS_MEM_H // mem_t object type (defined in bli_type_defs.h) // // -- mem_t query -------------------------------------------------------------- // BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) { return &(mem->pblk); } BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) { return bli_pblk_buf( bli_mem_pblk( mem ) ); } BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) { return mem->buf_type; } BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) { return mem->pool; } BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) { return mem->size; } BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); } // // -- mem_t modification ------------------------------------------------------- // BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem ) { mem->pblk = *pblk; } BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem ) { bli_pblk_set_buf( buf, &(mem->pblk) ); } BLIS_INLINE void bli_mem_set_buf_type( packbuf_t 
buf_type, mem_t* mem ) { mem->buf_type = buf_type; } BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem ) { mem->pool = pool; } BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; } // // -- mem_t initialization ----------------------------------------------------- // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the mem_t type definition. An alternative to the initializer is // calling bli_mem_clear() at runtime. #define BLIS_MEM_INITIALIZER \ { \ .pblk = BLIS_PBLK_INITIALIZER, \ .buf_type = -1, \ .pool = NULL, \ .size = 0, \ } \ BLIS_INLINE void bli_mem_clear( mem_t* mem ) { bli_mem_set_buffer( NULL, mem ); #ifdef __cplusplus const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE; // When using C++, which is strongly typed, we avoid use of -1 as a // packbuf_t value since it will result in a compile-time error. bli_mem_set_buf_type( pb, mem ); #else bli_mem_set_buf_type( ( packbuf_t )-1, mem ); #endif bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); } #endif // end bli_mem.h // begin bli_part.h // begin bli_part_check.h void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); // end bli_part_check.h // -- Matrix partitioning ------------------------------------------------------ BLIS_EXPORT_BLIS void bli_acquire_mpart ( dim_t i, dim_t j, dim_t m, dim_t n, obj_t* obj, obj_t* sub_obj ); #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) GENPROT( acquire_mpart_b2t ) GENPROT( acquire_mpart_l2r ) GENPROT( acquire_mpart_r2l ) GENPROT( acquire_mpart_tl2br ) GENPROT( 
acquire_mpart_br2tl ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ dir_t direct, \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_mdim ) GENPROT( acquire_mpart_ndim ) GENPROT( acquire_mpart_mndim ) // -- Vector partitioning ------------------------------------------------------ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_vpart_f2b ) GENPROT( acquire_vpart_b2f ) // -- Scalar acquisition ------------------------------------------------------- BLIS_EXPORT_BLIS void bli_acquire_mij ( dim_t i, dim_t j, obj_t* obj, obj_t* sub_obj ); BLIS_EXPORT_BLIS void bli_acquire_vi ( dim_t i, obj_t* obj, obj_t* sub_obj ); // end bli_part.h // begin bli_prune.h void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ); // end bli_prune.h // begin bli_query.h BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a ); // end bli_query.h // begin bli_auxinfo.h #ifndef BLIS_AUXINFO_MACRO_DEFS_H #define BLIS_AUXINFO_MACRO_DEFS_H // auxinfo_t field query BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) { return ai->schema_a; } BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) { return ai->schema_b; } BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) { return ai->a_next; } BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) { return ai->b_next; } BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) { return ai->is_a; } BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) { return ai->is_b; } BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) { return ai->ps_a; } BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) { return ai->ps_b; } BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) { 
return ai->ukr; } BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) { return ai->params; } // auxinfo_t field modification BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai ) { ai->schema_a = schema; } BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) { ai->schema_b = schema; } BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) { ai->a_next = p; } BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) { ai->b_next = p; } BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; } BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai ) { ai->is_a = is; } BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) { ai->is_b = is; } BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai ) { ai->ps_a = ps; } BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) { ai->ps_b = ps; } BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { ai->ukr = ukr; } BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) { ai->params = params; } #endif // end bli_auxinfo.h // begin bli_param_map.h // --- BLIS to BLAS/LAPACK mappings -------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval ); // --- BLAS/LAPACK to BLIS mappings -------------------------------------------- // NOTE: These static functions were converted from regular functions in order // to reduce function call overhead within the BLAS compatibility layer. 
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( 
subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); err_t bli_check_valid_cntl( void* cntl ); err_t bli_check_packm_schema_on_unpack( obj_t* a ); err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); // end bli_check.h // begin bli_error.h BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); void bli_print_msg( char* str, char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); char* bli_error_string_for_code( gint_t code ); // end bli_error.h // begin bli_f2c.h // f2c.h -- Standard Fortran to C header file // barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." // - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) #ifndef BLIS_F2C_H #define BLIS_F2C_H typedef f77_int bla_integer; typedef f77_char bla_character; //typedef char *address; //typedef short int shortint; typedef float bla_real; typedef double bla_double; typedef scomplex bla_scomplex; typedef dcomplex bla_dcomplex; typedef f77_int bla_logical; //typedef short int shortlogical; //typedef char logical1; //typedef char integer1; #ifdef INTEGER_STAR_8 // Adjust for integer*8. 
// (Still inside the INTEGER_STAR_8 conditional that precedes this point.)
// 64-bit integer types plus bit set/clear macros that operate on them.
typedef long long longint; // system-dependent
typedef unsigned long long ulongint; // system-dependent
#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b)))
#define qbit_set(a,b) ((a) | ((ulongint)1 << (b)))
#endif

// Each macro below is defined only if the including code has not already
// defined a macro of the same name.
#ifndef TRUE_
#define TRUE_ (1)
#endif

#ifndef FALSE_
#define FALSE_ (0)
#endif

// Extern is for use with -E
#ifndef Extern
#define Extern extern
#endif

// I/O stuff

// Both branches currently declare ftnlen as bla_integer; the alternative
// widths survive only as commented-out typedefs.
#ifdef f2c_i2
// for -i2
//typedef short flag;
//typedef short ftnlen;
typedef bla_integer ftnlen;
//typedef short ftnint;
#else
//typedef long int flag;
//typedef long int ftnlen;
typedef bla_integer ftnlen;
//typedef long int ftnint;
#endif

#ifndef VOID
#define VOID void
#endif

// Function-like helper macros. Arguments are evaluated more than once, so
// they must not have side effects.
//
// NOTE(review): f2c_dabs/f2c_dmin/f2c_dmax cast to `doublereal` and
// bit_clear/bit_set cast to `uinteger`. Neither name is declared by the
// typedefs of this header (its double type is `bla_double`); confirm they
// are provided elsewhere before using these five macros.
#ifndef f2c_abs
#define f2c_abs(x) ((x) >= 0 ? (x) : -(x))
#endif
#ifndef f2c_dabs
#define f2c_dabs(x) (doublereal)f2c_abs(x)
#endif
#ifndef f2c_min
#define f2c_min(a,b) ((a) <= (b) ? (a) : (b))
#endif
#ifndef f2c_max
#define f2c_max(a,b) ((a) >= (b) ? (a) : (b))
#endif
#ifndef f2c_dmin
#define f2c_dmin(a,b) (doublereal)f2c_min(a,b)
#endif
#ifndef f2c_dmax
#define f2c_dmax(a,b) (doublereal)f2c_max(a,b)
#endif

#ifndef bit_test
#define bit_test(a,b) ((a) >> (b) & 1)
#endif

#ifndef bit_clear
#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b)))
#endif

#ifndef bit_set
#define bit_set(a,b) ((a) | ((uinteger)1 << (b)))
#endif

// undef any lower-case symbols that your C compiler predefines, e.g.:
// (define Skip_f2c_Undefs before including to keep them.)
#ifndef Skip_f2c_Undefs
#undef cray
#undef gcos
#undef mc68010
#undef mc68020
#undef mips
#undef pdp11
#undef sgi
#undef sparc
#undef sun
#undef sun2
#undef sun3
#undef sun4
#undef u370
#undef u3b
#undef u3b2
#undef u3b5
#undef unix
#undef vax
#endif

#endif
// end bli_f2c.h

// begin bli_machval.h

// begin bli_lsame.h
// Takes two character pointers plus an ftnlen length for each.
bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len );
// end bli_lsame.h
// begin bli_slamch.h
// Single-precision query keyed by the character at cmach.
bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len );
// end bli_slamch.h
// begin bli_dlamch.h
// Double-precision query keyed by the character at cmach.
bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len );
// end bli_dlamch.h

//
// Prototype object-based interface.
//

BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v );


//
// Prototype BLAS-like interfaces.
//

// GENTPROTR is (re)defined to emit one prototype named via PASTEMAC(chv,opname)
// and is instantiated by INSERT_GENTPROTR_BASIC0, which is defined outside
// this excerpt.
#undef GENTPROTR
#define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \
     ( \
       machval_t mval, \
       void*     v  \
     );

INSERT_GENTPROTR_BASIC0( machval )

// end bli_machval.h
// begin bli_getopt.h

// Caller-owned parser state passed to bli_getopt(); the field names match
// the POSIX getopt() globals of the same names.
typedef struct getopt_s
{
	char* optarg;
	int   optind;
	int   opterr;
	int   optopt;
} getopt_t;

BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state );

BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state );

// end bli_getopt.h
// begin bli_opid.h

// True when opid lies in the closed range [BLIS_GEMM, BLIS_TRSM].
// NOTE(review): this presumes the level-3 opid_t enumerators are contiguous
// between those two values; the enum is defined outside this excerpt.
BLIS_INLINE bool bli_opid_is_level3( opid_t opid )
{
	return ( bool )
	       ( BLIS_GEMM <= opid && opid <= BLIS_TRSM );
}

// end bli_opid.h
// begin bli_cntl.h



// -- Control tree prototypes --

// Node creation, release and reset. Functions that take an rntm_t* receive
// it as their first argument.
BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node
     (
       rntm_t* rntm,
       opid_t  family,
       bszid_t bszid,
       void_fp var_func,
       void*   params,
       cntl_t* sub_node
     );

BLIS_EXPORT_BLIS void bli_cntl_free_node
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

BLIS_EXPORT_BLIS void bli_cntl_clear_node
     (
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// Tree-level release (with or without a thrinfo_t), copy, and family marking.
BLIS_EXPORT_BLIS void bli_cntl_free
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo
     (
       rntm_t*    rntm,
       cntl_t*    cntl
     );

BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

BLIS_EXPORT_BLIS void bli_cntl_mark_family
     (
       opid_t  family,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// Not exported (no BLIS_EXPORT_BLIS attribute).
dim_t bli_cntl_calc_num_threads_in
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// cntl_t query (fields only)

// Return the node's `family` field.
BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl )
{
	return cntl->family;
}
// Field accessors for cntl_t. Each returns the named member unchanged and
// dereferences cntl without a NULL check.

BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl )
{
	return cntl->bszid;
}

BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl )
{
	return cntl->var_func;
}

BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl )
{
	return cntl->sub_prenode;
}

BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl )
{
	return cntl->sub_node;
}

BLIS_INLINE void* bli_cntl_params( cntl_t* cntl )
{
	return cntl->params;
}

// Return the uint64_t stored at the very start of the params structure.
// cntl->params is dereferenced, so it must point at a valid structure.
BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl )
{
	// The first 64 bits (one uint64_t) of the params structure always
	// hold the size of that structure.
	return *( ( uint64_t* )(cntl->params) );
}

// Return the ADDRESS of the embedded pack_mem member (not a copy).
BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl )
{
	return &(cntl->pack_mem);
}

// cntl_t query (complex)

// True when the pointer itself is NULL.
BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl )
{
	return ( bool )
	       ( cntl == NULL );
}

// True when the node has no sub_node. cntl itself must be non-NULL.
BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl )
{
	return ( bool )
	       ( bli_cntl_sub_node( cntl ) == NULL );
}

// True when the node's bszid is anything other than BLIS_NO_PART.
BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl )
{
	return ( bool )
	       ( bli_cntl_bszid( cntl ) != BLIS_NO_PART );
}

// cntl_t modification

// Setters take the new value first and the node second.

BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl )
{
	cntl->family = family;
}

BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl )
{
	cntl->bszid = bszid;
}

BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl )
{
	cntl->var_func = var_func;
}

BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl )
{
	cntl->sub_prenode = sub_prenode;
}

BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl )
{
	cntl->sub_node = sub_node;
}

BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl )
{
	cntl->params = params;
}

// Copies the mem_t BY VALUE into the node; the node does not keep the
// pack_mem pointer that was passed in.
BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl )
{
	cntl->pack_mem = *pack_mem;
}

// end bli_cntl.h
// begin bli_env.h

#ifndef BLIS_ENV_H
#define BLIS_ENV_H

// Takes a variable name and a fallback value; returns a gint_t.
// NOTE(review): presumably the fallback is returned when the variable is
// unset -- the implementation is outside this excerpt.
gint_t bli_env_get_var( const char* env, gint_t fallback );
//void bli_env_set_var( const char* env, dim_t value );

#endif

// end bli_env.h
// begin bli_pack.h

#ifndef BLIS_PACK_H
#define BLIS_PACK_H
// (Body of the BLIS_PACK_H include guard.)
// Init/finalize are not exported; the pack_a/pack_b getters write through a
// bool pointer and the setters take a bool by value.
void bli_pack_init( void );
void bli_pack_finalize( void );

BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a );
BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b );
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a );
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b );

void bli_pack_init_rntm_from_env( rntm_t* rntm );

#endif

// end bli_pack.h
// begin bli_info.h



// -- General library information ----------------------------------------------

// String-returning queries (char*); none takes an argument.
BLIS_EXPORT_BLIS char* bli_info_get_version_str( void );
BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void );


// -- General configuration-related --------------------------------------------

// Integer-returning queries (gint_t); none takes an argument.
BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void );
// Build-option queries; each returns a gint_t and takes no argument.
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void );


// -- Kernel implementation-related --------------------------------------------


// -- Level-3 kernel definitions --

// Micro-kernel implementation names, selected by (method, datatype).
BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt );


// -- BLIS implementation query (level-3) --------------------------------------

// Operation implementation names, selected by datatype only.
BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm3_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt );

// end bli_info.h
// begin bli_arch.h

#ifndef BLIS_ARCH_H
#define BLIS_ARCH_H

// Architecture id query/selection and logging. Only bli_arch_query_id() and
// bli_arch_string() carry the export attribute.
BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void );

void   bli_arch_set_id_once( void );
void   bli_arch_set_id( void );

BLIS_EXPORT_BLIS char*  bli_arch_string( arch_t id );

void   bli_arch_set_logging( bool dolog );
bool   bli_arch_get_logging( void );
// Variadic; the first argument is a char* (presumably a printf-style
// format string -- TODO confirm against the implementation).
void   bli_arch_log( char*, ... );

#endif

// end bli_arch.h
// begin bli_cpuid.h

// Disabled block: stand-in definitions that are never compiled as written.
#if 0
  // Used only during standalone testing of ARM support.
  #define FALSE 0
  #define TRUE 1
  typedef enum
  {
	BLIS_ARCH_CORTEXA57 = 10,
	BLIS_ARCH_CORTEXA15 = 11,
	BLIS_ARCH_CORTEXA9  = 12,
	BLIS_ARCH_GENERIC   = 13
  } arch_t;
  typedef uint64_t bool;
  #define bli_abort abort
#endif

#ifndef BLIS_CPUID_H
#define BLIS_CPUID_H

arch_t   bli_cpuid_query_id( void );

// Per-microarchitecture predicates. The x86 ones take (family, model,
// features); the ARM ones take (model, part, features).

// Intel
bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features );

// AMD
bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features );

// ARM
bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );

// Writes family, model and features through its three pointers and returns
// a uint32_t.
uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features );

// -----------------------------------------------------------------------------

//
// This section of the file was based off of cpuid.hpp from TBLIS [1].
//
// [1] https://github.com/devinamatthews/tblis
//

// True iff every bit set in `want` is also set in `have`.
BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want )
{
	return ( have & want ) == want;
}

// -----------------------------------------------------------------------------

#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)

// cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393
// for more information why this move was made.
//#include "cpuid.h"

// (x86 branch of the architecture conditional.)
void get_cpu_name( char *cpu_name );
int  vpu_count( void );

enum
{
	VENDOR_INTEL = 0,
	VENDOR_AMD,
	VENDOR_UNKNOWN
};
// Each feature is a distinct single bit, so values can be OR-ed into one
// uint32_t mask.
enum
{
	FEATURE_SSE3     = 0x0001,
	FEATURE_SSSE3    = 0x0002,
	FEATURE_SSE41    = 0x0004,
	FEATURE_SSE42    = 0x0008,
	FEATURE_AVX      = 0x0010,
	FEATURE_AVX2     = 0x0020,
	FEATURE_FMA3     = 0x0040,
	FEATURE_FMA4     = 0x0080,
	FEATURE_AVX512F  = 0x0100,
	FEATURE_AVX512DQ = 0x0200,
	FEATURE_AVX512PF = 0x0400,
	FEATURE_AVX512ER = 0x0800,
	FEATURE_AVX512CD = 0x1000,
	FEATURE_AVX512BW = 0x2000,
	FEATURE_AVX512VL = 0x4000
};

#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)

// (ARM branch. VENDOR_UNKNOWN exists in both branches but with a different
// numeric value: 2 on x86, 1 here.)
char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath );

enum
{
	VENDOR_ARM = 0,
	VENDOR_UNKNOWN
};
enum
{
	MODEL_ARMV7 = 0,
	MODEL_ARMV8,
	MODEL_UNKNOWN
};
enum
{
	FEATURE_NEON = 0x01,
	FEATURE_SVE  = 0x02
};

#endif



#endif

// end bli_cpuid.h
// begin bli_string.h

void bli_string_mkupper( char* s );

// end bli_string.h
// begin bli_setgetijm.h

// Element set/get for a matrix. The object-based forms return err_t; the
// typed forms are generated by redefining GENTPROT and instantiating it via
// INSERT_GENTPROT_BASIC0 (defined outside this excerpt). The element value
// travels as a (real, imaginary) pair of doubles: ar, ai.

BLIS_EXPORT_BLIS err_t bli_setijm
     (
       double  ar,
       double  ai,
       dim_t   i,
       dim_t   j,
       obj_t*  b
     );

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double         ar, \
       double         ai, \
       dim_t          i, \
       dim_t          j, \
       void* restrict b, inc_t rs, inc_t cs \
     );

INSERT_GENTPROT_BASIC0( setijm )

// -----------------------------------------------------------------------------

BLIS_EXPORT_BLIS err_t bli_getijm
      (
        dim_t   i,
        dim_t   j,
        obj_t*  b,
        double* ar,
        double* ai
      );

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       dim_t          i, \
       dim_t          j, \
       void* restrict b, inc_t rs, inc_t cs, \
       double*        ar, \
       double*        ai  \
     );

INSERT_GENTPROT_BASIC0( getijm )

// end bli_setgetijm.h
// begin bli_setgetijv.h

// Element set/get for a vector; same scheme as the matrix versions but with
// a single index and a single stride.

BLIS_EXPORT_BLIS err_t bli_setijv
     (
       double  ar,
       double  ai,
       dim_t   i,
       obj_t*  x
     );

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double         ar, \
       double         ai, \
       dim_t          i, \
       void* restrict x, inc_t incx \
     );

INSERT_GENTPROT_BASIC0( setijv )

// -----------------------------------------------------------------------------

BLIS_EXPORT_BLIS err_t bli_getijv
      (
        dim_t   i,
        obj_t*  x,
        double* ar,
        double* ai
      );

// NOTE(review): the vector operand is named `b` in the typed prototype below
// but `x` in bli_getijv() and in the setijv prototype. This affects naming
// only, not the generated signature.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       dim_t          i, \
       void* restrict b, inc_t incx, \
       double*        ar, \
       double*        ai  \
     );

INSERT_GENTPROT_BASIC0( getijv )

// end bli_setgetijv.h
// begin bli_setri.h

// -- setr ---------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_setrm
     (
       obj_t* alpha,
       obj_t* b
     );

BLIS_EXPORT_BLIS void bli_setrv
     (
       obj_t* alpha,
       obj_t* x
     );

// -- seti ---------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_setim
     (
       obj_t* alpha,
       obj_t* b
     );

BLIS_EXPORT_BLIS void bli_setiv
     (
       obj_t* alpha,
       obj_t* x
     );

// end bli_setri.h
// begin bli_castm.h

//
// Prototype object-based interface.
//

BLIS_EXPORT_BLIS void bli_castm
     (
       obj_t* a,
       obj_t* b
     );

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//

// Two type characters (cha, chb) are pasted into the name via PASTEMAC2;
// the operands themselves are passed as void* with row/column strides.
#undef  GENTPROT2
#define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \
     ( \
       trans_t transa, \
       dim_t   m, \
       dim_t   n, \
       void*   a, inc_t rs_a, inc_t cs_a, \
       void*   b, inc_t rs_b, inc_t cs_b  \
     );

INSERT_GENTPROT2_BASIC0( castm )
INSERT_GENTPROT2_MIXDP0( castm )

//
// Prototype object-based _check() function.
//

void bli_castm_check
     (
       obj_t* a,
       obj_t* b
     );

// end bli_castm.h
// begin bli_castnzm.h

//
// Prototype object-based interface.
//

BLIS_EXPORT_BLIS void bli_castnzm
     (
       obj_t* a,
       obj_t* b
     );

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( addsc ) INSERT_GENTPROT_BASIC0( divsc ) INSERT_GENTPROT_BASIC0( mulsc ) INSERT_GENTPROT_BASIC0( subsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( invertsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTPROTR_BASIC0( absqsc ) INSERT_GENTPROTR_BASIC0( normfsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( sqrtsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTPROT_BASIC0( getsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( setsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTPROTR_BASIC0( unzipsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype_r* zeta_r, \ ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTPROTR_BASIC0( zipsc ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_igetsc ( dim_t* chi, double* zeta_r, double* zeta_i ); BLIS_EXPORT_BLIS void bli_isetsc ( double zeta_r, double zeta_i, dim_t* chi ); // end bli_l0_tapi.h // begin bli_l0_ft.h // // -- Level-0 function types 
// ---------------------------------------------------
//

// Each GENTDEF / GENTDEFR below is a function-pointer typedef whose name is
// built by PASTECH2(ch,opname,tsuf). The parameter lists match the typed
// prototypes declared in the bli_l0_tapi.h section.
// NOTE(review): INSERT_GENTDEF / INSERT_GENTDEFR are defined outside this
// section; they presumably instantiate the typedef once per datatype --
// confirm.

// addsc, divsc, subsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTDEF( addsc )
INSERT_GENTDEF( divsc )
INSERT_GENTDEF( subsc )

// invertsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi \
     );

INSERT_GENTDEF( invertsc )

// mulsc (same parameter list as addsc/divsc/subsc above)

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTDEF( mulsc )

// absqsc

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype_r* absq \
     );

INSERT_GENTDEFR( absqsc )

// normfsc

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype_r* norm \
     );

INSERT_GENTDEFR( normfsc )

// sqrtsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTDEF( sqrtsc )

// getsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       double* zeta_r, \
       double* zeta_i \
     );

INSERT_GENTDEF( getsc )

// setsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       double zeta_r, \
       double zeta_i, \
       ctype* chi \
     );

INSERT_GENTDEF( setsc )

// unzipsc

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype_r* zeta_r, \
       ctype_r* zeta_i \
     );

INSERT_GENTDEFR( unzipsc )

// zipsc

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype_r* zeta_r, \
       ctype_r* zeta_i, \
       ctype* chi \
     );

INSERT_GENTDEFR( zipsc )
// end bli_l0_ft.h

// Generate function pointer arrays for tapi functions.
// begin bli_l0_fpa.h

//
// Prototype function pointer query interface.
//

// For each operation, declares a function named PASTEMAC(opname,_qfp) that
// takes a num_t dt and returns the type named PASTECH(opname,_vft).
// NOTE(review): the _vft types are not declared in this section; presumably
// they come from the INSERT_GENTDEF instantiations above -- confirm.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( absqsc )
GENPROT( normfsc )
GENPROT( addsc )
GENPROT( divsc )
GENPROT( mulsc )
GENPROT( subsc )
GENPROT( invertsc )
GENPROT( sqrtsc )
GENPROT( unzipsc )
GENPROT( zipsc )
GENPROT( getsc )
GENPROT( setsc )
// end bli_l0_fpa.h

// copysc
// begin bli_copysc.h

//
// Prototype object-based interfaces.
//

// Object-based copysc: operands (chi, psi).
#undef GENFRONT
#define GENFRONT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* chi, \
       obj_t* psi \
     );

GENFRONT( copysc )

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//

// Both operands are passed as void*; the function name carries two type
// characters (chx, chy) via PASTEMAC2. ctype_x / ctype_y are accepted by the
// generator but not used in the prototype itself.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \
     ( \
       conj_t conjchi, \
       void* chi, \
       void* psi \
     );

INSERT_GENTPROT2_BASIC0( copysc )
INSERT_GENTPROT2_MIX_D0( copysc )
INSERT_GENTPROT2_MIX_P0( copysc )
// end bli_copysc.h
// end bli_l0.h

// -- Level-1v operations --

// begin bli_l1v.h
// begin bli_l1v_check.h

//
// Prototype object-based check functions.
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h"
// begin bli_l1v_ft_ker.h

#ifndef BLIS_L1V_FT_KER_H
#define BLIS_L1V_FT_KER_H

//
// -- Level-1v kernel function types -------------------------------------------
//

// Each GENTDEF below is a function-pointer typedef named by
// PASTECH3(ch,opname,_ker,tsuf). Every kernel signature ends with a
// cntx_t* cntx parameter, and all pointer operands are restrict-qualified.
// NOTE(review): INSERT_GENTDEF is defined outside this section; it presumably
// instantiates the typedef once per datatype -- confirm.

// addv, copyv, subv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( addv )
INSERT_GENTDEF( copyv )
INSERT_GENTDEF( subv )

// amaxv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       dim_t* restrict index, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( amaxv )

// axpbyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpbyv )

// axpyv, scal2v

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyv )
INSERT_GENTDEF( scal2v )

// dotv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotv )

// dotxv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict beta, \
       ctype* restrict rho, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxv )

// invertv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( invertv )

// scalv, setv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjalpha, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( scalv )
INSERT_GENTDEF( setv )

// swapv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( swapv )

// xpbyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( xpbyv )

#endif
// end bli_l1v_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets it be appended directly after the
// last ordinary parameter.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
// Expert mode: the declaration block expands to nothing.
#undef BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS
// end bli_oapi_ex.h

// begin bli_l1v_oapi.h

//
// Prototype object-based interfaces.
//

// Each prototype's name is PASTEMAC(opname,EX_SUF) and its parameter list is
// followed by BLIS_OAPI_EX_PARAMS. In this pass both are supplied by the
// bli_oapi_ex.h section that precedes it (EX_SUF = BLIS_OAPI_EX_SUF; extra
// cntx_t* cntx, rntm_t* rntm parameters).

// Operands: (x, y).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )

// Operands: (x, index).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* index \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( amaxv )

// Operands: (alpha, x, beta, y).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpbyv )

// Operands: (alpha, x, y).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )

// Operands: (x, y, rho).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotv )

// Operands: (alpha, x, y, beta, rho).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y, \
       obj_t* beta, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotxv )

// Operand: (x).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( invertv )

// Operands: (alpha, x).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( scalv )
GENTPROT( setv )

// Operands: (x, y).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( swapv )

// Operands: (x, beta, y).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( xpbyv )
// end bli_l1v_oapi.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// begin bli_oapi_ba.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS   cntx_t* cntx = NULL; ( void )cntx; \
                             rntm_t* rntm = NULL; ( void )rntm;
// end bli_oapi_ba.h

// begin bli_l1v_oapi.h

//
// Prototype object-based interfaces.
//

// Second pass over bli_l1v_oapi.h. The text is identical to the first pass;
// what differs is the macro environment set up by the bli_oapi_ba.h section
// that precedes it (EX_SUF and BLIS_OAPI_EX_PARAMS are defined empty), so the
// prototypes carry no suffix and no extra parameters.

// Operands: (x, y).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )

// Operands: (x, index).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* index \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( amaxv )

// Operands: (alpha, x, beta, y).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpbyv )

// Operands: (alpha, x, y).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )

// Operands: (x, y, rho).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotv )

// Operands: (alpha, x, y, beta, rho).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y, \
       obj_t* beta, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotxv )

// Operand: (x).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( invertv )

// Operands: (alpha, x).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( scalv )
GENTPROT( setv )

// Operands: (x, y).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( swapv )

// Operands: (x, beta, y).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( xpbyv )
// end bli_l1v_oapi.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets it be appended directly after the
// last ordinary parameter.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h

// begin bli_l1v_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets the macro be appended directly after
// the last regular parameter of a prototype.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h

// begin bli_l1d_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// With the expert macros defined above in effect, every prototype below ends
// in cntx_t* cntx, rntm_t* rntm and its name is pasted by PASTEMAC2 from ch,
// opname and EX_SUF. GENTPROT / GENTPROTR are redefined before each group of
// operations sharing a parameter list.
// NOTE(review): INSERT_GENTPROT_BASIC0 and INSERT_GENTPROTR_BASIC0 are defined
// outside this section; presumably they instantiate the current template once
// per datatype (ctype, ch) -- confirm against their definitions.
//

// addd, copyd, subd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid: the only template here that takes alpha as ctype_r* rather than
// ctype*, which is why it uses the GENTPROTR form with a second type argument.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//
// Each GENTDEF / GENTDEFR below declares a function-pointer typedef whose
// parameter list mirrors the typed prototype of the same operation above; the
// typedef name is pasted by PASTECH3 from ch, opname, EX_SUF and tsuf.
//

// addd, copyd, subd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//
// This copy of bli_l1d_tapi.h follows the bli_tapi_ba.h section, where EX_SUF
// and BLIS_TAPI_EX_PARAMS are defined as empty, so the prototypes below carry
// no name suffix and no cntx/rntm parameters (the basic typed API).
// GENTPROT / GENTPROTR are redefined before each group of operations sharing
// a parameter list.
// NOTE(review): INSERT_GENTPROT_BASIC0 and INSERT_GENTPROTR_BASIC0 are defined
// outside this section; presumably they instantiate the current template once
// per datatype (ctype, ch) -- confirm against their definitions.
//

// addd, copyd, subd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid: the only template here that takes alpha as ctype_r* rather than
// ctype*, which is why it uses the GENTPROTR form with a second type argument.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//
// Each GENTDEF / GENTDEFR below declares a function-pointer typedef whose
// parameter list mirrors the typed prototype of the same operation above; the
// typedef name is pasted by PASTECH3 from ch, opname, EX_SUF and tsuf.
//

// addd, copyd, subd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1d_fpa.h

//
// Prototype function pointer query interface.
//
// GENPROT( opname ) declares one query function per level-1d operation. Each
// function takes a single num_t argument (dt). Its name is pasted together by
// PASTEMAC2 from opname, BLIS_TAPI_EX_SUF and _qfp, and its return type by
// PASTECH2 from opname, BLIS_TAPI_EX_SUF and _vft.
//

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addd )
GENPROT( copyd )
GENPROT( subd )
GENPROT( axpyd )
GENPROT( scal2d )
GENPROT( invertd )
GENPROT( scald )
GENPROT( setd )
GENPROT( setid )
GENPROT( shiftd )
GENPROT( xpbyd )

// end bli_l1d_fpa.h

// end bli_l1d.h

// -- Level-1f operations --

// begin bli_l1f.h

// begin bli_l1f_check.h

//
// Prototype object-based check functions.
//
// GENTPROT( opname ) is redefined before each level-1f operation because every
// operation here has its own operand list. Every generated function returns
// void, takes only obj_t* operands, and is named by PASTEMAC from opname and
// the _check suffix.
//

// axpy2v
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alphax, \
       obj_t* alphay, \
       obj_t* x, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* y \
     );

GENTPROT( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* xt, \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho, \
       obj_t* z \
     );

GENTPROT( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* at, \
       obj_t* a, \
       obj_t* w, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
     );

GENTPROT( dotxf )

// end bli_l1f_check.h

// Define kernel function types.
// begin bli_l1f_ft_ker.h

// Unlike the API sections around it, this section has an include guard, so
// its typedefs are emitted only once.
#ifndef BLIS_L1F_FT_KER_H
#define BLIS_L1F_FT_KER_H

//
// -- Level-1f kernel function types -------------------------------------------
//
// Each GENTDEF below declares a function-pointer typedef named by PASTECH3
// from ch, opname, _ker and tsuf. In these kernel types every pointer operand
// is restrict-qualified and the trailing parameter is always cntx_t* cntx;
// there is no rntm_t* parameter and no BLIS_TAPI_EX_PARAMS.
//

// axpy2v

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha1, \
       ctype* restrict alpha2, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict w, inc_t incw, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxaxpyf )

#endif

// end bli_l1f_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets the macro be appended directly after
// the last regular parameter of a prototype.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). 
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets the macro be appended directly after
// the last regular parameter of a prototype.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h

// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// With the expert macros defined above in effect, every prototype below ends
// in cntx_t* cntx, rntm_t* rntm and its name is pasted by PASTEMAC2 from ch,
// opname and EX_SUF. GENTPROT is redefined before each operation.
// NOTE(review): INSERT_GENTPROT_BASIC0 is defined outside this section;
// presumably it instantiates the current GENTPROT once per datatype
// (ctype, ch) -- confirm against its definition.
//

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// Each GENTDEF below declares a function-pointer typedef whose parameter types
// mirror the typed prototype of the same operation above; the typedef name is
// pasted by PASTECH3 from ch, opname, EX_SUF and tsuf.
// NOTE(review): a few parameter names differ from the prototypes (axpy2v uses
// alpha1/alpha2 here versus alphax/alphay above; dotaxpyv names its dimension
// m here versus n above). Parameter names do not affect the function type, so
// this is only a naming inconsistency.
//

// axpy2v

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l1f_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
//
// NOTE: BLIS_TAPI_EX_PARAMS is defined empty just above, so the prototypes
// generated in this pass carry no trailing cntx/rntm arguments. Each GENTPROT
// template is instantiated by INSERT_GENTPROT_BASIC0, which is defined
// elsewhere (presumably once per supported datatype -- TODO confirm).
//

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h


//
// -- Level-1f function types --------------------------------------------------
//
// Each GENTDEF template below declares a function pointer type whose name is
// pasted together from ch, opname, EX_SUF and tsuf. The parameter lists mirror
// the typed prototypes of bli_l1f_tapi.h; where a parameter is named
// differently here, the name has no effect on the resulting type.
//

// axpy2v
// NOTE(review): the scalars are named alpha1/alpha2 here but alphax/alphay in
// the typed prototype.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
// NOTE(review): the length is named m here but n in the typed prototype.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1f_fpa.h


//
// Prototype function pointer query interface.
//
// For each operation this declares a query function (suffix _qfp) that takes
// a num_t and returns the operation's expert _vft function pointer type.
//

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( axpy2v )
GENPROT( axpyf )
GENPROT( dotaxpyv )
GENPROT( dotxaxpyf )
GENPROT( dotxf )

// end bli_l1f_fpa.h

// end bli_l1f.h

// -- Level-1m operations --

// begin bli_l1m.h


// begin bli_l1m_check.h


//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h
// begin bli_l1m_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
//
// NOTE: This is the expert pass: EX_SUF is BLIS_TAPI_EX_SUF and
// BLIS_TAPI_EX_PARAMS appends "cntx_t* cntx, rntm_t* rntm" to each prototype.
//

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type template; x uses ctype_x while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h


//
// -- Level-1m function types --------------------------------------------------
//

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym
// NOTE(review): xpbym_md is instantiated from this single-ctype template even
// though its prototype uses two types (ctype_x, ctype_y).

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )

// end bli_l1m_ft.h
// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// begin bli_tapi_ba.h


// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l1m_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
//
// NOTE: This is the basic pass: EX_SUF and BLIS_TAPI_EX_PARAMS are both
// defined empty above, so the same templates now generate unsuffixed
// prototypes without cntx/rntm arguments.
//

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type template; x uses ctype_x while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h


//
// -- Level-1m function types --------------------------------------------------
//

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym
// NOTE(review): xpbym_md is instantiated from this single-ctype template even
// though its prototype uses two types (ctype_x, ctype_y).

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )

// end bli_l1m_ft.h
// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1m_fpa.h


//
// Prototype function pointer query interface.
//
// For each operation this declares a query function (suffix _qfp) that takes
// a num_t and returns the operation's expert _vft function pointer type.
//

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
GENPROT( axpym )
GENPROT( scal2m )
GENPROT( scalm )
GENPROT( setm )
GENPROT( xpbym )

// xpbym_md: the query function (suffix _qfp2) takes two num_t arguments,
// dtx and dty.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );

GENPROT( xpbym_md )

// end bli_l1m_fpa.h

// Prototype level-1m implementations.
// begin bli_l1m_unb_var1.h


//
// Prototype BLAS-like interfaces with typed operands.
//
// The _unb_var1 prototypes below are not exported (no BLIS_EXPORT_BLIS) and
// always take explicit cntx_t* and rntm_t* arguments.
//

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type template; x uses ctype_x while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
void PASTEMAC3(chx,chy,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_unb_var1.h

// Pack-related
// begin bli_packm.h


// begin bli_packm_alloc.h


// Both allocators return void*; the _ex form additionally takes the
// packbuf_t buffer type explicitly.
BLIS_EXPORT_BLIS void* bli_packm_alloc
     (
       siz_t size_needed,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void* bli_packm_alloc_ex
     (
       siz_t size_needed,
       packbuf_t pack_buf_type,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_packm_alloc.h
// begin bli_packm_cntl.h


// Parameter block for packm; the accessors below read it through
// cntl->params.
struct packm_params_s
{
    uint64_t size; // size field must be present and come first.
    bszid_t bmid_m;
    bszid_t bmid_n;
    bool does_invert_diag;
    bool rev_iter_if_upper;
    bool rev_iter_if_lower;
    pack_t pack_schema;
    packbuf_t pack_buf_type;
};
typedef struct packm_params_s packm_params_t;

// Field accessors: each casts cntl->params to packm_params_t* and returns the
// correspondingly named field. No NULL check is performed on cntl or params.

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m;
}

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n;
}

BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower;
}

BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema;
}

BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type;
}

// -----------------------------------------------------------------------------

// Takes one argument per packm_params_s field (other than size), plus the
// variant function and a sub-node.
cntl_t* bli_packm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       bszid_t bmid_m,
       bszid_t bmid_n,
       bool does_invert_diag,
       bool rev_iter_if_upper,
       bool rev_iter_if_lower,
       pack_t pack_schema,
       packbuf_t pack_buf_type,
       cntl_t* sub_node
     );

// end bli_packm_cntl.h
// begin bli_packm_check.h


void bli_packm_init_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

void bli_packm_int_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

// end bli_packm_check.h
// begin bli_packm_init.h


// NOTE(review): returns bool; the meaning of the result is not visible in
// this header -- confirm against bli_packm_init.c.
BLIS_EXPORT_BLIS bool bli_packm_init
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_packm_init.h
// begin bli_packm_int.h


void bli_packm_int
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_int.h
// begin bli_packm_scalar.h


BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p );

// end bli_packm_scalar.h

// begin bli_packm_part.h


// -- Matrix partitioning ------------------------------------------------------

void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
                                  dim_t i,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
                                  dim_t j,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_tl2br( subpart_t requested_part,
                                    dim_t ij,
                                    dim_t b,
                                    obj_t* obj,
                                    obj_t* sub_obj );

dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p );

// end bli_packm_part.h

// begin bli_packm_struc_cxk.h


// NOTE(review): unlike the _1er prototypes further below, this parameter list
// ends at cntx and has no trailing "void* params".
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_struc_cxk )
INSERT_GENTPROT_BASIC0( packm_herm_cxk )
INSERT_GENTPROT_BASIC0( packm_tri_cxk )

// end bli_packm_struc_cxk.h
// begin bli_packm_struc_cxk_1er.h


// The template receives ctype_r and chr but the prototype itself only uses
// ctype and ch.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er )

// end bli_packm_struc_cxk_1er.h

// begin bli_packm_cxk.h


#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_cxk )

// end bli_packm_cxk.h
// begin bli_packm_cxk_1er.h


// Same parameter list as packm_cxk above; ctype_r and chr are unused by the
// prototype.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROTCO_BASIC0( packm_cxk_1er )

// end bli_packm_cxk_1er.h

// Mixed datatype support.
#ifdef BLIS_ENABLE_GEMM_MD
// begin bli_packm_struc_cxk_md.h
// Mixed-datatype packing prototypes; only declared when BLIS_ENABLE_GEMM_MD
// is defined. The template takes two element types: ctype_c for the source
// matrix c and ctype_p for kappa and the packed destination p.
// NOTE(review): PASTEMAC2 and the INSERT_GENTPROT2_* macros are defined
// elsewhere; presumably they instantiate the prototype for same-type and
// mixed-precision datatype pairs -- confirm.
#undef GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
  ( \
    struc_t strucc, \
    diag_t diagc, \
    uplo_t uploc, \
    conj_t conjc, \
    pack_t schema, \
    bool invdiag, \
    dim_t panel_dim, \
    dim_t panel_len, \
    dim_t panel_dim_max, \
    dim_t panel_len_max, \
    dim_t panel_dim_off, \
    dim_t panel_len_off, \
    ctype_p* restrict kappa, \
    ctype_c* restrict c, inc_t incc, inc_t ldc, \
    ctype_p* restrict p, inc_t ldp, \
    inc_t is_p, \
    cntx_t* cntx, \
    void* params \
  );
INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )
// Second template: the 1e/1r mixed-datatype helpers take only a conjugation
// flag, the m x n dimensions, kappa, source a and destination p (no context).
#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
  ( \
    conj_t conja, \
    dim_t m, \
    dim_t n, \
    ctype_p* restrict kappa, \
    ctype_a* restrict a, inc_t inca, inc_t lda, \
    ctype_p* restrict p, inc_t ldp \
  );
INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )
INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )
// end bli_packm_struc_cxk_md.h
#endif
// begin bli_packm_blk_var1.h
//
// packm params types.
//
// Holds a square table of packm kernel function pointers, one entry per pair
// of floating-point type ids.
typedef struct
{
	// Indexed as [type of C][type of P].
	packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;
//
// Prototype object-based interfaces.
//
BLIS_EXPORT_BLIS void bli_packm_blk_var1 ( obj_t* c, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* t );
// end bli_packm_blk_var1.h
// end bli_packm.h
// begin bli_unpackm.h
// begin bli_unpackm_cntl.h
// Parameter record stored in an unpackm control-tree node.
struct unpackm_params_s
{
	uint64_t size; // size field must be present and come first.
	unpackm_var_oft var_func;
};
typedef struct unpackm_params_s unpackm_params_t;
// Accessor: casts cntl->params to unpackm_params_t* and reads var_func.
// The argument is parenthesised, so any pointer expression may be passed.
#define bli_cntl_unpackm_params_var_func( cntl ) \
\
	( ( (unpackm_params_t*)(cntl)->params )->var_func )
// -----------------------------------------------------------------------------
cntl_t* bli_unpackm_cntl_create_node ( rntm_t* rntm, void_fp var_func, void_fp unpackm_var_func, cntl_t* sub_node );
// end bli_unpackm_cntl.h
// begin bli_unpackm_check.h
void bli_unpackm_int_check ( obj_t* p, obj_t* a, cntx_t* cntx );
// end bli_unpackm_check.h
// begin bli_unpackm_int.h
void bli_unpackm_int ( obj_t* p, obj_t* a, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread );
// end bli_unpackm_int.h
// begin bli_unpackm_blk_var1.h
// Object-based entry point, followed by the typed prototype template.
void bli_unpackm_blk_var1 ( obj_t* p, obj_t* c, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread );
// The typed variant takes p and c as untyped `void*` buffers, even though
// the template is instantiated per datatype via INSERT_GENTPROT_BASIC0.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
  ( \
    struc_t strucc, \
    doff_t diagoffc, \
    diag_t diagc, \
    uplo_t uploc, \
    trans_t transc, \
    dim_t m, \
    dim_t n, \
    dim_t m_panel, \
    dim_t n_panel, \
    void* p, inc_t rs_p, inc_t cs_p, \
    dim_t pd_p, inc_t ps_p, \
    void* c, inc_t rs_c, inc_t cs_c, \
    cntx_t* cntx \
  );
INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )
// end bli_unpackm_blk_var1.h
// begin bli_unpackm_cxk.h
// Prototype template for unpackm_cxk: packed source p (one stride) and
// destination a (two strides), scaled by kappa.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
  ( \
    conj_t conjp, \
    dim_t panel_dim, \
    dim_t panel_len, \
    ctype* kappa, \
    ctype* p, inc_t ldp, \
    ctype* a, inc_t inca, inc_t lda, \
    cntx_t* cntx \
  );
INSERT_GENTPROT_BASIC0( unpackm_cxk )
// end bli_unpackm_cxk.h
// end bli_unpackm.h
// end bli_l1m.h
// -- Level-2 operations --
// begin bli_l2.h
// begin bli_l2_check.h
//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//
// Expert pass over bli_l2_tapi.h: bli_tapi_ex.h, just before this point, set
// EX_SUF to BLIS_TAPI_EX_SUF and BLIS_TAPI_EX_PARAMS to
// ",cntx_t* cntx, rntm_t* rntm", so each typed prototype below gets the
// suffix and the two trailing expert parameters.
// NOTE(review): PASTEMAC2 and the INSERT_GENTPROT*_BASIC0 macros are defined
// elsewhere; presumably one prototype is emitted per datatype -- confirm.

// gemv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
  ( \
    trans_t transa, \
    conj_t conjx, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* x, inc_t incx, \
    ctype* beta, \
    ctype* y, inc_t incy \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTPROT_BASIC0( gemv )
// ger
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
  ( \
    conj_t conjx, \
    conj_t conjy, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* x, inc_t incx, \
    ctype* y, inc_t incy, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTPROT_BASIC0( ger )
// hemv, symv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
  ( \
    uplo_t uploa, \
    conj_t conja, \
    conj_t conjx, \
    dim_t m, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* x, inc_t incx, \
    ctype* beta, \
    ctype* y, inc_t incy \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )
// her: alpha is passed as ctype_r*, unlike syr below which takes ctype*.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
  ( \
    uplo_t uploa, \
    conj_t conjx, \
    dim_t m, \
    ctype_r* alpha, \
    ctype* x, inc_t incx, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTPROTR_BASIC0( her )
// syr
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
  ( \
    uplo_t uploa, \
    conj_t conjx, \
    dim_t m, \
    ctype* alpha, \
    ctype* x, inc_t incx, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTPROT_BASIC0( syr )
// her2, syr2
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
  ( \
    uplo_t uploa, \
    conj_t conjx, \
    conj_t conjy, \
    dim_t m, \
    ctype* alpha, \
    ctype* x, inc_t incx, \
    ctype* y, inc_t incy, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )
// trmv, trsv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
  ( \
    uplo_t uploa, \
    trans_t transa, \
    diag_t diaga, \
    dim_t m, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* x, inc_t incx \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )
// end bli_l2_tapi.h
// begin bli_l2_ft.h
//
// -- Level-2 function types ---------------------------------------------------
//
// Function-pointer typedefs whose signatures mirror the typed prototypes
// above. Each typedef name is PASTECH3(ch,opname,EX_SUF,tsuf), so it picks
// up whichever EX_SUF is active (the expert suffix on this pass).

// gemv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
  ( \
    trans_t transa, \
    conj_t conjx, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* x, inc_t incx, \
    ctype* beta, \
    ctype* y, inc_t incy \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTDEF( gemv )
// ger
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
  ( \
    conj_t conjx, \
    conj_t conjy, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* x, inc_t incx, \
    ctype* y, inc_t incy, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTDEF( ger )
// hemv, symv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
  ( \
    uplo_t uploa, \
    conj_t conja, \
    conj_t conjx, \
    dim_t m, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* x, inc_t incx, \
    ctype* beta, \
    ctype* y, inc_t incy \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )
// her
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
  ( \
    uplo_t uploa, \
    conj_t conjx, \
    dim_t m, \
    ctype_r* alpha, \
    ctype* x, inc_t incx, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTDEFR( her )
// syr
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
  ( \
    uplo_t uploa, \
    conj_t conjx, \
    dim_t m, \
    ctype* alpha, \
    ctype* x, inc_t incx, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTDEF( syr )
// her2, syr2
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
  ( \
    uplo_t uploa, \
    conj_t conjx, \
    conj_t conjy, \
    dim_t m, \
    ctype* alpha, \
    ctype* x, inc_t incx, \
    ctype* y, inc_t incy, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )
// trmv, trsv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
  ( \
    uplo_t uploa, \
    trans_t transa, \
    diag_t diaga, \
    dim_t m, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* x, inc_t incx \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )
// end bli_l2_ft.h
// begin bli_xapi_undef.h
// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.
// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT
// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF
// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS
// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h
// begin bli_tapi_ba.h
// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC
// Define the macro to omit a suffix from the function names (in function
// definitions).
// Basic interface: EX_SUF expands to nothing.
#undef EX_SUF
#define EX_SUF
// Define the macro to omit expert arguments from function signatures
// and prototypes. BLIS_TAPI_EX_PARAMS expands to nothing here.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS
// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l2_tapi.h
//
// Prototype BLAS-like interfaces with typed operands.
//
// Basic pass over bli_l2_tapi.h: with EX_SUF and BLIS_TAPI_EX_PARAMS both
// empty (see above), the typed prototypes below carry no suffix and no
// expert parameters.
// NOTE(review): PASTEMAC2 and the INSERT_GENTPROT*_BASIC0 macros are defined
// elsewhere; presumably one prototype is emitted per datatype -- confirm.

// gemv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
  ( \
    trans_t transa, \
    conj_t conjx, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* x, inc_t incx, \
    ctype* beta, \
    ctype* y, inc_t incy \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTPROT_BASIC0( gemv )
// ger
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
  ( \
    conj_t conjx, \
    conj_t conjy, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* x, inc_t incx, \
    ctype* y, inc_t incy, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTPROT_BASIC0( ger )
// hemv, symv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
  ( \
    uplo_t uploa, \
    conj_t conja, \
    conj_t conjx, \
    dim_t m, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* x, inc_t incx, \
    ctype* beta, \
    ctype* y, inc_t incy \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )
// her: alpha is passed as ctype_r*, unlike syr below which takes ctype*.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
  ( \
    uplo_t uploa, \
    conj_t conjx, \
    dim_t m, \
    ctype_r* alpha, \
    ctype* x, inc_t incx, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTPROTR_BASIC0( her )
// syr
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
  ( \
    uplo_t uploa, \
    conj_t conjx, \
    dim_t m, \
    ctype* alpha, \
    ctype* x, inc_t incx, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTPROT_BASIC0( syr )
// her2, syr2
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
  ( \
    uplo_t uploa, \
    conj_t conjx, \
    conj_t conjy, \
    dim_t m, \
    ctype* alpha, \
    ctype* x, inc_t incx, \
    ctype* y, inc_t incy, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )
// trmv, trsv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
  ( \
    uplo_t uploa, \
    trans_t transa, \
    diag_t diaga, \
    dim_t m, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* x, inc_t incx \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )
// end bli_l2_tapi.h
// begin bli_l2_ft.h
//
// -- Level-2 function types ---------------------------------------------------
//
// Function-pointer typedefs whose signatures mirror the typed prototypes
// above. Each typedef name is PASTECH3(ch,opname,EX_SUF,tsuf); EX_SUF is
// empty on this pass.

// gemv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
  ( \
    trans_t transa, \
    conj_t conjx, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* x, inc_t incx, \
    ctype* beta, \
    ctype* y, inc_t incy \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTDEF( gemv )
// ger
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
  ( \
    conj_t conjx, \
    conj_t conjy, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* x, inc_t incx, \
    ctype* y, inc_t incy, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTDEF( ger )
// hemv, symv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
  ( \
    uplo_t uploa, \
    conj_t conja, \
    conj_t conjx, \
    dim_t m, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* x, inc_t incx, \
    ctype* beta, \
    ctype* y, inc_t incy \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )
// her
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
  ( \
    uplo_t uploa, \
    conj_t conjx, \
    dim_t m, \
    ctype_r* alpha, \
    ctype* x, inc_t incx, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTDEFR( her )
// syr
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
  ( \
    uplo_t uploa, \
    conj_t conjx, \
    dim_t m, \
    ctype* alpha, \
    ctype* x, inc_t incx, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTDEF( syr )
// her2, syr2
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
  ( \
    uplo_t uploa, \
    conj_t conjx, \
    conj_t conjy, \
    dim_t m, \
    ctype* alpha, \
    ctype* x, inc_t incx, \
    ctype* y, inc_t incy, \
    ctype* a, inc_t rs_a, inc_t cs_a \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )
// trmv, trsv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
  ( \
    uplo_t uploa, \
    trans_t transa, \
    diag_t diaga, \
    dim_t m, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* x, inc_t incx \
    BLIS_TAPI_EX_PARAMS \
  );
INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )
// end bli_l2_ft.h
// begin bli_xapi_undef.h
// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.
// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT
// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF
// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
//
// bli_l3_cntl_create_if() receives an original control tree (cntl_orig) and
// an output slot (cntl_use); bli_l3_cntl_free() takes the cntl_use pointer
// back together with the runtime and thread info.
//

void bli_l3_cntl_create_if
     (
       opid_t family,
       pack_t schema_a,
       pack_t schema_b,
       obj_t* a,
       obj_t* b,
       obj_t* c,
       rntm_t* rntm,
       cntl_t* cntl_orig,
       cntl_t** cntl_use
     );

void bli_l3_cntl_free
     (
       rntm_t* rntm,
       cntl_t* cntl_use,
       thrinfo_t* thread
     );

// end bli_l3_cntl.h
// begin bli_l3_check.h

//
// Prototype object-based check functions.
//
// GENPROT is redefined four times below, once per parameter-list shape. Each
// use declares PASTEMAC(opname,_check) for the listed operation.
//

// Shape 1: alpha, a, b, beta, c.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )

// Shape 2: as shape 1 with a leading side argument.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )

// Shape 3: alpha, a, beta, c (no b operand).
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( herk )
GENPROT( syrk )

// Shape 4: side, alpha, a, b (no beta or c operand).
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       cntx_t* cntx \
     );

GENPROT( trmm )
GENPROT( trsm )

// -----------------------------------------------------------------------------

// Explicitly named "basic check" prototypes. Note that the herk and her2k
// versions take extra operands (ah, respectively bh and ah) compared with
// the corresponding _check prototypes generated above.

void bli_gemm_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_gemmt_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_hemm_basic_check
     (
       side_t side,
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_herk_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_her2k_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* bh,
       obj_t* b,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_l3_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

// end bli_l3_check.h
// begin bli_l3_int.h

void bli_l3_int
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_int.h
// begin bli_l3_packab.h

// bli_l3_packa() and bli_l3_packb() share one parameter list, which is also
// the list used by the _var_oft function-pointer type declared further down.

void bli_l3_packa
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

void bli_l3_packb
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_packab.h

// Define function types.
//#include "bli_l3_ft_ex.h"
// begin bli_l3_ft_ukr.h

#ifndef BLIS_L3_FT_UKR_H
#define BLIS_L3_FT_UKR_H

//
// -- Level-3 micro-kernel function types --------------------------------------
//
// Each GENTDEF below declares a function-pointer typedef whose name is built
// by PASTECH3(ch,opname,_ukr,tsuf). INSERT_GENTDEF is defined outside this
// chunk; presumably it supplies the ctype/ch/tsuf arguments per datatype.
//

// gemm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict beta, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemm )

// gemmtrsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a1x, \
       ctype* restrict a11, \
       ctype* restrict bx1, \
       ctype* restrict b11, \
       ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemmtrsm )

// trsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( trsm )

#endif
// end bli_l3_ft_ukr.h
// begin bli_l3_oft.h

#ifndef BLIS_L3_OFT_H
#define BLIS_L3_OFT_H

//
// -- Level-3 object function types --------------------------------------------
//
// Each GENTDEF( opname ) below declares the function-pointer typedef
// PASTECH(opname,_oft). The parameter lists mirror the expert object API
// shapes (operands followed by cntx and rntm).
//

// gemm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( gemm )
GENTDEF( gemmt )
GENTDEF( her2k )
GENTDEF( syr2k )

// hemm, symm, trmm3

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( hemm )
GENTDEF( symm )
GENTDEF( trmm3 )

// herk, syrk

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( herk )
GENTDEF( syrk )

// trmm, trsm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( trmm )
GENTDEF( trsm )

#endif
// end bli_l3_oft.h
// begin bli_l3_oft_var.h

#ifndef BLIS_L3_OFT_VAR_H
#define BLIS_L3_OFT_VAR_H

//
// -- Level-3 variant function types -------------------------------------------
//
// GENTDEF( l3 ) declares the single function-pointer typedef
// PASTECH(l3,_var_oft).
//

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
( \
  obj_t* a, \
  obj_t* b, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm, \
  cntl_t* cntl, \
  thrinfo_t* thread \
);

GENTDEF( l3 )

#endif
// end bli_l3_oft_var.h
// begin bli_l3_blocksize.h

// Generic kc query; unlike the per-operation versions generated below, it
// also receives the control tree node.
dim_t bli_l3_determine_kc
     (
       dir_t direct,
       dim_t i,
       dim_t dim,
       obj_t* a,
       obj_t* b,
       bszid_t bszid,
       cntx_t* cntx,
       cntl_t* cntl
     );

// Per-operation kc queries that take an explicit direction argument.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dir_t direct, \
       dim_t i, \
       dim_t dim, \
       obj_t* a, \
       obj_t* b, \
       bszid_t bszid, \
       cntx_t* cntx \
     );

GENPROT( gemm_determine_kc )
GENPROT( gemmt_determine_kc )
GENPROT( trmm_determine_kc )
GENPROT( trsm_determine_kc )

// Direction-specific (_f / _b suffixed) kc queries; same parameters minus
// the direction argument.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t i, \
       dim_t dim, \
       obj_t* a, \
       obj_t* b, \
       bszid_t bszid, \
       cntx_t* cntx \
     );

GENPROT( gemm_determine_kc_f )
GENPROT( gemm_determine_kc_b )

GENPROT( gemmt_determine_kc_f )
GENPROT( gemmt_determine_kc_b )

GENPROT( trmm_determine_kc_f )
GENPROT( trmm_determine_kc_b )

GENPROT( trsm_determine_kc_f )
GENPROT( trsm_determine_kc_b )

// end bli_l3_blocksize.h
// begin bli_l3_direct.h

dir_t bli_l3_direct
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// Per-operation direction queries; same operands without the control tree.
#undef GENPROT
#define GENPROT( opname ) \
\
dir_t PASTEMAC0(opname) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm_direct )
GENPROT( gemmt_direct )
GENPROT( trmm_direct )
GENPROT( trsm_direct )

// end bli_l3_direct.h
// begin bli_l3_prune.h

// Generic pruning prototypes, one per dimension name (m, n, k); these take
// the control tree node.
#undef GENPROT
#define GENPROT( dim ) \
\
void PASTEMAC(l3_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c, \
       cntl_t* cntl \
     );

GENPROT( m )
GENPROT( n )
GENPROT( k )

// -----------------------------------------------------------------------------

// Per-operation pruning prototypes, one per (operation, dimension) pair.
#undef GENPROT
#define GENPROT( opname, dim ) \
\
void PASTEMAC2(opname,_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm, m )
GENPROT( gemm, n )
GENPROT( gemm, k )

GENPROT( gemmt, m )
GENPROT( gemmt, n )
GENPROT( gemmt, k )

GENPROT( trmm, m )
GENPROT( trmm, n )
GENPROT( trmm, k )

GENPROT( trsm, m )
GENPROT( trsm, n )
GENPROT( trsm, k )

// end bli_l3_prune.h
// begin bli_l3_schema.h

void bli_l3_set_schemas
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx
     );

// end bli_l3_schema.h

// Prototype object APIs (basic and expert).
// begin bli_l3_oapi.h

//
// Prototype object-based interfaces (basic).
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h

//
// Prototype BLAS-like interfaces with typed operands (basic).
//
// Each macro below declares the exported function PASTEMAC(ch,opname).
// Matrices are passed as a raw pointer followed by row and column strides
// (rs_*, cs_*). The GENTPROTR forms use ctype_r for some scalars (herk:
// alpha and beta; her2k: beta only), whereas the GENTPROT forms use ctype
// for every scalar.
//

// gemm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( gemm )

// hemm, symm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       conj_t conja, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )

// herk (alpha and beta are ctype_r)
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype_r* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROTR_BASIC0( herk )

// her2k (only beta is ctype_r)
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROTR_BASIC0( her2k )

// syrk
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( syrk )

// gemmt, syr2k
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )

// trmm3
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( trmm3 )

// trmm, trsm (no beta or c operand)
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi.h
// begin bli_l3_tapi_ex.h

//
// Prototype BLAS-like interfaces with typed operands (expert).
//
// The expert typed prototypes repeat the basic typed parameter lists, build
// the function name with PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF), and append
// cntx and rntm parameters to every prototype.
//

// gemm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemm )

// hemm, symm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       conj_t conja, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )

// herk (alpha and beta are ctype_r)
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype_r* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( herk )

// her2k (only beta is ctype_r)
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( her2k )

// syrk
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( syrk )

// gemmt, syr2k
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )

// trmm3
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm3 )

// trmm, trsm (no beta or c operand)
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi_ex.h

// Define function types for small/unpacked handlers/kernels.
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
// Each GENTPROT alias below selects which prototype template the following
// INSERT_GENTPROT_BASIC0() instantiations expand (both are defined elsewhere).
#undef  GENTPROT
#define GENTPROT GEMM_UKR_PROT

INSERT_GENTPROT_BASIC0( gemm_ukr_name )

#undef  GENTPROT
#define GENTPROT GEMMTRSM_UKR_PROT

INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name )
INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name )

#undef  GENTPROT
#define GENTPROT TRSM_UKR_PROT

INSERT_GENTPROT_BASIC0( trsm_l_ukr_name )
INSERT_GENTPROT_BASIC0( trsm_u_ukr_name )

// end bli_l3_ukr.h
// end bli_gemm_md_c2r_ref.h

// Define a local struct type that makes returning two values easier.
// It pairs two dom_t values: a computation domain and an execution domain.
typedef struct mddm_s
{
	dom_t comp;
	dom_t exec;
} mddm_t;

// Prototype of the mixed-datatype gemm entry point. Note the two context
// arguments: a cntx_t* (cntx_local) and a cntx_t** (cntx).
void   bli_gemm_md
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     );
// One prototype per three-letter case suffix; each returns an mddm_t pair.
mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );

// -----------------------------------------------------------------------------

void bli_gemm_md_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

void bli_gemm_md_zgemm
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// Returns TRUE only when c is complex, a and b are both real, and the
// execution domain of c is BLIS_REAL; returns FALSE otherwise.
BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c )
{
	bool r_val = FALSE;

	// NOTE: The last conditional subexpression is necessary if/when we
	// allow the user to specify the computation domain. (The computation
	// domain is currently ignored, but once it is honored as a user-
	// settable value, it will affect the execution domain, which is what
	// is checked below. Until then, the last expression is not actually
	// necessary since crr is already unconditionally associated with an
	// execution domain of BLIS_REAL.)
	if ( bli_obj_is_complex( c ) &&
	     bli_obj_is_real( a )    &&
	     bli_obj_is_real( b )    &&
	     bli_obj_exec_domain( c ) == BLIS_REAL )
		r_val = TRUE;

	return r_val;
}

// Returns TRUE only when c and a are complex, b is real, and the execution
// domain of c is BLIS_COMPLEX; returns FALSE otherwise.
BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c )
{
	bool r_val = FALSE;

	// NOTE: The last conditional subexpression is necessary if/when we
	// allow the user to specify the computation domain. (The computation
	// domain is currently ignored, but once it is honored as a user-
	// settable value, it will affect the execution domain, which is what
	// is checked below. Until then, the last expression is not actually
	// necessary since ccr is already unconditionally associated with an
	// execution domain of BLIS_COMPLEX.)
	if ( bli_obj_is_complex( c ) &&
	     bli_obj_is_complex( a ) &&
	     bli_obj_is_real( b )    &&
	     bli_obj_exec_domain( c ) == BLIS_COMPLEX )
		r_val = TRUE;

	return r_val;
}

// Returns TRUE only when c and b are complex, a is real, and the execution
// domain of c is BLIS_COMPLEX; returns FALSE otherwise.
BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c )
{
	bool r_val = FALSE;

	// NOTE: The last conditional subexpression is necessary if/when we
	// allow the user to specify the computation domain. (The computation
	// domain is currently ignored, but once it is honored as a user-
	// settable value, it will affect the execution domain, which is what
	// is checked below. Until then, the last expression is not actually
	// necessary since crc is already unconditionally associated with an
	// execution domain of BLIS_COMPLEX.)
	if ( bli_obj_is_complex( c ) &&
	     bli_obj_is_real( a )    &&
	     bli_obj_is_complex( b ) &&
	     bli_obj_exec_domain( c ) == BLIS_COMPLEX )
		r_val = TRUE;

	return r_val;
}

// -----------------------------------------------------------------------------

// Adjusts, in place, the datatype, dimension and stride values passed by
// pointer, based on the real/complex domains of *dt_c, dt_a and dt_b.
// Three combinations are handled (rcc, crc, ccr, naming the domains of
// C, A, B in that order); any other combination leaves every value untouched.
//
// dt_a and dt_b are read-only (passed by value). c is used only to detach its
// scalar and to compare its storage precision with its computation precision.
BLIS_INLINE void bli_gemm_md_ker_var2_recast
     (
       num_t* dt_comp,
       num_t  dt_a,
       num_t  dt_b,
       num_t* dt_c,
       dim_t* m,
       dim_t* n,
       dim_t* k,
       inc_t* pd_a, inc_t* ps_a,
       inc_t* pd_b, inc_t* ps_b,
       obj_t* c,
       inc_t* rs_c, inc_t* cs_c
     )
{
	if      ( bli_is_real( *dt_c )    &&
	          bli_is_complex( dt_a ) &&
	          bli_is_complex( dt_b ) )
	{
		// The rcc case is executed with a real macrokernel, so we need to
		// double the k dimension (because both A and B are packed to the 1r
		// schema), and also the panel strides of A and B since they were
		// packed as complex matrices and we now need to convert them to
		// units of real elements.
		*k *= 2;
		*ps_a *= 2;
		*ps_b *= 2;
	}
	else if ( bli_is_complex( *dt_c ) &&
	          bli_is_real( dt_a )    &&
	          bli_is_complex( dt_b ) )
	{
		// NOTE: the `else` that selects the fallback block sits inside the
		// #if 1 region. If that region were disabled, the brace block after
		// the #endif would run unconditionally for this case.
#if 1
		obj_t beta;

		bli_obj_scalar_detach( c, &beta );

		if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&
		     bli_obj_imag_is_zero( &beta ) &&
		     bli_is_row_stored( *rs_c, *cs_c ) &&
		     bli_obj_prec( c ) == bli_obj_comp_prec( c ) )
		{
			// If beta is real (zero imaginary part), and C is row-stored,
			// and the computation precision is equal to the storage
			// precision of C, we can use the real macrokernel (and real
			// microkernel, which is already stored to the real virtual
			// microkernel slots of the context) instead of the complex
			// macrokernel and c2r virtual microkernel.
			*dt_comp = bli_dt_proj_to_real( *dt_comp );
			*dt_c    = bli_dt_proj_to_real( *dt_c );
			*n *= 2;
			*pd_b *= 2; *ps_b *= 2;
			*rs_c *= 2;
		}
		else
#endif
		{
			// Generally speaking, the crc case is executed with a complex
			// macrokernel, so we need to halve the panel stride of A (which
			// is real) since the macrokernel will perform the pointer
			// arithmetic in units of complex elements.
			*ps_a /= 2;
		}
	}
	else if ( bli_is_complex( *dt_c ) &&
	          bli_is_complex( dt_a ) &&
	          bli_is_real( dt_b ) )
	{
		// Mirror image of the crc case above, with the roles of A/m/cs_c and
		// B/n/rs_c exchanged and column storage required instead of row.
#if 1
		obj_t beta;

		bli_obj_scalar_detach( c, &beta );

		if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&
		     bli_obj_imag_is_zero( &beta ) &&
		     bli_is_col_stored( *rs_c, *cs_c ) &&
		     bli_obj_prec( c ) == bli_obj_comp_prec( c ) )
		{
			// If beta is real (zero imaginary part), and C is column-stored,
			// and the computation precision is equal to the storage
			// precision of C, we can use the real macrokernel (and real
			// microkernel, which is already stored to the real virtual
			// microkernel slots of the context) instead of the complex
			// macrokernel and c2r virtual microkernel.
			*dt_comp = bli_dt_proj_to_real( *dt_comp );
			*dt_c    = bli_dt_proj_to_real( *dt_c );
			*m *= 2;
			*pd_a *= 2; *ps_a *= 2;
			*cs_c *= 2;
		}
		else
#endif
		{
			// Generally speaking, the ccr case is executed with a complex
			// macrokernel, so we need to halve the panel stride of B (which
			// is real) since the macrokernel will perform the pointer
			// arithmetic in units of complex elements.
			*ps_b /= 2;
		}
	}
	// NOTE(review): the disabled branches below pass dt_c (a pointer) to the
	// domain tests, whereas the live branches above pass *dt_c. They would
	// need that fixed before being re-enabled.
#if 0
	else if ( bli_is_real( dt_c ) &&
	          bli_is_real( dt_a ) &&
	          bli_is_real( dt_b ) )
	{
		// No action needed.
//printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k );
	}
	else if ( bli_is_complex( dt_c ) &&
	          bli_is_real( dt_a ) &&
	          bli_is_real( dt_b ) )
	{
		// No action needed.
	}
	else if ( bli_is_real( dt_c ) &&
	          bli_is_complex( dt_a ) &&
	          bli_is_real( dt_b ) )
	{
		// No action needed.
	}
	else if ( bli_is_real( dt_c ) &&
	          bli_is_real( dt_a ) &&
	          bli_is_complex( dt_b ) )
	{
		// No action needed.
	}
#endif
}

// end bli_gemm_md.h
#endif

// end bli_gemm.h
// begin bli_hemm.h
// begin bli_hemm_front.h

// Object-based hemm front-end prototype. Same arguments as the symm and
// trmm3 front-ends: a side selector followed by the gemm-style operands.
void bli_hemm_front
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );
// end bli_hemm_front.h

// end bli_hemm.h
// begin bli_symm.h
// begin bli_symm_front.h

void bli_symm_front
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );
// end bli_symm_front.h

// end bli_symm.h
// begin bli_trmm.h
// begin bli_trmm_front.h

// trmm front-end prototype: no beta and no c operand.
void bli_trmm_front
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// end bli_trmm_front.h

// begin bli_trmm_var.h


//
// Prototype object-based interfaces.
//

#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       cntl_t* cntl, \
       thrinfo_t* thread  \
     );

// The three blocked trmm variants are not prototyped (left commented out).
//GENPROT( trmm_blk_var1 )
//GENPROT( trmm_blk_var2 )
//GENPROT( trmm_blk_var3 )

GENPROT( trmm_xx_ker_var2 )

GENPROT( trmm_ll_ker_var2 )
GENPROT( trmm_lu_ker_var2 )
GENPROT( trmm_rl_ker_var2 )
GENPROT( trmm_ru_ker_var2 )


//
// Prototype BLAS-like interfaces with void pointer operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
//

// Typed gemmt macrokernel template. Compared with the trmm/trsm templates,
// a and b each carry an additional stride (is_a / is_b) and the diagonal
// offset is named diagoffc.
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffc, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread  \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

// end bli_gemmt_var.h

// end bli_gemmt.h
// end bli_l3.h

// -- Utility operations --

// begin bli_util.h
// begin bli_util_check.h


//
// Prototype object-based check functions.
//

// Each GENPROT below is redefined for one signature shape and then
// instantiated for the operations sharing that shape. Every generated
// function is named by pasting the operation name with "_check".
#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  asum  \
     );

GENPROT( asumv )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x  \
     );

GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm  \
     );

GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )


// Same signature as the vector-norm template just above.
#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm  \
     );

GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x  \
     );

GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  scale, \
       obj_t*  sumsq  \
     );

GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// NOTE(review): this one template is named GENTPROT (one-argument form)
// while its neighbours use GENPROT. It is redefined right here before use,
// so the expansion is still well-formed.
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  chi, \
       obj_t*  psi, \
       bool*   is_eq  \
     );

GENTPROT( eqsc )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqv )
GENPROT( eqm )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// -----------------------------------------------------------------------------

// Hand-written (non-templated) check prototypes. Definitions are elsewhere.
void bli_utilv_xi_check
     (
       obj_t*  x,
       obj_t*  index
     );

void bli_utilv_xa_check
     (
       obj_t*  x,
       obj_t*  asum
     );

void bli_utilm_mkhst_check
     (
       obj_t*  a
     );

void bli_utilv_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_fprint_check
     (
       FILE*  file,
       char*  s1,
       obj_t* x,
       char*  format,
       char*  s2
     );

void bli_utilm_rand_check
     (
       obj_t*  x
     );

void bli_utilv_sumsqv_check
     (
       obj_t*  x,
       obj_t*  scale,
       obj_t*  sumsq
     );

// end bli_util_check.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The expansion starts with a comma so it can be appended directly after the
// last regular parameter.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_util_oapi.h


//
// Prototype object-based interfaces.
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes in this guarded region carry neither EX_SUF nor
// BLIS_OAPI_EX_PARAMS, and are only emitted when BLIS_OAPI_BASIC is defined.
#ifdef BLIS_OAPI_BASIC

#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )


#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*   file, \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )


// Same as the fprint* template above, minus the FILE* argument.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h
// begin bli_oapi_ba.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes in this guarded region carry neither EX_SUF nor
// BLIS_OAPI_EX_PARAMS, and are only emitted when BLIS_OAPI_BASIC is defined.
#ifdef BLIS_OAPI_BASIC

#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )


#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*   file, \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )


// Same as the fprint* template above, minus the FILE* argument.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h


// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The expansion starts with a comma so it can be appended directly after the
// last regular parameter.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_util_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
//

// Templates in this section name each function by pasting ch, the operation
// name and EX_SUF, and append BLIS_TAPI_EX_PARAMS after the last regular
// parameter. The GENTPROTR forms additionally use a second type parameter
// (ctype_r) for their norm/asum/scale/sumsq outputs.
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( asumv )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( mkherm )
INSERT_GENTPROT_BASIC0( mksymm )
INSERT_GENTPROT_BASIC0( mktrim )


#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( norm1v )
INSERT_GENTPROTR_BASIC0( normfv )
INSERT_GENTPROTR_BASIC0( normiv )


#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( norm1m )
INSERT_GENTPROTR_BASIC0( normfm )
INSERT_GENTPROTR_BASIC0( normim )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( randv )
INSERT_GENTPROT_BASIC0( randnv )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( randm )
INSERT_GENTPROT_BASIC0( randnm )


#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.

// The prototypes in this guarded region carry neither EX_SUF nor
// BLIS_TAPI_EX_PARAMS, and are only emitted when BLIS_TAPI_BASIC is defined.
#ifdef BLIS_TAPI_BASIC

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqsc )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqv )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqm )


// The two print templates take x as void* and are instantiated through
// INSERT_GENTPROT_BASIC0_I rather than INSERT_GENTPROT_BASIC0.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  n, \
       void*  x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printv )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       void*  x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h


//
// -- Utility function types ---------------------------------------------------
//

// Each template below typedefs a function-pointer type whose name is pasted
// from ch, the operation name, EX_SUF and tsuf. The parameter lists mirror
// the typed prototypes of the same operations.

// asumv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// NOTE(review): the two fprint* typedefs paste EX_SUF into the type name but
// do not append BLIS_TAPI_EX_PARAMS, unlike their neighbours — confirm this
// is intended upstream.

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.

// These typedefs use PASTECH2 (no EX_SUF in the name) and are only emitted
// when BLIS_TAPI_BASIC is defined.
#ifdef BLIS_TAPI_BASIC

// eqsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h
// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 
INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.

// The prototypes in this guarded region use PASTEMAC(ch,opname) and do not
// append BLIS_TAPI_EX_PARAMS, so they carry neither the EX_SUF suffix nor
// expert arguments.
#ifdef BLIS_TAPI_BASIC

// eqsc
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqsc )


// eqv
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqv )


// eqm
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqm )


// printv (note: x is declared as void*, not ctype*)
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  n, \
       void*  x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printv )


// printm (note: x is declared as void*, not ctype*)
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       void*  x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//

// The typedefs below declare function-pointer types whose names are built by
// PASTECH3(ch,opname,EX_SUF,tsuf). Their parameter lists match the
// prototype templates of the same operations declared earlier in this file.

// asumv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// (This typedef and the fprintm one below do not append BLIS_TAPI_EX_PARAMS.)

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.

// These typedef names are built with PASTECH2(ch,opname,tsuf), i.e. without
// the EX_SUF component used by the typedefs above.
#ifdef BLIS_TAPI_BASIC

// eqsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_util_fpa.h

//
// Prototype function pointer query interface.
//

// Each GENPROT( opname ) below declares one query function that takes a
// num_t and returns the pasted "_vft" type for that operation.
// BLIS_TAPI_EX_SUF is not defined in this section; it must be provided by an
// earlier part of the header.
#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( asumv )
GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )
GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )
GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )
GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )
GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// Same query prototype, but with names pasted without BLIS_TAPI_EX_SUF.
#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )
GENPROT( fprintv )
GENPROT( fprintm )
//GENPROT( printv )
//GENPROT( printm )

// end bli_util_fpa.h

// Prototype level-1m implementations.
// begin bli_util_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Unlike the typed-API prototypes, the *_unb_var1 prototypes below spell out
// cntx_t* cntx and rntm_t* rntm as explicit trailing parameters (except the
// eqv/eqm variants, which return bool and take neither) and, apart from
// fprintv/fprintm, are declared without BLIS_EXPORT_BLIS.

// asumv_unb_var1
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( asumv_unb_var1 )


// mkherm_unb_var1, mksymm_unb_var1, mktrim_unb_var1
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( mkherm_unb_var1 )
INSERT_GENTPROT_BASIC0( mksymm_unb_var1 )
INSERT_GENTPROT_BASIC0( mktrim_unb_var1 )


// norm1v_unb_var1, normfv_unb_var1, normiv_unb_var1
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfv_unb_var1 )
INSERT_GENTPROTR_BASIC0( normiv_unb_var1 )


// norm1m_unb_var1, normfm_unb_var1, normim_unb_var1
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfm_unb_var1 )
INSERT_GENTPROTR_BASIC0( normim_unb_var1 )


// randv_unb_var1, randnv_unb_var1
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( randv_unb_var1 )
INSERT_GENTPROT_BASIC0( randnv_unb_var1 )


// randm_unb_var1, randnm_unb_var1
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( randm_unb_var1 )
INSERT_GENTPROT_BASIC0( randnm_unb_var1 )


// sumsqv_unb_var1
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 )

// -----------------------------------------------------------------------------

// eqv_unb_var1 (returns bool rather than writing through an is_eq pointer)
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy  \
     );

INSERT_GENTPROT_BASIC0( eqv_unb_var1 )


// eqm_unb_var1 (returns bool rather than writing through an is_eq pointer)
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
     );

INSERT_GENTPROT_BASIC0( eqm_unb_var1 )


// fprintv
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( fprintv )


// fprintm
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( fprintm )

// end bli_util_unb_var1.h

// end bli_util.h


// -- addon definitions --

// NOTE: These definitions should not be included much earlier since an addon
// may wish to utilize other types and definitions provided by BLIS.
// begin bli_addon.h


#ifndef BLIS_ADDON_H
#define BLIS_ADDON_H

// The "#if 0" below selects the #else branch, so this header always defines
// BLIS_DISABLE_ADDONS and never BLIS_ENABLE_ADDONS.
#if 0
#define BLIS_ENABLE_ADDONS
#else
#define BLIS_DISABLE_ADDONS
#endif

// Enabled addons


#endif
// end bli_addon.h


// -- sandbox implementation --

// begin bli_sbox.h


#ifndef BLIS_SBOX_H
#define BLIS_SBOX_H

// Each sandbox must have a bli_sandbox.h file present somewhere inside.
// If a sandbox was enabled at configure-time, we need to #include its
// header file here so that it will get pulled into blis.h when it is
// flattened into a monolithic header.
#ifdef BLIS_ENABLE_SANDBOX
#include "bli_sandbox.h" // skipped
#endif

#endif
// end bli_sbox.h


// -- BLAS compatibility layer --

// begin bli_blas.h


// If the CBLAS compatibility layer was enabled while the BLAS layer
// was not enabled, we must enable it here.
#ifdef BLIS_ENABLE_CBLAS
#ifndef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS
#endif
#endif // BLIS_ENABLE_CBLAS

// By default, if the BLAS compatibility layer is enabled, we define
// (include) all of the BLAS prototypes. However, if the user is
// #including "blis.h" and also #including another header that also
// declares the BLAS functions, then we provide an opportunity to
// #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below).
#ifdef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS_DEFS
#else
#undef  BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the BLAS test drivers are being
// compiled.
#ifdef BLIS_VIA_BLASTEST
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the environment has defined the
// macro BLIS_DISABLE_BLAS_DEFS.
#ifdef BLIS_DISABLE_BLAS_DEFS
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Begin including all BLAS prototypes.
#ifdef BLIS_ENABLE_BLAS_DEFS
// (The matching #endif for BLIS_ENABLE_BLAS_DEFS lies beyond this section.)


// -- System headers needed by BLAS compatibility layer --

// NOTE(review): the header name is missing from the directive below; it looks
// like an angle-bracketed system header (presumably <math.h>) was stripped
// when this text was extracted -- confirm against the original header.
#include // skipped


// -- Constants --

// Size of the buffer used to hold a BLAS routine name; written as (7+1).
#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)


// -- Utility macros --

// begin bla_r_sign.h


#ifdef BLIS_ENABLE_BLAS

double bla_r_sign(const bla_real *a, const bla_real *b);

#endif
// end bla_r_sign.h
// begin bla_d_sign.h


#ifdef BLIS_ENABLE_BLAS

double bla_d_sign(const bla_double *a, const bla_double *b);

#endif
// end bla_d_sign.h

// begin bla_r_cnjg.h


#ifdef BLIS_ENABLE_BLAS

void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src);

#endif
// end bla_r_cnjg.h
// begin bla_d_cnjg.h


#ifdef BLIS_ENABLE_BLAS

void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src);

#endif
// end bla_d_cnjg.h

// begin bla_r_imag.h


#ifdef BLIS_ENABLE_BLAS

bla_real bla_r_imag(const bla_scomplex *z);

#endif
// end bla_r_imag.h
// begin bla_d_imag.h


#ifdef BLIS_ENABLE_BLAS

double bla_d_imag(const bla_dcomplex *z);

#endif
// end bla_d_imag.h

// begin bla_c_div.h


#ifdef BLIS_ENABLE_BLAS

void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp);

#endif
// end bla_c_div.h
// begin bla_z_div.h


#ifdef BLIS_ENABLE_BLAS

void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp);

#endif
// end bla_z_div.h

// begin bla_f__cabs.h


#ifdef BLIS_ENABLE_BLAS

double bla_f__cabs(double real, double imag);

#endif
// end bla_f__cabs.h
// begin bla_r_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_r_abs(const bla_real *x);

#endif
// end bla_r_abs.h
// begin bla_d_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_d_abs(const bla_double *x);

#endif
// end bla_d_abs.h
// begin bla_c_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_c_abs(const bla_scomplex *z);

#endif
// end bla_c_abs.h
// begin bla_z_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_z_abs(const bla_dcomplex *z);

#endif
// end bla_z_abs.h

// begin bla_lsame.h


#ifdef BLIS_ENABLE_BLAS

// Two alternative lsame prototypes, chosen by LAPACK_ILP64: the ILP64 form
// uses long for the return type and the two length arguments, the other int.
// NOTE(review): the LAPACK_ILP64 branch is declared without BLIS_EXPORT_BLAS,
// unlike the #else branch -- confirm this difference is intentional.
#ifdef LAPACK_ILP64
long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
#else
BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
#endif

#endif
// end bla_lsame.h
// begin bla_xerbla.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);

#endif
// end bla_xerbla.h
// begin bla_xerbla_array.h


#ifdef BLIS_ENABLE_BLAS

// Note that srname_len is passed by value here, while info is a pointer.
BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);

#endif
// end bla_xerbla_array.h


// -- Level-0 BLAS prototypes --

// begin bla_cabs1.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS bla_real   PASTEF77(s,cabs1)(bla_scomplex *z);
BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);

#endif
// end bla_cabs1.h


// -- Level-1 BLAS prototypes --

// In the sections below the prototype template is always (re)defined, but it
// is only instantiated when BLIS_ENABLE_BLAS is defined.

// begin bla_amax.h



//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end bla_amax.h
// begin bla_asum.h



//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end bla_asum.h
// begin bla_axpy.h



//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
             ftype*   y, const f77_int* incy  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif

// end bla_axpy.h
// begin bla_copy.h



//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( copy ) #endif // end bla_copy.h // begin bla_dot.h #ifdef BLIS_ENABLE_BLAS // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTR_BLAS( dot ) #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL INSERT_GENTPROTDOTC_BLAS( dot ) #else // For the "intel" complex return type, we use a hidden parameter (passed by // address) to return the result. #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \ ( \ ftype* rhop, \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTC_BLAS( dot ) #endif // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS float PASTEF77(sd,sdot) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); BLIS_EXPORT_BLAS double PASTEF77(d,sdot) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); #endif // end bla_dot.h // begin bla_nrm2.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end bla_nrm2.h // begin bla_rot.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s); #endif // end bla_rot.h // begin bla_rotg.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s); BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s); #endif // end bla_rotg.h // begin bla_rotm.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam); BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam); #endif // end bla_rotm.h // begin bla_rotmg.h 
#ifdef BLIS_ENABLE_BLAS

// Only the fourth argument (sy1 / dy1) is const-qualified in these prototypes.
BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam);
BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);

#endif
// end bla_rotmg.h

// begin bla_scal.h



//
// Prototype BLAS-to-BLIS interfaces.
//
// The scalar (ftype_a) and vector (ftype_x) types are separate template
// parameters, and the name is pasted in the order chx, cha, blasname.
#undef  GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
     ( \
       const f77_int* n, \
       const ftype_a* alpha, \
       ftype_x* x, const f77_int* incx  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif

// end bla_scal.h
// begin bla_swap.h



//
// Prototype BLAS-to-BLIS interfaces.
//
// Both x and y are non-const in this prototype.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       ftype*   x, const f77_int* incx, \
       ftype*   y, const f77_int* incy  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif

// end bla_swap.h

// The f77_*_sub.h sections below declare void "sub" variants that take an
// extra trailing rval pointer; they are instantiated only when
// BLIS_ENABLE_CBLAS is defined.

// begin f77_amax_sub.h



//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       f77_int* rval  \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end f77_amax_sub.h
// begin f77_asum_sub.h



//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       ftype_r* rval  \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end f77_asum_sub.h
// begin f77_dot_sub.h



//
// Prototype CBLAS subroutine wrapper interfaces.
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTDOT_BLAS( dot ) // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval ); BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* rval ); #endif // end f77_dot_sub.h // begin f77_nrm2_sub.h // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end f77_nrm2_sub.h // -- Level-2 BLAS prototypes -- // dense // begin bla_gemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemv ) #endif // end bla_gemv.h // begin bla_ger.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, chxy, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \ ( \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTDOT_BLAS( ger ) #endif // end bla_ger.h // begin bla_hemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemv ) #endif // end bla_hemv.h // begin bla_her.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype_r* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her ) #endif // end bla_her.h // begin bla_her2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2 ) #endif // end bla_her2.h // begin bla_symv.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( symv ) #endif // end bla_symv.h // begin bla_syr.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr ) #endif // end bla_syr.h // begin bla_syr2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr2 ) #endif // end bla_syr2.h // begin bla_trmv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmv ) #endif // end bla_trmv.h // begin bla_trsv.h // // Prototype BLAS-to-BLIS interfaces. 
//

// Prototype generator for the BLAS-compatible "trsv" entry points; x is
// the only non-const operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const ftype*    a, \
       const f77_int*  lda, \
             ftype*    x, \
       const f77_int*  incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif
// end bla_trsv.h
// begin bla_gemv_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument-check macros for the level-2 wrappers. Each one expands to a
// brace-enclosed block that:
//   1. tests the arguments in a fixed order and stores a non-zero code in
//      'info' for the first test that fails;
//   2. if 'info' is non-zero, formats a routine name from dt_str/op_str
//      into func_str, passes it through bli_string_mkupper(), hands it to
//      xerbla together with &info, and executes 'return;' -- so the macro
//      may only be expanded inside a function returning void.
// The local variable names are kept as-is on purpose: locals declared
// inside a macro can collide with the argument expressions at the
// expansion site.

// gemv: transa must match "N", "T" or "C".
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
    f77_int info  = 0; \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !( nota || ta || conja ) )  { info = 1;  } \
    else if ( *m < 0 )                    { info = 2;  } \
    else if ( *n < 0 )                    { info = 3;  } \
    else if ( *lda < bli_max( 1, *m ) )   { info = 6;  } \
    else if ( *incx == 0 )                { info = 8;  } \
    else if ( *incy == 0 )                { info = 11; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_gemv_check.h
// begin bla_ger_check.h

#ifdef BLIS_ENABLE_BLAS

// ger: the routine name is assembled from three pieces (dt_str, op_str and
// conj_str), unlike the other checks in this group.
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
    f77_int info = 0; \
\
    if      ( *m < 0 )                    { info = 1; } \
    else if ( *n < 0 )                    { info = 2; } \
    else if ( *incx == 0 )                { info = 5; } \
    else if ( *incy == 0 )                { info = 7; } \
    else if ( *lda < bli_max( 1, *m ) )   { info = 9; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_ger_check.h
// begin bla_hemv_check.h

#ifdef BLIS_ENABLE_BLAS

// hemv: uploa must match "L" or "U".
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !( lower || upper ) )       { info = 1;  } \
    else if ( *m < 0 )                    { info = 2;  } \
    else if ( *lda < bli_max( 1, *m ) )   { info = 5;  } \
    else if ( *incx == 0 )                { info = 7;  } \
    else if ( *incy == 0 )                { info = 10; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_hemv_check.h
// begin bla_her_check.h

#ifdef BLIS_ENABLE_BLAS

// her: uploa must match "L" or "U".
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !( lower || upper ) )       { info = 1; } \
    else if ( *m < 0 )                    { info = 2; } \
    else if ( *incx == 0 )                { info = 5; } \
    else if ( *lda < bli_max( 1, *m ) )   { info = 7; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_her_check.h
// begin bla_her2_check.h

#ifdef BLIS_ENABLE_BLAS

// her2: uploa must match "L" or "U".
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !( lower || upper ) )       { info = 1; } \
    else if ( *m < 0 )                    { info = 2; } \
    else if ( *incx == 0 )                { info = 5; } \
    else if ( *incy == 0 )                { info = 7; } \
    else if ( *lda < bli_max( 1, *m ) )   { info = 9; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_her2_check.h
// begin bla_symv_check.h

#ifdef BLIS_ENABLE_BLAS

// symv reuses the hemv check unchanged.
#define bla_symv_check bla_hemv_check

#endif
// end bla_symv_check.h
// begin bla_syr_check.h

#ifdef BLIS_ENABLE_BLAS

// syr reuses the her check unchanged.
#define bla_syr_check bla_her_check

#endif
// end bla_syr_check.h
// begin bla_syr2_check.h

#ifdef BLIS_ENABLE_BLAS

// syr2 reuses the her2 check unchanged.
#define bla_syr2_check bla_her2_check

#endif
// end bla_syr2_check.h
// begin bla_trmv_check.h

#ifdef BLIS_ENABLE_BLAS

// trmv: uploa must match "L"/"U", transa "N"/"T"/"C", diaga "U"/"N".
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !( lower || upper ) )       { info = 1; } \
    else if ( !( nota || ta || conja ) )  { info = 2; } \
    else if ( !( unita || nonua ) )       { info = 3; } \
    else if ( *m < 0 )                    { info = 4; } \
    else if ( *lda < bli_max( 1, *m ) )   { info = 6; } \
    else if ( *incx == 0 )                { info = 8; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_trmv_check.h
// begin bla_trsv_check.h

#ifdef BLIS_ENABLE_BLAS

// trsv reuses the trmv check unchanged.
#define bla_trsv_check bla_trmv_check

#endif
// end bla_trsv_check.h

// packed

// The packed and banded routines below are declared directly (no generator
// macro) and return int rather than void.

// begin bla_hpmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_scomplex* alpha, const bla_scomplex* ap,
       const bla_scomplex* x, const bla_integer* incx,
       const bla_scomplex* beta,
       bla_scomplex* y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_dcomplex* alpha, const bla_dcomplex* ap,
       const bla_dcomplex* x, const bla_integer* incx,
       const bla_dcomplex* beta,
       bla_dcomplex* y, const bla_integer* incy
     );

#endif
// end bla_hpmv.h
// begin bla_hpr.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_real* alpha,
       const bla_scomplex* x, const bla_integer* incx,
       bla_scomplex* ap
     );

BLIS_EXPORT_BLAS int PASTEF77(z,hpr)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_double* alpha,
       const bla_dcomplex* x, const bla_integer* incx,
       bla_dcomplex* ap
     );

#endif
// end bla_hpr.h
// begin bla_hpr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_scomplex* alpha,
       const bla_scomplex* x, const bla_integer* incx,
       const bla_scomplex* y, const bla_integer* incy,
       bla_scomplex* ap
     );

BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_dcomplex* alpha,
       const bla_dcomplex* x, const bla_integer* incx,
       const bla_dcomplex* y, const bla_integer* incy,
       bla_dcomplex* ap
     );

#endif
// end bla_hpr2.h
// begin bla_spmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spmv)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_double* alpha, const bla_double* ap,
       const bla_double* x, const bla_integer* incx,
       const bla_double* beta,
       bla_double* y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(s,spmv)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_real* alpha, const bla_real* ap,
       const bla_real* x, const bla_integer* incx,
       const bla_real* beta,
       bla_real* y, const bla_integer* incy
     );

#endif
// end bla_spmv.h
// begin bla_spr.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_double* alpha,
       const bla_double* x, const bla_integer* incx,
       bla_double* ap
     );

BLIS_EXPORT_BLAS int PASTEF77(s,spr)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_real* alpha,
       const bla_real* x, const bla_integer* incx,
       bla_real* ap
     );

#endif
// end bla_spr.h
// begin bla_spr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr2)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_double* alpha,
       const bla_double* x, const bla_integer* incx,
       const bla_double* y, const bla_integer* incy,
       bla_double* ap
     );

BLIS_EXPORT_BLAS int PASTEF77(s,spr2)
     (
       const bla_character* uplo, const bla_integer* n,
       const bla_real* alpha,
       const bla_real* x, const bla_integer* incx,
       const bla_real* y, const bla_integer* incy,
       bla_real* ap
     );

#endif
// end bla_spr2.h
// begin bla_tpmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_scomplex* ap,
       bla_scomplex* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_double* ap,
       bla_double* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_real* ap,
       bla_real* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_dcomplex* ap,
       bla_dcomplex* x, const bla_integer* incx
     );

#endif
// end bla_tpmv.h
// begin bla_tpsv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_scomplex* ap,
       bla_scomplex* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_double* ap,
       bla_double* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_real* ap,
       bla_real* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag, const bla_integer* n,
       const bla_dcomplex* ap,
       bla_dcomplex* x, const bla_integer* incx
     );

#endif
// end bla_tpsv.h

// banded

// begin bla_gbmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)
     (
       const bla_character* trans,
       const bla_integer* m, const bla_integer* n,
       const bla_integer* kl, const bla_integer* ku,
       const bla_scomplex* alpha,
       const bla_scomplex* a, const bla_integer* lda,
       const bla_scomplex* x, const bla_integer* incx,
       const bla_scomplex* beta,
       bla_scomplex* y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)
     (
       const bla_character* trans,
       const bla_integer* m, const bla_integer* n,
       const bla_integer* kl, const bla_integer* ku,
       const bla_double* alpha,
       const bla_double* a, const bla_integer* lda,
       const bla_double* x, const bla_integer* incx,
       const bla_double* beta,
       bla_double* y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)
     (
       const bla_character* trans,
       const bla_integer* m, const bla_integer* n,
       const bla_integer* kl, const bla_integer* ku,
       const bla_real* alpha,
       const bla_real* a, const bla_integer* lda,
       const bla_real* x, const bla_integer* incx,
       const bla_real* beta,
       bla_real* y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)
     (
       const bla_character* trans,
       const bla_integer* m, const bla_integer* n,
       const bla_integer* kl, const bla_integer* ku,
       const bla_dcomplex* alpha,
       const bla_dcomplex* a, const bla_integer* lda,
       const bla_dcomplex* x, const bla_integer* incx,
       const bla_dcomplex* beta,
       bla_dcomplex* y, const bla_integer* incy
     );

#endif
// end bla_gbmv.h
// begin bla_hbmv.h

// Banded level-2 routines. As with the packed routines, these are declared
// directly (no generator macro), return int, and are only visible when the
// BLAS layer is enabled.
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)
     (
       const bla_character* uplo,
       const bla_integer* n, const bla_integer* k,
       const bla_scomplex* alpha,
       const bla_scomplex* a, const bla_integer* lda,
       const bla_scomplex* x, const bla_integer* incx,
       const bla_scomplex* beta,
       bla_scomplex* y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)
     (
       const bla_character* uplo,
       const bla_integer* n, const bla_integer* k,
       const bla_dcomplex* alpha,
       const bla_dcomplex* a, const bla_integer* lda,
       const bla_dcomplex* x, const bla_integer* incx,
       const bla_dcomplex* beta,
       bla_dcomplex* y, const bla_integer* incy
     );

#endif
// end bla_hbmv.h
// begin bla_sbmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)
     (
       const bla_character* uplo,
       const bla_integer* n, const bla_integer* k,
       const bla_double* alpha,
       const bla_double* a, const bla_integer* lda,
       const bla_double* x, const bla_integer* incx,
       const bla_double* beta,
       bla_double* y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)
     (
       const bla_character* uplo,
       const bla_integer* n, const bla_integer* k,
       const bla_real* alpha,
       const bla_real* a, const bla_integer* lda,
       const bla_real* x, const bla_integer* incx,
       const bla_real* beta,
       bla_real* y, const bla_integer* incy
     );

#endif
// end bla_sbmv.h
// begin bla_tbmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_scomplex* a, const bla_integer* lda,
       bla_scomplex* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_double* a, const bla_integer* lda,
       bla_double* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_real* a, const bla_integer* lda,
       bla_real* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_dcomplex* a, const bla_integer* lda,
       bla_dcomplex* x, const bla_integer* incx
     );

#endif
// end bla_tbmv.h
// begin bla_tbsv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_scomplex* a, const bla_integer* lda,
       bla_scomplex* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_double* a, const bla_integer* lda,
       bla_double* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_real* a, const bla_integer* lda,
       bla_real* x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)
     (
       const bla_character* uplo, const bla_character* trans,
       const bla_character* diag,
       const bla_integer* n, const bla_integer* k,
       const bla_dcomplex* a, const bla_integer* lda,
       bla_dcomplex* x, const bla_integer* incx
     );

#endif
// end bla_tbsv.h

// -- Level-3 BLAS prototypes --

// begin bla_gemm.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype generator for the BLAS-compatible "gemm" entry points; every
// argument is taken by pointer and c is the only non-const operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  n, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    b, \
       const f77_int*  ldb, \
       const ftype*    beta, \
             ftype*    c, \
       const f77_int*  ldc \
     );

// The prototypes are only instantiated when the BLAS layer is enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm )
#endif
// end bla_gemm.h
// begin bla_hemm.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype generator for "hemm". ftype_r and chr are accepted by the
// generator but are not used in this signature.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    b, \
       const f77_int*  ldb, \
       const ftype*    beta, \
             ftype*    c, \
       const f77_int*  ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hemm )
#endif
// end bla_hemm.h
// begin bla_herk.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype generator for "herk". Both scalars (alpha and beta) use the
// real type ftype_r; the matrices use ftype.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype_r*  alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype_r*  beta, \
             ftype*    c, \
       const f77_int*  ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( herk )
#endif
// end bla_herk.h
// begin bla_her2k.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype generator for the BLAS-compatible "her2k" entry points. alpha
// has type ftype while beta uses the real type ftype_r.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    b, \
       const f77_int*  ldb, \
       const ftype_r*  beta, \
             ftype*    c, \
       const f77_int*  ldc \
     );

// The prototypes are only instantiated when the BLAS layer is enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( her2k )
#endif
// end bla_her2k.h
// begin bla_symm.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype generator for "symm".
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    b, \
       const f77_int*  ldb, \
       const ftype*    beta, \
             ftype*    c, \
       const f77_int*  ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( symm )
#endif
// end bla_symm.h
// begin bla_syrk.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype generator for "syrk".
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    beta, \
             ftype*    c, \
       const f77_int*  ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( syrk )
#endif
// end bla_syrk.h
// begin bla_syr2k.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype generator for "syr2k".
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    b, \
       const f77_int*  ldb, \
       const ftype*    beta, \
             ftype*    c, \
       const f77_int*  ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( syr2k )
#endif
// end bla_syr2k.h
// begin bla_trmm.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype generator for the BLAS-compatible "trmm" entry points; b is
// the only non-const operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
             ftype*    b, \
       const f77_int*  ldb \
     );

// The prototypes are only instantiated when the BLAS layer is enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmm )
#endif
// end bla_trmm.h
// begin bla_trsm.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype generator for "trsm" (same argument list as trmm).
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
             ftype*    b, \
       const f77_int*  ldb \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsm )
#endif
// end bla_trsm.h
// begin bla_gemm_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument-check macros for the level-3 wrappers. Each one expands to a
// brace-enclosed block that:
//   1. tests the arguments in a fixed order and stores a non-zero code in
//      'info' for the first test that fails;
//   2. if 'info' is non-zero, formats a routine name from dt_str/op_str
//      into func_str, passes it through bli_string_mkupper(), hands it to
//      xerbla together with &info, and executes 'return;' -- so the macro
//      may only be expanded inside a function returning void.
// The local variable names are kept as-is on purpose: locals declared
// inside a macro can collide with the argument expressions at the
// expansion site.

// gemm: nrowa/nrowb are the leading-dimension lower bounds for a and b,
// chosen from m/k/n according to whether the operand is untransposed.
#define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
    f77_int info  = 0; \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( nota ? *m : *k ); \
    f77_int nrowb = ( notb ? *k : *n ); \
\
    if      ( !( nota || conja || ta ) )     { info = 1;  } \
    else if ( !( notb || conjb || tb ) )     { info = 2;  } \
    else if ( *m < 0 )                       { info = 3;  } \
    else if ( *n < 0 )                       { info = 4;  } \
    else if ( *k < 0 )                       { info = 5;  } \
    else if ( *lda < bli_max( 1, nrowa ) )   { info = 8;  } \
    else if ( *ldb < bli_max( 1, nrowb ) )   { info = 10; } \
    else if ( *ldc < bli_max( 1, *m    ) )   { info = 13; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_gemm_check.h
// begin bla_hemm_check.h

#ifdef BLIS_ENABLE_BLAS

// hemm: sidea must match "L"/"R" and uploa "L"/"U"; nrowa follows the side.
#define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \
{ \
    f77_int info  = 0; \
    f77_int left  = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( left ? *m : *n ); \
\
    if      ( !( left || right ) )           { info = 1;  } \
    else if ( !( lower || upper ) )          { info = 2;  } \
    else if ( *m < 0 )                       { info = 3;  } \
    else if ( *n < 0 )                       { info = 4;  } \
    else if ( *lda < bli_max( 1, nrowa ) )   { info = 7;  } \
    else if ( *ldb < bli_max( 1, *m    ) )   { info = 9;  } \
    else if ( *ldc < bli_max( 1, *m    ) )   { info = 12; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_hemm_check.h
// begin bla_herk_check.h

#ifdef BLIS_ENABLE_BLAS

// herk: only "N" and "C" are accepted for transa.
// NOTE: the uplo parameter is named 'uploc' so that the body reads the
// macro argument. It was previously declared as 'uploa' while the body
// referenced 'uploc', which silently ignored the argument and depended on
// a variable called 'uploc' existing at the expansion site. Expansions
// that already pass 'uploc' are unaffected.
#define bla_herk_check( dt_str, op_str, uploc, transa, m, k, lda, ldc ) \
{ \
    f77_int info  = 0; \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( nota ? *m : *k ); \
\
    if      ( !( lower || upper ) )          { info = 1;  } \
    else if ( !( nota || conja ) )           { info = 2;  } \
    else if ( *m < 0 )                       { info = 3;  } \
    else if ( *k < 0 )                       { info = 4;  } \
    else if ( *lda < bli_max( 1, nrowa ) )   { info = 7;  } \
    else if ( *ldc < bli_max( 1, *m    ) )   { info = 10; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_herk_check.h
// begin bla_her2k_check.h

#ifdef BLIS_ENABLE_BLAS

// her2k: only "N" and "C" are accepted for trans; a and b share nrowa.
#define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int info  = 0; \
    f77_int nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( nota ? *m : *k ); \
\
    if      ( !( lower || upper ) )          { info = 1;  } \
    else if ( !( nota || conja ) )           { info = 2;  } \
    else if ( *m < 0 )                       { info = 3;  } \
    else if ( *k < 0 )                       { info = 4;  } \
    else if ( *lda < bli_max( 1, nrowa ) )   { info = 7;  } \
    else if ( *ldb < bli_max( 1, nrowa ) )   { info = 9;  } \
    else if ( *ldc < bli_max( 1, *m    ) )   { info = 12; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_her2k_check.h
// begin bla_symm_check.h

#ifdef BLIS_ENABLE_BLAS

// symm reuses the hemm check unchanged.
#define bla_symm_check bla_hemm_check

#endif
// end bla_symm_check.h
// begin bla_syrk_check.h

#ifdef BLIS_ENABLE_BLAS

// syrk: "N" and "T" are always accepted for transa; "C" is accepted only
// when the first character of dt_str is 's' or 'd' (is_r).
// NOTE: the uplo parameter is named 'uploc' so that the body reads the
// macro argument. It was previously declared as 'uploa' while the body
// referenced 'uploc', which silently ignored the argument and depended on
// a variable called 'uploc' existing at the expansion site. Expansions
// that already pass 'uploc' are unaffected.
#define bla_syrk_check( dt_str, op_str, uploc, transa, m, k, lda, ldc ) \
{ \
    f77_int info  = 0; \
\
    static char* dt_cst = dt_str; \
\
    f77_int is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int cta   = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( nota ? *m : *k ); \
\
    if      ( !( lower || upper ) )                   { info = 1;  } \
    else if ( !nota && !ta && ( !is_r || !cta ) )     { info = 2;  } \
    else if ( *m < 0 )                                { info = 3;  } \
    else if ( *k < 0 )                                { info = 4;  } \
    else if ( *lda < bli_max( 1, nrowa ) )            { info = 7;  } \
    else if ( *ldc < bli_max( 1, *m    ) )            { info = 10; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_syrk_check.h
// begin bla_syr2k_check.h

#ifdef BLIS_ENABLE_BLAS

// syr2k: same trans rule as syrk ("C" only when dt_str starts with 's' or
// 'd'); a and b share nrowa.
#define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int info  = 0; \
\
    static char* dt_cst = dt_str; \
\
    f77_int is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    f77_int nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int cta   = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( nota ? *m : *k ); \
\
    if      ( !( lower || upper ) )                   { info = 1;  } \
    else if ( !nota && !ta && ( !is_r || !cta ) )     { info = 2;  } \
    else if ( *m < 0 )                                { info = 3;  } \
    else if ( *k < 0 )                                { info = 4;  } \
    else if ( *lda < bli_max( 1, nrowa ) )            { info = 7;  } \
    else if ( *ldb < bli_max( 1, nrowa ) )            { info = 9;  } \
    else if ( *ldc < bli_max( 1, *m    ) )            { info = 12; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_syr2k_check.h
// begin bla_trmm_check.h

#ifdef BLIS_ENABLE_BLAS

// trmm: sidea "L"/"R", uploa "L"/"U", transa "N"/"T"/"C", diaga "U"/"N";
// nrowa follows the side.
#define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \
{ \
    f77_int info  = 0; \
    f77_int left  = PASTEF770(lsame)( sidea,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int right = PASTEF770(lsame)( sidea,  "R", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( left ? *m : *n ); \
\
    if      ( !( left || right ) )           { info = 1;  } \
    else if ( !( lower || upper ) )          { info = 2;  } \
    else if ( !( nota || ta || conja ) )     { info = 3;  } \
    else if ( !( unita || nonua ) )          { info = 4;  } \
    else if ( *m < 0 )                       { info = 5;  } \
    else if ( *n < 0 )                       { info = 6;  } \
    else if ( *lda < bli_max( 1, nrowa ) )   { info = 9;  } \
    else if ( *ldb < bli_max( 1, *m    ) )   { info = 11; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_trmm_check.h
// begin bla_trsm_check.h

#ifdef BLIS_ENABLE_BLAS

// trsm reuses the trmm check unchanged.
#define bla_trsm_check bla_trmm_check

#endif
// end bla_trsm_check.h

// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype generator for "axpby"; y is the only non-const operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, \
       const f77_int* incx, \
       const ftype*   beta, \
             ftype*   y, \
       const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif
// end bla_axpby.h

// level-3
// begin bla_gemmt.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype generator for "gemmt" (takes uploc, m and k; there is no n).
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, \
       const f77_int*  lda, \
       const ftype*    b, \
       const f77_int*  ldb, \
       const ftype*    beta, \
             ftype*    c, \
       const f77_int*  ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif
// end bla_gemmt.h
// begin bla_gemmt_check.h

#ifdef BLIS_ENABLE_BLAS

// gemmt: like the gemm check plus an uplo test; nrowb falls back to m
// (not n) when b is transposed.
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
    f77_int info  = 0; \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( nota ? *m : *k ); \
    f77_int nrowb = ( notb ? *k : *m ); \
\
    if      ( !( lower || upper ) )          { info = 1;  } \
    else if ( !( nota || conja || ta ) )     { info = 2;  } \
    else if ( !( notb || conjb || tb ) )     { info = 3;  } \
    else if ( *m < 0 )                       { info = 4;  } \
    else if ( *k < 0 )                       { info = 5;  } \
    else if ( *lda < bli_max( 1, nrowa ) )   { info = 8;  } \
    else if ( *ldb < bli_max( 1, nrowb ) )   { info = 10; } \
    else if ( *ldc < bli_max( 1, *m    ) )   { info = 13; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype generator for "gemm_batch". The matrix operands are arrays of
// pointers (ftype**); the remaining arguments are plain pointers.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa_array, \
       const f77_char* transb_array, \
       const f77_int*  m_array, \
       const f77_int*  n_array, \
       const f77_int*  k_array, \
       const ftype*    alpha_array, \
       const ftype**   a_array, \
       const f77_int*  lda_array, \
       const ftype**   b_array, \
       const f77_int*  ldb_array, \
       const ftype*    beta_array, \
             ftype**   c_array, \
       const f77_int*  ldc_array, \
       const f77_int*  group_count, \
       const f77_int*  group_size \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif
// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h

//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( gemm3m ) #endif // end bla_gemm3m.h // begin bla_gemm3m_check.h #ifdef BLIS_ENABLE_BLAS #define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, notb; \ f77_int conja, conjb; \ f77_int ta, tb; \ f77_int nrowa, nrowb; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ if ( notb ) { nrowb = *k; } \ else { nrowb = *n; } \ \ if ( !nota && !conja && !ta ) \ info = 1; \ else if ( !notb && !conjb && !tb ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *k < 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 8; \ else if ( *ldb < bli_max( 1, nrowb ) ) \ info = 10; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 13; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_gemm3m_check.h // -- Fortran-compatible APIs to BLIS functions -- // begin b77_thread.h // // Prototype Fortran-compatible BLIS interfaces. 
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); // end b77_thread.h #endif // BLIS_ENABLE_BLAS // end bli_blas.h // -- CBLAS compatibility layer -- // begin bli_cblas.h #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. // begin cblas.h #ifndef CBLAS_H #define CBLAS_H #include // skipped // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. 
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. #if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). 
#if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_X86_64 // Enabled sub-configurations (config_list) #define BLIS_CONFIG_SKX #define BLIS_CONFIG_KNL #define BLIS_CONFIG_HASWELL #define BLIS_CONFIG_SANDYBRIDGE #define BLIS_CONFIG_PENRYN #define BLIS_CONFIG_ZEN3 #define BLIS_CONFIG_ZEN2 #define BLIS_CONFIG_ZEN #define BLIS_CONFIG_EXCAVATOR #define BLIS_CONFIG_STEAMROLLER #define BLIS_CONFIG_PILEDRIVER #define BLIS_CONFIG_BULLDOZER #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_SKX #define BLIS_KERNELS_KNL #define BLIS_KERNELS_SANDYBRIDGE #define BLIS_KERNELS_PENRYN #define BLIS_KERNELS_ZEN3 #define BLIS_KERNELS_ZEN2 #define BLIS_KERNELS_HASWELL #define BLIS_KERNELS_ZEN #define BLIS_KERNELS_PILEDRIVER #define BLIS_KERNELS_BULLDOZER #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define 
BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? 
Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. #endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. 
If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. 
// Fall back to 32-bit BLAS integers when the configuration did not choose a
// size.
#ifndef BLIS_BLAS_INT_TYPE_SIZE
#define BLIS_BLAS_INT_TYPE_SIZE 32
#endif

// By default, the level-3 BLAS routines are implemented by directly calling
// the BLIS object API. Alternatively, they may first call the typed BLIS
// API, which will then call the object API.
//#define BLIS_BLAS3_CALLS_TAPI
#ifdef BLIS_BLAS3_CALLS_TAPI
  #undef  BLIS_BLAS3_CALLS_OAPI
#else
  // Default behavior is to call object API directly.
  #undef  BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled.
  #define BLIS_BLAS3_CALLS_OAPI
#endif


// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------

// Enable the CBLAS compatibility layer?
// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer
// regardless of whether or not it was explicitly enabled above. Furthermore,
// the CBLAS compatibility layer will use the integer type size definition
// specified above when defining the size of its own integers (regardless of
// whether the BLAS layer was enabled directly or indirectly).
#ifdef BLIS_ENABLE_CBLAS
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif


// -- SHARED LIBRARY SYMBOL EXPORT ---------------------------------------------

// When building shared libraries, we can control which symbols are exported for
// linking by external applications. BLIS annotates all function prototypes that
// are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing
// a similar role for BLAS compatibility routines). Which symbols are exported
// is controlled by the default symbol visibility, as specified by the gcc option
// -fvisibility=[default|hidden]. The default for this option is 'default', or,
// "public", which, if allowed to stand, causes all symbols in BLIS to be
// linkable from the outside. But when compiling with -fvisibility=hidden, all
// symbols start out hidden (that is, restricted only for internal use by BLIS),
// with that setting overridden only for function prototypes or variable
// declarations that are annotated with BLIS_EXPORT_BLIS.

#ifndef BLIS_EXPORT
  #if !defined(BLIS_ENABLE_SHARED)
    // Static build: no export annotation is needed.
    #define BLIS_EXPORT
  #else
    #if defined(_WIN32) || defined(__CYGWIN__)
      // Windows: export while building the library, import when consuming it.
      #ifdef BLIS_IS_BUILDING_LIBRARY
        #define BLIS_EXPORT __declspec(dllexport)
      #else
        #define BLIS_EXPORT __declspec(dllimport)
      #endif
    #elif defined(__GNUC__) && __GNUC__ >= 4
      #define BLIS_EXPORT __attribute__ ((visibility ("default")))
    #else
      #define BLIS_EXPORT
    #endif
  #endif
#endif

// All three public annotations currently expand to the same BLIS_EXPORT.
#define BLIS_EXPORT_BLIS  BLIS_EXPORT
#define BLIS_EXPORT_BLAS  BLIS_EXPORT
#define BLIS_EXPORT_ADDON BLIS_EXPORT


// -- OVERRIDABLE (WEAK) SYMBOLS -----------------------------------------------

// On Linux, functions called from a shared library can be overridden by the main
// program simply by providing a new definition. However, macOS uses a "two-level
// namespace" which causes calls to shared library functions to be tied to the
// library and not overridable. As a workaround, certain symbols can be defined
// as "weak" and are given lower preference during linking.
#ifndef BLIS_OVERRIDABLE
#if BLIS_OS_OSX
#define BLIS_OVERRIDABLE __attribute__((weak))
#else
#define BLIS_OVERRIDABLE
#endif
#endif


// -- STATIC INLINE FUNCTIONS --------------------------------------------------

// C and C++ have different semantics for defining "inline" functions. In C,
// the keyword phrase "static inline" accomplishes this, though the "inline"
// is optional. In C++, the "inline" keyword is required and obviates "static"
// altogether. Why does this matter? While BLIS is compiled in C99, blis.h may
// be #included by a source file that is compiled with C++.
#ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. // NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. 
#ifndef TRUE #define TRUE true #endif #ifndef FALSE #define FALSE false #endif // -- Special-purpose integers -- // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DIM_T #define _DEFINED_DIM_T typedef gint_t dim_t; // dimension type #endif typedef gint_t inc_t; // increment/stride type typedef gint_t doff_t; // diagonal offset type typedef guint_t siz_t; // byte size type typedef uint32_t objbits_t; // object information bit field // -- Real types -- // Define the number of floating-point types supported, and the size of the // largest type. #define BLIS_NUM_FP_TYPES 4 #define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) // There are some places where we need to use sizeof() inside of a C // preprocessor #if conditional, and so here we define the various sizes // for those purposes. #define BLIS_SIZEOF_S 4 // sizeof(float) #define BLIS_SIZEOF_D 8 // sizeof(double) #define BLIS_SIZEOF_C 8 // sizeof(scomplex) #define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) // -- Complex types -- #ifdef BLIS_ENABLE_C99_COMPLEX #if __STDC_VERSION__ >= 199901L #include // skipped // Typedef official complex types to BLIS complex type names. typedef float complex scomplex; typedef double complex dcomplex; #else #error "Configuration requested C99 complex types, but C99 does not appear to be supported." #endif #else // ifndef BLIS_ENABLE_C99_COMPLEX // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_SCOMPLEX #define _DEFINED_SCOMPLEX typedef struct scomplex { float real; float imag; } scomplex; #endif // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DCOMPLEX #define _DEFINED_DCOMPLEX typedef struct dcomplex { double real; double imag; } dcomplex; #endif #endif // BLIS_ENABLE_C99_COMPLEX // -- Atom type -- // Note: atom types are used to hold "bufferless" scalar object values. 
Note // that it needs to be as large as the largest possible scalar value we might // want to hold. Thus, for now, it is a dcomplex. typedef dcomplex atom_t; // -- Fortran-77 types -- // Note: These types are typically only used by BLAS compatibility layer, but // we must define them even when the compatibility layer isn't being built // because they also occur in bli_slamch() and bli_dlamch(). // Define f77_int depending on what size of integer was requested. #if BLIS_BLAS_INT_TYPE_SIZE == 32 typedef int32_t f77_int; #elif BLIS_BLAS_INT_TYPE_SIZE == 64 typedef int64_t f77_int; #else typedef long int f77_int; #endif typedef char f77_char; typedef float f77_float; typedef double f77_double; typedef scomplex f77_scomplex; typedef dcomplex f77_dcomplex; // -- Misc. function pointer types -- // Note: This type should be used in any situation where the address of a // *function* will be conveyed or stored prior to it being typecast back // to the correct function type. It does not need to be used when conveying // or storing the address of *data* (such as an array of float or double). //typedef void (*void_fp)( void ); typedef void* void_fp; // Typedef function pointer types for malloc() and free() substitutes. 
typedef void* (*malloc_ft)( size_t size ); typedef void (*free_ft) ( void* p ); // // -- BLIS info bit field offsets ---------------------------------------------- // // info #define BLIS_DATATYPE_SHIFT 0 #define BLIS_DOMAIN_SHIFT 0 #define BLIS_PRECISION_SHIFT 1 #define BLIS_CONJTRANS_SHIFT 3 #define BLIS_TRANS_SHIFT 3 #define BLIS_CONJ_SHIFT 4 #define BLIS_UPLO_SHIFT 5 #define BLIS_UPPER_SHIFT 5 #define BLIS_DIAG_SHIFT 6 #define BLIS_LOWER_SHIFT 7 #define BLIS_UNIT_DIAG_SHIFT 8 #define BLIS_INVERT_DIAG_SHIFT 9 #define BLIS_TARGET_DT_SHIFT 10 #define BLIS_TARGET_DOMAIN_SHIFT 10 #define BLIS_TARGET_PREC_SHIFT 11 #define BLIS_EXEC_DT_SHIFT 13 #define BLIS_EXEC_DOMAIN_SHIFT 13 #define BLIS_EXEC_PREC_SHIFT 14 #define BLIS_PACK_SCHEMA_SHIFT 16 #define BLIS_PACK_RC_SHIFT 16 #define BLIS_PACK_PANEL_SHIFT 17 #define BLIS_PACK_FORMAT_SHIFT 18 #define BLIS_PACK_SHIFT 22 #define BLIS_PACK_REV_IF_UPPER_SHIFT 23 #define BLIS_PACK_REV_IF_LOWER_SHIFT 24 #define BLIS_PACK_BUFFER_SHIFT 25 #define BLIS_STRUC_SHIFT 27 #define BLIS_COMP_DT_SHIFT 29 #define BLIS_COMP_DOMAIN_SHIFT 29 #define BLIS_COMP_PREC_SHIFT 30 // info2 #define BLIS_SCALAR_DT_SHIFT 0 #define BLIS_SCALAR_DOMAIN_SHIFT 0 #define BLIS_SCALAR_PREC_SHIFT 1 // // -- BLIS info bit field masks ------------------------------------------------ // // info #define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT ) #define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT ) #define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT ) #define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT ) #define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 
0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | 
BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type 
definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = 
BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. 
// Lower-case aliases for the ind_t enumerators BLIS_1M and BLIS_NAT.
#define bli_1m   BLIS_1M
#define bli_nat  BLIS_NAT


// -- Kernel ID types --

// Level-1v kernel IDs. Enumerators are consecutive starting at 0;
// BLIS_NUM_LEVEL1V_KERS must equal the number of enumerators listed here.
typedef enum
{
    BLIS_ADDV_KER  = 0,
    BLIS_AMAXV_KER,
    BLIS_AXPBYV_KER,
    BLIS_AXPYV_KER,
    BLIS_COPYV_KER,
    BLIS_DOTV_KER,
    BLIS_DOTXV_KER,
    BLIS_INVERTV_KER,
    BLIS_SCALV_KER,
    BLIS_SCAL2V_KER,
    BLIS_SETV_KER,
    BLIS_SUBV_KER,
    BLIS_SWAPV_KER,
    BLIS_XPBYV_KER
} l1vkr_t;

#define BLIS_NUM_LEVEL1V_KERS 14


// Level-1f kernel IDs. Enumerators are consecutive starting at 0;
// BLIS_NUM_LEVEL1F_KERS must equal the number of enumerators listed here.
typedef enum
{
    BLIS_AXPY2V_KER = 0,
    BLIS_DOTAXPYV_KER,
    BLIS_AXPYF_KER,
    BLIS_DOTXF_KER,
    BLIS_DOTXAXPYF_KER
} l1fkr_t;

#define BLIS_NUM_LEVEL1F_KERS 5


// Level-1m (packm/unpackm) kernel IDs.
// NOTE: The PACKM and UNPACKM families are numbered independently, each from
// 0 through 31, so BLIS_PACKM_<k>XK_KER and BLIS_UNPACKM_<k>XK_KER share the
// same numeric value <k>. An l1mkr_t value alone therefore does not identify
// which of the two families is meant.
typedef enum
{
    BLIS_PACKM_0XK_KER  = 0,
    BLIS_PACKM_1XK_KER  = 1,
    BLIS_PACKM_2XK_KER  = 2,
    BLIS_PACKM_3XK_KER  = 3,
    BLIS_PACKM_4XK_KER  = 4,
    BLIS_PACKM_5XK_KER  = 5,
    BLIS_PACKM_6XK_KER  = 6,
    BLIS_PACKM_7XK_KER  = 7,
    BLIS_PACKM_8XK_KER  = 8,
    BLIS_PACKM_9XK_KER  = 9,
    BLIS_PACKM_10XK_KER = 10,
    BLIS_PACKM_11XK_KER = 11,
    BLIS_PACKM_12XK_KER = 12,
    BLIS_PACKM_13XK_KER = 13,
    BLIS_PACKM_14XK_KER = 14,
    BLIS_PACKM_15XK_KER = 15,
    BLIS_PACKM_16XK_KER = 16,
    BLIS_PACKM_17XK_KER = 17,
    BLIS_PACKM_18XK_KER = 18,
    BLIS_PACKM_19XK_KER = 19,
    BLIS_PACKM_20XK_KER = 20,
    BLIS_PACKM_21XK_KER = 21,
    BLIS_PACKM_22XK_KER = 22,
    BLIS_PACKM_23XK_KER = 23,
    BLIS_PACKM_24XK_KER = 24,
    BLIS_PACKM_25XK_KER = 25,
    BLIS_PACKM_26XK_KER = 26,
    BLIS_PACKM_27XK_KER = 27,
    BLIS_PACKM_28XK_KER = 28,
    BLIS_PACKM_29XK_KER = 29,
    BLIS_PACKM_30XK_KER = 30,
    BLIS_PACKM_31XK_KER = 31,

    BLIS_UNPACKM_0XK_KER  = 0,
    BLIS_UNPACKM_1XK_KER  = 1,
    BLIS_UNPACKM_2XK_KER  = 2,
    BLIS_UNPACKM_3XK_KER  = 3,
    BLIS_UNPACKM_4XK_KER  = 4,
    BLIS_UNPACKM_5XK_KER  = 5,
    BLIS_UNPACKM_6XK_KER  = 6,
    BLIS_UNPACKM_7XK_KER  = 7,
    BLIS_UNPACKM_8XK_KER  = 8,
    BLIS_UNPACKM_9XK_KER  = 9,
    BLIS_UNPACKM_10XK_KER = 10,
    BLIS_UNPACKM_11XK_KER = 11,
    BLIS_UNPACKM_12XK_KER = 12,
    BLIS_UNPACKM_13XK_KER = 13,
    BLIS_UNPACKM_14XK_KER = 14,
    BLIS_UNPACKM_15XK_KER = 15,
    BLIS_UNPACKM_16XK_KER = 16,
    BLIS_UNPACKM_17XK_KER = 17,
    BLIS_UNPACKM_18XK_KER = 18,
    BLIS_UNPACKM_19XK_KER = 19,
    BLIS_UNPACKM_20XK_KER = 20,
    BLIS_UNPACKM_21XK_KER = 21,
    BLIS_UNPACKM_22XK_KER = 22,
    BLIS_UNPACKM_23XK_KER = 23,
    BLIS_UNPACKM_24XK_KER = 24,
    BLIS_UNPACKM_25XK_KER = 25,
    BLIS_UNPACKM_26XK_KER = 26,
    BLIS_UNPACKM_27XK_KER = 27,
    BLIS_UNPACKM_28XK_KER = 28,
    BLIS_UNPACKM_29XK_KER = 29,
    BLIS_UNPACKM_30XK_KER = 30,
    BLIS_UNPACKM_31XK_KER = 31

} l1mkr_t;

// Number of enumerators in each of the two families above (values 0..31).
#define BLIS_NUM_PACKM_KERS   32
#define BLIS_NUM_UNPACKM_KERS 32


// Level-3 microkernel IDs. Enumerators are consecutive starting at 0;
// BLIS_NUM_LEVEL3_UKRS must equal the number of enumerators listed here.
typedef enum
{
    BLIS_GEMM_UKR = 0,
    BLIS_GEMMTRSM_L_UKR,
    BLIS_GEMMTRSM_U_UKR,
    BLIS_TRSM_L_UKR,
    BLIS_TRSM_U_UKR
} l3ukr_t;

#define BLIS_NUM_LEVEL3_UKRS 5


// Microkernel implementation types. Enumerators are consecutive starting at
// 0; BLIS_NUM_UKR_IMPL_TYPES must equal the number of enumerators.
typedef enum
{
    BLIS_REFERENCE_UKERNEL = 0,
    BLIS_VIRTUAL_UKERNEL,
    BLIS_OPTIMIZED_UKERNEL,
    BLIS_NOTAPPLIC_UKERNEL
} kimpl_t;

#define BLIS_NUM_UKR_IMPL_TYPES 4


// Disabled: neither l3sup_t nor BLIS_NUM_LEVEL3_SUP_UKRS is defined while
// this region is guarded by "#if 0".
#if 0
typedef enum
{
    // RV = row-stored, contiguous vector-loading
    // RG = row-stored, non-contiguous gather-loading
    // CV = column-stored, contiguous vector-loading
    // CG = column-stored, non-contiguous gather-loading

    // RD = row-stored, dot-based
    // CD = col-stored, dot-based

    // RC = row-stored, column-times-column
    // CR = column-stored, row-times-row

    // GX = general-stored generic implementation

    BLIS_GEMMSUP_RV_UKR = 0,
    BLIS_GEMMSUP_RG_UKR,
    BLIS_GEMMSUP_CV_UKR,
    BLIS_GEMMSUP_CG_UKR,

    BLIS_GEMMSUP_RD_UKR,
    BLIS_GEMMSUP_CD_UKR,

    BLIS_GEMMSUP_RC_UKR,
    BLIS_GEMMSUP_CR_UKR,

    BLIS_GEMMSUP_GX_UKR,
} l3sup_t;

#define BLIS_NUM_LEVEL3_SUP_UKRS 9
#endif


// Storage-combination IDs for three operands. Only the nine values
// BLIS_RRR (0) through BLIS_XXX (8) are active; the enumerators containing
// a 'G' are compiled out by the nested "#if 0".
typedef enum
{
    // 3-operand storage combinations
    BLIS_RRR = 0,
    BLIS_RRC, // 1
    BLIS_RCR, // 2
    BLIS_RCC, // 3
    BLIS_CRR, // 4
    BLIS_CRC, // 5
    BLIS_CCR, // 6
    BLIS_CCC, // 7
    BLIS_XXX, // 8

#if 0
    BLIS_RRG,
    BLIS_RCG,
    BLIS_RGR,
    BLIS_RGC,
    BLIS_RGG,
    BLIS_CRG,
    BLIS_CCG,
    BLIS_CGR,
    BLIS_CGC,
    BLIS_CGG,
    BLIS_GRR,
    BLIS_GRC,
    BLIS_GRG,
    BLIS_GCR,
    BLIS_GCC,
    BLIS_GCG,
    BLIS_GGR,
    BLIS_GGC,
    BLIS_GGG,
#endif
} stor3_t;

// Must equal the number of active stor3_t enumerators (BLIS_RRR..BLIS_XXX).
#define BLIS_NUM_3OP_RC_COMBOS 9
//#define BLIS_NUM_3OP_RCG_COMBOS 27


// Disabled: thridx_t is not defined while guarded by "#if 0". The disabled
// enum lists six loop indices, which matches BLIS_NUM_LOOPS below.
#if 0
typedef enum
{
    BLIS_JC_IDX = 0,
    BLIS_PC_IDX,
    BLIS_IC_IDX,
    BLIS_JR_IDX,
    BLIS_IR_IDX,
    BLIS_PR_IDX
} thridx_t;
#endif

#define BLIS_NUM_LOOPS 6


// -- Operation ID type --

typedef enum
{
//
// NOTE: If/when additional type values are added to this enum,
// you must either:
// - keep the level-3 values (starting with _GEMM) beginning at
//   index 0; or
// - if the value range is moved such that it does not begin at
//   index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START
//   value that can be subtracted from the opid_t value to map it
//   to a zero-based range.
// This is needed because these level-3 opid_t values are used in
// bli_l3_ind.c to index into arrays.
//
    BLIS_GEMM = 0,
    BLIS_GEMMT,
    BLIS_HEMM,
    BLIS_HERK,
    BLIS_HER2K,
    BLIS_SYMM,
    BLIS_SYRK,
    BLIS_SYR2K,
    BLIS_TRMM3,
    BLIS_TRMM,
    BLIS_TRSM,

    // Follows the eleven level-3 IDs above, so its value equals
    // BLIS_NUM_LEVEL3_OPS and it is not counted among them.
    BLIS_NOID
} opid_t;

// Number of level-3 operation IDs (BLIS_GEMM through BLIS_TRSM).
#define BLIS_NUM_LEVEL3_OPS 11


// -- Blocksize ID type --

typedef enum
{
    // NOTE: the level-3 blocksizes MUST be indexed starting at zero.
    // At one point, we made this assumption in bli_cntx_set_blkszs()
    // and friends.

    BLIS_KR = 0,
    BLIS_MR,
    BLIS_NR,
    BLIS_MC,
    BLIS_KC,
    BLIS_NC,

    BLIS_M2, // level-2 blocksize in m dimension
    BLIS_N2, // level-2 blocksize in n dimension

    BLIS_AF, // level-1f axpyf fusing factor
    BLIS_DF, // level-1f dotxf fusing factor
    BLIS_XF, // level-1f dotxaxpyf fusing factor

    BLIS_NO_PART  // used as a placeholder when blocksizes are not applicable.
} bszid_t;

// Number of real blocksize IDs (BLIS_KR through BLIS_XF); the placeholder
// BLIS_NO_PART has this same numeric value and is not counted.
#define BLIS_NUM_BLKSZS 11


// -- Threshold ID type --

typedef enum
{
    BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension
    BLIS_NT,     // level-3 small/unpacked matrix threshold in n dimension
    BLIS_KT      // level-3 small/unpacked matrix threshold in k dimension

} threshid_t;

// Must equal the number of threshid_t enumerators.
#define BLIS_NUM_THRESH 3


// -- Architecture ID type --

// NOTE: This typedef enum must be kept up-to-date with the arch_t
// string array in bli_arch.c. Whenever values are added/inserted
// OR if values are rearranged, be sure to update the string array
// in bli_arch.c.

typedef enum
{
    // NOTE: The C language standard guarantees that the first enum value
    // starts at 0.

    // Intel
    BLIS_ARCH_SKX,
    BLIS_ARCH_KNL,
    BLIS_ARCH_KNC,
    BLIS_ARCH_HASWELL,
    BLIS_ARCH_SANDYBRIDGE,
    BLIS_ARCH_PENRYN,

    // AMD
    BLIS_ARCH_ZEN3,
    BLIS_ARCH_ZEN2,
    BLIS_ARCH_ZEN,
    BLIS_ARCH_EXCAVATOR,
    BLIS_ARCH_STEAMROLLER,
    BLIS_ARCH_PILEDRIVER,
    BLIS_ARCH_BULLDOZER,

    // ARM
    BLIS_ARCH_ARMSVE,
    BLIS_ARCH_A64FX,
    BLIS_ARCH_FIRESTORM,
    BLIS_ARCH_THUNDERX2,
    BLIS_ARCH_CORTEXA57,
    BLIS_ARCH_CORTEXA53,
    BLIS_ARCH_CORTEXA15,
    BLIS_ARCH_CORTEXA9,

    // IBM/Power
    BLIS_ARCH_POWER10,
    BLIS_ARCH_POWER9,
    BLIS_ARCH_POWER7,
    BLIS_ARCH_BGQ,

    // Generic architecture/configuration
    BLIS_ARCH_GENERIC,

    // The total number of defined architectures. This must be last in the
    // list of enums since its definition assumes that the previous enum
    // value (BLIS_ARCH_GENERIC) is given index num_archs-1.
    BLIS_NUM_ARCHS

} arch_t;


//
// -- BLIS misc. structure types -----------------------------------------------
//

// This header must be included here (or earlier) because definitions it
// provides are needed in the pool_t and related structs.
// begin bli_pthread.h


#ifndef BLIS_PTHREAD_H
#define BLIS_PTHREAD_H

// -- Type and macro definitions -----------------------------------------------

// Exactly one of three branches supplies the bli_pthread_* types and
// initializer macros: BLIS_DISABLE_SYSTEM (dummy types), _MSC_VER (Windows
// API types), or the default (POSIX pthreads types).
#if defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of "dummy" code that doesn't depend on POSIX threads or any other
// threading mechanism. See issue #454 to see the use case that prompted this
// feature.
// NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE!
// -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t*           cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t*  cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

// Unlike the other bli_pthread_*() prototypes, this one returns void.
BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void              (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//     __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t*           barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int                     count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h


// -- Pool block type --

// A buffer pointer paired with a block size.
typedef struct
{
    void*     buf;
    siz_t     block_size;

} pblk_t;


// -- Pool type --

// Bookkeeping for a pool of blocks, together with the malloc/free function
// pointers stored alongside it.
// NOTE(review): block_ptrs is untyped here; presumably it points to an
// array of pblk_t of length block_ptrs_len -- confirm against bli_pool.c.
typedef struct
{
    void*     block_ptrs;
    dim_t     block_ptrs_len;

    dim_t     top_index;
    dim_t     num_blocks;

    siz_t     block_size;
    siz_t     align_size;
    siz_t     offset_size;

    malloc_ft malloc_fp;
    free_ft   free_fp;

} pool_t;


// -- Array type --

// A buffer described by an element count and a per-element size.
typedef struct
{
    void*     buf;

    siz_t     num_elem;
    siz_t     elem_size;

} array_t;


// -- Locked pool-of-arrays-of-pools type --

// A pool_t bundled with the mutex type from bli_pthread.h (which is why
// that header has to be included before this point).
typedef struct
{
    bli_pthread_mutex_t mutex;
    pool_t              pool;

    siz_t               def_array_len;

} apool_t;


// -- packing block allocator: Locked set of pools type --

// NOTE(review): the array length 3 is a bare literal; presumably one pool
// per packbuf_t value other than BLIS_BUFFER_FOR_GEN_USE -- confirm.
typedef struct pba_s
{
    pool_t              pools[3];
    bli_pthread_mutex_t mutex;

    // These fields are used for general-purpose allocation.
    siz_t               align_size;
    malloc_ft           malloc_fp;
    free_ft             free_fp;

} pba_t;


// -- Memory object type --

// A pool block plus its buffer type, a pointer to a pool_t, and a size.
typedef struct mem_s
{
    pblk_t    pblk;
    packbuf_t buf_type;
    pool_t*   pool;
    siz_t     size;
} mem_t;


// -- Control tree node type --

// The struct refers to itself through sub_prenode and sub_node, so it is
// declared with a tag first and given its typedef name afterwards.
struct cntl_s
{
    // Basic fields (usually required).
    opid_t         family;
    bszid_t        bszid;
    void_fp        var_func;
    struct cntl_s* sub_prenode;
    struct cntl_s* sub_node;

    // Optional fields (needed only by some operations such as packm).
    // NOTE: first field of params must be a uint64_t containing the size
    // of the struct.
    void*          params;

    // Internal fields that track "cached" data.
    mem_t          pack_mem;
};
typedef struct cntl_s cntl_t;


// -- Blocksize object type --

// Both arrays hold one entry per floating-point type (BLIS_NUM_FP_TYPES).
typedef struct blksz_s
{
    // Primary blocksize values.
    dim_t  v[BLIS_NUM_FP_TYPES];

    // Blocksize extensions.
    dim_t  e[BLIS_NUM_FP_TYPES];

} blksz_t;


// -- Function pointer object type --

// One generic function pointer per floating-point type.
typedef struct func_s
{
    // Kernel function address.
    void_fp ptr[BLIS_NUM_FP_TYPES];

} func_t;


// -- Multi-boolean object type --

// One boolean per floating-point type.
typedef struct mbool_s
{
    bool v[BLIS_NUM_FP_TYPES];

} mbool_t;


// -- Auxiliary kernel info type --

// Note: This struct is used by macro-kernels to package together extra
// parameter values that may be of use to the micro-kernel without
// cluttering up the micro-kernel interface itself.

typedef struct
{
    // The pack schemas of A and B.
    pack_t schema_a;
    pack_t schema_b;

    // Pointers to the micro-panels of A and B which will be used by the
    // next call to the micro-kernel.
    void*  a_next;
    void*  b_next;

    // The imaginary strides of A and B.
    inc_t  is_a;
    inc_t  is_b;

    // The panel strides of A and B.
    // NOTE: These are only used in situations where iteration over the
    // micropanels takes place in part within the kernel code (e.g. sup
    // millikernels).
    inc_t  ps_a;
    inc_t  ps_b;

    // The type to convert to on output. (The field is currently commented
    // out, so it is not part of the struct.)
    //num_t  dt_on_output;

    // (Virtual) microkernel address and additional parameters.
    void_fp ukr;
    void*   params;

} auxinfo_t;


// -- Global scalar constant data struct --

// Note: This struct is used only when statically initializing the
// global scalar constants in bli_const.c.
// It holds one member per representation: float, double, scomplex,
// dcomplex, and gint_t.
typedef struct constdata_s
{
    float    s;
    double   d;
    scomplex c;
    dcomplex z;
    gint_t   i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
// (obj_s, cntx_s, rntm_s and thrinfo_s are only referenced through pointers
// in the two function-pointer typedefs below, so incomplete types suffice.)
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the pack function stored in obj_t.pack_fn.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of the kernel function stored in obj_t.ker_fn.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// NOTE: The BLIS_OBJECT_INITIALIZER* macros below and the
// bli_obj_init_*() copy helpers that follow them name these members
// individually, so they must be revisited whenever this struct changes.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t         off[2];
    dim_t         dim[2];
    doff_t        diag_off;

    objbits_t     info;
    objbits_t     info2;
    siz_t         elem_size;

    void*         buffer;
    inc_t         rs;
    inc_t         cs;
    inc_t         is;

    // Bufferless scalar storage
    atom_t        scalar;

    // Pack-related fields
    dim_t         m_padded; // m dimension of matrix, including any padding
    dim_t         n_padded; // n dimension of matrix, including any padding
    inc_t         ps;       // panel stride (distance to next panel)
    inc_t         pd;       // panel dimension (the "width" of a panel:
                            // usually MR or NR)
    dim_t         m_panel;  // m dimension of a "full" panel
    dim_t         n_panel;  // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void*         pack_params;
    obj_ker_fn_t  ker_fn;
    void*         ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)
//
// Both macros expand to a brace-enclosed list of designated initializers
// that names every member of obj_t. They differ only in .dim, which is
// { 0, 0 } here and { 1, 1 } in the _1X1 variant.

#define BLIS_OBJECT_INITIALIZER \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 0, 0 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL  \
}

#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 1, 1 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL  \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
// Make b a full shallow copy of a.
//
// Every member of obj_t is duplicated. Pointer members (root, buffer,
// pack_params, ker_params) and the function pointers are copied as-is, so
// afterwards b refers to the same underlying storage and callbacks as a.
//
// A struct assignment is used instead of a member-by-member list so that
// this function cannot silently fall out of date when members are added to
// or removed from obj_t. It is safe when a and b point to the same object.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    *b = *a;
}

// Initialize b as the starting point for a subpartition of a.
//
// Copies every member of a into b EXCEPT dim[0] and dim[1], which are left
// untouched in b because the caller is expected to overwrite them with the
// subpartition's own dimensions. b's dimensions are never read here, so b
// may be uninitialized on entry.
//
// NOTE: Because of that one exception the copy is kept member-by-member;
// this list must be updated whenever the members of obj_t change.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    b->root      = a->root;

    b->off[0]    = a->off[0];
    b->off[1]    = a->off[1];
    // Avoid copying m and n since they will be overwritten.
    //b->dim[0]    = a->dim[0];
    //b->dim[1]    = a->dim[1];
    b->diag_off  = a->diag_off;

    b->info      = a->info;
    b->info2     = a->info2;
    b->elem_size = a->elem_size;

    b->buffer    = a->buffer;
    b->rs        = a->rs;
    b->cs        = a->cs;
    b->is        = a->is;

    b->scalar    = a->scalar;

    // Pack-related fields.
    b->m_padded  = a->m_padded;
    b->n_padded  = a->n_padded;
    b->ps        = a->ps;
    b->pd        = a->pd;
    b->m_panel   = a->m_panel;
    b->n_panel   = a->n_panel;

    // User-customizable fields.
    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/include/darwin-x86_64_no_skx/000077500000000000000000000000001427272030600230625ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/darwin-x86_64_no_skx/blis.h000066400000000000000000046565451427272030600242150ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). 
// begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_X86_64_NO_SKX // Enabled sub-configurations (config_list) #define BLIS_CONFIG_HASWELL #define BLIS_CONFIG_SANDYBRIDGE #define BLIS_CONFIG_PENRYN #define BLIS_CONFIG_EXCAVATOR #define BLIS_CONFIG_STEAMROLLER #define BLIS_CONFIG_PILEDRIVER #define BLIS_CONFIG_BULLDOZER #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_ZEN #define BLIS_KERNELS_HASWELL #define BLIS_KERNELS_SANDYBRIDGE #define BLIS_KERNELS_PENRYN #define BLIS_KERNELS_PILEDRIVER #define BLIS_KERNELS_BULLDOZER #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define 
BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. 
#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). #if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_lang_defs.h #ifndef BLIS_LANG_DEFS_H #define BLIS_LANG_DEFS_H // -- Undefine restrict for C++ and C89/90 -- #ifdef __cplusplus // Language is C++; define restrict as nothing. #ifndef restrict #define restrict #endif #elif __STDC_VERSION__ >= 199901L // Language is C99 (or later); do nothing since restrict is recognized. #else // Language is pre-C99; define restrict as nothing. 
#ifndef restrict #define restrict #endif #endif // -- Define typeof() operator if using non-GNU compiler -- #ifndef __GNUC__ #define typeof __typeof__ #else #ifndef typeof #define typeof __typeof__ #endif #endif // -- BLIS Thread Local Storage Keyword -- // __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support __thread, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) #define BLIS_THREAD_LOCAL __thread #else #define BLIS_THREAD_LOCAL #endif // -- BLIS constructor/destructor function attribute -- // __attribute__((constructor/destructor)) is supported by GCC only. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support this, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__ICC) || defined(__INTEL_COMPILER) // ICC defines __GNUC__ but doesn't support this #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #elif defined(__clang__) // CLANG supports __attribute__, but its documentation doesn't // mention support for constructor/destructor. Compiling with // clang and testing shows that it does support. 
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. 
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. 
Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specified by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. #ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overridden by the main // program simply by providing a new definition.
However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // -- Common BLIS definitions -- // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// The guard tests BLIS_BLAS_INT_TYPE_SIZE, which is the macro that carries
// the BLAS integer width (it is also the macro tested when f77_int is
// defined). An identifier that is not defined evaluates to 0 inside #if, so
// testing any other (undefined) name would silently disable this guard.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
  #undef  BLIS_INT_TYPE_SIZE
  #define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested.
// A value of 32 or 64 selects the matching fixed-width type; any other
// value (including an undefined BLIS_INT_TYPE_SIZE) falls back to long int.
#if   BLIS_INT_TYPE_SIZE == 32
typedef           int32_t  gint_t;
typedef          uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
typedef           int64_t  gint_t;
typedef          uint64_t guint_t;
#else
typedef   signed long int  gint_t;
typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h.
#ifndef TRUE
  #define TRUE  true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
#define BLIS_SIZEOF_S 4 // sizeof(float) #define BLIS_SIZEOF_D 8 // sizeof(double) #define BLIS_SIZEOF_C 8 // sizeof(scomplex) #define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) // -- Complex types -- #ifdef BLIS_ENABLE_C99_COMPLEX #if __STDC_VERSION__ >= 199901L #include // skipped // Typedef official complex types to BLIS complex type names. typedef float complex scomplex; typedef double complex dcomplex; #else #error "Configuration requested C99 complex types, but C99 does not appear to be supported." #endif #else // ifndef BLIS_ENABLE_C99_COMPLEX // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_SCOMPLEX #define _DEFINED_SCOMPLEX typedef struct scomplex { float real; float imag; } scomplex; #endif // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DCOMPLEX #define _DEFINED_DCOMPLEX typedef struct dcomplex { double real; double imag; } dcomplex; #endif #endif // BLIS_ENABLE_C99_COMPLEX // -- Atom type -- // Note: atom types are used to hold "bufferless" scalar object values. Note // that it needs to be as large as the largest possible scalar value we might // want to hold. Thus, for now, it is a dcomplex. typedef dcomplex atom_t; // -- Fortran-77 types -- // Note: These types are typically only used by BLAS compatibility layer, but // we must define them even when the compatibility layer isn't being built // because they also occur in bli_slamch() and bli_dlamch(). // Define f77_int depending on what size of integer was requested. #if BLIS_BLAS_INT_TYPE_SIZE == 32 typedef int32_t f77_int; #elif BLIS_BLAS_INT_TYPE_SIZE == 64 typedef int64_t f77_int; #else typedef long int f77_int; #endif typedef char f77_char; typedef float f77_float; typedef double f77_double; typedef scomplex f77_scomplex; typedef dcomplex f77_dcomplex; // -- Misc. 
function pointer types -- // Note: This type should be used in any situation where the address of a // *function* will be conveyed or stored prior to it being typecast back // to the correct function type. It does not need to be used when conveying // or storing the address of *data* (such as an array of float or double). //typedef void (*void_fp)( void ); typedef void* void_fp; // Typedef function pointer types for malloc() and free() substitutes. typedef void* (*malloc_ft)( size_t size ); typedef void (*free_ft) ( void* p ); // // -- BLIS info bit field offsets ---------------------------------------------- // // info #define BLIS_DATATYPE_SHIFT 0 #define BLIS_DOMAIN_SHIFT 0 #define BLIS_PRECISION_SHIFT 1 #define BLIS_CONJTRANS_SHIFT 3 #define BLIS_TRANS_SHIFT 3 #define BLIS_CONJ_SHIFT 4 #define BLIS_UPLO_SHIFT 5 #define BLIS_UPPER_SHIFT 5 #define BLIS_DIAG_SHIFT 6 #define BLIS_LOWER_SHIFT 7 #define BLIS_UNIT_DIAG_SHIFT 8 #define BLIS_INVERT_DIAG_SHIFT 9 #define BLIS_TARGET_DT_SHIFT 10 #define BLIS_TARGET_DOMAIN_SHIFT 10 #define BLIS_TARGET_PREC_SHIFT 11 #define BLIS_EXEC_DT_SHIFT 13 #define BLIS_EXEC_DOMAIN_SHIFT 13 #define BLIS_EXEC_PREC_SHIFT 14 #define BLIS_PACK_SCHEMA_SHIFT 16 #define BLIS_PACK_RC_SHIFT 16 #define BLIS_PACK_PANEL_SHIFT 17 #define BLIS_PACK_FORMAT_SHIFT 18 #define BLIS_PACK_SHIFT 22 #define BLIS_PACK_REV_IF_UPPER_SHIFT 23 #define BLIS_PACK_REV_IF_LOWER_SHIFT 24 #define BLIS_PACK_BUFFER_SHIFT 25 #define BLIS_STRUC_SHIFT 27 #define BLIS_COMP_DT_SHIFT 29 #define BLIS_COMP_DOMAIN_SHIFT 29 #define BLIS_COMP_PREC_SHIFT 30 // info2 #define BLIS_SCALAR_DT_SHIFT 0 #define BLIS_SCALAR_DOMAIN_SHIFT 0 #define BLIS_SCALAR_PREC_SHIFT 1 // // -- BLIS info bit field masks ------------------------------------------------ // // info #define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT ) #define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT ) #define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT ) #define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT ) 
#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define 
BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 
0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = 
BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } 
machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. #define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, 
BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, 
BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // - keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
} bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. // Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. 
// begin bli_pthread.h

// Portable threading shim: declares the bli_pthread_*() types, static
// initializer macros, and function prototypes. Exactly one of three
// preprocessor branches below supplies the underlying types:
//   1. BLIS_DISABLE_SYSTEM : every type is a plain int placeholder.
//   2. _MSC_VER            : types are built from Windows API types.
//   3. otherwise           : types alias the corresponding pthread types.

#ifndef BLIS_PTHREAD_H
#define BLIS_PTHREAD_H

// -- Type and macro definitions -----------------------------------------------

#if defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of "dummy" code that doesn't depend on POSIX threads or any other
// threading mechanism. See issue #454 to see the use case that prompted this
// feature.
// NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE!

// -- pthread types --

// All nine types are int placeholders in this branch.
typedef int bli_pthread_t;
typedef int bli_pthread_attr_t;
typedef int bli_pthread_mutex_t;
typedef int bli_pthread_mutexattr_t;
typedef int bli_pthread_cond_t;
typedef int bli_pthread_condattr_t;
typedef int bli_pthread_once_t;
typedef int bli_pthread_barrier_t;
typedef int bli_pthread_barrierattr_t;

// -- pthreads macros --

// Static initializers for the int placeholders above.
#define BLIS_PTHREAD_MUTEX_INITIALIZER 0
#define BLIS_PTHREAD_COND_INITIALIZER  0
#define BLIS_PTHREAD_ONCE_INIT         0

#elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of Windows API calls.

// -- pthread types --

// A thread is a Windows HANDLE paired with a slot for a void* result
// (presumably the start routine's return value handed back by
// bli_pthread_join() -- confirm against bli_pthread.c).
typedef struct
{
    HANDLE handle;
    void*  retval;
} bli_pthread_t;
// The attribute types are void in this branch; the prototypes below only
// ever take pointers to them.
typedef void bli_pthread_attr_t;
typedef SRWLOCK bli_pthread_mutex_t;
typedef void bli_pthread_mutexattr_t;
typedef CONDITION_VARIABLE bli_pthread_cond_t;
typedef void bli_pthread_condattr_t;
typedef INIT_ONCE bli_pthread_once_t;
// Barrier assembled from a mutex, a condition variable and two counters.
// NOTE(review): presumably count tracks arrivals and tripCount is the number
// of threads required to release the barrier -- confirm in bli_pthread.c.
typedef struct
{
    bli_pthread_mutex_t mutex;
    bli_pthread_cond_t  cond;
    int                 count;
    int                 tripCount;
} bli_pthread_barrier_t;
typedef void bli_pthread_barrierattr_t;

// -- pthreads macros --

#define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT
#define BLIS_PTHREAD_ONCE_INIT         INIT_ONCE_STATIC_INIT
#define BLIS_PTHREAD_COND_INITIALIZER  CONDITION_VARIABLE_INIT

#else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER)

// NOTE(review): the header name is missing from the #include line below
// (presumably <pthread.h>, dropped when this monolithic header was
// generated) -- confirm against the upstream bli_pthread.h.
#include // skipped

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of the corresponding pthread_*() types, macros, and function calls.

// -- pthread types --

typedef pthread_t           bli_pthread_t;
typedef pthread_attr_t      bli_pthread_attr_t;
typedef pthread_mutex_t     bli_pthread_mutex_t;
typedef pthread_mutexattr_t bli_pthread_mutexattr_t;
typedef pthread_cond_t      bli_pthread_cond_t;
typedef pthread_condattr_t  bli_pthread_condattr_t;
typedef pthread_once_t      bli_pthread_once_t;

#if defined(__APPLE__)

// For OS X, we must define the barrier types ourselves since Apple does
// not implement barriers in their variant of pthreads.

typedef void bli_pthread_barrierattr_t;

// Same mutex/cond/counter layout as the barrier struct in the _MSC_VER branch.
typedef struct
{
    bli_pthread_mutex_t mutex;
    bli_pthread_cond_t  cond;
    int                 count;
    int                 tripCount;
} bli_pthread_barrier_t;

#else

// For other non-Windows OSes (primarily Linux), we can define the barrier
// types in terms of existing pthreads barrier types since we expect they
// will be provided by the pthreads implementation.

typedef pthread_barrier_t     bli_pthread_barrier_t;
typedef pthread_barrierattr_t bli_pthread_barrierattr_t;

#endif

// -- pthreads macros --

#define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#define BLIS_PTHREAD_COND_INITIALIZER  PTHREAD_COND_INITIALIZER
#define BLIS_PTHREAD_ONCE_INIT         PTHREAD_ONCE_INIT

#endif

// -- Function definitions -----------------------------------------------------

// The prototypes below are shared by all three branches above; only the
// types they mention differ per branch.

// -- pthread_create(), pthread_join() --

BLIS_EXPORT_BLIS int bli_pthread_create
     (
       bli_pthread_t*            thread,
       const bli_pthread_attr_t* attr,
       void*                   (*start_routine)(void*),
       void*                     arg
     );

BLIS_EXPORT_BLIS int bli_pthread_join
     (
       bli_pthread_t thread,
       void**        retval
     );

// -- pthread_mutex_*() --

BLIS_EXPORT_BLIS int bli_pthread_mutex_init
     (
       bli_pthread_mutex_t*           mutex,
       const bli_pthread_mutexattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_lock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock
     (
       bli_pthread_mutex_t* mutex
     );

// -- pthread_cond_*() --

BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t*           cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t*  cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void              (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//     __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t*           barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int                     count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h

// -- Pool block type --

// One block handed out by a pool: a buffer pointer plus its size.
typedef struct
{
    void*     buf;
    siz_t     block_size;

} pblk_t;

// -- Pool type --

// A pool of blocks together with the allocator callbacks (malloc_fp/free_fp)
// stored alongside it.
// NOTE(review): block_ptrs is an untyped pointer; presumably an array of
// pblk_t of length block_ptrs_len -- confirm in bli_pool.c.
typedef struct
{
    void*     block_ptrs;
    dim_t     block_ptrs_len;

    dim_t     top_index;
    dim_t     num_blocks;

    siz_t     block_size;
    siz_t     align_size;
    siz_t     offset_size;

    malloc_ft malloc_fp;
    free_ft   free_fp;

} pool_t;

// -- Array type --

// Untyped array descriptor: buffer, element count, and per-element size.
typedef struct
{
    void*     buf;

    siz_t     num_elem;
    siz_t     elem_size;

} array_t;

// -- Locked pool-of-arrays-of-pools type --

// A pool_t bundled with the mutex that guards it.
typedef struct
{
    bli_pthread_mutex_t mutex;
    pool_t              pool;

    siz_t               def_array_len;

} apool_t;

// -- packing block allocator: Locked set of pools type --

// Three pools sharing one mutex.
// NOTE(review): presumably one pool per packbuf_t buffer class -- confirm.
typedef struct pba_s
{
    pool_t              pools[3];
    bli_pthread_mutex_t mutex;

    // These fields are used for general-purpose allocation.
    siz_t               align_size;
    malloc_ft           malloc_fp;
    free_ft             free_fp;

} pba_t;

// -- Memory object type --

// A block (pblk) together with the buffer type, the pool pointer and a size
// recorded for it.
typedef struct mem_s
{
    pblk_t    pblk;
    packbuf_t buf_type;
    pool_t*   pool;
    siz_t     size;
} mem_t;

// -- Control tree node type --

// Node of a control tree; nodes link to other nodes through sub_prenode and
// sub_node.
struct cntl_s
{
    // Basic fields (usually required).
    opid_t         family;
    bszid_t        bszid;
    void_fp        var_func;
    struct cntl_s* sub_prenode;
    struct cntl_s* sub_node;

    // Optional fields (needed only by some operations such as packm).
    // NOTE: first field of params must be a uint64_t containing the size
    // of the struct.
    void*          params;

    // Internal fields that track "cached" data.
    mem_t          pack_mem;
};
typedef struct cntl_s cntl_t;

// -- Blocksize object type --

// One primary value and one extension value per floating-point type.
typedef struct blksz_s
{
    // Primary blocksize values.
    dim_t  v[BLIS_NUM_FP_TYPES];

    // Blocksize extensions.
    dim_t  e[BLIS_NUM_FP_TYPES];

} blksz_t;

// -- Function pointer object type --

// One generic function pointer per floating-point type.
typedef struct func_s
{
    // Kernel function address.
    void_fp ptr[BLIS_NUM_FP_TYPES];

} func_t;

// -- Multi-boolean object type --

// One boolean per floating-point type.
typedef struct mbool_s
{
    bool v[BLIS_NUM_FP_TYPES];

} mbool_t;

// -- Auxiliary kernel info type --

// Note: This struct is used by macro-kernels to package together extra
// parameter values that may be of use to the micro-kernel without
// cluttering up the micro-kernel interface itself.

typedef struct
{
    // The pack schemas of A and B.
    pack_t schema_a;
    pack_t schema_b;

    // Pointers to the micro-panels of A and B which will be used by the
    // next call to the micro-kernel.
    void*  a_next;
    void*  b_next;

    // The imaginary strides of A and B.
    inc_t  is_a;
    inc_t  is_b;

    // The panel strides of A and B.
    // NOTE: These are only used in situations where iteration over the
    // micropanels takes place in part within the kernel code (e.g. sup
    // millikernels).
    inc_t  ps_a;
    inc_t  ps_b;

    // The type to convert to on output.
    //num_t  dt_on_output;

    // (Virtual) microkernel address and additional parameters.
    void_fp ukr;
    void*   params;

} auxinfo_t;

// -- Global scalar constant data struct --

// Note: This struct is used only when statically initializing the
// global scalar constants in bli_const.c.
// Holds one value in each of five representations: float (s), double (d),
// scomplex (c), dcomplex (z) and gint_t (i).
typedef struct constdata_s
{
    float    s;
    double   d;
    scomplex c;
    dcomplex z;
    gint_t   i;

} constdata_t;

//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the user-customizable packing callback stored in
// obj_t.pack_fn.
// NOTE(review): presumably a is the source object and ap the packed
// destination -- confirm against the packm implementation.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of the user-customizable kernel callback stored in obj_t.ker_fn;
// takes three objects (a, b, c) plus context, runtime, control-tree node and
// thread info.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// The BLIS object type. The initializer macros and the inline copy helpers
// that follow this struct enumerate its fields explicitly, so any change to
// the field list must be mirrored there.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t         off[2];
    dim_t         dim[2];
    doff_t        diag_off;

    objbits_t     info;
    objbits_t     info2;
    siz_t         elem_size;

    void*         buffer;
    inc_t         rs;
    inc_t         cs;
    inc_t         is;

    // Bufferless scalar storage
    atom_t        scalar;

    // Pack-related fields
    dim_t         m_padded; // m dimension of matrix, including any padding
    dim_t         n_padded; // n dimension of matrix, including any padding
    inc_t         ps;       // panel stride (distance to next panel)
    inc_t         pd;       // panel dimension (the "width" of a panel:
                            // usually MR or NR)
    dim_t         m_panel;  // m dimension of a "full" panel
    dim_t         n_panel;  // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void*         pack_params;
    obj_ker_fn_t  ker_fn;
    void*         ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Static initializer for an obj_t with dimensions { 0, 0 }. The info field
// starts as dense + general, and elem_size starts as sizeof( float ), which
// per the list above must be overwritten afterwards.
#define BLIS_OBJECT_INITIALIZER \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 0, 0 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL  \
}

// Same as BLIS_OBJECT_INITIALIZER except that dim is { 1, 1 } (a 1x1 object).
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 1, 1 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL  \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
// Make b a member-for-member shallow copy of a. Pointer members (root,
// buffer, pack_params, ker_params) are copied as pointers, so afterwards
// both objects refer to the same underlying storage.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    // Every member of obj_t is duplicated, so a whole-struct assignment
    // expresses the copy directly (the off[] and dim[] arrays are copied
    // element-wise as part of the struct).
    *b = *a;
}

// Initialize b as the starting point for a subpartition of a: every member
// of a is copied into b EXCEPT dim[0] and dim[1], which are left untouched
// because the caller overwrites them with the subpartition's dimensions.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    const obj_t* src = a;
    obj_t*       dst = b;
    int          i;

    // Parent linkage and offsets. dst->dim[] is intentionally skipped.
    dst->root = src->root;
    for ( i = 0; i < 2; ++i )
        dst->off[ i ] = src->off[ i ];
    dst->diag_off = src->diag_off;

    // Type/structure bits and element size.
    dst->info      = src->info;
    dst->info2     = src->info2;
    dst->elem_size = src->elem_size;

    // Buffer address and strides.
    dst->buffer = src->buffer;
    dst->rs     = src->rs;
    dst->cs     = src->cs;
    dst->is     = src->is;

    // Internal (bufferless) scalar.
    dst->scalar = src->scalar;

    // Pack-related metadata.
    dst->m_padded = src->m_padded;
    dst->n_padded = src->n_padded;
    dst->ps       = src->ps;
    dst->pd       = src->pd;
    dst->m_panel  = src->m_panel;
    dst->n_panel  = src->n_panel;

    // User-customizable callbacks and their parameters.
    dst->pack_fn     = src->pack_fn;
    dst->pack_params = src->pack_params;
    dst->ker_fn      = src->ker_fn;
    dst->ker_params  = src->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// Each public macro below forwards to a same-named helper with a trailing
// underscore. The helper performs the actual ## pasting; the extra level of
// expansion lets arguments that are themselves macros be expanded first.

// PASTEMACn: paste "bli_", n type-character arguments, and op into a single
// identifier, e.g. PASTEMAC(d,gemm) -> bli_dgemm.
#define PASTEMAC0_(op)                                 bli_ ## op
#define PASTEMAC0(op)                                  PASTEMAC0_(op)

#define PASTEMAC_(ch,op)                               bli_ ## ch ## op
#define PASTEMAC(ch,op)                                PASTEMAC_(ch,op)

#define PASTEMAC2_(ch1,ch2,op)                         bli_ ## ch1 ## ch2 ## op
#define PASTEMAC2(ch1,ch2,op)                          PASTEMAC2_(ch1,ch2,op)

#define PASTEMAC3_(ch1,ch2,ch3,op)                     bli_ ## ch1 ## ch2 ## ch3 ## op
#define PASTEMAC3(ch1,ch2,ch3,op)                      PASTEMAC3_(ch1,ch2,ch3,op)

#define PASTEMAC4_(ch1,ch2,ch3,ch4,op)                 bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
#define PASTEMAC4(ch1,ch2,ch3,ch4,op)                  PASTEMAC4_(ch1,ch2,ch3,ch4,op)

#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)             bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op)              PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)

#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)         bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op)          PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)

// PASTEBLACHK: build the identifier bla_<op>_check.
#define PASTEBLACHK_(op)                               bla_ ## op ## _check
#define PASTEBLACHK(op)                                PASTEBLACHK_(op)

// PASTECHn: like PASTEMACn but with no "bli_" prefix; the arguments are
// simply pasted together (PASTECH0 yields op unchanged).
#define PASTECH0_(op)                                  op
#define PASTECH0(op)                                   PASTECH0_(op)

#define PASTECH_(ch,op)                                ch ## op
#define PASTECH(ch,op)                                 PASTECH_(ch,op)

#define PASTECH2_(ch1,ch2,op)                          ch1 ## ch2 ## op
#define PASTECH2(ch1,ch2,op)                           PASTECH2_(ch1,ch2,op)

#define PASTECH3_(ch1,ch2,ch3,op)                      ch1 ## ch2 ## ch3 ## op
#define PASTECH3(ch1,ch2,ch3,op)                       PASTECH3_(ch1,ch2,ch3,op)

// MKSTR stringifies its argument as written; STRINGIFY_INT expands the
// argument first (via the extra macro level) and then stringifies it.
#define MKSTR(s1)                                      #s1
#define STRINGIFY_INT( s )                             MKSTR( s )

// Fortran-77 name-mangling macros.
// Each PASTEF77* macro pastes its leading character arguments (if any) and
// name together and appends a trailing underscore. Unlike the PASTEMAC
// family, these have no extra expansion layer.
#define PASTEF770(name)                                      name ## _
#define PASTEF77(ch1,name)                     ch1        ## name ## _
#define PASTEF772(ch1,ch2,name)                ch1 ## ch2 ## name ## _
#define PASTEF773(ch1,ch2,ch3,name)     ch1 ## ch2 ## ch3 ## name ## _

// -- Include other groups of macros

// begin bli_genarray_macro_defs.h

#ifndef BLIS_GENARRAY_MACRO_DEFS_H
#define BLIS_GENARRAY_MACRO_DEFS_H

// -- Macros to generate function arrays ---------------------------------------

// All tables below list their entries in the order s, c, d, z.

// -- "Smart" one-operand macro --

// Defines a static array named <opname>_fpa of type tname holding the four
// typed functions bli_?<opname>, each cast to tname.
#define GENARRAY_FPA(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname)  \
}

// -- "Smart" one-operand macro (with integer support) --

// Same as GENARRAY_FPA with a fifth entry for the integer variant
// bli_i<opname>.
#define GENARRAY_FPA_I(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname), \
    ( tname )PASTEMAC(i,opname)  \
}

// -- "Smart" two-operand macro --

// Defines a static 2D array named <op>_fpa2 holding all sixteen
// two-type functions bli_??<op>; the first index selects the first type
// character and the second index the second.
#define GENARRAY_FPA2(tname,op) \
\
static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \
    { ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \
    { ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \
    { ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) }  \
}

// -- "Smart" two-operand macro --

// The macros from here on expand to only the declarator and initializer
// ("arrayname[...] = { ... }"); the caller supplies the storage class and
// element type in front of the macro invocation.

// -- One-operand macro --

#define GENARRAY(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op)  \
}

// One-operand macro with a fifth entry for the integer variant.
#define GENARRAY_I(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES+1] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op), \
    PASTEMAC(i,op)  \
}

// -- One-operand macro (with custom prefix) --

// Entries are <prefix><ch><op> rather than bli_<ch><op>.
#define GENARRAY_PREF(arrayname,prefix,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTECH2(prefix,s,op), \
    PASTECH2(prefix,c,op), \
    PASTECH2(prefix,d,op), \
    PASTECH2(prefix,z,op)  \
}

// -- Two-operand macros --

// _ALL: every one of the sixteen type combinations is populated.
#define GENARRAY2_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
    { PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// _EXT: only same-precision combinations are populated (s/c with s/c, and
// d/z with d/z); all mixed-precision slots are NULL.
#define GENARRAY2_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL,              NULL,             }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { NULL,              NULL,              PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// _MIN: only the four homogeneous combinations (ss, cc, dd, zz) are
// populated; every other slot is NULL.
#define GENARRAY2_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), NULL,              NULL,              NULL,             }, \
    { NULL,              PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), NULL,             }, \
    { NULL,              NULL,              NULL,              PASTEMAC2(z,z,op) }  \
}

// -- Three-operand macros --

// Three-index analogues of the two-operand tables above: the outermost index
// selects the first type character, then the second, then the third.

#define GENARRAY3_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \
    { PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \
    { PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \
    { PASTEMAC3(c,d,s,op), PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \
    { PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \
    { PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \
    { PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \
    { PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \
    { PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

// _EXT: populated only where all three types share a precision.
#define GENARRAY3_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

// _MIN: populated only for the four homogeneous combinations (sss, ccc,
// ddd, zzz).
#define GENARRAY3_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                PASTEMAC3(z,z,z,op) }  \
    } \
}

#endif
// end bli_genarray_macro_defs.h
// begin bli_gentdef_macro_defs.h

#ifndef BLIS_GENTDEF_MACRO_DEFS_H
#define BLIS_GENTDEF_MACRO_DEFS_H

//
// -- MACROS TO INSERT TYPEDEF-GENERATING MACROS -------------------------------
//

// The INSERT_* macros below invoke a GENTDEF/GENTDEFR macro that is not
// defined here; the including code must define it before use.

// -- function typedef macro (both typed and void) --

// Invokes GENTDEF nine times: four typed variants with suffix _ft, four
// void variants with suffix _vft, and one void variant with an empty type
// character.
#define INSERT_GENTDEF( opname ) \
\
GENTDEF( float,    s, opname, _ft ) \
GENTDEF( double,   d, opname, _ft ) \
GENTDEF( scomplex, c, opname, _ft ) \
GENTDEF( dcomplex, z, opname, _ft ) \
\
GENTDEF( void,     s, opname, _vft ) \
GENTDEF( void,     d, opname, _vft ) \
GENTDEF( void,     c, opname, _vft ) \
GENTDEF( void,     z, opname, _vft ) \
\
GENTDEF( void,      , opname, _vft )

// -- function typedef macro (both typed and void) with real projection --

// As INSERT_GENTDEF, but each invocation also passes the real-projection
// type and character (float/s for s and c; double/d for d and z).
#define INSERT_GENTDEFR( opname ) \
\
GENTDEFR( float,    float,  s, s, opname, _ft ) \
GENTDEFR( double,   double, d, d, opname, _ft ) \
GENTDEFR( scomplex, float,  c, s, opname, _ft ) \
GENTDEFR( dcomplex, double, z, d, opname, _ft ) \
\
GENTDEFR( void,     void,   s, s, opname, _vft ) \
GENTDEFR( void,     void,   d, d, opname, _vft ) \
GENTDEFR( void,     void,   c, s, opname, _vft ) \
GENTDEFR( void,     void,   z, d, opname, _vft ) \
\
GENTDEFR( void,     void,    ,  , opname, _vft )

#endif
// end bli_gentdef_macro_defs.h
// begin bli_gentfunc_macro_defs.h

#ifndef BLIS_GENTFUNC_MACRO_DEFS_H
#define BLIS_GENTFUNC_MACRO_DEFS_H

//
// -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------
//

// -- Macros for generating BLAS routines --------------------------------------

// -- Basic one-operand macro --
#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \ \ GENTFUNC( float, s, blasname, blisname ) \ GENTFUNC( double, d, blasname, blisname ) \ GENTFUNC( scomplex, c, blasname, blisname ) \ GENTFUNC( dcomplex, z, blasname, blisname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \ \ GENTFUNCRO( float, s, blasname, blisname ) \ GENTFUNCRO( double, d, blasname, blisname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \ \ GENTFUNCCO( scomplex, float, c, s, blasname, blisname ) \ GENTFUNCCO( dcomplex, double, z, d, blasname, blisname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \ \ GENTFUNCDOT( float, s, , BLIS_NO_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( double, d, , BLIS_NO_CONJUGATE, blasname, blisname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \ \ GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \ \ INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \ INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \ \ GENTFUNCR( float, float, s, s, rblasname, blisname ) \ GENTFUNCR( double, double, d, d, rblasname, blisname ) \ GENTFUNCR( scomplex, float, c, s, cblasname, blisname ) \ GENTFUNCR( dcomplex, double, z, d, cblasname, blisname ) // -- Alternate 
// two-operand macro (one char for complex, one for real proj) --

// Expands GENTFUNCR2 for all four datatypes. For float and double the fourth
// argument (second type character) is deliberately left empty; for scomplex
// and dcomplex it is the real type character (s or d).
// NOTE(review): GENTFUNCR2 and the other GENTFUNC* macros used below are not
// defined in this section; presumably the including file defines them before
// invoking these insertion macros -- confirm.
#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
\
GENTFUNCR2( float,    float,  s,  , blasname, blisname ) \
GENTFUNCR2( double,   double, d,  , blasname, blisname ) \
GENTFUNCR2( scomplex, float,  c, s, blasname, blisname ) \
GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )


// -- Extended two-operand macro (used only for scal) --

// Expands GENTFUNCSCAL six times: four same-type instantiations (fourth
// argument left empty) plus two mixed instantiations that pair a complex
// first type with its real counterpart, (scomplex, float, c, s) and
// (dcomplex, double, z, d).
#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
\
GENTFUNCSCAL( float,    float,    s,  , blasname, blisname ) \
GENTFUNCSCAL( double,   double,   d,  , blasname, blisname ) \
GENTFUNCSCAL( scomplex, scomplex, c,  , blasname, blisname ) \
GENTFUNCSCAL( dcomplex, dcomplex, z,  , blasname, blisname ) \
GENTFUNCSCAL( scomplex, float,    c, s, blasname, blisname ) \
GENTFUNCSCAL( dcomplex, double,   z, d, blasname, blisname )


// -- Macros for functions with one operand ------------------------------------


// -- Basic one-operand macro --

// The INSERT_GENTFUNC_BASIC* family below expands GENTFUNC once per datatype
// -- (float, s), (double, d), (scomplex, c), (dcomplex, z) -- and differs only
// in how many extra arguments are forwarded after tfuncname (zero to four).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
\
GENTFUNC( float,    s, tfuncname ) \
GENTFUNC( double,   d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
\
GENTFUNC( float,    s, tfuncname, varname ) \
GENTFUNC( double,   d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC( float,    s, tfuncname, varname1, varname2 ) \
GENTFUNC( double,   d, tfuncname, varname1, varname2 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNC( float,    s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( double,   d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNC( float,    s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( double,   d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand with real projection --

// The INSERT_GENTFUNCR_BASIC* family below expands GENTFUNCR once per
// datatype. Each invocation passes the datatype, then its real counterpart
// (float for float/scomplex, double for double/dcomplex), then the two
// matching type characters, then tfuncname and the auxiliary arguments.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
\
GENTFUNCR( float,    float,  s, s, tfuncname ) \
GENTFUNCR( double,   double, d, d, tfuncname ) \
GENTFUNCR( scomplex, float,  c, s, tfuncname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
\
GENTFUNCR( float,    float,  s, s, tfuncname, varname ) \
GENTFUNCR( double,   double, d, d, tfuncname, varname ) \
GENTFUNCR( scomplex, float,  c, s, tfuncname, varname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2 ) \
GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3  ) \
\
GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
arguments) -- #define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with real domain only -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \ \ GENTFUNCRO( float, s, tfuncname ) \ GENTFUNCRO( double, d, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \ \ GENTFUNCRO( float, s, tfuncname, varname ) \ GENTFUNCRO( double, d, tfuncname, varname ) \ // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \ \ GENTFUNC( float, s, tfuncname ) \ 
GENTFUNC( double, d, tfuncname ) \ GENTFUNC( scomplex, c, tfuncname ) \ GENTFUNC( dcomplex, z, tfuncname ) \ GENTFUNC( gint_t, i, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \ \ GENTFUNC( float, s, tfuncname, varname ) \ GENTFUNC( double, d, tfuncname, varname ) \ GENTFUNC( scomplex, c, tfuncname, varname ) \ GENTFUNC( dcomplex, z, tfuncname, varname ) \ GENTFUNC( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCI_BASIC0( tfuncname ) \ \ GENTFUNCI( float, gint_t, s, i, tfuncname ) \ GENTFUNCI( double, gint_t, d, i, tfuncname ) \ GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \ GENTFUNCI( dcomplex, gint_t, z, i, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \ \ GENTFUNCI( float, gint_t, s, i, tfuncname, varname ) \ GENTFUNCI( double, gint_t, d, i, tfuncname, varname ) \ GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \ \ GENTFUNCRI( float, float, gint_t, s, s, i, tfuncname ) \ GENTFUNCRI( double, double, gint_t, d, d, i, tfuncname ) \ GENTFUNCRI( scomplex, float, gint_t, c, s, i, tfuncname ) \ GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_BASIC0( tfuncname ) \ \ GENTFUNC2( float, float, s, s, tfuncname ) \ GENTFUNC2( double, double, d, d, tfuncname ) \ GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \ GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \ \ GENTFUNC2( float, float, s, s, tfuncname, 
varname ) \ GENTFUNC2( double, double, d, d, tfuncname, varname ) \ GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \ \ GENTFUNC2( float, scomplex, s, c, tfuncname ) \ GENTFUNC2( scomplex, float, c, s, tfuncname ) \ \ GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \ \ GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ \ GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) // -- Mixed precision two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \ \ GENTFUNC2( float, double, s, d, tfuncname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ \ GENTFUNC2( double, float, d, s, tfuncname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname ) \ \ GENTFUNC2( scomplex, double, c, d, tfuncname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \ \ GENTFUNC2( float, double, s, d, tfuncname, varname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTFUNC2( double, float, d, s, tfuncname, varname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ \ GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) \ // -- Mixed domain/precision (all) two-operand macro -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTFUNC2_MIXDP0( tfuncname ) \ \ GENTFUNC2( float, double, s, d, tfuncname ) \ GENTFUNC2( float, scomplex, s, c, tfuncname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ \ GENTFUNC2( double, float, d, s, tfuncname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname ) \ GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ \ GENTFUNC2( scomplex, float, c, s, tfuncname ) \ GENTFUNC2( scomplex, double, c, d, tfuncname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \ \ GENTFUNC2( float, double, s, d, tfuncname, varname ) \ GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTFUNC2( double, float, d, s, tfuncname, varname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \ \ GENTFUNC2R( float, float, float, s, s, s, tfuncname ) \ GENTFUNC2R( double, double, double, d, d, d, tfuncname ) \ GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname ) \ GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \ \ GENTFUNC2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTFUNC2R( double, double, double, d, d, 
d, tfuncname, varname ) \ GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \ \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \ \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) // -- Mixed precision two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ \ GENTFUNC2R( scomplex, double, double, c, d, d, 
tfuncname, varname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) // -- Mixed domain/precision (all) two-operand macro with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) \ 
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ // -- Macros for functions with three primary operands ------------------------- // -- Basic three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC0( tfuncname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 ) // -- Mixed domain three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname ) \ \ GENTFUNC3( dcomplex, double, 
double, z, d, d, tfuncname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( 
dcomplex, double, dcomplex, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC3( scomplex, double, scomplex, c, 
d, c, tfuncname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname ) \ GENTFUNC3( float, 
dcomplex, scomplex, s, z, c, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, 
tfuncname, varname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, dcomplex, d, s, 
z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname1, varname2 ) \ \ 
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 ) // -- Basic three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, 
varname1, varname2 ) // -- Mixed domain three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, 
d, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \ GENTFUNC3U12( float, 
double, dcomplex, double, s, d, z, d, tfuncname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, 
dcomplex, scomplex, c, c, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, 
dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, 
tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, 
double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( 
scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, 
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 ) #endif // end bli_gentfunc_macro_defs.h // begin bli_gentprot_macro_defs.h #ifndef BLIS_GENTPROT_MACRO_DEFS_H #define BLIS_GENTPROT_MACRO_DEFS_H // // -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS ----------------------------- // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- #define INSERT_GENTPROT_BLAS( blasname ) \ \ GENTPROT( float, s, blasname ) \ GENTPROT( double, d, blasname ) \ GENTPROT( scomplex, c, blasname ) \ GENTPROT( dcomplex, z, blasname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTPROTRO_BLAS( blasname ) \ \ GENTPROTRO( float, s, blasname ) \ GENTPROTRO( double, d, blasname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTPROTCO_BLAS( blasname ) \ \ GENTPROTCO( scomplex, float, c, s, blasname ) \ GENTPROTCO( dcomplex, double, z, d, blasname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTR_BLAS( blasname ) \ \ GENTPROTDOT( float, s, , blasname ) \ GENTPROTDOT( double, d, , blasname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTC_BLAS( blasname ) \ \ GENTPROTDOT( scomplex, c, c, blasname ) \ GENTPROTDOT( scomplex, c, u, blasname ) \ GENTPROTDOT( dcomplex, z, c, blasname ) \ GENTPROTDOT( dcomplex, z, u, blasname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTPROTDOT_BLAS( blasname ) \ \ INSERT_GENTPROTDOTR_BLAS( blasname ) \ INSERT_GENTPROTDOTC_BLAS( blasname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTPROTR_BLAS( rblasname, 
cblasname ) \ \ GENTPROTR( float, float, s, s, rblasname ) \ GENTPROTR( double, double, d, d, rblasname ) \ GENTPROTR( scomplex, float, c, s, cblasname ) \ GENTPROTR( dcomplex, double, z, d, cblasname ) // -- Alternate two-operand macro (one char for complex, one for real proj) -- #define INSERT_GENTPROTR2_BLAS( blasname ) \ \ GENTPROTR2( float, float, , s, blasname ) \ GENTPROTR2( double, double, , d, blasname ) \ GENTPROTR2( scomplex, float, c, s, blasname ) \ GENTPROTR2( dcomplex, double, z, d, blasname ) // -- Extended two-operand macro (used only for scal) -- #define INSERT_GENTPROTSCAL_BLAS( blasname ) \ \ GENTPROTSCAL( float, float, , s, blasname ) \ GENTPROTSCAL( double, double, , d, blasname ) \ GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \ GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \ GENTPROTSCAL( float, scomplex, s, c, blasname ) \ GENTPROTSCAL( double, dcomplex, d, z, blasname ) // -- Macros for functions with one operand ------------------------------------ // -- Basic one-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0( tfuncname ) \ \ GENTPROT( float, s, tfuncname ) \ GENTPROT( double, d, tfuncname ) \ GENTPROT( scomplex, c, tfuncname ) \ GENTPROT( dcomplex, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2 ) \ GENTPROT( double, d, tfuncname, varname1, varname2 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROT( float, s, tfuncname, 
varname1, varname2, varname3 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand with real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC0( tfuncname ) \ \ GENTPROTR( float, float, s, s, tfuncname ) \ GENTPROTR( double, double, d, d, tfuncname ) \ GENTPROTR( scomplex, float, c, s, tfuncname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname ) \ GENTPROTR( double, double, d, d, tfuncname, varname ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 
) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC0( tfuncname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0_I( funcname ) \ \ GENTPROT( float, s, funcname ) \ GENTPROT( double, d, funcname ) \ GENTPROT( scomplex, c, funcname ) \ GENTPROT( dcomplex, z, funcname ) \ GENTPROT( gint_t, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) \ GENTPROT( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTPROTI_BASIC0( funcname ) \ \ GENTPROTI( float, gint_t, s, i, funcname ) \ GENTPROTI( double, gint_t, d, i, funcname ) \ GENTPROTI( scomplex, gint_t, c, i, funcname ) \ GENTPROTI( dcomplex, gint_t, z, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \ \ GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \ GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \ GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTRI_BASIC( funcname ) \ \ GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \ GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \ GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \ GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_BASIC0( funcname ) \ \ GENTPROT2( float, float, s, s, funcname ) \ GENTPROT2( double, double, d, d, funcname ) \ GENTPROT2( scomplex, scomplex, c, c, funcname ) \ GENTPROT2( dcomplex, dcomplex, z, z, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \ \ GENTPROT2( float, float, s, s, tfuncname, varname ) \ GENTPROT2( double, double, d, d, tfuncname, varname ) \ GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_D0( funcname ) \ \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( scomplex, float, c, s, funcname ) \ \ GENTPROT2( double, dcomplex, d, z, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_D( 
tfuncname, varname ) \ \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) // -- Mixed precision two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_P0( funcname ) \ \ GENTPROT2( float, double, s, d, funcname ) \ GENTPROT2( float, dcomplex, s, z, funcname ) \ \ GENTPROT2( double, float, d, s, funcname ) \ GENTPROT2( double, scomplex, d, c, funcname ) \ \ GENTPROT2( scomplex, double, c, d, funcname ) \ GENTPROT2( scomplex, dcomplex, c, z, funcname ) \ \ GENTPROT2( dcomplex, float, z, s, funcname ) \ GENTPROT2( dcomplex, scomplex, z, c, funcname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) \ // -- Mixed domain/precision (all) two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIXDP0( funcname ) \ \ GENTPROT2( float, double, s, d, funcname ) \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( float, dcomplex, s, z, funcname ) \ \ GENTPROT2( double, float, d, s, funcname ) \ GENTPROT2( double, scomplex, d, c, funcname ) \ GENTPROT2( double, dcomplex, d, z, funcname ) \ \ GENTPROT2( scomplex, float, c, s, funcname ) \ GENTPROT2( scomplex, double, c, d, funcname ) \ GENTPROT2( scomplex, dcomplex, c, z, funcname ) \ \ GENTPROT2( dcomplex, float, z, s, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) \ GENTPROT2( 
dcomplex, scomplex, z, c, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_BASIC0( funcname ) \ \ GENTPROT2R( float, float, float, s, s, s, funcname ) \ GENTPROT2R( double, double, double, d, d, d, funcname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \ \ GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) 
// --

// Expands to one GENTPROT2R invocation per mixed-domain (same precision)
// type pairing listed below. tfuncname and varname are forwarded unchanged
// to every invocation; GENTPROT2R itself is defined by the including file.
#define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \
\
GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname )


// -- Mixed precision two-operand with real projection of first operand --

// -- (no auxiliary arguments) --

// One GENTPROT2R invocation per pairing whose two operands differ in
// precision. In every entry the third type/char is the real type of the
// first operand's precision (float for s/c, double for d/z).
#define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname )

// -- (one auxiliary argument) --

// Same table as INSERT_GENTPROT2R_MIX_P0, with varname forwarded as an
// additional trailing argument.
#define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname )



// -- Macros for functions with three primary operands -------------------------


// -- Basic three-operand macro --

// The four homogeneous combinations: all three operands share one type.
#define INSERT_GENTPROT3_BASIC( funcname ) \
\
GENTPROT3( float, float, float, s, s, s, funcname ) \
GENTPROT3( double, double, double, d, d, d, funcname ) \
GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \
GENTPROT3( dcomplex, dcomplex, dcomplex, z, z, z, funcname )


// -- Mixed domain three-operand macro --

// Combinations in which all operands share a precision but not a domain
// (six single-precision entries, six double-precision entries).
#define INSERT_GENTPROT3_MIX_D( funcname ) \
\
GENTPROT3( float, float, scomplex, s, s, c, funcname ) \
GENTPROT3( float, scomplex, float, s, c, s, funcname ) \
GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \
\
GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \
GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \
GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \
\
GENTPROT3( scomplex, float, float, c, s, s, funcname ) \
GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \
GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \
\
GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \
GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \
GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname )


// -- Mixed precision three-operand macro --

// Combinations in which at least two operands differ in precision.
// Entries are grouped by the first operand, then by the second.
// NOTE: The final entry ends with a line continuation, so the (empty) line
// that follows it is still part of this macro definition; keep that line
// blank.
#define INSERT_GENTPROT3_MIX_P( funcname ) \
\
GENTPROT3( float, float, double, s, s, d, funcname ) \
GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \
\
GENTPROT3( float, double, float, s, d, s, funcname ) \
GENTPROT3( float, double, double, s, d, d, funcname ) \
GENTPROT3( float, double, scomplex, s, d, c, funcname ) \
GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \
\
GENTPROT3( float, scomplex, double, s, c, d, funcname ) \
GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \
\
GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \
GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \
GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \
GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \
\
\
GENTPROT3( double, float, float, d, s, s, funcname ) \
GENTPROT3( double, float, double, d, s, d, funcname ) \
GENTPROT3( double, float, scomplex, d, s, c, funcname ) \
GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \
\
GENTPROT3( double, double, float, d, d, s, funcname ) \
GENTPROT3( double, double, scomplex, d, d, c, funcname ) \
\
GENTPROT3( double, scomplex, float, d, c, s, funcname ) \
GENTPROT3( double, scomplex, double, d, c, d, funcname ) \
GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \
GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \
\
GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \
GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \
\
\
GENTPROT3( scomplex, float, double, c, s, d, funcname ) \
GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \
\
GENTPROT3( scomplex, double, float, c, d, s, funcname ) \
GENTPROT3( scomplex, double, double, c, d, d, funcname ) \
GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \
GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \
\
GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \
GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \
\
GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \
GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \
GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \
GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \
\
\
GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \
GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \
GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \
GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \
\
GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \
GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \
\
GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \
GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \
GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \
GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \
\
GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \
GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname ) \



// -- Basic three-operand with union of operands 1 and 2 --

// GENTPROT3U12 takes a fourth type/char pair in addition to the three
// operand pairs. In every entry of the three tables below that fourth pair
// is the "union" of operands 1 and 2: complex if either of them is complex,
// double precision if either of them is double precision.
#define INSERT_GENTPROT3U12_BASIC( funcname ) \
\
GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \
GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \
GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname )


// -- Mixed domain three-operand with union of operands 1 and 2 --

// Same operand combinations as INSERT_GENTPROT3_MIX_D, each extended with
// the union type of operands 1 and 2.
#define INSERT_GENTPROT3U12_MIX_D( funcname ) \
\
GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \
GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \
GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \
\
GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \
GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \
GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \
\
GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \
GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \
GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \
\
GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \
GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \
GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname )


// -- Mixed precision three-operand with union of operands 1 and 2 --

// Same operand combinations as INSERT_GENTPROT3_MIX_P, each extended with
// the union type of operands 1 and 2.
#define INSERT_GENTPROT3U12_MIX_P( funcname ) \
\
GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \
GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \
\
GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \
GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \
GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \
GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \
\
GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \
GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \
\
GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, z, funcname ) \
GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \
GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \
GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \
\
\
GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \
GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \
GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \
GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \
\
GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \
GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \
\
GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \
GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \
GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \
GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \
\
GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \
GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \
\
\
GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \
GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \
\
GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \
GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \
GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \
GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \
\
GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \
GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \
\
GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \
GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \
GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \
GENTPROT3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \
\
\
GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \
GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \
GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \
GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \
\
GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \
GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \
\
GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \
GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \
GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \
GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \
\
GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \
GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname )


#endif
// end bli_gentprot_macro_defs.h
// begin bli_misc_macro_defs.h


#ifndef BLIS_MISC_MACRO_DEFS_H
#define BLIS_MISC_MACRO_DEFS_H


// -- Miscellaneous macros --

// min, max, abs
// NOTE: These must remain macros since we don't know the types of a and b.
// Because they are macros, each argument may be evaluated more than once;
// do not pass expressions with side effects.
#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) )
#define bli_max( a, b ) ( (a) > (b) ? (a) : (b) )
// The comparison is "<= 0", so a zero argument takes the negating branch.
#define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) )

// fmin, fmax, fabs
// NOTE: These must remain macros since we don't know the types of a and b.
// bli_fmin/bli_fmax simply forward to bli_min/bli_max; bli_fabs differs from
// bli_abs only in comparing against the floating-point literal 0.0.
#define bli_fmin( a, b ) bli_min( a, b )
#define bli_fmax( a, b ) bli_max( a, b )
#define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) )

// fminabs, fmaxabs
// NOTE: These must remain macros since we don't know the types of a and b.
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif // end bli_edge_case_macro_defs.h // begin bli_param_macro_defs.h #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ||
             ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) );
}

// Returns true when uplo is upper and the m x n region is strictly below
// the diagonal, or uplo is lower and the region is strictly above it.
BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n )
{
    return ( bool )
           ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ||
             ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) );
}

// pruning-related

// Each bli_prune_unstored_region_*() helper shrinks (*m, *n) in place and
// reports through its last argument how far the origin moved. All four
// always reset that increment to zero first.

// Drops the rows above the point where the diagonal meets the left edge.
// *offm_inc receives the number of rows dropped. n is not accessed.
BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc )
{
    *offm_inc = 0;

    // If the diagonal intersects the left side of the matrix,
    // ignore the area above that intersection.
    if ( *diagoff < 0 )
    {
        *m        = *m + *diagoff;
        *offm_inc =    - *diagoff;
        *diagoff  = 0;
    }
}

// Drops the columns to the right of the point where the diagonal meets the
// bottom edge. *offn_inc is always left at zero and *diagoff is unchanged.
BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc )
{
    *offn_inc = 0;

    // If the diagonal intersects the bottom side of the matrix,
    // ignore the area to the right of that intersection.
    if ( *n > *diagoff + *m )
    {
        *n = *diagoff + *m;
    }
}

// Drops the columns to the left of the point where the diagonal meets the
// top edge. *offn_inc receives the number of columns dropped. m is not
// accessed.
BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc )
{
    *offn_inc = 0;

    // If the diagonal intersects the top side of the matrix,
    // ignore the area to the left of that intersection.
    if ( *diagoff > 0 )
    {
        *n        = *n - *diagoff;
        *offn_inc =    + *diagoff;
        *diagoff  = 0;
    }
}

// Drops the rows below the point where the diagonal meets the right edge.
// *offm_inc is always left at zero and *diagoff is unchanged.
BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc )
{
    *offm_inc = 0;

    // If the diagonal intersects the right side of the matrix,
    // ignore the area below that intersection.
    if ( *m > -(*diagoff) + *n )
    {
        *m = -(*diagoff) + *n;
    }
}

// thread range-related

// Replaces *diagoff with n - diagoff - m and toggles *uplo. The dimensions
// themselves are only read.
BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n )
{
    *diagoff = *n - *diagoff - *m;
    bli_toggle_uplo( uplo );
}

// Swaps *m and *n, negates *diagoff and toggles *uplo.
BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n )
{
    bli_swap_dims( m, n );
    bli_negate_diag_offset( diagoff );
    bli_toggle_uplo( uplo );
}

// Maps the range [*start, *end) to [n - *end, n - *start), i.e. the same
// range counted from the opposite end of an extent of size n.
BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end )
{
    dim_t start2 = n - *start;
    dim_t end2   = n - *end;
    *start = end2;
    *end   = start2;
}

// mdim_t-related

BLIS_INLINE bool bli_is_m_dim( mdim_t mdim )
{
    return ( bool )
           ( mdim == BLIS_M );
}

BLIS_INLINE bool bli_is_n_dim( mdim_t mdim )
{
    return ( bool )
           ( mdim == BLIS_N );
}

// Returns BLIS_N for BLIS_M and BLIS_M for any other value.
BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim )
{
    return ( mdim_t )
           ( mdim == BLIS_M ? BLIS_N : BLIS_M );
}

BLIS_INLINE void bli_toggle_dim( mdim_t* mdim )
{
    *mdim = bli_dim_toggled( *mdim );
}

// stor3_t-related

// Encodes the storage of C, A and B as a stor3_t id: BLIS_XXX if any operand
// is general-stored, otherwise 4*c_is_col + 2*a_is_col + 1*b_is_col.
BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b )
{
    // If any matrix is general-stored, return the stor3_t id for the
    // general-purpose sup microkernel.
    if ( bli_is_gen_stored( rs_c, cs_c ) ||
         bli_is_gen_stored( rs_a, cs_a ) ||
         bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX;

    // Otherwise, compute and return the stor3_t id as follows.
    const bool c_is_col = bli_is_col_stored( rs_c, cs_c );
    const bool a_is_col = bli_is_col_stored( rs_a, cs_a );
    const bool b_is_col = bli_is_col_stored( rs_b, cs_b );

    return ( stor3_t )( 4 * c_is_col +
                        2 * a_is_col +
                        1 * b_is_col );
}

// Maps a stor3_t id to the id listed for it in the table below. Only the
// table branch (#if 1) is compiled; the bit-twiddling form in the #else
// branch is disabled.
// NOTE(review): id is used as an array index without a range check; the
// table lists eight initializers -- confirm how ids outside 0..7 (e.g.
// BLIS_XXX) are expected to behave.
BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id )
{
#if 1
    stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
    =
    {
      ( stor3_t )7,  // BLIS_RRR = 0  ->  BLIS_CCC = 7
      ( stor3_t )5,  // BLIS_RRC = 1  ->  BLIS_CRC = 5
      ( stor3_t )6,  // BLIS_RCR = 2  ->  BLIS_CCR = 6
      ( stor3_t )4,  // BLIS_RCC = 3  ->  BLIS_CRR = 4
      ( stor3_t )3,  // BLIS_CRR = 4  ->  BLIS_RCC = 3
      ( stor3_t )1,  // BLIS_CRC = 5  ->  BLIS_RRC = 1
      ( stor3_t )2,  // BLIS_CCR = 6  ->  BLIS_RCR = 2
      ( stor3_t )0,  // BLIS_CCC = 7  ->  BLIS_RRR = 0
    };

    return map[id];
#else
    return   ( ( id & 0x4 ) ^ 0x4 )        | // flip c bit
           ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position
           ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 );  // flip a bit and move to b position
#endif
}

// Returns id with bit 0x1 toggled (the compiled #else branch).
// NOTE(review): bli_stor3_from_strides() places b_is_col in bit 0x1 and
// a_is_col in bit 0x2, so this toggles the "b" position despite the name --
// confirm against callers before relying on it.
BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id )
{
#if 0
    stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
    =
    {
      ( stor3_t )1,  // BLIS_RRR = 0  ->  BLIS_RRC = 1
      ( stor3_t )0,  // BLIS_RRC = 1  ->  BLIS_RRR = 0
      ( stor3_t )3,  // BLIS_RCR = 2  ->  BLIS_RCC = 3
      ( stor3_t )2,  // BLIS_RCC = 3  ->  BLIS_RCR = 2
      ( stor3_t )5,  // BLIS_CRR = 4  ->  BLIS_CRC = 5
      ( stor3_t )4,  // BLIS_CRC = 5  ->  BLIS_CRR = 4
      ( stor3_t )7,  // BLIS_CCR = 6  ->  BLIS_CCC = 7
      ( stor3_t )6,  // BLIS_CCC = 7  ->  BLIS_CCR = 6
    };

    return map[id];
#else
    return ( stor3_t )( id ^ 0x1 );
#endif
}

// Returns id with bit 0x2 toggled (the compiled #else branch).
// NOTE(review): per bli_stor3_from_strides() bit 0x2 is the "a" position --
// see the matching note on bli_stor3_transa(); confirm intent.
BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id )
{
#if 0
    stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
    =
    {
      ( stor3_t )2,  // BLIS_RRR = 0  ->  BLIS_RCR = 2
      ( stor3_t )3,  // BLIS_RRC = 1  ->  BLIS_RCC = 3
      ( stor3_t )0,  // BLIS_RCR = 2  ->  BLIS_RRR = 0
      ( stor3_t )1,  // BLIS_RCC = 3  ->  BLIS_RRC = 1
      ( stor3_t )6,  // BLIS_CRR = 4  ->  BLIS_CCR = 6
      ( stor3_t )7,  // BLIS_CRC = 5  ->  BLIS_CCC = 7
      ( stor3_t )4,  // BLIS_CCR = 6  ->  BLIS_CRR = 4
      ( stor3_t )5,  // BLIS_CCC = 7  ->  BLIS_CRC = 5
    };

    return map[id];
#else
    return ( stor3_t )( id ^ 0x2 );
#endif
}

// index-related

// The "_f" predicates treat the final iteration (i == n_iter - 1) as the
// candidate edge case; the "_b" predicates treat the first (i == 0). In both
// families an edge exists only when n_left is nonzero.

BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i == n_iter - 1 && n_left != 0 );
}

BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i != n_iter - 1 || n_left == 0 );
}

// n_iter is unused.
BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i == 0 && n_left != 0 );
}

// n_iter is unused.
BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i != 0 || n_left == 0 );
}

// True when i is the last index before end_iter. tid and nth are unused and
// exist so the signature matches bli_is_last_iter_rr().
BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
    return ( bool )
           ( i == end_iter - 1 );
}

// True when i equals end_iter - 1 minus ( ( end_iter - tid - 1 ) % nth ).
// For end_iter > tid that is the largest index below end_iter that is
// congruent to tid modulo nth.
BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
    return ( bool )
           ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
}

// Dispatches to the _sl variant when BLIS_ENABLE_JRIR_SLAB is defined and to
// the _rr variant otherwise.
BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
#ifdef BLIS_ENABLE_JRIR_SLAB
    return bli_is_last_iter_sl( i, end_iter, tid, nth );
#else // BLIS_ENABLE_JRIR_RR
    return bli_is_last_iter_rr( i, end_iter, tid, nth );
#endif
}

// packbuf_t-related

// Extracts the BLIS_PACK_BUFFER_BITS field of buf_type and shifts it down by
// BLIS_PACK_BUFFER_SHIFT.
BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type )
{
    return ( guint_t )
           ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT );
}

// pack_t-related

// The predicates below test individual bit-fields of a pack_t schema value.

BLIS_INLINE bool bli_is_packed( pack_t schema )
{
    return ( bool )
           ( schema & BLIS_PACK_BIT );
}

BLIS_INLINE bool bli_is_row_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
                                                BLIS_BITVAL_PACKED_ROWS ) );
}

BLIS_INLINE bool bli_is_col_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
                                                BLIS_BITVAL_PACKED_COLUMNS ) );
}

BLIS_INLINE bool bli_is_panel_packed( pack_t schema )
{
    return ( bool )
           ( schema & BLIS_PACK_PANEL_BIT );
}

BLIS_INLINE bool bli_is_1r_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R );
}

BLIS_INLINE bool bli_is_1e_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E );
}

// True for either the 1r or the 1e format.
BLIS_INLINE bool bli_is_1m_packed( pack_t schema )
{
    return ( bool )
           ( bli_is_1r_packed( schema ) ||
             bli_is_1e_packed( schema ) );
}

// True when the format bit-field is zero.
BLIS_INLINE bool bli_is_nat_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 );
}

// True when the format bit-field is nonzero.
BLIS_INLINE bool bli_is_ind_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 );
}

// Extracts the format bit-field and shifts it down by
// BLIS_PACK_FORMAT_SHIFT.
BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema )
{
    return ( guint_t )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT );
}

// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument.
//
// Outputs: *uplo_eff, *n_elem_max, *n_iter, *inca, *lda, *ij0, *n_shift.
// When A is entirely unstored only *uplo_eff (BLIS_ZEROS), *ij0 and *n_shift
// are written; the remaining outputs are left untouched.
BLIS_INLINE void bli_set_dims_incs_uplo_1m
     (
       doff_t  diagoffa, diag_t diaga,
       uplo_t  uploa,    dim_t  m,          dim_t  n,      inc_t  rs_a, inc_t  cs_a,
       uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t*  ij0,      dim_t* n_shift
     )
{
    // This is to prevent the compiler from warning about uninitialized
    // variables.
    *ij0     = 0;
    *n_shift = 0;

    // If matrix A is entirely "unstored", that is, if either:
    // - A is lower-stored and entirely above the diagonal, or
    // - A is upper-stored and entirely below the diagonal
    // then we mark the storage as implicitly zero.
    if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
    {
        *uplo_eff = BLIS_ZEROS;
    }
    else
    {
        doff_t diagoffa_use_ = diagoffa;
        doff_t diagoff_eff_;
        dim_t  n_iter_max_;

        // A unit diagonal is handled by adjusting a local copy of the
        // diagonal offset.
        if ( bli_is_unit_diag( diaga ) )
            bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

        // If matrix A is entirely "stored", that is, if either:
        // - A is upper-stored and entirely above the diagonal, or
        // - A is lower-stored and entirely below the diagonal
        // then we mark the storage as dense.
        if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
            uploa = BLIS_DENSE;

        n_iter_max_  = n;
        *n_elem_max  = m;
        *inca        = rs_a;
        *lda         = cs_a;
        *uplo_eff    = uploa;
        diagoff_eff_ = diagoffa_use_;

        // For a row-tilted operand, exchange the roles of the two
        // dimensions (and strides) and reflect uplo/diagoff to match.
        if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) )
        {
            bli_swap_dims( &n_iter_max_, n_elem_max );
            bli_swap_incs( inca, lda );
            bli_toggle_uplo( uplo_eff );
            bli_negate_diag_offset( &diagoff_eff_ );
        }

        if ( bli_is_dense( *uplo_eff ) )
        {
            *n_iter = n_iter_max_;
        }
        else if ( bli_is_upper( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = 0;
                *n_shift    = -diagoff_eff_;
                *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
                *n_iter     = n_iter_max_;
            }
            else
            {
                *ij0        = diagoff_eff_;
                *n_shift    = 0;
                *n_iter     = n_iter_max_ - diagoff_eff_;
            }
        }
        else // if ( bli_is_lower( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = -diagoff_eff_;
                *n_shift    = 0;
                *n_elem_max = *n_elem_max + diagoff_eff_;
                *n_iter     = bli_min( *n_elem_max, bli_min( m, n ) );
            }
            else
            {
                *ij0        = 0;
                *n_shift    = diagoff_eff_;
                *n_iter     = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
            }
        }
    }
}

// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument (without column-wise stride optimization).
//
// Identical to bli_set_dims_incs_uplo_1m() except that the row-tilted
// dimension/stride swap is never performed.
BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap
     (
       doff_t  diagoffa, diag_t diaga,
       uplo_t  uploa,    dim_t  m,          dim_t  n,      inc_t  rs_a, inc_t  cs_a,
       uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t*  ij0,      dim_t* n_shift
     )
{
    // This is to prevent the compiler from warning about uninitialized
    // variables.
    *ij0     = 0;
    *n_shift = 0;

    // If matrix A is entirely "unstored", that is, if either:
    // - A is lower-stored and entirely above the diagonal, or
    // - A is upper-stored and entirely below the diagonal
    // then we mark the storage as implicitly zero.
    if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
    {
        *uplo_eff = BLIS_ZEROS;
    }
    else
    {
        doff_t diagoffa_use_ = diagoffa;
        doff_t diagoff_eff_;
        dim_t  n_iter_max_;

        if ( bli_is_unit_diag( diaga ) )
            bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

        // If matrix A is entirely "stored", that is, if either:
        // - A is upper-stored and entirely above the diagonal, or
        // - A is lower-stored and entirely below the diagonal
        // then we mark the storage as dense.
        if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
            uploa = BLIS_DENSE;

        n_iter_max_  = n;
        *n_elem_max  = m;
        *inca        = rs_a;
        *lda         = cs_a;
        *uplo_eff    = uploa;
        diagoff_eff_ = diagoffa_use_;

        if ( bli_is_dense( *uplo_eff ) )
        {
            *n_iter = n_iter_max_;
        }
        else if ( bli_is_upper( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = 0;
                *n_shift    = -diagoff_eff_;
                *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
                *n_iter     = n_iter_max_;
            }
            else
            {
                *ij0        = diagoff_eff_;
                *n_shift    = 0;
                *n_iter     = n_iter_max_ - diagoff_eff_;
            }
        }
        else // if ( bli_is_lower( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = -diagoff_eff_;
                *n_shift    = 0;
                *n_elem_max = *n_elem_max + diagoff_eff_;
                *n_iter     = bli_min( *n_elem_max, bli_min( m, n ) );
            }
            else
            {
                *ij0        = 0;
                *n_shift    = diagoff_eff_;
                *n_iter     = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
            }
        }
    }
}

// Set dimensions and increments for TWO matrix arguments.
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj )
{
    obj->info2 = ( objbits_t )
                 ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) |
                   ( dt << BLIS_SCALAR_DT_SHIFT ) );
}

// The setters below clear one bit-field of obj->info and OR in the new
// value unmasked and unshifted.

BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema );
}

BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif );
}

BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif );
}

// NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead,
// packbuf_t is stored/used from the context in order to support various
// induced methods. (Though ideally the packbuf_t field would only be
// present in the control tree).
BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type );
}

BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj )
{
    obj->info = ( objbits_t )
                ( ( obj->info & ~BLIS_STRUC_BITS ) | struc );
}

// Toggles the transposition bit by XORing BLIS_TRANSPOSE into obj->info.
BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj )
{
    bli_obj_apply_trans( BLIS_TRANSPOSE, obj );
}

// Toggles the conjugation bit by XORing BLIS_CONJUGATE into obj->info.
BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj )
{
    bli_obj_apply_conj( BLIS_CONJUGATE, obj );
}

// Flips both the lower and the upper bit unconditionally. Unlike the
// setters above there is no check on the current uplo value.
// NOTE(review): the cast applies only to the first XOR; the second XOR is
// applied to the cast result -- harmless but easy to misread.
BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj )
{
    obj->info = ( objbits_t )
                ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT;
}

// Root matrix query

// Returns the object's root pointer. The bli_obj_root_is_*() predicates
// apply the corresponding bli_obj_is_*() test to that root object.
BLIS_INLINE obj_t* bli_obj_root( obj_t* obj )
{
    return ( obj_t* )( obj->root );
}

BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj )
{
    return ( bool )
           ( bli_obj_is_general( bli_obj_root( obj ) ) );
}

BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj )
{
    return ( bool )
           ( bli_obj_is_hermitian( bli_obj_root( obj ) ) );
}

BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj )
{
    return ( bool )
           ( bli_obj_is_symmetric( bli_obj_root( obj ) ) );
}

BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj )
{
    return ( bool )
           ( bli_obj_is_triangular( bli_obj_root( obj ) ) );
}

BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj )
{
    return ( bool )
           ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ||
             bli_obj_is_symmetric( bli_obj_root( obj ) ) );
}

BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj )
{
    return ( bool )
           ( bli_obj_is_upper( bli_obj_root( obj ) ) );
}

BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj )
{
    return ( bool )
           ( bli_obj_is_lower( bli_obj_root( obj ) ) );
}

// Root matrix modification

// Makes the object its own root.
BLIS_INLINE void bli_obj_set_as_root( obj_t* obj )
{
    obj->root = obj;
}

// Diagonal offset query

BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj )
{
    return ( doff_t )
           ( obj->diag_off );
}

// Returns the negated diagonal offset when the transposition bit is set and
// the stored offset otherwise.
BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj )
{
    return ( doff_t )
           ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj )
                                      :  bli_obj_diag_offset( obj ) );
}

// Diagonal offset modification

BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj )
{
    obj->diag_off = ( doff_t )offset;
}

BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj )
{
    obj->diag_off = -(obj->diag_off);
}

// Adds offset to the stored diagonal offset.
BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj )
{
    obj->diag_off += ( doff_t )offset;
}

// Dimension query

// Length is dim[ BLIS_M ]; width is dim[ BLIS_N ].

BLIS_INLINE dim_t bli_obj_length( obj_t* obj )
{
    return ( obj->dim[ BLIS_M ] );
}

BLIS_INLINE dim_t bli_obj_width( obj_t* obj )
{
    return ( obj->dim[ BLIS_N ] );
}

// mdim is used directly as the array index; no range check is performed.
BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj )
{
    return ( obj->dim[ mdim ] );
}

BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj )
{
    return bli_min( bli_obj_length( obj ),
                    bli_obj_width( obj ) );
}

BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj )
{
    return bli_max( bli_obj_length( obj ),
                    bli_obj_width( obj ) );
}

// The *_after_trans() queries return the other dimension when the object's
// transposition bit is set.

BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj )
{
    return ( bli_obj_has_trans( obj ) ? bli_obj_width( obj )
                                      : bli_obj_length( obj ) );
}

BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj )
{
    return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj )
                                      : bli_obj_width( obj ) );
}

BLIS_INLINE bool bli_obj_is_1x1( obj_t* x )
{
    return ( bool )
           ( bli_obj_length( x ) == 1 &&
             bli_obj_width( x ) == 1 );
}

// Stride/increment query

BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj )
{
    return ( obj->rs );
}

BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj )
{
    return ( obj->cs );
}

BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj )
{
    return ( obj->is );
}

// The *_mag() variants return the absolute value of the stride.

BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj )
{
    return ( inc_t )
           ( bli_abs( obj->rs ) );
}

BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj )
{
    return ( inc_t )
           ( bli_abs( obj->cs ) );
}

BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj )
{
    return ( inc_t )
           ( bli_abs( obj->is ) );
}

// Note: The purpose of these functions is to obtain the length and width
// of the smallest submatrices of an object that could still encompass
// the stored data above (if obj is upper) or below (if obj is lower)
// the diagonal.
// Any uplo value other than upper takes the "lower" formula in
// bli_obj_length_stored(); any value other than lower takes the "upper"
// formula in bli_obj_width_stored().
BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj )
{
    return ( dim_t )
           ( bli_obj_is_upper( obj )
               ? bli_min( bli_obj_length( obj ),
                          bli_obj_width( obj )  - bli_obj_diag_offset( obj ) )
               : bli_min( bli_obj_length( obj ),
                          bli_obj_length( obj ) + bli_obj_diag_offset( obj ) )
           );
}

BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj )
{
    return ( dim_t )
           ( bli_obj_is_lower( obj )
               ? bli_min( bli_obj_width( obj ),
                          bli_obj_length( obj ) + bli_obj_diag_offset( obj ) )
               : bli_min( bli_obj_width( obj ),
                          bli_obj_width( obj )  - bli_obj_diag_offset( obj ) )
           );
}

BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj )
{
    return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj )
                                      : bli_obj_length_stored( obj ) );
}

BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj )
{
    return ( bli_obj_has_trans( obj ) ? bli_obj_length_stored( obj )
                                      : bli_obj_width_stored( obj ) );
}

// Returns the width when the length is 1 and the length otherwise. The
// object is not checked for actually being a vector.
BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x )
{
    return ( bli_obj_length( x ) == 1 ? bli_obj_width( x )
                                      : bli_obj_length( x ) );
}

// Returns 1 for a 1x1 object, the column stride when the length is 1, and
// the row stride otherwise.
BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x )
{
    return ( bli_obj_is_1x1( x ) ? 1 :
             ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x )
                                        : bli_obj_row_stride( x ) )
           );
}

BLIS_INLINE bool bli_obj_is_vector( obj_t* x )
{
    return ( bool )
           ( bli_obj_length( x ) == 1 ||
             bli_obj_width( x )  == 1 );
}

BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x )
{
    return ( bool )
           ( bli_obj_length( x ) == 1 );
}

BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x )
{
    return ( bool )
           ( bli_obj_width( x ) == 1 );
}

BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x )
{
    return ( bool )
           ( bli_obj_length( x ) == 0 ||
             bli_obj_width( x )  == 0 );
}

// Dimension modification

BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj )
{
    obj->dim[ BLIS_M ] = m;
}

BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj )
{
    obj->dim[ BLIS_N ] = n;
}

// mdim is used directly as the array index; no range check is performed.
BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj )
{
    obj->dim[ mdim ] = dim_val;
}

BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj )
{
    bli_obj_set_length( m, obj );
    bli_obj_set_width( n, obj );
}

// Stores (m, n) as (length, width) when trans does not transpose, and as
// (width, length) otherwise.
BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj )
{
    if ( bli_does_notrans( trans ) )
    {
        bli_obj_set_length( m, obj );
        bli_obj_set_width( n, obj );
    }
    else // if ( bli_does_trans( trans ) )
    {
        bli_obj_set_length( n, obj );
        bli_obj_set_width( m, obj );
    }
}

// Stride/increment predicates

//
// NOTE: The following two macros differ from their non-obj counterparts
// in that they do not identify m x 1 and 1 x n objects as row-stored and
// column-stored, respectively, which is needed when considering packed
// objects. But this is okay, since none of the invocations of these
// "obj" macros are used on packed matrices.
//

// Row-stored means the column stride has magnitude 1.
BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj )
{
    return ( bool )
           ( bli_obj_col_stride_mag( obj ) == 1 );
}

// Column-stored means the row stride has magnitude 1.
BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj )
{
    return ( bool )
           ( bli_obj_row_stride_mag( obj ) == 1 );
}

// General-stored means neither stride has magnitude 1.
BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj )
{
    return ( bool )
           ( bli_obj_row_stride_mag( obj ) != 1 &&
             bli_obj_col_stride_mag( obj ) != 1 );
}

// Row-tilted means the column stride magnitude is strictly smaller than the
// row stride magnitude.
BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj )
{
    return ( bool )
           ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) );
}

// Column-tilted means the row stride magnitude is strictly smaller than the
// column stride magnitude.
BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj )
{
    return ( bool )
           ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) );
}

// Stride/increment modification

BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj )
{
    obj->rs = rs;
}

BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj )
{
    obj->cs = cs;
}

BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj )
{
    bli_obj_set_row_stride( rs, obj );
    bli_obj_set_col_stride( cs, obj );
}

BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj )
{
    obj->is = is;
}

// Offset query

// Row offset is off[ BLIS_M ]; column offset is off[ BLIS_N ].

BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj )
{
    return ( obj->off[ BLIS_M ] );
}

BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj )
{
    return ( obj->off[ BLIS_N ] );
}

// mdim is used directly as the array index; no range check is performed.
BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj )
{
    return ( obj->off[ mdim ] );
}

// Offset modification

BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj )
{
    obj->off[ mdim ] = offset;
}

BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj )
{
    bli_obj_set_off( BLIS_M, offm, obj );
    bli_obj_set_off( BLIS_N, offn, obj );
}

// Adds offset to the stored offset for dimension mdim.
BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj )
{
    obj->off[ mdim ] += offset;
}

BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj )
{
    bli_obj_inc_off( BLIS_M, offm, obj );
    bli_obj_inc_off( BLIS_N, offn, obj );
}

// Diagonal offset predicates

// True when length <= -diag_off.
BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj )
{
    return ( bool )
           ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) );
}

// True when width <= diag_off.
BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj )
{
    return ( bool )
           ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) );
}

BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj )
{
    return ( bool )
           ( bli_obj_is_strictly_above_diag( obj ) ||
             bli_obj_is_strictly_below_diag( obj ) );
}

// Logical complement of bli_obj_is_outside_diag().
BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj )
{
    return ( bool )
           ( !bli_obj_is_strictly_above_diag( obj ) &&
             !bli_obj_is_strictly_below_diag( obj ) );
}

// True when the ROOT object is lower and this object is strictly above the
// diagonal, or the root is upper and this object is strictly below it. Note
// that uplo is read from the root while the position test uses obj itself.
BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj )
{
    return ( bool )
           ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) ||
             ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) );
}

// Buffer address query

BLIS_INLINE void* bli_obj_buffer( obj_t* obj )
{
    return ( void* )
           ( obj->buffer );
}

// Buffer address modification

BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj )
{
    obj->buffer = p;
}

// Bufferless scalar field query

// Returns the address of the scalar field embedded in the object itself.
BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj )
{
    return ( void* )
           ( &( obj->scalar ) );
}

// Bufferless scalar field modification

// Copies the embedded scalar field from a to b (source first, destination
// second).
BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b )
{
    b->scalar = a->scalar;
}

// Element size query

BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj )
{
    return ( siz_t )
           ( obj->elem_size );
}

// Element size modification

BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj )
{
    obj->elem_size = size;
}

// Packed matrix info query

BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj )
{
    return ( obj->m_padded );
}

BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj )
{
    return ( obj->n_padded );
}

// Packed matrix info modification

BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj )
{
    obj->m_padded = m;
}

BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj )
{
    obj->n_padded = n;
}

BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj )
{
    bli_obj_set_padded_length( m, obj );
    bli_obj_set_padded_width( n, obj );
}

// Packed panel info query

BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj )
{
    return ( obj->m_panel );
}

BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj )
{
    return ( obj->n_panel );
}

BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj )
{
    return ( obj->pd );
}

BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj )
{
    return ( obj->ps );
}

// Packed panel info modification

BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj )
{
    obj->m_panel = m;
}

BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj )
{
    obj->n_panel = n;
}

BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj )
{
    bli_obj_set_panel_length( m, obj );
    bli_obj_set_panel_width( n, obj );
}

BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj )
{
    obj->pd = pd;
}

BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj )
{
    obj->ps = ps;
}

// stor3_t-related

// Object-level wrapper around bli_stor3_from_strides(). C's strides are
// passed through as stored; for a and b the row and column strides are
// exchanged when the object's transposition bit is set.
BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b )
{
    const inc_t rs_c = bli_obj_row_stride( c );
    const inc_t cs_c = bli_obj_col_stride( c );

    inc_t rs_a, cs_a;
    inc_t rs_b, cs_b;

    if ( bli_obj_has_notrans( a ) )
    {
        rs_a = bli_obj_row_stride( a );
        cs_a = bli_obj_col_stride( a );
    }
    else
    {
        rs_a = bli_obj_col_stride( a );
        cs_a = bli_obj_row_stride( a );
    }

    if ( bli_obj_has_notrans( b ) )
    {
        rs_b = bli_obj_row_stride( b );
        cs_b = bli_obj_col_stride( b );
    }
    else
    {
        rs_b = bli_obj_col_stride( b );
        cs_b = bli_obj_row_stride( b );
    }

    return bli_stor3_from_strides( rs_c, cs_c,
                                   rs_a, cs_a,
                                   rs_b, cs_b );
}


// -- User-provided information macros --

// Function pointer query

// Plain field reads of the pack/kernel function pointers and their opaque
// parameter pointers.

BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj )
{
    return obj->pack_fn;
}

BLIS_INLINE void* bli_obj_pack_params( obj_t* obj )
{
    return obj->pack_params;
}

BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj )
{
    return obj->ker_fn;
}

BLIS_INLINE void* bli_obj_ker_params( obj_t* obj )
{
    return obj->ker_params;
}

// Function pointer modification
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) { const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); bli_obj_set_pack_schema( schema_b, a ); bli_obj_set_pack_schema( schema_a, b ); } // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. BLIS_INLINE void bli_obj_induce_trans( obj_t* obj ) { // Induce transposition among basic fields. dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); if ( bli_obj_is_upper_or_lower( obj ) ) bli_obj_toggle_uplo( obj ); // Induce transposition among packed fields. dim_t m_padded = bli_obj_padded_length( obj ); dim_t n_padded = bli_obj_padded_width( obj ); dim_t m_panel = bli_obj_panel_length( obj ); dim_t n_panel = bli_obj_panel_width( obj ); bli_obj_set_padded_dims( n_padded, m_padded, obj ); bli_obj_set_panel_dims( n_panel, m_panel, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj ) { // NOTE: This function is only used in situations where the matrices // are guaranteed to not have structure or be packed. // Induce transposition among basic fields. 
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif // end bli_obj_macro_defs.h // begin bli_complex_macro_defs.h #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define 
bli_zimag( x ) ( cimag(x) ) #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_complex_macro_defs.h // begin bli_scalar_macro_defs.h #ifndef BLIS_SCALAR_MACRO_DEFS_H #define BLIS_SCALAR_MACRO_DEFS_H // -- Assignment/Accessor macros -- // NOTE: This macro is defined first since some of the other scalar macros // use it to abstract away the method used to assign complex values (ie: // whether fields of a struct are set directly or whether native C99 // assignment is used). // begin bli_sets.h #ifndef BLIS_SETS_H #define BLIS_SETS_H // sets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sssets( xr, xi, y ) { (y) = (xr); } #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } #define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } #define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, 
xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif // end bli_sets.h // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. // begin bli_setrs.h #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Each macro overwrites the real part of y with xr and leaves the imaginary
// part of y (if any) as it was.

// -- y is real: plain assignment --
#define bli_sssetrs( xr, y )  { (y) = (xr); }
#define bli_dssetrs( xr, y )  { (y) = (xr); }

#define bli_sdsetrs( xr, y )  { (y) = (xr); }
#define bli_ddsetrs( xr, y )  { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- y is a complex struct: assign the real field only --
#define bli_scsetrs( xr, y )  { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y )  { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y )  { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y )  { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- y is a native C99 complex: rebuild y from xr and y's current imaginary
// part --
#define bli_scsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// -- single-character shorthands --
#define bli_ssetrs( xr, y )  bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y )  bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y )  bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y )  bli_dzsetrs( xr, y )

#endif
// end bli_setrs.h
// begin bli_setis.h

#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Each macro overwrites the imaginary part of y with xi and leaves the real
// part of y as it was.

// -- y is real: there is no imaginary part, so these expand to an empty
// statement --
#define bli_sssetis( xi, y )  { ; }
#define bli_dssetis( xi, y )  { ; }

#define bli_sdsetis( xi, y )  { ; }
#define bli_ddsetis( xi, y )  { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- y is a complex struct: assign the imaginary field only --
#define bli_scsetis( xi, y )  { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y )  { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y )  { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y )  { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- y is a native C99 complex: rebuild y from y's current real part and xi --
#define bli_scsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// -- single-character shorthands --
#define bli_ssetis( xi, y )  bli_sssetis( xi, y )
#define bli_dsetis( xi, y )  bli_ddsetis( xi, y )
#define bli_csetis( xi, y )  bli_scsetis( xi, y )
#define bli_zsetis( xi, y )  bli_dzsetis( xi, y )

#endif
// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h

#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_ssgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; } #define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; } #define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; } #define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; } #define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; } #define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi ) #define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi ) #define bli_cgets( x, yr, yi ) 
bli_csgets( x, yr, yi ) #define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi ) #define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi ) #endif // end bli_gets.h // -- Scalar constant initialization macros -- // begin bli_constants.h #ifndef BLIS_CONSTANTS_H #define BLIS_CONSTANTS_H // return pointers to constants // 1 #define bli_s1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ONE ) ) #define bli_d1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ONE ) ) #define bli_c1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) ) #define bli_z1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) ) #define bli_i1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ONE ) ) // 0 #define bli_s0 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ZERO ) ) #define bli_d0 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ZERO ) ) #define bli_c0 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) ) #define bli_z0 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) ) #define bli_i0 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ZERO ) ) // -1 #define bli_sm1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_MINUS_ONE ) ) #define bli_dm1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_MINUS_ONE ) ) #define bli_cm1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_zm1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_im1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_MINUS_ONE ) ) #endif // end bli_constants.h // -- Separated scalar macros (separated real/imaginary values) -- // begin bli_absq2ris.h #ifndef BLIS_ABSQ2RIS_H #define BLIS_ABSQ2RIS_H // absq2ris #define bli_sabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define bli_dabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define 
bli_cabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0F; \ } #define bli_zabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0; \ } #endif // end bli_absq2ris.h // begin bli_abval2ris.h #ifndef BLIS_ABVAL2RIS_H #define BLIS_ABVAL2RIS_H // abval2ris #define bli_sabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabsf(xr); \ } #define bli_dabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabs(xr); \ } #define bli_cabval2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0F; \ } #define bli_zabval2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0; \ } #endif // end bli_abval2ris.h // begin bli_addris.h #ifndef BLIS_ADDRIS_H #define BLIS_ADDRIS_H // addris #define bli_saddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_daddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_caddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #define bli_zaddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #endif // end bli_addris.h // begin bli_addjris.h #ifndef BLIS_ADDJRIS_H #define BLIS_ADDJRIS_H // addjris #define bli_saddjris( ar, ai, xr, xi ) bli_saddris( (ar), -(ai), (xr), (xi) ) #define bli_daddjris( ar, ai, xr, xi ) bli_daddris( (ar), -(ai), (xr), (xi) ) #define bli_caddjris( ar, ai, xr, xi ) bli_caddris( (ar), -(ai), (xr), (xi) ) #define bli_zaddjris( ar, ai, xr, xi ) bli_zaddris( (ar), -(ai), (xr), (xi) ) #endif // end bli_addjris.h // begin bli_add3ris.h #ifndef BLIS_ADD3RIS_H #define BLIS_ADD3RIS_H // add3ris #define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_dadd3ris( ar, 
ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #endif // end bli_add3ris.h // begin bli_axpbyris.h #ifndef BLIS_AXPBYRIS_H #define BLIS_AXPBYRIS_H // axpbyris #define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyris bli_rxxpbyris #define bli_dsssxpbyris bli_rxxpbyris #define bli_csssxpbyris bli_rxxpbyris #define bli_zsssxpbyris bli_rxxpbyris #define bli_sdssxpbyris bli_rxxpbyris #define bli_ddssxpbyris bli_rxxpbyris #define bli_cdssxpbyris bli_rxxpbyris #define bli_zdssxpbyris bli_rxxpbyris #define bli_scssxpbyris bli_rxxpbyris #define bli_dcssxpbyris bli_rxxpbyris #define bli_ccssxpbyris bli_rxxpbyris #define bli_zcssxpbyris bli_rxxpbyris #define bli_szssxpbyris bli_rxxpbyris #define bli_dzssxpbyris bli_rxxpbyris #define bli_czssxpbyris bli_rxxpbyris #define bli_zzssxpbyris bli_rxxpbyris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyris. 
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
// Short same-type aliases for axpbyjris: each expands to the four-character
// name whose letters are all the alias's own type letter.
#define bli_saxpbyjris bli_ssssaxpbyjris
#define bli_daxpbyjris bli_ddddaxpbyjris
#define bli_caxpbyjris bli_ccccaxpbyjris
#define bli_zaxpbyjris bli_zzzzaxpbyjris
#endif
// end bli_axpbyjris.h
// begin bli_axpyris.h
#ifndef BLIS_AXPYRIS_H
#define BLIS_AXPYRIS_H
// axpyris
//
// y += a * x, with every operand passed as separate real (?r) and imaginary
// (?i) parts. The two-letter prefix selects which parts are used:
//   rx - uses ar and xr only; updates yr only.
//   cx - full complex product; updates yr and yi.
//   ro - real part of the complex product only; updates yr only.
//   cr - uses ar only from a (ai ignored) with xr and xi; updates yr and yi.
//   rc - uses ar and ai with xr only (xi ignored); updates yr and yi.
#define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}
#define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
    (yi) += (ai) * (xr) + (ar) * (xi); \
}
#define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
}
#define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * (xi); \
}
#define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------
#define bli_sssaxpyris bli_rxaxpyris
#define bli_dssaxpyris bli_rxaxpyris
#define bli_cssaxpyris bli_rxaxpyris
#define bli_zssaxpyris bli_rxaxpyris
#define bli_sdsaxpyris bli_rxaxpyris
#define bli_ddsaxpyris bli_rxaxpyris
#define bli_cdsaxpyris bli_rxaxpyris
#define bli_zdsaxpyris bli_rxaxpyris
#define bli_scsaxpyris bli_rxaxpyris
#define bli_dcsaxpyris bli_rxaxpyris
#define bli_ccsaxpyris bli_roaxpyris
#define bli_zcsaxpyris bli_roaxpyris
#define bli_szsaxpyris bli_rxaxpyris
#define bli_dzsaxpyris bli_rxaxpyris
#define bli_czsaxpyris bli_roaxpyris
#define bli_zzsaxpyris bli_roaxpyris
// -- (axy) = (??d) ------------------------------------------------------------
#define bli_ssdaxpyris bli_rxaxpyris
#define bli_dsdaxpyris bli_rxaxpyris
#define bli_csdaxpyris bli_rxaxpyris
#define bli_zsdaxpyris bli_rxaxpyris
#define bli_sddaxpyris bli_rxaxpyris
#define bli_dddaxpyris bli_rxaxpyris
#define bli_cddaxpyris bli_rxaxpyris
#define bli_zddaxpyris bli_rxaxpyris
#define bli_scdaxpyris bli_rxaxpyris
#define bli_dcdaxpyris bli_rxaxpyris
#define bli_ccdaxpyris bli_roaxpyris
#define bli_zcdaxpyris bli_roaxpyris
#define bli_szdaxpyris bli_rxaxpyris
#define bli_dzdaxpyris bli_rxaxpyris
#define bli_czdaxpyris bli_roaxpyris
#define bli_zzdaxpyris bli_roaxpyris
// -- (axy) = (??c) ------------------------------------------------------------
#define bli_sscaxpyris bli_rxaxpyris
#define bli_dscaxpyris bli_rxaxpyris
#define bli_cscaxpyris bli_rcaxpyris
#define bli_zscaxpyris bli_rcaxpyris
#define bli_sdcaxpyris bli_rxaxpyris
#define bli_ddcaxpyris bli_rxaxpyris
#define bli_cdcaxpyris bli_rcaxpyris
#define bli_zdcaxpyris bli_rcaxpyris
#define bli_sccaxpyris bli_craxpyris
#define bli_dccaxpyris bli_craxpyris
#define bli_cccaxpyris bli_cxaxpyris
#define bli_zccaxpyris bli_cxaxpyris
#define bli_szcaxpyris bli_craxpyris
#define bli_dzcaxpyris bli_craxpyris
#define bli_czcaxpyris bli_cxaxpyris
#define bli_zzcaxpyris bli_cxaxpyris
// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszaxpyris bli_rxaxpyris
#define bli_dszaxpyris bli_rxaxpyris
#define bli_cszaxpyris bli_rcaxpyris
#define bli_zszaxpyris bli_rcaxpyris
#define bli_sdzaxpyris bli_rxaxpyris
#define bli_ddzaxpyris bli_rxaxpyris
#define bli_cdzaxpyris bli_rcaxpyris
#define bli_zdzaxpyris bli_rcaxpyris
#define bli_sczaxpyris bli_craxpyris
#define bli_dczaxpyris bli_craxpyris
#define bli_cczaxpyris bli_cxaxpyris
#define bli_zczaxpyris bli_cxaxpyris
#define bli_szzaxpyris bli_craxpyris
#define bli_dzzaxpyris bli_craxpyris
#define bli_czzaxpyris bli_cxaxpyris
#define bli_zzzaxpyris bli_cxaxpyris
// Short same-type aliases.
#define bli_saxpyris bli_sssaxpyris
#define bli_daxpyris bli_dddaxpyris
#define bli_caxpyris bli_cccaxpyris
#define bli_zaxpyris bli_zzzaxpyris
#endif
// end bli_axpyris.h
// begin bli_axpyjris.h
#ifndef BLIS_AXPYJRIS_H
#define BLIS_AXPYJRIS_H
// axpyjris
//
// y += a * conj(x), on separate real/imaginary parts. Prefixes have the same
// meaning as for axpyris; the kernels differ only in the sign applied to xi.
#define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}
#define bli_cxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
    (yi) += (ai) * (xr) - (ar) * (xi); \
}
#define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
}
#define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * -(xi); \
}
// xi is not read here, so this kernel has the same body as bli_rcaxpyris.
#define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------
#define bli_sssaxpyjris bli_rxaxpyjris
#define bli_dssaxpyjris bli_rxaxpyjris
#define bli_cssaxpyjris bli_rxaxpyjris
#define bli_zssaxpyjris bli_rxaxpyjris
#define bli_sdsaxpyjris bli_rxaxpyjris
#define bli_ddsaxpyjris bli_rxaxpyjris
#define bli_cdsaxpyjris bli_rxaxpyjris
#define bli_zdsaxpyjris bli_rxaxpyjris
#define bli_scsaxpyjris bli_rxaxpyjris
#define bli_dcsaxpyjris bli_rxaxpyjris
#define bli_ccsaxpyjris bli_roaxpyjris
#define bli_zcsaxpyjris bli_roaxpyjris
#define bli_szsaxpyjris bli_rxaxpyjris
#define bli_dzsaxpyjris bli_rxaxpyjris
#define bli_czsaxpyjris bli_roaxpyjris
#define bli_zzsaxpyjris bli_roaxpyjris
// -- (axy) = (??d) ------------------------------------------------------------
#define bli_ssdaxpyjris bli_rxaxpyjris
#define bli_dsdaxpyjris bli_rxaxpyjris
#define bli_csdaxpyjris bli_rxaxpyjris
#define bli_zsdaxpyjris bli_rxaxpyjris
#define bli_sddaxpyjris bli_rxaxpyjris
#define bli_dddaxpyjris bli_rxaxpyjris
#define bli_cddaxpyjris bli_rxaxpyjris
#define bli_zddaxpyjris bli_rxaxpyjris
#define bli_scdaxpyjris bli_rxaxpyjris
#define bli_dcdaxpyjris bli_rxaxpyjris
#define bli_ccdaxpyjris bli_roaxpyjris
#define bli_zcdaxpyjris bli_roaxpyjris
#define bli_szdaxpyjris bli_rxaxpyjris
#define bli_dzdaxpyjris bli_rxaxpyjris
#define bli_czdaxpyjris bli_roaxpyjris
#define bli_zzdaxpyjris bli_roaxpyjris
// -- (axy) = (??c) ------------------------------------------------------------
#define bli_sscaxpyjris bli_rxaxpyjris
#define bli_dscaxpyjris bli_rxaxpyjris
#define bli_cscaxpyjris bli_rcaxpyjris
#define bli_zscaxpyjris bli_rcaxpyjris
#define bli_sdcaxpyjris bli_rxaxpyjris
#define bli_ddcaxpyjris bli_rxaxpyjris
#define bli_cdcaxpyjris bli_rcaxpyjris
#define bli_zdcaxpyjris bli_rcaxpyjris
#define bli_sccaxpyjris bli_craxpyjris
#define bli_dccaxpyjris bli_craxpyjris
#define bli_cccaxpyjris bli_cxaxpyjris
#define bli_zccaxpyjris bli_cxaxpyjris
#define bli_szcaxpyjris bli_craxpyjris
#define bli_dzcaxpyjris bli_craxpyjris
#define bli_czcaxpyjris bli_cxaxpyjris
#define bli_zzcaxpyjris bli_cxaxpyjris
// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszaxpyjris bli_rxaxpyjris
#define bli_dszaxpyjris bli_rxaxpyjris
#define bli_cszaxpyjris bli_rcaxpyjris
#define bli_zszaxpyjris bli_rcaxpyjris
#define bli_sdzaxpyjris bli_rxaxpyjris
#define bli_ddzaxpyjris bli_rxaxpyjris
#define bli_cdzaxpyjris bli_rcaxpyjris
#define bli_zdzaxpyjris bli_rcaxpyjris
#define bli_sczaxpyjris bli_craxpyjris
#define bli_dczaxpyjris bli_craxpyjris
#define bli_cczaxpyjris bli_cxaxpyjris
#define bli_zczaxpyjris bli_cxaxpyjris
#define bli_szzaxpyjris bli_craxpyjris
#define bli_dzzaxpyjris bli_craxpyjris
#define bli_czzaxpyjris bli_cxaxpyjris
#define bli_zzzaxpyjris bli_cxaxpyjris
// Short same-type aliases.
#define bli_saxpyjris bli_sssaxpyjris
#define bli_daxpyjris bli_dddaxpyjris
#define bli_caxpyjris bli_cccaxpyjris
#define bli_zaxpyjris bli_zzzaxpyjris
#endif
// end bli_axpyjris.h
// begin bli_axmyris.h
#ifndef BLIS_AXMYRIS_H
#define BLIS_AXMYRIS_H
// axmyris
//
// y -= a * x, on separate real/imaginary parts. The s/d forms touch only yr;
// the c/z forms do the full complex product; the sc/dz forms read only ar
// from a (ai is ignored) and update both yr and yi.
#define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}
#define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}
#define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}
#define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}
#define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}
#define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}
#endif
// end bli_axmyris.h
// begin bli_conjris.h
#ifndef BLIS_CONJRIS_H
#define BLIS_CONJRIS_H
// conjris
//
// In-place conjugation: the s/d forms are empty statements, the c/z forms
// negate xi.
#define bli_sconjris( xr, xi ) \
{ \
    ; \
}
#define bli_dconjris( xr, xi ) \
{ \
    ; \
}
#define bli_cconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}
#define bli_zconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}
#endif
// end bli_conjris.h
// begin bli_copyris.h
#ifndef BLIS_COPYRIS_H
#define BLIS_COPYRIS_H
// copyris
//
// b := a, on separate real/imaginary parts. The s/d forms assign only br;
// the c/z forms assign br and bi.
#define bli_scopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}
#define bli_dcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}
#define bli_ccopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}
#define bli_zcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}
// Two-letter forms dispatch on the second letter. When the first letter is
// s or d, the caller's ai argument is dropped and a literal zero is passed
// instead. Presumably first letter = type of a, second = type of b; confirm
// against callers.
#define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi )
#define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi )
#define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )
#define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )
#define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi )
#define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi )
#define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )
#define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )
#define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi )
#define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi )
#define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )
#define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )
#define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi )
#define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi )
#define bli_czcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )
#define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )
#endif
// end bli_copyris.h
// begin bli_copyjris.h
#ifndef BLIS_COPYJRIS_H
#define BLIS_COPYJRIS_H
// copyjris
//
// b := conj(a): forwards to the matching copyris macro with ai negated.
#define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) )
#define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) )
#define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) )
#define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) )
// Two-letter forms follow the same dispatch and zero-substitution pattern as
// the two-letter copyris macros.
#define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi )
#define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi )
#define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )
#define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )
#define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi )
#define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi )
#define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )
#define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )
#define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi )
#define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi )
#define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )
#define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )
#define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi )
#define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi )
#define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )
#define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )
#endif
// end bli_copyjris.h
// begin bli_copycjris.h
#ifndef BLIS_COPYCJRIS_H
#define BLIS_COPYCJRIS_H
// copycjris
//
// y := x, optionally conjugated. The s/d/i forms ignore the conj argument
// and forward to the plain copy; the c/z forms negate xi when
// bli_is_conj( conj ) is true. bli_is_conj and bli_icopyris are not defined
// in this part of the file.
#define bli_scopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_scopyris( (xr), (xi), (yr), (yi) ); \
}
#define bli_dcopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_dcopyris( (xr), (xi), (yr), (yi) ); \
}
#define bli_ccopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                 : (xi) ); \
}
#define bli_zcopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                 : (xi) ); \
}
#define bli_icopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_icopyris( (xr), (xi), (yr), (yi) ); \
}
#endif
// end bli_copycjris.h
// begin bli_eqris.h
#ifndef BLIS_EQRIS_H
#define BLIS_EQRIS_H
// eqris (passed by value)
//
// Expression macros (no braces) that compare a to b. The s/d/i forms compare
// real parts only; the c/z forms also compare imaginary parts.
#define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) )
// eq1ris
// Compare against the constant 1 + 0i.
#define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F )
#define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 )
#define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F )
#define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 )
#define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 )
// eq0ris
// Compare against the constant 0 + 0i.
#define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F )
#define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 )
#define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F )
#define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 )
#define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 )
// eqm1ris
// Compare against the constant -1 + 0i.
#define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F )
#define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 )
#define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F )
#define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 )
#define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 )
#endif
// end bli_eqris.h
// begin bli_invertris.h
#ifndef BLIS_INVERTRIS_H
#define BLIS_INVERTRIS_H
// invertris
//
// In-place reciprocal, x := 1 / x. The single-precision real form touches
// only xr.
#define bli_sinvertris( xr, xi ) \
{ \
    (xr) = 1.0F / (xr); \
}
#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2ris bli_rxscal2ris #define bli_dssscal2ris bli_rxscal2ris #define bli_cssscal2ris bli_rxscal2ris #define bli_zssscal2ris bli_rxscal2ris #define bli_sdsscal2ris bli_rxscal2ris #define bli_ddsscal2ris bli_rxscal2ris #define bli_cdsscal2ris bli_rxscal2ris #define bli_zdsscal2ris bli_rxscal2ris #define bli_scsscal2ris bli_rxscal2ris #define bli_dcsscal2ris bli_rxscal2ris #define bli_ccsscal2ris bli_roscal2ris #define bli_zcsscal2ris bli_roscal2ris #define bli_szsscal2ris bli_rxscal2ris #define bli_dzsscal2ris bli_rxscal2ris #define bli_czsscal2ris bli_roscal2ris #define bli_zzsscal2ris bli_roscal2ris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2ris bli_rxscal2ris #define bli_dsdscal2ris bli_rxscal2ris #define bli_csdscal2ris bli_rxscal2ris #define bli_zsdscal2ris bli_rxscal2ris #define bli_sddscal2ris bli_rxscal2ris #define bli_dddscal2ris bli_rxscal2ris #define bli_cddscal2ris bli_rxscal2ris #define bli_zddscal2ris bli_rxscal2ris #define bli_scdscal2ris bli_rxscal2ris #define bli_dcdscal2ris bli_rxscal2ris #define bli_ccdscal2ris bli_roscal2ris #define bli_zcdscal2ris bli_roscal2ris #define bli_szdscal2ris bli_rxscal2ris #define bli_dzdscal2ris bli_rxscal2ris #define bli_czdscal2ris bli_roscal2ris #define bli_zzdscal2ris bli_roscal2ris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2ris bli_rxscal2ris #define bli_dscscal2ris bli_rxscal2ris #define bli_cscscal2ris bli_rcscal2ris #define bli_zscscal2ris bli_rcscal2ris #define bli_sdcscal2ris bli_rxscal2ris #define bli_ddcscal2ris bli_rxscal2ris #define bli_cdcscal2ris bli_rcscal2ris #define bli_zdcscal2ris bli_rcscal2ris #define bli_sccscal2ris bli_crscal2ris #define bli_dccscal2ris bli_crscal2ris #define bli_cccscal2ris bli_cxscal2ris #define bli_zccscal2ris bli_cxscal2ris #define bli_szcscal2ris bli_crscal2ris 
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Mixed-type dispatch for scal2jris. Each three-letter name is a function-like
// macro that forwards its six arguments unchanged to one of the two-letter
// kernels (rx, ro, rc, cr, cx).
// -- (axy) = (??s) ------------------------------------------------------------
// y is single-precision real: "ro" when both a and x are complex, "rx"
// otherwise.
#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
// -- (axy) = (??d) ------------------------------------------------------------
// y is double-precision real: same kernel selection as the (??s) table.
#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
// -- (axy) = (??c) ------------------------------------------------------------
// y is single-precision complex: "rx" when a and x are both real, "rc" when
// only a is complex, "cr" when only x is complex, "cx" when both are.
#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
// -- (axy) = (??z) ------------------------------------------------------------
// y is double-precision complex: same kernel selection as the (??c) table.
#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = (xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyris bli_rxxpbyris #define bli_dssxpbyris bli_rxxpbyris #define bli_cssxpbyris bli_rxxpbyris #define bli_zssxpbyris bli_rxxpbyris #define bli_sdsxpbyris bli_rxxpbyris #define bli_ddsxpbyris bli_rxxpbyris #define bli_cdsxpbyris bli_rxxpbyris #define bli_zdsxpbyris bli_rxxpbyris #define bli_scsxpbyris bli_rxxpbyris #define bli_dcsxpbyris bli_rxxpbyris #define bli_ccsxpbyris bli_rxxpbyris #define bli_zcsxpbyris bli_rxxpbyris #define bli_szsxpbyris bli_rxxpbyris #define bli_dzsxpbyris bli_rxxpbyris #define bli_czsxpbyris bli_rxxpbyris #define bli_zzsxpbyris bli_rxxpbyris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyris bli_rxxpbyris #define bli_dsdxpbyris bli_rxxpbyris #define bli_csdxpbyris bli_rxxpbyris #define bli_zsdxpbyris bli_rxxpbyris #define bli_sddxpbyris bli_rxxpbyris #define bli_dddxpbyris bli_rxxpbyris #define bli_cddxpbyris bli_rxxpbyris #define bli_zddxpbyris bli_rxxpbyris #define bli_scdxpbyris bli_rxxpbyris #define bli_dcdxpbyris bli_rxxpbyris #define bli_ccdxpbyris bli_rxxpbyris #define bli_zcdxpbyris bli_rxxpbyris #define bli_szdxpbyris bli_rxxpbyris #define bli_dzdxpbyris bli_rxxpbyris #define bli_czdxpbyris bli_rxxpbyris #define bli_zzdxpbyris bli_rxxpbyris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyris bli_rxxpbyris #define bli_dscxpbyris 
bli_rxxpbyris #define bli_cscxpbyris bli_crxpbyris #define bli_zscxpbyris bli_crxpbyris #define bli_sdcxpbyris bli_rxxpbyris #define bli_ddcxpbyris bli_rxxpbyris #define bli_cdcxpbyris bli_crxpbyris #define bli_zdcxpbyris bli_crxpbyris #define bli_sccxpbyris bli_cxxpbyris #define bli_dccxpbyris bli_cxxpbyris #define bli_cccxpbyris bli_cxxpbyris #define bli_zccxpbyris bli_cxxpbyris #define bli_szcxpbyris bli_cxxpbyris #define bli_dzcxpbyris bli_cxxpbyris #define bli_czcxpbyris bli_cxxpbyris #define bli_zzcxpbyris bli_cxxpbyris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyris bli_rxxpbyris #define bli_dszxpbyris bli_rxxpbyris #define bli_cszxpbyris bli_crxpbyris #define bli_zszxpbyris bli_crxpbyris #define bli_sdzxpbyris bli_rxxpbyris #define bli_ddzxpbyris bli_rxxpbyris #define bli_cdzxpbyris bli_crxpbyris #define bli_zdzxpbyris bli_crxpbyris #define bli_sczxpbyris bli_cxxpbyris #define bli_dczxpbyris bli_cxxpbyris #define bli_cczxpbyris bli_cxxpbyris #define bli_zczxpbyris bli_cxxpbyris #define bli_szzxpbyris bli_cxxpbyris #define bli_dzzxpbyris bli_cxxpbyris #define bli_czzxpbyris bli_cxxpbyris #define bli_zzzxpbyris bli_cxxpbyris #define bli_sxpbyris bli_sssxpbyris #define bli_dxpbyris bli_dddxpbyris #define bli_cxpbyris bli_cccxpbyris #define bli_zxpbyris bli_zzzxpbyris #endif // end bli_xpbyris.h // begin bli_xpbyjris.h #ifndef BLIS_XPBYJRIS_H #define BLIS_XPBYJRIS_H // xpbyjris #define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type 
of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjris bli_rxxpbyjris #define bli_dssxpbyjris bli_rxxpbyjris #define bli_cssxpbyjris bli_rxxpbyjris #define bli_zssxpbyjris bli_rxxpbyjris #define bli_sdsxpbyjris bli_rxxpbyjris #define bli_ddsxpbyjris bli_rxxpbyjris #define bli_cdsxpbyjris bli_rxxpbyjris #define bli_zdsxpbyjris bli_rxxpbyjris #define bli_scsxpbyjris bli_rxxpbyjris #define bli_dcsxpbyjris bli_rxxpbyjris #define bli_ccsxpbyjris bli_rxxpbyjris #define bli_zcsxpbyjris bli_rxxpbyjris #define bli_szsxpbyjris bli_rxxpbyjris #define bli_dzsxpbyjris bli_rxxpbyjris #define bli_czsxpbyjris bli_rxxpbyjris #define bli_zzsxpbyjris bli_rxxpbyjris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjris bli_rxxpbyjris #define bli_dsdxpbyjris bli_rxxpbyjris #define bli_csdxpbyjris bli_rxxpbyjris #define bli_zsdxpbyjris bli_rxxpbyjris #define bli_sddxpbyjris bli_rxxpbyjris #define bli_dddxpbyjris bli_rxxpbyjris #define bli_cddxpbyjris bli_rxxpbyjris #define bli_zddxpbyjris bli_rxxpbyjris #define bli_scdxpbyjris bli_rxxpbyjris #define bli_dcdxpbyjris bli_rxxpbyjris #define bli_ccdxpbyjris bli_rxxpbyjris #define bli_zcdxpbyjris bli_rxxpbyjris #define bli_szdxpbyjris bli_rxxpbyjris #define bli_dzdxpbyjris bli_rxxpbyjris #define bli_czdxpbyjris bli_rxxpbyjris #define bli_zzdxpbyjris bli_rxxpbyjris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjris bli_rxxpbyjris #define bli_dscxpbyjris bli_rxxpbyjris #define bli_cscxpbyjris bli_crxpbyjris #define bli_zscxpbyjris bli_crxpbyjris #define bli_sdcxpbyjris bli_rxxpbyjris #define bli_ddcxpbyjris bli_rxxpbyjris #define bli_cdcxpbyjris bli_crxpbyjris #define bli_zdcxpbyjris bli_crxpbyjris #define bli_sccxpbyjris bli_cxxpbyjris #define bli_dccxpbyjris bli_cxxpbyjris #define bli_cccxpbyjris 
bli_cxxpbyjris #define bli_zccxpbyjris bli_cxxpbyjris #define bli_szcxpbyjris bli_cxxpbyjris #define bli_dzcxpbyjris bli_cxxpbyjris #define bli_czcxpbyjris bli_cxxpbyjris #define bli_zzcxpbyjris bli_cxxpbyjris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjris bli_rxxpbyjris #define bli_dszxpbyjris bli_rxxpbyjris #define bli_cszxpbyjris bli_crxpbyjris #define bli_zszxpbyjris bli_crxpbyjris #define bli_sdzxpbyjris bli_rxxpbyjris #define bli_ddzxpbyjris bli_rxxpbyjris #define bli_cdzxpbyjris bli_crxpbyjris #define bli_zdzxpbyjris bli_crxpbyjris #define bli_sczxpbyjris bli_cxxpbyjris #define bli_dczxpbyjris bli_cxxpbyjris #define bli_cczxpbyjris bli_cxxpbyjris #define bli_zczxpbyjris bli_cxxpbyjris #define bli_szzxpbyjris bli_cxxpbyjris #define bli_dzzxpbyjris bli_cxxpbyjris #define bli_czzxpbyjris bli_cxxpbyjris #define bli_zzzxpbyjris bli_cxxpbyjris #define bli_sxpbyjris bli_sssxpbyjris #define bli_dxpbyjris bli_dddxpbyjris #define bli_cxpbyjris bli_cccxpbyjris #define bli_zxpbyjris bli_zzzxpbyjris #endif // end bli_xpbyjris.h // Inlined scalar macros in loops // begin bli_scal2ris_mxn.h #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j 
)*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif // end bli_scal2ris_mxn.h // begin bli_scalris_mxn_uplo.h #ifndef 
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) ) #define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) ) #define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zcabsq2s( x, a ) 
bli_zcsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) ) #define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) ) #define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a ) #define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a ) #define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a ) #define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a ) #endif // end bli_absq2s.h // begin bli_abval2s.h #ifndef BLIS_ABVAL2S_H #define BLIS_ABVAL2S_H // abval2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabval2s( x, a ) 
bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) ) #define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) ) #define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) ) #define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) ) #define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) ) #define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) ) #define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) ) #define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) ) #define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) ) #define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) ) #define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) ) #define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) ) #define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) ) #define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) ) #define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) ) #define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabval2s( x, a ) bli_ssabval2s( x, a ) #define bli_dabval2s( x, a ) bli_ddabval2s( x, a ) #define bli_cabval2s( x, a ) bli_ccabval2s( x, a ) #define bli_zabval2s( x, a ) bli_zzabval2s( x, a ) #endif // end bli_abval2s.h // begin bli_adds.h #ifndef BLIS_ADDS_H #define BLIS_ADDS_H // adds // Notes: // - The first char 
encodes the type of a. // - The second char encodes the type of y. #define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) { (y) += (a); } #define bli_dcadds( a, y ) { (y) += (a); } #define bli_ccadds( a, y ) { (y) += (a); } #define bli_zcadds( a, y ) { (y) += (a); } #define bli_szadds( a, y ) { (y) += (a); } #define bli_dzadds( a, y ) { (y) += (a); } #define bli_czadds( a, y ) { (y) += (a); } #define 
bli_zzadds( a, y ) { (y) += (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadds( a, y ) bli_ssadds( a, y ) #define bli_dadds( a, y ) bli_ddadds( a, y ) #define bli_cadds( a, y ) bli_ccadds( a, y ) #define bli_zadds( a, y ) bli_zzadds( a, y ) #endif // end bli_adds.h // begin bli_addjs.h #ifndef BLIS_ADDJS_H #define BLIS_ADDJS_H // addjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzaddjs( a, y ) 
bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) { (y) += (a); } #define bli_dcaddjs( a, y ) { (y) += (a); } #define bli_ccaddjs( a, y ) { (y) += conjf(a); } #define bli_zcaddjs( a, y ) { (y) += conj (a); } #define bli_szaddjs( a, y ) { (y) += (a); } #define bli_dzaddjs( a, y ) { (y) += (a); } #define bli_czaddjs( a, y ) { (y) += conjf(a); } #define bli_zzaddjs( a, y ) { (y) += conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saddjs( a, y ) bli_ssaddjs( a, y ) #define bli_daddjs( a, y ) bli_ddaddjs( a, y ) #define bli_caddjs( a, y ) bli_ccaddjs( a, y ) #define bli_zaddjs( a, y ) bli_zzaddjs( a, y ) #endif // end bli_addjs.h // begin bli_add3s.h #ifndef BLIS_ADD3S_H #define BLIS_ADD3S_H // add3s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of b. // - The third char encodes the type of c. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), 
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), 
bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define 
bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zscadd3s( a, 
b, c ) { (c) = (a) + (b); } #define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c ) #define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c ) #define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c ) #define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c ) #endif // end bli_add3s.h // begin bli_axpbys.h #ifndef BLIS_AXPBYS_H #define BLIS_AXPBYS_H // axpbys // Notes: // - The first char encodes the type of a. 
// - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), 
bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), 
bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) /* Rows of the mixed-type axpbys table whose y operand is read with bli_creal/bli_cimag (names ending in "caxpbys"). Each macro forwards the real and imaginary parts of a, x, b and y, in that order, to bli_cxaxpbyris, which is defined outside this span. The four letters before "axpbys" pick the accessor prefix (s, d, c or z) used for a, x, b and y respectively. */ #define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) /* Third letter 'c': b is read with bli_creal/bli_cimag in the rows that follow. */ #define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), 
bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) /* Third letter 'z': b is read with bli_zreal/bli_zimag in the rows that follow. */ #define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) /* The group below covers names ending in "zaxpbys": y is read with bli_zreal/bli_zimag, while the accessors for a, x and b still follow the first three letters of the macro name. The target helper remains bli_cxaxpbyris (defined outside this span). */ // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) /* Third letter 'd': b is read with bli_dreal/bli_dimag in the rows that follow. */ #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) /* Third letter 'c': b is read with bli_creal/bli_cimag in the rows that follow. */ #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) /* Third letter 'z': b is read with bli_zreal/bli_zimag in the rows that follow. */ #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) /* Alternate branch of the conditional, labelled "ifdef BLIS_ENABLE_C99_COMPLEX" by the comment on the #else (the opening #if is outside this span). The same macro names are defined again here, but each one expands to the single braced statement { (y) = (a) * (x) + (b) * (y); } and uses the operands directly, with no per-component accessors and no helper call. */ #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccscaxpbys( 
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } /* Rows for names ending in "caxpbys" in the branch that expands to plain arithmetic. Every macro here has the identical body { (y) = (a) * (x) + (b) * (y); }; only the name differs. The expansion mentions y twice and assigns to it, so the y argument must be an assignable expression and is evaluated more than once. */ #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } /* The next group repeats the same expansion for names ending in "zaxpbys". */ // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } /* Rows for names ending in "zaxpbys" in the branch that expands to plain arithmetic: each macro body is { (y) = (a) * (x) + (b) * (y); } regardless of the type letters in its name, so any type handling is left to the operands' own types. */ #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } /* After the conditional closes, four single-letter convenience names are defined: bli_saxpbys, bli_daxpbys, bli_caxpbys and bli_zaxpbys each forward their arguments unchanged to the variant whose four type letters are all the same (ssss, dddd, cccc, zzzz). The bli_axpbyjs.h section that follows defines its "s"-suffixed-y group by forwarding accessor results to bli_rxaxpbyjris, which is defined outside this span. */ #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y ) #define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y ) #define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y ) #define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y ) #endif // end bli_axpbys.h // begin bli_axpbyjs.h #ifndef BLIS_AXPBYJS_H #define BLIS_AXPBYJS_H // axpbyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), 
bli_simag(y) ) /* axpbyjs rows whose y operand is read with bli_sreal/bli_simag (names ending in "saxpbyjs"). Each macro passes the real and imaginary parts of a, x, b and y, in that order, to bli_rxaxpbyjris, which is defined outside this span. The accessor prefix for each operand follows the corresponding letter of the macro name. */ #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) /* Third letter 'd': b is read with bli_dreal/bli_dimag in the rows that follow. */ #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) /* Third letter 'c': b is read with bli_creal/bli_cimag in the rows that follow. */ #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) /* Third letter 'z': b is read with bli_zreal/bli_zimag in the rows that follow. */ #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) /* Variants whose fourth type letter (the y operand) is d. They forward to bli_rxaxpbyjris exactly like the macros before this point, except that y is read through bli_dreal/bli_dimag. The first three letters pick the accessors for a, x and b, with the letter for a varying fastest from one macro to the next. */ // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) /* From here on the definitions are conditional. When BLIS_ENABLE_C99_COMPLEX is not defined, the variants whose y operand is c forward the real/imag parts of a, x, b and y to bli_cxaxpbyjris (not bli_rxaxpbyjris), with y read through bli_creal/bli_cimag. NOTE(review): bli_cxaxpbyjris is defined outside this chunk -- confirm its semantics there. */ #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) /* Variants whose fourth type letter (the y operand) is z. They forward to bli_cxaxpbyjris in the same way as the c-typed-y macros before this point, except that y is read through bli_zreal/bli_zimag. */ // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) /* C99-complex branch: the same macro names are defined again, but each expands to a single braced assignment on the operands themselves, (y) = (a) * X + (b) * (y). X is (x) when the second type letter is s or d, conjf(x) when it is c, and conj(x) when it is z. The bodies are brace blocks, not expressions, so they can only be used where a statement is allowed. */ #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); }
#define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: each one forwards to the variant whose four type
// characters (a, x, b, y) are all equal.
#define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y )
#define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y )
#define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y )
#define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y )

#endif
// end bli_axpbyjs.h
// begin bli_axpys.h

#ifndef BLIS_AXPYS_H
#define BLIS_AXPYS_H

// axpys
//
// Scalar "a times x plus y" macros, one per (a, x, y) type combination.
// In the C99-complex variants further down the body is literally
// (y) += (a) * (x). Every other variant splits a, x and y into real and
// imaginary parts with bli_?real()/bli_?imag() and forwards the six parts to
// a bli_?axpyris helper that is defined outside this header.
// NOTE(review): the helpers presumably perform the same update
// component-wise -- confirm against their definitions.

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

// y is of type s: every (a, x) combination forwards to bli_saxpyris.
#define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

// y is of type d: every (a, x) combination forwards to bli_daxpyris.
#define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// The variants with y of type c or z have two implementations, selected by
// whether BLIS_ENABLE_C99_COMPLEX is defined.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

// Helper selection for y of type c:
// - a and x both of type s or d:          bli_saxpyris
// - a of type s or d, x of type c or z:   bli_scaxpyris
// - a of type c or z (any x):             bli_caxpyris
#define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

// Helper selection for y of type z:
// - a and x both of type s or d:          bli_daxpyris
// - a of type s or d, x of type c or z:   bli_dzaxpyris
// - a of type c or z (any x):             bli_zaxpyris
#define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With BLIS_ENABLE_C99_COMPLEX defined, every type combination expands to the
// same braced statement. y is assigned to, so it must be an lvalue; each
// argument is parenthesized in the expansion.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_sccaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: each one forwards to the variant whose three type
// characters (a, x, y) are all equal.
#define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y )
#define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y )
#define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y )
#define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y )

#endif
// end bli_axpys.h
// begin bli_axpyjs.h

#ifndef BLIS_AXPYJS_H
#define BLIS_AXPYJS_H

// axpyjs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// The axpyjs macros mirror the axpys naming scheme, one per (a, x, y) type
// combination. In the C99-complex variants further down the body is
// (y) += (a) * conjf(x) when x is of type c, (y) += (a) * conj(x) when x is
// of type z, and plain (y) += (a) * (x) when x is of type s or d. Every other
// variant splits a, x and y into real and imaginary parts with
// bli_?real()/bli_?imag() and forwards the six parts to a bli_?axpyjris
// helper that is defined outside this header.
// NOTE(review): the helpers presumably apply the same conjugated update
// component-wise -- confirm against their definitions.

// -- (axy) = (??s) ------------------------------------------------------------

// y is of type s: every (a, x) combination forwards to bli_saxpyjris.
#define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

// y is of type d: every (a, x) combination forwards to bli_daxpyjris.
#define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// The variants with y of type c or z have two implementations, selected by
// whether BLIS_ENABLE_C99_COMPLEX is defined.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

// Helper selection for y of type c:
// - a and x both of type s or d:          bli_saxpyjris
// - a of type s or d, x of type c or z:   bli_scaxpyjris
// - a of type c or z (any x):             bli_caxpyjris
#define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

// Helper selection for y of type z:
// - a and x both of type s or d:          bli_daxpyjris
// - a of type s or d, x of type c or z:   bli_dzaxpyjris
// - a of type c or z (any x):             bli_zaxpyjris
#define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With BLIS_ENABLE_C99_COMPLEX defined, the expansion depends only on the
// type of x (second char): s/d use x as-is, c applies conjf(), z applies
// conj(). y is assigned to, so it must be an lvalue.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: each one forwards to the variant whose three type
// characters (a, x, y) are all equal.
#define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y )
#define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y )
#define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y )
#define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y )

#endif
// end bli_axpyjs.h
// begin bli_axmys.h

#ifndef BLIS_AXMYS_H
#define BLIS_AXMYS_H

// axmys

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), 
bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxmys( a, x, 
y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), 
bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxmys( 
a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); } 
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y ) #define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y ) #define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y ) #define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y ) #endif // end bli_axmys.h // begin bli_conjs.h #ifndef BLIS_CONJS_H #define BLIS_CONJS_H // conjs #define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) ) #define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) ) #define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) { (x) = conjf(x); } #define bli_zconjs( x ) { (x) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_conjs.h // begin bli_copys.h #ifndef BLIS_COPYS_H #define BLIS_COPYS_H // copys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// copys variants for s, d and c destinations.
// Parameter legend: chi = the x operand (type given by the first char),
// psi = the y operand (type given by the second char). Each macro hands
// the real/imaginary parts of both operands to a *copyris helper, which is
// not defined in this part of the file.

#define bli_sscopys( chi, psi ) bli_scopyris( bli_sreal(chi), bli_simag(chi), bli_sreal(psi), bli_simag(psi) )
#define bli_dscopys( chi, psi ) bli_scopyris( bli_dreal(chi), bli_dimag(chi), bli_sreal(psi), bli_simag(psi) )
#define bli_cscopys( chi, psi ) bli_scopyris( bli_creal(chi), bli_cimag(chi), bli_sreal(psi), bli_simag(psi) )
#define bli_zscopys( chi, psi ) bli_scopyris( bli_zreal(chi), bli_zimag(chi), bli_sreal(psi), bli_simag(psi) )

#define bli_sdcopys( chi, psi ) bli_dcopyris( bli_sreal(chi), bli_simag(chi), bli_dreal(psi), bli_dimag(psi) )
#define bli_ddcopys( chi, psi ) bli_dcopyris( bli_dreal(chi), bli_dimag(chi), bli_dreal(psi), bli_dimag(psi) )
#define bli_cdcopys( chi, psi ) bli_dcopyris( bli_creal(chi), bli_cimag(chi), bli_dreal(psi), bli_dimag(psi) )
#define bli_zdcopys( chi, psi ) bli_dcopyris( bli_zreal(chi), bli_zimag(chi), bli_dreal(psi), bli_dimag(psi) )

// NOTE: Because ccopyris() is used, the imaginary part of y is overwritten with zero.
#define bli_sccopys( chi, psi ) bli_ccopyris( bli_sreal(chi), bli_simag(chi), bli_creal(psi), bli_cimag(psi) )
#define bli_dccopys( chi, psi ) bli_ccopyris( bli_dreal(chi), bli_dimag(chi), bli_creal(psi), bli_cimag(psi) )
#define bli_cccopys( chi, psi ) bli_ccopyris( bli_creal(chi), bli_cimag(chi), bli_creal(psi), bli_cimag(psi) )
#define bli_zccopys( chi, psi ) bli_ccopyris( bli_zreal(chi), bli_zimag(chi), bli_creal(psi), bli_cimag(psi) )

// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopys( x, y ) bli_sscopys( x, y ) #define bli_dcopys( x, y ) bli_ddcopys( x, y ) #define bli_ccopys( x, y ) bli_cccopys( x, y ) #define bli_zcopys( x, y ) bli_zzcopys( x, y ) #define bli_icopys( x, y ) bli_iicopys( x, y ) #endif // end bli_copys.h // begin bli_copyjs.h #ifndef BLIS_COPYJS_H #define BLIS_COPYJS_H // copyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), 
bli_creal(y), bli_cimag(y) ) #define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) { (y) = (x); } #define bli_dccopyjs( x, y ) { (y) = (x); } #define bli_cccopyjs( x, y ) { (y) = conjf(x); } #define bli_zccopyjs( x, y ) { (y) = conj (x); } #define bli_szcopyjs( x, y ) { (y) = (x); } #define bli_dzcopyjs( x, y ) { (y) = (x); } #define bli_czcopyjs( x, y ) { (y) = conjf(x); } #define bli_zzcopyjs( x, y ) { (y) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjs( x, y ) bli_sscopyjs( x, y ) #define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y ) #define bli_ccopyjs( x, y ) bli_cccopyjs( x, y ) #define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y ) #define bli_icopyjs( x, y ) bli_iicopyjs( x, y ) #endif // end bli_copyjs.h // begin bli_copycjs.h #ifndef BLIS_COPYCJS_H #define BLIS_COPYCJS_H // copycjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) { (y) = (x); } 
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); } #define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #define bli_szcopycjs( conjx, x, y ) { (y) = (x); } #define bli_dzcopycjs( conjx, x, y ) { (y) = (x); } #define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); } #define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y ) #define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y ) #define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y ) #define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y ) #define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y ) #endif // end bli_copycjs.h // begin bli_copynzs.h #ifndef BLIS_COPYNZS_H #define BLIS_COPYNZS_H // copynzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// copynzs variants for s, d and c destinations.
// Parameter legend: chi = the x operand (first char), psi = the y operand
// (second char). The *copyris helpers are not defined in this part of the
// file.

#define bli_sscopynzs( chi, psi ) bli_scopyris( bli_sreal(chi), bli_simag(chi), bli_sreal(psi), bli_simag(psi) )
#define bli_dscopynzs( chi, psi ) bli_scopyris( bli_dreal(chi), bli_dimag(chi), bli_sreal(psi), bli_simag(psi) )
#define bli_cscopynzs( chi, psi ) bli_scopyris( bli_creal(chi), bli_cimag(chi), bli_sreal(psi), bli_simag(psi) )
#define bli_zscopynzs( chi, psi ) bli_scopyris( bli_zreal(chi), bli_zimag(chi), bli_sreal(psi), bli_simag(psi) )

#define bli_sdcopynzs( chi, psi ) bli_dcopyris( bli_sreal(chi), bli_simag(chi), bli_dreal(psi), bli_dimag(psi) )
#define bli_ddcopynzs( chi, psi ) bli_dcopyris( bli_dreal(chi), bli_dimag(chi), bli_dreal(psi), bli_dimag(psi) )
#define bli_cdcopynzs( chi, psi ) bli_dcopyris( bli_creal(chi), bli_cimag(chi), bli_dreal(psi), bli_dimag(psi) )
#define bli_zdcopynzs( chi, psi ) bli_dcopyris( bli_zreal(chi), bli_zimag(chi), bli_dreal(psi), bli_dimag(psi) )

// NOTE: scopyris() is used for real sources so that the imaginary part of y is not touched.
#define bli_sccopynzs( chi, psi ) bli_scopyris( bli_sreal(chi), bli_simag(chi), bli_creal(psi), bli_cimag(psi) )
#define bli_dccopynzs( chi, psi ) bli_scopyris( bli_dreal(chi), bli_dimag(chi), bli_creal(psi), bli_cimag(psi) )
#define bli_cccopynzs( chi, psi ) bli_ccopyris( bli_creal(chi), bli_cimag(chi), bli_creal(psi), bli_cimag(psi) )
#define bli_zccopynzs( chi, psi ) bli_ccopyris( bli_zreal(chi), bli_zimag(chi), bli_creal(psi), bli_cimag(psi) )

// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopynzs( x, y ) bli_sscopynzs( x, y ) #define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y ) #define bli_ccopynzs( x, y ) bli_cccopynzs( x, y ) #define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y ) #define bli_icopynzs( x, y ) bli_iicopynzs( x, y ) #endif // end bli_copynzs.h // begin bli_copyjnzs.h #ifndef BLIS_COPYJNZS_H #define BLIS_COPYJNZS_H // copyjnzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we // don't touch the imaginary part of y. 
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we // don't touch the imaginary part of y. #define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y ) #define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y ) #define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y ) #define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y ) #define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y ) #endif // end bli_copyjnzs.h // begin bli_dots.h #ifndef BLIS_DOTS_H #define BLIS_DOTS_H // dots // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a ) #define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a ) #define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a ) #define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a ) #define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a ) #define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a ) #define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a ) #define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a ) #define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a ) #define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a ) #define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a ) #define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a ) #define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a ) #define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a ) #define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a ) #define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a ) #define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a ) #define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a ) #define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a ) #define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a ) #define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a ) #define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a ) #define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a ) #define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a ) #define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a ) #define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a ) #define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a ) #define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a ) #define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a ) #define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a ) #define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a ) #define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a ) #define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a ) #define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a ) #define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a ) #define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a ) #define 
bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a ) #define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a ) #define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a ) #define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a ) #define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a ) #define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a ) #define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a ) #define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a ) #define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a ) #define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a ) #define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a ) #define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a ) #define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a ) #define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a ) #define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a ) #define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a ) #define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a ) #define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a ) #define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a ) #define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a ) #define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a ) #define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a ) #define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a ) #define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a ) #define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a ) #define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a ) #define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a ) #define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a ) #define bli_sdots( x, y, a ) bli_sssdots( x, y, a ) #define bli_ddots( x, y, a ) bli_ddddots( x, y, a ) #define bli_cdots( x, y, a ) bli_cccdots( x, y, a ) #define bli_zdots( x, y, a ) bli_zzzdots( x, y, a ) #endif // end bli_dots.h // begin bli_dotjs.h #ifndef BLIS_DOTJS_H #define BLIS_DOTJS_H // dotjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
// - x is used in conjugated form.
// - Every bli_<x><y><r>dotjs( x, y, a ) below forwards to
//   bli_<y><x><r>axpyjs( y, x, a ): the two operands and their type chars
//   are swapped, while the third (rho) type char and argument are kept.
//   NOTE(review): the axpyjs macros are defined outside this section;
//   presumably they conjugate their second operand, which is how x ends up
//   conjugated here -- confirm against bli_axpyjs.h.

// -- rho is s --
#define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a )
#define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a )
#define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a )
#define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a )
#define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a )
#define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a )
#define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a )
#define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a )
#define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a )
#define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a )
#define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a )
#define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a )
#define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a )
#define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a )
#define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a )
#define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a )
// -- rho is d --
#define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a )
#define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a )
#define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a )
#define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a )
#define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a )
#define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a )
#define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a )
#define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a )
#define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a )
#define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a )
#define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a )
#define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a )
#define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a )
#define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a )
#define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a )
#define bli_zzddotjs( x, y, a ) bli_zzdaxpyjs( y, x, a )
// -- rho is c --
#define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a )
#define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a )
#define bli_cscdotjs( x, y, a ) bli_sccaxpyjs( y, x, a )
#define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a )
#define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a )
#define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a )
#define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a )
#define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a )
#define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a )
#define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a )
#define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a )
#define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a )
#define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a )
#define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a )
#define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a )
#define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a )
// -- rho is z --
#define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a )
#define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a )
#define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a )
#define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a )
#define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a )
#define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a )
#define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a )
#define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a )
#define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a )
#define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a )
#define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a )
#define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a )
#define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a )
#define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a )
#define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a )
#define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a )

// Single-char shorthands: all three operands share one type.
#define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a )
#define bli_ddotjs( x, y, a ) bli_ddddotjs( x, y, a )
#define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a )
#define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a )
#endif
// end bli_dotjs.h
// begin bli_eq.h
#ifndef BLIS_EQ_H
#define BLIS_EQ_H

// eq (passed by value)
// Expression macros that are nonzero when a and b compare equal with ==.
// Note that this is an exact comparison, also for floating-point operands.
#define bli_seq( a, b ) ( (a) == (b) )
#define bli_deq( a, b ) ( (a) == (b) )
#ifndef BLIS_ENABLE_C99_COMPLEX
// Without C99 complex the operands are compared component-wise through the
// bli_?real / bli_?imag accessors.
#define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) )
#define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_ceq( a, b ) ( (a) == (b) )
#define bli_zeq( a, b ) ( (a) == (b) )
#endif // BLIS_ENABLE_C99_COMPLEX
#define bli_ieq( a, b ) ( (a) == (b) )

// eqtori (passed by value)
// Compare a against the real/imaginary pair (br, bi). The real-typed
// variants only look at br; bi is accepted but ignored.
#define bli_seqtori( a, br, bi ) ( (a) == (br) )
#define bli_deqtori( a, br, bi ) ( (a) == (br) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) )
#define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex the pair is assembled as br + bi*I before comparing.
#define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )
#define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )
#endif // BLIS_ENABLE_C99_COMPLEX

// eqa (passed by address)
// a and b are pointers; each is cast to the element pointer type named by
// the macro's type char, dereferenced, and compared by value.
#define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) )
#define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) )
#define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) )
#define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) )
#define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) )

// eq1: true when a equals 1 (1 + 0i for the complex variants).
#define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F )
#define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 )
#define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F )
#define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 )
#define bli_ieq1( a ) bli_ieq ( (a), 1 )

// eq0: true when a equals 0 (0 + 0i for the complex variants).
#define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F )
#define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 )
#define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F )
#define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 )
#define bli_ieq0( a ) bli_ieq ( (a), 0 )

// eqm1: true when a equals -1 (-1 + 0i for the complex variants).
#define bli_seqm1( a ) bli_seqtori( (a), -1.0F, 0.0F )
#define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 )
#define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F )
#define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 )
#define bli_ieqm1( a ) bli_ieq ( (a), -1 )
#endif
// end bli_eq.h
// begin bli_fprints.h
#ifndef BLIS_FPRINTS_H
#define BLIS_FPRINTS_H

// prints
// Statement macros that write x to the stream `file` with fprintf(), using
// `spec` as the conversion specification for one scalar component.
// Each expands to a brace block, not an expression.
#define bli_sfprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}
#define bli_dfprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}
#define bli_cfprints( file, spec, x ) \
{ \
    /* Emitted as: <real> " + " <imag> " "; no imaginary-unit marker. */ \
    fprintf( file, spec, bli_creal(x) ); \
    fprintf( file, " + " ); \
    fprintf( file, spec, bli_cimag(x) ); \
    fprintf( file, " " ); \
}
#define bli_zfprints( file, spec, x ) \
{ \
    /* Same layout as bli_cfprints, using the z accessors. */ \
    fprintf( file, spec, bli_zreal(x) ); \
    fprintf( file, " + " ); \
    fprintf( file, spec, bli_zimag(x) ); \
    fprintf( file, " " ); \
}
#define bli_ifprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}
#endif
// end bli_fprints.h
// begin bli_inverts.h
#ifndef BLIS_INVERTS_H
#define BLIS_INVERTS_H

// inverts
// Notes:
// - The first char encodes the type of x.
// - The real variants, and the complex variants when C99 complex is
//   disabled, forward the components of x to bli_?invertris (defined
//   outside this section).
#define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) )
#define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) )
#define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex, x is overwritten in place with its reciprocal.
#define bli_cinverts( x ) { (x) = 1.0F / (x); }
#define bli_zinverts( x ) { (x) = 1.0 / (x); }
#endif // BLIS_ENABLE_C99_COMPLEX
#endif
// end bli_inverts.h
// begin bli_invscals.h
#ifndef BLIS_INVSCALS_H
#define BLIS_INVSCALS_H

// invscals
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Real-typed y (s or d): every scalar type forwards its components to the
// bli_sinvscalris / bli_dinvscalris helper that matches y.
#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
// Complex-typed y without C99 complex: a real scalar (s or d) goes through
// the mixed helpers bli_scinvscalris / bli_dzinvscalris, a complex scalar
// (c or z) through bli_cinvscalris / bli_zinvscalris.
#define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscals( a, y ) bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// Complex-typed y with C99 complex: y is divided by a in place.
#define bli_scinvscals( a, y ) { (y) /= (a); }
#define bli_dcinvscals( a, y ) { (y) /= (a); }
#define bli_ccinvscals( a, y ) { (y) /= (a); }
#define bli_zcinvscals( a, y ) { (y) /= (a); }
#define bli_szinvscals( a, y ) { (y) /= (a); }
#define bli_dzinvscals( a, y ) { (y) /= (a); }
#define bli_czinvscals( a, y ) { (y) /= (a); }
#define bli_zzinvscals( a, y ) { (y) /= (a); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char shorthands: a and y share one type.
#define bli_sinvscals( a, y ) bli_ssinvscals( a, y )
#define bli_dinvscals( a, y ) bli_ddinvscals( a, y )
#define bli_cinvscals( a, y ) bli_ccinvscals( a, y )
#define bli_zinvscals( a, y ) bli_zzinvscals( a, y )
#endif
// end bli_invscals.h
// begin bli_invscaljs.h
#ifndef BLIS_INVSCALJS_H
#define BLIS_INVSCALJS_H

// invscaljs
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - Same dispatch layout as invscals, but targeting the *invscaljris
//   helpers; in the C99 branch a complex a is conjugated before dividing.
#define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// Real a (s, d) is used as-is; complex a goes through conjf() for c and
// conj() for z before the in-place division.
#define bli_scinvscaljs( a, y ) { (y) /= (a); }
#define bli_dcinvscaljs( a, y ) { (y) /= (a); }
#define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zcinvscaljs( a, y ) { (y) /= conj (a); }
#define bli_szinvscaljs( a, y ) { (y) /= (a); }
#define bli_dzinvscaljs( a, y ) { (y) /= (a); }
#define bli_czinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zzinvscaljs( a, y ) { (y) /= conj (a); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char shorthands: a and y share one type.
#define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y )
#define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y )
#define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y )
#define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y )
#endif
// end bli_invscaljs.h
// begin bli_neg2s.h
#ifndef BLIS_NEG2S_H
#define BLIS_NEG2S_H

// neg2s
// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Each variant forwards the components of x and y to the bli_?neg2ris helper
// selected by the type of y (helpers are defined outside this section).
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// Complex-typed y with C99 complex: y is overwritten with -x.
#define bli_scneg2s( x, y ) { (y) = -(x); }
#define bli_dcneg2s( x, y ) { (y) = -(x); }
#define bli_ccneg2s( x, y ) { (y) = -(x); }
#define bli_zcneg2s( x, y ) { (y) = -(x); }
#define bli_szneg2s( x, y ) { (y) = -(x); }
#define bli_dzneg2s( x, y ) { (y) = -(x); }
#define bli_czneg2s( x, y ) { (y) = -(x); }
#define bli_zzneg2s( x, y ) { (y) = -(x); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char shorthands: x and y share one type.
#define bli_sneg2s( x, y ) bli_ssneg2s( x, y )
#define bli_dneg2s( x, y ) bli_ddneg2s( x, y )
#define bli_cneg2s( x, y ) bli_ccneg2s( x, y )
#define bli_zneg2s( x, y ) bli_zzneg2s( x, y )
#endif
// end bli_neg2s.h
// begin bli_rands.h
#ifndef BLIS_RANDS_H
#define BLIS_RANDS_H

// rands
// Statement macros that overwrite a with a value derived from rand():
// rand() / (RAND_MAX / 2) - 1, which lies in [-1, 1]. They use the C
// library generator directly, so results depend on its current seed state.
#define bli_srands( a ) \
{ \
    (a) = ( float ) ( ( double ) rand() / \
                      ( ( double ) RAND_MAX / 2.0 ) \
                    ) - 1.0F; \
}
#define bli_drands( a ) \
{ \
    (a) = ( double ) ( ( double ) rand() / \
                       ( ( double ) RAND_MAX / 2.0 ) \
                     ) - 1.0; \
}
#define bli_crands( a ) \
{ \
    /* Draw real then imaginary part, then store both through bli_csets. */ \
    float ar, ai; \
    \
    bli_srands( ar ); \
    bli_srands( ai ); \
    \
    bli_csets( ar, ai, (a) ); \
}
#define bli_zrands( a ) \
{ \
    /* Draw real then imaginary part, then store both through bli_zsets. */ \
    double ar, ai; \
    \
    bli_drands( ar ); \
    bli_drands( ai ); \
    \
    bli_zsets( ar, ai, (a) ); \
}
#endif
// end bli_rands.h
// begin bli_randnp2s.h
#ifndef BLIS_RANDNP2S_H
#define BLIS_RANDNP2S_H

// randnp2s
// Statement macros that overwrite a with either zero or a signed power of
// two; see bli_drandnp2s for the exact set of values.

// The single-precision variant reuses the double-precision macro; the
// result is converted on assignment to a.
#define bli_srandnp2s( a ) \
{ \
    bli_drandnp2s( a ); \
}

// Earlier implementation, compiled out by the #if 0 and kept for reference.
#if 0
#define bli_drandnp2s_prev( a ) \
{ \
    const double m_max  = 3.0; \
    const double m_max2 = m_max + 2.0; \
    double       t; \
    double       r_val; \
    \
    /* Scale rand() into [0, m_max2]. */ \
    t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \
    \
    /* Fold the upper endpoint back into range. */ \
    if ( t == m_max2 ) t = t - 1.0; \
    \
    t = floor( t ); \
    \
    if ( t == 0.0 ) r_val = 0.0; \
    else \
    { \
        double s_exp, s_val; \
        \
        PASTEMAC(d,rands)( s_exp ); \
        PASTEMAC(d,rands)( s_val ); \
        \
        /* s_exp picks the sign of the exponent, s_val the sign of the value. */ \
        if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \
        else               r_val = pow( 2.0,    t - 1.0  ); \
        \
        if ( s_val < 0.0 ) r_val = -r_val; \
    } \
    \
    r_val = r_val / pow( 2.0, m_max ); \
    \
    a = r_val; \
}
#endif

// Overwrites a with 0, or with +/- 2^-k for an integer k in [0, m_max].
// NOTE(review): a is assigned without parentheses at the end, unlike the
// bli_?rands macros above; callers should pass a plain lvalue.
#define bli_drandnp2s( a ) \
{ \
    const double m_max  = 6.0; \
    const double m_max2 = m_max + 2.0; \
    double       t; \
    double       r_val; \
    \
    /* Draw an integer-valued t in [0, m_max2); the loop redraws in the */ \
    /* case rand() == RAND_MAX, which would otherwise yield t == m_max2. */ \
    do \
    { \
        t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \
        \
        t = floor( t ); \
    } \
    while ( m_max2 <= t ); \
    \
    /* t == 0 maps to zero; t in [1, m_max+1] maps to 2^-(t-1). */ \
    if ( t == 0.0 ) r_val = 0.0; \
    else \
    { \
        double s_val; \
        \
        r_val = pow( 2.0, -(t - 1.0) ); \
        \
        /* PASTEMAC is defined elsewhere; presumably this expands to */ \
        /* bli_drands( s_val ), whose sign then decides the sign of r_val. */ \
        PASTEMAC(d,rands)( s_val ); \
        \
        if ( s_val < 0.0 ) r_val = -r_val; \
    } \
    \
    a = r_val; \
}
#define bli_crandnp2s( a ) \
{ \
    /* Draw real then imaginary part, then store both through bli_csets. */ \
    float ar, ai; \
    \
    bli_srandnp2s( ar ); \
    bli_srandnp2s( ai ); \
    \
    bli_csets( ar, ai, (a) ); \
}
#define bli_zrandnp2s( a ) \
{ \
    /* Draw real then imaginary part, then store both through bli_zsets. */ \
    double ar, ai; \
    \
    bli_drandnp2s( ar ); \
    bli_drandnp2s( ai ); \
    \
    bli_zsets( ar, ai, (a) ); \
}
#endif
// end bli_randnp2s.h
// begin bli_scals.h
#ifndef BLIS_SCALS_H
#define BLIS_SCALS_H

// scals
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - Real-typed y forwards to bli_sscalris / bli_dscalris. Complex-typed y
//   forwards to the *scalris helpers without C99 complex, and is multiplied
//   by a in place with C99 complex.
#define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_scscals( a, y ) { (y) *= (a); }
#define bli_dcscals( a, y ) { (y) *= (a); }
#define bli_ccscals( a, y ) { (y) *= (a); }
#define bli_zcscals( a, y ) { (y) *= (a); }
#define bli_szscals( a, y ) { (y) *= (a); }
#define bli_dzscals( a, y ) { (y) *= (a); }
#define bli_czscals( a, y ) { (y) *= (a); }
#define bli_zzscals( a, y ) { (y) *= (a); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char shorthands: a and y share one type.
#define bli_sscals( a, y ) bli_ssscals( a, y )
#define bli_dscals( a, y ) bli_ddscals( a, y )
#define bli_cscals( a, y ) bli_ccscals( a, y )
#define bli_zscals( a, y ) bli_zzscals( a, y )
#endif
// end bli_scals.h
// begin bli_scaljs.h
#ifndef BLIS_SCALJS_H
#define BLIS_SCALJS_H

// scaljs
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - Same dispatch layout as scals, but targeting the *scaljris helpers; in
//   the C99 branch a complex a is conjugated before multiplying.
#define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccscaljs( a, y ) bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// Real a (s, d) is used as-is; complex a goes through conjf() for c and
// conj() for z before the in-place multiplication.
#define bli_scscaljs( a, y ) { (y) *= (a); }
#define bli_dcscaljs( a, y ) { (y) *= (a); }
#define bli_ccscaljs( a, y ) { (y) *= conjf(a); }
#define bli_zcscaljs( a, y ) { (y) *= conj (a); }
#define bli_szscaljs( a, y ) { (y) *= (a); }
#define bli_dzscaljs( a, y ) { (y) *= (a); }
#define bli_czscaljs( a, y ) { (y) *= conjf(a); }
#define bli_zzscaljs( a, y ) { (y) *= conj (a); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char shorthands: a and y share one type.
#define bli_sscaljs( a, y ) bli_ssscaljs( a, y )
#define bli_dscaljs( a, y ) bli_ddscaljs( a, y )
#define bli_cscaljs( a, y ) bli_ccscaljs( a, y )
#define bli_zscaljs( a, y ) bli_zzscaljs( a, y )
#endif
// end bli_scaljs.h
// begin bli_scalcjs.h
#ifndef BLIS_SCALCJS_H
#define BLIS_SCALCJS_H

// scalcjs
// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// conjx is passed through unchanged as the first argument of the
// *scalcjris helpers (defined outside this section), followed by the
// real/imaginary components of x and y.

// Real-typed y (s or d).
#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
// Complex-typed y without C99 complex: real x (s, d) uses the mixed helpers
// bli_scscalcjris / bli_dzscalcjris, complex x (c, z) uses
// bli_cscalcjris / bli_zscalcjris.
#define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// Complex-typed y with C99 complex: a real x multiplies y in place and
// conjx is not consulted.
#define bli_scscalcjs( conjx, x, y ) { (y) *= (x); }
// C99-complex variants for complex-typed y. Real x (s, d) multiplies y
// directly and ignores conjx; complex x is conjugated with conjf() (c) or
// conj() (z) only when bli_is_conj( conjx ) is true.
#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }
#define bli_szscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char shorthands: x and y share one type.
#define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y )
#define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y )
#define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y )
#define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y )
#endif
// end bli_scalcjs.h
// begin bli_scal2s.h
#ifndef BLIS_SCAL2S_H
#define BLIS_SCAL2S_H

// scal2s
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsscal2s( a, x, y ) 
bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_czcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y ) #define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y ) #define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y ) #define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y ) #endif // end bli_scal2s.h // begin bli_scal2js.h #ifndef BLIS_SCAL2JS_H #define BLIS_SCAL2JS_H // scal2js // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// scal2js forwarding tables (three-letter prefix = types of a, x, y).
// Every macro below extracts the real and imaginary parts of a, x and y with
// the bli_?real()/bli_?imag() accessors matching each operand's type letter
// and forwards the six components to a bli_??scal2jris() helper. The helpers
// themselves are defined outside this section.
// NOTE(review): the C99 branch of this header writes the operation as
// y = a * conj(x); presumably the *ris helpers implement the same thing
// component-wise -- confirm against their definitions.

// -- (axy) = (??s) ------------------------------------------------------------

// Real (float) destination. Helper used: bli_roscal2jris when both a and x are
// complex (c/z), bli_rxscal2jris in every other case.
#define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

// Real (double) destination. Same helper selection as the (??s) table above.
#define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Complex-destination variants have two implementations; this is the one used
// when BLIS_ENABLE_C99_COMPLEX is NOT defined. The conditional opened here is
// closed further down in the file.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

// scomplex destination. Helper used, by (a, x) domain:
//   real a,    real x    -> bli_rxscal2jris
//   complex a, real x    -> bli_rcscal2jris
//   real a,    complex x -> bli_crscal2jris
//   complex a, complex x -> bli_cxscal2jris
#define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

// dcomplex destination, real-x entries (x is s or d): bli_rxscal2jris for real
// a, bli_rcscal2jris for complex a.
#define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
// scal2js, dcomplex destination, complex-x entries of the non-C99 table:
// the real/imaginary parts of a, x, y are forwarded to bli_crscal2jris when a
// is real (s/d) and to bli_cxscal2jris when a is complex (c/z).
#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 branch: the operation is written directly on the operands.
//   x real (s/d)  : y = a * x
//   x scomplex (c): y = a * conjf(x)
//   x dcomplex (z): y = a * conj(x)
// Each macro expands to a braced block, not an expression.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter aliases: all three operands share one type.
#define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y )
#define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y )
#define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y )
#define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y )

#endif
// end bli_scal2js.h

// begin bli_set0s.h


#ifndef BLIS_SET0S_H
#define BLIS_SET0S_H

// set0s: invoke bli_?sets on a with both leading arguments equal to zero
// (0.0F for the s/c variants, 0.0 for the d/z variants). bli_?sets is defined
// outside this section; presumably the two values are the real and imaginary
// parts to store -- confirm against its definition.
#define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) )
#define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) )
#define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) )
#define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) )

#endif
// end bli_set0s.h

// begin bli_set1s.h


#ifndef BLIS_SET1S_H
#define BLIS_SET1S_H

// set1s: invoke bli_?sets on a with leading arguments (1, 0), using float
// literals for the s/c variants and double literals for the d/z variants.
#define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) )
#define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) )
#define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) )
#define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) )
#endif
// end bli_set1s.h

// begin bli_seti0s.h


#ifndef BLIS_SETI0S_H
#define BLIS_SETI0S_H

// seti0s: invoke bli_?setis on a with a zero first argument (0.0F for s/c,
// 0.0 for d/z). bli_?setis is defined outside this section; presumably it
// stores the value into the imaginary part of a -- confirm.
#define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) )
#define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) )
#define bli_cseti0s( a ) bli_csetis( 0.0F, (a) )
#define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) )

#endif
// end bli_seti0s.h

// begin bli_sqrt2s.h


#ifndef BLIS_SQRT2S_H
#define BLIS_SQRT2S_H

// sqrt2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.
// - x is the input and a receives the result (see the C99 branch below, where
//   every macro is an assignment to a).

#ifndef BLIS_ENABLE_C99_COMPLEX

// Non-C99 branch: forward the real/imaginary parts of x and a to a
// bli_*sqrt2ris helper (defined outside this section). Helper used:
//   a is s          -> bli_ssqrt2ris
//   a is d          -> bli_dsqrt2ris
//   a is c, x real  -> bli_scsqrt2ris   (also used when x is d)
//   a is c, x cmplx -> bli_csqrt2ris
//   a is z, x real  -> bli_dzsqrt2ris   (also used when x is s)
//   a is z, x cmplx -> bli_zsqrt2ris
#define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) )
#define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) )
#define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) )
#define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) )

#define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) )
#define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) )

#define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 branch: a = (type of a) root(x), where the root function is picked by
// the type of x: sqrtf (s), sqrt (d), csqrtf (c), csqrt (z). When x is complex
// and a is real, only the real part of the complex root is kept
// (bli_creal / bli_zreal).
#define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; }
#define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; }
#define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); }
#define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); }

#define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; }
#define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; }
#define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); }
#define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); }

#define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; }
#define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; }
#define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; }
#define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; }

#define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; }
#define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; }
#define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; }
#define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter aliases: x and a share one type.
#define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a )
#define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a )
#define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a )
#define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a )

#endif
// end bli_sqrt2s.h

// begin bli_subs.h


#ifndef BLIS_SUBS_H
#define BLIS_SUBS_H

// subs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) { (y) -= (a); } #define bli_dcsubs( a, y ) { (y) -= (a); } #define bli_ccsubs( a, y ) { (y) -= (a); } #define bli_zcsubs( a, y ) { (y) -= (a); } #define bli_szsubs( a, y ) { (y) -= (a); } #define bli_dzsubs( a, y ) { (y) -= (a); } #define bli_czsubs( a, y ) { (y) -= (a); } #define bli_zzsubs( a, y ) { (y) -= (a); } #endif // 
BLIS_ENABLE_C99_COMPLEX #define bli_ssubs( a, y ) bli_sssubs( a, y ) #define bli_dsubs( a, y ) bli_ddsubs( a, y ) #define bli_csubs( a, y ) bli_ccsubs( a, y ) #define bli_zsubs( a, y ) bli_zzsubs( a, y ) #endif // end bli_subs.h // begin bli_subjs.h #ifndef BLIS_SUBJS_H #define BLIS_SUBJS_H // subjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubjs( a, y ) bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), 
bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) { (y) -= (a); } #define bli_dcsubjs( a, y ) { (y) -= (a); } #define bli_ccsubjs( a, y ) { (y) -= conjf(a); } #define bli_zcsubjs( a, y ) { (y) -= conj (a); } #define bli_szsubjs( a, y ) { (y) -= (a); } #define bli_dzsubjs( a, y ) { (y) -= (a); } #define bli_czsubjs( a, y ) { (y) -= conjf(a); } #define bli_zzsubjs( a, y ) { (y) -= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssubjs( a, y ) bli_sssubjs( a, y ) #define bli_dsubjs( a, y ) bli_ddsubjs( a, y ) #define bli_csubjs( a, y ) bli_ccsubjs( a, y ) #define bli_zsubjs( a, y ) bli_zzsubjs( a, y ) #endif // end bli_subjs.h // begin bli_swaps.h #ifndef BLIS_SWAPS_H #define BLIS_SWAPS_H // swaps // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_ssswaps( x, y ) \ { \ float w; \ bli_sscopys( (y), (w) ); \ bli_sscopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dsswaps( x, y ) \ { \ double w; \ bli_sdcopys( (y), (w) ); \ bli_dscopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_csswaps( x, y ) \ { \ scomplex w; \ bli_sccopys( (y), (w) ); \ bli_cscopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zsswaps( x, y ) \ { \ dcomplex w; \ bli_szcopys( (y), (w) ); \ bli_zscopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sdswaps( x, y ) \ { \ float w; \ bli_dscopys( (y), (w) ); \ bli_sdcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_ddswaps( x, y ) \ { \ double w; \ bli_ddcopys( (y), (w) ); \ bli_ddcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_cdswaps( x, y ) \ { \ scomplex w; \ bli_dccopys( (y), (w) ); \ bli_cdcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zdswaps( x, y ) \ { \ dcomplex w; \ bli_dzcopys( (y), (w) ); \ bli_zdcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_scswaps( x, y ) \ { \ float w; \ bli_cscopys( (y), (w) ); \ bli_sccopys( (x), (y) ); \ bli_sscopys( (w), 
(x) ); \ } #define bli_dcswaps( x, y ) \ { \ double w; \ bli_cdcopys( (y), (w) ); \ bli_dccopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_ccswaps( x, y ) \ { \ scomplex w; \ bli_cccopys( (y), (w) ); \ bli_cccopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zcswaps( x, y ) \ { \ dcomplex w; \ bli_czcopys( (y), (w) ); \ bli_zccopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_szswaps( x, y ) \ { \ float w; \ bli_zscopys( (y), (w) ); \ bli_szcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dzswaps( x, y ) \ { \ double w; \ bli_zdcopys( (y), (w) ); \ bli_dzcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_czswaps( x, y ) \ { \ scomplex w; \ bli_zccopys( (y), (w) ); \ bli_czcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zzswaps( x, y ) \ { \ dcomplex w; \ bli_zzcopys( (y), (w) ); \ bli_zzcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sswaps( x, y ) bli_ssswaps( x, y ) #define bli_dswaps( x, y ) bli_ddswaps( x, y ) #define bli_cswaps( x, y ) bli_ccswaps( x, y ) #define bli_zswaps( x, y ) bli_zzswaps( x, y ) #endif // end bli_swaps.h // begin bli_xpbys.h #ifndef BLIS_XPBYS_H #define BLIS_XPBYS_H // xpbys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// xpbys forwarding tables, real-destination variants (three-letter prefix =
// types of x, b, y). Every macro below extracts the real and imaginary parts
// of x, b and y with the bli_?real()/bli_?imag() accessors matching each
// operand's type letter and forwards the six components to bli_rxxpbyris,
// which is defined outside this section. All real-destination combinations
// use that same helper regardless of the types of x and b.
// NOTE(review): the name suggests y := x + b * y; the helper body is not
// visible here -- confirm against the bli_rxxpbyris definition.

// -- (xby) = (??s) ------------------------------------------------------------

// float destination.
#define bli_sssxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dssxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cssxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zssxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_scsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (xby) = (??d) ------------------------------------------------------------

// double destination (the last three entries of this table follow this
// section in the file).
#define bli_ssdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sddxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dddxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cddxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zddxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_scdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_szdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbys( x, b, y ) 
bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbys( x, b, y ) { 
(y) = (x) + (b) * (y); } #define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y ) #define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y ) #define bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y ) #define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y ) #endif // end bli_xpbys.h // begin bli_xpbyjs.h #ifndef BLIS_XPBYJS_H #define BLIS_XPBYJS_H // xpbyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; 
++ii ) bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); 
} else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( 
dim_t jj = 0; jj < n; ++jj ) bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } 
else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( 
dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end bli_copys_mxn.h // begin bli_scal2s_mxn.h #ifndef BLIS_SCAL2S_MXN_H #define BLIS_SCAL2S_MXN_H // scal2s_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \ ctype* restrict y, const inc_t rs_y, const inc_t cs_y \ ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( scal2s_mxn ) #endif // end bli_scal2s_mxn.h // begin bli_xpbys_mxn.h #ifndef BLIS_XPBYS_MXN_H #define BLIS_XPBYS_MXN_H // xpbys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (?ss) ------------------------------------------------------------ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?dd) ------------------------------------------------------------ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?cc) ------------------------------------------------------------ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?zz) ------------------------------------------------------------ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } #endif // end bli_xpbys_mxn.h // begin bli_xpbys_mxn_uplo.h #ifndef BLIS_XPBYS_MXN_UPLO_H #define BLIS_XPBYS_MXN_UPLO_H // xpbys_mxn_u #define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 
0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } // xpbys_mxn_l #define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } 
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_l( diagoff, m, n, 
x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #endif // end bli_xpbys_mxn_uplo.h // -- "broadcast B" scalar macros -- // begin bli_bcastbbs_mxn.h #ifndef BLIS_BCASTBBS_MXN_H #define BLIS_BCASTBBS_MXN_H // bcastbbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = ldy; \ const dim_t ds_y = 1; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yi = y + i*incy; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yij = yi + j*ldy; \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( bcastbbs_mxn ) #endif // end bli_bcastbbs_mxn.h // begin bli_scal2bbs_mxn.h #ifndef BLIS_SCAL2BBS_MXN_H #define BLIS_SCAL2BBS_MXN_H // scal2bbs_mxn #undef GENTFUNCRO #define GENTFUNCRO( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( 
*yij, *yijd ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } \ } INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ const inc_t incx2 = 2 * incx; \ const inc_t ldx2 = 2 * ldx; \ \ const inc_t incy2 = 2 * incy; \ const inc_t ldy2 = 2 * ldy; \ \ ctype_r* restrict alpha_r = ( ctype_r* )alpha; \ ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \ ctype_r* restrict chi_r = ( ctype_r* )x; \ ctype_r* restrict chi_i = ( ctype_r* )x + 1; \ ctype_r* restrict psi_r = ( ctype_r* )y; \ ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ 
PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ } INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn ) #endif // end bli_scal2bbs_mxn.h // begin bli_set0bbs_mxn.h #ifndef BLIS_SET0BBS_MXN_H #define BLIS_SET0BBS_MXN_H // set0bbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yij = yj + i*incy; \ \ for ( dim_t p = 0; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,set0s)( *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( set0bbs_mxn ) #endif // end bli_set0bbs_mxn.h // -- 1m-specific scalar macros -- // 1e // begin bli_copy1es.h #ifndef BLIS_COPY1ES_H #define BLIS_COPY1ES_H // copy1es // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Every combination whose source or destination type is real (s or d)
// expands to an empty block: these variants are deliberate no-ops.

#define bli_sscopy1es( a, bri, bir ) {}
#define bli_dscopy1es( a, bri, bir ) {}
#define bli_cscopy1es( a, bri, bir ) {}
#define bli_zscopy1es( a, bri, bir ) {}

#define bli_sdcopy1es( a, bri, bir ) {}
#define bli_ddcopy1es( a, bri, bir ) {}
#define bli_cdcopy1es( a, bri, bir ) {}
#define bli_zdcopy1es( a, bri, bir ) {}

#define bli_sccopy1es( a, bri, bir ) {}
#define bli_dccopy1es( a, bri, bir ) {}

// Complex-to-complex variants issue two copyris calls:
//   1. ( real(a),  imag(a) ) paired with the real/imag components of bri;
//   2. ( -imag(a), real(a) ) paired with the real/imag components of bir.
// NOTE(review): presumably copyris( ar, ai, br, bi ) stores ar into br and ai
// into bi, which would make bri = a and bir = i*a -- confirm against the
// copyris definition.

#define bli_cccopy1es( a, bri, bir ) \
{ \
    bli_cccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopy1es( a, bri, bir ) \
{ \
    bli_zccopyris( bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopy1es( a, bri, bir ) {}
#define bli_dzcopy1es( a, bri, bir ) {}
#define bli_czcopy1es( a, bri, bir ) \
{ \
    bli_czcopyris( bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopy1es( a, bri, bir ) \
{ \
    bli_zzcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-character aliases for the homogeneous complex variants.

#define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir )
#define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir )

#endif
// end bli_copy1es.h
// begin bli_copyj1es.h

#ifndef BLIS_COPYJ1ES_H
#define BLIS_COPYJ1ES_H

// copyj1es

// Notes:
// - The first char encodes the type of x (the source; macro parameter a).
// - The second char encodes the type of y (the destinations bri and bir).
// Every combination whose source or destination type is real (s or d)
// expands to an empty block: these variants are deliberate no-ops.

#define bli_sscopyj1es( a, bri, bir ) {}
#define bli_dscopyj1es( a, bri, bir ) {}
#define bli_cscopyj1es( a, bri, bir ) {}
#define bli_zscopyj1es( a, bri, bir ) {}

#define bli_sdcopyj1es( a, bri, bir ) {}
#define bli_ddcopyj1es( a, bri, bir ) {}
#define bli_cdcopyj1es( a, bri, bir ) {}
#define bli_zdcopyj1es( a, bri, bir ) {}

#define bli_sccopyj1es( a, bri, bir ) {}
#define bli_dccopyj1es( a, bri, bir ) {}

// Conjugating counterparts of copy1es. Compared with copy1es the sign of
// imag(a) is flipped in both calls:
//   1. ( real(a), -imag(a) ) paired with the real/imag components of bri;
//   2. ( imag(a),  real(a) ) paired with the real/imag components of bir.
// NOTE(review): presumably copyris( ar, ai, br, bi ) stores ar into br and ai
// into bi, giving bri = conj(a) and bir = i*conj(a) -- confirm against the
// copyris definition.

#define bli_cccopyj1es( a, bri, bir ) \
{ \
    bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_cccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopyj1es( a, bri, bir ) \
{ \
    bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_zccopyris( bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopyj1es( a, bri, bir ) {}
#define bli_dzcopyj1es( a, bri, bir ) {}
#define bli_czcopyj1es( a, bri, bir ) \
{ \
    bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_czcopyris( bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopyj1es( a, bri, bir ) \
{ \
    bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_zzcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-character aliases for the homogeneous complex variants.

#define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir )
#define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir )

#endif
// end bli_copyj1es.h
// begin bli_invert1es.h

#ifndef BLIS_INVERT1ES_H
#define BLIS_INVERT1ES_H

// invert1es
//
// First applies invertris to the real/imag components of bri in place, then
// issues a copyris call with ( real(bri), -imag(bri) ) as the leading pair.
// Note the swapped destination order: imag(bir) is passed before real(bir),
// so the pairing is real(bri) <-> imag(bir) and -imag(bri) <-> real(bir).

#define bli_cinvert1es( bri, bir ) \
{ \
    bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \
    bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \
}

#define bli_zinvert1es( bri, bir ) \
{ \
    bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \
    bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \
}

#endif
// end bli_invert1es.h
// begin bli_scal1es.h

#ifndef BLIS_SCAL1ES_H
#define BLIS_SCAL1ES_H

// scal1es
//
// Applies scalris with the real/imag components of a to the components of
// yri in place, then issues a copyris call with ( -imag(yri), real(yri) )
// as the leading pair and the real/imag components of yir as the trailing
// pair, so yir is derived from the freshly scaled yri.

#define bli_cscal1es( a, yri, yir ) \
{ \
    bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \
    bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \
}

#define bli_zscal1es( a, yri, yir ) \
{ \
    bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \
    bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \
}

#endif
// end bli_scal1es.h
// begin bli_scal21es.h

#ifndef BLIS_SCAL21ES_H
#define BLIS_SCAL21ES_H

// scal21es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
//
// Every non-empty variant below has the same shape: two bli_cxscal2ris calls
// that share the components of a, where
//   1. the first passes (  real(x), imag(x) ) and the components of yri;
//   2. the second passes ( -imag(x), real(x) ) and the components of yir.
// Variants with a real destination (??s, ??d), and variants in which both a
// and x are real, expand to an empty block.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal21es( a, x, yri, yir ) {}
#define bli_sdsscal21es( a, x, yri, yir ) {}
#define bli_scsscal21es( a, x, yri, yir ) {}
#define bli_szsscal21es( a, x, yri, yir ) {}

#define bli_dssscal21es( a, x, yri, yir ) {}
#define bli_ddsscal21es( a, x, yri, yir ) {}
#define bli_dcsscal21es( a, x, yri, yir ) {}
#define bli_dzsscal21es( a, x, yri, yir ) {}

#define bli_cssscal21es( a, x, yri, yir ) {}
#define bli_cdsscal21es( a, x, yri, yir ) {}
#define bli_ccsscal21es( a, x, yri, yir ) {}
#define bli_czsscal21es( a, x, yri, yir ) {}

#define bli_zssscal21es( a, x, yri, yir ) {}
#define bli_zdsscal21es( a, x, yri, yir ) {}
#define bli_zcsscal21es( a, x, yri, yir ) {}
#define bli_zzsscal21es( a, x, yri, yir ) {}

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal21es( a, x, yri, yir ) {}
#define bli_sddscal21es( a, x, yri, yir ) {}
#define bli_scdscal21es( a, x, yri, yir ) {}
#define bli_szdscal21es( a, x, yri, yir ) {}

#define bli_dsdscal21es( a, x, yri, yir ) {}
#define bli_dddscal21es( a, x, yri, yir ) {}
#define bli_dcdscal21es( a, x, yri, yir ) {}
#define bli_dzdscal21es( a, x, yri, yir ) {}

#define bli_csdscal21es( a, x, yri, yir ) {}
#define bli_cddscal21es( a, x, yri, yir ) {}
#define bli_ccdscal21es( a, x, yri, yir ) {}
#define bli_czdscal21es( a, x, yri, yir ) {}

#define bli_zsdscal21es( a, x, yri, yir ) {}
#define bli_zddscal21es( a, x, yri, yir ) {}
#define bli_zcdscal21es( a, x, yri, yir ) {}
#define bli_zzdscal21es( a, x, yri, yir ) {}

// -- (axy) = (??c) ------------------------------------------------------------
//
// NOTE(review): although y is scomplex in this group, the destination
// components are accessed through bli_zreal/bli_zimag rather than
// bli_creal/bli_cimag. This looks harmless only if those accessors are
// type-agnostic -- confirm against their definitions.

#define bli_sscscal21es( a, x, yri, yir ) {}
#define bli_sdcscal21es( a, x, yri, yir ) {}
#define bli_sccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dscscal21es( a, x, yri, yir ) {}
#define bli_ddcscal21es( a, x, yri, yir ) {}
#define bli_dccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cscscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zscscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal21es( a, x, yri, yir ) {}
#define bli_sdzscal21es( a, x, yri, yir ) {}
#define bli_sczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dszscal21es( a, x, yri, yir ) {}
#define bli_ddzscal21es( a, x, yri, yir ) {}
#define bli_dczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cszscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zszscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// Single-character aliases for the homogeneous complex variants.

#define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir )
#define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir )

#endif
// end bli_scal21es.h
// begin bli_scal2j1es.h

#ifndef BLIS_SCAL2J1ES_H
#define BLIS_SCAL2J1ES_H

// scal2j1es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) #define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) #endif // end bli_scal2j1es.h // 1r // begin bli_copy1rs.h #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copy1rs.h // begin bli_copyj1rs.h #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copyj1rs.h // begin bli_invert1rs.h #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif // end bli_invert1rs.h // begin bli_scal1rs.h #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), 
bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif // end bli_scal1rs.h // begin bli_scal21rs.h #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif // end bli_scal21rs.h // begin bli_scal2j1rs.h #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif // end bli_scal2j1rs.h // 1m (1e or 1r) // begin bli_invert1ms_mxn_diag.h #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t 
i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_invert1ms_mxn_diag.h // begin bli_scal1ms_mxn.h #ifndef BLIS_SCAL1MS_MXN_H #define BLIS_SCAL1MS_MXN_H // scal1ms_mxn #define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; 
\ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #endif // end bli_scal1ms_mxn.h // begin bli_scal21ms_mxn.h #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } 
else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), 
*(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif // end bli_scal21ms_mxn.h // begin bli_scal21ms_mxn_diag.h #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_scal21ms_mxn_diag.h // begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING //thread communicators may be implementation dependent #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_single.h // begin bli_thrcomm_openmp.h #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H // Define thrcomm_t for situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #include // skipped // Define thrcomm_t for tree barriers and non-tree barriers. #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. 
//volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif // end bli_thrcomm_openmp.h // begin bli_thrcomm_pthreads.h #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H // Define thrcomm_t for situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS #ifdef BLIS_USE_PTHREAD_BARRIER struct thrcomm_s { void* sent_object; dim_t n_threads; bli_pthread_barrier_t barrier; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_pthreads.h // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. 
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ); void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ); void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ); void bli_thrcomm_cleanup( thrcomm_t* comm ); BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm ); BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm ); void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm ); #endif // end bli_thrcomm.h // Include thread info (thrinfo_t) object definitions and prototypes. // begin bli_thrinfo.h #ifndef BLIS_THRINFO_H #define BLIS_THRINFO_H // Thread info structure definition struct thrinfo_s { // The thread communicator for the other threads sharing the same work // at this level. thrcomm_t* ocomm; // Our thread id within the ocomm thread communicator. dim_t ocomm_id; // The number of distinct threads used to parallelize the loop. dim_t n_way; // What we're working on. dim_t work_id; // When freeing, should the communicators in this node be freed? Usually, // this is field is true, but when nodes are created that share the same // communicators as other nodes (such as with packm nodes), this is set // to false. bool free_comm; // The bszid_t to help identify the node. This is mostly only useful when // debugging or tracing the allocation and release of thrinfo_t nodes. bszid_t bszid; struct thrinfo_s* sub_prenode; struct thrinfo_s* sub_node; }; typedef struct thrinfo_s thrinfo_t; // // thrinfo_t functions // NOTE: The naming of these should be made consistent at some point. // (ie: bli_thrinfo_ vs. 
bli_thread_) // // thrinfo_t query (field only) BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) { return (t->ocomm)->n_threads; } BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) { return t->ocomm_id; } BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) { return t->n_way; } BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t ) { return t->work_id; } BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) { return t->ocomm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) { return t->free_comm; } BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) { return t->bszid; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) { return t->sub_node; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) { return t->ocomm_id == 0; } // thrinfo_t modification BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) { t->ocomm = ocomm; } BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) { t->ocomm_id = ocomm_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) { t->n_way = n_way; } BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t ) { t->work_id = work_id; } BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) { t->free_comm = free_comm; } BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) { t->bszid = bszid; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) { t->sub_node = sub_node; } BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) { t->sub_prenode = sub_prenode; } // other thrinfo_t-related functions BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } // // Prototypes 
for level-3 thrinfo functions not specific to any operation. // thrinfo_t* bli_thrinfo_create ( rntm_t* rntm, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node ); void bli_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node ); void bli_thrinfo_init_single ( thrinfo_t* thread ); void bli_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_thrinfo_grow ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); thrinfo_t* bli_thrinfo_rgrow ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_cur, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_create_for_cntl ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_chl, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_rgrow_prenode ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_cur, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_create_for_cntl_prenode ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_chl, thrinfo_t* thread_par ); // ----------------------------------------------------------------------------- #if 0 void bli_thrinfo_grow_tree ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); void bli_thrinfo_grow_tree_ic ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); #endif #endif // end bli_thrinfo.h // begin bli_thrinfo_sup.h #ifndef BLIS_THRINFO_SUP_H #define BLIS_THRINFO_SUP_H // // Prototypes for level-3 thrinfo sup functions. // void bli_thrinfo_sup_grow ( rntm_t* rntm, bszid_t* bszid_par, thrinfo_t* thread ); thrinfo_t* bli_thrinfo_sup_rgrow ( rntm_t* rntm, bszid_t* bszid_par, bszid_t* bszid_cur, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_sup_create_for_cntl ( rntm_t* rntm, bszid_t* bszid_par, bszid_t* bszid_chl, thrinfo_t* thread_par ); #endif // end bli_thrinfo_sup.h // Include some operation-specific thrinfo_t prototypes. 
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
// Takes the l3int_t function to run, the operation family, and the same
// operand/context arguments that l3int_t itself takes, minus the thrinfo_t*.
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// Include definitions specific to the method of multithreading for the
// conventional code path.
// begin bli_l3_decor_single.h

#ifndef BLIS_L3_DECOR_SINGLE_H
#define BLIS_L3_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (The guarded region is currently empty.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif
// end bli_l3_decor_single.h
// begin bli_l3_decor_openmp.h

#ifndef BLIS_L3_DECOR_OPENMP_H
#define BLIS_L3_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

void bli_l3_thread_decorator_thread_check
     (
       dim_t      n_threads,
       dim_t      tid,
       thrcomm_t* gl_comm,
       rntm_t*    rntm
     );

#endif

#endif
// end bli_l3_decor_openmp.h
// begin bli_l3_decor_pthreads.h

#ifndef BLIS_L3_DECOR_PTHREADS_H
#define BLIS_L3_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
// The void* -> void* signature matches the start_routine parameter of
// bli_pthread_create().
void* bli_l3_thread_entry( void* data_void );

#endif

#endif
// end bli_l3_decor_pthreads.h

#endif
// end bli_l3_decor.h

// Include the level-3 thread decorator and related definitions and prototypes
// for the sup code path.
// begin bli_l3_sup_decor.h

#ifndef BLIS_L3_SUP_DECOR_H
#define BLIS_L3_SUP_DECOR_H

// -- sup definitions ----------------------------------------------------------

// Level-3 sup internal function type.
// Unlike l3int_t, this type returns err_t and takes no cntl_t*.
typedef err_t (*l3supint_t)
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// Level-3 sup thread decorator prototype.
err_t bli_l3_sup_thread_decorator
     (
       l3supint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     );

// Include definitions specific to the method of multithreading for the
// sup code path.
// begin bli_l3_sup_decor_single.h

#ifndef BLIS_L3_SUP_DECOR_SINGLE_H
#define BLIS_L3_SUP_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (The guarded region is currently empty.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif
// end bli_l3_sup_decor_single.h
// begin bli_l3_sup_decor_openmp.h

#ifndef BLIS_L3_SUP_DECOR_OPENMP_H
#define BLIS_L3_SUP_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
// (The guarded region is currently empty.)
#ifdef BLIS_ENABLE_OPENMP

#endif

#endif
// end bli_l3_sup_decor_openmp.h
// begin bli_l3_sup_decor_pthreads.h

#ifndef BLIS_L3_SUP_DECOR_PTHREADS_H
#define BLIS_L3_SUP_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
void* bli_l3_sup_thread_entry( void* data_void );

#endif

#endif
// end bli_l3_sup_decor_pthreads.h

#endif
// end bli_l3_sup_decor.h

// Initialization-related prototypes.
void bli_thread_init( void );
void bli_thread_finalize( void );

// Thread range-related prototypes.
// Writes a range through the `start` and `end` out-pointers. This is the
// routine the slab variant bli_thread_range_jrir_sl() forwards to.
BLIS_EXPORT_BLIS
void bli_thread_range_sub
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end
     );

// GENPROT is redefined before each group of prototypes that share a
// parameter list; each GENPROT( name ) line then emits one prototype whose
// identifier is PASTEMAC0( name ).

#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       dir_t      direct, \
       thrinfo_t* thr, \
       obj_t*     a, \
       obj_t*     b, \
       obj_t*     c, \
       cntl_t*    cntl, \
       cntx_t*    cntx, \
       dim_t*     start, \
       dim_t*     end \
     );

GENPROT( thread_range_mdim )
GENPROT( thread_range_ndim )

#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       thrinfo_t* thr, \
       obj_t*     a, \
       blksz_t*   bmult, \
       dim_t*     start, \
       dim_t*     end \
     );

GENPROT( thread_range_l2r )
GENPROT( thread_range_r2l )
GENPROT( thread_range_t2b )
GENPROT( thread_range_b2t )

GENPROT( thread_range_weighted_l2r )
GENPROT( thread_range_weighted_r2l )
GENPROT( thread_range_weighted_t2b )
GENPROT( thread_range_weighted_b2t )

dim_t bli_thread_range_width_l
     (
       doff_t diagoff_j,
       dim_t  m,
       dim_t  n_j,
       dim_t  j,
       dim_t  n_way,
       dim_t  bf,
       dim_t  bf_left,
       double area_per_thr,
       bool   handle_edge_low
     );

siz_t bli_find_area_trap_l
     (
       dim_t  m,
       dim_t  n,
       doff_t diagoff
     );

// The two output pointers are restrict-qualified, so callers must pass
// distinct objects for j_start_thr and j_end_thr.
siz_t bli_thread_range_weighted_sub
     (
       thrinfo_t* restrict thread,
       doff_t              diagoff,
       uplo_t              uplo,
       dim_t               m,
       dim_t               n,
       dim_t               bf,
       bool                handle_edge_low,
       dim_t*     restrict j_start_thr,
       dim_t*     restrict j_end_thr
     );

// -----------------------------------------------------------------------------

// Factorization and partitioning prototypes

// State object passed to bli_prime_factorization() and
// bli_next_prime_factor().
// NOTE(review): the roles of the members (presumably: remaining value, its
// integer square root, current candidate factor) are inferred from their
// names only -- confirm against the implementation.
typedef struct
{
    dim_t n;
    dim_t sqrt_n;
    dim_t f;
} bli_prime_factors_t;

void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors);

dim_t bli_next_prime_factor(bli_prime_factors_t* factors);

bool bli_is_prime( dim_t n );

// The three bli_thread_partition_2x2*() functions share one signature: a
// thread count, two work sizes, and two restrict-qualified out-pointers.

void bli_thread_partition_2x2
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

void bli_thread_partition_2x2_slow
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

void bli_thread_partition_2x2_fast
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

//
----------------------------------------------------------------------------- dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); // ----------------------------------------------------------------------------- BLIS_INLINE void bli_thread_range_jrir_rr ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; } BLIS_INLINE void bli_thread_range_jrir_sl ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use contiguous slab partitioning of jr/ir loops. bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); *inc = 1; } BLIS_INLINE void bli_thread_range_jrir ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Define a general-purpose version of bli_thread_range_jrir() whose // definition depends on whether slab or round-robin partitioning was // requested at configure-time. 
#ifdef BLIS_ENABLE_JRIR_SLAB bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); #else bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); #endif } #if 0 BLIS_INLINE void bli_thread_range_weighted_jrir ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { #ifdef BLIS_ENABLE_JRIR_SLAB // Use contiguous slab partitioning for jr/ir loops. bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, handle_edge_low, start, end ); *start = *start / bf; *inc = 1; if ( *end % bf ) *end = *end / bf + 1; else *end = *end / bf; #else // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; #endif } #endif #endif // end bli_thread.h // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Constant definitions -- // begin bli_extern_defs.h #ifndef BLIS_EXTERN_DEFS_H #define BLIS_EXTERN_DEFS_H BLIS_EXPORT_BLIS extern obj_t BLIS_TWO; BLIS_EXPORT_BLIS extern obj_t BLIS_ONE; //BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO; //BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO; BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif // end bli_extern_defs.h // -- BLIS architecture/kernel definitions -- // begin bli_l1v_ker_prot.h // // Define template prototypes for level-1v kernels. 
//

// Each macro below takes a C element type (ctype), a type character (ch) and
// an operation name (opname), and expands to exactly one function prototype
// whose identifier is PASTEMAC(ch,opname). All pointer parameters in the
// generated prototypes are restrict-qualified.
//
// Every definition ends on the line holding the closing `);`. No macro ends
// with a dangling line continuation, so the definitions do not depend on a
// blank line following them.

#define ADDV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define AMAXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t* restrict cntx  \
     );

#define AXPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define AXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define COPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define DOTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );

#define DOTXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );

#define INVERTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );

#define SCALV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );

#define SCAL2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define SETV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );

#define SUBV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define SWAPV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define XPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// end bli_l1v_ker_prot.h
// begin bli_l1f_ker_prot.h

//
// Define template prototypes for level-1f kernels.
// #define AXPY2V_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alphax, \ ctype* restrict alphay, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define AXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define DOTAXPYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); // end bli_l1f_ker_prot.h // begin bli_l1m_ker_prot.h // // Define template prototypes for level-1m kernels. 
//

// native packm kernels

#define PACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     );

// native unpackm kernels

// Note the operand order: the packed buffer p comes before the matrix a
// here, the reverse of PACKM_KER_PROT.
#define UNPACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       dim_t            n, \
       ctype*  restrict kappa, \
       ctype*  restrict p,             inc_t ldp, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       cntx_t* restrict cntx  \
     );

// 1e/1r packm kernels

// The generated parameter list is identical to that of PACKM_KER_PROT.
#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     );

// end bli_l1m_ker_prot.h
// begin bli_l3_ukr_prot.h

//
// Define template prototypes for level-3 micro-kernels.
//

// Single-type convenience form: uses ctype for both the input and the output
// type of GEMM_UKR_PROT2.
#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)

// Two-type form: a and b use ctype_in; alpha, beta and c use ctype_out.
#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype_out* restrict alpha, \
       ctype_in*  restrict a, \
       ctype_in*  restrict b, \
       ctype_out* restrict beta, \
       ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

#define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a1x, \
       ctype*     restrict a11, \
       ctype*     restrict bx1, \
       ctype*     restrict b11, \
       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

#define TRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

// end bli_l3_ukr_prot.h
// begin bli_l3_sup_ker_prot.h

//
// Define template prototypes for level-3 kernels on small/unpacked matrices.
//

// Unlike the micro-kernel prototypes above, a and b each carry their own
// row and column strides here.
#define GEMMSUP_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

// end bli_l3_sup_ker_prot.h

// begin bli_arch_config_pre.h

#ifndef BLIS_ARCH_CONFIG_PRE_H
#define BLIS_ARCH_CONFIG_PRE_H

// -- Naming-related kernel definitions ----------------------------------------

// The default suffix appended to reference kernels.
#define BLIS_REF_SUFFIX  _ref

// A suffix used for labeling certain induced method aware functions.
#define BLIS_IND_SUFFIX  _ind

// Add an underscore to the BLIS kernel set string, if it was defined.
#ifdef  BLIS_CNAME
#define BLIS_CNAME_INFIX  PASTECH(_,BLIS_CNAME)
#endif

// Combine the CNAME and _ref for convenience to the code that defines
// reference kernels.
//#define BLIS_CNAME_REF_SUFFIX  PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX)

// -- Prototype-generating macro definitions -----------------------------------

// Prototype-generating macro for bli_cntx_init_*() functions.
// Expands to three prototypes per architecture name: the plain initializer,
// one with BLIS_REF_SUFFIX appended, and one with BLIS_IND_SUFFIX appended.
// Only the last takes an additional ind_t argument.
#define CNTX_INIT_PROTS( archname ) \
\
void PASTEMAC(cntx_init_,archname) \
     ( \
       cntx_t* cntx \
     ); \
void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \
     ( \
       cntx_t* cntx \
     ); \
void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \
     ( \
       ind_t   method, \
       cntx_t* cntx \
     );

#endif
// end bli_arch_config_pre.h
// begin bli_arch_config.h

#ifndef BLIS_ARCH_CONFIG_H
#define BLIS_ARCH_CONFIG_H

//
// -- Context initialization prototypes ----------------------------------------
//
// One CNTX_INIT_PROTS() expansion per sub-configuration, each emitted only
// when the corresponding BLIS_CONFIG_* macro is defined.

// -- Intel64 architectures --
#ifdef BLIS_CONFIG_SKX
CNTX_INIT_PROTS( skx )
#endif
#ifdef BLIS_CONFIG_KNL
CNTX_INIT_PROTS( knl )
#endif
#ifdef BLIS_CONFIG_KNC
CNTX_INIT_PROTS( knc )
#endif
#ifdef BLIS_CONFIG_HASWELL
CNTX_INIT_PROTS( haswell )
#endif
#ifdef BLIS_CONFIG_SANDYBRIDGE
CNTX_INIT_PROTS( sandybridge )
#endif
#ifdef BLIS_CONFIG_PENRYN
CNTX_INIT_PROTS( penryn )
#endif

// -- AMD64 architectures --
#ifdef BLIS_CONFIG_ZEN3
CNTX_INIT_PROTS( zen3 )
#endif
#ifdef BLIS_CONFIG_ZEN2
CNTX_INIT_PROTS( zen2 )
#endif
#ifdef BLIS_CONFIG_ZEN
CNTX_INIT_PROTS( zen )
#endif
#ifdef BLIS_CONFIG_EXCAVATOR
CNTX_INIT_PROTS( excavator )
#endif
#ifdef BLIS_CONFIG_STEAMROLLER
CNTX_INIT_PROTS( steamroller )
#endif
#ifdef BLIS_CONFIG_PILEDRIVER
CNTX_INIT_PROTS( piledriver )
#endif
#ifdef BLIS_CONFIG_BULLDOZER
CNTX_INIT_PROTS( bulldozer )
#endif

// -- ARM architectures --
#ifdef BLIS_CONFIG_ARMSVE
CNTX_INIT_PROTS( armsve )
#endif
#ifdef BLIS_CONFIG_A64FX
CNTX_INIT_PROTS( a64fx )
#endif
#ifdef BLIS_CONFIG_FIRESTORM
CNTX_INIT_PROTS( firestorm )
#endif
#ifdef BLIS_CONFIG_THUNDERX2
CNTX_INIT_PROTS( thunderx2 )
#endif
#ifdef BLIS_CONFIG_CORTEXA57
CNTX_INIT_PROTS( cortexa57 )
#endif
#ifdef BLIS_CONFIG_CORTEXA53
CNTX_INIT_PROTS( cortexa53 )
#endif
#ifdef BLIS_CONFIG_CORTEXA15
CNTX_INIT_PROTS( cortexa15 )
#endif
#ifdef BLIS_CONFIG_CORTEXA9
CNTX_INIT_PROTS( cortexa9 )
#endif

// -- IBM Power --
#ifdef BLIS_CONFIG_POWER10
CNTX_INIT_PROTS( power10 )
#endif
#ifdef BLIS_CONFIG_POWER9
CNTX_INIT_PROTS( power9 )
#endif
#ifdef BLIS_CONFIG_POWER7
CNTX_INIT_PROTS( power7 )
#endif

// -- IBM BG/Q --
#ifdef BLIS_CONFIG_BGQ
CNTX_INIT_PROTS( bgq )
#endif

// -- Generic --
#ifdef BLIS_CONFIG_GENERIC
CNTX_INIT_PROTS( generic )
#endif

//
// -- Architecture family-specific headers -------------------------------------
//
// Family headers marked "// skipped" are left as #include directives; the
// others appear inline between "// begin" and "// end" markers.

// -- x86_64 families --

#ifdef BLIS_FAMILY_INTEL64
#include "bli_family_intel64.h" // skipped
#endif
#ifdef BLIS_FAMILY_AMD64
#include "bli_family_amd64.h" // skipped
#endif
#ifdef BLIS_FAMILY_AMD64_LEGACY
#include "bli_family_amd64_legacy.h" // skipped
#endif
#ifdef BLIS_FAMILY_X86_64
#include "bli_family_x86_64.h" // skipped
#endif

#ifdef BLIS_FAMILY_X86_64_NO_SKX
// begin bli_family_x86_64_no_skx.h

// (This family header contains no active definitions.)
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H

//#endif
// end bli_family_x86_64_no_skx.h
#endif
#ifdef BLIS_FAMILY_X86_64_NO_ZEN2
#include "bli_family_x86_64_no_zen2.h" // skipped
#endif
#ifdef BLIS_FAMILY_X86_64_NO_ZEN3
#include "bli_family_x86_64_no_zen3.h" // skipped
#endif

// -- Intel64 architectures --
#ifdef BLIS_FAMILY_SKX
#include "bli_family_skx.h" // skipped
#endif
#ifdef BLIS_FAMILY_KNL
#include "bli_family_knl.h" // skipped
#endif
#ifdef BLIS_FAMILY_KNC
#include "bli_family_knc.h" // skipped
#endif
#ifdef BLIS_FAMILY_HASWELL
// begin bli_family_haswell.h

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H

// Everything from here to the matching #endif is compiled out by this outer
// `#if 0`, including the inner `#if 1` alternatives.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS ---------------------------

// -- sgemm micro-kernel --

#if 0
#define BLIS_SGEMM_UKERNEL         bli_sgemm_asm_4x24
#define BLIS_DEFAULT_MC_S          256
#define BLIS_DEFAULT_KC_S          256
#define BLIS_DEFAULT_NC_S          4080
#define BLIS_DEFAULT_MR_S          4
#define BLIS_DEFAULT_NR_S          24

#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif

#if 1
#define BLIS_SGEMM_UKERNEL         bli_sgemm_asm_6x16
#define BLIS_DEFAULT_MC_S          144
#define BLIS_DEFAULT_KC_S          256
#define BLIS_DEFAULT_NC_S          4080
#define BLIS_DEFAULT_MR_S          6
#define BLIS_DEFAULT_NR_S          16

#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif

#if 0
#define BLIS_SGEMM_UKERNEL         bli_sgemm_asm_16x6
#define BLIS_DEFAULT_MC_S          144
#define BLIS_DEFAULT_KC_S          256
#define BLIS_DEFAULT_NC_S          4080
#define BLIS_DEFAULT_MR_S          16
#define BLIS_DEFAULT_NR_S          6
#endif

// -- dgemm micro-kernel --

#if 0
#define BLIS_DGEMM_UKERNEL         bli_dgemm_asm_4x12
#define BLIS_DEFAULT_MC_D          152
#define BLIS_DEFAULT_KC_D          160
#define BLIS_DEFAULT_NC_D          4080
#define BLIS_DEFAULT_MR_D          4
#define BLIS_DEFAULT_NR_D          12

#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif

#if 1
#define BLIS_DGEMM_UKERNEL         bli_dgemm_asm_6x8
#define BLIS_DEFAULT_MC_D          72
#define BLIS_DEFAULT_KC_D          256
#define BLIS_DEFAULT_NC_D          4080
#define BLIS_DEFAULT_MR_D          6
#define BLIS_DEFAULT_NR_D          8

#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif

#if 0
#define BLIS_DGEMM_UKERNEL         bli_dgemm_asm_8x6
#define BLIS_DEFAULT_MC_D          72
#define BLIS_DEFAULT_KC_D          256
#define BLIS_DEFAULT_NC_D          4080
#define BLIS_DEFAULT_MR_D          8
#define BLIS_DEFAULT_NR_D          6
#endif

// -- cgemm micro-kernel --

#if 1
#define BLIS_CGEMM_UKERNEL         bli_cgemm_asm_3x8
#define BLIS_DEFAULT_MC_C          144
#define BLIS_DEFAULT_KC_C          256
#define BLIS_DEFAULT_NC_C          4080
#define BLIS_DEFAULT_MR_C          3
#define BLIS_DEFAULT_NR_C          8

#define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif

#if 0
#define BLIS_CGEMM_UKERNEL         bli_cgemm_asm_8x3
#define BLIS_DEFAULT_MC_C          144
#define BLIS_DEFAULT_KC_C          256
#define BLIS_DEFAULT_NC_C          4080
#define BLIS_DEFAULT_MR_C          8
#define BLIS_DEFAULT_NR_C          3
#endif

// -- zgemm micro-kernel --

#if 1
#define BLIS_ZGEMM_UKERNEL         bli_zgemm_asm_3x4
#define BLIS_DEFAULT_MC_Z          72
#define BLIS_DEFAULT_KC_Z          256
#define BLIS_DEFAULT_NC_Z          4080
#define BLIS_DEFAULT_MR_Z          3
#define BLIS_DEFAULT_NR_Z          4

#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif

#if 0
#define BLIS_ZGEMM_UKERNEL         bli_zgemm_asm_4x3
#define BLIS_DEFAULT_MC_Z          72
#define BLIS_DEFAULT_KC_Z          256
#define BLIS_DEFAULT_NC_Z          4080
#define BLIS_DEFAULT_MR_Z          4
#define BLIS_DEFAULT_NR_Z          3
#endif

#endif

//#endif

// end bli_family_haswell.h
#endif
#ifdef BLIS_FAMILY_SANDYBRIDGE
// begin bli_family_sandybridge.h

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x4 #define BLIS_DEFAULT_MC_D 96 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 4 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_sandybridge.h #endif #ifdef BLIS_FAMILY_PENRYN // begin bli_family_penryn.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x4 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MC_S 768 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x4 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 4 #define BLIS_DEFAULT_MC_D 384 #define BLIS_DEFAULT_KC_D 384 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_asm_4x4 #define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_asm_4x4 // -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPY2V_KERNEL bli_daxpy2v_int_var1 #define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_int_var1 #define BLIS_DAXPYF_KERNEL bli_daxpyf_int_var1 #define BLIS_DDOTXF_KERNEL 
bli_ddotxf_int_var1 #define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_int_var1 // -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 #define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 #endif //#endif // end bli_family_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_FAMILY_ZEN3 #include "bli_family_zen3.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN2 #include "bli_family_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN #include "bli_family_zen.h" // skipped #endif #ifdef BLIS_FAMILY_EXCAVATOR // begin bli_family_excavator.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 3 #define BLIS_DEFAULT_MC_S 528 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_DEFAULT_MC_D 264 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_DEFAULT_MC_C 264 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #define BLIS_DEFAULT_MC_Z 100 #define BLIS_DEFAULT_KC_Z 320 #define BLIS_DEFAULT_NC_Z 8400 #endif //#endif // end bli_family_excavator.h #endif #ifdef BLIS_FAMILY_STEAMROLLER // begin bli_family_steamroller.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 //#endif // end bli_family_steamroller.h #endif #ifdef BLIS_FAMILY_PILEDRIVER // begin 
bli_family_piledriver.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MC_S 2016 #define BLIS_DEFAULT_KC_S 128 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 3 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MC_D 1008 #define BLIS_DEFAULT_KC_D 128 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MC_C 512 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MC_Z 400 #define BLIS_DEFAULT_KC_Z 160 #define BLIS_DEFAULT_NC_Z 8400 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #endif //#endif // end bli_family_piledriver.h #endif #ifdef BLIS_FAMILY_BULLDOZER // begin bli_family_bulldozer.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8_fma4 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x6_fma4 #define BLIS_DEFAULT_MC_D 1080 #define BLIS_DEFAULT_KC_D 120 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 6 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4_fma4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4_fma4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 
#define BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_bulldozer.h #endif // -- ARM families -- #ifdef BLIS_FAMILY_ARM64 #include "bli_family_arm64.h" // skipped #endif #ifdef BLIS_FAMILY_ARM32 #include "bli_family_arm32.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_FAMILY_ARMSVE #include "bli_family_armsve.h" // skipped #endif #ifdef BLIS_FAMILY_A64FX #include "bli_family_a64fx.h" // skipped #endif #ifdef BLIS_FAMILY_FIRESTORM #include "bli_family_firestorm.h" // skipped #endif #ifdef BLIS_FAMILY_THUNDERX2 #include "bli_family_thunderx2.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA57 #include "bli_family_cortexa57.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA53 #include "bli_family_cortexa53.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA15 #include "bli_family_cortexa15.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA9 #include "bli_family_cortexa9.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_FAMILY_POWER10 #include "bli_family_power10.h" // skipped #endif #ifdef BLIS_FAMILY_POWER9 #include "bli_family_power9.h" // skipped #endif #ifdef BLIS_FAMILY_POWER7 #include "bli_family_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_FAMILY_BGQ #include "bli_family_bgq.h" // skipped #endif // -- Generic -- #ifdef BLIS_FAMILY_GENERIC // begin bli_family_generic.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_generic.h #endif // // -- kernel set prototypes ---------------------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_KERNELS_SKX #include "bli_kernels_skx.h" // skipped #endif #ifdef BLIS_KERNELS_KNL #include "bli_kernels_knl.h" // skipped #endif #ifdef BLIS_KERNELS_KNC #include "bli_kernels_knc.h" // skipped #endif #ifdef BLIS_KERNELS_HASWELL // begin bli_kernels_haswell.h // -- level-1m ----------------------------------------------------------------- // packm (asm) PACKM_KER_PROT( float, s, packm_haswell_asm_6xk 
) PACKM_KER_PROT( float, s, packm_haswell_asm_16xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_6xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_8xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_3xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_8xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_3xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_4xk ) // -- level-3 ------------------------------------------------------------------ // gemm (asm d6x8) GEMM_UKR_PROT( float, s, gemm_haswell_asm_6x16 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_6x8 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_3x8 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_3x4 ) // gemm (asm d8x6) GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // gemmtrsm_l (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_haswell_asm_6x8 ) // gemmtrsm_u (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 ) // gemm (asm d8x6) //GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) //GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) //GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) //GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // -- level-3 sup -------------------------------------------------------------- // -- single real -- // gemmsup_r GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16 ) GEMMSUP_KER_PROT( 
float, s, gemmsup_rv_haswell_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, 
gemmsup_rd_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16n ) // -- double real -- // gemmsup_r GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, 
gemmsup_rv_haswell_asm_5x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8n ) // gemmsup_rd GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) 
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8n ) // end bli_kernels_haswell.h #endif #ifdef BLIS_KERNELS_SANDYBRIDGE // begin bli_kernels_sandybridge.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_sandybridge_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_asm_4x4 ) // d8x4 (intrinsics) GEMM_UKR_PROT( float, s, gemm_sandybridge_int_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_int_4x4 ) // end bli_kernels_sandybridge.h #endif #ifdef BLIS_KERNELS_PENRYN // begin bli_kernels_penryn.h GEMM_UKR_PROT( float, s, gemm_penryn_asm_8x4 ) GEMM_UKR_PROT( double, d, gemm_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_l_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_u_penryn_asm_4x4 ) // end bli_kernels_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_KERNELS_ZEN2 #include "bli_kernels_zen2.h" // skipped #endif #ifdef BLIS_KERNELS_ZEN // begin bli_kernels_zen.h // -- level-1m -- PACKM_KER_PROT(double, d, packm_8xk_gen_zen) PACKM_KER_PROT(double, d, packm_6xk_gen_zen) PACKM_KER_PROT(double, d, packm_8xk_nn_zen) PACKM_KER_PROT(double, d, packm_6xk_nn_zen) // -- level-1v -- // amaxv (intrinsics) AMAXV_KER_PROT( float, s, amaxv_zen_int ) AMAXV_KER_PROT( double, d, amaxv_zen_int ) // axpyv (intrinsics) AXPYV_KER_PROT( float, s, axpyv_zen_int ) AXPYV_KER_PROT( double, d, axpyv_zen_int ) // axpyv (intrinsics unrolled x10) AXPYV_KER_PROT( float, s, axpyv_zen_int10 ) AXPYV_KER_PROT( double, d, axpyv_zen_int10 ) // dotv (intrinsics) DOTV_KER_PROT( float, 
s, dotv_zen_int ) DOTV_KER_PROT( double, d, dotv_zen_int ) // dotv (intrinsics, unrolled x10) DOTV_KER_PROT( float, s, dotv_zen_int10 ) DOTV_KER_PROT( double, d, dotv_zen_int10 ) // dotxv (intrinsics) DOTXV_KER_PROT( float, s, dotxv_zen_int ) DOTXV_KER_PROT( double, d, dotxv_zen_int ) // scalv (intrinsics) SCALV_KER_PROT( float, s, scalv_zen_int ) SCALV_KER_PROT( double, d, scalv_zen_int ) // scalv (intrinsics unrolled x10) SCALV_KER_PROT( float, s, scalv_zen_int10 ) SCALV_KER_PROT( double, d, scalv_zen_int10 ) SCALV_KER_PROT( scomplex, c, scalv_zen_int10 ) // swapv (intrinsics) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // copyv (intrinsics) COPYV_KER_PROT( float, s, copyv_zen_int ) COPYV_KER_PROT( double, d, copyv_zen_int ) // SETV_KER_PROT(float, s, setv_zen_int) SETV_KER_PROT(double, d, setv_zen_int) // swapv (intrinsics) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // -- level-1f -- // axpyf (intrinsics) AXPYF_KER_PROT( float, s, axpyf_zen_int_8 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_8 ) AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_16x4 ) AXPYF_KER_PROT( scomplex, c, axpyf_zen_int_4 ) // dotxf (intrinsics) DOTXF_KER_PROT( float, s, dotxf_zen_int_8 ) DOTXF_KER_PROT( double, d, dotxf_zen_int_8 ) // -- level-3 sup -------------------------------------------------------------- // semmsup_rv //GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x8 ) GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_zen_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_4x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_1x1 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4) GEMMSUP_KER_PROT( float, s, 
gemmsup_rd_zen_asm_2x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x8m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16n) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x2 ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4n ) 
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x1 ) // end bli_kernels_zen.h #endif //#ifdef BLIS_KERNELS_EXCAVATOR //#include "bli_kernels_excavator.h" //#endif //#ifdef BLIS_KERNELS_STEAMROLLER //#include "bli_kernels_steamroller.h" //#endif #ifdef BLIS_KERNELS_PILEDRIVER // begin bli_kernels_piledriver.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_piledriver_asm_16x3 ) GEMM_UKR_PROT( double, d, gemm_piledriver_asm_8x3 ) GEMM_UKR_PROT( scomplex, c, gemm_piledriver_asm_4x2 ) GEMM_UKR_PROT( dcomplex, z, gemm_piledriver_asm_2x2 ) // end bli_kernels_piledriver.h #endif #ifdef BLIS_KERNELS_BULLDOZER // begin bli_kernels_bulldozer.h GEMM_UKR_PROT( float, s, gemm_bulldozer_asm_8x8_fma4 ) GEMM_UKR_PROT( double, d, gemm_bulldozer_asm_4x6_fma4 ) GEMM_UKR_PROT( scomplex, c, gemm_bulldozer_asm_8x4_fma4 ) GEMM_UKR_PROT( dcomplex, z, gemm_bulldozer_asm_4x4_fma4 ) // end bli_kernels_bulldozer.h #endif // -- ARM architectures -- #ifdef BLIS_KERNELS_ARMSVE #include "bli_kernels_armsve.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV8A #include "bli_kernels_armv8a.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV7A #include "bli_kernels_armv7a.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_KERNELS_POWER10 #include "bli_kernels_power10.h" // skipped #endif #ifdef BLIS_KERNELS_POWER9 #include "bli_kernels_power9.h" // skipped #endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_KERNELS_BGQ #include "bli_kernels_bgq.h" // skipped #endif #endif // end bli_arch_config.h // begin bli_kernel_macro_defs.h #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H // -- Define default threading parameters -------------------------------------- // -- Conventional (large code path) values -- // These BLIS_THREAD_RATIO_? 
macros distort the amount of work in the m and n // dimensions for the purposes of factorizing the total number of threads into // ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these // macros are used. #ifndef BLIS_THREAD_RATIO_M #define BLIS_THREAD_RATIO_M 1 #endif #ifndef BLIS_THREAD_RATIO_N #define BLIS_THREAD_RATIO_N 1 #endif // These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of // parallelism allowed when performing automatic factorization. See bli_rntm.c // to see how these macros are used. #ifndef BLIS_THREAD_MAX_IR #define BLIS_THREAD_MAX_IR 1 #endif #ifndef BLIS_THREAD_MAX_JR #define BLIS_THREAD_MAX_JR 4 #endif #if 0 // -- Skinny/small possibly-unpacked (sup code path) values -- #ifndef BLIS_THREAD_SUP_RATIO_M #define BLIS_THREAD_SUP_RATIO_M 1 #endif #ifndef BLIS_THREAD_SUP_RATIO_N #define BLIS_THREAD_SUP_RATIO_N 2 #endif #ifndef BLIS_THREAD_SUP_MAX_IR #define BLIS_THREAD_SUP_MAX_IR 1 #endif #ifndef BLIS_THREAD_SUP_MAX_JR #define BLIS_THREAD_SUP_MAX_JR 8 #endif #endif // -- Memory allocation -------------------------------------------------------- // hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with // libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND // was explicitly defined. #ifdef BLIS_DISABLE_MEMKIND #undef BLIS_ENABLE_MEMKIND #endif #ifdef BLIS_ENABLE_MEMKIND #include // skipped #endif // Memory allocation functions. These macros define the three types of // malloc()-style functions, and their free() counterparts: one for each // type of memory to be allocated. // NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING // THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc() // and free(): // // void* malloc( size_t size ); // void free( void* p ); // // This allocation function is called to allocate memory for blocks within // BLIS's internal memory pools. 
// Pool allocator: an existing definition of BLIS_MALLOC_POOL is left alone.
// Otherwise, if use of libmemkind was enabled at configure-time, memory
// pools allocate with hbw_malloc(); failing that they use malloc().
#if defined( BLIS_MALLOC_POOL )
  // Already chosen; nothing to do.
#elif defined( BLIS_ENABLE_MEMKIND )
  #define BLIS_MALLOC_POOL hbw_malloc
#else
  #define BLIS_MALLOC_POOL malloc
#endif

// Pool deallocator: mirrors the selection above, pairing hbw_free() with
// hbw_malloc() and free() with malloc().
#if defined( BLIS_FREE_POOL )
  // Already chosen; nothing to do.
#elif defined( BLIS_ENABLE_MEMKIND )
  #define BLIS_FREE_POOL hbw_free
#else
  #define BLIS_FREE_POOL free
#endif

// Allocator pair for internally-used objects and structures, such as
// control tree nodes.
#if !defined( BLIS_MALLOC_INTL )
  #define BLIS_MALLOC_INTL malloc
#endif

#if !defined( BLIS_FREE_INTL )
  #define BLIS_FREE_INTL free
#endif

// Allocator pair for objects created by user-level API functions, such as
// bli_obj_create().
#if !defined( BLIS_MALLOC_USER )
  #define BLIS_MALLOC_USER malloc
#endif

#if !defined( BLIS_FREE_USER )
  #define BLIS_FREE_USER free
#endif

// -- Other system-related definitions -----------------------------------------

// Size of a virtual memory page. This is used to align blocks within the
// memory pools.
#if !defined( BLIS_PAGE_SIZE )
  #define BLIS_PAGE_SIZE 4096
#endif

// The maximum number of named SIMD vector registers available for use.
// When configuring with umbrella configuration families, this should be
// set to the maximum number of registers across all sub-configurations in
// the family.
#if !defined( BLIS_SIMD_MAX_NUM_REGISTERS )
  #define BLIS_SIMD_MAX_NUM_REGISTERS 32
#endif

// The maximum size (in bytes) of each SIMD vector.
// When configuring with umbrella configuration families, this should be
// set to the maximum SIMD size across all sub-configurations in the family.
#if !defined( BLIS_SIMD_MAX_SIZE )
  #define BLIS_SIMD_MAX_SIZE 64
#endif

// Alignment size (in bytes) needed by the instruction set for aligned
// SIMD/vector instructions. Falls back to the maximum SIMD vector size.
#if !defined( BLIS_SIMD_ALIGN_SIZE )
  #define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_MAX_SIZE
#endif

// The maximum size in bytes of local stack buffers within macro-kernel
// functions. These buffers are usually used to store a temporary copy
// of a single microtile. The reason we multiply by 2 is to handle induced
// methods, where we use real domain register blocksizes in units of
// complex elements. Specifically, the macro-kernels will need this larger
// micro-tile footprint, even though the virtual micro-kernels will only
// ever be writing to half (real or imaginary part) at a time.
#if !defined( BLIS_STACK_BUF_MAX_SIZE )
  #define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_MAX_NUM_REGISTERS * BLIS_SIMD_MAX_SIZE * 2 )
#endif

// Alignment size used to align local stack buffers within macro-kernel
// functions.
#if !defined( BLIS_STACK_BUF_ALIGN_SIZE )
  #define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when allocating memory via BLIS_MALLOC_USER.
// To disable heap alignment, set this to 1.
#if !defined( BLIS_HEAP_ADDR_ALIGN_SIZE )
  #define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when sizing leading dimensions of memory allocated
// via BLIS_MALLOC_USER.
#if !defined( BLIS_HEAP_STRIDE_ALIGN_SIZE )
  #define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment sizes used when allocating blocks to the internal memory
// pool, via BLIS_MALLOC_POOL.
// Each of the four pool alignment macros below (_A, _B, _C, _GEN) defaults
// to BLIS_PAGE_SIZE unless it was already defined.
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A
#define BLIS_POOL_ADDR_ALIGN_SIZE_A      BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B
#define BLIS_POOL_ADDR_ALIGN_SIZE_B      BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C
#define BLIS_POOL_ADDR_ALIGN_SIZE_C      BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN
#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN    BLIS_PAGE_SIZE
#endif

// Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*.
// Each offset defaults to 0 unless it was already defined.
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A
#define BLIS_POOL_ADDR_OFFSET_SIZE_A     0
#endif

#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B
#define BLIS_POOL_ADDR_OFFSET_SIZE_B     0
#endif

#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C
#define BLIS_POOL_ADDR_OFFSET_SIZE_C     0
#endif

#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN
#define BLIS_POOL_ADDR_OFFSET_SIZE_GEN   0
#endif

// NOTE(review): this #endif closes a conditional opened before this section
// (presumably the include guard of bli_kernel_macro_defs.h) -- confirm.
#endif

// end bli_kernel_macro_defs.h


// -- Base operation prototypes --

// begin bli_init.h

// Library initialization/finalization prototypes. Only bli_init() and
// bli_finalize() carry the BLIS_EXPORT_BLIS qualifier.
BLIS_EXPORT_BLIS void bli_init( void );
BLIS_EXPORT_BLIS void bli_finalize( void );

void bli_init_auto( void );
void bli_finalize_auto( void );

void bli_init_apis( void );
void bli_finalize_apis( void );

void bli_init_once( void );
void bli_finalize_once( void );
// end bli_init.h
// begin bli_malloc.h

// Typedef function pointer types for malloc() and free() substitutes.
// NOTE: the two typedefs below are commented out in this header; malloc_ft
// and free_ft are nevertheless used by the prototypes that follow, so they
// must be declared elsewhere.
//typedef void* (*malloc_ft) ( size_t size );
//typedef void  (*free_ft)   ( void*  p    );

// -----------------------------------------------------------------------------

// Disabled (compiled out) pool allocation prototypes.
#if 0
BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size );
BLIS_EXPORT_BLIS void  bli_free_pool( void* p );
#endif

// Allocation prototypes for internal use; the malloc/calloc variants report
// status through the err_t* r_val argument.
void* bli_malloc_intl( size_t size, err_t* r_val );
void* bli_calloc_intl( size_t size, err_t* r_val );
void  bli_free_intl( void* p );

// Allocation prototypes for user-level objects.
BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val );
BLIS_EXPORT_BLIS void  bli_free_user( void* p );

// -----------------------------------------------------------------------------

// Generic allocation helpers that take the underlying malloc()/free()
// substitute as a function-pointer argument (f).
void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val );
void  bli_ffree_align( free_ft f, void* p );

void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val );
void  bli_ffree_noalign( free_ft f, void* p );

void  bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size );
void  bli_fmalloc_post_check( void* p );
// end bli_malloc.h
// begin bli_const.h

void bli_const_init( void );
void bli_const_finalize( void );
// end bli_const.h
// begin bli_obj.h
// begin bli_obj_check.h

// Argument-checking prototypes; each mirrors the parameter list of an
// object-management function declared further below.
void bli_obj_create_check( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj );

void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj );

void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj );

void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj );

void bli_obj_create_scalar_check( num_t dt, obj_t* obj );

void bli_obj_free_check( obj_t* obj );

void bli_obj_create_const_check( double value, obj_t* obj );

void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b );

void bli_dt_size_check( num_t dt );

void bli_dt_string_check( num_t dt );

void bli_dt_union_check( num_t dt1, num_t dt2 );

void bli_obj_print_check( char* label, obj_t* obj );
// end bli_obj_check.h

// Object creation, buffer management and query prototypes (definitions are
// not part of this header section).
BLIS_EXPORT_BLIS void bli_obj_create
     (
       num_t  dt,
       dim_t  m,
       dim_t  n,
       inc_t  rs,
       inc_t  cs,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer
     (
       num_t  dt,
       dim_t  m,
       dim_t  n,
       void*  p,
       inc_t  rs,
       inc_t  cs,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_without_buffer
     (
       num_t  dt,
       dim_t  m,
       dim_t  n,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_alloc_buffer
     (
       inc_t  rs,
       inc_t  cs,
       inc_t  is,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_attach_buffer
     (
       void*  p,
       inc_t  rs,
       inc_t  cs,
       inc_t  is,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_1x1
     (
       num_t  dt,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer
     (
       num_t  dt,
       void*  p,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_conf_to
     (
       obj_t* s,
       obj_t* d
     );

BLIS_EXPORT_BLIS void bli_obj_free
     (
       obj_t* obj
     );

void bli_adjust_strides
     (
       dim_t  m,
       dim_t  n,
       siz_t  elem_size,
       inc_t* rs,
       inc_t* cs,
       inc_t* is
     );

BLIS_EXPORT_BLIS siz_t bli_dt_size
     (
       num_t dt
     );

BLIS_EXPORT_BLIS char* bli_dt_string
     (
       num_t dt
     );

BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult
     (
       dim_t dim,
       dim_t dim_mult
     );

BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size
     (
       dim_t dim,
       siz_t elem_size,
       siz_t align_size
     );

BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size
     (
       void*  p,
       size_t align_size
     );

BLIS_EXPORT_BLIS void bli_obj_print
     (
       char*  label,
       obj_t* obj
     );
// end bli_obj.h
// begin bli_obj_scalar.h

// Prototypes operating on the scalar attached to an obj_t.
BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached
     (
       num_t  dt,
       obj_t* beta
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of
     (
       num_t  dt,
       conj_t conj,
       obj_t* alpha,
       obj_t* beta
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_detach
     (
       obj_t* a,
       obj_t* alpha
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_attach
     (
       conj_t conj,
       obj_t* alpha,
       obj_t* a
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to
     (
       num_t  dt,
       obj_t* a
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar
     (
       obj_t* alpha,
       obj_t* a
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_reset
     (
       obj_t* a
     );

BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag
     (
       obj_t* a
     );

BLIS_EXPORT_BLIS bool bli_obj_scalar_equals
     (
       obj_t* a,
       obj_t* beta
     );
// end bli_obj_scalar.h
// begin bli_blksz.h

// blksz_t query

// Return the default blocksize stored for datatype dt, i.e. b->v[ dt ].
BLIS_INLINE dim_t bli_blksz_get_def
     (
       num_t    dt,
       blksz_t* b
     )
{
	return b->v[ dt ];
}

// Return the maximum blocksize stored for datatype dt, i.e. b->e[ dt ].
BLIS_INLINE dim_t bli_blksz_get_max
     (
       num_t    dt,
       blksz_t* b
     )
{
	return b->e[ dt ];
}


// blksz_t modification

// Store val as the default blocksize for datatype dt.
BLIS_INLINE void bli_blksz_set_def
     (
       dim_t    val,
       num_t    dt,
       blksz_t* b
     )
{
	b->v[ dt ] = val;
}

// Store val as the maximum blocksize for datatype dt.
BLIS_INLINE void bli_blksz_set_max
     (
       dim_t    val,
       num_t    dt,
       blksz_t* b
     )
{
	b->e[ dt ] = val;
}

// Copy the whole blksz_t structure from b_src to b_dst by assignment.
BLIS_INLINE void bli_blksz_copy
     (
       blksz_t* b_src,
       blksz_t* b_dst
     )
{
	*b_dst = *b_src;
}

// Copy only the strictly positive default/maximum values from b_src to
// b_dst; entries of b_dst whose source value is <= 0 are left untouched.
BLIS_INLINE void bli_blksz_copy_if_pos
     (
       blksz_t* b_src,
       blksz_t* b_dst
     )
{
	// Copy the blocksize values over to b_dst one-by-one so that
	// we can skip the ones that are non-positive.

	const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT,    b_src );
	const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE,   b_src );
	const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src );
	const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src );

	const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT,    b_src );
	const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE,   b_src );
	const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src );
	const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src );

	if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT,    b_dst );
	if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE,   b_dst );
	if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst );
	if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst );

	if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT,    b_dst );
	if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE,   b_dst );
	if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst );
	if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst );
}

// Copy the default blocksize of (dt_src, b_src) into (dt_dst, b_dst).
BLIS_INLINE void bli_blksz_copy_def_dt
     (
       num_t dt_src, blksz_t* b_src,
       num_t dt_dst, blksz_t* b_dst
     )
{
	const dim_t val = bli_blksz_get_def( dt_src, b_src );

	bli_blksz_set_def( val, dt_dst, b_dst );
}

// Copy the maximum blocksize of (dt_src, b_src) into (dt_dst, b_dst).
BLIS_INLINE void bli_blksz_copy_max_dt
     (
       num_t dt_src, blksz_t* b_src,
       num_t dt_dst, blksz_t* b_dst
     )
{
	const dim_t val = bli_blksz_get_max( dt_src, b_src );

	bli_blksz_set_max( val, dt_dst, b_dst );
}

// Copy both the default and the maximum blocksize of (dt_src, b_src) into
// (dt_dst, b_dst).
BLIS_INLINE void bli_blksz_copy_dt
     (
       num_t dt_src, blksz_t* b_src,
       num_t dt_dst, blksz_t* b_dst
     )
{
	bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst );
	bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst );
}

// Replace the default blocksize for dt with ( val * num ) / den.
// NOTE: den is not checked against zero here.
BLIS_INLINE void bli_blksz_scale_def
     (
       dim_t    num,
       dim_t    den,
       num_t    dt,
       blksz_t* b
     )
{
	const dim_t val = bli_blksz_get_def( dt, b );

	bli_blksz_set_def( ( val * num ) / den, dt, b );
}

// Replace the maximum blocksize for dt with ( val * num ) / den.
// NOTE: den is not checked against zero here.
BLIS_INLINE void bli_blksz_scale_max
     (
       dim_t    num,
       dim_t    den,
       num_t    dt,
       blksz_t* b
     )
{
	const dim_t val = bli_blksz_get_max( dt, b );

	bli_blksz_set_max( ( val * num ) / den, dt, b );
}

// Apply the same num/den scaling to both the default and the maximum
// blocksize for dt.
BLIS_INLINE void bli_blksz_scale_def_max
     (
       dim_t    num,
       dim_t    den,
       num_t    dt,
       blksz_t* b
     )
{
	bli_blksz_scale_def( num, den, dt, b );
	bli_blksz_scale_max( num, den, dt, b );
}

// -----------------------------------------------------------------------------

// blksz_t creation/initialization prototypes. Note the differing parameter
// order: the _ed variants interleave (b_x, be_x) pairs per datatype, while
// the plain variants list the four b_* values followed by the four be_*.
BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed
     (
       dim_t b_s, dim_t be_s,
       dim_t b_d, dim_t be_d,
       dim_t b_c, dim_t be_c,
       dim_t b_z, dim_t be_z
     );

BLIS_EXPORT_BLIS blksz_t* bli_blksz_create
     (
       dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z,
       dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
     );

BLIS_EXPORT_BLIS void bli_blksz_init_ed
     (
       blksz_t* b,
       dim_t    b_s, dim_t be_s,
       dim_t    b_d, dim_t be_d,
       dim_t    b_c, dim_t be_c,
       dim_t    b_z, dim_t be_z
     );

BLIS_EXPORT_BLIS void bli_blksz_init
     (
       blksz_t* b,
       dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z,
       dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
     );

BLIS_EXPORT_BLIS void bli_blksz_init_easy
     (
       blksz_t* b,
       dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z
     );

BLIS_EXPORT_BLIS void bli_blksz_free
     (
       blksz_t* b
     );

// -----------------------------------------------------------------------------

// Disabled (compiled out) prototype.
#if 0
BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to
     (
       num_t dt_bm, blksz_t* bmult,
       num_t dt_bs, blksz_t* blksz
     );
#endif

void bli_blksz_reduce_def_to
     (
       num_t dt_bm, blksz_t* bmult,
       num_t dt_bs, blksz_t* blksz
     );

void bli_blksz_reduce_max_to
     (
       num_t dt_bm, blksz_t* bmult,
       num_t dt_bs, blksz_t* blksz
     );
// -----------------------------------------------------------------------------

// Blocksize determination prototypes.
// NOTE(review): the _f/_b suffixes presumably mean forward/backward
// traversal, matching the dir_t argument of the first prototype -- confirm.
dim_t bli_determine_blocksize
     (
       dir_t   direct,
       dim_t   i,
       dim_t   dim,
       obj_t*  obj,
       bszid_t bszid,
       cntx_t* cntx
     );

dim_t bli_determine_blocksize_f
     (
       dim_t   i,
       dim_t   dim,
       obj_t*  obj,
       bszid_t bszid,
       cntx_t* cntx
     );

dim_t bli_determine_blocksize_b
     (
       dim_t   i,
       dim_t   dim,
       obj_t*  obj,
       bszid_t bszid,
       cntx_t* cntx
     );

dim_t bli_determine_blocksize_f_sub
     (
       dim_t  i,
       dim_t  dim,
       dim_t  b_alg,
       dim_t  b_max
     );

dim_t bli_determine_blocksize_b_sub
     (
       dim_t  i,
       dim_t  dim,
       dim_t  b_alg,
       dim_t  b_max
     );
// end bli_blksz.h
// begin bli_func.h

// -----------------------------------------------------------------------------

// func_t query

// Return the function pointer stored for datatype dt, i.e. func->ptr[ dt ].
BLIS_INLINE void_fp bli_func_get_dt
     (
       num_t   dt,
       func_t* func
     )
{
	return func->ptr[ dt ];
}

// func_t modification

// Store fp as the function pointer for datatype dt.
BLIS_INLINE void bli_func_set_dt
     (
       void_fp fp,
       num_t   dt,
       func_t* func
     )
{
	func->ptr[ dt ] = fp;
}

// Copy the function pointer of (dt_src, func_src) into (dt_dst, func_dst).
BLIS_INLINE void bli_func_copy_dt
     (
       num_t dt_src, func_t* func_src,
       num_t dt_dst, func_t* func_dst
     )
{
	void_fp fp = bli_func_get_dt( dt_src, func_src );

	bli_func_set_dt( fp, dt_dst, func_dst );
}

// -----------------------------------------------------------------------------

// func_t creation/initialization prototypes; one pointer per datatype
// suffix (s, d, c, z).
func_t* bli_func_create
     (
       void_fp ptr_s,
       void_fp ptr_d,
       void_fp ptr_c,
       void_fp ptr_z
     );

void bli_func_init
     (
       func_t* f,
       void_fp ptr_s,
       void_fp ptr_d,
       void_fp ptr_c,
       void_fp ptr_z
     );

void bli_func_init_null
     (
       func_t* f
     );

void bli_func_free( func_t* f );

// -----------------------------------------------------------------------------

bool bli_func_is_null_dt( num_t dt, func_t* f );

bool bli_func_is_null( func_t* f );
// end bli_func.h
// begin bli_mbool.h

// -----------------------------------------------------------------------------

// mbool_t query

// Return the value stored for datatype dt, mb->v[ dt ], cast to bool.
BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb )
{
	return ( bool )( mb->v[ dt ] );
}

// mbool_t modification

// Store val as the value for datatype dt.
BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb )
{
	mb->v[ dt ] = val;
}

// -----------------------------------------------------------------------------

// mbool_t creation/initialization prototypes; one bool per datatype suffix
// (s, d, c, z).
mbool_t* bli_mbool_create
     (
       bool b_s,
       bool b_d,
       bool b_c,
       bool b_z
     );

void bli_mbool_init
     (
       mbool_t* b,
       bool     b_s,
       bool     b_d,
       bool     b_c,
       bool     b_z
     );

void bli_mbool_free( mbool_t* b );
// end bli_mbool.h
// begin bli_cntx.h


// Include guard for the bli_cntx.h section; the matching #endif appears
// just before the "end bli_cntx.h" marker further below.
#ifndef BLIS_CNTX_H
#define BLIS_CNTX_H


// Context object type (defined in bli_type_defs.h)

// -----------------------------------------------------------------------------

//
// -- cntx_t query (fields only) -----------------------------------------------
//

// Each accessor in this group returns the correspondingly named cntx_t
// field unchanged, without indexing or validation.

BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx )
{
	return cntx->blkszs;
}
BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx )
{
	return cntx->bmults;
}
BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx )
{
	return cntx->l3_vir_ukrs;
}
BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx )
{
	return cntx->l3_nat_ukrs;
}
BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx )
{
	return cntx->l3_nat_ukrs_prefs;
}
BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx )
{
	return cntx->l3_sup_thresh;
}
BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx )
{
	return cntx->l3_sup_handlers;
}
BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx )
{
	return cntx->l3_sup_blkszs;
}
BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx )
{
	return cntx->l3_sup_kers;
}
BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx )
{
	return cntx->l3_sup_kers_prefs;
}
BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx )
{
	return cntx->l1f_kers;
}
BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx )
{
	return cntx->l1v_kers;
}
BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx )
{
	return cntx->packm_kers;
}
BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx )
{
	return cntx->unpackm_kers;
}
BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx )
{
	return cntx->method;
}

// -----------------------------------------------------------------------------

//
// -- cntx_t modification (fields only) ----------------------------------------
//

// Overwrite the context's method field.
BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx )
{
	cntx->method = method;
}

// -----------------------------------------------------------------------------

//
// -- cntx_t query (complex) ---------------------------------------------------
//

// NOTE: the id-indexed getters in this group index the context arrays
// directly; except for the packm/unpackm getters, the id is not
// range-checked.

BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx )
{
	blksz_t* blkszs = bli_cntx_blkszs_buf( cntx );
	blksz_t* blksz  = &blkszs[ bs_id ];

	// Return the address of the blksz_t identified by bs_id.
	return blksz;
}

BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
{
	blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx );
	dim_t    bs_dt = bli_blksz_get_def( dt, blksz );

	// Return the main (default) blocksize value for the datatype given.
	return bs_dt;
}

BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
{
	blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx );
	dim_t    bs_dt = bli_blksz_get_max( dt, blksz );

	// Return the auxiliary (maximum) blocksize value for the datatype given.
	return bs_dt;
}

// Return the bszid_t stored at index bs_id of the context's bmults array.
BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx )
{
	bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx );
	bszid_t           bm_id  = bmults[ bs_id ];

	return bm_id;
}

// Return the blksz_t selected by the bmult id that is associated with
// bs_id (a two-step lookup: bs_id -> bm_id -> blksz_t).
BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx )
{
	bszid_t           bm_id  = bli_cntx_get_bmult_id( bs_id, cntx );
	blksz_t* restrict bmult  = bli_cntx_get_blksz( bm_id, cntx );

	return bmult;
}

// Return the default value, for datatype dt, of the blksz_t obtained via
// bli_cntx_get_bmult().
BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
{
	blksz_t* bmult  = bli_cntx_get_bmult( bs_id, cntx );
	dim_t    bm_dt  = bli_blksz_get_def( dt, bmult );

	return bm_dt;
}

// -----------------------------------------------------------------------------

// Return the address of element ukr_id of the l3_vir_ukrs array.
BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
{
	func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx );
	func_t* func  = &funcs[ ukr_id ];

	return func;
}

// Return the dt-specific function pointer of l3_vir_ukrs[ ukr_id ].
BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
	func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx );

	return bli_func_get_dt( dt, func );
}

// Return the address of element ukr_id of the l3_nat_ukrs array.
BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
{
	func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx );
	func_t* func  = &funcs[ ukr_id ];

	return func;
}

// Return the dt-specific function pointer of l3_nat_ukrs[ ukr_id ].
BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
	func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx );

	return bli_func_get_dt( dt, func );
}

// -----------------------------------------------------------------------------

// Return the address of element ukr_id of the l3_nat_ukrs_prefs array.
BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx )
{
	mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );
	mbool_t* mbool  = &mbools[ ukr_id ];

	return mbool;
}

// Return the dt-specific bool of l3_nat_ukrs_prefs[ ukr_id ].
BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
	mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx );

	return ( bool )bli_mbool_get_dt( dt, mbool );
}

// -----------------------------------------------------------------------------

BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx )
{
	blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx );
	blksz_t* thresh  = &threshs[ thresh_id ];

	// Return the address of the blksz_t identified by thresh_id.
	return thresh;
}

BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx )
{
	blksz_t* threshs   = bli_cntx_get_l3_sup_thresh( thresh_id, cntx );
	dim_t    thresh_dt = bli_blksz_get_def( dt, threshs );

	// Return the main (default) threshold value for the datatype given.
	return thresh_dt;
}

// Return TRUE as soon as any one of m, n, k is strictly less than its
// threshold (BLIS_MT, BLIS_NT, BLIS_KT respectively) for datatype dt;
// return FALSE only when all three dimensions are >= their thresholds.
BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx )
{
	if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE;
	if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE;
	if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE;

	return FALSE;
}

// -----------------------------------------------------------------------------

// Return element op of the l3_sup_handlers array (an untyped pointer).
BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx )
{
	void** funcs = bli_cntx_l3_sup_handlers_buf( cntx );
	void*  func  = funcs[ op ];

	return func;
}

// -----------------------------------------------------------------------------

BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx )
{
	blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx );
	blksz_t* blksz  = &blkszs[ bs_id ];

	// Return the address of the blksz_t identified by bs_id.
	return blksz;
}

BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
{
	blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx );
	dim_t    bs_dt = bli_blksz_get_def( dt, blksz );

	// Return the main (default) blocksize value for the datatype given.
	return bs_dt;
}

BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx )
{
	blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx );
	dim_t    bs_dt = bli_blksz_get_max( dt, blksz );

	// Return the auxiliary (maximum) blocksize value for the datatype given.
	return bs_dt;
}

// -----------------------------------------------------------------------------

// Return the address of element stor_id of the l3_sup_kers array.
BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx )
{
	func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx );
	func_t* func  = &funcs[ stor_id ];

	return func;
}

// Return the dt-specific function pointer of l3_sup_kers[ stor_id ].
// NOTE: unlike the other *_dt kernel getters, the declared return type here
// is void* rather than void_fp.
BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
{
	func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx );

	return bli_func_get_dt( dt, func );
}

// -----------------------------------------------------------------------------

// Return the address of element stor_id of the l3_sup_kers_prefs array.
BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx )
{
	mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx );
	mbool_t* mbool  = &mbools[ stor_id ];

	return mbool;
}

// Return the dt-specific bool of l3_sup_kers_prefs[ stor_id ].
BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
{
	mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx );

	return ( bool )bli_mbool_get_dt( dt, mbool );
}

// -----------------------------------------------------------------------------

// Return the address of element ker_id of the l1f_kers array.
BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx )
{
	func_t* funcs = bli_cntx_l1f_kers_buf( cntx );
	func_t* func  = &funcs[ ker_id ];

	return func;
}

// Return the dt-specific function pointer of l1f_kers[ ker_id ].
BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx )
{
	func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx );

	return bli_func_get_dt( dt, func );
}

// -----------------------------------------------------------------------------

// Return the address of element ker_id of the l1v_kers array.
BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx )
{
	func_t* funcs = bli_cntx_l1v_kers_buf( cntx );
	func_t* func  = &funcs[ ker_id ];

	return func;
}

// Return the dt-specific function pointer of l1v_kers[ ker_id ].
BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* cntx )
{
	func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx );

	return bli_func_get_dt( dt, func );
}

// -----------------------------------------------------------------------------

// Return the address of packm_kers[ ker_id ], or NULL when ker_id is
// outside [0, BLIS_NUM_PACKM_KERS).
BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx )
{
	func_t* func = NULL;

	// Only index to the requested packm func_t if the packm kernel being
	// requested is one that is explicitly supported.
	if ( 0 <= ( gint_t )ker_id &&
	          ( gint_t )ker_id < BLIS_NUM_PACKM_KERS )
	{
		func_t* funcs = bli_cntx_packm_kers_buf( cntx );

		func = &funcs[ ker_id ];
	}

	return func;
}

// Return the dt-specific function pointer of packm_kers[ ker_id ], or NULL
// when ker_id is outside [0, BLIS_NUM_PACKM_KERS).
BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx )
{
	void_fp fp = NULL;

	// Only query the context for the packm func_t (and then extract the
	// datatype-specific function pointer) if the packm kernel being
	// requested is one that is explicitly supported.
	if ( 0 <= ( gint_t )ker_id &&
	          ( gint_t )ker_id < BLIS_NUM_PACKM_KERS )
	{
		func_t* func = bli_cntx_get_packm_kers( ker_id, cntx );

		fp = bli_func_get_dt( dt, func );
	}

	return fp;
}

// Return the address of unpackm_kers[ ker_id ], or NULL when ker_id is
// outside [0, BLIS_NUM_UNPACKM_KERS).
BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx )
{
	func_t* func = NULL;

	// Only index to the requested unpackm func_t if the unpackm kernel being
	// requested is one that is explicitly supported.
	if ( 0 <= ( gint_t )ker_id &&
	          ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS )
	{
		func_t* funcs = bli_cntx_unpackm_kers_buf( cntx );

		func = &funcs[ ker_id ];
	}

	return func;
}

// Return the dt-specific function pointer of unpackm_kers[ ker_id ], or
// NULL when ker_id is outside [0, BLIS_NUM_UNPACKM_KERS).
BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx )
{
	void_fp fp = NULL;

	// Only query the context for the unpackm func_t (and then extract the
	// datatype-specific function pointer) if the unpackm kernel being
	// requested is one that is explicitly supported.
	if ( 0 <= ( gint_t )ker_id &&
	          ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS )
	{
		func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx );

		fp = bli_func_get_dt( dt, func );
	}

	return fp;
}

// -----------------------------------------------------------------------------

BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
	const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx );

	// A ukernel preference of TRUE means the ukernel prefers row storage.
	return ( bool )
	       ( prefs == TRUE );
}

BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
	const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx );

	// A ukernel preference of FALSE means the ukernel prefers column storage.
	return ( bool )
	       ( prefs == FALSE );
}

// Return TRUE when obj is row-stored and the ukernel prefers rows, or obj
// is column-stored and the ukernel prefers columns; FALSE otherwise
// (including when obj is neither row- nor column-stored).
BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
{
	// Note that we use the computation datatype, which may differ from the
	// storage datatype of C (when performing a mixed datatype operation).
	const num_t dt    = bli_obj_comp_dt( obj );
	const bool  ukr_prefers_rows
	                  = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx );
	const bool  ukr_prefers_cols
	                  = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx );
	bool        r_val = FALSE;

	if      ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE;
	else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE;

	return r_val;
}

// Logical negation of bli_cntx_l3_nat_ukr_prefers_storage_of().
BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
{
	return ( bool )
	       !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx );
}

// -----------------------------------------------------------------------------

BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
	// For induced methods, return the ukernel storage preferences of the
	// corresponding real micro-kernel.
	// NOTE: This projection to real domain becomes unnecessary if you
	// set the exec_dt for 1m to the real projection of the storage
	// datatype.
	if ( bli_cntx_method( cntx ) != BLIS_NAT )
	    dt = bli_dt_proj_to_real( dt );

	return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx );
}

BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
	// For induced methods, return the ukernel storage preferences of the
	// corresponding real micro-kernel.
	// NOTE: This projection to real domain becomes unnecessary if you
	// set the exec_dt for 1m to the real projection of the storage
	// datatype.
	if ( bli_cntx_method( cntx ) != BLIS_NAT )
	    dt = bli_dt_proj_to_real( dt );

	return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx );
}

// Same decision as bli_cntx_l3_nat_ukr_prefers_storage_of(), but based on
// the "vir" preference queries above.
BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
{
	// Note that we use the computation datatype, which may differ from the
	// storage datatype of C (when performing a mixed datatype operation).
	const num_t dt    = bli_obj_comp_dt( obj );
	const bool  ukr_prefers_rows
	                  = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx );
	const bool  ukr_prefers_cols
	                  = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx );
	bool        r_val = FALSE;

	if      ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE;
	else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE;

	return r_val;
}

// Logical negation of bli_cntx_l3_vir_ukr_prefers_storage_of().
BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
{
	return ( bool )
	       !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx );
}

// -----------------------------------------------------------------------------

BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
{
	const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx );

	// A ukernel preference of TRUE means the ukernel prefers row storage.
	return ( bool )
	       ( prefs == TRUE );
}

BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
{
	const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx );

	// A ukernel preference of FALSE means the ukernel prefers column storage.
	return ( bool )
	       ( prefs == FALSE );
}

#if 0
// NOTE: These static functions aren't needed yet.

BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx )
{
	const num_t dt    = bli_obj_dt( obj );
	const bool  ukr_prefers_rows
	                  = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
	const bool  ukr_prefers_cols
	                  = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx );
	bool        r_val = FALSE;

	if      ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE;
	else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE;

	return r_val;
}

BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx )
{
	return ( bool )
	       !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx );
}
#endif

// -----------------------------------------------------------------------------

//
// -- cntx_t modification (complex) --------------------------------------------
//

// NOTE: The framework does not use any of the following functions. We provide
// them in order to facilitate creating/modifying custom contexts.
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif // end bli_rntm.h // begin bli_gks.h #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* 
bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif // end bli_gks.h // begin bli_ind.h #ifndef BLIS_IND_H #define BLIS_IND_H // level-3 induced method management // begin bli_l3_ind.h #ifndef BLIS_L3_IND_H #define BLIS_L3_IND_H // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- //bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ); void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ); void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ); bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ); #endif // end bli_l3_ind.h void bli_ind_init( void ); void bli_ind_finalize( void ); BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t 
method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); char* bli_ind_get_impl_string( ind_t method ); num_t bli_ind_map_cdt_to_index( num_t dt ); #endif // end bli_ind.h // begin bli_pba.h #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H // Packing block allocator (formerly memory broker) // pba init //BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ // bli_pthread_mutex_init( &(pba->mutex), NULL ); //} //BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ // bli_pthread_mutex_destroy( &(pba->mutex) ); //} // pba query BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { return &(pba->pools[ pool_index ]); } BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { return pba->align_size; } BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { return pba->malloc_fp; } BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { return pba->free_fp; } // pba modification BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { pba->align_size = align_size; } BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { pba->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { pba->free_fp = free_fp; } // pba action BLIS_INLINE void bli_pba_lock( pba_t* pba ) { bli_pthread_mutex_lock( &(pba->mutex) ); } BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( cntx_t* cntx ); void bli_pba_finalize ( void ); 
void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif // end bli_pba.h // begin bli_pool.h #ifndef BLIS_POOL_H #define BLIS_POOL_H // -- Pool block type -- // -- Pool type -- // Pool block query BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) { return pblk->buf; } BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) { return pblk->block_size; } // Pool block modification BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk ) { pblk->buf = buf; } BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) { pblk->block_size = block_size; } // // -- pool block initialization ------------------------------------------------ // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the pblk_t type definition. An alternative to the initializer is // calling bli_pblk_clear() at runtime. 
#define BLIS_PBLK_INITIALIZER \ { \ .buf = NULL, \ .block_size = 0, \ } \ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); bli_pblk_set_block_size( 0, pblk ); } // Pool entry query BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) { return pool->block_size; } BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) { return pool->align_size; } BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) { return pool->offset_size; } BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) { return pool->malloc_fp; } BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) { return pool->free_fp; } BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); } // Pool entry modification BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ { pool->block_size = block_size; } BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ { pool->align_size = align_size; } BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ { pool->offset_size = offset_size; } BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ { pool->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* 
pool ) \ { pool->free_fp = free_fp; } BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } // ----------------------------------------------------------------------------- void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ); void bli_pool_finalize ( pool_t* restrict pool ); void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ); void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ); void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ); void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ); void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ); void bli_pool_print ( pool_t* restrict pool ); void bli_pblk_print ( pblk_t* restrict pblk ); #endif // end bli_pool.h // begin bli_array.h #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // 
----------------------------------------------------------------------------- void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ); void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ); void bli_array_finalize ( array_t* restrict array ); void* bli_array_elem ( const siz_t index, array_t* restrict array ); void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ); #endif // end bli_array.h // begin bli_apool.h #ifndef BLIS_APOOL_H #define BLIS_APOOL_H // -- Locked pool-of-arrays type -- // apool entry query BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool ) { return &(apool->pool); } BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) { return &(apool->mutex); } BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) { return pool->def_array_len; } BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) { pool_t* restrict pool = bli_apool_pool( apool ); return bli_pool_is_exhausted( pool ); } // apool action BLIS_INLINE void bli_apool_lock( apool_t* apool ) { bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); } BLIS_INLINE void bli_apool_unlock( apool_t* apool ) { bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); } // apool entry modification BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ { pool->def_array_len = def_array_len; } // ----------------------------------------------------------------------------- void bli_apool_init ( apool_t* restrict apool ); void bli_apool_finalize ( apool_t* restrict apool ); array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ); void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ); pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ); void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool ); void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ); void bli_apool_free_block 
( array_t* restrict array ); #endif // end bli_apool.h // begin bli_sba.h #ifndef BLIS_SBA_H #define BLIS_SBA_H apool_t* bli_sba_query( void ); // ----------------------------------------------------------------------------- void bli_sba_init( void ); void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( const siz_t n_threads ); void bli_sba_checkin_array ( array_t* restrict array ); void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm ); void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size ); void bli_sba_release ( rntm_t* restrict rntm, void* restrict block ); #endif // end bli_sba.h // begin bli_memsys.h #ifndef BLIS_MEMSYS_H #define BLIS_MEMSYS_H // ----------------------------------------------------------------------------- void bli_memsys_init( void ); void bli_memsys_finalize( void ); #endif // end bli_memsys.h // begin bli_mem.h #ifndef BLIS_MEM_H #define BLIS_MEM_H // mem_t object type (defined in bli_type_defs.h) // // -- mem_t query -------------------------------------------------------------- // BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) { return &(mem->pblk); } BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) { return bli_pblk_buf( bli_mem_pblk( mem ) ); } BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) { return mem->buf_type; } BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) { return mem->pool; } BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) { return mem->size; } BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); } // // -- mem_t modification ------------------------------------------------------- // BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem ) { mem->pblk = *pblk; } BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem ) { bli_pblk_set_buf( buf, &(mem->pblk) ); } BLIS_INLINE void bli_mem_set_buf_type( packbuf_t 
buf_type, mem_t* mem ) { mem->buf_type = buf_type; } BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem ) { mem->pool = pool; } BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; } // // -- mem_t initialization ----------------------------------------------------- // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the mem_t type definition. An alternative to the initializer is // calling bli_mem_clear() at runtime. #define BLIS_MEM_INITIALIZER \ { \ .pblk = BLIS_PBLK_INITIALIZER, \ .buf_type = -1, \ .pool = NULL, \ .size = 0, \ } \ BLIS_INLINE void bli_mem_clear( mem_t* mem ) { bli_mem_set_buffer( NULL, mem ); #ifdef __cplusplus const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE; // When using C++, which is strongly typed, we avoid use of -1 as a // packbuf_t value since it will result in a compile-time error. bli_mem_set_buf_type( pb, mem ); #else bli_mem_set_buf_type( ( packbuf_t )-1, mem ); #endif bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); } #endif // end bli_mem.h // begin bli_part.h // begin bli_part_check.h void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); // end bli_part_check.h // -- Matrix partitioning ------------------------------------------------------ BLIS_EXPORT_BLIS void bli_acquire_mpart ( dim_t i, dim_t j, dim_t m, dim_t n, obj_t* obj, obj_t* sub_obj ); #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) GENPROT( acquire_mpart_b2t ) GENPROT( acquire_mpart_l2r ) GENPROT( acquire_mpart_r2l ) GENPROT( acquire_mpart_tl2br ) GENPROT( 
acquire_mpart_br2tl ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ dir_t direct, \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_mdim ) GENPROT( acquire_mpart_ndim ) GENPROT( acquire_mpart_mndim ) // -- Vector partitioning ------------------------------------------------------ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_vpart_f2b ) GENPROT( acquire_vpart_b2f ) // -- Scalar acquisition ------------------------------------------------------- BLIS_EXPORT_BLIS void bli_acquire_mij ( dim_t i, dim_t j, obj_t* obj, obj_t* sub_obj ); BLIS_EXPORT_BLIS void bli_acquire_vi ( dim_t i, obj_t* obj, obj_t* sub_obj ); // end bli_part.h // begin bli_prune.h void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ); // end bli_prune.h // begin bli_query.h BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a ); // end bli_query.h // begin bli_auxinfo.h #ifndef BLIS_AUXINFO_MACRO_DEFS_H #define BLIS_AUXINFO_MACRO_DEFS_H // auxinfo_t field query BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) { return ai->schema_a; } BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) { return ai->schema_b; } BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) { return ai->a_next; } BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) { return ai->b_next; } BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) { return ai->is_a; } BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) { return ai->is_b; } BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) { return ai->ps_a; } BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) { return ai->ps_b; } BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) { 
return ai->ukr; } BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) { return ai->params; } // auxinfo_t field modification BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai ) { ai->schema_a = schema; } BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) { ai->schema_b = schema; } BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) { ai->a_next = p; } BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) { ai->b_next = p; } BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; } BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai ) { ai->is_a = is; } BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) { ai->is_b = is; } BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai ) { ai->ps_a = ps; } BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) { ai->ps_b = ps; } BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { ai->ukr = ukr; } BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) { ai->params = params; } #endif // end bli_auxinfo.h // begin bli_param_map.h // --- BLIS to BLAS/LAPACK mappings -------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval ); // --- BLAS/LAPACK to BLIS mappings -------------------------------------------- // NOTE: These static functions were converted from regular functions in order // to reduce function call overhead within the BLAS compatibility layer. 
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( 
subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); err_t bli_check_valid_cntl( void* cntl ); err_t bli_check_packm_schema_on_unpack( obj_t* a ); err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); // end bli_check.h // begin bli_error.h BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); void bli_print_msg( char* str, char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); char* bli_error_string_for_code( gint_t code ); // end bli_error.h // begin bli_f2c.h // f2c.h -- Standard Fortran to C header file // barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." // - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) #ifndef BLIS_F2C_H #define BLIS_F2C_H typedef f77_int bla_integer; typedef f77_char bla_character; //typedef char *address; //typedef short int shortint; typedef float bla_real; typedef double bla_double; typedef scomplex bla_scomplex; typedef dcomplex bla_dcomplex; typedef f77_int bla_logical; //typedef short int shortlogical; //typedef char logical1; //typedef char integer1; #ifdef INTEGER_STAR_8 // Adjust for integer*8. 
// These two typedefs and the qbit_* macros are compiled only when
// INTEGER_STAR_8 is defined; the matching #ifdef precedes this point and the
// #endif below closes it.
typedef long long longint; // system-dependent
typedef unsigned long long ulongint; // system-dependent
#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b)))
#define qbit_set(a,b) ((a) | ((ulongint)1 << (b)))
#endif

// Each definition below is guarded by #ifndef, so a definition that already
// exists when this header is read takes precedence.
#ifndef TRUE_
#define TRUE_ (1)
#endif

#ifndef FALSE_
#define FALSE_ (0)
#endif

// Extern is for use with -E
#ifndef Extern
#define Extern extern
#endif

// I/O stuff

// NOTE: both branches of this conditional currently define ftnlen identically
// (as bla_integer); only the commented-out historical typedefs differ.
#ifdef f2c_i2
// for -i2
//typedef short flag;
//typedef short ftnlen;
typedef bla_integer ftnlen;
//typedef short ftnint;
#else
//typedef long int flag;
//typedef long int ftnlen;
typedef bla_integer ftnlen;
//typedef long int ftnint;
#endif

#ifndef VOID
#define VOID void
#endif

// NOTE: like any function-like macro of this form, f2c_abs/f2c_min/f2c_max
// evaluate their arguments more than once, so arguments with side effects
// should be avoided.
#ifndef f2c_abs
#define f2c_abs(x) ((x) >= 0 ? (x) : -(x))
#endif
// NOTE(review): the f2c_d* macros cast to `doublereal`, but the typedefs
// visible in this header are named bla_double etc. -- confirm `doublereal` is
// defined somewhere before any of these three macros is actually used.
#ifndef f2c_dabs
#define f2c_dabs(x) (doublereal)f2c_abs(x)
#endif
#ifndef f2c_min
#define f2c_min(a,b) ((a) <= (b) ? (a) : (b))
#endif
#ifndef f2c_max
#define f2c_max(a,b) ((a) >= (b) ? (a) : (b))
#endif
#ifndef f2c_dmin
#define f2c_dmin(a,b) (doublereal)f2c_min(a,b)
#endif
#ifndef f2c_dmax
#define f2c_dmax(a,b) (doublereal)f2c_max(a,b)
#endif

// bit_test yields bit b of a (0 or 1); bit_clear/bit_set yield a with bit b
// cleared/set. None of them modifies its argument.
// NOTE(review): bit_clear/bit_set cast to `uinteger`, which is likewise not
// typedef'd in the visible part of this header -- confirm before use.
#ifndef bit_test
#define bit_test(a,b) ((a) >> (b) & 1)
#endif

#ifndef bit_clear
#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b)))
#endif

#ifndef bit_set
#define bit_set(a,b) ((a) | ((uinteger)1 << (b)))
#endif

// undef any lower-case symbols that your C compiler predefines, e.g.:
// (define Skip_f2c_Undefs before this point to keep them.)

#ifndef Skip_f2c_Undefs
#undef cray
#undef gcos
#undef mc68010
#undef mc68020
#undef mips
#undef pdp11
#undef sgi
#undef sparc
#undef sun
#undef sun2
#undef sun3
#undef sun4
#undef u370
#undef u3b
#undef u3b2
#undef u3b5
#undef unix
#undef vax
#endif

#endif
// end bli_f2c.h

// begin bli_machval.h
// begin bli_lsame.h

bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len );
// end bli_lsame.h
// begin bli_slamch.h

bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len );
// end bli_slamch.h
// begin bli_dlamch.h

bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len );
// end bli_dlamch.h

//
// Prototype object-based interface.
//

BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v );

//
// Prototype BLAS-like interfaces.
//

// GENTPROTR is (re)defined here as a prototype template; the
// INSERT_GENTPROTR_BASIC0 invocation below instantiates it for `machval`.
// Only the chv and opname macro parameters are used by this template.
#undef  GENTPROTR
#define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \
     ( \
       machval_t mval, \
       void*     v \
     );

INSERT_GENTPROTR_BASIC0( machval )

// end bli_machval.h
// begin bli_getopt.h

// State block passed explicitly to bli_getopt() and bli_getopt_init_state().
// NOTE(review): the field names match the POSIX getopt() globals; presumably
// they carry the same meaning -- confirm against the implementation.
typedef struct getopt_s
{
    char* optarg;
    int   optind;
    int   opterr;
    int   optopt;
} getopt_t;

BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state );

BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state );

// end bli_getopt.h
// begin bli_opid.h

// Return true when opid lies in the inclusive range [BLIS_GEMM, BLIS_TRSM].
// This relies on the opid_t enumerators in that range being exactly the
// level-3 operations.
BLIS_INLINE bool bli_opid_is_level3( opid_t opid )
{
    return ( bool )
           ( BLIS_GEMM <= opid && opid <= BLIS_TRSM );
}

// end bli_opid.h
// begin bli_cntl.h

// -- Control tree prototypes --

BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node
     (
       rntm_t* rntm,
       opid_t  family,
       bszid_t bszid,
       void_fp var_func,
       void*   params,
       cntl_t* sub_node
     );

BLIS_EXPORT_BLIS void bli_cntl_free_node
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

BLIS_EXPORT_BLIS void bli_cntl_clear_node
     (
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_cntl_free
     (
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo
     (
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

BLIS_EXPORT_BLIS void bli_cntl_mark_family
     (
       opid_t  family,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

dim_t bli_cntl_calc_num_threads_in
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// cntl_t query (fields only)

// Return the node's family field. cntl is dereferenced without a NULL check.
BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl )
{
    return cntl->family;
}
BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl ) { return cntl->bszid; } BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl ) { return cntl->var_func; } BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl ) { return cntl->sub_prenode; } BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl ) { return cntl->sub_node; } BLIS_INLINE void* bli_cntl_params( cntl_t* cntl ) { return cntl->params; } BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl ) { // The first 64 bytes is always the size of the params structure. return *( ( uint64_t* )(cntl->params) ); } BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) { return &(cntl->pack_mem); } // cntl_t query (complex) BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl ) { return ( bool ) ( cntl == NULL ); } BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl ) { return ( bool ) ( bli_cntl_sub_node( cntl ) == NULL ); } BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl ) { return ( bool ) ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); } // cntl_t modification BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl ) { cntl->family = family; } BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl ) { cntl->bszid = bszid; } BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl ) { cntl->var_func = var_func; } BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl ) { cntl->sub_prenode = sub_prenode; } BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl ) { cntl->sub_node = sub_node; } BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl ) { cntl->params = params; } BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) { cntl->pack_mem = *pack_mem; } // end bli_cntl.h // begin bli_env.h #ifndef BLIS_ENV_H #define BLIS_ENV_H gint_t bli_env_get_var( const char* env, gint_t fallback ); //void bli_env_set_var( const char* env, dim_t value ); #endif // end bli_env.h // begin bli_pack.h #ifndef BLIS_PACK_H #define BLIS_PACK_H 
// Prototypes for bli_pack.h. The include guard (BLIS_PACK_H) is opened just
// before this point and closed by the #endif below.
// Functions without BLIS_EXPORT_BLIS are not marked for export from the
// library.
void bli_pack_init( void );
void bli_pack_finalize( void );

// The getters write the current setting through the bool* argument; the
// setters take the new setting by value.
BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a );
BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b );
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a );
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b );

void bli_pack_init_rntm_from_env( rntm_t* rntm );

#endif

// end bli_pack.h
// begin bli_info.h

// -- General library information ----------------------------------------------

// These two queries return strings (char*); no ownership convention is
// visible from the prototypes alone.
// NOTE(review): presumably the strings are static and must not be freed --
// confirm against the implementation.
BLIS_EXPORT_BLIS char* bli_info_get_version_str( void );
BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void );


// -- General configuration-related --------------------------------------------

// Every query below takes no arguments and returns its value as a gint_t.
BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void ); // -- Kernel implementation-related -------------------------------------------- // -- Level-3 kernel definitions -- BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ); // -- BLIS implementation query (level-3) -------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt ); BLIS_EXPORT_BLIS 
char* bli_info_get_trmm3_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt ); // end bli_info.h // begin bli_arch.h #ifndef BLIS_ARCH_H #define BLIS_ARCH_H BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void ); void bli_arch_set_id_once( void ); void bli_arch_set_id( void ); BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id ); void bli_arch_set_logging( bool dolog ); bool bli_arch_get_logging( void ); void bli_arch_log( char*, ... ); #endif // end bli_arch.h // begin bli_cpuid.h #if 0 // Used only during standalone testing of ARM support. #define FALSE 0 #define TRUE 1 typedef enum { BLIS_ARCH_CORTEXA57 = 10, BLIS_ARCH_CORTEXA15 = 11, BLIS_ARCH_CORTEXA9 = 12, BLIS_ARCH_GENERIC = 13 } arch_t; typedef uint64_t bool; #define bli_abort abort #endif #ifndef BLIS_CPUID_H #define BLIS_CPUID_H arch_t bli_cpuid_query_id( void ); // Intel bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features ); // AMD bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features ); // ARM bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, 
uint32_t features ); bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features ); uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features ); // ----------------------------------------------------------------------------- // // This section of the file was based off of cpuid.hpp from TBLIS [1]. // // [1] https://github.com/devinamatthews/tblis // BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want ) { return ( have & want ) == want; } // ----------------------------------------------------------------------------- #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) // cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393 // for more information why this move was made. 
//#include "cpuid.h" void get_cpu_name( char *cpu_name ); int vpu_count( void ); enum { VENDOR_INTEL = 0, VENDOR_AMD, VENDOR_UNKNOWN }; enum { FEATURE_SSE3 = 0x0001, FEATURE_SSSE3 = 0x0002, FEATURE_SSE41 = 0x0004, FEATURE_SSE42 = 0x0008, FEATURE_AVX = 0x0010, FEATURE_AVX2 = 0x0020, FEATURE_FMA3 = 0x0040, FEATURE_FMA4 = 0x0080, FEATURE_AVX512F = 0x0100, FEATURE_AVX512DQ = 0x0200, FEATURE_AVX512PF = 0x0400, FEATURE_AVX512ER = 0x0800, FEATURE_AVX512CD = 0x1000, FEATURE_AVX512BW = 0x2000, FEATURE_AVX512VL = 0x4000 }; #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ); enum { VENDOR_ARM = 0, VENDOR_UNKNOWN }; enum { MODEL_ARMV7 = 0, MODEL_ARMV8, MODEL_UNKNOWN }; enum { FEATURE_NEON = 0x01, FEATURE_SVE = 0x02 }; #endif #endif // end bli_cpuid.h // begin bli_string.h void bli_string_mkupper( char* s ); // end bli_string.h // begin bli_setgetijm.h BLIS_EXPORT_BLIS err_t bli_setijm ( double ar, double ai, dim_t i, dim_t j, obj_t* b ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs \ ); INSERT_GENTPROT_BASIC0( setijm ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijm ( dim_t i, dim_t j, obj_t* b, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijm ) // end bli_setgetijm.h // begin bli_setgetijv.h BLIS_EXPORT_BLIS err_t bli_setijv ( double ar, double ai, dim_t i, obj_t* x ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ void* restrict x, inc_t incx \ ); INSERT_GENTPROT_BASIC0( 
setijv ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijv ( dim_t i, obj_t* x, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ void* restrict b, inc_t incx, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijv ) // end bli_setgetijv.h // begin bli_setri.h // -- setr --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setrm ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setrv ( obj_t* alpha, obj_t* x ); // -- seti --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setim ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setiv ( obj_t* alpha, obj_t* x ); // end bli_setri.h // begin bli_castm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castm ) INSERT_GENTPROT2_MIXDP0( castm ) // // Prototype object-based _check() function. // void bli_castm_check ( obj_t* a, obj_t* b ); // end bli_castm.h // begin bli_castnzm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castnzm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. 
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( addsc ) INSERT_GENTPROT_BASIC0( divsc ) INSERT_GENTPROT_BASIC0( mulsc ) INSERT_GENTPROT_BASIC0( subsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( invertsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTPROTR_BASIC0( absqsc ) INSERT_GENTPROTR_BASIC0( normfsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( sqrtsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTPROT_BASIC0( getsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( setsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTPROTR_BASIC0( unzipsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype_r* zeta_r, \ ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTPROTR_BASIC0( zipsc ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_igetsc ( dim_t* chi, double* zeta_r, double* zeta_i ); BLIS_EXPORT_BLIS void bli_isetsc ( double zeta_r, double zeta_i, dim_t* chi ); // end bli_l0_tapi.h // begin bli_l0_ft.h // // -- Level-0 function types 
--------------------------------------------------- // // addsc, divsc, subsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( addsc ) INSERT_GENTDEF( divsc ) INSERT_GENTDEF( subsc ) // invertsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTDEF( invertsc ) // mulsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( mulsc ) // absqsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTDEFR( absqsc ) // normfsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* norm \ ); INSERT_GENTDEFR( normfsc ) // sqrtsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( sqrtsc ) // getsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTDEF( getsc ) // setsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTDEF( setsc ) // unzipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTDEFR( unzipsc ) // zipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype_r* zeta_r, \ 
ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTDEFR( zipsc ) // end bli_l0_ft.h // Generate function pointer arrays for tapi functions. // begin bli_l0_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( absqsc ) GENPROT( normfsc ) GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( subsc ) GENPROT( invertsc ) GENPROT( sqrtsc ) GENPROT( unzipsc ) GENPROT( zipsc ) GENPROT( getsc ) GENPROT( setsc ) // end bli_l0_fpa.h // copysc // begin bli_copysc.h // // Prototype object-based interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENFRONT( copysc ) // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \ ( \ conj_t conjchi, \ void* chi, \ void* psi \ ); INSERT_GENTPROT2_BASIC0( copysc ) INSERT_GENTPROT2_MIX_D0( copysc ) INSERT_GENTPROT2_MIX_P0( copysc ) // end bli_copysc.h // end bli_l0.h // -- Level-1v operations -- // begin bli_l1v.h // begin bli_l1v_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h"
// begin bli_l1v_ft_ker.h


#ifndef BLIS_L1V_FT_KER_H
#define BLIS_L1V_FT_KER_H


//
// -- Level-1v kernel function types -------------------------------------------
//

// Every kernel type below is a pointer to a void function whose last
// parameter is cntx_t* cntx. Pointer operands are restrict-qualified, and each
// vector operand is passed as a pointer followed by its inc_t argument. The
// type names are built by PASTECH3(ch,opname,_ker,tsuf) (defined elsewhere).

// addv, copyv, subv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( addv )
INSERT_GENTDEF( copyv )
INSERT_GENTDEF( subv )

// amaxv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       dim_t* restrict index, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( amaxv )

// axpbyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpbyv )

// axpyv, scal2v

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyv )
INSERT_GENTDEF( scal2v )

// dotv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotv )

// dotxv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict beta, \
       ctype* restrict rho, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxv )

// invertv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( invertv )

// scalv, setv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjalpha, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( scalv )
INSERT_GENTDEF( setv )

// swapv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( swapv )

// xpbyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( xpbyv )


#endif

// end bli_l1v_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// (The leading comma lets the macro be appended directly after the last
// ordinary parameter.)
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_l1v_oapi.h


//
// Prototype object-based interfaces.
//

// This copy of the prototypes is expanded while EX_SUF is defined as
// BLIS_OAPI_EX_SUF and BLIS_OAPI_EX_PARAMS as ",cntx_t* cntx, rntm_t* rntm",
// so every function declared here carries the expert suffix and the two
// trailing expert parameters. GENTPROT is re-defined per operand list.

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* index \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( amaxv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpbyv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y, \
       obj_t* beta, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotxv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( invertv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( scalv )
GENTPROT( setv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( swapv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( xpbyv )

// end bli_l1v_oapi.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;

// end bli_oapi_ba.h
// begin bli_l1v_oapi.h


//
// Prototype object-based interfaces.
//

// This copy of the prototypes is expanded while EX_SUF and
// BLIS_OAPI_EX_PARAMS are both defined as empty (see the bli_oapi_ba.h section
// immediately before it), so the functions declared here have no suffix and
// no trailing expert parameters. GENTPROT is re-defined per operand list.

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* index \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( amaxv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpbyv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y, \
       obj_t* beta, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotxv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( invertv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( scalv )
GENTPROT( setv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( swapv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( xpbyv )

// end bli_l1v_oapi.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h


// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h


// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// (The leading comma lets the macro be appended directly after the last
// ordinary parameter.)
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1v_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// (The leading comma lets it be appended directly after the last regular
// parameter in the prototype templates below.)
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1d_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// NOTE(review): GENTPROT/GENTPROTR are re-#defined once per distinct
// parameter list. INSERT_GENTPROT_BASIC0, INSERT_GENTPROTR_BASIC0 and
// PASTEMAC2 are defined outside this section; the INSERT_* macros presumably
// invoke the template once per supported datatype (ctype/ch) -- confirm.

// addd, copyd, subd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid: alpha is typed ctype_r while x is typed ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//

// Each GENTDEF/GENTDEFR below typedefs a function-pointer type whose parameter
// list matches the corresponding prototype template in bli_l1d_tapi.h above.
// NOTE(review): PASTECH3, INSERT_GENTDEF and INSERT_GENTDEFR are defined
// outside this section.

// addd, copyd, subd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// NOTE(review): this inlined copy of bli_l1d_tapi.h follows bli_tapi_ba.h,
// where EX_SUF and BLIS_TAPI_EX_PARAMS are defined to nothing, so the
// prototypes below carry no name suffix and no trailing expert parameters.
// INSERT_GENTPROT_BASIC0, INSERT_GENTPROTR_BASIC0 and PASTEMAC2 are defined
// outside this section; the INSERT_* macros presumably invoke the template
// once per supported datatype (ctype/ch) -- confirm.

// addd, copyd, subd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid: alpha is typed ctype_r while x is typed ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//

// Each GENTDEF/GENTDEFR below typedefs a function-pointer type whose parameter
// list matches the corresponding prototype template in bli_l1d_tapi.h above.
// NOTE(review): PASTECH3, INSERT_GENTDEF and INSERT_GENTDEFR are defined
// outside this section.

// addd, copyd, subd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1d_fpa.h

//
// Prototype function pointer query interface.
//

// For each operation, GENPROT declares a function taking a num_t dt whose
// return type name is built by PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) and
// whose function name is built by PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp).
// BLIS_TAPI_EX_SUF is used directly rather than EX_SUF, which is un-defined
// at this point.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addd )
GENPROT( copyd )
GENPROT( subd )
GENPROT( axpyd )
GENPROT( scal2d )
GENPROT( invertd )
GENPROT( scald )
GENPROT( setd )
GENPROT( setid )
GENPROT( shiftd )
GENPROT( xpbyd )

// end bli_l1d_fpa.h

// end bli_l1d.h

// -- Level-1f operations --

// begin bli_l1f.h

// begin bli_l1f_check.h

//
// Prototype object-based check functions.
//

// Each GENTPROT below declares a <opname>_check function (name built by
// PASTEMAC, defined outside this section) taking only obj_t* operands.
// Unlike the API prototypes, these are not marked BLIS_EXPORT_BLIS and take
// no expert parameters.

// axpy2v
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alphax, \
       obj_t* alphay, \
       obj_t* x, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* y \
     );

GENTPROT( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* xt, \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho, \
       obj_t* z \
     );

GENTPROT( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* at, \
       obj_t* a, \
       obj_t* w, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
     );

GENTPROT( dotxf )

// end bli_l1f_check.h

// Define kernel function types.
// begin bli_l1f_ft_ker.h

#ifndef BLIS_L1F_FT_KER_H
#define BLIS_L1F_FT_KER_H

//
// -- Level-1f kernel function types -------------------------------------------
//

// Each GENTDEF below typedefs a kernel function-pointer type named via
// PASTECH3(ch,opname,_ker,tsuf). All data pointers are restrict-qualified and
// every signature ends with a cntx_t* cntx parameter; there is no rntm_t*
// parameter and no EX_SUF / BLIS_TAPI_EX_PARAMS involvement.
// NOTE(review): PASTECH3 and INSERT_GENTDEF are defined outside this section.

// axpy2v

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha1, \
       ctype* restrict alpha2, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict w, inc_t incw, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxaxpyf )

#endif

// end bli_l1f_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// (The leading comma lets it be appended directly after the last regular
// parameter in a prototype template.)
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). 
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// (The leading comma lets it be appended directly after the last regular
// parameter in the prototype templates below.)
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// NOTE(review): GENTPROT is re-#defined for each operation's parameter list.
// INSERT_GENTPROT_BASIC0 and PASTEMAC2 are defined outside this section; the
// INSERT_* macro presumably invokes the template once per supported datatype
// (ctype/ch) -- confirm.

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//

// Each GENTDEF below typedefs a function-pointer type whose parameter types
// match the corresponding prototype template in bli_l1f_tapi.h above. A few
// parameter names differ between the two (alphax/alphay there vs
// alpha1/alpha2 here for axpy2v; n there vs m here for dotaxpyv); parameter
// names do not affect a function type.
// NOTE(review): PASTECH3 and INSERT_GENTDEF are defined outside this section.

// axpy2v

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h
// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
// (The expansion is two declaration/statement pairs for cntx and rntm; it is
// not used by the prototypes in this header section.)
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;

// end bli_tapi_ba.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// NOTE(review): BLIS_TAPI_EX_PARAMS is defined to nothing just above, so the
// prototypes below carry no trailing expert parameters. GENTPROT is
// re-#defined for each operation's parameter list. INSERT_GENTPROT_BASIC0 and
// PASTEMAC2 are defined outside this section; the INSERT_* macro presumably
// invokes the template once per supported datatype (ctype/ch) -- confirm.

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//

// Each GENTDEF below typedefs a function-pointer type whose parameter types
// match the corresponding prototype template in bli_l1f_tapi.h above. A few
// parameter names differ between the two (alphax/alphay there vs
// alpha1/alpha2 here for axpy2v; n there vs m here for dotaxpyv); parameter
// names do not affect a function type.
// NOTE(review): PASTECH3 and INSERT_GENTDEF are defined outside this section.

// axpy2v

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1f_fpa.h

//
// Prototype function pointer query interface.
//

// For each operation, GENPROT declares a function taking a num_t dt whose
// return type name is built by PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) and
// whose function name is built by PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp).
// BLIS_TAPI_EX_SUF is used directly rather than EX_SUF, which is un-defined
// at this point.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( axpy2v )
GENPROT( axpyf )
GENPROT( dotaxpyv )
GENPROT( dotxaxpyf )
GENPROT( dotxf )

// end bli_l1f_fpa.h

// end bli_l1f.h

// -- Level-1m operations --

// begin bli_l1m.h

// begin bli_l1m_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion begins with a comma because the macro is placed
// directly after the last regular parameter, with no separator before it.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1m_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
//
// First inclusion (expert interface): with the macros defined above, each
// prototype gets the EX_SUF name suffix and the trailing cntx/rntm parameters.

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )


// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )


// scalm, setm
// NOTE: these take conjalpha and have no transx parameter.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )


// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )


// xpbym_md
// Two-type template: x uses ctype_x while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h


//
// -- Level-1m function types --------------------------------------------------
//
// Function-pointer typedefs mirroring the typed prototypes above. EX_SUF is
// part of each typedef name, so this inclusion and the later one (after
// bli_tapi_ba.h) produce differently named types.

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym
// NOTE: xpbym_md is instantiated from this same single-ctype template.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )

// end bli_l1m_ft.h
// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_tapi_ba.h


// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
// NOTE: EX_SUF is intentionally defined to expand to nothing here.
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
// NOTE: BLIS_TAPI_EX_PARAMS is intentionally defined to expand to nothing.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS   cntx_t* cntx = NULL; ( void )cntx; \
                             rntm_t* rntm = NULL; ( void )rntm;

// end bli_tapi_ba.h
// begin bli_l1m_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
//
// Second inclusion (basic interface): the templates are textually the same as
// in the first inclusion; only the current definitions of EX_SUF and
// BLIS_TAPI_EX_PARAMS differ.

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )


// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )


// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )


// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )


// xpbym_md
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h


//
// -- Level-1m function types --------------------------------------------------
//
// Second inclusion of the function-pointer typedef templates, now expanded
// with the basic-interface definitions of EX_SUF and BLIS_TAPI_EX_PARAMS.

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym
// NOTE: xpbym_md is instantiated from this same single-ctype template.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )

// end bli_l1m_ft.h
// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1m_fpa.h


//
// Prototype function pointer query interface.
//
// These templates name BLIS_TAPI_EX_SUF directly rather than EX_SUF, which
// was un-defined just above.

// Single-datatype query: "_qfp" takes one num_t.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
GENPROT( axpym )
GENPROT( scal2m )
GENPROT( scalm )
GENPROT( setm )
GENPROT( xpbym )

// Two-datatype query: "_qfp2" takes one num_t for x and one for y.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );

GENPROT( xpbym_md )

// end bli_l1m_fpa.h

// Prototype level-1m implementations.
// begin bli_l1m_unb_var1.h


//
// Prototype BLAS-like interfaces with typed operands.
//
// The "_unb_var1" prototypes below always take cntx_t* and rntm_t* as their
// final two parameters; unlike the typed API prototypes they do not use the
// BLIS_TAPI_EX_PARAMS macro.

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )


// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )


// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )


// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( xpbym )


// xpbym_md
// Two-type template: x uses ctype_x while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
void PASTEMAC3(chx,chy,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_unb_var1.h

// Pack-related
// begin bli_packm.h

// begin bli_packm_alloc.h

// Allocation entry points for packm. The "_ex" form takes the packbuf_t
// buffer type as an explicit argument; the plain form does not.

BLIS_EXPORT_BLIS void* bli_packm_alloc
     (
       siz_t size_needed,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void* bli_packm_alloc_ex
     (
       siz_t size_needed,
       packbuf_t pack_buf_type,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_packm_alloc.h
// begin bli_packm_cntl.h


// Parameters for a packm control-tree node. The accessor functions below read
// them through cntl->params.
struct packm_params_s
{
	uint64_t size; // size field must be present and come first.
	bszid_t bmid_m;
	bszid_t bmid_n;
	bool does_invert_diag;
	bool rev_iter_if_upper;
	bool rev_iter_if_lower;
	pack_t pack_schema;
	packbuf_t pack_buf_type;
};
typedef struct packm_params_s packm_params_t;

// Field accessors. Each casts cntl->params to packm_params_t* and returns the
// field named in the function's suffix. None of them checks cntl or
// cntl->params for NULL.

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m;
}

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n;
}

BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower;
}

BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema;
}

BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type;
}

// -----------------------------------------------------------------------------

// Apart from rntm, var_func and sub_node, the arguments correspond by name to
// the fields of packm_params_t.
cntl_t* bli_packm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       bszid_t bmid_m,
       bszid_t bmid_n,
       bool does_invert_diag,
       bool rev_iter_if_upper,
       bool rev_iter_if_lower,
       pack_t pack_schema,
       packbuf_t pack_buf_type,
       cntl_t* sub_node
     );

// end bli_packm_cntl.h
// begin bli_packm_check.h


void bli_packm_init_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

void bli_packm_int_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

// end bli_packm_check.h
// begin bli_packm_init.h


BLIS_EXPORT_BLIS bool bli_packm_init
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_packm_init.h
// begin bli_packm_int.h


void bli_packm_int
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_int.h
// begin bli_packm_scalar.h


BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p );

// end bli_packm_scalar.h

// begin bli_packm_part.h


// -- Matrix partitioning ------------------------------------------------------

void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
                                  dim_t i,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
                                  dim_t j,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_tl2br( subpart_t requested_part,
                                    dim_t ij,
                                    dim_t b,
                                    obj_t* obj,
                                    obj_t* sub_obj );

dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p );

// end bli_packm_part.h

// begin bli_packm_struc_cxk.h


// NOTE(review): this prototype template ends at cntx and has no trailing
// "void* params" argument, whereas the "_1er" template that follows does.
// Confirm against the corresponding definitions before relying on either.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_struc_cxk )
INSERT_GENTPROT_BASIC0( packm_herm_cxk )
INSERT_GENTPROT_BASIC0( packm_tri_cxk )

// end bli_packm_struc_cxk.h
// begin bli_packm_struc_cxk_1er.h


// GENTPROTCO also receives ctype_r and chr, but this template's body uses
// only ctype and ch.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er )

// end bli_packm_struc_cxk_1er.h

// begin bli_packm_cxk.h


#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_cxk )

// end bli_packm_cxk.h
// begin bli_packm_cxk_1er.h


// Same parameter list as packm_cxk above, generated through GENTPROTCO.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROTCO_BASIC0( packm_cxk_1er )

// end bli_packm_cxk_1er.h

// Mixed datatype support.
// The mixed-datatype packm prototypes are declared only when
// BLIS_ENABLE_GEMM_MD is defined.
#ifdef BLIS_ENABLE_GEMM_MD
// begin bli_packm_struc_cxk_md.h


// Two-type template: c uses ctype_c while kappa and p use ctype_p.
#undef GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype_p* restrict kappa, \
       ctype_c* restrict c, inc_t incc, inc_t ldc, \
       ctype_p* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )

// Two-type template: a uses ctype_a while kappa and p use ctype_p.
#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t conja, \
       dim_t m, \
       dim_t n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p, inc_t ldp \
     );

INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )

INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )

// end bli_packm_struc_cxk_md.h
#endif
// begin bli_packm_blk_var1.h


//
// packm params types.
//

typedef struct
{
	// Table of packm kernel function pointers.
	// First index: type of C. Second index: type of P.
	packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;

//
// Prototype object-based interfaces.
//

BLIS_EXPORT_BLIS void bli_packm_blk_var1
     (
       obj_t* c,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* t
     );

// end bli_packm_blk_var1.h

// end bli_packm.h
// begin bli_unpackm.h

// begin bli_unpackm_cntl.h


// Parameters for an unpackm control-tree node.
struct unpackm_params_s
{
	uint64_t size; // size field must be present and come first.
	unpackm_var_oft var_func;
};
typedef struct unpackm_params_s unpackm_params_t;

// Accessor macro: casts (cntl)->params to unpackm_params_t* and yields its
// var_func field.
#define bli_cntl_unpackm_params_var_func( cntl ) \
\
	( ( (unpackm_params_t*)(cntl)->params )->var_func )

// -----------------------------------------------------------------------------

cntl_t* bli_unpackm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       void_fp unpackm_var_func,
       cntl_t* sub_node
     );

// end bli_unpackm_cntl.h
// begin bli_unpackm_check.h


void bli_unpackm_int_check
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx
     );

// end bli_unpackm_check.h
// begin bli_unpackm_int.h


void bli_unpackm_int
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_unpackm_int.h

// begin bli_unpackm_blk_var1.h


void bli_unpackm_blk_var1
     (
       obj_t* p,
       obj_t* c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );


// Typed unpackm_blk_var1 prototypes. The p and c buffers are declared as
// void* here, so the ctype template argument does not appear in the
// generated signature.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       doff_t diagoffc, \
       diag_t diagc, \
       uplo_t uploc, \
       trans_t transc, \
       dim_t m, \
       dim_t n, \
       dim_t m_panel, \
       dim_t n_panel, \
       void* p, inc_t rs_p, inc_t cs_p, \
       dim_t pd_p, inc_t ps_p, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )

// end bli_unpackm_blk_var1.h

// begin bli_unpackm_cxk.h


#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conjp, \
       dim_t panel_dim, \
       dim_t panel_len, \
       ctype* kappa, \
       ctype* p, inc_t ldp, \
       ctype* a, inc_t inca, inc_t lda, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_cxk )

// end bli_unpackm_cxk.h
// end bli_unpackm.h
// end bli_l1m.h

// -- Level-2 operations --

// begin bli_l2.h


// begin bli_l2_check.h


//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets it follow the last regular
// parameter directly.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l2_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( gemv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( ger ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( hemv ) INSERT_GENTPROT_BASIC0( symv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( her ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( syr ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( her2 ) INSERT_GENTPROT_BASIC0( syr2 ) #undef 
GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( trmv ) INSERT_GENTPROT_BASIC0( trsv ) // end bli_l2_tapi.h // begin bli_l2_ft.h // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( ger ) // hemv, symv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) // her #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( her ) // syr #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, 
inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( syr ) // her2, syr2 #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) // trmv, trsv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) // end bli_l2_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). 
#undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( gemv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( ger ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( hemv ) INSERT_GENTPROT_BASIC0( symv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( her ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( syr ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( her2 ) INSERT_GENTPROT_BASIC0( syr2 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( trmv ) INSERT_GENTPROT_BASIC0( trsv ) // end bli_l2_tapi.h // begin bli_l2_ft.h // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( ger ) // hemv, symv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ 
BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) // her #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( her ) // syr #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( syr ) // her2, syr2 #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) // trmv, trsv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) // end bli_l2_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. 
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
//

// Takes the caller's original control tree (cntl_orig) and reports the tree
// to use through the out-parameter cntl_use.
void bli_l3_cntl_create_if
     (
       opid_t   family,
       pack_t   schema_a,
       pack_t   schema_b,
       obj_t*   a,
       obj_t*   b,
       obj_t*   c,
       rntm_t*  rntm,
       cntl_t*  cntl_orig,
       cntl_t** cntl_use
     );

void bli_l3_cntl_free
     (
       rntm_t*    rntm,
       cntl_t*    cntl_use,
       thrinfo_t* thread
     );

// end bli_l3_cntl.h
// begin bli_l3_check.h

//
// Prototype object-based check functions.
//

// Each GENPROT( op ) in this section declares void PASTEMAC(op,_check)( ... );
// the macro is redefined four times, once per distinct operand list.

// alpha, a, b, beta, c
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx  \
    );

GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )

// side, alpha, a, b, beta, c
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t  side, \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx  \
    );

GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )

// alpha, a, beta, c
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  beta, \
       obj_t*  c, \
       cntx_t* cntx  \
    );

GENPROT( herk )
GENPROT( syrk )

// side, alpha, a, b
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t  side, \
       obj_t*  alpha, \
       obj_t*  a, \
       obj_t*  b, \
       cntx_t* cntx  \
    );

GENPROT( trmm )
GENPROT( trsm )

// -----------------------------------------------------------------------------

void bli_gemm_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     );

void bli_gemmt_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     );

void bli_hemm_basic_check
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     );

void bli_herk_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  ah,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     );

void bli_her2k_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  bh,
       obj_t*  b,
       obj_t*  ah,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     );

void bli_l3_basic_check
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx
     );

// end bli_l3_check.h
// begin bli_l3_int.h

void bli_l3_int
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_int.h
// begin bli_l3_packab.h

// bli_l3_packa and bli_l3_packb share one parameter list.
void bli_l3_packa
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

void bli_l3_packb
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_packab.h

// Define function types.
//#include "bli_l3_ft_ex.h"
// begin bli_l3_ft_ukr.h

#ifndef BLIS_L3_FT_UKR_H
#define BLIS_L3_FT_UKR_H

//
// -- Level-3 micro-kernel function types --------------------------------------
//

// Each GENTDEF below typedefs a function-pointer type whose name is formed by
// PASTECH3(ch,opname,_ukr,tsuf). All operand pointers are restrict-qualified.

// gemm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

INSERT_GENTDEF( gemm )

// gemmtrsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a1x, \
       ctype*     restrict a11, \
       ctype*     restrict bx1, \
       ctype*     restrict b11, \
       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

INSERT_GENTDEF( gemmtrsm )

// trsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

INSERT_GENTDEF( trsm )

#endif

// end bli_l3_ft_ukr.h
// begin bli_l3_oft.h

#ifndef BLIS_L3_OFT_H
#define BLIS_L3_OFT_H

//
// -- Level-3 object function types --------------------------------------------
//

// Each GENTDEF( op ) here typedefs the function-pointer type
// PASTECH(op,_oft); the operand list mirrors the corresponding object API
// with cntx/rntm appended.

// gemm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  obj_t*  alpha, \
  obj_t*  a, \
  obj_t*  b, \
  obj_t*  beta, \
  obj_t*  c, \
  cntx_t* cntx, \
  rntm_t* rntm  \
);

GENTDEF( gemm )
GENTDEF( gemmt )
GENTDEF( her2k )
GENTDEF( syr2k )

// hemm, symm, trmm3

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  side_t  side, \
  obj_t*  alpha, \
  obj_t*  a, \
  obj_t*  b, \
  obj_t*  beta, \
  obj_t*  c, \
  cntx_t* cntx, \
  rntm_t* rntm  \
);

GENTDEF( hemm )
GENTDEF( symm )
GENTDEF( trmm3 )

// herk, syrk

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  obj_t*  alpha, \
  obj_t*  a, \
  obj_t*  beta, \
  obj_t*  c, \
  cntx_t* cntx, \
  rntm_t* rntm  \
);

GENTDEF( herk )
GENTDEF( syrk )

// trmm, trsm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  side_t  side, \
  obj_t*  alpha, \
  obj_t*  a, \
  obj_t*  b, \
  cntx_t* cntx, \
  rntm_t* rntm  \
);

GENTDEF( trmm )
GENTDEF( trsm )

#endif

// end bli_l3_oft.h
// begin bli_l3_oft_var.h

#ifndef BLIS_L3_OFT_VAR_H
#define BLIS_L3_OFT_VAR_H

//
// -- Level-3 variant function types -------------------------------------------
//

// Instantiated once, for "l3": typedefs PASTECH(l3,_var_oft) with the same
// parameter list as bli_l3_packa/bli_l3_packb.
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
( \
  obj_t*  a, \
  obj_t*  b, \
  obj_t*  c, \
  cntx_t* cntx, \
  rntm_t* rntm, \
  cntl_t* cntl, \
  thrinfo_t* thread  \
);

GENTDEF( l3 )

#endif

// end bli_l3_oft_var.h

// begin bli_l3_blocksize.h

dim_t bli_l3_determine_kc
      (
        dir_t    direct,
        dim_t    i,
        dim_t    dim,
        obj_t*   a,
        obj_t*   b,
        bszid_t  bszid,
        cntx_t*  cntx,
        cntl_t*  cntl
      );

// Per-operation variants that take the direction as a parameter.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
      ( \
         dir_t    direct, \
         dim_t    i, \
         dim_t    dim, \
         obj_t*   a, \
         obj_t*   b, \
         bszid_t  bszid, \
         cntx_t*  cntx  \
      );

GENPROT( gemm_determine_kc )
GENPROT( gemmt_determine_kc )
GENPROT( trmm_determine_kc )
GENPROT( trsm_determine_kc )

// Per-operation "_f" and "_b" variants; these take no direction parameter.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
      ( \
         dim_t    i, \
         dim_t    dim, \
         obj_t*   a, \
         obj_t*   b, \
         bszid_t  bszid, \
         cntx_t*  cntx  \
      );

GENPROT( gemm_determine_kc_f )
GENPROT( gemm_determine_kc_b )

GENPROT( gemmt_determine_kc_f )
GENPROT( gemmt_determine_kc_b )

GENPROT( trmm_determine_kc_f )
GENPROT( trmm_determine_kc_b )

GENPROT( trsm_determine_kc_f )
GENPROT( trsm_determine_kc_b )

// end bli_l3_blocksize.h
// begin bli_l3_direct.h

dir_t bli_l3_direct
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// Per-operation direction queries; unlike bli_l3_direct they take no cntl.
#undef GENPROT
#define GENPROT( opname ) \
\
dir_t PASTEMAC0(opname) \
     ( \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  c  \
     );

GENPROT( gemm_direct )
GENPROT( gemmt_direct )
GENPROT( trmm_direct )
GENPROT( trsm_direct )

// end bli_l3_direct.h
// begin bli_l3_prune.h

// Declares one l3_prune_unref_mparts_<dim> function per dimension (m, n, k).
#undef GENPROT
#define GENPROT( dim ) \
\
void PASTEMAC(l3_prune_unref_mparts_,dim) \
     ( \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  c, \
       cntl_t* cntl  \
     );

GENPROT( m )
GENPROT( n )
GENPROT( k )

// -----------------------------------------------------------------------------

// Declares <opname>_prune_unref_mparts_<dim> for each operation/dimension
// pair listed below.
#undef GENPROT
#define GENPROT( opname, dim ) \
\
void PASTEMAC2(opname,_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c  \
     );

GENPROT( gemm, m )
GENPROT( gemm, n )
GENPROT( gemm, k )

GENPROT( gemmt, m )
GENPROT( gemmt, n )
GENPROT( gemmt, k )

GENPROT( trmm, m )
GENPROT( trmm, n )
GENPROT( trmm, k )

GENPROT( trsm, m )
GENPROT( trsm, n )
GENPROT( trsm, k )

// end bli_l3_prune.h
// begin bli_l3_schema.h

void bli_l3_set_schemas
     (
       obj_t*  a,
       obj_t*  b,
       obj_t*  c,
       cntx_t* cntx
     );

// end bli_l3_schema.h

// Prototype object APIs (basic and expert).
// begin bli_l3_oapi.h

//
// Prototype object-based interfaces (basic).
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h

//
// Prototype BLAS-like interfaces with typed operands (basic).
//

// Every template in this section declares an exported (BLIS_EXPORT_BLIS)
// void function named by PASTEMAC(ch,opname). Matrix operands are passed as
// a pointer followed by a row stride (rs_*) and a column stride (cs_*).

// gemm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROT_BASIC0( gemm )

// hemm, symm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       conj_t  conja, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )

// herk: alpha and beta use ctype_r rather than ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       dim_t    m, \
       dim_t    k, \
       ctype_r* alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROTR_BASIC0( herk )

// her2k: only beta uses ctype_r; alpha is ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       trans_t  transb, \
       dim_t    m, \
       dim_t    k, \
       ctype*   alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype*   b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROTR_BASIC0( her2k )

// syrk
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROT_BASIC0( syrk )

// gemmt, syr2k
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )

// trmm3
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c  \
     );

INSERT_GENTPROT_BASIC0( trmm3 )

// trmm, trsm: no beta and no c operand.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b  \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi.h
// begin bli_l3_tapi_ex.h

//
// Prototype BLAS-like interfaces with typed operands (expert).
//

// Expert typed API: each template declares an exported void function named
// by PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) (suffix macro defined elsewhere).
// Every parameter list ends with cntx_t* cntx and rntm_t* rntm.

// gemm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( gemm )

// hemm, symm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       conj_t  conja, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )

// herk: alpha and beta use ctype_r rather than ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       dim_t    m, \
       dim_t    k, \
       ctype_r* alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( herk )

// her2k: only beta uses ctype_r; alpha is ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t   uploc, \
       trans_t  transa, \
       trans_t  transb, \
       dim_t    m, \
       dim_t    k, \
       ctype*   alpha, \
       ctype*   a, inc_t rs_a, inc_t cs_a, \
       ctype*   b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( her2k )

// syrk
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( syrk )

// gemmt, syr2k
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t  uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t   m, \
       dim_t   k, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )

// trmm3
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       trans_t transb, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       ctype*  beta, \
       ctype*  c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( trmm3 )

// trmm, trsm: no beta and no c operand.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t  side, \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  b, inc_t rs_b, inc_t cs_b, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi_ex.h

// Define function types for small/unpacked handlers/kernels.
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
#undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_gemm_md_c2r_ref.h // Define a local struct type that makes returning two values easier. typedef struct mddm_s { dom_t comp; dom_t exec; } mddm_t; void bli_gemm_md ( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_local, cntx_t** cntx ); mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); // ----------------------------------------------------------------------------- void bli_gemm_md_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); void bli_gemm_md_zgemm ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary 
if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crr is already unconditionally associated with an // execution domain of BLIS_REAL.) if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_REAL ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since ccr is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) if ( bli_obj_is_complex( c ) && bli_obj_is_complex( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crc is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) 
if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_complex( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemm_md_ker_var2_recast ( num_t* dt_comp, num_t dt_a, num_t dt_b, num_t* dt_c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, obj_t* c, inc_t* rs_c, inc_t* cs_c ) { if ( bli_is_real( *dt_c ) && bli_is_complex( dt_a ) && bli_is_complex( dt_b ) ) { // The rcc case is executed with a real macrokernel, so we need to // double the k dimension (because both A and B are packed to the 1r // schema), and also the panel strides of A and B since they were // packed as complex matrices and we now need to convert them to // units of real elements. *k *= 2; *ps_a *= 2; *ps_b *= 2; } else if ( bli_is_complex( *dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_row_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *n *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; } else #endif { // Generally speaking, the crc case is executed with a complex // macrokernel, so we need to halve the panel stride of A (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. 
*ps_a /= 2; } } else if ( bli_is_complex( *dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_col_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *m *= 2; *pd_a *= 2; *ps_a *= 2; *cs_c *= 2; } else #endif { // Generally speaking, the ccr case is executed with a complex // macrokernel, so we need to halve the panel stride of B (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. *ps_b /= 2; } } #if 0 else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. //printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k ); } else if ( bli_is_complex( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { // No action needed. 
} #endif } // end bli_gemm_md.h #endif // end bli_gemm.h // begin bli_hemm.h // begin bli_hemm_front.h void bli_hemm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_hemm_front.h // end bli_hemm.h // begin bli_symm.h // begin bli_symm_front.h void bli_symm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_symm_front.h // end bli_symm.h // begin bli_trmm.h // begin bli_trmm_front.h void bli_trmm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm_front.h // begin bli_trmm_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); //GENPROT( trmm_blk_var1 ) //GENPROT( trmm_blk_var2 ) //GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) GENPROT( trmm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
//

// Template for the typed gemmt macrokernel prototypes. Unlike the trmm/trsm
// templates, A and B each carry an additional inc_t (is_a, is_b).
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffc, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

// end bli_gemmt_var.h
// end bli_gemmt.h
// end bli_l3.h

// -- Utility operations --

// begin bli_util.h
// begin bli_util_check.h

//
// Prototype object-based check functions.
//

// Every template in this section declares a void function whose name is
// produced by PASTEMAC(opname,_check). GENPROT is redefined before each
// group because the argument lists differ.

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  asum  \
     );

GENPROT( asumv )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x  \
     );

GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm  \
     );

GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm  \
     );

GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x  \
     );

GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  scale, \
       obj_t*  sumsq  \
     );

GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// NOTE(review): this one template is named GENTPROT (taking only opname)
// while every neighbouring check template is named GENPROT. It is
// self-contained as written; later users #undef GENTPROT before redefining
// it.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  chi, \
       obj_t*  psi, \
       bool*   is_eq  \
     );

GENTPROT( eqsc )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// -----------------------------------------------------------------------------

// Directly declared (non-templated) check helpers.

void bli_utilv_xi_check
     (
       obj_t*  x,
       obj_t*  index
     );

void bli_utilv_xa_check
     (
       obj_t*  x,
       obj_t*  asum
     );

void bli_utilm_mkhst_check
     (
       obj_t*  a
     );

void bli_utilv_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_fprint_check
     (
       FILE*  file,
       char*  s1,
       obj_t* x,
       char*  format,
       char*  s2
     );

void bli_utilm_rand_check
     (
       obj_t*  x
     );

void bli_utilv_sumsqv_check
     (
       obj_t*  x,
       obj_t*  scale,
       obj_t*  sumsq
     );

// end bli_util_check.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_util_oapi.h

//
// Prototype object-based interfaces.
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes inside this guard use PASTEMAC0(opname) (no suffix, no
// expert parameters) and are emitted only while BLIS_OAPI_BASIC is defined.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2  \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h
// begin bli_oapi_ba.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes inside this guard use PASTEMAC0(opname) (no suffix, no
// expert parameters) and are emitted only while BLIS_OAPI_BASIC is defined.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2  \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_util_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Every template in this section declares an exported void function whose
// name is produced by PASTEMAC2(ch,opname,EX_SUF). BLIS_TAPI_EX_PARAMS is
// spliced in after the last listed argument. The GENTPROTR templates take an
// additional ctype_r used for the asum/norm/scale/sumsq outputs.
// NOTE(review): the INSERT_GENTPROT*_BASIC0 macros are defined elsewhere;
// presumably they instantiate the template once per basic datatype --
// confirm.

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( asumv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( mkherm )
INSERT_GENTPROT_BASIC0( mksymm )
INSERT_GENTPROT_BASIC0( mktrim )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( norm1v )
INSERT_GENTPROTR_BASIC0( normfv )
INSERT_GENTPROTR_BASIC0( normiv )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( norm1m )
INSERT_GENTPROTR_BASIC0( normfm )
INSERT_GENTPROTR_BASIC0( normim )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( randv )
INSERT_GENTPROT_BASIC0( randnv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( randm )
INSERT_GENTPROT_BASIC0( randnm )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.

// The prototypes inside this guard use PASTEMAC(ch,opname) (no suffix, no
// expert parameters) and are emitted only while BLIS_TAPI_BASIC is defined.
#ifdef BLIS_TAPI_BASIC

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqsc )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqm )

// The two print templates take x as void* and are instantiated through
// INSERT_GENTPROT_BASIC0_I rather than INSERT_GENTPROT_BASIC0.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  n, \
       void*  x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       void*  x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//

// Every template below typedefs a pointer to a void function. The typedef
// name is produced by PASTECH3(ch,opname,EX_SUF,tsuf) in this section.

// asumv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// (This and the fprintm type below do not splice in BLIS_TAPI_EX_PARAMS.)

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.

// The function types inside this guard are named with
// PASTECH2(ch,opname,tsuf) (no EX_SUF) and are emitted only while
// BLIS_TAPI_BASIC is defined.
#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 
INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.
// These prototypes use PASTEMAC(ch,opname) with neither EX_SUF nor
// BLIS_TAPI_EX_PARAMS, and are emitted only while BLIS_TAPI_BASIC is defined.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqsc )

// eqv

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqv )

// eqm

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqm )

// printv (note the operand is passed as void*, and a different INSERT_ macro,
// INSERT_GENTPROT_BASIC0_I, is used for the print operations)

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  n, \
       void*  x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printv )

// printm

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       void*  x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h


//
// -- Utility function types ---------------------------------------------------
//

// Each GENTDEF/GENTDEFR below declares a function-pointer typedef whose name
// is pasted together from ch, opname, EX_SUF and tsuf via PASTECH3, and whose
// parameter list mirrors the corresponding prototype in bli_util_tapi.h.

// asumv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv (no BLIS_TAPI_EX_PARAMS in this signature)

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintv )

// fprintm (no BLIS_TAPI_EX_PARAMS in this signature)

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.
// These typedef names are pasted with PASTECH2(ch,opname,tsuf), i.e. without
// EX_SUF.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_util_fpa.h


//
// Prototype function pointer query interface.
//

// GENPROT( opname ) declares one query function per operation: it takes a
// num_t and returns the type named <opname><BLIS_TAPI_EX_SUF>_vft. Unlike the
// typed prototypes, the suffix here comes from BLIS_TAPI_EX_SUF, not EX_SUF
// (EX_SUF has been #undef'd by bli_xapi_undef.h at this point).

#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( asumv )
GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )
GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )
GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )
GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )
GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// The operations below have no suffixed variant, so the query function and
// its return type are pasted without BLIS_TAPI_EX_SUF.

#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )
GENPROT( fprintv )
GENPROT( fprintm )
//GENPROT( printv )
//GENPROT( printm )

// end bli_util_fpa.h

// Prototype level-1m implementations.
// begin bli_util_unb_var1.h


//
// Prototype BLAS-like interfaces with typed operands.
//

// The *_unb_var1 prototypes below spell out cntx_t* and rntm_t* parameters
// explicitly instead of using BLIS_TAPI_EX_PARAMS, and (except for fprintv and
// fprintm at the end) carry no BLIS_EXPORT_BLIS annotation.

// asumv_unb_var1

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( asumv_unb_var1 )

// mkherm_unb_var1, mksymm_unb_var1, mktrim_unb_var1

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( mkherm_unb_var1 )
INSERT_GENTPROT_BASIC0( mksymm_unb_var1 )
INSERT_GENTPROT_BASIC0( mktrim_unb_var1 )

// norm1v_unb_var1, normfv_unb_var1, normiv_unb_var1

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfv_unb_var1 )
INSERT_GENTPROTR_BASIC0( normiv_unb_var1 )

// norm1m_unb_var1, normfm_unb_var1, normim_unb_var1

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfm_unb_var1 )
INSERT_GENTPROTR_BASIC0( normim_unb_var1 )

// randv_unb_var1, randnv_unb_var1

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( randv_unb_var1 )
INSERT_GENTPROT_BASIC0( randnv_unb_var1 )

// randm_unb_var1, randnm_unb_var1

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( randm_unb_var1 )
INSERT_GENTPROT_BASIC0( randnm_unb_var1 )

// sumsqv_unb_var1

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 )

// -----------------------------------------------------------------------------

// eqv_unb_var1: returns bool directly, whereas the typed-API eqv prototype
// reports its result through a bool* is_eq out-parameter.

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy  \
     );

INSERT_GENTPROT_BASIC0( eqv_unb_var1 )

// eqm_unb_var1: likewise returns bool directly.

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
     );

INSERT_GENTPROT_BASIC0( eqm_unb_var1 )

// fprintv (exported; instantiated via INSERT_GENTPROT_BASIC0_I)

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( fprintv )

// fprintm (exported; instantiated via INSERT_GENTPROT_BASIC0_I)

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( fprintm )

// end bli_util_unb_var1.h
// end bli_util.h


// -- addon definitions --

// NOTE: These definitions should not be included much earlier since an addon
// may wish to utilize other types and definitions provided by BLIS.
// begin bli_addon.h


#ifndef BLIS_ADDON_H
#define BLIS_ADDON_H

// The "#if 0" selects the #else branch unconditionally, so this build defines
// BLIS_DISABLE_ADDONS and never BLIS_ENABLE_ADDONS.
#if 0
#define BLIS_ENABLE_ADDONS
#else
#define BLIS_DISABLE_ADDONS
#endif

// Enabled addons
// (none are listed in this build)


#endif
// end bli_addon.h

// -- sandbox implementation --

// begin bli_sbox.h


#ifndef BLIS_SBOX_H
#define BLIS_SBOX_H

// Each sandbox must have a bli_sandbox.h file present somewhere inside.
// If a sandbox was enabled at configure-time, we need to #include its
// header file here so that it will get pulled into blis.h when it is
// flattened into a monolithic header.
#ifdef BLIS_ENABLE_SANDBOX
#include "bli_sandbox.h" // skipped
#endif

#endif
// end bli_sbox.h


// -- BLAS compatibility layer --

// begin bli_blas.h


// If the CBLAS compatibility layer was enabled while the BLAS layer
// was not enabled, we must enable it here.
#ifdef BLIS_ENABLE_CBLAS
#ifndef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS
#endif
#endif // BLIS_ENABLE_CBLAS

// By default, if the BLAS compatibility layer is enabled, we define
// (include) all of the BLAS prototypes. However, if the user is
// #including "blis.h" and also #including another header that also
// declares the BLAS functions, then we provide an opportunity to
// #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below).
#ifdef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS_DEFS
#else
#undef  BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the BLAS test drivers are being
// compiled.
#ifdef BLIS_VIA_BLASTEST
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the environment has defined the
// macro BLIS_DISABLE_BLAS_DEFS.
#ifdef BLIS_DISABLE_BLAS_DEFS
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Begin including all BLAS prototypes.
// Everything from here on is compiled only when BLIS_ENABLE_BLAS_DEFS is
// defined; the matching #endif is further down the file.
#ifdef BLIS_ENABLE_BLAS_DEFS


// -- System headers needed by BLAS compatibility layer --

// NOTE(review): the header name is missing from the directive below as seen
// here (presumably an angle-bracketed system header lost during extraction).
// Confirm against the upstream header before compiling this text.
#include // skipped


// -- Constants --

// Size of the func_str buffers declared by the bla_*_check macros later in
// this header (presumably 7 characters plus a terminating NUL -- confirm).
#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)


// -- Utility macros --

// Each helper header below contributes a single prototype, guarded by
// BLIS_ENABLE_BLAS. None of these helpers carries an export annotation.

// begin bla_r_sign.h


#ifdef BLIS_ENABLE_BLAS

double bla_r_sign(const bla_real *a, const bla_real *b);

#endif
// end bla_r_sign.h
// begin bla_d_sign.h


#ifdef BLIS_ENABLE_BLAS

double bla_d_sign(const bla_double *a, const bla_double *b);

#endif
// end bla_d_sign.h

// begin bla_r_cnjg.h


#ifdef BLIS_ENABLE_BLAS

void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src);

#endif
// end bla_r_cnjg.h
// begin bla_d_cnjg.h


#ifdef BLIS_ENABLE_BLAS

void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src);

#endif
// end bla_d_cnjg.h

// begin bla_r_imag.h


#ifdef BLIS_ENABLE_BLAS

bla_real bla_r_imag(const bla_scomplex *z);

#endif
// end bla_r_imag.h
// begin bla_d_imag.h


#ifdef BLIS_ENABLE_BLAS

double bla_d_imag(const bla_dcomplex *z);

#endif
// end bla_d_imag.h

// begin bla_c_div.h


#ifdef BLIS_ENABLE_BLAS

void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp);

#endif
// end bla_c_div.h
// begin bla_z_div.h


#ifdef BLIS_ENABLE_BLAS

void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp);

#endif
// end bla_z_div.h

// begin bla_f__cabs.h


#ifdef BLIS_ENABLE_BLAS

double bla_f__cabs(double real, double imag);

#endif
// end bla_f__cabs.h
// begin bla_r_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_r_abs(const bla_real *x);

#endif
// end bla_r_abs.h
// begin bla_d_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_d_abs(const bla_double *x);

#endif
// end bla_d_abs.h
// begin bla_c_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_c_abs(const bla_scomplex *z);

#endif
// end bla_c_abs.h
// begin bla_z_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_z_abs(const bla_dcomplex *z);

#endif
// end bla_z_abs.h

// begin bla_lsame.h


#ifdef BLIS_ENABLE_BLAS

// lsame is declared with long arguments and return type when LAPACK_ILP64 is
// defined, and with int otherwise.
// NOTE(review): only the non-ILP64 declaration carries BLIS_EXPORT_BLAS --
// confirm whether the ILP64 variant is intentionally left unexported.
#ifdef LAPACK_ILP64
long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
#else
BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
#endif

#endif
// end bla_lsame.h
// begin bla_xerbla.h


#ifdef BLIS_ENABLE_BLAS

// xerbla is additionally tagged BLIS_OVERRIDABLE (defined elsewhere).
BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);

#endif
// end bla_xerbla.h
// begin bla_xerbla_array.h


#ifdef BLIS_ENABLE_BLAS

// Note that srname_len is passed by value here, while info is passed by
// pointer.
BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);

#endif
// end bla_xerbla_array.h


// -- Level-0 BLAS prototypes --

// begin bla_cabs1.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS bla_real   PASTEF77(s,cabs1)(bla_scomplex *z);
BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);

#endif
// end bla_cabs1.h


// -- Level-1 BLAS prototypes --

// In the level-1 headers below the generator macro is always (re)defined, but
// it is only instantiated when BLIS_ENABLE_BLAS is defined. All arguments are
// passed by pointer.

// begin bla_amax.h



//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end bla_amax.h
// begin bla_asum.h



//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end bla_asum.h
// begin bla_axpy.h



//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
             ftype*   y, const f77_int* incy  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif

// end bla_axpy.h
// begin bla_copy.h



//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( copy ) #endif // end bla_copy.h // begin bla_dot.h #ifdef BLIS_ENABLE_BLAS // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTR_BLAS( dot ) #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL INSERT_GENTPROTDOTC_BLAS( dot ) #else // For the "intel" complex return type, we use a hidden parameter (passed by // address) to return the result. #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \ ( \ ftype* rhop, \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTC_BLAS( dot ) #endif // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS float PASTEF77(sd,sdot) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); BLIS_EXPORT_BLAS double PASTEF77(d,sdot) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); #endif // end bla_dot.h // begin bla_nrm2.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end bla_nrm2.h // begin bla_rot.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s); #endif // end bla_rot.h // begin bla_rotg.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s); BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s); #endif // end bla_rotg.h // begin bla_rotm.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam); BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam); #endif // end bla_rotm.h // begin bla_rotmg.h 
#ifdef BLIS_ENABLE_BLAS

// Only real (s, d) variants are declared. The fourth argument (sy1/dy1) is the
// only const one; the parameter array is writable here.
BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam);
BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);

#endif
// end bla_rotmg.h
// begin bla_scal.h



//
// Prototype BLAS-to-BLIS interfaces.
//
// scal: the scalar type (ftype_a) and vector type (ftype_x) are separate macro
// parameters, and the name is pasted in the order chx, cha, blasname.
#undef  GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
     ( \
       const f77_int* n, \
       const ftype_a* alpha, \
       ftype_x* x, const f77_int* incx  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif

// end bla_scal.h
// begin bla_swap.h



//
// Prototype BLAS-to-BLIS interfaces.
//
// swap: both vectors are non-const.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       ftype*   x, const f77_int* incx, \
       ftype*   y, const f77_int* incy  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif

// end bla_swap.h

// begin f77_amax_sub.h



//
// Prototype CBLAS subroutine wrapper interfaces.
//
// The *_sub wrappers return void and deliver the result through a trailing
// rval out-parameter; their names get an extra "sub" piece via PASTEF773.
// They are instantiated only when BLIS_ENABLE_CBLAS is defined.
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       f77_int* rval  \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end f77_amax_sub.h
// begin f77_asum_sub.h



//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       ftype_r* rval  \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end f77_asum_sub.h
// begin f77_dot_sub.h



//
// Prototype CBLAS subroutine wrapper interfaces.
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTDOT_BLAS( dot ) // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval ); BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* rval ); #endif // end f77_dot_sub.h // begin f77_nrm2_sub.h // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end f77_nrm2_sub.h // -- Level-2 BLAS prototypes -- // dense // begin bla_gemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemv ) #endif // end bla_gemv.h // begin bla_ger.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, chxy, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \ ( \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTDOT_BLAS( ger ) #endif // end bla_ger.h // begin bla_hemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemv ) #endif // end bla_hemv.h // begin bla_her.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype_r* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her ) #endif // end bla_her.h // begin bla_her2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2 ) #endif // end bla_her2.h // begin bla_symv.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( symv ) #endif // end bla_symv.h // begin bla_syr.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr ) #endif // end bla_syr.h // begin bla_syr2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr2 ) #endif // end bla_syr2.h // begin bla_trmv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmv ) #endif // end bla_trmv.h // begin bla_trsv.h // // Prototype BLAS-to-BLIS interfaces. 
//
// trsv: same argument list as the trmv prototype template. a is read-only and
// x is the only writable argument.

#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const ftype*    a, const f77_int* lda, \
             ftype*    x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif
// end bla_trsv.h

// begin bla_gemv_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_gemv_check: argument validation for the gemv wrappers.
// Expands to a brace-enclosed block in the caller. transa is compared against
// "N", "T" and "C" with lsame. m, n, lda, incx and incy are dereferenced, so
// they must be pointers. The first failing test sets info (1, 2, 3, 6, 8 or
// 11). When info is nonzero the routine name is formatted from dt_str and
// op_str, upper-cased, and passed to xerbla together with &info; the macro
// then executes `return;` in the enclosing function, which is therefore
// expected to return void.
// NOTE(review): the info values presumably are the 1-based argument positions
// of the gemv routine; its prototype is not visible here, so confirm.
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
    f77_int info = 0; \
    f77_int nota, ta, conja; \
\
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !nota && !ta && !conja ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *n < 0 ) \
        info = 3; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 6; \
    else if ( *incx == 0 ) \
        info = 8; \
    else if ( *incy == 0 ) \
        info = 11; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemv_check.h
// begin bla_ger_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_ger_check: argument validation for the ger wrappers.
// Checks *m < 0 (info 1), *n < 0 (info 2), *incx == 0 (info 5), *incy == 0
// (info 7) and *lda < max(1,*m) (info 9), in that order. The reported routine
// name is dt_str, op_str and conj_str concatenated, with conj_str left-justified
// in a field of width 2, then upper-cased. On failure xerbla is called and the
// macro executes `return;` in the enclosing function.
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
    f77_int info = 0; \
\
    if      ( *m < 0 ) \
        info = 1; \
    else if ( *n < 0 ) \
        info = 2; \
    else if ( *incx == 0 ) \
        info = 5; \
    else if ( *incy == 0 ) \
        info = 7; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 9; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
\
        sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_ger_check.h
// begin bla_hemv_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_hemv_check: argument validation for the hemv wrappers (also reused for
// symv through the bla_symv_check alias further down).
// uploa must match "L" or "U" (info 1); then *m < 0 (info 2),
// *lda < max(1,*m) (info 5), *incx == 0 (info 7), *incy == 0 (info 10).
// On failure xerbla is called and the macro executes `return;` in the
// enclosing function.
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 5; \
    else if ( *incx == 0 ) \
        info = 7; \
    else if ( *incy == 0 ) \
        info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_hemv_check.h
// begin bla_her_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_her_check: argument validation for the her wrappers (also reused for
// syr through the bla_syr_check alias further down).
// uploa must match "L" or "U" (info 1); then *m < 0 (info 2),
// *incx == 0 (info 5), *lda < max(1,*m) (info 7).
// On failure xerbla is called and the macro executes `return;` in the
// enclosing function.
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *incx == 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 7; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her_check.h
// begin bla_her2_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_her2_check: argument validation for the her2 wrappers (also reused for
// syr2 through the bla_syr2_check alias further down).
// uploa must match "L" or "U" (info 1); then *m < 0 (info 2),
// *incx == 0 (info 5), *incy == 0 (info 7), *lda < max(1,*m) (info 9).
// On failure xerbla is called and the macro executes `return;` in the
// enclosing function.
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *incx == 0 ) \
        info = 5; \
    else if ( *incy == 0 ) \
        info = 7; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 9; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her2_check.h
// begin bla_symv_check.h


#ifdef BLIS_ENABLE_BLAS

// symv uses the same checks as hemv.
#define bla_symv_check bla_hemv_check

#endif
// end bla_symv_check.h
// begin bla_syr_check.h


#ifdef BLIS_ENABLE_BLAS

// syr uses the same checks as her.
#define bla_syr_check bla_her_check

#endif
// end bla_syr_check.h
// begin bla_syr2_check.h


#ifdef BLIS_ENABLE_BLAS

// syr2 uses the same checks as her2.
#define bla_syr2_check bla_her2_check

#endif
// end bla_syr2_check.h
// begin bla_trmv_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_trmv_check: argument validation for the trmv wrappers (also reused for
// trsv through the bla_trsv_check alias below).
// uploa must match "L" or "U" (info 1), transa must match "N", "T" or "C"
// (info 2), diaga must match "U" or "N" (info 3); then *m < 0 (info 4),
// *lda < max(1,*m) (info 6), *incx == 0 (info 8).
// On failure xerbla is called and the macro executes `return;` in the
// enclosing function.
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
    f77_int nota, ta, conja; \
    f77_int unita, nonua; \
\
    lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !ta && !conja ) \
        info = 2; \
    else if ( !unita && !nonua ) \
        info = 3; \
    else if ( *m < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 6; \
    else if ( *incx == 0 ) \
        info = 8; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_trmv_check.h
// begin bla_trsv_check.h


#ifdef BLIS_ENABLE_BLAS

// trsv uses the same checks as trmv.
#define bla_trsv_check bla_trmv_check

#endif
// end bla_trsv_check.h

// packed
// Unlike the macro-generated prototypes above, the packed-storage routines
// below are declared explicitly per datatype prefix, use the bla_* typedefs,
// and return int.
// begin bla_hpmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *ap, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hpmv.h
// begin bla_hpr.h


#ifdef BLIS_ENABLE_BLAS

// hpr: alpha is bla_real / bla_double while x and ap are complex.
BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_scomplex *x, const bla_integer *incx, bla_scomplex *ap);

BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap);

#endif
// end bla_hpr.h
// begin bla_hpr2.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *y, const bla_integer *incy, bla_scomplex *ap);

BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap);

#endif
// end bla_hpr2.h
// begin bla_spmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *ap, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_spmv.h
// begin bla_spr.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, bla_double *ap);

BLIS_EXPORT_BLAS int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap);

#endif
// end bla_spr.h
// begin bla_spr2.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, const bla_double *y, const bla_integer *incy, bla_double *ap);

BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, const bla_real *y, const bla_integer *incy, bla_real *ap);

#endif
// end bla_spr2.h
// begin bla_tpmv.h


#ifdef BLIS_ENABLE_BLAS

// tpmv: ap is read-only; x is the only writable argument.
BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpmv.h
// begin bla_tpsv.h


#ifdef BLIS_ENABLE_BLAS

// tpsv: same argument list as tpmv.
BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpsv.h

// banded
// The banded routines are likewise declared explicitly per datatype prefix
// and return int.
// begin bla_gbmv.h


#ifdef BLIS_ENABLE_BLAS

// gbmv: kl and ku are passed in addition to m and n; y is the only writable
// argument.
BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer * incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex * y, const bla_integer *incy);

#endif
// end bla_gbmv.h
// begin bla_hbmv.h
// Banded level-2 prototypes (hbmv, sbmv, tbmv, tbsv). Each routine is declared
// explicitly per datatype prefix, returns int, takes every argument by
// pointer, and is only declared when BLIS_ENABLE_BLAS is defined.

#ifdef BLIS_ENABLE_BLAS

// hbmv: declared for the c and z prefixes only; y is the only writable argument.
BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hbmv.h
// begin bla_sbmv.h


#ifdef BLIS_ENABLE_BLAS

// sbmv: declared for the d and s prefixes only; y is the only writable argument.
BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_sbmv.h
// begin bla_tbmv.h


#ifdef BLIS_ENABLE_BLAS

// tbmv: a is read-only; x is the only writable argument.
BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbmv.h
// begin bla_tbsv.h


#ifdef BLIS_ENABLE_BLAS

// tbsv: same argument list as tbmv.
BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbsv.h


// -- Level-3 BLAS prototypes --

// begin bla_gemm.h


//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemm ) #endif // end bla_gemm.h // begin bla_hemm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemm ) #endif // end bla_hemm.h // begin bla_herk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype_r* alpha, \ const ftype* a, const f77_int* lda, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( herk ) #endif // end bla_herk.h // begin bla_her2k.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2k ) #endif // end bla_her2k.h // begin bla_symm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( symm ) #endif // end bla_symm.h // begin bla_syrk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syrk ) #endif // end bla_syrk.h // begin bla_syr2k.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syr2k ) #endif // end bla_syr2k.h // begin bla_trmm.h // // Prototype BLAS-to-BLIS interfaces. 
//
// trmm: a is read-only and b is the only writable argument. Instantiated by
// INSERT_GENTPROT_BLAS (defined elsewhere) only when BLIS_ENABLE_BLAS is
// defined.

#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
             ftype*    b, const f77_int* ldb \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmm )
#endif
// end bla_trmm.h
// begin bla_trsm.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// trsm: same argument list as the trmm prototype template.

#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
             ftype*    b, const f77_int* ldb \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsm )
#endif
// end bla_trsm.h

// begin bla_gemm_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_gemm_check: argument validation for the gemm wrappers.
// Expands to a brace-enclosed block in the caller. transa/transb are compared
// against "N", "C" and "T" with lsame; m, n, k, lda, ldb and ldc are
// dereferenced, so they must be pointers. nrowa is *m when transa is "N" and
// *k otherwise; nrowb is *k when transb is "N" and *n otherwise. The first
// failing test sets info (1, 2, 3, 4, 5, 8, 10 or 13), which matches the
// 1-based argument positions of the gemm prototype. When info is nonzero the
// routine name is formatted from dt_str and op_str, upper-cased, and passed to
// xerbla together with &info; the macro then executes `return;` in the
// enclosing function, which is therefore expected to return void.
#define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota,  notb; \
    f77_int conja, conjb; \
    f77_int ta,    tb; \
    f77_int nrowa, nrowb; \
\
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else        { nrowb = *n; } \
\
    if      ( !nota && !conja && !ta ) \
        info = 1; \
    else if ( !notb && !conjb && !tb ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *n < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m    ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemm_check.h
// begin bla_hemm_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_hemm_check: argument validation for the hemm wrappers (also reused for
// symm through the bla_symm_check alias further down).
// sidea must match "L" or "R" (info 1) and uploa "L" or "U" (info 2); then
// *m < 0 (info 3), *n < 0 (info 4), *lda < max(1,nrowa) (info 7),
// *ldb < max(1,*m) (info 9), *ldc < max(1,*m) (info 12), where nrowa is *m
// for side "L" and *n otherwise. On failure xerbla is called and the macro
// executes `return;` in the enclosing function.
#define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int left, right; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    left  = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
    right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( left ) { nrowa = *m; } \
    else        { nrowa = *n; } \
\
    if      ( !left && !right ) \
        info = 1; \
    else if ( !lower && !upper ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *n < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldb < bli_max( 1, *m    ) ) \
        info = 9; \
    else if ( *ldc < bli_max( 1, *m    ) ) \
        info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_hemm_check.h
// begin bla_herk_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_herk_check: argument validation for the herk wrappers.
// uploa must match "L" or "U" (info 1) and transa "N" or "C" (info 2); then
// *m < 0 (info 3), *k < 0 (info 4), *lda < max(1,nrowa) (info 7),
// *ldc < max(1,*m) (info 10), where nrowa is *m for "N" and *k otherwise.
// On failure xerbla is called and the macro executes `return;` in the
// enclosing function.
// The uplo character is read through the macro's own uploa parameter. It was
// previously read from an identifier named uploc that is not a macro
// parameter, so the third argument was ignored and the macro compiled only
// when the expansion site had a variable with exactly that name.
#define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, conja; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldc < bli_max( 1, *m    ) ) \
        info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_herk_check.h
// begin bla_her2k_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_her2k_check: argument validation for the her2k wrappers.
// uploa must match "L" or "U" (info 1) and trans "N" or "C" (info 2); then
// *m < 0 (info 3), *k < 0 (info 4), *lda < max(1,nrowa) (info 7),
// *ldb < max(1,nrowa) (info 9), *ldc < max(1,*m) (info 12), where nrowa is
// *m for "N" and *k otherwise. On failure xerbla is called and the macro
// executes `return;` in the enclosing function.
#define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, conja; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldb < bli_max( 1, nrowa ) ) \
        info = 9; \
    else if ( *ldc < bli_max( 1, *m    ) ) \
        info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her2k_check.h
// begin bla_symm_check.h


#ifdef BLIS_ENABLE_BLAS

// symm uses the same checks as hemm.
#define bla_symm_check bla_hemm_check

#endif
// end bla_symm_check.h
// begin bla_syrk_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_syrk_check: argument validation for the syrk wrappers.
// is_r is true when the first character of dt_str is 's' or 'd'. uploa must
// match "L" or "U" (info 1). transa must match "N" or "T"; "C" is additionally
// accepted only when is_r is true (info 2). Then *m < 0 (info 3), *k < 0
// (info 4), *lda < max(1,nrowa) (info 7), *ldc < max(1,*m) (info 10), where
// nrowa is *m for "N" and *k otherwise. On failure xerbla is called and the
// macro executes `return;` in the enclosing function.
// The uplo character is read through the macro's own uploa parameter. It was
// previously read from an identifier named uploc that is not a macro
// parameter, so the third argument was ignored and the macro compiled only
// when the expansion site had a variable with exactly that name.
#define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    f77_int info = 0; \
    f77_int is_r; \
    f77_int nota, ta, cta; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    static char* dt_cst = dt_str; \
\
    is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    cta   = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !ta && (is_r ? !cta : 1) ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldc < bli_max( 1, *m    ) ) \
        info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_syrk_check.h
// begin bla_syr2k_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_syr2k_check: argument validation for the syr2k wrappers.
// Same tests as bla_syrk_check, plus *ldb < max(1,nrowa) (info 9); the ldc
// test reports info 12.
#define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int is_r; \
    f77_int nota, ta, cta; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    static char* dt_cst = dt_str; \
\
    is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \
    cta   = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !ta && (is_r ? !cta : 1) ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldb < bli_max( 1, nrowa ) ) \
        info = 9; \
    else if ( *ldc < bli_max( 1, *m    ) ) \
        info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_syr2k_check.h
// begin bla_trmm_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_trmm_check: argument validation for the trmm wrappers (also reused for
// trsm through the bla_trsm_check alias below).
// sidea must match "L" or "R" (info 1), uploa "L" or "U" (info 2), transa
// "N", "T" or "C" (info 3), diaga "U" or "N" (info 4); then *m < 0 (info 5),
// *n < 0 (info 6), *lda < max(1,nrowa) (info 9), *ldb < max(1,*m) (info 11),
// where nrowa is *m for side "L" and *n otherwise. On failure xerbla is
// called and the macro executes `return;` in the enclosing function.
#define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \
{ \
    f77_int info = 0; \
    f77_int left, right; \
    f77_int lower, upper; \
    f77_int nota, ta, conja; \
    f77_int unita, nonua; \
    f77_int nrowa; \
\
    left  = PASTEF770(lsame)( sidea,  "L", (ftnlen)1, (ftnlen)1 ); \
    right = PASTEF770(lsame)( sidea,  "R", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
    if ( left ) { nrowa = *m; } \
    else        { nrowa = *n; } \
\
    if      ( !left && !right ) \
        info = 1; \
    else if ( !lower && !upper ) \
        info = 2; \
    else if ( !nota && !ta && !conja ) \
        info = 3; \
    else if ( !unita && !nonua ) \
        info = 4; \
    else if ( *m < 0 ) \
        info = 5; \
    else if ( *n < 0 ) \
        info = 6; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 9; \
    else if ( *ldb < bli_max( 1, *m    ) ) \
        info = 11; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_trmm_check.h
// begin bla_trsm_check.h


#ifdef BLIS_ENABLE_BLAS

// trsm uses the same checks as trmm.
#define bla_trsm_check bla_trmm_check

#endif
// end bla_trsm_check.h

// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// axpby: x is read-only and y is the only writable argument.

#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
       const ftype*   beta, \
             ftype*   y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif
// end bla_axpby.h

// level-3
// begin bla_gemmt.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// gemmt: takes uploc in addition to transa/transb and has a single m
// dimension (no n); c is the only writable argument.

#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif
// end bla_gemmt.h
// begin bla_gemmt_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_gemmt_check: argument validation for the gemmt wrappers.
// uploc must match "L" or "U" (info 1); transa and transb must each match
// "N", "C" or "T" (info 2 and 3); then *m < 0 (info 4), *k < 0 (info 5),
// *lda < max(1,nrowa) (info 8), *ldb < max(1,nrowb) (info 10),
// *ldc < max(1,*m) (info 13). nrowa is *m for transa "N" and *k otherwise;
// nrowb is *k for transb "N" and *m otherwise. On failure xerbla is called
// and the macro executes `return;` in the enclosing function.
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota,  notb; \
    f77_int conja, conjb; \
    f77_int ta,    tb; \
    f77_int lower, upper; \
    f77_int nrowa, nrowb; \
\
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else        { nrowb = *m; } \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja && !ta ) \
        info = 2; \
    else if ( !notb && !conjb && !tb ) \
        info = 3; \
    else if ( *m < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m    ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// gemm_batch: every gemm argument becomes an *_array pointer, with the
// matrices passed as pointer-to-pointer (a_array, b_array, c_array), followed
// by group_count and group_size.

#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa_array, \
       const f77_char* transb_array, \
       const f77_int*  m_array, \
       const f77_int*  n_array, \
       const f77_int*  k_array, \
       const ftype*    alpha_array, \
       const ftype**   a_array, const f77_int* lda_array, \
       const ftype**   b_array, const f77_int* ldb_array, \
       const ftype*    beta_array, \
             ftype**   c_array, const f77_int* ldc_array, \
       const f77_int*  group_count, \
       const f77_int*  group_size \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif
// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h


//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  n, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( gemm3m )
#endif
// end bla_gemm3m.h
// begin bla_gemm3m_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument checker for gemm3m. Expands to a brace-enclosed block that tests
// the arguments in the order they appear in the gemm3m prototype and records
// the position of the first invalid one in info (transa=1, transb=2, m=3,
// n=4, k=5, lda=8, ldb=10, ldc=13). When info is nonzero, the routine name is
// built from dt_str followed by op_str (left-justified, minimum width 5),
// upper-cased, reported through xerbla, and the enclosing function returns.
// All of m, n, k, lda, ldb and ldc are dereferenced, so they must be pointers.
#define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
	/* Classify the trans characters with lsame. */ \
	const f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	const f77_int notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
	const f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	const f77_int conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
	const f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	const f77_int tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
	/* Row counts used as lower bounds for lda and ldb. */ \
	const f77_int nrowa = ( nota ? *m : *k ); \
	const f77_int nrowb = ( notb ? *k : *n ); \
\
	f77_int info = 0; \
\
	if      ( !nota && !conja && !ta )     { info = 1;  } \
	else if ( !notb && !conjb && !tb )     { info = 2;  } \
	else if ( *m < 0 )                     { info = 3;  } \
	else if ( *n < 0 )                     { info = 4;  } \
	else if ( *k < 0 )                     { info = 5;  } \
	else if ( *lda < bli_max( 1, nrowa ) ) { info = 8;  } \
	else if ( *ldb < bli_max( 1, nrowb ) ) { info = 10; } \
	else if ( *ldc < bli_max( 1, *m ) )    { info = 13; } \
\
	if ( info != 0 ) \
	{ \
		/* Report the failure under the upper-cased routine name. */ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_gemm3m_check.h

// -- Fortran-compatible APIs to BLIS functions --

// begin b77_thread.h

//
// Prototype Fortran-compatible BLIS interfaces.
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); // end b77_thread.h #endif // BLIS_ENABLE_BLAS // end bli_blas.h // -- CBLAS compatibility layer -- // begin bli_cblas.h #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. // begin cblas.h #ifndef CBLAS_H #define CBLAS_H #include // skipped // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. 
// Exactly one of BLIS_ARCH_64 / BLIS_ARCH_32 is defined; anything not matched
// by the 64-bit predicates below is treated as 32-bit.
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
    defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64)
  #define BLIS_ARCH_64
#else
  #define BLIS_ARCH_32
#endif

// Determine the target operating system.
// The first matching branch wins, so order matters (e.g. Android is tested
// before Linux). Most BLIS_OS_* macros are defined to 1 so that they can be
// used with plain #if; BLIS_OS_EMSCRIPTEN and BLIS_OS_NONE are defined empty
// and therefore can only be tested with #ifdef / defined().
#if defined(BLIS_ENABLE_SYSTEM)
  #if defined(_WIN32) || defined(__CYGWIN__)
    #define BLIS_OS_WINDOWS 1
  #elif defined(__gnu_hurd__)
    #define BLIS_OS_GNU 1
  #elif defined(__APPLE__) || defined(__MACH__)
    #define BLIS_OS_OSX 1
  #elif defined(__ANDROID__)
    #define BLIS_OS_ANDROID 1
  #elif defined(__linux__)
    #define BLIS_OS_LINUX 1
  #elif defined(__bgq__)
    #define BLIS_OS_BGQ 1
  #elif defined(__bg__)
    #define BLIS_OS_BGP 1
  #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
        defined(__bsdi__) || defined(__DragonFly__) || \
        defined(__FreeBSD_kernel__) || defined(__HAIKU__)
    #define BLIS_OS_BSD 1
  #elif defined(EMSCRIPTEN)
    #define BLIS_OS_EMSCRIPTEN
  #else
    #error "Cannot determine operating system"
  #endif
#else // #if defined(BLIS_DISABLE_SYSTEM)
  #define BLIS_OS_NONE
#endif

// A few changes that may be necessary in Windows environments.
#if BLIS_OS_WINDOWS

  // Include Windows header file.
  #define WIN32_LEAN_AND_MEAN
  #define VC_EXTRALEAN
#include // skipped

  #if !defined(__clang__) && !defined(__GNUC__)
    // Undefine attribute specifiers in Windows.
    #define __attribute__(x)

    // Undefine restrict.
    #define restrict
  #endif

#endif

// time.h provides clock_gettime().
#if BLIS_OS_WINDOWS
#include // skipped
#elif BLIS_OS_OSX
#include // skipped
#else
  //#include
#include // skipped
#endif

#endif
// end bli_system.h
// begin bli_config.h

#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H

// The "#if 0" / "#if 1" and "#if 64 == 64" style conditions below are literal
// constants already baked into this header, so each one selects a fixed
// branch.

// Enabled configuration "family" (config_name)
#define BLIS_FAMILY_X86_64_NO_SKX

// Enabled sub-configurations (config_list)
#define BLIS_CONFIG_HASWELL
#define BLIS_CONFIG_SANDYBRIDGE
#define BLIS_CONFIG_PENRYN
#define BLIS_CONFIG_EXCAVATOR
#define BLIS_CONFIG_STEAMROLLER
#define BLIS_CONFIG_PILEDRIVER
#define BLIS_CONFIG_BULLDOZER
#define BLIS_CONFIG_GENERIC

// Enabled kernel sets (kernel_list)
#define BLIS_KERNELS_ZEN
#define BLIS_KERNELS_HASWELL
#define BLIS_KERNELS_SANDYBRIDGE
#define BLIS_KERNELS_PENRYN
#define BLIS_KERNELS_PILEDRIVER
#define BLIS_KERNELS_BULLDOZER
#define BLIS_KERNELS_GENERIC

#if 1
#define BLIS_ENABLE_SYSTEM
#else
#define BLIS_DISABLE_SYSTEM
#endif

#if 0
#define BLIS_ENABLE_OPENMP
#endif

#if 0
#define BLIS_ENABLE_PTHREADS
#endif

#if 1
#define BLIS_ENABLE_JRIR_SLAB
#endif

#if 0
#define BLIS_ENABLE_JRIR_RR
#endif

#if 1
#define BLIS_ENABLE_PBA_POOLS
#else
#define BLIS_DISABLE_PBA_POOLS
#endif

#if 1
#define BLIS_ENABLE_SBA_POOLS
#else
#define BLIS_DISABLE_SBA_POOLS
#endif

#if 0
#define BLIS_ENABLE_MEM_TRACING
#else
#define BLIS_DISABLE_MEM_TRACING
#endif

// BLIS-internal integer size: this configuration selects 64.
#if 64 == 64
#define BLIS_INT_TYPE_SIZE 64
#elif 64 == 32
#define BLIS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// BLAS-layer integer size: this configuration selects 32.
#if 32 == 64
#define BLIS_BLAS_INT_TYPE_SIZE 64
#elif 32 == 32
#define BLIS_BLAS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// The next three groups only apply their default when neither the
// BLIS_ENABLE_* nor the BLIS_DISABLE_* form has been defined beforehand.
#ifndef BLIS_ENABLE_BLAS
#ifndef BLIS_DISABLE_BLAS
#if 0
#define BLIS_ENABLE_BLAS
#else
#define BLIS_DISABLE_BLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_CBLAS
#ifndef BLIS_DISABLE_CBLAS
#if 0
#define BLIS_ENABLE_CBLAS
#else
#define BLIS_DISABLE_CBLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_MIXED_DT
#ifndef BLIS_DISABLE_MIXED_DT
#if 1
#define BLIS_ENABLE_MIXED_DT
#else
#define BLIS_DISABLE_MIXED_DT
#endif
#endif
#endif
// Default applies only when neither the ENABLE nor the DISABLE form has been
// defined beforehand.
#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#if 1
#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#else
#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#endif
#endif
#endif

#if 1
#define BLIS_ENABLE_SUP_HANDLING
#else
#define BLIS_DISABLE_SUP_HANDLING
#endif

#if 0
#define BLIS_ENABLE_MEMKIND
#else
#define BLIS_DISABLE_MEMKIND
#endif

#if 1
#define BLIS_ENABLE_TRSM_PREINVERSION
#else
#define BLIS_DISABLE_TRSM_PREINVERSION
#endif

#if 1
#define BLIS_ENABLE_PRAGMA_OMP_SIMD
#else
#define BLIS_DISABLE_PRAGMA_OMP_SIMD
#endif

#if 0
#define BLIS_ENABLE_SANDBOX
#else
#define BLIS_DISABLE_SANDBOX
#endif

#if 0
#define BLIS_ENABLE_SHARED
#else
#define BLIS_DISABLE_SHARED
#endif

#if 0
#define BLIS_ENABLE_COMPLEX_RETURN_INTEL
#else
#define BLIS_DISABLE_COMPLEX_RETURN_INTEL
#endif

#endif
// end bli_config.h
// begin bli_config_macro_defs.h

#ifndef BLIS_CONFIG_MACRO_DEFS_H
#define BLIS_CONFIG_MACRO_DEFS_H

// -- INTEGER PROPERTIES -------------------------------------------------------

// The bit size of the integer type used to track values such as dimensions,
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
// integers while 64 results in 64-bit integers. Any other value results in use
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
// Fallback only: used when BLIS_INT_TYPE_SIZE has not already been defined.
#ifndef BLIS_INT_TYPE_SIZE
#ifdef BLIS_ARCH_64
#define BLIS_INT_TYPE_SIZE 64
#else
#define BLIS_INT_TYPE_SIZE 32
#endif
#endif

// -- FLOATING-POINT PROPERTIES ------------------------------------------------

// Enable use of built-in C99 "float complex" and "double complex" types and
// associated overloaded operations and functions? Disabling results in
// scomplex and dcomplex being defined in terms of simple structs.
// NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN.
#ifdef BLIS_ENABLE_C99_COMPLEX
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif

// -- MULTITHREADING -----------------------------------------------------------

// Enable multithreading via POSIX threads.
#ifdef BLIS_ENABLE_PTHREADS
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif

// Enable multithreading via OpenMP.
#ifdef BLIS_ENABLE_OPENMP
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif

// Perform a sanity check to make sure the user doesn't try to enable
// both OpenMP and pthreads.
#if defined ( BLIS_ENABLE_OPENMP ) && \
    defined ( BLIS_ENABLE_PTHREADS )
  #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined."
#endif

// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP
// or pthreads are enabled. This macro is useful in situations when
// we want to detect use of either OpenMP or pthreads (as opposed
// to neither being used).
#if defined ( BLIS_ENABLE_OPENMP ) || \
    defined ( BLIS_ENABLE_PTHREADS )
  #define BLIS_ENABLE_MULTITHREADING
#endif

// Enable the use of prime numbers of threads when requesting automatic thread
// factorization. When disabled, requesting a prime number of threads will
// result in a reduction (by one) of the number of threads, provided that the
// prime number exceeds a minimum threshold (see below).
// After this block exactly one state holds: if the ENABLE macro is defined,
// the DISABLE macro is undefined; otherwise the DISABLE macro is defined.
#ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS
  #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#else
  // Default behavior is disabled.
  #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled.
  #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#endif

// Set the maximum requested number of threads that BLIS will accept from the
// user that may be prime. If a larger prime number of threads is requested,
// it will be reduced by one to allow for more efficient thread factorizations.
// This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined.
#ifndef BLIS_NT_MAX_PRIME
  #define BLIS_NT_MAX_PRIME 11
#endif

// -- MIXED DATATYPE SUPPORT ---------------------------------------------------

// Enable mixed datatype support?
// Translates the configure-level BLIS_DISABLE_MIXED_DT switch into the
// BLIS_ENABLE_GEMM_MD macro.
#ifdef BLIS_DISABLE_MIXED_DT
  #undef BLIS_ENABLE_GEMM_MD
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD
#endif

// Enable memory-intensive optimizations for mixed datatype support?
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
  #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#endif

// -- MISCELLANEOUS OPTIONS ----------------------------------------------------

// Do NOT require the cross-blocksize constraints. That is, do not enforce
// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY
// needed when implementing trsm_r by allowing the right-hand matrix B to
// be triangular.
#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
  #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#endif

// -- BLAS COMPATIBILITY LAYER -------------------------------------------------

// Enable the BLAS compatibility layer?
// BLIS_DISABLE_BLAS takes precedence: when it is defined, BLIS_ENABLE_BLAS is
// removed even if it had been defined earlier.
#ifdef BLIS_DISABLE_BLAS
  #undef BLIS_ENABLE_BLAS
#else
  // Default behavior is enabled.
  #undef BLIS_ENABLE_BLAS // In case user explicitly enabled.
  #define BLIS_ENABLE_BLAS
#endif

// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#ifndef BLIS_BLAS_INT_TYPE_SIZE
  #define BLIS_BLAS_INT_TYPE_SIZE 32
#endif

// By default, the level-3 BLAS routines are implemented by directly calling
// the BLIS object API. Alternatively, they may first call the typed BLIS
// API, which will then call the object API.
//#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specifed by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. 
// BLIS_EXPORT expands to nothing unless BLIS_ENABLE_SHARED is defined. With
// shared builds it becomes dllexport/dllimport on Windows and Cygwin (chosen
// by BLIS_IS_BUILDING_LIBRARY), a default-visibility attribute on GCC-compatible
// compilers (version 4 or later), and nothing elsewhere. A definition supplied
// before this point is left untouched.
#ifndef BLIS_EXPORT
  #if !defined(BLIS_ENABLE_SHARED)
    #define BLIS_EXPORT
  #else
    #if defined(_WIN32) || defined(__CYGWIN__)
      #ifdef BLIS_IS_BUILDING_LIBRARY
        #define BLIS_EXPORT __declspec(dllexport)
      #else
        #define BLIS_EXPORT __declspec(dllimport)
      #endif
    #elif defined(__GNUC__) && __GNUC__ >= 4
      #define BLIS_EXPORT __attribute__ ((visibility ("default")))
    #else
      #define BLIS_EXPORT
    #endif
  #endif
#endif

#define BLIS_EXPORT_BLIS BLIS_EXPORT
#define BLIS_EXPORT_BLAS BLIS_EXPORT
#define BLIS_EXPORT_ADDON BLIS_EXPORT

// -- OVERRIDABLE (WEAK) SYMBOLS -----------------------------------------------

// On Linux, functions called from a shared library can be overridden by the main
// program simply by providing a new definition. However, macOS uses a "two-level
// namespace" which causes calls to shared library functions to be tied to the
// library and not overridable. As a workaround, certain symbols can be defined
// as "weak" and are given lower preference during linking.
#ifndef BLIS_OVERRIDABLE
#if BLIS_OS_OSX
#define BLIS_OVERRIDABLE __attribute__((weak))
#else
#define BLIS_OVERRIDABLE
#endif
#endif

// -- STATIC INLINE FUNCTIONS --------------------------------------------------

// C and C++ have different semantics for defining "inline" functions. In C,
// the keyword phrase "static inline" accomplishes this, though the "inline"
// is optional. In C++, the "inline" keyword is required and obviates "static"
// altogether. Why does this matter? While BLIS is compiled in C99, blis.h may
// be #included by a source file that is compiled with C++.
#ifdef __cplusplus
  #define BLIS_INLINE inline
#else
  //#define BLIS_INLINE static inline
  #define BLIS_INLINE static
#endif

#endif
// end bli_config_macro_defs.h
// begin bli_type_defs.h

#ifndef BLIS_TYPE_DEFS_H
#define BLIS_TYPE_DEFS_H

//
// -- BLIS basic types ---------------------------------------------------------
//

#ifdef __cplusplus
  // For C++, include stdint.h.
#include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. // NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. #ifndef TRUE #define TRUE true #endif #ifndef FALSE #define FALSE false #endif // -- Special-purpose integers -- // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. 
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
#define BLIS_SIZEOF_S      4  // sizeof(float)
#define BLIS_SIZEOF_D      8  // sizeof(double)
#define BLIS_SIZEOF_C      8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z      16 // sizeof(dcomplex)

// -- Complex types --

// scomplex/dcomplex are either the C99 built-in complex types or plain
// { real, imag } structs, depending on BLIS_ENABLE_C99_COMPLEX.
#ifdef BLIS_ENABLE_C99_COMPLEX

  #if __STDC_VERSION__ >= 199901L
#include // skipped

    // Typedef official complex types to BLIS complex type names.
    typedef  float complex scomplex;
    typedef double complex dcomplex;
  #else
    #error "Configuration requested C99 complex types, but C99 does not appear to be supported."
  #endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_SCOMPLEX
  #define _DEFINED_SCOMPLEX
  typedef struct scomplex
  {
    float real;
    float imag;
  } scomplex;
  #endif

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_DCOMPLEX
  #define _DEFINED_DCOMPLEX
  typedef struct dcomplex
  {
    double real;
    double imag;
  } dcomplex;
  #endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any BLIS_BLAS_INT_TYPE_SIZE other than 32 or 64 falls back to long int.
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );

//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT is the position of the least-significant bit of a field. Where
// two names share a value (e.g. DATATYPE/DOMAIN, CONJTRANS/TRANS, UPLO/UPPER),
// the narrower field is the low bit of the wider one.

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT               0
#define   BLIS_SCALAR_DOMAIN_SHIFT         0
#define   BLIS_SCALAR_PREC_SHIFT           1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is a field-width constant shifted to the field's offset.
// NOTE(review): the constants are plain int literals, and BLIS_COMP_DT_BITS
// ( 0x7 << 29 ) sets bit 31, which is not representable in a 32-bit signed
// int. Consider unsigned literals if this is ever revisited; confirm against
// all users of the masks first.

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )

//
// -- BLIS enumerated type value definitions -----------------------------------
//

// These BLIS_BITVAL_* constants are the already-shifted field values that the
// enum definitions further below use as their enumerator values.

#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )

//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

typedef enum
{
	BLIS_NO_TRANSPOSE      = 0x0,
	BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
	BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
	BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

typedef enum
{
	BLIS_NO_CONJUGATE      = 0x0,
	BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

typedef enum
{
	BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
	BLIS_LOWER             = BLIS_BITVAL_LOWER,
	BLIS_UPPER             = BLIS_BITVAL_UPPER,
	BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

// side_t is the one operational enum here that is not tied to a bit field:
// BLIS_RIGHT takes the next sequential value after BLIS_LEFT.
typedef enum
{
	BLIS_LEFT              = 0x0,
	BLIS_RIGHT
} side_t;

typedef enum
{
	BLIS_NONUNIT_DIAG      = 0x0,
	BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

typedef enum
{
	BLIS_NO_INVERT_DIAG    = 0x0,
	BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

typedef enum
{
	BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
	BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
	BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
	BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;

// -- Data type --

// BLIS_DT_LO and BLIS_DT_HI alias the first and last floating-point datatypes
// (BLIS_FLOAT and BLIS_DCOMPLEX).
typedef enum
{
	BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
	BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
	BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
	BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
	BLIS_INT               = BLIS_BITVAL_INT_TYPE,
	BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
	BLIS_DT_LO             = BLIS_FLOAT,
	BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

typedef enum
{
	BLIS_REAL              = BLIS_BITVAL_REAL,
	BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

typedef enum
{
	BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
	BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;

// -- Pack schema type --

// BLIS_PACKED_UNSPEC, BLIS_PACKED_VECTOR and BLIS_PACKED_ROWS all evaluate to
// BLIS_PACK_BIT alone, so they are the same value under three names.
typedef enum
{
	BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
	BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
	BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
	BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
	BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
	BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
	BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
	BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
	BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3

// -- Pack order type --

// Both FWD enumerators are 0; only the REV enumerators carry a bit.
typedef enum
{
	BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
	BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,

	BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
	BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;

// -- Pack buffer type --

typedef enum
{
	BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
	BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
	BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
	BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;

// -- Partitioning direction --

typedef enum
{
	BLIS_FWD,
	BLIS_BWD
} dir_t;

// -- Subpartition type --

typedef enum
{
	BLIS_SUBPART0,
	BLIS_SUBPART1,
	BLIS_SUBPART2,
	BLIS_SUBPART1AND0,
	BLIS_SUBPART1AND2,
	BLIS_SUBPART1A,
	BLIS_SUBPART1B,
	BLIS_SUBPART00,
	BLIS_SUBPART10,
	BLIS_SUBPART20,
	BLIS_SUBPART01,
	BLIS_SUBPART11,
	BLIS_SUBPART21,
	BLIS_SUBPART02,
	BLIS_SUBPART12,
	BLIS_SUBPART22
} subpart_t;

// -- Matrix dimension type --

typedef enum
{
	BLIS_M = 0,
	BLIS_N = 1
} mdim_t;

// -- Machine parameter types --

typedef enum
{
	BLIS_MACH_EPS = 0,
	BLIS_MACH_SFMIN,
	BLIS_MACH_BASE,
	BLIS_MACH_PREC,
	BLIS_MACH_NDIGMANT,
	BLIS_MACH_RND,
	BLIS_MACH_EMIN,
	BLIS_MACH_RMIN,
	BLIS_MACH_EMAX,
	BLIS_MACH_RMAX,
	BLIS_MACH_EPS2
} machval_t;

// Must match the number of enumerators in machval_t above (currently 11).
#define BLIS_NUM_MACH_PARAMS   11
#define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2

// -- Induced method types --

// BLIS_IND_FIRST and BLIS_IND_LAST alias the first and last methods
// (BLIS_1M and BLIS_NAT).
typedef enum
{
	BLIS_1M        = 0,
	BLIS_NAT,
	BLIS_IND_FIRST = 0,
	BLIS_IND_LAST  = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
// Lower-case aliases for the ind_t enumerators BLIS_1M and BLIS_NAT.
#define bli_1m BLIS_1M
#define bli_nat BLIS_NAT


// -- Kernel ID types --

// Level-1v kernel IDs. Values are contiguous from 0, so the enumerator count
// matches BLIS_NUM_LEVEL1V_KERS below.
typedef enum
{
    BLIS_ADDV_KER = 0,
    BLIS_AMAXV_KER,
    BLIS_AXPBYV_KER,
    BLIS_AXPYV_KER,
    BLIS_COPYV_KER,
    BLIS_DOTV_KER,
    BLIS_DOTXV_KER,
    BLIS_INVERTV_KER,
    BLIS_SCALV_KER,
    BLIS_SCAL2V_KER,
    BLIS_SETV_KER,
    BLIS_SUBV_KER,
    BLIS_SWAPV_KER,
    BLIS_XPBYV_KER
} l1vkr_t;

// Must equal the number of enumerators in l1vkr_t.
#define BLIS_NUM_LEVEL1V_KERS 14


// Level-1f kernel IDs. Values are contiguous from 0.
typedef enum
{
    BLIS_AXPY2V_KER = 0,
    BLIS_DOTAXPYV_KER,
    BLIS_AXPYF_KER,
    BLIS_DOTXF_KER,
    BLIS_DOTXAXPYF_KER
} l1fkr_t;

// Must equal the number of enumerators in l1fkr_t.
#define BLIS_NUM_LEVEL1F_KERS 5


// Level-1m (packm/unpackm) kernel IDs.
// NOTE: The PACKM and UNPACKM enumerators intentionally share the values
// 0 through 31; in both families the numeric part of the name (e.g. the
// "7" in _7XK_) is also the enumerator's value.
typedef enum
{
    BLIS_PACKM_0XK_KER = 0,
    BLIS_PACKM_1XK_KER = 1,
    BLIS_PACKM_2XK_KER = 2,
    BLIS_PACKM_3XK_KER = 3,
    BLIS_PACKM_4XK_KER = 4,
    BLIS_PACKM_5XK_KER = 5,
    BLIS_PACKM_6XK_KER = 6,
    BLIS_PACKM_7XK_KER = 7,
    BLIS_PACKM_8XK_KER = 8,
    BLIS_PACKM_9XK_KER = 9,
    BLIS_PACKM_10XK_KER = 10,
    BLIS_PACKM_11XK_KER = 11,
    BLIS_PACKM_12XK_KER = 12,
    BLIS_PACKM_13XK_KER = 13,
    BLIS_PACKM_14XK_KER = 14,
    BLIS_PACKM_15XK_KER = 15,
    BLIS_PACKM_16XK_KER = 16,
    BLIS_PACKM_17XK_KER = 17,
    BLIS_PACKM_18XK_KER = 18,
    BLIS_PACKM_19XK_KER = 19,
    BLIS_PACKM_20XK_KER = 20,
    BLIS_PACKM_21XK_KER = 21,
    BLIS_PACKM_22XK_KER = 22,
    BLIS_PACKM_23XK_KER = 23,
    BLIS_PACKM_24XK_KER = 24,
    BLIS_PACKM_25XK_KER = 25,
    BLIS_PACKM_26XK_KER = 26,
    BLIS_PACKM_27XK_KER = 27,
    BLIS_PACKM_28XK_KER = 28,
    BLIS_PACKM_29XK_KER = 29,
    BLIS_PACKM_30XK_KER = 30,
    BLIS_PACKM_31XK_KER = 31,

    BLIS_UNPACKM_0XK_KER = 0,
    BLIS_UNPACKM_1XK_KER = 1,
    BLIS_UNPACKM_2XK_KER = 2,
    BLIS_UNPACKM_3XK_KER = 3,
    BLIS_UNPACKM_4XK_KER = 4,
    BLIS_UNPACKM_5XK_KER = 5,
    BLIS_UNPACKM_6XK_KER = 6,
    BLIS_UNPACKM_7XK_KER = 7,
    BLIS_UNPACKM_8XK_KER = 8,
    BLIS_UNPACKM_9XK_KER = 9,
    BLIS_UNPACKM_10XK_KER = 10,
    BLIS_UNPACKM_11XK_KER = 11,
    BLIS_UNPACKM_12XK_KER = 12,
    BLIS_UNPACKM_13XK_KER = 13,
    BLIS_UNPACKM_14XK_KER = 14,
    BLIS_UNPACKM_15XK_KER = 15,
    BLIS_UNPACKM_16XK_KER = 16,
    BLIS_UNPACKM_17XK_KER = 17,
    BLIS_UNPACKM_18XK_KER = 18,
    BLIS_UNPACKM_19XK_KER = 19,
    BLIS_UNPACKM_20XK_KER = 20,
    BLIS_UNPACKM_21XK_KER = 21,
    BLIS_UNPACKM_22XK_KER = 22,
    BLIS_UNPACKM_23XK_KER = 23,
    BLIS_UNPACKM_24XK_KER = 24,
    BLIS_UNPACKM_25XK_KER = 25,
    BLIS_UNPACKM_26XK_KER = 26,
    BLIS_UNPACKM_27XK_KER = 27,
    BLIS_UNPACKM_28XK_KER = 28,
    BLIS_UNPACKM_29XK_KER = 29,
    BLIS_UNPACKM_30XK_KER = 30,
    BLIS_UNPACKM_31XK_KER = 31
} l1mkr_t;

// One slot per value 0..31 of each family in l1mkr_t.
#define BLIS_NUM_PACKM_KERS 32
#define BLIS_NUM_UNPACKM_KERS 32


// Level-3 microkernel IDs. Values are contiguous from 0.
typedef enum
{
    BLIS_GEMM_UKR = 0,
    BLIS_GEMMTRSM_L_UKR,
    BLIS_GEMMTRSM_U_UKR,
    BLIS_TRSM_L_UKR,
    BLIS_TRSM_U_UKR
} l3ukr_t;

// Must equal the number of enumerators in l3ukr_t.
#define BLIS_NUM_LEVEL3_UKRS 5


// Microkernel implementation kinds. Values are contiguous from 0.
typedef enum
{
    BLIS_REFERENCE_UKERNEL = 0,
    BLIS_VIRTUAL_UKERNEL,
    BLIS_OPTIMIZED_UKERNEL,
    BLIS_NOTAPPLIC_UKERNEL
} kimpl_t;

// Must equal the number of enumerators in kimpl_t.
#define BLIS_NUM_UKR_IMPL_TYPES 4


// Disabled: l3sup_t and BLIS_NUM_LEVEL3_SUP_UKRS are compiled out.
#if 0
typedef enum
{
    // RV = row-stored, contiguous vector-loading
    // RG = row-stored, non-contiguous gather-loading
    // CV = column-stored, contiguous vector-loading
    // CG = column-stored, non-contiguous gather-loading

    // RD = row-stored, dot-based
    // CD = col-stored, dot-based

    // RC = row-stored, column-times-column
    // CR = column-stored, row-times-row

    // GX = general-stored generic implementation

    BLIS_GEMMSUP_RV_UKR = 0,
    BLIS_GEMMSUP_RG_UKR,
    BLIS_GEMMSUP_CV_UKR,
    BLIS_GEMMSUP_CG_UKR,

    BLIS_GEMMSUP_RD_UKR,
    BLIS_GEMMSUP_CD_UKR,

    BLIS_GEMMSUP_RC_UKR,
    BLIS_GEMMSUP_CR_UKR,

    BLIS_GEMMSUP_GX_UKR,
} l3sup_t;

#define BLIS_NUM_LEVEL3_SUP_UKRS 9
#endif


// Storage-combination IDs for three operands. Only the nine R/C/X values
// (0 through 8) are active; the G-containing combinations are compiled out.
typedef enum
{
    // 3-operand storage combinations
    BLIS_RRR = 0,
    BLIS_RRC, // 1
    BLIS_RCR, // 2
    BLIS_RCC, // 3
    BLIS_CRR, // 4
    BLIS_CRC, // 5
    BLIS_CCR, // 6
    BLIS_CCC, // 7
    BLIS_XXX, // 8

#if 0
    BLIS_RRG,
    BLIS_RCG,
    BLIS_RGR,
    BLIS_RGC,
    BLIS_RGG,
    BLIS_CRG,
    BLIS_CCG,
    BLIS_CGR,
    BLIS_CGC,
    BLIS_CGG,
    BLIS_GRR,
    BLIS_GRC,
    BLIS_GRG,
    BLIS_GCR,
    BLIS_GCC,
    BLIS_GCG,
    BLIS_GGR,
    BLIS_GGC,
    BLIS_GGG,
#endif
} stor3_t;

// Must equal the number of active enumerators in stor3_t.
#define BLIS_NUM_3OP_RC_COMBOS 9
//#define BLIS_NUM_3OP_RCG_COMBOS 27


// Disabled: thridx_t is compiled out. BLIS_NUM_LOOPS below matches the six
// enumerators listed here.
#if 0
typedef enum
{
    BLIS_JC_IDX = 0,
    BLIS_PC_IDX,
    BLIS_IC_IDX,
    BLIS_JR_IDX,
    BLIS_IR_IDX,
    BLIS_PR_IDX
} thridx_t;
#endif

#define BLIS_NUM_LOOPS 6


// -- Operation ID type --

typedef enum
{
//
// NOTE: If/when additional type values are added to this enum,
// you must either:
// - keep the level-3 values (starting with _GEMM) beginning at
//   index 0; or
// - if the value range is moved such that it does not begin at
//   index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START
//   value that can be subtracted from the opid_t value to map it
//   to a zero-based range.
// This is needed because these level-3 opid_t values are used in
// bli_l3_ind.c to index into arrays.
//
    BLIS_GEMM = 0,
    BLIS_GEMMT,
    BLIS_HEMM,
    BLIS_HERK,
    BLIS_HER2K,
    BLIS_SYMM,
    BLIS_SYRK,
    BLIS_SYR2K,
    BLIS_TRMM3,
    BLIS_TRMM,
    BLIS_TRSM,

    // Follows the eleven level-3 IDs, so its value is 11
    // (== BLIS_NUM_LEVEL3_OPS).
    BLIS_NOID
} opid_t;

// Counts BLIS_GEMM through BLIS_TRSM; BLIS_NOID is not included.
#define BLIS_NUM_LEVEL3_OPS 11


// -- Blocksize ID type --

typedef enum
{
    // NOTE: the level-3 blocksizes MUST be indexed starting at zero.
    // At one point, we made this assumption in bli_cntx_set_blkszs()
    // and friends.

    BLIS_KR = 0,
    BLIS_MR,
    BLIS_NR,
    BLIS_MC,
    BLIS_KC,
    BLIS_NC,

    BLIS_M2, // level-2 blocksize in m dimension
    BLIS_N2, // level-2 blocksize in n dimension

    BLIS_AF, // level-1f axpyf fusing factor
    BLIS_DF, // level-1f dotxf fusing factor
    BLIS_XF, // level-1f dotxaxpyf fusing factor

    BLIS_NO_PART // used as a placeholder when blocksizes are not applicable.
} bszid_t;

// Counts BLIS_KR through BLIS_XF; BLIS_NO_PART is not included.
#define BLIS_NUM_BLKSZS 11


// -- Threshold ID type --

typedef enum
{
    BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension
    BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension
    BLIS_KT // level-3 small/unpacked matrix threshold in k dimension

} threshid_t;

// Must equal the number of enumerators in threshid_t.
#define BLIS_NUM_THRESH 3


// -- Architecture ID type --

// NOTE: This typedef enum must be kept up-to-date with the arch_t
// string array in bli_arch.c. Whenever values are added/inserted
// OR if values are rearranged, be sure to update the string array
// in bli_arch.c.

typedef enum
{
    // NOTE: The C language standard guarantees that the first enum value
    // starts at 0.

    // Intel
    BLIS_ARCH_SKX,
    BLIS_ARCH_KNL,
    BLIS_ARCH_KNC,
    BLIS_ARCH_HASWELL,
    BLIS_ARCH_SANDYBRIDGE,
    BLIS_ARCH_PENRYN,

    // AMD
    BLIS_ARCH_ZEN3,
    BLIS_ARCH_ZEN2,
    BLIS_ARCH_ZEN,
    BLIS_ARCH_EXCAVATOR,
    BLIS_ARCH_STEAMROLLER,
    BLIS_ARCH_PILEDRIVER,
    BLIS_ARCH_BULLDOZER,

    // ARM
    BLIS_ARCH_ARMSVE,
    BLIS_ARCH_A64FX,
    BLIS_ARCH_FIRESTORM,
    BLIS_ARCH_THUNDERX2,
    BLIS_ARCH_CORTEXA57,
    BLIS_ARCH_CORTEXA53,
    BLIS_ARCH_CORTEXA15,
    BLIS_ARCH_CORTEXA9,

    // IBM/Power
    BLIS_ARCH_POWER10,
    BLIS_ARCH_POWER9,
    BLIS_ARCH_POWER7,
    BLIS_ARCH_BGQ,

    // Generic architecture/configuration
    BLIS_ARCH_GENERIC,

    // The total number of defined architectures. This must be last in the
    // list of enums since its definition assumes that the previous enum
    // value (BLIS_ARCH_GENERIC) is given index num_archs-1.
    BLIS_NUM_ARCHS

} arch_t;


//
// -- BLIS misc. structure types -----------------------------------------------
//

// This header must be included here (or earlier) because definitions it
// provides are needed in the pool_t and related structs.
// begin bli_pthread.h


#ifndef BLIS_PTHREAD_H
#define BLIS_PTHREAD_H

// -- Type and macro definitions -----------------------------------------------

// Exactly one of three branches is selected: BLIS_DISABLE_SYSTEM (dummy
// types), _MSC_VER (Windows API types), or the default (POSIX pthreads).
#if defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of "dummy" code that doesn't depend on POSIX threads or any other
// threading mechanism. See issue #454 to see the use case that prompted this
// feature.
// NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE!
// -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t* cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t* cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//     __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t* barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h


// -- Pool block type --

typedef struct
{
    void* buf;
    siz_t block_size;

} pblk_t;


// -- Pool type --

typedef struct
{
    void* block_ptrs;
    dim_t block_ptrs_len;

    dim_t top_index;
    dim_t num_blocks;

    siz_t block_size;
    siz_t align_size;
    siz_t offset_size;

    // Allocation/deallocation callbacks stored with the pool.
    malloc_ft malloc_fp;
    free_ft free_fp;

} pool_t;


// -- Array type --

typedef struct
{
    void* buf;

    siz_t num_elem;
    siz_t elem_size;

} array_t;


// -- Locked pool-of-arrays-of-pools type --

typedef struct
{
    bli_pthread_mutex_t mutex;
    pool_t pool;

    siz_t def_array_len;

} apool_t;


// -- packing block allocator: Locked set of pools type --

typedef struct pba_s
{
    pool_t pools[3];
    bli_pthread_mutex_t mutex;

    // These fields are used for general-purpose allocation.
    siz_t align_size;
    malloc_ft malloc_fp;
    free_ft free_fp;

} pba_t;


// -- Memory object type --

typedef struct mem_s
{
    pblk_t pblk;
    packbuf_t buf_type;
    pool_t* pool;
    siz_t size;
} mem_t;


// -- Control tree node type --

// Declared as a tagged struct first (and typedef'd afterwards) because the
// node refers to itself through sub_prenode and sub_node.
struct cntl_s
{
    // Basic fields (usually required).
    opid_t family;
    bszid_t bszid;
    void_fp var_func;
    struct cntl_s* sub_prenode;
    struct cntl_s* sub_node;

    // Optional fields (needed only by some operations such as packm).
    // NOTE: first field of params must be a uint64_t containing the size
    // of the struct.
    void* params;

    // Internal fields that track "cached" data.
    mem_t pack_mem;
};
typedef struct cntl_s cntl_t;


// -- Blocksize object type --

// Both arrays hold one entry per floating-point type (BLIS_NUM_FP_TYPES).
typedef struct blksz_s
{
    // Primary blocksize values.
    dim_t v[BLIS_NUM_FP_TYPES];

    // Blocksize extensions.
    dim_t e[BLIS_NUM_FP_TYPES];

} blksz_t;


// -- Function pointer object type --

typedef struct func_s
{
    // Kernel function address.
    void_fp ptr[BLIS_NUM_FP_TYPES];

} func_t;


// -- Multi-boolean object type --

typedef struct mbool_s
{
    bool v[BLIS_NUM_FP_TYPES];

} mbool_t;


// -- Auxiliary kernel info type --

// Note: This struct is used by macro-kernels to package together extra
// parameter values that may be of use to the micro-kernel without
// cluttering up the micro-kernel interface itself.

typedef struct
{
    // The pack schemas of A and B.
    pack_t schema_a;
    pack_t schema_b;

    // Pointers to the micro-panels of A and B which will be used by the
    // next call to the micro-kernel.
    void* a_next;
    void* b_next;

    // The imaginary strides of A and B.
    inc_t is_a;
    inc_t is_b;

    // The panel strides of A and B.
    // NOTE: These are only used in situations where iteration over the
    // micropanels takes place in part within the kernel code (e.g. sup
    // millikernels).
    inc_t ps_a;
    inc_t ps_b;

    // The type to convert to on output.
    //num_t dt_on_output;

    // (Virtual) microkernel address and additional parameters.
    void_fp ukr;
    void* params;

} auxinfo_t;


// -- Global scalar constant data struct --

// Note: This struct is used only when statically initializing the
// global scalar constants in bli_const.c.
typedef struct constdata_s
{
    float s;
    double d;
    scomplex c;
    dcomplex z;
    gint_t i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the user-customizable packing function stored in
// obj_t.pack_fn.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s* a,
      struct obj_s* ap,
      struct cntx_s* cntx,
      struct rntm_s* rntm,
      struct cntl_s* cntl,
      struct thrinfo_s* thread
    );

// Signature of the user-customizable kernel function stored in
// obj_t.ker_fn.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s* a,
      struct obj_s* b,
      struct obj_s* c,
      struct cntx_s* cntx,
      struct rntm_s* rntm,
      struct cntl_s* cntl,
      struct thrinfo_s* thread
    );

// NOTE: Any change to the fields below must be mirrored in the
// BLIS_OBJECT_INITIALIZER* macros and in the field-by-field copy helpers
// (bli_obj_init_full_shallow_copy_of, bli_obj_init_subpart_from) that
// follow this struct.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t off[2];
    dim_t dim[2];
    doff_t diag_off;

    objbits_t info;
    objbits_t info2;
    siz_t elem_size;

    void* buffer;
    inc_t rs;
    inc_t cs;
    inc_t is;

    // Bufferless scalar storage
    atom_t scalar;

    // Pack-related fields
    dim_t m_padded; // m dimension of matrix, including any padding
    dim_t n_padded; // n dimension of matrix, including any padding
    inc_t ps; // panel stride (distance to next panel)
    inc_t pd; // panel dimension (the "width" of a panel:
              // usually MR or NR)
    dim_t m_panel; // m dimension of a "full" panel
    dim_t n_panel; // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void* pack_params;
    obj_ker_fn_t ker_fn;
    void* ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)
//
// The two initializers below are identical except for .dim, which is
// { 0, 0 } in BLIS_OBJECT_INITIALIZER and { 1, 1 } in
// BLIS_OBJECT_INITIALIZER_1X1.

#define BLIS_OBJECT_INITIALIZER \
{ \
    .root = NULL, \
\
    .off = { 0, 0 }, \
    .dim = { 0, 0 }, \
    .diag_off = 0, \
\
    .info = 0x0 | BLIS_BITVAL_DENSE | \
            BLIS_BITVAL_GENERAL, \
    .info2 = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer = NULL, \
    .rs = 0, \
    .cs = 0, \
    .is = 1, \
\
    .scalar = { 0.0, 0.0 }, \
\
    .m_padded = 0, \
    .n_padded = 0, \
    .ps = 0, \
    .pd = 0, \
    .m_panel = 0, \
    .n_panel = 0, \
\
    .pack_fn = NULL, \
    .pack_params = NULL, \
    .ker_fn = NULL, \
    .ker_params = NULL \
}

#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root = NULL, \
\
    .off = { 0, 0 }, \
    .dim = { 1, 1 }, \
    .diag_off = 0, \
\
    .info = 0x0 | BLIS_BITVAL_DENSE | \
            BLIS_BITVAL_GENERAL, \
    .info2 = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer = NULL, \
    .rs = 0, \
    .cs = 0, \
    .is = 1, \
\
    .scalar = { 0.0, 0.0 }, \
\
    .m_padded = 0, \
    .n_padded = 0, \
    .ps = 0, \
    .pd = 0, \
    .m_panel = 0, \
    .n_panel = 0, \
\
    .pack_fn = NULL, \
    .pack_params = NULL, \
    .ker_fn = NULL, \
    .ker_params = NULL \
}

// Define these inline functions here since they must be updated if the
// contents of obj_t change.
// Make b a complete shallow copy of a: every field of obj_t is duplicated,
// and pointer members (root, buffer, pack_params, ker_params) are copied as
// pointers, so both objects refer to the same underlying storage.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    // Whole-struct assignment copies exactly the set of members that obj_t
    // declares, which is the same set a member-by-member copy would cover.
    *b = *a;
}

// Initialize b as the starting point for a subpartition of a: every field
// of a is copied into b EXCEPT dim[0] and dim[1], which are left untouched
// because the caller overwrites them with the subpartition's dimensions.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    const obj_t* src = a;
    obj_t*       dst = b;
    int          i;

    // Identity and position. The offsets are inherited; dst->dim[] is
    // deliberately skipped (see the function comment).
    dst->root = src->root;
    for ( i = 0; i < 2; ++i )
        dst->off[i] = src->off[i];
    dst->diag_off = src->diag_off;

    // Property bitfields and element size.
    dst->info      = src->info;
    dst->info2     = src->info2;
    dst->elem_size = src->elem_size;

    // Buffer address and strides.
    dst->buffer = src->buffer;
    dst->rs     = src->rs;
    dst->cs     = src->cs;
    dst->is     = src->is;

    // Attached (bufferless) scalar.
    dst->scalar = src->scalar;

    // NOTE: obj_t has no pack_mem member, so there is no packing-memory
    // entry to propagate here.

    // Pack-related geometry.
    dst->m_padded = src->m_padded;
    dst->n_padded = src->n_padded;
    dst->ps       = src->ps;
    dst->pd       = src->pd;
    dst->m_panel  = src->m_panel;
    dst->n_panel  = src->n_panel;

    // User-customizable hooks and their parameters.
    dst->pack_fn     = src->pack_fn;
    dst->pack_params = src->pack_params;
    dst->ker_fn      = src->ker_fn;
    dst->ker_params  = src->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/include/darwin-x86_64_no_zen2/000077500000000000000000000000001427272030600231335ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/darwin-x86_64_no_zen2/blis.h000066400000000000000000046705641427272030600242630ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). 
// begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_X86_64_NO_ZEN2 // Enabled sub-configurations (config_list) #define BLIS_CONFIG_SKX #define BLIS_CONFIG_HASWELL #define BLIS_CONFIG_SANDYBRIDGE #define BLIS_CONFIG_PENRYN #define BLIS_CONFIG_ZEN #define BLIS_CONFIG_EXCAVATOR #define BLIS_CONFIG_STEAMROLLER #define BLIS_CONFIG_PILEDRIVER #define BLIS_CONFIG_BULLDOZER #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_SKX #define BLIS_KERNELS_SANDYBRIDGE #define BLIS_KERNELS_PENRYN #define BLIS_KERNELS_HASWELL #define BLIS_KERNELS_ZEN #define BLIS_KERNELS_PILEDRIVER #define BLIS_KERNELS_BULLDOZER #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef 
BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. 
#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). #if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_lang_defs.h #ifndef BLIS_LANG_DEFS_H #define BLIS_LANG_DEFS_H // -- Undefine restrict for C++ and C89/90 -- #ifdef __cplusplus // Language is C++; define restrict as nothing. #ifndef restrict #define restrict #endif #elif __STDC_VERSION__ >= 199901L // Language is C99 (or later); do nothing since restrict is recognized. #else // Language is pre-C99; define restrict as nothing. 
#ifndef restrict #define restrict #endif #endif // -- Define typeof() operator if using non-GNU compiler -- #ifndef __GNUC__ #define typeof __typeof__ #else #ifndef typeof #define typeof __typeof__ #endif #endif // -- BLIS Thread Local Storage Keyword -- // __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support __thread, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) #define BLIS_THREAD_LOCAL __thread #else #define BLIS_THREAD_LOCAL #endif // -- BLIS constructor/destructor function attribute -- // __attribute__((constructor/destructor)) is supported by GCC only. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support this, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__ICC) || defined(__INTEL_COMPILER) // ICC defines __GNUC__ but doesn't support this #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #elif defined(__clang__) // CLANG supports __attribute__, but its documentation doesn't // mention support for constructor/destructor. Compiling with // clang and testing shows that it does support. 
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. 
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. 
Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specified by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. #ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overridden by the main // program simply by providing a new definition.
However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // -- Common BLIS definitions -- // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// The BLAS integer size is carried by BLIS_BLAS_INT_TYPE_SIZE (the same macro
// that selects f77_int). The guard must test exactly that name: an identifier
// that is not a defined macro evaluates to 0 inside #if, which would silently
// disable the promotion below.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
  #undef  BLIS_INT_TYPE_SIZE
  #define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested.
//   gint_t  - signed general-purpose integer
//   guint_t - unsigned counterpart of gint_t
// Any BLIS_INT_TYPE_SIZE other than 32 or 64 falls back to "long int".
#if BLIS_INT_TYPE_SIZE == 32
typedef int32_t           gint_t;
typedef uint32_t          guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
typedef int64_t           gint_t;
typedef uint64_t          guint_t;
#else
typedef signed long int   gint_t;
typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h.
// The #ifndef guards leave any TRUE/FALSE already defined by the including
// translation unit untouched.
#ifndef TRUE
  #define TRUE  true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
// NOTE: BLIS_MAX_TYPE_SIZE expands to a sizeof() expression, so it is usable
// in ordinary C expressions but not inside a preprocessor #if.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
#define BLIS_SIZEOF_S 4  // sizeof(float)
#define BLIS_SIZEOF_D 8  // sizeof(double)
#define BLIS_SIZEOF_C 8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z 16 // sizeof(dcomplex)

// -- Complex types --

#ifdef BLIS_ENABLE_C99_COMPLEX

  #if __STDC_VERSION__ >= 199901L
    // The "complex" type specifier used below is provided by <complex.h>.
    #include <complex.h> // skipped

    // Typedef official complex types to BLIS complex type names.
    typedef  float complex scomplex;
    typedef double complex dcomplex;
  #else
    #error "Configuration requested C99 complex types, but C99 does not appear to be supported."
  #endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_SCOMPLEX
  #define _DEFINED_SCOMPLEX
  typedef struct scomplex
  {
      float real;
      float imag;
  } scomplex;
  #endif

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_DCOMPLEX
  #define _DEFINED_DCOMPLEX
  typedef struct dcomplex
  {
      double real;
      double imag;
  } dcomplex;
  #endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any BLIS_BLAS_INT_TYPE_SIZE other than 32 or 64 falls back to "long int".
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
// (The true function-pointer form is kept commented out; void_fp is currently
// a plain void*.)

//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );

//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT is the bit offset of a field. Several names share an offset on
// purpose: a multi-bit field and the single-bit subfield in its lowest
// position start at the same bit (e.g. DATATYPE and DOMAIN are both 0).

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT               0
#define   BLIS_SCALAR_DOMAIN_SHIFT         0
#define   BLIS_SCALAR_PREC_SHIFT           1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is a field-width constant shifted to the field's offset.

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
// This three-bit field starts at offset 29 and therefore reaches bit 31.
// A signed int constant cannot represent that value (left-shifting into the
// sign bit is undefined behavior in C), so the constant is unsigned; the
// resulting mask is 0xE0000000.
#define BLIS_COMP_DT_BITS                  ( 0x7u << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )

//
// -- BLIS enumerated type value definitions -----------------------------------
//

// The BITVAL constants are written in terms of the masks above, i.e. they are
// already shifted into their field position.

#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )

//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

typedef enum
{
    BLIS_NO_TRANSPOSE      = 0x0,
    BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
    BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
    BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

typedef enum
{
    BLIS_NO_CONJUGATE      = 0x0,
    BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

typedef enum
{
    BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
    BLIS_LOWER             = BLIS_BITVAL_LOWER,
    BLIS_UPPER             = BLIS_BITVAL_UPPER,
    BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

typedef enum
{
    BLIS_LEFT              = 0x0,
    BLIS_RIGHT
} side_t;

typedef enum
{
    BLIS_NONUNIT_DIAG      = 0x0,
    BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

typedef enum
{
    BLIS_NO_INVERT_DIAG    = 0x0,
    BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

typedef enum
{
    BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
    BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
    BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
    BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;

// -- Data type --

// BLIS_DT_LO and BLIS_DT_HI alias the first and last floating-point
// datatypes (BLIS_FLOAT and BLIS_DCOMPLEX).
typedef enum
{
    BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
    BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
    BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
    BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
    BLIS_INT               = BLIS_BITVAL_INT_TYPE,
    BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
    BLIS_DT_LO             = BLIS_FLOAT,
    BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

typedef enum
{
    BLIS_REAL              = BLIS_BITVAL_REAL,
    BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

typedef enum
{
    BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
    BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;

// -- Pack schema type --

// Note that BLIS_PACKED_UNSPEC, BLIS_PACKED_VECTOR and BLIS_PACKED_ROWS all
// evaluate to the same value ( BLIS_PACK_BIT ).
typedef enum
{
    BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
    BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
    BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
    BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
    BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
    BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
    BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
    BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
    BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
    BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
    BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3

// -- Pack order type --

typedef enum
{
    BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
    BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,

    BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
    BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;

// -- Pack buffer type --

typedef enum
{
    BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
    BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
    BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
    BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;

// -- Partitioning direction --

typedef enum
{
    BLIS_FWD,
    BLIS_BWD
} dir_t;

// -- Subpartition type --

typedef enum
{
    BLIS_SUBPART0,
    BLIS_SUBPART1,
    BLIS_SUBPART2,
    BLIS_SUBPART1AND0,
    BLIS_SUBPART1AND2,
    BLIS_SUBPART1A,
    BLIS_SUBPART1B,
    BLIS_SUBPART00,
    BLIS_SUBPART10,
    BLIS_SUBPART20,
    BLIS_SUBPART01,
    BLIS_SUBPART11,
    BLIS_SUBPART21,
    BLIS_SUBPART02,
    BLIS_SUBPART12,
    BLIS_SUBPART22
} subpart_t;

// -- Matrix dimension type --

typedef enum
{
    BLIS_M = 0,
    BLIS_N = 1
} mdim_t;

// -- Machine parameter types --

typedef enum
{
    BLIS_MACH_EPS = 0,
    BLIS_MACH_SFMIN,
    BLIS_MACH_BASE,
    BLIS_MACH_PREC,
    BLIS_MACH_NDIGMANT,
    BLIS_MACH_RND,
    BLIS_MACH_EMIN,
    BLIS_MACH_RMIN,
    BLIS_MACH_EMAX,
    BLIS_MACH_RMAX,
    BLIS_MACH_EPS2
} machval_t;

// Number of enumerators in machval_t, plus aliases for its first and last.
#define BLIS_NUM_MACH_PARAMS   11
#define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2

// -- Induced method types --

typedef enum
{
    BLIS_1M        = 0,
    BLIS_NAT,
    BLIS_IND_FIRST = 0,
    BLIS_IND_LAST  = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_1m   BLIS_1M
#define bli_nat  BLIS_NAT

// -- Kernel ID types --

typedef enum
{
    BLIS_ADDV_KER  = 0,
    BLIS_AMAXV_KER,
    BLIS_AXPBYV_KER,
    BLIS_AXPYV_KER,
    BLIS_COPYV_KER,
    BLIS_DOTV_KER,
    BLIS_DOTXV_KER,
    BLIS_INVERTV_KER,
    BLIS_SCALV_KER,
    BLIS_SCAL2V_KER,
    BLIS_SETV_KER,
    BLIS_SUBV_KER,
    BLIS_SWAPV_KER,
    BLIS_XPBYV_KER
} l1vkr_t;

#define BLIS_NUM_LEVEL1V_KERS 14

typedef enum
{
    BLIS_AXPY2V_KER = 0,
    BLIS_DOTAXPYV_KER,
    BLIS_AXPYF_KER,
    BLIS_DOTXF_KER,
    BLIS_DOTXAXPYF_KER
} l1fkr_t;

#define BLIS_NUM_LEVEL1F_KERS 5

// Note that the PACKM and UNPACKM enumerators share the values 0..31.
typedef enum
{
    BLIS_PACKM_0XK_KER  = 0,
    BLIS_PACKM_1XK_KER  = 1,
    BLIS_PACKM_2XK_KER  = 2,
    BLIS_PACKM_3XK_KER  = 3,
    BLIS_PACKM_4XK_KER  = 4,
    BLIS_PACKM_5XK_KER  = 5,
    BLIS_PACKM_6XK_KER  = 6,
    BLIS_PACKM_7XK_KER  = 7,
    BLIS_PACKM_8XK_KER  = 8,
    BLIS_PACKM_9XK_KER  = 9,
    BLIS_PACKM_10XK_KER = 10,
    BLIS_PACKM_11XK_KER = 11,
    BLIS_PACKM_12XK_KER = 12,
    BLIS_PACKM_13XK_KER = 13,
    BLIS_PACKM_14XK_KER = 14,
    BLIS_PACKM_15XK_KER = 15,
    BLIS_PACKM_16XK_KER = 16,
    BLIS_PACKM_17XK_KER = 17,
    BLIS_PACKM_18XK_KER = 18,
    BLIS_PACKM_19XK_KER = 19,
    BLIS_PACKM_20XK_KER = 20,
    BLIS_PACKM_21XK_KER = 21,
    BLIS_PACKM_22XK_KER = 22,
    BLIS_PACKM_23XK_KER = 23,
    BLIS_PACKM_24XK_KER = 24,
    BLIS_PACKM_25XK_KER = 25,
    BLIS_PACKM_26XK_KER = 26,
    BLIS_PACKM_27XK_KER = 27,
    BLIS_PACKM_28XK_KER = 28,
    BLIS_PACKM_29XK_KER = 29,
    BLIS_PACKM_30XK_KER = 30,
    BLIS_PACKM_31XK_KER = 31,

    BLIS_UNPACKM_0XK_KER  = 0,
    BLIS_UNPACKM_1XK_KER  = 1,
    BLIS_UNPACKM_2XK_KER  = 2,
    BLIS_UNPACKM_3XK_KER  = 3,
    BLIS_UNPACKM_4XK_KER  = 4,
    BLIS_UNPACKM_5XK_KER  = 5,
    BLIS_UNPACKM_6XK_KER  = 6,
    BLIS_UNPACKM_7XK_KER  = 7,
    BLIS_UNPACKM_8XK_KER  = 8,
    BLIS_UNPACKM_9XK_KER  = 9,
    BLIS_UNPACKM_10XK_KER = 10,
    BLIS_UNPACKM_11XK_KER = 11,
    BLIS_UNPACKM_12XK_KER = 12,
    BLIS_UNPACKM_13XK_KER = 13,
    BLIS_UNPACKM_14XK_KER = 14,
    BLIS_UNPACKM_15XK_KER = 15,
    BLIS_UNPACKM_16XK_KER = 16,
    BLIS_UNPACKM_17XK_KER = 17,
    BLIS_UNPACKM_18XK_KER = 18,
    BLIS_UNPACKM_19XK_KER = 19,
    BLIS_UNPACKM_20XK_KER = 20,
    BLIS_UNPACKM_21XK_KER = 21,
    BLIS_UNPACKM_22XK_KER = 22,
    BLIS_UNPACKM_23XK_KER = 23,
    BLIS_UNPACKM_24XK_KER = 24,
    BLIS_UNPACKM_25XK_KER = 25,
    BLIS_UNPACKM_26XK_KER = 26,
    BLIS_UNPACKM_27XK_KER = 27,
    BLIS_UNPACKM_28XK_KER = 28,
    BLIS_UNPACKM_29XK_KER = 29,
    BLIS_UNPACKM_30XK_KER = 30,
    BLIS_UNPACKM_31XK_KER = 31
} l1mkr_t;

#define BLIS_NUM_PACKM_KERS   32
#define BLIS_NUM_UNPACKM_KERS 32

typedef enum
{
    BLIS_GEMM_UKR = 0,
    BLIS_GEMMTRSM_L_UKR,
    BLIS_GEMMTRSM_U_UKR,
    BLIS_TRSM_L_UKR,
    BLIS_TRSM_U_UKR
} l3ukr_t;

#define BLIS_NUM_LEVEL3_UKRS 5

typedef enum
{
    BLIS_REFERENCE_UKERNEL = 0,
    BLIS_VIRTUAL_UKERNEL,
    BLIS_OPTIMIZED_UKERNEL,
    BLIS_NOTAPPLIC_UKERNEL
} kimpl_t;

#define BLIS_NUM_UKR_IMPL_TYPES 4

#if 0
typedef enum
{
    // RV = row-stored, contiguous vector-loading
    // RG = row-stored, non-contiguous gather-loading
    // CV = column-stored, contiguous vector-loading
    // CG = column-stored, non-contiguous gather-loading

    // RD = row-stored, dot-based
    // CD = col-stored, dot-based

    // RC = row-stored, column-times-column
    // CR = column-stored, row-times-row

    // GX = general-stored generic implementation

    BLIS_GEMMSUP_RV_UKR = 0,
    BLIS_GEMMSUP_RG_UKR,
    BLIS_GEMMSUP_CV_UKR,
    BLIS_GEMMSUP_CG_UKR,

    BLIS_GEMMSUP_RD_UKR,
    BLIS_GEMMSUP_CD_UKR,

    BLIS_GEMMSUP_RC_UKR,
    BLIS_GEMMSUP_CR_UKR,

    BLIS_GEMMSUP_GX_UKR,
} l3sup_t;

#define BLIS_NUM_LEVEL3_SUP_UKRS 9
#endif

typedef enum
{
    // 3-operand storage combinations
    BLIS_RRR = 0,
    BLIS_RRC, // 1
    BLIS_RCR, // 2
    BLIS_RCC, // 3
    BLIS_CRR, // 4
    BLIS_CRC, // 5
    BLIS_CCR, // 6
    BLIS_CCC, // 7
    BLIS_XXX, // 8

#if 0
    BLIS_RRG,
    BLIS_RCG,
    BLIS_RGR,
    BLIS_RGC,
    BLIS_RGG,
    BLIS_CRG,
    BLIS_CCG,
    BLIS_CGR,
    BLIS_CGC,
    BLIS_CGG,
    BLIS_GRR,
    BLIS_GRC,
    BLIS_GRG,
    BLIS_GCR,
    BLIS_GCC,
    BLIS_GCG,
    BLIS_GGR,
    BLIS_GGC,
    BLIS_GGG,
#endif
} stor3_t;

#define BLIS_NUM_3OP_RC_COMBOS 9
//#define BLIS_NUM_3OP_RCG_COMBOS 27

#if 0
typedef enum
{
    BLIS_JC_IDX = 0,
    BLIS_PC_IDX,
    BLIS_IC_IDX,
    BLIS_JR_IDX,
    BLIS_IR_IDX,
    BLIS_PR_IDX
} thridx_t;
#endif

// Matches the six indices listed in the disabled thridx_t enum above.
#define BLIS_NUM_LOOPS 6

// -- Operation ID type --

typedef enum
{
    //
    // NOTE: If/when additional type values are added to this enum,
    // you must either:
    // - keep the level-3 values (starting with _GEMM) beginning at
    //   index 0; or
    // - if the value range is moved such that it does not begin at
    //   index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START
    //   value that can be subtracted from the opid_t value to map it
    //   to a zero-based range.
    // This is needed because these level-3 opid_t values are used in
    // bli_l3_ind.c to index into arrays.
    //
    BLIS_GEMM = 0,
    BLIS_GEMMT,
    BLIS_HEMM,
    BLIS_HERK,
    BLIS_HER2K,
    BLIS_SYMM,
    BLIS_SYRK,
    BLIS_SYR2K,
    BLIS_TRMM3,
    BLIS_TRMM,
    BLIS_TRSM,

    BLIS_NOID
} opid_t;

// Counts the enumerators before BLIS_NOID (BLIS_NOID itself evaluates to 11).
#define BLIS_NUM_LEVEL3_OPS 11

// -- Blocksize ID type --

typedef enum
{
    // NOTE: the level-3 blocksizes MUST be indexed starting at zero.
    // At one point, we made this assumption in bli_cntx_set_blkszs()
    // and friends.

    BLIS_KR = 0,
    BLIS_MR,
    BLIS_NR,
    BLIS_MC,
    BLIS_KC,
    BLIS_NC,

    BLIS_M2, // level-2 blocksize in m dimension
    BLIS_N2, // level-2 blocksize in n dimension

    BLIS_AF, // level-1f axpyf fusing factor
    BLIS_DF, // level-1f dotxf fusing factor
    BLIS_XF, // level-1f dotxaxpyf fusing factor

    BLIS_NO_PART  // used as a placeholder when blocksizes are not applicable.
} bszid_t;

// Counts the enumerators before BLIS_NO_PART (which itself evaluates to 11).
#define BLIS_NUM_BLKSZS 11

// -- Threshold ID type --

typedef enum
{
    BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension
    BLIS_NT,     // level-3 small/unpacked matrix threshold in n dimension
    BLIS_KT      // level-3 small/unpacked matrix threshold in k dimension

} threshid_t;

#define BLIS_NUM_THRESH 3

// -- Architecture ID type --

// NOTE: This typedef enum must be kept up-to-date with the arch_t
// string array in bli_arch.c. Whenever values are added/inserted
// OR if values are rearranged, be sure to update the string array
// in bli_arch.c.

typedef enum
{
    // NOTE: The C language standard guarantees that the first enum value
    // starts at 0.

    // Intel
    BLIS_ARCH_SKX,
    BLIS_ARCH_KNL,
    BLIS_ARCH_KNC,
    BLIS_ARCH_HASWELL,
    BLIS_ARCH_SANDYBRIDGE,
    BLIS_ARCH_PENRYN,

    // AMD
    BLIS_ARCH_ZEN3,
    BLIS_ARCH_ZEN2,
    BLIS_ARCH_ZEN,
    BLIS_ARCH_EXCAVATOR,
    BLIS_ARCH_STEAMROLLER,
    BLIS_ARCH_PILEDRIVER,
    BLIS_ARCH_BULLDOZER,

    // ARM
    BLIS_ARCH_ARMSVE,
    BLIS_ARCH_A64FX,
    BLIS_ARCH_FIRESTORM,
    BLIS_ARCH_THUNDERX2,
    BLIS_ARCH_CORTEXA57,
    BLIS_ARCH_CORTEXA53,
    BLIS_ARCH_CORTEXA15,
    BLIS_ARCH_CORTEXA9,

    // IBM/Power
    BLIS_ARCH_POWER10,
    BLIS_ARCH_POWER9,
    BLIS_ARCH_POWER7,
    BLIS_ARCH_BGQ,

    // Generic architecture/configuration
    BLIS_ARCH_GENERIC,

    // The total number of defined architectures. This must be last in the
    // list of enums since its definition assumes that the previous enum
    // value (BLIS_ARCH_GENERIC) is given index num_archs-1.
    BLIS_NUM_ARCHS

} arch_t;

//
// -- BLIS misc. structure types -----------------------------------------------
//

// This header must be included here (or earlier) because definitions it
// provides are needed in the pool_t and related structs.
// begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. 
mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. //num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. 
typedef struct constdata_s { float s; double d; scomplex c; dcomplex z; gint_t i; } constdata_t; // // -- BLIS object type definitions --------------------------------------------- // // Forward declarations for function pointer types struct obj_s; struct cntx_s; struct rntm_s; struct thrinfo_s; typedef void (*obj_pack_fn_t) ( struct obj_s* a, struct obj_s* ap, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) ( struct obj_s* a, struct obj_s* b, struct obj_s* c, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef struct obj_s { // Basic fields struct obj_s* root; dim_t off[2]; dim_t dim[2]; doff_t diag_off; objbits_t info; objbits_t info2; siz_t elem_size; void* buffer; inc_t rs; inc_t cs; inc_t is; // Bufferless scalar storage atom_t scalar; // Pack-related fields dim_t m_padded; // m dimension of matrix, including any padding dim_t n_padded; // n dimension of matrix, including any padding inc_t ps; // panel stride (distance to next panel) inc_t pd; // panel dimension (the "width" of a panel: // usually MR or NR) dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel // User-customizable fields obj_pack_fn_t pack_fn; void* pack_params; obj_ker_fn_t ker_fn; void* ker_params; } obj_t; // Pre-initializors. 
Things that must be set afterwards: // - root object pointer // - info bitfields: dt, target_dt, exec_dt, comp_dt // - info2 bitfields: scalar_dt // - elem_size // - dims, strides // - buffer // - internal scalar buffer (must always set imaginary component) #define BLIS_OBJECT_INITIALIZER \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 0, 0 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( float ), \ \ .buffer = NULL, \ .rs = 0, \ .cs = 0, \ .is = 1, \ \ .scalar = { 0.0, 0.0 }, \ \ .m_padded = 0, \ .n_padded = 0, \ .ps = 0, \ .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ \ .pack_fn = NULL, \ .pack_params = NULL, \ .ker_fn = NULL, \ .ker_params = NULL \ } #define BLIS_OBJECT_INITIALIZER_1X1 \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( float ), \ \ .buffer = NULL, \ .rs = 0, \ .cs = 0, \ .is = 1, \ \ .scalar = { 0.0, 0.0 }, \ \ .m_padded = 0, \ .n_padded = 0, \ .ps = 0, \ .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ \ .pack_fn = NULL, \ .pack_params = NULL, \ .ker_fn = NULL, \ .ker_params = NULL \ } // Define these macros here since they must be updated if contents of // obj_t changes. 
// Make b a shallow copy of a: every member of obj_t is duplicated, and
// pointer members (root, buffer, pack_params, ker_params) are copied as
// pointers rather than having their targets cloned.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    // A structure assignment transfers all members of obj_t in one step.
    *b = *a;
}

// Initialize b from a in preparation for turning b into a sub-partition of
// a: all members are copied except dim[], which is left exactly as the
// caller had it.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    int i;

    // Identity and position within the root object.
    b->root = a->root;
    for ( i = 0; i < 2; ++i ) b->off[ i ] = a->off[ i ];
    b->diag_off = a->diag_off;

    // NOTE: b->dim[0] and b->dim[1] are intentionally not written here;
    // the m and n dimensions will be overwritten afterwards anyway.

    // Property bits and element size.
    b->info      = a->info;
    b->info2     = a->info2;
    b->elem_size = a->elem_size;

    // Buffer address and strides.
    b->buffer = a->buffer;
    b->rs     = a->rs;
    b->cs     = a->cs;
    b->is     = a->is;

    // Attached scalar.
    b->scalar = a->scalar;

    // Packing geometry.
    b->m_padded = a->m_padded;
    b->n_padded = a->n_padded;
    b->m_panel  = a->m_panel;
    b->n_panel  = a->n_panel;
    b->ps       = a->ps;
    b->pd       = a->pd;

    // User-customizable hooks and their parameter blocks.
    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// Each public macro forwards to an underscore-suffixed worker that performs
// the actual ## pasting. The forwarding step lets arguments that are
// themselves macros be expanded before they are pasted.

// bli_-prefixed names: bli_ <type chars...> <op>
#define PASTEMAC0_(op)                    bli_ ## op
#define PASTEMAC0(op)                     PASTEMAC0_(op)

#define PASTEMAC_(a,op)                   bli_ ## a ## op
#define PASTEMAC(a,op)                    PASTEMAC_(a,op)

#define PASTEMAC2_(a,b,op)                bli_ ## a ## b ## op
#define PASTEMAC2(a,b,op)                 PASTEMAC2_(a,b,op)

#define PASTEMAC3_(a,b,c,op)              bli_ ## a ## b ## c ## op
#define PASTEMAC3(a,b,c,op)               PASTEMAC3_(a,b,c,op)

#define PASTEMAC4_(a,b,c,d,op)            bli_ ## a ## b ## c ## d ## op
#define PASTEMAC4(a,b,c,d,op)             PASTEMAC4_(a,b,c,d,op)

#define PASTEMAC5_(a,b,c,d,e,op)          bli_ ## a ## b ## c ## d ## e ## op
#define PASTEMAC5(a,b,c,d,e,op)           PASTEMAC5_(a,b,c,d,e,op)

#define PASTEMAC6_(a,b,c,d,e,f,op)        bli_ ## a ## b ## c ## d ## e ## f ## op
#define PASTEMAC6(a,b,c,d,e,f,op)         PASTEMAC6_(a,b,c,d,e,f,op)

// bla_<op>_check names.
#define PASTEBLACHK_(op)                  bla_ ## op ## _check
#define PASTEBLACHK(op)                   PASTEBLACHK_(op)

// Unprefixed names: <chars...> <op>
#define PASTECH0_(op)                     op
#define PASTECH0(op)                      PASTECH0_(op)

#define PASTECH_(a,op)                    a ## op
#define PASTECH(a,op)                     PASTECH_(a,op)

#define PASTECH2_(a,b,op)                 a ## b ## op
#define PASTECH2(a,b,op)                  PASTECH2_(a,b,op)

#define PASTECH3_(a,b,c,op)               a ## b ## c ## op
#define PASTECH3(a,b,c,op)                PASTECH3_(a,b,c,op)

// MKSTR stringifies its argument exactly as written; STRINGIFY_INT expands
// the argument first and then stringifies the result.
#define MKSTR(tok)                        #tok
#define STRINGIFY_INT( s )                MKSTR( s )

// Fortran-77 name-mangling macros.
#define PASTEF770(name) name ## _ #define PASTEF77(ch1,name) ch1 ## name ## _ #define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ #define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ // -- Include other groups of macros // begin bli_genarray_macro_defs.h #ifndef BLIS_GENARRAY_MACRO_DEFS_H #define BLIS_GENARRAY_MACRO_DEFS_H // -- Macros to generate function arrays --------------------------------------- // -- "Smart" one-operand macro -- #define GENARRAY_FPA(tname,opname) \ \ static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \ { \ ( tname )PASTEMAC(s,opname), \ ( tname )PASTEMAC(c,opname), \ ( tname )PASTEMAC(d,opname), \ ( tname )PASTEMAC(z,opname) \ } // -- "Smart" one-operand macro (with integer support) -- #define GENARRAY_FPA_I(tname,opname) \ \ static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \ { \ ( tname )PASTEMAC(s,opname), \ ( tname )PASTEMAC(c,opname), \ ( tname )PASTEMAC(d,opname), \ ( tname )PASTEMAC(z,opname), \ ( tname )PASTEMAC(i,opname) \ } // -- "Smart" two-operand macro -- #define GENARRAY_FPA2(tname,op) \ \ static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \ { ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \ { ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \ { ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) } \ } // -- "Smart" two-operand macro -- // -- One-operand macro -- #define GENARRAY(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES] = \ { \ PASTEMAC(s,op), \ PASTEMAC(c,op), \ PASTEMAC(d,op), \ PASTEMAC(z,op) \ } #define GENARRAY_I(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES+1] = \ { \ PASTEMAC(s,op), \ PASTEMAC(c,op), \ PASTEMAC(d,op), \ PASTEMAC(z,op), \ PASTEMAC(i,op) \ } // 
-- One-operand macro (with custom prefix) -- #define GENARRAY_PREF(arrayname,prefix,op) \ \ arrayname[BLIS_NUM_FP_TYPES] = \ { \ PASTECH2(prefix,s,op), \ PASTECH2(prefix,c,op), \ PASTECH2(prefix,d,op), \ PASTECH2(prefix,z,op) \ } // -- Two-operand macros -- #define GENARRAY2_ALL(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \ { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \ { PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \ { PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) } \ } #define GENARRAY2_EXT(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL, NULL, }, \ { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL, NULL, }, \ { NULL, NULL, PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \ { NULL, NULL, PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) } \ } #define GENARRAY2_MIN(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { PASTEMAC2(s,s,op), NULL, NULL, NULL, }, \ { NULL, PASTEMAC2(c,c,op), NULL, NULL, }, \ { NULL, NULL, PASTEMAC2(d,d,op), NULL, }, \ { NULL, NULL, NULL, PASTEMAC2(z,z,op) } \ } // -- Three-operand macros -- #define GENARRAY3_ALL(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { \ { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \ { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \ { PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \ { PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) } \ }, \ { \ { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \ { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \ { PASTEMAC3(c,d,s,op), 
PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \ { PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) } \ }, \ { \ { PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \ { PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \ { PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \ { PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) } \ }, \ { \ { PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \ { PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \ { PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \ { PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) } \ } \ } #define GENARRAY3_EXT(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { \ { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL, NULL, }, \ { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL, NULL, }, \ { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \ { NULL, NULL, PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \ { NULL, NULL, PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) } \ } \ } #define GENARRAY3_MIN(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { \ { PASTEMAC3(s,s,s,op), NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, 
}, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, PASTEMAC3(c,c,c,op), NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, PASTEMAC3(d,d,d,op), NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, PASTEMAC3(z,z,z,op) } \ } \ } #endif // end bli_genarray_macro_defs.h // begin bli_gentdef_macro_defs.h #ifndef BLIS_GENTDEF_MACRO_DEFS_H #define BLIS_GENTDEF_MACRO_DEFS_H // // -- MACROS TO INSERT TYPEDEF-GENERATING MACROS ------------------------------- // // -- function typedef macro (both typed and void) -- #define INSERT_GENTDEF( opname ) \ \ GENTDEF( float, s, opname, _ft ) \ GENTDEF( double, d, opname, _ft ) \ GENTDEF( scomplex, c, opname, _ft ) \ GENTDEF( dcomplex, z, opname, _ft ) \ \ GENTDEF( void, s, opname, _vft ) \ GENTDEF( void, d, opname, _vft ) \ GENTDEF( void, c, opname, _vft ) \ GENTDEF( void, z, opname, _vft ) \ \ GENTDEF( void, , opname, _vft ) // -- function typedef macro (both typed and void) with real projection -- #define INSERT_GENTDEFR( opname ) \ \ GENTDEFR( float, float, s, s, opname, _ft ) \ GENTDEFR( double, double, d, d, opname, _ft ) \ GENTDEFR( scomplex, float, c, s, opname, _ft ) \ GENTDEFR( dcomplex, double, z, d, opname, _ft ) \ \ GENTDEFR( void, void, s, s, opname, _vft ) \ GENTDEFR( void, void, d, d, opname, _vft ) \ GENTDEFR( void, void, c, s, opname, _vft ) \ GENTDEFR( void, void, z, d, opname, _vft ) \ \ GENTDEFR( void, void, , , opname, _vft ) #endif // end bli_gentdef_macro_defs.h // begin bli_gentfunc_macro_defs.h #ifndef BLIS_GENTFUNC_MACRO_DEFS_H #define BLIS_GENTFUNC_MACRO_DEFS_H // // -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------ // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- 
// Each INSERT_* macro below expands a GENTFUNC* macro once per listed
// datatype. The GENTFUNC* macro itself is not defined here; it is expected
// to be in scope at the point where the INSERT_* macro is expanded.
// Type characters pair with C types as: s=float, d=double, c=scomplex,
// z=dcomplex.

// One expansion per datatype (s, d, c, z).
#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \
GENTFUNC( float,    s, blasname, blisname ) \
GENTFUNC( double,   d, blasname, blisname ) \
GENTFUNC( scomplex, c, blasname, blisname ) \
GENTFUNC( dcomplex, z, blasname, blisname )


// -- Real domain only --

// One expansion per real datatype (s, d).
#define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \
GENTFUNCRO( float,  s, blasname, blisname ) \
GENTFUNCRO( double, d, blasname, blisname )


// -- Complex domain only, with real projection --

// One expansion per complex datatype; the second type/char of each row is
// the matching real type (scomplex->float, dcomplex->double).
#define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \
GENTFUNCCO( scomplex, float,  c, s, blasname, blisname ) \
GENTFUNCCO( dcomplex, double, z, d, blasname, blisname )


// -- With conjugation: real functions only (used only for dot, ger) --

// The third argument (conjugation character) is intentionally empty for the
// real types; the conjugation flag is always BLIS_NO_CONJUGATE.
#define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
GENTFUNCDOT( float,  s,  , BLIS_NO_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( double, d,  , BLIS_NO_CONJUGATE, blasname, blisname )


// -- With conjugation: complex functions only (used only for dot, ger) --

// Two expansions per complex type: character 'c' with BLIS_CONJUGATE and
// character 'u' with BLIS_NO_CONJUGATE.
#define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \
GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE,    blasname, blisname ) \
GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE,    blasname, blisname ) \
GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname )


// -- With conjugation: all datatypes (used only for dot, ger) --

// Real rows first, then complex rows.
#define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \
INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
INSERT_GENTFUNCDOTC_BLAS( blasname, blisname )


// -- With real projection --

// Real rows receive rblasname, complex rows receive cblasname; the second
// type/char of each row is the real projection of the first.
#define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \
GENTFUNCR( float,    float,  s, s, rblasname, blisname ) \
GENTFUNCR( double,   double, d, d, rblasname, blisname ) \
GENTFUNCR( scomplex, float,  c, s, cblasname, blisname ) \
GENTFUNCR( dcomplex, double, z, d, cblasname, blisname )


// -- Alternate
// two-operand macro (one char for complex, one for real proj) --

// The fourth argument (real-projection character) is intentionally empty for
// the real types and is 's'/'d' for the complex types.
#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
GENTFUNCR2( float,    float,  s,  , blasname, blisname ) \
GENTFUNCR2( double,   double, d,  , blasname, blisname ) \
GENTFUNCR2( scomplex, float,  c, s, blasname, blisname ) \
GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )


// -- Extended two-operand macro (used only for scal) --

// Four same-type rows (empty fourth argument) followed by the two
// complex-vector / real-scalar rows (cs, zd).
#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
GENTFUNCSCAL( float,    float,    s,  , blasname, blisname ) \
GENTFUNCSCAL( double,   double,   d,  , blasname, blisname ) \
GENTFUNCSCAL( scomplex, scomplex, c,  , blasname, blisname ) \
GENTFUNCSCAL( dcomplex, dcomplex, z,  , blasname, blisname ) \
GENTFUNCSCAL( scomplex, float,    c, s, blasname, blisname ) \
GENTFUNCSCAL( dcomplex, double,   z, d, blasname, blisname )


// -- Macros for functions with one operand ------------------------------------


// -- Basic one-operand macro --

// Private row table shared by INSERT_GENTFUNC_BASIC0..4: one GENTFUNC
// expansion per datatype (s, d, c, z), with the caller's argument list
// (function name plus any auxiliary arguments) forwarded unchanged.
// NOTE(review): relies on C99 variadic macros -- confirm every consumer of
// this header is built with a C99 (or C++11) preprocessor.
#define BLIS_GENTFUNC_BASIC_ROWS_( ... ) \
GENTFUNC( float,    s, __VA_ARGS__ ) \
GENTFUNC( double,   d, __VA_ARGS__ ) \
GENTFUNC( scomplex, c, __VA_ARGS__ ) \
GENTFUNC( dcomplex, z, __VA_ARGS__ )

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
BLIS_GENTFUNC_BASIC_ROWS_( tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
BLIS_GENTFUNC_BASIC_ROWS_( tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
BLIS_GENTFUNC_BASIC_ROWS_( tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
BLIS_GENTFUNC_BASIC_ROWS_( tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
BLIS_GENTFUNC_BASIC_ROWS_( tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand with real projection --

// Private row table shared by INSERT_GENTFUNCR_BASIC0..3: each row carries
// the datatype, its real projection (scomplex->float, dcomplex->double), and
// the two matching type characters, followed by the forwarded arguments.
#define BLIS_GENTFUNCR_BASIC_ROWS_( ... ) \
GENTFUNCR( float,    float,  s, s, __VA_ARGS__ ) \
GENTFUNCR( double,   double, d, d, __VA_ARGS__ ) \
GENTFUNCR( scomplex, float,  c, s, __VA_ARGS__ ) \
GENTFUNCR( dcomplex, double, z, d, __VA_ARGS__ )

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
BLIS_GENTFUNCR_BASIC_ROWS_( tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
BLIS_GENTFUNCR_BASIC_ROWS_( tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
BLIS_GENTFUNCR_BASIC_ROWS_( tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --
#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
BLIS_GENTFUNCR_BASIC_ROWS_( tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
arguments) -- #define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with real domain only -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \ \ GENTFUNCRO( float, s, tfuncname ) \ GENTFUNCRO( double, d, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \ \ GENTFUNCRO( float, s, tfuncname, varname ) \ GENTFUNCRO( double, d, tfuncname, varname ) \ // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \ \ GENTFUNC( float, s, tfuncname ) \ 
GENTFUNC( double, d, tfuncname ) \ GENTFUNC( scomplex, c, tfuncname ) \ GENTFUNC( dcomplex, z, tfuncname ) \ GENTFUNC( gint_t, i, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \ \ GENTFUNC( float, s, tfuncname, varname ) \ GENTFUNC( double, d, tfuncname, varname ) \ GENTFUNC( scomplex, c, tfuncname, varname ) \ GENTFUNC( dcomplex, z, tfuncname, varname ) \ GENTFUNC( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCI_BASIC0( tfuncname ) \ \ GENTFUNCI( float, gint_t, s, i, tfuncname ) \ GENTFUNCI( double, gint_t, d, i, tfuncname ) \ GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \ GENTFUNCI( dcomplex, gint_t, z, i, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \ \ GENTFUNCI( float, gint_t, s, i, tfuncname, varname ) \ GENTFUNCI( double, gint_t, d, i, tfuncname, varname ) \ GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \ \ GENTFUNCRI( float, float, gint_t, s, s, i, tfuncname ) \ GENTFUNCRI( double, double, gint_t, d, d, i, tfuncname ) \ GENTFUNCRI( scomplex, float, gint_t, c, s, i, tfuncname ) \ GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_BASIC0( tfuncname ) \ \ GENTFUNC2( float, float, s, s, tfuncname ) \ GENTFUNC2( double, double, d, d, tfuncname ) \ GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \ GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \ \ GENTFUNC2( float, float, s, s, tfuncname, 
varname ) \ GENTFUNC2( double, double, d, d, tfuncname, varname ) \ GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \ \ GENTFUNC2( float, scomplex, s, c, tfuncname ) \ GENTFUNC2( scomplex, float, c, s, tfuncname ) \ \ GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \ \ GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ \ GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) // -- Mixed precision two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \ \ GENTFUNC2( float, double, s, d, tfuncname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ \ GENTFUNC2( double, float, d, s, tfuncname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname ) \ \ GENTFUNC2( scomplex, double, c, d, tfuncname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \ \ GENTFUNC2( float, double, s, d, tfuncname, varname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTFUNC2( double, float, d, s, tfuncname, varname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ \ GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) \ // -- Mixed domain/precision (all) two-operand macro -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTFUNC2_MIXDP0( tfuncname ) \ \ GENTFUNC2( float, double, s, d, tfuncname ) \ GENTFUNC2( float, scomplex, s, c, tfuncname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ \ GENTFUNC2( double, float, d, s, tfuncname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname ) \ GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ \ GENTFUNC2( scomplex, float, c, s, tfuncname ) \ GENTFUNC2( scomplex, double, c, d, tfuncname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \ \ GENTFUNC2( float, double, s, d, tfuncname, varname ) \ GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTFUNC2( double, float, d, s, tfuncname, varname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \ \ GENTFUNC2R( float, float, float, s, s, s, tfuncname ) \ GENTFUNC2R( double, double, double, d, d, d, tfuncname ) \ GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname ) \ GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \ \ GENTFUNC2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTFUNC2R( double, double, double, d, d, 
d, tfuncname, varname ) \ GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \ \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \ \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) // -- Mixed precision two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ \ GENTFUNC2R( scomplex, double, double, c, d, d, 
tfuncname, varname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) // -- Mixed domain/precision (all) two-operand macro with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) \ 
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ // -- Macros for functions with three primary operands ------------------------- // -- Basic three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC0( tfuncname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 ) // -- Mixed domain three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname ) \ \ GENTFUNC3( dcomplex, double, 
double, z, d, d, tfuncname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( 
dcomplex, double, dcomplex, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC3( scomplex, double, scomplex, c, 
d, c, tfuncname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname ) \ GENTFUNC3( float, 
dcomplex, scomplex, s, z, c, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, 
tfuncname, varname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, dcomplex, d, s, 
z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname1, varname2 ) \ \ 
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 )


// -- Basic three-operand with union of operands 1 and 2 --
//
// Each INSERT_GENTFUNC3U12_BASIC* macro expands to four GENTFUNC3U12
// invocations, one per datatype (float/s, double/d, scomplex/c, dcomplex/z),
// with the same type in all four type slots. The suffix selects how many
// auxiliary arguments are forwarded after tfuncname: "0" forwards none, no
// suffix forwards varname, "2" forwards varname1 and varname2.
// GENTFUNC3U12 itself is not defined in this section; presumably the code
// that invokes these INSERT_ macros defines it first -- TODO confirm.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname1, varname2 )


// -- Mixed domain three-operand with union of operands 1 and 2 --
//
// Twelve rows per macro. Every row stays within one precision (float with
// scomplex, or double with dcomplex) and mixes the real and complex domains
// among the first three type slots. In each row the fourth ("union") slot is
// the complex type whenever either of the first two slots is complex, and the
// real type otherwise.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 )


// -- Mixed precision three-operand with union of operands 1 and 2 --
//
// Rows of this family combine single-precision (float/scomplex) and
// double-precision (double/dcomplex) types among the first three type slots;
// the fourth slot again carries the union of slots one and two.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \
\
GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \
GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \
\
GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \
GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \
GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \
GENTFUNC3U12( float,
double, dcomplex, double, s, d, z, d, tfuncname ) \
\
GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \
GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \
\
GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \
GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \
GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \
GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \
\
\
GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \
GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \
GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \
GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \
\
GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \
GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \
\
GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \
GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \
GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \
GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \
\
GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \
GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \
\
\
GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \
GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \
\
GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \
GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \
GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \
GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \
\
GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname ) \
\
GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \
GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \
GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \
GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \
\
\
GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \
GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \
GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \
GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \
\
GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \
GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \
\
GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \
GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \
GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \
GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \
\
GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname )

// -- (one auxiliary argument) --
//
// INSERT_GENTFUNC3U12_MIX_P lists 48 type combinations, each mixing single-
// and double-precision types among the first three type slots, and forwards
// one auxiliary argument (varname) after tfuncname in every row. Rows are
// grouped by the first type slot, then by the second.

#define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \
GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \
\
GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \
GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \
GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \
GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \
\
GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \
\
GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \
GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \
GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \
\
\
GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \
GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \
GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \
GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \
\
GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \
GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \
\
GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \
GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \
GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \
GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \
\
\
GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \
\
\
GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname )

// -- (two auxiliary arguments) --
//
// INSERT_GENTFUNC3U12_MIX_P2 lists the same type combinations in the same
// order as INSERT_GENTFUNC3U12_MIX_P, but forwards two auxiliary arguments
// (varname1, varname2) after tfuncname in every row.

#define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, scomplex, dcomplex,
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 )


#endif
// end bli_gentfunc_macro_defs.h
// begin bli_gentprot_macro_defs.h


#ifndef BLIS_GENTPROT_MACRO_DEFS_H
#define BLIS_GENTPROT_MACRO_DEFS_H

//
// -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS -----------------------------
//
// Every INSERT_GENTPROT* macro below expands to a fixed list of invocations
// of a GENTPROT* generator macro, one invocation per datatype (or datatype
// combination). The generator macros (GENTPROT, GENTPROTR, GENTPROT2, ...)
// are not defined in this section; presumably each user defines the one it
// needs before invoking the INSERT_ macro -- TODO confirm.
//


// -- Macros for generating BLAS routines --------------------------------------


// -- Basic one-operand macro --
// One row each for float/s, double/d, scomplex/c and dcomplex/z.

#define INSERT_GENTPROT_BLAS( blasname ) \
\
GENTPROT( float, s, blasname ) \
GENTPROT( double, d, blasname ) \
GENTPROT( scomplex, c, blasname ) \
GENTPROT( dcomplex, z, blasname )


// -- Basic one-operand macro with real domain only --
// Only the float/s and double/d rows are emitted.

#define INSERT_GENTPROTRO_BLAS( blasname ) \
\
GENTPROTRO( float, s, blasname ) \
GENTPROTRO( double, d, blasname )


// -- Basic one-operand macro with complex domain only and real projection --
// Each complex type is paired with the real type of the same precision.

#define INSERT_GENTPROTCO_BLAS( blasname ) \
\
GENTPROTCO( scomplex, float, c, s, blasname ) \
GENTPROTCO( dcomplex, double, z, d, blasname )


// -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) --
// The third GENTPROTDOT argument is intentionally left empty in both rows.

#define INSERT_GENTPROTDOTR_BLAS( blasname ) \
\
GENTPROTDOT( float, s, , blasname ) \
GENTPROTDOT( double, d, , blasname )


// -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) --
// Each complex type gets two rows whose third argument is "c" and "u"
// (presumably the conjugated and unconjugated variants -- TODO confirm).

#define INSERT_GENTPROTDOTC_BLAS( blasname ) \
\
GENTPROTDOT( scomplex, c, c, blasname ) \
GENTPROTDOT( scomplex, c, u, blasname ) \
GENTPROTDOT( dcomplex, z, c, blasname ) \
GENTPROTDOT( dcomplex, z, u, blasname )


// -- Basic one-operand macro with conjugation (used only for dot, ger) --
// Simply chains the real-only and complex-only variants defined above.

#define INSERT_GENTPROTDOT_BLAS( blasname ) \
\
INSERT_GENTPROTDOTR_BLAS( blasname ) \
INSERT_GENTPROTDOTC_BLAS( blasname )


// -- Basic one-operand macro with real projection --
// The two real rows use rblasname; the two complex rows use cblasname.

#define INSERT_GENTPROTR_BLAS( rblasname, cblasname ) \
\
GENTPROTR( float, float, s, s, rblasname ) \
GENTPROTR( double, double, d, d, rblasname ) \
GENTPROTR( scomplex, float, c, s, cblasname ) \
GENTPROTR( dcomplex, double, z, d, cblasname )


// -- Alternate two-operand macro (one char for complex, one for real proj) --
// In the two real rows the first character argument is left empty.

#define INSERT_GENTPROTR2_BLAS( blasname ) \
\
GENTPROTR2( float, float, , s, blasname ) \
GENTPROTR2( double, double, , d, blasname ) \
GENTPROTR2( scomplex, float, c, s, blasname ) \
GENTPROTR2( dcomplex, double, z, d, blasname )


// -- Extended two-operand macro (used only for scal) --
// Four same-type rows with an empty first character argument, followed by
// two mixed rows pairing a real type with the complex type of the same
// precision.

#define INSERT_GENTPROTSCAL_BLAS( blasname ) \
\
GENTPROTSCAL( float, float, , s, blasname ) \
GENTPROTSCAL( double, double, , d, blasname ) \
GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \
GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \
GENTPROTSCAL( float, scomplex, s, c, blasname ) \
GENTPROTSCAL( double, dcomplex, d, z, blasname )




// -- Macros for functions with one operand ------------------------------------


// -- Basic one-operand macro --
// The numeric suffix (0, none, 2, 3, 4) is the number of auxiliary varname
// arguments forwarded after tfuncname (0, 1, 2, 3, 4 respectively).

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT_BASIC0( tfuncname ) \
\
GENTPROT( float, s, tfuncname ) \
GENTPROT( double, d, tfuncname ) \
GENTPROT( scomplex, c, tfuncname ) \
GENTPROT( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT_BASIC( tfuncname, varname ) \
\
GENTPROT( float, s, tfuncname, varname ) \
GENTPROT( double, d, tfuncname, varname ) \
GENTPROT( scomplex, c, tfuncname, varname ) \
GENTPROT( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTPROT( float, s, tfuncname, varname1, varname2 ) \
GENTPROT( double, d, tfuncname, varname1, varname2 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTPROT( float, s, tfuncname, varname1, varname2, varname3 ) \
GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )



// -- Basic one-operand with real projection --
// Each row passes the operand type followed by the real type of the same
// precision (float for float/scomplex, double for double/dcomplex).

// -- (no auxiliary arguments) --

#define INSERT_GENTPROTR_BASIC0( tfuncname ) \
\
GENTPROTR( float, float, s, s, tfuncname ) \
GENTPROTR( double, double, d, d, tfuncname ) \
GENTPROTR( scomplex, float, c, s, tfuncname ) \
GENTPROTR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname ) \
GENTPROTR( double, double, d, d, tfuncname, varname ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )



// -- Basic one-operand macro with complex domain only and real projection --
// Only the two complex rows are emitted, each paired with its real type.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROTCO_BASIC0( tfuncname ) \
\
GENTPROTCO( scomplex, float, c, s, tfuncname ) \
GENTPROTCO( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \
\
GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \
GENTPROTCO( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )



// -- Basic one-operand macro with integer instance --
// Same four floating-point rows as the BASIC macros plus a fifth row for
// gint_t with type character "i".

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT_BASIC0_I( funcname ) \
\
GENTPROT( float, s, funcname ) \
GENTPROT( double, d, funcname ) \
GENTPROT( scomplex, c, funcname ) \
GENTPROT( dcomplex, z, funcname ) \
GENTPROT( gint_t, i, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \
\
GENTPROT( float, s, tfuncname, varname ) \
GENTPROT( double, d, tfuncname, varname ) \
GENTPROT( scomplex, c, tfuncname, varname ) \
GENTPROT( dcomplex, z, tfuncname, varname ) \
GENTPROT( gint_t, i, tfuncname, varname )



// -- Basic one-operand with integer projection --
// Each floating-point type is paired with gint_t / "i".

// -- (no auxiliary arguments) --

#define INSERT_GENTPROTI_BASIC0( funcname ) \
\
GENTPROTI( float, gint_t, s, i, funcname ) \
GENTPROTI( double, gint_t, d, i, funcname ) \
GENTPROTI( scomplex, gint_t, c, i, funcname ) \
GENTPROTI( dcomplex, gint_t, z, i, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \
\
GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \
GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \
GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \
GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname )



// -- Basic one-operand with real and integer projections --
// Each row passes the operand type, its same-precision real type, and gint_t.
// Note that this macro takes no auxiliary arguments even though its name
// carries no "0" suffix.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROTRI_BASIC( funcname ) \
\
GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \
GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \
GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \
GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname )




// -- Macros for functions with two primary operands ---------------------------


// -- Basic two-operand macro --
// Both operands share the same type in every row.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_BASIC0( funcname ) \
\
GENTPROT2( float, float, s, s, funcname ) \
GENTPROT2( double, double, d, d, funcname ) \
GENTPROT2( scomplex, scomplex, c, c, funcname ) \
GENTPROT2( dcomplex, dcomplex, z, z, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \
\
GENTPROT2( float, float, s, s, tfuncname, varname ) \
GENTPROT2( double, double, d, d, tfuncname, varname ) \
GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \
GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname )



// -- Mixed domain two-operand macro --
// Pairs a real type with the complex type of the same precision, in both
// operand orders.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_MIX_D0( funcname ) \
\
GENTPROT2( float, scomplex, s, c, funcname ) \
GENTPROT2( scomplex, float, c, s, funcname ) \
\
GENTPROT2( double, dcomplex, d, z, funcname ) \
GENTPROT2( dcomplex, double, z, d, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2_MIX_D(
tfuncname, varname ) \
\
GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
\
GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
GENTPROT2( dcomplex, double, z, d, tfuncname, varname )



// -- Mixed precision two-operand macro --
//
// Eight rows pairing a single-precision type (float/scomplex) with a
// double-precision type (double/dcomplex), in both operand orders.
// The last row of each macro deliberately carries no line continuation: a
// trailing backslash there would splice whatever line follows the table into
// the macro body.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_MIX_P0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
\
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \
\
GENTPROT2( float, double, s, d, tfuncname, varname ) \
GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double, float, d, s, tfuncname, varname ) \
GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
\
GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )



// -- Mixed domain/precision (all) two-operand macro --
//
// All twelve ordered pairs of two different types out of
// {float, double, scomplex, dcomplex}.
// NOTE(review): the zero-auxiliary-argument variant is spelled MIXDP0 while
// its one-argument sibling is MIX_DP (and the other families use MIX_D0 /
// MIX_P0). The name is left untouched because code outside this section may
// reference it.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_MIXDP0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, scomplex, s, c, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
GENTPROT2( double, dcomplex, d, z, funcname ) \
\
GENTPROT2( scomplex, float, c, s, funcname ) \
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, double, z, d, funcname ) \
GENTPROT2( dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \
\
GENTPROT2( float, double, s, d, tfuncname, varname ) \
GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double, float, d, s, tfuncname, varname ) \
GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
\
GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )



// -- Basic two-operand with real projection of first operand --
//
// In every GENTPROT2R row the third type is the real type with the same
// precision as the first operand (float for float/scomplex, double for
// double/dcomplex).

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2R_BASIC0( funcname ) \
\
GENTPROT2R( float, float, float, s, s, s, funcname ) \
GENTPROT2R( double, double, double, d, d, d, funcname ) \
GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \
GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \
\
GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \
GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \
GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )



// -- Mixed domain two-operand with real projection of first operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \
\
GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \
\
GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \
GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \
\
GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname )



// -- Mixed precision two-operand with real projection of first operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname )



// -- Macros for functions with three primary operands -------------------------


// -- Basic three-operand macro --

#define INSERT_GENTPROT3_BASIC( funcname ) \
\
GENTPROT3( float, float, float, s, s, s, funcname ) \
GENTPROT3( double, double, double, d, d, d, funcname ) \
GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \
GENTPROT3( dcomplex, dcomplex,
dcomplex, z, z, z, funcname ) // -- Mixed domain three-operand macro -- #define INSERT_GENTPROT3_MIX_D( funcname ) \ \ GENTPROT3( float, float, scomplex, s, s, c, funcname ) \ GENTPROT3( float, scomplex, float, s, c, s, funcname ) \ GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \ \ GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \ GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \ GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \ \ GENTPROT3( scomplex, float, float, c, s, s, funcname ) \ GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \ GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \ \ GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \ GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \ GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname ) // -- Mixed precision three-operand macro -- #define INSERT_GENTPROT3_MIX_P( funcname ) \ \ GENTPROT3( float, float, double, s, s, d, funcname ) \ GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \ \ GENTPROT3( float, double, float, s, d, s, funcname ) \ GENTPROT3( float, double, double, s, d, d, funcname ) \ GENTPROT3( float, double, scomplex, s, d, c, funcname ) \ GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \ \ GENTPROT3( float, scomplex, double, s, c, d, funcname ) \ GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \ \ GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \ GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \ GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \ GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \ \ \ GENTPROT3( double, float, float, d, s, s, funcname ) \ GENTPROT3( double, float, double, d, s, d, funcname ) \ GENTPROT3( double, float, scomplex, d, s, c, funcname ) \ GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \ \ GENTPROT3( double, double, float, d, d, s, funcname ) \ GENTPROT3( double, double, scomplex, d, d, c, funcname ) \ \ GENTPROT3( double, 
scomplex, float, d, c, s, funcname ) \ GENTPROT3( double, scomplex, double, d, c, d, funcname ) \ GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \ GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \ \ GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \ GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \ \ \ GENTPROT3( scomplex, float, double, c, s, d, funcname ) \ GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \ \ GENTPROT3( scomplex, double, float, c, d, s, funcname ) \ GENTPROT3( scomplex, double, double, c, d, d, funcname ) \ GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \ GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \ \ GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \ GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \ \ GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \ GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \ GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \ GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \ \ \ GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \ GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \ GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \ GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \ \ GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \ GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \ \ GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \ GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \ GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \ GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \ \ GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \ GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname ) \ // -- Basic three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_BASIC( funcname ) \ \ GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \ 
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \ GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname ) // -- Mixed domain three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_D( funcname ) \ \ GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \ GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \ GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \ \ GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \ GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \ GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \ \ GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \ GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \ \ GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \ GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname ) // -- Mixed precision three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_P( funcname ) \ \ GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \ GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \ \ GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \ GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \ GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \ GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \ \ GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \ GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \ \ GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, 
z, funcname ) \ GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \ GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \ GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \ \ \ GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \ GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \ GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \ GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \ \ GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \ GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \ \ GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \ GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \ GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \ GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \ \ GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \ GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \ \ \ GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \ GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \ \ GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \ GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \ GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \ GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \ \ GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \ \ GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \ GENTPROT3U12( 
scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \ \ \ GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \ GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \ GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \ GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \ GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \ \ GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname ) #endif // end bli_gentprot_macro_defs.h // begin bli_misc_macro_defs.h #ifndef BLIS_MISC_MACRO_DEFS_H #define BLIS_MISC_MACRO_DEFS_H // -- Miscellaneous macros -- // min, max, abs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) #define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) #define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) ) // fmin, fmax, fabs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_fmin( a, b ) bli_min( a, b ) #define bli_fmax( a, b ) bli_max( a, b ) #define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) ) // fminabs, fmaxabs // NOTE: These must remain macros since we don't know the types of a and b. 
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif // end bli_edge_case_macro_defs.h // begin bli_param_macro_defs.h #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ); } // pruning-related BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the left side of the matrix, // ignore the area above that intersection. if ( *diagoff < 0 ) { *m = *m + *diagoff; *offm_inc = - *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the bottom side of the matrix, // ignore the area to the right of that intersection. if ( *n > *diagoff + *m ) { *n = *diagoff + *m; } } BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the top side of the matrix, // ignore the area to the left of that intersection. if ( *diagoff > 0 ) { *n = *n - *diagoff; *offn_inc = + *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the right side of the matrix, // ignore the area below that intersection. 
if ( *m > -(*diagoff) + *n ) { *m = -(*diagoff) + *n; } } // thread range-related BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { *diagoff = *n - *diagoff - *m; bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { bli_swap_dims( m, n ); bli_negate_diag_offset( diagoff ); bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end ) { dim_t start2 = n - *start; dim_t end2 = n - *end; *start = end2; *end = start2; } // mdim_t-related BLIS_INLINE bool bli_is_m_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_M ); } BLIS_INLINE bool bli_is_n_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_N ); } BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim ) { return ( mdim_t ) ( mdim == BLIS_M ? BLIS_N : BLIS_M ); } BLIS_INLINE void bli_toggle_dim( mdim_t* mdim ) { *mdim = bli_dim_toggled( *mdim ); } // stor3_t-related BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b ) { // If any matrix is general-stored, return the stor3_t id for the // general-purpose sup microkernel. if ( bli_is_gen_stored( rs_c, cs_c ) || bli_is_gen_stored( rs_a, cs_a ) || bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX; // Otherwise, compute and return the stor3_t id as follows. 
const bool c_is_col = bli_is_col_stored( rs_c, cs_c ); const bool a_is_col = bli_is_col_stored( rs_a, cs_a ); const bool b_is_col = bli_is_col_stored( rs_b, cs_b ); return ( stor3_t )( 4 * c_is_col + 2 * a_is_col + 1 * b_is_col ); } BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id ) { #if 1 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )7, // BLIS_RRR = 0 -> BLIS_CCC = 7 ( stor3_t )5, // BLIS_RRC = 1 -> BLIS_CRC = 5 ( stor3_t )6, // BLIS_RCR = 2 -> BLIS_CCR = 6 ( stor3_t )4, // BLIS_RCC = 3 -> BLIS_CRR = 4 ( stor3_t )3, // BLIS_CRR = 4 -> BLIS_RCC = 3 ( stor3_t )1, // BLIS_CRC = 5 -> BLIS_RRC = 1 ( stor3_t )2, // BLIS_CCR = 6 -> BLIS_RCR = 2 ( stor3_t )0, // BLIS_CCC = 7 -> BLIS_RRR = 0 }; return map[id]; #else return ( ( id & 0x4 ) ^ 0x4 ) | // flip c bit ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 ); // flip a bit and move to b position #endif } BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )1, // BLIS_RRR = 0 -> BLIS_RRC = 1 ( stor3_t )0, // BLIS_RRC = 1 -> BLIS_RRR = 0 ( stor3_t )3, // BLIS_RCR = 2 -> BLIS_RCC = 3 ( stor3_t )2, // BLIS_RCC = 3 -> BLIS_RCR = 2 ( stor3_t )5, // BLIS_CRR = 4 -> BLIS_CRC = 5 ( stor3_t )4, // BLIS_CRC = 5 -> BLIS_CRR = 4 ( stor3_t )7, // BLIS_CCR = 6 -> BLIS_CCC = 7 ( stor3_t )6, // BLIS_CCC = 7 -> BLIS_CCR = 6 }; return map[id]; #else return ( stor3_t )( id ^ 0x1 ); #endif } BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )2, // BLIS_RRR = 0 -> BLIS_RCR = 2 ( stor3_t )3, // BLIS_RRC = 1 -> BLIS_RCC = 3 ( stor3_t )0, // BLIS_RCR = 2 -> BLIS_RRR = 0 ( stor3_t )1, // BLIS_RCC = 3 -> BLIS_RRC = 1 ( stor3_t )6, // BLIS_CRR = 4 -> BLIS_CCR = 6 ( stor3_t )7, // BLIS_CRC = 5 -> BLIS_CCC = 7 ( stor3_t )4, // BLIS_CCR = 6 -> BLIS_CRR = 4 ( stor3_t )5, // BLIS_CCC = 7 -> BLIS_CRC = 5 }; return map[id]; #else return ( stor3_t )( id ^ 0x2 ); #endif } // 
index-related BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == n_iter - 1 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != n_iter - 1 || n_left == 0 ); } BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == 0 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != 0 || n_left == 0 ); } BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 ); } BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) ); } BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { #ifdef BLIS_ENABLE_JRIR_SLAB return bli_is_last_iter_sl( i, end_iter, tid, nth ); #else // BLIS_ENABLE_JRIR_RR return bli_is_last_iter_rr( i, end_iter, tid, nth ); #endif } // packbuf_t-related BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type ) { return ( guint_t ) ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT ); } // pack_t-related BLIS_INLINE bool bli_is_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_is_row_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_is_col_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_is_panel_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE bool bli_is_1r_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R ); } BLIS_INLINE bool bli_is_1e_packed( pack_t schema ) { return ( bool ) ( ( schema & 
BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E ); } BLIS_INLINE bool bli_is_1m_packed( pack_t schema ) { return ( bool ) ( bli_is_1r_packed( schema ) || bli_is_1e_packed( schema ) ); } BLIS_INLINE bool bli_is_nat_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 ); } BLIS_INLINE bool bli_is_ind_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 ); } BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema ) { return ( guint_t ) ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT ); } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument. BLIS_INLINE void bli_set_dims_incs_uplo_1m ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument (without column-wise stride optimization). BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. 
if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions and increments for TWO matrix arguments. 
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); } BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); } BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); } // NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, // packbuf_t is stored/used from the context in order to support various // induced methods. (Though ideally the packbuf_t field would only be // present in the control tree). BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); } BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc ); } BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj ) { bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); } BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj ) { bli_obj_apply_conj( BLIS_CONJUGATE, obj ); } BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; } // Root matrix query BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) { return ( obj_t* )( obj->root ); } BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( 
bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); } // Root matrix modification BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) { obj->root = obj; } // Diagonal offset query BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) ); } // Diagonal offset modification BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off = ( doff_t )offset; } BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj ) { obj->diag_off = -(obj->diag_off); } BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off += ( doff_t )offset; } // Dimension query BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) { return ( obj->dim[ mdim ] ); } BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) : bli_obj_length( obj ) ); } BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) ); } BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 ); } // Stride/increment query BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) { return ( obj->rs ); } BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) { return ( obj->cs ); } BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) { return ( obj->is ); } BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); } // Note: The purpose of these functions is to obtain the length and width // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) ); } BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 ); } // Dimension modification BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj ) { obj->dim[ BLIS_M ] = m; } BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj ) { obj->dim[ BLIS_N ] = n; } BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) { obj->dim[ mdim ] = dim_val; } BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) { if ( bli_does_notrans( trans ) ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } else // if ( bli_does_trans( trans ) ) { bli_obj_set_length( n, obj ); bli_obj_set_width( m, obj ); } } // Stride/increment predicates // // NOTE: The following two macros differ from their non-obj counterparts // in that they do not identify m x 1 and 1 x n objects as row-stored and // column-stored, respectively, which is needed when considering packed // objects. But this is okay, since none of the invocations of these // "obj" macros are used on packed matrices. 
// BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); } // Stride/increment modification BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) { obj->rs = rs; } BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) { obj->cs = cs; } BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_row_stride( rs, obj ); bli_obj_set_col_stride( cs, obj ); } BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) { obj->is = is; } // Offset query BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) { return ( obj->off[ mdim ] ); } // Offset modification BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] = offset; } BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_set_off( BLIS_M, offm, obj ); bli_obj_set_off( BLIS_N, offn, obj ); } BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] += offset; } BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_inc_off( BLIS_M, offm, obj ); bli_obj_inc_off( BLIS_N, offn, obj ); } // Diagonal offset predicates BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) { return ( 
bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); } // Buffer address query BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) { return ( void* ) ( obj->buffer ); } // Buffer address modification BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) { obj->buffer = p; } // Bufferless scalar field query BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); } // Bufferless scalar field modification BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); } // Element size modification BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) { obj->elem_size = size; } // Packed matrix info query BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) { return ( obj->m_padded ); } BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) { return ( obj->n_padded ); } // Packed matrix info modification BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj ) { obj->m_padded = m; } BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj ) { obj->n_padded = n; } BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) { 
bli_obj_set_padded_length( m, obj ); bli_obj_set_padded_width( n, obj ); } // Packed panel info query BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) { return ( obj->m_panel ); } BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) { return ( obj->n_panel ); } BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) { return ( obj->pd ); } BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) { return ( obj->ps ); } // Packed panel info modification BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj ) { obj->m_panel = m; } BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj ) { obj->n_panel = n; } BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_panel_length( m, obj ); bli_obj_set_panel_width( n, obj ); } BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) { obj->pd = pd; } BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) { obj->ps = ps; } // stor3_t-related BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); inc_t rs_a, cs_a; inc_t rs_b, cs_b; if ( bli_obj_has_notrans( a ) ) { rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else { rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else { rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b ); } // -- User-provided information macros -- // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { return obj->pack_fn; } BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker_fn; } BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { return obj->ker_params; } // Function pointer modification 
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b )
{
    const pack_t old_a = bli_obj_pack_schema( a );
    const pack_t old_b = bli_obj_pack_schema( b );

    bli_obj_set_pack_schema( old_b, a );
    bli_obj_set_pack_schema( old_a, b );
}

// Induce a transposition on an object: swap dimensions, strides, offsets,
// padded dimensions and panel dimensions, negate the diagonal offset, and
// toggle uplo when the object is upper or lower. The transposition bit
// itself is NOT modified (see the note at the end of the function).
BLIS_INLINE void bli_obj_induce_trans( obj_t* obj )
{
    // Capture the basic fields before any of them is overwritten.
    const dim_t  old_m    = bli_obj_length( obj );
    const dim_t  old_n    = bli_obj_width( obj );
    const inc_t  old_rs   = bli_obj_row_stride( obj );
    const inc_t  old_cs   = bli_obj_col_stride( obj );
    const dim_t  old_offm = bli_obj_row_off( obj );
    const dim_t  old_offn = bli_obj_col_off( obj );
    const doff_t old_doff = bli_obj_diag_offset( obj );

    // Write them back with the m and n roles exchanged.
    bli_obj_set_dims( old_n, old_m, obj );
    bli_obj_set_strides( old_cs, old_rs, obj );
    bli_obj_set_offs( old_offn, old_offm, obj );
    bli_obj_set_diag_offset( -old_doff, obj );

    if ( bli_obj_is_upper_or_lower( obj ) )
    {
        bli_obj_toggle_uplo( obj );
    }

    // Do the same for the packed fields.
    const dim_t old_m_padded = bli_obj_padded_length( obj );
    const dim_t old_n_padded = bli_obj_padded_width( obj );
    const dim_t old_m_panel  = bli_obj_panel_length( obj );
    const dim_t old_n_panel  = bli_obj_panel_width( obj );

    bli_obj_set_padded_dims( old_n_padded, old_m_padded, obj );
    bli_obj_set_panel_dims( old_n_panel, old_m_panel, obj );

    // Note that this macro DOES NOT touch the transposition bit! If
    // the calling code is using this function to handle an object whose
    // transposition bit is set prior to computation, that code needs
    // to manually clear or toggle the bit, via
    // bli_obj_set_onlytrans() or bli_obj_toggle_trans(),
    // respectively.
}

BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj )
{
    // NOTE: This function is only used in situations where the matrices
    // are guaranteed to not have structure or be packed.

    // Induce transposition among basic fields.
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif // end bli_obj_macro_defs.h // begin bli_complex_macro_defs.h #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define 
bli_zimag( x )  ( cimag(x) )

#endif // BLIS_ENABLE_C99_COMPLEX

#endif
// end bli_complex_macro_defs.h
// begin bli_scalar_macro_defs.h

#ifndef BLIS_SCALAR_MACRO_DEFS_H
#define BLIS_SCALAR_MACRO_DEFS_H

// -- Assignment/Accessor macros --

// NOTE: This macro is defined first since some of the other scalar macros
// use it to abstract away the method used to assign complex values (ie:
// whether fields of a struct are set directly or whether native C99
// assignment is used).
// begin bli_sets.h

#ifndef BLIS_SETS_H
#define BLIS_SETS_H

// sets
//
// bli_<x><y>sets( xr, xi, y ) assigns the value given by the separate real
// and imaginary components (xr, xi) to y.

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

// Real destination (float): only the real component xr is stored; xi is
// ignored.
#define bli_sssets( xr, xi, y )  { (y) = (xr); }
#define bli_dssets( xr, xi, y )  { (y) = (xr); }
#define bli_cssets( xr, xi, y )  { (y) = (xr); }
#define bli_zssets( xr, xi, y )  { (y) = (xr); }
#define bli_issets( xr, xi, y )  { (y) = (xr); }

// Real destination (double): only the real component xr is stored; xi is
// ignored.
#define bli_sdsets( xr, xi, y )  { (y) = (xr); }
#define bli_ddsets( xr, xi, y )  { (y) = (xr); }
#define bli_cdsets( xr, xi, y )  { (y) = (xr); }
#define bli_zdsets( xr, xi, y )  { (y) = (xr); }
#define bli_idsets( xr, xi, y )  { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destination, struct-based complex types: the real and imaginary
// members of y are assigned separately. bli_creal()/bli_cimag() and
// bli_zreal()/bli_zimag() are used as lvalues here.
#define bli_scsets( xr, xi, y )  { bli_creal(y) = (xr); bli_cimag(y) = (xi); }
#define bli_dcsets( xr, xi, y )  { bli_creal(y) = (xr); bli_cimag(y) = (xi); }
#define bli_ccsets( xr, xi, y )  { bli_creal(y) = (xr); bli_cimag(y) = (xi); }
#define bli_zcsets( xr, xi, y )  { bli_creal(y) = (xr); bli_cimag(y) = (xi); }
#define bli_icsets( xr, xi, y )  { bli_creal(y) = (xr); bli_cimag(y) = (xi); }

#define bli_szsets( xr, xi, y )  { bli_zreal(y) = (xr); bli_zimag(y) = (xi); }
#define bli_dzsets( xr, xi, y )  { bli_zreal(y) = (xr); bli_zimag(y) = (xi); }
#define bli_czsets( xr, xi, y )  { bli_zreal(y) = (xr); bli_zimag(y) = (xi); }
#define bli_zzsets( xr, xi, y )  { bli_zreal(y) = (xr); bli_zimag(y) = (xi); }
#define bli_izsets( xr, xi, y )  { bli_zreal(y) = (xr); bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destination, native C99 complex types: y is assigned in a single
// expression of the form xr + xi * I.
#define bli_scsets( xr,
xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif // end bli_sets.h // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. // begin bli_setrs.h #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// bli_<x><y>setrs( xr, y ) stores the real value xr as the real component
// of y. The imaginary component of a complex y keeps its current value.

// Real destination: plain assignment.
#define bli_sssetrs( xr, y )  { (y) = (xr); }
#define bli_dssetrs( xr, y )  { (y) = (xr); }

#define bli_sdsetrs( xr, y )  { (y) = (xr); }
#define bli_ddsetrs( xr, y )  { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct-based complex destination: assign the real member only.
#define bli_scsetrs( xr, y )  { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y )  { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y )  { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y )  { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Native C99 complex destination: rebuild y from the new real part and the
// imaginary part read back from y.
#define bli_scsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands.
#define bli_ssetrs( xr, y )  bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y )  bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y )  bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y )  bli_dzsetrs( xr, y )

#endif
// end bli_setrs.h
// begin bli_setis.h

#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// bli_<x><y>setis( xi, y ) stores the real value xi as the imaginary
// component of y. The real component of a complex y keeps its current value.

// Real destination: there is no imaginary component, so these expand to an
// empty statement.
#define bli_sssetis( xi, y )  { ; }
#define bli_dssetis( xi, y )  { ; }

#define bli_sdsetis( xi, y )  { ; }
#define bli_ddsetis( xi, y )  { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct-based complex destination: assign the imaginary member only.
#define bli_scsetis( xi, y )  { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y )  { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y )  { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y )  { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Native C99 complex destination: rebuild y from the real part read back
// from y and the new imaginary part.
#define bli_scsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands.
#define bli_ssetis( xi, y )  bli_sssetis( xi, y )
#define bli_dsetis( xi, y )  bli_ddsetis( xi, y )
#define bli_csetis( xi, y )  bli_scsetis( xi, y )
#define bli_zsetis( xi, y )  bli_dzsetis( xi, y )

#endif
// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h

#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// bli_<x><y>gets( x, yr, yi ) splits x into its real and imaginary
// components and stores them in yr and yi. For real x the imaginary
// component stored is zero; for integer x the value is cast to the
// destination precision.

// Components stored as float.
#define bli_ssgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dsgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_csgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zsgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_isgets( x, yr, yi )  { (yr) = ( float )(x); (yi) = 0.0F; }

// Components stored as double.
#define bli_sdgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_ddgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_cdgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zdgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_idgets( x, yr, yi )  { (yr) = ( double )(x); (yi) = 0.0; }

// Second char 'c': same bodies as the float group above.
#define bli_scgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dcgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_ccgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zcgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_icgets( x, yr, yi )  { (yr) = ( float )(x); (yi) = 0.0F; }

// Second char 'z': same bodies as the double group above.
#define bli_szgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dzgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_czgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zzgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_izgets( x, yr, yi )  { (yr) = ( double )(x); (yi) = 0.0; }

// Second char 'i': the imaginary component stored is always the integer 0.
#define bli_sigets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = 0; }
#define bli_digets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = 0; }
#define bli_cigets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = 0; }
#define bli_zigets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = 0; }
#define bli_iigets( x, yr, yi )  { (yr) = (x); (yi) = 0; }

// Single-character shorthands.
#define bli_sgets( x, yr, yi )  bli_ssgets( x, yr, yi )
#define bli_dgets( x, yr, yi )  bli_ddgets( x, yr, yi )
#define bli_cgets( x, yr, yi )
bli_csgets( x, yr, yi )
#define bli_zgets( x, yr, yi )  bli_zdgets( x, yr, yi )
#define bli_igets( x, yr, yi )  bli_idgets( x, yr, yi )

#endif
// end bli_gets.h

// -- Scalar constant initialization macros --
// begin bli_constants.h

#ifndef BLIS_CONSTANTS_H
#define BLIS_CONSTANTS_H

// return pointers to constants
//
// Each macro below calls bli_obj_buffer_for_const() on one of the objects
// BLIS_ONE, BLIS_ZERO or BLIS_MINUS_ONE with the datatype named by the
// macro's type character, and casts the result to a pointer of the
// matching C type.

// 1

#define bli_s1 \
\
    ( ( float*    ) bli_obj_buffer_for_const( BLIS_FLOAT,    &BLIS_ONE ) )

#define bli_d1 \
\
    ( ( double*   ) bli_obj_buffer_for_const( BLIS_DOUBLE,   &BLIS_ONE ) )

#define bli_c1 \
\
    ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) )

#define bli_z1 \
\
    ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) )

#define bli_i1 \
\
    ( ( gint_t*   ) bli_obj_buffer_for_const( BLIS_INT,      &BLIS_ONE ) )

// 0

#define bli_s0 \
\
    ( ( float*    ) bli_obj_buffer_for_const( BLIS_FLOAT,    &BLIS_ZERO ) )

#define bli_d0 \
\
    ( ( double*   ) bli_obj_buffer_for_const( BLIS_DOUBLE,   &BLIS_ZERO ) )

#define bli_c0 \
\
    ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) )

#define bli_z0 \
\
    ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) )

#define bli_i0 \
\
    ( ( gint_t*   ) bli_obj_buffer_for_const( BLIS_INT,      &BLIS_ZERO ) )

// -1

#define bli_sm1 \
\
    ( ( float*    ) bli_obj_buffer_for_const( BLIS_FLOAT,    &BLIS_MINUS_ONE ) )

#define bli_dm1 \
\
    ( ( double*   ) bli_obj_buffer_for_const( BLIS_DOUBLE,   &BLIS_MINUS_ONE ) )

#define bli_cm1 \
\
    ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) )

#define bli_zm1 \
\
    ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) )

#define bli_im1 \
\
    ( ( gint_t*   ) bli_obj_buffer_for_const( BLIS_INT,      &BLIS_MINUS_ONE ) )

#endif
// end bli_constants.h

// -- Separated scalar macros (separated real/imaginary values) --
// begin bli_absq2ris.h

#ifndef BLIS_ABSQ2RIS_H
#define BLIS_ABSQ2RIS_H

// absq2ris
//
// b := |a|^2, with a and b given as separate real/imaginary components.
// The real variants write only br; bi is left untouched.

#define bli_sabsq2ris( ar, ai, br, bi ) \
{ \
    (br) = (ar) * (ar); \
}

#define bli_dabsq2ris( ar, ai, br, bi ) \
{ \
    (br) = (ar) * (ar); \
}

#define
bli_cabsq2ris( ar, ai, br, bi ) \
{ \
    (br) = (ar) * (ar) + (ai) * (ai); \
    (bi) = 0.0F; \
}

#define bli_zabsq2ris( ar, ai, br, bi ) \
{ \
    (br) = (ar) * (ar) + (ai) * (ai); \
    (bi) = 0.0; \
}

#endif
// end bli_absq2ris.h
// begin bli_abval2ris.h

#ifndef BLIS_ABVAL2RIS_H
#define BLIS_ABVAL2RIS_H

// abval2ris
//
// a := |x|, with x and a given as separate real/imaginary components.
// The real variants write only ar. The complex variants compute
// sqrt(s) * sqrt( (xr/s)*xr + (xi/s)*xi ) with s = bli_fmaxabs( xr, xi ),
// returning 0 when s is 0 -- presumably the scaling is there to limit
// overflow/underflow in the intermediate products (confirm against the
// BLIS documentation).
// NOTE(review): the complex variants declare locals named 's' and 'mag'
// inside the macro body, so arguments with those names would be shadowed.

#define bli_sabval2ris( xr, xi, ar, ai ) \
{ \
    (ar) = fabsf(xr); \
}

#define bli_dabval2ris( xr, xi, ar, ai ) \
{ \
    (ar) = fabs(xr); \
}

#define bli_cabval2ris( xr, xi, ar, ai ) \
{ \
    float  s   = bli_fmaxabs( (xr), (xi) ); \
    float  mag; \
    if ( s == 0.0F ) mag = 0.0F; \
    else \
    { \
        mag = sqrtf( s ) * \
              sqrtf( ( (xr) / s ) * (xr) + \
                     ( (xi) / s ) * (xi) ); \
    } \
    (ar) = mag; \
    (ai) = 0.0F; \
}

#define bli_zabval2ris( xr, xi, ar, ai ) \
{ \
    double s   = bli_fmaxabs( (xr), (xi) ); \
    double mag; \
    if ( s == 0.0 ) mag = 0.0; \
    else \
    { \
        mag = sqrt( s ) * \
              sqrt( ( (xr) / s ) * (xr) + \
                    ( (xi) / s ) * (xi) ); \
    } \
    (ar) = mag; \
    (ai) = 0.0; \
}

#endif
// end bli_abval2ris.h
// begin bli_addris.h

#ifndef BLIS_ADDRIS_H
#define BLIS_ADDRIS_H

// addris
//
// x := x + a, component-wise. The real variants update only xr.

#define bli_saddris( ar, ai, xr, xi ) \
{ \
    (xr) = (xr) + (ar); \
}

#define bli_daddris( ar, ai, xr, xi ) \
{ \
    (xr) = (xr) + (ar); \
}

#define bli_caddris( ar, ai, xr, xi ) \
{ \
    (xr) = (xr) + (ar); \
    (xi) = (xi) + (ai); \
}

#define bli_zaddris( ar, ai, xr, xi ) \
{ \
    (xr) = (xr) + (ar); \
    (xi) = (xi) + (ai); \
}

#endif
// end bli_addris.h
// begin bli_addjris.h

#ifndef BLIS_ADDJRIS_H
#define BLIS_ADDJRIS_H

// addjris
//
// x := x + conj(a): forwards to the addris macro of the same type with the
// imaginary component of a negated.

#define bli_saddjris( ar, ai, xr, xi )  bli_saddris( (ar), -(ai), (xr), (xi) )
#define bli_daddjris( ar, ai, xr, xi )  bli_daddris( (ar), -(ai), (xr), (xi) )
#define bli_caddjris( ar, ai, xr, xi )  bli_caddris( (ar), -(ai), (xr), (xi) )
#define bli_zaddjris( ar, ai, xr, xi )  bli_zaddris( (ar), -(ai), (xr), (xi) )

#endif
// end bli_addjris.h
// begin bli_add3ris.h

#ifndef BLIS_ADD3RIS_H
#define BLIS_ADD3RIS_H

// add3ris
//
// c := a + b, component-wise. The real variants write only cr.

#define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
    (cr) = (ar) + (br); \
}

#define bli_dadd3ris( ar,
ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #endif // end bli_add3ris.h // begin bli_axpbyris.h #ifndef BLIS_AXPBYRIS_H #define BLIS_AXPBYRIS_H // axpbyris #define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyris bli_rxxpbyris #define bli_dsssxpbyris bli_rxxpbyris #define bli_csssxpbyris bli_rxxpbyris #define bli_zsssxpbyris bli_rxxpbyris #define bli_sdssxpbyris bli_rxxpbyris #define bli_ddssxpbyris bli_rxxpbyris #define bli_cdssxpbyris bli_rxxpbyris #define bli_zdssxpbyris bli_rxxpbyris #define bli_scssxpbyris bli_rxxpbyris #define bli_dcssxpbyris bli_rxxpbyris #define bli_ccssxpbyris bli_rxxpbyris #define bli_zcssxpbyris bli_rxxpbyris #define bli_szssxpbyris bli_rxxpbyris #define bli_dzssxpbyris bli_rxxpbyris #define bli_czssxpbyris bli_rxxpbyris #define bli_zzssxpbyris bli_rxxpbyris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyris. 
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
#define bli_saxpbyjris bli_ssssaxpbyjris #define bli_daxpbyjris bli_ddddaxpbyjris #define bli_caxpbyjris bli_ccccaxpbyjris #define bli_zaxpbyjris bli_zzzzaxpbyjris #endif // end bli_axpbyjris.h // begin bli_axpyris.h #ifndef BLIS_AXPYRIS_H #define BLIS_AXPYRIS_H // axpyris #define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ } #define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr) - (ai) * (xi); \ (yi) += (ai) * (xr) + (ar) * (xi); \ } #define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr) - (ai) * (xi); \ } #define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ar) * (xi); \ } #define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyris bli_rxaxpyris #define bli_dssaxpyris bli_rxaxpyris #define bli_cssaxpyris bli_rxaxpyris #define bli_zssaxpyris bli_rxaxpyris #define bli_sdsaxpyris bli_rxaxpyris #define bli_ddsaxpyris bli_rxaxpyris #define bli_cdsaxpyris bli_rxaxpyris #define bli_zdsaxpyris bli_rxaxpyris #define bli_scsaxpyris bli_rxaxpyris #define bli_dcsaxpyris bli_rxaxpyris #define bli_ccsaxpyris bli_roaxpyris #define bli_zcsaxpyris bli_roaxpyris #define bli_szsaxpyris bli_rxaxpyris #define bli_dzsaxpyris bli_rxaxpyris #define bli_czsaxpyris bli_roaxpyris #define bli_zzsaxpyris bli_roaxpyris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyris bli_rxaxpyris #define bli_dsdaxpyris bli_rxaxpyris #define bli_csdaxpyris bli_rxaxpyris #define bli_zsdaxpyris bli_rxaxpyris #define bli_sddaxpyris bli_rxaxpyris #define bli_dddaxpyris bli_rxaxpyris #define bli_cddaxpyris bli_rxaxpyris #define bli_zddaxpyris bli_rxaxpyris #define bli_scdaxpyris 
bli_rxaxpyris #define bli_dcdaxpyris bli_rxaxpyris #define bli_ccdaxpyris bli_roaxpyris #define bli_zcdaxpyris bli_roaxpyris #define bli_szdaxpyris bli_rxaxpyris #define bli_dzdaxpyris bli_rxaxpyris #define bli_czdaxpyris bli_roaxpyris #define bli_zzdaxpyris bli_roaxpyris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyris bli_rxaxpyris #define bli_dscaxpyris bli_rxaxpyris #define bli_cscaxpyris bli_rcaxpyris #define bli_zscaxpyris bli_rcaxpyris #define bli_sdcaxpyris bli_rxaxpyris #define bli_ddcaxpyris bli_rxaxpyris #define bli_cdcaxpyris bli_rcaxpyris #define bli_zdcaxpyris bli_rcaxpyris #define bli_sccaxpyris bli_craxpyris #define bli_dccaxpyris bli_craxpyris #define bli_cccaxpyris bli_cxaxpyris #define bli_zccaxpyris bli_cxaxpyris #define bli_szcaxpyris bli_craxpyris #define bli_dzcaxpyris bli_craxpyris #define bli_czcaxpyris bli_cxaxpyris #define bli_zzcaxpyris bli_cxaxpyris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyris bli_rxaxpyris #define bli_dszaxpyris bli_rxaxpyris #define bli_cszaxpyris bli_rcaxpyris #define bli_zszaxpyris bli_rcaxpyris #define bli_sdzaxpyris bli_rxaxpyris #define bli_ddzaxpyris bli_rxaxpyris #define bli_cdzaxpyris bli_rcaxpyris #define bli_zdzaxpyris bli_rcaxpyris #define bli_sczaxpyris bli_craxpyris #define bli_dczaxpyris bli_craxpyris #define bli_cczaxpyris bli_cxaxpyris #define bli_zczaxpyris bli_cxaxpyris #define bli_szzaxpyris bli_craxpyris #define bli_dzzaxpyris bli_craxpyris #define bli_czzaxpyris bli_cxaxpyris #define bli_zzzaxpyris bli_cxaxpyris #define bli_saxpyris bli_sssaxpyris #define bli_daxpyris bli_dddaxpyris #define bli_caxpyris bli_cccaxpyris #define bli_zaxpyris bli_zzzaxpyris #endif // end bli_axpyris.h // begin bli_axpyjris.h #ifndef BLIS_AXPYJRIS_H #define BLIS_AXPYJRIS_H // axpyjris #define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ } #define bli_cxaxpyjris( ar, ai, xr, xi, yr, 
yi ) \ { \ (yr) += (ar) * (xr) + (ai) * (xi); \ (yi) += (ai) * (xr) - (ar) * (xi); \ } #define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr) + (ai) * (xi); \ } #define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ar) * -(xi); \ } #define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyjris bli_rxaxpyjris #define bli_dssaxpyjris bli_rxaxpyjris #define bli_cssaxpyjris bli_rxaxpyjris #define bli_zssaxpyjris bli_rxaxpyjris #define bli_sdsaxpyjris bli_rxaxpyjris #define bli_ddsaxpyjris bli_rxaxpyjris #define bli_cdsaxpyjris bli_rxaxpyjris #define bli_zdsaxpyjris bli_rxaxpyjris #define bli_scsaxpyjris bli_rxaxpyjris #define bli_dcsaxpyjris bli_rxaxpyjris #define bli_ccsaxpyjris bli_roaxpyjris #define bli_zcsaxpyjris bli_roaxpyjris #define bli_szsaxpyjris bli_rxaxpyjris #define bli_dzsaxpyjris bli_rxaxpyjris #define bli_czsaxpyjris bli_roaxpyjris #define bli_zzsaxpyjris bli_roaxpyjris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyjris bli_rxaxpyjris #define bli_dsdaxpyjris bli_rxaxpyjris #define bli_csdaxpyjris bli_rxaxpyjris #define bli_zsdaxpyjris bli_rxaxpyjris #define bli_sddaxpyjris bli_rxaxpyjris #define bli_dddaxpyjris bli_rxaxpyjris #define bli_cddaxpyjris bli_rxaxpyjris #define bli_zddaxpyjris bli_rxaxpyjris #define bli_scdaxpyjris bli_rxaxpyjris #define bli_dcdaxpyjris bli_rxaxpyjris #define bli_ccdaxpyjris bli_roaxpyjris #define bli_zcdaxpyjris bli_roaxpyjris #define bli_szdaxpyjris bli_rxaxpyjris #define bli_dzdaxpyjris bli_rxaxpyjris #define bli_czdaxpyjris bli_roaxpyjris #define bli_zzdaxpyjris bli_roaxpyjris // -- (axy) = (??c) 
------------------------------------------------------------ #define bli_sscaxpyjris bli_rxaxpyjris #define bli_dscaxpyjris bli_rxaxpyjris #define bli_cscaxpyjris bli_rcaxpyjris #define bli_zscaxpyjris bli_rcaxpyjris #define bli_sdcaxpyjris bli_rxaxpyjris #define bli_ddcaxpyjris bli_rxaxpyjris #define bli_cdcaxpyjris bli_rcaxpyjris #define bli_zdcaxpyjris bli_rcaxpyjris #define bli_sccaxpyjris bli_craxpyjris #define bli_dccaxpyjris bli_craxpyjris #define bli_cccaxpyjris bli_cxaxpyjris #define bli_zccaxpyjris bli_cxaxpyjris #define bli_szcaxpyjris bli_craxpyjris #define bli_dzcaxpyjris bli_craxpyjris #define bli_czcaxpyjris bli_cxaxpyjris #define bli_zzcaxpyjris bli_cxaxpyjris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyjris bli_rxaxpyjris #define bli_dszaxpyjris bli_rxaxpyjris #define bli_cszaxpyjris bli_rcaxpyjris #define bli_zszaxpyjris bli_rcaxpyjris #define bli_sdzaxpyjris bli_rxaxpyjris #define bli_ddzaxpyjris bli_rxaxpyjris #define bli_cdzaxpyjris bli_rcaxpyjris #define bli_zdzaxpyjris bli_rcaxpyjris #define bli_sczaxpyjris bli_craxpyjris #define bli_dczaxpyjris bli_craxpyjris #define bli_cczaxpyjris bli_cxaxpyjris #define bli_zczaxpyjris bli_cxaxpyjris #define bli_szzaxpyjris bli_craxpyjris #define bli_dzzaxpyjris bli_craxpyjris #define bli_czzaxpyjris bli_cxaxpyjris #define bli_zzzaxpyjris bli_cxaxpyjris #define bli_saxpyjris bli_sssaxpyjris #define bli_daxpyjris bli_dddaxpyjris #define bli_caxpyjris bli_cccaxpyjris #define bli_zaxpyjris bli_zzzaxpyjris #endif // end bli_axpyjris.h // begin bli_axmyris.h #ifndef BLIS_AXMYRIS_H #define BLIS_AXMYRIS_H // axmyris #define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ } #define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ } #define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr) - (ai) * (xi); \ (yi) -= (ai) * (xr) + (ar) * (xi); \ } #define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= 
(ar) * (xr) - (ai) * (xi); \ (yi) -= (ai) * (xr) + (ar) * (xi); \ } #define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ (yi) -= (ar) * (xi); \ } #define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ (yi) -= (ar) * (xi); \ } #endif // end bli_axmyris.h // begin bli_conjris.h #ifndef BLIS_CONJRIS_H #define BLIS_CONJRIS_H // conjris #define bli_sconjris( xr, xi ) \ { \ ; \ } #define bli_dconjris( xr, xi ) \ { \ ; \ } #define bli_cconjris( xr, xi ) \ { \ (xi) = -(xi); \ } #define bli_zconjris( xr, xi ) \ { \ (xi) = -(xi); \ } #endif // end bli_conjris.h // begin bli_copyris.h #ifndef BLIS_COPYRIS_H #define BLIS_COPYRIS_H // copyris #define bli_scopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ } #define bli_dcopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ } #define bli_ccopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ (bi) = (ai); \ } #define bli_zcopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ (bi) = (ai); \ } #define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi ) #define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi ) #define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) #define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) #define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi ) #define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi ) #define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) #define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) #define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi ) #define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi ) #define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) #define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) #define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi ) #define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi ) #define bli_czcopyris( ar, ai, 
br, bi ) bli_zcopyris( ar, ai, br, bi ) #define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi ) #endif // end bli_copyris.h // begin bli_copyjris.h #ifndef BLIS_COPYJRIS_H #define BLIS_COPYJRIS_H // copyjris #define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) ) #define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) ) #define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) ) #define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) ) #define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi ) #define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi ) #define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) #define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) #define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi ) #define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi ) #define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) #define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) #define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi ) #define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi ) #define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) #define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) #define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi ) #define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi ) #define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) #define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) #endif // end bli_copyjris.h // begin bli_copycjris.h #ifndef BLIS_COPYCJRIS_H #define BLIS_COPYCJRIS_H // copycjris #define bli_scopycjris( conj, xr, xi, yr, yi ) \ { \ bli_scopyris( (xr), (xi), (yr), (yi) ); \ } #define bli_dcopycjris( conj, xr, xi, yr, yi ) \ { \ 
bli_dcopyris( (xr), (xi), (yr), (yi) ); \ } #define bli_ccopycjris( conj, xr, xi, yr, yi ) \ { \ (yr) = (xr); \ (yi) = ( bli_is_conj( conj ) ? -(xi) \ : (xi) ); \ } #define bli_zcopycjris( conj, xr, xi, yr, yi ) \ { \ (yr) = (xr); \ (yi) = ( bli_is_conj( conj ) ? -(xi) \ : (xi) ); \ } #define bli_icopycjris( conj, xr, xi, yr, yi ) \ { \ bli_icopyris( (xr), (xi), (yr), (yi) ); \ } #endif // end bli_copycjris.h // begin bli_eqris.h #ifndef BLIS_EQRIS_H #define BLIS_EQRIS_H // eqris (passed by value) #define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) ) #define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) ) #define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) ) #define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) ) #define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) ) // eq1ris #define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F ) #define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 ) #define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F ) #define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 ) #define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 ) // eq0ris #define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F ) #define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 ) #define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F ) #define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 ) #define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 ) // eqm1ris #define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F ) #define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 ) #define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F ) #define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 ) #define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 ) #endif // end bli_eqris.h // begin bli_invertris.h #ifndef BLIS_INVERTRIS_H #define BLIS_INVERTRIS_H // invertris #define bli_sinvertris( xr, xi ) \ { \ (xr) = 1.0F / (xr); \ } 
#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2ris bli_rxscal2ris #define bli_dssscal2ris bli_rxscal2ris #define bli_cssscal2ris bli_rxscal2ris #define bli_zssscal2ris bli_rxscal2ris #define bli_sdsscal2ris bli_rxscal2ris #define bli_ddsscal2ris bli_rxscal2ris #define bli_cdsscal2ris bli_rxscal2ris #define bli_zdsscal2ris bli_rxscal2ris #define bli_scsscal2ris bli_rxscal2ris #define bli_dcsscal2ris bli_rxscal2ris #define bli_ccsscal2ris bli_roscal2ris #define bli_zcsscal2ris bli_roscal2ris #define bli_szsscal2ris bli_rxscal2ris #define bli_dzsscal2ris bli_rxscal2ris #define bli_czsscal2ris bli_roscal2ris #define bli_zzsscal2ris bli_roscal2ris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2ris bli_rxscal2ris #define bli_dsdscal2ris bli_rxscal2ris #define bli_csdscal2ris bli_rxscal2ris #define bli_zsdscal2ris bli_rxscal2ris #define bli_sddscal2ris bli_rxscal2ris #define bli_dddscal2ris bli_rxscal2ris #define bli_cddscal2ris bli_rxscal2ris #define bli_zddscal2ris bli_rxscal2ris #define bli_scdscal2ris bli_rxscal2ris #define bli_dcdscal2ris bli_rxscal2ris #define bli_ccdscal2ris bli_roscal2ris #define bli_zcdscal2ris bli_roscal2ris #define bli_szdscal2ris bli_rxscal2ris #define bli_dzdscal2ris bli_rxscal2ris #define bli_czdscal2ris bli_roscal2ris #define bli_zzdscal2ris bli_roscal2ris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2ris bli_rxscal2ris #define bli_dscscal2ris bli_rxscal2ris #define bli_cscscal2ris bli_rcscal2ris #define bli_zscscal2ris bli_rcscal2ris #define bli_sdcscal2ris bli_rxscal2ris #define bli_ddcscal2ris bli_rxscal2ris #define bli_cdcscal2ris bli_rcscal2ris #define bli_zdcscal2ris bli_rcscal2ris #define bli_sccscal2ris bli_crscal2ris #define bli_dccscal2ris bli_crscal2ris #define bli_cccscal2ris bli_cxscal2ris #define bli_zccscal2ris bli_cxscal2ris #define bli_szcscal2ris bli_crscal2ris 
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Dispatch table for scal2jris: each three-letter name (types of a, x and y,
// in that order) forwards its arguments unchanged to one of the
// rx/cx/ro/cr/rc scal2jris macros. Unlike the scal2ris table these are
// function-like macros, so they only expand when followed by an argument
// list.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = (xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyris bli_rxxpbyris #define bli_dssxpbyris bli_rxxpbyris #define bli_cssxpbyris bli_rxxpbyris #define bli_zssxpbyris bli_rxxpbyris #define bli_sdsxpbyris bli_rxxpbyris #define bli_ddsxpbyris bli_rxxpbyris #define bli_cdsxpbyris bli_rxxpbyris #define bli_zdsxpbyris bli_rxxpbyris #define bli_scsxpbyris bli_rxxpbyris #define bli_dcsxpbyris bli_rxxpbyris #define bli_ccsxpbyris bli_rxxpbyris #define bli_zcsxpbyris bli_rxxpbyris #define bli_szsxpbyris bli_rxxpbyris #define bli_dzsxpbyris bli_rxxpbyris #define bli_czsxpbyris bli_rxxpbyris #define bli_zzsxpbyris bli_rxxpbyris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyris bli_rxxpbyris #define bli_dsdxpbyris bli_rxxpbyris #define bli_csdxpbyris bli_rxxpbyris #define bli_zsdxpbyris bli_rxxpbyris #define bli_sddxpbyris bli_rxxpbyris #define bli_dddxpbyris bli_rxxpbyris #define bli_cddxpbyris bli_rxxpbyris #define bli_zddxpbyris bli_rxxpbyris #define bli_scdxpbyris bli_rxxpbyris #define bli_dcdxpbyris bli_rxxpbyris #define bli_ccdxpbyris bli_rxxpbyris #define bli_zcdxpbyris bli_rxxpbyris #define bli_szdxpbyris bli_rxxpbyris #define bli_dzdxpbyris bli_rxxpbyris #define bli_czdxpbyris bli_rxxpbyris #define bli_zzdxpbyris bli_rxxpbyris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyris bli_rxxpbyris #define bli_dscxpbyris 
bli_rxxpbyris #define bli_cscxpbyris bli_crxpbyris #define bli_zscxpbyris bli_crxpbyris #define bli_sdcxpbyris bli_rxxpbyris #define bli_ddcxpbyris bli_rxxpbyris #define bli_cdcxpbyris bli_crxpbyris #define bli_zdcxpbyris bli_crxpbyris #define bli_sccxpbyris bli_cxxpbyris #define bli_dccxpbyris bli_cxxpbyris #define bli_cccxpbyris bli_cxxpbyris #define bli_zccxpbyris bli_cxxpbyris #define bli_szcxpbyris bli_cxxpbyris #define bli_dzcxpbyris bli_cxxpbyris #define bli_czcxpbyris bli_cxxpbyris #define bli_zzcxpbyris bli_cxxpbyris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyris bli_rxxpbyris #define bli_dszxpbyris bli_rxxpbyris #define bli_cszxpbyris bli_crxpbyris #define bli_zszxpbyris bli_crxpbyris #define bli_sdzxpbyris bli_rxxpbyris #define bli_ddzxpbyris bli_rxxpbyris #define bli_cdzxpbyris bli_crxpbyris #define bli_zdzxpbyris bli_crxpbyris #define bli_sczxpbyris bli_cxxpbyris #define bli_dczxpbyris bli_cxxpbyris #define bli_cczxpbyris bli_cxxpbyris #define bli_zczxpbyris bli_cxxpbyris #define bli_szzxpbyris bli_cxxpbyris #define bli_dzzxpbyris bli_cxxpbyris #define bli_czzxpbyris bli_cxxpbyris #define bli_zzzxpbyris bli_cxxpbyris #define bli_sxpbyris bli_sssxpbyris #define bli_dxpbyris bli_dddxpbyris #define bli_cxpbyris bli_cccxpbyris #define bli_zxpbyris bli_zzzxpbyris #endif // end bli_xpbyris.h // begin bli_xpbyjris.h #ifndef BLIS_XPBYJRIS_H #define BLIS_XPBYJRIS_H // xpbyjris #define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type 
of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjris bli_rxxpbyjris #define bli_dssxpbyjris bli_rxxpbyjris #define bli_cssxpbyjris bli_rxxpbyjris #define bli_zssxpbyjris bli_rxxpbyjris #define bli_sdsxpbyjris bli_rxxpbyjris #define bli_ddsxpbyjris bli_rxxpbyjris #define bli_cdsxpbyjris bli_rxxpbyjris #define bli_zdsxpbyjris bli_rxxpbyjris #define bli_scsxpbyjris bli_rxxpbyjris #define bli_dcsxpbyjris bli_rxxpbyjris #define bli_ccsxpbyjris bli_rxxpbyjris #define bli_zcsxpbyjris bli_rxxpbyjris #define bli_szsxpbyjris bli_rxxpbyjris #define bli_dzsxpbyjris bli_rxxpbyjris #define bli_czsxpbyjris bli_rxxpbyjris #define bli_zzsxpbyjris bli_rxxpbyjris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjris bli_rxxpbyjris #define bli_dsdxpbyjris bli_rxxpbyjris #define bli_csdxpbyjris bli_rxxpbyjris #define bli_zsdxpbyjris bli_rxxpbyjris #define bli_sddxpbyjris bli_rxxpbyjris #define bli_dddxpbyjris bli_rxxpbyjris #define bli_cddxpbyjris bli_rxxpbyjris #define bli_zddxpbyjris bli_rxxpbyjris #define bli_scdxpbyjris bli_rxxpbyjris #define bli_dcdxpbyjris bli_rxxpbyjris #define bli_ccdxpbyjris bli_rxxpbyjris #define bli_zcdxpbyjris bli_rxxpbyjris #define bli_szdxpbyjris bli_rxxpbyjris #define bli_dzdxpbyjris bli_rxxpbyjris #define bli_czdxpbyjris bli_rxxpbyjris #define bli_zzdxpbyjris bli_rxxpbyjris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjris bli_rxxpbyjris #define bli_dscxpbyjris bli_rxxpbyjris #define bli_cscxpbyjris bli_crxpbyjris #define bli_zscxpbyjris bli_crxpbyjris #define bli_sdcxpbyjris bli_rxxpbyjris #define bli_ddcxpbyjris bli_rxxpbyjris #define bli_cdcxpbyjris bli_crxpbyjris #define bli_zdcxpbyjris bli_crxpbyjris #define bli_sccxpbyjris bli_cxxpbyjris #define bli_dccxpbyjris bli_cxxpbyjris #define bli_cccxpbyjris 
bli_cxxpbyjris #define bli_zccxpbyjris bli_cxxpbyjris #define bli_szcxpbyjris bli_cxxpbyjris #define bli_dzcxpbyjris bli_cxxpbyjris #define bli_czcxpbyjris bli_cxxpbyjris #define bli_zzcxpbyjris bli_cxxpbyjris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjris bli_rxxpbyjris #define bli_dszxpbyjris bli_rxxpbyjris #define bli_cszxpbyjris bli_crxpbyjris #define bli_zszxpbyjris bli_crxpbyjris #define bli_sdzxpbyjris bli_rxxpbyjris #define bli_ddzxpbyjris bli_rxxpbyjris #define bli_cdzxpbyjris bli_crxpbyjris #define bli_zdzxpbyjris bli_crxpbyjris #define bli_sczxpbyjris bli_cxxpbyjris #define bli_dczxpbyjris bli_cxxpbyjris #define bli_cczxpbyjris bli_cxxpbyjris #define bli_zczxpbyjris bli_cxxpbyjris #define bli_szzxpbyjris bli_cxxpbyjris #define bli_dzzxpbyjris bli_cxxpbyjris #define bli_czzxpbyjris bli_cxxpbyjris #define bli_zzzxpbyjris bli_cxxpbyjris #define bli_sxpbyjris bli_sssxpbyjris #define bli_dxpbyjris bli_dddxpbyjris #define bli_cxpbyjris bli_cccxpbyjris #define bli_zxpbyjris bli_zzzxpbyjris #endif // end bli_xpbyjris.h // Inlined scalar macros in loops // begin bli_scal2ris_mxn.h #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j 
)*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif // end bli_scal2ris_mxn.h // begin bli_scalris_mxn_uplo.h #ifndef 
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
// Real destination (a is float or double). These forms are used with or
// without C99 complex support. When x is complex, the ris kernel is given a
// throw-away local `ti` as its imaginary output; the ( void ) cast marks it
// as deliberately unused.

#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F )
#define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F )
#define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; }
#define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; }

#define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 )
#define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 )
#define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; }
#define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destination, without C99 complex: forward the real and imaginary
// parts of x and a to the ris kernel selected by the type of x.

#define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destination, with C99 complex: the squared magnitude is computed
// inline (x*x for real x, re*re + im*im for complex x) and stored through
// the matching sets macro with a zero imaginary part.

#define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) )
#define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) )
#define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \
                                         bli_cimag(x) * bli_cimag(x), 0.0, (a) )
#define bli_zcabsq2s( x, a ) bli_zcsets( bli_zreal(x) * bli_zreal(x) + \
                                         bli_zimag(x) * bli_zimag(x), 0.0, (a) )

#define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) )
#define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) )
#define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \
                                         bli_cimag(x) * bli_cimag(x), 0.0, (a) )
#define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \
                                         bli_zimag(x) * bli_zimag(x), 0.0, (a) )

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char shorthands: x and a share the same type.
#define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a )
#define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a )
#define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a )
#define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a )

#endif
// end bli_absq2s.h

// begin bli_abval2s.h

#ifndef BLIS_ABVAL2S_H
#define BLIS_ABVAL2S_H

// abval2s
//
// a := |x| (absolute value / complex modulus of x, stored into a). Unlike
// absq2s, every type combination here -- including the real-destination
// ones -- has a separate definition for each setting of
// BLIS_ENABLE_C99_COMPLEX.

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex: forward component-wise to the abval2ris kernel
// selected by the type of x. For a real destination and complex x, a
// throw-away local `ti` receives the imaginary output.

#define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F )
#define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F )
#define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; }
#define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; }

#define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 )
#define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 )
#define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; }
#define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; }

#define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex: call the libm routine matching the type of x
// (fabsf / fabs for real x, cabsf / cabs for complex x) and store the result
// through the matching sets macro with a zero imaginary part.

#define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) )
#define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) )
#define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) )
#define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) )

#define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) )
#define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) )
#define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) )
#define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) )

#define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) )
#define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) )
#define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) )
#define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) )

#define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) )
#define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) )
#define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) )
#define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) )

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char shorthands: x and a share the same type.
#define bli_sabval2s( x, a ) bli_ssabval2s( x, a )
#define bli_dabval2s( x, a ) bli_ddabval2s( x, a )
#define bli_cabval2s( x, a ) bli_ccabval2s( x, a )
#define bli_zabval2s( x, a ) bli_zzabval2s( x, a )

#endif
// end bli_abval2s.h

// begin bli_adds.h

#ifndef BLIS_ADDS_H
#define BLIS_ADDS_H

// adds

// Notes:
// - The first char
encodes the type of a. // - The second char encodes the type of y. #define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) { (y) += (a); } #define bli_dcadds( a, y ) { (y) += (a); } #define bli_ccadds( a, y ) { (y) += (a); } #define bli_zcadds( a, y ) { (y) += (a); } #define bli_szadds( a, y ) { (y) += (a); } #define bli_dzadds( a, y ) { (y) += (a); } #define bli_czadds( a, y ) { (y) += (a); } #define 
bli_zzadds( a, y ) { (y) += (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadds( a, y ) bli_ssadds( a, y ) #define bli_dadds( a, y ) bli_ddadds( a, y ) #define bli_cadds( a, y ) bli_ccadds( a, y ) #define bli_zadds( a, y ) bli_zzadds( a, y ) #endif // end bli_adds.h // begin bli_addjs.h #ifndef BLIS_ADDJS_H #define BLIS_ADDJS_H // addjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzaddjs( a, y ) 
bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) { (y) += (a); } #define bli_dcaddjs( a, y ) { (y) += (a); } #define bli_ccaddjs( a, y ) { (y) += conjf(a); } #define bli_zcaddjs( a, y ) { (y) += conj (a); } #define bli_szaddjs( a, y ) { (y) += (a); } #define bli_dzaddjs( a, y ) { (y) += (a); } #define bli_czaddjs( a, y ) { (y) += conjf(a); } #define bli_zzaddjs( a, y ) { (y) += conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saddjs( a, y ) bli_ssaddjs( a, y ) #define bli_daddjs( a, y ) bli_ddaddjs( a, y ) #define bli_caddjs( a, y ) bli_ccaddjs( a, y ) #define bli_zaddjs( a, y ) bli_zzaddjs( a, y ) #endif // end bli_addjs.h // begin bli_add3s.h #ifndef BLIS_ADD3S_H #define BLIS_ADD3S_H // add3s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of b. // - The third char encodes the type of c. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), 
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), 
bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define 
bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zscadd3s( a, 
b, c ) { (c) = (a) + (b); } #define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c ) #define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c ) #define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c ) #define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c ) #endif // end bli_add3s.h // begin bli_axpbys.h #ifndef BLIS_AXPBYS_H #define BLIS_AXPBYS_H // axpbys // Notes: // - The first char encodes the type of a. 
// - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), 
bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), 
bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), 
bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccscaxpbys( 
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
// Remaining (?zzz) variants of the C99-complex branch: each one expands to the
// plain expression y = a * x + b * y, with every macro argument parenthesized.
#define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#endif // BLIS_ENABLE_C99_COMPLEX
// Same-type shorthands: bli_?axpbys forwards its four arguments unchanged to
// the four-character variant in which a, x, b and y all carry that same type
// character (s, d, c or z).
#define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y )
#define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y )
#define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y )
#define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y )
#endif
// end bli_axpbys.h
// begin bli_axpbyjs.h
// Include guard for the axpbyjs macro family.
#ifndef BLIS_AXPBYJS_H
#define BLIS_AXPBYJS_H
// axpbyjs
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of b.
// - The fourth char encodes the type of y.
// Every variant below passes the real and imaginary parts of a, x, b and y,
// in that order, to bli_rxaxpbyjris. Each part is extracted with the
// bli_?real / bli_?imag accessor that matches the operand's type character.
// NOTE(review): bli_rxaxpbyjris is defined outside this section; the "j"
// presumably denotes conjugation of x -- confirm against its definition.
// -- (axby) = (???s) ----------------------------------------------------------
#define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), 
bli_simag(y) ) #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
/*
 * Boundary between the wrappers whose y letter is d and those whose y
 * letter is c.
 *
 * - Up to the "#ifndef": the last y == d wrappers (x and b letters z), all
 *   expanding to bli_rxaxpbyjris().
 * - After "#ifndef BLIS_ENABLE_C99_COMPLEX": the "(???c)" section.  These
 *   wrappers pass bli_creal(y)/bli_cimag(y) as the last two components and
 *   expand to bli_cxaxpbyjris() rather than bli_rxaxpbyjris().  They are
 *   only defined when BLIS_ENABLE_C99_COMPLEX is not defined; the matching
 *   "#else" later in the file provides alternative definitions.
 *
 * Neither bli_rxaxpbyjris() nor bli_cxaxpbyjris() is defined in this part
 * of the file.
 *
 * NOTE(review): the text directly below continues the replacement list of
 * bli_zczdaxpbyjs, which starts on the previous line.
 */
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
/*
 * Still inside the "#ifndef BLIS_ENABLE_C99_COMPLEX" branch: the last
 * wrappers whose y letter is c (all with b letter z), then the "(???z)"
 * section marker and the first wrappers whose y letter is z.
 *
 * Both groups expand to bli_cxaxpbyjris(); they differ only in the last two
 * components, bli_creal(y)/bli_cimag(y) before the marker and
 * bli_zreal(y)/bli_zimag(y) after it.  As elsewhere in this table, the
 * four letters in bli_<a><x><b><y>axpbyjs choose the accessor pair used for
 * a, x, b and y respectively.  bli_cxaxpbyjris() is not defined in this
 * part of the file.
 *
 * NOTE(review): the text directly below is the replacement list of
 * bli_cdzcaxpbyjs; its "#define" header ends the previous line.
 */
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
/*
 * End of the non-C99 branch and start of the BLIS_ENABLE_C99_COMPLEX branch.
 *
 * - Before "#else": the last y == z wrappers of the
 *   "#ifndef BLIS_ENABLE_C99_COMPLEX" branch, expanding to bli_cxaxpbyjris().
 * - After "#else": the "(???c)" wrappers are redefined as a braced
 *   assignment
 *       (y) = (a) * X + (b) * (y);
 *   where X depends only on the x letter of the macro name:
 *       s or d -> (x)       (used as is)
 *       c      -> conjf(x)
 *       z      -> conj(x)
 *   The a, b and y letters do not change the expansion in this branch.
 *
 * Usage notes that follow from the expansion:
 * - y appears on both sides of the assignment, so the y argument must be
 *   an lvalue and is evaluated twice.
 * - The body is a bare { ... } block, not a do { ... } while (0) wrapper,
 *   so "MACRO(...);" leaves an empty statement after the block.
 * - conjf()/conj() are presumably the C99 <complex.h> functions; the
 *   include is not visible in this part of the file.
 *
 * NOTE(review): the text directly below continues the replacement list of
 * bli_szzzaxpbyjs, which starts on the previous line.
 */
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) #define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) #define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) #define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) #endif // end bli_axpbyjs.h // begin bli_axpys.h #ifndef BLIS_AXPYS_H #define BLIS_AXPYS_H // axpys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), 
bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } #define 
bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) #define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) #define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) #define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) #endif // end bli_axpys.h // begin bli_axpyjs.h #ifndef BLIS_AXPYJS_H #define BLIS_AXPYJS_H // axpyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( 
bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), 
bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) #define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpyjs( a, x, y ) 
bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y ) #define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y ) #define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y ) #define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y ) #endif // end bli_axpyjs.h // begin bli_axmys.h #ifndef BLIS_AXMYS_H #define BLIS_AXMYS_H // axmys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), 
bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxmys( a, x, 
y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), 
bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxmys( 
a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); } 
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y ) #define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y ) #define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y ) #define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y ) #endif // end bli_axmys.h // begin bli_conjs.h #ifndef BLIS_CONJS_H #define BLIS_CONJS_H // conjs #define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) ) #define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) ) #define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) { (x) = conjf(x); } #define bli_zconjs( x ) { (x) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_conjs.h // begin bli_copys.h #ifndef BLIS_COPYS_H #define BLIS_COPYS_H // copys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Each macro forwards the real/imaginary components of x and y to the
// bli_?copyris() helper matching the type of y (the destination); the helpers
// themselves are defined elsewhere in this header.

#define bli_sscopys( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopys( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopys( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopys( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdcopys( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopys( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopys( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopys( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero.
#define bli_sccopys( x, y ) bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopys( x, y ) bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopys( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopys( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopys( x, y ) bli_sscopys( x, y ) #define bli_dcopys( x, y ) bli_ddcopys( x, y ) #define bli_ccopys( x, y ) bli_cccopys( x, y ) #define bli_zcopys( x, y ) bli_zzcopys( x, y ) #define bli_icopys( x, y ) bli_iicopys( x, y ) #endif // end bli_copys.h // begin bli_copyjs.h #ifndef BLIS_COPYJS_H #define BLIS_COPYJS_H // copyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), 
bli_creal(y), bli_cimag(y) ) #define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) { (y) = (x); } #define bli_dccopyjs( x, y ) { (y) = (x); } #define bli_cccopyjs( x, y ) { (y) = conjf(x); } #define bli_zccopyjs( x, y ) { (y) = conj (x); } #define bli_szcopyjs( x, y ) { (y) = (x); } #define bli_dzcopyjs( x, y ) { (y) = (x); } #define bli_czcopyjs( x, y ) { (y) = conjf(x); } #define bli_zzcopyjs( x, y ) { (y) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjs( x, y ) bli_sscopyjs( x, y ) #define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y ) #define bli_ccopyjs( x, y ) bli_cccopyjs( x, y ) #define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y ) #define bli_icopyjs( x, y ) bli_iicopyjs( x, y ) #endif // end bli_copyjs.h // begin bli_copycjs.h #ifndef BLIS_COPYCJS_H #define BLIS_COPYCJS_H // copycjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) { (y) = (x); } 
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); } #define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #define bli_szcopycjs( conjx, x, y ) { (y) = (x); } #define bli_dzcopycjs( conjx, x, y ) { (y) = (x); } #define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); } #define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y ) #define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y ) #define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y ) #define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y ) #define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y ) #endif // end bli_copycjs.h // begin bli_copynzs.h #ifndef BLIS_COPYNZS_H #define BLIS_COPYNZS_H // copynzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// These macros reuse the bli_?copyris() helpers (defined elsewhere in this
// header). When x is real-typed and y is complex-typed, the real-domain helper
// is used on purpose; see the NOTE comments below.

#define bli_sscopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopynzs( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopynzs( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopynzs( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopynzs( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of scopyris() is so we don't touch the imaginary part of y.
#define bli_sccopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopynzs( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopynzs( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopynzs( x, y ) bli_sscopynzs( x, y ) #define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y ) #define bli_ccopynzs( x, y ) bli_cccopynzs( x, y ) #define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y ) #define bli_icopynzs( x, y ) bli_iicopynzs( x, y ) #endif // end bli_copynzs.h // begin bli_copyjnzs.h #ifndef BLIS_COPYJNZS_H #define BLIS_COPYJNZS_H // copyjnzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we // don't touch the imaginary part of y. 
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we // don't touch the imaginary part of y. #define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y ) #define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y ) #define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y ) #define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y ) #define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y ) #endif // end bli_copyjnzs.h // begin bli_dots.h #ifndef BLIS_DOTS_H #define BLIS_DOTS_H // dots // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a ) #define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a ) #define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a ) #define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a ) #define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a ) #define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a ) #define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a ) #define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a ) #define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a ) #define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a ) #define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a ) #define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a ) #define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a ) #define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a ) #define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a ) #define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a ) #define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a ) #define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a ) #define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a ) #define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a ) #define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a ) #define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a ) #define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a ) #define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a ) #define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a ) #define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a ) #define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a ) #define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a ) #define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a ) #define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a ) #define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a ) #define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a ) #define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a ) #define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a ) #define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a ) #define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a ) #define 
bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a ) #define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a ) #define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a ) #define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a ) #define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a ) #define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a ) #define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a ) #define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a ) #define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a ) #define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a ) #define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a ) #define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a ) #define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a ) #define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a ) #define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a ) #define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a ) #define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a ) #define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a ) #define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a ) #define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a ) #define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a ) #define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a ) #define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a ) #define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a ) #define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a ) #define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a ) #define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a ) #define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a ) #define bli_sdots( x, y, a ) bli_sssdots( x, y, a ) #define bli_ddots( x, y, a ) bli_ddddots( x, y, a ) #define bli_cdots( x, y, a ) bli_cccdots( x, y, a ) #define bli_zdots( x, y, a ) bli_zzzdots( x, y, a ) #endif // end bli_dots.h // begin bli_dotjs.h #ifndef BLIS_DOTJS_H #define BLIS_DOTJS_H // dotjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
// - x is used in conjugated form.
//
// Each bli_<x><y><r>dotjs( x, y, a ) forwards to bli_<y><x><r>axpyjs( y, x, a ):
// the first two arguments are swapped, and the first two type characters of
// the target macro are swapped to match. The third argument is passed through.

// -- third type char = s ------------------------------------------------------

#define bli_sssdotjs( x, y, a )  bli_sssaxpyjs( y, x, a )
#define bli_dssdotjs( x, y, a )  bli_sdsaxpyjs( y, x, a )
#define bli_cssdotjs( x, y, a )  bli_scsaxpyjs( y, x, a )
#define bli_zssdotjs( x, y, a )  bli_szsaxpyjs( y, x, a )

#define bli_sdsdotjs( x, y, a )  bli_dssaxpyjs( y, x, a )
#define bli_ddsdotjs( x, y, a )  bli_ddsaxpyjs( y, x, a )
#define bli_cdsdotjs( x, y, a )  bli_dcsaxpyjs( y, x, a )
#define bli_zdsdotjs( x, y, a )  bli_dzsaxpyjs( y, x, a )

#define bli_scsdotjs( x, y, a )  bli_cssaxpyjs( y, x, a )
#define bli_dcsdotjs( x, y, a )  bli_cdsaxpyjs( y, x, a )
#define bli_ccsdotjs( x, y, a )  bli_ccsaxpyjs( y, x, a )
#define bli_zcsdotjs( x, y, a )  bli_czsaxpyjs( y, x, a )

#define bli_szsdotjs( x, y, a )  bli_zssaxpyjs( y, x, a )
#define bli_dzsdotjs( x, y, a )  bli_zdsaxpyjs( y, x, a )
#define bli_czsdotjs( x, y, a )  bli_zcsaxpyjs( y, x, a )
#define bli_zzsdotjs( x, y, a )  bli_zzsaxpyjs( y, x, a )

// -- third type char = d ------------------------------------------------------

#define bli_ssddotjs( x, y, a )  bli_ssdaxpyjs( y, x, a )
#define bli_dsddotjs( x, y, a )  bli_sddaxpyjs( y, x, a )
#define bli_csddotjs( x, y, a )  bli_scdaxpyjs( y, x, a )
#define bli_zsddotjs( x, y, a )  bli_szdaxpyjs( y, x, a )

#define bli_sdddotjs( x, y, a )  bli_dsdaxpyjs( y, x, a )
#define bli_ddddotjs( x, y, a )  bli_dddaxpyjs( y, x, a )
#define bli_cdddotjs( x, y, a )  bli_dcdaxpyjs( y, x, a )
#define bli_zdddotjs( x, y, a )  bli_dzdaxpyjs( y, x, a )

#define bli_scddotjs( x, y, a )  bli_csdaxpyjs( y, x, a )
#define bli_dcddotjs( x, y, a )  bli_cddaxpyjs( y, x, a )
#define bli_ccddotjs( x, y, a )  bli_ccdaxpyjs( y, x, a )
#define bli_zcddotjs( x, y, a )  bli_czdaxpyjs( y, x, a )

#define bli_szddotjs( x, y, a )  bli_zsdaxpyjs( y, x, a )
#define bli_dzddotjs( x, y, a )  bli_zddaxpyjs( y, x, a )
#define bli_czddotjs( x, y, a )  bli_zcdaxpyjs( y, x, a )
#define bli_zzddotjs( x, y, a )  bli_zzdaxpyjs( y, x, a )

// -- third type char = c ------------------------------------------------------

#define bli_sscdotjs( x, y, a )  bli_sscaxpyjs( y, x, a )
#define bli_dscdotjs( x, y, a )  bli_sdcaxpyjs( y, x, a )
#define bli_cscdotjs( x, y, a )  bli_sccaxpyjs( y, x, a )
#define bli_zscdotjs( x, y, a )  bli_szcaxpyjs( y, x, a )

#define bli_sdcdotjs( x, y, a )  bli_dscaxpyjs( y, x, a )
#define bli_ddcdotjs( x, y, a )  bli_ddcaxpyjs( y, x, a )
#define bli_cdcdotjs( x, y, a )  bli_dccaxpyjs( y, x, a )
#define bli_zdcdotjs( x, y, a )  bli_dzcaxpyjs( y, x, a )

#define bli_sccdotjs( x, y, a )  bli_cscaxpyjs( y, x, a )
#define bli_dccdotjs( x, y, a )  bli_cdcaxpyjs( y, x, a )
#define bli_cccdotjs( x, y, a )  bli_cccaxpyjs( y, x, a )
#define bli_zccdotjs( x, y, a )  bli_czcaxpyjs( y, x, a )

#define bli_szcdotjs( x, y, a )  bli_zscaxpyjs( y, x, a )
#define bli_dzcdotjs( x, y, a )  bli_zdcaxpyjs( y, x, a )
#define bli_czcdotjs( x, y, a )  bli_zccaxpyjs( y, x, a )
#define bli_zzcdotjs( x, y, a )  bli_zzcaxpyjs( y, x, a )

// -- third type char = z ------------------------------------------------------

#define bli_sszdotjs( x, y, a )  bli_sszaxpyjs( y, x, a )
#define bli_dszdotjs( x, y, a )  bli_sdzaxpyjs( y, x, a )
#define bli_cszdotjs( x, y, a )  bli_sczaxpyjs( y, x, a )
#define bli_zszdotjs( x, y, a )  bli_szzaxpyjs( y, x, a )

#define bli_sdzdotjs( x, y, a )  bli_dszaxpyjs( y, x, a )
#define bli_ddzdotjs( x, y, a )  bli_ddzaxpyjs( y, x, a )
#define bli_cdzdotjs( x, y, a )  bli_dczaxpyjs( y, x, a )
#define bli_zdzdotjs( x, y, a )  bli_dzzaxpyjs( y, x, a )

#define bli_sczdotjs( x, y, a )  bli_cszaxpyjs( y, x, a )
#define bli_dczdotjs( x, y, a )  bli_cdzaxpyjs( y, x, a )
#define bli_cczdotjs( x, y, a )  bli_cczaxpyjs( y, x, a )
#define bli_zczdotjs( x, y, a )  bli_czzaxpyjs( y, x, a )

#define bli_szzdotjs( x, y, a )  bli_zszaxpyjs( y, x, a )
#define bli_dzzdotjs( x, y, a )  bli_zdzaxpyjs( y, x, a )
#define bli_czzdotjs( x, y, a )  bli_zczaxpyjs( y, x, a )
#define bli_zzzdotjs( x, y, a )  bli_zzzaxpyjs( y, x, a )

// Single-char shorthands: all three operands share one type.

#define bli_sdotjs( x, y, a )  bli_sssdotjs( x, y, a )
#define bli_ddotjs( x, y, a )  bli_ddddotjs( x, y, a )
#define bli_cdotjs( x, y, a )  bli_cccdotjs( x, y, a )
#define bli_zdotjs( x, y, a )  bli_zzzdotjs( x, y, a )

#endif
// end bli_dotjs.h
// begin bli_eq.h


#ifndef BLIS_EQ_H
#define BLIS_EQ_H

// eq (passed by value)
// Expression macros: each one expands to a parenthesised comparison.

#define bli_seq( a, b )  ( (a) == (b) )
#define bli_deq( a, b )  ( (a) == (b) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex, equality is the conjunction of the real-part and
// imaginary-part comparisons obtained through the bli_?real/bli_?imag accessors.
#define bli_ceq( a, b )  ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) )
#define bli_zeq( a, b )  ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_ceq( a, b )  ( (a) == (b) )
#define bli_zeq( a, b )  ( (a) == (b) )

#endif // BLIS_ENABLE_C99_COMPLEX

#define bli_ieq( a, b )  ( (a) == (b) )


// eqtori (passed by value)
// Compares a against the pair (br, bi). The s and d variants compare only
// against br; bi is accepted but does not appear in their expansion.

#define bli_seqtori( a, br, bi )  ( (a) == (br) )
#define bli_deqtori( a, br, bi )  ( (a) == (br) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_ceqtori( a, br, bi )  ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) )
#define bli_zeqtori( a, br, bi )  ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex, the comparison value is built as (br) + (bi) * (I).
#define bli_ceqtori( a, br, bi )  ( (a) == (br) + (bi) * (I) )
#define bli_zeqtori( a, br, bi )  ( (a) == (br) + (bi) * (I) )

#endif // BLIS_ENABLE_C99_COMPLEX



// eqa (passed by address)
// Both arguments are cast to a pointer of the variant's type and dereferenced
// before being handed to the matching by-value macro.

#define bli_seqa( a, b )  bli_seq( *(( float*    )(a)), *(( float*    )(b)) )
#define bli_deqa( a, b )  bli_deq( *(( double*   )(a)), *(( double*   )(b)) )
#define bli_ceqa( a, b )  bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) )
#define bli_zeqa( a, b )  bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) )
#define bli_ieqa( a, b )  bli_ieq( *(( gint_t*   )(a)), *(( gint_t*   )(b)) )



// eq1
// Tests a against the constant 1 (real part 1, imaginary part 0).

#define bli_seq1( a )  bli_seqtori( (a), 1.0F, 0.0F )
#define bli_deq1( a )  bli_deqtori( (a), 1.0,  0.0  )
#define bli_ceq1( a )  bli_ceqtori( (a), 1.0F, 0.0F )
#define bli_zeq1( a )  bli_zeqtori( (a), 1.0,  0.0  )
#define bli_ieq1( a )  bli_ieq    ( (a), 1          )



// eq0
// Tests a against the constant 0.

#define bli_seq0( a )  bli_seqtori( (a), 0.0F, 0.0F )
#define bli_deq0( a )  bli_deqtori( (a), 0.0,  0.0  )
#define bli_ceq0( a )  bli_ceqtori( (a), 0.0F, 0.0F )
#define bli_zeq0( a )  bli_zeqtori( (a), 0.0,  0.0  )
#define bli_ieq0( a )  bli_ieq    ( (a), 0          )



// eqm1
// Tests a against the constant -1 (real part -1, imaginary part 0).

#define bli_seqm1( a )  bli_seqtori( (a), -1.0F, 0.0F )
#define bli_deqm1( a )  bli_deqtori( (a), -1.0,  0.0  )
#define bli_ceqm1( a )  bli_ceqtori( (a), -1.0F, 0.0F )
#define bli_zeqm1( a )  bli_zeqtori( (a), -1.0,  0.0  )
#define bli_ieqm1( a )  bli_ieq    ( (a), -1          )



#endif
// end bli_eq.h
// begin bli_fprints.h


#ifndef BLIS_FPRINTS_H
#define BLIS_FPRINTS_H

// prints
// Each macro expands to a brace-enclosed block of fprintf() calls that write
// x to file using the caller-supplied format spec. The c and z variants print
// the real part, the literal " + ", the imaginary part, then a single space,
// applying spec to each part separately.

#define bli_sfprints( file, spec, x ) \
{ \
	fprintf( file, spec, (x) ); \
}
#define bli_dfprints( file, spec, x ) \
{ \
	fprintf( file, spec, (x) ); \
}
#define bli_cfprints( file, spec, x ) \
{ \
	fprintf( file, spec, bli_creal(x) ); \
	fprintf( file, " + " ); \
	fprintf( file, spec, bli_cimag(x) ); \
	fprintf( file, " " ); \
}
#define bli_zfprints( file, spec, x ) \
{ \
	fprintf( file, spec, bli_zreal(x) ); \
	fprintf( file, " + " ); \
	fprintf( file, spec, bli_zimag(x) ); \
	fprintf( file, " " ); \
}
#define bli_ifprints( file, spec, x ) \
{ \
	fprintf( file, spec, (x) ); \
}


#endif
// end bli_fprints.h
// begin bli_inverts.h


#ifndef BLIS_INVERTS_H
#define BLIS_INVERTS_H

// inverts

// Notes:
// - The first char encodes the type of x.
// - Every variant except the C99-complex c/z ones forwards the real and
//   imaginary accessors of x to the bli_?invertris macro (defined elsewhere).

#define bli_sinverts( x )  bli_sinvertris( bli_sreal(x), bli_simag(x) )
#define bli_dinverts( x )  bli_dinvertris( bli_dreal(x), bli_dimag(x) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_cinverts( x )  bli_cinvertris( bli_creal(x), bli_cimag(x) )
#define bli_zinverts( x )  bli_zinvertris( bli_zreal(x), bli_zimag(x) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex, x is overwritten in place with its reciprocal.
#define bli_cinverts( x )  { (x) = 1.0F / (x); }
#define bli_zinverts( x )  { (x) = 1.0  / (x); }

#endif // BLIS_ENABLE_C99_COMPLEX


#endif

// end bli_inverts.h
// begin bli_invscals.h


#ifndef BLIS_INVSCALS_H
#define BLIS_INVSCALS_H

// invscals

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Real-typed y (second char s or d): the real/imag accessors of a and y are
// forwarded to bli_sinvscalris or bli_dinvscalris, chosen by the type of y.

#define bli_ssinvscals( a, y )  bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscals( a, y )  bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscals( a, y )  bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscals( a, y )  bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdinvscals( a, y )  bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscals( a, y )  bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscals( a, y )  bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscals( a, y )  bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y without C99 complex: a real-typed a (s or d) selects the
// mixed bli_scinvscalris / bli_dzinvscalris helper, a complex-typed a (c or z)
// selects bli_cinvscalris / bli_zinvscalris.

#define bli_scinvscals( a, y )  bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscals( a, y )  bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscals( a, y )   bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscals( a, y )   bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szinvscals( a, y )  bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscals( a, y )  bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscals( a, y )   bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscals( a, y )   bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y with C99 complex: y is divided in place by a.

#define bli_scinvscals( a, y )  { (y) /= (a); }
#define bli_dcinvscals( a, y )  { (y) /= (a); }
#define bli_ccinvscals( a, y )  { (y) /= (a); }
#define bli_zcinvscals( a, y )  { (y) /= (a); }

#define bli_szinvscals( a, y )  { (y) /= (a); }
#define bli_dzinvscals( a, y )  { (y) /= (a); }
#define bli_czinvscals( a, y )  { (y) /= (a); }
#define bli_zzinvscals( a, y )  { (y) /= (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char shorthands: a and y share one type.

#define bli_sinvscals( a, y )  bli_ssinvscals( a, y )
#define bli_dinvscals( a, y )  bli_ddinvscals( a, y )
#define bli_cinvscals( a, y )  bli_ccinvscals( a, y )
#define bli_zinvscals( a, y )  bli_zzinvscals( a, y )


#endif

// end bli_invscals.h
// begin bli_invscaljs.h


#ifndef BLIS_INVSCALJS_H
#define BLIS_INVSCALJS_H

// invscaljs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - Same layout as the invscals table, targeting the *invscaljris helpers.
//   In the C99 branch the divisor is conjf(a)/conj(a) when a is typed c/z and
//   plain (a) when a is typed s/d.

#define bli_ssinvscaljs( a, y )  bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscaljs( a, y )  bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscaljs( a, y )  bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscaljs( a, y )  bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdinvscaljs( a, y )  bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscaljs( a, y )  bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscaljs( a, y )  bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscaljs( a, y )  bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scinvscaljs( a, y )  bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscaljs( a, y )  bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscaljs( a, y )   bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscaljs( a, y )   bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szinvscaljs( a, y )  bli_dzinvscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscaljs( a, y )  bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscaljs( a, y )   bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscaljs( a, y )   bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_scinvscaljs( a, y )  { (y) /=      (a); }
#define bli_dcinvscaljs( a, y )  { (y) /=      (a); }
#define bli_ccinvscaljs( a, y )  { (y) /= conjf(a); }
#define bli_zcinvscaljs( a, y )  { (y) /= conj (a); }

#define bli_szinvscaljs( a, y )  { (y) /=      (a); }
#define bli_dzinvscaljs( a, y )  { (y) /=      (a); }
#define bli_czinvscaljs( a, y )  { (y) /= conjf(a); }
#define bli_zzinvscaljs( a, y )  { (y) /= conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char shorthands: a and y share one type.

#define bli_sinvscaljs( a, y )  bli_ssinvscaljs( a, y )
#define bli_dinvscaljs( a, y )  bli_ddinvscaljs( a, y )
#define bli_cinvscaljs( a, y )  bli_ccinvscaljs( a, y )
#define bli_zinvscaljs( a, y )  bli_zzinvscaljs( a, y )


#endif

// end bli_invscaljs.h
// begin bli_neg2s.h


#ifndef BLIS_NEG2S_H
#define BLIS_NEG2S_H

// neg2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) { (y) = -(x); } #define bli_dcneg2s( x, y ) { (y) = -(x); } #define bli_ccneg2s( x, y ) { (y) = -(x); } #define bli_zcneg2s( x, y ) { (y) = -(x); } #define bli_szneg2s( x, y ) { (y) = -(x); } #define bli_dzneg2s( x, y ) { (y) = -(x); } #define bli_czneg2s( x, y ) { (y) = -(x); } #define bli_zzneg2s( x, y ) { (y) = 
-(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sneg2s( x, y ) bli_ssneg2s( x, y ) #define bli_dneg2s( x, y ) bli_ddneg2s( x, y ) #define bli_cneg2s( x, y ) bli_ccneg2s( x, y ) #define bli_zneg2s( x, y ) bli_zzneg2s( x, y ) #endif // end bli_neg2s.h // begin bli_rands.h #ifndef BLIS_RANDS_H #define BLIS_RANDS_H // rands #define bli_srands( a ) \ { \ (a) = ( float ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0F; \ } #define bli_drands( a ) \ { \ (a) = ( double ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0; \ } #define bli_crands( a ) \ { \ float ar, ai; \ \ bli_srands( ar ); \ bli_srands( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrands( a ) \ { \ double ar, ai; \ \ bli_drands( ar ); \ bli_drands( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_rands.h // begin bli_randnp2s.h #ifndef BLIS_RANDNP2S_H #define BLIS_RANDNP2S_H // randnp2s #define bli_srandnp2s( a ) \ { \ bli_drandnp2s( a ); \ } #if 0 #define bli_drandnp2s_prev( a ) \ { \ const double m_max = 3.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ if ( t == m_max2 ) t = t - 1.0; \ \ \ t = floor( t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_exp, s_val; \ \ \ PASTEMAC(d,rands)( s_exp ); \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \ else r_val = pow( 2.0, t - 1.0 ); \ \ \ if ( s_val < 0.0 ) r_val = -r_val; \ } \ \ \ r_val = r_val / pow( 2.0, m_max ); \ \ \ \ a = r_val; \ } #endif #define bli_drandnp2s( a ) \ { \ const double m_max = 6.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ do \ { \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ t = floor( t ); \ } \ \ while ( m_max2 <= t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_val; \ \ \ r_val = pow( 2.0, -(t - 1.0) ); \ \ \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_val < 0.0 ) r_val 
= -r_val; \ } \ \ \ \ a = r_val; \ } #define bli_crandnp2s( a ) \ { \ float ar, ai; \ \ bli_srandnp2s( ar ); \ bli_srandnp2s( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrandnp2s( a ) \ { \ double ar, ai; \ \ bli_drandnp2s( ar ); \ bli_drandnp2s( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_randnp2s.h // begin bli_scals.h #ifndef BLIS_SCALS_H #define BLIS_SCALS_H // scals // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), 
bli_zimag(y) ) #define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) { (y) *= (a); } #define bli_dcscals( a, y ) { (y) *= (a); } #define bli_ccscals( a, y ) { (y) *= (a); } #define bli_zcscals( a, y ) { (y) *= (a); } #define bli_szscals( a, y ) { (y) *= (a); } #define bli_dzscals( a, y ) { (y) *= (a); } #define bli_czscals( a, y ) { (y) *= (a); } #define bli_zzscals( a, y ) { (y) *= (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscals( a, y ) bli_ssscals( a, y ) #define bli_dscals( a, y ) bli_ddscals( a, y ) #define bli_cscals( a, y ) bli_ccscals( a, y ) #define bli_zscals( a, y ) bli_zzscals( a, y ) #endif // end bli_scals.h // begin bli_scaljs.h #ifndef BLIS_SCALJS_H #define BLIS_SCALJS_H // scaljs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscaljs( a, y ) 
bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) { (y) *= (a); } #define bli_dcscaljs( a, y ) { (y) *= (a); } #define bli_ccscaljs( a, y ) { (y) *= conjf(a); } #define bli_zcscaljs( a, y ) { (y) *= conj (a); } #define bli_szscaljs( a, y ) { (y) *= (a); } #define bli_dzscaljs( a, y ) { (y) *= (a); } #define bli_czscaljs( a, y ) { (y) *= conjf(a); } #define bli_zzscaljs( a, y ) { (y) *= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscaljs( a, y ) bli_ssscaljs( a, y ) #define bli_dscaljs( a, y ) bli_ddscaljs( a, y ) #define bli_cscaljs( a, y ) bli_ccscaljs( a, y ) #define bli_zscaljs( a, y ) bli_zzscaljs( a, y ) #endif // end bli_scaljs.h // begin bli_scalcjs.h #ifndef BLIS_SCALCJS_H #define BLIS_SCALCJS_H // scalcjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Each non-C99 variant forwards conjx plus the real/imag accessors of x and y
// to a *scalcjris helper (defined elsewhere). The helper is picked by the type
// of y, and for complex y also by whether x is real-typed (sc/dz helpers) or
// complex-typed (c/z helpers).

#define bli_ssscalcjs( conjx, x, y )  bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsscalcjs( conjx, x, y )  bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csscalcjs( conjx, x, y )  bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsscalcjs( conjx, x, y )  bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdscalcjs( conjx, x, y )  bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddscalcjs( conjx, x, y )  bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdscalcjs( conjx, x, y )  bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdscalcjs( conjx, x, y )  bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scscalcjs( conjx, x, y )  bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcscalcjs( conjx, x, y )  bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccscalcjs( conjx, x, y )   bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcscalcjs( conjx, x, y )   bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szscalcjs( conjx, x, y )  bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzscalcjs( conjx, x, y )  bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czscalcjs( conjx, x, y )   bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzscalcjs( conjx, x, y )   bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Real-typed x: conjx is accepted but unused; y is multiplied in place by x.
#define bli_scscalcjs( conjx, x, y )  { (y) *= (x); }
// C99-complex variants. For real-typed x (first char s or d) conjx is accepted
// but unused and y is multiplied in place by x. For complex-typed x (c or z)
// the multiplier is conjf(x)/conj(x) when bli_is_conj( conjx ) is true and x
// itself otherwise.
#define bli_dcscalcjs( conjx, x, y )  { (y) *= (x); }
#define bli_ccscalcjs( conjx, x, y )  { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zcscalcjs( conjx, x, y )  { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#define bli_szscalcjs( conjx, x, y )  { (y) *= (x); }
#define bli_dzscalcjs( conjx, x, y )  { (y) *= (x); }
#define bli_czscalcjs( conjx, x, y )  { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzscalcjs( conjx, x, y )  { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char shorthands: x and y share one type.

#define bli_sscalcjs( conjx, x, y )  bli_ssscalcjs( conjx, x, y )
#define bli_dscalcjs( conjx, x, y )  bli_ddscalcjs( conjx, x, y )
#define bli_cscalcjs( conjx, x, y )  bli_ccscalcjs( conjx, x, y )
#define bli_zscalcjs( conjx, x, y )  bli_zzscalcjs( conjx, x, y )


#endif

// end bli_scalcjs.h
// begin bli_scal2s.h


#ifndef BLIS_SCAL2S_H
#define BLIS_SCAL2S_H

// scal2s

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// Every non-C99 macro below forwards, in order, the real/imag accessors of
// a, x and y to one of the bli_??scal2ris helpers (defined elsewhere). The
// helper prefix follows this pattern in the table:
//   y real-typed:    ro when both a and x are complex-typed, rx otherwise.
//   y complex-typed: rx (a real, x real), rc (a complex, x real),
//                    cr (a real, x complex), cx (a complex, x complex).

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2s( a, x, y )  bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2s( a, x, y )  bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2s( a, x, y )  bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2s( a, x, y )  bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2s( a, x, y )  bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2s( a, x, y )  bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2s( a, x, y )  bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2s( a, x, y )  bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2s( a, x, y )  bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2s( a, x, y )  bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2s( a, x, y )  bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2s( a, x, y )  bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2s( a, x, y )  bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2s( a, x, y )  bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2s( a, x, y )  bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2s( a, x, y )  bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2s( a, x, y )  bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2s( a, x, y )  bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2s( a, x, y )  bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2s( a, x, y )  bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccscal2s( a, x, y )  bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2s( a, x, y )  bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2s( a, x, y )  bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2s( a, x, y )  bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcscal2s( a, x, y )  bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2s( a, x, y )  bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2s( a, x, y )  bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2s( a, x, y )  bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2s( a, x, y )  bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2s( a, x, y )  bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2s( a, x, y )  bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2s( a, x, y )  bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sczscal2s( a, x, y )  bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2s( a, x, y )  bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2s( a, x, y )  bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2s( a, x, y )  bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzscal2s( a, x, y )  bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2s( a, x, y )  bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2s( a, x, y )  bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2s( a, x, y )  bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex, every complex-y variant assigns the product (a) * (x) to y.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_dscscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_cscscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zscscal2s( a, x, y )  { (y) = (a) * (x); }

#define bli_sdcscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_ddcscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_cdcscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zdcscal2s( a, x, y )  { (y) = (a) * (x); }

#define bli_sccscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_dccscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_cccscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zccscal2s( a, x, y )  { (y) = (a) * (x); }

#define bli_szcscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_dzcscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_czcscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zzcscal2s( a, x, y )  { (y) = (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_dszscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_cszscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zszscal2s( a, x, y )  { (y) = (a) * (x); }

#define bli_sdzscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_ddzscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_cdzscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zdzscal2s( a, x, y )  { (y) = (a) * (x); }

#define bli_sczscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_dczscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_cczscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zczscal2s( a, x, y )  { (y) = (a) * (x); }

#define bli_szzscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_dzzscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_czzscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zzzscal2s( a, x, y )  { (y) = (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char shorthands: a, x and y share one type.

#define bli_sscal2s( a, x, y )  bli_sssscal2s( a, x, y )
#define bli_dscal2s( a, x, y )  bli_dddscal2s( a, x, y )
#define bli_cscal2s( a, x, y )  bli_cccscal2s( a, x, y )
#define bli_zscal2s( a, x, y )  bli_zzzscal2s( a, x, y )


#endif

// end bli_scal2s.h
// begin bli_scal2js.h


#ifndef BLIS_SCAL2JS_H
#define BLIS_SCAL2JS_H

// scal2js

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------

// Every bli_<a><x><y>scal2js macro below splits its three operands into
// real/imaginary parts with the bli_?real / bli_?imag accessors matching each
// operand's type char and forwards all six parts to a *scal2jris kernel (the
// kernels themselves are not defined in this part of the file).
// For a float destination (y is s), the kernel is bli_rxscal2jris unless both
// a and x are complex (cc, zc, cz, zz), in which case it is bli_roscal2jris.

#define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

// Double destination (y is d): same kernel selection as the (??s) group above
// (bli_roscal2jris only when both a and x are complex).

#define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

// Complex destination without C99 complex support. The kernel prefix follows
// the domains of a and x:
//   a real,    x real    -> bli_rxscal2jris
//   a complex, x real    -> bli_rcscal2jris
//   a real,    x complex -> bli_crscal2jris
//   a complex, x complex -> bli_cxscal2jris

#define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

// y is z: same kernel selection as the (??c) group above. The variants with a
// complex x follow immediately after this group of eight.

#define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
// Remaining non-C99 scal2js variants for a z destination with complex x:
// bli_crscal2jris when a is real (s, d), bli_cxscal2jris when a is complex
// (c, z).
#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex enabled the complex-destination variants are written as a
// direct assignment y = a * x. x is conjugated only when it is complex:
// conjf() for a c-typed x, conj() for a z-typed x; a real x is used as-is.
// NOTE: each macro expands to a braced block, not an expression.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type shorthands: bli_?scal2js forwards to the variant whose three
// type chars are all the same.
#define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y )
#define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y )
#define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y )
#define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y )

#endif
// end bli_scal2js.h
// begin bli_set0s.h


#ifndef BLIS_SET0S_H
#define BLIS_SET0S_H

// set0s: forwards to bli_?sets with both leading arguments zero (presumably
// the real and imaginary parts -- bli_?sets is defined elsewhere; confirm
// there). Float literals are used for s/c, double literals for d/z.
#define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) )
#define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) )
#define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) )
#define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) )

#endif
// end bli_set0s.h
// begin bli_set1s.h


#ifndef BLIS_SET1S_H
#define BLIS_SET1S_H

// set1s: forwards to bli_?sets with a first argument of one and a second
// argument of zero (presumably real = 1, imag = 0 -- confirm against
// bli_?sets, which is defined elsewhere).
#define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) )
#define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) )
#define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) )
#define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) )
#endif
// end bli_set1s.h
// begin bli_seti0s.h


#ifndef BLIS_SETI0S_H
#define BLIS_SETI0S_H

// seti0s: forwards to bli_?setis with a zero first argument (presumably the
// imaginary part to store into a -- bli_?setis is defined elsewhere; confirm
// there). Float literal for s/c, double literal for d/z.
#define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) )
#define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) )
#define bli_cseti0s( a ) bli_csetis( 0.0F, (a) )
#define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) )

#endif
// end bli_seti0s.h
// begin bli_sqrt2s.h


#ifndef BLIS_SQRT2S_H
#define BLIS_SQRT2S_H

// sqrt2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex, operands are split into real/imag parts and passed to
// a *sqrt2ris kernel chosen as follows (kernels are defined elsewhere):
//   a is s                 -> bli_ssqrt2ris
//   a is d                 -> bli_dsqrt2ris
//   a is c, x real (s, d)  -> bli_scsqrt2ris
//   a is c, x complex      -> bli_csqrt2ris
//   a is z, x real (s, d)  -> bli_dzsqrt2ris
//   a is z, x complex      -> bli_zsqrt2ris

#define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) )
#define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) )
#define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) )
#define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) )

#define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) )
#define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) )

#define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex, the square-root routine is picked by the type of x
// (sqrtf for s, sqrt for d, csqrtf for c, csqrt for z) and the result is cast
// to the type of a (float, double, scomplex, dcomplex). When x is complex and
// a is real, only the real part of the complex root is stored, via
// bli_creal / bli_zreal.
// NOTE: each macro expands to a braced block, not an expression.

#define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; }
#define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; }
#define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); }
#define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); }

#define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; }
#define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; }
#define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); }
#define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); }

#define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; }
#define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; }
#define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; }
#define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; }

#define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; }
#define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; }
#define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; }
#define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type shorthands: bli_?sqrt2s forwards to the variant whose two type
// chars are the same.
#define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a )
#define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a )
#define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a )
#define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a )

#endif
// end bli_sqrt2s.h
// begin bli_subs.h


#ifndef BLIS_SUBS_H
#define BLIS_SUBS_H

// subs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) { (y) -= (a); } #define bli_dcsubs( a, y ) { (y) -= (a); } #define bli_ccsubs( a, y ) { (y) -= (a); } #define bli_zcsubs( a, y ) { (y) -= (a); } #define bli_szsubs( a, y ) { (y) -= (a); } #define bli_dzsubs( a, y ) { (y) -= (a); } #define bli_czsubs( a, y ) { (y) -= (a); } #define bli_zzsubs( a, y ) { (y) -= (a); } #endif // 
BLIS_ENABLE_C99_COMPLEX #define bli_ssubs( a, y ) bli_sssubs( a, y ) #define bli_dsubs( a, y ) bli_ddsubs( a, y ) #define bli_csubs( a, y ) bli_ccsubs( a, y ) #define bli_zsubs( a, y ) bli_zzsubs( a, y ) #endif // end bli_subs.h // begin bli_subjs.h #ifndef BLIS_SUBJS_H #define BLIS_SUBJS_H // subjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubjs( a, y ) bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), 
bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) { (y) -= (a); } #define bli_dcsubjs( a, y ) { (y) -= (a); } #define bli_ccsubjs( a, y ) { (y) -= conjf(a); } #define bli_zcsubjs( a, y ) { (y) -= conj (a); } #define bli_szsubjs( a, y ) { (y) -= (a); } #define bli_dzsubjs( a, y ) { (y) -= (a); } #define bli_czsubjs( a, y ) { (y) -= conjf(a); } #define bli_zzsubjs( a, y ) { (y) -= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssubjs( a, y ) bli_sssubjs( a, y ) #define bli_dsubjs( a, y ) bli_ddsubjs( a, y ) #define bli_csubjs( a, y ) bli_ccsubjs( a, y ) #define bli_zsubjs( a, y ) bli_zzsubjs( a, y ) #endif // end bli_subjs.h // begin bli_swaps.h #ifndef BLIS_SWAPS_H #define BLIS_SWAPS_H // swaps // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_ssswaps( x, y ) \ { \ float w; \ bli_sscopys( (y), (w) ); \ bli_sscopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dsswaps( x, y ) \ { \ double w; \ bli_sdcopys( (y), (w) ); \ bli_dscopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_csswaps( x, y ) \ { \ scomplex w; \ bli_sccopys( (y), (w) ); \ bli_cscopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zsswaps( x, y ) \ { \ dcomplex w; \ bli_szcopys( (y), (w) ); \ bli_zscopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sdswaps( x, y ) \ { \ float w; \ bli_dscopys( (y), (w) ); \ bli_sdcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_ddswaps( x, y ) \ { \ double w; \ bli_ddcopys( (y), (w) ); \ bli_ddcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_cdswaps( x, y ) \ { \ scomplex w; \ bli_dccopys( (y), (w) ); \ bli_cdcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zdswaps( x, y ) \ { \ dcomplex w; \ bli_dzcopys( (y), (w) ); \ bli_zdcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_scswaps( x, y ) \ { \ float w; \ bli_cscopys( (y), (w) ); \ bli_sccopys( (x), (y) ); \ bli_sscopys( (w), 
(x) ); \ } #define bli_dcswaps( x, y ) \ { \ double w; \ bli_cdcopys( (y), (w) ); \ bli_dccopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_ccswaps( x, y ) \ { \ scomplex w; \ bli_cccopys( (y), (w) ); \ bli_cccopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zcswaps( x, y ) \ { \ dcomplex w; \ bli_czcopys( (y), (w) ); \ bli_zccopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_szswaps( x, y ) \ { \ float w; \ bli_zscopys( (y), (w) ); \ bli_szcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dzswaps( x, y ) \ { \ double w; \ bli_zdcopys( (y), (w) ); \ bli_dzcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_czswaps( x, y ) \ { \ scomplex w; \ bli_zccopys( (y), (w) ); \ bli_czcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zzswaps( x, y ) \ { \ dcomplex w; \ bli_zzcopys( (y), (w) ); \ bli_zzcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sswaps( x, y ) bli_ssswaps( x, y ) #define bli_dswaps( x, y ) bli_ddswaps( x, y ) #define bli_cswaps( x, y ) bli_ccswaps( x, y ) #define bli_zswaps( x, y ) bli_zzswaps( x, y ) #endif // end bli_swaps.h // begin bli_xpbys.h #ifndef BLIS_XPBYS_H #define BLIS_XPBYS_H // xpbys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// -- (xby) = (??s) ------------------------------------------------------------

// Every bli_<x><b><y>xpbys macro in the two real-destination groups below
// splits x, b and y into real/imaginary parts with the bli_?real / bli_?imag
// accessors matching each operand's type char and forwards all six parts to
// bli_rxxpbyris, regardless of whether x or b is complex. bli_rxxpbyris is
// not defined in this part of the file.

#define bli_sssxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dssxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cssxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zssxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_scsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sddxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dddxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cddxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zddxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_scdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_szdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )

// Complex-output variants (y is scomplex or dcomplex).
//
// When BLIS_ENABLE_C99_COMPLEX is not defined, each macro splits its
// operands into real/imaginary components and forwards them to one of three
// component-wise helpers, chosen by the operand domains:
//   - bli_rxxpbyris: x real,    b real
//   - bli_crxpbyris: x complex, b real
//   - bli_cxxpbyris: b complex  (x of any type)
// NOTE(review): the helpers are defined outside this section; presumably the
// 'r' forms skip the imaginary-part arithmetic -- confirm against their
// definitions.
//
// When BLIS_ENABLE_C99_COMPLEX is defined, every variant is the native
// expression y = x + b * y.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )

#define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )

#define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zccxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )

#define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zszxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Homogeneous shorthands: x, b, and y all share the single named type.
#define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y )
#define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y )
#define bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y )
#define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y )

#endif
// end bli_xpbys.h
// begin bli_xpbyjs.h

#ifndef BLIS_XPBYJS_H
#define BLIS_XPBYJS_H

// xpbyjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; 
++ii ) bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); 
} else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( 
dim_t jj = 0; jj < n; ++jj ) bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } 
else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( 
dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end bli_copys_mxn.h // begin bli_scal2s_mxn.h #ifndef BLIS_SCAL2S_MXN_H #define BLIS_SCAL2S_MXN_H // scal2s_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \ ctype* restrict y, const inc_t rs_y, const inc_t cs_y \ ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( scal2s_mxn ) #endif // end bli_scal2s_mxn.h // begin bli_xpbys_mxn.h #ifndef BLIS_XPBYS_MXN_H #define BLIS_XPBYS_MXN_H // xpbys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (?ss) ------------------------------------------------------------ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?dd) ------------------------------------------------------------ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?cc) ------------------------------------------------------------ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?zz) ------------------------------------------------------------ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } #endif // end bli_xpbys_mxn.h // begin bli_xpbys_mxn_uplo.h #ifndef BLIS_XPBYS_MXN_UPLO_H #define BLIS_XPBYS_MXN_UPLO_H // xpbys_mxn_u #define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 
0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } // xpbys_mxn_l #define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } 
// bli_cccxpbys_mxn_l
//
// Statement macro over an m x n index space. For every (_i, _j) with
//   (doff_t)_j - (doff_t)_i <= diagoff
// it touches x[ _i*rs_x + _j*cs_x ] and y[ _i*rs_y + _j*cs_y ]:
//   - if bli_ceq0( *beta ) holds, it invokes bli_cccopys( x_elem, y_elem );
//   - otherwise it invokes bli_cccxpbys( x_elem, *beta, y_elem ).
// NOTE(review): judging by the names this is y := x + beta * y restricted
// to the entries on or below diagonal `diagoff`, with beta == 0 handled as
// a plain copy -- confirm against bli_cccopys / bli_cccxpbys.
//
// Macro caveats: arguments are substituted textually and are not
// individually parenthesized; m, n and the strides are re-evaluated on
// every iteration; the expansion declares its own _i and _j, so arguments
// must not refer to identifiers with those names.
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
    dim_t _i, _j; \
\
    /* Branch once on beta so the inner loops call a single operation. */ \
    if ( bli_ceq0( *beta ) ) \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_cccopys( *(x + _i*rs_x + _j*cs_x), \
                         *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
    else \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \
                          *(beta), \
                          *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
}

// bli_zzzxpbys_mxn_l
//
// Same structure as bli_cccxpbys_mxn_l, using bli_zeq0, bli_zzcopys and
// bli_zzzxpbys. The same macro caveats apply (textual substitution,
// repeated evaluation of arguments, locally declared _i and _j).
#define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
    dim_t _i, _j; \
\
    /* Branch once on beta so the inner loops call a single operation. */ \
    if ( bli_zeq0( *beta ) ) \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \
                         *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
    else \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \
                          *(beta), \
                          *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
}

// Same-type shorthands. Each one forwards all ten arguments unchanged to
// the three-character variant whose characters are all equal
// (s -> sss, d -> ddd, c -> ccc, z -> zzz).

// _u forms.
#define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}

// _l forms.
#define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}

#endif
// end bli_xpbys_mxn_uplo.h

// -- "broadcast B" scalar macros --

// begin bli_bcastbbs_mxn.h
#ifndef BLIS_BCASTBBS_MXN_H
#define BLIS_BCASTBBS_MXN_H

// bcastbbs_mxn
//
// Function template. INSERT_GENTFUNC_BASIC0( bcastbbs_mxn ) below expands
// GENTFUNC to define PASTEMAC(ch,bcastbbs_mxn); the INSERT_ macro is defined
// elsewhere (presumably one instantiation per basic datatype -- confirm).
//
// For every (i, j) in m x n, with yij = y + i*incy + j*ldy, the function
// calls PASTEMAC(ch,copys)( *yij, *(yij + p) ) for p = 1 .. d-1, i.e. it
// passes element (i, j) together with each of the d-1 slots that follow it.
//
// NOTE(review): here d is taken from ldy, whereas the scal2bbs_mxn and
// set0bbs_mxn templates below take d from incy. Confirm that using the
// other stride as the duplication factor is intended for this routine.

#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* d: number of slots per element (the element plus d-1 duplicates); */ \
    /* ds_y: distance between consecutive duplicates. */ \
    const dim_t d = ldy; \
    const dim_t ds_y = 1; \
\
    for ( dim_t i = 0; i < m; ++i ) \
    { \
        ctype* restrict yi = y + i*incy; \
\
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict yij = yi + j*ldy; \
\
            /* p starts at 1: slot 0 is the element itself. */ \
            for ( dim_t p = 1; p < d; ++p ) \
            { \
                ctype* restrict yijd = yij + p*ds_y; \
\
                PASTEMAC(ch,copys)( *yij, *yijd ); \
            } \
        } \
    } \
}

INSERT_GENTFUNC_BASIC0( bcastbbs_mxn )

#endif
// end bli_bcastbbs_mxn.h

// begin bli_scal2bbs_mxn.h
#ifndef BLIS_SCAL2BBS_MXN_H
#define BLIS_SCAL2BBS_MXN_H

// scal2bbs_mxn
//
// Two function templates generate PASTEMAC(ch,scal2bbs_mxn): GENTFUNCRO
// (instantiated via INSERT_GENTFUNCRO_BASIC0) and GENTFUNCCO (instantiated
// via INSERT_GENTFUNCCO_BASIC0). Both INSERT_ macros are defined elsewhere;
// presumably RO covers the real datatypes and CO the complex ones.
//
// GENTFUNCRO version: for every (i, j) in m x n, with
//   xij = x + j*ldx + i*incx   and   yij = y + j*ldy + i*incy,
// it calls scal2js( *alpha, *xij, *yij ) when bli_is_conj( conjx ) is true
// and scal2s( *alpha, *xij, *yij ) otherwise, then passes *yij and each of
// the next d-1 slots (yij + p, p = 1 .. d-1, with d = incy) to copys.

#undef GENTFUNCRO
#define GENTFUNCRO( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t conjx, \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, const inc_t incx, const inc_t ldx, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* d: number of slots per element of y, taken from incy; */ \
    /* ds_y: distance between consecutive duplicates. */ \
    const dim_t d = incy; \
    const dim_t ds_y = 1; \
\
    /* The conj test is hoisted out of the loops; the two branches differ */ \
    /* only in calling scal2js versus scal2s. */ \
    if ( bli_is_conj( conjx ) ) \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict xj = x + j*ldx; \
            ctype* restrict yj = y + j*ldy; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype* restrict xij = xj + i*incx; \
                ctype* restrict yij = yj + i*incy; \
\
                PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \
\
                /* Fill slots 1 .. d-1 from slot 0. */ \
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype* restrict yijd = yij + p*ds_y; \
\
                    PASTEMAC(ch,copys)( *yij, *yijd ); \
                } \
            } \
        } \
    } \
    else \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict xj = x + j*ldx; \
            ctype* restrict yj = y + j*ldy; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype* restrict xij = xj + i*incx; \
                ctype* restrict yij = yj + i*incy; \
\
                PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \
\
                /* Fill slots 1 .. d-1 from slot 0. */ \
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype* restrict yijd = yij + p*ds_y; \
\
                    PASTEMAC(ch,copys)( *yij, *yijd ); \
                } \
            } \
        } \
    } \
}

INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn )

// GENTFUNCCO version: alpha, x and y are reinterpreted as arrays of ctype_r
// and all strides are doubled accordingly. For alpha and x the imaginary
// part is the element right after the real part (+1). For y the imaginary
// pointer starts d ctype_r elements after the real pointer (1*d), and the
// duplication loop fills d-1 further slots after each of the two.
// NOTE(review): this suggests y holds d copies of the real part followed
// by d copies of the imaginary part for each element -- confirm against
// the kernels that consume this layout.
// The parameter chr is accepted but not referenced in the body.

#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t conjx, \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, const inc_t incx, const inc_t ldx, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* d: number of slots per real/imaginary part of y, taken from incy; */ \
    /* ds_y: distance between consecutive duplicates. */ \
    const dim_t d = incy; \
    const dim_t ds_y = 1; \
\
    /* Strides in units of ctype_r instead of ctype. */ \
    const inc_t incx2 = 2 * incx; \
    const inc_t ldx2 = 2 * ldx; \
\
    const inc_t incy2 = 2 * incy; \
    const inc_t ldy2 = 2 * ldy; \
\
    ctype_r* restrict alpha_r = ( ctype_r* )alpha; \
    ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \
    ctype_r* restrict chi_r = ( ctype_r* )x; \
    ctype_r* restrict chi_i = ( ctype_r* )x + 1; \
    ctype_r* restrict psi_r = ( ctype_r* )y; \
    /* Imaginary parts of y begin d elements after the real parts. */ \
    ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \
\
    /* The two branches differ only in scal2jris versus scal2ris. */ \
    if ( bli_is_conj( conjx ) ) \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype_r* restrict chij_r = chi_r + j*ldx2; \
            ctype_r* restrict chij_i = chi_i + j*ldx2; \
            ctype_r* restrict psij_r = psi_r + j*ldy2; \
            ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype_r* restrict chiij_r = chij_r + i*incx2; \
                ctype_r* restrict chiij_i = chij_i + i*incx2; \
                ctype_r* restrict psiij_r = psij_r + i*incy2; \
                ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
                PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \
                                        *chiij_r, *chiij_i, \
                                        *psiij_r, *psiij_i ); \
\
                /* Fill slots 1 .. d-1 of both parts from slot 0. */ \
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
                    ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
                    PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \
                                          *psiijd_r, *psiijd_i ); \
                } \
            } \
        } \
    } \
    else \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype_r* restrict chij_r = chi_r + j*ldx2; \
            ctype_r* restrict chij_i = chi_i + j*ldx2; \
            ctype_r* restrict psij_r = psi_r + j*ldy2; \
            ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype_r* restrict chiij_r = chij_r + i*incx2; \
                ctype_r* restrict chiij_i = chij_i + i*incx2; \
                ctype_r* restrict psiij_r = psij_r + i*incy2; \
                ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
                PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \
                                       *chiij_r, *chiij_i, \
                                       *psiij_r, *psiij_i ); \
\
                /* Fill slots 1 .. d-1 of both parts from slot 0. */ \
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
                    ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
                    PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \
                                          *psiijd_r, *psiijd_i ); \
                } \
            } \
        } \
    } \
}

INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn )

#endif
// end bli_scal2bbs_mxn.h

// begin bli_set0bbs_mxn.h
#ifndef BLIS_SET0BBS_MXN_H
#define BLIS_SET0BBS_MXN_H

// set0bbs_mxn
//
// Function template instantiated by INSERT_GENTFUNC_BASIC0( set0bbs_mxn ).
// For every (i, j) in m x n, with yij = y + j*ldy + i*incy, it calls
// PASTEMAC(ch,set0s) on each of the d slots yij + p, p = 0 .. d-1, where
// d = incy. Unlike the copy loops in the neighbouring bbs templates, p
// starts at 0, so the element's own slot is included.

#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* d: number of slots per element of y, taken from incy; */ \
    /* ds_y: distance between consecutive slots. */ \
    const dim_t d = incy; \
    const dim_t ds_y = 1; \
\
    for ( dim_t j = 0; j < n; ++j ) \
    { \
        ctype* restrict yj = y + j*ldy; \
\
        for ( dim_t i = 0; i < m; ++i ) \
        { \
            ctype* restrict yij = yj + i*incy; \
\
            /* p starts at 0: every slot, including the first, is set. */ \
            for ( dim_t p = 0; p < d; ++p ) \
            { \
                ctype* restrict yijd = yij + p*ds_y; \
\
                PASTEMAC(ch,set0s)( *yijd ); \
            } \
        } \
    } \
}

INSERT_GENTFUNC_BASIC0( set0bbs_mxn )

#endif
// end bli_set0bbs_mxn.h

// -- 1m-specific scalar macros --

// 1e
// begin bli_copy1es.h
#ifndef BLIS_COPY1ES_H
#define BLIS_COPY1ES_H

// copy1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_sscopy1es( a, bri, bir ) {} #define bli_dscopy1es( a, bri, bir ) {} #define bli_cscopy1es( a, bri, bir ) {} #define bli_zscopy1es( a, bri, bir ) {} #define bli_sdcopy1es( a, bri, bir ) {} #define bli_ddcopy1es( a, bri, bir ) {} #define bli_cdcopy1es( a, bri, bir ) {} #define bli_zdcopy1es( a, bri, bir ) {} #define bli_sccopy1es( a, bri, bir ) {} #define bli_dccopy1es( a, bri, bir ) {} #define bli_cccopy1es( a, bri, bir ) \ { \ bli_cccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_zccopy1es( a, bri, bir ) \ { \ bli_zccopyris( bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_szcopy1es( a, bri, bir ) {} #define bli_dzcopy1es( a, bri, bir ) {} #define bli_czcopy1es( a, bri, bir ) \ { \ bli_czcopyris( bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_zzcopy1es( a, bri, bir ) \ { \ bli_zzcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir ) #define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir ) #endif // end bli_copy1es.h // begin bli_copyj1es.h #ifndef BLIS_COPYJ1ES_H #define BLIS_COPYJ1ES_H // copyj1es // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopyj1es( a, bri, bir ) {} #define bli_dscopyj1es( a, bri, bir ) {} #define bli_cscopyj1es( a, bri, bir ) {} #define bli_zscopyj1es( a, bri, bir ) {} #define bli_sdcopyj1es( a, bri, bir ) {} #define bli_ddcopyj1es( a, bri, bir ) {} #define bli_cdcopyj1es( a, bri, bir ) {} #define bli_zdcopyj1es( a, bri, bir ) {} #define bli_sccopyj1es( a, bri, bir ) {} #define bli_dccopyj1es( a, bri, bir ) {} #define bli_cccopyj1es( a, bri, bir ) \ { \ bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_cccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_zccopyj1es( a, bri, bir ) \ { \ bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_zccopyris( bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_szcopyj1es( a, bri, bir ) {} #define bli_dzcopyj1es( a, bri, bir ) {} #define bli_czcopyj1es( a, bri, bir ) \ { \ bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_czcopyris( bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_zzcopyj1es( a, bri, bir ) \ { \ bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_zzcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir ) #define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir ) #endif // end bli_copyj1es.h // begin bli_invert1es.h #ifndef BLIS_INVERT1ES_H #define BLIS_INVERT1ES_H // invert1es #define bli_cinvert1es( bri, bir ) \ { \ bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \ bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \ } #define bli_zinvert1es( bri, bir ) \ { \ bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \ bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \ } #endif // end bli_invert1es.h // begin bli_scal1es.h #ifndef BLIS_SCAL1ES_H 
#define BLIS_SCAL1ES_H // scal1es #define bli_cscal1es( a, yri, yir ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \ bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \ } #define bli_zscal1es( a, yri, yir ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \ bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \ } #endif // end bli_scal1es.h // begin bli_scal21es.h #ifndef BLIS_SCAL21ES_H #define BLIS_SCAL21ES_H // scal21es // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal21es( a, x, yri, yir ) {} #define bli_sdsscal21es( a, x, yri, yir ) {} #define bli_scsscal21es( a, x, yri, yir ) {} #define bli_szsscal21es( a, x, yri, yir ) {} #define bli_dssscal21es( a, x, yri, yir ) {} #define bli_ddsscal21es( a, x, yri, yir ) {} #define bli_dcsscal21es( a, x, yri, yir ) {} #define bli_dzsscal21es( a, x, yri, yir ) {} #define bli_cssscal21es( a, x, yri, yir ) {} #define bli_cdsscal21es( a, x, yri, yir ) {} #define bli_ccsscal21es( a, x, yri, yir ) {} #define bli_czsscal21es( a, x, yri, yir ) {} #define bli_zssscal21es( a, x, yri, yir ) {} #define bli_zdsscal21es( a, x, yri, yir ) {} #define bli_zcsscal21es( a, x, yri, yir ) {} #define bli_zzsscal21es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal21es( a, x, yri, yir ) {} #define bli_sddscal21es( a, x, yri, yir ) {} #define bli_scdscal21es( a, x, yri, yir ) {} #define bli_szdscal21es( a, x, yri, yir ) {} #define bli_dsdscal21es( a, x, yri, yir ) {} #define bli_dddscal21es( a, x, yri, yir ) {} #define bli_dcdscal21es( a, x, yri, yir ) {} #define bli_dzdscal21es( a, x, yri, yir ) {} #define bli_csdscal21es( a, x, yri, yir ) {} #define 
bli_cddscal21es( a, x, yri, yir ) {} #define bli_ccdscal21es( a, x, yri, yir ) {} #define bli_czdscal21es( a, x, yri, yir ) {} #define bli_zsdscal21es( a, x, yri, yir ) {} #define bli_zddscal21es( a, x, yri, yir ) {} #define bli_zcdscal21es( a, x, yri, yir ) {} #define bli_zzdscal21es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal21es( a, x, yri, yir ) {} #define bli_sdcscal21es( a, x, yri, yir ) {} #define bli_sccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal21es( a, x, yri, yir ) {} #define bli_ddcscal21es( a, x, yri, yir ) {} #define bli_dccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), 
bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal21es( a, x, yri, yir ) {} #define bli_sdzscal21es( a, x, yri, yir ) {} #define bli_sczscal21es( a, x, yri, yir ) \ { \ 
bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal21es( a, x, yri, yir ) {} #define bli_ddzscal21es( a, x, yri, yir ) {} #define bli_dczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), 
bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir ) #define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir ) #endif // end bli_scal21es.h // begin bli_scal2j1es.h #ifndef BLIS_SCAL2J1ES_H #define BLIS_SCAL2J1ES_H // scal2j1es // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) #define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) #endif // end bli_scal2j1es.h // 1r // begin bli_copy1rs.h #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copy1rs.h // begin bli_copyj1rs.h #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copyj1rs.h // begin bli_invert1rs.h #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif // end bli_invert1rs.h // begin bli_scal1rs.h #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), 
bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif // end bli_scal1rs.h // begin bli_scal21rs.h #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif // end bli_scal21rs.h // begin bli_scal2j1rs.h #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif // end bli_scal2j1rs.h // 1m (1e or 1r) // begin bli_invert1ms_mxn_diag.h #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t 
i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_invert1ms_mxn_diag.h // begin bli_scal1ms_mxn.h #ifndef BLIS_SCAL1MS_MXN_H #define BLIS_SCAL1MS_MXN_H // scal1ms_mxn #define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; 
\ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #endif // end bli_scal1ms_mxn.h // begin bli_scal21ms_mxn.h #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } 
else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), 
*(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif // end bli_scal21ms_mxn.h // begin bli_scal21ms_mxn_diag.h #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_scal21ms_mxn_diag.h // begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING //thread communicators may be implementation dependent #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_single.h // begin bli_thrcomm_openmp.h #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H // Define thrcomm_t for situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #include // skipped // Define thrcomm_t for tree barriers and non-tree barriers. #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. 
//volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif // end bli_thrcomm_openmp.h // begin bli_thrcomm_pthreads.h #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H // Define thrcomm_t for situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS #ifdef BLIS_USE_PTHREAD_BARRIER struct thrcomm_s { void* sent_object; dim_t n_threads; bli_pthread_barrier_t barrier; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_pthreads.h // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. 
// Lifecycle, barrier and broadcast entry points for thrcomm_t. Only
// bli_thrcomm_barrier() and bli_thrcomm_bcast() carry BLIS_EXPORT_BLIS.
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads );
void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm );
void bli_thrcomm_cleanup( thrcomm_t* comm );

BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );

void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );

#endif
// end bli_thrcomm.h

// Include thread info (thrinfo_t) object definitions and prototypes.
// begin bli_thrinfo.h

#ifndef BLIS_THRINFO_H
#define BLIS_THRINFO_H

// Thread info structure definition
struct thrinfo_s
{
	// The thread communicator for the other threads sharing the same work
	// at this level.
	thrcomm_t* ocomm;

	// Our thread id within the ocomm thread communicator.
	dim_t ocomm_id;

	// The number of distinct threads used to parallelize the loop.
	dim_t n_way;

	// What we're working on.
	dim_t work_id;

	// When freeing, should the communicators in this node be freed? Usually,
	// this field is true, but when nodes are created that share the same
	// communicators as other nodes (such as with packm nodes), this is set
	// to false.
	bool free_comm;

	// The bszid_t to help identify the node. This is mostly only useful when
	// debugging or tracing the allocation and release of thrinfo_t nodes.
	bszid_t bszid;

	// Links to other thrinfo_s nodes.
	// NOTE(review): presumably the child nodes of this node in the thrinfo
	// tree; confirm against the code that builds the tree.
	struct thrinfo_s* sub_prenode;
	struct thrinfo_s* sub_node;
};
typedef struct thrinfo_s thrinfo_t;

//
// thrinfo_t functions
// NOTE: The naming of these should be made consistent at some point.
// (i.e.: bli_thrinfo_ vs. bli_thread_)
//

// thrinfo_t query (field only)

// Number of threads recorded in this node's communicator (t->ocomm).
BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t )
{
	return (t->ocomm)->n_threads;
}

BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t )
{
	return t->ocomm_id;
}

BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t )
{
	return t->n_way;
}

BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t )
{
	return t->work_id;
}

BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t )
{
	return t->ocomm;
}

BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t )
{
	return t->free_comm;
}

// NOTE(review): the field is a bszid_t but the declared return type is dim_t,
// so the value is implicitly converted on return.
BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t )
{
	return t->bszid;
}

BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t )
{
	return t->sub_node;
}

BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t )
{
	return t->sub_prenode;
}

// thrinfo_t query (complex)

// True when this thread's id within its communicator is 0.
BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t )
{
	return t->ocomm_id == 0;
}

// thrinfo_t modification
// Each setter stores its first argument into the same-named field of t.

BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t )
{
	t->ocomm = ocomm;
}

BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t )
{
	t->ocomm_id = ocomm_id;
}

BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t )
{
	t->n_way = n_way;
}

BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t )
{
	t->work_id = work_id;
}

BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t )
{
	t->free_comm = free_comm;
}

BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t )
{
	t->bszid = bszid;
}

BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t )
{
	t->sub_node = sub_node;
}

BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t )
{
	t->sub_prenode = sub_prenode;
}

// other thrinfo_t-related functions

// Forward p to bli_thrcomm_bcast() using this node's communicator and id,
// and return whatever that call returns.
BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p )
{
	return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm );
}

// Call bli_thrcomm_barrier() on this node's communicator with this node's id.
BLIS_INLINE void bli_thread_barrier( thrinfo_t* t )
{
	bli_thrcomm_barrier( t->ocomm_id, t->ocomm );
}


//
// Prototypes for level-3 thrinfo functions not specific to any operation.
//

thrinfo_t* bli_thrinfo_create
     (
       rntm_t*    rntm,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init
     (
       thrinfo_t* thread,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init_single
     (
       thrinfo_t* thread
     );

void bli_thrinfo_free
     (
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// -----------------------------------------------------------------------------

void bli_thrinfo_grow
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_rgrow
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_rgrow_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

// -----------------------------------------------------------------------------

// Disabled prototypes, kept out of the build by the preprocessor.
#if 0
void bli_thrinfo_grow_tree
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

void bli_thrinfo_grow_tree_ic
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );
#endif

#endif
// end bli_thrinfo.h
// begin bli_thrinfo_sup.h

#ifndef BLIS_THRINFO_SUP_H
#define BLIS_THRINFO_SUP_H

//
// Prototypes for level-3 thrinfo sup functions.
//
// These take bszid_t pointers where the bli_thrinfo_grow()/rgrow()/
// create_for_cntl() prototypes take cntl_t pointers.

void bli_thrinfo_sup_grow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_sup_rgrow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_sup_create_for_cntl
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_chl,
       thrinfo_t* thread_par
     );

#endif
// end bli_thrinfo_sup.h

// Include some operation-specific thrinfo_t prototypes.
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
// Takes the level-3 function to run (an l3int_t), the operation family, and
// the same operand/context/runtime/control-tree arguments that l3int_t
// functions receive, minus the thrinfo_t.
void bli_l3_thread_decorator
     (
       l3int_t     func,
       opid_t      family,
       obj_t*      alpha,
       obj_t*      a,
       obj_t*      b,
       obj_t*      beta,
       obj_t*      c,
       cntx_t*     cntx,
       rntm_t*     rntm,
       cntl_t*     cntl
     );

// Include definitions specific to the method of multithreading for the
// conventional code path.
// begin bli_l3_decor_single.h


#ifndef BLIS_L3_DECOR_SINGLE_H
#define BLIS_L3_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (This conditional is currently empty: nothing is declared here.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif // !BLIS_ENABLE_MULTITHREADING

#endif // BLIS_L3_DECOR_SINGLE_H

// end bli_l3_decor_single.h
// begin bli_l3_decor_openmp.h


#ifndef BLIS_L3_DECOR_OPENMP_H
#define BLIS_L3_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

// Declared only for OpenMP builds.
// NOTE(review): presumably validates the actual thread count/id against the
// rntm_t settings -- confirm against the definition.
void bli_l3_thread_decorator_thread_check
     (
       dim_t      n_threads,
       dim_t      tid,
       thrcomm_t* gl_comm,
       rntm_t*    rntm
     );

#endif // BLIS_ENABLE_OPENMP

#endif // BLIS_L3_DECOR_OPENMP_H

// end bli_l3_decor_openmp.h
// begin bli_l3_decor_pthreads.h


#ifndef BLIS_L3_DECOR_PTHREADS_H
#define BLIS_L3_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
// The void* (void*) shape matches the start_routine parameter of
// bli_pthread_create().
void* bli_l3_thread_entry( void* data_void );

#endif // BLIS_ENABLE_PTHREADS

#endif // BLIS_L3_DECOR_PTHREADS_H

// end bli_l3_decor_pthreads.h

#endif // BLIS_L3_DECOR_H

// end bli_l3_decor.h

// Include the level-3 thread decorator and related definitions and prototypes
// for the sup code path.
// begin bli_l3_sup_decor.h


#ifndef BLIS_L3_SUP_DECOR_H
#define BLIS_L3_SUP_DECOR_H

// -- sup definitions ----------------------------------------------------------

// Level-3 sup internal function type.
// Unlike l3int_t, this function type returns an err_t and takes no cntl_t*.
typedef err_t (*l3supint_t)
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// Level-3 sup thread decorator prototype.
// Returns an err_t; takes the l3supint_t to run, the operation family, and
// the operand/context/runtime arguments (no control tree).
err_t bli_l3_sup_thread_decorator
     (
       l3supint_t  func,
       opid_t      family,
       obj_t*      alpha,
       obj_t*      a,
       obj_t*      b,
       obj_t*      beta,
       obj_t*      c,
       cntx_t*     cntx,
       rntm_t*     rntm
     );

// Include definitions specific to the method of multithreading for the
// sup code path.
// begin bli_l3_sup_decor_single.h #ifndef BLIS_L3_SUP_DECOR_SINGLE_H #define BLIS_L3_SUP_DECOR_SINGLE_H // Definitions specific to situations when multithreading is disabled. #ifndef BLIS_ENABLE_MULTITHREADING #endif #endif // end bli_l3_sup_decor_single.h // begin bli_l3_sup_decor_openmp.h #ifndef BLIS_L3_SUP_DECOR_OPENMP_H #define BLIS_L3_SUP_DECOR_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #endif #endif // end bli_l3_sup_decor_openmp.h // begin bli_l3_sup_decor_pthreads.h #ifndef BLIS_L3_SUP_DECOR_PTHREADS_H #define BLIS_L3_SUP_DECOR_PTHREADS_H // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. void* bli_l3_sup_thread_entry( void* data_void ); #endif #endif // end bli_l3_sup_decor_pthreads.h #endif // end bli_l3_sup_decor.h // Initialization-related prototypes. void bli_thread_init( void ); void bli_thread_finalize( void ); // Thread range-related prototypes. 
// Writes a sub-range for "thread" through the start/end output pointers,
// given the dimension n and blocking factor bf. Used by
// bli_thread_range_jrir_sl() below.
BLIS_EXPORT_BLIS
void bli_thread_range_sub
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end
     );

// Prototype generator for the object-based mdim/ndim range functions. Each
// instantiation declares a function named by PASTEMAC0( opname ) that returns
// a siz_t and writes its range through start/end.
#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       dir_t      direct, \
       thrinfo_t* thr, \
       obj_t*     a, \
       obj_t*     b, \
       obj_t*     c, \
       cntl_t*    cntl, \
       cntx_t*    cntx, \
       dim_t*     start, \
       dim_t*     end \
     );

GENPROT( thread_range_mdim )
GENPROT( thread_range_ndim )

// Prototype generator for the single-object directional range functions
// (plain and weighted). GENPROT is redefined here with a shorter parameter
// list, which is why it is #undef'd first.
#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       thrinfo_t* thr, \
       obj_t*     a, \
       blksz_t*   bmult, \
       dim_t*     start, \
       dim_t*     end \
     );

GENPROT( thread_range_l2r )
GENPROT( thread_range_r2l )
GENPROT( thread_range_t2b )
GENPROT( thread_range_b2t )

GENPROT( thread_range_weighted_l2r )
GENPROT( thread_range_weighted_r2l )
GENPROT( thread_range_weighted_t2b )
GENPROT( thread_range_weighted_b2t )


// Helpers for the weighted range functions.
// NOTE(review): going by the names, these compute a per-thread width and a
// trapezoid area for a matrix with diagonal offset -- confirm against the
// definitions.
dim_t bli_thread_range_width_l
     (
       doff_t diagoff_j,
       dim_t  m,
       dim_t  n_j,
       dim_t  j,
       dim_t  n_way,
       dim_t  bf,
       dim_t  bf_left,
       double area_per_thr,
       bool   handle_edge_low
     );
siz_t bli_find_area_trap_l
     (
       dim_t  m,
       dim_t  n,
       doff_t diagoff
     );
// Weighted counterpart of bli_thread_range_sub(): returns a siz_t and writes
// the thread's range through j_start_thr / j_end_thr.
siz_t bli_thread_range_weighted_sub
     (
       thrinfo_t* restrict thread,
       doff_t              diagoff,
       uplo_t              uplo,
       dim_t               m,
       dim_t               n,
       dim_t               bf,
       bool                handle_edge_low,
       dim_t*     restrict j_start_thr,
       dim_t*     restrict j_end_thr
     );

// -----------------------------------------------------------------------------

// Factorization and partitioning prototypes

// State passed to bli_prime_factorization() and bli_next_prime_factor().
typedef struct
{
    dim_t n;
    dim_t sqrt_n;
    dim_t f;
} bli_prime_factors_t;

void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors);

dim_t bli_next_prime_factor(bli_prime_factors_t* factors);
bool  bli_is_prime( dim_t n );

// The three partition functions share one signature: a thread count and two
// work amounts in, two thread counts out (through nt1 and nt2).
void bli_thread_partition_2x2
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );
void bli_thread_partition_2x2_slow
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );
void bli_thread_partition_2x2_fast
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

// -----------------------------------------------------------------------------

// Integer helpers operating on dim_t values.
dim_t bli_gcd( dim_t x, dim_t y );
dim_t bli_lcm( dim_t x, dim_t y );
dim_t bli_ipow( dim_t base, dim_t power );

// -----------------------------------------------------------------------------

// Exported getters/setters for the per-loop (jc, pc, ic, jr, ir) thread
// counts and the total thread count.
BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void );

BLIS_EXPORT_BLIS void  bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir );
BLIS_EXPORT_BLIS void  bli_thread_set_num_threads( dim_t value );

void bli_thread_init_rntm_from_env( rntm_t* rntm );

// -----------------------------------------------------------------------------

// Round-robin jr/ir range: the thread starts at its own work id, strides by
// the number of ways, and runs up to n. The bf and handle_edge_low arguments
// are accepted (so the signature matches the slab variant) but not used.
BLIS_INLINE void bli_thread_range_jrir_rr
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
    // Use interleaved partitioning of jr/ir loops.
    *start = bli_thread_work_id( thread );
    *inc   = bli_thread_n_way( thread );
    *end   = n;
}

// Slab jr/ir range: start and end are computed by bli_thread_range_sub() and
// the increment is fixed at 1.
BLIS_INLINE void bli_thread_range_jrir_sl
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
    // Use contiguous slab partitioning of jr/ir loops.
    bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
    *inc = 1;
}

// Dispatches to the slab or round-robin variant above; all arguments are
// forwarded unchanged.
BLIS_INLINE void bli_thread_range_jrir
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
    // Define a general-purpose version of bli_thread_range_jrir() whose
    // definition depends on whether slab or round-robin partitioning was
    // requested at configure-time.
#ifdef BLIS_ENABLE_JRIR_SLAB
    bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc );
#else
    bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc );
#endif
}

// Disabled weighted variant, kept for reference only. In its slab branch the
// range from bli_thread_range_weighted_sub() is divided by bf, with the end
// rounded up when it is not a multiple of bf.
#if 0
BLIS_INLINE void bli_thread_range_weighted_jrir
     (
       thrinfo_t* thread,
       doff_t     diagoff,
       uplo_t     uplo,
       dim_t      m,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
#ifdef BLIS_ENABLE_JRIR_SLAB

    // Use contiguous slab partitioning for jr/ir loops.
    bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf,
                                   handle_edge_low, start, end );

    *start = *start / bf; *inc = 1;

    if ( *end % bf ) *end = *end / bf + 1;
    else             *end = *end / bf;

#else
    // Use interleaved partitioning of jr/ir loops.
    *start = bli_thread_work_id( thread );
    *inc   = bli_thread_n_way( thread );
    *end   = n;

#endif
}
#endif

// Closes the include guard of bli_thread.h, which was opened earlier in this
// header.
#endif

// end bli_thread.h
// begin bli_pthread.h


#ifndef BLIS_PTHREAD_H
#define BLIS_PTHREAD_H

// -- Type and macro definitions -----------------------------------------------

// Exactly one of three branches is compiled: dummy types when
// BLIS_DISABLE_SYSTEM is defined, Windows types under _MSC_VER, and POSIX
// pthreads types otherwise.
#if defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of "dummy" code that doesn't depend on POSIX threads or any other
// threading mechanism. See issue #454 to see the use case that prompted this
// feature.
// NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE!

// -- pthread types --

// Every type is a plain int placeholder in this branch.
typedef int bli_pthread_t;
typedef int bli_pthread_attr_t;
typedef int bli_pthread_mutex_t;
typedef int bli_pthread_mutexattr_t;
typedef int bli_pthread_cond_t;
typedef int bli_pthread_condattr_t;
typedef int bli_pthread_once_t;
typedef int bli_pthread_barrier_t;
typedef int bli_pthread_barrierattr_t;

// -- pthreads macros --

// The static initializers are all 0, matching the int placeholder types.
#define BLIS_PTHREAD_MUTEX_INITIALIZER 0
#define BLIS_PTHREAD_COND_INITIALIZER  0
#define BLIS_PTHREAD_ONCE_INIT         0

#elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of Windows API calls.
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Constant definitions -- // begin bli_extern_defs.h #ifndef BLIS_EXTERN_DEFS_H #define BLIS_EXTERN_DEFS_H BLIS_EXPORT_BLIS extern obj_t BLIS_TWO; BLIS_EXPORT_BLIS extern obj_t BLIS_ONE; //BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO; //BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO; BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif // end bli_extern_defs.h // -- BLIS architecture/kernel definitions -- // begin bli_l1v_ker_prot.h // // Define template prototypes for level-1v kernels. 
// #define ADDV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define AMAXV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict index, \ cntx_t* restrict cntx \ ); \ #define AXPBYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); \ #define AXPYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); \ #define COPYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define DOTV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ cntx_t* restrict cntx \ ); \ #define DOTXV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ cntx_t* restrict cntx \ ); \ #define INVERTV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ cntx_t* restrict cntx \ ); \ #define SCALV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ cntx_t* restrict cntx \ ); \ #define SCAL2V_KER_PROT( ctype, ch, opname ) \ \ void 
PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); \ #define SETV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ cntx_t* restrict cntx \ ); \ #define SUBV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define SWAPV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); \ #define XPBYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); \ // end bli_l1v_ker_prot.h // begin bli_l1f_ker_prot.h // // Define template prototypes for level-1f kernels. 
// #define AXPY2V_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alphax, \ ctype* restrict alphay, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define AXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define DOTAXPYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); // end bli_l1f_ker_prot.h // begin bli_l1m_ker_prot.h // // Define template prototypes for level-1m kernels. 
// // native packm kernels #define PACKM_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); // native unpackm kernels #define UNPACKM_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); // 1e/1r packm kernels #define PACKM_1ER_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); // end bli_l1m_ker_prot.h // begin bli_l3_ukr_prot.h // // Define template prototypes for level-3 micro-kernels. // #define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname) #define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype_out* restrict alpha, \ ctype_in* restrict a, \ ctype_in* restrict b, \ ctype_out* restrict beta, \ ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); #define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ ctype* restrict a11, \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); #define TRSM_UKR_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); // end bli_l3_ukr_prot.h // 
begin bli_l3_sup_ker_prot.h // // Define template prototypes for level-3 kernels on small/unpacked matrices. // #define GEMMSUP_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); // end bli_l3_sup_ker_prot.h // begin bli_arch_config_pre.h #ifndef BLIS_ARCH_CONFIG_PRE_H #define BLIS_ARCH_CONFIG_PRE_H // -- Naming-related kernel definitions ---------------------------------------- // The default suffix appended to reference kernels. #define BLIS_REF_SUFFIX _ref // A suffix used for labeling certain induced method aware functions. #define BLIS_IND_SUFFIX _ind // Add an underscore to the BLIS kernel set string, if it was defined. #ifdef BLIS_CNAME #define BLIS_CNAME_INFIX PASTECH(_,BLIS_CNAME) #endif // Combine the CNAME and _ref for convenience to the code that defines // reference kernels. //#define BLIS_CNAME_REF_SUFFIX PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX) // -- Prototype-generating macro definitions ----------------------------------- // Prototype-generating macro for bli_cntx_init_*() functions. 
#define CNTX_INIT_PROTS( archname ) \ \ void PASTEMAC(cntx_init_,archname) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \ ( \ ind_t method, \ cntx_t* cntx \ ); #endif // end bli_arch_config_pre.h // begin bli_arch_config.h #ifndef BLIS_ARCH_CONFIG_H #define BLIS_ARCH_CONFIG_H // // -- Context initialization prototypes ---------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_CONFIG_SKX CNTX_INIT_PROTS( skx ) #endif #ifdef BLIS_CONFIG_KNL CNTX_INIT_PROTS( knl ) #endif #ifdef BLIS_CONFIG_KNC CNTX_INIT_PROTS( knc ) #endif #ifdef BLIS_CONFIG_HASWELL CNTX_INIT_PROTS( haswell ) #endif #ifdef BLIS_CONFIG_SANDYBRIDGE CNTX_INIT_PROTS( sandybridge ) #endif #ifdef BLIS_CONFIG_PENRYN CNTX_INIT_PROTS( penryn ) #endif // -- AMD64 architectures -- #ifdef BLIS_CONFIG_ZEN3 CNTX_INIT_PROTS( zen3 ) #endif #ifdef BLIS_CONFIG_ZEN2 CNTX_INIT_PROTS( zen2 ) #endif #ifdef BLIS_CONFIG_ZEN CNTX_INIT_PROTS( zen ) #endif #ifdef BLIS_CONFIG_EXCAVATOR CNTX_INIT_PROTS( excavator ) #endif #ifdef BLIS_CONFIG_STEAMROLLER CNTX_INIT_PROTS( steamroller ) #endif #ifdef BLIS_CONFIG_PILEDRIVER CNTX_INIT_PROTS( piledriver ) #endif #ifdef BLIS_CONFIG_BULLDOZER CNTX_INIT_PROTS( bulldozer ) #endif // -- ARM architectures -- #ifdef BLIS_CONFIG_ARMSVE CNTX_INIT_PROTS( armsve ) #endif #ifdef BLIS_CONFIG_A64FX CNTX_INIT_PROTS( a64fx ) #endif #ifdef BLIS_CONFIG_FIRESTORM CNTX_INIT_PROTS( firestorm ) #endif #ifdef BLIS_CONFIG_THUNDERX2 CNTX_INIT_PROTS( thunderx2 ) #endif #ifdef BLIS_CONFIG_CORTEXA57 CNTX_INIT_PROTS( cortexa57 ) #endif #ifdef BLIS_CONFIG_CORTEXA53 CNTX_INIT_PROTS( cortexa53 ) #endif #ifdef BLIS_CONFIG_CORTEXA15 CNTX_INIT_PROTS( cortexa15 ) #endif #ifdef BLIS_CONFIG_CORTEXA9 CNTX_INIT_PROTS( cortexa9 ) #endif // -- IBM Power -- #ifdef BLIS_CONFIG_POWER10 CNTX_INIT_PROTS( power10 ) #endif #ifdef BLIS_CONFIG_POWER9 CNTX_INIT_PROTS( power9 ) #endif #ifdef 
BLIS_CONFIG_POWER7 CNTX_INIT_PROTS( power7 ) #endif // -- IBM BG/Q -- #ifdef BLIS_CONFIG_BGQ CNTX_INIT_PROTS( bgq ) #endif // -- Generic -- #ifdef BLIS_CONFIG_GENERIC CNTX_INIT_PROTS( generic ) #endif // // -- Architecture family-specific headers ------------------------------------- // // -- x86_64 families -- #ifdef BLIS_FAMILY_INTEL64 #include "bli_family_intel64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64 #include "bli_family_amd64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64_LEGACY #include "bli_family_amd64_legacy.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64 #include "bli_family_x86_64.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_SKX #include "bli_family_x86_64_no_skx.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN2 // begin bli_family_x86_64_no_zen2.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_x86_64_no_zen2.h #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN3 #include "bli_family_x86_64_no_zen3.h" // skipped #endif // -- Intel64 architectures -- #ifdef BLIS_FAMILY_SKX // begin bli_family_skx.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- THREADING PARAMETERS ----------------------------------------------------- #define BLIS_THREAD_RATIO_M 3 #define BLIS_THREAD_RATIO_N 2 #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 4 // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 64 #define BLIS_SIMD_MAX_SIZE 64 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 //#include //#define BLIS_MALLOC_POOL malloc //#define BLIS_FREE_POOL free #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- // -- Cache and register blocksizes -- // // Constraints: // // (1) MC must be a multiple of: // (a) MR (for zero-padding purposes) // (b) NR (for zero-padding purposes when MR and NR are "swapped") // (2) NC must be a multiple of // (a) NR (for zero-padding purposes) // (b) MR (for zero-padding purposes when MR and NR are "swapped") // #define 
BLIS_DGEMM_UKERNEL bli_dgemm_opt_16x12_l2 #define BLIS_DEFAULT_MC_D 144 #define BLIS_DEFAULT_KC_D 336 #define BLIS_DEFAULT_NC_D 5760 #define BLIS_DEFAULT_MR_D 16 #define BLIS_DEFAULT_NR_D 12 #define BLIS_PACKDIM_MR_D 16 #define BLIS_PACKDIM_NR_D 12 // NOTE: If the micro-kernel, which is typically unrolled to a factor // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. //#define BLIS_DEFAULT_KR_S 1 //#define BLIS_DEFAULT_KR_D 1 //#define BLIS_DEFAULT_KR_C 1 //#define BLIS_DEFAULT_KR_Z 1 // -- Maximum cache blocksizes (for optimizing edge cases) -- // NOTE: These cache blocksize "extensions" have the same constraints as // the corresponding default blocksizes above. When these values are // larger than the default blocksizes, blocksizes used at edge cases are // enlarged if such an extension would encompass the remaining portion of // the matrix dimension. #define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) #define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) #define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0) #define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) #define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) #define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0) //#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4) //#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4) //#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4) //#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4) //#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4) //#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4) #endif //#endif // end bli_family_skx.h #endif #ifdef BLIS_FAMILY_KNL #include "bli_family_knl.h" // skipped #endif #ifdef BLIS_FAMILY_KNC #include "bli_family_knc.h" // skipped #endif #ifdef BLIS_FAMILY_HASWELL // begin bli_family_haswell.h //#ifndef 
BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- // -- sgemm micro-kernel -- #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24 #define BLIS_DEFAULT_MC_S 256 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 24 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 6 #define BLIS_DEFAULT_NR_S 16 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 6 #endif // -- dgemm micro-kernel -- #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12 #define BLIS_DEFAULT_MC_D 152 #define BLIS_DEFAULT_KC_D 160 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 12 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 6 #endif // -- cgemm micro-kernel -- #if 1 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 3 #define BLIS_DEFAULT_NR_C 8 #define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x3 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 
#define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 3 #endif // -- zgemm micro-kernel -- #if 1 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 3 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x3 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 3 #endif #endif //#endif // end bli_family_haswell.h #endif #ifdef BLIS_FAMILY_SANDYBRIDGE // begin bli_family_sandybridge.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x4 #define BLIS_DEFAULT_MC_D 96 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 4 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_sandybridge.h #endif #ifdef BLIS_FAMILY_PENRYN // begin bli_family_penryn.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x4 
#define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MC_S 768 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x4 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 4 #define BLIS_DEFAULT_MC_D 384 #define BLIS_DEFAULT_KC_D 384 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_asm_4x4 #define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_asm_4x4 // -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPY2V_KERNEL bli_daxpy2v_int_var1 #define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_int_var1 #define BLIS_DAXPYF_KERNEL bli_daxpyf_int_var1 #define BLIS_DDOTXF_KERNEL bli_ddotxf_int_var1 #define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_int_var1 // -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 #define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 #endif //#endif // end bli_family_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_FAMILY_ZEN3 #include "bli_family_zen3.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN2 #include "bli_family_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN // begin bli_family_zen.h // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops // to not be parallelized. #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 #define BLIS_ENABLE_ZEN_BLOCK_SIZES // Vanilla BLIS disables AMD's small matrix handling by default. #if 0 #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 #define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 #define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 //This macro will enable BLIS DGEMM to choose block sizes for a single instance mode #define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 0 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES 250 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_NAPLES 90 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22 #endif #if 0 // Allow the sup implementation to combine some small edge case iterations in // the 2nd loop of the panel-block algorithm (MR) and/or the 2nd loop of the // block-panel algorithm (NR) with the last full iteration that precedes it. // NOTE: These cpp macros need to be explicitly set to an integer since they // are used at compile-time to create unconditional branches or dead code // regions. 
#define BLIS_ENABLE_SUP_MR_EXT 1 #define BLIS_ENABLE_SUP_NR_EXT 0 #endif // end bli_family_zen.h #endif #ifdef BLIS_FAMILY_EXCAVATOR // begin bli_family_excavator.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 3 #define BLIS_DEFAULT_MC_S 528 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_DEFAULT_MC_D 264 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_DEFAULT_MC_C 264 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #define BLIS_DEFAULT_MC_Z 100 #define BLIS_DEFAULT_KC_Z 320 #define BLIS_DEFAULT_NC_Z 8400 #endif //#endif // end bli_family_excavator.h #endif #ifdef BLIS_FAMILY_STEAMROLLER // begin bli_family_steamroller.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 //#endif // end bli_family_steamroller.h #endif #ifdef BLIS_FAMILY_PILEDRIVER // begin bli_family_piledriver.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MC_S 2016 #define BLIS_DEFAULT_KC_S 128 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DEFAULT_MR_S 16 #define 
BLIS_DEFAULT_NR_S 3 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MC_D 1008 #define BLIS_DEFAULT_KC_D 128 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MC_C 512 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MC_Z 400 #define BLIS_DEFAULT_KC_Z 160 #define BLIS_DEFAULT_NC_Z 8400 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #endif //#endif // end bli_family_piledriver.h #endif #ifdef BLIS_FAMILY_BULLDOZER // begin bli_family_bulldozer.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8_fma4 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x6_fma4 #define BLIS_DEFAULT_MC_D 1080 #define BLIS_DEFAULT_KC_D 120 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 6 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4_fma4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4_fma4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_bulldozer.h #endif // -- ARM families -- #ifdef BLIS_FAMILY_ARM64 #include "bli_family_arm64.h" // skipped #endif #ifdef BLIS_FAMILY_ARM32 #include "bli_family_arm32.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_FAMILY_ARMSVE #include "bli_family_armsve.h" // skipped #endif #ifdef BLIS_FAMILY_A64FX #include 
"bli_family_a64fx.h" // skipped #endif #ifdef BLIS_FAMILY_FIRESTORM #include "bli_family_firestorm.h" // skipped #endif #ifdef BLIS_FAMILY_THUNDERX2 #include "bli_family_thunderx2.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA57 #include "bli_family_cortexa57.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA53 #include "bli_family_cortexa53.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA15 #include "bli_family_cortexa15.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA9 #include "bli_family_cortexa9.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_FAMILY_POWER10 #include "bli_family_power10.h" // skipped #endif #ifdef BLIS_FAMILY_POWER9 #include "bli_family_power9.h" // skipped #endif #ifdef BLIS_FAMILY_POWER7 #include "bli_family_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_FAMILY_BGQ #include "bli_family_bgq.h" // skipped #endif // -- Generic -- #ifdef BLIS_FAMILY_GENERIC // begin bli_family_generic.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_generic.h #endif // // -- kernel set prototypes ---------------------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_KERNELS_SKX // begin bli_kernels_skx.h GEMM_UKR_PROT( float , s, gemm_skx_asm_32x12_l2 ) GEMM_UKR_PROT( float , s, gemm_skx_asm_12x32_l2 ) GEMM_UKR_PROT( double, d, gemm_skx_asm_16x12_l2 ) GEMM_UKR_PROT( double, d, gemm_skx_asm_16x14 ) // end bli_kernels_skx.h #endif #ifdef BLIS_KERNELS_KNL #include "bli_kernels_knl.h" // skipped #endif #ifdef BLIS_KERNELS_KNC #include "bli_kernels_knc.h" // skipped #endif #ifdef BLIS_KERNELS_HASWELL // begin bli_kernels_haswell.h // -- level-1m ----------------------------------------------------------------- // packm (asm) PACKM_KER_PROT( float, s, packm_haswell_asm_6xk ) PACKM_KER_PROT( float, s, packm_haswell_asm_16xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_6xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_8xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_3xk ) PACKM_KER_PROT( scomplex, 
c, packm_haswell_asm_8xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_3xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_4xk ) // -- level-3 ------------------------------------------------------------------ // gemm (asm d6x8) GEMM_UKR_PROT( float, s, gemm_haswell_asm_6x16 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_6x8 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_3x8 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_3x4 ) // gemm (asm d8x6) GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // gemmtrsm_l (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_haswell_asm_6x8 ) // gemmtrsm_u (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 ) // gemm (asm d8x6) //GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) //GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) //GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) //GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // -- level-3 sup -------------------------------------------------------------- // -- single real -- // gemmsup_r GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16 ) GEMMSUP_KER_PROT( 
float, s, gemmsup_rv_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) 
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16n ) // -- double real -- // gemmsup_r GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, 
gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8n ) // gemmsup_rd GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8n ) // end 
bli_kernels_haswell.h #endif #ifdef BLIS_KERNELS_SANDYBRIDGE // begin bli_kernels_sandybridge.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_sandybridge_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_asm_4x4 ) // d8x4 (intrinsics) GEMM_UKR_PROT( float, s, gemm_sandybridge_int_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_int_4x4 ) // end bli_kernels_sandybridge.h #endif #ifdef BLIS_KERNELS_PENRYN // begin bli_kernels_penryn.h GEMM_UKR_PROT( float, s, gemm_penryn_asm_8x4 ) GEMM_UKR_PROT( double, d, gemm_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_l_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_u_penryn_asm_4x4 ) // end bli_kernels_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_KERNELS_ZEN2 #include "bli_kernels_zen2.h" // skipped #endif #ifdef BLIS_KERNELS_ZEN // begin bli_kernels_zen.h // -- level-1m -- PACKM_KER_PROT(double, d, packm_8xk_gen_zen) PACKM_KER_PROT(double, d, packm_6xk_gen_zen) PACKM_KER_PROT(double, d, packm_8xk_nn_zen) PACKM_KER_PROT(double, d, packm_6xk_nn_zen) // -- level-1v -- // amaxv (intrinsics) AMAXV_KER_PROT( float, s, amaxv_zen_int ) AMAXV_KER_PROT( double, d, amaxv_zen_int ) // axpyv (intrinsics) AXPYV_KER_PROT( float, s, axpyv_zen_int ) AXPYV_KER_PROT( double, d, axpyv_zen_int ) // axpyv (intrinsics unrolled x10) AXPYV_KER_PROT( float, s, axpyv_zen_int10 ) AXPYV_KER_PROT( double, d, axpyv_zen_int10 ) // dotv (intrinsics) DOTV_KER_PROT( float, s, dotv_zen_int ) DOTV_KER_PROT( double, d, dotv_zen_int ) // dotv (intrinsics, unrolled x10) DOTV_KER_PROT( float, s, dotv_zen_int10 ) DOTV_KER_PROT( double, d, dotv_zen_int10 ) // dotxv (intrinsics) DOTXV_KER_PROT( float, s, dotxv_zen_int ) 
DOTXV_KER_PROT( double, d, dotxv_zen_int ) // scalv (intrinsics) SCALV_KER_PROT( float, s, scalv_zen_int ) SCALV_KER_PROT( double, d, scalv_zen_int ) // scalv (intrinsics unrolled x10) SCALV_KER_PROT( float, s, scalv_zen_int10 ) SCALV_KER_PROT( double, d, scalv_zen_int10 ) SCALV_KER_PROT( scomplex, c, scalv_zen_int10 ) // swapv (intrinsics) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // copyv (intrinsics) COPYV_KER_PROT( float, s, copyv_zen_int ) COPYV_KER_PROT( double, d, copyv_zen_int ) // SETV_KER_PROT(float, s, setv_zen_int) SETV_KER_PROT(double, d, setv_zen_int) // swapv (intrinsics) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // -- level-1f -- // axpyf (intrinsics) AXPYF_KER_PROT( float, s, axpyf_zen_int_8 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_8 ) AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_16x4 ) AXPYF_KER_PROT( scomplex, c, axpyf_zen_int_4 ) // dotxf (intrinsics) DOTXF_KER_PROT( float, s, dotxf_zen_int_8 ) DOTXF_KER_PROT( double, d, dotxf_zen_int_8 ) // -- level-3 sup -------------------------------------------------------------- // gemmsup_rv //GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_4x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_1x1 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x2) 
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x8m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16n) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x2 ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x1 ) // end bli_kernels_zen.h #endif //#ifdef BLIS_KERNELS_EXCAVATOR //#include 
"bli_kernels_excavator.h" //#endif //#ifdef BLIS_KERNELS_STEAMROLLER //#include "bli_kernels_steamroller.h" //#endif #ifdef BLIS_KERNELS_PILEDRIVER // begin bli_kernels_piledriver.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_piledriver_asm_16x3 ) GEMM_UKR_PROT( double, d, gemm_piledriver_asm_8x3 ) GEMM_UKR_PROT( scomplex, c, gemm_piledriver_asm_4x2 ) GEMM_UKR_PROT( dcomplex, z, gemm_piledriver_asm_2x2 ) // end bli_kernels_piledriver.h #endif #ifdef BLIS_KERNELS_BULLDOZER // begin bli_kernels_bulldozer.h GEMM_UKR_PROT( float, s, gemm_bulldozer_asm_8x8_fma4 ) GEMM_UKR_PROT( double, d, gemm_bulldozer_asm_4x6_fma4 ) GEMM_UKR_PROT( scomplex, c, gemm_bulldozer_asm_8x4_fma4 ) GEMM_UKR_PROT( dcomplex, z, gemm_bulldozer_asm_4x4_fma4 ) // end bli_kernels_bulldozer.h #endif // -- ARM architectures -- #ifdef BLIS_KERNELS_ARMSVE #include "bli_kernels_armsve.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV8A #include "bli_kernels_armv8a.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV7A #include "bli_kernels_armv7a.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_KERNELS_POWER10 #include "bli_kernels_power10.h" // skipped #endif #ifdef BLIS_KERNELS_POWER9 #include "bli_kernels_power9.h" // skipped #endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_KERNELS_BGQ #include "bli_kernels_bgq.h" // skipped #endif #endif // end bli_arch_config.h // begin bli_kernel_macro_defs.h #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H // -- Define default threading parameters -------------------------------------- // -- Conventional (large code path) values -- // These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n // dimensions for the purposes of factorizing the total number of threads into // ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these // macros are used. 
#ifndef BLIS_THREAD_RATIO_M
#define BLIS_THREAD_RATIO_M     1
#endif

#ifndef BLIS_THREAD_RATIO_N
#define BLIS_THREAD_RATIO_N     1
#endif

// These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of
// parallelism allowed when performing automatic factorization. See bli_rntm.c
// to see how these macros are used.
#ifndef BLIS_THREAD_MAX_IR
#define BLIS_THREAD_MAX_IR      1
#endif

#ifndef BLIS_THREAD_MAX_JR
#define BLIS_THREAD_MAX_JR      4
#endif

// NOTE: The sup-specific threading defaults below are compiled out.
#if 0
// -- Skinny/small possibly-unpacked (sup code path) values --

#ifndef BLIS_THREAD_SUP_RATIO_M
#define BLIS_THREAD_SUP_RATIO_M   1
#endif

#ifndef BLIS_THREAD_SUP_RATIO_N
#define BLIS_THREAD_SUP_RATIO_N   2
#endif

#ifndef BLIS_THREAD_SUP_MAX_IR
#define BLIS_THREAD_SUP_MAX_IR    1
#endif

#ifndef BLIS_THREAD_SUP_MAX_JR
#define BLIS_THREAD_SUP_MAX_JR    8
#endif
#endif


// -- Memory allocation --------------------------------------------------------

// hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with
// libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND
// was explicitly defined.
#ifdef BLIS_DISABLE_MEMKIND
  #undef BLIS_ENABLE_MEMKIND
#endif

// The header name must be spelled out here: a bare "#include" is a
// preprocessing error whenever BLIS_ENABLE_MEMKIND is defined, and the
// hbw_malloc()/hbw_free() defaults selected below need its declarations.
#ifdef BLIS_ENABLE_MEMKIND
#include <hbwmalloc.h> // skipped
#endif

// Memory allocation functions. These macros define the three types of
// malloc()-style functions, and their free() counterparts: one for each
// type of memory to be allocated.
// NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING
// THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc()
// and free():
//
//   void* malloc( size_t size );
//   void  free( void* p );
//

// This allocation function is called to allocate memory for blocks within
// BLIS's internal memory pools.
#ifndef BLIS_MALLOC_POOL
  // If use of libmemkind was enabled at configure-time, the default
  // memory allocation function for memory pools should be hbw_malloc()
  // instead of malloc().
  #ifdef BLIS_ENABLE_MEMKIND
  #define BLIS_MALLOC_POOL               hbw_malloc
  #else
  #define BLIS_MALLOC_POOL               malloc
  #endif
#endif

#ifndef BLIS_FREE_POOL
  // If use of libmemkind was enabled at configure-time, the default
  // memory deallocation function for memory pools should be hbw_free()
  // instead of free().
  #ifdef BLIS_ENABLE_MEMKIND
  #define BLIS_FREE_POOL                 hbw_free
  #else
  #define BLIS_FREE_POOL                 free
  #endif
#endif

// This allocation function is called to allocate memory for internally-
// used objects and structures, such as control tree nodes.
#ifndef BLIS_MALLOC_INTL
#define BLIS_MALLOC_INTL                 malloc
#endif

#ifndef BLIS_FREE_INTL
#define BLIS_FREE_INTL                   free
#endif

// This allocation function is called to allocate memory for objects
// created by user-level API functions, such as bli_obj_create().
#ifndef BLIS_MALLOC_USER
#define BLIS_MALLOC_USER                 malloc
#endif

#ifndef BLIS_FREE_USER
#define BLIS_FREE_USER                   free
#endif


// -- Other system-related definitions -----------------------------------------

// Size of a virtual memory page. This is used to align blocks within the
// memory pools.
#ifndef BLIS_PAGE_SIZE
#define BLIS_PAGE_SIZE                   4096
#endif

// The maximum number of named SIMD vector registers available for use.
// When configuring with umbrella configuration families, this should be
// set to the maximum number of registers across all sub-configurations in
// the family.
#ifndef BLIS_SIMD_MAX_NUM_REGISTERS
#define BLIS_SIMD_MAX_NUM_REGISTERS      32
#endif

// The maximum size (in bytes) of each SIMD vector.
// When configuring with umbrella configuration families, this should be
// set to the maximum SIMD size across all sub-configurations in the family.
#ifndef BLIS_SIMD_MAX_SIZE
#define BLIS_SIMD_MAX_SIZE               64
#endif

// Alignment size (in bytes) needed by the instruction set for aligned
// SIMD/vector instructions.
#ifndef BLIS_SIMD_ALIGN_SIZE
#define BLIS_SIMD_ALIGN_SIZE             BLIS_SIMD_MAX_SIZE
#endif

// The maximum size in bytes of local stack buffers within macro-kernel
// functions. These buffers are usually used to store a temporary copy
// of a single microtile. The reason we multiply by 2 is to handle induced
// methods, where we use real domain register blocksizes in units of
// complex elements. Specifically, the macro-kernels will need this larger
// micro-tile footprint, even though the virtual micro-kernels will only
// ever be writing to half (real or imaginary part) at a time.
#ifndef BLIS_STACK_BUF_MAX_SIZE
#define BLIS_STACK_BUF_MAX_SIZE          ( BLIS_SIMD_MAX_NUM_REGISTERS * \
                                           BLIS_SIMD_MAX_SIZE * 2 )
#endif

// Alignment size used to align local stack buffers within macro-kernel
// functions.
#ifndef BLIS_STACK_BUF_ALIGN_SIZE
#define BLIS_STACK_BUF_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when allocating memory via BLIS_MALLOC_USER.
// To disable heap alignment, set this to 1.
#ifndef BLIS_HEAP_ADDR_ALIGN_SIZE
#define BLIS_HEAP_ADDR_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when sizing leading dimensions of memory allocated
// via BLIS_MALLOC_USER.
#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE
#define BLIS_HEAP_STRIDE_ALIGN_SIZE      BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment sizes used when allocating blocks to the internal memory
// pool, via BLIS_MALLOC_POOL.
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A
#define BLIS_POOL_ADDR_ALIGN_SIZE_A      BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B
#define BLIS_POOL_ADDR_ALIGN_SIZE_B      BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C
#define BLIS_POOL_ADDR_ALIGN_SIZE_C      BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN
#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN    BLIS_PAGE_SIZE
#endif

// Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*.
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A #define BLIS_POOL_ADDR_OFFSET_SIZE_A 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B #define BLIS_POOL_ADDR_OFFSET_SIZE_B 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C #define BLIS_POOL_ADDR_OFFSET_SIZE_C 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif #endif // end bli_kernel_macro_defs.h // -- Base operation prototypes -- // begin bli_init.h BLIS_EXPORT_BLIS void bli_init( void ); BLIS_EXPORT_BLIS void bli_finalize( void ); void bli_init_auto( void ); void bli_finalize_auto( void ); void bli_init_apis( void ); void bli_finalize_apis( void ); void bli_init_once( void ); void bli_finalize_once( void ); // end bli_init.h // begin bli_malloc.h // Typedef function pointer types for malloc() and free() substitutes. //typedef void* (*malloc_ft) ( size_t size ); //typedef void (*free_ft) ( void* p ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size ); BLIS_EXPORT_BLIS void bli_free_pool( void* p ); #endif void* bli_malloc_intl( size_t size, err_t* r_val ); void* bli_calloc_intl( size_t size, err_t* r_val ); void bli_free_intl( void* p ); BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val ); BLIS_EXPORT_BLIS void bli_free_user( void* p ); // ----------------------------------------------------------------------------- void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val ); void bli_ffree_align( free_ft f, void* p ); void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val ); void bli_ffree_noalign( free_ft f, void* p ); void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size ); void bli_fmalloc_post_check( void* p ); // end bli_malloc.h // begin bli_const.h void bli_const_init( void ); void bli_const_finalize( void ); // end bli_const.h // begin bli_obj.h // begin bli_obj_check.h void bli_obj_create_check( num_t dt, dim_t m, dim_t 
n, inc_t rs, inc_t cs, obj_t* obj ); void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj ); void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_create_scalar_check( num_t dt, obj_t* obj ); void bli_obj_free_check( obj_t* obj ); void bli_obj_create_const_check( double value, obj_t* obj ); void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b ); void bli_dt_size_check( num_t dt ); void bli_dt_string_check( num_t dt ); void bli_dt_union_check( num_t dt1, num_t dt2 ); void bli_obj_print_check( char* label, obj_t* obj ); // end bli_obj_check.h BLIS_EXPORT_BLIS void bli_obj_create ( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer ( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_without_buffer ( num_t dt, dim_t m, dim_t n, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_alloc_buffer ( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_attach_buffer ( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1 ( num_t dt, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer ( num_t dt, void* p, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_conf_to ( obj_t* s, obj_t* d ); BLIS_EXPORT_BLIS void bli_obj_free ( obj_t* obj ); void bli_adjust_strides ( dim_t m, dim_t n, siz_t elem_size, inc_t* rs, inc_t* cs, inc_t* is ); BLIS_EXPORT_BLIS siz_t bli_dt_size ( num_t dt ); BLIS_EXPORT_BLIS char* bli_dt_string ( num_t dt ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult ( dim_t dim, dim_t dim_mult ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size ( dim_t dim, siz_t elem_size, siz_t align_size ); BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size ( void* p, size_t align_size ); BLIS_EXPORT_BLIS void bli_obj_print ( char* label, 
obj_t* obj ); // end bli_obj.h // begin bli_obj_scalar.h BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached ( num_t dt, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of ( num_t dt, conj_t conj, obj_t* alpha, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_detach ( obj_t* a, obj_t* alpha ); BLIS_EXPORT_BLIS void bli_obj_scalar_attach ( conj_t conj, obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to ( num_t dt, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar ( obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_reset ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_equals ( obj_t* a, obj_t* beta ); // end bli_obj_scalar.h // begin bli_blksz.h // blksz_t query BLIS_INLINE dim_t bli_blksz_get_def ( num_t dt, blksz_t* b ) { return b->v[ dt ]; } BLIS_INLINE dim_t bli_blksz_get_max ( num_t dt, blksz_t* b ) { return b->e[ dt ]; } // blksz_t modification BLIS_INLINE void bli_blksz_set_def ( dim_t val, num_t dt, blksz_t* b ) { b->v[ dt ] = val; } BLIS_INLINE void bli_blksz_set_max ( dim_t val, num_t dt, blksz_t* b ) { b->e[ dt ] = val; } BLIS_INLINE void bli_blksz_copy ( blksz_t* b_src, blksz_t* b_dst ) { *b_dst = *b_src; } BLIS_INLINE void bli_blksz_copy_if_pos ( blksz_t* b_src, blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that // we can skip the ones that are non-positive. 
const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); } BLIS_INLINE void bli_blksz_copy_def_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); bli_blksz_set_def( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_max_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); bli_blksz_set_max( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_scale_def ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_def( dt, b ); bli_blksz_set_def( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_max( dt, b ); bli_blksz_set_max( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_def_max ( 
dim_t num, dim_t den, num_t dt, blksz_t* b ) { bli_blksz_scale_def( num, den, dt, b ); bli_blksz_scale_max( num, den, dt, b ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS blksz_t* bli_blksz_create ( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_easy ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z ); BLIS_EXPORT_BLIS void bli_blksz_free ( blksz_t* b ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); #endif void bli_blksz_reduce_def_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); void bli_blksz_reduce_max_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); // ----------------------------------------------------------------------------- dim_t bli_determine_blocksize ( dir_t direct, dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_b ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); dim_t bli_determine_blocksize_b_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); // end bli_blksz.h // begin bli_func.h // ----------------------------------------------------------------------------- 
// func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); // end bli_func.h // begin bli_mbool.h // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // ----------------------------------------------------------------------------- mbool_t* bli_mbool_create ( bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_init ( mbool_t* b, bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_free( mbool_t* b ); // end bli_mbool.h // begin bli_cntx.h #ifndef BLIS_CNTX_H #define BLIS_CNTX_H // Context object type (defined in bli_type_defs.h) // ----------------------------------------------------------------------------- // // -- cntx_t query (fields only) ----------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx ) { return cntx->blkszs; } BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) { return cntx->bmults; } 
BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) { return cntx->l3_vir_ukrs; } BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs; } BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs_prefs; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) { return cntx->l3_sup_thresh; } BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) { return cntx->l3_sup_blkszs; } BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) { return cntx->l3_sup_kers; } BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) { return cntx->l3_sup_kers_prefs; } BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) { return cntx->l1f_kers; } BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) { return cntx->l1v_kers; } BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) { return cntx->packm_kers; } BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) { return cntx->unpackm_kers; } BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) { return cntx->method; } // ----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. 
return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // 
----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. 
return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only 
index to the requested packm func_t if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* funcs = bli_cntx_packm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the packm func_t (and then extract the // datatype-specific function pointer) if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested unpackm func_t if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the unpackm func_t (and then extract the // datatype-specific function pointer) if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. 
return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. // NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. 
// NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } #if 0 // NOTE: These static functions aren't needed yet. 
BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { const num_t dt = bli_obj_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); } #endif // ----------------------------------------------------------------------------- // // -- cntx_t modification (complex) -------------------------------------------- // // NOTE: The framework does not use any of the following functions. We provide // them in order to facilitate creating/modifying custom contexts. 
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif // end bli_rntm.h // begin bli_gks.h #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* 
bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif // end bli_gks.h // begin bli_ind.h #ifndef BLIS_IND_H #define BLIS_IND_H // level-3 induced method management // begin bli_l3_ind.h #ifndef BLIS_L3_IND_H #define BLIS_L3_IND_H // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- //bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ); void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ); void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ); bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ); #endif // end bli_l3_ind.h void bli_ind_init( void ); void bli_ind_finalize( void ); BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t 
method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); char* bli_ind_get_impl_string( ind_t method ); num_t bli_ind_map_cdt_to_index( num_t dt ); #endif // end bli_ind.h // begin bli_pba.h #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H // Packing block allocator (formerly memory broker) // pba init //BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ // bli_pthread_mutex_init( &(pba->mutex), NULL ); //} //BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ // bli_pthread_mutex_destroy( &(pba->mutex) ); //} // pba query BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { return &(pba->pools[ pool_index ]); } BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { return pba->align_size; } BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { return pba->malloc_fp; } BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { return pba->free_fp; } // pba modification BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { pba->align_size = align_size; } BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { pba->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { pba->free_fp = free_fp; } // pba action BLIS_INLINE void bli_pba_lock( pba_t* pba ) { bli_pthread_mutex_lock( &(pba->mutex) ); } BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( cntx_t* cntx ); void bli_pba_finalize ( void ); 
void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif // end bli_pba.h // begin bli_pool.h #ifndef BLIS_POOL_H #define BLIS_POOL_H // -- Pool block type -- // -- Pool type -- // Pool block query BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) { return pblk->buf; } BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) { return pblk->block_size; } // Pool block modification BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk ) { pblk->buf = buf; } BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) { pblk->block_size = block_size; } // // -- pool block initialization ------------------------------------------------ // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the pblk_t type definition. An alternative to the initializer is // calling bli_pblk_clear() at runtime. 
#define BLIS_PBLK_INITIALIZER \ { \ .buf = NULL, \ .block_size = 0, \ } \ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); bli_pblk_set_block_size( 0, pblk ); } // Pool entry query BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) { return pool->block_size; } BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) { return pool->align_size; } BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) { return pool->offset_size; } BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) { return pool->malloc_fp; } BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) { return pool->free_fp; } BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); } // Pool entry modification BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ { pool->block_size = block_size; } BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ { pool->align_size = align_size; } BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ { pool->offset_size = offset_size; } BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ { pool->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* 
pool ) \ { pool->free_fp = free_fp; } BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } // ----------------------------------------------------------------------------- void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ); void bli_pool_finalize ( pool_t* restrict pool ); void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ); void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ); void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ); void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ); void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ); void bli_pool_print ( pool_t* restrict pool ); void bli_pblk_print ( pblk_t* restrict pblk ); #endif // end bli_pool.h // begin bli_array.h #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // 
----------------------------------------------------------------------------- void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ); void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ); void bli_array_finalize ( array_t* restrict array ); void* bli_array_elem ( const siz_t index, array_t* restrict array ); void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ); #endif // end bli_array.h // begin bli_apool.h #ifndef BLIS_APOOL_H #define BLIS_APOOL_H // -- Locked pool-of-arrays type -- // apool entry query BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool ) { return &(apool->pool); } BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) { return &(apool->mutex); } BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) { return pool->def_array_len; } BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) { pool_t* restrict pool = bli_apool_pool( apool ); return bli_pool_is_exhausted( pool ); } // apool action BLIS_INLINE void bli_apool_lock( apool_t* apool ) { bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); } BLIS_INLINE void bli_apool_unlock( apool_t* apool ) { bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); } // apool entry modification BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ { pool->def_array_len = def_array_len; } // ----------------------------------------------------------------------------- void bli_apool_init ( apool_t* restrict apool ); void bli_apool_finalize ( apool_t* restrict apool ); array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ); void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ); pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ); void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool ); void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ); void bli_apool_free_block 
( array_t* restrict array ); #endif // end bli_apool.h // begin bli_sba.h #ifndef BLIS_SBA_H #define BLIS_SBA_H apool_t* bli_sba_query( void ); // ----------------------------------------------------------------------------- void bli_sba_init( void ); void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( const siz_t n_threads ); void bli_sba_checkin_array ( array_t* restrict array ); void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm ); void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size ); void bli_sba_release ( rntm_t* restrict rntm, void* restrict block ); #endif // end bli_sba.h // begin bli_memsys.h #ifndef BLIS_MEMSYS_H #define BLIS_MEMSYS_H // ----------------------------------------------------------------------------- void bli_memsys_init( void ); void bli_memsys_finalize( void ); #endif // end bli_memsys.h // begin bli_mem.h #ifndef BLIS_MEM_H #define BLIS_MEM_H // mem_t object type (defined in bli_type_defs.h) // // -- mem_t query -------------------------------------------------------------- // BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) { return &(mem->pblk); } BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) { return bli_pblk_buf( bli_mem_pblk( mem ) ); } BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) { return mem->buf_type; } BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) { return mem->pool; } BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) { return mem->size; } BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); } // // -- mem_t modification ------------------------------------------------------- // BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem ) { mem->pblk = *pblk; } BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem ) { bli_pblk_set_buf( buf, &(mem->pblk) ); } BLIS_INLINE void bli_mem_set_buf_type( packbuf_t 
buf_type, mem_t* mem ) { mem->buf_type = buf_type; } BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem ) { mem->pool = pool; } BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; } // // -- mem_t initialization ----------------------------------------------------- // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the mem_t type definition. An alternative to the initializer is // calling bli_mem_clear() at runtime. #define BLIS_MEM_INITIALIZER \ { \ .pblk = BLIS_PBLK_INITIALIZER, \ .buf_type = -1, \ .pool = NULL, \ .size = 0, \ } \ BLIS_INLINE void bli_mem_clear( mem_t* mem ) { bli_mem_set_buffer( NULL, mem ); #ifdef __cplusplus const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE; // When using C++, which is strongly typed, we avoid use of -1 as a // packbuf_t value since it will result in a compile-time error. bli_mem_set_buf_type( pb, mem ); #else bli_mem_set_buf_type( ( packbuf_t )-1, mem ); #endif bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); } #endif // end bli_mem.h // begin bli_part.h // begin bli_part_check.h void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); // end bli_part_check.h // -- Matrix partitioning ------------------------------------------------------ BLIS_EXPORT_BLIS void bli_acquire_mpart ( dim_t i, dim_t j, dim_t m, dim_t n, obj_t* obj, obj_t* sub_obj ); #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) GENPROT( acquire_mpart_b2t ) GENPROT( acquire_mpart_l2r ) GENPROT( acquire_mpart_r2l ) GENPROT( acquire_mpart_tl2br ) GENPROT( 
acquire_mpart_br2tl ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ dir_t direct, \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_mdim ) GENPROT( acquire_mpart_ndim ) GENPROT( acquire_mpart_mndim ) // -- Vector partitioning ------------------------------------------------------ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_vpart_f2b ) GENPROT( acquire_vpart_b2f ) // -- Scalar acquisition ------------------------------------------------------- BLIS_EXPORT_BLIS void bli_acquire_mij ( dim_t i, dim_t j, obj_t* obj, obj_t* sub_obj ); BLIS_EXPORT_BLIS void bli_acquire_vi ( dim_t i, obj_t* obj, obj_t* sub_obj ); // end bli_part.h // begin bli_prune.h void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ); // end bli_prune.h // begin bli_query.h BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a ); // end bli_query.h // begin bli_auxinfo.h #ifndef BLIS_AUXINFO_MACRO_DEFS_H #define BLIS_AUXINFO_MACRO_DEFS_H // auxinfo_t field query BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) { return ai->schema_a; } BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) { return ai->schema_b; } BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) { return ai->a_next; } BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) { return ai->b_next; } BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) { return ai->is_a; } BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) { return ai->is_b; } BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) { return ai->ps_a; } BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) { return ai->ps_b; } BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) { 
return ai->ukr; } BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) { return ai->params; } // auxinfo_t field modification BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai ) { ai->schema_a = schema; } BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) { ai->schema_b = schema; } BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) { ai->a_next = p; } BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) { ai->b_next = p; } BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; } BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai ) { ai->is_a = is; } BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) { ai->is_b = is; } BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai ) { ai->ps_a = ps; } BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) { ai->ps_b = ps; } BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { ai->ukr = ukr; } BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) { ai->params = params; } #endif // end bli_auxinfo.h // begin bli_param_map.h // --- BLIS to BLAS/LAPACK mappings -------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval ); // --- BLAS/LAPACK to BLIS mappings -------------------------------------------- // NOTE: These static functions were converted from regular functions in order // to reduce function call overhead within the BLAS compatibility layer. 
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( 
subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); err_t bli_check_valid_cntl( void* cntl ); err_t bli_check_packm_schema_on_unpack( obj_t* a ); err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); // end bli_check.h // begin bli_error.h BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); void bli_print_msg( char* str, char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); char* bli_error_string_for_code( gint_t code ); // end bli_error.h // begin bli_f2c.h // f2c.h -- Standard Fortran to C header file // barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." // - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) #ifndef BLIS_F2C_H #define BLIS_F2C_H typedef f77_int bla_integer; typedef f77_char bla_character; //typedef char *address; //typedef short int shortint; typedef float bla_real; typedef double bla_double; typedef scomplex bla_scomplex; typedef dcomplex bla_dcomplex; typedef f77_int bla_logical; //typedef short int shortlogical; //typedef char logical1; //typedef char integer1; #ifdef INTEGER_STAR_8 // Adjust for integer*8. 
typedef long long longint; // system-dependent typedef unsigned long long ulongint; // system-dependent #define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) #define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) #endif #ifndef TRUE_ #define TRUE_ (1) #endif #ifndef FALSE_ #define FALSE_ (0) #endif // Extern is for use with -E #ifndef Extern #define Extern extern #endif // I/O stuff #ifdef f2c_i2 // for -i2 //typedef short flag; //typedef short ftnlen; typedef bla_integer ftnlen; //typedef short ftnint; #else //typedef long int flag; //typedef long int ftnlen; typedef bla_integer ftnlen; //typedef long int ftnint; #endif #ifndef VOID #define VOID void #endif #ifndef f2c_abs #define f2c_abs(x) ((x) >= 0 ? (x) : -(x)) #endif #ifndef f2c_dabs #define f2c_dabs(x) (doublereal)f2c_abs(x) #endif #ifndef f2c_min #define f2c_min(a,b) ((a) <= (b) ? (a) : (b)) #endif #ifndef f2c_max #define f2c_max(a,b) ((a) >= (b) ? (a) : (b)) #endif #ifndef f2c_dmin #define f2c_dmin(a,b) (doublereal)f2c_min(a,b) #endif #ifndef f2c_dmax #define f2c_dmax(a,b) (doublereal)f2c_max(a,b) #endif #ifndef bit_test #define bit_test(a,b) ((a) >> (b) & 1) #endif #ifndef bit_clear #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) #endif #ifndef bit_set #define bit_set(a,b) ((a) | ((uinteger)1 << (b))) #endif // undef any lower-case symbols that your C compiler predefines, e.g.: #ifndef Skip_f2c_Undefs #undef cray #undef gcos #undef mc68010 #undef mc68020 #undef mips #undef pdp11 #undef sgi #undef sparc #undef sun #undef sun2 #undef sun3 #undef sun4 #undef u370 #undef u3b #undef u3b2 #undef u3b5 #undef unix #undef vax #endif #endif // end bli_f2c.h // begin bli_machval.h // begin bli_lsame.h bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len ); // end bli_lsame.h // begin bli_slamch.h bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len ); // end bli_slamch.h // begin bli_dlamch.h bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len ); // end 
bli_dlamch.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v ); // // Prototype BLAS-like interfaces. // #undef GENTPROTR #define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \ ( \ machval_t mval, \ void* v \ ); INSERT_GENTPROTR_BASIC0( machval ) // end bli_machval.h // begin bli_getopt.h typedef struct getopt_s { char* optarg; int optind; int opterr; int optopt; } getopt_t; BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state ); BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ); // end bli_getopt.h // begin bli_opid.h BLIS_INLINE bool bli_opid_is_level3( opid_t opid ) { return ( bool ) ( BLIS_GEMM <= opid && opid <= BLIS_TRSM ); } // end bli_opid.h // begin bli_cntl.h // -- Control tree prototypes -- BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, void* params, cntl_t* sub_node ); BLIS_EXPORT_BLIS void bli_cntl_free_node ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_clear_node ( cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_mark_family ( opid_t family, cntl_t* cntl ); // ----------------------------------------------------------------------------- dim_t bli_cntl_calc_num_threads_in ( rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- // cntl_t query (fields only) BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl ) { return cntl->family; } 
BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl ) { return cntl->bszid; } BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl ) { return cntl->var_func; } BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl ) { return cntl->sub_prenode; } BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl ) { return cntl->sub_node; } BLIS_INLINE void* bli_cntl_params( cntl_t* cntl ) { return cntl->params; } BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl ) { // The first 64 bytes is always the size of the params structure. return *( ( uint64_t* )(cntl->params) ); } BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) { return &(cntl->pack_mem); } // cntl_t query (complex) BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl ) { return ( bool ) ( cntl == NULL ); } BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl ) { return ( bool ) ( bli_cntl_sub_node( cntl ) == NULL ); } BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl ) { return ( bool ) ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); } // cntl_t modification BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl ) { cntl->family = family; } BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl ) { cntl->bszid = bszid; } BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl ) { cntl->var_func = var_func; } BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl ) { cntl->sub_prenode = sub_prenode; } BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl ) { cntl->sub_node = sub_node; } BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl ) { cntl->params = params; } BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) { cntl->pack_mem = *pack_mem; } // end bli_cntl.h // begin bli_env.h #ifndef BLIS_ENV_H #define BLIS_ENV_H gint_t bli_env_get_var( const char* env, gint_t fallback ); //void bli_env_set_var( const char* env, dim_t value ); #endif // end bli_env.h // begin bli_pack.h #ifndef BLIS_PACK_H #define BLIS_PACK_H 
// Packing-related API. Functions without BLIS_EXPORT_BLIS are not exported
// from the library; the get/set pairs exchange the pack_a / pack_b flags
// through a bool (out-pointer for the getters, by value for the setters).
void bli_pack_init( void );
void bli_pack_finalize( void );

BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a );
BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b );
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a );
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b );

void bli_pack_init_rntm_from_env( rntm_t* rntm );

#endif

// end bli_pack.h
// begin bli_info.h

// -- General library information ----------------------------------------------

BLIS_EXPORT_BLIS char* bli_info_get_version_str( void );
BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void );

// -- General configuration-related --------------------------------------------

// Every query below takes no arguments and returns its answer as a gint_t.
BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void ); // -- Kernel implementation-related -------------------------------------------- // -- Level-3 kernel definitions -- BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ); // -- BLIS implementation query (level-3) -------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt ); BLIS_EXPORT_BLIS 
char* bli_info_get_trmm3_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt ); // end bli_info.h // begin bli_arch.h #ifndef BLIS_ARCH_H #define BLIS_ARCH_H BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void ); void bli_arch_set_id_once( void ); void bli_arch_set_id( void ); BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id ); void bli_arch_set_logging( bool dolog ); bool bli_arch_get_logging( void ); void bli_arch_log( char*, ... ); #endif // end bli_arch.h // begin bli_cpuid.h #if 0 // Used only during standalone testing of ARM support. #define FALSE 0 #define TRUE 1 typedef enum { BLIS_ARCH_CORTEXA57 = 10, BLIS_ARCH_CORTEXA15 = 11, BLIS_ARCH_CORTEXA9 = 12, BLIS_ARCH_GENERIC = 13 } arch_t; typedef uint64_t bool; #define bli_abort abort #endif #ifndef BLIS_CPUID_H #define BLIS_CPUID_H arch_t bli_cpuid_query_id( void ); // Intel bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features ); // AMD bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features ); // ARM bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, 
uint32_t features ); bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features ); uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features ); // ----------------------------------------------------------------------------- // // This section of the file was based off of cpuid.hpp from TBLIS [1]. // // [1] https://github.com/devinamatthews/tblis // BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want ) { return ( have & want ) == want; } // ----------------------------------------------------------------------------- #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) // cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393 // for more information why this move was made. 
//#include "cpuid.h" void get_cpu_name( char *cpu_name ); int vpu_count( void ); enum { VENDOR_INTEL = 0, VENDOR_AMD, VENDOR_UNKNOWN }; enum { FEATURE_SSE3 = 0x0001, FEATURE_SSSE3 = 0x0002, FEATURE_SSE41 = 0x0004, FEATURE_SSE42 = 0x0008, FEATURE_AVX = 0x0010, FEATURE_AVX2 = 0x0020, FEATURE_FMA3 = 0x0040, FEATURE_FMA4 = 0x0080, FEATURE_AVX512F = 0x0100, FEATURE_AVX512DQ = 0x0200, FEATURE_AVX512PF = 0x0400, FEATURE_AVX512ER = 0x0800, FEATURE_AVX512CD = 0x1000, FEATURE_AVX512BW = 0x2000, FEATURE_AVX512VL = 0x4000 }; #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ); enum { VENDOR_ARM = 0, VENDOR_UNKNOWN }; enum { MODEL_ARMV7 = 0, MODEL_ARMV8, MODEL_UNKNOWN }; enum { FEATURE_NEON = 0x01, FEATURE_SVE = 0x02 }; #endif #endif // end bli_cpuid.h // begin bli_string.h void bli_string_mkupper( char* s ); // end bli_string.h // begin bli_setgetijm.h BLIS_EXPORT_BLIS err_t bli_setijm ( double ar, double ai, dim_t i, dim_t j, obj_t* b ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs \ ); INSERT_GENTPROT_BASIC0( setijm ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijm ( dim_t i, dim_t j, obj_t* b, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijm ) // end bli_setgetijm.h // begin bli_setgetijv.h BLIS_EXPORT_BLIS err_t bli_setijv ( double ar, double ai, dim_t i, obj_t* x ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ void* restrict x, inc_t incx \ ); INSERT_GENTPROT_BASIC0( 
setijv ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijv ( dim_t i, obj_t* x, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ void* restrict b, inc_t incx, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijv ) // end bli_setgetijv.h // begin bli_setri.h // -- setr --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setrm ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setrv ( obj_t* alpha, obj_t* x ); // -- seti --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setim ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setiv ( obj_t* alpha, obj_t* x ); // end bli_setri.h // begin bli_castm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castm ) INSERT_GENTPROT2_MIXDP0( castm ) // // Prototype object-based _check() function. // void bli_castm_check ( obj_t* a, obj_t* b ); // end bli_castm.h // begin bli_castnzm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castnzm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. 
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// Typed (BLAS-like) prototypes for the level-0 operations. Each template is
// parameterized on a C type (ctype) and a type character (ch); the GENTPROTR
// templates additionally receive a second type/character pair (ctype_r, chr).
// NOTE(review): the INSERT_GENTPROT*_BASIC0 macros are defined elsewhere in
// this header; presumably they instantiate the template once per datatype.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTPROT_BASIC0( addsc )
INSERT_GENTPROT_BASIC0( divsc )
INSERT_GENTPROT_BASIC0( mulsc )
INSERT_GENTPROT_BASIC0( subsc )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi \
     );

INSERT_GENTPROT_BASIC0( invertsc )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype* chi, \
       ctype_r* absq \
     );

INSERT_GENTPROTR_BASIC0( absqsc )
INSERT_GENTPROTR_BASIC0( normfsc )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTPROT_BASIC0( sqrtsc )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype* chi, \
       double* zeta_r, \
       double* zeta_i \
     );

INSERT_GENTPROT_BASIC0( getsc )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double zeta_r, \
       double zeta_i, \
       ctype* chi \
     );

INSERT_GENTPROT_BASIC0( setsc )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype* chi, \
       ctype_r* zeta_r, \
       ctype_r* zeta_i \
     );

INSERT_GENTPROTR_BASIC0( unzipsc )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype_r* zeta_r, \
       ctype_r* zeta_i, \
       ctype* chi \
     );

INSERT_GENTPROTR_BASIC0( zipsc )

// -----------------------------------------------------------------------------

// getsc/setsc variants whose scalar operand is a dim_t, declared explicitly
// rather than generated from a template.
BLIS_EXPORT_BLIS void bli_igetsc
     (
       dim_t* chi,
       double* zeta_r,
       double* zeta_i
     );

BLIS_EXPORT_BLIS void bli_isetsc
     (
       double zeta_r,
       double zeta_i,
       dim_t* chi
     );

// end bli_l0_tapi.h
// begin bli_l0_ft.h

//
// -- Level-0 function types ---------------------------------------------------
//

// Each GENTDEF/GENTDEFR template below typedefs a pointer-to-function type
// whose name is pasted together from ch, opname and tsuf. The argument lists
// mirror the typed prototypes declared in bli_l0_tapi.h above.

// addsc, divsc, subsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTDEF( addsc )
INSERT_GENTDEF( divsc )
INSERT_GENTDEF( subsc )

// invertsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi \
     );

INSERT_GENTDEF( invertsc )

// mulsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTDEF( mulsc )

// absqsc

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype_r* absq \
     );

INSERT_GENTDEFR( absqsc )

// normfsc

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype_r* norm \
     );

INSERT_GENTDEFR( normfsc )

// sqrtsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTDEF( sqrtsc )

// getsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       double* zeta_r, \
       double* zeta_i \
     );

INSERT_GENTDEF( getsc )

// setsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       double zeta_r, \
       double zeta_i, \
       ctype* chi \
     );

INSERT_GENTDEF( setsc )

// unzipsc

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype_r* zeta_r, \
       ctype_r* zeta_i \
     );

INSERT_GENTDEFR( unzipsc )

// zipsc

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype_r* zeta_r, \
       ctype_r* zeta_i, \
       ctype* chi \
     );

INSERT_GENTDEFR( zipsc )

// end bli_l0_ft.h

// Generate function pointer arrays for tapi functions.
// begin bli_l0_fpa.h

//
// Prototype function pointer query interface.
//

// For each operation, declare a query function (name suffixed _qfp) that
// takes a datatype id and returns a value of the operation's _vft type.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( absqsc )
GENPROT( normfsc )
GENPROT( addsc )
GENPROT( divsc )
GENPROT( mulsc )
GENPROT( subsc )
GENPROT( invertsc )
GENPROT( sqrtsc )
GENPROT( unzipsc )
GENPROT( zipsc )

GENPROT( getsc )
GENPROT( setsc )

// end bli_l0_fpa.h

// copysc
// begin bli_copysc.h

//
// Prototype object-based interfaces.
//

#undef GENFRONT
#define GENFRONT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* chi, \
       obj_t* psi \
     );

GENFRONT( copysc )

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//

#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \
     ( \
       conj_t conjchi, \
       void* chi, \
       void* psi \
     );

INSERT_GENTPROT2_BASIC0( copysc )
INSERT_GENTPROT2_MIX_D0( copysc )
INSERT_GENTPROT2_MIX_P0( copysc )

// end bli_copysc.h

// end bli_l0.h

// -- Level-1v operations --

// begin bli_l1v.h
// begin bli_l1v_check.h

//
// Prototype object-based check functions.
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h"
// begin bli_l1v_ft_ker.h


#ifndef BLIS_L1V_FT_KER_H
#define BLIS_L1V_FT_KER_H


//
// -- Level-1v kernel function types -------------------------------------------
//
// Every kernel type below takes restrict-qualified operand pointers and a
// trailing cntx_t*; the type name is built by PASTECH3(ch,opname,_ker,tsuf).
// NOTE(review): INSERT_GENTDEF is defined outside this excerpt; presumably it
// instantiates the GENTDEF currently in effect once per datatype -- confirm.

// addv, copyv, subv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( addv )
INSERT_GENTDEF( copyv )
INSERT_GENTDEF( subv )

// amaxv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       dim_t* restrict index, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( amaxv )

// axpbyv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpbyv )

// axpyv, scal2v

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyv )
INSERT_GENTDEF( scal2v )

// dotv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotv )

// dotxv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict beta, \
       ctype* restrict rho, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxv )

// invertv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( invertv )

// scalv, setv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjalpha, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( scalv )
INSERT_GENTDEF( setv )

// swapv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( swapv )

// xpbyv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( xpbyv )



#endif

// end bli_l1v_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The leading comma lets the macro be appended directly after the last
// regular parameter of a prototype.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_l1v_oapi.h


//
// Prototype object-based interfaces.
//
// Each prototype is named via PASTEMAC(opname,EX_SUF) and ends with
// BLIS_OAPI_EX_PARAMS, so the declarations produced here depend on how those
// two macros are defined at this point in the file.

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* index \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( amaxv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpbyv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y, \
       obj_t* beta, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotxv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( invertv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( scalv )
GENTPROT( setv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( swapv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( xpbyv )

// end bli_l1v_oapi.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;

// end bli_oapi_ba.h
// begin bli_l1v_oapi.h


//
// Prototype object-based interfaces.
//
// Each prototype is named via PASTEMAC(opname,EX_SUF) and ends with
// BLIS_OAPI_EX_PARAMS, so the declarations produced here depend on how those
// two macros are defined at this point in the file.

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* index \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( amaxv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpbyv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y, \
       obj_t* beta, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotxv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( invertv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( scalv )
GENTPROT( setv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( swapv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( xpbyv )

// end bli_l1v_oapi.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h


// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The leading comma lets the macro be appended directly after the last
// regular parameter of a prototype.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1v_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addd ) INSERT_GENTPROT_BASIC0( copyd ) INSERT_GENTPROT_BASIC0( subd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyd ) INSERT_GENTPROT_BASIC0( scal2d ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( invertd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ 
BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scald ) INSERT_GENTPROT_BASIC0( setd ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( setid ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( shiftd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( xpbyd ) // end bli_l1d_tapi.h // begin bli_l1d_ft.h // // -- Level-1d function types -------------------------------------------------- // // addd, copyd, subd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addd ) INSERT_GENTDEF( copyd ) INSERT_GENTDEF( subd ) // axpyd, scal2d #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyd ) INSERT_GENTDEF( scal2d ) // invertd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, 
\ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertd ) // scald, setd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scald ) INSERT_GENTDEF( setd ) // setid #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( setid ) // shiftd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( shiftd ) // xpbyd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyd ) // end bli_l1d_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. 
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Local expert-variable declarations.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h
// begin bli_tapi_ba.h

// Macro setup that lets the _tapi.c files emit typed APIs without expert
// parameters.

// Marker telling the source which interface (basic or expert) is compiled.
#undef  BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Empty suffix: function names (in function definitions) get nothing
// appended.
#undef  EX_SUF
#define EX_SUF

// Empty parameter tail: signatures and prototypes carry no expert arguments.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Local expert variables, initialized to NULL. The "( void )" statements
// keep the compiler from warning about unused variables.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l1d_tapi.h
//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addd ) INSERT_GENTPROT_BASIC0( copyd ) INSERT_GENTPROT_BASIC0( subd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyd ) INSERT_GENTPROT_BASIC0( scal2d ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( invertd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scald ) INSERT_GENTPROT_BASIC0( setd ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( setid ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( shiftd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t 
diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( xpbyd ) // end bli_l1d_tapi.h // begin bli_l1d_ft.h // // -- Level-1d function types -------------------------------------------------- // // addd, copyd, subd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addd ) INSERT_GENTDEF( copyd ) INSERT_GENTDEF( subd ) // axpyd, scal2d #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyd ) INSERT_GENTDEF( scal2d ) // invertd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertd ) // scald, setd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scald ) INSERT_GENTDEF( setd ) // setid #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( setid ) // shiftd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( shiftd ) // xpbyd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyd ) // end bli_l1d_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1d_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addd ) GENPROT( copyd ) GENPROT( subd ) GENPROT( axpyd ) GENPROT( scal2d ) GENPROT( invertd ) GENPROT( scald ) GENPROT( setd ) GENPROT( setid ) GENPROT( shiftd ) GENPROT( xpbyd ) // end bli_l1d_fpa.h // end bli_l1d.h // -- Level-1f operations -- // begin bli_l1f.h // begin bli_l1f_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( dotxf ) // end bli_l1f_check.h // Define kernel function types. // begin bli_l1f_ft_ker.h #ifndef BLIS_L1F_FT_KER_H #define BLIS_L1F_FT_KER_H // // -- Level-1f kernel function types ------------------------------------------- // // axpy2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha1, \ ctype* restrict alpha2, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpy2v ) // axpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpyf ) // dotaxpyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t 
conjy, \ dim_t m, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotaxpyv ) // dotxf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotxf ) // dotxaxpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotxaxpyf ) #endif // end bli_l1f_ft_ker.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h
// begin bli_oapi_ba.h

// Macro setup that lets the _oapi.c files emit object APIs without expert
// parameters.

// Marker telling the source which interface (basic or expert) is compiled.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Empty suffix: function names (in function definitions) get nothing
// appended.
#undef  EX_SUF
#define EX_SUF

// Empty parameter tail: signatures and prototypes carry no expert arguments.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS

// Local expert variables, initialized to NULL. The "( void )" statements
// keep the compiler from warning about unused variables.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_oapi_ba.h
// begin bli_l1f_oapi.h
//
// Prototype object-based interfaces.
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). 
// begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1f_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alphax, \ ctype* alphay, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpy2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotaxpyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxaxpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxf ) // end bli_l1f_tapi.h // begin bli_l1f_ft.h // // -- Level-1f function types -------------------------------------------------- // // axpy2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha1, \ ctype* alpha2, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpy2v ) // axpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyf ) // dotaxpyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotaxpyv ) // dotxf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* 
alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxf ) // dotxaxpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxaxpyf ) // end bli_l1f_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. 
#undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1f_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alphax, \ ctype* alphay, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpy2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotaxpyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxaxpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t 
conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxf ) // end bli_l1f_tapi.h // begin bli_l1f_ft.h // // -- Level-1f function types -------------------------------------------------- // // axpy2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha1, \ ctype* alpha2, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpy2v ) // axpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyf ) // dotaxpyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotaxpyv ) // dotxf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxf ) // dotxaxpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* 
beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxaxpyf ) // end bli_l1f_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1f_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( axpy2v ) GENPROT( axpyf ) GENPROT( dotaxpyv ) GENPROT( dotxaxpyf ) GENPROT( dotxf ) // end bli_l1f_fpa.h // end bli_l1f.h // -- Level-1m operations -- // begin bli_l1m.h // begin bli_l1m_check.h // // Prototype object-based check functions. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
// (BLIS_TAPI_EX_SUF itself is defined outside this excerpt.)
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The expansion deliberately begins with a comma: the templates below place
// BLIS_TAPI_EX_PARAMS directly after their last ordinary parameter, which is
// written without a trailing comma so that the basic (empty) expansion also
// yields a valid parameter list.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1m_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
//

// Expert pass: EX_SUF and BLIS_TAPI_EX_PARAMS carry the bli_tapi_ex.h
// definitions above, so every prototype below gets the expert suffix and
// trailing "cntx_t* cntx, rntm_t* rntm" parameters. The same templates are
// repeated further down for the basic pass.

// addm, copym, subm: operands (x, y).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m: operands (alpha, x, y).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm: operands (alpha, x). These take a leading conjalpha and have
// no transx parameter, unlike the other groups.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym: operands (x, beta, y), all of one ctype.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type form of xpbym. x has type ctype_x while beta and y have
// type ctype_y, and the function name is built from both type characters.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h


//
// -- Level-1m function types --------------------------------------------------
//

// Function-pointer typedefs whose parameter lists parallel the typed
// prototypes above; EX_SUF and BLIS_TAPI_EX_PARAMS are still the expert
// definitions here.

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m
// (Same parameter list as the axpym type above.)

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym
// NOTE(review): xpbym_md is instantiated from this single-ctype template,
// whereas its prototype in bli_l1m_tapi.h uses separate ctype_x / ctype_y
// for x and beta/y. Confirm that the shared function type is intentional.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )

// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
// The expansion is four statements spread over two source lines joined by the
// trailing backslash; it declares locals named cntx and rntm.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                               rntm_t* rntm = NULL; ( void )rntm;

// end bli_tapi_ba.h
// begin bli_l1m_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
//

// Basic pass: identical templates to the expert pass above, but EX_SUF and
// BLIS_TAPI_EX_PARAMS now expand to nothing, so the prototypes have no name
// suffix and no trailing cntx/rntm parameters.

// addm, copym, subm: operands (x, y).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m: operands (alpha, x, y).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm: operands (alpha, x); leading conjalpha, no transx.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym: operands (x, beta, y), all of one ctype.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type form; x is ctype_x, beta and y are ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h


//
// -- Level-1m function types --------------------------------------------------
//

// Basic-pass counterparts of the expert function types: EX_SUF and
// BLIS_TAPI_EX_PARAMS are empty here.

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m
// (Same parameter list as the axpym type above.)

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym
// NOTE(review): xpbym_md is instantiated from this single-ctype template,
// whereas its prototype in bli_l1m_tapi.h uses separate ctype_x / ctype_y
// for x and beta/y. Confirm that the shared function type is intentional.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )

// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1m_fpa.h


//
// Prototype function pointer query interface.
//

// EX_SUF was #undef'd by bli_xapi_undef.h just above, so the expert suffix is
// named explicitly through BLIS_TAPI_EX_SUF in these templates. Each
// GENPROT( op ) declares one query function taking a single datatype (num_t dt)
// and returning the op's "_vft" type.
// NOTE(review): "_vft" is presumably the void-typed function pointer type
// generated for the op; its definition is outside this excerpt.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
GENPROT( axpym )
GENPROT( scal2m )
GENPROT( scalm )
GENPROT( setm )
GENPROT( xpbym )

// Two-datatype query ("_qfp2") for the mixed-datatype op: takes the datatype
// of x (dtx) and of y (dty).
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );

GENPROT( xpbym_md )

// end bli_l1m_fpa.h

// Prototype level-1m implementations.
// begin bli_l1m_unb_var1.h


//
// Prototype BLAS-like interfaces with typed operands.
//

// The "_unb_var1" implementations always take cntx and rntm as explicit
// trailing parameters; unlike the typed API prototypes, they do not go
// through BLIS_TAPI_EX_PARAMS and are not marked BLIS_EXPORT_BLIS.

// addm, copym, subm: operands (x, y).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m: operands (alpha, x, y).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm: operands (alpha, x); leading conjalpha, no transx.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym: operands (x, beta, y), all of one ctype.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type form; x is ctype_x, beta and y are ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
void PASTEMAC3(chx,chy,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_unb_var1.h

// Pack-related
// begin bli_packm.h

// begin bli_packm_alloc.h

// Two allocation entry points; the "_ex" form additionally takes the
// packbuf_t buffer type as an explicit argument.
BLIS_EXPORT_BLIS void* bli_packm_alloc
     (
       siz_t size_needed,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void* bli_packm_alloc_ex
     (
       siz_t size_needed,
       packbuf_t pack_buf_type,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_packm_alloc.h
// begin bli_packm_cntl.h

// Parameter record for a packm control-tree node. It is reached through
// cntl->params by the accessors below, and its fields match, by name and
// type, the arguments of bli_packm_cntl_create_node().
struct packm_params_s
{
    uint64_t size; // size field must be present and come first.
    bszid_t bmid_m;
    bszid_t bmid_n;
    bool does_invert_diag;
    bool rev_iter_if_upper;
    bool rev_iter_if_lower;
    pack_t pack_schema;
    packbuf_t pack_buf_type;
};
typedef struct packm_params_s packm_params_t;

// Field accessors. Each casts cntl->params to packm_params_t* and returns one
// field; no NULL check is made on cntl or on cntl->params, and the cast is
// unchecked, so cntl must be a node whose params really is a packm_params_t.

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m;
}

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n;
}

BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower;
}

BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema;
}

BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type;
}

// -----------------------------------------------------------------------------

// Node constructor. Apart from rntm, var_func and sub_node, each argument
// corresponds to one packm_params_s field of the same name.
cntl_t* bli_packm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       bszid_t bmid_m,
       bszid_t bmid_n,
       bool does_invert_diag,
       bool rev_iter_if_upper,
       bool rev_iter_if_lower,
       pack_t pack_schema,
       packbuf_t pack_buf_type,
       cntl_t* sub_node
     );

// end bli_packm_cntl.h
// begin bli_packm_check.h

void bli_packm_init_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

void bli_packm_int_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

// end bli_packm_check.h
// begin bli_packm_init.h

// Returns bool.
// NOTE(review): the meaning of the returned flag is not visible in this
// excerpt; check the definition before relying on it.
BLIS_EXPORT_BLIS bool bli_packm_init
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_packm_init.h
// begin bli_packm_int.h

// Same parameter list as the packm "_var_oft" function type.
void bli_packm_int
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_int.h
// begin bli_packm_scalar.h

BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p );

// end bli_packm_scalar.h

// begin bli_packm_part.h

// -- Matrix partitioning ------------------------------------------------------

// The three acquire routines share one shape: (requested_part, index, b, obj,
// sub_obj). Only the suffix (t2b, l2r, tl2br) and the index name (i, j, ij)
// differ.
void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
                                  dim_t i,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
                                  dim_t j,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_tl2br( subpart_t requested_part,
                                    dim_t ij,
                                    dim_t b,
                                    obj_t* obj,
                                    obj_t* sub_obj );

dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p );

// end bli_packm_part.h

// begin bli_packm_struc_cxk.h

// Structure-aware packing prototypes. The parameter list matches the packm
// kernel function type except that it ends at cntx (no "void* params").
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_struc_cxk )
INSERT_GENTPROT_BASIC0( packm_herm_cxk )
INSERT_GENTPROT_BASIC0( packm_tri_cxk )

// end bli_packm_struc_cxk.h
// begin bli_packm_struc_cxk_1er.h

// "1er" counterparts of the prototypes above, generated through GENTPROTCO.
// These do take the trailing "void* params" argument. The template receives
// ctype_r and chr but does not use them in the signature.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er )

// end bli_packm_struc_cxk_1er.h

// begin bli_packm_cxk.h

// CAUTION: the argument order here is (panel_dim, panel_dim_max, panel_len,
// panel_len_max), whereas the struc_cxk prototypes above use (panel_dim,
// panel_len, panel_dim_max, panel_len_max). All four are dim_t, so the
// compiler will not catch a call written in the wrong order.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_cxk )

// end bli_packm_cxk.h
// begin bli_packm_cxk_1er.h

// Same parameter list as packm_cxk above (same argument-order caution),
// generated through GENTPROTCO.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROTCO_BASIC0( packm_cxk_1er )

// end bli_packm_cxk_1er.h

// Mixed datatype support.
// Everything up to the matching #endif is declared only when
// BLIS_ENABLE_GEMM_MD is defined.
#ifdef BLIS_ENABLE_GEMM_MD
// begin bli_packm_struc_cxk_md.h

// Two-type structure-aware packing: the source c has type ctype_c, while
// kappa and the packed buffer p have type ctype_p.
#undef GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype_p* restrict kappa, \
       ctype_c* restrict c, inc_t incc, inc_t ldc, \
       ctype_p* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )

// Two-type 1e/1r packing helpers: the source a has type ctype_a, while kappa
// and p have type ctype_p. These take no schema and no cntx argument.
#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t conja, \
       dim_t m, \
       dim_t n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p, inc_t ldp \
     );

INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )

INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )

// end bli_packm_struc_cxk_md.h
#endif

// begin bli_packm_blk_var1.h

//
// packm params types.
//

// Table of packm kernels with one entry per pair of floating-point types.
// NOTE(review): per the column header below, the first index is presumably the
// type of C and the second the type of P; confirm at the point of use.
typedef struct
{
    // Type of C  Type of P
    packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;

//
// Prototype object-based interfaces.
//

// Same parameter types as the packm "_var_oft" function type; the operands
// are named c and p here and the thrinfo_t pointer is named t.
BLIS_EXPORT_BLIS void bli_packm_blk_var1
     (
       obj_t* c,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* t
     );

// end bli_packm_blk_var1.h

// end bli_packm.h
// begin bli_unpackm.h

// begin bli_unpackm_cntl.h

// Parameter record for an unpackm control-tree node; holds only the variant
// function pointer.
struct unpackm_params_s
{
    uint64_t size; // size field must be present and come first.
    unpackm_var_oft var_func;
};
typedef struct unpackm_params_s unpackm_params_t;

// Accessor macro (not an inline function, unlike the packm accessors): casts
// (cntl)->params to unpackm_params_t* and yields its var_func member. The
// argument is evaluated once and is parenthesized inside the expansion.
#define bli_cntl_unpackm_params_var_func( cntl ) \
\
    ( ( (unpackm_params_t*)(cntl)->params )->var_func )

// -----------------------------------------------------------------------------

// NOTE(review): unpackm_var_func is passed as a generic void_fp while the
// struct field it presumably initializes is an unpackm_var_oft; confirm the
// conversion in the definition.
cntl_t* bli_unpackm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       void_fp unpackm_var_func,
       cntl_t* sub_node
     );

// end bli_unpackm_cntl.h
// begin bli_unpackm_check.h

void bli_unpackm_int_check
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx
     );

// end bli_unpackm_check.h
// begin bli_unpackm_int.h

// Same parameter list as the unpackm "_var_oft" function type (no rntm).
void bli_unpackm_int
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_unpackm_int.h

// begin bli_unpackm_blk_var1.h

// Object-based entry point (operands named p and c).
void bli_unpackm_blk_var1
     (
       obj_t* p,
       obj_t* c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// Typed counterparts, one per datatype character. Note that p and c are
// declared as void* here; ctype is a template parameter but does not appear
// in the signature.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       doff_t diagoffc, \
       diag_t diagc, \
       uplo_t uploc, \
       trans_t transc, \
       dim_t m, \
       dim_t n, \
       dim_t m_panel, \
       dim_t n_panel, \
       void* p, inc_t rs_p, inc_t cs_p, \
       dim_t pd_p, inc_t ps_p, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )

// end bli_unpackm_blk_var1.h

// begin bli_unpackm_cxk.h

// Typed unpack prototype: the packed buffer p comes before the destination a,
// and the pointers are not restrict-qualified (the unpackm_cxk kernel function
// type in bli_l1m_ft_ker.h does use restrict).
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conjp, \
       dim_t panel_dim, \
       dim_t panel_len, \
       ctype* kappa, \
       ctype* p, inc_t ldp, \
       ctype* a, inc_t inca, inc_t lda, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_cxk )

// end bli_unpackm_cxk.h

// end bli_unpackm.h

// end bli_l1m.h


// -- Level-2 operations --

// begin bli_l2.h

// begin bli_l2_check.h


//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( gemv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( ger ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( hemv ) INSERT_GENTPROT_BASIC0( symv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( her ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( syr ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( her2 ) INSERT_GENTPROT_BASIC0( syr2 ) #undef 
GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( trmv ) INSERT_GENTPROT_BASIC0( trsv ) // end bli_l2_tapi.h // begin bli_l2_ft.h // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( ger ) // hemv, symv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) // her #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( her ) // syr #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, 
inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( syr ) // her2, syr2 #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) // trmv, trsv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) // end bli_l2_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). 
#undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( gemv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( ger ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( hemv ) INSERT_GENTPROT_BASIC0( symv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( her ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( syr ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( her2 ) INSERT_GENTPROT_BASIC0( syr2 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( trmv ) INSERT_GENTPROT_BASIC0( trsv ) // end bli_l2_tapi.h // begin bli_l2_ft.h // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( ger ) // hemv, symv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ 
BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) // her #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( her ) // syr #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( syr ) // her2, syr2 #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) // trmv, trsv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) // end bli_l2_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. 
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
// void bli_l3_cntl_create_if ( opid_t family, pack_t schema_a, pack_t schema_b, obj_t* a, obj_t* b, obj_t* c, rntm_t* rntm, cntl_t* cntl_orig, cntl_t** cntl_use ); void bli_l3_cntl_free ( rntm_t* rntm, cntl_t* cntl_use, thrinfo_t* thread ); // end bli_l3_cntl.h // begin bli_l3_check.h // // Prototype object-based check functions. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx \ ); GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- void bli_gemm_basic_check ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_gemmt_basic_check ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_hemm_basic_check ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_herk_basic_check ( obj_t* alpha, obj_t* a, obj_t* ah, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_her2k_basic_check ( obj_t* alpha, obj_t* a, obj_t* bh, obj_t* b, obj_t* ah, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_l3_basic_check ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); // end bli_l3_check.h // begin bli_l3_int.h void bli_l3_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, 
obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_l3_int.h // begin bli_l3_packab.h void bli_l3_packa ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); void bli_l3_packb ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_l3_packab.h // Define function types. //#include "bli_l3_ft_ex.h" // begin bli_l3_ft_ukr.h #ifndef BLIS_L3_FT_UKR_H #define BLIS_L3_FT_UKR_H // // -- Level-3 micro-kernel function types -------------------------------------- // // gemm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemm ) // gemmtrsm_[lu] #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ ctype* restrict a11, \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmtrsm ) // trsm_[lu] #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( trsm ) #endif // end bli_l3_ft_ukr.h // begin bli_l3_oft.h #ifndef BLIS_L3_OFT_H #define BLIS_L3_OFT_H // // -- Level-3 object function types -------------------------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ 
cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemm ) GENTDEF( gemmt ) GENTDEF( her2k ) GENTDEF( syr2k ) // hemm, symm, trmm3 #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( hemm ) GENTDEF( symm ) GENTDEF( trmm3 ) // herk, syrk #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( herk ) GENTDEF( syrk ) // trmm, trsm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( trmm ) GENTDEF( trsm ) #endif // end bli_l3_oft.h // begin bli_l3_oft_var.h #ifndef BLIS_L3_OFT_VAR_H #define BLIS_L3_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( l3 ) #endif // end bli_l3_oft_var.h // begin bli_l3_blocksize.h dim_t bli_l3_determine_kc ( dir_t direct, dim_t i, dim_t dim, obj_t* a, obj_t* b, bszid_t bszid, cntx_t* cntx, cntl_t* cntl ); #undef GENPROT #define GENPROT( opname ) \ \ dim_t PASTEMAC0(opname) \ ( \ dir_t direct, \ dim_t i, \ dim_t dim, \ obj_t* a, \ obj_t* b, \ bszid_t bszid, \ cntx_t* cntx \ ); GENPROT( gemm_determine_kc ) GENPROT( gemmt_determine_kc ) GENPROT( trmm_determine_kc ) GENPROT( trsm_determine_kc ) #undef GENPROT #define GENPROT( opname ) \ \ dim_t PASTEMAC0(opname) \ ( \ dim_t i, \ dim_t dim, \ obj_t* a, \ obj_t* b, \ bszid_t bszid, \ cntx_t* cntx \ ); GENPROT( gemm_determine_kc_f ) GENPROT( gemm_determine_kc_b ) GENPROT( gemmt_determine_kc_f ) GENPROT( gemmt_determine_kc_b ) GENPROT( 
trmm_determine_kc_f ) GENPROT( trmm_determine_kc_b ) GENPROT( trsm_determine_kc_f ) GENPROT( trsm_determine_kc_b ) // end bli_l3_blocksize.h // begin bli_l3_direct.h dir_t bli_l3_direct ( obj_t* a, obj_t* b, obj_t* c, cntl_t* cntl ); // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ dir_t PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c \ ); GENPROT( gemm_direct ) GENPROT( gemmt_direct ) GENPROT( trmm_direct ) GENPROT( trsm_direct ) // end bli_l3_direct.h // begin bli_l3_prune.h #undef GENPROT #define GENPROT( dim ) \ \ void PASTEMAC(l3_prune_unref_mparts_,dim) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntl_t* cntl \ ); GENPROT( m ) GENPROT( n ) GENPROT( k ) // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname, dim ) \ \ void PASTEMAC2(opname,_prune_unref_mparts_,dim) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c \ ); GENPROT( gemm, m ) GENPROT( gemm, n ) GENPROT( gemm, k ) GENPROT( gemmt, m ) GENPROT( gemmt, n ) GENPROT( gemmt, k ) GENPROT( trmm, m ) GENPROT( trmm, n ) GENPROT( trmm, k ) GENPROT( trsm, m ) GENPROT( trsm, n ) GENPROT( trsm, k ) // end bli_l3_prune.h // begin bli_l3_schema.h void bli_l3_set_schemas ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx ); // end bli_l3_schema.h // Prototype object APIs (basic and expert). // begin bli_l3_oapi.h // // Prototype object-based interfaces (basic). 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h // // Prototype BLAS-like interfaces with typed operands (basic). // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( gemm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ side_t side, \ uplo_t uploa, \ conj_t conja, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( hemm ) INSERT_GENTPROT_BASIC0( symm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype_r* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROTR_BASIC0( herk ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROTR_BASIC0( her2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( syrk ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t 
k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( gemmt ) INSERT_GENTPROT_BASIC0( syr2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( trmm3 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT_BASIC0( trmm ) INSERT_GENTPROT_BASIC0( trsm ) // end bli_l3_tapi.h // begin bli_l3_tapi_ex.h // // Prototype BLAS-like interfaces with typed operands (expert). 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ conj_t conja, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( hemm ) INSERT_GENTPROT_BASIC0( symm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype_r* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( herk ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( her2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( syrk ) #undef 
GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemmt ) INSERT_GENTPROT_BASIC0( syr2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm3 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm ) INSERT_GENTPROT_BASIC0( trsm ) // end bli_l3_tapi_ex.h // Define function types for small/unpacked handlers/kernels. 
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
#undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_gemm_md_c2r_ref.h // Define a local struct type that makes returning two values easier. typedef struct mddm_s { dom_t comp; dom_t exec; } mddm_t; void bli_gemm_md ( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_local, cntx_t** cntx ); mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); // ----------------------------------------------------------------------------- void bli_gemm_md_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); void bli_gemm_md_zgemm ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary 
if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crr is already unconditionally associated with an // execution domain of BLIS_REAL.) if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_REAL ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since ccr is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) if ( bli_obj_is_complex( c ) && bli_obj_is_complex( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crc is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) 
if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_complex( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemm_md_ker_var2_recast ( num_t* dt_comp, num_t dt_a, num_t dt_b, num_t* dt_c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, obj_t* c, inc_t* rs_c, inc_t* cs_c ) { if ( bli_is_real( *dt_c ) && bli_is_complex( dt_a ) && bli_is_complex( dt_b ) ) { // The rcc case is executed with a real macrokernel, so we need to // double the k dimension (because both A and B are packed to the 1r // schema), and also the panel strides of A and B since they were // packed as complex matrices and we now need to convert them to // units of real elements. *k *= 2; *ps_a *= 2; *ps_b *= 2; } else if ( bli_is_complex( *dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_row_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *n *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; } else #endif { // Generally speaking, the crc case is executed with a complex // macrokernel, so we need to halve the panel stride of A (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. 
*ps_a /= 2; } } else if ( bli_is_complex( *dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_col_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *m *= 2; *pd_a *= 2; *ps_a *= 2; *cs_c *= 2; } else #endif { // Generally speaking, the ccr case is executed with a complex // macrokernel, so we need to halve the panel stride of B (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. *ps_b /= 2; } } #if 0 else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. //printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k ); } else if ( bli_is_complex( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { // No action needed. 
} #endif } // end bli_gemm_md.h #endif // end bli_gemm.h // begin bli_hemm.h // begin bli_hemm_front.h void bli_hemm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_hemm_front.h // end bli_hemm.h // begin bli_symm.h // begin bli_symm_front.h void bli_symm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_symm_front.h // end bli_symm.h // begin bli_trmm.h // begin bli_trmm_front.h void bli_trmm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm_front.h // begin bli_trmm_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); //GENPROT( trmm_blk_var1 ) //GENPROT( trmm_blk_var2 ) //GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) GENPROT( trmm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
//

// Template for the typed gemmt macro-kernel prototypes. `ch` is pasted onto
// `varname` via PASTEMAC to form the function name; `ctype` is accepted but
// not used in the parameter list. The a and b operands each carry an extra
// inc_t (is_a / is_b) after their first stride.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffc, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

// end bli_gemmt_var.h

// end bli_gemmt.h

// end bli_l3.h

// -- Utility operations --

// begin bli_util.h

// begin bli_util_check.h

//
// Prototype object-based check functions.
//

// Each template below is re-defined (after an #undef) with the parameter
// list of the operation group that follows it, and pastes "_check" onto the
// operation name to form the function name.

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  asum \
     );

GENPROT( asumv )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x \
     );

GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm \
     );

GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )


// Same parameter list as the vector-norm template directly above; it is
// simply re-defined for the matrix-norm operations.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm \
     );

GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x \
     );

GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  scale, \
       obj_t*  sumsq \
     );

GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// NOTE(review): this one template is named GENTPROT even though it takes a
// single `opname` argument like the GENPROT templates around it. It works
// because the name is #undef'd first, but it leaves GENTPROT defined with a
// one-argument signature until the next #undef GENTPROT.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  chi, \
       obj_t*  psi, \
       bool*   is_eq \
     );

GENTPROT( eqsc )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq \
     );

GENPROT( eqv )
GENPROT( eqm )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// -----------------------------------------------------------------------------

// Explicitly named check-routine prototypes. Their parameter lists mirror
// the macro-generated prototypes above.

void bli_utilv_xi_check
     (
       obj_t*  x,
       obj_t*  index
     );

void bli_utilv_xa_check
     (
       obj_t*  x,
       obj_t*  asum
     );

void bli_utilm_mkhst_check
     (
       obj_t*  a
     );

void bli_utilv_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_fprint_check
     (
       FILE*  file,
       char*  s1,
       obj_t* x,
       char*  format,
       char*  s2
     );

void bli_utilm_rand_check
     (
       obj_t* x
     );

void bli_utilv_sumsqv_check
     (
       obj_t*  x,
       obj_t*  scale,
       obj_t*  sumsq
     );

// end bli_util_check.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// (The expansion begins with a comma so that it can be appended directly
// after the last regular parameter.)
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_util_oapi.h

//
// Prototype object-based interfaces.
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
#ifdef BLIS_OAPI_BASIC #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* x, \ obj_t* y, \ bool* is_eq \ ); GENPROT( eqsc ) GENPROT( eqv ) GENPROT( eqm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ FILE* file, \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ); GENPROT( fprintv ) GENPROT( fprintm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ); GENPROT( printv ) GENPROT( printm ) #endif // #ifdef BLIS_OAPI_BASIC // end bli_util_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. 
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
#ifdef BLIS_OAPI_BASIC #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* x, \ obj_t* y, \ bool* is_eq \ ); GENPROT( eqsc ) GENPROT( eqv ) GENPROT( eqm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ FILE* file, \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ); GENPROT( fprintv ) GENPROT( fprintm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ char* s1, \ obj_t* x, \ char* format, \ char* s2 \ ); GENPROT( printv ) GENPROT( printm ) #endif // #ifdef BLIS_OAPI_BASIC // end bli_util_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. 
#undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t 
uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. #ifdef BLIS_TAPI_BASIC #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ char* s1, \ dim_t n, \ void* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ char* s1, \ dim_t m, \ dim_t n, \ void* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printm ) #endif // #ifdef BLIS_TAPI_BASIC // end bli_util_tapi.h // begin bli_util_ft.h // // -- Utility function types --------------------------------------------------- // // asumv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, 
opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( asumv ) // mkherm, mksymm, mktrim #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( mkherm ) INSERT_GENTDEF( mksymm ) INSERT_GENTDEF( mktrim ) // norm1v, normfv, normiv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( norm1v ) INSERT_GENTDEFR( normfv ) INSERT_GENTDEFR( normiv ) // norm1m, normfm, normim #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( norm1m ) INSERT_GENTDEFR( normfm ) INSERT_GENTDEFR( normim ) // fprintv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ FILE* file, \ char* s1, \ dim_t n, \ ctype* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTDEF( fprintv ) // fprintm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ FILE* file, \ char* s1, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTDEF( fprintm ) // randv, randnv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( randv ) INSERT_GENTDEF( randnv ) // randm, randnm #undef GENTDEF #define GENTDEF( ctype, ch, opname, 
tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( randm ) INSERT_GENTDEF( randnm ) // sumsqv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( sumsqv ) // ----------------------------------------------------------------------------- // Operations with only basic interfaces. #ifdef BLIS_TAPI_BASIC // eqsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi, \ bool* is_eq \ ); INSERT_GENTDEF( eqsc ) // eqv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ bool* is_eq \ ); INSERT_GENTDEF( eqv ) // eqm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ bool* is_eq \ ); INSERT_GENTDEF( eqm ) #endif // #ifdef BLIS_OAPI_BASIC // end bli_util_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 
// Instantiate the GENTPROTR template currently in effect for sumsqv.
INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.
// The prototypes in this guarded region are named with PASTEMAC(ch,opname)
// (no EX_SUF) and take no BLIS_TAPI_EX_PARAMS.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )

// eqv

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )

// eqm

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )

// printv
// Note: x is declared as void* here (the template's ctype parameter is
// unused), and the template is instantiated with the _I variant of the
// INSERT macro.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t n, \
       void* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )

// printm (x is likewise declared as void*)

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t m, \
       dim_t n, \
       void* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//
// Each template below defines a function-pointer typedef. Unless noted, the
// typedef name is PASTECH3(ch,opname,EX_SUF,tsuf) and the parameter list ends
// with BLIS_TAPI_EX_PARAMS.

// asumv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// The typedef name still includes EX_SUF, but the parameter list does not
// end with BLIS_TAPI_EX_PARAMS.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm
// As with fprintv: EX_SUF in the name, no BLIS_TAPI_EX_PARAMS.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.
// The typedefs in this guarded region use PASTECH2(ch,opname,tsuf): no EX_SUF
// in the name and no BLIS_TAPI_EX_PARAMS in the parameter list.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC
// end bli_util_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_util_fpa.h

//
// Prototype function pointer query interface.
//

// Each GENPROT( opname ) below declares one query function taking a num_t and
// returning a value of the type named by the first PASTECH*() expression.
// This first form builds both names with BLIS_TAPI_EX_SUF (defined elsewhere).

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( asumv )
GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )
GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )
GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )
GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )
GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// Second form: the same declaration shape, but the names are built without
// BLIS_TAPI_EX_SUF.

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )
GENPROT( fprintv )
GENPROT( fprintm )
// printv and printm are deliberately left without query prototypes here
// (the two lines below are commented out in the original).
//GENPROT( printv )
//GENPROT( printm )
// end bli_util_fpa.h

// Prototype level-1m implementations.
// NOTE(review): the header that follows is bli_util_unb_var1.h (utility
// operations); confirm whether "level-1m" above is the intended wording.
// begin bli_util_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Unlike the bli_util_tapi.h templates, the *_unb_var1 prototypes below spell
// out cntx_t* cntx and rntm_t* rntm as explicit trailing parameters and are
// declared without BLIS_EXPORT_BLIS (except fprintv/fprintm at the end).

// asumv_unb_var1

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( asumv_unb_var1 )

// mkherm_unb_var1, mksymm_unb_var1, mktrim_unb_var1

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( mkherm_unb_var1 )
INSERT_GENTPROT_BASIC0( mksymm_unb_var1 )
INSERT_GENTPROT_BASIC0( mktrim_unb_var1 )

// norm1v_unb_var1, normfv_unb_var1, normiv_unb_var1

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfv_unb_var1 )
INSERT_GENTPROTR_BASIC0( normiv_unb_var1 )

// norm1m_unb_var1, normfm_unb_var1, normim_unb_var1

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfm_unb_var1 )
INSERT_GENTPROTR_BASIC0( normim_unb_var1 )

// randv_unb_var1, randnv_unb_var1

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randv_unb_var1 )
INSERT_GENTPROT_BASIC0( randnv_unb_var1 )

// randm_unb_var1, randnm_unb_var1

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randm_unb_var1 )
INSERT_GENTPROT_BASIC0( randnm_unb_var1 )

// sumsqv_unb_var1

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 )

// -----------------------------------------------------------------------------

// eqv_unb_var1
// Returns bool and takes no is_eq output pointer, and no cntx/rntm parameters.

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
     );

INSERT_GENTPROT_BASIC0( eqv_unb_var1 )

// eqm_unb_var1 (likewise returns bool)

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
     );

INSERT_GENTPROT_BASIC0( eqm_unb_var1 )

// fprintv (exported; instantiated with the _I variant of the INSERT macro)

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintv )

// fprintm (exported; instantiated with the _I variant of the INSERT macro)

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintm )

// end bli_util_unb_var1.h
// end bli_util.h

// -- addon definitions --

// NOTE: These definitions should not be included much earlier since an addon
// may wish to utilize other types and definitions provided by BLIS.
// begin bli_addon.h

#ifndef BLIS_ADDON_H
#define BLIS_ADDON_H

// The "#if 0" below hard-wires the choice in this copy of the header:
// BLIS_DISABLE_ADDONS is always the macro that gets defined.
#if 0
#define BLIS_ENABLE_ADDONS
#else
#define BLIS_DISABLE_ADDONS
#endif

// Enabled addons
// (no addon headers are listed in this copy)

#endif
// end bli_addon.h

// -- sandbox implementation --
// begin bli_sbox.h

#ifndef BLIS_SBOX_H
#define BLIS_SBOX_H

// Each sandbox must have a bli_sandbox.h file present somewhere inside.
// If a sandbox was enabled at configure-time, we need to #include its
// header file here so that it will get pulled into blis.h when it is
// flattened into a monolithic header.
#ifdef BLIS_ENABLE_SANDBOX
#include "bli_sandbox.h" // skipped
#endif

#endif
// end bli_sbox.h

// -- BLAS compatibility layer --
// begin bli_blas.h

// If the CBLAS compatibility layer was enabled while the BLAS layer
// was not enabled, we must enable it here.
#ifdef BLIS_ENABLE_CBLAS
#ifndef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS
#endif
#endif // BLIS_ENABLE_CBLAS

// By default, if the BLAS compatibility layer is enabled, we define
// (include) all of the BLAS prototypes. However, if the user is
// #including "blis.h" and also #including another header that also
// declares the BLAS functions, then we provide an opportunity to
// #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below).
#ifdef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS_DEFS
#else
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the BLAS test drivers are being
// compiled.
#ifdef BLIS_VIA_BLASTEST
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the environment has defined the
// macro BLIS_DISABLE_BLAS_DEFS.
// Because this check comes last, BLIS_DISABLE_BLAS_DEFS overrides the
// BLIS_ENABLE_BLAS default set above.
#ifdef BLIS_DISABLE_BLAS_DEFS
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Begin including all BLAS prototypes.
// This conditional is opened here and is not closed within this section.
#ifdef BLIS_ENABLE_BLAS_DEFS

// -- System headers needed by BLAS compatibility layer --

// NOTE(review): no header name follows this #include in the text as seen
// here; confirm against the generated header that a name was not lost.
#include // skipped

// -- Constants --

// Size of the function-name buffer used by the bla_*_check macros:
// 7 characters plus one for the terminating NUL.
#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)

// -- Utility macros --
// The helper prototypes below are plain declarations with no export macro,
// each wrapped in its own BLIS_ENABLE_BLAS guard.

// begin bla_r_sign.h

#ifdef BLIS_ENABLE_BLAS

double bla_r_sign(const bla_real *a, const bla_real *b);

#endif
// end bla_r_sign.h
// begin bla_d_sign.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_sign(const bla_double *a, const bla_double *b);

#endif
// end bla_d_sign.h

// begin bla_r_cnjg.h

#ifdef BLIS_ENABLE_BLAS

void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src);

#endif
// end bla_r_cnjg.h
// begin bla_d_cnjg.h

#ifdef BLIS_ENABLE_BLAS

void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src);

#endif
// end bla_d_cnjg.h

// begin bla_r_imag.h

#ifdef BLIS_ENABLE_BLAS

bla_real bla_r_imag(const bla_scomplex *z);

#endif
// end bla_r_imag.h
// begin bla_d_imag.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_imag(const bla_dcomplex *z);

#endif
// end bla_d_imag.h

// begin bla_c_div.h

#ifdef BLIS_ENABLE_BLAS

void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp);

#endif
// end bla_c_div.h
// begin bla_z_div.h

#ifdef BLIS_ENABLE_BLAS

void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp);

#endif
// end bla_z_div.h

// begin bla_f__cabs.h

#ifdef BLIS_ENABLE_BLAS

double bla_f__cabs(double real, double imag);

#endif
// end bla_f__cabs.h
// begin bla_r_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_r_abs(const bla_real *x);

#endif
// end bla_r_abs.h
// begin bla_d_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_abs(const bla_double *x);

#endif
// end bla_d_abs.h
// begin bla_c_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_c_abs(const bla_scomplex *z);

#endif
// end bla_c_abs.h
// begin bla_z_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_z_abs(const bla_dcomplex *z);

#endif
// end bla_z_abs.h

// begin bla_lsame.h

#ifdef BLIS_ENABLE_BLAS

// lsame is declared with long return/length types when LAPACK_ILP64 is
// defined and with int otherwise.
// NOTE(review): the LAPACK_ILP64 branch has no BLIS_EXPORT_BLAS, unlike the
// #else branch; confirm that the difference is intended.
#ifdef LAPACK_ILP64
long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
#else
BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
#endif

#endif
// end bla_lsame.h
// begin bla_xerbla.h

#ifdef BLIS_ENABLE_BLAS

// xerbla is the only prototype in this section that also carries
// BLIS_OVERRIDABLE.
BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);

#endif
// end bla_xerbla.h
// begin bla_xerbla_array.h

#ifdef BLIS_ENABLE_BLAS

// Note: srname_len is passed by value here, while info is passed by pointer.
BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);

#endif
// end bla_xerbla_array.h

// -- Level-0 BLAS prototypes --
// begin bla_cabs1.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS bla_real PASTEF77(s,cabs1)(bla_scomplex *z);
BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);

#endif
// end bla_cabs1.h

// -- Level-1 BLAS prototypes --
// In the sections below the template macro is always (re)defined, while its
// instantiation is guarded by BLIS_ENABLE_BLAS.

// begin bla_amax.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// amax: returns f77_int; the name is built from the literal prefix "i", chx
// and blasname.
#undef GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif
// end bla_amax.h
// begin bla_asum.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// asum: returns ftype_r; the name is built from chr, chx and blasname.
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif
// end bla_asum.h
// begin bla_axpy.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// axpy: x is const, y is the only non-const operand.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif
// end bla_axpy.h
// begin bla_copy.h

//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( copy ) #endif // end bla_copy.h // begin bla_dot.h #ifdef BLIS_ENABLE_BLAS // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTR_BLAS( dot ) #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL INSERT_GENTPROTDOTC_BLAS( dot ) #else // For the "intel" complex return type, we use a hidden parameter (passed by // address) to return the result. #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \ ( \ ftype* rhop, \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTC_BLAS( dot ) #endif // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS float PASTEF77(sd,sdot) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); BLIS_EXPORT_BLAS double PASTEF77(d,sdot) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); #endif // end bla_dot.h // begin bla_nrm2.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end bla_nrm2.h // begin bla_rot.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s); #endif // end bla_rot.h // begin bla_rotg.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s); BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s); #endif // end bla_rotg.h // begin bla_rotm.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam); BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam); #endif // end bla_rotm.h // begin bla_rotmg.h 
#ifdef BLIS_ENABLE_BLAS

// rotmg: real types only (s and d). The fourth argument (sy1/dy1) is the only
// const pointer; the parameter array is non-const here.
BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam);
BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);

#endif
// end bla_rotmg.h
// begin bla_scal.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// scal: alpha and x have separate type parameters (ftype_a, ftype_x); the
// name is built in the order chx, cha, blasname.
#undef GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
     ( \
       const f77_int* n, \
       const ftype_a* alpha, \
       ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif
// end bla_scal.h
// begin bla_swap.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// swap: both x and y are non-const.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       ftype* x, const f77_int* incx, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif
// end bla_swap.h

// The f77_*_sub.h sections that follow declare "sub" variants: each returns
// void, appends "sub" to the generated name, and takes a trailing rval output
// pointer. Their instantiations are guarded by BLIS_ENABLE_CBLAS rather than
// BLIS_ENABLE_BLAS.

// begin f77_amax_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//
// amax sub: rval is f77_int*.
#undef GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       f77_int* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROT_BLAS( amax )
#endif
// end f77_amax_sub.h
// begin f77_asum_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//
// asum sub: rval is ftype_r*.
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       ftype_r* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif
// end f77_asum_sub.h
// begin f77_dot_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTDOT_BLAS( dot ) // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval ); BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* rval ); #endif // end f77_dot_sub.h // begin f77_nrm2_sub.h // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end f77_nrm2_sub.h // -- Level-2 BLAS prototypes -- // dense // begin bla_gemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemv ) #endif // end bla_gemv.h // begin bla_ger.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, chxy, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \ ( \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTDOT_BLAS( ger ) #endif // end bla_ger.h // begin bla_hemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemv ) #endif // end bla_hemv.h // begin bla_her.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype_r* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her ) #endif // end bla_her.h // begin bla_her2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2 ) #endif // end bla_her2.h // begin bla_symv.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( symv ) #endif // end bla_symv.h // begin bla_syr.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr ) #endif // end bla_syr.h // begin bla_syr2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr2 ) #endif // end bla_syr2.h // begin bla_trmv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmv ) #endif // end bla_trmv.h // begin bla_trsv.h // // Prototype BLAS-to-BLIS interfaces. 
//

// Template for "trsv": identical argument list to the trmv prototype
// (uploa, transa, diaga, m, a/lda, x/incx); x is the only writable operand.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const ftype*    a, const f77_int* lda, \
             ftype*    x, const f77_int* incx  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif
// end bla_trsv.h

// ---------------------------------------------------------------------------
// Level-2 argument-check macros.
//
// Every bla_*_check macro below follows the same pattern:
//   - it expands to a brace-enclosed statement block, so its locals (info,
//     flag variables, func_str) do not leak into the caller;
//   - the tests form a single if/else-if chain, so info records only the
//     FIRST failing condition, in the order written;
//   - when info is non-zero it formats dt_str and op_str into func_str,
//     upper-cases it with bli_string_mkupper(), passes it with &info to
//     xerbla, and then executes `return;` -- i.e. it returns from the function
//     in which the macro is expanded, which therefore must return void.
// All integer arguments (m, n, lda, incx, ...) are dereferenced, so they must
// be pointers. The character flags are handed to lsame() unmodified.
// ---------------------------------------------------------------------------

// begin bla_gemv_check.h

#ifdef BLIS_ENABLE_BLAS

// gemv: info = 1 bad transa (not N/T/C), 2 m < 0, 3 n < 0,
// 6 lda < max(1,m), 8 incx == 0, 11 incy == 0.
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
	f77_int info = 0; \
	f77_int nota, ta, conja; \
\
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
	if      ( !nota && !ta && !conja ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *n < 0 ) \
		info = 3; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 6; \
	else if ( *incx == 0 ) \
		info = 8; \
	else if ( *incy == 0 ) \
		info = 11; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_gemv_check.h
// begin bla_ger_check.h

#ifdef BLIS_ENABLE_BLAS

// ger: info = 1 m < 0, 2 n < 0, 5 incx == 0, 7 incy == 0, 9 lda < max(1,m).
// Unlike the other checks, the routine name is assembled from three pieces:
// dt_str, op_str and conj_str (the latter left-justified in a 2-wide field).
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
	f77_int info = 0; \
\
	if      ( *m < 0 ) \
		info = 1; \
	else if ( *n < 0 ) \
		info = 2; \
	else if ( *incx == 0 ) \
		info = 5; \
	else if ( *incy == 0 ) \
		info = 7; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 9; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
\
		sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_ger_check.h
// begin bla_hemv_check.h

#ifdef BLIS_ENABLE_BLAS

// hemv: info = 1 bad uploa (not L/U), 2 m < 0, 5 lda < max(1,m),
// 7 incx == 0, 10 incy == 0.
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
\
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 5; \
	else if ( *incx == 0 ) \
		info = 7; \
	else if ( *incy == 0 ) \
		info = 10; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_hemv_check.h
// begin bla_her_check.h

#ifdef BLIS_ENABLE_BLAS

// her: info = 1 bad uploa (not L/U), 2 m < 0, 5 incx == 0,
// 7 lda < max(1,m).
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
\
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *incx == 0 ) \
		info = 5; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 7; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_her_check.h
// begin bla_her2_check.h

#ifdef BLIS_ENABLE_BLAS

// her2: info = 1 bad uploa (not L/U), 2 m < 0, 5 incx == 0, 7 incy == 0,
// 9 lda < max(1,m).
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
\
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *incx == 0 ) \
		info = 5; \
	else if ( *incy == 0 ) \
		info = 7; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 9; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_her2_check.h
// begin bla_symv_check.h

#ifdef BLIS_ENABLE_BLAS

// symv reuses the hemv checks verbatim (same parameters, same info codes).
#define bla_symv_check bla_hemv_check

#endif
// end bla_symv_check.h
// begin bla_syr_check.h

#ifdef BLIS_ENABLE_BLAS

// syr reuses the her checks verbatim.
#define bla_syr_check bla_her_check

#endif
// end bla_syr_check.h
// begin bla_syr2_check.h

#ifdef BLIS_ENABLE_BLAS

// syr2 reuses the her2 checks verbatim.
#define bla_syr2_check bla_her2_check

#endif
// end bla_syr2_check.h
// begin bla_trmv_check.h

#ifdef BLIS_ENABLE_BLAS

// trmv: info = 1 bad uploa (not L/U), 2 bad transa (not N/T/C),
// 3 bad diaga (not U/N), 4 m < 0, 6 lda < max(1,m), 8 incx == 0.
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
	f77_int nota, ta, conja; \
	f77_int unita, nonua; \
\
	lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
	nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !ta && !conja ) \
		info = 2; \
	else if ( !unita && !nonua ) \
		info = 3; \
	else if ( *m < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 6; \
	else if ( *incx == 0 ) \
		info = 8; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_trmv_check.h
// begin bla_trsv_check.h

#ifdef BLIS_ENABLE_BLAS

// trsv reuses the trmv checks verbatim.
#define bla_trsv_check bla_trmv_check

#endif
// end bla_trsv_check.h

// packed
// The packed and banded prototypes below are written out explicitly per
// datatype prefix (c/d/s/z) rather than generated from a GENTPROT template.
// Unlike the templated prototypes above, they are declared to return int and
// use the bla_* argument types.
// begin bla_hpmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *ap, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hpmv.h
// begin bla_hpr.h

#ifdef BLIS_ENABLE_BLAS

// hpr: alpha is a real scalar (bla_real / bla_double) even though x and ap
// are complex.
BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_scomplex *x, const bla_integer *incx, bla_scomplex *ap);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap);

#endif
// end bla_hpr.h
// begin bla_hpr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *y, const bla_integer *incy, bla_scomplex *ap);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap);

#endif
// end bla_hpr2.h
// begin bla_spmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *ap, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_spmv.h
// begin bla_spr.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, bla_double *ap);
BLIS_EXPORT_BLAS int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap);

#endif
// end bla_spr.h
// begin bla_spr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, const bla_double *y, const bla_integer *incy, bla_double *ap);
BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, const bla_real *y, const bla_integer *incy, bla_real *ap);

#endif
// end bla_spr2.h
// begin bla_tpmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpmv.h
// begin bla_tpsv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpsv.h

// banded
// begin bla_gbmv.h

#ifdef BLIS_ENABLE_BLAS

// gbmv takes two extra integer arguments, kl and ku, between n and alpha.
BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer * incx, const bla_real *beta, bla_real *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex * y, const bla_integer *incy);

#endif
// end bla_gbmv.h
// begin bla_hbmv.h
// Banded level-2 prototypes (hbmv, sbmv, tbmv, tbsv). Each is written out
// explicitly per datatype prefix, is declared to return int, and is only
// visible when BLIS_ENABLE_BLAS is defined. Compared with the packed
// variants, these take an additional integer k and a leading dimension lda
// for the matrix a.

#ifdef BLIS_ENABLE_BLAS

// hbmv: complex datatypes only (c, z).
BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hbmv.h
// begin bla_sbmv.h

#ifdef BLIS_ENABLE_BLAS

// sbmv: real datatypes only (d, s); same argument order as hbmv.
BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_sbmv.h
// begin bla_tbmv.h

#ifdef BLIS_ENABLE_BLAS

// tbmv: all four datatypes; x is the only writable operand.
BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbmv.h
// begin bla_tbsv.h

#ifdef BLIS_ENABLE_BLAS

// tbsv: same argument list as tbmv.
BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbsv.h

// -- Level-3 BLAS prototypes --

// begin bla_gemm.h

//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemm ) #endif // end bla_gemm.h // begin bla_hemm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemm ) #endif // end bla_hemm.h // begin bla_herk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype_r* alpha, \ const ftype* a, const f77_int* lda, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( herk ) #endif // end bla_herk.h // begin bla_her2k.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2k ) #endif // end bla_her2k.h // begin bla_symm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( symm ) #endif // end bla_symm.h // begin bla_syrk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syrk ) #endif // end bla_syrk.h // begin bla_syr2k.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syr2k ) #endif // end bla_syr2k.h // begin bla_trmm.h // // Prototype BLAS-to-BLIS interfaces. 
//

// Template for "trmm": four character flags (side, uploa, transa, diaga),
// dimensions m and n, scalar alpha, the const matrix a (lda) and the matrix
// b (ldb), which is the only writable operand.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
             ftype*    b, const f77_int* ldb  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmm )
#endif
// end bla_trmm.h
// begin bla_trsm.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// Template for "trsm": identical argument list to the trmm prototype.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
             ftype*    b, const f77_int* ldb  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsm )
#endif
// end bla_trsm.h

// ---------------------------------------------------------------------------
// Level-3 argument-check macros.
//
// Every bla_*_check macro below follows the same pattern:
//   - it expands to a brace-enclosed statement block, so its locals do not
//     leak into the caller;
//   - the tests form a single if/else-if chain, so info records only the
//     FIRST failing condition, in the order written;
//   - when info is non-zero it formats dt_str and op_str into func_str,
//     upper-cases it with bli_string_mkupper(), passes it with &info to
//     xerbla, and then executes `return;` -- i.e. it returns from the function
//     in which the macro is expanded, which therefore must return void.
// All integer arguments are dereferenced, so they must be pointers.
// Each macro refers only to its own parameters; it must never depend on a
// variable that merely happens to exist at the expansion site.
// ---------------------------------------------------------------------------

// begin bla_gemm_check.h

#ifdef BLIS_ENABLE_BLAS

// gemm: info = 1 bad transa, 2 bad transb, 3 m < 0, 4 n < 0, 5 k < 0,
// 8 lda < max(1,nrowa), 10 ldb < max(1,nrowb), 13 ldc < max(1,m), where
// nrowa is m when transa is "N" and k otherwise, and nrowb is k when transb
// is "N" and n otherwise.
#define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int nota,  notb; \
	f77_int conja, conjb; \
	f77_int ta,    tb; \
	f77_int nrowa, nrowb; \
\
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else        { nrowa = *k; } \
	if ( notb ) { nrowb = *k; } \
	else        { nrowb = *n; } \
\
	if      ( !nota && !conja && !ta ) \
		info = 1; \
	else if ( !notb && !conjb && !tb ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *n < 0 ) \
		info = 4; \
	else if ( *k < 0 ) \
		info = 5; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 8; \
	else if ( *ldb < bli_max( 1, nrowb ) ) \
		info = 10; \
	else if ( *ldc < bli_max( 1, *m    ) ) \
		info = 13; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_gemm_check.h
// begin bla_hemm_check.h

#ifdef BLIS_ENABLE_BLAS

// hemm: info = 1 bad sidea (not L/R), 2 bad uploa (not L/U), 3 m < 0,
// 4 n < 0, 7 lda < max(1,nrowa), 9 ldb < max(1,m), 12 ldc < max(1,m), where
// nrowa is m when sidea is "L" and n otherwise.
#define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int left, right; \
	f77_int lower, upper; \
	f77_int nrowa; \
\
	left  = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
	right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( left ) { nrowa = *m; } \
	else        { nrowa = *n; } \
\
	if      ( !left && !right ) \
		info = 1; \
	else if ( !lower && !upper ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *n < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 7; \
	else if ( *ldb < bli_max( 1, *m    ) ) \
		info = 9; \
	else if ( *ldc < bli_max( 1, *m    ) ) \
		info = 12; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_hemm_check.h
// begin bla_herk_check.h

#ifdef BLIS_ENABLE_BLAS

// herk: info = 1 bad uploa (not L/U), 2 bad transa (not N/C), 3 m < 0,
// 4 k < 0, 7 lda < max(1,nrowa), 10 ldc < max(1,m), where nrowa is m when
// transa is "N" and k otherwise.
//
// The body reads the uplo flag through the macro's own `uploa` parameter.
// It previously referenced an identifier named `uploc`, which is not a
// parameter of this macro: the argument passed as `uploa` was ignored and the
// macro only compiled when a variable called `uploc` was in scope at the
// expansion site. Callers that pass a variable named `uploc` see no change.
#define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
	f77_int info = 0; \
	f77_int nota, conja; \
	f77_int lower, upper; \
	f77_int nrowa; \
\
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else        { nrowa = *k; } \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !conja ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *k < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 7; \
	else if ( *ldc < bli_max( 1, *m    ) ) \
		info = 10; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_herk_check.h
// begin bla_her2k_check.h

#ifdef BLIS_ENABLE_BLAS

// her2k: info = 1 bad uploa (not L/U), 2 bad trans (not N/C), 3 m < 0,
// 4 k < 0, 7 lda < max(1,nrowa), 9 ldb < max(1,nrowa), 12 ldc < max(1,m).
#define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int nota, conja; \
	f77_int lower, upper; \
	f77_int nrowa; \
\
	nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else        { nrowa = *k; } \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !conja ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *k < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 7; \
	else if ( *ldb < bli_max( 1, nrowa ) ) \
		info = 9; \
	else if ( *ldc < bli_max( 1, *m    ) ) \
		info = 12; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_her2k_check.h
// begin bla_symm_check.h

#ifdef BLIS_ENABLE_BLAS

// symm reuses the hemm checks verbatim.
#define bla_symm_check bla_hemm_check

#endif
// end bla_symm_check.h
// begin bla_syrk_check.h

#ifdef BLIS_ENABLE_BLAS

// syrk: info = 1 bad uploa (not L/U), 2 bad transa, 3 m < 0, 4 k < 0,
// 7 lda < max(1,nrowa), 10 ldc < max(1,m).
//
// Whether "C" is an acceptable transa depends on the datatype: is_r is set
// when the first character of dt_str is 's' or 'd', and only then does "C"
// pass the transa test; for any other dt_str only "N" and "T" are accepted.
//
// As in bla_herk_check, the uplo flag is read through the macro's own `uploa`
// parameter rather than through an unrelated identifier `uploc` that had to
// exist at the expansion site.
#define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
	f77_int info = 0; \
	f77_int is_r; \
	f77_int nota, ta, cta; \
	f77_int lower, upper; \
	f77_int nrowa; \
\
	static char* dt_cst = dt_str; \
\
	is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	cta   = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else        { nrowa = *k; } \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !ta && (is_r ? !cta : 1) ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *k < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 7; \
	else if ( *ldc < bli_max( 1, *m    ) ) \
		info = 10; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_syrk_check.h
// begin bla_syr2k_check.h

#ifdef BLIS_ENABLE_BLAS

// syr2k: info = 1 bad uploa (not L/U), 2 bad trans (same datatype-dependent
// rule as syrk), 3 m < 0, 4 k < 0, 7 lda < max(1,nrowa),
// 9 ldb < max(1,nrowa), 12 ldc < max(1,m).
#define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int is_r; \
	f77_int nota, ta, cta; \
	f77_int lower, upper; \
	f77_int nrowa; \
\
	static char* dt_cst = dt_str; \
\
	is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
	nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \
	cta   = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else        { nrowa = *k; } \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !ta && (is_r ? !cta : 1) ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *k < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 7; \
	else if ( *ldb < bli_max( 1, nrowa ) ) \
		info = 9; \
	else if ( *ldc < bli_max( 1, *m    ) ) \
		info = 12; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_syr2k_check.h
// begin bla_trmm_check.h

#ifdef BLIS_ENABLE_BLAS

// trmm: info = 1 bad sidea (not L/R), 2 bad uploa (not L/U), 3 bad transa
// (not N/T/C), 4 bad diaga (not U/N), 5 m < 0, 6 n < 0,
// 9 lda < max(1,nrowa), 11 ldb < max(1,m), where nrowa is m when sidea is
// "L" and n otherwise.
#define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \
{ \
	f77_int info = 0; \
	f77_int left, right; \
	f77_int lower, upper; \
	f77_int nota, ta, conja; \
	f77_int unita, nonua; \
	f77_int nrowa; \
\
	left  = PASTEF770(lsame)( sidea,  "L", (ftnlen)1, (ftnlen)1 ); \
	right = PASTEF770(lsame)( sidea,  "R", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
	nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
	if ( left ) { nrowa = *m; } \
	else        { nrowa = *n; } \
\
	if      ( !left && !right ) \
		info = 1; \
	else if ( !lower && !upper ) \
		info = 2; \
	else if ( !nota && !ta && !conja ) \
		info = 3; \
	else if ( !unita && !nonua ) \
		info = 4; \
	else if ( *m < 0 ) \
		info = 5; \
	else if ( *n < 0 ) \
		info = 6; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 9; \
	else if ( *ldb < bli_max( 1, *m    ) ) \
		info = 11; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_trmm_check.h
// begin bla_trsm_check.h

#ifdef BLIS_ENABLE_BLAS

// trsm reuses the trmm checks verbatim.
#define bla_trsm_check bla_trmm_check

#endif
// end bla_trsm_check.h

// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// Template for "axpby": length n, scalars alpha and beta, the const vector x
// (incx) and the writable vector y (incy).
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
       const ftype*   beta, \
             ftype*   y, const f77_int* incy  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif
// end bla_axpby.h

// level-3
// begin bla_gemmt.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// Template for "gemmt": the gemm argument list with a leading uploc flag and
// without a separate n dimension (only m and k are passed).
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif
// end bla_gemmt.h
// begin bla_gemmt_check.h

#ifdef BLIS_ENABLE_BLAS

// gemmt: info = 1 bad uploc (not L/U), 2 bad transa, 3 bad transb, 4 m < 0,
// 5 k < 0, 8 lda < max(1,nrowa), 10 ldb < max(1,nrowb), 13 ldc < max(1,m),
// where nrowa is m when transa is "N" and k otherwise, and nrowb is k when
// transb is "N" and m otherwise.
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int nota,  notb; \
	f77_int conja, conjb; \
	f77_int ta,    tb; \
	f77_int lower, upper; \
	f77_int nrowa, nrowb; \
\
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
	lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else        { nrowa = *k; } \
	if ( notb ) { nrowb = *k; } \
	else        { nrowb = *m; } \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !conja && !ta ) \
		info = 2; \
	else if ( !notb && !conjb && !tb ) \
		info = 3; \
	else if ( *m < 0 ) \
		info = 4; \
	else if ( *k < 0 ) \
		info = 5; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 8; \
	else if ( *ldb < bli_max( 1, nrowb ) ) \
		info = 10; \
	else if ( *ldc < bli_max( 1, *m    ) ) \
		info = 13; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// Template for "gemm_batch": the gemm argument list with every argument
// replaced by an array (the matrices become arrays of pointers, ftype**),
// followed by group_count and group_size.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa_array, \
       const f77_char* transb_array, \
       const f77_int*  m_array, \
       const f77_int*  n_array, \
       const f77_int*  k_array, \
       const ftype*    alpha_array, \
       const ftype**   a_array, const f77_int* lda_array, \
       const ftype**   b_array, const f77_int* ldb_array, \
       const ftype*    beta_array, \
             ftype**   c_array, const f77_int* ldc_array, \
       const f77_int*  group_count, \
       const f77_int*  group_size \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif
// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTCO is the prototype template for this routine family; it is
// instantiated through INSERT_GENTPROTCO_BLAS(), which is defined elsewhere
// in blis.h. The ftype_r and chr parameters are accepted but not used by
// this particular template. All arguments are passed by pointer.
//
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  n, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( gemm3m )
#endif

// end bla_gemm3m.h
// begin bla_gemm3m_check.h

#ifdef BLIS_ENABLE_BLAS

//
// bla_gemm3m_check: argument validation for the gemm3m compatibility routine.
//
// The macro expands to a compound statement that inspects the (pointer)
// arguments in order and records, in 'info', the 1-based position of the
// first invalid argument within the gemm3m argument list prototyped above
// (transa=1, transb=2, m=3, n=4, k=5, lda=8, ldb=10, ldc=13).
//
// If any argument is invalid, the routine name is built from dt_str and
// op_str, upper-cased, and reported through xerbla; the macro then executes
// 'return;'. It must therefore only be expanded inside a function whose
// return type is void.
//
// The name is formatted with snprintf() bounded by sizeof( func_str ) so that
// an unexpectedly long dt_str/op_str is truncated instead of overrunning the
// fixed-size buffer.
//
#define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota,  notb; \
    f77_int conja, conjb; \
    f77_int ta,    tb; \
    f77_int nrowa, nrowb; \
\
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else        { nrowb = *n; } \
\
    if      ( !nota && !conja && !ta ) \
        info = 1; \
    else if ( !notb && !conjb && !tb ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *n < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m    ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemm3m_check.h

// -- Fortran-compatible APIs to BLIS functions --

// begin b77_thread.h

//
// Prototype Fortran-compatible BLIS interfaces.
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); // end b77_thread.h #endif // BLIS_ENABLE_BLAS // end bli_blas.h // -- CBLAS compatibility layer -- // begin bli_cblas.h #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. // begin cblas.h #ifndef CBLAS_H #define CBLAS_H #include // skipped // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. 
// Exactly one of BLIS_ARCH_64 / BLIS_ARCH_32 is defined by this test.
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
    defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64)
#define BLIS_ARCH_64
#else
#define BLIS_ARCH_32
#endif

// Determine the target operating system.
//
// Exactly one BLIS_OS_* macro is defined, always with the value 1 so that it
// can be tested with either '#if BLIS_OS_X' or '#ifdef BLIS_OS_X' (an
// undefined BLIS_OS_* macro evaluates to 0 inside '#if'). The first matching
// branch wins, so the order of the tests below is significant.
#if defined(BLIS_ENABLE_SYSTEM)
#if defined(_WIN32) || defined(__CYGWIN__)
#define BLIS_OS_WINDOWS 1
#elif defined(__gnu_hurd__)
#define BLIS_OS_GNU 1
#elif defined(__APPLE__) || defined(__MACH__)
#define BLIS_OS_OSX 1
#elif defined(__ANDROID__)
#define BLIS_OS_ANDROID 1
#elif defined(__linux__)
#define BLIS_OS_LINUX 1
#elif defined(__bgq__)
#define BLIS_OS_BGQ 1
#elif defined(__bg__)
#define BLIS_OS_BGP 1
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
      defined(__bsdi__) || defined(__DragonFly__) || \
      defined(__FreeBSD_kernel__) || defined(__HAIKU__)
#define BLIS_OS_BSD 1
#elif defined(__EMSCRIPTEN__) || defined(EMSCRIPTEN)
// __EMSCRIPTEN__ is the macro Emscripten always predefines; the legacy
// EMSCRIPTEN spelling is still accepted for older toolchains.
#define BLIS_OS_EMSCRIPTEN 1
#else
#error "Cannot determine operating system"
#endif
#else // #if defined(BLIS_DISABLE_SYSTEM)
#define BLIS_OS_NONE 1
#endif

// A few changes that may be necessary in Windows environments.
#if BLIS_OS_WINDOWS

  // Include Windows header file.
  #define WIN32_LEAN_AND_MEAN
  #define VC_EXTRALEAN
  #include <windows.h> // skipped

  #if !defined(__clang__) && !defined(__GNUC__)
    // Undefine attribute specifiers in Windows.
    #define __attribute__(x)

    // Undefine restrict.
    #define restrict
  #endif

#endif

// time.h provides clock_gettime().
#if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_X86_64_NO_ZEN2 // Enabled sub-configurations (config_list) #define BLIS_CONFIG_SKX #define BLIS_CONFIG_HASWELL #define BLIS_CONFIG_SANDYBRIDGE #define BLIS_CONFIG_PENRYN #define BLIS_CONFIG_ZEN #define BLIS_CONFIG_EXCAVATOR #define BLIS_CONFIG_STEAMROLLER #define BLIS_CONFIG_PILEDRIVER #define BLIS_CONFIG_BULLDOZER #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_SKX #define BLIS_KERNELS_SANDYBRIDGE #define BLIS_KERNELS_PENRYN #define BLIS_KERNELS_HASWELL #define BLIS_KERNELS_ZEN #define BLIS_KERNELS_PILEDRIVER #define BLIS_KERNELS_BULLDOZER #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define 
BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. 
// (Both branches are intentionally empty; the conditional only documents the
// option and its default.)
#ifdef BLIS_ENABLE_C99_COMPLEX
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif

// -- MULTITHREADING -----------------------------------------------------------

// Enable multithreading via POSIX threads.
#ifdef BLIS_ENABLE_PTHREADS
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif

// Enable multithreading via OpenMP.
#ifdef BLIS_ENABLE_OPENMP
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif

// Perform a sanity check to make sure the user doesn't try to enable
// both OpenMP and pthreads.
#if defined ( BLIS_ENABLE_OPENMP ) && \
    defined ( BLIS_ENABLE_PTHREADS )
  #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined."
#endif

// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP
// or pthreads are enabled. This macro is useful in situations when
// we want to detect use of either OpenMP or pthreads (as opposed
// to neither being used).
#if defined ( BLIS_ENABLE_OPENMP ) || \
    defined ( BLIS_ENABLE_PTHREADS )
  #define BLIS_ENABLE_MULTITHREADING
#endif

// Enable the use of prime numbers of threads when requesting automatic thread
// factorization. When disabled, requesting a prime number of threads will
// result in a reduction (by one) of the number of threads, provided that the
// prime number exceeds a minimum threshold (see below).
//
// After this conditional, exactly one of BLIS_ENABLE_AUTO_PRIME_NUM_THREADS
// and BLIS_DISABLE_AUTO_PRIME_NUM_THREADS is defined; ENABLE takes precedence
// if the user defined both.
#ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS
  #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#else
  // Default behavior is disabled.
  #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled.
  #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#endif

// Set the maximum requested number of threads that BLIS will accept from the
// user that may be prime. If a larger prime number of threads is requested,
// it will be reduced by one to allow for more efficient thread factorizations.
// This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined.
#ifndef BLIS_NT_MAX_PRIME
  #define BLIS_NT_MAX_PRIME 11
#endif

// -- MIXED DATATYPE SUPPORT ---------------------------------------------------

// Enable mixed datatype support?
// (The user-facing switch is BLIS_DISABLE_MIXED_DT; the macro that is
// defined or undefined here is BLIS_ENABLE_GEMM_MD.)
#ifdef BLIS_DISABLE_MIXED_DT
  #undef BLIS_ENABLE_GEMM_MD
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD
#endif

// Enable memory-intensive optimizations for mixed datatype support?
// (Switch: BLIS_DISABLE_MIXED_DT_EXTRA_MEM; resulting macro:
// BLIS_ENABLE_GEMM_MD_EXTRA_MEM.)
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
  #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#endif

// -- MISCELLANEOUS OPTIONS ----------------------------------------------------

// Do NOT require the cross-blocksize constraints. That is, do not enforce
// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY
// needed when implementing trsm_r by allowing the right-hand matrix B to
// be triangular.
#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
  #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#endif

// -- BLAS COMPATIBILITY LAYER -------------------------------------------------

// Enable the BLAS compatibility layer?
// (BLIS_DISABLE_BLAS wins: if it is defined, BLIS_ENABLE_BLAS is removed even
// if the user also defined it.)
#ifdef BLIS_DISABLE_BLAS
  #undef BLIS_ENABLE_BLAS
#else
  // Default behavior is enabled.
  #undef BLIS_ENABLE_BLAS // In case user explicitly enabled.
  #define BLIS_ENABLE_BLAS
#endif

// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#ifndef BLIS_BLAS_INT_TYPE_SIZE
  #define BLIS_BLAS_INT_TYPE_SIZE 32
#endif

// By default, the level-3 BLAS routines are implemented by directly calling
// the BLIS object API. Alternatively, they may first call the typed BLIS
// API, which will then call the object API.
//#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specifed by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. 
#ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overriden by the main // program simply by providing a new definition. However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. 
#include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. // NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. #ifndef TRUE #define TRUE true #endif #ifndef FALSE #define FALSE false #endif // -- Special-purpose integers -- // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. 
#ifndef _DEFINED_DIM_T #define _DEFINED_DIM_T typedef gint_t dim_t; // dimension type #endif typedef gint_t inc_t; // increment/stride type typedef gint_t doff_t; // diagonal offset type typedef guint_t siz_t; // byte size type typedef uint32_t objbits_t; // object information bit field // -- Real types -- // Define the number of floating-point types supported, and the size of the // largest type. #define BLIS_NUM_FP_TYPES 4 #define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) // There are some places where we need to use sizeof() inside of a C // preprocessor #if conditional, and so here we define the various sizes // for those purposes. #define BLIS_SIZEOF_S 4 // sizeof(float) #define BLIS_SIZEOF_D 8 // sizeof(double) #define BLIS_SIZEOF_C 8 // sizeof(scomplex) #define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) // -- Complex types -- #ifdef BLIS_ENABLE_C99_COMPLEX #if __STDC_VERSION__ >= 199901L #include // skipped // Typedef official complex types to BLIS complex type names. typedef float complex scomplex; typedef double complex dcomplex; #else #error "Configuration requested C99 complex types, but C99 does not appear to be supported." #endif #else // ifndef BLIS_ENABLE_C99_COMPLEX // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_SCOMPLEX #define _DEFINED_SCOMPLEX typedef struct scomplex { float real; float imag; } scomplex; #endif // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DCOMPLEX #define _DEFINED_DCOMPLEX typedef struct dcomplex { double real; double imag; } dcomplex; #endif #endif // BLIS_ENABLE_C99_COMPLEX // -- Atom type -- // Note: atom types are used to hold "bufferless" scalar object values. Note // that it needs to be as large as the largest possible scalar value we might // want to hold. Thus, for now, it is a dcomplex. 
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// (Any value of BLIS_BLAS_INT_TYPE_SIZE other than 32 or 64 selects
// 'long int'.)
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
// NOTE(review): void_fp is currently an object pointer (void*), not a
// function pointer type; the function-pointer form is kept commented out
// below.

//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );

//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT value is the position of the least-significant bit of the
// corresponding field. Indented names are sub-fields of the unindented field
// that precedes them.

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT               0
#define   BLIS_SCALAR_DOMAIN_SHIFT         0
#define   BLIS_SCALAR_PREC_SHIFT           1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
// NOTE: This mask occupies bits 29-31. The literal is unsigned because
// shifting a signed int 0x7 left by 29 does not fit in a 32-bit int, which
// is undefined behavior in C.
#define BLIS_COMP_DT_BITS                  ( 0x7u << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )

//
// -- BLIS enumerated type value definitions -----------------------------------
//

// These BITVAL constants are the in-place (already shifted) field values; the
// enumerations further below are defined directly in terms of them.

#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )

//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

typedef enum
{
    BLIS_NO_TRANSPOSE      = 0x0,
    BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
    BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
    BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

typedef enum
{
    BLIS_NO_CONJUGATE      = 0x0,
    BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

typedef enum
{
    BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
    BLIS_LOWER             = BLIS_BITVAL_LOWER,
    BLIS_UPPER             = BLIS_BITVAL_UPPER,
    BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

typedef enum
{
    BLIS_LEFT              = 0x0,
    BLIS_RIGHT
} side_t;

typedef enum
{
    BLIS_NONUNIT_DIAG      = 0x0,
    BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

typedef enum
{
    BLIS_NO_INVERT_DIAG    = 0x0,
    BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

typedef enum
{
    BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
    BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
    BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
    BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;

// -- Data type --

// BLIS_DT_LO and BLIS_DT_HI alias the first and last floating-point
// datatypes (BLIS_FLOAT and BLIS_DCOMPLEX).
typedef enum
{
    BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
    BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
    BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
    BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
    BLIS_INT               = BLIS_BITVAL_INT_TYPE,
    BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
    BLIS_DT_LO             = BLIS_FLOAT,
    BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

typedef enum
{
    BLIS_REAL              = BLIS_BITVAL_REAL,
    BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

typedef enum
{
    BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
    BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;

// -- Pack schema type --

// NOTE: BLIS_PACKED_UNSPEC, BLIS_PACKED_VECTOR and BLIS_PACKED_ROWS all
// have the same value (BLIS_PACK_BIT alone).
typedef enum
{
    BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
    BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
    BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
    BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
    BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
    BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
    BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
    BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
    BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
    BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
    BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3

// -- Pack order type --

typedef enum
{
    BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
    BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,

    BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
    BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;

// -- Pack buffer type --

typedef enum
{
    BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
    BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
    BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
    BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;

// -- Partitioning direction --

typedef enum
{
    BLIS_FWD,
    BLIS_BWD
} dir_t;

// -- Subpartition type --

typedef enum
{
    BLIS_SUBPART0,
    BLIS_SUBPART1,
    BLIS_SUBPART2,
    BLIS_SUBPART1AND0,
    BLIS_SUBPART1AND2,
    BLIS_SUBPART1A,
    BLIS_SUBPART1B,
    BLIS_SUBPART00,
    BLIS_SUBPART10,
    BLIS_SUBPART20,
    BLIS_SUBPART01,
    BLIS_SUBPART11,
    BLIS_SUBPART21,
    BLIS_SUBPART02,
    BLIS_SUBPART12,
    BLIS_SUBPART22
} subpart_t;

// -- Matrix dimension type --

typedef enum
{
    BLIS_M = 0,
    BLIS_N = 1
} mdim_t;

// -- Machine parameter types --

typedef enum
{
    BLIS_MACH_EPS = 0,
    BLIS_MACH_SFMIN,
    BLIS_MACH_BASE,
    BLIS_MACH_PREC,
    BLIS_MACH_NDIGMANT,
    BLIS_MACH_RND,
    BLIS_MACH_EMIN,
    BLIS_MACH_RMIN,
    BLIS_MACH_EMAX,
    BLIS_MACH_RMAX,
    BLIS_MACH_EPS2
} machval_t;

// Must equal the number of enumerators in machval_t above.
#define BLIS_NUM_MACH_PARAMS   11
#define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2

// -- Induced method types --

typedef enum
{
    BLIS_1M        = 0,
    BLIS_NAT,
    BLIS_IND_FIRST = 0,
    BLIS_IND_LAST  = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, 
BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // 
- keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. } bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. 
// Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! 
// -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: 
Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. 
//num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. typedef struct constdata_s { float s; double d; scomplex c; dcomplex z; gint_t i; } constdata_t; // // -- BLIS object type definitions --------------------------------------------- // // Forward declarations for function pointer types struct obj_s; struct cntx_s; struct rntm_s; struct thrinfo_s; typedef void (*obj_pack_fn_t) ( struct obj_s* a, struct obj_s* ap, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) ( struct obj_s* a, struct obj_s* b, struct obj_s* c, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef struct obj_s { // Basic fields struct obj_s* root; dim_t off[2]; dim_t dim[2]; doff_t diag_off; objbits_t info; objbits_t info2; siz_t elem_size; void* buffer; inc_t rs; inc_t cs; inc_t is; // Bufferless scalar storage atom_t scalar; // Pack-related fields dim_t m_padded; // m dimension of matrix, including any padding dim_t n_padded; // n dimension of matrix, including any padding inc_t ps; // panel stride (distance to next panel) inc_t pd; // panel dimension (the "width" of a panel: // usually MR or NR) dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel // User-customizable fields obj_pack_fn_t pack_fn; void* pack_params; obj_ker_fn_t ker_fn; void* ker_params; } obj_t; // Pre-initializors. 
Things that must be set afterwards: // - root object pointer // - info bitfields: dt, target_dt, exec_dt, comp_dt // - info2 bitfields: scalar_dt // - elem_size // - dims, strides // - buffer // - internal scalar buffer (must always set imaginary component) #define BLIS_OBJECT_INITIALIZER \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 0, 0 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( float ), \ \ .buffer = NULL, \ .rs = 0, \ .cs = 0, \ .is = 1, \ \ .scalar = { 0.0, 0.0 }, \ \ .m_padded = 0, \ .n_padded = 0, \ .ps = 0, \ .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ \ .pack_fn = NULL, \ .pack_params = NULL, \ .ker_fn = NULL, \ .ker_params = NULL \ } #define BLIS_OBJECT_INITIALIZER_1X1 \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( float ), \ \ .buffer = NULL, \ .rs = 0, \ .cs = 0, \ .is = 1, \ \ .scalar = { 0.0, 0.0 }, \ \ .m_padded = 0, \ .n_padded = 0, \ .ps = 0, \ .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ \ .pack_fn = NULL, \ .pack_params = NULL, \ .ker_fn = NULL, \ .ker_params = NULL \ } // Define these macros here since they must be updated if contents of // obj_t changes. 
// Copy every field of obj_t a into obj_t b, one field at a time. The copy is
// shallow: pointer members are copied as pointers, so b ends up referring to
// the same root, buffer, and pack/kernel parameter storage as a.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
	int i;

	// Basic fields.
	b->root = a->root;

	// off[] and dim[] each hold two entries.
	for ( i = 0; i < 2; ++i )
	{
		b->off[ i ] = a->off[ i ];
		b->dim[ i ] = a->dim[ i ];
	}
	b->diag_off  = a->diag_off;

	b->info      = a->info;
	b->info2     = a->info2;
	b->elem_size = a->elem_size;

	b->buffer    = a->buffer;
	b->rs        = a->rs;
	b->cs        = a->cs;
	b->is        = a->is;

	// Bufferless scalar storage.
	b->scalar    = a->scalar;

	// NOTE: No pack_mem entry is copied.

	// Pack-related fields.
	b->m_padded  = a->m_padded;
	b->n_padded  = a->n_padded;
	b->ps        = a->ps;
	b->pd        = a->pd;
	b->m_panel   = a->m_panel;
	b->n_panel   = a->n_panel;

	// User-customizable fields.
	b->pack_fn     = a->pack_fn;
	b->pack_params = a->pack_params;
	b->ker_fn      = a->ker_fn;
	b->ker_params  = a->ker_params;
}

// Copy every field of obj_t a into obj_t b EXCEPT dim[0] and dim[1], which are
// left untouched in b. As with the full copy, pointer members are copied as
// pointers.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
	int i;

	// Basic fields.
	b->root = a->root;

	for ( i = 0; i < 2; ++i )
	{
		b->off[ i ] = a->off[ i ];
	}

	// Avoid copying m and n (dim[0], dim[1]) since they will be overwritten.

	b->diag_off  = a->diag_off;

	b->info      = a->info;
	b->info2     = a->info2;
	b->elem_size = a->elem_size;

	b->buffer    = a->buffer;
	b->rs        = a->rs;
	b->cs        = a->cs;
	b->is        = a->is;

	// Bufferless scalar storage.
	b->scalar    = a->scalar;

	// NOTE: No pack_mem entry is copied.
	// FGVZ: You should probably make sure this is right.

	// Pack-related fields.
	b->m_padded  = a->m_padded;
	b->n_padded  = a->n_padded;
	b->ps        = a->ps;
	b->pd        = a->pd;
	b->m_panel   = a->m_panel;
	b->n_panel   = a->n_panel;

	// User-customizable fields.
	b->pack_fn     = a->pack_fn;
	b->pack_params = a->pack_params;
	b->ker_fn      = a->ker_fn;
	b->ker_params  = a->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/include/darwin-x86_64_no_zen3/000077500000000000000000000000001427272030600231345ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/darwin-x86_64_no_zen3/blis.h000066400000000000000000047045031427272030600242530ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). 
// begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_X86_64_NO_ZEN3 // Enabled sub-configurations (config_list) #define BLIS_CONFIG_SKX #define BLIS_CONFIG_KNL #define BLIS_CONFIG_HASWELL #define BLIS_CONFIG_SANDYBRIDGE #define BLIS_CONFIG_PENRYN #define BLIS_CONFIG_ZEN #define BLIS_CONFIG_ZEN2 #define BLIS_CONFIG_EXCAVATOR #define BLIS_CONFIG_STEAMROLLER #define BLIS_CONFIG_PILEDRIVER #define BLIS_CONFIG_BULLDOZER #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_SKX #define BLIS_KERNELS_KNL #define BLIS_KERNELS_SANDYBRIDGE #define BLIS_KERNELS_PENRYN #define BLIS_KERNELS_ZEN2 #define BLIS_KERNELS_HASWELL #define BLIS_KERNELS_ZEN #define BLIS_KERNELS_PILEDRIVER #define BLIS_KERNELS_BULLDOZER #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define 
BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. 
#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). #if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_lang_defs.h #ifndef BLIS_LANG_DEFS_H #define BLIS_LANG_DEFS_H // -- Undefine restrict for C++ and C89/90 -- #ifdef __cplusplus // Language is C++; define restrict as nothing. #ifndef restrict #define restrict #endif #elif __STDC_VERSION__ >= 199901L // Language is C99 (or later); do nothing since restrict is recognized. #else // Language is pre-C99; define restrict as nothing. 
#ifndef restrict #define restrict #endif #endif // -- Define typeof() operator if using non-GNU compiler -- #ifndef __GNUC__ #define typeof __typeof__ #else #ifndef typeof #define typeof __typeof__ #endif #endif // -- BLIS Thread Local Storage Keyword -- // __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support __thread, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) #define BLIS_THREAD_LOCAL __thread #else #define BLIS_THREAD_LOCAL #endif // -- BLIS constructor/destructor function attribute -- // __attribute__((constructor/destructor)) is supported by GCC only. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support this, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__ICC) || defined(__INTEL_COMPILER) // ICC defines __GNUC__ but doesn't support this #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #elif defined(__clang__) // CLANG supports __attribute__, but its documentation doesn't // mention support for constructor/destructor. Compiling with // clang and testing shows that it does support. 
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. 
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. 
Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specified by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. #ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overridden by the main // program simply by providing a new definition.
However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // -- Common BLIS definitions -- // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// NOTE: The macro tested here must be BLIS_BLAS_INT_TYPE_SIZE, i.e. the same
// macro that is given its default in the BLAS compatibility layer section of
// the configuration defaults and that selects the width of f77_int. An
// identifier that is not defined evaluates to 0 inside #if, so testing a
// misspelled name would silently disable this guard.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
  #undef  BLIS_INT_TYPE_SIZE
  #define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested.
// gint_t is the signed general-purpose integer and guint_t its unsigned
// counterpart; any size other than 32 or 64 falls back to "long int".
#if   BLIS_INT_TYPE_SIZE == 32
typedef           int32_t  gint_t;
typedef          uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
typedef           int64_t  gint_t;
typedef          uint64_t guint_t;
#else
typedef   signed long int  gint_t;
typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h.
#ifndef TRUE
  #define TRUE  true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
#define BLIS_SIZEOF_S 4 // sizeof(float) #define BLIS_SIZEOF_D 8 // sizeof(double) #define BLIS_SIZEOF_C 8 // sizeof(scomplex) #define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) // -- Complex types -- #ifdef BLIS_ENABLE_C99_COMPLEX #if __STDC_VERSION__ >= 199901L #include // skipped // Typedef official complex types to BLIS complex type names. typedef float complex scomplex; typedef double complex dcomplex; #else #error "Configuration requested C99 complex types, but C99 does not appear to be supported." #endif #else // ifndef BLIS_ENABLE_C99_COMPLEX // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_SCOMPLEX #define _DEFINED_SCOMPLEX typedef struct scomplex { float real; float imag; } scomplex; #endif // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DCOMPLEX #define _DEFINED_DCOMPLEX typedef struct dcomplex { double real; double imag; } dcomplex; #endif #endif // BLIS_ENABLE_C99_COMPLEX // -- Atom type -- // Note: atom types are used to hold "bufferless" scalar object values. Note // that it needs to be as large as the largest possible scalar value we might // want to hold. Thus, for now, it is a dcomplex. typedef dcomplex atom_t; // -- Fortran-77 types -- // Note: These types are typically only used by BLAS compatibility layer, but // we must define them even when the compatibility layer isn't being built // because they also occur in bli_slamch() and bli_dlamch(). // Define f77_int depending on what size of integer was requested. #if BLIS_BLAS_INT_TYPE_SIZE == 32 typedef int32_t f77_int; #elif BLIS_BLAS_INT_TYPE_SIZE == 64 typedef int64_t f77_int; #else typedef long int f77_int; #endif typedef char f77_char; typedef float f77_float; typedef double f77_double; typedef scomplex f77_scomplex; typedef dcomplex f77_dcomplex; // -- Misc. 
function pointer types -- // Note: This type should be used in any situation where the address of a // *function* will be conveyed or stored prior to it being typecast back // to the correct function type. It does not need to be used when conveying // or storing the address of *data* (such as an array of float or double). //typedef void (*void_fp)( void ); typedef void* void_fp; // Typedef function pointer types for malloc() and free() substitutes. typedef void* (*malloc_ft)( size_t size ); typedef void (*free_ft) ( void* p ); // // -- BLIS info bit field offsets ---------------------------------------------- // // info #define BLIS_DATATYPE_SHIFT 0 #define BLIS_DOMAIN_SHIFT 0 #define BLIS_PRECISION_SHIFT 1 #define BLIS_CONJTRANS_SHIFT 3 #define BLIS_TRANS_SHIFT 3 #define BLIS_CONJ_SHIFT 4 #define BLIS_UPLO_SHIFT 5 #define BLIS_UPPER_SHIFT 5 #define BLIS_DIAG_SHIFT 6 #define BLIS_LOWER_SHIFT 7 #define BLIS_UNIT_DIAG_SHIFT 8 #define BLIS_INVERT_DIAG_SHIFT 9 #define BLIS_TARGET_DT_SHIFT 10 #define BLIS_TARGET_DOMAIN_SHIFT 10 #define BLIS_TARGET_PREC_SHIFT 11 #define BLIS_EXEC_DT_SHIFT 13 #define BLIS_EXEC_DOMAIN_SHIFT 13 #define BLIS_EXEC_PREC_SHIFT 14 #define BLIS_PACK_SCHEMA_SHIFT 16 #define BLIS_PACK_RC_SHIFT 16 #define BLIS_PACK_PANEL_SHIFT 17 #define BLIS_PACK_FORMAT_SHIFT 18 #define BLIS_PACK_SHIFT 22 #define BLIS_PACK_REV_IF_UPPER_SHIFT 23 #define BLIS_PACK_REV_IF_LOWER_SHIFT 24 #define BLIS_PACK_BUFFER_SHIFT 25 #define BLIS_STRUC_SHIFT 27 #define BLIS_COMP_DT_SHIFT 29 #define BLIS_COMP_DOMAIN_SHIFT 29 #define BLIS_COMP_PREC_SHIFT 30 // info2 #define BLIS_SCALAR_DT_SHIFT 0 #define BLIS_SCALAR_DOMAIN_SHIFT 0 #define BLIS_SCALAR_PREC_SHIFT 1 // // -- BLIS info bit field masks ------------------------------------------------ // // info #define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT ) #define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT ) #define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT ) #define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT ) 
#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define 
BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 
0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = 
BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } 
machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. #define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, 
BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, 
BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // - keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
} bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. // Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. 
// begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. 
mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. //num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. 
// Holds one value in each of the representations float, double, scomplex,
// dcomplex and gint_t (fields s, d, c, z, i respectively).
typedef struct constdata_s
{
	float    s;
	double   d;
	scomplex c;
	dcomplex z;
	gint_t   i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the user-customizable packing function stored in
// obj_t.pack_fn.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of the user-customizable kernel function stored in
// obj_t.ker_fn.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// NOTE: BLIS_OBJECT_INITIALIZER, BLIS_OBJECT_INITIALIZER_1X1,
// bli_obj_init_const() and the bli_obj_init_*() copy functions enumerate
// these fields by name; keep them in sync when adding or removing a field.
typedef struct obj_s
{
	// Basic fields
	struct obj_s* root;

	dim_t         off[2];
	dim_t         dim[2];
	doff_t        diag_off;

	objbits_t     info;
	objbits_t     info2;
	siz_t         elem_size;

	void*         buffer;
	inc_t         rs;
	inc_t         cs;
	inc_t         is;

	// Bufferless scalar storage
	atom_t        scalar;

	// Pack-related fields
	dim_t         m_padded; // m dimension of matrix, including any padding
	dim_t         n_padded; // n dimension of matrix, including any padding
	inc_t         ps;       // panel stride (distance to next panel)
	inc_t         pd;       // panel dimension (the "width" of a panel:
	                        // usually MR or NR)
	dim_t         m_panel;  // m dimension of a "full" panel
	dim_t         n_panel;  // n dimension of a "full" panel

	// User-customizable fields
	obj_pack_fn_t pack_fn;
	void*         pack_params;
	obj_ker_fn_t  ker_fn;
	void*         ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Brace-enclosed designated initializer for an obj_t with 0 x 0 dimensions.
#define BLIS_OBJECT_INITIALIZER \
{ \
	.root      = NULL, \
\
	.off       = { 0, 0 }, \
	.dim       = { 0, 0 }, \
	.diag_off  = 0, \
\
	.info      = 0x0 | BLIS_BITVAL_DENSE      | \
	                   BLIS_BITVAL_GENERAL, \
	.info2     = 0x0, \
	.elem_size = sizeof( float ), \
\
	.buffer    = NULL, \
	.rs        = 0, \
	.cs        = 0, \
	.is        = 1, \
\
	.scalar    = { 0.0, 0.0 }, \
\
	.m_padded  = 0, \
	.n_padded  = 0, \
	.ps        = 0, \
	.pd        = 0, \
	.m_panel   = 0, \
	.n_panel   = 0, \
\
	.pack_fn     = NULL, \
	.pack_params = NULL, \
	.ker_fn      = NULL, \
	.ker_params  = NULL \
}

// Same as BLIS_OBJECT_INITIALIZER except that .dim is { 1, 1 }.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
	.root      = NULL, \
\
	.off       = { 0, 0 }, \
	.dim       = { 1, 1 }, \
	.diag_off  = 0, \
\
	.info      = 0x0 | BLIS_BITVAL_DENSE      | \
	                   BLIS_BITVAL_GENERAL, \
	.info2     = 0x0, \
	.elem_size = sizeof( float ), \
\
	.buffer    = NULL, \
	.rs        = 0, \
	.cs        = 0, \
	.is        = 1, \
\
	.scalar    = { 0.0, 0.0 }, \
\
	.m_padded  = 0, \
	.n_padded  = 0, \
	.ps        = 0, \
	.pd        = 0, \
	.m_panel   = 0, \
	.n_panel   = 0, \
\
	.pack_fn     = NULL, \
	.pack_params = NULL, \
	.ker_fn      = NULL, \
	.ker_params  = NULL \
}

// Define these functions here since they must be updated if the contents of
// obj_t change.
// Private helper: copies every obj_t field from a to b EXCEPT the two
// dimensions (dim[0], dim[1]). Both public initializers below are built on
// top of it so that the field list is maintained in exactly one place.
BLIS_INLINE void bli_obj_copy_nondim_fields_( obj_t* a, obj_t* b )
{
	// Basic fields (dimensions deliberately omitted).
	b->root        = a->root;

	b->off[0]      = a->off[0];
	b->off[1]      = a->off[1];
	b->diag_off    = a->diag_off;

	b->info        = a->info;
	b->info2       = a->info2;
	b->elem_size   = a->elem_size;

	b->buffer      = a->buffer;
	b->rs          = a->rs;
	b->cs          = a->cs;
	b->is          = a->is;

	// Bufferless scalar storage.
	b->scalar      = a->scalar;

	// Pack-related fields.
	b->m_padded    = a->m_padded;
	b->n_padded    = a->n_padded;
	b->ps          = a->ps;
	b->pd          = a->pd;
	b->m_panel     = a->m_panel;
	b->n_panel     = a->n_panel;

	// User-customizable fields.
	b->pack_fn     = a->pack_fn;
	b->pack_params = a->pack_params;
	b->ker_fn      = a->ker_fn;
	b->ker_params  = a->ker_params;
}

// Makes b a field-by-field (shallow) copy of a, dimensions included.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
	bli_obj_copy_nondim_fields_( a, b );

	b->dim[0] = a->dim[0];
	b->dim[1] = a->dim[1];
}

// Initializes b from a as the starting point of a subpartition: every field
// is copied except dim[0] and dim[1], which are left untouched because the
// caller overwrites them afterwards.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
	bli_obj_copy_nondim_fields_( a, b );
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// Each public macro FOO(...) forwards to a helper FOO_(...) that performs the
// actual ## pasting. Because arguments adjacent to ## are not macro-expanded,
// the forwarding step is what lets an argument that is itself a macro be
// expanded before it is pasted.

// PASTEMACn: paste n type characters and an operation name after the "bli_"
// prefix, e.g. PASTEMAC(d,axpyv) yields bli_daxpyv.
#define PASTEMAC0_(op)             bli_ ## op
#define PASTEMAC0(op)              PASTEMAC0_(op)

#define PASTEMAC_(ch,op)           bli_ ## ch  ## op
#define PASTEMAC(ch,op)            PASTEMAC_(ch,op)

#define PASTEMAC2_(ch1,ch2,op)     bli_ ## ch1 ## ch2 ## op
#define PASTEMAC2(ch1,ch2,op)      PASTEMAC2_(ch1,ch2,op)

#define PASTEMAC3_(ch1,ch2,ch3,op) bli_ ## ch1 ## ch2 ## ch3 ## op
#define PASTEMAC3(ch1,ch2,ch3,op)  PASTEMAC3_(ch1,ch2,ch3,op)

#define PASTEMAC4_(ch1,ch2,ch3,ch4,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
#define PASTEMAC4(ch1,ch2,ch3,ch4,op)  PASTEMAC4_(ch1,ch2,ch3,ch4,op)

#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op)  PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)

#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op)  PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)

// PASTEBLACHK: builds bla_<op>_check.
#define PASTEBLACHK_(op)           bla_ ## op ## _check
#define PASTEBLACHK(op)            PASTEBLACHK_(op)

// PASTECHn: same as PASTEMACn but with no "bli_" prefix.
#define PASTECH0_(op)              op
#define PASTECH0(op)               PASTECH0_(op)

#define PASTECH_(ch,op)            ch ## op
#define PASTECH(ch,op)             PASTECH_(ch,op)

#define PASTECH2_(ch1,ch2,op)      ch1 ## ch2 ## op
#define PASTECH2(ch1,ch2,op)       PASTECH2_(ch1,ch2,op)

#define PASTECH3_(ch1,ch2,ch3,op)  ch1 ## ch2 ## ch3 ## op
#define PASTECH3(ch1,ch2,ch3,op)   PASTECH3_(ch1,ch2,ch3,op)

// MKSTR stringifies its argument as written; STRINGIFY_INT goes through
// MKSTR so that a macro argument is expanded first and its value is what
// gets stringified.
#define MKSTR(s1)                  #s1
#define STRINGIFY_INT( s )         MKSTR( s )

// Fortran-77 name-mangling macros.
// Each of these pastes zero to three type characters in front of name and
// appends a trailing underscore.
#define PASTEF770(name)                                      name ## _
#define PASTEF77(ch1,name)                     ch1        ## name ## _
#define PASTEF772(ch1,ch2,name)         ch1 ## ch2        ## name ## _
#define PASTEF773(ch1,ch2,ch3,name)     ch1 ## ch2 ## ch3 ## name ## _

// -- Include other groups of macros

// begin bli_genarray_macro_defs.h


#ifndef BLIS_GENARRAY_MACRO_DEFS_H
#define BLIS_GENARRAY_MACRO_DEFS_H


// -- Macros to generate function arrays ---------------------------------------

// In every array below the type characters appear in the order s, c, d, z
// (followed by i in the "_I" variants), so that is also the index order of
// the generated arrays.

// -- "Smart" one-operand macro --

// Declares a static array named <opname>_fpa whose entries are
// bli_<ch><opname>, each cast to tname.
#define GENARRAY_FPA(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \
{ \
	( tname )PASTEMAC(s,opname), \
	( tname )PASTEMAC(c,opname), \
	( tname )PASTEMAC(d,opname), \
	( tname )PASTEMAC(z,opname)  \
}

// -- "Smart" one-operand macro (with integer support) --

// Same as GENARRAY_FPA but with one extra trailing entry, bli_i<opname>.
#define GENARRAY_FPA_I(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \
{ \
	( tname )PASTEMAC(s,opname), \
	( tname )PASTEMAC(c,opname), \
	( tname )PASTEMAC(d,opname), \
	( tname )PASTEMAC(z,opname), \
	( tname )PASTEMAC(i,opname)  \
}

// -- "Smart" two-operand macro --

// Declares a static 2D array named <op>_fpa2 whose [row][col] entry is
// bli_<rowch><colch><op>, cast to tname.
#define GENARRAY_FPA2(tname,op) \
\
static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \
	{ ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \
	{ ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \
	{ ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) }  \
}

// -- One-operand macro --

// Unlike the "smart" macros above, the macros from here on emit only the
// declarator and initializer ("arrayname[...] = { ... }"); the caller
// supplies the storage class and element type in front of the invocation,
// and no casts are applied to the entries.
#define GENARRAY(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
	PASTEMAC(s,op), \
	PASTEMAC(c,op), \
	PASTEMAC(d,op), \
	PASTEMAC(z,op)  \
}

#define GENARRAY_I(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES+1] = \
{ \
	PASTEMAC(s,op), \
	PASTEMAC(c,op), \
	PASTEMAC(d,op), \
	PASTEMAC(z,op), \
	PASTEMAC(i,op)  \
}

// -- One-operand macro (with custom prefix) --

// Entries are <prefix><ch><op>; no "bli_" prefix is added.
#define GENARRAY_PREF(arrayname,prefix,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
	PASTECH2(prefix,s,op), \
	PASTECH2(prefix,c,op), \
	PASTECH2(prefix,d,op), \
	PASTECH2(prefix,z,op)  \
}

// -- Two-operand macros --

// _ALL: every type combination is populated.
#define GENARRAY2_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
	{ PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
	{ PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
	{ PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// _EXT: only combinations drawn entirely from {s,c} or entirely from {d,z}
// are populated; every entry that mixes the two groups is NULL.
#define GENARRAY2_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL,              NULL,             }, \
	{ PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL,              NULL,             }, \
	{ NULL,              NULL,              PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
	{ NULL,              NULL,              PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// _MIN: only the entries whose type characters are all identical are
// populated; everything else is NULL.
#define GENARRAY2_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ PASTEMAC2(s,s,op), NULL,              NULL,              NULL,             }, \
	{ NULL,              PASTEMAC2(c,c,op), NULL,              NULL,             }, \
	{ NULL,              NULL,              PASTEMAC2(d,d,op), NULL,             }, \
	{ NULL,              NULL,              NULL,              PASTEMAC2(z,z,op) }  \
}

// -- Three-operand macros --

// Three-dimensional analogues of GENARRAY2_ALL/_EXT/_MIN; the entry at
// [i][j][k] is bli_<ch_i><ch_j><ch_k><op> or NULL.
#define GENARRAY3_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ \
	{ PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \
	{ PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \
	{ PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \
	{ PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) }  \
	}, \
	{ \
	{ PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \
	{ PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \
	{ PASTEMAC3(c,d,s,op), PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \
	{ PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) }  \
	}, \
	{ \
	{ PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \
	{ PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \
	{ PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
	{ PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
	}, \
	{ \
	{ PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \
	{ PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \
	{ PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
	{ PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
	} \
}

#define GENARRAY3_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ \
	{ PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL,                NULL,               }, \
	{ PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL,                NULL,               }, \
	{ PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
	{ NULL,                NULL,                PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
	{ NULL,                NULL,                PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
	} \
}

#define GENARRAY3_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ \
	{ PASTEMAC3(s,s,s,op), NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                PASTEMAC3(d,d,d,op), NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                PASTEMAC3(z,z,z,op) }  \
	} \
}


#endif
// end bli_genarray_macro_defs.h
// begin bli_gentdef_macro_defs.h


#ifndef BLIS_GENTDEF_MACRO_DEFS_H
#define BLIS_GENTDEF_MACRO_DEFS_H

//
// -- MACROS TO INSERT TYPEDEF-GENERATING MACROS -------------------------------
//


// -- function typedef macro (both typed and void) --

// Invokes GENTDEF nine times: once per type character with the concrete C
// type and the _ft suffix, once per type character with void and the _vft
// suffix, and once with void and an empty type character.
// NOTE(review): GENTDEF itself is not defined in this section; presumably
// the includer defines it before invoking INSERT_GENTDEF -- confirm.
#define INSERT_GENTDEF( opname ) \
\
GENTDEF( float,    s, opname, _ft ) \
GENTDEF( double,   d, opname, _ft ) \
GENTDEF( scomplex, c, opname, _ft ) \
GENTDEF( dcomplex, z, opname, _ft ) \
\
GENTDEF( void,     s, opname, _vft ) \
GENTDEF( void,     d, opname, _vft ) \
GENTDEF( void,     c, opname, _vft ) \
GENTDEF( void,     z, opname, _vft ) \
\
GENTDEF( void,      , opname, _vft )

// -- function typedef macro (both typed and void) with real projection --

// Same pattern as INSERT_GENTDEF, but each GENTDEFR invocation also receives
// a second type and type character (float/s for s and c, double/d for d
// and z).
#define INSERT_GENTDEFR( opname ) \
\
GENTDEFR( float,    float,  s, s, opname, _ft ) \
GENTDEFR( double,   double, d, d, opname, _ft ) \
GENTDEFR( scomplex, float,  c, s, opname, _ft ) \
GENTDEFR( dcomplex, double, z, d, opname, _ft ) \
\
GENTDEFR( void,     void,   s, s, opname, _vft ) \
GENTDEFR( void,     void,   d, d, opname, _vft ) \
GENTDEFR( void,     void,   c, s, opname, _vft ) \
GENTDEFR( void,     void,   z, d, opname, _vft ) \
\
GENTDEFR( void,     void,    ,  , opname, _vft )


#endif
// end bli_gentdef_macro_defs.h
// begin bli_gentfunc_macro_defs.h



#ifndef BLIS_GENTFUNC_MACRO_DEFS_H
#define BLIS_GENTFUNC_MACRO_DEFS_H

//
// -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------
//



// -- Macros for generating BLAS routines --------------------------------------


// -- Basic one-operand macro --
#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \ \ GENTFUNC( float, s, blasname, blisname ) \ GENTFUNC( double, d, blasname, blisname ) \ GENTFUNC( scomplex, c, blasname, blisname ) \ GENTFUNC( dcomplex, z, blasname, blisname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \ \ GENTFUNCRO( float, s, blasname, blisname ) \ GENTFUNCRO( double, d, blasname, blisname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \ \ GENTFUNCCO( scomplex, float, c, s, blasname, blisname ) \ GENTFUNCCO( dcomplex, double, z, d, blasname, blisname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \ \ GENTFUNCDOT( float, s, , BLIS_NO_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( double, d, , BLIS_NO_CONJUGATE, blasname, blisname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \ \ GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \ \ INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \ INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \ \ GENTFUNCR( float, float, s, s, rblasname, blisname ) \ GENTFUNCR( double, double, d, d, rblasname, blisname ) \ GENTFUNCR( scomplex, float, c, s, cblasname, blisname ) \ GENTFUNCR( dcomplex, double, z, d, cblasname, blisname ) // -- Alternate 
// two-operand macro (one char for complex, one for real proj) --

// Expands to four GENTFUNCR2 invocations. The real rows (float/s, double/d)
// leave the fourth argument empty; the complex rows pass the real
// counterpart's char there (s for scomplex, d for dcomplex).
#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
\
GENTFUNCR2( float, float, s, , blasname, blisname ) \
GENTFUNCR2( double, double, d, , blasname, blisname ) \
GENTFUNCR2( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )

// -- Extended two-operand macro (used only for scal) --

// Expands to six GENTFUNCSCAL invocations: four rows in which both types are
// identical and the fourth argument is empty (s, d, c, z), followed by two
// mixed rows pairing a complex first type with its real counterpart
// (scomplex/float with chars c, s and dcomplex/double with chars z, d).
#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
\
GENTFUNCSCAL( float, float, s, , blasname, blisname ) \
GENTFUNCSCAL( double, double, d, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \
GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname )

// -- Macros for functions with one operand ------------------------------------

// The INSERT_GENTFUNC_BASIC* family below differs only in how many auxiliary
// arguments are forwarded after tfuncname (zero through four). Every member
// expands to four GENTFUNC invocations: float/s, double/d, scomplex/c,
// dcomplex/z. GENTFUNC is therefore invoked with a different argument count by
// each member; it is not defined in this section, so presumably the caller
// defines it with the matching arity before use -- TODO confirm.

// -- Basic one-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
\
GENTFUNC( float, s, tfuncname ) \
GENTFUNC( double, d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
\
GENTFUNC( float, s, tfuncname, varname ) \
GENTFUNC( double, d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )

// -- Basic one-operand with real projection --

// The INSERT_GENTFUNCR_BASIC* family mirrors the family above but targets
// GENTFUNCR. Each row passes two types and two chars: the operand type
// followed by its real counterpart (float/float s,s; double/double d,d;
// scomplex/float c,s; dcomplex/double z,d), then tfuncname and the auxiliary
// arguments.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
\
GENTFUNCR( float, float, s, s, tfuncname ) \
GENTFUNCR( double, double, d, d, tfuncname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname ) \
GENTFUNCR( double, double, d, d, tfuncname, varname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
arguments) -- #define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with real domain only -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \ \ GENTFUNCRO( float, s, tfuncname ) \ GENTFUNCRO( double, d, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \ \ GENTFUNCRO( float, s, tfuncname, varname ) \ GENTFUNCRO( double, d, tfuncname, varname ) \ // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \ \ GENTFUNC( float, s, tfuncname ) \ 
GENTFUNC( double, d, tfuncname ) \ GENTFUNC( scomplex, c, tfuncname ) \ GENTFUNC( dcomplex, z, tfuncname ) \ GENTFUNC( gint_t, i, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \ \ GENTFUNC( float, s, tfuncname, varname ) \ GENTFUNC( double, d, tfuncname, varname ) \ GENTFUNC( scomplex, c, tfuncname, varname ) \ GENTFUNC( dcomplex, z, tfuncname, varname ) \ GENTFUNC( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCI_BASIC0( tfuncname ) \ \ GENTFUNCI( float, gint_t, s, i, tfuncname ) \ GENTFUNCI( double, gint_t, d, i, tfuncname ) \ GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \ GENTFUNCI( dcomplex, gint_t, z, i, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \ \ GENTFUNCI( float, gint_t, s, i, tfuncname, varname ) \ GENTFUNCI( double, gint_t, d, i, tfuncname, varname ) \ GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \ \ GENTFUNCRI( float, float, gint_t, s, s, i, tfuncname ) \ GENTFUNCRI( double, double, gint_t, d, d, i, tfuncname ) \ GENTFUNCRI( scomplex, float, gint_t, c, s, i, tfuncname ) \ GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_BASIC0( tfuncname ) \ \ GENTFUNC2( float, float, s, s, tfuncname ) \ GENTFUNC2( double, double, d, d, tfuncname ) \ GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \ GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \ \ GENTFUNC2( float, float, s, s, tfuncname, 
varname ) \ GENTFUNC2( double, double, d, d, tfuncname, varname ) \ GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \ \ GENTFUNC2( float, scomplex, s, c, tfuncname ) \ GENTFUNC2( scomplex, float, c, s, tfuncname ) \ \ GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \ \ GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ \ GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) // -- Mixed precision two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \ \ GENTFUNC2( float, double, s, d, tfuncname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ \ GENTFUNC2( double, float, d, s, tfuncname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname ) \ \ GENTFUNC2( scomplex, double, c, d, tfuncname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \ \ GENTFUNC2( float, double, s, d, tfuncname, varname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTFUNC2( double, float, d, s, tfuncname, varname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ \ GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) \ // -- Mixed domain/precision (all) two-operand macro -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTFUNC2_MIXDP0( tfuncname ) \ \ GENTFUNC2( float, double, s, d, tfuncname ) \ GENTFUNC2( float, scomplex, s, c, tfuncname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ \ GENTFUNC2( double, float, d, s, tfuncname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname ) \ GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ \ GENTFUNC2( scomplex, float, c, s, tfuncname ) \ GENTFUNC2( scomplex, double, c, d, tfuncname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \ \ GENTFUNC2( float, double, s, d, tfuncname, varname ) \ GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTFUNC2( double, float, d, s, tfuncname, varname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \ \ GENTFUNC2R( float, float, float, s, s, s, tfuncname ) \ GENTFUNC2R( double, double, double, d, d, d, tfuncname ) \ GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname ) \ GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \ \ GENTFUNC2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTFUNC2R( double, double, double, d, d, 
d, tfuncname, varname ) \ GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \ \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \ \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) // -- Mixed precision two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ \ GENTFUNC2R( scomplex, double, double, c, d, d, 
tfuncname, varname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) // -- Mixed domain/precision (all) two-operand macro with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) \ 
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ // -- Macros for functions with three primary operands ------------------------- // -- Basic three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC0( tfuncname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 ) // -- Mixed domain three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname ) \ \ GENTFUNC3( dcomplex, double, 
double, z, d, d, tfuncname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( 
dcomplex, double, dcomplex, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC3( scomplex, double, scomplex, c, 
d, c, tfuncname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname ) \ GENTFUNC3( float, 
dcomplex, scomplex, s, z, c, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, 
tfuncname, varname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, dcomplex, d, s, 
z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname1, varname2 ) \ \ 
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 ) // -- Basic three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, 
varname1, varname2 )


// -- Mixed domain three-operand with union of operands 1 and 2 --
//
// In every row the fourth type/code is the complex type whenever either of
// the first two is complex (e.g. float + scomplex -> scomplex); precision is
// the same across all operands of a row.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, \
d, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \ GENTFUNC3U12( float, 
double, dcomplex, double, s, d, z, d, tfuncname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, 
dcomplex, scomplex, c, c, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, 
dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, 
tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \
\
\
GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname )

// -- (two auxiliary arguments) --
//
// Same 48 type combinations again, forwarding `varname1` and `varname2` as
// two extra trailing arguments on every row.

#define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, scomplex, dcomplex, \
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 ) #endif // end bli_gentfunc_macro_defs.h // begin bli_gentprot_macro_defs.h #ifndef BLIS_GENTPROT_MACRO_DEFS_H #define BLIS_GENTPROT_MACRO_DEFS_H // // -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS ----------------------------- // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- #define INSERT_GENTPROT_BLAS( blasname ) \ \ GENTPROT( float, s, blasname ) \ GENTPROT( double, d, blasname ) \ GENTPROT( scomplex, c, blasname ) \ GENTPROT( dcomplex, z, blasname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTPROTRO_BLAS( blasname ) \ \ GENTPROTRO( float, s, blasname ) \ GENTPROTRO( double, d, blasname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTPROTCO_BLAS( blasname ) \ \ GENTPROTCO( scomplex, float, c, s, blasname ) \ GENTPROTCO( dcomplex, double, z, d, blasname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTR_BLAS( blasname ) \ \ GENTPROTDOT( float, s, , blasname ) \ GENTPROTDOT( double, d, , blasname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTC_BLAS( blasname ) \ \ GENTPROTDOT( scomplex, c, c, blasname ) \ GENTPROTDOT( scomplex, c, u, blasname ) \ GENTPROTDOT( dcomplex, z, c, blasname ) \ GENTPROTDOT( dcomplex, z, u, blasname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTPROTDOT_BLAS( blasname ) \ \ INSERT_GENTPROTDOTR_BLAS( blasname ) \ INSERT_GENTPROTDOTC_BLAS( blasname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTPROTR_BLAS( rblasname, 
cblasname ) \
\
GENTPROTR( float, float, s, s, rblasname ) \
GENTPROTR( double, double, d, d, rblasname ) \
GENTPROTR( scomplex, float, c, s, cblasname ) \
GENTPROTR( dcomplex, double, z, d, cblasname )


// -- Alternate two-operand macro (one char for complex, one for real proj) --

// The first character argument is intentionally empty for the real types.
#define INSERT_GENTPROTR2_BLAS( blasname ) \
\
GENTPROTR2( float, float, , s, blasname ) \
GENTPROTR2( double, double, , d, blasname ) \
GENTPROTR2( scomplex, float, c, s, blasname ) \
GENTPROTR2( dcomplex, double, z, d, blasname )


// -- Extended two-operand macro (used only for scal) --

// Four same-type rows (empty first character) followed by two rows pairing a
// real first type with a complex second type.
#define INSERT_GENTPROTSCAL_BLAS( blasname ) \
\
GENTPROTSCAL( float, float, , s, blasname ) \
GENTPROTSCAL( double, double, , d, blasname ) \
GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \
GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \
GENTPROTSCAL( float, scomplex, s, c, blasname ) \
GENTPROTSCAL( double, dcomplex, d, z, blasname )




// -- Macros for functions with one operand ------------------------------------


// -- Basic one-operand macro --
//
// The numeric suffix of each INSERT_ macro name below matches the number of
// auxiliary arguments forwarded after tfuncname (no suffix = one argument).

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT_BASIC0( tfuncname ) \
\
GENTPROT( float, s, tfuncname ) \
GENTPROT( double, d, tfuncname ) \
GENTPROT( scomplex, c, tfuncname ) \
GENTPROT( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT_BASIC( tfuncname, varname ) \
\
GENTPROT( float, s, tfuncname, varname ) \
GENTPROT( double, d, tfuncname, varname ) \
GENTPROT( scomplex, c, tfuncname, varname ) \
GENTPROT( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTPROT( float, s, tfuncname, varname1, varname2 ) \
GENTPROT( double, d, tfuncname, varname1, varname2 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTPROT( float, s, tfuncname, \
varname1, varname2, varname3 ) \
GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )



// -- Basic one-operand with real projection --
//
// Each row pairs a type with its real counterpart: float and double map to
// themselves, scomplex maps to float and dcomplex maps to double.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROTR_BASIC0( tfuncname ) \
\
GENTPROTR( float, float, s, s, tfuncname ) \
GENTPROTR( double, double, d, d, tfuncname ) \
GENTPROTR( scomplex, float, c, s, tfuncname ) \
GENTPROTR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname ) \
GENTPROTR( double, double, d, d, tfuncname, varname ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 \
) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC0( tfuncname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0_I( funcname ) \ \ GENTPROT( float, s, funcname ) \ GENTPROT( double, d, funcname ) \ GENTPROT( scomplex, c, funcname ) \ GENTPROT( dcomplex, z, funcname ) \ GENTPROT( gint_t, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) \ GENTPROT( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTPROTI_BASIC0( funcname ) \ \ GENTPROTI( float, gint_t, s, i, funcname ) \ GENTPROTI( double, gint_t, d, i, funcname ) \ GENTPROTI( scomplex, gint_t, c, i, funcname ) \ GENTPROTI( dcomplex, gint_t, z, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \ \ GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \ GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \ GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTRI_BASIC( funcname ) \ \ GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \ GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \ GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \ GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_BASIC0( funcname ) \ \ GENTPROT2( float, float, s, s, funcname ) \ GENTPROT2( double, double, d, d, funcname ) \ GENTPROT2( scomplex, scomplex, c, c, funcname ) \ GENTPROT2( dcomplex, dcomplex, z, z, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \ \ GENTPROT2( float, float, s, s, tfuncname, varname ) \ GENTPROT2( double, double, d, d, tfuncname, varname ) \ GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_D0( funcname ) \ \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( scomplex, float, c, s, funcname ) \ \ GENTPROT2( double, dcomplex, d, z, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_D( 
tfuncname, varname ) \
\
GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
\
GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
GENTPROT2( dcomplex, double, z, d, tfuncname, varname )



// -- Mixed precision two-operand macro --
//
// Every row pairs a single-precision type with a double-precision type.
// The last row of each list deliberately has no trailing backslash: a
// dangling continuation would splice whatever line follows into the macro.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_MIX_P0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
\
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \
\
GENTPROT2( float, double, s, d, tfuncname, varname ) \
GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double, float, d, s, tfuncname, varname ) \
GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
\
GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )



// -- Mixed domain/precision (all) two-operand macro --
//
// All twelve ordered pairs of two different types.
// NOTE: the no-argument variant is spelled MIXDP0 (no underscore before DP),
// unlike the one-argument variant INSERT_GENTPROT2_MIX_DP; the name is kept
// unchanged for existing users.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_MIXDP0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, scomplex, s, c, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
GENTPROT2( double, dcomplex, d, z, funcname ) \
\
GENTPROT2( scomplex, float, c, s, funcname ) \
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, double, z, d, funcname ) \
GENTPROT2( \
dcomplex, scomplex, z, c, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_BASIC0( funcname ) \ \ GENTPROT2R( float, float, float, s, s, s, funcname ) \ GENTPROT2R( double, double, double, d, d, d, funcname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \ \ GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) 
// --

#define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \
\
GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname )



// -- Mixed precision two-operand with real projection of first operand --
//
// The first two types differ in precision; the third type/code is the real
// counterpart of the first type.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname )



// -- Macros for functions with three primary operands -------------------------


// -- Basic three-operand macro --

#define INSERT_GENTPROT3_BASIC( funcname ) \
\
GENTPROT3( float, float, float, s, s, s, funcname ) \
GENTPROT3( double, double, double, d, d, d, funcname ) \
GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \
GENTPROT3( dcomplex, dcomplex, \
dcomplex, z, z, z, funcname ) // -- Mixed domain three-operand macro -- #define INSERT_GENTPROT3_MIX_D( funcname ) \ \ GENTPROT3( float, float, scomplex, s, s, c, funcname ) \ GENTPROT3( float, scomplex, float, s, c, s, funcname ) \ GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \ \ GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \ GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \ GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \ \ GENTPROT3( scomplex, float, float, c, s, s, funcname ) \ GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \ GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \ \ GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \ GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \ GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname ) // -- Mixed precision three-operand macro -- #define INSERT_GENTPROT3_MIX_P( funcname ) \ \ GENTPROT3( float, float, double, s, s, d, funcname ) \ GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \ \ GENTPROT3( float, double, float, s, d, s, funcname ) \ GENTPROT3( float, double, double, s, d, d, funcname ) \ GENTPROT3( float, double, scomplex, s, d, c, funcname ) \ GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \ \ GENTPROT3( float, scomplex, double, s, c, d, funcname ) \ GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \ \ GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \ GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \ GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \ GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \ \ \ GENTPROT3( double, float, float, d, s, s, funcname ) \ GENTPROT3( double, float, double, d, s, d, funcname ) \ GENTPROT3( double, float, scomplex, d, s, c, funcname ) \ GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \ \ GENTPROT3( double, double, float, d, d, s, funcname ) \ GENTPROT3( double, double, scomplex, d, d, c, funcname ) \ \ GENTPROT3( double, 
scomplex, float, d, c, s, funcname ) \ GENTPROT3( double, scomplex, double, d, c, d, funcname ) \ GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \ GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \ \ GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \ GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \ \ \ GENTPROT3( scomplex, float, double, c, s, d, funcname ) \ GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \ \ GENTPROT3( scomplex, double, float, c, d, s, funcname ) \ GENTPROT3( scomplex, double, double, c, d, d, funcname ) \ GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \ GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \ \ GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \ GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \ \ GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \ GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \ GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \ GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \ \ \ GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \ GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \ GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \ GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \ \ GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \ GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \ \ GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \ GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \ GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \ GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \ \ GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \ GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname ) \ // -- Basic three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_BASIC( funcname ) \ \ GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \ 
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \ GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname ) // -- Mixed domain three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_D( funcname ) \ \ GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \ GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \ GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \ \ GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \ GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \ GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \ \ GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \ GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \ \ GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \ GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname ) // -- Mixed precision three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_P( funcname ) \ \ GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \ GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \ \ GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \ GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \ GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \ GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \ \ GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \ GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \ \ GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, 
z, funcname ) \ GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \ GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \ GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \ \ \ GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \ GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \ GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \ GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \ \ GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \ GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \ \ GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \ GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \ GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \ GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \ \ GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \ GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \ \ \ GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \ GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \ \ GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \ GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \ GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \ GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \ \ GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \ \ GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \ GENTPROT3U12( 
scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \ \ \ GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \ GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \ GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \ GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \ GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \ \ GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname ) #endif // end bli_gentprot_macro_defs.h // begin bli_misc_macro_defs.h #ifndef BLIS_MISC_MACRO_DEFS_H #define BLIS_MISC_MACRO_DEFS_H // -- Miscellaneous macros -- // min, max, abs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) #define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) #define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) ) // fmin, fmax, fabs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_fmin( a, b ) bli_min( a, b ) #define bli_fmax( a, b ) bli_max( a, b ) #define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) ) // fminabs, fmaxabs // NOTE: These must remain macros since we don't know the types of a and b. 
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif // end bli_edge_case_macro_defs.h // begin bli_param_macro_defs.h #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ); } // pruning-related BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the left side of the matrix, // ignore the area above that intersection. if ( *diagoff < 0 ) { *m = *m + *diagoff; *offm_inc = - *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the bottom side of the matrix, // ignore the area to the right of that intersection. if ( *n > *diagoff + *m ) { *n = *diagoff + *m; } } BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the top side of the matrix, // ignore the area to the left of that intersection. if ( *diagoff > 0 ) { *n = *n - *diagoff; *offn_inc = + *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the right side of the matrix, // ignore the area below that intersection. 
if ( *m > -(*diagoff) + *n ) { *m = -(*diagoff) + *n; } } // thread range-related BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { *diagoff = *n - *diagoff - *m; bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { bli_swap_dims( m, n ); bli_negate_diag_offset( diagoff ); bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end ) { dim_t start2 = n - *start; dim_t end2 = n - *end; *start = end2; *end = start2; } // mdim_t-related BLIS_INLINE bool bli_is_m_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_M ); } BLIS_INLINE bool bli_is_n_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_N ); } BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim ) { return ( mdim_t ) ( mdim == BLIS_M ? BLIS_N : BLIS_M ); } BLIS_INLINE void bli_toggle_dim( mdim_t* mdim ) { *mdim = bli_dim_toggled( *mdim ); } // stor3_t-related BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b ) { // If any matrix is general-stored, return the stor3_t id for the // general-purpose sup microkernel. if ( bli_is_gen_stored( rs_c, cs_c ) || bli_is_gen_stored( rs_a, cs_a ) || bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX; // Otherwise, compute and return the stor3_t id as follows. 
const bool c_is_col = bli_is_col_stored( rs_c, cs_c ); const bool a_is_col = bli_is_col_stored( rs_a, cs_a ); const bool b_is_col = bli_is_col_stored( rs_b, cs_b ); return ( stor3_t )( 4 * c_is_col + 2 * a_is_col + 1 * b_is_col ); } BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id ) { #if 1 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )7, // BLIS_RRR = 0 -> BLIS_CCC = 7 ( stor3_t )5, // BLIS_RRC = 1 -> BLIS_CRC = 5 ( stor3_t )6, // BLIS_RCR = 2 -> BLIS_CCR = 6 ( stor3_t )4, // BLIS_RCC = 3 -> BLIS_CRR = 4 ( stor3_t )3, // BLIS_CRR = 4 -> BLIS_RCC = 3 ( stor3_t )1, // BLIS_CRC = 5 -> BLIS_RRC = 1 ( stor3_t )2, // BLIS_CCR = 6 -> BLIS_RCR = 2 ( stor3_t )0, // BLIS_CCC = 7 -> BLIS_RRR = 0 }; return map[id]; #else return ( ( id & 0x4 ) ^ 0x4 ) | // flip c bit ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 ); // flip a bit and move to b position #endif } BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )1, // BLIS_RRR = 0 -> BLIS_RRC = 1 ( stor3_t )0, // BLIS_RRC = 1 -> BLIS_RRR = 0 ( stor3_t )3, // BLIS_RCR = 2 -> BLIS_RCC = 3 ( stor3_t )2, // BLIS_RCC = 3 -> BLIS_RCR = 2 ( stor3_t )5, // BLIS_CRR = 4 -> BLIS_CRC = 5 ( stor3_t )4, // BLIS_CRC = 5 -> BLIS_CRR = 4 ( stor3_t )7, // BLIS_CCR = 6 -> BLIS_CCC = 7 ( stor3_t )6, // BLIS_CCC = 7 -> BLIS_CCR = 6 }; return map[id]; #else return ( stor3_t )( id ^ 0x1 ); #endif } BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )2, // BLIS_RRR = 0 -> BLIS_RCR = 2 ( stor3_t )3, // BLIS_RRC = 1 -> BLIS_RCC = 3 ( stor3_t )0, // BLIS_RCR = 2 -> BLIS_RRR = 0 ( stor3_t )1, // BLIS_RCC = 3 -> BLIS_RRC = 1 ( stor3_t )6, // BLIS_CRR = 4 -> BLIS_CCR = 6 ( stor3_t )7, // BLIS_CRC = 5 -> BLIS_CCC = 7 ( stor3_t )4, // BLIS_CCR = 6 -> BLIS_CRR = 4 ( stor3_t )5, // BLIS_CCC = 7 -> BLIS_CRC = 5 }; return map[id]; #else return ( stor3_t )( id ^ 0x2 ); #endif } // 
index-related BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == n_iter - 1 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != n_iter - 1 || n_left == 0 ); } BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == 0 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != 0 || n_left == 0 ); } BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 ); } BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) ); } BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { #ifdef BLIS_ENABLE_JRIR_SLAB return bli_is_last_iter_sl( i, end_iter, tid, nth ); #else // BLIS_ENABLE_JRIR_RR return bli_is_last_iter_rr( i, end_iter, tid, nth ); #endif } // packbuf_t-related BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type ) { return ( guint_t ) ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT ); } // pack_t-related BLIS_INLINE bool bli_is_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_is_row_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_is_col_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_is_panel_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE bool bli_is_1r_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R ); } BLIS_INLINE bool bli_is_1e_packed( pack_t schema ) { return ( bool ) ( ( schema & 
BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E ); } BLIS_INLINE bool bli_is_1m_packed( pack_t schema ) { return ( bool ) ( bli_is_1r_packed( schema ) || bli_is_1e_packed( schema ) ); } BLIS_INLINE bool bli_is_nat_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 ); } BLIS_INLINE bool bli_is_ind_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 ); } BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema ) { return ( guint_t ) ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT ); } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument. BLIS_INLINE void bli_set_dims_incs_uplo_1m ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument (without column-wise stride optimization). BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. 
if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions and increments for TWO matrix arguments. 
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); } BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); } BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); } // NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, // packbuf_t is stored/used from the context in order to support various // induced methods. (Though ideally the packbuf_t field would only be // present in the control tree). BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); } BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc ); } BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj ) { bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); } BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj ) { bli_obj_apply_conj( BLIS_CONJUGATE, obj ); } BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; } // Root matrix query BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) { return ( obj_t* )( obj->root ); } BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( 
bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); } // Root matrix modification BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) { obj->root = obj; } // Diagonal offset query BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) ); } // Diagonal offset modification BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off = ( doff_t )offset; } BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj ) { obj->diag_off = -(obj->diag_off); } BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off += ( doff_t )offset; } // Dimension query BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) { return ( obj->dim[ mdim ] ); } BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) : bli_obj_length( obj ) ); } BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) ); } BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 ); } // Stride/increment query BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) { return ( obj->rs ); } BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) { return ( obj->cs ); } BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) { return ( obj->is ); } BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); } // Note: The purpose of these functions is to obtain the length and width // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) ); } BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 ); } // Dimension modification BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj ) { obj->dim[ BLIS_M ] = m; } BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj ) { obj->dim[ BLIS_N ] = n; } BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) { obj->dim[ mdim ] = dim_val; } BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) { if ( bli_does_notrans( trans ) ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } else // if ( bli_does_trans( trans ) ) { bli_obj_set_length( n, obj ); bli_obj_set_width( m, obj ); } } // Stride/increment predicates // // NOTE: The following two macros differ from their non-obj counterparts // in that they do not identify m x 1 and 1 x n objects as row-stored and // column-stored, respectively, which is needed when considering packed // objects. But this is okay, since none of the invocations of these // "obj" macros are used on packed matrices. 
// BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); } // Stride/increment modification BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) { obj->rs = rs; } BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) { obj->cs = cs; } BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_row_stride( rs, obj ); bli_obj_set_col_stride( cs, obj ); } BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) { obj->is = is; } // Offset query BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) { return ( obj->off[ mdim ] ); } // Offset modification BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] = offset; } BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_set_off( BLIS_M, offm, obj ); bli_obj_set_off( BLIS_N, offn, obj ); } BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] += offset; } BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_inc_off( BLIS_M, offm, obj ); bli_obj_inc_off( BLIS_N, offn, obj ); } // Diagonal offset predicates BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) { return ( 
bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); } // Buffer address query BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) { return ( void* ) ( obj->buffer ); } // Buffer address modification BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) { obj->buffer = p; } // Bufferless scalar field query BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); } // Bufferless scalar field modification BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); } // Element size modification BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) { obj->elem_size = size; } // Packed matrix info query BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) { return ( obj->m_padded ); } BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) { return ( obj->n_padded ); } // Packed matrix info modification BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj ) { obj->m_padded = m; } BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj ) { obj->n_padded = n; } BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) { 
bli_obj_set_padded_length( m, obj ); bli_obj_set_padded_width( n, obj ); } // Packed panel info query BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) { return ( obj->m_panel ); } BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) { return ( obj->n_panel ); } BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) { return ( obj->pd ); } BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) { return ( obj->ps ); } // Packed panel info modification BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj ) { obj->m_panel = m; } BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj ) { obj->n_panel = n; } BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_panel_length( m, obj ); bli_obj_set_panel_width( n, obj ); } BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) { obj->pd = pd; } BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) { obj->ps = ps; } // stor3_t-related BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); inc_t rs_a, cs_a; inc_t rs_b, cs_b; if ( bli_obj_has_notrans( a ) ) { rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else { rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else { rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b ); } // -- User-provided information macros -- // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { return obj->pack_fn; } BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker_fn; } BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { return obj->ker_params; } // Function pointer modification 
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) { const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); bli_obj_set_pack_schema( schema_b, a ); bli_obj_set_pack_schema( schema_a, b ); } // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. BLIS_INLINE void bli_obj_induce_trans( obj_t* obj ) { // Induce transposition among basic fields. dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); if ( bli_obj_is_upper_or_lower( obj ) ) bli_obj_toggle_uplo( obj ); // Induce transposition among packed fields. dim_t m_padded = bli_obj_padded_length( obj ); dim_t n_padded = bli_obj_padded_width( obj ); dim_t m_panel = bli_obj_panel_length( obj ); dim_t n_panel = bli_obj_panel_width( obj ); bli_obj_set_padded_dims( n_padded, m_padded, obj ); bli_obj_set_panel_dims( n_panel, m_panel, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj ) { // NOTE: This function is only used in situations where the matrices // are guaranteed to not have structure or be packed. // Induce transposition among basic fields. 
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif // end bli_obj_macro_defs.h // begin bli_complex_macro_defs.h #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define 
bli_zimag( x ) ( cimag(x) ) #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_complex_macro_defs.h // begin bli_scalar_macro_defs.h #ifndef BLIS_SCALAR_MACRO_DEFS_H #define BLIS_SCALAR_MACRO_DEFS_H // -- Assignment/Accessor macros -- // NOTE: This macro is defined first since some of the other scalar macros // use it to abstract away the method used to assign complex values (ie: // whether fields of a struct are set directly or whether native C99 // assignment is used). // begin bli_sets.h #ifndef BLIS_SETS_H #define BLIS_SETS_H // sets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sssets( xr, xi, y ) { (y) = (xr); } #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } #define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } #define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, 
xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif // end bli_sets.h // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. // begin bli_setrs.h #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Real destinations: store xr in y.
#define bli_sssetrs( xr, y ) { (y) = (xr); }
#define bli_dssetrs( xr, y ) { (y) = (xr); }

#define bli_sdsetrs( xr, y ) { (y) = (xr); }
#define bli_ddsetrs( xr, y ) { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, struct representation: assign through the
// bli_creal/bli_zreal accessor so that only the real component of y is
// written.
#define bli_scsetrs( xr, y ) { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y ) { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y ) { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y ) { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, native C99 complex: rebuild y from the new real part
// and the imaginary part currently held by y.
#define bli_scsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char aliases: each forwards to the two-char variant named on its
// right-hand side.
#define bli_ssetrs( xr, y ) bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y ) bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y ) bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y ) bli_dzsetrs( xr, y )

#endif
// end bli_setrs.h
// begin bli_setis.h

#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Real destinations have no imaginary component, so these expand to an empty
// statement; neither xi nor y is evaluated.
#define bli_sssetis( xi, y ) { ; }
#define bli_dssetis( xi, y ) { ; }

#define bli_sdsetis( xi, y ) { ; }
#define bli_ddsetis( xi, y ) { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, struct representation: assign through the
// bli_cimag/bli_zimag accessor so that only the imaginary component of y is
// written.
#define bli_scsetis( xi, y ) { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y ) { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y ) { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y ) { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, native C99 complex: rebuild y from the real part
// currently held by y and the new imaginary part.
#define bli_scsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char aliases: each forwards to the two-char variant named on its
// right-hand side.
#define bli_ssetis( xi, y ) bli_sssetis( xi, y )
#define bli_dsetis( xi, y ) bli_ddsetis( xi, y )
#define bli_csetis( xi, y ) bli_scsetis( xi, y )
#define bli_zsetis( xi, y ) bli_dzsetis( xi, y )

#endif
// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h

#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_ssgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; } #define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; } #define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; } #define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; } #define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; } #define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi ) #define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi ) #define bli_cgets( x, yr, yi ) 
bli_csgets( x, yr, yi ) #define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi ) #define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi ) #endif // end bli_gets.h // -- Scalar constant initialization macros -- // begin bli_constants.h #ifndef BLIS_CONSTANTS_H #define BLIS_CONSTANTS_H // return pointers to constants // 1 #define bli_s1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ONE ) ) #define bli_d1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ONE ) ) #define bli_c1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) ) #define bli_z1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) ) #define bli_i1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ONE ) ) // 0 #define bli_s0 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ZERO ) ) #define bli_d0 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ZERO ) ) #define bli_c0 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) ) #define bli_z0 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) ) #define bli_i0 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ZERO ) ) // -1 #define bli_sm1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_MINUS_ONE ) ) #define bli_dm1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_MINUS_ONE ) ) #define bli_cm1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_zm1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_im1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_MINUS_ONE ) ) #endif // end bli_constants.h // -- Separated scalar macros (separated real/imaginary values) -- // begin bli_absq2ris.h #ifndef BLIS_ABSQ2RIS_H #define BLIS_ABSQ2RIS_H // absq2ris #define bli_sabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define bli_dabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define 
bli_cabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0F; \ } #define bli_zabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0; \ } #endif // end bli_absq2ris.h // begin bli_abval2ris.h #ifndef BLIS_ABVAL2RIS_H #define BLIS_ABVAL2RIS_H // abval2ris #define bli_sabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabsf(xr); \ } #define bli_dabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabs(xr); \ } #define bli_cabval2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0F; \ } #define bli_zabval2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0; \ } #endif // end bli_abval2ris.h // begin bli_addris.h #ifndef BLIS_ADDRIS_H #define BLIS_ADDRIS_H // addris #define bli_saddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_daddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_caddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #define bli_zaddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #endif // end bli_addris.h // begin bli_addjris.h #ifndef BLIS_ADDJRIS_H #define BLIS_ADDJRIS_H // addjris #define bli_saddjris( ar, ai, xr, xi ) bli_saddris( (ar), -(ai), (xr), (xi) ) #define bli_daddjris( ar, ai, xr, xi ) bli_daddris( (ar), -(ai), (xr), (xi) ) #define bli_caddjris( ar, ai, xr, xi ) bli_caddris( (ar), -(ai), (xr), (xi) ) #define bli_zaddjris( ar, ai, xr, xi ) bli_zaddris( (ar), -(ai), (xr), (xi) ) #endif // end bli_addjris.h // begin bli_add3ris.h #ifndef BLIS_ADD3RIS_H #define BLIS_ADD3RIS_H // add3ris #define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_dadd3ris( ar, 
ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #endif // end bli_add3ris.h // begin bli_axpbyris.h #ifndef BLIS_AXPBYRIS_H #define BLIS_AXPBYRIS_H // axpbyris #define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyris bli_rxxpbyris #define bli_dsssxpbyris bli_rxxpbyris #define bli_csssxpbyris bli_rxxpbyris #define bli_zsssxpbyris bli_rxxpbyris #define bli_sdssxpbyris bli_rxxpbyris #define bli_ddssxpbyris bli_rxxpbyris #define bli_cdssxpbyris bli_rxxpbyris #define bli_zdssxpbyris bli_rxxpbyris #define bli_scssxpbyris bli_rxxpbyris #define bli_dcssxpbyris bli_rxxpbyris #define bli_ccssxpbyris bli_rxxpbyris #define bli_zcssxpbyris bli_rxxpbyris #define bli_szssxpbyris bli_rxxpbyris #define bli_dzssxpbyris bli_rxxpbyris #define bli_czssxpbyris bli_rxxpbyris #define bli_zzssxpbyris bli_rxxpbyris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyris. 
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
// Short per-datatype names for axpbyjris; each expands to the four-character
// variant in which a, x, b and y all share one type.
#define bli_saxpbyjris bli_ssssaxpbyjris
#define bli_daxpbyjris bli_ddddaxpbyjris
#define bli_caxpbyjris bli_ccccaxpbyjris
#define bli_zaxpbyjris bli_zzzzaxpbyjris

#endif
// end bli_axpbyjris.h
// begin bli_axpyris.h


#ifndef BLIS_AXPYRIS_H
#define BLIS_AXPYRIS_H

// axpyris
//
// y += a * x on operands split into real ("r") and imaginary ("i") parts.
// All kernels take the same six arguments; a kernel simply ignores the
// parts it does not need.

// Real kernel: only ar, xr and yr are used.
#define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}

// Complex kernel: accumulates the full complex product a * x.
#define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
    (yi) += (ai) * (xr) + (ar) * (xi); \
}

// Real-output kernel: accumulates only the real part of the complex product.
#define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
}

// ai is not used: both parts of x are scaled by ar.
#define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * (xi); \
}

// xi is not used: both parts of a are scaled by xr.
#define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyris bli_rxaxpyris
#define bli_dssaxpyris bli_rxaxpyris
#define bli_cssaxpyris bli_rxaxpyris
#define bli_zssaxpyris bli_rxaxpyris

#define bli_sdsaxpyris bli_rxaxpyris
#define bli_ddsaxpyris bli_rxaxpyris
#define bli_cdsaxpyris bli_rxaxpyris
#define bli_zdsaxpyris bli_rxaxpyris

#define bli_scsaxpyris bli_rxaxpyris
#define bli_dcsaxpyris bli_rxaxpyris
#define bli_ccsaxpyris bli_roaxpyris
#define bli_zcsaxpyris bli_roaxpyris

#define bli_szsaxpyris bli_rxaxpyris
#define bli_dzsaxpyris bli_rxaxpyris
#define bli_czsaxpyris bli_roaxpyris
#define bli_zzsaxpyris bli_roaxpyris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyris bli_rxaxpyris
#define bli_dsdaxpyris bli_rxaxpyris
#define bli_csdaxpyris bli_rxaxpyris
#define bli_zsdaxpyris bli_rxaxpyris

#define bli_sddaxpyris bli_rxaxpyris
#define bli_dddaxpyris bli_rxaxpyris
#define bli_cddaxpyris bli_rxaxpyris
#define bli_zddaxpyris bli_rxaxpyris

#define bli_scdaxpyris bli_rxaxpyris
#define bli_dcdaxpyris bli_rxaxpyris
#define bli_ccdaxpyris bli_roaxpyris
#define bli_zcdaxpyris bli_roaxpyris

#define bli_szdaxpyris bli_rxaxpyris
#define bli_dzdaxpyris bli_rxaxpyris
#define bli_czdaxpyris bli_roaxpyris
#define bli_zzdaxpyris bli_roaxpyris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyris bli_rxaxpyris
#define bli_dscaxpyris bli_rxaxpyris
#define bli_cscaxpyris bli_rcaxpyris
#define bli_zscaxpyris bli_rcaxpyris

#define bli_sdcaxpyris bli_rxaxpyris
#define bli_ddcaxpyris bli_rxaxpyris
#define bli_cdcaxpyris bli_rcaxpyris
#define bli_zdcaxpyris bli_rcaxpyris

#define bli_sccaxpyris bli_craxpyris
#define bli_dccaxpyris bli_craxpyris
#define bli_cccaxpyris bli_cxaxpyris
#define bli_zccaxpyris bli_cxaxpyris

#define bli_szcaxpyris bli_craxpyris
#define bli_dzcaxpyris bli_craxpyris
#define bli_czcaxpyris bli_cxaxpyris
#define bli_zzcaxpyris bli_cxaxpyris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyris bli_rxaxpyris
#define bli_dszaxpyris bli_rxaxpyris
#define bli_cszaxpyris bli_rcaxpyris
#define bli_zszaxpyris bli_rcaxpyris

#define bli_sdzaxpyris bli_rxaxpyris
#define bli_ddzaxpyris bli_rxaxpyris
#define bli_cdzaxpyris bli_rcaxpyris
#define bli_zdzaxpyris bli_rcaxpyris

#define bli_sczaxpyris bli_craxpyris
#define bli_dczaxpyris bli_craxpyris
#define bli_cczaxpyris bli_cxaxpyris
#define bli_zczaxpyris bli_cxaxpyris

#define bli_szzaxpyris bli_craxpyris
#define bli_dzzaxpyris bli_craxpyris
#define bli_czzaxpyris bli_cxaxpyris
#define bli_zzzaxpyris bli_cxaxpyris

// Short per-datatype names: a, x and y all share one type.
#define bli_saxpyris bli_sssaxpyris
#define bli_daxpyris bli_dddaxpyris
#define bli_caxpyris bli_cccaxpyris
#define bli_zaxpyris bli_zzzaxpyris

#endif
// end bli_axpyris.h
// begin bli_axpyjris.h


#ifndef BLIS_AXPYJRIS_H
#define BLIS_AXPYJRIS_H

// axpyjris
//
// y += a * conj(x) on split real/imaginary operands. Compared with the
// axpyris kernels, every term involving xi has its sign flipped.

// Real kernel: only ar, xr and yr are used.
#define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}

// Complex kernel: accumulates the full complex product a * conj(x).
#define bli_cxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
    (yi) += (ai) * (xr) - (ar) * (xi); \
}

// Real-output kernel: accumulates only the real part of a * conj(x).
#define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
}

// ai is not used: both parts of conj(x) are scaled by ar.
#define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * -(xi); \
}

// xi is not used, so this body is identical to bli_rcaxpyris.
#define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyjris bli_rxaxpyjris
#define bli_dssaxpyjris bli_rxaxpyjris
#define bli_cssaxpyjris bli_rxaxpyjris
#define bli_zssaxpyjris bli_rxaxpyjris

#define bli_sdsaxpyjris bli_rxaxpyjris
#define bli_ddsaxpyjris bli_rxaxpyjris
#define bli_cdsaxpyjris bli_rxaxpyjris
#define bli_zdsaxpyjris bli_rxaxpyjris

#define bli_scsaxpyjris bli_rxaxpyjris
#define bli_dcsaxpyjris bli_rxaxpyjris
#define bli_ccsaxpyjris bli_roaxpyjris
#define bli_zcsaxpyjris bli_roaxpyjris

#define bli_szsaxpyjris bli_rxaxpyjris
#define bli_dzsaxpyjris bli_rxaxpyjris
#define bli_czsaxpyjris bli_roaxpyjris
#define bli_zzsaxpyjris bli_roaxpyjris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyjris bli_rxaxpyjris
#define bli_dsdaxpyjris bli_rxaxpyjris
#define bli_csdaxpyjris bli_rxaxpyjris
#define bli_zsdaxpyjris bli_rxaxpyjris

#define bli_sddaxpyjris bli_rxaxpyjris
#define bli_dddaxpyjris bli_rxaxpyjris
#define bli_cddaxpyjris bli_rxaxpyjris
#define bli_zddaxpyjris bli_rxaxpyjris

#define bli_scdaxpyjris bli_rxaxpyjris
#define bli_dcdaxpyjris bli_rxaxpyjris
#define bli_ccdaxpyjris bli_roaxpyjris
#define bli_zcdaxpyjris bli_roaxpyjris

#define bli_szdaxpyjris bli_rxaxpyjris
#define bli_dzdaxpyjris bli_rxaxpyjris
#define bli_czdaxpyjris bli_roaxpyjris
#define bli_zzdaxpyjris bli_roaxpyjris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjris bli_rxaxpyjris
#define bli_dscaxpyjris bli_rxaxpyjris
#define bli_cscaxpyjris bli_rcaxpyjris
#define bli_zscaxpyjris bli_rcaxpyjris

#define bli_sdcaxpyjris bli_rxaxpyjris
#define bli_ddcaxpyjris bli_rxaxpyjris
#define bli_cdcaxpyjris bli_rcaxpyjris
#define bli_zdcaxpyjris bli_rcaxpyjris

#define bli_sccaxpyjris bli_craxpyjris
#define bli_dccaxpyjris bli_craxpyjris
#define bli_cccaxpyjris bli_cxaxpyjris
#define bli_zccaxpyjris bli_cxaxpyjris

#define bli_szcaxpyjris bli_craxpyjris
#define bli_dzcaxpyjris bli_craxpyjris
#define bli_czcaxpyjris bli_cxaxpyjris
#define bli_zzcaxpyjris bli_cxaxpyjris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjris bli_rxaxpyjris
#define bli_dszaxpyjris bli_rxaxpyjris
#define bli_cszaxpyjris bli_rcaxpyjris
#define bli_zszaxpyjris bli_rcaxpyjris

#define bli_sdzaxpyjris bli_rxaxpyjris
#define bli_ddzaxpyjris bli_rxaxpyjris
#define bli_cdzaxpyjris bli_rcaxpyjris
#define bli_zdzaxpyjris bli_rcaxpyjris

#define bli_sczaxpyjris bli_craxpyjris
#define bli_dczaxpyjris bli_craxpyjris
#define bli_cczaxpyjris bli_cxaxpyjris
#define bli_zczaxpyjris bli_cxaxpyjris

#define bli_szzaxpyjris bli_craxpyjris
#define bli_dzzaxpyjris bli_craxpyjris
#define bli_czzaxpyjris bli_cxaxpyjris
#define bli_zzzaxpyjris bli_cxaxpyjris

// Short per-datatype names: a, x and y all share one type.
#define bli_saxpyjris bli_sssaxpyjris
#define bli_daxpyjris bli_dddaxpyjris
#define bli_caxpyjris bli_cccaxpyjris
#define bli_zaxpyjris bli_zzzaxpyjris

#endif
// end bli_axpyjris.h
// begin bli_axmyris.h


#ifndef BLIS_AXMYRIS_H
#define BLIS_AXMYRIS_H

// axmyris
//
// y -= a * x on split real/imaginary operands.

// 's' and 'd' variants: only the real parts are used.
#define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}

#define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}

// 'c' and 'z' variants: subtract the full complex product.
#define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}

#define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}

// 'sc' and 'dz' variants: ai is not used; both parts of x are scaled by ar.
#define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}

#define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}

#endif
// end bli_axmyris.h
// begin bli_conjris.h


#ifndef BLIS_CONJRIS_H
#define BLIS_CONJRIS_H

// conjris
//
// In-place conjugation. The 's' and 'd' variants are deliberate no-ops (an
// empty statement); the 'c' and 'z' variants negate the imaginary part.

#define bli_sconjris( xr, xi ) \
{ \
    ; \
}

#define bli_dconjris( xr, xi ) \
{ \
    ; \
}

#define bli_cconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}

#define bli_zconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}

#endif
// end bli_conjris.h
// begin bli_copyris.h


#ifndef BLIS_COPYRIS_H
#define BLIS_COPYRIS_H

// copyris
//
// b := a on split real/imaginary operands. The 's'/'d' variants copy only
// the real part; the 'c'/'z' variants copy both parts.

#define bli_scopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}

#define bli_dcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}

#define bli_ccopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}

#define bli_zcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}

// Two-character variants: first char is the type of a, second the type of b.
// When the first char is 's' or 'd' a literal zero is passed in place of ai.

#define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi )
#define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi )
#define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )
#define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )

#define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi )
#define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi )
#define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )
#define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )

#define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi )
#define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi )
#define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )
#define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )

#define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi )
#define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi )
#define bli_czcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )
#define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )

#endif
// end bli_copyris.h
// begin bli_copyjris.h


#ifndef BLIS_COPYJRIS_H
#define BLIS_COPYJRIS_H

// copyjris
//
// b := conj(a), implemented by forwarding to copyris with the imaginary
// part negated.

#define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) )
#define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) )
#define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) )
#define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) )

// Two-character variants: first char is the type of a, second the type of b.

#define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi )
#define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi )
#define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )
#define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )

#define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi )
#define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi )
#define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )
#define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )

#define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi )
#define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi )
#define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )
#define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )

#define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi )
#define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi )
#define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )
#define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )

#endif
// end bli_copyjris.h
// begin bli_copycjris.h


#ifndef BLIS_COPYCJRIS_H
#define BLIS_COPYCJRIS_H

// copycjris
//
// y := x, optionally conjugated according to the runtime value conj. The
// 's', 'd' and 'i' variants ignore conj and forward to the plain copy.

#define bli_scopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_scopyris( (xr), (xi), (yr), (yi) ); \
}

#define bli_dcopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_dcopyris( (xr), (xi), (yr), (yi) ); \
}

// bli_is_conj() is defined outside this section.
#define bli_ccopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                 :  (xi) ); \
}

#define bli_zcopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                 :  (xi) ); \
}

// NOTE(review): bli_icopyris is not defined in the copyris section above;
// confirm it exists elsewhere before using this macro.
#define bli_icopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_icopyris( (xr), (xi), (yr), (yi) ); \
}

#endif
// end bli_copycjris.h
// begin bli_eqris.h


#ifndef BLIS_EQRIS_H
#define BLIS_EQRIS_H

// eqris (passed by value)
//
// Expression macros: evaluate to non-zero when a equals b. The 's', 'd' and
// 'i' variants compare real parts only.

#define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) )

// eq1ris

#define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F )
#define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 )
#define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F )
#define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 )
#define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 )

// eq0ris

#define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F )
#define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 )
#define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F )
#define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 )
#define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 )

// eqm1ris

#define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F )
#define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 )
#define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F )
#define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 )
#define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 )

#endif
// end bli_eqris.h
// begin bli_invertris.h


#ifndef BLIS_INVERTRIS_H
#define BLIS_INVERTRIS_H

// invertris
//
// In-place reciprocal x := 1 / x. The 's' variant uses only the real part.

#define bli_sinvertris( xr, xi ) \
{ \
    (xr) = 1.0F / (xr); \
}
#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2ris bli_rxscal2ris #define bli_dssscal2ris bli_rxscal2ris #define bli_cssscal2ris bli_rxscal2ris #define bli_zssscal2ris bli_rxscal2ris #define bli_sdsscal2ris bli_rxscal2ris #define bli_ddsscal2ris bli_rxscal2ris #define bli_cdsscal2ris bli_rxscal2ris #define bli_zdsscal2ris bli_rxscal2ris #define bli_scsscal2ris bli_rxscal2ris #define bli_dcsscal2ris bli_rxscal2ris #define bli_ccsscal2ris bli_roscal2ris #define bli_zcsscal2ris bli_roscal2ris #define bli_szsscal2ris bli_rxscal2ris #define bli_dzsscal2ris bli_rxscal2ris #define bli_czsscal2ris bli_roscal2ris #define bli_zzsscal2ris bli_roscal2ris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2ris bli_rxscal2ris #define bli_dsdscal2ris bli_rxscal2ris #define bli_csdscal2ris bli_rxscal2ris #define bli_zsdscal2ris bli_rxscal2ris #define bli_sddscal2ris bli_rxscal2ris #define bli_dddscal2ris bli_rxscal2ris #define bli_cddscal2ris bli_rxscal2ris #define bli_zddscal2ris bli_rxscal2ris #define bli_scdscal2ris bli_rxscal2ris #define bli_dcdscal2ris bli_rxscal2ris #define bli_ccdscal2ris bli_roscal2ris #define bli_zcdscal2ris bli_roscal2ris #define bli_szdscal2ris bli_rxscal2ris #define bli_dzdscal2ris bli_rxscal2ris #define bli_czdscal2ris bli_roscal2ris #define bli_zzdscal2ris bli_roscal2ris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2ris bli_rxscal2ris #define bli_dscscal2ris bli_rxscal2ris #define bli_cscscal2ris bli_rcscal2ris #define bli_zscscal2ris bli_rcscal2ris #define bli_sdcscal2ris bli_rxscal2ris #define bli_ddcscal2ris bli_rxscal2ris #define bli_cdcscal2ris bli_rcscal2ris #define bli_zdcscal2ris bli_rcscal2ris #define bli_sccscal2ris bli_crscal2ris #define bli_dccscal2ris bli_crscal2ris #define bli_cccscal2ris bli_cxscal2ris #define bli_zccscal2ris bli_cxscal2ris #define bli_szcscal2ris bli_crscal2ris 
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Alias table for scal2jris. Unlike the scal2ris table, these aliases are
// function-like macros that forward all six arguments unchanged to one of the
// bli_{rx,cx,ro,cr,rc}scal2jris kernels.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = (xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyris bli_rxxpbyris #define bli_dssxpbyris bli_rxxpbyris #define bli_cssxpbyris bli_rxxpbyris #define bli_zssxpbyris bli_rxxpbyris #define bli_sdsxpbyris bli_rxxpbyris #define bli_ddsxpbyris bli_rxxpbyris #define bli_cdsxpbyris bli_rxxpbyris #define bli_zdsxpbyris bli_rxxpbyris #define bli_scsxpbyris bli_rxxpbyris #define bli_dcsxpbyris bli_rxxpbyris #define bli_ccsxpbyris bli_rxxpbyris #define bli_zcsxpbyris bli_rxxpbyris #define bli_szsxpbyris bli_rxxpbyris #define bli_dzsxpbyris bli_rxxpbyris #define bli_czsxpbyris bli_rxxpbyris #define bli_zzsxpbyris bli_rxxpbyris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyris bli_rxxpbyris #define bli_dsdxpbyris bli_rxxpbyris #define bli_csdxpbyris bli_rxxpbyris #define bli_zsdxpbyris bli_rxxpbyris #define bli_sddxpbyris bli_rxxpbyris #define bli_dddxpbyris bli_rxxpbyris #define bli_cddxpbyris bli_rxxpbyris #define bli_zddxpbyris bli_rxxpbyris #define bli_scdxpbyris bli_rxxpbyris #define bli_dcdxpbyris bli_rxxpbyris #define bli_ccdxpbyris bli_rxxpbyris #define bli_zcdxpbyris bli_rxxpbyris #define bli_szdxpbyris bli_rxxpbyris #define bli_dzdxpbyris bli_rxxpbyris #define bli_czdxpbyris bli_rxxpbyris #define bli_zzdxpbyris bli_rxxpbyris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyris bli_rxxpbyris #define bli_dscxpbyris 
bli_rxxpbyris #define bli_cscxpbyris bli_crxpbyris #define bli_zscxpbyris bli_crxpbyris #define bli_sdcxpbyris bli_rxxpbyris #define bli_ddcxpbyris bli_rxxpbyris #define bli_cdcxpbyris bli_crxpbyris #define bli_zdcxpbyris bli_crxpbyris #define bli_sccxpbyris bli_cxxpbyris #define bli_dccxpbyris bli_cxxpbyris #define bli_cccxpbyris bli_cxxpbyris #define bli_zccxpbyris bli_cxxpbyris #define bli_szcxpbyris bli_cxxpbyris #define bli_dzcxpbyris bli_cxxpbyris #define bli_czcxpbyris bli_cxxpbyris #define bli_zzcxpbyris bli_cxxpbyris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyris bli_rxxpbyris #define bli_dszxpbyris bli_rxxpbyris #define bli_cszxpbyris bli_crxpbyris #define bli_zszxpbyris bli_crxpbyris #define bli_sdzxpbyris bli_rxxpbyris #define bli_ddzxpbyris bli_rxxpbyris #define bli_cdzxpbyris bli_crxpbyris #define bli_zdzxpbyris bli_crxpbyris #define bli_sczxpbyris bli_cxxpbyris #define bli_dczxpbyris bli_cxxpbyris #define bli_cczxpbyris bli_cxxpbyris #define bli_zczxpbyris bli_cxxpbyris #define bli_szzxpbyris bli_cxxpbyris #define bli_dzzxpbyris bli_cxxpbyris #define bli_czzxpbyris bli_cxxpbyris #define bli_zzzxpbyris bli_cxxpbyris #define bli_sxpbyris bli_sssxpbyris #define bli_dxpbyris bli_dddxpbyris #define bli_cxpbyris bli_cccxpbyris #define bli_zxpbyris bli_zzzxpbyris #endif // end bli_xpbyris.h // begin bli_xpbyjris.h #ifndef BLIS_XPBYJRIS_H #define BLIS_XPBYJRIS_H // xpbyjris #define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type 
of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjris bli_rxxpbyjris #define bli_dssxpbyjris bli_rxxpbyjris #define bli_cssxpbyjris bli_rxxpbyjris #define bli_zssxpbyjris bli_rxxpbyjris #define bli_sdsxpbyjris bli_rxxpbyjris #define bli_ddsxpbyjris bli_rxxpbyjris #define bli_cdsxpbyjris bli_rxxpbyjris #define bli_zdsxpbyjris bli_rxxpbyjris #define bli_scsxpbyjris bli_rxxpbyjris #define bli_dcsxpbyjris bli_rxxpbyjris #define bli_ccsxpbyjris bli_rxxpbyjris #define bli_zcsxpbyjris bli_rxxpbyjris #define bli_szsxpbyjris bli_rxxpbyjris #define bli_dzsxpbyjris bli_rxxpbyjris #define bli_czsxpbyjris bli_rxxpbyjris #define bli_zzsxpbyjris bli_rxxpbyjris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjris bli_rxxpbyjris #define bli_dsdxpbyjris bli_rxxpbyjris #define bli_csdxpbyjris bli_rxxpbyjris #define bli_zsdxpbyjris bli_rxxpbyjris #define bli_sddxpbyjris bli_rxxpbyjris #define bli_dddxpbyjris bli_rxxpbyjris #define bli_cddxpbyjris bli_rxxpbyjris #define bli_zddxpbyjris bli_rxxpbyjris #define bli_scdxpbyjris bli_rxxpbyjris #define bli_dcdxpbyjris bli_rxxpbyjris #define bli_ccdxpbyjris bli_rxxpbyjris #define bli_zcdxpbyjris bli_rxxpbyjris #define bli_szdxpbyjris bli_rxxpbyjris #define bli_dzdxpbyjris bli_rxxpbyjris #define bli_czdxpbyjris bli_rxxpbyjris #define bli_zzdxpbyjris bli_rxxpbyjris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjris bli_rxxpbyjris #define bli_dscxpbyjris bli_rxxpbyjris #define bli_cscxpbyjris bli_crxpbyjris #define bli_zscxpbyjris bli_crxpbyjris #define bli_sdcxpbyjris bli_rxxpbyjris #define bli_ddcxpbyjris bli_rxxpbyjris #define bli_cdcxpbyjris bli_crxpbyjris #define bli_zdcxpbyjris bli_crxpbyjris #define bli_sccxpbyjris bli_cxxpbyjris #define bli_dccxpbyjris bli_cxxpbyjris #define bli_cccxpbyjris 
bli_cxxpbyjris #define bli_zccxpbyjris bli_cxxpbyjris #define bli_szcxpbyjris bli_cxxpbyjris #define bli_dzcxpbyjris bli_cxxpbyjris #define bli_czcxpbyjris bli_cxxpbyjris #define bli_zzcxpbyjris bli_cxxpbyjris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjris bli_rxxpbyjris #define bli_dszxpbyjris bli_rxxpbyjris #define bli_cszxpbyjris bli_crxpbyjris #define bli_zszxpbyjris bli_crxpbyjris #define bli_sdzxpbyjris bli_rxxpbyjris #define bli_ddzxpbyjris bli_rxxpbyjris #define bli_cdzxpbyjris bli_crxpbyjris #define bli_zdzxpbyjris bli_crxpbyjris #define bli_sczxpbyjris bli_cxxpbyjris #define bli_dczxpbyjris bli_cxxpbyjris #define bli_cczxpbyjris bli_cxxpbyjris #define bli_zczxpbyjris bli_cxxpbyjris #define bli_szzxpbyjris bli_cxxpbyjris #define bli_dzzxpbyjris bli_cxxpbyjris #define bli_czzxpbyjris bli_cxxpbyjris #define bli_zzzxpbyjris bli_cxxpbyjris #define bli_sxpbyjris bli_sssxpbyjris #define bli_dxpbyjris bli_dddxpbyjris #define bli_cxpbyjris bli_cccxpbyjris #define bli_zxpbyjris bli_zzzxpbyjris #endif // end bli_xpbyjris.h // Inlined scalar macros in loops // begin bli_scal2ris_mxn.h #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j 
)*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif // end bli_scal2ris_mxn.h // begin bli_scalris_mxn_uplo.h #ifndef 
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) ) #define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) ) #define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zcabsq2s( x, a ) 
bli_zcsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) ) #define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) ) #define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a ) #define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a ) #define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a ) #define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a ) #endif // end bli_absq2s.h // begin bli_abval2s.h #ifndef BLIS_ABVAL2S_H #define BLIS_ABVAL2S_H // abval2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabval2s( x, a ) 
bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) ) #define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) ) #define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) ) #define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) ) #define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) ) #define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) ) #define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) ) #define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) ) #define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) ) #define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) ) #define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) ) #define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) ) #define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) ) #define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) ) #define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) ) #define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabval2s( x, a ) bli_ssabval2s( x, a ) #define bli_dabval2s( x, a ) bli_ddabval2s( x, a ) #define bli_cabval2s( x, a ) bli_ccabval2s( x, a ) #define bli_zabval2s( x, a ) bli_zzabval2s( x, a ) #endif // end bli_abval2s.h // begin bli_adds.h #ifndef BLIS_ADDS_H #define BLIS_ADDS_H // adds // Notes: // - The first char 
encodes the type of a. // - The second char encodes the type of y. #define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) { (y) += (a); } #define bli_dcadds( a, y ) { (y) += (a); } #define bli_ccadds( a, y ) { (y) += (a); } #define bli_zcadds( a, y ) { (y) += (a); } #define bli_szadds( a, y ) { (y) += (a); } #define bli_dzadds( a, y ) { (y) += (a); } #define bli_czadds( a, y ) { (y) += (a); } #define 
bli_zzadds( a, y ) { (y) += (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadds( a, y ) bli_ssadds( a, y ) #define bli_dadds( a, y ) bli_ddadds( a, y ) #define bli_cadds( a, y ) bli_ccadds( a, y ) #define bli_zadds( a, y ) bli_zzadds( a, y ) #endif // end bli_adds.h // begin bli_addjs.h #ifndef BLIS_ADDJS_H #define BLIS_ADDJS_H // addjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzaddjs( a, y ) 
bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) { (y) += (a); } #define bli_dcaddjs( a, y ) { (y) += (a); } #define bli_ccaddjs( a, y ) { (y) += conjf(a); } #define bli_zcaddjs( a, y ) { (y) += conj (a); } #define bli_szaddjs( a, y ) { (y) += (a); } #define bli_dzaddjs( a, y ) { (y) += (a); } #define bli_czaddjs( a, y ) { (y) += conjf(a); } #define bli_zzaddjs( a, y ) { (y) += conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saddjs( a, y ) bli_ssaddjs( a, y ) #define bli_daddjs( a, y ) bli_ddaddjs( a, y ) #define bli_caddjs( a, y ) bli_ccaddjs( a, y ) #define bli_zaddjs( a, y ) bli_zzaddjs( a, y ) #endif // end bli_addjs.h // begin bli_add3s.h #ifndef BLIS_ADD3S_H #define BLIS_ADD3S_H // add3s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of b. // - The third char encodes the type of c. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), 
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) /* add3s macro table (continued). Each bli_<ta><tb><tc>add3s( a, b, c ) reads a through the <ta> accessors, b through the <tb> accessors and c through the <tc> accessors. The remaining (??d) entries below all forward to bli_dadd3ris, which is defined elsewhere. */ #define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) /* When BLIS_ENABLE_C99_COMPLEX is not defined, the (??c) and (??z) entries forward the real/imag accessors of each operand to a bli_?add3ris macro: bli_sadd3ris (for ??c) or bli_dadd3ris (for ??z) when both a and b are s or d, and bli_cadd3ris or bli_zadd3ris otherwise. The (axy) in the section banners corresponds to the (a, b, c) parameters used here. */ #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), 
bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define 
bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) /* When BLIS_ENABLE_C99_COMPLEX is defined, every (??c) and (??z) entry below is the same braced assignment (c) = (a) + (b), applied to the operands directly rather than to their real/imag accessors. */ #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zscadd3s( a, 
b, c ) { (c) = (a) + (b); } #define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); } #endif // BLIS_ENABLE_C99_COMPLEX /* Same-type shorthands: bli_<t>add3s forwards its three arguments unchanged to bli_<t><t><t>add3s. */ #define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c ) #define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c ) #define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c ) #define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c ) #endif // end bli_add3s.h // begin bli_axpbys.h #ifndef BLIS_AXPBYS_H #define BLIS_AXPBYS_H // axpbys // Notes: // - The first char encodes the type of a. 
// - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), 
bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), 
bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), 
bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) /* axpbys variants whose y is read via bli_creal/bli_cimag and whose b is read via bli_zreal/bli_zimag: each one hands the real and imaginary parts of a, x, b and y to bli_cxaxpbyris, with the accessor pair for a and x taken from the first and second prefix letters. */ #define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) /* Last two axpbys variants whose y is read via bli_creal/bli_cimag; both forward the real and imaginary parts of a, x, b and y to bli_cxaxpbyris. */ #define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) /* From the next section on, y is read via bli_zreal/bli_zimag; the first three prefix letters still select the accessor pairs for a, x and b. */ // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) /* axpbys variants whose y is read via bli_zreal/bli_zimag: each one hands the real and imaginary parts of a, x, b and y to bli_cxaxpbyris, picking every bli_?real/bli_?imag accessor pair from the matching prefix letter (1st = a, 2nd = x, 3rd = b, 4th = y). */ #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) /* axpbys variants whose y is read via bli_zreal/bli_zimag: each one hands the real and imaginary parts of a, x, b and y to bli_cxaxpbyris, picking every bli_?real/bli_?imag accessor pair from the matching prefix letter (1st = a, 2nd = x, 3rd = b, 4th = y). */ #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) /* axpbys variants whose y is read via bli_zreal/bli_zimag: each one hands the real and imaginary parts of a, x, b and y to bli_cxaxpbyris, picking every bli_?real/bli_?imag accessor pair from the matching prefix letter (1st = a, 2nd = x, 3rd = b, 4th = y). */ #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) /* axpbys variants whose y is read via bli_zreal/bli_zimag: each one hands the real and imaginary parts of a, x, b and y to bli_cxaxpbyris, picking every bli_?real/bli_?imag accessor pair from the matching prefix letter (1st = a, 2nd = x, 3rd = b, 4th = y). */ #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) /* Last axpbys variants of the component-wise form: y and b are read via bli_zreal/bli_zimag and everything is forwarded to bli_cxaxpbyris. */ #define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) /* Alternate branch: the same macro names are redefined as the direct assignment (y) = (a) * (x) + (b) * (y). Per the trailing comment on the #else this is the BLIS_ENABLE_C99_COMPLEX case; the opening conditional lies outside this chunk, so confirm there. */ #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccscaxpbys( 
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } /* In this branch every variant has the identical body (y) = (a) * (x) + (b) * (y); the four prefix letters only distinguish the macro names. */ #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } /* In this branch every variant has the identical body (y) = (a) * (x) + (b) * (y); the four prefix letters only distinguish the macro names. */ #define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } /* In this branch every variant has the identical body (y) = (a) * (x) + (b) * (y); the four prefix letters only distinguish the macro names. */ #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } /* In this branch every variant has the identical body (y) = (a) * (x) + (b) * (y); the four prefix letters only distinguish the macro names. */ #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } /* Final variants of this branch; each still expands to (y) = (a) * (x) + (b) * (y). */ #define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX /* Same-type shorthands: bli_?axpbys expands to the variant whose four prefix letters all equal that type letter (ssss/dddd are defined outside this chunk). */ #define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y ) #define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y ) #define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y ) #define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y ) #endif // end bli_axpbys.h // begin bli_axpbyjs.h #ifndef BLIS_AXPBYJS_H #define BLIS_AXPBYJS_H // axpbyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- /* Each axpbyjs variant below passes the real and imaginary parts of a, x, b and y to bli_rxaxpbyjris, with y read via bli_sreal/bli_simag. NOTE(review): bli_rxaxpbyjris is defined elsewhere; what it computes is not visible here. */ #define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), 
bli_simag(y) ) /* axpbyjs variants whose y is read via bli_sreal/bli_simag: each one hands the real and imaginary parts of a, x, b and y to bli_rxaxpbyjris, picking every bli_?real/bli_?imag accessor pair from the matching prefix letter (1st = a, 2nd = x, 3rd = b, 4th = y). */ #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) /* axpbyjs variants whose y is read via bli_sreal/bli_simag and whose b is read via bli_dreal/bli_dimag: each one hands the real and imaginary parts of a, x, b and y to bli_rxaxpbyjris, with the accessor pair for a and x taken from the first and second prefix letters. */ #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) /* axpbyjs variants whose y is read via bli_sreal/bli_simag: each one hands the real and imaginary parts of a, x, b and y to bli_rxaxpbyjris, picking every bli_?real/bli_?imag accessor pair from the matching prefix letter (1st = a, 2nd = x, 3rd = b, 4th = y). */ #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) /* axpbyjs variants whose y is read via bli_sreal/bli_simag: each one hands the real and imaginary parts of a, x, b and y to bli_rxaxpbyjris, picking every bli_?real/bli_?imag accessor pair from the matching prefix letter (1st = a, 2nd = x, 3rd = b, 4th = y). */ #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) /* Last axpbyjs variants whose y is read via bli_sreal/bli_simag; b is read via bli_zreal/bli_zimag and everything is forwarded to bli_rxaxpbyjris. */ #define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) /* From the next section on, y is read via bli_dreal/bli_dimag; the target helper is still bli_rxaxpbyjris. */ // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) #define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) #define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) #define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) #endif // end bli_axpbyjs.h // begin bli_axpys.h #ifndef BLIS_AXPYS_H #define BLIS_AXPYS_H // axpys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), 
bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } #define 
bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) #define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) #define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) #define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) #endif // end bli_axpys.h // begin bli_axpyjs.h #ifndef BLIS_AXPYJS_H #define BLIS_AXPYJS_H // axpyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// - Each typed macro below is named bli_<a><x><y>axpyjs and either forwards
//   the real and imaginary parts of a, x, and y (read with the
//   bli_?real/bli_?imag accessor matching that operand's type char) to a
//   bli_?axpyjris helper, or, when BLIS_ENABLE_C99_COMPLEX is defined and y is
//   complex (c or z), expands inline to { (y) += (a) * (x); } with x wrapped in
//   conjf() when x is of type c and in conj() when x is of type z.
// - Helper selection when BLIS_ENABLE_C99_COMPLEX is not defined: y of type s
//   or d always uses bli_saxpyjris or bli_daxpyjris. For y of type c (z), a
//   complex a (c or z) selects bli_caxpyjris (bli_zaxpyjris); a real a (s or d)
//   selects bli_saxpyjris (bli_daxpyjris) when x is real and bli_scaxpyjris
//   (bli_dzaxpyjris) when x is complex.
// - bli_saxpyjs, bli_daxpyjs, bli_caxpyjs, and bli_zaxpyjs are shorthands for
//   the all-same-type variants (sss, ddd, ccc, zzz).
// - NOTE(review): the bli_?axpyjris helpers are defined outside this section;
//   presumably they compute y += a * conj(x) on split real/imag parts --
//   confirm against their definitions.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( 
bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), 
bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) #define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpyjs( a, x, y ) 
bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y ) #define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y ) #define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y ) #define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y ) #endif // end bli_axpyjs.h // begin bli_axmys.h #ifndef BLIS_AXMYS_H #define BLIS_AXMYS_H // axmys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// axmys: typed scalar macros of the form bli_<a><x><y>axmys( a, x, y ).
// The three type characters in each name (s, d, c, z) pick the accessor
// macros used to split a, x and y into real/imaginary parts, and the
// bli_?axmyris() worker that receives those six parts. In the
// BLIS_ENABLE_C99_COMPLEX variants further down, the c- and z-typed
// destinations are instead updated directly as (y) -= (a) * (x).
// NOTE(review): the bli_?axmyris() workers are defined outside this section;
// presumably they perform the same y -= a * x on split parts -- confirm.

// -- (axy) = (??s) ------------------------------------------------------------
// Destination y is s-typed: every combination goes through bli_saxmyris().
#define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------
// Destination y is d-typed: every combination goes through bli_daxmyris().
#define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Complex destinations come in two flavours: split real/imag workers when
// BLIS_ENABLE_C99_COMPLEX is not defined, native complex arithmetic otherwise.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------
// Worker selection for a c-typed y, as spelled out by the table below:
//   a real (s/d), x real (s/d)    -> bli_saxmyris()
//   a real (s/d), x complex (c/z) -> bli_scaxmyris()
//   a complex (c/z), any x        -> bli_caxmyris()
#define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------
// Worker selection for a z-typed y, mirroring the c-typed table:
//   a real (s/d), x real (s/d)    -> bli_daxmyris()
//   a real (s/d), x complex (c/z) -> bli_dzaxmyris()
//   a complex (c/z), any x        -> bli_zaxmyris()
#define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex types the update is written with native operators. All
// type combinations share the same expansion; any conversion between operand
// types is left to the compiler's usual arithmetic conversions.
// Each macro expands to a brace block and evaluates every argument once.

// -- (axy) = (??c) ------------------------------------------------------------
#define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shortcuts: all three operands share one datatype.
#define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y )
#define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y )
#define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y )
#define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y )

#endif
// end bli_axmys.h
// begin bli_conjs.h


#ifndef BLIS_CONJS_H
#define BLIS_CONJS_H

// conjs
// bli_?conjs( x ): in-place conjugation of the scalar x. The leading type
// character selects the accessors and the bli_?conjris() worker.
// NOTE(review): bli_?conjris() is defined outside this section; for the real
// types (s, d) it is presumably a no-op -- confirm.

#define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) )
#define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex support, pass the split real/imag parts to the worker.
#define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) )
#define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex support, overwrite x with its conjugate: conjf() for the
// c type, conj() for the z type. x is evaluated twice by these expansions.
#define bli_cconjs( x ) { (x) = conjf(x); }
#define bli_zconjs( x ) { (x) = conj (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

#endif
// end bli_conjs.h
// begin bli_copys.h


#ifndef BLIS_COPYS_H
#define BLIS_COPYS_H

// copys

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// bli_<x><y>copys( x, y ): typed scalar copy of x into y. The worker is chosen
// by the type of y (bli_scopyris, bli_dcopyris, bli_ccopyris, bli_zcopyris);
// the accessors are chosen per operand type.
// NOTE(review): the bli_?copyris() workers are defined outside this section.

// y is s-typed.
#define bli_sscopys( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopys( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopys( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopys( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is d-typed.
#define bli_sdcopys( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopys( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopys( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopys( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// y is c-typed.
// NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero.
#define bli_sccopys( x, y ) bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopys( x, y ) bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopys( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopys( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// y is z-typed.
// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer variant: plain assignment of x to y through a cast to gint_t.
#define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); }

// Single-character shortcuts: x and y share one datatype.
#define bli_scopys( x, y ) bli_sscopys( x, y )
#define bli_dcopys( x, y ) bli_ddcopys( x, y )
#define bli_ccopys( x, y ) bli_cccopys( x, y )
#define bli_zcopys( x, y ) bli_zzcopys( x, y )
#define bli_icopys( x, y ) bli_iicopys( x, y )

#endif
// end bli_copys.h
// begin bli_copyjs.h


#ifndef BLIS_COPYJS_H
#define BLIS_COPYJS_H

// copyjs
// bli_<x><y>copyjs( x, y ): typed scalar copy of the conjugate of x into y
// (the C99 variants below assign conjf(x)/conj(x) for complex x and plain x
// for real x). The worker is chosen by the type of y.
// NOTE(review): the bli_?copyjris() workers are defined outside this section.

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

// y is s-typed.
#define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is d-typed.
#define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex y without C99 complex support: split parts go to the worker.
#define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex y with C99 complex support: real x is assigned as-is; complex x is
// conjugated with conjf() (c-typed x) or conj() (z-typed x), i.e. the
// function is picked by the type of x, not of y.
#define bli_sccopyjs( x, y ) { (y) = (x); }
#define bli_dccopyjs( x, y ) { (y) = (x); }
#define bli_cccopyjs( x, y ) { (y) = conjf(x); }
#define bli_zccopyjs( x, y ) { (y) = conj (x); }

#define bli_szcopyjs( x, y ) { (y) = (x); }
#define bli_dzcopyjs( x, y ) { (y) = (x); }
#define bli_czcopyjs( x, y ) { (y) = conjf(x); }
#define bli_zzcopyjs( x, y ) { (y) = conj (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Integer variant: same expansion as the non-conjugating integer copy.
#define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); }

// Single-character shortcuts: x and y share one datatype.
#define bli_scopyjs( x, y ) bli_sscopyjs( x, y )
#define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y )
#define bli_ccopyjs( x, y ) bli_cccopyjs( x, y )
#define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y )
#define bli_icopyjs( x, y ) bli_iicopyjs( x, y )

#endif
// end bli_copyjs.h
// begin bli_copycjs.h


#ifndef BLIS_COPYCJS_H
#define BLIS_COPYCJS_H

// copycjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// bli_<x><y>copycjs( conjx, x, y ): typed scalar copy of x into y with
// conjugation made conditional on conjx (the C99 variants below test it with
// bli_is_conj()). The worker is chosen by the type of y and receives conjx
// unchanged as its first argument.
// NOTE(review): the bli_?copycjris() workers and bli_is_conj() are defined
// outside this section.

// y is s-typed.
#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is d-typed.
#define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex y without C99 complex support: split parts go to the worker.
#define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex y with C99 complex support. For real x the conjx argument is
// ignored and x is assigned as-is; for complex x the conjugate (conjf() for
// c-typed x, conj() for z-typed x) is assigned only when bli_is_conj( conjx )
// holds, otherwise x itself.
#define bli_sccopycjs( conjx, x, y ) { (y) = (x); }
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); }
#define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#define bli_szcopycjs( conjx, x, y ) { (y) = (x); }
#define bli_dzcopycjs( conjx, x, y ) { (y) = (x); }
#define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Integer variant: conjx is ignored; x is assigned through a cast to gint_t.
#define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); }

// Single-character shortcuts: x and y share one datatype.
#define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y )
#define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y )
#define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y )
#define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y )
#define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y )

#endif
// end bli_copycjs.h
// begin bli_copynzs.h


#ifndef BLIS_COPYNZS_H
#define BLIS_COPYNZS_H

// copynzs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// bli_<x><y>copynzs( x, y ): typed scalar copy of x into y that reuses the
// bli_?copyris() workers. When x is real and y is complex, the real-typed
// worker (scopyris/dcopyris) is used instead of the complex one, so that the
// imaginary part of y is left alone (see the NOTEs below).
// NOTE(review): the bli_?copyris() workers are defined outside this section.

// y is s-typed.
#define bli_sscopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopynzs( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopynzs( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is d-typed.
#define bli_sdcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopynzs( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopynzs( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// y is c-typed.
// NOTE: Use of scopyris() is so we don't touch the imaginary part of y.
#define bli_sccopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopynzs( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopynzs( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// y is z-typed.
// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer variant: plain assignment of x to y through a cast to gint_t.
#define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); }

// Single-character shortcuts: x and y share one datatype.
#define bli_scopynzs( x, y ) bli_sscopynzs( x, y )
#define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y )
#define bli_ccopynzs( x, y ) bli_cccopynzs( x, y )
#define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y )
#define bli_icopynzs( x, y ) bli_iicopynzs( x, y )

#endif
// end bli_copynzs.h
// begin bli_copyjnzs.h


#ifndef BLIS_COPYJNZS_H
#define BLIS_COPYJNZS_H

// copyjnzs
// bli_<x><y>copyjnzs( x, y ): counterpart of the copynzs macros built on the
// bli_?copyjris() workers instead of bli_?copyris().
// NOTE(review): the bli_?copyjris() workers are defined outside this section;
// presumably they copy the conjugate of x -- confirm.

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

// y is s-typed.
#define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is d-typed.
#define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// y is c-typed.
// NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we
// don't touch the imaginary part of y.
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// y is z-typed.
// NOTE: dcopyjris() (implemented in terms of dcopyris()) is used for real x
// so that we don't touch the imaginary part of y.
#define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer variant: plain assignment of x to y through a cast to gint_t.
#define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); }

// Single-character shortcuts: x and y share one datatype.
#define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y )
#define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y )
#define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y )
#define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y )
#define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y )

#endif
// end bli_copyjnzs.h
// begin bli_dots.h


#ifndef BLIS_DOTS_H
#define BLIS_DOTS_H

// dots

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - The third char encodes the type of rho.
#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a ) #define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a ) #define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a ) #define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a ) #define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a ) #define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a ) #define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a ) #define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a ) #define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a ) #define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a ) #define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a ) #define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a ) #define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a ) #define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a ) #define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a ) #define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a ) #define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a ) #define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a ) #define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a ) #define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a ) #define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a ) #define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a ) #define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a ) #define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a ) #define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a ) #define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a ) #define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a ) #define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a ) #define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a ) #define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a ) #define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a ) #define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a ) #define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a ) #define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a ) #define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a ) #define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a ) #define 
bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a ) #define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a ) #define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a ) #define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a ) #define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a ) #define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a ) #define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a ) #define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a ) #define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a ) #define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a ) #define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a ) #define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a ) #define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a ) #define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a ) #define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a ) #define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a ) #define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a ) #define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a ) #define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a ) #define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a ) #define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a ) #define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a ) #define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a ) #define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a ) #define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a ) #define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a ) #define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a ) #define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a ) #define bli_sdots( x, y, a ) bli_sssdots( x, y, a ) #define bli_ddots( x, y, a ) bli_ddddots( x, y, a ) #define bli_cdots( x, y, a ) bli_cccdots( x, y, a ) #define bli_zdots( x, y, a ) bli_zzzdots( x, y, a ) #endif // end bli_dots.h // begin bli_dotjs.h #ifndef BLIS_DOTJS_H #define BLIS_DOTJS_H // dotjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
// - x is used in conjugated form.

// Every dotjs variant forwards to an axpyjs macro with x and y exchanged in
// the argument list. The first two type characters of the target name are
// exchanged to match; the third character is unchanged.
#define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a )
#define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a )
#define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a )
#define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a )
#define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a )
#define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a )
#define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a )
#define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a )
#define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a )
#define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a )
#define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a )
#define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a )
#define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a )
#define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a )
#define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a )
#define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a )
#define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a )
#define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a )
#define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a )
#define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a )
#define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a )
#define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a )
#define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a )
#define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a )
#define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a )
#define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a )
#define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a )
#define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a )
#define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a )
#define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a )
#define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a )
#define bli_zzddotjs( x, y, a ) bli_zzdaxpyjs( y, x, a )
#define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a )
#define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a )
#define bli_cscdotjs( x, y, a ) bli_sccaxpyjs( y, x, a )
#define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a )
#define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a )
#define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a )
#define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a )
#define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a )
#define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a )
#define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a )
#define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a )
#define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a )
#define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a )
#define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a )
#define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a )
#define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a )
#define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a )
#define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a )
#define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a )
#define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a )
#define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a )
#define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a )
#define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a )
#define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a )
#define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a )
#define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a )
#define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a )
#define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a )
#define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a )
#define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a )
#define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a )
#define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a )

// Homogeneous shorthands: a single type letter selects the variant whose
// three type characters are all that letter.
#define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a )
#define bli_ddotjs( x, y, a ) bli_ddddotjs( x, y, a )
#define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a )
#define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a )

#endif
// end bli_dotjs.h

// begin bli_eq.h
#ifndef BLIS_EQ_H
#define BLIS_EQ_H

// eq (passed by value)
// Real and integer variants compare with ==. When BLIS_ENABLE_C99_COMPLEX is
// not defined, the complex variants compare real and imaginary parts
// separately through the creal/cimag (zreal/zimag) accessors; otherwise they
// also compare with ==.
#define bli_seq( a, b ) ( (a) == (b) )
#define bli_deq( a, b ) ( (a) == (b) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) )
#define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_ceq( a, b ) ( (a) == (b) )
#define bli_zeq( a, b ) ( (a) == (b) )
#endif // BLIS_ENABLE_C99_COMPLEX
#define bli_ieq( a, b ) ( (a) == (b) )

// eqtori (passed by value)
// Compares a against a value given as separate real (br) and imaginary (bi)
// parts. The s and d variants compare only against br; bi is not used.
#define bli_seqtori( a, br, bi ) ( (a) == (br) )
#define bli_deqtori( a, br, bi ) ( (a) == (br) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) )
#define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )
#define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )
#endif // BLIS_ENABLE_C99_COMPLEX

// eqa (passed by address)
// Casts both pointer arguments to the element type, dereferences them, and
// delegates to the matching by-value eq macro.
#define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) )
#define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) )
#define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) )
#define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) )
#define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) )

// eq1
// True when a equals one (real part 1, imaginary part 0).
#define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F )
#define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 )
#define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F )
#define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 )
#define bli_ieq1( a ) bli_ieq ( (a), 1 )

// eq0
// True when a equals zero (real part 0, imaginary part 0).
#define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F )
#define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 )
#define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F )
#define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 )
#define bli_ieq0( a ) bli_ieq ( (a), 0 )

// eqm1
// True when a equals minus one (real part -1, imaginary part 0).
#define bli_seqm1( a ) bli_seqtori( (a), -1.0F, 0.0F )
#define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 )
#define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F )
#define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 )
#define bli_ieqm1( a ) bli_ieq ( (a), -1 )

#endif
// end bli_eq.h

// begin bli_fprints.h
#ifndef BLIS_FPRINTS_H
#define BLIS_FPRINTS_H

// prints
// Each macro expands to a braced block of fprintf calls on file, using spec
// as the format string for every printed component. The complex variants
// print the real part, the literal " + ", the imaginary part, and then a
// single space.
#define bli_sfprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}
#define bli_dfprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}
#define bli_cfprints( file, spec, x ) \
{ \
    fprintf( file, spec, bli_creal(x) ); \
    fprintf( file, " + " ); \
    fprintf( file, spec, bli_cimag(x) ); \
    fprintf( file, " " ); \
}
#define bli_zfprints( file, spec, x ) \
{ \
    fprintf( file, spec, bli_zreal(x) ); \
    fprintf( file, " + " ); \
    fprintf( file, spec, bli_zimag(x) ); \
    fprintf( file, " " ); \
}
#define bli_ifprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}

#endif
// end bli_fprints.h

// begin bli_inverts.h
#ifndef BLIS_INVERTS_H
#define BLIS_INVERTS_H

// inverts
// Notes:
// - The first char encodes the type of x.
// - The real variants, and the complex variants when BLIS_ENABLE_C99_COMPLEX
//   is not defined, pass the real/imag accessors of x to the matching
//   invertris macro. With C99 complex, x is overwritten with 1 / x.
#define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) )
#define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) )
#define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_cinverts( x ) { (x) = 1.0F / (x); }
#define bli_zinverts( x ) { (x) = 1.0 / (x); }
#endif // BLIS_ENABLE_C99_COMPLEX

#endif
// end bli_inverts.h

// begin bli_invscals.h
#ifndef BLIS_INVSCALS_H
#define BLIS_INVSCALS_H

// invscals
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Variants with a real y (second char s or d) always pass the real/imag
// accessors of a and y to bli_sinvscalris or bli_dinvscalris.
#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

// Variants with a complex y: component-wise helpers when
// BLIS_ENABLE_C99_COMPLEX is not defined, in-place division of y by a
// otherwise.
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscals( a, y ) bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_scinvscals( a, y ) { (y) /= (a); }
#define bli_dcinvscals( a, y ) { (y) /= (a); }
#define bli_ccinvscals( a, y ) { (y) /= (a); }
#define bli_zcinvscals( a, y ) { (y) /= (a); }
#define bli_szinvscals( a, y ) { (y) /= (a); }
#define bli_dzinvscals( a, y ) { (y) /= (a); }
#define bli_czinvscals( a, y ) { (y) /= (a); }
#define bli_zzinvscals( a, y ) { (y) /= (a); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Homogeneous shorthands (a and y share one type).
#define bli_sinvscals( a, y ) bli_ssinvscals( a, y )
#define bli_dinvscals( a, y ) bli_ddinvscals( a, y )
#define bli_cinvscals( a, y ) bli_ccinvscals( a, y )
#define bli_zinvscals( a, y ) bli_zzinvscals( a, y )

#endif
// end bli_invscals.h

// begin bli_invscaljs.h
#ifndef BLIS_INVSCALJS_H
#define BLIS_INVSCALJS_H

// invscaljs
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - Same layout as invscals, but dispatching to the invscaljris helpers. In
//   the C99 complex branch a complex a is conjugated (conjf/conj) before the
//   division, while a real a is used as is.
#define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_scinvscaljs( a, y ) { (y) /= (a); }
#define bli_dcinvscaljs( a, y ) { (y) /= (a); }
#define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zcinvscaljs( a, y ) { (y) /= conj (a); }
#define bli_szinvscaljs( a, y ) { (y) /= (a); }
#define bli_dzinvscaljs( a, y ) { (y) /= (a); }
#define bli_czinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zzinvscaljs( a, y ) { (y) /= conj (a); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Homogeneous shorthands (a and y share one type).
#define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y )
#define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y )
#define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y )
#define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y )

#endif
// end bli_invscaljs.h

// begin bli_neg2s.h
#ifndef BLIS_NEG2S_H
#define BLIS_NEG2S_H

// neg2s
// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Variants with a real y pass the real/imag accessors of x and y to
// bli_sneg2ris or bli_dneg2ris.
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Variants with a complex y: component-wise helpers when
// BLIS_ENABLE_C99_COMPLEX is not defined, otherwise y is assigned -(x).
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_scneg2s( x, y ) { (y) = -(x); }
#define bli_dcneg2s( x, y ) { (y) = -(x); }
#define bli_ccneg2s( x, y ) { (y) = -(x); }
#define bli_zcneg2s( x, y ) { (y) = -(x); }
#define bli_szneg2s( x, y ) { (y) = -(x); }
#define bli_dzneg2s( x, y ) { (y) = -(x); }
#define bli_czneg2s( x, y ) { (y) = -(x); }
#define bli_zzneg2s( x, y ) { (y) = -(x); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Homogeneous shorthands (x and y share one type).
#define bli_sneg2s( x, y ) bli_ssneg2s( x, y )
#define bli_dneg2s( x, y ) bli_ddneg2s( x, y )
#define bli_cneg2s( x, y ) bli_ccneg2s( x, y )
#define bli_zneg2s( x, y ) bli_zzneg2s( x, y )

#endif
// end bli_neg2s.h

// begin bli_rands.h
#ifndef BLIS_RANDS_H
#define BLIS_RANDS_H

// rands
// The real variants assign a = rand() / ( RAND_MAX / 2 ) - 1, computed in
// double, which lies in [-1, 1]. The complex variants draw the real and
// imaginary parts independently with the real macro and store them with
// bli_csets / bli_zsets. Each macro expands to a braced block; the complex
// ones declare locals named ar and ai inside it.
#define bli_srands( a ) \
{ \
    (a) = ( float ) ( ( double ) rand() / \
                      ( ( double ) RAND_MAX / 2.0 ) \
                    ) - 1.0F; \
}
#define bli_drands( a ) \
{ \
    (a) = ( double ) ( ( double ) rand() / \
                       ( ( double ) RAND_MAX / 2.0 ) \
                     ) - 1.0; \
}
#define bli_crands( a ) \
{ \
    float ar, ai; \
    \
    bli_srands( ar ); \
    bli_srands( ai ); \
    \
    bli_csets( ar, ai, (a) ); \
}
#define bli_zrands( a ) \
{ \
    double ar, ai; \
    \
    bli_drands( ar ); \
    bli_drands( ai ); \
    \
    bli_zsets( ar, ai, (a) ); \
}

#endif
// end bli_rands.h

// begin bli_randnp2s.h
#ifndef BLIS_RANDNP2S_H
#define BLIS_RANDNP2S_H

// randnp2s
// The single-precision variant reuses the double-precision macro; the
// double result is converted by the assignment to a.
#define bli_srandnp2s( a ) \
{ \
    bli_drandnp2s( a ); \
}

// Disabled earlier implementation, excluded from compilation by #if 0.
#if 0
#define bli_drandnp2s_prev( a ) \
{ \
    const double m_max = 3.0; \
    const double m_max2 = m_max + 2.0; \
    double t; \
    double r_val; \
    \
    \
    \
    \
    t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \
    \
    \
    if ( t == m_max2 ) t = t - 1.0; \
    \
    \
    t = floor( t ); \
    \
    \
    if ( t == 0.0 ) r_val = 0.0; \
    else \
    { \
        \
        \
        double s_exp, s_val; \
        \
        \
        PASTEMAC(d,rands)( s_exp ); \
        PASTEMAC(d,rands)( s_val ); \
        \
        \
        if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \
        else r_val = pow( 2.0, t - 1.0 ); \
        \
        \
        if ( s_val < 0.0 ) r_val = -r_val; \
    } \
    \
    \
    r_val = r_val / pow( 2.0, m_max ); \
    \
    \
    \
    a = r_val; \
}
#endif

// Assigns to a either zero or a signed power of two:
// - t is floor( rand() / RAND_MAX * 8 ), redrawn while t >= 8, so t is an
//   integer value in 0..7.
// - t == 0 yields 0.0; otherwise the magnitude is 2^-(t-1), i.e. one of
//   1, 1/2, ..., 1/64.
// - The sign is negated when a second draw made through PASTEMAC(d,rands)
//   is negative (PASTEMAC is defined elsewhere; presumably this resolves to
//   bli_drands -- confirm).
// Note that a is assigned without surrounding parentheses, so callers
// should pass a plain lvalue.
#define bli_drandnp2s( a ) \
{ \
    const double m_max = 6.0; \
    const double m_max2 = m_max + 2.0; \
    double t; \
    double r_val; \
    \
    \
    \
    do \
    { \
        \
        t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \
        \
        \
        t = floor( t ); \
    } \
    \
    while ( m_max2 <= t ); \
    \
    \
    if ( t == 0.0 ) r_val = 0.0; \
    else \
    { \
        \
        \
        double s_val; \
        \
        \
        r_val = pow( 2.0, -(t - 1.0) ); \
        \
        \
        PASTEMAC(d,rands)( s_val ); \
        \
        \
        if ( s_val < 0.0 ) r_val = -r_val; \
    } \
    \
    \
    \
    a = r_val; \
}

// Complex variants: draw the real and imaginary parts independently with
// the real macro and store them with bli_csets / bli_zsets.
#define bli_crandnp2s( a ) \
{ \
    float ar, ai; \
    \
    bli_srandnp2s( ar ); \
    bli_srandnp2s( ai ); \
    \
    bli_csets( ar, ai, (a) ); \
}
#define bli_zrandnp2s( a ) \
{ \
    double ar, ai; \
    \
    bli_drandnp2s( ar ); \
    bli_drandnp2s( ai ); \
    \
    bli_zsets( ar, ai, (a) ); \
}

#endif
// end bli_randnp2s.h

// begin bli_scals.h
#ifndef BLIS_SCALS_H
#define BLIS_SCALS_H

// scals
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - Variants with a real y, and all variants when BLIS_ENABLE_C99_COMPLEX is
//   not defined, pass the real/imag accessors of a and y to a scalris
//   helper. With C99 complex, complex-y variants expand to y *= a.
#define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_scscals( a, y ) { (y) *= (a); }
#define bli_dcscals( a, y ) { (y) *= (a); }
#define bli_ccscals( a, y ) { (y) *= (a); }
#define bli_zcscals( a, y ) { (y) *= (a); }
#define bli_szscals( a, y ) { (y) *= (a); }
#define bli_dzscals( a, y ) { (y) *= (a); }
#define bli_czscals( a, y ) { (y) *= (a); }
#define bli_zzscals( a, y ) { (y) *= (a); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Homogeneous shorthands (a and y share one type).
#define bli_sscals( a, y ) bli_ssscals( a, y )
#define bli_dscals( a, y ) bli_ddscals( a, y )
#define bli_cscals( a, y ) bli_ccscals( a, y )
#define bli_zscals( a, y ) bli_zzscals( a, y )

#endif
// end bli_scals.h

// begin bli_scaljs.h
#ifndef BLIS_SCALJS_H
#define BLIS_SCALJS_H

// scaljs
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - Same layout as scals, but dispatching to the scaljris helpers. In the
//   C99 complex branch a complex a is conjugated (conjf/conj) before the
//   multiplication, while a real a is used as is.
#define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccscaljs( a, y ) bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_scscaljs( a, y ) { (y) *= (a); }
#define bli_dcscaljs( a, y ) { (y) *= (a); }
#define bli_ccscaljs( a, y ) { (y) *= conjf(a); }
#define bli_zcscaljs( a, y ) { (y) *= conj (a); }
#define bli_szscaljs( a, y ) { (y) *= (a); }
#define bli_dzscaljs( a, y ) { (y) *= (a); }
#define bli_czscaljs( a, y ) { (y) *= conjf(a); }
#define bli_zzscaljs( a, y ) { (y) *= conj (a); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Homogeneous shorthands (a and y share one type).
#define bli_sscaljs( a, y ) bli_ssscaljs( a, y )
#define bli_dscaljs( a, y ) bli_ddscaljs( a, y )
#define bli_cscaljs( a, y ) bli_ccscaljs( a, y )
#define bli_zscaljs( a, y ) bli_zzscaljs( a, y )

#endif
// end bli_scaljs.h

// begin bli_scalcjs.h
#ifndef BLIS_SCALCJS_H
#define BLIS_SCALCJS_H

// scalcjs
// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Variants with a real y pass conjx plus the real/imag accessors of x and y
// to bli_sscalcjris or bli_dscalcjris.
#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Variants with a complex y. When BLIS_ENABLE_C99_COMPLEX is not defined
// they use the component-wise helpers. Otherwise y is multiplied in place:
// a complex x is conjugated first when bli_is_conj( conjx ) holds, and for a
// real x the conjx argument is not used.
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_scscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }
#define bli_szscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Homogeneous shorthands (x and y share one type).
#define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y )
#define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y )
#define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y )
#define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y )

#endif
// end bli_scalcjs.h

// begin bli_scal2s.h
#ifndef BLIS_SCAL2S_H
#define BLIS_SCAL2S_H

// scal2s
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// Helper selection as written in the tables below (the helpers themselves
// are defined elsewhere):
// - real y:    bli_roscal2ris when both a and x are complex, otherwise
//              bli_rxscal2ris.
// - complex y: bli_rxscal2ris (a, x real), bli_rcscal2ris (a complex, x
//              real), bli_crscal2ris (a real, x complex), bli_cxscal2ris
//              (a, x complex).
// When BLIS_ENABLE_C99_COMPLEX is defined, the complex-y variants instead
// expand to y = a * x.

// -- (axy) = (??s) ------------------------------------------------------------
#define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------
#define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------
#define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sczscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------
#define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Homogeneous shorthands (a, x and y share one type).
#define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y )
#define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y )
#define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y )
#define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y )

#endif
// end bli_scal2s.h

// begin bli_scal2js.h
#ifndef BLIS_SCAL2JS_H
#define BLIS_SCAL2JS_H

// scal2js
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define 
bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) #define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) 
#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_dzcscal2js( a, x, y ) { (y) = (a) * 
conj(x); } #define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y ) #define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y ) #define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y ) #define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y ) #endif // end bli_scal2js.h // begin bli_set0s.h #ifndef BLIS_SET0S_H #define BLIS_SET0S_H #define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) ) #define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) ) #define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) ) #define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) ) #endif // end bli_set0s.h // begin bli_set1s.h #ifndef BLIS_SET1S_H #define BLIS_SET1S_H #define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) ) #define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) ) #define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) ) #define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) ) 
#endif // end bli_set1s.h // begin bli_seti0s.h #ifndef BLIS_SETI0S_H #define BLIS_SETI0S_H #define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) ) #define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) ) #define bli_cseti0s( a ) bli_csetis( 0.0F, (a) ) #define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) ) #endif // end bli_seti0s.h // begin bli_sqrt2s.h #ifndef BLIS_SQRT2S_H #define BLIS_SQRT2S_H // sqrt2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) ) #define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) ) #define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) ) #define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) ) #define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) ) #define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) ) #define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) ) #define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) ) #define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), 
bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; } #define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; } #define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); } #define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); } #define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; } #define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; } #define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); } #define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); } #define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; } #define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; } #define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; } #define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; } #define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; } #define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; } #define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; } #define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a ) #define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a ) #define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a ) #define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a ) #endif // end bli_sqrt2s.h // begin bli_subs.h #ifndef BLIS_SUBS_H #define BLIS_SUBS_H // subs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. 
#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) { (y) -= (a); } #define bli_dcsubs( a, y ) { (y) -= (a); } #define bli_ccsubs( a, y ) { (y) -= (a); } #define bli_zcsubs( a, y ) { (y) -= (a); } #define bli_szsubs( a, y ) { (y) -= (a); } #define bli_dzsubs( a, y ) { (y) -= (a); } #define bli_czsubs( a, y ) { (y) -= (a); } #define bli_zzsubs( a, y ) { (y) -= (a); } #endif // 
BLIS_ENABLE_C99_COMPLEX #define bli_ssubs( a, y ) bli_sssubs( a, y ) #define bli_dsubs( a, y ) bli_ddsubs( a, y ) #define bli_csubs( a, y ) bli_ccsubs( a, y ) #define bli_zsubs( a, y ) bli_zzsubs( a, y ) #endif // end bli_subs.h // begin bli_subjs.h #ifndef BLIS_SUBJS_H #define BLIS_SUBJS_H // subjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubjs( a, y ) bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), 
bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) { (y) -= (a); } #define bli_dcsubjs( a, y ) { (y) -= (a); } #define bli_ccsubjs( a, y ) { (y) -= conjf(a); } #define bli_zcsubjs( a, y ) { (y) -= conj (a); } #define bli_szsubjs( a, y ) { (y) -= (a); } #define bli_dzsubjs( a, y ) { (y) -= (a); } #define bli_czsubjs( a, y ) { (y) -= conjf(a); } #define bli_zzsubjs( a, y ) { (y) -= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssubjs( a, y ) bli_sssubjs( a, y ) #define bli_dsubjs( a, y ) bli_ddsubjs( a, y ) #define bli_csubjs( a, y ) bli_ccsubjs( a, y ) #define bli_zsubjs( a, y ) bli_zzsubjs( a, y ) #endif // end bli_subjs.h // begin bli_swaps.h #ifndef BLIS_SWAPS_H #define BLIS_SWAPS_H // swaps // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_ssswaps( x, y ) \ { \ float w; \ bli_sscopys( (y), (w) ); \ bli_sscopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dsswaps( x, y ) \ { \ double w; \ bli_sdcopys( (y), (w) ); \ bli_dscopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_csswaps( x, y ) \ { \ scomplex w; \ bli_sccopys( (y), (w) ); \ bli_cscopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zsswaps( x, y ) \ { \ dcomplex w; \ bli_szcopys( (y), (w) ); \ bli_zscopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sdswaps( x, y ) \ { \ float w; \ bli_dscopys( (y), (w) ); \ bli_sdcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_ddswaps( x, y ) \ { \ double w; \ bli_ddcopys( (y), (w) ); \ bli_ddcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_cdswaps( x, y ) \ { \ scomplex w; \ bli_dccopys( (y), (w) ); \ bli_cdcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zdswaps( x, y ) \ { \ dcomplex w; \ bli_dzcopys( (y), (w) ); \ bli_zdcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_scswaps( x, y ) \ { \ float w; \ bli_cscopys( (y), (w) ); \ bli_sccopys( (x), (y) ); \ bli_sscopys( (w), 
(x) ); \ } #define bli_dcswaps( x, y ) \ { \ double w; \ bli_cdcopys( (y), (w) ); \ bli_dccopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_ccswaps( x, y ) \ { \ scomplex w; \ bli_cccopys( (y), (w) ); \ bli_cccopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zcswaps( x, y ) \ { \ dcomplex w; \ bli_czcopys( (y), (w) ); \ bli_zccopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_szswaps( x, y ) \ { \ float w; \ bli_zscopys( (y), (w) ); \ bli_szcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dzswaps( x, y ) \ { \ double w; \ bli_zdcopys( (y), (w) ); \ bli_dzcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_czswaps( x, y ) \ { \ scomplex w; \ bli_zccopys( (y), (w) ); \ bli_czcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zzswaps( x, y ) \ { \ dcomplex w; \ bli_zzcopys( (y), (w) ); \ bli_zzcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sswaps( x, y ) bli_ssswaps( x, y ) #define bli_dswaps( x, y ) bli_ddswaps( x, y ) #define bli_cswaps( x, y ) bli_ccswaps( x, y ) #define bli_zswaps( x, y ) bli_zzswaps( x, y ) #endif // end bli_swaps.h // begin bli_xpbys.h #ifndef BLIS_XPBYS_H #define BLIS_XPBYS_H // xpbys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), 
bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) 
// Last three (??d) cases: y is accessed through the d (double) accessors.
#define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )

// Cases where y is complex (c or z).
//
// Without BLIS_ENABLE_C99_COMPLEX, each macro forwards the real/imaginary
// accessors of x, b and y to one of three helpers defined outside this
// section, chosen by the operand types encoded in the macro name:
//   - bli_rxxpbyris: x and b are both real-typed (s or d)
//   - bli_crxpbyris: x is complex-typed (c or z), b is real-typed
//   - bli_cxxpbyris: b is complex-typed (any x)
// With BLIS_ENABLE_C99_COMPLEX, every case is the single expression
// y = x + b * y.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zccxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zszxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char shorthands for the cases where x, b and y share one type.
#define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y )
#define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y )
#define bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y )
#define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y )

#endif
// end bli_xpbys.h
// begin bli_xpbyjs.h

#ifndef BLIS_XPBYJS_H
#define BLIS_XPBYJS_H

// xpbyjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; 
++ii ) bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); 
} else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( 
dim_t jj = 0; jj < n; ++jj ) bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } 
else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( 
dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end bli_copys_mxn.h // begin bli_scal2s_mxn.h #ifndef BLIS_SCAL2S_MXN_H #define BLIS_SCAL2S_MXN_H // scal2s_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \ ctype* restrict y, const inc_t rs_y, const inc_t cs_y \ ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( scal2s_mxn ) #endif // end bli_scal2s_mxn.h // begin bli_xpbys_mxn.h #ifndef BLIS_XPBYS_MXN_H #define BLIS_XPBYS_MXN_H // xpbys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (?ss) ------------------------------------------------------------ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?dd) ------------------------------------------------------------ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?cc) ------------------------------------------------------------ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?zz) ------------------------------------------------------------ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } #endif // end bli_xpbys_mxn.h // begin bli_xpbys_mxn_uplo.h #ifndef BLIS_XPBYS_MXN_UPLO_H #define BLIS_XPBYS_MXN_UPLO_H // xpbys_mxn_u #define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 
0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } // xpbys_mxn_l #define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } 
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_l( diagoff, m, n, 
x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #endif // end bli_xpbys_mxn_uplo.h // -- "broadcast B" scalar macros -- // begin bli_bcastbbs_mxn.h #ifndef BLIS_BCASTBBS_MXN_H #define BLIS_BCASTBBS_MXN_H // bcastbbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = ldy; \ const dim_t ds_y = 1; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yi = y + i*incy; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yij = yi + j*ldy; \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( bcastbbs_mxn ) #endif // end bli_bcastbbs_mxn.h // begin bli_scal2bbs_mxn.h #ifndef BLIS_SCAL2BBS_MXN_H #define BLIS_SCAL2BBS_MXN_H // scal2bbs_mxn #undef GENTFUNCRO #define GENTFUNCRO( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( 
*yij, *yijd ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } \ } INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ const inc_t incx2 = 2 * incx; \ const inc_t ldx2 = 2 * ldx; \ \ const inc_t incy2 = 2 * incy; \ const inc_t ldy2 = 2 * ldy; \ \ ctype_r* restrict alpha_r = ( ctype_r* )alpha; \ ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \ ctype_r* restrict chi_r = ( ctype_r* )x; \ ctype_r* restrict chi_i = ( ctype_r* )x + 1; \ ctype_r* restrict psi_r = ( ctype_r* )y; \ ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ 
PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ } INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn ) #endif // end bli_scal2bbs_mxn.h // begin bli_set0bbs_mxn.h #ifndef BLIS_SET0BBS_MXN_H #define BLIS_SET0BBS_MXN_H // set0bbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yij = yj + i*incy; \ \ for ( dim_t p = 0; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,set0s)( *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( set0bbs_mxn ) #endif // end bli_set0bbs_mxn.h // -- 1m-specific scalar macros -- // 1e // begin bli_copy1es.h #ifndef BLIS_COPY1ES_H #define BLIS_COPY1ES_H // copy1es // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Parameters shared by every variant below:
//   a   - source scalar (its type is the first char of the macro name).
//   bri - first destination scalar (type given by the second char).
//   bir - second destination scalar (same type as bri).
//
// Only the variants whose source AND destination are both complex (cc, zc,
// cz, zz) do any work; every other combination expands to an empty block.

// -- destination type s: no-ops --

#define bli_sscopy1es( a, bri, bir ) {}
#define bli_dscopy1es( a, bri, bir ) {}
#define bli_cscopy1es( a, bri, bir ) {}
#define bli_zscopy1es( a, bri, bir ) {}

// -- destination type d: no-ops --

#define bli_sdcopy1es( a, bri, bir ) {}
#define bli_ddcopy1es( a, bri, bir ) {}
#define bli_cdcopy1es( a, bri, bir ) {}
#define bli_zdcopy1es( a, bri, bir ) {}

// -- destination type c --
// The first copyris call passes ( real(a), imag(a) ) as the source pair for
// bri; the second passes ( -imag(a), real(a) ) as the source pair for bir.
// Assuming copyris assigns its first two arguments to its last two (confirm
// against its definition), the result is bri = a and bir = i * a.

#define bli_sccopy1es( a, bri, bir ) {}
#define bli_dccopy1es( a, bri, bir ) {}
#define bli_cccopy1es( a, bri, bir ) \
{ \
    bli_cccopyris(  bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopy1es( a, bri, bir ) \
{ \
    bli_zccopyris(  bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

// -- destination type z --
// Same argument pattern as the c-destination variants above, using the
// z accessors for the destinations.

#define bli_szcopy1es( a, bri, bir ) {}
#define bli_dzcopy1es( a, bri, bir ) {}
#define bli_czcopy1es( a, bri, bir ) \
{ \
    bli_czcopyris(  bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopy1es( a, bri, bir ) \
{ \
    bli_zzcopyris(  bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-type shorthands for the homogeneous complex variants.

#define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir )
#define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir )

#endif
// end bli_copy1es.h
// begin bli_copyj1es.h


#ifndef BLIS_COPYJ1ES_H
#define BLIS_COPYJ1ES_H

// copyj1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Parameters shared by every copyj1es variant below:
//   a   - source scalar (its type is the first char of the macro name).
//   bri - first destination scalar (type given by the second char).
//   bir - second destination scalar (same type as bri).
//
// Only the variants whose source AND destination are both complex (cc, zc,
// cz, zz) do any work; every other combination expands to an empty block.

// -- destination type s: no-ops --

#define bli_sscopyj1es( a, bri, bir ) {}
#define bli_dscopyj1es( a, bri, bir ) {}
#define bli_cscopyj1es( a, bri, bir ) {}
#define bli_zscopyj1es( a, bri, bir ) {}

// -- destination type d: no-ops --

#define bli_sdcopyj1es( a, bri, bir ) {}
#define bli_ddcopyj1es( a, bri, bir ) {}
#define bli_cdcopyj1es( a, bri, bir ) {}
#define bli_zdcopyj1es( a, bri, bir ) {}

// -- destination type c --
// The first copyris call passes ( real(a), -imag(a) ), i.e. the components
// of conj(a), as the source pair for bri; the second passes
// ( imag(a), real(a) ), i.e. the components of i * conj(a), as the source
// pair for bir. (That copyris assigns source pair to destination pair is
// assumed from its name -- confirm against its definition.)

#define bli_sccopyj1es( a, bri, bir ) {}
#define bli_dccopyj1es( a, bri, bir ) {}
#define bli_cccopyj1es( a, bri, bir ) \
{ \
    bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_cccopyris( bli_cimag(a),  bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopyj1es( a, bri, bir ) \
{ \
    bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_zccopyris( bli_zimag(a),  bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

// -- destination type z --
// Same argument pattern as the c-destination variants above, using the
// z accessors for the destinations.

#define bli_szcopyj1es( a, bri, bir ) {}
#define bli_dzcopyj1es( a, bri, bir ) {}
#define bli_czcopyj1es( a, bri, bir ) \
{ \
    bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_czcopyris( bli_cimag(a),  bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopyj1es( a, bri, bir ) \
{ \
    bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_zzcopyris( bli_zimag(a),  bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-type shorthands for the homogeneous complex variants.

#define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir )
#define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir )

#endif
// end bli_copyj1es.h
// begin bli_invert1es.h


#ifndef BLIS_INVERT1ES_H
#define BLIS_INVERT1ES_H

// invert1es
//
// Parameters:
//   bri - scalar handed, component-wise, to invertris (presumably inverted
//         in place -- confirm against invertris).
//   bir - companion scalar refreshed from the updated bri.
// Note the argument order in the copyris call: the source pair is
// ( real(bri), -imag(bri) ) and the destination pair is
// ( imag(bir), real(bir) ), i.e. the destination components are swapped.
// Assuming copyris assigns pairwise, this leaves real(bir) = -imag(bri) and
// imag(bir) = real(bri).

#define bli_cinvert1es( bri, bir ) \
{ \
    bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \
    bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \
}

#define bli_zinvert1es( bri, bir ) \
{ \
    bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \
    bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \
}

#endif
// end bli_invert1es.h
// begin bli_scal1es.h


#ifndef BLIS_SCAL1ES_H
#define BLIS_SCAL1ES_H

// scal1es
//
// Parameters:
//   a   - scalar whose real/imag components are passed to scalris.
//   yri - scalar updated by scalris (presumably yri := a * yri -- confirm
//         against scalris).
//   yir - companion scalar refreshed from the updated yri: the copyris call
//         passes ( -imag(yri), real(yri) ) as the source pair and
//         ( real(yir), imag(yir) ) as the destination pair.

#define bli_cscal1es( a, yri, yir ) \
{ \
    bli_cscalris(  bli_creal(a),   bli_cimag(a),   bli_creal(yri), bli_cimag(yri) ); \
    bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \
}

#define bli_zscal1es( a, yri, yir ) \
{ \
    bli_zscalris(  bli_zreal(a),   bli_zimag(a),   bli_zreal(yri), bli_zimag(yri) ); \
    bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \
}

#endif
// end bli_scal1es.h
// begin bli_scal21es.h


#ifndef BLIS_SCAL21ES_H
#define BLIS_SCAL21ES_H

// scal21es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
//
// Parameters shared by every variant below:
//   a   - scaling scalar.
//   x   - source scalar.
//   yri - first destination scalar.
//   yir - second destination scalar.
//
// Every non-empty variant makes two bli_cxscal2ris calls with the same
// ( real(a), imag(a) ) pair: the first passes ( real(x), imag(x) ) and
// writes into yri; the second passes ( -imag(x), real(x) ), i.e. the
// components of i * x, and writes into yir. (What bli_cxscal2ris computes is
// not visible here; presumably y := a * x -- confirm against its definition.)
//
// Variants expand to an empty block when the destination is real (??s, ??d)
// or when both a and x are real (ss?, sd?, ds?, dd?).

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal21es( a, x, yri, yir ) {}
#define bli_sdsscal21es( a, x, yri, yir ) {}
#define bli_scsscal21es( a, x, yri, yir ) {}
#define bli_szsscal21es( a, x, yri, yir ) {}

#define bli_dssscal21es( a, x, yri, yir ) {}
#define bli_ddsscal21es( a, x, yri, yir ) {}
#define bli_dcsscal21es( a, x, yri, yir ) {}
#define bli_dzsscal21es( a, x, yri, yir ) {}

#define bli_cssscal21es( a, x, yri, yir ) {}
#define bli_cdsscal21es( a, x, yri, yir ) {}
#define bli_ccsscal21es( a, x, yri, yir ) {}
#define bli_czsscal21es( a, x, yri, yir ) {}

#define bli_zssscal21es( a, x, yri, yir ) {}
#define bli_zdsscal21es( a, x, yri, yir ) {}
#define bli_zcsscal21es( a, x, yri, yir ) {}
#define bli_zzsscal21es( a, x, yri, yir ) {}

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal21es( a, x, yri, yir ) {}
#define bli_sddscal21es( a, x, yri, yir ) {}
#define bli_scdscal21es( a, x, yri, yir ) {}
#define bli_szdscal21es( a, x, yri, yir ) {}

#define bli_dsdscal21es( a, x, yri, yir ) {}
#define bli_dddscal21es( a, x, yri, yir ) {}
#define bli_dcdscal21es( a, x, yri, yir ) {}
#define bli_dzdscal21es( a, x, yri, yir ) {}

#define bli_csdscal21es( a, x, yri, yir ) {}
#define bli_cddscal21es( a, x, yri, yir ) {}
#define bli_ccdscal21es( a, x, yri, yir ) {}
#define bli_czdscal21es( a, x, yri, yir ) {}

#define bli_zsdscal21es( a, x, yri, yir ) {}
#define bli_zddscal21es( a, x, yri, yir ) {}
#define bli_zcdscal21es( a, x, yri, yir ) {}
#define bli_zzdscal21es( a, x, yri, yir ) {}

// -- (axy) = (??c) ------------------------------------------------------------

// NOTE(review): the destinations in this section are scomplex, yet their
// components are accessed with bli_zreal/bli_zimag rather than
// bli_creal/bli_cimag. Presumably the two accessor families are
// interchangeable for this purpose -- confirm against their definitions.

#define bli_sscscal21es( a, x, yri, yir ) {}
#define bli_sdcscal21es( a, x, yri, yir ) {}
#define bli_sccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dscscal21es( a, x, yri, yir ) {}
#define bli_ddcscal21es( a, x, yri, yir ) {}
#define bli_dccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cscscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zscscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal21es( a, x, yri, yir ) {}
#define bli_sdzscal21es( a, x, yri, yir ) {}
#define bli_sczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dszscal21es( a, x, yri, yir ) {}
#define bli_ddzscal21es( a, x, yri, yir ) {}
#define bli_dczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cszscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zszscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// Single-type shorthands for the homogeneous complex variants.

#define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir )
#define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir )

#endif
// end bli_scal21es.h
// begin bli_scal2j1es.h


#ifndef BLIS_SCAL2J1ES_H
#define BLIS_SCAL2J1ES_H

// scal2j1es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) #define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) #endif // end bli_scal2j1es.h // 1r // begin bli_copy1rs.h #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copy1rs.h // begin bli_copyj1rs.h #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copyj1rs.h // begin bli_invert1rs.h #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif // end bli_invert1rs.h // begin bli_scal1rs.h #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), 
bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif // end bli_scal1rs.h // begin bli_scal21rs.h #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif // end bli_scal21rs.h // begin bli_scal2j1rs.h #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif // end bli_scal2j1rs.h // 1m (1e or 1r) // begin bli_invert1ms_mxn_diag.h #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t 
i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_invert1ms_mxn_diag.h // begin bli_scal1ms_mxn.h #ifndef BLIS_SCAL1MS_MXN_H #define BLIS_SCAL1MS_MXN_H // scal1ms_mxn #define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; 
\ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #endif // end bli_scal1ms_mxn.h // begin bli_scal21ms_mxn.h #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } 
else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), 
*(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif // end bli_scal21ms_mxn.h // begin bli_scal21ms_mxn_diag.h #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_scal21ms_mxn_diag.h // begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING //thread communicators may be implementation dependent #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_single.h // begin bli_thrcomm_openmp.h #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H // Define thrcomm_t for situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #include // skipped // Define thrcomm_t for tree barriers and non-tree barriers. #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. 
//volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif // end bli_thrcomm_openmp.h // begin bli_thrcomm_pthreads.h #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H // Define thrcomm_t for situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS #ifdef BLIS_USE_PTHREAD_BARRIER struct thrcomm_s { void* sent_object; dim_t n_threads; bli_pthread_barrier_t barrier; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_pthreads.h // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. 
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ); void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ); void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ); void bli_thrcomm_cleanup( thrcomm_t* comm ); BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm ); BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm ); void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm ); #endif // end bli_thrcomm.h // Include thread info (thrinfo_t) object definitions and prototypes. // begin bli_thrinfo.h #ifndef BLIS_THRINFO_H #define BLIS_THRINFO_H // Thread info structure definition struct thrinfo_s { // The thread communicator for the other threads sharing the same work // at this level. thrcomm_t* ocomm; // Our thread id within the ocomm thread communicator. dim_t ocomm_id; // The number of distinct threads used to parallelize the loop. dim_t n_way; // What we're working on. dim_t work_id; // When freeing, should the communicators in this node be freed? Usually, // this is field is true, but when nodes are created that share the same // communicators as other nodes (such as with packm nodes), this is set // to false. bool free_comm; // The bszid_t to help identify the node. This is mostly only useful when // debugging or tracing the allocation and release of thrinfo_t nodes. bszid_t bszid; struct thrinfo_s* sub_prenode; struct thrinfo_s* sub_node; }; typedef struct thrinfo_s thrinfo_t; // // thrinfo_t functions // NOTE: The naming of these should be made consistent at some point. // (ie: bli_thrinfo_ vs. 
bli_thread_) // // thrinfo_t query (field only) BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) { return (t->ocomm)->n_threads; } BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) { return t->ocomm_id; } BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) { return t->n_way; } BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t ) { return t->work_id; } BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) { return t->ocomm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) { return t->free_comm; } BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) { return t->bszid; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) { return t->sub_node; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) { return t->ocomm_id == 0; } // thrinfo_t modification BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) { t->ocomm = ocomm; } BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) { t->ocomm_id = ocomm_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) { t->n_way = n_way; } BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t ) { t->work_id = work_id; } BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) { t->free_comm = free_comm; } BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) { t->bszid = bszid; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) { t->sub_node = sub_node; } BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) { t->sub_prenode = sub_prenode; } // other thrinfo_t-related functions BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } // // Prototypes 
for level-3 thrinfo functions not specific to any operation. // thrinfo_t* bli_thrinfo_create ( rntm_t* rntm, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node ); void bli_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node ); void bli_thrinfo_init_single ( thrinfo_t* thread ); void bli_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_thrinfo_grow ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); thrinfo_t* bli_thrinfo_rgrow ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_cur, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_create_for_cntl ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_chl, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_rgrow_prenode ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_cur, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_create_for_cntl_prenode ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_chl, thrinfo_t* thread_par ); // ----------------------------------------------------------------------------- #if 0 void bli_thrinfo_grow_tree ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); void bli_thrinfo_grow_tree_ic ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); #endif #endif // end bli_thrinfo.h // begin bli_thrinfo_sup.h #ifndef BLIS_THRINFO_SUP_H #define BLIS_THRINFO_SUP_H // // Prototypes for level-3 thrinfo sup functions. // void bli_thrinfo_sup_grow ( rntm_t* rntm, bszid_t* bszid_par, thrinfo_t* thread ); thrinfo_t* bli_thrinfo_sup_rgrow ( rntm_t* rntm, bszid_t* bszid_par, bszid_t* bszid_cur, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_sup_create_for_cntl ( rntm_t* rntm, bszid_t* bszid_par, bszid_t* bszid_chl, thrinfo_t* thread_par ); #endif // end bli_thrinfo_sup.h // Include some operation-specific thrinfo_t prototypes. 
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
void bli_l3_thread_decorator ( l3int_t func, opid_t family, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // Include definitions specific to the method of multithreading for the // conventional code path. // begin bli_l3_decor_single.h #ifndef BLIS_L3_DECOR_SINGLE_H #define BLIS_L3_DECOR_SINGLE_H // Definitions specific to situations when multithreading is disabled. #ifndef BLIS_ENABLE_MULTITHREADING #endif #endif // end bli_l3_decor_single.h // begin bli_l3_decor_openmp.h #ifndef BLIS_L3_DECOR_OPENMP_H #define BLIS_L3_DECOR_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP void bli_l3_thread_decorator_thread_check ( dim_t n_threads, dim_t tid, thrcomm_t* gl_comm, rntm_t* rntm ); #endif #endif // end bli_l3_decor_openmp.h // begin bli_l3_decor_pthreads.h #ifndef BLIS_L3_DECOR_PTHREADS_H #define BLIS_L3_DECOR_PTHREADS_H // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. void* bli_l3_thread_entry( void* data_void ); #endif #endif // end bli_l3_decor_pthreads.h #endif // end bli_l3_decor.h // Include the level-3 thread decorator and related definitions and prototypes // for the sup code path. // begin bli_l3_sup_decor.h #ifndef BLIS_L3_SUP_DECOR_H #define BLIS_L3_SUP_DECOR_H // -- sup definitions ---------------------------------------------------------- // Level-3 sup internal function type. typedef err_t (*l3supint_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // Level-3 sup thread decorator prototype. err_t bli_l3_sup_thread_decorator ( l3supint_t func, opid_t family, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // Include definitions specific to the method of multithreading for the // sup code path. 
// begin bli_l3_sup_decor_single.h #ifndef BLIS_L3_SUP_DECOR_SINGLE_H #define BLIS_L3_SUP_DECOR_SINGLE_H // Definitions specific to situations when multithreading is disabled. #ifndef BLIS_ENABLE_MULTITHREADING #endif #endif // end bli_l3_sup_decor_single.h // begin bli_l3_sup_decor_openmp.h #ifndef BLIS_L3_SUP_DECOR_OPENMP_H #define BLIS_L3_SUP_DECOR_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #endif #endif // end bli_l3_sup_decor_openmp.h // begin bli_l3_sup_decor_pthreads.h #ifndef BLIS_L3_SUP_DECOR_PTHREADS_H #define BLIS_L3_SUP_DECOR_PTHREADS_H // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. void* bli_l3_sup_thread_entry( void* data_void ); #endif #endif // end bli_l3_sup_decor_pthreads.h #endif // end bli_l3_sup_decor.h // Initialization-related prototypes. void bli_thread_init( void ); void bli_thread_finalize( void ); // Thread range-related prototypes. 
BLIS_EXPORT_BLIS void bli_thread_range_sub ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end ); #undef GENPROT #define GENPROT( opname ) \ \ siz_t PASTEMAC0( opname ) \ ( \ dir_t direct, \ thrinfo_t* thr, \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntl_t* cntl, \ cntx_t* cntx, \ dim_t* start, \ dim_t* end \ ); GENPROT( thread_range_mdim ) GENPROT( thread_range_ndim ) #undef GENPROT #define GENPROT( opname ) \ \ siz_t PASTEMAC0( opname ) \ ( \ thrinfo_t* thr, \ obj_t* a, \ blksz_t* bmult, \ dim_t* start, \ dim_t* end \ ); GENPROT( thread_range_l2r ) GENPROT( thread_range_r2l ) GENPROT( thread_range_t2b ) GENPROT( thread_range_b2t ) GENPROT( thread_range_weighted_l2r ) GENPROT( thread_range_weighted_r2l ) GENPROT( thread_range_weighted_t2b ) GENPROT( thread_range_weighted_b2t ) dim_t bli_thread_range_width_l ( doff_t diagoff_j, dim_t m, dim_t n_j, dim_t j, dim_t n_way, dim_t bf, dim_t bf_left, double area_per_thr, bool handle_edge_low ); siz_t bli_find_area_trap_l ( dim_t m, dim_t n, doff_t diagoff ); siz_t bli_thread_range_weighted_sub ( thrinfo_t* restrict thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* restrict j_start_thr, dim_t* restrict j_end_thr ); // ----------------------------------------------------------------------------- // Factorization and partitioning prototypes typedef struct { dim_t n; dim_t sqrt_n; dim_t f; } bli_prime_factors_t; void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors); dim_t bli_next_prime_factor(bli_prime_factors_t* factors); bool bli_is_prime( dim_t n ); void bli_thread_partition_2x2 ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_slow ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_fast ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); // 
----------------------------------------------------------------------------- dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); // ----------------------------------------------------------------------------- BLIS_INLINE void bli_thread_range_jrir_rr ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; } BLIS_INLINE void bli_thread_range_jrir_sl ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use contiguous slab partitioning of jr/ir loops. bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); *inc = 1; } BLIS_INLINE void bli_thread_range_jrir ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Define a general-purpose version of bli_thread_range_jrir() whose // definition depends on whether slab or round-robin partitioning was // requested at configure-time. 
#ifdef BLIS_ENABLE_JRIR_SLAB bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); #else bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); #endif } #if 0 BLIS_INLINE void bli_thread_range_weighted_jrir ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { #ifdef BLIS_ENABLE_JRIR_SLAB // Use contiguous slab partitioning for jr/ir loops. bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, handle_edge_low, start, end ); *start = *start / bf; *inc = 1; if ( *end % bf ) *end = *end / bf + 1; else *end = *end / bf; #else // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; #endif } #endif #endif // end bli_thread.h // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Constant definitions -- // begin bli_extern_defs.h #ifndef BLIS_EXTERN_DEFS_H #define BLIS_EXTERN_DEFS_H BLIS_EXPORT_BLIS extern obj_t BLIS_TWO; BLIS_EXPORT_BLIS extern obj_t BLIS_ONE; //BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO; //BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO; BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif // end bli_extern_defs.h // -- BLIS architecture/kernel definitions -- // begin bli_l1v_ker_prot.h // // Define template prototypes for level-1v kernels. 
//
// Each macro below expands to one complete function prototype, including its
// terminating semicolon. The function name is produced by
// PASTEMAC(ch,opname) and every operand pointer uses the caller-supplied
// ctype.
//
// NOTE: Every definition ends at its closing ");". No definition carries a
// trailing line-continuation backslash, so a macro can never absorb the line
// that follows it.
//

#define ADDV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define AMAXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t* restrict cntx  \
     );

#define AXPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define AXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define COPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define DOTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );

#define DOTXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );

#define INVERTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );

#define SCALV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );

#define SCAL2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define SETV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );

#define SUBV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define SWAPV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define XPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// end bli_l1v_ker_prot.h
// begin bli_l1f_ker_prot.h

//
// Define template prototypes for level-1f kernels.
//
// Each template expands to a single prototype (semicolon included) for the
// function named by PASTEMAC(ch,opname); ctype selects the element type of
// every operand pointer.
//

// Operands: two scalars (alphax, alphay) and three strided vectors (x, y, z).
#define AXPY2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alphax, \
       ctype*  restrict alphay, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict z, inc_t incz, \
       cntx_t* restrict cntx  \
     );

// Operands: one scalar, one matrix-like operand a (inca, lda) and two
// strided vectors (x, y).
#define AXPYF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conja, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// Operands: one scalar, three strided vectors (x, y, z) and the rho pointer.
#define DOTAXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjxt, \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       ctype*  restrict z, inc_t incz, \
       cntx_t* restrict cntx  \
     );

// Operands: two scalars (alpha, beta), one matrix-like operand a (inca, lda)
// and four strided vectors (w, x, y, z).
#define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjat, \
       conj_t           conja, \
       conj_t           conjw, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict w, inc_t incw, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict z, inc_t incz, \
       cntx_t* restrict cntx  \
     );

// Operands: two scalars (alpha, beta), one matrix-like operand a (inca, lda)
// and two strided vectors (x, y).
#define DOTXF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjat, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// end bli_l1f_ker_prot.h
// begin bli_l1m_ker_prot.h

//
// Define template prototypes for level-1m kernels.
// // native packm kernels #define PACKM_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); // native unpackm kernels #define UNPACKM_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); // 1e/1r packm kernels #define PACKM_1ER_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); // end bli_l1m_ker_prot.h // begin bli_l3_ukr_prot.h // // Define template prototypes for level-3 micro-kernels. // #define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname) #define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype_out* restrict alpha, \ ctype_in* restrict a, \ ctype_in* restrict b, \ ctype_out* restrict beta, \ ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); #define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ ctype* restrict a11, \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); #define TRSM_UKR_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); // end bli_l3_ukr_prot.h // 
begin bli_l3_sup_ker_prot.h // // Define template prototypes for level-3 kernels on small/unpacked matrices. // #define GEMMSUP_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); // end bli_l3_sup_ker_prot.h // begin bli_arch_config_pre.h #ifndef BLIS_ARCH_CONFIG_PRE_H #define BLIS_ARCH_CONFIG_PRE_H // -- Naming-related kernel definitions ---------------------------------------- // The default suffix appended to reference kernels. #define BLIS_REF_SUFFIX _ref // A suffix used for labeling certain induced method aware functions. #define BLIS_IND_SUFFIX _ind // Add an underscore to the BLIS kernel set string, if it was defined. #ifdef BLIS_CNAME #define BLIS_CNAME_INFIX PASTECH(_,BLIS_CNAME) #endif // Combine the CNAME and _ref for convenience to the code that defines // reference kernels. //#define BLIS_CNAME_REF_SUFFIX PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX) // -- Prototype-generating macro definitions ----------------------------------- // Prototype-generating macro for bli_cntx_init_*() functions. 
#define CNTX_INIT_PROTS( archname ) \ \ void PASTEMAC(cntx_init_,archname) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \ ( \ ind_t method, \ cntx_t* cntx \ ); #endif // end bli_arch_config_pre.h // begin bli_arch_config.h #ifndef BLIS_ARCH_CONFIG_H #define BLIS_ARCH_CONFIG_H // // -- Context initialization prototypes ---------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_CONFIG_SKX CNTX_INIT_PROTS( skx ) #endif #ifdef BLIS_CONFIG_KNL CNTX_INIT_PROTS( knl ) #endif #ifdef BLIS_CONFIG_KNC CNTX_INIT_PROTS( knc ) #endif #ifdef BLIS_CONFIG_HASWELL CNTX_INIT_PROTS( haswell ) #endif #ifdef BLIS_CONFIG_SANDYBRIDGE CNTX_INIT_PROTS( sandybridge ) #endif #ifdef BLIS_CONFIG_PENRYN CNTX_INIT_PROTS( penryn ) #endif // -- AMD64 architectures -- #ifdef BLIS_CONFIG_ZEN3 CNTX_INIT_PROTS( zen3 ) #endif #ifdef BLIS_CONFIG_ZEN2 CNTX_INIT_PROTS( zen2 ) #endif #ifdef BLIS_CONFIG_ZEN CNTX_INIT_PROTS( zen ) #endif #ifdef BLIS_CONFIG_EXCAVATOR CNTX_INIT_PROTS( excavator ) #endif #ifdef BLIS_CONFIG_STEAMROLLER CNTX_INIT_PROTS( steamroller ) #endif #ifdef BLIS_CONFIG_PILEDRIVER CNTX_INIT_PROTS( piledriver ) #endif #ifdef BLIS_CONFIG_BULLDOZER CNTX_INIT_PROTS( bulldozer ) #endif // -- ARM architectures -- #ifdef BLIS_CONFIG_ARMSVE CNTX_INIT_PROTS( armsve ) #endif #ifdef BLIS_CONFIG_A64FX CNTX_INIT_PROTS( a64fx ) #endif #ifdef BLIS_CONFIG_FIRESTORM CNTX_INIT_PROTS( firestorm ) #endif #ifdef BLIS_CONFIG_THUNDERX2 CNTX_INIT_PROTS( thunderx2 ) #endif #ifdef BLIS_CONFIG_CORTEXA57 CNTX_INIT_PROTS( cortexa57 ) #endif #ifdef BLIS_CONFIG_CORTEXA53 CNTX_INIT_PROTS( cortexa53 ) #endif #ifdef BLIS_CONFIG_CORTEXA15 CNTX_INIT_PROTS( cortexa15 ) #endif #ifdef BLIS_CONFIG_CORTEXA9 CNTX_INIT_PROTS( cortexa9 ) #endif // -- IBM Power -- #ifdef BLIS_CONFIG_POWER10 CNTX_INIT_PROTS( power10 ) #endif #ifdef BLIS_CONFIG_POWER9 CNTX_INIT_PROTS( power9 ) #endif #ifdef 
BLIS_CONFIG_POWER7 CNTX_INIT_PROTS( power7 ) #endif // -- IBM BG/Q -- #ifdef BLIS_CONFIG_BGQ CNTX_INIT_PROTS( bgq ) #endif // -- Generic -- #ifdef BLIS_CONFIG_GENERIC CNTX_INIT_PROTS( generic ) #endif // // -- Architecture family-specific headers ------------------------------------- // // -- x86_64 families -- #ifdef BLIS_FAMILY_INTEL64 #include "bli_family_intel64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64 #include "bli_family_amd64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64_LEGACY #include "bli_family_amd64_legacy.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64 #include "bli_family_x86_64.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_SKX #include "bli_family_x86_64_no_skx.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN2 #include "bli_family_x86_64_no_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN3 // begin bli_family_x86_64_no_zen3.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_x86_64_no_zen3.h #endif // -- Intel64 architectures -- #ifdef BLIS_FAMILY_SKX // begin bli_family_skx.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- THREADING PARAMETERS ----------------------------------------------------- #define BLIS_THREAD_RATIO_M 3 #define BLIS_THREAD_RATIO_N 2 #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 4 // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 64 #define BLIS_SIMD_MAX_SIZE 64 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 //#include //#define BLIS_MALLOC_POOL malloc //#define BLIS_FREE_POOL free #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- // -- Cache and register blocksizes -- // // Constraints: // // (1) MC must be a multiple of: // (a) MR (for zero-padding purposes) // (b) NR (for zero-padding purposes when MR and NR are "swapped") // (2) NC must be a multiple of // (a) NR (for zero-padding purposes) // (b) MR (for zero-padding purposes when MR and NR are "swapped") // #define 
BLIS_DGEMM_UKERNEL bli_dgemm_opt_16x12_l2 #define BLIS_DEFAULT_MC_D 144 #define BLIS_DEFAULT_KC_D 336 #define BLIS_DEFAULT_NC_D 5760 #define BLIS_DEFAULT_MR_D 16 #define BLIS_DEFAULT_NR_D 12 #define BLIS_PACKDIM_MR_D 16 #define BLIS_PACKDIM_NR_D 12 // NOTE: If the micro-kernel, which is typically unrolled to a factor // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. //#define BLIS_DEFAULT_KR_S 1 //#define BLIS_DEFAULT_KR_D 1 //#define BLIS_DEFAULT_KR_C 1 //#define BLIS_DEFAULT_KR_Z 1 // -- Maximum cache blocksizes (for optimizing edge cases) -- // NOTE: These cache blocksize "extensions" have the same constraints as // the corresponding default blocksizes above. When these values are // larger than the default blocksizes, blocksizes used at edge cases are // enlarged if such an extension would encompass the remaining portion of // the matrix dimension. #define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) #define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) #define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0) #define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) #define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) #define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0) //#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4) //#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4) //#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4) //#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4) //#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4) //#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4) #endif //#endif // end bli_family_skx.h #endif #ifdef BLIS_FAMILY_KNL // begin bli_family_knl.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- THREADING PARAMETERS ----------------------------------------------------- #define BLIS_THREAD_RATIO_M 4 
#define BLIS_THREAD_RATIO_N 1 #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 // -- MEMORY ALLOCATION -------------------------------------------------------- //#define BLIS_TREE_BARRIER //#define BLIS_TREE_BARRIER_ARITY 4 #define BLIS_SIMD_ALIGN_SIZE 64 #define BLIS_SIMD_MAX_SIZE 64 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 //#define BLIS_MALLOC_INTL hbw_malloc //#define BLIS_FREE_INTL hbw_free #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_SGEMM_UKERNEL bli_sgemm_opt_30x16_knc #define BLIS_DEFAULT_MC_S 240 #define BLIS_DEFAULT_KC_S 240 #define BLIS_DEFAULT_NC_S 14400 #define BLIS_DEFAULT_MR_S 30 #define BLIS_DEFAULT_NR_S 16 #define BLIS_PACKDIM_MR_S 32 #define BLIS_PACKDIM_NR_S 16 #if 0 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8_knc #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 240 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DEFAULT_MR_D 30 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 32 #define BLIS_PACKDIM_NR_D 8 #elif 0 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8 #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 240 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DEFAULT_MR_D 30 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 32 #define BLIS_PACKDIM_NR_D 8 #define BLIS_DPACKM_8XK_KERNEL bli_dpackm_8xk_opt #define BLIS_DPACKM_30XK_KERNEL bli_dpackm_30xk_opt #else #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_24x8 #define BLIS_DEFAULT_MR_D 24 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 24 #define BLIS_PACKDIM_NR_D 8 #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 336 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DPACKM_8XK_KERNEL bli_dpackm_8xk_opt #define BLIS_DPACKM_24XK_KERNEL bli_dpackm_24xk_opt #endif #define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) 
#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) #define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0) #define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) #define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) #define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0) #endif //#endif // end bli_family_knl.h #endif #ifdef BLIS_FAMILY_KNC #include "bli_family_knc.h" // skipped #endif #ifdef BLIS_FAMILY_HASWELL // begin bli_family_haswell.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- // -- sgemm micro-kernel -- #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24 #define BLIS_DEFAULT_MC_S 256 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 24 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 6 #define BLIS_DEFAULT_NR_S 16 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 6 #endif // -- dgemm micro-kernel -- #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12 #define BLIS_DEFAULT_MC_D 152 #define BLIS_DEFAULT_KC_D 160 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 12 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 
#define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 6 #endif // -- cgemm micro-kernel -- #if 1 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 3 #define BLIS_DEFAULT_NR_C 8 #define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x3 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 3 #endif // -- zgemm micro-kernel -- #if 1 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 3 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x3 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 3 #endif #endif //#endif // end bli_family_haswell.h #endif #ifdef BLIS_FAMILY_SANDYBRIDGE // begin bli_family_sandybridge.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x4 #define BLIS_DEFAULT_MC_D 96 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 4 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define 
BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_sandybridge.h #endif #ifdef BLIS_FAMILY_PENRYN // begin bli_family_penryn.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x4 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MC_S 768 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x4 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 4 #define BLIS_DEFAULT_MC_D 384 #define BLIS_DEFAULT_KC_D 384 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_asm_4x4 #define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_asm_4x4 // -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPY2V_KERNEL bli_daxpy2v_int_var1 #define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_int_var1 #define BLIS_DAXPYF_KERNEL bli_daxpyf_int_var1 #define BLIS_DDOTXF_KERNEL bli_ddotxf_int_var1 #define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_int_var1 // -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 #define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 #endif //#endif // end bli_family_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_FAMILY_ZEN3 #include "bli_family_zen3.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN2 // begin bli_family_zen2.h // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops // to not be parallelized. #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 // Vanilla BLIS disables AMD's small matrix handling by default. 
#if 0 #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. #define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 #define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 #define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 #define BLIS_ENABLE_SMALL_MATRIX_ROME #define BLIS_SMALL_MATRIX_THRES_ROME 400 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50 // When running HPL with pure MPI without DGEMM threading (Single-threaded // BLIS), defining this macro as 1 yields better performance. #define AOCL_BLIS_MULTIINSTANCE 0 #endif // end bli_family_zen2.h #endif #ifdef BLIS_FAMILY_ZEN // begin bli_family_zen.h // By default, it is effective to parallelize the outer loops. 
// Setting these macros to 1 will force JR and IR inner loops // to not be parallelized. #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 #define BLIS_ENABLE_ZEN_BLOCK_SIZES // Vanilla BLIS disables AMD's small matrix handling by default. #if 0 #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. #define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 #define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 #define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 //This macro will enable BLIS DGEMM to choose block sizes for a single instance mode #define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 0 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES 250 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_NAPLES 90 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22 #endif #if 0 // Allow the sup implementation to combine some small edge case iterations in // the 2nd loop of the panel-block algorithm (MR) and/or the 2nd loop of the // block-panel algorithm (NR) with the last full iteration that precedes it. // NOTE: These cpp macros need to be explicitly set to an integer since they // are used at compile-time to create unconditional branches or dead code // regions. 
#define BLIS_ENABLE_SUP_MR_EXT 1 #define BLIS_ENABLE_SUP_NR_EXT 0 #endif // end bli_family_zen.h #endif #ifdef BLIS_FAMILY_EXCAVATOR // begin bli_family_excavator.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 3 #define BLIS_DEFAULT_MC_S 528 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_DEFAULT_MC_D 264 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_DEFAULT_MC_C 264 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #define BLIS_DEFAULT_MC_Z 100 #define BLIS_DEFAULT_KC_Z 320 #define BLIS_DEFAULT_NC_Z 8400 #endif //#endif // end bli_family_excavator.h #endif #ifdef BLIS_FAMILY_STEAMROLLER // begin bli_family_steamroller.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 //#endif // end bli_family_steamroller.h #endif #ifdef BLIS_FAMILY_PILEDRIVER // begin bli_family_piledriver.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MC_S 2016 #define BLIS_DEFAULT_KC_S 128 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DEFAULT_MR_S 16 #define 
BLIS_DEFAULT_NR_S 3 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MC_D 1008 #define BLIS_DEFAULT_KC_D 128 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MC_C 512 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MC_Z 400 #define BLIS_DEFAULT_KC_Z 160 #define BLIS_DEFAULT_NC_Z 8400 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #endif //#endif // end bli_family_piledriver.h #endif #ifdef BLIS_FAMILY_BULLDOZER // begin bli_family_bulldozer.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8_fma4 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x6_fma4 #define BLIS_DEFAULT_MC_D 1080 #define BLIS_DEFAULT_KC_D 120 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 6 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4_fma4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4_fma4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_bulldozer.h #endif // -- ARM families -- #ifdef BLIS_FAMILY_ARM64 #include "bli_family_arm64.h" // skipped #endif #ifdef BLIS_FAMILY_ARM32 #include "bli_family_arm32.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_FAMILY_ARMSVE #include "bli_family_armsve.h" // skipped #endif #ifdef BLIS_FAMILY_A64FX #include 
"bli_family_a64fx.h" // skipped #endif #ifdef BLIS_FAMILY_FIRESTORM #include "bli_family_firestorm.h" // skipped #endif #ifdef BLIS_FAMILY_THUNDERX2 #include "bli_family_thunderx2.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA57 #include "bli_family_cortexa57.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA53 #include "bli_family_cortexa53.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA15 #include "bli_family_cortexa15.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA9 #include "bli_family_cortexa9.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_FAMILY_POWER10 #include "bli_family_power10.h" // skipped #endif #ifdef BLIS_FAMILY_POWER9 #include "bli_family_power9.h" // skipped #endif #ifdef BLIS_FAMILY_POWER7 #include "bli_family_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_FAMILY_BGQ #include "bli_family_bgq.h" // skipped #endif // -- Generic -- #ifdef BLIS_FAMILY_GENERIC // begin bli_family_generic.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_generic.h #endif // // -- kernel set prototypes ---------------------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_KERNELS_SKX // begin bli_kernels_skx.h GEMM_UKR_PROT( float , s, gemm_skx_asm_32x12_l2 ) GEMM_UKR_PROT( float , s, gemm_skx_asm_12x32_l2 ) GEMM_UKR_PROT( double, d, gemm_skx_asm_16x12_l2 ) GEMM_UKR_PROT( double, d, gemm_skx_asm_16x14 ) // end bli_kernels_skx.h #endif #ifdef BLIS_KERNELS_KNL // begin bli_kernels_knl.h GEMM_UKR_PROT( float, s, gemm_knl_asm_24x16 ) GEMM_UKR_PROT( double, d, gemm_knl_asm_24x8 ) PACKM_KER_PROT( float, s, packm_knl_asm_24xk ) PACKM_KER_PROT( float, s, packm_knl_asm_16xk ) PACKM_KER_PROT( double, d, packm_knl_asm_24xk ) PACKM_KER_PROT( double, d, packm_knl_asm_8xk ) // unused: GEMM_UKR_PROT( double, d, gemm_knl_asm_12x16 ) GEMM_UKR_PROT( double, d, gemm_knl_asm_30x8 ) GEMM_UKR_PROT( double, d, gemm_knl_asm_8x24 ) PACKM_KER_PROT( double, d, packm_knl_asm_30xk ) // end bli_kernels_knl.h #endif #ifdef 
BLIS_KERNELS_KNC #include "bli_kernels_knc.h" // skipped #endif #ifdef BLIS_KERNELS_HASWELL // begin bli_kernels_haswell.h // -- level-1m ----------------------------------------------------------------- // packm (asm) PACKM_KER_PROT( float, s, packm_haswell_asm_6xk ) PACKM_KER_PROT( float, s, packm_haswell_asm_16xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_6xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_8xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_3xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_8xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_3xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_4xk ) // -- level-3 ------------------------------------------------------------------ // gemm (asm d6x8) GEMM_UKR_PROT( float, s, gemm_haswell_asm_6x16 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_6x8 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_3x8 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_3x4 ) // gemm (asm d8x6) GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // gemmtrsm_l (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_haswell_asm_6x8 ) // gemmtrsm_u (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 ) // gemm (asm d8x6) //GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) //GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) //GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) //GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // -- level-3 sup -------------------------------------------------------------- // -- single real -- // gemmsup_r GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( float, s, 
gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x2 ) 
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x1 ) 
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16n ) // -- double real -- // gemmsup_r GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( double, d, 
gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8n ) // gemmsup_rd GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( 
double, d, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8n ) // end bli_kernels_haswell.h #endif #ifdef BLIS_KERNELS_SANDYBRIDGE // begin bli_kernels_sandybridge.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_sandybridge_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_asm_4x4 ) // d8x4 (intrinsics) GEMM_UKR_PROT( float, s, gemm_sandybridge_int_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_int_4x4 ) // end bli_kernels_sandybridge.h #endif #ifdef BLIS_KERNELS_PENRYN // begin bli_kernels_penryn.h GEMM_UKR_PROT( float, s, gemm_penryn_asm_8x4 ) GEMM_UKR_PROT( double, d, gemm_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_l_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_u_penryn_asm_4x4 ) // end bli_kernels_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_KERNELS_ZEN2 // begin bli_kernels_zen2.h // -- level-1f -- AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) // end bli_kernels_zen2.h #endif #ifdef BLIS_KERNELS_ZEN // begin bli_kernels_zen.h // -- level-1m -- PACKM_KER_PROT(double, d, packm_8xk_gen_zen) PACKM_KER_PROT(double, d, packm_6xk_gen_zen) PACKM_KER_PROT(double, d, packm_8xk_nn_zen) PACKM_KER_PROT(double, d, 
packm_6xk_nn_zen) // -- level-1v -- // amaxv (intrinsics) AMAXV_KER_PROT( float, s, amaxv_zen_int ) AMAXV_KER_PROT( double, d, amaxv_zen_int ) // axpyv (intrinsics) AXPYV_KER_PROT( float, s, axpyv_zen_int ) AXPYV_KER_PROT( double, d, axpyv_zen_int ) // axpyv (intrinsics unrolled x10) AXPYV_KER_PROT( float, s, axpyv_zen_int10 ) AXPYV_KER_PROT( double, d, axpyv_zen_int10 ) // dotv (intrinsics) DOTV_KER_PROT( float, s, dotv_zen_int ) DOTV_KER_PROT( double, d, dotv_zen_int ) // dotv (intrinsics, unrolled x10) DOTV_KER_PROT( float, s, dotv_zen_int10 ) DOTV_KER_PROT( double, d, dotv_zen_int10 ) // dotxv (intrinsics) DOTXV_KER_PROT( float, s, dotxv_zen_int ) DOTXV_KER_PROT( double, d, dotxv_zen_int ) // scalv (intrinsics) SCALV_KER_PROT( float, s, scalv_zen_int ) SCALV_KER_PROT( double, d, scalv_zen_int ) // scalv (intrinsics unrolled x10) SCALV_KER_PROT( float, s, scalv_zen_int10 ) SCALV_KER_PROT( double, d, scalv_zen_int10 ) SCALV_KER_PROT( scomplex, c, scalv_zen_int10 ) // swapv (intrinsics) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // copyv (intrinsics) COPYV_KER_PROT( float, s, copyv_zen_int ) COPYV_KER_PROT( double, d, copyv_zen_int ) // SETV_KER_PROT(float, s, setv_zen_int) SETV_KER_PROT(double, d, setv_zen_int) // swapv (intrinsics; repeats the swapv_zen_int8 prototypes declared above) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // -- level-1f -- // axpyf (intrinsics) AXPYF_KER_PROT( float, s, axpyf_zen_int_8 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_8 ) AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_16x4 ) AXPYF_KER_PROT( scomplex, c, axpyf_zen_int_4 ) // dotxf (intrinsics) DOTXF_KER_PROT( float, s, dotxf_zen_int_8 ) DOTXF_KER_PROT( double, d, dotxf_zen_int_8 ) // -- level-3 sup -------------------------------------------------------------- // gemmsup_rv //GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16 ) GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_zen_asm_5x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_4x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_1x1 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16n ) GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_zen_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x8m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16n) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x2 ) 
// gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x1 ) // end bli_kernels_zen.h #endif //#ifdef BLIS_KERNELS_EXCAVATOR //#include "bli_kernels_excavator.h" //#endif //#ifdef BLIS_KERNELS_STEAMROLLER //#include "bli_kernels_steamroller.h" //#endif #ifdef BLIS_KERNELS_PILEDRIVER // begin bli_kernels_piledriver.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_piledriver_asm_16x3 ) GEMM_UKR_PROT( double, d, gemm_piledriver_asm_8x3 ) GEMM_UKR_PROT( scomplex, c, gemm_piledriver_asm_4x2 ) GEMM_UKR_PROT( dcomplex, z, gemm_piledriver_asm_2x2 ) // end bli_kernels_piledriver.h #endif #ifdef BLIS_KERNELS_BULLDOZER // begin bli_kernels_bulldozer.h GEMM_UKR_PROT( float, s, gemm_bulldozer_asm_8x8_fma4 ) GEMM_UKR_PROT( double, d, gemm_bulldozer_asm_4x6_fma4 ) GEMM_UKR_PROT( scomplex, c, gemm_bulldozer_asm_8x4_fma4 ) GEMM_UKR_PROT( dcomplex, z, gemm_bulldozer_asm_4x4_fma4 ) // end bli_kernels_bulldozer.h #endif // -- ARM architectures -- #ifdef BLIS_KERNELS_ARMSVE #include "bli_kernels_armsve.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV8A #include "bli_kernels_armv8a.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV7A #include "bli_kernels_armv7a.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_KERNELS_POWER10 #include "bli_kernels_power10.h" // skipped #endif #ifdef BLIS_KERNELS_POWER9 #include "bli_kernels_power9.h" // skipped #endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef 
BLIS_KERNELS_BGQ #include "bli_kernels_bgq.h" // skipped #endif #endif // end bli_arch_config.h // begin bli_kernel_macro_defs.h #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H // -- Define default threading parameters -------------------------------------- // -- Conventional (large code path) values -- // These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n // dimensions for the purposes of factorizing the total number of threads into // ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these // macros are used. #ifndef BLIS_THREAD_RATIO_M #define BLIS_THREAD_RATIO_M 1 #endif #ifndef BLIS_THREAD_RATIO_N #define BLIS_THREAD_RATIO_N 1 #endif // These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of // parallelism allowed when performing automatic factorization. See bli_rntm.c // to see how these macros are used. #ifndef BLIS_THREAD_MAX_IR #define BLIS_THREAD_MAX_IR 1 #endif #ifndef BLIS_THREAD_MAX_JR #define BLIS_THREAD_MAX_JR 4 #endif #if 0 // -- Skinny/small possibly-unpacked (sup code path) values -- #ifndef BLIS_THREAD_SUP_RATIO_M #define BLIS_THREAD_SUP_RATIO_M 1 #endif #ifndef BLIS_THREAD_SUP_RATIO_N #define BLIS_THREAD_SUP_RATIO_N 2 #endif #ifndef BLIS_THREAD_SUP_MAX_IR #define BLIS_THREAD_SUP_MAX_IR 1 #endif #ifndef BLIS_THREAD_SUP_MAX_JR #define BLIS_THREAD_SUP_MAX_JR 8 #endif #endif // -- Memory allocation -------------------------------------------------------- // hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with // libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND // was explicitly defined. #ifdef BLIS_DISABLE_MEMKIND #undef BLIS_ENABLE_MEMKIND #endif #ifdef BLIS_ENABLE_MEMKIND #include // skipped #endif // Memory allocation functions. These macros define the three types of // malloc()-style functions, and their free() counterparts: one for each // type of memory to be allocated. 
// NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING // THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc() // and free(): // // void* malloc( size_t size ); // void free( void* p ); // // This allocation function is called to allocate memory for blocks within // BLIS's internal memory pools. #ifndef BLIS_MALLOC_POOL // If use of libmemkind was enabled at configure-time, the default // memory allocation function for memory pools should be hbw_malloc() // instead of malloc(). #ifdef BLIS_ENABLE_MEMKIND #define BLIS_MALLOC_POOL hbw_malloc #else #define BLIS_MALLOC_POOL malloc #endif #endif #ifndef BLIS_FREE_POOL // If use of libmemkind was enabled at configure-time, the default // memory deallocation function for memory pools should be hbw_free() // instead of free(). #ifdef BLIS_ENABLE_MEMKIND #define BLIS_FREE_POOL hbw_free #else #define BLIS_FREE_POOL free #endif #endif // This allocation function is called to allocate memory for internally- // used objects and structures, such as control tree nodes. #ifndef BLIS_MALLOC_INTL #define BLIS_MALLOC_INTL malloc #endif #ifndef BLIS_FREE_INTL #define BLIS_FREE_INTL free #endif // This allocation function is called to allocate memory for objects // created by user-level API functions, such as bli_obj_create(). #ifndef BLIS_MALLOC_USER #define BLIS_MALLOC_USER malloc #endif #ifndef BLIS_FREE_USER #define BLIS_FREE_USER free #endif // -- Other system-related definitions ----------------------------------------- // Size of a virtual memory page. This is used to align blocks within the // memory pools. #ifndef BLIS_PAGE_SIZE #define BLIS_PAGE_SIZE 4096 #endif // The maximum number of named SIMD vector registers available for use. // When configuring with umbrella configuration families, this should be // set to the maximum number of registers across all sub-configurations in // the family. 
#ifndef BLIS_SIMD_MAX_NUM_REGISTERS
#define BLIS_SIMD_MAX_NUM_REGISTERS      32
#endif

// The maximum size (in bytes) of each SIMD vector.
// When configuring with umbrella configuration families, this should be
// set to the maximum SIMD size across all sub-configurations in the family.
#ifndef BLIS_SIMD_MAX_SIZE
#define BLIS_SIMD_MAX_SIZE               64
#endif

// Alignment size (in bytes) needed by the instruction set for aligned
// SIMD/vector instructions.
// Defaults to the maximum SIMD vector size defined above.
#ifndef BLIS_SIMD_ALIGN_SIZE
#define BLIS_SIMD_ALIGN_SIZE             BLIS_SIMD_MAX_SIZE
#endif

// The maximum size in bytes of local stack buffers within macro-kernel
// functions. These buffers are usually used to store a temporary copy
// of a single microtile. The reason we multiply by 2 is to handle induced
// methods, where we use real domain register blocksizes in units of
// complex elements. Specifically, the macro-kernels will need this larger
// micro-tile footprint, even though the virtual micro-kernels will only
// ever be writing to half (real or imaginary part) at a time.
// With the default values given in this header (32 registers of 64 bytes)
// the expression evaluates to 32 * 64 * 2 = 4096 bytes.
#ifndef BLIS_STACK_BUF_MAX_SIZE
#define BLIS_STACK_BUF_MAX_SIZE          ( BLIS_SIMD_MAX_NUM_REGISTERS * \
                                           BLIS_SIMD_MAX_SIZE * 2 )
#endif

// Alignment size used to align local stack buffers within macro-kernel
// functions.
#ifndef BLIS_STACK_BUF_ALIGN_SIZE
#define BLIS_STACK_BUF_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when allocating memory via BLIS_MALLOC_USER.
// To disable heap alignment, set this to 1.
#ifndef BLIS_HEAP_ADDR_ALIGN_SIZE
#define BLIS_HEAP_ADDR_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when sizing leading dimensions of memory allocated
// via BLIS_MALLOC_USER.
#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE
#define BLIS_HEAP_STRIDE_ALIGN_SIZE      BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment sizes used when allocating blocks to the internal memory
// pool, via BLIS_MALLOC_POOL.
// One alignment macro per suffix (A, B, C, GEN); each defaults to the page
// size.
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A
#define BLIS_POOL_ADDR_ALIGN_SIZE_A      BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B
#define BLIS_POOL_ADDR_ALIGN_SIZE_B      BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C
#define BLIS_POOL_ADDR_ALIGN_SIZE_C      BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN
#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN    BLIS_PAGE_SIZE
#endif

// Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*.
// All four default to 0, i.e. no offset.
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A
#define BLIS_POOL_ADDR_OFFSET_SIZE_A     0
#endif

#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B
#define BLIS_POOL_ADDR_OFFSET_SIZE_B     0
#endif

#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C
#define BLIS_POOL_ADDR_OFFSET_SIZE_C     0
#endif

#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN
#define BLIS_POOL_ADDR_OFFSET_SIZE_GEN   0
#endif


// Closes the BLIS_KERNEL_MACRO_DEFS_H include guard.
#endif

// end bli_kernel_macro_defs.h


// -- Base operation prototypes --

// begin bli_init.h

// Only bli_init() and bli_finalize() carry BLIS_EXPORT_BLIS; the remaining
// init/finalize prototypes are declared without it.
BLIS_EXPORT_BLIS void bli_init( void );
BLIS_EXPORT_BLIS void bli_finalize( void );

void bli_init_auto( void );
void bli_finalize_auto( void );

void bli_init_apis( void );
void bli_finalize_apis( void );

void bli_init_once( void );
void bli_finalize_once( void );

// end bli_init.h
// begin bli_malloc.h


// Typedef function pointer types for malloc() and free() substitutes.
//typedef void* (*malloc_ft) ( size_t size );
//typedef void  (*free_ft)   ( void*  p    );

// -----------------------------------------------------------------------------

// The pool allocation prototypes below are compiled out.
#if 0
BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size );
BLIS_EXPORT_BLIS void  bli_free_pool( void* p );
#endif

// Allocation routines for internal use (declared without BLIS_EXPORT_BLIS).
// The allocating variants take an err_t* output argument, r_val.
void* bli_malloc_intl( size_t size, err_t* r_val );
void* bli_calloc_intl( size_t size, err_t* r_val );
void  bli_free_intl( void* p );

// Exported allocation routines for user-level memory.
BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val );
BLIS_EXPORT_BLIS void  bli_free_user( void* p );

// -----------------------------------------------------------------------------

// Allocation helpers parameterized by the malloc()/free() substitute to use
// (malloc_ft / free_ft), in aligned and non-aligned variants.
void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val );
void  bli_ffree_align( free_ft f, void* p );

void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val );
void  bli_ffree_noalign( free_ft f, void* p );

void  bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size );
void  bli_fmalloc_post_check( void* p );

// end bli_malloc.h
// begin bli_const.h

void bli_const_init( void );
void bli_const_finalize( void );

// end bli_const.h
// begin bli_obj.h

// begin bli_obj_check.h

// Prototypes of the *_check companions to the object API declared further
// below; each mirrors the argument list of the routine it is named after.
void bli_obj_create_check( num_t  dt,
                           dim_t  m,
                           dim_t  n,
                           inc_t  rs,
                           inc_t  cs,
                           obj_t* obj );

void bli_obj_create_without_buffer_check( num_t  dt,
                                          dim_t  m,
                                          dim_t  n,
                                          obj_t* obj );

void bli_obj_alloc_buffer_check( inc_t  rs,
                                 inc_t  cs,
                                 inc_t  is,
                                 obj_t* obj );

void bli_obj_attach_buffer_check( void*  p,
                                  inc_t  rs,
                                  inc_t  cs,
                                  inc_t  is,
                                  obj_t* obj );

void bli_obj_create_scalar_check( num_t  dt,
                                  obj_t* obj );

void bli_obj_free_check( obj_t* obj );

void bli_obj_create_const_check( double value, obj_t* obj );

void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b );

void bli_dt_size_check( num_t dt );

void bli_dt_string_check( num_t dt );

void bli_dt_union_check( num_t dt1, num_t dt2 );

void bli_obj_print_check( char* label, obj_t* obj );

// end bli_obj_check.h

// Object creation, buffer management and destruction. The parameter names
// follow one convention throughout: dt is the datatype, m and n the
// dimensions, and rs/cs/is the strides (presumably row, column and imaginary
// stride -- confirm against the implementation).
BLIS_EXPORT_BLIS void bli_obj_create
     (
       num_t  dt,
       dim_t  m,
       dim_t  n,
       inc_t  rs,
       inc_t  cs,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer
     (
       num_t  dt,
       dim_t  m,
       dim_t  n,
       void*  p,
       inc_t  rs,
       inc_t  cs,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_without_buffer
     (
       num_t  dt,
       dim_t  m,
       dim_t  n,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_alloc_buffer
     (
       inc_t  rs,
       inc_t  cs,
       inc_t  is,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_attach_buffer
     (
       void*  p,
       inc_t  rs,
       inc_t  cs,
       inc_t  is,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_1x1
     (
       num_t  dt,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer
     (
       num_t  dt,
       void*  p,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_conf_to
     (
       obj_t* s,
       obj_t* d
     );

BLIS_EXPORT_BLIS void bli_obj_free
     (
       obj_t* obj
     );

// Not exported. rs, cs and is are passed by pointer, so the routine can
// modify the caller's stride values.
void bli_adjust_strides
     (
       dim_t  m,
       dim_t  n,
       siz_t  elem_size,
       inc_t* rs,
       inc_t* cs,
       inc_t* is
     );

BLIS_EXPORT_BLIS siz_t bli_dt_size
     (
       num_t dt
     );

BLIS_EXPORT_BLIS char* bli_dt_string
     (
       num_t dt
     );

BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult
     (
       dim_t dim,
       dim_t dim_mult
     );

BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size
     (
       dim_t dim,
       siz_t elem_size,
       siz_t align_size
     );

BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size
     (
       void*  p,
       size_t align_size
     );

BLIS_EXPORT_BLIS void bli_obj_print
     (
       char*  label,
       obj_t* obj
     );

// end bli_obj.h
// begin bli_obj_scalar.h

// Prototypes for the bli_obj_scalar_* API, which operates on the scalar
// associated with an obj_t.
BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached
     (
       num_t  dt,
       obj_t* beta
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of
     (
       num_t  dt,
       conj_t conj,
       obj_t* alpha,
       obj_t* beta
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_detach
     (
       obj_t* a,
       obj_t* alpha
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_attach
     (
       conj_t conj,
       obj_t* alpha,
       obj_t* a
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to
     (
       num_t  dt,
       obj_t* a
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar
     (
       obj_t* alpha,
       obj_t* a
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_reset
     (
       obj_t* a
     );

BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag
     (
       obj_t* a
     );

BLIS_EXPORT_BLIS bool bli_obj_scalar_equals
     (
       obj_t* a,
       obj_t* beta
     );

// end bli_obj_scalar.h
// begin bli_blksz.h

//
blksz_t query BLIS_INLINE dim_t bli_blksz_get_def ( num_t dt, blksz_t* b ) { return b->v[ dt ]; } BLIS_INLINE dim_t bli_blksz_get_max ( num_t dt, blksz_t* b ) { return b->e[ dt ]; } // blksz_t modification BLIS_INLINE void bli_blksz_set_def ( dim_t val, num_t dt, blksz_t* b ) { b->v[ dt ] = val; } BLIS_INLINE void bli_blksz_set_max ( dim_t val, num_t dt, blksz_t* b ) { b->e[ dt ] = val; } BLIS_INLINE void bli_blksz_copy ( blksz_t* b_src, blksz_t* b_dst ) { *b_dst = *b_src; } BLIS_INLINE void bli_blksz_copy_if_pos ( blksz_t* b_src, blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that // we can skip the ones that are non-positive. const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); } BLIS_INLINE void bli_blksz_copy_def_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); bli_blksz_set_def( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_max_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); 
bli_blksz_set_max( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_scale_def ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_def( dt, b ); bli_blksz_set_def( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_max( dt, b ); bli_blksz_set_max( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_def_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { bli_blksz_scale_def( num, den, dt, b ); bli_blksz_scale_max( num, den, dt, b ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS blksz_t* bli_blksz_create ( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_easy ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z ); BLIS_EXPORT_BLIS void bli_blksz_free ( blksz_t* b ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); #endif void bli_blksz_reduce_def_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); void bli_blksz_reduce_max_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); // 
----------------------------------------------------------------------------- dim_t bli_determine_blocksize ( dir_t direct, dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_b ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); dim_t bli_determine_blocksize_b_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); // end bli_blksz.h // begin bli_func.h // ----------------------------------------------------------------------------- // func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); // end bli_func.h // begin bli_mbool.h // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // 
// -----------------------------------------------------------------------------

// As with the other prototypes in this header, b_s/b_d/b_c/b_z carry one
// value per datatype character (s, d, c, z).
mbool_t* bli_mbool_create
     (
       bool b_s,
       bool b_d,
       bool b_c,
       bool b_z
     );

void bli_mbool_init
     (
       mbool_t* b,
       bool     b_s,
       bool     b_d,
       bool     b_c,
       bool     b_z
     );

void bli_mbool_free( mbool_t* b );

// end bli_mbool.h
// begin bli_cntx.h


#ifndef BLIS_CNTX_H
#define BLIS_CNTX_H


// Context object type (defined in bli_type_defs.h)

// -----------------------------------------------------------------------------

//
// -- cntx_t query (fields only) -----------------------------------------------
//

// Each accessor below returns one field of the context unchanged. The *_buf
// accessors return a pointer that the "complex" query functions index by an
// id; no bounds checking happens at this level.

// Blocksize objects.
BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx )
{
	return cntx->blkszs;
}
// Blocksize-multiple ids.
BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx )
{
	return cntx->bmults;
}
// Level-3 virtual micro-kernels.
BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx )
{
	return cntx->l3_vir_ukrs;
}
// Level-3 native micro-kernels.
BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx )
{
	return cntx->l3_nat_ukrs;
}
// Storage preferences of the level-3 native micro-kernels.
BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx )
{
	return cntx->l3_nat_ukrs_prefs;
}
// Level-3 sup thresholds.
BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx )
{
	return cntx->l3_sup_thresh;
}
// Level-3 sup handlers, stored as untyped pointers.
BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx )
{
	return cntx->l3_sup_handlers;
}
// Level-3 sup blocksizes.
BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx )
{
	return cntx->l3_sup_blkszs;
}
// Level-3 sup kernels.
BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx )
{
	return cntx->l3_sup_kers;
}
// Storage preferences of the level-3 sup kernels.
BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx )
{
	return cntx->l3_sup_kers_prefs;
}
// Level-1f kernels.
BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx )
{
	return cntx->l1f_kers;
}
// Level-1v kernels.
BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx )
{
	return cntx->l1v_kers;
}
// packm kernels.
BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx )
{
	return cntx->packm_kers;
}
// unpackm kernels.
BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx )
{
	return cntx->unpackm_kers;
}
// The method id stored in the context.
BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx )
{
	return cntx->method;
}

//
----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* 
bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* 
cntx ) { func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested packm func_t if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* funcs = bli_cntx_packm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the packm func_t (and then extract the // datatype-specific function pointer) if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested unpackm func_t if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the unpackm func_t (and then extract the // datatype-specific function pointer) if the unpackm kernel being // requested is one that is explicitly supported. 
if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. 
// NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. // NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. 
return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } #if 0 // NOTE: These static functions aren't needed yet. BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { const num_t dt = bli_obj_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); } #endif // ----------------------------------------------------------------------------- // // -- cntx_t modification (complex) -------------------------------------------- // // NOTE: The framework does not use any of the following functions. We provide // them in order to facilitate creating/modifying custom contexts. 
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif // end bli_rntm.h // begin bli_gks.h #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* 
bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif // end bli_gks.h // begin bli_ind.h #ifndef BLIS_IND_H #define BLIS_IND_H // level-3 induced method management // begin bli_l3_ind.h #ifndef BLIS_L3_IND_H #define BLIS_L3_IND_H // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- //bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ); void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ); void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ); bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ); #endif // end bli_l3_ind.h void bli_ind_init( void ); void bli_ind_finalize( void ); BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t 
method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); char* bli_ind_get_impl_string( ind_t method ); num_t bli_ind_map_cdt_to_index( num_t dt ); #endif // end bli_ind.h // begin bli_pba.h #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H // Packing block allocator (formerly memory broker) // pba init //BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ // bli_pthread_mutex_init( &(pba->mutex), NULL ); //} //BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ // bli_pthread_mutex_destroy( &(pba->mutex) ); //} // pba query BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { return &(pba->pools[ pool_index ]); } BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { return pba->align_size; } BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { return pba->malloc_fp; } BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { return pba->free_fp; } // pba modification BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { pba->align_size = align_size; } BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { pba->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { pba->free_fp = free_fp; } // pba action BLIS_INLINE void bli_pba_lock( pba_t* pba ) { bli_pthread_mutex_lock( &(pba->mutex) ); } BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( cntx_t* cntx ); void bli_pba_finalize ( void ); 
void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif // end bli_pba.h // begin bli_pool.h #ifndef BLIS_POOL_H #define BLIS_POOL_H // -- Pool block type -- // -- Pool type -- // Pool block query BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) { return pblk->buf; } BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) { return pblk->block_size; } // Pool block modification BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk ) { pblk->buf = buf; } BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) { pblk->block_size = block_size; } // // -- pool block initialization ------------------------------------------------ // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the pblk_t type definition. An alternative to the initializer is // calling bli_pblk_clear() at runtime. 
#define BLIS_PBLK_INITIALIZER \ { \ .buf = NULL, \ .block_size = 0, \ } \ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); bli_pblk_set_block_size( 0, pblk ); } // Pool entry query BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) { return pool->block_size; } BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) { return pool->align_size; } BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) { return pool->offset_size; } BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) { return pool->malloc_fp; } BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) { return pool->free_fp; } BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); } // Pool entry modification BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ { pool->block_size = block_size; } BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ { pool->align_size = align_size; } BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ { pool->offset_size = offset_size; } BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ { pool->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* 
pool ) \ { pool->free_fp = free_fp; } BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } // ----------------------------------------------------------------------------- void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ); void bli_pool_finalize ( pool_t* restrict pool ); void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ); void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ); void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ); void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ); void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ); void bli_pool_print ( pool_t* restrict pool ); void bli_pblk_print ( pblk_t* restrict pblk ); #endif // end bli_pool.h // begin bli_array.h #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // 
----------------------------------------------------------------------------- void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ); void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ); void bli_array_finalize ( array_t* restrict array ); void* bli_array_elem ( const siz_t index, array_t* restrict array ); void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ); #endif // end bli_array.h // begin bli_apool.h #ifndef BLIS_APOOL_H #define BLIS_APOOL_H // -- Locked pool-of-arrays type -- // apool entry query BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool ) { return &(apool->pool); } BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) { return &(apool->mutex); } BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) { return pool->def_array_len; } BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) { pool_t* restrict pool = bli_apool_pool( apool ); return bli_pool_is_exhausted( pool ); } // apool action BLIS_INLINE void bli_apool_lock( apool_t* apool ) { bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); } BLIS_INLINE void bli_apool_unlock( apool_t* apool ) { bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); } // apool entry modification BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ { pool->def_array_len = def_array_len; } // ----------------------------------------------------------------------------- void bli_apool_init ( apool_t* restrict apool ); void bli_apool_finalize ( apool_t* restrict apool ); array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ); void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ); pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ); void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool ); void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ); void bli_apool_free_block 
( array_t* restrict array );

#endif

// end bli_apool.h
// begin bli_sba.h


#ifndef BLIS_SBA_H
#define BLIS_SBA_H

// Small block allocator interface. Only declarations appear here; the
// implementations live outside this header.
apool_t* bli_sba_query( void );

// -----------------------------------------------------------------------------

void bli_sba_init( void );
void bli_sba_finalize( void );

array_t* bli_sba_checkout_array ( const siz_t n_threads );

void bli_sba_checkin_array ( array_t* restrict array );

void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm );

void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size );
void bli_sba_release ( rntm_t* restrict rntm, void* restrict block );


#endif

// end bli_sba.h
// begin bli_memsys.h


#ifndef BLIS_MEMSYS_H
#define BLIS_MEMSYS_H

// -----------------------------------------------------------------------------

void bli_memsys_init( void );
void bli_memsys_finalize( void );


#endif

// end bli_memsys.h
// begin bli_mem.h


#ifndef BLIS_MEM_H
#define BLIS_MEM_H


// mem_t object type (defined in bli_type_defs.h)

//
// -- mem_t query --------------------------------------------------------------
//

// Address of the pblk_t embedded in the mem_t.
BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) { return &(mem->pblk); }

// Buffer pointer stored in the embedded pblk_t.
BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) { return bli_pblk_buf( bli_mem_pblk( mem ) ); }

BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) { return mem->buf_type; }

BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) { return mem->pool; }

BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) { return mem->size; }

// A mem_t is considered allocated when its buffer pointer is non-NULL.
BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); }

// A mem_t is considered unallocated when its buffer pointer is NULL.
BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); }

//
// -- mem_t modification -------------------------------------------------------
//

// Copy *pblk by value into the mem_t.
BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem ) { mem->pblk = *pblk; }

// Overwrite only the buffer pointer of the embedded pblk_t.
BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem ) { bli_pblk_set_buf( buf, &(mem->pblk) ); }

BLIS_INLINE void bli_mem_set_buf_type( packbuf_t buf_type, mem_t* mem ) { mem->buf_type = buf_type; }

BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem ) { mem->pool = pool; }

BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; }

//
// -- mem_t initialization -----------------------------------------------------
//

// NOTE: This initializer macro must be updated whenever fields are added or
// removed from the mem_t type definition. An alternative to the initializer is
// calling bli_mem_clear() at runtime.

#define BLIS_MEM_INITIALIZER \
        { \
          .pblk        = BLIS_PBLK_INITIALIZER, \
          .buf_type    = -1, \
          .pool        = NULL, \
          .size        = 0, \
        }  \

// Reset a mem_t at runtime: NULL buffer, NULL pool, zero size. The buf_type
// field receives -1 when compiled as C and BLIS_BUFFER_FOR_GEN_USE when
// compiled as C++ (see the comment inside). Only the buffer pointer of the
// embedded pblk_t is touched; its block_size is left as-is.
BLIS_INLINE void bli_mem_clear( mem_t* mem )
{
    bli_mem_set_buffer( NULL, mem );
#ifdef __cplusplus
    const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE;
    // When using C++, which is strongly typed, we avoid use of -1 as a
    // packbuf_t value since it will result in a compile-time error.
    bli_mem_set_buf_type( pb, mem );
#else
    bli_mem_set_buf_type( ( packbuf_t )-1, mem );
#endif
    bli_mem_set_pool( NULL, mem );
    bli_mem_set_size( 0, mem );
}


#endif
// end bli_mem.h
// begin bli_part.h


// begin bli_part_check.h


void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj );

void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj );

void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj );

// end bli_part_check.h

// -- Matrix partitioning ------------------------------------------------------

BLIS_EXPORT_BLIS void bli_acquire_mpart ( dim_t i, dim_t j, dim_t m, dim_t n, obj_t* obj, obj_t* sub_obj );

// Each GENPROT invocation below expands to one exported prototype whose name
// is produced by PASTEMAC0( opname ) and whose parameters are
// ( req_part, i, b, obj, sub_obj ).
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
     ( \
       subpart_t req_part, \
       dim_t     i, \
       dim_t     b, \
       obj_t*    obj, \
       obj_t*    sub_obj \
     );

GENPROT( acquire_mpart_t2b )
GENPROT( acquire_mpart_b2t )
GENPROT( acquire_mpart_l2r )
GENPROT( acquire_mpart_r2l )
GENPROT( acquire_mpart_tl2br )
GENPROT( acquire_mpart_br2tl )

// Same as above, with an additional leading dir_t direct parameter.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
     ( \
       dir_t     direct, \
       subpart_t req_part, \
       dim_t     i, \
       dim_t     b, \
       obj_t*    obj, \
       obj_t*    sub_obj \
     );

GENPROT( acquire_mpart_mdim )
GENPROT( acquire_mpart_ndim )
GENPROT( acquire_mpart_mndim )

// -- Vector partitioning ------------------------------------------------------

// Vector variants share the ( req_part, i, b, obj, sub_obj ) parameter list.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
     ( \
       subpart_t req_part, \
       dim_t     i, \
       dim_t     b, \
       obj_t*    obj, \
       obj_t*    sub_obj \
     );

GENPROT( acquire_vpart_f2b )
GENPROT( acquire_vpart_b2f )

// -- Scalar acquisition -------------------------------------------------------

BLIS_EXPORT_BLIS void bli_acquire_mij ( dim_t i, dim_t j, obj_t* obj, obj_t* sub_obj );

BLIS_EXPORT_BLIS void bli_acquire_vi ( dim_t i, obj_t* obj, obj_t* sub_obj );

// end bli_part.h
// begin bli_prune.h


void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s );
// end bli_prune.h
// begin bli_query.h


BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b );

BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b );

BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a );
// end bli_query.h
// begin bli_auxinfo.h


#ifndef BLIS_AUXINFO_MACRO_DEFS_H
#define BLIS_AUXINFO_MACRO_DEFS_H


// auxinfo_t field query

// Each accessor below returns the auxinfo_t field named in its body.

BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) { return ai->schema_a; }
BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) { return ai->schema_b; }

BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) { return ai->a_next; }
BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) { return ai->b_next; }

BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) { return ai->is_a; }
BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) { return ai->is_b; }

BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) { return ai->ps_a; }
BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) { return ai->ps_b; }

BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) { return ai->ukr; }
BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) { return ai->params; }


// auxinfo_t field modification

// Each setter below overwrites the auxinfo_t field(s) named in its body.

BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai ) { ai->schema_a = schema; }
BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) { ai->schema_b = schema; }

BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) { ai->a_next = p; }
BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) { ai->b_next = p; }
// Convenience setter that stores both next-pointers in one call.
BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; }

BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai ) { ai->is_a = is; }
BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) { ai->is_b = is; }

BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai ) { ai->ps_a = ps; }
BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) { ai->ps_b = ps; }

BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { ai->ukr = ukr; }
BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) { ai->params = params; }

#endif

// end bli_auxinfo.h
// begin bli_param_map.h


// --- BLIS to BLAS/LAPACK mappings --------------------------------------------

BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval );


// --- BLAS/LAPACK to BLIS mappings --------------------------------------------

// NOTE: These static functions were converted from regular functions in order
// to reduce function call overhead within the BLAS compatibility layer.

BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( 
subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); err_t bli_check_valid_cntl( void* cntl ); err_t bli_check_packm_schema_on_unpack( obj_t* a ); err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); // end bli_check.h // begin bli_error.h BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); void bli_print_msg( char* str, char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); char* bli_error_string_for_code( gint_t code ); // end bli_error.h // begin bli_f2c.h // f2c.h -- Standard Fortran to C header file // barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." // - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) #ifndef BLIS_F2C_H #define BLIS_F2C_H typedef f77_int bla_integer; typedef f77_char bla_character; //typedef char *address; //typedef short int shortint; typedef float bla_real; typedef double bla_double; typedef scomplex bla_scomplex; typedef dcomplex bla_dcomplex; typedef f77_int bla_logical; //typedef short int shortlogical; //typedef char logical1; //typedef char integer1; #ifdef INTEGER_STAR_8 // Adjust for integer*8. 
typedef long long longint; // system-dependent typedef unsigned long long ulongint; // system-dependent #define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) #define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) #endif #ifndef TRUE_ #define TRUE_ (1) #endif #ifndef FALSE_ #define FALSE_ (0) #endif // Extern is for use with -E #ifndef Extern #define Extern extern #endif // I/O stuff #ifdef f2c_i2 // for -i2 //typedef short flag; //typedef short ftnlen; typedef bla_integer ftnlen; //typedef short ftnint; #else //typedef long int flag; //typedef long int ftnlen; typedef bla_integer ftnlen; //typedef long int ftnint; #endif #ifndef VOID #define VOID void #endif #ifndef f2c_abs #define f2c_abs(x) ((x) >= 0 ? (x) : -(x)) #endif #ifndef f2c_dabs #define f2c_dabs(x) (doublereal)f2c_abs(x) #endif #ifndef f2c_min #define f2c_min(a,b) ((a) <= (b) ? (a) : (b)) #endif #ifndef f2c_max #define f2c_max(a,b) ((a) >= (b) ? (a) : (b)) #endif #ifndef f2c_dmin #define f2c_dmin(a,b) (doublereal)f2c_min(a,b) #endif #ifndef f2c_dmax #define f2c_dmax(a,b) (doublereal)f2c_max(a,b) #endif #ifndef bit_test #define bit_test(a,b) ((a) >> (b) & 1) #endif #ifndef bit_clear #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) #endif #ifndef bit_set #define bit_set(a,b) ((a) | ((uinteger)1 << (b))) #endif // undef any lower-case symbols that your C compiler predefines, e.g.: #ifndef Skip_f2c_Undefs #undef cray #undef gcos #undef mc68010 #undef mc68020 #undef mips #undef pdp11 #undef sgi #undef sparc #undef sun #undef sun2 #undef sun3 #undef sun4 #undef u370 #undef u3b #undef u3b2 #undef u3b5 #undef unix #undef vax #endif #endif // end bli_f2c.h // begin bli_machval.h // begin bli_lsame.h bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len ); // end bli_lsame.h // begin bli_slamch.h bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len ); // end bli_slamch.h // begin bli_dlamch.h bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len ); // end 
bli_dlamch.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v ); // // Prototype BLAS-like interfaces. // #undef GENTPROTR #define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \ ( \ machval_t mval, \ void* v \ ); INSERT_GENTPROTR_BASIC0( machval ) // end bli_machval.h // begin bli_getopt.h typedef struct getopt_s { char* optarg; int optind; int opterr; int optopt; } getopt_t; BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state ); BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ); // end bli_getopt.h // begin bli_opid.h BLIS_INLINE bool bli_opid_is_level3( opid_t opid ) { return ( bool ) ( BLIS_GEMM <= opid && opid <= BLIS_TRSM ); } // end bli_opid.h // begin bli_cntl.h // -- Control tree prototypes -- BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, void* params, cntl_t* sub_node ); BLIS_EXPORT_BLIS void bli_cntl_free_node ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_clear_node ( cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_mark_family ( opid_t family, cntl_t* cntl ); // ----------------------------------------------------------------------------- dim_t bli_cntl_calc_num_threads_in ( rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- // cntl_t query (fields only) BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl ) { return cntl->family; } 
BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl ) { return cntl->bszid; } BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl ) { return cntl->var_func; } BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl ) { return cntl->sub_prenode; } BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl ) { return cntl->sub_node; } BLIS_INLINE void* bli_cntl_params( cntl_t* cntl ) { return cntl->params; } BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl ) { // The first 64 bytes is always the size of the params structure. return *( ( uint64_t* )(cntl->params) ); } BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) { return &(cntl->pack_mem); } // cntl_t query (complex) BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl ) { return ( bool ) ( cntl == NULL ); } BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl ) { return ( bool ) ( bli_cntl_sub_node( cntl ) == NULL ); } BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl ) { return ( bool ) ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); } // cntl_t modification BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl ) { cntl->family = family; } BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl ) { cntl->bszid = bszid; } BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl ) { cntl->var_func = var_func; } BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl ) { cntl->sub_prenode = sub_prenode; } BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl ) { cntl->sub_node = sub_node; } BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl ) { cntl->params = params; } BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) { cntl->pack_mem = *pack_mem; } // end bli_cntl.h // begin bli_env.h #ifndef BLIS_ENV_H #define BLIS_ENV_H gint_t bli_env_get_var( const char* env, gint_t fallback ); //void bli_env_set_var( const char* env, dim_t value ); #endif // end bli_env.h // begin bli_pack.h #ifndef BLIS_PACK_H #define BLIS_PACK_H 
void bli_pack_init( void ); void bli_pack_finalize( void ); BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a ); BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b ); BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a ); BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b ); void bli_pack_init_rntm_from_env( rntm_t* rntm ); #endif // end bli_pack.h // begin bli_info.h // -- General library information ---------------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_version_str( void ); BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void ); // -- General configuration-related -------------------------------------------- BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void ); 
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void ); // -- Kernel implementation-related -------------------------------------------- // -- Level-3 kernel definitions -- BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ); // -- BLIS implementation query (level-3) -------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt ); BLIS_EXPORT_BLIS 
char* bli_info_get_trmm3_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt ); // end bli_info.h // begin bli_arch.h #ifndef BLIS_ARCH_H #define BLIS_ARCH_H BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void ); void bli_arch_set_id_once( void ); void bli_arch_set_id( void ); BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id ); void bli_arch_set_logging( bool dolog ); bool bli_arch_get_logging( void ); void bli_arch_log( char*, ... ); #endif // end bli_arch.h // begin bli_cpuid.h #if 0 // Used only during standalone testing of ARM support. #define FALSE 0 #define TRUE 1 typedef enum { BLIS_ARCH_CORTEXA57 = 10, BLIS_ARCH_CORTEXA15 = 11, BLIS_ARCH_CORTEXA9 = 12, BLIS_ARCH_GENERIC = 13 } arch_t; typedef uint64_t bool; #define bli_abort abort #endif #ifndef BLIS_CPUID_H #define BLIS_CPUID_H arch_t bli_cpuid_query_id( void ); // Intel bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features ); // AMD bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features ); // ARM bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, 
uint32_t features ); bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features ); uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features ); // ----------------------------------------------------------------------------- // // This section of the file was based off of cpuid.hpp from TBLIS [1]. // // [1] https://github.com/devinamatthews/tblis // BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want ) { return ( have & want ) == want; } // ----------------------------------------------------------------------------- #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) // cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393 // for more information why this move was made. 
//#include "cpuid.h" void get_cpu_name( char *cpu_name ); int vpu_count( void ); enum { VENDOR_INTEL = 0, VENDOR_AMD, VENDOR_UNKNOWN }; enum { FEATURE_SSE3 = 0x0001, FEATURE_SSSE3 = 0x0002, FEATURE_SSE41 = 0x0004, FEATURE_SSE42 = 0x0008, FEATURE_AVX = 0x0010, FEATURE_AVX2 = 0x0020, FEATURE_FMA3 = 0x0040, FEATURE_FMA4 = 0x0080, FEATURE_AVX512F = 0x0100, FEATURE_AVX512DQ = 0x0200, FEATURE_AVX512PF = 0x0400, FEATURE_AVX512ER = 0x0800, FEATURE_AVX512CD = 0x1000, FEATURE_AVX512BW = 0x2000, FEATURE_AVX512VL = 0x4000 }; #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ); enum { VENDOR_ARM = 0, VENDOR_UNKNOWN }; enum { MODEL_ARMV7 = 0, MODEL_ARMV8, MODEL_UNKNOWN }; enum { FEATURE_NEON = 0x01, FEATURE_SVE = 0x02 }; #endif #endif // end bli_cpuid.h // begin bli_string.h void bli_string_mkupper( char* s ); // end bli_string.h // begin bli_setgetijm.h BLIS_EXPORT_BLIS err_t bli_setijm ( double ar, double ai, dim_t i, dim_t j, obj_t* b ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs \ ); INSERT_GENTPROT_BASIC0( setijm ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijm ( dim_t i, dim_t j, obj_t* b, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijm ) // end bli_setgetijm.h // begin bli_setgetijv.h BLIS_EXPORT_BLIS err_t bli_setijv ( double ar, double ai, dim_t i, obj_t* x ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ void* restrict x, inc_t incx \ ); INSERT_GENTPROT_BASIC0( 
setijv ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijv ( dim_t i, obj_t* x, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ void* restrict b, inc_t incx, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijv ) // end bli_setgetijv.h // begin bli_setri.h // -- setr --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setrm ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setrv ( obj_t* alpha, obj_t* x ); // -- seti --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setim ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setiv ( obj_t* alpha, obj_t* x ); // end bli_setri.h // begin bli_castm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castm ) INSERT_GENTPROT2_MIXDP0( castm ) // // Prototype object-based _check() function. // void bli_castm_check ( obj_t* a, obj_t* b ); // end bli_castm.h // begin bli_castnzm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castnzm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. 
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( addsc ) INSERT_GENTPROT_BASIC0( divsc ) INSERT_GENTPROT_BASIC0( mulsc ) INSERT_GENTPROT_BASIC0( subsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( invertsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTPROTR_BASIC0( absqsc ) INSERT_GENTPROTR_BASIC0( normfsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( sqrtsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTPROT_BASIC0( getsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( setsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTPROTR_BASIC0( unzipsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype_r* zeta_r, \ ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTPROTR_BASIC0( zipsc ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_igetsc ( dim_t* chi, double* zeta_r, double* zeta_i ); BLIS_EXPORT_BLIS void bli_isetsc ( double zeta_r, double zeta_i, dim_t* chi ); // end bli_l0_tapi.h // begin bli_l0_ft.h // // -- Level-0 function types 
--------------------------------------------------- // // addsc, divsc, subsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( addsc ) INSERT_GENTDEF( divsc ) INSERT_GENTDEF( subsc ) // invertsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTDEF( invertsc ) // mulsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( mulsc ) // absqsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTDEFR( absqsc ) // normfsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* norm \ ); INSERT_GENTDEFR( normfsc ) // sqrtsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( sqrtsc ) // getsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTDEF( getsc ) // setsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTDEF( setsc ) // unzipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTDEFR( unzipsc ) // zipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype_r* zeta_r, \ 
ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTDEFR( zipsc ) // end bli_l0_ft.h // Generate function pointer arrays for tapi functions. // begin bli_l0_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( absqsc ) GENPROT( normfsc ) GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( subsc ) GENPROT( invertsc ) GENPROT( sqrtsc ) GENPROT( unzipsc ) GENPROT( zipsc ) GENPROT( getsc ) GENPROT( setsc ) // end bli_l0_fpa.h // copysc // begin bli_copysc.h // // Prototype object-based interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENFRONT( copysc ) // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \ ( \ conj_t conjchi, \ void* chi, \ void* psi \ ); INSERT_GENTPROT2_BASIC0( copysc ) INSERT_GENTPROT2_MIX_D0( copysc ) INSERT_GENTPROT2_MIX_P0( copysc ) // end bli_copysc.h // end bli_l0.h // -- Level-1v operations -- // begin bli_l1v.h // begin bli_l1v_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h" // begin bli_l1v_ft_ker.h #ifndef BLIS_L1V_FT_KER_H #define BLIS_L1V_FT_KER_H // // -- Level-1v kernel function types ------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict index, \ cntx_t* cntx \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ cntx_t* cntx \ ); 
INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( xpbyv ) #endif // end bli_l1v_ft_ker.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1v_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyv ) // end 
bli_l1v_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1v_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyv ) // end bli_l1v_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed 
APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF

// Clear the expert-argument macros used in signatures and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Clear the local expert variable declaration macros.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Flag that lets source code detect which interface (basic or expert) is
// being compiled.
#undef  BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Empty suffix: function names are left unsuffixed.
#undef  EX_SUF
#define EX_SUF

// Empty parameter tail: no expert arguments in signatures and prototypes.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Local expert variables, initialized to NULL. The "( void )" statements
// are to prevent unused variable warnings by the compiler.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS \
    cntx_t* cntx = NULL; ( void )cntx; \
    rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h

// begin bli_l1v_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addd ) INSERT_GENTPROT_BASIC0( copyd ) INSERT_GENTPROT_BASIC0( subd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyd ) INSERT_GENTPROT_BASIC0( scal2d ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( invertd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ 
BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scald ) INSERT_GENTPROT_BASIC0( setd ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( setid ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( shiftd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( xpbyd ) // end bli_l1d_tapi.h // begin bli_l1d_ft.h // // -- Level-1d function types -------------------------------------------------- // // addd, copyd, subd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addd ) INSERT_GENTDEF( copyd ) INSERT_GENTDEF( subd ) // axpyd, scal2d #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyd ) INSERT_GENTDEF( scal2d ) // invertd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, 
\ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertd ) // scald, setd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scald ) INSERT_GENTDEF( setd ) // setid #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( setid ) // shiftd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( shiftd ) // xpbyd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyd ) // end bli_l1d_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. 
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addd ) INSERT_GENTPROT_BASIC0( copyd ) INSERT_GENTPROT_BASIC0( subd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyd ) INSERT_GENTPROT_BASIC0( scal2d ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( invertd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scald ) INSERT_GENTPROT_BASIC0( setd ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( setid ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( shiftd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t 
diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( xpbyd ) // end bli_l1d_tapi.h // begin bli_l1d_ft.h // // -- Level-1d function types -------------------------------------------------- // // addd, copyd, subd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addd ) INSERT_GENTDEF( copyd ) INSERT_GENTDEF( subd ) // axpyd, scal2d #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyd ) INSERT_GENTDEF( scal2d ) // invertd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertd ) // scald, setd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scald ) INSERT_GENTDEF( setd ) // setid #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( setid ) // shiftd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( shiftd ) // xpbyd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyd ) // end bli_l1d_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1d_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addd ) GENPROT( copyd ) GENPROT( subd ) GENPROT( axpyd ) GENPROT( scal2d ) GENPROT( invertd ) GENPROT( scald ) GENPROT( setd ) GENPROT( setid ) GENPROT( shiftd ) GENPROT( xpbyd ) // end bli_l1d_fpa.h // end bli_l1d.h // -- Level-1f operations -- // begin bli_l1f.h // begin bli_l1f_check.h // // Prototype object-based check functions. 
//

// Each GENTPROT( op ) invocation expands the template defined immediately
// above it into a prototype for a void function whose name is pasted from
// op and _check. All operands are obj_t pointers.

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alphax, \
       obj_t* alphay, \
       obj_t* x, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( axpy2v )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* y \
     );

GENTPROT( axpyf )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* xt, \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho, \
       obj_t* z \
     );

GENTPROT( dotaxpyv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* at, \
       obj_t* a, \
       obj_t* w, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( dotxaxpyf )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
     );

GENTPROT( dotxf )

// end bli_l1f_check.h

// Define kernel function types.
// begin bli_l1f_ft_ker.h

// Unlike the API headers around it, this header has an include guard, so
// its typedefs are emitted only once.
#ifndef BLIS_L1F_FT_KER_H
#define BLIS_L1F_FT_KER_H

//
// -- Level-1f kernel function types -------------------------------------------
//

// Each template below typedefs a pointer-to-function type whose name is
// pasted from ch, opname, _ker and tsuf. Every kernel signature qualifies
// its data pointers with restrict and ends with a cntx_t* parameter
// written out explicitly (no *_EX_PARAMS macro is used here).

// axpy2v

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha1, \
       ctype* restrict alpha2, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict w, inc_t incw, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxaxpyf )

#endif

// end bli_l1f_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The replacement text starts with a comma so that it can be appended
// directly after the last fixed parameter of a prototype.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). 
// begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1f_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alphax, \ ctype* alphay, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpy2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotaxpyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxaxpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxf ) // end bli_l1f_tapi.h // begin bli_l1f_ft.h // // -- Level-1f function types -------------------------------------------------- // // axpy2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha1, \ ctype* alpha2, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpy2v ) // axpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyf ) // dotaxpyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotaxpyv ) // dotxf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* 
alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxf ) // dotxaxpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxaxpyf ) // end bli_l1f_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. 
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// NOTE: BLIS_TAPI_EX_PARAMS is defined empty just above, so in this pass the
// prototypes below end at the last operand and take no trailing expert
// arguments. INSERT_GENTPROT_BASIC0 is defined elsewhere; presumably it
// instantiates GENTPROT once per floating-point datatype -- confirm against
// its definition.
//
// GENTPROT is re-defined before every INSERT_GENTPROT_BASIC0 line; each
// instantiation uses the parameter list of the definition directly above it.

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )
// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// Each GENTDEF below declares a function-pointer type whose parameter list
// mirrors the prototype of the same operation in bli_l1f_tapi.h above.
// INSERT_GENTDEF is defined elsewhere; the exact set of typedef names it
// produces (via PASTECH3 and tsuf) is not visible from here.

// axpy2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
// NOTE: the length parameter is spelled "m" here but "n" in the dotaxpyv
// prototype above. Parameter names in a function-pointer typedef do not
// affect the type, so the two remain compatible.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )
// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1f_fpa.h

//
// Prototype function pointer query interface.
//
// Each GENPROT( op ) line declares a query function taking a num_t datatype
// and returning the function-pointer type named by
// PASTECH2(op,BLIS_TAPI_EX_SUF,_vft). BLIS_TAPI_EX_SUF is used directly here
// because EX_SUF has just been un-defined above.

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( axpy2v )
GENPROT( axpyf )
GENPROT( dotaxpyv )
GENPROT( dotxaxpyf )
GENPROT( dotxf )
// end bli_l1f_fpa.h
// end bli_l1f.h

// -- Level-1m operations --

// begin bli_l1m.h
// begin bli_l1m_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion begins with a comma because it is appended directly
// after the last ordinary parameter in each prototype below.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h
// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Expert pass: EX_SUF is BLIS_TAPI_EX_SUF and every prototype takes trailing
// cntx_t* / rntm_t* arguments. The same declarations are repeated further
// down for the basic pass, after bli_tapi_ba.h re-defines the macros.

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-datatype form. x has element type ctype_x, while beta and y
// share ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
// end bli_l1m_tapi.h
// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//
// Function-pointer typedefs whose parameter lists mirror the typed prototypes
// in bli_l1m_tapi.h above (expert pass).

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m (parameter list identical to axpym above)

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym
// NOTE(review): xpbym_md is instantiated from this single-ctype GENTDEF,
// whereas its prototype above uses two element types (ctype_x, ctype_y).
// How the generated xpbym_md typedef is used is not visible here -- confirm
// before relying on its exact pointer types.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )
// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Basic pass: EX_SUF and BLIS_TAPI_EX_PARAMS are now empty, so the
// prototypes below carry no suffix and no trailing expert arguments.

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-datatype form. x has element type ctype_x, while beta and y
// share ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
// end bli_l1m_tapi.h
// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//
// Function-pointer typedefs whose parameter lists mirror the typed prototypes
// in bli_l1m_tapi.h above (basic pass).

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m (parameter list identical to axpym above)

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym
// NOTE(review): as in the expert pass, xpbym_md is instantiated from this
// single-ctype GENTDEF although its prototype uses two element types.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )
// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1m_fpa.h

//
// Prototype function pointer query interface.
//
// Each GENPROT( op ) line declares a query function that returns the
// function-pointer type PASTECH2(op,BLIS_TAPI_EX_SUF,_vft). The single-type
// operations are queried with one num_t; xpbym_md is queried with two
// (dtx, dty) through a function whose name ends in _qfp2.

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
GENPROT( axpym )
GENPROT( scal2m )
GENPROT( scalm )
GENPROT( setm )
GENPROT( xpbym )

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );

GENPROT( xpbym_md )
// end bli_l1m_fpa.h

// Prototype level-1m implementations.
// begin bli_l1m_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// The _unb_var1 prototypes below take the same operands as the typed level-1m
// API, but always end with explicit cntx_t* / rntm_t* parameters (they do not
// use BLIS_TAPI_EX_PARAMS) and are not marked BLIS_EXPORT_BLIS.

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-datatype form (x is ctype_x; beta and y are ctype_y).
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
void PASTEMAC3(chx,chy,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
// end bli_l1m_unb_var1.h

// Pack-related
// begin bli_packm.h
// begin bli_packm_alloc.h

// Allocation entry points for packm. The _ex form additionally takes an
// explicit packbuf_t; both return void*. (Ownership and release rules for the
// returned pointer are not specified in this header.)
BLIS_EXPORT_BLIS void* bli_packm_alloc
     (
       siz_t size_needed,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void* bli_packm_alloc_ex
     (
       siz_t size_needed,
       packbuf_t pack_buf_type,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_alloc.h
// begin bli_packm_cntl.h

// Parameter block stored behind cntl_t::params for packm control-tree nodes;
// the accessors below read it by casting cntl->params to packm_params_t*.
struct packm_params_s
{
    uint64_t size; // size field must be present and come first.
    bszid_t bmid_m;
    bszid_t bmid_n;
    bool does_invert_diag;
    bool rev_iter_if_upper;
    bool rev_iter_if_lower;
    pack_t pack_schema;
    packbuf_t pack_buf_type;
};
typedef struct packm_params_s packm_params_t;

// Field accessors. Each one casts cntl->params to packm_params_t* and returns
// the field named in the function; none of them checks cntl or cntl->params
// for NULL, so callers must pass a node whose params is a packm_params_t.

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m;
}

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n;
}

BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower;
}

BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema;
}

BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl )
{
    packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type;
}

// -----------------------------------------------------------------------------

// Creates a packm control-tree node. Apart from rntm, var_func and sub_node,
// the parameters correspond one-to-one to the fields of packm_params_t.
cntl_t* bli_packm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       bszid_t bmid_m,
       bszid_t bmid_n,
       bool does_invert_diag,
       bool rev_iter_if_upper,
       bool rev_iter_if_lower,
       pack_t pack_schema,
       packbuf_t pack_buf_type,
       cntl_t* sub_node
     );
// end bli_packm_cntl.h
// begin bli_packm_check.h

void bli_packm_init_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

void bli_packm_int_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );
// end bli_packm_check.h
// begin bli_packm_init.h

// NOTE(review): the meaning of the bool return value is not stated in this
// header -- consult the definition before relying on it.
BLIS_EXPORT_BLIS bool bli_packm_init
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_init.h
// begin bli_packm_int.h

void bli_packm_int
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_int.h
// begin bli_packm_scalar.h

BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p );
// end bli_packm_scalar.h

// begin bli_packm_part.h

// -- Matrix partitioning ------------------------------------------------------

void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
                                  dim_t i,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
                                  dim_t j,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_tl2br( subpart_t requested_part,
                                    dim_t ij,
                                    dim_t b,
                                    obj_t* obj,
                                    obj_t* sub_obj );

dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p );
// end bli_packm_part.h

// begin bli_packm_struc_cxk.h

// Structure-aware packing prototypes. NOTE: unlike the _1er variants in the
// next section, these do not take a trailing "void* params" argument.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_struc_cxk )
INSERT_GENTPROT_BASIC0( packm_herm_cxk )
INSERT_GENTPROT_BASIC0( packm_tri_cxk )
// end bli_packm_struc_cxk.h
// begin bli_packm_struc_cxk_1er.h

// The GENTPROTCO form also receives ctype_r / chr, but only ctype and ch are
// used in the prototype body below.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er )
// end bli_packm_struc_cxk_1er.h

// begin bli_packm_cxk.h

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_cxk )
// end bli_packm_cxk.h
// begin bli_packm_cxk_1er.h

// Same parameter list as packm_cxk above; ctype_r / chr are unused here.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROTCO_BASIC0( packm_cxk_1er )
// end bli_packm_cxk_1er.h

// Mixed datatype support.
#ifdef BLIS_ENABLE_GEMM_MD
// begin bli_packm_struc_cxk_md.h

// Everything up to the matching #endif below is only compiled when
// BLIS_ENABLE_GEMM_MD is defined.

// Prototype template for packm_struc_cxk_md. The template takes two
// type/character pairs: kappa and p are declared with ctype_p, while c is
// declared with ctype_c. The INSERT_GENTPROT2_* macros that instantiate the
// template are defined elsewhere in this header.
#undef GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
    ( \
      struc_t strucc, \
      diag_t diagc, \
      uplo_t uploc, \
      conj_t conjc, \
      pack_t schema, \
      bool invdiag, \
      dim_t panel_dim, \
      dim_t panel_len, \
      dim_t panel_dim_max, \
      dim_t panel_len_max, \
      dim_t panel_dim_off, \
      dim_t panel_len_off, \
      ctype_p* restrict kappa, \
      ctype_c* restrict c, inc_t incc, inc_t ldc, \
      ctype_p* restrict p, inc_t ldp, \
      inc_t is_p, \
      cntx_t* cntx, \
      void* params \
    );

INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )

// Prototype template shared by packm_cxk_1e_md and packm_cxk_1r_md. Unlike
// the template above, it takes no schema, cntx or params arguments.
#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
    ( \
      conj_t conja, \
      dim_t m, \
      dim_t n, \
      ctype_p* restrict kappa, \
      ctype_a* restrict a, inc_t inca, inc_t lda, \
      ctype_p* restrict p, inc_t ldp \
    );

INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )
INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )
// end bli_packm_struc_cxk_md.h
#endif

// begin bli_packm_blk_var1.h
//
// packm params types.
//
typedef struct
{
    // Two-dimensional table of packm kernel function pointers.
    // First index: type of C. Second index: type of P.
    // Type of C Type of P
    packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;

//
// Prototype object-based interfaces.
//
BLIS_EXPORT_BLIS void bli_packm_blk_var1 ( obj_t* c, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* t );
// end bli_packm_blk_var1.h
// end bli_packm.h

// begin bli_unpackm.h
// begin bli_unpackm_cntl.h

// Parameter record for unpackm control tree nodes.
struct unpackm_params_s
{
    uint64_t size; // size field must be present and come first.
    unpackm_var_oft var_func;
};
typedef struct unpackm_params_s unpackm_params_t;

// Accessor: casts cntl->params to unpackm_params_t* and reads its var_func
// member. The argument is parenthesized, so any pointer expression may be
// passed as cntl.
#define bli_cntl_unpackm_params_var_func( cntl ) \
\
    ( ( (unpackm_params_t*)(cntl)->params )->var_func )

// -----------------------------------------------------------------------------

cntl_t* bli_unpackm_cntl_create_node ( rntm_t* rntm, void_fp var_func, void_fp unpackm_var_func, cntl_t* sub_node );
// end bli_unpackm_cntl.h

// begin bli_unpackm_check.h
void bli_unpackm_int_check ( obj_t* p, obj_t* a, cntx_t* cntx );
// end bli_unpackm_check.h

// begin bli_unpackm_int.h
void bli_unpackm_int ( obj_t* p, obj_t* a, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread );
// end bli_unpackm_int.h

// begin bli_unpackm_blk_var1.h
void bli_unpackm_blk_var1 ( obj_t* p, obj_t* c, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread );

// Typed prototype template for unpackm_blk_var1.
// NOTE(review): p and c are declared void* here, so the ctype template
// argument does not appear in the generated prototype -- confirm intended.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
    ( \
      struc_t strucc, \
      doff_t diagoffc, \
      diag_t diagc, \
      uplo_t uploc, \
      trans_t transc, \
      dim_t m, \
      dim_t n, \
      dim_t m_panel, \
      dim_t n_panel, \
      void* p, inc_t rs_p, inc_t cs_p, \
      dim_t pd_p, inc_t ps_p, \
      void* c, inc_t rs_c, inc_t cs_c, \
      cntx_t* cntx \
    );

INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )
// end bli_unpackm_blk_var1.h

// begin bli_unpackm_cxk.h
// Typed prototype template for unpackm_cxk.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
    ( \
      conj_t conjp, \
      dim_t panel_dim, \
      dim_t panel_len, \
      ctype* kappa, \
      ctype* p, inc_t ldp, \
      ctype* a, inc_t inca, inc_t lda, \
      cntx_t* cntx \
    );

INSERT_GENTPROT_BASIC0( unpackm_cxk )
// end bli_unpackm_cxk.h
// end bli_unpackm.h
// end bli_l1m.h

// -- Level-2 operations --
// begin bli_l2.h
// begin bli_l2_check.h
//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// Each prototype name below is built from ch, opname and EX_SUF, and each
// parameter list is extended with BLIS_TAPI_EX_PARAMS. Both macros must be
// defined before this section is reached.

// gemv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      trans_t transa, \
      conj_t conjx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* a, inc_t rs_a, inc_t cs_a, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( gemv )

// ger
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      conj_t conjx, \
      conj_t conjy, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( ger )

// hemv, symv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      uplo_t uploa, \
      conj_t conja, \
      conj_t conjx, \
      dim_t m, \
      ctype* alpha, \
      ctype* a, inc_t rs_a, inc_t cs_a, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )

// her: alpha is declared with ctype_r, unlike the syr template below, which
// declares it with ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      uplo_t uploa, \
      conj_t conjx, \
      dim_t m, \
      ctype_r* alpha, \
      ctype* x, inc_t incx, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROTR_BASIC0( her )

// syr
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      uplo_t uploa, \
      conj_t conjx, \
      dim_t m, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( syr )

// her2, syr2
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      uplo_t uploa, \
      conj_t conjx, \
      conj_t conjy, \
      dim_t m, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )

// trmv, trsv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      uplo_t uploa, \
      trans_t transa, \
      diag_t diaga, \
      dim_t m, \
      ctype* alpha, \
      ctype* a, inc_t rs_a, inc_t cs_a, \
      ctype* x, inc_t incx \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )
// end bli_l2_tapi.h

// begin bli_l2_ft.h
//
// -- Level-2 function types ---------------------------------------------------
//

// Function-pointer typedefs mirroring the prototypes above. Each typedef name
// is pasted together from ch, opname, EX_SUF and tsuf.

// gemv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      trans_t transa, \
      conj_t conjx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* a, inc_t rs_a, inc_t cs_a, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( gemv )

// ger
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      conj_t conjx, \
      conj_t conjy, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( ger )

// hemv, symv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      uplo_t uploa, \
      conj_t conja, \
      conj_t conjx, \
      dim_t m, \
      ctype* alpha, \
      ctype* a, inc_t rs_a, inc_t cs_a, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      uplo_t uploa, \
      conj_t conjx, \
      dim_t m, \
      ctype_r* alpha, \
      ctype* x, inc_t incx, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEFR( her )

// syr
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      uplo_t uploa, \
      conj_t conjx, \
      dim_t m, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( syr )

// her2, syr2
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      uplo_t uploa, \
      conj_t conjx, \
      conj_t conjy, \
      dim_t m, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      uplo_t uploa, \
      trans_t transa, \
      diag_t diaga, \
      dim_t m, \
      ctype* alpha, \
      ctype* a, inc_t rs_a, inc_t cs_a, \
      ctype* x, inc_t incx \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )
// end bli_l2_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h

// begin bli_l2_tapi.h
//
// Prototype BLAS-like interfaces with typed operands.
//

// In this copy of bli_l2_tapi.h, EX_SUF and BLIS_TAPI_EX_PARAMS were just
// defined with empty expansions above, so no suffix text and no extra
// parameters are contributed by those two macros.

// gemv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      trans_t transa, \
      conj_t conjx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* a, inc_t rs_a, inc_t cs_a, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( gemv )

// ger
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      conj_t conjx, \
      conj_t conjy, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( ger )

// hemv, symv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      uplo_t uploa, \
      conj_t conja, \
      conj_t conjx, \
      dim_t m, \
      ctype* alpha, \
      ctype* a, inc_t rs_a, inc_t cs_a, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )

// her: alpha is declared with ctype_r, unlike the syr template below, which
// declares it with ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      uplo_t uploa, \
      conj_t conjx, \
      dim_t m, \
      ctype_r* alpha, \
      ctype* x, inc_t incx, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROTR_BASIC0( her )

// syr
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      uplo_t uploa, \
      conj_t conjx, \
      dim_t m, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( syr )

// her2, syr2
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      uplo_t uploa, \
      conj_t conjx, \
      conj_t conjy, \
      dim_t m, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )

// trmv, trsv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      uplo_t uploa, \
      trans_t transa, \
      diag_t diaga, \
      dim_t m, \
      ctype* alpha, \
      ctype* a, inc_t rs_a, inc_t cs_a, \
      ctype* x, inc_t incx \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )
// end bli_l2_tapi.h

// begin bli_l2_ft.h
//
// -- Level-2 function types ---------------------------------------------------
//

// Function-pointer typedefs mirroring the prototypes above. Each typedef name
// is pasted together from ch, opname, EX_SUF and tsuf.

// gemv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      trans_t transa, \
      conj_t conjx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* a, inc_t rs_a, inc_t cs_a, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( gemv )

// ger
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      conj_t conjx, \
      conj_t conjy, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( ger )

// hemv, symv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      uplo_t uploa, \
      conj_t conja, \
      conj_t conjx, \
      dim_t m, \
      ctype* alpha, \
      ctype* a, inc_t rs_a, inc_t cs_a, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      uplo_t uploa, \
      conj_t conjx, \
      dim_t m, \
      ctype_r* alpha, \
      ctype* x, inc_t incx, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEFR( her )

// syr
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      uplo_t uploa, \
      conj_t conjx, \
      dim_t m, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( syr )

// her2, syr2
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      uplo_t uploa, \
      conj_t conjx, \
      conj_t conjy, \
      dim_t m, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* a, inc_t rs_a, inc_t cs_a \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      uplo_t uploa, \
      trans_t transa, \
      diag_t diaga, \
      dim_t m, \
      ctype* alpha, \
      ctype* a, inc_t rs_a, inc_t cs_a, \
      ctype* x, inc_t incx \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )
// end bli_l2_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
// void bli_l3_cntl_create_if ( opid_t family, pack_t schema_a, pack_t schema_b, obj_t* a, obj_t* b, obj_t* c, rntm_t* rntm, cntl_t* cntl_orig, cntl_t** cntl_use ); void bli_l3_cntl_free ( rntm_t* rntm, cntl_t* cntl_use, thrinfo_t* thread ); // end bli_l3_cntl.h // begin bli_l3_check.h // // Prototype object-based check functions. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx \ ); GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- void bli_gemm_basic_check ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_gemmt_basic_check ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_hemm_basic_check ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_herk_basic_check ( obj_t* alpha, obj_t* a, obj_t* ah, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_her2k_basic_check ( obj_t* alpha, obj_t* a, obj_t* bh, obj_t* b, obj_t* ah, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_l3_basic_check ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); // end bli_l3_check.h // begin bli_l3_int.h void bli_l3_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, 
obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_l3_int.h // begin bli_l3_packab.h void bli_l3_packa ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); void bli_l3_packb ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_l3_packab.h // Define function types. //#include "bli_l3_ft_ex.h" // begin bli_l3_ft_ukr.h #ifndef BLIS_L3_FT_UKR_H #define BLIS_L3_FT_UKR_H // // -- Level-3 micro-kernel function types -------------------------------------- // // gemm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemm ) // gemmtrsm_[lu] #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ ctype* restrict a11, \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmtrsm ) // trsm_[lu] #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( trsm ) #endif // end bli_l3_ft_ukr.h // begin bli_l3_oft.h #ifndef BLIS_L3_OFT_H #define BLIS_L3_OFT_H // // -- Level-3 object function types -------------------------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ 
cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemm ) GENTDEF( gemmt ) GENTDEF( her2k ) GENTDEF( syr2k ) // hemm, symm, trmm3 #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( hemm ) GENTDEF( symm ) GENTDEF( trmm3 ) // herk, syrk #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( herk ) GENTDEF( syrk ) // trmm, trsm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( trmm ) GENTDEF( trsm ) #endif // end bli_l3_oft.h // begin bli_l3_oft_var.h #ifndef BLIS_L3_OFT_VAR_H #define BLIS_L3_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( l3 ) #endif // end bli_l3_oft_var.h // begin bli_l3_blocksize.h dim_t bli_l3_determine_kc ( dir_t direct, dim_t i, dim_t dim, obj_t* a, obj_t* b, bszid_t bszid, cntx_t* cntx, cntl_t* cntl ); #undef GENPROT #define GENPROT( opname ) \ \ dim_t PASTEMAC0(opname) \ ( \ dir_t direct, \ dim_t i, \ dim_t dim, \ obj_t* a, \ obj_t* b, \ bszid_t bszid, \ cntx_t* cntx \ ); GENPROT( gemm_determine_kc ) GENPROT( gemmt_determine_kc ) GENPROT( trmm_determine_kc ) GENPROT( trsm_determine_kc ) #undef GENPROT #define GENPROT( opname ) \ \ dim_t PASTEMAC0(opname) \ ( \ dim_t i, \ dim_t dim, \ obj_t* a, \ obj_t* b, \ bszid_t bszid, \ cntx_t* cntx \ ); GENPROT( gemm_determine_kc_f ) GENPROT( gemm_determine_kc_b ) GENPROT( gemmt_determine_kc_f ) GENPROT( gemmt_determine_kc_b ) GENPROT( 
trmm_determine_kc_f ) GENPROT( trmm_determine_kc_b ) GENPROT( trsm_determine_kc_f ) GENPROT( trsm_determine_kc_b ) // end bli_l3_blocksize.h // begin bli_l3_direct.h dir_t bli_l3_direct ( obj_t* a, obj_t* b, obj_t* c, cntl_t* cntl ); // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ dir_t PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c \ ); GENPROT( gemm_direct ) GENPROT( gemmt_direct ) GENPROT( trmm_direct ) GENPROT( trsm_direct ) // end bli_l3_direct.h // begin bli_l3_prune.h #undef GENPROT #define GENPROT( dim ) \ \ void PASTEMAC(l3_prune_unref_mparts_,dim) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntl_t* cntl \ ); GENPROT( m ) GENPROT( n ) GENPROT( k ) // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname, dim ) \ \ void PASTEMAC2(opname,_prune_unref_mparts_,dim) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c \ ); GENPROT( gemm, m ) GENPROT( gemm, n ) GENPROT( gemm, k ) GENPROT( gemmt, m ) GENPROT( gemmt, n ) GENPROT( gemmt, k ) GENPROT( trmm, m ) GENPROT( trmm, n ) GENPROT( trmm, k ) GENPROT( trsm, m ) GENPROT( trsm, n ) GENPROT( trsm, k ) // end bli_l3_prune.h // begin bli_l3_schema.h void bli_l3_set_schemas ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx ); // end bli_l3_schema.h // Prototype object APIs (basic and expert). // begin bli_l3_oapi.h // // Prototype object-based interfaces (basic). 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h // // Prototype BLAS-like interfaces with typed operands (basic). // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( gemm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ side_t side, \ uplo_t uploa, \ conj_t conja, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( hemm ) INSERT_GENTPROT_BASIC0( symm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype_r* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROTR_BASIC0( herk ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROTR_BASIC0( her2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( syrk ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t 
k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( gemmt ) INSERT_GENTPROT_BASIC0( syr2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( trmm3 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT_BASIC0( trmm ) INSERT_GENTPROT_BASIC0( trsm ) // end bli_l3_tapi.h // begin bli_l3_tapi_ex.h // // Prototype BLAS-like interfaces with typed operands (expert). 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ conj_t conja, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( hemm ) INSERT_GENTPROT_BASIC0( symm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype_r* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( herk ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( her2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( syrk ) #undef 
GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemmt ) INSERT_GENTPROT_BASIC0( syr2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm3 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm ) INSERT_GENTPROT_BASIC0( trsm ) // end bli_l3_tapi_ex.h // Define function types for small/unpacked handlers/kernels. 
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
#undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_gemm_md_c2r_ref.h // Define a local struct type that makes returning two values easier. typedef struct mddm_s { dom_t comp; dom_t exec; } mddm_t; void bli_gemm_md ( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_local, cntx_t** cntx ); mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); // ----------------------------------------------------------------------------- void bli_gemm_md_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); void bli_gemm_md_zgemm ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary 
if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crr is already unconditionally associated with an // execution domain of BLIS_REAL.) if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_REAL ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since ccr is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) if ( bli_obj_is_complex( c ) && bli_obj_is_complex( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crc is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) 
if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_complex( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemm_md_ker_var2_recast ( num_t* dt_comp, num_t dt_a, num_t dt_b, num_t* dt_c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, obj_t* c, inc_t* rs_c, inc_t* cs_c ) { if ( bli_is_real( *dt_c ) && bli_is_complex( dt_a ) && bli_is_complex( dt_b ) ) { // The rcc case is executed with a real macrokernel, so we need to // double the k dimension (because both A and B are packed to the 1r // schema), and also the panel strides of A and B since they were // packed as complex matrices and we now need to convert them to // units of real elements. *k *= 2; *ps_a *= 2; *ps_b *= 2; } else if ( bli_is_complex( *dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_row_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *n *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; } else #endif { // Generally speaking, the crc case is executed with a complex // macrokernel, so we need to halve the panel stride of A (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. 
*ps_a /= 2; } } else if ( bli_is_complex( *dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_col_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *m *= 2; *pd_a *= 2; *ps_a *= 2; *cs_c *= 2; } else #endif { // Generally speaking, the ccr case is executed with a complex // macrokernel, so we need to halve the panel stride of B (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. *ps_b /= 2; } } #if 0 else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. //printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k ); } else if ( bli_is_complex( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { // No action needed. 
} #endif } // end bli_gemm_md.h #endif // end bli_gemm.h // begin bli_hemm.h // begin bli_hemm_front.h void bli_hemm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_hemm_front.h // end bli_hemm.h // begin bli_symm.h // begin bli_symm_front.h void bli_symm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_symm_front.h // end bli_symm.h // begin bli_trmm.h // begin bli_trmm_front.h void bli_trmm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm_front.h // begin bli_trmm_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); //GENPROT( trmm_blk_var1 ) //GENPROT( trmm_blk_var2 ) //GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) GENPROT( trmm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
//

// Relative to the trmm/trsm GENTPROT signatures, this one names the diagonal
// offset diagoffc and adds an inc_t is_a / is_b after cs_a / rs_b.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffc, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread  \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

// end bli_gemmt_var.h
// end bli_gemmt.h
// end bli_l3.h

// -- Utility operations --

// begin bli_util.h
// begin bli_util_check.h

//
// Prototype object-based check functions.
//

// Each GENPROT below is redefined with the parameter list of the operations
// that follow it; the generated name is PASTEMAC(opname,_check).

#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  asum  \
     );

GENPROT( asumv )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x  \
     );

GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm  \
     );

GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm  \
     );

GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x  \
     );

GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  scale, \
       obj_t*  sumsq  \
     );

GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// NOTE: eqsc is the only check prototype in this header generated through a
// macro named GENTPROT rather than GENPROT; it takes a single opname argument
// like the GENPROT definitions around it.
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  chi, \
       obj_t*  psi, \
       bool*   is_eq  \
     );

GENTPROT( eqsc )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqv )
GENPROT( eqm )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// -----------------------------------------------------------------------------

// Directly declared (non-macro-generated) check helpers.

void bli_utilv_xi_check
     (
       obj_t*  x,
       obj_t*  index
     );

void bli_utilv_xa_check
     (
       obj_t*  x,
       obj_t*  asum
     );

void bli_utilm_mkhst_check
     (
       obj_t*  a
     );

void bli_utilv_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_fprint_check
     (
       FILE*  file,
       char*  s1,
       obj_t* x,
       char*  format,
       char*  s2
     );

void bli_utilm_rand_check
     (
       obj_t*  x
     );

void bli_utilv_sumsqv_check
     (
       obj_t*  x,
       obj_t*  scale,
       obj_t*  sumsq
     );

// end bli_util_check.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion begins with a comma, so it is meant to be placed
// directly after the last regular parameter.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_util_oapi.h

//
// Prototype object-based interfaces.
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes inside this guard are emitted only when BLIS_OAPI_BASIC is
// defined. They are named with PASTEMAC0(opname) (no EX_SUF) and carry no
// BLIS_OAPI_EX_PARAMS.
#ifdef BLIS_OAPI_BASIC

#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )


#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )


#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2  \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h
// begin bli_oapi_ba.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
// NOTE: EX_SUF is defined with an empty expansion here.
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes inside this guard are emitted only when BLIS_OAPI_BASIC is
// defined. They are named with PASTEMAC0(opname) (no EX_SUF) and carry no
// BLIS_OAPI_EX_PARAMS.
#ifdef BLIS_OAPI_BASIC

#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )


#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )


#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2  \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion begins with a comma, so it is meant to be placed
// directly after the last regular parameter.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_util_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Prototypes in this section are named PASTEMAC2(ch,opname,EX_SUF) and end
// with BLIS_TAPI_EX_PARAMS. The GENTPROTR templates take an additional
// ctype_r, used for the asum/norm/scale/sumsq output pointers.

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( asumv )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( mkherm )
INSERT_GENTPROT_BASIC0( mksymm )
INSERT_GENTPROT_BASIC0( mktrim )


#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( norm1v )
INSERT_GENTPROTR_BASIC0( normfv )
INSERT_GENTPROTR_BASIC0( normiv )


#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( norm1m )
INSERT_GENTPROTR_BASIC0( normfm )
INSERT_GENTPROTR_BASIC0( normim )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( randv )
INSERT_GENTPROT_BASIC0( randnv )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( randm )
INSERT_GENTPROT_BASIC0( randnm )


#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.

// The prototypes inside this guard are emitted only when BLIS_TAPI_BASIC is
// defined. They are named with PASTEMAC(ch,opname) (no EX_SUF) and carry no
// BLIS_TAPI_EX_PARAMS.
#ifdef BLIS_TAPI_BASIC

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqsc )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqv )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqm )


// printv and printm take x as void* and are instantiated through
// INSERT_GENTPROT_BASIC0_I rather than INSERT_GENTPROT_BASIC0.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  n, \
       void*  x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printv )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       void*  x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//

// Each template below typedefs a function-pointer type whose name is built
// by PASTECH3(ch,opname,EX_SUF,tsuf) (PASTECH2(ch,opname,tsuf) for the
// basic-only operations at the end).

// asumv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// (no BLIS_TAPI_EX_PARAMS in the fprintv/fprintm parameter lists)

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 
INSERT_GENTPROTR_BASIC0( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. #ifdef BLIS_TAPI_BASIC #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ char* s1, \ dim_t n, \ void* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ char* s1, \ dim_t m, \ dim_t n, \ void* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printm ) #endif // #ifdef BLIS_TAPI_BASIC // end bli_util_tapi.h // begin bli_util_ft.h // // -- Utility function types --------------------------------------------------- // // asumv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( asumv ) // mkherm, mksymm, mktrim #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ 
BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( mkherm ) INSERT_GENTDEF( mksymm ) INSERT_GENTDEF( mktrim ) // norm1v, normfv, normiv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( norm1v ) INSERT_GENTDEFR( normfv ) INSERT_GENTDEFR( normiv ) // norm1m, normfm, normim #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( norm1m ) INSERT_GENTDEFR( normfm ) INSERT_GENTDEFR( normim ) // fprintv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ FILE* file, \ char* s1, \ dim_t n, \ ctype* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTDEF( fprintv ) // fprintm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ FILE* file, \ char* s1, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTDEF( fprintm ) // randv, randnv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( randv ) INSERT_GENTDEF( randnv ) // randm, randnm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( randm ) INSERT_GENTDEF( randnm ) // sumsqv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, 
\ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( sumsqv ) // ----------------------------------------------------------------------------- // Operations with only basic interfaces. #ifdef BLIS_TAPI_BASIC // eqsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi, \ bool* is_eq \ ); INSERT_GENTDEF( eqsc ) // eqv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ bool* is_eq \ ); INSERT_GENTDEF( eqv ) // eqm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ bool* is_eq \ ); INSERT_GENTDEF( eqm ) #endif // #ifdef BLIS_OAPI_BASIC // end bli_util_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_util_fpa.h // // Prototype function pointer query interface. 
// #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( asumv ) GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) GENPROT( randv ) GENPROT( randnv ) GENPROT( randm ) GENPROT( randnm ) GENPROT( sumsqv ) // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( eqsc ) GENPROT( eqv ) GENPROT( eqm ) GENPROT( fprintv ) GENPROT( fprintm ) //GENPROT( printv ) //GENPROT( printm ) // end bli_util_fpa.h // Prototype level-1m implementations. // begin bli_util_unb_var1.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( asumv_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( mkherm_unb_var1 ) INSERT_GENTPROT_BASIC0( mksymm_unb_var1 ) INSERT_GENTPROT_BASIC0( mktrim_unb_var1 ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 ) INSERT_GENTPROTR_BASIC0( normfv_unb_var1 ) INSERT_GENTPROTR_BASIC0( normiv_unb_var1 ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* 
rntm \ ); INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 ) INSERT_GENTPROTR_BASIC0( normfm_unb_var1 ) INSERT_GENTPROTR_BASIC0( normim_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( randv_unb_var1 ) INSERT_GENTPROT_BASIC0( randnv_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( randm_unb_var1 ) INSERT_GENTPROT_BASIC0( randnm_unb_var1 ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 ) // ----------------------------------------------------------------------------- #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ bool PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ ); INSERT_GENTPROT_BASIC0( eqv_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ bool PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ ); INSERT_GENTPROT_BASIC0( eqm_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ FILE* file, \ char* s1, \ dim_t n, \ ctype* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( fprintv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ FILE* file, \ char* s1, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( 
fprintm ) // end bli_util_unb_var1.h // end bli_util.h // -- addon definitions -- // NOTE: These definitions should not be included much earlier since an addon // may wish to utilize other types and definitions provided by BLIS. // begin bli_addon.h #ifndef BLIS_ADDON_H #define BLIS_ADDON_H #if 0 #define BLIS_ENABLE_ADDONS #else #define BLIS_DISABLE_ADDONS #endif // Enabled addons #endif // end bli_addon.h // -- sandbox implementation -- // begin bli_sbox.h #ifndef BLIS_SBOX_H #define BLIS_SBOX_H // Each sandbox must have a bli_sandbox.h file present somewhere inside. // If a sandbox was enabled at configure-time, we need to #include its // header file here so that it will get pulled into blis.h when it is // flattened into a monolithic header. #ifdef BLIS_ENABLE_SANDBOX #include "bli_sandbox.h" // skipped #endif #endif // end bli_sbox.h // -- BLAS compatibility layer -- // begin bli_blas.h // If the CBLAS compatibility layer was enabled while the BLAS layer // was not enabled, we must enable it here. #ifdef BLIS_ENABLE_CBLAS #ifndef BLIS_ENABLE_BLAS #define BLIS_ENABLE_BLAS #endif #endif // BLIS_ENABLE_CBLAS // By default, if the BLAS compatibility layer is enabled, we define // (include) all of the BLAS prototypes. However, if the user is // #including "blis.h" and also #including another header that also // declares the BLAS functions, then we provide an opportunity to // #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below). #ifdef BLIS_ENABLE_BLAS #define BLIS_ENABLE_BLAS_DEFS #else #undef BLIS_ENABLE_BLAS_DEFS #endif // Skip prototyping all of the BLAS if the BLAS test drivers are being // compiled. #ifdef BLIS_VIA_BLASTEST #undef BLIS_ENABLE_BLAS_DEFS #endif // Skip prototyping all of the BLAS if the environment has defined the // macro BLIS_DISABLE_BLAS_DEFS. #ifdef BLIS_DISABLE_BLAS_DEFS #undef BLIS_ENABLE_BLAS_DEFS #endif // Begin including all BLAS prototypes. 
// NOTE: the matching #endif for this conditional lies beyond this section.
#ifdef BLIS_ENABLE_BLAS_DEFS


// -- System headers needed by BLAS compatibility layer --

// NOTE(review): the header name was missing from this directive (a bare
// "#include" is not valid C). <ctype.h> is restored here on the assumption
// that it is the header the BLAS layer needs -- confirm against the
// unflattened bli_blas.h.
#include <ctype.h> // skipped


// -- Constants --

// Size of the func_str buffers used by the bla_*_check macros: 7 characters
// plus the terminating NUL.
#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)


// -- Utility macros --

// begin bla_r_sign.h

#ifdef BLIS_ENABLE_BLAS

double bla_r_sign(const bla_real *a, const bla_real *b);

#endif
// end bla_r_sign.h
// begin bla_d_sign.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_sign(const bla_double *a, const bla_double *b);

#endif
// end bla_d_sign.h

// begin bla_r_cnjg.h

#ifdef BLIS_ENABLE_BLAS

void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src);

#endif
// end bla_r_cnjg.h
// begin bla_d_cnjg.h

#ifdef BLIS_ENABLE_BLAS

void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src);

#endif
// end bla_d_cnjg.h

// begin bla_r_imag.h

#ifdef BLIS_ENABLE_BLAS

bla_real bla_r_imag(const bla_scomplex *z);

#endif
// end bla_r_imag.h
// begin bla_d_imag.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_imag(const bla_dcomplex *z);

#endif
// end bla_d_imag.h

// begin bla_c_div.h

#ifdef BLIS_ENABLE_BLAS

void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp);

#endif
// end bla_c_div.h
// begin bla_z_div.h

#ifdef BLIS_ENABLE_BLAS

void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp);

#endif
// end bla_z_div.h

// begin bla_f__cabs.h

#ifdef BLIS_ENABLE_BLAS

double bla_f__cabs(double real, double imag);

#endif
// end bla_f__cabs.h
// begin bla_r_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_r_abs(const bla_real *x);

#endif
// end bla_r_abs.h
// begin bla_d_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_abs(const bla_double *x);

#endif
// end bla_d_abs.h
// begin bla_c_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_c_abs(const bla_scomplex *z);

#endif
// end bla_c_abs.h
// begin bla_z_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_z_abs(const bla_dcomplex *z);

#endif
// end bla_z_abs.h

// begin bla_lsame.h

#ifdef BLIS_ENABLE_BLAS

// NOTE(review): the LAPACK_ILP64 variant uses long for the return value and
// the length arguments and is declared without BLIS_EXPORT_BLAS, unlike the
// default variant -- confirm the missing export attribute is intended.
#ifdef LAPACK_ILP64
long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
#else
BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
#endif

#endif
// end bla_lsame.h
// begin bla_xerbla.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);

#endif
// end bla_xerbla.h
// begin bla_xerbla_array.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);

#endif
// end bla_xerbla_array.h


// -- Level-0 BLAS prototypes --

// begin bla_cabs1.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS bla_real   PASTEF77(s,cabs1)(bla_scomplex *z);
BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);

#endif
// end bla_cabs1.h


// -- Level-1 BLAS prototypes --

// In the templates below the routine name is pasted together by the
// PASTEF77* macros, and every argument is passed by pointer. The
// INSERT_*_BLAS macros that instantiate the templates are defined outside
// this section. Each template is defined unconditionally; only its
// instantiation is guarded by BLIS_ENABLE_BLAS.

// begin bla_amax.h


//
// Prototype BLAS-to-BLIS interfaces.
//
#undef GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end bla_amax.h
// begin bla_asum.h


//
// Prototype BLAS-to-BLIS interfaces.
//
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end bla_asum.h
// begin bla_axpy.h


//
// Prototype BLAS-to-BLIS interfaces.
//
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif

// end bla_axpy.h
// begin bla_copy.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// The template is defined unconditionally; only its instantiation is guarded
// by BLIS_ENABLE_BLAS.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* x, const f77_int* incx, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( copy )
#endif

// end bla_copy.h
// begin bla_dot.h


#ifdef BLIS_ENABLE_BLAS

//
// Prototype BLAS-to-BLIS interfaces.
//
// This first form returns the result by value (return type ftype). It is
// always used for the INSERT_GENTPROTDOTR_BLAS instantiation, and for the
// INSERT_GENTPROTDOTC_BLAS instantiation only when
// BLIS_DISABLE_COMPLEX_RETURN_INTEL is defined.
#undef GENTPROTDOT
#define GENTPROTDOT( ftype, ch, chc, blasname ) \
\
BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \
     ( \
       const f77_int* n, \
       const ftype* x, const f77_int* incx, \
       const ftype* y, const f77_int* incy \
     );

INSERT_GENTPROTDOTR_BLAS( dot )

#ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL

INSERT_GENTPROTDOTC_BLAS( dot )

#else

// For the "intel" complex return type, we use a hidden parameter (passed by
// address) to return the result.
#undef GENTPROTDOT
#define GENTPROTDOT( ftype, ch, chc, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \
     ( \
       ftype* rhop, \
       const f77_int* n, \
       const ftype* x, const f77_int* incx, \
       const ftype* y, const f77_int* incy \
     );

INSERT_GENTPROTDOTC_BLAS( dot )

#endif


// -- "Black sheep" dot product function prototypes --

// These two are declared directly rather than through GENTPROTDOT: both take
// float operands; the first has an extra float* sb argument and returns
// float, the second returns double.

BLIS_EXPORT_BLAS float PASTEF77(sd,sdot)
     (
       const f77_int* n,
       const float* sb,
       const float* x, const f77_int* incx,
       const float* y, const f77_int* incy
     );

BLIS_EXPORT_BLAS double PASTEF77(d,sdot)
     (
       const f77_int* n,
       const float* x, const f77_int* incx,
       const float* y, const f77_int* incy
     );
#endif
// end bla_dot.h
// begin bla_nrm2.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// The template is defined unconditionally; only its instantiation is guarded
// by BLIS_ENABLE_BLAS.
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( nrm2 )
#endif

// end bla_nrm2.h

// The rot, rotg and rotm prototypes below are written out per datatype
// rather than generated from a template, and all are declared to return int.

// begin bla_rot.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n,
                                     bla_real *sx, const bla_integer *incx,
                                     bla_real *sy, const bla_integer *incy,
                                     const bla_real *c__, const bla_real *s);
BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n,
                                     bla_double *dx, const bla_integer *incx,
                                     bla_double *dy, const bla_integer *incy,
                                     const bla_double *c__, const bla_double *s);
BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n,
                                      bla_scomplex *cx, const bla_integer *incx,
                                      bla_scomplex *cy, const bla_integer *incy,
                                      const bla_real *c__, const bla_real *s);
BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n,
                                      bla_dcomplex *zx, const bla_integer *incx,
                                      bla_dcomplex *zy, const bla_integer *incy,
                                      const bla_double *c__, const bla_double *s);

#endif
// end bla_rot.h
// begin bla_rotg.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s);
BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s);
BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s);
BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s);

#endif
// end bla_rotg.h

// begin bla_rotm.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n,
                                      bla_real *sx, const bla_integer *incx,
                                      bla_real *sy, const bla_integer *incy,
                                      const bla_real *sparam);
BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n,
                                      bla_double *dx, const bla_integer *incx,
                                      bla_double *dy, const bla_integer *incy,
                                      const bla_double *dparam);

#endif
// end bla_rotm.h
// begin bla_rotmg.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam);
BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);

#endif
// end bla_rotmg.h

// begin bla_scal.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// The scalar (ftype_a) and vector (ftype_x) types are separate template
// parameters, and the name is pasted in the order chx, cha, blasname.
#undef GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
     ( \
       const f77_int* n, \
       const ftype_a* alpha, \
       ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif

// end bla_scal.h
// begin bla_swap.h


//
// Prototype BLAS-to-BLIS interfaces.
//
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       ftype* x, const f77_int* incx, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif

// end bla_swap.h

// The f77_*_sub.h wrappers below return void and deliver the result through a
// trailing rval pointer; their names get an extra "sub" component pasted on,
// and their instantiations are guarded by BLIS_ENABLE_CBLAS rather than
// BLIS_ENABLE_BLAS.

// begin f77_amax_sub.h


//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       f77_int* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end f77_amax_sub.h
// begin f77_asum_sub.h


//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       ftype_r* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end f77_asum_sub.h
// begin f77_dot_sub.h


//
// Prototype CBLAS subroutine wrapper interfaces.
//
// As with the other *_sub wrappers, the result is delivered through the
// trailing rval pointer and the function itself returns void.
#undef GENTPROTDOT
#define GENTPROTDOT( ftype, ch, chc, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \
     ( \
       const f77_int* n, \
       const ftype* x, const f77_int* incx, \
       const ftype* y, const f77_int* incy, \
       ftype* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTDOT_BLAS( dot )


// -- "Black sheep" dot product function prototypes --

// Declared directly rather than through GENTPROTDOT: both take float
// operands; the first has an extra float* sb argument and a float* result,
// the second a double* result.

BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub)
     (
       const f77_int* n,
       const float* sb,
       const float* x, const f77_int* incx,
       const float* y, const f77_int* incy,
       float* rval
     );

BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub)
     (
       const f77_int* n,
       const float* x, const f77_int* incx,
       const float* y, const f77_int* incy,
       double* rval
     );
#endif

// end f77_dot_sub.h
// begin f77_nrm2_sub.h


//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       ftype_r* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( nrm2 )
#endif

// end f77_nrm2_sub.h


// -- Level-2 BLAS prototypes --

// dense

// begin bla_gemv.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// The template is defined unconditionally; only its instantiation is guarded
// by BLIS_ENABLE_BLAS.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_int* m, \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* x, const f77_int* incx, \
       const ftype* beta, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemv )
#endif

// end bla_gemv.h
// begin bla_ger.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// The name is pasted in the order chxy, blasname, chc. The template is
// defined unconditionally; only its instantiation is guarded by
// BLIS_ENABLE_BLAS.
#undef GENTPROTDOT
#define GENTPROTDOT( ftype, chxy, chc, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \
     ( \
       const f77_int* m, \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       const ftype* y, const f77_int* incy, \
       ftype* a, const f77_int* lda \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTDOT_BLAS( ger )
#endif

// end bla_ger.h
// begin bla_hemv.h


//
// Prototype BLAS-to-BLIS interfaces.
//
#undef GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int* m, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* x, const f77_int* incx, \
       const ftype* beta, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hemv )
#endif

// end bla_hemv.h
// begin bla_her.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// Note that alpha has type ftype_r here, whereas hemv and her2 take alpha as
// ftype.
#undef GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int* m, \
       const ftype_r* alpha, \
       const ftype* x, const f77_int* incx, \
       ftype* a, const f77_int* lda \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( her )
#endif

// end bla_her.h
// begin bla_her2.h


//
// Prototype BLAS-to-BLIS interfaces.
//
#undef GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int* m, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       const ftype* y, const f77_int* incy, \
       ftype* a, const f77_int* lda \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( her2 )
#endif

// end bla_her2.h
// begin bla_symv.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// The template is defined unconditionally; only its instantiation is guarded
// by BLIS_ENABLE_BLAS.
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int* m, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* x, const f77_int* incx, \
       const ftype* beta, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( symv )
#endif

// end bla_symv.h
// begin bla_syr.h


//
// Prototype BLAS-to-BLIS interfaces.
//
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int* m, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       ftype* a, const f77_int* lda \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( syr )
#endif

// end bla_syr.h
// begin bla_syr2.h


//
// Prototype BLAS-to-BLIS interfaces.
//
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_int* m, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       const ftype* y, const f77_int* incy, \
       ftype* a, const f77_int* lda \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( syr2 )
#endif

// end bla_syr2.h
// begin bla_trmv.h


//
// Prototype BLAS-to-BLIS interfaces.
//
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int* m, \
       const ftype* a, const f77_int* lda, \
       ftype* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmv )
#endif

// end bla_trmv.h
// begin bla_trsv.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// Template for the trsv prototype; same argument list as the trmv prototype.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int* m, \
       const ftype* a, const f77_int* lda, \
       ftype* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif
// end bla_trsv.h

// begin bla_gemv_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for gemv. Expands to a brace block that must be used inside
// a function returning void: the first failing test selects info
// (1 transa, 2 m, 3 n, 6 lda, 8 incx, 11 incy); if info is non-zero the
// routine name is formatted from dt_str/op_str, passed through
// bli_string_mkupper, reported via xerbla, and the ENCLOSING function returns.
// All of transa, m, n, lda, incx, incy are dereferenced as pointers.
// The block declares locals (info, nota, ta, conja, func_str) in the caller.
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
    f77_int info = 0; \
    f77_int nota, ta, conja; \
\
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
    if ( !nota && !ta && !conja ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *n < 0 ) \
        info = 3; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 6; \
    else if ( *incx == 0 ) \
        info = 8; \
    else if ( *incy == 0 ) \
        info = 11; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemv_check.h
// begin bla_ger_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for ger. Same structure as bla_gemv_check, but the routine
// name is built from three pieces (dt_str, op_str, conj_str) and the info
// codes are 1 m, 2 n, 5 incx, 7 incy, 9 lda.
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
    f77_int info = 0; \
\
    if ( *m < 0 ) \
        info = 1; \
    else if ( *n < 0 ) \
        info = 2; \
    else if ( *incx == 0 ) \
        info = 5; \
    else if ( *incy == 0 ) \
        info = 7; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 9; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
\
        sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_ger_check.h
// begin bla_hemv_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for hemv (also used for symv via the alias further down).
// uploa must compare equal to "L" or "U" under lsame. Info codes:
// 1 uploa, 2 m, 5 lda, 7 incx, 10 incy. On failure calls xerbla and returns
// from the enclosing function.
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 5; \
    else if ( *incx == 0 ) \
        info = 7; \
    else if ( *incy == 0 ) \
        info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_hemv_check.h
// begin bla_her_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for her (also used for syr via the alias further down).
// Info codes: 1 uploa, 2 m, 5 incx, 7 lda.
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *incx == 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 7; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her_check.h
// begin bla_her2_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for her2 (also used for syr2 via the alias further down).
// Info codes: 1 uploa, 2 m, 5 incx, 7 incy, 9 lda.
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *incx == 0 ) \
        info = 5; \
    else if ( *incy == 0 ) \
        info = 7; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 9; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her2_check.h
// begin bla_symv_check.h


#ifdef BLIS_ENABLE_BLAS

// symv reuses the hemv argument check unchanged.
#define bla_symv_check bla_hemv_check

#endif
// end bla_symv_check.h
// begin bla_syr_check.h


#ifdef BLIS_ENABLE_BLAS

// syr reuses the her argument check unchanged.
#define bla_syr_check bla_her_check

#endif
// end bla_syr_check.h
// begin bla_syr2_check.h


#ifdef BLIS_ENABLE_BLAS

// syr2 reuses the her2 argument check unchanged.
#define bla_syr2_check bla_her2_check

#endif
// end bla_syr2_check.h
// begin bla_trmv_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for trmv (also used for trsv via the alias below).
// uploa must match "L"/"U", transa "N"/"T"/"C", diaga "U"/"N" under lsame.
// Info codes: 1 uploa, 2 transa, 3 diaga, 4 m, 6 lda, 8 incx. On failure
// calls xerbla and returns from the enclosing function.
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
    f77_int nota, ta, conja; \
    f77_int unita, nonua; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    unita = PASTEF770(lsame)( diaga, "U", (ftnlen)1, (ftnlen)1 ); \
    nonua = PASTEF770(lsame)( diaga, "N", (ftnlen)1, (ftnlen)1 ); \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !ta && !conja ) \
        info = 2; \
    else if ( !unita && !nonua ) \
        info = 3; \
    else if ( *m < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 6; \
    else if ( *incx == 0 ) \
        info = 8; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_trmv_check.h
// begin bla_trsv_check.h


#ifdef BLIS_ENABLE_BLAS

// trsv reuses the trmv argument check unchanged.
#define bla_trsv_check bla_trmv_check

#endif
// end bla_trsv_check.h

// packed
// The packed and banded routines below are declared explicitly per datatype
// prefix (rather than through a GENTPROT template) and return int.
// begin bla_hpmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *ap, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hpmv.h
// begin bla_hpr.h


#ifdef BLIS_ENABLE_BLAS

// For hpr, alpha is bla_real / bla_double while x and ap are complex types.
BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_scomplex *x, const bla_integer *incx, bla_scomplex *ap);

BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap);

#endif
// end bla_hpr.h
// begin bla_hpr2.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *y, const bla_integer *incy, bla_scomplex *ap);

BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap);

#endif
// end bla_hpr2.h
// begin bla_spmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *ap, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_spmv.h
// begin bla_spr.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, bla_double *ap);

BLIS_EXPORT_BLAS int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap);

#endif
// end bla_spr.h
// begin bla_spr2.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, const bla_double *y, const bla_integer *incy, bla_double *ap);

BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, const bla_real *y, const bla_integer *incy, bla_real *ap);

#endif
// end bla_spr2.h
// begin bla_tpmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpmv.h
// begin bla_tpsv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpsv.h

// banded
// begin bla_gbmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer * incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex * y, const bla_integer *incy);

#endif
// end bla_gbmv.h
// begin bla_hbmv.h
// Banded level-2 prototypes, declared explicitly per datatype prefix.
// Each returns int and takes every argument by pointer.
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hbmv.h
// begin bla_sbmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_sbmv.h
// begin bla_tbmv.h


#ifdef BLIS_ENABLE_BLAS

// tbmv: x is the only non-const pointer.
BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbmv.h
// begin bla_tbsv.h


#ifdef BLIS_ENABLE_BLAS

// tbsv: same argument list as tbmv.
BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbsv.h


// -- Level-3 BLAS prototypes --

// begin bla_gemm.h


//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemm ) #endif // end bla_gemm.h // begin bla_hemm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemm ) #endif // end bla_hemm.h // begin bla_herk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype_r* alpha, \ const ftype* a, const f77_int* lda, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( herk ) #endif // end bla_herk.h // begin bla_her2k.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2k ) #endif // end bla_her2k.h // begin bla_symm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( symm ) #endif // end bla_symm.h // begin bla_syrk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syrk ) #endif // end bla_syrk.h // begin bla_syr2k.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syr2k ) #endif // end bla_syr2k.h // begin bla_trmm.h // // Prototype BLAS-to-BLIS interfaces. 
//

// Template for the trmm prototype; the name is PASTEF77(ch,blasname).
// b is the only non-const pointer.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int* m, \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       ftype* b, const f77_int* ldb \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmm )
#endif
// end bla_trmm.h
// begin bla_trsm.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// Template for the trsm prototype; same argument list as trmm.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int* m, \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       ftype* b, const f77_int* ldb \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsm )
#endif
// end bla_trsm.h

// begin bla_gemm_check.h


#ifdef BLIS_ENABLE_BLAS

// Common contract of every bla_*_check macro in this section: each expands to
// a brace block that declares its own locals in the caller, dereferences the
// integer arguments as pointers, picks the info code of the FIRST failing
// test, and -- if info is non-zero -- formats the routine name from
// dt_str/op_str, passes it through bli_string_mkupper, reports it through
// xerbla and executes `return;` in the ENCLOSING function (which therefore
// must return void).

// Argument check for gemm. nrowa is *m when transa is "N", else *k; nrowb is
// *k when transb is "N", else *n. Info codes: 1 transa, 2 transb, 3 m, 4 n,
// 5 k, 8 lda, 10 ldb, 13 ldc.
#define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, notb; \
    f77_int conja, conjb; \
    f77_int ta, tb; \
    f77_int nrowa, nrowb; \
\
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else { nrowb = *n; } \
\
    if ( !nota && !conja && !ta ) \
        info = 1; \
    else if ( !notb && !conjb && !tb ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *n < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemm_check.h
// begin bla_hemm_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for hemm (also used for symm via the alias below).
// nrowa is *m when sidea is "L", else *n. Info codes: 1 sidea, 2 uploa,
// 3 m, 4 n, 7 lda, 9 ldb, 12 ldc.
#define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int left, right; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
    right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( left ) { nrowa = *m; } \
    else { nrowa = *n; } \
\
    if ( !left && !right ) \
        info = 1; \
    else if ( !lower && !upper ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *n < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldb < bli_max( 1, *m ) ) \
        info = 9; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_hemm_check.h
// begin bla_herk_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for herk. Only "N" and "C" are accepted for transa.
// nrowa is *m when transa is "N", else *k. Info codes: 1 uplo, 2 transa,
// 3 m, 4 k, 7 lda, 10 ldc.
// The uplo tests read the macro's own `uploa` parameter. Previously the body
// referenced `uploc`, which is not a parameter: the argument was silently
// ignored and the macro compiled only when the caller happened to have a
// variable with that exact name in scope.
#define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, conja; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else { nrowa = *k; } \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_herk_check.h
// begin bla_her2k_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for her2k. Only "N" and "C" are accepted for trans.
// nrowa (used for both lda and ldb) is *m when trans is "N", else *k.
// Info codes: 1 uploa, 2 trans, 3 m, 4 k, 7 lda, 9 ldb, 12 ldc.
#define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, conja; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else { nrowa = *k; } \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldb < bli_max( 1, nrowa ) ) \
        info = 9; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her2k_check.h
// begin bla_symm_check.h


#ifdef BLIS_ENABLE_BLAS

// symm reuses the hemm argument check unchanged.
#define bla_symm_check bla_hemm_check

#endif
// end bla_symm_check.h
// begin bla_syrk_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for syrk. is_r is true when dt_str starts with 's' or 'd';
// in that case transa may be "N", "T" or "C", otherwise only "N" or "T".
// dt_str initialises a static char*, so it must be a constant expression
// (e.g. a string literal). Info codes: 1 uplo, 2 transa, 3 m, 4 k, 7 lda,
// 10 ldc.
// As in bla_herk_check, the uplo tests read the `uploa` parameter; the
// earlier body referenced the non-parameter name `uploc`.
#define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    f77_int info = 0; \
    f77_int is_r; \
    f77_int nota, ta, cta; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    static char* dt_cst = dt_str; \
\
    is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    cta = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else { nrowa = *k; } \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !ta && (is_r ? !cta : 1) ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_syrk_check.h
// begin bla_syr2k_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for syr2k. Same trans rule as bla_syrk_check ("C" accepted
// only when dt_str starts with 's' or 'd'). nrowa is used for both lda and
// ldb. Info codes: 1 uploa, 2 trans, 3 m, 4 k, 7 lda, 9 ldb, 12 ldc.
#define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int is_r; \
    f77_int nota, ta, cta; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    static char* dt_cst = dt_str; \
\
    is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \
    cta = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else { nrowa = *k; } \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !ta && (is_r ? !cta : 1) ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldb < bli_max( 1, nrowa ) ) \
        info = 9; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_syr2k_check.h
// begin bla_trmm_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for trmm (also used for trsm via the alias below).
// nrowa is *m when sidea is "L", else *n. Info codes: 1 sidea, 2 uploa,
// 3 transa, 4 diaga, 5 m, 6 n, 9 lda, 11 ldb.
#define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \
{ \
    f77_int info = 0; \
    f77_int left, right; \
    f77_int lower, upper; \
    f77_int nota, ta, conja; \
    f77_int unita, nonua; \
    f77_int nrowa; \
\
    left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
    right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    unita = PASTEF770(lsame)( diaga, "U", (ftnlen)1, (ftnlen)1 ); \
    nonua = PASTEF770(lsame)( diaga, "N", (ftnlen)1, (ftnlen)1 ); \
\
    if ( left ) { nrowa = *m; } \
    else { nrowa = *n; } \
\
    if ( !left && !right ) \
        info = 1; \
    else if ( !lower && !upper ) \
        info = 2; \
    else if ( !nota && !ta && !conja ) \
        info = 3; \
    else if ( !unita && !nonua ) \
        info = 4; \
    else if ( *m < 0 ) \
        info = 5; \
    else if ( *n < 0 ) \
        info = 6; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 9; \
    else if ( *ldb < bli_max( 1, *m ) ) \
        info = 11; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_trmm_check.h
// begin bla_trsm_check.h


#ifdef BLIS_ENABLE_BLAS

// trsm reuses the trmm argument check unchanged.
#define bla_trsm_check bla_trmm_check

#endif
// end bla_trsm_check.h

// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// Template for the axpby prototype; y is the only non-const pointer.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       const ftype* beta, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif

// end bla_axpby.h

// level-3
// begin bla_gemmt.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// Template for the gemmt prototype. Compared with gemm it takes a leading
// uploc argument and has no n dimension.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int* m, \
       const f77_int* k, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* b, const f77_int* ldb, \
       const ftype* beta, \
       ftype* c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif

// end bla_gemmt.h
// begin bla_gemmt_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for gemmt. nrowa is *m when transa is "N", else *k; nrowb
// is *k when transb is "N", else *m. Info codes: 1 uploc, 2 transa,
// 3 transb, 4 m, 5 k, 8 lda, 10 ldb, 13 ldc.
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, notb; \
    f77_int conja, conjb; \
    f77_int ta, tb; \
    f77_int lower, upper; \
    f77_int nrowa, nrowb; \
\
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else { nrowb = *m; } \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja && !ta ) \
        info = 2; \
    else if ( !notb && !conjb && !tb ) \
        info = 3; \
    else if ( *m < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// Template for the gemm_batch prototype. The matrix operands are arrays of
// pointers (ftype**); the remaining arguments are passed by pointer, followed
// by group_count and group_size.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa_array, \
       const f77_char* transb_array, \
       const f77_int* m_array, \
       const f77_int* n_array, \
       const f77_int* k_array, \
       const ftype* alpha_array, \
       const ftype** a_array, const f77_int* lda_array, \
       const ftype** b_array, const f77_int* ldb_array, \
       const ftype* beta_array, \
       ftype** c_array, const f77_int* ldc_array, \
       const f77_int* group_count, \
       const f77_int* group_size \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif

// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h


//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( gemm3m ) #endif // end bla_gemm3m.h // begin bla_gemm3m_check.h #ifdef BLIS_ENABLE_BLAS #define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, notb; \ f77_int conja, conjb; \ f77_int ta, tb; \ f77_int nrowa, nrowb; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ if ( notb ) { nrowb = *k; } \ else { nrowb = *n; } \ \ if ( !nota && !conja && !ta ) \ info = 1; \ else if ( !notb && !conjb && !tb ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *k < 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 8; \ else if ( *ldb < bli_max( 1, nrowb ) ) \ info = 10; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 13; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_gemm3m_check.h // -- Fortran-compatible APIs to BLIS functions -- // begin b77_thread.h // // Prototype Fortran-compatible BLIS interfaces. 
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); // end b77_thread.h #endif // BLIS_ENABLE_BLAS // end bli_blas.h // -- CBLAS compatibility layer -- // begin bli_cblas.h #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. // begin cblas.h #ifndef CBLAS_H #define CBLAS_H #include // skipped // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. 
// Exactly one of BLIS_ARCH_64 / BLIS_ARCH_32 is defined; anything not
// recognized by the tests below is treated as 32-bit.
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
    defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64)
  #define BLIS_ARCH_64
#else
  #define BLIS_ARCH_32
#endif

// Determine the target operating system.
// The first matching test wins, so the order of the #elif chain matters
// (e.g. __ANDROID__ is tested before __linux__). Most BLIS_OS_* macros are
// defined to 1 so they can be used in plain '#if' tests further down;
// BLIS_OS_EMSCRIPTEN and BLIS_OS_NONE are defined without a value and can
// only be tested with #ifdef/defined().
#if defined(BLIS_ENABLE_SYSTEM)
  #if defined(_WIN32) || defined(__CYGWIN__)
    #define BLIS_OS_WINDOWS 1
  #elif defined(__gnu_hurd__)
    #define BLIS_OS_GNU 1
  #elif defined(__APPLE__) || defined(__MACH__)
    #define BLIS_OS_OSX 1
  #elif defined(__ANDROID__)
    #define BLIS_OS_ANDROID 1
  #elif defined(__linux__)
    #define BLIS_OS_LINUX 1
  #elif defined(__bgq__)
    #define BLIS_OS_BGQ 1
  #elif defined(__bg__)
    #define BLIS_OS_BGP 1
  #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
        defined(__bsdi__) || defined(__DragonFly__) || \
        defined(__FreeBSD_kernel__) || defined(__HAIKU__)
    #define BLIS_OS_BSD 1
  #elif defined(EMSCRIPTEN)
    #define BLIS_OS_EMSCRIPTEN
  #else
    #error "Cannot determine operating system"
  #endif
#else // #if defined(BLIS_DISABLE_SYSTEM)
  #define BLIS_OS_NONE
#endif

// A few changes that may be necessary in Windows environments.
#if BLIS_OS_WINDOWS

  // Include Windows header file.
  #define WIN32_LEAN_AND_MEAN
  #define VC_EXTRALEAN
#include // skipped

  #if !defined(__clang__) && !defined(__GNUC__)
    // Undefine attribute specifiers in Windows.
    // (Done by defining the keyword as a macro that expands to nothing.)
    #define __attribute__(x)

    // Undefine restrict.
    #define restrict
  #endif

#endif

// time.h provides clock_gettime().
// A different header is included for Windows, for macOS, and for everything
// else.
#if BLIS_OS_WINDOWS
#include // skipped
#elif BLIS_OS_OSX
#include // skipped
#else
  //#include
#include // skipped
#endif


#endif
// end bli_system.h

// begin bli_config.h


#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H

// The literal conditions in this header ('#if 1', '#if 0', '#if 64 == 64')
// appear to be values substituted into a template when the library was
// configured; each one simply selects which macro of an ENABLE/DISABLE pair
// is defined.

// Enabled configuration "family" (config_name)
#define BLIS_FAMILY_X86_64_NO_ZEN3


// Enabled sub-configurations (config_list)
#define BLIS_CONFIG_SKX
#define BLIS_CONFIG_KNL
#define BLIS_CONFIG_HASWELL
#define BLIS_CONFIG_SANDYBRIDGE
#define BLIS_CONFIG_PENRYN
#define BLIS_CONFIG_ZEN
#define BLIS_CONFIG_ZEN2
#define BLIS_CONFIG_EXCAVATOR
#define BLIS_CONFIG_STEAMROLLER
#define BLIS_CONFIG_PILEDRIVER
#define BLIS_CONFIG_BULLDOZER
#define BLIS_CONFIG_GENERIC


// Enabled kernel sets (kernel_list)
#define BLIS_KERNELS_SKX
#define BLIS_KERNELS_KNL
#define BLIS_KERNELS_SANDYBRIDGE
#define BLIS_KERNELS_PENRYN
#define BLIS_KERNELS_ZEN2
#define BLIS_KERNELS_HASWELL
#define BLIS_KERNELS_ZEN
#define BLIS_KERNELS_PILEDRIVER
#define BLIS_KERNELS_BULLDOZER
#define BLIS_KERNELS_GENERIC


#if 1
#define BLIS_ENABLE_SYSTEM
#else
#define BLIS_DISABLE_SYSTEM
#endif

// Neither threading model is enabled in this configuration.
#if 0
#define BLIS_ENABLE_OPENMP
#endif

#if 0
#define BLIS_ENABLE_PTHREADS
#endif

#if 1
#define BLIS_ENABLE_JRIR_SLAB
#endif

#if 0
#define BLIS_ENABLE_JRIR_RR
#endif

#if 1
#define BLIS_ENABLE_PBA_POOLS
#else
#define BLIS_DISABLE_PBA_POOLS
#endif

#if 1
#define BLIS_ENABLE_SBA_POOLS
#else
#define BLIS_DISABLE_SBA_POOLS
#endif

#if 0
#define BLIS_ENABLE_MEM_TRACING
#else
#define BLIS_DISABLE_MEM_TRACING
#endif

// Internal BLIS integer size: evaluates to 64 here.
#if 64 == 64
#define BLIS_INT_TYPE_SIZE 64
#elif 64 == 32
#define BLIS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// BLAS-layer integer size: evaluates to 32 here.
#if 32 == 64
#define BLIS_BLAS_INT_TYPE_SIZE 64
#elif 32 == 32
#define BLIS_BLAS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// The next four settings are only applied when the includer has not
// already defined either the ENABLE or the DISABLE macro, so they can be
// overridden from the compiler command line.
#ifndef BLIS_ENABLE_BLAS
#ifndef BLIS_DISABLE_BLAS
#if 0
#define BLIS_ENABLE_BLAS
#else
#define BLIS_DISABLE_BLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_CBLAS
#ifndef BLIS_DISABLE_CBLAS
#if 0
#define BLIS_ENABLE_CBLAS
#else
#define BLIS_DISABLE_CBLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_MIXED_DT
#ifndef BLIS_DISABLE_MIXED_DT
#if 1
#define BLIS_ENABLE_MIXED_DT
#else
#define BLIS_DISABLE_MIXED_DT
#endif
#endif
#endif

#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#if 1
#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#else
#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#endif
#endif
#endif

#if 1
#define BLIS_ENABLE_SUP_HANDLING
#else
#define BLIS_DISABLE_SUP_HANDLING
#endif

#if 0
#define BLIS_ENABLE_MEMKIND
#else
#define BLIS_DISABLE_MEMKIND
#endif

#if 1
#define BLIS_ENABLE_TRSM_PREINVERSION
#else
#define BLIS_DISABLE_TRSM_PREINVERSION
#endif

#if 1
#define BLIS_ENABLE_PRAGMA_OMP_SIMD
#else
#define BLIS_DISABLE_PRAGMA_OMP_SIMD
#endif

#if 0
#define BLIS_ENABLE_SANDBOX
#else
#define BLIS_DISABLE_SANDBOX
#endif

#if 0
#define BLIS_ENABLE_SHARED
#else
#define BLIS_DISABLE_SHARED
#endif

#if 0
#define BLIS_ENABLE_COMPLEX_RETURN_INTEL
#else
#define BLIS_DISABLE_COMPLEX_RETURN_INTEL
#endif


#endif
// end bli_config.h
// begin bli_config_macro_defs.h


#ifndef BLIS_CONFIG_MACRO_DEFS_H
#define BLIS_CONFIG_MACRO_DEFS_H


// -- INTEGER PROPERTIES -------------------------------------------------------

// The bit size of the integer type used to track values such as dimensions,
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
// integers while 64 results in 64-bit integers. Any other value results in use
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
// This fallback only applies when BLIS_INT_TYPE_SIZE was not already set
// (for example by bli_config.h); it then follows the detected word size.
#ifndef BLIS_INT_TYPE_SIZE
  #ifdef BLIS_ARCH_64
    #define BLIS_INT_TYPE_SIZE 64
  #else
    #define BLIS_INT_TYPE_SIZE 32
  #endif
#endif


// -- FLOATING-POINT PROPERTIES ------------------------------------------------

// Enable use of built-in C99 "float complex" and "double complex" types and
// associated overloaded operations and functions? Disabling results in
// scomplex and dcomplex being defined in terms of simple structs.
// NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. #endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. 
// This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined.
#ifndef BLIS_NT_MAX_PRIME
  #define BLIS_NT_MAX_PRIME 11
#endif


// -- MIXED DATATYPE SUPPORT ---------------------------------------------------

// Enable mixed datatype support?
// (The user-facing switch is BLIS_DISABLE_MIXED_DT; the macro actually
// defined for the rest of the code is BLIS_ENABLE_GEMM_MD.)
#ifdef BLIS_DISABLE_MIXED_DT
  #undef BLIS_ENABLE_GEMM_MD
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD
#endif

// Enable memory-intensive optimizations for mixed datatype support?
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
  #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#endif


// -- MISCELLANEOUS OPTIONS ----------------------------------------------------

// Do NOT require the cross-blocksize constraints. That is, do not enforce
// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY
// needed when implementing trsm_r by allowing the right-hand matrix B to
// be triangular.
#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
  #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#endif


// -- BLAS COMPATIBILITY LAYER -------------------------------------------------

// Enable the BLAS compatibility layer?
// BLIS_DISABLE_BLAS takes precedence: when it is defined, BLIS_ENABLE_BLAS
// is removed even if the user also defined it.
#ifdef BLIS_DISABLE_BLAS
  #undef BLIS_ENABLE_BLAS
#else
  // Default behavior is enabled.
  #undef  BLIS_ENABLE_BLAS // In case user explicitly enabled.
  #define BLIS_ENABLE_BLAS
#endif

// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#ifndef BLIS_BLAS_INT_TYPE_SIZE
  #define BLIS_BLAS_INT_TYPE_SIZE 32
#endif

// By default, the level-3 BLAS routines are implemented by directly calling
// the BLIS object API. Alternatively, they may first call the typed BLIS
// API, which will then call the object API.
//#define BLIS_BLAS3_CALLS_TAPI
#ifdef BLIS_BLAS3_CALLS_TAPI
  #undef  BLIS_BLAS3_CALLS_OAPI
#else
  // Default behavior is to call object API directly.
  #undef  BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled.
  #define BLIS_BLAS3_CALLS_OAPI
#endif


// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------

// Enable the CBLAS compatibility layer?
// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer
// regardless of whether or not it was explicitly enabled above. Furthermore,
// the CBLAS compatibility layer will use the integer type size definition
// specified above when defining the size of its own integers (regardless of
// whether the BLAS layer was enabled directly or indirectly).
#ifdef BLIS_ENABLE_CBLAS
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif


// -- SHARED LIBRARY SYMBOL EXPORT ---------------------------------------------

// When building shared libraries, we can control which symbols are exported for
// linking by external applications. BLIS annotates all function prototypes that
// are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing
// a similar role for BLAS compatibility routines). Which symbols are exported
// is controlled by the default symbol visibility, as specified by the gcc option
// -fvisibility=[default|hidden]. The default for this option is 'default', or,
// "public", which, if allowed to stand, causes all symbols in BLIS to be
// linkable from the outside. But when compiling with -fvisibility=hidden, all
// symbols start out hidden (that is, restricted only for internal use by BLIS),
// with that setting overridden only for function prototypes or variable
// declarations that are annotated with BLIS_EXPORT_BLIS.
// BLIS_EXPORT may be predefined by the includer. Otherwise it expands to
// nothing for non-shared builds, to dllexport/dllimport on Windows and
// Cygwin (dllexport only while BLIS_IS_BUILDING_LIBRARY is defined), and to
// the "default" visibility attribute for GNU-compatible compilers (version
// 4 or later).
#ifndef BLIS_EXPORT
  #if !defined(BLIS_ENABLE_SHARED)
    #define BLIS_EXPORT
  #else
    #if defined(_WIN32) || defined(__CYGWIN__)
      #ifdef BLIS_IS_BUILDING_LIBRARY
        #define BLIS_EXPORT __declspec(dllexport)
      #else
        #define BLIS_EXPORT __declspec(dllimport)
      #endif
    #elif defined(__GNUC__) && __GNUC__ >= 4
      #define BLIS_EXPORT __attribute__ ((visibility ("default")))
    #else
      #define BLIS_EXPORT
    #endif
  #endif
#endif

// All three API families currently share the same export annotation.
#define BLIS_EXPORT_BLIS  BLIS_EXPORT
#define BLIS_EXPORT_BLAS  BLIS_EXPORT
#define BLIS_EXPORT_ADDON BLIS_EXPORT


// -- OVERRIDABLE (WEAK) SYMBOLS -----------------------------------------------

// On Linux, functions called from a shared library can be overridden by the main
// program simply by providing a new definition. However, macOS uses a "two-level
// namespace" which causes calls to shared library functions to be tied to the
// library and not overridable. As a workaround, certain symbols can be defined
// as "weak" and are given lower preference during linking.
#ifndef BLIS_OVERRIDABLE
#if BLIS_OS_OSX
#define BLIS_OVERRIDABLE __attribute__((weak))
#else
#define BLIS_OVERRIDABLE
#endif
#endif


// -- STATIC INLINE FUNCTIONS --------------------------------------------------

// C and C++ have different semantics for defining "inline" functions. In C,
// the keyword phrase "static inline" accomplishes this, though the "inline"
// is optional. In C++, the "inline" keyword is required and obviates "static"
// altogether. Why does this matter? While BLIS is compiled in C99, blis.h may
// be #included by a source file that is compiled with C++.
#ifdef __cplusplus
  #define BLIS_INLINE inline
#else
  //#define BLIS_INLINE static inline
  #define BLIS_INLINE static
#endif


#endif

// end bli_config_macro_defs.h
// begin bli_type_defs.h


#ifndef BLIS_TYPE_DEFS_H
#define BLIS_TYPE_DEFS_H


//
// -- BLIS basic types ---------------------------------------------------------
//

#ifdef __cplusplus
  // For C++, include stdint.h.
#include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. // NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. #ifndef TRUE #define TRUE true #endif #ifndef FALSE #define FALSE false #endif // -- Special-purpose integers -- // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. 
// (dim_t is skipped when _DEFINED_DIM_T has already been defined by the
// includer.)
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
// (These literals are not checked against the real sizeof() values here.)
#define BLIS_SIZEOF_S      4  // sizeof(float)
#define BLIS_SIZEOF_D      8  // sizeof(double)
#define BLIS_SIZEOF_C      8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z      16 // sizeof(dcomplex)

// -- Complex types --

// scomplex/dcomplex are either the C99 complex types (when
// BLIS_ENABLE_C99_COMPLEX is defined) or plain structs with 'real' and
// 'imag' members (the default).
#ifdef BLIS_ENABLE_C99_COMPLEX

  #if __STDC_VERSION__ >= 199901L
#include // skipped

    // Typedef official complex types to BLIS complex type names.
    typedef  float complex scomplex;
    typedef double complex dcomplex;
  #else
    #error "Configuration requested C99 complex types, but C99 does not appear to be supported."
  #endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_SCOMPLEX
  #define _DEFINED_SCOMPLEX
  typedef struct scomplex
  {
    float  real;
    float  imag;
  } scomplex;
  #endif

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_DCOMPLEX
  #define _DEFINED_DCOMPLEX
  typedef struct dcomplex
  {
    double real;
    double imag;
  } dcomplex;
  #endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// (Selected by BLIS_BLAS_INT_TYPE_SIZE; any value other than 32 or 64
// yields 'long int'.)
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
// (Despite the name, void_fp is currently an object pointer type, void*;
// the true function-pointer definition is commented out.)

//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );


//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT is the position of the least-significant bit of a field
// within the 'info' word. Several names deliberately share a position: a
// multi-bit field and its lowest sub-field start at the same bit (e.g.
// DATATYPE/DOMAIN at 0, CONJTRANS/TRANS at 3, UPLO/UPPER at 5).

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT                0
#define   BLIS_SCALAR_DOMAIN_SHIFT          0
#define   BLIS_SCALAR_PREC_SHIFT            1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is a run of set bits positioned with the matching *_SHIFT
// above; *_BITS names cover multi-bit fields and *_BIT names single bits.
// NOTE(review): the literals are signed ints, and BLIS_COMP_DT_BITS
// (0x7 << 29) does not fit in a 32-bit signed int -- confirm whether an
// unsigned literal would be more appropriate there.

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )


//
// -- BLIS enumerated type value definitions -----------------------------------
//

// The BLIS_BITVAL_* constants are field values already shifted into place,
// so the enum constants defined from them below can be OR-ed directly into
// an info word.

#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )


//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

typedef enum
{
	BLIS_NO_TRANSPOSE      = 0x0,
	BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
	BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
	BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

typedef enum
{
	BLIS_NO_CONJUGATE      = 0x0,
	BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

typedef enum
{
	BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
	BLIS_LOWER             = BLIS_BITVAL_LOWER,
	BLIS_UPPER             = BLIS_BITVAL_UPPER,
	BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

// Unlike the enums around it, side_t is not built from info-word bit
// values; its constants are simply 0 and 1.
typedef enum
{
	BLIS_LEFT              = 0x0,
	BLIS_RIGHT
} side_t;

typedef enum
{
	BLIS_NONUNIT_DIAG      = 0x0,
	BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

typedef enum
{
	BLIS_NO_INVERT_DIAG    = 0x0,
	BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

typedef enum
{
	BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
	BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
	BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
	BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;


// -- Data type --

// BLIS_DT_LO and BLIS_DT_HI alias the first and last floating-point
// datatypes (BLIS_FLOAT and BLIS_DCOMPLEX); BLIS_INT and BLIS_CONSTANT lie
// outside that range.
typedef enum
{
	BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
	BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
	BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
	BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
	BLIS_INT               = BLIS_BITVAL_INT_TYPE,
	BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
	BLIS_DT_LO             = BLIS_FLOAT,
	BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

typedef enum
{
	BLIS_REAL              = BLIS_BITVAL_REAL,
	BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

typedef enum
{
	BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
	BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;


// -- Pack schema type --

// BLIS_PACKED_UNSPEC, BLIS_PACKED_VECTOR and BLIS_PACKED_ROWS all have the
// same value (BLIS_PACK_BIT alone).
typedef enum
{
	BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
	BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
	BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
	BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
	BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
	BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
	BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
	BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
	BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3


// -- Pack order type --

// Both FWD constants are 0, so they compare equal to each other.
typedef enum
{
	BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
	BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,

	BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
	BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;


// -- Pack buffer type --

typedef enum
{
	BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
	BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
	BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
	BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;


// -- Partitioning direction --

typedef enum
{
	BLIS_FWD,
	BLIS_BWD
} dir_t;


// -- Subpartition type --

typedef enum
{
	BLIS_SUBPART0,
	BLIS_SUBPART1,
	BLIS_SUBPART2,
	BLIS_SUBPART1AND0,
	BLIS_SUBPART1AND2,
	BLIS_SUBPART1A,
	BLIS_SUBPART1B,
	BLIS_SUBPART00,
	BLIS_SUBPART10,
	BLIS_SUBPART20,
	BLIS_SUBPART01,
	BLIS_SUBPART11,
	BLIS_SUBPART21,
	BLIS_SUBPART02,
	BLIS_SUBPART12,
	BLIS_SUBPART22
} subpart_t;


// -- Matrix dimension type --

typedef enum
{
	BLIS_M = 0,
	BLIS_N = 1
} mdim_t;


// -- Machine parameter types --

// The constants are consecutive starting at 0; BLIS_NUM_MACH_PARAMS below
// must be kept equal to the number of entries.
typedef enum
{
	BLIS_MACH_EPS = 0,
	BLIS_MACH_SFMIN,
	BLIS_MACH_BASE,
	BLIS_MACH_PREC,
	BLIS_MACH_NDIGMANT,
	BLIS_MACH_RND,
	BLIS_MACH_EMIN,
	BLIS_MACH_RMIN,
	BLIS_MACH_EMAX,
	BLIS_MACH_RMAX,
	BLIS_MACH_EPS2
} machval_t;

#define BLIS_NUM_MACH_PARAMS   11
#define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2


// -- Induced method types --

// BLIS_IND_FIRST aliases BLIS_1M (0) and BLIS_IND_LAST aliases BLIS_NAT.
typedef enum
{
	BLIS_1M        = 0,
	BLIS_NAT,
	BLIS_IND_FIRST = 0,
	BLIS_IND_LAST  = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, 
BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // 
- keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. } bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. 
// Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! 
// -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: 
Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. 
//num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. typedef struct constdata_s { float s; double d; scomplex c; dcomplex z; gint_t i; } constdata_t; // // -- BLIS object type definitions --------------------------------------------- // // Forward declarations for function pointer types struct obj_s; struct cntx_s; struct rntm_s; struct thrinfo_s; typedef void (*obj_pack_fn_t) ( struct obj_s* a, struct obj_s* ap, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) ( struct obj_s* a, struct obj_s* b, struct obj_s* c, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef struct obj_s { // Basic fields struct obj_s* root; dim_t off[2]; dim_t dim[2]; doff_t diag_off; objbits_t info; objbits_t info2; siz_t elem_size; void* buffer; inc_t rs; inc_t cs; inc_t is; // Bufferless scalar storage atom_t scalar; // Pack-related fields dim_t m_padded; // m dimension of matrix, including any padding dim_t n_padded; // n dimension of matrix, including any padding inc_t ps; // panel stride (distance to next panel) inc_t pd; // panel dimension (the "width" of a panel: // usually MR or NR) dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel // User-customizable fields obj_pack_fn_t pack_fn; void* pack_params; obj_ker_fn_t ker_fn; void* ker_params; } obj_t; // Pre-initializors. 
Things that must be set afterwards: // - root object pointer // - info bitfields: dt, target_dt, exec_dt, comp_dt // - info2 bitfields: scalar_dt // - elem_size // - dims, strides // - buffer // - internal scalar buffer (must always set imaginary component) #define BLIS_OBJECT_INITIALIZER \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 0, 0 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( float ), \ \ .buffer = NULL, \ .rs = 0, \ .cs = 0, \ .is = 1, \ \ .scalar = { 0.0, 0.0 }, \ \ .m_padded = 0, \ .n_padded = 0, \ .ps = 0, \ .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ \ .pack_fn = NULL, \ .pack_params = NULL, \ .ker_fn = NULL, \ .ker_params = NULL \ } #define BLIS_OBJECT_INITIALIZER_1X1 \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( float ), \ \ .buffer = NULL, \ .rs = 0, \ .cs = 0, \ .is = 1, \ \ .scalar = { 0.0, 0.0 }, \ \ .m_padded = 0, \ .n_padded = 0, \ .ps = 0, \ .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ \ .pack_fn = NULL, \ .pack_params = NULL, \ .ker_fn = NULL, \ .ker_params = NULL \ } // Define these macros here since they must be updated if contents of // obj_t changes. 
// Make b a complete shallow copy of a.
//
// Every member of obj_t (root, off, dim, diag_off, info, info2, elem_size,
// buffer, rs, cs, is, scalar, the pack-related fields and the four
// user-customizable fields) is duplicated, so a whole-struct assignment is
// equivalent to assigning the members one at a time. The copy is shallow:
// pointer members (root, buffer, pack_params, ker_params) end up referring to
// the same memory as in a.
//
// NOTE: if a member that must NOT be copied is ever added to obj_t, this
// function has to go back to copying members individually.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
	*b = *a;
}

// Initialize b from a in preparation for b becoming a sub-partition of a.
//
// All members are taken from a except dim[0] and dim[1] (m and n), which are
// deliberately left untouched because they will be overwritten afterwards.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
	// User-customizable fields.
	b->ker_params  = a->ker_params;
	b->ker_fn      = a->ker_fn;
	b->pack_params = a->pack_params;
	b->pack_fn     = a->pack_fn;

	// Pack-related fields.
	b->n_panel     = a->n_panel;
	b->m_panel     = a->m_panel;
	b->pd          = a->pd;
	b->ps          = a->ps;
	b->n_padded    = a->n_padded;
	b->m_padded    = a->m_padded;

	// Bufferless scalar storage.
	b->scalar      = a->scalar;

	// Buffer address and strides.
	b->is          = a->is;
	b->cs          = a->cs;
	b->rs          = a->rs;
	b->buffer      = a->buffer;

	// Element size and property bitfields.
	b->elem_size   = a->elem_size;
	b->info2       = a->info2;
	b->info        = a->info;

	// Root object, offsets and diagonal offset. dim[] is skipped on purpose.
	b->diag_off    = a->diag_off;
	b->off[1]      = a->off[1];
	b->off[0]      = a->off[0];
	b->root        = a->root;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/include/linux-arm64/000077500000000000000000000000001427272030600213475ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/linux-arm64/blis.h000066400000000000000000046022771427272030600224730ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). 
// begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_ARM64 // Enabled sub-configurations (config_list) #define BLIS_CONFIG_ARMSVE #define BLIS_CONFIG_FIRESTORM #define BLIS_CONFIG_THUNDERX2 #define BLIS_CONFIG_CORTEXA57 #define BLIS_CONFIG_CORTEXA53 #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_ARMSVE #define BLIS_KERNELS_ARMV8A #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define 
BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. 
#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). #if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_lang_defs.h #ifndef BLIS_LANG_DEFS_H #define BLIS_LANG_DEFS_H // -- Undefine restrict for C++ and C89/90 -- #ifdef __cplusplus // Language is C++; define restrict as nothing. #ifndef restrict #define restrict #endif #elif __STDC_VERSION__ >= 199901L // Language is C99 (or later); do nothing since restrict is recognized. #else // Language is pre-C99; define restrict as nothing. 
#ifndef restrict #define restrict #endif #endif // -- Define typeof() operator if using non-GNU compiler -- #ifndef __GNUC__ #define typeof __typeof__ #else #ifndef typeof #define typeof __typeof__ #endif #endif // -- BLIS Thread Local Storage Keyword -- // __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support __thread, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) #define BLIS_THREAD_LOCAL __thread #else #define BLIS_THREAD_LOCAL #endif // -- BLIS constructor/destructor function attribute -- // __attribute__((constructor/destructor)) is supported by GCC only. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support this, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__ICC) || defined(__INTEL_COMPILER) // ICC defines __GNUC__ but doesn't support this #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #elif defined(__clang__) // CLANG supports __attribute__, but its documentation doesn't // mention support for constructor/destructor. Compiling with // clang and testing shows that it does support. 
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. 
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. 
// Furthermore, the CBLAS compatibility layer sizes its own integers with the
// BLAS integer type size chosen above, no matter whether the BLAS layer was
// enabled directly or only indirectly (through CBLAS).
#ifdef BLIS_ENABLE_CBLAS
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif


// -- SHARED LIBRARY SYMBOL EXPORT ---------------------------------------------

// When BLIS is built as a shared library, the set of symbols that external
// applications may link against can be restricted. Every "public" function
// prototype is annotated with BLIS_EXPORT_BLIS (BLIS_EXPORT_BLAS plays the
// same role for the BLAS compatibility routines). What actually gets exported
// depends on the default symbol visibility, as specified by the gcc option
// -fvisibility=[default|hidden]. The option defaults to 'default' (that is,
// "public"), which leaves every BLIS symbol linkable from the outside. With
// -fvisibility=hidden, all symbols start out hidden (restricted to internal
// use by BLIS) and only the prototypes or variable declarations annotated with
// BLIS_EXPORT_BLIS override that setting.
//
// A caller-supplied BLIS_EXPORT always takes precedence over the selection
// below.
#if !defined(BLIS_EXPORT)
  #if !defined(BLIS_ENABLE_SHARED)
    // Static build: nothing to annotate.
    #define BLIS_EXPORT
  #elif defined(_WIN32) || defined(__CYGWIN__)
    // Windows DLLs: export while building the library, import otherwise.
    #if defined(BLIS_IS_BUILDING_LIBRARY)
      #define BLIS_EXPORT __declspec(dllexport)
    #else
      #define BLIS_EXPORT __declspec(dllimport)
    #endif
  #elif defined(__GNUC__) && __GNUC__ >= 4
    #define BLIS_EXPORT __attribute__ ((visibility ("default")))
  #else
    // Unknown toolchain: fall back to no annotation.
    #define BLIS_EXPORT
  #endif
#endif

// All three public annotation macros currently expand to BLIS_EXPORT.
#define BLIS_EXPORT_BLIS  BLIS_EXPORT
#define BLIS_EXPORT_BLAS  BLIS_EXPORT
#define BLIS_EXPORT_ADDON BLIS_EXPORT


// -- OVERRIDABLE (WEAK) SYMBOLS -----------------------------------------------

// On Linux, functions called from a shared library can be overridden by the
// main program simply by providing a new definition.
However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // -- Common BLIS definitions -- // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// The macro tested here must be BLIS_BLAS_INT_TYPE_SIZE, i.e. the same macro
// that selects the width of f77_int. Testing any other (undefined) name makes
// the condition evaluate to 0 == 64, so the promotion would silently never
// happen.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
  #undef  BLIS_INT_TYPE_SIZE
  #define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested:
// 32 -> int32_t/uint32_t, 64 -> int64_t/uint64_t, anything else -> long int.
#if   BLIS_INT_TYPE_SIZE == 32
  typedef           int32_t  gint_t;
  typedef          uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
  typedef           int64_t  gint_t;
  typedef          uint64_t guint_t;
#else
  typedef   signed long int  gint_t;
  typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h. Pre-existing definitions are left untouched.
#ifndef TRUE
  #define TRUE  true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
  #define _DEFINED_DIM_T
  typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
#define BLIS_SIZEOF_S 4  // sizeof(float)
#define BLIS_SIZEOF_D 8  // sizeof(double)
#define BLIS_SIZEOF_C 8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z 16 // sizeof(dcomplex)

// -- Complex types --

// scomplex/dcomplex are either the built-in C99 complex types (only when
// BLIS_ENABLE_C99_COMPLEX is defined) or plain structs holding a real and an
// imaginary member.
#ifdef BLIS_ENABLE_C99_COMPLEX

  #if __STDC_VERSION__ >= 199901L
    // The 'complex' keyword used below is provided by <complex.h>. The header
    // name is spelled out explicitly: an #include directive without a header
    // name is a hard error once this branch is selected.
    #include <complex.h> // skipped

    // Typedef official complex types to BLIS complex type names.
    typedef  float complex scomplex;
    typedef double complex dcomplex;
  #else
    #error "Configuration requested C99 complex types, but C99 does not appear to be supported."
  #endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_SCOMPLEX
  #define _DEFINED_SCOMPLEX
  typedef struct scomplex
  {
      float real;
      float imag;
  } scomplex;
  #endif

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_DCOMPLEX
  #define _DEFINED_DCOMPLEX
  typedef struct dcomplex
  {
      double real;
      double imag;
  } dcomplex;
  #endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested:
// 32 -> int32_t, 64 -> int64_t, anything else -> long int.
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
  typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
  typedef int64_t   f77_int;
#else
  typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc.
// function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).

//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );


//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT value is the position of the least-significant bit of the
// corresponding field. Several names intentionally share an offset: a
// multi-bit field and the first single-bit flag inside it start at the same
// position.

// info
#define BLIS_DATATYPE_SHIFT           0
#define   BLIS_DOMAIN_SHIFT           0
#define   BLIS_PRECISION_SHIFT        1
#define BLIS_CONJTRANS_SHIFT          3
#define   BLIS_TRANS_SHIFT            3
#define   BLIS_CONJ_SHIFT             4
#define BLIS_UPLO_SHIFT               5
#define   BLIS_UPPER_SHIFT            5
#define   BLIS_DIAG_SHIFT             6
#define   BLIS_LOWER_SHIFT            7
#define BLIS_UNIT_DIAG_SHIFT          8
#define BLIS_INVERT_DIAG_SHIFT        9
#define BLIS_TARGET_DT_SHIFT          10
#define   BLIS_TARGET_DOMAIN_SHIFT    10
#define   BLIS_TARGET_PREC_SHIFT      11
#define BLIS_EXEC_DT_SHIFT            13
#define   BLIS_EXEC_DOMAIN_SHIFT      13
#define   BLIS_EXEC_PREC_SHIFT        14
#define BLIS_PACK_SCHEMA_SHIFT        16
#define   BLIS_PACK_RC_SHIFT          16
#define   BLIS_PACK_PANEL_SHIFT       17
#define   BLIS_PACK_FORMAT_SHIFT      18
#define   BLIS_PACK_SHIFT             22
#define BLIS_PACK_REV_IF_UPPER_SHIFT  23
#define BLIS_PACK_REV_IF_LOWER_SHIFT  24
#define BLIS_PACK_BUFFER_SHIFT        25
#define BLIS_STRUC_SHIFT              27
#define BLIS_COMP_DT_SHIFT            29
#define   BLIS_COMP_DOMAIN_SHIFT      29
#define   BLIS_COMP_PREC_SHIFT        30

// info2
#define BLIS_SCALAR_DT_SHIFT          0
#define   BLIS_SCALAR_DOMAIN_SHIFT    0
#define   BLIS_SCALAR_PREC_SHIFT      1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is a run of set bits moved to its field's offset.

// info
#define BLIS_DATATYPE_BITS   (0x7 << BLIS_DATATYPE_SHIFT)
#define   BLIS_DOMAIN_BIT    (0x1 << BLIS_DOMAIN_SHIFT)
#define   BLIS_PRECISION_BIT (0x1 << BLIS_PRECISION_SHIFT)
#define BLIS_CONJTRANS_BITS  (0x3 << BLIS_CONJTRANS_SHIFT)
#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define 
BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 
0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = 
BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } 
machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. #define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, 
BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, 
BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // - keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
} bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. // Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. 
// begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. 
mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. //num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. 
// One constant stored once per representation: float (s), double (d),
// scomplex (c), dcomplex (z), and gint_t (i).
typedef struct constdata_s
{
    float    s;
    double   d;
    scomplex c;
    dcomplex z;
    gint_t   i;

} constdata_t;

//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the function stored in obj_t.pack_fn (see the
// "User-customizable fields" of obj_t below).
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of the function stored in obj_t.ker_fn (see the
// "User-customizable fields" of obj_t below).
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// The BLIS object type.
// NOTE: The member list is mirrored by BLIS_OBJECT_INITIALIZER,
// BLIS_OBJECT_INITIALIZER_1X1, bli_obj_init_full_shallow_copy_of(), and
// bli_obj_init_subpart_from(); keep them in sync when changing members.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t         off[2];
    dim_t         dim[2];
    doff_t        diag_off;

    objbits_t     info;
    objbits_t     info2;
    siz_t         elem_size;

    void*         buffer;
    inc_t         rs;
    inc_t         cs;
    inc_t         is;

    // Bufferless scalar storage
    atom_t        scalar;

    // Pack-related fields
    dim_t         m_padded; // m dimension of matrix, including any padding
    dim_t         n_padded; // n dimension of matrix, including any padding
    inc_t         ps;       // panel stride (distance to next panel)
    inc_t         pd;       // panel dimension (the "width" of a panel:
                            // usually MR or NR)
    dim_t         m_panel;  // m dimension of a "full" panel
    dim_t         n_panel;  // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void*         pack_params;
    obj_ker_fn_t  ker_fn;
    void*         ker_params;

} obj_t;

// Pre-initializers.
// Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Brace-enclosed designated initializer for an obj_t with 0 x 0 dimensions.
// Every member of obj_t is named explicitly.
#define BLIS_OBJECT_INITIALIZER \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 0, 0 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn   = NULL, \
    .pack_params = NULL, \
    .ker_fn    = NULL, \
    .ker_params = NULL \
}

// Same as BLIS_OBJECT_INITIALIZER except that .dim is { 1, 1 }.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 1, 1 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn   = NULL, \
    .pack_params = NULL, \
    .ker_fn    = NULL, \
    .ker_params = NULL \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
// Make b a member-for-member (shallow) duplicate of a. Every member of
// obj_t is copied by value, so anything reached through a pointer member
// (root, buffer, pack_params, ker_params) ends up shared between a and b
// rather than duplicated.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    *b = *a;
}

// Initialize b from a in preparation for making b a subpartition: every
// member except dim[] is taken from a. b->dim[0] and b->dim[1] are neither
// read nor written here, since they will be overwritten afterwards.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    // Pointer members (shared with a, not duplicated).
    b->root        = a->root;
    b->buffer      = a->buffer;
    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;

    // Offsets and diagonal offset. (No dim[] here, by design.)
    b->off[0]      = a->off[0];
    b->off[1]      = a->off[1];
    b->diag_off    = a->diag_off;

    // Info bitfields and element size.
    b->info        = a->info;
    b->info2       = a->info2;
    b->elem_size   = a->elem_size;

    // Strides.
    b->rs          = a->rs;
    b->cs          = a->cs;
    b->is          = a->is;

    // Bufferless scalar storage.
    b->scalar      = a->scalar;

    // Pack-related members.
    b->m_padded    = a->m_padded;
    b->n_padded    = a->n_padded;
    b->ps          = a->ps;
    b->pd          = a->pd;
    b->m_panel     = a->m_panel;
    b->n_panel     = a->n_panel;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// Each pasting macro comes as a pair: NAME_() applies the ## operator to its
// arguments exactly as written, while NAME() forwards to NAME_() so that
// arguments which are themselves macros are expanded before being pasted.

// PASTEMACn: build an identifier of the form bli_<ch1>...<chn><op>,
// e.g. PASTEMAC(d,gemm) yields bli_dgemm.
#define PASTEMAC0_(op)                                 bli_ ## op
#define PASTEMAC0(op)                                  PASTEMAC0_(op)

#define PASTEMAC_(ch,op)                               bli_ ## ch  ## op
#define PASTEMAC(ch,op)                                PASTEMAC_(ch,op)

#define PASTEMAC2_(ch1,ch2,op)                         bli_ ## ch1 ## ch2 ## op
#define PASTEMAC2(ch1,ch2,op)                          PASTEMAC2_(ch1,ch2,op)

#define PASTEMAC3_(ch1,ch2,ch3,op)                     bli_ ## ch1 ## ch2 ## ch3 ## op
#define PASTEMAC3(ch1,ch2,ch3,op)                      PASTEMAC3_(ch1,ch2,ch3,op)

#define PASTEMAC4_(ch1,ch2,ch3,ch4,op)                 bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
#define PASTEMAC4(ch1,ch2,ch3,ch4,op)                  PASTEMAC4_(ch1,ch2,ch3,ch4,op)

#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)             bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op)              PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)

#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)         bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op)          PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)

// PASTEBLACHK: build an identifier of the form bla_<op>_check.
#define PASTEBLACHK_(op)                               bla_ ## op ## _check
#define PASTEBLACHK(op)                                PASTEBLACHK_(op)

// PASTECHn: like PASTEMACn but without the bli_ prefix; the result is simply
// <ch1>...<chn><op>.
#define PASTECH0_(op)                                  op
#define PASTECH0(op)                                   PASTECH0_(op)

#define PASTECH_(ch,op)                                ch ## op
#define PASTECH(ch,op)                                 PASTECH_(ch,op)

#define PASTECH2_(ch1,ch2,op)                          ch1 ## ch2 ## op
#define PASTECH2(ch1,ch2,op)                           PASTECH2_(ch1,ch2,op)

#define PASTECH3_(ch1,ch2,ch3,op)                      ch1 ## ch2 ## ch3 ## op
#define PASTECH3(ch1,ch2,ch3,op)                       PASTECH3_(ch1,ch2,ch3,op)

// MKSTR stringizes its argument exactly as written. STRINGIFY_INT forwards
// to MKSTR, so a macro argument is expanded first and its expansion is what
// gets stringized.
#define MKSTR(s1)                                      #s1
#define STRINGIFY_INT( s )                             MKSTR( s )

// Fortran-77 name-mangling macros.
// Each macro pastes its leading arguments in front of name and appends a
// single trailing underscore, e.g. PASTEF77(s,gemm) yields sgemm_.
// NOTE: Unlike the PASTEMAC/PASTECH families, these have no forwarding
// layer, so arguments are pasted as written (macro arguments are not
// expanded first).
#define PASTEF770(name)                                      name ## _
#define PASTEF77(ch1,name)                     ch1        ## name ## _
#define PASTEF772(ch1,ch2,name)                ch1 ## ch2 ## name ## _
#define PASTEF773(ch1,ch2,ch3,name)     ch1 ## ch2 ## ch3 ## name ## _

// -- Include other groups of macros

// begin bli_genarray_macro_defs.h
#ifndef BLIS_GENARRAY_MACRO_DEFS_H
#define BLIS_GENARRAY_MACRO_DEFS_H

// -- Macros to generate function arrays ---------------------------------------

// In every macro below the entries are listed in the order s, c, d, z
// (followed by i where integer support is included). This presumably
// mirrors the numeric order of the floating-point type ids used to index
// the arrays -- confirm before reordering.

// -- "Smart" one-operand macro --

// Defines a static array named <opname>_fpa, with BLIS_NUM_FP_TYPES elements
// of type tname, holding bli_s<opname>, bli_c<opname>, bli_d<opname>, and
// bli_z<opname>, each cast to tname.
#define GENARRAY_FPA(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname)  \
}

// -- "Smart" one-operand macro (with integer support) --

// Same as GENARRAY_FPA, but the array has one extra element holding
// bli_i<opname>.
#define GENARRAY_FPA_I(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname), \
    ( tname )PASTEMAC(i,opname)  \
}

// -- "Smart" two-operand macro --

// Defines a static two-dimensional array named <op>_fpa2 in which entry
// [row][col] is bli_<row char><col char><op> cast to tname.
#define GENARRAY_FPA2(tname,op) \
\
static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \
    { ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \
    { ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \
    { ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) }  \
}

// -- One-operand macro --

// The macros from here on expand only to "arrayname[...] = { ... }"; the
// storage class and element type must be written by the caller in front of
// the macro invocation, and no casts are applied to the entries.
#define GENARRAY(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op)  \
}

// Same as GENARRAY, with one extra element holding bli_i<op>.
#define GENARRAY_I(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES+1] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op), \
    PASTEMAC(i,op)  \
}

//
// -- One-operand macro (with custom prefix) --

// Expands to "arrayname[BLIS_NUM_FP_TYPES] = { ... }" whose entries are
// <prefix>s<op>, <prefix>c<op>, <prefix>d<op>, <prefix>z<op>. The storage
// class and element type must be supplied by the caller before the macro.
#define GENARRAY_PREF(arrayname,prefix,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTECH2(prefix,s,op), \
    PASTECH2(prefix,c,op), \
    PASTECH2(prefix,d,op), \
    PASTECH2(prefix,z,op)  \
}

// -- Two-operand macros --

// In the tables below, rows and columns are both ordered s, c, d, z and
// entry [row][col] is bli_<row char><col char><op>.

// _ALL: all 16 type combinations are populated.
#define GENARRAY2_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
    { PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// _EXT: only combinations where both operands are in {s,c} or both are in
// {d,z} are populated; all other entries are NULL.
#define GENARRAY2_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL,              NULL,             }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { NULL,              NULL,              PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// _MIN: only the diagonal (both operands of the same type) is populated;
// all other entries are NULL.
#define GENARRAY2_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), NULL,              NULL,              NULL,             }, \
    { NULL,              PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), NULL,             }, \
    { NULL,              NULL,              NULL,              PASTEMAC2(z,z,op) }  \
}

// -- Three-operand macros --

// Three-dimensional analogues of the two-operand tables: entry [i][j][k] is
// bli_<i char><j char><k char><op>, with every dimension ordered s, c, d, z.

// _ALL: all 64 type combinations are populated.
#define GENARRAY3_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \
    { PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \
    { PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \
    { PASTEMAC3(c,d,s,op), PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \
    { PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \
    { PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \
    { PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \
    { PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \
    { PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

// _EXT: only combinations where all three operands are in {s,c} or all three
// are in {d,z} are populated; all other entries are NULL.
#define GENARRAY3_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

// _MIN: only the four entries where all three operands have the same type
// are populated; all other entries are NULL.
#define GENARRAY3_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                PASTEMAC3(z,z,z,op) }  \
    } \
}

#endif
// end bli_genarray_macro_defs.h
// begin bli_gentdef_macro_defs.h
#ifndef BLIS_GENTDEF_MACRO_DEFS_H
#define BLIS_GENTDEF_MACRO_DEFS_H

//
// -- MACROS TO INSERT TYPEDEF-GENERATING MACROS -------------------------------
//

// NOTE: GENTDEF and GENTDEFR are not defined in this section; they must be
// defined at the point where the INSERT_ macros below are invoked.

// -- function typedef macro (both typed and void) --

// Invokes GENTDEF nine times: once per type (s, d, c, z) with the concrete C
// type and the _ft suffix, once per type with void and the _vft suffix, and
// once with void, an empty type character, and the _vft suffix.
#define INSERT_GENTDEF( opname ) \
\
GENTDEF( float,    s, opname, _ft ) \
GENTDEF( double,   d, opname, _ft ) \
GENTDEF( scomplex, c, opname, _ft ) \
GENTDEF( dcomplex, z, opname, _ft ) \
\
GENTDEF( void,     s, opname, _vft ) \
GENTDEF( void,     d, opname, _vft ) \
GENTDEF( void,     c, opname, _vft ) \
GENTDEF( void,     z, opname, _vft ) \
\
GENTDEF( void,      , opname, _vft )

// -- function typedef macro (both typed and void) with real projection --

// Same pattern as INSERT_GENTDEF, but each GENTDEFR invocation also receives
// the real projection of the type and its character (float/s for s and c,
// double/d for d and z).
#define INSERT_GENTDEFR( opname ) \
\
GENTDEFR( float,    float,  s, s, opname, _ft ) \
GENTDEFR( double,   double, d, d, opname, _ft ) \
GENTDEFR( scomplex, float,  c, s, opname, _ft ) \
GENTDEFR( dcomplex, double, z, d, opname, _ft ) \
\
GENTDEFR( void,     void,   s, s, opname, _vft ) \
GENTDEFR( void,     void,   d, d, opname, _vft ) \
GENTDEFR( void,     void,   c, s, opname, _vft ) \
GENTDEFR( void,     void,   z, d, opname, _vft ) \
\
GENTDEFR( void,     void,    ,  , opname, _vft )

#endif
// end bli_gentdef_macro_defs.h
// begin bli_gentfunc_macro_defs.h
#ifndef BLIS_GENTFUNC_MACRO_DEFS_H
#define BLIS_GENTFUNC_MACRO_DEFS_H

//
// -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------
//

// -- Macros for generating BLAS routines --------------------------------------

// -- Basic one-operand macro --
#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \
    GENTFUNC( float,    s, blasname, blisname ) \
    GENTFUNC( double,   d, blasname, blisname ) \
    GENTFUNC( scomplex, c, blasname, blisname ) \
    GENTFUNC( dcomplex, z, blasname, blisname )

//
// Conventions shared by the INSERT_GENTFUNC*_BLAS macros in this group:
//   - each one expands to a fixed list of generator-macro invocations
//     (GENTFUNC, GENTFUNCRO, GENTFUNCCO, GENTFUNCDOT, GENTFUNCR), one per
//     datatype instance, forwarding the name arguments unchanged;
//   - type characters pair with C types as s/float, d/double, c/scomplex,
//     z/dcomplex;
//   - the generator macros themselves are not defined in this section;
//     presumably the invoking file defines them first -- confirm at the
//     call sites.
//

// -- Basic one-operand macro with real domain only --
// Instantiates the float and double instances only.

#define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \
    GENTFUNCRO( float,  s, blasname, blisname ) \
    GENTFUNCRO( double, d, blasname, blisname )

// -- Basic one-operand macro with complex domain only and real projection --
// Instantiates the scomplex and dcomplex instances, each paired with the
// real type of the same precision (float/s and double/d).

#define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \
    GENTFUNCCO( scomplex, float,  c, s, blasname, blisname ) \
    GENTFUNCCO( dcomplex, double, z, d, blasname, blisname )

// -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) --
// The third GENTFUNCDOT argument is intentionally empty for the real
// instances, and the conjugation argument is always BLIS_NO_CONJUGATE.

#define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
    GENTFUNCDOT( float,  s,  , BLIS_NO_CONJUGATE, blasname, blisname ) \
    GENTFUNCDOT( double, d,  , BLIS_NO_CONJUGATE, blasname, blisname )

// -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) --
// Each complex type is instantiated twice: once with the character c and
// BLIS_CONJUGATE, once with the character u and BLIS_NO_CONJUGATE.

#define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \
    GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE,    blasname, blisname ) \
    GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \
    GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE,    blasname, blisname ) \
    GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname )

// -- Basic one-operand macro with conjugation (used only for dot, ger) --
// Real instances followed by complex instances.

#define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \
    INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
    INSERT_GENTFUNCDOTC_BLAS( blasname, blisname )

// -- Basic one-operand macro with real projection --
// The real instances forward rblasname; the complex instances forward
// cblasname. blisname is forwarded to all four.

#define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \
    GENTFUNCR( float,    float,  s, s, rblasname, blisname ) \
    GENTFUNCR( double,   double, d, d, rblasname, blisname ) \
    GENTFUNCR( scomplex, float,  c, s, cblasname, blisname ) \
    GENTFUNCR( dcomplex, double, z, d, cblasname, blisname )

// -- Alternate
// two-operand macro (one char for complex, one for real proj) --
// The fourth GENTFUNCR2 argument is empty for the real instances and is the
// real-projection character (s or d) for the complex instances.

#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
    GENTFUNCR2( float,    float,  s,  , blasname, blisname ) \
    GENTFUNCR2( double,   double, d,  , blasname, blisname ) \
    GENTFUNCR2( scomplex, float,  c, s, blasname, blisname ) \
    GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )

// -- Extended two-operand macro (used only for scal) --
// Six instances: the four same-type instances (empty fourth argument)
// followed by the two complex instances paired with a real second type.

#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
    GENTFUNCSCAL( float,    float,    s,  , blasname, blisname ) \
    GENTFUNCSCAL( double,   double,   d,  , blasname, blisname ) \
    GENTFUNCSCAL( scomplex, scomplex, c,  , blasname, blisname ) \
    GENTFUNCSCAL( dcomplex, dcomplex, z,  , blasname, blisname ) \
    GENTFUNCSCAL( scomplex, float,    c, s, blasname, blisname ) \
    GENTFUNCSCAL( dcomplex, double,   z, d, blasname, blisname )

// -- Macros for functions with one operand ------------------------------------
//
// The INSERT_GENTFUNC_BASIC* family expands to one GENTFUNC invocation per
// datatype (float/s, double/d, scomplex/c, dcomplex/z). The variants differ
// only in how many auxiliary arguments are forwarded after tfuncname (none
// for BASIC0, one for BASIC, then two, three and four). GENTFUNC is not
// defined in this section; presumably the invoking file defines it with a
// matching arity -- confirm at the call sites.

// -- Basic one-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
    GENTFUNC( float,    s, tfuncname ) \
    GENTFUNC( double,   d, tfuncname ) \
    GENTFUNC( scomplex, c, tfuncname ) \
    GENTFUNC( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
    GENTFUNC( float,    s, tfuncname, varname ) \
    GENTFUNC( double,   d, tfuncname, varname ) \
    GENTFUNC( scomplex, c, tfuncname, varname ) \
    GENTFUNC( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNC( float,    s, tfuncname, varname1, varname2 ) \
    GENTFUNC( double,   d, tfuncname, varname1, varname2 ) \
    GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
    GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( float,    s, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( double,   d, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( float,    s, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( double,   d, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )

// -- Basic one-operand with real projection --
//
// Same datatype list as above, but each GENTFUNCR invocation also carries the
// real type of matching precision and its character (float/s for s and c,
// double/d for d and z).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
    GENTFUNCR( float,    float,  s, s, tfuncname ) \
    GENTFUNCR( double,   double, d, d, tfuncname ) \
    GENTFUNCR( scomplex, float,  c, s, tfuncname ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
    GENTFUNCR( float,    float,  s, s, tfuncname, varname ) \
    GENTFUNCR( double,   double, d, d, tfuncname, varname ) \
    GENTFUNCR( scomplex, float,  c, s, tfuncname, varname ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2 ) \
    GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2 ) \
    GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2 ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
// arguments) --

#define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )

//
// Conventions for every INSERT_* macro in the remainder of this section:
//   - each one expands to a fixed list of generator-macro invocations
//     (GENTFUNC, GENTFUNCR, GENTFUNCRO, GENTFUNCCO, GENTFUNCI, GENTFUNCRI,
//     GENTFUNC2, GENTFUNC2R, GENTFUNC3, GENTFUNC3U12), forwarding tfuncname
//     and any auxiliary arguments unchanged;
//   - type characters pair with C types as s/float, d/double, c/scomplex,
//     z/dcomplex, i/gint_t;
//   - the generator macros are not defined in this section; presumably the
//     invoking file defines them first -- confirm at the call sites;
//   - the last invocation of a macro must NOT end in a backslash, otherwise
//     the definition silently absorbs whatever line follows it.
//

// -- Basic one-operand macro with real domain only --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \
    GENTFUNCRO( float,  s, tfuncname ) \
    GENTFUNCRO( double, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \
    GENTFUNCRO( float,  s, tfuncname, varname ) \
    GENTFUNCRO( double, d, tfuncname, varname )

// -- Basic one-operand macro with complex domain only and real projection --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \
    GENTFUNCCO( scomplex, float,  c, s, tfuncname ) \
    GENTFUNCCO( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \
    GENTFUNCCO( scomplex, float,  c, s, tfuncname, varname ) \
    GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNCCO( scomplex, float,  c, s, tfuncname, varname1, varname2 ) \
    GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCCO( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- Basic one-operand macro with integer instance --
// The four floating-point instances plus a fifth gint_t/i instance.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \
    GENTFUNC( float,    s, tfuncname ) \
    GENTFUNC( double,   d, tfuncname ) \
    GENTFUNC( scomplex, c, tfuncname ) \
    GENTFUNC( dcomplex, z, tfuncname ) \
    GENTFUNC( gint_t,   i, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \
    GENTFUNC( float,    s, tfuncname, varname ) \
    GENTFUNC( double,   d, tfuncname, varname ) \
    GENTFUNC( scomplex, c, tfuncname, varname ) \
    GENTFUNC( dcomplex, z, tfuncname, varname ) \
    GENTFUNC( gint_t,   i, tfuncname, varname )

// -- Basic one-operand with integer projection --
// Every floating-point instance is paired with gint_t/i.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCI_BASIC0( tfuncname ) \
    GENTFUNCI( float,    gint_t, s, i, tfuncname ) \
    GENTFUNCI( double,   gint_t, d, i, tfuncname ) \
    GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \
    GENTFUNCI( dcomplex, gint_t, z, i, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \
    GENTFUNCI( float,    gint_t, s, i, tfuncname, varname ) \
    GENTFUNCI( double,   gint_t, d, i, tfuncname, varname ) \
    GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \
    GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname )

// -- Basic one-operand with real and integer projections --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \
    GENTFUNCRI( float,    float,  gint_t, s, s, i, tfuncname ) \
    GENTFUNCRI( double,   double, gint_t, d, d, i, tfuncname ) \
    GENTFUNCRI( scomplex, float,  gint_t, c, s, i, tfuncname ) \
    GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname )

// -- Macros for functions with two primary operands ---------------------------

// -- Basic two-operand macro --
// Both operands share the same datatype.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_BASIC0( tfuncname ) \
    GENTFUNC2( float,    float,    s, s, tfuncname ) \
    GENTFUNC2( double,   double,   d, d, tfuncname ) \
    GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \
    GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \
    GENTFUNC2( float,    float,    s, s, tfuncname, varname ) \
    GENTFUNC2( double,   double,   d, d, tfuncname, varname ) \
    GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \
    GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname )

// -- Mixed domain two-operand macro --
// Operands share a precision but differ in domain (real vs. complex).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \
    GENTFUNC2( float,    scomplex, s, c, tfuncname ) \
    GENTFUNC2( scomplex, float,    c, s, tfuncname ) \
    \
    GENTFUNC2( double,   dcomplex, d, z, tfuncname ) \
    GENTFUNC2( dcomplex, double,   z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \
    GENTFUNC2( float,    scomplex, s, c, tfuncname, varname ) \
    GENTFUNC2( scomplex, float,    c, s, tfuncname, varname ) \
    \
    GENTFUNC2( double,   dcomplex, d, z, tfuncname, varname ) \
    GENTFUNC2( dcomplex, double,   z, d, tfuncname, varname )

// -- Mixed precision two-operand macro --
// Operands differ in precision (single vs. double), in either domain.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \
    GENTFUNC2( float,    double,   s, d, tfuncname ) \
    GENTFUNC2( float,    dcomplex, s, z, tfuncname ) \
    \
    GENTFUNC2( double,   float,    d, s, tfuncname ) \
    GENTFUNC2( double,   scomplex, d, c, tfuncname ) \
    \
    GENTFUNC2( scomplex, double,   c, d, tfuncname ) \
    GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
    \
    GENTFUNC2( dcomplex, float,    z, s, tfuncname ) \
    GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \
    GENTFUNC2( float,    double,   s, d, tfuncname, varname ) \
    GENTFUNC2( float,    dcomplex, s, z, tfuncname, varname ) \
    \
    GENTFUNC2( double,   float,    d, s, tfuncname, varname ) \
    GENTFUNC2( double,   scomplex, d, c, tfuncname, varname ) \
    \
    GENTFUNC2( scomplex, double,   c, d, tfuncname, varname ) \
    GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
    \
    GENTFUNC2( dcomplex, float,    z, s, tfuncname, varname ) \
    GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )

// -- Mixed domain/precision (all) two-operand macro --
// Every ordered pair of distinct datatypes (12 instances).
// NOTE: the no-argument variant is spelled MIXDP0 (no underscore before DP),
// unlike its sibling MIX_DP; both names are kept as-is because callers use
// them.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIXDP0( tfuncname ) \
    GENTFUNC2( float,    double,   s, d, tfuncname ) \
    GENTFUNC2( float,    scomplex, s, c, tfuncname ) \
    GENTFUNC2( float,    dcomplex, s, z, tfuncname ) \
    \
    GENTFUNC2( double,   float,    d, s, tfuncname ) \
    GENTFUNC2( double,   scomplex, d, c, tfuncname ) \
    GENTFUNC2( double,   dcomplex, d, z, tfuncname ) \
    \
    GENTFUNC2( scomplex, float,    c, s, tfuncname ) \
    GENTFUNC2( scomplex, double,   c, d, tfuncname ) \
    GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
    \
    GENTFUNC2( dcomplex, float,    z, s, tfuncname ) \
    GENTFUNC2( dcomplex, double,   z, d, tfuncname ) \
    GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \
    GENTFUNC2( float,    double,   s, d, tfuncname, varname ) \
    GENTFUNC2( float,    scomplex, s, c, tfuncname, varname ) \
    GENTFUNC2( float,    dcomplex, s, z, tfuncname, varname ) \
    \
    GENTFUNC2( double,   float,    d, s, tfuncname, varname ) \
    GENTFUNC2( double,   scomplex, d, c, tfuncname, varname ) \
    GENTFUNC2( double,   dcomplex, d, z, tfuncname, varname ) \
    \
    GENTFUNC2( scomplex, float,    c, s, tfuncname, varname ) \
    GENTFUNC2( scomplex, double,   c, d, tfuncname, varname ) \
    GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
    \
    GENTFUNC2( dcomplex, float,    z, s, tfuncname, varname ) \
    GENTFUNC2( dcomplex, double,   z, d, tfuncname, varname ) \
    GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )

// -- Basic two-operand with real projection of second operand --
// The third type/character is the real type with the precision of the
// second operand.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \
    GENTFUNC2R( float,    float,    float,  s, s, s, tfuncname ) \
    GENTFUNC2R( double,   double,   double, d, d, d, tfuncname ) \
    GENTFUNC2R( scomplex, scomplex, float,  c, c, s, tfuncname ) \
    GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \
    GENTFUNC2R( float,    float,    float,  s, s, s, tfuncname, varname ) \
    GENTFUNC2R( double,   double,   double, d, d, d, tfuncname, varname ) \
    GENTFUNC2R( scomplex, scomplex, float,  c, c, s, tfuncname, varname ) \
    GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )

// -- Mixed domain two-operand with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \
    GENTFUNC2R( float,    scomplex, float,  s, c, s, tfuncname ) \
    GENTFUNC2R( scomplex, float,    float,  c, s, s, tfuncname ) \
    \
    GENTFUNC2R( double,   dcomplex, double, d, z, d, tfuncname ) \
    GENTFUNC2R( dcomplex, double,   double, z, d, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \
    GENTFUNC2R( float,    scomplex, float,  s, c, s, tfuncname, varname ) \
    GENTFUNC2R( scomplex, float,    float,  c, s, s, tfuncname, varname ) \
    \
    GENTFUNC2R( double,   dcomplex, double, d, z, d, tfuncname, varname ) \
    GENTFUNC2R( dcomplex, double,   double, z, d, d, tfuncname, varname )

// -- Mixed precision two-operand with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \
    GENTFUNC2R( float,    double,   double, s, d, d, tfuncname ) \
    GENTFUNC2R( float,    dcomplex, double, s, z, d, tfuncname ) \
    \
    GENTFUNC2R( double,   float,    float,  d, s, s, tfuncname ) \
    GENTFUNC2R( double,   scomplex, float,  d, c, s, tfuncname ) \
    \
    GENTFUNC2R( scomplex, double,   double, c, d, d, tfuncname ) \
    GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
    \
    GENTFUNC2R( dcomplex, float,    float,  z, s, s, tfuncname ) \
    GENTFUNC2R( dcomplex, scomplex, float,  z, c, s, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \
    GENTFUNC2R( float,    double,   double, s, d, d, tfuncname, varname ) \
    GENTFUNC2R( float,    dcomplex, double, s, z, d, tfuncname, varname ) \
    \
    GENTFUNC2R( double,   float,    float,  d, s, s, tfuncname, varname ) \
    GENTFUNC2R( double,   scomplex, float,  d, c, s, tfuncname, varname ) \
    \
    GENTFUNC2R( scomplex, double,   double, c, d, d, tfuncname, varname ) \
    GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
    \
    GENTFUNC2R( dcomplex, float,    float,  z, s, s, tfuncname, varname ) \
    GENTFUNC2R( dcomplex, scomplex, float,  z, c, s, tfuncname, varname )

// -- Mixed domain/precision (all) two-operand macro with real projection of second operand --
// NOTE: as with INSERT_GENTFUNC2_MIXDP0, the no-argument variant is spelled
// MIXDP0 while the one-argument variant is MIX_DP.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \
    GENTFUNC2R( float,    double,   double, s, d, d, tfuncname ) \
    GENTFUNC2R( float,    scomplex, float,  s, c, s, tfuncname ) \
    GENTFUNC2R( float,    dcomplex, double, s, z, d, tfuncname ) \
    \
    GENTFUNC2R( double,   float,    float,  d, s, s, tfuncname ) \
    GENTFUNC2R( double,   scomplex, float,  d, c, s, tfuncname ) \
    GENTFUNC2R( double,   dcomplex, double, d, z, d, tfuncname ) \
    \
    GENTFUNC2R( scomplex, float,    float,  c, s, s, tfuncname ) \
    GENTFUNC2R( scomplex, double,   double, c, d, d, tfuncname ) \
    GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
    \
    GENTFUNC2R( dcomplex, float,    float,  z, s, s, tfuncname ) \
    GENTFUNC2R( dcomplex, double,   double, z, d, d, tfuncname ) \
    GENTFUNC2R( dcomplex, scomplex, float,  z, c, s, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \
    GENTFUNC2R( float,    double,   double, s, d, d, tfuncname, varname ) \
    GENTFUNC2R( float,    scomplex, float,  s, c, s, tfuncname, varname ) \
    GENTFUNC2R( float,    dcomplex, double, s, z, d, tfuncname, varname ) \
    \
    GENTFUNC2R( double,   float,    float,  d, s, s, tfuncname, varname ) \
    GENTFUNC2R( double,   scomplex, float,  d, c, s, tfuncname, varname ) \
    GENTFUNC2R( double,   dcomplex, double, d, z, d, tfuncname, varname ) \
    \
    GENTFUNC2R( scomplex, float,    float,  c, s, s, tfuncname, varname ) \
    GENTFUNC2R( scomplex, double,   double, c, d, d, tfuncname, varname ) \
    GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
    \
    GENTFUNC2R( dcomplex, float,    float,  z, s, s, tfuncname, varname ) \
    GENTFUNC2R( dcomplex, double,   double, z, d, d, tfuncname, varname ) \
    GENTFUNC2R( dcomplex, scomplex, float,  z, c, s, tfuncname, varname )

// -- Macros for functions with three primary operands -------------------------

// -- Basic three-operand macro --
// All three operands share the same datatype.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_BASIC0( tfuncname ) \
    GENTFUNC3( float,    float,    float,    s, s, s, tfuncname ) \
    GENTFUNC3( double,   double,   double,   d, d, d, tfuncname ) \
    GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \
    GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \
    GENTFUNC3( float,    float,    float,    s, s, s, tfuncname, varname ) \
    GENTFUNC3( double,   double,   double,   d, d, d, tfuncname, varname ) \
    GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \
    GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    float,    float,    s, s, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   double,   double,   d, d, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 )

// -- Mixed domain three-operand macro --
// All operands share a precision; at least one operand differs in domain
// from the others (12 instances).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \
    GENTFUNC3( float,    float,    scomplex, s, s, c, tfuncname ) \
    GENTFUNC3( float,    scomplex, float,    s, c, s, tfuncname ) \
    GENTFUNC3( float,    scomplex, scomplex, s, c, c, tfuncname ) \
    \
    GENTFUNC3( double,   double,   dcomplex, d, d, z, tfuncname ) \
    GENTFUNC3( double,   dcomplex, double,   d, z, d, tfuncname ) \
    GENTFUNC3( double,   dcomplex, dcomplex, d, z, z, tfuncname ) \
    \
    GENTFUNC3( scomplex, float,    float,    c, s, s, tfuncname ) \
    GENTFUNC3( scomplex, float,    scomplex, c, s, c, tfuncname ) \
    GENTFUNC3( scomplex, scomplex, float,    c, c, s, tfuncname ) \
    \
    GENTFUNC3( dcomplex, double,   double,   z, d, d, tfuncname ) \
    GENTFUNC3( dcomplex, double,   dcomplex, z, d, z, tfuncname ) \
    GENTFUNC3( dcomplex, dcomplex, double,   z, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \
    GENTFUNC3( float,    float,    scomplex, s, s, c, tfuncname, varname ) \
    GENTFUNC3( float,    scomplex, float,    s, c, s, tfuncname, varname ) \
    GENTFUNC3( float,    scomplex, scomplex, s, c, c, tfuncname, varname ) \
    \
    GENTFUNC3( double,   double,   dcomplex, d, d, z, tfuncname, varname ) \
    GENTFUNC3( double,   dcomplex, double,   d, z, d, tfuncname, varname ) \
    GENTFUNC3( double,   dcomplex, dcomplex, d, z, z, tfuncname, varname ) \
    \
    GENTFUNC3( scomplex, float,    float,    c, s, s, tfuncname, varname ) \
    GENTFUNC3( scomplex, float,    scomplex, c, s, c, tfuncname, varname ) \
    GENTFUNC3( scomplex, scomplex, float,    c, c, s, tfuncname, varname ) \
    \
    GENTFUNC3( dcomplex, double,   double,   z, d, d, tfuncname, varname ) \
    GENTFUNC3( dcomplex, double,   dcomplex, z, d, z, tfuncname, varname ) \
    GENTFUNC3( dcomplex, dcomplex, double,   z, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    float,    scomplex, s, s, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    scomplex, float,    s, c, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( double,   double,   dcomplex, d, d, z, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   dcomplex, double,   d, z, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( scomplex, float,    float,    c, s, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, float,    scomplex, c, s, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, scomplex, float,    c, c, s, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( dcomplex, double,   double,   z, d, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, double,   dcomplex, z, d, z, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, dcomplex, double,   z, z, d, tfuncname, varname1, varname2 )

// -- Mixed precision three-operand macro --
// Every type triple in which single and double precision both appear
// (48 instances, grouped by first and second operand type).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \
    GENTFUNC3( float,    float,    double,   s, s, d, tfuncname ) \
    GENTFUNC3( float,    float,    dcomplex, s, s, z, tfuncname ) \
    \
    GENTFUNC3( float,    double,   float,    s, d, s, tfuncname ) \
    GENTFUNC3( float,    double,   double,   s, d, d, tfuncname ) \
    GENTFUNC3( float,    double,   scomplex, s, d, c, tfuncname ) \
    GENTFUNC3( float,    double,   dcomplex, s, d, z, tfuncname ) \
    \
    GENTFUNC3( float,    scomplex, double,   s, c, d, tfuncname ) \
    GENTFUNC3( float,    scomplex, dcomplex, s, c, z, tfuncname ) \
    \
    GENTFUNC3( float,    dcomplex, float,    s, z, s, tfuncname ) \
    GENTFUNC3( float,    dcomplex, double,   s, z, d, tfuncname ) \
    GENTFUNC3( float,    dcomplex, scomplex, s, z, c, tfuncname ) \
    GENTFUNC3( float,    dcomplex, dcomplex, s, z, z, tfuncname ) \
    \
    GENTFUNC3( double,   float,    float,    d, s, s, tfuncname ) \
    GENTFUNC3( double,   float,    double,   d, s, d, tfuncname ) \
    GENTFUNC3( double,   float,    scomplex, d, s, c, tfuncname ) \
    GENTFUNC3( double,   float,    dcomplex, d, s, z, tfuncname ) \
    \
    GENTFUNC3( double,   double,   float,    d, d, s, tfuncname ) \
    GENTFUNC3( double,   double,   scomplex, d, d, c, tfuncname ) \
    \
    GENTFUNC3( double,   scomplex, float,    d, c, s, tfuncname ) \
    GENTFUNC3( double,   scomplex, double,   d, c, d, tfuncname ) \
    GENTFUNC3( double,   scomplex, scomplex, d, c, c, tfuncname ) \
    GENTFUNC3( double,   scomplex, dcomplex, d, c, z, tfuncname ) \
    \
    GENTFUNC3( double,   dcomplex, float,    d, z, s, tfuncname ) \
    GENTFUNC3( double,   dcomplex, scomplex, d, z, c, tfuncname ) \
    \
    GENTFUNC3( scomplex, float,    double,   c, s, d, tfuncname ) \
    GENTFUNC3( scomplex, float,    dcomplex, c, s, z, tfuncname ) \
    \
    GENTFUNC3( scomplex, double,   float,    c, d, s, tfuncname ) \
    GENTFUNC3( scomplex, double,   double,   c, d, d, tfuncname ) \
    GENTFUNC3( scomplex, double,   scomplex, c, d, c, tfuncname ) \
    GENTFUNC3( scomplex, double,   dcomplex, c, d, z, tfuncname ) \
    \
    GENTFUNC3( scomplex, scomplex, double,   c, c, d, tfuncname ) \
    GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \
    \
    GENTFUNC3( scomplex, dcomplex, float,    c, z, s, tfuncname ) \
    GENTFUNC3( scomplex, dcomplex, double,   c, z, d, tfuncname ) \
    GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \
    GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \
    \
    GENTFUNC3( dcomplex, float,    float,    z, s, s, tfuncname ) \
    GENTFUNC3( dcomplex, float,    double,   z, s, d, tfuncname ) \
    GENTFUNC3( dcomplex, float,    scomplex, z, s, c, tfuncname ) \
    GENTFUNC3( dcomplex, float,    dcomplex, z, s, z, tfuncname ) \
    \
    GENTFUNC3( dcomplex, double,   float,    z, d, s, tfuncname ) \
    GENTFUNC3( dcomplex, double,   scomplex, z, d, c, tfuncname ) \
    \
    GENTFUNC3( dcomplex, scomplex, float,    z, c, s, tfuncname ) \
    GENTFUNC3( dcomplex, scomplex, double,   z, c, d, tfuncname ) \
    GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \
    GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \
    \
    GENTFUNC3( dcomplex, dcomplex, float,    z, z, s, tfuncname ) \
    GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \
    GENTFUNC3( float,    float,    double,   s, s, d, tfuncname, varname ) \
    GENTFUNC3( float,    float,    dcomplex, s, s, z, tfuncname, varname ) \
    \
    GENTFUNC3( float,    double,   float,    s, d, s, tfuncname, varname ) \
    GENTFUNC3( float,    double,   double,   s, d, d, tfuncname, varname ) \
    GENTFUNC3( float,    double,   scomplex, s, d, c, tfuncname, varname ) \
    GENTFUNC3( float,    double,   dcomplex, s, d, z, tfuncname, varname ) \
    \
    GENTFUNC3( float,    scomplex, double,   s, c, d, tfuncname, varname ) \
    GENTFUNC3( float,    scomplex, dcomplex, s, c, z, tfuncname, varname ) \
    \
    GENTFUNC3( float,    dcomplex, float,    s, z, s, tfuncname, varname ) \
    GENTFUNC3( float,    dcomplex, double,   s, z, d, tfuncname, varname ) \
    GENTFUNC3( float,    dcomplex, scomplex, s, z, c, tfuncname, varname ) \
    GENTFUNC3( float,    dcomplex, dcomplex, s, z, z, tfuncname, varname ) \
    \
    GENTFUNC3( double,   float,    float,    d, s, s, tfuncname, varname ) \
    GENTFUNC3( double,   float,    double,   d, s, d, tfuncname, varname ) \
    GENTFUNC3( double,   float,    scomplex, d, s, c, tfuncname, varname ) \
    GENTFUNC3( double,   float,    dcomplex, d, s, z, tfuncname, varname ) \
    \
    GENTFUNC3( double,   double,   float,    d, d, s, tfuncname, varname ) \
    GENTFUNC3( double,   double,   scomplex, d, d, c, tfuncname, varname ) \
    \
    GENTFUNC3( double,   scomplex, float,    d, c, s, tfuncname, varname ) \
    GENTFUNC3( double,   scomplex, double,   d, c, d, tfuncname, varname ) \
    GENTFUNC3( double,   scomplex, scomplex, d, c, c, tfuncname, varname ) \
    GENTFUNC3( double,   scomplex, dcomplex, d, c, z, tfuncname, varname ) \
    \
    GENTFUNC3( double,   dcomplex, float,    d, z, s, tfuncname, varname ) \
    GENTFUNC3( double,   dcomplex, scomplex, d, z, c, tfuncname, varname ) \
    \
    GENTFUNC3( scomplex, float,    double,   c, s, d, tfuncname, varname ) \
    GENTFUNC3( scomplex, float,    dcomplex, c, s, z, tfuncname, varname ) \
    \
    GENTFUNC3( scomplex, double,   float,    c, d, s, tfuncname, varname ) \
    GENTFUNC3( scomplex, double,   double,   c, d, d, tfuncname, varname ) \
    GENTFUNC3( scomplex, double,   scomplex, c, d, c, tfuncname, varname ) \
    GENTFUNC3( scomplex, double,   dcomplex, c, d, z, tfuncname, varname ) \
    \
    GENTFUNC3( scomplex, scomplex, double,   c, c, d, tfuncname, varname ) \
    GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \
    \
    GENTFUNC3( scomplex, dcomplex, float,    c, z, s, tfuncname, varname ) \
    GENTFUNC3( scomplex, dcomplex, double,   c, z, d, tfuncname, varname ) \
    GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \
    GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \
    \
    GENTFUNC3( dcomplex, float,    float,    z, s, s, tfuncname, varname ) \
    GENTFUNC3( dcomplex, float,    double,   z, s, d, tfuncname, varname ) \
    GENTFUNC3( dcomplex, float,    scomplex, z, s, c, tfuncname, varname ) \
    GENTFUNC3( dcomplex, float,    dcomplex, z, s, z, tfuncname, varname ) \
    \
    GENTFUNC3( dcomplex, double,   float,    z, d, s, tfuncname, varname ) \
    GENTFUNC3( dcomplex, double,   scomplex, z, d, c, tfuncname, varname ) \
    \
    GENTFUNC3( dcomplex, scomplex, float,    z, c, s, tfuncname, varname ) \
    GENTFUNC3( dcomplex, scomplex, double,   z, c, d, tfuncname, varname ) \
    GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \
    GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \
    \
    GENTFUNC3( dcomplex, dcomplex, float,    z, z, s, tfuncname, varname ) \
    GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    float,    double,   s, s, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    float,    dcomplex, s, s, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( float,    double,   float,    s, d, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    double,   double,   s, d, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    double,   scomplex, s, d, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    double,   dcomplex, s, d, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( float,    scomplex, double,   s, c, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( float,    dcomplex, float,    s, z, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    dcomplex, double,   s, z, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( double,   float,    float,    d, s, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   float,    double,   d, s, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   float,    scomplex, d, s, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   float,    dcomplex, d, s, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( double,   double,   float,    d, d, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   double,   scomplex, d, d, c, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( double,   scomplex, float,    d, c, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   scomplex, double,   d, c, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( double,   dcomplex, float,    d, z, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( scomplex, float,    double,   c, s, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, float,    dcomplex, c, s, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( scomplex, double,   float,    c, d, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, double,   double,   c, d, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, double,   scomplex, c, d, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, double,   dcomplex, c, d, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( scomplex, scomplex, double,   c, c, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( scomplex, dcomplex, float,    c, z, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, dcomplex, double,   c, z, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( dcomplex, float,    float,    z, s, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, float,    double,   z, s, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, float,    scomplex, z, s, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, float,    dcomplex, z, s, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( dcomplex, double,   float,    z, d, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, double,   scomplex, z, d, c, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( dcomplex, scomplex, float,    z, c, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, scomplex, double,   z, c, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( dcomplex, dcomplex, float,    z, z, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 )

// -- Basic three-operand with union of operands 1 and 2 --
// The fourth type/character accompanies the three operand types; in this
// basic family it equals the (shared) operand type.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \
    GENTFUNC3U12( float,    float,    float,    float,    s, s, s, s, tfuncname ) \
    GENTFUNC3U12( double,   double,   double,   double,   d, d, d, d, tfuncname ) \
    GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \
    GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \
    GENTFUNC3U12( float,    float,    float,    float,    s, s, s, s, tfuncname, varname ) \
    GENTFUNC3U12( double,   double,   double,   double,   d, d, d, d, tfuncname, varname ) \
    GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \
    GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNC3U12( float,    float,    float,    float,    s, s, s, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3U12( double,   double,   double,   double,   d, d, d, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname1, varname2 )

// -- Mixed domain three-operand with union of operands 1 and 2 --
// In every instance below the fourth type is complex whenever either of the
// first two operand types is complex, and real otherwise.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \
    GENTFUNC3U12( float,    float,    scomplex, float,    s, s, c, s, tfuncname ) \
    GENTFUNC3U12( float,    scomplex, float,    scomplex, s, c, s, c, tfuncname ) \
    GENTFUNC3U12( float,    scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \
    \
    GENTFUNC3U12( double,   double,   dcomplex, double,   d, d, z, d, tfuncname ) \
    GENTFUNC3U12( double,   dcomplex, double,   dcomplex, d, z, d, z, tfuncname ) \
    GENTFUNC3U12( double,   dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \
    \
    GENTFUNC3U12( scomplex, float,    float,    scomplex, c, s, s, c, tfuncname ) \
    GENTFUNC3U12( scomplex, float,    scomplex, scomplex, c, s, c, c, tfuncname ) \
    GENTFUNC3U12( scomplex, scomplex, float,    scomplex, c, c, s, c, tfuncname ) \
    \
    GENTFUNC3U12( dcomplex, double,   double,   dcomplex, z, d, d, z, tfuncname ) \
    GENTFUNC3U12( dcomplex, double,   dcomplex, dcomplex, z, d, z, z, tfuncname ) \
    GENTFUNC3U12( dcomplex, dcomplex, double,   dcomplex, z, z, d, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \
    GENTFUNC3U12( float,    float,    scomplex, float,    s, s, c, s, tfuncname, varname ) \
    GENTFUNC3U12( float,    scomplex, float,    scomplex, s, c, s, c, tfuncname, varname ) \
    GENTFUNC3U12( float,    scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \
    \
    GENTFUNC3U12( double,   double,   dcomplex, double,   d, d, z, d, tfuncname, varname ) \
    GENTFUNC3U12( double,   dcomplex, double,   dcomplex, d, z, d, z, tfuncname, varname ) \
    GENTFUNC3U12( double,   dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \
    \
    GENTFUNC3U12( scomplex, float,    float,    scomplex, c, s, s, c, tfuncname, varname ) \
    GENTFUNC3U12( scomplex, float,    scomplex, scomplex, c, s, c, c, tfuncname, varname ) \
    GENTFUNC3U12( scomplex, scomplex, float,    scomplex, c, c, s, c, tfuncname, varname ) \
    \
    GENTFUNC3U12( dcomplex, double,   double,   dcomplex, z,
d, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \ GENTFUNC3U12( float, 
double, dcomplex, double, s, d, z, d, tfuncname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, 
dcomplex, scomplex, c, c, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, 
dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, 
tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, 
double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( 
scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, 
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 ) #endif // end bli_gentfunc_macro_defs.h // begin bli_gentprot_macro_defs.h #ifndef BLIS_GENTPROT_MACRO_DEFS_H #define BLIS_GENTPROT_MACRO_DEFS_H // // -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS ----------------------------- // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- #define INSERT_GENTPROT_BLAS( blasname ) \ \ GENTPROT( float, s, blasname ) \ GENTPROT( double, d, blasname ) \ GENTPROT( scomplex, c, blasname ) \ GENTPROT( dcomplex, z, blasname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTPROTRO_BLAS( blasname ) \ \ GENTPROTRO( float, s, blasname ) \ GENTPROTRO( double, d, blasname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTPROTCO_BLAS( blasname ) \ \ GENTPROTCO( scomplex, float, c, s, blasname ) \ GENTPROTCO( dcomplex, double, z, d, blasname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTR_BLAS( blasname ) \ \ GENTPROTDOT( float, s, , blasname ) \ GENTPROTDOT( double, d, , blasname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTC_BLAS( blasname ) \ \ GENTPROTDOT( scomplex, c, c, blasname ) \ GENTPROTDOT( scomplex, c, u, blasname ) \ GENTPROTDOT( dcomplex, z, c, blasname ) \ GENTPROTDOT( dcomplex, z, u, blasname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTPROTDOT_BLAS( blasname ) \ \ INSERT_GENTPROTDOTR_BLAS( blasname ) \ INSERT_GENTPROTDOTC_BLAS( blasname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTPROTR_BLAS( rblasname, 
cblasname ) \ \ GENTPROTR( float, float, s, s, rblasname ) \ GENTPROTR( double, double, d, d, rblasname ) \ GENTPROTR( scomplex, float, c, s, cblasname ) \ GENTPROTR( dcomplex, double, z, d, cblasname ) // -- Alternate two-operand macro (one char for complex, one for real proj) -- #define INSERT_GENTPROTR2_BLAS( blasname ) \ \ GENTPROTR2( float, float, , s, blasname ) \ GENTPROTR2( double, double, , d, blasname ) \ GENTPROTR2( scomplex, float, c, s, blasname ) \ GENTPROTR2( dcomplex, double, z, d, blasname ) // -- Extended two-operand macro (used only for scal) -- #define INSERT_GENTPROTSCAL_BLAS( blasname ) \ \ GENTPROTSCAL( float, float, , s, blasname ) \ GENTPROTSCAL( double, double, , d, blasname ) \ GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \ GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \ GENTPROTSCAL( float, scomplex, s, c, blasname ) \ GENTPROTSCAL( double, dcomplex, d, z, blasname ) // -- Macros for functions with one operand ------------------------------------ // -- Basic one-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0( tfuncname ) \ \ GENTPROT( float, s, tfuncname ) \ GENTPROT( double, d, tfuncname ) \ GENTPROT( scomplex, c, tfuncname ) \ GENTPROT( dcomplex, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2 ) \ GENTPROT( double, d, tfuncname, varname1, varname2 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROT( float, s, tfuncname, 
varname1, varname2, varname3 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand with real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC0( tfuncname ) \ \ GENTPROTR( float, float, s, s, tfuncname ) \ GENTPROTR( double, double, d, d, tfuncname ) \ GENTPROTR( scomplex, float, c, s, tfuncname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname ) \ GENTPROTR( double, double, d, d, tfuncname, varname ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 
) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC0( tfuncname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0_I( funcname ) \ \ GENTPROT( float, s, funcname ) \ GENTPROT( double, d, funcname ) \ GENTPROT( scomplex, c, funcname ) \ GENTPROT( dcomplex, z, funcname ) \ GENTPROT( gint_t, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) \ GENTPROT( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTPROTI_BASIC0( funcname ) \
\
GENTPROTI( float, gint_t, s, i, funcname ) \
GENTPROTI( double, gint_t, d, i, funcname ) \
GENTPROTI( scomplex, gint_t, c, i, funcname ) \
GENTPROTI( dcomplex, gint_t, z, i, funcname )

// NOTE: Every INSERT_GENTPROT* macro in this section expands to a list of
// GENTPROT* invocations, one per datatype combination. Each invocation
// receives the C type name(s), the matching type character(s) -- s (float),
// d (double), c (scomplex), z (dcomplex), i (gint_t) -- and then the function
// name followed by any auxiliary arguments. The GENTPROT* macros themselves
// are not defined in this section; presumably the including code defines them
// before using an INSERT_ macro -- confirm against the call sites.
//
// The last entry of each list must NOT end in a line continuation: a trailing
// backslash would splice whatever line follows into the macro body.

// -- (one auxiliary argument) --

#define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \
\
GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \
GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \
GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \
GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname )


// -- Basic one-operand with real and integer projections --

// -- (no auxiliary arguments) --

// Each entry passes the operand type, its real projection (float for
// float/scomplex, double for double/dcomplex) and gint_t.

#define INSERT_GENTPROTRI_BASIC( funcname ) \
\
GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \
GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \
GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \
GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname )


// -- Macros for functions with two primary operands ---------------------------


// -- Basic two-operand macro --

// -- (no auxiliary arguments) --

// Both operands share the same datatype (4 combinations).

#define INSERT_GENTPROT2_BASIC0( funcname ) \
\
GENTPROT2( float, float, s, s, funcname ) \
GENTPROT2( double, double, d, d, funcname ) \
GENTPROT2( scomplex, scomplex, c, c, funcname ) \
GENTPROT2( dcomplex, dcomplex, z, z, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \
\
GENTPROT2( float, float, s, s, tfuncname, varname ) \
GENTPROT2( double, double, d, d, tfuncname, varname ) \
GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \
GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname )


// -- Mixed domain two-operand macro --

// -- (no auxiliary arguments) --

// Operands have the same precision but different domains (real vs. complex):
// 4 combinations.

#define INSERT_GENTPROT2_MIX_D0( funcname ) \
\
GENTPROT2( float, scomplex, s, c, funcname ) \
GENTPROT2( scomplex, float, c, s, funcname ) \
\
GENTPROT2( double, dcomplex, d, z, funcname ) \
GENTPROT2( dcomplex, double, z, d, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2_MIX_D( tfuncname, varname ) \
\
GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
\
GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
GENTPROT2( dcomplex, double, z, d, tfuncname, varname )


// -- Mixed precision two-operand macro --

// -- (no auxiliary arguments) --

// Operands differ in precision (single vs. double), in either domain:
// 8 combinations.

#define INSERT_GENTPROT2_MIX_P0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
\
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \
\
GENTPROT2( float, double, s, d, tfuncname, varname ) \
GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double, float, d, s, tfuncname, varname ) \
GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
\
GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro --

// -- (no auxiliary arguments) --

// All 12 combinations in which the two operand datatypes differ.
// NOTE(review): this name has no underscore between MIX and DP, unlike the
// one-argument INSERT_GENTPROT2_MIX_DP below; it is left unchanged so that
// any existing user keeps compiling.

#define INSERT_GENTPROT2_MIXDP0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, scomplex, s, c, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
GENTPROT2( double, dcomplex, d, z, funcname ) \
\
GENTPROT2( scomplex, float, c, s, funcname ) \
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, double, z, d, funcname ) \
GENTPROT2( dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \
\
GENTPROT2( float, double, s, d, tfuncname, varname ) \
GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double, float, d, s, tfuncname, varname ) \
GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
\
GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Basic two-operand with real projection of first operand --

// -- (no auxiliary arguments) --

// The third type/character is the real projection of the FIRST operand
// (float for float/scomplex, double for double/dcomplex).

#define INSERT_GENTPROT2R_BASIC0( funcname ) \
\
GENTPROT2R( float, float, float, s, s, s, funcname ) \
GENTPROT2R( double, double, double, d, d, d, funcname ) \
GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \
GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \
\
GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \
GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \
GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )


// -- Mixed domain two-operand with real projection of first operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \
\
GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \
\
GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \
GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname )

// -- (one auxiliary argument)
--

// Mixed-domain pairs (same precision, one real and one complex operand); the
// third type/character is the real projection of the FIRST operand (float for
// float/scomplex, double for double/dcomplex).

#define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \
\
GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname )


// -- Mixed precision two-operand with real projection of first operand --

// -- (no auxiliary arguments) --

// The 8 pairs whose operands differ in precision; the third type/character
// again follows the first operand.

#define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname )


// -- Macros for functions with three primary operands -------------------------

// Each GENTPROT3 entry receives three C type names, the three matching type
// characters (s = float, d = double, c = scomplex, z = dcomplex) and the
// function name. GENTPROT3 itself is not defined in this section; presumably
// the including code defines it before use -- confirm against the call sites.
// The BASIC (4), MIX_D (12) and MIX_P (48) lists below are disjoint and
// together enumerate all 64 type triples.


// -- Basic three-operand macro --

// All three operands share the same datatype.

#define INSERT_GENTPROT3_BASIC( funcname ) \
\
GENTPROT3( float, float, float, s, s, s, funcname ) \
GENTPROT3( double, double, double, d, d, d, funcname ) \
GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \
GENTPROT3( dcomplex, dcomplex, dcomplex, z, z, z, funcname )


// -- Mixed domain three-operand macro --

// Same precision throughout, but real and complex operands are mixed.

#define INSERT_GENTPROT3_MIX_D( funcname ) \
\
GENTPROT3( float, float, scomplex, s, s, c, funcname ) \
GENTPROT3( float, scomplex, float, s, c, s, funcname ) \
GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \
\
GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \
GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \
GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \
\
GENTPROT3( scomplex, float, float, c, s, s, funcname ) \
GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \
GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \
\
GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \
GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \
GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname )


// -- Mixed precision three-operand macro --

// Every triple in which at least two operands differ in precision. The last
// entry must NOT end in a line continuation: a trailing backslash would
// splice whatever line follows into the macro body.

#define INSERT_GENTPROT3_MIX_P( funcname ) \
\
GENTPROT3( float, float, double, s, s, d, funcname ) \
GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \
\
GENTPROT3( float, double, float, s, d, s, funcname ) \
GENTPROT3( float, double, double, s, d, d, funcname ) \
GENTPROT3( float, double, scomplex, s, d, c, funcname ) \
GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \
\
GENTPROT3( float, scomplex, double, s, c, d, funcname ) \
GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \
\
GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \
GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \
GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \
GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \
\
\
GENTPROT3( double, float, float, d, s, s, funcname ) \
GENTPROT3( double, float, double, d, s, d, funcname ) \
GENTPROT3( double, float, scomplex, d, s, c, funcname ) \
GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \
\
GENTPROT3( double, double, float, d, d, s, funcname ) \
GENTPROT3( double, double, scomplex, d, d, c, funcname ) \
\
GENTPROT3( double, scomplex, float, d, c, s, funcname ) \
GENTPROT3( double, scomplex, double, d, c, d, funcname ) \
GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \
GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \
\
GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \
GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \
\
\
GENTPROT3( scomplex, float, double, c, s, d, funcname ) \
GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \
\
GENTPROT3( scomplex, double, float, c, d, s, funcname ) \
GENTPROT3( scomplex, double, double, c, d, d, funcname ) \
GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \
GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \
\
GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \
GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \
\
GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \
GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \
GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \
GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \
\
\
GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \
GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \
GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \
GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \
\
GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \
GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \
\
GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \
GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \
GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \
GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \
\
GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \
GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname )


// -- Basic three-operand with union of operands 1 and 2 --

#define INSERT_GENTPROT3U12_BASIC( funcname ) \
\
GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \ GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname ) // -- Mixed domain three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_D( funcname ) \ \ GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \ GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \ GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \ \ GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \ GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \ GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \ \ GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \ GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \ \ GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \ GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname ) // -- Mixed precision three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_P( funcname ) \ \ GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \ GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \ \ GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \ GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \ GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \ GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \ \ GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \ GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \ \ GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, 
z, funcname ) \ GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \ GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \ GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \ \ \ GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \ GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \ GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \ GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \ \ GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \ GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \ \ GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \ GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \ GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \ GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \ \ GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \ GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \ \ \ GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \ GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \ \ GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \ GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \ GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \ GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \ \ GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \ \ GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \ GENTPROT3U12( 
scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \ \ \ GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \ GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \ GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \ GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \ GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \ \ GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname ) #endif // end bli_gentprot_macro_defs.h // begin bli_misc_macro_defs.h #ifndef BLIS_MISC_MACRO_DEFS_H #define BLIS_MISC_MACRO_DEFS_H // -- Miscellaneous macros -- // min, max, abs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) #define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) #define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) ) // fmin, fmax, fabs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_fmin( a, b ) bli_min( a, b ) #define bli_fmax( a, b ) bli_max( a, b ) #define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) ) // fminabs, fmaxabs // NOTE: These must remain macros since we don't know the types of a and b. 
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif // end bli_edge_case_macro_defs.h // begin bli_param_macro_defs.h #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ||
    ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) );
}

// Return true when the m x n region with diagonal offset diagoff lies wholly
// in the part that uplo does not store: strictly below the diagonal for
// upper storage, or strictly above the diagonal for lower storage.
BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n )
{
    return ( bool )
    ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ||
    ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) );
}

// pruning-related

// Lower-stored case, top edge. Always resets *offm_inc to 0. If *diagoff is
// negative, *m is reduced by |*diagoff|, *offm_inc receives |*diagoff|, and
// *diagoff becomes 0. *n is never modified.
BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc )
{
    *offm_inc = 0;

    // If the diagonal intersects the left side of the matrix,
    // ignore the area above that intersection.
    if ( *diagoff < 0 )
    {
        *m = *m + *diagoff;
        *offm_inc = - *diagoff;
        *diagoff = 0;
    }
}

// Lower-stored case, right edge. Always resets *offn_inc to 0 (no offset is
// ever reported) and clamps *n to at most *diagoff + *m.
BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc )
{
    *offn_inc = 0;

    // If the diagonal intersects the bottom side of the matrix,
    // ignore the area to the right of that intersection.
    if ( *n > *diagoff + *m )
    {
        *n = *diagoff + *m;
    }
}

// Upper-stored case, left edge. Always resets *offn_inc to 0. If *diagoff is
// positive, *n is reduced by *diagoff, *offn_inc receives *diagoff, and
// *diagoff becomes 0. *m is never modified.
BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc )
{
    *offn_inc = 0;

    // If the diagonal intersects the top side of the matrix,
    // ignore the area to the left of that intersection.
    if ( *diagoff > 0 )
    {
        *n = *n - *diagoff;
        *offn_inc = + *diagoff;
        *diagoff = 0;
    }
}

// Upper-stored case, bottom edge. Always resets *offm_inc to 0 (no offset is
// ever reported) and clamps *m to at most *n - *diagoff.
BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc )
{
    *offm_inc = 0;

    // If the diagonal intersects the right side of the matrix,
    // ignore the area below that intersection.
    if ( *m > -(*diagoff) + *n )
    {
        *m = -(*diagoff) + *n;
    }
}

// thread range-related

// Replace *diagoff with *n - *diagoff - *m and toggle *uplo. The dimensions
// themselves are read but not modified.
BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n )
{
    *diagoff = *n - *diagoff - *m;
    bli_toggle_uplo( uplo );
}

// Swap *m and *n, negate *diagoff and toggle *uplo.
BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n )
{
    bli_swap_dims( m, n );
    bli_negate_diag_offset( diagoff );
    bli_toggle_uplo( uplo );
}

// Map the range [*start, *end) to its mirror image within [0, n): the new
// start is n - old end and the new end is n - old start.
BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end )
{
    dim_t start2 = n - *start;
    dim_t end2 = n - *end;
    *start = end2;
    *end = start2;
}

// mdim_t-related

// True when mdim equals BLIS_M.
BLIS_INLINE bool bli_is_m_dim( mdim_t mdim )
{
    return ( bool )
    ( mdim == BLIS_M );
}

// True when mdim equals BLIS_N.
BLIS_INLINE bool bli_is_n_dim( mdim_t mdim )
{
    return ( bool )
    ( mdim == BLIS_N );
}

// Return BLIS_N when mdim is BLIS_M, and BLIS_M for any other value.
BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim )
{
    return ( mdim_t )
    ( mdim == BLIS_M ? BLIS_N : BLIS_M );
}

// In-place form of bli_dim_toggled().
BLIS_INLINE void bli_toggle_dim( mdim_t* mdim )
{
    *mdim = bli_dim_toggled( *mdim );
}

// stor3_t-related

// Compute a stor3_t id from the strides of C, A and B. Returns BLIS_XXX if
// any operand is general-stored; otherwise packs the three "is column-stored"
// flags into an integer as 4*c + 2*a + 1*b.
BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b )
{
    // If any matrix is general-stored, return the stor3_t id for the
    // general-purpose sup microkernel.
    if ( bli_is_gen_stored( rs_c, cs_c ) ||
         bli_is_gen_stored( rs_a, cs_a ) ||
         bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX;

    // Otherwise, compute and return the stor3_t id as follows.
    const bool c_is_col = bli_is_col_stored( rs_c, cs_c );
    const bool a_is_col = bli_is_col_stored( rs_a, cs_a );
    const bool b_is_col = bli_is_col_stored( rs_b, cs_b );

    return ( stor3_t )( 4 * c_is_col +
                        2 * a_is_col +
                        1 * b_is_col );
}

// Map a stor3_t id to the id of the transposed operation via the lookup
// table below (the active "#if 1" branch). id is used directly as an array
// index, so it must be in [0, BLIS_NUM_3OP_RC_COMBOS).
BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id )
{
#if 1
    stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
    =
    {
      ( stor3_t )7, // BLIS_RRR = 0 -> BLIS_CCC = 7
      ( stor3_t )5, // BLIS_RRC = 1 -> BLIS_CRC = 5
      ( stor3_t )6, // BLIS_RCR = 2 -> BLIS_CCR = 6
      ( stor3_t )4, // BLIS_RCC = 3 -> BLIS_CRR = 4
      ( stor3_t )3, // BLIS_CRR = 4 -> BLIS_RCC = 3
      ( stor3_t )1, // BLIS_CRC = 5 -> BLIS_RRC = 1
      ( stor3_t )2, // BLIS_CCR = 6 -> BLIS_RCR = 2
      ( stor3_t )0, // BLIS_CCC = 7 -> BLIS_RRR = 0
    };

    return map[id];
#else
    return   ( ( id & 0x4 ) ^ 0x4 )        | // flip c bit
           ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position
           ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 );  // flip a bit and move to b position
#endif
}

// Toggle bit 0 of the id (active "#else" branch); the disabled table documents
// the intended mapping.
BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id )
{
#if 0
    stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
    =
    {
      ( stor3_t )1, // BLIS_RRR = 0 -> BLIS_RRC = 1
      ( stor3_t )0, // BLIS_RRC = 1 -> BLIS_RRR = 0
      ( stor3_t )3, // BLIS_RCR = 2 -> BLIS_RCC = 3
      ( stor3_t )2, // BLIS_RCC = 3 -> BLIS_RCR = 2
      ( stor3_t )5, // BLIS_CRR = 4 -> BLIS_CRC = 5
      ( stor3_t )4, // BLIS_CRC = 5 -> BLIS_CRR = 4
      ( stor3_t )7, // BLIS_CCR = 6 -> BLIS_CCC = 7
      ( stor3_t )6, // BLIS_CCC = 7 -> BLIS_CCR = 6
    };

    return map[id];
#else
    return ( stor3_t )( id ^ 0x1 );
#endif
}

// Toggle bit 1 of the id (active "#else" branch); the disabled table documents
// the intended mapping.
BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id )
{
#if 0
    stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
    =
    {
      ( stor3_t )2, // BLIS_RRR = 0 -> BLIS_RCR = 2
      ( stor3_t )3, // BLIS_RRC = 1 -> BLIS_RCC = 3
      ( stor3_t )0, // BLIS_RCR = 2 -> BLIS_RRR = 0
      ( stor3_t )1, // BLIS_RCC = 3 -> BLIS_RRC = 1
      ( stor3_t )6, // BLIS_CRR = 4 -> BLIS_CCR = 6
      ( stor3_t )7, // BLIS_CRC = 5 -> BLIS_CCC = 7
      ( stor3_t )4, // BLIS_CCR = 6 -> BLIS_CRR = 4
      ( stor3_t )5, // BLIS_CCC = 7 -> BLIS_CRC = 5
    };

    return map[id];
#else
    return ( stor3_t )( id ^ 0x2 );
#endif
}

//
// index-related

// Forward traversal: true when i is the final iteration (n_iter - 1) and
// there is a nonzero leftover (n_left != 0).
BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
    ( i == n_iter - 1 && n_left != 0 );
}

// Logical negation of bli_is_edge_f().
BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
    ( i != n_iter - 1 || n_left == 0 );
}

// Backward traversal: true when i is 0 and there is a nonzero leftover.
// n_iter is accepted for signature parity but not read.
BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
    ( i == 0 && n_left != 0 );
}

// Logical negation of bli_is_edge_b().
BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
    ( i != 0 || n_left == 0 );
}

// True when i == end_iter - 1. tid and nth are not read; they are present so
// the signature matches bli_is_last_iter_rr().
BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
    return ( bool )
    ( i == end_iter - 1 );
}

// True when i equals end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ).
// NOTE(review): nth is used as a modulus, so it is presumably nonzero --
// confirm against callers.
BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
    return ( bool )
    ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
}

// Dispatch to the _sl variant when BLIS_ENABLE_JRIR_SLAB is defined, and to
// the _rr variant otherwise.
BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
#ifdef BLIS_ENABLE_JRIR_SLAB
    return bli_is_last_iter_sl( i, end_iter, tid, nth );
#else // BLIS_ENABLE_JRIR_RR
    return bli_is_last_iter_rr( i, end_iter, tid, nth );
#endif
}

// packbuf_t-related

// Extract the BLIS_PACK_BUFFER_BITS field of buf_type and shift it down by
// BLIS_PACK_BUFFER_SHIFT.
BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type )
{
    return ( guint_t )
    ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT );
}

// pack_t-related

// True when BLIS_PACK_BIT is set in schema.
BLIS_INLINE bool bli_is_packed( pack_t schema )
{
    return ( bool )
    ( schema & BLIS_PACK_BIT );
}

// True when the BLIS_PACK_RC_BIT field of schema equals
// BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS.
BLIS_INLINE bool bli_is_row_packed( pack_t schema )
{
    return ( bool )
    ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
                                        BLIS_BITVAL_PACKED_ROWS ) );
}

// True when the BLIS_PACK_RC_BIT field of schema equals
// BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS.
BLIS_INLINE bool bli_is_col_packed( pack_t schema )
{
    return ( bool )
    ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
                                        BLIS_BITVAL_PACKED_COLUMNS ) );
}

// True when BLIS_PACK_PANEL_BIT is set in schema.
BLIS_INLINE bool bli_is_panel_packed( pack_t schema )
{
    return ( bool )
    ( schema & BLIS_PACK_PANEL_BIT );
}

// True when the format field of schema equals BLIS_BITVAL_1R.
BLIS_INLINE bool bli_is_1r_packed( pack_t schema )
{
    return ( bool )
    ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R );
}

// True when the format field of schema equals BLIS_BITVAL_1E.
BLIS_INLINE bool bli_is_1e_packed( pack_t schema )
{
    return ( bool )
    ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E );
}

// True when schema is either 1r- or 1e-packed.
BLIS_INLINE bool bli_is_1m_packed( pack_t schema )
{
    return ( bool )
    ( bli_is_1r_packed( schema ) ||
      bli_is_1e_packed( schema ) );
}

// True when the format field of schema is zero.
BLIS_INLINE bool bli_is_nat_packed( pack_t schema )
{
    return ( bool )
    ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 );
}

// True when the format field of schema is nonzero.
BLIS_INLINE bool bli_is_ind_packed( pack_t schema )
{
    return ( bool )
    ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 );
}

// Extract the format field of schema and shift it down by
// BLIS_PACK_FORMAT_SHIFT.
BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema )
{
    return ( guint_t )
    ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT );
}

// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument.
//
// Outputs: *ij0 and *n_shift are always written. If A is entirely unstored,
// only *uplo_eff (set to BLIS_ZEROS) is written in addition; *n_elem_max,
// *n_iter, *inca and *lda are left untouched in that case. Otherwise all
// outputs are written. When the strides are row-tilted, the dimensions and
// increments are swapped and the effective uplo/diagonal offset are
// toggled/negated to match.
BLIS_INLINE void bli_set_dims_incs_uplo_1m
     (
       doff_t diagoffa, diag_t diaga,
       uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a,
       uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t* ij0, dim_t* n_shift
     )
{
    // This is to prevent the compiler from warning about uninitialized
    // variables.
    *ij0 = 0;
    *n_shift = 0;

    // If matrix A is entirely "unstored", that is, if either:
    // - A is lower-stored and entirely above the diagonal, or
    // - A is upper-stored and entirely below the diagonal
    // then we mark the storage as implicitly zero.
    if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
    {
        *uplo_eff = BLIS_ZEROS;
    }
    else
    {
        doff_t diagoffa_use_ = diagoffa;
        doff_t diagoff_eff_;
        dim_t n_iter_max_;

        // A unit diagonal is handled by shifting the working diagonal offset
        // via the helper below.
        if ( bli_is_unit_diag( diaga ) )
            bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

        // If matrix A is entirely "stored", that is, if either:
        // - A is upper-stored and entirely above the diagonal, or
        // - A is lower-stored and entirely below the diagonal
        // then we mark the storage as dense.
        if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
            uploa = BLIS_DENSE;

        n_iter_max_ = n;
        *n_elem_max = m;
        *inca = rs_a;
        *lda = cs_a;
        *uplo_eff = uploa;
        diagoff_eff_ = diagoffa_use_;

        // Row-tilted strides: iterate over the other dimension instead, which
        // requires viewing the matrix as its transpose.
        if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) )
        {
            bli_swap_dims( &n_iter_max_, n_elem_max );
            bli_swap_incs( inca, lda );
            bli_toggle_uplo( uplo_eff );
            bli_negate_diag_offset( &diagoff_eff_ );
        }

        if ( bli_is_dense( *uplo_eff ) )
        {
            *n_iter = n_iter_max_;
        }
        else if ( bli_is_upper( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0 = 0;
                *n_shift = -diagoff_eff_;
                *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
                *n_iter = n_iter_max_;
            }
            else
            {
                *ij0 = diagoff_eff_;
                *n_shift = 0;
                *n_iter = n_iter_max_ - diagoff_eff_;
            }
        }
        else // if ( bli_is_lower( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0 = -diagoff_eff_;
                *n_shift = 0;
                // NOTE: *n_elem_max must be updated before *n_iter, which
                // reads the updated value.
                *n_elem_max = *n_elem_max + diagoff_eff_;
                *n_iter = bli_min( *n_elem_max, bli_min( m, n ) );
            }
            else
            {
                *ij0 = 0;
                *n_shift = diagoff_eff_;
                *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
            }
        }
    }
}

// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument (without column-wise stride optimization).
//
// Identical to bli_set_dims_incs_uplo_1m() except that the row-tilted check
// is omitted, so *n_elem_max/*inca always start from m/rs_a and the
// iteration count from n/cs_a.
BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap
     (
       doff_t diagoffa, diag_t diaga,
       uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a,
       uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t* ij0, dim_t* n_shift
     )
{
    // This is to prevent the compiler from warning about uninitialized
    // variables.
    *ij0 = 0;
    *n_shift = 0;

    // If matrix A is entirely "unstored", that is, if either:
    // - A is lower-stored and entirely above the diagonal, or
    // - A is upper-stored and entirely below the diagonal
    // then we mark the storage as implicitly zero.
    if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
    {
        *uplo_eff = BLIS_ZEROS;
    }
    else
    {
        doff_t diagoffa_use_ = diagoffa;
        doff_t diagoff_eff_;
        dim_t n_iter_max_;

        if ( bli_is_unit_diag( diaga ) )
            bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

        // If matrix A is entirely "stored", that is, if either:
        // - A is upper-stored and entirely above the diagonal, or
        // - A is lower-stored and entirely below the diagonal
        // then we mark the storage as dense.
        if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
            uploa = BLIS_DENSE;

        n_iter_max_ = n;
        *n_elem_max = m;
        *inca = rs_a;
        *lda = cs_a;
        *uplo_eff = uploa;
        diagoff_eff_ = diagoffa_use_;

        if ( bli_is_dense( *uplo_eff ) )
        {
            *n_iter = n_iter_max_;
        }
        else if ( bli_is_upper( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0 = 0;
                *n_shift = -diagoff_eff_;
                *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
                *n_iter = n_iter_max_;
            }
            else
            {
                *ij0 = diagoff_eff_;
                *n_shift = 0;
                *n_iter = n_iter_max_ - diagoff_eff_;
            }
        }
        else // if ( bli_is_lower( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0 = -diagoff_eff_;
                *n_shift = 0;
                // NOTE: *n_elem_max must be updated before *n_iter, which
                // reads the updated value.
                *n_elem_max = *n_elem_max + diagoff_eff_;
                *n_iter = bli_min( *n_elem_max, bli_min( m, n ) );
            }
            else
            {
                *ij0 = 0;
                *n_shift = diagoff_eff_;
                *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
            }
        }
    }
}

// Set dimensions and increments for TWO matrix arguments.
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); } BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); } BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); } // NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, // packbuf_t is stored/used from the context in order to support various // induced methods. (Though ideally the packbuf_t field would only be // present in the control tree). BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); } BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc ); } BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj ) { bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); } BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj ) { bli_obj_apply_conj( BLIS_CONJUGATE, obj ); } BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; } // Root matrix query BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) { return ( obj_t* )( obj->root ); } BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( 
bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); } // Root matrix modification BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) { obj->root = obj; } // Diagonal offset query BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) ); } // Diagonal offset modification BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off = ( doff_t )offset; } BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj ) { obj->diag_off = -(obj->diag_off); } BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off += ( doff_t )offset; } // Dimension query BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) { return ( obj->dim[ mdim ] ); } BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) : bli_obj_length( obj ) ); } BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) ); } BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 ); } // Stride/increment query BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) { return ( obj->rs ); } BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) { return ( obj->cs ); } BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) { return ( obj->is ); } BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); } // Note: The purpose of these functions is to obtain the length and width // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) ); } BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 ); } // Dimension modification BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj ) { obj->dim[ BLIS_M ] = m; } BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj ) { obj->dim[ BLIS_N ] = n; } BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) { obj->dim[ mdim ] = dim_val; } BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) { if ( bli_does_notrans( trans ) ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } else // if ( bli_does_trans( trans ) ) { bli_obj_set_length( n, obj ); bli_obj_set_width( m, obj ); } } // Stride/increment predicates // // NOTE: The following two macros differ from their non-obj counterparts // in that they do not identify m x 1 and 1 x n objects as row-stored and // column-stored, respectively, which is needed when considering packed // objects. But this is okay, since none of the invocations of these // "obj" macros are used on packed matrices. 
// BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); } // Stride/increment modification BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) { obj->rs = rs; } BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) { obj->cs = cs; } BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_row_stride( rs, obj ); bli_obj_set_col_stride( cs, obj ); } BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) { obj->is = is; } // Offset query BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) { return ( obj->off[ mdim ] ); } // Offset modification BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] = offset; } BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_set_off( BLIS_M, offm, obj ); bli_obj_set_off( BLIS_N, offn, obj ); } BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] += offset; } BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_inc_off( BLIS_M, offm, obj ); bli_obj_inc_off( BLIS_N, offn, obj ); } // Diagonal offset predicates BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) { return ( 
bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); } // Buffer address query BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) { return ( void* ) ( obj->buffer ); } // Buffer address modification BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) { obj->buffer = p; } // Bufferless scalar field query BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); } // Bufferless scalar field modification BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); } // Element size modification BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) { obj->elem_size = size; } // Packed matrix info query BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) { return ( obj->m_padded ); } BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) { return ( obj->n_padded ); } // Packed matrix info modification BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj ) { obj->m_padded = m; } BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj ) { obj->n_padded = n; } BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) { 
bli_obj_set_padded_length( m, obj ); bli_obj_set_padded_width( n, obj ); } // Packed panel info query BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) { return ( obj->m_panel ); } BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) { return ( obj->n_panel ); } BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) { return ( obj->pd ); } BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) { return ( obj->ps ); } // Packed panel info modification BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj ) { obj->m_panel = m; } BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj ) { obj->n_panel = n; } BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_panel_length( m, obj ); bli_obj_set_panel_width( n, obj ); } BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) { obj->pd = pd; } BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) { obj->ps = ps; } // stor3_t-related BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); inc_t rs_a, cs_a; inc_t rs_b, cs_b; if ( bli_obj_has_notrans( a ) ) { rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else { rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else { rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b ); } // -- User-provided information macros -- // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { return obj->pack_fn; } BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker_fn; } BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { return obj->ker_params; } // Function pointer modification 
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) { const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); bli_obj_set_pack_schema( schema_b, a ); bli_obj_set_pack_schema( schema_a, b ); } // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. BLIS_INLINE void bli_obj_induce_trans( obj_t* obj ) { // Induce transposition among basic fields. dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); if ( bli_obj_is_upper_or_lower( obj ) ) bli_obj_toggle_uplo( obj ); // Induce transposition among packed fields. dim_t m_padded = bli_obj_padded_length( obj ); dim_t n_padded = bli_obj_padded_width( obj ); dim_t m_panel = bli_obj_panel_length( obj ); dim_t n_panel = bli_obj_panel_width( obj ); bli_obj_set_padded_dims( n_padded, m_padded, obj ); bli_obj_set_panel_dims( n_panel, m_panel, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj ) { // NOTE: This function is only used in situations where the matrices // are guaranteed to not have structure or be packed. // Induce transposition among basic fields. 
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif // end bli_obj_macro_defs.h // begin bli_complex_macro_defs.h #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define 
bli_zimag( x ) ( cimag(x) ) #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_complex_macro_defs.h // begin bli_scalar_macro_defs.h #ifndef BLIS_SCALAR_MACRO_DEFS_H #define BLIS_SCALAR_MACRO_DEFS_H // -- Assignment/Accessor macros -- // NOTE: This macro is defined first since some of the other scalar macros // use it to abstract away the method used to assign complex values (ie: // whether fields of a struct are set directly or whether native C99 // assignment is used). // begin bli_sets.h #ifndef BLIS_SETS_H #define BLIS_SETS_H // sets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sssets( xr, xi, y ) { (y) = (xr); } #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } #define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } #define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, 
xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif // end bli_sets.h // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. // begin bli_setrs.h #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Real destinations: setting the real part is a plain assignment.
#define bli_sssetrs( xr, y )  { (y) = (xr); }
#define bli_dssetrs( xr, y )  { (y) = (xr); }

#define bli_sdsetrs( xr, y )  { (y) = (xr); }
#define bli_ddsetrs( xr, y )  { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct representation: write the real field; the imaginary field is not
// touched.
#define bli_scsetrs( xr, y )  { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y )  { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y )  { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y )  { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Native C99 complex: rebuild y from the new real part and the current
// imaginary part of y.
#define bli_scsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX


// Single-letter entry points, keyed on the type of y.
#define bli_ssetrs( xr, y )  bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y )  bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y )  bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y )  bli_dzsetrs( xr, y )


#endif
// end bli_setrs.h
// begin bli_setis.h


#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Real destinations have no imaginary part, so these expand to an empty
// statement block.
#define bli_sssetis( xi, y )  { ; }
#define bli_dssetis( xi, y )  { ; }

#define bli_sdsetis( xi, y )  { ; }
#define bli_ddsetis( xi, y )  { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct representation: write the imaginary field; the real field is not
// touched.
#define bli_scsetis( xi, y )  { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y )  { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y )  { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y )  { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Native C99 complex: rebuild y from the current real part of y and the new
// imaginary part.
#define bli_scsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX


// Single-letter entry points, keyed on the type of y.
#define bli_ssetis( xi, y )  bli_sssetis( xi, y )
#define bli_dsetis( xi, y )  bli_ddsetis( xi, y )
#define bli_csetis( xi, y )  bli_scsetis( xi, y )
#define bli_zsetis( xi, y )  bli_dzsetis( xi, y )


#endif
// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h


#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_ssgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; } #define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; } #define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; } #define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; } #define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; } #define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi ) #define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi ) #define bli_cgets( x, yr, yi ) 
bli_csgets( x, yr, yi ) #define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi ) #define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi ) #endif // end bli_gets.h // -- Scalar constant initialization macros -- // begin bli_constants.h #ifndef BLIS_CONSTANTS_H #define BLIS_CONSTANTS_H // return pointers to constants // 1 #define bli_s1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ONE ) ) #define bli_d1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ONE ) ) #define bli_c1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) ) #define bli_z1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) ) #define bli_i1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ONE ) ) // 0 #define bli_s0 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ZERO ) ) #define bli_d0 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ZERO ) ) #define bli_c0 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) ) #define bli_z0 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) ) #define bli_i0 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ZERO ) ) // -1 #define bli_sm1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_MINUS_ONE ) ) #define bli_dm1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_MINUS_ONE ) ) #define bli_cm1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_zm1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_im1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_MINUS_ONE ) ) #endif // end bli_constants.h // -- Separated scalar macros (separated real/imaginary values) -- // begin bli_absq2ris.h #ifndef BLIS_ABSQ2RIS_H #define BLIS_ABSQ2RIS_H // absq2ris #define bli_sabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define bli_dabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define 
bli_cabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0F; \ } #define bli_zabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0; \ } #endif // end bli_absq2ris.h // begin bli_abval2ris.h #ifndef BLIS_ABVAL2RIS_H #define BLIS_ABVAL2RIS_H // abval2ris #define bli_sabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabsf(xr); \ } #define bli_dabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabs(xr); \ } #define bli_cabval2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0F; \ } #define bli_zabval2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0; \ } #endif // end bli_abval2ris.h // begin bli_addris.h #ifndef BLIS_ADDRIS_H #define BLIS_ADDRIS_H // addris #define bli_saddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_daddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_caddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #define bli_zaddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #endif // end bli_addris.h // begin bli_addjris.h #ifndef BLIS_ADDJRIS_H #define BLIS_ADDJRIS_H // addjris #define bli_saddjris( ar, ai, xr, xi ) bli_saddris( (ar), -(ai), (xr), (xi) ) #define bli_daddjris( ar, ai, xr, xi ) bli_daddris( (ar), -(ai), (xr), (xi) ) #define bli_caddjris( ar, ai, xr, xi ) bli_caddris( (ar), -(ai), (xr), (xi) ) #define bli_zaddjris( ar, ai, xr, xi ) bli_zaddris( (ar), -(ai), (xr), (xi) ) #endif // end bli_addjris.h // begin bli_add3ris.h #ifndef BLIS_ADD3RIS_H #define BLIS_ADD3RIS_H // add3ris #define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_dadd3ris( ar, 
ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #endif // end bli_add3ris.h // begin bli_axpbyris.h #ifndef BLIS_AXPBYRIS_H #define BLIS_AXPBYRIS_H // axpbyris #define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyris bli_rxxpbyris #define bli_dsssxpbyris bli_rxxpbyris #define bli_csssxpbyris bli_rxxpbyris #define bli_zsssxpbyris bli_rxxpbyris #define bli_sdssxpbyris bli_rxxpbyris #define bli_ddssxpbyris bli_rxxpbyris #define bli_cdssxpbyris bli_rxxpbyris #define bli_zdssxpbyris bli_rxxpbyris #define bli_scssxpbyris bli_rxxpbyris #define bli_dcssxpbyris bli_rxxpbyris #define bli_ccssxpbyris bli_rxxpbyris #define bli_zcssxpbyris bli_rxxpbyris #define bli_szssxpbyris bli_rxxpbyris #define bli_dzssxpbyris bli_rxxpbyris #define bli_czssxpbyris bli_rxxpbyris #define bli_zzssxpbyris bli_rxxpbyris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyris. 
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
#define bli_saxpbyjris bli_ssssaxpbyjris #define bli_daxpbyjris bli_ddddaxpbyjris #define bli_caxpbyjris bli_ccccaxpbyjris #define bli_zaxpbyjris bli_zzzzaxpbyjris #endif // end bli_axpbyjris.h // begin bli_axpyris.h #ifndef BLIS_AXPYRIS_H #define BLIS_AXPYRIS_H // axpyris #define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ } #define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr) - (ai) * (xi); \ (yi) += (ai) * (xr) + (ar) * (xi); \ } #define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr) - (ai) * (xi); \ } #define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ar) * (xi); \ } #define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyris bli_rxaxpyris #define bli_dssaxpyris bli_rxaxpyris #define bli_cssaxpyris bli_rxaxpyris #define bli_zssaxpyris bli_rxaxpyris #define bli_sdsaxpyris bli_rxaxpyris #define bli_ddsaxpyris bli_rxaxpyris #define bli_cdsaxpyris bli_rxaxpyris #define bli_zdsaxpyris bli_rxaxpyris #define bli_scsaxpyris bli_rxaxpyris #define bli_dcsaxpyris bli_rxaxpyris #define bli_ccsaxpyris bli_roaxpyris #define bli_zcsaxpyris bli_roaxpyris #define bli_szsaxpyris bli_rxaxpyris #define bli_dzsaxpyris bli_rxaxpyris #define bli_czsaxpyris bli_roaxpyris #define bli_zzsaxpyris bli_roaxpyris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyris bli_rxaxpyris #define bli_dsdaxpyris bli_rxaxpyris #define bli_csdaxpyris bli_rxaxpyris #define bli_zsdaxpyris bli_rxaxpyris #define bli_sddaxpyris bli_rxaxpyris #define bli_dddaxpyris bli_rxaxpyris #define bli_cddaxpyris bli_rxaxpyris #define bli_zddaxpyris bli_rxaxpyris #define bli_scdaxpyris 
bli_rxaxpyris #define bli_dcdaxpyris bli_rxaxpyris #define bli_ccdaxpyris bli_roaxpyris #define bli_zcdaxpyris bli_roaxpyris #define bli_szdaxpyris bli_rxaxpyris #define bli_dzdaxpyris bli_rxaxpyris #define bli_czdaxpyris bli_roaxpyris #define bli_zzdaxpyris bli_roaxpyris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyris bli_rxaxpyris #define bli_dscaxpyris bli_rxaxpyris #define bli_cscaxpyris bli_rcaxpyris #define bli_zscaxpyris bli_rcaxpyris #define bli_sdcaxpyris bli_rxaxpyris #define bli_ddcaxpyris bli_rxaxpyris #define bli_cdcaxpyris bli_rcaxpyris #define bli_zdcaxpyris bli_rcaxpyris #define bli_sccaxpyris bli_craxpyris #define bli_dccaxpyris bli_craxpyris #define bli_cccaxpyris bli_cxaxpyris #define bli_zccaxpyris bli_cxaxpyris #define bli_szcaxpyris bli_craxpyris #define bli_dzcaxpyris bli_craxpyris #define bli_czcaxpyris bli_cxaxpyris #define bli_zzcaxpyris bli_cxaxpyris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyris bli_rxaxpyris #define bli_dszaxpyris bli_rxaxpyris #define bli_cszaxpyris bli_rcaxpyris #define bli_zszaxpyris bli_rcaxpyris #define bli_sdzaxpyris bli_rxaxpyris #define bli_ddzaxpyris bli_rxaxpyris #define bli_cdzaxpyris bli_rcaxpyris #define bli_zdzaxpyris bli_rcaxpyris #define bli_sczaxpyris bli_craxpyris #define bli_dczaxpyris bli_craxpyris #define bli_cczaxpyris bli_cxaxpyris #define bli_zczaxpyris bli_cxaxpyris #define bli_szzaxpyris bli_craxpyris #define bli_dzzaxpyris bli_craxpyris #define bli_czzaxpyris bli_cxaxpyris #define bli_zzzaxpyris bli_cxaxpyris #define bli_saxpyris bli_sssaxpyris #define bli_daxpyris bli_dddaxpyris #define bli_caxpyris bli_cccaxpyris #define bli_zaxpyris bli_zzzaxpyris #endif // end bli_axpyris.h // begin bli_axpyjris.h #ifndef BLIS_AXPYJRIS_H #define BLIS_AXPYJRIS_H // axpyjris #define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ } #define bli_cxaxpyjris( ar, ai, xr, xi, yr, 
yi ) \ { \ (yr) += (ar) * (xr) + (ai) * (xi); \ (yi) += (ai) * (xr) - (ar) * (xi); \ } #define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr) + (ai) * (xi); \ } #define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ar) * -(xi); \ } #define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyjris bli_rxaxpyjris #define bli_dssaxpyjris bli_rxaxpyjris #define bli_cssaxpyjris bli_rxaxpyjris #define bli_zssaxpyjris bli_rxaxpyjris #define bli_sdsaxpyjris bli_rxaxpyjris #define bli_ddsaxpyjris bli_rxaxpyjris #define bli_cdsaxpyjris bli_rxaxpyjris #define bli_zdsaxpyjris bli_rxaxpyjris #define bli_scsaxpyjris bli_rxaxpyjris #define bli_dcsaxpyjris bli_rxaxpyjris #define bli_ccsaxpyjris bli_roaxpyjris #define bli_zcsaxpyjris bli_roaxpyjris #define bli_szsaxpyjris bli_rxaxpyjris #define bli_dzsaxpyjris bli_rxaxpyjris #define bli_czsaxpyjris bli_roaxpyjris #define bli_zzsaxpyjris bli_roaxpyjris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyjris bli_rxaxpyjris #define bli_dsdaxpyjris bli_rxaxpyjris #define bli_csdaxpyjris bli_rxaxpyjris #define bli_zsdaxpyjris bli_rxaxpyjris #define bli_sddaxpyjris bli_rxaxpyjris #define bli_dddaxpyjris bli_rxaxpyjris #define bli_cddaxpyjris bli_rxaxpyjris #define bli_zddaxpyjris bli_rxaxpyjris #define bli_scdaxpyjris bli_rxaxpyjris #define bli_dcdaxpyjris bli_rxaxpyjris #define bli_ccdaxpyjris bli_roaxpyjris #define bli_zcdaxpyjris bli_roaxpyjris #define bli_szdaxpyjris bli_rxaxpyjris #define bli_dzdaxpyjris bli_rxaxpyjris #define bli_czdaxpyjris bli_roaxpyjris #define bli_zzdaxpyjris bli_roaxpyjris // -- (axy) = (??c) 
------------------------------------------------------------ #define bli_sscaxpyjris bli_rxaxpyjris #define bli_dscaxpyjris bli_rxaxpyjris #define bli_cscaxpyjris bli_rcaxpyjris #define bli_zscaxpyjris bli_rcaxpyjris #define bli_sdcaxpyjris bli_rxaxpyjris #define bli_ddcaxpyjris bli_rxaxpyjris #define bli_cdcaxpyjris bli_rcaxpyjris #define bli_zdcaxpyjris bli_rcaxpyjris #define bli_sccaxpyjris bli_craxpyjris #define bli_dccaxpyjris bli_craxpyjris #define bli_cccaxpyjris bli_cxaxpyjris #define bli_zccaxpyjris bli_cxaxpyjris #define bli_szcaxpyjris bli_craxpyjris #define bli_dzcaxpyjris bli_craxpyjris #define bli_czcaxpyjris bli_cxaxpyjris #define bli_zzcaxpyjris bli_cxaxpyjris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyjris bli_rxaxpyjris #define bli_dszaxpyjris bli_rxaxpyjris #define bli_cszaxpyjris bli_rcaxpyjris #define bli_zszaxpyjris bli_rcaxpyjris #define bli_sdzaxpyjris bli_rxaxpyjris #define bli_ddzaxpyjris bli_rxaxpyjris #define bli_cdzaxpyjris bli_rcaxpyjris #define bli_zdzaxpyjris bli_rcaxpyjris #define bli_sczaxpyjris bli_craxpyjris #define bli_dczaxpyjris bli_craxpyjris #define bli_cczaxpyjris bli_cxaxpyjris #define bli_zczaxpyjris bli_cxaxpyjris #define bli_szzaxpyjris bli_craxpyjris #define bli_dzzaxpyjris bli_craxpyjris #define bli_czzaxpyjris bli_cxaxpyjris #define bli_zzzaxpyjris bli_cxaxpyjris #define bli_saxpyjris bli_sssaxpyjris #define bli_daxpyjris bli_dddaxpyjris #define bli_caxpyjris bli_cccaxpyjris #define bli_zaxpyjris bli_zzzaxpyjris #endif // end bli_axpyjris.h // begin bli_axmyris.h #ifndef BLIS_AXMYRIS_H #define BLIS_AXMYRIS_H // axmyris #define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ } #define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ } #define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr) - (ai) * (xi); \ (yi) -= (ai) * (xr) + (ar) * (xi); \ } #define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= 
(ar) * (xr) - (ai) * (xi); \ (yi) -= (ai) * (xr) + (ar) * (xi); \ } #define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ (yi) -= (ar) * (xi); \ } #define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ (yi) -= (ar) * (xi); \ } #endif // end bli_axmyris.h // begin bli_conjris.h #ifndef BLIS_CONJRIS_H #define BLIS_CONJRIS_H // conjris #define bli_sconjris( xr, xi ) \ { \ ; \ } #define bli_dconjris( xr, xi ) \ { \ ; \ } #define bli_cconjris( xr, xi ) \ { \ (xi) = -(xi); \ } #define bli_zconjris( xr, xi ) \ { \ (xi) = -(xi); \ } #endif // end bli_conjris.h // begin bli_copyris.h #ifndef BLIS_COPYRIS_H #define BLIS_COPYRIS_H // copyris #define bli_scopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ } #define bli_dcopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ } #define bli_ccopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ (bi) = (ai); \ } #define bli_zcopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ (bi) = (ai); \ } #define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi ) #define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi ) #define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) #define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) #define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi ) #define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi ) #define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) #define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) #define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi ) #define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi ) #define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) #define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) #define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi ) #define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi ) #define bli_czcopyris( ar, ai, 
br, bi ) bli_zcopyris( ar, ai, br, bi ) #define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi ) #endif // end bli_copyris.h // begin bli_copyjris.h #ifndef BLIS_COPYJRIS_H #define BLIS_COPYJRIS_H // copyjris #define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) ) #define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) ) #define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) ) #define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) ) #define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi ) #define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi ) #define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) #define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) #define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi ) #define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi ) #define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) #define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) #define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi ) #define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi ) #define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) #define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) #define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi ) #define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi ) #define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) #define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) #endif // end bli_copyjris.h // begin bli_copycjris.h #ifndef BLIS_COPYCJRIS_H #define BLIS_COPYCJRIS_H // copycjris #define bli_scopycjris( conj, xr, xi, yr, yi ) \ { \ bli_scopyris( (xr), (xi), (yr), (yi) ); \ } #define bli_dcopycjris( conj, xr, xi, yr, yi ) \ { \ 
bli_dcopyris( (xr), (xi), (yr), (yi) ); \ } #define bli_ccopycjris( conj, xr, xi, yr, yi ) \ { \ (yr) = (xr); \ (yi) = ( bli_is_conj( conj ) ? -(xi) \ : (xi) ); \ } #define bli_zcopycjris( conj, xr, xi, yr, yi ) \ { \ (yr) = (xr); \ (yi) = ( bli_is_conj( conj ) ? -(xi) \ : (xi) ); \ } #define bli_icopycjris( conj, xr, xi, yr, yi ) \ { \ bli_icopyris( (xr), (xi), (yr), (yi) ); \ } #endif // end bli_copycjris.h // begin bli_eqris.h #ifndef BLIS_EQRIS_H #define BLIS_EQRIS_H // eqris (passed by value) #define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) ) #define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) ) #define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) ) #define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) ) #define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) ) // eq1ris #define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F ) #define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 ) #define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F ) #define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 ) #define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 ) // eq0ris #define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F ) #define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 ) #define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F ) #define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 ) #define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 ) // eqm1ris #define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F ) #define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 ) #define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F ) #define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 ) #define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 ) #endif // end bli_eqris.h // begin bli_invertris.h #ifndef BLIS_INVERTRIS_H #define BLIS_INVERTRIS_H // invertris #define bli_sinvertris( xr, xi ) \ { \ (xr) = 1.0F / (xr); \ } 
#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2ris bli_rxscal2ris #define bli_dssscal2ris bli_rxscal2ris #define bli_cssscal2ris bli_rxscal2ris #define bli_zssscal2ris bli_rxscal2ris #define bli_sdsscal2ris bli_rxscal2ris #define bli_ddsscal2ris bli_rxscal2ris #define bli_cdsscal2ris bli_rxscal2ris #define bli_zdsscal2ris bli_rxscal2ris #define bli_scsscal2ris bli_rxscal2ris #define bli_dcsscal2ris bli_rxscal2ris #define bli_ccsscal2ris bli_roscal2ris #define bli_zcsscal2ris bli_roscal2ris #define bli_szsscal2ris bli_rxscal2ris #define bli_dzsscal2ris bli_rxscal2ris #define bli_czsscal2ris bli_roscal2ris #define bli_zzsscal2ris bli_roscal2ris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2ris bli_rxscal2ris #define bli_dsdscal2ris bli_rxscal2ris #define bli_csdscal2ris bli_rxscal2ris #define bli_zsdscal2ris bli_rxscal2ris #define bli_sddscal2ris bli_rxscal2ris #define bli_dddscal2ris bli_rxscal2ris #define bli_cddscal2ris bli_rxscal2ris #define bli_zddscal2ris bli_rxscal2ris #define bli_scdscal2ris bli_rxscal2ris #define bli_dcdscal2ris bli_rxscal2ris #define bli_ccdscal2ris bli_roscal2ris #define bli_zcdscal2ris bli_roscal2ris #define bli_szdscal2ris bli_rxscal2ris #define bli_dzdscal2ris bli_rxscal2ris #define bli_czdscal2ris bli_roscal2ris #define bli_zzdscal2ris bli_roscal2ris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2ris bli_rxscal2ris #define bli_dscscal2ris bli_rxscal2ris #define bli_cscscal2ris bli_rcscal2ris #define bli_zscscal2ris bli_rcscal2ris #define bli_sdcscal2ris bli_rxscal2ris #define bli_ddcscal2ris bli_rxscal2ris #define bli_cdcscal2ris bli_rcscal2ris #define bli_zdcscal2ris bli_rcscal2ris #define bli_sccscal2ris bli_crscal2ris #define bli_dccscal2ris bli_crscal2ris #define bli_cccscal2ris bli_cxscal2ris #define bli_zccscal2ris bli_cxscal2ris #define bli_szcscal2ris bli_crscal2ris 
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Mixed-datatype aliases for scal2jris. Unlike the scal2ris tables these
// are function-like macros that forward all six arguments unchanged to one
// of the rx / cx / ro / cr / rc scal2jris kernels.
// -- (axy) = (??s) ------------------------------------------------------------
#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
// -- (axy) = (??d) ------------------------------------------------------------
#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
// -- (axy) = (??c) ------------------------------------------------------------
#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = (xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyris bli_rxxpbyris #define bli_dssxpbyris bli_rxxpbyris #define bli_cssxpbyris bli_rxxpbyris #define bli_zssxpbyris bli_rxxpbyris #define bli_sdsxpbyris bli_rxxpbyris #define bli_ddsxpbyris bli_rxxpbyris #define bli_cdsxpbyris bli_rxxpbyris #define bli_zdsxpbyris bli_rxxpbyris #define bli_scsxpbyris bli_rxxpbyris #define bli_dcsxpbyris bli_rxxpbyris #define bli_ccsxpbyris bli_rxxpbyris #define bli_zcsxpbyris bli_rxxpbyris #define bli_szsxpbyris bli_rxxpbyris #define bli_dzsxpbyris bli_rxxpbyris #define bli_czsxpbyris bli_rxxpbyris #define bli_zzsxpbyris bli_rxxpbyris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyris bli_rxxpbyris #define bli_dsdxpbyris bli_rxxpbyris #define bli_csdxpbyris bli_rxxpbyris #define bli_zsdxpbyris bli_rxxpbyris #define bli_sddxpbyris bli_rxxpbyris #define bli_dddxpbyris bli_rxxpbyris #define bli_cddxpbyris bli_rxxpbyris #define bli_zddxpbyris bli_rxxpbyris #define bli_scdxpbyris bli_rxxpbyris #define bli_dcdxpbyris bli_rxxpbyris #define bli_ccdxpbyris bli_rxxpbyris #define bli_zcdxpbyris bli_rxxpbyris #define bli_szdxpbyris bli_rxxpbyris #define bli_dzdxpbyris bli_rxxpbyris #define bli_czdxpbyris bli_rxxpbyris #define bli_zzdxpbyris bli_rxxpbyris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyris bli_rxxpbyris #define bli_dscxpbyris 
bli_rxxpbyris #define bli_cscxpbyris bli_crxpbyris #define bli_zscxpbyris bli_crxpbyris #define bli_sdcxpbyris bli_rxxpbyris #define bli_ddcxpbyris bli_rxxpbyris #define bli_cdcxpbyris bli_crxpbyris #define bli_zdcxpbyris bli_crxpbyris #define bli_sccxpbyris bli_cxxpbyris #define bli_dccxpbyris bli_cxxpbyris #define bli_cccxpbyris bli_cxxpbyris #define bli_zccxpbyris bli_cxxpbyris #define bli_szcxpbyris bli_cxxpbyris #define bli_dzcxpbyris bli_cxxpbyris #define bli_czcxpbyris bli_cxxpbyris #define bli_zzcxpbyris bli_cxxpbyris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyris bli_rxxpbyris #define bli_dszxpbyris bli_rxxpbyris #define bli_cszxpbyris bli_crxpbyris #define bli_zszxpbyris bli_crxpbyris #define bli_sdzxpbyris bli_rxxpbyris #define bli_ddzxpbyris bli_rxxpbyris #define bli_cdzxpbyris bli_crxpbyris #define bli_zdzxpbyris bli_crxpbyris #define bli_sczxpbyris bli_cxxpbyris #define bli_dczxpbyris bli_cxxpbyris #define bli_cczxpbyris bli_cxxpbyris #define bli_zczxpbyris bli_cxxpbyris #define bli_szzxpbyris bli_cxxpbyris #define bli_dzzxpbyris bli_cxxpbyris #define bli_czzxpbyris bli_cxxpbyris #define bli_zzzxpbyris bli_cxxpbyris #define bli_sxpbyris bli_sssxpbyris #define bli_dxpbyris bli_dddxpbyris #define bli_cxpbyris bli_cccxpbyris #define bli_zxpbyris bli_zzzxpbyris #endif // end bli_xpbyris.h // begin bli_xpbyjris.h #ifndef BLIS_XPBYJRIS_H #define BLIS_XPBYJRIS_H // xpbyjris #define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type 
of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjris bli_rxxpbyjris #define bli_dssxpbyjris bli_rxxpbyjris #define bli_cssxpbyjris bli_rxxpbyjris #define bli_zssxpbyjris bli_rxxpbyjris #define bli_sdsxpbyjris bli_rxxpbyjris #define bli_ddsxpbyjris bli_rxxpbyjris #define bli_cdsxpbyjris bli_rxxpbyjris #define bli_zdsxpbyjris bli_rxxpbyjris #define bli_scsxpbyjris bli_rxxpbyjris #define bli_dcsxpbyjris bli_rxxpbyjris #define bli_ccsxpbyjris bli_rxxpbyjris #define bli_zcsxpbyjris bli_rxxpbyjris #define bli_szsxpbyjris bli_rxxpbyjris #define bli_dzsxpbyjris bli_rxxpbyjris #define bli_czsxpbyjris bli_rxxpbyjris #define bli_zzsxpbyjris bli_rxxpbyjris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjris bli_rxxpbyjris #define bli_dsdxpbyjris bli_rxxpbyjris #define bli_csdxpbyjris bli_rxxpbyjris #define bli_zsdxpbyjris bli_rxxpbyjris #define bli_sddxpbyjris bli_rxxpbyjris #define bli_dddxpbyjris bli_rxxpbyjris #define bli_cddxpbyjris bli_rxxpbyjris #define bli_zddxpbyjris bli_rxxpbyjris #define bli_scdxpbyjris bli_rxxpbyjris #define bli_dcdxpbyjris bli_rxxpbyjris #define bli_ccdxpbyjris bli_rxxpbyjris #define bli_zcdxpbyjris bli_rxxpbyjris #define bli_szdxpbyjris bli_rxxpbyjris #define bli_dzdxpbyjris bli_rxxpbyjris #define bli_czdxpbyjris bli_rxxpbyjris #define bli_zzdxpbyjris bli_rxxpbyjris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjris bli_rxxpbyjris #define bli_dscxpbyjris bli_rxxpbyjris #define bli_cscxpbyjris bli_crxpbyjris #define bli_zscxpbyjris bli_crxpbyjris #define bli_sdcxpbyjris bli_rxxpbyjris #define bli_ddcxpbyjris bli_rxxpbyjris #define bli_cdcxpbyjris bli_crxpbyjris #define bli_zdcxpbyjris bli_crxpbyjris #define bli_sccxpbyjris bli_cxxpbyjris #define bli_dccxpbyjris bli_cxxpbyjris #define bli_cccxpbyjris 
bli_cxxpbyjris #define bli_zccxpbyjris bli_cxxpbyjris #define bli_szcxpbyjris bli_cxxpbyjris #define bli_dzcxpbyjris bli_cxxpbyjris #define bli_czcxpbyjris bli_cxxpbyjris #define bli_zzcxpbyjris bli_cxxpbyjris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjris bli_rxxpbyjris #define bli_dszxpbyjris bli_rxxpbyjris #define bli_cszxpbyjris bli_crxpbyjris #define bli_zszxpbyjris bli_crxpbyjris #define bli_sdzxpbyjris bli_rxxpbyjris #define bli_ddzxpbyjris bli_rxxpbyjris #define bli_cdzxpbyjris bli_crxpbyjris #define bli_zdzxpbyjris bli_crxpbyjris #define bli_sczxpbyjris bli_cxxpbyjris #define bli_dczxpbyjris bli_cxxpbyjris #define bli_cczxpbyjris bli_cxxpbyjris #define bli_zczxpbyjris bli_cxxpbyjris #define bli_szzxpbyjris bli_cxxpbyjris #define bli_dzzxpbyjris bli_cxxpbyjris #define bli_czzxpbyjris bli_cxxpbyjris #define bli_zzzxpbyjris bli_cxxpbyjris #define bli_sxpbyjris bli_sssxpbyjris #define bli_dxpbyjris bli_dddxpbyjris #define bli_cxpbyjris bli_cccxpbyjris #define bli_zxpbyjris bli_zzzxpbyjris #endif // end bli_xpbyjris.h // Inlined scalar macros in loops // begin bli_scal2ris_mxn.h #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j 
)*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif // end bli_scal2ris_mxn.h // begin bli_scalris_mxn_uplo.h #ifndef 
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) ) #define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) ) #define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zcabsq2s( x, a ) 
bli_zcsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) ) #define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) ) #define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a ) #define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a ) #define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a ) #define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a ) #endif // end bli_absq2s.h // begin bli_abval2s.h #ifndef BLIS_ABVAL2S_H #define BLIS_ABVAL2S_H // abval2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabval2s( x, a ) 
bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) ) #define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) ) #define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) ) #define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) ) #define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) ) #define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) ) #define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) ) #define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) ) #define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) ) #define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) ) #define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) ) #define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) ) #define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) ) #define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) ) #define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) ) #define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabval2s( x, a ) bli_ssabval2s( x, a ) #define bli_dabval2s( x, a ) bli_ddabval2s( x, a ) #define bli_cabval2s( x, a ) bli_ccabval2s( x, a ) #define bli_zabval2s( x, a ) bli_zzabval2s( x, a ) #endif // end bli_abval2s.h // begin bli_adds.h #ifndef BLIS_ADDS_H #define BLIS_ADDS_H // adds // Notes: // - The first char 
encodes the type of a. // - The second char encodes the type of y. #define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) { (y) += (a); } #define bli_dcadds( a, y ) { (y) += (a); } #define bli_ccadds( a, y ) { (y) += (a); } #define bli_zcadds( a, y ) { (y) += (a); } #define bli_szadds( a, y ) { (y) += (a); } #define bli_dzadds( a, y ) { (y) += (a); } #define bli_czadds( a, y ) { (y) += (a); } #define 
bli_zzadds( a, y ) { (y) += (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadds( a, y ) bli_ssadds( a, y ) #define bli_dadds( a, y ) bli_ddadds( a, y ) #define bli_cadds( a, y ) bli_ccadds( a, y ) #define bli_zadds( a, y ) bli_zzadds( a, y ) #endif // end bli_adds.h // begin bli_addjs.h #ifndef BLIS_ADDJS_H #define BLIS_ADDJS_H // addjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzaddjs( a, y ) 
bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) { (y) += (a); } #define bli_dcaddjs( a, y ) { (y) += (a); } #define bli_ccaddjs( a, y ) { (y) += conjf(a); } #define bli_zcaddjs( a, y ) { (y) += conj (a); } #define bli_szaddjs( a, y ) { (y) += (a); } #define bli_dzaddjs( a, y ) { (y) += (a); } #define bli_czaddjs( a, y ) { (y) += conjf(a); } #define bli_zzaddjs( a, y ) { (y) += conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saddjs( a, y ) bli_ssaddjs( a, y ) #define bli_daddjs( a, y ) bli_ddaddjs( a, y ) #define bli_caddjs( a, y ) bli_ccaddjs( a, y ) #define bli_zaddjs( a, y ) bli_zzaddjs( a, y ) #endif // end bli_addjs.h // begin bli_add3s.h #ifndef BLIS_ADD3S_H #define BLIS_ADD3S_H // add3s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of b. // - The third char encodes the type of c. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), 
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), 
bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define 
bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zscadd3s( a, 
b, c ) { (c) = (a) + (b); } #define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c ) #define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c ) #define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c ) #define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c ) #endif // end bli_add3s.h // begin bli_axpbys.h #ifndef BLIS_AXPBYS_H #define BLIS_AXPBYS_H // axpbys // Notes: // - The first char encodes the type of a. 
// - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), 
bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), 
bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), 
bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) /* The group that follows covers every axpbys macro whose fourth type letter (y) is 'z'. Each macro passes the real and imaginary accessors of a, x, b and y to bli_cxaxpbyris(); the accessor prefix for each operand is chosen by the matching letter of the macro name, in the order a, x, b, y. bli_cxaxpbyris() is defined outside this section; presumably it evaluates y := a*x + b*y on the separate components (TODO confirm against its definition). */ // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) /* The definitions above forward operand components to bli_cxaxpbyris(). The alternative branch that follows defines the same macro names again, this time as a single brace-enclosed assignment on the operands themselves: (y) = (a)*(x) + (b)*(y). Going by the trailing note on the #else, that branch is the one used when BLIS_ENABLE_C99_COMPLEX is defined; the opening conditional lies outside this section, so confirm the exact test there. */ #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccscaxpbys(
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } /* The group that follows covers the macro names whose fourth type letter (y) is 'z'. In this branch every body is the same expression, so the four type letters only distinguish the names; no per-type accessor is applied to the operands. Each expansion is a brace-enclosed statement that assigns to y and also reads it, so the y argument must be an lvalue and appears twice in the expansion. */ // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX /* Same-type shorthands: each single-letter name expands to the four-letter macro in which a, x, b and y all carry that letter, with the arguments passed through unchanged. bli_ccccaxpbys and bli_zzzzaxpbys are defined in both branches of the conditional closed just above; bli_ssssaxpbys and bli_ddddaxpbys are not defined in this section and are presumably provided by the earlier real-y groups (verify). */ #define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y ) #define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y ) #define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y ) #define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y ) #endif // end bli_axpbys.h // begin bli_axpbyjs.h #ifndef BLIS_AXPBYJS_H #define BLIS_AXPBYJS_H // axpbyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. /* Every macro in the group below passes the real and imaginary accessors of a, x, b and y, picked by the four type letters, to bli_rxaxpbyjris(), which is defined outside this section. NOTE(review): the 'j' in the name presumably marks a variant of axpbys that conjugates x -- confirm against the helper before relying on it. */ // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y),
bli_simag(y) ) #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) /* This closes the group whose fourth type letter (y) is 's'. The group that follows has 'd' in that position: its macros take the y components from bli_dreal() and bli_dimag() instead of bli_sreal() and bli_simag(), and still forward all eight components to bli_rxaxpbyjris(). */ // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x),
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); }
#define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type convenience names: each forwards to the four-character variant
// in which a, x, b and y all share the same type.

#define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y )
#define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y )
#define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y )
#define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y )

#endif
// end bli_axpbyjs.h
// begin bli_axpys.h

#ifndef BLIS_AXPYS_H
#define BLIS_AXPYS_H

// axpys
//
// Type-dispatched front ends for the "axpys" scalar operation. Going by the
// C99 expansions further down, the operation is y += a * x.

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// - The type characters used are s, d, c and z.

// For real destinations (??s, ??d) every macro splits its operands into
// real and imaginary parts with bli_?real()/bli_?imag() and forwards the six
// parts to bli_saxpyris()/bli_daxpyris(). Those helpers and accessors are not
// defined in this header.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex support, complex destinations (??c, ??z) are also
// served by forwarding split real/imaginary parts. The helper is picked from
// the types of a and x, as written below:
//   a real,    x real    -> bli_saxpyris  / bli_daxpyris
//   a real,    x complex -> bli_scaxpyris / bli_dzaxpyris
//   a complex, any x     -> bli_caxpyris  / bli_zaxpyris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex support, every complex-destination variant expands to the
// same braced compound assignment; the operand types only select the name.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); }

#define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); }

#define bli_sccaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); }

#define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); }

#define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); }

#define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); }

#define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); }
#define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type convenience names: each forwards to the three-character variant
// in which a, x and y all share the same type.

#define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y )
#define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y )
#define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y )
#define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y )

#endif
// end bli_axpys.h
// begin bli_axpyjs.h

#ifndef BLIS_AXPYJS_H
#define BLIS_AXPYJS_H

// axpyjs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// The bli_???axpyjs macros are type-dispatched front ends for the "axpyjs"
// scalar operation; the three type characters (s, d, c or z) give the types
// of a, x and y in that order. Going by the C99 expansions further down, the
// operation is y += a * conj(x).
//
// For real destinations (??s, ??d) every macro splits its operands into
// real and imaginary parts with bli_?real()/bli_?imag() and forwards the six
// parts to bli_saxpyjris()/bli_daxpyjris(). Those helpers and accessors are
// not defined in this header.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex support, complex destinations (??c, ??z) are also
// served by forwarding split real/imaginary parts. The helper is picked from
// the types of a and x, as written below:
//   a real,    x real    -> bli_saxpyjris  / bli_daxpyjris
//   a real,    x complex -> bli_scaxpyjris / bli_dzaxpyjris
//   a complex, any x     -> bli_caxpyjris  / bli_zaxpyjris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex support, each complex-destination variant expands to one
// braced compound assignment. How x is conjugated follows its type: s and d
// (real) are used as is, c goes through conjf(), z goes through conj().

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); }

#define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); }

#define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }

#define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); }

#define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); }

#define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }

#define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type convenience names: each forwards to the three-character variant
// in which a, x and y all share the same type.

#define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y )
#define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y )
#define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y )
#define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y )

#endif
// end bli_axpyjs.h
// begin bli_axmys.h

#ifndef BLIS_AXMYS_H
#define BLIS_AXMYS_H

// axmys

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// axmys computes y := y - a * x (see the C99 branch below, which spells this
// out as "(y) -= (a) * (x)"). In the other tables each operand is split into
// its real and imaginary parts with bli_?real()/bli_?imag() and handed to a
// bli_*axmyris macro.

// -- (axy) = (??s) ------------------------------------------------------------

// Real single-precision y: every combination forwards to bli_saxmyris.
#define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

// Real double-precision y: every combination forwards to bli_daxmyris.
#define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

// Complex single-precision y. The macro chosen depends on the domains of a
// and x:
//   a real,    x real    -> bli_saxmyris
//   a real,    x complex -> bli_scaxmyris
//   a complex, x any     -> bli_caxmyris
#define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

// Complex double-precision y. Same selection rule as the (??c) table, using
// bli_daxmyris / bli_dzaxmyris / bli_zaxmyris.
#define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex types enabled, every complex-destination variant is the
// same plain expression; the compiler's complex arithmetic does the work.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); }
// Remaining (??z) axmys variants of the C99-complex branch: y := y - a * x.
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); }

#define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type shorthands: a, x and y all share one type.
#define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y )
#define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y )
#define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y )
#define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y )

#endif
// end bli_axmys.h
// begin bli_conjs.h

#ifndef BLIS_CONJS_H
#define BLIS_CONJS_H

// conjs

// Conjugate x in place. The real and non-C99 complex variants forward the
// real/imaginary parts of x to the matching bli_?conjris macro.
#define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) )
#define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) )
#define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 complex: use the <complex.h> conjugation functions (conjf for single
// precision, conj for double precision).
#define bli_cconjs( x ) { (x) = conjf(x); }
#define bli_zconjs( x ) { (x) = conj (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

#endif
// end bli_conjs.h
// begin bli_copys.h

#ifndef BLIS_COPYS_H
#define BLIS_COPYS_H

// copys

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Each macro splits x and y into real/imaginary parts and forwards them to
// the bli_?copyris macro selected by the type of y (the second char).

// y is real single precision -> bli_scopyris.
#define bli_sscopys( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopys( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopys( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopys( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is real double precision -> bli_dcopyris.
#define bli_sdcopys( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopys( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopys( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopys( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero.
#define bli_sccopys( x, y ) bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopys( x, y ) bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopys( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopys( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
// copys, y is complex double precision -> bli_zcopyris.
#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer variant: plain assignment with a cast to gint_t.
#define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); }

// Single-type shorthands: x and y share one type.
#define bli_scopys( x, y ) bli_sscopys( x, y )
#define bli_dcopys( x, y ) bli_ddcopys( x, y )
#define bli_ccopys( x, y ) bli_cccopys( x, y )
#define bli_zcopys( x, y ) bli_zzcopys( x, y )
#define bli_icopys( x, y ) bli_iicopys( x, y )

#endif
// end bli_copys.h
// begin bli_copyjs.h

#ifndef BLIS_COPYJS_H
#define BLIS_COPYJS_H

// copyjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

// copyjs copies the conjugate of x into y (the C99 branch below spells this
// out: y = conj(x) for complex x, y = x for real x). The other variants
// forward real/imaginary parts to the bli_?copyjris macro selected by the
// type of y.

#define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 complex: real x is assigned as-is; complex x goes through conjf()
// (single precision x) or conj() (double precision x).
#define bli_sccopyjs( x, y ) { (y) = (x); }
#define bli_dccopyjs( x, y ) { (y) = (x); }
#define bli_cccopyjs( x, y ) { (y) = conjf(x); }
#define bli_zccopyjs( x, y ) { (y) = conj (x); }

#define bli_szcopyjs( x, y ) { (y) = (x); }
#define bli_dzcopyjs( x, y ) { (y) = (x); }
#define bli_czcopyjs( x, y ) { (y) = conjf(x); }
#define bli_zzcopyjs( x, y ) { (y) = conj (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Integer variant: plain assignment with a cast to gint_t.
#define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); }

// Single-type shorthands: x and y share one type.
#define bli_scopyjs( x, y ) bli_sscopyjs( x, y )
#define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y )
#define bli_ccopyjs( x, y ) bli_cccopyjs( x, y )
#define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y )
#define bli_icopyjs( x, y ) bli_iicopyjs( x, y )

#endif
// end bli_copyjs.h
// begin bli_copycjs.h

#ifndef BLIS_COPYCJS_H
#define BLIS_COPYCJS_H

// copycjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// copycjs copies x into y, conjugating x only when conjx says so (the C99
// branch below tests this with bli_is_conj( conjx )). The other variants pass
// conjx plus the real/imaginary parts to the bli_?copycjris macro selected by
// the type of y.

#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 complex: real x is assigned as-is and conjx is ignored; complex x is
// conjugated with conjf()/conj() only when bli_is_conj( conjx ) is true.
#define bli_sccopycjs( conjx, x, y ) { (y) = (x); }
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); }
#define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#define bli_szcopycjs( conjx, x, y ) { (y) = (x); }
#define bli_dzcopycjs( conjx, x, y ) { (y) = (x); }
#define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Integer variant: plain assignment with a cast to gint_t; conjx is unused.
#define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); }

// Single-type shorthands: x and y share one type.
#define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y )
#define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y )
#define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y )
#define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y )
#define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y )

#endif
// end bli_copycjs.h
// begin bli_copynzs.h

#ifndef BLIS_COPYNZS_H
#define BLIS_COPYNZS_H

// copynzs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// copynzs forwards to the same bli_?copyris macros as copys, except when a
// real x is copied into a complex y: those variants use the real-domain
// macro so the imaginary part of y is left alone (see the NOTEs below).

#define bli_sscopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopynzs( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopynzs( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopynzs( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopynzs( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of scopyris() is so we don't touch the imaginary part of y.
#define bli_sccopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopynzs( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopynzs( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
// copynzs, y is complex double precision: real x (s, d) goes through
// bli_dcopyris, complex x (c, z) through bli_zcopyris.
#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer variant: plain assignment with a cast to gint_t.
#define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); }

// Single-type shorthands: x and y share one type.
#define bli_scopynzs( x, y ) bli_sscopynzs( x, y )
#define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y )
#define bli_ccopynzs( x, y ) bli_cccopynzs( x, y )
#define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y )
#define bli_icopynzs( x, y ) bli_iicopynzs( x, y )

#endif
// end bli_copynzs.h
// begin bli_copyjnzs.h

#ifndef BLIS_COPYJNZS_H
#define BLIS_COPYJNZS_H

// copyjnzs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

// copyjnzs forwards real/imaginary parts to the bli_?copyjris macros. As the
// NOTE below explains, a real x copied into a complex y uses the real-domain
// macro so the imaginary part of y is not touched.

#define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we
// don't touch the imaginary part of y.
// copyjnzs, y is complex single precision: real x (s, d) goes through
// bli_scopyjris, complex x (c, z) through bli_ccopyjris.
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we
// don't touch the imaginary part of y.
#define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer variant: plain assignment with a cast to gint_t.
#define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); }

// Single-type shorthands: x and y share one type.
#define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y )
#define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y )
#define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y )
#define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y )
#define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y )

#endif
// end bli_copyjnzs.h
// begin bli_dots.h

#ifndef BLIS_DOTS_H
#define BLIS_DOTS_H

// dots

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - The third char encodes the type of rho.
// Every dots variant is an alias for the axpys variant with the same three
// type chars, called with the arguments in the same order: x takes the place
// of axpys's first operand, y the second, and the accumulator (named a here,
// rho in the notes) the third.

// -- rho is real single precision --
#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a )
#define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a )
#define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a )
#define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a )

#define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a )
#define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a )
#define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a )
#define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a )

#define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a )
#define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a )
#define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a )
#define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a )

#define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a )
#define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a )
#define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a )
#define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a )

// -- rho is real double precision --
#define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a )
#define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a )
#define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a )
#define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a )

#define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a )
#define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a )
#define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a )
#define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a )

#define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a )
#define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a )
#define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a )
#define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a )

#define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a )
#define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a )
#define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a )
#define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a )

// -- rho is complex single precision --
#define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a )
#define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a )
#define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a )
#define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a )

#define bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a )
#define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a )
#define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a )
#define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a )

#define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a )
#define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a )
#define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a )
#define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a )

#define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a )
#define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a )
#define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a )
#define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a )

// -- rho is complex double precision --
#define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a )
#define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a )
#define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a )
#define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a )

#define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a )
#define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a )
#define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a )
#define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a )

#define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a )
#define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a )
#define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a )
#define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a )

#define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a )
#define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a )
#define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a )
#define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a )

// Single-type shorthands: x, y and rho share one type.
#define bli_sdots( x, y, a ) bli_sssdots( x, y, a )
#define bli_ddots( x, y, a ) bli_ddddots( x, y, a )
#define bli_cdots( x, y, a ) bli_cccdots( x, y, a )
#define bli_zdots( x, y, a ) bli_zzzdots( x, y, a )

#endif
// end bli_dots.h
// begin bli_dotjs.h

#ifndef BLIS_DOTJS_H
#define BLIS_DOTJS_H

// dotjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - The third char encodes the type of rho.
// - x is used in conjugated form.
// - Every bli_XYRdotjs( x, y, a ) macro below forwards to
//   bli_YXRaxpyjs( y, x, a ): the x and y arguments trade places, and so do
//   the first two type characters of the macro name. The third character
//   (type of rho) is passed through unchanged.

#define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a )
#define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a )
#define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a )
#define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a )
#define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a )
#define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a )
#define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a )
#define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a )
#define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a )
#define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a )
#define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a )
#define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a )
#define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a )
#define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a )
#define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a )
#define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a )

#define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a )
#define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a )
#define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a )
#define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a )
#define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a )
#define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a )
#define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a )
#define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a )
#define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a )
#define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a )
#define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a )
#define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a )
#define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a )
#define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a )
#define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a )
#define bli_zzddotjs( x, y, a ) bli_zzdaxpyjs( y, x, a )

#define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a )
#define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a )
#define bli_cscdotjs( x, y, a ) bli_sccaxpyjs( y, x, a )
#define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a )
#define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a )
#define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a )
#define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a )
#define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a )
#define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a )
#define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a )
#define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a )
#define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a )
#define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a )
#define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a )
#define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a )
#define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a )

#define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a )
#define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a )
#define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a )
#define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a )
#define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a )
#define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a )
#define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a )
#define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a )
#define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a )
#define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a )
#define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a )
#define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a )
#define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a )
#define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a )
#define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a )
#define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a )

// Same-type shorthands: one type character stands for x, y and rho alike.
#define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a )
#define bli_ddotjs( x, y, a ) bli_ddddotjs( x, y, a )
#define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a )
#define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a )

#endif
// end bli_dotjs.h
// begin bli_eq.h

#ifndef BLIS_EQ_H
#define BLIS_EQ_H

// eq (passed by value)
// Exact equality tests; every macro in this header expands to an expression
// built on the == operator, so no tolerance is applied.

#define bli_seq( a, b ) ( (a) == (b) )
#define bli_deq( a, b ) ( (a) == (b) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex support, real and imaginary parts are compared
// separately through the bli_?real / bli_?imag accessors.
#define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) )
#define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_ceq( a, b ) ( (a) == (b) )
#define bli_zeq( a, b ) ( (a) == (b) )

#endif // BLIS_ENABLE_C99_COMPLEX

#define bli_ieq( a, b ) ( (a) == (b) )

// eqtori (passed by value)
// Compare a against a value supplied as separate real (br) and imaginary (bi)
// parts. The real-typed variants (s, d) test only br; bi is ignored.

#define bli_seqtori( a, br, bi ) ( (a) == (br) )
#define bli_deqtori( a, br, bi ) ( (a) == (br) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) )
#define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex support the reference value is assembled as br + bi * I.
#define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )
#define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )

#endif // BLIS_ENABLE_C99_COMPLEX

// eqa (passed by address)
// Both pointers are cast to the element type named by the type character,
// dereferenced, and then compared with the by-value macro.

#define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) )
#define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) )
#define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) )
#define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) )
#define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) )

// eq1
// True when a equals 1 (real part 1, imaginary part 0 for complex types).

#define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F )
#define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 )
#define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F )
#define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 )
#define bli_ieq1( a ) bli_ieq ( (a), 1 )

// eq0
// True when a equals 0.

#define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F )
#define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 )
#define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F )
#define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 )
#define bli_ieq0( a ) bli_ieq ( (a), 0 )

// eqm1
// True when a equals -1 (real part -1, imaginary part 0 for complex types).

#define bli_seqm1( a ) bli_seqtori( (a), -1.0F, 0.0F )
#define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 )
#define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F )
#define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 )
#define bli_ieqm1( a ) bli_ieq ( (a), -1 )

#endif
// end bli_eq.h
// begin bli_fprints.h

#ifndef BLIS_FPRINTS_H
#define BLIS_FPRINTS_H

// prints
// Write the scalar x to the stream 'file' with fprintf(), using the
// caller-supplied format string 'spec'. Each macro expands to a brace-enclosed
// block, so it can only be used where a statement is allowed.

#define bli_sfprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}
#define bli_dfprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}
// Complex variants print the real part, the literal separator, the imaginary
// part and a trailing blank; 'spec' is applied to each part on its own.
#define bli_cfprints( file, spec, x ) \
{ \
    fprintf( file, spec, bli_creal(x) ); \
    fprintf( file, " + " ); \
    fprintf( file, spec, bli_cimag(x) ); \
    fprintf( file, " " ); \
}
#define bli_zfprints( file, spec, x ) \
{ \
    fprintf( file, spec, bli_zreal(x) ); \
    fprintf( file, " + " ); \
    fprintf( file, spec, bli_zimag(x) ); \
    fprintf( file, " " ); \
}
#define bli_ifprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}

#endif
// end bli_fprints.h
// begin bli_inverts.h

#ifndef BLIS_INVERTS_H
#define BLIS_INVERTS_H

// inverts
// Notes:
// - The first char encodes the type of x.
// - The real variants, and the complex variants when C99 complex support is
//   disabled, forward the real and imaginary parts of x to bli_?invertris.

#define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) )
#define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) )
#define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex support x is overwritten in place with 1 / x.
#define bli_cinverts( x ) { (x) = 1.0F / (x); }
#define bli_zinverts( x ) { (x) = 1.0 / (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

#endif
// end bli_inverts.h
// begin bli_invscals.h

#ifndef BLIS_INVSCALS_H
#define BLIS_INVSCALS_H

// invscals
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Real-typed y (second char s or d): always routed through
// bli_sinvscalris / bli_dinvscalris, whatever the type of a.
#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y without C99 complex support. A real-typed a (first char
// s or d) selects the mixed helpers bli_scinvscalris / bli_dzinvscalris; a
// complex-typed a selects bli_cinvscalris / bli_zinvscalris.
#define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscals( a, y ) bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y with C99 complex support: y is divided by a in place.
#define bli_scinvscals( a, y ) { (y) /= (a); }
#define bli_dcinvscals( a, y ) { (y) /= (a); }
#define bli_ccinvscals( a, y ) { (y) /= (a); }
#define bli_zcinvscals( a, y ) { (y) /= (a); }

#define bli_szinvscals( a, y ) { (y) /= (a); }
#define bli_dzinvscals( a, y ) { (y) /= (a); }
#define bli_czinvscals( a, y ) { (y) /= (a); }
#define bli_zzinvscals( a, y ) { (y) /= (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: one type character stands for both a and y.
#define bli_sinvscals( a, y ) bli_ssinvscals( a, y )
#define bli_dinvscals( a, y ) bli_ddinvscals( a, y )
#define bli_cinvscals( a, y ) bli_ccinvscals( a, y )
#define bli_zinvscals( a, y ) bli_zzinvscals( a, y )

#endif
// end bli_invscals.h
// begin bli_invscaljs.h

#ifndef BLIS_INVSCALJS_H
#define BLIS_INVSCALJS_H

// invscaljs
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - Same dispatch layout as invscals, but targeting the bli_?invscaljris
//   helpers; in the C99 branch below a complex-typed a is conjugated before
//   the division.

#define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// A real-typed a (first char s or d) is used as is. A complex-typed a is
// conjugated first: conjf() when the first char is c, conj() when it is z.
#define bli_scinvscaljs( a, y ) { (y) /= (a); }
#define bli_dcinvscaljs( a, y ) { (y) /= (a); }
#define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zcinvscaljs( a, y ) { (y) /= conj (a); }

#define bli_szinvscaljs( a, y ) { (y) /= (a); }
#define bli_dzinvscaljs( a, y ) { (y) /= (a); }
#define bli_czinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zzinvscaljs( a, y ) { (y) /= conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: one type character stands for both a and y.
#define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y )
#define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y )
#define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y )
#define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y )

#endif
// end bli_invscaljs.h
// begin bli_neg2s.h

#ifndef BLIS_NEG2S_H
#define BLIS_NEG2S_H

// neg2s
// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Real-typed y (second char s or d): routed through bli_sneg2ris /
// bli_dneg2ris, whatever the type of x.
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y with C99 complex support: y is overwritten with -x.
#define bli_scneg2s( x, y ) { (y) = -(x); }
#define bli_dcneg2s( x, y ) { (y) = -(x); }
#define bli_ccneg2s( x, y ) { (y) = -(x); }
#define bli_zcneg2s( x, y ) { (y) = -(x); }

#define bli_szneg2s( x, y ) { (y) = -(x); }
#define bli_dzneg2s( x, y ) { (y) = -(x); }
#define bli_czneg2s( x, y ) { (y) = -(x); }
#define bli_zzneg2s( x, y ) { (y) = -(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: one type character stands for both x and y.
#define bli_sneg2s( x, y ) bli_ssneg2s( x, y )
#define bli_dneg2s( x, y ) bli_ddneg2s( x, y )
#define bli_cneg2s( x, y ) bli_ccneg2s( x, y )
#define bli_zneg2s( x, y ) bli_zzneg2s( x, y )

#endif
// end bli_neg2s.h
// begin bli_rands.h

#ifndef BLIS_RANDS_H
#define BLIS_RANDS_H

// rands
// Assign a pseudo-random value to a. The real variants divide rand() by
// RAND_MAX / 2 and subtract 1, which maps rand()'s 0..RAND_MAX range onto
// the interval [-1, 1]. The complex variants draw the real and imaginary
// parts independently and store them with bli_csets / bli_zsets.
// Each macro expands to a brace-enclosed block (statement context only).

#define bli_srands( a ) \
{ \
    (a) = ( float ) ( ( double ) rand() / \
                      ( ( double ) RAND_MAX / 2.0 ) \
                    ) - 1.0F; \
}
#define bli_drands( a ) \
{ \
    (a) = ( double ) ( ( double ) rand() / \
                       ( ( double ) RAND_MAX / 2.0 ) \
                     ) - 1.0; \
}
#define bli_crands( a ) \
{ \
    float ar, ai; \
    \
    bli_srands( ar ); \
    bli_srands( ai ); \
    \
    bli_csets( ar, ai, (a) ); \
}
#define bli_zrands( a ) \
{ \
    double ar, ai; \
    \
    bli_drands( ar ); \
    bli_drands( ai ); \
    \
    bli_zsets( ar, ai, (a) ); \
}

#endif
// end bli_rands.h
// begin bli_randnp2s.h

#ifndef BLIS_RANDNP2S_H
#define BLIS_RANDNP2S_H

// randnp2s
// Assign to a either zero or a signed power of two (see bli_drandnp2s).

// The single-precision variant simply reuses the double-precision macro; the
// double result is converted by the final assignment to a.
#define bli_srandnp2s( a ) \
{ \
    bli_drandnp2s( a ); \
}

// Previous implementation, compiled out by the #if 0 guard below.
#if 0
#define bli_drandnp2s_prev( a ) \
{ \
    const double m_max = 3.0; \
    const double m_max2 = m_max + 2.0; \
    double t; \
    double r_val; \
    \
    \
    \
    \
    t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \
    \
    \
    if ( t == m_max2 ) t = t - 1.0; \
    \
    \
    t = floor( t ); \
    \
    \
    if ( t == 0.0 ) r_val = 0.0; \
    else \
    { \
        \
        \
        double s_exp, s_val; \
        \
        \
        PASTEMAC(d,rands)( s_exp ); \
        PASTEMAC(d,rands)( s_val ); \
        \
        \
        if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \
        else r_val = pow( 2.0, t - 1.0 ); \
        \
        \
        if ( s_val < 0.0 ) r_val = -r_val; \
    } \
    \
    \
    r_val = r_val / pow( 2.0, m_max ); \
    \
    \
    \
    a = r_val; \
}
#endif

// Active implementation. Uses rand(), floor() and pow().
// NOTE(review): the final assignment writes to a without parentheses, unlike
// (a) in bli_srands / bli_drands above.
// NOTE(review): the block-local names m_max, m_max2, t, r_val and s_val would
// capture a caller argument spelled the same way -- confirm no caller passes
// such a name.
#define bli_drandnp2s( a ) \
{ \
    const double m_max = 6.0; \
    const double m_max2 = m_max + 2.0; \
    double t; \
    double r_val; \
    \
    /* Draw an integer-valued t by scaling rand() to [0, m_max2] and */ \
    /* flooring; redraw in the one case where t reaches m_max2 */ \
    /* (rand() == RAND_MAX), so that t ends up in 0 .. m_max2 - 1. */ \
    do \
    { \
        \
        t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \
        \
        \
        t = floor( t ); \
    } \
    \
    while ( m_max2 <= t ); \
    \
    /* t == 0 yields exactly zero. */ \
    if ( t == 0.0 ) r_val = 0.0; \
    else \
    { \
        \
        \
        double s_val; \
        \
        /* Magnitude is 2^-(t-1), i.e. 2^0 down to 2^-m_max. */ \
        r_val = pow( 2.0, -(t - 1.0) ); \
        \
        /* Second draw decides the sign (presumably via bli_drands; */ \
        /* PASTEMAC is defined outside this section -- TODO confirm). */ \
        PASTEMAC(d,rands)( s_val ); \
        \
        \
        if ( s_val < 0.0 ) r_val = -r_val; \
    } \
    \
    \
    \
    \
    a = r_val; \
}
// Complex variants draw the real and imaginary parts independently.
#define bli_crandnp2s( a ) \
{ \
    float ar, ai; \
    \
    bli_srandnp2s( ar ); \
    bli_srandnp2s( ai ); \
    \
    bli_csets( ar, ai, (a) ); \
}
#define bli_zrandnp2s( a ) \
{ \
    double ar, ai; \
    \
    bli_drandnp2s( ar ); \
    bli_drandnp2s( ai ); \
    \
    bli_zsets( ar, ai, (a) ); \
}

#endif
// end bli_randnp2s.h
// begin bli_scals.h

#ifndef BLIS_SCALS_H
#define BLIS_SCALS_H

// scals
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - Real-typed y is routed through bli_sscalris / bli_dscalris. Complex-typed
//   y uses the bli_?scalris helpers without C99 complex support and an
//   in-place multiplication by a with it.

#define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// A real-typed a (first char s or d) selects the mixed helpers
// bli_scscalris / bli_dzscalris.
#define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_scscals( a, y ) { (y) *= (a); }
#define bli_dcscals( a, y ) { (y) *= (a); }
#define bli_ccscals( a, y ) { (y) *= (a); }
#define bli_zcscals( a, y ) { (y) *= (a); }

#define bli_szscals( a, y ) { (y) *= (a); }
#define bli_dzscals( a, y ) { (y) *= (a); }
#define bli_czscals( a, y ) { (y) *= (a); }
#define bli_zzscals( a, y ) { (y) *= (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: one type character stands for both a and y.
#define bli_sscals( a, y ) bli_ssscals( a, y )
#define bli_dscals( a, y ) bli_ddscals( a, y )
#define bli_cscals( a, y ) bli_ccscals( a, y )
#define bli_zscals( a, y ) bli_zzscals( a, y )

#endif
// end bli_scals.h
// begin bli_scaljs.h

#ifndef BLIS_SCALJS_H
#define BLIS_SCALJS_H

// scaljs
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - Same dispatch layout as scals, but targeting the bli_?scaljris helpers;
//   in the C99 branch below a complex-typed a is conjugated before the
//   multiplication.

#define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccscaljs( a, y ) bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// A real-typed a (first char s or d) is used as is. A complex-typed a is
// conjugated first: conjf() when the first char is c, conj() when it is z.
#define bli_scscaljs( a, y ) { (y) *= (a); }
#define bli_dcscaljs( a, y ) { (y) *= (a); }
#define bli_ccscaljs( a, y ) { (y) *= conjf(a); }
#define bli_zcscaljs( a, y ) { (y) *= conj (a); }

#define bli_szscaljs( a, y ) { (y) *= (a); }
#define bli_dzscaljs( a, y ) { (y) *= (a); }
#define bli_czscaljs( a, y ) { (y) *= conjf(a); }
#define bli_zzscaljs( a, y ) { (y) *= conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: one type character stands for both a and y.
#define bli_sscaljs( a, y ) bli_ssscaljs( a, y )
#define bli_dscaljs( a, y ) bli_ddscaljs( a, y )
#define bli_cscaljs( a, y ) bli_ccscaljs( a, y )
#define bli_zscaljs( a, y ) bli_zzscaljs( a, y )

#endif
// end bli_scaljs.h
// begin bli_scalcjs.h

#ifndef BLIS_SCALCJS_H
#define BLIS_SCALCJS_H

// scalcjs
// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Each macro forwards conjx unchanged, followed by the real and imaginary
// parts of x and of y, to a bli_?scalcjris helper.

// Real-typed y (second char s or d): bli_sscalcjris / bli_dscalcjris.
#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y without C99 complex support. A real-typed x (first char
// s or d) selects the mixed helpers bli_scscalcjris / bli_dzscalcjris; a
// complex-typed x selects bli_cscalcjris / bli_zscalcjris.
#define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y with C99 complex support. For a real-typed x the conjx
// argument is not used: y is simply multiplied by x in place.
#define bli_scscalcjs( conjx, x, y ) { (y) *= (x); }
// C99-complex definitions for complex-typed y. A real-typed x (first char
// s or d) ignores conjx. A complex-typed x is conjugated only when
// bli_is_conj( conjx ) is true: with conjf() when the first char is c, with
// conj() when it is z.
#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#define bli_szscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: one type character stands for both x and y.
#define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y )
#define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y )
#define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y )
#define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y )

#endif
// end bli_scalcjs.h
// begin bli_scal2s.h

#ifndef BLIS_SCAL2S_H
#define BLIS_SCAL2S_H

// scal2s
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsscal2s( a, x, y ) 
bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_czcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y ) #define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y ) #define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y ) #define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y ) #endif // end bli_scal2s.h // begin bli_scal2js.h #ifndef BLIS_SCAL2JS_H #define BLIS_SCAL2JS_H // scal2js // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Mixed-type scal2js dispatch table (component-wise form).
//
// Every macro in this table takes three operands (a, x, y), splits each one
// into its real/imaginary accessors (bli_?real / bli_?imag, selected by the
// type letter s, d, c or z in the macro name) and forwards the six pieces to
// a bli_??scal2jris helper. The helpers are defined elsewhere in this file.
//
// NOTE(review): the C99 branch of this same header writes the operation as
// y = a * conj(x); the *scal2jris helpers presumably implement the same
// thing on split components -- confirm against their definitions.
//
// Helper selection visible in this table:
//   y real (s, d):    "rx", except "ro" when both a and x are complex.
//   y complex (c, z): "rx" when a and x are both real,
//                     "rc" when a is complex and x is real,
//                     "cr" when a is real and x is complex,
//                     "cx" when a and x are both complex.

// -- (axy) = (??s) ------------------------------------------------------------
// Destination y is float.

#define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------
// Destination y is double.

#define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Complex destinations have two implementations. This branch (C99 complex
// support disabled) keeps using the component-wise helpers.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------
// Destination y is scomplex.

#define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------
// Destination y is dcomplex. Entries with real x (s, d) come first; the
// entries with complex x follow directly after this group.

#define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
// scal2js, destination dcomplex, complex x (non-C99 branch, continued).
// Real a selects the "cr" helper; complex a selects the "cx" helper.
#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 branch: complex destinations are assigned with native complex
// arithmetic, y = a * conj(x). The conjugation depends on the type of x
// (second letter of the macro name):
//   x is s or d -> x is used as-is (no conjugation call),
//   x is c      -> conjf(x),
//   x is z      -> conj(x).
// These macros expand to a brace-enclosed statement, not an expression.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: bli_?scal2js forwards to the variant in which a, x
// and y all share one type.
#define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y )
#define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y )
#define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y )
#define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y )

#endif
// end bli_scal2js.h
// begin bli_set0s.h

#ifndef BLIS_SET0S_H
#define BLIS_SET0S_H

// set0s: forwards to bli_?sets with zero for both leading arguments and (a)
// as the target. Float literals are used for s/c, double literals for d/z.
// NOTE(review): bli_?sets is defined elsewhere in this file; presumably its
// first two arguments are the real and imaginary parts to store -- confirm.
#define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) )
#define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) )
#define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) )
#define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) )

#endif
// end bli_set0s.h
// begin bli_set1s.h

#ifndef BLIS_SET1S_H
#define BLIS_SET1S_H

// set1s: same forwarding as set0s, but the first argument passed to
// bli_?sets is one and the second is zero.
#define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) )
#define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) )
#define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) )
#define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) )
#endif
// end bli_set1s.h
// begin bli_seti0s.h

#ifndef BLIS_SETI0S_H
#define BLIS_SETI0S_H

// seti0s: forwards to bli_?setis with a single zero literal and (a) as the
// target. Float literal for s/c, double literal for d/z.
// NOTE(review): bli_?setis is defined elsewhere in this file; presumably it
// stores the given value into the imaginary part of (a) -- confirm.
#define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) )
#define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) )
#define bli_cseti0s( a ) bli_csetis( 0.0F, (a) )
#define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) )

#endif
// end bli_seti0s.h
// begin bli_sqrt2s.h

#ifndef BLIS_SQRT2S_H
#define BLIS_SQRT2S_H

// sqrt2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.
// - x is the input and a receives the result (see the C99 branch below,
//   where every macro is an assignment to (a)).

#ifndef BLIS_ENABLE_C99_COMPLEX

// Component-wise branch: x and a are split into real/imaginary accessors and
// forwarded to a bli_*sqrt2ris helper defined elsewhere in this file.
// Helper selection visible here:
//   a is s -> bli_ssqrt2ris            a is d -> bli_dsqrt2ris
//   a is c -> bli_scsqrt2ris for real x (s, d), bli_csqrt2ris for complex x
//   a is z -> bli_dzsqrt2ris for real x (s, d), bli_zsqrt2ris for complex x

#define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) )
#define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) )
#define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) )
#define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) )
#define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) )
#define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )
#define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 branch: the root function is chosen by the type of x
// (s -> sqrtf, d -> sqrt, c -> csqrtf, z -> csqrt) and the result is cast to
// the type of a. When x is complex and a is real, only the real part of the
// complex root (bli_creal / bli_zreal) is stored.
// These macros expand to a brace-enclosed statement, not an expression.

#define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; }
#define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; }
#define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); }
#define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); }
#define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; }
#define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; }
#define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); }
#define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); }
#define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; }
#define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; }
#define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; }
#define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; }
#define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; }
#define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; }
#define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; }
#define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: bli_?sqrt2s forwards to the variant in which x and a
// share one type.
#define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a )
#define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a )
#define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a )
#define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a )

#endif
// end bli_sqrt2s.h
// begin bli_subs.h

#ifndef BLIS_SUBS_H
#define BLIS_SUBS_H

// subs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) { (y) -= (a); } #define bli_dcsubs( a, y ) { (y) -= (a); } #define bli_ccsubs( a, y ) { (y) -= (a); } #define bli_zcsubs( a, y ) { (y) -= (a); } #define bli_szsubs( a, y ) { (y) -= (a); } #define bli_dzsubs( a, y ) { (y) -= (a); } #define bli_czsubs( a, y ) { (y) -= (a); } #define bli_zzsubs( a, y ) { (y) -= (a); } #endif // 
BLIS_ENABLE_C99_COMPLEX #define bli_ssubs( a, y ) bli_sssubs( a, y ) #define bli_dsubs( a, y ) bli_ddsubs( a, y ) #define bli_csubs( a, y ) bli_ccsubs( a, y ) #define bli_zsubs( a, y ) bli_zzsubs( a, y ) #endif // end bli_subs.h // begin bli_subjs.h #ifndef BLIS_SUBJS_H #define BLIS_SUBJS_H // subjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubjs( a, y ) bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), 
bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) { (y) -= (a); } #define bli_dcsubjs( a, y ) { (y) -= (a); } #define bli_ccsubjs( a, y ) { (y) -= conjf(a); } #define bli_zcsubjs( a, y ) { (y) -= conj (a); } #define bli_szsubjs( a, y ) { (y) -= (a); } #define bli_dzsubjs( a, y ) { (y) -= (a); } #define bli_czsubjs( a, y ) { (y) -= conjf(a); } #define bli_zzsubjs( a, y ) { (y) -= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssubjs( a, y ) bli_sssubjs( a, y ) #define bli_dsubjs( a, y ) bli_ddsubjs( a, y ) #define bli_csubjs( a, y ) bli_ccsubjs( a, y ) #define bli_zsubjs( a, y ) bli_zzsubjs( a, y ) #endif // end bli_subjs.h // begin bli_swaps.h #ifndef BLIS_SWAPS_H #define BLIS_SWAPS_H // swaps // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_ssswaps( x, y ) \ { \ float w; \ bli_sscopys( (y), (w) ); \ bli_sscopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dsswaps( x, y ) \ { \ double w; \ bli_sdcopys( (y), (w) ); \ bli_dscopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_csswaps( x, y ) \ { \ scomplex w; \ bli_sccopys( (y), (w) ); \ bli_cscopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zsswaps( x, y ) \ { \ dcomplex w; \ bli_szcopys( (y), (w) ); \ bli_zscopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sdswaps( x, y ) \ { \ float w; \ bli_dscopys( (y), (w) ); \ bli_sdcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_ddswaps( x, y ) \ { \ double w; \ bli_ddcopys( (y), (w) ); \ bli_ddcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_cdswaps( x, y ) \ { \ scomplex w; \ bli_dccopys( (y), (w) ); \ bli_cdcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zdswaps( x, y ) \ { \ dcomplex w; \ bli_dzcopys( (y), (w) ); \ bli_zdcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_scswaps( x, y ) \ { \ float w; \ bli_cscopys( (y), (w) ); \ bli_sccopys( (x), (y) ); \ bli_sscopys( (w), 
(x) ); \ } #define bli_dcswaps( x, y ) \ { \ double w; \ bli_cdcopys( (y), (w) ); \ bli_dccopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_ccswaps( x, y ) \ { \ scomplex w; \ bli_cccopys( (y), (w) ); \ bli_cccopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zcswaps( x, y ) \ { \ dcomplex w; \ bli_czcopys( (y), (w) ); \ bli_zccopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_szswaps( x, y ) \ { \ float w; \ bli_zscopys( (y), (w) ); \ bli_szcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dzswaps( x, y ) \ { \ double w; \ bli_zdcopys( (y), (w) ); \ bli_dzcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_czswaps( x, y ) \ { \ scomplex w; \ bli_zccopys( (y), (w) ); \ bli_czcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zzswaps( x, y ) \ { \ dcomplex w; \ bli_zzcopys( (y), (w) ); \ bli_zzcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sswaps( x, y ) bli_ssswaps( x, y ) #define bli_dswaps( x, y ) bli_ddswaps( x, y ) #define bli_cswaps( x, y ) bli_ccswaps( x, y ) #define bli_zswaps( x, y ) bli_zzswaps( x, y ) #endif // end bli_swaps.h // begin bli_xpbys.h #ifndef BLIS_XPBYS_H #define BLIS_XPBYS_H // xpbys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// Mixed-type xpbys dispatch table, real-valued destinations.
//
// Every macro here splits x, b and y into real/imaginary accessors
// (bli_?real / bli_?imag, selected by the type letters in the macro name)
// and forwards the six pieces to bli_rxxpbyris, which is defined elsewhere
// in this file. For a real destination (y is s or d) the same helper is used
// regardless of whether x or b is real or complex.
// NOTE(review): going by the name, the operation is presumably
// y := x + b * y -- confirm against the bli_rxxpbyris definition.

// -- (xby) = (??s) ------------------------------------------------------------
// Destination y is float.

#define bli_sssxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dssxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cssxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zssxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_scsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (xby) = (??d) ------------------------------------------------------------
// Destination y is double. The remaining entries of this group
// (dzd, czd, zzd) follow directly after this span.

#define bli_ssdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sddxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dddxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cddxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zddxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_scdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_szdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbys( x, b, y ) 
bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbys( x, b, y ) { 
(y) = (x) + (b) * (y); } #define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y ) #define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y ) #define bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y ) #define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y ) #endif // end bli_xpbys.h // begin bli_xpbyjs.h #ifndef BLIS_XPBYJS_H #define BLIS_XPBYJS_H // xpbyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; 
++ii ) bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); 
} else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( 
dim_t jj = 0; jj < n; ++jj ) bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } 
else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( 
dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end bli_copys_mxn.h // begin bli_scal2s_mxn.h #ifndef BLIS_SCAL2S_MXN_H #define BLIS_SCAL2S_MXN_H // scal2s_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \ ctype* restrict y, const inc_t rs_y, const inc_t cs_y \ ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( scal2s_mxn ) #endif // end bli_scal2s_mxn.h // begin bli_xpbys_mxn.h #ifndef BLIS_XPBYS_MXN_H #define BLIS_XPBYS_MXN_H // xpbys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (?ss) ------------------------------------------------------------ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?dd) ------------------------------------------------------------ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?cc) ------------------------------------------------------------ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?zz) ------------------------------------------------------------ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } #endif // end bli_xpbys_mxn.h // begin bli_xpbys_mxn_uplo.h #ifndef BLIS_XPBYS_MXN_UPLO_H #define BLIS_XPBYS_MXN_UPLO_H // xpbys_mxn_u #define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 
0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } // xpbys_mxn_l #define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } 
// Lower-region xpbys over an m x n area, scomplex operands (macro form).
// Visits every (_i,_j) with 0 <= _i < m and 0 <= _j < n, and touches only the
// elements for which (_j - _i) <= diagoff. If bli_ceq0( *beta ) holds, x_ij is
// copied into y_ij with bli_cccopys; otherwise bli_cccxpbys( x_ij, *beta, y_ij )
// is applied (defined elsewhere; presumably y := x + beta * y).
// The expansion is a brace block that declares its own dim_t _i, _j. Every
// argument is pasted textually and is evaluated once per use, so arguments
// with side effects, or arguments named _i / _j, are unsafe.
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
    dim_t _i, _j; \
    \
    /* beta is tested once, outside the loops. */ \
    if ( bli_ceq0( *beta ) ) \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_cccopys( *(x + _i*rs_x + _j*cs_x), \
                         *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
    else \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \
                          *(beta), \
                          *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
}

// dcomplex counterpart of the macro above: identical traversal and diagonal
// test, using bli_zeq0, bli_zzcopys and bli_zzzxpbys.
#define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
    dim_t _i, _j; \
    \
    /* beta is tested once, outside the loops. */ \
    if ( bli_zeq0( *beta ) ) \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \
                         *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
    else \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \
                          *(beta), \
                          *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
}

// Single-letter convenience names. Each one forwards its arguments unchanged
// to the homogeneous-type (sss/ddd/ccc/zzz) macro of the same _u / _l flavor.
#define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}

#endif
// end bli_xpbys_mxn_uplo.h

// -- "broadcast B" scalar macros --

// begin bli_bcastbbs_mxn.h
#ifndef BLIS_BCASTBBS_MXN_H
#define BLIS_BCASTBBS_MXN_H

// bcastbbs_mxn

// Function template, instantiated by INSERT_GENTFUNC_BASIC0 (defined
// elsewhere). For every i < m and j < n, the element at y + i*incy + j*ldy is
// copied with the type's copys macro into the d - 1 elements that immediately
// follow it (unit stride ds_y). Here d is read from ldy, so the copies fill
// the gap up to the next j element. The innermost loop starts at p = 1 and
// therefore does nothing when d is 1.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* Duplication factor: taken from ldy, the stride applied to j. */ \
    const dim_t d = ldy; \
    const dim_t ds_y = 1; \
\
    for ( dim_t i = 0; i < m; ++i ) \
    { \
        ctype* restrict yi = y + i*incy; \
\
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict yij = yi + j*ldy; \
\
            for ( dim_t p = 1; p < d; ++p ) \
            { \
                ctype* restrict yijd = yij + p*ds_y; \
\
                PASTEMAC(ch,copys)( *yij, *yijd ); \
            } \
        } \
    } \
}

INSERT_GENTFUNC_BASIC0( bcastbbs_mxn )

#endif
// end bli_bcastbbs_mxn.h
// begin bli_scal2bbs_mxn.h
#ifndef BLIS_SCAL2BBS_MXN_H
#define BLIS_SCAL2BBS_MXN_H

// scal2bbs_mxn

// Function template instantiated by INSERT_GENTFUNCRO_BASIC0 (defined
// elsewhere; NOTE(review): presumably the real-domain types only).
// For every j < n and i < m:
//   1. y_ij (at y + j*ldy + i*incy) is written from *alpha and x_ij (at
//      x + j*ldx + i*incx) using scal2js when bli_is_conj( conjx ) holds and
//      scal2s otherwise (both defined elsewhere; presumably y := alpha * x
//      with and without conjugation of x);
//   2. y_ij is then copied into the d - 1 elements directly after it.
// Here d is read from incy, so the copies fill the gap up to the next i
// element.
#undef GENTFUNCRO
#define GENTFUNCRO( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t conjx, \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, const inc_t incx, const inc_t ldx, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* Duplication factor: taken from incy, the stride applied to i. */ \
    const dim_t d = incy; \
    const dim_t ds_y = 1; \
\
    if ( bli_is_conj( conjx ) ) \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict xj = x + j*ldx; \
            ctype* restrict yj = y + j*ldy; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype* restrict xij = xj + i*incx; \
                ctype* restrict yij = yj + i*incy; \
\
                PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \
\
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype* restrict yijd = yij + p*ds_y; \
\
                    PASTEMAC(ch,copys)( *yij, *yijd ); \
                } \
            } \
        } \
    } \
    else \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict xj = x + j*ldx; \
            ctype* restrict yj = y + j*ldy; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype* restrict xij = xj + i*incx; \
                ctype* restrict yij = yj + i*incy; \
\
                PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \
\
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype* restrict yijd = yij + p*ds_y; \
\
                    PASTEMAC(ch,copys)( *yij, *yijd ); \
                } \
            } \
        } \
    } \
}

INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn )

// Function template instantiated by INSERT_GENTFUNCCO_BASIC0 (defined
// elsewhere; NOTE(review): presumably the complex-domain types only).
// alpha, x and y are reinterpreted as arrays of the real type ctype_r:
//   - x and alpha: the imaginary part sits one ctype_r after the real part;
//   - y: the imaginary part sits d ctype_r elements after the real part
//     (psi_i = y + 1*d), with d read from incy;
//   - all four strides are doubled because they are applied to ctype_r
//     pointers.
// For every j < n and i < m the split real/imag pair of y_ij is written by
// scal2jris (conj branch) or scal2ris, then each part is copied into the
// d - 1 elements directly after it.
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t conjx, \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, const inc_t incx, const inc_t ldx, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* Duplication factor: taken from incy, the stride applied to i. */ \
    const dim_t d = incy; \
    const dim_t ds_y = 1; \
\
    const inc_t incx2 = 2 * incx; \
    const inc_t ldx2 = 2 * ldx; \
\
    const inc_t incy2 = 2 * incy; \
    const inc_t ldy2 = 2 * ldy; \
\
    ctype_r* restrict alpha_r = ( ctype_r* )alpha; \
    ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \
    ctype_r* restrict chi_r = ( ctype_r* )x; \
    ctype_r* restrict chi_i = ( ctype_r* )x + 1; \
    ctype_r* restrict psi_r = ( ctype_r* )y; \
    ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \
\
    if ( bli_is_conj( conjx ) ) \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype_r* restrict chij_r = chi_r + j*ldx2; \
            ctype_r* restrict chij_i = chi_i + j*ldx2; \
            ctype_r* restrict psij_r = psi_r + j*ldy2; \
            ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype_r* restrict chiij_r = chij_r + i*incx2; \
                ctype_r* restrict chiij_i = chij_i + i*incx2; \
                ctype_r* restrict psiij_r = psij_r + i*incy2; \
                ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
                PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \
                                        *chiij_r, *chiij_i, \
                                        *psiij_r, *psiij_i ); \
\
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
                    ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
                    PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \
                                          *psiijd_r, *psiijd_i ); \
                } \
            } \
        } \
    } \
    else \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype_r* restrict chij_r = chi_r + j*ldx2; \
            ctype_r* restrict chij_i = chi_i + j*ldx2; \
            ctype_r* restrict psij_r = psi_r + j*ldy2; \
            ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype_r* restrict chiij_r = chij_r + i*incx2; \
                ctype_r* restrict chiij_i = chij_i + i*incx2; \
                ctype_r* restrict psiij_r = psij_r + i*incy2; \
                ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
                PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \
                                       *chiij_r, *chiij_i, \
                                       *psiij_r, *psiij_i ); \
\
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
                    ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
                    PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \
                                          *psiijd_r, *psiijd_i ); \
                } \
            } \
        } \
    } \
}

INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn )

#endif
// end bli_scal2bbs_mxn.h
// begin bli_set0bbs_mxn.h
#ifndef BLIS_SET0BBS_MXN_H
#define BLIS_SET0BBS_MXN_H

// set0bbs_mxn

// Function template, instantiated by INSERT_GENTFUNC_BASIC0 (defined
// elsewhere). For every j < n and i < m, the type's set0s macro is applied to
// d consecutive elements starting at y + j*ldy + i*incy. The loop over p
// starts at 0, so the element itself is included. d is read from incy.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* Duplication factor: taken from incy, the stride applied to i. */ \
    const dim_t d = incy; \
    const dim_t ds_y = 1; \
\
    for ( dim_t j = 0; j < n; ++j ) \
    { \
        ctype* restrict yj = y + j*ldy; \
\
        for ( dim_t i = 0; i < m; ++i ) \
        { \
            ctype* restrict yij = yj + i*incy; \
\
            for ( dim_t p = 0; p < d; ++p ) \
            { \
                ctype* restrict yijd = yij + p*ds_y; \
\
                PASTEMAC(ch,set0s)( *yijd ); \
            } \
        } \
    } \
}

INSERT_GENTFUNC_BASIC0( set0bbs_mxn )

#endif
// end bli_set0bbs_mxn.h

// -- 1m-specific scalar macros --

// 1e
// begin bli_copy1es.h
#ifndef BLIS_COPY1ES_H
#define BLIS_COPY1ES_H

// copy1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// copy1es( a, bri, bir ): write one source scalar a into a pair of
// destinations.
//   bri <- (  real(a), imag(a) )
//   bir <- ( -imag(a), real(a) )   i.e. the components of i * a
// Both stores go through bli_??copyris (defined elsewhere; presumably it
// copies the first real/imag pair into the second pair).
// Every combination in which either a or the destinations are real expands to
// an empty block. The two stores are sequential statements that each re-read
// a, so the result is order-dependent if a aliases bri or bir.
#define bli_sscopy1es( a, bri, bir ) {}
#define bli_dscopy1es( a, bri, bir ) {}
#define bli_cscopy1es( a, bri, bir ) {}
#define bli_zscopy1es( a, bri, bir ) {}

#define bli_sdcopy1es( a, bri, bir ) {}
#define bli_ddcopy1es( a, bri, bir ) {}
#define bli_cdcopy1es( a, bri, bir ) {}
#define bli_zdcopy1es( a, bri, bir ) {}

#define bli_sccopy1es( a, bri, bir ) {}
#define bli_dccopy1es( a, bri, bir ) {}
#define bli_cccopy1es( a, bri, bir ) \
{ \
    bli_cccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopy1es( a, bri, bir ) \
{ \
    bli_zccopyris( bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopy1es( a, bri, bir ) {}
#define bli_dzcopy1es( a, bri, bir ) {}
#define bli_czcopy1es( a, bri, bir ) \
{ \
    bli_czcopyris( bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopy1es( a, bri, bir ) \
{ \
    bli_zzcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-letter names select the same-type (cc / zz) variants.
#define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir )
#define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir )

#endif
// end bli_copy1es.h
// begin bli_copyj1es.h
#ifndef BLIS_COPYJ1ES_H
#define BLIS_COPYJ1ES_H

// copyj1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// copyj1es( a, bri, bir ): conjugating form of the 1e copy.
//   bri <- ( real(a), -imag(a) )   the components of conj(a)
//   bir <- ( imag(a),  real(a) )   the components of i * conj(a)
// Both stores go through bli_??copyris (defined elsewhere; presumably it
// copies the first real/imag pair into the second pair).
// Every combination in which either a or the destinations are real expands to
// an empty block.
#define bli_sscopyj1es( a, bri, bir ) {}
#define bli_dscopyj1es( a, bri, bir ) {}
#define bli_cscopyj1es( a, bri, bir ) {}
#define bli_zscopyj1es( a, bri, bir ) {}

#define bli_sdcopyj1es( a, bri, bir ) {}
#define bli_ddcopyj1es( a, bri, bir ) {}
#define bli_cdcopyj1es( a, bri, bir ) {}
#define bli_zdcopyj1es( a, bri, bir ) {}

#define bli_sccopyj1es( a, bri, bir ) {}
#define bli_dccopyj1es( a, bri, bir ) {}
#define bli_cccopyj1es( a, bri, bir ) \
{ \
    bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_cccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopyj1es( a, bri, bir ) \
{ \
    bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_zccopyris( bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopyj1es( a, bri, bir ) {}
#define bli_dzcopyj1es( a, bri, bir ) {}
#define bli_czcopyj1es( a, bri, bir ) \
{ \
    bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_czcopyris( bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopyj1es( a, bri, bir ) \
{ \
    bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_zzcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-letter names select the same-type (cc / zz) variants.
#define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir )
#define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir )

#endif
// end bli_copyj1es.h
// begin bli_invert1es.h
#ifndef BLIS_INVERT1ES_H
#define BLIS_INVERT1ES_H

// invert1es

// invert1es( bri, bir ): apply bli_?invertris to the components of bri in
// place (defined elsewhere; presumably a complex reciprocal), then refresh bir
// from the updated bri. The copy passes its destinations in imag-then-real
// order, so bir.imag receives real(bri) and bir.real receives -imag(bri).
#define bli_cinvert1es( bri, bir ) \
{ \
    bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \
    bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \
}

#define bli_zinvert1es( bri, bir ) \
{ \
    bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \
    bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \
}

#endif
// end bli_invert1es.h
// begin bli_scal1es.h
#ifndef BLIS_SCAL1ES_H
#define BLIS_SCAL1ES_H

// scal1es

// scal1es( a, yri, yir ): apply bli_?scalris to the components of yri in
// place using the components of a (defined elsewhere; presumably
// yri := a * yri), then rebuild yir from the updated yri as
// ( -imag(yri), real(yri) ).
#define bli_cscal1es( a, yri, yir ) \
{ \
    bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \
    bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \
}

#define bli_zscal1es( a, yri, yir ) \
{ \
    bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \
    bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \
}

#endif
// end bli_scal1es.h
// begin bli_scal21es.h
#ifndef BLIS_SCAL21ES_H
#define BLIS_SCAL21ES_H

// scal21es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// scal21es( a, x, yri, yir ): every non-empty variant issues two
// bli_cxscal2ris calls (defined elsewhere; presumably y := a * x on split
// real/imag components):
//   yri <- scal2( a, (  real(x), imag(x) ) )
//   yir <- scal2( a, ( -imag(x), real(x) ) )
// A variant expands to an empty block when y is real (??s, ??d) or when both
// a and x are real.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal21es( a, x, yri, yir ) {}
#define bli_sdsscal21es( a, x, yri, yir ) {}
#define bli_scsscal21es( a, x, yri, yir ) {}
#define bli_szsscal21es( a, x, yri, yir ) {}

#define bli_dssscal21es( a, x, yri, yir ) {}
#define bli_ddsscal21es( a, x, yri, yir ) {}
#define bli_dcsscal21es( a, x, yri, yir ) {}
#define bli_dzsscal21es( a, x, yri, yir ) {}

#define bli_cssscal21es( a, x, yri, yir ) {}
#define bli_cdsscal21es( a, x, yri, yir ) {}
#define bli_ccsscal21es( a, x, yri, yir ) {}
#define bli_czsscal21es( a, x, yri, yir ) {}

#define bli_zssscal21es( a, x, yri, yir ) {}
#define bli_zdsscal21es( a, x, yri, yir ) {}
#define bli_zcsscal21es( a, x, yri, yir ) {}
#define bli_zzsscal21es( a, x, yri, yir ) {}

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal21es( a, x, yri, yir ) {}
#define bli_sddscal21es( a, x, yri, yir ) {}
#define bli_scdscal21es( a, x, yri, yir ) {}
#define bli_szdscal21es( a, x, yri, yir ) {}

#define bli_dsdscal21es( a, x, yri, yir ) {}
#define bli_dddscal21es( a, x, yri, yir ) {}
#define bli_dcdscal21es( a, x, yri, yir ) {}
#define bli_dzdscal21es( a, x, yri, yir ) {}

#define bli_csdscal21es( a, x, yri, yir ) {}
#define bli_cddscal21es( a, x, yri, yir ) {}
#define bli_ccdscal21es( a, x, yri, yir ) {}
#define bli_czdscal21es( a, x, yri, yir ) {}

#define bli_zsdscal21es( a, x, yri, yir ) {}
#define bli_zddscal21es( a, x, yri, yir ) {}
#define bli_zcdscal21es( a, x, yri, yir ) {}
#define bli_zzdscal21es( a, x, yri, yir ) {}

// -- (axy) = (??c) ------------------------------------------------------------

// NOTE(review): although y is scomplex in this group, yri / yir are accessed
// through bli_zreal / bli_zimag rather than bli_creal / bli_cimag. Confirm
// that these accessors are type-agnostic before relying on it.
#define bli_sscscal21es( a, x, yri, yir ) {}
#define bli_sdcscal21es( a, x, yri, yir ) {}
#define bli_sccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dscscal21es( a, x, yri, yir ) {}
#define bli_ddcscal21es( a, x, yri, yir ) {}
#define bli_dccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cscscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zscscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal21es( a, x, yri, yir ) {}
#define bli_sdzscal21es( a, x, yri, yir ) {}
#define bli_sczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dszscal21es( a, x, yri, yir ) {}
#define bli_ddzscal21es( a, x, yri, yir ) {}
#define bli_dczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cszscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zszscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// Single-letter names select the same-type (ccc / zzz) variants.
#define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir )
#define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir )

#endif
// end bli_scal21es.h
// begin bli_scal2j1es.h
#ifndef BLIS_SCAL2J1ES_H
#define BLIS_SCAL2J1ES_H

// scal2j1es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) #define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) #endif // end bli_scal2j1es.h // 1r // begin bli_copy1rs.h #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copy1rs.h // begin bli_copyj1rs.h #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copyj1rs.h // begin bli_invert1rs.h #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif // end bli_invert1rs.h // begin bli_scal1rs.h #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), 
bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif // end bli_scal1rs.h // begin bli_scal21rs.h #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif // end bli_scal21rs.h // begin bli_scal2j1rs.h #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif // end bli_scal2j1rs.h // 1m (1e or 1r) // begin bli_invert1ms_mxn_diag.h #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t 
i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_invert1ms_mxn_diag.h // begin bli_scal1ms_mxn.h #ifndef BLIS_SCAL1MS_MXN_H #define BLIS_SCAL1MS_MXN_H // scal1ms_mxn #define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; 
\ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #endif // end bli_scal1ms_mxn.h // begin bli_scal21ms_mxn.h #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } 
else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), 
*(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif // end bli_scal21ms_mxn.h // begin bli_scal21ms_mxn_diag.h #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_scal21ms_mxn_diag.h // begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING //thread communicators may be implementation dependent #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_single.h // begin bli_thrcomm_openmp.h #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H // Define thrcomm_t for situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #include // skipped // Define thrcomm_t for tree barriers and non-tree barriers. #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. 
//volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif // end bli_thrcomm_openmp.h // begin bli_thrcomm_pthreads.h #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H // Define thrcomm_t for situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS #ifdef BLIS_USE_PTHREAD_BARRIER struct thrcomm_s { void* sent_object; dim_t n_threads; bli_pthread_barrier_t barrier; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_pthreads.h // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. 
// Allocation/initialization and teardown of thrcomm_t objects. The rntm_t
// argument is only taken by the create/free pair.
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads );
void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm );
void bli_thrcomm_cleanup( thrcomm_t* comm );

// Exported entry points; the inline wrappers bli_thread_barrier() and
// bli_thread_broadcast() later in this header forward to these.
BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );

void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );

#endif
// end bli_thrcomm.h

// Include thread info (thrinfo_t) object definitions and prototypes.
// begin bli_thrinfo.h

#ifndef BLIS_THRINFO_H
#define BLIS_THRINFO_H

// Thread info structure definition
struct thrinfo_s
{
// The thread communicator for the other threads sharing the same work
// at this level.
thrcomm_t* ocomm;

// Our thread id within the ocomm thread communicator.
dim_t ocomm_id;

// The number of distinct threads used to parallelize the loop.
dim_t n_way;

// What we're working on.
dim_t work_id;

// When freeing, should the communicators in this node be freed? Usually,
// this field is true, but when nodes are created that share the same
// communicators as other nodes (such as with packm nodes), this is set
// to false.
bool free_comm;

// The bszid_t to help identify the node. This is mostly only useful when
// debugging or tracing the allocation and release of thrinfo_t nodes.
bszid_t bszid;

// Links to other thrinfo_t nodes.
// NOTE(review): presumably the child "prenode" and child node of this node
// in the thrinfo tree -- confirm against bli_thrinfo_create().
struct thrinfo_s* sub_prenode;
struct thrinfo_s* sub_node;
};
typedef struct thrinfo_s thrinfo_t;

//
// thrinfo_t functions
// NOTE: The naming of these should be made consistent at some point.
// (ie: bli_thrinfo_ vs.
bli_thread_) // // thrinfo_t query (field only) BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) { return (t->ocomm)->n_threads; } BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) { return t->ocomm_id; } BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) { return t->n_way; } BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t ) { return t->work_id; } BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) { return t->ocomm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) { return t->free_comm; } BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) { return t->bszid; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) { return t->sub_node; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) { return t->ocomm_id == 0; } // thrinfo_t modification BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) { t->ocomm = ocomm; } BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) { t->ocomm_id = ocomm_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) { t->n_way = n_way; } BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t ) { t->work_id = work_id; } BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) { t->free_comm = free_comm; } BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) { t->bszid = bszid; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) { t->sub_node = sub_node; } BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) { t->sub_prenode = sub_prenode; } // other thrinfo_t-related functions BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } // // Prototypes 
for level-3 thrinfo functions not specific to any operation.
//

// Creation and initialization. bli_thrinfo_create() and bli_thrinfo_init()
// take one argument per thrinfo_t field except sub_prenode; the former
// returns a thrinfo_t* and additionally takes an rntm_t*, the latter
// fills in a caller-provided node.
thrinfo_t* bli_thrinfo_create ( rntm_t* rntm, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node );

void bli_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node );

void bli_thrinfo_init_single ( thrinfo_t* thread );

void bli_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread );

// -----------------------------------------------------------------------------

// Functions that extend a thrinfo_t structure from a control tree (cntl_t).
// NOTE(review): the "rgrow" variants presumably recurse down the control
// tree -- confirm against the implementation.
void bli_thrinfo_grow ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread );

thrinfo_t* bli_thrinfo_rgrow ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_cur, thrinfo_t* thread_par );

thrinfo_t* bli_thrinfo_create_for_cntl ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_chl, thrinfo_t* thread_par );

thrinfo_t* bli_thrinfo_rgrow_prenode ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_cur, thrinfo_t* thread_par );

thrinfo_t* bli_thrinfo_create_for_cntl_prenode ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_chl, thrinfo_t* thread_par );

// -----------------------------------------------------------------------------

// The prototypes below are compiled out.
#if 0
void bli_thrinfo_grow_tree ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread );

void bli_thrinfo_grow_tree_ic ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread );
#endif

#endif
// end bli_thrinfo.h
// begin bli_thrinfo_sup.h

#ifndef BLIS_THRINFO_SUP_H
#define BLIS_THRINFO_SUP_H

//
// Prototypes for level-3 thrinfo sup functions.
//

// These mirror bli_thrinfo_grow(), bli_thrinfo_rgrow() and
// bli_thrinfo_create_for_cntl() above, but take bszid_t pointers where those
// take cntl_t pointers.
void bli_thrinfo_sup_grow ( rntm_t* rntm, bszid_t* bszid_par, thrinfo_t* thread );

thrinfo_t* bli_thrinfo_sup_rgrow ( rntm_t* rntm, bszid_t* bszid_par, bszid_t* bszid_cur, thrinfo_t* thread_par );

thrinfo_t* bli_thrinfo_sup_create_for_cntl ( rntm_t* rntm, bszid_t* bszid_par, bszid_t* bszid_chl, thrinfo_t* thread_par );

#endif
// end bli_thrinfo_sup.h

// Include some operation-specific thrinfo_t prototypes.
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
// Takes the l3int_t function to invoke, the operation family, and the same
// operand/context/runtime/control arguments that an l3int_t receives (minus
// the thrinfo_t).
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// Include definitions specific to the method of multithreading for the
// conventional code path.
// begin bli_l3_decor_single.h

#ifndef BLIS_L3_DECOR_SINGLE_H
#define BLIS_L3_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (Currently empty: nothing is declared inside the conditional.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif
// end bli_l3_decor_single.h
// begin bli_l3_decor_openmp.h

#ifndef BLIS_L3_DECOR_OPENMP_H
#define BLIS_L3_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

void bli_l3_thread_decorator_thread_check
     (
       dim_t      n_threads,
       dim_t      tid,
       thrcomm_t* gl_comm,
       rntm_t*    rntm
     );

#endif

#endif
// end bli_l3_decor_openmp.h
// begin bli_l3_decor_pthreads.h

#ifndef BLIS_L3_DECOR_PTHREADS_H
#define BLIS_L3_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
// Signature matches the void* (*)(void*) start routine expected by
// bli_pthread_create().
void* bli_l3_thread_entry( void* data_void );

#endif

#endif
// end bli_l3_decor_pthreads.h

#endif
// end bli_l3_decor.h

// Include the level-3 thread decorator and related definitions and prototypes
// for the sup code path.
// begin bli_l3_sup_decor.h

#ifndef BLIS_L3_SUP_DECOR_H
#define BLIS_L3_SUP_DECOR_H

// -- sup definitions ----------------------------------------------------------

// Level-3 sup internal function type.
// Unlike l3int_t, this type returns err_t and takes no cntl_t argument.
typedef err_t (*l3supint_t)
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// Level-3 sup thread decorator prototype.
// Returns err_t; takes no cntl_t argument.
err_t bli_l3_sup_thread_decorator
     (
       l3supint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     );

// Include definitions specific to the method of multithreading for the
// sup code path.
// begin bli_l3_sup_decor_single.h #ifndef BLIS_L3_SUP_DECOR_SINGLE_H #define BLIS_L3_SUP_DECOR_SINGLE_H // Definitions specific to situations when multithreading is disabled. #ifndef BLIS_ENABLE_MULTITHREADING #endif #endif // end bli_l3_sup_decor_single.h // begin bli_l3_sup_decor_openmp.h #ifndef BLIS_L3_SUP_DECOR_OPENMP_H #define BLIS_L3_SUP_DECOR_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #endif #endif // end bli_l3_sup_decor_openmp.h // begin bli_l3_sup_decor_pthreads.h #ifndef BLIS_L3_SUP_DECOR_PTHREADS_H #define BLIS_L3_SUP_DECOR_PTHREADS_H // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. void* bli_l3_sup_thread_entry( void* data_void ); #endif #endif // end bli_l3_sup_decor_pthreads.h #endif // end bli_l3_sup_decor.h // Initialization-related prototypes. void bli_thread_init( void ); void bli_thread_finalize( void ); // Thread range-related prototypes. 
// Writes a sub-range through the start/end output pointers. It is called by
// bli_thread_range_jrir_sl() to obtain a contiguous slab for one thread.
BLIS_EXPORT_BLIS
void bli_thread_range_sub
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end
     );

// GENPROT is (re)defined before each group of invocations; every
// GENPROT( opname ) line expands to one prototype returning siz_t whose name
// is produced by PASTEMAC0( opname ) (presumably a bli_ prefix -- the macro
// is defined elsewhere; confirm).

// First form: prototypes taking a direction, three objects, a control-tree
// node, and a context.
#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       dir_t      direct, \
       thrinfo_t* thr, \
       obj_t*     a, \
       obj_t*     b, \
       obj_t*     c, \
       cntl_t*    cntl, \
       cntx_t*    cntx, \
       dim_t*     start, \
       dim_t*     end \
     );

GENPROT( thread_range_mdim )
GENPROT( thread_range_ndim )

// Second form: prototypes taking a single object and a blocksize multiple.
#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       thrinfo_t* thr, \
       obj_t*     a, \
       blksz_t*   bmult, \
       dim_t*     start, \
       dim_t*     end \
     );

GENPROT( thread_range_l2r )
GENPROT( thread_range_r2l )
GENPROT( thread_range_t2b )
GENPROT( thread_range_b2t )

GENPROT( thread_range_weighted_l2r )
GENPROT( thread_range_weighted_r2l )
GENPROT( thread_range_weighted_t2b )
GENPROT( thread_range_weighted_b2t )

dim_t bli_thread_range_width_l
     (
       doff_t diagoff_j,
       dim_t  m,
       dim_t  n_j,
       dim_t  j,
       dim_t  n_way,
       dim_t  bf,
       dim_t  bf_left,
       double area_per_thr,
       bool   handle_edge_low
     );

siz_t bli_find_area_trap_l
     (
       dim_t  m,
       dim_t  n,
       doff_t diagoff
     );

// The thread and both output pointers are restrict-qualified, so callers
// must not pass aliasing pointers for them.
siz_t bli_thread_range_weighted_sub
     (
       thrinfo_t* restrict thread,
       doff_t              diagoff,
       uplo_t              uplo,
       dim_t               m,
       dim_t               n,
       dim_t               bf,
       bool                handle_edge_low,
       dim_t*     restrict j_start_thr,
       dim_t*     restrict j_end_thr
     );

// -----------------------------------------------------------------------------

// Factorization and partitioning prototypes

// State passed to bli_prime_factorization() / bli_next_prime_factor().
// NOTE(review): field meanings are not shown here; by name, n is the value
// being factored, sqrt_n its square root, and f a current factor -- confirm.
typedef struct
{
    dim_t n;
    dim_t sqrt_n;
    dim_t f;
} bli_prime_factors_t;

void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors);

dim_t bli_next_prime_factor(bli_prime_factors_t* factors);

bool bli_is_prime( dim_t n );

// The three partition_2x2 variants share one signature: nt1 and nt2 are
// restrict-qualified output pointers.
void bli_thread_partition_2x2
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );
void bli_thread_partition_2x2_slow
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );
void bli_thread_partition_2x2_fast
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

//
----------------------------------------------------------------------------- dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); // ----------------------------------------------------------------------------- BLIS_INLINE void bli_thread_range_jrir_rr ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; } BLIS_INLINE void bli_thread_range_jrir_sl ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use contiguous slab partitioning of jr/ir loops. bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); *inc = 1; } BLIS_INLINE void bli_thread_range_jrir ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Define a general-purpose version of bli_thread_range_jrir() whose // definition depends on whether slab or round-robin partitioning was // requested at configure-time. 
#ifdef BLIS_ENABLE_JRIR_SLAB bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); #else bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); #endif } #if 0 BLIS_INLINE void bli_thread_range_weighted_jrir ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { #ifdef BLIS_ENABLE_JRIR_SLAB // Use contiguous slab partitioning for jr/ir loops. bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, handle_edge_low, start, end ); *start = *start / bf; *inc = 1; if ( *end % bf ) *end = *end / bf + 1; else *end = *end / bf; #else // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; #endif } #endif #endif // end bli_thread.h // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t     bli_pthread_barrier_t;
typedef pthread_barrierattr_t bli_pthread_barrierattr_t;

#endif

// -- pthreads macros --

#define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#define BLIS_PTHREAD_COND_INITIALIZER  PTHREAD_COND_INITIALIZER
#define BLIS_PTHREAD_ONCE_INIT         PTHREAD_ONCE_INIT

#endif

// -- Function definitions -----------------------------------------------------
//
// The prototypes below are declared once, outside the platform conditionals,
// and use the bli_pthread_* types selected above. Each parallels the pthreads
// function named in its section comment and returns int, except
// bli_pthread_once(), which returns void.

// -- pthread_create(), pthread_join() --

BLIS_EXPORT_BLIS int bli_pthread_create
     (
       bli_pthread_t*            thread,
       const bli_pthread_attr_t* attr,
       void*                   (*start_routine)(void*),
       void*                     arg
     );

BLIS_EXPORT_BLIS int bli_pthread_join
     (
       bli_pthread_t thread,
       void**        retval
     );

// -- pthread_mutex_*() --

BLIS_EXPORT_BLIS int bli_pthread_mutex_init
     (
       bli_pthread_mutex_t*           mutex,
       const bli_pthread_mutexattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_lock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock
     (
       bli_pthread_mutex_t* mutex
     );

// -- pthread_cond_*() --

BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t*           cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t*  cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void              (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//     __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t*           barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int                     count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h

// -- Constant definitions --

// begin bli_extern_defs.h

#ifndef BLIS_EXTERN_DEFS_H
#define BLIS_EXTERN_DEFS_H

// Exported global objects; declared here, defined elsewhere in the library.
// NOTE(review): by name these obj_t globals hold the constants 2, 1, 0, -1
// and -2 -- confirm against their definitions.
BLIS_EXPORT_BLIS extern obj_t BLIS_TWO;
BLIS_EXPORT_BLIS extern obj_t BLIS_ONE;
//BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF;
BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO;
//BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF;
BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE;
BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO;

// Exported global communicator and thrinfo_t objects.
BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM;

BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED;
BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED;

#endif
// end bli_extern_defs.h

// -- BLIS architecture/kernel definitions --

// begin bli_l1v_ker_prot.h

//
// Define template prototypes for level-1v kernels.
//
// Each *_KER_PROT( ctype, ch, opname ) macro below expands to one function
// prototype returning void. The function name is produced by
// PASTEMAC(ch,opname) (defined elsewhere), ctype is substituted as the
// element type of the vector/scalar pointers, and every prototype ends with
// a cntx_t* argument. Vectors are passed as a pointer plus an inc_t stride.
//
// Several definitions end in ");" followed by a line-continuation backslash;
// the blank line that follows each of them is what terminates the macro, so
// it must be preserved.
//

#define ADDV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// The result is returned through the dim_t* index output pointer.
#define AMAXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t* restrict cntx  \
     ); \

#define AXPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ); \

#define AXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ); \

#define COPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// The result is returned through the ctype* rho pointer.
#define DOTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     ); \

#define DOTXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     ); \

#define INVERTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     ); \

#define SCALV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     ); \

#define SCAL2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ); \

#define SETV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     ); \

#define SUBV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define SWAPV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ); \

#define XPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ); \

// end bli_l1v_ker_prot.h
// begin bli_l1f_ker_prot.h

//
// Define template prototypes for level-1f kernels.
// #define AXPY2V_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alphax, \ ctype* restrict alphay, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define AXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define DOTAXPYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); // end bli_l1f_ker_prot.h // begin bli_l1m_ker_prot.h // // Define template prototypes for level-1m kernels. 
// // native packm kernels #define PACKM_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); // native unpackm kernels #define UNPACKM_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); // 1e/1r packm kernels #define PACKM_1ER_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); // end bli_l1m_ker_prot.h // begin bli_l3_ukr_prot.h // // Define template prototypes for level-3 micro-kernels. // #define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname) #define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype_out* restrict alpha, \ ctype_in* restrict a, \ ctype_in* restrict b, \ ctype_out* restrict beta, \ ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); #define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ ctype* restrict a11, \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); #define TRSM_UKR_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); // end bli_l3_ukr_prot.h // 
// begin bli_l3_sup_ker_prot.h

//
// Define template prototypes for level-3 kernels on small/unpacked matrices.
//

// Expands to one prototype returning void, named by PASTEMAC(ch,opname)
// (defined elsewhere). Unlike GEMM_UKR_PROT2, the a and b operands are each
// passed with explicit row and column strides (rs_*, cs_*), and conjugation
// flags are passed for both.
#define GEMMSUP_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

// end bli_l3_sup_ker_prot.h

// begin bli_arch_config_pre.h

#ifndef BLIS_ARCH_CONFIG_PRE_H
#define BLIS_ARCH_CONFIG_PRE_H

// -- Naming-related kernel definitions ----------------------------------------

// The default suffix appended to reference kernels.
#define BLIS_REF_SUFFIX  _ref

// A suffix used for labeling certain induced method aware functions.
#define BLIS_IND_SUFFIX  _ind

// Add an underscore to the BLIS kernel set string, if it was defined.
// (BLIS_CNAME_INFIX is left undefined when BLIS_CNAME is not defined.)
#ifdef  BLIS_CNAME
#define BLIS_CNAME_INFIX  PASTECH(_,BLIS_CNAME)
#endif

// Combine the CNAME and _ref for convenience to the code that defines
// reference kernels.
//#define BLIS_CNAME_REF_SUFFIX  PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX)

// -- Prototype-generating macro definitions -----------------------------------

// Prototype-generating macro for bli_cntx_init_*() functions.
#define CNTX_INIT_PROTS( archname ) \ \ void PASTEMAC(cntx_init_,archname) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \ ( \ ind_t method, \ cntx_t* cntx \ ); #endif // end bli_arch_config_pre.h // begin bli_arch_config.h #ifndef BLIS_ARCH_CONFIG_H #define BLIS_ARCH_CONFIG_H // // -- Context initialization prototypes ---------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_CONFIG_SKX CNTX_INIT_PROTS( skx ) #endif #ifdef BLIS_CONFIG_KNL CNTX_INIT_PROTS( knl ) #endif #ifdef BLIS_CONFIG_KNC CNTX_INIT_PROTS( knc ) #endif #ifdef BLIS_CONFIG_HASWELL CNTX_INIT_PROTS( haswell ) #endif #ifdef BLIS_CONFIG_SANDYBRIDGE CNTX_INIT_PROTS( sandybridge ) #endif #ifdef BLIS_CONFIG_PENRYN CNTX_INIT_PROTS( penryn ) #endif // -- AMD64 architectures -- #ifdef BLIS_CONFIG_ZEN3 CNTX_INIT_PROTS( zen3 ) #endif #ifdef BLIS_CONFIG_ZEN2 CNTX_INIT_PROTS( zen2 ) #endif #ifdef BLIS_CONFIG_ZEN CNTX_INIT_PROTS( zen ) #endif #ifdef BLIS_CONFIG_EXCAVATOR CNTX_INIT_PROTS( excavator ) #endif #ifdef BLIS_CONFIG_STEAMROLLER CNTX_INIT_PROTS( steamroller ) #endif #ifdef BLIS_CONFIG_PILEDRIVER CNTX_INIT_PROTS( piledriver ) #endif #ifdef BLIS_CONFIG_BULLDOZER CNTX_INIT_PROTS( bulldozer ) #endif // -- ARM architectures -- #ifdef BLIS_CONFIG_ARMSVE CNTX_INIT_PROTS( armsve ) #endif #ifdef BLIS_CONFIG_A64FX CNTX_INIT_PROTS( a64fx ) #endif #ifdef BLIS_CONFIG_FIRESTORM CNTX_INIT_PROTS( firestorm ) #endif #ifdef BLIS_CONFIG_THUNDERX2 CNTX_INIT_PROTS( thunderx2 ) #endif #ifdef BLIS_CONFIG_CORTEXA57 CNTX_INIT_PROTS( cortexa57 ) #endif #ifdef BLIS_CONFIG_CORTEXA53 CNTX_INIT_PROTS( cortexa53 ) #endif #ifdef BLIS_CONFIG_CORTEXA15 CNTX_INIT_PROTS( cortexa15 ) #endif #ifdef BLIS_CONFIG_CORTEXA9 CNTX_INIT_PROTS( cortexa9 ) #endif // -- IBM Power -- #ifdef BLIS_CONFIG_POWER10 CNTX_INIT_PROTS( power10 ) #endif #ifdef BLIS_CONFIG_POWER9 CNTX_INIT_PROTS( power9 ) #endif #ifdef 
BLIS_CONFIG_POWER7 CNTX_INIT_PROTS( power7 ) #endif // -- IBM BG/Q -- #ifdef BLIS_CONFIG_BGQ CNTX_INIT_PROTS( bgq ) #endif // -- Generic -- #ifdef BLIS_CONFIG_GENERIC CNTX_INIT_PROTS( generic ) #endif // // -- Architecture family-specific headers ------------------------------------- // // -- x86_64 families -- #ifdef BLIS_FAMILY_INTEL64 #include "bli_family_intel64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64 #include "bli_family_amd64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64_LEGACY #include "bli_family_amd64_legacy.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64 #include "bli_family_x86_64.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_SKX #include "bli_family_x86_64_no_skx.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN2 #include "bli_family_x86_64_no_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN3 #include "bli_family_x86_64_no_zen3.h" // skipped #endif // -- Intel64 architectures -- #ifdef BLIS_FAMILY_SKX #include "bli_family_skx.h" // skipped #endif #ifdef BLIS_FAMILY_KNL #include "bli_family_knl.h" // skipped #endif #ifdef BLIS_FAMILY_KNC #include "bli_family_knc.h" // skipped #endif #ifdef BLIS_FAMILY_HASWELL #include "bli_family_haswell.h" // skipped #endif #ifdef BLIS_FAMILY_SANDYBRIDGE #include "bli_family_sandybridge.h" // skipped #endif #ifdef BLIS_FAMILY_PENRYN #include "bli_family_penryn.h" // skipped #endif // -- AMD64 architectures -- #ifdef BLIS_FAMILY_ZEN3 #include "bli_family_zen3.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN2 #include "bli_family_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN #include "bli_family_zen.h" // skipped #endif #ifdef BLIS_FAMILY_EXCAVATOR #include "bli_family_excavator.h" // skipped #endif #ifdef BLIS_FAMILY_STEAMROLLER #include "bli_family_steamroller.h" // skipped #endif #ifdef BLIS_FAMILY_PILEDRIVER #include "bli_family_piledriver.h" // skipped #endif #ifdef BLIS_FAMILY_BULLDOZER #include "bli_family_bulldozer.h" // skipped #endif // -- ARM families -- #ifdef BLIS_FAMILY_ARM64 // begin 
bli_family_arm64.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 // SVE-specific configs. #define N_L1_SVE_DEFAULT 64 #define W_L1_SVE_DEFAULT 4 #define C_L1_SVE_DEFAULT 256 #define N_L2_SVE_DEFAULT 2048 #define W_L2_SVE_DEFAULT 16 #define C_L2_SVE_DEFAULT 256 #define N_L3_SVE_DEFAULT 8192 #define W_L3_SVE_DEFAULT 16 #define C_L3_SVE_DEFAULT 256 //#endif // end bli_family_arm64.h #endif #ifdef BLIS_FAMILY_ARM32 #include "bli_family_arm32.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_FAMILY_ARMSVE // begin bli_family_armsve.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 256 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 // SVE-specific configs. #define N_L1_SVE_DEFAULT 64 #define W_L1_SVE_DEFAULT 4 #define C_L1_SVE_DEFAULT 256 #define N_L2_SVE_DEFAULT 2048 #define W_L2_SVE_DEFAULT 16 #define C_L2_SVE_DEFAULT 256 #define N_L3_SVE_DEFAULT 8192 #define W_L3_SVE_DEFAULT 16 #define C_L3_SVE_DEFAULT 256 //#endif // end bli_family_armsve.h #endif #ifdef BLIS_FAMILY_A64FX #include "bli_family_a64fx.h" // skipped #endif #ifdef BLIS_FAMILY_FIRESTORM // begin bli_family_firestorm.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x12 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 12 #define BLIS_DEFAULT_MC_S 120 //1536 //336 //416 // 1280 //160 // 160 // 160 //2048 //336 #define BLIS_DEFAULT_KC_S 640 //1536 //336 //704 //1280 //672 //528 // 856 //2048 //528 #define BLIS_DEFAULT_NC_S 3072 #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_6x8 #define BLIS_DEFAULT_MR_D 6 
#define BLIS_DEFAULT_NR_D 8 #define BLIS_DEFAULT_MC_D 120 //1536 //160 //80 //176 #define BLIS_DEFAULT_KC_D 240 //1536 //304 //336 //368 #define BLIS_DEFAULT_NC_D 3072 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_DEFAULT_MC_C 64 #define BLIS_DEFAULT_KC_C 128 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_Z 8 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 128 #define BLIS_DEFAULT_NC_Z 4096 #endif //#endif // end bli_family_firestorm.h #endif #ifdef BLIS_FAMILY_THUNDERX2 // begin bli_family_thunderx2.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 // end bli_family_thunderx2.h #endif #ifdef BLIS_FAMILY_CORTEXA57 // begin bli_family_cortexa57.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x12 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 12 #define BLIS_DEFAULT_MC_S 120 //1536 //336 //416 // 1280 //160 // 160 // 160 //2048 //336 #define BLIS_DEFAULT_KC_S 640 //1536 //336 //704 //1280 //672 //528 // 856 //2048 //528 #define BLIS_DEFAULT_NC_S 3072 #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_6x8 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DEFAULT_MC_D 120 //1536 //160 //80 //176 #define BLIS_DEFAULT_KC_D 240 //1536 //304 //336 //368 #define BLIS_DEFAULT_NC_D 3072 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_DEFAULT_MC_C 64 #define BLIS_DEFAULT_KC_C 128 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_Z 8 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 128 #define BLIS_DEFAULT_NC_Z 4096 #endif //#endif // end bli_family_cortexa57.h #endif #ifdef 
BLIS_FAMILY_CORTEXA53 // begin bli_family_cortexa53.h // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 // end bli_family_cortexa53.h #endif #ifdef BLIS_FAMILY_CORTEXA15 #include "bli_family_cortexa15.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA9 #include "bli_family_cortexa9.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_FAMILY_POWER10 #include "bli_family_power10.h" // skipped #endif #ifdef BLIS_FAMILY_POWER9 #include "bli_family_power9.h" // skipped #endif #ifdef BLIS_FAMILY_POWER7 #include "bli_family_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_FAMILY_BGQ #include "bli_family_bgq.h" // skipped #endif // -- Generic -- #ifdef BLIS_FAMILY_GENERIC // begin bli_family_generic.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_generic.h #endif // // -- kernel set prototypes ---------------------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_KERNELS_SKX #include "bli_kernels_skx.h" // skipped #endif #ifdef BLIS_KERNELS_KNL #include "bli_kernels_knl.h" // skipped #endif #ifdef BLIS_KERNELS_KNC #include "bli_kernels_knc.h" // skipped #endif #ifdef BLIS_KERNELS_HASWELL #include "bli_kernels_haswell.h" // skipped #endif #ifdef BLIS_KERNELS_SANDYBRIDGE #include "bli_kernels_sandybridge.h" // skipped #endif #ifdef BLIS_KERNELS_PENRYN #include "bli_kernels_penryn.h" // skipped #endif // -- AMD64 architectures -- #ifdef BLIS_KERNELS_ZEN2 #include "bli_kernels_zen2.h" // skipped #endif #ifdef BLIS_KERNELS_ZEN #include "bli_kernels_zen.h" // skipped #endif //#ifdef BLIS_KERNELS_EXCAVATOR //#include "bli_kernels_excavator.h" //#endif //#ifdef BLIS_KERNELS_STEAMROLLER //#include "bli_kernels_steamroller.h" //#endif #ifdef BLIS_KERNELS_PILEDRIVER #include "bli_kernels_piledriver.h" // skipped #endif #ifdef BLIS_KERNELS_BULLDOZER #include "bli_kernels_bulldozer.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_KERNELS_ARMSVE // 
begin bli_kernels_armsve.h // begin ./3/bli_armsve_utils.h // skipped #include "blis.h" dim_t bli_vl_bytes_armsve(void); void bli_s_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); void bli_d_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); void bli_c_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); void bli_z_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_); // end ./3/bli_armsve_utils.h // GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_armsve_asm_2vx10_unindexed ) GEMM_UKR_PROT( float, s, gemm_armsve_asm_2vx10_unindexed ) GEMM_UKR_PROT( scomplex, c, gemm_armsve_asm_2vx10_unindexed ) GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx10_unindexed ) // GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx8_unindexed ) // GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx7_unindexed ) //GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_2vx10_unindexed ) //GEMMSUP_KER_PROT( double, d, gemmsup_cv_armsve_2vx10_unindexed ) //GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_10x2v_unindexed ) // Use SVE intrinsics only for referred cases. 
#if !defined(BLIS_FAMILY_A64FX) PACKM_KER_PROT( double, d, packm_armsve256_int_8xk ) PACKM_KER_PROT( double, d, packm_armsve512_int_12xk ) #endif PACKM_KER_PROT( double, d, packm_armsve512_asm_16xk ) PACKM_KER_PROT( double, d, packm_armsve512_asm_10xk ) // end bli_kernels_armsve.h #endif #ifdef BLIS_KERNELS_ARMV8A // begin bli_kernels_armv8a.h PACKM_KER_PROT( float, s, packm_armv8a_int_8xk ) PACKM_KER_PROT( float, s, packm_armv8a_int_12xk ) PACKM_KER_PROT( double, d, packm_armv8a_int_6xk ) PACKM_KER_PROT( double, d, packm_armv8a_int_8xk ) GEMM_UKR_PROT( float, s, gemm_armv8a_asm_8x12 ) GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8 ) // GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8r ) // GEMM_UKR_PROT( double, d, gemm_armv8a_asm_8x4 ) // GEMM_UKR_PROT( double, d, gemm_armv8a_asm_4x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_4x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_4x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_8x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_int_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_int_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x3 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_int_6x4mn ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_int_3x8mn ) // end bli_kernels_armv8a.h #endif #ifdef BLIS_KERNELS_ARMV7A #include "bli_kernels_armv7a.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_KERNELS_POWER10 #include "bli_kernels_power10.h" // skipped #endif #ifdef BLIS_KERNELS_POWER9 #include "bli_kernels_power9.h" // skipped #endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_KERNELS_BGQ #include "bli_kernels_bgq.h" // skipped #endif 
#endif // end bli_arch_config.h // begin bli_kernel_macro_defs.h #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H // -- Define default threading parameters -------------------------------------- // -- Conventional (large code path) values -- // These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n // dimensions for the purposes of factorizing the total number of threads into // ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these // macros are used. #ifndef BLIS_THREAD_RATIO_M #define BLIS_THREAD_RATIO_M 1 #endif #ifndef BLIS_THREAD_RATIO_N #define BLIS_THREAD_RATIO_N 1 #endif // These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of // parallelism allowed when performing automatic factorization. See bli_rntm.c // to see how these macros are used. #ifndef BLIS_THREAD_MAX_IR #define BLIS_THREAD_MAX_IR 1 #endif #ifndef BLIS_THREAD_MAX_JR #define BLIS_THREAD_MAX_JR 4 #endif #if 0 // -- Skinny/small possibly-unpacked (sup code path) values -- #ifndef BLIS_THREAD_SUP_RATIO_M #define BLIS_THREAD_SUP_RATIO_M 1 #endif #ifndef BLIS_THREAD_SUP_RATIO_N #define BLIS_THREAD_SUP_RATIO_N 2 #endif #ifndef BLIS_THREAD_SUP_MAX_IR #define BLIS_THREAD_SUP_MAX_IR 1 #endif #ifndef BLIS_THREAD_SUP_MAX_JR #define BLIS_THREAD_SUP_MAX_JR 8 #endif #endif // -- Memory allocation -------------------------------------------------------- // hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with // libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND // was explicitly defined. #ifdef BLIS_DISABLE_MEMKIND #undef BLIS_ENABLE_MEMKIND #endif #ifdef BLIS_ENABLE_MEMKIND #include // skipped #endif // Memory allocation functions. These macros define the three types of // malloc()-style functions, and their free() counterparts: one for each // type of memory to be allocated. 
// NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING // THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc() // and free(): // // void* malloc( size_t size ); // void free( void* p ); // // This allocation function is called to allocate memory for blocks within // BLIS's internal memory pools. #ifndef BLIS_MALLOC_POOL // If use of libmemkind was enabled at configure-time, the default // memory allocation function for memory pools should be hbw_malloc() // instead of malloc(). #ifdef BLIS_ENABLE_MEMKIND #define BLIS_MALLOC_POOL hbw_malloc #else #define BLIS_MALLOC_POOL malloc #endif #endif #ifndef BLIS_FREE_POOL // If use of libmemkind was enabled at configure-time, the default // memory deallocation function for memory pools should be hbw_free() // instead of free(). #ifdef BLIS_ENABLE_MEMKIND #define BLIS_FREE_POOL hbw_free #else #define BLIS_FREE_POOL free #endif #endif // This allocation function is called to allocate memory for internally- // used objects and structures, such as control tree nodes. #ifndef BLIS_MALLOC_INTL #define BLIS_MALLOC_INTL malloc #endif #ifndef BLIS_FREE_INTL #define BLIS_FREE_INTL free #endif // This allocation function is called to allocate memory for objects // created by user-level API functions, such as bli_obj_create(). #ifndef BLIS_MALLOC_USER #define BLIS_MALLOC_USER malloc #endif #ifndef BLIS_FREE_USER #define BLIS_FREE_USER free #endif // -- Other system-related definitions ----------------------------------------- // Size of a virtual memory page. This is used to align blocks within the // memory pools. #ifndef BLIS_PAGE_SIZE #define BLIS_PAGE_SIZE 4096 #endif // The maximum number of named SIMD vector registers available for use. // When configuring with umbrella configuration families, this should be // set to the maximum number of registers across all sub-configurations in // the family. 
#ifndef BLIS_SIMD_MAX_NUM_REGISTERS #define BLIS_SIMD_MAX_NUM_REGISTERS 32 #endif // The maximum size (in bytes) of each SIMD vector. // When configuring with umbrella configuration families, this should be // set to the maximum SIMD size across all sub-configurations in the family. #ifndef BLIS_SIMD_MAX_SIZE #define BLIS_SIMD_MAX_SIZE 64 #endif // Alignment size (in bytes) needed by the instruction set for aligned // SIMD/vector instructions. #ifndef BLIS_SIMD_ALIGN_SIZE #define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_MAX_SIZE #endif // The maximum size in bytes of local stack buffers within macro-kernel // functions. These buffers are usually used to store a temporary copy // of a single microtile. The reason we multiply by 2 is to handle induced // methods, where we use real domain register blocksizes in units of // complex elements. Specifically, the macro-kernels will need this larger // micro-tile footprint, even though the virtual micro-kernels will only // ever be writing to half (real or imaginary part) at a time. #ifndef BLIS_STACK_BUF_MAX_SIZE #define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_MAX_NUM_REGISTERS * \ BLIS_SIMD_MAX_SIZE * 2 ) #endif // Alignment size used to align local stack buffers within macro-kernel // functions. #ifndef BLIS_STACK_BUF_ALIGN_SIZE #define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE #endif // Alignment size used when allocating memory via BLIS_MALLOC_USER. // To disable heap alignment, set this to 1. #ifndef BLIS_HEAP_ADDR_ALIGN_SIZE #define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE #endif // Alignment size used when sizing leading dimensions of memory allocated // via BLIS_MALLOC_USER. #ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE #define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE #endif // Alignment sizes used when allocating blocks to the internal memory // pool, via BLIS_MALLOC_POOL. 
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A #define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B #define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C #define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN #define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE #endif // Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*. #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A #define BLIS_POOL_ADDR_OFFSET_SIZE_A 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B #define BLIS_POOL_ADDR_OFFSET_SIZE_B 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C #define BLIS_POOL_ADDR_OFFSET_SIZE_C 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif #endif // end bli_kernel_macro_defs.h // -- Base operation prototypes -- // begin bli_init.h BLIS_EXPORT_BLIS void bli_init( void ); BLIS_EXPORT_BLIS void bli_finalize( void ); void bli_init_auto( void ); void bli_finalize_auto( void ); void bli_init_apis( void ); void bli_finalize_apis( void ); void bli_init_once( void ); void bli_finalize_once( void ); // end bli_init.h // begin bli_malloc.h // Typedef function pointer types for malloc() and free() substitutes. 
// The two typedefs below are commented out in this header; malloc_ft and
// free_ft are nevertheless used by the bli_fmalloc_*/bli_ffree_* prototypes
// further down, so they must be declared elsewhere (not visible here).
//typedef void* (*malloc_ft) ( size_t size );
//typedef void (*free_ft) ( void* p );

// -----------------------------------------------------------------------------

// Pool allocator prototypes are compiled out.
#if 0
BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size );
BLIS_EXPORT_BLIS void bli_free_pool( void* p );
#endif

// Allocation wrappers. The malloc/calloc variants take an err_t* r_val
// out-parameter in addition to the size (presumably a status code written by
// the callee -- confirm against the definitions).
void* bli_malloc_intl( size_t size, err_t* r_val );
void* bli_calloc_intl( size_t size, err_t* r_val );
void bli_free_intl( void* p );

BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val );
BLIS_EXPORT_BLIS void bli_free_user( void* p );

// -----------------------------------------------------------------------------

// Generic allocation helpers parameterized on the malloc()/free() substitute
// passed as the first argument f.
void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val );
void bli_ffree_align( free_ft f, void* p );

void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val );
void bli_ffree_noalign( free_ft f, void* p );

void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size );
void bli_fmalloc_post_check( void* p );
// end bli_malloc.h

// begin bli_const.h
void bli_const_init( void );
void bli_const_finalize( void );
// end bli_const.h

// begin bli_obj.h
// begin bli_obj_check.h

// One *_check prototype per bli_obj.h operation declared below; each takes
// the same kind of arguments as the operation it is named after.
void bli_obj_create_check( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj );
void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj );
void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj );
void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj );
void bli_obj_create_scalar_check( num_t dt, obj_t* obj );
void bli_obj_free_check( obj_t* obj );
void bli_obj_create_const_check( double value, obj_t* obj );
void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b );
void bli_dt_size_check( num_t dt );
void bli_dt_string_check( num_t dt );
void bli_dt_union_check( num_t dt1, num_t dt2 );
void bli_obj_print_check( char* label, obj_t* obj );
// end bli_obj_check.h

// obj_t creation / buffer management prototypes. Parameter naming used
// throughout: dt = datatype id, m/n = dimensions, rs/cs/is = strides
// (NOTE(review): presumably row, column and imaginary stride -- confirm),
// p = caller-supplied buffer.
BLIS_EXPORT_BLIS void bli_obj_create ( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer ( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_create_without_buffer ( num_t dt, dim_t m, dim_t n, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_alloc_buffer ( inc_t rs, inc_t cs, inc_t is, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_attach_buffer ( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_create_1x1 ( num_t dt, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer ( num_t dt, void* p, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_create_conf_to ( obj_t* s, obj_t* d );
BLIS_EXPORT_BLIS void bli_obj_free ( obj_t* obj );

// Not exported; rs, cs and is are passed by pointer, so the callee can
// modify the caller's stride values.
void bli_adjust_strides ( dim_t m, dim_t n, siz_t elem_size, inc_t* rs, inc_t* cs, inc_t* is );

// Datatype and alignment utilities.
BLIS_EXPORT_BLIS siz_t bli_dt_size ( num_t dt );
BLIS_EXPORT_BLIS char* bli_dt_string ( num_t dt );
BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult ( dim_t dim, dim_t dim_mult );
BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size ( dim_t dim, siz_t elem_size, siz_t align_size );
BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size ( void* p, size_t align_size );

BLIS_EXPORT_BLIS void bli_obj_print ( char* label, obj_t* obj );
// end bli_obj.h

// begin bli_obj_scalar.h

// Prototypes operating on a scalar associated with an obj_t (alpha/beta are
// themselves obj_t pointers). Semantics are defined by the implementations,
// which are not part of this header.
BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached ( num_t dt, obj_t* beta );
BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of ( num_t dt, conj_t conj, obj_t* alpha, obj_t* beta );
BLIS_EXPORT_BLIS void bli_obj_scalar_detach ( obj_t* a, obj_t* alpha );
BLIS_EXPORT_BLIS void bli_obj_scalar_attach ( conj_t conj, obj_t* alpha, obj_t* a );
BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to ( num_t dt, obj_t* a );
BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar ( obj_t* alpha, obj_t* a );
BLIS_EXPORT_BLIS void bli_obj_scalar_reset ( obj_t* a );
BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag ( obj_t* a );
BLIS_EXPORT_BLIS bool bli_obj_scalar_equals ( obj_t* a, obj_t* beta );
// end bli_obj_scalar.h

// begin bli_blksz.h

//
blksz_t query BLIS_INLINE dim_t bli_blksz_get_def ( num_t dt, blksz_t* b ) { return b->v[ dt ]; } BLIS_INLINE dim_t bli_blksz_get_max ( num_t dt, blksz_t* b ) { return b->e[ dt ]; } // blksz_t modification BLIS_INLINE void bli_blksz_set_def ( dim_t val, num_t dt, blksz_t* b ) { b->v[ dt ] = val; } BLIS_INLINE void bli_blksz_set_max ( dim_t val, num_t dt, blksz_t* b ) { b->e[ dt ] = val; } BLIS_INLINE void bli_blksz_copy ( blksz_t* b_src, blksz_t* b_dst ) { *b_dst = *b_src; } BLIS_INLINE void bli_blksz_copy_if_pos ( blksz_t* b_src, blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that // we can skip the ones that are non-positive. const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); } BLIS_INLINE void bli_blksz_copy_def_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); bli_blksz_set_def( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_max_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); 
bli_blksz_set_max( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_scale_def ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_def( dt, b ); bli_blksz_set_def( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_max( dt, b ); bli_blksz_set_max( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_def_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { bli_blksz_scale_def( num, den, dt, b ); bli_blksz_scale_max( num, den, dt, b ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS blksz_t* bli_blksz_create ( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_easy ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z ); BLIS_EXPORT_BLIS void bli_blksz_free ( blksz_t* b ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); #endif void bli_blksz_reduce_def_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); void bli_blksz_reduce_max_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); // 
----------------------------------------------------------------------------- dim_t bli_determine_blocksize ( dir_t direct, dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_b ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); dim_t bli_determine_blocksize_b_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); // end bli_blksz.h // begin bli_func.h // ----------------------------------------------------------------------------- // func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); // end bli_func.h // begin bli_mbool.h // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // 
----------------------------------------------------------------------------- mbool_t* bli_mbool_create ( bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_init ( mbool_t* b, bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_free( mbool_t* b ); // end bli_mbool.h // begin bli_cntx.h #ifndef BLIS_CNTX_H #define BLIS_CNTX_H // Context object type (defined in bli_type_defs.h) // ----------------------------------------------------------------------------- // // -- cntx_t query (fields only) ----------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx ) { return cntx->blkszs; } BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) { return cntx->bmults; } BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) { return cntx->l3_vir_ukrs; } BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs; } BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs_prefs; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) { return cntx->l3_sup_thresh; } BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) { return cntx->l3_sup_blkszs; } BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) { return cntx->l3_sup_kers; } BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) { return cntx->l3_sup_kers_prefs; } BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) { return cntx->l1f_kers; } BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) { return cntx->l1v_kers; } BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) { return cntx->packm_kers; } BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) { return cntx->unpackm_kers; } BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) { return cntx->method; } // 
----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* 
bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* 
cntx ) { func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested packm func_t if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* funcs = bli_cntx_packm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the packm func_t (and then extract the // datatype-specific function pointer) if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested unpackm func_t if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the unpackm func_t (and then extract the // datatype-specific function pointer) if the unpackm kernel being // requested is one that is explicitly supported. 
if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. 
// NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. // NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. 
return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } #if 0 // NOTE: These static functions aren't needed yet. BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { const num_t dt = bli_obj_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); } #endif // ----------------------------------------------------------------------------- // // -- cntx_t modification (complex) -------------------------------------------- // // NOTE: The framework does not use any of the following functions. We provide // them in order to facilitate creating/modifying custom contexts. 
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif // end bli_rntm.h // begin bli_gks.h #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* 
bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif // end bli_gks.h // begin bli_ind.h #ifndef BLIS_IND_H #define BLIS_IND_H // level-3 induced method management // begin bli_l3_ind.h #ifndef BLIS_L3_IND_H #define BLIS_L3_IND_H // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- //bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ); void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ); void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ); bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ); #endif // end bli_l3_ind.h void bli_ind_init( void ); void bli_ind_finalize( void ); BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t 
method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); char* bli_ind_get_impl_string( ind_t method ); num_t bli_ind_map_cdt_to_index( num_t dt ); #endif // end bli_ind.h // begin bli_pba.h #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H // Packing block allocator (formerly memory broker) // pba init //BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ // bli_pthread_mutex_init( &(pba->mutex), NULL ); //} //BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ // bli_pthread_mutex_destroy( &(pba->mutex) ); //} // pba query BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { return &(pba->pools[ pool_index ]); } BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { return pba->align_size; } BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { return pba->malloc_fp; } BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { return pba->free_fp; } // pba modification BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { pba->align_size = align_size; } BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { pba->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { pba->free_fp = free_fp; } // pba action BLIS_INLINE void bli_pba_lock( pba_t* pba ) { bli_pthread_mutex_lock( &(pba->mutex) ); } BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( cntx_t* cntx ); void bli_pba_finalize ( void ); 
void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif // end bli_pba.h // begin bli_pool.h #ifndef BLIS_POOL_H #define BLIS_POOL_H // -- Pool block type -- // -- Pool type -- // Pool block query BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) { return pblk->buf; } BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) { return pblk->block_size; } // Pool block modification BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk ) { pblk->buf = buf; } BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) { pblk->block_size = block_size; } // // -- pool block initialization ------------------------------------------------ // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the pblk_t type definition. An alternative to the initializer is // calling bli_pblk_clear() at runtime. 
#define BLIS_PBLK_INITIALIZER \ { \ .buf = NULL, \ .block_size = 0, \ } \ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); bli_pblk_set_block_size( 0, pblk ); } // Pool entry query BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) { return pool->block_size; } BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) { return pool->align_size; } BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) { return pool->offset_size; } BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) { return pool->malloc_fp; } BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) { return pool->free_fp; } BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); } // Pool entry modification BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ { pool->block_size = block_size; } BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ { pool->align_size = align_size; } BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ { pool->offset_size = offset_size; } BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ { pool->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* 
pool ) \ { pool->free_fp = free_fp; } BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } // ----------------------------------------------------------------------------- void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ); void bli_pool_finalize ( pool_t* restrict pool ); void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ); void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ); void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ); void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ); void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ); void bli_pool_print ( pool_t* restrict pool ); void bli_pblk_print ( pblk_t* restrict pblk ); #endif // end bli_pool.h // begin bli_array.h #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // 
----------------------------------------------------------------------------- void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ); void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ); void bli_array_finalize ( array_t* restrict array ); void* bli_array_elem ( const siz_t index, array_t* restrict array ); void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ); #endif // end bli_array.h // begin bli_apool.h #ifndef BLIS_APOOL_H #define BLIS_APOOL_H // -- Locked pool-of-arrays type -- // apool entry query BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool ) { return &(apool->pool); } BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) { return &(apool->mutex); } BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) { return pool->def_array_len; } BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) { pool_t* restrict pool = bli_apool_pool( apool ); return bli_pool_is_exhausted( pool ); } // apool action BLIS_INLINE void bli_apool_lock( apool_t* apool ) { bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); } BLIS_INLINE void bli_apool_unlock( apool_t* apool ) { bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); } // apool entry modification BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ { pool->def_array_len = def_array_len; } // ----------------------------------------------------------------------------- void bli_apool_init ( apool_t* restrict apool ); void bli_apool_finalize ( apool_t* restrict apool ); array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ); void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ); pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ); void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool ); void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ); void bli_apool_free_block 
( array_t* restrict array ); #endif // end bli_apool.h // begin bli_sba.h #ifndef BLIS_SBA_H #define BLIS_SBA_H apool_t* bli_sba_query( void ); // ----------------------------------------------------------------------------- void bli_sba_init( void ); void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( const siz_t n_threads ); void bli_sba_checkin_array ( array_t* restrict array ); void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm ); void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size ); void bli_sba_release ( rntm_t* restrict rntm, void* restrict block ); #endif // end bli_sba.h // begin bli_memsys.h #ifndef BLIS_MEMSYS_H #define BLIS_MEMSYS_H // ----------------------------------------------------------------------------- void bli_memsys_init( void ); void bli_memsys_finalize( void ); #endif // end bli_memsys.h // begin bli_mem.h #ifndef BLIS_MEM_H #define BLIS_MEM_H // mem_t object type (defined in bli_type_defs.h) // // -- mem_t query -------------------------------------------------------------- // BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) { return &(mem->pblk); } BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) { return bli_pblk_buf( bli_mem_pblk( mem ) ); } BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) { return mem->buf_type; } BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) { return mem->pool; } BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) { return mem->size; } BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); } // // -- mem_t modification ------------------------------------------------------- // BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem ) { mem->pblk = *pblk; } BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem ) { bli_pblk_set_buf( buf, &(mem->pblk) ); } BLIS_INLINE void bli_mem_set_buf_type( packbuf_t 
buf_type, mem_t* mem ) { mem->buf_type = buf_type; } BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem ) { mem->pool = pool; } BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; } // // -- mem_t initialization ----------------------------------------------------- // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the mem_t type definition. An alternative to the initializer is // calling bli_mem_clear() at runtime. #define BLIS_MEM_INITIALIZER \ { \ .pblk = BLIS_PBLK_INITIALIZER, \ .buf_type = -1, \ .pool = NULL, \ .size = 0, \ } \ BLIS_INLINE void bli_mem_clear( mem_t* mem ) { bli_mem_set_buffer( NULL, mem ); #ifdef __cplusplus const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE; // When using C++, which is strongly typed, we avoid use of -1 as a // packbuf_t value since it will result in a compile-time error. bli_mem_set_buf_type( pb, mem ); #else bli_mem_set_buf_type( ( packbuf_t )-1, mem ); #endif bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); } #endif // end bli_mem.h // begin bli_part.h // begin bli_part_check.h void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); // end bli_part_check.h // -- Matrix partitioning ------------------------------------------------------ BLIS_EXPORT_BLIS void bli_acquire_mpart ( dim_t i, dim_t j, dim_t m, dim_t n, obj_t* obj, obj_t* sub_obj ); #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) GENPROT( acquire_mpart_b2t ) GENPROT( acquire_mpart_l2r ) GENPROT( acquire_mpart_r2l ) GENPROT( acquire_mpart_tl2br ) GENPROT( 
acquire_mpart_br2tl ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ dir_t direct, \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_mdim ) GENPROT( acquire_mpart_ndim ) GENPROT( acquire_mpart_mndim ) // -- Vector partitioning ------------------------------------------------------ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_vpart_f2b ) GENPROT( acquire_vpart_b2f ) // -- Scalar acquisition ------------------------------------------------------- BLIS_EXPORT_BLIS void bli_acquire_mij ( dim_t i, dim_t j, obj_t* obj, obj_t* sub_obj ); BLIS_EXPORT_BLIS void bli_acquire_vi ( dim_t i, obj_t* obj, obj_t* sub_obj ); // end bli_part.h // begin bli_prune.h void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ); // end bli_prune.h // begin bli_query.h BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a ); // end bli_query.h // begin bli_auxinfo.h #ifndef BLIS_AUXINFO_MACRO_DEFS_H #define BLIS_AUXINFO_MACRO_DEFS_H // auxinfo_t field query BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) { return ai->schema_a; } BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) { return ai->schema_b; } BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) { return ai->a_next; } BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) { return ai->b_next; } BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) { return ai->is_a; } BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) { return ai->is_b; } BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) { return ai->ps_a; } BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) { return ai->ps_b; } BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) { 
return ai->ukr; } BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) { return ai->params; } // auxinfo_t field modification BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai ) { ai->schema_a = schema; } BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) { ai->schema_b = schema; } BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) { ai->a_next = p; } BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) { ai->b_next = p; } BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; } BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai ) { ai->is_a = is; } BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) { ai->is_b = is; } BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai ) { ai->ps_a = ps; } BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) { ai->ps_b = ps; } BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { ai->ukr = ukr; } BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) { ai->params = params; } #endif // end bli_auxinfo.h // begin bli_param_map.h // --- BLIS to BLAS/LAPACK mappings -------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval ); // --- BLAS/LAPACK to BLIS mappings -------------------------------------------- // NOTE: These static functions were converted from regular functions in order // to reduce function call overhead within the BLAS compatibility layer. 
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( 
subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); err_t bli_check_valid_cntl( void* cntl ); err_t bli_check_packm_schema_on_unpack( obj_t* a ); err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); // end bli_check.h // begin bli_error.h BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); void bli_print_msg( char* str, char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); char* bli_error_string_for_code( gint_t code ); // end bli_error.h // begin bli_f2c.h // f2c.h -- Standard Fortran to C header file // barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." // - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) #ifndef BLIS_F2C_H #define BLIS_F2C_H typedef f77_int bla_integer; typedef f77_char bla_character; //typedef char *address; //typedef short int shortint; typedef float bla_real; typedef double bla_double; typedef scomplex bla_scomplex; typedef dcomplex bla_dcomplex; typedef f77_int bla_logical; //typedef short int shortlogical; //typedef char logical1; //typedef char integer1; #ifdef INTEGER_STAR_8 // Adjust for integer*8. 
typedef long long longint; // system-dependent typedef unsigned long long ulongint; // system-dependent #define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) #define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) #endif #ifndef TRUE_ #define TRUE_ (1) #endif #ifndef FALSE_ #define FALSE_ (0) #endif // Extern is for use with -E #ifndef Extern #define Extern extern #endif // I/O stuff #ifdef f2c_i2 // for -i2 //typedef short flag; //typedef short ftnlen; typedef bla_integer ftnlen; //typedef short ftnint; #else //typedef long int flag; //typedef long int ftnlen; typedef bla_integer ftnlen; //typedef long int ftnint; #endif #ifndef VOID #define VOID void #endif #ifndef f2c_abs #define f2c_abs(x) ((x) >= 0 ? (x) : -(x)) #endif #ifndef f2c_dabs #define f2c_dabs(x) (doublereal)f2c_abs(x) #endif #ifndef f2c_min #define f2c_min(a,b) ((a) <= (b) ? (a) : (b)) #endif #ifndef f2c_max #define f2c_max(a,b) ((a) >= (b) ? (a) : (b)) #endif #ifndef f2c_dmin #define f2c_dmin(a,b) (doublereal)f2c_min(a,b) #endif #ifndef f2c_dmax #define f2c_dmax(a,b) (doublereal)f2c_max(a,b) #endif #ifndef bit_test #define bit_test(a,b) ((a) >> (b) & 1) #endif #ifndef bit_clear #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) #endif #ifndef bit_set #define bit_set(a,b) ((a) | ((uinteger)1 << (b))) #endif // undef any lower-case symbols that your C compiler predefines, e.g.: #ifndef Skip_f2c_Undefs #undef cray #undef gcos #undef mc68010 #undef mc68020 #undef mips #undef pdp11 #undef sgi #undef sparc #undef sun #undef sun2 #undef sun3 #undef sun4 #undef u370 #undef u3b #undef u3b2 #undef u3b5 #undef unix #undef vax #endif #endif // end bli_f2c.h // begin bli_machval.h // begin bli_lsame.h bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len ); // end bli_lsame.h // begin bli_slamch.h bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len ); // end bli_slamch.h // begin bli_dlamch.h bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len ); // end 
bli_dlamch.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v ); // // Prototype BLAS-like interfaces. // #undef GENTPROTR #define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \ ( \ machval_t mval, \ void* v \ ); INSERT_GENTPROTR_BASIC0( machval ) // end bli_machval.h // begin bli_getopt.h typedef struct getopt_s { char* optarg; int optind; int opterr; int optopt; } getopt_t; BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state ); BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ); // end bli_getopt.h // begin bli_opid.h BLIS_INLINE bool bli_opid_is_level3( opid_t opid ) { return ( bool ) ( BLIS_GEMM <= opid && opid <= BLIS_TRSM ); } // end bli_opid.h // begin bli_cntl.h // -- Control tree prototypes -- BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, void* params, cntl_t* sub_node ); BLIS_EXPORT_BLIS void bli_cntl_free_node ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_clear_node ( cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_mark_family ( opid_t family, cntl_t* cntl ); // ----------------------------------------------------------------------------- dim_t bli_cntl_calc_num_threads_in ( rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- // cntl_t query (fields only) BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl ) { return cntl->family; } 
BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl ) { return cntl->bszid; } BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl ) { return cntl->var_func; } BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl ) { return cntl->sub_prenode; } BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl ) { return cntl->sub_node; } BLIS_INLINE void* bli_cntl_params( cntl_t* cntl ) { return cntl->params; } BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl ) { // The first 64 bytes is always the size of the params structure. return *( ( uint64_t* )(cntl->params) ); } BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) { return &(cntl->pack_mem); } // cntl_t query (complex) BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl ) { return ( bool ) ( cntl == NULL ); } BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl ) { return ( bool ) ( bli_cntl_sub_node( cntl ) == NULL ); } BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl ) { return ( bool ) ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); } // cntl_t modification BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl ) { cntl->family = family; } BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl ) { cntl->bszid = bszid; } BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl ) { cntl->var_func = var_func; } BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl ) { cntl->sub_prenode = sub_prenode; } BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl ) { cntl->sub_node = sub_node; } BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl ) { cntl->params = params; } BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) { cntl->pack_mem = *pack_mem; } // end bli_cntl.h // begin bli_env.h #ifndef BLIS_ENV_H #define BLIS_ENV_H gint_t bli_env_get_var( const char* env, gint_t fallback ); //void bli_env_set_var( const char* env, dim_t value ); #endif // end bli_env.h // begin bli_pack.h #ifndef BLIS_PACK_H #define BLIS_PACK_H 
// Library-internal prototypes (not marked BLIS_EXPORT_BLIS).
void bli_pack_init( void );
void bli_pack_finalize( void );

// Exported getters/setters for the pack_a / pack_b flags. The getters write
// their result through the pointer argument.
BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a );
BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b );
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a );
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b );

void bli_pack_init_rntm_from_env( rntm_t* rntm );

#endif

// end bli_pack.h
// begin bli_info.h

// -- General library information ----------------------------------------------

BLIS_EXPORT_BLIS char* bli_info_get_version_str( void );
BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void );

// -- General configuration-related --------------------------------------------

// All of the queries below take no arguments and return a gint_t.
BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void );
// Continuation of the bli_info.h configuration queries; each takes no
// arguments and returns a gint_t.
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void );

// -- Kernel implementation-related --------------------------------------------

// -- Level-3 kernel definitions --

// Each query takes an (ind_t method, num_t dt) pair and returns a string.
BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt );

// -- BLIS implementation query (level-3) --------------------------------------

// Each query takes a datatype and returns a string.
BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm3_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt );

// end bli_info.h
// begin bli_arch.h

#ifndef BLIS_ARCH_H
#define BLIS_ARCH_H

BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void );

void bli_arch_set_id_once( void );
void bli_arch_set_id( void );

BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id );

void bli_arch_set_logging( bool dolog );
bool bli_arch_get_logging( void );
// Variadic; the first argument is presumably a printf-style format string
// (TODO confirm against bli_arch.c).
void bli_arch_log( char*, ... );

#endif

// end bli_arch.h
// begin bli_cpuid.h

// The block below is compiled out (#if 0); it only supplies stand-in
// definitions for building this section outside of BLIS.
#if 0
// Used only during standalone testing of ARM support.
#define FALSE 0
#define TRUE 1
typedef enum
{
    BLIS_ARCH_CORTEXA57 = 10,
    BLIS_ARCH_CORTEXA15 = 11,
    BLIS_ARCH_CORTEXA9 = 12,
    BLIS_ARCH_GENERIC = 13
} arch_t;
typedef uint64_t bool;
#define bli_abort abort
#endif

#ifndef BLIS_CPUID_H
#define BLIS_CPUID_H

arch_t bli_cpuid_query_id( void );

// Intel
bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features );

// AMD
bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features );

// ARM
// NOTE: the ARM predicates take (model, part, features) rather than the
// (family, model, features) triple used by the x86 predicates above.
bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );

uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features );

// -----------------------------------------------------------------------------

//
// This section of the file was based off of cpuid.hpp from TBLIS [1].
//
// [1] https://github.com/devinamatthews/tblis
//

// True only when every bit set in `want` is also set in `have`.
BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want )
{
    return ( have & want ) == want;
}

// -----------------------------------------------------------------------------

#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)

// cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393
// for more information why this move was made.
//#include "cpuid.h"

void get_cpu_name( char *cpu_name );
int vpu_count( void );

// x86 vendor identifiers.
enum
{
    VENDOR_INTEL = 0,
    VENDOR_AMD,
    VENDOR_UNKNOWN
};
// x86 feature flags. Each value is a distinct single bit, so flags can be
// OR-ed together into one uint32_t mask.
enum
{
    FEATURE_SSE3     = 0x0001,
    FEATURE_SSSE3    = 0x0002,
    FEATURE_SSE41    = 0x0004,
    FEATURE_SSE42    = 0x0008,
    FEATURE_AVX      = 0x0010,
    FEATURE_AVX2     = 0x0020,
    FEATURE_FMA3     = 0x0040,
    FEATURE_FMA4     = 0x0080,
    FEATURE_AVX512F  = 0x0100,
    FEATURE_AVX512DQ = 0x0200,
    FEATURE_AVX512PF = 0x0400,
    FEATURE_AVX512ER = 0x0800,
    FEATURE_AVX512CD = 0x1000,
    FEATURE_AVX512BW = 0x2000,
    FEATURE_AVX512VL = 0x4000
};

#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)

char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath );

// ARM vendor identifiers.
enum
{
    VENDOR_ARM = 0,
    VENDOR_UNKNOWN
};
// ARM architecture models.
enum
{
    MODEL_ARMV7 = 0,
    MODEL_ARMV8,
    MODEL_UNKNOWN
};
// ARM feature flags (single-bit values).
enum
{
    FEATURE_NEON = 0x01,
    FEATURE_SVE  = 0x02
};

#endif

#endif

// end bli_cpuid.h
// begin bli_string.h

void bli_string_mkupper( char* s );

// end bli_string.h
// begin bli_setgetijm.h

// Object-based element setter for a matrix; takes the value as a
// (real, imaginary) pair of doubles and returns an err_t.
BLIS_EXPORT_BLIS err_t bli_setijm
     (
       double  ar,
       double  ai,
       dim_t   i,
       dim_t   j,
       obj_t*  b
     );

// Typed variants: one exported prototype per expansion of GENTPROT, named by
// PASTEMAC(ch,opname). INSERT_GENTPROT_BASIC0 is defined outside this chunk.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double         ar, \
       double         ai, \
       dim_t          i, \
       dim_t          j, \
       void* restrict b, inc_t rs, inc_t cs \
     );

INSERT_GENTPROT_BASIC0( setijm )

// -----------------------------------------------------------------------------

// Object-based element getter for a matrix; writes the value through the
// ar / ai output pointers and returns an err_t.
BLIS_EXPORT_BLIS err_t bli_getijm
      (
        dim_t   i,
        dim_t   j,
        obj_t*  b,
        double* ar,
        double* ai
      );

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       dim_t          i, \
       dim_t          j, \
       void* restrict b, inc_t rs, inc_t cs, \
       double*        ar, \
       double*        ai \
     );

INSERT_GENTPROT_BASIC0( getijm )

// end bli_setgetijm.h
// begin bli_setgetijv.h

// Vector counterparts of bli_setijm / bli_getijm (single index, single
// increment).
BLIS_EXPORT_BLIS err_t bli_setijv
     (
       double  ar,
       double  ai,
       dim_t   i,
       obj_t*  x
     );

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double         ar, \
       double         ai, \
       dim_t          i, \
       void* restrict x, inc_t incx \
     );

INSERT_GENTPROT_BASIC0( setijv )

// -----------------------------------------------------------------------------

BLIS_EXPORT_BLIS err_t bli_getijv
      (
        dim_t   i,
        obj_t*  x,
        double* ar,
        double* ai
      );

// NOTE(review): the vector operand is named `b` in this prototype but `x` in
// the setijv prototype above; parameter names in prototypes have no effect on
// callers, so this is cosmetic only.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       dim_t          i, \
       void* restrict b, inc_t incx, \
       double*        ar, \
       double*        ai \
     );

INSERT_GENTPROT_BASIC0( getijv )

// end bli_setgetijv.h
// begin bli_setri.h

// -- setr ---------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_setrm
     (
       obj_t* alpha,
       obj_t* b
     );

BLIS_EXPORT_BLIS void bli_setrv
     (
       obj_t* alpha,
       obj_t* x
     );

// -- seti ---------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_setim
     (
       obj_t* alpha,
       obj_t* b
     );

BLIS_EXPORT_BLIS void bli_setiv
     (
       obj_t* alpha,
       obj_t* x
     );

// end bli_setri.h
// begin bli_castm.h

//
// Prototype object-based interface.
//

BLIS_EXPORT_BLIS void bli_castm
     (
       obj_t* a,
       obj_t* b
     );

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//
// Typed level-0 prototypes. GENTPROT takes (ctype, ch, opname); GENTPROTR
// additionally takes (ctype_r, chr) so the signature can mention a second
// type.
// NOTE(review): the INSERT_GENTPROT*_BASIC0 macros are defined elsewhere;
// presumably they instantiate the template once per datatype, with ctype_r
// being the real counterpart of ctype -- confirm.

// conjchi, chi, psi.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    conj_t conjchi, \
    ctype* chi, \
    ctype* psi \
  );
INSERT_GENTPROT_BASIC0( addsc )
INSERT_GENTPROT_BASIC0( divsc )
INSERT_GENTPROT_BASIC0( mulsc )
INSERT_GENTPROT_BASIC0( subsc )
// conjchi, chi.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    conj_t conjchi, \
    ctype* chi \
  );
INSERT_GENTPROT_BASIC0( invertsc )
// chi in, ctype_r result.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    ctype* chi, \
    ctype_r* absq \
  );
INSERT_GENTPROTR_BASIC0( absqsc )
INSERT_GENTPROTR_BASIC0( normfsc )
// chi, psi (no conj parameter).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    ctype* chi, \
    ctype* psi \
  );
INSERT_GENTPROT_BASIC0( sqrtsc )
// chi in, two double* outputs.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    ctype* chi, \
    double* zeta_r, \
    double* zeta_i \
  );
INSERT_GENTPROT_BASIC0( getsc )
// Two double inputs, chi out.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    double zeta_r, \
    double zeta_i, \
    ctype* chi \
  );
INSERT_GENTPROT_BASIC0( setsc )
// chi followed by two ctype_r pointers.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    ctype* chi, \
    ctype_r* zeta_r, \
    ctype_r* zeta_i \
  );
INSERT_GENTPROTR_BASIC0( unzipsc )
// Two ctype_r pointers followed by chi.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    ctype_r* zeta_r, \
    ctype_r* zeta_i, \
    ctype* chi \
  );
INSERT_GENTPROTR_BASIC0( zipsc )
// -----------------------------------------------------------------------------
// Hand-written getsc/setsc variants whose scalar operand is a dim_t*.
BLIS_EXPORT_BLIS void bli_igetsc ( dim_t* chi, double* zeta_r, double* zeta_i );
BLIS_EXPORT_BLIS void bli_isetsc ( double zeta_r, double zeta_i, dim_t* chi );
// end bli_l0_tapi.h
// begin bli_l0_ft.h
//
// -- Level-0 function types ---------------------------------------------------
//
// Function-pointer typedefs whose parameter lists mirror the typed
// prototypes above. The typedef name is built with PASTECH2(ch,opname,tsuf).
// NOTE(review): INSERT_GENTDEF / INSERT_GENTDEFR are defined elsewhere and
// supply ctype/ch/tsuf -- confirm.
// addsc, divsc, subsc
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
  ( \
    conj_t conjchi, \
    ctype* chi, \
    ctype* psi \
  );
INSERT_GENTDEF( addsc )
INSERT_GENTDEF( divsc )
INSERT_GENTDEF( subsc )
// invertsc
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
  ( \
    conj_t conjchi, \
    ctype* chi \
  );
INSERT_GENTDEF( invertsc )
// mulsc
// (parameter list is identical to the addsc/divsc/subsc template above)
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
  ( \
    conj_t conjchi, \
    ctype* chi, \
    ctype* psi \
  );
INSERT_GENTDEF( mulsc )
// absqsc
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
  ( \
    ctype* chi, \
    ctype_r* absq \
  );
INSERT_GENTDEFR( absqsc )
// normfsc
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
  ( \
    ctype* chi, \
    ctype_r* norm \
  );
INSERT_GENTDEFR( normfsc )
// sqrtsc
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
  ( \
    ctype* chi, \
    ctype* psi \
  );
INSERT_GENTDEF( sqrtsc )
// getsc
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
  ( \
    ctype* chi, \
    double* zeta_r, \
    double* zeta_i \
  );
INSERT_GENTDEF( getsc )
// setsc
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
  ( \
    double zeta_r, \
    double zeta_i, \
    ctype* chi \
  );
INSERT_GENTDEF( setsc )
// unzipsc
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
  ( \
    ctype* chi, \
    ctype_r* zeta_r, \
    ctype_r* zeta_i \
  );
INSERT_GENTDEFR( unzipsc )
// zipsc
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
  ( \
    ctype_r* zeta_r, \
    ctype_r* zeta_i, \
    ctype* chi \
  );
INSERT_GENTDEFR( zipsc )
// end bli_l0_ft.h
// Generate function pointer arrays for tapi functions.
// begin bli_l0_fpa.h
//
// Prototype function pointer query interface.
//
// Each GENPROT( op ) declares a function taking a num_t and returning the
// type named by PASTECH(opname,_vft).
// NOTE(review): the *_vft types are declared elsewhere -- presumably a
// generic function-pointer type per operation; confirm.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );
GENPROT( absqsc )
GENPROT( normfsc )
GENPROT( addsc )
GENPROT( divsc )
GENPROT( mulsc )
GENPROT( subsc )
GENPROT( invertsc )
GENPROT( sqrtsc )
GENPROT( unzipsc )
GENPROT( zipsc )
GENPROT( getsc )
GENPROT( setsc )
// end bli_l0_fpa.h
// copysc
// begin bli_copysc.h
//
// Prototype object-based interfaces.
//
#undef GENFRONT
#define GENFRONT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
  ( \
    obj_t* chi, \
    obj_t* psi \
  );
GENFRONT( copysc )
//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//
// Operands are passed as void*; the two type characters (chx, chy) are part
// of the generated name via PASTEMAC2.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \
  ( \
    conj_t conjchi, \
    void* chi, \
    void* psi \
  );
INSERT_GENTPROT2_BASIC0( copysc )
INSERT_GENTPROT2_MIX_D0( copysc )
INSERT_GENTPROT2_MIX_P0( copysc )
// end bli_copysc.h
// end bli_l0.h
// -- Level-1v operations --
// begin bli_l1v.h
// begin bli_l1v_check.h
//
// Prototype object-based check functions.
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h"
// begin bli_l1v_ft_ker.h
#ifndef BLIS_L1V_FT_KER_H
#define BLIS_L1V_FT_KER_H
//
// -- Level-1v kernel function types -------------------------------------------
//
// Function-pointer typedefs for level-1v kernels. Every signature ends with
// a cntx_t* and its pointer operands are restrict-qualified. The typedef
// name is built with PASTECH3(ch,opname,_ker,tsuf).
// NOTE(review): INSERT_GENTDEF is defined elsewhere and supplies
// ctype/ch/tsuf -- presumably once per datatype; confirm.
// addv, copyv, subv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
  ( \
    conj_t conjx, \
    dim_t n, \
    ctype* restrict x, inc_t incx, \
    ctype* restrict y, inc_t incy, \
    cntx_t* cntx \
  );
INSERT_GENTDEF( addv )
INSERT_GENTDEF( copyv )
INSERT_GENTDEF( subv )
// amaxv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
  ( \
    dim_t n, \
    ctype* restrict x, inc_t incx, \
    dim_t* restrict index, \
    cntx_t* cntx \
  );
INSERT_GENTDEF( amaxv )
// axpbyv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
  ( \
    conj_t conjx, \
    dim_t n, \
    ctype* restrict alpha, \
    ctype* restrict x, inc_t incx, \
    ctype* restrict beta, \
    ctype* restrict y, inc_t incy, \
    cntx_t* cntx \
  );
INSERT_GENTDEF( axpbyv )
// axpyv, scal2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
  ( \
    conj_t conjx, \
    dim_t n, \
    ctype* restrict alpha, \
    ctype* restrict x, inc_t incx, \
    ctype* restrict y, inc_t incy, \
    cntx_t* cntx \
  );
INSERT_GENTDEF( axpyv )
INSERT_GENTDEF( scal2v )
// dotv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
  ( \
    conj_t conjx, \
    conj_t conjy, \
    dim_t n, \
    ctype* restrict x, inc_t incx, \
    ctype* restrict y, inc_t incy, \
    ctype* restrict rho, \
    cntx_t* cntx \
  );
INSERT_GENTDEF( dotv )
// dotxv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
  ( \
    conj_t conjx, \
    conj_t conjy, \
    dim_t n, \
    ctype* restrict alpha, \
    ctype* restrict x, inc_t incx, \
    ctype* restrict y, inc_t incy, \
    ctype* restrict beta, \
    ctype* restrict rho, \
    cntx_t* cntx \
  );
INSERT_GENTDEF( dotxv )
// invertv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
  ( \
    dim_t n, \
    ctype* restrict x, inc_t incx, \
    cntx_t* cntx \
  );
INSERT_GENTDEF( invertv )
// scalv, setv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
  ( \
    conj_t conjalpha, \
    dim_t n, \
    ctype* restrict alpha, \
    ctype* restrict x, inc_t incx, \
    cntx_t* cntx \
  );
INSERT_GENTDEF( scalv )
INSERT_GENTDEF( setv )
// swapv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
  ( \
    dim_t n, \
    ctype* restrict x, inc_t incx, \
    ctype* restrict y, inc_t incy, \
    cntx_t* cntx \
  );
INSERT_GENTDEF( swapv )
// xpbyv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
  ( \
    conj_t conjx, \
    dim_t n, \
    ctype* restrict x, inc_t incx, \
    ctype* restrict beta, \
    ctype* restrict y, inc_t incy, \
    cntx_t* cntx \
  );
INSERT_GENTDEF( xpbyv )
#endif
// end bli_l1v_ft_ker.h
// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h
// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT
// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF
// Define the macro to add expert arguments to function signatures
// and prototypes.
// The leading comma lets this be appended directly after the last regular
// parameter of a prototype.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm
// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS
// end bli_oapi_ex.h
// begin bli_l1v_oapi.h
//
// Prototype object-based interfaces.
//
// Every prototype below is named PASTEMAC(opname,EX_SUF) and appends
// BLIS_OAPI_EX_PARAMS after its last operand. In this pass EX_SUF is
// BLIS_OAPI_EX_SUF and BLIS_OAPI_EX_PARAMS adds cntx/rntm parameters, so
// these are the expert variants.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* x, \
    obj_t* y \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* x, \
    obj_t* index \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( amaxv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* alpha, \
    obj_t* x, \
    obj_t* beta, \
    obj_t* y \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( axpbyv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* alpha, \
    obj_t* x, \
    obj_t* y \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( axpyv )
GENTPROT( scal2v )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* x, \
    obj_t* y, \
    obj_t* rho \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( dotv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* alpha, \
    obj_t* x, \
    obj_t* y, \
    obj_t* beta, \
    obj_t* rho \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( dotxv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* x \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( invertv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* alpha, \
    obj_t* x \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( scalv )
GENTPROT( setv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* x, \
    obj_t* y \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( swapv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* x, \
    obj_t* beta, \
    obj_t* y \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( xpbyv )
// end bli_l1v_oapi.h
// begin bli_xapi_undef.h
// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.
// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT
// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF
// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS
// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h
// begin bli_oapi_ba.h
// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC
// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF
// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS
// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_oapi_ba.h
// begin bli_l1v_oapi.h
//
// Prototype object-based interfaces.
//
// Second pass over the same prototype templates. EX_SUF and
// BLIS_OAPI_EX_PARAMS are both defined empty at this point, so each
// prototype takes only its regular operands (basic variants).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* x, \
    obj_t* y \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* x, \
    obj_t* index \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( amaxv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* alpha, \
    obj_t* x, \
    obj_t* beta, \
    obj_t* y \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( axpbyv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* alpha, \
    obj_t* x, \
    obj_t* y \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( axpyv )
GENTPROT( scal2v )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* x, \
    obj_t* y, \
    obj_t* rho \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( dotv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* alpha, \
    obj_t* x, \
    obj_t* y, \
    obj_t* beta, \
    obj_t* rho \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( dotxv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* x \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( invertv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* alpha, \
    obj_t* x \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( scalv )
GENTPROT( setv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* x, \
    obj_t* y \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( swapv )
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
  ( \
    obj_t* x, \
    obj_t* beta, \
    obj_t* y \
    BLIS_OAPI_EX_PARAMS \
  );
GENTPROT( xpbyv )
// end bli_l1v_oapi.h
// begin bli_xapi_undef.h
// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.
// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT
// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF
// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS
// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h
// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h
// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT
// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF
// Define the macro to add expert arguments to function signatures
// and prototypes.
// The leading comma lets this be appended directly after the last regular
// parameter of a prototype.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm
// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h
// begin bli_l1v_tapi.h
//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The replacement text starts with a comma because it is appended directly
// after the last regular parameter, which is written without a trailing comma.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h

// begin bli_l1d_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// GENTPROT( ctype, ch, opname ) is re-defined once per distinct parameter
// list and handed to INSERT_GENTPROT_BASIC0 for each operation sharing that
// list. Function names are PASTEMAC2(ch,opname,EX_SUF) and every parameter
// list ends with BLIS_TAPI_EX_PARAMS, which at this point hold the expert
// definitions made just above. INSERT_GENTPROT_BASIC0 and PASTEMAC2 are
// defined outside this section.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid uses the GENTPROTR form, whose extra ctype_r / chr arguments let
// alpha be declared as ctype_r* while x stays ctype*.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//

// Each GENTDEF below defines a pointer-to-function typedef named
// PASTECH3(ch,opname,EX_SUF,tsuf). The parameter lists repeat, group for
// group, the prototypes declared in bli_l1d_tapi.h above.

// addd, copyd, subd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// Second inclusion of bli_l1d_tapi.h / bli_l1d_ft.h. bli_tapi_ba.h (just
// above) has defined both EX_SUF and BLIS_TAPI_EX_PARAMS as empty, so the
// names below get no suffix and the parameter lists get no extra parameters.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid uses the GENTPROTR form, whose extra ctype_r / chr arguments let
// alpha be declared as ctype_r* while x stays ctype*.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//

// Each GENTDEF below defines a pointer-to-function typedef named
// PASTECH3(ch,opname,EX_SUF,tsuf). The parameter lists repeat, group for
// group, the prototypes declared in bli_l1d_tapi.h above.

// addd, copyd, subd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1d_fpa.h

//
// Prototype function pointer query interface.
//

// GENPROT( opname ) declares one function per level-1d operation. Each
// function takes a single num_t argument; both its return type and its name
// are pasted together from opname and BLIS_TAPI_EX_SUF, with the _vft suffix
// for the type and the _qfp suffix for the function.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addd )
GENPROT( copyd )
GENPROT( subd )
GENPROT( axpyd )
GENPROT( scal2d )
GENPROT( invertd )
GENPROT( scald )
GENPROT( setd )
GENPROT( setid )
GENPROT( shiftd )
GENPROT( xpbyd )

// end bli_l1d_fpa.h

// end bli_l1d.h

// -- Level-1f operations --

// begin bli_l1f.h

// begin bli_l1f_check.h

//
// Prototype object-based check functions.
//

// One GENTPROT definition per level-1f operation; each instantiation declares
// a void function named PASTEMAC(opname,_check) whose parameters are all
// obj_t pointers. PASTEMAC is defined outside this section.

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alphax, \
       obj_t* alphay, \
       obj_t* x, \
       obj_t* y, \
       obj_t* z \
    );

GENTPROT( axpy2v )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* y \
    );

GENTPROT( axpyf )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* xt, \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho, \
       obj_t* z \
    );

GENTPROT( dotaxpyv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* at, \
       obj_t* a, \
       obj_t* w, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y, \
       obj_t* z \
    );

GENTPROT( dotxaxpyf )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
    );

GENTPROT( dotxf )

// end bli_l1f_check.h

// Define kernel function types.
// begin bli_l1f_ft_ker.h

// Unlike the oapi/tapi sections around it, this section carries an include
// guard, so its typedefs are emitted only once.
#ifndef BLIS_L1F_FT_KER_H
#define BLIS_L1F_FT_KER_H

//
// -- Level-1f kernel function types -------------------------------------------
//

// Each GENTDEF below defines a pointer-to-function typedef named
// PASTECH3(ch,opname,_ker,tsuf). Every pointer operand is restrict-qualified
// and the final parameter is always a cntx_t*. These lists do not use
// BLIS_TAPI_EX_PARAMS, so there is no rntm_t* parameter.

// axpy2v

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha1, \
       ctype* restrict alpha2, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict w, inc_t incw, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxaxpyf )

#endif

// end bli_l1f_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The replacement text starts with a comma because it is appended directly
// after the last regular parameter, which is written without a trailing comma.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). 
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The replacement text starts with a comma because it is appended directly
// after the last regular parameter, which is written without a trailing comma.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h

// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// One GENTPROT definition per level-1f operation, each handed to
// INSERT_GENTPROT_BASIC0. Function names are PASTEMAC2(ch,opname,EX_SUF) and
// every parameter list ends with BLIS_TAPI_EX_PARAMS, which at this point
// hold the expert definitions made just above.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//

// Each GENTDEF below defines a pointer-to-function typedef named
// PASTECH3(ch,opname,EX_SUF,tsuf) with the same parameter types, in the same
// order, as the matching prototype in bli_l1f_tapi.h above.

// axpy2v

// NOTE(review): the two scalars are named alpha1/alpha2 here but
// alphax/alphay in the axpy2v prototype above. Parameter names in a
// function-pointer typedef do not affect the type, so this is cosmetic.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

// NOTE(review): the dim_t parameter is named m here but n in the dotaxpyv
// prototype above. Cosmetic only, for the same reason as the axpy2v note.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// NOTE: BLIS_TAPI_EX_PARAMS was just defined as empty, so the prototypes
// generated in this pass have no trailing cntx/rntm parameters.
// INSERT_GENTPROT_BASIC0 is defined elsewhere; presumably it expands GENTPROT
// once per supported datatype -- confirm against its definition.
//

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// Each GENTDEF below declares a function-pointer type whose parameter list
// mirrors the prototype of the same operation above. INSERT_GENTDEF is defined
// elsewhere.
//

// axpy2v
// NOTE(review): the two scalars are named alpha1/alpha2 here but
// alphax/alphay in the axpy2v prototype above; the types and order match.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
// NOTE(review): the length parameter is named m here but n in the dotaxpyv
// prototype above; the type and position match.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1f_fpa.h

//
// Prototype function pointer query interface.
//
// EX_SUF is undefined at this point, so the macro below names the expert
// suffix BLIS_TAPI_EX_SUF explicitly. Each GENPROT( op ) declares one query
// function taking a num_t datatype selector.
//

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( axpy2v )
GENPROT( axpyf )
GENPROT( dotaxpyv )
GENPROT( dotxaxpyf )
GENPROT( dotxf )

// end bli_l1f_fpa.h

// end bli_l1f.h

// -- Level-1m operations --

// begin bli_l1m.h

// begin bli_l1m_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Expert pass: EX_SUF is BLIS_TAPI_EX_SUF and every prototype gains the
// trailing cntx/rntm parameters from BLIS_TAPI_EX_PARAMS.
//

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: x has its own type (ctype_x); beta and y share ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym
// NOTE(review): xpbym_md is instantiated from this single-ctype typedef even
// though its prototype above uses separate ctype_x/ctype_y types.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )

// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Basic pass: EX_SUF and BLIS_TAPI_EX_PARAMS are both empty, so the
// prototypes below have no suffix and no cntx/rntm parameters.
//

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: x has its own type (ctype_x); beta and y share ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym
// NOTE(review): xpbym_md is instantiated from this single-ctype typedef even
// though its prototype above uses separate ctype_x/ctype_y types.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )

// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1m_fpa.h

//
// Prototype function pointer query interface.
//
// EX_SUF is undefined at this point, so the macros below name the expert
// suffix BLIS_TAPI_EX_SUF explicitly.
//

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
GENPROT( axpym )
GENPROT( scal2m )
GENPROT( scalm )
GENPROT( setm )
GENPROT( xpbym )

// The mixed-datatype query (_qfp2) takes one num_t per operand type.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );

GENPROT( xpbym_md )

// end bli_l1m_fpa.h

// Prototype level-1m implementations.
// begin bli_l1m_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// The _unb_var1 prototypes below repeat the level-1m typed parameter lists
// but always take explicit cntx_t*/rntm_t* arguments (no EX_PARAMS macro) and
// are not marked BLIS_EXPORT_BLIS.
//

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: x has its own type (ctype_x); beta and y share ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
void PASTEMAC3(chx,chy,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_unb_var1.h

// Pack-related
// begin bli_packm.h

// begin bli_packm_alloc.h

// Two allocation entry points; the _ex form additionally takes the
// packbuf_t buffer type explicitly.
BLIS_EXPORT_BLIS void* bli_packm_alloc
     (
       siz_t size_needed,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void* bli_packm_alloc_ex
     (
       siz_t size_needed,
       packbuf_t pack_buf_type,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_packm_alloc.h
// begin bli_packm_cntl.h

// Parameter record for a packm control-tree node. The accessors below read it
// through cntl->params.
struct packm_params_s
{
	uint64_t size; // size field must be present and come first.
	bszid_t bmid_m;
	bszid_t bmid_n;
	bool does_invert_diag;
	bool rev_iter_if_upper;
	bool rev_iter_if_lower;
	pack_t pack_schema;
	packbuf_t pack_buf_type;
};
typedef struct packm_params_s packm_params_t;

// Field accessors: each casts cntl->params to packm_params_t* and returns the
// field named in the function. No NULL check is performed on cntl or params.

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m;
}

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n;
}

BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower;
}

BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema;
}

BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type;
}

// -----------------------------------------------------------------------------

// Node constructor; its parameters after var_func correspond one-to-one to
// the packm_params_s fields above (other than size).
cntl_t* bli_packm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       bszid_t bmid_m,
       bszid_t bmid_n,
       bool does_invert_diag,
       bool rev_iter_if_upper,
       bool rev_iter_if_lower,
       pack_t pack_schema,
       packbuf_t pack_buf_type,
       cntl_t* sub_node
     );

// end bli_packm_cntl.h
// begin bli_packm_check.h

void bli_packm_init_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

void bli_packm_int_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

// end bli_packm_check.h
// begin bli_packm_init.h

// NOTE(review): the meaning of the bool result is not visible in this
// header -- confirm against the definition before relying on it.
BLIS_EXPORT_BLIS bool bli_packm_init
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_packm_init.h
// begin bli_packm_int.h

void bli_packm_int
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_int.h
// begin bli_packm_scalar.h

BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p );
// end bli_packm_scalar.h

// begin bli_packm_part.h

// -- Matrix partitioning ------------------------------------------------------

void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
                                  dim_t i,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
                                  dim_t j,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_tl2br( subpart_t requested_part,
                                    dim_t ij,
                                    dim_t b,
                                    obj_t* obj,
                                    obj_t* sub_obj );

dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p );

// end bli_packm_part.h

// begin bli_packm_struc_cxk.h

// NOTE(review): these prototypes end at cntx, whereas the _1er prototypes
// further below take an additional trailing void* params -- confirm that the
// difference is intended.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_struc_cxk )
INSERT_GENTPROT_BASIC0( packm_herm_cxk )
INSERT_GENTPROT_BASIC0( packm_tri_cxk )

// end bli_packm_struc_cxk.h
// begin bli_packm_struc_cxk_1er.h

// The ctype_r and chr macro parameters are accepted but not used in the
// generated prototype text.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er )

// end bli_packm_struc_cxk_1er.h

// begin bli_packm_cxk.h

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_cxk )

// end bli_packm_cxk.h
// begin bli_packm_cxk_1er.h

// Same parameter list as packm_cxk above; ctype_r and chr are unused in the
// generated prototype text.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROTCO_BASIC0( packm_cxk_1er )

// end bli_packm_cxk_1er.h

// Mixed datatype support.
#ifdef BLIS_ENABLE_GEMM_MD // begin bli_packm_struc_cxk_md.h #undef GENTPROT2 #define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \ \ void PASTEMAC2(chc,chp,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype_p* restrict kappa, \ ctype_c* restrict c, inc_t incc, inc_t ldc, \ ctype_p* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md ) INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md ) #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \ \ void PASTEMAC2(cha,chp,opname) \ ( \ conj_t conja, \ dim_t m, \ dim_t n, \ ctype_p* restrict kappa, \ ctype_a* restrict a, inc_t inca, inc_t lda, \ ctype_p* restrict p, inc_t ldp \ ); INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md ) INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md ) INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md ) INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md ) // end bli_packm_struc_cxk_md.h #endif // begin bli_packm_blk_var1.h // // packm params types. // typedef struct { // Type of C Type of P packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES]; } packm_blk_var1_params_t; // // Prototype object-based interfaces. // BLIS_EXPORT_BLIS void bli_packm_blk_var1 ( obj_t* c, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* t ); // end bli_packm_blk_var1.h // end bli_packm.h // begin bli_unpackm.h // begin bli_unpackm_cntl.h struct unpackm_params_s { uint64_t size; // size field must be present and come first. 
unpackm_var_oft var_func; }; typedef struct unpackm_params_s unpackm_params_t; #define bli_cntl_unpackm_params_var_func( cntl ) \ \ ( ( (unpackm_params_t*)(cntl)->params )->var_func ) // ----------------------------------------------------------------------------- cntl_t* bli_unpackm_cntl_create_node ( rntm_t* rntm, void_fp var_func, void_fp unpackm_var_func, cntl_t* sub_node ); // end bli_unpackm_cntl.h // begin bli_unpackm_check.h void bli_unpackm_int_check ( obj_t* p, obj_t* a, cntx_t* cntx ); // end bli_unpackm_check.h // begin bli_unpackm_int.h void bli_unpackm_int ( obj_t* p, obj_t* a, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ); // end bli_unpackm_int.h // begin bli_unpackm_blk_var1.h void bli_unpackm_blk_var1 ( obj_t* p, obj_t* c, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ); #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ trans_t transc, \ dim_t m, \ dim_t n, \ dim_t m_panel, \ dim_t n_panel, \ void* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( unpackm_blk_var1 ) // end bli_unpackm_blk_var1.h // begin bli_unpackm_cxk.h #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjp, \ dim_t panel_dim, \ dim_t panel_len, \ ctype* kappa, \ ctype* p, inc_t ldp, \ ctype* a, inc_t inca, inc_t lda, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( unpackm_cxk ) // end bli_unpackm_cxk.h // end bli_unpackm.h // end bli_l1m.h // -- Level-2 operations -- // begin bli_l2.h // begin bli_l2_check.h // // Prototype object-based check functions. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( gemv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( ger ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( hemv ) INSERT_GENTPROT_BASIC0( symv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( her ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( syr ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( her2 ) INSERT_GENTPROT_BASIC0( syr2 ) #undef 
GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( trmv ) INSERT_GENTPROT_BASIC0( trsv ) // end bli_l2_tapi.h // begin bli_l2_ft.h // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( ger ) // hemv, symv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) // her #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( her ) // syr #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, 
inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( syr ) // her2, syr2 #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) // trmv, trsv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) // end bli_l2_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). 
#undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( gemv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( ger ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( hemv ) INSERT_GENTPROT_BASIC0( symv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( her ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( syr ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( her2 ) INSERT_GENTPROT_BASIC0( syr2 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( trmv ) INSERT_GENTPROT_BASIC0( trsv ) // end bli_l2_tapi.h // begin bli_l2_ft.h // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( ger ) // hemv, symv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ 
BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) // her #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( her ) // syr #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( syr ) // her2, syr2 #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) // trmv, trsv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) // end bli_l2_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. 
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
// void bli_l3_cntl_create_if ( opid_t family, pack_t schema_a, pack_t schema_b, obj_t* a, obj_t* b, obj_t* c, rntm_t* rntm, cntl_t* cntl_orig, cntl_t** cntl_use ); void bli_l3_cntl_free ( rntm_t* rntm, cntl_t* cntl_use, thrinfo_t* thread ); // end bli_l3_cntl.h // begin bli_l3_check.h // // Prototype object-based check functions. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx \ ); GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- void bli_gemm_basic_check ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_gemmt_basic_check ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_hemm_basic_check ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_herk_basic_check ( obj_t* alpha, obj_t* a, obj_t* ah, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_her2k_basic_check ( obj_t* alpha, obj_t* a, obj_t* bh, obj_t* b, obj_t* ah, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_l3_basic_check ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); // end bli_l3_check.h // begin bli_l3_int.h void bli_l3_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, 
obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_l3_int.h // begin bli_l3_packab.h void bli_l3_packa ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); void bli_l3_packb ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_l3_packab.h // Define function types. //#include "bli_l3_ft_ex.h" // begin bli_l3_ft_ukr.h #ifndef BLIS_L3_FT_UKR_H #define BLIS_L3_FT_UKR_H // // -- Level-3 micro-kernel function types -------------------------------------- // // gemm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemm ) // gemmtrsm_[lu] #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ ctype* restrict a11, \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmtrsm ) // trsm_[lu] #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( trsm ) #endif // end bli_l3_ft_ukr.h // begin bli_l3_oft.h #ifndef BLIS_L3_OFT_H #define BLIS_L3_OFT_H // // -- Level-3 object function types -------------------------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ 
cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemm ) GENTDEF( gemmt ) GENTDEF( her2k ) GENTDEF( syr2k ) // hemm, symm, trmm3 #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( hemm ) GENTDEF( symm ) GENTDEF( trmm3 ) // herk, syrk #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( herk ) GENTDEF( syrk ) // trmm, trsm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( trmm ) GENTDEF( trsm ) #endif // end bli_l3_oft.h // begin bli_l3_oft_var.h #ifndef BLIS_L3_OFT_VAR_H #define BLIS_L3_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( l3 ) #endif // end bli_l3_oft_var.h // begin bli_l3_blocksize.h dim_t bli_l3_determine_kc ( dir_t direct, dim_t i, dim_t dim, obj_t* a, obj_t* b, bszid_t bszid, cntx_t* cntx, cntl_t* cntl ); #undef GENPROT #define GENPROT( opname ) \ \ dim_t PASTEMAC0(opname) \ ( \ dir_t direct, \ dim_t i, \ dim_t dim, \ obj_t* a, \ obj_t* b, \ bszid_t bszid, \ cntx_t* cntx \ ); GENPROT( gemm_determine_kc ) GENPROT( gemmt_determine_kc ) GENPROT( trmm_determine_kc ) GENPROT( trsm_determine_kc ) #undef GENPROT #define GENPROT( opname ) \ \ dim_t PASTEMAC0(opname) \ ( \ dim_t i, \ dim_t dim, \ obj_t* a, \ obj_t* b, \ bszid_t bszid, \ cntx_t* cntx \ ); GENPROT( gemm_determine_kc_f ) GENPROT( gemm_determine_kc_b ) GENPROT( gemmt_determine_kc_f ) GENPROT( gemmt_determine_kc_b ) GENPROT( 
trmm_determine_kc_f ) GENPROT( trmm_determine_kc_b ) GENPROT( trsm_determine_kc_f ) GENPROT( trsm_determine_kc_b ) // end bli_l3_blocksize.h // begin bli_l3_direct.h dir_t bli_l3_direct ( obj_t* a, obj_t* b, obj_t* c, cntl_t* cntl ); // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ dir_t PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c \ ); GENPROT( gemm_direct ) GENPROT( gemmt_direct ) GENPROT( trmm_direct ) GENPROT( trsm_direct ) // end bli_l3_direct.h // begin bli_l3_prune.h #undef GENPROT #define GENPROT( dim ) \ \ void PASTEMAC(l3_prune_unref_mparts_,dim) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntl_t* cntl \ ); GENPROT( m ) GENPROT( n ) GENPROT( k ) // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname, dim ) \ \ void PASTEMAC2(opname,_prune_unref_mparts_,dim) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c \ ); GENPROT( gemm, m ) GENPROT( gemm, n ) GENPROT( gemm, k ) GENPROT( gemmt, m ) GENPROT( gemmt, n ) GENPROT( gemmt, k ) GENPROT( trmm, m ) GENPROT( trmm, n ) GENPROT( trmm, k ) GENPROT( trsm, m ) GENPROT( trsm, n ) GENPROT( trsm, k ) // end bli_l3_prune.h // begin bli_l3_schema.h void bli_l3_set_schemas ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx ); // end bli_l3_schema.h // Prototype object APIs (basic and expert). // begin bli_l3_oapi.h // // Prototype object-based interfaces (basic). 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h // // Prototype BLAS-like interfaces with typed operands (basic). // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( gemm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ side_t side, \ uplo_t uploa, \ conj_t conja, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( hemm ) INSERT_GENTPROT_BASIC0( symm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype_r* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROTR_BASIC0( herk ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROTR_BASIC0( her2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( syrk ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t 
k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( gemmt ) INSERT_GENTPROT_BASIC0( syr2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( trmm3 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT_BASIC0( trmm ) INSERT_GENTPROT_BASIC0( trsm ) // end bli_l3_tapi.h // begin bli_l3_tapi_ex.h // // Prototype BLAS-like interfaces with typed operands (expert). 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ conj_t conja, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( hemm ) INSERT_GENTPROT_BASIC0( symm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype_r* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( herk ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( her2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( syrk ) #undef 
GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemmt ) INSERT_GENTPROT_BASIC0( syr2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm3 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm ) INSERT_GENTPROT_BASIC0( trsm ) // end bli_l3_tapi_ex.h // Define function types for small/unpacked handlers/kernels. 
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
#undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_gemm_md_c2r_ref.h // Define a local struct type that makes returning two values easier. typedef struct mddm_s { dom_t comp; dom_t exec; } mddm_t; void bli_gemm_md ( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_local, cntx_t** cntx ); mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); // ----------------------------------------------------------------------------- void bli_gemm_md_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); void bli_gemm_md_zgemm ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary 
if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crr is already unconditionally associated with an // execution domain of BLIS_REAL.) if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_REAL ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since ccr is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) if ( bli_obj_is_complex( c ) && bli_obj_is_complex( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crc is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) 
if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_complex( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemm_md_ker_var2_recast ( num_t* dt_comp, num_t dt_a, num_t dt_b, num_t* dt_c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, obj_t* c, inc_t* rs_c, inc_t* cs_c ) { if ( bli_is_real( *dt_c ) && bli_is_complex( dt_a ) && bli_is_complex( dt_b ) ) { // The rcc case is executed with a real macrokernel, so we need to // double the k dimension (because both A and B are packed to the 1r // schema), and also the panel strides of A and B since they were // packed as complex matrices and we now need to convert them to // units of real elements. *k *= 2; *ps_a *= 2; *ps_b *= 2; } else if ( bli_is_complex( *dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_row_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *n *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; } else #endif { // Generally speaking, the crc case is executed with a complex // macrokernel, so we need to halve the panel stride of A (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. 
*ps_a /= 2; } } else if ( bli_is_complex( *dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_col_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *m *= 2; *pd_a *= 2; *ps_a *= 2; *cs_c *= 2; } else #endif { // Generally speaking, the ccr case is executed with a complex // macrokernel, so we need to halve the panel stride of B (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. *ps_b /= 2; } } #if 0 else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. //printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k ); } else if ( bli_is_complex( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { // No action needed. 
} #endif } // end bli_gemm_md.h #endif // end bli_gemm.h // begin bli_hemm.h // begin bli_hemm_front.h void bli_hemm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_hemm_front.h // end bli_hemm.h // begin bli_symm.h // begin bli_symm_front.h void bli_symm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_symm_front.h // end bli_symm.h // begin bli_trmm.h // begin bli_trmm_front.h void bli_trmm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm_front.h // begin bli_trmm_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); //GENPROT( trmm_blk_var1 ) //GENPROT( trmm_blk_var2 ) //GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) GENPROT( trmm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
//

// Compared with the trmm/trsm typed kernel prototypes, this one names the
// diagonal offset diagoffc and adds is_a / is_b stride parameters.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffc, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
               dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
               dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

// end bli_gemmt_var.h

// end bli_gemmt.h
// end bli_l3.h

// -- Utility operations --

// begin bli_util.h

// begin bli_util_check.h

//
// Prototype object-based check functions.
//

// Each GENPROT redefinition below emits a prototype named by
// PASTEMAC(opname,_check), i.e. the operation name with a _check suffix
// pasted on, for the operations instantiated right after it.

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  asum \
     );

GENPROT( asumv )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x \
     );

GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm \
     );

GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )

// Identical template to the one above, re-declared for the matrix norms.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm \
     );

GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x \
     );

GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  scale, \
       obj_t*  sumsq \
     );

GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// NOTE(review): this one template is spelled GENTPROT while every other
// template in this header section is GENPROT. It is self-consistent (defined
// and used under the same name), but it does redefine GENTPROT.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  chi, \
       obj_t*  psi, \
       bool*   is_eq \
     );

GENTPROT( eqsc )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq \
     );

GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// -----------------------------------------------------------------------------

// Explicitly named check helpers (not generated through a macro template).

void bli_utilv_xi_check
     (
       obj_t*  x,
       obj_t*  index
     );

void bli_utilv_xa_check
     (
       obj_t*  x,
       obj_t*  asum
     );

void bli_utilm_mkhst_check
     (
       obj_t*  a
     );

void bli_utilv_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_fprint_check
     (
       FILE*  file,
       char*  s1,
       obj_t* x,
       char*  format,
       char*  s2
     );

void bli_utilm_rand_check
     (
       obj_t*  x
     );

void bli_utilv_sumsqv_check
     (
       obj_t*  x,
       obj_t*  scale,
       obj_t*  sumsq
     );

// end bli_util_check.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The expansion starts with a comma so that it can be appended directly
// after the last regular parameter.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_util_oapi.h

//
// Prototype object-based interfaces.
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes in this guarded region use PASTEMAC0(opname) and take no
// BLIS_OAPI_EX_PARAMS, so they exist in one flavor only and are emitted only
// when BLIS_OAPI_BASIC is defined.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// Same as the fprint template above minus the leading FILE* parameter.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
// EX_SUF is defined as empty here.
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes in this guarded region use PASTEMAC0(opname) and take no
// BLIS_OAPI_EX_PARAMS, so they exist in one flavor only and are emitted only
// when BLIS_OAPI_BASIC is defined.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// Same as the fprint template above minus the leading FILE* parameter.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The expansion starts with a comma so that it can be appended directly
// after the last regular parameter.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_util_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Templates whose name ends in R take a second type parameter, ctype_r,
// which is the type of the result pointers (asum, norm, scale, sumsq).
// Each prototype is named by PASTEMAC2(ch,opname,EX_SUF).

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( asumv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( mkherm )
INSERT_GENTPROT_BASIC0( mksymm )
INSERT_GENTPROT_BASIC0( mktrim )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1v )
INSERT_GENTPROTR_BASIC0( normfv )
INSERT_GENTPROTR_BASIC0( normiv )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1m )
INSERT_GENTPROTR_BASIC0( normfm )
INSERT_GENTPROTR_BASIC0( normim )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randv )
INSERT_GENTPROT_BASIC0( randnv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randm )
INSERT_GENTPROT_BASIC0( randnm )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.

// These prototypes use PASTEMAC(ch,opname) without EX_SUF and take no
// BLIS_TAPI_EX_PARAMS; they are emitted only when BLIS_TAPI_BASIC is defined.
#ifdef BLIS_TAPI_BASIC

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )

// The print prototypes take x as void* rather than ctype*, and are
// instantiated with INSERT_GENTPROT_BASIC0_I instead of
// INSERT_GENTPROT_BASIC0.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  n, \
       void*  x, inc_t incx, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       void*  x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//

// Each template below declares a function-pointer typedef named by
// PASTECH3(ch,opname,EX_SUF,tsuf); the parameter lists mirror the typed
// prototypes above.

// asumv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// (The two fprint typedefs take no BLIS_TAPI_EX_PARAMS.)

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.

// These typedefs are named with PASTECH2(ch,opname,tsuf), i.e. without
// EX_SUF, and are emitted only when BLIS_TAPI_BASIC is defined.
#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 
INSERT_GENTPROTR_BASIC0( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. #ifdef BLIS_TAPI_BASIC #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ char* s1, \ dim_t n, \ void* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ char* s1, \ dim_t m, \ dim_t n, \ void* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printm ) #endif // #ifdef BLIS_TAPI_BASIC // end bli_util_tapi.h // begin bli_util_ft.h // // -- Utility function types --------------------------------------------------- // // asumv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( asumv ) // mkherm, mksymm, mktrim #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ 
BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( mkherm ) INSERT_GENTDEF( mksymm ) INSERT_GENTDEF( mktrim ) // norm1v, normfv, normiv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( norm1v ) INSERT_GENTDEFR( normfv ) INSERT_GENTDEFR( normiv ) // norm1m, normfm, normim #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( norm1m ) INSERT_GENTDEFR( normfm ) INSERT_GENTDEFR( normim ) // fprintv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ FILE* file, \ char* s1, \ dim_t n, \ ctype* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTDEF( fprintv ) // fprintm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ FILE* file, \ char* s1, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTDEF( fprintm ) // randv, randnv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( randv ) INSERT_GENTDEF( randnv ) // randm, randnm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( randm ) INSERT_GENTDEF( randnm ) // sumsqv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, 
\ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( sumsqv ) // ----------------------------------------------------------------------------- // Operations with only basic interfaces. #ifdef BLIS_TAPI_BASIC // eqsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi, \ bool* is_eq \ ); INSERT_GENTDEF( eqsc ) // eqv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ bool* is_eq \ ); INSERT_GENTDEF( eqv ) // eqm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ bool* is_eq \ ); INSERT_GENTDEF( eqm ) #endif // #ifdef BLIS_OAPI_BASIC // end bli_util_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_util_fpa.h // // Prototype function pointer query interface. 
// #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( asumv ) GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) GENPROT( randv ) GENPROT( randnv ) GENPROT( randm ) GENPROT( randnm ) GENPROT( sumsqv ) // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( eqsc ) GENPROT( eqv ) GENPROT( eqm ) GENPROT( fprintv ) GENPROT( fprintm ) //GENPROT( printv ) //GENPROT( printm ) // end bli_util_fpa.h // Prototype level-1m implementations. // begin bli_util_unb_var1.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( asumv_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( mkherm_unb_var1 ) INSERT_GENTPROT_BASIC0( mksymm_unb_var1 ) INSERT_GENTPROT_BASIC0( mktrim_unb_var1 ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 ) INSERT_GENTPROTR_BASIC0( normfv_unb_var1 ) INSERT_GENTPROTR_BASIC0( normiv_unb_var1 ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* 
rntm \ ); INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 ) INSERT_GENTPROTR_BASIC0( normfm_unb_var1 ) INSERT_GENTPROTR_BASIC0( normim_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( randv_unb_var1 ) INSERT_GENTPROT_BASIC0( randnv_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( randm_unb_var1 ) INSERT_GENTPROT_BASIC0( randnm_unb_var1 ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 ) // ----------------------------------------------------------------------------- #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ bool PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ ); INSERT_GENTPROT_BASIC0( eqv_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ bool PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ ); INSERT_GENTPROT_BASIC0( eqm_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ FILE* file, \ char* s1, \ dim_t n, \ ctype* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( fprintv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ FILE* file, \ char* s1, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( 
fprintm ) // end bli_util_unb_var1.h // end bli_util.h // -- addon definitions -- // NOTE: These definitions should not be included much earlier since an addon // may wish to utilize other types and definitions provided by BLIS. // begin bli_addon.h #ifndef BLIS_ADDON_H #define BLIS_ADDON_H #if 0 #define BLIS_ENABLE_ADDONS #else #define BLIS_DISABLE_ADDONS #endif // Enabled addons #endif // end bli_addon.h // -- sandbox implementation -- // begin bli_sbox.h #ifndef BLIS_SBOX_H #define BLIS_SBOX_H // Each sandbox must have a bli_sandbox.h file present somewhere inside. // If a sandbox was enabled at configure-time, we need to #include its // header file here so that it will get pulled into blis.h when it is // flattened into a monolithic header. #ifdef BLIS_ENABLE_SANDBOX #include "bli_sandbox.h" // skipped #endif #endif // end bli_sbox.h // -- BLAS compatibility layer -- // begin bli_blas.h // If the CBLAS compatibility layer was enabled while the BLAS layer // was not enabled, we must enable it here. #ifdef BLIS_ENABLE_CBLAS #ifndef BLIS_ENABLE_BLAS #define BLIS_ENABLE_BLAS #endif #endif // BLIS_ENABLE_CBLAS // By default, if the BLAS compatibility layer is enabled, we define // (include) all of the BLAS prototypes. However, if the user is // #including "blis.h" and also #including another header that also // declares the BLAS functions, then we provide an opportunity to // #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below). #ifdef BLIS_ENABLE_BLAS #define BLIS_ENABLE_BLAS_DEFS #else #undef BLIS_ENABLE_BLAS_DEFS #endif // Skip prototyping all of the BLAS if the BLAS test drivers are being // compiled. #ifdef BLIS_VIA_BLASTEST #undef BLIS_ENABLE_BLAS_DEFS #endif // Skip prototyping all of the BLAS if the environment has defined the // macro BLIS_DISABLE_BLAS_DEFS. #ifdef BLIS_DISABLE_BLAS_DEFS #undef BLIS_ENABLE_BLAS_DEFS #endif // Begin including all BLAS prototypes. 
#ifdef BLIS_ENABLE_BLAS_DEFS // -- System headers needed by BLAS compatibility layer -- #include // skipped // -- Constants -- #define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1) // -- Utility macros -- // begin bla_r_sign.h #ifdef BLIS_ENABLE_BLAS double bla_r_sign(const bla_real *a, const bla_real *b); #endif // end bla_r_sign.h // begin bla_d_sign.h #ifdef BLIS_ENABLE_BLAS double bla_d_sign(const bla_double *a, const bla_double *b); #endif // end bla_d_sign.h // begin bla_r_cnjg.h #ifdef BLIS_ENABLE_BLAS void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src); #endif // end bla_r_cnjg.h // begin bla_d_cnjg.h #ifdef BLIS_ENABLE_BLAS void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src); #endif // end bla_d_cnjg.h // begin bla_r_imag.h #ifdef BLIS_ENABLE_BLAS bla_real bla_r_imag(const bla_scomplex *z); #endif // end bla_r_imag.h // begin bla_d_imag.h #ifdef BLIS_ENABLE_BLAS double bla_d_imag(const bla_dcomplex *z); #endif // end bla_d_imag.h // begin bla_c_div.h #ifdef BLIS_ENABLE_BLAS void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp); #endif // end bla_c_div.h // begin bla_z_div.h #ifdef BLIS_ENABLE_BLAS void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp); #endif // end bla_z_div.h // begin bla_f__cabs.h #ifdef BLIS_ENABLE_BLAS double bla_f__cabs(double real, double imag); #endif // end bla_f__cabs.h // begin bla_r_abs.h #ifdef BLIS_ENABLE_BLAS double bla_r_abs(const bla_real *x); #endif // end bla_r_abs.h // begin bla_d_abs.h #ifdef BLIS_ENABLE_BLAS double bla_d_abs(const bla_double *x); #endif // end bla_d_abs.h // begin bla_c_abs.h #ifdef BLIS_ENABLE_BLAS double bla_c_abs(const bla_scomplex *z); #endif // end bla_c_abs.h // begin bla_z_abs.h #ifdef BLIS_ENABLE_BLAS double bla_z_abs(const bla_dcomplex *z); #endif // end bla_z_abs.h // begin bla_lsame.h #ifdef BLIS_ENABLE_BLAS #ifdef LAPACK_ILP64 long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len); #else BLIS_EXPORT_BLAS 
int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len); #endif #endif // end bla_lsame.h // begin bla_xerbla.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len); #endif // end bla_xerbla.h // begin bla_xerbla_array.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info); #endif // end bla_xerbla_array.h // -- Level-0 BLAS prototypes -- // begin bla_cabs1.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS bla_real PASTEF77(s,cabs1)(bla_scomplex *z); BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z); #endif // end bla_cabs1.h // -- Level-1 BLAS prototypes -- // begin bla_amax.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype_x, chx, blasname ) \ \ BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( amax ) #endif // end bla_amax.h // begin bla_asum.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( asum ) #endif // end bla_asum.h // begin bla_axpy.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( axpy ) #endif // end bla_axpy.h // begin bla_copy.h // // Prototype BLAS-to-BLIS interfaces. 
//

// copy: prototype template taking n, a const x with incx, and a non-const y
// with incy. It is instantiated only when the BLAS layer is enabled.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
        ftype*         y, const f77_int* incy  \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( copy )
#endif
// end bla_copy.h
// begin bla_dot.h
#ifdef BLIS_ENABLE_BLAS

//
// Prototype BLAS-to-BLIS interfaces.
//

// dot: this first template returns the result by value (return type ftype).
#undef GENTPROTDOT
#define GENTPROTDOT( ftype, ch, chc, blasname ) \
BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \
    ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
        const ftype*   y, const f77_int* incy  \
    );

INSERT_GENTPROTDOTR_BLAS( dot )

#ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL

// Intel-style complex return disabled: the DOTC instantiation reuses the
// by-value template defined above.
INSERT_GENTPROTDOTC_BLAS( dot )

#else

// With the "intel" complex return convention the function returns void and
// the result is delivered through a hidden leading parameter, rhop, that is
// passed by address.
#undef GENTPROTDOT
#define GENTPROTDOT( ftype, ch, chc, blasname ) \
BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \
    ( \
        ftype*         rhop, \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
        const ftype*   y, const f77_int* incy  \
    );

INSERT_GENTPROTDOTC_BLAS( dot )

#endif

// -- "Black sheep" dot product function prototypes --

// These two are written out by hand rather than generated from GENTPROTDOT:
// both take float vectors; one returns float and takes an extra sb argument,
// the other returns double.
BLIS_EXPORT_BLAS float PASTEF77(sd,sdot)
    (
        const f77_int* n,
        const float*   sb,
        const float*   x, const f77_int* incx,
        const float*   y, const f77_int* incy
    );

BLIS_EXPORT_BLAS double PASTEF77(d,sdot)
    (
        const f77_int* n,
        const float*   x, const f77_int* incx,
        const float*   y, const f77_int* incy
    );

#endif
// end bla_dot.h
// begin bla_nrm2.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// nrm2: prototype template whose return type is ftype_r and whose vector
// operand has type ftype_x.
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
    ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( nrm2 )
#endif
// end bla_nrm2.h
// begin bla_rot.h
#ifdef BLIS_ENABLE_BLAS

// rot: hand-written prototypes (s, d, cs, zd). The mixed-domain variants
// cs and zd take complex vectors with real c__ and s.
BLIS_EXPORT_BLAS int PASTEF77(s,rot)
    (
        const bla_integer* n,
        bla_real*          sx, const bla_integer* incx,
        bla_real*          sy, const bla_integer* incy,
        const bla_real*    c__,
        const bla_real*    s
    );

BLIS_EXPORT_BLAS int PASTEF77(d,rot)
    (
        const bla_integer* n,
        bla_double*        dx, const bla_integer* incx,
        bla_double*        dy, const bla_integer* incy,
        const bla_double*  c__,
        const bla_double*  s
    );

BLIS_EXPORT_BLAS int PASTEF77(cs,rot)
    (
        const bla_integer* n,
        bla_scomplex*      cx, const bla_integer* incx,
        bla_scomplex*      cy, const bla_integer* incy,
        const bla_real*    c__,
        const bla_real*    s
    );

BLIS_EXPORT_BLAS int PASTEF77(zd,rot)
    (
        const bla_integer* n,
        bla_dcomplex*      zx, const bla_integer* incx,
        bla_dcomplex*      zy, const bla_integer* incy,
        const bla_double*  c__,
        const bla_double*  s
    );

#endif
// end bla_rot.h
// begin bla_rotg.h
#ifdef BLIS_ENABLE_BLAS

// rotg: hand-written prototypes (s, d, c, z). All four arguments are
// non-const pointers; in the complex variants c__ is real-typed.
BLIS_EXPORT_BLAS int PASTEF77(s,rotg)
    (
        bla_real* sa,
        bla_real* sb,
        bla_real* c__,
        bla_real* s
    );

BLIS_EXPORT_BLAS int PASTEF77(d,rotg)
    (
        bla_double* da,
        bla_double* db,
        bla_double* c__,
        bla_double* s
    );

BLIS_EXPORT_BLAS int PASTEF77(c,rotg)
    (
        bla_scomplex* ca,
        bla_scomplex* cb,
        bla_real*     c__,
        bla_scomplex* s
    );

BLIS_EXPORT_BLAS int PASTEF77(z,rotg)
    (
        bla_dcomplex* ca,
        bla_dcomplex* cb,
        bla_double*   c__,
        bla_dcomplex* s
    );

#endif
// end bla_rotg.h
// begin bla_rotm.h
#ifdef BLIS_ENABLE_BLAS

// rotm: hand-written prototypes (s, d); the last argument is a const
// parameter array pointer.
BLIS_EXPORT_BLAS int PASTEF77(s,rotm)
    (
        const bla_integer* n,
        bla_real*          sx, const bla_integer* incx,
        bla_real*          sy, const bla_integer* incy,
        const bla_real*    sparam
    );

BLIS_EXPORT_BLAS int PASTEF77(d,rotm)
    (
        const bla_integer* n,
        bla_double*        dx, const bla_integer* incx,
        bla_double*        dy, const bla_integer* incy,
        const bla_double*  dparam
    );

#endif
// end bla_rotm.h
// begin bla_rotmg.h
#ifdef BLIS_ENABLE_BLAS

// rotmg: hand-written prototypes (s, d). Only the fourth argument is const.
BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)
    (
        bla_real*       sd1,
        bla_real*       sd2,
        bla_real*       sx1,
        const bla_real* sy1,
        bla_real*       sparam
    );

BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)
    (
        bla_double*       dd1,
        bla_double*       dd2,
        bla_double*       dx1,
        const bla_double* dy1,
        bla_double*       dparam
    );

#endif
// end bla_rotmg.h
// begin bla_scal.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// scal: alpha and x have independent types (ftype_a / ftype_x); the name is
// pasted in the order chx, cha, blasname.
#undef GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
    ( \
        const f77_int* n, \
        const ftype_a* alpha, \
        ftype_x*       x, const f77_int* incx \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif
// end bla_scal.h
// begin bla_swap.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// swap: both vector operands are non-const.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_int* n, \
        ftype*         x, const f77_int* incx, \
        ftype*         y, const f77_int* incy  \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif
// end bla_swap.h
// begin f77_amax_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//

// amax "sub" wrapper: returns void and takes a trailing f77_int* rval.
// Instantiated only when the CBLAS layer is enabled.
#undef GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
    ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
        f77_int*       rval \
    );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROT_BLAS( amax )
#endif
// end f77_amax_sub.h
// begin f77_asum_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//

// asum "sub" wrapper: returns void and takes a trailing ftype_r* rval.
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
    ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
        ftype_r*       rval \
    );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif
// end f77_asum_sub.h
// begin f77_dot_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//

// dot "sub" wrapper: returns void and takes a trailing ftype* rval.
#undef GENTPROTDOT
#define GENTPROTDOT( ftype, ch, chc, blasname ) \
BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \
    ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
        const ftype*   y, const f77_int* incy, \
        ftype*         rval \
    );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTDOT_BLAS( dot )

// -- "Black sheep" dot product function prototypes --

// Written out by hand: both take float vectors; the sds variant has an extra
// sb argument and a float* rval, the ds variant a double* rval.
BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub)
    (
        const f77_int* n,
        const float*   sb,
        const float*   x, const f77_int* incx,
        const float*   y, const f77_int* incy,
        float*         rval
    );

BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub)
    (
        const f77_int* n,
        const float*   x, const f77_int* incx,
        const float*   y, const f77_int* incy,
        double*        rval
    );

#endif
// end f77_dot_sub.h
// begin f77_nrm2_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//

// nrm2 "sub" wrapper: returns void and takes a trailing ftype_r* rval.
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
    ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
        ftype_r*       rval \
    );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( nrm2 )
#endif
// end f77_nrm2_sub.h

// -- Level-2 BLAS prototypes --

// dense

// begin bla_gemv.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// gemv: y is the only non-const operand.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* transa, \
        const f77_int*  m, \
        const f77_int*  n, \
        const ftype*    alpha, \
        const ftype*    a, const f77_int* lda, \
        const ftype*    x, const f77_int* incx, \
        const ftype*    beta, \
        ftype*          y, const f77_int* incy \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemv )
#endif
// end bla_gemv.h
// begin bla_ger.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// ger: a is the only non-const operand; the name is pasted in the order
// chxy, blasname, chc.
#undef GENTPROTDOT
#define GENTPROTDOT( ftype, chxy, chc, blasname ) \
BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \
    ( \
        const f77_int* m, \
        const f77_int* n, \
        const ftype*   alpha, \
        const ftype*   x, const f77_int* incx, \
        const ftype*   y, const f77_int* incy, \
        ftype*         a, const f77_int* lda \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTDOT_BLAS( ger )
#endif
// end bla_ger.h
// begin bla_hemv.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// hemv: y is the only non-const operand. The ftype_r and chr template
// parameters are not referenced in this prototype.
#undef GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* uploa, \
        const f77_int*  m, \
        const ftype*    alpha, \
        const ftype*    a, const f77_int* lda, \
        const ftype*    x, const f77_int* incx, \
        const ftype*    beta, \
        ftype*          y, const f77_int* incy \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hemv )
#endif
// end bla_hemv.h
// begin bla_her.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// her: alpha uses the real type ftype_r, unlike hemv and her2 where alpha
// has type ftype.
#undef GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* uploa, \
        const f77_int*  m, \
        const ftype_r*  alpha, \
        const ftype*    x, const f77_int* incx, \
        ftype*          a, const f77_int* lda \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( her )
#endif
// end bla_her.h
// begin bla_her2.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// her2: two const vector operands (x, y) and a non-const a.
#undef GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* uploa, \
        const f77_int*  m, \
        const ftype*    alpha, \
        const ftype*    x, const f77_int* incx, \
        const ftype*    y, const f77_int* incy, \
        ftype*          a, const f77_int* lda \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( her2 )
#endif
// end bla_her2.h
// begin bla_symv.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// symv: y is the only non-const operand.
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* uploa, \
        const f77_int*  m, \
        const ftype*    alpha, \
        const ftype*    a, const f77_int* lda, \
        const ftype*    x, const f77_int* incx, \
        const ftype*    beta, \
        ftype*          y, const f77_int* incy \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( symv )
#endif
// end bla_symv.h
// begin bla_syr.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// syr: one const vector operand (x) and a non-const a.
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* uploa, \
        const f77_int*  m, \
        const ftype*    alpha, \
        const ftype*    x, const f77_int* incx, \
        ftype*          a, const f77_int* lda \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( syr )
#endif
// end bla_syr.h
// begin bla_syr2.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// syr2: two const vector operands (x, y) and a non-const a.
#undef GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* uploa, \
        const f77_int*  m, \
        const ftype*    alpha, \
        const ftype*    x, const f77_int* incx, \
        const ftype*    y, const f77_int* incy, \
        ftype*          a, const f77_int* lda \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( syr2 )
#endif
// end bla_syr2.h
// begin bla_trmv.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// trmv: a is const and x is non-const; there are no alpha/beta scalars.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* uploa, \
        const f77_char* transa, \
        const f77_char* diaga, \
        const f77_int*  m, \
        const ftype*    a, const f77_int* lda, \
        ftype*          x, const f77_int* incx \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmv )
#endif
// end bla_trmv.h
// begin bla_trsv.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT (trsv): same argument list as the trmv template; a is read-only
// and x is the writable operand.
//
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
	( \
	const f77_char* uploa, \
	const f77_char* transa, \
	const f77_char* diaga, \
	const f77_int* m, \
	const ftype* a, const f77_int* lda, \
	ftype* x, const f77_int* incx \
	);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif
// end bla_trsv.h

// begin bla_gemv_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_gemv_check: argument validation for the gemv wrappers.
//
// All bla_*_check macros in this group share one shape:
//   - they expand to a braced block inside the calling function, so the
//     pointer arguments (m, n, lda, ...) are dereferenced in place;
//   - an else-if chain stores in `info` the number of the FIRST failing
//     test only; later tests are not evaluated once one fails;
//   - if info != 0, the routine name is built from dt_str and op_str
//     ("%s%-5s" pads op_str to at least five characters), upper-cased, and
//     passed to xerbla together with &info, after which the macro executes
//     `return;` -- i.e. it returns from the ENCLOSING function, which must
//     therefore return void.
// NOTE(review): the info values (1, 2, 3, 6, 8, 11) appear to be the 1-based
// positions of the offending arguments in the gemv argument list -- confirm
// against the gemv prototype, which is outside this section.
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
	f77_int info = 0; \
	f77_int nota, ta, conja; \
\
	nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
	if ( !nota && !ta && !conja ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *n < 0 ) \
		info = 3; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 6; \
	else if ( *incx == 0 ) \
		info = 8; \
	else if ( *incy == 0 ) \
		info = 11; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_gemv_check.h
// begin bla_ger_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_ger_check: argument validation for the ger wrappers. The info values
// match the argument positions of the ger prototype template (incx is the
// 5th argument, incy the 7th, lda the 9th). Unlike the other check macros,
// the routine name is built from three pieces: dt_str, op_str and conj_str
// ("%s%s%-2s"). On failure the macro calls xerbla and returns from the
// enclosing function.
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
	f77_int info = 0; \
\
	if ( *m < 0 ) \
		info = 1; \
	else if ( *n < 0 ) \
		info = 2; \
	else if ( *incx == 0 ) \
		info = 5; \
	else if ( *incy == 0 ) \
		info = 7; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 9; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
\
		sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_ger_check.h
// begin bla_hemv_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_hemv_check: argument validation for the hemv wrappers (also reused
// for symv via bla_symv_check). uploa must compare equal (via lsame) to
// "L" or "U". The info values match the argument positions of the hemv
// prototype template (lda 5th, incx 7th, incy 10th). On failure the macro
// calls xerbla and returns from the enclosing function.
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
\
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( !lower && !upper ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 5; \
	else if ( *incx == 0 ) \
		info = 7; \
	else if ( *incy == 0 ) \
		info = 10; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_hemv_check.h
// begin bla_her_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_her_check: argument validation for the her wrappers (also reused for
// syr via bla_syr_check). The info values match the argument positions of
// the her prototype template (incx 5th, lda 7th). On failure the macro
// calls xerbla and returns from the enclosing function.
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
\
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( !lower && !upper ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *incx == 0 ) \
		info = 5; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 7; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_her_check.h
// begin bla_her2_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_her2_check: argument validation for the her2 wrappers (also reused
// for syr2 via bla_syr2_check). The info values match the argument
// positions of the her2 prototype template (incx 5th, incy 7th, lda 9th).
// On failure the macro calls xerbla and returns from the enclosing function.
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
\
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( !lower && !upper ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *incx == 0 ) \
		info = 5; \
	else if ( *incy == 0 ) \
		info = 7; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 9; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_her2_check.h
// begin bla_symv_check.h


#ifdef BLIS_ENABLE_BLAS

// symv takes the same argument list as hemv, so it reuses the hemv check.
#define bla_symv_check bla_hemv_check

#endif
// end bla_symv_check.h
// begin bla_syr_check.h


#ifdef BLIS_ENABLE_BLAS

// syr takes the same checked arguments as her, so it reuses the her check.
#define bla_syr_check bla_her_check

#endif
// end bla_syr_check.h
// begin bla_syr2_check.h


#ifdef BLIS_ENABLE_BLAS

// syr2 takes the same checked arguments as her2, so it reuses the her2 check.
#define bla_syr2_check bla_her2_check

#endif
// end bla_syr2_check.h
// begin bla_trmv_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_trmv_check: argument validation for the trmv wrappers (also reused
// for trsv via bla_trsv_check). Three character flags are validated with
// lsame: uploa in {"L","U"}, transa in {"N","T","C"}, diaga in {"U","N"}.
// The info values match the argument positions of the trmv prototype
// template (lda 6th, incx 8th). On failure the macro calls xerbla and
// returns from the enclosing function.
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
	f77_int nota, ta, conja; \
	f77_int unita, nonua; \
\
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
	nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	unita = PASTEF770(lsame)( diaga, "U", (ftnlen)1, (ftnlen)1 ); \
	nonua = PASTEF770(lsame)( diaga, "N", (ftnlen)1, (ftnlen)1 ); \
\
	if ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !ta && !conja ) \
		info = 2; \
	else if ( !unita && !nonua ) \
		info = 3; \
	else if ( *m < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 6; \
	else if ( *incx == 0 ) \
		info = 8; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_trmv_check.h
// begin bla_trsv_check.h


#ifdef BLIS_ENABLE_BLAS

// trsv takes the same argument list as trmv, so it reuses the trmv check.
#define bla_trsv_check bla_trmv_check

#endif
// end bla_trsv_check.h

// packed
// The packed (and, further below, banded) level-2 prototypes are written
// out one per datatype rather than generated from a GENTPROT template.
// Unlike the templated prototypes they are declared to return int and use
// the bla_* typedef family for their arguments.
// begin bla_hpmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *ap, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hpmv.h
// begin bla_hpr.h


#ifdef BLIS_ENABLE_BLAS

// Note: alpha is a real-typed scalar (bla_real / bla_double) for hpr.
BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_scomplex *x, const bla_integer *incx, bla_scomplex *ap);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap);

#endif
// end bla_hpr.h
// begin bla_hpr2.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *y, const bla_integer *incy, bla_scomplex *ap);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap);

#endif
// end bla_hpr2.h
// begin bla_spmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *ap, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_spmv.h
// begin bla_spr.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, bla_double *ap);
BLIS_EXPORT_BLAS int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap);

#endif
// end bla_spr.h
// begin bla_spr2.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, const bla_double *y, const bla_integer *incy, bla_double *ap);
BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, const bla_real *y, const bla_integer *incy, bla_real *ap);

#endif
// end bla_spr2.h
// begin bla_tpmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpmv.h
// begin bla_tpsv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpsv.h

// banded
// begin bla_gbmv.h


#ifdef BLIS_ENABLE_BLAS

// gbmv takes two extra integer arguments (kl, ku) compared with the
// templated gemv-style argument lists.
BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer * incx, const bla_real *beta, bla_real *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex * y, const bla_integer *incy);

#endif
// end bla_gbmv.h
// begin bla_hbmv.h
#ifdef BLIS_ENABLE_BLAS

// Banded level-2 prototypes. These are written out one per datatype (not
// generated from a GENTPROT template), are declared to return int, and
// take every argument by pointer. Relative to the non-banded routines
// they carry one extra integer argument, k.
// hbmv is declared for the c and z prefixes only.
BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hbmv.h
// begin bla_sbmv.h


#ifdef BLIS_ENABLE_BLAS

// sbmv is declared for the d and s prefixes only; same argument order as hbmv.
BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_sbmv.h
// begin bla_tbmv.h


#ifdef BLIS_ENABLE_BLAS

// tbmv: a is read-only, x is the writable operand; declared for c, d, s, z.
BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbmv.h
// begin bla_tbsv.h


#ifdef BLIS_ENABLE_BLAS

// tbsv: identical argument list to tbmv; declared for c, d, s, z.
BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbsv.h


// -- Level-3 BLAS prototypes --

// begin bla_gemm.h


//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemm ) #endif // end bla_gemm.h // begin bla_hemm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemm ) #endif // end bla_hemm.h // begin bla_herk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype_r* alpha, \ const ftype* a, const f77_int* lda, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( herk ) #endif // end bla_herk.h // begin bla_her2k.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2k ) #endif // end bla_her2k.h // begin bla_symm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( symm ) #endif // end bla_symm.h // begin bla_syrk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syrk ) #endif // end bla_syrk.h // begin bla_syr2k.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syr2k ) #endif // end bla_syr2k.h // begin bla_trmm.h // // Prototype BLAS-to-BLIS interfaces. 
//
// GENTPROT (trmm): prototype template for the BLAS-compatible trmm entry
// points. Four character flags (side, uploa, transa, diaga) precede m and
// n. a is read-only; b is declared non-const, i.e. it is the operand the
// routine may write to. There is no beta argument.
// NOTE(review): INSERT_GENTPROT_BLAS is defined elsewhere; presumably it
// expands GENTPROT once per datatype prefix -- confirm.
//
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
	( \
	const f77_char* side, \
	const f77_char* uploa, \
	const f77_char* transa, \
	const f77_char* diaga, \
	const f77_int* m, \
	const f77_int* n, \
	const ftype* alpha, \
	const ftype* a, const f77_int* lda, \
	ftype* b, const f77_int* ldb \
	);

// The prototypes are only emitted when the BLAS compatibility layer is enabled.
#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmm )
#endif
// end bla_trmm.h
// begin bla_trsm.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT (trsm): identical argument list to the trmm template.
//
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
	( \
	const f77_char* side, \
	const f77_char* uploa, \
	const f77_char* transa, \
	const f77_char* diaga, \
	const f77_int* m, \
	const f77_int* n, \
	const ftype* alpha, \
	const ftype* a, const f77_int* lda, \
	ftype* b, const f77_int* ldb \
	);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsm )
#endif
// end bla_trsm.h

// begin bla_gemm_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_gemm_check: argument validation for the gemm wrappers.
//
// All level-3 bla_*_check macros below share one shape:
//   - they expand to a braced block inside the calling function, so the
//     pointer arguments (m, n, k, lda, ...) are dereferenced in place;
//   - an else-if chain stores in `info` the number of the FIRST failing
//     test only; the numbers match the 1-based argument positions of the
//     corresponding prototype template (e.g. lda is gemm's 8th argument);
//   - if info != 0, the routine name is built from dt_str and op_str
//     ("%s%-5s" pads op_str to at least five characters), upper-cased, and
//     passed to xerbla together with &info, after which the macro executes
//     `return;` -- i.e. it returns from the ENCLOSING function, which must
//     therefore return void.
//
// For gemm, the minimum leading dimension of a is m when transa is "N"
// and k otherwise; for b it is k when transb is "N" and n otherwise.
#define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int nota, notb; \
	f77_int conja, conjb; \
	f77_int ta, tb; \
	f77_int nrowa, nrowb; \
\
	nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
	ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else { nrowa = *k; } \
	if ( notb ) { nrowb = *k; } \
	else { nrowb = *n; } \
\
	if ( !nota && !conja && !ta ) \
		info = 1; \
	else if ( !notb && !conjb && !tb ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *n < 0 ) \
		info = 4; \
	else if ( *k < 0 ) \
		info = 5; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 8; \
	else if ( *ldb < bli_max( 1, nrowb ) ) \
		info = 10; \
	else if ( *ldc < bli_max( 1, *m ) ) \
		info = 13; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_gemm_check.h
// begin bla_hemm_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_hemm_check: argument validation for the hemm wrappers (also reused
// for symm via bla_symm_check). The minimum leading dimension of a is m
// when sidea is "L" and n otherwise; b and c are both checked against m.
// On failure the macro calls xerbla and returns from the enclosing function.
#define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int left, right; \
	f77_int lower, upper; \
	f77_int nrowa; \
\
	left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
	right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( left ) { nrowa = *m; } \
	else { nrowa = *n; } \
\
	if ( !left && !right ) \
		info = 1; \
	else if ( !lower && !upper ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *n < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 7; \
	else if ( *ldb < bli_max( 1, *m ) ) \
		info = 9; \
	else if ( *ldc < bli_max( 1, *m ) ) \
		info = 12; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_hemm_check.h
// begin bla_herk_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_herk_check: argument validation for the herk wrappers. Only "N" and
// "C" are accepted for transa. The minimum leading dimension of a is m
// when transa is "N" and k otherwise. On failure the macro calls xerbla
// and returns from the enclosing function.
//
// The uplo flag is read through the macro parameter `uploa`. Previously the
// body referred to an identifier `uploc` that is not a macro parameter, so
// the third macro argument was ignored and the macro only compiled when the
// caller happened to have a variable literally named `uploc` in scope.
// Using the parameter matches bla_her2k_check and yields the same expansion
// for any caller that passes `uploc` as the third argument.
#define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
	f77_int info = 0; \
	f77_int nota, conja; \
	f77_int lower, upper; \
	f77_int nrowa; \
\
	nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else { nrowa = *k; } \
\
	if ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !conja ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *k < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 7; \
	else if ( *ldc < bli_max( 1, *m ) ) \
		info = 10; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_herk_check.h
// begin bla_her2k_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_her2k_check: argument validation for the her2k wrappers. Only "N"
// and "C" are accepted for trans. a and b are both checked against the
// same minimum leading dimension (m when trans is "N", k otherwise).
// On failure the macro calls xerbla and returns from the enclosing function.
#define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int nota, conja; \
	f77_int lower, upper; \
	f77_int nrowa; \
\
	nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else { nrowa = *k; } \
\
	if ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !conja ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *k < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 7; \
	else if ( *ldb < bli_max( 1, nrowa ) ) \
		info = 9; \
	else if ( *ldc < bli_max( 1, *m ) ) \
		info = 12; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_her2k_check.h
// begin bla_symm_check.h


#ifdef BLIS_ENABLE_BLAS

// symm takes the same argument list as hemm, so it reuses the hemm check.
#define bla_symm_check bla_hemm_check

#endif
// end bla_symm_check.h
// begin bla_syrk_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_syrk_check: argument validation for the syrk wrappers. The set of
// accepted transa values depends on the datatype string: when dt_str
// starts with 's' or 'd' (is_r), "N", "T" and "C" are all accepted;
// otherwise only "N" and "T" are. On failure the macro calls xerbla and
// returns from the enclosing function.
//
// The uplo flag is read through the macro parameter `uploa`. Previously the
// body referred to an identifier `uploc` that is not a macro parameter, so
// the third macro argument was ignored and the macro only compiled when the
// caller happened to have a variable literally named `uploc` in scope.
// Using the parameter matches bla_syr2k_check and yields the same expansion
// for any caller that passes `uploc` as the third argument.
#define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
	f77_int info = 0; \
	f77_int is_r; \
	f77_int nota, ta, cta; \
	f77_int lower, upper; \
	f77_int nrowa; \
\
	static char* dt_cst = dt_str; \
\
	is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
	nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	cta = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else { nrowa = *k; } \
\
	if ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !ta && (is_r ? !cta : 1) ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *k < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 7; \
	else if ( *ldc < bli_max( 1, *m ) ) \
		info = 10; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_syrk_check.h
// begin bla_syr2k_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_syr2k_check: argument validation for the syr2k wrappers. As in
// bla_syrk_check, "C" is accepted for trans only when dt_str starts with
// 's' or 'd'. a and b are checked against the same minimum leading
// dimension. On failure the macro calls xerbla and returns from the
// enclosing function.
#define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int is_r; \
	f77_int nota, ta, cta; \
	f77_int lower, upper; \
	f77_int nrowa; \
\
	static char* dt_cst = dt_str; \
\
	is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
	nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
	ta = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \
	cta = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else { nrowa = *k; } \
\
	if ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !ta && (is_r ? !cta : 1) ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *k < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 7; \
	else if ( *ldb < bli_max( 1, nrowa ) ) \
		info = 9; \
	else if ( *ldc < bli_max( 1, *m ) ) \
		info = 12; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_syr2k_check.h
// begin bla_trmm_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_trmm_check: argument validation for the trmm wrappers (also reused
// for trsm via bla_trsm_check). Four character flags are validated with
// lsame: sidea in {"L","R"}, uploa in {"L","U"}, transa in {"N","T","C"},
// diaga in {"U","N"}. The minimum leading dimension of a is m when sidea
// is "L" and n otherwise; b is checked against m. On failure the macro
// calls xerbla and returns from the enclosing function.
#define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \
{ \
	f77_int info = 0; \
	f77_int left, right; \
	f77_int lower, upper; \
	f77_int nota, ta, conja; \
	f77_int unita, nonua; \
	f77_int nrowa; \
\
	left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
	right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
	nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	unita = PASTEF770(lsame)( diaga, "U", (ftnlen)1, (ftnlen)1 ); \
	nonua = PASTEF770(lsame)( diaga, "N", (ftnlen)1, (ftnlen)1 ); \
\
	if ( left ) { nrowa = *m; } \
	else { nrowa = *n; } \
\
	if ( !left && !right ) \
		info = 1; \
	else if ( !lower && !upper ) \
		info = 2; \
	else if ( !nota && !ta && !conja ) \
		info = 3; \
	else if ( !unita && !nonua ) \
		info = 4; \
	else if ( *m < 0 ) \
		info = 5; \
	else if ( *n < 0 ) \
		info = 6; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 9; \
	else if ( *ldb < bli_max( 1, *m ) ) \
		info = 11; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_trmm_check.h
// begin bla_trsm_check.h


#ifdef BLIS_ENABLE_BLAS

// trsm takes the same argument list as trmm, so it reuses the trmm check.
#define bla_trsm_check bla_trmm_check

#endif
// end bla_trsm_check.h

// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT (axpby): x is read-only and is scaled by alpha; y is the
// writable operand and has its own scalar beta. All arguments are passed
// by pointer.
//
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
	( \
	const f77_int* n, \
	const ftype* alpha, \
	const ftype* x, const f77_int* incx, \
	const ftype* beta, \
	ftype* y, const f77_int* incy \
	);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif
// end bla_axpby.h

// level-3
// begin bla_gemmt.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT (gemmt): like the gemm template but with a leading uploc flag
// and no separate n argument (only m and k are passed).
//
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
	( \
	const f77_char* uploc, \
	const f77_char* transa, \
	const f77_char* transb, \
	const f77_int* m, \
	const f77_int* k, \
	const ftype* alpha, \
	const ftype* a, const f77_int* lda, \
	const ftype* b, const f77_int* ldb, \
	const ftype* beta, \
	ftype* c, const f77_int* ldc \
	);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif
// end bla_gemmt.h
// begin bla_gemmt_check.h


#ifdef BLIS_ENABLE_BLAS

// bla_gemmt_check: argument validation for the gemmt wrappers. Same tests
// as bla_gemm_check plus a uploc test in position 1; because gemmt has no
// n argument, the minimum leading dimension of b is k when transb is "N"
// and m otherwise. On failure the macro calls xerbla and returns from the
// enclosing function.
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int nota, notb; \
	f77_int conja, conjb; \
	f77_int ta, tb; \
	f77_int lower, upper; \
	f77_int nrowa, nrowb; \
\
	nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
	ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
	lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else { nrowa = *k; } \
	if ( notb ) { nrowb = *k; } \
	else { nrowb = *m; } \
\
	if ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !conja && !ta ) \
		info = 2; \
	else if ( !notb && !conjb && !tb ) \
		info = 3; \
	else if ( *m < 0 ) \
		info = 4; \
	else if ( *k < 0 ) \
		info = 5; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 8; \
	else if ( *ldb < bli_max( 1, nrowb ) ) \
		info = 10; \
	else if ( *ldc < bli_max( 1, *m ) ) \
		info = 13; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT (gemm_batch): every gemm argument becomes an array argument
// (the matrix operands are pointer-to-pointer), followed by group_count
// and group_size.
// NOTE(review): presumably each array holds one entry per group and
// group_size gives the number of problems per group -- confirm against the
// implementation, which is outside this section.
//
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
	( \
	const f77_char* transa_array, \
	const f77_char* transb_array, \
	const f77_int* m_array, \
	const f77_int* n_array, \
	const f77_int* k_array, \
	const ftype* alpha_array, \
	const ftype** a_array, const f77_int* lda_array, \
	const ftype** b_array, const f77_int* ldb_array, \
	const ftype* beta_array, \
	ftype** c_array, const f77_int* ldc_array, \
	const f77_int* group_count, \
	const f77_int* group_size \
	);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif
// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// GENTPROTCO declares the Fortran-77-style entry point <ch><blasname> for
// one complex datatype. Every argument is passed by pointer.
// NOTE(review): INSERT_GENTPROTCO_BLAS is defined outside this section; it
// presumably instantiates GENTPROTCO once per complex type -- confirm.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  n, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( gemm3m )
#endif
// end bla_gemm3m.h
// begin bla_gemm3m_check.h

#ifdef BLIS_ENABLE_BLAS

// bla_gemm3m_check() validates the arguments of a gemm3m call.
//
// It expands to a braced block and must be used inside a function that
// returns void: when an argument is invalid, the block reports it through
// xerbla and then executes `return;` in the enclosing function.
//
//   dt_str, op_str   strings joined into the routine name passed to xerbla
//                    (upper-cased by bli_string_mkupper()).
//   transa, transb   pointers to the transposition characters; 'N', 'C' and
//                    'T' are accepted (compared via lsame).
//   m, n, k          pointers to the problem dimensions; must be >= 0.
//   lda, ldb, ldc    pointers to the leading dimensions; each must be at
//                    least max(1, number of rows of the stored operand).
//
// Only the first failing test is reported. The value handed to xerbla is the
// 1-based position of the offending argument in the gemm3m argument list:
// 1 transa, 2 transb, 3 m, 4 n, 5 k, 8 lda, 10 ldb, 13 ldc.
#define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota,  notb; \
    f77_int conja, conjb; \
    f77_int ta,    tb; \
    f77_int nrowa, nrowb; \
\
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    /* Row counts of A and B as stored, i.e. before any transposition. */ \
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else        { nrowb = *n; } \
\
    if      ( !nota && !conja && !ta ) \
        info = 1; \
    else if ( !notb && !conjb && !tb ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *n < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m    ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        /* "%-5s" pads but never truncates, so bound the write by the */ \
        /* size of the buffer instead of trusting the caller's strings. */ \
        snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemm3m_check.h


// -- Fortran-compatible APIs to BLIS functions --

// begin b77_thread.h


//
// Prototype Fortran-compatible BLIS interfaces.
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); // end b77_thread.h #endif // BLIS_ENABLE_BLAS // end bli_blas.h // -- CBLAS compatibility layer -- // begin bli_cblas.h #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. // begin cblas.h #ifndef CBLAS_H #define CBLAS_H #include // skipped // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. 
// Exactly one of BLIS_ARCH_64 / BLIS_ARCH_32 is defined after this test.
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
    defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64)
  #define BLIS_ARCH_64
#else
  #define BLIS_ARCH_32
#endif

// Determine the target operating system.
// The tests are ordered: the first matching branch wins, so e.g. Android is
// classified before the generic Linux test is reached.
// NOTE(review): BLIS_OS_EMSCRIPTEN and BLIS_OS_NONE are defined without a
// value, unlike the other BLIS_OS_* macros (defined as 1). They can be tested
// with #ifdef/defined() but not with a plain `#if BLIS_OS_...`.
#if defined(BLIS_ENABLE_SYSTEM)
  #if defined(_WIN32) || defined(__CYGWIN__)
    #define BLIS_OS_WINDOWS 1
  #elif defined(__gnu_hurd__)
    #define BLIS_OS_GNU 1
  #elif defined(__APPLE__) || defined(__MACH__)
    #define BLIS_OS_OSX 1
  #elif defined(__ANDROID__)
    #define BLIS_OS_ANDROID 1
  #elif defined(__linux__)
    #define BLIS_OS_LINUX 1
  #elif defined(__bgq__)
    #define BLIS_OS_BGQ 1
  #elif defined(__bg__)
    #define BLIS_OS_BGP 1
  #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
        defined(__bsdi__) || defined(__DragonFly__) || \
        defined(__FreeBSD_kernel__) || defined(__HAIKU__)
    #define BLIS_OS_BSD 1
  #elif defined(EMSCRIPTEN)
    #define BLIS_OS_EMSCRIPTEN
  #else
    #error "Cannot determine operating system"
  #endif
#else // #if defined(BLIS_DISABLE_SYSTEM)
  #define BLIS_OS_NONE
#endif

// A few changes that may be necessary in Windows environments.
#if BLIS_OS_WINDOWS

  // Include Windows header file.
  // WIN32_LEAN_AND_MEAN / VC_EXTRALEAN must be defined before the include.
  // NOTE(review): the operand of this #include was missing from the reviewed
  // text; <windows.h> is the presumed original -- confirm.
  #define WIN32_LEAN_AND_MEAN
  #define VC_EXTRALEAN
  #include <windows.h> // skipped

  #if !defined(__clang__) && !defined(__GNUC__)
    // Undefine attribute specifiers in Windows.
    #define __attribute__(x)

    // Undefine restrict.
    #define restrict
  #endif

#endif

// time.h provides clock_gettime().
#if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_ARM64 // Enabled sub-configurations (config_list) #define BLIS_CONFIG_ARMSVE #define BLIS_CONFIG_FIRESTORM #define BLIS_CONFIG_THUNDERX2 #define BLIS_CONFIG_CORTEXA57 #define BLIS_CONFIG_CORTEXA53 #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_ARMSVE #define BLIS_KERNELS_ARMV8A #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 
1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. 
#else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. #endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. 
#define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. 
#define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specifed by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. 
#ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overriden by the main // program simply by providing a new definition. However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. 
#include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. // NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. #ifndef TRUE #define TRUE true #endif #ifndef FALSE #define FALSE false #endif // -- Special-purpose integers -- // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. 
#ifndef _DEFINED_DIM_T #define _DEFINED_DIM_T typedef gint_t dim_t; // dimension type #endif typedef gint_t inc_t; // increment/stride type typedef gint_t doff_t; // diagonal offset type typedef guint_t siz_t; // byte size type typedef uint32_t objbits_t; // object information bit field // -- Real types -- // Define the number of floating-point types supported, and the size of the // largest type. #define BLIS_NUM_FP_TYPES 4 #define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) // There are some places where we need to use sizeof() inside of a C // preprocessor #if conditional, and so here we define the various sizes // for those purposes. #define BLIS_SIZEOF_S 4 // sizeof(float) #define BLIS_SIZEOF_D 8 // sizeof(double) #define BLIS_SIZEOF_C 8 // sizeof(scomplex) #define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) // -- Complex types -- #ifdef BLIS_ENABLE_C99_COMPLEX #if __STDC_VERSION__ >= 199901L #include // skipped // Typedef official complex types to BLIS complex type names. typedef float complex scomplex; typedef double complex dcomplex; #else #error "Configuration requested C99 complex types, but C99 does not appear to be supported." #endif #else // ifndef BLIS_ENABLE_C99_COMPLEX // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_SCOMPLEX #define _DEFINED_SCOMPLEX typedef struct scomplex { float real; float imag; } scomplex; #endif // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DCOMPLEX #define _DEFINED_DCOMPLEX typedef struct dcomplex { double real; double imag; } dcomplex; #endif #endif // BLIS_ENABLE_C99_COMPLEX // -- Atom type -- // Note: atom types are used to hold "bufferless" scalar object values. Note // that it needs to be as large as the largest possible scalar value we might // want to hold. Thus, for now, it is a dcomplex. 
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any BLIS_BLAS_INT_TYPE_SIZE other than 32 or 64 selects "long int".
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

// Fortran-facing aliases for the character, real and complex types.
typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).

// void_fp is currently a plain void*; the function-pointer form is kept
// below, commented out.
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
// malloc_ft: takes a byte count, returns a pointer.
// free_ft:   takes a pointer, returns nothing.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );


//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT is the position of the lowest bit of a field. Several names
// share an offset because one names a whole multi-bit field and the other the
// first single-bit flag inside it (e.g. DATATYPE and DOMAIN both start at 0).

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT               0
#define   BLIS_SCALAR_DOMAIN_SHIFT         0
#define   BLIS_SCALAR_PREC_SHIFT           1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is a field-width constant shifted to the field's offset: *_BITS
// masks cover multi-bit fields, *_BIT masks a single bit.
// NOTE(review): BLIS_COMP_DT_BITS shifts the int constant 0x7 left by 29,
// which does not fit in a 32-bit signed int; consider an unsigned constant
// after checking how the mask is used -- confirm.

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )


//
// -- BLIS enumerated type value definitions -----------------------------------
//

// The BLIS_BITVAL_* constants are the in-field encodings; the enums further
// below take their values from them.

#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT                                         | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )


//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

// Transposition and conjugation, encoded with the TRANS and CONJ bits.
typedef enum
{
	BLIS_NO_TRANSPOSE      = 0x0,
	BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
	BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
	BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

// Conjugation only; BLIS_CONJUGATE equals BLIS_CONJ_NO_TRANSPOSE.
typedef enum
{
	BLIS_NO_CONJUGATE      = 0x0,
	BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

// Stored region. LOWER and UPPER each also set the DIAG bit; DENSE sets
// all three UPLO bits.
typedef enum
{
	BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
	BLIS_LOWER             = BLIS_BITVAL_LOWER,
	BLIS_UPPER             = BLIS_BITVAL_UPPER,
	BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

// Not a bit field: BLIS_LEFT is 0 and BLIS_RIGHT is 1.
typedef enum
{
	BLIS_LEFT              = 0x0,
	BLIS_RIGHT
} side_t;

typedef enum
{
	BLIS_NONUNIT_DIAG      = 0x0,
	BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

typedef enum
{
	BLIS_NO_INVERT_DIAG    = 0x0,
	BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

typedef enum
{
	BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
	BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
	BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
	BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;


// -- Data type --

// The domain bit (bit 0) and precision bit (bit 1) give float = 0,
// scomplex = 1, double = 2, dcomplex = 3. BLIS_DT_LO / BLIS_DT_HI alias the
// first and last of those four floating-point types.
typedef enum
{
	BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
	BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
	BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
	BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
	BLIS_INT               = BLIS_BITVAL_INT_TYPE,
	BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
	BLIS_DT_LO             = BLIS_FLOAT,
	BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

typedef enum
{
	BLIS_REAL              = BLIS_BITVAL_REAL,
	BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

typedef enum
{
	BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
	BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;


// -- Pack schema type --

// BLIS_PACKED_UNSPEC, BLIS_PACKED_VECTOR and BLIS_PACKED_ROWS all have the
// same value (only the PACK bit set).
typedef enum
{
	BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
	BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
	BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
	BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
	BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
	BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
	BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
	BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
	BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3


// -- Pack order type --

// Both FWD values are 0x0, so BLIS_PACK_FWD_IF_UPPER and
// BLIS_PACK_FWD_IF_LOWER compare equal.
typedef enum
{
	BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
	BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,

	BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
	BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;


// -- Pack buffer type --

typedef enum
{
	BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
	BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
	BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
	BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;


// -- Partitioning direction --

typedef enum
{
	BLIS_FWD,
	BLIS_BWD
} dir_t;


// -- Subpartition type --

// Values are assigned implicitly, starting at 0 in declaration order.
typedef enum
{
	BLIS_SUBPART0,
	BLIS_SUBPART1,
	BLIS_SUBPART2,
	BLIS_SUBPART1AND0,
	BLIS_SUBPART1AND2,
	BLIS_SUBPART1A,
	BLIS_SUBPART1B,
	BLIS_SUBPART00,
	BLIS_SUBPART10,
	BLIS_SUBPART20,
	BLIS_SUBPART01,
	BLIS_SUBPART11,
	BLIS_SUBPART21,
	BLIS_SUBPART02,
	BLIS_SUBPART12,
	BLIS_SUBPART22
} subpart_t;


// -- Matrix dimension type --

typedef enum
{
	BLIS_M = 0,
	BLIS_N = 1
} mdim_t;


// -- Machine parameter types --

// BLIS_NUM_MACH_PARAMS below must equal the number of enumerators here
// (11, BLIS_MACH_EPS through BLIS_MACH_EPS2).
typedef enum
{
	BLIS_MACH_EPS = 0,
	BLIS_MACH_SFMIN,
	BLIS_MACH_BASE,
	BLIS_MACH_PREC,
	BLIS_MACH_NDIGMANT,
	BLIS_MACH_RND,
	BLIS_MACH_EMIN,
	BLIS_MACH_RMIN,
	BLIS_MACH_EMAX,
	BLIS_MACH_RMAX,
	BLIS_MACH_EPS2
} machval_t;

#define BLIS_NUM_MACH_PARAMS   11
#define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2


// -- Induced method types --

// BLIS_IND_FIRST aliases BLIS_1M (0) and BLIS_IND_LAST aliases BLIS_NAT (1).
typedef enum
{
	BLIS_1M        = 0,
	BLIS_NAT,
	BLIS_IND_FIRST = 0,
	BLIS_IND_LAST  = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_1m  BLIS_1M
#define bli_nat BLIS_NAT

// -- Kernel ID types --

// Level-1v kernel IDs: 14 consecutive values starting at 0, matching
// BLIS_NUM_LEVEL1V_KERS below.
typedef enum
{
    BLIS_ADDV_KER  = 0,
    BLIS_AMAXV_KER,
    BLIS_AXPBYV_KER,
    BLIS_AXPYV_KER,
    BLIS_COPYV_KER,
    BLIS_DOTV_KER,
    BLIS_DOTXV_KER,
    BLIS_INVERTV_KER,
    BLIS_SCALV_KER,
    BLIS_SCAL2V_KER,
    BLIS_SETV_KER,
    BLIS_SUBV_KER,
    BLIS_SWAPV_KER,
    BLIS_XPBYV_KER
} l1vkr_t;

#define BLIS_NUM_LEVEL1V_KERS 14

// Level-1f kernel IDs: 5 consecutive values starting at 0.
typedef enum
{
    BLIS_AXPY2V_KER = 0,
    BLIS_DOTAXPYV_KER,
    BLIS_AXPYF_KER,
    BLIS_DOTXF_KER,
    BLIS_DOTXAXPYF_KER
} l1fkr_t;

#define BLIS_NUM_LEVEL1F_KERS 5

// Level-1m (packm/unpackm) kernel IDs. The UNPACKM enumerators restart at 0,
// so BLIS_UNPACKM_nXK_KER has the same numeric value as BLIS_PACKM_nXK_KER
// for every n in 0..31.
typedef enum
{
    BLIS_PACKM_0XK_KER  = 0,
    BLIS_PACKM_1XK_KER  = 1,
    BLIS_PACKM_2XK_KER  = 2,
    BLIS_PACKM_3XK_KER  = 3,
    BLIS_PACKM_4XK_KER  = 4,
    BLIS_PACKM_5XK_KER  = 5,
    BLIS_PACKM_6XK_KER  = 6,
    BLIS_PACKM_7XK_KER  = 7,
    BLIS_PACKM_8XK_KER  = 8,
    BLIS_PACKM_9XK_KER  = 9,
    BLIS_PACKM_10XK_KER = 10,
    BLIS_PACKM_11XK_KER = 11,
    BLIS_PACKM_12XK_KER = 12,
    BLIS_PACKM_13XK_KER = 13,
    BLIS_PACKM_14XK_KER = 14,
    BLIS_PACKM_15XK_KER = 15,
    BLIS_PACKM_16XK_KER = 16,
    BLIS_PACKM_17XK_KER = 17,
    BLIS_PACKM_18XK_KER = 18,
    BLIS_PACKM_19XK_KER = 19,
    BLIS_PACKM_20XK_KER = 20,
    BLIS_PACKM_21XK_KER = 21,
    BLIS_PACKM_22XK_KER = 22,
    BLIS_PACKM_23XK_KER = 23,
    BLIS_PACKM_24XK_KER = 24,
    BLIS_PACKM_25XK_KER = 25,
    BLIS_PACKM_26XK_KER = 26,
    BLIS_PACKM_27XK_KER = 27,
    BLIS_PACKM_28XK_KER = 28,
    BLIS_PACKM_29XK_KER = 29,
    BLIS_PACKM_30XK_KER = 30,
    BLIS_PACKM_31XK_KER = 31,

    BLIS_UNPACKM_0XK_KER  = 0,
    BLIS_UNPACKM_1XK_KER  = 1,
    BLIS_UNPACKM_2XK_KER  = 2,
    BLIS_UNPACKM_3XK_KER  = 3,
    BLIS_UNPACKM_4XK_KER  = 4,
    BLIS_UNPACKM_5XK_KER  = 5,
    BLIS_UNPACKM_6XK_KER  = 6,
    BLIS_UNPACKM_7XK_KER  = 7,
    BLIS_UNPACKM_8XK_KER  = 8,
    BLIS_UNPACKM_9XK_KER  = 9,
    BLIS_UNPACKM_10XK_KER = 10,
    BLIS_UNPACKM_11XK_KER = 11,
    BLIS_UNPACKM_12XK_KER = 12,
    BLIS_UNPACKM_13XK_KER = 13,
    BLIS_UNPACKM_14XK_KER = 14,
    BLIS_UNPACKM_15XK_KER = 15,
    BLIS_UNPACKM_16XK_KER = 16,
    BLIS_UNPACKM_17XK_KER = 17,
    BLIS_UNPACKM_18XK_KER = 18,
    BLIS_UNPACKM_19XK_KER = 19,
    BLIS_UNPACKM_20XK_KER = 20,
    BLIS_UNPACKM_21XK_KER = 21,
    BLIS_UNPACKM_22XK_KER = 22,
    BLIS_UNPACKM_23XK_KER = 23,
    BLIS_UNPACKM_24XK_KER = 24,
    BLIS_UNPACKM_25XK_KER = 25,
    BLIS_UNPACKM_26XK_KER = 26,
    BLIS_UNPACKM_27XK_KER = 27,
    BLIS_UNPACKM_28XK_KER = 28,
    BLIS_UNPACKM_29XK_KER = 29,
    BLIS_UNPACKM_30XK_KER = 30,
    BLIS_UNPACKM_31XK_KER = 31
} l1mkr_t;

#define BLIS_NUM_PACKM_KERS   32
#define BLIS_NUM_UNPACKM_KERS 32

// Level-3 microkernel IDs: 5 consecutive values starting at 0.
typedef enum
{
    BLIS_GEMM_UKR = 0,
    BLIS_GEMMTRSM_L_UKR,
    BLIS_GEMMTRSM_U_UKR,
    BLIS_TRSM_L_UKR,
    BLIS_TRSM_U_UKR
} l3ukr_t;

#define BLIS_NUM_LEVEL3_UKRS 5

// Microkernel implementation-type IDs: 4 consecutive values starting at 0.
typedef enum
{
    BLIS_REFERENCE_UKERNEL = 0,
    BLIS_VIRTUAL_UKERNEL,
    BLIS_OPTIMIZED_UKERNEL,
    BLIS_NOTAPPLIC_UKERNEL
} kimpl_t;

#define BLIS_NUM_UKR_IMPL_TYPES 4

// The l3sup_t enum below is compiled out by "#if 0".
#if 0
typedef enum
{
    // RV = row-stored, contiguous vector-loading
    // RG = row-stored, non-contiguous gather-loading
    // CV = column-stored, contiguous vector-loading
    // CG = column-stored, non-contiguous gather-loading

    // RD = row-stored, dot-based
    // CD = col-stored, dot-based

    // RC = row-stored, column-times-column
    // CR = column-stored, row-times-row

    // GX = general-stored generic implementation

    BLIS_GEMMSUP_RV_UKR = 0,
    BLIS_GEMMSUP_RG_UKR,
    BLIS_GEMMSUP_CV_UKR,
    BLIS_GEMMSUP_CG_UKR,

    BLIS_GEMMSUP_RD_UKR,
    BLIS_GEMMSUP_CD_UKR,

    BLIS_GEMMSUP_RC_UKR,
    BLIS_GEMMSUP_CR_UKR,

    BLIS_GEMMSUP_GX_UKR,
} l3sup_t;

#define BLIS_NUM_LEVEL3_SUP_UKRS 9
#endif

// Only the nine active enumerators (BLIS_RRR..BLIS_XXX, values 0..8) are
// compiled; the "G" combinations are disabled by the nested "#if 0".
typedef enum
{
    // 3-operand storage combinations
    BLIS_RRR = 0,
    BLIS_RRC, // 1
    BLIS_RCR, // 2
    BLIS_RCC, // 3
    BLIS_CRR, // 4
    BLIS_CRC, // 5
    BLIS_CCR, // 6
    BLIS_CCC, // 7
    BLIS_XXX, // 8

#if 0
    BLIS_RRG,
    BLIS_RCG,
    BLIS_RGR,
    BLIS_RGC,
    BLIS_RGG,
    BLIS_CRG,
    BLIS_CCG,
    BLIS_CGR,
    BLIS_CGC,
    BLIS_CGG,
    BLIS_GRR,
    BLIS_GRC,
    BLIS_GRG,
    BLIS_GCR,
    BLIS_GCC,
    BLIS_GCG,
    BLIS_GGR,
    BLIS_GGC,
    BLIS_GGG,
#endif
} stor3_t;

#define BLIS_NUM_3OP_RC_COMBOS 9
//#define BLIS_NUM_3OP_RCG_COMBOS 27

// thridx_t is compiled out; only BLIS_NUM_LOOPS (which equals the number of
// enumerators the disabled enum lists) remains active.
#if 0
typedef enum
{
    BLIS_JC_IDX = 0,
    BLIS_PC_IDX,
    BLIS_IC_IDX,
    BLIS_JR_IDX,
    BLIS_IR_IDX,
    BLIS_PR_IDX
} thridx_t;
#endif

#define BLIS_NUM_LOOPS 6

// -- Operation ID type --

typedef enum
{
//
// NOTE: If/when additional type values are added to this enum,
// you must either:
// - keep the level-3 values (starting with _GEMM) beginning at
//   index 0; or
// - if the value range is moved such that it does not begin at
//   index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START
//   value that can be subtracted from the opid_t value to map it
//   to a zero-based range.
// This is needed because these level-3 opid_t values are used in
// bli_l3_ind.c to index into arrays.
//
    BLIS_GEMM = 0,
    BLIS_GEMMT,
    BLIS_HEMM,
    BLIS_HERK,
    BLIS_HER2K,
    BLIS_SYMM,
    BLIS_SYRK,
    BLIS_SYR2K,
    BLIS_TRMM3,
    BLIS_TRMM,
    BLIS_TRSM,

    // Comes after the 11 level-3 IDs, so its value equals
    // BLIS_NUM_LEVEL3_OPS.
    BLIS_NOID
} opid_t;

#define BLIS_NUM_LEVEL3_OPS 11

// -- Blocksize ID type --

typedef enum
{
    // NOTE: the level-3 blocksizes MUST be indexed starting at zero.
    // At one point, we made this assumption in bli_cntx_set_blkszs()
    // and friends.

    BLIS_KR = 0,
    BLIS_MR,
    BLIS_NR,
    BLIS_MC,
    BLIS_KC,
    BLIS_NC,

    BLIS_M2, // level-2 blocksize in m dimension
    BLIS_N2, // level-2 blocksize in n dimension

    BLIS_AF, // level-1f axpyf fusing factor
    BLIS_DF, // level-1f dotxf fusing factor
    BLIS_XF, // level-1f dotxaxpyf fusing factor

    BLIS_NO_PART  // used as a placeholder when blocksizes are not applicable.
} bszid_t;

// Counts BLIS_KR..BLIS_XF; BLIS_NO_PART is not included in the count.
#define BLIS_NUM_BLKSZS 11

// -- Threshold ID type --

typedef enum
{
    BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension
    BLIS_NT,     // level-3 small/unpacked matrix threshold in n dimension
    BLIS_KT      // level-3 small/unpacked matrix threshold in k dimension

} threshid_t;

#define BLIS_NUM_THRESH 3

// -- Architecture ID type --

// NOTE: This typedef enum must be kept up-to-date with the arch_t
// string array in bli_arch.c. Whenever values are added/inserted
// OR if values are rearranged, be sure to update the string array
// in bli_arch.c.

typedef enum
{
    // NOTE: The C language standard guarantees that the first enum value
    // starts at 0.

    // Intel
    BLIS_ARCH_SKX,
    BLIS_ARCH_KNL,
    BLIS_ARCH_KNC,
    BLIS_ARCH_HASWELL,
    BLIS_ARCH_SANDYBRIDGE,
    BLIS_ARCH_PENRYN,

    // AMD
    BLIS_ARCH_ZEN3,
    BLIS_ARCH_ZEN2,
    BLIS_ARCH_ZEN,
    BLIS_ARCH_EXCAVATOR,
    BLIS_ARCH_STEAMROLLER,
    BLIS_ARCH_PILEDRIVER,
    BLIS_ARCH_BULLDOZER,

    // ARM
    BLIS_ARCH_ARMSVE,
    BLIS_ARCH_A64FX,
    BLIS_ARCH_FIRESTORM,
    BLIS_ARCH_THUNDERX2,
    BLIS_ARCH_CORTEXA57,
    BLIS_ARCH_CORTEXA53,
    BLIS_ARCH_CORTEXA15,
    BLIS_ARCH_CORTEXA9,

    // IBM/Power
    BLIS_ARCH_POWER10,
    BLIS_ARCH_POWER9,
    BLIS_ARCH_POWER7,
    BLIS_ARCH_BGQ,

    // Generic architecture/configuration
    BLIS_ARCH_GENERIC,

    // The total number of defined architectures. This must be last in the
    // list of enums since its definition assumes that the previous enum
    // value (BLIS_ARCH_GENERIC) is given index num_archs-1.
    BLIS_NUM_ARCHS

} arch_t;


//
// -- BLIS misc. structure types -----------------------------------------------
//

// This header must be included here (or earlier) because definitions it
// provides are needed in the pool_t and related structs.
// begin bli_pthread.h

#ifndef BLIS_PTHREAD_H
#define BLIS_PTHREAD_H

// -- Type and macro definitions -----------------------------------------------

#if defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of "dummy" code that doesn't depend on POSIX threads or any other
// threading mechanism. See issue #454 to see the use case that prompted this
// feature.
// NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE!
// -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t*           cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t*  cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

// Unlike the other wrappers declared here, this one returns void rather
// than int.
BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void              (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//     __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t*           barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int                     count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h

// -- Pool block type --

// Pairs a void* buffer pointer (buf) with a siz_t block_size.
typedef struct
{
    void*     buf;
    siz_t     block_size;

} pblk_t;


// -- Pool type --

// Pool bookkeeping plus the malloc/free function pointers (malloc_fp,
// free_fp) stored alongside it. block_ptrs is an untyped void* here.
// NOTE(review): presumably block_ptrs addresses an array of pblk_t of length
// block_ptrs_len -- confirm against the pool implementation.
typedef struct
{
    void*     block_ptrs;
    dim_t     block_ptrs_len;

    dim_t     top_index;
    dim_t     num_blocks;

    siz_t     block_size;
    siz_t     align_size;
    siz_t     offset_size;

    malloc_ft malloc_fp;
    free_ft   free_fp;

} pool_t;


// -- Array type --

// Untyped buffer pointer with an element count and a per-element size.
typedef struct
{
    void*     buf;

    siz_t     num_elem;
    siz_t     elem_size;

} array_t;


// -- Locked pool-of-arrays-of-pools type --

// Embeds a pool_t by value together with a bli_pthread_mutex_t, so this
// struct requires the bli_pthread.h types declared above.
typedef struct
{
    bli_pthread_mutex_t mutex;
    pool_t              pool;

    siz_t               def_array_len;

} apool_t;


// -- packing block allocator:
// Locked set of pools type --

// Holds three pool_t objects by value plus one mutex.
// NOTE(review): the three pools presumably correspond to the A-block,
// B-panel and C-panel packbuf_t buffer types -- confirm against the
// allocator implementation.
typedef struct pba_s
{
    pool_t              pools[3];
    bli_pthread_mutex_t mutex;

    // These fields are used for general-purpose allocation.
    siz_t               align_size;
    malloc_ft           malloc_fp;
    free_ft             free_fp;

} pba_t;


// -- Memory object type --

// Embeds a pblk_t by value and records its packbuf_t buffer type, a pointer
// to a pool_t, and a size.
typedef struct mem_s
{
    pblk_t    pblk;
    packbuf_t buf_type;
    pool_t*   pool;
    siz_t     size;
} mem_t;


// -- Control tree node type --

// Self-referential node: sub_prenode and sub_node point to other cntl_s
// nodes. The struct tag is declared first and typedef'd to cntl_t afterwards.
struct cntl_s
{
    // Basic fields (usually required).
    opid_t         family;
    bszid_t        bszid;
    void_fp        var_func;
    struct cntl_s* sub_prenode;
    struct cntl_s* sub_node;

    // Optional fields (needed only by some operations such as packm).
    // NOTE: first field of params must be a uint64_t containing the size
    // of the struct.
    void*          params;

    // Internal fields that track "cached" data.
    mem_t          pack_mem;
};
typedef struct cntl_s cntl_t;


// -- Blocksize object type --

// Both arrays hold one dim_t per floating-point datatype
// (BLIS_NUM_FP_TYPES entries).
typedef struct blksz_s
{
    // Primary blocksize values.
    dim_t  v[BLIS_NUM_FP_TYPES];

    // Blocksize extensions.
    dim_t  e[BLIS_NUM_FP_TYPES];

} blksz_t;


// -- Function pointer object type --

// One generic function pointer per floating-point datatype.
typedef struct func_s
{
    // Kernel function address.
    void_fp ptr[BLIS_NUM_FP_TYPES];

} func_t;


// -- Multi-boolean object type --

// One bool per floating-point datatype.
typedef struct mbool_s
{
    bool v[BLIS_NUM_FP_TYPES];

} mbool_t;


// -- Auxiliary kernel info type --

// Note: This struct is used by macro-kernels to package together extra
// parameter values that may be of use to the micro-kernel without
// cluttering up the micro-kernel interface itself.

typedef struct
{
    // The pack schemas of A and B.
    pack_t schema_a;
    pack_t schema_b;

    // Pointers to the micro-panels of A and B which will be used by the
    // next call to the micro-kernel.
    void*  a_next;
    void*  b_next;

    // The imaginary strides of A and B.
    inc_t  is_a;
    inc_t  is_b;

    // The panel strides of A and B.
    // NOTE: These are only used in situations where iteration over the
    // micropanels takes place in part within the kernel code (e.g. sup
    // millikernels).
    inc_t  ps_a;
    inc_t  ps_b;

    // The type to convert to on output.
    //num_t  dt_on_output;

    // (Virtual) microkernel address and additional parameters.
    void_fp ukr;
    void*   params;

} auxinfo_t;


// -- Global scalar constant data struct --

// Note: This struct is used only when statically initializing the
// global scalar constants in bli_const.c.
// It holds one member per representation: float, double, scomplex,
// dcomplex and gint_t.
typedef struct constdata_s
{
    float    s;
    double   d;
    scomplex c;
    dcomplex z;
    gint_t   i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the pack function that can be stored in obj_t.pack_fn.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of the kernel function that can be stored in obj_t.ker_fn.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// NOTE: the BLIS_OBJECT_INITIALIZER* macros, bli_obj_init_const() and the
// bli_obj_init_*() inline functions later in this header name these fields
// explicitly, so they must be revisited whenever a field is added, removed
// or renamed here.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t         off[2];
    dim_t         dim[2];
    doff_t        diag_off;

    objbits_t     info;
    objbits_t     info2;
    siz_t         elem_size;

    void*         buffer;
    inc_t         rs;
    inc_t         cs;
    inc_t         is;

    // Bufferless scalar storage
    atom_t        scalar;

    // Pack-related fields
    dim_t         m_padded; // m dimension of matrix, including any padding
    dim_t         n_padded; // n dimension of matrix, including any padding
    inc_t         ps;       // panel stride (distance to next panel)
    inc_t         pd;       // panel dimension (the "width" of a panel:
                            // usually MR or NR)
    dim_t         m_panel;  // m dimension of a "full" panel
    dim_t         n_panel;  // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void*         pack_params;
    obj_ker_fn_t  ker_fn;
    void*         ker_params;

} obj_t;

// Pre-initializors.
// Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Brace-enclosed designated initializer for an obj_t. The designators appear
// in the same order as the fields are declared in obj_t. Defaults visible
// here: NULL pointers, zero offsets/dims/strides except .is = 1, info set to
// BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL, and elem_size = sizeof( float ).
#define BLIS_OBJECT_INITIALIZER \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 0, 0 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn   = NULL, \
    .pack_params = NULL, \
    .ker_fn    = NULL, \
    .ker_params = NULL \
}

// Identical to BLIS_OBJECT_INITIALIZER except that .dim is { 1, 1 } instead
// of { 0, 0 }.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 1, 1 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn   = NULL, \
    .pack_params = NULL, \
    .ker_fn    = NULL, \
    .ker_params = NULL \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
// Make *b a full shallow copy of *a: every obj_t field is duplicated, and
// pointer members (root, buffer, pack_params, ker_params) end up referring
// to the same targets as in a.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    // A whole-struct assignment transfers root, off[], dim[], diag_off,
    // info, info2, elem_size, buffer, rs, cs, is, scalar, the pack-related
    // fields (m_padded, n_padded, ps, pd, m_panel, n_panel) and the
    // user-customizable fields (pack_fn, pack_params, ker_fn, ker_params),
    // i.e. exactly the members of obj_t.
    *b = *a;
}

// Initialize *b as a subpartition of *a: copies every obj_t field of a into
// b EXCEPT the dimensions, which the caller is expected to set afterwards.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    int i;

    // Root object, offsets within it, and diagonal offset.
    b->root = a->root;
    for ( i = 0; i < 2; ++i )
    {
        b->off[i] = a->off[i];
    }
    b->diag_off = a->diag_off;

    // b->dim[0] and b->dim[1] are intentionally NOT copied (and not read):
    // m and n will be overwritten by the caller.

    // Info bitfields and element size.
    b->info      = a->info;
    b->info2     = a->info2;
    b->elem_size = a->elem_size;

    // Buffer address and strides.
    b->buffer = a->buffer;
    b->rs     = a->rs;
    b->cs     = a->cs;
    b->is     = a->is;

    // Bufferless scalar storage.
    b->scalar = a->scalar;

    // No pack_mem entry is copied.
    // FGVZ: You should probably make sure this is right.

    // Pack-related fields.
    b->m_padded = a->m_padded;
    b->n_padded = a->n_padded;
    b->ps       = a->ps;
    b->pd       = a->pd;
    b->m_panel  = a->m_panel;
    b->n_panel  = a->n_panel;

    // User-customizable pack/kernel hooks and their parameters.
    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;
}

// Initializors for global scalar constants.
// NOTE: These must remain cpp macros since they are initializor
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/include/linux-arm64_no_sve/000077500000000000000000000000001427272030600227205ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/linux-arm64_no_sve/blis.h000066400000000000000000045753511427272030600240460ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). 
// begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_ARM64_NO_SVE // Enabled sub-configurations (config_list) #define BLIS_CONFIG_FIRESTORM #define BLIS_CONFIG_CORTEXA57 #define BLIS_CONFIG_CORTEXA53 #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_ARMV8A #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define 
BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. 
#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). #if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_lang_defs.h #ifndef BLIS_LANG_DEFS_H #define BLIS_LANG_DEFS_H // -- Undefine restrict for C++ and C89/90 -- #ifdef __cplusplus // Language is C++; define restrict as nothing. #ifndef restrict #define restrict #endif #elif __STDC_VERSION__ >= 199901L // Language is C99 (or later); do nothing since restrict is recognized. #else // Language is pre-C99; define restrict as nothing. 
#ifndef restrict #define restrict #endif #endif // -- Define typeof() operator if using non-GNU compiler -- #ifndef __GNUC__ #define typeof __typeof__ #else #ifndef typeof #define typeof __typeof__ #endif #endif // -- BLIS Thread Local Storage Keyword -- // __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support __thread, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) #define BLIS_THREAD_LOCAL __thread #else #define BLIS_THREAD_LOCAL #endif // -- BLIS constructor/destructor function attribute -- // __attribute__((constructor/destructor)) is supported by GCC only. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support this, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__ICC) || defined(__INTEL_COMPILER) // ICC defines __GNUC__ but doesn't support this #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #elif defined(__clang__) // CLANG supports __attribute__, but its documentation doesn't // mention support for constructor/destructor. Compiling with // clang and testing shows that it does support. 
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. 
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. 
Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specified by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. #ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overridden by the main // program simply by providing a new definition.
However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // -- Common BLIS definitions -- // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// NOTE: The macro tested here must be BLIS_BLAS_INT_TYPE_SIZE, which is the
// name under which the BLAS integer size is configured and by which f77_int
// is selected. An identifier that is not defined as a macro evaluates to 0
// inside #if, so testing any other (undefined) name would silently disable
// this guard.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
  #undef  BLIS_INT_TYPE_SIZE
  #define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested.
// gint_t/guint_t are the signed/unsigned general-purpose integers from which
// the special-purpose integer types below are derived. Any size other than
// 32 or 64 (including an undefined BLIS_INT_TYPE_SIZE) selects "long int".
#if BLIS_INT_TYPE_SIZE == 32
typedef           int32_t  gint_t;
typedef          uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
typedef           int64_t  gint_t;
typedef          uint64_t guint_t;
#else
typedef   signed long int  gint_t;
typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h.
#ifndef TRUE
  #define TRUE  true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
#define BLIS_SIZEOF_S 4 // sizeof(float) #define BLIS_SIZEOF_D 8 // sizeof(double) #define BLIS_SIZEOF_C 8 // sizeof(scomplex) #define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) // -- Complex types -- #ifdef BLIS_ENABLE_C99_COMPLEX #if __STDC_VERSION__ >= 199901L #include // skipped // Typedef official complex types to BLIS complex type names. typedef float complex scomplex; typedef double complex dcomplex; #else #error "Configuration requested C99 complex types, but C99 does not appear to be supported." #endif #else // ifndef BLIS_ENABLE_C99_COMPLEX // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_SCOMPLEX #define _DEFINED_SCOMPLEX typedef struct scomplex { float real; float imag; } scomplex; #endif // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DCOMPLEX #define _DEFINED_DCOMPLEX typedef struct dcomplex { double real; double imag; } dcomplex; #endif #endif // BLIS_ENABLE_C99_COMPLEX // -- Atom type -- // Note: atom types are used to hold "bufferless" scalar object values. Note // that it needs to be as large as the largest possible scalar value we might // want to hold. Thus, for now, it is a dcomplex. typedef dcomplex atom_t; // -- Fortran-77 types -- // Note: These types are typically only used by BLAS compatibility layer, but // we must define them even when the compatibility layer isn't being built // because they also occur in bli_slamch() and bli_dlamch(). // Define f77_int depending on what size of integer was requested. #if BLIS_BLAS_INT_TYPE_SIZE == 32 typedef int32_t f77_int; #elif BLIS_BLAS_INT_TYPE_SIZE == 64 typedef int64_t f77_int; #else typedef long int f77_int; #endif typedef char f77_char; typedef float f77_float; typedef double f77_double; typedef scomplex f77_scomplex; typedef dcomplex f77_dcomplex; // -- Misc. 
function pointer types -- // Note: This type should be used in any situation where the address of a // *function* will be conveyed or stored prior to it being typecast back // to the correct function type. It does not need to be used when conveying // or storing the address of *data* (such as an array of float or double). //typedef void (*void_fp)( void ); typedef void* void_fp; // Typedef function pointer types for malloc() and free() substitutes. typedef void* (*malloc_ft)( size_t size ); typedef void (*free_ft) ( void* p ); // // -- BLIS info bit field offsets ---------------------------------------------- // // info #define BLIS_DATATYPE_SHIFT 0 #define BLIS_DOMAIN_SHIFT 0 #define BLIS_PRECISION_SHIFT 1 #define BLIS_CONJTRANS_SHIFT 3 #define BLIS_TRANS_SHIFT 3 #define BLIS_CONJ_SHIFT 4 #define BLIS_UPLO_SHIFT 5 #define BLIS_UPPER_SHIFT 5 #define BLIS_DIAG_SHIFT 6 #define BLIS_LOWER_SHIFT 7 #define BLIS_UNIT_DIAG_SHIFT 8 #define BLIS_INVERT_DIAG_SHIFT 9 #define BLIS_TARGET_DT_SHIFT 10 #define BLIS_TARGET_DOMAIN_SHIFT 10 #define BLIS_TARGET_PREC_SHIFT 11 #define BLIS_EXEC_DT_SHIFT 13 #define BLIS_EXEC_DOMAIN_SHIFT 13 #define BLIS_EXEC_PREC_SHIFT 14 #define BLIS_PACK_SCHEMA_SHIFT 16 #define BLIS_PACK_RC_SHIFT 16 #define BLIS_PACK_PANEL_SHIFT 17 #define BLIS_PACK_FORMAT_SHIFT 18 #define BLIS_PACK_SHIFT 22 #define BLIS_PACK_REV_IF_UPPER_SHIFT 23 #define BLIS_PACK_REV_IF_LOWER_SHIFT 24 #define BLIS_PACK_BUFFER_SHIFT 25 #define BLIS_STRUC_SHIFT 27 #define BLIS_COMP_DT_SHIFT 29 #define BLIS_COMP_DOMAIN_SHIFT 29 #define BLIS_COMP_PREC_SHIFT 30 // info2 #define BLIS_SCALAR_DT_SHIFT 0 #define BLIS_SCALAR_DOMAIN_SHIFT 0 #define BLIS_SCALAR_PREC_SHIFT 1 // // -- BLIS info bit field masks ------------------------------------------------ // // info #define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT ) #define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT ) #define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT ) #define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT ) 
#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define 
BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 
0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = 
BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } 
machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. #define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, 
BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, 
BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // - keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
} bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. // Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. 
// begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
// bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );

#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t* barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h

// -- Pool block type --

// A buffer pointer paired with a block size.
typedef struct
{
    void* buf;
    siz_t block_size;
} pblk_t;

// -- Pool type --

// Bookkeeping for a pool of blocks plus the malloc/free function pointers
// stored with it.
// NOTE(review): block_ptrs is untyped here; presumably an array of pblk_t --
// confirm against the pool implementation.
typedef struct
{
    void* block_ptrs;
    dim_t block_ptrs_len;

    dim_t top_index;
    dim_t num_blocks;

    siz_t block_size;
    siz_t align_size;
    siz_t offset_size;

    malloc_ft malloc_fp;
    free_ft free_fp;
} pool_t;

// -- Array type --

// An untyped buffer described by an element count and an element size.
typedef struct
{
    void* buf;

    siz_t num_elem;
    siz_t elem_size;
} array_t;

// -- Locked pool-of-arrays-of-pools type --

// A pool_t stored together with a mutex and a default array length.
typedef struct
{
    bli_pthread_mutex_t mutex;
    pool_t pool;

    siz_t def_array_len;
} apool_t;

// -- packing block allocator: Locked set of pools type --

// Three pool_t instances stored together with a single mutex.
typedef struct pba_s
{
    pool_t pools[3];
    bli_pthread_mutex_t mutex;

    // These fields are used for general-purpose allocation.
    siz_t align_size;
    malloc_ft malloc_fp;
    free_ft free_fp;
} pba_t;

// -- Memory object type --

// A pblk_t together with its packbuf_t type, a pointer to a pool_t, and a
// size.
typedef struct mem_s
{
    pblk_t pblk;
    packbuf_t buf_type;
    pool_t* pool;
    siz_t size;
} mem_t;

// -- Control tree node type --

// Each node points to up to two other nodes (sub_prenode and sub_node).
struct cntl_s
{
    // Basic fields (usually required).
    opid_t family;
    bszid_t bszid;
    void_fp var_func;
    struct cntl_s* sub_prenode;
    struct cntl_s* sub_node;

    // Optional fields (needed only by some operations such as packm).
    // NOTE: first field of params must be a uint64_t containing the size
    // of the struct.
    void* params;

    // Internal fields that track "cached" data.
    mem_t pack_mem;
};
typedef struct cntl_s cntl_t;

// -- Blocksize object type --

// Holds one primary value and one extension per floating-point datatype
// (both arrays have BLIS_NUM_FP_TYPES entries).
typedef struct blksz_s
{
    // Primary blocksize values.
    dim_t v[BLIS_NUM_FP_TYPES];

    // Blocksize extensions.
    dim_t e[BLIS_NUM_FP_TYPES];
} blksz_t;

// -- Function pointer object type --

// Holds one untyped function pointer per floating-point datatype.
typedef struct func_s
{
    // Kernel function address.
    void_fp ptr[BLIS_NUM_FP_TYPES];
} func_t;

// -- Multi-boolean object type --

// Holds one bool per floating-point datatype.
typedef struct mbool_s
{
    bool v[BLIS_NUM_FP_TYPES];
} mbool_t;

// -- Auxiliary kernel info type --

// Note: This struct is used by macro-kernels to package together extra
// parameter values that may be of use to the micro-kernel without
// cluttering up the micro-kernel interface itself.

typedef struct
{
    // The pack schemas of A and B.
    pack_t schema_a;
    pack_t schema_b;

    // Pointers to the micro-panels of A and B which will be used by the
    // next call to the micro-kernel.
    void* a_next;
    void* b_next;

    // The imaginary strides of A and B.
    inc_t is_a;
    inc_t is_b;

    // The panel strides of A and B.
    // NOTE: These are only used in situations where iteration over the
    // micropanels takes place in part within the kernel code (e.g. sup
    // millikernels).
    inc_t ps_a;
    inc_t ps_b;

    // The type to convert to on output.
    //num_t dt_on_output;

    // (Virtual) microkernel address and additional parameters.
    void_fp ukr;
    void* params;
} auxinfo_t;

// -- Global scalar constant data struct --

// Note: This struct is used only when statically initializing the
// global scalar constants in bli_const.c.
// One member per supported representation: float, double, scomplex, dcomplex
// and gint_t.
typedef struct constdata_s
{
    float s;
    double d;
    scomplex c;
    dcomplex z;
    gint_t i;
} constdata_t;

//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the function stored in obj_t.pack_fn: takes two objects (a and
// ap) plus the context, runtime, control tree node and thread info pointers.
typedef void (*obj_pack_fn_t)
     (
       struct obj_s* a,
       struct obj_s* ap,
       struct cntx_s* cntx,
       struct rntm_s* rntm,
       struct cntl_s* cntl,
       struct thrinfo_s* thread
     );

// Signature of the function stored in obj_t.ker_fn: takes three objects (a, b
// and c) plus the same context, runtime, control tree and thread arguments.
typedef void (*obj_ker_fn_t)
     (
       struct obj_s* a,
       struct obj_s* b,
       struct obj_s* c,
       struct cntx_s* cntx,
       struct rntm_s* rntm,
       struct cntl_s* cntl,
       struct thrinfo_s* thread
     );

// The BLIS object type. The initializer macros and the inline copy functions
// that follow this struct name every field explicitly, so they must be kept
// in sync with any change made here.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t off[2];
    dim_t dim[2];
    doff_t diag_off;

    objbits_t info;
    objbits_t info2;
    siz_t elem_size;

    void* buffer;
    inc_t rs;
    inc_t cs;
    inc_t is;

    // Bufferless scalar storage
    atom_t scalar;

    // Pack-related fields
    dim_t m_padded; // m dimension of matrix, including any padding
    dim_t n_padded; // n dimension of matrix, including any padding
    inc_t ps;       // panel stride (distance to next panel)
    inc_t pd;       // panel dimension (the "width" of a panel:
                    // usually MR or NR)
    dim_t m_panel;  // m dimension of a "full" panel
    dim_t n_panel;  // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void* pack_params;
    obj_ker_fn_t ker_fn;
    void* ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Designated initializer for an obj_t with zero dimensions.
#define BLIS_OBJECT_INITIALIZER \
{ \
    .root = NULL, \
\
    .off = { 0, 0 }, \
    .dim = { 0, 0 }, \
    .diag_off = 0, \
\
    .info = 0x0 | BLIS_BITVAL_DENSE | \
            BLIS_BITVAL_GENERAL, \
    .info2 = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer = NULL, \
    .rs = 0, \
    .cs = 0, \
    .is = 1, \
\
    .scalar = { 0.0, 0.0 }, \
\
    .m_padded = 0, \
    .n_padded = 0, \
    .ps = 0, \
    .pd = 0, \
    .m_panel = 0, \
    .n_panel = 0, \
\
    .pack_fn = NULL, \
    .pack_params = NULL, \
    .ker_fn = NULL, \
    .ker_params = NULL \
}

// Same as BLIS_OBJECT_INITIALIZER except that dim is { 1, 1 }.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root = NULL, \
\
    .off = { 0, 0 }, \
    .dim = { 1, 1 }, \
    .diag_off = 0, \
\
    .info = 0x0 | BLIS_BITVAL_DENSE | \
            BLIS_BITVAL_GENERAL, \
    .info2 = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer = NULL, \
    .rs = 0, \
    .cs = 0, \
    .is = 1, \
\
    .scalar = { 0.0, 0.0 }, \
\
    .m_padded = 0, \
    .n_padded = 0, \
    .ps = 0, \
    .pd = 0, \
    .m_panel = 0, \
    .n_panel = 0, \
\
    .pack_fn = NULL, \
    .pack_params = NULL, \
    .ker_fn = NULL, \
    .ker_params = NULL \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
// Copy every field of obj_t a into obj_t b. Pointer members (root, buffer,
// pack_params, ker_params) are copied by value, so after the call b refers to
// the same underlying storage as a.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    // Two-element arrays: offsets and dimensions.
    for ( int k = 0; k < 2; ++k )
    {
        b->off[k] = a->off[k];
        b->dim[k] = a->dim[k];
    }

    // Identity, buffer and element geometry.
    b->root      = a->root;
    b->buffer    = a->buffer;
    b->elem_size = a->elem_size;
    b->diag_off  = a->diag_off;

    // Info bitfields.
    b->info  = a->info;
    b->info2 = a->info2;

    // Strides.
    b->rs = a->rs;
    b->cs = a->cs;
    b->is = a->is;

    // Bufferless scalar storage.
    b->scalar = a->scalar;

    // Pack-related fields.
    b->m_padded = a->m_padded;
    b->n_padded = a->n_padded;
    b->m_panel  = a->m_panel;
    b->n_panel  = a->n_panel;
    b->ps       = a->ps;
    b->pd       = a->pd;

    // User-customizable fields.
    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;
}

// Copy every field of obj_t a into obj_t b except dim[0] and dim[1], which
// are left untouched in b because the caller overwrites them.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    // Offsets only; b->dim is deliberately not written here.
    for ( int k = 0; k < 2; ++k )
    {
        b->off[k] = a->off[k];
    }

    // Identity, buffer and element geometry.
    b->root      = a->root;
    b->buffer    = a->buffer;
    b->elem_size = a->elem_size;
    b->diag_off  = a->diag_off;

    // Info bitfields.
    b->info  = a->info;
    b->info2 = a->info2;

    // Strides.
    b->rs = a->rs;
    b->cs = a->cs;
    b->is = a->is;

    // Bufferless scalar storage.
    b->scalar = a->scalar;

    // Pack-related fields.
    b->m_padded = a->m_padded;
    b->n_padded = a->n_padded;
    b->m_panel  = a->m_panel;
    b->n_panel  = a->n_panel;
    b->ps       = a->ps;
    b->pd       = a->pd;

    // User-customizable fields.
    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// PASTEMACn: paste "bli_", n one-token prefixes, and op into one identifier.
// Each macro forwards to its underscore-suffixed twin so that arguments which
// are themselves macros are expanded before the ## operator is applied.
#define PASTEMAC0_(op) bli_ ## op
#define PASTEMAC0(op) PASTEMAC0_(op)

#define PASTEMAC_(ch,op) bli_ ## ch ## op
#define PASTEMAC(ch,op) PASTEMAC_(ch,op)

#define PASTEMAC2_(ch1,ch2,op) bli_ ## ch1 ## ch2 ## op
#define PASTEMAC2(ch1,ch2,op) PASTEMAC2_(ch1,ch2,op)

#define PASTEMAC3_(ch1,ch2,ch3,op) bli_ ## ch1 ## ch2 ## ch3 ## op
#define PASTEMAC3(ch1,ch2,ch3,op) PASTEMAC3_(ch1,ch2,ch3,op)

#define PASTEMAC4_(ch1,ch2,ch3,ch4,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
#define PASTEMAC4(ch1,ch2,ch3,ch4,op) PASTEMAC4_(ch1,ch2,ch3,ch4,op)

#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op) PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)

#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op) PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)

// PASTEBLACHK: builds the identifier bla_<op>_check.
#define PASTEBLACHK_(op) bla_ ## op ## _check
#define PASTEBLACHK(op) PASTEBLACHK_(op)

// PASTECHn: same as PASTEMACn but without the "bli_" prefix.
#define PASTECH0_(op) op
#define PASTECH0(op) PASTECH0_(op)

#define PASTECH_(ch,op) ch ## op
#define PASTECH(ch,op) PASTECH_(ch,op)

#define PASTECH2_(ch1,ch2,op) ch1 ## ch2 ## op
#define PASTECH2(ch1,ch2,op) PASTECH2_(ch1,ch2,op)

#define PASTECH3_(ch1,ch2,ch3,op) ch1 ## ch2 ## ch3 ## op
#define PASTECH3(ch1,ch2,ch3,op) PASTECH3_(ch1,ch2,ch3,op)

// MKSTR stringizes its argument as written; STRINGIFY_INT adds one level of
// indirection so that a macro argument is expanded before being stringized.
#define MKSTR(s1) #s1
#define STRINGIFY_INT( s ) MKSTR( s )

// Fortran-77 name-mangling macros.
#define PASTEF770(name) name ## _ #define PASTEF77(ch1,name) ch1 ## name ## _ #define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _ #define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _ // -- Include other groups of macros // begin bli_genarray_macro_defs.h #ifndef BLIS_GENARRAY_MACRO_DEFS_H #define BLIS_GENARRAY_MACRO_DEFS_H // -- Macros to generate function arrays --------------------------------------- // -- "Smart" one-operand macro -- #define GENARRAY_FPA(tname,opname) \ \ static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \ { \ ( tname )PASTEMAC(s,opname), \ ( tname )PASTEMAC(c,opname), \ ( tname )PASTEMAC(d,opname), \ ( tname )PASTEMAC(z,opname) \ } // -- "Smart" one-operand macro (with integer support) -- #define GENARRAY_FPA_I(tname,opname) \ \ static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \ { \ ( tname )PASTEMAC(s,opname), \ ( tname )PASTEMAC(c,opname), \ ( tname )PASTEMAC(d,opname), \ ( tname )PASTEMAC(z,opname), \ ( tname )PASTEMAC(i,opname) \ } // -- "Smart" two-operand macro -- #define GENARRAY_FPA2(tname,op) \ \ static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \ { ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \ { ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \ { ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) } \ } // -- "Smart" two-operand macro -- // -- One-operand macro -- #define GENARRAY(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES] = \ { \ PASTEMAC(s,op), \ PASTEMAC(c,op), \ PASTEMAC(d,op), \ PASTEMAC(z,op) \ } #define GENARRAY_I(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES+1] = \ { \ PASTEMAC(s,op), \ PASTEMAC(c,op), \ PASTEMAC(d,op), \ PASTEMAC(z,op), \ PASTEMAC(i,op) \ } // 
-- One-operand macro (with custom prefix) -- #define GENARRAY_PREF(arrayname,prefix,op) \ \ arrayname[BLIS_NUM_FP_TYPES] = \ { \ PASTECH2(prefix,s,op), \ PASTECH2(prefix,c,op), \ PASTECH2(prefix,d,op), \ PASTECH2(prefix,z,op) \ } // -- Two-operand macros -- #define GENARRAY2_ALL(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \ { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \ { PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \ { PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) } \ } #define GENARRAY2_EXT(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL, NULL, }, \ { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL, NULL, }, \ { NULL, NULL, PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \ { NULL, NULL, PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) } \ } #define GENARRAY2_MIN(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { PASTEMAC2(s,s,op), NULL, NULL, NULL, }, \ { NULL, PASTEMAC2(c,c,op), NULL, NULL, }, \ { NULL, NULL, PASTEMAC2(d,d,op), NULL, }, \ { NULL, NULL, NULL, PASTEMAC2(z,z,op) } \ } // -- Three-operand macros -- #define GENARRAY3_ALL(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { \ { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \ { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \ { PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \ { PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) } \ }, \ { \ { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \ { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \ { PASTEMAC3(c,d,s,op), 
PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \ { PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) } \ }, \ { \ { PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \ { PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \ { PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \ { PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) } \ }, \ { \ { PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \ { PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \ { PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \ { PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) } \ } \ } #define GENARRAY3_EXT(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { \ { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL, NULL, }, \ { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL, NULL, }, \ { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \ { NULL, NULL, PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \ { NULL, NULL, PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) } \ } \ } #define GENARRAY3_MIN(arrayname,op) \ \ arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \ { \ { \ { PASTEMAC3(s,s,s,op), NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, 
}, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, PASTEMAC3(c,c,c,op), NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, PASTEMAC3(d,d,d,op), NULL, }, \ { NULL, NULL, NULL, NULL, } \ }, \ { \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, NULL, }, \ { NULL, NULL, NULL, PASTEMAC3(z,z,z,op) } \ } \ } #endif // end bli_genarray_macro_defs.h // begin bli_gentdef_macro_defs.h #ifndef BLIS_GENTDEF_MACRO_DEFS_H #define BLIS_GENTDEF_MACRO_DEFS_H // // -- MACROS TO INSERT TYPEDEF-GENERATING MACROS ------------------------------- // // -- function typedef macro (both typed and void) -- #define INSERT_GENTDEF( opname ) \ \ GENTDEF( float, s, opname, _ft ) \ GENTDEF( double, d, opname, _ft ) \ GENTDEF( scomplex, c, opname, _ft ) \ GENTDEF( dcomplex, z, opname, _ft ) \ \ GENTDEF( void, s, opname, _vft ) \ GENTDEF( void, d, opname, _vft ) \ GENTDEF( void, c, opname, _vft ) \ GENTDEF( void, z, opname, _vft ) \ \ GENTDEF( void, , opname, _vft ) // -- function typedef macro (both typed and void) with real projection -- #define INSERT_GENTDEFR( opname ) \ \ GENTDEFR( float, float, s, s, opname, _ft ) \ GENTDEFR( double, double, d, d, opname, _ft ) \ GENTDEFR( scomplex, float, c, s, opname, _ft ) \ GENTDEFR( dcomplex, double, z, d, opname, _ft ) \ \ GENTDEFR( void, void, s, s, opname, _vft ) \ GENTDEFR( void, void, d, d, opname, _vft ) \ GENTDEFR( void, void, c, s, opname, _vft ) \ GENTDEFR( void, void, z, d, opname, _vft ) \ \ GENTDEFR( void, void, , , opname, _vft ) #endif // end bli_gentdef_macro_defs.h // begin bli_gentfunc_macro_defs.h #ifndef BLIS_GENTFUNC_MACRO_DEFS_H #define BLIS_GENTFUNC_MACRO_DEFS_H // // -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------ // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- 
// Each INSERT_*_BLAS macro in this group expands to a fixed list of
// invocations of one GENT* generator macro, forwarding its own arguments
// unchanged after the per-datatype type names and type characters. The GENT*
// macros are not defined in this section; the definition in effect at the
// point of use decides what is generated.

// Expands GENTFUNC for float/s, double/d, scomplex/c and dcomplex/z.
#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \
\
GENTFUNC( float, s, blasname, blisname ) \
GENTFUNC( double, d, blasname, blisname ) \
GENTFUNC( scomplex, c, blasname, blisname ) \
GENTFUNC( dcomplex, z, blasname, blisname )

// -- Basic one-operand macro with real domain only --

// Expands GENTFUNCRO for float/s and double/d only.
#define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \
\
GENTFUNCRO( float, s, blasname, blisname ) \
GENTFUNCRO( double, d, blasname, blisname )

// -- Basic one-operand macro with complex domain only and real projection --

// Expands GENTFUNCCO for scomplex/c and dcomplex/z, also passing the
// matching real type and char (float/s, double/d).
#define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \
\
GENTFUNCCO( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCCO( dcomplex, double, z, d, blasname, blisname )

// -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) --

// Expands GENTFUNCDOT for float/s and double/d with an empty third argument
// and BLIS_NO_CONJUGATE as the fourth.
#define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
\
GENTFUNCDOT( float, s, , BLIS_NO_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( double, d, , BLIS_NO_CONJUGATE, blasname, blisname )

// -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) --

// Expands GENTFUNCDOT twice per complex type: once with third argument c and
// BLIS_CONJUGATE, once with third argument u and BLIS_NO_CONJUGATE.
#define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \
\
GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname )

// -- Basic one-operand macro with conjugation (used only for dot, ger) --

// Real instances followed by complex instances: simply chains the two macros
// defined directly above.
#define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \
\
INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
INSERT_GENTFUNCDOTC_BLAS( blasname, blisname )

// -- Basic one-operand macro with real projection --

// Expands GENTFUNCR for all four types. The real instances receive
// rblasname, the complex instances receive cblasname.
#define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \
\
GENTFUNCR( float, float, s, s, rblasname, blisname ) \
GENTFUNCR( double, double, d, d, rblasname, blisname ) \
GENTFUNCR( scomplex, float, c, s, cblasname, blisname ) \
GENTFUNCR( dcomplex, double, z, d, cblasname, blisname )

// -- Alternate
// two-operand macro (one char for complex, one for real proj) --

// Each INSERT_* macro in this group expands to a fixed list of invocations of
// one GENT* generator macro, forwarding its own arguments unchanged after the
// per-datatype type names and type characters. The GENT* macros are not
// defined in this section; the definition in effect at the point of use
// decides what is generated.

// Expands GENTFUNCR2 for all four types. The fourth argument is empty for
// the real types and s / d for scomplex / dcomplex.
#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
\
GENTFUNCR2( float, float, s, , blasname, blisname ) \
GENTFUNCR2( double, double, d, , blasname, blisname ) \
GENTFUNCR2( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )

// -- Extended two-operand macro (used only for scal) --

// Six instances: the four same-type pairs (fourth argument empty) followed by
// the two complex/real pairs scomplex+float (c, s) and dcomplex+double (z, d).
#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
\
GENTFUNCSCAL( float, float, s, , blasname, blisname ) \
GENTFUNCSCAL( double, double, d, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \
GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname )

// -- Macros for functions with one operand ------------------------------------

// -- Basic one-operand macro --

// The BASIC0 / BASIC / BASIC2 / BASIC3 / BASIC4 variants differ only in how
// many extra arguments (zero to four) are forwarded after tfuncname.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
\
GENTFUNC( float, s, tfuncname ) \
GENTFUNC( double, d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
\
GENTFUNC( float, s, tfuncname, varname ) \
GENTFUNC( double, d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )

// -- Basic one-operand with real projection --

// Each GENTFUNCR invocation receives the type and its char together with a
// second type and char: float/s for float and scomplex, double/d for double
// and dcomplex.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
\
GENTFUNCR( float, float, s, s, tfuncname ) \
GENTFUNCR( double, double, d, d, tfuncname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname ) \
GENTFUNCR( double, double, d, d, tfuncname, varname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
// arguments) --

// Each INSERT_* macro in this section expands to a fixed list of invocations
// of one GENT* generator macro, forwarding its own arguments unchanged after
// the per-datatype type names and type characters (float/s, double/d,
// scomplex/c, dcomplex/z, and gint_t/i where listed). The GENT* macros are
// not defined in this section; the definition in effect at the point of use
// decides what is generated.
//
// The last invocation of every definition carries no trailing line
// continuation, so each definition ends on that line regardless of what
// follows it.

#define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )

// -- Basic one-operand macro with real domain only --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \
\
GENTFUNCRO( float, s, tfuncname ) \
GENTFUNCRO( double, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \
\
GENTFUNCRO( float, s, tfuncname, varname ) \
GENTFUNCRO( double, d, tfuncname, varname )

// -- Basic one-operand macro with complex domain only and real projection --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- Basic one-operand macro with integer instance --

// Same four instances as the basic one-operand macros, plus a fifth with
// type gint_t and char i.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \
\
GENTFUNC( float, s, tfuncname ) \
GENTFUNC( double, d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname ) \
GENTFUNC( gint_t, i, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \
\
GENTFUNC( float, s, tfuncname, varname ) \
GENTFUNC( double, d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname ) \
GENTFUNC( gint_t, i, tfuncname, varname )

// -- Basic one-operand with integer projection --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCI_BASIC0( tfuncname ) \
\
GENTFUNCI( float, gint_t, s, i, tfuncname ) \
GENTFUNCI( double, gint_t, d, i, tfuncname ) \
GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \
GENTFUNCI( dcomplex, gint_t, z, i, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \
\
GENTFUNCI( float, gint_t, s, i, tfuncname, varname ) \
GENTFUNCI( double, gint_t, d, i, tfuncname, varname ) \
GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \
GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname )

// -- Basic one-operand with real and integer projections --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \
\
GENTFUNCRI( float, float, gint_t, s, s, i, tfuncname ) \
GENTFUNCRI( double, double, gint_t, d, d, i, tfuncname ) \
GENTFUNCRI( scomplex, float, gint_t, c, s, i, tfuncname ) \
GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname )

// -- Macros for functions with two primary operands ---------------------------

// -- Basic two-operand macro --

// Both operands have the same type in every instance.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_BASIC0( tfuncname ) \
\
GENTFUNC2( float, float, s, s, tfuncname ) \
GENTFUNC2( double, double, d, d, tfuncname ) \
GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \
GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \
\
GENTFUNC2( float, float, s, s, tfuncname, varname ) \
GENTFUNC2( double, double, d, d, tfuncname, varname ) \
GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \
GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname )

// -- Mixed domain two-operand macro --

// Pairs of one real and one complex type of the same precision.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \
\
GENTFUNC2( float, scomplex, s, c, tfuncname ) \
GENTFUNC2( scomplex, float, c, s, tfuncname ) \
\
GENTFUNC2( double, dcomplex, d, z, tfuncname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \
\
GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \
GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \
\
GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname, varname )

// -- Mixed precision two-operand macro --

// Pairs whose two types differ in precision (eight instances).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \
\
GENTFUNC2( float, double, s, d, tfuncname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname ) \
\
GENTFUNC2( double, float, d, s, tfuncname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname ) \
\
GENTFUNC2( scomplex, double, c, d, tfuncname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \
\
GENTFUNC2( float, double, s, d, tfuncname, varname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTFUNC2( double, float, d, s, tfuncname, varname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \
\
GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )

// -- Mixed domain/precision (all) two-operand macro --

// All twelve ordered pairs of two different types.
// NOTE: the no-argument variant is spelled MIXDP0 (no underscore before DP),
// unlike its one-argument sibling INSERT_GENTFUNC2_MIX_DP; the name is kept
// as-is for existing users.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIXDP0( tfuncname ) \
\
GENTFUNC2( float, double, s, d, tfuncname ) \
GENTFUNC2( float, scomplex, s, c, tfuncname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname ) \
\
GENTFUNC2( double, float, d, s, tfuncname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname ) \
GENTFUNC2( double, dcomplex, d, z, tfuncname ) \
\
GENTFUNC2( scomplex, float, c, s, tfuncname ) \
GENTFUNC2( scomplex, double, c, d, tfuncname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \
\
GENTFUNC2( float, double, s, d, tfuncname, varname ) \
GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTFUNC2( double, float, d, s, tfuncname, varname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \
GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \
\
GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )

// -- Basic two-operand with real projection of second operand --

// Each GENTFUNC2R invocation receives the two operand types, then a third
// type that is float when the second type is float/scomplex and double when
// it is double/dcomplex, followed by the three matching chars.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \
\
GENTFUNC2R( float, float, float, s, s, s, tfuncname ) \
GENTFUNC2R( double, double, double, d, d, d, tfuncname ) \
GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname ) \
GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \
\
GENTFUNC2R( float, float, float, s, s, s, tfuncname, varname ) \
GENTFUNC2R( double, double, double, d, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )

// -- Mixed domain two-operand with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \
\
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \
\
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \
\
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname )

// -- Mixed precision two-operand with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \
\
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \
\
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname )

// -- Mixed domain/precision (all) two-operand macro with real projection of second operand --

// NOTE: as with INSERT_GENTFUNC2_MIXDP0, the no-argument variant is spelled
// MIXDP0 while the one-argument variant is spelled MIX_DP.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \
\
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
\
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname )

// -- Macros for functions with three primary operands -------------------------

// -- Basic three-operand macro --

// All three operands have the same type in every instance.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_BASIC0( tfuncname ) \
\
GENTFUNC3( float, float, float, s, s, s, tfuncname ) \
GENTFUNC3( double, double, double, d, d, d, tfuncname ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \
\
GENTFUNC3( float, float, float, s, s, s, tfuncname, varname ) \
GENTFUNC3( double, double, double, d, d, d, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, float, float, s, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, double, double, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 )

// -- Mixed domain three-operand macro --

// Twelve instances: same-precision triples that are neither all real nor
// all complex.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \
\
GENTFUNC3( float, float, scomplex, s, s, c, tfuncname ) \
GENTFUNC3( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname ) \
\
GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname ) \
GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname ) \
GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname ) \
\
GENTFUNC3( scomplex, float, float, c, s, s, tfuncname ) \
GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname ) \
GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname ) \
\
GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname ) \
GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \
\
GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname ) \
GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname ) \
\
GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname ) \
GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname ) \
GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname1, varname2 )

// -- Mixed precision three-operand macro --

// Forty-eight instances, twelve per first-operand type: the triples in which
// at least two operands differ in precision.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \
\
GENTFUNC3( float, float, double, s, s, d, tfuncname ) \
GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname ) \
\
GENTFUNC3( float, double, float, s, d, s, tfuncname ) \
GENTFUNC3( float, double, double, s, d, d, tfuncname ) \
GENTFUNC3( float, double, scomplex, s, d, c, tfuncname ) \
GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname ) \
\
GENTFUNC3( float, scomplex, double, s, c, d, tfuncname ) \
GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname ) \
\
GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname ) \
GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname ) \
GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname ) \
GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname ) \
\
\
GENTFUNC3( double, float, float, d, s, s, tfuncname ) \
GENTFUNC3( double, float, double, d, s, d, tfuncname ) \
GENTFUNC3( double, float, scomplex, d, s, c, tfuncname ) \
GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname ) \
\
GENTFUNC3( double, double, float, d, d, s, tfuncname ) \
GENTFUNC3( double, double, scomplex, d, d, c, tfuncname ) \
\
GENTFUNC3( double, scomplex, float, d, c, s, tfuncname ) \
GENTFUNC3( double, scomplex, double, d, c, d, tfuncname ) \
GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname ) \
GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname ) \
\
GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname ) \
GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname ) \
\
\
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname ) \
GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname ) \
\
GENTFUNC3( scomplex, double, float, c, d, s, tfuncname ) \
GENTFUNC3( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname ) \
GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname ) \
\
GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \
\
GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \
\
\
GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname ) \
GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname ) \
GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname ) \
\
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \
\
GENTFUNC3( float, float, double, s, s, d, tfuncname, varname ) \
GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname ) \
\
GENTFUNC3( float, double, float, s, d, s, tfuncname, varname ) \
GENTFUNC3( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname ) \
GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname ) \
\
GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname ) \
GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname ) \
\
GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname ) \
GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname ) \
GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname ) \
GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname ) \
\
\
GENTFUNC3( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC3( double, float, double, d, s, d, tfuncname, varname ) \
GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname ) \
GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname ) \
\
GENTFUNC3( double, double, float, d, d, s, tfuncname, varname ) \
GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname ) \
\
GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname ) \
GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname ) \
GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname ) \
GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname ) \
\
GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname ) \
GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname ) \
\
\
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname ) \
GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname ) \
GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname ) \
GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \
\
\
GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, float, double, s, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, double, float, s, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, double, double, s, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( double, float, float, d, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, float, double, d, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, double, float, d, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 )

// -- Basic three-operand with union of operands 1 and 2 --

// Each GENTFUNC3U12 invocation receives the three operand types, a fourth
// type, and then the four matching chars.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname1, varname2 )

// -- Mixed domain three-operand with union of operands 1 and 2 --

// In every instance below the fourth type is complex when either of the
// first two types is complex, and real otherwise.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname )

// -- (one auxiliary argument) --

// NOTE: this definition continues past the end of this section; its text is
// reproduced unchanged up to that point.
#define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z,
d, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \ GENTFUNC3U12( float, 
double, dcomplex, double, s, d, z, d, tfuncname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, 
dcomplex, scomplex, c, c, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, 
dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, 
tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, 
double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( 
scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, 
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 ) #endif // end bli_gentfunc_macro_defs.h // begin bli_gentprot_macro_defs.h #ifndef BLIS_GENTPROT_MACRO_DEFS_H #define BLIS_GENTPROT_MACRO_DEFS_H // // -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS ----------------------------- // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- #define INSERT_GENTPROT_BLAS( blasname ) \ \ GENTPROT( float, s, blasname ) \ GENTPROT( double, d, blasname ) \ GENTPROT( scomplex, c, blasname ) \ GENTPROT( dcomplex, z, blasname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTPROTRO_BLAS( blasname ) \ \ GENTPROTRO( float, s, blasname ) \ GENTPROTRO( double, d, blasname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTPROTCO_BLAS( blasname ) \ \ GENTPROTCO( scomplex, float, c, s, blasname ) \ GENTPROTCO( dcomplex, double, z, d, blasname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTR_BLAS( blasname ) \ \ GENTPROTDOT( float, s, , blasname ) \ GENTPROTDOT( double, d, , blasname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTC_BLAS( blasname ) \ \ GENTPROTDOT( scomplex, c, c, blasname ) \ GENTPROTDOT( scomplex, c, u, blasname ) \ GENTPROTDOT( dcomplex, z, c, blasname ) \ GENTPROTDOT( dcomplex, z, u, blasname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTPROTDOT_BLAS( blasname ) \ \ INSERT_GENTPROTDOTR_BLAS( blasname ) \ INSERT_GENTPROTDOTC_BLAS( blasname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTPROTR_BLAS( rblasname, 
cblasname ) \ \ GENTPROTR( float, float, s, s, rblasname ) \ GENTPROTR( double, double, d, d, rblasname ) \ GENTPROTR( scomplex, float, c, s, cblasname ) \ GENTPROTR( dcomplex, double, z, d, cblasname ) // -- Alternate two-operand macro (one char for complex, one for real proj) -- #define INSERT_GENTPROTR2_BLAS( blasname ) \ \ GENTPROTR2( float, float, , s, blasname ) \ GENTPROTR2( double, double, , d, blasname ) \ GENTPROTR2( scomplex, float, c, s, blasname ) \ GENTPROTR2( dcomplex, double, z, d, blasname ) // -- Extended two-operand macro (used only for scal) -- #define INSERT_GENTPROTSCAL_BLAS( blasname ) \ \ GENTPROTSCAL( float, float, , s, blasname ) \ GENTPROTSCAL( double, double, , d, blasname ) \ GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \ GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \ GENTPROTSCAL( float, scomplex, s, c, blasname ) \ GENTPROTSCAL( double, dcomplex, d, z, blasname ) // -- Macros for functions with one operand ------------------------------------ // -- Basic one-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0( tfuncname ) \ \ GENTPROT( float, s, tfuncname ) \ GENTPROT( double, d, tfuncname ) \ GENTPROT( scomplex, c, tfuncname ) \ GENTPROT( dcomplex, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2 ) \ GENTPROT( double, d, tfuncname, varname1, varname2 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROT( float, s, tfuncname, 
varname1, varname2, varname3 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand with real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC0( tfuncname ) \ \ GENTPROTR( float, float, s, s, tfuncname ) \ GENTPROTR( double, double, d, d, tfuncname ) \ GENTPROTR( scomplex, float, c, s, tfuncname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname ) \ GENTPROTR( double, double, d, d, tfuncname, varname ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 
) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC0( tfuncname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0_I( funcname ) \ \ GENTPROT( float, s, funcname ) \ GENTPROT( double, d, funcname ) \ GENTPROT( scomplex, c, funcname ) \ GENTPROT( dcomplex, z, funcname ) \ GENTPROT( gint_t, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) \ GENTPROT( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTPROTI_BASIC0( funcname ) \
\
GENTPROTI( float, gint_t, s, i, funcname ) \
GENTPROTI( double, gint_t, d, i, funcname ) \
GENTPROTI( scomplex, gint_t, c, i, funcname ) \
GENTPROTI( dcomplex, gint_t, z, i, funcname )

// -- (one auxiliary argument) --

// Invokes GENTPROTI once per floating-point type (s, d, c, z), each paired
// with gint_t / 'i', forwarding varname unchanged as the trailing argument.
#define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \
\
GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \
GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \
GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \
GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname )


// -- Basic one-operand with real and integer projections --

// -- (no auxiliary arguments) --

// Invokes GENTPROTRI once per floating-point type together with its real
// projection type (float for s/c, double for d/z) and gint_t.
// NOTE: unlike the other no-auxiliary-argument inserters in this section,
// this macro's name carries no "0" suffix; the name is kept as-is for callers.
#define INSERT_GENTPROTRI_BASIC( funcname ) \
\
GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \
GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \
GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \
GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname )


// -- Macros for functions with two primary operands ---------------------------

// GENTPROT2 / GENTPROT2R are only invoked here, never defined; presumably the
// including code defines them (with a matching parameter count) before using
// one of the INSERT_ macros below -- confirm against the call sites.

// -- Basic two-operand macro --

// -- (no auxiliary arguments) --

// Invokes GENTPROT2 once per type with both operands of the same type.
#define INSERT_GENTPROT2_BASIC0( funcname ) \
\
GENTPROT2( float, float, s, s, funcname ) \
GENTPROT2( double, double, d, d, funcname ) \
GENTPROT2( scomplex, scomplex, c, c, funcname ) \
GENTPROT2( dcomplex, dcomplex, z, z, funcname )

// -- (one auxiliary argument) --

// Same four rows as INSERT_GENTPROT2_BASIC0, forwarding varname.
#define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \
\
GENTPROT2( float, float, s, s, tfuncname, varname ) \
GENTPROT2( double, double, d, d, tfuncname, varname ) \
GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \
GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname )


// -- Mixed domain two-operand macro --

// -- (no auxiliary arguments) --

// Rows pair a real and a complex type of the same precision (s/c and d/z),
// in both operand orders.
#define INSERT_GENTPROT2_MIX_D0( funcname ) \
\
GENTPROT2( float, scomplex, s, c, funcname ) \
GENTPROT2( scomplex, float, c, s, funcname ) \
\
GENTPROT2( double, dcomplex, d, z, funcname ) \
GENTPROT2( dcomplex, double, z, d, funcname )

// -- (one auxiliary argument) --

// Same four rows as INSERT_GENTPROT2_MIX_D0, forwarding varname.
#define INSERT_GENTPROT2_MIX_D( tfuncname, varname ) \
\
GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
\
GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
GENTPROT2( dcomplex, double, z, d, tfuncname, varname )


// -- Mixed precision two-operand macro --

// -- (no auxiliary arguments) --

// Rows pair a single-precision type with a double-precision type (eight
// ordered combinations). The final row deliberately has no trailing
// backslash: a dangling continuation would splice the next source line into
// this definition.
#define INSERT_GENTPROT2_MIX_P0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
\
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --

// Same eight rows as INSERT_GENTPROT2_MIX_P0, forwarding varname. As above,
// the final row carries no trailing backslash.
#define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \
\
GENTPROT2( float, double, s, d, tfuncname, varname ) \
GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double, float, d, s, tfuncname, varname ) \
GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
\
GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro --

// -- (no auxiliary arguments) --

// Rows cover all twelve ordered pairs of two distinct types.
// NOTE: this variant is spelled MIXDP0 while the one-argument variant below
// is spelled MIX_DP; both spellings are kept as-is for existing callers.
#define INSERT_GENTPROT2_MIXDP0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, scomplex, s, c, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
GENTPROT2( double, dcomplex, d, z, funcname ) \
\
GENTPROT2( scomplex, float, c, s, funcname ) \
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, double, z, d, funcname ) \
GENTPROT2( dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --

// Same twelve rows as INSERT_GENTPROT2_MIXDP0, forwarding varname.
#define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \
\
GENTPROT2( float, double, s, d, tfuncname, varname ) \
GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double, float, d, s, tfuncname, varname ) \
GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
\
GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Basic two-operand with real projection of first operand --

// -- (no auxiliary arguments) --

// Invokes GENTPROT2R with both operands of the same type plus the real
// projection of that type (float for s/c, double for d/z).
#define INSERT_GENTPROT2R_BASIC0( funcname ) \
\
GENTPROT2R( float, float, float, s, s, s, funcname ) \
GENTPROT2R( double, double, double, d, d, d, funcname ) \
GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \
GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname )

// -- (one auxiliary argument) --

// Same four rows as INSERT_GENTPROT2R_BASIC0, forwarding varname.
#define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \
\
GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \
GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \
GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )


// -- Mixed domain two-operand with real projection of first operand --

// -- (no auxiliary arguments) --

// Rows pair a real and a complex type of the same precision; the third type
// is the real projection of the first operand's type.
#define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \
\
GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \
\
GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \
GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname )

// -- (one auxiliary argument)
// --

// Invokes GENTPROT2R for a real and a complex type of the same precision, in
// both operand orders; the third type is the real projection of the first
// operand's type. varname is forwarded unchanged.
#define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \
\
GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname )


// -- Mixed precision two-operand with real projection of first operand --

// -- (no auxiliary arguments) --

// Rows pair a single-precision type with a double-precision type (eight
// ordered combinations); the third type is the real projection of the first
// operand's type.
#define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname )

// -- (one auxiliary argument) --

// Same eight rows as INSERT_GENTPROT2R_MIX_P0, forwarding varname.
#define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname )


// -- Macros for functions with three primary operands -------------------------

// GENTPROT3 / GENTPROT3U12 are only invoked here, never defined; presumably
// the including code defines them before using one of the INSERT_ macros
// below -- confirm against the call sites.

// -- Basic three-operand macro --

// Invokes GENTPROT3 once per type with all three operands of the same type.
#define INSERT_GENTPROT3_BASIC( funcname ) \
\
GENTPROT3( float, float, float, s, s, s, funcname ) \
GENTPROT3( double, double, double, d, d, d, funcname ) \
GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \
GENTPROT3( dcomplex, dcomplex, dcomplex, z, z, z, funcname )


// -- Mixed domain three-operand macro --

// Twelve rows; within each row all operands share one precision (s/c or d/z)
// while mixing real and complex types.
#define INSERT_GENTPROT3_MIX_D( funcname ) \
\
GENTPROT3( float, float, scomplex, s, s, c, funcname ) \
GENTPROT3( float, scomplex, float, s, c, s, funcname ) \
GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \
\
GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \
GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \
GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \
\
GENTPROT3( scomplex, float, float, c, s, s, funcname ) \
GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \
GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \
\
GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \
GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \
GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname )


// -- Mixed precision three-operand macro --

// Forty-eight rows; every row contains at least one single-precision and one
// double-precision type. The final row deliberately has no trailing
// backslash: a dangling continuation would splice the next source line into
// this definition.
#define INSERT_GENTPROT3_MIX_P( funcname ) \
\
GENTPROT3( float, float, double, s, s, d, funcname ) \
GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \
\
GENTPROT3( float, double, float, s, d, s, funcname ) \
GENTPROT3( float, double, double, s, d, d, funcname ) \
GENTPROT3( float, double, scomplex, s, d, c, funcname ) \
GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \
\
GENTPROT3( float, scomplex, double, s, c, d, funcname ) \
GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \
\
GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \
GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \
GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \
GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \
\
\
GENTPROT3( double, float, float, d, s, s, funcname ) \
GENTPROT3( double, float, double, d, s, d, funcname ) \
GENTPROT3( double, float, scomplex, d, s, c, funcname ) \
GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \
\
GENTPROT3( double, double, float, d, d, s, funcname ) \
GENTPROT3( double, double, scomplex, d, d, c, funcname ) \
\
GENTPROT3( double, scomplex, float, d, c, s, funcname ) \
GENTPROT3( double, scomplex, double, d, c, d, funcname ) \
GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \
GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \
\
GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \
GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \
\
\
GENTPROT3( scomplex, float, double, c, s, d, funcname ) \
GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \
\
GENTPROT3( scomplex, double, float, c, d, s, funcname ) \
GENTPROT3( scomplex, double, double, c, d, d, funcname ) \
GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \
GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \
\
GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \
GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \
\
GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \
GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \
GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \
GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \
\
\
GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \
GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \
GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \
GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \
\
GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \
GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \
\
GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \
GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \
GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \
GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \
\
GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \
GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname )


// -- Basic three-operand with union of operands 1 and 2 --

#define INSERT_GENTPROT3U12_BASIC( funcname ) \
\
GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \ GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname ) // -- Mixed domain three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_D( funcname ) \ \ GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \ GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \ GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \ \ GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \ GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \ GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \ \ GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \ GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \ \ GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \ GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname ) // -- Mixed precision three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_P( funcname ) \ \ GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \ GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \ \ GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \ GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \ GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \ GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \ \ GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \ GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \ \ GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, 
z, funcname ) \ GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \ GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \ GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \ \ \ GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \ GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \ GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \ GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \ \ GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \ GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \ \ GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \ GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \ GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \ GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \ \ GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \ GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \ \ \ GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \ GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \ \ GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \ GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \ GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \ GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \ \ GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \ \ GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \ GENTPROT3U12( 
scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \ \ \ GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \ GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \ GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \ GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \ GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \ \ GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname ) #endif // end bli_gentprot_macro_defs.h // begin bli_misc_macro_defs.h #ifndef BLIS_MISC_MACRO_DEFS_H #define BLIS_MISC_MACRO_DEFS_H // -- Miscellaneous macros -- // min, max, abs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) #define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) #define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) ) // fmin, fmax, fabs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_fmin( a, b ) bli_min( a, b ) #define bli_fmax( a, b ) bli_max( a, b ) #define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) ) // fminabs, fmaxabs // NOTE: These must remain macros since we don't know the types of a and b. 
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif // end bli_edge_case_macro_defs.h // begin bli_param_macro_defs.h #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ); }

// Return true when uplo is upper and the m x n region is strictly below the
// diagonal, or uplo is lower and the region is strictly above the diagonal.
// Both positional tests are delegated to the bli_is_strictly_*_diag_n()
// predicates.
BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n )
{
    return ( bool )
           ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ||
             ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) );
}

// pruning-related

// Prune the rows above the point where the diagonal meets the left edge.
// When *diagoff is negative: *m shrinks by |*diagoff|, *offm_inc receives
// the number of rows removed, and *diagoff is reset to 0. Otherwise only
// *offm_inc is written (as 0). n is accepted but neither read nor written.
BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc )
{
    *offm_inc = 0;

    // If the diagonal intersects the left side of the matrix,
    // ignore the area above that intersection.
    if ( *diagoff < 0 )
    {
        *m        = *m + *diagoff;
        *offm_inc =    - *diagoff;
        *diagoff  = 0;
    }
}

// Prune the columns to the right of the point where the diagonal meets the
// bottom edge by clamping *n to *diagoff + *m. *offn_inc is always written
// as 0; *diagoff and *m are only read.
BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc )
{
    *offn_inc = 0;

    // If the diagonal intersects the bottom side of the matrix,
    // ignore the area to the right of that intersection.
    if ( *n > *diagoff + *m )
    {
        *n = *diagoff + *m;
    }
}

// Prune the columns to the left of the point where the diagonal meets the
// top edge. When *diagoff is positive: *n shrinks by *diagoff, *offn_inc
// receives the number of columns removed, and *diagoff is reset to 0.
// Otherwise only *offn_inc is written (as 0). m is accepted but unused.
BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc )
{
    *offn_inc = 0;

    // If the diagonal intersects the top side of the matrix,
    // ignore the area to the left of that intersection.
    if ( *diagoff > 0 )
    {
        *n        = *n - *diagoff;
        *offn_inc =    + *diagoff;
        *diagoff  = 0;
    }
}

// Prune the rows below the point where the diagonal meets the right edge by
// clamping *m to *n - *diagoff. *offm_inc is always written as 0; *diagoff
// and *n are only read.
BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc )
{
    *offm_inc = 0;

    // If the diagonal intersects the right side of the matrix,
    // ignore the area below that intersection.
    if ( *m > -(*diagoff) + *n )
    {
        *m = -(*diagoff) + *n;
    }
}

// thread range-related

// Replace *diagoff with *n - *diagoff - *m and toggle *uplo. The dimensions
// themselves are only read.
BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n )
{
    *diagoff = *n - *diagoff - *m;
    bli_toggle_uplo( uplo );
}

// Swap *m and *n, negate *diagoff, and toggle *uplo, all in place.
BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n )
{
    bli_swap_dims( m, n );
    bli_negate_diag_offset( diagoff );
    bli_toggle_uplo( uplo );
}

// Mirror the index range [*start, *end) within a dimension of size n:
// on return *start == n - (old *end) and *end == n - (old *start).
BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end )
{
    dim_t start2 = n - *start;
    dim_t end2   = n - *end;

    *start = end2;
    *end   = start2;
}

// mdim_t-related

// True when mdim equals BLIS_M.
BLIS_INLINE bool bli_is_m_dim( mdim_t mdim )
{
    return ( bool )
           ( mdim == BLIS_M );
}

// True when mdim equals BLIS_N.
BLIS_INLINE bool bli_is_n_dim( mdim_t mdim )
{
    return ( bool )
           ( mdim == BLIS_N );
}

// Return BLIS_N when mdim is BLIS_M, and BLIS_M for any other value.
BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim )
{
    return ( mdim_t )
           ( mdim == BLIS_M ? BLIS_N : BLIS_M );
}

// In-place variant of bli_dim_toggled().
BLIS_INLINE void bli_toggle_dim( mdim_t* mdim )
{
    *mdim = bli_dim_toggled( *mdim );
}

// stor3_t-related

// Build a stor3_t id from the strides of c, a and b. Returns BLIS_XXX if any
// of the three is general-stored; otherwise the id is the 3-bit value
// 4*c_is_col + 2*a_is_col + 1*b_is_col.
BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c,
                                            inc_t rs_a, inc_t cs_a,
                                            inc_t rs_b, inc_t cs_b  )
{
    // If any matrix is general-stored, return the stor3_t id for the
    // general-purpose sup microkernel.
    if ( bli_is_gen_stored( rs_c, cs_c ) ||
         bli_is_gen_stored( rs_a, cs_a ) ||
         bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX;

    // Otherwise, compute and return the stor3_t id as follows.
    const bool c_is_col = bli_is_col_stored( rs_c, cs_c );
    const bool a_is_col = bli_is_col_stored( rs_a, cs_a );
    const bool b_is_col = bli_is_col_stored( rs_b, cs_b );

    return ( stor3_t )( 4 * c_is_col +
                        2 * a_is_col +
                        1 * b_is_col );
}

// Map a stor3_t id to the id of the transposed operation via a lookup table
// (the active "#if 1" branch). The disabled branch computes the same mapping
// with bit operations: flip all three bits and exchange the a and b bits.
// id is used directly as an array index, so it must be less than
// BLIS_NUM_3OP_RC_COMBOS.
BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id )
{
#if 1
    stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
    =
    {
      ( stor3_t )7,  // BLIS_RRR = 0  ->  BLIS_CCC = 7
      ( stor3_t )5,  // BLIS_RRC = 1  ->  BLIS_CRC = 5
      ( stor3_t )6,  // BLIS_RCR = 2  ->  BLIS_CCR = 6
      ( stor3_t )4,  // BLIS_RCC = 3  ->  BLIS_CRR = 4
      ( stor3_t )3,  // BLIS_CRR = 4  ->  BLIS_RCC = 3
      ( stor3_t )1,  // BLIS_CRC = 5  ->  BLIS_RRC = 1
      ( stor3_t )2,  // BLIS_CCR = 6  ->  BLIS_RCR = 2
      ( stor3_t )0,  // BLIS_CCC = 7  ->  BLIS_RRR = 0
    };

    return map[id];
#else
    return   ( ( id & 0x4 ) ^ 0x4 )        | // flip c bit
           ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position
           ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 );  // flip a bit and move to b position
#endif
}

// Toggle bit 0x1 of id (the active "#else" branch); the disabled table
// encodes the same mapping.
// NOTE(review): bli_stor3_from_strides() derives bit 0x1 from b's strides
// and bit 0x2 from a's strides, so toggling 0x1 here changes the b position
// of the id -- confirm this is what "transa" is meant to do.
BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id )
{
#if 0
    stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
    =
    {
      ( stor3_t )1,  // BLIS_RRR = 0  ->  BLIS_RRC = 1
      ( stor3_t )0,  // BLIS_RRC = 1  ->  BLIS_RRR = 0
      ( stor3_t )3,  // BLIS_RCR = 2  ->  BLIS_RCC = 3
      ( stor3_t )2,  // BLIS_RCC = 3  ->  BLIS_RCR = 2
      ( stor3_t )5,  // BLIS_CRR = 4  ->  BLIS_CRC = 5
      ( stor3_t )4,  // BLIS_CRC = 5  ->  BLIS_CRR = 4
      ( stor3_t )7,  // BLIS_CCR = 6  ->  BLIS_CCC = 7
      ( stor3_t )6,  // BLIS_CCC = 7  ->  BLIS_CCR = 6
    };

    return map[id];
#else
    return ( stor3_t )( id ^ 0x1 );
#endif
}

// Toggle bit 0x2 of id (the active "#else" branch); the disabled table
// encodes the same mapping.
// NOTE(review): bit 0x2 is the one bli_stor3_from_strides() derives from a's
// strides -- confirm this is what "transb" is meant to do.
BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id )
{
#if 0
    stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
    =
    {
      ( stor3_t )2,  // BLIS_RRR = 0  ->  BLIS_RCR = 2
      ( stor3_t )3,  // BLIS_RRC = 1  ->  BLIS_RCC = 3
      ( stor3_t )0,  // BLIS_RCR = 2  ->  BLIS_RRR = 0
      ( stor3_t )1,  // BLIS_RCC = 3  ->  BLIS_RRC = 1
      ( stor3_t )6,  // BLIS_CRR = 4  ->  BLIS_CCR = 6
      ( stor3_t )7,  // BLIS_CRC = 5  ->  BLIS_CCC = 7
      ( stor3_t )4,  // BLIS_CCR = 6  ->  BLIS_CRR = 4
      ( stor3_t )5,  // BLIS_CCC = 7  ->  BLIS_CRC = 5
    };

    return map[id];
#else
    return ( stor3_t )( id ^ 0x2 );
#endif
}

//
// index-related

// True when i is the final forward iteration (n_iter - 1) and there is a
// nonzero remainder n_left.
BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i == n_iter - 1 && n_left != 0 );
}

// Logical negation of bli_is_edge_f().
BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i != n_iter - 1 || n_left == 0 );
}

// True when i is iteration 0 and there is a nonzero remainder n_left.
// n_iter is accepted but not used.
BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i == 0 && n_left != 0 );
}

// Logical negation of bli_is_edge_b(). n_iter is accepted but not used.
BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i != 0 || n_left == 0 );
}

// True when i == end_iter - 1. tid and nth are accepted so the signature
// matches bli_is_last_iter_rr(), but they are not used.
BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
    return ( bool )
           ( i == end_iter - 1 );
}

// True when i equals end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ).
// nth is used as a modulus and therefore must be nonzero.
BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
    return ( bool )
           ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
}

// Dispatch to the _sl variant when BLIS_ENABLE_JRIR_SLAB is defined and to
// the _rr variant otherwise.
BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
#ifdef BLIS_ENABLE_JRIR_SLAB
    return bli_is_last_iter_sl( i, end_iter, tid, nth );
#else // BLIS_ENABLE_JRIR_RR
    return bli_is_last_iter_rr( i, end_iter, tid, nth );
#endif
}

// packbuf_t-related

// Extract the BLIS_PACK_BUFFER_BITS field of buf_type and shift it down by
// BLIS_PACK_BUFFER_SHIFT.
BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type )
{
    return ( guint_t )
           ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT );
}

// pack_t-related

// True when BLIS_PACK_BIT is set in schema.
BLIS_INLINE bool bli_is_packed( pack_t schema )
{
    return ( bool )
           ( schema & BLIS_PACK_BIT );
}

// True when the BLIS_PACK_RC_BIT field of schema equals
// BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS.
BLIS_INLINE bool bli_is_row_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
                                                BLIS_BITVAL_PACKED_ROWS    ) );
}

// True when the BLIS_PACK_RC_BIT field of schema equals
// BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS.
BLIS_INLINE bool bli_is_col_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
                                                BLIS_BITVAL_PACKED_COLUMNS ) );
}

// True when BLIS_PACK_PANEL_BIT is set in schema.
BLIS_INLINE bool bli_is_panel_packed( pack_t schema )
{
    return ( bool )
           ( schema & BLIS_PACK_PANEL_BIT );
}

// True when the BLIS_PACK_FORMAT_BITS field of schema equals BLIS_BITVAL_1R.
BLIS_INLINE bool bli_is_1r_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R );
}

// True when the BLIS_PACK_FORMAT_BITS field of schema equals BLIS_BITVAL_1E.
BLIS_INLINE bool bli_is_1e_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E );
}

// True when schema is either 1r- or 1e-packed.
BLIS_INLINE bool bli_is_1m_packed( pack_t schema )
{
    return ( bool )
           ( bli_is_1r_packed( schema ) ||
             bli_is_1e_packed( schema ) );
}

// True when the BLIS_PACK_FORMAT_BITS field of schema is zero.
BLIS_INLINE bool bli_is_nat_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 );
}

// True when the BLIS_PACK_FORMAT_BITS field of schema is nonzero.
BLIS_INLINE bool bli_is_ind_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 );
}

// Extract the BLIS_PACK_FORMAT_BITS field of schema and shift it down by
// BLIS_PACK_FORMAT_SHIFT.
BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema )
{
    return ( guint_t )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT );
}


// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument.
//
// Inputs:  diagoffa, diaga, uploa, m, n, rs_a, cs_a.
// Outputs: *uplo_eff, *n_elem_max, *n_iter, *inca, *lda, *ij0, *n_shift.
//
// *ij0 and *n_shift are always written. When A is entirely unstored,
// *uplo_eff is set to BLIS_ZEROS and the remaining outputs (*n_elem_max,
// *n_iter, *inca, *lda) are left untouched, so callers must test *uplo_eff
// before reading them.
BLIS_INLINE void bli_set_dims_incs_uplo_1m
     (
       doff_t  diagoffa, diag_t diaga,
       uplo_t  uploa,    dim_t  m,          dim_t  n,      inc_t  rs_a, inc_t  cs_a,
       uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t*  ij0,      dim_t* n_shift
     )
{
    // This is to prevent the compiler from warning about uninitialized
    // variables.
    *ij0     = 0;
    *n_shift = 0;

    // If matrix A is entirely "unstored", that is, if either:
    // - A is lower-stored and entirely above the diagonal, or
    // - A is upper-stored and entirely below the diagonal
    // then we mark the storage as implicitly zero.
    if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
    {
        *uplo_eff = BLIS_ZEROS;
    }
    else
    {
        doff_t diagoffa_use_ = diagoffa;
        doff_t diagoff_eff_;
        dim_t  n_iter_max_;

        // A unit diagonal is handled by adjusting the working diagonal
        // offset via bli_shift_diag_offset_to_shrink_uplo().
        if ( bli_is_unit_diag( diaga ) )
            bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

        // If matrix A is entirely "stored", that is, if either:
        // - A is upper-stored and entirely above the diagonal, or
        // - A is lower-stored and entirely below the diagonal
        // then we mark the storage as dense.
        if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
            uploa = BLIS_DENSE;

        // Start from the untransposed view: iterate over n, m elements each.
        n_iter_max_  = n;
        *n_elem_max  = m;
        *inca        = rs_a;
        *lda         = cs_a;
        *uplo_eff    = uploa;
        diagoff_eff_ = diagoffa_use_;

        // For row-tilted storage, operate on the reflected view instead:
        // dims and increments are swapped, uplo toggled, diagoff negated.
        if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) )
        {
            bli_swap_dims( &n_iter_max_, n_elem_max );
            bli_swap_incs( inca, lda );
            bli_toggle_uplo( uplo_eff );
            bli_negate_diag_offset( &diagoff_eff_ );
        }

        if ( bli_is_dense( *uplo_eff ) )
        {
            *n_iter = n_iter_max_;
        }
        else if ( bli_is_upper( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = 0;
                *n_shift    = -diagoff_eff_;
                *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
                *n_iter     = n_iter_max_;
            }
            else
            {
                *ij0        = diagoff_eff_;
                *n_shift    = 0;
                *n_iter     = n_iter_max_ - diagoff_eff_;
            }
        }
        else // if ( bli_is_lower( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = -diagoff_eff_;
                *n_shift    = 0;
                // NOTE: *n_elem_max must be reduced before *n_iter is
                // computed, since *n_iter reads the updated value.
                *n_elem_max = *n_elem_max + diagoff_eff_;
                *n_iter     = bli_min( *n_elem_max, bli_min( m, n ) );
            }
            else
            {
                *ij0        = 0;
                *n_shift    = diagoff_eff_;
                *n_iter     = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
            }
        }
    }
}

// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument (without column-wise stride optimization).
//
// Identical to bli_set_dims_incs_uplo_1m() except that the row-tilted
// swap/toggle/negate step is omitted, so the view is never reflected.
// The same caveat applies: when *uplo_eff is set to BLIS_ZEROS, only
// *uplo_eff, *ij0 and *n_shift have been written.
BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap
     (
       doff_t  diagoffa, diag_t diaga,
       uplo_t  uploa,    dim_t  m,          dim_t  n,      inc_t  rs_a, inc_t  cs_a,
       uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t*  ij0,      dim_t* n_shift
     )
{
    // This is to prevent the compiler from warning about uninitialized
    // variables.
    *ij0     = 0;
    *n_shift = 0;

    // If matrix A is entirely "unstored", that is, if either:
    // - A is lower-stored and entirely above the diagonal, or
    // - A is upper-stored and entirely below the diagonal
    // then we mark the storage as implicitly zero.
    if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
    {
        *uplo_eff = BLIS_ZEROS;
    }
    else
    {
        doff_t diagoffa_use_ = diagoffa;
        doff_t diagoff_eff_;
        dim_t  n_iter_max_;

        // A unit diagonal is handled by adjusting the working diagonal
        // offset via bli_shift_diag_offset_to_shrink_uplo().
        if ( bli_is_unit_diag( diaga ) )
            bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

        // If matrix A is entirely "stored", that is, if either:
        // - A is upper-stored and entirely above the diagonal, or
        // - A is lower-stored and entirely below the diagonal
        // then we mark the storage as dense.
        if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
            uploa = BLIS_DENSE;

        n_iter_max_  = n;
        *n_elem_max  = m;
        *inca        = rs_a;
        *lda         = cs_a;
        *uplo_eff    = uploa;
        diagoff_eff_ = diagoffa_use_;

        if ( bli_is_dense( *uplo_eff ) )
        {
            *n_iter = n_iter_max_;
        }
        else if ( bli_is_upper( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = 0;
                *n_shift    = -diagoff_eff_;
                *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
                *n_iter     = n_iter_max_;
            }
            else
            {
                *ij0        = diagoff_eff_;
                *n_shift    = 0;
                *n_iter     = n_iter_max_ - diagoff_eff_;
            }
        }
        else // if ( bli_is_lower( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = -diagoff_eff_;
                *n_shift    = 0;
                // NOTE: *n_elem_max must be reduced before *n_iter is
                // computed, since *n_iter reads the updated value.
                *n_elem_max = *n_elem_max + diagoff_eff_;
                *n_iter     = bli_min( *n_elem_max, bli_min( m, n ) );
            }
            else
            {
                *ij0        = 0;
                *n_shift    = diagoff_eff_;
                *n_iter     = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
            }
        }
    }
}

// Set dimensions and increments for TWO matrix arguments.
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); } BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); } BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); } // NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, // packbuf_t is stored/used from the context in order to support various // induced methods. (Though ideally the packbuf_t field would only be // present in the control tree). BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); } BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc ); } BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj ) { bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); } BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj ) { bli_obj_apply_conj( BLIS_CONJUGATE, obj ); } BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; } // Root matrix query BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) { return ( obj_t* )( obj->root ); } BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( 
bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); } // Root matrix modification BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) { obj->root = obj; } // Diagonal offset query BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) ); } // Diagonal offset modification BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off = ( doff_t )offset; } BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj ) { obj->diag_off = -(obj->diag_off); } BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off += ( doff_t )offset; } // Dimension query BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) { return ( obj->dim[ mdim ] ); } BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) : bli_obj_length( obj ) ); } BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) ); } BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 ); } // Stride/increment query BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) { return ( obj->rs ); } BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) { return ( obj->cs ); } BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) { return ( obj->is ); } BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); } // Note: The purpose of these functions is to obtain the length and width // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) ); } BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 ); } // Dimension modification BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj ) { obj->dim[ BLIS_M ] = m; } BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj ) { obj->dim[ BLIS_N ] = n; } BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) { obj->dim[ mdim ] = dim_val; } BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) { if ( bli_does_notrans( trans ) ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } else // if ( bli_does_trans( trans ) ) { bli_obj_set_length( n, obj ); bli_obj_set_width( m, obj ); } } // Stride/increment predicates // // NOTE: The following two macros differ from their non-obj counterparts // in that they do not identify m x 1 and 1 x n objects as row-stored and // column-stored, respectively, which is needed when considering packed // objects. But this is okay, since none of the invocations of these // "obj" macros are used on packed matrices. 
// BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); } // Stride/increment modification BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) { obj->rs = rs; } BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) { obj->cs = cs; } BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_row_stride( rs, obj ); bli_obj_set_col_stride( cs, obj ); } BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) { obj->is = is; } // Offset query BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) { return ( obj->off[ mdim ] ); } // Offset modification BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] = offset; } BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_set_off( BLIS_M, offm, obj ); bli_obj_set_off( BLIS_N, offn, obj ); } BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] += offset; } BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_inc_off( BLIS_M, offm, obj ); bli_obj_inc_off( BLIS_N, offn, obj ); } // Diagonal offset predicates BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) { return ( 
bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); } // Buffer address query BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) { return ( void* ) ( obj->buffer ); } // Buffer address modification BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) { obj->buffer = p; } // Bufferless scalar field query BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); } // Bufferless scalar field modification BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); } // Element size modification BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) { obj->elem_size = size; } // Packed matrix info query BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) { return ( obj->m_padded ); } BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) { return ( obj->n_padded ); } // Packed matrix info modification BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj ) { obj->m_padded = m; } BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj ) { obj->n_padded = n; } BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) { 
bli_obj_set_padded_length( m, obj ); bli_obj_set_padded_width( n, obj ); } // Packed panel info query BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) { return ( obj->m_panel ); } BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) { return ( obj->n_panel ); } BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) { return ( obj->pd ); } BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) { return ( obj->ps ); } // Packed panel info modification BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj ) { obj->m_panel = m; } BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj ) { obj->n_panel = n; } BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_panel_length( m, obj ); bli_obj_set_panel_width( n, obj ); } BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) { obj->pd = pd; } BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) { obj->ps = ps; } // stor3_t-related BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); inc_t rs_a, cs_a; inc_t rs_b, cs_b; if ( bli_obj_has_notrans( a ) ) { rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else { rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else { rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b ); } // -- User-provided information macros -- // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { return obj->pack_fn; } BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker_fn; } BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { return obj->ker_params; } // Function pointer modification 
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) { const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); bli_obj_set_pack_schema( schema_b, a ); bli_obj_set_pack_schema( schema_a, b ); } // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. BLIS_INLINE void bli_obj_induce_trans( obj_t* obj ) { // Induce transposition among basic fields. dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); if ( bli_obj_is_upper_or_lower( obj ) ) bli_obj_toggle_uplo( obj ); // Induce transposition among packed fields. dim_t m_padded = bli_obj_padded_length( obj ); dim_t n_padded = bli_obj_padded_width( obj ); dim_t m_panel = bli_obj_panel_length( obj ); dim_t n_panel = bli_obj_panel_width( obj ); bli_obj_set_padded_dims( n_padded, m_padded, obj ); bli_obj_set_panel_dims( n_panel, m_panel, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj ) { // NOTE: This function is only used in situations where the matrices // are guaranteed to not have structure or be packed. // Induce transposition among basic fields. 
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif // end bli_obj_macro_defs.h // begin bli_complex_macro_defs.h #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define 
bli_zimag( x ) ( cimag(x) ) #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_complex_macro_defs.h // begin bli_scalar_macro_defs.h #ifndef BLIS_SCALAR_MACRO_DEFS_H #define BLIS_SCALAR_MACRO_DEFS_H // -- Assignment/Accessor macros -- // NOTE: This macro is defined first since some of the other scalar macros // use it to abstract away the method used to assign complex values (ie: // whether fields of a struct are set directly or whether native C99 // assignment is used). // begin bli_sets.h #ifndef BLIS_SETS_H #define BLIS_SETS_H // sets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sssets( xr, xi, y ) { (y) = (xr); } #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } #define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } #define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, 
xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif // end bli_sets.h // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. // begin bli_setrs.h #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Real destinations: y is simply overwritten with xr.
#define bli_sssetrs( xr, y ) { (y) = (xr); }
#define bli_dssetrs( xr, y ) { (y) = (xr); }

#define bli_sdsetrs( xr, y ) { (y) = (xr); }
#define bli_ddsetrs( xr, y ) { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations: only the real component of y is overwritten; the
// accessor macro is used as the assignment target.
#define bli_scsetrs( xr, y ) { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y ) { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y ) { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y ) { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations: y is rebuilt from the new real component and its
// current imaginary component.
#define bli_scsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands.
#define bli_ssetrs( xr, y ) bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y ) bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y ) bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y ) bli_dzsetrs( xr, y )

#endif
// end bli_setrs.h
// begin bli_setis.h

#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Real destinations: the macros expand to an empty statement, so y is left
// untouched.
#define bli_sssetis( xi, y ) { ; }
#define bli_dssetis( xi, y ) { ; }

#define bli_sdsetis( xi, y ) { ; }
#define bli_ddsetis( xi, y ) { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations: only the imaginary component of y is overwritten; the
// accessor macro is used as the assignment target.
#define bli_scsetis( xi, y ) { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y ) { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y ) { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y ) { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations: y is rebuilt from its current real component and the
// new imaginary component.
#define bli_scsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands.
#define bli_ssetis( xi, y ) bli_sssetis( xi, y )
#define bli_dsetis( xi, y ) bli_ddsetis( xi, y )
#define bli_csetis( xi, y ) bli_scsetis( xi, y )
#define bli_zsetis( xi, y ) bli_dzsetis( xi, y )

#endif
// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h

#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_ssgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; } #define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; } #define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; } #define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; } #define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; } #define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi ) #define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi ) #define bli_cgets( x, yr, yi ) 
bli_csgets( x, yr, yi ) #define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi ) #define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi ) #endif // end bli_gets.h // -- Scalar constant initialization macros -- // begin bli_constants.h #ifndef BLIS_CONSTANTS_H #define BLIS_CONSTANTS_H // return pointers to constants // 1 #define bli_s1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ONE ) ) #define bli_d1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ONE ) ) #define bli_c1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) ) #define bli_z1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) ) #define bli_i1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ONE ) ) // 0 #define bli_s0 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ZERO ) ) #define bli_d0 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ZERO ) ) #define bli_c0 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) ) #define bli_z0 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) ) #define bli_i0 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ZERO ) ) // -1 #define bli_sm1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_MINUS_ONE ) ) #define bli_dm1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_MINUS_ONE ) ) #define bli_cm1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_zm1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_im1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_MINUS_ONE ) ) #endif // end bli_constants.h // -- Separated scalar macros (separated real/imaginary values) -- // begin bli_absq2ris.h #ifndef BLIS_ABSQ2RIS_H #define BLIS_ABSQ2RIS_H // absq2ris #define bli_sabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define bli_dabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define 
bli_cabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0F; \ } #define bli_zabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0; \ } #endif // end bli_absq2ris.h // begin bli_abval2ris.h #ifndef BLIS_ABVAL2RIS_H #define BLIS_ABVAL2RIS_H // abval2ris #define bli_sabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabsf(xr); \ } #define bli_dabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabs(xr); \ } #define bli_cabval2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0F; \ } #define bli_zabval2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0; \ } #endif // end bli_abval2ris.h // begin bli_addris.h #ifndef BLIS_ADDRIS_H #define BLIS_ADDRIS_H // addris #define bli_saddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_daddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_caddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #define bli_zaddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #endif // end bli_addris.h // begin bli_addjris.h #ifndef BLIS_ADDJRIS_H #define BLIS_ADDJRIS_H // addjris #define bli_saddjris( ar, ai, xr, xi ) bli_saddris( (ar), -(ai), (xr), (xi) ) #define bli_daddjris( ar, ai, xr, xi ) bli_daddris( (ar), -(ai), (xr), (xi) ) #define bli_caddjris( ar, ai, xr, xi ) bli_caddris( (ar), -(ai), (xr), (xi) ) #define bli_zaddjris( ar, ai, xr, xi ) bli_zaddris( (ar), -(ai), (xr), (xi) ) #endif // end bli_addjris.h // begin bli_add3ris.h #ifndef BLIS_ADD3RIS_H #define BLIS_ADD3RIS_H // add3ris #define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_dadd3ris( ar, 
ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #endif // end bli_add3ris.h // begin bli_axpbyris.h #ifndef BLIS_AXPBYRIS_H #define BLIS_AXPBYRIS_H // axpbyris #define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyris bli_rxxpbyris #define bli_dsssxpbyris bli_rxxpbyris #define bli_csssxpbyris bli_rxxpbyris #define bli_zsssxpbyris bli_rxxpbyris #define bli_sdssxpbyris bli_rxxpbyris #define bli_ddssxpbyris bli_rxxpbyris #define bli_cdssxpbyris bli_rxxpbyris #define bli_zdssxpbyris bli_rxxpbyris #define bli_scssxpbyris bli_rxxpbyris #define bli_dcssxpbyris bli_rxxpbyris #define bli_ccssxpbyris bli_rxxpbyris #define bli_zcssxpbyris bli_rxxpbyris #define bli_szssxpbyris bli_rxxpbyris #define bli_dzssxpbyris bli_rxxpbyris #define bli_czssxpbyris bli_rxxpbyris #define bli_zzssxpbyris bli_rxxpbyris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyris. 
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
#define bli_saxpbyjris bli_ssssaxpbyjris #define bli_daxpbyjris bli_ddddaxpbyjris #define bli_caxpbyjris bli_ccccaxpbyjris #define bli_zaxpbyjris bli_zzzzaxpbyjris #endif // end bli_axpbyjris.h // begin bli_axpyris.h #ifndef BLIS_AXPYRIS_H #define BLIS_AXPYRIS_H // axpyris #define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ } #define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr) - (ai) * (xi); \ (yi) += (ai) * (xr) + (ar) * (xi); \ } #define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr) - (ai) * (xi); \ } #define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ar) * (xi); \ } #define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyris bli_rxaxpyris #define bli_dssaxpyris bli_rxaxpyris #define bli_cssaxpyris bli_rxaxpyris #define bli_zssaxpyris bli_rxaxpyris #define bli_sdsaxpyris bli_rxaxpyris #define bli_ddsaxpyris bli_rxaxpyris #define bli_cdsaxpyris bli_rxaxpyris #define bli_zdsaxpyris bli_rxaxpyris #define bli_scsaxpyris bli_rxaxpyris #define bli_dcsaxpyris bli_rxaxpyris #define bli_ccsaxpyris bli_roaxpyris #define bli_zcsaxpyris bli_roaxpyris #define bli_szsaxpyris bli_rxaxpyris #define bli_dzsaxpyris bli_rxaxpyris #define bli_czsaxpyris bli_roaxpyris #define bli_zzsaxpyris bli_roaxpyris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyris bli_rxaxpyris #define bli_dsdaxpyris bli_rxaxpyris #define bli_csdaxpyris bli_rxaxpyris #define bli_zsdaxpyris bli_rxaxpyris #define bli_sddaxpyris bli_rxaxpyris #define bli_dddaxpyris bli_rxaxpyris #define bli_cddaxpyris bli_rxaxpyris #define bli_zddaxpyris bli_rxaxpyris #define bli_scdaxpyris 
bli_rxaxpyris #define bli_dcdaxpyris bli_rxaxpyris #define bli_ccdaxpyris bli_roaxpyris #define bli_zcdaxpyris bli_roaxpyris #define bli_szdaxpyris bli_rxaxpyris #define bli_dzdaxpyris bli_rxaxpyris #define bli_czdaxpyris bli_roaxpyris #define bli_zzdaxpyris bli_roaxpyris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyris bli_rxaxpyris #define bli_dscaxpyris bli_rxaxpyris #define bli_cscaxpyris bli_rcaxpyris #define bli_zscaxpyris bli_rcaxpyris #define bli_sdcaxpyris bli_rxaxpyris #define bli_ddcaxpyris bli_rxaxpyris #define bli_cdcaxpyris bli_rcaxpyris #define bli_zdcaxpyris bli_rcaxpyris #define bli_sccaxpyris bli_craxpyris #define bli_dccaxpyris bli_craxpyris #define bli_cccaxpyris bli_cxaxpyris #define bli_zccaxpyris bli_cxaxpyris #define bli_szcaxpyris bli_craxpyris #define bli_dzcaxpyris bli_craxpyris #define bli_czcaxpyris bli_cxaxpyris #define bli_zzcaxpyris bli_cxaxpyris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyris bli_rxaxpyris #define bli_dszaxpyris bli_rxaxpyris #define bli_cszaxpyris bli_rcaxpyris #define bli_zszaxpyris bli_rcaxpyris #define bli_sdzaxpyris bli_rxaxpyris #define bli_ddzaxpyris bli_rxaxpyris #define bli_cdzaxpyris bli_rcaxpyris #define bli_zdzaxpyris bli_rcaxpyris #define bli_sczaxpyris bli_craxpyris #define bli_dczaxpyris bli_craxpyris #define bli_cczaxpyris bli_cxaxpyris #define bli_zczaxpyris bli_cxaxpyris #define bli_szzaxpyris bli_craxpyris #define bli_dzzaxpyris bli_craxpyris #define bli_czzaxpyris bli_cxaxpyris #define bli_zzzaxpyris bli_cxaxpyris #define bli_saxpyris bli_sssaxpyris #define bli_daxpyris bli_dddaxpyris #define bli_caxpyris bli_cccaxpyris #define bli_zaxpyris bli_zzzaxpyris #endif // end bli_axpyris.h // begin bli_axpyjris.h #ifndef BLIS_AXPYJRIS_H #define BLIS_AXPYJRIS_H // axpyjris #define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ } #define bli_cxaxpyjris( ar, ai, xr, xi, yr, 
yi ) \ { \ (yr) += (ar) * (xr) + (ai) * (xi); \ (yi) += (ai) * (xr) - (ar) * (xi); \ } #define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr) + (ai) * (xi); \ } #define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ar) * -(xi); \ } #define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyjris bli_rxaxpyjris #define bli_dssaxpyjris bli_rxaxpyjris #define bli_cssaxpyjris bli_rxaxpyjris #define bli_zssaxpyjris bli_rxaxpyjris #define bli_sdsaxpyjris bli_rxaxpyjris #define bli_ddsaxpyjris bli_rxaxpyjris #define bli_cdsaxpyjris bli_rxaxpyjris #define bli_zdsaxpyjris bli_rxaxpyjris #define bli_scsaxpyjris bli_rxaxpyjris #define bli_dcsaxpyjris bli_rxaxpyjris #define bli_ccsaxpyjris bli_roaxpyjris #define bli_zcsaxpyjris bli_roaxpyjris #define bli_szsaxpyjris bli_rxaxpyjris #define bli_dzsaxpyjris bli_rxaxpyjris #define bli_czsaxpyjris bli_roaxpyjris #define bli_zzsaxpyjris bli_roaxpyjris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyjris bli_rxaxpyjris #define bli_dsdaxpyjris bli_rxaxpyjris #define bli_csdaxpyjris bli_rxaxpyjris #define bli_zsdaxpyjris bli_rxaxpyjris #define bli_sddaxpyjris bli_rxaxpyjris #define bli_dddaxpyjris bli_rxaxpyjris #define bli_cddaxpyjris bli_rxaxpyjris #define bli_zddaxpyjris bli_rxaxpyjris #define bli_scdaxpyjris bli_rxaxpyjris #define bli_dcdaxpyjris bli_rxaxpyjris #define bli_ccdaxpyjris bli_roaxpyjris #define bli_zcdaxpyjris bli_roaxpyjris #define bli_szdaxpyjris bli_rxaxpyjris #define bli_dzdaxpyjris bli_rxaxpyjris #define bli_czdaxpyjris bli_roaxpyjris #define bli_zzdaxpyjris bli_roaxpyjris // -- (axy) = (??c) 
------------------------------------------------------------ #define bli_sscaxpyjris bli_rxaxpyjris #define bli_dscaxpyjris bli_rxaxpyjris #define bli_cscaxpyjris bli_rcaxpyjris #define bli_zscaxpyjris bli_rcaxpyjris #define bli_sdcaxpyjris bli_rxaxpyjris #define bli_ddcaxpyjris bli_rxaxpyjris #define bli_cdcaxpyjris bli_rcaxpyjris #define bli_zdcaxpyjris bli_rcaxpyjris #define bli_sccaxpyjris bli_craxpyjris #define bli_dccaxpyjris bli_craxpyjris #define bli_cccaxpyjris bli_cxaxpyjris #define bli_zccaxpyjris bli_cxaxpyjris #define bli_szcaxpyjris bli_craxpyjris #define bli_dzcaxpyjris bli_craxpyjris #define bli_czcaxpyjris bli_cxaxpyjris #define bli_zzcaxpyjris bli_cxaxpyjris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyjris bli_rxaxpyjris #define bli_dszaxpyjris bli_rxaxpyjris #define bli_cszaxpyjris bli_rcaxpyjris #define bli_zszaxpyjris bli_rcaxpyjris #define bli_sdzaxpyjris bli_rxaxpyjris #define bli_ddzaxpyjris bli_rxaxpyjris #define bli_cdzaxpyjris bli_rcaxpyjris #define bli_zdzaxpyjris bli_rcaxpyjris #define bli_sczaxpyjris bli_craxpyjris #define bli_dczaxpyjris bli_craxpyjris #define bli_cczaxpyjris bli_cxaxpyjris #define bli_zczaxpyjris bli_cxaxpyjris #define bli_szzaxpyjris bli_craxpyjris #define bli_dzzaxpyjris bli_craxpyjris #define bli_czzaxpyjris bli_cxaxpyjris #define bli_zzzaxpyjris bli_cxaxpyjris #define bli_saxpyjris bli_sssaxpyjris #define bli_daxpyjris bli_dddaxpyjris #define bli_caxpyjris bli_cccaxpyjris #define bli_zaxpyjris bli_zzzaxpyjris #endif // end bli_axpyjris.h // begin bli_axmyris.h #ifndef BLIS_AXMYRIS_H #define BLIS_AXMYRIS_H // axmyris #define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ } #define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ } #define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr) - (ai) * (xi); \ (yi) -= (ai) * (xr) + (ar) * (xi); \ } #define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= 
(ar) * (xr) - (ai) * (xi); \ (yi) -= (ai) * (xr) + (ar) * (xi); \ } #define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ (yi) -= (ar) * (xi); \ } #define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ (yi) -= (ar) * (xi); \ } #endif // end bli_axmyris.h // begin bli_conjris.h #ifndef BLIS_CONJRIS_H #define BLIS_CONJRIS_H // conjris #define bli_sconjris( xr, xi ) \ { \ ; \ } #define bli_dconjris( xr, xi ) \ { \ ; \ } #define bli_cconjris( xr, xi ) \ { \ (xi) = -(xi); \ } #define bli_zconjris( xr, xi ) \ { \ (xi) = -(xi); \ } #endif // end bli_conjris.h // begin bli_copyris.h #ifndef BLIS_COPYRIS_H #define BLIS_COPYRIS_H // copyris #define bli_scopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ } #define bli_dcopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ } #define bli_ccopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ (bi) = (ai); \ } #define bli_zcopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ (bi) = (ai); \ } #define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi ) #define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi ) #define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) #define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) #define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi ) #define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi ) #define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) #define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) #define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi ) #define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi ) #define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) #define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) #define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi ) #define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi ) #define bli_czcopyris( ar, ai, 
br, bi ) bli_zcopyris( ar, ai, br, bi ) #define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi ) #endif // end bli_copyris.h // begin bli_copyjris.h #ifndef BLIS_COPYJRIS_H #define BLIS_COPYJRIS_H // copyjris #define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) ) #define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) ) #define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) ) #define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) ) #define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi ) #define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi ) #define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) #define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) #define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi ) #define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi ) #define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) #define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) #define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi ) #define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi ) #define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) #define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) #define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi ) #define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi ) #define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) #define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) #endif // end bli_copyjris.h // begin bli_copycjris.h #ifndef BLIS_COPYCJRIS_H #define BLIS_COPYCJRIS_H // copycjris #define bli_scopycjris( conj, xr, xi, yr, yi ) \ { \ bli_scopyris( (xr), (xi), (yr), (yi) ); \ } #define bli_dcopycjris( conj, xr, xi, yr, yi ) \ { \ 
bli_dcopyris( (xr), (xi), (yr), (yi) ); \ } #define bli_ccopycjris( conj, xr, xi, yr, yi ) \ { \ (yr) = (xr); \ (yi) = ( bli_is_conj( conj ) ? -(xi) \ : (xi) ); \ } #define bli_zcopycjris( conj, xr, xi, yr, yi ) \ { \ (yr) = (xr); \ (yi) = ( bli_is_conj( conj ) ? -(xi) \ : (xi) ); \ } #define bli_icopycjris( conj, xr, xi, yr, yi ) \ { \ bli_icopyris( (xr), (xi), (yr), (yi) ); \ } #endif // end bli_copycjris.h // begin bli_eqris.h #ifndef BLIS_EQRIS_H #define BLIS_EQRIS_H // eqris (passed by value) #define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) ) #define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) ) #define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) ) #define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) ) #define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) ) // eq1ris #define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F ) #define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 ) #define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F ) #define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 ) #define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 ) // eq0ris #define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F ) #define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 ) #define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F ) #define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 ) #define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 ) // eqm1ris #define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F ) #define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 ) #define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F ) #define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 ) #define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 ) #endif // end bli_eqris.h // begin bli_invertris.h #ifndef BLIS_INVERTRIS_H #define BLIS_INVERTRIS_H // invertris #define bli_sinvertris( xr, xi ) \ { \ (xr) = 1.0F / (xr); \ } 
#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Kernel selection for scal2ris, keyed by the (a, x, y) type characters.
// Real y with both a and x complex uses the "ro" kernel; every other
// real-y combination uses "rx".

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2ris  bli_rxscal2ris
#define bli_dssscal2ris  bli_rxscal2ris
#define bli_cssscal2ris  bli_rxscal2ris
#define bli_zssscal2ris  bli_rxscal2ris

#define bli_sdsscal2ris  bli_rxscal2ris
#define bli_ddsscal2ris  bli_rxscal2ris
#define bli_cdsscal2ris  bli_rxscal2ris
#define bli_zdsscal2ris  bli_rxscal2ris

#define bli_scsscal2ris  bli_rxscal2ris
#define bli_dcsscal2ris  bli_rxscal2ris
#define bli_ccsscal2ris  bli_roscal2ris
#define bli_zcsscal2ris  bli_roscal2ris

#define bli_szsscal2ris  bli_rxscal2ris
#define bli_dzsscal2ris  bli_rxscal2ris
#define bli_czsscal2ris  bli_roscal2ris
#define bli_zzsscal2ris  bli_roscal2ris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2ris  bli_rxscal2ris
#define bli_dsdscal2ris  bli_rxscal2ris
#define bli_csdscal2ris  bli_rxscal2ris
#define bli_zsdscal2ris  bli_rxscal2ris

#define bli_sddscal2ris  bli_rxscal2ris
#define bli_dddscal2ris  bli_rxscal2ris
#define bli_cddscal2ris  bli_rxscal2ris
#define bli_zddscal2ris  bli_rxscal2ris

#define bli_scdscal2ris  bli_rxscal2ris
#define bli_dcdscal2ris  bli_rxscal2ris
#define bli_ccdscal2ris  bli_roscal2ris
#define bli_zcdscal2ris  bli_roscal2ris

#define bli_szdscal2ris  bli_rxscal2ris
#define bli_dzdscal2ris  bli_rxscal2ris
#define bli_czdscal2ris  bli_roscal2ris
#define bli_zzdscal2ris  bli_roscal2ris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2ris  bli_rxscal2ris
#define bli_dscscal2ris  bli_rxscal2ris
#define bli_cscscal2ris  bli_rcscal2ris
#define bli_zscscal2ris  bli_rcscal2ris

#define bli_sdcscal2ris  bli_rxscal2ris
#define bli_ddcscal2ris  bli_rxscal2ris
#define bli_cdcscal2ris  bli_rcscal2ris
#define bli_zdcscal2ris  bli_rcscal2ris

#define bli_sccscal2ris  bli_crscal2ris
#define bli_dccscal2ris  bli_crscal2ris
#define bli_cccscal2ris  bli_cxscal2ris
#define bli_zccscal2ris  bli_cxscal2ris

#define bli_szcscal2ris  bli_crscal2ris
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Kernel selection for scal2jris, keyed by the (a, x, y) type characters.
// Each entry is a function-like macro forwarding its six arguments unchanged
// to the selected kernel.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = (xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyris bli_rxxpbyris #define bli_dssxpbyris bli_rxxpbyris #define bli_cssxpbyris bli_rxxpbyris #define bli_zssxpbyris bli_rxxpbyris #define bli_sdsxpbyris bli_rxxpbyris #define bli_ddsxpbyris bli_rxxpbyris #define bli_cdsxpbyris bli_rxxpbyris #define bli_zdsxpbyris bli_rxxpbyris #define bli_scsxpbyris bli_rxxpbyris #define bli_dcsxpbyris bli_rxxpbyris #define bli_ccsxpbyris bli_rxxpbyris #define bli_zcsxpbyris bli_rxxpbyris #define bli_szsxpbyris bli_rxxpbyris #define bli_dzsxpbyris bli_rxxpbyris #define bli_czsxpbyris bli_rxxpbyris #define bli_zzsxpbyris bli_rxxpbyris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyris bli_rxxpbyris #define bli_dsdxpbyris bli_rxxpbyris #define bli_csdxpbyris bli_rxxpbyris #define bli_zsdxpbyris bli_rxxpbyris #define bli_sddxpbyris bli_rxxpbyris #define bli_dddxpbyris bli_rxxpbyris #define bli_cddxpbyris bli_rxxpbyris #define bli_zddxpbyris bli_rxxpbyris #define bli_scdxpbyris bli_rxxpbyris #define bli_dcdxpbyris bli_rxxpbyris #define bli_ccdxpbyris bli_rxxpbyris #define bli_zcdxpbyris bli_rxxpbyris #define bli_szdxpbyris bli_rxxpbyris #define bli_dzdxpbyris bli_rxxpbyris #define bli_czdxpbyris bli_rxxpbyris #define bli_zzdxpbyris bli_rxxpbyris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyris bli_rxxpbyris #define bli_dscxpbyris 
bli_rxxpbyris #define bli_cscxpbyris bli_crxpbyris #define bli_zscxpbyris bli_crxpbyris #define bli_sdcxpbyris bli_rxxpbyris #define bli_ddcxpbyris bli_rxxpbyris #define bli_cdcxpbyris bli_crxpbyris #define bli_zdcxpbyris bli_crxpbyris #define bli_sccxpbyris bli_cxxpbyris #define bli_dccxpbyris bli_cxxpbyris #define bli_cccxpbyris bli_cxxpbyris #define bli_zccxpbyris bli_cxxpbyris #define bli_szcxpbyris bli_cxxpbyris #define bli_dzcxpbyris bli_cxxpbyris #define bli_czcxpbyris bli_cxxpbyris #define bli_zzcxpbyris bli_cxxpbyris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyris bli_rxxpbyris #define bli_dszxpbyris bli_rxxpbyris #define bli_cszxpbyris bli_crxpbyris #define bli_zszxpbyris bli_crxpbyris #define bli_sdzxpbyris bli_rxxpbyris #define bli_ddzxpbyris bli_rxxpbyris #define bli_cdzxpbyris bli_crxpbyris #define bli_zdzxpbyris bli_crxpbyris #define bli_sczxpbyris bli_cxxpbyris #define bli_dczxpbyris bli_cxxpbyris #define bli_cczxpbyris bli_cxxpbyris #define bli_zczxpbyris bli_cxxpbyris #define bli_szzxpbyris bli_cxxpbyris #define bli_dzzxpbyris bli_cxxpbyris #define bli_czzxpbyris bli_cxxpbyris #define bli_zzzxpbyris bli_cxxpbyris #define bli_sxpbyris bli_sssxpbyris #define bli_dxpbyris bli_dddxpbyris #define bli_cxpbyris bli_cccxpbyris #define bli_zxpbyris bli_zzzxpbyris #endif // end bli_xpbyris.h // begin bli_xpbyjris.h #ifndef BLIS_XPBYJRIS_H #define BLIS_XPBYJRIS_H // xpbyjris #define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type 
of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjris bli_rxxpbyjris #define bli_dssxpbyjris bli_rxxpbyjris #define bli_cssxpbyjris bli_rxxpbyjris #define bli_zssxpbyjris bli_rxxpbyjris #define bli_sdsxpbyjris bli_rxxpbyjris #define bli_ddsxpbyjris bli_rxxpbyjris #define bli_cdsxpbyjris bli_rxxpbyjris #define bli_zdsxpbyjris bli_rxxpbyjris #define bli_scsxpbyjris bli_rxxpbyjris #define bli_dcsxpbyjris bli_rxxpbyjris #define bli_ccsxpbyjris bli_rxxpbyjris #define bli_zcsxpbyjris bli_rxxpbyjris #define bli_szsxpbyjris bli_rxxpbyjris #define bli_dzsxpbyjris bli_rxxpbyjris #define bli_czsxpbyjris bli_rxxpbyjris #define bli_zzsxpbyjris bli_rxxpbyjris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjris bli_rxxpbyjris #define bli_dsdxpbyjris bli_rxxpbyjris #define bli_csdxpbyjris bli_rxxpbyjris #define bli_zsdxpbyjris bli_rxxpbyjris #define bli_sddxpbyjris bli_rxxpbyjris #define bli_dddxpbyjris bli_rxxpbyjris #define bli_cddxpbyjris bli_rxxpbyjris #define bli_zddxpbyjris bli_rxxpbyjris #define bli_scdxpbyjris bli_rxxpbyjris #define bli_dcdxpbyjris bli_rxxpbyjris #define bli_ccdxpbyjris bli_rxxpbyjris #define bli_zcdxpbyjris bli_rxxpbyjris #define bli_szdxpbyjris bli_rxxpbyjris #define bli_dzdxpbyjris bli_rxxpbyjris #define bli_czdxpbyjris bli_rxxpbyjris #define bli_zzdxpbyjris bli_rxxpbyjris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjris bli_rxxpbyjris #define bli_dscxpbyjris bli_rxxpbyjris #define bli_cscxpbyjris bli_crxpbyjris #define bli_zscxpbyjris bli_crxpbyjris #define bli_sdcxpbyjris bli_rxxpbyjris #define bli_ddcxpbyjris bli_rxxpbyjris #define bli_cdcxpbyjris bli_crxpbyjris #define bli_zdcxpbyjris bli_crxpbyjris #define bli_sccxpbyjris bli_cxxpbyjris #define bli_dccxpbyjris bli_cxxpbyjris #define bli_cccxpbyjris 
bli_cxxpbyjris #define bli_zccxpbyjris bli_cxxpbyjris #define bli_szcxpbyjris bli_cxxpbyjris #define bli_dzcxpbyjris bli_cxxpbyjris #define bli_czcxpbyjris bli_cxxpbyjris #define bli_zzcxpbyjris bli_cxxpbyjris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjris bli_rxxpbyjris #define bli_dszxpbyjris bli_rxxpbyjris #define bli_cszxpbyjris bli_crxpbyjris #define bli_zszxpbyjris bli_crxpbyjris #define bli_sdzxpbyjris bli_rxxpbyjris #define bli_ddzxpbyjris bli_rxxpbyjris #define bli_cdzxpbyjris bli_crxpbyjris #define bli_zdzxpbyjris bli_crxpbyjris #define bli_sczxpbyjris bli_cxxpbyjris #define bli_dczxpbyjris bli_cxxpbyjris #define bli_cczxpbyjris bli_cxxpbyjris #define bli_zczxpbyjris bli_cxxpbyjris #define bli_szzxpbyjris bli_cxxpbyjris #define bli_dzzxpbyjris bli_cxxpbyjris #define bli_czzxpbyjris bli_cxxpbyjris #define bli_zzzxpbyjris bli_cxxpbyjris #define bli_sxpbyjris bli_sssxpbyjris #define bli_dxpbyjris bli_dddxpbyjris #define bli_cxpbyjris bli_cccxpbyjris #define bli_zxpbyjris bli_zzzxpbyjris #endif // end bli_xpbyjris.h // Inlined scalar macros in loops // begin bli_scal2ris_mxn.h #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j 
)*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif // end bli_scal2ris_mxn.h // begin bli_scalris_mxn_uplo.h #ifndef 
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
// Real destinations (second char s or d). These forward to the bli_?absq2ris
// macro selected by the type of x. The imaginary output slot is either a zero
// literal (real x) or a scratch variable ti that is discarded (complex x).
#define bli_ssabsq2s( x, a )  bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F )
#define bli_dsabsq2s( x, a )  bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F )
#define bli_csabsq2s( x, a )  { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; }
#define bli_zsabsq2s( x, a )  { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; }

#define bli_sdabsq2s( x, a )  bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 )
#define bli_ddabsq2s( x, a )  bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 )
#define bli_cdabsq2s( x, a )  { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; }
#define bli_zdabsq2s( x, a )  { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations without C99 complex: both parts of a are passed to
// the bli_?absq2ris macro selected by the type of x.
#define bli_scabsq2s( x, a )  bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcabsq2s( x, a )  bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccabsq2s( x, a )  bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcabsq2s( x, a )  bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szabsq2s( x, a )  bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzabsq2s( x, a )  bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czabsq2s( x, a )  bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzabsq2s( x, a )  bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations with C99 complex: a is set to the square of x (real x)
// or to real(x)^2 + imag(x)^2 (complex x), with a zero imaginary part.
#define bli_scabsq2s( x, a )  bli_scsets(          (x) *          (x), 0.0, (a) )
#define bli_dcabsq2s( x, a )  bli_dcsets(          (x) *          (x), 0.0, (a) )
#define bli_ccabsq2s( x, a )  bli_ccsets( bli_creal(x) * bli_creal(x) + \
                                          bli_cimag(x) * bli_cimag(x), 0.0, (a) )
#define bli_zcabsq2s( x, a )  bli_zcsets( bli_zreal(x) * bli_zreal(x) + \
                                          bli_zimag(x) * bli_zimag(x), 0.0, (a) )

#define bli_szabsq2s( x, a )  bli_szsets(          (x) *          (x), 0.0, (a) )
#define bli_dzabsq2s( x, a )  bli_dzsets(          (x) *          (x), 0.0, (a) )
#define bli_czabsq2s( x, a )  bli_czsets( bli_creal(x) * bli_creal(x) + \
                                          bli_cimag(x) * bli_cimag(x), 0.0, (a) )
#define bli_zzabsq2s( x, a )  bli_zzsets( bli_zreal(x) * bli_zreal(x) + \
                                          bli_zimag(x) * bli_zimag(x), 0.0, (a) )

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-datatype shorthands.
#define bli_sabsq2s( x, a )  bli_ssabsq2s( x, a )
#define bli_dabsq2s( x, a )  bli_ddabsq2s( x, a )
#define bli_cabsq2s( x, a )  bli_ccabsq2s( x, a )
#define bli_zabsq2s( x, a )  bli_zzabsq2s( x, a )

#endif
// end bli_absq2s.h
// begin bli_abval2s.h

#ifndef BLIS_ABVAL2S_H
#define BLIS_ABVAL2S_H

// abval2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex: forward to the bli_?abval2ris macro selected by the
// type of x. For a real destination the imaginary output slot is a zero
// literal (real x) or a discarded scratch variable ti (complex x).
#define bli_ssabval2s( x, a )  bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F )
#define bli_dsabval2s( x, a )  bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F )
#define bli_csabval2s( x, a )  { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; }
#define bli_zsabval2s( x, a )  { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; }

#define bli_sdabval2s( x, a )  bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 )
#define bli_ddabval2s( x, a )  bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 )
#define bli_cdabval2s( x, a )  { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; }
#define bli_zdabval2s( x, a )  { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; }

#define bli_scabval2s( x, a )  bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcabval2s( x, a )  bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccabval2s( x, a )  bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcabval2s( x, a )  bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szabval2s( x, a )  bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzabval2s( x, a )  bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czabval2s( x, a )  bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzabval2s( x, a )  bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex: the absolute value comes from the C library routine that
// matches the type of x (fabsf/fabs for real x, cabsf/cabs for complex x) and
// is stored through bli_??sets with a zero imaginary part.
#define bli_ssabval2s( x, a )  bli_sssets( fabsf(x), 0.0, (a) )
#define bli_dsabval2s( x, a )  bli_dssets( fabs (x), 0.0, (a) )
#define bli_csabval2s( x, a )  bli_cssets( cabsf(x), 0.0, (a) )
#define bli_zsabval2s( x, a )  bli_zssets( cabs (x), 0.0, (a) )

#define bli_sdabval2s( x, a )  bli_sdsets( fabsf(x), 0.0, (a) )
#define bli_ddabval2s( x, a )  bli_ddsets( fabs (x), 0.0, (a) )
#define bli_cdabval2s( x, a )  bli_cdsets( cabsf(x), 0.0, (a) )
#define bli_zdabval2s( x, a )  bli_zdsets( cabs (x), 0.0, (a) )

#define bli_scabval2s( x, a )  bli_scsets( fabsf(x), 0.0, (a) )
#define bli_dcabval2s( x, a )  bli_dcsets( fabs (x), 0.0, (a) )
#define bli_ccabval2s( x, a )  bli_ccsets( cabsf(x), 0.0, (a) )
#define bli_zcabval2s( x, a )  bli_zcsets( cabs (x), 0.0, (a) )

#define bli_szabval2s( x, a )  bli_szsets( fabsf(x), 0.0, (a) )
#define bli_dzabval2s( x, a )  bli_dzsets( fabs (x), 0.0, (a) )
#define bli_czabval2s( x, a )  bli_czsets( cabsf(x), 0.0, (a) )
#define bli_zzabval2s( x, a )  bli_zzsets( cabs (x), 0.0, (a) )

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-datatype shorthands.
#define bli_sabval2s( x, a )  bli_ssabval2s( x, a )
#define bli_dabval2s( x, a )  bli_ddabval2s( x, a )
#define bli_cabval2s( x, a )  bli_ccabval2s( x, a )
#define bli_zabval2s( x, a )  bli_zzabval2s( x, a )

#endif
// end bli_abval2s.h
// begin bli_adds.h

#ifndef BLIS_ADDS_H
#define BLIS_ADDS_H

// adds

// Notes:
// - The first char
encodes the type of a. // - The second char encodes the type of y. #define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) { (y) += (a); } #define bli_dcadds( a, y ) { (y) += (a); } #define bli_ccadds( a, y ) { (y) += (a); } #define bli_zcadds( a, y ) { (y) += (a); } #define bli_szadds( a, y ) { (y) += (a); } #define bli_dzadds( a, y ) { (y) += (a); } #define bli_czadds( a, y ) { (y) += (a); } #define 
bli_zzadds( a, y ) { (y) += (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadds( a, y ) bli_ssadds( a, y ) #define bli_dadds( a, y ) bli_ddadds( a, y ) #define bli_cadds( a, y ) bli_ccadds( a, y ) #define bli_zadds( a, y ) bli_zzadds( a, y ) #endif // end bli_adds.h // begin bli_addjs.h #ifndef BLIS_ADDJS_H #define BLIS_ADDJS_H // addjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzaddjs( a, y ) 
bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) { (y) += (a); } #define bli_dcaddjs( a, y ) { (y) += (a); } #define bli_ccaddjs( a, y ) { (y) += conjf(a); } #define bli_zcaddjs( a, y ) { (y) += conj (a); } #define bli_szaddjs( a, y ) { (y) += (a); } #define bli_dzaddjs( a, y ) { (y) += (a); } #define bli_czaddjs( a, y ) { (y) += conjf(a); } #define bli_zzaddjs( a, y ) { (y) += conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saddjs( a, y ) bli_ssaddjs( a, y ) #define bli_daddjs( a, y ) bli_ddaddjs( a, y ) #define bli_caddjs( a, y ) bli_ccaddjs( a, y ) #define bli_zaddjs( a, y ) bli_zzaddjs( a, y ) #endif // end bli_addjs.h // begin bli_add3s.h #ifndef BLIS_ADD3S_H #define BLIS_ADD3S_H // add3s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of b. // - The third char encodes the type of c. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), 
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), 
bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define 
bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zscadd3s( a, 
b, c ) { (c) = (a) + (b); } #define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c ) #define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c ) #define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c ) #define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c ) #endif // end bli_add3s.h // begin bli_axpbys.h #ifndef BLIS_AXPBYS_H #define BLIS_AXPBYS_H // axpbys // Notes: // - The first char encodes the type of a. 
// - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), 
bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), 
bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), 
bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccscaxpbys( 
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y ) #define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y ) #define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y ) #define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y ) #endif // end bli_axpbys.h // begin bli_axpbyjs.h #ifndef BLIS_AXPBYJS_H #define BLIS_AXPBYJS_H // axpbyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), 
bli_simag(y) ) #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) /* When BLIS_ENABLE_C99_COMPLEX is not defined, the variants whose y operand is read through bli_creal/bli_cimag are component-wise wrappers: the four letters after bli_ give the type characters of a, x, b and y in that order, and the real and imaginary parts of every operand are passed to bli_cxaxpbyjris, which is defined elsewhere. */ #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) /* axpbyjs wrappers whose y operand is read through bli_zreal/bli_zimag. The four letters after bli_ give the type characters of a, x, b and y in that order; each macro takes the real and imaginary parts of its operands with the matching bli_?real/bli_?imag accessors and passes them to bli_cxaxpbyjris, which is defined elsewhere. */ // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) /* In the branch that follows, taken when BLIS_ENABLE_C99_COMPLEX is defined, the same macro names are written as a direct assignment: y = a * x + b * y. The x operand is used as is when its type character (second letter after bli_) is s or d, is wrapped in conjf() when it is c, and in conj() when it is z. */ #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) #define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) #define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) #define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) #endif // end bli_axpbyjs.h // begin bli_axpys.h #ifndef BLIS_AXPYS_H #define BLIS_AXPYS_H // axpys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), 
bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } #define 
bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) #define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) #define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) #define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) #endif // end bli_axpys.h // begin bli_axpyjs.h #ifndef BLIS_AXPYJS_H #define BLIS_AXPYJS_H // axpyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// axpyjs scalar macros: bli_<a><x><y>axpyjs( a, x, y ), one definition for each
// combination of s/d/c/z operand types (16 per target type of y, 64 in total).
//  - (??s) and (??d) targets are defined unconditionally and always forward to
//    bli_saxpyjris / bli_daxpyjris, passing the real and imaginary accessor of
//    every operand as separate arguments.
//  - Without BLIS_ENABLE_C99_COMPLEX, (??c) targets forward to bli_saxpyjris
//    when a and x are both real (s/d), to bli_scaxpyjris when a is real and x
//    is complex (c/z), and to bli_caxpyjris whenever a is complex; (??z)
//    targets use bli_daxpyjris / bli_dzaxpyjris / bli_zaxpyjris the same way.
//  - With BLIS_ENABLE_C99_COMPLEX, (??c) and (??z) targets expand inline to
//    { (y) += (a) * (x); } for real x, with conjf(x) substituted when x is c
//    and conj(x) when x is z.
//  - bli_[sdcz]axpyjs are shorthands for the all-same-type variants
//    (sss, ddd, ccc, zzz).
// NOTE(review): the *axpyjris helpers are defined elsewhere in this header;
// presumably they apply the same y += a * conj(x) update to split real/imag
// parts -- confirm against their definitions.
// NOTE(review): the C99 forms expand to a bare { } block, so a call followed by
// ';' leaves an extra empty statement (relevant directly before an 'else').
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( 
bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), 
bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) #define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpyjs( a, x, y ) 
bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y ) #define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y ) #define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y ) #define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y ) #endif // end bli_axpyjs.h // begin bli_axmys.h #ifndef BLIS_AXMYS_H #define BLIS_AXMYS_H // axmys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// axmys macro tables.
//
// Each bli_<a><x><y>axmys( a, x, y ) macro below either forwards the real and
// imaginary parts of its three operands to a bli_?axmyris() macro (not defined
// in this part of the file), or, in the C99-complex branch, expands directly
// to `(y) -= (a) * (x)`.

// -- (axy) = (??s) ------------------------------------------------------------

// y has type 's': every (a,x) combination forwards to bli_saxmyris().
#define bli_sssaxmys( a, x, y )  bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxmys( a, x, y )  bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxmys( a, x, y )  bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxmys( a, x, y )  bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsaxmys( a, x, y )  bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxmys( a, x, y )  bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxmys( a, x, y )  bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxmys( a, x, y )  bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsaxmys( a, x, y )  bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxmys( a, x, y )  bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxmys( a, x, y )  bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxmys( a, x, y )  bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsaxmys( a, x, y )  bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxmys( a, x, y )  bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxmys( a, x, y )  bli_saxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxmys( a, x, y )  bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

// y has type 'd': every (a,x) combination forwards to bli_daxmyris().
#define bli_ssdaxmys( a, x, y )  bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxmys( a, x, y )  bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxmys( a, x, y )  bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxmys( a, x, y )  bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddaxmys( a, x, y )  bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxmys( a, x, y )  bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxmys( a, x, y )  bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxmys( a, x, y )  bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdaxmys( a, x, y )  bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxmys( a, x, y )  bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxmys( a, x, y )  bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxmys( a, x, y )  bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdaxmys( a, x, y )  bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxmys( a, x, y )  bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxmys( a, x, y )  bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxmys( a, x, y )  bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// The variants with y of type 'c' or 'z' have two implementations, selected
// by BLIS_ENABLE_C99_COMPLEX. The matching #endif follows the C99 (??z) table.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

// The target macro depends on the types of a and x:
//   a in {s,d}, x in {s,d}  -> bli_saxmyris()
//   a in {s,d}, x in {c,z}  -> bli_scaxmyris()
//   a in {c,z}              -> bli_caxmyris()
// NOTE(review): presumably the real-domain targets exist so that only the
// parts of y that can change are touched -- confirm against the *axmyris
// definitions.
#define bli_sscaxmys( a, x, y )  bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxmys( a, x, y )  bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxmys( a, x, y )  bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxmys( a, x, y )  bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcaxmys( a, x, y )  bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxmys( a, x, y )  bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxmys( a, x, y )  bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxmys( a, x, y )  bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccaxmys( a, x, y )  bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxmys( a, x, y )  bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxmys( a, x, y )  bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxmys( a, x, y )  bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcaxmys( a, x, y )  bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxmys( a, x, y )  bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxmys( a, x, y )  bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxmys( a, x, y )  bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

// Same dispatch shape as the (??c) table, with the d/dz/z targets:
//   a in {s,d}, x in {s,d}  -> bli_daxmyris()
//   a in {s,d}, x in {c,z}  -> bli_dzaxmyris()
//   a in {c,z}              -> bli_zaxmyris()
#define bli_sszaxmys( a, x, y )  bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxmys( a, x, y )  bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxmys( a, x, y )  bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxmys( a, x, y )  bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzaxmys( a, x, y )  bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxmys( a, x, y )  bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxmys( a, x, y )  bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxmys( a, x, y )  bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sczaxmys( a, x, y )  bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxmys( a, x, y )  bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxmys( a, x, y )  bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxmys( a, x, y )  bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzaxmys( a, x, y )  bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxmys( a, x, y )  bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxmys( a, x, y )  bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxmys( a, x, y )  bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

// With C99 complex enabled every variant is the same compound assignment;
// the operand types alone determine the arithmetic that is performed.
#define bli_sscaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_dscaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_cscaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_zscaxmys( a, x, y )  { (y) -= (a) * (x); }

#define bli_sdcaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_ddcaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_cdcaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_zdcaxmys( a, x, y )  { (y) -= (a) * (x); }

#define bli_sccaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_dccaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_cccaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_zccaxmys( a, x, y )  { (y) -= (a) * (x); }

#define bli_szcaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_dzcaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_czcaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_zzcaxmys( a, x, y )  { (y) -= (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_dszaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_cszaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_zszaxmys( a, x, y )  { (y) -= (a) * (x); }

#define bli_sdzaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_ddzaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_cdzaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_zdzaxmys( a, x, y )  { (y) -= (a) * (x); }

#define bli_sczaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_dczaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_cczaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_zczaxmys( a, x, y )  { (y) -= (a) * (x); }

#define bli_szzaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_dzzaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_czzaxmys( a, x, y )  { (y) -= (a) * (x); }
#define bli_zzzaxmys( a, x, y )  { (y) -= (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX


// Single-letter shorthands: all three operands share one type.
#define bli_saxmys( a, x, y )  bli_sssaxmys( a, x, y )
#define bli_daxmys( a, x, y )  bli_dddaxmys( a, x, y )
#define bli_caxmys( a, x, y )  bli_cccaxmys( a, x, y )
#define bli_zaxmys( a, x, y )  bli_zzzaxmys( a, x, y )

#endif

// end bli_axmys.h
// begin bli_conjs.h


#ifndef BLIS_CONJS_H
#define BLIS_CONJS_H

// conjs

// The 's' and 'd' variants always forward to bli_?conjris() (not defined in
// this part of the file).
#define bli_sconjs( x )  bli_sconjris( bli_sreal(x), bli_simag(x) )
#define bli_dconjs( x )  bli_dconjris( bli_dreal(x), bli_dimag(x) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_cconjs( x )  bli_cconjris( bli_creal(x), bli_cimag(x) )
#define bli_zconjs( x )  bli_zconjris( bli_zreal(x), bli_zimag(x) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 branch: x is overwritten in place with conjf(x) / conj(x).
#define bli_cconjs( x )  { (x) = conjf(x); }
#define bli_zconjs( x )  { (x) = conj (x); }

#endif // BLIS_ENABLE_C99_COMPLEX


#endif

// end bli_conjs.h
// begin bli_copys.h


#ifndef BLIS_COPYS_H
#define BLIS_COPYS_H

// copys

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// y of type 's': every source type goes through bli_scopyris().
#define bli_sscopys( x, y ) \
    bli_scopyris( bli_sreal(x), bli_simag(x), \
                  bli_sreal(y), bli_simag(y) )
#define bli_dscopys( x, y ) \
    bli_scopyris( bli_dreal(x), bli_dimag(x), \
                  bli_sreal(y), bli_simag(y) )
#define bli_cscopys( x, y ) \
    bli_scopyris( bli_creal(x), bli_cimag(x), \
                  bli_sreal(y), bli_simag(y) )
#define bli_zscopys( x, y ) \
    bli_scopyris( bli_zreal(x), bli_zimag(x), \
                  bli_sreal(y), bli_simag(y) )

// y of type 'd': every source type goes through bli_dcopyris().
#define bli_sdcopys( x, y ) \
    bli_dcopyris( bli_sreal(x), bli_simag(x), \
                  bli_dreal(y), bli_dimag(y) )
#define bli_ddcopys( x, y ) \
    bli_dcopyris( bli_dreal(x), bli_dimag(x), \
                  bli_dreal(y), bli_dimag(y) )
#define bli_cdcopys( x, y ) \
    bli_dcopyris( bli_creal(x), bli_cimag(x), \
                  bli_dreal(y), bli_dimag(y) )
#define bli_zdcopys( x, y ) \
    bli_dcopyris( bli_zreal(x), bli_zimag(x), \
                  bli_dreal(y), bli_dimag(y) )

// y of type 'c': bli_ccopyris() is used even for the real source types, which
// means the imaginary part of y will be overwritten with zero in those cases.
#define bli_sccopys( x, y ) \
    bli_ccopyris( bli_sreal(x), bli_simag(x), \
                  bli_creal(y), bli_cimag(y) )
#define bli_dccopys( x, y ) \
    bli_ccopyris( bli_dreal(x), bli_dimag(x), \
                  bli_creal(y), bli_cimag(y) )
#define bli_cccopys( x, y ) \
    bli_ccopyris( bli_creal(x), bli_cimag(x), \
                  bli_creal(y), bli_cimag(y) )
#define bli_zccopys( x, y ) \
    bli_ccopyris( bli_zreal(x), bli_zimag(x), \
                  bli_creal(y), bli_cimag(y) )

// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
#define bli_szcopys( x, y )  bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopys( x, y )  bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopys( x, y )  bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopys( x, y )  bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer variant: a plain assignment with a cast to gint_t.
#define bli_iicopys( x, y )  { (y) = ( gint_t ) (x); }

// Single-letter shorthands: x and y share one type.
#define bli_scopys( x, y )  bli_sscopys( x, y )
#define bli_dcopys( x, y )  bli_ddcopys( x, y )
#define bli_ccopys( x, y )  bli_cccopys( x, y )
#define bli_zcopys( x, y )  bli_zzcopys( x, y )
#define bli_icopys( x, y )  bli_iicopys( x, y )

#endif

// end bli_copys.h
// begin bli_copyjs.h


#ifndef BLIS_COPYJS_H
#define BLIS_COPYJS_H

// copyjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

// y of type 's' or 'd': always forwarded to bli_scopyjris() / bli_dcopyjris()
// (not defined in this part of the file).
#define bli_sscopyjs( x, y )  bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopyjs( x, y )  bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopyjs( x, y )  bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopyjs( x, y )  bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdcopyjs( x, y )  bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopyjs( x, y )  bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopyjs( x, y )  bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopyjs( x, y )  bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_sccopyjs( x, y )  bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopyjs( x, y )  bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopyjs( x, y )  bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopyjs( x, y )  bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcopyjs( x, y )  bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopyjs( x, y )  bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopyjs( x, y )  bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopyjs( x, y )  bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 branch: x of type 's'/'d' is assigned as-is; x of type 'c' goes through
// conjf() and x of type 'z' through conj() before the assignment.
#define bli_sccopyjs( x, y )  { (y) =      (x); }
#define bli_dccopyjs( x, y )  { (y) =      (x); }
#define bli_cccopyjs( x, y )  { (y) = conjf(x); }
#define bli_zccopyjs( x, y )  { (y) = conj (x); }

#define bli_szcopyjs( x, y )  { (y) =      (x); }
#define bli_dzcopyjs( x, y )  { (y) =      (x); }
#define bli_czcopyjs( x, y )  { (y) = conjf(x); }
#define bli_zzcopyjs( x, y )  { (y) = conj (x); }

#endif // BLIS_ENABLE_C99_COMPLEX


// Integer variant: a plain assignment with a cast to gint_t.
#define bli_iicopyjs( x, y )  { (y) = ( gint_t ) (x); }

// Single-letter shorthands: x and y share one type.
#define bli_scopyjs( x, y )  bli_sscopyjs( x, y )
#define bli_dcopyjs( x, y )  bli_ddcopyjs( x, y )
#define bli_ccopyjs( x, y )  bli_cccopyjs( x, y )
#define bli_zcopyjs( x, y )  bli_zzcopyjs( x, y )
#define bli_icopyjs( x, y )  bli_iicopyjs( x, y )

#endif

// end bli_copyjs.h
// begin bli_copycjs.h


#ifndef BLIS_COPYCJS_H
#define BLIS_COPYCJS_H

// copycjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Every variant takes a leading conjx argument that is passed through
// unchanged to the bli_?copycjris() macro (not defined in this part of the
// file), followed by the real/imaginary parts of x and y.

// y of type 's'.
#define bli_sscopycjs( conjx, x, y )  bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopycjs( conjx, x, y )  bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopycjs( conjx, x, y )  bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopycjs( conjx, x, y )  bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y of type 'd'.
#define bli_sdcopycjs( conjx, x, y )  bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopycjs( conjx, x, y )  bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopycjs( conjx, x, y )  bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopycjs( conjx, x, y )  bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// y of type 'c' or 'z': two implementations, selected by
// BLIS_ENABLE_C99_COMPLEX. The #else branch continues past this table.
#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_sccopycjs( conjx, x, y )  bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopycjs( conjx, x, y )  bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopycjs( conjx, x, y )  bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopycjs( conjx, x, y )  bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcopycjs( conjx, x, y )  bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopycjs( conjx, x, y )  bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopycjs( conjx, x, y )  bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopycjs( conjx, x, y )  bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 branch: for x of type 's' conjx is ignored and x is assigned directly.
#define bli_sccopycjs( conjx, x, y )  { (y) = (x); }
// C99-complex variants of copycjs. For x of type 's'/'d' the conjx argument
// is unused and x is assigned directly; for x of type 'c'/'z' the value is
// conjf(x) / conj(x) when bli_is_conj( conjx ) is true and x otherwise.
#define bli_dccopycjs( conjx, x, y )  { (y) = (x); }
#define bli_cccopycjs( conjx, x, y )  { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zccopycjs( conjx, x, y )  { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#define bli_szcopycjs( conjx, x, y )  { (y) = (x); }
#define bli_dzcopycjs( conjx, x, y )  { (y) = (x); }
#define bli_czcopycjs( conjx, x, y )  { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzcopycjs( conjx, x, y )  { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#endif // BLIS_ENABLE_C99_COMPLEX


// Integer variant: conjx is ignored; plain assignment with a cast to gint_t.
#define bli_iicopycjs( conjx, x, y )  { (y) = ( gint_t ) (x); }

// Single-letter shorthands: x and y share one type.
#define bli_scopycjs( conjx, x, y )  bli_sscopycjs( conjx, x, y )
#define bli_dcopycjs( conjx, x, y )  bli_ddcopycjs( conjx, x, y )
#define bli_ccopycjs( conjx, x, y )  bli_cccopycjs( conjx, x, y )
#define bli_zcopycjs( conjx, x, y )  bli_zzcopycjs( conjx, x, y )
#define bli_icopycjs( conjx, x, y )  bli_iicopycjs( conjx, x, y )

#endif

// end bli_copycjs.h
// begin bli_copynzs.h


#ifndef BLIS_COPYNZS_H
#define BLIS_COPYNZS_H

// copynzs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// y of type 's': every source type goes through bli_scopyris().
#define bli_sscopynzs( x, y ) \
    bli_scopyris( bli_sreal(x), bli_simag(x), \
                  bli_sreal(y), bli_simag(y) )
#define bli_dscopynzs( x, y ) \
    bli_scopyris( bli_dreal(x), bli_dimag(x), \
                  bli_sreal(y), bli_simag(y) )
#define bli_cscopynzs( x, y ) \
    bli_scopyris( bli_creal(x), bli_cimag(x), \
                  bli_sreal(y), bli_simag(y) )
#define bli_zscopynzs( x, y ) \
    bli_scopyris( bli_zreal(x), bli_zimag(x), \
                  bli_sreal(y), bli_simag(y) )

// y of type 'd': every source type goes through bli_dcopyris().
#define bli_sdcopynzs( x, y ) \
    bli_dcopyris( bli_sreal(x), bli_simag(x), \
                  bli_dreal(y), bli_dimag(y) )
#define bli_ddcopynzs( x, y ) \
    bli_dcopyris( bli_dreal(x), bli_dimag(x), \
                  bli_dreal(y), bli_dimag(y) )
#define bli_cdcopynzs( x, y ) \
    bli_dcopyris( bli_creal(x), bli_cimag(x), \
                  bli_dreal(y), bli_dimag(y) )
#define bli_zdcopynzs( x, y ) \
    bli_dcopyris( bli_zreal(x), bli_zimag(x), \
                  bli_dreal(y), bli_dimag(y) )

// y of type 'c'. For x of type 's'/'d' the real-domain bli_scopyris() is used
// so that the imaginary part of y is not touched; x of type 'c'/'z' uses
// bli_ccopyris().
#define bli_sccopynzs( x, y ) \
    bli_scopyris( bli_sreal(x), bli_simag(x), \
                  bli_creal(y), bli_cimag(y) )
#define bli_dccopynzs( x, y ) \
    bli_scopyris( bli_dreal(x), bli_dimag(x), \
                  bli_creal(y), bli_cimag(y) )
#define bli_cccopynzs( x, y ) \
    bli_ccopyris( bli_creal(x), bli_cimag(x), \
                  bli_creal(y), bli_cimag(y) )
#define bli_zccopynzs( x, y ) \
    bli_ccopyris( bli_zreal(x), bli_zimag(x), \
                  bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
#define bli_szcopynzs( x, y )  bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopynzs( x, y )  bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopynzs( x, y )  bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopynzs( x, y )  bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer variant: a plain assignment with a cast to gint_t.
#define bli_iicopynzs( x, y )  { (y) = ( gint_t ) (x); }

// Single-letter shorthands: x and y share one type.
#define bli_scopynzs( x, y )  bli_sscopynzs( x, y )
#define bli_dcopynzs( x, y )  bli_ddcopynzs( x, y )
#define bli_ccopynzs( x, y )  bli_cccopynzs( x, y )
#define bli_zcopynzs( x, y )  bli_zzcopynzs( x, y )
#define bli_icopynzs( x, y )  bli_iicopynzs( x, y )

#endif

// end bli_copynzs.h
// begin bli_copyjnzs.h


#ifndef BLIS_COPYJNZS_H
#define BLIS_COPYJNZS_H

// copyjnzs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

// y of type 's' or 'd': forwarded to bli_scopyjris() / bli_dcopyjris() (not
// defined in this part of the file) for every source type.
#define bli_sscopyjnzs( x, y )  bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopyjnzs( x, y )  bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopyjnzs( x, y )  bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopyjnzs( x, y )  bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdcopyjnzs( x, y )  bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopyjnzs( x, y )  bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopyjnzs( x, y )  bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopyjnzs( x, y )  bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we
// don't touch the imaginary part of y.
#define bli_sccopyjnzs( x, y )  bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopyjnzs( x, y )  bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopyjnzs( x, y )  bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopyjnzs( x, y )  bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we
// don't touch the imaginary part of y.
#define bli_szcopyjnzs( x, y )  bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopyjnzs( x, y )  bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopyjnzs( x, y )  bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopyjnzs( x, y )  bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer variant: a plain assignment with a cast to gint_t.
#define bli_iicopyjnzs( x, y )  { (y) = ( gint_t ) (x); }

// Single-letter shorthands: x and y share one type.
#define bli_scopyjnzs( x, y )  bli_sscopyjnzs( x, y )
#define bli_dcopyjnzs( x, y )  bli_ddcopyjnzs( x, y )
#define bli_ccopyjnzs( x, y )  bli_cccopyjnzs( x, y )
#define bli_zcopyjnzs( x, y )  bli_zzcopyjnzs( x, y )
#define bli_icopyjnzs( x, y )  bli_iicopyjnzs( x, y )

#endif

// end bli_copyjnzs.h
// begin bli_dots.h


#ifndef BLIS_DOTS_H
#define BLIS_DOTS_H

// dots

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - The third char encodes the type of rho.
// Each bli_<x><y><rho>dots( x, y, a ) macro is an alias for the axpys macro
// with the same three type letters; the arguments are passed through in the
// same order, so the third argument (named `a` here) is the operand that
// bli_???axpys() receives in its third position.

// -- rho of type 's' --
#define bli_sssdots( x, y, a )  bli_sssaxpys( x, y, a )
#define bli_dssdots( x, y, a )  bli_dssaxpys( x, y, a )
#define bli_cssdots( x, y, a )  bli_cssaxpys( x, y, a )
#define bli_zssdots( x, y, a )  bli_zssaxpys( x, y, a )

#define bli_sdsdots( x, y, a )  bli_sdsaxpys( x, y, a )
#define bli_ddsdots( x, y, a )  bli_ddsaxpys( x, y, a )
#define bli_cdsdots( x, y, a )  bli_cdsaxpys( x, y, a )
#define bli_zdsdots( x, y, a )  bli_zdsaxpys( x, y, a )

#define bli_scsdots( x, y, a )  bli_scsaxpys( x, y, a )
#define bli_dcsdots( x, y, a )  bli_dcsaxpys( x, y, a )
#define bli_ccsdots( x, y, a )  bli_ccsaxpys( x, y, a )
#define bli_zcsdots( x, y, a )  bli_zcsaxpys( x, y, a )

#define bli_szsdots( x, y, a )  bli_szsaxpys( x, y, a )
#define bli_dzsdots( x, y, a )  bli_dzsaxpys( x, y, a )
#define bli_czsdots( x, y, a )  bli_czsaxpys( x, y, a )
#define bli_zzsdots( x, y, a )  bli_zzsaxpys( x, y, a )

// -- rho of type 'd' --
#define bli_ssddots( x, y, a )  bli_ssdaxpys( x, y, a )
#define bli_dsddots( x, y, a )  bli_dsdaxpys( x, y, a )
#define bli_csddots( x, y, a )  bli_csdaxpys( x, y, a )
#define bli_zsddots( x, y, a )  bli_zsdaxpys( x, y, a )

#define bli_sdddots( x, y, a )  bli_sddaxpys( x, y, a )
#define bli_ddddots( x, y, a )  bli_dddaxpys( x, y, a )
#define bli_cdddots( x, y, a )  bli_cddaxpys( x, y, a )
#define bli_zdddots( x, y, a )  bli_zddaxpys( x, y, a )

#define bli_scddots( x, y, a )  bli_scdaxpys( x, y, a )
#define bli_dcddots( x, y, a )  bli_dcdaxpys( x, y, a )
#define bli_ccddots( x, y, a )  bli_ccdaxpys( x, y, a )
#define bli_zcddots( x, y, a )  bli_zcdaxpys( x, y, a )

#define bli_szddots( x, y, a )  bli_szdaxpys( x, y, a )
#define bli_dzddots( x, y, a )  bli_dzdaxpys( x, y, a )
#define bli_czddots( x, y, a )  bli_czdaxpys( x, y, a )
#define bli_zzddots( x, y, a )  bli_zzdaxpys( x, y, a )

// -- rho of type 'c' --
#define bli_sscdots( x, y, a )  bli_sscaxpys( x, y, a )
#define bli_dscdots( x, y, a )  bli_dscaxpys( x, y, a )
#define bli_cscdots( x, y, a )  bli_cscaxpys( x, y, a )
#define bli_zscdots( x, y, a )  bli_zscaxpys( x, y, a )

#define bli_sdcdots( x, y, a )  bli_sdcaxpys( x, y, a )
#define bli_ddcdots( x, y, a )  bli_ddcaxpys( x, y, a )
#define bli_cdcdots( x, y, a )  bli_cdcaxpys( x, y, a )
#define bli_zdcdots( x, y, a )  bli_zdcaxpys( x, y, a )

#define bli_sccdots( x, y, a )  bli_sccaxpys( x, y, a )
#define bli_dccdots( x, y, a )  bli_dccaxpys( x, y, a )
#define bli_cccdots( x, y, a )  bli_cccaxpys( x, y, a )
#define bli_zccdots( x, y, a )  bli_zccaxpys( x, y, a )

#define bli_szcdots( x, y, a )  bli_szcaxpys( x, y, a )
#define bli_dzcdots( x, y, a )  bli_dzcaxpys( x, y, a )
#define bli_czcdots( x, y, a )  bli_czcaxpys( x, y, a )
#define bli_zzcdots( x, y, a )  bli_zzcaxpys( x, y, a )

// -- rho of type 'z' --
#define bli_sszdots( x, y, a )  bli_sszaxpys( x, y, a )
#define bli_dszdots( x, y, a )  bli_dszaxpys( x, y, a )
#define bli_cszdots( x, y, a )  bli_cszaxpys( x, y, a )
#define bli_zszdots( x, y, a )  bli_zszaxpys( x, y, a )

#define bli_sdzdots( x, y, a )  bli_sdzaxpys( x, y, a )
#define bli_ddzdots( x, y, a )  bli_ddzaxpys( x, y, a )
#define bli_cdzdots( x, y, a )  bli_cdzaxpys( x, y, a )
#define bli_zdzdots( x, y, a )  bli_zdzaxpys( x, y, a )

#define bli_sczdots( x, y, a )  bli_sczaxpys( x, y, a )
#define bli_dczdots( x, y, a )  bli_dczaxpys( x, y, a )
#define bli_cczdots( x, y, a )  bli_cczaxpys( x, y, a )
#define bli_zczdots( x, y, a )  bli_zczaxpys( x, y, a )

#define bli_szzdots( x, y, a )  bli_szzaxpys( x, y, a )
#define bli_dzzdots( x, y, a )  bli_dzzaxpys( x, y, a )
#define bli_czzdots( x, y, a )  bli_czzaxpys( x, y, a )
#define bli_zzzdots( x, y, a )  bli_zzzaxpys( x, y, a )


// Single-letter shorthands: all three operands share one type.
#define bli_sdots( x, y, a )  bli_sssdots( x, y, a )
#define bli_ddots( x, y, a )  bli_ddddots( x, y, a )
#define bli_cdots( x, y, a )  bli_cccdots( x, y, a )
#define bli_zdots( x, y, a )  bli_zzzdots( x, y, a )

#endif

// end bli_dots.h
// begin bli_dotjs.h


#ifndef BLIS_DOTJS_H
#define BLIS_DOTJS_H

// dotjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - The third char encodes the type of rho.
// - x is used in conjugated form. #define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a ) #define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a ) #define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a ) #define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a ) #define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a ) #define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a ) #define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a ) #define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a ) #define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a ) #define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a ) #define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a ) #define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a ) #define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a ) #define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a ) #define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a ) #define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a ) #define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a ) #define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a ) #define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a ) #define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a ) #define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a ) #define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a ) #define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a ) #define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a ) #define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a ) #define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a ) #define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a ) #define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a ) #define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a ) #define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a ) #define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a ) #define bli_zzddotjs( x, y, a ) bli_zzdaxpyjs( y, x, a ) #define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a ) #define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a ) #define bli_cscdotjs( x, 
y, a ) bli_sccaxpyjs( y, x, a ) #define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a ) #define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a ) #define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a ) #define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a ) #define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a ) #define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a ) #define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a ) #define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a ) #define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a ) #define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a ) #define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a ) #define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a ) #define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a ) #define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a ) #define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a ) #define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a ) #define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a ) #define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a ) #define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a ) #define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a ) #define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a ) #define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a ) #define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a ) #define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a ) #define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a ) #define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a ) #define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a ) #define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a ) #define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a ) #define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a ) #define bli_ddotjs( x, y, a ) bli_ddddotjs( x, y, a ) #define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a ) #define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a ) #endif // end bli_dotjs.h // begin bli_eq.h #ifndef BLIS_EQ_H #define BLIS_EQ_H // eq (passed by 
value) #define bli_seq( a, b ) ( (a) == (b) ) #define bli_deq( a, b ) ( (a) == (b) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) ) #define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ceq( a, b ) ( (a) == (b) ) #define bli_zeq( a, b ) ( (a) == (b) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ieq( a, b ) ( (a) == (b) ) // eqtori (passed by value) #define bli_seqtori( a, br, bi ) ( (a) == (br) ) #define bli_deqtori( a, br, bi ) ( (a) == (br) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) ) #define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) ) #define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) ) #endif // BLIS_ENABLE_C99_COMPLEX // eqa (passed by address) #define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) ) #define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) ) #define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) ) #define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) ) #define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) ) // eq1 #define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F ) #define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 ) #define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F ) #define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 ) #define bli_ieq1( a ) bli_ieq ( (a), 1 ) // eq0 #define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F ) #define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 ) #define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F ) #define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 ) #define bli_ieq0( a ) bli_ieq ( (a), 0 ) // eqm1 #define bli_seqm1( a ) bli_seqtori( (a), 
-1.0F, 0.0F ) #define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 ) #define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F ) #define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 ) #define bli_ieqm1( a ) bli_ieq ( (a), -1 ) #endif // end bli_eq.h // begin bli_fprints.h #ifndef BLIS_FPRINTS_H #define BLIS_FPRINTS_H // prints #define bli_sfprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #define bli_dfprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #define bli_cfprints( file, spec, x ) \ { \ fprintf( file, spec, bli_creal(x) ); \ fprintf( file, " + " ); \ fprintf( file, spec, bli_cimag(x) ); \ fprintf( file, " " ); \ } #define bli_zfprints( file, spec, x ) \ { \ fprintf( file, spec, bli_zreal(x) ); \ fprintf( file, " + " ); \ fprintf( file, spec, bli_zimag(x) ); \ fprintf( file, " " ); \ } #define bli_ifprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #endif // end bli_fprints.h // begin bli_inverts.h #ifndef BLIS_INVERTS_H #define BLIS_INVERTS_H // inverts // Notes: // - The first char encodes the type of x. #define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) ) #define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) ) #define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cinverts( x ) { (x) = 1.0F / (x); } #define bli_zinverts( x ) { (x) = 1.0 / (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_inverts.h // begin bli_invscals.h #ifndef BLIS_INVSCALS_H #define BLIS_INVSCALS_H // invscals // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. 
#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcinvscals( a, y ) bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscals( a, y ) { (y) /= (a); } #define bli_dcinvscals( a, y ) { (y) /= (a); } #define bli_ccinvscals( a, y ) { (y) /= (a); } #define bli_zcinvscals( a, y ) { (y) /= (a); } #define bli_szinvscals( a, y ) { (y) /= (a); } #define 
bli_dzinvscals( a, y ) { (y) /= (a); } #define bli_czinvscals( a, y ) { (y) /= (a); } #define bli_zzinvscals( a, y ) { (y) /= (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sinvscals( a, y ) bli_ssinvscals( a, y ) #define bli_dinvscals( a, y ) bli_ddinvscals( a, y ) #define bli_cinvscals( a, y ) bli_ccinvscals( a, y ) #define bli_zinvscals( a, y ) bli_zzinvscals( a, y ) #endif // end bli_invscals.h // begin bli_invscaljs.h #ifndef BLIS_INVSCALJS_H #define BLIS_INVSCALJS_H // invscaljs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), 
bli_zreal(y), bli_zimag(y) ) #define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscaljs( a, y ) { (y) /= (a); } #define bli_dcinvscaljs( a, y ) { (y) /= (a); } #define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); } #define bli_zcinvscaljs( a, y ) { (y) /= conj (a); } #define bli_szinvscaljs( a, y ) { (y) /= (a); } #define bli_dzinvscaljs( a, y ) { (y) /= (a); } #define bli_czinvscaljs( a, y ) { (y) /= conjf(a); } #define bli_zzinvscaljs( a, y ) { (y) /= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y ) #define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y ) #define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y ) #define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y ) #endif // end bli_invscaljs.h // begin bli_neg2s.h #ifndef BLIS_NEG2S_H #define BLIS_NEG2S_H // neg2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) { (y) = -(x); } #define bli_dcneg2s( x, y ) { (y) = -(x); } #define bli_ccneg2s( x, y ) { (y) = -(x); } #define bli_zcneg2s( x, y ) { (y) = -(x); } #define bli_szneg2s( x, y ) { (y) = -(x); } #define bli_dzneg2s( x, y ) { (y) = -(x); } #define bli_czneg2s( x, y ) { (y) = -(x); } #define bli_zzneg2s( x, y ) { (y) = 
-(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sneg2s( x, y ) bli_ssneg2s( x, y ) #define bli_dneg2s( x, y ) bli_ddneg2s( x, y ) #define bli_cneg2s( x, y ) bli_ccneg2s( x, y ) #define bli_zneg2s( x, y ) bli_zzneg2s( x, y ) #endif // end bli_neg2s.h // begin bli_rands.h #ifndef BLIS_RANDS_H #define BLIS_RANDS_H // rands #define bli_srands( a ) \ { \ (a) = ( float ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0F; \ } #define bli_drands( a ) \ { \ (a) = ( double ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0; \ } #define bli_crands( a ) \ { \ float ar, ai; \ \ bli_srands( ar ); \ bli_srands( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrands( a ) \ { \ double ar, ai; \ \ bli_drands( ar ); \ bli_drands( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_rands.h // begin bli_randnp2s.h #ifndef BLIS_RANDNP2S_H #define BLIS_RANDNP2S_H // randnp2s #define bli_srandnp2s( a ) \ { \ bli_drandnp2s( a ); \ } #if 0 #define bli_drandnp2s_prev( a ) \ { \ const double m_max = 3.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ if ( t == m_max2 ) t = t - 1.0; \ \ \ t = floor( t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_exp, s_val; \ \ \ PASTEMAC(d,rands)( s_exp ); \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \ else r_val = pow( 2.0, t - 1.0 ); \ \ \ if ( s_val < 0.0 ) r_val = -r_val; \ } \ \ \ r_val = r_val / pow( 2.0, m_max ); \ \ \ \ a = r_val; \ } #endif #define bli_drandnp2s( a ) \ { \ const double m_max = 6.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ do \ { \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ t = floor( t ); \ } \ \ while ( m_max2 <= t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_val; \ \ \ r_val = pow( 2.0, -(t - 1.0) ); \ \ \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_val < 0.0 ) r_val 
= -r_val; \ } \ \ \ \ a = r_val; \ } #define bli_crandnp2s( a ) \ { \ float ar, ai; \ \ bli_srandnp2s( ar ); \ bli_srandnp2s( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrandnp2s( a ) \ { \ double ar, ai; \ \ bli_drandnp2s( ar ); \ bli_drandnp2s( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_randnp2s.h // begin bli_scals.h #ifndef BLIS_SCALS_H #define BLIS_SCALS_H // scals // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), 
bli_zimag(y) ) #define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) { (y) *= (a); } #define bli_dcscals( a, y ) { (y) *= (a); } #define bli_ccscals( a, y ) { (y) *= (a); } #define bli_zcscals( a, y ) { (y) *= (a); } #define bli_szscals( a, y ) { (y) *= (a); } #define bli_dzscals( a, y ) { (y) *= (a); } #define bli_czscals( a, y ) { (y) *= (a); } #define bli_zzscals( a, y ) { (y) *= (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscals( a, y ) bli_ssscals( a, y ) #define bli_dscals( a, y ) bli_ddscals( a, y ) #define bli_cscals( a, y ) bli_ccscals( a, y ) #define bli_zscals( a, y ) bli_zzscals( a, y ) #endif // end bli_scals.h // begin bli_scaljs.h #ifndef BLIS_SCALJS_H #define BLIS_SCALJS_H // scaljs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscaljs( a, y ) 
bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) { (y) *= (a); } #define bli_dcscaljs( a, y ) { (y) *= (a); } #define bli_ccscaljs( a, y ) { (y) *= conjf(a); } #define bli_zcscaljs( a, y ) { (y) *= conj (a); } #define bli_szscaljs( a, y ) { (y) *= (a); } #define bli_dzscaljs( a, y ) { (y) *= (a); } #define bli_czscaljs( a, y ) { (y) *= conjf(a); } #define bli_zzscaljs( a, y ) { (y) *= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscaljs( a, y ) bli_ssscaljs( a, y ) #define bli_dscaljs( a, y ) bli_ddscaljs( a, y ) #define bli_cscaljs( a, y ) bli_ccscaljs( a, y ) #define bli_zscaljs( a, y ) bli_zzscaljs( a, y ) #endif // end bli_scaljs.h // begin bli_scalcjs.h #ifndef BLIS_SCALCJS_H #define BLIS_SCALCJS_H // scalcjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscalcjs( conjx, x, y ) { (y) *= (x); } 
#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); } #define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #define bli_szscalcjs( conjx, x, y ) { (y) *= (x); } #define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); } #define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y ) #define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y ) #define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y ) #define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y ) #endif // end bli_scalcjs.h // begin bli_scal2s.h #ifndef BLIS_SCAL2S_H #define BLIS_SCAL2S_H // scal2s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsscal2s( a, x, y ) 
bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_czcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y ) #define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y ) #define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y ) #define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y ) #endif // end bli_scal2s.h // begin bli_scal2js.h #ifndef BLIS_SCAL2JS_H #define BLIS_SCAL2JS_H // scal2js // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// scal2js macro table: real outputs (y of type s/d) for every build, followed
// by the struct-based complex outputs (y of type c/z) that are only compiled
// when BLIS_ENABLE_C99_COMPLEX is not defined.
//
// Every row has the same shape: a, x and y are each split into a real and an
// imaginary part with the bli_?real()/bli_?imag() accessor that matches the
// operand's type char, and the six parts are forwarded to a *scal2jris helper.
// Only the helper prefix differs between rows:
//   y real (s/d):    rx, except ro when both a and x are complex (c/z).
//   y complex (c/z): rx when a and x are both real, rc when only a is complex,
//                    cr when only x is complex, cx when both are complex.
// NOTE(review): the *scal2jris helpers are defined outside this section, so
// what each prefix computes cannot be confirmed here. The C99 variants of
// these macros conjugate a complex x, so the "j" presumably denotes that --
// confirm against the helper definitions.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// The complex-output rows exist twice: this branch goes through the part-wise
// helpers; the #else branch further down uses plain complex expressions.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
// scal2js, struct-based branch, y of type z with complex x (c/z): the real and
// imaginary parts of a, x and y are forwarded to a *scal2jris helper -- cr
// when a is real (s/d), cx when a is complex (c/z).
#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 complex build: y is overwritten with the product a * x, written as a
// plain expression. x goes through conjf() when its type char is c and through
// conj() when it is z; x of type s/d is used unchanged.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); }

#define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); }

#define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: bli_?scal2js expands to the variant whose three type
// chars are all equal.
#define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y )
#define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y )
#define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y )
#define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y )

#endif
// end bli_scal2js.h
// begin bli_set0s.h


#ifndef BLIS_SET0S_H
#define BLIS_SET0S_H

// set0s: each macro forwards two zero literals (F-suffixed for s/c, unsuffixed
// for d/z) followed by the operand to the matching bli_?sets macro.
// NOTE(review): presumably the two literals are the real and imaginary parts
// to store -- confirm against bli_?sets, which is defined outside this section.
#define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) )
#define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) )
#define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) )
#define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) )

#endif
// end bli_set0s.h
// begin bli_set1s.h


#ifndef BLIS_SET1S_H
#define BLIS_SET1S_H

// set1s: each macro forwards the literals one and zero (F-suffixed for s/c,
// unsuffixed for d/z) followed by the operand to the matching bli_?sets macro.
#define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) )
#define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) )
#define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) )
#define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) )
// Closes the BLIS_SET1S_H include guard.
#endif
// end bli_set1s.h
// begin bli_seti0s.h


#ifndef BLIS_SETI0S_H
#define BLIS_SETI0S_H

// seti0s: each macro forwards a single zero literal (F-suffixed for s/c,
// unsuffixed for d/z) followed by the operand to the matching bli_?setis macro.
// NOTE(review): presumably this zeroes only the imaginary part -- confirm
// against bli_?setis, which is defined outside this section.
#define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) )
#define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) )
#define bli_cseti0s( a ) bli_csetis( 0.0F, (a) )
#define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) )

#endif
// end bli_seti0s.h
// begin bli_sqrt2s.h


#ifndef BLIS_SQRT2S_H
#define BLIS_SQRT2S_H

// sqrt2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct-based branch: the real and imaginary parts of x and a are forwarded
// to a *sqrt2ris helper. The helper is selected by the type of a, and for a
// complex a also by whether x is real:
//   a is s -> bli_ssqrt2ris          a is d -> bli_dsqrt2ris
//   a is c -> bli_scsqrt2ris (x real) or bli_csqrt2ris (x complex)
//   a is z -> bli_dzsqrt2ris (x real) or bli_zsqrt2ris (x complex)
// The helpers themselves are defined outside this section.
#define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) )
#define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) )
#define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) )
#define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) )

#define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) )
#define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) )

#define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 complex build: a is assigned the square root of x, cast to the type
// named by a's type char (float, double, scomplex, dcomplex). The root
// function follows x's type char: sqrtf (s), sqrt (d), csqrtf (c), csqrt (z).
// When x is complex and a is real, bli_creal()/bli_zreal() is applied to the
// complex root before the cast.
#define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; }
#define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; }
#define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); }
#define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); }

#define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; }
#define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; }
#define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); }
#define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); }

#define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; }
#define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; }
#define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; }
#define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; }

#define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; }
#define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; }
#define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; }
#define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: bli_?sqrt2s expands to the variant whose two type
// chars are equal.
#define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a )
#define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a )
#define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a )
#define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a )

#endif
// end bli_sqrt2s.h
// begin bli_subs.h


#ifndef BLIS_SUBS_H
#define BLIS_SUBS_H

// subs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// subs, real y (s/d): compiled in every build. The real and imaginary parts of
// a and y are forwarded to bli_ssubris (y of type s) or bli_dsubris (y of
// type d); both helpers are defined outside this section.
#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// subs, complex y (c/z), struct-based branch: same forwarding, to bli_csubris
// or bli_zsubris according to the type of y.
#define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// subs, complex y (c/z), C99 branch: a is subtracted from y in place.
#define bli_scsubs( a, y ) { (y) -= (a); }
#define bli_dcsubs( a, y ) { (y) -= (a); }
#define bli_ccsubs( a, y ) { (y) -= (a); }
#define bli_zcsubs( a, y ) { (y) -= (a); }

#define bli_szsubs( a, y ) { (y) -= (a); }
#define bli_dzsubs( a, y ) { (y) -= (a); }
#define bli_czsubs( a, y ) { (y) -= (a); }
#define bli_zzsubs( a, y ) { (y) -= (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: bli_?subs expands to the variant whose two type chars
// are equal.
#define bli_ssubs( a, y ) bli_sssubs( a, y )
#define bli_dsubs( a, y ) bli_ddsubs( a, y )
#define bli_csubs( a, y ) bli_ccsubs( a, y )
#define bli_zsubs( a, y ) bli_zzsubs( a, y )

#endif
// end bli_subs.h
// begin bli_subjs.h


#ifndef BLIS_SUBJS_H
#define BLIS_SUBJS_H

// subjs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// subjs, real y (s/d): compiled in every build. The real and imaginary parts
// of a and y are forwarded to bli_ssubjris or bli_dsubjris according to the
// type of y; both helpers are defined outside this section.
#define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// subjs, complex y (c/z), struct-based branch: same forwarding, to
// bli_csubjris or bli_zsubjris according to the type of y.
#define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czsubjs( a, y ) bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// subjs, complex y (c/z), C99 branch: the conjugate of a is subtracted from y
// in place -- conjf() for a of type c, conj() for a of type z. A real a (s/d)
// is subtracted unchanged.
#define bli_scsubjs( a, y ) { (y) -= (a); }
#define bli_dcsubjs( a, y ) { (y) -= (a); }
#define bli_ccsubjs( a, y ) { (y) -= conjf(a); }
#define bli_zcsubjs( a, y ) { (y) -= conj (a); }

#define bli_szsubjs( a, y ) { (y) -= (a); }
#define bli_dzsubjs( a, y ) { (y) -= (a); }
#define bli_czsubjs( a, y ) { (y) -= conjf(a); }
#define bli_zzsubjs( a, y ) { (y) -= conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: bli_?subjs expands to the variant whose two type chars
// are equal.
#define bli_ssubjs( a, y ) bli_sssubjs( a, y )
#define bli_dsubjs( a, y ) bli_ddsubjs( a, y )
#define bli_csubjs( a, y ) bli_ccsubjs( a, y )
#define bli_zsubjs( a, y ) bli_zzsubjs( a, y )

#endif
// end bli_subjs.h
// begin bli_swaps.h


#ifndef BLIS_SWAPS_H
#define BLIS_SWAPS_H

// swaps

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

// Every swap macro below expands to a brace-enclosed block that declares a
// temporary w and performs three copies: y into w, x into y, then w into x.
// The temporary has the type named by x's type char (float, double, scomplex
// or dcomplex), and each bli_??copys call is the variant whose two type chars
// match its (source, destination) pair.
// NOTE(review): the temporary is literally named w, so an argument expression
// that mentions a caller variable named w would bind to the macro's local
// instead of the caller's variable.

#define bli_ssswaps( x, y ) \
{ \
	float w; \
	bli_sscopys( (y), (w) ); \
	bli_sscopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_dsswaps( x, y ) \
{ \
	double w; \
	bli_sdcopys( (y), (w) ); \
	bli_dscopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_csswaps( x, y ) \
{ \
	scomplex w; \
	bli_sccopys( (y), (w) ); \
	bli_cscopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zsswaps( x, y ) \
{ \
	dcomplex w; \
	bli_szcopys( (y), (w) ); \
	bli_zscopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

#define bli_sdswaps( x, y ) \
{ \
	float w; \
	bli_dscopys( (y), (w) ); \
	bli_sdcopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_ddswaps( x, y ) \
{ \
	double w; \
	bli_ddcopys( (y), (w) ); \
	bli_ddcopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_cdswaps( x, y ) \
{ \
	scomplex w; \
	bli_dccopys( (y), (w) ); \
	bli_cdcopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zdswaps( x, y ) \
{ \
	dcomplex w; \
	bli_dzcopys( (y), (w) ); \
	bli_zdcopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

#define bli_scswaps( x, y ) \
{ \
	float w; \
	bli_cscopys( (y), (w) ); \
	bli_sccopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_dcswaps( x, y ) \
{ \
	double w; \
	bli_cdcopys( (y), (w) ); \
	bli_dccopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_ccswaps( x, y ) \
{ \
	scomplex w; \
	bli_cccopys( (y), (w) ); \
	bli_cccopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zcswaps( x, y ) \
{ \
	dcomplex w; \
	bli_czcopys( (y), (w) ); \
	bli_zccopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

#define bli_szswaps( x, y ) \
{ \
	float w; \
	bli_zscopys( (y), (w) ); \
	bli_szcopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_dzswaps( x, y ) \
{ \
	double w; \
	bli_zdcopys( (y), (w) ); \
	bli_dzcopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_czswaps( x, y ) \
{ \
	scomplex w; \
	bli_zccopys( (y), (w) ); \
	bli_czcopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zzswaps( x, y ) \
{ \
	dcomplex w; \
	bli_zzcopys( (y), (w) ); \
	bli_zzcopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

// Same-type shorthands: bli_?swaps expands to the variant whose two type chars
// are equal.
#define bli_sswaps( x, y ) bli_ssswaps( x, y )
#define bli_dswaps( x, y ) bli_ddswaps( x, y )
#define bli_cswaps( x, y ) bli_ccswaps( x, y )
#define bli_zswaps( x, y ) bli_zzswaps( x, y )

#endif
// end bli_swaps.h
// begin bli_xpbys.h


#ifndef BLIS_XPBYS_H
#define BLIS_XPBYS_H

// xpbys

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// xpbys macro table, rows with a real y (s/d). Every row has the same shape:
// x, b and y are each split into a real and an imaginary part with the
// bli_?real()/bli_?imag() accessor that matches the operand's type char, and
// the six parts are forwarded to bli_rxxpbyris. All rows in this span use that
// one helper regardless of the types of x and b.
// NOTE(review): bli_rxxpbyris is defined outside this section, so the
// arithmetic it performs cannot be confirmed here.

// -- (xby) = (??s) ------------------------------------------------------------

#define bli_sssxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dssxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cssxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zssxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )

#define bli_sdsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )

#define bli_scsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )

#define bli_szsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )

#define bli_sddxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dddxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cddxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zddxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_scdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_szdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbys( x, b, y ) 
bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbys( x, b, y ) { 
(y) = (x) + (b) * (y); } #define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y ) #define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y ) #define bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y ) #define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y ) #endif // end bli_xpbys.h // begin bli_xpbyjs.h #ifndef BLIS_XPBYJS_H #define BLIS_XPBYJS_H // xpbyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Every bli_<x><y>copys_mxn routine below visits each element (ii, jj) of an
// m x n matrix, 0 <= ii < m and 0 <= jj < n, and invokes the scalar macro
// bli_<x><y>copys on the pair
//   *(x + ii*rs_x + jj*cs_x)  and  *(y + ii*rs_y + jj*cs_y),
// where rs_* / cs_* are the row and column strides of the operand.
// When BLIS_ENABLE_CR_CASES is defined, two specializations are tried first:
//   - rs_x == 1 && rs_y == 1: the row-stride multiply is dropped and the
//     inner loop runs over rows (ii);
//   - cs_x == 1 && cs_y == 1: the column-stride multiply is dropped and the
//     inner loop runs over columns (jj).
// Otherwise (or when the macro is undefined) the general-stride loop runs.
// NOTE(review): bli_<x><y>copys is defined elsewhere; presumably it copies x
// into y with any needed type conversion -- confirm against its definition.

// xy = ?s

BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n,
                                  float* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n,
                                  double* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n,
                                  scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n,
                                  dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// xy = ?d

BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n,
                                  float* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n,
                                  double* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n,
                                  scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n,
                                  dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// xy = ?c

BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n,
                                  float* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n,
                                  double* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n,
                                  scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n,
                                  dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// xy = ?z

BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n,
                                  float* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n,
                                  double* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n,
                                  scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n,
                                  dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                  dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// Single-type-char wrappers: x and y share one datatype, so each simply
// forwards its arguments to the corresponding same-type routine above.

BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n,
                                 float* restrict x, const inc_t rs_x, const inc_t cs_x,
                                 float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
}

BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n,
                                 double* restrict x, const inc_t rs_x, const inc_t cs_x,
                                 double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
}

BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n,
                                 scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                 scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
}

BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n,
                                 dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                 dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
}

#endif
// end bli_copys_mxn.h
// begin bli_scal2s_mxn.h

#ifndef BLIS_SCAL2S_MXN_H
#define BLIS_SCAL2S_MXN_H

// scal2s_mxn

// GENTFUNC template for <ch>scal2s_mxn. The generated function walks the
// m x n operands column by column (outer j over n, inner i over m) and, for
// each element pair, invokes PASTEMAC(ch,scal2js) when bli_is_conj( conjx )
// holds and PASTEMAC(ch,scal2s) otherwise, passing *alpha, the x element and
// the y element. The conjx test is made once, outside both loops.
// NOTE(review): the scal2s / scal2js scalar macros are defined elsewhere;
// presumably they compute y := alpha * x and y := alpha * conj(x) -- confirm.
// INSERT_GENTFUNC_BASIC0 (defined elsewhere) instantiates the template.

#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t       conjx, \
       const dim_t        m, \
       const dim_t        n, \
       ctype*    restrict alpha, \
       ctype*    restrict x, const inc_t rs_x, const inc_t cs_x, \
       ctype*    restrict y, const inc_t rs_y, const inc_t cs_y  \
     ) \
{ \
	if ( bli_is_conj( conjx ) ) \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype* restrict xj = x + j*cs_x; \
			ctype* restrict yj = y + j*cs_y; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype* restrict xij = xj + i*rs_x; \
				ctype* restrict yij = yj + i*rs_y; \
\
				PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \
			} \
		} \
	} \
	else \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype* restrict xj = x + j*cs_x; \
			ctype* restrict yj = y + j*cs_y; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype* restrict xij = xj + i*rs_x; \
				ctype* restrict yij = yj + i*rs_y; \
\
				PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \
			} \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( scal2s_mxn )

#endif
// end bli_scal2s_mxn.h
// begin bli_xpbys_mxn.h

#ifndef BLIS_XPBYS_MXN_H
#define BLIS_XPBYS_MXN_H

// xpbys_mxn

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
//
// Every bli_<x><b><y>xpbys_mxn routine below first tests *beta with
// bli_<b>eq0; if that test holds it delegates to bli_<x><y>copys_mxn and
// returns, so the prior contents of y are never read. Otherwise it invokes
// the scalar macro bli_<x><b><y>xpbys( x_ij, *beta, y_ij ) on every element
// of the m x n operands, using the same three loop variants as copys_mxn
// (unit row strides, unit column strides, general strides; the first two
// only when BLIS_ENABLE_CR_CASES is defined).
// NOTE(review): the scalar xpbys macro is defined elsewhere; presumably it
// computes y := x + beta * y -- confirm against its definition.

// -- (xby) = (?ss) ------------------------------------------------------------

BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n,
                                   float* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   float* restrict beta,
                                   float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_seq0( *beta ) )
	{
		bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sssxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_sssxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n,
                                   double* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   float* restrict beta,
                                   float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_seq0( *beta ) )
	{
		bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dssxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_dssxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n,
                                   scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   float* restrict beta,
                                   float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_seq0( *beta ) )
	{
		bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cssxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_cssxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n,
                                   dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   float* restrict beta,
                                   float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_seq0( *beta ) )
	{
		bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zssxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zssxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

// -- (xby) = (?dd) ------------------------------------------------------------

BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n,
                                   float* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   double* restrict beta,
                                   double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_deq0( *beta ) )
	{
		bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sddxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_sddxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n,
                                   double* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   double* restrict beta,
                                   double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_deq0( *beta ) )
	{
		bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dddxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_dddxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n,
                                   scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   double* restrict beta,
                                   double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_deq0( *beta ) )
	{
		bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cddxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_cddxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n,
                                   dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   double* restrict beta,
                                   double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_deq0( *beta ) )
	{
		bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zddxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zddxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

// -- (xby) = (?cc) ------------------------------------------------------------

BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n,
                                   float* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   scomplex* restrict beta,
                                   scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_ceq0( *beta ) )
	{
		bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sccxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_sccxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n,
                                   double* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   scomplex* restrict beta,
                                   scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_ceq0( *beta ) )
	{
		bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dccxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_dccxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n,
                                   scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   scomplex* restrict beta,
                                   scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_ceq0( *beta ) )
	{
		bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cccxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_cccxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n,
                                   dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   scomplex* restrict beta,
                                   scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_ceq0( *beta ) )
	{
		bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zccxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zccxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

// -- (xby) = (?zz) ------------------------------------------------------------

BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n,
                                   float* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   dcomplex* restrict beta,
                                   dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_zeq0( *beta ) )
	{
		bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_szzxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_szzxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n,
                                   double* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   dcomplex* restrict beta,
                                   dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_zeq0( *beta ) )
	{
		bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dzzxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_dzzxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n,
                                   scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   dcomplex* restrict beta,
                                   dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_zeq0( *beta ) )
	{
		bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_czzxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_czzxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n,
                                   dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                   dcomplex* restrict beta,
                                   dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_zeq0( *beta ) )
	{
		bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zzzxpbys( *(x + ii + jj*cs_x), *beta,
		              *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zzzxpbys( *(x + ii*rs_x + jj), *beta,
		              *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta,
		              *(y + ii*rs_y + jj*cs_y) );
	}
}

// Single-type-char wrappers: x, beta and y share one datatype, so each
// simply forwards its arguments to the corresponding same-type routine.

BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n,
                                 float* restrict x, const inc_t rs_x, const inc_t cs_x,
                                 float* restrict beta,
                                 float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
}

BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n,
                                 double* restrict x, const inc_t rs_x, const inc_t cs_x,
                                 double* restrict beta,
                                 double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
}

BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n,
                                 scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                 scomplex* restrict beta,
                                 scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
}

BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n,
                                 dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
                                 dcomplex* restrict beta,
                                 dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
}

#endif
// end bli_xpbys_mxn.h
// begin bli_xpbys_mxn_uplo.h

#ifndef BLIS_XPBYS_MXN_UPLO_H
#define BLIS_XPBYS_MXN_UPLO_H

// xpbys_mxn_u

// Statement-like macros that apply the scalar copys / xpbys operation only
// to elements (_i, _j) of the m x n operands that satisfy
//   (doff_t)_j - (doff_t)_i >= diagoff.
// If *beta tests equal to zero (bli_?eq0), the selected elements are handled
// with bli_??copys; otherwise with bli_???xpbys( x_ij, *(beta), y_ij ).
// Caveats visible in the expansion:
// - The macro declares the locals _i and _j inside its own braces, so the
//   arguments must not themselves be named _i or _j.
// - Arguments are substituted textually, mostly without parentheses, and
//   several are expanded more than once per iteration; pass simple,
//   side-effect-free expressions.

#define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
\
	if ( bli_seq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_sscopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

#define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
\
	if ( bli_deq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

#define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
\
	if ( bli_ceq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_cccopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

#define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
\
	if ( bli_zeq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

// xpbys_mxn_l

// Same structure as the _u macros, but the element filter is
//   (doff_t)_j - (doff_t)_i <= diagoff.
// The same caveats about the _i / _j locals and textual argument
// substitution apply.

#define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
\
	if ( bli_seq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_sscopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

#define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
\
	if ( bli_deq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

// xpbys_mxn_l, scomplex operands: walk the m x n index space column by column
// and act only on entries whose indices satisfy j - i <= diagoff (the diagonal
// at offset diagoff and everything below it). Element (i,j) of x is addressed
// as *(x + i*rs_x + j*cs_x); y is addressed the same way with rs_y/cs_y.
// beta is a pointer. The macro expands to a braced block that declares its own
// counters _i and _j. Arguments are pasted in without added parentheses
// (e.g. _i*rs_x, *beta), so pass plain identifiers or constants.
// NOTE(review): bli_ceq0, bli_cccopys and bli_cccxpbys are defined elsewhere;
// by name they presumably test for zero, copy x into y, and compute
// y := x + beta * y respectively -- confirm at their definitions.
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
    dim_t _i, _j; \
    \
    /* When *beta tests as zero, only the copy helper is invoked for each selected entry. */ \
    if ( bli_ceq0( *beta ) ) \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_cccopys( *(x + _i*rs_x + _j*cs_x), \
                         *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
    else \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \
                          *(beta), \
                          *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
}
// xpbys_mxn_l, dcomplex operands: identical structure to the scomplex macro
// above, using the bli_zeq0 / bli_zzcopys / bli_zzzxpbys helpers instead.
#define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
    dim_t _i, _j; \
    \
    /* When *beta tests as zero, only the copy helper is invoked for each selected entry. */ \
    if ( bli_zeq0( *beta ) ) \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \
                         *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
    else \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \
                          *(beta), \
                          *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
}
// Same-type shorthands: each bli_?xpbys_mxn_{u,l} macro forwards all of its
// arguments, unchanged and in the same order, to the three-character variant
// whose type characters are all equal (s -> sss, d -> ddd, c -> ccc, z -> zzz).
#define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#endif
// end bli_xpbys_mxn_uplo.h

// -- "broadcast B" scalar macros --

// begin bli_bcastbbs_mxn.h
#ifndef BLIS_BCASTBBS_MXN_H
#define BLIS_BCASTBBS_MXN_H

// bcastbbs_mxn
//
// Function template: for every (i,j) with 0 <= i < m and 0 <= j < n, take the
// element at y + i*incy + j*ldy and pass it, together with each of the d-1
// elements that immediately follow it in memory (ds_y == 1), to
// PASTEMAC(ch,copys). The value at yij itself is only read, never rewritten
// (the p loop starts at 1).
// NOTE(review): PASTEMAC(ch,copys)( a, b ) presumably stores a into b, which
// would replicate yij into its d-1 trailing slots -- confirm at its definition.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* d = number of consecutive slots per element; read from ldy in this routine. */ \
    const dim_t d = ldy; \
    const dim_t ds_y = 1; \
\
    for ( dim_t i = 0; i < m; ++i ) \
    { \
        ctype* restrict yi = y + i*incy; \
\
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict yij = yi + j*ldy; \
\
            for ( dim_t p = 1; p < d; ++p ) \
            { \
                ctype* restrict yijd = yij + p*ds_y; \
\
                PASTEMAC(ch,copys)( *yij, *yijd ); \
            } \
        } \
    } \
}
// Instantiate the template above for the types covered by INSERT_GENTFUNC_BASIC0
// (defined elsewhere in this header).
INSERT_GENTFUNC_BASIC0( bcastbbs_mxn )
#endif
// end bli_bcastbbs_mxn.h

// begin bli_scal2bbs_mxn.h
#ifndef BLIS_SCAL2BBS_MXN_H
#define BLIS_SCAL2BBS_MXN_H

// scal2bbs_mxn
//
// Function template (GENTFUNCRO instantiation): for every (i,j) with
// 0 <= i < m and 0 <= j < n, call scal2js (when bli_is_conj( conjx ) holds) or
// scal2s (otherwise) with *alpha, the x element at x + i*incx + j*ldx and the
// y element at y + i*incy + j*ldy, then pass that y element and each of its
// d-1 immediate successors (d == incy, ds_y == 1) to copys.
// NOTE(review): scal2s/scal2js presumably compute y := alpha * x and
// y := alpha * conj(x), and copys presumably stores its first argument into
// its second -- confirm at their definitions.
#undef GENTFUNCRO
#define GENTFUNCRO( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t conjx, \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, const inc_t incx, const inc_t ldx, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* d = number of consecutive slots per element; read from incy in this routine. */ \
    const dim_t d = incy; \
    const dim_t ds_y = 1; \
\
    if ( bli_is_conj( conjx ) ) \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict xj = x + j*ldx; \
            ctype* restrict yj = y + j*ldy; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype* restrict xij = xj + i*incx; \
                ctype* restrict yij = yj + i*incy; \
\
                PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \
\
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype* restrict yijd = yij + p*ds_y; \
\
                    PASTEMAC(ch,copys)( *yij, *yijd ); \
                } \
            } \
        } \
    } \
    else \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict xj = x + j*ldx; \
            ctype* restrict yj = y + j*ldy; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype* restrict xij = xj + i*incx; \
                ctype* restrict yij = yj + i*incy; \
\
                PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \
\
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype* restrict yijd = yij + p*ds_y; \
\
                    PASTEMAC(ch,copys)( *yij, *yijd ); \
                } \
            } \
        } \
    } \
}
INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn )

// Function template (GENTFUNCCO instantiation): same traversal as the template
// above, but alpha, x and y are reinterpreted as arrays of ctype_r. All x and
// y strides are doubled to count ctype_r units. For x and alpha the imaginary
// part is the ctype_r immediately after the real part (+ 1); for y the
// imaginary part is placed d ctype_r slots after the real part (+ 1*d). After
// the scal2ris / scal2jris call, the real and imaginary results are each
// passed with their d-1 immediate successors to copyris.
// NOTE(review): scal2ris/scal2jris/copyris semantics are inferred from their
// names (scale, scale-with-conjugate, copy on split real/imag operands) --
// confirm at their definitions.
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t conjx, \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, const inc_t incx, const inc_t ldx, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* d = number of consecutive slots per component; read from incy in this routine. */ \
    const dim_t d = incy; \
    const dim_t ds_y = 1; \
\
    const inc_t incx2 = 2 * incx; \
    const inc_t ldx2 = 2 * ldx; \
\
    const inc_t incy2 = 2 * incy; \
    const inc_t ldy2 = 2 * ldy; \
\
    ctype_r* restrict alpha_r = ( ctype_r* )alpha; \
    ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \
    ctype_r* restrict chi_r = ( ctype_r* )x; \
    ctype_r* restrict chi_i = ( ctype_r* )x + 1; \
    ctype_r* restrict psi_r = ( ctype_r* )y; \
    ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \
\
    if ( bli_is_conj( conjx ) ) \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype_r* restrict chij_r = chi_r + j*ldx2; \
            ctype_r* restrict chij_i = chi_i + j*ldx2; \
            ctype_r* restrict psij_r = psi_r + j*ldy2; \
            ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype_r* restrict chiij_r = chij_r + i*incx2; \
                ctype_r* restrict chiij_i = chij_i + i*incx2; \
                ctype_r* restrict psiij_r = psij_r + i*incy2; \
                ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
                PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \
                                        *chiij_r, *chiij_i, \
                                        *psiij_r, *psiij_i ); \
\
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
                    ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
                    PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \
                                          *psiijd_r, *psiijd_i ); \
                } \
            } \
        } \
    } \
    else \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype_r* restrict chij_r = chi_r + j*ldx2; \
            ctype_r* restrict chij_i = chi_i + j*ldx2; \
            ctype_r* restrict psij_r = psi_r + j*ldy2; \
            ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype_r* restrict chiij_r = chij_r + i*incx2; \
                ctype_r* restrict chiij_i = chij_i + i*incx2; \
                ctype_r* restrict psiij_r = psij_r + i*incy2; \
                ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
                PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \
                                       *chiij_r, *chiij_i, \
                                       *psiij_r, *psiij_i ); \
\
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
                    ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
                    PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \
                                          *psiijd_r, *psiijd_i ); \
                } \
            } \
        } \
    } \
}
INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn )
#endif
// end bli_scal2bbs_mxn.h

// begin bli_set0bbs_mxn.h
#ifndef BLIS_SET0BBS_MXN_H
#define BLIS_SET0BBS_MXN_H

// set0bbs_mxn
//
// Function template: for every (i,j) with 0 <= i < m and 0 <= j < n, apply
// PASTEMAC(ch,set0s) to the element at y + i*incy + j*ldy and to each of its
// d-1 immediate successors (d == incy, ds_y == 1). Unlike the bcastbbs and
// scal2bbs templates, the p loop starts at 0, so the leading element is
// included.
// NOTE(review): set0s presumably stores zero into its argument -- confirm.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* d = number of consecutive slots per element; read from incy in this routine. */ \
    const dim_t d = incy; \
    const dim_t ds_y = 1; \
\
    for ( dim_t j = 0; j < n; ++j ) \
    { \
        ctype* restrict yj = y + j*ldy; \
\
        for ( dim_t i = 0; i < m; ++i ) \
        { \
            ctype* restrict yij = yj + i*incy; \
\
            for ( dim_t p = 0; p < d; ++p ) \
            { \
                ctype* restrict yijd = yij + p*ds_y; \
\
                PASTEMAC(ch,set0s)( *yijd ); \
            } \
        } \
    } \
}
INSERT_GENTFUNC_BASIC0( set0bbs_mxn )
#endif
// end bli_set0bbs_mxn.h

// -- 1m-specific scalar macros --

// 1e

// begin bli_copy1es.h
#ifndef BLIS_COPY1ES_H
#define BLIS_COPY1ES_H

// copy1es

// Notes:
// - The first char encodes the type of the source operand (a).
// - The second char encodes the type of the destination operands (bri, bir).
// bli_??copy1es( a, bri, bir ): every combination with a real (s or d)
// destination, and every combination with a real source, expands to an empty
// block. The four complex-to-complex combinations make two bli_??copyris
// calls: the first passes (real(a), imag(a)) with the components of bri as
// destination, the second passes (-imag(a), real(a)) -- the components of
// i*a -- with the components of bir as destination.
// NOTE(review): bli_??copyris is defined elsewhere; it presumably assigns its
// first two arguments to its last two -- confirm at its definition.
#define bli_sscopy1es( a, bri, bir ) {}
#define bli_dscopy1es( a, bri, bir ) {}
#define bli_cscopy1es( a, bri, bir ) {}
#define bli_zscopy1es( a, bri, bir ) {}
#define bli_sdcopy1es( a, bri, bir ) {}
#define bli_ddcopy1es( a, bri, bir ) {}
#define bli_cdcopy1es( a, bri, bir ) {}
#define bli_zdcopy1es( a, bri, bir ) {}
#define bli_sccopy1es( a, bri, bir ) {}
#define bli_dccopy1es( a, bri, bir ) {}
#define bli_cccopy1es( a, bri, bir ) \
{ \
    bli_cccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopy1es( a, bri, bir ) \
{ \
    bli_zccopyris( bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_szcopy1es( a, bri, bir ) {}
#define bli_dzcopy1es( a, bri, bir ) {}
#define bli_czcopy1es( a, bri, bir ) \
{ \
    bli_czcopyris( bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopy1es( a, bri, bir ) \
{ \
    bli_zzcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
// Same-type shorthands: forward to the cc / zz variants.
#define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir )
#define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir )
#endif
// end bli_copy1es.h

// begin bli_copyj1es.h
#ifndef BLIS_COPYJ1ES_H
#define BLIS_COPYJ1ES_H

// copyj1es

// Notes:
// - The first char encodes the type of the source operand (a).
// - The second char encodes the type of the destination operands (bri, bir).
// bli_??copyj1es( a, bri, bir ): conjugating counterpart of the copy1es
// family. Every combination with a real (s or d) destination, and every
// combination with a real source, expands to an empty block. The four
// complex-to-complex combinations make two bli_??copyris calls: the first
// passes (real(a), -imag(a)) -- the components of conj(a) -- with the
// components of bri as destination, the second passes (imag(a), real(a)) --
// the components of i*conj(a) -- with the components of bir as destination.
// NOTE(review): bli_??copyris is defined elsewhere; it presumably assigns its
// first two arguments to its last two -- confirm at its definition.
#define bli_sscopyj1es( a, bri, bir ) {}
#define bli_dscopyj1es( a, bri, bir ) {}
#define bli_cscopyj1es( a, bri, bir ) {}
#define bli_zscopyj1es( a, bri, bir ) {}
#define bli_sdcopyj1es( a, bri, bir ) {}
#define bli_ddcopyj1es( a, bri, bir ) {}
#define bli_cdcopyj1es( a, bri, bir ) {}
#define bli_zdcopyj1es( a, bri, bir ) {}
#define bli_sccopyj1es( a, bri, bir ) {}
#define bli_dccopyj1es( a, bri, bir ) {}
#define bli_cccopyj1es( a, bri, bir ) \
{ \
    bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_cccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopyj1es( a, bri, bir ) \
{ \
    bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_zccopyris( bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_szcopyj1es( a, bri, bir ) {}
#define bli_dzcopyj1es( a, bri, bir ) {}
#define bli_czcopyj1es( a, bri, bir ) \
{ \
    bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_czcopyris( bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopyj1es( a, bri, bir ) \
{ \
    bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_zzcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
// Same-type shorthands: forward to the cc / zz variants.
#define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir )
#define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir )
#endif
// end bli_copyj1es.h

// begin bli_invert1es.h
#ifndef BLIS_INVERT1ES_H
#define BLIS_INVERT1ES_H

// invert1es
//
// bli_?invert1es( bri, bir ): first applies bli_?invertris to the real and
// imaginary components of bri (updating bri in place), then makes one
// bli_?copyris call whose sources are (real(bri), -imag(bri)) and whose
// destinations are listed in swapped order, (imag(bir), real(bir)).
// NOTE(review): assuming copyris assigns positionally, this leaves
// real(bir) = -imag(bri) and imag(bir) = real(bri), i.e. bir = i * bri;
// bli_?invertris presumably computes the reciprocal -- confirm both at their
// definitions.
#define bli_cinvert1es( bri, bir ) \
{ \
    bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \
    bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \
}
#define bli_zinvert1es( bri, bir ) \
{ \
    bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \
    bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \
}
#endif
// end bli_invert1es.h

// begin bli_scal1es.h
#ifndef BLIS_SCAL1ES_H
#define BLIS_SCAL1ES_H

// scal1es
//
// bli_?scal1es( a, yri, yir ): first applies bli_?scalris with the components
// of a to the components of yri (updating yri in place), then passes
// (-imag(yri), real(yri)) -- the components of i*yri, read after the update --
// to bli_?copyris with the components of yir as destination.
// NOTE(review): bli_?scalris presumably computes yri := a * yri and
// bli_?copyris presumably assigns its first two arguments to its last two --
// confirm at their definitions.
#define bli_cscal1es( a, yri, yir ) \
{ \
    bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \
    bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \
}
#define bli_zscal1es( a, yri, yir ) \
{ \
    bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \
    bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \
}
#endif
// end bli_scal1es.h

// begin bli_scal21es.h
#ifndef BLIS_SCAL21ES_H
#define BLIS_SCAL21ES_H

// scal21es
//
// bli_???scal21es( a, x, yri, yir ): every non-empty variant below makes two
// bli_cxscal2ris calls with the components of a. The first passes
// (real(x), imag(x)) and targets the components of yri; the second passes
// (-imag(x), real(x)) -- the components of i*x -- and targets the components
// of yir. Variants with a real destination, and variants where both a and x
// are real, expand to an empty block.
// NOTE(review): bli_cxscal2ris is defined elsewhere; it presumably computes
// y := a * x on split real/imag operands -- confirm at its definition.

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y (the yri and yir operands).

// -- (axy) = (??s) ------------------------------------------------------------
#define bli_sssscal21es( a, x, yri, yir ) {}
#define bli_sdsscal21es( a, x, yri, yir ) {}
#define bli_scsscal21es( a, x, yri, yir ) {}
#define bli_szsscal21es( a, x, yri, yir ) {}
#define bli_dssscal21es( a, x, yri, yir ) {}
#define bli_ddsscal21es( a, x, yri, yir ) {}
#define bli_dcsscal21es( a, x, yri, yir ) {}
#define bli_dzsscal21es( a, x, yri, yir ) {}
#define bli_cssscal21es( a, x, yri, yir ) {}
#define bli_cdsscal21es( a, x, yri, yir ) {}
#define bli_ccsscal21es( a, x, yri, yir ) {}
#define bli_czsscal21es( a, x, yri, yir ) {}
#define bli_zssscal21es( a, x, yri, yir ) {}
#define bli_zdsscal21es( a, x, yri, yir ) {}
#define bli_zcsscal21es( a, x, yri, yir ) {}
#define bli_zzsscal21es( a, x, yri, yir ) {}

// -- (axy) = (??d) ------------------------------------------------------------
#define bli_ssdscal21es( a, x, yri, yir ) {}
#define bli_sddscal21es( a, x, yri, yir ) {}
#define bli_scdscal21es( a, x, yri, yir ) {}
#define bli_szdscal21es( a, x, yri, yir ) {}
#define bli_dsdscal21es( a, x, yri, yir ) {}
#define bli_dddscal21es( a, x, yri, yir ) {}
#define bli_dcdscal21es( a, x, yri, yir ) {}
#define bli_dzdscal21es( a, x, yri, yir ) {}
#define bli_csdscal21es( a, x, yri, yir ) {}
#define bli_cddscal21es( a, x, yri, yir ) {}
#define bli_ccdscal21es( a, x, yri, yir ) {}
#define bli_czdscal21es( a, x, yri, yir ) {}
#define bli_zsdscal21es( a, x, yri, yir ) {}
#define bli_zddscal21es( a, x, yri, yir ) {}
#define bli_zcdscal21es( a, x, yri, yir ) {}
#define bli_zzdscal21es( a, x, yri, yir ) {}

// -- (axy) = (??c) ------------------------------------------------------------
// NOTE(review): the variants in this section access the scomplex destination
// through bli_zreal/bli_zimag rather than bli_creal/bli_cimag. That is only
// equivalent if both accessor families expand to the same member access --
// confirm at the accessor definitions.
#define bli_sscscal21es( a, x, yri, yir ) {}
#define bli_sdcscal21es( a, x, yri, yir ) {}
#define bli_sccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dscscal21es( a, x, yri, yir ) {}
#define bli_ddcscal21es( a, x, yri, yir ) {}
#define bli_dccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cscscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zscscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszscal21es( a, x, yri, yir ) {}
#define bli_sdzscal21es( a, x, yri, yir ) {}
#define bli_sczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dszscal21es( a, x, yri, yir ) {}
#define bli_ddzscal21es( a, x, yri, yir ) {}
#define bli_dczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cszscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zszscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
// Same-type shorthands: forward to the ccc / zzz variants.
#define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir )
#define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir )
#endif
// end bli_scal21es.h

// begin bli_scal2j1es.h
#ifndef BLIS_SCAL2J1ES_H
#define BLIS_SCAL2J1ES_H

// scal2j1es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y (the yri and yir operands).
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) #define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) #endif // end bli_scal2j1es.h // 1r // begin bli_copy1rs.h #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copy1rs.h // begin bli_copyj1rs.h #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copyj1rs.h // begin bli_invert1rs.h #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif // end bli_invert1rs.h // begin bli_scal1rs.h #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), 
bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif // end bli_scal1rs.h // begin bli_scal21rs.h #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif // end bli_scal21rs.h // begin bli_scal2j1rs.h #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif // end bli_scal2j1rs.h // 1m (1e or 1r) // begin bli_invert1ms_mxn_diag.h #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t 
i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_invert1ms_mxn_diag.h // begin bli_scal1ms_mxn.h #ifndef BLIS_SCAL1MS_MXN_H #define BLIS_SCAL1MS_MXN_H // scal1ms_mxn #define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; 
\ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #endif // end bli_scal1ms_mxn.h // begin bli_scal21ms_mxn.h #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } 
else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), 
*(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif // end bli_scal21ms_mxn.h // begin bli_scal21ms_mxn_diag.h #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_scal21ms_mxn_diag.h // begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING //thread communicators may be implementation dependent #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_single.h // begin bli_thrcomm_openmp.h #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H // Define thrcomm_t for situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #include // skipped // Define thrcomm_t for tree barriers and non-tree barriers. #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. 
//volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif // end bli_thrcomm_openmp.h // begin bli_thrcomm_pthreads.h #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H // Define thrcomm_t for situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS #ifdef BLIS_USE_PTHREAD_BARRIER struct thrcomm_s { void* sent_object; dim_t n_threads; bli_pthread_barrier_t barrier; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_pthreads.h // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. 
// NOTE(review): only the declarations are visible here. Judging by the
// signatures, create/free take a rntm_t and return/accept a thrcomm_t
// pointer, while init/cleanup operate on a communicator supplied by the
// caller -- confirm ownership rules against the definitions.
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads );
void       bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
void       bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm );
void       bli_thrcomm_cleanup( thrcomm_t* comm );

// Exported entry points; these are the two functions invoked by the inline
// helpers bli_thread_barrier() and bli_thread_broadcast().
BLIS_EXPORT_BLIS void  bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );

void       bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );

#endif
// end bli_thrcomm.h

// Include thread info (thrinfo_t) object definitions and prototypes.
// begin bli_thrinfo.h

#ifndef BLIS_THRINFO_H
#define BLIS_THRINFO_H

// Thread info structure definition
struct thrinfo_s
{
	// The thread communicator for the other threads sharing the same work
	// at this level.
	thrcomm_t*         ocomm;

	// Our thread id within the ocomm thread communicator.
	dim_t              ocomm_id;

	// The number of distinct threads used to parallelize the loop.
	dim_t              n_way;

	// What we're working on.
	dim_t              work_id;

	// When freeing, should the communicators in this node be freed? Usually,
	// this field is true, but when nodes are created that share the same
	// communicators as other nodes (such as with packm nodes), this is set
	// to false.
	bool               free_comm;

	// The bszid_t to help identify the node. This is mostly only useful when
	// debugging or tracing the allocation and release of thrinfo_t nodes.
	bszid_t            bszid;

	// Links to other thrinfo_t nodes; read and written through the
	// bli_thrinfo_sub_prenode()/bli_thrinfo_sub_node() accessors and the
	// matching bli_thrinfo_set_sub_*() setters.
	struct thrinfo_s*  sub_prenode;
	struct thrinfo_s*  sub_node;
};
typedef struct thrinfo_s thrinfo_t;

//
// thrinfo_t functions
// NOTE: The naming of these should be made consistent at some point.
// (ie: bli_thrinfo_ vs.
bli_thread_) // // thrinfo_t query (field only) BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) { return (t->ocomm)->n_threads; } BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) { return t->ocomm_id; } BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) { return t->n_way; } BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t ) { return t->work_id; } BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) { return t->ocomm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) { return t->free_comm; } BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) { return t->bszid; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) { return t->sub_node; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) { return t->ocomm_id == 0; } // thrinfo_t modification BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) { t->ocomm = ocomm; } BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) { t->ocomm_id = ocomm_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) { t->n_way = n_way; } BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t ) { t->work_id = work_id; } BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) { t->free_comm = free_comm; } BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) { t->bszid = bszid; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) { t->sub_node = sub_node; } BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) { t->sub_prenode = sub_prenode; } // other thrinfo_t-related functions BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } // // Prototypes 
// for level-3 thrinfo functions not specific to any operation.
//

// NOTE(review): only declarations are visible here. The parameters of
// bli_thrinfo_create()/bli_thrinfo_init() mirror the fields of struct
// thrinfo_s (ocomm, ocomm_id, n_way, work_id, free_comm, bszid, sub_node);
// presumably each is stored into the like-named field -- confirm against
// the definitions.

thrinfo_t* bli_thrinfo_create
     (
       rntm_t*    rntm,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init
     (
       thrinfo_t* thread,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init_single
     (
       thrinfo_t* thread
     );

void bli_thrinfo_free
     (
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// -----------------------------------------------------------------------------

// Functions that extend a thrinfo_t structure from a control tree (cntl_t).
// The "_prenode" variants share the signatures of their counterparts.
// NOTE(review): the exact growth policy is not visible from the prototypes.

void bli_thrinfo_grow
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_rgrow
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_rgrow_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

// -----------------------------------------------------------------------------

// Disabled declarations; excluded from compilation by the #if 0.
#if 0
void bli_thrinfo_grow_tree
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

void bli_thrinfo_grow_tree_ic
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );
#endif

#endif
// end bli_thrinfo.h
// begin bli_thrinfo_sup.h

#ifndef BLIS_THRINFO_SUP_H
#define BLIS_THRINFO_SUP_H

//
// Prototypes for level-3 thrinfo sup functions.
//

// These parallel bli_thrinfo_grow()/rgrow()/create_for_cntl(), but take
// bszid_t pointers where those functions take cntl_t pointers.

void bli_thrinfo_sup_grow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_sup_rgrow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_sup_create_for_cntl
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_chl,
       thrinfo_t* thread_par
     );

#endif
// end bli_thrinfo_sup.h

// Include some operation-specific thrinfo_t prototypes.
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
// Takes an l3int_t function pointer plus the operands, context, runtime and
// control tree that an l3int_t accepts; note there is no thrinfo_t* parameter
// here. The definition is not part of this header.
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// Include definitions specific to the method of multithreading for the
// conventional code path.
// begin bli_l3_decor_single.h

#ifndef BLIS_L3_DECOR_SINGLE_H
#define BLIS_L3_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (The conditional below is intentionally empty: nothing is declared.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif
// end bli_l3_decor_single.h
// begin bli_l3_decor_openmp.h

#ifndef BLIS_L3_DECOR_OPENMP_H
#define BLIS_L3_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

void bli_l3_thread_decorator_thread_check
     (
       dim_t      n_threads,
       dim_t      tid,
       thrcomm_t* gl_comm,
       rntm_t*    rntm
     );

#endif

#endif
// end bli_l3_decor_openmp.h
// begin bli_l3_decor_pthreads.h

#ifndef BLIS_L3_DECOR_PTHREADS_H
#define BLIS_L3_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
// The void* (void*) shape matches the start_routine parameter of
// bli_pthread_create().
void* bli_l3_thread_entry( void* data_void );

#endif

#endif
// end bli_l3_decor_pthreads.h

#endif
// end bli_l3_decor.h

// Include the level-3 thread decorator and related definitions and prototypes
// for the sup code path.
// begin bli_l3_sup_decor.h

#ifndef BLIS_L3_SUP_DECOR_H
#define BLIS_L3_SUP_DECOR_H

// -- sup definitions ----------------------------------------------------------

// Level-3 sup internal function type.
// Differs from l3int_t in that it returns err_t and has no cntl_t* parameter.
typedef err_t (*l3supint_t)
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// Level-3 sup thread decorator prototype.
err_t bli_l3_sup_thread_decorator
     (
       l3supint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     );

// Include definitions specific to the method of multithreading for the
// sup code path.
// begin bli_l3_sup_decor_single.h #ifndef BLIS_L3_SUP_DECOR_SINGLE_H #define BLIS_L3_SUP_DECOR_SINGLE_H // Definitions specific to situations when multithreading is disabled. #ifndef BLIS_ENABLE_MULTITHREADING #endif #endif // end bli_l3_sup_decor_single.h // begin bli_l3_sup_decor_openmp.h #ifndef BLIS_L3_SUP_DECOR_OPENMP_H #define BLIS_L3_SUP_DECOR_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #endif #endif // end bli_l3_sup_decor_openmp.h // begin bli_l3_sup_decor_pthreads.h #ifndef BLIS_L3_SUP_DECOR_PTHREADS_H #define BLIS_L3_SUP_DECOR_PTHREADS_H // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. void* bli_l3_sup_thread_entry( void* data_void ); #endif #endif // end bli_l3_sup_decor_pthreads.h #endif // end bli_l3_sup_decor.h // Initialization-related prototypes. void bli_thread_init( void ); void bli_thread_finalize( void ); // Thread range-related prototypes. 
// Writes its result through the start and end pointers. It is called by
// bli_thread_range_jrir_sl() later in this header.
BLIS_EXPORT_BLIS
void bli_thread_range_sub
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end
     );

// GENPROT is redefined (after an #undef) for each family of prototypes that
// shares a signature, then instantiated once per function name. The function
// name is produced by PASTEMAC0( opname ).

// Signature 1: direction, thread node, three objects, control tree and
// context in; start/end out; returns siz_t.
#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       dir_t      direct, \
       thrinfo_t* thr, \
       obj_t*     a, \
       obj_t*     b, \
       obj_t*     c, \
       cntl_t*    cntl, \
       cntx_t*    cntx, \
       dim_t*     start, \
       dim_t*     end  \
     );

GENPROT( thread_range_mdim )
GENPROT( thread_range_ndim )

// Signature 2: thread node, one object and a blocksize multiple in; start/end
// out; returns siz_t.
#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       thrinfo_t* thr, \
       obj_t*     a, \
       blksz_t*   bmult, \
       dim_t*     start, \
       dim_t*     end  \
     );

GENPROT( thread_range_l2r )
GENPROT( thread_range_r2l )
GENPROT( thread_range_t2b )
GENPROT( thread_range_b2t )

GENPROT( thread_range_weighted_l2r )
GENPROT( thread_range_weighted_r2l )
GENPROT( thread_range_weighted_t2b )
GENPROT( thread_range_weighted_b2t )

// Helpers with explicit (non-macro) prototypes. Definitions are not part of
// this header.

dim_t bli_thread_range_width_l
     (
       doff_t diagoff_j,
       dim_t  m,
       dim_t  n_j,
       dim_t  j,
       dim_t  n_way,
       dim_t  bf,
       dim_t  bf_left,
       double area_per_thr,
       bool   handle_edge_low
     );

siz_t bli_find_area_trap_l
     (
       dim_t  m,
       dim_t  n,
       doff_t diagoff
     );

// The three pointer parameters are restrict-qualified, so callers must not
// pass overlapping storage for them.
siz_t bli_thread_range_weighted_sub
     (
       thrinfo_t* restrict thread,
       doff_t              diagoff,
       uplo_t              uplo,
       dim_t               m,
       dim_t               n,
       dim_t               bf,
       bool                handle_edge_low,
       dim_t*     restrict j_start_thr,
       dim_t*     restrict j_end_thr
     );

// -----------------------------------------------------------------------------

// Factorization and partitioning prototypes

// State passed to bli_prime_factorization() and bli_next_prime_factor().
// NOTE(review): field meanings are inferred from their names only (n: value
// being factored, sqrt_n: its square root, f: current candidate factor);
// confirm against the implementation.
typedef struct
{
    dim_t n;
    dim_t sqrt_n;
    dim_t f;
} bli_prime_factors_t;

void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors);

dim_t bli_next_prime_factor(bli_prime_factors_t* factors);
bool  bli_is_prime( dim_t n );

// Three functions with one signature: a thread count and two work amounts in,
// two results written through the restrict-qualified nt1/nt2 pointers.
void bli_thread_partition_2x2
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

void bli_thread_partition_2x2_slow
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

void bli_thread_partition_2x2_fast
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

//
----------------------------------------------------------------------------- dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); // ----------------------------------------------------------------------------- BLIS_INLINE void bli_thread_range_jrir_rr ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; } BLIS_INLINE void bli_thread_range_jrir_sl ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use contiguous slab partitioning of jr/ir loops. bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); *inc = 1; } BLIS_INLINE void bli_thread_range_jrir ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Define a general-purpose version of bli_thread_range_jrir() whose // definition depends on whether slab or round-robin partitioning was // requested at configure-time. 
#ifdef BLIS_ENABLE_JRIR_SLAB bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); #else bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); #endif } #if 0 BLIS_INLINE void bli_thread_range_weighted_jrir ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { #ifdef BLIS_ENABLE_JRIR_SLAB // Use contiguous slab partitioning for jr/ir loops. bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, handle_edge_low, start, end ); *start = *start / bf; *inc = 1; if ( *end % bf ) *end = *end / bf + 1; else *end = *end / bf; #else // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; #endif } #endif #endif // end bli_thread.h // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t     bli_pthread_barrier_t;
typedef pthread_barrierattr_t bli_pthread_barrierattr_t;

#endif

// -- pthreads macros --

// In this branch the BLIS initializers are aliases for the PTHREAD_* ones.
#define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#define BLIS_PTHREAD_COND_INITIALIZER  PTHREAD_COND_INITIALIZER
#define BLIS_PTHREAD_ONCE_INIT         PTHREAD_ONCE_INIT

#endif

// -- Function definitions -----------------------------------------------------

// The prototypes below are shared by every branch of the type definitions
// above; only the bli_pthread_* types differ per platform. All are marked
// BLIS_EXPORT_BLIS, and all except bli_pthread_once() return int.
// NOTE(review): presumably the int is a pthreads-style error code (0 on
// success); confirm against the implementation.

// -- pthread_create(), pthread_join() --

BLIS_EXPORT_BLIS int bli_pthread_create
     (
       bli_pthread_t*            thread,
       const bli_pthread_attr_t* attr,
       void*                   (*start_routine)(void*),
       void*                     arg
     );

BLIS_EXPORT_BLIS int bli_pthread_join
     (
       bli_pthread_t thread,
       void**        retval
     );

// -- pthread_mutex_*() --

BLIS_EXPORT_BLIS int bli_pthread_mutex_init
     (
       bli_pthread_mutex_t*           mutex,
       const bli_pthread_mutexattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_lock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock
     (
       bli_pthread_mutex_t* mutex
     );

// -- pthread_cond_*() --

BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t*           cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t*  cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void              (*init)(void)
     );

// Disabled prototypes: the block opened here is never seen by the compiler.
#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//     __imp_CompareObjectHandles referenced in function
//     bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

// The barrier count is unsigned int, whereas the count/tripCount fields of
// the hand-rolled barrier structs earlier in this header are int.

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t*           barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int                     count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h

// -- Constant definitions --

// begin bli_extern_defs.h

#ifndef BLIS_EXTERN_DEFS_H
#define BLIS_EXTERN_DEFS_H

// Exported global objects; they are only declared here (extern), so their
// storage and initial values are provided elsewhere.
// NOTE(review): the names suggest scalar constants 2, 1, 0, -1, -2 wrapped in
// obj_t; the values themselves are not visible in this header.
BLIS_EXPORT_BLIS extern obj_t BLIS_TWO;
BLIS_EXPORT_BLIS extern obj_t BLIS_ONE;
//BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF;
BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO;
//BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF;
BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE;
BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO;

// Exported global communicator and thrinfo_t objects.
BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM;
BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED;
BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED;

#endif
// end bli_extern_defs.h

// -- BLIS architecture/kernel definitions --

// begin bli_l1v_ker_prot.h

//
// Define template prototypes for level-1v kernels.
//

// Each macro below expands to one function prototype. The function name is
// built by PASTEMAC(ch,opname) and "ctype" is the element type of the vector
// and scalar arguments. Every prototype ends with a restrict-qualified
// cntx_t* parameter.
//
// No macro ends with a line continuation after its closing ");", so the
// definition never depends on the following line being blank.
// NOTE: do not place "//" comments inside the macro bodies; a comment that
// precedes a line continuation would swallow the next line.

// y and x, with a conjugation flag for x.
#define ADDV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// One input vector; the result is written through a dim_t* index.
#define AMAXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t* restrict cntx  \
     );

// Two scalars (alpha, beta) and two vectors (x, y).
#define AXPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// One scalar (alpha) and two vectors (x, y).
#define AXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// Same parameter list as ADDV_KER_PROT.
#define COPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// Two vectors with independent conjugation flags; the result is written
// through rho.
#define DOTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );

// As DOTV_KER_PROT, with additional alpha and beta scalars.
#define DOTXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );

// A single vector and no scalars.
#define INVERTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );

// One scalar (with its own conjugation flag) and one vector.
#define SCALV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );

// Same parameter list as AXPYV_KER_PROT.
#define SCAL2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// Same parameter list as SCALV_KER_PROT.
#define SETV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );

// Same parameter list as ADDV_KER_PROT.
#define SUBV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// Two vectors and no conjugation flag.
#define SWAPV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// One scalar (beta) and two vectors (x, y).
#define XPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// end bli_l1v_ker_prot.h
// begin bli_l1f_ker_prot.h

//
// Define template prototypes for level-1f kernels.
// #define AXPY2V_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alphax, \ ctype* restrict alphay, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define AXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define DOTAXPYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); // end bli_l1f_ker_prot.h // begin bli_l1m_ker_prot.h // // Define template prototypes for level-1m kernels. 
// // native packm kernels #define PACKM_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); // native unpackm kernels #define UNPACKM_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); // 1e/1r packm kernels #define PACKM_1ER_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); // end bli_l1m_ker_prot.h // begin bli_l3_ukr_prot.h // // Define template prototypes for level-3 micro-kernels. // #define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname) #define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype_out* restrict alpha, \ ctype_in* restrict a, \ ctype_in* restrict b, \ ctype_out* restrict beta, \ ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); #define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ ctype* restrict a11, \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); #define TRSM_UKR_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); // end bli_l3_ukr_prot.h // 
// begin bli_l3_sup_ker_prot.h

//
// Define template prototypes for level-3 kernels on small/unpacked matrices.
//

// Unlike GEMM_UKR_PROT, a and b each carry their own row and column strides
// and their own conjugation flag.
// NOTE: do not place "//" comments inside the macro body; a comment that
// precedes a line continuation would swallow the next line.
#define GEMMSUP_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

// end bli_l3_sup_ker_prot.h

// begin bli_arch_config_pre.h

#ifndef BLIS_ARCH_CONFIG_PRE_H
#define BLIS_ARCH_CONFIG_PRE_H


// -- Naming-related kernel definitions ----------------------------------------

// The default suffix appended to reference kernels.
#define BLIS_REF_SUFFIX  _ref

// A suffix used for labeling certain induced method aware functions.
#define BLIS_IND_SUFFIX  _ind

// Add an underscore to the BLIS kernel set string, if it was defined.
// (When BLIS_CNAME is not defined, BLIS_CNAME_INFIX is left undefined.)
#ifdef  BLIS_CNAME
#define BLIS_CNAME_INFIX  PASTECH(_,BLIS_CNAME)
#endif

// Combine the CNAME and _ref for convenience to the code that defines
// reference kernels.
//#define BLIS_CNAME_REF_SUFFIX  PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX)

// -- Prototype-generating macro definitions -----------------------------------

// Prototype-generating macro for bli_cntx_init_*() functions.
// Expands to three prototypes for one architecture name: the plain
// initializer, a variant with BLIS_REF_SUFFIX pasted on, and a variant with
// BLIS_IND_SUFFIX pasted on that additionally takes an ind_t method.
// NOTE: do not place "//" comments inside the macro body; a comment that
// precedes a line continuation would swallow the next line.
#define CNTX_INIT_PROTS( archname ) \
\
void PASTEMAC(cntx_init_,archname) \
     ( \
       cntx_t* cntx \
     ); \
void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \
     ( \
       cntx_t* cntx \
     ); \
void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \
     ( \
       ind_t   method, \
       cntx_t* cntx \
     );


#endif

// end bli_arch_config_pre.h
// begin bli_arch_config.h

#ifndef BLIS_ARCH_CONFIG_H
#define BLIS_ARCH_CONFIG_H

//
// -- Context initialization prototypes ----------------------------------------
//

// Each BLIS_CONFIG_* macro that is defined causes the three cntx_init
// prototypes for that sub-configuration to be declared. The conditionals are
// independent, so any number of them may be active at once.

// -- Intel64 architectures --
#ifdef BLIS_CONFIG_SKX
CNTX_INIT_PROTS( skx )
#endif
#ifdef BLIS_CONFIG_KNL
CNTX_INIT_PROTS( knl )
#endif
#ifdef BLIS_CONFIG_KNC
CNTX_INIT_PROTS( knc )
#endif
#ifdef BLIS_CONFIG_HASWELL
CNTX_INIT_PROTS( haswell )
#endif
#ifdef BLIS_CONFIG_SANDYBRIDGE
CNTX_INIT_PROTS( sandybridge )
#endif
#ifdef BLIS_CONFIG_PENRYN
CNTX_INIT_PROTS( penryn )
#endif

// -- AMD64 architectures --
#ifdef BLIS_CONFIG_ZEN3
CNTX_INIT_PROTS( zen3 )
#endif
#ifdef BLIS_CONFIG_ZEN2
CNTX_INIT_PROTS( zen2 )
#endif
#ifdef BLIS_CONFIG_ZEN
CNTX_INIT_PROTS( zen )
#endif
#ifdef BLIS_CONFIG_EXCAVATOR
CNTX_INIT_PROTS( excavator )
#endif
#ifdef BLIS_CONFIG_STEAMROLLER
CNTX_INIT_PROTS( steamroller )
#endif
#ifdef BLIS_CONFIG_PILEDRIVER
CNTX_INIT_PROTS( piledriver )
#endif
#ifdef BLIS_CONFIG_BULLDOZER
CNTX_INIT_PROTS( bulldozer )
#endif

// -- ARM architectures --

#ifdef BLIS_CONFIG_ARMSVE
CNTX_INIT_PROTS( armsve )
#endif
#ifdef BLIS_CONFIG_A64FX
CNTX_INIT_PROTS( a64fx )
#endif
#ifdef BLIS_CONFIG_FIRESTORM
CNTX_INIT_PROTS( firestorm )
#endif
#ifdef BLIS_CONFIG_THUNDERX2
CNTX_INIT_PROTS( thunderx2 )
#endif
#ifdef BLIS_CONFIG_CORTEXA57
CNTX_INIT_PROTS( cortexa57 )
#endif
#ifdef BLIS_CONFIG_CORTEXA53
CNTX_INIT_PROTS( cortexa53 )
#endif
#ifdef BLIS_CONFIG_CORTEXA15
CNTX_INIT_PROTS( cortexa15 )
#endif
#ifdef BLIS_CONFIG_CORTEXA9
CNTX_INIT_PROTS( cortexa9 )
#endif

// -- IBM Power --

#ifdef BLIS_CONFIG_POWER10
CNTX_INIT_PROTS( power10 )
#endif
#ifdef BLIS_CONFIG_POWER9
CNTX_INIT_PROTS( power9 )
#endif
#ifdef BLIS_CONFIG_POWER7
CNTX_INIT_PROTS( power7 )
#endif

// -- IBM BG/Q --

#ifdef BLIS_CONFIG_BGQ
CNTX_INIT_PROTS( bgq )
#endif

// -- Generic --

#ifdef BLIS_CONFIG_GENERIC
CNTX_INIT_PROTS( generic )
#endif


//
// -- Architecture family-specific headers -------------------------------------
//

// One conditional per BLIS_FAMILY_* macro. Directives tagged "// skipped"
// name a family header whose contents were not inlined into this file; where
// the contents were inlined they sit between "begin"/"end" marker comments
// instead.

// -- x86_64 families --

#ifdef BLIS_FAMILY_INTEL64
#include "bli_family_intel64.h" // skipped
#endif
#ifdef BLIS_FAMILY_AMD64
#include "bli_family_amd64.h" // skipped
#endif
#ifdef BLIS_FAMILY_AMD64_LEGACY
#include "bli_family_amd64_legacy.h" // skipped
#endif
#ifdef BLIS_FAMILY_X86_64
#include "bli_family_x86_64.h" // skipped
#endif

#ifdef BLIS_FAMILY_X86_64_NO_SKX
#include "bli_family_x86_64_no_skx.h" // skipped
#endif

#ifdef BLIS_FAMILY_X86_64_NO_ZEN2
#include "bli_family_x86_64_no_zen2.h" // skipped
#endif

#ifdef BLIS_FAMILY_X86_64_NO_ZEN3
#include "bli_family_x86_64_no_zen3.h" // skipped
#endif

// -- Intel64 architectures --
#ifdef BLIS_FAMILY_SKX
#include "bli_family_skx.h" // skipped
#endif
#ifdef BLIS_FAMILY_KNL
#include "bli_family_knl.h" // skipped
#endif
#ifdef BLIS_FAMILY_KNC
#include "bli_family_knc.h" // skipped
#endif
#ifdef BLIS_FAMILY_HASWELL
#include "bli_family_haswell.h" // skipped
#endif
#ifdef BLIS_FAMILY_SANDYBRIDGE
#include "bli_family_sandybridge.h" // skipped
#endif
#ifdef BLIS_FAMILY_PENRYN
#include "bli_family_penryn.h" // skipped
#endif

// -- AMD64 architectures --

#ifdef BLIS_FAMILY_ZEN3
#include "bli_family_zen3.h" // skipped
#endif
#ifdef BLIS_FAMILY_ZEN2
#include "bli_family_zen2.h" // skipped
#endif
#ifdef BLIS_FAMILY_ZEN
#include "bli_family_zen.h" // skipped
#endif
#ifdef BLIS_FAMILY_EXCAVATOR
#include "bli_family_excavator.h" // skipped
#endif
#ifdef BLIS_FAMILY_STEAMROLLER
#include "bli_family_steamroller.h" // skipped
#endif
#ifdef BLIS_FAMILY_PILEDRIVER
#include "bli_family_piledriver.h" // skipped
#endif
#ifdef BLIS_FAMILY_BULLDOZER
#include "bli_family_bulldozer.h" // skipped
#endif

// -- ARM families --
#ifdef BLIS_FAMILY_ARM64_NO_SVE
//
// begin bli_family_arm64_no_sve.h

// The include guard of this family header is commented out (here and in the
// other inlined family headers below).
//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H


// -- MEMORY ALLOCATION --------------------------------------------------------

#define BLIS_SIMD_ALIGN_SIZE 16
#define BLIS_SIMD_MAX_NUM_REGISTERS 32


//#endif

// end bli_family_arm64_no_sve.h
#endif
#ifdef BLIS_FAMILY_ARM64
#include "bli_family_arm64.h" // skipped
#endif
#ifdef BLIS_FAMILY_ARM32
#include "bli_family_arm32.h" // skipped
#endif

// -- ARM architectures --

#ifdef BLIS_FAMILY_ARMSVE
#include "bli_family_armsve.h" // skipped
#endif
#ifdef BLIS_FAMILY_A64FX
#include "bli_family_a64fx.h" // skipped
#endif
#ifdef BLIS_FAMILY_FIRESTORM
// begin bli_family_firestorm.h

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H


// -- MEMORY ALLOCATION --------------------------------------------------------

// The only definition in this family header that is active.
#define BLIS_SIMD_ALIGN_SIZE 16


// Everything up to the matching #endif is disabled and never seen by the
// compiler. The trailing "//<number>" comments are kept as found; they appear
// to record alternative values that were tried.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------

#define BLIS_SGEMM_UKERNEL         bli_sgemm_opt_8x12
#define BLIS_DEFAULT_MR_S              8
#define BLIS_DEFAULT_NR_S              12
#define BLIS_DEFAULT_MC_S              120 //1536 //336 //416 // 1280 //160 // 160 // 160 //2048 //336
#define BLIS_DEFAULT_KC_S              640 //1536 //336 //704 //1280 //672 //528 // 856 //2048 //528
#define BLIS_DEFAULT_NC_S              3072

#define BLIS_DGEMM_UKERNEL         bli_dgemm_opt_6x8
#define BLIS_DEFAULT_MR_D              6
#define BLIS_DEFAULT_NR_D              8
#define BLIS_DEFAULT_MC_D              120 //1536 //160 //80 //176
#define BLIS_DEFAULT_KC_D              240 //1536 //304 //336 //368
#define BLIS_DEFAULT_NC_D              3072

#define BLIS_DEFAULT_MR_C              8
#define BLIS_DEFAULT_NR_C              4
#define BLIS_DEFAULT_MC_C              64
#define BLIS_DEFAULT_KC_C              128
#define BLIS_DEFAULT_NC_C              4096

#define BLIS_DEFAULT_MR_Z              8
#define BLIS_DEFAULT_NR_Z              4
#define BLIS_DEFAULT_MC_Z              64
#define BLIS_DEFAULT_KC_Z              128
#define BLIS_DEFAULT_NC_Z              4096
#endif


//#endif

// end bli_family_firestorm.h
#endif
#ifdef BLIS_FAMILY_THUNDERX2
#include "bli_family_thunderx2.h" // skipped
#endif
#ifdef BLIS_FAMILY_CORTEXA57
// begin bli_family_cortexa57.h

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H
// -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_opt_8x12 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 12 #define BLIS_DEFAULT_MC_S 120 //1536 //336 //416 // 1280 //160 // 160 // 160 //2048 //336 #define BLIS_DEFAULT_KC_S 640 //1536 //336 //704 //1280 //672 //528 // 856 //2048 //528 #define BLIS_DEFAULT_NC_S 3072 #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_6x8 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DEFAULT_MC_D 120 //1536 //160 //80 //176 #define BLIS_DEFAULT_KC_D 240 //1536 //304 //336 //368 #define BLIS_DEFAULT_NC_D 3072 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_DEFAULT_MC_C 64 #define BLIS_DEFAULT_KC_C 128 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_Z 8 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 128 #define BLIS_DEFAULT_NC_Z 4096 #endif //#endif // end bli_family_cortexa57.h #endif #ifdef BLIS_FAMILY_CORTEXA53 // begin bli_family_cortexa53.h // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 // end bli_family_cortexa53.h #endif #ifdef BLIS_FAMILY_CORTEXA15 #include "bli_family_cortexa15.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA9 #include "bli_family_cortexa9.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_FAMILY_POWER10 #include "bli_family_power10.h" // skipped #endif #ifdef BLIS_FAMILY_POWER9 #include "bli_family_power9.h" // skipped #endif #ifdef BLIS_FAMILY_POWER7 #include "bli_family_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_FAMILY_BGQ #include "bli_family_bgq.h" // skipped #endif // -- Generic -- #ifdef BLIS_FAMILY_GENERIC // begin bli_family_generic.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_generic.h #endif // // -- kernel set prototypes 
---------------------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_KERNELS_SKX #include "bli_kernels_skx.h" // skipped #endif #ifdef BLIS_KERNELS_KNL #include "bli_kernels_knl.h" // skipped #endif #ifdef BLIS_KERNELS_KNC #include "bli_kernels_knc.h" // skipped #endif #ifdef BLIS_KERNELS_HASWELL #include "bli_kernels_haswell.h" // skipped #endif #ifdef BLIS_KERNELS_SANDYBRIDGE #include "bli_kernels_sandybridge.h" // skipped #endif #ifdef BLIS_KERNELS_PENRYN #include "bli_kernels_penryn.h" // skipped #endif // -- AMD64 architectures -- #ifdef BLIS_KERNELS_ZEN2 #include "bli_kernels_zen2.h" // skipped #endif #ifdef BLIS_KERNELS_ZEN #include "bli_kernels_zen.h" // skipped #endif //#ifdef BLIS_KERNELS_EXCAVATOR //#include "bli_kernels_excavator.h" //#endif //#ifdef BLIS_KERNELS_STEAMROLLER //#include "bli_kernels_steamroller.h" //#endif #ifdef BLIS_KERNELS_PILEDRIVER #include "bli_kernels_piledriver.h" // skipped #endif #ifdef BLIS_KERNELS_BULLDOZER #include "bli_kernels_bulldozer.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_KERNELS_ARMSVE #include "bli_kernels_armsve.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV8A // begin bli_kernels_armv8a.h PACKM_KER_PROT( float, s, packm_armv8a_int_8xk ) PACKM_KER_PROT( float, s, packm_armv8a_int_12xk ) PACKM_KER_PROT( double, d, packm_armv8a_int_6xk ) PACKM_KER_PROT( double, d, packm_armv8a_int_8xk ) GEMM_UKR_PROT( float, s, gemm_armv8a_asm_8x12 ) GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8 ) // GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8r ) // GEMM_UKR_PROT( double, d, gemm_armv8a_asm_8x4 ) // GEMM_UKR_PROT( double, d, gemm_armv8a_asm_4x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_4x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_4x8m 
) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_8x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_int_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_int_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x3 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_int_6x4mn ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_int_3x8mn ) // end bli_kernels_armv8a.h #endif #ifdef BLIS_KERNELS_ARMV7A #include "bli_kernels_armv7a.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_KERNELS_POWER10 #include "bli_kernels_power10.h" // skipped #endif #ifdef BLIS_KERNELS_POWER9 #include "bli_kernels_power9.h" // skipped #endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_KERNELS_BGQ #include "bli_kernels_bgq.h" // skipped #endif #endif // end bli_arch_config.h // begin bli_kernel_macro_defs.h #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H // -- Define default threading parameters -------------------------------------- // -- Conventional (large code path) values -- // These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n // dimensions for the purposes of factorizing the total number of threads into // ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these // macros are used. #ifndef BLIS_THREAD_RATIO_M #define BLIS_THREAD_RATIO_M 1 #endif #ifndef BLIS_THREAD_RATIO_N #define BLIS_THREAD_RATIO_N 1 #endif // These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of // parallelism allowed when performing automatic factorization. See bli_rntm.c // to see how these macros are used. 
#ifndef BLIS_THREAD_MAX_IR #define BLIS_THREAD_MAX_IR 1 #endif #ifndef BLIS_THREAD_MAX_JR #define BLIS_THREAD_MAX_JR 4 #endif #if 0 // -- Skinny/small possibly-unpacked (sup code path) values -- #ifndef BLIS_THREAD_SUP_RATIO_M #define BLIS_THREAD_SUP_RATIO_M 1 #endif #ifndef BLIS_THREAD_SUP_RATIO_N #define BLIS_THREAD_SUP_RATIO_N 2 #endif #ifndef BLIS_THREAD_SUP_MAX_IR #define BLIS_THREAD_SUP_MAX_IR 1 #endif #ifndef BLIS_THREAD_SUP_MAX_JR #define BLIS_THREAD_SUP_MAX_JR 8 #endif #endif // -- Memory allocation -------------------------------------------------------- // hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with // libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND // was explicitly defined. #ifdef BLIS_DISABLE_MEMKIND #undef BLIS_ENABLE_MEMKIND #endif #ifdef BLIS_ENABLE_MEMKIND #include // skipped #endif // Memory allocation functions. These macros define the three types of // malloc()-style functions, and their free() counterparts: one for each // type of memory to be allocated. // NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING // THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc() // and free(): // // void* malloc( size_t size ); // void free( void* p ); // // This allocation function is called to allocate memory for blocks within // BLIS's internal memory pools. #ifndef BLIS_MALLOC_POOL // If use of libmemkind was enabled at configure-time, the default // memory allocation function for memory pools should be hbw_malloc() // instead of malloc(). #ifdef BLIS_ENABLE_MEMKIND #define BLIS_MALLOC_POOL hbw_malloc #else #define BLIS_MALLOC_POOL malloc #endif #endif #ifndef BLIS_FREE_POOL // If use of libmemkind was enabled at configure-time, the default // memory deallocation function for memory pools should be hbw_free() // instead of free(). 
#ifdef BLIS_ENABLE_MEMKIND #define BLIS_FREE_POOL hbw_free #else #define BLIS_FREE_POOL free #endif #endif // This allocation function is called to allocate memory for internally- // used objects and structures, such as control tree nodes. #ifndef BLIS_MALLOC_INTL #define BLIS_MALLOC_INTL malloc #endif #ifndef BLIS_FREE_INTL #define BLIS_FREE_INTL free #endif // This allocation function is called to allocate memory for objects // created by user-level API functions, such as bli_obj_create(). #ifndef BLIS_MALLOC_USER #define BLIS_MALLOC_USER malloc #endif #ifndef BLIS_FREE_USER #define BLIS_FREE_USER free #endif // -- Other system-related definitions ----------------------------------------- // Size of a virtual memory page. This is used to align blocks within the // memory pools. #ifndef BLIS_PAGE_SIZE #define BLIS_PAGE_SIZE 4096 #endif // The maximum number of named SIMD vector registers available for use. // When configuring with umbrella configuration families, this should be // set to the maximum number of registers across all sub-configurations in // the family. #ifndef BLIS_SIMD_MAX_NUM_REGISTERS #define BLIS_SIMD_MAX_NUM_REGISTERS 32 #endif // The maximum size (in bytes) of each SIMD vector. // When configuring with umbrella configuration families, this should be // set to the maximum SIMD size across all sub-configurations in the family. #ifndef BLIS_SIMD_MAX_SIZE #define BLIS_SIMD_MAX_SIZE 64 #endif // Alignment size (in bytes) needed by the instruction set for aligned // SIMD/vector instructions. #ifndef BLIS_SIMD_ALIGN_SIZE #define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_MAX_SIZE #endif // The maximum size in bytes of local stack buffers within macro-kernel // functions. These buffers are usually used to store a temporary copy // of a single microtile. The reason we multiply by 2 is to handle induced // methods, where we use real domain register blocksizes in units of // complex elements. 
Specifically, the macro-kernels will need this larger // micro-tile footprint, even though the virtual micro-kernels will only // ever be writing to half (real or imaginary part) at a time. #ifndef BLIS_STACK_BUF_MAX_SIZE #define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_MAX_NUM_REGISTERS * \ BLIS_SIMD_MAX_SIZE * 2 ) #endif // Alignment size used to align local stack buffers within macro-kernel // functions. #ifndef BLIS_STACK_BUF_ALIGN_SIZE #define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE #endif // Alignment size used when allocating memory via BLIS_MALLOC_USER. // To disable heap alignment, set this to 1. #ifndef BLIS_HEAP_ADDR_ALIGN_SIZE #define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE #endif // Alignment size used when sizing leading dimensions of memory allocated // via BLIS_MALLOC_USER. #ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE #define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE #endif // Alignment sizes used when allocating blocks to the internal memory // pool, via BLIS_MALLOC_POOL. #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A #define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B #define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C #define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN #define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE #endif // Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*. 
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A #define BLIS_POOL_ADDR_OFFSET_SIZE_A 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B #define BLIS_POOL_ADDR_OFFSET_SIZE_B 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C #define BLIS_POOL_ADDR_OFFSET_SIZE_C 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif #endif // end bli_kernel_macro_defs.h // -- Base operation prototypes -- // begin bli_init.h BLIS_EXPORT_BLIS void bli_init( void ); BLIS_EXPORT_BLIS void bli_finalize( void ); void bli_init_auto( void ); void bli_finalize_auto( void ); void bli_init_apis( void ); void bli_finalize_apis( void ); void bli_init_once( void ); void bli_finalize_once( void ); // end bli_init.h // begin bli_malloc.h // Typedef function pointer types for malloc() and free() substitutes. //typedef void* (*malloc_ft) ( size_t size ); //typedef void (*free_ft) ( void* p ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size ); BLIS_EXPORT_BLIS void bli_free_pool( void* p ); #endif void* bli_malloc_intl( size_t size, err_t* r_val ); void* bli_calloc_intl( size_t size, err_t* r_val ); void bli_free_intl( void* p ); BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val ); BLIS_EXPORT_BLIS void bli_free_user( void* p ); // ----------------------------------------------------------------------------- void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val ); void bli_ffree_align( free_ft f, void* p ); void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val ); void bli_ffree_noalign( free_ft f, void* p ); void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size ); void bli_fmalloc_post_check( void* p ); // end bli_malloc.h // begin bli_const.h void bli_const_init( void ); void bli_const_finalize( void ); // end bli_const.h // begin bli_obj.h // begin bli_obj_check.h void bli_obj_create_check( num_t dt, dim_t m, dim_t 
n, inc_t rs, inc_t cs, obj_t* obj ); void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj ); void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_create_scalar_check( num_t dt, obj_t* obj ); void bli_obj_free_check( obj_t* obj ); void bli_obj_create_const_check( double value, obj_t* obj ); void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b ); void bli_dt_size_check( num_t dt ); void bli_dt_string_check( num_t dt ); void bli_dt_union_check( num_t dt1, num_t dt2 ); void bli_obj_print_check( char* label, obj_t* obj ); // end bli_obj_check.h BLIS_EXPORT_BLIS void bli_obj_create ( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer ( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_without_buffer ( num_t dt, dim_t m, dim_t n, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_alloc_buffer ( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_attach_buffer ( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1 ( num_t dt, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer ( num_t dt, void* p, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_conf_to ( obj_t* s, obj_t* d ); BLIS_EXPORT_BLIS void bli_obj_free ( obj_t* obj ); void bli_adjust_strides ( dim_t m, dim_t n, siz_t elem_size, inc_t* rs, inc_t* cs, inc_t* is ); BLIS_EXPORT_BLIS siz_t bli_dt_size ( num_t dt ); BLIS_EXPORT_BLIS char* bli_dt_string ( num_t dt ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult ( dim_t dim, dim_t dim_mult ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size ( dim_t dim, siz_t elem_size, siz_t align_size ); BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size ( void* p, size_t align_size ); BLIS_EXPORT_BLIS void bli_obj_print ( char* label, 
obj_t* obj ); // end bli_obj.h // begin bli_obj_scalar.h BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached ( num_t dt, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of ( num_t dt, conj_t conj, obj_t* alpha, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_detach ( obj_t* a, obj_t* alpha ); BLIS_EXPORT_BLIS void bli_obj_scalar_attach ( conj_t conj, obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to ( num_t dt, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar ( obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_reset ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_equals ( obj_t* a, obj_t* beta ); // end bli_obj_scalar.h // begin bli_blksz.h // blksz_t query BLIS_INLINE dim_t bli_blksz_get_def ( num_t dt, blksz_t* b ) { return b->v[ dt ]; } BLIS_INLINE dim_t bli_blksz_get_max ( num_t dt, blksz_t* b ) { return b->e[ dt ]; } // blksz_t modification BLIS_INLINE void bli_blksz_set_def ( dim_t val, num_t dt, blksz_t* b ) { b->v[ dt ] = val; } BLIS_INLINE void bli_blksz_set_max ( dim_t val, num_t dt, blksz_t* b ) { b->e[ dt ] = val; } BLIS_INLINE void bli_blksz_copy ( blksz_t* b_src, blksz_t* b_dst ) { *b_dst = *b_src; } BLIS_INLINE void bli_blksz_copy_if_pos ( blksz_t* b_src, blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that // we can skip the ones that are non-positive. 
const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); } BLIS_INLINE void bli_blksz_copy_def_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); bli_blksz_set_def( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_max_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); bli_blksz_set_max( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_scale_def ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_def( dt, b ); bli_blksz_set_def( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_max( dt, b ); bli_blksz_set_max( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_def_max ( 
dim_t num, dim_t den, num_t dt, blksz_t* b ) { bli_blksz_scale_def( num, den, dt, b ); bli_blksz_scale_max( num, den, dt, b ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS blksz_t* bli_blksz_create ( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_easy ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z ); BLIS_EXPORT_BLIS void bli_blksz_free ( blksz_t* b ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); #endif void bli_blksz_reduce_def_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); void bli_blksz_reduce_max_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); // ----------------------------------------------------------------------------- dim_t bli_determine_blocksize ( dir_t direct, dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_b ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); dim_t bli_determine_blocksize_b_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); // end bli_blksz.h // begin bli_func.h // ----------------------------------------------------------------------------- 
// func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); // end bli_func.h // begin bli_mbool.h // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // ----------------------------------------------------------------------------- mbool_t* bli_mbool_create ( bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_init ( mbool_t* b, bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_free( mbool_t* b ); // end bli_mbool.h // begin bli_cntx.h #ifndef BLIS_CNTX_H #define BLIS_CNTX_H // Context object type (defined in bli_type_defs.h) // ----------------------------------------------------------------------------- // // -- cntx_t query (fields only) ----------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx ) { return cntx->blkszs; } BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) { return cntx->bmults; } 
BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) { return cntx->l3_vir_ukrs; } BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs; } BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs_prefs; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) { return cntx->l3_sup_thresh; } BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) { return cntx->l3_sup_blkszs; } BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) { return cntx->l3_sup_kers; } BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) { return cntx->l3_sup_kers_prefs; } BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) { return cntx->l1f_kers; } BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) { return cntx->l1v_kers; } BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) { return cntx->packm_kers; } BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) { return cntx->unpackm_kers; } BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) { return cntx->method; } // ----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. 
return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // 
----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. 
return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only 
index to the requested packm func_t if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* funcs = bli_cntx_packm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the packm func_t (and then extract the // datatype-specific function pointer) if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested unpackm func_t if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the unpackm func_t (and then extract the // datatype-specific function pointer) if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. 
return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. // NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. 
// NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } #if 0 // NOTE: These static functions aren't needed yet. 
BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { const num_t dt = bli_obj_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); } #endif // ----------------------------------------------------------------------------- // // -- cntx_t modification (complex) -------------------------------------------- // // NOTE: The framework does not use any of the following functions. We provide // them in order to facilitate creating/modifying custom contexts. 
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif // end bli_rntm.h // begin bli_gks.h #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* 
bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif // end bli_gks.h // begin bli_ind.h #ifndef BLIS_IND_H #define BLIS_IND_H // level-3 induced method management // begin bli_l3_ind.h #ifndef BLIS_L3_IND_H #define BLIS_L3_IND_H // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- //bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ); void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ); void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ); bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ); #endif // end bli_l3_ind.h void bli_ind_init( void ); void bli_ind_finalize( void ); BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t 
method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); char* bli_ind_get_impl_string( ind_t method ); num_t bli_ind_map_cdt_to_index( num_t dt ); #endif // end bli_ind.h // begin bli_pba.h #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H // Packing block allocator (formerly memory broker) // pba init //BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ // bli_pthread_mutex_init( &(pba->mutex), NULL ); //} //BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ // bli_pthread_mutex_destroy( &(pba->mutex) ); //} // pba query BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { return &(pba->pools[ pool_index ]); } BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { return pba->align_size; } BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { return pba->malloc_fp; } BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { return pba->free_fp; } // pba modification BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { pba->align_size = align_size; } BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { pba->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { pba->free_fp = free_fp; } // pba action BLIS_INLINE void bli_pba_lock( pba_t* pba ) { bli_pthread_mutex_lock( &(pba->mutex) ); } BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( cntx_t* cntx ); void bli_pba_finalize ( void ); 
void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif // end bli_pba.h // begin bli_pool.h #ifndef BLIS_POOL_H #define BLIS_POOL_H // -- Pool block type -- // -- Pool type -- // Pool block query BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) { return pblk->buf; } BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) { return pblk->block_size; } // Pool block modification BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk ) { pblk->buf = buf; } BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) { pblk->block_size = block_size; } // // -- pool block initialization ------------------------------------------------ // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the pblk_t type definition. An alternative to the initializer is // calling bli_pblk_clear() at runtime. 
#define BLIS_PBLK_INITIALIZER \ { \ .buf = NULL, \ .block_size = 0, \ } \ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); bli_pblk_set_block_size( 0, pblk ); } // Pool entry query BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) { return pool->block_size; } BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) { return pool->align_size; } BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) { return pool->offset_size; } BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) { return pool->malloc_fp; } BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) { return pool->free_fp; } BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); } // Pool entry modification BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ { pool->block_size = block_size; } BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ { pool->align_size = align_size; } BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ { pool->offset_size = offset_size; } BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ { pool->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* 
pool ) \ { pool->free_fp = free_fp; } BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } // ----------------------------------------------------------------------------- void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ); void bli_pool_finalize ( pool_t* restrict pool ); void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ); void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ); void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ); void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ); void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ); void bli_pool_print ( pool_t* restrict pool ); void bli_pblk_print ( pblk_t* restrict pblk ); #endif // end bli_pool.h // begin bli_array.h #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // 
----------------------------------------------------------------------------- void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ); void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ); void bli_array_finalize ( array_t* restrict array ); void* bli_array_elem ( const siz_t index, array_t* restrict array ); void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ); #endif // end bli_array.h // begin bli_apool.h #ifndef BLIS_APOOL_H #define BLIS_APOOL_H // -- Locked pool-of-arrays type -- // apool entry query BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool ) { return &(apool->pool); } BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) { return &(apool->mutex); } BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) { return pool->def_array_len; } BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) { pool_t* restrict pool = bli_apool_pool( apool ); return bli_pool_is_exhausted( pool ); } // apool action BLIS_INLINE void bli_apool_lock( apool_t* apool ) { bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); } BLIS_INLINE void bli_apool_unlock( apool_t* apool ) { bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); } // apool entry modification BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ { pool->def_array_len = def_array_len; } // ----------------------------------------------------------------------------- void bli_apool_init ( apool_t* restrict apool ); void bli_apool_finalize ( apool_t* restrict apool ); array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ); void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ); pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ); void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool ); void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ); void bli_apool_free_block 
( array_t* restrict array ); #endif // end bli_apool.h // begin bli_sba.h #ifndef BLIS_SBA_H #define BLIS_SBA_H apool_t* bli_sba_query( void ); // ----------------------------------------------------------------------------- void bli_sba_init( void ); void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( const siz_t n_threads ); void bli_sba_checkin_array ( array_t* restrict array ); void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm ); void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size ); void bli_sba_release ( rntm_t* restrict rntm, void* restrict block ); #endif // end bli_sba.h // begin bli_memsys.h #ifndef BLIS_MEMSYS_H #define BLIS_MEMSYS_H // ----------------------------------------------------------------------------- void bli_memsys_init( void ); void bli_memsys_finalize( void ); #endif // end bli_memsys.h // begin bli_mem.h #ifndef BLIS_MEM_H #define BLIS_MEM_H // mem_t object type (defined in bli_type_defs.h) // // -- mem_t query -------------------------------------------------------------- // BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) { return &(mem->pblk); } BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) { return bli_pblk_buf( bli_mem_pblk( mem ) ); } BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) { return mem->buf_type; } BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) { return mem->pool; } BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) { return mem->size; } BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); } // // -- mem_t modification ------------------------------------------------------- // BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem ) { mem->pblk = *pblk; } BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem ) { bli_pblk_set_buf( buf, &(mem->pblk) ); } BLIS_INLINE void bli_mem_set_buf_type( packbuf_t 
buf_type, mem_t* mem ) { mem->buf_type = buf_type; } BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem ) { mem->pool = pool; } BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; } // // -- mem_t initialization ----------------------------------------------------- // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the mem_t type definition. An alternative to the initializer is // calling bli_mem_clear() at runtime. #define BLIS_MEM_INITIALIZER \ { \ .pblk = BLIS_PBLK_INITIALIZER, \ .buf_type = -1, \ .pool = NULL, \ .size = 0, \ } \ BLIS_INLINE void bli_mem_clear( mem_t* mem ) { bli_mem_set_buffer( NULL, mem ); #ifdef __cplusplus const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE; // When using C++, which is strongly typed, we avoid use of -1 as a // packbuf_t value since it will result in a compile-time error. bli_mem_set_buf_type( pb, mem ); #else bli_mem_set_buf_type( ( packbuf_t )-1, mem ); #endif bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); } #endif // end bli_mem.h // begin bli_part.h // begin bli_part_check.h void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); // end bli_part_check.h // -- Matrix partitioning ------------------------------------------------------ BLIS_EXPORT_BLIS void bli_acquire_mpart ( dim_t i, dim_t j, dim_t m, dim_t n, obj_t* obj, obj_t* sub_obj ); #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) GENPROT( acquire_mpart_b2t ) GENPROT( acquire_mpart_l2r ) GENPROT( acquire_mpart_r2l ) GENPROT( acquire_mpart_tl2br ) GENPROT( 
acquire_mpart_br2tl ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ dir_t direct, \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_mdim ) GENPROT( acquire_mpart_ndim ) GENPROT( acquire_mpart_mndim ) // -- Vector partitioning ------------------------------------------------------ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_vpart_f2b ) GENPROT( acquire_vpart_b2f ) // -- Scalar acquisition ------------------------------------------------------- BLIS_EXPORT_BLIS void bli_acquire_mij ( dim_t i, dim_t j, obj_t* obj, obj_t* sub_obj ); BLIS_EXPORT_BLIS void bli_acquire_vi ( dim_t i, obj_t* obj, obj_t* sub_obj ); // end bli_part.h // begin bli_prune.h void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ); // end bli_prune.h // begin bli_query.h BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a ); // end bli_query.h // begin bli_auxinfo.h #ifndef BLIS_AUXINFO_MACRO_DEFS_H #define BLIS_AUXINFO_MACRO_DEFS_H // auxinfo_t field query BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) { return ai->schema_a; } BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) { return ai->schema_b; } BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) { return ai->a_next; } BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) { return ai->b_next; } BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) { return ai->is_a; } BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) { return ai->is_b; } BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) { return ai->ps_a; } BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) { return ai->ps_b; } BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) { 
return ai->ukr; } BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) { return ai->params; } // auxinfo_t field modification BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai ) { ai->schema_a = schema; } BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) { ai->schema_b = schema; } BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) { ai->a_next = p; } BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) { ai->b_next = p; } BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; } BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai ) { ai->is_a = is; } BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) { ai->is_b = is; } BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai ) { ai->ps_a = ps; } BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) { ai->ps_b = ps; } BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { ai->ukr = ukr; } BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) { ai->params = params; } #endif // end bli_auxinfo.h // begin bli_param_map.h // --- BLIS to BLAS/LAPACK mappings -------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval ); // --- BLAS/LAPACK to BLIS mappings -------------------------------------------- // NOTE: These static functions were converted from regular functions in order // to reduce function call overhead within the BLAS compatibility layer. 
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( 
subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); err_t bli_check_valid_cntl( void* cntl ); err_t bli_check_packm_schema_on_unpack( obj_t* a ); err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); // end bli_check.h // begin bli_error.h BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); void bli_print_msg( char* str, char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); char* bli_error_string_for_code( gint_t code ); // end bli_error.h // begin bli_f2c.h // f2c.h -- Standard Fortran to C header file // barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." // - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) #ifndef BLIS_F2C_H #define BLIS_F2C_H typedef f77_int bla_integer; typedef f77_char bla_character; //typedef char *address; //typedef short int shortint; typedef float bla_real; typedef double bla_double; typedef scomplex bla_scomplex; typedef dcomplex bla_dcomplex; typedef f77_int bla_logical; //typedef short int shortlogical; //typedef char logical1; //typedef char integer1; #ifdef INTEGER_STAR_8 // Adjust for integer*8. 
typedef long long longint; // system-dependent typedef unsigned long long ulongint; // system-dependent #define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) #define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) #endif #ifndef TRUE_ #define TRUE_ (1) #endif #ifndef FALSE_ #define FALSE_ (0) #endif // Extern is for use with -E #ifndef Extern #define Extern extern #endif // I/O stuff #ifdef f2c_i2 // for -i2 //typedef short flag; //typedef short ftnlen; typedef bla_integer ftnlen; //typedef short ftnint; #else //typedef long int flag; //typedef long int ftnlen; typedef bla_integer ftnlen; //typedef long int ftnint; #endif #ifndef VOID #define VOID void #endif #ifndef f2c_abs #define f2c_abs(x) ((x) >= 0 ? (x) : -(x)) #endif #ifndef f2c_dabs #define f2c_dabs(x) (doublereal)f2c_abs(x) #endif #ifndef f2c_min #define f2c_min(a,b) ((a) <= (b) ? (a) : (b)) #endif #ifndef f2c_max #define f2c_max(a,b) ((a) >= (b) ? (a) : (b)) #endif #ifndef f2c_dmin #define f2c_dmin(a,b) (doublereal)f2c_min(a,b) #endif #ifndef f2c_dmax #define f2c_dmax(a,b) (doublereal)f2c_max(a,b) #endif #ifndef bit_test #define bit_test(a,b) ((a) >> (b) & 1) #endif #ifndef bit_clear #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) #endif #ifndef bit_set #define bit_set(a,b) ((a) | ((uinteger)1 << (b))) #endif // undef any lower-case symbols that your C compiler predefines, e.g.: #ifndef Skip_f2c_Undefs #undef cray #undef gcos #undef mc68010 #undef mc68020 #undef mips #undef pdp11 #undef sgi #undef sparc #undef sun #undef sun2 #undef sun3 #undef sun4 #undef u370 #undef u3b #undef u3b2 #undef u3b5 #undef unix #undef vax #endif #endif // end bli_f2c.h // begin bli_machval.h // begin bli_lsame.h bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len ); // end bli_lsame.h // begin bli_slamch.h bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len ); // end bli_slamch.h // begin bli_dlamch.h bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len ); // end 
bli_dlamch.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v ); // // Prototype BLAS-like interfaces. // #undef GENTPROTR #define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \ ( \ machval_t mval, \ void* v \ ); INSERT_GENTPROTR_BASIC0( machval ) // end bli_machval.h // begin bli_getopt.h typedef struct getopt_s { char* optarg; int optind; int opterr; int optopt; } getopt_t; BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state ); BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ); // end bli_getopt.h // begin bli_opid.h BLIS_INLINE bool bli_opid_is_level3( opid_t opid ) { return ( bool ) ( BLIS_GEMM <= opid && opid <= BLIS_TRSM ); } // end bli_opid.h // begin bli_cntl.h // -- Control tree prototypes -- BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, void* params, cntl_t* sub_node ); BLIS_EXPORT_BLIS void bli_cntl_free_node ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_clear_node ( cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_mark_family ( opid_t family, cntl_t* cntl ); // ----------------------------------------------------------------------------- dim_t bli_cntl_calc_num_threads_in ( rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- // cntl_t query (fields only) BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl ) { return cntl->family; } 
// Field accessors for cntl_t. None of them checks cntl for NULL before
// dereferencing it.

BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl )
{
	return cntl->bszid;
}

BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl )
{
	return cntl->var_func;
}

BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl )
{
	return cntl->sub_prenode;
}

BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl )
{
	return cntl->sub_node;
}

BLIS_INLINE void* bli_cntl_params( cntl_t* cntl )
{
	return cntl->params;
}

BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl )
{
	// The first 64 bits (one uint64_t, i.e. 8 bytes) of the params structure
	// always hold the size of that structure. This reads that leading field
	// directly, so cntl->params must be non-NULL and must follow that layout;
	// neither condition is checked here.
	return *( ( uint64_t* )(cntl->params) );
}

// Returns the address of the mem_t embedded in the node (not a copy).
BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl )
{
	return &(cntl->pack_mem);
}

// cntl_t query (complex)

// The only query in this group that is safe to call with a NULL cntl.
BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl )
{
	return ( bool )
	       ( cntl == NULL );
}

// A node is a leaf when it has no sub_node; sub_prenode is not consulted.
BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl )
{
	return ( bool )
	       ( bli_cntl_sub_node( cntl ) == NULL );
}

// True when the node's blocksize id is anything other than BLIS_NO_PART.
BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl )
{
	return ( bool )
	       ( bli_cntl_bszid( cntl ) != BLIS_NO_PART );
}

// cntl_t modification

// Setters take the new value first and the node second.

BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl )
{
	cntl->family = family;
}

BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl )
{
	cntl->bszid = bszid;
}

BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl )
{
	cntl->var_func = var_func;
}

BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl )
{
	cntl->sub_prenode = sub_prenode;
}

BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl )
{
	cntl->sub_node = sub_node;
}

// Stores the pointer only; the params structure itself is not copied.
BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl )
{
	cntl->params = params;
}

// Copies the mem_t by value into the node; the caller's object is not
// retained.
BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl )
{
	cntl->pack_mem = *pack_mem;
}

// end bli_cntl.h
// begin bli_env.h


#ifndef BLIS_ENV_H
#define BLIS_ENV_H

// NOTE(review): presumably returns the integer value of environment variable
// `env`, or `fallback` when it is unavailable -- confirm against the
// definition, which is not part of this header.
gint_t bli_env_get_var( const char* env, gint_t fallback );
//void bli_env_set_var( const char* env, dim_t value );

#endif

// end bli_env.h
// begin bli_pack.h


#ifndef BLIS_PACK_H
#define BLIS_PACK_H
// Not exported from the library (no BLIS_EXPORT_BLIS).
void bli_pack_init( void );
void bli_pack_finalize( void );

// Getters write through the pointer; setters take the flag by value.
BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a );
BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b );
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a );
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b );

void bli_pack_init_rntm_from_env( rntm_t* rntm );

#endif

// end bli_pack.h
// begin bli_info.h



// -- General library information ----------------------------------------------

BLIS_EXPORT_BLIS char* bli_info_get_version_str( void );
BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void );


// -- General configuration-related --------------------------------------------

// Every query below takes no arguments and returns a gint_t.
BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void );


// -- Kernel implementation-related --------------------------------------------


// -- Level-3 kernel definitions --

// Each query takes an induced-method id and a datatype and returns a string.
BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt );


// -- BLIS implementation query (level-3) --------------------------------------

BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm3_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt );

// end bli_info.h
// begin bli_arch.h


#ifndef BLIS_ARCH_H
#define BLIS_ARCH_H

BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void );

void bli_arch_set_id_once( void );
void bli_arch_set_id( void );

BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id );

void bli_arch_set_logging( bool dolog );
bool bli_arch_get_logging( void );
// Variadic; the first argument is unnamed in this prototype.
// NOTE(review): presumably a printf-style format string -- confirm against
// the definition.
void bli_arch_log( char*, ... );

#endif

// end bli_arch.h
// begin bli_cpuid.h


// This section is compiled out (#if 0); it holds stand-in definitions so the
// cpuid code can be built on its own.
#if 0
  // Used only during standalone testing of ARM support.
  #define FALSE 0
  #define TRUE  1
  typedef enum
  {
	BLIS_ARCH_CORTEXA57 = 10,
	BLIS_ARCH_CORTEXA15 = 11,
	BLIS_ARCH_CORTEXA9  = 12,
	BLIS_ARCH_GENERIC   = 13
  } arch_t;
  typedef uint64_t bool;
  #define bli_abort abort
#endif

#ifndef BLIS_CPUID_H
#define BLIS_CPUID_H

arch_t   bli_cpuid_query_id( void );

// The Intel and AMD predicates take (family, model, features); the ARM
// predicates take (model, part, features).

// Intel
bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features );

// AMD
bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features );

// ARM
bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );

uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features );

// -----------------------------------------------------------------------------

//
// This section of the file was based off of cpuid.hpp from TBLIS [1].
//
// [1] https://github.com/devinamatthews/tblis
//

// Returns true only when every bit set in `want` is also set in `have`.
// A `want` of 0 therefore always yields true.
BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want )
{
	return ( have & want ) == want;
}

// -----------------------------------------------------------------------------

#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)

// cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393
// for more information on why this move was made.
//#include "cpuid.h"

void get_cpu_name( char *cpu_name );
int  vpu_count( void );

enum
{
	VENDOR_INTEL = 0,
	VENDOR_AMD,
	VENDOR_UNKNOWN
};
// Each x86 feature flag is a distinct single bit, so flags can be OR-ed
// together into one mask.
enum
{
	FEATURE_SSE3     = 0x0001,
	FEATURE_SSSE3    = 0x0002,
	FEATURE_SSE41    = 0x0004,
	FEATURE_SSE42    = 0x0008,
	FEATURE_AVX      = 0x0010,
	FEATURE_AVX2     = 0x0020,
	FEATURE_FMA3     = 0x0040,
	FEATURE_FMA4     = 0x0080,
	FEATURE_AVX512F  = 0x0100,
	FEATURE_AVX512DQ = 0x0200,
	FEATURE_AVX512PF = 0x0400,
	FEATURE_AVX512ER = 0x0800,
	FEATURE_AVX512CD = 0x1000,
	FEATURE_AVX512BW = 0x2000,
	FEATURE_AVX512VL = 0x4000
};

#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)

char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath );

// VENDOR_UNKNOWN is also declared in the x86 branch above with a different
// value (2 there, 1 here); only one branch is ever compiled.
enum
{
	VENDOR_ARM = 0,
	VENDOR_UNKNOWN
};
enum
{
	MODEL_ARMV7 = 0,
	MODEL_ARMV8,
	MODEL_UNKNOWN
};
// Single-bit flags, as in the x86 branch.
enum
{
	FEATURE_NEON = 0x01,
	FEATURE_SVE  = 0x02
};

#endif

#endif

// end bli_cpuid.h
// begin bli_string.h


void bli_string_mkupper( char* s );
// end bli_string.h
// begin bli_setgetijm.h


// Object-based setter: takes the real part (ar), imaginary part (ai), an
// (i, j) position and the target object b, and returns an err_t.
BLIS_EXPORT_BLIS err_t bli_setijm
     (
       double  ar,
       double  ai,
       dim_t   i,
       dim_t   j,
       obj_t*  b
     );

// Typed counterpart template: the matrix is passed as an untyped buffer plus
// row stride (rs) and column stride (cs). INSERT_GENTPROT_BASIC0 (defined
// outside this part of the header) instantiates it for opname.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double         ar, \
       double         ai, \
       dim_t          i, \
       dim_t          j, \
       void* restrict b, inc_t rs, inc_t cs  \
     );

INSERT_GENTPROT_BASIC0( setijm )

// -----------------------------------------------------------------------------

// Object-based getter: the real and imaginary parts are returned through ar
// and ai.
BLIS_EXPORT_BLIS err_t bli_getijm
      (
        dim_t   i,
        dim_t   j,
        obj_t*  b,
        double* ar,
        double* ai
      );

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       dim_t          i, \
       dim_t          j, \
       void* restrict b, inc_t rs, inc_t cs, \
       double*        ar, \
       double*        ai  \
     );

INSERT_GENTPROT_BASIC0( getijm )

// end bli_setgetijm.h
// begin bli_setgetijv.h


// Vector analogues of the matrix set/get interfaces: one index (i) and one
// stride (incx).
BLIS_EXPORT_BLIS err_t bli_setijv
     (
       double  ar,
       double  ai,
       dim_t   i,
       obj_t*  x
     );

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double         ar, \
       double         ai, \
       dim_t          i, \
       void* restrict x, inc_t incx  \
     );

INSERT_GENTPROT_BASIC0( setijv )

// -----------------------------------------------------------------------------

BLIS_EXPORT_BLIS err_t bli_getijv
      (
        dim_t   i,
        obj_t*  x,
        double* ar,
        double* ai
      );

// NOTE(review): the vector buffer is named `b` in this template, whereas the
// setijv template and both object-based prototypes name it `x`. The name is
// kept as-is in case the definition (not in this header) also uses `b`;
// consider aligning both.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       dim_t          i, \
       void* restrict b, inc_t incx, \
       double*        ar, \
       double*        ai  \
     );

INSERT_GENTPROT_BASIC0( getijv )

// end bli_setgetijv.h
// begin bli_setri.h


// -- setr ---------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_setrm
     (
       obj_t* alpha,
       obj_t* b
     );

BLIS_EXPORT_BLIS void bli_setrv
     (
       obj_t* alpha,
       obj_t* x
     );

// -- seti ---------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_setim
     (
       obj_t* alpha,
       obj_t* b
     );

BLIS_EXPORT_BLIS void bli_setiv
     (
       obj_t* alpha,
       obj_t* x
     );

// end bli_setri.h
// begin bli_castm.h


//
// Prototype object-based interface.
//

BLIS_EXPORT_BLIS void bli_castm
     (
       obj_t* a,
       obj_t* b
     );

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( addsc ) INSERT_GENTPROT_BASIC0( divsc ) INSERT_GENTPROT_BASIC0( mulsc ) INSERT_GENTPROT_BASIC0( subsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( invertsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTPROTR_BASIC0( absqsc ) INSERT_GENTPROTR_BASIC0( normfsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( sqrtsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTPROT_BASIC0( getsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( setsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTPROTR_BASIC0( unzipsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype_r* zeta_r, \ ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTPROTR_BASIC0( zipsc ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_igetsc ( dim_t* chi, double* zeta_r, double* zeta_i ); BLIS_EXPORT_BLIS void bli_isetsc ( double zeta_r, double zeta_i, dim_t* chi ); // end bli_l0_tapi.h // begin bli_l0_ft.h // // -- Level-0 function types 
--------------------------------------------------- // // addsc, divsc, subsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( addsc ) INSERT_GENTDEF( divsc ) INSERT_GENTDEF( subsc ) // invertsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTDEF( invertsc ) // mulsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( mulsc ) // absqsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTDEFR( absqsc ) // normfsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* norm \ ); INSERT_GENTDEFR( normfsc ) // sqrtsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( sqrtsc ) // getsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTDEF( getsc ) // setsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTDEF( setsc ) // unzipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTDEFR( unzipsc ) // zipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype_r* zeta_r, \ 
ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTDEFR( zipsc ) // end bli_l0_ft.h // Generate function pointer arrays for tapi functions. // begin bli_l0_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( absqsc ) GENPROT( normfsc ) GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( subsc ) GENPROT( invertsc ) GENPROT( sqrtsc ) GENPROT( unzipsc ) GENPROT( zipsc ) GENPROT( getsc ) GENPROT( setsc ) // end bli_l0_fpa.h // copysc // begin bli_copysc.h // // Prototype object-based interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENFRONT( copysc ) // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \ ( \ conj_t conjchi, \ void* chi, \ void* psi \ ); INSERT_GENTPROT2_BASIC0( copysc ) INSERT_GENTPROT2_MIX_D0( copysc ) INSERT_GENTPROT2_MIX_P0( copysc ) // end bli_copysc.h // end bli_l0.h // -- Level-1v operations -- // begin bli_l1v.h // begin bli_l1v_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h" // begin bli_l1v_ft_ker.h #ifndef BLIS_L1V_FT_KER_H #define BLIS_L1V_FT_KER_H // // -- Level-1v kernel function types ------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict index, \ cntx_t* cntx \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ cntx_t* cntx \ ); 
INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( xpbyv ) #endif // end bli_l1v_ft_ker.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1v_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyv ) // end 
bli_l1v_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1v_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyv ) // end bli_l1v_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed 
APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addd ) INSERT_GENTPROT_BASIC0( copyd ) INSERT_GENTPROT_BASIC0( subd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyd ) INSERT_GENTPROT_BASIC0( scal2d ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( invertd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ 
BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scald ) INSERT_GENTPROT_BASIC0( setd ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( setid ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( shiftd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( xpbyd ) // end bli_l1d_tapi.h // begin bli_l1d_ft.h // // -- Level-1d function types -------------------------------------------------- // // addd, copyd, subd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addd ) INSERT_GENTDEF( copyd ) INSERT_GENTDEF( subd ) // axpyd, scal2d #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyd ) INSERT_GENTDEF( scal2d ) // invertd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, 
\ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertd ) // scald, setd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scald ) INSERT_GENTDEF( setd ) // setid #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( setid ) // shiftd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( shiftd ) // xpbyd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyd ) // end bli_l1d_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. 
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Clear the macros that add or omit local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_tapi_ba.h


// Macro settings that let the _tapi.c files emit typed APIs without
// expert parameters.

// Mark the basic interface as the one being compiled, so the source can
// tell which interface (basic or expert) is active.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Function names (in function definitions) receive no suffix.
#undef EX_SUF
#define EX_SUF

// Function signatures and prototypes omit the expert arguments.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Declare local expert variables initialized to NULL. The "( void )"
// statements keep the compiler from warning about unused variables.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;

// end bli_tapi_ba.h

// begin bli_l1d_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addd ) INSERT_GENTPROT_BASIC0( copyd ) INSERT_GENTPROT_BASIC0( subd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyd ) INSERT_GENTPROT_BASIC0( scal2d ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( invertd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scald ) INSERT_GENTPROT_BASIC0( setd ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( setid ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( shiftd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t 
diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( xpbyd ) // end bli_l1d_tapi.h // begin bli_l1d_ft.h // // -- Level-1d function types -------------------------------------------------- // // addd, copyd, subd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addd ) INSERT_GENTDEF( copyd ) INSERT_GENTDEF( subd ) // axpyd, scal2d #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyd ) INSERT_GENTDEF( scal2d ) // invertd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertd ) // scald, setd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scald ) INSERT_GENTDEF( setd ) // setid #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( setid ) // shiftd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( shiftd ) // xpbyd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyd ) // end bli_l1d_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1d_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addd ) GENPROT( copyd ) GENPROT( subd ) GENPROT( axpyd ) GENPROT( scal2d ) GENPROT( invertd ) GENPROT( scald ) GENPROT( setd ) GENPROT( setid ) GENPROT( shiftd ) GENPROT( xpbyd ) // end bli_l1d_fpa.h // end bli_l1d.h // -- Level-1f operations -- // begin bli_l1f.h // begin bli_l1f_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( dotxf ) // end bli_l1f_check.h // Define kernel function types. // begin bli_l1f_ft_ker.h #ifndef BLIS_L1F_FT_KER_H #define BLIS_L1F_FT_KER_H // // -- Level-1f kernel function types ------------------------------------------- // // axpy2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha1, \ ctype* restrict alpha2, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpy2v ) // axpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpyf ) // dotaxpyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t 
conjy, \ dim_t m, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotaxpyv ) // dotxf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotxf ) // dotxaxpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotxaxpyf ) #endif // end bli_l1f_ft_ker.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h


// Macro settings that let the _oapi.c files emit object APIs without
// expert parameters.

// Mark the basic interface as the one being compiled, so the source can
// tell which interface (basic or expert) is active.
#undef BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Function names (in function definitions) receive no suffix.
#undef EX_SUF
#define EX_SUF

// Function signatures and prototypes omit the expert arguments.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS

// Declare local expert variables initialized to NULL. The "( void )"
// statements keep the compiler from warning about unused variables.
#undef BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;

// end bli_oapi_ba.h

// begin bli_l1f_oapi.h


//
// Prototype object-based interfaces.
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). 
// begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1f_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alphax, \ ctype* alphay, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpy2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotaxpyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxaxpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxf ) // end bli_l1f_tapi.h // begin bli_l1f_ft.h // // -- Level-1f function types -------------------------------------------------- // // axpy2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha1, \ ctype* alpha2, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpy2v ) // axpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyf ) // dotaxpyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotaxpyv ) // dotxf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* 
alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxf ) // dotxaxpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxaxpyf ) // end bli_l1f_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. 
#undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1f_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alphax, \ ctype* alphay, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpy2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotaxpyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxaxpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t 
conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxf ) // end bli_l1f_tapi.h // begin bli_l1f_ft.h // // -- Level-1f function types -------------------------------------------------- // // axpy2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha1, \ ctype* alpha2, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpy2v ) // axpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyf ) // dotaxpyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotaxpyv ) // dotxf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxf ) // dotxaxpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* 
beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxaxpyf ) // end bli_l1f_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1f_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( axpy2v ) GENPROT( axpyf ) GENPROT( dotaxpyv ) GENPROT( dotxaxpyf ) GENPROT( dotxf ) // end bli_l1f_fpa.h // end bli_l1f.h // -- Level-1m operations -- // begin bli_l1m.h // begin bli_l1m_check.h // // Prototype object-based check functions. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion begins with a comma so that it can directly follow the
// last regular parameter inside a prototype.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h

// begin bli_l1m_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
//
// First inclusion: EX_SUF is BLIS_TAPI_EX_SUF and BLIS_TAPI_EX_PARAMS appends
// cntx_t* / rntm_t*, so these are the expert prototypes. The INSERT_GENTPROT*
// macros are defined elsewhere (presumably one instantiation per datatype).
//

// x, y operations.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// alpha, x, y operations.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// alpha, x operations (these take conjalpha and have no transx parameter).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// x, beta, y operation.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// Two-datatype variant: x uses ctype_x while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h


//
// -- Level-1m function types --------------------------------------------------
//
// Each GENTDEF below names a function pointer type
// PASTECH3(ch,opname,EX_SUF,tsuf) whose parameter list mirrors the typed
// prototype of the same operation above.
//

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )

// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h

// begin bli_l1m_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
//
// Second inclusion: EX_SUF and BLIS_TAPI_EX_PARAMS are now empty, so the same
// templates produce the basic prototypes without cntx_t* / rntm_t*.
//

// x, y operations.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// alpha, x, y operations.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// alpha, x operations (these take conjalpha and have no transx parameter).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// x, beta, y operation.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// Two-datatype variant: x uses ctype_x while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h


//
// -- Level-1m function types --------------------------------------------------
//
// Same templates as the earlier inclusion of this section, now expanded with
// EX_SUF and BLIS_TAPI_EX_PARAMS empty.
//

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )

// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1m_fpa.h

//
// Prototype function pointer query interface.
//
// Each instantiation declares a query function, named with the _qfp suffix,
// that takes a datatype id and returns the matching _vft function pointer
// type. Both names embed BLIS_TAPI_EX_SUF directly rather than EX_SUF, which
// is undefined at this point.
//

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
GENPROT( axpym )
GENPROT( scal2m )
GENPROT( scalm )
GENPROT( setm )
GENPROT( xpbym )

// Two-datatype query (_qfp2), taking one datatype id for x and one for y.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );

GENPROT( xpbym_md )

// end bli_l1m_fpa.h

// Prototype level-1m implementations.
// begin bli_l1m_unb_var1.h


//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( addm ) INSERT_GENTPROT_BASIC0( copym ) INSERT_GENTPROT_BASIC0( subm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( axpym ) INSERT_GENTPROT_BASIC0( scal2m ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( xpbym ) #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ void PASTEMAC3(chx,chy,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT2_BASIC0( xpbym_md ) INSERT_GENTPROT2_MIXDP0( xpbym_md ) // end bli_l1m_unb_var1.h // Pack-related // begin bli_packm.h // begin bli_packm_alloc.h BLIS_EXPORT_BLIS void* 
bli_packm_alloc ( siz_t size_needed, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void* bli_packm_alloc_ex ( siz_t size_needed, packbuf_t pack_buf_type, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_alloc.h // begin bli_packm_cntl.h struct packm_params_s { uint64_t size; // size field must be present and come first. bszid_t bmid_m; bszid_t bmid_n; bool does_invert_diag; bool rev_iter_if_upper; bool rev_iter_if_lower; pack_t pack_schema; packbuf_t pack_buf_type; }; typedef struct packm_params_s packm_params_t; BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m; } BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n; } BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower; } BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema; } BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type; } // ----------------------------------------------------------------------------- cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, bszid_t bmid_m, bszid_t bmid_n, bool does_invert_diag, bool rev_iter_if_upper, bool rev_iter_if_lower, pack_t pack_schema, packbuf_t pack_buf_type, cntl_t* sub_node ); // end bli_packm_cntl.h // begin 
bli_packm_check.h void bli_packm_init_check ( obj_t* a, obj_t* p, cntx_t* cntx ); void bli_packm_int_check ( obj_t* a, obj_t* p, cntx_t* cntx ); // end bli_packm_check.h // begin bli_packm_init.h BLIS_EXPORT_BLIS bool bli_packm_init ( obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_init.h // begin bli_packm_int.h void bli_packm_int ( obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_int.h // begin bli_packm_scalar.h BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p ); // end bli_packm_scalar.h // begin bli_packm_part.h // -- Matrix partitioning ------------------------------------------------------ void bli_packm_acquire_mpart_t2b( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_packm_acquire_mpart_l2r( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p ); // end bli_packm_part.h // begin bli_packm_struc_cxk.h #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_struc_cxk ) INSERT_GENTPROT_BASIC0( packm_herm_cxk ) INSERT_GENTPROT_BASIC0( packm_tri_cxk ) // end bli_packm_struc_cxk.h // begin bli_packm_struc_cxk_1er.h #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ 
bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er ) INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er ) INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er ) // end bli_packm_struc_cxk_1er.h // begin bli_packm_cxk.h #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t panel_dim, \ dim_t panel_dim_max, \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_cxk ) // end bli_packm_cxk.h // begin bli_packm_cxk_1er.h #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t panel_dim, \ dim_t panel_dim_max, \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ ); INSERT_GENTPROTCO_BASIC0( packm_cxk_1er ) // end bli_packm_cxk_1er.h // Mixed datatype support. 
// The mixed-datatype packing prototypes are only declared when
// BLIS_ENABLE_GEMM_MD is defined.
#ifdef BLIS_ENABLE_GEMM_MD
// begin bli_packm_struc_cxk_md.h

// Structure-aware mixed-datatype packing: c is typed ctype_c, while kappa and
// the packed panel p are typed ctype_p.
#undef GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype_p* restrict kappa, \
       ctype_c* restrict c, inc_t incc, inc_t ldc, \
       ctype_p* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )

// Mixed-datatype 1e/1r packing helpers: a is typed ctype_a, while kappa and p
// are typed ctype_p. These take no cntx argument.
#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t conja, \
       dim_t m, \
       dim_t n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p, inc_t ldp \
     );

INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )

INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )

// end bli_packm_struc_cxk_md.h
#endif
// begin bli_packm_blk_var1.h

//
// packm params types.
//

// Table of packm kernel function pointers with two BLIS_NUM_FP_TYPES-sized
// dimensions; per the column labels below, the first index is the datatype of
// C and the second is the datatype of P.
typedef struct
{
	// Type of C     Type of P
	packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;

//
// Prototype object-based interfaces.
//

BLIS_EXPORT_BLIS void bli_packm_blk_var1
     (
       obj_t* c,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* t
     );

// end bli_packm_blk_var1.h

// end bli_packm.h
// begin bli_unpackm.h

// begin bli_unpackm_cntl.h

// Parameter record reached through the params field of an unpackm control-tree
// node (see the accessor macro below).
struct unpackm_params_s
{
	uint64_t size; // size field must be present and come first.
	unpackm_var_oft var_func;
};
typedef struct unpackm_params_s unpackm_params_t;

// Accessor macro: casts (cntl)->params to unpackm_params_t* and yields its
// var_func field.
#define bli_cntl_unpackm_params_var_func( cntl ) \
\
	( ( (unpackm_params_t*)(cntl)->params )->var_func )

// -----------------------------------------------------------------------------

// NOTE(review): both function arguments are passed as the generic void_fp
// type, whereas the struct field above is typed unpackm_var_oft.
cntl_t* bli_unpackm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       void_fp unpackm_var_func,
       cntl_t* sub_node
     );

// end bli_unpackm_cntl.h
// begin bli_unpackm_check.h

void bli_unpackm_int_check
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx
     );

// end bli_unpackm_check.h
// begin bli_unpackm_int.h

void bli_unpackm_int
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_unpackm_int.h

// begin bli_unpackm_blk_var1.h

void bli_unpackm_blk_var1
     (
       obj_t* p,
       obj_t* c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// Typed counterpart of bli_unpackm_blk_var1. The p and c buffers are declared
// as void* here; the ctype macro argument does not appear in the parameter
// list.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       doff_t diagoffc, \
       diag_t diagc, \
       uplo_t uploc, \
       trans_t transc, \
       dim_t m, \
       dim_t n, \
       dim_t m_panel, \
       dim_t n_panel, \
       void* p, inc_t rs_p, inc_t cs_p, \
       dim_t pd_p, inc_t ps_p, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )

// end bli_unpackm_blk_var1.h

// begin bli_unpackm_cxk.h

// Panel-unpacking wrapper prototype: packed panel p (with ldp) is the source
// operand listed first, followed by a (with inca/lda).
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conjp, \
       dim_t panel_dim, \
       dim_t panel_len, \
       ctype* kappa, \
       ctype* p, inc_t ldp, \
       ctype* a, inc_t inca, inc_t lda, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_cxk )

// end bli_unpackm_cxk.h
// end bli_unpackm.h
// end bli_l1m.h


// -- Level-2 operations --

// begin bli_l2.h

// begin bli_l2_check.h


//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( gemv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( ger ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( hemv ) INSERT_GENTPROT_BASIC0( symv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( her ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( syr ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( her2 ) INSERT_GENTPROT_BASIC0( syr2 ) #undef 
GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( trmv ) INSERT_GENTPROT_BASIC0( trsv ) // end bli_l2_tapi.h // begin bli_l2_ft.h // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( ger ) // hemv, symv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) // her #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( her ) // syr #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, 
inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( syr ) // her2, syr2 #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) // trmv, trsv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) // end bli_l2_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). 
#undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( gemv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( ger ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( hemv ) INSERT_GENTPROT_BASIC0( symv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( her ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( syr ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( her2 ) INSERT_GENTPROT_BASIC0( syr2 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( trmv ) INSERT_GENTPROT_BASIC0( trsv ) // end bli_l2_tapi.h // begin bli_l2_ft.h // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( ger ) // hemv, symv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ 
BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) // her #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( her ) // syr #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( syr ) // her2, syr2 #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) // trmv, trsv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) // end bli_l2_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. 
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
// void bli_l3_cntl_create_if ( opid_t family, pack_t schema_a, pack_t schema_b, obj_t* a, obj_t* b, obj_t* c, rntm_t* rntm, cntl_t* cntl_orig, cntl_t** cntl_use ); void bli_l3_cntl_free ( rntm_t* rntm, cntl_t* cntl_use, thrinfo_t* thread ); // end bli_l3_cntl.h // begin bli_l3_check.h // // Prototype object-based check functions. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx \ ); GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- void bli_gemm_basic_check ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_gemmt_basic_check ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_hemm_basic_check ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_herk_basic_check ( obj_t* alpha, obj_t* a, obj_t* ah, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_her2k_basic_check ( obj_t* alpha, obj_t* a, obj_t* bh, obj_t* b, obj_t* ah, obj_t* beta, obj_t* c, cntx_t* cntx ); void bli_l3_basic_check ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx ); // end bli_l3_check.h // begin bli_l3_int.h void bli_l3_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, 
obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_l3_int.h // begin bli_l3_packab.h void bli_l3_packa ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); void bli_l3_packb ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_l3_packab.h // Define function types. //#include "bli_l3_ft_ex.h" // begin bli_l3_ft_ukr.h #ifndef BLIS_L3_FT_UKR_H #define BLIS_L3_FT_UKR_H // // -- Level-3 micro-kernel function types -------------------------------------- // // gemm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemm ) // gemmtrsm_[lu] #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ ctype* restrict a11, \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmtrsm ) // trsm_[lu] #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( trsm ) #endif // end bli_l3_ft_ukr.h // begin bli_l3_oft.h #ifndef BLIS_L3_OFT_H #define BLIS_L3_OFT_H // // -- Level-3 object function types -------------------------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ 
cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemm ) GENTDEF( gemmt ) GENTDEF( her2k ) GENTDEF( syr2k ) // hemm, symm, trmm3 #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( hemm ) GENTDEF( symm ) GENTDEF( trmm3 ) // herk, syrk #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( herk ) GENTDEF( syrk ) // trmm, trsm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_oft)) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( trmm ) GENTDEF( trsm ) #endif // end bli_l3_oft.h // begin bli_l3_oft_var.h #ifndef BLIS_L3_OFT_VAR_H #define BLIS_L3_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( l3 ) #endif // end bli_l3_oft_var.h // begin bli_l3_blocksize.h dim_t bli_l3_determine_kc ( dir_t direct, dim_t i, dim_t dim, obj_t* a, obj_t* b, bszid_t bszid, cntx_t* cntx, cntl_t* cntl ); #undef GENPROT #define GENPROT( opname ) \ \ dim_t PASTEMAC0(opname) \ ( \ dir_t direct, \ dim_t i, \ dim_t dim, \ obj_t* a, \ obj_t* b, \ bszid_t bszid, \ cntx_t* cntx \ ); GENPROT( gemm_determine_kc ) GENPROT( gemmt_determine_kc ) GENPROT( trmm_determine_kc ) GENPROT( trsm_determine_kc ) #undef GENPROT #define GENPROT( opname ) \ \ dim_t PASTEMAC0(opname) \ ( \ dim_t i, \ dim_t dim, \ obj_t* a, \ obj_t* b, \ bszid_t bszid, \ cntx_t* cntx \ ); GENPROT( gemm_determine_kc_f ) GENPROT( gemm_determine_kc_b ) GENPROT( gemmt_determine_kc_f ) GENPROT( gemmt_determine_kc_b ) GENPROT( 
trmm_determine_kc_f ) GENPROT( trmm_determine_kc_b ) GENPROT( trsm_determine_kc_f ) GENPROT( trsm_determine_kc_b ) // end bli_l3_blocksize.h // begin bli_l3_direct.h dir_t bli_l3_direct ( obj_t* a, obj_t* b, obj_t* c, cntl_t* cntl ); // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ dir_t PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c \ ); GENPROT( gemm_direct ) GENPROT( gemmt_direct ) GENPROT( trmm_direct ) GENPROT( trsm_direct ) // end bli_l3_direct.h // begin bli_l3_prune.h #undef GENPROT #define GENPROT( dim ) \ \ void PASTEMAC(l3_prune_unref_mparts_,dim) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntl_t* cntl \ ); GENPROT( m ) GENPROT( n ) GENPROT( k ) // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname, dim ) \ \ void PASTEMAC2(opname,_prune_unref_mparts_,dim) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c \ ); GENPROT( gemm, m ) GENPROT( gemm, n ) GENPROT( gemm, k ) GENPROT( gemmt, m ) GENPROT( gemmt, n ) GENPROT( gemmt, k ) GENPROT( trmm, m ) GENPROT( trmm, n ) GENPROT( trmm, k ) GENPROT( trsm, m ) GENPROT( trsm, n ) GENPROT( trsm, k ) // end bli_l3_prune.h // begin bli_l3_schema.h void bli_l3_set_schemas ( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx ); // end bli_l3_schema.h // Prototype object APIs (basic and expert). // begin bli_l3_oapi.h // // Prototype object-based interfaces (basic). 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h // // Prototype BLAS-like interfaces with typed operands (basic). // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( gemm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ side_t side, \ uplo_t uploa, \ conj_t conja, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( hemm ) INSERT_GENTPROT_BASIC0( symm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype_r* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROTR_BASIC0( herk ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROTR_BASIC0( her2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( syrk ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t 
k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( gemmt ) INSERT_GENTPROT_BASIC0( syr2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( trmm3 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT_BASIC0( trmm ) INSERT_GENTPROT_BASIC0( trsm ) // end bli_l3_tapi.h // begin bli_l3_tapi_ex.h // // Prototype BLAS-like interfaces with typed operands (expert). 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ conj_t conja, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( hemm ) INSERT_GENTPROT_BASIC0( symm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype_r* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( herk ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( her2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( syrk ) #undef 
GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ uplo_t uploc, \ trans_t transa, \ trans_t transb, \ dim_t m, \ dim_t k, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemmt ) INSERT_GENTPROT_BASIC0( syr2k ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ trans_t transb, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm3 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ side_t side, \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm ) INSERT_GENTPROT_BASIC0( trsm ) // end bli_l3_tapi_ex.h // Define function types for small/unpacked handlers/kernels. 
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
// Instantiate the level-3 micro-kernel prototypes by aliasing GENTPROT to the
// official *_UKR_PROT templates, one family at a time.
#undef GENTPROT
#define GENTPROT GEMM_UKR_PROT

INSERT_GENTPROT_BASIC0( gemm_ukr_name )

#undef GENTPROT
#define GENTPROT GEMMTRSM_UKR_PROT

INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name )
INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name )

#undef GENTPROT
#define GENTPROT TRSM_UKR_PROT

INSERT_GENTPROT_BASIC0( trsm_l_ukr_name )
INSERT_GENTPROT_BASIC0( trsm_u_ukr_name )

// end bli_l3_ukr.h
// end bli_gemm_md_c2r_ref.h

// Define a local struct type that makes returning two values easier.
// (Both members are dom_t; presumably the computation and execution domains,
// going by their names -- confirm against the bli_gemm_md_???() definitions.)
typedef struct mddm_s
{
    dom_t comp;
    dom_t exec;
} mddm_t;

void bli_gemm_md
     (
       obj_t* a, obj_t* b, obj_t* beta, obj_t* c,
       cntx_t* cntx_local, cntx_t** cntx
     );

// One handler per three-letter case; all eight share the same signature and
// return an mddm_t by value.
mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );

// -----------------------------------------------------------------------------

void bli_gemm_md_front
     (
       obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c,
       cntx_t* cntx, rntm_t* rntm, cntl_t* cntl
     );

void bli_gemm_md_zgemm
     (
       obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c,
       cntx_t* cntx, rntm_t* rntm, cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// Returns TRUE if and only if C is complex, A and B are both real, and the
// execution domain recorded on C is BLIS_REAL (the "crr" case); FALSE
// otherwise.
BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c )
{
    bool r_val = FALSE;

    // NOTE: The last conditional subexpression is necessary if/when we
    // allow the user to specify the computation domain. (The computation
    // domain is currently ignored, but once it is honored as a user-
    // settable value, it will affect the execution domain, which is what
    // is checked below. Until then, the last expression is not actually
    // necessary since crr is already unconditionally associated with an
    // execution domain of BLIS_REAL.)
    if ( bli_obj_is_complex( c ) &&
         bli_obj_is_real( a ) &&
         bli_obj_is_real( b ) &&
         bli_obj_exec_domain( c ) == BLIS_REAL )
        r_val = TRUE;

    return r_val;
}

// Returns TRUE if and only if C and A are complex, B is real, and the
// execution domain recorded on C is BLIS_COMPLEX (the "ccr" case); FALSE
// otherwise.
BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c )
{
    bool r_val = FALSE;

    // NOTE: The last conditional subexpression is necessary if/when we
    // allow the user to specify the computation domain. (The computation
    // domain is currently ignored, but once it is honored as a user-
    // settable value, it will affect the execution domain, which is what
    // is checked below. Until then, the last expression is not actually
    // necessary since ccr is already unconditionally associated with an
    // execution domain of BLIS_COMPLEX.)
    if ( bli_obj_is_complex( c ) &&
         bli_obj_is_complex( a ) &&
         bli_obj_is_real( b ) &&
         bli_obj_exec_domain( c ) == BLIS_COMPLEX )
        r_val = TRUE;

    return r_val;
}

// Returns TRUE if and only if C and B are complex, A is real, and the
// execution domain recorded on C is BLIS_COMPLEX (the "crc" case); FALSE
// otherwise.
BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c )
{
    bool r_val = FALSE;

    // NOTE: The last conditional subexpression is necessary if/when we
    // allow the user to specify the computation domain. (The computation
    // domain is currently ignored, but once it is honored as a user-
    // settable value, it will affect the execution domain, which is what
    // is checked below. Until then, the last expression is not actually
    // necessary since crc is already unconditionally associated with an
    // execution domain of BLIS_COMPLEX.)
    if ( bli_obj_is_complex( c ) &&
         bli_obj_is_real( a ) &&
         bli_obj_is_complex( b ) &&
         bli_obj_exec_domain( c ) == BLIS_COMPLEX )
        r_val = TRUE;

    return r_val;
}

// -----------------------------------------------------------------------------

// Adjusts, in place, the macrokernel parameters for three mixed-domain
// combinations of C, A, and B (selected by *dt_c, dt_a, and dt_b):
//
//   - C real, A complex, B complex (rcc):
//       *k, *ps_a, and *ps_b are doubled.
//   - C complex, A real, B complex (crc):
//       if the scalar detached from c has a zero imaginary part, C is
//       row-stored, and C's precision equals its computation precision, then
//       *dt_comp and *dt_c are projected to real and *n, *pd_b, *ps_b, and
//       *rs_c are doubled; otherwise *ps_a is halved.
//   - C complex, A complex, B real (ccr):
//       the mirror image of crc, requiring column storage: *m, *pd_a, *ps_a,
//       and *cs_c are doubled; otherwise *ps_b is halved.
//
// Every other combination leaves all parameters untouched.
BLIS_INLINE void bli_gemm_md_ker_var2_recast
     (
       num_t* dt_comp,
       num_t dt_a,
       num_t dt_b,
       num_t* dt_c,
       dim_t* m,
       dim_t* n,
       dim_t* k,
       inc_t* pd_a, inc_t* ps_a,
       inc_t* pd_b, inc_t* ps_b,
       obj_t* c,
       inc_t* rs_c, inc_t* cs_c
     )
{
    if ( bli_is_real( *dt_c ) &&
         bli_is_complex( dt_a ) &&
         bli_is_complex( dt_b ) )
    {
        // The rcc case is executed with a real macrokernel, so we need to
        // double the k dimension (because both A and B are packed to the 1r
        // schema), and also the panel strides of A and B since they were
        // packed as complex matrices and we now need to convert them to
        // units of real elements.
        *k *= 2;
        *ps_a *= 2;
        *ps_b *= 2;
    }
    else if ( bli_is_complex( *dt_c ) &&
              bli_is_real( dt_a ) &&
              bli_is_complex( dt_b ) )
    {
// NOTE: When this region is enabled, the braced block after the #endif is
// the else-branch of the if statement inside it.
#if 1
        obj_t beta;

        bli_obj_scalar_detach( c, &beta );

        if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&
             bli_obj_imag_is_zero( &beta ) &&
             bli_is_row_stored( *rs_c, *cs_c ) &&
             bli_obj_prec( c ) == bli_obj_comp_prec( c ) )
        {
            // If beta is real, and C is row-stored, and the computation
            // precision is equal to the storage precision of C, we can use the
            // real macrokernel (and real microkernel, which is already stored
            // to the real virtual microkernel slots of the context) instead of
            // the complex macrokernel and c2r virtual microkernel.
            *dt_comp = bli_dt_proj_to_real( *dt_comp );
            *dt_c = bli_dt_proj_to_real( *dt_c );
            *n *= 2;
            *pd_b *= 2;
            *ps_b *= 2;
            *rs_c *= 2;
        }
        else
#endif
        {
            // Generally speaking, the crc case is executed with a complex
            // macrokernel, so we need to halve the panel stride of A (which
            // is real) since the macrokernel will perform the pointer
            // arithmetic in units of complex elements.
            *ps_a /= 2;
        }
    }
    else if ( bli_is_complex( *dt_c ) &&
              bli_is_complex( dt_a ) &&
              bli_is_real( dt_b ) )
    {
// NOTE: As in the crc case, the braced block after the #endif is the
// else-branch of the if statement inside this region.
#if 1
        obj_t beta;

        bli_obj_scalar_detach( c, &beta );

        if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&
             bli_obj_imag_is_zero( &beta ) &&
             bli_is_col_stored( *rs_c, *cs_c ) &&
             bli_obj_prec( c ) == bli_obj_comp_prec( c ) )
        {
            // If beta is real, and C is column-stored, and the computation
            // precision is equal to the storage precision of C, we can use the
            // real macrokernel (and real microkernel, which is already stored
            // to the real virtual microkernel slots of the context) instead of
            // the complex macrokernel and c2r virtual microkernel.
            *dt_comp = bli_dt_proj_to_real( *dt_comp );
            *dt_c = bli_dt_proj_to_real( *dt_c );
            *m *= 2;
            *pd_a *= 2;
            *ps_a *= 2;
            *cs_c *= 2;
        }
        else
#endif
        {
            // Generally speaking, the ccr case is executed with a complex
            // macrokernel, so we need to halve the panel stride of B (which
            // is real) since the macrokernel will perform the pointer
            // arithmetic in units of complex elements.
            *ps_b /= 2;
        }
    }
// NOTE(review): The disabled branches below pass dt_c (a num_t*) directly to
// bli_is_real()/bli_is_complex(), whereas the live branches above pass *dt_c.
// They would need updating before being re-enabled.
#if 0
    else if ( bli_is_real( dt_c ) &&
              bli_is_real( dt_a ) &&
              bli_is_real( dt_b ) )
    {
        // No action needed.
        //printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k );
    }
    else if ( bli_is_complex( dt_c ) &&
              bli_is_real( dt_a ) &&
              bli_is_real( dt_b ) )
    {
        // No action needed.
    }
    else if ( bli_is_real( dt_c ) &&
              bli_is_complex( dt_a ) &&
              bli_is_real( dt_b ) )
    {
        // No action needed.
    }
    else if ( bli_is_real( dt_c ) &&
              bli_is_real( dt_a ) &&
              bli_is_complex( dt_b ) )
    {
        // No action needed.
    }
#endif
}

// end bli_gemm_md.h
#endif

// end bli_gemm.h
// begin bli_hemm.h
// begin bli_hemm_front.h

// Object-based hemm front-end; like the gemm front-end signature with an
// additional leading side_t argument.
void bli_hemm_front
     (
       side_t side,
       obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c,
       cntx_t* cntx, rntm_t* rntm, cntl_t* cntl
     );

// end bli_hemm_front.h
// end bli_hemm.h
// begin bli_symm.h
// begin bli_symm_front.h

// Object-based symm front-end; same signature as bli_hemm_front().
void bli_symm_front
     (
       side_t side,
       obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c,
       cntx_t* cntx, rntm_t* rntm, cntl_t* cntl
     );

// end bli_symm_front.h
// end bli_symm.h
// begin bli_trmm.h
// begin bli_trmm_front.h

// Object-based trmm front-end; takes no beta or c operand.
void bli_trmm_front
     (
       side_t side,
       obj_t* alpha, obj_t* a, obj_t* b,
       cntx_t* cntx, rntm_t* rntm, cntl_t* cntl
     );

// end bli_trmm_front.h
// begin bli_trmm_var.h

//
// Prototype object-based interfaces.
//

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       cntl_t* cntl, \
       thrinfo_t* thread \
     );

// The three blocked-variant prototypes are commented out for trmm; only the
// kernel variants are declared.
//GENPROT( trmm_blk_var1 )
//GENPROT( trmm_blk_var2 )
//GENPROT( trmm_blk_var3 )

GENPROT( trmm_xx_ker_var2 )

GENPROT( trmm_ll_ker_var2 )
GENPROT( trmm_lu_ker_var2 )
GENPROT( trmm_rl_ker_var2 )
GENPROT( trmm_ru_ker_var2 )

//
// Prototype BLAS-like interfaces with void pointer operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
//

// Typed gemmt macrokernel template. Compared with the typed trmm/trsm
// templates, the diagonal offset is named diagoffc and the a and b operands
// each carry an additional inc_t (is_a, is_b).
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffc, \
       pack_t schema_a, \
       pack_t schema_b, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       void* alpha, \
       void* a, inc_t cs_a, inc_t is_a, \
       dim_t pd_a, inc_t ps_a, \
       void* b, inc_t rs_b, inc_t is_b, \
       dim_t pd_b, inc_t ps_b, \
       void* beta, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

// end bli_gemmt_var.h
// end bli_gemmt.h
// end bli_l3.h

// -- Utility operations --

// begin bli_util.h
// begin bli_util_check.h

//
// Prototype object-based check functions.
//

// Each template below is redefined before use and declares
// PASTEMAC(opname,_check) with the argument list of the corresponding
// operation.

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* asum \
     );

GENPROT( asumv )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x \
     );

GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* norm \
     );

GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* norm \
     );

GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x \
     );

GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* scale, \
       obj_t* sumsq \
     );

GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// NOTE(review): This one-argument template is named GENTPROT, whereas its
// siblings in this section are all named GENPROT. The generated prototype has
// the same shape either way.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* chi, \
       obj_t* psi, \
       bool* is_eq \
     );

GENTPROT( eqsc )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// -----------------------------------------------------------------------------

// Shared check helpers, declared directly rather than through a template.
void bli_utilv_xi_check ( obj_t* x, obj_t* index );

void bli_utilv_xa_check ( obj_t* x, obj_t* asum );

void bli_utilm_mkhst_check ( obj_t* a );

void bli_utilv_norm_check ( obj_t* x, obj_t* norm );

void bli_utilm_norm_check ( obj_t* x, obj_t* norm );

void bli_utilm_fprint_check
     (
       FILE* file, char* s1, obj_t* x, char* format, char* s2
     );

void bli_utilm_rand_check ( obj_t* x );

void bli_utilv_sumsqv_check ( obj_t* x, obj_t* scale, obj_t* sumsq );

// end bli_util_check.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. (The leading comma lets it be appended directly after the
// last regular parameter.)
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_util_oapi.h

//
// Prototype object-based interfaces.
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// These prototypes use PASTEMAC0 (no EX_SUF) and take no expert parameters,
// so they are only emitted when BLIS_OAPI_BASIC is defined.
// NOTE(review): presumably BLIS_OAPI_BASIC is not defined during the expert
// pass, so this section is skipped there -- confirm against the preceding
// includes.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h
// begin bli_oapi_ba.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// BLIS_OAPI_BASIC is defined by the bli_oapi_ba.h section that precedes this
// copy of bli_util_oapi.h, so the prototypes in this section are emitted
// during the basic pass.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_util_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Expert pass over bli_util_tapi.h: EX_SUF is BLIS_TAPI_EX_SUF and
// BLIS_TAPI_EX_PARAMS appends ",cntx_t* cntx, rntm_t* rntm". The GENTPROTR
// templates additionally receive ctype_r, which is used for the asum, norm,
// scale, and sumsq output pointers.

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( asumv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( mkherm )
INSERT_GENTPROT_BASIC0( mksymm )
INSERT_GENTPROT_BASIC0( mktrim )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1v )
INSERT_GENTPROTR_BASIC0( normfv )
INSERT_GENTPROTR_BASIC0( normiv )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1m )
INSERT_GENTPROTR_BASIC0( normfm )
INSERT_GENTPROTR_BASIC0( normim )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randv )
INSERT_GENTPROT_BASIC0( randnv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randm )
INSERT_GENTPROT_BASIC0( randnm )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.

// BLIS_TAPI_BASIC was #undef'd by the preceding bli_xapi_undef.h section and
// bli_tapi_ex.h does not define it, so this section is skipped during the
// expert pass.
#ifdef BLIS_TAPI_BASIC

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )

// The print prototypes take x as void* and are instantiated with the _I
// flavour of the insertion macro.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t n, \
       void* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t m, \
       dim_t n, \
       void* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//

// Each template below typedefs a function-pointer type whose name is built
// with PASTECH3(ch,opname,EX_SUF,tsuf) and whose parameter list mirrors the
// corresponding typed prototype.

// asumv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// (The two fprint types include EX_SUF in their names but, unlike their
// neighbours, do not append BLIS_TAPI_EX_PARAMS.)

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.

// These types use PASTECH2 (no EX_SUF) and are only emitted when
// BLIS_TAPI_BASIC is defined.
#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 
INSERT_GENTPROTR_BASIC0( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. #ifdef BLIS_TAPI_BASIC #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ char* s1, \ dim_t n, \ void* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ char* s1, \ dim_t m, \ dim_t n, \ void* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printm ) #endif // #ifdef BLIS_TAPI_BASIC // end bli_util_tapi.h // begin bli_util_ft.h // // -- Utility function types --------------------------------------------------- // // asumv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( asumv ) // mkherm, mksymm, mktrim #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ 
BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( mkherm ) INSERT_GENTDEF( mksymm ) INSERT_GENTDEF( mktrim ) // norm1v, normfv, normiv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( norm1v ) INSERT_GENTDEFR( normfv ) INSERT_GENTDEFR( normiv ) // norm1m, normfm, normim #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( norm1m ) INSERT_GENTDEFR( normfm ) INSERT_GENTDEFR( normim ) // fprintv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ FILE* file, \ char* s1, \ dim_t n, \ ctype* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTDEF( fprintv ) // fprintm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ FILE* file, \ char* s1, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTDEF( fprintm ) // randv, randnv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( randv ) INSERT_GENTDEF( randnv ) // randm, randnm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( randm ) INSERT_GENTDEF( randnm ) // sumsqv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, 
\ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( sumsqv ) // ----------------------------------------------------------------------------- // Operations with only basic interfaces. #ifdef BLIS_TAPI_BASIC // eqsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi, \ bool* is_eq \ ); INSERT_GENTDEF( eqsc ) // eqv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ bool* is_eq \ ); INSERT_GENTDEF( eqv ) // eqm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ bool* is_eq \ ); INSERT_GENTDEF( eqm ) #endif // #ifdef BLIS_OAPI_BASIC // end bli_util_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_util_fpa.h // // Prototype function pointer query interface. 
// #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( asumv ) GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) GENPROT( randv ) GENPROT( randnv ) GENPROT( randm ) GENPROT( randnm ) GENPROT( sumsqv ) // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( eqsc ) GENPROT( eqv ) GENPROT( eqm ) GENPROT( fprintv ) GENPROT( fprintm ) //GENPROT( printv ) //GENPROT( printm ) // end bli_util_fpa.h // Prototype level-1m implementations. // begin bli_util_unb_var1.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( asumv_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( mkherm_unb_var1 ) INSERT_GENTPROT_BASIC0( mksymm_unb_var1 ) INSERT_GENTPROT_BASIC0( mktrim_unb_var1 ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 ) INSERT_GENTPROTR_BASIC0( normfv_unb_var1 ) INSERT_GENTPROTR_BASIC0( normiv_unb_var1 ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* 
rntm \ ); INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 ) INSERT_GENTPROTR_BASIC0( normfm_unb_var1 ) INSERT_GENTPROTR_BASIC0( normim_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( randv_unb_var1 ) INSERT_GENTPROT_BASIC0( randnv_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( randm_unb_var1 ) INSERT_GENTPROT_BASIC0( randnm_unb_var1 ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 ) // ----------------------------------------------------------------------------- #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ bool PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ ); INSERT_GENTPROT_BASIC0( eqv_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ bool PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ ); INSERT_GENTPROT_BASIC0( eqm_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ FILE* file, \ char* s1, \ dim_t n, \ ctype* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( fprintv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ FILE* file, \ char* s1, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( 
fprintm ) // end bli_util_unb_var1.h // end bli_util.h // -- addon definitions -- // NOTE: These definitions should not be included much earlier since an addon // may wish to utilize other types and definitions provided by BLIS. // begin bli_addon.h #ifndef BLIS_ADDON_H #define BLIS_ADDON_H #if 0 #define BLIS_ENABLE_ADDONS #else #define BLIS_DISABLE_ADDONS #endif // Enabled addons #endif // end bli_addon.h // -- sandbox implementation -- // begin bli_sbox.h #ifndef BLIS_SBOX_H #define BLIS_SBOX_H // Each sandbox must have a bli_sandbox.h file present somewhere inside. // If a sandbox was enabled at configure-time, we need to #include its // header file here so that it will get pulled into blis.h when it is // flattened into a monolithic header. #ifdef BLIS_ENABLE_SANDBOX #include "bli_sandbox.h" // skipped #endif #endif // end bli_sbox.h // -- BLAS compatibility layer -- // begin bli_blas.h // If the CBLAS compatibility layer was enabled while the BLAS layer // was not enabled, we must enable it here. #ifdef BLIS_ENABLE_CBLAS #ifndef BLIS_ENABLE_BLAS #define BLIS_ENABLE_BLAS #endif #endif // BLIS_ENABLE_CBLAS // By default, if the BLAS compatibility layer is enabled, we define // (include) all of the BLAS prototypes. However, if the user is // #including "blis.h" and also #including another header that also // declares the BLAS functions, then we provide an opportunity to // #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below). #ifdef BLIS_ENABLE_BLAS #define BLIS_ENABLE_BLAS_DEFS #else #undef BLIS_ENABLE_BLAS_DEFS #endif // Skip prototyping all of the BLAS if the BLAS test drivers are being // compiled. #ifdef BLIS_VIA_BLASTEST #undef BLIS_ENABLE_BLAS_DEFS #endif // Skip prototyping all of the BLAS if the environment has defined the // macro BLIS_DISABLE_BLAS_DEFS. #ifdef BLIS_DISABLE_BLAS_DEFS #undef BLIS_ENABLE_BLAS_DEFS #endif // Begin including all BLAS prototypes. 
#ifdef BLIS_ENABLE_BLAS_DEFS

// -- System headers needed by BLAS compatibility layer --

// NOTE(review): this directive had lost its header name ("#include" followed
// only by the "// skipped" marker), which is not a valid #include. <math.h>
// is restored here on the assumption that it matches the upstream
// bli_blas.h -- confirm against the original header.
#include <math.h> // skipped

// -- Constants --

// Size of the func_str buffers declared by the bla_*_check macros.
#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)

// -- Utility macros --

// begin bla_r_sign.h
#ifdef BLIS_ENABLE_BLAS
double bla_r_sign( const bla_real *a, const bla_real *b );
#endif
// end bla_r_sign.h

// begin bla_d_sign.h
#ifdef BLIS_ENABLE_BLAS
double bla_d_sign( const bla_double *a, const bla_double *b );
#endif
// end bla_d_sign.h

// begin bla_r_cnjg.h
#ifdef BLIS_ENABLE_BLAS
void bla_r_cnjg( bla_scomplex *dest, const bla_scomplex *src );
#endif
// end bla_r_cnjg.h

// begin bla_d_cnjg.h
#ifdef BLIS_ENABLE_BLAS
void bla_d_cnjg( bla_dcomplex *dest, const bla_dcomplex *src );
#endif
// end bla_d_cnjg.h

// begin bla_r_imag.h
#ifdef BLIS_ENABLE_BLAS
bla_real bla_r_imag( const bla_scomplex *z );
#endif
// end bla_r_imag.h

// begin bla_d_imag.h
#ifdef BLIS_ENABLE_BLAS
double bla_d_imag( const bla_dcomplex *z );
#endif
// end bla_d_imag.h

// begin bla_c_div.h
#ifdef BLIS_ENABLE_BLAS
void bla_c_div( bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp );
#endif
// end bla_c_div.h

// begin bla_z_div.h
#ifdef BLIS_ENABLE_BLAS
void bla_z_div( bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp );
#endif
// end bla_z_div.h

// begin bla_f__cabs.h
#ifdef BLIS_ENABLE_BLAS
double bla_f__cabs( double real, double imag );
#endif
// end bla_f__cabs.h

// begin bla_r_abs.h
#ifdef BLIS_ENABLE_BLAS
double bla_r_abs( const bla_real *x );
#endif
// end bla_r_abs.h

// begin bla_d_abs.h
#ifdef BLIS_ENABLE_BLAS
double bla_d_abs( const bla_double *x );
#endif
// end bla_d_abs.h

// begin bla_c_abs.h
#ifdef BLIS_ENABLE_BLAS
double bla_c_abs( const bla_scomplex *z );
#endif
// end bla_c_abs.h

// begin bla_z_abs.h
#ifdef BLIS_ENABLE_BLAS
double bla_z_abs( const bla_dcomplex *z );
#endif
// end bla_z_abs.h

// begin bla_lsame.h
#ifdef BLIS_ENABLE_BLAS

// NOTE(review): the LAPACK_ILP64 branch carries no BLIS_EXPORT_BLAS, unlike
// the #else branch -- confirm this is intentional.
#ifdef LAPACK_ILP64
long PASTEF770(lsame)( const char *ca, const char *cb, long ca_len, long cb_len );
#else
BLIS_EXPORT_BLAS int PASTEF770(lsame)( const char *ca, const char *cb, int ca_len, int cb_len );
#endif

#endif
// end bla_lsame.h

// begin bla_xerbla.h
#ifdef BLIS_ENABLE_BLAS
BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)
(
    const bla_character *srname,
    const bla_integer   *info,
    ftnlen               srname_len
);
#endif
// end bla_xerbla.h

// begin bla_xerbla_array.h
#ifdef BLIS_ENABLE_BLAS
BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)
(
    const bla_character *srname,
    const bla_integer    srname_len,
    const bla_integer   *info
);
#endif
// end bla_xerbla_array.h

// -- Level-0 BLAS prototypes --

// begin bla_cabs1.h
#ifdef BLIS_ENABLE_BLAS
BLIS_EXPORT_BLAS bla_real   PASTEF77(s,cabs1)( bla_scomplex *z );
BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)( bla_dcomplex *z );
#endif
// end bla_cabs1.h

// -- Level-1 BLAS prototypes --

// begin bla_amax.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// The template is always defined; it is only instantiated when
// BLIS_ENABLE_BLAS is defined. The same holds for asum and axpy below.
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
    BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
    ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif
// end bla_amax.h

// begin bla_asum.h

//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
    BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
    ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif
// end bla_asum.h

// begin bla_axpy.h

//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_int* n, \
        const ftype*   alpha, \
        const ftype*   x, const f77_int* incx, \
              ftype*   y, const f77_int* incy \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif
// end bla_axpy.h

// begin bla_copy.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// copy
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
              ftype*   y, const f77_int* incy \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( copy )
#endif
// end bla_copy.h

// begin bla_dot.h
#ifdef BLIS_ENABLE_BLAS

//
// Prototype BLAS-to-BLIS interfaces.
//

// Value-returning form: the prototype's return type is ftype.
#undef  GENTPROTDOT
#define GENTPROTDOT( ftype, ch, chc, blasname ) \
    BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \
    ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
        const ftype*   y, const f77_int* incy \
    );

INSERT_GENTPROTDOTR_BLAS( dot )

#ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL

INSERT_GENTPROTDOTC_BLAS( dot )

#else

// For the "intel" complex return type, we use a hidden parameter (passed by
// address) to return the result.
#undef  GENTPROTDOT
#define GENTPROTDOT( ftype, ch, chc, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \
    ( \
        ftype*         rhop, \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
        const ftype*   y, const f77_int* incy \
    );

INSERT_GENTPROTDOTC_BLAS( dot )

#endif

// -- "Black sheep" dot product function prototypes --

BLIS_EXPORT_BLAS float PASTEF77(sd,sdot)
(
    const f77_int* n,
    const float*   sb,
    const float*   x, const f77_int* incx,
    const float*   y, const f77_int* incy
);

BLIS_EXPORT_BLAS double PASTEF77(d,sdot)
(
    const f77_int* n,
    const float*   x, const f77_int* incx,
    const float*   y, const f77_int* incy
);

#endif
// end bla_dot.h

// begin bla_nrm2.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// nrm2
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
    BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
    ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( nrm2 )
#endif
// end bla_nrm2.h

// begin bla_rot.h
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(s,rot)
(
    const bla_integer *n,
    bla_real          *sx, const bla_integer *incx,
    bla_real          *sy, const bla_integer *incy,
    const bla_real    *c__,
    const bla_real    *s
);

BLIS_EXPORT_BLAS int PASTEF77(d,rot)
(
    const bla_integer *n,
    bla_double        *dx, const bla_integer *incx,
    bla_double        *dy, const bla_integer *incy,
    const bla_double  *c__,
    const bla_double  *s
);

BLIS_EXPORT_BLAS int PASTEF77(cs,rot)
(
    const bla_integer *n,
    bla_scomplex      *cx, const bla_integer *incx,
    bla_scomplex      *cy, const bla_integer *incy,
    const bla_real    *c__,
    const bla_real    *s
);

BLIS_EXPORT_BLAS int PASTEF77(zd,rot)
(
    const bla_integer *n,
    bla_dcomplex      *zx, const bla_integer *incx,
    bla_dcomplex      *zy, const bla_integer *incy,
    const bla_double  *c__,
    const bla_double  *s
);

#endif
// end bla_rot.h

// begin bla_rotg.h
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(s,rotg)
(
    bla_real *sa,
    bla_real *sb,
    bla_real *c__,
    bla_real *s
);

BLIS_EXPORT_BLAS int PASTEF77(d,rotg)
(
    bla_double *da,
    bla_double *db,
    bla_double *c__,
    bla_double *s
);

BLIS_EXPORT_BLAS int PASTEF77(c,rotg)
(
    bla_scomplex *ca,
    bla_scomplex *cb,
    bla_real     *c__,
    bla_scomplex *s
);

BLIS_EXPORT_BLAS int PASTEF77(z,rotg)
(
    bla_dcomplex *ca,
    bla_dcomplex *cb,
    bla_double   *c__,
    bla_dcomplex *s
);

#endif
// end bla_rotg.h

// begin bla_rotm.h
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(s,rotm)
(
    const bla_integer *n,
    bla_real          *sx, const bla_integer *incx,
    bla_real          *sy, const bla_integer *incy,
    const bla_real    *sparam
);

BLIS_EXPORT_BLAS int PASTEF77(d,rotm)
(
    const bla_integer *n,
    bla_double        *dx, const bla_integer *incx,
    bla_double        *dy, const bla_integer *incy,
    const bla_double  *dparam
);

#endif
// end bla_rotm.h

// begin bla_rotmg.h
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)
(
    bla_real       *sd1,
    bla_real       *sd2,
    bla_real       *sx1,
    const bla_real *sy1,
    bla_real       *sparam
);

BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)
(
    bla_double       *dd1,
    bla_double       *dd2,
    bla_double       *dx1,
    const bla_double *dy1,
    bla_double       *dparam
);

#endif
// end bla_rotmg.h

// begin bla_scal.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// scal: alpha and x have independent types (ftype_a, ftype_x).
#undef  GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
    ( \
        const f77_int* n, \
        const ftype_a* alpha, \
              ftype_x* x, const f77_int* incx \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif
// end bla_scal.h

// begin bla_swap.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// swap: neither vector operand is const.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_int* n, \
        ftype*         x, const f77_int* incx, \
        ftype*         y, const f77_int* incy \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif
// end bla_swap.h

// begin f77_amax_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//

// The *sub wrappers are void and take an extra rval pointer as their last
// parameter; they are instantiated under BLIS_ENABLE_CBLAS rather than
// BLIS_ENABLE_BLAS.
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
    ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
        f77_int*       rval \
    );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROT_BLAS( amax )
#endif
// end f77_amax_sub.h

// begin f77_asum_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
    ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
        ftype_r*       rval \
    );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif
// end f77_asum_sub.h

// begin f77_dot_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//

// dot*sub: void wrapper with an extra rval pointer as the last parameter.
#undef  GENTPROTDOT
#define GENTPROTDOT( ftype, ch, chc, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \
    ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
        const ftype*   y, const f77_int* incy, \
        ftype*         rval \
    );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTDOT_BLAS( dot )

// -- "Black sheep" dot product function prototypes --

BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub)
(
    const f77_int* n,
    const float*   sb,
    const float*   x, const f77_int* incx,
    const float*   y, const f77_int* incy,
    float*         rval
);

BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub)
(
    const f77_int* n,
    const float*   x, const f77_int* incx,
    const float*   y, const f77_int* incy,
    double*        rval
);

#endif
// end f77_dot_sub.h

// begin f77_nrm2_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
    ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
        ftype_r*       rval \
    );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( nrm2 )
#endif
// end f77_nrm2_sub.h

// -- Level-2 BLAS prototypes --

// dense

// begin bla_gemv.h

//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* transa, \
        const f77_int*  m, \
        const f77_int*  n, \
        const ftype*    alpha, \
        const ftype*    a, const f77_int* lda, \
        const ftype*    x, const f77_int* incx, \
        const ftype*    beta, \
              ftype*    y, const f77_int* incy \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemv )
#endif
// end bla_gemv.h

// begin bla_ger.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// ger: the name is pasted from chxy, blasname and chc, in that order.
#undef  GENTPROTDOT
#define GENTPROTDOT( ftype, chxy, chc, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \
    ( \
        const f77_int* m, \
        const f77_int* n, \
        const ftype*   alpha, \
        const ftype*   x, const f77_int* incx, \
        const ftype*   y, const f77_int* incy, \
              ftype*   a, const f77_int* lda \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTDOT_BLAS( ger )
#endif
// end bla_ger.h

// begin bla_hemv.h

//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* uploa, \
        const f77_int*  m, \
        const ftype*    alpha, \
        const ftype*    a, const f77_int* lda, \
        const ftype*    x, const f77_int* incx, \
        const ftype*    beta, \
              ftype*    y, const f77_int* incy \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hemv )
#endif
// end bla_hemv.h

// begin bla_her.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// her: alpha is declared with ftype_r here, unlike hemv and her2.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* uploa, \
        const f77_int*  m, \
        const ftype_r*  alpha, \
        const ftype*    x, const f77_int* incx, \
              ftype*    a, const f77_int* lda \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( her )
#endif
// end bla_her.h

// begin bla_her2.h

//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* uploa, \
        const f77_int*  m, \
        const ftype*    alpha, \
        const ftype*    x, const f77_int* incx, \
        const ftype*    y, const f77_int* incy, \
              ftype*    a, const f77_int* lda \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( her2 )
#endif
// end bla_her2.h

// begin bla_symv.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// symv
#undef  GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* uploa, \
        const f77_int*  m, \
        const ftype*    alpha, \
        const ftype*    a, const f77_int* lda, \
        const ftype*    x, const f77_int* incx, \
        const ftype*    beta, \
              ftype*    y, const f77_int* incy \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( symv )
#endif
// end bla_symv.h

// begin bla_syr.h

//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* uploa, \
        const f77_int*  m, \
        const ftype*    alpha, \
        const ftype*    x, const f77_int* incx, \
              ftype*    a, const f77_int* lda \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( syr )
#endif
// end bla_syr.h

// begin bla_syr2.h

//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROTRO
#define GENTPROTRO( ftype, ch, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* uploa, \
        const f77_int*  m, \
        const ftype*    alpha, \
        const ftype*    x, const f77_int* incx, \
        const ftype*    y, const f77_int* incy, \
              ftype*    a, const f77_int* lda \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTRO_BLAS( syr2 )
#endif
// end bla_syr2.h

// begin bla_trmv.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// trmv: a is const, x is not.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
    BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
    ( \
        const f77_char* uploa, \
        const f77_char* transa, \
        const f77_char* diaga, \
        const f77_int*  m, \
        const ftype*    a, const f77_int* lda, \
              ftype*    x, const f77_int* incx \
    );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmv )
#endif
// end bla_trmv.h

// begin bla_trsv.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// trsv prototype template: identical argument list to the trmv template;
// `x` is the non-const buffer.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const ftype*    a, const f77_int* lda, \
             ftype*    x, const f77_int* incx  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif
// end bla_trsv.h
// begin bla_gemv_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument check for gemv. The first failing test selects the info code; a
// non-zero code is reported through xerbla (with an upper-cased name built
// from dt_str and op_str) and the macro then executes `return;`, so it must
// be expanded inside a function returning void.
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
    const f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    const f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    const f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
    /* First matching condition wins; 0 means all arguments are acceptable. */ \
    f77_int info = \
        ( !( nota || ta || conja ) )  ?  1 : \
        ( *m < 0 )                    ?  2 : \
        ( *n < 0 )                    ?  3 : \
        ( *lda < bli_max( 1, *m ) )   ?  6 : \
        ( *incx == 0 )                ?  8 : \
        ( *incy == 0 )                ? 11 : \
                                         0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemv_check.h
// begin bla_ger_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument check for ger. The reported name is dt_str + op_str + conj_str
// (conj_str left-justified in two columns). Expands to a block that may
// execute `return;`.
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
    /* First matching condition wins; 0 means all arguments are acceptable. */ \
    f77_int info = \
        ( *m < 0 )                    ? 1 : \
        ( *n < 0 )                    ? 2 : \
        ( *incx == 0 )                ? 5 : \
        ( *incy == 0 )                ? 7 : \
        ( *lda < bli_max( 1, *m ) )   ? 9 : \
                                        0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_ger_check.h
// begin bla_hemv_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument check for hemv (also used for symv through the alias defined
// further down). Expands to a block that may execute `return;`.
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
    const f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    /* First matching condition wins; 0 means all arguments are acceptable. */ \
    f77_int info = \
        ( !( lower || upper ) )       ?  1 : \
        ( *m < 0 )                    ?  2 : \
        ( *lda < bli_max( 1, *m ) )   ?  5 : \
        ( *incx == 0 )                ?  7 : \
        ( *incy == 0 )                ? 10 : \
                                         0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_hemv_check.h
// begin bla_her_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument check for her (also used for syr through the alias defined
// further down). Expands to a block that may execute `return;`.
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
    const f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    /* First matching condition wins; 0 means all arguments are acceptable. */ \
    f77_int info = \
        ( !( lower || upper ) )       ? 1 : \
        ( *m < 0 )                    ? 2 : \
        ( *incx == 0 )                ? 5 : \
        ( *lda < bli_max( 1, *m ) )   ? 7 : \
                                        0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her_check.h
// begin bla_her2_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument check for her2 (also used for syr2 through the alias defined
// further down). Expands to a block that may execute `return;`.
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
    const f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    /* First matching condition wins; 0 means all arguments are acceptable. */ \
    f77_int info = \
        ( !( lower || upper ) )       ? 1 : \
        ( *m < 0 )                    ? 2 : \
        ( *incx == 0 )                ? 5 : \
        ( *incy == 0 )                ? 7 : \
        ( *lda < bli_max( 1, *m ) )   ? 9 : \
                                        0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her2_check.h
// begin bla_symv_check.h

#ifdef BLIS_ENABLE_BLAS

// symv reuses the hemv argument check verbatim.
#define bla_symv_check bla_hemv_check

#endif
// end bla_symv_check.h
// begin bla_syr_check.h

#ifdef BLIS_ENABLE_BLAS

// syr reuses the her argument check verbatim.
#define bla_syr_check bla_her_check

#endif
// end bla_syr_check.h
// begin bla_syr2_check.h

#ifdef BLIS_ENABLE_BLAS

// syr2 reuses the her2 argument check verbatim.
#define bla_syr2_check bla_her2_check

#endif
// end bla_syr2_check.h
// begin bla_trmv_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument check for trmv (also used for trsv through the alias defined
// below). Expands to a block that may execute `return;`.
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
    const f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    const f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    const f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    const f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    const f77_int unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    const f77_int nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
    /* First matching condition wins; 0 means all arguments are acceptable. */ \
    f77_int info = \
        ( !( lower || upper ) )        ? 1 : \
        ( !( nota || ta || conja ) )   ? 2 : \
        ( !( unita || nonua ) )        ? 3 : \
        ( *m < 0 )                     ? 4 : \
        ( *lda < bli_max( 1, *m ) )    ? 6 : \
        ( *incx == 0 )                 ? 8 : \
                                         0; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_trmv_check.h
// begin bla_trsv_check.h

#ifdef BLIS_ENABLE_BLAS

// trsv reuses the trmv argument check verbatim.
#define bla_trsv_check bla_trmv_check

#endif
// end bla_trsv_check.h

// packed
// begin bla_hpmv.h

#ifdef BLIS_ENABLE_BLAS

// hpmv: explicit prototypes (no template macro); these return int.
BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)
     (
       const bla_character* uplo,
       const bla_integer*   n,
       const bla_scomplex*  alpha,
       const bla_scomplex*  ap,
       const bla_scomplex*  x, const bla_integer* incx,
       const bla_scomplex*  beta,
             bla_scomplex*  y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)
     (
       const bla_character* uplo,
       const bla_integer*   n,
       const bla_dcomplex*  alpha,
       const bla_dcomplex*  ap,
       const bla_dcomplex*  x, const bla_integer* incx,
       const bla_dcomplex*  beta,
             bla_dcomplex*  y, const bla_integer* incy
     );

#endif
// end bla_hpmv.h
// begin bla_hpr.h

#ifdef BLIS_ENABLE_BLAS

// hpr: alpha is a real scalar (bla_real / bla_double) for the complex routines.
BLIS_EXPORT_BLAS int PASTEF77(c,hpr)
     (
       const bla_character* uplo,
       const bla_integer*   n,
       const bla_real*      alpha,
       const bla_scomplex*  x, const bla_integer* incx,
             bla_scomplex*  ap
     );

BLIS_EXPORT_BLAS int PASTEF77(z,hpr)
     (
       const bla_character* uplo,
       const bla_integer*   n,
       const bla_double*    alpha,
       const bla_dcomplex*  x, const bla_integer* incx,
             bla_dcomplex*  ap
     );

#endif
// end bla_hpr.h
// begin bla_hpr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)
     (
       const bla_character* uplo,
       const bla_integer*   n,
       const bla_scomplex*  alpha,
       const bla_scomplex*  x, const bla_integer* incx,
       const bla_scomplex*  y, const bla_integer* incy,
             bla_scomplex*  ap
     );

BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)
     (
       const bla_character* uplo,
       const bla_integer*   n,
       const bla_dcomplex*  alpha,
       const bla_dcomplex*  x, const bla_integer* incx,
       const bla_dcomplex*  y, const bla_integer* incy,
             bla_dcomplex*  ap
     );

#endif
// end bla_hpr2.h
// begin bla_spmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spmv)
     (
       const bla_character* uplo,
       const bla_integer*   n,
       const bla_double*    alpha,
       const bla_double*    ap,
       const bla_double*    x, const bla_integer* incx,
       const bla_double*    beta,
             bla_double*    y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(s,spmv)
     (
       const bla_character* uplo,
       const bla_integer*   n,
       const bla_real*      alpha,
       const bla_real*      ap,
       const bla_real*      x, const bla_integer* incx,
       const bla_real*      beta,
             bla_real*      y, const bla_integer* incy
     );

#endif
// end bla_spmv.h
// begin bla_spr.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr)
     (
       const bla_character* uplo,
       const bla_integer*   n,
       const bla_double*    alpha,
       const bla_double*    x, const bla_integer* incx,
             bla_double*    ap
     );

BLIS_EXPORT_BLAS int PASTEF77(s,spr)
     (
       const bla_character* uplo,
       const bla_integer*   n,
       const bla_real*      alpha,
       const bla_real*      x, const bla_integer* incx,
             bla_real*      ap
     );

#endif
// end bla_spr.h
// begin bla_spr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr2)
     (
       const bla_character* uplo,
       const bla_integer*   n,
       const bla_double*    alpha,
       const bla_double*    x, const bla_integer* incx,
       const bla_double*    y, const bla_integer* incy,
             bla_double*    ap
     );

BLIS_EXPORT_BLAS int PASTEF77(s,spr2)
     (
       const bla_character* uplo,
       const bla_integer*   n,
       const bla_real*      alpha,
       const bla_real*      x, const bla_integer* incx,
       const bla_real*      y, const bla_integer* incy,
             bla_real*      ap
     );

#endif
// end bla_spr2.h
// begin bla_tpmv.h

#ifdef BLIS_ENABLE_BLAS

// tpmv: `ap` is read-only, `x` is the non-const buffer.
BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)
     (
       const bla_character* uplo,
       const bla_character* trans,
       const bla_character* diag,
       const bla_integer*   n,
       const bla_scomplex*  ap,
             bla_scomplex*  x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)
     (
       const bla_character* uplo,
       const bla_character* trans,
       const bla_character* diag,
       const bla_integer*   n,
       const bla_double*    ap,
             bla_double*    x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)
     (
       const bla_character* uplo,
       const bla_character* trans,
       const bla_character* diag,
       const bla_integer*   n,
       const bla_real*      ap,
             bla_real*      x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)
     (
       const bla_character* uplo,
       const bla_character* trans,
       const bla_character* diag,
       const bla_integer*   n,
       const bla_dcomplex*  ap,
             bla_dcomplex*  x, const bla_integer* incx
     );

#endif
// end bla_tpmv.h
// begin bla_tpsv.h

#ifdef BLIS_ENABLE_BLAS

// tpsv: same argument list as tpmv.
BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)
     (
       const bla_character* uplo,
       const bla_character* trans,
       const bla_character* diag,
       const bla_integer*   n,
       const bla_scomplex*  ap,
             bla_scomplex*  x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)
     (
       const bla_character* uplo,
       const bla_character* trans,
       const bla_character* diag,
       const bla_integer*   n,
       const bla_double*    ap,
             bla_double*    x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)
     (
       const bla_character* uplo,
       const bla_character* trans,
       const bla_character* diag,
       const bla_integer*   n,
       const bla_real*      ap,
             bla_real*      x, const bla_integer* incx
     );

BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)
     (
       const bla_character* uplo,
       const bla_character* trans,
       const bla_character* diag,
       const bla_integer*   n,
       const bla_dcomplex*  ap,
             bla_dcomplex*  x, const bla_integer* incx
     );

#endif
// end bla_tpsv.h

// banded
// begin bla_gbmv.h

#ifdef BLIS_ENABLE_BLAS

// gbmv: `a` and `x` are read-only, `y` is the non-const buffer.
BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)
     (
       const bla_character* trans,
       const bla_integer*   m,
       const bla_integer*   n,
       const bla_integer*   kl,
       const bla_integer*   ku,
       const bla_scomplex*  alpha,
       const bla_scomplex*  a, const bla_integer* lda,
       const bla_scomplex*  x, const bla_integer* incx,
       const bla_scomplex*  beta,
             bla_scomplex*  y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)
     (
       const bla_character* trans,
       const bla_integer*   m,
       const bla_integer*   n,
       const bla_integer*   kl,
       const bla_integer*   ku,
       const bla_double*    alpha,
       const bla_double*    a, const bla_integer* lda,
       const bla_double*    x, const bla_integer* incx,
       const bla_double*    beta,
             bla_double*    y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)
     (
       const bla_character* trans,
       const bla_integer*   m,
       const bla_integer*   n,
       const bla_integer*   kl,
       const bla_integer*   ku,
       const bla_real*      alpha,
       const bla_real*      a, const bla_integer* lda,
       const bla_real*      x, const bla_integer* incx,
       const bla_real*      beta,
             bla_real*      y, const bla_integer* incy
     );

BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)
     (
       const bla_character* trans,
       const bla_integer*   m,
       const bla_integer*   n,
       const bla_integer*   kl,
       const bla_integer*   ku,
       const bla_dcomplex*  alpha,
       const bla_dcomplex*  a, const bla_integer* lda,
       const bla_dcomplex*  x, const bla_integer* incx,
       const bla_dcomplex*  beta,
             bla_dcomplex*  y, const bla_integer* incy
     );

#endif
// end bla_gbmv.h
// begin bla_hbmv.h
// hbmv: explicit prototypes, emitted only when the BLAS compatibility layer is
// enabled. `a` and `x` are read-only; `y` is the non-const buffer.
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hbmv.h
// begin bla_sbmv.h

// sbmv: same argument list as hbmv, declared for the d and s prefixes.
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_sbmv.h
// begin bla_tbmv.h

// tbmv: `a` is read-only; `x` is the non-const buffer.
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbmv.h
// begin bla_tbsv.h

// tbsv: same argument list as tbmv.
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbsv.h

// -- Level-3 BLAS prototypes --

// begin bla_gemm.h
//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemm ) #endif // end bla_gemm.h // begin bla_hemm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemm ) #endif // end bla_hemm.h // begin bla_herk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype_r* alpha, \ const ftype* a, const f77_int* lda, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( herk ) #endif // end bla_herk.h // begin bla_her2k.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2k ) #endif // end bla_her2k.h // begin bla_symm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( symm ) #endif // end bla_symm.h // begin bla_syrk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syrk ) #endif // end bla_syrk.h // begin bla_syr2k.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syr2k ) #endif // end bla_syr2k.h // begin bla_trmm.h // // Prototype BLAS-to-BLIS interfaces. 
//
// trmm prototype template: `a` is read-only; `b` is the non-const buffer.
// NOTE(review): INSERT_GENTPROT_BLAS is defined outside this excerpt;
// presumably it expands GENTPROT once per supported datatype -- confirm.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
             ftype*    b, const f77_int* ldb  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmm )
#endif
// end bla_trmm.h
// begin bla_trsm.h
//
// Prototype BLAS-to-BLIS interfaces.
//
// trsm prototype template: identical argument list to the trmm template.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
             ftype*    b, const f77_int* ldb  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsm )
#endif
// end bla_trsm.h
// begin bla_gemm_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument check for gemm. All level-3 check macros in this group share one
// shape: the first failing test selects the info code, a non-zero code is
// reported through xerbla with an upper-cased name built from dt_str and
// op_str, and the macro then executes `return;` -- so each must be expanded
// inside a function returning void.
#define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota,  notb; \
    f77_int conja, conjb; \
    f77_int ta,    tb; \
    f77_int nrowa, nrowb; \
\
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    /* Row counts of a and b as stored, which bound lda and ldb below. */ \
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else        { nrowb = *n; } \
\
    if      ( !nota && !conja && !ta ) \
        info = 1; \
    else if ( !notb && !conjb && !tb ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *n < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemm_check.h
// begin bla_hemm_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument check for hemm (also used for symm through the alias defined
// further down). Expands to a block that may execute `return;`.
#define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int left, right; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    left  = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
    right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    /* Row count of a depends on which side it multiplies from. */ \
    if ( left ) { nrowa = *m; } \
    else        { nrowa = *n; } \
\
    if      ( !left && !right ) \
        info = 1; \
    else if ( !lower && !upper ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *n < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldb < bli_max( 1, *m ) ) \
        info = 9; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_hemm_check.h
// begin bla_herk_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument check for herk. Only "N" and "C" are accepted for transa.
// The uplo character is read through the macro parameter `uploa`, so the
// expansion depends only on the arguments that were passed in and not on any
// identifier that happens to be in scope at the expansion site.
#define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, conja; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_herk_check.h
// begin bla_her2k_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument check for her2k. Only "N" and "C" are accepted for trans; a and b
// share the same row-count bound.
#define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, conja; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldb < bli_max( 1, nrowa ) ) \
        info = 9; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her2k_check.h
// begin bla_symm_check.h

#ifdef BLIS_ENABLE_BLAS

// symm reuses the hemm argument check verbatim.
#define bla_symm_check bla_hemm_check

#endif
// end bla_symm_check.h
// begin bla_syrk_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument check for syrk. transa "C" is accepted only when dt_str starts
// with 's' or 'd'; for any other first character only "N" and "T" pass.
// The uplo character is read through the macro parameter `uploa`, so the
// expansion depends only on the arguments that were passed in and not on any
// identifier that happens to be in scope at the expansion site.
#define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    f77_int info = 0; \
    f77_int is_r; \
    f77_int nota, ta, cta; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    static char* dt_cst = dt_str; \
\
    is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    cta   = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !ta && (is_r ? !cta : 1) ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_syrk_check.h
// begin bla_syr2k_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument check for syr2k. Same transa rule as syrk: "C" is accepted only
// when dt_str starts with 's' or 'd'.
#define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int is_r; \
    f77_int nota, ta, cta; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    static char* dt_cst = dt_str; \
\
    is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \
    cta   = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !ta && (is_r ? !cta : 1) ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldb < bli_max( 1, nrowa ) ) \
        info = 9; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_syr2k_check.h
// begin bla_trmm_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument check for trmm (also used for trsm through the alias defined
// below). Expands to a block that may execute `return;`.
#define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \
{ \
    f77_int info = 0; \
    f77_int left, right; \
    f77_int lower, upper; \
    f77_int nota, ta, conja; \
    f77_int unita, nonua; \
    f77_int nrowa; \
\
    left  = PASTEF770(lsame)( sidea,  "L", (ftnlen)1, (ftnlen)1 ); \
    right = PASTEF770(lsame)( sidea,  "R", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
    /* Row count of a depends on which side it multiplies from. */ \
    if ( left ) { nrowa = *m; } \
    else        { nrowa = *n; } \
\
    if      ( !left && !right ) \
        info = 1; \
    else if ( !lower && !upper ) \
        info = 2; \
    else if ( !nota && !ta && !conja ) \
        info = 3; \
    else if ( !unita && !nonua ) \
        info = 4; \
    else if ( *m < 0 ) \
        info = 5; \
    else if ( *n < 0 ) \
        info = 6; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 9; \
    else if ( *ldb < bli_max( 1, *m ) ) \
        info = 11; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_trmm_check.h
// begin bla_trsm_check.h

#ifdef BLIS_ENABLE_BLAS

// trsm reuses the trmm argument check verbatim.
#define bla_trsm_check bla_trmm_check

#endif
// end bla_trsm_check.h

// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h
//
// Prototype BLAS-to-BLIS interfaces.
//
// axpby prototype template: `x` is read-only; `y` is the non-const buffer.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
       const ftype*   beta, \
             ftype*   y, const f77_int* incy  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif
// end bla_axpby.h

// level-3
// begin bla_gemmt.h
//
// Prototype BLAS-to-BLIS interfaces.
//
// gemmt prototype template: like the gemm template but with a leading uploc
// argument and no separate n dimension.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif
// end bla_gemmt.h
// begin bla_gemmt_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument check for gemmt. Expands to a block that may execute `return;`.
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota,  notb; \
    f77_int conja, conjb; \
    f77_int ta,    tb; \
    f77_int lower, upper; \
    f77_int nrowa, nrowb; \
\
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \
\
    /* There is no n argument; m bounds the transposed-b case. */ \
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else        { nrowb = *m; } \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja && !ta ) \
        info = 2; \
    else if ( !notb && !conjb && !tb ) \
        info = 3; \
    else if ( *m < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h
//
// Prototype BLAS-to-BLIS interfaces.
//
// gemm_batch prototype template: every per-problem argument is an array, and
// the matrices are arrays of pointers (ftype**).
// NOTE(review): how group_count and group_size index those arrays is not
// visible in this excerpt -- confirm against the implementation.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa_array, \
       const f77_char* transb_array, \
       const f77_int*  m_array, \
       const f77_int*  n_array, \
       const f77_int*  k_array, \
       const ftype*    alpha_array, \
       const ftype**   a_array, const f77_int* lda_array, \
       const ftype**   b_array, const f77_int* ldb_array, \
       const ftype*    beta_array, \
             ftype**   c_array, const f77_int* ldc_array, \
       const f77_int*  group_count, \
       const f77_int*  group_size  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif
// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h
//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( gemm3m ) #endif // end bla_gemm3m.h // begin bla_gemm3m_check.h #ifdef BLIS_ENABLE_BLAS #define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, notb; \ f77_int conja, conjb; \ f77_int ta, tb; \ f77_int nrowa, nrowb; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ if ( notb ) { nrowb = *k; } \ else { nrowb = *n; } \ \ if ( !nota && !conja && !ta ) \ info = 1; \ else if ( !notb && !conjb && !tb ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *k < 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 8; \ else if ( *ldb < bli_max( 1, nrowb ) ) \ info = 10; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 13; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_gemm3m_check.h // -- Fortran-compatible APIs to BLIS functions -- // begin b77_thread.h // // Prototype Fortran-compatible BLIS interfaces. 
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); // end b77_thread.h #endif // BLIS_ENABLE_BLAS // end bli_blas.h // -- CBLAS compatibility layer -- // begin bli_cblas.h #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. // begin cblas.h #ifndef CBLAS_H #define CBLAS_H #include // skipped // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. 
// Exactly one of BLIS_ARCH_64 / BLIS_ARCH_32 is defined: BLIS_ARCH_64 when
// any of the listed 64-bit target macros is predefined, BLIS_ARCH_32
// otherwise.
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
    defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64)
#define BLIS_ARCH_64
#else
#define BLIS_ARCH_32
#endif

// Determine the target operating system.
// Exactly one BLIS_OS_* macro is defined. Every one of them is defined to 1
// so that each can be tested with a plain `#if BLIS_OS_x`, as is done with
// BLIS_OS_WINDOWS and BLIS_OS_OSX in this file; a macro defined with an empty
// body would make such an #if ill-formed. `#ifdef` / `defined()` tests keep
// working unchanged.
#if defined(BLIS_ENABLE_SYSTEM)
#if defined(_WIN32) || defined(__CYGWIN__)
#define BLIS_OS_WINDOWS 1
#elif defined(__gnu_hurd__)
#define BLIS_OS_GNU 1
#elif defined(__APPLE__) || defined(__MACH__)
#define BLIS_OS_OSX 1
#elif defined(__ANDROID__)
#define BLIS_OS_ANDROID 1
#elif defined(__linux__)
#define BLIS_OS_LINUX 1
#elif defined(__bgq__)
#define BLIS_OS_BGQ 1
#elif defined(__bg__)
#define BLIS_OS_BGP 1
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
      defined(__bsdi__) || defined(__DragonFly__) || \
      defined(__FreeBSD_kernel__) || defined(__HAIKU__)
#define BLIS_OS_BSD 1
#elif defined(EMSCRIPTEN)
#define BLIS_OS_EMSCRIPTEN 1
#else
#error "Cannot determine operating system"
#endif
#else // #if defined(BLIS_DISABLE_SYSTEM)
#define BLIS_OS_NONE 1
#endif

// A few changes that may be necessary in Windows environments.
#if BLIS_OS_WINDOWS

// Include Windows header file.
// NOTE(review): no header name is visible after the #include below --
// confirm against the upstream file.
#define WIN32_LEAN_AND_MEAN
#define VC_EXTRALEAN
#include // skipped

#if !defined(__clang__) && !defined(__GNUC__)
// Undefine attribute specifiers in Windows.
// (Implemented by defining __attribute__(x) to expand to nothing.)
#define __attribute__(x)

// Undefine restrict.
// (Implemented by defining restrict to expand to nothing.)
#define restrict
#endif

#endif

// time.h provides clock_gettime().
// NOTE(review): no header names are visible after the #include directives
// in this conditional -- confirm against the upstream file.
#if BLIS_OS_WINDOWS
#include // skipped
#elif BLIS_OS_OSX
#include // skipped
#else
//#include
#include // skipped
#endif


#endif
// end bli_system.h
// begin bli_config.h


// bli_config.h: build-time configuration switches.
// NOTE(review): the literal constants in the #if conditions below
// (`#if 1`, `#if 64 == 64`, ...) look like values substituted by the build's
// configure step -- confirm before editing them by hand.
#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H

// Enabled configuration "family" (config_name)
#define BLIS_FAMILY_ARM64_NO_SVE


// Enabled sub-configurations (config_list)
#define BLIS_CONFIG_FIRESTORM
#define BLIS_CONFIG_CORTEXA57
#define BLIS_CONFIG_CORTEXA53
#define BLIS_CONFIG_GENERIC


// Enabled kernel sets (kernel_list)
#define BLIS_KERNELS_ARMV8A
#define BLIS_KERNELS_GENERIC


#if 1
#define BLIS_ENABLE_SYSTEM
#else
#define BLIS_DISABLE_SYSTEM
#endif

// Neither OpenMP nor pthreads is enabled by this configuration.
#if 0
#define BLIS_ENABLE_OPENMP
#endif

#if 0
#define BLIS_ENABLE_PTHREADS
#endif

#if 1
#define BLIS_ENABLE_JRIR_SLAB
#endif

#if 0
#define BLIS_ENABLE_JRIR_RR
#endif

#if 1
#define BLIS_ENABLE_PBA_POOLS
#else
#define BLIS_DISABLE_PBA_POOLS
#endif

#if 1
#define BLIS_ENABLE_SBA_POOLS
#else
#define BLIS_DISABLE_SBA_POOLS
#endif

#if 0
#define BLIS_ENABLE_MEM_TRACING
#else
#define BLIS_DISABLE_MEM_TRACING
#endif

// With the constants below, BLIS_INT_TYPE_SIZE is 64.
#if 64 == 64
#define BLIS_INT_TYPE_SIZE 64
#elif 64 == 32
#define BLIS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// With the constants below, BLIS_BLAS_INT_TYPE_SIZE is 32.
#if 32 == 64
#define BLIS_BLAS_INT_TYPE_SIZE 64
#elif 32 == 32
#define BLIS_BLAS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// The next four groups only take effect when neither the ENABLE nor the
// DISABLE form of the macro has already been defined (for example on the
// compiler command line); an existing definition is left untouched.
#ifndef BLIS_ENABLE_BLAS
#ifndef BLIS_DISABLE_BLAS
#if 0
#define BLIS_ENABLE_BLAS
#else
#define BLIS_DISABLE_BLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_CBLAS
#ifndef BLIS_DISABLE_CBLAS
#if 0
#define BLIS_ENABLE_CBLAS
#else
#define BLIS_DISABLE_CBLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_MIXED_DT
#ifndef BLIS_DISABLE_MIXED_DT
#if 1
#define BLIS_ENABLE_MIXED_DT
#else
#define BLIS_DISABLE_MIXED_DT
#endif
#endif
#endif

#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#if 1
#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#else
#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#endif
#endif
#endif

#if 1
#define BLIS_ENABLE_SUP_HANDLING
#else
#define BLIS_DISABLE_SUP_HANDLING
#endif

#if 0
#define BLIS_ENABLE_MEMKIND
#else
#define BLIS_DISABLE_MEMKIND
#endif

#if 1
#define BLIS_ENABLE_TRSM_PREINVERSION
#else
#define BLIS_DISABLE_TRSM_PREINVERSION
#endif

#if 1
#define BLIS_ENABLE_PRAGMA_OMP_SIMD
#else
#define BLIS_DISABLE_PRAGMA_OMP_SIMD
#endif

#if 0
#define BLIS_ENABLE_SANDBOX
#else
#define BLIS_DISABLE_SANDBOX
#endif

#if 0
#define BLIS_ENABLE_SHARED
#else
#define BLIS_DISABLE_SHARED
#endif

#if 0
#define BLIS_ENABLE_COMPLEX_RETURN_INTEL
#else
#define BLIS_DISABLE_COMPLEX_RETURN_INTEL
#endif


#endif
// end bli_config.h
// begin bli_config_macro_defs.h


// bli_config_macro_defs.h: fills in defaults for, and derives further macros
// from, the switches set in bli_config.h above.
#ifndef BLIS_CONFIG_MACRO_DEFS_H
#define BLIS_CONFIG_MACRO_DEFS_H


// -- INTEGER PROPERTIES -------------------------------------------------------

// The bit size of the integer type used to track values such as dimensions,
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
// integers while 64 results in 64-bit integers. Any other value results in use
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
// This fallback only applies when BLIS_INT_TYPE_SIZE was left undefined
// above (the "determine automatically" case).
#ifndef BLIS_INT_TYPE_SIZE
  #ifdef BLIS_ARCH_64
    #define BLIS_INT_TYPE_SIZE 64
  #else
    #define BLIS_INT_TYPE_SIZE 32
  #endif
#endif


// -- FLOATING-POINT PROPERTIES ------------------------------------------------

// Enable use of built-in C99 "float complex" and "double complex" types and
// associated overloaded operations and functions? Disabling results in
// scomplex and dcomplex being defined in terms of simple structs.
// NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN.
#ifdef BLIS_ENABLE_C99_COMPLEX
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif


// -- MULTITHREADING -----------------------------------------------------------

// Enable multithreading via POSIX threads.
#ifdef BLIS_ENABLE_PTHREADS
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif

// Enable multithreading via OpenMP.
#ifdef BLIS_ENABLE_OPENMP
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif

// Perform a sanity check to make sure the user doesn't try to enable
// both OpenMP and pthreads.
#if defined ( BLIS_ENABLE_OPENMP ) && \
    defined ( BLIS_ENABLE_PTHREADS )
  #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined."
#endif

// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP
// or pthreads are enabled. This macro is useful in situations when
// we want to detect use of either OpenMP or pthreads (as opposed
// to neither being used).
#if defined ( BLIS_ENABLE_OPENMP ) || \
    defined ( BLIS_ENABLE_PTHREADS )
  #define BLIS_ENABLE_MULTITHREADING
#endif

// Enable the use of prime numbers of threads when requesting automatic thread
// factorization. When disabled, requesting a prime number of threads will
// result in a reduction (by one) of the number of threads, provided that the
// prime number exceeds a minimum threshold (see below).
// After this conditional, BLIS_DISABLE_AUTO_PRIME_NUM_THREADS is defined
// exactly when BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is not.
#ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS
  #undef  BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#else
  // Default behavior is disabled.
  #undef  BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled.
  #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#endif

// Set the maximum requested number of threads that BLIS will accept from the
// user that may be prime. If a larger prime number of threads is requested,
// it will be reduced by one to allow for more efficient thread factorizations.
// This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined.
#ifndef BLIS_NT_MAX_PRIME
  #define BLIS_NT_MAX_PRIME 11
#endif


// -- MIXED DATATYPE SUPPORT ---------------------------------------------------

// Enable mixed datatype support?
// BLIS_ENABLE_GEMM_MD ends up defined unless BLIS_DISABLE_MIXED_DT is.
#ifdef BLIS_DISABLE_MIXED_DT
  #undef BLIS_ENABLE_GEMM_MD
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD
#endif

// Enable memory-intensive optimizations for mixed datatype support?
// BLIS_ENABLE_GEMM_MD_EXTRA_MEM ends up defined unless
// BLIS_DISABLE_MIXED_DT_EXTRA_MEM is.
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
  #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#endif


// -- MISCELLANEOUS OPTIONS ----------------------------------------------------

// Do NOT require the cross-blocksize constraints. That is, do not enforce
// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY
// needed when implementing trsm_r by allowing the right-hand matrix B to
// be triangular.
#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
  #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#endif


// -- BLAS COMPATIBILITY LAYER -------------------------------------------------

// Enable the BLAS compatibility layer?
// BLIS_DISABLE_BLAS takes precedence: when it is defined, BLIS_ENABLE_BLAS is
// undefined even if it had been set earlier. Otherwise BLIS_ENABLE_BLAS is
// (re)defined with an empty body.
#ifdef BLIS_DISABLE_BLAS
  #undef BLIS_ENABLE_BLAS
#else
  // Default behavior is enabled.
  #undef  BLIS_ENABLE_BLAS // In case user explicitly enabled.
  #define BLIS_ENABLE_BLAS
#endif

// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#ifndef BLIS_BLAS_INT_TYPE_SIZE
  #define BLIS_BLAS_INT_TYPE_SIZE 32
#endif

// By default, the level-3 BLAS routines are implemented by directly calling
// the BLIS object API. Alternatively, they may first call the typed BLIS
// API, which will then call the object API.
//#define BLIS_BLAS3_CALLS_TAPI

// BLIS_BLAS3_CALLS_OAPI ends up defined exactly when BLIS_BLAS3_CALLS_TAPI
// is not.
#ifdef BLIS_BLAS3_CALLS_TAPI
  #undef  BLIS_BLAS3_CALLS_OAPI
#else
  // Default behavior is to call object API directly.
  #undef  BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled.
  #define BLIS_BLAS3_CALLS_OAPI
#endif


// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------

// Enable the CBLAS compatibility layer?
// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specifed by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. 
#ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overriden by the main // program simply by providing a new definition. However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. 
#include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. // NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. #ifndef TRUE #define TRUE true #endif #ifndef FALSE #define FALSE false #endif // -- Special-purpose integers -- // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. 
// dim_t is skipped when _DEFINED_DIM_T is already defined by the includer.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
// NOTE(review): these are hard-coded and are not checked against the actual
// sizeof() values anywhere in this chunk.
#define BLIS_SIZEOF_S      4  // sizeof(float)
#define BLIS_SIZEOF_D      8  // sizeof(double)
#define BLIS_SIZEOF_C      8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z      16 // sizeof(dcomplex)

// -- Complex types --

// scomplex/dcomplex are either the C99 complex types (when
// BLIS_ENABLE_C99_COMPLEX is defined) or plain structs with `real` and
// `imag` members (the default).
#ifdef BLIS_ENABLE_C99_COMPLEX

  #if __STDC_VERSION__ >= 199901L
    // NOTE(review): no header name is visible after this #include --
    // confirm against the upstream file.
    #include // skipped

    // Typedef official complex types to BLIS complex type names.
    typedef  float complex scomplex;
    typedef double complex dcomplex;
  #else
    #error "Configuration requested C99 complex types, but C99 does not appear to be supported."
  #endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_SCOMPLEX
  #define _DEFINED_SCOMPLEX
  typedef struct scomplex
  {
    float  real;
    float  imag;
  } scomplex;
  #endif

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_DCOMPLEX
  #define _DEFINED_DCOMPLEX
  typedef struct dcomplex
  {
    double real;
    double imag;
  } dcomplex;
  #endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// The selection is made on BLIS_BLAS_INT_TYPE_SIZE: 32 gives int32_t, 64
// gives int64_t, and any other value (including undefined) gives long int.
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

// The remaining Fortran-77 names are plain aliases of C / BLIS types.
typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
// (The function-pointer form is commented out; void_fp is currently void*.)
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );


//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT below is the bit position of a field's least significant bit.
// Several names share a position because a multi-bit field and its lowest
// single-bit sub-field start at the same bit (for example
// BLIS_DATATYPE_SHIFT and BLIS_DOMAIN_SHIFT are both 0).

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT               0
#define   BLIS_SCALAR_DOMAIN_SHIFT         0
#define   BLIS_SCALAR_PREC_SHIFT           1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is a constant of the field's width shifted to the field's
// position defined above.
// NOTE(review): the constants are plain (signed) int literals. With a 32-bit
// int, ( 0x7 << 29 ) in BLIS_COMP_DT_BITS is not representable in int, which
// the C standard leaves undefined for signed left shifts. Consider unsigned
// literals after auditing every use of these masks.

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )


//
// -- BLIS enumerated type value definitions -----------------------------------
//

// The BLIS_BITVAL_* constants are the in-place (already shifted) encodings
// that the enums further below take as their values.

#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT                                         | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )


//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

typedef enum
{
	BLIS_NO_TRANSPOSE      = 0x0,
	BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
	BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
	BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

typedef enum
{
	BLIS_NO_CONJUGATE      = 0x0,
	BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

typedef enum
{
	BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
	BLIS_LOWER             = BLIS_BITVAL_LOWER,
	BLIS_UPPER             = BLIS_BITVAL_UPPER,
	BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

// Unlike the enums above, side_t is not built from bit-field values:
// BLIS_LEFT is 0 and BLIS_RIGHT takes the next implicit value.
typedef enum
{
	BLIS_LEFT              = 0x0,
	BLIS_RIGHT
} side_t;

typedef enum
{
	BLIS_NONUNIT_DIAG      = 0x0,
	BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

typedef enum
{
	BLIS_NO_INVERT_DIAG    = 0x0,
	BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

typedef enum
{
	BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
	BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
	BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
	BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;


// -- Data type --

// BLIS_DT_LO and BLIS_DT_HI alias BLIS_FLOAT and BLIS_DCOMPLEX; BLIS_INT and
// BLIS_CONSTANT lie outside that range.
typedef enum
{
	BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
	BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
	BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
	BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
	BLIS_INT               = BLIS_BITVAL_INT_TYPE,
	BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
	BLIS_DT_LO             = BLIS_FLOAT,
	BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

typedef enum
{
	BLIS_REAL              = BLIS_BITVAL_REAL,
	BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

typedef enum
{
	BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
	BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;


// -- Pack schema type --

// BLIS_PACKED_UNSPEC, BLIS_PACKED_VECTOR and BLIS_PACKED_ROWS all have the
// same value (BLIS_PACK_BIT alone), so they cannot be told apart by value.
typedef enum
{
	BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
	BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
	BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
	BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
	BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
	BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
	BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
	BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
	BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3


// -- Pack order type --

// Both BLIS_PACK_FWD_* enumerators are 0; only the REV values carry a bit.
typedef enum
{
	BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
	BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,

	BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
	BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;


// -- Pack buffer type --

typedef enum
{
	BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
	BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
	BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
	BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;


// -- Partitioning direction --

typedef enum
{
	BLIS_FWD,
	BLIS_BWD
} dir_t;


// -- Subpartition type --

// Enumerators take implicit consecutive values starting at 0, so their
// order must not be changed.
typedef enum
{
	BLIS_SUBPART0,
	BLIS_SUBPART1,
	BLIS_SUBPART2,
	BLIS_SUBPART1AND0,
	BLIS_SUBPART1AND2,
	BLIS_SUBPART1A,
	BLIS_SUBPART1B,
	BLIS_SUBPART00,
	BLIS_SUBPART10,
	BLIS_SUBPART20,
	BLIS_SUBPART01,
	BLIS_SUBPART11,
	BLIS_SUBPART21,
	BLIS_SUBPART02,
	BLIS_SUBPART12,
	BLIS_SUBPART22
} subpart_t;


// -- Matrix dimension type --

typedef enum
{
	BLIS_M = 0,
	BLIS_N = 1
} mdim_t;


// -- Machine parameter types --

// Eleven enumerators with implicit consecutive values from 0;
// BLIS_NUM_MACH_PARAMS below must be kept equal to that count.
typedef enum
{
	BLIS_MACH_EPS = 0,
	BLIS_MACH_SFMIN,
	BLIS_MACH_BASE,
	BLIS_MACH_PREC,
	BLIS_MACH_NDIGMANT,
	BLIS_MACH_RND,
	BLIS_MACH_EMIN,
	BLIS_MACH_RMIN,
	BLIS_MACH_EMAX,
	BLIS_MACH_RMAX,
	BLIS_MACH_EPS2
} machval_t;

#define BLIS_NUM_MACH_PARAMS   11
#define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2


// -- Induced method types --

// BLIS_1M is 0 and BLIS_NAT is 1; BLIS_IND_FIRST and BLIS_IND_LAST alias
// them, and BLIS_NUM_IND_METHODS is derived from BLIS_NAT.
typedef enum
{
	BLIS_1M        = 0,
	BLIS_NAT,
	BLIS_IND_FIRST = 0,
	BLIS_IND_LAST  = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_1m BLIS_1M
#define bli_nat BLIS_NAT


// -- Kernel ID types --

// Level-1v kernel identifiers. The values are consecutive starting at 0.
typedef enum
{
    BLIS_ADDV_KER = 0,
    BLIS_AMAXV_KER,
    BLIS_AXPBYV_KER,
    BLIS_AXPYV_KER,
    BLIS_COPYV_KER,
    BLIS_DOTV_KER,
    BLIS_DOTXV_KER,
    BLIS_INVERTV_KER,
    BLIS_SCALV_KER,
    BLIS_SCAL2V_KER,
    BLIS_SETV_KER,
    BLIS_SUBV_KER,
    BLIS_SWAPV_KER,
    BLIS_XPBYV_KER
} l1vkr_t;

// Number of enumerators in l1vkr_t (ADDV through XPBYV). Must be updated by
// hand whenever l1vkr_t changes; it sizes the l1v_kers array in cntx_t.
#define BLIS_NUM_LEVEL1V_KERS 14


// Level-1f kernel identifiers. The values are consecutive starting at 0.
typedef enum
{
    BLIS_AXPY2V_KER = 0,
    BLIS_DOTAXPYV_KER,
    BLIS_AXPYF_KER,
    BLIS_DOTXF_KER,
    BLIS_DOTXAXPYF_KER
} l1fkr_t;

// Number of enumerators in l1fkr_t; it sizes the l1f_kers array in cntx_t.
#define BLIS_NUM_LEVEL1F_KERS 5


// Level-1m (packm/unpackm) kernel identifiers.
// NOTE: The PACKM and UNPACKM families deliberately share the same numeric
// range (0 through 31), so BLIS_PACKM_nXK_KER == BLIS_UNPACKM_nXK_KER for
// every n. A value of this type therefore does not, by itself, say which
// family it belongs to; cntx_t keeps separate packm_kers and unpackm_kers
// arrays.
typedef enum
{
    BLIS_PACKM_0XK_KER = 0,
    BLIS_PACKM_1XK_KER = 1,
    BLIS_PACKM_2XK_KER = 2,
    BLIS_PACKM_3XK_KER = 3,
    BLIS_PACKM_4XK_KER = 4,
    BLIS_PACKM_5XK_KER = 5,
    BLIS_PACKM_6XK_KER = 6,
    BLIS_PACKM_7XK_KER = 7,
    BLIS_PACKM_8XK_KER = 8,
    BLIS_PACKM_9XK_KER = 9,
    BLIS_PACKM_10XK_KER = 10,
    BLIS_PACKM_11XK_KER = 11,
    BLIS_PACKM_12XK_KER = 12,
    BLIS_PACKM_13XK_KER = 13,
    BLIS_PACKM_14XK_KER = 14,
    BLIS_PACKM_15XK_KER = 15,
    BLIS_PACKM_16XK_KER = 16,
    BLIS_PACKM_17XK_KER = 17,
    BLIS_PACKM_18XK_KER = 18,
    BLIS_PACKM_19XK_KER = 19,
    BLIS_PACKM_20XK_KER = 20,
    BLIS_PACKM_21XK_KER = 21,
    BLIS_PACKM_22XK_KER = 22,
    BLIS_PACKM_23XK_KER = 23,
    BLIS_PACKM_24XK_KER = 24,
    BLIS_PACKM_25XK_KER = 25,
    BLIS_PACKM_26XK_KER = 26,
    BLIS_PACKM_27XK_KER = 27,
    BLIS_PACKM_28XK_KER = 28,
    BLIS_PACKM_29XK_KER = 29,
    BLIS_PACKM_30XK_KER = 30,
    BLIS_PACKM_31XK_KER = 31,

    BLIS_UNPACKM_0XK_KER = 0,
    BLIS_UNPACKM_1XK_KER = 1,
    BLIS_UNPACKM_2XK_KER = 2,
    BLIS_UNPACKM_3XK_KER = 3,
    BLIS_UNPACKM_4XK_KER = 4,
    BLIS_UNPACKM_5XK_KER = 5,
    BLIS_UNPACKM_6XK_KER = 6,
    BLIS_UNPACKM_7XK_KER = 7,
    BLIS_UNPACKM_8XK_KER = 8,
    BLIS_UNPACKM_9XK_KER = 9,
    BLIS_UNPACKM_10XK_KER = 10,
    BLIS_UNPACKM_11XK_KER = 11,
    BLIS_UNPACKM_12XK_KER = 12,
    BLIS_UNPACKM_13XK_KER = 13,
    BLIS_UNPACKM_14XK_KER = 14,
    BLIS_UNPACKM_15XK_KER = 15,
    BLIS_UNPACKM_16XK_KER = 16,
    BLIS_UNPACKM_17XK_KER = 17,
    BLIS_UNPACKM_18XK_KER = 18,
    BLIS_UNPACKM_19XK_KER = 19,
    BLIS_UNPACKM_20XK_KER = 20,
    BLIS_UNPACKM_21XK_KER = 21,
    BLIS_UNPACKM_22XK_KER = 22,
    BLIS_UNPACKM_23XK_KER = 23,
    BLIS_UNPACKM_24XK_KER = 24,
    BLIS_UNPACKM_25XK_KER = 25,
    BLIS_UNPACKM_26XK_KER = 26,
    BLIS_UNPACKM_27XK_KER = 27,
    BLIS_UNPACKM_28XK_KER = 28,
    BLIS_UNPACKM_29XK_KER = 29,
    BLIS_UNPACKM_30XK_KER = 30,
    BLIS_UNPACKM_31XK_KER = 31
} l1mkr_t;

// Number of identifiers in each l1mkr_t family (0XK through 31XK); these
// size the packm_kers and unpackm_kers arrays in cntx_t.
#define BLIS_NUM_PACKM_KERS 32
#define BLIS_NUM_UNPACKM_KERS 32


// Level-3 microkernel identifiers. The values are consecutive starting at 0.
typedef enum
{
    BLIS_GEMM_UKR = 0,
    BLIS_GEMMTRSM_L_UKR,
    BLIS_GEMMTRSM_U_UKR,
    BLIS_TRSM_L_UKR,
    BLIS_TRSM_U_UKR
} l3ukr_t;

// Number of enumerators in l3ukr_t; it sizes the l3_vir_ukrs, l3_nat_ukrs,
// and l3_nat_ukrs_prefs arrays in cntx_t.
#define BLIS_NUM_LEVEL3_UKRS 5


// Microkernel implementation-type identifiers.
typedef enum
{
    BLIS_REFERENCE_UKERNEL = 0,
    BLIS_VIRTUAL_UKERNEL,
    BLIS_OPTIMIZED_UKERNEL,
    BLIS_NOTAPPLIC_UKERNEL
} kimpl_t;

// Number of enumerators in kimpl_t.
#define BLIS_NUM_UKR_IMPL_TYPES 4


// Disabled: the l3sup_t enum below is compiled out by the surrounding #if 0.
#if 0
typedef enum
{
    // RV = row-stored, contiguous vector-loading
    // RG = row-stored, non-contiguous gather-loading
    // CV = column-stored, contiguous vector-loading
    // CG = column-stored, non-contiguous gather-loading

    // RD = row-stored, dot-based
    // CD = col-stored, dot-based

    // RC = row-stored, column-times-column
    // CR = column-stored, row-times-row

    // GX = general-stored generic implementation

    BLIS_GEMMSUP_RV_UKR = 0,
    BLIS_GEMMSUP_RG_UKR,
    BLIS_GEMMSUP_CV_UKR,
    BLIS_GEMMSUP_CG_UKR,

    BLIS_GEMMSUP_RD_UKR,
    BLIS_GEMMSUP_CD_UKR,

    BLIS_GEMMSUP_RC_UKR,
    BLIS_GEMMSUP_CR_UKR,

    BLIS_GEMMSUP_GX_UKR,
} l3sup_t;

#define BLIS_NUM_LEVEL3_SUP_UKRS 9
#endif


// Storage-combination identifiers for three operands. Only the nine values
// BLIS_RRR (0) through BLIS_XXX (8) are active; the combinations involving
// 'G' are compiled out by the nested #if 0.
// NOTE(review): R/C presumably stand for row/column storage and G for
// general storage, in line with the legend in the disabled l3sup_t enum
// above -- confirm against the code that consumes stor3_t.
typedef enum
{
    // 3-operand storage combinations
    BLIS_RRR = 0,
    BLIS_RRC, // 1
    BLIS_RCR, // 2
    BLIS_RCC, // 3
    BLIS_CRR, // 4
    BLIS_CRC, // 5
    BLIS_CCR, // 6
    BLIS_CCC, // 7
    BLIS_XXX, // 8

#if 0
    BLIS_RRG,
    BLIS_RCG,
    BLIS_RGR,
    BLIS_RGC,
    BLIS_RGG,
    BLIS_CRG,
    BLIS_CCG,
    BLIS_CGR,
    BLIS_CGC,
    BLIS_CGG,
    BLIS_GRR,
    BLIS_GRC,
    BLIS_GRG,
    BLIS_GCR,
    BLIS_GCC,
    BLIS_GCG,
    BLIS_GGR,
    BLIS_GGC,
    BLIS_GGG,
#endif
} stor3_t;

// Number of active enumerators in stor3_t (RRR through XXX); it sizes the
// l3_sup_kers and l3_sup_kers_prefs arrays in cntx_t.
#define BLIS_NUM_3OP_RC_COMBOS 9
//#define BLIS_NUM_3OP_RCG_COMBOS 27


// Disabled: the thridx_t enum below is compiled out by the surrounding #if 0.
#if 0
typedef enum
{
    BLIS_JC_IDX = 0,
    BLIS_PC_IDX,
    BLIS_IC_IDX,
    BLIS_JR_IDX,
    BLIS_IR_IDX,
    BLIS_PR_IDX
} thridx_t;
#endif

// Sizes the thrloop array in rntm_t. The value equals the number of
// enumerators in the disabled thridx_t enum above (JC, PC, IC, JR, IR, PR).
#define BLIS_NUM_LOOPS 6


// -- Operation ID type --

typedef enum
{
//
// NOTE: If/when additional type values are added to this enum,
// you must either:
// - keep the level-3 values (starting with _GEMM) beginning at
//   index 0; or
// - if the value range is moved such that it does not begin at
//   index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START
//   value that can be subtracted from the opid_t value to map it
//   to a zero-based range.
// This is needed because these level-3 opid_t values are used in
// bli_l3_ind.c to index into arrays.
//
    BLIS_GEMM = 0,
    BLIS_GEMMT,
    BLIS_HEMM,
    BLIS_HERK,
    BLIS_HER2K,
    BLIS_SYMM,
    BLIS_SYRK,
    BLIS_SYR2K,
    BLIS_TRMM3,
    BLIS_TRMM,
    BLIS_TRSM,

    BLIS_NOID
} opid_t;

// Number of opid_t enumerators that precede BLIS_NOID (GEMM through TRSM).
// BLIS_NOID itself therefore has the value 11 and is not counted. This macro
// sizes the l3_sup_handlers array in cntx_t.
#define BLIS_NUM_LEVEL3_OPS 11


// -- Blocksize ID type --

typedef enum
{
    // NOTE: the level-3 blocksizes MUST be indexed starting at zero.
    // At one point, we made this assumption in bli_cntx_set_blkszs()
    // and friends.

    BLIS_KR = 0,
    BLIS_MR,
    BLIS_NR,
    BLIS_MC,
    BLIS_KC,
    BLIS_NC,

    BLIS_M2, // level-2 blocksize in m dimension
    BLIS_N2, // level-2 blocksize in n dimension

    BLIS_AF, // level-1f axpyf fusing factor
    BLIS_DF, // level-1f dotxf fusing factor
    BLIS_XF, // level-1f dotxaxpyf fusing factor

    BLIS_NO_PART // used as a placeholder when blocksizes are not applicable.
} bszid_t;

// Number of bszid_t enumerators that precede BLIS_NO_PART (KR through XF).
// BLIS_NO_PART itself has the value 11 and is not counted. This macro sizes
// the blkszs, bmults, and l3_sup_blkszs arrays in cntx_t.
#define BLIS_NUM_BLKSZS 11


// -- Threshold ID type --

typedef enum
{
    BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension
    BLIS_NT,     // level-3 small/unpacked matrix threshold in n dimension
    BLIS_KT      // level-3 small/unpacked matrix threshold in k dimension

} threshid_t;

// Number of enumerators in threshid_t; it sizes the l3_sup_thresh array in
// cntx_t.
#define BLIS_NUM_THRESH 3


// -- Architecture ID type --

// NOTE: This typedef enum must be kept up-to-date with the arch_t
// string array in bli_arch.c. Whenever values are added/inserted
// OR if values are rearranged, be sure to update the string array
// in bli_arch.c.

typedef enum
{
    // NOTE: The C language standard guarantees that the first enum value
    // starts at 0.

    // Intel
    BLIS_ARCH_SKX,
    BLIS_ARCH_KNL,
    BLIS_ARCH_KNC,
    BLIS_ARCH_HASWELL,
    BLIS_ARCH_SANDYBRIDGE,
    BLIS_ARCH_PENRYN,

    // AMD
    BLIS_ARCH_ZEN3,
    BLIS_ARCH_ZEN2,
    BLIS_ARCH_ZEN,
    BLIS_ARCH_EXCAVATOR,
    BLIS_ARCH_STEAMROLLER,
    BLIS_ARCH_PILEDRIVER,
    BLIS_ARCH_BULLDOZER,

    // ARM
    BLIS_ARCH_ARMSVE,
    BLIS_ARCH_A64FX,
    BLIS_ARCH_FIRESTORM,
    BLIS_ARCH_THUNDERX2,
    BLIS_ARCH_CORTEXA57,
    BLIS_ARCH_CORTEXA53,
    BLIS_ARCH_CORTEXA15,
    BLIS_ARCH_CORTEXA9,

    // IBM/Power
    BLIS_ARCH_POWER10,
    BLIS_ARCH_POWER9,
    BLIS_ARCH_POWER7,
    BLIS_ARCH_BGQ,

    // Generic architecture/configuration
    BLIS_ARCH_GENERIC,

    // The total number of defined architectures. This must be last in the
    // list of enums since its definition assumes that the previous enum
    // value (BLIS_ARCH_GENERIC) is given index num_archs-1.
    // With the list as written (26 architectures, indices 0 through 25),
    // BLIS_NUM_ARCHS evaluates to 26.
    BLIS_NUM_ARCHS

} arch_t;


//
// -- BLIS misc. structure types -----------------------------------------------
//

// This header must be included here (or earlier) because definitions it
// provides are needed in the pool_t and related structs.
// begin bli_pthread.h


#ifndef BLIS_PTHREAD_H
#define BLIS_PTHREAD_H

// -- Type and macro definitions -----------------------------------------------

// Exactly one of three branches is selected below: a dummy (non-threaded)
// implementation when BLIS_DISABLE_SYSTEM is defined, a Windows API based
// implementation when _MSC_VER is defined, and a POSIX threads based
// implementation otherwise.
#if defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of "dummy" code that doesn't depend on POSIX threads or any other
// threading mechanism. See issue #454 to see the use case that prompted this
// feature.
// NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE!
// -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t* cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t* cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//    __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t* barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h


// -- Pool block type --

// A single block: a buffer pointer together with the block's size.
typedef struct
{
    void* buf;
    siz_t block_size;

} pblk_t;


// -- Pool type --

// A pool of blocks. The pool carries its own allocation and deallocation
// function pointers (malloc_fp / free_fp).
// NOTE(review): block_ptrs is an untyped pointer; it presumably refers to an
// array of block_ptrs_len pblk_t entries -- confirm against the pool
// implementation.
typedef struct
{
    void* block_ptrs;
    dim_t block_ptrs_len;

    dim_t top_index;
    dim_t num_blocks;

    siz_t block_size;
    siz_t align_size;
    siz_t offset_size;

    malloc_ft malloc_fp;
    free_ft free_fp;

} pool_t;


// -- Array type --

// A generic array descriptor: buffer pointer, element count, element size.
typedef struct
{
    void* buf;

    siz_t num_elem;
    siz_t elem_size;

} array_t;


// -- Locked pool-of-arrays-of-pools type --

// A pool_t paired with the mutex that guards it.
typedef struct
{
    bli_pthread_mutex_t mutex;
    pool_t pool;

    siz_t def_array_len;

} apool_t;


// -- packing block allocator: Locked set of pools type --

// NOTE(review): the three pools presumably correspond to the A-block,
// B-panel and C-panel values of packbuf_t, with general-use allocation
// served by malloc_fp/free_fp instead -- confirm against the allocator
// implementation.
typedef struct pba_s
{
    pool_t pools[3];
    bli_pthread_mutex_t mutex;

    // These fields are used for general-purpose allocation.
    siz_t align_size;
    malloc_ft malloc_fp;
    free_ft free_fp;

} pba_t;


// -- Memory object type --

// Describes one allocation: the block itself, the kind of pack buffer it
// was requested for, the pool it is associated with, and a size.
typedef struct mem_s
{
    pblk_t pblk;
    packbuf_t buf_type;
    pool_t* pool;
    siz_t size;
} mem_t;


// -- Control tree node type --

// A node of the control tree. Nodes link to other nodes of the same type
// through sub_prenode and sub_node.
struct cntl_s
{
    // Basic fields (usually required).
    opid_t family;
    bszid_t bszid;
    void_fp var_func;
    struct cntl_s* sub_prenode;
    struct cntl_s* sub_node;

    // Optional fields (needed only by some operations such as packm).
    // NOTE: first field of params must be a uint64_t containing the size
    // of the struct.
    void* params;

    // Internal fields that track "cached" data.
    mem_t pack_mem;
};
typedef struct cntl_s cntl_t;


// -- Blocksize object type --

// Holds one value per floating-point datatype (arrays of length
// BLIS_NUM_FP_TYPES) for both the primary blocksize and its extension.
typedef struct blksz_s
{
    // Primary blocksize values.
    dim_t v[BLIS_NUM_FP_TYPES];

    // Blocksize extensions.
    dim_t e[BLIS_NUM_FP_TYPES];

} blksz_t;


// -- Function pointer object type --

// Holds one untyped function pointer per floating-point datatype.
typedef struct func_s
{
    // Kernel function address.
    void_fp ptr[BLIS_NUM_FP_TYPES];

} func_t;


// -- Multi-boolean object type --

// Holds one boolean per floating-point datatype.
typedef struct mbool_s
{
    bool v[BLIS_NUM_FP_TYPES];

} mbool_t;


// -- Auxiliary kernel info type --

// Note: This struct is used by macro-kernels to package together extra
// parameter values that may be of use to the micro-kernel without
// cluttering up the micro-kernel interface itself.

typedef struct
{
    // The pack schemas of A and B.
    pack_t schema_a;
    pack_t schema_b;

    // Pointers to the micro-panels of A and B which will be used by the
    // next call to the micro-kernel.
    void* a_next;
    void* b_next;

    // The imaginary strides of A and B.
    inc_t is_a;
    inc_t is_b;

    // The panel strides of A and B.
    // NOTE: These are only used in situations where iteration over the
    // micropanels takes place in part within the kernel code (e.g. sup
    // millikernels).
    inc_t ps_a;
    inc_t ps_b;

    // The type to convert to on output.
    //num_t dt_on_output;

    // (Virtual) microkernel address and additional parameters.
    void_fp ukr;
    void* params;

} auxinfo_t;


// -- Global scalar constant data struct --

// Note: This struct is used only when statically initializing the
// global scalar constants in bli_const.c.
// It stores one representation of a constant per datatype: float, double,
// single complex, double complex, and integer.
typedef struct constdata_s
{
    float s;
    double d;
    scomplex c;
    dcomplex z;
    gint_t i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the packing function that can be attached to an object via
// the pack_fn field of obj_t.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s* a,
      struct obj_s* ap,
      struct cntx_s* cntx,
      struct rntm_s* rntm,
      struct cntl_s* cntl,
      struct thrinfo_s* thread
    );

// Signature of the kernel function that can be attached to an object via
// the ker_fn field of obj_t.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s* a,
      struct obj_s* b,
      struct obj_s* c,
      struct cntx_s* cntx,
      struct rntm_s* rntm,
      struct cntl_s* cntl,
      struct thrinfo_s* thread
    );

// The BLIS object type.
// NOTE: The static initializer macros that follow this struct, and the
// inline copy functions bli_obj_init_full_shallow_copy_of() and
// bli_obj_init_subpart_from(), name every field explicitly; they must be
// updated whenever a field is added to or removed from obj_t.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t off[2];
    dim_t dim[2];
    doff_t diag_off;

    objbits_t info;
    objbits_t info2;
    siz_t elem_size;

    void* buffer;
    inc_t rs;
    inc_t cs;
    inc_t is;

    // Bufferless scalar storage
    atom_t scalar;

    // Pack-related fields
    dim_t m_padded; // m dimension of matrix, including any padding
    dim_t n_padded; // n dimension of matrix, including any padding
    inc_t ps;       // panel stride (distance to next panel)
    inc_t pd;       // panel dimension (the "width" of a panel:
                    // usually MR or NR)
    dim_t m_panel;  // m dimension of a "full" panel
    dim_t n_panel;  // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void* pack_params;
    obj_ker_fn_t ker_fn;
    void* ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)
//
// Both macros expand to a brace-enclosed initializer with designated
// fields, so they can only be used where such an initializer is valid.
// BLIS_OBJECT_INITIALIZER sets both dimensions to 0.

#define BLIS_OBJECT_INITIALIZER \
{ \
    .root = NULL, \
\
    .off = { 0, 0 }, \
    .dim = { 0, 0 }, \
    .diag_off = 0, \
\
    .info = 0x0 | BLIS_BITVAL_DENSE | \
                  BLIS_BITVAL_GENERAL, \
    .info2 = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer = NULL, \
    .rs = 0, \
    .cs = 0, \
    .is = 1, \
\
    .scalar = { 0.0, 0.0 }, \
\
    .m_padded = 0, \
    .n_padded = 0, \
    .ps = 0, \
    .pd = 0, \
    .m_panel = 0, \
    .n_panel = 0, \
\
    .pack_fn = NULL, \
    .pack_params = NULL, \
    .ker_fn = NULL, \
    .ker_params = NULL \
}

// Identical to BLIS_OBJECT_INITIALIZER except that both dimensions are
// initialized to 1 instead of 0.

#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root = NULL, \
\
    .off = { 0, 0 }, \
    .dim = { 1, 1 }, \
    .diag_off = 0, \
\
    .info = 0x0 | BLIS_BITVAL_DENSE | \
                  BLIS_BITVAL_GENERAL, \
    .info2 = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer = NULL, \
    .rs = 0, \
    .cs = 0, \
    .is = 1, \
\
    .scalar = { 0.0, 0.0 }, \
\
    .m_padded = 0, \
    .n_padded = 0, \
    .ps = 0, \
    .pd = 0, \
    .m_panel = 0, \
    .n_panel = 0, \
\
    .pack_fn = NULL, \
    .pack_params = NULL, \
    .ker_fn = NULL, \
    .ker_params = NULL \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
// Copy every field of the object a into the object b. The copy is shallow:
// pointer fields (root, buffer, pack_params, ker_params) are copied as
// pointers, so b refers to the same underlying data as a. The fields are
// assigned one by one, so this function must be revisited whenever obj_t
// gains or loses a field.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    const obj_t* src = a;
    obj_t*       dst = b;

    // Root object, offsets, dimensions and diagonal offset.
    dst->root     = src->root;
    dst->off[0]   = src->off[0];
    dst->off[1]   = src->off[1];
    dst->dim[0]   = src->dim[0];
    dst->dim[1]   = src->dim[1];
    dst->diag_off = src->diag_off;

    // Info bitfields and element size.
    dst->info      = src->info;
    dst->info2     = src->info2;
    dst->elem_size = src->elem_size;

    // Buffer address and strides.
    dst->buffer = src->buffer;
    dst->rs     = src->rs;
    dst->cs     = src->cs;
    dst->is     = src->is;

    // Internal (bufferless) scalar.
    dst->scalar = src->scalar;

    //dst->pack_mem = src->pack_mem;

    // Pack-related fields.
    dst->m_padded = src->m_padded;
    dst->n_padded = src->n_padded;
    dst->ps       = src->ps;
    dst->pd       = src->pd;
    dst->m_panel  = src->m_panel;
    dst->n_panel  = src->n_panel;

    // User-customizable function pointers and their parameters.
    dst->pack_fn     = src->pack_fn;
    dst->pack_params = src->pack_params;
    dst->ker_fn      = src->ker_fn;
    dst->ker_params  = src->ker_params;
}

// Initialize b as a subpartition of a: copy every field of a into b except
// the two dimensions, which are left untouched because the caller is
// expected to overwrite them.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    const obj_t* src = a;
    obj_t*       dst = b;

    // Root object, offsets and diagonal offset.
    dst->root     = src->root;
    dst->off[0]   = src->off[0];
    dst->off[1]   = src->off[1];
    // dim[0] and dim[1] (m and n) are intentionally not copied; they will
    // be overwritten.
    //dst->dim[0] = src->dim[0];
    //dst->dim[1] = src->dim[1];
    dst->diag_off = src->diag_off;

    // Info bitfields and element size.
    dst->info      = src->info;
    dst->info2     = src->info2;
    dst->elem_size = src->elem_size;

    // Buffer address and strides.
    dst->buffer = src->buffer;
    dst->rs     = src->rs;
    dst->cs     = src->cs;
    dst->is     = src->is;

    // Internal (bufferless) scalar.
    dst->scalar = src->scalar;

    // The pack_mem entry is intentionally not copied.
    // FGVZ: You should probably make sure this is right.
    //dst->pack_mem = src->pack_mem;

    // Pack-related fields.
    dst->m_padded = src->m_padded;
    dst->n_padded = src->n_padded;
    dst->ps       = src->ps;
    dst->pd       = src->pd;
    dst->m_panel  = src->m_panel;
    dst->n_panel  = src->n_panel;

    // User-customizable function pointers and their parameters.
    dst->pack_fn     = src->pack_fn;
    dst->pack_params = src->pack_params;
    dst->ker_fn      = src->ker_fn;
    dst->ker_params  = src->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/include/linux-generic/000077500000000000000000000000001427272030600220325ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/linux-generic/blis.h000066400000000000000000045637541427272030600231640ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). 
// begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_GENERIC // Enabled sub-configurations (config_list) #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 
0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. 
#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). #if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_lang_defs.h #ifndef BLIS_LANG_DEFS_H #define BLIS_LANG_DEFS_H // -- Undefine restrict for C++ and C89/90 -- #ifdef __cplusplus // Language is C++; define restrict as nothing. #ifndef restrict #define restrict #endif #elif __STDC_VERSION__ >= 199901L // Language is C99 (or later); do nothing since restrict is recognized. #else // Language is pre-C99; define restrict as nothing. 
#ifndef restrict #define restrict #endif #endif // -- Define typeof() operator if using non-GNU compiler -- #ifndef __GNUC__ #define typeof __typeof__ #else #ifndef typeof #define typeof __typeof__ #endif #endif // -- BLIS Thread Local Storage Keyword -- // __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support __thread, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such a non-mainstream compiler // for building BLIS is low. #if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) #define BLIS_THREAD_LOCAL __thread #else #define BLIS_THREAD_LOCAL #endif // -- BLIS constructor/destructor function attribute -- // __attribute__((constructor/destructor)) is supported by GCC and clang. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support this, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such a non-mainstream compiler // for building BLIS is low. #if defined(__ICC) || defined(__INTEL_COMPILER) // ICC defines __GNUC__ but doesn't support this #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #elif defined(__clang__) // CLANG supports __attribute__, but its documentation doesn't // mention support for constructor/destructor. Compiling with // clang and testing shows that it does support them. 
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. 
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. 
Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specifed by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. #ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overriden by the main // program simply by providing a new definition. 
However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // -- Common BLIS definitions -- // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// The BLAS integer size is configured through BLIS_BLAS_INT_TYPE_SIZE (the
// same macro that selects f77_int), so that is the macro that must be tested
// here. An undefined macro name would silently evaluate to 0 in #if and the
// guard would never fire.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
  #undef  BLIS_INT_TYPE_SIZE
  #define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested.
// gint_t is the signed general-purpose integer and guint_t its unsigned
// counterpart; any size other than 32 or 64 falls back to "long int".
#if BLIS_INT_TYPE_SIZE == 32
typedef           int32_t  gint_t;
typedef          uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
typedef           int64_t  gint_t;
typedef          uint64_t guint_t;
#else
typedef   signed long int  gint_t;
typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h.
#ifndef TRUE
  #define TRUE  true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
#define BLIS_SIZEOF_S 4  // sizeof(float)
#define BLIS_SIZEOF_D 8  // sizeof(double)
#define BLIS_SIZEOF_C 8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z 16 // sizeof(dcomplex)

// -- Complex types --

#ifdef BLIS_ENABLE_C99_COMPLEX

  #if __STDC_VERSION__ >= 199901L
    // complex.h provides the "complex" type specifier used below.
    #include <complex.h> // skipped

    // Typedef official complex types to BLIS complex type names.
    typedef  float complex scomplex;
    typedef double complex dcomplex;
  #else
    #error "Configuration requested C99 complex types, but C99 does not appear to be supported."
  #endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_SCOMPLEX
  #define _DEFINED_SCOMPLEX
  // Single-precision complex number stored as a (real, imag) pair of floats.
  typedef struct scomplex
  {
      float  real;
      float  imag;
  } scomplex;
  #endif

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_DCOMPLEX
  #define _DEFINED_DCOMPLEX
  // Double-precision complex number stored as a (real, imag) pair of doubles.
  typedef struct dcomplex
  {
      double real;
      double imag;
  } dcomplex;
  #endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
#if BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );

//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT value is the position of the least-significant bit of the
// corresponding field. Several names deliberately share an offset: a
// multi-bit field (e.g. DATATYPE) and its lowest sub-field (e.g. DOMAIN)
// start at the same bit.

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT               0
#define   BLIS_SCALAR_DOMAIN_SHIFT         0
#define   BLIS_SCALAR_PREC_SHIFT           1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is the field's all-ones pattern shifted to the field's offset.

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
// The three UPLO bits (offsets 5..7) are the UPPER, DIAG and LOWER bits.
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
// NOTE(review): 0x7 << 29 does not fit in a 32-bit signed int; confirm
// whether an unsigned literal is intended here.
#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )

//
// -- BLIS enumerated type value definitions -----------------------------------
//

// The BITVAL macros below give the bit patterns that the enumerated types
// further down are defined in terms of.

#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
// NOTE: PACKED_UNSPEC and PACKED_ROWS expand to the same value (BLIS_PACK_BIT).
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT                                         | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )

//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

// Transposition/conjugation selector; values are the TRANS/CONJ bit patterns.
typedef enum
{
    BLIS_NO_TRANSPOSE      = 0x0,
    BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
    BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
    BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

// Conjugation selector; BLIS_CONJUGATE has the same value as
// BLIS_CONJ_NO_TRANSPOSE in trans_t.
typedef enum
{
    BLIS_NO_CONJUGATE      = 0x0,
    BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

// Stored-region selector built from the UPPER/DIAG/LOWER bits.
typedef enum
{
    BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
    BLIS_LOWER             = BLIS_BITVAL_LOWER,
    BLIS_UPPER             = BLIS_BITVAL_UPPER,
    BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

// Side selector; BLIS_RIGHT takes the implicit value 1.
typedef enum
{
    BLIS_LEFT              = 0x0,
    BLIS_RIGHT
} side_t;

// Unit/non-unit diagonal selector.
typedef enum
{
    BLIS_NONUNIT_DIAG      = 0x0,
    BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

// Diagonal-inversion flag.
typedef enum
{
    BLIS_NO_INVERT_DIAG    = 0x0,
    BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

// Matrix structure selector; values occupy the STRUC bit field.
typedef enum
{
    BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
    BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
    BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
    BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;

// -- Data type --

// Numerical datatype. With DOMAIN at bit 0 and PRECISION at bit 1 the four
// floating-point types take the values 0 (float), 1 (scomplex), 2 (double)
// and 3 (dcomplex); BLIS_DT_LO and BLIS_DT_HI alias the first and last.
typedef enum
{
    BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
    BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
    BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
    BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
    BLIS_INT               = BLIS_BITVAL_INT_TYPE,
    BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
    BLIS_DT_LO             = BLIS_FLOAT,
    BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

// Domain (real vs. complex) component of a datatype.
typedef enum
{
    BLIS_REAL              = BLIS_BITVAL_REAL,
    BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

// Precision (single vs. double) component of a datatype.
typedef enum
{
    BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
    BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;

// -- Pack schema type --

// NOTE: BLIS_PACKED_UNSPEC, BLIS_PACKED_VECTOR and BLIS_PACKED_ROWS all
// share one value, since their BITVAL definitions are identical.
typedef enum
{
    BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
    BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
    BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
    BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
    BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
    BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
    BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
    BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
    BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
    BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
    BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3

// -- Pack order type --

// NOTE: both FWD enumerators have the value 0x0.
typedef enum
{
    BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
    BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,

    BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
    BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;

// -- Pack buffer type --

// Values occupy the two-bit PACK_BUFFER field.
typedef enum
{
    BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
    BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
    BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
    BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;

// -- Partitioning direction --

typedef enum
{
    BLIS_FWD,
    BLIS_BWD
} dir_t;

// -- Subpartition type --

typedef enum
{
    BLIS_SUBPART0,
    BLIS_SUBPART1,
    BLIS_SUBPART2,
    BLIS_SUBPART1AND0,
    BLIS_SUBPART1AND2,
    BLIS_SUBPART1A,
    BLIS_SUBPART1B,
    BLIS_SUBPART00,
    BLIS_SUBPART10,
    BLIS_SUBPART20,
    BLIS_SUBPART01,
    BLIS_SUBPART11,
    BLIS_SUBPART21,
    BLIS_SUBPART02,
    BLIS_SUBPART12,
    BLIS_SUBPART22
} subpart_t;

// -- Matrix dimension type --

typedef enum
{
    BLIS_M = 0,
    BLIS_N = 1
} mdim_t;

// -- Machine parameter types --

// Eleven consecutive values starting at 0; BLIS_NUM_MACH_PARAMS below must
// match the number of enumerators.
typedef enum
{
    BLIS_MACH_EPS = 0,
    BLIS_MACH_SFMIN,
    BLIS_MACH_BASE,
    BLIS_MACH_PREC,
    BLIS_MACH_NDIGMANT,
    BLIS_MACH_RND,
    BLIS_MACH_EMIN,
    BLIS_MACH_RMIN,
    BLIS_MACH_EMAX,
    BLIS_MACH_RMAX,
    BLIS_MACH_EPS2
} machval_t;

#define BLIS_NUM_MACH_PARAMS   11
#define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2

// -- Induced method types --

// BLIS_IND_FIRST aliases BLIS_1M (0) and BLIS_IND_LAST aliases BLIS_NAT (1).
typedef enum
{
    BLIS_1M        = 0,
    BLIS_NAT,
    BLIS_IND_FIRST = 0,
    BLIS_IND_LAST  = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_1m   BLIS_1M
#define bli_nat  BLIS_NAT

// -- Kernel ID types --

// Level-1v kernel IDs; 14 consecutive values starting at 0.
typedef enum
{
    BLIS_ADDV_KER  = 0,
    BLIS_AMAXV_KER,
    BLIS_AXPBYV_KER,
    BLIS_AXPYV_KER,
    BLIS_COPYV_KER,
    BLIS_DOTV_KER,
    BLIS_DOTXV_KER,
    BLIS_INVERTV_KER,
    BLIS_SCALV_KER,
    BLIS_SCAL2V_KER,
    BLIS_SETV_KER,
    BLIS_SUBV_KER,
    BLIS_SWAPV_KER,
    BLIS_XPBYV_KER
} l1vkr_t;

#define BLIS_NUM_LEVEL1V_KERS 14

// Level-1f kernel IDs; 5 consecutive values starting at 0.
typedef enum
{
    BLIS_AXPY2V_KER = 0,
    BLIS_DOTAXPYV_KER,
    BLIS_AXPYF_KER,
    BLIS_DOTXF_KER,
    BLIS_DOTXAXPYF_KER
} l1fkr_t;

#define BLIS_NUM_LEVEL1F_KERS 5

// Level-1m (packm/unpackm) kernel IDs. NOTE: the PACKM and UNPACKM
// enumerators are explicitly numbered and share the same range 0..31.
typedef enum
{
    BLIS_PACKM_0XK_KER  = 0,
    BLIS_PACKM_1XK_KER  = 1,
    BLIS_PACKM_2XK_KER  = 2,
    BLIS_PACKM_3XK_KER  = 3,
    BLIS_PACKM_4XK_KER  = 4,
    BLIS_PACKM_5XK_KER  = 5,
    BLIS_PACKM_6XK_KER  = 6,
    BLIS_PACKM_7XK_KER  = 7,
    BLIS_PACKM_8XK_KER  = 8,
    BLIS_PACKM_9XK_KER  = 9,
    BLIS_PACKM_10XK_KER = 10,
    BLIS_PACKM_11XK_KER = 11,
    BLIS_PACKM_12XK_KER = 12,
    BLIS_PACKM_13XK_KER = 13,
    BLIS_PACKM_14XK_KER = 14,
    BLIS_PACKM_15XK_KER = 15,
    BLIS_PACKM_16XK_KER = 16,
    BLIS_PACKM_17XK_KER = 17,
    BLIS_PACKM_18XK_KER = 18,
    BLIS_PACKM_19XK_KER = 19,
    BLIS_PACKM_20XK_KER = 20,
    BLIS_PACKM_21XK_KER = 21,
    BLIS_PACKM_22XK_KER = 22,
    BLIS_PACKM_23XK_KER = 23,
    BLIS_PACKM_24XK_KER = 24,
    BLIS_PACKM_25XK_KER = 25,
    BLIS_PACKM_26XK_KER = 26,
    BLIS_PACKM_27XK_KER = 27,
    BLIS_PACKM_28XK_KER = 28,
    BLIS_PACKM_29XK_KER = 29,
    BLIS_PACKM_30XK_KER = 30,
    BLIS_PACKM_31XK_KER = 31,

    BLIS_UNPACKM_0XK_KER  = 0,
    BLIS_UNPACKM_1XK_KER  = 1,
    BLIS_UNPACKM_2XK_KER  = 2,
    BLIS_UNPACKM_3XK_KER  = 3,
    BLIS_UNPACKM_4XK_KER  = 4,
    BLIS_UNPACKM_5XK_KER  = 5,
    BLIS_UNPACKM_6XK_KER  = 6,
    BLIS_UNPACKM_7XK_KER  = 7,
    BLIS_UNPACKM_8XK_KER  = 8,
    BLIS_UNPACKM_9XK_KER  = 9,
    BLIS_UNPACKM_10XK_KER = 10,
    BLIS_UNPACKM_11XK_KER = 11,
    BLIS_UNPACKM_12XK_KER = 12,
    BLIS_UNPACKM_13XK_KER = 13,
    BLIS_UNPACKM_14XK_KER = 14,
    BLIS_UNPACKM_15XK_KER = 15,
    BLIS_UNPACKM_16XK_KER = 16,
    BLIS_UNPACKM_17XK_KER = 17,
    BLIS_UNPACKM_18XK_KER = 18,
    BLIS_UNPACKM_19XK_KER = 19,
    BLIS_UNPACKM_20XK_KER = 20,
    BLIS_UNPACKM_21XK_KER = 21,
    BLIS_UNPACKM_22XK_KER = 22,
    BLIS_UNPACKM_23XK_KER = 23,
    BLIS_UNPACKM_24XK_KER = 24,
    BLIS_UNPACKM_25XK_KER = 25,
    BLIS_UNPACKM_26XK_KER = 26,
    BLIS_UNPACKM_27XK_KER = 27,
    BLIS_UNPACKM_28XK_KER = 28,
    BLIS_UNPACKM_29XK_KER = 29,
    BLIS_UNPACKM_30XK_KER = 30,
    BLIS_UNPACKM_31XK_KER = 31

} l1mkr_t;

#define BLIS_NUM_PACKM_KERS   32
#define BLIS_NUM_UNPACKM_KERS 32

// Level-3 microkernel IDs; 5 consecutive values starting at 0.
typedef enum
{
    BLIS_GEMM_UKR = 0,
    BLIS_GEMMTRSM_L_UKR,
    BLIS_GEMMTRSM_U_UKR,
    BLIS_TRSM_L_UKR,
    BLIS_TRSM_U_UKR
} l3ukr_t;

#define BLIS_NUM_LEVEL3_UKRS 5

// Microkernel implementation kinds; 4 consecutive values starting at 0.
typedef enum
{
    BLIS_REFERENCE_UKERNEL = 0,
    BLIS_VIRTUAL_UKERNEL,
    BLIS_OPTIMIZED_UKERNEL,
    BLIS_NOTAPPLIC_UKERNEL
} kimpl_t;

#define BLIS_NUM_UKR_IMPL_TYPES 4

// The l3sup_t enum below is compiled out.
#if 0
typedef enum
{
    // RV = row-stored, contiguous vector-loading
    // RG = row-stored, non-contiguous gather-loading
    // CV = column-stored, contiguous vector-loading
    // CG = column-stored, non-contiguous gather-loading

    // RD = row-stored, dot-based
    // CD = col-stored, dot-based

    // RC = row-stored, column-times-column
    // CR = column-stored, row-times-row

    // GX = general-stored generic implementation

    BLIS_GEMMSUP_RV_UKR = 0,
    BLIS_GEMMSUP_RG_UKR,
    BLIS_GEMMSUP_CV_UKR,
    BLIS_GEMMSUP_CG_UKR,

    BLIS_GEMMSUP_RD_UKR,
    BLIS_GEMMSUP_CD_UKR,

    BLIS_GEMMSUP_RC_UKR,
    BLIS_GEMMSUP_CR_UKR,

    BLIS_GEMMSUP_GX_UKR,
} l3sup_t;

#define BLIS_NUM_LEVEL3_SUP_UKRS 9
#endif

// Three-operand storage combinations. Only the nine R/C/X values are
// active; the entries involving "G" are compiled out.
typedef enum
{
    // 3-operand storage combinations
    BLIS_RRR = 0,
    BLIS_RRC, // 1
    BLIS_RCR, // 2
    BLIS_RCC, // 3
    BLIS_CRR, // 4
    BLIS_CRC, // 5
    BLIS_CCR, // 6
    BLIS_CCC, // 7
    BLIS_XXX, // 8

#if 0
    BLIS_RRG,
    BLIS_RCG,
    BLIS_RGR,
    BLIS_RGC,
    BLIS_RGG,
    BLIS_CRG,
    BLIS_CCG,
    BLIS_CGR,
    BLIS_CGC,
    BLIS_CGG,
    BLIS_GRR,
    BLIS_GRC,
    BLIS_GRG,
    BLIS_GCR,
    BLIS_GCC,
    BLIS_GCG,
    BLIS_GGR,
    BLIS_GGC,
    BLIS_GGG,
#endif
} stor3_t;

#define BLIS_NUM_3OP_RC_COMBOS 9
//#define BLIS_NUM_3OP_RCG_COMBOS 27

// The thridx_t enum below is compiled out; BLIS_NUM_LOOPS matches its six
// entries.
#if 0
typedef enum
{
    BLIS_JC_IDX = 0,
    BLIS_PC_IDX,
    BLIS_IC_IDX,
    BLIS_JR_IDX,
    BLIS_IR_IDX,
    BLIS_PR_IDX
} thridx_t;
#endif

#define BLIS_NUM_LOOPS 6

// -- Operation ID type --

// BLIS_NOID follows the eleven level-3 operations, so its value (11)
// equals BLIS_NUM_LEVEL3_OPS.
typedef enum
{
//
// NOTE: If/when additional type values are added to this enum,
// you must either:
// - keep the level-3 values (starting with _GEMM) beginning at
//   index 0; or
// - if the value range is moved such that it does not begin at
//   index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START
//   value that can be subtracted from the opid_t value to map it
//   to a zero-based range.
// This is needed because these level-3 opid_t values are used in
// bli_l3_ind.c to index into arrays.
//
    BLIS_GEMM = 0,
    BLIS_GEMMT,
    BLIS_HEMM,
    BLIS_HERK,
    BLIS_HER2K,
    BLIS_SYMM,
    BLIS_SYRK,
    BLIS_SYR2K,
    BLIS_TRMM3,
    BLIS_TRMM,
    BLIS_TRSM,

    BLIS_NOID
} opid_t;

#define BLIS_NUM_LEVEL3_OPS 11

// -- Blocksize ID type --

// BLIS_NO_PART follows the eleven blocksize IDs, so its value (11) equals
// BLIS_NUM_BLKSZS.
typedef enum
{
    // NOTE: the level-3 blocksizes MUST be indexed starting at zero.
    // At one point, we made this assumption in bli_cntx_set_blkszs()
    // and friends.

    BLIS_KR = 0,
    BLIS_MR,
    BLIS_NR,
    BLIS_MC,
    BLIS_KC,
    BLIS_NC,

    BLIS_M2, // level-2 blocksize in m dimension
    BLIS_N2, // level-2 blocksize in n dimension

    BLIS_AF, // level-1f axpyf fusing factor
    BLIS_DF, // level-1f dotxf fusing factor
    BLIS_XF, // level-1f dotxaxpyf fusing factor

    BLIS_NO_PART // used as a placeholder when blocksizes are not applicable.
} bszid_t;

#define BLIS_NUM_BLKSZS 11

// -- Threshold ID type --

typedef enum
{
    BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension
    BLIS_NT,     // level-3 small/unpacked matrix threshold in n dimension
    BLIS_KT      // level-3 small/unpacked matrix threshold in k dimension

} threshid_t;

#define BLIS_NUM_THRESH 3

// -- Architecture ID type --

// NOTE: This typedef enum must be kept up-to-date with the arch_t
// string array in bli_arch.c. Whenever values are added/inserted
// OR if values are rearranged, be sure to update the string array
// in bli_arch.c.

typedef enum
{
    // NOTE: The C language standard guarantees that the first enum value
    // starts at 0.

    // Intel
    BLIS_ARCH_SKX,
    BLIS_ARCH_KNL,
    BLIS_ARCH_KNC,
    BLIS_ARCH_HASWELL,
    BLIS_ARCH_SANDYBRIDGE,
    BLIS_ARCH_PENRYN,

    // AMD
    BLIS_ARCH_ZEN3,
    BLIS_ARCH_ZEN2,
    BLIS_ARCH_ZEN,
    BLIS_ARCH_EXCAVATOR,
    BLIS_ARCH_STEAMROLLER,
    BLIS_ARCH_PILEDRIVER,
    BLIS_ARCH_BULLDOZER,

    // ARM
    BLIS_ARCH_ARMSVE,
    BLIS_ARCH_A64FX,
    BLIS_ARCH_FIRESTORM,
    BLIS_ARCH_THUNDERX2,
    BLIS_ARCH_CORTEXA57,
    BLIS_ARCH_CORTEXA53,
    BLIS_ARCH_CORTEXA15,
    BLIS_ARCH_CORTEXA9,

    // IBM/Power
    BLIS_ARCH_POWER10,
    BLIS_ARCH_POWER9,
    BLIS_ARCH_POWER7,
    BLIS_ARCH_BGQ,

    // Generic architecture/configuration
    BLIS_ARCH_GENERIC,

    // The total number of defined architectures. This must be last in the
    // list of enums since its definition assumes that the previous enum
    // value (BLIS_ARCH_GENERIC) is given index num_archs-1.
    BLIS_NUM_ARCHS

} arch_t;

//
// -- BLIS misc. structure types -----------------------------------------------
//

// This header must be included here (or earlier) because definitions it
// provides are needed in the pool_t and related structs.
// begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t;
typedef pthread_barrierattr_t bli_pthread_barrierattr_t;

#endif // defined(__APPLE__)

// -- pthreads macros --

#define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER
#define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT

#endif // BLIS_DISABLE_SYSTEM / _MSC_VER / pthreads selection

// -- Function definitions -----------------------------------------------------

// NOTE: Only prototypes are given in this header. Every one of them is written
// in terms of the bli_pthread_* types selected above, so the same declarations
// serve all three branches.

// -- pthread_create(), pthread_join() --

BLIS_EXPORT_BLIS int bli_pthread_create
     (
       bli_pthread_t* thread,
       const bli_pthread_attr_t* attr,
       void* (*start_routine)(void*),
       void* arg
     );

BLIS_EXPORT_BLIS int bli_pthread_join
     (
       bli_pthread_t thread,
       void** retval
     );

// -- pthread_mutex_*() --

BLIS_EXPORT_BLIS int bli_pthread_mutex_init
     (
       bli_pthread_mutex_t* mutex,
       const bli_pthread_mutexattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_lock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock
     (
       bli_pthread_mutex_t* mutex
     );

// -- pthread_cond_*() --

BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t* cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t* cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//     __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t* barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h

// -- Pool block type --

// A buffer pointer paired with a block size.
typedef struct
{
	void*     buf;
	siz_t     block_size;

} pblk_t;


// -- Pool type --

// NOTE(review): block_ptrs is declared void*; presumably it addresses an array
// of pblk_t of length block_ptrs_len -- confirm against the pool
// implementation.
typedef struct
{
	void*     block_ptrs;
	dim_t     block_ptrs_len;

	dim_t     top_index;
	dim_t     num_blocks;

	siz_t     block_size;
	siz_t     align_size;
	siz_t     offset_size;

	malloc_ft malloc_fp;
	free_ft   free_fp;

} pool_t;


// -- Array type --

// A buffer pointer with an element count and a per-element size.
typedef struct
{
	void*     buf;

	siz_t     num_elem;
	siz_t     elem_size;

} array_t;


// -- Locked pool-of-arrays-of-pools type --

// A pool_t bundled with the mutex declared alongside it.
typedef struct
{
	bli_pthread_mutex_t mutex;
	pool_t              pool;

	siz_t               def_array_len;

} apool_t;


// -- packing block allocator: Locked set of pools type --

typedef struct pba_s
{
	pool_t              pools[3];
	bli_pthread_mutex_t mutex;

	// These fields are used for general-purpose allocation.
	siz_t               align_size;
	malloc_ft           malloc_fp;
	free_ft             free_fp;

} pba_t;


// -- Memory object type --

// Holds one pblk_t together with a packbuf_t tag, a pointer to a pool_t, and a
// size.
typedef struct mem_s
{
	pblk_t    pblk;
	packbuf_t buf_type;
	pool_t*   pool;
	siz_t     size;
} mem_t;


// -- Control tree node type --

// Each node points at up to two further nodes (sub_prenode and sub_node) of
// the same type.
struct cntl_s
{
	// Basic fields (usually required).
	opid_t         family;
	bszid_t        bszid;
	void_fp        var_func;
	struct cntl_s* sub_prenode;
	struct cntl_s* sub_node;

	// Optional fields (needed only by some operations such as packm).
	// NOTE: first field of params must be a uint64_t containing the size
	// of the struct.
	void*          params;

	// Internal fields that track "cached" data.
	mem_t          pack_mem;
};
typedef struct cntl_s cntl_t;


// -- Blocksize object type --

// Both arrays hold one entry per floating-point datatype
// (BLIS_NUM_FP_TYPES entries each).
typedef struct blksz_s
{
	// Primary blocksize values.
	dim_t  v[BLIS_NUM_FP_TYPES];

	// Blocksize extensions.
	dim_t  e[BLIS_NUM_FP_TYPES];

} blksz_t;


// -- Function pointer object type --

// One generic function pointer per floating-point datatype.
typedef struct func_s
{
	// Kernel function address.
	void_fp ptr[BLIS_NUM_FP_TYPES];

} func_t;


// -- Multi-boolean object type --

// One bool per floating-point datatype.
typedef struct mbool_s
{
	bool v[BLIS_NUM_FP_TYPES];

} mbool_t;


// -- Auxiliary kernel info type --

// Note: This struct is used by macro-kernels to package together extra
// parameter values that may be of use to the micro-kernel without
// cluttering up the micro-kernel interface itself.

typedef struct
{
	// The pack schemas of A and B.
	pack_t schema_a;
	pack_t schema_b;

	// Pointers to the micro-panels of A and B which will be used by the
	// next call to the micro-kernel.
	void*  a_next;
	void*  b_next;

	// The imaginary strides of A and B.
	inc_t  is_a;
	inc_t  is_b;

	// The panel strides of A and B.
	// NOTE: These are only used in situations where iteration over the
	// micropanels takes place in part within the kernel code (e.g. sup
	// millikernels).
	inc_t  ps_a;
	inc_t  ps_b;

	// The type to convert to on output.
	//num_t  dt_on_output;

	// (Virtual) microkernel address and additional parameters.
	void_fp ukr;
	void*   params;

} auxinfo_t;


// -- Global scalar constant data struct --

// Note: This struct is used only when statically initializing the
// global scalar constants in bli_const.c.
// Holds one value in each of five representations: float, double, scomplex,
// dcomplex, and gint_t.
typedef struct constdata_s
{
	float    s;
	double   d;
	scomplex c;
	dcomplex z;
	gint_t   i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the function stored in obj_t.pack_fn.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of the function stored in obj_t.ker_fn.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// NOTE: The initializer macros and the bli_obj_init_*() copy helpers that
// follow this struct name its members individually, so any change to the
// member list must be mirrored there.
typedef struct obj_s
{
	// Basic fields
	struct obj_s* root;

	dim_t         off[2];
	dim_t         dim[2];
	doff_t        diag_off;

	objbits_t     info;
	objbits_t     info2;
	siz_t         elem_size;

	void*         buffer;
	inc_t         rs;
	inc_t         cs;
	inc_t         is;

	// Bufferless scalar storage
	atom_t        scalar;

	// Pack-related fields
	dim_t         m_padded; // m dimension of matrix, including any padding
	dim_t         n_padded; // n dimension of matrix, including any padding
	inc_t         ps;       // panel stride (distance to next panel)
	inc_t         pd;       // panel dimension (the "width" of a panel:
	                        // usually MR or NR)
	dim_t         m_panel;  // m dimension of a "full" panel
	dim_t         n_panel;  // n dimension of a "full" panel

	// User-customizable fields
	obj_pack_fn_t pack_fn;
	void*         pack_params;
	obj_ker_fn_t  ker_fn;
	void*         ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Brace-enclosed initializer for an obj_t with dimensions { 0, 0 }.
#define BLIS_OBJECT_INITIALIZER \
{ \
	.root      = NULL, \
\
	.off       = { 0, 0 }, \
	.dim       = { 0, 0 }, \
	.diag_off  = 0, \
\
	.info      = 0x0 | BLIS_BITVAL_DENSE      | \
	                   BLIS_BITVAL_GENERAL, \
	.info2     = 0x0, \
	.elem_size = sizeof( float ), \
\
	.buffer    = NULL, \
	.rs        = 0, \
	.cs        = 0, \
	.is        = 1, \
\
	.scalar    = { 0.0, 0.0 }, \
\
	.m_padded  = 0, \
	.n_padded  = 0, \
	.ps        = 0, \
	.pd        = 0, \
	.m_panel   = 0, \
	.n_panel   = 0, \
\
	.pack_fn   = NULL, \
	.pack_params = NULL, \
	.ker_fn    = NULL, \
	.ker_params = NULL \
}

// Identical to BLIS_OBJECT_INITIALIZER except that .dim is { 1, 1 }.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
	.root      = NULL, \
\
	.off       = { 0, 0 }, \
	.dim       = { 1, 1 }, \
	.diag_off  = 0, \
\
	.info      = 0x0 | BLIS_BITVAL_DENSE      | \
	                   BLIS_BITVAL_GENERAL, \
	.info2     = 0x0, \
	.elem_size = sizeof( float ), \
\
	.buffer    = NULL, \
	.rs        = 0, \
	.cs        = 0, \
	.is        = 1, \
\
	.scalar    = { 0.0, 0.0 }, \
\
	.m_padded  = 0, \
	.n_padded  = 0, \
	.ps        = 0, \
	.pd        = 0, \
	.m_panel   = 0, \
	.n_panel   = 0, \
\
	.pack_fn   = NULL, \
	.pack_params = NULL, \
	.ker_fn    = NULL, \
	.ker_params = NULL \
}

// Define these copy helpers here since they must be updated if the contents
// of obj_t change.
// Make 'b' a member-for-member duplicate of 'a'.
//
// The copy is shallow: pointer-valued members (root, buffer, pack_params,
// ker_params) and the two function pointers are copied as pointers, so 'b'
// refers to the same storage and callbacks as 'a'.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
	// Whole-struct assignment transfers every member declared in obj_t,
	// which is exactly the set of members a field-by-field copy would touch.
	// (obj_t has no pack_mem member, so there is nothing else to carry over.)
	*b = *a;
}

// Seed 'b' from 'a' in preparation for describing a sub-partition of 'a'.
//
// Every member of 'a' is copied into 'b' EXCEPT the two dimensions: b->dim[0]
// and b->dim[1] are deliberately left untouched (and never read) because the
// caller overwrites them afterwards.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
	// Identity and position. The offsets are copied; the dims are skipped.
	b->root = a->root;
	for ( int i = 0; i < 2; ++i )
	{
		b->off[i] = a->off[i];
	}
	b->diag_off = a->diag_off;

	// Property bits and element size.
	b->info      = a->info;
	b->info2     = a->info2;
	b->elem_size = a->elem_size;

	// Buffer address and strides.
	b->buffer = a->buffer;
	b->rs     = a->rs;
	b->cs     = a->cs;
	b->is     = a->is;

	// Attached scalar.
	b->scalar = a->scalar;

	// No pack_mem entry is copied here.
	// FGVZ: You should probably make sure this is right.

	// Packing geometry.
	b->m_padded = a->m_padded;
	b->n_padded = a->n_padded;
	b->ps       = a->ps;
	b->pd       = a->pd;
	b->m_panel  = a->m_panel;
	b->n_panel  = a->n_panel;

	// User-supplied pack/kernel hooks and their parameter blocks.
	b->pack_fn     = a->pack_fn;
	b->pack_params = a->pack_params;
	b->ker_fn      = a->ker_fn;
	b->ker_params  = a->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// Each public macro below forwards to a twin whose name ends in an
// underscore, and the twin performs the actual ## concatenation. Going through
// the twin lets an argument that is itself a macro be expanded before it is
// pasted.

// bli_ <name>
#define PASTEMAC0_(name) bli_ ## name
#define PASTEMAC0(name) PASTEMAC0_(name)

// bli_ <t> <name>
#define PASTEMAC_(t,name) bli_ ## t ## name
#define PASTEMAC(t,name) PASTEMAC_(t,name)

// bli_ <t1> <t2> <name>
#define PASTEMAC2_(t1,t2,name) bli_ ## t1 ## t2 ## name
#define PASTEMAC2(t1,t2,name) PASTEMAC2_(t1,t2,name)

// bli_ <t1> <t2> <t3> <name>
#define PASTEMAC3_(t1,t2,t3,name) bli_ ## t1 ## t2 ## t3 ## name
#define PASTEMAC3(t1,t2,t3,name) PASTEMAC3_(t1,t2,t3,name)

// bli_ <t1> ... <t4> <name>
#define PASTEMAC4_(t1,t2,t3,t4,name) bli_ ## t1 ## t2 ## t3 ## t4 ## name
#define PASTEMAC4(t1,t2,t3,t4,name) PASTEMAC4_(t1,t2,t3,t4,name)

// bli_ <t1> ... <t5> <name>
#define PASTEMAC5_(t1,t2,t3,t4,t5,name) bli_ ## t1 ## t2 ## t3 ## t4 ## t5 ## name
#define PASTEMAC5(t1,t2,t3,t4,t5,name) PASTEMAC5_(t1,t2,t3,t4,t5,name)

// bli_ <t1> ... <t6> <name>
#define PASTEMAC6_(t1,t2,t3,t4,t5,t6,name) bli_ ## t1 ## t2 ## t3 ## t4 ## t5 ## t6 ## name
#define PASTEMAC6(t1,t2,t3,t4,t5,t6,name) PASTEMAC6_(t1,t2,t3,t4,t5,t6,name)

// bla_ <name> _check
#define PASTEBLACHK_(name) bla_ ## name ## _check
#define PASTEBLACHK(name) PASTEBLACHK_(name)

// Prefix-free variants: the pieces are joined with nothing added in front.
#define PASTECH0_(name) name
#define PASTECH0(name) PASTECH0_(name)

#define PASTECH_(t,name) t ## name
#define PASTECH(t,name) PASTECH_(t,name)

#define PASTECH2_(t1,t2,name) t1 ## t2 ## name
#define PASTECH2(t1,t2,name) PASTECH2_(t1,t2,name)

#define PASTECH3_(t1,t2,t3,name) t1 ## t2 ## t3 ## name
#define PASTECH3(t1,t2,t3,name) PASTECH3_(t1,t2,t3,name)

// MKSTR stringizes its argument exactly as written; STRINGIFY_INT lets the
// argument be macro-expanded first and then stringizes the result.
#define MKSTR(tok) #tok
#define STRINGIFY_INT( tok ) MKSTR( tok )

// Fortran-77 name-mangling macros.
// Each of these joins its arguments and appends a single trailing underscore.
#define PASTEF770(name) name ## _
#define PASTEF77(ch1,name) ch1 ## name ## _
#define PASTEF772(ch1,ch2,name) ch1 ## ch2 ## name ## _
#define PASTEF773(ch1,ch2,ch3,name) ch1 ## ch2 ## ch3 ## name ## _

// -- Include other groups of macros

// begin bli_genarray_macro_defs.h


#ifndef BLIS_GENARRAY_MACRO_DEFS_H
#define BLIS_GENARRAY_MACRO_DEFS_H


// -- Macros to generate function arrays ---------------------------------------

// NOTE: Every table below lists its entries in the order s, c, d, z. The
// order is positional, so it must not be rearranged.
// NOTE(review): presumably this matches the numeric order of the
// floating-point datatype ids used to index the tables -- confirm against the
// num_t definition.

// -- "Smart" one-operand macro --

// Defines a static array named <opname>_fpa whose entries are
// bli_<ch><opname>, each cast to 'tname'.
#define GENARRAY_FPA(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \
{ \
	( tname )PASTEMAC(s,opname), \
	( tname )PASTEMAC(c,opname), \
	( tname )PASTEMAC(d,opname), \
	( tname )PASTEMAC(z,opname)  \
}

// -- "Smart" one-operand macro (with integer support) --

// Same as GENARRAY_FPA with one extra trailing entry, bli_i<opname>.
#define GENARRAY_FPA_I(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \
{ \
	( tname )PASTEMAC(s,opname), \
	( tname )PASTEMAC(c,opname), \
	( tname )PASTEMAC(d,opname), \
	( tname )PASTEMAC(z,opname), \
	( tname )PASTEMAC(i,opname)  \
}

// -- "Smart" two-operand macro --

// Defines a static 2D array named <op>_fpa2; the entry at [row][col] is
// bli_<row-ch><col-ch><op>, cast to 'tname'.
#define GENARRAY_FPA2(tname,op) \
\
static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \
	{ ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \
	{ ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \
	{ ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) }  \
}

// -- "Smart" two-operand macro --


// NOTE: Unlike the GENARRAY_FPA* macros above, the macros from here on emit
// only the declarator and initializer ("arrayname[...] = { ... }"). The
// storage class and element type must be written by the caller in front of
// the macro invocation, and the entries are not cast.

// -- One-operand macro --

#define GENARRAY(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
	PASTEMAC(s,op), \
	PASTEMAC(c,op), \
	PASTEMAC(d,op), \
	PASTEMAC(z,op)  \
}

// Same as GENARRAY with one extra trailing entry, bli_i<op>.
#define GENARRAY_I(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES+1] = \
{ \
	PASTEMAC(s,op), \
	PASTEMAC(c,op), \
	PASTEMAC(d,op), \
	PASTEMAC(z,op), \
	PASTEMAC(i,op)  \
}

// -- One-operand macro (with custom prefix) --

// Entries are <prefix><ch><op>; no bli_ is prepended.
#define GENARRAY_PREF(arrayname,prefix,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
	PASTECH2(prefix,s,op), \
	PASTECH2(prefix,c,op), \
	PASTECH2(prefix,d,op), \
	PASTECH2(prefix,z,op)  \
}



// -- Two-operand macros --

// _ALL: all 16 (row, col) datatype combinations are populated.
#define GENARRAY2_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
	{ PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
	{ PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
	{ PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// _EXT: only combinations drawn from { s, c } x { s, c } or from
// { d, z } x { d, z } are populated; every other entry is NULL.
#define GENARRAY2_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL,              NULL,             }, \
	{ PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL,              NULL,             }, \
	{ NULL,              NULL,              PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
	{ NULL,              NULL,              PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// _MIN: only the diagonal (both datatypes identical) is populated; every
// other entry is NULL.
#define GENARRAY2_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ PASTEMAC2(s,s,op), NULL,              NULL,              NULL,             }, \
	{ NULL,              PASTEMAC2(c,c,op), NULL,              NULL,             }, \
	{ NULL,              NULL,              PASTEMAC2(d,d,op), NULL,             }, \
	{ NULL,              NULL,              NULL,              PASTEMAC2(z,z,op) }  \
}


// -- Three-operand macros --

// The three-operand tables follow the same _ALL / _EXT / _MIN scheme as the
// two-operand tables, with the first datatype selecting the outermost group.

#define GENARRAY3_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ \
	{ PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \
	{ PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \
	{ PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \
	{ PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) }  \
	}, \
	{ \
	{ PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \
	{ PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \
	{ PASTEMAC3(c,d,s,op), PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \
	{ PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) }  \
	}, \
	{ \
	{ PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \
	{ PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \
	{ PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
	{ PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
	}, \
	{ \
	{ PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \
	{ PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \
	{ PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
	{ PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
	} \
}

#define GENARRAY3_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ \
	{ PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL,                NULL,               }, \
	{ PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL,                NULL,               }, \
	{ PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
	{ NULL,                NULL,                PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
	{ NULL,                NULL,                PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
	} \
}

#define GENARRAY3_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ \
	{ PASTEMAC3(s,s,s,op), NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                PASTEMAC3(d,d,d,op), NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                PASTEMAC3(z,z,z,op) }  \
	} \
}


#endif
// end bli_genarray_macro_defs.h
// begin bli_gentdef_macro_defs.h


#ifndef BLIS_GENTDEF_MACRO_DEFS_H
#define BLIS_GENTDEF_MACRO_DEFS_H

//
// -- MACROS TO INSERT TYPEDEF-GENERATING MACROS -------------------------------
//

// NOTE: GENTDEF and GENTDEFR are not defined in this section; each
// INSERT_* macro expands to a sequence of invocations of whatever definition
// is in effect at the point of use.


// -- function typedef macro (both typed and void) --

// Nine invocations of GENTDEF: four with the concrete types and the _ft
// suffix, four with void and the _vft suffix, and a final one with void and
// an empty character argument.
#define INSERT_GENTDEF( opname ) \
\
GENTDEF( float,    s, opname, _ft ) \
GENTDEF( double,   d, opname, _ft ) \
GENTDEF( scomplex, c, opname, _ft ) \
GENTDEF( dcomplex, z, opname, _ft ) \
\
GENTDEF( void,     s, opname, _vft ) \
GENTDEF( void,     d, opname, _vft ) \
GENTDEF( void,     c, opname, _vft ) \
GENTDEF( void,     z, opname, _vft ) \
\
GENTDEF( void,      , opname, _vft )

// -- function typedef macro (both typed and void) with real projection --

// Same pattern as INSERT_GENTDEF, with each invocation additionally passing
// the real-projection type and character (float/s for s and c, double/d for
// d and z).
#define INSERT_GENTDEFR( opname ) \
\
GENTDEFR( float,    float,  s, s, opname, _ft ) \
GENTDEFR( double,   double, d, d, opname, _ft ) \
GENTDEFR( scomplex, float,  c, s, opname, _ft ) \
GENTDEFR( dcomplex, double, z, d, opname, _ft ) \
\
GENTDEFR( void,     void,   s, s, opname, _vft ) \
GENTDEFR( void,     void,   d, d, opname, _vft ) \
GENTDEFR( void,     void,   c, s, opname, _vft ) \
GENTDEFR( void,     void,   z, d, opname, _vft ) \
\
GENTDEFR( void,     void,    ,  , opname, _vft )


#endif
// end bli_gentdef_macro_defs.h
// begin bli_gentfunc_macro_defs.h


#ifndef BLIS_GENTFUNC_MACRO_DEFS_H
#define BLIS_GENTFUNC_MACRO_DEFS_H

//
// -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------
//



// -- Macros for generating BLAS routines --------------------------------------


// -- Basic one-operand macro --
// -- BLAS-layer inserter macros -----------------------------------------------
//
// Each INSERT_*_BLAS macro expands to one invocation of a generator macro
// (GENTFUNC, GENTFUNCRO, GENTFUNCCO, GENTFUNCDOT, GENTFUNCR) per datatype
// listed in its body. The generator macros are not defined in this span; an
// expansion only compiles where a definition with a matching parameter count
// is in effect. The blasname/blisname arguments are forwarded unchanged.
//
// Type/character pairing used throughout:
//   float -> s, double -> d, scomplex -> c, dcomplex -> z

// -- Basic one-operand macro --
// One GENTFUNC instance for each of s, d, c, z.
#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \
\
GENTFUNC( float, s, blasname, blisname ) \
GENTFUNC( double, d, blasname, blisname ) \
GENTFUNC( scomplex, c, blasname, blisname ) \
GENTFUNC( dcomplex, z, blasname, blisname )

// -- Basic one-operand macro with real domain only --
// One GENTFUNCRO instance for each of s, d.
#define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \
\
GENTFUNCRO( float, s, blasname, blisname ) \
GENTFUNCRO( double, d, blasname, blisname )

// -- Basic one-operand macro with complex domain only and real projection --
// One GENTFUNCCO instance for each of c, z; the second type/char of each
// instance is the matching real type (float/s, double/d).
#define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \
\
GENTFUNCCO( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCCO( dcomplex, double, z, d, blasname, blisname )

// -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) --
// The third generator argument is intentionally empty for the real types,
// and the conjugation argument is always BLIS_NO_CONJUGATE.
#define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
\
GENTFUNCDOT( float, s, , BLIS_NO_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( double, d, , BLIS_NO_CONJUGATE, blasname, blisname )

// -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) --
// Two instances per complex type: character 'c' paired with BLIS_CONJUGATE
// and character 'u' paired with BLIS_NO_CONJUGATE.
#define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \
\
GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname )

// -- Basic one-operand macro with conjugation (used only for dot, ger) --
// Real instances first, then complex instances.
#define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \
\
INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
INSERT_GENTFUNCDOTC_BLAS( blasname, blisname )

// -- Basic one-operand macro with real projection --
// The real instances (s, d) receive rblasname; the complex instances (c, z)
// receive cblasname.
#define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \
\
GENTFUNCR( float, float, s, s, rblasname, blisname ) \
GENTFUNCR( double, double, d, d, rblasname, blisname ) \
GENTFUNCR( scomplex, float, c, s, cblasname, blisname ) \
GENTFUNCR( dcomplex, double, z, d, cblasname, blisname )

// -- Alternate
// two-operand macro (one char for complex, one for real proj) --
//
// Each INSERT_* macro in this group expands to one generator-macro invocation
// (GENTFUNCR2, GENTFUNCSCAL, GENTFUNC, GENTFUNCR) per datatype listed in its
// body. The generator macros are not defined in this span; an expansion only
// compiles where a definition with a matching parameter count is in effect.
// Trailing arguments (blasname/blisname, tfuncname, varname*) are forwarded
// unchanged. Pairing: float -> s, double -> d, scomplex -> c, dcomplex -> z.

// The fourth generator argument is intentionally empty for the real types.
#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
\
GENTFUNCR2( float, float, s, , blasname, blisname ) \
GENTFUNCR2( double, double, d, , blasname, blisname ) \
GENTFUNCR2( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )

// -- Extended two-operand macro (used only for scal) --
// Four same-type instances (empty fourth argument) followed by two
// complex-with-real-second-type instances (cs, zd).
#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
\
GENTFUNCSCAL( float, float, s, , blasname, blisname ) \
GENTFUNCSCAL( double, double, d, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \
GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname )

// -- Macros for functions with one operand ------------------------------------

// -- Basic one-operand macro --
// The BASIC0/BASIC/BASIC2/BASIC3/BASIC4 variants differ only in how many
// auxiliary arguments (0 to 4) are forwarded after tfuncname.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
\
GENTFUNC( float, s, tfuncname ) \
GENTFUNC( double, d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
\
GENTFUNC( float, s, tfuncname, varname ) \
GENTFUNC( double, d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )

// -- Basic one-operand with real projection --
// Each instance passes the type/char followed by its real counterpart
// (float/s for s and c, double/d for d and z).

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
\
GENTFUNCR( float, float, s, s, tfuncname ) \
GENTFUNCR( double, double, d, d, tfuncname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname ) \
GENTFUNCR( double, double, d, d, tfuncname, varname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --
#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
// arguments) --
//
// Each INSERT_* macro in this group expands to one generator-macro invocation
// (GENTFUNC, GENTFUNCR, GENTFUNCRO, GENTFUNCCO, GENTFUNCI, GENTFUNCRI,
// GENTFUNC2, GENTFUNC2R, GENTFUNC3, GENTFUNC3U12) per datatype combination
// listed in its body. The generator macros are not defined in this span; an
// expansion only compiles where a definition with a matching parameter count
// is in effect. Trailing arguments (tfuncname, varname*) are forwarded
// unchanged.
//
// Pairing used throughout:
//   float -> s, double -> d, scomplex -> c, dcomplex -> z, gint_t -> i
//
// No definition ends with a line continuation: a trailing '\' after the last
// entry would splice the next physical line into the macro body.
#define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )

// -- Basic one-operand macro with real domain only --

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \
\
GENTFUNCRO( float, s, tfuncname ) \
GENTFUNCRO( double, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \
\
GENTFUNCRO( float, s, tfuncname, varname ) \
GENTFUNCRO( double, d, tfuncname, varname )

// -- Basic one-operand macro with complex domain only and real projection --

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --
#define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- Basic one-operand macro with integer instance --
// Same as the basic one-operand macros plus a fifth gint_t/i instance.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \
\
GENTFUNC( float, s, tfuncname ) \
GENTFUNC( double, d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname ) \
GENTFUNC( gint_t, i, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \
\
GENTFUNC( float, s, tfuncname, varname ) \
GENTFUNC( double, d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname ) \
GENTFUNC( gint_t, i, tfuncname, varname )

// -- Basic one-operand with integer projection --

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNCI_BASIC0( tfuncname ) \
\
GENTFUNCI( float, gint_t, s, i, tfuncname ) \
GENTFUNCI( double, gint_t, d, i, tfuncname ) \
GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \
GENTFUNCI( dcomplex, gint_t, z, i, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \
\
GENTFUNCI( float, gint_t, s, i, tfuncname, varname ) \
GENTFUNCI( double, gint_t, d, i, tfuncname, varname ) \
GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \
GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname )

// -- Basic one-operand with real and integer projections --

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \
\
GENTFUNCRI( float, float, gint_t, s, s, i, tfuncname ) \
GENTFUNCRI( double, double, gint_t, d, d, i, tfuncname ) \
GENTFUNCRI( scomplex, float, gint_t, c, s, i, tfuncname ) \
GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname )

// -- Macros for functions with two primary operands ---------------------------

// -- Basic two-operand macro --
// Both operands share the same datatype.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2_BASIC0( tfuncname ) \
\
GENTFUNC2( float, float, s, s, tfuncname ) \
GENTFUNC2( double, double, d, d, tfuncname ) \
GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \
GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \
\
GENTFUNC2( float, float, s, s, tfuncname, varname ) \
GENTFUNC2( double, double, d, d, tfuncname, varname ) \
GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \
GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname )

// -- Mixed domain two-operand macro --
// Pairs that share a precision but differ in domain: sc, cs, dz, zd.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \
\
GENTFUNC2( float, scomplex, s, c, tfuncname ) \
GENTFUNC2( scomplex, float, c, s, tfuncname ) \
\
GENTFUNC2( double, dcomplex, d, z, tfuncname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \
\
GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \
GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \
\
GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname, varname )

// -- Mixed precision two-operand macro --
// The eight pairs whose operands differ in precision.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \
\
GENTFUNC2( float, double, s, d, tfuncname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname ) \
\
GENTFUNC2( double, float, d, s, tfuncname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname ) \
\
GENTFUNC2( scomplex, double, c, d, tfuncname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \
\
GENTFUNC2( float, double, s, d, tfuncname, varname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTFUNC2( double, float, d, s, tfuncname, varname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \
\
GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )

// -- Mixed domain/precision (all) two-operand macro --
// All twelve pairs of distinct datatypes.
// NOTE: the zero-auxiliary variant is spelled MIXDP0 (no underscore before
// DP), unlike the MIX_D0 / MIX_P0 spellings used by its siblings.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2_MIXDP0( tfuncname ) \
\
GENTFUNC2( float, double, s, d, tfuncname ) \
GENTFUNC2( float, scomplex, s, c, tfuncname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname ) \
\
GENTFUNC2( double, float, d, s, tfuncname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname ) \
GENTFUNC2( double, dcomplex, d, z, tfuncname ) \
\
GENTFUNC2( scomplex, float, c, s, tfuncname ) \
GENTFUNC2( scomplex, double, c, d, tfuncname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \
\
GENTFUNC2( float, double, s, d, tfuncname, varname ) \
GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTFUNC2( double, float, d, s, tfuncname, varname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \
GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \
\
GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )

// -- Basic two-operand with real projection of second operand --
// The third type/char of each instance is the real counterpart of the
// second operand's type.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \
\
GENTFUNC2R( float, float, float, s, s, s, tfuncname ) \
GENTFUNC2R( double, double, double, d, d, d, tfuncname ) \
GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname ) \
GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \
\
GENTFUNC2R( float, float, float, s, s, s, tfuncname, varname ) \
GENTFUNC2R( double, double, double, d, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )

// -- Mixed domain two-operand with real projection of second operand --

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \
\
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \
\
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \
\
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname )

// -- Mixed precision two-operand with real projection of second operand --

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \
\
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \
\
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname )

// -- Mixed domain/precision (all) two-operand macro with real projection of second operand --
// NOTE: as with INSERT_GENTFUNC2_MIXDP0, the zero-auxiliary variant is
// spelled MIXDP0 while the one-auxiliary variant is spelled MIX_DP.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \
\
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
\
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname )

// -- Macros for functions with three primary operands -------------------------

// -- Basic three-operand macro --
// All three operands share the same datatype.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC3_BASIC0( tfuncname ) \
\
GENTFUNC3( float, float, float, s, s, s, tfuncname ) \
GENTFUNC3( double, double, double, d, d, d, tfuncname ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \
\
GENTFUNC3( float, float, float, s, s, s, tfuncname, varname ) \
GENTFUNC3( double, double, double, d, d, d, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, float, float, s, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, double, double, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 )

// -- Mixed domain three-operand macro --
// The twelve triples that stay within one precision but are not all of the
// same datatype.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \
\
GENTFUNC3( float, float, scomplex, s, s, c, tfuncname ) \
GENTFUNC3( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname ) \
\
GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname ) \
GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname ) \
GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname ) \
\
GENTFUNC3( scomplex, float, float, c, s, s, tfuncname ) \
GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname ) \
GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname ) \
\
GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname ) \
GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \
\
GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname ) \
GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname ) \
\
GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname ) \
GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname ) \
GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname1, varname2 )

// -- Mixed precision three-operand macro --
// The 48 triples in which at least two operands differ in precision,
// grouped by first operand and then by second operand.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \
\
GENTFUNC3( float, float, double, s, s, d, tfuncname ) \
GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname ) \
\
GENTFUNC3( float, double, float, s, d, s, tfuncname ) \
GENTFUNC3( float, double, double, s, d, d, tfuncname ) \
GENTFUNC3( float, double, scomplex, s, d, c, tfuncname ) \
GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname ) \
\
GENTFUNC3( float, scomplex, double, s, c, d, tfuncname ) \
GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname ) \
\
GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname ) \
GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname ) \
GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname ) \
GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname ) \
\
\
GENTFUNC3( double, float, float, d, s, s, tfuncname ) \
GENTFUNC3( double, float, double, d, s, d, tfuncname ) \
GENTFUNC3( double, float, scomplex, d, s, c, tfuncname ) \
GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname ) \
\
GENTFUNC3( double, double, float, d, d, s, tfuncname ) \
GENTFUNC3( double, double, scomplex, d, d, c, tfuncname ) \
\
GENTFUNC3( double, scomplex, float, d, c, s, tfuncname ) \
GENTFUNC3( double, scomplex, double, d, c, d, tfuncname ) \
GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname ) \
GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname ) \
\
GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname ) \
GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname ) \
\
\
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname ) \
GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname ) \
\
GENTFUNC3( scomplex, double, float, c, d, s, tfuncname ) \
GENTFUNC3( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname ) \
GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname ) \
\
GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \
\
GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \
\
\
GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname ) \
GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname ) \
GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname ) \
\
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \
\
GENTFUNC3( float, float, double, s, s, d, tfuncname, varname ) \
GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname ) \
\
GENTFUNC3( float, double, float, s, d, s, tfuncname, varname ) \
GENTFUNC3( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname ) \
GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname ) \
\
GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname ) \
GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname ) \
\
GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname ) \
GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname ) \
GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname ) \
GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname ) \
\
\
GENTFUNC3( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC3( double, float, double, d, s, d, tfuncname, varname ) \
GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname ) \
GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname ) \
\
GENTFUNC3( double, double, float, d, d, s, tfuncname, varname ) \
GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname ) \
\
GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname ) \
GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname ) \
GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname ) \
GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname ) \
\
GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname ) \
GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname ) \
\
\
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname ) \
GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname ) \
GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname ) \
GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \
\
\
GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, float, double, s, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, double, float, s, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, double, double, s, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( double, float, float, d, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, float, double, d, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, double, float, d, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 )

// -- Basic three-operand with union of operands 1 and 2 --
// Each instance passes a fourth type/char after the three operand types.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname1, varname2 )

// -- Mixed domain three-operand with union of operands 1 and 2 --
// In every entry listed here the fourth type is complex whenever either of
// the first two operand types is complex, and real otherwise.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname )

// -- (one auxiliary argument) --
// NOTE(review): the text of this definition is cut off at the end of this
// span; the remaining tokens are whatever follows in the file.
#define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z,
d, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \ GENTFUNC3U12( float, 
double, dcomplex, double, s, d, z, d, tfuncname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, 
dcomplex, scomplex, c, c, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, 
dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, 
tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, 
double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( 
scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, 
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 ) #endif // end bli_gentfunc_macro_defs.h // begin bli_gentprot_macro_defs.h #ifndef BLIS_GENTPROT_MACRO_DEFS_H #define BLIS_GENTPROT_MACRO_DEFS_H // // -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS ----------------------------- // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- #define INSERT_GENTPROT_BLAS( blasname ) \ \ GENTPROT( float, s, blasname ) \ GENTPROT( double, d, blasname ) \ GENTPROT( scomplex, c, blasname ) \ GENTPROT( dcomplex, z, blasname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTPROTRO_BLAS( blasname ) \ \ GENTPROTRO( float, s, blasname ) \ GENTPROTRO( double, d, blasname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTPROTCO_BLAS( blasname ) \ \ GENTPROTCO( scomplex, float, c, s, blasname ) \ GENTPROTCO( dcomplex, double, z, d, blasname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTR_BLAS( blasname ) \ \ GENTPROTDOT( float, s, , blasname ) \ GENTPROTDOT( double, d, , blasname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTC_BLAS( blasname ) \ \ GENTPROTDOT( scomplex, c, c, blasname ) \ GENTPROTDOT( scomplex, c, u, blasname ) \ GENTPROTDOT( dcomplex, z, c, blasname ) \ GENTPROTDOT( dcomplex, z, u, blasname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTPROTDOT_BLAS( blasname ) \ \ INSERT_GENTPROTDOTR_BLAS( blasname ) \ INSERT_GENTPROTDOTC_BLAS( blasname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTPROTR_BLAS( rblasname, 
cblasname ) \ \ GENTPROTR( float, float, s, s, rblasname ) \ GENTPROTR( double, double, d, d, rblasname ) \ GENTPROTR( scomplex, float, c, s, cblasname ) \ GENTPROTR( dcomplex, double, z, d, cblasname ) // -- Alternate two-operand macro (one char for complex, one for real proj) -- #define INSERT_GENTPROTR2_BLAS( blasname ) \ \ GENTPROTR2( float, float, , s, blasname ) \ GENTPROTR2( double, double, , d, blasname ) \ GENTPROTR2( scomplex, float, c, s, blasname ) \ GENTPROTR2( dcomplex, double, z, d, blasname ) // -- Extended two-operand macro (used only for scal) -- #define INSERT_GENTPROTSCAL_BLAS( blasname ) \ \ GENTPROTSCAL( float, float, , s, blasname ) \ GENTPROTSCAL( double, double, , d, blasname ) \ GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \ GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \ GENTPROTSCAL( float, scomplex, s, c, blasname ) \ GENTPROTSCAL( double, dcomplex, d, z, blasname ) // -- Macros for functions with one operand ------------------------------------ // -- Basic one-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0( tfuncname ) \ \ GENTPROT( float, s, tfuncname ) \ GENTPROT( double, d, tfuncname ) \ GENTPROT( scomplex, c, tfuncname ) \ GENTPROT( dcomplex, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2 ) \ GENTPROT( double, d, tfuncname, varname1, varname2 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROT( float, s, tfuncname, 
varname1, varname2, varname3 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand with real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC0( tfuncname ) \ \ GENTPROTR( float, float, s, s, tfuncname ) \ GENTPROTR( double, double, d, d, tfuncname ) \ GENTPROTR( scomplex, float, c, s, tfuncname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname ) \ GENTPROTR( double, double, d, d, tfuncname, varname ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 
) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC0( tfuncname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0_I( funcname ) \ \ GENTPROT( float, s, funcname ) \ GENTPROT( double, d, funcname ) \ GENTPROT( scomplex, c, funcname ) \ GENTPROT( dcomplex, z, funcname ) \ GENTPROT( gint_t, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) \ GENTPROT( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTPROTI_BASIC0( funcname ) \ \ GENTPROTI( float, gint_t, s, i, funcname ) \ GENTPROTI( double, gint_t, d, i, funcname ) \ GENTPROTI( scomplex, gint_t, c, i, funcname ) \ GENTPROTI( dcomplex, gint_t, z, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \ \ GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \ GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \ GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTRI_BASIC( funcname ) \ \ GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \ GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \ GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \ GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_BASIC0( funcname ) \ \ GENTPROT2( float, float, s, s, funcname ) \ GENTPROT2( double, double, d, d, funcname ) \ GENTPROT2( scomplex, scomplex, c, c, funcname ) \ GENTPROT2( dcomplex, dcomplex, z, z, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \ \ GENTPROT2( float, float, s, s, tfuncname, varname ) \ GENTPROT2( double, double, d, d, tfuncname, varname ) \ GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_D0( funcname ) \ \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( scomplex, float, c, s, funcname ) \ \ GENTPROT2( double, dcomplex, d, z, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_D( 
tfuncname, varname ) \ \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) // -- Mixed precision two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_P0( funcname ) \ \ GENTPROT2( float, double, s, d, funcname ) \ GENTPROT2( float, dcomplex, s, z, funcname ) \ \ GENTPROT2( double, float, d, s, funcname ) \ GENTPROT2( double, scomplex, d, c, funcname ) \ \ GENTPROT2( scomplex, double, c, d, funcname ) \ GENTPROT2( scomplex, dcomplex, c, z, funcname ) \ \ GENTPROT2( dcomplex, float, z, s, funcname ) \ GENTPROT2( dcomplex, scomplex, z, c, funcname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) \ // -- Mixed domain/precision (all) two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIXDP0( funcname ) \ \ GENTPROT2( float, double, s, d, funcname ) \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( float, dcomplex, s, z, funcname ) \ \ GENTPROT2( double, float, d, s, funcname ) \ GENTPROT2( double, scomplex, d, c, funcname ) \ GENTPROT2( double, dcomplex, d, z, funcname ) \ \ GENTPROT2( scomplex, float, c, s, funcname ) \ GENTPROT2( scomplex, double, c, d, funcname ) \ GENTPROT2( scomplex, dcomplex, c, z, funcname ) \ \ GENTPROT2( dcomplex, float, z, s, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) \ GENTPROT2( 
dcomplex, scomplex, z, c, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_BASIC0( funcname ) \ \ GENTPROT2R( float, float, float, s, s, s, funcname ) \ GENTPROT2R( double, double, double, d, d, d, funcname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \ \ GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) 
-- #define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname ) // -- Mixed precision two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \ \ GENTPROT2R( float, double, float, s, d, s, tfuncname ) \ GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \ \ GENTPROT2R( double, float, double, d, s, d, tfuncname ) \ GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \ \ GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \ GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \ \ GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \ GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \ \ GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \ GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \ \ GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \ GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \ \ GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \ GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ \ GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \ GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) // -- Macros for functions with three primary operands ------------------------- // -- Basic three-operand macro -- #define INSERT_GENTPROT3_BASIC( funcname ) \ \ GENTPROT3( float, float, float, s, s, s, funcname ) \ GENTPROT3( double, double, double, d, d, d, funcname ) \ GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \ GENTPROT3( dcomplex, dcomplex, 
dcomplex, z, z, z, funcname ) // -- Mixed domain three-operand macro -- #define INSERT_GENTPROT3_MIX_D( funcname ) \ \ GENTPROT3( float, float, scomplex, s, s, c, funcname ) \ GENTPROT3( float, scomplex, float, s, c, s, funcname ) \ GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \ \ GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \ GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \ GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \ \ GENTPROT3( scomplex, float, float, c, s, s, funcname ) \ GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \ GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \ \ GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \ GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \ GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname ) // -- Mixed precision three-operand macro -- #define INSERT_GENTPROT3_MIX_P( funcname ) \ \ GENTPROT3( float, float, double, s, s, d, funcname ) \ GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \ \ GENTPROT3( float, double, float, s, d, s, funcname ) \ GENTPROT3( float, double, double, s, d, d, funcname ) \ GENTPROT3( float, double, scomplex, s, d, c, funcname ) \ GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \ \ GENTPROT3( float, scomplex, double, s, c, d, funcname ) \ GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \ \ GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \ GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \ GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \ GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \ \ \ GENTPROT3( double, float, float, d, s, s, funcname ) \ GENTPROT3( double, float, double, d, s, d, funcname ) \ GENTPROT3( double, float, scomplex, d, s, c, funcname ) \ GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \ \ GENTPROT3( double, double, float, d, d, s, funcname ) \ GENTPROT3( double, double, scomplex, d, d, c, funcname ) \ \ GENTPROT3( double, 
scomplex, float, d, c, s, funcname ) \ GENTPROT3( double, scomplex, double, d, c, d, funcname ) \ GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \ GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \ \ GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \ GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \ \ \ GENTPROT3( scomplex, float, double, c, s, d, funcname ) \ GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \ \ GENTPROT3( scomplex, double, float, c, d, s, funcname ) \ GENTPROT3( scomplex, double, double, c, d, d, funcname ) \ GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \ GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \ \ GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \ GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \ \ GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \ GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \ GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \ GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \ \ \ GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \ GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \ GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \ GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \ \ GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \ GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \ \ GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \ GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \ GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \ GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \ \ GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \ GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname ) \ // -- Basic three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_BASIC( funcname ) \ \ GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \ 
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \ GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname ) // -- Mixed domain three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_D( funcname ) \ \ GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \ GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \ GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \ \ GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \ GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \ GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \ \ GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \ GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \ \ GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \ GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname ) // -- Mixed precision three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_P( funcname ) \ \ GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \ GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \ \ GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \ GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \ GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \ GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \ \ GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \ GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \ \ GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, 
z, funcname ) \ GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \ GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \ GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \ \ \ GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \ GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \ GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \ GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \ \ GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \ GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \ \ GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \ GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \ GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \ GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \ \ GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \ GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \ \ \ GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \ GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \ \ GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \ GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \ GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \ GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \ \ GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \ \ GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \ GENTPROT3U12( 
scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \ \ \ GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \ GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \ GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \ GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \ GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \ \ GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname ) #endif // end bli_gentprot_macro_defs.h // begin bli_misc_macro_defs.h #ifndef BLIS_MISC_MACRO_DEFS_H #define BLIS_MISC_MACRO_DEFS_H // -- Miscellaneous macros -- // min, max, abs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) #define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) #define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) ) // fmin, fmax, fabs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_fmin( a, b ) bli_min( a, b ) #define bli_fmax( a, b ) bli_max( a, b ) #define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) ) // fminabs, fmaxabs // NOTE: These must remain macros since we don't know the types of a and b. 
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif // end bli_edge_case_macro_defs.h // begin bli_param_macro_defs.h #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ); } // pruning-related BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the left side of the matrix, // ignore the area above that intersection. if ( *diagoff < 0 ) { *m = *m + *diagoff; *offm_inc = - *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the bottom side of the matrix, // ignore the area to the right of that intersection. if ( *n > *diagoff + *m ) { *n = *diagoff + *m; } } BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the top side of the matrix, // ignore the area to the left of that intersection. if ( *diagoff > 0 ) { *n = *n - *diagoff; *offn_inc = + *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the right side of the matrix, // ignore the area below that intersection. 
if ( *m > -(*diagoff) + *n ) { *m = -(*diagoff) + *n; } } // thread range-related BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { *diagoff = *n - *diagoff - *m; bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { bli_swap_dims( m, n ); bli_negate_diag_offset( diagoff ); bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end ) { dim_t start2 = n - *start; dim_t end2 = n - *end; *start = end2; *end = start2; } // mdim_t-related BLIS_INLINE bool bli_is_m_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_M ); } BLIS_INLINE bool bli_is_n_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_N ); } BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim ) { return ( mdim_t ) ( mdim == BLIS_M ? BLIS_N : BLIS_M ); } BLIS_INLINE void bli_toggle_dim( mdim_t* mdim ) { *mdim = bli_dim_toggled( *mdim ); } // stor3_t-related BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b ) { // If any matrix is general-stored, return the stor3_t id for the // general-purpose sup microkernel. if ( bli_is_gen_stored( rs_c, cs_c ) || bli_is_gen_stored( rs_a, cs_a ) || bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX; // Otherwise, compute and return the stor3_t id as follows. 
const bool c_is_col = bli_is_col_stored( rs_c, cs_c ); const bool a_is_col = bli_is_col_stored( rs_a, cs_a ); const bool b_is_col = bli_is_col_stored( rs_b, cs_b ); return ( stor3_t )( 4 * c_is_col + 2 * a_is_col + 1 * b_is_col ); } BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id ) { #if 1 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )7, // BLIS_RRR = 0 -> BLIS_CCC = 7 ( stor3_t )5, // BLIS_RRC = 1 -> BLIS_CRC = 5 ( stor3_t )6, // BLIS_RCR = 2 -> BLIS_CCR = 6 ( stor3_t )4, // BLIS_RCC = 3 -> BLIS_CRR = 4 ( stor3_t )3, // BLIS_CRR = 4 -> BLIS_RCC = 3 ( stor3_t )1, // BLIS_CRC = 5 -> BLIS_RRC = 1 ( stor3_t )2, // BLIS_CCR = 6 -> BLIS_RCR = 2 ( stor3_t )0, // BLIS_CCC = 7 -> BLIS_RRR = 0 }; return map[id]; #else return ( ( id & 0x4 ) ^ 0x4 ) | // flip c bit ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 ); // flip a bit and move to b position #endif } BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )1, // BLIS_RRR = 0 -> BLIS_RRC = 1 ( stor3_t )0, // BLIS_RRC = 1 -> BLIS_RRR = 0 ( stor3_t )3, // BLIS_RCR = 2 -> BLIS_RCC = 3 ( stor3_t )2, // BLIS_RCC = 3 -> BLIS_RCR = 2 ( stor3_t )5, // BLIS_CRR = 4 -> BLIS_CRC = 5 ( stor3_t )4, // BLIS_CRC = 5 -> BLIS_CRR = 4 ( stor3_t )7, // BLIS_CCR = 6 -> BLIS_CCC = 7 ( stor3_t )6, // BLIS_CCC = 7 -> BLIS_CCR = 6 }; return map[id]; #else return ( stor3_t )( id ^ 0x1 ); #endif } BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )2, // BLIS_RRR = 0 -> BLIS_RCR = 2 ( stor3_t )3, // BLIS_RRC = 1 -> BLIS_RCC = 3 ( stor3_t )0, // BLIS_RCR = 2 -> BLIS_RRR = 0 ( stor3_t )1, // BLIS_RCC = 3 -> BLIS_RRC = 1 ( stor3_t )6, // BLIS_CRR = 4 -> BLIS_CCR = 6 ( stor3_t )7, // BLIS_CRC = 5 -> BLIS_CCC = 7 ( stor3_t )4, // BLIS_CCR = 6 -> BLIS_CRR = 4 ( stor3_t )5, // BLIS_CCC = 7 -> BLIS_CRC = 5 }; return map[id]; #else return ( stor3_t )( id ^ 0x2 ); #endif } // 
// index-related

// Forward edge test: true when i is the final iteration index (n_iter - 1)
// and n_left is nonzero.
BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
	return ( bool )
	       ( i == n_iter - 1 && n_left != 0 );
}

// Logical negation of bli_is_edge_f().
BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
	return ( bool )
	       ( i != n_iter - 1 || n_left == 0 );
}

// Backward edge test: true when i is the first iteration index (0) and
// n_left is nonzero. n_iter is not consulted.
BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
	return ( bool )
	       ( i == 0 && n_left != 0 );
}

// Logical negation of bli_is_edge_b().
BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
	return ( bool )
	       ( i != 0 || n_left == 0 );
}

// True when i == end_iter - 1. The tid and nth arguments are unused; the
// signature mirrors bli_is_last_iter_rr() so that bli_is_last_iter() can
// forward the same argument list to either variant.
BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
	return ( bool )
	       ( i == end_iter - 1 );
}

// True when i equals end_iter - 1 minus ( end_iter - tid - 1 ) % nth.
// NOTE(review): presumably this is the last index below end_iter that is
// congruent to tid modulo nth (round-robin assignment) -- confirm against
// the callers. nth must be nonzero since it is used as a modulus.
BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
	return ( bool )
	       ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
}

// Dispatches to the _sl variant when BLIS_ENABLE_JRIR_SLAB is defined and
// to the _rr variant otherwise.
BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
#ifdef BLIS_ENABLE_JRIR_SLAB
	return bli_is_last_iter_sl( i, end_iter, tid, nth );
#else // BLIS_ENABLE_JRIR_RR
	return bli_is_last_iter_rr( i, end_iter, tid, nth );
#endif
}


// packbuf_t-related

// Extracts the BLIS_PACK_BUFFER_BITS field of buf_type and shifts it down
// by BLIS_PACK_BUFFER_SHIFT.
BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type )
{
	return ( guint_t )
	       ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT );
}

// pack_t-related

// Nonzero BLIS_PACK_BIT in the schema.
BLIS_INLINE bool bli_is_packed( pack_t schema )
{
	return ( bool )
	       ( schema & BLIS_PACK_BIT );
}

// The schema's BLIS_PACK_RC_BIT field equals
// BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS.
BLIS_INLINE bool bli_is_row_packed( pack_t schema )
{
	return ( bool )
	       ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
	                                             BLIS_BITVAL_PACKED_ROWS ) );
}

// The schema's BLIS_PACK_RC_BIT field equals
// BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS.
BLIS_INLINE bool bli_is_col_packed( pack_t schema )
{
	return ( bool )
	       ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
	                                             BLIS_BITVAL_PACKED_COLUMNS ) );
}

// Nonzero BLIS_PACK_PANEL_BIT in the schema.
BLIS_INLINE bool bli_is_panel_packed( pack_t schema )
{
	return ( bool )
	       ( schema & BLIS_PACK_PANEL_BIT );
}

// The schema's format field equals BLIS_BITVAL_1R.
BLIS_INLINE bool bli_is_1r_packed( pack_t schema )
{
	return ( bool )
	       ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R );
}

// The schema's format field equals BLIS_BITVAL_1E.
BLIS_INLINE bool bli_is_1e_packed( pack_t schema )
{
	return ( bool )
	       ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E );
}

// Either of the 1r or 1e formats.
BLIS_INLINE bool bli_is_1m_packed( pack_t schema )
{
	return ( bool )
	       ( bli_is_1r_packed( schema ) ||
	         bli_is_1e_packed( schema ) );
}

// The schema's format field is zero.
BLIS_INLINE bool bli_is_nat_packed( pack_t schema )
{
	return ( bool )
	       ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 );
}

// The schema's format field is nonzero (complement of bli_is_nat_packed()).
BLIS_INLINE bool bli_is_ind_packed( pack_t schema )
{
	return ( bool )
	       ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 );
}

// Extracts the format field of the schema and shifts it down by
// BLIS_PACK_FORMAT_SHIFT.
BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema )
{
	return ( guint_t )
	       ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT );
}


// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument.
//
// Inputs:  diagoffa, diaga, uploa, m, n, rs_a, cs_a.
// Outputs: *uplo_eff is always written; *ij0 and *n_shift are always
//          written (zero-initialized first). When *uplo_eff is set to
//          BLIS_ZEROS the remaining outputs (*n_elem_max, *n_iter, *inca,
//          *lda) are NOT written.
// If the strides are row-tilted, the element/iteration dimensions and the
// increments are swapped, and the effective uplo and diagonal offset are
// toggled/negated to match.

BLIS_INLINE void bli_set_dims_incs_uplo_1m
     (
       doff_t  diagoffa, diag_t diaga,
       uplo_t  uploa,
       dim_t   m,          dim_t  n,      inc_t  rs_a, inc_t  cs_a,
       uplo_t* uplo_eff,   dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t*  ij0,        dim_t* n_shift
     )
{
	// This is to prevent the compiler from warning about uninitialized
	// variables.
	*ij0     = 0;
	*n_shift = 0;

	// If matrix A is entirely "unstored", that is, if either:
	// - A is lower-stored and entirely above the diagonal, or
	// - A is upper-stored and entirely below the diagonal
	// then we mark the storage as implicitly zero.
	if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
	{
		*uplo_eff = BLIS_ZEROS;
	}
	else
	{
		doff_t diagoffa_use_ = diagoffa;
		doff_t diagoff_eff_;
		dim_t  n_iter_max_;

		// A unit diagonal is excluded from the stored region by adjusting
		// a local copy of the diagonal offset.
		if ( bli_is_unit_diag( diaga ) )
			bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

		// If matrix A is entirely "stored", that is, if either:
		// - A is upper-stored and entirely above the diagonal, or
		// - A is lower-stored and entirely below the diagonal
		// then we mark the storage as dense.
		if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
			uploa = BLIS_DENSE;

		n_iter_max_  = n;
		*n_elem_max  = m;
		*inca        = rs_a;
		*lda         = cs_a;
		*uplo_eff    = uploa;
		diagoff_eff_ = diagoffa_use_;

		// Row-tilted storage: iterate over the other dimension instead,
		// which induces a transposition of uplo and the diagonal offset.
		if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) )
		{
			bli_swap_dims( &n_iter_max_, n_elem_max );
			bli_swap_incs( inca, lda );
			bli_toggle_uplo( uplo_eff );
			bli_negate_diag_offset( &diagoff_eff_ );
		}

		if ( bli_is_dense( *uplo_eff ) )
		{
			*n_iter = n_iter_max_;
		}
		else if ( bli_is_upper( *uplo_eff ) )
		{
			if ( diagoff_eff_ < 0 )
			{
				*ij0        = 0;
				*n_shift    = -diagoff_eff_;
				*n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
				*n_iter     = n_iter_max_;
			}
			else
			{
				*ij0        = diagoff_eff_;
				*n_shift    = 0;
				*n_iter     = n_iter_max_ - diagoff_eff_;
			}
		}
		else // if ( bli_is_lower( *uplo_eff ) )
		{
			if ( diagoff_eff_ < 0 )
			{
				*ij0        = -diagoff_eff_;
				*n_shift    = 0;
				// NOTE: *n_elem_max is updated first; the *n_iter
				// computation below reads the updated value.
				*n_elem_max = *n_elem_max + diagoff_eff_;
				*n_iter     = bli_min( *n_elem_max, bli_min( m, n ) );
			}
			else
			{
				*ij0        = 0;
				*n_shift    = diagoff_eff_;
				*n_iter     = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
			}
		}
	}
}

// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument (without column-wise stride optimization).
//
// Identical to bli_set_dims_incs_uplo_1m() except that the row-tilted
// check (and the resulting swap of dimensions/increments and toggling of
// uplo/diagonal offset) is omitted, so *inca/*lda are always rs_a/cs_a
// whenever they are written.

BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap
     (
       doff_t  diagoffa, diag_t diaga,
       uplo_t  uploa,
       dim_t   m,          dim_t  n,      inc_t  rs_a, inc_t  cs_a,
       uplo_t* uplo_eff,   dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t*  ij0,        dim_t* n_shift
     )
{
	// This is to prevent the compiler from warning about uninitialized
	// variables.
	*ij0     = 0;
	*n_shift = 0;

	// If matrix A is entirely "unstored", that is, if either:
	// - A is lower-stored and entirely above the diagonal, or
	// - A is upper-stored and entirely below the diagonal
	// then we mark the storage as implicitly zero.
	if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
	{
		*uplo_eff = BLIS_ZEROS;
	}
	else
	{
		doff_t diagoffa_use_ = diagoffa;
		doff_t diagoff_eff_;
		dim_t  n_iter_max_;

		// A unit diagonal is excluded from the stored region by adjusting
		// a local copy of the diagonal offset.
		if ( bli_is_unit_diag( diaga ) )
			bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

		// If matrix A is entirely "stored", that is, if either:
		// - A is upper-stored and entirely above the diagonal, or
		// - A is lower-stored and entirely below the diagonal
		// then we mark the storage as dense.
		if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
			uploa = BLIS_DENSE;

		n_iter_max_  = n;
		*n_elem_max  = m;
		*inca        = rs_a;
		*lda         = cs_a;
		*uplo_eff    = uploa;
		diagoff_eff_ = diagoffa_use_;

		if ( bli_is_dense( *uplo_eff ) )
		{
			*n_iter = n_iter_max_;
		}
		else if ( bli_is_upper( *uplo_eff ) )
		{
			if ( diagoff_eff_ < 0 )
			{
				*ij0        = 0;
				*n_shift    = -diagoff_eff_;
				*n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
				*n_iter     = n_iter_max_;
			}
			else
			{
				*ij0        = diagoff_eff_;
				*n_shift    = 0;
				*n_iter     = n_iter_max_ - diagoff_eff_;
			}
		}
		else // if ( bli_is_lower( *uplo_eff ) )
		{
			if ( diagoff_eff_ < 0 )
			{
				*ij0        = -diagoff_eff_;
				*n_shift    = 0;
				// NOTE: *n_elem_max is updated first; the *n_iter
				// computation below reads the updated value.
				*n_elem_max = *n_elem_max + diagoff_eff_;
				*n_iter     = bli_min( *n_elem_max, bli_min( m, n ) );
			}
			else
			{
				*ij0        = 0;
				*n_shift    = diagoff_eff_;
				*n_iter     = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
			}
		}
	}
}

// Set dimensions and increments for TWO matrix arguments.
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); } BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); } BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); } // NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, // packbuf_t is stored/used from the context in order to support various // induced methods. (Though ideally the packbuf_t field would only be // present in the control tree). BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); } BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc ); } BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj ) { bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); } BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj ) { bli_obj_apply_conj( BLIS_CONJUGATE, obj ); } BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; } // Root matrix query BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) { return ( obj_t* )( obj->root ); } BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( 
bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); } // Root matrix modification BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) { obj->root = obj; } // Diagonal offset query BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) ); } // Diagonal offset modification BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off = ( doff_t )offset; } BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj ) { obj->diag_off = -(obj->diag_off); } BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off += ( doff_t )offset; } // Dimension query BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) { return ( obj->dim[ mdim ] ); } BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) : bli_obj_length( obj ) ); } BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) ); } BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 ); } // Stride/increment query BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) { return ( obj->rs ); } BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) { return ( obj->cs ); } BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) { return ( obj->is ); } BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); } // Note: The purpose of these functions is to obtain the length and width // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) ); } BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 ); } // Dimension modification BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj ) { obj->dim[ BLIS_M ] = m; } BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj ) { obj->dim[ BLIS_N ] = n; } BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) { obj->dim[ mdim ] = dim_val; } BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) { if ( bli_does_notrans( trans ) ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } else // if ( bli_does_trans( trans ) ) { bli_obj_set_length( n, obj ); bli_obj_set_width( m, obj ); } } // Stride/increment predicates // // NOTE: The following two macros differ from their non-obj counterparts // in that they do not identify m x 1 and 1 x n objects as row-stored and // column-stored, respectively, which is needed when considering packed // objects. But this is okay, since none of the invocations of these // "obj" macros are used on packed matrices. 
// BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); } // Stride/increment modification BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) { obj->rs = rs; } BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) { obj->cs = cs; } BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_row_stride( rs, obj ); bli_obj_set_col_stride( cs, obj ); } BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) { obj->is = is; } // Offset query BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) { return ( obj->off[ mdim ] ); } // Offset modification BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] = offset; } BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_set_off( BLIS_M, offm, obj ); bli_obj_set_off( BLIS_N, offn, obj ); } BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] += offset; } BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_inc_off( BLIS_M, offm, obj ); bli_obj_inc_off( BLIS_N, offn, obj ); } // Diagonal offset predicates BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) { return ( 
bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); } // Buffer address query BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) { return ( void* ) ( obj->buffer ); } // Buffer address modification BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) { obj->buffer = p; } // Bufferless scalar field query BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); } // Bufferless scalar field modification BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); } // Element size modification BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) { obj->elem_size = size; } // Packed matrix info query BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) { return ( obj->m_padded ); } BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) { return ( obj->n_padded ); } // Packed matrix info modification BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj ) { obj->m_padded = m; } BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj ) { obj->n_padded = n; } BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) { 
bli_obj_set_padded_length( m, obj ); bli_obj_set_padded_width( n, obj ); } // Packed panel info query BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) { return ( obj->m_panel ); } BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) { return ( obj->n_panel ); } BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) { return ( obj->pd ); } BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) { return ( obj->ps ); } // Packed panel info modification BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj ) { obj->m_panel = m; } BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj ) { obj->n_panel = n; } BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_panel_length( m, obj ); bli_obj_set_panel_width( n, obj ); } BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) { obj->pd = pd; } BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) { obj->ps = ps; } // stor3_t-related BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); inc_t rs_a, cs_a; inc_t rs_b, cs_b; if ( bli_obj_has_notrans( a ) ) { rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else { rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else { rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b ); } // -- User-provided information macros -- // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { return obj->pack_fn; } BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker_fn; } BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { return obj->ker_params; } // Function pointer modification 
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) { const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); bli_obj_set_pack_schema( schema_b, a ); bli_obj_set_pack_schema( schema_a, b ); } // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. BLIS_INLINE void bli_obj_induce_trans( obj_t* obj ) { // Induce transposition among basic fields. dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); if ( bli_obj_is_upper_or_lower( obj ) ) bli_obj_toggle_uplo( obj ); // Induce transposition among packed fields. dim_t m_padded = bli_obj_padded_length( obj ); dim_t n_padded = bli_obj_padded_width( obj ); dim_t m_panel = bli_obj_panel_length( obj ); dim_t n_panel = bli_obj_panel_width( obj ); bli_obj_set_padded_dims( n_padded, m_padded, obj ); bli_obj_set_panel_dims( n_panel, m_panel, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj ) { // NOTE: This function is only used in situations where the matrices // are guaranteed to not have structure or be packed. // Induce transposition among basic fields. 
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif // end bli_obj_macro_defs.h // begin bli_complex_macro_defs.h #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define 
bli_zimag( x ) ( cimag(x) ) #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_complex_macro_defs.h // begin bli_scalar_macro_defs.h #ifndef BLIS_SCALAR_MACRO_DEFS_H #define BLIS_SCALAR_MACRO_DEFS_H // -- Assignment/Accessor macros -- // NOTE: This macro is defined first since some of the other scalar macros // use it to abstract away the method used to assign complex values (ie: // whether fields of a struct are set directly or whether native C99 // assignment is used). // begin bli_sets.h #ifndef BLIS_SETS_H #define BLIS_SETS_H // sets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sssets( xr, xi, y ) { (y) = (xr); } #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } #define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } #define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, 
xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif // end bli_sets.h // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. // begin bli_setrs.h #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// bli_??setrs( xr, y ) overwrites only the real part of y with xr. For real
// y this is a plain assignment.

#define bli_sssetrs( xr, y )  { (y) = (xr); }
#define bli_dssetrs( xr, y )  { (y) = (xr); }

#define bli_sdsetrs( xr, y )  { (y) = (xr); }
#define bli_ddsetrs( xr, y )  { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct-based complex: assign to the real member; the imaginary member is
// not touched.

#define bli_scsetrs( xr, y )  { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y )  { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y )  { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y )  { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Native C99 complex: rebuild y from the new real part and the current
// imaginary part of y.

#define bli_scsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX


// Single-char shorthands; the complex ones take a real-typed xr.

#define bli_ssetrs( xr, y )  bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y )  bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y )  bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y )  bli_dzsetrs( xr, y )


#endif

// end bli_setrs.h
// begin bli_setis.h


#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// bli_??setis( xi, y ) overwrites only the imaginary part of y with xi. For
// real y the macro expands to an empty statement block, so xi is not
// evaluated at all.

#define bli_sssetis( xi, y )  { ; }
#define bli_dssetis( xi, y )  { ; }

#define bli_sdsetis( xi, y )  { ; }
#define bli_ddsetis( xi, y )  { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct-based complex: assign to the imaginary member; the real member is
// not touched.

#define bli_scsetis( xi, y )  { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y )  { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y )  { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y )  { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Native C99 complex: rebuild y from its current real part and the new
// imaginary part.

#define bli_scsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX


// Single-char shorthands; the complex ones take a real-typed xi.

#define bli_ssetis( xi, y )  bli_sssetis( xi, y )
#define bli_dsetis( xi, y )  bli_ddsetis( xi, y )
#define bli_csetis( xi, y )  bli_scsetis( xi, y )
#define bli_zsetis( xi, y )  bli_dzsetis( xi, y )


#endif

// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h


#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// bli_??gets( x, yr, yi ) splits x into a real part (written to yr) and an
// imaginary part (written to yi). Real x yields a zero imaginary part via
// bli_simag()/bli_dimag(); integer x is cast to the precision implied by
// the second char.

#define bli_ssgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dsgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_csgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zsgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_isgets( x, yr, yi )  { (yr) = ( float )(x); (yi) = 0.0F; }

#define bli_sdgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_ddgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_cdgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zdgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_idgets( x, yr, yi )  { (yr) = ( double )(x); (yi) = 0.0; }

#define bli_scgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dcgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_ccgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zcgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_icgets( x, yr, yi )  { (yr) = ( float )(x); (yi) = 0.0F; }

#define bli_szgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dzgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_czgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zzgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_izgets( x, yr, yi )  { (yr) = ( double )(x); (yi) = 0.0; }

// Integer destination: the imaginary output is always the integer literal 0.

#define bli_sigets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = 0; }
#define bli_digets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = 0; }
#define bli_cigets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = 0; }
#define bli_zigets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = 0; }
#define bli_iigets( x, yr, yi )  { (yr) =          (x); (yi) = 0; }


// Single-char shorthands. The complex shorthands map to the cs/zd variants,
// whose expansions are identical to those of the cc/zz variants above.

#define bli_sgets( x, yr, yi )  bli_ssgets( x, yr, yi )
#define bli_dgets( x, yr, yi )  bli_ddgets( x, yr, yi )
#define bli_cgets( x, yr, yi )  bli_csgets( x, yr, yi )
#define bli_zgets( x, yr, yi )  bli_zdgets( x, yr, yi )
#define bli_igets( x, yr, yi )  bli_idgets( x, yr, yi )


#endif
// end bli_gets.h


// -- Scalar constant initialization macros --

// begin bli_constants.h


#ifndef BLIS_CONSTANTS_H
#define BLIS_CONSTANTS_H

// return pointers to constants

// Each macro below calls bli_obj_buffer_for_const() on one of the global
// constant objects and casts the result to a pointer of the matching C type.

// 1

#define bli_s1 \
\
	( ( float*    ) bli_obj_buffer_for_const( BLIS_FLOAT,    &BLIS_ONE ) )

#define bli_d1 \
\
	( ( double*   ) bli_obj_buffer_for_const( BLIS_DOUBLE,   &BLIS_ONE ) )

#define bli_c1 \
\
	( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) )

#define bli_z1 \
\
	( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) )

#define bli_i1 \
\
	( ( gint_t*   ) bli_obj_buffer_for_const( BLIS_INT,      &BLIS_ONE ) )

// 0

#define bli_s0 \
\
	( ( float*    ) bli_obj_buffer_for_const( BLIS_FLOAT,    &BLIS_ZERO ) )

#define bli_d0 \
\
	( ( double*   ) bli_obj_buffer_for_const( BLIS_DOUBLE,   &BLIS_ZERO ) )

#define bli_c0 \
\
	( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) )

#define bli_z0 \
\
	( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) )

#define bli_i0 \
\
	( ( gint_t*   ) bli_obj_buffer_for_const( BLIS_INT,      &BLIS_ZERO ) )

// -1

#define bli_sm1 \
\
	( ( float*    ) bli_obj_buffer_for_const( BLIS_FLOAT,    &BLIS_MINUS_ONE ) )

#define bli_dm1 \
\
	( ( double*   ) bli_obj_buffer_for_const( BLIS_DOUBLE,   &BLIS_MINUS_ONE ) )

#define bli_cm1 \
\
	( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) )

#define bli_zm1 \
\
	( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) )

#define bli_im1 \
\
	( ( gint_t*   ) bli_obj_buffer_for_const( BLIS_INT,      &BLIS_MINUS_ONE ) )


#endif

// end bli_constants.h


// -- Separated scalar macros (separated real/imaginary values) --

// begin bli_absq2ris.h


#ifndef BLIS_ABSQ2RIS_H
#define BLIS_ABSQ2RIS_H

// absq2ris
//
// b := ar^2 ( + ai^2 for complex ), stored as separated real/imag parts.
// The real variants never read ai and never write bi.

#define bli_sabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar); \
}

#define bli_dabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar); \
}

#define bli_cabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar) + (ai) * (ai); \
	(bi) = 0.0F; \
}

#define bli_zabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar) + (ai) * (ai); \
	(bi) = 0.0; \
}

#endif

// end bli_absq2ris.h
// begin bli_abval2ris.h


#ifndef BLIS_ABVAL2RIS_H
#define BLIS_ABVAL2RIS_H

// abval2ris
//
// a := |x|, stored as separated real/imag parts. The real variants use
// fabsf()/fabs() and never write ai. The complex variants compute
//   sqrt( s ) * sqrt( ( xr / s ) * xr + ( xi / s ) * xi ),
// where s = bli_fmaxabs( xr, xi ), with s == 0 handled separately so that
// no division by zero occurs. (The scaling by s is presumably there to
// limit overflow/underflow of the intermediate squares -- confirm.)
// NOTE: the complex variants declare block-local variables named s and mag,
// so the macro arguments must not be expressions that refer to caller
// variables with those names.

#define bli_sabval2ris( xr, xi, ar, ai ) \
{ \
	(ar) = fabsf(xr); \
}

#define bli_dabval2ris( xr, xi, ar, ai ) \
{ \
	(ar) = fabs(xr); \
}

#define bli_cabval2ris( xr, xi, ar, ai ) \
{ \
	float  s   = bli_fmaxabs( (xr), (xi) ); \
	float  mag; \
	if ( s == 0.0F ) mag = 0.0F; \
	else \
	{ \
		mag = sqrtf( s ) * \
		      sqrtf( ( (xr) / s ) * (xr) + \
		             ( (xi) / s ) * (xi) ); \
	} \
	(ar)       = mag; \
	(ai)       = 0.0F; \
}

#define bli_zabval2ris( xr, xi, ar, ai ) \
{ \
	double s   = bli_fmaxabs( (xr), (xi) ); \
	double mag; \
	if ( s == 0.0 ) mag = 0.0; \
	else \
	{ \
		mag = sqrt( s ) * \
		      sqrt( ( (xr) / s ) * (xr) + \
		            ( (xi) / s ) * (xi) ); \
	} \
	(ar)       = mag; \
	(ai)       = 0.0; \
}

#endif

// end bli_abval2ris.h

// begin bli_addris.h


#ifndef BLIS_ADDRIS_H
#define BLIS_ADDRIS_H

// addris
//
// x := x + a on separated real/imag parts. The real variants ignore ai and
// xi.

#define bli_saddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
}

#define bli_daddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
}

#define bli_caddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
	(xi) = (xi) + (ai); \
}

#define bli_zaddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
	(xi) = (xi) + (ai); \
}

#endif

// end bli_addris.h
// begin bli_addjris.h


#ifndef BLIS_ADDJRIS_H
#define BLIS_ADDJRIS_H

// addjris
//
// x := x + conj( a ): forwards to the addris macro of the same type with
// the imaginary part of a negated.

#define bli_saddjris( ar, ai, xr, xi )  bli_saddris( (ar), -(ai), (xr), (xi) )
#define bli_daddjris( ar, ai, xr, xi )  bli_daddris( (ar), -(ai), (xr), (xi) )
#define bli_caddjris( ar, ai, xr, xi )  bli_caddris( (ar), -(ai), (xr), (xi) )
#define bli_zaddjris( ar, ai, xr, xi )  bli_zaddris( (ar), -(ai), (xr), (xi) )

#endif

// end bli_addjris.h

// begin bli_add3ris.h


#ifndef BLIS_ADD3RIS_H
#define BLIS_ADD3RIS_H

// add3ris
//
// c := a + b on separated real/imag parts. The real variants ignore the
// imaginary arguments.

#define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
}

#define bli_dadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
}

#define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
	(ci) = (ai) + (bi); \
}

#define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
	(ci) = (ai) + (bi); \
}

#endif

// end bli_add3ris.h

// begin bli_axpbyris.h


#ifndef BLIS_AXPBYRIS_H
#define BLIS_AXPBYRIS_H

// axpbyris
//
// y := a * x + b * y on separated real/imag parts.
// - rx: real-only update; all imaginary arguments are ignored.
// - cx: full complex update. Both results are computed into temporaries
//   (typed via the __typeof__ compiler extension) before yr/yi are
//   overwritten, because each result reads both yr and yi.

#define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \
{ \
    (yr) = (ar) * (xr) + (br) * (yr); \
}

#define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \
{ \
    const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \
    const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \
    (yr) = yt_r; \
    (yi) = yt_i; \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of b.
// - The fourth char encodes the type of y.

// -- (axby) = (??ss) ----------------------------------------------------------

// NOTE(review): the names in this series are spelled "xpbyris" (without the
// "a") and expand to bli_rxxpbyris, which is not defined anywhere in the
// visible part of this header (the base macro above is bli_rxaxpbyris).
// This looks like a typo; as written, none of these macros can be used.
// Also note that, if the spelling were corrected, the variants with complex
// a AND complex x would drop the (ai)*(xi) contribution to the real result
// when routed to the rx macro -- confirm before enabling.

#define bli_ssssxpbyris  bli_rxxpbyris
#define bli_dsssxpbyris  bli_rxxpbyris
#define bli_csssxpbyris  bli_rxxpbyris
#define bli_zsssxpbyris  bli_rxxpbyris

#define bli_sdssxpbyris  bli_rxxpbyris
#define bli_ddssxpbyris  bli_rxxpbyris
#define bli_cdssxpbyris  bli_rxxpbyris
#define bli_zdssxpbyris  bli_rxxpbyris

#define bli_scssxpbyris  bli_rxxpbyris
#define bli_dcssxpbyris  bli_rxxpbyris
#define bli_ccssxpbyris  bli_rxxpbyris
#define bli_zcssxpbyris  bli_rxxpbyris

#define bli_szssxpbyris  bli_rxxpbyris
#define bli_dzssxpbyris  bli_rxxpbyris
#define bli_czssxpbyris  bli_rxxpbyris
#define bli_zzssxpbyris  bli_rxxpbyris

// NOTE: This series needs to be finished for all other char values for (by), but
// not until something in BLIS actually needs mixed-datatype axpbyris.
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
#define bli_saxpbyjris bli_ssssaxpbyjris #define bli_daxpbyjris bli_ddddaxpbyjris #define bli_caxpbyjris bli_ccccaxpbyjris #define bli_zaxpbyjris bli_zzzzaxpbyjris #endif // end bli_axpbyjris.h // begin bli_axpyris.h #ifndef BLIS_AXPYRIS_H #define BLIS_AXPYRIS_H // axpyris #define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ } #define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr) - (ai) * (xi); \ (yi) += (ai) * (xr) + (ar) * (xi); \ } #define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr) - (ai) * (xi); \ } #define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ar) * (xi); \ } #define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyris bli_rxaxpyris #define bli_dssaxpyris bli_rxaxpyris #define bli_cssaxpyris bli_rxaxpyris #define bli_zssaxpyris bli_rxaxpyris #define bli_sdsaxpyris bli_rxaxpyris #define bli_ddsaxpyris bli_rxaxpyris #define bli_cdsaxpyris bli_rxaxpyris #define bli_zdsaxpyris bli_rxaxpyris #define bli_scsaxpyris bli_rxaxpyris #define bli_dcsaxpyris bli_rxaxpyris #define bli_ccsaxpyris bli_roaxpyris #define bli_zcsaxpyris bli_roaxpyris #define bli_szsaxpyris bli_rxaxpyris #define bli_dzsaxpyris bli_rxaxpyris #define bli_czsaxpyris bli_roaxpyris #define bli_zzsaxpyris bli_roaxpyris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyris bli_rxaxpyris #define bli_dsdaxpyris bli_rxaxpyris #define bli_csdaxpyris bli_rxaxpyris #define bli_zsdaxpyris bli_rxaxpyris #define bli_sddaxpyris bli_rxaxpyris #define bli_dddaxpyris bli_rxaxpyris #define bli_cddaxpyris bli_rxaxpyris #define bli_zddaxpyris bli_rxaxpyris #define bli_scdaxpyris 
bli_rxaxpyris #define bli_dcdaxpyris bli_rxaxpyris #define bli_ccdaxpyris bli_roaxpyris #define bli_zcdaxpyris bli_roaxpyris #define bli_szdaxpyris bli_rxaxpyris #define bli_dzdaxpyris bli_rxaxpyris #define bli_czdaxpyris bli_roaxpyris #define bli_zzdaxpyris bli_roaxpyris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyris bli_rxaxpyris #define bli_dscaxpyris bli_rxaxpyris #define bli_cscaxpyris bli_rcaxpyris #define bli_zscaxpyris bli_rcaxpyris #define bli_sdcaxpyris bli_rxaxpyris #define bli_ddcaxpyris bli_rxaxpyris #define bli_cdcaxpyris bli_rcaxpyris #define bli_zdcaxpyris bli_rcaxpyris #define bli_sccaxpyris bli_craxpyris #define bli_dccaxpyris bli_craxpyris #define bli_cccaxpyris bli_cxaxpyris #define bli_zccaxpyris bli_cxaxpyris #define bli_szcaxpyris bli_craxpyris #define bli_dzcaxpyris bli_craxpyris #define bli_czcaxpyris bli_cxaxpyris #define bli_zzcaxpyris bli_cxaxpyris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyris bli_rxaxpyris #define bli_dszaxpyris bli_rxaxpyris #define bli_cszaxpyris bli_rcaxpyris #define bli_zszaxpyris bli_rcaxpyris #define bli_sdzaxpyris bli_rxaxpyris #define bli_ddzaxpyris bli_rxaxpyris #define bli_cdzaxpyris bli_rcaxpyris #define bli_zdzaxpyris bli_rcaxpyris #define bli_sczaxpyris bli_craxpyris #define bli_dczaxpyris bli_craxpyris #define bli_cczaxpyris bli_cxaxpyris #define bli_zczaxpyris bli_cxaxpyris #define bli_szzaxpyris bli_craxpyris #define bli_dzzaxpyris bli_craxpyris #define bli_czzaxpyris bli_cxaxpyris #define bli_zzzaxpyris bli_cxaxpyris #define bli_saxpyris bli_sssaxpyris #define bli_daxpyris bli_dddaxpyris #define bli_caxpyris bli_cccaxpyris #define bli_zaxpyris bli_zzzaxpyris #endif // end bli_axpyris.h // begin bli_axpyjris.h #ifndef BLIS_AXPYJRIS_H #define BLIS_AXPYJRIS_H // axpyjris #define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ } #define bli_cxaxpyjris( ar, ai, xr, xi, yr, 
yi ) \ { \ (yr) += (ar) * (xr) + (ai) * (xi); \ (yi) += (ai) * (xr) - (ar) * (xi); \ } #define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr) + (ai) * (xi); \ } #define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ar) * -(xi); \ } #define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyjris bli_rxaxpyjris #define bli_dssaxpyjris bli_rxaxpyjris #define bli_cssaxpyjris bli_rxaxpyjris #define bli_zssaxpyjris bli_rxaxpyjris #define bli_sdsaxpyjris bli_rxaxpyjris #define bli_ddsaxpyjris bli_rxaxpyjris #define bli_cdsaxpyjris bli_rxaxpyjris #define bli_zdsaxpyjris bli_rxaxpyjris #define bli_scsaxpyjris bli_rxaxpyjris #define bli_dcsaxpyjris bli_rxaxpyjris #define bli_ccsaxpyjris bli_roaxpyjris #define bli_zcsaxpyjris bli_roaxpyjris #define bli_szsaxpyjris bli_rxaxpyjris #define bli_dzsaxpyjris bli_rxaxpyjris #define bli_czsaxpyjris bli_roaxpyjris #define bli_zzsaxpyjris bli_roaxpyjris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyjris bli_rxaxpyjris #define bli_dsdaxpyjris bli_rxaxpyjris #define bli_csdaxpyjris bli_rxaxpyjris #define bli_zsdaxpyjris bli_rxaxpyjris #define bli_sddaxpyjris bli_rxaxpyjris #define bli_dddaxpyjris bli_rxaxpyjris #define bli_cddaxpyjris bli_rxaxpyjris #define bli_zddaxpyjris bli_rxaxpyjris #define bli_scdaxpyjris bli_rxaxpyjris #define bli_dcdaxpyjris bli_rxaxpyjris #define bli_ccdaxpyjris bli_roaxpyjris #define bli_zcdaxpyjris bli_roaxpyjris #define bli_szdaxpyjris bli_rxaxpyjris #define bli_dzdaxpyjris bli_rxaxpyjris #define bli_czdaxpyjris bli_roaxpyjris #define bli_zzdaxpyjris bli_roaxpyjris // -- (axy) = (??c) 
------------------------------------------------------------ #define bli_sscaxpyjris bli_rxaxpyjris #define bli_dscaxpyjris bli_rxaxpyjris #define bli_cscaxpyjris bli_rcaxpyjris #define bli_zscaxpyjris bli_rcaxpyjris #define bli_sdcaxpyjris bli_rxaxpyjris #define bli_ddcaxpyjris bli_rxaxpyjris #define bli_cdcaxpyjris bli_rcaxpyjris #define bli_zdcaxpyjris bli_rcaxpyjris #define bli_sccaxpyjris bli_craxpyjris #define bli_dccaxpyjris bli_craxpyjris #define bli_cccaxpyjris bli_cxaxpyjris #define bli_zccaxpyjris bli_cxaxpyjris #define bli_szcaxpyjris bli_craxpyjris #define bli_dzcaxpyjris bli_craxpyjris #define bli_czcaxpyjris bli_cxaxpyjris #define bli_zzcaxpyjris bli_cxaxpyjris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyjris bli_rxaxpyjris #define bli_dszaxpyjris bli_rxaxpyjris #define bli_cszaxpyjris bli_rcaxpyjris #define bli_zszaxpyjris bli_rcaxpyjris #define bli_sdzaxpyjris bli_rxaxpyjris #define bli_ddzaxpyjris bli_rxaxpyjris #define bli_cdzaxpyjris bli_rcaxpyjris #define bli_zdzaxpyjris bli_rcaxpyjris #define bli_sczaxpyjris bli_craxpyjris #define bli_dczaxpyjris bli_craxpyjris #define bli_cczaxpyjris bli_cxaxpyjris #define bli_zczaxpyjris bli_cxaxpyjris #define bli_szzaxpyjris bli_craxpyjris #define bli_dzzaxpyjris bli_craxpyjris #define bli_czzaxpyjris bli_cxaxpyjris #define bli_zzzaxpyjris bli_cxaxpyjris #define bli_saxpyjris bli_sssaxpyjris #define bli_daxpyjris bli_dddaxpyjris #define bli_caxpyjris bli_cccaxpyjris #define bli_zaxpyjris bli_zzzaxpyjris #endif // end bli_axpyjris.h // begin bli_axmyris.h #ifndef BLIS_AXMYRIS_H #define BLIS_AXMYRIS_H // axmyris #define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ } #define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ } #define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr) - (ai) * (xi); \ (yi) -= (ai) * (xr) + (ar) * (xi); \ } #define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= 
(ar) * (xr) - (ai) * (xi); \ (yi) -= (ai) * (xr) + (ar) * (xi); \ } #define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ (yi) -= (ar) * (xi); \ } #define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ (yi) -= (ar) * (xi); \ } #endif // end bli_axmyris.h // begin bli_conjris.h #ifndef BLIS_CONJRIS_H #define BLIS_CONJRIS_H // conjris #define bli_sconjris( xr, xi ) \ { \ ; \ } #define bli_dconjris( xr, xi ) \ { \ ; \ } #define bli_cconjris( xr, xi ) \ { \ (xi) = -(xi); \ } #define bli_zconjris( xr, xi ) \ { \ (xi) = -(xi); \ } #endif // end bli_conjris.h // begin bli_copyris.h #ifndef BLIS_COPYRIS_H #define BLIS_COPYRIS_H // copyris #define bli_scopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ } #define bli_dcopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ } #define bli_ccopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ (bi) = (ai); \ } #define bli_zcopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ (bi) = (ai); \ } #define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi ) #define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi ) #define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) #define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) #define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi ) #define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi ) #define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) #define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) #define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi ) #define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi ) #define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) #define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) #define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi ) #define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi ) #define bli_czcopyris( ar, ai, 
br, bi ) bli_zcopyris( ar, ai, br, bi ) #define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi ) #endif // end bli_copyris.h // begin bli_copyjris.h #ifndef BLIS_COPYJRIS_H #define BLIS_COPYJRIS_H // copyjris #define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) ) #define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) ) #define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) ) #define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) ) #define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi ) #define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi ) #define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) #define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) #define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi ) #define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi ) #define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) #define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) #define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi ) #define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi ) #define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) #define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) #define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi ) #define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi ) #define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) #define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) #endif // end bli_copyjris.h // begin bli_copycjris.h #ifndef BLIS_COPYCJRIS_H #define BLIS_COPYCJRIS_H // copycjris #define bli_scopycjris( conj, xr, xi, yr, yi ) \ { \ bli_scopyris( (xr), (xi), (yr), (yi) ); \ } #define bli_dcopycjris( conj, xr, xi, yr, yi ) \ { \ 
bli_dcopyris( (xr), (xi), (yr), (yi) ); \ } #define bli_ccopycjris( conj, xr, xi, yr, yi ) \ { \ (yr) = (xr); \ (yi) = ( bli_is_conj( conj ) ? -(xi) \ : (xi) ); \ } #define bli_zcopycjris( conj, xr, xi, yr, yi ) \ { \ (yr) = (xr); \ (yi) = ( bli_is_conj( conj ) ? -(xi) \ : (xi) ); \ } #define bli_icopycjris( conj, xr, xi, yr, yi ) \ { \ bli_icopyris( (xr), (xi), (yr), (yi) ); \ } #endif // end bli_copycjris.h // begin bli_eqris.h #ifndef BLIS_EQRIS_H #define BLIS_EQRIS_H // eqris (passed by value) #define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) ) #define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) ) #define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) ) #define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) ) #define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) ) // eq1ris #define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F ) #define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 ) #define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F ) #define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 ) #define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 ) // eq0ris #define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F ) #define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 ) #define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F ) #define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 ) #define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 ) // eqm1ris #define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F ) #define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 ) #define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F ) #define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 ) #define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 ) #endif // end bli_eqris.h // begin bli_invertris.h #ifndef BLIS_INVERTRIS_H #define BLIS_INVERTRIS_H // invertris #define bli_sinvertris( xr, xi ) \ { \ (xr) = 1.0F / (xr); \ } 
#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2ris bli_rxscal2ris #define bli_dssscal2ris bli_rxscal2ris #define bli_cssscal2ris bli_rxscal2ris #define bli_zssscal2ris bli_rxscal2ris #define bli_sdsscal2ris bli_rxscal2ris #define bli_ddsscal2ris bli_rxscal2ris #define bli_cdsscal2ris bli_rxscal2ris #define bli_zdsscal2ris bli_rxscal2ris #define bli_scsscal2ris bli_rxscal2ris #define bli_dcsscal2ris bli_rxscal2ris #define bli_ccsscal2ris bli_roscal2ris #define bli_zcsscal2ris bli_roscal2ris #define bli_szsscal2ris bli_rxscal2ris #define bli_dzsscal2ris bli_rxscal2ris #define bli_czsscal2ris bli_roscal2ris #define bli_zzsscal2ris bli_roscal2ris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2ris bli_rxscal2ris #define bli_dsdscal2ris bli_rxscal2ris #define bli_csdscal2ris bli_rxscal2ris #define bli_zsdscal2ris bli_rxscal2ris #define bli_sddscal2ris bli_rxscal2ris #define bli_dddscal2ris bli_rxscal2ris #define bli_cddscal2ris bli_rxscal2ris #define bli_zddscal2ris bli_rxscal2ris #define bli_scdscal2ris bli_rxscal2ris #define bli_dcdscal2ris bli_rxscal2ris #define bli_ccdscal2ris bli_roscal2ris #define bli_zcdscal2ris bli_roscal2ris #define bli_szdscal2ris bli_rxscal2ris #define bli_dzdscal2ris bli_rxscal2ris #define bli_czdscal2ris bli_roscal2ris #define bli_zzdscal2ris bli_roscal2ris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2ris bli_rxscal2ris #define bli_dscscal2ris bli_rxscal2ris #define bli_cscscal2ris bli_rcscal2ris #define bli_zscscal2ris bli_rcscal2ris #define bli_sdcscal2ris bli_rxscal2ris #define bli_ddcscal2ris bli_rxscal2ris #define bli_cdcscal2ris bli_rcscal2ris #define bli_zdcscal2ris bli_rcscal2ris #define bli_sccscal2ris bli_crscal2ris #define bli_dccscal2ris bli_crscal2ris #define bli_cccscal2ris bli_cxscal2ris #define bli_zccscal2ris bli_cxscal2ris #define bli_szcscal2ris bli_crscal2ris 
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_scsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_szsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_csdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( 
ar, ai, xr, xi, yr, yi ) #define bli_sddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_scdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_szdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi ) // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sccscal2jris( ar, ai, 
xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_szcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_szzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) 
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = (xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyris bli_rxxpbyris #define bli_dssxpbyris bli_rxxpbyris #define bli_cssxpbyris bli_rxxpbyris #define bli_zssxpbyris bli_rxxpbyris #define bli_sdsxpbyris bli_rxxpbyris #define bli_ddsxpbyris bli_rxxpbyris #define bli_cdsxpbyris bli_rxxpbyris #define bli_zdsxpbyris bli_rxxpbyris #define bli_scsxpbyris bli_rxxpbyris #define bli_dcsxpbyris bli_rxxpbyris #define bli_ccsxpbyris bli_rxxpbyris #define bli_zcsxpbyris bli_rxxpbyris #define bli_szsxpbyris bli_rxxpbyris #define bli_dzsxpbyris bli_rxxpbyris #define bli_czsxpbyris bli_rxxpbyris #define bli_zzsxpbyris bli_rxxpbyris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyris bli_rxxpbyris #define bli_dsdxpbyris bli_rxxpbyris #define bli_csdxpbyris bli_rxxpbyris #define bli_zsdxpbyris bli_rxxpbyris #define bli_sddxpbyris bli_rxxpbyris #define bli_dddxpbyris bli_rxxpbyris #define bli_cddxpbyris bli_rxxpbyris #define bli_zddxpbyris bli_rxxpbyris #define bli_scdxpbyris bli_rxxpbyris #define bli_dcdxpbyris bli_rxxpbyris #define bli_ccdxpbyris bli_rxxpbyris #define bli_zcdxpbyris bli_rxxpbyris #define bli_szdxpbyris bli_rxxpbyris #define bli_dzdxpbyris bli_rxxpbyris #define bli_czdxpbyris bli_rxxpbyris #define bli_zzdxpbyris bli_rxxpbyris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyris bli_rxxpbyris #define bli_dscxpbyris 
bli_rxxpbyris #define bli_cscxpbyris bli_crxpbyris #define bli_zscxpbyris bli_crxpbyris #define bli_sdcxpbyris bli_rxxpbyris #define bli_ddcxpbyris bli_rxxpbyris #define bli_cdcxpbyris bli_crxpbyris #define bli_zdcxpbyris bli_crxpbyris #define bli_sccxpbyris bli_cxxpbyris #define bli_dccxpbyris bli_cxxpbyris #define bli_cccxpbyris bli_cxxpbyris #define bli_zccxpbyris bli_cxxpbyris #define bli_szcxpbyris bli_cxxpbyris #define bli_dzcxpbyris bli_cxxpbyris #define bli_czcxpbyris bli_cxxpbyris #define bli_zzcxpbyris bli_cxxpbyris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyris bli_rxxpbyris #define bli_dszxpbyris bli_rxxpbyris #define bli_cszxpbyris bli_crxpbyris #define bli_zszxpbyris bli_crxpbyris #define bli_sdzxpbyris bli_rxxpbyris #define bli_ddzxpbyris bli_rxxpbyris #define bli_cdzxpbyris bli_crxpbyris #define bli_zdzxpbyris bli_crxpbyris #define bli_sczxpbyris bli_cxxpbyris #define bli_dczxpbyris bli_cxxpbyris #define bli_cczxpbyris bli_cxxpbyris #define bli_zczxpbyris bli_cxxpbyris #define bli_szzxpbyris bli_cxxpbyris #define bli_dzzxpbyris bli_cxxpbyris #define bli_czzxpbyris bli_cxxpbyris #define bli_zzzxpbyris bli_cxxpbyris #define bli_sxpbyris bli_sssxpbyris #define bli_dxpbyris bli_dddxpbyris #define bli_cxpbyris bli_cccxpbyris #define bli_zxpbyris bli_zzzxpbyris #endif // end bli_xpbyris.h // begin bli_xpbyjris.h #ifndef BLIS_XPBYJRIS_H #define BLIS_XPBYJRIS_H // xpbyjris #define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type 
of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjris bli_rxxpbyjris #define bli_dssxpbyjris bli_rxxpbyjris #define bli_cssxpbyjris bli_rxxpbyjris #define bli_zssxpbyjris bli_rxxpbyjris #define bli_sdsxpbyjris bli_rxxpbyjris #define bli_ddsxpbyjris bli_rxxpbyjris #define bli_cdsxpbyjris bli_rxxpbyjris #define bli_zdsxpbyjris bli_rxxpbyjris #define bli_scsxpbyjris bli_rxxpbyjris #define bli_dcsxpbyjris bli_rxxpbyjris #define bli_ccsxpbyjris bli_rxxpbyjris #define bli_zcsxpbyjris bli_rxxpbyjris #define bli_szsxpbyjris bli_rxxpbyjris #define bli_dzsxpbyjris bli_rxxpbyjris #define bli_czsxpbyjris bli_rxxpbyjris #define bli_zzsxpbyjris bli_rxxpbyjris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjris bli_rxxpbyjris #define bli_dsdxpbyjris bli_rxxpbyjris #define bli_csdxpbyjris bli_rxxpbyjris #define bli_zsdxpbyjris bli_rxxpbyjris #define bli_sddxpbyjris bli_rxxpbyjris #define bli_dddxpbyjris bli_rxxpbyjris #define bli_cddxpbyjris bli_rxxpbyjris #define bli_zddxpbyjris bli_rxxpbyjris #define bli_scdxpbyjris bli_rxxpbyjris #define bli_dcdxpbyjris bli_rxxpbyjris #define bli_ccdxpbyjris bli_rxxpbyjris #define bli_zcdxpbyjris bli_rxxpbyjris #define bli_szdxpbyjris bli_rxxpbyjris #define bli_dzdxpbyjris bli_rxxpbyjris #define bli_czdxpbyjris bli_rxxpbyjris #define bli_zzdxpbyjris bli_rxxpbyjris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjris bli_rxxpbyjris #define bli_dscxpbyjris bli_rxxpbyjris #define bli_cscxpbyjris bli_crxpbyjris #define bli_zscxpbyjris bli_crxpbyjris #define bli_sdcxpbyjris bli_rxxpbyjris #define bli_ddcxpbyjris bli_rxxpbyjris #define bli_cdcxpbyjris bli_crxpbyjris #define bli_zdcxpbyjris bli_crxpbyjris #define bli_sccxpbyjris bli_cxxpbyjris #define bli_dccxpbyjris bli_cxxpbyjris #define bli_cccxpbyjris 
bli_cxxpbyjris #define bli_zccxpbyjris bli_cxxpbyjris #define bli_szcxpbyjris bli_cxxpbyjris #define bli_dzcxpbyjris bli_cxxpbyjris #define bli_czcxpbyjris bli_cxxpbyjris #define bli_zzcxpbyjris bli_cxxpbyjris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjris bli_rxxpbyjris #define bli_dszxpbyjris bli_rxxpbyjris #define bli_cszxpbyjris bli_crxpbyjris #define bli_zszxpbyjris bli_crxpbyjris #define bli_sdzxpbyjris bli_rxxpbyjris #define bli_ddzxpbyjris bli_rxxpbyjris #define bli_cdzxpbyjris bli_crxpbyjris #define bli_zdzxpbyjris bli_crxpbyjris #define bli_sczxpbyjris bli_cxxpbyjris #define bli_dczxpbyjris bli_cxxpbyjris #define bli_cczxpbyjris bli_cxxpbyjris #define bli_zczxpbyjris bli_cxxpbyjris #define bli_szzxpbyjris bli_cxxpbyjris #define bli_dzzxpbyjris bli_cxxpbyjris #define bli_czzxpbyjris bli_cxxpbyjris #define bli_zzzxpbyjris bli_cxxpbyjris #define bli_sxpbyjris bli_sssxpbyjris #define bli_dxpbyjris bli_dddxpbyjris #define bli_cxpbyjris bli_cccxpbyjris #define bli_zxpbyjris bli_zzzxpbyjris #endif // end bli_xpbyjris.h // Inlined scalar macros in loops // begin bli_scal2ris_mxn.h #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j 
)*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif // end bli_scal2ris_mxn.h // begin bli_scalris_mxn_uplo.h #ifndef 
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) ) #define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) ) #define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zcabsq2s( x, a ) 
bli_zcsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) ) #define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) ) #define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a ) #define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a ) #define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a ) #define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a ) #endif // end bli_absq2s.h // begin bli_abval2s.h #ifndef BLIS_ABVAL2S_H #define BLIS_ABVAL2S_H // abval2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabval2s( x, a ) 
bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) ) #define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) ) #define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) ) #define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) ) #define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) ) #define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) ) #define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) ) #define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) ) #define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) ) #define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) ) #define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) ) #define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) ) #define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) ) #define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) ) #define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) ) #define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabval2s( x, a ) bli_ssabval2s( x, a ) #define bli_dabval2s( x, a ) bli_ddabval2s( x, a ) #define bli_cabval2s( x, a ) bli_ccabval2s( x, a ) #define bli_zabval2s( x, a ) bli_zzabval2s( x, a ) #endif // end bli_abval2s.h // begin bli_adds.h #ifndef BLIS_ADDS_H #define BLIS_ADDS_H // adds // Notes: // - The first char 
encodes the type of a. // - The second char encodes the type of y. #define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) { (y) += (a); } #define bli_dcadds( a, y ) { (y) += (a); } #define bli_ccadds( a, y ) { (y) += (a); } #define bli_zcadds( a, y ) { (y) += (a); } #define bli_szadds( a, y ) { (y) += (a); } #define bli_dzadds( a, y ) { (y) += (a); } #define bli_czadds( a, y ) { (y) += (a); } #define 
bli_zzadds( a, y ) { (y) += (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadds( a, y ) bli_ssadds( a, y ) #define bli_dadds( a, y ) bli_ddadds( a, y ) #define bli_cadds( a, y ) bli_ccadds( a, y ) #define bli_zadds( a, y ) bli_zzadds( a, y ) #endif // end bli_adds.h // begin bli_addjs.h #ifndef BLIS_ADDJS_H #define BLIS_ADDJS_H // addjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzaddjs( a, y ) 
bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) { (y) += (a); } #define bli_dcaddjs( a, y ) { (y) += (a); } #define bli_ccaddjs( a, y ) { (y) += conjf(a); } #define bli_zcaddjs( a, y ) { (y) += conj (a); } #define bli_szaddjs( a, y ) { (y) += (a); } #define bli_dzaddjs( a, y ) { (y) += (a); } #define bli_czaddjs( a, y ) { (y) += conjf(a); } #define bli_zzaddjs( a, y ) { (y) += conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saddjs( a, y ) bli_ssaddjs( a, y ) #define bli_daddjs( a, y ) bli_ddaddjs( a, y ) #define bli_caddjs( a, y ) bli_ccaddjs( a, y ) #define bli_zaddjs( a, y ) bli_zzaddjs( a, y ) #endif // end bli_addjs.h // begin bli_add3s.h #ifndef BLIS_ADD3S_H #define BLIS_ADD3S_H // add3s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of b. // - The third char encodes the type of c. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), 
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )

#define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )

#define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )

// The variants whose destination c is complex (c or z) have two
// implementations, selected by BLIS_ENABLE_C99_COMPLEX:
// - not defined: each macro passes the real and imaginary accessors of
//   a, b and c (in that order) to a bli_?add3ris macro, which is defined
//   outside this section.
// - defined: each macro expands to a plain assignment (c) = (a) + (b).
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------
// When a and b are both real (s or d) these forward to bli_sadd3ris;
// every other combination forwards to bli_cadd3ris.

#define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )

#define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )

#define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )

#define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )

// -- (axy) = (??z) ------------------------------------------------------------
// When a and b are both real (s or d) these forward to bli_dadd3ris;
// every other combination forwards to bli_zadd3ris.

#define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )

#define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )

#define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )

#define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// In this branch every variant has the same body: the sum of a and b is
// assigned to c with the built-in operators, whatever the operand types.
// NOTE(review): each macro expands to a bare { ... } block rather than a
// do { ... } while (0) statement, so `if (p) bli_cccadd3s( a, b, c ); else ...`
// would not parse -- confirm callers only use these in braced contexts.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zscadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: bli_?add3s forwards to the variant in which a, b
// and c all have that one type.
#define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c )
#define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c )
#define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c )
#define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c )

#endif
// end bli_add3s.h
// begin bli_axpbys.h

#ifndef BLIS_AXPBYS_H
#define BLIS_AXPBYS_H

// axpbys

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), 
bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), 
bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), 
bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z): y is accessed via bli_zreal/bli_zimag; a, x and b each range over s, d, c, z; every macro forwards the real/imag parts of a, x, b, y to bli_cxaxpbyris -- #define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c): in this branch every macro expands to the single statement y = a * x + b * y on the operands as passed; the four type chars only distinguish the macro names -- #define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccscaxpbys( 
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } // -- (axby) = (???z): names whose fourth type char is z; each macro expands to the single statement y = a * x + b * y on the operands as passed -- #define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX // Same-type shorthands: bli_?axpbys forwards to the macro whose four type chars are all ?. #define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y ) #define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y ) #define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y ) #define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y ) #endif // end bli_axpbys.h // begin bli_axpbyjs.h #ifndef BLIS_AXPBYJS_H #define BLIS_AXPBYJS_H // axpbyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // - Each macro below passes the real and imaginary parts of a, x, b and y (in that order) to bli_rxaxpbyjris. // - NOTE(review): the "j" presumably marks a conjugating variant of axpbys; confirm against the definition of bli_rxaxpbyjris. // -- (axby) = (???s): y is accessed via bli_sreal/bli_simag -- #define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), 
bli_simag(y) ) #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d): y is accessed via bli_dreal/bli_dimag; each macro passes the real and imaginary parts of a, x, b and y to bli_rxaxpbyjris -- #define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) #define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) #define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) #define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) #endif // end bli_axpbyjs.h // begin bli_axpys.h #ifndef BLIS_AXPYS_H #define BLIS_AXPYS_H // axpys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), 
bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } #define 
bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) #define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) #define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) #define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) #endif // end bli_axpys.h // begin bli_axpyjs.h #ifndef BLIS_AXPYJS_H #define BLIS_AXPYJS_H // axpyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( 
bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), 
bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) #define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpyjs( a, x, y ) 
bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y ) #define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y ) #define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y ) #define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y ) #endif // end bli_axpyjs.h // begin bli_axmys.h #ifndef BLIS_AXMYS_H #define BLIS_AXMYS_H // axmys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// axmys scalar macros: bli_<a><x><y>axmys( a, x, y ), where the three prefix
// letters give the types (s, d, c, z) of a, x and y respectively.
// - y of type s or d: always forwards the real/imag accessors of all three
//   operands to bli_saxmyris() / bli_daxmyris().
// - y of type c or z, BLIS_ENABLE_C99_COMPLEX not defined: forwards to an
//   *axmyris() helper picked by the domains of a and x: s/d when both are
//   real, sc/dz when a is real and x is complex, c/z whenever a is complex.
// - y of type c or z, BLIS_ENABLE_C99_COMPLEX defined: expands directly to
//   { (y) -= (a) * (x); }.
// NOTE(review): the bli_*axmyris() helpers are defined outside this section;
// presumably they apply the same y -= a * x update to separate real and
// imaginary parts -- confirm against their definitions.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), 
bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxmys( a, x, 
y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), 
bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxmys( 
a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); }
// Remaining C99-complex axmys definitions for y of type z (each expands to
// { (y) -= (a) * (x); }), then the same-type shorthands bli_[sdcz]axmys(),
// which map to the sss/ddd/ccc/zzz variants, and the end of bli_axmys.h.
// bli_conjs.h follows: bli_[sdcz]conjs( x ). The s and d forms forward to
// bli_sconjris() / bli_dconjris(). The c and z forms forward to
// bli_cconjris() / bli_zconjris() unless BLIS_ENABLE_C99_COMPLEX is defined,
// in which case they assign conjf(x) / conj(x) back to x.
// The line ends with the include guard and naming notes of bli_copys.h.
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y ) #define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y ) #define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y ) #define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y ) #endif // end bli_axmys.h // begin bli_conjs.h #ifndef BLIS_CONJS_H #define BLIS_CONJS_H // conjs #define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) ) #define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) ) #define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) { (x) = conjf(x); } #define bli_zconjs( x ) { (x) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_conjs.h // begin bli_copys.h #ifndef BLIS_COPYS_H #define BLIS_COPYS_H // copys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y.
// copys macros bli_<x><y>copys( x, y ) for y of type s, d and c. Each one
// forwards the real/imag accessors of x and y to bli_scopyris(),
// bli_dcopyris() or bli_ccopyris(); the helper is chosen by the type of y
// alone, regardless of the type of x.
// The NOTE about zcopyris() at the end of this line applies to the z-target
// macros that come next in the file.
#define bli_sscopys( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopys( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopys( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopys( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopys( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopys( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopys( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopys( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero. #define bli_sccopys( x, y ) bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopys( x, y ) bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopys( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopys( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
// End of bli_copys.h: the z-target copys macros (all forward to
// bli_zcopyris()), the integer form bli_iicopys(), which assigns x cast to
// gint_t, and the same-type shorthands bli_[sdczi]copys().
// bli_copyjs.h follows: bli_<x><y>copyjs( x, y ). For y of type s or d the
// macros forward to bli_scopyjris() / bli_dcopyjris(). For y of type c or z
// they forward to bli_ccopyjris() / bli_zcopyjris() unless
// BLIS_ENABLE_C99_COMPLEX is defined; in that case a real x (s, d) is
// assigned as-is and a complex x is assigned through conjf() (x of type c) or
// conj() (x of type z). bli_iicopyjs() is the same gint_t cast as
// bli_iicopys().
// The last line ends with the include guard and naming notes of
// bli_copycjs.h.
#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopys( x, y ) bli_sscopys( x, y ) #define bli_dcopys( x, y ) bli_ddcopys( x, y ) #define bli_ccopys( x, y ) bli_cccopys( x, y ) #define bli_zcopys( x, y ) bli_zzcopys( x, y ) #define bli_icopys( x, y ) bli_iicopys( x, y ) #endif // end bli_copys.h // begin bli_copyjs.h #ifndef BLIS_COPYJS_H #define BLIS_COPYJS_H // copyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), 
bli_creal(y), bli_cimag(y) ) #define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) { (y) = (x); } #define bli_dccopyjs( x, y ) { (y) = (x); } #define bli_cccopyjs( x, y ) { (y) = conjf(x); } #define bli_zccopyjs( x, y ) { (y) = conj (x); } #define bli_szcopyjs( x, y ) { (y) = (x); } #define bli_dzcopyjs( x, y ) { (y) = (x); } #define bli_czcopyjs( x, y ) { (y) = conjf(x); } #define bli_zzcopyjs( x, y ) { (y) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjs( x, y ) bli_sscopyjs( x, y ) #define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y ) #define bli_ccopyjs( x, y ) bli_cccopyjs( x, y ) #define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y ) #define bli_icopyjs( x, y ) bli_iicopyjs( x, y ) #endif // end bli_copyjs.h // begin bli_copycjs.h #ifndef BLIS_COPYCJS_H #define BLIS_COPYCJS_H // copycjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y.
// copycjs macros bli_<x><y>copycjs( conjx, x, y ): copy x into y with the
// conjugation of x selected at run time by conjx.
// For y of type s or d the macros pass conjx plus the real/imag accessors of
// x and y to bli_scopycjris() / bli_dcopycjris(). For y of type c or z, when
// BLIS_ENABLE_C99_COMPLEX is not defined, they forward the same way to
// bli_ccopycjris() / bli_zcopycjris(). The line ends with the first macro of
// the C99-complex branch.
// NOTE(review): the bli_*copycjris() helpers are defined outside this
// section; how they interpret conjx should be confirmed there.
#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) { (y) = (x); }
// Rest of the C99-complex branch of copycjs for y of type c or z. A real x
// (s, d) is assigned unchanged and conjx is not used; a complex x is assigned
// as conjf(x) / conj(x) when bli_is_conj( conjx ) is true and as x otherwise.
// After the #endif: bli_iicopycjs() casts x to gint_t and does not use conjx,
// and bli_[sdczi]copycjs() are the same-type shorthands.
// The line ends with the include guard and naming notes of bli_copynzs.h.
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); } #define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #define bli_szcopycjs( conjx, x, y ) { (y) = (x); } #define bli_dzcopycjs( conjx, x, y ) { (y) = (x); } #define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); } #define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y ) #define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y ) #define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y ) #define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y ) #define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y ) #endif // end bli_copycjs.h // begin bli_copynzs.h #ifndef BLIS_COPYNZS_H #define BLIS_COPYNZS_H // copynzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y.
// copynzs macros bli_<x><y>copynzs( x, y ) for y of type s, d and c. They
// reuse the plain bli_*copyris() helpers. For a real or complex x copied into
// a real y, and for a complex x copied into a complex y, the helper matches
// the type of y. For a real x copied into a complex y the real-domain helper
// bli_scopyris() is used instead of bli_ccopyris(), which per the existing
// NOTE leaves the imaginary part of y untouched.
// The NOTE about dcopyris() at the end of this line applies to the z-target
// macros that come next in the file.
#define bli_sscopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopynzs( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopynzs( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopynzs( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopynzs( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of scopyris() is so we don't touch the imaginary part of y. #define bli_sccopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopynzs( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopynzs( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
// End of bli_copynzs.h: the z-target copynzs macros (a real x goes through
// bli_dcopyris(), a complex x through bli_zcopyris()), the integer form
// bli_iicopynzs(), which assigns x cast to gint_t, and the same-type
// shorthands bli_[sdczi]copynzs().
// bli_copyjnzs.h follows: bli_<x><y>copyjnzs( x, y ). This line holds the
// macros for y of type s and d, which forward to bli_scopyjris() /
// bli_dcopyjris(). The two-part NOTE at the end applies to the c-target
// macros that come next in the file.
#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopynzs( x, y ) bli_sscopynzs( x, y ) #define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y ) #define bli_ccopynzs( x, y ) bli_cccopynzs( x, y ) #define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y ) #define bli_icopynzs( x, y ) bli_iicopynzs( x, y ) #endif // end bli_copynzs.h // begin bli_copyjnzs.h #ifndef BLIS_COPYJNZS_H #define BLIS_COPYJNZS_H // copyjnzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we // don't touch the imaginary part of y.
// copyjnzs macros for y of type c and z. A real x (s, d) is forwarded to the
// real-domain helper bli_scopyjris() / bli_dcopyjris(), which per the
// existing NOTEs leaves the imaginary part of y untouched; a complex x is
// forwarded to bli_ccopyjris() / bli_zcopyjris().
// Then the integer form bli_iicopyjnzs(), which assigns x cast to gint_t, and
// the same-type shorthands bli_[sdczi]copyjnzs().
// The line ends with the include guard and naming notes of bli_dots.h.
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we // don't touch the imaginary part of y. #define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y ) #define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y ) #define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y ) #define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y ) #define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y ) #endif // end bli_copyjnzs.h // begin bli_dots.h #ifndef BLIS_DOTS_H #define BLIS_DOTS_H // dots // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho.
#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a ) #define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a ) #define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a ) #define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a ) #define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a ) #define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a ) #define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a ) #define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a ) #define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a ) #define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a ) #define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a ) #define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a ) #define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a ) #define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a ) #define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a ) #define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a ) #define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a ) #define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a ) #define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a ) #define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a ) #define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a ) #define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a ) #define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a ) #define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a ) #define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a ) #define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a ) #define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a ) #define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a ) #define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a ) #define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a ) #define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a ) #define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a ) #define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a ) #define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a ) #define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a ) #define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a ) #define 
bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a ) #define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a ) #define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a ) #define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a ) #define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a ) #define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a ) #define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a ) #define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a ) #define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a ) #define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a ) #define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a ) #define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a ) #define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a ) #define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a ) #define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a ) #define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a ) #define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a ) #define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a ) #define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a ) #define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a ) #define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a ) #define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a ) #define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a ) #define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a ) #define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a ) #define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a ) #define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a ) #define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a ) #define bli_sdots( x, y, a ) bli_sssdots( x, y, a ) #define bli_ddots( x, y, a ) bli_ddddots( x, y, a ) #define bli_cdots( x, y, a ) bli_cccdots( x, y, a ) #define bli_zdots( x, y, a ) bli_zzzdots( x, y, a ) #endif // end bli_dots.h // begin bli_dotjs.h #ifndef BLIS_DOTJS_H #define BLIS_DOTJS_H // dotjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
// - x is used in conjugated form.

// Every bli_<tx><ty><tr>dotjs( x, y, a ) macro expands to
// bli_<ty><tx><tr>axpyjs( y, x, a ): the first two type characters and the
// first two arguments are swapped together, so each argument stays paired
// with its own type character.
// NOTE(review): bli_*axpyjs is defined outside this section -- confirm its
// operand roles in bli_axpyjs.h.

// Third type character 's'.
#define bli_sssdotjs( x, y, a )  bli_sssaxpyjs( y, x, a )
#define bli_dssdotjs( x, y, a )  bli_sdsaxpyjs( y, x, a )
#define bli_cssdotjs( x, y, a )  bli_scsaxpyjs( y, x, a )
#define bli_zssdotjs( x, y, a )  bli_szsaxpyjs( y, x, a )

#define bli_sdsdotjs( x, y, a )  bli_dssaxpyjs( y, x, a )
#define bli_ddsdotjs( x, y, a )  bli_ddsaxpyjs( y, x, a )
#define bli_cdsdotjs( x, y, a )  bli_dcsaxpyjs( y, x, a )
#define bli_zdsdotjs( x, y, a )  bli_dzsaxpyjs( y, x, a )

#define bli_scsdotjs( x, y, a )  bli_cssaxpyjs( y, x, a )
#define bli_dcsdotjs( x, y, a )  bli_cdsaxpyjs( y, x, a )
#define bli_ccsdotjs( x, y, a )  bli_ccsaxpyjs( y, x, a )
#define bli_zcsdotjs( x, y, a )  bli_czsaxpyjs( y, x, a )

#define bli_szsdotjs( x, y, a )  bli_zssaxpyjs( y, x, a )
#define bli_dzsdotjs( x, y, a )  bli_zdsaxpyjs( y, x, a )
#define bli_czsdotjs( x, y, a )  bli_zcsaxpyjs( y, x, a )
#define bli_zzsdotjs( x, y, a )  bli_zzsaxpyjs( y, x, a )

// Third type character 'd'.
#define bli_ssddotjs( x, y, a )  bli_ssdaxpyjs( y, x, a )
#define bli_dsddotjs( x, y, a )  bli_sddaxpyjs( y, x, a )
#define bli_csddotjs( x, y, a )  bli_scdaxpyjs( y, x, a )
#define bli_zsddotjs( x, y, a )  bli_szdaxpyjs( y, x, a )

#define bli_sdddotjs( x, y, a )  bli_dsdaxpyjs( y, x, a )
#define bli_ddddotjs( x, y, a )  bli_dddaxpyjs( y, x, a )
#define bli_cdddotjs( x, y, a )  bli_dcdaxpyjs( y, x, a )
#define bli_zdddotjs( x, y, a )  bli_dzdaxpyjs( y, x, a )

#define bli_scddotjs( x, y, a )  bli_csdaxpyjs( y, x, a )
#define bli_dcddotjs( x, y, a )  bli_cddaxpyjs( y, x, a )
#define bli_ccddotjs( x, y, a )  bli_ccdaxpyjs( y, x, a )
#define bli_zcddotjs( x, y, a )  bli_czdaxpyjs( y, x, a )

#define bli_szddotjs( x, y, a )  bli_zsdaxpyjs( y, x, a )
#define bli_dzddotjs( x, y, a )  bli_zddaxpyjs( y, x, a )
#define bli_czddotjs( x, y, a )  bli_zcdaxpyjs( y, x, a )
#define bli_zzddotjs( x, y, a )  bli_zzdaxpyjs( y, x, a )

// Third type character 'c'.
#define bli_sscdotjs( x, y, a )  bli_sscaxpyjs( y, x, a )
#define bli_dscdotjs( x, y, a )  bli_sdcaxpyjs( y, x, a )
#define bli_cscdotjs( x, y, a )  bli_sccaxpyjs( y, x, a )
#define bli_zscdotjs( x, y, a )  bli_szcaxpyjs( y, x, a )

#define bli_sdcdotjs( x, y, a )  bli_dscaxpyjs( y, x, a )
#define bli_ddcdotjs( x, y, a )  bli_ddcaxpyjs( y, x, a )
#define bli_cdcdotjs( x, y, a )  bli_dccaxpyjs( y, x, a )
#define bli_zdcdotjs( x, y, a )  bli_dzcaxpyjs( y, x, a )

#define bli_sccdotjs( x, y, a )  bli_cscaxpyjs( y, x, a )
#define bli_dccdotjs( x, y, a )  bli_cdcaxpyjs( y, x, a )
#define bli_cccdotjs( x, y, a )  bli_cccaxpyjs( y, x, a )
#define bli_zccdotjs( x, y, a )  bli_czcaxpyjs( y, x, a )

#define bli_szcdotjs( x, y, a )  bli_zscaxpyjs( y, x, a )
#define bli_dzcdotjs( x, y, a )  bli_zdcaxpyjs( y, x, a )
#define bli_czcdotjs( x, y, a )  bli_zccaxpyjs( y, x, a )
#define bli_zzcdotjs( x, y, a )  bli_zzcaxpyjs( y, x, a )

// Third type character 'z'.
#define bli_sszdotjs( x, y, a )  bli_sszaxpyjs( y, x, a )
#define bli_dszdotjs( x, y, a )  bli_sdzaxpyjs( y, x, a )
#define bli_cszdotjs( x, y, a )  bli_sczaxpyjs( y, x, a )
#define bli_zszdotjs( x, y, a )  bli_szzaxpyjs( y, x, a )

#define bli_sdzdotjs( x, y, a )  bli_dszaxpyjs( y, x, a )
#define bli_ddzdotjs( x, y, a )  bli_ddzaxpyjs( y, x, a )
#define bli_cdzdotjs( x, y, a )  bli_dczaxpyjs( y, x, a )
#define bli_zdzdotjs( x, y, a )  bli_dzzaxpyjs( y, x, a )

#define bli_sczdotjs( x, y, a )  bli_cszaxpyjs( y, x, a )
#define bli_dczdotjs( x, y, a )  bli_cdzaxpyjs( y, x, a )
#define bli_cczdotjs( x, y, a )  bli_cczaxpyjs( y, x, a )
#define bli_zczdotjs( x, y, a )  bli_czzaxpyjs( y, x, a )

#define bli_szzdotjs( x, y, a )  bli_zszaxpyjs( y, x, a )
#define bli_dzzdotjs( x, y, a )  bli_zdzaxpyjs( y, x, a )
#define bli_czzdotjs( x, y, a )  bli_zczaxpyjs( y, x, a )
#define bli_zzzdotjs( x, y, a )  bli_zzzaxpyjs( y, x, a )

// Single-letter shorthands: all three operands share one type.
#define bli_sdotjs( x, y, a )  bli_sssdotjs( x, y, a )
#define bli_ddotjs( x, y, a )  bli_ddddotjs( x, y, a )
#define bli_cdotjs( x, y, a )  bli_cccdotjs( x, y, a )
#define bli_zdotjs( x, y, a )  bli_zzzdotjs( x, y, a )

#endif
// end bli_dotjs.h
// begin bli_eq.h

#ifndef BLIS_EQ_H
#define BLIS_EQ_H

// eq (passed by value)
// Each macro is an expression that is non-zero when a and b compare equal.

#define bli_seq( a, b )  ( (a) == (b) )
#define bli_deq( a, b )  ( (a) == (b) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex: compare the real parts and the imaginary parts
// separately through the bli_?real / bli_?imag accessors.
#define bli_ceq( a, b )  ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) )
#define bli_zeq( a, b )  ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex: the operands are compared directly with ==.
#define bli_ceq( a, b )  ( (a) == (b) )
#define bli_zeq( a, b )  ( (a) == (b) )

#endif // BLIS_ENABLE_C99_COMPLEX

#define bli_ieq( a, b )  ( (a) == (b) )

// eqtori (passed by value)
// Compares a against a value given as separate real (br) and imaginary (bi)
// parts. The real-typed variants ignore bi entirely.

#define bli_seqtori( a, br, bi )  ( (a) == (br) )
#define bli_deqtori( a, br, bi )  ( (a) == (br) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_ceqtori( a, br, bi )  ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) )
#define bli_zeqtori( a, br, bi )  ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// The comparison value is assembled as (br) + (bi) * (I).
// NOTE(review): I is presumably the imaginary unit from <complex.h> -- confirm.
#define bli_ceqtori( a, br, bi )  ( (a) == (br) + (bi) * (I) )
#define bli_zeqtori( a, br, bi )  ( (a) == (br) + (bi) * (I) )

#endif // BLIS_ENABLE_C99_COMPLEX

// eqa (passed by address)
// Casts both pointers to the named element type, dereferences them, and
// defers to the matching by-value macro above.

#define bli_seqa( a, b )  bli_seq( *(( float* )(a)), *(( float* )(b)) )
#define bli_deqa( a, b )  bli_deq( *(( double* )(a)), *(( double* )(b)) )
#define bli_ceqa( a, b )  bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) )
#define bli_zeqa( a, b )  bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) )
#define bli_ieqa( a, b )  bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) )

// eq1
// Tests a against the constant 1 (real part 1, imaginary part 0).

#define bli_seq1( a )  bli_seqtori( (a), 1.0F, 0.0F )
#define bli_deq1( a )  bli_deqtori( (a), 1.0, 0.0 )
#define bli_ceq1( a )  bli_ceqtori( (a), 1.0F, 0.0F )
#define bli_zeq1( a )  bli_zeqtori( (a), 1.0, 0.0 )
#define bli_ieq1( a )  bli_ieq ( (a), 1 )

// eq0
// Tests a against the constant 0.

#define bli_seq0( a )  bli_seqtori( (a), 0.0F, 0.0F )
#define bli_deq0( a )  bli_deqtori( (a), 0.0, 0.0 )
#define bli_ceq0( a )  bli_ceqtori( (a), 0.0F, 0.0F )
#define bli_zeq0( a )  bli_zeqtori( (a), 0.0, 0.0 )
#define bli_ieq0( a )  bli_ieq ( (a), 0 )

// eqm1
// Tests a against the constant -1 (real part -1, imaginary part 0).

#define bli_seqm1( a )  bli_seqtori( (a), -1.0F, 0.0F )
#define bli_deqm1( a )  bli_deqtori( (a), -1.0, 0.0 )
#define bli_ceqm1( a )  bli_ceqtori( (a), -1.0F, 0.0F )
#define bli_zeqm1( a )  bli_zeqtori( (a), -1.0, 0.0 )
#define bli_ieqm1( a )  bli_ieq ( (a), -1 )

#endif
// end bli_eq.h
// begin bli_fprints.h

#ifndef BLIS_FPRINTS_H
#define BLIS_FPRINTS_H

// prints
// Each macro writes the scalar x to the stream `file` with fprintf, using the
// caller-supplied format string `spec` for every numeric component. The
// macros expand to brace blocks, not expressions.

#define bli_sfprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}
#define bli_dfprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}
// Complex variants print the real part, the literal " + ", the imaginary
// part, and a trailing " "; spec is applied once per part.
#define bli_cfprints( file, spec, x ) \
{ \
    fprintf( file, spec, bli_creal(x) ); \
    fprintf( file, " + " ); \
    fprintf( file, spec, bli_cimag(x) ); \
    fprintf( file, " " ); \
}
#define bli_zfprints( file, spec, x ) \
{ \
    fprintf( file, spec, bli_zreal(x) ); \
    fprintf( file, " + " ); \
    fprintf( file, spec, bli_zimag(x) ); \
    fprintf( file, " " ); \
}
#define bli_ifprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}

#endif
// end bli_fprints.h
// begin bli_inverts.h

#ifndef BLIS_INVERTS_H
#define BLIS_INVERTS_H

// inverts

// Notes:
// - The first char encodes the type of x.

// The real-typed macros, and the complex ones when C99 complex is disabled,
// pass the real and imaginary parts of x to bli_?invertris (defined outside
// this section).
#define bli_sinverts( x )  bli_sinvertris( bli_sreal(x), bli_simag(x) )
#define bli_dinverts( x )  bli_dinvertris( bli_dreal(x), bli_dimag(x) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_cinverts( x )  bli_cinvertris( bli_creal(x), bli_cimag(x) )
#define bli_zinverts( x )  bli_zinvertris( bli_zreal(x), bli_zimag(x) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex, x is overwritten in place with its reciprocal.
#define bli_cinverts( x )  { (x) = 1.0F / (x); }
#define bli_zinverts( x )  { (x) = 1.0 / (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

#endif
// end bli_inverts.h
// begin bli_invscals.h

#ifndef BLIS_INVSCALS_H
#define BLIS_INVSCALS_H

// invscals

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Real-typed y (second char s or d): the real and imaginary parts of a and y
// are passed to bli_sinvscalris / bli_dinvscalris, chosen by the type of y.
// NOTE(review): the *invscalris helpers are defined outside this section;
// the C99 branch below suggests the operation is y /= a -- confirm.
#define bli_ssinvscals( a, y )  bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscals( a, y )  bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscals( a, y )  bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscals( a, y )  bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdinvscals( a, y )  bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscals( a, y )  bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscals( a, y )  bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscals( a, y )  bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y without C99 complex: a real-typed a (s or d) selects the
// bli_sc* / bli_dz* helper, a complex-typed a (c or z) selects bli_c* / bli_z*.
#define bli_scinvscals( a, y )  bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscals( a, y )  bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscals( a, y )  bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscals( a, y )  bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szinvscals( a, y )  bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscals( a, y )  bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscals( a, y )  bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscals( a, y )  bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y with C99 complex: y is divided by a in place.
#define bli_scinvscals( a, y )  { (y) /= (a); }
#define bli_dcinvscals( a, y )  { (y) /= (a); }
#define bli_ccinvscals( a, y )  { (y) /= (a); }
#define bli_zcinvscals( a, y )  { (y) /= (a); }

#define bli_szinvscals( a, y )  { (y) /= (a); }
#define bli_dzinvscals( a, y )  { (y) /= (a); }
#define bli_czinvscals( a, y )  { (y) /= (a); }
#define bli_zzinvscals( a, y )  { (y) /= (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter shorthands: a and y share one type.
#define bli_sinvscals( a, y )  bli_ssinvscals( a, y )
#define bli_dinvscals( a, y )  bli_ddinvscals( a, y )
#define bli_cinvscals( a, y )  bli_ccinvscals( a, y )
#define bli_zinvscals( a, y )  bli_zzinvscals( a, y )

#endif
// end bli_invscals.h
// begin bli_invscaljs.h

#ifndef BLIS_INVSCALJS_H
#define BLIS_INVSCALJS_H

// invscaljs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// Same layout as the invscals table, but routed to the *invscaljris helpers
// (defined outside this section). In the C99 branch below, a complex-typed a
// is conjugated before the division.
#define bli_ssinvscaljs( a, y )  bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscaljs( a, y )  bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscaljs( a, y )  bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscaljs( a, y )  bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdinvscaljs( a, y )  bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscaljs( a, y )  bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscaljs( a, y )  bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscaljs( a, y )  bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scinvscaljs( a, y )  bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscaljs( a, y )  bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscaljs( a, y )  bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscaljs( a, y )  bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szinvscaljs( a, y )  bli_dzinvscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscaljs( a, y )  bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscaljs( a, y )  bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscaljs( a, y )  bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Real-typed a divides y directly. Type c uses conjf(a) and type z uses
// conj(a) as the divisor.
#define bli_scinvscaljs( a, y )  { (y) /= (a); }
#define bli_dcinvscaljs( a, y )  { (y) /= (a); }
#define bli_ccinvscaljs( a, y )  { (y) /= conjf(a); }
#define bli_zcinvscaljs( a, y )  { (y) /= conj (a); }

#define bli_szinvscaljs( a, y )  { (y) /= (a); }
#define bli_dzinvscaljs( a, y )  { (y) /= (a); }
#define bli_czinvscaljs( a, y )  { (y) /= conjf(a); }
#define bli_zzinvscaljs( a, y )  { (y) /= conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter shorthands: a and y share one type.
#define bli_sinvscaljs( a, y )  bli_ssinvscaljs( a, y )
#define bli_dinvscaljs( a, y )  bli_ddinvscaljs( a, y )
#define bli_cinvscaljs( a, y )  bli_ccinvscaljs( a, y )
#define bli_zinvscaljs( a, y )  bli_zzinvscaljs( a, y )

#endif
// end bli_invscaljs.h
// begin bli_neg2s.h

#ifndef BLIS_NEG2S_H
#define BLIS_NEG2S_H

// neg2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// The real and imaginary parts of x and y are passed to bli_?neg2ris, where
// the helper's type prefix follows the type of y.
// NOTE(review): bli_?neg2ris is defined outside this section; the C99 branch
// below suggests the operation is y = -x -- confirm.
#define bli_ssneg2s( x, y )  bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsneg2s( x, y )  bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csneg2s( x, y )  bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsneg2s( x, y )  bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdneg2s( x, y )  bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddneg2s( x, y )  bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdneg2s( x, y )  bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdneg2s( x, y )  bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scneg2s( x, y )  bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcneg2s( x, y )  bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccneg2s( x, y )  bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcneg2s( x, y )  bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szneg2s( x, y )  bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzneg2s( x, y )  bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czneg2s( x, y )  bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzneg2s( x, y )  bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y with C99 complex: y is assigned the negation of x.
#define bli_scneg2s( x, y )  { (y) = -(x); }
#define bli_dcneg2s( x, y )  { (y) = -(x); }
#define bli_ccneg2s( x, y )  { (y) = -(x); }
#define bli_zcneg2s( x, y )  { (y) = -(x); }

#define bli_szneg2s( x, y )  { (y) = -(x); }
#define bli_dzneg2s( x, y )  { (y) = -(x); }
#define bli_czneg2s( x, y )  { (y) = -(x); }
#define bli_zzneg2s( x, y )  { (y) = -(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter shorthands: x and y share one type.
#define bli_sneg2s( x, y )  bli_ssneg2s( x, y )
#define bli_dneg2s( x, y )  bli_ddneg2s( x, y )
#define bli_cneg2s( x, y )  bli_ccneg2s( x, y )
#define bli_zneg2s( x, y )  bli_zzneg2s( x, y )

#endif
// end bli_neg2s.h
// begin bli_rands.h

#ifndef BLIS_RANDS_H
#define BLIS_RANDS_H

// rands
// The real-typed macros assign to a the value rand() / (RAND_MAX / 2.0) - 1,
// i.e. a pseudo-random number between -1 and 1 inclusive. The quotient is
// computed in double and cast to the target type before the subtraction.

#define bli_srands( a ) \
{ \
    (a) = ( float ) ( ( double ) rand() / \
                      ( ( double ) RAND_MAX / 2.0 ) \
                    ) - 1.0F; \
}
#define bli_drands( a ) \
{ \
    (a) = ( double ) ( ( double ) rand() / \
                       ( ( double ) RAND_MAX / 2.0 ) \
                     ) - 1.0; \
}
// The complex macros draw the real and imaginary parts independently into
// block-local temporaries ar and ai, then store them with bli_?sets (defined
// outside this section).
#define bli_crands( a ) \
{ \
    float ar, ai; \
\
    bli_srands( ar ); \
    bli_srands( ai ); \
\
    bli_csets( ar, ai, (a) ); \
}
#define bli_zrands( a ) \
{ \
    double ar, ai; \
\
    bli_drands( ar ); \
    bli_drands( ai ); \
\
    bli_zsets( ar, ai, (a) ); \
}

#endif
// end bli_rands.h
// begin bli_randnp2s.h

#ifndef BLIS_RANDNP2S_H
#define BLIS_RANDNP2S_H

// randnp2s
// Assigns to a either zero or a signed power of two chosen at random (see
// bli_drandnp2s for the exact set of values).

// The single-precision variant runs the double-precision generator and lets
// the final assignment to a perform the conversion.
#define bli_srandnp2s( a ) \
{ \
    bli_drandnp2s( a ); \
}

// Disabled by "#if 0": an alternative generator named bli_drandnp2s_prev that
// is never compiled.
#if 0
#define bli_drandnp2s_prev( a ) \
{ \
    const double m_max = 3.0; \
    const double m_max2 = m_max + 2.0; \
    double t; \
    double r_val; \
\
\
\
\
    t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \
\
\
    if ( t == m_max2 ) t = t - 1.0; \
\
\
    t = floor( t ); \
\
\
    if ( t == 0.0 ) r_val = 0.0; \
    else \
    { \
\
\
        double s_exp, s_val; \
\
\
        PASTEMAC(d,rands)( s_exp ); \
        PASTEMAC(d,rands)( s_val ); \
\
\
        if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \
        else r_val = pow( 2.0, t - 1.0 ); \
\
\
        if ( s_val < 0.0 ) r_val = -r_val; \
    } \
\
\
    r_val = r_val / pow( 2.0, m_max ); \
\
\
\
    a = r_val; \
}
#endif

// With m_max = 6.0 (so m_max2 = 8.0) the result is 0 or +/- 2^-k for an
// integer k in 0..6.
#define bli_drandnp2s( a ) \
{ \
    const double m_max = 6.0; \
    const double m_max2 = m_max + 2.0; \
    double t; \
    double r_val; \
\
    /* Draw t until it is an integer value strictly below m_max2. */ \
\
    do \
    { \
        /* Scale rand() from [0, RAND_MAX] onto [0, m_max2]. */ \
        t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \
\
        /* Round down to an integer value. */ \
        t = floor( t ); \
    } \
    /* Redraw when t reached m_max2 (rand() returned RAND_MAX). */ \
    while ( m_max2 <= t ); \
\
    /* t == 0 yields exactly zero; any other t yields a power of two. */ \
    if ( t == 0.0 ) r_val = 0.0; \
    else \
    { \
\
        /* s_val only decides the sign of the result. */ \
        double s_val; \
\
        /* Magnitude is 2 raised to -(t - 1), so the exponent is <= 0. */ \
        r_val = pow( 2.0, -(t - 1.0) ); \
\
        /* NOTE(review): presumably expands to bli_drands -- confirm. */ \
        PASTEMAC(d,rands)( s_val ); \
\
        /* A negative draw flips the sign of the result. */ \
        if ( s_val < 0.0 ) r_val = -r_val; \
    } \
\
\
    /* Store the result (a is used without parentheses here). */ \
    a = r_val; \
}
// Complex variants: the real and imaginary parts are generated independently
// and stored with bli_?sets (defined outside this section).
#define bli_crandnp2s( a ) \
{ \
    float ar, ai; \
\
    bli_srandnp2s( ar ); \
    bli_srandnp2s( ai ); \
\
    bli_csets( ar, ai, (a) ); \
}
#define bli_zrandnp2s( a ) \
{ \
    double ar, ai; \
\
    bli_drandnp2s( ar ); \
    bli_drandnp2s( ai ); \
\
    bli_zsets( ar, ai, (a) ); \
}

#endif
// end bli_randnp2s.h
// begin bli_scals.h

#ifndef BLIS_SCALS_H
#define BLIS_SCALS_H

// scals

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// Real-typed y: the real and imaginary parts of a and y are passed to
// bli_sscalris / bli_dscalris, chosen by the type of y.
// NOTE(review): the *scalris helpers are defined outside this section; the
// C99 branch below suggests the operation is y *= a -- confirm.
#define bli_ssscals( a, y )  bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsscals( a, y )  bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csscals( a, y )  bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsscals( a, y )  bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdscals( a, y )  bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddscals( a, y )  bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdscals( a, y )  bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdscals( a, y )  bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y without C99 complex: a real-typed a selects the bli_sc* /
// bli_dz* helper, a complex-typed a selects bli_c* / bli_z*.
#define bli_scscals( a, y )  bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcscals( a, y )  bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccscals( a, y )  bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcscals( a, y )  bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szscals( a, y )  bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzscals( a, y )  bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czscals( a, y )  bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzscals( a, y )  bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y with C99 complex: y is multiplied by a in place.
#define bli_scscals( a, y )  { (y) *= (a); }
#define bli_dcscals( a, y )  { (y) *= (a); }
#define bli_ccscals( a, y )  { (y) *= (a); }
#define bli_zcscals( a, y )  { (y) *= (a); }

#define bli_szscals( a, y )  { (y) *= (a); }
#define bli_dzscals( a, y )  { (y) *= (a); }
#define bli_czscals( a, y )  { (y) *= (a); }
#define bli_zzscals( a, y )  { (y) *= (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter shorthands: a and y share one type.
#define bli_sscals( a, y )  bli_ssscals( a, y )
#define bli_dscals( a, y )  bli_ddscals( a, y )
#define bli_cscals( a, y )  bli_ccscals( a, y )
#define bli_zscals( a, y )  bli_zzscals( a, y )

#endif
// end bli_scals.h
// begin bli_scaljs.h

#ifndef BLIS_SCALJS_H
#define BLIS_SCALJS_H

// scaljs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// Same layout as the scals table, but routed to the *scaljris helpers
// (defined outside this section). In the C99 branch below, a complex-typed a
// is conjugated before the multiplication.
#define bli_ssscaljs( a, y )  bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsscaljs( a, y )  bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csscaljs( a, y )  bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsscaljs( a, y )  bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdscaljs( a, y )  bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddscaljs( a, y )  bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdscaljs( a, y )  bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdscaljs( a, y )  bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scscaljs( a, y )  bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcscaljs( a, y )  bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccscaljs( a, y )  bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcscaljs( a, y )  bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szscaljs( a, y )  bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzscaljs( a, y )  bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czscaljs( a, y )  bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzscaljs( a, y )  bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Real-typed a multiplies y directly. Type c uses conjf(a) and type z uses
// conj(a) as the factor.
#define bli_scscaljs( a, y )  { (y) *= (a); }
#define bli_dcscaljs( a, y )  { (y) *= (a); }
#define bli_ccscaljs( a, y )  { (y) *= conjf(a); }
#define bli_zcscaljs( a, y )  { (y) *= conj (a); }

#define bli_szscaljs( a, y )  { (y) *= (a); }
#define bli_dzscaljs( a, y )  { (y) *= (a); }
#define bli_czscaljs( a, y )  { (y) *= conjf(a); }
#define bli_zzscaljs( a, y )  { (y) *= conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter shorthands: a and y share one type.
#define bli_sscaljs( a, y )  bli_ssscaljs( a, y )
#define bli_dscaljs( a, y )  bli_ddscaljs( a, y )
#define bli_cscaljs( a, y )  bli_ccscaljs( a, y )
#define bli_zscaljs( a, y )  bli_zzscaljs( a, y )

#endif
// end bli_scaljs.h
// begin bli_scalcjs.h

#ifndef BLIS_SCALCJS_H
#define BLIS_SCALCJS_H

// scalcjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Every macro takes a conjugation selector conjx ahead of x and y. Outside
// the C99-complex branch, conjx and the real and imaginary parts of x and y
// are passed to a *scalcjris helper (defined outside this section).
// NOTE(review): the C99 branch below suggests the operation is
// y *= (conjx ? conj(x) : x) -- confirm against the *scalcjris definitions.
#define bli_ssscalcjs( conjx, x, y )  bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsscalcjs( conjx, x, y )  bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csscalcjs( conjx, x, y )  bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsscalcjs( conjx, x, y )  bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdscalcjs( conjx, x, y )  bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddscalcjs( conjx, x, y )  bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdscalcjs( conjx, x, y )  bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdscalcjs( conjx, x, y )  bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y without C99 complex: a real-typed x selects the bli_sc* /
// bli_dz* helper, a complex-typed x selects bli_c* / bli_z*.
#define bli_scscalcjs( conjx, x, y )  bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcscalcjs( conjx, x, y )  bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccscalcjs( conjx, x, y )  bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcscalcjs( conjx, x, y )  bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szscalcjs( conjx, x, y )  bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzscalcjs( conjx, x, y )  bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czscalcjs( conjx, x, y )  bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzscalcjs( conjx, x, y )  bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y with C99 complex. A real-typed x ignores conjx and
// multiplies directly. A complex-typed x is conjugated first (conjf for
// type c, conj for type z) when bli_is_conj( conjx ) is true.
#define bli_scscalcjs( conjx, x, y )  { (y) *= (x); }
#define bli_dcscalcjs( conjx, x, y )  { (y) *= (x); }
#define bli_ccscalcjs( conjx, x, y )  { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zcscalcjs( conjx, x, y )  { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#define bli_szscalcjs( conjx, x, y )  { (y) *= (x); }
#define bli_dzscalcjs( conjx, x, y )  { (y) *= (x); }
#define bli_czscalcjs( conjx, x, y )  { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzscalcjs( conjx, x, y )  { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter shorthands: x and y share one type.
#define bli_sscalcjs( conjx, x, y )  bli_ssscalcjs( conjx, x, y )
#define bli_dscalcjs( conjx, x, y )  bli_ddscalcjs( conjx, x, y )
#define bli_cscalcjs( conjx, x, y )  bli_ccscalcjs( conjx, x, y )
#define bli_zscalcjs( conjx, x, y )  bli_zzscalcjs( conjx, x, y )

#endif
// end bli_scalcjs.h
// begin bli_scal2s.h

#ifndef BLIS_SCAL2S_H
#define BLIS_SCAL2S_H

// scal2s

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// Each macro passes the real and imaginary parts of a, x and y to one of the
// bli_??scal2ris helpers (defined outside this section).
// NOTE(review): the C99 branch further down suggests the operation is
// y = a * x -- confirm against the *scal2ris definitions.
//
// Real-typed y (third char s or d): bli_roscal2ris is used when both a and x
// are complex-typed (c or z); every other combination uses bli_rxscal2ris.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2s( a, x, y )  bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2s( a, x, y )  bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2s( a, x, y )  bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2s( a, x, y )  bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2s( a, x, y )  bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2s( a, x, y )  bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2s( a, x, y )  bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2s( a, x, y )  bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2s( a, x, y )  bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2s( a, x, y )  bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2s( a, x, y )  bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2s( a, x, y )  bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2s( a, x, y )  bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2s( a, x, y )  bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2s( a, x, y )  bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2s( a, x, y )  bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y without C99 complex. The helper depends on which of a and
// x are complex-typed:
//   a real,    x real    -> bli_rxscal2ris
//   a complex, x real    -> bli_rcscal2ris
//   a real,    x complex -> bli_crscal2ris
//   a complex, x complex -> bli_cxscal2ris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2s( a, x, y )  bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2s( a, x, y )  bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2s( a, x, y )  bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2s( a, x, y )  bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccscal2s( a, x, y )  bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2s( a, x, y )  bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2s( a, x, y )  bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2s( a, x, y )  bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcscal2s( a, x, y )  bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2s( a, x, y )  bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2s( a, x, y )  bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2s( a, x, y )  bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2s( a, x, y )  bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2s( a, x, y )  bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzscal2s( a, x, y )  bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2s( a, x, y )  bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2s( a, x, y )  bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2s( a, x, y )  bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sczscal2s( a, x, y )  bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2s( a, x, y )  bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2s( a, x, y )  bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2s( a, x, y )  bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzscal2s( a, x, y )  bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2s( a, x, y )  bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2s( a, x, y )  bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2s( a, x, y )  bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex-typed y with C99 complex: y is assigned the product (a) * (x) for
// every type combination.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_dscscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_cscscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zscscal2s( a, x, y )  { (y) = (a) * (x); }

#define bli_sdcscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_ddcscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_cdcscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zdcscal2s( a, x, y )  { (y) = (a) * (x); }

#define bli_sccscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_dccscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_cccscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zccscal2s( a, x, y )  { (y) = (a) * (x); }

#define bli_szcscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_dzcscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_czcscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zzcscal2s( a, x, y )  { (y) = (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_dszscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_cszscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zszscal2s( a, x, y )  { (y) = (a) * (x); }

#define bli_sdzscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_ddzscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_cdzscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zdzscal2s( a, x, y )  { (y) = (a) * (x); }

#define bli_sczscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_dczscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_cczscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zczscal2s( a, x, y )  { (y) = (a) * (x); }

#define bli_szzscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_dzzscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_czzscal2s( a, x, y )  { (y) = (a) * (x); }
#define bli_zzzscal2s( a, x, y )  { (y) = (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter shorthands: a, x and y share one type.
#define bli_sscal2s( a, x, y )  bli_sssscal2s( a, x, y )
#define bli_dscal2s( a, x, y )  bli_dddscal2s( a, x, y )
#define bli_cscal2s( a, x, y )  bli_cccscal2s( a, x, y )
#define bli_zscal2s( a, x, y )  bli_zzzscal2s( a, x, y )

#endif
// end bli_scal2s.h
// begin bli_scal2js.h

#ifndef BLIS_SCAL2JS_H
#define BLIS_SCAL2JS_H

// scal2js

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define 
bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) #define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) 
#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_dzcscal2js( a, x, y ) { (y) = (a) * 
conj(x); } #define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y ) #define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y ) #define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y ) #define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y ) #endif // end bli_scal2js.h // begin bli_set0s.h #ifndef BLIS_SET0S_H #define BLIS_SET0S_H #define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) ) #define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) ) #define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) ) #define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) ) #endif // end bli_set0s.h // begin bli_set1s.h #ifndef BLIS_SET1S_H #define BLIS_SET1S_H #define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) ) #define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) ) #define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) ) #define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) ) 
#endif // end bli_set1s.h // begin bli_seti0s.h #ifndef BLIS_SETI0S_H #define BLIS_SETI0S_H #define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) ) #define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) ) #define bli_cseti0s( a ) bli_csetis( 0.0F, (a) ) #define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) ) #endif // end bli_seti0s.h // begin bli_sqrt2s.h #ifndef BLIS_SQRT2S_H #define BLIS_SQRT2S_H // sqrt2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) ) #define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) ) #define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) ) #define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) ) #define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) ) #define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) ) #define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) ) #define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) ) #define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), 
bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; } #define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; } #define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); } #define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); } #define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; } #define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; } #define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); } #define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); } #define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; } #define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; } #define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; } #define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; } #define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; } #define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; } #define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; } #define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a ) #define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a ) #define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a ) #define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a ) #endif // end bli_sqrt2s.h // begin bli_subs.h #ifndef BLIS_SUBS_H #define BLIS_SUBS_H // subs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. 
#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) { (y) -= (a); } #define bli_dcsubs( a, y ) { (y) -= (a); } #define bli_ccsubs( a, y ) { (y) -= (a); } #define bli_zcsubs( a, y ) { (y) -= (a); } #define bli_szsubs( a, y ) { (y) -= (a); } #define bli_dzsubs( a, y ) { (y) -= (a); } #define bli_czsubs( a, y ) { (y) -= (a); } #define bli_zzsubs( a, y ) { (y) -= (a); } #endif // 
BLIS_ENABLE_C99_COMPLEX #define bli_ssubs( a, y ) bli_sssubs( a, y ) #define bli_dsubs( a, y ) bli_ddsubs( a, y ) #define bli_csubs( a, y ) bli_ccsubs( a, y ) #define bli_zsubs( a, y ) bli_zzsubs( a, y ) #endif // end bli_subs.h // begin bli_subjs.h #ifndef BLIS_SUBJS_H #define BLIS_SUBJS_H // subjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubjs( a, y ) bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), 
bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) { (y) -= (a); } #define bli_dcsubjs( a, y ) { (y) -= (a); } #define bli_ccsubjs( a, y ) { (y) -= conjf(a); } #define bli_zcsubjs( a, y ) { (y) -= conj (a); } #define bli_szsubjs( a, y ) { (y) -= (a); } #define bli_dzsubjs( a, y ) { (y) -= (a); } #define bli_czsubjs( a, y ) { (y) -= conjf(a); } #define bli_zzsubjs( a, y ) { (y) -= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssubjs( a, y ) bli_sssubjs( a, y ) #define bli_dsubjs( a, y ) bli_ddsubjs( a, y ) #define bli_csubjs( a, y ) bli_ccsubjs( a, y ) #define bli_zsubjs( a, y ) bli_zzsubjs( a, y ) #endif // end bli_subjs.h // begin bli_swaps.h #ifndef BLIS_SWAPS_H #define BLIS_SWAPS_H // swaps // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_ssswaps( x, y ) \ { \ float w; \ bli_sscopys( (y), (w) ); \ bli_sscopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dsswaps( x, y ) \ { \ double w; \ bli_sdcopys( (y), (w) ); \ bli_dscopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_csswaps( x, y ) \ { \ scomplex w; \ bli_sccopys( (y), (w) ); \ bli_cscopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zsswaps( x, y ) \ { \ dcomplex w; \ bli_szcopys( (y), (w) ); \ bli_zscopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sdswaps( x, y ) \ { \ float w; \ bli_dscopys( (y), (w) ); \ bli_sdcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_ddswaps( x, y ) \ { \ double w; \ bli_ddcopys( (y), (w) ); \ bli_ddcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_cdswaps( x, y ) \ { \ scomplex w; \ bli_dccopys( (y), (w) ); \ bli_cdcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zdswaps( x, y ) \ { \ dcomplex w; \ bli_dzcopys( (y), (w) ); \ bli_zdcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_scswaps( x, y ) \ { \ float w; \ bli_cscopys( (y), (w) ); \ bli_sccopys( (x), (y) ); \ bli_sscopys( (w), 
(x) ); \ } #define bli_dcswaps( x, y ) \ { \ double w; \ bli_cdcopys( (y), (w) ); \ bli_dccopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_ccswaps( x, y ) \ { \ scomplex w; \ bli_cccopys( (y), (w) ); \ bli_cccopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zcswaps( x, y ) \ { \ dcomplex w; \ bli_czcopys( (y), (w) ); \ bli_zccopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_szswaps( x, y ) \ { \ float w; \ bli_zscopys( (y), (w) ); \ bli_szcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dzswaps( x, y ) \ { \ double w; \ bli_zdcopys( (y), (w) ); \ bli_dzcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_czswaps( x, y ) \ { \ scomplex w; \ bli_zccopys( (y), (w) ); \ bli_czcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zzswaps( x, y ) \ { \ dcomplex w; \ bli_zzcopys( (y), (w) ); \ bli_zzcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sswaps( x, y ) bli_ssswaps( x, y ) #define bli_dswaps( x, y ) bli_ddswaps( x, y ) #define bli_cswaps( x, y ) bli_ccswaps( x, y ) #define bli_zswaps( x, y ) bli_zzswaps( x, y ) #endif // end bli_swaps.h // begin bli_xpbys.h #ifndef BLIS_XPBYS_H #define BLIS_XPBYS_H // xpbys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// -- (xby) = (??s) ------------------------------------------------------------

// Destination y is a float. Every variant forwards the real and imaginary
// parts of x, b and y (read through the bli_?real / bli_?imag accessor
// matching each operand's type char) to bli_rxxpbyris.

#define bli_sssxpbys( x, b, y ) \
    bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dssxpbys( x, b, y ) \
    bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cssxpbys( x, b, y ) \
    bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zssxpbys( x, b, y ) \
    bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )

#define bli_sdsxpbys( x, b, y ) \
    bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddsxpbys( x, b, y ) \
    bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdsxpbys( x, b, y ) \
    bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdsxpbys( x, b, y ) \
    bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )

#define bli_scsxpbys( x, b, y ) \
    bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcsxpbys( x, b, y ) \
    bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccsxpbys( x, b, y ) \
    bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcsxpbys( x, b, y ) \
    bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )

#define bli_szsxpbys( x, b, y ) \
    bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzsxpbys( x, b, y ) \
    bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czsxpbys( x, b, y ) \
    bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzsxpbys( x, b, y ) \
    bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (xby) = (??d) ------------------------------------------------------------

// Destination y is a double. As in the (??s) table, every variant forwards
// to bli_rxxpbyris.

#define bli_ssdxpbys( x, b, y ) \
    bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsdxpbys( x, b, y ) \
    bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csdxpbys( x, b, y ) \
    bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsdxpbys( x, b, y ) \
    bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )

#define bli_sddxpbys( x, b, y ) \
    bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dddxpbys( x, b, y ) \
    bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cddxpbys( x, b, y ) \
    bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zddxpbys( x, b, y ) \
    bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_scdxpbys( x, b, y ) \
    bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcdxpbys( x, b, y ) \
    bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccdxpbys( x, b, y ) \
    bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcdxpbys( x, b, y ) \
    bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_szdxpbys( x, b, y ) \
    bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
// (continuation of the (??d) group: every variant whose y is double dispatches
// to bli_rxxpbyris, which is defined elsewhere in this header)
#define bli_dzdxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czdxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzdxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex support each operand is handed to a "ris" helper as a
// separate (real, imaginary) pair. The helpers are defined elsewhere in this
// header. The helper selection visible in the tables below is:
//   - bli_rxxpbyris: x and b are both real (s or d);
//   - bli_crxpbyris: x is complex (c or z) and b is real;
//   - bli_cxxpbyris: b is complex, whatever the type of x.

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dscxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_cscxpbys( x, b, y )  bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zscxpbys( x, b, y )  bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )

#define bli_sdcxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddcxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdcxpbys( x, b, y )  bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdcxpbys( x, b, y )  bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )

#define bli_sccxpbys( x, b, y )  bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dccxpbys( x, b, y )  bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cccxpbys( x, b, y )  bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zccxpbys( x, b, y )  bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )

#define bli_szcxpbys( x, b, y )  bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzcxpbys( x, b, y )  bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czcxpbys( x, b, y )  bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzcxpbys( x, b, y )  bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dszxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cszxpbys( x, b, y )  bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zszxpbys( x, b, y )  bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sdzxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddzxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdzxpbys( x, b, y )  bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdzxpbys( x, b, y )  bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sczxpbys( x, b, y )  bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dczxpbys( x, b, y )  bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cczxpbys( x, b, y )  bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zczxpbys( x, b, y )  bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_szzxpbys( x, b, y )  bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dzzxpbys( x, b, y )  bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_czzxpbys( x, b, y )  bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zzzxpbys( x, b, y )  bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex types the update is written directly as y := x + b * y.
// Every type combination uses the same expression; conversions between the
// operand types are left to the compiler.

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_dscxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_cscxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zscxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

#define bli_sdcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_ddcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_cdcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zdcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

#define bli_sccxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_dccxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_cccxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zccxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

#define bli_szcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_dzcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_czcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zzcxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_dszxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_cszxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zszxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

#define bli_sdzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_ddzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_cdzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zdzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

#define bli_sczxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_dczxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_cczxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zczxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

#define bli_szzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_dzzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_czzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }
#define bli_zzzxpbys( x, b, y )  { (y) = (x) + (b) * (y); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: bli_?xpbys forwards to the variant in which x, b and
// y all have that one type.
#define bli_sxpbys( x, b, y )  bli_sssxpbys( x, b, y )
#define bli_dxpbys( x, b, y )  bli_dddxpbys( x, b, y )
#define bli_cxpbys( x, b, y )  bli_cccxpbys( x, b, y )
#define bli_zxpbys( x, b, y )  bli_zzzxpbys( x, b, y )

#endif
// end bli_xpbys.h
// begin bli_xpbyjs.h

#ifndef BLIS_XPBYJS_H
#define BLIS_XPBYJS_H

// xpbyjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; 
++ii ) bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); 
} else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( 
dim_t jj = 0; jj < n; ++jj ) bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } 
else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( 
dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end bli_copys_mxn.h // begin bli_scal2s_mxn.h #ifndef BLIS_SCAL2S_MXN_H #define BLIS_SCAL2S_MXN_H // scal2s_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \ ctype* restrict y, const inc_t rs_y, const inc_t cs_y \ ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( scal2s_mxn ) #endif // end bli_scal2s_mxn.h // begin bli_xpbys_mxn.h #ifndef BLIS_XPBYS_MXN_H #define BLIS_XPBYS_MXN_H // xpbys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (?ss) ------------------------------------------------------------ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?dd) ------------------------------------------------------------ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?cc) ------------------------------------------------------------ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?zz) ------------------------------------------------------------ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } #endif // end bli_xpbys_mxn.h // begin bli_xpbys_mxn_uplo.h #ifndef BLIS_XPBYS_MXN_UPLO_H #define BLIS_XPBYS_MXN_UPLO_H // xpbys_mxn_u #define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 
0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } // xpbys_mxn_l #define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } 
// bli_cccxpbys_mxn_l / bli_zzzxpbys_mxn_l
//
// Statement-like macros that walk an m x n region of x and y (element (i,j)
// lives at base + i*rs + j*cs) and touch only the elements for which
// (column index - row index) <= diagoff.
// - diagoff:        signed diagonal offset compared against _j - _i.
// - m, n:           loop bounds for rows (_i) and columns (_j).
// - x, rs_x, cs_x:  source base pointer and its row/column strides.
// - beta:           pointer to the scalar; it is dereferenced, never written.
// - y, rs_y, cs_y:  destination base pointer and its row/column strides.
// All arguments are substituted textually and may be evaluated many times,
// so callers must not pass expressions with side effects. The macros declare
// locals named _i and _j inside their own braces.
// NOTE(review): the arithmetic performed by the *xpbys/*copys scalar macros
// (presumably y := x + beta*y and y := x) is defined outside this chunk.
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
    dim_t _i, _j; \
    /* A beta that tests equal to zero selects the copy branch, so the */ \
    /* existing contents of y are not passed to the xpbys macro at all. */ \
    if ( bli_ceq0( *beta ) ) \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_cccopys( *(x + _i*rs_x + _j*cs_x), \
                         *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
    else \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \
                          *(beta), \
                          *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
}

#define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
    dim_t _i, _j; \
    /* A beta that tests equal to zero selects the copy branch, so the */ \
    /* existing contents of y are not passed to the xpbys macro at all. */ \
    if ( bli_zeq0( *beta ) ) \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \
                         *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
    else \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \
                          *(beta), \
                          *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
}

// Single-character convenience names. Each one forwards every argument,
// unchanged and in the same order, to the three-character variant in which
// x, beta and y all share one datatype (sss, ddd, ccc, zzz). The _u forms
// forward to the *_mxn_u macros and the _l forms to the *_mxn_l macros.

#define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}

#endif
// end bli_xpbys_mxn_uplo.h

// -- "broadcast B" scalar macros --

// begin bli_bcastbbs_mxn.h

#ifndef BLIS_BCASTBBS_MXN_H
#define BLIS_BCASTBBS_MXN_H

// bcastbbs_mxn
//
// Function template: for every (i,j) with i < m and j < n, the element at
// y + i*incy + j*ldy is passed as the first argument of the datatype's copys
// macro, once for each of the d-1 slots that directly follow it (offsets
// 1 .. d-1, spacing ds_y = 1) as the second argument.
// - m, n:       iteration counts for i (outer loop) and j (inner loop).
// - y:          base pointer; only the trailing d-1 slots of each element
//               appear as the second copys argument.
// - incy, ldy:  strides applied to i and j respectively.
// Unlike scal2bbs_mxn and set0bbs_mxn in this file, the duplication factor d
// is read from ldy here, and i is the outer loop.
// NOTE(review): presumably copys( a, b ) stores a into b, and callers pass a
// layout whose j-stride equals the duplication factor -- confirm against the
// call sites.

#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* d: number of slots per element, taken from ldy; ds_y: slot spacing. */ \
    const dim_t d = ldy; \
    const dim_t ds_y = 1; \
\
    for ( dim_t i = 0; i < m; ++i ) \
    { \
        ctype* restrict yi = y + i*incy; \
\
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict yij = yi + j*ldy; \
            /* p starts at 1: slot 0 is the source element itself. */ \
            for ( dim_t p = 1; p < d; ++p ) \
            { \
                ctype* restrict yijd = yij + p*ds_y; \
\
                PASTEMAC(ch,copys)( *yij, *yijd ); \
            } \
        } \
    } \
}

INSERT_GENTFUNC_BASIC0( bcastbbs_mxn )

#endif
// end bli_bcastbbs_mxn.h
// begin bli_scal2bbs_mxn.h

#ifndef BLIS_SCAL2BBS_MXN_H
#define BLIS_SCAL2BBS_MXN_H

// scal2bbs_mxn
//
// Function templates with the signature
//   ( conjx, m, n, alpha, x, incx, ldx, y, incy, ldy ).
// For every (i,j) with i < m and j < n, the element x + i*incx + j*ldx is
// combined with *alpha by the datatype's scal2 macro (the "j" flavour when
// bli_is_conj( conjx ) holds), with the result operand at
// y + i*incy + j*ldy; that operand is then handed to the copy macro once for
// each of the following d-1 slots, where d = incy and the slot spacing is 1.
// Two templates are defined: GENTFUNCRO works on whole ctype elements, while
// GENTFUNCCO works on separate real and imaginary ctype_r components.
// NOTE(review): the exact arithmetic of scal2s/scal2js/scal2ris/scal2jris
// (presumably y := alpha * x or alpha * conj(x)) is defined outside this
// chunk.

#undef GENTFUNCRO
#define GENTFUNCRO( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t conjx, \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, const inc_t incx, const inc_t ldx, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* d: number of slots per element, taken from incy; ds_y: slot spacing. */ \
    const dim_t d = incy; \
    const dim_t ds_y = 1; \
\
    if ( bli_is_conj( conjx ) ) \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict xj = x + j*ldx; \
            ctype* restrict yj = y + j*ldy; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype* restrict xij = xj + i*incx; \
                ctype* restrict yij = yj + i*incy; \
\
                PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \
                /* Replicate the freshly written element into slots 1..d-1. */ \
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype* restrict yijd = yij + p*ds_y; \
\
                    PASTEMAC(ch,copys)( *yij, *yijd ); \
                } \
            } \
        } \
    } \
    else \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict xj = x + j*ldx; \
            ctype* restrict yj = y + j*ldy; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype* restrict xij = xj + i*incx; \
                ctype* restrict yij = yj + i*incy; \
\
                PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \
                /* Replicate the freshly written element into slots 1..d-1. */ \
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype* restrict yijd = yij + p*ds_y; \
\
                    PASTEMAC(ch,copys)( *yij, *yijd ); \
                } \
            } \
        } \
    } \
}

INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn )

#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t conjx, \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, const inc_t incx, const inc_t ldx, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* d: number of slots per component, taken from incy; ds_y: spacing. */ \
    const dim_t d = incy; \
    const dim_t ds_y = 1; \
    /* All pointers below are ctype_r*, so every stride is doubled. */ \
    const inc_t incx2 = 2 * incx; \
    const inc_t ldx2 = 2 * ldx; \
\
    const inc_t incy2 = 2 * incy; \
    const inc_t ldy2 = 2 * ldy; \
    /* alpha and x: imaginary component is the next ctype_r (+1). */ \
    ctype_r* restrict alpha_r = ( ctype_r* )alpha; \
    ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \
    ctype_r* restrict chi_r = ( ctype_r* )x; \
    ctype_r* restrict chi_i = ( ctype_r* )x + 1; \
    ctype_r* restrict psi_r = ( ctype_r* )y; \
    ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \
    /* y: imaginary component starts d ctype_r slots after the real one. */ \
    if ( bli_is_conj( conjx ) ) \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype_r* restrict chij_r = chi_r + j*ldx2; \
            ctype_r* restrict chij_i = chi_i + j*ldx2; \
            ctype_r* restrict psij_r = psi_r + j*ldy2; \
            ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype_r* restrict chiij_r = chij_r + i*incx2; \
                ctype_r* restrict chiij_i = chij_i + i*incx2; \
                ctype_r* restrict psiij_r = psij_r + i*incy2; \
                ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
                PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \
                                        *chiij_r, *chiij_i, \
                                        *psiij_r, *psiij_i ); \
                /* Replicate both components into their slots 1..d-1. */ \
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
                    ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
                    PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \
                                          *psiijd_r, *psiijd_i ); \
                } \
            } \
        } \
    } \
    else \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype_r* restrict chij_r = chi_r + j*ldx2; \
            ctype_r* restrict chij_i = chi_i + j*ldx2; \
            ctype_r* restrict psij_r = psi_r + j*ldy2; \
            ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype_r* restrict chiij_r = chij_r + i*incx2; \
                ctype_r* restrict chiij_i = chij_i + i*incx2; \
                ctype_r* restrict psiij_r = psij_r + i*incy2; \
                ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
                PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \
                                       *chiij_r, *chiij_i, \
                                       *psiij_r, *psiij_i ); \
                /* Replicate both components into their slots 1..d-1. */ \
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
                    ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
                    PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \
                                          *psiijd_r, *psiijd_i ); \
                } \
            } \
        } \
    } \
}

INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn )

#endif
// end bli_scal2bbs_mxn.h
// begin bli_set0bbs_mxn.h

#ifndef BLIS_SET0BBS_MXN_H
#define BLIS_SET0BBS_MXN_H

// set0bbs_mxn
//
// Function template: for every (i,j) with i < m and j < n, applies the
// datatype's set0s macro to each of the d slots that start at
// y + i*incy + j*ldy (offsets 0 .. d-1, spacing ds_y = 1), where d = incy.
// - m, n:       iteration counts for i (inner loop) and j (outer loop).
// - y:          base pointer of the region being written.
// - incy, ldy:  strides applied to i and j respectively.

#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* d: number of slots per element, taken from incy; ds_y: slot spacing. */ \
    const dim_t d = incy; \
    const dim_t ds_y = 1; \
\
    for ( dim_t j = 0; j < n; ++j ) \
    { \
        ctype* restrict yj = y + j*ldy; \
\
        for ( dim_t i = 0; i < m; ++i ) \
        { \
            ctype* restrict yij = yj + i*incy; \
            /* p starts at 0: the first slot is written as well. */ \
            for ( dim_t p = 0; p < d; ++p ) \
            { \
                ctype* restrict yijd = yij + p*ds_y; \
\
                PASTEMAC(ch,set0s)( *yijd ); \
            } \
        } \
    } \
}

INSERT_GENTFUNC_BASIC0( set0bbs_mxn )

#endif
// end bli_set0bbs_mxn.h

// -- 1m-specific scalar macros --

// 1e
// begin bli_copy1es.h

#ifndef BLIS_COPY1ES_H
#define BLIS_COPY1ES_H

// copy1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_sscopy1es( a, bri, bir ) {} #define bli_dscopy1es( a, bri, bir ) {} #define bli_cscopy1es( a, bri, bir ) {} #define bli_zscopy1es( a, bri, bir ) {} #define bli_sdcopy1es( a, bri, bir ) {} #define bli_ddcopy1es( a, bri, bir ) {} #define bli_cdcopy1es( a, bri, bir ) {} #define bli_zdcopy1es( a, bri, bir ) {} #define bli_sccopy1es( a, bri, bir ) {} #define bli_dccopy1es( a, bri, bir ) {} #define bli_cccopy1es( a, bri, bir ) \ { \ bli_cccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_zccopy1es( a, bri, bir ) \ { \ bli_zccopyris( bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_szcopy1es( a, bri, bir ) {} #define bli_dzcopy1es( a, bri, bir ) {} #define bli_czcopy1es( a, bri, bir ) \ { \ bli_czcopyris( bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_zzcopy1es( a, bri, bir ) \ { \ bli_zzcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir ) #define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir ) #endif // end bli_copy1es.h // begin bli_copyj1es.h #ifndef BLIS_COPYJ1ES_H #define BLIS_COPYJ1ES_H // copyj1es // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopyj1es( a, bri, bir ) {} #define bli_dscopyj1es( a, bri, bir ) {} #define bli_cscopyj1es( a, bri, bir ) {} #define bli_zscopyj1es( a, bri, bir ) {} #define bli_sdcopyj1es( a, bri, bir ) {} #define bli_ddcopyj1es( a, bri, bir ) {} #define bli_cdcopyj1es( a, bri, bir ) {} #define bli_zdcopyj1es( a, bri, bir ) {} #define bli_sccopyj1es( a, bri, bir ) {} #define bli_dccopyj1es( a, bri, bir ) {} #define bli_cccopyj1es( a, bri, bir ) \ { \ bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_cccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_zccopyj1es( a, bri, bir ) \ { \ bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_zccopyris( bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_szcopyj1es( a, bri, bir ) {} #define bli_dzcopyj1es( a, bri, bir ) {} #define bli_czcopyj1es( a, bri, bir ) \ { \ bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_czcopyris( bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_zzcopyj1es( a, bri, bir ) \ { \ bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_zzcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir ) #define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir ) #endif // end bli_copyj1es.h // begin bli_invert1es.h #ifndef BLIS_INVERT1ES_H #define BLIS_INVERT1ES_H // invert1es #define bli_cinvert1es( bri, bir ) \ { \ bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \ bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \ } #define bli_zinvert1es( bri, bir ) \ { \ bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \ bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \ } #endif // end bli_invert1es.h // begin bli_scal1es.h #ifndef BLIS_SCAL1ES_H 
#define BLIS_SCAL1ES_H

// scal1es
//
// bli_?scal1es( a, yri, yir ): scalris is called with the components of a
// and the components of yri; copyris is then called with
// ( -imag(yri), real(yri) ) as its first pair and the components of yir as
// its second pair. Arguments are substituted textually and are evaluated
// more than once.
// NOTE(review): scalris/copyris are defined outside this chunk; presumably
// yri is scaled in place by a and yir is then rebuilt from the scaled yri.

#define bli_cscal1es( a, yri, yir ) \
{ \
    bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \
    bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \
}

#define bli_zscal1es( a, yri, yir ) \
{ \
    bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \
    bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \
}

#endif
// end bli_scal1es.h
// begin bli_scal21es.h

#ifndef BLIS_SCAL21ES_H
#define BLIS_SCAL21ES_H

// scal21es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
//
// Every macro takes ( a, x, yri, yir ). The non-empty variants all have the
// same shape: bli_cxscal2ris is called twice with the components of a,
// first with ( real(x), imag(x) ) and the components of yri, then with
// ( -imag(x), real(x) ) and the components of yir. Arguments are substituted
// textually and are evaluated more than once.
// Variants with a real y (??s, ??d), and variants where both a and x are
// real (ss?, sd?, ds?, dd?), expand to an empty block.
// NOTE(review): bli_cxscal2ris is defined outside this chunk; presumably it
// stores the product of its first two (real, imag) pairs into its third.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal21es( a, x, yri, yir ) {}
#define bli_sdsscal21es( a, x, yri, yir ) {}
#define bli_scsscal21es( a, x, yri, yir ) {}
#define bli_szsscal21es( a, x, yri, yir ) {}

#define bli_dssscal21es( a, x, yri, yir ) {}
#define bli_ddsscal21es( a, x, yri, yir ) {}
#define bli_dcsscal21es( a, x, yri, yir ) {}
#define bli_dzsscal21es( a, x, yri, yir ) {}

#define bli_cssscal21es( a, x, yri, yir ) {}
#define bli_cdsscal21es( a, x, yri, yir ) {}
#define bli_ccsscal21es( a, x, yri, yir ) {}
#define bli_czsscal21es( a, x, yri, yir ) {}

#define bli_zssscal21es( a, x, yri, yir ) {}
#define bli_zdsscal21es( a, x, yri, yir ) {}
#define bli_zcsscal21es( a, x, yri, yir ) {}
#define bli_zzsscal21es( a, x, yri, yir ) {}

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal21es( a, x, yri, yir ) {}
#define bli_sddscal21es( a, x, yri, yir ) {}
#define bli_scdscal21es( a, x, yri, yir ) {}
#define bli_szdscal21es( a, x, yri, yir ) {}

#define bli_dsdscal21es( a, x, yri, yir ) {}
#define bli_dddscal21es( a, x, yri, yir ) {}
#define bli_dcdscal21es( a, x, yri, yir ) {}
#define bli_dzdscal21es( a, x, yri, yir ) {}

#define bli_csdscal21es( a, x, yri, yir ) {}
#define bli_cddscal21es( a, x, yri, yir ) {}
#define bli_ccdscal21es( a, x, yri, yir ) {}
#define bli_czdscal21es( a, x, yri, yir ) {}

#define bli_zsdscal21es( a, x, yri, yir ) {}
#define bli_zddscal21es( a, x, yri, yir ) {}
#define bli_zcdscal21es( a, x, yri, yir ) {}
#define bli_zzdscal21es( a, x, yri, yir ) {}

// -- (axy) = (??c) ------------------------------------------------------------

// NOTE(review): although y is scomplex in this section, yri and yir are
// accessed through bli_zreal/bli_zimag rather than bli_creal/bli_cimag.
// This is harmless only if those accessors are type-agnostic -- confirm
// against their definitions.

#define bli_sscscal21es( a, x, yri, yir ) {}
#define bli_sdcscal21es( a, x, yri, yir ) {}
#define bli_sccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dscscal21es( a, x, yri, yir ) {}
#define bli_ddcscal21es( a, x, yri, yir ) {}
#define bli_dccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cscscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zscscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal21es( a, x, yri, yir ) {}
#define bli_sdzscal21es( a, x, yri, yir ) {}
#define bli_sczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dszscal21es( a, x, yri, yir ) {}
#define bli_ddzscal21es( a, x, yri, yir ) {}
#define bli_dczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cszscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zszscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// Single-character names forward to the all-same-type (ccc / zzz) variants.
#define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir )
#define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir )

#endif
// end bli_scal21es.h
// begin bli_scal2j1es.h

#ifndef BLIS_SCAL2J1ES_H
#define BLIS_SCAL2J1ES_H

// scal2j1es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) #define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) #endif // end bli_scal2j1es.h // 1r // begin bli_copy1rs.h #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copy1rs.h // begin bli_copyj1rs.h #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copyj1rs.h // begin bli_invert1rs.h #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif // end bli_invert1rs.h // begin bli_scal1rs.h #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), 
bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif // end bli_scal1rs.h // begin bli_scal21rs.h #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif // end bli_scal21rs.h // begin bli_scal2j1rs.h #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif // end bli_scal2j1rs.h // 1m (1e or 1r) // begin bli_invert1ms_mxn_diag.h #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t 
i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_invert1ms_mxn_diag.h // begin bli_scal1ms_mxn.h #ifndef BLIS_SCAL1MS_MXN_H #define BLIS_SCAL1MS_MXN_H // scal1ms_mxn #define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; 
\ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #endif // end bli_scal1ms_mxn.h // begin bli_scal21ms_mxn.h #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } 
else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), 
*(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif // end bli_scal21ms_mxn.h // begin bli_scal21ms_mxn_diag.h #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_scal21ms_mxn_diag.h // begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING //thread communicators may be implementation dependent #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_single.h // begin bli_thrcomm_openmp.h #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H // Define thrcomm_t for situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #include // skipped // Define thrcomm_t for tree barriers and non-tree barriers. #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. 
//volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif // end bli_thrcomm_openmp.h // begin bli_thrcomm_pthreads.h #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H // Define thrcomm_t for situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS #ifdef BLIS_USE_PTHREAD_BARRIER struct thrcomm_s { void* sent_object; dim_t n_threads; bli_pthread_barrier_t barrier; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_pthreads.h // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. 
// Declarations only; the definitions are not part of this header section.
// Only bli_thrcomm_barrier() and bli_thrcomm_bcast() carry the
// BLIS_EXPORT_BLIS annotation.
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads );
void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm );
void bli_thrcomm_cleanup( thrcomm_t* comm );
BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );
void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );
#endif
// end bli_thrcomm.h

// Include thread info (thrinfo_t) object definitions and prototypes.
// begin bli_thrinfo.h
#ifndef BLIS_THRINFO_H
#define BLIS_THRINFO_H

// Thread info structure definition
struct thrinfo_s
{
	// The thread communicator for the other threads sharing the same work
	// at this level.
	thrcomm_t* ocomm;

	// Our thread id within the ocomm thread communicator.
	dim_t ocomm_id;

	// The number of distinct threads used to parallelize the loop.
	dim_t n_way;

	// What we're working on.
	dim_t work_id;

	// When freeing, should the communicators in this node be freed? Usually,
	// this field is true, but when nodes are created that share the same
	// communicators as other nodes (such as with packm nodes), this is set
	// to false.
	bool free_comm;

	// The bszid_t to help identify the node. This is mostly only useful when
	// debugging or tracing the allocation and release of thrinfo_t nodes.
	bszid_t bszid;

	// Links to two other thrinfo_s nodes (a "prenode" and a regular
	// sub-node); both are plain pointers read/written by the accessors below.
	struct thrinfo_s* sub_prenode;
	struct thrinfo_s* sub_node;
};
typedef struct thrinfo_s thrinfo_t;

//
// thrinfo_t functions
// NOTE: The naming of these should be made consistent at some point.
// (ie: bli_thrinfo_ vs.
bli_thread_) // // thrinfo_t query (field only) BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) { return (t->ocomm)->n_threads; } BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) { return t->ocomm_id; } BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) { return t->n_way; } BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t ) { return t->work_id; } BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) { return t->ocomm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) { return t->free_comm; } BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) { return t->bszid; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) { return t->sub_node; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) { return t->ocomm_id == 0; } // thrinfo_t modification BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) { t->ocomm = ocomm; } BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) { t->ocomm_id = ocomm_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) { t->n_way = n_way; } BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t ) { t->work_id = work_id; } BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) { t->free_comm = free_comm; } BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) { t->bszid = bszid; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) { t->sub_node = sub_node; } BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) { t->sub_prenode = sub_prenode; } // other thrinfo_t-related functions BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } // // Prototypes 
// for level-3 thrinfo functions not specific to any operation.
//

// NOTE(review): the parameters of bli_thrinfo_create()/bli_thrinfo_init()
// mirror the fields of struct thrinfo_s by name (ocomm, ocomm_id, n_way,
// work_id, free_comm, bszid, sub_node); presumably they initialize those
// fields -- confirm against the definitions, which are not in this header.
thrinfo_t* bli_thrinfo_create
     (
       rntm_t*    rntm,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init
     (
       thrinfo_t* thread,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init_single
     (
       thrinfo_t* thread
     );

void bli_thrinfo_free
     (
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// -----------------------------------------------------------------------------

// Declarations of the functions that take a control-tree node (cntl_t)
// alongside a thrinfo_t; the *_prenode variants share the signatures of
// their non-prenode counterparts.
void bli_thrinfo_grow
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_rgrow
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_rgrow_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

// -----------------------------------------------------------------------------

// Disabled declarations (compiled out with #if 0).
#if 0
void bli_thrinfo_grow_tree
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

void bli_thrinfo_grow_tree_ic
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );
#endif

#endif
// end bli_thrinfo.h
// begin bli_thrinfo_sup.h
#ifndef BLIS_THRINFO_SUP_H
#define BLIS_THRINFO_SUP_H

//
// Prototypes for level-3 thrinfo sup functions.
//

// These take bszid_t pointers where the declarations above take cntl_t
// pointers.
void bli_thrinfo_sup_grow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_sup_rgrow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_sup_create_for_cntl
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_chl,
       thrinfo_t* thread_par
     );

#endif
// end bli_thrinfo_sup.h

// Include some operation-specific thrinfo_t prototypes.
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
// Takes the l3int_t function to run, an operation family id, and the same
// operand/context arguments that l3int_t takes, except for the thrinfo_t.
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// Include definitions specific to the method of multithreading for the
// conventional code path.
// begin bli_l3_decor_single.h

#ifndef BLIS_L3_DECOR_SINGLE_H
#define BLIS_L3_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (Empty: nothing is declared inside this conditional.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif

// end bli_l3_decor_single.h
// begin bli_l3_decor_openmp.h

#ifndef BLIS_L3_DECOR_OPENMP_H
#define BLIS_L3_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

void bli_l3_thread_decorator_thread_check
     (
       dim_t      n_threads,
       dim_t      tid,
       thrcomm_t* gl_comm,
       rntm_t*    rntm
     );

#endif

#endif

// end bli_l3_decor_openmp.h
// begin bli_l3_decor_pthreads.h

#ifndef BLIS_L3_DECOR_PTHREADS_H
#define BLIS_L3_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
// The void* (void*) shape matches the start_routine parameter of
// bli_pthread_create().
void* bli_l3_thread_entry( void* data_void );

#endif

#endif

// end bli_l3_decor_pthreads.h

// Closes the BLIS_L3_DECOR_H include guard opened in bli_l3_decor.h.
#endif

// end bli_l3_decor.h

// Include the level-3 thread decorator and related definitions and prototypes
// for the sup code path.
// begin bli_l3_sup_decor.h

#ifndef BLIS_L3_SUP_DECOR_H
#define BLIS_L3_SUP_DECOR_H

// -- sup definitions ----------------------------------------------------------

// Level-3 sup internal function type.
// Unlike l3int_t, this type returns err_t and takes no cntl_t argument.
typedef err_t (*l3supint_t)
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// Level-3 sup thread decorator prototype.
err_t bli_l3_sup_thread_decorator
     (
       l3supint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     );

// Include definitions specific to the method of multithreading for the
// sup code path.
// begin bli_l3_sup_decor_single.h

#ifndef BLIS_L3_SUP_DECOR_SINGLE_H
#define BLIS_L3_SUP_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (Empty: nothing is declared inside this conditional.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif

// end bli_l3_sup_decor_single.h
// begin bli_l3_sup_decor_openmp.h

#ifndef BLIS_L3_SUP_DECOR_OPENMP_H
#define BLIS_L3_SUP_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
// (Empty: nothing is declared inside this conditional.)
#ifdef BLIS_ENABLE_OPENMP

#endif

#endif

// end bli_l3_sup_decor_openmp.h
// begin bli_l3_sup_decor_pthreads.h

#ifndef BLIS_L3_SUP_DECOR_PTHREADS_H
#define BLIS_L3_SUP_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
void* bli_l3_sup_thread_entry( void* data_void );

#endif

#endif

// end bli_l3_sup_decor_pthreads.h

// Closes the BLIS_L3_SUP_DECOR_H include guard.
#endif

// end bli_l3_sup_decor.h

// Initialization-related prototypes.
void bli_thread_init( void );
void bli_thread_finalize( void );

// Thread range-related prototypes.
// start and end are output parameters (the inline helper
// bli_thread_range_jrir_sl() passes its own output pointers straight
// through to this function).
BLIS_EXPORT_BLIS
void bli_thread_range_sub
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end
     );

// GENPROT is (re)defined twice below as a prototype template. Each
// GENPROT( opname ) line expands to one prototype whose function name is
// produced by PASTEMAC0( opname ) (defined elsewhere in the header).

#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       dir_t      direct, \
       thrinfo_t* thr, \
       obj_t*     a, \
       obj_t*     b, \
       obj_t*     c, \
       cntl_t*    cntl, \
       cntx_t*    cntx, \
       dim_t*     start, \
       dim_t*     end \
     );

GENPROT( thread_range_mdim )
GENPROT( thread_range_ndim )

#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       thrinfo_t* thr, \
       obj_t*     a, \
       blksz_t*   bmult, \
       dim_t*     start, \
       dim_t*     end \
     );

GENPROT( thread_range_l2r )
GENPROT( thread_range_r2l )
GENPROT( thread_range_t2b )
GENPROT( thread_range_b2t )

GENPROT( thread_range_weighted_l2r )
GENPROT( thread_range_weighted_r2l )
GENPROT( thread_range_weighted_t2b )
GENPROT( thread_range_weighted_b2t )

dim_t bli_thread_range_width_l
     (
       doff_t diagoff_j,
       dim_t  m,
       dim_t  n_j,
       dim_t  j,
       dim_t  n_way,
       dim_t  bf,
       dim_t  bf_left,
       double area_per_thr,
       bool   handle_edge_low
     );

siz_t bli_find_area_trap_l
     (
       dim_t  m,
       dim_t  n,
       doff_t diagoff
     );

siz_t bli_thread_range_weighted_sub
     (
       thrinfo_t* restrict thread,
       doff_t              diagoff,
       uplo_t              uplo,
       dim_t               m,
       dim_t               n,
       dim_t               bf,
       bool                handle_edge_low,
       dim_t*     restrict j_start_thr,
       dim_t*     restrict j_end_thr
     );

// -----------------------------------------------------------------------------

// Factorization and partitioning prototypes

// State object passed to bli_prime_factorization() and
// bli_next_prime_factor().
// NOTE(review): field meanings (n = remaining value, sqrt_n = its square
// root, f = current candidate factor) are inferred from the names; confirm
// against the implementation.
typedef struct
{
    dim_t n;
    dim_t sqrt_n;
    dim_t f;
} bli_prime_factors_t;

void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors);

dim_t bli_next_prime_factor(bli_prime_factors_t* factors);
bool bli_is_prime( dim_t n );

// The three bli_thread_partition_2x2*() prototypes share one parameter list;
// nt1 and nt2 are restrict-qualified output pointers.
void bli_thread_partition_2x2
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );
void bli_thread_partition_2x2_slow
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );
void bli_thread_partition_2x2_fast
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

// -----------------------------------------------------------------------------

dim_t bli_gcd( dim_t x, dim_t y );
dim_t bli_lcm( dim_t x, dim_t y );
dim_t bli_ipow( dim_t base, dim_t power );

// -----------------------------------------------------------------------------

// Exported getters/setters (marked BLIS_EXPORT_BLIS). The five *_nt getters
// correspond one-to-one with the jc, pc, ic, jr, ir arguments of
// bli_thread_set_ways().
BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void );

BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir );
BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value );

void bli_thread_init_rntm_from_env( rntm_t* rntm );

// -----------------------------------------------------------------------------

// Round-robin variant: writes start = the thread's work_id, inc = the
// thread's n_way, end = n. The bf and handle_edge_low arguments are unused
// here; they exist so this function has the same signature as the slab
// variant.
BLIS_INLINE void bli_thread_range_jrir_rr
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
    // Use interleaved partitioning of jr/ir loops.
    *start = bli_thread_work_id( thread );
    *inc   = bli_thread_n_way( thread );
    *end   = n;
}

// Slab variant: start and end are filled in by bli_thread_range_sub(), and
// inc is set to 1.
BLIS_INLINE void bli_thread_range_jrir_sl
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
    // Use contiguous slab partitioning of jr/ir loops.
    bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
    *inc = 1;
}

// Dispatches to the slab variant when BLIS_ENABLE_JRIR_SLAB is defined and
// to the round-robin variant otherwise.
BLIS_INLINE void bli_thread_range_jrir
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
    // Define a general-purpose version of bli_thread_range_jrir() whose
    // definition depends on whether slab or round-robin partitioning was
    // requested at configure-time.
#ifdef BLIS_ENABLE_JRIR_SLAB
    bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc );
#else
    bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc );
#endif
}

// Compiled out by the #if 0.
#if 0
BLIS_INLINE void bli_thread_range_weighted_jrir
     (
       thrinfo_t* thread,
       doff_t     diagoff,
       uplo_t     uplo,
       dim_t      m,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
#ifdef BLIS_ENABLE_JRIR_SLAB

    // Use contiguous slab partitioning for jr/ir loops.
    bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf,
                                   handle_edge_low, start, end );

    // Convert start to units of bf (rounding down).
    *start = *start / bf; *inc = 1;

    // Convert end to units of bf, rounding up when there is a remainder.
    if ( *end % bf ) *end = *end / bf + 1;
    else             *end = *end / bf;

#else
    // Use interleaved partitioning of jr/ir loops.
    *start = bli_thread_work_id( thread );
    *inc   = bli_thread_n_way( thread );
    *end   = n;
#endif
}
#endif

// Closes a conditional opened before this point (presumably the
// bli_thread.h include guard; its #ifndef is not visible here).
#endif

// end bli_thread.h
// begin bli_pthread.h

#ifndef BLIS_PTHREAD_H
#define BLIS_PTHREAD_H

// -- Type and macro definitions -----------------------------------------------

#if defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of "dummy" code that doesn't depend on POSIX threads or any other
// threading mechanism. See issue #454 to see the use case that prompted this
// feature.
// NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE!

// -- pthread types --

// Every type is a plain int placeholder in this branch.
typedef int bli_pthread_t;
typedef int bli_pthread_attr_t;
typedef int bli_pthread_mutex_t;
typedef int bli_pthread_mutexattr_t;
typedef int bli_pthread_cond_t;
typedef int bli_pthread_condattr_t;
typedef int bli_pthread_once_t;
typedef int bli_pthread_barrier_t;
typedef int bli_pthread_barrierattr_t;

// -- pthreads macros --

// Static initializers are 0, matching the int placeholder types above.
#define BLIS_PTHREAD_MUTEX_INITIALIZER 0
#define BLIS_PTHREAD_COND_INITIALIZER  0
#define BLIS_PTHREAD_ONCE_INIT         0

#elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of Windows API calls.
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Constant definitions -- // begin bli_extern_defs.h #ifndef BLIS_EXTERN_DEFS_H #define BLIS_EXTERN_DEFS_H BLIS_EXPORT_BLIS extern obj_t BLIS_TWO; BLIS_EXPORT_BLIS extern obj_t BLIS_ONE; //BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO; //BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO; BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif // end bli_extern_defs.h // -- BLIS architecture/kernel definitions -- // begin bli_l1v_ker_prot.h // // Define template prototypes for level-1v kernels. 
//

// Each *_KER_PROT( ctype, ch, opname ) macro below expands to one function
// prototype. ctype is the element type used for the pointer parameters; ch
// and opname are combined by PASTEMAC (defined elsewhere in the header) to
// form the function name.
//
// NOTE: several templates end in ");" followed by a line-continuation
// backslash. That backslash splices the following blank line into the macro
// body, so the blank line after such a template must be preserved.

#define ADDV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// index is an output pointer of type dim_t*.
#define AMAXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t* restrict cntx  \
     ); \

#define AXPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ); \

#define AXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ); \

// Same parameter list as ADDV_KER_PROT.
#define COPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define DOTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     ); \

#define DOTXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     ); \

#define INVERTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     ); \

#define SCALV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     ); \

#define SCAL2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ); \

// Same parameter list as SCALV_KER_PROT.
#define SETV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     ); \

// Same parameter list as ADDV_KER_PROT.
#define SUBV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define SWAPV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ); \

#define XPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ); \

// end bli_l1v_ker_prot.h
// begin bli_l1f_ker_prot.h

//
// Define template prototypes for level-1f kernels.
// #define AXPY2V_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alphax, \ ctype* restrict alphay, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define AXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define DOTAXPYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); // end bli_l1f_ker_prot.h // begin bli_l1m_ker_prot.h // // Define template prototypes for level-1m kernels. 
//

// Each template below expands to one function prototype whose name is formed
// by PASTEMAC( ch, varname/opname ) (defined elsewhere in the header).

// native packm kernels

#define PACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     );

// native unpackm kernels

// Compared with PACKM_KER_PROT, p precedes a and there are no schema, cdim
// or n_max parameters.
#define UNPACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       dim_t            n, \
       ctype*  restrict kappa, \
       ctype*  restrict p,             inc_t ldp, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       cntx_t* restrict cntx  \
     );

// 1e/1r packm kernels

// Parameter list is identical to PACKM_KER_PROT.
#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     );

// end bli_l1m_ker_prot.h
// begin bli_l3_ukr_prot.h

//
// Define template prototypes for level-3 micro-kernels.
//

// GEMM_UKR_PROT is the single-type form: it forwards to GEMM_UKR_PROT2 with
// ctype used for both ctype_in and ctype_out. (GEMM_UKR_PROT2 is defined on
// the next lines; macro bodies are only expanded at the point of use, so the
// forward reference is fine.)
#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)

// ctype_in types the a and b operands; ctype_out types alpha, beta and c.
#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype_out* restrict alpha, \
       ctype_in*  restrict a, \
       ctype_in*  restrict b, \
       ctype_out* restrict beta, \
       ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

#define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a1x, \
       ctype*     restrict a11, \
       ctype*     restrict bx1, \
       ctype*     restrict b11, \
       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

// Unlike the gemm templates, this one takes no m/n/k dimensions.
#define TRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

// end bli_l3_ukr_prot.h
// begin bli_l3_sup_ker_prot.h

//
// Define template prototypes for level-3 kernels on small/unpacked matrices.
//

// a, b and c are each passed with both a row stride and a column stride.
#define GEMMSUP_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

// end bli_l3_sup_ker_prot.h
// begin bli_arch_config_pre.h

#ifndef BLIS_ARCH_CONFIG_PRE_H
#define BLIS_ARCH_CONFIG_PRE_H

// -- Naming-related kernel definitions ----------------------------------------

// The default suffix appended to reference kernels.
#define BLIS_REF_SUFFIX  _ref

// A suffix used for labeling certain induced method aware functions.
#define BLIS_IND_SUFFIX  _ind

// Add an underscore to the BLIS kernel set string, if it was defined.
// (BLIS_CNAME_INFIX is left undefined when BLIS_CNAME is not defined.)
#ifdef  BLIS_CNAME
#define BLIS_CNAME_INFIX  PASTECH(_,BLIS_CNAME)
#endif

// Combine the CNAME and _ref for convenience to the code that defines
// reference kernels.
//#define BLIS_CNAME_REF_SUFFIX  PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX)

// -- Prototype-generating macro definitions -----------------------------------

// Prototype-generating macro for bli_cntx_init_*() functions.
// Expands to three prototypes per archname: one with no suffix, one with
// BLIS_REF_SUFFIX and one with BLIS_IND_SUFFIX. Only the BLIS_IND_SUFFIX
// variant takes an ind_t method argument in addition to the cntx_t pointer.
#define CNTX_INIT_PROTS( archname ) \
\
void PASTEMAC(cntx_init_,archname) \
     ( \
       cntx_t* cntx \
     ); \
void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \
     ( \
       cntx_t* cntx \
     ); \
void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \
     ( \
       ind_t   method, \
       cntx_t* cntx \
     );

#endif
// end bli_arch_config_pre.h
// begin bli_arch_config.h

#ifndef BLIS_ARCH_CONFIG_H
#define BLIS_ARCH_CONFIG_H

//
// -- Context initialization prototypes ----------------------------------------
//
// One CNTX_INIT_PROTS() expansion per sub-configuration, each guarded by the
// corresponding BLIS_CONFIG_* macro.

// -- Intel64 architectures --
#ifdef BLIS_CONFIG_SKX
CNTX_INIT_PROTS( skx )
#endif
#ifdef BLIS_CONFIG_KNL
CNTX_INIT_PROTS( knl )
#endif
#ifdef BLIS_CONFIG_KNC
CNTX_INIT_PROTS( knc )
#endif
#ifdef BLIS_CONFIG_HASWELL
CNTX_INIT_PROTS( haswell )
#endif
#ifdef BLIS_CONFIG_SANDYBRIDGE
CNTX_INIT_PROTS( sandybridge )
#endif
#ifdef BLIS_CONFIG_PENRYN
CNTX_INIT_PROTS( penryn )
#endif

// -- AMD64 architectures --
#ifdef BLIS_CONFIG_ZEN3
CNTX_INIT_PROTS( zen3 )
#endif
#ifdef BLIS_CONFIG_ZEN2
CNTX_INIT_PROTS( zen2 )
#endif
#ifdef BLIS_CONFIG_ZEN
CNTX_INIT_PROTS( zen )
#endif
#ifdef BLIS_CONFIG_EXCAVATOR
CNTX_INIT_PROTS( excavator )
#endif
#ifdef BLIS_CONFIG_STEAMROLLER
CNTX_INIT_PROTS( steamroller )
#endif
#ifdef BLIS_CONFIG_PILEDRIVER
CNTX_INIT_PROTS( piledriver )
#endif
#ifdef BLIS_CONFIG_BULLDOZER
CNTX_INIT_PROTS( bulldozer )
#endif

// -- ARM architectures --
#ifdef BLIS_CONFIG_ARMSVE
CNTX_INIT_PROTS( armsve )
#endif
#ifdef BLIS_CONFIG_A64FX
CNTX_INIT_PROTS( a64fx )
#endif
#ifdef BLIS_CONFIG_FIRESTORM
CNTX_INIT_PROTS( firestorm )
#endif
#ifdef BLIS_CONFIG_THUNDERX2
CNTX_INIT_PROTS( thunderx2 )
#endif
#ifdef BLIS_CONFIG_CORTEXA57
CNTX_INIT_PROTS( cortexa57 )
#endif
#ifdef BLIS_CONFIG_CORTEXA53
CNTX_INIT_PROTS( cortexa53 )
#endif
#ifdef BLIS_CONFIG_CORTEXA15
CNTX_INIT_PROTS( cortexa15 )
#endif
#ifdef BLIS_CONFIG_CORTEXA9
CNTX_INIT_PROTS( cortexa9 )
#endif

// -- IBM Power --
#ifdef BLIS_CONFIG_POWER10
CNTX_INIT_PROTS( power10 )
#endif
#ifdef BLIS_CONFIG_POWER9
CNTX_INIT_PROTS( power9 )
#endif
#ifdef BLIS_CONFIG_POWER7
CNTX_INIT_PROTS( power7 )
#endif

// -- IBM BG/Q --
#ifdef BLIS_CONFIG_BGQ
CNTX_INIT_PROTS( bgq )
#endif

// -- Generic --
#ifdef BLIS_CONFIG_GENERIC
CNTX_INIT_PROTS( generic )
#endif

//
// -- Architecture family-specific headers -------------------------------------
//
// Each header is included only when its BLIS_FAMILY_* macro is defined. The
// "skipped" markers are kept as found; presumably they record headers that
// were left as real #include directives rather than inlined when this
// monolithic header was generated.

// -- x86_64 families --
#ifdef BLIS_FAMILY_INTEL64
#include "bli_family_intel64.h" // skipped
#endif
#ifdef BLIS_FAMILY_AMD64
#include "bli_family_amd64.h" // skipped
#endif
#ifdef BLIS_FAMILY_AMD64_LEGACY
#include "bli_family_amd64_legacy.h" // skipped
#endif
#ifdef BLIS_FAMILY_X86_64
#include "bli_family_x86_64.h" // skipped
#endif
#ifdef BLIS_FAMILY_X86_64_NO_SKX
#include "bli_family_x86_64_no_skx.h" // skipped
#endif
#ifdef BLIS_FAMILY_X86_64_NO_ZEN2
#include "bli_family_x86_64_no_zen2.h" // skipped
#endif
#ifdef BLIS_FAMILY_X86_64_NO_ZEN3
#include "bli_family_x86_64_no_zen3.h" // skipped
#endif

// -- Intel64 architectures --
#ifdef BLIS_FAMILY_SKX
#include "bli_family_skx.h" // skipped
#endif
#ifdef BLIS_FAMILY_KNL
#include "bli_family_knl.h" // skipped
#endif
#ifdef BLIS_FAMILY_KNC
#include "bli_family_knc.h" // skipped
#endif
#ifdef BLIS_FAMILY_HASWELL
#include "bli_family_haswell.h" // skipped
#endif
#ifdef BLIS_FAMILY_SANDYBRIDGE
#include "bli_family_sandybridge.h" // skipped
#endif
#ifdef BLIS_FAMILY_PENRYN
#include "bli_family_penryn.h" // skipped
#endif

// -- AMD64 architectures --
#ifdef BLIS_FAMILY_ZEN3
#include "bli_family_zen3.h" // skipped
#endif
#ifdef BLIS_FAMILY_ZEN2
#include "bli_family_zen2.h" // skipped
#endif
#ifdef BLIS_FAMILY_ZEN
#include "bli_family_zen.h" // skipped
#endif
#ifdef BLIS_FAMILY_EXCAVATOR
#include "bli_family_excavator.h" // skipped
#endif
#ifdef BLIS_FAMILY_STEAMROLLER
#include "bli_family_steamroller.h" // skipped
#endif
#ifdef BLIS_FAMILY_PILEDRIVER
#include "bli_family_piledriver.h" // skipped
#endif
#ifdef BLIS_FAMILY_BULLDOZER
#include "bli_family_bulldozer.h" // skipped
#endif

// -- ARM families --
#ifdef BLIS_FAMILY_ARM64
#include "bli_family_arm64.h" // skipped
#endif
#ifdef BLIS_FAMILY_ARM32
#include "bli_family_arm32.h" // skipped
#endif

// -- ARM architectures --
#ifdef BLIS_FAMILY_ARMSVE
#include "bli_family_armsve.h" // skipped
#endif
#ifdef BLIS_FAMILY_A64FX
#include "bli_family_a64fx.h" // skipped
#endif
#ifdef BLIS_FAMILY_FIRESTORM
#include "bli_family_firestorm.h" // skipped
#endif
#ifdef BLIS_FAMILY_THUNDERX2
#include "bli_family_thunderx2.h" // skipped
#endif
#ifdef BLIS_FAMILY_CORTEXA57
#include "bli_family_cortexa57.h" // skipped
#endif
#ifdef BLIS_FAMILY_CORTEXA53
#include "bli_family_cortexa53.h" // skipped
#endif
#ifdef BLIS_FAMILY_CORTEXA15
#include "bli_family_cortexa15.h" // skipped
#endif
#ifdef BLIS_FAMILY_CORTEXA9
#include "bli_family_cortexa9.h" // skipped
#endif

// -- IBM Power --
#ifdef BLIS_FAMILY_POWER10
#include "bli_family_power10.h" // skipped
#endif
#ifdef BLIS_FAMILY_POWER9
#include "bli_family_power9.h" // skipped
#endif
#ifdef BLIS_FAMILY_POWER7
#include "bli_family_power7.h" // skipped
#endif

// -- IBM BG/Q --
#ifdef BLIS_FAMILY_BGQ
#include "bli_family_bgq.h" // skipped
#endif

// -- Generic --
// The generic family header is inlined here; it contains only commented-out
// guard lines, so this conditional contributes no active code.
#ifdef BLIS_FAMILY_GENERIC
// begin bli_family_generic.h

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H

//#endif

// end bli_family_generic.h
#endif

//
// -- kernel set prototypes ----------------------------------------------------
//
// Each header is included only when its BLIS_KERNELS_* macro is defined.

// -- Intel64 architectures --
#ifdef BLIS_KERNELS_SKX
#include "bli_kernels_skx.h" // skipped
#endif
#ifdef BLIS_KERNELS_KNL
#include "bli_kernels_knl.h" // skipped
#endif
#ifdef BLIS_KERNELS_KNC
#include "bli_kernels_knc.h" // skipped
#endif
#ifdef BLIS_KERNELS_HASWELL
#include "bli_kernels_haswell.h" // skipped
#endif
#ifdef BLIS_KERNELS_SANDYBRIDGE
#include "bli_kernels_sandybridge.h" // skipped
#endif
#ifdef BLIS_KERNELS_PENRYN
#include "bli_kernels_penryn.h" // skipped
#endif

// -- AMD64 architectures --
#ifdef BLIS_KERNELS_ZEN2
#include "bli_kernels_zen2.h" // skipped
#endif
#ifdef BLIS_KERNELS_ZEN
#include "bli_kernels_zen.h" // skipped
#endif
//#ifdef BLIS_KERNELS_EXCAVATOR
//#include "bli_kernels_excavator.h"
//#endif
//#ifdef BLIS_KERNELS_STEAMROLLER
//#include "bli_kernels_steamroller.h"
//#endif
#ifdef BLIS_KERNELS_PILEDRIVER
#include "bli_kernels_piledriver.h" // skipped
#endif
#ifdef BLIS_KERNELS_BULLDOZER
#include "bli_kernels_bulldozer.h" // skipped
#endif

// -- ARM architectures --
#ifdef BLIS_KERNELS_ARMSVE
#include "bli_kernels_armsve.h" // skipped
#endif
#ifdef BLIS_KERNELS_ARMV8A
#include "bli_kernels_armv8a.h" // skipped
#endif
#ifdef BLIS_KERNELS_ARMV7A
#include "bli_kernels_armv7a.h" // skipped
#endif

// -- IBM Power --
#ifdef BLIS_KERNELS_POWER10
#include "bli_kernels_power10.h" // skipped
#endif
#ifdef BLIS_KERNELS_POWER9
#include "bli_kernels_power9.h" // skipped
#endif
#ifdef BLIS_KERNELS_POWER7
#include "bli_kernels_power7.h" // skipped
#endif

// -- IBM BG/Q --
#ifdef BLIS_KERNELS_BGQ
#include "bli_kernels_bgq.h" // skipped
#endif

#endif

// end bli_arch_config.h
// begin bli_kernel_macro_defs.h

#ifndef BLIS_KERNEL_MACRO_DEFS_H
#define BLIS_KERNEL_MACRO_DEFS_H

// -- Define default threading parameters --------------------------------------
//
// Every default below is wrapped in #ifndef, so a value defined earlier
// (e.g. by a family header) takes precedence.

// -- Conventional (large code path) values --

// These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n
// dimensions for the purposes of factorizing the total number of threads into
// ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these
// macros are used.
#ifndef BLIS_THREAD_RATIO_M
#define BLIS_THREAD_RATIO_M     1
#endif

#ifndef BLIS_THREAD_RATIO_N
#define BLIS_THREAD_RATIO_N     1
#endif

// These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of
// parallelism allowed when performing automatic factorization. See bli_rntm.c
// to see how these macros are used.
// Default ceilings for the ir and jr loops. Each value is only a default:
// it takes effect when the macro was not already defined before this point.
#ifndef BLIS_THREAD_MAX_IR
#define BLIS_THREAD_MAX_IR 1
#endif

#ifndef BLIS_THREAD_MAX_JR
#define BLIS_THREAD_MAX_JR 4
#endif

// The sup-specific threading defaults below are compiled out.
#if 0
// -- Skinny/small possibly-unpacked (sup code path) values --

#ifndef BLIS_THREAD_SUP_RATIO_M
#define BLIS_THREAD_SUP_RATIO_M 1
#endif

#ifndef BLIS_THREAD_SUP_RATIO_N
#define BLIS_THREAD_SUP_RATIO_N 2
#endif

#ifndef BLIS_THREAD_SUP_MAX_IR
#define BLIS_THREAD_SUP_MAX_IR 1
#endif

#ifndef BLIS_THREAD_SUP_MAX_JR
#define BLIS_THREAD_SUP_MAX_JR 8
#endif
#endif


// -- Memory allocation --------------------------------------------------------

// hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with
// libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND
// was explicitly defined.
#ifdef BLIS_DISABLE_MEMKIND
#undef BLIS_ENABLE_MEMKIND
#endif
#ifdef BLIS_ENABLE_MEMKIND
// The header name was missing from this directive, which made it an invalid
// #include whenever memkind is enabled. hbw_malloc()/hbw_free() are used as
// the pool allocator defaults below, so the header must be named here.
#include <hbwmalloc.h> // skipped
#endif

// Memory allocation functions. These macros define the three types of
// malloc()-style functions, and their free() counterparts: one for each
// type of memory to be allocated.
// NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING
// THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc()
// and free():
//
//   void* malloc( size_t size );
//   void  free( void* p );
//

// This allocation function is called to allocate memory for blocks within
// BLIS's internal memory pools.
#ifndef BLIS_MALLOC_POOL
  // If use of libmemkind was enabled at configure-time, the default
  // memory allocation function for memory pools should be hbw_malloc()
  // instead of malloc().
  #ifdef BLIS_ENABLE_MEMKIND
  #define BLIS_MALLOC_POOL hbw_malloc
  #else
  #define BLIS_MALLOC_POOL malloc
  #endif
#endif

#ifndef BLIS_FREE_POOL
  // If use of libmemkind was enabled at configure-time, the default
  // memory deallocation function for memory pools should be hbw_free()
  // instead of free().
  #ifdef BLIS_ENABLE_MEMKIND
  #define BLIS_FREE_POOL hbw_free
  #else
  #define BLIS_FREE_POOL free
  #endif
#endif

// This allocation function is called to allocate memory for internally-
// used objects and structures, such as control tree nodes.
#ifndef BLIS_MALLOC_INTL
#define BLIS_MALLOC_INTL malloc
#endif

#ifndef BLIS_FREE_INTL
#define BLIS_FREE_INTL free
#endif

// This allocation function is called to allocate memory for objects
// created by user-level API functions, such as bli_obj_create().
#ifndef BLIS_MALLOC_USER
#define BLIS_MALLOC_USER malloc
#endif

#ifndef BLIS_FREE_USER
#define BLIS_FREE_USER free
#endif


// -- Other system-related definitions -----------------------------------------

// Size of a virtual memory page. This is used to align blocks within the
// memory pools.
#ifndef BLIS_PAGE_SIZE
#define BLIS_PAGE_SIZE 4096
#endif

// The maximum number of named SIMD vector registers available for use.
// When configuring with umbrella configuration families, this should be
// set to the maximum number of registers across all sub-configurations in
// the family.
#ifndef BLIS_SIMD_MAX_NUM_REGISTERS
#define BLIS_SIMD_MAX_NUM_REGISTERS 32
#endif

// The maximum size (in bytes) of each SIMD vector.
// When configuring with umbrella configuration families, this should be
// set to the maximum SIMD size across all sub-configurations in the family.
#ifndef BLIS_SIMD_MAX_SIZE
#define BLIS_SIMD_MAX_SIZE 64
#endif

// Alignment size (in bytes) needed by the instruction set for aligned
// SIMD/vector instructions.
#ifndef BLIS_SIMD_ALIGN_SIZE
#define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_MAX_SIZE
#endif

// The maximum size in bytes of local stack buffers within macro-kernel
// functions. These buffers are usually used to store a temporary copy
// of a single microtile. The reason we multiply by 2 is to handle induced
// methods, where we use real domain register blocksizes in units of
// complex elements. Specifically, the macro-kernels will need this larger
// micro-tile footprint, even though the virtual micro-kernels will only
// ever be writing to half (real or imaginary part) at a time.
#ifndef BLIS_STACK_BUF_MAX_SIZE
#define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_MAX_NUM_REGISTERS * \
                                  BLIS_SIMD_MAX_SIZE * 2 )
#endif

// Alignment size used to align local stack buffers within macro-kernel
// functions.
#ifndef BLIS_STACK_BUF_ALIGN_SIZE
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when allocating memory via BLIS_MALLOC_USER.
// To disable heap alignment, set this to 1.
#ifndef BLIS_HEAP_ADDR_ALIGN_SIZE
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when sizing leading dimensions of memory allocated
// via BLIS_MALLOC_USER.
#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment sizes used when allocating blocks to the internal memory
// pool, via BLIS_MALLOC_POOL.
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A
#define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B
#define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C
#define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN
#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE
#endif

// Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*.
// Each offset defaults to 0 unless it was already defined before this point.
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A
#define BLIS_POOL_ADDR_OFFSET_SIZE_A 0
#endif

#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B
#define BLIS_POOL_ADDR_OFFSET_SIZE_B 0
#endif

#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C
#define BLIS_POOL_ADDR_OFFSET_SIZE_C 0
#endif

#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN
#define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0
#endif


#endif

// end bli_kernel_macro_defs.h


// -- Base operation prototypes --

// begin bli_init.h


// Only bli_init() and bli_finalize() carry the BLIS_EXPORT_BLIS annotation;
// the remaining init/finalize prototypes are declared without it.
BLIS_EXPORT_BLIS void bli_init( void );
BLIS_EXPORT_BLIS void bli_finalize( void );

void bli_init_auto( void );
void bli_finalize_auto( void );

void bli_init_apis( void );
void bli_finalize_apis( void );

void bli_init_once( void );
void bli_finalize_once( void );

// end bli_init.h
// begin bli_malloc.h


// Typedef function pointer types for malloc() and free() substitutes.
//typedef void* (*malloc_ft) ( size_t size );
//typedef void  (*free_ft)   ( void*  p    );

// -----------------------------------------------------------------------------

// The pool allocator prototypes are compiled out.
#if 0
BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size );
BLIS_EXPORT_BLIS void  bli_free_pool( void* p );
#endif

void* bli_malloc_intl( size_t size, err_t* r_val );
void* bli_calloc_intl( size_t size, err_t* r_val );
void  bli_free_intl( void* p );

BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val );
BLIS_EXPORT_BLIS void  bli_free_user( void* p );

// -----------------------------------------------------------------------------

// Variants that take the allocation/free function as an explicit argument.
void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val );
void  bli_ffree_align( free_ft f, void* p );

void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val );
void  bli_ffree_noalign( free_ft f, void* p );

void  bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size );
void  bli_fmalloc_post_check( void* p );

// end bli_malloc.h
// begin bli_const.h


void bli_const_init( void );
void bli_const_finalize( void );

// end bli_const.h
// begin bli_obj.h


// begin bli_obj_check.h


void bli_obj_create_check( num_t  dt,
                           dim_t  m,
                           dim_t  n,
                           inc_t  rs,
                           inc_t  cs,
                           obj_t* obj );

void bli_obj_create_without_buffer_check( num_t  dt,
                                          dim_t  m,
                                          dim_t  n,
                                          obj_t* obj );

void bli_obj_alloc_buffer_check( inc_t  rs,
                                 inc_t  cs,
                                 inc_t  is,
                                 obj_t* obj );

void bli_obj_attach_buffer_check( void*  p,
                                  inc_t  rs,
                                  inc_t  cs,
                                  inc_t  is,
                                  obj_t* obj );

void bli_obj_create_scalar_check( num_t  dt,
                                  obj_t* obj );

void bli_obj_free_check( obj_t* obj );

void bli_obj_create_const_check( double value, obj_t* obj );

void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b );

void bli_dt_size_check( num_t dt );

void bli_dt_string_check( num_t dt );

void bli_dt_union_check( num_t dt1, num_t dt2 );

void bli_obj_print_check( char* label, obj_t* obj );

// end bli_obj_check.h

BLIS_EXPORT_BLIS void bli_obj_create
     (
       num_t  dt,
       dim_t  m,
       dim_t  n,
       inc_t  rs,
       inc_t  cs,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer
     (
       num_t  dt,
       dim_t  m,
       dim_t  n,
       void*  p,
       inc_t  rs,
       inc_t  cs,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_without_buffer
     (
       num_t  dt,
       dim_t  m,
       dim_t  n,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_alloc_buffer
     (
       inc_t  rs,
       inc_t  cs,
       inc_t  is,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_attach_buffer
     (
       void*  p,
       inc_t  rs,
       inc_t  cs,
       inc_t  is,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_1x1
     (
       num_t  dt,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer
     (
       num_t  dt,
       void*  p,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_conf_to
     (
       obj_t* s,
       obj_t* d
     );

BLIS_EXPORT_BLIS void bli_obj_free
     (
       obj_t* obj
     );

void bli_adjust_strides
     (
       dim_t  m,
       dim_t  n,
       siz_t  elem_size,
       inc_t* rs,
       inc_t* cs,
       inc_t* is
     );

BLIS_EXPORT_BLIS siz_t bli_dt_size
     (
       num_t dt
     );

BLIS_EXPORT_BLIS char* bli_dt_string
     (
       num_t dt
     );

BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult
     (
       dim_t dim,
       dim_t dim_mult
     );

BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size
     (
       dim_t dim,
       siz_t elem_size,
       siz_t align_size
     );

BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size
     (
       void*  p,
       size_t align_size
     );

BLIS_EXPORT_BLIS void bli_obj_print
     (
       char*  label,
       obj_t* obj
     );

// end bli_obj.h
// begin bli_obj_scalar.h


BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached
     (
       num_t  dt,
       obj_t* beta
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of
     (
       num_t  dt,
       conj_t conj,
       obj_t* alpha,
       obj_t* beta
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_detach
     (
       obj_t* a,
       obj_t* alpha
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_attach
     (
       conj_t conj,
       obj_t* alpha,
       obj_t* a
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to
     (
       num_t  dt,
       obj_t* a
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar
     (
       obj_t* alpha,
       obj_t* a
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_reset
     (
       obj_t* a
     );

BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag
     (
       obj_t* a
     );

BLIS_EXPORT_BLIS bool bli_obj_scalar_equals
     (
       obj_t* a,
       obj_t* beta
     );

// end bli_obj_scalar.h
// begin bli_blksz.h


// blksz_t query

// Return the entry of b's v[] array for datatype dt (the "def" value).
BLIS_INLINE dim_t bli_blksz_get_def
     (
       num_t    dt,
       blksz_t* b
     )
{
	return b->v[ dt ];
}

// Return the entry of b's e[] array for datatype dt (the "max" value).
BLIS_INLINE dim_t bli_blksz_get_max
     (
       num_t    dt,
       blksz_t* b
     )
{
	return b->e[ dt ];
}


// blksz_t modification

// Store val into b's v[] array at index dt.
BLIS_INLINE void bli_blksz_set_def
     (
       dim_t    val,
       num_t    dt,
       blksz_t* b
     )
{
	b->v[ dt ] = val;
}

// Store val into b's e[] array at index dt.
BLIS_INLINE void bli_blksz_set_max
     (
       dim_t    val,
       num_t    dt,
       blksz_t* b
     )
{
	b->e[ dt ] = val;
}

// Copy the whole blksz_t structure from b_src to b_dst by assignment.
BLIS_INLINE void bli_blksz_copy
     (
       blksz_t* b_src,
       blksz_t* b_dst
     )
{
	*b_dst = *b_src;
}

// Copy only the strictly positive def/max values of b_src into b_dst, for
// each of the four datatypes; entries of b_dst whose source value is zero
// or negative are left untouched.
BLIS_INLINE void bli_blksz_copy_if_pos
     (
       blksz_t* b_src,
       blksz_t* b_dst
     )
{
	// Copy the blocksize values over to b_dst one-by-one so that
	// we can skip the ones that are non-positive.

	// Read all eight source values first ...
	const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT,    b_src );
	const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE,   b_src );
	const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src );
	const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src );

	const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT,    b_src );
	const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE,   b_src );
	const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src );
	const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src );

	// ... then write back only those that are greater than zero.
	if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT,    b_dst );
	if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE,   b_dst );
	if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst );
	if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst );

	if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT,    b_dst );
	if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE,   b_dst );
	if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst );
	if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst );
}

// Copy the def value for datatype dt_src in b_src to datatype dt_dst in b_dst.
BLIS_INLINE void bli_blksz_copy_def_dt
     (
       num_t dt_src, blksz_t* b_src,
       num_t dt_dst, blksz_t* b_dst
     )
{
	const dim_t val = bli_blksz_get_def( dt_src, b_src );

	bli_blksz_set_def( val, dt_dst, b_dst );
}

// Copy the max value for datatype dt_src in b_src to datatype dt_dst in b_dst.
BLIS_INLINE void bli_blksz_copy_max_dt
     (
       num_t dt_src, blksz_t* b_src,
       num_t dt_dst, blksz_t* b_dst
     )
{
	const dim_t val = bli_blksz_get_max( dt_src, b_src );

	bli_blksz_set_max( val, dt_dst, b_dst );
}

// Copy both the def and the max value from (dt_src, b_src) to (dt_dst, b_dst).
BLIS_INLINE void bli_blksz_copy_dt
     (
       num_t dt_src, blksz_t* b_src,
       num_t dt_dst, blksz_t* b_dst
     )
{
	bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst );
	bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst );
}

// Replace the def value for dt with ( val * num ) / den; the multiplication
// is performed before the division, in dim_t arithmetic.
BLIS_INLINE void bli_blksz_scale_def
     (
       dim_t    num,
       dim_t    den,
       num_t    dt,
       blksz_t* b
     )
{
	const dim_t val = bli_blksz_get_def( dt, b );

	bli_blksz_set_def( ( val * num ) / den, dt, b );
}

// Replace the max value for dt with ( val * num ) / den; the multiplication
// is performed before the division, in dim_t arithmetic.
BLIS_INLINE void bli_blksz_scale_max
     (
       dim_t    num,
       dim_t    den,
       num_t    dt,
       blksz_t* b
     )
{
	const dim_t val = bli_blksz_get_max( dt, b );

	bli_blksz_set_max( ( val * num ) / den, dt, b );
}

// Apply the same num/den scaling to both the def and the max value for dt.
BLIS_INLINE void bli_blksz_scale_def_max
     (
       dim_t    num,
       dim_t    den,
       num_t    dt,
       blksz_t* b
     )
{
	bli_blksz_scale_def( num, den, dt, b );
	bli_blksz_scale_max( num, den, dt, b );
}

// -----------------------------------------------------------------------------

// NOTE(review): the _ed variants take their arguments interleaved per
// datatype (b_s, be_s, b_d, be_d, ...), while the plain variants take all
// b_* first and then all be_*. Mind the argument order at call sites.
BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed
     (
       dim_t b_s, dim_t be_s,
       dim_t b_d, dim_t be_d,
       dim_t b_c, dim_t be_c,
       dim_t b_z, dim_t be_z
     );

BLIS_EXPORT_BLIS blksz_t* bli_blksz_create
     (
       dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z,
       dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
     );

BLIS_EXPORT_BLIS void bli_blksz_init_ed
     (
       blksz_t* b,
       dim_t    b_s, dim_t be_s,
       dim_t    b_d, dim_t be_d,
       dim_t    b_c, dim_t be_c,
       dim_t    b_z, dim_t be_z
     );

BLIS_EXPORT_BLIS void bli_blksz_init
     (
       blksz_t* b,
       dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z,
       dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z
     );

BLIS_EXPORT_BLIS void bli_blksz_init_easy
     (
       blksz_t* b,
       dim_t b_s,  dim_t b_d,  dim_t b_c,  dim_t b_z
     );

BLIS_EXPORT_BLIS void bli_blksz_free
     (
       blksz_t* b
     );

// -----------------------------------------------------------------------------

// The combined reduce prototype is compiled out; only the separate def/max
// variants below are declared.
#if 0
BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to
     (
       num_t dt_bm, blksz_t* bmult,
       num_t dt_bs, blksz_t* blksz
     );
#endif

void bli_blksz_reduce_def_to
     (
       num_t dt_bm, blksz_t* bmult,
       num_t dt_bs, blksz_t* blksz
     );

void bli_blksz_reduce_max_to
     (
       num_t dt_bm, blksz_t* bmult,
       num_t dt_bs, blksz_t* blksz
     );
// -----------------------------------------------------------------------------

dim_t bli_determine_blocksize
     (
       dir_t   direct,
       dim_t   i,
       dim_t   dim,
       obj_t*  obj,
       bszid_t bszid,
       cntx_t* cntx
     );

dim_t bli_determine_blocksize_f
     (
       dim_t   i,
       dim_t   dim,
       obj_t*  obj,
       bszid_t bszid,
       cntx_t* cntx
     );

dim_t bli_determine_blocksize_b
     (
       dim_t   i,
       dim_t   dim,
       obj_t*  obj,
       bszid_t bszid,
       cntx_t* cntx
     );

dim_t bli_determine_blocksize_f_sub
     (
       dim_t  i,
       dim_t  dim,
       dim_t  b_alg,
       dim_t  b_max
     );

dim_t bli_determine_blocksize_b_sub
     (
       dim_t  i,
       dim_t  dim,
       dim_t  b_alg,
       dim_t  b_max
     );

// end bli_blksz.h
// begin bli_func.h


// -----------------------------------------------------------------------------
// func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); // end bli_func.h // begin bli_mbool.h // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // ----------------------------------------------------------------------------- mbool_t* bli_mbool_create ( bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_init ( mbool_t* b, bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_free( mbool_t* b ); // end bli_mbool.h // begin bli_cntx.h #ifndef BLIS_CNTX_H #define BLIS_CNTX_H // Context object type (defined in bli_type_defs.h) // ----------------------------------------------------------------------------- // // -- cntx_t query (fields only) ----------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx ) { return cntx->blkszs; } BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) { return cntx->bmults; } 
BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) { return cntx->l3_vir_ukrs; } BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs; } BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs_prefs; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) { return cntx->l3_sup_thresh; } BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) { return cntx->l3_sup_blkszs; } BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) { return cntx->l3_sup_kers; } BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) { return cntx->l3_sup_kers_prefs; } BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) { return cntx->l1f_kers; } BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) { return cntx->l1v_kers; } BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) { return cntx->packm_kers; } BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) { return cntx->unpackm_kers; } BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) { return cntx->method; } // ----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. 
return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // 
----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. 
return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only 
index to the requested packm func_t if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* funcs = bli_cntx_packm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the packm func_t (and then extract the // datatype-specific function pointer) if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested unpackm func_t if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the unpackm func_t (and then extract the // datatype-specific function pointer) if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. 
return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. // NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. 
// NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } #if 0 // NOTE: These static functions aren't needed yet. 
BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { const num_t dt = bli_obj_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); } #endif // ----------------------------------------------------------------------------- // // -- cntx_t modification (complex) -------------------------------------------- // // NOTE: The framework does not use any of the following functions. We provide // them in order to facilitate creating/modifying custom contexts. 
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif // end bli_rntm.h // begin bli_gks.h #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* 
bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif // end bli_gks.h // begin bli_ind.h #ifndef BLIS_IND_H #define BLIS_IND_H // level-3 induced method management // begin bli_l3_ind.h #ifndef BLIS_L3_IND_H #define BLIS_L3_IND_H // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- //bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ); void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ); void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ); bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ); #endif // end bli_l3_ind.h void bli_ind_init( void ); void bli_ind_finalize( void ); BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t 
method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); char* bli_ind_get_impl_string( ind_t method ); num_t bli_ind_map_cdt_to_index( num_t dt ); #endif // end bli_ind.h // begin bli_pba.h #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H // Packing block allocator (formerly memory broker) // pba init //BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ // bli_pthread_mutex_init( &(pba->mutex), NULL ); //} //BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ // bli_pthread_mutex_destroy( &(pba->mutex) ); //} // pba query BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { return &(pba->pools[ pool_index ]); } BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { return pba->align_size; } BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { return pba->malloc_fp; } BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { return pba->free_fp; } // pba modification BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { pba->align_size = align_size; } BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { pba->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { pba->free_fp = free_fp; } // pba action BLIS_INLINE void bli_pba_lock( pba_t* pba ) { bli_pthread_mutex_lock( &(pba->mutex) ); } BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( cntx_t* cntx ); void bli_pba_finalize ( void ); 
void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif // end bli_pba.h // begin bli_pool.h #ifndef BLIS_POOL_H #define BLIS_POOL_H // -- Pool block type -- // -- Pool type -- // Pool block query BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) { return pblk->buf; } BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) { return pblk->block_size; } // Pool block modification BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk ) { pblk->buf = buf; } BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) { pblk->block_size = block_size; } // // -- pool block initialization ------------------------------------------------ // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the pblk_t type definition. An alternative to the initializer is // calling bli_pblk_clear() at runtime. 
#define BLIS_PBLK_INITIALIZER \ { \ .buf = NULL, \ .block_size = 0, \ } \ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); bli_pblk_set_block_size( 0, pblk ); } // Pool entry query BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) { return pool->block_size; } BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) { return pool->align_size; } BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) { return pool->offset_size; } BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) { return pool->malloc_fp; } BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) { return pool->free_fp; } BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); } // Pool entry modification BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ { pool->block_size = block_size; } BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ { pool->align_size = align_size; } BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ { pool->offset_size = offset_size; } BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ { pool->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* 
pool ) \ { pool->free_fp = free_fp; } BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } // ----------------------------------------------------------------------------- void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ); void bli_pool_finalize ( pool_t* restrict pool ); void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ); void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ); void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ); void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ); void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ); void bli_pool_print ( pool_t* restrict pool ); void bli_pblk_print ( pblk_t* restrict pblk ); #endif // end bli_pool.h // begin bli_array.h #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // 
----------------------------------------------------------------------------- void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ); void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ); void bli_array_finalize ( array_t* restrict array ); void* bli_array_elem ( const siz_t index, array_t* restrict array ); void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ); #endif // end bli_array.h // begin bli_apool.h #ifndef BLIS_APOOL_H #define BLIS_APOOL_H // -- Locked pool-of-arrays type -- // apool entry query BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool ) { return &(apool->pool); } BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) { return &(apool->mutex); } BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) { return pool->def_array_len; } BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) { pool_t* restrict pool = bli_apool_pool( apool ); return bli_pool_is_exhausted( pool ); } // apool action BLIS_INLINE void bli_apool_lock( apool_t* apool ) { bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); } BLIS_INLINE void bli_apool_unlock( apool_t* apool ) { bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); } // apool entry modification BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ { pool->def_array_len = def_array_len; } // ----------------------------------------------------------------------------- void bli_apool_init ( apool_t* restrict apool ); void bli_apool_finalize ( apool_t* restrict apool ); array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ); void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ); pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ); void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool ); void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ); void bli_apool_free_block 
( array_t* restrict array ); #endif // end bli_apool.h // begin bli_sba.h #ifndef BLIS_SBA_H #define BLIS_SBA_H apool_t* bli_sba_query( void ); // ----------------------------------------------------------------------------- void bli_sba_init( void ); void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( const siz_t n_threads ); void bli_sba_checkin_array ( array_t* restrict array ); void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm ); void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size ); void bli_sba_release ( rntm_t* restrict rntm, void* restrict block ); #endif // end bli_sba.h // begin bli_memsys.h #ifndef BLIS_MEMSYS_H #define BLIS_MEMSYS_H // ----------------------------------------------------------------------------- void bli_memsys_init( void ); void bli_memsys_finalize( void ); #endif // end bli_memsys.h // begin bli_mem.h #ifndef BLIS_MEM_H #define BLIS_MEM_H // mem_t object type (defined in bli_type_defs.h) // // -- mem_t query -------------------------------------------------------------- // BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) { return &(mem->pblk); } BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) { return bli_pblk_buf( bli_mem_pblk( mem ) ); } BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) { return mem->buf_type; } BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) { return mem->pool; } BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) { return mem->size; } BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); } // // -- mem_t modification ------------------------------------------------------- // BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem ) { mem->pblk = *pblk; } BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem ) { bli_pblk_set_buf( buf, &(mem->pblk) ); } BLIS_INLINE void bli_mem_set_buf_type( packbuf_t 
buf_type, mem_t* mem ) { mem->buf_type = buf_type; } BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem ) { mem->pool = pool; } BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; } // // -- mem_t initialization ----------------------------------------------------- // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the mem_t type definition. An alternative to the initializer is // calling bli_mem_clear() at runtime. #define BLIS_MEM_INITIALIZER \ { \ .pblk = BLIS_PBLK_INITIALIZER, \ .buf_type = -1, \ .pool = NULL, \ .size = 0, \ } \ BLIS_INLINE void bli_mem_clear( mem_t* mem ) { bli_mem_set_buffer( NULL, mem ); #ifdef __cplusplus const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE; // When using C++, which is strongly typed, we avoid use of -1 as a // packbuf_t value since it will result in a compile-time error. bli_mem_set_buf_type( pb, mem ); #else bli_mem_set_buf_type( ( packbuf_t )-1, mem ); #endif bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); } #endif // end bli_mem.h // begin bli_part.h // begin bli_part_check.h void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); // end bli_part_check.h // -- Matrix partitioning ------------------------------------------------------ BLIS_EXPORT_BLIS void bli_acquire_mpart ( dim_t i, dim_t j, dim_t m, dim_t n, obj_t* obj, obj_t* sub_obj ); #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) GENPROT( acquire_mpart_b2t ) GENPROT( acquire_mpart_l2r ) GENPROT( acquire_mpart_r2l ) GENPROT( acquire_mpart_tl2br ) GENPROT( 
acquire_mpart_br2tl ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ dir_t direct, \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_mdim ) GENPROT( acquire_mpart_ndim ) GENPROT( acquire_mpart_mndim ) // -- Vector partitioning ------------------------------------------------------ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_vpart_f2b ) GENPROT( acquire_vpart_b2f ) // -- Scalar acquisition ------------------------------------------------------- BLIS_EXPORT_BLIS void bli_acquire_mij ( dim_t i, dim_t j, obj_t* obj, obj_t* sub_obj ); BLIS_EXPORT_BLIS void bli_acquire_vi ( dim_t i, obj_t* obj, obj_t* sub_obj ); // end bli_part.h // begin bli_prune.h void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ); // end bli_prune.h // begin bli_query.h BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a ); // end bli_query.h // begin bli_auxinfo.h #ifndef BLIS_AUXINFO_MACRO_DEFS_H #define BLIS_AUXINFO_MACRO_DEFS_H // auxinfo_t field query BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) { return ai->schema_a; } BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) { return ai->schema_b; } BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) { return ai->a_next; } BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) { return ai->b_next; } BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) { return ai->is_a; } BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) { return ai->is_b; } BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) { return ai->ps_a; } BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) { return ai->ps_b; } BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) { 
return ai->ukr; } BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) { return ai->params; } // auxinfo_t field modification BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai ) { ai->schema_a = schema; } BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) { ai->schema_b = schema; } BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) { ai->a_next = p; } BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) { ai->b_next = p; } BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; } BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai ) { ai->is_a = is; } BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) { ai->is_b = is; } BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai ) { ai->ps_a = ps; } BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) { ai->ps_b = ps; } BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { ai->ukr = ukr; } BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) { ai->params = params; } #endif // end bli_auxinfo.h // begin bli_param_map.h // --- BLIS to BLAS/LAPACK mappings -------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval ); // --- BLAS/LAPACK to BLIS mappings -------------------------------------------- // NOTE: These static functions were converted from regular functions in order // to reduce function call overhead within the BLAS compatibility layer. 
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( 
subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); err_t bli_check_valid_cntl( void* cntl ); err_t bli_check_packm_schema_on_unpack( obj_t* a ); err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); // end bli_check.h // begin bli_error.h BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); void bli_print_msg( char* str, char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); char* bli_error_string_for_code( gint_t code ); // end bli_error.h // begin bli_f2c.h // f2c.h -- Standard Fortran to C header file // barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." // - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) #ifndef BLIS_F2C_H #define BLIS_F2C_H typedef f77_int bla_integer; typedef f77_char bla_character; //typedef char *address; //typedef short int shortint; typedef float bla_real; typedef double bla_double; typedef scomplex bla_scomplex; typedef dcomplex bla_dcomplex; typedef f77_int bla_logical; //typedef short int shortlogical; //typedef char logical1; //typedef char integer1; #ifdef INTEGER_STAR_8 // Adjust for integer*8. 
typedef long long longint; // system-dependent typedef unsigned long long ulongint; // system-dependent #define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) #define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) #endif #ifndef TRUE_ #define TRUE_ (1) #endif #ifndef FALSE_ #define FALSE_ (0) #endif // Extern is for use with -E #ifndef Extern #define Extern extern #endif // I/O stuff #ifdef f2c_i2 // for -i2 //typedef short flag; //typedef short ftnlen; typedef bla_integer ftnlen; //typedef short ftnint; #else //typedef long int flag; //typedef long int ftnlen; typedef bla_integer ftnlen; //typedef long int ftnint; #endif #ifndef VOID #define VOID void #endif #ifndef f2c_abs #define f2c_abs(x) ((x) >= 0 ? (x) : -(x)) #endif #ifndef f2c_dabs #define f2c_dabs(x) (doublereal)f2c_abs(x) #endif #ifndef f2c_min #define f2c_min(a,b) ((a) <= (b) ? (a) : (b)) #endif #ifndef f2c_max #define f2c_max(a,b) ((a) >= (b) ? (a) : (b)) #endif #ifndef f2c_dmin #define f2c_dmin(a,b) (doublereal)f2c_min(a,b) #endif #ifndef f2c_dmax #define f2c_dmax(a,b) (doublereal)f2c_max(a,b) #endif #ifndef bit_test #define bit_test(a,b) ((a) >> (b) & 1) #endif #ifndef bit_clear #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) #endif #ifndef bit_set #define bit_set(a,b) ((a) | ((uinteger)1 << (b))) #endif // undef any lower-case symbols that your C compiler predefines, e.g.: #ifndef Skip_f2c_Undefs #undef cray #undef gcos #undef mc68010 #undef mc68020 #undef mips #undef pdp11 #undef sgi #undef sparc #undef sun #undef sun2 #undef sun3 #undef sun4 #undef u370 #undef u3b #undef u3b2 #undef u3b5 #undef unix #undef vax #endif #endif // end bli_f2c.h // begin bli_machval.h // begin bli_lsame.h bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len ); // end bli_lsame.h // begin bli_slamch.h bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len ); // end bli_slamch.h // begin bli_dlamch.h bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len ); // end 
bli_dlamch.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v ); // // Prototype BLAS-like interfaces. // #undef GENTPROTR #define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \ ( \ machval_t mval, \ void* v \ ); INSERT_GENTPROTR_BASIC0( machval ) // end bli_machval.h // begin bli_getopt.h typedef struct getopt_s { char* optarg; int optind; int opterr; int optopt; } getopt_t; BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state ); BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ); // end bli_getopt.h // begin bli_opid.h BLIS_INLINE bool bli_opid_is_level3( opid_t opid ) { return ( bool ) ( BLIS_GEMM <= opid && opid <= BLIS_TRSM ); } // end bli_opid.h // begin bli_cntl.h // -- Control tree prototypes -- BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, void* params, cntl_t* sub_node ); BLIS_EXPORT_BLIS void bli_cntl_free_node ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_clear_node ( cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_mark_family ( opid_t family, cntl_t* cntl ); // ----------------------------------------------------------------------------- dim_t bli_cntl_calc_num_threads_in ( rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- // cntl_t query (fields only) BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl ) { return cntl->family; } 
BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl ) { return cntl->bszid; } BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl ) { return cntl->var_func; } BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl ) { return cntl->sub_prenode; } BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl ) { return cntl->sub_node; } BLIS_INLINE void* bli_cntl_params( cntl_t* cntl ) { return cntl->params; } BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl ) { // The first 64 bytes is always the size of the params structure. return *( ( uint64_t* )(cntl->params) ); } BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) { return &(cntl->pack_mem); } // cntl_t query (complex) BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl ) { return ( bool ) ( cntl == NULL ); } BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl ) { return ( bool ) ( bli_cntl_sub_node( cntl ) == NULL ); } BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl ) { return ( bool ) ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); } // cntl_t modification BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl ) { cntl->family = family; } BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl ) { cntl->bszid = bszid; } BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl ) { cntl->var_func = var_func; } BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl ) { cntl->sub_prenode = sub_prenode; } BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl ) { cntl->sub_node = sub_node; } BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl ) { cntl->params = params; } BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) { cntl->pack_mem = *pack_mem; } // end bli_cntl.h // begin bli_env.h #ifndef BLIS_ENV_H #define BLIS_ENV_H gint_t bli_env_get_var( const char* env, gint_t fallback ); //void bli_env_set_var( const char* env, dim_t value ); #endif // end bli_env.h // begin bli_pack.h #ifndef BLIS_PACK_H #define BLIS_PACK_H 
void bli_pack_init( void ); void bli_pack_finalize( void ); BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a ); BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b ); BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a ); BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b ); void bli_pack_init_rntm_from_env( rntm_t* rntm ); #endif // end bli_pack.h // begin bli_info.h // -- General library information ---------------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_version_str( void ); BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void ); // -- General configuration-related -------------------------------------------- BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void ); 
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void ); // -- Kernel implementation-related -------------------------------------------- // -- Level-3 kernel definitions -- BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ); // -- BLIS implementation query (level-3) -------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt ); BLIS_EXPORT_BLIS 
char* bli_info_get_trmm3_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt ); // end bli_info.h // begin bli_arch.h #ifndef BLIS_ARCH_H #define BLIS_ARCH_H BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void ); void bli_arch_set_id_once( void ); void bli_arch_set_id( void ); BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id ); void bli_arch_set_logging( bool dolog ); bool bli_arch_get_logging( void ); void bli_arch_log( char*, ... ); #endif // end bli_arch.h // begin bli_cpuid.h #if 0 // Used only during standalone testing of ARM support. #define FALSE 0 #define TRUE 1 typedef enum { BLIS_ARCH_CORTEXA57 = 10, BLIS_ARCH_CORTEXA15 = 11, BLIS_ARCH_CORTEXA9 = 12, BLIS_ARCH_GENERIC = 13 } arch_t; typedef uint64_t bool; #define bli_abort abort #endif #ifndef BLIS_CPUID_H #define BLIS_CPUID_H arch_t bli_cpuid_query_id( void ); // Intel bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features ); // AMD bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features ); // ARM bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, 
uint32_t features ); bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features ); uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features ); // ----------------------------------------------------------------------------- // // This section of the file was based off of cpuid.hpp from TBLIS [1]. // // [1] https://github.com/devinamatthews/tblis // BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want ) { return ( have & want ) == want; } // ----------------------------------------------------------------------------- #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) // cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393 // for more information why this move was made. 
//#include "cpuid.h" void get_cpu_name( char *cpu_name ); int vpu_count( void ); enum { VENDOR_INTEL = 0, VENDOR_AMD, VENDOR_UNKNOWN }; enum { FEATURE_SSE3 = 0x0001, FEATURE_SSSE3 = 0x0002, FEATURE_SSE41 = 0x0004, FEATURE_SSE42 = 0x0008, FEATURE_AVX = 0x0010, FEATURE_AVX2 = 0x0020, FEATURE_FMA3 = 0x0040, FEATURE_FMA4 = 0x0080, FEATURE_AVX512F = 0x0100, FEATURE_AVX512DQ = 0x0200, FEATURE_AVX512PF = 0x0400, FEATURE_AVX512ER = 0x0800, FEATURE_AVX512CD = 0x1000, FEATURE_AVX512BW = 0x2000, FEATURE_AVX512VL = 0x4000 }; #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ); enum { VENDOR_ARM = 0, VENDOR_UNKNOWN }; enum { MODEL_ARMV7 = 0, MODEL_ARMV8, MODEL_UNKNOWN }; enum { FEATURE_NEON = 0x01, FEATURE_SVE = 0x02 }; #endif #endif // end bli_cpuid.h // begin bli_string.h void bli_string_mkupper( char* s ); // end bli_string.h // begin bli_setgetijm.h BLIS_EXPORT_BLIS err_t bli_setijm ( double ar, double ai, dim_t i, dim_t j, obj_t* b ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs \ ); INSERT_GENTPROT_BASIC0( setijm ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijm ( dim_t i, dim_t j, obj_t* b, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijm ) // end bli_setgetijm.h // begin bli_setgetijv.h BLIS_EXPORT_BLIS err_t bli_setijv ( double ar, double ai, dim_t i, obj_t* x ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ void* restrict x, inc_t incx \ ); INSERT_GENTPROT_BASIC0( 
setijv ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijv ( dim_t i, obj_t* x, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ void* restrict b, inc_t incx, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijv ) // end bli_setgetijv.h // begin bli_setri.h // -- setr --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setrm ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setrv ( obj_t* alpha, obj_t* x ); // -- seti --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setim ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setiv ( obj_t* alpha, obj_t* x ); // end bli_setri.h // begin bli_castm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castm ) INSERT_GENTPROT2_MIXDP0( castm ) // // Prototype object-based _check() function. // void bli_castm_check ( obj_t* a, obj_t* b ); // end bli_castm.h // begin bli_castnzm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castnzm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. 
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( addsc ) INSERT_GENTPROT_BASIC0( divsc ) INSERT_GENTPROT_BASIC0( mulsc ) INSERT_GENTPROT_BASIC0( subsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( invertsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTPROTR_BASIC0( absqsc ) INSERT_GENTPROTR_BASIC0( normfsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( sqrtsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTPROT_BASIC0( getsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( setsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTPROTR_BASIC0( unzipsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype_r* zeta_r, \ ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTPROTR_BASIC0( zipsc ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_igetsc ( dim_t* chi, double* zeta_r, double* zeta_i ); BLIS_EXPORT_BLIS void bli_isetsc ( double zeta_r, double zeta_i, dim_t* chi ); // end bli_l0_tapi.h // begin bli_l0_ft.h // // -- Level-0 function types 
--------------------------------------------------- // // addsc, divsc, subsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( addsc ) INSERT_GENTDEF( divsc ) INSERT_GENTDEF( subsc ) // invertsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTDEF( invertsc ) // mulsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( mulsc ) // absqsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTDEFR( absqsc ) // normfsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* norm \ ); INSERT_GENTDEFR( normfsc ) // sqrtsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( sqrtsc ) // getsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTDEF( getsc ) // setsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTDEF( setsc ) // unzipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTDEFR( unzipsc ) // zipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype_r* zeta_r, \ 
ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTDEFR( zipsc ) // end bli_l0_ft.h // Generate function pointer arrays for tapi functions. // begin bli_l0_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( absqsc ) GENPROT( normfsc ) GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( subsc ) GENPROT( invertsc ) GENPROT( sqrtsc ) GENPROT( unzipsc ) GENPROT( zipsc ) GENPROT( getsc ) GENPROT( setsc ) // end bli_l0_fpa.h // copysc // begin bli_copysc.h // // Prototype object-based interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENFRONT( copysc ) // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \ ( \ conj_t conjchi, \ void* chi, \ void* psi \ ); INSERT_GENTPROT2_BASIC0( copysc ) INSERT_GENTPROT2_MIX_D0( copysc ) INSERT_GENTPROT2_MIX_P0( copysc ) // end bli_copysc.h // end bli_l0.h // -- Level-1v operations -- // begin bli_l1v.h // begin bli_l1v_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h" // begin bli_l1v_ft_ker.h #ifndef BLIS_L1V_FT_KER_H #define BLIS_L1V_FT_KER_H // // -- Level-1v kernel function types ------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict index, \ cntx_t* cntx \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ cntx_t* cntx \ ); 
INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( xpbyv ) #endif // end bli_l1v_ft_ker.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1v_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyv ) // end 
bli_l1v_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1v_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyv ) // end bli_l1v_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed 
APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets the macro be appended directly
// after the last regular parameter of a template.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h

// begin bli_l1d_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Each GENTPROT( ctype, ch, opname ) template declares a BLIS_EXPORT_BLIS
// void function named by PASTEMAC2(ch,opname,EX_SUF); its parameter list is
// followed by BLIS_TAPI_EX_PARAMS. The INSERT_GENTPROT*_BASIC0 macros are
// defined outside this section; presumably they invoke the template once per
// supported datatype -- confirm against their definition.
//

// addd, copyd, subd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      doff_t diagoffx, \
      diag_t diagx, \
      trans_t transx, \
      dim_t m, \
      dim_t n, \
      ctype* x, inc_t rs_x, inc_t cs_x, \
      ctype* y, inc_t rs_y, inc_t cs_y \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d: as above plus a scalar alpha.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      doff_t diagoffx, \
      diag_t diagx, \
      trans_t transx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x, \
      ctype* y, inc_t rs_y, inc_t cs_y \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd: single operand x.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd: the only level-1d typed prototypes that take conjalpha.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      conj_t conjalpha, \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid: uses the GENTPROTR template; alpha is typed ctype_r* while x stays
// ctype*.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype_r* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd: scalar beta sits between operands x and y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      doff_t diagoffx, \
      diag_t diagx, \
      trans_t transx, \
      dim_t m, \
      dim_t n, \
      ctype* x, inc_t rs_x, inc_t cs_x, \
      ctype* beta, \
      ctype* y, inc_t rs_y, inc_t cs_y \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h

// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//
// Each GENTDEF template below typedefs a pointer-to-function type named by
// PASTECH3(ch,opname,EX_SUF,tsuf), with the same parameter list as the
// matching prototype template above.
//

// addd, copyd, subd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      doff_t diagoffx, \
      diag_t diagx, \
      trans_t transx, \
      dim_t m, \
      dim_t n, \
      ctype* x, inc_t rs_x, inc_t cs_x, \
      ctype* y, inc_t rs_y, inc_t cs_y \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      doff_t diagoffx, \
      diag_t diagx, \
      trans_t transx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x, \
      ctype* y, inc_t rs_y, inc_t cs_y \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( invertd )

// scald, setd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      conj_t conjalpha, \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype_r* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEFR( setid )

// shiftd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( shiftd )

// xpbyd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      doff_t diagoffx, \
      diag_t diagx, \
      trans_t transx, \
      dim_t m, \
      dim_t n, \
      ctype* x, inc_t rs_x, inc_t cs_x, \
      ctype* beta, \
      ctype* y, inc_t rs_y, inc_t cs_y \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//
// Each GENTPROT( ctype, ch, opname ) template declares a BLIS_EXPORT_BLIS
// void function named by PASTEMAC2(ch,opname,EX_SUF); its parameter list is
// followed by BLIS_TAPI_EX_PARAMS. The INSERT_GENTPROT*_BASIC0 macros are
// defined outside this section; presumably they invoke the template once per
// supported datatype -- confirm against their definition.
//

// addd, copyd, subd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      doff_t diagoffx, \
      diag_t diagx, \
      trans_t transx, \
      dim_t m, \
      dim_t n, \
      ctype* x, inc_t rs_x, inc_t cs_x, \
      ctype* y, inc_t rs_y, inc_t cs_y \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d: as above plus a scalar alpha.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      doff_t diagoffx, \
      diag_t diagx, \
      trans_t transx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x, \
      ctype* y, inc_t rs_y, inc_t cs_y \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd: single operand x.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd: the only level-1d typed prototypes that take conjalpha.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      conj_t conjalpha, \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid: uses the GENTPROTR template; alpha is typed ctype_r* while x stays
// ctype*.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype_r* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd: scalar beta sits between operands x and y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      doff_t diagoffx, \
      diag_t diagx, \
      trans_t transx, \
      dim_t m, \
      dim_t n, \
      ctype* x, inc_t rs_x, inc_t cs_x, \
      ctype* beta, \
      ctype* y, inc_t rs_y, inc_t cs_y \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h

// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//
// Each GENTDEF template below typedefs a pointer-to-function type named by
// PASTECH3(ch,opname,EX_SUF,tsuf), with the same parameter list as the
// matching prototype template above.
//

// addd, copyd, subd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      doff_t diagoffx, \
      diag_t diagx, \
      trans_t transx, \
      dim_t m, \
      dim_t n, \
      ctype* x, inc_t rs_x, inc_t cs_x, \
      ctype* y, inc_t rs_y, inc_t cs_y \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      doff_t diagoffx, \
      diag_t diagx, \
      trans_t transx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x, \
      ctype* y, inc_t rs_y, inc_t cs_y \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( invertd )

// scald, setd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      conj_t conjalpha, \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype_r* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEFR( setid )

// shiftd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      doff_t diagoffx, \
      dim_t m, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t rs_x, inc_t cs_x \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( shiftd )

// xpbyd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      doff_t diagoffx, \
      diag_t diagx, \
      trans_t transx, \
      dim_t m, \
      dim_t n, \
      ctype* x, inc_t rs_x, inc_t cs_x, \
      ctype* beta, \
      ctype* y, inc_t rs_y, inc_t cs_y \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1d_fpa.h

//
// Prototype function pointer query interface.
//
// GENPROT( opname ) declares a function named by
// PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp) that takes a num_t dt and whose
// return type is named by PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft). The suffix
// is BLIS_TAPI_EX_SUF itself rather than EX_SUF, which is un-defined at this
// point.
//

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addd )
GENPROT( copyd )
GENPROT( subd )
GENPROT( axpyd )
GENPROT( scal2d )
GENPROT( invertd )
GENPROT( scald )
GENPROT( setd )
GENPROT( setid )
GENPROT( shiftd )
GENPROT( xpbyd )

// end bli_l1d_fpa.h

// end bli_l1d.h

// -- Level-1f operations --

// begin bli_l1f.h

// begin bli_l1f_check.h

//
// Prototype object-based check functions.
//
// Each GENTPROT( opname ) below declares a void function named by
// PASTEMAC(opname,_check) taking only obj_t* operands. These prototypes carry
// no BLIS_EXPORT_BLIS qualifier and no expert parameters.
//

// axpy2v
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
    ( \
      obj_t* alphax, \
      obj_t* alphay, \
      obj_t* x, \
      obj_t* y, \
      obj_t* z \
    );

GENTPROT( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
    ( \
      obj_t* alpha, \
      obj_t* a, \
      obj_t* x, \
      obj_t* y \
    );

GENTPROT( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
    ( \
      obj_t* alpha, \
      obj_t* xt, \
      obj_t* x, \
      obj_t* y, \
      obj_t* rho, \
      obj_t* z \
    );

GENTPROT( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
    ( \
      obj_t* alpha, \
      obj_t* at, \
      obj_t* a, \
      obj_t* w, \
      obj_t* x, \
      obj_t* beta, \
      obj_t* y, \
      obj_t* z \
    );

GENTPROT( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
    ( \
      obj_t* alpha, \
      obj_t* a, \
      obj_t* x, \
      obj_t* beta, \
      obj_t* y \
    );

GENTPROT( dotxf )

// end bli_l1f_check.h

// Define kernel function types.
// begin bli_l1f_ft_ker.h

#ifndef BLIS_L1F_FT_KER_H
#define BLIS_L1F_FT_KER_H

//
// -- Level-1f kernel function types -------------------------------------------
//
// Each GENTDEF template typedefs a pointer-to-function type named by
// PASTECH3(ch,opname,_ker,tsuf). Unlike the typed-API prototypes, every
// pointer operand is restrict-qualified and the list ends with a single
// cntx_t* cntx (no rntm_t*). The include guard is needed because these
// names do not vary with EX_SUF.
//

// axpy2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
    ( \
      conj_t conjx, \
      conj_t conjy, \
      dim_t n, \
      ctype* restrict alpha1, \
      ctype* restrict alpha2, \
      ctype* restrict x, inc_t incx, \
      ctype* restrict y, inc_t incy, \
      ctype* restrict z, inc_t incz, \
      cntx_t* cntx \
    );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
    ( \
      conj_t conja, \
      conj_t conjx, \
      dim_t m, \
      dim_t b_n, \
      ctype* restrict alpha, \
      ctype* restrict a, inc_t inca, inc_t lda, \
      ctype* restrict x, inc_t incx, \
      ctype* restrict y, inc_t incy, \
      cntx_t* cntx \
    );

INSERT_GENTDEF( axpyf )

// dotaxpyv
// NOTE(review): the length parameter is named m here; parameter names in a
// function-pointer typedef do not affect the resulting type.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
    ( \
      conj_t conjxt, \
      conj_t conjx, \
      conj_t conjy, \
      dim_t m, \
      ctype* restrict alpha, \
      ctype* restrict x, inc_t incx, \
      ctype* restrict y, inc_t incy, \
      ctype* restrict rho, \
      ctype* restrict z, inc_t incz, \
      cntx_t* cntx \
    );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
    ( \
      conj_t conjat, \
      conj_t conjx, \
      dim_t m, \
      dim_t b_n, \
      ctype* restrict alpha, \
      ctype* restrict a, inc_t inca, inc_t lda, \
      ctype* restrict x, inc_t incx, \
      ctype* restrict beta, \
      ctype* restrict y, inc_t incy, \
      cntx_t* cntx \
    );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
    ( \
      conj_t conjat, \
      conj_t conja, \
      conj_t conjw, \
      conj_t conjx, \
      dim_t m, \
      dim_t b_n, \
      ctype* restrict alpha, \
      ctype* restrict a, inc_t inca, inc_t lda, \
      ctype* restrict w, inc_t incw, \
      ctype* restrict x, inc_t incx, \
      ctype* restrict beta, \
      ctype* restrict y, inc_t incy, \
      ctype* restrict z, inc_t incz, \
      cntx_t* cntx \
    );

INSERT_GENTDEF( dotxaxpyf )

#endif

// end bli_l1f_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets the macro be appended directly
// after the last regular parameter of a template.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). 
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets the macro be appended directly
// after the last regular parameter of a template.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h

// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Each GENTPROT( ctype, ch, opname ) template declares a BLIS_EXPORT_BLIS
// void function named by PASTEMAC2(ch,opname,EX_SUF); its parameter list is
// followed by BLIS_TAPI_EX_PARAMS. INSERT_GENTPROT_BASIC0 is defined outside
// this section; presumably it invokes the template once per supported
// datatype -- confirm against its definition.
//

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      conj_t conjx, \
      conj_t conjy, \
      dim_t n, \
      ctype* alphax, \
      ctype* alphay, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* z, inc_t incz \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      conj_t conja, \
      conj_t conjx, \
      dim_t m, \
      dim_t b_n, \
      ctype* alpha, \
      ctype* a, inc_t inca, inc_t lda, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      conj_t conjxt, \
      conj_t conjx, \
      conj_t conjy, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* rho, \
      ctype* z, inc_t incz \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      conj_t conjat, \
      conj_t conja, \
      conj_t conjw, \
      conj_t conjx, \
      dim_t m, \
      dim_t b_n, \
      ctype* alpha, \
      ctype* a, inc_t inca, inc_t lda, \
      ctype* w, inc_t incw, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy, \
      ctype* z, inc_t incz \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      conj_t conjat, \
      conj_t conjx, \
      dim_t m, \
      dim_t b_n, \
      ctype* alpha, \
      ctype* a, inc_t inca, inc_t lda, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h

// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// Each GENTDEF template below typedefs a pointer-to-function type named by
// PASTECH3(ch,opname,EX_SUF,tsuf). The parameter types and order match the
// prototype templates above; only some parameter names differ, which has no
// effect on the resulting function-pointer type.
//

// axpy2v
// NOTE(review): the scalars are named alpha1/alpha2 here but alphax/alphay
// in the axpy2v prototype template above.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      conj_t conjx, \
      conj_t conjy, \
      dim_t n, \
      ctype* alpha1, \
      ctype* alpha2, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* z, inc_t incz \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      conj_t conja, \
      conj_t conjx, \
      dim_t m, \
      dim_t b_n, \
      ctype* alpha, \
      ctype* a, inc_t inca, inc_t lda, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( axpyf )

// dotaxpyv
// NOTE(review): the length is named m here but n in the dotaxpyv prototype
// template above.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      conj_t conjxt, \
      conj_t conjx, \
      conj_t conjy, \
      dim_t m, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* rho, \
      ctype* z, inc_t incz \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      conj_t conjat, \
      conj_t conjx, \
      dim_t m, \
      dim_t b_n, \
      ctype* alpha, \
      ctype* a, inc_t inca, inc_t lda, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      conj_t conjat, \
      conj_t conja, \
      conj_t conjw, \
      conj_t conjx, \
      dim_t m, \
      dim_t b_n, \
      ctype* alpha, \
      ctype* a, inc_t inca, inc_t lda, \
      ctype* w, inc_t incw, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy, \
      ctype* z, inc_t incz \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;

// end bli_tapi_ba.h

// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Each GENTPROT( ctype, ch, opname ) template declares a BLIS_EXPORT_BLIS
// void function named by PASTEMAC2(ch,opname,EX_SUF); its parameter list is
// followed by BLIS_TAPI_EX_PARAMS, which is defined empty just above.
// INSERT_GENTPROT_BASIC0 is defined outside this section; presumably it
// invokes the template once per supported datatype -- confirm against its
// definition.
//

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      conj_t conjx, \
      conj_t conjy, \
      dim_t n, \
      ctype* alphax, \
      ctype* alphay, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* z, inc_t incz \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      conj_t conja, \
      conj_t conjx, \
      dim_t m, \
      dim_t b_n, \
      ctype* alpha, \
      ctype* a, inc_t inca, inc_t lda, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      conj_t conjxt, \
      conj_t conjx, \
      conj_t conjy, \
      dim_t n, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* rho, \
      ctype* z, inc_t incz \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      conj_t conjat, \
      conj_t conja, \
      conj_t conjw, \
      conj_t conjx, \
      dim_t m, \
      dim_t b_n, \
      ctype* alpha, \
      ctype* a, inc_t inca, inc_t lda, \
      ctype* w, inc_t incw, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy, \
      ctype* z, inc_t incz \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
    ( \
      conj_t conjat, \
      conj_t conjx, \
      dim_t m, \
      dim_t b_n, \
      ctype* alpha, \
      ctype* a, inc_t inca, inc_t lda, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h

// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// Each GENTDEF template below typedefs a pointer-to-function type named by
// PASTECH3(ch,opname,EX_SUF,tsuf). The parameter types and order match the
// prototype templates above; only some parameter names differ, which has no
// effect on the resulting function-pointer type.
//

// axpy2v
// NOTE(review): the scalars are named alpha1/alpha2 here but alphax/alphay
// in the axpy2v prototype template above.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      conj_t conjx, \
      conj_t conjy, \
      dim_t n, \
      ctype* alpha1, \
      ctype* alpha2, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* z, inc_t incz \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      conj_t conja, \
      conj_t conjx, \
      dim_t m, \
      dim_t b_n, \
      ctype* alpha, \
      ctype* a, inc_t inca, inc_t lda, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( axpyf )

// dotaxpyv
// NOTE(review): the length is named m here but n in the dotaxpyv prototype
// template above.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      conj_t conjxt, \
      conj_t conjx, \
      conj_t conjy, \
      dim_t m, \
      ctype* alpha, \
      ctype* x, inc_t incx, \
      ctype* y, inc_t incy, \
      ctype* rho, \
      ctype* z, inc_t incz \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      conj_t conjat, \
      conj_t conjx, \
      dim_t m, \
      dim_t b_n, \
      ctype* alpha, \
      ctype* a, inc_t inca, inc_t lda, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
    ( \
      conj_t conjat, \
      conj_t conja, \
      conj_t conjw, \
      conj_t conjx, \
      dim_t m, \
      dim_t b_n, \
      ctype* alpha, \
      ctype* a, inc_t inca, inc_t lda, \
      ctype* w, inc_t incw, \
      ctype* x, inc_t incx, \
      ctype* beta, \
      ctype* y, inc_t incy, \
      ctype* z, inc_t incz \
      BLIS_TAPI_EX_PARAMS \
    );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1f_fpa.h

//
// Prototype function pointer query interface.
//
// GENPROT( opname ) declares a function named by
// PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp) that takes a num_t dt and whose
// return type is named by PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft). The suffix
// is BLIS_TAPI_EX_SUF itself rather than EX_SUF, which is un-defined at this
// point.
//

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( axpy2v )
GENPROT( axpyf )
GENPROT( dotaxpyv )
GENPROT( dotxaxpyf )
GENPROT( dotxf )

// end bli_l1f_fpa.h

// end bli_l1f.h

// -- Level-1m operations --

// begin bli_l1m.h

// begin bli_l1m_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The expansion starts with a comma so that it can be
// appended directly after the last regular parameter.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h

// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Expert instantiation: each GENTPROT declares an exported function named by
// PASTEMAC2(ch,opname,EX_SUF) whose parameter list ends with the
// BLIS_TAPI_EX_PARAMS expansion (cntx_t* cntx, rntm_t* rntm).
// NOTE(review): the INSERT_GENTPROT* macros are defined outside this section;
// presumably they instantiate GENTPROT/GENTPROT2 per datatype -- confirm.

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm (single matrix operand x; conjalpha instead of transx)
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type variant; x uses ctype_x while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//

// Function-pointer typedefs matching the typed prototypes above; the typedef
// name is built by PASTECH3 from (ch, opname, EX_SUF, tsuf).

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym, xpbym_md
// NOTE(review): xpbym_md is instantiated with the same single-ctype
// signature as xpbym here, unlike its two-type prototype above -- confirm
// this is intended.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )

// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions). EX_SUF is defined as empty here.
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes. BLIS_TAPI_EX_PARAMS is defined as empty here.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;

// end bli_tapi_ba.h

// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Basic instantiation: EX_SUF and BLIS_TAPI_EX_PARAMS are empty at this
// point, so the same templates now declare the un-suffixed functions without
// the trailing cntx/rntm parameters.

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type variant; x uses ctype_x while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//

// Basic instantiation of the function-pointer typedefs (EX_SUF and
// BLIS_TAPI_EX_PARAMS are empty at this point).

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym, xpbym_md

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )

// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1m_fpa.h

//
// Prototype function pointer query interface.
//

// Single-datatype query: a function named by
// PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp) taking one num_t and returning the
// type named by PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft).
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
GENPROT( axpym )
GENPROT( scal2m )
GENPROT( scalm )
GENPROT( setm )
GENPROT( xpbym )

// Two-datatype query (_qfp2): takes one num_t for x and one for y.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );

GENPROT( xpbym_md )

// end bli_l1m_fpa.h

// Prototype level-1m implementations.
// begin bli_l1m_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Prototypes for the _unb_var1 implementations. Unlike the typed API
// prototypes, these are not marked BLIS_EXPORT_BLIS and always take explicit
// cntx_t* and rntm_t* parameters (no BLIS_TAPI_EX_PARAMS indirection).

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type variant; x uses ctype_x while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
void PASTEMAC3(chx,chy,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_unb_var1.h

// Pack-related
// begin bli_packm.h

// begin bli_packm_alloc.h

// Exported allocation prototypes. Both return void*; the _ex form takes an
// additional packbuf_t pack_buf_type argument.
BLIS_EXPORT_BLIS void* bli_packm_alloc
     (
       siz_t size_needed,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void* bli_packm_alloc_ex
     (
       siz_t size_needed,
       packbuf_t pack_buf_type,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_packm_alloc.h
// begin bli_packm_cntl.h

// Parameter record for packm control-tree nodes; the accessors below read it
// through the params member of a cntl_t.
struct packm_params_s
{
	uint64_t size; // size field must be present and come first.
	bszid_t bmid_m;
	bszid_t bmid_n;
	bool does_invert_diag;
	bool rev_iter_if_upper;
	bool rev_iter_if_lower;
	pack_t pack_schema;
	packbuf_t pack_buf_type;
};
typedef struct packm_params_s packm_params_t;

// Field accessors: each casts cntl->params to packm_params_t* and returns the
// field named in the function name. None of them checks cntl or cntl->params
// for NULL.

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m;
}

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n;
}

BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower;
}

BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema;
}

BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type;
}

// -----------------------------------------------------------------------------

// Node constructor prototype. The bmid_*/does_invert_diag/rev_iter_*/pack_*
// parameter names mirror the fields of packm_params_t.
cntl_t* bli_packm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       bszid_t bmid_m,
       bszid_t bmid_n,
       bool does_invert_diag,
       bool rev_iter_if_upper,
       bool rev_iter_if_lower,
       pack_t pack_schema,
       packbuf_t pack_buf_type,
       cntl_t* sub_node
     );

// end bli_packm_cntl.h
// begin bli_packm_check.h

void bli_packm_init_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

void bli_packm_int_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

// end bli_packm_check.h
// begin bli_packm_init.h

// Exported; returns bool (the meaning of the result is not stated here).
BLIS_EXPORT_BLIS bool bli_packm_init
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_packm_init.h
// begin bli_packm_int.h

// Same parameter list as the packm _var_oft variant type
// (a, p, cntx, rntm, cntl, thread).
void bli_packm_int
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_int.h
// begin bli_packm_scalar.h

BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p );

// end bli_packm_scalar.h

// begin bli_packm_part.h

// -- Matrix partitioning ------------------------------------------------------

// The three acquire functions share one shape: a requested subpart, an offset
// (i, j or ij), a size b, the source obj and the destination sub_obj.
void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
                                  dim_t i,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
                                  dim_t j,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_tl2br( subpart_t requested_part,
                                    dim_t ij,
                                    dim_t b,
                                    obj_t* obj,
                                    obj_t* sub_obj );

dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p );

// end bli_packm_part.h

// begin bli_packm_struc_cxk.h

// Structure-aware packing prototypes, named by PASTEMAC(ch,varname).
// NOTE(review): this parameter list ends at cntx_t* cntx, whereas the _1er
// prototypes below also take a trailing void* params -- confirm the
// difference is intended.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_struc_cxk )
INSERT_GENTPROT_BASIC0( packm_herm_cxk )
INSERT_GENTPROT_BASIC0( packm_tri_cxk )

// end bli_packm_struc_cxk.h
// begin bli_packm_struc_cxk_1er.h

// 1er structure-aware packing prototypes. GENTPROTCO receives ctype_r and chr
// in addition to ctype and ch, but only ctype and ch appear in the expansion.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er )

// end bli_packm_struc_cxk_1er.h

// begin bli_packm_cxk.h

// packm_cxk prototype. Parameter order here is panel_dim, panel_dim_max,
// panel_len, panel_len_max.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_cxk )

// end bli_packm_cxk.h
// begin bli_packm_cxk_1er.h

// packm_cxk_1er prototype; same parameter list as packm_cxk above.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROTCO_BASIC0( packm_cxk_1er )

// end bli_packm_cxk_1er.h

// Mixed datatype support.
#ifdef BLIS_ENABLE_GEMM_MD
// begin bli_packm_struc_cxk_md.h

// Mixed-datatype structure-aware packing prototype, compiled only when
// BLIS_ENABLE_GEMM_MD is defined. kappa and p use ctype_p; c uses ctype_c.
#undef GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype_p* restrict kappa, \
       ctype_c* restrict c, inc_t incc, inc_t ldc, \
       ctype_p* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )

// Mixed-datatype 1e/1r packing prototypes. kappa and p use ctype_p; a uses
// ctype_a. These take no cntx parameter.
#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t conja, \
       dim_t m, \
       dim_t n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p, inc_t ldp \
     );

INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )

INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )

// end bli_packm_struc_cxk_md.h
#endif
// begin bli_packm_blk_var1.h

//
// packm params types.
//

// Table of packm kernel function pointers with BLIS_NUM_FP_TYPES entries in
// each of its two dimensions.
typedef struct
{
	// Type of C  Type of P
	packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;

//
// Prototype object-based interfaces.
//

// Same parameter types as the packm _var_oft variant type; the operands are
// named c and p here and the thrinfo_t* is named t.
BLIS_EXPORT_BLIS void bli_packm_blk_var1
     (
       obj_t* c,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* t
     );

// end bli_packm_blk_var1.h

// end bli_packm.h
// begin bli_unpackm.h

// begin bli_unpackm_cntl.h

// Parameter record for unpackm control-tree nodes: the size field followed by
// the variant function pointer.
struct unpackm_params_s
{
	uint64_t size; // size field must be present and come first.
	unpackm_var_oft var_func;
};
typedef struct unpackm_params_s unpackm_params_t;

// Expands to the var_func field of (cntl)->params viewed as an
// unpackm_params_t*. The cast binds to the whole (cntl)->params expression,
// since -> has higher precedence than a cast.
#define bli_cntl_unpackm_params_var_func( cntl ) \
\
	( ( (unpackm_params_t*)(cntl)->params )->var_func )

// -----------------------------------------------------------------------------

cntl_t* bli_unpackm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       void_fp unpackm_var_func,
       cntl_t* sub_node
     );

// end bli_unpackm_cntl.h
// begin bli_unpackm_check.h

void bli_unpackm_int_check
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx
     );

// end bli_unpackm_check.h
// begin bli_unpackm_int.h

// Same parameter list as the unpackm _var_oft variant type
// (p, a, cntx, cntl, thread); no rntm_t* parameter.
void bli_unpackm_int
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_unpackm_int.h

// begin bli_unpackm_blk_var1.h

// Object-based prototype.
void bli_unpackm_blk_var1
     (
       obj_t* p,
       obj_t* c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// Typed prototype. ctype is a macro parameter but does not appear in the
// expansion: the p and c buffers are passed as void*.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       doff_t diagoffc, \
       diag_t diagc, \
       uplo_t uploc, \
       trans_t transc, \
       dim_t m, \
       dim_t n, \
       dim_t m_panel, \
       dim_t n_panel, \
       void* p, inc_t rs_p, inc_t cs_p, \
       dim_t pd_p, inc_t ps_p, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )

// end bli_unpackm_blk_var1.h

// begin bli_unpackm_cxk.h

// unpackm_cxk prototype; p (with ldp) is listed before a (with inca/lda).
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conjp, \
       dim_t panel_dim, \
       dim_t panel_len, \
       ctype* kappa, \
       ctype* p, inc_t ldp, \
       ctype* a, inc_t inca, inc_t lda, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_cxk )

// end bli_unpackm_cxk.h
// end bli_unpackm.h

// end bli_l1m.h

// -- Level-2 operations --

// begin bli_l2.h

// begin bli_l2_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// This expansion of bli_l2_tapi.h follows bli_tapi_ex.h, so EX_SUF is
// BLIS_TAPI_EX_SUF and BLIS_TAPI_EX_PARAMS appends ",cntx_t* cntx,
// rntm_t* rntm" to every prototype generated below.
// NOTE(review): the INSERT_GENTPROT*_BASIC0 macros are defined outside this
// section; presumably each instantiates the template once per datatype.

// gemv
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       trans_t transa, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( gemv )


// ger
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( ger )


// hemv, symv
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       conj_t  conja, \
       conj_t  conjx, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )


// her: alpha is declared with ctype_r rather than ctype.
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t   uploa, \
       conj_t   conjx, \
       dim_t    m, \
       ctype_r* alpha, \
       ctype*   x, inc_t incx, \
       ctype*   a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( her )


// syr: same operand list as her, but alpha is declared with ctype.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t   uploa, \
       conj_t   conjx, \
       dim_t    m, \
       ctype*   alpha, \
       ctype*   x, inc_t incx, \
       ctype*   a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( syr )


// her2, syr2
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )


// trmv, trsv
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )

// end bli_l2_tapi.h
// begin bli_l2_ft.h


//
// -- Level-2 function types ---------------------------------------------------
//

// Function-pointer typedefs mirroring the typed prototypes above; each name
// is formed by PASTECH3(ch,opname,EX_SUF,tsuf) and each signature ends with
// BLIS_TAPI_EX_PARAMS.

// gemv
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       trans_t transa, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( gemv )

// ger
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( ger )

// hemv, symv
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       conj_t  conja, \
       conj_t  conjx, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her
#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t   uploa, \
       conj_t   conjx, \
       dim_t    m, \
       ctype_r* alpha, \
       ctype*   x, inc_t incx, \
       ctype*   a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( her )

// syr
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t   uploa, \
       conj_t   conjx, \
       dim_t    m, \
       ctype*   alpha, \
       ctype*   x, inc_t incx, \
       ctype*   a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( syr )

// her2, syr2
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )

// end bli_l2_ft.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_tapi_ba.h


// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS   cntx_t* cntx = NULL; ( void )cntx; \
                             rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l2_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
//

// In this expansion EX_SUF and BLIS_TAPI_EX_PARAMS are both defined empty
// (see the definitions just above), so the prototypes generated below carry
// no name suffix and no trailing cntx/rntm parameters.
// NOTE(review): the INSERT_GENTPROT*_BASIC0 macros are defined outside this
// section; presumably each instantiates the template once per datatype.

// gemv
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       trans_t transa, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( gemv )


// ger
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( ger )


// hemv, symv
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       conj_t  conja, \
       conj_t  conjx, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )


// her: alpha is declared with ctype_r rather than ctype.
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t   uploa, \
       conj_t   conjx, \
       dim_t    m, \
       ctype_r* alpha, \
       ctype*   x, inc_t incx, \
       ctype*   a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( her )


// syr: same operand list as her, but alpha is declared with ctype.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t   uploa, \
       conj_t   conjx, \
       dim_t    m, \
       ctype*   alpha, \
       ctype*   x, inc_t incx, \
       ctype*   a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( syr )


// her2, syr2
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )


// trmv, trsv
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )

// end bli_l2_tapi.h
// begin bli_l2_ft.h


//
// -- Level-2 function types ---------------------------------------------------
//

// Function-pointer typedefs mirroring the typed prototypes above; each name
// is formed by PASTECH3(ch,opname,EX_SUF,tsuf) and each signature ends with
// BLIS_TAPI_EX_PARAMS (empty in this expansion).

// gemv
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       trans_t transa, \
       conj_t  conjx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( gemv )

// ger
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       dim_t   n, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( ger )

// hemv, symv
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       conj_t  conja, \
       conj_t  conjx, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx, \
       ctype*  beta, \
       ctype*  y, inc_t incy  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her
#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t   uploa, \
       conj_t   conjx, \
       dim_t    m, \
       ctype_r* alpha, \
       ctype*   x, inc_t incx, \
       ctype*   a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( her )

// syr
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t   uploa, \
       conj_t   conjx, \
       dim_t    m, \
       ctype*   alpha, \
       ctype*   x, inc_t incx, \
       ctype*   a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( syr )

// her2, syr2
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       conj_t  conjx, \
       conj_t  conjy, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv
#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       trans_t transa, \
       diag_t  diaga, \
       dim_t   m, \
       ctype*  alpha, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )

// end bli_l2_ft.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
//

// cntl_use is an out-parameter (cntl_t**); cntl_orig is the caller-supplied
// tree. NOTE(review): by its name this presumably creates a tree only when
// cntl_orig is not usable as-is -- confirm against the implementation.
void bli_l3_cntl_create_if
     (
       opid_t family,
       pack_t schema_a,
       pack_t schema_b,
       obj_t* a,
       obj_t* b,
       obj_t* c,
       rntm_t* rntm,
       cntl_t* cntl_orig,
       cntl_t** cntl_use
     );

void bli_l3_cntl_free
     (
       rntm_t* rntm,
       cntl_t* cntl_use,
       thrinfo_t* thread
     );

// end bli_l3_cntl.h
// begin bli_l3_check.h

//
// Prototype object-based check functions.
//

// The four GENPROT templates below differ only in their parameter lists; each
// declares a function named from the operation name and the `_check` suffix.

// Two-operand operations with beta: (alpha, a, b, beta, c).
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )

// Sided operations with beta: (side, alpha, a, b, beta, c).
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )

// Rank-k updates: (alpha, a, beta, c).
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( herk )
GENPROT( syrk )

// Sided in-place operations without beta or c: (side, alpha, a, b).
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       cntx_t* cntx \
     );

GENPROT( trmm )
GENPROT( trsm )

// -----------------------------------------------------------------------------

void bli_gemm_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_gemmt_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_hemm_basic_check
     (
       side_t side,
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

// Note the extra `ah` operand compared with the herk check above.
void bli_herk_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

// Note the extra `bh` and `ah` operands compared with the her2k check above.
void bli_her2k_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* bh,
       obj_t* b,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_l3_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

// end bli_l3_check.h
// begin bli_l3_int.h

void bli_l3_int
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_int.h
// begin bli_l3_packab.h

// bli_l3_packa and bli_l3_packb share one parameter list, which is the same
// list as the l3 variant function type (l3_var_oft) declared further below.
void bli_l3_packa
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

void bli_l3_packb
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_packab.h

// Define function types.
//#include "bli_l3_ft_ex.h"
// begin bli_l3_ft_ukr.h


#ifndef BLIS_L3_FT_UKR_H
#define BLIS_L3_FT_UKR_H

//
// -- Level-3 micro-kernel function types --------------------------------------
//

// Each GENTDEF below defines a function-pointer typedef whose name is pasted
// together from the type character, the operation name, `_ukr` and a type
// suffix. All pointer parameters are restrict-qualified. INSERT_GENTDEF is
// defined outside this section.

// gemm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict beta, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemm )

// gemmtrsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a1x, \
       ctype* restrict a11, \
       ctype* restrict bx1, \
       ctype* restrict b11, \
       ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemmtrsm )

// trsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( trsm )

#endif

// end bli_l3_ft_ukr.h
// begin bli_l3_oft.h

#ifndef BLIS_L3_OFT_H
#define BLIS_L3_OFT_H

//
// -- Level-3 object function types --------------------------------------------
//

// Function-pointer typedefs named <opname>_oft. Their parameter lists match
// the expert object API prototypes (cntx and rntm trailing).

// gemm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* b, \
  obj_t* beta, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( gemm )
GENTDEF( gemmt )
GENTDEF( her2k )
GENTDEF( syr2k )

// hemm, symm, trmm3

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  side_t side, \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* b, \
  obj_t* beta, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( hemm )
GENTDEF( symm )
GENTDEF( trmm3 )

// herk, syrk

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* beta, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( herk )
GENTDEF( syrk )

// trmm, trsm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  side_t side, \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* b, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( trmm )
GENTDEF( trsm )

#endif

// end bli_l3_oft.h
// begin bli_l3_oft_var.h

#ifndef BLIS_L3_OFT_VAR_H
#define BLIS_L3_OFT_VAR_H

//
// -- Level-3 variant function types -------------------------------------------
//

// A single typedef, l3_var_oft, is generated from this template.
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
( \
  obj_t* a, \
  obj_t* b, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm, \
  cntl_t* cntl, \
  thrinfo_t* thread \
);

GENTDEF( l3 )

#endif

// end bli_l3_oft_var.h

// begin bli_l3_blocksize.h

// Generic entry point: unlike the per-operation functions below it also takes
// the control-tree node.
dim_t bli_l3_determine_kc
      (
        dir_t direct,
        dim_t i,
        dim_t dim,
        obj_t* a,
        obj_t* b,
        bszid_t bszid,
        cntx_t* cntx,
        cntl_t* cntl
      );

// Per-operation functions that take the direction as an argument.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dir_t direct, \
       dim_t i, \
       dim_t dim, \
       obj_t* a, \
       obj_t* b, \
       bszid_t bszid, \
       cntx_t* cntx \
     );

GENPROT( gemm_determine_kc )
GENPROT( gemmt_determine_kc )
GENPROT( trmm_determine_kc )
GENPROT( trsm_determine_kc )

// Direction-specific functions (_f / _b suffixes) without the `direct`
// argument.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t i, \
       dim_t dim, \
       obj_t* a, \
       obj_t* b, \
       bszid_t bszid, \
       cntx_t* cntx \
     );

GENPROT( gemm_determine_kc_f )
GENPROT( gemm_determine_kc_b )

GENPROT( gemmt_determine_kc_f )
GENPROT( gemmt_determine_kc_b )

GENPROT( trmm_determine_kc_f )
GENPROT( trmm_determine_kc_b )

GENPROT( trsm_determine_kc_f )
GENPROT( trsm_determine_kc_b )

// end bli_l3_blocksize.h
// begin bli_l3_direct.h

dir_t bli_l3_direct
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// Per-operation direction queries; these take only the three operands.
#undef GENPROT
#define GENPROT( opname ) \
\
dir_t PASTEMAC0(opname) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm_direct )
GENPROT( gemmt_direct )
GENPROT( trmm_direct )
GENPROT( trsm_direct )

// end bli_l3_direct.h
// begin bli_l3_prune.h

// Generic pruning entry points, one per dimension (m, n, k); these take the
// control-tree node.
#undef GENPROT
#define GENPROT( dim ) \
\
void PASTEMAC(l3_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c, \
       cntl_t* cntl \
     );

GENPROT( m )
GENPROT( n )
GENPROT( k )

// -----------------------------------------------------------------------------

// Per-operation, per-dimension pruning functions (no control-tree argument).
#undef GENPROT
#define GENPROT( opname, dim ) \
\
void PASTEMAC2(opname,_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm, m )
GENPROT( gemm, n )
GENPROT( gemm, k )

GENPROT( gemmt, m )
GENPROT( gemmt, n )
GENPROT( gemmt, k )

GENPROT( trmm, m )
GENPROT( trmm, n )
GENPROT( trmm, k )

GENPROT( trsm, m )
GENPROT( trsm, n )
GENPROT( trsm, k )

// end bli_l3_prune.h
// begin bli_l3_schema.h

void bli_l3_set_schemas
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx
     );

// end bli_l3_schema.h

// Prototype object APIs (basic and expert).
// begin bli_l3_oapi.h

//
// Prototype object-based interfaces (basic).
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h

//
// Prototype BLAS-like interfaces with typed operands (basic).
//

// In every prototype below a matrix operand is passed as a pointer followed
// by its row stride and column stride (rs_*, cs_*).

// gemm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( gemm )

// hemm, symm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       conj_t conja, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )

// herk: both alpha and beta use the second type parameter, ctype_r.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype_r* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROTR_BASIC0( herk )

// her2k: only beta uses ctype_r; alpha uses ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROTR_BASIC0( her2k )

// syrk
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( syrk )

// gemmt, syr2k
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )

// trmm3
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( trmm3 )

// trmm, trsm: no beta and no c operand.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi.h
// begin bli_l3_tapi_ex.h

//
// Prototype BLAS-like interfaces with typed operands (expert).
//

// Expert typed API: each prototype ends with a context and a runtime pointer,
// and the function name is pasted from the type character, the operation name
// and BLIS_TAPI_EX_SUF (defined outside this section).

// gemm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemm )

// hemm, symm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       conj_t conja, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )

// herk: both alpha and beta use the second type parameter, ctype_r.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype_r* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( herk )

// her2k: only beta uses ctype_r; alpha uses ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( her2k )

// syrk
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( syrk )

// gemmt, syr2k
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )

// trmm3
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm3 )

// trmm, trsm: no beta and no c operand.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi_ex.h

// Define function types for small/unpacked handlers/kernels.
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
#undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_gemm_md_c2r_ref.h // Define a local struct type that makes returning two values easier. typedef struct mddm_s { dom_t comp; dom_t exec; } mddm_t; void bli_gemm_md ( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_local, cntx_t** cntx ); mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); // ----------------------------------------------------------------------------- void bli_gemm_md_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); void bli_gemm_md_zgemm ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary 
if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crr is already unconditionally associated with an // execution domain of BLIS_REAL.) if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_REAL ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since ccr is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) if ( bli_obj_is_complex( c ) && bli_obj_is_complex( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crc is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) 
if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_complex( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemm_md_ker_var2_recast ( num_t* dt_comp, num_t dt_a, num_t dt_b, num_t* dt_c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, obj_t* c, inc_t* rs_c, inc_t* cs_c ) { if ( bli_is_real( *dt_c ) && bli_is_complex( dt_a ) && bli_is_complex( dt_b ) ) { // The rcc case is executed with a real macrokernel, so we need to // double the k dimension (because both A and B are packed to the 1r // schema), and also the panel strides of A and B since they were // packed as complex matrices and we now need to convert them to // units of real elements. *k *= 2; *ps_a *= 2; *ps_b *= 2; } else if ( bli_is_complex( *dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_row_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *n *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; } else #endif { // Generally speaking, the crc case is executed with a complex // macrokernel, so we need to halve the panel stride of A (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. 
*ps_a /= 2; } } else if ( bli_is_complex( *dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_col_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *m *= 2; *pd_a *= 2; *ps_a *= 2; *cs_c *= 2; } else #endif { // Generally speaking, the ccr case is executed with a complex // macrokernel, so we need to halve the panel stride of B (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. *ps_b /= 2; } } #if 0 else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. //printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k ); } else if ( bli_is_complex( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { // No action needed. 
} #endif } // end bli_gemm_md.h #endif // end bli_gemm.h // begin bli_hemm.h // begin bli_hemm_front.h void bli_hemm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_hemm_front.h // end bli_hemm.h // begin bli_symm.h // begin bli_symm_front.h void bli_symm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_symm_front.h // end bli_symm.h // begin bli_trmm.h // begin bli_trmm_front.h void bli_trmm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm_front.h // begin bli_trmm_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); //GENPROT( trmm_blk_var1 ) //GENPROT( trmm_blk_var2 ) //GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) GENPROT( trmm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
//

// gemmt macrokernel prototype. Compared with the trmm/trsm macrokernel
// prototypes it names the diagonal offset "diagoffc" and carries two extra
// strides, is_a and is_b.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffc, \
       pack_t schema_a, \
       pack_t schema_b, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       void* alpha, \
       void* a, inc_t cs_a, inc_t is_a, \
       dim_t pd_a, inc_t ps_a, \
       void* b, inc_t rs_b, inc_t is_b, \
       dim_t pd_b, inc_t ps_b, \
       void* beta, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

// end bli_gemmt_var.h

// end bli_gemmt.h
// end bli_l3.h

// -- Utility operations --

// begin bli_util.h

// begin bli_util_check.h

//
// Prototype object-based check functions.
//

// Each GENPROT below is redefined for one parameter list and then
// instantiated for the operations that share it; every generated function is
// named PASTEMAC( opname, _check ) and returns void.

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* asum \
     );

GENPROT( asumv )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x \
     );

GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* norm \
     );

GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )

// Same parameter list as the vector norms above, redefined for the matrix
// norms.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* norm \
     );

GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x \
     );

GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* scale, \
       obj_t* sumsq \
     );

GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// NOTE(review): eqsc is generated through GENTPROT rather than GENPROT, even
// though the macro takes only an opname like its GENPROT siblings. This
// leaves GENTPROT redefined for whatever follows.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* chi, \
       obj_t* psi, \
       bool* is_eq \
     );

GENTPROT( eqsc )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// -----------------------------------------------------------------------------

// Explicitly prototyped (non-macro) check helpers.

void bli_utilv_xi_check
     (
       obj_t* x,
       obj_t* index
     );

void bli_utilv_xa_check
     (
       obj_t* x,
       obj_t* asum
     );

void bli_utilm_mkhst_check
     (
       obj_t* a
     );

void bli_utilv_norm_check
     (
       obj_t* x,
       obj_t* norm
     );

void bli_utilm_norm_check
     (
       obj_t* x,
       obj_t* norm
     );

void bli_utilm_fprint_check
     (
       FILE* file,
       char* s1,
       obj_t* x,
       char* format,
       char* s2
     );

void bli_utilm_rand_check
     (
       obj_t* x
     );

void bli_utilv_sumsqv_check
     (
       obj_t* x,
       obj_t* scale,
       obj_t* sumsq
     );

// end bli_util_check.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion begins with a comma so it can be appended directly
// after the last regular parameter.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_util_oapi.h

//
// Prototype object-based interfaces.
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes in this guarded region are emitted only while
// BLIS_OAPI_BASIC is defined. They use PASTEMAC0( opname ), i.e. no EX_SUF
// suffix and no expert parameters.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// Same as the fprint prototypes above, minus the FILE* argument.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes in this guarded region are emitted only while
// BLIS_OAPI_BASIC is defined. They use PASTEMAC0( opname ), i.e. no EX_SUF
// suffix and no expert parameters.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// Same as the fprint prototypes above, minus the FILE* argument.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion begins with a comma so it can be appended directly
// after the last regular parameter.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_util_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Each macro below generates an exported prototype named
// PASTEMAC2( ch, opname, EX_SUF ) whose parameter list ends with
// BLIS_TAPI_EX_PARAMS. The GENTPROTR forms additionally take a second type
// argument, ctype_r, used for the norm/sum/scale outputs.

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( asumv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( mkherm )
INSERT_GENTPROT_BASIC0( mksymm )
INSERT_GENTPROT_BASIC0( mktrim )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1v )
INSERT_GENTPROTR_BASIC0( normfv )
INSERT_GENTPROTR_BASIC0( normiv )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1m )
INSERT_GENTPROTR_BASIC0( normfm )
INSERT_GENTPROTR_BASIC0( normim )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randv )
INSERT_GENTPROT_BASIC0( randnv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randm )
INSERT_GENTPROT_BASIC0( randnm )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.

// The prototypes in this guarded region are emitted only while
// BLIS_TAPI_BASIC is defined. They use PASTEMAC( ch, opname ), i.e. no EX_SUF
// suffix and no expert parameters.
#ifdef BLIS_TAPI_BASIC

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )

// NOTE: the print prototypes take x as void* (not ctype*) and are
// instantiated with INSERT_GENTPROT_BASIC0_I rather than
// INSERT_GENTPROT_BASIC0.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t n, \
       void* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t m, \
       dim_t n, \
       void* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//

// Each macro below generates a function-pointer typedef named
// PASTECH3( ch, opname, EX_SUF, tsuf ). The parameter lists mirror the
// corresponding typed prototypes in bli_util_tapi.h.

// asumv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// NOTE: the fprintv/fprintm typedefs do not append BLIS_TAPI_EX_PARAMS.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.

// These typedefs use PASTECH2( ch, opname, tsuf ), i.e. no EX_SUF component,
// and are emitted only while BLIS_TAPI_BASIC is defined.
#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 
INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.
// These prototypes use PASTEMAC(ch,opname) (no EX_SUF) and take no
// BLIS_TAPI_EX_PARAMS; they are emitted only while BLIS_TAPI_BASIC is defined.

#ifdef BLIS_TAPI_BASIC

// eqsc
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )

// eqv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )

// eqm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )

// printv
// NOTE: the operand is declared as void* here; ctype is not used in the
// parameter list of this template.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t n, \
       void* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )

// printm (operand likewise declared as void*)
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t m, \
       dim_t n, \
       void* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//
// Each template below expands to a typedef of a pointer-to-function type whose
// name is pasted from ch, opname, EX_SUF and tsuf.

// asumv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// NOTE: this template and fprintm below do not append BLIS_TAPI_EX_PARAMS.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.
// These typedef names are pasted with PASTECH2(ch,opname,tsuf), i.e. without
// EX_SUF.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_util_fpa.h

//
// Prototype function pointer query interface.
//

// Each GENPROT( opname ) expands to the prototype of a query function that
// takes a num_t and returns the operation's _vft function-pointer type. This
// first group pastes BLIS_TAPI_EX_SUF into both the type and function names.

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( asumv )
GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )
GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )
GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )
GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )
GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// This second group omits BLIS_TAPI_EX_SUF from the pasted names.

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )
GENPROT( fprintv )
GENPROT( fprintm )
//GENPROT( printv )
//GENPROT( printm )

// end bli_util_fpa.h

// Prototype level-1m implementations.
// begin bli_util_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Unlike the typed-API templates, these variant prototypes spell out the
// trailing cntx_t* and rntm_t* parameters explicitly and are not marked
// BLIS_EXPORT_BLIS (except fprintv/fprintm at the end of this file).

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( asumv_unb_var1 )

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( mkherm_unb_var1 )
INSERT_GENTPROT_BASIC0( mksymm_unb_var1 )
INSERT_GENTPROT_BASIC0( mktrim_unb_var1 )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfv_unb_var1 )
INSERT_GENTPROTR_BASIC0( normiv_unb_var1 )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfm_unb_var1 )
INSERT_GENTPROTR_BASIC0( normim_unb_var1 )

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randv_unb_var1 )
INSERT_GENTPROT_BASIC0( randnv_unb_var1 )

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randm_unb_var1 )
INSERT_GENTPROT_BASIC0( randnm_unb_var1 )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 )

// -----------------------------------------------------------------------------

// The eqv/eqm variants return bool and take no cntx_t*/rntm_t* parameters.

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
     );

INSERT_GENTPROT_BASIC0( eqv_unb_var1 )

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
     );

INSERT_GENTPROT_BASIC0( eqm_unb_var1 )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintm )

// end bli_util_unb_var1.h

// end bli_util.h

// -- addon definitions --

// NOTE: These definitions should not be included much earlier since an addon
// may wish to utilize other types and definitions provided by BLIS.
// begin bli_addon.h

#ifndef BLIS_ADDON_H
#define BLIS_ADDON_H

// The "#if 0" below means BLIS_DISABLE_ADDONS is the macro that gets defined
// in this copy of the header.
#if 0
#define BLIS_ENABLE_ADDONS
#else
#define BLIS_DISABLE_ADDONS
#endif

// Enabled addons

#endif
// end bli_addon.h

// -- sandbox implementation --

// begin bli_sbox.h

#ifndef BLIS_SBOX_H
#define BLIS_SBOX_H

// Each sandbox must have a bli_sandbox.h file present somewhere inside.
// If a sandbox was enabled at configure-time, we need to #include its
// header file here so that it will get pulled into blis.h when it is
// flattened into a monolithic header.
#ifdef BLIS_ENABLE_SANDBOX
#include "bli_sandbox.h" // skipped
#endif

#endif
// end bli_sbox.h

// -- BLAS compatibility layer --

// begin bli_blas.h

// If the CBLAS compatibility layer was enabled while the BLAS layer
// was not enabled, we must enable it here.
#ifdef BLIS_ENABLE_CBLAS
#ifndef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS
#endif
#endif // BLIS_ENABLE_CBLAS

// By default, if the BLAS compatibility layer is enabled, we define
// (include) all of the BLAS prototypes. However, if the user is
// #including "blis.h" and also #including another header that also
// declares the BLAS functions, then we provide an opportunity to
// #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below).
#ifdef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS_DEFS
#else
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the BLAS test drivers are being
// compiled.
#ifdef BLIS_VIA_BLASTEST
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the environment has defined the
// macro BLIS_DISABLE_BLAS_DEFS.
#ifdef BLIS_DISABLE_BLAS_DEFS
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Begin including all BLAS prototypes.
#ifdef BLIS_ENABLE_BLAS_DEFS

// -- System headers needed by BLAS compatibility layer --

// NOTE(review): the header name was missing from this directive, which makes
// it ill-formed whenever BLIS_ENABLE_BLAS_DEFS is defined. <string.h> is
// restored here as the presumed original; confirm against upstream bli_blas.h.
#include <string.h> // skipped

// -- Constants --

// Size of the local func_str buffers declared by the bla_*_check macros.
#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)

// -- Utility macros --

// The bla_* helpers below are plain prototypes, each guarded by
// BLIS_ENABLE_BLAS and declared without an export macro.

// begin bla_r_sign.h
#ifdef BLIS_ENABLE_BLAS
double bla_r_sign(const bla_real *a, const bla_real *b);
#endif
// end bla_r_sign.h

// begin bla_d_sign.h
#ifdef BLIS_ENABLE_BLAS
double bla_d_sign(const bla_double *a, const bla_double *b);
#endif
// end bla_d_sign.h

// begin bla_r_cnjg.h
#ifdef BLIS_ENABLE_BLAS
void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src);
#endif
// end bla_r_cnjg.h

// begin bla_d_cnjg.h
#ifdef BLIS_ENABLE_BLAS
void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src);
#endif
// end bla_d_cnjg.h

// begin bla_r_imag.h
#ifdef BLIS_ENABLE_BLAS
bla_real bla_r_imag(const bla_scomplex *z);
#endif
// end bla_r_imag.h

// begin bla_d_imag.h
#ifdef BLIS_ENABLE_BLAS
double bla_d_imag(const bla_dcomplex *z);
#endif
// end bla_d_imag.h

// begin bla_c_div.h
#ifdef BLIS_ENABLE_BLAS
void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp);
#endif
// end bla_c_div.h

// begin bla_z_div.h
#ifdef BLIS_ENABLE_BLAS
void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp);
#endif
// end bla_z_div.h

// begin bla_f__cabs.h
#ifdef BLIS_ENABLE_BLAS
double bla_f__cabs(double real, double imag);
#endif
// end bla_f__cabs.h

// begin bla_r_abs.h
#ifdef BLIS_ENABLE_BLAS
double bla_r_abs(const bla_real *x);
#endif
// end bla_r_abs.h

// begin bla_d_abs.h
#ifdef BLIS_ENABLE_BLAS
double bla_d_abs(const bla_double *x);
#endif
// end bla_d_abs.h

// begin bla_c_abs.h
#ifdef BLIS_ENABLE_BLAS
double bla_c_abs(const bla_scomplex *z);
#endif
// end bla_c_abs.h

// begin bla_z_abs.h
#ifdef BLIS_ENABLE_BLAS
double bla_z_abs(const bla_dcomplex *z);
#endif
// end bla_z_abs.h

// begin bla_lsame.h
#ifdef BLIS_ENABLE_BLAS
// Two alternative signatures: long-based when LAPACK_ILP64 is defined,
// int-based otherwise. Only the int-based one carries BLIS_EXPORT_BLAS.
#ifdef LAPACK_ILP64
long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
#else
BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
#endif
#endif
// end bla_lsame.h

// begin bla_xerbla.h
#ifdef BLIS_ENABLE_BLAS
BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);
#endif
// end bla_xerbla.h

// begin bla_xerbla_array.h
#ifdef BLIS_ENABLE_BLAS
BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);
#endif
// end bla_xerbla_array.h

// -- Level-0 BLAS prototypes --

// begin bla_cabs1.h
#ifdef BLIS_ENABLE_BLAS
BLIS_EXPORT_BLAS bla_real PASTEF77(s,cabs1)(bla_scomplex *z);
BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);
#endif
// end bla_cabs1.h

// -- Level-1 BLAS prototypes --

// begin bla_amax.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// The name is pasted from the literal prefix "i", chx and blasname; the
// prototype returns f77_int.
#undef GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif
// end bla_amax.h

// begin bla_asum.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// The name is pasted from chr, chx and blasname; the prototype returns
// ftype_r.
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif
// end bla_asum.h

// begin bla_axpy.h

//
// Prototype BLAS-to-BLIS interfaces.
//
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif
// end bla_axpy.h

// begin bla_copy.h

//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( copy ) #endif // end bla_copy.h // begin bla_dot.h #ifdef BLIS_ENABLE_BLAS // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTR_BLAS( dot ) #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL INSERT_GENTPROTDOTC_BLAS( dot ) #else // For the "intel" complex return type, we use a hidden parameter (passed by // address) to return the result. #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \ ( \ ftype* rhop, \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTC_BLAS( dot ) #endif // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS float PASTEF77(sd,sdot) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); BLIS_EXPORT_BLAS double PASTEF77(d,sdot) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); #endif // end bla_dot.h // begin bla_nrm2.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end bla_nrm2.h // begin bla_rot.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s); #endif // end bla_rot.h // begin bla_rotg.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s); BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s); #endif // end bla_rotg.h // begin bla_rotm.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam); BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam); #endif // end bla_rotm.h // begin bla_rotmg.h 
#ifdef BLIS_ENABLE_BLAS
BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam);
BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);
#endif
// end bla_rotmg.h

// begin bla_scal.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// alpha and x have separate type parameters (ftype_a, ftype_x); the name is
// pasted from chx, cha and blasname, in that order.
#undef GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
     ( \
       const f77_int* n, \
       const ftype_a* alpha, \
       ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif
// end bla_scal.h

// begin bla_swap.h

//
// Prototype BLAS-to-BLIS interfaces.
//
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       ftype* x, const f77_int* incx, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif
// end bla_swap.h

// begin f77_amax_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//
// The "sub" prototypes return void and deliver their result through a
// trailing rval pointer; they are instantiated only under BLIS_ENABLE_CBLAS.
#undef GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       f77_int* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROT_BLAS( amax )
#endif
// end f77_amax_sub.h

// begin f77_asum_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       ftype_r* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif
// end f77_asum_sub.h

// begin f77_dot_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTDOT_BLAS( dot ) // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval ); BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* rval ); #endif // end f77_dot_sub.h // begin f77_nrm2_sub.h // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end f77_nrm2_sub.h // -- Level-2 BLAS prototypes -- // dense // begin bla_gemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemv ) #endif // end bla_gemv.h // begin bla_ger.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, chxy, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \ ( \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTDOT_BLAS( ger ) #endif // end bla_ger.h // begin bla_hemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemv ) #endif // end bla_hemv.h // begin bla_her.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype_r* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her ) #endif // end bla_her.h // begin bla_her2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2 ) #endif // end bla_her2.h // begin bla_symv.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( symv ) #endif // end bla_symv.h // begin bla_syr.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr ) #endif // end bla_syr.h // begin bla_syr2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr2 ) #endif // end bla_syr2.h // begin bla_trmv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmv ) #endif // end bla_trmv.h // begin bla_trsv.h // // Prototype BLAS-to-BLIS interfaces. 
//

// trsv prototype template: identical argument list to the trmv template.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const ftype*    a, const f77_int* lda, \
             ftype*    x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif
// end bla_trsv.h
// begin bla_gemv_check.h

// All level-2 check macros below share one shape: they expand to a braced
// block that tests the arguments in a fixed order and records the code of
// the FIRST failing test in `info`.  When `info` is nonzero the routine
// name is formatted into `func_str`, upper-cased with bli_string_mkupper(),
// handed to xerbla together with `info`, and the macro executes a bare
// `return;` -- so it leaves the function in which it was expanded.
#ifdef BLIS_ENABLE_BLAS

// gemv: transa must match "N", "T" or "C" (1); m >= 0 (2); n >= 0 (3);
// lda >= max(1,m) (6); incx != 0 (8); incy != 0 (11).
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
    f77_int info  = 0; \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !( nota || ta || conja ) )  info = 1; \
    else if ( *m < 0 )                    info = 2; \
    else if ( *n < 0 )                    info = 3; \
    else if ( *lda < bli_max( 1, *m ) )   info = 6; \
    else if ( *incx == 0 )                info = 8; \
    else if ( *incy == 0 )                info = 11; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_gemv_check.h
// begin bla_ger_check.h

#ifdef BLIS_ENABLE_BLAS

// ger: m >= 0 (1); n >= 0 (2); incx != 0 (5); incy != 0 (7);
// lda >= max(1,m) (9).  The reported name is dt_str, op_str and conj_str
// concatenated, with conj_str left-justified in two columns.
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
    f77_int info = 0; \
\
    if      ( *m < 0 )                    info = 1; \
    else if ( *n < 0 )                    info = 2; \
    else if ( *incx == 0 )                info = 5; \
    else if ( *incy == 0 )                info = 7; \
    else if ( *lda < bli_max( 1, *m ) )   info = 9; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
        sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_ger_check.h
// begin bla_hemv_check.h

#ifdef BLIS_ENABLE_BLAS

// hemv: uploa must match "L" or "U" (1); m >= 0 (2);
// lda >= max(1,m) (5); incx != 0 (7); incy != 0 (10).
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !( lower || upper ) )       info = 1; \
    else if ( *m < 0 )                    info = 2; \
    else if ( *lda < bli_max( 1, *m ) )   info = 5; \
    else if ( *incx == 0 )                info = 7; \
    else if ( *incy == 0 )                info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_hemv_check.h
// begin bla_her_check.h

#ifdef BLIS_ENABLE_BLAS

// her: uploa must match "L" or "U" (1); m >= 0 (2); incx != 0 (5);
// lda >= max(1,m) (7).
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !( lower || upper ) )       info = 1; \
    else if ( *m < 0 )                    info = 2; \
    else if ( *incx == 0 )                info = 5; \
    else if ( *lda < bli_max( 1, *m ) )   info = 7; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_her_check.h
// begin bla_her2_check.h

#ifdef BLIS_ENABLE_BLAS

// her2: uploa must match "L" or "U" (1); m >= 0 (2); incx != 0 (5);
// incy != 0 (7); lda >= max(1,m) (9).
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !( lower || upper ) )       info = 1; \
    else if ( *m < 0 )                    info = 2; \
    else if ( *incx == 0 )                info = 5; \
    else if ( *incy == 0 )                info = 7; \
    else if ( *lda < bli_max( 1, *m ) )   info = 9; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_her2_check.h
// begin bla_symv_check.h

// The symmetric variants reuse the Hermitian checks unchanged.
#ifdef BLIS_ENABLE_BLAS

#define bla_symv_check bla_hemv_check

#endif
// end bla_symv_check.h
// begin bla_syr_check.h

#ifdef BLIS_ENABLE_BLAS

#define bla_syr_check bla_her_check

#endif
// end bla_syr_check.h
// begin bla_syr2_check.h

#ifdef BLIS_ENABLE_BLAS

#define bla_syr2_check bla_her2_check

#endif
// end bla_syr2_check.h
// begin bla_trmv_check.h

#ifdef BLIS_ENABLE_BLAS

// trmv: uploa must match "L" or "U" (1); transa "N", "T" or "C" (2);
// diaga "U" or "N" (3); m >= 0 (4); lda >= max(1,m) (6); incx != 0 (8).
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !( lower || upper ) )       info = 1; \
    else if ( !( nota || ta || conja ) )  info = 2; \
    else if ( !( unita || nonua ) )       info = 3; \
    else if ( *m < 0 )                    info = 4; \
    else if ( *lda < bli_max( 1, *m ) )   info = 6; \
    else if ( *incx == 0 )                info = 8; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_trmv_check.h
// begin bla_trsv_check.h

#ifdef BLIS_ENABLE_BLAS

#define bla_trsv_check bla_trmv_check

#endif
// end bla_trsv_check.h

// packed
// begin bla_hpmv.h

// Unlike the templates above, the packed and banded routines are declared
// one by one, use the bla_* types, and return int.
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(
    const bla_character *uplo, const bla_integer *n,
    const bla_scomplex *alpha, const bla_scomplex *ap,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *beta,
    bla_scomplex *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(
    const bla_character *uplo, const bla_integer *n,
    const bla_dcomplex *alpha, const bla_dcomplex *ap,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *beta,
    bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hpmv.h
// begin bla_hpr.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha,
    const bla_scomplex *x, const bla_integer *incx,
    bla_scomplex *ap);

BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha,
    const bla_dcomplex *x, const bla_integer *incx,
    bla_dcomplex *ap);

#endif
// end bla_hpr.h
// begin bla_hpr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(
    const bla_character *uplo, const bla_integer *n,
    const bla_scomplex *alpha,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *y, const bla_integer *incy,
    bla_scomplex *ap);

BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(
    const bla_character *uplo, const bla_integer *n,
    const bla_dcomplex *alpha,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *y, const bla_integer *incy,
    bla_dcomplex *ap);

#endif
// end bla_hpr2.h
// begin bla_spmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha, const bla_double *ap,
    const bla_double *x, const bla_integer *incx,
    const bla_double *beta,
    bla_double *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha, const bla_real *ap,
    const bla_real *x, const bla_integer *incx,
    const bla_real *beta,
    bla_real *y, const bla_integer *incy);

#endif
// end bla_spmv.h
// begin bla_spr.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr)(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha,
    const bla_double *x, const bla_integer *incx,
    bla_double *ap);

BLIS_EXPORT_BLAS int PASTEF77(s,spr)(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha,
    const bla_real *x, const bla_integer *incx,
    bla_real *ap);

#endif
// end bla_spr.h
// begin bla_spr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha,
    const bla_double *x, const bla_integer *incx,
    const bla_double *y, const bla_integer *incy,
    bla_double *ap);

BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha,
    const bla_real *x, const bla_integer *incx,
    const bla_real *y, const bla_integer *incy,
    bla_real *ap);

#endif
// end bla_spr2.h
// begin bla_tpmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_scomplex *ap,
    bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_double *ap,
    bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_real *ap,
    bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_dcomplex *ap,
    bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpmv.h
// begin bla_tpsv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_scomplex *ap,
    bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_double *ap,
    bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_real *ap,
    bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_dcomplex *ap,
    bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpsv.h

// banded
// begin bla_gbmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_scomplex *alpha,
    const bla_scomplex *a, const bla_integer *lda,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *beta,
    bla_scomplex *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_double *alpha,
    const bla_double *a, const bla_integer *lda,
    const bla_double *x, const bla_integer *incx,
    const bla_double *beta,
    bla_double *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_real *alpha,
    const bla_real *a, const bla_integer *lda,
    const bla_real *x, const bla_integer *incx,
    const bla_real *beta,
    bla_real *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_dcomplex *alpha,
    const bla_dcomplex *a, const bla_integer *lda,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *beta,
    bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_gbmv.h
// begin bla_hbmv.h
// Banded Hermitian / symmetric / triangular routines.  Each is declared
// individually with the bla_* types and an int return type, and only when
// the BLAS layer is enabled.
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_scomplex *alpha,
    const bla_scomplex *a, const bla_integer *lda,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *beta,
    bla_scomplex *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_dcomplex *alpha,
    const bla_dcomplex *a, const bla_integer *lda,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *beta,
    bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hbmv.h
// begin bla_sbmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_double *alpha,
    const bla_double *a, const bla_integer *lda,
    const bla_double *x, const bla_integer *incx,
    const bla_double *beta,
    bla_double *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_real *alpha,
    const bla_real *a, const bla_integer *lda,
    const bla_real *x, const bla_integer *incx,
    const bla_real *beta,
    bla_real *y, const bla_integer *incy);

#endif
// end bla_sbmv.h
// begin bla_tbmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_scomplex *a, const bla_integer *lda,
    bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_double *a, const bla_integer *lda,
    bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_real *a, const bla_integer *lda,
    bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_dcomplex *a, const bla_integer *lda,
    bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbmv.h
// begin bla_tbsv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_scomplex *a, const bla_integer *lda,
    bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_double *a, const bla_integer *lda,
    bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_real *a, const bla_integer *lda,
    bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_dcomplex *a, const bla_integer *lda,
    bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbsv.h

// -- Level-3 BLAS prototypes --

// begin bla_gemm.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// gemm prototype template: two transpose options, three dimensions, and
// operands a, b (read-only) and c (writable), each with a leading
// dimension.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  n, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm )
#endif
// end bla_gemm.h
// begin bla_hemm.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// hemm prototype template: side and uplo options, two dimensions; alpha
// and beta use the full type `ftype`.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hemm )
#endif
// end bla_hemm.h
// begin bla_herk.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// herk prototype template: both alpha and beta are declared with
// `ftype_r`, not `ftype`.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype_r*  alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype_r*  beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( herk )
#endif
// end bla_herk.h
// begin bla_her2k.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// her2k prototype template: alpha uses `ftype` while beta uses `ftype_r`.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype_r*  beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( her2k )
#endif
// end bla_her2k.h
// begin bla_symm.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// symm prototype template: same argument list as the hemm template, but
// generated through GENTPROT/INSERT_GENTPROT_BLAS.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( symm )
#endif
// end bla_symm.h
// begin bla_syrk.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// syrk prototype template: alpha and beta both use `ftype`.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( syrk )
#endif
// end bla_syrk.h
// begin bla_syr2k.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// syr2k prototype template: alpha and beta both use `ftype`.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( syr2k )
#endif
// end bla_syr2k.h
// begin bla_trmm.h

//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ ftype* b, const f77_int* ldb \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmm ) #endif // end bla_trmm.h // begin bla_trsm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ ftype* b, const f77_int* ldb \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trsm ) #endif // end bla_trsm.h // begin bla_gemm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, notb; \ f77_int conja, conjb; \ f77_int ta, tb; \ f77_int nrowa, nrowb; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ if ( notb ) { nrowb = *k; } \ else { nrowb = *n; } \ \ if ( !nota && !conja && !ta ) \ info = 1; \ else if ( !notb && !conjb && !tb ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *k < 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 8; \ else if ( *ldb < bli_max( 1, nrowb ) ) \ info = 10; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 13; \ 
\ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_gemm_check.h // begin bla_hemm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int left, right; \ f77_int lower, upper; \ f77_int nrowa; \ \ left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \ right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( left ) { nrowa = *m; } \ else { nrowa = *n; } \ \ if ( !left && !right ) \ info = 1; \ else if ( !lower && !upper ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldb < bli_max( 1, *m ) ) \ info = 9; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 12; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_hemm_check.h // begin bla_herk_check.h #ifdef BLIS_ENABLE_BLAS #define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \ { \ f77_int info = 0; \ f77_int nota, conja; \ f77_int lower, upper; \ f77_int nrowa; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !conja ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ 
else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 10; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_herk_check.h // begin bla_her2k_check.h #ifdef BLIS_ENABLE_BLAS #define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, conja; \ f77_int lower, upper; \ f77_int nrowa; \ \ nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !conja ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldb < bli_max( 1, nrowa ) ) \ info = 9; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 12; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_her2k_check.h // begin bla_symm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_symm_check bla_hemm_check #endif // end bla_symm_check.h // begin bla_syrk_check.h #ifdef BLIS_ENABLE_BLAS #define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \ { \ f77_int info = 0; \ f77_int is_r; \ f77_int nota, ta, cta; \ f77_int lower, upper; \ f77_int nrowa; \ \ static char* dt_cst = dt_str; \ \ is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); 
\ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ cta = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !ta && (is_r ? !cta : 1) ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 10; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_syrk_check.h // begin bla_syr2k_check.h #ifdef BLIS_ENABLE_BLAS #define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int is_r; \ f77_int nota, ta, cta; \ f77_int lower, upper; \ f77_int nrowa; \ \ static char* dt_cst = dt_str; \ \ is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \ nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \ cta = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !ta && (is_r ? 
!cta : 1) ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldb < bli_max( 1, nrowa ) ) \ info = 9; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 12; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_syr2k_check.h // begin bla_trmm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \ { \ f77_int info = 0; \ f77_int left, right; \ f77_int lower, upper; \ f77_int nota, ta, conja; \ f77_int unita, nonua; \ f77_int nrowa; \ \ left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \ right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ unita = PASTEF770(lsame)( diaga, "U", (ftnlen)1, (ftnlen)1 ); \ nonua = PASTEF770(lsame)( diaga, "N", (ftnlen)1, (ftnlen)1 ); \ \ if ( left ) { nrowa = *m; } \ else { nrowa = *n; } \ \ if ( !left && !right ) \ info = 1; \ else if ( !lower && !upper ) \ info = 2; \ else if ( !nota && !ta && !conja ) \ info = 3; \ else if ( !unita && !nonua ) \ info = 4; \ else if ( *m < 0 ) \ info = 5; \ else if ( *n < 0 ) \ info = 6; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 9; \ else if ( *ldb < bli_max( 1, *m ) ) \ info = 11; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } 
#endif
// end bla_trmm_check.h
// begin bla_trsm_check.h

// trsm reuses the trmm check unchanged.
#ifdef BLIS_ENABLE_BLAS

#define bla_trsm_check bla_trmm_check

#endif
// end bla_trsm_check.h

// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// axpby prototype template: a length, two scalars (alpha, beta), a
// read-only vector `x` and the vector `y`, which is the only writable
// operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
       const ftype*   beta, \
             ftype*   y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif
// end bla_axpby.h

// level-3
// begin bla_gemmt.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// gemmt prototype template: like the gemm argument list, but with a
// leading uploc option and only two dimensions (m and k).
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif
// end bla_gemmt.h
// begin bla_gemmt_check.h

#ifdef BLIS_ENABLE_BLAS

// gemmt argument check.  Expands to a braced block that records the code
// of the FIRST failing test in `info`: uploc must match "L" or "U" (1);
// transa (2) and transb (3) must each match "N", "C" or "T"; m (4) and
// k (5) non-negative; lda >= max(1,nrowa) (8); ldb >= max(1,nrowb) (10);
// ldc >= max(1,m) (13).  nrowa is m when transa is "N" and k otherwise;
// nrowb is k when transb is "N" and m otherwise.  On failure the routine
// name is formatted, upper-cased, reported through xerbla, and the macro
// executes a bare `return;` in the function where it was expanded.
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
    f77_int info  = 0; \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( nota ? *m : *k ); \
    f77_int nrowb = ( notb ? *k : *m ); \
\
    if      ( !( lower || upper ) )          info = 1; \
    else if ( !( nota || conja || ta ) )     info = 2; \
    else if ( !( notb || conjb || tb ) )     info = 3; \
    else if ( *m < 0 )                       info = 4; \
    else if ( *k < 0 )                       info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) )   info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) )   info = 10; \
    else if ( *ldc < bli_max( 1, *m ) )      info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// gemm_batch prototype template: the gemm argument list with every
// parameter given an `_array` suffix; the matrix operands become
// pointer-to-pointer, and two trailing arguments, group_count and
// group_size, are appended.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa_array, \
       const f77_char* transb_array, \
       const f77_int*  m_array, \
       const f77_int*  n_array, \
       const f77_int*  k_array, \
       const ftype*    alpha_array, \
       const ftype**   a_array, const f77_int* lda_array, \
       const ftype**   b_array, const f77_int* ldb_array, \
       const ftype*    beta_array, \
             ftype**   c_array, const f77_int* ldc_array, \
       const f77_int*  group_count, \
       const f77_int*  group_size \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif
// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTCO is instantiated by INSERT_GENTPROTCO_BLAS() (defined
// elsewhere); ftype_r and chr are accepted but unused by this prototype.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  n, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( gemm3m )
#endif
// end bla_gemm3m.h
// begin bla_gemm3m_check.h

#ifdef BLIS_ENABLE_BLAS

// Validate the arguments of a gemm3m call.
//
// Expands to a statement block that must sit inside a function returning
// void: on the first invalid argument it reports the error through xerbla
// and then executes `return;` in the *calling* function.
//
// `info` is the 1-based position of the offending argument in the gemm3m
// prototype above (transa=1, transb=2, m=3, n=4, k=5, lda=8, ldb=10,
// ldc=13). All of transa/transb/m/n/k/lda/ldb/ldc are pointers.
//
// The routine name is formatted with snprintf() bounded by the size of
// func_str. "%-5s" only pads to five characters -- it does not truncate --
// so a longer op_str (e.g. "gemm3m") produces more than six characters;
// the bound guarantees the write stays inside the buffer regardless of
// how BLIS_MAX_BLAS_FUNC_STR_LENGTH is configured.
#define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota,  notb; \
    f77_int conja, conjb; \
    f77_int ta,    tb; \
    f77_int nrowa, nrowb; \
\
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else        { nrowb = *n; } \
\
    if      ( !nota && !conja && !ta ) \
        info = 1; \
    else if ( !notb && !conjb && !tb ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *n < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        /* Bounded write: never exceeds sizeof( func_str ). */ \
        snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemm3m_check.h

// -- Fortran-compatible APIs to BLIS functions --

// begin b77_thread.h

//
// Prototype Fortran-compatible BLIS interfaces.
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); // end b77_thread.h #endif // BLIS_ENABLE_BLAS // end bli_blas.h // -- CBLAS compatibility layer -- // begin bli_cblas.h #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. // begin cblas.h #ifndef CBLAS_H #define CBLAS_H #include // skipped // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. 
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. #if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). 
#if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_GENERIC // Enabled sub-configurations (config_list) #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define 
BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. 
#else // Default behavior is disabled. #endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? 
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? 
// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specified by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. 
#ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overriden by the main // program simply by providing a new definition. However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. 
#include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. // NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. #ifndef TRUE #define TRUE true #endif #ifndef FALSE #define FALSE false #endif // -- Special-purpose integers -- // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. 
#ifndef _DEFINED_DIM_T #define _DEFINED_DIM_T typedef gint_t dim_t; // dimension type #endif typedef gint_t inc_t; // increment/stride type typedef gint_t doff_t; // diagonal offset type typedef guint_t siz_t; // byte size type typedef uint32_t objbits_t; // object information bit field // -- Real types -- // Define the number of floating-point types supported, and the size of the // largest type. #define BLIS_NUM_FP_TYPES 4 #define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) // There are some places where we need to use sizeof() inside of a C // preprocessor #if conditional, and so here we define the various sizes // for those purposes. #define BLIS_SIZEOF_S 4 // sizeof(float) #define BLIS_SIZEOF_D 8 // sizeof(double) #define BLIS_SIZEOF_C 8 // sizeof(scomplex) #define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) // -- Complex types -- #ifdef BLIS_ENABLE_C99_COMPLEX #if __STDC_VERSION__ >= 199901L #include // skipped // Typedef official complex types to BLIS complex type names. typedef float complex scomplex; typedef double complex dcomplex; #else #error "Configuration requested C99 complex types, but C99 does not appear to be supported." #endif #else // ifndef BLIS_ENABLE_C99_COMPLEX // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_SCOMPLEX #define _DEFINED_SCOMPLEX typedef struct scomplex { float real; float imag; } scomplex; #endif // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DCOMPLEX #define _DEFINED_DCOMPLEX typedef struct dcomplex { double real; double imag; } dcomplex; #endif #endif // BLIS_ENABLE_C99_COMPLEX // -- Atom type -- // Note: atom types are used to hold "bufferless" scalar object values. Note // that it needs to be as large as the largest possible scalar value we might // want to hold. Thus, for now, it is a dcomplex. 
typedef dcomplex atom_t; // -- Fortran-77 types -- // Note: These types are typically only used by BLAS compatibility layer, but // we must define them even when the compatibility layer isn't being built // because they also occur in bli_slamch() and bli_dlamch(). // Define f77_int depending on what size of integer was requested. #if BLIS_BLAS_INT_TYPE_SIZE == 32 typedef int32_t f77_int; #elif BLIS_BLAS_INT_TYPE_SIZE == 64 typedef int64_t f77_int; #else typedef long int f77_int; #endif typedef char f77_char; typedef float f77_float; typedef double f77_double; typedef scomplex f77_scomplex; typedef dcomplex f77_dcomplex; // -- Misc. function pointer types -- // Note: This type should be used in any situation where the address of a // *function* will be conveyed or stored prior to it being typecast back // to the correct function type. It does not need to be used when conveying // or storing the address of *data* (such as an array of float or double). //typedef void (*void_fp)( void ); typedef void* void_fp; // Typedef function pointer types for malloc() and free() substitutes. 
typedef void* (*malloc_ft)( size_t size ); typedef void (*free_ft) ( void* p ); // // -- BLIS info bit field offsets ---------------------------------------------- // // info #define BLIS_DATATYPE_SHIFT 0 #define BLIS_DOMAIN_SHIFT 0 #define BLIS_PRECISION_SHIFT 1 #define BLIS_CONJTRANS_SHIFT 3 #define BLIS_TRANS_SHIFT 3 #define BLIS_CONJ_SHIFT 4 #define BLIS_UPLO_SHIFT 5 #define BLIS_UPPER_SHIFT 5 #define BLIS_DIAG_SHIFT 6 #define BLIS_LOWER_SHIFT 7 #define BLIS_UNIT_DIAG_SHIFT 8 #define BLIS_INVERT_DIAG_SHIFT 9 #define BLIS_TARGET_DT_SHIFT 10 #define BLIS_TARGET_DOMAIN_SHIFT 10 #define BLIS_TARGET_PREC_SHIFT 11 #define BLIS_EXEC_DT_SHIFT 13 #define BLIS_EXEC_DOMAIN_SHIFT 13 #define BLIS_EXEC_PREC_SHIFT 14 #define BLIS_PACK_SCHEMA_SHIFT 16 #define BLIS_PACK_RC_SHIFT 16 #define BLIS_PACK_PANEL_SHIFT 17 #define BLIS_PACK_FORMAT_SHIFT 18 #define BLIS_PACK_SHIFT 22 #define BLIS_PACK_REV_IF_UPPER_SHIFT 23 #define BLIS_PACK_REV_IF_LOWER_SHIFT 24 #define BLIS_PACK_BUFFER_SHIFT 25 #define BLIS_STRUC_SHIFT 27 #define BLIS_COMP_DT_SHIFT 29 #define BLIS_COMP_DOMAIN_SHIFT 29 #define BLIS_COMP_PREC_SHIFT 30 // info2 #define BLIS_SCALAR_DT_SHIFT 0 #define BLIS_SCALAR_DOMAIN_SHIFT 0 #define BLIS_SCALAR_PREC_SHIFT 1 // // -- BLIS info bit field masks ------------------------------------------------ // // info #define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT ) #define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT ) #define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT ) #define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT ) #define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 
0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | 
BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type 
definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = 
BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. 
#define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, 
BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // 
- keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. } bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. 
// Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! 
// -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: 
Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. 
//num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. typedef struct constdata_s { float s; double d; scomplex c; dcomplex z; gint_t i; } constdata_t; // // -- BLIS object type definitions --------------------------------------------- // // Forward declarations for function pointer types struct obj_s; struct cntx_s; struct rntm_s; struct thrinfo_s; typedef void (*obj_pack_fn_t) ( struct obj_s* a, struct obj_s* ap, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) ( struct obj_s* a, struct obj_s* b, struct obj_s* c, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef struct obj_s { // Basic fields struct obj_s* root; dim_t off[2]; dim_t dim[2]; doff_t diag_off; objbits_t info; objbits_t info2; siz_t elem_size; void* buffer; inc_t rs; inc_t cs; inc_t is; // Bufferless scalar storage atom_t scalar; // Pack-related fields dim_t m_padded; // m dimension of matrix, including any padding dim_t n_padded; // n dimension of matrix, including any padding inc_t ps; // panel stride (distance to next panel) inc_t pd; // panel dimension (the "width" of a panel: // usually MR or NR) dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel // User-customizable fields obj_pack_fn_t pack_fn; void* pack_params; obj_ker_fn_t ker_fn; void* ker_params; } obj_t; // Pre-initializors. 
// Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Private helper: the one obj_t initializer list shared by both public
// pre-initializers below. The only thing that differs between them is the
// pair of values placed in .dim, so those are the macro's two arguments.
// Every other field gets the same placeholder value in both cases.
#define BLIS_OBJECT_INITIALIZER_MXN_( dim_m, dim_n ) \
{ \
    .root        = NULL, \
\
    .off         = { 0, 0 }, \
    .dim         = { dim_m, dim_n }, \
    .diag_off    = 0, \
\
    .info        = 0x0 | BLIS_BITVAL_DENSE | \
                         BLIS_BITVAL_GENERAL, \
    .info2       = 0x0, \
    .elem_size   = sizeof( float ), \
\
    .buffer      = NULL, \
    .rs          = 0, \
    .cs          = 0, \
    .is          = 1, \
\
    .scalar      = { 0.0, 0.0 }, \
\
    .m_padded    = 0, \
    .n_padded    = 0, \
    .ps          = 0, \
    .pd          = 0, \
    .m_panel     = 0, \
    .n_panel     = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

// Pre-initializer for an obj_t whose dimensions are both zero.
#define BLIS_OBJECT_INITIALIZER      BLIS_OBJECT_INITIALIZER_MXN_( 0, 0 )

// Pre-initializer for a 1x1 obj_t; identical to the above except for .dim.
#define BLIS_OBJECT_INITIALIZER_1X1  BLIS_OBJECT_INITIALIZER_MXN_( 1, 1 )

// Define the following obj_t copy routines here since they must be updated
// if the contents of obj_t change.
// Make b a field-for-field copy of a.
//
// The copy is shallow: pointer members (root, buffer, pack_params,
// ker_params) and the function pointers (pack_fn, ker_fn) are duplicated as
// pointers, so both objects refer to the same underlying data afterwards.
// Every field of obj_t must be listed here; update this routine whenever
// obj_t gains or loses a member.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    const obj_t* src = a;
    obj_t*       dst = b;
    int          i;

    // Basic fields.
    dst->root = src->root;

    for ( i = 0; i < 2; ++i ) dst->off[ i ] = src->off[ i ];
    for ( i = 0; i < 2; ++i ) dst->dim[ i ] = src->dim[ i ];

    dst->diag_off  = src->diag_off;
    dst->info      = src->info;
    dst->info2     = src->info2;
    dst->elem_size = src->elem_size;

    dst->buffer = src->buffer;
    dst->rs     = src->rs;
    dst->cs     = src->cs;
    dst->is     = src->is;

    // Bufferless scalar storage.
    dst->scalar = src->scalar;

    // Pack-related fields. (There is no pack_mem member to copy.)
    dst->m_padded = src->m_padded;
    dst->n_padded = src->n_padded;
    dst->ps       = src->ps;
    dst->pd       = src->pd;
    dst->m_panel  = src->m_panel;
    dst->n_panel  = src->n_panel;

    // User-customizable fields.
    dst->pack_fn     = src->pack_fn;
    dst->pack_params = src->pack_params;
    dst->ker_fn      = src->ker_fn;
    dst->ker_params  = src->ker_params;
}

// Initialize b as a subpartition of a: copy every field of a into b EXCEPT
// dim[0] and dim[1], which are left untouched in b.
//
// The dimensions are skipped on purpose: the m and n of a subpartition are
// overwritten afterwards, so copying them here would be wasted work. As in
// the full copy, pointer members are copied shallowly.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    const obj_t* src = a;
    obj_t*       dst = b;
    int          i;

    // Basic fields, minus the dimensions.
    dst->root = src->root;

    for ( i = 0; i < 2; ++i ) dst->off[ i ] = src->off[ i ];

    dst->diag_off  = src->diag_off;
    dst->info      = src->info;
    dst->info2     = src->info2;
    dst->elem_size = src->elem_size;

    dst->buffer = src->buffer;
    dst->rs     = src->rs;
    dst->cs     = src->cs;
    dst->is     = src->is;

    // Bufferless scalar storage.
    dst->scalar = src->scalar;

    // No pack_mem entry is copied.
    // FGVZ: You should probably make sure this is right.

    // Pack-related fields.
    dst->m_padded = src->m_padded;
    dst->n_padded = src->n_padded;
    dst->ps       = src->ps;
    dst->pd       = src->pd;
    dst->m_panel  = src->m_panel;
    dst->n_panel  = src->n_panel;

    // User-customizable fields.
    dst->pack_fn     = src->pack_fn;
    dst->pack_params = src->pack_params;
    dst->ker_fn      = src->ker_fn;
    dst->ker_params  = src->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/include/linux-power9/000077500000000000000000000000001427272030600216435ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/linux-power9/blis.h000066400000000000000000045646601427272030600227720ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). 
// begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_POWER9 // Enabled sub-configurations (config_list) #define BLIS_CONFIG_POWER9 // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_POWER9 #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 0 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 
#define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. 
#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). #if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_lang_defs.h #ifndef BLIS_LANG_DEFS_H #define BLIS_LANG_DEFS_H // -- Undefine restrict for C++ and C89/90 -- #ifdef __cplusplus // Language is C++; define restrict as nothing. #ifndef restrict #define restrict #endif #elif __STDC_VERSION__ >= 199901L // Language is C99 (or later); do nothing since restrict is recognized. #else // Language is pre-C99; define restrict as nothing. 
#ifndef restrict #define restrict #endif #endif // -- Define typeof() operator if using non-GNU compiler -- #ifndef __GNUC__ #define typeof __typeof__ #else #ifndef typeof #define typeof __typeof__ #endif #endif // -- BLIS Thread Local Storage Keyword -- // __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support __thread, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) #define BLIS_THREAD_LOCAL __thread #else #define BLIS_THREAD_LOCAL #endif // -- BLIS constructor/destructor function attribute -- // __attribute__((constructor/destructor)) is supported by GCC only. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support this, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__ICC) || defined(__INTEL_COMPILER) // ICC defines __GNUC__ but doesn't support this #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #elif defined(__clang__) // CLANG supports __attribute__, but its documentation doesn't // mention support for constructor/destructor. Compiling with // clang and testing shows that it does support. 
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. 
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. 
Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specified by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. #ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overridden by the main // program simply by providing a new definition. 
However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // -- Common BLIS definitions -- // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// NOTE: The macro tested here must be BLIS_BLAS_INT_TYPE_SIZE, which is the
// name given a default in bli_config_macro_defs.h and the name used to select
// f77_int. An identifier that is not a defined macro evaluates to 0 inside
// #if, so testing any other (undefined) name silently disables this guard.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
  #undef  BLIS_INT_TYPE_SIZE
  #define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested.
// Any value other than 32 or 64 falls back to the C "long int" types.
#if   BLIS_INT_TYPE_SIZE == 32
typedef           int32_t  gint_t;
typedef          uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
typedef           int64_t  gint_t;
typedef          uint64_t guint_t;
#else
typedef   signed long int  gint_t;
typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h. The #ifndef guards leave any pre-existing
// TRUE/FALSE definitions untouched.
#ifndef TRUE
  #define TRUE  true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
#define BLIS_SIZEOF_S 4 // sizeof(float) #define BLIS_SIZEOF_D 8 // sizeof(double) #define BLIS_SIZEOF_C 8 // sizeof(scomplex) #define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) // -- Complex types -- #ifdef BLIS_ENABLE_C99_COMPLEX #if __STDC_VERSION__ >= 199901L #include // skipped // Typedef official complex types to BLIS complex type names. typedef float complex scomplex; typedef double complex dcomplex; #else #error "Configuration requested C99 complex types, but C99 does not appear to be supported." #endif #else // ifndef BLIS_ENABLE_C99_COMPLEX // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_SCOMPLEX #define _DEFINED_SCOMPLEX typedef struct scomplex { float real; float imag; } scomplex; #endif // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DCOMPLEX #define _DEFINED_DCOMPLEX typedef struct dcomplex { double real; double imag; } dcomplex; #endif #endif // BLIS_ENABLE_C99_COMPLEX // -- Atom type -- // Note: atom types are used to hold "bufferless" scalar object values. Note // that it needs to be as large as the largest possible scalar value we might // want to hold. Thus, for now, it is a dcomplex. typedef dcomplex atom_t; // -- Fortran-77 types -- // Note: These types are typically only used by BLAS compatibility layer, but // we must define them even when the compatibility layer isn't being built // because they also occur in bli_slamch() and bli_dlamch(). // Define f77_int depending on what size of integer was requested. #if BLIS_BLAS_INT_TYPE_SIZE == 32 typedef int32_t f77_int; #elif BLIS_BLAS_INT_TYPE_SIZE == 64 typedef int64_t f77_int; #else typedef long int f77_int; #endif typedef char f77_char; typedef float f77_float; typedef double f77_double; typedef scomplex f77_scomplex; typedef dcomplex f77_dcomplex; // -- Misc. 
function pointer types -- // Note: This type should be used in any situation where the address of a // *function* will be conveyed or stored prior to it being typecast back // to the correct function type. It does not need to be used when conveying // or storing the address of *data* (such as an array of float or double). //typedef void (*void_fp)( void ); typedef void* void_fp; // Typedef function pointer types for malloc() and free() substitutes. typedef void* (*malloc_ft)( size_t size ); typedef void (*free_ft) ( void* p ); // // -- BLIS info bit field offsets ---------------------------------------------- // // info #define BLIS_DATATYPE_SHIFT 0 #define BLIS_DOMAIN_SHIFT 0 #define BLIS_PRECISION_SHIFT 1 #define BLIS_CONJTRANS_SHIFT 3 #define BLIS_TRANS_SHIFT 3 #define BLIS_CONJ_SHIFT 4 #define BLIS_UPLO_SHIFT 5 #define BLIS_UPPER_SHIFT 5 #define BLIS_DIAG_SHIFT 6 #define BLIS_LOWER_SHIFT 7 #define BLIS_UNIT_DIAG_SHIFT 8 #define BLIS_INVERT_DIAG_SHIFT 9 #define BLIS_TARGET_DT_SHIFT 10 #define BLIS_TARGET_DOMAIN_SHIFT 10 #define BLIS_TARGET_PREC_SHIFT 11 #define BLIS_EXEC_DT_SHIFT 13 #define BLIS_EXEC_DOMAIN_SHIFT 13 #define BLIS_EXEC_PREC_SHIFT 14 #define BLIS_PACK_SCHEMA_SHIFT 16 #define BLIS_PACK_RC_SHIFT 16 #define BLIS_PACK_PANEL_SHIFT 17 #define BLIS_PACK_FORMAT_SHIFT 18 #define BLIS_PACK_SHIFT 22 #define BLIS_PACK_REV_IF_UPPER_SHIFT 23 #define BLIS_PACK_REV_IF_LOWER_SHIFT 24 #define BLIS_PACK_BUFFER_SHIFT 25 #define BLIS_STRUC_SHIFT 27 #define BLIS_COMP_DT_SHIFT 29 #define BLIS_COMP_DOMAIN_SHIFT 29 #define BLIS_COMP_PREC_SHIFT 30 // info2 #define BLIS_SCALAR_DT_SHIFT 0 #define BLIS_SCALAR_DOMAIN_SHIFT 0 #define BLIS_SCALAR_PREC_SHIFT 1 // // -- BLIS info bit field masks ------------------------------------------------ // // info #define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT ) #define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT ) #define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT ) #define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT ) 
#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define 
BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 
0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = 
BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } 
machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. #define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, 
BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, 
BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // - keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
} bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. // Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. 
// begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. 
mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. //num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. 
// Holds one value in each of the five representations used for global scalar
// constants: float, double, scomplex, dcomplex, and gint_t.
typedef struct constdata_s
{
    float    s;
    double   d;
    scomplex c;
    dcomplex z;
    gint_t   i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the user-customizable packing function stored in
// obj_t.pack_fn.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of the user-customizable kernel function stored in
// obj_t.ker_fn.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// NOTE: the field list of obj_t is mirrored by BLIS_OBJECT_INITIALIZER,
// BLIS_OBJECT_INITIALIZER_1X1, bli_obj_init_const(), and the inline copy
// functions bli_obj_init_full_shallow_copy_of() / bli_obj_init_subpart_from()
// elsewhere in this header; update them together.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t         off[2];
    dim_t         dim[2];
    doff_t        diag_off;

    objbits_t     info;
    objbits_t     info2;
    siz_t         elem_size;

    void*         buffer;
    inc_t         rs;
    inc_t         cs;
    inc_t         is;

    // Bufferless scalar storage
    atom_t        scalar;

    // Pack-related fields
    dim_t         m_padded; // m dimension of matrix, including any padding
    dim_t         n_padded; // n dimension of matrix, including any padding
    inc_t         ps;       // panel stride (distance to next panel)
    inc_t         pd;       // panel dimension (the "width" of a panel:
                            // usually MR or NR)
    dim_t         m_panel;  // m dimension of a "full" panel
    dim_t         n_panel;  // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void*         pack_params;
    obj_ker_fn_t  ker_fn;
    void*         ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Designated-initializer expression for an obj_t with dimensions 0 x 0.
#define BLIS_OBJECT_INITIALIZER \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 0, 0 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

// Same as BLIS_OBJECT_INITIALIZER except that the dimensions are 1 x 1.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 1, 1 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
// Private helper shared by the two initializers below: copies every obj_t
// field from a into b EXCEPT the dimensions (dim[0], dim[1]). Keeping the
// field list in one place means only this function needs to change when a
// field is added to obj_t.
BLIS_INLINE void bli_obj_copy_fields_except_dims( obj_t* a, obj_t* b )
{
    int i;

    // Basic fields.
    b->root      = a->root;

    for ( i = 0; i < 2; ++i )
        b->off[i] = a->off[i];

    b->diag_off  = a->diag_off;

    b->info      = a->info;
    b->info2     = a->info2;
    b->elem_size = a->elem_size;

    // Buffer address and strides.
    b->buffer    = a->buffer;
    b->rs        = a->rs;
    b->cs        = a->cs;
    b->is        = a->is;

    // Bufferless scalar storage.
    b->scalar    = a->scalar;

    // Pack-related fields.
    b->m_padded  = a->m_padded;
    b->n_padded  = a->n_padded;
    b->ps        = a->ps;
    b->pd        = a->pd;
    b->m_panel   = a->m_panel;
    b->n_panel   = a->n_panel;

    // User-customizable fields.
    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;
}

// Makes b a full shallow copy of a: every field, including both dimensions,
// is assigned from a. Pointer fields (root, buffer, *_params) are copied as
// pointers; nothing they point to is duplicated.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    bli_obj_copy_fields_except_dims( a, b );

    b->dim[0] = a->dim[0];
    b->dim[1] = a->dim[1];
}

// Initializes b as a subpartition view of a: identical to the full shallow
// copy except that b->dim[0] and b->dim[1] are deliberately left untouched,
// since they will be overwritten afterwards.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    bli_obj_copy_fields_except_dims( a, b );
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// Each macro below comes as a pair: NAME_() performs the actual ## token
// paste, and NAME() forwards to it so that arguments which are themselves
// macros are expanded before being pasted.

// PASTEMAC*: build an identifier of the form bli_<ch...><op>.
#define PASTEMAC0_(op)                                  bli_ ## op
#define PASTEMAC0(op)                                   PASTEMAC0_(op)

#define PASTEMAC_(ch,op)                                bli_ ## ch  ## op
#define PASTEMAC(ch,op)                                 PASTEMAC_(ch,op)

#define PASTEMAC2_(ch1,ch2,op)                          bli_ ## ch1 ## ch2 ## op
#define PASTEMAC2(ch1,ch2,op)                           PASTEMAC2_(ch1,ch2,op)

#define PASTEMAC3_(ch1,ch2,ch3,op)                      bli_ ## ch1 ## ch2 ## ch3 ## op
#define PASTEMAC3(ch1,ch2,ch3,op)                       PASTEMAC3_(ch1,ch2,ch3,op)

#define PASTEMAC4_(ch1,ch2,ch3,ch4,op)                  bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
#define PASTEMAC4(ch1,ch2,ch3,ch4,op)                   PASTEMAC4_(ch1,ch2,ch3,ch4,op)

#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)              bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op)               PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)

#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)          bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op)           PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)

// PASTEBLACHK: build an identifier of the form bla_<op>_check.
#define PASTEBLACHK_(op)                                bla_ ## op ## _check
#define PASTEBLACHK(op)                                 PASTEBLACHK_(op)

// PASTECH*: like PASTEMAC* but without the bli_ prefix.
#define PASTECH0_(op)                                   op
#define PASTECH0(op)                                    PASTECH0_(op)

#define PASTECH_(ch,op)                                 ch ## op
#define PASTECH(ch,op)                                  PASTECH_(ch,op)

#define PASTECH2_(ch1,ch2,op)                           ch1 ## ch2 ## op
#define PASTECH2(ch1,ch2,op)                            PASTECH2_(ch1,ch2,op)

#define PASTECH3_(ch1,ch2,ch3,op)                       ch1 ## ch2 ## ch3 ## op
#define PASTECH3(ch1,ch2,ch3,op)                        PASTECH3_(ch1,ch2,ch3,op)

// MKSTR stringifies its argument as written; STRINGIFY_INT adds one level of
// expansion first, so a macro that expands to an integer is stringified by
// value rather than by name.
#define MKSTR(s1)                                       #s1
#define STRINGIFY_INT( s )                              MKSTR( s )

// Fortran-77 name-mangling macros.
// Each of these pastes its character arguments in front of name and appends
// a trailing underscore.
#define PASTEF770(name)                                      name ## _
#define PASTEF77(ch1,name)                     ch1        ## name ## _
#define PASTEF772(ch1,ch2,name)                ch1 ## ch2 ## name ## _
#define PASTEF773(ch1,ch2,ch3,name)     ch1 ## ch2 ## ch3 ## name ## _

// -- Include other groups of macros

// begin bli_genarray_macro_defs.h


#ifndef BLIS_GENARRAY_MACRO_DEFS_H
#define BLIS_GENARRAY_MACRO_DEFS_H


// -- Macros to generate function arrays ---------------------------------------

// In every array generated below the datatype order along each dimension is
// s, c, d, z.

// -- "Smart" one-operand macro --

// Defines a static array named <opname>_fpa of type tname, with each
// bli_<ch><opname> entry cast to tname.
#define GENARRAY_FPA(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname)  \
}

// -- "Smart" one-operand macro (with integer support) --

// As GENARRAY_FPA, with a fifth entry for the integer variant
// bli_i<opname>.
#define GENARRAY_FPA_I(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname), \
    ( tname )PASTEMAC(i,opname)  \
}

// -- "Smart" two-operand macro --

// Defines a static 2D array named <op>_fpa2 holding all sixteen
// bli_<ch1><ch2><op> combinations, each cast to tname.
#define GENARRAY_FPA2(tname,op) \
\
static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \
    { ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \
    { ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \
    { ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) }  \
}

// -- "Smart" two-operand macro --



// -- One-operand macro --

// The macros from here on emit only the declarator and initializer; the
// caller supplies the storage class and element type in front of the macro
// invocation.

#define GENARRAY(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op)  \
}

#define GENARRAY_I(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES+1] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op), \
    PASTEMAC(i,op)  \
}

// -- One-operand macro (with custom prefix) --

// Entries are named <prefix><ch><op> (no implicit bli_ prefix).
#define GENARRAY_PREF(arrayname,prefix,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTECH2(prefix,s,op), \
    PASTECH2(prefix,c,op), \
    PASTECH2(prefix,d,op), \
    PASTECH2(prefix,z,op)  \
}



// -- Two-operand macros --

// _ALL populates every datatype combination; _EXT populates only the
// combinations within one precision (s/c together, d/z together); _MIN
// populates only the homogeneous combinations. All other entries are NULL.

#define GENARRAY2_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
    { PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

#define GENARRAY2_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL,              NULL,             }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { NULL,              NULL,              PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

#define GENARRAY2_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), NULL,              NULL,              NULL,             }, \
    { NULL,              PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), NULL,             }, \
    { NULL,              NULL,              NULL,              PASTEMAC2(z,z,op) }  \
}


// -- Three-operand macros --

// Same _ALL / _EXT / _MIN scheme as the two-operand macros, extended to
// three datatype indices.

#define GENARRAY3_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \
    { PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \
    { PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \
    { PASTEMAC3(c,d,s,op), PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \
    { PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \
    { PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \
    { PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \
    { PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \
    { PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

#define GENARRAY3_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

#define GENARRAY3_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                PASTEMAC3(z,z,z,op) }  \
    } \
}


#endif
// end bli_genarray_macro_defs.h
// begin bli_gentdef_macro_defs.h


#ifndef BLIS_GENTDEF_MACRO_DEFS_H
#define BLIS_GENTDEF_MACRO_DEFS_H

//
// -- MACROS TO INSERT TYPEDEF-GENERATING MACROS -------------------------------
//

// The INSERT_GENTDEF* macros expand to a series of GENTDEF / GENTDEFR
// invocations; GENTDEF and GENTDEFR themselves must be defined by the
// including code before one of these is used.

// -- function typedef macro (both typed and void) --

// Invokes GENTDEF once per datatype with the concrete C type and the _ft
// suffix, once per datatype with void and the _vft suffix, and once with
// void and an empty type character.
#define INSERT_GENTDEF( opname ) \
\
GENTDEF( float,    s, opname, _ft ) \
GENTDEF( double,   d, opname, _ft ) \
GENTDEF( scomplex, c, opname, _ft ) \
GENTDEF( dcomplex, z, opname, _ft ) \
\
GENTDEF( void,     s, opname, _vft ) \
GENTDEF( void,     d, opname, _vft ) \
GENTDEF( void,     c, opname, _vft ) \
GENTDEF( void,     z, opname, _vft ) \
\
GENTDEF( void,      , opname, _vft )

// -- function typedef macro (both typed and void) with real projection --

// As INSERT_GENTDEF, but each invocation also passes the real-projection
// type and character (float/s for s and c, double/d for d and z).
#define INSERT_GENTDEFR( opname ) \
\
GENTDEFR( float,    float,    s, s, opname, _ft ) \
GENTDEFR( double,   double,   d, d, opname, _ft ) \
GENTDEFR( scomplex, float,    c, s, opname, _ft ) \
GENTDEFR( dcomplex, double,   z, d, opname, _ft ) \
\
GENTDEFR( void,     void,     s, s, opname, _vft ) \
GENTDEFR( void,     void,     d, d, opname, _vft ) \
GENTDEFR( void,     void,     c, s, opname, _vft ) \
GENTDEFR( void,     void,     z, d, opname, _vft ) \
\
GENTDEFR( void,     void,      ,  , opname, _vft )


#endif
// end bli_gentdef_macro_defs.h
// begin bli_gentfunc_macro_defs.h



#ifndef BLIS_GENTFUNC_MACRO_DEFS_H
#define BLIS_GENTFUNC_MACRO_DEFS_H

//
// -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------
//



// -- Macros for generating BLAS routines --------------------------------------


// -- Basic one-operand macro --
// Expands GENTFUNC once per floating-point datatype (s, d, c, z), forwarding
// the BLAS-side and BLIS-side names unchanged. GENTFUNC itself is not defined
// in this header section; it is expected to be supplied at the point of use.
#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \
    GENTFUNC( float, s, blasname, blisname ) \
    GENTFUNC( double, d, blasname, blisname ) \
    GENTFUNC( scomplex, c, blasname, blisname ) \
    GENTFUNC( dcomplex, z, blasname, blisname )


// -- Basic one-operand macro with real domain only --
// Only the float (s) and double (d) instances are generated.
#define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \
    GENTFUNCRO( float, s, blasname, blisname ) \
    GENTFUNCRO( double, d, blasname, blisname )


// -- Basic one-operand macro with complex domain only and real projection --
// Each complex type is paired with its real counterpart (c/s and z/d).
#define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \
    GENTFUNCCO( scomplex, float, c, s, blasname, blisname ) \
    GENTFUNCCO( dcomplex, double, z, d, blasname, blisname )


// -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) --
// The third argument (conjugation character) is intentionally empty and the
// conjugation value is always BLIS_NO_CONJUGATE.
#define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
    GENTFUNCDOT( float, s, , BLIS_NO_CONJUGATE, blasname, blisname ) \
    GENTFUNCDOT( double, d, , BLIS_NO_CONJUGATE, blasname, blisname )


// -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) --
// Two instances per complex type: 'c' with BLIS_CONJUGATE and 'u' with
// BLIS_NO_CONJUGATE.
#define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \
    GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \
    GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \
    GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE, blasname, blisname ) \
    GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname )


// -- Basic one-operand macro with conjugation (used only for dot, ger) --
// Union of the real-only and complex-only sets above, real instances first.
#define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \
    INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
    INSERT_GENTFUNCDOTC_BLAS( blasname, blisname )


// -- Basic one-operand macro with real projection --
// Real instances receive rblasname; complex instances receive cblasname.
#define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \
    GENTFUNCR( float, float, s, s, rblasname, blisname ) \
    GENTFUNCR( double, double, d, d, rblasname, blisname ) \
    GENTFUNCR( scomplex, float, c, s, cblasname, blisname ) \
    GENTFUNCR( dcomplex, double, z, d, cblasname, blisname )


// -- Alternate
// two-operand macro (one char for complex, one for real proj) --
// For the real instances the fourth argument (projection character) is empty.
#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
    GENTFUNCR2( float, float, s, , blasname, blisname ) \
    GENTFUNCR2( double, double, d, , blasname, blisname ) \
    GENTFUNCR2( scomplex, float, c, s, blasname, blisname ) \
    GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )


// -- Extended two-operand macro (used only for scal) --
// Four same-type instances (empty fourth argument) followed by the two
// complex-vector / real-scalar combinations (c with s, z with d).
#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
    GENTFUNCSCAL( float, float, s, , blasname, blisname ) \
    GENTFUNCSCAL( double, double, d, , blasname, blisname ) \
    GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \
    GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \
    GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \
    GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname )


// -- Macros for functions with one operand ------------------------------------

// -- Basic one-operand macro --
// Every variant below emits one GENTFUNC instance per datatype (s, d, c, z)
// and differs only in how many auxiliary arguments are forwarded.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
    GENTFUNC( float, s, tfuncname ) \
    GENTFUNC( double, d, tfuncname ) \
    GENTFUNC( scomplex, c, tfuncname ) \
    GENTFUNC( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
    GENTFUNC( float, s, tfuncname, varname ) \
    GENTFUNC( double, d, tfuncname, varname ) \
    GENTFUNC( scomplex, c, tfuncname, varname ) \
    GENTFUNC( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNC( float, s, tfuncname, varname1, varname2 ) \
    GENTFUNC( double, d, tfuncname, varname1, varname2 ) \
    GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
    GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( float, s, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( double, d, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand with real projection --
// Each instance carries the datatype and its real projection:
// s/s, d/d, c/s, z/d.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
    GENTFUNCR( float, float, s, s, tfuncname ) \
    GENTFUNCR( double, double, d, d, tfuncname ) \
    GENTFUNCR( scomplex, float, c, s, tfuncname ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
    GENTFUNCR( float, float, s, s, tfuncname, varname ) \
    GENTFUNCR( double, double, d, d, tfuncname, varname ) \
    GENTFUNCR( scomplex, float, c, s, tfuncname, varname ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2 ) \
    GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2 ) \
    GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --
#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
// arguments) --
// Real-projection variant forwarding four auxiliary arguments; instances are
// s/s, d/d, c/s, z/d.
#define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand macro with real domain only --
// Only the float (s) and double (d) instances are generated.
// The last instance deliberately has no trailing line-continuation, so the
// definition ends here no matter what line follows it.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \
    GENTFUNCRO( float, s, tfuncname ) \
    GENTFUNCRO( double, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \
    GENTFUNCRO( float, s, tfuncname, varname ) \
    GENTFUNCRO( double, d, tfuncname, varname )


// -- Basic one-operand macro with complex domain only and real projection --
// Only the complex instances are generated, each with its real projection
// (c/s and z/d).

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \
    GENTFUNCCO( scomplex, float, c, s, tfuncname ) \
    GENTFUNCCO( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \
    GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \
    GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
    GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --
#define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )


// -- Basic one-operand macro with integer instance --
// The four floating-point instances plus a fifth one for gint_t (i).

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \
    GENTFUNC( float, s, tfuncname ) \
    GENTFUNC( double, d, tfuncname ) \
    GENTFUNC( scomplex, c, tfuncname ) \
    GENTFUNC( dcomplex, z, tfuncname ) \
    GENTFUNC( gint_t, i, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \
    GENTFUNC( float, s, tfuncname, varname ) \
    GENTFUNC( double, d, tfuncname, varname ) \
    GENTFUNC( scomplex, c, tfuncname, varname ) \
    GENTFUNC( dcomplex, z, tfuncname, varname ) \
    GENTFUNC( gint_t, i, tfuncname, varname )


// -- Basic one-operand with integer projection --
// Every floating-point type is paired with gint_t (i).

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNCI_BASIC0( tfuncname ) \
    GENTFUNCI( float, gint_t, s, i, tfuncname ) \
    GENTFUNCI( double, gint_t, d, i, tfuncname ) \
    GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \
    GENTFUNCI( dcomplex, gint_t, z, i, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \
    GENTFUNCI( float, gint_t, s, i, tfuncname, varname ) \
    GENTFUNCI( double, gint_t, d, i, tfuncname, varname ) \
    GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \
    GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname )


// -- Basic one-operand with real and integer projections --
// Each instance carries the type, its real projection, and gint_t.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \
    GENTFUNCRI( float, float, gint_t, s, s, i, tfuncname ) \
    GENTFUNCRI( double, double, gint_t, d, d, i, tfuncname ) \
    GENTFUNCRI( scomplex, float, gint_t, c, s, i, tfuncname ) \
    GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname )


// -- Macros for functions with two primary operands ---------------------------

// -- Basic two-operand macro --
// Both operands share the same datatype.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2_BASIC0( tfuncname ) \
    GENTFUNC2( float, float, s, s, tfuncname ) \
    GENTFUNC2( double, double, d, d, tfuncname ) \
    GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \
    GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \
    GENTFUNC2( float, float, s, s, tfuncname, varname ) \
    GENTFUNC2( double, double, d, d, tfuncname, varname ) \
    GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \
    GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname )


// -- Mixed domain two-operand macro --
// Same precision, different domain: s<->c and d<->z.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \
    GENTFUNC2( float, scomplex, s, c, tfuncname ) \
    GENTFUNC2( scomplex, float, c, s, tfuncname ) \
    \
    GENTFUNC2( double, dcomplex, d, z, tfuncname ) \
    GENTFUNC2( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \
    GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \
    GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \
    \
    GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \
    GENTFUNC2( dcomplex, double, z, d, tfuncname, varname )


// -- Mixed precision two-operand macro --
// Every pairing whose operands differ in precision (eight combinations).
// The last instance has no trailing line-continuation, so the definition
// cannot absorb the line that follows.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \
    GENTFUNC2( float, double, s, d, tfuncname ) \
    GENTFUNC2( float, dcomplex, s, z, tfuncname ) \
    \
    GENTFUNC2( double, float, d, s, tfuncname ) \
    GENTFUNC2( double, scomplex, d, c, tfuncname ) \
    \
    GENTFUNC2( scomplex, double, c, d, tfuncname ) \
    GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
    \
    GENTFUNC2( dcomplex, float, z, s, tfuncname ) \
    GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \
    GENTFUNC2( float, double, s, d, tfuncname, varname ) \
    GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \
    \
    GENTFUNC2( double, float, d, s, tfuncname, varname ) \
    GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \
    \
    GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \
    GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
    \
    GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \
    GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro --
// All twelve pairings of two different datatypes.
// NOTE: the zero-argument form is spelled MIXDP0 while the one-argument form
// is spelled MIX_DP; both spellings are kept as-is for compatibility.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2_MIXDP0( tfuncname ) \
    GENTFUNC2( float, double, s, d, tfuncname ) \
    GENTFUNC2( float, scomplex, s, c, tfuncname ) \
    GENTFUNC2( float, dcomplex, s, z, tfuncname ) \
    \
    GENTFUNC2( double, float, d, s, tfuncname ) \
    GENTFUNC2( double, scomplex, d, c, tfuncname ) \
    GENTFUNC2( double, dcomplex, d, z, tfuncname ) \
    \
    GENTFUNC2( scomplex, float, c, s, tfuncname ) \
    GENTFUNC2( scomplex, double, c, d, tfuncname ) \
    GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
    \
    GENTFUNC2( dcomplex, float, z, s, tfuncname ) \
    GENTFUNC2( dcomplex, double, z, d, tfuncname ) \
    GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \
    GENTFUNC2( float, double, s, d, tfuncname, varname ) \
    GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \
    GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \
    \
    GENTFUNC2( double, float, d, s, tfuncname, varname ) \
    GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \
    GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \
    \
    GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \
    GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \
    GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
    \
    GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \
    GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \
    GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Basic two-operand with real projection of second operand --
// The third type/character of each instance is the real projection of the
// second operand's type.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \
    GENTFUNC2R( float, float, float, s, s, s, tfuncname ) \
    GENTFUNC2R( double, double, double, d, d, d, tfuncname ) \
    GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname ) \
    GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \
    GENTFUNC2R( float, float, float, s, s, s, tfuncname, varname ) \
    GENTFUNC2R( double, double, double, d, d, d, tfuncname, varname ) \
    GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
    GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )


// -- Mixed domain two-operand with real projection of second operand --

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \
    GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \
    GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \
    \
    GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \
    GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \
    GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
    GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
    \
    GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
    GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname )


// -- Mixed precision two-operand with real projection of second operand --

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \
    GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \
    GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \
    \
    GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \
    GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \
    \
    GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \
    GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
    \
    GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \
    GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \
    GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \
    GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \
    \
    GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \
    GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \
    \
    GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \
    GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
    \
    GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \
    GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro with real projection of second operand --
// All twelve pairings of two different datatypes. The last instance has no
// trailing line-continuation, so the definition cannot absorb the line that
// follows.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \
    GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \
    GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \
    GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \
    \
    GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \
    GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \
    GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \
    \
    GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \
    GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \
    GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
    \
    GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \
    GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) \
    GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \
    GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \
    GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
    GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \
    \
    GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \
    GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \
    GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
    \
    GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
    GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \
    GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
    \
    GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \
    GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) \
    GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname )


// -- Macros for functions with three primary operands -------------------------

// -- Basic three-operand macro --
// All three operands share the same datatype.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC3_BASIC0( tfuncname ) \
    GENTFUNC3( float, float, float, s, s, s, tfuncname ) \
    GENTFUNC3( double, double, double, d, d, d, tfuncname ) \
    GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \
    GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \
    GENTFUNC3( float, float, float, s, s, s, tfuncname, varname ) \
    GENTFUNC3( double, double, double, d, d, d, tfuncname, varname ) \
    GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \
    GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNC3( float, float, float, s, s, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double, double, double, d, d, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 )


// -- Mixed domain three-operand macro --

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \
    \
    GENTFUNC3( float, float, scomplex, s, s, c, tfuncname ) \
    GENTFUNC3( float, scomplex, float, s, c, s, tfuncname ) \
    GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname ) \
    \
    GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname ) \
    GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname ) \
    GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname ) \
    \
    GENTFUNC3( scomplex, float, float, c, s, s, tfuncname ) \
    GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname ) \
    GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname ) \
    \
    GENTFUNC3( dcomplex, double,
double, z, d, d, tfuncname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( 
dcomplex, double, dcomplex, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC3( scomplex, double, scomplex, c, 
d, c, tfuncname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname ) \ GENTFUNC3( float, 
dcomplex, scomplex, s, z, c, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, 
tfuncname, varname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, dcomplex, d, s, 
z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname1, varname2 ) \ \ 
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 ) // -- Basic three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, 
varname1, varname2 ) // -- Mixed domain three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, 
d, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \ GENTFUNC3U12( float, 
double, dcomplex, double, s, d, z, d, tfuncname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, 
dcomplex, scomplex, c, c, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, 
dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, 
tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, 
double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( 
scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, 
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 ) #endif // end bli_gentfunc_macro_defs.h // begin bli_gentprot_macro_defs.h #ifndef BLIS_GENTPROT_MACRO_DEFS_H #define BLIS_GENTPROT_MACRO_DEFS_H // // -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS ----------------------------- // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- #define INSERT_GENTPROT_BLAS( blasname ) \ \ GENTPROT( float, s, blasname ) \ GENTPROT( double, d, blasname ) \ GENTPROT( scomplex, c, blasname ) \ GENTPROT( dcomplex, z, blasname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTPROTRO_BLAS( blasname ) \ \ GENTPROTRO( float, s, blasname ) \ GENTPROTRO( double, d, blasname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTPROTCO_BLAS( blasname ) \ \ GENTPROTCO( scomplex, float, c, s, blasname ) \ GENTPROTCO( dcomplex, double, z, d, blasname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTR_BLAS( blasname ) \ \ GENTPROTDOT( float, s, , blasname ) \ GENTPROTDOT( double, d, , blasname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTC_BLAS( blasname ) \ \ GENTPROTDOT( scomplex, c, c, blasname ) \ GENTPROTDOT( scomplex, c, u, blasname ) \ GENTPROTDOT( dcomplex, z, c, blasname ) \ GENTPROTDOT( dcomplex, z, u, blasname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTPROTDOT_BLAS( blasname ) \ \ INSERT_GENTPROTDOTR_BLAS( blasname ) \ INSERT_GENTPROTDOTC_BLAS( blasname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTPROTR_BLAS( rblasname, 
cblasname ) \ \ GENTPROTR( float, float, s, s, rblasname ) \ GENTPROTR( double, double, d, d, rblasname ) \ GENTPROTR( scomplex, float, c, s, cblasname ) \ GENTPROTR( dcomplex, double, z, d, cblasname ) // -- Alternate two-operand macro (one char for complex, one for real proj) -- #define INSERT_GENTPROTR2_BLAS( blasname ) \ \ GENTPROTR2( float, float, , s, blasname ) \ GENTPROTR2( double, double, , d, blasname ) \ GENTPROTR2( scomplex, float, c, s, blasname ) \ GENTPROTR2( dcomplex, double, z, d, blasname ) // -- Extended two-operand macro (used only for scal) -- #define INSERT_GENTPROTSCAL_BLAS( blasname ) \ \ GENTPROTSCAL( float, float, , s, blasname ) \ GENTPROTSCAL( double, double, , d, blasname ) \ GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \ GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \ GENTPROTSCAL( float, scomplex, s, c, blasname ) \ GENTPROTSCAL( double, dcomplex, d, z, blasname ) // -- Macros for functions with one operand ------------------------------------ // -- Basic one-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0( tfuncname ) \ \ GENTPROT( float, s, tfuncname ) \ GENTPROT( double, d, tfuncname ) \ GENTPROT( scomplex, c, tfuncname ) \ GENTPROT( dcomplex, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2 ) \ GENTPROT( double, d, tfuncname, varname1, varname2 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROT( float, s, tfuncname, 
varname1, varname2, varname3 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand with real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC0( tfuncname ) \ \ GENTPROTR( float, float, s, s, tfuncname ) \ GENTPROTR( double, double, d, d, tfuncname ) \ GENTPROTR( scomplex, float, c, s, tfuncname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname ) \ GENTPROTR( double, double, d, d, tfuncname, varname ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 
) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC0( tfuncname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0_I( funcname ) \ \ GENTPROT( float, s, funcname ) \ GENTPROT( double, d, funcname ) \ GENTPROT( scomplex, c, funcname ) \ GENTPROT( dcomplex, z, funcname ) \ GENTPROT( gint_t, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) \ GENTPROT( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTPROTI_BASIC0( funcname ) \ \ GENTPROTI( float, gint_t, s, i, funcname ) \ GENTPROTI( double, gint_t, d, i, funcname ) \ GENTPROTI( scomplex, gint_t, c, i, funcname ) \ GENTPROTI( dcomplex, gint_t, z, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \ \ GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \ GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \ GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTRI_BASIC( funcname ) \ \ GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \ GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \ GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \ GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_BASIC0( funcname ) \ \ GENTPROT2( float, float, s, s, funcname ) \ GENTPROT2( double, double, d, d, funcname ) \ GENTPROT2( scomplex, scomplex, c, c, funcname ) \ GENTPROT2( dcomplex, dcomplex, z, z, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \ \ GENTPROT2( float, float, s, s, tfuncname, varname ) \ GENTPROT2( double, double, d, d, tfuncname, varname ) \ GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_D0( funcname ) \ \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( scomplex, float, c, s, funcname ) \ \ GENTPROT2( double, dcomplex, d, z, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_D( 
tfuncname, varname ) \ \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) // -- Mixed precision two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_P0( funcname ) \ \ GENTPROT2( float, double, s, d, funcname ) \ GENTPROT2( float, dcomplex, s, z, funcname ) \ \ GENTPROT2( double, float, d, s, funcname ) \ GENTPROT2( double, scomplex, d, c, funcname ) \ \ GENTPROT2( scomplex, double, c, d, funcname ) \ GENTPROT2( scomplex, dcomplex, c, z, funcname ) \ \ GENTPROT2( dcomplex, float, z, s, funcname ) \ GENTPROT2( dcomplex, scomplex, z, c, funcname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) \ // -- Mixed domain/precision (all) two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIXDP0( funcname ) \ \ GENTPROT2( float, double, s, d, funcname ) \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( float, dcomplex, s, z, funcname ) \ \ GENTPROT2( double, float, d, s, funcname ) \ GENTPROT2( double, scomplex, d, c, funcname ) \ GENTPROT2( double, dcomplex, d, z, funcname ) \ \ GENTPROT2( scomplex, float, c, s, funcname ) \ GENTPROT2( scomplex, double, c, d, funcname ) \ GENTPROT2( scomplex, dcomplex, c, z, funcname ) \ \ GENTPROT2( dcomplex, float, z, s, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) \ GENTPROT2( 
dcomplex, scomplex, z, c, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_BASIC0( funcname ) \ \ GENTPROT2R( float, float, float, s, s, s, funcname ) \ GENTPROT2R( double, double, double, d, d, d, funcname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \ \ GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) 
-- #define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname ) // -- Mixed precision two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \ \ GENTPROT2R( float, double, float, s, d, s, tfuncname ) \ GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \ \ GENTPROT2R( double, float, double, d, s, d, tfuncname ) \ GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \ \ GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \ GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \ \ GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \ GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \ \ GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \ GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \ \ GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \ GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \ \ GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \ GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ \ GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \ GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) // -- Macros for functions with three primary operands ------------------------- // -- Basic three-operand macro -- #define INSERT_GENTPROT3_BASIC( funcname ) \ \ GENTPROT3( float, float, float, s, s, s, funcname ) \ GENTPROT3( double, double, double, d, d, d, funcname ) \ GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \ GENTPROT3( dcomplex, dcomplex, 
dcomplex, z, z, z, funcname ) // -- Mixed domain three-operand macro -- #define INSERT_GENTPROT3_MIX_D( funcname ) \ \ GENTPROT3( float, float, scomplex, s, s, c, funcname ) \ GENTPROT3( float, scomplex, float, s, c, s, funcname ) \ GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \ \ GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \ GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \ GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \ \ GENTPROT3( scomplex, float, float, c, s, s, funcname ) \ GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \ GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \ \ GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \ GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \ GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname ) // -- Mixed precision three-operand macro -- #define INSERT_GENTPROT3_MIX_P( funcname ) \ \ GENTPROT3( float, float, double, s, s, d, funcname ) \ GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \ \ GENTPROT3( float, double, float, s, d, s, funcname ) \ GENTPROT3( float, double, double, s, d, d, funcname ) \ GENTPROT3( float, double, scomplex, s, d, c, funcname ) \ GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \ \ GENTPROT3( float, scomplex, double, s, c, d, funcname ) \ GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \ \ GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \ GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \ GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \ GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \ \ \ GENTPROT3( double, float, float, d, s, s, funcname ) \ GENTPROT3( double, float, double, d, s, d, funcname ) \ GENTPROT3( double, float, scomplex, d, s, c, funcname ) \ GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \ \ GENTPROT3( double, double, float, d, d, s, funcname ) \ GENTPROT3( double, double, scomplex, d, d, c, funcname ) \ \ GENTPROT3( double, 
scomplex, float, d, c, s, funcname ) \ GENTPROT3( double, scomplex, double, d, c, d, funcname ) \ GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \ GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \ \ GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \ GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \ \ \ GENTPROT3( scomplex, float, double, c, s, d, funcname ) \ GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \ \ GENTPROT3( scomplex, double, float, c, d, s, funcname ) \ GENTPROT3( scomplex, double, double, c, d, d, funcname ) \ GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \ GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \ \ GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \ GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \ \ GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \ GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \ GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \ GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \ \ \ GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \ GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \ GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \ GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \ \ GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \ GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \ \ GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \ GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \ GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \ GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \ \ GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \ GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname ) \ // -- Basic three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_BASIC( funcname ) \ \ GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \ 
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \ GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname ) // -- Mixed domain three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_D( funcname ) \ \ GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \ GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \ GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \ \ GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \ GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \ GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \ \ GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \ GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \ \ GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \ GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname ) // -- Mixed precision three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_P( funcname ) \ \ GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \ GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \ \ GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \ GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \ GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \ GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \ \ GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \ GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \ \ GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, 
z, funcname ) \ GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \ GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \ GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \ \ \ GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \ GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \ GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \ GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \ \ GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \ GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \ \ GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \ GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \ GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \ GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \ \ GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \ GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \ \ \ GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \ GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \ \ GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \ GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \ GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \ GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \ \ GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \ \ GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \ GENTPROT3U12( 
scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \ \ \ GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \ GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \ GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \ GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \ GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \ \ GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname ) #endif // end bli_gentprot_macro_defs.h // begin bli_misc_macro_defs.h #ifndef BLIS_MISC_MACRO_DEFS_H #define BLIS_MISC_MACRO_DEFS_H // -- Miscellaneous macros -- // min, max, abs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) #define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) #define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) ) // fmin, fmax, fabs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_fmin( a, b ) bli_min( a, b ) #define bli_fmax( a, b ) bli_max( a, b ) #define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) ) // fminabs, fmaxabs // NOTE: These must remain macros since we don't know the types of a and b. 
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif // end bli_edge_case_macro_defs.h // begin bli_param_macro_defs.h #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ); } // pruning-related BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the left side of the matrix, // ignore the area above that intersection. if ( *diagoff < 0 ) { *m = *m + *diagoff; *offm_inc = - *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the bottom side of the matrix, // ignore the area to the right of that intersection. if ( *n > *diagoff + *m ) { *n = *diagoff + *m; } } BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the top side of the matrix, // ignore the area to the left of that intersection. if ( *diagoff > 0 ) { *n = *n - *diagoff; *offn_inc = + *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the right side of the matrix, // ignore the area below that intersection. 
if ( *m > -(*diagoff) + *n ) { *m = -(*diagoff) + *n; } } // thread range-related BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { *diagoff = *n - *diagoff - *m; bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { bli_swap_dims( m, n ); bli_negate_diag_offset( diagoff ); bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end ) { dim_t start2 = n - *start; dim_t end2 = n - *end; *start = end2; *end = start2; } // mdim_t-related BLIS_INLINE bool bli_is_m_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_M ); } BLIS_INLINE bool bli_is_n_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_N ); } BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim ) { return ( mdim_t ) ( mdim == BLIS_M ? BLIS_N : BLIS_M ); } BLIS_INLINE void bli_toggle_dim( mdim_t* mdim ) { *mdim = bli_dim_toggled( *mdim ); } // stor3_t-related BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b ) { // If any matrix is general-stored, return the stor3_t id for the // general-purpose sup microkernel. if ( bli_is_gen_stored( rs_c, cs_c ) || bli_is_gen_stored( rs_a, cs_a ) || bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX; // Otherwise, compute and return the stor3_t id as follows. 
const bool c_is_col = bli_is_col_stored( rs_c, cs_c ); const bool a_is_col = bli_is_col_stored( rs_a, cs_a ); const bool b_is_col = bli_is_col_stored( rs_b, cs_b ); return ( stor3_t )( 4 * c_is_col + 2 * a_is_col + 1 * b_is_col ); } BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id ) { #if 1 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )7, // BLIS_RRR = 0 -> BLIS_CCC = 7 ( stor3_t )5, // BLIS_RRC = 1 -> BLIS_CRC = 5 ( stor3_t )6, // BLIS_RCR = 2 -> BLIS_CCR = 6 ( stor3_t )4, // BLIS_RCC = 3 -> BLIS_CRR = 4 ( stor3_t )3, // BLIS_CRR = 4 -> BLIS_RCC = 3 ( stor3_t )1, // BLIS_CRC = 5 -> BLIS_RRC = 1 ( stor3_t )2, // BLIS_CCR = 6 -> BLIS_RCR = 2 ( stor3_t )0, // BLIS_CCC = 7 -> BLIS_RRR = 0 }; return map[id]; #else return ( ( id & 0x4 ) ^ 0x4 ) | // flip c bit ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 ); // flip a bit and move to b position #endif } BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )1, // BLIS_RRR = 0 -> BLIS_RRC = 1 ( stor3_t )0, // BLIS_RRC = 1 -> BLIS_RRR = 0 ( stor3_t )3, // BLIS_RCR = 2 -> BLIS_RCC = 3 ( stor3_t )2, // BLIS_RCC = 3 -> BLIS_RCR = 2 ( stor3_t )5, // BLIS_CRR = 4 -> BLIS_CRC = 5 ( stor3_t )4, // BLIS_CRC = 5 -> BLIS_CRR = 4 ( stor3_t )7, // BLIS_CCR = 6 -> BLIS_CCC = 7 ( stor3_t )6, // BLIS_CCC = 7 -> BLIS_CCR = 6 }; return map[id]; #else return ( stor3_t )( id ^ 0x1 ); #endif } BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )2, // BLIS_RRR = 0 -> BLIS_RCR = 2 ( stor3_t )3, // BLIS_RRC = 1 -> BLIS_RCC = 3 ( stor3_t )0, // BLIS_RCR = 2 -> BLIS_RRR = 0 ( stor3_t )1, // BLIS_RCC = 3 -> BLIS_RRC = 1 ( stor3_t )6, // BLIS_CRR = 4 -> BLIS_CCR = 6 ( stor3_t )7, // BLIS_CRC = 5 -> BLIS_CCC = 7 ( stor3_t )4, // BLIS_CCR = 6 -> BLIS_CRR = 4 ( stor3_t )5, // BLIS_CCC = 7 -> BLIS_CRC = 5 }; return map[id]; #else return ( stor3_t )( id ^ 0x2 ); #endif } // 
index-related BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == n_iter - 1 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != n_iter - 1 || n_left == 0 ); } BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == 0 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != 0 || n_left == 0 ); } BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 ); } BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) ); } BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { #ifdef BLIS_ENABLE_JRIR_SLAB return bli_is_last_iter_sl( i, end_iter, tid, nth ); #else // BLIS_ENABLE_JRIR_RR return bli_is_last_iter_rr( i, end_iter, tid, nth ); #endif } // packbuf_t-related BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type ) { return ( guint_t ) ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT ); } // pack_t-related BLIS_INLINE bool bli_is_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_is_row_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_is_col_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_is_panel_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE bool bli_is_1r_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R ); } BLIS_INLINE bool bli_is_1e_packed( pack_t schema ) { return ( bool ) ( ( schema & 
BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E ); } BLIS_INLINE bool bli_is_1m_packed( pack_t schema ) { return ( bool ) ( bli_is_1r_packed( schema ) || bli_is_1e_packed( schema ) ); } BLIS_INLINE bool bli_is_nat_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 ); } BLIS_INLINE bool bli_is_ind_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 ); } BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema ) { return ( guint_t ) ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT ); } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument. BLIS_INLINE void bli_set_dims_incs_uplo_1m ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument (without column-wise stride optimization). BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. 
if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions and increments for TWO matrix arguments. 
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); } BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); } BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); } // NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, // packbuf_t is stored/used from the context in order to support various // induced methods. (Though ideally the packbuf_t field would only be // present in the control tree). BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); } BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc ); } BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj ) { bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); } BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj ) { bli_obj_apply_conj( BLIS_CONJUGATE, obj ); } BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; } // Root matrix query BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) { return ( obj_t* )( obj->root ); } BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( 
bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); } // Root matrix modification BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) { obj->root = obj; } // Diagonal offset query BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) ); } // Diagonal offset modification BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off = ( doff_t )offset; } BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj ) { obj->diag_off = -(obj->diag_off); } BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off += ( doff_t )offset; } // Dimension query BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) { return ( obj->dim[ mdim ] ); } BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) : bli_obj_length( obj ) ); } BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) ); } BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 ); } // Stride/increment query BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) { return ( obj->rs ); } BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) { return ( obj->cs ); } BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) { return ( obj->is ); } BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); } // Note: The purpose of these functions is to obtain the length and width // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) ); } BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 ); } // Dimension modification BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj ) { obj->dim[ BLIS_M ] = m; } BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj ) { obj->dim[ BLIS_N ] = n; } BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) { obj->dim[ mdim ] = dim_val; } BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) { if ( bli_does_notrans( trans ) ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } else // if ( bli_does_trans( trans ) ) { bli_obj_set_length( n, obj ); bli_obj_set_width( m, obj ); } } // Stride/increment predicates // // NOTE: The following two macros differ from their non-obj counterparts // in that they do not identify m x 1 and 1 x n objects as row-stored and // column-stored, respectively, which is needed when considering packed // objects. But this is okay, since none of the invocations of these // "obj" macros are used on packed matrices. 
// BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); } // Stride/increment modification BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) { obj->rs = rs; } BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) { obj->cs = cs; } BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_row_stride( rs, obj ); bli_obj_set_col_stride( cs, obj ); } BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) { obj->is = is; } // Offset query BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) { return ( obj->off[ mdim ] ); } // Offset modification BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] = offset; } BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_set_off( BLIS_M, offm, obj ); bli_obj_set_off( BLIS_N, offn, obj ); } BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] += offset; } BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_inc_off( BLIS_M, offm, obj ); bli_obj_inc_off( BLIS_N, offn, obj ); } // Diagonal offset predicates BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) { return ( 
bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); } // Buffer address query BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) { return ( void* ) ( obj->buffer ); } // Buffer address modification BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) { obj->buffer = p; } // Bufferless scalar field query BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); } // Bufferless scalar field modification BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); } // Element size modification BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) { obj->elem_size = size; } // Packed matrix info query BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) { return ( obj->m_padded ); } BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) { return ( obj->n_padded ); } // Packed matrix info modification BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj ) { obj->m_padded = m; } BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj ) { obj->n_padded = n; } BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) { 
bli_obj_set_padded_length( m, obj ); bli_obj_set_padded_width( n, obj ); } // Packed panel info query BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) { return ( obj->m_panel ); } BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) { return ( obj->n_panel ); } BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) { return ( obj->pd ); } BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) { return ( obj->ps ); } // Packed panel info modification BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj ) { obj->m_panel = m; } BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj ) { obj->n_panel = n; } BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_panel_length( m, obj ); bli_obj_set_panel_width( n, obj ); } BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) { obj->pd = pd; } BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) { obj->ps = ps; } // stor3_t-related BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); inc_t rs_a, cs_a; inc_t rs_b, cs_b; if ( bli_obj_has_notrans( a ) ) { rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else { rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else { rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b ); } // -- User-provided information macros -- // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { return obj->pack_fn; } BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker_fn; } BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { return obj->ker_params; } // Function pointer modification 
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) { const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); bli_obj_set_pack_schema( schema_b, a ); bli_obj_set_pack_schema( schema_a, b ); } // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. BLIS_INLINE void bli_obj_induce_trans( obj_t* obj ) { // Induce transposition among basic fields. dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); if ( bli_obj_is_upper_or_lower( obj ) ) bli_obj_toggle_uplo( obj ); // Induce transposition among packed fields. dim_t m_padded = bli_obj_padded_length( obj ); dim_t n_padded = bli_obj_padded_width( obj ); dim_t m_panel = bli_obj_panel_length( obj ); dim_t n_panel = bli_obj_panel_width( obj ); bli_obj_set_padded_dims( n_padded, m_padded, obj ); bli_obj_set_panel_dims( n_panel, m_panel, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj ) { // NOTE: This function is only used in situations where the matrices // are guaranteed to not have structure or be packed. // Induce transposition among basic fields. 
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif // end bli_obj_macro_defs.h // begin bli_complex_macro_defs.h #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define 
bli_zimag( x ) ( cimag(x) ) #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_complex_macro_defs.h // begin bli_scalar_macro_defs.h #ifndef BLIS_SCALAR_MACRO_DEFS_H #define BLIS_SCALAR_MACRO_DEFS_H // -- Assignment/Accessor macros -- // NOTE: This macro is defined first since some of the other scalar macros // use it to abstract away the method used to assign complex values (ie: // whether fields of a struct are set directly or whether native C99 // assignment is used). // begin bli_sets.h #ifndef BLIS_SETS_H #define BLIS_SETS_H // sets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sssets( xr, xi, y ) { (y) = (xr); } #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } #define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } #define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, 
xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif // end bli_sets.h // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. // begin bli_setrs.h #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// - setrs overwrites only the real component of y with xr. For a real y
//   that is the whole value.

#define bli_sssetrs( xr, y )  { (y) = (xr); }
#define bli_dssetrs( xr, y )  { (y) = (xr); }

#define bli_sdsetrs( xr, y )  { (y) = (xr); }
#define bli_ddsetrs( xr, y )  { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct-based complex: assign straight to the real member.
#define bli_scsetrs( xr, y )  { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y )  { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y )  { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y )  { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Native C99 complex: rebuild y from the new real part and the current
// imaginary part of y.
#define bli_scsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter forms: the letter names the type of y.
#define bli_ssetrs( xr, y )  bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y )  bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y )  bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y )  bli_dzsetrs( xr, y )

#endif
// end bli_setrs.h
// begin bli_setis.h


#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - setis overwrites only the imaginary component of y with xi. For a real
//   y there is nothing to store, so those variants expand to an empty
//   statement block.

#define bli_sssetis( xi, y )  { ; }
#define bli_dssetis( xi, y )  { ; }

#define bli_sdsetis( xi, y )  { ; }
#define bli_ddsetis( xi, y )  { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct-based complex: assign straight to the imaginary member.
#define bli_scsetis( xi, y )  { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y )  { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y )  { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y )  { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Native C99 complex: rebuild y from the current real part of y and the new
// imaginary part.
#define bli_scsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter forms: the letter names the type of y.
#define bli_ssetis( xi, y )  bli_sssetis( xi, y )
#define bli_dsetis( xi, y )  bli_ddsetis( xi, y )
#define bli_csetis( xi, y )  bli_scsetis( xi, y )
#define bli_zsetis( xi, y )  bli_dzsetis( xi, y )

#endif
// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h


#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_ssgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; } #define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; } #define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; } #define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; } #define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; } #define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi ) #define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi ) #define bli_cgets( x, yr, yi ) 
bli_csgets( x, yr, yi ) #define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi ) #define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi ) #endif // end bli_gets.h // -- Scalar constant initialization macros -- // begin bli_constants.h #ifndef BLIS_CONSTANTS_H #define BLIS_CONSTANTS_H // return pointers to constants // 1 #define bli_s1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ONE ) ) #define bli_d1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ONE ) ) #define bli_c1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) ) #define bli_z1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) ) #define bli_i1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ONE ) ) // 0 #define bli_s0 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ZERO ) ) #define bli_d0 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ZERO ) ) #define bli_c0 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) ) #define bli_z0 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) ) #define bli_i0 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ZERO ) ) // -1 #define bli_sm1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_MINUS_ONE ) ) #define bli_dm1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_MINUS_ONE ) ) #define bli_cm1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_zm1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_im1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_MINUS_ONE ) ) #endif // end bli_constants.h // -- Separated scalar macros (separated real/imaginary values) -- // begin bli_absq2ris.h #ifndef BLIS_ABSQ2RIS_H #define BLIS_ABSQ2RIS_H // absq2ris #define bli_sabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define bli_dabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define 
bli_cabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0F; \ } #define bli_zabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0; \ } #endif // end bli_absq2ris.h // begin bli_abval2ris.h #ifndef BLIS_ABVAL2RIS_H #define BLIS_ABVAL2RIS_H // abval2ris #define bli_sabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabsf(xr); \ } #define bli_dabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabs(xr); \ } #define bli_cabval2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0F; \ } #define bli_zabval2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0; \ } #endif // end bli_abval2ris.h // begin bli_addris.h #ifndef BLIS_ADDRIS_H #define BLIS_ADDRIS_H // addris #define bli_saddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_daddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_caddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #define bli_zaddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #endif // end bli_addris.h // begin bli_addjris.h #ifndef BLIS_ADDJRIS_H #define BLIS_ADDJRIS_H // addjris #define bli_saddjris( ar, ai, xr, xi ) bli_saddris( (ar), -(ai), (xr), (xi) ) #define bli_daddjris( ar, ai, xr, xi ) bli_daddris( (ar), -(ai), (xr), (xi) ) #define bli_caddjris( ar, ai, xr, xi ) bli_caddris( (ar), -(ai), (xr), (xi) ) #define bli_zaddjris( ar, ai, xr, xi ) bli_zaddris( (ar), -(ai), (xr), (xi) ) #endif // end bli_addjris.h // begin bli_add3ris.h #ifndef BLIS_ADD3RIS_H #define BLIS_ADD3RIS_H // add3ris #define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_dadd3ris( ar, 
ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #endif // end bli_add3ris.h // begin bli_axpbyris.h #ifndef BLIS_AXPBYRIS_H #define BLIS_AXPBYRIS_H // axpbyris #define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyris bli_rxxpbyris #define bli_dsssxpbyris bli_rxxpbyris #define bli_csssxpbyris bli_rxxpbyris #define bli_zsssxpbyris bli_rxxpbyris #define bli_sdssxpbyris bli_rxxpbyris #define bli_ddssxpbyris bli_rxxpbyris #define bli_cdssxpbyris bli_rxxpbyris #define bli_zdssxpbyris bli_rxxpbyris #define bli_scssxpbyris bli_rxxpbyris #define bli_dcssxpbyris bli_rxxpbyris #define bli_ccssxpbyris bli_rxxpbyris #define bli_zcssxpbyris bli_rxxpbyris #define bli_szssxpbyris bli_rxxpbyris #define bli_dzssxpbyris bli_rxxpbyris #define bli_czssxpbyris bli_rxxpbyris #define bli_zzssxpbyris bli_rxxpbyris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyris. 
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
// Single-character (same-datatype) names for axpbyjris.
#define bli_saxpbyjris bli_ssssaxpbyjris
#define bli_daxpbyjris bli_ddddaxpbyjris
#define bli_caxpbyjris bli_ccccaxpbyjris
#define bli_zaxpbyjris bli_zzzzaxpbyjris

#endif
// end bli_axpbyjris.h
// begin bli_axpyris.h

#ifndef BLIS_AXPYRIS_H
#define BLIS_AXPYRIS_H

// axpyris
//
// y += a * x, with every operand passed as separate real (r) and imaginary
// (i) parts. Five kernels cover the combinations used by the tables below:
//   rx - uses only ar and xr, updates only yr
//   cx - full complex product, updates yr and yi
//   ro - real part of the complex product only, updates only yr
//   cr - a treated as real (ai unused), x complex, updates yr and yi
//   rc - x treated as real (xi unused), a complex, updates yr and yi

#define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}

#define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
    (yi) += (ai) * (xr) + (ar) * (xi); \
}

#define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
}

#define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * (xi); \
}

#define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyris bli_rxaxpyris
#define bli_dssaxpyris bli_rxaxpyris
#define bli_cssaxpyris bli_rxaxpyris
#define bli_zssaxpyris bli_rxaxpyris

#define bli_sdsaxpyris bli_rxaxpyris
#define bli_ddsaxpyris bli_rxaxpyris
#define bli_cdsaxpyris bli_rxaxpyris
#define bli_zdsaxpyris bli_rxaxpyris

#define bli_scsaxpyris bli_rxaxpyris
#define bli_dcsaxpyris bli_rxaxpyris
#define bli_ccsaxpyris bli_roaxpyris
#define bli_zcsaxpyris bli_roaxpyris

#define bli_szsaxpyris bli_rxaxpyris
#define bli_dzsaxpyris bli_rxaxpyris
#define bli_czsaxpyris bli_roaxpyris
#define bli_zzsaxpyris bli_roaxpyris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyris bli_rxaxpyris
#define bli_dsdaxpyris bli_rxaxpyris
#define bli_csdaxpyris bli_rxaxpyris
#define bli_zsdaxpyris bli_rxaxpyris

#define bli_sddaxpyris bli_rxaxpyris
#define bli_dddaxpyris bli_rxaxpyris
#define bli_cddaxpyris bli_rxaxpyris
#define bli_zddaxpyris bli_rxaxpyris

#define bli_scdaxpyris bli_rxaxpyris
#define bli_dcdaxpyris bli_rxaxpyris
#define bli_ccdaxpyris bli_roaxpyris
#define bli_zcdaxpyris bli_roaxpyris

#define bli_szdaxpyris bli_rxaxpyris
#define bli_dzdaxpyris bli_rxaxpyris
#define bli_czdaxpyris bli_roaxpyris
#define bli_zzdaxpyris bli_roaxpyris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyris bli_rxaxpyris
#define bli_dscaxpyris bli_rxaxpyris
#define bli_cscaxpyris bli_rcaxpyris
#define bli_zscaxpyris bli_rcaxpyris

#define bli_sdcaxpyris bli_rxaxpyris
#define bli_ddcaxpyris bli_rxaxpyris
#define bli_cdcaxpyris bli_rcaxpyris
#define bli_zdcaxpyris bli_rcaxpyris

#define bli_sccaxpyris bli_craxpyris
#define bli_dccaxpyris bli_craxpyris
#define bli_cccaxpyris bli_cxaxpyris
#define bli_zccaxpyris bli_cxaxpyris

#define bli_szcaxpyris bli_craxpyris
#define bli_dzcaxpyris bli_craxpyris
#define bli_czcaxpyris bli_cxaxpyris
#define bli_zzcaxpyris bli_cxaxpyris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyris bli_rxaxpyris
#define bli_dszaxpyris bli_rxaxpyris
#define bli_cszaxpyris bli_rcaxpyris
#define bli_zszaxpyris bli_rcaxpyris

#define bli_sdzaxpyris bli_rxaxpyris
#define bli_ddzaxpyris bli_rxaxpyris
#define bli_cdzaxpyris bli_rcaxpyris
#define bli_zdzaxpyris bli_rcaxpyris

#define bli_sczaxpyris bli_craxpyris
#define bli_dczaxpyris bli_craxpyris
#define bli_cczaxpyris bli_cxaxpyris
#define bli_zczaxpyris bli_cxaxpyris

#define bli_szzaxpyris bli_craxpyris
#define bli_dzzaxpyris bli_craxpyris
#define bli_czzaxpyris bli_cxaxpyris
#define bli_zzzaxpyris bli_cxaxpyris

// Single-character (same-datatype) names.
#define bli_saxpyris bli_sssaxpyris
#define bli_daxpyris bli_dddaxpyris
#define bli_caxpyris bli_cccaxpyris
#define bli_zaxpyris bli_zzzaxpyris

#endif
// end bli_axpyris.h
// begin bli_axpyjris.h

#ifndef BLIS_AXPYJRIS_H
#define BLIS_AXPYJRIS_H

// axpyjris
//
// y += a * conj(x), in separate real/imaginary parts. The kernels mirror the
// axpyris ones with the sign of every xi term flipped; rc does not read xi,
// so its body is the same as the non-conjugating form.

#define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}

#define bli_cxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
    (yi) += (ai) * (xr) - (ar) * (xi); \
}

#define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
}

#define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * -(xi); \
}

#define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyjris bli_rxaxpyjris
#define bli_dssaxpyjris bli_rxaxpyjris
#define bli_cssaxpyjris bli_rxaxpyjris
#define bli_zssaxpyjris bli_rxaxpyjris

#define bli_sdsaxpyjris bli_rxaxpyjris
#define bli_ddsaxpyjris bli_rxaxpyjris
#define bli_cdsaxpyjris bli_rxaxpyjris
#define bli_zdsaxpyjris bli_rxaxpyjris

#define bli_scsaxpyjris bli_rxaxpyjris
#define bli_dcsaxpyjris bli_rxaxpyjris
#define bli_ccsaxpyjris bli_roaxpyjris
#define bli_zcsaxpyjris bli_roaxpyjris

#define bli_szsaxpyjris bli_rxaxpyjris
#define bli_dzsaxpyjris bli_rxaxpyjris
#define bli_czsaxpyjris bli_roaxpyjris
#define bli_zzsaxpyjris bli_roaxpyjris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyjris bli_rxaxpyjris
#define bli_dsdaxpyjris bli_rxaxpyjris
#define bli_csdaxpyjris bli_rxaxpyjris
#define bli_zsdaxpyjris bli_rxaxpyjris

#define bli_sddaxpyjris bli_rxaxpyjris
#define bli_dddaxpyjris bli_rxaxpyjris
#define bli_cddaxpyjris bli_rxaxpyjris
#define bli_zddaxpyjris bli_rxaxpyjris

#define bli_scdaxpyjris bli_rxaxpyjris
#define bli_dcdaxpyjris bli_rxaxpyjris
#define bli_ccdaxpyjris bli_roaxpyjris
#define bli_zcdaxpyjris bli_roaxpyjris

#define bli_szdaxpyjris bli_rxaxpyjris
#define bli_dzdaxpyjris bli_rxaxpyjris
#define bli_czdaxpyjris bli_roaxpyjris
#define bli_zzdaxpyjris bli_roaxpyjris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjris bli_rxaxpyjris
#define bli_dscaxpyjris bli_rxaxpyjris
#define bli_cscaxpyjris bli_rcaxpyjris
#define bli_zscaxpyjris bli_rcaxpyjris

#define bli_sdcaxpyjris bli_rxaxpyjris
#define bli_ddcaxpyjris bli_rxaxpyjris
#define bli_cdcaxpyjris bli_rcaxpyjris
#define bli_zdcaxpyjris bli_rcaxpyjris

#define bli_sccaxpyjris bli_craxpyjris
#define bli_dccaxpyjris bli_craxpyjris
#define bli_cccaxpyjris bli_cxaxpyjris
#define bli_zccaxpyjris bli_cxaxpyjris

#define bli_szcaxpyjris bli_craxpyjris
#define bli_dzcaxpyjris bli_craxpyjris
#define bli_czcaxpyjris bli_cxaxpyjris
#define bli_zzcaxpyjris bli_cxaxpyjris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjris bli_rxaxpyjris
#define bli_dszaxpyjris bli_rxaxpyjris
#define bli_cszaxpyjris bli_rcaxpyjris
#define bli_zszaxpyjris bli_rcaxpyjris

#define bli_sdzaxpyjris bli_rxaxpyjris
#define bli_ddzaxpyjris bli_rxaxpyjris
#define bli_cdzaxpyjris bli_rcaxpyjris
#define bli_zdzaxpyjris bli_rcaxpyjris

#define bli_sczaxpyjris bli_craxpyjris
#define bli_dczaxpyjris bli_craxpyjris
#define bli_cczaxpyjris bli_cxaxpyjris
#define bli_zczaxpyjris bli_cxaxpyjris

#define bli_szzaxpyjris bli_craxpyjris
#define bli_dzzaxpyjris bli_craxpyjris
#define bli_czzaxpyjris bli_cxaxpyjris
#define bli_zzzaxpyjris bli_cxaxpyjris

// Single-character (same-datatype) names.
#define bli_saxpyjris bli_sssaxpyjris
#define bli_daxpyjris bli_dddaxpyjris
#define bli_caxpyjris bli_cccaxpyjris
#define bli_zaxpyjris bli_zzzaxpyjris

#endif
// end bli_axpyjris.h
// begin bli_axmyris.h

#ifndef BLIS_AXMYRIS_H
#define BLIS_AXMYRIS_H

// axmyris
//
// y -= a * x, in separate real/imaginary parts. The s/d forms touch only yr;
// the c/z forms subtract the full complex product; the sc/dz forms use only
// ar as the scalar and apply it to both parts of x.

#define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}

#define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}

#define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}

#define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}

#define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}

#define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}

#endif
// end bli_axmyris.h
// begin bli_conjris.h

#ifndef BLIS_CONJRIS_H
#define BLIS_CONJRIS_H

// conjris
//
// In-place conjugation: the c/z forms negate xi; the s/d forms are empty
// statements.

#define bli_sconjris( xr, xi ) \
{ \
    ; \
}

#define bli_dconjris( xr, xi ) \
{ \
    ; \
}

#define bli_cconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}

#define bli_zconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}

#endif
// end bli_conjris.h
// begin bli_copyris.h

#ifndef BLIS_COPYRIS_H
#define BLIS_COPYRIS_H

// copyris
//
// b := a. The s/d forms assign only br; the c/z forms assign br and bi.

#define bli_scopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}

#define bli_dcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}

#define bli_ccopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}

#define bli_zcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}

// Two-character forms. The second character selects the kernel above that is
// invoked; when the first character is s or d, the ai argument is replaced by
// a literal zero before forwarding.
// NOTE(review): presumably first char = type of a, second = type of b - confirm.
#define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi )
#define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi )
#define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )
#define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )

#define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi )
#define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi )
#define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )
#define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )

#define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi )
#define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi )
#define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )
#define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )

#define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi )
#define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi )
#define bli_czcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )
#define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )

#endif
// end bli_copyris.h
// begin bli_copyjris.h

#ifndef BLIS_COPYJRIS_H
#define BLIS_COPYJRIS_H

// copyjris
//
// b := conj(a), implemented by forwarding to copyris with ai negated.

#define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) )
#define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) )
#define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) )
#define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) )

// Two-character forms, following the same forwarding pattern as copyris.
#define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi )
#define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi )
#define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )
#define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )

#define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi )
#define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi )
#define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )
#define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )

#define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi )
#define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi )
#define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )
#define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )

#define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi )
#define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi )
#define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )
#define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )

#endif
// end bli_copyjris.h
// begin bli_copycjris.h

#ifndef BLIS_COPYCJRIS_H
#define BLIS_COPYCJRIS_H

// copycjris
//
// y := x, or y := conj(x) when bli_is_conj( conj ) is true. The s, d and i
// forms ignore conj and forward to the plain copy.

#define bli_scopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_scopyris( (xr), (xi), (yr), (yi) ); \
}

#define bli_dcopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_dcopyris( (xr), (xi), (yr), (yi) ); \
}

#define bli_ccopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                 : (xi) ); \
}

#define bli_zcopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                 : (xi) ); \
}

// NOTE(review): bli_icopyris is not defined in this part of the header -
// confirm it exists elsewhere before using this macro.
#define bli_icopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_icopyris( (xr), (xi), (yr), (yi) ); \
}

#endif
// end bli_copycjris.h
// begin bli_eqris.h

#ifndef BLIS_EQRIS_H
#define BLIS_EQRIS_H

// eqris (passed by value)
//
// Expression macros: the s, d and i forms compare real parts only; the c and
// z forms require both parts to be equal.

#define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) )

// eq1ris

#define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F )
#define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 )
#define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F )
#define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 )
#define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 )

// eq0ris

#define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F )
#define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 )
#define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F )
#define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 )
#define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 )

// eqm1ris

#define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F )
#define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 )
#define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F )
#define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 )
#define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 )

#endif
// end bli_eqris.h
// begin bli_invertris.h

#ifndef BLIS_INVERTRIS_H
#define BLIS_INVERTRIS_H

// invertris
//
// In-place reciprocal. The real forms replace xr by 1 / xr and do not touch xi.

#define bli_sinvertris( xr, xi ) \
{ \
    (xr) = 1.0F / (xr); \
}

#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2ris bli_rxscal2ris #define bli_dssscal2ris bli_rxscal2ris #define bli_cssscal2ris bli_rxscal2ris #define bli_zssscal2ris bli_rxscal2ris #define bli_sdsscal2ris bli_rxscal2ris #define bli_ddsscal2ris bli_rxscal2ris #define bli_cdsscal2ris bli_rxscal2ris #define bli_zdsscal2ris bli_rxscal2ris #define bli_scsscal2ris bli_rxscal2ris #define bli_dcsscal2ris bli_rxscal2ris #define bli_ccsscal2ris bli_roscal2ris #define bli_zcsscal2ris bli_roscal2ris #define bli_szsscal2ris bli_rxscal2ris #define bli_dzsscal2ris bli_rxscal2ris #define bli_czsscal2ris bli_roscal2ris #define bli_zzsscal2ris bli_roscal2ris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2ris bli_rxscal2ris #define bli_dsdscal2ris bli_rxscal2ris #define bli_csdscal2ris bli_rxscal2ris #define bli_zsdscal2ris bli_rxscal2ris #define bli_sddscal2ris bli_rxscal2ris #define bli_dddscal2ris bli_rxscal2ris #define bli_cddscal2ris bli_rxscal2ris #define bli_zddscal2ris bli_rxscal2ris #define bli_scdscal2ris bli_rxscal2ris #define bli_dcdscal2ris bli_rxscal2ris #define bli_ccdscal2ris bli_roscal2ris #define bli_zcdscal2ris bli_roscal2ris #define bli_szdscal2ris bli_rxscal2ris #define bli_dzdscal2ris bli_rxscal2ris #define bli_czdscal2ris bli_roscal2ris #define bli_zzdscal2ris bli_roscal2ris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2ris bli_rxscal2ris #define bli_dscscal2ris bli_rxscal2ris #define bli_cscscal2ris bli_rcscal2ris #define bli_zscscal2ris bli_rcscal2ris #define bli_sdcscal2ris bli_rxscal2ris #define bli_ddcscal2ris bli_rxscal2ris #define bli_cdcscal2ris bli_rcscal2ris #define bli_zdcscal2ris bli_rcscal2ris #define bli_sccscal2ris bli_crscal2ris #define bli_dccscal2ris bli_crscal2ris #define bli_cccscal2ris bli_cxscal2ris #define bli_zccscal2ris bli_cxscal2ris #define bli_szcscal2ris bli_crscal2ris 
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Mixed-datatype scal2jris dispatch: each three-character name forwards its
// six arguments unchanged to one of the five scal2jris kernels.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = (xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyris bli_rxxpbyris #define bli_dssxpbyris bli_rxxpbyris #define bli_cssxpbyris bli_rxxpbyris #define bli_zssxpbyris bli_rxxpbyris #define bli_sdsxpbyris bli_rxxpbyris #define bli_ddsxpbyris bli_rxxpbyris #define bli_cdsxpbyris bli_rxxpbyris #define bli_zdsxpbyris bli_rxxpbyris #define bli_scsxpbyris bli_rxxpbyris #define bli_dcsxpbyris bli_rxxpbyris #define bli_ccsxpbyris bli_rxxpbyris #define bli_zcsxpbyris bli_rxxpbyris #define bli_szsxpbyris bli_rxxpbyris #define bli_dzsxpbyris bli_rxxpbyris #define bli_czsxpbyris bli_rxxpbyris #define bli_zzsxpbyris bli_rxxpbyris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyris bli_rxxpbyris #define bli_dsdxpbyris bli_rxxpbyris #define bli_csdxpbyris bli_rxxpbyris #define bli_zsdxpbyris bli_rxxpbyris #define bli_sddxpbyris bli_rxxpbyris #define bli_dddxpbyris bli_rxxpbyris #define bli_cddxpbyris bli_rxxpbyris #define bli_zddxpbyris bli_rxxpbyris #define bli_scdxpbyris bli_rxxpbyris #define bli_dcdxpbyris bli_rxxpbyris #define bli_ccdxpbyris bli_rxxpbyris #define bli_zcdxpbyris bli_rxxpbyris #define bli_szdxpbyris bli_rxxpbyris #define bli_dzdxpbyris bli_rxxpbyris #define bli_czdxpbyris bli_rxxpbyris #define bli_zzdxpbyris bli_rxxpbyris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyris bli_rxxpbyris #define bli_dscxpbyris 
bli_rxxpbyris #define bli_cscxpbyris bli_crxpbyris #define bli_zscxpbyris bli_crxpbyris #define bli_sdcxpbyris bli_rxxpbyris #define bli_ddcxpbyris bli_rxxpbyris #define bli_cdcxpbyris bli_crxpbyris #define bli_zdcxpbyris bli_crxpbyris #define bli_sccxpbyris bli_cxxpbyris #define bli_dccxpbyris bli_cxxpbyris #define bli_cccxpbyris bli_cxxpbyris #define bli_zccxpbyris bli_cxxpbyris #define bli_szcxpbyris bli_cxxpbyris #define bli_dzcxpbyris bli_cxxpbyris #define bli_czcxpbyris bli_cxxpbyris #define bli_zzcxpbyris bli_cxxpbyris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyris bli_rxxpbyris #define bli_dszxpbyris bli_rxxpbyris #define bli_cszxpbyris bli_crxpbyris #define bli_zszxpbyris bli_crxpbyris #define bli_sdzxpbyris bli_rxxpbyris #define bli_ddzxpbyris bli_rxxpbyris #define bli_cdzxpbyris bli_crxpbyris #define bli_zdzxpbyris bli_crxpbyris #define bli_sczxpbyris bli_cxxpbyris #define bli_dczxpbyris bli_cxxpbyris #define bli_cczxpbyris bli_cxxpbyris #define bli_zczxpbyris bli_cxxpbyris #define bli_szzxpbyris bli_cxxpbyris #define bli_dzzxpbyris bli_cxxpbyris #define bli_czzxpbyris bli_cxxpbyris #define bli_zzzxpbyris bli_cxxpbyris #define bli_sxpbyris bli_sssxpbyris #define bli_dxpbyris bli_dddxpbyris #define bli_cxpbyris bli_cccxpbyris #define bli_zxpbyris bli_zzzxpbyris #endif // end bli_xpbyris.h // begin bli_xpbyjris.h #ifndef BLIS_XPBYJRIS_H #define BLIS_XPBYJRIS_H // xpbyjris #define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type 
of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjris bli_rxxpbyjris #define bli_dssxpbyjris bli_rxxpbyjris #define bli_cssxpbyjris bli_rxxpbyjris #define bli_zssxpbyjris bli_rxxpbyjris #define bli_sdsxpbyjris bli_rxxpbyjris #define bli_ddsxpbyjris bli_rxxpbyjris #define bli_cdsxpbyjris bli_rxxpbyjris #define bli_zdsxpbyjris bli_rxxpbyjris #define bli_scsxpbyjris bli_rxxpbyjris #define bli_dcsxpbyjris bli_rxxpbyjris #define bli_ccsxpbyjris bli_rxxpbyjris #define bli_zcsxpbyjris bli_rxxpbyjris #define bli_szsxpbyjris bli_rxxpbyjris #define bli_dzsxpbyjris bli_rxxpbyjris #define bli_czsxpbyjris bli_rxxpbyjris #define bli_zzsxpbyjris bli_rxxpbyjris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjris bli_rxxpbyjris #define bli_dsdxpbyjris bli_rxxpbyjris #define bli_csdxpbyjris bli_rxxpbyjris #define bli_zsdxpbyjris bli_rxxpbyjris #define bli_sddxpbyjris bli_rxxpbyjris #define bli_dddxpbyjris bli_rxxpbyjris #define bli_cddxpbyjris bli_rxxpbyjris #define bli_zddxpbyjris bli_rxxpbyjris #define bli_scdxpbyjris bli_rxxpbyjris #define bli_dcdxpbyjris bli_rxxpbyjris #define bli_ccdxpbyjris bli_rxxpbyjris #define bli_zcdxpbyjris bli_rxxpbyjris #define bli_szdxpbyjris bli_rxxpbyjris #define bli_dzdxpbyjris bli_rxxpbyjris #define bli_czdxpbyjris bli_rxxpbyjris #define bli_zzdxpbyjris bli_rxxpbyjris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjris bli_rxxpbyjris #define bli_dscxpbyjris bli_rxxpbyjris #define bli_cscxpbyjris bli_crxpbyjris #define bli_zscxpbyjris bli_crxpbyjris #define bli_sdcxpbyjris bli_rxxpbyjris #define bli_ddcxpbyjris bli_rxxpbyjris #define bli_cdcxpbyjris bli_crxpbyjris #define bli_zdcxpbyjris bli_crxpbyjris #define bli_sccxpbyjris bli_cxxpbyjris #define bli_dccxpbyjris bli_cxxpbyjris #define bli_cccxpbyjris 
bli_cxxpbyjris #define bli_zccxpbyjris bli_cxxpbyjris #define bli_szcxpbyjris bli_cxxpbyjris #define bli_dzcxpbyjris bli_cxxpbyjris #define bli_czcxpbyjris bli_cxxpbyjris #define bli_zzcxpbyjris bli_cxxpbyjris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjris bli_rxxpbyjris #define bli_dszxpbyjris bli_rxxpbyjris #define bli_cszxpbyjris bli_crxpbyjris #define bli_zszxpbyjris bli_crxpbyjris #define bli_sdzxpbyjris bli_rxxpbyjris #define bli_ddzxpbyjris bli_rxxpbyjris #define bli_cdzxpbyjris bli_crxpbyjris #define bli_zdzxpbyjris bli_crxpbyjris #define bli_sczxpbyjris bli_cxxpbyjris #define bli_dczxpbyjris bli_cxxpbyjris #define bli_cczxpbyjris bli_cxxpbyjris #define bli_zczxpbyjris bli_cxxpbyjris #define bli_szzxpbyjris bli_cxxpbyjris #define bli_dzzxpbyjris bli_cxxpbyjris #define bli_czzxpbyjris bli_cxxpbyjris #define bli_zzzxpbyjris bli_cxxpbyjris #define bli_sxpbyjris bli_sssxpbyjris #define bli_dxpbyjris bli_dddxpbyjris #define bli_cxpbyjris bli_cccxpbyjris #define bli_zxpbyjris bli_zzzxpbyjris #endif // end bli_xpbyjris.h // Inlined scalar macros in loops // begin bli_scal2ris_mxn.h #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j 
)*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif // end bli_scal2ris_mxn.h // begin bli_scalris_mxn_uplo.h #ifndef 
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) ) #define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) ) #define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zcabsq2s( x, a ) 
bli_zcsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) ) #define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) ) #define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a ) #define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a ) #define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a ) #define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a ) #endif // end bli_absq2s.h // begin bli_abval2s.h #ifndef BLIS_ABVAL2S_H #define BLIS_ABVAL2S_H // abval2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabval2s( x, a ) 
bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) ) #define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) ) #define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) ) #define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) ) #define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) ) #define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) ) #define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) ) #define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) ) #define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) ) #define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) ) #define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) ) #define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) ) #define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) ) #define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) ) #define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) ) #define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabval2s( x, a ) bli_ssabval2s( x, a ) #define bli_dabval2s( x, a ) bli_ddabval2s( x, a ) #define bli_cabval2s( x, a ) bli_ccabval2s( x, a ) #define bli_zabval2s( x, a ) bli_zzabval2s( x, a ) #endif // end bli_abval2s.h // begin bli_adds.h #ifndef BLIS_ADDS_H #define BLIS_ADDS_H // adds // Notes: // - The first char 
encodes the type of a. // - The second char encodes the type of y. #define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) { (y) += (a); } #define bli_dcadds( a, y ) { (y) += (a); } #define bli_ccadds( a, y ) { (y) += (a); } #define bli_zcadds( a, y ) { (y) += (a); } #define bli_szadds( a, y ) { (y) += (a); } #define bli_dzadds( a, y ) { (y) += (a); } #define bli_czadds( a, y ) { (y) += (a); } #define 
bli_zzadds( a, y ) { (y) += (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadds( a, y ) bli_ssadds( a, y ) #define bli_dadds( a, y ) bli_ddadds( a, y ) #define bli_cadds( a, y ) bli_ccadds( a, y ) #define bli_zadds( a, y ) bli_zzadds( a, y ) #endif // end bli_adds.h // begin bli_addjs.h #ifndef BLIS_ADDJS_H #define BLIS_ADDJS_H // addjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzaddjs( a, y ) 
bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) { (y) += (a); } #define bli_dcaddjs( a, y ) { (y) += (a); } #define bli_ccaddjs( a, y ) { (y) += conjf(a); } #define bli_zcaddjs( a, y ) { (y) += conj (a); } #define bli_szaddjs( a, y ) { (y) += (a); } #define bli_dzaddjs( a, y ) { (y) += (a); } #define bli_czaddjs( a, y ) { (y) += conjf(a); } #define bli_zzaddjs( a, y ) { (y) += conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saddjs( a, y ) bli_ssaddjs( a, y ) #define bli_daddjs( a, y ) bli_ddaddjs( a, y ) #define bli_caddjs( a, y ) bli_ccaddjs( a, y ) #define bli_zaddjs( a, y ) bli_zzaddjs( a, y ) #endif // end bli_addjs.h // begin bli_add3s.h #ifndef BLIS_ADD3S_H #define BLIS_ADD3S_H // add3s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of b. // - The third char encodes the type of c. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), 
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), 
bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define 
bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zscadd3s( a, 
b, c ) { (c) = (a) + (b); } #define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c ) #define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c ) #define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c ) #define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c ) #endif // end bli_add3s.h // begin bli_axpbys.h #ifndef BLIS_AXPBYS_H #define BLIS_AXPBYS_H // axpbys // Notes: // - The first char encodes the type of a. 
// - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), 
bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), 
bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), 
bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- /* Every macro in this group reads y through bli_zreal/bli_zimag. The first three characters of the macro name select the accessor pair used for a, x and b respectively (s: bli_sreal/bli_simag, d: bli_dreal/bli_dimag, c: bli_creal/bli_cimag, z: bli_zreal/bli_zimag). All eight extracted values are forwarded, in the order a, x, b, y, to bli_cxaxpbyris, which is defined elsewhere. */ #define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- /* In this branch every type combination has the same body, { (y) = (a) * (x) + (b) * (y); }, which operates on the operands directly instead of on separately extracted real/imaginary parts. The macros therefore differ only by name. NOTE(review): the body is a bare brace block, so "macro(...);" expands to a compound statement followed by an empty statement; confirm that no call site uses these macros as the unbraced body of an if that has an else. */ #define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccscaxpbys( 
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- /* Same body as the (???c) group that precedes this header: each macro expands to { (y) = (a) * (x) + (b) * (y); }. Only the fourth type character of the macro name changes (z instead of c). */ #define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX /* Single-character shorthands: bli_?axpbys forwards its arguments unchanged to the variant whose four type characters are all that same character (s, d, c or z). */ #define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y ) #define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y ) #define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y ) #define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y ) #endif // end bli_axpbys.h // begin bli_axpbyjs.h #ifndef BLIS_AXPBYJS_H #define BLIS_AXPBYJS_H // axpbyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. /* NOTE(review): the "j" in axpbyjs presumably marks the variant that conjugates x; confirm against bli_rxaxpbyjris, which is defined elsewhere. */ // -- (axby) = (???s) ---------------------------------------------------------- /* Every macro in this group reads y through bli_sreal/bli_simag. The real and imaginary parts of a, x and b are extracted with the accessor pair named by the corresponding type character, and all eight values are forwarded, in the order a, x, b, y, to bli_rxaxpbyjris. */ #define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), 
bli_simag(y) ) #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) #define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) #define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) #define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) #endif // end bli_axpbyjs.h // begin bli_axpys.h #ifndef BLIS_AXPYS_H #define BLIS_AXPYS_H // axpys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), 
bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } #define 
bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) #define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) #define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) #define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) #endif // end bli_axpys.h // begin bli_axpyjs.h #ifndef BLIS_AXPYJS_H #define BLIS_AXPYJS_H // axpyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Mixed-datatype scalar "axpyjs" macros: bli_<a><x><y>axpyjs( a, x, y ).
// The three type characters (s, d, c, z) select the accessor macros applied
// to a, x and y respectively.
//
// As spelled out by the C99 definitions further down, the operation is
//   y += a * conj(x)   when x is complex (c or z), and
//   y += a * x         when x is real (s or d).
// The non-C99 definitions instead pass the real/imaginary components of each
// operand to a bli_?axpyjris kernel macro, which is defined elsewhere.
// NOTE(review): presumably those kernels implement the same update on
// separate real/imag parts -- confirm against the bli_axpyjris definitions.

// -- (axy) = (??s) ------------------------------------------------------------

// Every combination with y of type s expands to bli_saxpyjris.
#define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

// Every combination with y of type d expands to bli_daxpyjris.
#define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// The complex-y variants have two alternative definitions, selected by
// BLIS_ENABLE_C99_COMPLEX: component-wise kernel calls when it is not
// defined, native complex arithmetic when it is.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

// Kernel selection for y of type c, as written below:
// - a complex (c or z)              -> bli_caxpyjris
// - a real (s or d), x real         -> bli_saxpyjris
// - a real (s or d), x complex      -> bli_scaxpyjris
#define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

// Kernel selection for y of type z, as written below:
// - a complex (c or z)              -> bli_zaxpyjris
// - a real (s or d), x real         -> bli_daxpyjris
// - a real (s or d), x complex      -> bli_dzaxpyjris
#define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// In this branch each macro expands to a brace-enclosed compound statement
// operating directly on the operands. x is conjugated with conjf() when it is
// of type c and with conj() when it is of type z; real x (s or d) is used
// as-is. The types of a and y do not affect which form is used.
// NOTE: because the expansion is a `{ ... }` block, an invocation followed by
// `;` yields the block plus an empty statement -- take care when using these
// as the unbraced body of an if/else.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: all three operands share one datatype.
#define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y )
#define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y )
#define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y )
#define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y )

#endif
// end bli_axpyjs.h
// begin bli_axmys.h

#ifndef BLIS_AXMYS_H
#define BLIS_AXMYS_H

// axmys

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), 
bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxmys( a, x, 
y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), 
bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxmys( 
a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); } 
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y ) #define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y ) #define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y ) #define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y ) #endif // end bli_axmys.h // begin bli_conjs.h #ifndef BLIS_CONJS_H #define BLIS_CONJS_H // conjs #define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) ) #define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) ) #define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) { (x) = conjf(x); } #define bli_zconjs( x ) { (x) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_conjs.h // begin bli_copys.h #ifndef BLIS_COPYS_H #define BLIS_COPYS_H // copys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Each bli_??copys( x, y ) forwards to the bli_?copyris() macro selected by
// the type of y, passing the bli_?real()/bli_?imag() accessors of x and then
// of y.

// y is of type s.
#define bli_sscopys( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopys( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopys( x, y )  bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopys( x, y )  bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is of type d.
#define bli_sdcopys( x, y )  bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopys( x, y )  bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopys( x, y )  bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopys( x, y )  bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// y is of type c.
// NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero.
#define bli_sccopys( x, y )  bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopys( x, y )  bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopys( x, y )  bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopys( x, y )  bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopys( x, y ) bli_sscopys( x, y ) #define bli_dcopys( x, y ) bli_ddcopys( x, y ) #define bli_ccopys( x, y ) bli_cccopys( x, y ) #define bli_zcopys( x, y ) bli_zzcopys( x, y ) #define bli_icopys( x, y ) bli_iicopys( x, y ) #endif // end bli_copys.h // begin bli_copyjs.h #ifndef BLIS_COPYJS_H #define BLIS_COPYJS_H // copyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), 
bli_creal(y), bli_cimag(y) ) #define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) { (y) = (x); } #define bli_dccopyjs( x, y ) { (y) = (x); } #define bli_cccopyjs( x, y ) { (y) = conjf(x); } #define bli_zccopyjs( x, y ) { (y) = conj (x); } #define bli_szcopyjs( x, y ) { (y) = (x); } #define bli_dzcopyjs( x, y ) { (y) = (x); } #define bli_czcopyjs( x, y ) { (y) = conjf(x); } #define bli_zzcopyjs( x, y ) { (y) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjs( x, y ) bli_sscopyjs( x, y ) #define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y ) #define bli_ccopyjs( x, y ) bli_cccopyjs( x, y ) #define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y ) #define bli_icopyjs( x, y ) bli_iicopyjs( x, y ) #endif // end bli_copyjs.h // begin bli_copycjs.h #ifndef BLIS_COPYCJS_H #define BLIS_COPYCJS_H // copycjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) { (y) = (x); } 
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); } #define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #define bli_szcopycjs( conjx, x, y ) { (y) = (x); } #define bli_dzcopycjs( conjx, x, y ) { (y) = (x); } #define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); } #define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y ) #define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y ) #define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y ) #define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y ) #define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y ) #endif // end bli_copycjs.h // begin bli_copynzs.h #ifndef BLIS_COPYNZS_H #define BLIS_COPYNZS_H // copynzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Each bli_??copynzs( x, y ) forwards to a bli_?copyris() macro, passing the
// bli_?real()/bli_?imag() accessors of x and then of y. When x is real and y
// is complex, the real-domain helper is used rather than the complex one
// (see the NOTEs below).

// y is of type s.
#define bli_sscopynzs( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopynzs( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopynzs( x, y )  bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopynzs( x, y )  bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is of type d.
#define bli_sdcopynzs( x, y )  bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopynzs( x, y )  bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopynzs( x, y )  bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopynzs( x, y )  bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// y is of type c.
// NOTE: Use of scopyris() is so we don't touch the imaginary part of y.
#define bli_sccopynzs( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopynzs( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopynzs( x, y )  bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopynzs( x, y )  bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopynzs( x, y ) bli_sscopynzs( x, y ) #define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y ) #define bli_ccopynzs( x, y ) bli_cccopynzs( x, y ) #define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y ) #define bli_icopynzs( x, y ) bli_iicopynzs( x, y ) #endif // end bli_copynzs.h // begin bli_copyjnzs.h #ifndef BLIS_COPYJNZS_H #define BLIS_COPYJNZS_H // copyjnzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we // don't touch the imaginary part of y. 
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we // don't touch the imaginary part of y. #define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y ) #define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y ) #define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y ) #define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y ) #define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y ) #endif // end bli_copyjnzs.h // begin bli_dots.h #ifndef BLIS_DOTS_H #define BLIS_DOTS_H // dots // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a ) #define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a ) #define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a ) #define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a ) #define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a ) #define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a ) #define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a ) #define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a ) #define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a ) #define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a ) #define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a ) #define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a ) #define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a ) #define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a ) #define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a ) #define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a ) #define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a ) #define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a ) #define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a ) #define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a ) #define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a ) #define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a ) #define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a ) #define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a ) #define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a ) #define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a ) #define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a ) #define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a ) #define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a ) #define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a ) #define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a ) #define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a ) #define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a ) #define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a ) #define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a ) #define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a ) #define 
bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a ) #define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a ) #define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a ) #define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a ) #define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a ) #define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a ) #define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a ) #define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a ) #define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a ) #define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a ) #define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a ) #define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a ) #define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a ) #define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a ) #define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a ) #define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a ) #define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a ) #define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a ) #define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a ) #define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a ) #define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a ) #define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a ) #define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a ) #define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a ) #define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a ) #define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a ) #define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a ) #define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a ) #define bli_sdots( x, y, a ) bli_sssdots( x, y, a ) #define bli_ddots( x, y, a ) bli_ddddots( x, y, a ) #define bli_cdots( x, y, a ) bli_cccdots( x, y, a ) #define bli_zdots( x, y, a ) bli_zzzdots( x, y, a ) #endif // end bli_dots.h // begin bli_dotjs.h #ifndef BLIS_DOTJS_H #define BLIS_DOTJS_H // dotjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
// - x is used in conjugated form. #define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a ) #define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a ) #define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a ) #define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a ) #define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a ) #define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a ) #define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a ) #define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a ) #define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a ) #define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a ) #define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a ) #define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a ) #define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a ) #define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a ) #define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a ) #define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a ) #define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a ) #define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a ) #define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a ) #define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a ) #define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a ) #define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a ) #define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a ) #define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a ) #define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a ) #define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a ) #define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a ) #define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a ) #define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a ) #define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a ) #define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a ) #define bli_zzddotjs( x, y, a ) bli_zzdaxpyjs( y, x, a ) #define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a ) #define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a ) #define bli_cscdotjs( x, 
y, a ) bli_sccaxpyjs( y, x, a ) #define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a ) #define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a ) #define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a ) #define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a ) #define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a ) #define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a ) #define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a ) #define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a ) #define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a ) #define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a ) #define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a ) #define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a ) #define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a ) #define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a ) #define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a ) #define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a ) #define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a ) #define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a ) #define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a ) #define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a ) #define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a ) #define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a ) #define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a ) #define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a ) #define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a ) #define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a ) #define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a ) #define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a ) #define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a ) #define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a ) #define bli_ddotjs( x, y, a ) bli_ddddotjs( x, y, a ) #define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a ) #define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a ) #endif // end bli_dotjs.h // begin bli_eq.h #ifndef BLIS_EQ_H #define BLIS_EQ_H // eq (passed by 
value) #define bli_seq( a, b ) ( (a) == (b) ) #define bli_deq( a, b ) ( (a) == (b) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) ) #define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ceq( a, b ) ( (a) == (b) ) #define bli_zeq( a, b ) ( (a) == (b) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ieq( a, b ) ( (a) == (b) ) // eqtori (passed by value) #define bli_seqtori( a, br, bi ) ( (a) == (br) ) #define bli_deqtori( a, br, bi ) ( (a) == (br) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) ) #define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) ) #define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) ) #endif // BLIS_ENABLE_C99_COMPLEX // eqa (passed by address) #define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) ) #define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) ) #define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) ) #define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) ) #define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) ) // eq1 #define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F ) #define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 ) #define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F ) #define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 ) #define bli_ieq1( a ) bli_ieq ( (a), 1 ) // eq0 #define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F ) #define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 ) #define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F ) #define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 ) #define bli_ieq0( a ) bli_ieq ( (a), 0 ) // eqm1 #define bli_seqm1( a ) bli_seqtori( (a), 
-1.0F, 0.0F ) #define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 ) #define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F ) #define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 ) #define bli_ieqm1( a ) bli_ieq ( (a), -1 ) #endif // end bli_eq.h // begin bli_fprints.h #ifndef BLIS_FPRINTS_H #define BLIS_FPRINTS_H // prints #define bli_sfprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #define bli_dfprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #define bli_cfprints( file, spec, x ) \ { \ fprintf( file, spec, bli_creal(x) ); \ fprintf( file, " + " ); \ fprintf( file, spec, bli_cimag(x) ); \ fprintf( file, " " ); \ } #define bli_zfprints( file, spec, x ) \ { \ fprintf( file, spec, bli_zreal(x) ); \ fprintf( file, " + " ); \ fprintf( file, spec, bli_zimag(x) ); \ fprintf( file, " " ); \ } #define bli_ifprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #endif // end bli_fprints.h // begin bli_inverts.h #ifndef BLIS_INVERTS_H #define BLIS_INVERTS_H // inverts // Notes: // - The first char encodes the type of x. #define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) ) #define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) ) #define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cinverts( x ) { (x) = 1.0F / (x); } #define bli_zinverts( x ) { (x) = 1.0 / (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_inverts.h // begin bli_invscals.h #ifndef BLIS_INVSCALS_H #define BLIS_INVSCALS_H // invscals // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. 
#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcinvscals( a, y ) bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscals( a, y ) { (y) /= (a); } #define bli_dcinvscals( a, y ) { (y) /= (a); } #define bli_ccinvscals( a, y ) { (y) /= (a); } #define bli_zcinvscals( a, y ) { (y) /= (a); } #define bli_szinvscals( a, y ) { (y) /= (a); } #define 
bli_dzinvscals( a, y ) { (y) /= (a); } #define bli_czinvscals( a, y ) { (y) /= (a); } #define bli_zzinvscals( a, y ) { (y) /= (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sinvscals( a, y ) bli_ssinvscals( a, y ) #define bli_dinvscals( a, y ) bli_ddinvscals( a, y ) #define bli_cinvscals( a, y ) bli_ccinvscals( a, y ) #define bli_zinvscals( a, y ) bli_zzinvscals( a, y ) #endif // end bli_invscals.h // begin bli_invscaljs.h #ifndef BLIS_INVSCALJS_H #define BLIS_INVSCALJS_H // invscaljs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), 
bli_zreal(y), bli_zimag(y) ) #define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscaljs( a, y ) { (y) /= (a); } #define bli_dcinvscaljs( a, y ) { (y) /= (a); } #define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); } #define bli_zcinvscaljs( a, y ) { (y) /= conj (a); } #define bli_szinvscaljs( a, y ) { (y) /= (a); } #define bli_dzinvscaljs( a, y ) { (y) /= (a); } #define bli_czinvscaljs( a, y ) { (y) /= conjf(a); } #define bli_zzinvscaljs( a, y ) { (y) /= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y ) #define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y ) #define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y ) #define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y ) #endif // end bli_invscaljs.h // begin bli_neg2s.h #ifndef BLIS_NEG2S_H #define BLIS_NEG2S_H // neg2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) { (y) = -(x); } #define bli_dcneg2s( x, y ) { (y) = -(x); } #define bli_ccneg2s( x, y ) { (y) = -(x); } #define bli_zcneg2s( x, y ) { (y) = -(x); } #define bli_szneg2s( x, y ) { (y) = -(x); } #define bli_dzneg2s( x, y ) { (y) = -(x); } #define bli_czneg2s( x, y ) { (y) = -(x); } #define bli_zzneg2s( x, y ) { (y) = 
-(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sneg2s( x, y ) bli_ssneg2s( x, y ) #define bli_dneg2s( x, y ) bli_ddneg2s( x, y ) #define bli_cneg2s( x, y ) bli_ccneg2s( x, y ) #define bli_zneg2s( x, y ) bli_zzneg2s( x, y ) #endif // end bli_neg2s.h // begin bli_rands.h #ifndef BLIS_RANDS_H #define BLIS_RANDS_H // rands #define bli_srands( a ) \ { \ (a) = ( float ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0F; \ } #define bli_drands( a ) \ { \ (a) = ( double ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0; \ } #define bli_crands( a ) \ { \ float ar, ai; \ \ bli_srands( ar ); \ bli_srands( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrands( a ) \ { \ double ar, ai; \ \ bli_drands( ar ); \ bli_drands( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_rands.h // begin bli_randnp2s.h #ifndef BLIS_RANDNP2S_H #define BLIS_RANDNP2S_H // randnp2s #define bli_srandnp2s( a ) \ { \ bli_drandnp2s( a ); \ } #if 0 #define bli_drandnp2s_prev( a ) \ { \ const double m_max = 3.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ if ( t == m_max2 ) t = t - 1.0; \ \ \ t = floor( t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_exp, s_val; \ \ \ PASTEMAC(d,rands)( s_exp ); \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \ else r_val = pow( 2.0, t - 1.0 ); \ \ \ if ( s_val < 0.0 ) r_val = -r_val; \ } \ \ \ r_val = r_val / pow( 2.0, m_max ); \ \ \ \ a = r_val; \ } #endif #define bli_drandnp2s( a ) \ { \ const double m_max = 6.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ do \ { \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ t = floor( t ); \ } \ \ while ( m_max2 <= t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_val; \ \ \ r_val = pow( 2.0, -(t - 1.0) ); \ \ \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_val < 0.0 ) r_val 
= -r_val; \ } \ \ \ \ a = r_val; \ } #define bli_crandnp2s( a ) \ { \ float ar, ai; \ \ bli_srandnp2s( ar ); \ bli_srandnp2s( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrandnp2s( a ) \ { \ double ar, ai; \ \ bli_drandnp2s( ar ); \ bli_drandnp2s( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_randnp2s.h // begin bli_scals.h #ifndef BLIS_SCALS_H #define BLIS_SCALS_H // scals // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), 
bli_zimag(y) ) #define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) { (y) *= (a); } #define bli_dcscals( a, y ) { (y) *= (a); } #define bli_ccscals( a, y ) { (y) *= (a); } #define bli_zcscals( a, y ) { (y) *= (a); } #define bli_szscals( a, y ) { (y) *= (a); } #define bli_dzscals( a, y ) { (y) *= (a); } #define bli_czscals( a, y ) { (y) *= (a); } #define bli_zzscals( a, y ) { (y) *= (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscals( a, y ) bli_ssscals( a, y ) #define bli_dscals( a, y ) bli_ddscals( a, y ) #define bli_cscals( a, y ) bli_ccscals( a, y ) #define bli_zscals( a, y ) bli_zzscals( a, y ) #endif // end bli_scals.h // begin bli_scaljs.h #ifndef BLIS_SCALJS_H #define BLIS_SCALJS_H // scaljs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscaljs( a, y ) 
bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) { (y) *= (a); } #define bli_dcscaljs( a, y ) { (y) *= (a); } #define bli_ccscaljs( a, y ) { (y) *= conjf(a); } #define bli_zcscaljs( a, y ) { (y) *= conj (a); } #define bli_szscaljs( a, y ) { (y) *= (a); } #define bli_dzscaljs( a, y ) { (y) *= (a); } #define bli_czscaljs( a, y ) { (y) *= conjf(a); } #define bli_zzscaljs( a, y ) { (y) *= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscaljs( a, y ) bli_ssscaljs( a, y ) #define bli_dscaljs( a, y ) bli_ddscaljs( a, y ) #define bli_cscaljs( a, y ) bli_ccscaljs( a, y ) #define bli_zscaljs( a, y ) bli_zzscaljs( a, y ) #endif // end bli_scaljs.h // begin bli_scalcjs.h #ifndef BLIS_SCALCJS_H #define BLIS_SCALCJS_H // scalcjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscalcjs( conjx, x, y ) { (y) *= (x); } 
#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); } #define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #define bli_szscalcjs( conjx, x, y ) { (y) *= (x); } #define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); } #define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y ) #define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y ) #define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y ) #define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y ) #endif // end bli_scalcjs.h // begin bli_scal2s.h #ifndef BLIS_SCAL2S_H #define BLIS_SCAL2S_H // scal2s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsscal2s( a, x, y ) 
bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_czcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y ) #define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y ) #define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y ) #define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y ) #endif // end bli_scal2s.h // begin bli_scal2js.h #ifndef BLIS_SCAL2JS_H #define BLIS_SCAL2JS_H // scal2js // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// scal2js, mixed-datatype dispatch table (part 1).
// Every macro below splits a, x and y into real/imaginary components with the
// per-type accessors (bli_?real / bli_?imag) and forwards the six components to
// a bli_??scal2jris helper. The helpers are defined outside this section.
// NOTE(review): the C99 variants of this family assign y = a * conj(x) for
// complex x; presumably the helpers compute the same quantity component-wise
// -- confirm against the bli_*scal2jris definitions.

// -- (axy) = (??s) ------------------------------------------------------------
// Real (float) destination. The "ro" helper is used only when both a and x are
// complex; every other combination uses "rx".

#define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------
// Real (double) destination; same helper selection as the (??s) group.

#define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Complex destinations: component-wise helpers are used only when C99 complex
// support is disabled.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------
// Helper selection by operand domain: rx when a and x are both real, rc when
// only a is complex, cr when only x is complex, cx when both are complex.

#define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------
// Double-complex destination; same helper selection as the (??c) group.

#define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
// Remaining (??z) entries of the non-C99 scal2js table: complex x, so the
// "cr" helper is used for real a and the "cx" helper for complex a.
#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex types the operation is written directly: y = a * conj(x).
// For real x (s, d) no conjugation is applied; for scomplex x conjf() is used,
// and for dcomplex x conj() is used.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); }

#define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); }

#define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: one type letter stands for all three operands.
#define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y )
#define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y )
#define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y )
#define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y )

#endif
// end bli_scal2js.h
// begin bli_set0s.h


#ifndef BLIS_SET0S_H
#define BLIS_SET0S_H

// set0s: forwards to bli_?sets with both leading arguments equal to zero
// (float literals for s/c, double literals for d/z).
// NOTE(review): presumably the two leading arguments of bli_?sets are the real
// and imaginary parts to store in a -- confirm against bli_sets.h.
#define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) )
#define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) )
#define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) )
#define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) )

#endif
// end bli_set0s.h
// begin bli_set1s.h


#ifndef BLIS_SET1S_H
#define BLIS_SET1S_H

// set1s: forwards to bli_?sets with leading arguments (1, 0).
// NOTE(review): presumably this stores the value one (real 1, imaginary 0) in
// a -- confirm against bli_sets.h.
#define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) )
#define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) )
#define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) )
#define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) )
#endif
// end bli_set1s.h
// begin bli_seti0s.h


#ifndef BLIS_SETI0S_H
#define BLIS_SETI0S_H

// seti0s: forwards to bli_?setis with a zero first argument.
// NOTE(review): presumably bli_?setis assigns the imaginary part of a, making
// these "zero the imaginary component" macros -- confirm against bli_setis.h.
#define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) )
#define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) )
#define bli_cseti0s( a ) bli_csetis( 0.0F, (a) )
#define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) )

#endif
// end bli_seti0s.h
// begin bli_sqrt2s.h


#ifndef BLIS_SQRT2S_H
#define BLIS_SQRT2S_H

// sqrt2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.
// - In the C99 branch each macro assigns a = sqrt(x), cast to the type of a.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Component-wise variants: the real/imaginary parts of x and a are passed to a
// bli_*sqrt2ris helper (defined outside this section). The helper is chosen by
// the type of a, with "sc"/"dz" helpers used when x is real and a is complex.

#define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) )
#define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) )
#define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) )
#define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) )

#define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) )
#define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) )

#define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 variants: the square-root routine is chosen by the type of x
// (sqrtf, sqrt, csqrtf, csqrt). When x is complex and a is real, only the real
// part of the complex root is kept (via bli_creal / bli_zreal).

#define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; }
#define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; }
#define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); }
#define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); }

#define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; }
#define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; }
#define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); }
#define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); }

#define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; }
#define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; }
#define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; }
#define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; }

#define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; }
#define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; }
#define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; }
#define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: one type letter stands for both operands.
#define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a )
#define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a )
#define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a )
#define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a )

#endif
// end bli_sqrt2s.h
// begin bli_subs.h


#ifndef BLIS_SUBS_H
#define BLIS_SUBS_H

// subs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Real destinations (y is float or double): always component-wise, via the
// bli_ssubris / bli_dsubris helpers (defined outside this section).
// NOTE(review): the C99 complex variants of this family perform y -= a;
// presumably the helpers do the same on components -- confirm in bli_subris.h.

#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations without C99 complex support: component-wise helpers.

#define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations with C99 complex support: plain in-place subtraction.

#define bli_scsubs( a, y ) { (y) -= (a); }
#define bli_dcsubs( a, y ) { (y) -= (a); }
#define bli_ccsubs( a, y ) { (y) -= (a); }
#define bli_zcsubs( a, y ) { (y) -= (a); }

#define bli_szsubs( a, y ) { (y) -= (a); }
#define bli_dzsubs( a, y ) { (y) -= (a); }
#define bli_czsubs( a, y ) { (y) -= (a); }
#define bli_zzsubs( a, y ) { (y) -= (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands.
#define bli_ssubs( a, y ) bli_sssubs( a, y )
#define bli_dsubs( a, y ) bli_ddsubs( a, y )
#define bli_csubs( a, y ) bli_ccsubs( a, y )
#define bli_zsubs( a, y ) bli_zzsubs( a, y )

#endif
// end bli_subs.h
// begin bli_subjs.h


#ifndef BLIS_SUBJS_H
#define BLIS_SUBJS_H

// subjs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - The C99 variants compute y -= conj(a) when a is complex and y -= a when a
//   is real.

// Real destinations: always component-wise, via bli_ssubjris / bli_dsubjris
// (defined outside this section).

#define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czsubjs( a, y ) bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// conjf() is applied to scomplex a, conj() to dcomplex a; real a is used as is.

#define bli_scsubjs( a, y ) { (y) -= (a); }
#define bli_dcsubjs( a, y ) { (y) -= (a); }
#define bli_ccsubjs( a, y ) { (y) -= conjf(a); }
#define bli_zcsubjs( a, y ) { (y) -= conj (a); }

#define bli_szsubjs( a, y ) { (y) -= (a); }
#define bli_dzsubjs( a, y ) { (y) -= (a); }
#define bli_czsubjs( a, y ) { (y) -= conjf(a); }
#define bli_zzsubjs( a, y ) { (y) -= conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands.
#define bli_ssubjs( a, y ) bli_sssubjs( a, y )
#define bli_dsubjs( a, y ) bli_ddsubjs( a, y )
#define bli_csubjs( a, y ) bli_ccsubjs( a, y )
#define bli_zsubjs( a, y ) bli_zzsubjs( a, y )

#endif
// end bli_subjs.h
// begin bli_swaps.h


#ifndef BLIS_SWAPS_H
#define BLIS_SWAPS_H

// swaps

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - Each macro declares a block-local temporary w whose type matches x, then
//   copies y -> w, x -> y, w -> x with the typed bli_??copys macros (defined
//   outside this section).
// NOTE(review): the temporary is always named w, so an argument expression
// that mentions an identifier w would refer to the temporary instead of the
// caller's variable -- verify that no caller passes such an argument.

#define bli_ssswaps( x, y ) \
{ \
	float w; \
	bli_sscopys( (y), (w) ); \
	bli_sscopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_dsswaps( x, y ) \
{ \
	double w; \
	bli_sdcopys( (y), (w) ); \
	bli_dscopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_csswaps( x, y ) \
{ \
	scomplex w; \
	bli_sccopys( (y), (w) ); \
	bli_cscopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zsswaps( x, y ) \
{ \
	dcomplex w; \
	bli_szcopys( (y), (w) ); \
	bli_zscopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

#define bli_sdswaps( x, y ) \
{ \
	float w; \
	bli_dscopys( (y), (w) ); \
	bli_sdcopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_ddswaps( x, y ) \
{ \
	double w; \
	bli_ddcopys( (y), (w) ); \
	bli_ddcopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_cdswaps( x, y ) \
{ \
	scomplex w; \
	bli_dccopys( (y), (w) ); \
	bli_cdcopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zdswaps( x, y ) \
{ \
	dcomplex w; \
	bli_dzcopys( (y), (w) ); \
	bli_zdcopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

#define bli_scswaps( x, y ) \
{ \
	float w; \
	bli_cscopys( (y), (w) ); \
	bli_sccopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_dcswaps( x, y ) \
{ \
	double w; \
	bli_cdcopys( (y), (w) ); \
	bli_dccopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_ccswaps( x, y ) \
{ \
	scomplex w; \
	bli_cccopys( (y), (w) ); \
	bli_cccopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zcswaps( x, y ) \
{ \
	dcomplex w; \
	bli_czcopys( (y), (w) ); \
	bli_zccopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

#define bli_szswaps( x, y ) \
{ \
	float w; \
	bli_zscopys( (y), (w) ); \
	bli_szcopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_dzswaps( x, y ) \
{ \
	double w; \
	bli_zdcopys( (y), (w) ); \
	bli_dzcopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_czswaps( x, y ) \
{ \
	scomplex w; \
	bli_zccopys( (y), (w) ); \
	bli_czcopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zzswaps( x, y ) \
{ \
	dcomplex w; \
	bli_zzcopys( (y), (w) ); \
	bli_zzcopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

// Same-type shorthands.
#define bli_sswaps( x, y ) bli_ssswaps( x, y )
#define bli_dswaps( x, y ) bli_ddswaps( x, y )
#define bli_cswaps( x, y ) bli_ccswaps( x, y )
#define bli_zswaps( x, y ) bli_zzswaps( x, y )

#endif
// end bli_swaps.h
// begin bli_xpbys.h


#ifndef BLIS_XPBYS_H
#define BLIS_XPBYS_H

// xpbys

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// xpbys, real-domain destinations. Each macro splits x, b and y into
// real/imaginary components with the per-type accessors and forwards all six
// to bli_rxxpbyris (defined outside this section); every type combination in
// these two groups uses that same helper.
// NOTE(review): going by the name, presumably the operation is y := x + b * y
// restricted to the real part -- confirm against the bli_rxxpbyris definition.

// -- (xby) = (??s) ------------------------------------------------------------

#define bli_sssxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dssxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cssxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zssxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )

#define bli_sdsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )

#define bli_scsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )

#define bli_szsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )

#define bli_sddxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dddxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cddxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zddxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_scdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_szdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
// Remaining (??d) variants: real double y with dcomplex b. Like every other
// real-y variant in this table they forward to bli_rxxpbyris.
#define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )

// The complex-y variants (third char c or z) have two implementations,
// selected by BLIS_ENABLE_C99_COMPLEX:
// - Not defined: x, b, and y are split into components with the
//   bli_?real()/bli_?imag() accessors and handed to a helper macro chosen
//   from the types of x and b:
//     real x,    real b    -> bli_rxxpbyris
//     complex x, real b    -> bli_crxpbyris
//     any x,     complex b -> bli_cxxpbyris
// - Defined: every variant expands to the single native-complex statement
//   { (y) = (x) + (b) * (y); }.
// NOTE(review): the three helpers are defined elsewhere in this header;
// presumably they compute the same y := x + b*y component-wise -- confirm.

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )

#define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )

#define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zccxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )

#define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zszxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With native complex types the language's own arithmetic handles every
// type combination, so all 32 variants share one expansion.

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: bli_?xpbys forwards to the variant in which x, b,
// and y all have that one type.
#define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y )
#define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y )
#define bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y )
#define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y )

#endif
// end bli_xpbys.h
// begin bli_xpbyjs.h

#ifndef BLIS_XPBYJS_H
#define BLIS_XPBYJS_H

// xpbyjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; 
++ii ) bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); 
} else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( 
dim_t jj = 0; jj < n; ++jj ) bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } 
else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( 
dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end bli_copys_mxn.h // begin bli_scal2s_mxn.h #ifndef BLIS_SCAL2S_MXN_H #define BLIS_SCAL2S_MXN_H // scal2s_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \ ctype* restrict y, const inc_t rs_y, const inc_t cs_y \ ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( scal2s_mxn ) #endif // end bli_scal2s_mxn.h // begin bli_xpbys_mxn.h #ifndef BLIS_XPBYS_MXN_H #define BLIS_XPBYS_MXN_H // xpbys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (?ss) ------------------------------------------------------------ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?dd) ------------------------------------------------------------ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?cc) ------------------------------------------------------------ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?zz) ------------------------------------------------------------ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } #endif // end bli_xpbys_mxn.h // begin bli_xpbys_mxn_uplo.h #ifndef BLIS_XPBYS_MXN_UPLO_H #define BLIS_XPBYS_MXN_UPLO_H // xpbys_mxn_u #define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 
0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } // xpbys_mxn_l #define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } 
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_l( diagoff, m, n, 
x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #endif // end bli_xpbys_mxn_uplo.h // -- "broadcast B" scalar macros -- // begin bli_bcastbbs_mxn.h #ifndef BLIS_BCASTBBS_MXN_H #define BLIS_BCASTBBS_MXN_H // bcastbbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = ldy; \ const dim_t ds_y = 1; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yi = y + i*incy; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yij = yi + j*ldy; \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( bcastbbs_mxn ) #endif // end bli_bcastbbs_mxn.h // begin bli_scal2bbs_mxn.h #ifndef BLIS_SCAL2BBS_MXN_H #define BLIS_SCAL2BBS_MXN_H // scal2bbs_mxn #undef GENTFUNCRO #define GENTFUNCRO( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( 
*yij, *yijd ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } \ } INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ const inc_t incx2 = 2 * incx; \ const inc_t ldx2 = 2 * ldx; \ \ const inc_t incy2 = 2 * incy; \ const inc_t ldy2 = 2 * ldy; \ \ ctype_r* restrict alpha_r = ( ctype_r* )alpha; \ ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \ ctype_r* restrict chi_r = ( ctype_r* )x; \ ctype_r* restrict chi_i = ( ctype_r* )x + 1; \ ctype_r* restrict psi_r = ( ctype_r* )y; \ ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ 
PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ } INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn ) #endif // end bli_scal2bbs_mxn.h // begin bli_set0bbs_mxn.h #ifndef BLIS_SET0BBS_MXN_H #define BLIS_SET0BBS_MXN_H // set0bbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yij = yj + i*incy; \ \ for ( dim_t p = 0; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,set0s)( *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( set0bbs_mxn ) #endif // end bli_set0bbs_mxn.h // -- 1m-specific scalar macros -- // 1e // begin bli_copy1es.h #ifndef BLIS_COPY1ES_H #define BLIS_COPY1ES_H // copy1es // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// In the macros below the first type character is the type of the source
// operand a, and the second is the type of the two destination operands
// bri and bir.
//
// Every variant whose source or destination is real (s or d) expands to an
// empty block, i.e. it is a no-op.

#define bli_sscopy1es( a, bri, bir ) {}
#define bli_dscopy1es( a, bri, bir ) {}
#define bli_cscopy1es( a, bri, bir ) {}
#define bli_zscopy1es( a, bri, bir ) {}

#define bli_sdcopy1es( a, bri, bir ) {}
#define bli_ddcopy1es( a, bri, bir ) {}
#define bli_cdcopy1es( a, bri, bir ) {}
#define bli_zdcopy1es( a, bri, bir ) {}

#define bli_sccopy1es( a, bri, bir ) {}
#define bli_dccopy1es( a, bri, bir ) {}

// Complex-to-complex variants. Two bli_??copyris calls are issued:
//   - the pair (  real(a), imag(a) ) is passed with the parts of bri;
//   - the pair ( -imag(a), real(a) ) is passed with the parts of bir.
// Presumably bli_??copyris( ar, ai, br, bi ) assigns br = ar and bi = ai
// (it is defined elsewhere in this header), which would make bri a copy of
// a and bir equal to a multiplied by the imaginary unit.
// NOTE: a is expanded four times, so it must not have side effects.

#define bli_cccopy1es( a, bri, bir ) \
{ \
	bli_cccopyris(  bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopy1es( a, bri, bir ) \
{ \
	bli_zccopyris(  bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopy1es( a, bri, bir ) {}
#define bli_dzcopy1es( a, bri, bir ) {}
#define bli_czcopy1es( a, bri, bir ) \
{ \
	bli_czcopyris(  bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopy1es( a, bri, bir ) \
{ \
	bli_zzcopyris(  bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-character shorthands for the homogeneous (cc, zz) variants.
#define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir )
#define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir )

#endif
// end bli_copy1es.h
// begin bli_copyj1es.h


#ifndef BLIS_COPYJ1ES_H
#define BLIS_COPYJ1ES_H

// copyj1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// In the macros below the first type character is the type of the source
// operand a, and the second is the type of the two destination operands
// bri and bir.
//
// Every variant whose source or destination is real (s or d) expands to an
// empty block, i.e. it is a no-op.

#define bli_sscopyj1es( a, bri, bir ) {}
#define bli_dscopyj1es( a, bri, bir ) {}
#define bli_cscopyj1es( a, bri, bir ) {}
#define bli_zscopyj1es( a, bri, bir ) {}

#define bli_sdcopyj1es( a, bri, bir ) {}
#define bli_ddcopyj1es( a, bri, bir ) {}
#define bli_cdcopyj1es( a, bri, bir ) {}
#define bli_zdcopyj1es( a, bri, bir ) {}

#define bli_sccopyj1es( a, bri, bir ) {}
#define bli_dccopyj1es( a, bri, bir ) {}

// Complex-to-complex variants with the sign of imag(a) flipped relative to
// the copy1es macros. Two bli_??copyris calls are issued:
//   - the pair ( real(a), -imag(a) ) is passed with the parts of bri;
//   - the pair ( imag(a),  real(a) ) is passed with the parts of bir.
// Presumably bli_??copyris( ar, ai, br, bi ) assigns br = ar and bi = ai
// (defined elsewhere in this header), which would make bri the conjugate of
// a and bir that conjugate multiplied by the imaginary unit.
// NOTE: a is expanded four times, so it must not have side effects.

#define bli_cccopyj1es( a, bri, bir ) \
{ \
	bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_cccopyris( bli_cimag(a),  bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopyj1es( a, bri, bir ) \
{ \
	bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_zccopyris( bli_zimag(a),  bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopyj1es( a, bri, bir ) {}
#define bli_dzcopyj1es( a, bri, bir ) {}
#define bli_czcopyj1es( a, bri, bir ) \
{ \
	bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_czcopyris( bli_cimag(a),  bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopyj1es( a, bri, bir ) \
{ \
	bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_zzcopyris( bli_zimag(a),  bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-character shorthands for the homogeneous (cc, zz) variants.
#define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir )
#define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir )

#endif
// end bli_copyj1es.h
// begin bli_invert1es.h


#ifndef BLIS_INVERT1ES_H
#define BLIS_INVERT1ES_H

// invert1es
//
// bri is first updated in place by bli_?invertris (presumably a complex
// reciprocal; the macro is defined elsewhere). bir is then refreshed from
// the new bri. Note the destination arguments of the copy are passed in
// swapped order -- imag(bir) first, then real(bir) -- so, assuming
// bli_?copyris( ar, ai, br, bi ) assigns br = ar and bi = ai, the result is
//   imag(bir) =  real(bri)
//   real(bir) = -imag(bri)

#define bli_cinvert1es( bri, bir ) \
{ \
	bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \
	bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \
}

#define bli_zinvert1es( bri, bir ) \
{ \
	bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \
	bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \
}

#endif
// end bli_invert1es.h
// begin bli_scal1es.h


#ifndef BLIS_SCAL1ES_H
#define BLIS_SCAL1ES_H

// scal1es
//
// yri is first updated in place by bli_?scalris with the real and imaginary
// parts of a (presumably yri := a * yri; the macro is defined elsewhere).
// yir is then refreshed from the new yri by passing the pair
// ( -imag(yri), real(yri) ) together with the parts of yir to bli_?copyris.

#define bli_cscal1es( a, yri, yir ) \
{ \
	bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \
	bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \
}

#define bli_zscal1es( a, yri, yir ) \
{ \
	bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \
	bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \
}

#endif
// end bli_scal1es.h
// begin bli_scal21es.h


#ifndef BLIS_SCAL21ES_H
#define BLIS_SCAL21ES_H

// scal21es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
//
// Layout of the table below:
// - Every variant with a real destination (??s, ??d) is an empty no-op.
// - Variants with a complex destination but with both a and x real
//   (ss?, sd?, ds?, dd?) are empty no-ops as well.
// - Every other variant issues two bli_cxscal2ris calls, each receiving
//   the real/imag parts of a, a pair built from x, and the parts of one
//   destination:
//     ( real(x), imag(x) ) is paired with yri;
//     (-imag(x), real(x) ) is paired with yir.
//   Presumably bli_cxscal2ris computes y := a * x on split real/imag
//   operands (it is defined elsewhere in this header).
// NOTE(review): the destination parts are always accessed through
//   bli_zreal/bli_zimag, even in the ??c group; presumably the real/imag
//   accessor macros are type-agnostic -- confirm against their definitions.
// NOTE: a and x are each expanded several times, so they must not have side
//   effects.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal21es( a, x, yri, yir ) {}
#define bli_sdsscal21es( a, x, yri, yir ) {}
#define bli_scsscal21es( a, x, yri, yir ) {}
#define bli_szsscal21es( a, x, yri, yir ) {}

#define bli_dssscal21es( a, x, yri, yir ) {}
#define bli_ddsscal21es( a, x, yri, yir ) {}
#define bli_dcsscal21es( a, x, yri, yir ) {}
#define bli_dzsscal21es( a, x, yri, yir ) {}

#define bli_cssscal21es( a, x, yri, yir ) {}
#define bli_cdsscal21es( a, x, yri, yir ) {}
#define bli_ccsscal21es( a, x, yri, yir ) {}
#define bli_czsscal21es( a, x, yri, yir ) {}

#define bli_zssscal21es( a, x, yri, yir ) {}
#define bli_zdsscal21es( a, x, yri, yir ) {}
#define bli_zcsscal21es( a, x, yri, yir ) {}
#define bli_zzsscal21es( a, x, yri, yir ) {}

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal21es( a, x, yri, yir ) {}
#define bli_sddscal21es( a, x, yri, yir ) {}
#define bli_scdscal21es( a, x, yri, yir ) {}
#define bli_szdscal21es( a, x, yri, yir ) {}

#define bli_dsdscal21es( a, x, yri, yir ) {}
#define bli_dddscal21es( a, x, yri, yir ) {}
#define bli_dcdscal21es( a, x, yri, yir ) {}
#define bli_dzdscal21es( a, x, yri, yir ) {}

#define bli_csdscal21es( a, x, yri, yir ) {}
#define bli_cddscal21es( a, x, yri, yir ) {}
#define bli_ccdscal21es( a, x, yri, yir ) {}
#define bli_czdscal21es( a, x, yri, yir ) {}

#define bli_zsdscal21es( a, x, yri, yir ) {}
#define bli_zddscal21es( a, x, yri, yir ) {}
#define bli_zcdscal21es( a, x, yri, yir ) {}
#define bli_zzdscal21es( a, x, yri, yir ) {}

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal21es( a, x, yri, yir ) {}
#define bli_sdcscal21es( a, x, yri, yir ) {}
#define bli_sccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dscscal21es( a, x, yri, yir ) {}
#define bli_ddcscal21es( a, x, yri, yir ) {}
#define bli_dccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cscscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zscscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal21es( a, x, yri, yir ) {}
#define bli_sdzscal21es( a, x, yri, yir ) {}
#define bli_sczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dszscal21es( a, x, yri, yir ) {}
#define bli_ddzscal21es( a, x, yri, yir ) {}
#define bli_dczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cszscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zszscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// Single-character shorthands for the homogeneous (ccc, zzz) variants.
#define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir )
#define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir )

#endif
// end bli_scal21es.h
// begin bli_scal2j1es.h


#ifndef BLIS_SCAL2J1ES_H
#define BLIS_SCAL2J1ES_H

// scal2j1es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) #define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) #endif // end bli_scal2j1es.h // 1r // begin bli_copy1rs.h #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copy1rs.h // begin bli_copyj1rs.h #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copyj1rs.h // begin bli_invert1rs.h #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif // end bli_invert1rs.h // begin bli_scal1rs.h #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), 
bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif // end bli_scal1rs.h // begin bli_scal21rs.h #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif // end bli_scal21rs.h // begin bli_scal2j1rs.h #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif // end bli_scal2j1rs.h // 1m (1e or 1r) // begin bli_invert1ms_mxn_diag.h #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t 
i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_invert1ms_mxn_diag.h // begin bli_scal1ms_mxn.h #ifndef BLIS_SCAL1MS_MXN_H #define BLIS_SCAL1MS_MXN_H // scal1ms_mxn #define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; 
\ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #endif // end bli_scal1ms_mxn.h // begin bli_scal21ms_mxn.h #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } 
else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), 
*(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif // end bli_scal21ms_mxn.h // begin bli_scal21ms_mxn_diag.h #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_scal21ms_mxn_diag.h // begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING //thread communicators may be implementation dependent #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_single.h // begin bli_thrcomm_openmp.h #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H // Define thrcomm_t for situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #include // skipped // Define thrcomm_t for tree barriers and non-tree barriers. #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. 
//volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif // end bli_thrcomm_openmp.h // begin bli_thrcomm_pthreads.h #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H // Define thrcomm_t for situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS #ifdef BLIS_USE_PTHREAD_BARRIER struct thrcomm_s { void* sent_object; dim_t n_threads; bli_pthread_barrier_t barrier; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_pthreads.h // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. 
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads );
void       bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
void       bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm );
void       bli_thrcomm_cleanup( thrcomm_t* comm );

// The barrier and broadcast entry points are the only thrcomm functions in
// this group that carry the BLIS_EXPORT_BLIS annotation.
BLIS_EXPORT_BLIS void  bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );

void       bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );

#endif
// end bli_thrcomm.h

// Include thread info (thrinfo_t) object definitions and prototypes.
// begin bli_thrinfo.h

#ifndef BLIS_THRINFO_H
#define BLIS_THRINFO_H

// Thread info structure definition
struct thrinfo_s
{
    // The thread communicator for the other threads sharing the same work
    // at this level.
    thrcomm_t*         ocomm;

    // Our thread id within the ocomm thread communicator.
    dim_t              ocomm_id;

    // The number of distinct threads used to parallelize the loop.
    dim_t              n_way;

    // What we're working on.
    dim_t              work_id;

    // When freeing, should the communicators in this node be freed? Usually,
    // this field is true, but when nodes are created that share the same
    // communicators as other nodes (such as with packm nodes), this is set
    // to false.
    bool               free_comm;

    // The bszid_t to help identify the node. This is mostly only useful when
    // debugging or tracing the allocation and release of thrinfo_t nodes.
    bszid_t            bszid;

    // Links to other thrinfo_s nodes.
    // NOTE(review): presumably the child nodes of this node in the thrinfo
    // tree -- confirm against the code that builds the tree.
    struct thrinfo_s*  sub_prenode;
    struct thrinfo_s*  sub_node;
};
typedef struct thrinfo_s thrinfo_t;

//
// thrinfo_t functions
// NOTE: The naming of these should be made consistent at some point.
// (ie: bli_thrinfo_ vs. bli_thread_)
//

// thrinfo_t query (field only)
// Each accessor below returns a single field of *t; none of them checks t
// for NULL.

// Returns n_threads of the communicator that t->ocomm points to (so
// t->ocomm is dereferenced as well).
BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t )
{
    return (t->ocomm)->n_threads;
}

BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t )
{
    return t->ocomm_id;
}

BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t )
{
    return t->n_way;
}

BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t )
{
    return t->work_id;
}

BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t )
{
    return t->ocomm;
}

BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t )
{
    return t->free_comm;
}

// NOTE(review): the bszid field is declared bszid_t but is returned here
// through a dim_t return type, whereas bli_thrinfo_set_bszid() takes a
// bszid_t. Confirm whether callers depend on the dim_t return type.
BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t )
{
    return t->bszid;
}

BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t )
{
    return t->sub_node;
}

BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t )
{
    return t->sub_prenode;
}

// thrinfo_t query (complex)

// True when this thread's id within its ocomm communicator is 0.
BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t )
{
    return t->ocomm_id == 0;
}

// thrinfo_t modification
// Each mutator below stores its first argument into the same-named field of
// *t. Note the argument order: the new value comes first, the thrinfo_t
// pointer last.

BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t )
{
    t->ocomm = ocomm;
}

BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t )
{
    t->ocomm_id = ocomm_id;
}

BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t )
{
    t->n_way = n_way;
}

BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t )
{
    t->work_id = work_id;
}

BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t )
{
    t->free_comm = free_comm;
}

BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t )
{
    t->bszid = bszid;
}

BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t )
{
    t->sub_node = sub_node;
}

BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t )
{
    t->sub_prenode = sub_prenode;
}

// other thrinfo_t-related functions

// Forwards to bli_thrcomm_bcast() with this thread's ocomm_id, the pointer
// p, and this thread's ocomm, and returns whatever that call returns.
BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p )
{
    return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm );
}

// Forwards to bli_thrcomm_barrier() with this thread's ocomm_id and ocomm.
BLIS_INLINE void bli_thread_barrier( thrinfo_t* t )
{
    bli_thrcomm_barrier( t->ocomm_id, t->ocomm );
}

//
// Prototypes for level-3 thrinfo functions not specific to any operation.
//

// The parameters of bli_thrinfo_create() and bli_thrinfo_init() after
// rntm/thread carry the same names as the fields of struct thrinfo_s.
thrinfo_t* bli_thrinfo_create
     (
       rntm_t*    rntm,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init
     (
       thrinfo_t* thread,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init_single
     (
       thrinfo_t* thread
     );

void bli_thrinfo_free
     (
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// -----------------------------------------------------------------------------

void bli_thrinfo_grow
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_rgrow
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_rgrow_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

// -----------------------------------------------------------------------------

// The two prototypes below are compiled out.
#if 0
void bli_thrinfo_grow_tree
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

void bli_thrinfo_grow_tree_ic
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );
#endif

#endif
// end bli_thrinfo.h
// begin bli_thrinfo_sup.h

#ifndef BLIS_THRINFO_SUP_H
#define BLIS_THRINFO_SUP_H

//
// Prototypes for level-3 thrinfo sup functions.
//
// These take bszid_t pointers in the positions where the non-sup
// prototypes above take cntl_t pointers.

void bli_thrinfo_sup_grow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_sup_rgrow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_sup_create_for_cntl
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_chl,
       thrinfo_t* thread_par
     );

#endif
// end bli_thrinfo_sup.h

// Include some operation-specific thrinfo_t prototypes.
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
void bli_l3_thread_decorator ( l3int_t func, opid_t family, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // Include definitions specific to the method of multithreading for the // conventional code path. // begin bli_l3_decor_single.h #ifndef BLIS_L3_DECOR_SINGLE_H #define BLIS_L3_DECOR_SINGLE_H // Definitions specific to situations when multithreading is disabled. #ifndef BLIS_ENABLE_MULTITHREADING #endif #endif // end bli_l3_decor_single.h // begin bli_l3_decor_openmp.h #ifndef BLIS_L3_DECOR_OPENMP_H #define BLIS_L3_DECOR_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP void bli_l3_thread_decorator_thread_check ( dim_t n_threads, dim_t tid, thrcomm_t* gl_comm, rntm_t* rntm ); #endif #endif // end bli_l3_decor_openmp.h // begin bli_l3_decor_pthreads.h #ifndef BLIS_L3_DECOR_PTHREADS_H #define BLIS_L3_DECOR_PTHREADS_H // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. void* bli_l3_thread_entry( void* data_void ); #endif #endif // end bli_l3_decor_pthreads.h #endif // end bli_l3_decor.h // Include the level-3 thread decorator and related definitions and prototypes // for the sup code path. // begin bli_l3_sup_decor.h #ifndef BLIS_L3_SUP_DECOR_H #define BLIS_L3_SUP_DECOR_H // -- sup definitions ---------------------------------------------------------- // Level-3 sup internal function type. typedef err_t (*l3supint_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // Level-3 sup thread decorator prototype. err_t bli_l3_sup_thread_decorator ( l3supint_t func, opid_t family, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // Include definitions specific to the method of multithreading for the // sup code path. 
// begin bli_l3_sup_decor_single.h #ifndef BLIS_L3_SUP_DECOR_SINGLE_H #define BLIS_L3_SUP_DECOR_SINGLE_H // Definitions specific to situations when multithreading is disabled. #ifndef BLIS_ENABLE_MULTITHREADING #endif #endif // end bli_l3_sup_decor_single.h // begin bli_l3_sup_decor_openmp.h #ifndef BLIS_L3_SUP_DECOR_OPENMP_H #define BLIS_L3_SUP_DECOR_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #endif #endif // end bli_l3_sup_decor_openmp.h // begin bli_l3_sup_decor_pthreads.h #ifndef BLIS_L3_SUP_DECOR_PTHREADS_H #define BLIS_L3_SUP_DECOR_PTHREADS_H // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. void* bli_l3_sup_thread_entry( void* data_void ); #endif #endif // end bli_l3_sup_decor_pthreads.h #endif // end bli_l3_sup_decor.h // Initialization-related prototypes. void bli_thread_init( void ); void bli_thread_finalize( void ); // Thread range-related prototypes. 
BLIS_EXPORT_BLIS void bli_thread_range_sub ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end ); #undef GENPROT #define GENPROT( opname ) \ \ siz_t PASTEMAC0( opname ) \ ( \ dir_t direct, \ thrinfo_t* thr, \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntl_t* cntl, \ cntx_t* cntx, \ dim_t* start, \ dim_t* end \ ); GENPROT( thread_range_mdim ) GENPROT( thread_range_ndim ) #undef GENPROT #define GENPROT( opname ) \ \ siz_t PASTEMAC0( opname ) \ ( \ thrinfo_t* thr, \ obj_t* a, \ blksz_t* bmult, \ dim_t* start, \ dim_t* end \ ); GENPROT( thread_range_l2r ) GENPROT( thread_range_r2l ) GENPROT( thread_range_t2b ) GENPROT( thread_range_b2t ) GENPROT( thread_range_weighted_l2r ) GENPROT( thread_range_weighted_r2l ) GENPROT( thread_range_weighted_t2b ) GENPROT( thread_range_weighted_b2t ) dim_t bli_thread_range_width_l ( doff_t diagoff_j, dim_t m, dim_t n_j, dim_t j, dim_t n_way, dim_t bf, dim_t bf_left, double area_per_thr, bool handle_edge_low ); siz_t bli_find_area_trap_l ( dim_t m, dim_t n, doff_t diagoff ); siz_t bli_thread_range_weighted_sub ( thrinfo_t* restrict thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* restrict j_start_thr, dim_t* restrict j_end_thr ); // ----------------------------------------------------------------------------- // Factorization and partitioning prototypes typedef struct { dim_t n; dim_t sqrt_n; dim_t f; } bli_prime_factors_t; void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors); dim_t bli_next_prime_factor(bli_prime_factors_t* factors); bool bli_is_prime( dim_t n ); void bli_thread_partition_2x2 ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_slow ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_fast ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); // 
----------------------------------------------------------------------------- dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); // ----------------------------------------------------------------------------- BLIS_INLINE void bli_thread_range_jrir_rr ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; } BLIS_INLINE void bli_thread_range_jrir_sl ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use contiguous slab partitioning of jr/ir loops. bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); *inc = 1; } BLIS_INLINE void bli_thread_range_jrir ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Define a general-purpose version of bli_thread_range_jrir() whose // definition depends on whether slab or round-robin partitioning was // requested at configure-time. 
#ifdef BLIS_ENABLE_JRIR_SLAB bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); #else bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); #endif } #if 0 BLIS_INLINE void bli_thread_range_weighted_jrir ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { #ifdef BLIS_ENABLE_JRIR_SLAB // Use contiguous slab partitioning for jr/ir loops. bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, handle_edge_low, start, end ); *start = *start / bf; *inc = 1; if ( *end % bf ) *end = *end / bf + 1; else *end = *end / bf; #else // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; #endif } #endif #endif // end bli_thread.h // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Constant definitions -- // begin bli_extern_defs.h #ifndef BLIS_EXTERN_DEFS_H #define BLIS_EXTERN_DEFS_H BLIS_EXPORT_BLIS extern obj_t BLIS_TWO; BLIS_EXPORT_BLIS extern obj_t BLIS_ONE; //BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO; //BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO; BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif // end bli_extern_defs.h // -- BLIS architecture/kernel definitions -- // begin bli_l1v_ker_prot.h // // Define template prototypes for level-1v kernels. 
//

// Each *_KER_PROT( ctype, ch, opname ) macro below expands to one function
// prototype whose name is produced by PASTEMAC(ch,opname) and whose operand
// pointers have element type ctype.
//
// NOTE: No macro ends with a line continuation after its closing ");", so a
// definition never depends on being followed by a blank line to terminate.

#define ADDV_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define AMAXV_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t* restrict cntx \
     );

#define AXPBYV_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define AXPYV_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define COPYV_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define DOTV_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx \
     );

#define DOTXV_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx \
     );

#define INVERTV_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx \
     );

#define SCALV_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx \
     );

#define SCAL2V_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define SETV_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx \
     );

#define SUBV_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define SWAPV_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define XPBYV_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

// end bli_l1v_ker_prot.h
// begin bli_l1f_ker_prot.h

//
// Define template prototypes for level-1f kernels.
//

// Each *_KER_PROT( ctype, ch, opname ) macro below expands to one function
// prototype named by PASTEMAC(ch,opname), with operand pointers of element
// type ctype.

#define AXPY2V_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
( \
    conj_t           conjx, \
    conj_t           conjy, \
    dim_t            n, \
    ctype*  restrict alphax, \
    ctype*  restrict alphay, \
    ctype*  restrict x, inc_t incx, \
    ctype*  restrict y, inc_t incy, \
    ctype*  restrict z, inc_t incz, \
    cntx_t* restrict cntx \
);

#define AXPYF_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
( \
    conj_t           conja, \
    conj_t           conjx, \
    dim_t            m, \
    dim_t            b_n, \
    ctype*  restrict alpha, \
    ctype*  restrict a, inc_t inca, inc_t lda, \
    ctype*  restrict x, inc_t incx, \
    ctype*  restrict y, inc_t incy, \
    cntx_t* restrict cntx \
);

#define DOTAXPYV_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
( \
    conj_t           conjxt, \
    conj_t           conjx, \
    conj_t           conjy, \
    dim_t            n, \
    ctype*  restrict alpha, \
    ctype*  restrict x, inc_t incx, \
    ctype*  restrict y, inc_t incy, \
    ctype*  restrict rho, \
    ctype*  restrict z, inc_t incz, \
    cntx_t* restrict cntx \
);

#define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
( \
    conj_t           conjat, \
    conj_t           conja, \
    conj_t           conjw, \
    conj_t           conjx, \
    dim_t            m, \
    dim_t            b_n, \
    ctype*  restrict alpha, \
    ctype*  restrict a, inc_t inca, inc_t lda, \
    ctype*  restrict w, inc_t incw, \
    ctype*  restrict x, inc_t incx, \
    ctype*  restrict beta, \
    ctype*  restrict y, inc_t incy, \
    ctype*  restrict z, inc_t incz, \
    cntx_t* restrict cntx \
);

#define DOTXF_KER_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
( \
    conj_t           conjat, \
    conj_t           conjx, \
    dim_t            m, \
    dim_t            b_n, \
    ctype*  restrict alpha, \
    ctype*  restrict a, inc_t inca, inc_t lda, \
    ctype*  restrict x, inc_t incx, \
    ctype*  restrict beta, \
    ctype*  restrict y, inc_t incy, \
    cntx_t* restrict cntx \
);

// end bli_l1f_ker_prot.h
// begin bli_l1m_ker_prot.h

//
// Define template prototypes for level-1m kernels.
//

// native packm kernels

#define PACKM_KER_PROT( ctype, ch, varname ) \
void PASTEMAC(ch,varname) \
( \
    conj_t           conja, \
    pack_t           schema, \
    dim_t            cdim, \
    dim_t            n, \
    dim_t            n_max, \
    ctype*  restrict kappa, \
    ctype*  restrict a, inc_t inca, inc_t lda, \
    ctype*  restrict p,             inc_t ldp, \
    cntx_t* restrict cntx \
);

// native unpackm kernels

#define UNPACKM_KER_PROT( ctype, ch, varname ) \
void PASTEMAC(ch,varname) \
( \
    conj_t           conja, \
    dim_t            n, \
    ctype*  restrict kappa, \
    ctype*  restrict p,             inc_t ldp, \
    ctype*  restrict a, inc_t inca, inc_t lda, \
    cntx_t* restrict cntx \
);

// 1e/1r packm kernels
// The parameter list is the same as that of PACKM_KER_PROT.

#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \
void PASTEMAC(ch,varname) \
( \
    conj_t           conja, \
    pack_t           schema, \
    dim_t            cdim, \
    dim_t            n, \
    dim_t            n_max, \
    ctype*  restrict kappa, \
    ctype*  restrict a, inc_t inca, inc_t lda, \
    ctype*  restrict p,             inc_t ldp, \
    cntx_t* restrict cntx \
);

// end bli_l1m_ker_prot.h
// begin bli_l3_ukr_prot.h

//
// Define template prototypes for level-3 micro-kernels.
//

// Single-type form: forwards to GEMM_UKR_PROT2 with ctype used for both the
// input and the output type.
#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)

// Two-type form: a and b use ctype_in; alpha, beta and c use ctype_out.
#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
void PASTEMAC(ch,opname) \
( \
    dim_t               m, \
    dim_t               n, \
    dim_t               k, \
    ctype_out* restrict alpha, \
    ctype_in*  restrict a, \
    ctype_in*  restrict b, \
    ctype_out* restrict beta, \
    ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
    auxinfo_t* restrict data, \
    cntx_t*    restrict cntx \
);

#define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
( \
    dim_t               m, \
    dim_t               n, \
    dim_t               k, \
    ctype*     restrict alpha, \
    ctype*     restrict a1x, \
    ctype*     restrict a11, \
    ctype*     restrict bx1, \
    ctype*     restrict b11, \
    ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
    auxinfo_t* restrict data, \
    cntx_t*    restrict cntx \
);

#define TRSM_UKR_PROT( ctype, ch, opname ) \
void PASTEMAC(ch,opname) \
( \
    ctype*     restrict a, \
    ctype*     restrict b, \
    ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
    auxinfo_t* restrict data, \
    cntx_t*    restrict cntx \
);

// end bli_l3_ukr_prot.h
//
begin bli_l3_sup_ker_prot.h // // Define template prototypes for level-3 kernels on small/unpacked matrices. // #define GEMMSUP_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); // end bli_l3_sup_ker_prot.h // begin bli_arch_config_pre.h #ifndef BLIS_ARCH_CONFIG_PRE_H #define BLIS_ARCH_CONFIG_PRE_H // -- Naming-related kernel definitions ---------------------------------------- // The default suffix appended to reference kernels. #define BLIS_REF_SUFFIX _ref // A suffix used for labeling certain induced method aware functions. #define BLIS_IND_SUFFIX _ind // Add an underscore to the BLIS kernel set string, if it was defined. #ifdef BLIS_CNAME #define BLIS_CNAME_INFIX PASTECH(_,BLIS_CNAME) #endif // Combine the CNAME and _ref for convenience to the code that defines // reference kernels. //#define BLIS_CNAME_REF_SUFFIX PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX) // -- Prototype-generating macro definitions ----------------------------------- // Prototype-generating macro for bli_cntx_init_*() functions. 
#define CNTX_INIT_PROTS( archname ) \ \ void PASTEMAC(cntx_init_,archname) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \ ( \ ind_t method, \ cntx_t* cntx \ ); #endif // end bli_arch_config_pre.h // begin bli_arch_config.h #ifndef BLIS_ARCH_CONFIG_H #define BLIS_ARCH_CONFIG_H // // -- Context initialization prototypes ---------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_CONFIG_SKX CNTX_INIT_PROTS( skx ) #endif #ifdef BLIS_CONFIG_KNL CNTX_INIT_PROTS( knl ) #endif #ifdef BLIS_CONFIG_KNC CNTX_INIT_PROTS( knc ) #endif #ifdef BLIS_CONFIG_HASWELL CNTX_INIT_PROTS( haswell ) #endif #ifdef BLIS_CONFIG_SANDYBRIDGE CNTX_INIT_PROTS( sandybridge ) #endif #ifdef BLIS_CONFIG_PENRYN CNTX_INIT_PROTS( penryn ) #endif // -- AMD64 architectures -- #ifdef BLIS_CONFIG_ZEN3 CNTX_INIT_PROTS( zen3 ) #endif #ifdef BLIS_CONFIG_ZEN2 CNTX_INIT_PROTS( zen2 ) #endif #ifdef BLIS_CONFIG_ZEN CNTX_INIT_PROTS( zen ) #endif #ifdef BLIS_CONFIG_EXCAVATOR CNTX_INIT_PROTS( excavator ) #endif #ifdef BLIS_CONFIG_STEAMROLLER CNTX_INIT_PROTS( steamroller ) #endif #ifdef BLIS_CONFIG_PILEDRIVER CNTX_INIT_PROTS( piledriver ) #endif #ifdef BLIS_CONFIG_BULLDOZER CNTX_INIT_PROTS( bulldozer ) #endif // -- ARM architectures -- #ifdef BLIS_CONFIG_ARMSVE CNTX_INIT_PROTS( armsve ) #endif #ifdef BLIS_CONFIG_A64FX CNTX_INIT_PROTS( a64fx ) #endif #ifdef BLIS_CONFIG_FIRESTORM CNTX_INIT_PROTS( firestorm ) #endif #ifdef BLIS_CONFIG_THUNDERX2 CNTX_INIT_PROTS( thunderx2 ) #endif #ifdef BLIS_CONFIG_CORTEXA57 CNTX_INIT_PROTS( cortexa57 ) #endif #ifdef BLIS_CONFIG_CORTEXA53 CNTX_INIT_PROTS( cortexa53 ) #endif #ifdef BLIS_CONFIG_CORTEXA15 CNTX_INIT_PROTS( cortexa15 ) #endif #ifdef BLIS_CONFIG_CORTEXA9 CNTX_INIT_PROTS( cortexa9 ) #endif // -- IBM Power -- #ifdef BLIS_CONFIG_POWER10 CNTX_INIT_PROTS( power10 ) #endif #ifdef BLIS_CONFIG_POWER9 CNTX_INIT_PROTS( power9 ) #endif #ifdef 
BLIS_CONFIG_POWER7 CNTX_INIT_PROTS( power7 ) #endif // -- IBM BG/Q -- #ifdef BLIS_CONFIG_BGQ CNTX_INIT_PROTS( bgq ) #endif // -- Generic -- #ifdef BLIS_CONFIG_GENERIC CNTX_INIT_PROTS( generic ) #endif // // -- Architecture family-specific headers ------------------------------------- // // -- x86_64 families -- #ifdef BLIS_FAMILY_INTEL64 #include "bli_family_intel64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64 #include "bli_family_amd64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64_LEGACY #include "bli_family_amd64_legacy.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64 #include "bli_family_x86_64.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_SKX #include "bli_family_x86_64_no_skx.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN2 #include "bli_family_x86_64_no_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN3 #include "bli_family_x86_64_no_zen3.h" // skipped #endif // -- Intel64 architectures -- #ifdef BLIS_FAMILY_SKX #include "bli_family_skx.h" // skipped #endif #ifdef BLIS_FAMILY_KNL #include "bli_family_knl.h" // skipped #endif #ifdef BLIS_FAMILY_KNC #include "bli_family_knc.h" // skipped #endif #ifdef BLIS_FAMILY_HASWELL #include "bli_family_haswell.h" // skipped #endif #ifdef BLIS_FAMILY_SANDYBRIDGE #include "bli_family_sandybridge.h" // skipped #endif #ifdef BLIS_FAMILY_PENRYN #include "bli_family_penryn.h" // skipped #endif // -- AMD64 architectures -- #ifdef BLIS_FAMILY_ZEN3 #include "bli_family_zen3.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN2 #include "bli_family_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN #include "bli_family_zen.h" // skipped #endif #ifdef BLIS_FAMILY_EXCAVATOR #include "bli_family_excavator.h" // skipped #endif #ifdef BLIS_FAMILY_STEAMROLLER #include "bli_family_steamroller.h" // skipped #endif #ifdef BLIS_FAMILY_PILEDRIVER #include "bli_family_piledriver.h" // skipped #endif #ifdef BLIS_FAMILY_BULLDOZER #include "bli_family_bulldozer.h" // skipped #endif // -- ARM families -- #ifdef BLIS_FAMILY_ARM64 #include 
"bli_family_arm64.h" // skipped #endif #ifdef BLIS_FAMILY_ARM32 #include "bli_family_arm32.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_FAMILY_ARMSVE #include "bli_family_armsve.h" // skipped #endif #ifdef BLIS_FAMILY_A64FX #include "bli_family_a64fx.h" // skipped #endif #ifdef BLIS_FAMILY_FIRESTORM #include "bli_family_firestorm.h" // skipped #endif #ifdef BLIS_FAMILY_THUNDERX2 #include "bli_family_thunderx2.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA57 #include "bli_family_cortexa57.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA53 #include "bli_family_cortexa53.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA15 #include "bli_family_cortexa15.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA9 #include "bli_family_cortexa9.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_FAMILY_POWER10 #include "bli_family_power10.h" // skipped #endif #ifdef BLIS_FAMILY_POWER9 // begin bli_family_power9.h #define BLIS_POOL_ADDR_ALIGN_SIZE_A 4096 #define BLIS_POOL_ADDR_ALIGN_SIZE_B 4096 #define BLIS_POOL_ADDR_OFFSET_SIZE_A 192 #define BLIS_POOL_ADDR_OFFSET_SIZE_B 152 // Disable right-side hemm, symm, and trmm[3] to accommodate the broadcasting of // elements within the packed matrix B. 
#define BLIS_DISABLE_HEMM_RIGHT #define BLIS_DISABLE_SYMM_RIGHT #define BLIS_DISABLE_TRMM_RIGHT #define BLIS_DISABLE_TRMM3_RIGHT // end bli_family_power9.h #endif #ifdef BLIS_FAMILY_POWER7 #include "bli_family_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_FAMILY_BGQ #include "bli_family_bgq.h" // skipped #endif // -- Generic -- #ifdef BLIS_FAMILY_GENERIC #include "bli_family_generic.h" // skipped #endif // // -- kernel set prototypes ---------------------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_KERNELS_SKX #include "bli_kernels_skx.h" // skipped #endif #ifdef BLIS_KERNELS_KNL #include "bli_kernels_knl.h" // skipped #endif #ifdef BLIS_KERNELS_KNC #include "bli_kernels_knc.h" // skipped #endif #ifdef BLIS_KERNELS_HASWELL #include "bli_kernels_haswell.h" // skipped #endif #ifdef BLIS_KERNELS_SANDYBRIDGE #include "bli_kernels_sandybridge.h" // skipped #endif #ifdef BLIS_KERNELS_PENRYN #include "bli_kernels_penryn.h" // skipped #endif // -- AMD64 architectures -- #ifdef BLIS_KERNELS_ZEN2 #include "bli_kernels_zen2.h" // skipped #endif #ifdef BLIS_KERNELS_ZEN #include "bli_kernels_zen.h" // skipped #endif //#ifdef BLIS_KERNELS_EXCAVATOR //#include "bli_kernels_excavator.h" //#endif //#ifdef BLIS_KERNELS_STEAMROLLER //#include "bli_kernels_steamroller.h" //#endif #ifdef BLIS_KERNELS_PILEDRIVER #include "bli_kernels_piledriver.h" // skipped #endif #ifdef BLIS_KERNELS_BULLDOZER #include "bli_kernels_bulldozer.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_KERNELS_ARMSVE #include "bli_kernels_armsve.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV8A #include "bli_kernels_armv8a.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV7A #include "bli_kernels_armv7a.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_KERNELS_POWER10 #include "bli_kernels_power10.h" // skipped #endif #ifdef BLIS_KERNELS_POWER9 // begin bli_kernels_power9.h // -- level-3 -- // gemm (asm d12x6) GEMM_UKR_PROT( double, d, gemm_power9_asm_12x6 )// end 
bli_kernels_power9.h #endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_KERNELS_BGQ #include "bli_kernels_bgq.h" // skipped #endif #endif // end bli_arch_config.h // begin bli_kernel_macro_defs.h #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H // -- Define default threading parameters -------------------------------------- // -- Conventional (large code path) values -- // These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n // dimensions for the purposes of factorizing the total number of threads into // ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these // macros are used. #ifndef BLIS_THREAD_RATIO_M #define BLIS_THREAD_RATIO_M 1 #endif #ifndef BLIS_THREAD_RATIO_N #define BLIS_THREAD_RATIO_N 1 #endif // These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of // parallelism allowed when performing automatic factorization. See bli_rntm.c // to see how these macros are used. #ifndef BLIS_THREAD_MAX_IR #define BLIS_THREAD_MAX_IR 1 #endif #ifndef BLIS_THREAD_MAX_JR #define BLIS_THREAD_MAX_JR 4 #endif #if 0 // -- Skinny/small possibly-unpacked (sup code path) values -- #ifndef BLIS_THREAD_SUP_RATIO_M #define BLIS_THREAD_SUP_RATIO_M 1 #endif #ifndef BLIS_THREAD_SUP_RATIO_N #define BLIS_THREAD_SUP_RATIO_N 2 #endif #ifndef BLIS_THREAD_SUP_MAX_IR #define BLIS_THREAD_SUP_MAX_IR 1 #endif #ifndef BLIS_THREAD_SUP_MAX_JR #define BLIS_THREAD_SUP_MAX_JR 8 #endif #endif // -- Memory allocation -------------------------------------------------------- // hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with // libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND // was explicitly defined. #ifdef BLIS_DISABLE_MEMKIND #undef BLIS_ENABLE_MEMKIND #endif #ifdef BLIS_ENABLE_MEMKIND #include // skipped #endif // Memory allocation functions. 
These macros define the three types of // malloc()-style functions, and their free() counterparts: one for each // type of memory to be allocated. // NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING // THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc() // and free(): // // void* malloc( size_t size ); // void free( void* p ); // // This allocation function is called to allocate memory for blocks within // BLIS's internal memory pools. #ifndef BLIS_MALLOC_POOL // If use of libmemkind was enabled at configure-time, the default // memory allocation function for memory pools should be hbw_malloc() // instead of malloc(). #ifdef BLIS_ENABLE_MEMKIND #define BLIS_MALLOC_POOL hbw_malloc #else #define BLIS_MALLOC_POOL malloc #endif #endif #ifndef BLIS_FREE_POOL // If use of libmemkind was enabled at configure-time, the default // memory deallocation function for memory pools should be hbw_free() // instead of free(). #ifdef BLIS_ENABLE_MEMKIND #define BLIS_FREE_POOL hbw_free #else #define BLIS_FREE_POOL free #endif #endif // This allocation function is called to allocate memory for internally- // used objects and structures, such as control tree nodes. #ifndef BLIS_MALLOC_INTL #define BLIS_MALLOC_INTL malloc #endif #ifndef BLIS_FREE_INTL #define BLIS_FREE_INTL free #endif // This allocation function is called to allocate memory for objects // created by user-level API functions, such as bli_obj_create(). #ifndef BLIS_MALLOC_USER #define BLIS_MALLOC_USER malloc #endif #ifndef BLIS_FREE_USER #define BLIS_FREE_USER free #endif // -- Other system-related definitions ----------------------------------------- // Size of a virtual memory page. This is used to align blocks within the // memory pools. #ifndef BLIS_PAGE_SIZE #define BLIS_PAGE_SIZE 4096 #endif // The maximum number of named SIMD vector registers available for use. 
// When configuring with umbrella configuration families, this should be // set to the maximum number of registers across all sub-configurations in // the family. #ifndef BLIS_SIMD_MAX_NUM_REGISTERS #define BLIS_SIMD_MAX_NUM_REGISTERS 32 #endif // The maximum size (in bytes) of each SIMD vector. // When configuring with umbrella configuration families, this should be // set to the maximum SIMD size across all sub-configurations in the family. #ifndef BLIS_SIMD_MAX_SIZE #define BLIS_SIMD_MAX_SIZE 64 #endif // Alignment size (in bytes) needed by the instruction set for aligned // SIMD/vector instructions. #ifndef BLIS_SIMD_ALIGN_SIZE #define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_MAX_SIZE #endif // The maximum size in bytes of local stack buffers within macro-kernel // functions. These buffers are usually used to store a temporary copy // of a single microtile. The reason we multiply by 2 is to handle induced // methods, where we use real domain register blocksizes in units of // complex elements. Specifically, the macro-kernels will need this larger // micro-tile footprint, even though the virtual micro-kernels will only // ever be writing to half (real or imaginary part) at a time. #ifndef BLIS_STACK_BUF_MAX_SIZE #define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_MAX_NUM_REGISTERS * \ BLIS_SIMD_MAX_SIZE * 2 ) #endif // Alignment size used to align local stack buffers within macro-kernel // functions. #ifndef BLIS_STACK_BUF_ALIGN_SIZE #define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE #endif // Alignment size used when allocating memory via BLIS_MALLOC_USER. // To disable heap alignment, set this to 1. #ifndef BLIS_HEAP_ADDR_ALIGN_SIZE #define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE #endif // Alignment size used when sizing leading dimensions of memory allocated // via BLIS_MALLOC_USER. 
#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE #define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE #endif // Alignment sizes used when allocating blocks to the internal memory // pool, via BLIS_MALLOC_POOL. #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A #define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B #define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C #define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN #define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE #endif // Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*. #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A #define BLIS_POOL_ADDR_OFFSET_SIZE_A 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B #define BLIS_POOL_ADDR_OFFSET_SIZE_B 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C #define BLIS_POOL_ADDR_OFFSET_SIZE_C 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif #endif // end bli_kernel_macro_defs.h // -- Base operation prototypes -- // begin bli_init.h BLIS_EXPORT_BLIS void bli_init( void ); BLIS_EXPORT_BLIS void bli_finalize( void ); void bli_init_auto( void ); void bli_finalize_auto( void ); void bli_init_apis( void ); void bli_finalize_apis( void ); void bli_init_once( void ); void bli_finalize_once( void ); // end bli_init.h // begin bli_malloc.h // Typedef function pointer types for malloc() and free() substitutes. 
//typedef void* (*malloc_ft) ( size_t size );
//typedef void (*free_ft) ( void* p );

// -----------------------------------------------------------------------------

// The pool allocator prototypes below are compiled out by "#if 0".
#if 0
BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size );
BLIS_EXPORT_BLIS void bli_free_pool( void* p );
#endif

// Allocation wrappers; declarations only, the definitions are not visible in
// this header.
// NOTE(review): r_val presumably receives an error/status code -- confirm
// against the definitions.
void* bli_malloc_intl( size_t size, err_t* r_val );
void* bli_calloc_intl( size_t size, err_t* r_val );
void bli_free_intl( void* p );

BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val );
BLIS_EXPORT_BLIS void bli_free_user( void* p );

// -----------------------------------------------------------------------------

// Variants that take the underlying malloc()/free() substitute as a function
// pointer argument (malloc_ft / free_ft).
void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val );
void bli_ffree_align( free_ft f, void* p );

void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val );
void bli_ffree_noalign( free_ft f, void* p );

void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size );
void bli_fmalloc_post_check( void* p );
// end bli_malloc.h
// begin bli_const.h

void bli_const_init( void );
void bli_const_finalize( void );
// end bli_const.h
// begin bli_obj.h

// begin bli_obj_check.h

// *_check prototypes: each mirrors the parameter list of a bli_obj.h /
// bli_dt_* function declared further down.
// NOTE(review): presumably these validate arguments for the matching
// operation -- confirm against the definitions.
void bli_obj_create_check( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj );

void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj );

void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj );

void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj );

void bli_obj_create_scalar_check( num_t dt, obj_t* obj );

void bli_obj_free_check( obj_t* obj );

void bli_obj_create_const_check( double value, obj_t* obj );

void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b );

void bli_dt_size_check( num_t dt );

void bli_dt_string_check( num_t dt );

void bli_dt_union_check( num_t dt1, num_t dt2 );

void bli_obj_print_check( char* label, obj_t* obj );
// end bli_obj_check.h

// Exported obj_t API (declarations only). In these signatures rs, cs and is
// are inc_t values and m, n are dim_t values.
// NOTE(review): rs/cs/is look like row, column and imaginary strides --
// confirm against the definitions.
BLIS_EXPORT_BLIS void bli_obj_create ( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj );

BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer ( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj );

BLIS_EXPORT_BLIS void bli_obj_create_without_buffer ( num_t dt, dim_t m, dim_t n, obj_t* obj );

BLIS_EXPORT_BLIS void bli_obj_alloc_buffer ( inc_t rs, inc_t cs, inc_t is, obj_t* obj );

BLIS_EXPORT_BLIS void bli_obj_attach_buffer ( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj );

BLIS_EXPORT_BLIS void bli_obj_create_1x1 ( num_t dt, obj_t* obj );

BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer ( num_t dt, void* p, obj_t* obj );

BLIS_EXPORT_BLIS void bli_obj_create_conf_to ( obj_t* s, obj_t* d );

BLIS_EXPORT_BLIS void bli_obj_free ( obj_t* obj );

// Not exported: note the in/out pointer parameters rs, cs and is.
void bli_adjust_strides ( dim_t m, dim_t n, siz_t elem_size, inc_t* rs, inc_t* cs, inc_t* is );

BLIS_EXPORT_BLIS siz_t bli_dt_size ( num_t dt );

BLIS_EXPORT_BLIS char* bli_dt_string ( num_t dt );

BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult ( dim_t dim, dim_t dim_mult );

BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size ( dim_t dim, siz_t elem_size, siz_t align_size );

BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size ( void* p, size_t align_size );

BLIS_EXPORT_BLIS void bli_obj_print ( char* label, obj_t* obj );
// end bli_obj.h
// begin bli_obj_scalar.h

// Exported scalar-related obj_t API (declarations only).
BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached ( num_t dt, obj_t* beta );

BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of ( num_t dt, conj_t conj, obj_t* alpha, obj_t* beta );

BLIS_EXPORT_BLIS void bli_obj_scalar_detach ( obj_t* a, obj_t* alpha );

BLIS_EXPORT_BLIS void bli_obj_scalar_attach ( conj_t conj, obj_t* alpha, obj_t* a );

BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to ( num_t dt, obj_t* a );

BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar ( obj_t* alpha, obj_t* a );

BLIS_EXPORT_BLIS void bli_obj_scalar_reset ( obj_t* a );

BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag ( obj_t* a );

BLIS_EXPORT_BLIS bool bli_obj_scalar_equals ( obj_t* a, obj_t* beta );
// end bli_obj_scalar.h
// begin bli_blksz.h

//
blksz_t query BLIS_INLINE dim_t bli_blksz_get_def ( num_t dt, blksz_t* b ) { return b->v[ dt ]; } BLIS_INLINE dim_t bli_blksz_get_max ( num_t dt, blksz_t* b ) { return b->e[ dt ]; } // blksz_t modification BLIS_INLINE void bli_blksz_set_def ( dim_t val, num_t dt, blksz_t* b ) { b->v[ dt ] = val; } BLIS_INLINE void bli_blksz_set_max ( dim_t val, num_t dt, blksz_t* b ) { b->e[ dt ] = val; } BLIS_INLINE void bli_blksz_copy ( blksz_t* b_src, blksz_t* b_dst ) { *b_dst = *b_src; } BLIS_INLINE void bli_blksz_copy_if_pos ( blksz_t* b_src, blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that // we can skip the ones that are non-positive. const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); } BLIS_INLINE void bli_blksz_copy_def_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); bli_blksz_set_def( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_max_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); 
bli_blksz_set_max( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_scale_def ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_def( dt, b ); bli_blksz_set_def( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_max( dt, b ); bli_blksz_set_max( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_def_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { bli_blksz_scale_def( num, den, dt, b ); bli_blksz_scale_max( num, den, dt, b ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS blksz_t* bli_blksz_create ( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_easy ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z ); BLIS_EXPORT_BLIS void bli_blksz_free ( blksz_t* b ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); #endif void bli_blksz_reduce_def_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); void bli_blksz_reduce_max_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); // 
----------------------------------------------------------------------------- dim_t bli_determine_blocksize ( dir_t direct, dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_b ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); dim_t bli_determine_blocksize_b_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); // end bli_blksz.h // begin bli_func.h // ----------------------------------------------------------------------------- // func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); // end bli_func.h // begin bli_mbool.h // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // 
----------------------------------------------------------------------------- mbool_t* bli_mbool_create ( bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_init ( mbool_t* b, bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_free( mbool_t* b ); // end bli_mbool.h // begin bli_cntx.h #ifndef BLIS_CNTX_H #define BLIS_CNTX_H // Context object type (defined in bli_type_defs.h) // ----------------------------------------------------------------------------- // // -- cntx_t query (fields only) ----------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx ) { return cntx->blkszs; } BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) { return cntx->bmults; } BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) { return cntx->l3_vir_ukrs; } BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs; } BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs_prefs; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) { return cntx->l3_sup_thresh; } BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) { return cntx->l3_sup_blkszs; } BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) { return cntx->l3_sup_kers; } BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) { return cntx->l3_sup_kers_prefs; } BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) { return cntx->l1f_kers; } BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) { return cntx->l1v_kers; } BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) { return cntx->packm_kers; } BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) { return cntx->unpackm_kers; } BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) { return cntx->method; } // 
----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* 
bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* 
cntx ) { func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested packm func_t if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* funcs = bli_cntx_packm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the packm func_t (and then extract the // datatype-specific function pointer) if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested unpackm func_t if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the unpackm func_t (and then extract the // datatype-specific function pointer) if the unpackm kernel being // requested is one that is explicitly supported. 
if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. 
// NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. // NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. 
return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } #if 0 // NOTE: These static functions aren't needed yet. BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { const num_t dt = bli_obj_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); } #endif // ----------------------------------------------------------------------------- // // -- cntx_t modification (complex) -------------------------------------------- // // NOTE: The framework does not use any of the following functions. We provide // them in order to facilitate creating/modifying custom contexts. 
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif // end bli_rntm.h // begin bli_gks.h #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* 
bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif // end bli_gks.h // begin bli_ind.h #ifndef BLIS_IND_H #define BLIS_IND_H // level-3 induced method management // begin bli_l3_ind.h #ifndef BLIS_L3_IND_H #define BLIS_L3_IND_H // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- //bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ); void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ); void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ); bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ); #endif // end bli_l3_ind.h void bli_ind_init( void ); void bli_ind_finalize( void ); BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t 
method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); char* bli_ind_get_impl_string( ind_t method ); num_t bli_ind_map_cdt_to_index( num_t dt ); #endif // end bli_ind.h // begin bli_pba.h #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H // Packing block allocator (formerly memory broker) // pba init //BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ // bli_pthread_mutex_init( &(pba->mutex), NULL ); //} //BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ // bli_pthread_mutex_destroy( &(pba->mutex) ); //} // pba query BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { return &(pba->pools[ pool_index ]); } BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { return pba->align_size; } BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { return pba->malloc_fp; } BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { return pba->free_fp; } // pba modification BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { pba->align_size = align_size; } BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { pba->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { pba->free_fp = free_fp; } // pba action BLIS_INLINE void bli_pba_lock( pba_t* pba ) { bli_pthread_mutex_lock( &(pba->mutex) ); } BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( cntx_t* cntx ); void bli_pba_finalize ( void ); 
void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif // end bli_pba.h // begin bli_pool.h #ifndef BLIS_POOL_H #define BLIS_POOL_H // -- Pool block type -- // -- Pool type -- // Pool block query BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) { return pblk->buf; } BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) { return pblk->block_size; } // Pool block modification BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk ) { pblk->buf = buf; } BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) { pblk->block_size = block_size; } // // -- pool block initialization ------------------------------------------------ // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the pblk_t type definition. An alternative to the initializer is // calling bli_pblk_clear() at runtime. 
#define BLIS_PBLK_INITIALIZER \ { \ .buf = NULL, \ .block_size = 0, \ } \ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); bli_pblk_set_block_size( 0, pblk ); } // Pool entry query BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) { return pool->block_size; } BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) { return pool->align_size; } BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) { return pool->offset_size; } BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) { return pool->malloc_fp; } BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) { return pool->free_fp; } BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); } // Pool entry modification BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ { pool->block_size = block_size; } BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ { pool->align_size = align_size; } BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ { pool->offset_size = offset_size; } BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ { pool->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* 
pool ) \ { pool->free_fp = free_fp; } BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } // ----------------------------------------------------------------------------- void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ); void bli_pool_finalize ( pool_t* restrict pool ); void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ); void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ); void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ); void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ); void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ); void bli_pool_print ( pool_t* restrict pool ); void bli_pblk_print ( pblk_t* restrict pblk ); #endif // end bli_pool.h // begin bli_array.h #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // 
// -----------------------------------------------------------------------------

void bli_array_init
     (
       const siz_t       num_elem,
       const siz_t       elem_size,
       array_t* restrict array
     );
void bli_array_resize
     (
       const siz_t       num_elem_new,
       array_t* restrict array
     );
void bli_array_finalize
     (
       array_t* restrict array
     );

void* bli_array_elem
     (
       const siz_t       index,
       array_t* restrict array
     );
void bli_array_set_elem
     (
       void*    restrict elem,
       const siz_t       index,
       array_t* restrict array
     );

#endif

// end bli_array.h
// begin bli_apool.h


#ifndef BLIS_APOOL_H
#define BLIS_APOOL_H

// -- Locked pool-of-arrays type --




// apool entry query

// Return the address of the pool_t embedded in the apool_t.
BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool )
{
    return &(apool->pool);
}

// Return the address of the mutex embedded in the apool_t.
BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool )
{
    return &(apool->mutex);
}

// Return the def_array_len field of the apool_t.
BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool )
{
    return pool->def_array_len;
}

// Report whether the apool's embedded pool is exhausted, as determined by
// bli_pool_is_exhausted().
BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool )
{
    pool_t* restrict pool = bli_apool_pool( apool );

    return bli_pool_is_exhausted( pool );
}

// apool action

// Acquire the apool's mutex.
BLIS_INLINE void bli_apool_lock( apool_t* apool )
{
    bli_pthread_mutex_lock( bli_apool_mutex( apool ) );
}

// Release the apool's mutex.
BLIS_INLINE void bli_apool_unlock( apool_t* apool )
{
    bli_pthread_mutex_unlock( bli_apool_mutex( apool ) );
}

// apool entry modification

// Set the def_array_len field of the apool_t.
BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \
{
    pool->def_array_len = def_array_len;
}

// -----------------------------------------------------------------------------

void bli_apool_init
     (
       apool_t* restrict apool
     );
void bli_apool_finalize
     (
       apool_t* restrict apool
     );

array_t* bli_apool_checkout_array
     (
       siz_t             n_threads,
       apool_t* restrict apool
     );
void bli_apool_checkin_array
     (
       array_t* restrict array,
       apool_t* restrict apool
     );

pool_t* bli_apool_array_elem
     (
       siz_t             index,
       array_t* restrict array
     );

void bli_apool_grow
     (
       siz_t             num_blocks_add,
       apool_t* restrict apool
     );

void bli_apool_alloc_block
     (
       siz_t              num_elem,
       array_t** restrict array_p
     );
void bli_apool_free_block
     (
       array_t* restrict array
     );


#endif

// end bli_apool.h
// begin bli_sba.h


#ifndef BLIS_SBA_H
#define BLIS_SBA_H

apool_t* bli_sba_query( void );

// -----------------------------------------------------------------------------

void bli_sba_init( void );
void bli_sba_finalize( void );

array_t* bli_sba_checkout_array
     (
       const siz_t n_threads
     );

void bli_sba_checkin_array
     (
       array_t* restrict array
     );

void bli_sba_rntm_set_pool
     (
       siz_t             index,
       array_t* restrict array,
       rntm_t*  restrict rntm
     );

void* bli_sba_acquire
     (
       rntm_t* restrict rntm,
       siz_t            req_size
     );
void bli_sba_release
     (
       rntm_t* restrict rntm,
       void*   restrict block
     );


#endif

// end bli_sba.h
// begin bli_memsys.h


#ifndef BLIS_MEMSYS_H
#define BLIS_MEMSYS_H

// -----------------------------------------------------------------------------

void bli_memsys_init( void );
void bli_memsys_finalize( void );


#endif

// end bli_memsys.h
// begin bli_mem.h



#ifndef BLIS_MEM_H
#define BLIS_MEM_H


// mem_t object type (defined in bli_type_defs.h)



//
// -- mem_t query --------------------------------------------------------------
//

// Return the address of the pblk_t embedded in the mem_t.
BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem )
{
    return &(mem->pblk);
}

// Return the buffer pointer held by the mem_t's embedded pblk_t.
BLIS_INLINE void* bli_mem_buffer( mem_t* mem )
{
    return bli_pblk_buf( bli_mem_pblk( mem ) );
}

BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem )
{
    return mem->buf_type;
}

BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem )
{
    return mem->pool;
}

BLIS_INLINE siz_t bli_mem_size( mem_t* mem )
{
    return mem->size;
}

// A mem_t counts as allocated exactly when its buffer pointer is non-NULL.
BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem )
{
    return ( bool )
           ( bli_mem_buffer( mem ) != NULL );
}

// A mem_t counts as unallocated exactly when its buffer pointer is NULL.
BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem )
{
    return ( bool )
           ( bli_mem_buffer( mem ) == NULL );
}


//
// -- mem_t modification -------------------------------------------------------
//

// Copy *pblk (by value) into the mem_t's embedded pblk_t.
BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem )
{
    mem->pblk = *pblk;
}

// Set the buffer pointer of the mem_t's embedded pblk_t.
BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem )
{
    bli_pblk_set_buf( buf, &(mem->pblk) );
}

BLIS_INLINE void bli_mem_set_buf_type( packbuf_t buf_type, mem_t* mem )
{
    mem->buf_type = buf_type;
}

BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem )
{
    mem->pool = pool;
}

BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem )
{
    mem->size = size;
}

//
// -- mem_t initialization -----------------------------------------------------
//

// NOTE: This initializer macro must be updated whenever fields are added or
// removed from the mem_t type definition. An alternative to the initializer is
// calling bli_mem_clear() at runtime.

#define BLIS_MEM_INITIALIZER \
        { \
          .pblk        = BLIS_PBLK_INITIALIZER, \
          .buf_type    = -1, \
          .pool        = NULL, \
          .size        = 0, \
        } \

// Reset a mem_t at runtime: NULL buffer, NULL pool, size zero, and a buf_type
// of -1 (C) or BLIS_BUFFER_FOR_GEN_USE (C++). Only the buffer pointer of the
// embedded pblk_t is touched; its block_size field is left unchanged.
BLIS_INLINE void bli_mem_clear( mem_t* mem )
{
    bli_mem_set_buffer( NULL, mem );
#ifdef __cplusplus
    const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE;
    // When using C++, which is strongly typed, we avoid use of -1 as a
    // packbuf_t value since it will result in a compile-time error.
    bli_mem_set_buf_type( pb, mem );
#else
    bli_mem_set_buf_type( ( packbuf_t )-1, mem );
#endif
    bli_mem_set_pool( NULL, mem );
    bli_mem_set_size( 0, mem );
}


#endif
// end bli_mem.h
// begin bli_part.h


// begin bli_part_check.h


void bli_acquire_mpart_t2b_check( subpart_t  requested_part,
                                  dim_t  i,
                                  dim_t  b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_acquire_mpart_l2r_check( subpart_t  requested_part,
                                  dim_t  j,
                                  dim_t  b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_acquire_mpart_tl2br_check( subpart_t  requested_part,
                                    dim_t  ij,
                                    dim_t  b,
                                    obj_t* obj,
                                    obj_t* sub_obj );

// end bli_part_check.h

// -- Matrix partitioning ------------------------------------------------------

BLIS_EXPORT_BLIS void bli_acquire_mpart
     (
       dim_t     i,
       dim_t     j,
       dim_t     m,
       dim_t     n,
       obj_t*    obj,
       obj_t*    sub_obj
     );

// Prototype generator for the direction-specific matrix partitioning
// functions named below.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
     ( \
       subpart_t req_part, \
       dim_t     i, \
       dim_t     b, \
       obj_t*    obj, \
       obj_t*    sub_obj \
     );

GENPROT( acquire_mpart_t2b )
GENPROT( acquire_mpart_b2t )
GENPROT( acquire_mpart_l2r )
GENPROT( acquire_mpart_r2l )
GENPROT( acquire_mpart_tl2br )
GENPROT( acquire_mpart_br2tl )


// Prototype generator for the variants that additionally take a dir_t.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
     ( \
       dir_t     direct, \
       subpart_t req_part, \
       dim_t     i, \
       dim_t     b, \
       obj_t*    obj, \
       obj_t*    sub_obj \
     );

GENPROT( acquire_mpart_mdim )
GENPROT( acquire_mpart_ndim )
GENPROT( acquire_mpart_mndim )


// -- Vector partitioning ------------------------------------------------------

#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
     ( \
       subpart_t req_part, \
       dim_t     i, \
       dim_t     b, \
       obj_t*    obj, \
       obj_t*    sub_obj \
     );

GENPROT( acquire_vpart_f2b )
GENPROT( acquire_vpart_b2f )

// -- Scalar acquisition -------------------------------------------------------

BLIS_EXPORT_BLIS void bli_acquire_mij
     (
       dim_t     i,
       dim_t     j,
       obj_t*    obj,
       obj_t*    sub_obj
     );

BLIS_EXPORT_BLIS void bli_acquire_vi
     (
       dim_t     i,
       obj_t*    obj,
       obj_t*    sub_obj
     );

// end bli_part.h
// begin bli_prune.h


void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p,
                             obj_t* s, mdim_t mdim_s );
// end bli_prune.h
// begin bli_query.h


BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b );

BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b );

BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a );
// end bli_query.h
// begin bli_auxinfo.h


#ifndef BLIS_AUXINFO_MACRO_DEFS_H
#define BLIS_AUXINFO_MACRO_DEFS_H


// auxinfo_t field query
// Each accessor below returns the correspondingly named auxinfo_t field.

BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai )
{
    return ai->schema_a;
}
BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai )
{
    return ai->schema_b;
}

BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai )
{
    return ai->a_next;
}
BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai )
{
    return ai->b_next;
}

BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai )
{
    return ai->is_a;
}
BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai )
{
    return ai->is_b;
}

BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai )
{
    return ai->ps_a;
}
BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai )
{
    return ai->ps_b;
}

BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai )
{
    return ai->ukr;
}
BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai )
{
    return ai->params;
}


// auxinfo_t field modification
// Each setter below overwrites the correspondingly named auxinfo_t field(s).

BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai )
{
    ai->schema_a = schema;
}
BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai )
{
    ai->schema_b = schema;
}

BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai )
{
    ai->a_next = p;
}
BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai )
{
    ai->b_next = p;
}
// Set both the a_next and b_next fields in one call.
BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai )
{
    ai->a_next = ap;
    ai->b_next = bp;
}

BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai )
{
    ai->is_a = is;
}
BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai )
{
    ai->is_b = is;
}

BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai )
{
    ai->ps_a = ps;
}
BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai )
{
    ai->ps_b = ps;
}

BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai )
{
    ai->ukr = ukr;
}
BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai )
{
    ai->params = params;
}

#endif

// end bli_auxinfo.h
// begin bli_param_map.h



// --- BLIS to BLAS/LAPACK mappings --------------------------------------------

BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval );


// --- BLAS/LAPACK to BLIS mappings --------------------------------------------

// NOTE: These static functions were converted from regular functions in order
// to reduce function call overhead within the BLAS compatibility layer.
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a );

// The remaining bli_check_*() prototypes all return err_t. None of them is
// declared with BLIS_EXPORT_BLIS in this section.
err_t bli_check_noninteger_datatype( num_t dt );
err_t bli_check_noninteger_object( obj_t* a );
err_t bli_check_nonconstant_datatype( num_t dt );
err_t bli_check_nonconstant_object( obj_t* a );
err_t bli_check_floating_datatype( num_t dt );
err_t bli_check_floating_object( obj_t* a );
err_t bli_check_real_datatype( num_t dt );
err_t bli_check_real_object( obj_t* a );
err_t bli_check_integer_datatype( num_t dt );
err_t bli_check_integer_object( obj_t* a );
err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b );
err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b );
err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r );
err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r );
err_t bli_check_real_valued_object( obj_t* a );
err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b );
err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b );

// Dimension / shape checks.
err_t bli_check_conformal_dims( obj_t* a, obj_t* b );
err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c );
err_t bli_check_scalar_object( obj_t* a );
err_t bli_check_vector_object( obj_t* a );
err_t bli_check_matrix_object( obj_t* a );
err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y );
err_t bli_check_square_object( obj_t* a );
err_t bli_check_object_length_equals( obj_t* a, dim_t m );
err_t bli_check_object_width_equals( obj_t* a, dim_t n );
err_t bli_check_vector_dim_equals( obj_t* a, dim_t n );
err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset );

err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is );

// Structure checks.
err_t bli_check_general_object( obj_t* a );
err_t bli_check_hermitian_object( obj_t* a );
err_t bli_check_symmetric_object( obj_t* a );
err_t bli_check_triangular_object( obj_t* a );
err_t bli_check_object_struc( obj_t* a, struc_t struc );

err_t bli_check_upper_or_lower_object( obj_t* a );

// Subpartition identifier checks.
err_t bli_check_valid_3x1_subpart( subpart_t part );
err_t bli_check_valid_1x3_subpart( subpart_t part );
err_t bli_check_valid_3x3_subpart( subpart_t part );

err_t bli_check_valid_cntl( void* cntl );

err_t bli_check_packm_schema_on_unpack( obj_t* a );
err_t bli_check_packv_schema_on_unpack( obj_t* a );

err_t bli_check_object_buffer( obj_t* a );

err_t bli_check_valid_malloc_buf( void* ptr );

err_t bli_check_valid_packbuf( packbuf_t buf_type );
err_t bli_check_if_exhausted_pool( pool_t* pool );
err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx );
err_t bli_check_alignment_is_power_of_two( size_t align_size );
err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size );

err_t bli_check_object_alias_of( obj_t* a, obj_t* b );

err_t bli_check_valid_arch_id( arch_t id );
err_t bli_check_initialized_gks_cntx( cntx_t** cntx );

err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr );
err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr );
err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr );

// end bli_check.h
// begin bli_error.h

BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void );
BLIS_EXPORT_BLIS void     bli_error_checking_level_set( errlev_t new_level );

BLIS_EXPORT_BLIS bool     bli_error_checking_is_enabled( void );

void                      bli_print_msg( char* str, char* file, guint_t line );
BLIS_EXPORT_BLIS void     bli_abort( void );

char*                     bli_error_string_for_code( gint_t code );

// end bli_error.h
// begin bli_f2c.h
// f2c.h  --  Standard Fortran to C header file

//  barf  [ba:rf]  2.  "He suggested using FORTRAN, and everybody barfed."
//  - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition)

#ifndef BLIS_F2C_H
#define BLIS_F2C_H

// The f2c scalar types are declared here under bla_-prefixed names. The
// commented-out typedefs are the f2c names that this header does not define.
typedef f77_int bla_integer;
typedef f77_char bla_character;
//typedef char *address;
//typedef short int shortint;
typedef float bla_real;
typedef double bla_double;
typedef scomplex bla_scomplex;
typedef dcomplex bla_dcomplex;
typedef f77_int bla_logical;
//typedef short int shortlogical;
//typedef char logical1;
//typedef char integer1;
#ifdef INTEGER_STAR_8 // Adjust for integer*8.
typedef long long longint; // system-dependent
typedef unsigned long long ulongint; // system-dependent
// 64-bit variants of bit_clear/bit_set: the mask is built from (ulongint)1.
#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b)))
#define qbit_set(a,b) ((a) | ((ulongint)1 << (b)))
#endif

#ifndef TRUE_
#define TRUE_ (1)
#endif

#ifndef FALSE_
#define FALSE_ (0)
#endif

// Extern is for use with -E
#ifndef Extern
#define Extern extern
#endif

// I/O stuff

// Both branches define ftnlen as bla_integer; only the commented-out
// historical typedefs differ between them.
#ifdef f2c_i2
// for -i2
//typedef short flag;
//typedef short ftnlen;
typedef bla_integer ftnlen;
//typedef short ftnint;
#else
//typedef long int flag;
//typedef long int ftnlen;
typedef bla_integer ftnlen;
//typedef long int ftnint;
#endif

#ifndef VOID
#define VOID void
#endif

// Every helper macro below is only defined if not already defined, and each
// one evaluates its arguments more than once or inside a larger expression,
// so arguments with side effects should be avoided.
//
// NOTE(review): f2c_dabs, f2c_dmin and f2c_dmax cast to `doublereal`, and
// bit_clear/bit_set cast to `uinteger`. Neither type name is defined in the
// visible part of this header (the typedefs above use the bla_ prefix), so
// these five macros presumably fail to compile if used unless those names
// are provided elsewhere -- confirm before relying on them.
#ifndef f2c_abs
  #define f2c_abs(x) ((x) >= 0 ? (x) : -(x))
#endif
#ifndef f2c_dabs
  #define f2c_dabs(x) (doublereal)f2c_abs(x)
#endif
#ifndef f2c_min
  #define f2c_min(a,b) ((a) <= (b) ? (a) : (b))
#endif
#ifndef f2c_max
  #define f2c_max(a,b) ((a) >= (b) ? (a) : (b))
#endif
#ifndef f2c_dmin
  #define f2c_dmin(a,b) (doublereal)f2c_min(a,b)
#endif
#ifndef f2c_dmax
  #define f2c_dmax(a,b) (doublereal)f2c_max(a,b)
#endif

// bit_test yields bit b of a: `>>` binds tighter than `&`, so the expression
// is ((a) >> (b)) & 1.
#ifndef bit_test
  #define bit_test(a,b) ((a) >> (b) & 1)
#endif

#ifndef bit_clear
  #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b)))
#endif

#ifndef bit_set
  #define bit_set(a,b) ((a) |  ((uinteger)1 << (b)))
#endif

// undef any lower-case symbols that your C compiler predefines, e.g.:
// (skipped entirely when Skip_f2c_Undefs is defined)

#ifndef Skip_f2c_Undefs
#undef cray
#undef gcos
#undef mc68010
#undef mc68020
#undef mips
#undef pdp11
#undef sgi
#undef sparc
#undef sun
#undef sun2
#undef sun3
#undef sun4
#undef u370
#undef u3b
#undef u3b2
#undef u3b5
#undef unix
#undef vax
#endif

#endif
// end bli_f2c.h

// begin bli_machval.h
// begin bli_lsame.h

// The ftnlen parameters of the three prototypes below carry the lengths of
// the preceding character arguments, as their names (ca_len, cb_len,
// cmach_len) indicate.
bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len );
// end bli_lsame.h
// begin bli_slamch.h

bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len );
// end bli_slamch.h
// begin bli_dlamch.h

bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len );
// end bli_dlamch.h

//
// Prototype object-based interface.
//
BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v );

//
// Prototype BLAS-like interfaces.
//
// GENTPROTR only uses its `chv` and `opname` arguments; the instantiation
// macro INSERT_GENTPROTR_BASIC0 is defined outside this section.
#undef  GENTPROTR
#define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \
     ( \
       machval_t mval, \
       void*     v \
     );

INSERT_GENTPROTR_BASIC0( machval )

// end bli_machval.h
// begin bli_getopt.h

// State carried between bli_getopt() calls; the field names mirror the
// POSIX getopt() globals of the same names.
typedef struct getopt_s
{
	char* optarg;
	int   optind;
	int   opterr;
	int   optopt;
} getopt_t;

BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state );

BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state );

// end bli_getopt.h
// begin bli_opid.h

// Returns true when opid lies in the closed range [BLIS_GEMM, BLIS_TRSM].
// This relies on the enumerators between those two values being exactly the
// level-3 operations -- the enum definition is outside this section.
BLIS_INLINE bool bli_opid_is_level3( opid_t opid )
{
	return ( bool )
	       ( BLIS_GEMM <= opid && opid <= BLIS_TRSM );
}

// end bli_opid.h
// begin bli_cntl.h

// -- Control tree prototypes --

BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node
     (
       rntm_t* rntm,
       opid_t  family,
       bszid_t bszid,
       void_fp var_func,
       void*   params,
       cntl_t* sub_node
     );

BLIS_EXPORT_BLIS void bli_cntl_free_node
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

BLIS_EXPORT_BLIS void bli_cntl_clear_node
     (
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_cntl_free
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo
     (
       rntm_t*    rntm,
       cntl_t*    cntl
     );

BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

BLIS_EXPORT_BLIS void bli_cntl_mark_family
     (
       opid_t  family,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

dim_t bli_cntl_calc_num_threads_in
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// cntl_t query (fields only)

// Returns the family field of *cntl.
BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl )
{
	return cntl->family;
}
BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl ) { return cntl->bszid; } BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl ) { return cntl->var_func; } BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl ) { return cntl->sub_prenode; } BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl ) { return cntl->sub_node; } BLIS_INLINE void* bli_cntl_params( cntl_t* cntl ) { return cntl->params; } BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl ) { // The first 64 bytes is always the size of the params structure. return *( ( uint64_t* )(cntl->params) ); } BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) { return &(cntl->pack_mem); } // cntl_t query (complex) BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl ) { return ( bool ) ( cntl == NULL ); } BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl ) { return ( bool ) ( bli_cntl_sub_node( cntl ) == NULL ); } BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl ) { return ( bool ) ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); } // cntl_t modification BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl ) { cntl->family = family; } BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl ) { cntl->bszid = bszid; } BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl ) { cntl->var_func = var_func; } BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl ) { cntl->sub_prenode = sub_prenode; } BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl ) { cntl->sub_node = sub_node; } BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl ) { cntl->params = params; } BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) { cntl->pack_mem = *pack_mem; } // end bli_cntl.h // begin bli_env.h #ifndef BLIS_ENV_H #define BLIS_ENV_H gint_t bli_env_get_var( const char* env, gint_t fallback ); //void bli_env_set_var( const char* env, dim_t value ); #endif // end bli_env.h // begin bli_pack.h #ifndef BLIS_PACK_H #define BLIS_PACK_H 
// Prototypes only; the functions without BLIS_EXPORT_BLIS are not marked for
// export from the library.
void bli_pack_init( void );
void bli_pack_finalize( void );

// Getters write through a bool*; setters take the new value directly.
BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a );
BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b );
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a );
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b );

void bli_pack_init_rntm_from_env( rntm_t* rntm );

#endif

// end bli_pack.h
// begin bli_info.h

// -- General library information ----------------------------------------------

BLIS_EXPORT_BLIS char* bli_info_get_version_str( void );
BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void );

// -- General configuration-related --------------------------------------------

// All configuration queries take no arguments and return a gint_t.
BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void );
// Continuation of the configuration queries: each takes no arguments and
// returns a gint_t.
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void );

// -- Kernel implementation-related --------------------------------------------

// -- Level-3 kernel definitions --

// Each query takes an ind_t method and a num_t datatype and returns a char*.
BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt );

// -- BLIS implementation query (level-3) --------------------------------------

// Each query takes only a num_t datatype and returns a char*.
BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm3_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt );

// end bli_info.h
// begin bli_arch.h

#ifndef BLIS_ARCH_H
#define BLIS_ARCH_H

BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void );

void   bli_arch_set_id_once( void );
void   bli_arch_set_id( void );

BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id );

void bli_arch_set_logging( bool dolog );
bool bli_arch_get_logging( void );
// Variadic: a char* first argument followed by further arguments
// (presumably a printf-style format string -- confirm in bli_arch.c).
void bli_arch_log( char*, ... );

#endif

// end bli_arch.h
// begin bli_cpuid.h

// The following block is compiled out (#if 0); it provides stand-in
// definitions for building the cpuid code on its own.
#if 0
  // Used only during standalone testing of ARM support.
  #define FALSE 0
  #define TRUE  1
  typedef enum
  {
	BLIS_ARCH_CORTEXA57 = 10,
	BLIS_ARCH_CORTEXA15 = 11,
	BLIS_ARCH_CORTEXA9  = 12,
	BLIS_ARCH_GENERIC   = 13
  } arch_t;
  typedef uint64_t bool;
  #define bli_abort abort
#endif

#ifndef BLIS_CPUID_H
#define BLIS_CPUID_H

arch_t   bli_cpuid_query_id( void );

// The x86 predicates (Intel, AMD) take ( family, model, features ); the ARM
// predicates take ( model, part, features ).

// Intel
bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features );

// AMD
bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features );

// ARM
bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );

uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features );

// -----------------------------------------------------------------------------

//
// This section of the file was based off of cpuid.hpp from TBLIS [1].
//
// [1] https://github.com/devinamatthews/tblis
//

// Returns true when every bit set in `want` is also set in `have`.
BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want )
{
	return ( have & want ) == want;
}

// -----------------------------------------------------------------------------

#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)

// cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393
// for more information why this move was made.
//#include "cpuid.h"

void get_cpu_name( char *cpu_name );
int  vpu_count( void );

// x86 vendor identifiers (implicit values 0, 1, 2).
enum
{
	VENDOR_INTEL = 0,
	VENDOR_AMD,
	VENDOR_UNKNOWN
};
// x86 feature flags: each enumerator is a distinct single bit, so they can
// be OR-ed together and tested with bli_cpuid_has_features().
enum
{
	FEATURE_SSE3     = 0x0001,
	FEATURE_SSSE3    = 0x0002,
	FEATURE_SSE41    = 0x0004,
	FEATURE_SSE42    = 0x0008,
	FEATURE_AVX      = 0x0010,
	FEATURE_AVX2     = 0x0020,
	FEATURE_FMA3     = 0x0040,
	FEATURE_FMA4     = 0x0080,
	FEATURE_AVX512F  = 0x0100,
	FEATURE_AVX512DQ = 0x0200,
	FEATURE_AVX512PF = 0x0400,
	FEATURE_AVX512ER = 0x0800,
	FEATURE_AVX512CD = 0x1000,
	FEATURE_AVX512BW = 0x2000,
	FEATURE_AVX512VL = 0x4000
};

#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)

char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath );

// ARM vendor identifiers. VENDOR_UNKNOWN is 1 here but 2 in the x86 branch;
// only one branch is ever compiled.
enum
{
	VENDOR_ARM = 0,
	VENDOR_UNKNOWN
};
enum
{
	MODEL_ARMV7 = 0,
	MODEL_ARMV8,
	MODEL_UNKNOWN
};
// ARM feature flags, one bit each.
enum
{
	FEATURE_NEON = 0x01,
	FEATURE_SVE  = 0x02
};

#endif

#endif

// end bli_cpuid.h
// begin bli_string.h

void bli_string_mkupper( char* s );
// end bli_string.h
// begin bli_setgetijm.h

// Object-based setter: takes the real and imaginary parts as doubles plus
// the (i, j) element indices.
BLIS_EXPORT_BLIS err_t bli_setijm
     (
       double  ar,
       double  ai,
       dim_t   i,
       dim_t   j,
       obj_t*  b
     );

// Typed variants: the buffer is passed as void* together with its row and
// column strides. The macro body uses only `ch` and `opname`;
// INSERT_GENTPROT_BASIC0 is defined outside this section.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double         ar, \
       double         ai, \
       dim_t          i, \
       dim_t          j, \
       void* restrict b, inc_t rs, inc_t cs \
     );

INSERT_GENTPROT_BASIC0( setijm )

// -----------------------------------------------------------------------------

// Object-based getter: the real and imaginary parts are returned through
// the double* arguments.
BLIS_EXPORT_BLIS err_t bli_getijm
      (
        dim_t   i,
        dim_t   j,
        obj_t*  b,
        double* ar,
        double* ai
      );

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       dim_t          i, \
       dim_t          j, \
       void* restrict b, inc_t rs, inc_t cs, \
       double*        ar, \
       double*        ai \
     );

INSERT_GENTPROT_BASIC0( getijm )

// end bli_setgetijm.h
// begin bli_setgetijv.h

// Vector counterparts of the functions above: a single index and a single
// stride.
BLIS_EXPORT_BLIS err_t bli_setijv
     (
       double  ar,
       double  ai,
       dim_t   i,
       obj_t*  x
     );

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double         ar, \
       double         ai, \
       dim_t          i, \
       void* restrict x, inc_t incx \
     );

INSERT_GENTPROT_BASIC0( setijv )

// -----------------------------------------------------------------------------

BLIS_EXPORT_BLIS err_t bli_getijv
      (
        dim_t   i,
        obj_t*  x,
        double* ar,
        double* ai
      );

// NOTE(review): the buffer parameter is named `b` here but `x` in the setijv
// prototype above; harmless in a prototype, but inconsistent.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       dim_t          i, \
       void* restrict b, inc_t incx, \
       double*        ar, \
       double*        ai \
     );

INSERT_GENTPROT_BASIC0( getijv )

// end bli_setgetijv.h
// begin bli_setri.h

// -- setr ---------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_setrm
     (
       obj_t* alpha,
       obj_t* b
     );

BLIS_EXPORT_BLIS void bli_setrv
     (
       obj_t* alpha,
       obj_t* x
     );

// -- seti ---------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_setim
     (
       obj_t* alpha,
       obj_t* b
     );

BLIS_EXPORT_BLIS void bli_setiv
     (
       obj_t* alpha,
       obj_t* x
     );

// end bli_setri.h
// begin bli_castm.h

//
// Prototype object-based interface.
//

BLIS_EXPORT_BLIS void bli_castm
     (
       obj_t* a,
       obj_t* b
     );

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//
// The generated name combines two type characters (cha, chb) with the
// operation name via PASTEMAC2; both buffers are passed as void* with row
// and column strides.

#undef  GENTPROT2
#define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \
     ( \
       trans_t transa, \
       dim_t   m, \
       dim_t   n, \
       void*   a, inc_t rs_a, inc_t cs_a, \
       void*   b, inc_t rs_b, inc_t cs_b  \
     );

INSERT_GENTPROT2_BASIC0( castm )
INSERT_GENTPROT2_MIXDP0( castm )

//
// Prototype object-based _check() function.
//

void bli_castm_check
     (
       obj_t* a,
       obj_t* b
     );

// end bli_castm.h
// begin bli_castnzm.h

//
// Prototype object-based interface.
//

BLIS_EXPORT_BLIS void bli_castnzm
     (
       obj_t* a,
       obj_t* b
     );

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//
// NOTE: GENTPROT / GENTPROTR are redefined before each group; the
// INSERT_GENTPROT_BASIC0 / INSERT_GENTPROTR_BASIC0 lines that follow a
// definition are the ones that use it (those INSERT_* macros are not defined
// in this section). The GENTPROTR variants additionally receive ctype_r/chr.

// addsc, divsc, mulsc, subsc
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi  \
     );

INSERT_GENTPROT_BASIC0( addsc )
INSERT_GENTPROT_BASIC0( divsc )
INSERT_GENTPROT_BASIC0( mulsc )
INSERT_GENTPROT_BASIC0( subsc )

// invertsc
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi  \
     );

INSERT_GENTPROT_BASIC0( invertsc )

// absqsc, normfsc: the output operand uses ctype_r rather than ctype.
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype*   chi, \
       ctype_r* absq  \
     );

INSERT_GENTPROTR_BASIC0( absqsc )
INSERT_GENTPROTR_BASIC0( normfsc )

// sqrtsc
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype*  chi, \
       ctype*  psi  \
     );

INSERT_GENTPROT_BASIC0( sqrtsc )

// getsc: the two outputs are always double*, independent of ctype.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype*  chi, \
       double* zeta_r, \
       double* zeta_i  \
     );

INSERT_GENTPROT_BASIC0( getsc )

// setsc: the two inputs are always double, independent of ctype.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double  zeta_r, \
       double  zeta_i, \
       ctype*  chi  \
     );

INSERT_GENTPROT_BASIC0( setsc )

// unzipsc
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype*   chi, \
       ctype_r* zeta_r, \
       ctype_r* zeta_i  \
     );

INSERT_GENTPROTR_BASIC0( unzipsc )

// zipsc
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype_r* zeta_r, \
       ctype_r* zeta_i, \
       ctype*   chi  \
     );

INSERT_GENTPROTR_BASIC0( zipsc )

// -----------------------------------------------------------------------------

// Explicitly named getsc/setsc prototypes whose chi operand is a dim_t*.
BLIS_EXPORT_BLIS void bli_igetsc
     (
       dim_t*  chi,
       double* zeta_r,
       double* zeta_i
     );

BLIS_EXPORT_BLIS void bli_isetsc
     (
       double  zeta_r,
       double  zeta_i,
       dim_t*  chi
     );
// end bli_l0_tapi.h
// begin bli_l0_ft.h

//
// -- Level-0 function types ---------------------------------------------------
//
// Each GENTDEF / GENTDEFR below defines a function-pointer typedef whose name
// is produced by PASTECH2(ch,opname,tsuf); the parameter lists mirror the
// typed prototypes declared above in bli_l0_tapi.h.

// addsc, divsc, subsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi  \
     );

INSERT_GENTDEF( addsc )
INSERT_GENTDEF( divsc )
INSERT_GENTDEF( subsc )

// invertsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi  \
     );

INSERT_GENTDEF( invertsc )

// mulsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi  \
     );

INSERT_GENTDEF( mulsc )

// absqsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype*   chi, \
       ctype_r* absq  \
     );

INSERT_GENTDEFR( absqsc )

// normfsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype*   chi, \
       ctype_r* norm  \
     );

INSERT_GENTDEFR( normfsc )

// sqrtsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype*  chi, \
       ctype*  psi  \
     );

INSERT_GENTDEF( sqrtsc )

// getsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype*  chi, \
       double* zeta_r, \
       double* zeta_i  \
     );

INSERT_GENTDEF( getsc )

// setsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       double  zeta_r, \
       double  zeta_i, \
       ctype*  chi  \
     );

INSERT_GENTDEF( setsc )

// unzipsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype*   chi, \
       ctype_r* zeta_r, \
       ctype_r* zeta_i  \
     );

INSERT_GENTDEFR( unzipsc )

// zipsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype_r* zeta_r, \
       ctype_r* zeta_i, \
       ctype*   chi  \
     );

INSERT_GENTDEFR( zipsc )

// end bli_l0_ft.h

// Generate function pointer arrays for tapi functions.
// begin bli_l0_fpa.h

//
// Prototype function pointer query interface.
//
// Each GENPROT( opname ) declares a function taking a num_t dt; its name comes
// from PASTEMAC(opname,_qfp) and its return type from PASTECH(opname,_vft).

#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( absqsc )
GENPROT( normfsc )
GENPROT( addsc )
GENPROT( divsc )
GENPROT( mulsc )
GENPROT( subsc )
GENPROT( invertsc )
GENPROT( sqrtsc )
GENPROT( unzipsc )
GENPROT( zipsc )

GENPROT( getsc )
GENPROT( setsc )

// end bli_l0_fpa.h

// copysc
// begin bli_copysc.h

//
// Prototype object-based interfaces.
//

#undef  GENFRONT
#define GENFRONT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  chi, \
       obj_t*  psi  \
     );

GENFRONT( copysc )

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//
// Both operands are declared void*; ctype_x / ctype_y are accepted by the
// macro but do not appear in the generated prototype.

#undef  GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \
     ( \
       conj_t  conjchi, \
       void*   chi, \
       void*   psi  \
     );

INSERT_GENTPROT2_BASIC0( copysc )
INSERT_GENTPROT2_MIX_D0( copysc )
INSERT_GENTPROT2_MIX_P0( copysc )

// end bli_copysc.h
// end bli_l0.h

// -- Level-1v operations --

// begin bli_l1v.h

// begin bli_l1v_check.h

//
// Prototype object-based check functions.
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h"
// begin bli_l1v_ft_ker.h

#ifndef BLIS_L1V_FT_KER_H
#define BLIS_L1V_FT_KER_H

//
// -- Level-1v kernel function types -------------------------------------------
//
// Each GENTDEF below defines a function-pointer typedef named via
// PASTECH3(ch,opname,_ker,tsuf). Every kernel signature ends with a
// cntx_t* cntx parameter and declares its pointer operands restrict.

// addv, copyv, subv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( addv )
INSERT_GENTDEF( copyv )
INSERT_GENTDEF( subv )

// amaxv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( amaxv )

// axpbyv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( axpbyv )

// axpyv, scal2v

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( axpyv )
INSERT_GENTDEF( scal2v )

// dotv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( dotv )

// dotxv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( dotxv )

// invertv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( invertv )

// scalv, setv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( scalv )
INSERT_GENTDEF( setv )

// swapv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( swapv )

// xpbyv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( xpbyv )

#endif
// end bli_l1v_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// (The leading comma lets the macro be appended after the last ordinary
// parameter without one of its own.)
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_l1v_oapi.h

//
// Prototype object-based interfaces.
//
// Expert pass: at this point EX_SUF is BLIS_OAPI_EX_SUF and
// BLIS_OAPI_EX_PARAMS is ",cntx_t* cntx, rntm_t* rntm", so each prototype
// below carries the expert suffix and the two extra trailing parameters.
// GENTPROT is redefined before each group of operations.

// addv, copyv, subv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )

// amaxv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  index  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( amaxv )

// axpbyv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( axpbyv )

// axpyv, scal2v
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )

// dotv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  rho  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( dotv )

// dotxv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  beta, \
       obj_t*  rho  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( dotxv )

// invertv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( invertv )

// scalv, setv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( scalv )
GENTPROT( setv )

// swapv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( swapv )

// xpbyv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( xpbyv )

// end bli_l1v_oapi.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS    cntx_t* cntx = NULL; ( void )cntx; \
                              rntm_t* rntm = NULL; ( void )rntm;

// end bli_oapi_ba.h
// begin bli_l1v_oapi.h

//
// Prototype object-based interfaces.
//
// Basic pass: at this point EX_SUF and BLIS_OAPI_EX_PARAMS are both defined
// empty, so the same prototype text as the expert pass yields declarations
// without the expert suffix or the extra trailing parameters.
// GENTPROT is redefined before each group of operations.

// addv, copyv, subv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )

// amaxv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  index  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( amaxv )

// axpbyv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( axpbyv )

// axpyv, scal2v
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )

// dotv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  rho  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( dotv )

// dotxv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  beta, \
       obj_t*  rho  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( dotxv )

// invertv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( invertv )

// scalv, setv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( scalv )
GENTPROT( setv )

// swapv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( swapv )

// xpbyv
#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( xpbyv )

// end bli_l1v_oapi.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// (The leading comma lets the macro be appended after the last ordinary
// parameter without one of its own.)
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1v_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- // Shared check prototypes; each name lists the operands it takes (xy, axy, x, ax). void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // NOTE: this inclusion follows bli_oapi_ex.h, so EX_SUF is BLIS_OAPI_EX_SUF and // BLIS_OAPI_EX_PARAMS appends the cntx_t* cntx, rntm_t* rntm parameters: these // are the expert prototypes. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. // NOTE: the expansion contains declarations and expression statements, so it // must be used inside a function body. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces.
// NOTE: this inclusion follows bli_oapi_ba.h, where EX_SUF and // BLIS_OAPI_EX_PARAMS are defined empty: these are the basic prototypes, // without cntx/rntm parameters. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters.
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. // NOTE: this inclusion follows bli_tapi_ex.h, so EX_SUF is BLIS_TAPI_EX_SUF and // BLIS_TAPI_EX_PARAMS appends the cntx_t* cntx, rntm_t* rntm parameters: these // are the expert prototypes. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addd ) INSERT_GENTPROT_BASIC0( copyd ) INSERT_GENTPROT_BASIC0( subd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyd ) INSERT_GENTPROT_BASIC0( scal2d ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( invertd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \
BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scald ) INSERT_GENTPROT_BASIC0( setd ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( setid ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( shiftd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( xpbyd ) // end bli_l1d_tapi.h // begin bli_l1d_ft.h // // -- Level-1d function types -------------------------------------------------- // // NOTE: each function-pointer typedef below repeats the parameter list of the // same-named prototype in bli_l1d_tapi.h. // addd, copyd, subd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addd ) INSERT_GENTDEF( copyd ) INSERT_GENTDEF( subd ) // axpyd, scal2d #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyd ) INSERT_GENTDEF( scal2d ) // invertd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m,
\ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertd ) // scald, setd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scald ) INSERT_GENTDEF( setd ) // setid #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( setid ) // shiftd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( shiftd ) // xpbyd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyd ) // end bli_l1d_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. // NOTE: the expansion contains declarations and expression statements, so it // must be used inside a function body. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands.
// NOTE: this inclusion follows bli_tapi_ba.h, where EX_SUF and // BLIS_TAPI_EX_PARAMS are defined empty: these are the basic prototypes, // without cntx/rntm parameters. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addd ) INSERT_GENTPROT_BASIC0( copyd ) INSERT_GENTPROT_BASIC0( subd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyd ) INSERT_GENTPROT_BASIC0( scal2d ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( invertd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scald ) INSERT_GENTPROT_BASIC0( setd ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( setid ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( shiftd ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t
diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( xpbyd ) // end bli_l1d_tapi.h // begin bli_l1d_ft.h // // -- Level-1d function types -------------------------------------------------- // // NOTE: each function-pointer typedef below repeats the parameter list of the // same-named prototype in bli_l1d_tapi.h. // addd, copyd, subd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addd ) INSERT_GENTDEF( copyd ) INSERT_GENTDEF( subd ) // axpyd, scal2d #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyd ) INSERT_GENTDEF( scal2d ) // invertd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertd ) // scald, setd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scald ) INSERT_GENTDEF( setd ) // setid #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( setid ) // shiftd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( shiftd ) // xpbyd #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyd ) // end bli_l1d_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1d_fpa.h // // Prototype function pointer query interface. // NOTE: each GENPROT( opname ) line declares one function taking a num_t dt; // its name is built by PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp) and its return // type by PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft). Both paste macros are defined // elsewhere in this header. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addd ) GENPROT( copyd ) GENPROT( subd ) GENPROT( axpyd ) GENPROT( scal2d ) GENPROT( invertd ) GENPROT( scald ) GENPROT( setd ) GENPROT( setid ) GENPROT( shiftd ) GENPROT( xpbyd ) // end bli_l1d_fpa.h // end bli_l1d.h // -- Level-1f operations -- // begin bli_l1f.h // begin bli_l1f_check.h // // Prototype object-based check functions.
// NOTE: each GENTPROT( opname ) below declares one void check function whose // name is built by PASTEMAC(opname,_check) and whose parameters are all obj_t*. // #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( dotxf ) // end bli_l1f_check.h // Define kernel function types. // begin bli_l1f_ft_ker.h #ifndef BLIS_L1F_FT_KER_H #define BLIS_L1F_FT_KER_H // // -- Level-1f kernel function types ------------------------------------------- // // NOTE: these kernel types use the fixed _ker suffix rather than EX_SUF, take // restrict-qualified operand pointers, and end with an explicit cntx_t* cntx // parameter (no rntm_t*). // axpy2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha1, \ ctype* restrict alpha2, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpy2v ) // axpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpyf ) // dotaxpyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t
conjy, \ dim_t m, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotaxpyv ) // dotxf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotxf ) // dotxaxpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotxaxpyf ) #endif // end bli_l1f_ft_ker.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // NOTE: this inclusion follows bli_oapi_ex.h, so EX_SUF is BLIS_OAPI_EX_SUF and // BLIS_OAPI_EX_PARAMS appends the cntx_t* cntx, rntm_t* rntm parameters: these // are the expert prototypes. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. // NOTE: the expansion contains declarations and expression statements, so it // must be used inside a function body. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces.
// NOTE: this inclusion follows bli_oapi_ba.h, where EX_SUF and // BLIS_OAPI_EX_PARAMS are defined empty: these are the basic prototypes, // without cntx/rntm parameters. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1f_tapi.h // // Prototype BLAS-like interfaces with typed operands. // NOTE: this inclusion follows bli_tapi_ex.h, so EX_SUF is BLIS_TAPI_EX_SUF and // BLIS_TAPI_EX_PARAMS appends the cntx_t* cntx, rntm_t* rntm parameters: these // are the expert prototypes. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alphax, \ ctype* alphay, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpy2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotaxpyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxaxpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxf ) // end bli_l1f_tapi.h // begin bli_l1f_ft.h // // -- Level-1f function types -------------------------------------------------- // // NOTE: these typedefs list the same parameter types, in the same order, as the // prototypes in bli_l1f_tapi.h, but some parameter names differ (alpha1/alpha2 // here vs alphax/alphay for axpy2v; dim_t m here vs dim_t n for dotaxpyv). // Parameter names do not affect the function-pointer type. // axpy2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha1, \ ctype* alpha2, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpy2v ) // axpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyf ) // dotaxpyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotaxpyv ) // dotxf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype*
alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxf ) // dotxaxpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxaxpyf ) // end bli_l1f_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes.
#undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1f_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alphax, \ ctype* alphay, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpy2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotaxpyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxaxpyf ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjat, \ conj_t 
conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( dotxf ) // end bli_l1f_tapi.h // begin bli_l1f_ft.h // // -- Level-1f function types -------------------------------------------------- // // axpy2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha1, \ ctype* alpha2, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpy2v ) // axpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyf ) // dotaxpyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotaxpyv ) // dotxf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxf ) // dotxaxpyf #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* alpha, \ ctype* a, inc_t inca, inc_t lda, \ ctype* w, inc_t incw, \ ctype* x, inc_t incx, \ ctype* 
beta, \ ctype* y, inc_t incy, \ ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxaxpyf ) // end bli_l1f_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1f_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( axpy2v ) GENPROT( axpyf ) GENPROT( dotaxpyv ) GENPROT( dotxaxpyf ) GENPROT( dotxf ) // end bli_l1f_fpa.h // end bli_l1f.h // -- Level-1m operations -- // begin bli_l1m.h // begin bli_l1m_check.h // // Prototype object-based check functions. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-1m variant function types ------------------------------------------ // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1m_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addm ) INSERT_GENTPROT_BASIC0( copym ) INSERT_GENTPROT_BASIC0( subm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpym ) INSERT_GENTPROT_BASIC0( scal2m ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, 
inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( xpbym ) #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT2_BASIC0( xpbym_md ) INSERT_GENTPROT2_MIXDP0( xpbym_md ) // end bli_l1m_tapi.h // begin bli_l1m_ft.h // // -- Level-1m function types -------------------------------------------------- // // addm, subm, copym #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addm ) INSERT_GENTDEF( subm ) INSERT_GENTDEF( copym ) // axpym #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpym ) // scal2m #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scal2m ) // scalm, setm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ 
ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalm ) INSERT_GENTDEF( setm ) // xpbym #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbym ) INSERT_GENTDEF( xpbym_md ) // end bli_l1m_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. 
The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1m_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addm ) INSERT_GENTPROT_BASIC0( copym ) INSERT_GENTPROT_BASIC0( subm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpym ) INSERT_GENTPROT_BASIC0( scal2m ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( xpbym ) #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ 
dim_t m, \ dim_t n, \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT2_BASIC0( xpbym_md ) INSERT_GENTPROT2_MIXDP0( xpbym_md ) // end bli_l1m_tapi.h // begin bli_l1m_ft.h // // -- Level-1m function types -------------------------------------------------- // // addm, subm, copym #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addm ) INSERT_GENTDEF( subm ) INSERT_GENTDEF( copym ) // axpym #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpym ) // scal2m #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scal2m ) // scalm, setm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalm ) INSERT_GENTDEF( setm ) // xpbym #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ 
ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbym ) INSERT_GENTDEF( xpbym_md ) // end bli_l1m_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1m_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) GENPROT( axpym ) GENPROT( scal2m ) GENPROT( scalm ) GENPROT( setm ) GENPROT( xpbym ) #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty ); GENPROT( xpbym_md ) // end bli_l1m_fpa.h // Prototype level-1m implementations. // begin bli_l1m_unb_var1.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( addm ) INSERT_GENTPROT_BASIC0( copym ) INSERT_GENTPROT_BASIC0( subm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( axpym ) INSERT_GENTPROT_BASIC0( scal2m ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( xpbym ) #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ void PASTEMAC3(chx,chy,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT2_BASIC0( xpbym_md ) INSERT_GENTPROT2_MIXDP0( xpbym_md ) // end bli_l1m_unb_var1.h // Pack-related // begin bli_packm.h // begin bli_packm_alloc.h BLIS_EXPORT_BLIS void* 
bli_packm_alloc ( siz_t size_needed, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void* bli_packm_alloc_ex ( siz_t size_needed, packbuf_t pack_buf_type, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_alloc.h // begin bli_packm_cntl.h struct packm_params_s { uint64_t size; // size field must be present and come first. bszid_t bmid_m; bszid_t bmid_n; bool does_invert_diag; bool rev_iter_if_upper; bool rev_iter_if_lower; pack_t pack_schema; packbuf_t pack_buf_type; }; typedef struct packm_params_s packm_params_t; BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m; } BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n; } BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower; } BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema; } BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type; } // ----------------------------------------------------------------------------- cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, bszid_t bmid_m, bszid_t bmid_n, bool does_invert_diag, bool rev_iter_if_upper, bool rev_iter_if_lower, pack_t pack_schema, packbuf_t pack_buf_type, cntl_t* sub_node ); // end bli_packm_cntl.h // begin 
bli_packm_check.h void bli_packm_init_check ( obj_t* a, obj_t* p, cntx_t* cntx ); void bli_packm_int_check ( obj_t* a, obj_t* p, cntx_t* cntx ); // end bli_packm_check.h // begin bli_packm_init.h BLIS_EXPORT_BLIS bool bli_packm_init ( obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_init.h // begin bli_packm_int.h void bli_packm_int ( obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_int.h // begin bli_packm_scalar.h BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p ); // end bli_packm_scalar.h // begin bli_packm_part.h // -- Matrix partitioning ------------------------------------------------------ void bli_packm_acquire_mpart_t2b( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_packm_acquire_mpart_l2r( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p ); // end bli_packm_part.h // begin bli_packm_struc_cxk.h #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_struc_cxk ) INSERT_GENTPROT_BASIC0( packm_herm_cxk ) INSERT_GENTPROT_BASIC0( packm_tri_cxk ) // end bli_packm_struc_cxk.h // begin bli_packm_struc_cxk_1er.h #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ 
bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er ) INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er ) INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er ) // end bli_packm_struc_cxk_1er.h // begin bli_packm_cxk.h #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t panel_dim, \ dim_t panel_dim_max, \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_cxk ) // end bli_packm_cxk.h // begin bli_packm_cxk_1er.h #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t panel_dim, \ dim_t panel_dim_max, \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ ); INSERT_GENTPROTCO_BASIC0( packm_cxk_1er ) // end bli_packm_cxk_1er.h // Mixed datatype support. 
#ifdef BLIS_ENABLE_GEMM_MD
// begin bli_packm_struc_cxk_md.h

// Everything between the #ifdef above and the matching #endif below is
// compiled only when BLIS_ENABLE_GEMM_MD is defined.
//
// GENTPROT2 is a prototype template. NOTE(review): the INSERT_GENTPROT2_*
// macros that instantiate it are defined elsewhere in this header --
// presumably once per (ctype_c, ctype_p) datatype pair; confirm there.

#undef GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype_p* restrict kappa, \
       ctype_c* restrict c, inc_t incc, inc_t ldc, \
       ctype_p* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )

// GENTPROT2 is redefined here with a shorter signature (conja, m, n, kappa,
// a, p) for the packm_cxk_1e_md and packm_cxk_1r_md prototypes below.

#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t conja, \
       dim_t m, \
       dim_t n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p, inc_t ldp \
     );

INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )

INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )

// end bli_packm_struc_cxk_md.h
#endif
// begin bli_packm_blk_var1.h

//
// packm params types.
//

// Parameter block holding a square table of packm_ker_vft entries, with
// BLIS_NUM_FP_TYPES entries along each axis.
typedef struct
{
	// Index order, per the original column labels: [type of C][type of P].
	packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;

//
// Prototype object-based interfaces.
//

BLIS_EXPORT_BLIS void bli_packm_blk_var1
     (
       obj_t* c,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* t
     );
// end bli_packm_blk_var1.h
// end bli_packm.h
// begin bli_unpackm.h
// begin bli_unpackm_cntl.h

// Parameter struct read through the params pointer of a cntl_t node (see the
// accessor macro below).
struct unpackm_params_s
{
	uint64_t size; // size field must be present and come first.
	unpackm_var_oft var_func;
};
typedef struct unpackm_params_s unpackm_params_t;

// Accessor: casts (cntl)->params to unpackm_params_t* and yields its
// var_func member.
#define bli_cntl_unpackm_params_var_func( cntl ) \
\
	( ( (unpackm_params_t*)(cntl)->params )->var_func )

// -----------------------------------------------------------------------------

cntl_t* bli_unpackm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       void_fp unpackm_var_func,
       cntl_t* sub_node
     );
// end bli_unpackm_cntl.h
// begin bli_unpackm_check.h

void bli_unpackm_int_check
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx
     );
// end bli_unpackm_check.h
// begin bli_unpackm_int.h

void bli_unpackm_int
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_unpackm_int.h
// begin bli_unpackm_blk_var1.h

void bli_unpackm_blk_var1
     (
       obj_t* p,
       obj_t* c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// Typed prototype template for unpackm_blk_var1. Note that the p and c
// buffers are declared as void* here; the ctype macro argument is not used
// in the parameter list.

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       doff_t diagoffc, \
       diag_t diagc, \
       uplo_t uploc, \
       trans_t transc, \
       dim_t m, \
       dim_t n, \
       dim_t m_panel, \
       dim_t n_panel, \
       void* p, inc_t rs_p, inc_t cs_p, \
       dim_t pd_p, inc_t ps_p, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )
// end bli_unpackm_blk_var1.h
// begin bli_unpackm_cxk.h

// Typed prototype template for unpackm_cxk; here the buffers are ctype*.

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conjp, \
       dim_t panel_dim, \
       dim_t panel_len, \
       ctype* kappa, \
       ctype* p, inc_t ldp, \
       ctype* a, inc_t inca, inc_t lda, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_cxk )
// end bli_unpackm_cxk.h
// end bli_unpackm.h
// end bli_l1m.h

// -- Level-2 operations --
// begin bli_l2.h
// begin bli_l2_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// This is the first of two copies of bli_l2_tapi.h / bli_l2_ft.h in this
// flattened header. At this point EX_SUF is BLIS_TAPI_EX_SUF and
// BLIS_TAPI_EX_PARAMS is ",cntx_t* cntx, rntm_t* rntm" (set in the
// bli_tapi_ex.h section just above), so each prototype and function type
// generated below ends with the two expert parameters. That is why the last
// regular parameter in every template has no trailing comma.
// NOTE(review): INSERT_GENTPROT_BASIC0 / INSERT_GENTPROTR_BASIC0 /
// INSERT_GENTDEF / INSERT_GENTDEFR are defined elsewhere; presumably they
// instantiate the current template once per datatype -- confirm.

// gemv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( gemv )

// ger
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( ger )

// hemv, symv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )

// her: alpha is declared as ctype_r* here, unlike syr below (ctype*).
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( her )

// syr
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( syr )

// her2, syr2
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )

// trmv, trsv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )

// end bli_l2_tapi.h
// begin bli_l2_ft.h

//
// -- Level-2 function types ---------------------------------------------------
//

// The typedefs below mirror the prototypes above, one function-pointer type
// per operation; the type name is built by PASTECH3 from ch, opname, EX_SUF
// and tsuf.

// gemv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( gemv )

// ger

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( ger )

// hemv, symv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( her )

// syr

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( syr )

// her2, syr2

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )

// end bli_l2_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h
// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
// (EX_SUF is deliberately given an empty replacement list here.)
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l2_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// This is the second copy of bli_l2_tapi.h / bli_l2_ft.h in this flattened
// header. Here EX_SUF and BLIS_TAPI_EX_PARAMS are both defined empty (see
// just above), so the prototypes and function types generated below take
// only the parameters listed in each template.
// NOTE(review): the resulting names are presumably the un-suffixed basic
// API names; this depends on PASTEMAC2 / PASTECH3, defined elsewhere.

// gemv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( gemv )

// ger
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( ger )

// hemv, symv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )

// her: alpha is declared as ctype_r* here, unlike syr below (ctype*).
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( her )

// syr
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( syr )

// her2, syr2
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )

// trmv, trsv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )

// end bli_l2_tapi.h
// begin bli_l2_ft.h

//
// -- Level-2 function types ---------------------------------------------------
//

// The typedefs below mirror the prototypes above, one function-pointer type
// per operation; the type name is built by PASTECH3 from ch, opname, EX_SUF
// and tsuf.

// gemv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( gemv )

// ger

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( ger )

// hemv, symv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( her )

// syr

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( syr )

// her2, syr2

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )

// end bli_l2_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
//

// NOTE(review): the name and the cntl_orig / cntl_use parameter pair suggest
// that a control tree is created only when the caller did not supply one;
// confirm against the definition, which is not part of this header section.
void bli_l3_cntl_create_if
     (
       opid_t family,
       pack_t schema_a,
       pack_t schema_b,
       obj_t* a,
       obj_t* b,
       obj_t* c,
       rntm_t* rntm,
       cntl_t* cntl_orig,
       cntl_t** cntl_use
     );

void bli_l3_cntl_free
     (
       rntm_t* rntm,
       cntl_t* cntl_use,
       thrinfo_t* thread
     );

// end bli_l3_cntl.h
// begin bli_l3_check.h

//
// Prototype object-based check functions.
//

// Four prototype templates follow, one per operand signature. Each
// GENPROT( op ) line declares the "_check" function for that operation.

// alpha, a, b, beta, c
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
    );

GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )

// side, alpha, a, b, beta, c
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
    );

GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )

// alpha, a, beta, c
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
    );

GENPROT( herk )
GENPROT( syrk )

// side, alpha, a, b
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       cntx_t* cntx \
    );

GENPROT( trmm )
GENPROT( trsm )

// -----------------------------------------------------------------------------

// Explicitly written (non-macro) "basic check" prototypes.

void bli_gemm_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_gemmt_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_hemm_basic_check
     (
       side_t side,
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

// Unlike bli_herk_check above, this takes an extra "ah" operand.
void bli_herk_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

// Unlike bli_her2k_check above, this takes extra "bh" and "ah" operands.
void bli_her2k_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* bh,
       obj_t* b,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_l3_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

// end bli_l3_check.h
// begin bli_l3_int.h

void bli_l3_int
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_int.h
// begin bli_l3_packab.h

// bli_l3_packa and bli_l3_packb share one signature, which matches the
// l3_var_oft function type declared further below.
void bli_l3_packa
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

void bli_l3_packb
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_packab.h

// Define function types.
//#include "bli_l3_ft_ex.h"
// begin bli_l3_ft_ukr.h


#ifndef BLIS_L3_FT_UKR_H
#define BLIS_L3_FT_UKR_H


//
// -- Level-3 micro-kernel function types --------------------------------------
//

// Each GENTDEF below is a typedef template for a micro-kernel function
// pointer type. NOTE(review): INSERT_GENTDEF is defined elsewhere; presumably
// it instantiates the typedef per datatype -- confirm.

// gemm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict beta, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemm )

// gemmtrsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a1x, \
       ctype* restrict a11, \
       ctype* restrict bx1, \
       ctype* restrict b11, \
       ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemmtrsm )

// trsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( trsm )


#endif

// end bli_l3_ft_ukr.h
// begin bli_l3_oft.h


#ifndef BLIS_L3_OFT_H
#define BLIS_L3_OFT_H


//
// -- Level-3 object function types --------------------------------------------
//

// Function-pointer typedef templates for the object-based level-3
// operations; the parameter lists match the expert object API prototypes
// (operands followed by cntx and rntm).

// gemm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* b, \
  obj_t* beta, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( gemm )
GENTDEF( gemmt )
GENTDEF( her2k )
GENTDEF( syr2k )

// hemm, symm, trmm3

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  side_t side, \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* b, \
  obj_t* beta, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( hemm )
GENTDEF( symm )
GENTDEF( trmm3 )

// herk, syrk

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* beta, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( herk )
GENTDEF( syrk )

// trmm, trsm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  side_t side, \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* b, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( trmm )
GENTDEF( trsm )


#endif

// end bli_l3_oft.h
// begin bli_l3_oft_var.h


#ifndef BLIS_L3_OFT_VAR_H
#define BLIS_L3_OFT_VAR_H


//
// -- Level-3 variant function types -------------------------------------------
//

// Single function-pointer type for level-3 variants, instantiated once for
// the name "l3".
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
( \
  obj_t* a, \
  obj_t* b, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm, \
  cntl_t* cntl, \
  thrinfo_t* thread \
);

GENTDEF( l3 )


#endif

// end bli_l3_oft_var.h

// begin bli_l3_blocksize.h

dim_t bli_l3_determine_kc
      (
        dir_t direct,
        dim_t i,
        dim_t dim,
        obj_t* a,
        obj_t* b,
        bszid_t bszid,
        cntx_t* cntx,
        cntl_t* cntl
      );


// Per-operation determine_kc prototypes that take an explicit direction.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dir_t direct, \
       dim_t i, \
       dim_t dim, \
       obj_t* a, \
       obj_t* b, \
       bszid_t bszid, \
       cntx_t* cntx \
     );

GENPROT( gemm_determine_kc )
GENPROT( gemmt_determine_kc )
GENPROT( trmm_determine_kc )
GENPROT( trsm_determine_kc )


// Per-operation "_f" / "_b" determine_kc prototypes; these take no direction
// argument.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t i, \
       dim_t dim, \
       obj_t* a, \
       obj_t* b, \
       bszid_t bszid, \
       cntx_t* cntx \
     );

GENPROT( gemm_determine_kc_f )
GENPROT( gemm_determine_kc_b )

GENPROT( gemmt_determine_kc_f )
GENPROT( gemmt_determine_kc_b )

GENPROT( trmm_determine_kc_f )
GENPROT( trmm_determine_kc_b )

GENPROT( trsm_determine_kc_f )
GENPROT( trsm_determine_kc_b )

// end bli_l3_blocksize.h
// begin bli_l3_direct.h

dir_t bli_l3_direct
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// Per-operation direction queries; unlike bli_l3_direct they take no cntl.
#undef GENPROT
#define GENPROT( opname ) \
\
dir_t PASTEMAC0(opname) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm_direct )
GENPROT( gemmt_direct )
GENPROT( trmm_direct )
GENPROT( trsm_direct )

// end bli_l3_direct.h
// begin bli_l3_prune.h

// One l3_prune_unref_mparts_<dim> prototype per dimension (m, n, k).
#undef GENPROT
#define GENPROT( dim ) \
\
void PASTEMAC(l3_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c, \
       cntl_t* cntl \
     );

GENPROT( m )
GENPROT( n )
GENPROT( k )

// -----------------------------------------------------------------------------

// One <op>_prune_unref_mparts_<dim> prototype per operation/dimension pair.
#undef GENPROT
#define GENPROT( opname, dim ) \
\
void PASTEMAC2(opname,_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm, m )
GENPROT( gemm, n )
GENPROT( gemm, k )

GENPROT( gemmt, m )
GENPROT( gemmt, n )
GENPROT( gemmt, k )

GENPROT( trmm, m )
GENPROT( trmm, n )
GENPROT( trmm, k )

GENPROT( trsm, m )
GENPROT( trsm, n )
GENPROT( trsm, k )

// end bli_l3_prune.h
// begin bli_l3_schema.h

void bli_l3_set_schemas
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx
     );

// end bli_l3_schema.h

// Prototype object APIs (basic and expert).
// begin bli_l3_oapi.h


//
// Prototype object-based interfaces (basic).
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h


//
// Prototype BLAS-like interfaces with typed operands (basic).
//

// In every template below each matrix operand is passed as a pointer
// followed by its row stride (rs_*) and column stride (cs_*).

// gemm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( gemm )

// hemm, symm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       conj_t conja, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )


// herk: both alpha and beta use the ctype_r type parameter.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype_r* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROTR_BASIC0( herk )


// her2k: alpha uses ctype; only beta uses ctype_r.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROTR_BASIC0( her2k )


// syrk
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( syrk )


// gemmt, syr2k
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )


// trmm3
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( trmm3 )


// trmm, trsm: no beta and no separate c operand.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi.h
// begin bli_l3_tapi_ex.h


//
// Prototype BLAS-like interfaces with typed operands (expert).
//

// The expert templates below repeat the basic typed signatures and append
// two trailing parameters (cntx, rntm). The function name is formed from the
// type character, the operation name and BLIS_TAPI_EX_SUF.

// gemm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemm )

// hemm, symm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       conj_t conja, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )


// herk: both alpha and beta use the ctype_r type parameter.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype_r* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( herk )


// her2k: alpha uses ctype; only beta uses ctype_r.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( her2k )


// syrk
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( syrk )


// gemmt, syr2k
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )


// trmm3
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm3 )


// trmm, trsm: no beta and no separate c operand.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi_ex.h

// Define function types for small/unpacked handlers/kernels.
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
#undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_gemm_md_c2r_ref.h // Define a local struct type that makes returning two values easier. typedef struct mddm_s { dom_t comp; dom_t exec; } mddm_t; void bli_gemm_md ( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_local, cntx_t** cntx ); mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); // ----------------------------------------------------------------------------- void bli_gemm_md_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); void bli_gemm_md_zgemm ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary 
if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crr is already unconditionally associated with an // execution domain of BLIS_REAL.) if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_REAL ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since ccr is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) if ( bli_obj_is_complex( c ) && bli_obj_is_complex( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crc is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) 
if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_complex( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemm_md_ker_var2_recast ( num_t* dt_comp, num_t dt_a, num_t dt_b, num_t* dt_c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, obj_t* c, inc_t* rs_c, inc_t* cs_c ) { if ( bli_is_real( *dt_c ) && bli_is_complex( dt_a ) && bli_is_complex( dt_b ) ) { // The rcc case is executed with a real macrokernel, so we need to // double the k dimension (because both A and B are packed to the 1r // schema), and also the panel strides of A and B since they were // packed as complex matrices and we now need to convert them to // units of real elements. *k *= 2; *ps_a *= 2; *ps_b *= 2; } else if ( bli_is_complex( *dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_row_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *n *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; } else #endif { // Generally speaking, the crc case is executed with a complex // macrokernel, so we need to halve the panel stride of A (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. 
*ps_a /= 2; } } else if ( bli_is_complex( *dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_col_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *m *= 2; *pd_a *= 2; *ps_a *= 2; *cs_c *= 2; } else #endif { // Generally speaking, the ccr case is executed with a complex // macrokernel, so we need to halve the panel stride of B (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. *ps_b /= 2; } } #if 0 else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. //printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k ); } else if ( bli_is_complex( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { // No action needed. 
} #endif } // end bli_gemm_md.h #endif // end bli_gemm.h // begin bli_hemm.h // begin bli_hemm_front.h void bli_hemm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_hemm_front.h // end bli_hemm.h // begin bli_symm.h // begin bli_symm_front.h void bli_symm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_symm_front.h // end bli_symm.h // begin bli_trmm.h // begin bli_trmm_front.h void bli_trmm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm_front.h // begin bli_trmm_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); //GENPROT( trmm_blk_var1 ) //GENPROT( trmm_blk_var2 ) //GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) GENPROT( trmm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
//

// Typed gemmt macrokernel template. Compared with the trmm/trsm templates, A
// and B each carry an additional is_* stride and the diagonal offset refers
// to C (diagoffc).
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffc, \
       pack_t schema_a, \
       pack_t schema_b, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       void* alpha, \
       void* a, inc_t cs_a, inc_t is_a, \
       dim_t pd_a, inc_t ps_a, \
       void* b, inc_t rs_b, inc_t is_b, \
       dim_t pd_b, inc_t ps_b, \
       void* beta, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

// end bli_gemmt_var.h
// end bli_gemmt.h
// end bli_l3.h

// -- Utility operations --

// begin bli_util.h
// begin bli_util_check.h

//
// Prototype object-based check functions.
//

// Each GENPROT below is redefined for one argument shape and then
// instantiated for the operations sharing that shape; the generated name is
// the operation name pasted with the _check suffix.

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* asum \
     );

GENPROT( asumv )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x \
     );

GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* norm \
     );

GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* norm \
     );

GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x \
     );

GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* scale, \
       obj_t* sumsq \
     );

GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// NOTE(review): eqsc is the only check prototype generated through GENTPROT
// (here redefined with a single opname argument) rather than GENPROT. It
// works because the macro is #undef'd and redefined locally, but it leaves
// GENTPROT defined with a one-argument form until the next #undef.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* chi, \
       obj_t* psi, \
       bool* is_eq \
     );

GENTPROT( eqsc )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// -----------------------------------------------------------------------------

// Shared check helpers, declared directly rather than through a template.

void bli_utilv_xi_check
     (
       obj_t* x,
       obj_t* index
     );

void bli_utilv_xa_check
     (
       obj_t* x,
       obj_t* asum
     );

void bli_utilm_mkhst_check
     (
       obj_t* a
     );

void bli_utilv_norm_check
     (
       obj_t* x,
       obj_t* norm
     );

void bli_utilm_norm_check
     (
       obj_t* x,
       obj_t* norm
     );

void bli_utilm_fprint_check
     (
       FILE* file,
       char* s1,
       obj_t* x,
       char* format,
       char* s2
     );

void bli_utilm_rand_check
     (
       obj_t* x
     );

void bli_utilv_sumsqv_check
     (
       obj_t* x,
       obj_t* scale,
       obj_t* sumsq
     );

// end bli_util_check.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets it be appended directly after the
// last regular parameter.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_util_oapi.h

//
// Prototype object-based interfaces.
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// These prototypes carry no EX_SUF and no expert parameters, so they are
// emitted only when BLIS_OAPI_BASIC is defined at this point. This inclusion
// of bli_util_oapi.h follows bli_oapi_ex.h, which defines BLIS_OAPI_EXPERT.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// Same as the fprint* shape above minus the FILE* argument.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions). EX_SUF is deliberately defined as empty here.
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// This inclusion of bli_util_oapi.h follows bli_oapi_ba.h, which defines
// BLIS_OAPI_BASIC, so the basic-only prototypes below are emitted on this
// pass.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// Same as the fprint* shape above minus the FILE* argument.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The leading comma lets the expansion be appended directly after the last
// regular parameter of a prototype.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_util_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// GENTPROTR templates take both a type (ctype) and a second type (ctype_r)
// used for the output scalar; GENTPROT templates take a single ctype. All
// generated names are the type character, the operation name, and EX_SUF
// pasted together.

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( asumv )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( mkherm )
INSERT_GENTPROT_BASIC0( mksymm )
INSERT_GENTPROT_BASIC0( mktrim )

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1v )
INSERT_GENTPROTR_BASIC0( normfv )
INSERT_GENTPROTR_BASIC0( normiv )

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1m )
INSERT_GENTPROTR_BASIC0( normfm )
INSERT_GENTPROTR_BASIC0( normim )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randv )
INSERT_GENTPROT_BASIC0( randnv )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randm )
INSERT_GENTPROT_BASIC0( randnm )

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.

// This inclusion follows bli_tapi_ex.h, which defines BLIS_TAPI_EXPERT; the
// prototypes below are emitted only if BLIS_TAPI_BASIC is defined here.
#ifdef BLIS_TAPI_BASIC

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )

// printv/printm take x as void* and are instantiated with the _I variant of
// the insertion macro rather than the plain BASIC0 one.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t n, \
       void* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t m, \
       dim_t n, \
       void* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//

// Each template below declares a function-pointer typedef whose parameter
// list matches the corresponding typed prototype. The typedef name is built
// by pasting the type character, operation name, EX_SUF, and tsuf.

// asumv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// Note: the fprint* typedef names include EX_SUF, but their parameter lists
// do not append BLIS_TAPI_EX_PARAMS.

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 
INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.
// NOTE: these prototypes use PASTEMAC(ch,opname) -- no EX_SUF and no
// BLIS_TAPI_EX_PARAMS -- and are only emitted while BLIS_TAPI_BASIC is defined.

#ifdef BLIS_TAPI_BASIC

// eqsc
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )

// eqv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )

// eqm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )

// printv
// NOTE: the operand is declared as void* rather than ctype*, and a different
// inserter (INSERT_GENTPROT_BASIC0_I) is used than for the prototypes above.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t n, \
       void* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )

// printm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t m, \
       dim_t n, \
       void* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//

// NOTE: Each GENTDEF/GENTDEFR template below declares a function-pointer
// typedef whose name is pasted together by PASTECH3 from the type character,
// the operation name, EX_SUF and the 'tsuf' argument. The parameter lists
// mirror the corresponding prototypes in bli_util_tapi.h.

// asumv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// NOTE: unlike the types above, the fprintv/fprintm types do not append
// BLIS_TAPI_EX_PARAMS to their parameter lists.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.
// NOTE: these types are named with PASTECH2(ch,opname,tsuf), i.e. without
// EX_SUF, and take no BLIS_TAPI_EX_PARAMS.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_util_fpa.h

//
// Prototype function pointer query interface.
//

// Each GENPROT( op ) use below declares one query function taking a num_t
// and returning a function-pointer type. The function name is pasted from the
// operation name, BLIS_TAPI_EX_SUF and "_qfp"; the return type name is pasted
// from the operation name, BLIS_TAPI_EX_SUF and "_vft". (BLIS_TAPI_EX_SUF is
// defined elsewhere.)
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( asumv )
GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )
GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )
GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )
GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )
GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// Same query interface for the operations below, but with no
// BLIS_TAPI_EX_SUF pasted into either the type name or the function name.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )
GENPROT( fprintv )
GENPROT( fprintm )
//GENPROT( printv )
//GENPROT( printm )

// end bli_util_fpa.h

// Prototype level-1m implementations.
// begin bli_util_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// NOTE: the *_unb_var1 prototypes below always take explicit trailing
// 'cntx_t* cntx' and 'rntm_t* rntm' parameters (no BLIS_TAPI_EX_PARAMS), and
// they are not marked BLIS_EXPORT_BLIS.

// asumv_unb_var1
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( asumv_unb_var1 )

// mkherm_unb_var1, mksymm_unb_var1, mktrim_unb_var1
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( mkherm_unb_var1 )
INSERT_GENTPROT_BASIC0( mksymm_unb_var1 )
INSERT_GENTPROT_BASIC0( mktrim_unb_var1 )

// norm1v_unb_var1, normfv_unb_var1, normiv_unb_var1
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfv_unb_var1 )
INSERT_GENTPROTR_BASIC0( normiv_unb_var1 )

// norm1m_unb_var1, normfm_unb_var1, normim_unb_var1
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfm_unb_var1 )
INSERT_GENTPROTR_BASIC0( normim_unb_var1 )

// randv_unb_var1, randnv_unb_var1
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randv_unb_var1 )
INSERT_GENTPROT_BASIC0( randnv_unb_var1 )

// randm_unb_var1, randnm_unb_var1
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randm_unb_var1 )
INSERT_GENTPROT_BASIC0( randnm_unb_var1 )

// sumsqv_unb_var1
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 )

// -----------------------------------------------------------------------------

// eqv_unb_var1
// NOTE: the eq*_unb_var1 variants return bool directly and take neither an
// is_eq output pointer nor cntx/rntm parameters.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
     );

INSERT_GENTPROT_BASIC0( eqv_unb_var1 )

// eqm_unb_var1
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
     );

INSERT_GENTPROT_BASIC0( eqm_unb_var1 )

// fprintv (exported; takes the destination FILE* explicitly)
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintv )

// fprintm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintm )

// end bli_util_unb_var1.h

// end bli_util.h

// -- addon definitions --

// NOTE: These definitions should not be included much earlier since an addon
// may wish to utilize other types and definitions provided by BLIS.
// begin bli_addon.h

#ifndef BLIS_ADDON_H
#define BLIS_ADDON_H

// The "#if 0" below selects the #else branch unconditionally, so this header
// always defines BLIS_DISABLE_ADDONS.
#if 0
#define BLIS_ENABLE_ADDONS
#else
#define BLIS_DISABLE_ADDONS
#endif

// Enabled addons
// (none are listed in this header)

#endif
// end bli_addon.h

// -- sandbox implementation --

// begin bli_sbox.h

#ifndef BLIS_SBOX_H
#define BLIS_SBOX_H

// Each sandbox must have a bli_sandbox.h file present somewhere inside.
// If a sandbox was enabled at configure-time, we need to #include its
// header file here so that it will get pulled into blis.h when it is
// flattened into a monolithic header.
#ifdef BLIS_ENABLE_SANDBOX
#include "bli_sandbox.h" // skipped
#endif

#endif
// end bli_sbox.h

// -- BLAS compatibility layer --

// begin bli_blas.h

// If the CBLAS compatibility layer was enabled while the BLAS layer
// was not enabled, we must enable it here.
#ifdef BLIS_ENABLE_CBLAS
#ifndef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS
#endif
#endif // BLIS_ENABLE_CBLAS

// By default, if the BLAS compatibility layer is enabled, we define
// (include) all of the BLAS prototypes. However, if the user is
// #including "blis.h" and also #including another header that also
// declares the BLAS functions, then we provide an opportunity to
// #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below).
#ifdef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS_DEFS
#else
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the BLAS test drivers are being
// compiled.
#ifdef BLIS_VIA_BLASTEST
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the environment has defined the
// macro BLIS_DISABLE_BLAS_DEFS.
#ifdef BLIS_DISABLE_BLAS_DEFS
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Begin including all BLAS prototypes.
// Everything from here to the matching #endif (further down the file) is only
// compiled when BLIS_ENABLE_BLAS_DEFS is defined.
#ifdef BLIS_ENABLE_BLAS_DEFS

// -- System headers needed by BLAS compatibility layer --

// An #include directive must name a header; a bare "#include" is a hard
// preprocessor error. The angle-bracket operand is restored here.
// NOTE(review): <ctype.h> is what upstream BLIS's bli_blas.h includes at this
// point (for toupper()) -- confirm against the pristine header.
#include <ctype.h> // skipped

// -- Constants --

// Size of the scratch buffer that the bla_*_check macros use to build the
// routine name passed to xerbla: 7 characters plus the terminating NUL.
#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)

// -- Utility macros --

// The helper prototypes below are each wrapped in their own
// "#ifdef BLIS_ENABLE_BLAS" guard, one per original header.

// begin bla_r_sign.h
#ifdef BLIS_ENABLE_BLAS
double bla_r_sign(const bla_real *a, const bla_real *b);
#endif
// end bla_r_sign.h

// begin bla_d_sign.h
#ifdef BLIS_ENABLE_BLAS
double bla_d_sign(const bla_double *a, const bla_double *b);
#endif
// end bla_d_sign.h

// begin bla_r_cnjg.h
#ifdef BLIS_ENABLE_BLAS
void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src);
#endif
// end bla_r_cnjg.h

// begin bla_d_cnjg.h
#ifdef BLIS_ENABLE_BLAS
void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src);
#endif
// end bla_d_cnjg.h

// begin bla_r_imag.h
#ifdef BLIS_ENABLE_BLAS
bla_real bla_r_imag(const bla_scomplex *z);
#endif
// end bla_r_imag.h

// begin bla_d_imag.h
#ifdef BLIS_ENABLE_BLAS
double bla_d_imag(const bla_dcomplex *z);
#endif
// end bla_d_imag.h

// begin bla_c_div.h
#ifdef BLIS_ENABLE_BLAS
void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp);
#endif
// end bla_c_div.h

// begin bla_z_div.h
#ifdef BLIS_ENABLE_BLAS
void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp);
#endif
// end bla_z_div.h

// begin bla_f__cabs.h
#ifdef BLIS_ENABLE_BLAS
double bla_f__cabs(double real, double imag);
#endif
// end bla_f__cabs.h

// begin bla_r_abs.h
#ifdef BLIS_ENABLE_BLAS
double bla_r_abs(const bla_real *x);
#endif
// end bla_r_abs.h

// begin bla_d_abs.h
#ifdef BLIS_ENABLE_BLAS
double bla_d_abs(const bla_double *x);
#endif
// end bla_d_abs.h

// begin bla_c_abs.h
#ifdef BLIS_ENABLE_BLAS
double bla_c_abs(const bla_scomplex *z);
#endif
// end bla_c_abs.h

// begin bla_z_abs.h
#ifdef BLIS_ENABLE_BLAS
double bla_z_abs(const bla_dcomplex *z);
#endif
// end bla_z_abs.h

// begin bla_lsame.h
#ifdef BLIS_ENABLE_BLAS
// The return type and the two string-length parameters are 'long' when
// LAPACK_ILP64 is defined and 'int' otherwise.
// NOTE(review): the ILP64 branch is not marked BLIS_EXPORT_BLAS while the
// #else branch is -- confirm whether that difference is intentional.
#ifdef LAPACK_ILP64
long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
#else
BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
#endif
#endif
// end bla_lsame.h

// begin bla_xerbla.h
#ifdef BLIS_ENABLE_BLAS
BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);
#endif
// end bla_xerbla.h

// begin bla_xerbla_array.h
#ifdef BLIS_ENABLE_BLAS
BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);
#endif
// end bla_xerbla_array.h

// -- Level-0 BLAS prototypes --

// begin bla_cabs1.h
#ifdef BLIS_ENABLE_BLAS
BLIS_EXPORT_BLAS bla_real PASTEF77(s,cabs1)(bla_scomplex *z);
BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);
#endif
// end bla_cabs1.h

// -- Level-1 BLAS prototypes --

// In the sections below the prototype template is (re)defined
// unconditionally, while its instantiation through INSERT_*_BLAS is guarded
// by BLIS_ENABLE_BLAS.

// begin bla_amax.h

//
// Prototype BLAS-to-BLIS interfaces.
//
#undef GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif
// end bla_amax.h

// begin bla_asum.h

//
// Prototype BLAS-to-BLIS interfaces.
//
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif
// end bla_asum.h

// begin bla_axpy.h

//
// Prototype BLAS-to-BLIS interfaces.
//
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif
// end bla_axpy.h

// begin bla_copy.h

//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( copy ) #endif // end bla_copy.h // begin bla_dot.h #ifdef BLIS_ENABLE_BLAS // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTR_BLAS( dot ) #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL INSERT_GENTPROTDOTC_BLAS( dot ) #else // For the "intel" complex return type, we use a hidden parameter (passed by // address) to return the result. #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \ ( \ ftype* rhop, \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTC_BLAS( dot ) #endif // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS float PASTEF77(sd,sdot) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); BLIS_EXPORT_BLAS double PASTEF77(d,sdot) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); #endif // end bla_dot.h // begin bla_nrm2.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end bla_nrm2.h // begin bla_rot.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s); #endif // end bla_rot.h // begin bla_rotg.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s); BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s); #endif // end bla_rotg.h // begin bla_rotm.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam); BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam); #endif // end bla_rotm.h // begin bla_rotmg.h 
// rotmg: written out per datatype (real types only), returning int.
#ifdef BLIS_ENABLE_BLAS
BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam);
BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);
#endif
// end bla_rotmg.h

// begin bla_scal.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// scal: the scalar type (ftype_a) and the vector type (ftype_x) are separate
// template arguments; the name is pasted as chx, cha, blasname.
#undef GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
     ( \
       const f77_int* n, \
       const ftype_a* alpha, \
       ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif
// end bla_scal.h

// begin bla_swap.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// swap: both vectors are non-const.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       ftype* x, const f77_int* incx, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif
// end bla_swap.h

// The f77_*_sub.h sections declare void "sub" variants: the name gains a
// trailing "sub" component, the result is returned through a final 'rval'
// pointer, and instantiation is guarded by BLIS_ENABLE_CBLAS rather than
// BLIS_ENABLE_BLAS.

// begin f77_amax_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       f77_int* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROT_BLAS( amax )
#endif
// end f77_amax_sub.h

// begin f77_asum_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       ftype_r* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif
// end f77_asum_sub.h

// begin f77_dot_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTDOT_BLAS( dot ) // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval ); BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* rval ); #endif // end f77_dot_sub.h // begin f77_nrm2_sub.h // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end f77_nrm2_sub.h // -- Level-2 BLAS prototypes -- // dense // begin bla_gemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemv ) #endif // end bla_gemv.h // begin bla_ger.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, chxy, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \ ( \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTDOT_BLAS( ger ) #endif // end bla_ger.h // begin bla_hemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemv ) #endif // end bla_hemv.h // begin bla_her.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype_r* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her ) #endif // end bla_her.h // begin bla_her2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2 ) #endif // end bla_her2.h // begin bla_symv.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( symv ) #endif // end bla_symv.h // begin bla_syr.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr ) #endif // end bla_syr.h // begin bla_syr2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr2 ) #endif // end bla_syr2.h // begin bla_trmv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmv ) #endif // end bla_trmv.h // begin bla_trsv.h // // Prototype BLAS-to-BLIS interfaces. 
//

// Prototype template for trsv: same argument list as trmv -- three
// character flags, the dimension m, the read-only matrix a and the
// in-place vector x.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int* m, \
       const ftype* a, const f77_int* lda, \
       ftype* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif
// end bla_trsv.h
// begin bla_gemv_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument validation for gemv. The macro expands to a braced block in the
// caller's body. The first failing test stores a nonzero code in info; the
// routine name is then built from dt_str and op_str, upper-cased, and
// passed to xerbla together with info, after which the macro executes
// `return;` in the enclosing function. If every test passes, the block
// falls through without side effects.
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
    f77_int       info  = 0; \
    const f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    const f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    const f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !( nota || ta || conja ) )  info = 1; \
    else if ( *m < 0 )                    info = 2; \
    else if ( *n < 0 )                    info = 3; \
    else if ( *lda < bli_max( 1, *m ) )   info = 6; \
    else if ( *incx == 0 )                info = 8; \
    else if ( *incy == 0 )                info = 11; \
\
    if ( info ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_gemv_check.h
// begin bla_ger_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument validation for ger. Same reporting scheme as the other check
// macros (info -> xerbla -> return), but the routine name is assembled
// from three pieces: dt_str, op_str and conj_str.
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
    f77_int info = 0; \
\
    if      ( *m < 0 )                    info = 1; \
    else if ( *n < 0 )                    info = 2; \
    else if ( *incx == 0 )                info = 5; \
    else if ( *incy == 0 )                info = 7; \
    else if ( *lda < bli_max( 1, *m ) )   info = 9; \
\
    if ( info ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_ger_check.h
// begin bla_hemv_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument validation for hemv (also reused for symv via an alias below).
// uploa must compare equal to "L" or "U" according to lsame.
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
    f77_int       info  = 0; \
    const f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !( lower || upper ) )       info = 1; \
    else if ( *m < 0 )                    info = 2; \
    else if ( *lda < bli_max( 1, *m ) )   info = 5; \
    else if ( *incx == 0 )                info = 7; \
    else if ( *incy == 0 )                info = 10; \
\
    if ( info ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_hemv_check.h
// begin bla_her_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument validation for her (also reused for syr via an alias below).
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
    f77_int       info  = 0; \
    const f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !( lower || upper ) )       info = 1; \
    else if ( *m < 0 )                    info = 2; \
    else if ( *incx == 0 )                info = 5; \
    else if ( *lda < bli_max( 1, *m ) )   info = 7; \
\
    if ( info ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_her_check.h
// begin bla_her2_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument validation for her2 (also reused for syr2 via an alias below).
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
    f77_int       info  = 0; \
    const f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !( lower || upper ) )       info = 1; \
    else if ( *m < 0 )                    info = 2; \
    else if ( *incx == 0 )                info = 5; \
    else if ( *incy == 0 )                info = 7; \
    else if ( *lda < bli_max( 1, *m ) )   info = 9; \
\
    if ( info ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_her2_check.h
// begin bla_symv_check.h

#ifdef BLIS_ENABLE_BLAS

// symv shares the hemv argument checks.
#define bla_symv_check bla_hemv_check

#endif
// end bla_symv_check.h
// begin bla_syr_check.h

#ifdef BLIS_ENABLE_BLAS

// syr shares the her argument checks.
#define bla_syr_check bla_her_check

#endif
// end bla_syr_check.h
// begin bla_syr2_check.h

#ifdef BLIS_ENABLE_BLAS

// syr2 shares the her2 argument checks.
#define bla_syr2_check bla_her2_check

#endif
// end bla_syr2_check.h
// begin bla_trmv_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument validation for trmv (also reused for trsv via an alias below).
// The three character flags are each compared against their accepted
// letters with lsame before the numeric arguments are examined.
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
    f77_int       info  = 0; \
    const f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    const f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    const f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    const f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    const f77_int unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    const f77_int nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !( lower || upper ) )       info = 1; \
    else if ( !( nota || ta || conja ) )  info = 2; \
    else if ( !( unita || nonua ) )       info = 3; \
    else if ( *m < 0 )                    info = 4; \
    else if ( *lda < bli_max( 1, *m ) )   info = 6; \
    else if ( *incx == 0 )                info = 8; \
\
    if ( info ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_trmv_check.h
// begin bla_trsv_check.h

#ifdef BLIS_ENABLE_BLAS

// trsv shares the trmv argument checks.
#define bla_trsv_check bla_trmv_check

#endif
// end bla_trsv_check.h

// packed
// begin bla_hpmv.h

#ifdef BLIS_ENABLE_BLAS

// Packed-storage routines are declared explicitly per datatype and return
// int; the packed matrix is passed as ap with no leading dimension.
BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integer *n,
    const bla_scomplex *alpha, const bla_scomplex *ap,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n,
    const bla_dcomplex *alpha, const bla_dcomplex *ap,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hpmv.h
// begin bla_hpr.h

#ifdef BLIS_ENABLE_BLAS

// hpr: alpha is declared with the real type (bla_real / bla_double) while
// x and ap are complex.
BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha,
    const bla_scomplex *x, const bla_integer *incx,
    bla_scomplex *ap);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha,
    const bla_dcomplex *x, const bla_integer *incx,
    bla_dcomplex *ap);

#endif
// end bla_hpr.h
// begin bla_hpr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integer *n,
    const bla_scomplex *alpha,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *y, const bla_integer *incy,
    bla_scomplex *ap);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n,
    const bla_dcomplex *alpha,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *y, const bla_integer *incy,
    bla_dcomplex *ap);

#endif
// end bla_hpr2.h
// begin bla_spmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha, const bla_double *ap,
    const bla_double *x, const bla_integer *incx,
    const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha, const bla_real *ap,
    const bla_real *x, const bla_integer *incx,
    const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_spmv.h
// begin bla_spr.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr)(const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha,
    const bla_double *x, const bla_integer *incx,
    bla_double *ap);
BLIS_EXPORT_BLAS int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha,
    const bla_real *x, const bla_integer *incx,
    bla_real *ap);

#endif
// end bla_spr.h
// begin bla_spr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha,
    const bla_double *x, const bla_integer *incx,
    const bla_double *y, const bla_integer *incy,
    bla_double *ap);
BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha,
    const bla_real *x, const bla_integer *incx,
    const bla_real *y, const bla_integer *incy,
    bla_real *ap);

#endif
// end bla_spr2.h
// begin bla_tpmv.h

#ifdef BLIS_ENABLE_BLAS

// tpmv: ap is read-only, x is the only writable operand.
BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_double *ap, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_real *ap, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpmv.h
// begin bla_tpsv.h

#ifdef BLIS_ENABLE_BLAS

// tpsv: same argument list as tpmv.
BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_double *ap, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_real *ap, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpsv.h

// banded
// begin bla_gbmv.h

#ifdef BLIS_ENABLE_BLAS

// gbmv: in addition to m and n the caller passes kl and ku; y is the only
// writable operand.
BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)(const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)(const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_double *alpha, const bla_double *a, const bla_integer *lda,
    const bla_double *x, const bla_integer *incx,
    const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)(const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_real *alpha, const bla_real *a, const bla_integer *lda,
    const bla_real *x, const bla_integer *incx,
    const bla_real *beta, bla_real *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)(const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_gbmv.h
// begin bla_hbmv.h
#ifdef BLIS_ENABLE_BLAS

// hbmv prototypes (c and z variants). Each is declared explicitly, returns
// int, and takes every argument by pointer; k is passed in addition to n,
// and y is the only writable operand.
BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hbmv.h
// begin bla_sbmv.h

#ifdef BLIS_ENABLE_BLAS

// sbmv prototypes (d and s variants); same argument layout as hbmv.
BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_double *alpha, const bla_double *a, const bla_integer *lda,
    const bla_double *x, const bla_integer *incx,
    const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_real *alpha, const bla_real *a, const bla_integer *lda,
    const bla_real *x, const bla_integer *incx,
    const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_sbmv.h
// begin bla_tbmv.h

#ifdef BLIS_ENABLE_BLAS

// tbmv prototypes (c, d, s, z variants): a is read-only and x is the only
// writable operand.
BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n, const bla_integer *k,
    const bla_scomplex *a, const bla_integer *lda,
    bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n, const bla_integer *k,
    const bla_double *a, const bla_integer *lda,
    bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n, const bla_integer *k,
    const bla_real *a, const bla_integer *lda,
    bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n, const bla_integer *k,
    const bla_dcomplex *a, const bla_integer *lda,
    bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbmv.h
// begin bla_tbsv.h

#ifdef BLIS_ENABLE_BLAS

// tbsv prototypes (c, d, s, z variants); same argument layout as tbmv.
BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n, const bla_integer *k,
    const bla_scomplex *a, const bla_integer *lda,
    bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n, const bla_integer *k,
    const bla_double *a, const bla_integer *lda,
    bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n, const bla_integer *k,
    const bla_real *a, const bla_integer *lda,
    bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n, const bla_integer *k,
    const bla_dcomplex *a, const bla_integer *lda,
    bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbsv.h


// -- Level-3 BLAS prototypes --

// begin bla_gemm.h

//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemm ) #endif // end bla_gemm.h // begin bla_hemm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemm ) #endif // end bla_hemm.h // begin bla_herk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype_r* alpha, \ const ftype* a, const f77_int* lda, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( herk ) #endif // end bla_herk.h // begin bla_her2k.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2k ) #endif // end bla_her2k.h // begin bla_symm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( symm ) #endif // end bla_symm.h // begin bla_syrk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syrk ) #endif // end bla_syrk.h // begin bla_syr2k.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syr2k ) #endif // end bla_syr2k.h // begin bla_trmm.h // // Prototype BLAS-to-BLIS interfaces. 
//

// Prototype template for trmm. The function is named by
// PASTEF77(ch,blasname) and returns void; a is read-only and b is the only
// writable operand.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int* m, \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       ftype* b, const f77_int* ldb \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmm )
#endif
// end bla_trmm.h
// begin bla_trsm.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype template for trsm: same argument list as trmm.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int* m, \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       ftype* b, const f77_int* ldb \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsm )
#endif
// end bla_trsm.h

// begin bla_gemm_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument validation for gemm. The macro expands to a braced block in the
// caller's body. The first failing test stores a nonzero code in info; the
// routine name is then built from dt_str and op_str, upper-cased, and
// passed to xerbla together with info, after which the macro executes
// `return;` in the enclosing function. If every test passes, the block
// falls through. nrowa/nrowb are the row counts that lda/ldb are compared
// against; they depend on whether the matching trans flag equals "N".
#define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
    f77_int       info  = 0; \
    const f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    const f77_int notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    const f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    const f77_int conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    const f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    const f77_int tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
    const f77_int nrowa = nota ? *m : *k; \
    const f77_int nrowb = notb ? *k : *n; \
\
    if      ( !( nota || conja || ta ) )      info = 1; \
    else if ( !( notb || conjb || tb ) )      info = 2; \
    else if ( *m < 0 )                        info = 3; \
    else if ( *n < 0 )                        info = 4; \
    else if ( *k < 0 )                        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) )    info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) )    info = 10; \
    else if ( *ldc < bli_max( 1, *m    ) )    info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_gemm_check.h
// begin bla_hemm_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument validation for hemm (also reused for symm via an alias below).
// lda is compared against m when sidea equals "L" and against n otherwise.
#define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \
{ \
    f77_int       info  = 0; \
    const f77_int left  = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
    const f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
    const f77_int nrowa = left ? *m : *n; \
\
    if      ( !( left || right ) )            info = 1; \
    else if ( !( lower || upper ) )           info = 2; \
    else if ( *m < 0 )                        info = 3; \
    else if ( *n < 0 )                        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) )    info = 7; \
    else if ( *ldb < bli_max( 1, *m    ) )    info = 9; \
    else if ( *ldc < bli_max( 1, *m    ) )    info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_hemm_check.h
// begin bla_herk_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument validation for herk. Only "N" and "C" are accepted for transa.
// The upper/lower flag is read through the macro's own uploa parameter,
// so the check applies to whatever expression the caller passes in that
// position rather than to a variable captured by name from the caller's
// scope.
#define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    f77_int       info  = 0; \
    const f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    const f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    const f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    const f77_int nrowa = nota ? *m : *k; \
\
    if      ( !( lower || upper ) )           info = 1; \
    else if ( !( nota || conja ) )            info = 2; \
    else if ( *m < 0 )                        info = 3; \
    else if ( *k < 0 )                        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) )    info = 7; \
    else if ( *ldc < bli_max( 1, *m    ) )    info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_herk_check.h
// begin bla_her2k_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument validation for her2k. Only "N" and "C" are accepted for trans;
// lda and ldb are both compared against the same row count.
#define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int       info  = 0; \
    const f77_int nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    const f77_int conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    const f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
    const f77_int nrowa = nota ? *m : *k; \
\
    if      ( !( lower || upper ) )           info = 1; \
    else if ( !( nota || conja ) )            info = 2; \
    else if ( *m < 0 )                        info = 3; \
    else if ( *k < 0 )                        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) )    info = 7; \
    else if ( *ldb < bli_max( 1, nrowa ) )    info = 9; \
    else if ( *ldc < bli_max( 1, *m    ) )    info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_her2k_check.h
// begin bla_symm_check.h

#ifdef BLIS_ENABLE_BLAS

// symm shares the hemm argument checks.
#define bla_symm_check bla_hemm_check

#endif
// end bla_symm_check.h
// begin bla_syrk_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument validation for syrk. The first character of dt_str selects the
// accepted trans letters: when it is 's' or 'd', "C" is accepted in
// addition to "N" and "T"; otherwise only "N" and "T" pass. As in
// bla_herk_check, the upper/lower flag is read through the uploa
// parameter. dt_cst is a plain const pointer, so dt_str does not need to
// be a constant expression.
#define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    f77_int       info   = 0; \
    const char*   dt_cst = dt_str; \
    const f77_int is_r   = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    const f77_int nota   = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    const f77_int ta     = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    const f77_int cta    = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    const f77_int lower  = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int upper  = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    const f77_int nrowa  = nota ? *m : *k; \
\
    if      ( !( lower || upper ) )                 info = 1; \
    else if ( !( nota || ta || ( is_r && cta ) ) )  info = 2; \
    else if ( *m < 0 )                              info = 3; \
    else if ( *k < 0 )                              info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) )          info = 7; \
    else if ( *ldc < bli_max( 1, *m    ) )          info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_syrk_check.h
// begin bla_syr2k_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument validation for syr2k. Same trans rule as bla_syrk_check (the
// first character of dt_str decides whether "C" is accepted); lda and ldb
// are both compared against the same row count.
#define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int       info   = 0; \
    const char*   dt_cst = dt_str; \
    const f77_int is_r   = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    const f77_int nota   = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    const f77_int ta     = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \
    const f77_int cta    = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    const f77_int lower  = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int upper  = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
    const f77_int nrowa  = nota ? *m : *k; \
\
    if      ( !( lower || upper ) )                 info = 1; \
    else if ( !( nota || ta || ( is_r && cta ) ) )  info = 2; \
    else if ( *m < 0 )                              info = 3; \
    else if ( *k < 0 )                              info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) )          info = 7; \
    else if ( *ldb < bli_max( 1, nrowa ) )          info = 9; \
    else if ( *ldc < bli_max( 1, *m    ) )          info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_syr2k_check.h
// begin bla_trmm_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument validation for trmm (also reused for trsm via an alias below).
// All four character flags are compared against their accepted letters
// before the numeric arguments are examined; lda is compared against m
// when sidea equals "L" and against n otherwise.
#define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \
{ \
    f77_int       info  = 0; \
    const f77_int left  = PASTEF770(lsame)( sidea,  "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int right = PASTEF770(lsame)( sidea,  "R", (ftnlen)1, (ftnlen)1 ); \
    const f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    const f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    const f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    const f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    const f77_int unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    const f77_int nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
    const f77_int nrowa = left ? *m : *n; \
\
    if      ( !( left || right ) )            info = 1; \
    else if ( !( lower || upper ) )           info = 2; \
    else if ( !( nota || ta || conja ) )      info = 3; \
    else if ( !( unita || nonua ) )           info = 4; \
    else if ( *m < 0 )                        info = 5; \
    else if ( *n < 0 )                        info = 6; \
    else if ( *lda < bli_max( 1, nrowa ) )    info = 9; \
    else if ( *ldb < bli_max( 1, *m    ) )    info = 11; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_trmm_check.h
// begin bla_trsm_check.h

#ifdef BLIS_ENABLE_BLAS

// trsm shares the trmm argument checks.
#define bla_trsm_check bla_trmm_check

#endif
// end bla_trsm_check.h

// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype template for axpby: x is read-only, y is the only writable
// operand, and both alpha and beta are passed by pointer.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       const ftype* beta, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif
// end bla_axpby.h

// level-3
// begin bla_gemmt.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype template for gemmt: takes an uploc flag plus both trans flags,
// and only the dimensions m and k (there is no separate n argument).
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int* m, \
       const f77_int* k, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* b, const f77_int* ldb, \
       const ftype* beta, \
       ftype* c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif
// end bla_gemmt.h
// begin bla_gemmt_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument validation for gemmt. Like bla_gemm_check, but with a leading
// uploc test and with m used wherever gemm would use n.
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
    f77_int       info  = 0; \
    const f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    const f77_int notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    const f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    const f77_int conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    const f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    const f77_int tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
    const f77_int lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
    const f77_int upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
    const f77_int nrowa = nota ? *m : *k; \
    const f77_int nrowb = notb ? *k : *m; \
\
    if      ( !( lower || upper ) )           info = 1; \
    else if ( !( nota || conja || ta ) )      info = 2; \
    else if ( !( notb || conjb || tb ) )      info = 3; \
    else if ( *m < 0 )                        info = 4; \
    else if ( *k < 0 )                        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) )    info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) )    info = 10; \
    else if ( *ldc < bli_max( 1, *m    ) )    info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
        return; \
    } \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype template for gemm_batch. Every gemm argument is replaced by
// an *_array pointer (the matrices become pointer-to-pointer), followed
// by group_count and group_size.
// NOTE(review): the array lengths are presumably governed by group_count
// and group_size -- confirm against the implementation.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa_array, \
       const f77_char* transb_array, \
       const f77_int* m_array, \
       const f77_int* n_array, \
       const f77_int* k_array, \
       const ftype* alpha_array, \
       const ftype** a_array, const f77_int* lda_array, \
       const ftype** b_array, const f77_int* ldb_array, \
       const ftype* beta_array, \
       ftype** c_array, const f77_int* ldc_array, \
       const f77_int* group_count, \
       const f77_int* group_size \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif
// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h

//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( gemm3m ) #endif // end bla_gemm3m.h // begin bla_gemm3m_check.h #ifdef BLIS_ENABLE_BLAS #define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, notb; \ f77_int conja, conjb; \ f77_int ta, tb; \ f77_int nrowa, nrowb; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ if ( notb ) { nrowb = *k; } \ else { nrowb = *n; } \ \ if ( !nota && !conja && !ta ) \ info = 1; \ else if ( !notb && !conjb && !tb ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *k < 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 8; \ else if ( *ldb < bli_max( 1, nrowb ) ) \ info = 10; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 13; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_gemm3m_check.h // -- Fortran-compatible APIs to BLIS functions -- // begin b77_thread.h // // Prototype Fortran-compatible BLIS interfaces. 
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); // end b77_thread.h #endif // BLIS_ENABLE_BLAS // end bli_blas.h // -- CBLAS compatibility layer -- // begin bli_cblas.h #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. // begin cblas.h #ifndef CBLAS_H #define CBLAS_H #include // skipped // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. 
// Exactly one of BLIS_ARCH_64 / BLIS_ARCH_32 is defined: 64 when any of the
// listed predefined macros is present, 32 otherwise.
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
    defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64)
#define BLIS_ARCH_64
#else
#define BLIS_ARCH_32
#endif

// Determine the target operating system.
// The tests form one #if/#elif chain, so only the first matching branch
// defines a macro and the order of the tests is significant (for example,
// __ANDROID__ is tested before __linux__). If BLIS_ENABLE_SYSTEM is defined
// and no branch matches, compilation stops with the #error below.
//
// NOTE: BLIS_OS_EMSCRIPTEN and BLIS_OS_NONE are defined WITHOUT a value,
// unlike the other BLIS_OS_* macros, which are defined as 1. Test those two
// with defined()/#ifdef; a bare "#if BLIS_OS_NONE" would expand to an empty
// expression.
#if defined(BLIS_ENABLE_SYSTEM)
#if defined(_WIN32) || defined(__CYGWIN__)
#define BLIS_OS_WINDOWS 1
#elif defined(__gnu_hurd__)
#define BLIS_OS_GNU 1
#elif defined(__APPLE__) || defined(__MACH__)
#define BLIS_OS_OSX 1
#elif defined(__ANDROID__)
#define BLIS_OS_ANDROID 1
#elif defined(__linux__)
#define BLIS_OS_LINUX 1
#elif defined(__bgq__)
#define BLIS_OS_BGQ 1
#elif defined(__bg__)
#define BLIS_OS_BGP 1
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
      defined(__bsdi__) || defined(__DragonFly__) || \
      defined(__FreeBSD_kernel__) || defined(__HAIKU__)
#define BLIS_OS_BSD 1
#elif defined(EMSCRIPTEN)
#define BLIS_OS_EMSCRIPTEN
#else
#error "Cannot determine operating system"
#endif
#else // #if defined(BLIS_DISABLE_SYSTEM)
#define BLIS_OS_NONE
#endif

// A few changes that may be necessary in Windows environments.
// (BLIS_OS_WINDOWS is either defined as 1 above or left undefined, in which
// case it evaluates to 0 in this #if.)
#if BLIS_OS_WINDOWS

// Include Windows header file.
#define WIN32_LEAN_AND_MEAN
#define VC_EXTRALEAN
#include // skipped

#if !defined(__clang__) && !defined(__GNUC__)
// Undefine attribute specifiers in Windows.
// (Implemented by defining __attribute__(x) to expand to nothing.)
#define __attribute__(x)

// Undefine restrict.
// (Implemented by defining restrict to expand to nothing.)
#define restrict
#endif

#endif

// time.h provides clock_gettime().
// One header is included per platform branch below; the header names are not
// present in this copy of the file ("#include // skipped").
#if BLIS_OS_WINDOWS
#include // skipped
#elif BLIS_OS_OSX
#include // skipped
#else
//#include
#include // skipped
#endif


#endif
// end bli_system.h
// begin bli_config.h


#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H

// NOTE(review): the literal conditions in this section (#if 1, #if 0,
// 64 == 64, 32 == 32) look like values substituted into a template by a
// configure step -- confirm before hand-editing any of them.

// Enabled configuration "family" (config_name)
#define BLIS_FAMILY_POWER9


// Enabled sub-configurations (config_list)
#define BLIS_CONFIG_POWER9


// Enabled kernel sets (kernel_list)
#define BLIS_KERNELS_POWER9


#if 1
#define BLIS_ENABLE_SYSTEM
#else
#define BLIS_DISABLE_SYSTEM
#endif

// Neither OpenMP nor pthreads is enabled by this configuration.
#if 0
#define BLIS_ENABLE_OPENMP
#endif

#if 0
#define BLIS_ENABLE_PTHREADS
#endif

// Of the two JRIR options, only SLAB is enabled here.
#if 1
#define BLIS_ENABLE_JRIR_SLAB
#endif

#if 0
#define BLIS_ENABLE_JRIR_RR
#endif

#if 1
#define BLIS_ENABLE_PBA_POOLS
#else
#define BLIS_DISABLE_PBA_POOLS
#endif

#if 1
#define BLIS_ENABLE_SBA_POOLS
#else
#define BLIS_DISABLE_SBA_POOLS
#endif

#if 0
#define BLIS_ENABLE_MEM_TRACING
#else
#define BLIS_DISABLE_MEM_TRACING
#endif

// With the literals below, BLIS_INT_TYPE_SIZE is defined as 64.
#if 64 == 64
#define BLIS_INT_TYPE_SIZE 64
#elif 64 == 32
#define BLIS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// With the literals below, BLIS_BLAS_INT_TYPE_SIZE is defined as 32.
#if 32 == 64
#define BLIS_BLAS_INT_TYPE_SIZE 64
#elif 32 == 32
#define BLIS_BLAS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// The four blocks below only apply their built-in choice when the including
// code has defined neither the ENABLE nor the DISABLE form beforehand, so a
// definition made before this header is included takes precedence.
// Built-in choices: BLAS disabled, CBLAS disabled, mixed-datatype enabled,
// mixed-datatype extra memory enabled.
#ifndef BLIS_ENABLE_BLAS
#ifndef BLIS_DISABLE_BLAS
#if 0
#define BLIS_ENABLE_BLAS
#else
#define BLIS_DISABLE_BLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_CBLAS
#ifndef BLIS_DISABLE_CBLAS
#if 0
#define BLIS_ENABLE_CBLAS
#else
#define BLIS_DISABLE_CBLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_MIXED_DT
#ifndef BLIS_DISABLE_MIXED_DT
#if 1
#define BLIS_ENABLE_MIXED_DT
#else
#define BLIS_DISABLE_MIXED_DT
#endif
#endif
#endif

#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#if 1
#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#else
#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#endif
#endif
#endif

#if 1
#define BLIS_ENABLE_SUP_HANDLING
#else
#define BLIS_DISABLE_SUP_HANDLING
#endif

#if 0
#define BLIS_ENABLE_MEMKIND
#else
#define BLIS_DISABLE_MEMKIND
#endif

#if 1
#define BLIS_ENABLE_TRSM_PREINVERSION
#else
#define BLIS_DISABLE_TRSM_PREINVERSION
#endif

// With the literal "#if 0" conditions below, the DISABLE form is the one that
// gets defined for each of: pragma omp simd, sandbox, shared library, and
// Intel-style complex return.
#if 0
#define BLIS_ENABLE_PRAGMA_OMP_SIMD
#else
#define BLIS_DISABLE_PRAGMA_OMP_SIMD
#endif

#if 0
#define BLIS_ENABLE_SANDBOX
#else
#define BLIS_DISABLE_SANDBOX
#endif

#if 0
#define BLIS_ENABLE_SHARED
#else
#define BLIS_DISABLE_SHARED
#endif

#if 0
#define BLIS_ENABLE_COMPLEX_RETURN_INTEL
#else
#define BLIS_DISABLE_COMPLEX_RETURN_INTEL
#endif


#endif
// end bli_config.h
// begin bli_config_macro_defs.h


#ifndef BLIS_CONFIG_MACRO_DEFS_H
#define BLIS_CONFIG_MACRO_DEFS_H


// -- INTEGER PROPERTIES -------------------------------------------------------

// The bit size of the integer type used to track values such as dimensions,
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
// integers while 64 results in 64-bit integers. Any other value results in use
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
// This fallback only applies when BLIS_INT_TYPE_SIZE is still undefined at
// this point; it then follows BLIS_ARCH_64.
#ifndef BLIS_INT_TYPE_SIZE
#ifdef BLIS_ARCH_64
#define BLIS_INT_TYPE_SIZE 64
#else
#define BLIS_INT_TYPE_SIZE 32
#endif
#endif


// -- FLOATING-POINT PROPERTIES ------------------------------------------------

// Enable use of built-in C99 "float complex" and "double complex" types and
// associated overloaded operations and functions? Disabling results in
// scomplex and dcomplex being defined in terms of simple structs.
// NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN.
// (Both branches of this conditional, and of the two threading conditionals
// that follow, contain only comments; they define nothing.)
#ifdef BLIS_ENABLE_C99_COMPLEX
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif


// -- MULTITHREADING -----------------------------------------------------------

// Enable multithreading via POSIX threads.
#ifdef BLIS_ENABLE_PTHREADS
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif

// Enable multithreading via OpenMP.
#ifdef BLIS_ENABLE_OPENMP
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif

// Perform a sanity check to make sure the user doesn't try to enable
// both OpenMP and pthreads.
#if defined ( BLIS_ENABLE_OPENMP ) && \
    defined ( BLIS_ENABLE_PTHREADS )
#error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined."
#endif

// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP
// or pthreads are enabled. This macro is useful in situations when
// we want to detect use of either OpenMP or pthreads (as opposed
// to neither being used).
#if defined ( BLIS_ENABLE_OPENMP ) || \
    defined ( BLIS_ENABLE_PTHREADS )
#define BLIS_ENABLE_MULTITHREADING
#endif

// Enable the use of prime numbers of threads when requesting automatic thread
// factorization. When disabled, requesting a prime number of threads will
// result in a reduction (by one) of the number of threads, provided that the
// prime number exceeds a minimum threshold (see below).
// Net effect of the conditional: if the ENABLE macro is defined, the DISABLE
// macro is removed; otherwise the DISABLE macro is (re)defined with an empty
// value, whatever value it may have had before.
#ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS
#undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#else
// Default behavior is disabled.
#undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled.
#define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#endif

// Set the maximum requested number of threads that BLIS will accept from the
// user that may be prime. If a larger prime number of threads is requested,
// it will be reduced by one to allow for more efficient thread factorizations.
// This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined.
// NOTE(review): the sentence above appears to conflict with the description
// of the option itself, which says the reduction happens when the feature is
// DISABLED -- confirm which macro actually gates the use of BLIS_NT_MAX_PRIME.
#ifndef BLIS_NT_MAX_PRIME
#define BLIS_NT_MAX_PRIME 11
#endif


// -- MIXED DATATYPE SUPPORT ---------------------------------------------------

// Enable mixed datatype support?
// BLIS_ENABLE_GEMM_MD ends up defined unless BLIS_DISABLE_MIXED_DT is defined.
#ifdef BLIS_DISABLE_MIXED_DT
#undef BLIS_ENABLE_GEMM_MD
#else
// Default behavior is enabled.
#define BLIS_ENABLE_GEMM_MD
#endif

// Enable memory-intensive optimizations for mixed datatype support?
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#else
// Default behavior is enabled.
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#endif


// -- MISCELLANEOUS OPTIONS ----------------------------------------------------

// Do NOT require the cross-blocksize constraints. That is, do not enforce
// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY
// needed when implementing trsm_r by allowing the right-hand matrix B to
// be triangular.
#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#endif


// -- BLAS COMPATIBILITY LAYER -------------------------------------------------

// Enable the BLAS compatibility layer?
// BLIS_ENABLE_BLAS ends up defined (with an empty value) unless
// BLIS_DISABLE_BLAS is defined, in which case it is removed.
#ifdef BLIS_DISABLE_BLAS
#undef BLIS_ENABLE_BLAS
#else
// Default behavior is enabled.
#undef BLIS_ENABLE_BLAS // In case user explicitly enabled.
#define BLIS_ENABLE_BLAS
#endif

// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#ifndef BLIS_BLAS_INT_TYPE_SIZE
#define BLIS_BLAS_INT_TYPE_SIZE 32
#endif

// By default, the level-3 BLAS routines are implemented by directly calling
// the BLIS object API. Alternatively, they may first call the typed BLIS
// API, which will then call the object API.
//#define BLIS_BLAS3_CALLS_TAPI
#ifdef BLIS_BLAS3_CALLS_TAPI
#undef BLIS_BLAS3_CALLS_OAPI
#else
// Default behavior is to call object API directly.
#undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled.
#define BLIS_BLAS3_CALLS_OAPI
#endif


// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------

// Enable the CBLAS compatibility layer?
// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer
// regardless of whether or not it was explicitly enabled above. Furthermore,
// the CBLAS compatibility layer will use the integer type size definition
// specified above when defining the size of its own integers (regardless of
// whether the BLAS layer was enabled directly or indirectly).
#ifdef BLIS_ENABLE_CBLAS
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif


// -- SHARED LIBRARY SYMBOL EXPORT ---------------------------------------------

// When building shared libraries, we can control which symbols are exported for
// linking by external applications. BLIS annotates all function prototypes that
// are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing
// a similar role for BLAS compatibility routines). Which symbols are exported
// is controlled by the default symbol visibility, as specified by the gcc option
// -fvisibility=[default|hidden]. The default for this option is 'default', or,
// "public", which, if allowed to stand, causes all symbols in BLIS to be
// linkable from the outside. But when compiling with -fvisibility=hidden, all
// symbols start out hidden (that is, restricted only for internal use by BLIS),
// with that setting overridden only for function prototypes or variable
// declarations that are annotated with BLIS_EXPORT_BLIS.
//
// Resulting value of BLIS_EXPORT when it was not already defined:
//   - BLIS_ENABLE_SHARED not defined          -> empty
//   - _WIN32 or __CYGWIN__, building library  -> __declspec(dllexport)
//   - _WIN32 or __CYGWIN__, otherwise         -> __declspec(dllimport)
//   - __GNUC__ defined and >= 4               -> visibility("default") attribute
//   - anything else                           -> empty
#ifndef BLIS_EXPORT
#if !defined(BLIS_ENABLE_SHARED)
#define BLIS_EXPORT
#else
#if defined(_WIN32) || defined(__CYGWIN__)
#ifdef BLIS_IS_BUILDING_LIBRARY
#define BLIS_EXPORT __declspec(dllexport)
#else
#define BLIS_EXPORT __declspec(dllimport)
#endif
#elif defined(__GNUC__) && __GNUC__ >= 4
#define BLIS_EXPORT __attribute__ ((visibility ("default")))
#else
#define BLIS_EXPORT
#endif
#endif
#endif

// All three export annotations currently expand to the same BLIS_EXPORT.
#define BLIS_EXPORT_BLIS BLIS_EXPORT
#define BLIS_EXPORT_BLAS BLIS_EXPORT
#define BLIS_EXPORT_ADDON BLIS_EXPORT


// -- OVERRIDABLE (WEAK) SYMBOLS -----------------------------------------------

// On Linux, functions called from a shared library can be overridden by the main
// program simply by providing a new definition. However, macOS uses a "two-level
// namespace" which causes calls to shared library functions to be tied to the
// library and not overridable. As a workaround, certain symbols can be defined
// as "weak" and are given lower preference during linking.
#ifndef BLIS_OVERRIDABLE
#if BLIS_OS_OSX
#define BLIS_OVERRIDABLE __attribute__((weak))
#else
#define BLIS_OVERRIDABLE
#endif
#endif


// -- STATIC INLINE FUNCTIONS --------------------------------------------------

// C and C++ have different semantics for defining "inline" functions. In C,
// the keyword phrase "static inline" accomplishes this, though the "inline"
// is optional. In C++, the "inline" keyword is required and obviates "static"
// altogether. Why does this matter? While BLIS is compiled in C99, blis.h may
// be #included by a source file that is compiled with C++.
// (For C, the active definition below is plain "static"; the "static inline"
// form is commented out.)
#ifdef __cplusplus
#define BLIS_INLINE inline
#else
//#define BLIS_INLINE static inline
#define BLIS_INLINE static
#endif


#endif

// end bli_config_macro_defs.h
// begin bli_type_defs.h


#ifndef BLIS_TYPE_DEFS_H
#define BLIS_TYPE_DEFS_H


//
// -- BLIS basic types ---------------------------------------------------------
//

#ifdef __cplusplus
// For C++, include stdint.h.
#include // skipped
#elif __STDC_VERSION__ >= 199901L
// For C99 (or later), include stdint.h.
#include // skipped
#include // skipped
#else
// When stdint.h is not available, manually typedef the types we will use.
// Only _WIN32 has a fallback here; any other pre-C99 target hits the #error.
#ifdef _WIN32
typedef __int32 int32_t;
typedef unsigned __int32 uint32_t;
typedef __int64 int64_t;
typedef unsigned __int64 uint64_t;
#else
#error "Attempting to compile on pre-C99 system without stdint.h."
#endif
#endif

// -- General-purpose integers --

// If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits.
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
//
// The guard must test BLIS_BLAS_INT_TYPE_SIZE, which is the macro that the
// configuration headers define and that selects f77_int. It previously
// tested BLIS_BLAS_INT_SIZE, a name that is not defined by the configuration;
// an undefined identifier evaluates to 0 inside #if, so the comparison with
// 64 was always false and the promotion below could never take effect.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
#undef  BLIS_INT_TYPE_SIZE
#define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested.
// gint_t is the signed general-purpose integer and guint_t its unsigned
// counterpart; any size other than 32 or 64 falls back to "long int".
#if BLIS_INT_TYPE_SIZE == 32
typedef int32_t gint_t;
typedef uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
typedef int64_t gint_t;
typedef uint64_t guint_t;
#else
typedef signed long int gint_t;
typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h.
// (Existing definitions of TRUE / FALSE are left untouched.)
#ifndef TRUE
#define TRUE true
#endif

#ifndef FALSE
#define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef gint_t dim_t; // dimension type
#endif
typedef gint_t inc_t; // increment/stride type
typedef gint_t doff_t; // diagonal offset type
typedef guint_t siz_t; // byte size type
typedef uint32_t objbits_t; // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES 4
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
#define BLIS_SIZEOF_S 4 // sizeof(float)
#define BLIS_SIZEOF_D 8 // sizeof(double)
#define BLIS_SIZEOF_C 8 // sizeof(scomplex)
#define BLIS_SIZEOF_Z 16 // sizeof(dcomplex)

// -- Complex types --

// scomplex / dcomplex are either the C99 complex types (when
// BLIS_ENABLE_C99_COMPLEX is defined and the compiler reports C99 or later)
// or plain structs with "real" and "imag" members.
#ifdef BLIS_ENABLE_C99_COMPLEX

#if __STDC_VERSION__ >= 199901L
#include // skipped

// Typedef official complex types to BLIS complex type names.
typedef float complex scomplex;
typedef double complex dcomplex;
#else
#error "Configuration requested C99 complex types, but C99 does not appear to be supported."
#endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_SCOMPLEX
#define _DEFINED_SCOMPLEX
typedef struct scomplex
{
	float real;
	float imag;
} scomplex;
#endif

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DCOMPLEX
#define _DEFINED_DCOMPLEX
typedef struct dcomplex
{
	double real;
	double imag;
} dcomplex;
#endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any BLIS_BLAS_INT_TYPE_SIZE other than 32 or 64 falls back to "long int".
#if BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t f77_int;
#else
typedef long int f77_int;
#endif

typedef char f77_char;
typedef float f77_float;
typedef double f77_double;
typedef scomplex f77_scomplex;
typedef dcomplex f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
// NOTE(review): despite its name, void_fp is currently a typedef for void*
// (an object pointer); the function-pointer form is the commented-out line.
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void (*free_ft) ( void* p );

//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT value is the position of the lowest bit of a field. Several
// names intentionally share a value: a multi-bit field (e.g. DATATYPE,
// CONJTRANS, UPLO, PACK_SCHEMA) starts at the same bit as its first
// single-bit component (DOMAIN, TRANS, UPPER, PACK_RC).

// info
#define BLIS_DATATYPE_SHIFT 0
#define BLIS_DOMAIN_SHIFT 0
#define BLIS_PRECISION_SHIFT 1
#define BLIS_CONJTRANS_SHIFT 3
#define BLIS_TRANS_SHIFT 3
#define BLIS_CONJ_SHIFT 4
#define BLIS_UPLO_SHIFT 5
#define BLIS_UPPER_SHIFT 5
#define BLIS_DIAG_SHIFT 6
#define BLIS_LOWER_SHIFT 7
#define BLIS_UNIT_DIAG_SHIFT 8
#define BLIS_INVERT_DIAG_SHIFT 9
#define BLIS_TARGET_DT_SHIFT 10
#define BLIS_TARGET_DOMAIN_SHIFT 10
#define BLIS_TARGET_PREC_SHIFT 11
#define BLIS_EXEC_DT_SHIFT 13
#define BLIS_EXEC_DOMAIN_SHIFT 13
#define BLIS_EXEC_PREC_SHIFT 14
#define BLIS_PACK_SCHEMA_SHIFT 16
#define BLIS_PACK_RC_SHIFT 16
#define BLIS_PACK_PANEL_SHIFT 17
#define BLIS_PACK_FORMAT_SHIFT 18
#define BLIS_PACK_SHIFT 22
#define BLIS_PACK_REV_IF_UPPER_SHIFT 23
#define BLIS_PACK_REV_IF_LOWER_SHIFT 24
#define BLIS_PACK_BUFFER_SHIFT 25
#define BLIS_STRUC_SHIFT 27
#define BLIS_COMP_DT_SHIFT 29
#define BLIS_COMP_DOMAIN_SHIFT 29
#define BLIS_COMP_PREC_SHIFT 30

// info2
#define BLIS_SCALAR_DT_SHIFT 0
#define BLIS_SCALAR_DOMAIN_SHIFT 0
#define BLIS_SCALAR_PREC_SHIFT 1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is a field-width constant shifted to the field's offset.

// info
#define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT )
#define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT )
#define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT )
#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define 
BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 
0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = 
BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } 
machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. #define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, 
BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, 
BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // - keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
} bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. // Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. 
// begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. 
mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. //num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. 
typedef struct constdata_s { float s; double d; scomplex c; dcomplex z; gint_t i; } constdata_t; // // -- BLIS object type definitions --------------------------------------------- // // Forward declarations for function pointer types struct obj_s; struct cntx_s; struct rntm_s; struct thrinfo_s; typedef void (*obj_pack_fn_t) ( struct obj_s* a, struct obj_s* ap, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) ( struct obj_s* a, struct obj_s* b, struct obj_s* c, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef struct obj_s { // Basic fields struct obj_s* root; dim_t off[2]; dim_t dim[2]; doff_t diag_off; objbits_t info; objbits_t info2; siz_t elem_size; void* buffer; inc_t rs; inc_t cs; inc_t is; // Bufferless scalar storage atom_t scalar; // Pack-related fields dim_t m_padded; // m dimension of matrix, including any padding dim_t n_padded; // n dimension of matrix, including any padding inc_t ps; // panel stride (distance to next panel) inc_t pd; // panel dimension (the "width" of a panel: // usually MR or NR) dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel // User-customizable fields obj_pack_fn_t pack_fn; void* pack_params; obj_ker_fn_t ker_fn; void* ker_params; } obj_t; // Pre-initializors. 
Things that must be set afterwards: // - root object pointer // - info bitfields: dt, target_dt, exec_dt, comp_dt // - info2 bitfields: scalar_dt // - elem_size // - dims, strides // - buffer // - internal scalar buffer (must always set imaginary component) #define BLIS_OBJECT_INITIALIZER \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 0, 0 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( float ), \ \ .buffer = NULL, \ .rs = 0, \ .cs = 0, \ .is = 1, \ \ .scalar = { 0.0, 0.0 }, \ \ .m_padded = 0, \ .n_padded = 0, \ .ps = 0, \ .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ \ .pack_fn = NULL, \ .pack_params = NULL, \ .ker_fn = NULL, \ .ker_params = NULL \ } #define BLIS_OBJECT_INITIALIZER_1X1 \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( float ), \ \ .buffer = NULL, \ .rs = 0, \ .cs = 0, \ .is = 1, \ \ .scalar = { 0.0, 0.0 }, \ \ .m_padded = 0, \ .n_padded = 0, \ .ps = 0, \ .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ \ .pack_fn = NULL, \ .pack_params = NULL, \ .ker_fn = NULL, \ .ker_params = NULL \ } // Define these macros here since they must be updated if contents of // obj_t changes. 
// Make b a complete shallow copy of a.
//
// Every member declared in obj_t (root, off, dim, diag_off, info, info2,
// elem_size, buffer, rs, cs, is, scalar, the pack-related dimensions and
// strides, and the four user-customizable pack/ker fields) is transferred,
// so a plain structure assignment yields the same result as copying the
// members one at a time. Pointer members are copied as pointers; nothing
// they refer to is duplicated.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
	*b = *a;
}

// Initialize b from a in preparation for b becoming a subpartition of a.
//
// This copies the same members as bli_obj_init_full_shallow_copy_of()
// with one exception: b->dim[] is intentionally left untouched because
// the m and n dimensions are expected to be overwritten afterwards.
// This is why the copy is spelled out member by member instead of being
// a whole-structure assignment, and why it must be kept in sync with the
// definition of obj_t.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
	// Root object and offsets (dim[0] and dim[1] are skipped on purpose).
	b->root        = a->root;
	b->off[0]      = a->off[0];
	b->off[1]      = a->off[1];
	b->diag_off    = a->diag_off;

	// Info bitfields and element size.
	b->info        = a->info;
	b->info2       = a->info2;
	b->elem_size   = a->elem_size;

	// Buffer address and strides.
	b->buffer      = a->buffer;
	b->rs          = a->rs;
	b->cs          = a->cs;
	b->is          = a->is;

	// Bufferless scalar storage.
	b->scalar      = a->scalar;

	// Pack-related fields.
	b->m_padded    = a->m_padded;
	b->n_padded    = a->n_padded;
	b->ps          = a->ps;
	b->pd          = a->pd;
	b->m_panel     = a->m_panel;
	b->n_panel     = a->n_panel;

	// User-customizable fields.
	b->pack_fn     = a->pack_fn;
	b->pack_params = a->pack_params;
	b->ker_fn      = a->ker_fn;
	b->ker_params  = a->ker_params;
}

// Initializors for global scalar constants.
// NOTE: These must remain cpp macros since they are initializor
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/include/linux-x86_64/000077500000000000000000000000001427272030600213545ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/linux-x86_64/blis.h000066400000000000000000047110661427272030600224750ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). 
// begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_X86_64 // Enabled sub-configurations (config_list) #define BLIS_CONFIG_SKX #define BLIS_CONFIG_KNL #define BLIS_CONFIG_HASWELL #define BLIS_CONFIG_SANDYBRIDGE #define BLIS_CONFIG_PENRYN #define BLIS_CONFIG_ZEN3 #define BLIS_CONFIG_ZEN2 #define BLIS_CONFIG_ZEN #define BLIS_CONFIG_EXCAVATOR #define BLIS_CONFIG_STEAMROLLER #define BLIS_CONFIG_PILEDRIVER #define BLIS_CONFIG_BULLDOZER #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_SKX #define BLIS_KERNELS_KNL #define BLIS_KERNELS_SANDYBRIDGE #define BLIS_KERNELS_PENRYN #define BLIS_KERNELS_ZEN3 #define BLIS_KERNELS_ZEN2 #define BLIS_KERNELS_HASWELL #define BLIS_KERNELS_ZEN #define BLIS_KERNELS_PILEDRIVER #define BLIS_KERNELS_BULLDOZER #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define 
BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. 
#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). #if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_lang_defs.h #ifndef BLIS_LANG_DEFS_H #define BLIS_LANG_DEFS_H // -- Undefine restrict for C++ and C89/90 -- #ifdef __cplusplus // Language is C++; define restrict as nothing. #ifndef restrict #define restrict #endif #elif __STDC_VERSION__ >= 199901L // Language is C99 (or later); do nothing since restrict is recognized. #else // Language is pre-C99; define restrict as nothing. 
#ifndef restrict #define restrict #endif #endif // -- Define typeof() operator if using non-GNU compiler -- #ifndef __GNUC__ #define typeof __typeof__ #else #ifndef typeof #define typeof __typeof__ #endif #endif // -- BLIS Thread Local Storage Keyword -- // __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support __thread, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) #define BLIS_THREAD_LOCAL __thread #else #define BLIS_THREAD_LOCAL #endif // -- BLIS constructor/destructor function attribute -- // __attribute__((constructor/destructor)) is supported by GCC only. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support this, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__ICC) || defined(__INTEL_COMPILER) // ICC defines __GNUC__ but doesn't support this #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #elif defined(__clang__) // CLANG supports __attribute__, but its documentation doesn't // mention support for constructor/destructor. Compiling with // clang and testing shows that it does support. 
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. 
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. 
// Furthermore,
// the CBLAS compatibility layer will use the integer type size definition
// specified above when defining the size of its own integers (regardless of
// whether the BLAS layer was enabled directly or indirectly).
// Both branches below are intentionally empty: BLIS_ENABLE_CBLAS is consumed
// as-is and nothing is derived from it here.
#ifdef BLIS_ENABLE_CBLAS
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif

// -- SHARED LIBRARY SYMBOL EXPORT ---------------------------------------------

// When building shared libraries, we can control which symbols are exported for
// linking by external applications. BLIS annotates all function prototypes that
// are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing
// a similar role for BLAS compatibility routines). Which symbols are exported
// is controlled by the default symbol visibility, as specified by the gcc option
// -fvisibility=[default|hidden]. The default for this option is 'default', or,
// "public", which, if allowed to stand, causes all symbols in BLIS to be
// linkable from the outside. But when compiling with -fvisibility=hidden, all
// symbols start out hidden (that is, restricted only for internal use by BLIS),
// with that setting overridden only for function prototypes or variable
// declarations that are annotated with BLIS_EXPORT_BLIS.
//
// BLIS_EXPORT is only derived here when the includer has not already defined
// it. Resolution order:
//   1. BLIS_ENABLE_SHARED not defined        -> empty (no annotation).
//   2. _WIN32 or __CYGWIN__                  -> dllexport when
//      BLIS_IS_BUILDING_LIBRARY is defined, dllimport otherwise.
//   3. __GNUC__ >= 4                         -> visibility("default").
//   4. anything else                         -> empty.
#ifndef BLIS_EXPORT
#if !defined(BLIS_ENABLE_SHARED)
#define BLIS_EXPORT
#else
#if defined(_WIN32) || defined(__CYGWIN__)
#ifdef BLIS_IS_BUILDING_LIBRARY
#define BLIS_EXPORT __declspec(dllexport)
#else
#define BLIS_EXPORT __declspec(dllimport)
#endif
#elif defined(__GNUC__) && __GNUC__ >= 4
#define BLIS_EXPORT __attribute__ ((visibility ("default")))
#else
#define BLIS_EXPORT
#endif
#endif
#endif

// All three public annotation macros currently expand to the same BLIS_EXPORT.
#define BLIS_EXPORT_BLIS BLIS_EXPORT
#define BLIS_EXPORT_BLAS BLIS_EXPORT
#define BLIS_EXPORT_ADDON BLIS_EXPORT

// -- OVERRIDABLE (WEAK) SYMBOLS -----------------------------------------------

// On Linux, functions called from a shared library can be overridden by the main
// program simply by providing a new definition.
However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // -- Common BLIS definitions -- // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// The macro tested here must be BLIS_BLAS_INT_TYPE_SIZE (the name that the
// configuration defaults define and that the f77_int selection tests). An
// unknown identifier in an #if expression silently evaluates to 0, so a
// misspelled name would disable this guard without any diagnostic.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
  #undef  BLIS_INT_TYPE_SIZE
  #define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested.
// Any value other than 32 or 64 (including an undefined macro, which
// evaluates to 0) selects the "long int" fallback.
#if   BLIS_INT_TYPE_SIZE == 32
typedef           int32_t  gint_t;
typedef          uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
typedef           int64_t  gint_t;
typedef          uint64_t guint_t;
#else
typedef   signed long int  gint_t;
typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h.
#ifndef TRUE
  #define TRUE  true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
// NOTE: BLIS_MAX_TYPE_SIZE expands to a sizeof() expression, so it can only be
// used where dcomplex is already declared and never inside a cpp #if.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
#define BLIS_SIZEOF_S 4  // sizeof(float)
#define BLIS_SIZEOF_D 8  // sizeof(double)
#define BLIS_SIZEOF_C 8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z 16 // sizeof(dcomplex)

// -- Complex types --

// scomplex/dcomplex are either the C99 built-in complex types (only when
// BLIS_ENABLE_C99_COMPLEX is defined) or plain two-member structs with
// "real" and "imag" fields (the default).
#ifdef BLIS_ENABLE_C99_COMPLEX

  #if __STDC_VERSION__ >= 199901L
    // complex.h provides the "complex" keyword macro used by the typedefs
    // below; the directive must name the header to be valid.
    #include <complex.h> // skipped

    // Typedef official complex types to BLIS complex type names.
    typedef  float complex scomplex;
    typedef double complex dcomplex;
  #else
    #error "Configuration requested C99 complex types, but C99 does not appear to be supported."
  #endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_SCOMPLEX
  #define _DEFINED_SCOMPLEX
  typedef struct scomplex
  {
      float real;
      float imag;
  } scomplex;
  #endif

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_DCOMPLEX
  #define _DEFINED_DCOMPLEX
  typedef struct dcomplex
  {
      double real;
      double imag;
  } dcomplex;
  #endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any value other than 32 or 64 selects "long int".
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc.
// function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
// NOTE(review): the active typedef is an object pointer (void*), not the
// commented-out function-pointer form. ISO C does not guarantee that a
// function address survives a round trip through void*; this relies on
// platform behavior -- confirm that is acceptable for all targets.
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
// (size_t must already be declared by an earlier include.)
typedef void* (*malloc_ft)( size_t size );
typedef void (*free_ft) ( void* p );

//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT value is the position of the lowest bit of a field. Where two
// names share a value, the first names a multi-bit field and the second names
// the single bit at the bottom of that field (for example DATATYPE/DOMAIN at
// bit 0 and CONJTRANS/TRANS at bit 3; see the masks below).

// info
#define BLIS_DATATYPE_SHIFT 0
#define BLIS_DOMAIN_SHIFT 0
#define BLIS_PRECISION_SHIFT 1
#define BLIS_CONJTRANS_SHIFT 3
#define BLIS_TRANS_SHIFT 3
#define BLIS_CONJ_SHIFT 4
#define BLIS_UPLO_SHIFT 5
#define BLIS_UPPER_SHIFT 5
#define BLIS_DIAG_SHIFT 6
#define BLIS_LOWER_SHIFT 7
#define BLIS_UNIT_DIAG_SHIFT 8
#define BLIS_INVERT_DIAG_SHIFT 9
#define BLIS_TARGET_DT_SHIFT 10
#define BLIS_TARGET_DOMAIN_SHIFT 10
#define BLIS_TARGET_PREC_SHIFT 11
#define BLIS_EXEC_DT_SHIFT 13
#define BLIS_EXEC_DOMAIN_SHIFT 13
#define BLIS_EXEC_PREC_SHIFT 14
#define BLIS_PACK_SCHEMA_SHIFT 16
#define BLIS_PACK_RC_SHIFT 16
#define BLIS_PACK_PANEL_SHIFT 17
#define BLIS_PACK_FORMAT_SHIFT 18
#define BLIS_PACK_SHIFT 22
#define BLIS_PACK_REV_IF_UPPER_SHIFT 23
#define BLIS_PACK_REV_IF_LOWER_SHIFT 24
#define BLIS_PACK_BUFFER_SHIFT 25
#define BLIS_STRUC_SHIFT 27
#define BLIS_COMP_DT_SHIFT 29
#define BLIS_COMP_DOMAIN_SHIFT 29
#define BLIS_COMP_PREC_SHIFT 30

// info2
#define BLIS_SCALAR_DT_SHIFT 0
#define BLIS_SCALAR_DOMAIN_SHIFT 0
#define BLIS_SCALAR_PREC_SHIFT 1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is a field-width constant (0x1 = one bit, 0x3 = two bits,
// 0x7 = three bits) shifted to the field's offset defined above.

// info
#define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT )
#define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT )
#define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT )
#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define 
BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 
0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = 
BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } 
machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. #define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, 
BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, 
BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // - keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
} bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. // Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. 
// begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. 
mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. //num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. 
// One value of each supported representation: single/double real,
// single/double complex, and integer.
typedef struct constdata_s
{
    float    s;
    double   d;
    scomplex c;
    dcomplex z;
    gint_t   i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the user-customizable pack function stored in obj_t.pack_fn.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of the user-customizable kernel function stored in obj_t.ker_fn.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// The BLIS object. NOTE: the initializer macros and the
// bli_obj_init_*() copy helpers that follow this definition enumerate these
// members by name, so they must be revisited whenever a member is added,
// removed, or renamed here.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t         off[2];
    dim_t         dim[2];
    doff_t        diag_off;

    objbits_t     info;
    objbits_t     info2;
    siz_t         elem_size;

    void*         buffer;
    inc_t         rs;
    inc_t         cs;
    inc_t         is;

    // Bufferless scalar storage
    atom_t        scalar;

    // Pack-related fields
    dim_t         m_padded; // m dimension of matrix, including any padding
    dim_t         n_padded; // n dimension of matrix, including any padding
    inc_t         ps;       // panel stride (distance to next panel)
    inc_t         pd;       // panel dimension (the "width" of a panel:
                            // usually MR or NR)
    dim_t         m_panel;  // m dimension of a "full" panel
    dim_t         n_panel;  // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void*         pack_params;
    obj_ker_fn_t  ker_fn;
    void*         ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Designated initializer for an obj_t with zero dimensions. Every member of
// obj_t is named explicitly; .is is the only stride initialized to 1.
#define BLIS_OBJECT_INITIALIZER \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 0, 0 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn   = NULL, \
    .pack_params = NULL, \
    .ker_fn    = NULL, \
    .ker_params = NULL \
}

// Same as BLIS_OBJECT_INITIALIZER except that .dim is { 1, 1 }.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 1, 1 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn   = NULL, \
    .pack_params = NULL, \
    .ker_fn    = NULL, \
    .ker_params = NULL \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    // Member-by-member shallow copy of *a into *b. Pointer members are copied
    // as pointers; nothing they refer to is duplicated. The members are
    // listed explicitly, so this function must be kept in step with obj_t.
    const obj_t* src = a;
    obj_t*       dst = b;
    int          i;

    // Root pointer, offsets, dimensions, and diagonal offset.
    dst->root = src->root;
    for ( i = 0; i < 2; ++i ) dst->off[ i ] = src->off[ i ];
    for ( i = 0; i < 2; ++i ) dst->dim[ i ] = src->dim[ i ];
    dst->diag_off = src->diag_off;

    // Info bitfields and element size.
    dst->info      = src->info;
    dst->info2     = src->info2;
    dst->elem_size = src->elem_size;

    // Buffer address and strides.
    dst->buffer = src->buffer;
    dst->rs     = src->rs;
    dst->cs     = src->cs;
    dst->is     = src->is;

    // Bufferless scalar storage.
    dst->scalar = src->scalar;

    // Pack-related fields.
    dst->m_padded = src->m_padded;
    dst->n_padded = src->n_padded;
    dst->ps       = src->ps;
    dst->pd       = src->pd;
    dst->m_panel  = src->m_panel;
    dst->n_panel  = src->n_panel;

    // User-customizable fields.
    dst->pack_fn     = src->pack_fn;
    dst->pack_params = src->pack_params;
    dst->ker_fn      = src->ker_fn;
    dst->ker_params  = src->ker_params;
}

BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    // Same member-by-member copy as bli_obj_init_full_shallow_copy_of(),
    // except that dim[] is deliberately NOT copied: the m and n dimensions of
    // *b are left untouched because they will be overwritten afterwards.
    const obj_t* src = a;
    obj_t*       dst = b;
    int          i;

    // Root pointer, offsets, and diagonal offset (dim[] skipped on purpose).
    dst->root = src->root;
    for ( i = 0; i < 2; ++i ) dst->off[ i ] = src->off[ i ];
    dst->diag_off = src->diag_off;

    // Info bitfields and element size.
    dst->info      = src->info;
    dst->info2     = src->info2;
    dst->elem_size = src->elem_size;

    // Buffer address and strides.
    dst->buffer = src->buffer;
    dst->rs     = src->rs;
    dst->cs     = src->cs;
    dst->is     = src->is;

    // Bufferless scalar storage.
    dst->scalar = src->scalar;

    // Pack-related fields.
    dst->m_padded = src->m_padded;
    dst->n_padded = src->n_padded;
    dst->ps       = src->ps;
    dst->pd       = src->pd;
    dst->m_panel  = src->m_panel;
    dst->n_panel  = src->n_panel;

    // User-customizable fields.
    dst->pack_fn     = src->pack_fn;
    dst->pack_params = src->pack_params;
    dst->ker_fn      = src->ker_fn;
    dst->ker_params  = src->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// Every macro below comes as a pair: the inner macro (trailing underscore)
// performs the ## pasting, and the outer macro of the same name forwards its
// arguments to the inner one so that arguments which are themselves macros
// are expanded before being pasted.

// PASTEMAC<n>: paste n type-character arguments followed by op onto the
// "bli_" prefix; e.g. PASTEMAC(d,axpyv) yields the token bli_daxpyv.
#define PASTEMAC0_(op)                                bli_ ## op
#define PASTEMAC0(op)                                 PASTEMAC0_(op)

#define PASTEMAC_(ch,op)                              bli_ ## ch  ## op
#define PASTEMAC(ch,op)                               PASTEMAC_(ch,op)

#define PASTEMAC2_(ch1,ch2,op)                        bli_ ## ch1 ## ch2 ## op
#define PASTEMAC2(ch1,ch2,op)                         PASTEMAC2_(ch1,ch2,op)

#define PASTEMAC3_(ch1,ch2,ch3,op)                    bli_ ## ch1 ## ch2 ## ch3 ## op
#define PASTEMAC3(ch1,ch2,ch3,op)                     PASTEMAC3_(ch1,ch2,ch3,op)

#define PASTEMAC4_(ch1,ch2,ch3,ch4,op)                bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
#define PASTEMAC4(ch1,ch2,ch3,ch4,op)                 PASTEMAC4_(ch1,ch2,ch3,ch4,op)

#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)            bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op)             PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)

#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)        bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op)         PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)

// PASTEBLACHK: yields bla_<op>_check.
#define PASTEBLACHK_(op)                              bla_ ## op ## _check
#define PASTEBLACHK(op)                               PASTEBLACHK_(op)

// PASTECH<n>: like PASTEMAC<n> but without any prefix; PASTECH0 simply
// yields its (expanded) argument.
#define PASTECH0_(op)                                 op
#define PASTECH0(op)                                  PASTECH0_(op)

#define PASTECH_(ch,op)                               ch ## op
#define PASTECH(ch,op)                                PASTECH_(ch,op)

#define PASTECH2_(ch1,ch2,op)                         ch1 ## ch2 ## op
#define PASTECH2(ch1,ch2,op)                          PASTECH2_(ch1,ch2,op)

#define PASTECH3_(ch1,ch2,ch3,op)                     ch1 ## ch2 ## ch3 ## op
#define PASTECH3(ch1,ch2,ch3,op)                      PASTECH3_(ch1,ch2,ch3,op)

// MKSTR turns its argument into a string literal exactly as written;
// STRINGIFY_INT goes through MKSTR so that a macro argument is expanded
// first and its replacement text is what gets stringified.
#define MKSTR(s1)                                     #s1
#define STRINGIFY_INT( s )                            MKSTR( s )

// Fortran-77 name-mangling macros.
// Each macro pastes zero to three type characters in front of name and
// appends a trailing underscore, e.g. PASTEF77(d,gemm) yields dgemm_.
// NOTE: unlike the PASTEMAC/PASTECH families these have no extra expansion
// layer, so arguments are pasted exactly as written.
#define PASTEF770(name)                                      name ## _
#define PASTEF77(ch1,name)                     ch1        ## name ## _
#define PASTEF772(ch1,ch2,name)                ch1 ## ch2 ## name ## _
#define PASTEF773(ch1,ch2,ch3,name)     ch1 ## ch2 ## ch3 ## name ## _

// -- Include other groups of macros

// begin bli_genarray_macro_defs.h


#ifndef BLIS_GENARRAY_MACRO_DEFS_H
#define BLIS_GENARRAY_MACRO_DEFS_H


// -- Macros to generate function arrays ---------------------------------------

// In every table below the type characters appear in the order s, c, d, z
// (and i last, where integer support is included).
// NOTE(review): this order presumably matches the numeric values of the
// datatype enum used to index these arrays -- confirm against its definition.

// -- "Smart" one-operand macro --

// Defines a complete static array named <opname>_fpa whose elements are
// bli_<ch><opname>, each cast to tname.
#define GENARRAY_FPA(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname)  \
}

// -- "Smart" one-operand macro (with integer support) --

// Same as GENARRAY_FPA with one extra trailing element for the i variant.
#define GENARRAY_FPA_I(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname), \
    ( tname )PASTEMAC(i,opname)  \
}

// -- "Smart" two-operand macro --

// Defines a complete static 2-D array named <op>_fpa2; the row selects the
// first type character and the column selects the second.
#define GENARRAY_FPA2(tname,op) \
\
static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \
    { ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \
    { ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \
    { ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) }  \
}

// -- One-operand macro --

// The remaining GENARRAY* macros expand to a declarator plus initializer
// only (arrayname[...] = { ... }); they emit no element type or storage
// class and apply no casts to the entries.
#define GENARRAY(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op)  \
}

#define GENARRAY_I(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES+1] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op), \
    PASTEMAC(i,op)  \
}

// -- One-operand macro (with custom prefix) --

// Entries are <prefix><ch><op>; no "bli_" prefix is added.
#define GENARRAY_PREF(arrayname,prefix,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTECH2(prefix,s,op), \
    PASTECH2(prefix,c,op), \
    PASTECH2(prefix,d,op), \
    PASTECH2(prefix,z,op)  \
}

// -- Two-operand macros --

// _ALL: every (row, column) type combination is populated.
#define GENARRAY2_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
    { PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// _EXT: only combinations within {s,c} or within {d,z} are populated; all
// other entries are NULL.
#define GENARRAY2_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL,              NULL,             }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { NULL,              NULL,              PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// _MIN: only the diagonal (identical type characters) is populated.
#define GENARRAY2_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), NULL,              NULL,              NULL,             }, \
    { NULL,              PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), NULL,             }, \
    { NULL,              NULL,              NULL,              PASTEMAC2(z,z,op) }  \
}

// -- Three-operand macros --

// Three-dimensional analogues of the two-operand tables: the outermost index
// selects the first type character, the middle index the second, and the
// innermost index the third.
#define GENARRAY3_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \
    { PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \
    { PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \
    { PASTEMAC3(c,d,s,op), PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \
    { PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \
    { PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \
    { PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \
    { PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \
    { PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

#define GENARRAY3_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

#define GENARRAY3_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                PASTEMAC3(z,z,z,op) }  \
    } \
}


#endif
// end bli_genarray_macro_defs.h
// begin bli_gentdef_macro_defs.h


#ifndef BLIS_GENTDEF_MACRO_DEFS_H
#define BLIS_GENTDEF_MACRO_DEFS_H

//
// -- MACROS TO INSERT TYPEDEF-GENERATING MACROS -------------------------------
//


// -- function typedef macro (both typed and void) --

// Invokes GENTDEF nine times: once per type character with the concrete C
// type and the _ft suffix, once per type character with void and the _vft
// suffix, and once with void and an empty type character.
// NOTE(review): GENTDEF itself is not defined in this section; presumably
// the file using INSERT_GENTDEF defines it first -- confirm.
#define INSERT_GENTDEF( opname ) \
\
GENTDEF( float,    s, opname, _ft ) \
GENTDEF( double,   d, opname, _ft ) \
GENTDEF( scomplex, c, opname, _ft ) \
GENTDEF( dcomplex, z, opname, _ft ) \
\
GENTDEF( void,     s, opname, _vft ) \
GENTDEF( void,     d, opname, _vft ) \
GENTDEF( void,     c, opname, _vft ) \
GENTDEF( void,     z, opname, _vft ) \
\
GENTDEF( void,      , opname, _vft )

// -- function typedef macro (both typed and void) with real projection --

// Same pattern as INSERT_GENTDEF, but each invocation also passes a second
// type and type character (float/s for s and c, double/d for d and z).
#define INSERT_GENTDEFR( opname ) \
\
GENTDEFR( float,    float,    s, s, opname, _ft ) \
GENTDEFR( double,   double,   d, d, opname, _ft ) \
GENTDEFR( scomplex, float,    c, s, opname, _ft ) \
GENTDEFR( dcomplex, double,   z, d, opname, _ft ) \
\
GENTDEFR( void,     void,     s, s, opname, _vft ) \
GENTDEFR( void,     void,     d, d, opname, _vft ) \
GENTDEFR( void,     void,     c, s, opname, _vft ) \
GENTDEFR( void,     void,     z, d, opname, _vft ) \
\
GENTDEFR( void,     void,      ,  , opname, _vft )


#endif
// end bli_gentdef_macro_defs.h
// begin bli_gentfunc_macro_defs.h


#ifndef BLIS_GENTFUNC_MACRO_DEFS_H
#define BLIS_GENTFUNC_MACRO_DEFS_H

//
// -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------
//



// -- Macros for generating BLAS routines --------------------------------------


// -- Basic one-operand macro --
// One GENTFUNC instantiation per datatype: float/s, double/d, scomplex/c,
// dcomplex/z. blasname and blisname are forwarded untouched. None of the
// GENTFUNC* generator macros used below is defined in this section.
#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \
    GENTFUNC( float,    s, blasname, blisname ) \
    GENTFUNC( double,   d, blasname, blisname ) \
    GENTFUNC( scomplex, c, blasname, blisname ) \
    GENTFUNC( dcomplex, z, blasname, blisname )


// -- Basic one-operand macro with real domain only --
// Only the two real types (float/s, double/d) are instantiated.
#define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \
    GENTFUNCRO( float,  s, blasname, blisname ) \
    GENTFUNCRO( double, d, blasname, blisname )


// -- Basic one-operand macro with complex domain only and real projection --
// Only the two complex types are instantiated; each is accompanied by its
// real counterpart (scomplex -> float/s, dcomplex -> double/d).
#define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \
    GENTFUNCCO( scomplex, float,  c, s, blasname, blisname ) \
    GENTFUNCCO( dcomplex, double, z, d, blasname, blisname )


// -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) --
// Real types only: the third argument is left empty and the conjugation
// argument is always BLIS_NO_CONJUGATE.
#define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
    GENTFUNCDOT( float,  s,  , BLIS_NO_CONJUGATE, blasname, blisname ) \
    GENTFUNCDOT( double, d,  , BLIS_NO_CONJUGATE, blasname, blisname )


// -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) --
// Complex types only, two instantiations each: third argument 'c' paired
// with BLIS_CONJUGATE and third argument 'u' paired with BLIS_NO_CONJUGATE.
#define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \
    GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE,    blasname, blisname ) \
    GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \
    GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE,    blasname, blisname ) \
    GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname )


// -- Basic one-operand macro with conjugation (used only for dot, ger) --
// Real instantiations first, then the complex ones.
#define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \
    INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
    INSERT_GENTFUNCDOTC_BLAS( blasname, blisname )


// -- Basic one-operand macro with real projection --
// All four types with their real counterparts. The real instantiations
// receive rblasname, the complex instantiations receive cblasname.
#define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \
    GENTFUNCR( float,    float,  s, s, rblasname, blisname ) \
    GENTFUNCR( double,   double, d, d, rblasname, blisname ) \
    GENTFUNCR( scomplex, float,  c, s, cblasname, blisname ) \
    GENTFUNCR( dcomplex, double, z, d, cblasname, blisname )


// -- Alternate
// two-operand macro (one char for complex, one for real proj) --
// For the real types the fourth (real-projection char) argument is left
// empty; for the complex types it is s / d.
#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
    GENTFUNCR2( float,    float,  s,  , blasname, blisname ) \
    GENTFUNCR2( double,   double, d,  , blasname, blisname ) \
    GENTFUNCR2( scomplex, float,  c, s, blasname, blisname ) \
    GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )


// -- Extended two-operand macro (used only for scal) --
// Six instantiations: the four same-type cases (second char empty) followed
// by the two complex-with-real cases (scomplex/float, dcomplex/double).
#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
    GENTFUNCSCAL( float,    float,    s,  , blasname, blisname ) \
    GENTFUNCSCAL( double,   double,   d,  , blasname, blisname ) \
    GENTFUNCSCAL( scomplex, scomplex, c,  , blasname, blisname ) \
    GENTFUNCSCAL( dcomplex, dcomplex, z,  , blasname, blisname ) \
    GENTFUNCSCAL( scomplex, float,    c, s, blasname, blisname ) \
    GENTFUNCSCAL( dcomplex, double,   z, d, blasname, blisname )


// -- Macros for functions with one operand ------------------------------------


// -- Basic one-operand macro --
// The BASIC0 .. BASIC4 variants differ only in how many extra "varname"
// arguments are forwarded to GENTFUNC after tfuncname (zero to four).

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
    GENTFUNC( float,    s, tfuncname ) \
    GENTFUNC( double,   d, tfuncname ) \
    GENTFUNC( scomplex, c, tfuncname ) \
    GENTFUNC( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
    GENTFUNC( float,    s, tfuncname, varname ) \
    GENTFUNC( double,   d, tfuncname, varname ) \
    GENTFUNC( scomplex, c, tfuncname, varname ) \
    GENTFUNC( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNC( float,    s, tfuncname, varname1, varname2 ) \
    GENTFUNC( double,   d, tfuncname, varname1, varname2 ) \
    GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
    GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( float,    s, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( double,   d, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --
#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( float,    s, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( double,   d, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand with real projection --
// Same four types, each accompanied by its real counterpart type and char
// (float/s for float and scomplex, double/d for double and dcomplex).

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
    GENTFUNCR( float,    float,  s, s, tfuncname ) \
    GENTFUNCR( double,   double, d, d, tfuncname ) \
    GENTFUNCR( scomplex, float,  c, s, tfuncname ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
    GENTFUNCR( float,    float,  s, s, tfuncname, varname ) \
    GENTFUNCR( double,   double, d, d, tfuncname, varname ) \
    GENTFUNCR( scomplex, float,  c, s, tfuncname, varname ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2 ) \
    GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2 ) \
    GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2 ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --
#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
// arguments) --

// All four types with their real counterparts, forwarding four auxiliary
// arguments. None of the GENTFUNC* generator macros used in this section
// is defined here; presumably the including code defines them before each
// INSERT_* expansion -- confirm at the use site.
//
// NOTE: in every macro below the last invocation line carries NO trailing
// '\'. A continuation there would splice whatever line follows into the
// macro body.

#define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )



// -- Basic one-operand macro with real domain only --
// Only float/s and double/d are instantiated.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \
\
GENTFUNCRO( float,  s, tfuncname ) \
GENTFUNCRO( double, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \
\
GENTFUNCRO( float,  s, tfuncname, varname ) \
GENTFUNCRO( double, d, tfuncname, varname )



// -- Basic one-operand macro with complex domain only and real projection --
// Only scomplex/c and dcomplex/z are instantiated, each accompanied by its
// real counterpart (float/s, double/d).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \
\
GENTFUNCCO( scomplex, float,  c, s, tfuncname ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \
\
GENTFUNCCO( scomplex, float,  c, s, tfuncname, varname ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCCO( scomplex, float,  c, s, tfuncname, varname1, varname2 ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCCO( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )



// -- Basic one-operand macro with integer instance --
// The four floating-point types plus a fifth instantiation for gint_t/i.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \
\
GENTFUNC( float,    s, tfuncname ) \
GENTFUNC( double,   d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname ) \
GENTFUNC( gint_t,   i, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \
\
GENTFUNC( float,    s, tfuncname, varname ) \
GENTFUNC( double,   d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname ) \
GENTFUNC( gint_t,   i, tfuncname, varname )



// -- Basic one-operand with integer projection --
// Each of the four types is paired with gint_t/i.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCI_BASIC0( tfuncname ) \
\
GENTFUNCI( float,    gint_t, s, i, tfuncname ) \
GENTFUNCI( double,   gint_t, d, i, tfuncname ) \
GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \
GENTFUNCI( dcomplex, gint_t, z, i, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \
\
GENTFUNCI( float,    gint_t, s, i, tfuncname, varname ) \
GENTFUNCI( double,   gint_t, d, i, tfuncname, varname ) \
GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \
GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname )



// -- Basic one-operand with real and integer projections --
// Each type is accompanied by its real counterpart and by gint_t/i.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \
\
GENTFUNCRI( float,    float,  gint_t, s, s, i, tfuncname ) \
GENTFUNCRI( double,   double, gint_t, d, d, i, tfuncname ) \
GENTFUNCRI( scomplex, float,  gint_t, c, s, i, tfuncname ) \
GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname )



// -- Macros for functions with two primary operands ---------------------------


// -- Basic two-operand macro --
// Both operands have the same type.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_BASIC0( tfuncname ) \
\
GENTFUNC2( float,    float,    s, s, tfuncname ) \
GENTFUNC2( double,   double,   d, d, tfuncname ) \
GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \
GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \
\
GENTFUNC2( float,    float,    s, s, tfuncname, varname ) \
GENTFUNC2( double,   double,   d, d, tfuncname, varname ) \
GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \
GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname )



// -- Mixed domain two-operand macro --
// The four ordered pairs that keep the precision but differ in domain:
// (s,c), (c,s), (d,z), (z,d).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \
\
GENTFUNC2( float,    scomplex, s, c, tfuncname ) \
GENTFUNC2( scomplex, float,    c, s, tfuncname ) \
\
GENTFUNC2( double,   dcomplex, d, z, tfuncname ) \
GENTFUNC2( dcomplex, double,   z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \
\
GENTFUNC2( float,    scomplex, s, c, tfuncname, varname ) \
GENTFUNC2( scomplex, float,    c, s, tfuncname, varname ) \
\
GENTFUNC2( double,   dcomplex, d, z, tfuncname, varname ) \
GENTFUNC2( dcomplex, double,   z, d, tfuncname, varname )



// -- Mixed precision two-operand macro --
// The eight ordered pairs whose operands differ in precision.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \
\
GENTFUNC2( float,    double,   s, d, tfuncname ) \
GENTFUNC2( float,    dcomplex, s, z, tfuncname ) \
\
GENTFUNC2( double,   float,    d, s, tfuncname ) \
GENTFUNC2( double,   scomplex, d, c, tfuncname ) \
\
GENTFUNC2( scomplex, double,   c, d, tfuncname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
\
GENTFUNC2( dcomplex, float,    z, s, tfuncname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \
\
GENTFUNC2( float,    double,   s, d, tfuncname, varname ) \
GENTFUNC2( float,    dcomplex, s, z, tfuncname, varname ) \
\
GENTFUNC2( double,   float,    d, s, tfuncname, varname ) \
GENTFUNC2( double,   scomplex, d, c, tfuncname, varname ) \
\
GENTFUNC2( scomplex, double,   c, d, tfuncname, varname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTFUNC2( dcomplex, float,    z, s, tfuncname, varname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )



// -- Mixed domain/precision (all) two-operand macro --
// All twelve ordered pairs of two different types.
// NOTE: the no-argument variant is spelled MIXDP0 (no underscore between
// MIX and DP), unlike its siblings MIX_D0 / MIX_P0 / MIX_DP. The name is
// kept as-is because callers depend on it.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIXDP0( tfuncname ) \
\
GENTFUNC2( float,    double,   s, d, tfuncname ) \
GENTFUNC2( float,    scomplex, s, c, tfuncname ) \
GENTFUNC2( float,    dcomplex, s, z, tfuncname ) \
\
GENTFUNC2( double,   float,    d, s, tfuncname ) \
GENTFUNC2( double,   scomplex, d, c, tfuncname ) \
GENTFUNC2( double,   dcomplex, d, z, tfuncname ) \
\
GENTFUNC2( scomplex, float,    c, s, tfuncname ) \
GENTFUNC2( scomplex, double,   c, d, tfuncname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
\
GENTFUNC2( dcomplex, float,    z, s, tfuncname ) \
GENTFUNC2( dcomplex, double,   z, d, tfuncname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \
\
GENTFUNC2( float,    double,   s, d, tfuncname, varname ) \
GENTFUNC2( float,    scomplex, s, c, tfuncname, varname ) \
GENTFUNC2( float,    dcomplex, s, z, tfuncname, varname ) \
\
GENTFUNC2( double,   float,    d, s, tfuncname, varname ) \
GENTFUNC2( double,   scomplex, d, c, tfuncname, varname ) \
GENTFUNC2( double,   dcomplex, d, z, tfuncname, varname ) \
\
GENTFUNC2( scomplex, float,    c, s, tfuncname, varname ) \
GENTFUNC2( scomplex, double,   c, d, tfuncname, varname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTFUNC2( dcomplex, float,    z, s, tfuncname, varname ) \
GENTFUNC2( dcomplex, double,   z, d, tfuncname, varname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )



// -- Basic two-operand with real projection of second operand --
// The GENTFUNC2R tables mirror the GENTFUNC2 tables above; the third type
// and third char are the real counterpart of the SECOND operand's type.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \
\
GENTFUNC2R( float,    float,    float,    s, s, s, tfuncname ) \
GENTFUNC2R( double,   double,   double,   d, d, d, tfuncname ) \
GENTFUNC2R( scomplex, scomplex, float,    c, c, s, tfuncname ) \
GENTFUNC2R( dcomplex, dcomplex, double,   z, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \
\
GENTFUNC2R( float,    float,    float,    s, s, s, tfuncname, varname ) \
GENTFUNC2R( double,   double,   double,   d, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, scomplex, float,    c, c, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, dcomplex, double,   z, z, d, tfuncname, varname )



// -- Mixed domain two-operand with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \
\
GENTFUNC2R( float,    scomplex, float,    s, c, s, tfuncname ) \
GENTFUNC2R( scomplex, float,    float,    c, s, s, tfuncname ) \
\
GENTFUNC2R( double,   dcomplex, double,   d, z, d, tfuncname ) \
GENTFUNC2R( dcomplex, double,   double,   z, d, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \
\
GENTFUNC2R( float,    scomplex, float,    s, c, s, tfuncname, varname ) \
GENTFUNC2R( scomplex, float,    float,    c, s, s, tfuncname, varname ) \
\
GENTFUNC2R( double,   dcomplex, double,   d, z, d, tfuncname, varname ) \
GENTFUNC2R( dcomplex, double,   double,   z, d, d, tfuncname, varname )



// -- Mixed precision two-operand with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \
\
GENTFUNC2R( float,    double,   double,   s, d, d, tfuncname ) \
GENTFUNC2R( float,    dcomplex, double,   s, z, d, tfuncname ) \
\
GENTFUNC2R( double,   float,    float,    d, s, s, tfuncname ) \
GENTFUNC2R( double,   scomplex, float,    d, c, s, tfuncname ) \
\
GENTFUNC2R( scomplex, double,   double,   c, d, d, tfuncname ) \
GENTFUNC2R( scomplex, dcomplex, double,   c, z, d, tfuncname ) \
\
GENTFUNC2R( dcomplex, float,    float,    z, s, s, tfuncname ) \
GENTFUNC2R( dcomplex, scomplex, float,    z, c, s, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \
\
GENTFUNC2R( float,    double,   double,   s, d, d, tfuncname, varname ) \
GENTFUNC2R( float,    dcomplex, double,   s, z, d, tfuncname, varname ) \
\
GENTFUNC2R( double,   float,    float,    d, s, s, tfuncname, varname ) \
GENTFUNC2R( double,   scomplex, float,    d, c, s, tfuncname, varname ) \
\
GENTFUNC2R( scomplex, double,   double,   c, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, dcomplex, double,   c, z, d, tfuncname, varname ) \
\
GENTFUNC2R( dcomplex, float,    float,    z, s, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, scomplex, float,    z, c, s, tfuncname, varname )



// -- Mixed domain/precision (all) two-operand macro with real projection of second operand --
// NOTE: as with INSERT_GENTFUNC2_MIXDP0, the no-argument variant is spelled
// MIXDP0 while the one-argument variant is MIX_DP; names kept for callers.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \
\
GENTFUNC2R( float,    double,   double,   s, d, d, tfuncname ) \
GENTFUNC2R( float,    scomplex, float,    s, c, s, tfuncname ) \
GENTFUNC2R( float,    dcomplex, double,   s, z, d, tfuncname ) \
\
GENTFUNC2R( double,   float,    float,    d, s, s, tfuncname ) \
GENTFUNC2R( double,   scomplex, float,    d, c, s, tfuncname ) \
GENTFUNC2R( double,   dcomplex, double,   d, z, d, tfuncname ) \
\
GENTFUNC2R( scomplex, float,    float,    c, s, s, tfuncname ) \
GENTFUNC2R( scomplex, double,   double,   c, d, d, tfuncname ) \
GENTFUNC2R( scomplex, dcomplex, double,   c, z, d, tfuncname ) \
\
GENTFUNC2R( dcomplex, float,    float,    z, s, s, tfuncname ) \
GENTFUNC2R( dcomplex, double,   double,   z, d, d, tfuncname ) \
GENTFUNC2R( dcomplex, scomplex, float,    z, c, s, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \
\
GENTFUNC2R( float,    double,   double,   s, d, d, tfuncname, varname ) \
GENTFUNC2R( float,    scomplex, float,    s, c, s, tfuncname, varname ) \
GENTFUNC2R( float,    dcomplex, double,   s, z, d, tfuncname, varname ) \
\
GENTFUNC2R( double,   float,    float,    d, s, s, tfuncname, varname ) \
GENTFUNC2R( double,   scomplex, float,    d, c, s, tfuncname, varname ) \
GENTFUNC2R( double,   dcomplex, double,   d, z, d, tfuncname, varname ) \
\
GENTFUNC2R( scomplex, float,    float,    c, s, s, tfuncname, varname ) \
GENTFUNC2R( scomplex, double,   double,   c, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, dcomplex, double,   c, z, d, tfuncname, varname ) \
\
GENTFUNC2R( dcomplex, float,    float,    z, s, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, double,   double,   z, d, d, tfuncname, varname ) \
GENTFUNC2R( dcomplex, scomplex, float,    z, c, s, tfuncname, varname )



// -- Macros for functions with three primary operands -------------------------


// -- Basic three-operand macro --
// All three operands have the same type.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_BASIC0( tfuncname ) \
\
GENTFUNC3( float,    float,    float,    s, s, s, tfuncname ) \
GENTFUNC3( double,   double,   double,   d, d, d, tfuncname ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \
\
GENTFUNC3( float,    float,    float,    s, s, s, tfuncname, varname ) \
GENTFUNC3( double,   double,   double,   d, d, d, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float,    float,    float,    s, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double,   double,   double,   d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 )



// -- Mixed domain three-operand macro --
// Twelve triples: within each precision, every combination of the real and
// complex type except the two where all three operands are identical.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \
\
GENTFUNC3( float,    float,    scomplex, s, s, c, tfuncname ) \
GENTFUNC3( float,    scomplex, float,    s, c, s, tfuncname ) \
GENTFUNC3( float,    scomplex, scomplex, s, c, c, tfuncname ) \
\
GENTFUNC3( double,   double,   dcomplex, d, d, z, tfuncname ) \
GENTFUNC3( double,   dcomplex, double,   d, z, d, tfuncname ) \
GENTFUNC3( double,   dcomplex, dcomplex, d, z, z, tfuncname ) \
\
GENTFUNC3( scomplex, float,    float,    c, s, s, tfuncname ) \
GENTFUNC3( scomplex, float,    scomplex, c, s, c, tfuncname ) \
GENTFUNC3( scomplex, scomplex, float,    c, c, s, tfuncname ) \
\
GENTFUNC3( dcomplex, double,   double,   z, d, d, tfuncname ) \
GENTFUNC3( dcomplex, double,   dcomplex, z, d, z, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, double,   z, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \
\
GENTFUNC3( float,    float,    scomplex, s, s, c, tfuncname, varname ) \
GENTFUNC3( float,    scomplex, float,    s, c, s, tfuncname, varname ) \
GENTFUNC3( float,    scomplex, scomplex, s, c, c, tfuncname, varname ) \
\
GENTFUNC3( double,   double,   dcomplex, d, d, z, tfuncname, varname ) \
GENTFUNC3( double,   dcomplex, double,   d, z, d, tfuncname, varname ) \
GENTFUNC3( double,   dcomplex, dcomplex, d, z, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, float,    float,    c, s, s, tfuncname, varname ) \
GENTFUNC3( scomplex, float,    scomplex, c, s, c, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, float,    c, c, s, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, double,   double,   z, d, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, double,   dcomplex, z, d, z, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, double,   z, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float,    float,    scomplex, s, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float,    scomplex, float,    s, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float,    scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double,   double,   dcomplex, d, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3( double,   dcomplex, double,   d, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double,   dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, float,    float,    c, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, float,    scomplex, c, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, float,    c, c, s, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, double,   double,   z, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, double,   dcomplex, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, double,   z, z, d, tfuncname, varname1, varname2 )



// -- Mixed precision three-operand macro --
// Forty-eight triples: every ordered triple of the four types in which
// single- and double-precision types both occur. Grouped by first operand,
// then by second operand.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \
\
GENTFUNC3( float,    float,    double,   s, s, d, tfuncname ) \
GENTFUNC3( float,    float,    dcomplex, s, s, z, tfuncname ) \
\
GENTFUNC3( float,    double,   float,    s, d, s, tfuncname ) \
GENTFUNC3( float,    double,   double,   s, d, d, tfuncname ) \
GENTFUNC3( float,    double,   scomplex, s, d, c, tfuncname ) \
GENTFUNC3( float,    double,   dcomplex, s, d, z, tfuncname ) \
\
GENTFUNC3( float,    scomplex, double,   s, c, d, tfuncname ) \
GENTFUNC3( float,    scomplex, dcomplex, s, c, z, tfuncname ) \
\
GENTFUNC3( float,    dcomplex, float,    s, z, s, tfuncname ) \
GENTFUNC3( float,    dcomplex, double,   s, z, d, tfuncname ) \
GENTFUNC3( float,    dcomplex, scomplex, s, z, c, tfuncname ) \
GENTFUNC3( float,    dcomplex, dcomplex, s, z, z, tfuncname ) \
\
\
GENTFUNC3( double,   float,    float,    d, s, s, tfuncname ) \
GENTFUNC3( double,   float,    double,   d, s, d, tfuncname ) \
GENTFUNC3( double,   float,    scomplex, d, s, c, tfuncname ) \
GENTFUNC3( double,   float,    dcomplex, d, s, z, tfuncname ) \
\
GENTFUNC3( double,   double,   float,    d, d, s, tfuncname ) \
GENTFUNC3( double,   double,   scomplex, d, d, c, tfuncname ) \
\
GENTFUNC3( double,   scomplex, float,    d, c, s, tfuncname ) \
GENTFUNC3( double,   scomplex, double,   d, c, d, tfuncname ) \
GENTFUNC3( double,   scomplex, scomplex, d, c, c, tfuncname ) \
GENTFUNC3( double,   scomplex, dcomplex, d, c, z, tfuncname ) \
\
GENTFUNC3( double,   dcomplex, float,    d, z, s, tfuncname ) \
GENTFUNC3( double,   dcomplex, scomplex, d, z, c, tfuncname ) \
\
\
GENTFUNC3( scomplex, float,    double,   c, s, d, tfuncname ) \
GENTFUNC3( scomplex, float,    dcomplex, c, s, z, tfuncname ) \
\
GENTFUNC3( scomplex, double,   float,    c, d, s, tfuncname ) \
GENTFUNC3( scomplex, double,   double,   c, d, d, tfuncname ) \
GENTFUNC3( scomplex, double,   scomplex, c, d, c, tfuncname ) \
GENTFUNC3( scomplex, double,   dcomplex, c, d, z, tfuncname ) \
\
GENTFUNC3( scomplex, scomplex, double,   c, c, d, tfuncname ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \
\
GENTFUNC3( scomplex, dcomplex, float,    c, z, s, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, double,   c, z, d, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \
\
\
GENTFUNC3( dcomplex, float,    float,    z, s, s, tfuncname ) \
GENTFUNC3( dcomplex, float,    double,   z, s, d, tfuncname ) \
GENTFUNC3( dcomplex, float,    scomplex, z, s, c, tfuncname ) \
GENTFUNC3( dcomplex, float,    dcomplex, z, s, z, tfuncname ) \
\
GENTFUNC3( dcomplex, double,   float,    z, d, s, tfuncname ) \
GENTFUNC3( dcomplex, double,   scomplex, z, d, c, tfuncname ) \
\
GENTFUNC3( dcomplex, scomplex, float,    z, c, s, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, double,   z, c, d, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \
\
GENTFUNC3( dcomplex, dcomplex, float,    z, z, s, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \
\
GENTFUNC3( float,    float,    double,   s, s, d, tfuncname, varname ) \
GENTFUNC3( float,    float,    dcomplex, s, s, z, tfuncname, varname ) \
\
GENTFUNC3( float,    double,   float,    s, d, s, tfuncname, varname ) \
GENTFUNC3( float,    double,   double,   s, d, d, tfuncname, varname ) \
GENTFUNC3( float,    double,   scomplex, s, d, c, tfuncname, varname ) \
GENTFUNC3( float,    double,   dcomplex, s, d, z, tfuncname, varname ) \
\
GENTFUNC3( float,    scomplex, double,   s, c, d, tfuncname, varname ) \
GENTFUNC3( float,    scomplex, dcomplex, s, c, z, tfuncname, varname ) \
\
GENTFUNC3( float,    dcomplex, float,    s, z, s, tfuncname, varname ) \
GENTFUNC3( float,    dcomplex, double,   s, z, d, tfuncname, varname ) \
GENTFUNC3( float,    dcomplex, scomplex, s, z, c, tfuncname, varname ) \
GENTFUNC3( float,    dcomplex, dcomplex, s, z, z, tfuncname, varname ) \
\
\
GENTFUNC3( double,   float,    float,    d, s, s, tfuncname, varname ) \
GENTFUNC3( double,   float,    double,   d, s, d, tfuncname, varname ) \
GENTFUNC3( double,   float,    scomplex, d, s, c, tfuncname, varname ) \
GENTFUNC3( double,   float,    dcomplex, d, s, z, tfuncname, varname ) \
\
GENTFUNC3( double,   double,   float,    d, d, s, tfuncname, varname ) \
GENTFUNC3( double,   double,   scomplex, d, d, c, tfuncname, varname ) \
\
GENTFUNC3( double,   scomplex, float,    d, c, s, tfuncname, varname ) \
GENTFUNC3( double,   scomplex, double,   d, c, d, tfuncname, varname ) \
GENTFUNC3( double,   scomplex, scomplex, d, c, c, tfuncname, varname ) \
GENTFUNC3( double,   scomplex, dcomplex, d, c, z, tfuncname, varname ) \
\
GENTFUNC3( double,   dcomplex, float,    d, z, s, tfuncname, varname ) \
GENTFUNC3( double,   dcomplex, scomplex, d, z, c, tfuncname, varname ) \
\
\
GENTFUNC3( scomplex, float,    double,   c, s, d, tfuncname, varname ) \
GENTFUNC3( scomplex, float,    dcomplex, c, s, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, double,   float,    c, d, s, tfuncname, varname ) \
GENTFUNC3( scomplex, double,   double,   c, d, d, tfuncname, varname ) \
GENTFUNC3( scomplex, double,   scomplex, c, d, c, tfuncname, varname ) \
GENTFUNC3( scomplex, double,   dcomplex, c, d, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, scomplex, double,   c, c, d, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, dcomplex, float,    c, z, s, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, double,   c, z, d, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \
\
\
GENTFUNC3( dcomplex, float,    float,    z, s, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, float,    double,   z, s, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, float,    scomplex, z, s, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, float,    dcomplex, z, s, z, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, double,   float,    z, d, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, double,   scomplex, z, d, c, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, scomplex, float,    z, c, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, double,   z, c, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, dcomplex, float,    z, z, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float,    float,    double,   s, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float,    float,    dcomplex, s, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float,    double,   float,    s, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float,    double,   double,   s, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float,    double,   scomplex, s, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float,    double,   dcomplex, s, d, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float,    scomplex, double,   s, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float,    scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float,    dcomplex, float,    s, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float,    dcomplex, double,   s, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float,    dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float,    dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( double,   float,    float,    d, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double,   float,    double,   d, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double,   float,    scomplex, d, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( double,   float,    dcomplex, d, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double,   double,   float,    d, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double,   double,   scomplex, d, d, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double,   scomplex, float,    d, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double,   scomplex, double,   d, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double,   scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( double,   scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double,   dcomplex, float,    d, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double,   dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( scomplex, float,    double,   c, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, float,    dcomplex, c, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, double,   float,    c, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double,   double,   c, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double,   scomplex, c, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double,   dcomplex, c, d, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, scomplex, double,   c, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, dcomplex, float,    c, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, double,   c, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( dcomplex, float,    float,    z, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float,    double,   z, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float,    scomplex, z, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float,    dcomplex, z, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, double,   float,    z, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, double,   scomplex, z, d, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, scomplex, float,    z, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, double,   z, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, dcomplex, float,    z, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 )



// -- Basic three-operand with union of operands 1 and 2 --
// GENTFUNC3U12 takes a fourth type/char after the three operand types.
// In every row below it is the complex type when either of the first two
// operands is complex, and the real type otherwise.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \
\
GENTFUNC3U12( float,    float,    float,    float,    s, s, s, s, tfuncname ) \
GENTFUNC3U12( double,   double,   double,   double,   d, d, d, d, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \
\
GENTFUNC3U12( float,    float,    float,    float,    s, s, s, s, tfuncname, varname ) \
GENTFUNC3U12( double,   double,   double,   double,   d, d, d, d, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float,    float,    float,    float,    s, s, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double,   double,   double,   double,   d, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname1, varname2 )



// -- Mixed domain three-operand with union of operands 1 and 2 --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \
\
GENTFUNC3U12( float,    float,    scomplex, float,    s, s, c, s, tfuncname ) \
GENTFUNC3U12( float,    scomplex, float,    scomplex, s, c, s, c, tfuncname ) \
GENTFUNC3U12( float,    scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \
\
GENTFUNC3U12( double,   double,   dcomplex, double,   d, d, z, d, tfuncname ) \
GENTFUNC3U12( double,   dcomplex, double,   dcomplex, d, z, d, z, tfuncname ) \
GENTFUNC3U12( double,   dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \
\
GENTFUNC3U12( scomplex, float,    float,    scomplex, c, s, s, c, tfuncname ) \
GENTFUNC3U12( scomplex, float,    scomplex, scomplex, c, s, c, c, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, float,    scomplex, c, c, s, c, tfuncname ) \
\
GENTFUNC3U12( dcomplex, double,   double,   dcomplex, z, d, d, z, tfuncname ) \
GENTFUNC3U12( dcomplex, double,   dcomplex, dcomplex, z, d, z, z, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, double,   dcomplex, z, z, d, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \
\
GENTFUNC3U12( float,    float,    scomplex, float,    s, s, c, s, tfuncname, varname ) \
GENTFUNC3U12( float,    scomplex, float,    scomplex, s, c, s, c, tfuncname, varname ) \
GENTFUNC3U12( float,    scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \
\
GENTFUNC3U12( double,   double,   dcomplex, double,   d, d, z, d, tfuncname, varname ) \
GENTFUNC3U12( double,   dcomplex, double,   dcomplex, d, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( double,   dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, float,    float,    scomplex, c, s, s, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, float,    scomplex, scomplex, c, s, c, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, float,    scomplex, c, c, s, c, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, double,   double,   dcomplex, z,
d, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \ GENTFUNC3U12( float, 
double, dcomplex, double, s, d, z, d, tfuncname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, 
dcomplex, scomplex, c, c, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, 
dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, 
tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, 
double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( 
scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, 
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 ) #endif // end bli_gentfunc_macro_defs.h // begin bli_gentprot_macro_defs.h #ifndef BLIS_GENTPROT_MACRO_DEFS_H #define BLIS_GENTPROT_MACRO_DEFS_H // // -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS ----------------------------- // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- #define INSERT_GENTPROT_BLAS( blasname ) \ \ GENTPROT( float, s, blasname ) \ GENTPROT( double, d, blasname ) \ GENTPROT( scomplex, c, blasname ) \ GENTPROT( dcomplex, z, blasname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTPROTRO_BLAS( blasname ) \ \ GENTPROTRO( float, s, blasname ) \ GENTPROTRO( double, d, blasname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTPROTCO_BLAS( blasname ) \ \ GENTPROTCO( scomplex, float, c, s, blasname ) \ GENTPROTCO( dcomplex, double, z, d, blasname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTR_BLAS( blasname ) \ \ GENTPROTDOT( float, s, , blasname ) \ GENTPROTDOT( double, d, , blasname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTC_BLAS( blasname ) \ \ GENTPROTDOT( scomplex, c, c, blasname ) \ GENTPROTDOT( scomplex, c, u, blasname ) \ GENTPROTDOT( dcomplex, z, c, blasname ) \ GENTPROTDOT( dcomplex, z, u, blasname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTPROTDOT_BLAS( blasname ) \ \ INSERT_GENTPROTDOTR_BLAS( blasname ) \ INSERT_GENTPROTDOTC_BLAS( blasname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTPROTR_BLAS( rblasname, 
cblasname ) \ \ GENTPROTR( float, float, s, s, rblasname ) \ GENTPROTR( double, double, d, d, rblasname ) \ GENTPROTR( scomplex, float, c, s, cblasname ) \ GENTPROTR( dcomplex, double, z, d, cblasname ) // -- Alternate two-operand macro (one char for complex, one for real proj) -- #define INSERT_GENTPROTR2_BLAS( blasname ) \ \ GENTPROTR2( float, float, , s, blasname ) \ GENTPROTR2( double, double, , d, blasname ) \ GENTPROTR2( scomplex, float, c, s, blasname ) \ GENTPROTR2( dcomplex, double, z, d, blasname ) // -- Extended two-operand macro (used only for scal) -- #define INSERT_GENTPROTSCAL_BLAS( blasname ) \ \ GENTPROTSCAL( float, float, , s, blasname ) \ GENTPROTSCAL( double, double, , d, blasname ) \ GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \ GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \ GENTPROTSCAL( float, scomplex, s, c, blasname ) \ GENTPROTSCAL( double, dcomplex, d, z, blasname ) // -- Macros for functions with one operand ------------------------------------ // -- Basic one-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0( tfuncname ) \ \ GENTPROT( float, s, tfuncname ) \ GENTPROT( double, d, tfuncname ) \ GENTPROT( scomplex, c, tfuncname ) \ GENTPROT( dcomplex, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2 ) \ GENTPROT( double, d, tfuncname, varname1, varname2 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROT( float, s, tfuncname, 
varname1, varname2, varname3 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand with real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC0( tfuncname ) \ \ GENTPROTR( float, float, s, s, tfuncname ) \ GENTPROTR( double, double, d, d, tfuncname ) \ GENTPROTR( scomplex, float, c, s, tfuncname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname ) \ GENTPROTR( double, double, d, d, tfuncname, varname ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 
) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC0( tfuncname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0_I( funcname ) \ \ GENTPROT( float, s, funcname ) \ GENTPROT( double, d, funcname ) \ GENTPROT( scomplex, c, funcname ) \ GENTPROT( dcomplex, z, funcname ) \ GENTPROT( gint_t, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) \ GENTPROT( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTPROTI_BASIC0( funcname ) \
\
GENTPROTI( float,    gint_t, s, i, funcname ) \
GENTPROTI( double,   gint_t, d, i, funcname ) \
GENTPROTI( scomplex, gint_t, c, i, funcname ) \
GENTPROTI( dcomplex, gint_t, z, i, funcname )

// -- (one auxiliary argument) --
// Expands to one GENTPROTI invocation per floating-point type (float/s,
// double/d, scomplex/c, dcomplex/z), each paired with gint_t/i. tfuncname
// and varname are passed through unchanged as the trailing arguments.
// GENTPROTI itself is not defined in this span; presumably the code that
// invokes the inserter defines it first -- TODO confirm.

#define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \
\
GENTPROTI( float,    gint_t, s, i, tfuncname, varname ) \
GENTPROTI( double,   gint_t, d, i, tfuncname, varname ) \
GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \
GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname )


// -- Basic one-operand with real and integer projections --

// -- (no auxiliary arguments) --
// Expands to one GENTPROTRI invocation per floating-point type. The second
// type argument is float for float/scomplex and double for double/dcomplex;
// the third type argument is always gint_t.
// NOTE(review): this inserter takes no auxiliary argument but its name has
// no "0" suffix, unlike INSERT_GENTPROTI_BASIC0 and INSERT_GENTPROT2_BASIC0.
// The name is left alone because callers may depend on it.

#define INSERT_GENTPROTRI_BASIC( funcname ) \
\
GENTPROTRI( float,    float,  gint_t, s, s, i, funcname ) \
GENTPROTRI( double,   double, gint_t, d, d, i, funcname ) \
GENTPROTRI( scomplex, float,  gint_t, c, s, i, funcname ) \
GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname )


// -- Macros for functions with two primary operands ---------------------------

// -- Basic two-operand macro --

// -- (no auxiliary arguments) --
// Expands to four GENTPROT2 invocations in which both operands have the same
// type: (float, float), (double, double), (scomplex, scomplex) and
// (dcomplex, dcomplex).

#define INSERT_GENTPROT2_BASIC0( funcname ) \
\
GENTPROT2( float,    float,    s, s, funcname ) \
GENTPROT2( double,   double,   d, d, funcname ) \
GENTPROT2( scomplex, scomplex, c, c, funcname ) \
GENTPROT2( dcomplex, dcomplex, z, z, funcname )

// -- (one auxiliary argument) --
// Same four same-type pairs as INSERT_GENTPROT2_BASIC0, with varname
// forwarded as an additional trailing argument.

#define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \
\
GENTPROT2( float,    float,    s, s, tfuncname, varname ) \
GENTPROT2( double,   double,   d, d, tfuncname, varname ) \
GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \
GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname )


// -- Mixed domain two-operand macro --

// -- (no auxiliary arguments) --
// Expands to four GENTPROT2 invocations pairing float with scomplex and
// double with dcomplex, in both operand orders.

#define INSERT_GENTPROT2_MIX_D0( funcname ) \
\
GENTPROT2( float,    scomplex, s, c, funcname ) \
GENTPROT2( scomplex, float,    c, s, funcname ) \
\
GENTPROT2( double,   dcomplex, d, z, funcname ) \
GENTPROT2( dcomplex, double,   z, d, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2_MIX_D(
tfuncname, varname ) \
\
GENTPROT2( float,    scomplex, s, c, tfuncname, varname ) \
GENTPROT2( scomplex, float,    c, s, tfuncname, varname ) \
\
GENTPROT2( double,   dcomplex, d, z, tfuncname, varname ) \
GENTPROT2( dcomplex, double,   z, d, tfuncname, varname )


// -- Mixed precision two-operand macro --

// -- (no auxiliary arguments) --
// Expands to one GENTPROT2 invocation for each of the eight ordered type
// pairs listed below. Each invocation receives the two type names, their
// one-letter codes (s, d, c, z) and funcname.
// GENTPROT2 itself is not defined in this span; presumably the code that
// invokes the inserter defines it first -- TODO confirm.
// The final entry intentionally has no line continuation, so the macro body
// ends there and cannot absorb whatever line is placed after it.

#define INSERT_GENTPROT2_MIX_P0( funcname ) \
\
GENTPROT2( float,    double,   s, d, funcname ) \
GENTPROT2( float,    dcomplex, s, z, funcname ) \
\
GENTPROT2( double,   float,    d, s, funcname ) \
GENTPROT2( double,   scomplex, d, c, funcname ) \
\
GENTPROT2( scomplex, double,   c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float,    z, s, funcname ) \
GENTPROT2( dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --
// Same eight type pairs as INSERT_GENTPROT2_MIX_P0, with varname forwarded
// as an additional trailing argument. As above, the final entry has no line
// continuation.

#define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \
\
GENTPROT2( float,    double,   s, d, tfuncname, varname ) \
GENTPROT2( float,    dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double,   float,    d, s, tfuncname, varname ) \
GENTPROT2( double,   scomplex, d, c, tfuncname, varname ) \
\
GENTPROT2( scomplex, double,   c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float,    z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro --

// -- (no auxiliary arguments) --
// Pairs each of float, double, scomplex and dcomplex with the other three
// types; a type is never paired with itself.

#define INSERT_GENTPROT2_MIXDP0( funcname ) \
\
GENTPROT2( float,    double,   s, d, funcname ) \
GENTPROT2( float,    scomplex, s, c, funcname ) \
GENTPROT2( float,    dcomplex, s, z, funcname ) \
\
GENTPROT2( double,   float,    d, s, funcname ) \
GENTPROT2( double,   scomplex, d, c, funcname ) \
GENTPROT2( double,   dcomplex, d, z, funcname ) \
\
GENTPROT2( scomplex, float,    c, s, funcname ) \
GENTPROT2( scomplex, double,   c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float,    z, s, funcname ) \
GENTPROT2( dcomplex, double,   z, d, funcname ) \
GENTPROT2(
dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --
// Expands to twelve GENTPROT2 invocations: each of float, double, scomplex
// and dcomplex paired with the other three types (never with itself), with
// tfuncname and varname passed through as the trailing arguments.
// GENTPROT2 itself is not defined in this span; presumably the code that
// invokes the inserter defines it first -- TODO confirm.
// NOTE(review): the no-auxiliary-argument companion is spelled
// INSERT_GENTPROT2_MIXDP0 (no underscore before "DP"), whereas this one is
// INSERT_GENTPROT2_MIX_DP. Both names are left alone because callers may
// depend on them.

#define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \
\
GENTPROT2( float,    double,   s, d, tfuncname, varname ) \
GENTPROT2( float,    scomplex, s, c, tfuncname, varname ) \
GENTPROT2( float,    dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double,   float,    d, s, tfuncname, varname ) \
GENTPROT2( double,   scomplex, d, c, tfuncname, varname ) \
GENTPROT2( double,   dcomplex, d, z, tfuncname, varname ) \
\
GENTPROT2( scomplex, float,    c, s, tfuncname, varname ) \
GENTPROT2( scomplex, double,   c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float,    z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, double,   z, d, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Basic two-operand with real projection of first operand --

// -- (no auxiliary arguments) --
// Expands to four GENTPROT2R invocations in which both operands have the
// same type. The third type argument is float when the first operand is
// float or scomplex, and double when it is double or dcomplex.

#define INSERT_GENTPROT2R_BASIC0( funcname ) \
\
GENTPROT2R( float,    float,    float,  s, s, s, funcname ) \
GENTPROT2R( double,   double,   double, d, d, d, funcname ) \
GENTPROT2R( scomplex, scomplex, float,  c, c, s, funcname ) \
GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname )

// -- (one auxiliary argument) --
// Same four instances as INSERT_GENTPROT2R_BASIC0, with varname forwarded as
// an additional trailing argument.

#define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \
\
GENTPROT2R( float,    float,    float,  s, s, s, tfuncname, varname ) \
GENTPROT2R( double,   double,   double, d, d, d, tfuncname, varname ) \
GENTPROT2R( scomplex, scomplex, float,  c, c, s, tfuncname, varname ) \
GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )


// -- Mixed domain two-operand with real projection of first operand --

// -- (no auxiliary arguments) --
// Expands to four GENTPROT2R invocations pairing float with scomplex and
// double with dcomplex, in both operand orders. The third type argument is
// float for the float/scomplex pairs and double for the double/dcomplex
// pairs.

#define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \
\
GENTPROT2R( float,    scomplex, float,  s, c, s, tfuncname ) \
GENTPROT2R( scomplex, float,    float,  c, s, s, tfuncname ) \
\
GENTPROT2R( double,   dcomplex, double, d, z, d, tfuncname ) \
GENTPROT2R( dcomplex, double,   double, z, d, d, tfuncname )

// -- (one auxiliary argument)
-- #define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname ) // -- Mixed precision two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \ \ GENTPROT2R( float, double, float, s, d, s, tfuncname ) \ GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \ \ GENTPROT2R( double, float, double, d, s, d, tfuncname ) \ GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \ \ GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \ GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \ \ GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \ GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \ \ GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \ GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \ \ GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \ GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \ \ GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \ GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ \ GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \ GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) // -- Macros for functions with three primary operands ------------------------- // -- Basic three-operand macro -- #define INSERT_GENTPROT3_BASIC( funcname ) \ \ GENTPROT3( float, float, float, s, s, s, funcname ) \ GENTPROT3( double, double, double, d, d, d, funcname ) \ GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \ GENTPROT3( dcomplex, dcomplex, 
dcomplex, z, z, z, funcname ) // -- Mixed domain three-operand macro -- #define INSERT_GENTPROT3_MIX_D( funcname ) \ \ GENTPROT3( float, float, scomplex, s, s, c, funcname ) \ GENTPROT3( float, scomplex, float, s, c, s, funcname ) \ GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \ \ GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \ GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \ GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \ \ GENTPROT3( scomplex, float, float, c, s, s, funcname ) \ GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \ GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \ \ GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \ GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \ GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname ) // -- Mixed precision three-operand macro -- #define INSERT_GENTPROT3_MIX_P( funcname ) \ \ GENTPROT3( float, float, double, s, s, d, funcname ) \ GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \ \ GENTPROT3( float, double, float, s, d, s, funcname ) \ GENTPROT3( float, double, double, s, d, d, funcname ) \ GENTPROT3( float, double, scomplex, s, d, c, funcname ) \ GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \ \ GENTPROT3( float, scomplex, double, s, c, d, funcname ) \ GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \ \ GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \ GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \ GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \ GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \ \ \ GENTPROT3( double, float, float, d, s, s, funcname ) \ GENTPROT3( double, float, double, d, s, d, funcname ) \ GENTPROT3( double, float, scomplex, d, s, c, funcname ) \ GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \ \ GENTPROT3( double, double, float, d, d, s, funcname ) \ GENTPROT3( double, double, scomplex, d, d, c, funcname ) \ \ GENTPROT3( double, 
scomplex, float, d, c, s, funcname ) \ GENTPROT3( double, scomplex, double, d, c, d, funcname ) \ GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \ GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \ \ GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \ GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \ \ \ GENTPROT3( scomplex, float, double, c, s, d, funcname ) \ GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \ \ GENTPROT3( scomplex, double, float, c, d, s, funcname ) \ GENTPROT3( scomplex, double, double, c, d, d, funcname ) \ GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \ GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \ \ GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \ GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \ \ GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \ GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \ GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \ GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \ \ \ GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \ GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \ GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \ GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \ \ GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \ GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \ \ GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \ GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \ GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \ GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \ \ GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \ GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname ) \ // -- Basic three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_BASIC( funcname ) \ \ GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \ 
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \ GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname ) // -- Mixed domain three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_D( funcname ) \ \ GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \ GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \ GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \ \ GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \ GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \ GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \ \ GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \ GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \ \ GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \ GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname ) // -- Mixed precision three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_P( funcname ) \ \ GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \ GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \ \ GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \ GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \ GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \ GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \ \ GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \ GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \ \ GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, 
z, funcname ) \ GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \ GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \ GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \ \ \ GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \ GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \ GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \ GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \ \ GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \ GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \ \ GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \ GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \ GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \ GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \ \ GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \ GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \ \ \ GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \ GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \ \ GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \ GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \ GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \ GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \ \ GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \ \ GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \ GENTPROT3U12( 
scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \ \ \ GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \ GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \ GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \ GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \ GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \ \ GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname ) #endif // end bli_gentprot_macro_defs.h // begin bli_misc_macro_defs.h #ifndef BLIS_MISC_MACRO_DEFS_H #define BLIS_MISC_MACRO_DEFS_H // -- Miscellaneous macros -- // min, max, abs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) #define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) #define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) ) // fmin, fmax, fabs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_fmin( a, b ) bli_min( a, b ) #define bli_fmax( a, b ) bli_max( a, b ) #define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) ) // fminabs, fmaxabs // NOTE: These must remain macros since we don't know the types of a and b. 
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif // end bli_edge_case_macro_defs.h // begin bli_param_macro_defs.h #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ||
             ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) );
}

// Returns true when uplo is upper and the m x n region is strictly below
// the diagonal, or uplo is lower and the region is strictly above it.
BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n )
{
    return ( bool )
           ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ||
             ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) );
}

// pruning-related

// Prunes the rows above the point where the diagonal meets the left edge.
// *offm_inc is always written: 0 when nothing is pruned, otherwise the
// number of rows removed. *n is never modified.
BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc )
{
    *offm_inc = 0;

    // If the diagonal intersects the left side of the matrix,
    // ignore the area above that intersection.
    if ( *diagoff < 0 )
    {
        *m        = *m + *diagoff;
        *offm_inc = - *diagoff;
        *diagoff  = 0;
    }
}

// Prunes the columns to the right of the point where the diagonal meets the
// bottom edge by clamping *n to *diagoff + *m. *offn_inc is always set to 0;
// *diagoff and *m are only read.
BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc )
{
    *offn_inc = 0;

    // If the diagonal intersects the bottom side of the matrix,
    // ignore the area to the right of that intersection.
    if ( *n > *diagoff + *m )
    {
        *n = *diagoff + *m;
    }
}

// Prunes the columns to the left of the point where the diagonal meets the
// top edge. *offn_inc is always written: 0 when nothing is pruned, otherwise
// the number of columns removed. *m is never modified.
BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc )
{
    *offn_inc = 0;

    // If the diagonal intersects the top side of the matrix,
    // ignore the area to the left of that intersection.
    if ( *diagoff > 0 )
    {
        *n        = *n - *diagoff;
        *offn_inc = + *diagoff;
        *diagoff  = 0;
    }
}

// Prunes the rows below the point where the diagonal meets the right edge by
// clamping *m to -(*diagoff) + *n. *offm_inc is always set to 0; *diagoff
// and *n are only read.
BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc )
{
    *offm_inc = 0;

    // If the diagonal intersects the right side of the matrix,
    // ignore the area below that intersection.
    if ( *m > -(*diagoff) + *n )
    {
        *m = -(*diagoff) + *n;
    }
}

// thread range-related

// Replaces *diagoff with *n - *diagoff - *m and toggles *uplo. The
// dimensions *m and *n are read but not modified.
BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n )
{
    *diagoff = *n - *diagoff - *m;
    bli_toggle_uplo( uplo );
}

// Swaps *m with *n, negates *diagoff and toggles *uplo.
BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n )
{
    bli_swap_dims( m, n );
    bli_negate_diag_offset( diagoff );
    bli_toggle_uplo( uplo );
}

// Maps the range [*start, *end) to [n - *end, n - *start), i.e. the same
// range expressed when counting from the opposite end of n elements.
BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end )
{
    dim_t start2 = n - *start;
    dim_t end2   = n - *end;

    // The reversed endpoints swap roles so that *start <= *end is preserved.
    *start = end2;
    *end   = start2;
}

// mdim_t-related

// Returns true when mdim equals BLIS_M.
BLIS_INLINE bool bli_is_m_dim( mdim_t mdim )
{
    return ( bool )
           ( mdim == BLIS_M );
}

// Returns true when mdim equals BLIS_N.
BLIS_INLINE bool bli_is_n_dim( mdim_t mdim )
{
    return ( bool )
           ( mdim == BLIS_N );
}

// Returns BLIS_N when mdim is BLIS_M, and BLIS_M for any other value.
BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim )
{
    return ( mdim_t )
           ( mdim == BLIS_M ? BLIS_N : BLIS_M );
}

// In-place version of bli_dim_toggled().
BLIS_INLINE void bli_toggle_dim( mdim_t* mdim )
{
    *mdim = bli_dim_toggled( *mdim );
}

// stor3_t-related

// Encodes the storage of C, A and B as a stor3_t id. Returns BLIS_XXX when
// any operand is general-stored; otherwise the id is
// 4*(C is col-stored) + 2*(A is col-stored) + 1*(B is col-stored).
BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c,
                                            inc_t rs_a, inc_t cs_a,
                                            inc_t rs_b, inc_t cs_b  )
{
    // If any matrix is general-stored, return the stor3_t id for the
    // general-purpose sup microkernel.
    if ( bli_is_gen_stored( rs_c, cs_c ) ||
         bli_is_gen_stored( rs_a, cs_a ) ||
         bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX;

    // Otherwise, compute and return the stor3_t id as follows.
    const bool c_is_col = bli_is_col_stored( rs_c, cs_c );
    const bool a_is_col = bli_is_col_stored( rs_a, cs_a );
    const bool b_is_col = bli_is_col_stored( rs_b, cs_b );

    return ( stor3_t )( 4 * c_is_col +
                        2 * a_is_col +
                        1 * b_is_col );
}

// Maps a stor3_t id to the id of the transposed operation via a lookup
// table (the active "#if 1" branch). The disabled branch computes the same
// mapping with bit operations: flip the C bit, and flip and exchange the A
// and B bits. id is used as an array index without a range check, so it
// must be below BLIS_NUM_3OP_RC_COMBOS.
BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id )
{
#if 1
    stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
    =
    {
      ( stor3_t )7,  // BLIS_RRR = 0  ->  BLIS_CCC = 7
      ( stor3_t )5,  // BLIS_RRC = 1  ->  BLIS_CRC = 5
      ( stor3_t )6,  // BLIS_RCR = 2  ->  BLIS_CCR = 6
      ( stor3_t )4,  // BLIS_RCC = 3  ->  BLIS_CRR = 4
      ( stor3_t )3,  // BLIS_CRR = 4  ->  BLIS_RCC = 3
      ( stor3_t )1,  // BLIS_CRC = 5  ->  BLIS_RRC = 1
      ( stor3_t )2,  // BLIS_CCR = 6  ->  BLIS_RCR = 2
      ( stor3_t )0,  // BLIS_CCC = 7  ->  BLIS_RRR = 0
    };

    return map[id];
#else
    return   ( ( id & 0x4 ) ^ 0x4 )        | // flip c bit
           ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position
           ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 );  // flip a bit and move to b position
#endif
}

// Returns id with bit 0x1 flipped (the active "#else" branch); the disabled
// table encodes the same mapping.
// NOTE(review): per bli_stor3_from_strides(), bit 0x1 carries B's storage
// and bit 0x2 carries A's, so this "transa" helper toggles B's bit --
// confirm against callers before relying on the name.
BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id )
{
#if 0
    stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
    =
    {
      ( stor3_t )1,  // BLIS_RRR = 0  ->  BLIS_RRC = 1
      ( stor3_t )0,  // BLIS_RRC = 1  ->  BLIS_RRR = 0
      ( stor3_t )3,  // BLIS_RCR = 2  ->  BLIS_RCC = 3
      ( stor3_t )2,  // BLIS_RCC = 3  ->  BLIS_RCR = 2
      ( stor3_t )5,  // BLIS_CRR = 4  ->  BLIS_CRC = 5
      ( stor3_t )4,  // BLIS_CRC = 5  ->  BLIS_CRR = 4
      ( stor3_t )7,  // BLIS_CCR = 6  ->  BLIS_CCC = 7
      ( stor3_t )6,  // BLIS_CCC = 7  ->  BLIS_CCR = 6
    };

    return map[id];
#else
    return ( stor3_t )( id ^ 0x1 );
#endif
}

// Returns id with bit 0x2 flipped (the active "#else" branch); the disabled
// table encodes the same mapping.
// NOTE(review): per bli_stor3_from_strides(), bit 0x2 carries A's storage,
// so this "transb" helper toggles A's bit -- confirm against callers.
BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id )
{
#if 0
    stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
    =
    {
      ( stor3_t )2,  // BLIS_RRR = 0  ->  BLIS_RCR = 2
      ( stor3_t )3,  // BLIS_RRC = 1  ->  BLIS_RCC = 3
      ( stor3_t )0,  // BLIS_RCR = 2  ->  BLIS_RRR = 0
      ( stor3_t )1,  // BLIS_RCC = 3  ->  BLIS_RRC = 1
      ( stor3_t )6,  // BLIS_CRR = 4  ->  BLIS_CCR = 6
      ( stor3_t )7,  // BLIS_CRC = 5  ->  BLIS_CCC = 7
      ( stor3_t )4,  // BLIS_CCR = 6  ->  BLIS_CRR = 4
      ( stor3_t )5,  // BLIS_CCC = 7  ->  BLIS_CRC = 5
    };

    return map[id];
#else
    return ( stor3_t )( id ^ 0x2 );
#endif
}

// index-related

// Returns true when i is the final iteration (n_iter - 1) and n_left is
// nonzero.
BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i == n_iter - 1 && n_left != 0 );
}

// Logical negation of bli_is_edge_f().
BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i != n_iter - 1 || n_left == 0 );
}

// Returns true when i is the first iteration (0) and n_left is nonzero.
// n_iter is unused.
BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i == 0 && n_left != 0 );
}

// Logical negation of bli_is_edge_b(). n_iter is unused.
BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i != 0 || n_left == 0 );
}

// Returns true when i equals end_iter - 1. tid and nth are unused; they are
// present so the signature matches bli_is_last_iter_rr().
BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
    return ( bool )
           ( i == end_iter - 1 );
}

// Returns true when i equals end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ).
// nth must be nonzero (it is used as a modulus).
BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
    return ( bool )
           ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
}

// Dispatches to the _sl variant when BLIS_ENABLE_JRIR_SLAB is defined and to
// the _rr variant otherwise.
BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
#ifdef BLIS_ENABLE_JRIR_SLAB
    return bli_is_last_iter_sl( i, end_iter, tid, nth );
#else // BLIS_ENABLE_JRIR_RR
    return bli_is_last_iter_rr( i, end_iter, tid, nth );
#endif
}

// packbuf_t-related

// Extracts the BLIS_PACK_BUFFER_BITS field of buf_type and shifts it down
// by BLIS_PACK_BUFFER_SHIFT.
BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type )
{
    return ( guint_t )
           ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT );
}

// pack_t-related

// Returns true when BLIS_PACK_BIT is set in schema.
BLIS_INLINE bool bli_is_packed( pack_t schema )
{
    return ( bool )
           ( schema & BLIS_PACK_BIT );
}

// Returns true when the BLIS_PACK_RC_BIT field of schema equals
// BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS.
BLIS_INLINE bool bli_is_row_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
                                                  BLIS_BITVAL_PACKED_ROWS ) );
}

// Returns true when the BLIS_PACK_RC_BIT field of schema equals
// BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS.
BLIS_INLINE bool bli_is_col_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
                                                  BLIS_BITVAL_PACKED_COLUMNS ) );
}

// Returns true when BLIS_PACK_PANEL_BIT is set in schema.
BLIS_INLINE bool bli_is_panel_packed( pack_t schema )
{
    return ( bool )
           ( schema & BLIS_PACK_PANEL_BIT );
}

// Returns true when the BLIS_PACK_FORMAT_BITS field equals BLIS_BITVAL_1R.
BLIS_INLINE bool bli_is_1r_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R );
}

// Returns true when the BLIS_PACK_FORMAT_BITS field equals BLIS_BITVAL_1E.
BLIS_INLINE bool bli_is_1e_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E );
}

// Returns true when schema is either 1r- or 1e-packed.
BLIS_INLINE bool bli_is_1m_packed( pack_t schema )
{
    return ( bool )
           ( bli_is_1r_packed( schema ) ||
             bli_is_1e_packed( schema ) );
}

// Returns true when the BLIS_PACK_FORMAT_BITS field of schema is zero.
BLIS_INLINE bool bli_is_nat_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 );
}

// Returns true when the BLIS_PACK_FORMAT_BITS field of schema is nonzero.
BLIS_INLINE bool bli_is_ind_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 );
}

// Extracts the BLIS_PACK_FORMAT_BITS field of schema and shifts it down by
// BLIS_PACK_FORMAT_SHIFT.
BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema )
{
    return ( guint_t )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT );
}

// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument.
//
// Outputs: *ij0 and *n_shift are always written. When A is entirely
// unstored, *uplo_eff is set to BLIS_ZEROS and the remaining outputs
// (*n_elem_max, *n_iter, *inca, *lda) are NOT written. Otherwise all
// outputs are written, and dimensions/increments are swapped (with uplo
// toggled and the diagonal offset negated) when bli_is_row_tilted() reports
// that the matrix is row-tilted.
BLIS_INLINE void bli_set_dims_incs_uplo_1m
     (
       doff_t  diagoffa, diag_t diaga,
       uplo_t  uploa,    dim_t  m,          dim_t  n,      inc_t  rs_a, inc_t  cs_a,
       uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t*  ij0,      dim_t* n_shift
     )
{
    // This is to prevent the compiler from warning about uninitialized
    // variables.
    *ij0     = 0;
    *n_shift = 0;

    // If matrix A is entirely "unstored", that is, if either:
    // - A is lower-stored and entirely above the diagonal, or
    // - A is upper-stored and entirely below the diagonal
    // then we mark the storage as implicitly zero.
    if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
    {
        *uplo_eff = BLIS_ZEROS;
    }
    else
    {
        doff_t diagoffa_use_ = diagoffa;
        doff_t diagoff_eff_;
        dim_t  n_iter_max_;

        // A unit diagonal is excluded from the stored region by shifting the
        // working diagonal offset.
        if ( bli_is_unit_diag( diaga ) )
            bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

        // If matrix A is entirely "stored", that is, if either:
        // - A is upper-stored and entirely above the diagonal, or
        // - A is lower-stored and entirely below the diagonal
        // then we mark the storage as dense.
        if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
            uploa = BLIS_DENSE;

        n_iter_max_  = n;
        *n_elem_max  = m;
        *inca        = rs_a;
        *lda         = cs_a;
        *uplo_eff    = uploa;
        diagoff_eff_ = diagoffa_use_;

        // For row-tilted matrices, operate on the transpose so the inner
        // dimension uses the smaller stride.
        if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) )
        {
            bli_swap_dims( &n_iter_max_, n_elem_max );
            bli_swap_incs( inca, lda );
            bli_toggle_uplo( uplo_eff );
            bli_negate_diag_offset( &diagoff_eff_ );
        }

        if ( bli_is_dense( *uplo_eff ) )
        {
            *n_iter = n_iter_max_;
        }
        else if ( bli_is_upper( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = 0;
                *n_shift    = -diagoff_eff_;
                *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
                *n_iter     = n_iter_max_;
            }
            else
            {
                *ij0        = diagoff_eff_;
                *n_shift    = 0;
                *n_iter     = n_iter_max_ - diagoff_eff_;
            }
        }
        else // if ( bli_is_lower( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = -diagoff_eff_;
                *n_shift    = 0;
                *n_elem_max = *n_elem_max + diagoff_eff_;
                *n_iter     = bli_min( *n_elem_max, bli_min( m, n ) );
            }
            else
            {
                *ij0        = 0;
                *n_shift    = diagoff_eff_;
                *n_iter     = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
            }
        }
    }
}

// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument (without column-wise stride optimization).
//
// Identical to bli_set_dims_incs_uplo_1m() except that the row-tilted swap
// is never performed. As there, when A is entirely unstored only *ij0,
// *n_shift and *uplo_eff are written.
BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap
     (
       doff_t  diagoffa, diag_t diaga,
       uplo_t  uploa,    dim_t  m,          dim_t  n,      inc_t  rs_a, inc_t  cs_a,
       uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t*  ij0,      dim_t* n_shift
     )
{
    // This is to prevent the compiler from warning about uninitialized
    // variables.
    *ij0     = 0;
    *n_shift = 0;

    // If matrix A is entirely "unstored", that is, if either:
    // - A is lower-stored and entirely above the diagonal, or
    // - A is upper-stored and entirely below the diagonal
    // then we mark the storage as implicitly zero.
    if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
    {
        *uplo_eff = BLIS_ZEROS;
    }
    else
    {
        doff_t diagoffa_use_ = diagoffa;
        doff_t diagoff_eff_;
        dim_t  n_iter_max_;

        // A unit diagonal is excluded from the stored region by shifting the
        // working diagonal offset.
        if ( bli_is_unit_diag( diaga ) )
            bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

        // If matrix A is entirely "stored", that is, if either:
        // - A is upper-stored and entirely above the diagonal, or
        // - A is lower-stored and entirely below the diagonal
        // then we mark the storage as dense.
        if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
            uploa = BLIS_DENSE;

        n_iter_max_  = n;
        *n_elem_max  = m;
        *inca        = rs_a;
        *lda         = cs_a;
        *uplo_eff    = uploa;
        diagoff_eff_ = diagoffa_use_;

        if ( bli_is_dense( *uplo_eff ) )
        {
            *n_iter = n_iter_max_;
        }
        else if ( bli_is_upper( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = 0;
                *n_shift    = -diagoff_eff_;
                *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
                *n_iter     = n_iter_max_;
            }
            else
            {
                *ij0        = diagoff_eff_;
                *n_shift    = 0;
                *n_iter     = n_iter_max_ - diagoff_eff_;
            }
        }
        else // if ( bli_is_lower( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = -diagoff_eff_;
                *n_shift    = 0;
                *n_elem_max = *n_elem_max + diagoff_eff_;
                *n_iter     = bli_min( *n_elem_max, bli_min( m, n ) );
            }
            else
            {
                *ij0        = 0;
                *n_shift    = diagoff_eff_;
                *n_iter     = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
            }
        }
    }
}

// Set dimensions and increments for TWO matrix arguments.
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); } BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); } BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); } // NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, // packbuf_t is stored/used from the context in order to support various // induced methods. (Though ideally the packbuf_t field would only be // present in the control tree). BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); } BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc ); } BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj ) { bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); } BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj ) { bli_obj_apply_conj( BLIS_CONJUGATE, obj ); } BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; } // Root matrix query BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) { return ( obj_t* )( obj->root ); } BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( 
bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); } // Root matrix modification BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) { obj->root = obj; } // Diagonal offset query BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) ); } // Diagonal offset modification BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off = ( doff_t )offset; } BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj ) { obj->diag_off = -(obj->diag_off); } BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off += ( doff_t )offset; } // Dimension query BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) { return ( obj->dim[ mdim ] ); } BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) : bli_obj_length( obj ) ); } BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) ); } BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 ); } // Stride/increment query BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) { return ( obj->rs ); } BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) { return ( obj->cs ); } BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) { return ( obj->is ); } BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); } // Note: The purpose of these functions is to obtain the length and width // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) ); } BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 ); } // Dimension modification BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj ) { obj->dim[ BLIS_M ] = m; } BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj ) { obj->dim[ BLIS_N ] = n; } BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) { obj->dim[ mdim ] = dim_val; } BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) { if ( bli_does_notrans( trans ) ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } else // if ( bli_does_trans( trans ) ) { bli_obj_set_length( n, obj ); bli_obj_set_width( m, obj ); } } // Stride/increment predicates // // NOTE: The following two macros differ from their non-obj counterparts // in that they do not identify m x 1 and 1 x n objects as row-stored and // column-stored, respectively, which is needed when considering packed // objects. But this is okay, since none of the invocations of these // "obj" macros are used on packed matrices. 
// BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); } // Stride/increment modification BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) { obj->rs = rs; } BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) { obj->cs = cs; } BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_row_stride( rs, obj ); bli_obj_set_col_stride( cs, obj ); } BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) { obj->is = is; } // Offset query BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) { return ( obj->off[ mdim ] ); } // Offset modification BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] = offset; } BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_set_off( BLIS_M, offm, obj ); bli_obj_set_off( BLIS_N, offn, obj ); } BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] += offset; } BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_inc_off( BLIS_M, offm, obj ); bli_obj_inc_off( BLIS_N, offn, obj ); } // Diagonal offset predicates BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) { return ( 
bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); } // Buffer address query BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) { return ( void* ) ( obj->buffer ); } // Buffer address modification BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) { obj->buffer = p; } // Bufferless scalar field query BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); } // Bufferless scalar field modification BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); } // Element size modification BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) { obj->elem_size = size; } // Packed matrix info query BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) { return ( obj->m_padded ); } BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) { return ( obj->n_padded ); } // Packed matrix info modification BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj ) { obj->m_padded = m; } BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj ) { obj->n_padded = n; } BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) { 
bli_obj_set_padded_length( m, obj ); bli_obj_set_padded_width( n, obj ); } // Packed panel info query BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) { return ( obj->m_panel ); } BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) { return ( obj->n_panel ); } BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) { return ( obj->pd ); } BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) { return ( obj->ps ); } // Packed panel info modification BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj ) { obj->m_panel = m; } BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj ) { obj->n_panel = n; } BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_panel_length( m, obj ); bli_obj_set_panel_width( n, obj ); } BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) { obj->pd = pd; } BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) { obj->ps = ps; } // stor3_t-related BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); inc_t rs_a, cs_a; inc_t rs_b, cs_b; if ( bli_obj_has_notrans( a ) ) { rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else { rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else { rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b ); } // -- User-provided information macros -- // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { return obj->pack_fn; } BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker_fn; } BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { return obj->ker_params; } // Function pointer modification 
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) { const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); bli_obj_set_pack_schema( schema_b, a ); bli_obj_set_pack_schema( schema_a, b ); } // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. BLIS_INLINE void bli_obj_induce_trans( obj_t* obj ) { // Induce transposition among basic fields. dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); if ( bli_obj_is_upper_or_lower( obj ) ) bli_obj_toggle_uplo( obj ); // Induce transposition among packed fields. dim_t m_padded = bli_obj_padded_length( obj ); dim_t n_padded = bli_obj_padded_width( obj ); dim_t m_panel = bli_obj_panel_length( obj ); dim_t n_panel = bli_obj_panel_width( obj ); bli_obj_set_padded_dims( n_padded, m_padded, obj ); bli_obj_set_panel_dims( n_panel, m_panel, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj ) { // NOTE: This function is only used in situations where the matrices // are guaranteed to not have structure or be packed. // Induce transposition among basic fields. 
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif // end bli_obj_macro_defs.h // begin bli_complex_macro_defs.h #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define 
bli_zimag( x ) ( cimag(x) ) #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_complex_macro_defs.h // begin bli_scalar_macro_defs.h #ifndef BLIS_SCALAR_MACRO_DEFS_H #define BLIS_SCALAR_MACRO_DEFS_H // -- Assignment/Accessor macros -- // NOTE: This macro is defined first since some of the other scalar macros // use it to abstract away the method used to assign complex values (ie: // whether fields of a struct are set directly or whether native C99 // assignment is used). // begin bli_sets.h #ifndef BLIS_SETS_H #define BLIS_SETS_H // sets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sssets( xr, xi, y ) { (y) = (xr); } #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } #define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } #define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, 
xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif // end bli_sets.h // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. // begin bli_setrs.h #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Each macro overwrites only the real component of y with xr. For real
// destinations that is a plain assignment.
#define bli_sssetrs( xr, y )  { (y) = (xr); }
#define bli_dssetrs( xr, y )  { (y) = (xr); }

#define bli_sdsetrs( xr, y )  { (y) = (xr); }
#define bli_ddsetrs( xr, y )  { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct-based complex: assign the real field directly.
#define bli_scsetrs( xr, y )  { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y )  { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y )  { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y )  { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 complex: rebuild y from the new real part and its current imaginary
// part.
#define bli_scsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX


// Single-char aliases.
#define bli_ssetrs( xr, y )  bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y )  bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y )  bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y )  bli_dzsetrs( xr, y )


#endif
// end bli_setrs.h
// begin bli_setis.h


#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

// Each macro overwrites only the imaginary component of y with xi. Real
// destinations have no imaginary component, so those variants are no-ops.
#define bli_sssetis( xi, y )  { ; }
#define bli_dssetis( xi, y )  { ; }

#define bli_sdsetis( xi, y )  { ; }
#define bli_ddsetis( xi, y )  { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct-based complex: assign the imaginary field directly.
#define bli_scsetis( xi, y )  { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y )  { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y )  { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y )  { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 complex: rebuild y from its current real part and the new imaginary
// part.
#define bli_scsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX


// Single-char aliases.
#define bli_ssetis( xi, y )  bli_sssetis( xi, y )
#define bli_dsetis( xi, y )  bli_ddsetis( xi, y )
#define bli_csetis( xi, y )  bli_scsetis( xi, y )
#define bli_zsetis( xi, y )  bli_dzsetis( xi, y )


#endif
// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h


#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

#define bli_ssgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; } #define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; } #define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; } #define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; } #define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; } #define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi ) #define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi ) #define bli_cgets( x, yr, yi ) 
bli_csgets( x, yr, yi ) #define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi ) #define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi ) #endif // end bli_gets.h // -- Scalar constant initialization macros -- // begin bli_constants.h #ifndef BLIS_CONSTANTS_H #define BLIS_CONSTANTS_H // return pointers to constants // 1 #define bli_s1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ONE ) ) #define bli_d1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ONE ) ) #define bli_c1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) ) #define bli_z1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) ) #define bli_i1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ONE ) ) // 0 #define bli_s0 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ZERO ) ) #define bli_d0 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ZERO ) ) #define bli_c0 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) ) #define bli_z0 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) ) #define bli_i0 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ZERO ) ) // -1 #define bli_sm1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_MINUS_ONE ) ) #define bli_dm1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_MINUS_ONE ) ) #define bli_cm1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_zm1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_im1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_MINUS_ONE ) ) #endif // end bli_constants.h // -- Separated scalar macros (separated real/imaginary values) -- // begin bli_absq2ris.h #ifndef BLIS_ABSQ2RIS_H #define BLIS_ABSQ2RIS_H // absq2ris #define bli_sabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define bli_dabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define 
bli_cabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0F; \ } #define bli_zabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0; \ } #endif // end bli_absq2ris.h // begin bli_abval2ris.h #ifndef BLIS_ABVAL2RIS_H #define BLIS_ABVAL2RIS_H // abval2ris #define bli_sabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabsf(xr); \ } #define bli_dabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabs(xr); \ } #define bli_cabval2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0F; \ } #define bli_zabval2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0; \ } #endif // end bli_abval2ris.h // begin bli_addris.h #ifndef BLIS_ADDRIS_H #define BLIS_ADDRIS_H // addris #define bli_saddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_daddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_caddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #define bli_zaddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #endif // end bli_addris.h // begin bli_addjris.h #ifndef BLIS_ADDJRIS_H #define BLIS_ADDJRIS_H // addjris #define bli_saddjris( ar, ai, xr, xi ) bli_saddris( (ar), -(ai), (xr), (xi) ) #define bli_daddjris( ar, ai, xr, xi ) bli_daddris( (ar), -(ai), (xr), (xi) ) #define bli_caddjris( ar, ai, xr, xi ) bli_caddris( (ar), -(ai), (xr), (xi) ) #define bli_zaddjris( ar, ai, xr, xi ) bli_zaddris( (ar), -(ai), (xr), (xi) ) #endif // end bli_addjris.h // begin bli_add3ris.h #ifndef BLIS_ADD3RIS_H #define BLIS_ADD3RIS_H // add3ris #define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_dadd3ris( ar, 
ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #endif // end bli_add3ris.h // begin bli_axpbyris.h #ifndef BLIS_AXPBYRIS_H #define BLIS_AXPBYRIS_H // axpbyris #define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyris bli_rxxpbyris #define bli_dsssxpbyris bli_rxxpbyris #define bli_csssxpbyris bli_rxxpbyris #define bli_zsssxpbyris bli_rxxpbyris #define bli_sdssxpbyris bli_rxxpbyris #define bli_ddssxpbyris bli_rxxpbyris #define bli_cdssxpbyris bli_rxxpbyris #define bli_zdssxpbyris bli_rxxpbyris #define bli_scssxpbyris bli_rxxpbyris #define bli_dcssxpbyris bli_rxxpbyris #define bli_ccssxpbyris bli_rxxpbyris #define bli_zcssxpbyris bli_rxxpbyris #define bli_szssxpbyris bli_rxxpbyris #define bli_dzssxpbyris bli_rxxpbyris #define bli_czssxpbyris bli_rxxpbyris #define bli_zzssxpbyris bli_rxxpbyris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyris. 
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
// Same-datatype entry points for axpbyjris. Each one expands to a
// four-character name (datatypes of a, x, b, y) that has to be supplied by
// the type series of this header.
// NOTE(review): the (??ss) series is spelled "xpbyjris" (no 'a'); confirm
// that the four targets below are actually defined.
#define bli_saxpbyjris  bli_ssssaxpbyjris
#define bli_daxpbyjris  bli_ddddaxpbyjris
#define bli_caxpbyjris  bli_ccccaxpbyjris
#define bli_zaxpbyjris  bli_zzzzaxpbyjris

#endif
// end bli_axpbyjris.h
// begin bli_axpyris.h


#ifndef BLIS_AXPYRIS_H
#define BLIS_AXPYRIS_H

// axpyris
//
// y := y + a * x, with every operand passed as a separate real part (?r) and
// imaginary part (?i). Five kernels cover the domain combinations; the type
// series further down selects one per (a, x, y) datatype triple.

// Real-only update: only ar, xr and yr are referenced.
#define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}

// Full complex update: accumulates the complex product a * x into (yr, yi).
#define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
    (yi) += (ai) * (xr) + (ar) * (xi); \
}

// Real part of the complex product only; yi is never written.
#define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
}

// Only the real part of a is used; it scales both parts of x.
#define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * (xi); \
}

// Only the real part of x is used; it is scaled by both parts of a.
#define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyris  bli_rxaxpyris
#define bli_dssaxpyris  bli_rxaxpyris
#define bli_cssaxpyris  bli_rxaxpyris
#define bli_zssaxpyris  bli_rxaxpyris

#define bli_sdsaxpyris  bli_rxaxpyris
#define bli_ddsaxpyris  bli_rxaxpyris
#define bli_cdsaxpyris  bli_rxaxpyris
#define bli_zdsaxpyris  bli_rxaxpyris

#define bli_scsaxpyris  bli_rxaxpyris
#define bli_dcsaxpyris  bli_rxaxpyris
#define bli_ccsaxpyris  bli_roaxpyris
#define bli_zcsaxpyris  bli_roaxpyris

#define bli_szsaxpyris  bli_rxaxpyris
#define bli_dzsaxpyris  bli_rxaxpyris
#define bli_czsaxpyris  bli_roaxpyris
#define bli_zzsaxpyris  bli_roaxpyris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyris  bli_rxaxpyris
#define bli_dsdaxpyris  bli_rxaxpyris
#define bli_csdaxpyris  bli_rxaxpyris
#define bli_zsdaxpyris  bli_rxaxpyris

#define bli_sddaxpyris  bli_rxaxpyris
#define bli_dddaxpyris  bli_rxaxpyris
#define bli_cddaxpyris  bli_rxaxpyris
#define bli_zddaxpyris  bli_rxaxpyris

#define bli_scdaxpyris  bli_rxaxpyris
#define bli_dcdaxpyris  bli_rxaxpyris
#define bli_ccdaxpyris  bli_roaxpyris
#define bli_zcdaxpyris  bli_roaxpyris

#define bli_szdaxpyris  bli_rxaxpyris
#define bli_dzdaxpyris  bli_rxaxpyris
#define bli_czdaxpyris  bli_roaxpyris
#define bli_zzdaxpyris  bli_roaxpyris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyris  bli_rxaxpyris
#define bli_dscaxpyris  bli_rxaxpyris
#define bli_cscaxpyris  bli_rcaxpyris
#define bli_zscaxpyris  bli_rcaxpyris

#define bli_sdcaxpyris  bli_rxaxpyris
#define bli_ddcaxpyris  bli_rxaxpyris
#define bli_cdcaxpyris  bli_rcaxpyris
#define bli_zdcaxpyris  bli_rcaxpyris

#define bli_sccaxpyris  bli_craxpyris
#define bli_dccaxpyris  bli_craxpyris
#define bli_cccaxpyris  bli_cxaxpyris
#define bli_zccaxpyris  bli_cxaxpyris

#define bli_szcaxpyris  bli_craxpyris
#define bli_dzcaxpyris  bli_craxpyris
#define bli_czcaxpyris  bli_cxaxpyris
#define bli_zzcaxpyris  bli_cxaxpyris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyris  bli_rxaxpyris
#define bli_dszaxpyris  bli_rxaxpyris
#define bli_cszaxpyris  bli_rcaxpyris
#define bli_zszaxpyris  bli_rcaxpyris

#define bli_sdzaxpyris  bli_rxaxpyris
#define bli_ddzaxpyris  bli_rxaxpyris
#define bli_cdzaxpyris  bli_rcaxpyris
#define bli_zdzaxpyris  bli_rcaxpyris

#define bli_sczaxpyris  bli_craxpyris
#define bli_dczaxpyris  bli_craxpyris
#define bli_cczaxpyris  bli_cxaxpyris
#define bli_zczaxpyris  bli_cxaxpyris

#define bli_szzaxpyris  bli_craxpyris
#define bli_dzzaxpyris  bli_craxpyris
#define bli_czzaxpyris  bli_cxaxpyris
#define bli_zzzaxpyris  bli_cxaxpyris

// Same-datatype entry points.
#define bli_saxpyris  bli_sssaxpyris
#define bli_daxpyris  bli_dddaxpyris
#define bli_caxpyris  bli_cccaxpyris
#define bli_zaxpyris  bli_zzzaxpyris

#endif
// end bli_axpyris.h
// begin bli_axpyjris.h


#ifndef BLIS_AXPYJRIS_H
#define BLIS_AXPYJRIS_H

// axpyjris
//
// y := y + a * conj(x), on split real/imaginary operands. The kernels mirror
// those of axpyris; every term that contains xi has the opposite sign.

// Real-only update: only ar, xr and yr are referenced.
#define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}

// Full complex update with conjugated x.
#define bli_cxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
    (yi) += (ai) * (xr) - (ar) * (xi); \
}

// Real part of a * conj(x) only; yi is never written.
#define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
}

// Only the real part of a is used; it scales xr and the negated xi.
#define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * -(xi); \
}

// Only the real part of x is used, so the conjugation changes nothing.
#define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyjris  bli_rxaxpyjris
#define bli_dssaxpyjris  bli_rxaxpyjris
#define bli_cssaxpyjris  bli_rxaxpyjris
#define bli_zssaxpyjris  bli_rxaxpyjris

#define bli_sdsaxpyjris  bli_rxaxpyjris
#define bli_ddsaxpyjris  bli_rxaxpyjris
#define bli_cdsaxpyjris  bli_rxaxpyjris
#define bli_zdsaxpyjris  bli_rxaxpyjris

#define bli_scsaxpyjris  bli_rxaxpyjris
#define bli_dcsaxpyjris  bli_rxaxpyjris
#define bli_ccsaxpyjris  bli_roaxpyjris
#define bli_zcsaxpyjris  bli_roaxpyjris

#define bli_szsaxpyjris  bli_rxaxpyjris
#define bli_dzsaxpyjris  bli_rxaxpyjris
#define bli_czsaxpyjris  bli_roaxpyjris
#define bli_zzsaxpyjris  bli_roaxpyjris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyjris  bli_rxaxpyjris
#define bli_dsdaxpyjris  bli_rxaxpyjris
#define bli_csdaxpyjris  bli_rxaxpyjris
#define bli_zsdaxpyjris  bli_rxaxpyjris

#define bli_sddaxpyjris  bli_rxaxpyjris
#define bli_dddaxpyjris  bli_rxaxpyjris
#define bli_cddaxpyjris  bli_rxaxpyjris
#define bli_zddaxpyjris  bli_rxaxpyjris

#define bli_scdaxpyjris  bli_rxaxpyjris
#define bli_dcdaxpyjris  bli_rxaxpyjris
#define bli_ccdaxpyjris  bli_roaxpyjris
#define bli_zcdaxpyjris  bli_roaxpyjris

#define bli_szdaxpyjris  bli_rxaxpyjris
#define bli_dzdaxpyjris  bli_rxaxpyjris
#define bli_czdaxpyjris  bli_roaxpyjris
#define bli_zzdaxpyjris  bli_roaxpyjris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjris  bli_rxaxpyjris
#define bli_dscaxpyjris  bli_rxaxpyjris
#define bli_cscaxpyjris  bli_rcaxpyjris
#define bli_zscaxpyjris  bli_rcaxpyjris

#define bli_sdcaxpyjris  bli_rxaxpyjris
#define bli_ddcaxpyjris  bli_rxaxpyjris
#define bli_cdcaxpyjris  bli_rcaxpyjris
#define bli_zdcaxpyjris  bli_rcaxpyjris

#define bli_sccaxpyjris  bli_craxpyjris
#define bli_dccaxpyjris  bli_craxpyjris
#define bli_cccaxpyjris  bli_cxaxpyjris
#define bli_zccaxpyjris  bli_cxaxpyjris

#define bli_szcaxpyjris  bli_craxpyjris
#define bli_dzcaxpyjris  bli_craxpyjris
#define bli_czcaxpyjris  bli_cxaxpyjris
#define bli_zzcaxpyjris  bli_cxaxpyjris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjris  bli_rxaxpyjris
#define bli_dszaxpyjris  bli_rxaxpyjris
#define bli_cszaxpyjris  bli_rcaxpyjris
#define bli_zszaxpyjris  bli_rcaxpyjris

#define bli_sdzaxpyjris  bli_rxaxpyjris
#define bli_ddzaxpyjris  bli_rxaxpyjris
#define bli_cdzaxpyjris  bli_rcaxpyjris
#define bli_zdzaxpyjris  bli_rcaxpyjris

#define bli_sczaxpyjris  bli_craxpyjris
#define bli_dczaxpyjris  bli_craxpyjris
#define bli_cczaxpyjris  bli_cxaxpyjris
#define bli_zczaxpyjris  bli_cxaxpyjris

#define bli_szzaxpyjris  bli_craxpyjris
#define bli_dzzaxpyjris  bli_craxpyjris
#define bli_czzaxpyjris  bli_cxaxpyjris
#define bli_zzzaxpyjris  bli_cxaxpyjris

// Same-datatype entry points.
#define bli_saxpyjris  bli_sssaxpyjris
#define bli_daxpyjris  bli_dddaxpyjris
#define bli_caxpyjris  bli_cccaxpyjris
#define bli_zaxpyjris  bli_zzzaxpyjris

#endif
// end bli_axpyjris.h
// begin bli_axmyris.h


#ifndef BLIS_AXMYRIS_H
#define BLIS_AXMYRIS_H

// axmyris
//
// y := y - a * x, on split real/imaginary operands.

// Real variants: only ar, xr and yr are referenced.
#define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}

#define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}

// Complex variants: subtract the full complex product a * x.
#define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}

#define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}

// Mixed variants: only the real part of a is used; it scales both parts of x.
#define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}

#define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}

#endif
// end bli_axmyris.h
// begin bli_conjris.h


#ifndef BLIS_CONJRIS_H
#define BLIS_CONJRIS_H

// conjris
//
// x := conj(x). The real variants expand to an empty statement; the complex
// variants negate the imaginary part in place.

#define bli_sconjris( xr, xi ) \
{ \
    ; \
}

#define bli_dconjris( xr, xi ) \
{ \
    ; \
}

#define bli_cconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}

#define bli_zconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}

#endif
// end bli_conjris.h
// begin bli_copyris.h


#ifndef BLIS_COPYRIS_H
#define BLIS_COPYRIS_H

// copyris
//
// b := a. The real variants write only br; the complex variants write br
// and bi.

#define bli_scopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}

#define bli_dcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}

#define bli_ccopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}

#define bli_zcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}

// Two-character variants: the first char names the datatype of a, the second
// the datatype of b, which selects the base macro. When a is real (s or d) a
// literal zero is passed in place of ai.
#define bli_sscopyris( ar, ai, br, bi )  bli_scopyris( ar, 0.0F, br, bi )
#define bli_dscopyris( ar, ai, br, bi )  bli_scopyris( ar, 0.0, br, bi )
#define bli_cscopyris( ar, ai, br, bi )  bli_scopyris( ar, ai, br, bi )
#define bli_zscopyris( ar, ai, br, bi )  bli_scopyris( ar, ai, br, bi )

#define bli_sdcopyris( ar, ai, br, bi )  bli_dcopyris( ar, 0.0F, br, bi )
#define bli_ddcopyris( ar, ai, br, bi )  bli_dcopyris( ar, 0.0, br, bi )
#define bli_cdcopyris( ar, ai, br, bi )  bli_dcopyris( ar, ai, br, bi )
#define bli_zdcopyris( ar, ai, br, bi )  bli_dcopyris( ar, ai, br, bi )

#define bli_sccopyris( ar, ai, br, bi )  bli_ccopyris( ar, 0.0F, br, bi )
#define bli_dccopyris( ar, ai, br, bi )  bli_ccopyris( ar, 0.0, br, bi )
#define bli_cccopyris( ar, ai, br, bi )  bli_ccopyris( ar, ai, br, bi )
#define bli_zccopyris( ar, ai, br, bi )  bli_ccopyris( ar, ai, br, bi )

#define bli_szcopyris( ar, ai, br, bi )  bli_zcopyris( ar, 0.0F, br, bi )
#define bli_dzcopyris( ar, ai, br, bi )  bli_zcopyris( ar, 0.0, br, bi )
#define bli_czcopyris( ar, ai, br, bi )  bli_zcopyris( ar, ai, br, bi )
#define bli_zzcopyris( ar, ai, br, bi )  bli_zcopyris( ar, ai, br, bi )

#endif
// end bli_copyris.h
// begin bli_copyjris.h


#ifndef BLIS_COPYJRIS_H
#define BLIS_COPYJRIS_H

// copyjris
//
// b := conj(a), implemented by forwarding to copyris with ai negated.

#define bli_scopyjris( ar, ai, br, bi )  bli_scopyris( (ar), -(ai), (br), (bi) )
#define bli_dcopyjris( ar, ai, br, bi )  bli_dcopyris( (ar), -(ai), (br), (bi) )
#define bli_ccopyjris( ar, ai, br, bi )  bli_ccopyris( (ar), -(ai), (br), (bi) )
#define bli_zcopyjris( ar, ai, br, bi )  bli_zcopyris( (ar), -(ai), (br), (bi) )

// Two-character variants: first char is the datatype of a, second the
// datatype of b. When a is real (s or d) a literal zero replaces ai.
#define bli_sscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, 0.0F, br, bi )
#define bli_dscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, 0.0, br, bi )
#define bli_cscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, ai, br, bi )
#define bli_zscopyjris( ar, ai, br, bi )  bli_scopyjris( ar, ai, br, bi )

#define bli_sdcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, 0.0F, br, bi )
#define bli_ddcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, 0.0, br, bi )
#define bli_cdcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, ai, br, bi )
#define bli_zdcopyjris( ar, ai, br, bi )  bli_dcopyjris( ar, ai, br, bi )

#define bli_sccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, 0.0F, br, bi )
#define bli_dccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, 0.0, br, bi )
#define bli_cccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, ai, br, bi )
#define bli_zccopyjris( ar, ai, br, bi )  bli_ccopyjris( ar, ai, br, bi )

#define bli_szcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, 0.0F, br, bi )
#define bli_dzcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, 0.0, br, bi )
#define bli_czcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, ai, br, bi )
#define bli_zzcopyjris( ar, ai, br, bi )  bli_zcopyjris( ar, ai, br, bi )

#endif
// end bli_copyjris.h
// begin bli_copycjris.h


#ifndef BLIS_COPYCJRIS_H
#define BLIS_COPYCJRIS_H

// copycjris
//
// y := x, or y := conj(x) when bli_is_conj( conj ) holds. bli_is_conj() is
// not defined in this part of the header.

// Real variants: conj is ignored and the copy is forwarded to copyris.
#define bli_scopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_scopyris( (xr), (xi), (yr), (yi) ); \
}

#define bli_dcopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_dcopyris( (xr), (xi), (yr), (yi) ); \
}

// Complex variants: the imaginary part is negated when conj requests it.
#define bli_ccopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                 : (xi) ); \
}

#define bli_zcopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                 : (xi) ); \
}

// Integer variant: conj is ignored.
// NOTE(review): bli_icopyris is not among the copyris macros defined above;
// confirm it is provided elsewhere.
#define bli_icopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_icopyris( (xr), (xi), (yr), (yi) ); \
}

#endif
// end bli_copycjris.h
// begin bli_eqris.h


#ifndef BLIS_EQRIS_H
#define BLIS_EQRIS_H

// eqris (passed by value)
//
// Expression macros that evaluate to non-zero when a == b. The real and
// integer variants compare only the real parts; the complex variants compare
// both parts.

#define bli_seqris( ar, ai, br, bi )  ( (ar) == (br) )
#define bli_deqris( ar, ai, br, bi )  ( (ar) == (br) )
#define bli_ceqris( ar, ai, br, bi )  ( (ar) == (br) && (ai) == (bi) )
#define bli_zeqris( ar, ai, br, bi )  ( (ar) == (br) && (ai) == (bi) )
#define bli_ieqris( ar, ai, br, bi )  ( (ar) == (br) )

// eq1ris
// Compare against the constant 1 (imaginary part 0).

#define bli_seq1ris( ar, ai )  bli_seqris( (ar), (ai), 1.0F, 0.0F )
#define bli_deq1ris( ar, ai )  bli_deqris( (ar), (ai), 1.0, 0.0 )
#define bli_ceq1ris( ar, ai )  bli_ceqris( (ar), (ai), 1.0F, 0.0F )
#define bli_zeq1ris( ar, ai )  bli_zeqris( (ar), (ai), 1.0, 0.0 )
#define bli_ieq1ris( ar, ai )  bli_ieqris( (ar), (ai), 1, 0 )

// eq0ris
// Compare against the constant 0.

#define bli_seq0ris( ar, ai )  bli_seqris( (ar), (ai), 0.0F, 0.0F )
#define bli_deq0ris( ar, ai )  bli_deqris( (ar), (ai), 0.0, 0.0 )
#define bli_ceq0ris( ar, ai )  bli_ceqris( (ar), (ai), 0.0F, 0.0F )
#define bli_zeq0ris( ar, ai )  bli_zeqris( (ar), (ai), 0.0, 0.0 )
#define bli_ieq0ris( ar, ai )  bli_ieqris( (ar), (ai), 0, 0 )

// eqm1ris
// Compare against the constant -1 (imaginary part 0).

#define bli_seqm1ris( ar, ai )  bli_seqris( (ar), (ai), -1.0F, 0.0F )
#define bli_deqm1ris( ar, ai )  bli_deqris( (ar), (ai), -1.0, 0.0 )
#define bli_ceqm1ris( ar, ai )  bli_ceqris( (ar), (ai), -1.0F, 0.0F )
#define bli_zeqm1ris( ar, ai )  bli_zeqris( (ar), (ai), -1.0, 0.0 )
#define bli_ieqm1ris( ar, ai )  bli_ieqris( (ar), (ai), -1, 0 )

#endif
// end bli_eqris.h
// begin bli_invertris.h


#ifndef BLIS_INVERTRIS_H
#define BLIS_INVERTRIS_H

// invertris
//
// x := 1 / x, in place.

// Real single-precision variant: xi is not referenced.
#define bli_sinvertris( xr, xi ) \
{ \
    (xr) = 1.0F / (xr); \
}

#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2ris bli_rxscal2ris #define bli_dssscal2ris bli_rxscal2ris #define bli_cssscal2ris bli_rxscal2ris #define bli_zssscal2ris bli_rxscal2ris #define bli_sdsscal2ris bli_rxscal2ris #define bli_ddsscal2ris bli_rxscal2ris #define bli_cdsscal2ris bli_rxscal2ris #define bli_zdsscal2ris bli_rxscal2ris #define bli_scsscal2ris bli_rxscal2ris #define bli_dcsscal2ris bli_rxscal2ris #define bli_ccsscal2ris bli_roscal2ris #define bli_zcsscal2ris bli_roscal2ris #define bli_szsscal2ris bli_rxscal2ris #define bli_dzsscal2ris bli_rxscal2ris #define bli_czsscal2ris bli_roscal2ris #define bli_zzsscal2ris bli_roscal2ris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2ris bli_rxscal2ris #define bli_dsdscal2ris bli_rxscal2ris #define bli_csdscal2ris bli_rxscal2ris #define bli_zsdscal2ris bli_rxscal2ris #define bli_sddscal2ris bli_rxscal2ris #define bli_dddscal2ris bli_rxscal2ris #define bli_cddscal2ris bli_rxscal2ris #define bli_zddscal2ris bli_rxscal2ris #define bli_scdscal2ris bli_rxscal2ris #define bli_dcdscal2ris bli_rxscal2ris #define bli_ccdscal2ris bli_roscal2ris #define bli_zcdscal2ris bli_roscal2ris #define bli_szdscal2ris bli_rxscal2ris #define bli_dzdscal2ris bli_rxscal2ris #define bli_czdscal2ris bli_roscal2ris #define bli_zzdscal2ris bli_roscal2ris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2ris bli_rxscal2ris #define bli_dscscal2ris bli_rxscal2ris #define bli_cscscal2ris bli_rcscal2ris #define bli_zscscal2ris bli_rcscal2ris #define bli_sdcscal2ris bli_rxscal2ris #define bli_ddcscal2ris bli_rxscal2ris #define bli_cdcscal2ris bli_rcscal2ris #define bli_zdcscal2ris bli_rcscal2ris #define bli_sccscal2ris bli_crscal2ris #define bli_dccscal2ris bli_crscal2ris #define bli_cccscal2ris bli_cxscal2ris #define bli_zccscal2ris bli_cxscal2ris #define bli_szcscal2ris bli_crscal2ris 
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Type series for scal2jris. Each three-character name (datatypes of a, x, y)
// is a function-like macro that forwards all six arguments, unchanged, to one
// of the bli_??scal2jris kernels.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = (xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyris bli_rxxpbyris #define bli_dssxpbyris bli_rxxpbyris #define bli_cssxpbyris bli_rxxpbyris #define bli_zssxpbyris bli_rxxpbyris #define bli_sdsxpbyris bli_rxxpbyris #define bli_ddsxpbyris bli_rxxpbyris #define bli_cdsxpbyris bli_rxxpbyris #define bli_zdsxpbyris bli_rxxpbyris #define bli_scsxpbyris bli_rxxpbyris #define bli_dcsxpbyris bli_rxxpbyris #define bli_ccsxpbyris bli_rxxpbyris #define bli_zcsxpbyris bli_rxxpbyris #define bli_szsxpbyris bli_rxxpbyris #define bli_dzsxpbyris bli_rxxpbyris #define bli_czsxpbyris bli_rxxpbyris #define bli_zzsxpbyris bli_rxxpbyris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyris bli_rxxpbyris #define bli_dsdxpbyris bli_rxxpbyris #define bli_csdxpbyris bli_rxxpbyris #define bli_zsdxpbyris bli_rxxpbyris #define bli_sddxpbyris bli_rxxpbyris #define bli_dddxpbyris bli_rxxpbyris #define bli_cddxpbyris bli_rxxpbyris #define bli_zddxpbyris bli_rxxpbyris #define bli_scdxpbyris bli_rxxpbyris #define bli_dcdxpbyris bli_rxxpbyris #define bli_ccdxpbyris bli_rxxpbyris #define bli_zcdxpbyris bli_rxxpbyris #define bli_szdxpbyris bli_rxxpbyris #define bli_dzdxpbyris bli_rxxpbyris #define bli_czdxpbyris bli_rxxpbyris #define bli_zzdxpbyris bli_rxxpbyris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyris bli_rxxpbyris #define bli_dscxpbyris 
bli_rxxpbyris #define bli_cscxpbyris bli_crxpbyris #define bli_zscxpbyris bli_crxpbyris #define bli_sdcxpbyris bli_rxxpbyris #define bli_ddcxpbyris bli_rxxpbyris #define bli_cdcxpbyris bli_crxpbyris #define bli_zdcxpbyris bli_crxpbyris #define bli_sccxpbyris bli_cxxpbyris #define bli_dccxpbyris bli_cxxpbyris #define bli_cccxpbyris bli_cxxpbyris #define bli_zccxpbyris bli_cxxpbyris #define bli_szcxpbyris bli_cxxpbyris #define bli_dzcxpbyris bli_cxxpbyris #define bli_czcxpbyris bli_cxxpbyris #define bli_zzcxpbyris bli_cxxpbyris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyris bli_rxxpbyris #define bli_dszxpbyris bli_rxxpbyris #define bli_cszxpbyris bli_crxpbyris #define bli_zszxpbyris bli_crxpbyris #define bli_sdzxpbyris bli_rxxpbyris #define bli_ddzxpbyris bli_rxxpbyris #define bli_cdzxpbyris bli_crxpbyris #define bli_zdzxpbyris bli_crxpbyris #define bli_sczxpbyris bli_cxxpbyris #define bli_dczxpbyris bli_cxxpbyris #define bli_cczxpbyris bli_cxxpbyris #define bli_zczxpbyris bli_cxxpbyris #define bli_szzxpbyris bli_cxxpbyris #define bli_dzzxpbyris bli_cxxpbyris #define bli_czzxpbyris bli_cxxpbyris #define bli_zzzxpbyris bli_cxxpbyris #define bli_sxpbyris bli_sssxpbyris #define bli_dxpbyris bli_dddxpbyris #define bli_cxpbyris bli_cccxpbyris #define bli_zxpbyris bli_zzzxpbyris #endif // end bli_xpbyris.h // begin bli_xpbyjris.h #ifndef BLIS_XPBYJRIS_H #define BLIS_XPBYJRIS_H // xpbyjris #define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type 
of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjris bli_rxxpbyjris #define bli_dssxpbyjris bli_rxxpbyjris #define bli_cssxpbyjris bli_rxxpbyjris #define bli_zssxpbyjris bli_rxxpbyjris #define bli_sdsxpbyjris bli_rxxpbyjris #define bli_ddsxpbyjris bli_rxxpbyjris #define bli_cdsxpbyjris bli_rxxpbyjris #define bli_zdsxpbyjris bli_rxxpbyjris #define bli_scsxpbyjris bli_rxxpbyjris #define bli_dcsxpbyjris bli_rxxpbyjris #define bli_ccsxpbyjris bli_rxxpbyjris #define bli_zcsxpbyjris bli_rxxpbyjris #define bli_szsxpbyjris bli_rxxpbyjris #define bli_dzsxpbyjris bli_rxxpbyjris #define bli_czsxpbyjris bli_rxxpbyjris #define bli_zzsxpbyjris bli_rxxpbyjris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjris bli_rxxpbyjris #define bli_dsdxpbyjris bli_rxxpbyjris #define bli_csdxpbyjris bli_rxxpbyjris #define bli_zsdxpbyjris bli_rxxpbyjris #define bli_sddxpbyjris bli_rxxpbyjris #define bli_dddxpbyjris bli_rxxpbyjris #define bli_cddxpbyjris bli_rxxpbyjris #define bli_zddxpbyjris bli_rxxpbyjris #define bli_scdxpbyjris bli_rxxpbyjris #define bli_dcdxpbyjris bli_rxxpbyjris #define bli_ccdxpbyjris bli_rxxpbyjris #define bli_zcdxpbyjris bli_rxxpbyjris #define bli_szdxpbyjris bli_rxxpbyjris #define bli_dzdxpbyjris bli_rxxpbyjris #define bli_czdxpbyjris bli_rxxpbyjris #define bli_zzdxpbyjris bli_rxxpbyjris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjris bli_rxxpbyjris #define bli_dscxpbyjris bli_rxxpbyjris #define bli_cscxpbyjris bli_crxpbyjris #define bli_zscxpbyjris bli_crxpbyjris #define bli_sdcxpbyjris bli_rxxpbyjris #define bli_ddcxpbyjris bli_rxxpbyjris #define bli_cdcxpbyjris bli_crxpbyjris #define bli_zdcxpbyjris bli_crxpbyjris #define bli_sccxpbyjris bli_cxxpbyjris #define bli_dccxpbyjris bli_cxxpbyjris #define bli_cccxpbyjris 
bli_cxxpbyjris #define bli_zccxpbyjris bli_cxxpbyjris #define bli_szcxpbyjris bli_cxxpbyjris #define bli_dzcxpbyjris bli_cxxpbyjris #define bli_czcxpbyjris bli_cxxpbyjris #define bli_zzcxpbyjris bli_cxxpbyjris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjris bli_rxxpbyjris #define bli_dszxpbyjris bli_rxxpbyjris #define bli_cszxpbyjris bli_crxpbyjris #define bli_zszxpbyjris bli_crxpbyjris #define bli_sdzxpbyjris bli_rxxpbyjris #define bli_ddzxpbyjris bli_rxxpbyjris #define bli_cdzxpbyjris bli_crxpbyjris #define bli_zdzxpbyjris bli_crxpbyjris #define bli_sczxpbyjris bli_cxxpbyjris #define bli_dczxpbyjris bli_cxxpbyjris #define bli_cczxpbyjris bli_cxxpbyjris #define bli_zczxpbyjris bli_cxxpbyjris #define bli_szzxpbyjris bli_cxxpbyjris #define bli_dzzxpbyjris bli_cxxpbyjris #define bli_czzxpbyjris bli_cxxpbyjris #define bli_zzzxpbyjris bli_cxxpbyjris #define bli_sxpbyjris bli_sssxpbyjris #define bli_dxpbyjris bli_dddxpbyjris #define bli_cxpbyjris bli_cccxpbyjris #define bli_zxpbyjris bli_zzzxpbyjris #endif // end bli_xpbyjris.h // Inlined scalar macros in loops // begin bli_scal2ris_mxn.h #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j 
)*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif // end bli_scal2ris_mxn.h // begin bli_scalris_mxn_uplo.h #ifndef 
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) ) #define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) ) #define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zcabsq2s( x, a ) 
bli_zcsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) ) #define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) ) #define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a ) #define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a ) #define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a ) #define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a ) #endif // end bli_absq2s.h // begin bli_abval2s.h #ifndef BLIS_ABVAL2S_H #define BLIS_ABVAL2S_H // abval2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabval2s( x, a ) 
bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) ) #define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) ) #define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) ) #define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) ) #define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) ) #define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) ) #define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) ) #define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) ) #define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) ) #define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) ) #define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) ) #define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) ) #define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) ) #define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) ) #define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) ) #define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabval2s( x, a ) bli_ssabval2s( x, a ) #define bli_dabval2s( x, a ) bli_ddabval2s( x, a ) #define bli_cabval2s( x, a ) bli_ccabval2s( x, a ) #define bli_zabval2s( x, a ) bli_zzabval2s( x, a ) #endif // end bli_abval2s.h // begin bli_adds.h #ifndef BLIS_ADDS_H #define BLIS_ADDS_H // adds // Notes: // - The first char 
encodes the type of a. // - The second char encodes the type of y. #define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) { (y) += (a); } #define bli_dcadds( a, y ) { (y) += (a); } #define bli_ccadds( a, y ) { (y) += (a); } #define bli_zcadds( a, y ) { (y) += (a); } #define bli_szadds( a, y ) { (y) += (a); } #define bli_dzadds( a, y ) { (y) += (a); } #define bli_czadds( a, y ) { (y) += (a); } #define 
bli_zzadds( a, y ) { (y) += (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadds( a, y ) bli_ssadds( a, y ) #define bli_dadds( a, y ) bli_ddadds( a, y ) #define bli_cadds( a, y ) bli_ccadds( a, y ) #define bli_zadds( a, y ) bli_zzadds( a, y ) #endif // end bli_adds.h // begin bli_addjs.h #ifndef BLIS_ADDJS_H #define BLIS_ADDJS_H // addjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzaddjs( a, y ) 
bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) { (y) += (a); } #define bli_dcaddjs( a, y ) { (y) += (a); } #define bli_ccaddjs( a, y ) { (y) += conjf(a); } #define bli_zcaddjs( a, y ) { (y) += conj (a); } #define bli_szaddjs( a, y ) { (y) += (a); } #define bli_dzaddjs( a, y ) { (y) += (a); } #define bli_czaddjs( a, y ) { (y) += conjf(a); } #define bli_zzaddjs( a, y ) { (y) += conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saddjs( a, y ) bli_ssaddjs( a, y ) #define bli_daddjs( a, y ) bli_ddaddjs( a, y ) #define bli_caddjs( a, y ) bli_ccaddjs( a, y ) #define bli_zaddjs( a, y ) bli_zzaddjs( a, y ) #endif // end bli_addjs.h // begin bli_add3s.h #ifndef BLIS_ADD3S_H #define BLIS_ADD3S_H // add3s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of b. // - The third char encodes the type of c. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), 
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), 
bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define 
bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zscadd3s( a, 
b, c ) { (c) = (a) + (b); } #define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c ) #define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c ) #define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c ) #define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c ) #endif // end bli_add3s.h // begin bli_axpbys.h #ifndef BLIS_AXPBYS_H #define BLIS_AXPBYS_H // axpbys // Notes: // - The first char encodes the type of a. 
// - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), 
bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), 
bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), 
bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccscaxpbys( 
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y ) #define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y ) #define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y ) #define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y ) #endif // end bli_axpbys.h // begin bli_axpbyjs.h #ifndef BLIS_AXPBYJS_H #define BLIS_AXPBYJS_H // axpbyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), 
bli_simag(y) ) #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) #define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) #define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) #define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) #endif // end bli_axpbyjs.h // begin bli_axpys.h #ifndef BLIS_AXPYS_H #define BLIS_AXPYS_H // axpys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), 
bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } #define 
bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) #define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) #define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) #define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) #endif // end bli_axpys.h // begin bli_axpyjs.h #ifndef BLIS_AXPYJS_H #define BLIS_AXPYJS_H // axpyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Mixed-datatype axpyjs scalar macros.
//
// Each macro is named bli_<a><x><y>axpyjs, where the three type characters
// (each one of s, d, c, z) give the types of the operands a, x and y in that
// order. In the C99-complex variants further down, the update is spelled out
// directly as  y += a * conj(x)  (with the conjugate omitted when x is of
// type s or d).
//
// All other variants forward the real and imaginary parts of every operand,
// extracted with the bli_?real()/bli_?imag() accessor matching that operand's
// type character, to one of the bli_*axpyjris helpers. Those helpers and
// accessors are defined outside this section.
// NOTE(review): presumably the *axpyjris helpers perform the same
// y += a * conj(x) update on separate real/imaginary components -- confirm
// against their definitions.

// -- (axy) = (??s) ------------------------------------------------------------
// y is of type s: every combination of a and x forwards to bli_saxpyjris.
// This group is outside the BLIS_ENABLE_C99_COMPLEX conditional below, so it
// is the same in both configurations.
#define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------
// y is of type d: every combination of a and x forwards to bli_daxpyjris.
// Like the (??s) group, this group is unconditional.
#define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// The (??c) and (??z) groups have two alternative implementations, selected
// by whether BLIS_ENABLE_C99_COMPLEX is defined.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------
// y is of type c. The helper is chosen from the types of a and x:
//   a in {c,z}                -> bli_caxpyjris
//   a in {s,d}, x in {s,d}    -> bli_saxpyjris
//   a in {s,d}, x in {c,z}    -> bli_scaxpyjris
#define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------
// y is of type z. Same selection scheme as the (??c) group, with the
// d/z/dz helpers in place of s/c/sc:
//   a in {c,z}                -> bli_zaxpyjris
//   a in {s,d}, x in {s,d}    -> bli_daxpyjris
//   a in {s,d}, x in {c,z}    -> bli_dzaxpyjris
#define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// In this configuration the update is written as a single compound
// assignment on the operands themselves. x is used as-is when it is of type
// s or d, wrapped in conjf() when it is of type c, and wrapped in conj()
// when it is of type z.
// NOTE(review): these variants expand to a bare { ... } block rather than an
// expression, and y must be a modifiable lvalue because it is the target of
// `+=`. An invocation followed by `;` therefore yields a block plus an empty
// statement -- keep that in mind inside unbraced if/else.

// -- (axy) = (??c) ------------------------------------------------------------
#define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type shorthands: bli_<t>axpyjs is the variant in which a, x and y
// all share the type <t>.
#define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y )
#define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y )
#define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y )
#define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y )

#endif
// end bli_axpyjs.h
// begin bli_axmys.h

#ifndef BLIS_AXMYS_H
#define BLIS_AXMYS_H

// axmys

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), 
bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxmys( a, x, 
y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), 
bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxmys( 
a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); } 
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y ) #define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y ) #define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y ) #define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y ) #endif // end bli_axmys.h // begin bli_conjs.h #ifndef BLIS_CONJS_H #define BLIS_CONJS_H // conjs #define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) ) #define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) ) #define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) { (x) = conjf(x); } #define bli_zconjs( x ) { (x) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_conjs.h // begin bli_copys.h #ifndef BLIS_COPYS_H #define BLIS_COPYS_H // copys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Destination y is s: every source type goes through bli_scopyris().
#define bli_sscopys( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopys( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopys( x, y )  bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopys( x, y )  bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// Destination y is d: every source type goes through bli_dcopyris().
#define bli_sdcopys( x, y )  bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopys( x, y )  bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopys( x, y )  bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopys( x, y )  bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero.
#define bli_sccopys( x, y )  bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopys( x, y )  bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopys( x, y )  bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopys( x, y )  bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopys( x, y ) bli_sscopys( x, y ) #define bli_dcopys( x, y ) bli_ddcopys( x, y ) #define bli_ccopys( x, y ) bli_cccopys( x, y ) #define bli_zcopys( x, y ) bli_zzcopys( x, y ) #define bli_icopys( x, y ) bli_iicopys( x, y ) #endif // end bli_copys.h // begin bli_copyjs.h #ifndef BLIS_COPYJS_H #define BLIS_COPYJS_H // copyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), 
bli_creal(y), bli_cimag(y) ) #define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) { (y) = (x); } #define bli_dccopyjs( x, y ) { (y) = (x); } #define bli_cccopyjs( x, y ) { (y) = conjf(x); } #define bli_zccopyjs( x, y ) { (y) = conj (x); } #define bli_szcopyjs( x, y ) { (y) = (x); } #define bli_dzcopyjs( x, y ) { (y) = (x); } #define bli_czcopyjs( x, y ) { (y) = conjf(x); } #define bli_zzcopyjs( x, y ) { (y) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjs( x, y ) bli_sscopyjs( x, y ) #define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y ) #define bli_ccopyjs( x, y ) bli_cccopyjs( x, y ) #define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y ) #define bli_icopyjs( x, y ) bli_iicopyjs( x, y ) #endif // end bli_copyjs.h // begin bli_copycjs.h #ifndef BLIS_COPYCJS_H #define BLIS_COPYCJS_H // copycjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) { (y) = (x); } 
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); } #define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #define bli_szcopycjs( conjx, x, y ) { (y) = (x); } #define bli_dzcopycjs( conjx, x, y ) { (y) = (x); } #define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); } #define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y ) #define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y ) #define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y ) #define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y ) #define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y ) #endif // end bli_copycjs.h // begin bli_copynzs.h #ifndef BLIS_COPYNZS_H #define BLIS_COPYNZS_H // copynzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Destination y is s: every source type goes through bli_scopyris().
#define bli_sscopynzs( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopynzs( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopynzs( x, y )  bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopynzs( x, y )  bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// Destination y is d: every source type goes through bli_dcopyris().
#define bli_sdcopynzs( x, y )  bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopynzs( x, y )  bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopynzs( x, y )  bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopynzs( x, y )  bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of scopyris() is so we don't touch the imaginary part of y.
#define bli_sccopynzs( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopynzs( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopynzs( x, y )  bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopynzs( x, y )  bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopynzs( x, y ) bli_sscopynzs( x, y ) #define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y ) #define bli_ccopynzs( x, y ) bli_cccopynzs( x, y ) #define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y ) #define bli_icopynzs( x, y ) bli_iicopynzs( x, y ) #endif // end bli_copynzs.h // begin bli_copyjnzs.h #ifndef BLIS_COPYJNZS_H #define BLIS_COPYJNZS_H // copyjnzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we // don't touch the imaginary part of y. 
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we // don't touch the imaginary part of y. #define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y ) #define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y ) #define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y ) #define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y ) #define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y ) #endif // end bli_copyjnzs.h // begin bli_dots.h #ifndef BLIS_DOTS_H #define BLIS_DOTS_H // dots // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a ) #define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a ) #define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a ) #define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a ) #define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a ) #define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a ) #define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a ) #define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a ) #define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a ) #define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a ) #define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a ) #define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a ) #define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a ) #define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a ) #define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a ) #define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a ) #define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a ) #define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a ) #define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a ) #define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a ) #define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a ) #define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a ) #define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a ) #define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a ) #define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a ) #define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a ) #define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a ) #define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a ) #define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a ) #define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a ) #define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a ) #define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a ) #define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a ) #define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a ) #define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a ) #define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a ) #define 
bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a ) #define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a ) #define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a ) #define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a ) #define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a ) #define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a ) #define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a ) #define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a ) #define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a ) #define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a ) #define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a ) #define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a ) #define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a ) #define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a ) #define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a ) #define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a ) #define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a ) #define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a ) #define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a ) #define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a ) #define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a ) #define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a ) #define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a ) #define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a ) #define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a ) #define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a ) #define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a ) #define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a ) #define bli_sdots( x, y, a ) bli_sssdots( x, y, a ) #define bli_ddots( x, y, a ) bli_ddddots( x, y, a ) #define bli_cdots( x, y, a ) bli_cccdots( x, y, a ) #define bli_zdots( x, y, a ) bli_zzzdots( x, y, a ) #endif // end bli_dots.h // begin bli_dotjs.h #ifndef BLIS_DOTJS_H #define BLIS_DOTJS_H // dotjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
// - x is used in conjugated form. #define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a ) #define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a ) #define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a ) #define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a ) #define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a ) #define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a ) #define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a ) #define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a ) #define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a ) #define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a ) #define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a ) #define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a ) #define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a ) #define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a ) #define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a ) #define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a ) #define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a ) #define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a ) #define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a ) #define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a ) #define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a ) #define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a ) #define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a ) #define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a ) #define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a ) #define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a ) #define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a ) #define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a ) #define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a ) #define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a ) #define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a ) #define bli_zzddotjs( x, y, a ) bli_zzdaxpyjs( y, x, a ) #define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a ) #define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a ) #define bli_cscdotjs( x, 
y, a ) bli_sccaxpyjs( y, x, a ) #define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a ) #define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a ) #define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a ) #define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a ) #define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a ) #define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a ) #define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a ) #define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a ) #define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a ) #define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a ) #define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a ) #define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a ) #define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a ) #define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a ) #define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a ) #define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a ) #define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a ) #define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a ) #define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a ) #define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a ) #define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a ) #define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a ) #define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a ) #define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a ) #define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a ) #define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a ) #define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a ) #define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a ) #define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a ) #define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a ) #define bli_ddotjs( x, y, a ) bli_ddddotjs( x, y, a ) #define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a ) #define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a ) #endif // end bli_dotjs.h // begin bli_eq.h #ifndef BLIS_EQ_H #define BLIS_EQ_H // eq (passed by 
value) #define bli_seq( a, b ) ( (a) == (b) ) #define bli_deq( a, b ) ( (a) == (b) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) ) #define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ceq( a, b ) ( (a) == (b) ) #define bli_zeq( a, b ) ( (a) == (b) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ieq( a, b ) ( (a) == (b) ) // eqtori (passed by value) #define bli_seqtori( a, br, bi ) ( (a) == (br) ) #define bli_deqtori( a, br, bi ) ( (a) == (br) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) ) #define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) ) #define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) ) #endif // BLIS_ENABLE_C99_COMPLEX // eqa (passed by address) #define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) ) #define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) ) #define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) ) #define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) ) #define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) ) // eq1 #define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F ) #define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 ) #define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F ) #define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 ) #define bli_ieq1( a ) bli_ieq ( (a), 1 ) // eq0 #define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F ) #define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 ) #define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F ) #define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 ) #define bli_ieq0( a ) bli_ieq ( (a), 0 ) // eqm1 #define bli_seqm1( a ) bli_seqtori( (a), 
-1.0F, 0.0F ) #define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 ) #define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F ) #define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 ) #define bli_ieqm1( a ) bli_ieq ( (a), -1 ) #endif // end bli_eq.h // begin bli_fprints.h #ifndef BLIS_FPRINTS_H #define BLIS_FPRINTS_H // prints #define bli_sfprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #define bli_dfprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #define bli_cfprints( file, spec, x ) \ { \ fprintf( file, spec, bli_creal(x) ); \ fprintf( file, " + " ); \ fprintf( file, spec, bli_cimag(x) ); \ fprintf( file, " " ); \ } #define bli_zfprints( file, spec, x ) \ { \ fprintf( file, spec, bli_zreal(x) ); \ fprintf( file, " + " ); \ fprintf( file, spec, bli_zimag(x) ); \ fprintf( file, " " ); \ } #define bli_ifprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #endif // end bli_fprints.h // begin bli_inverts.h #ifndef BLIS_INVERTS_H #define BLIS_INVERTS_H // inverts // Notes: // - The first char encodes the type of x. #define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) ) #define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) ) #define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cinverts( x ) { (x) = 1.0F / (x); } #define bli_zinverts( x ) { (x) = 1.0 / (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_inverts.h // begin bli_invscals.h #ifndef BLIS_INVSCALS_H #define BLIS_INVSCALS_H // invscals // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. 
#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcinvscals( a, y ) bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscals( a, y ) { (y) /= (a); } #define bli_dcinvscals( a, y ) { (y) /= (a); } #define bli_ccinvscals( a, y ) { (y) /= (a); } #define bli_zcinvscals( a, y ) { (y) /= (a); } #define bli_szinvscals( a, y ) { (y) /= (a); } #define 
bli_dzinvscals( a, y ) { (y) /= (a); } #define bli_czinvscals( a, y ) { (y) /= (a); } #define bli_zzinvscals( a, y ) { (y) /= (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sinvscals( a, y ) bli_ssinvscals( a, y ) #define bli_dinvscals( a, y ) bli_ddinvscals( a, y ) #define bli_cinvscals( a, y ) bli_ccinvscals( a, y ) #define bli_zinvscals( a, y ) bli_zzinvscals( a, y ) #endif // end bli_invscals.h // begin bli_invscaljs.h #ifndef BLIS_INVSCALJS_H #define BLIS_INVSCALJS_H // invscaljs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), 
bli_zreal(y), bli_zimag(y) ) #define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscaljs( a, y ) { (y) /= (a); } #define bli_dcinvscaljs( a, y ) { (y) /= (a); } #define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); } #define bli_zcinvscaljs( a, y ) { (y) /= conj (a); } #define bli_szinvscaljs( a, y ) { (y) /= (a); } #define bli_dzinvscaljs( a, y ) { (y) /= (a); } #define bli_czinvscaljs( a, y ) { (y) /= conjf(a); } #define bli_zzinvscaljs( a, y ) { (y) /= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y ) #define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y ) #define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y ) #define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y ) #endif // end bli_invscaljs.h // begin bli_neg2s.h #ifndef BLIS_NEG2S_H #define BLIS_NEG2S_H // neg2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) { (y) = -(x); } #define bli_dcneg2s( x, y ) { (y) = -(x); } #define bli_ccneg2s( x, y ) { (y) = -(x); } #define bli_zcneg2s( x, y ) { (y) = -(x); } #define bli_szneg2s( x, y ) { (y) = -(x); } #define bli_dzneg2s( x, y ) { (y) = -(x); } #define bli_czneg2s( x, y ) { (y) = -(x); } #define bli_zzneg2s( x, y ) { (y) = 
-(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sneg2s( x, y ) bli_ssneg2s( x, y ) #define bli_dneg2s( x, y ) bli_ddneg2s( x, y ) #define bli_cneg2s( x, y ) bli_ccneg2s( x, y ) #define bli_zneg2s( x, y ) bli_zzneg2s( x, y ) #endif // end bli_neg2s.h // begin bli_rands.h #ifndef BLIS_RANDS_H #define BLIS_RANDS_H // rands #define bli_srands( a ) \ { \ (a) = ( float ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0F; \ } #define bli_drands( a ) \ { \ (a) = ( double ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0; \ } #define bli_crands( a ) \ { \ float ar, ai; \ \ bli_srands( ar ); \ bli_srands( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrands( a ) \ { \ double ar, ai; \ \ bli_drands( ar ); \ bli_drands( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_rands.h // begin bli_randnp2s.h #ifndef BLIS_RANDNP2S_H #define BLIS_RANDNP2S_H // randnp2s #define bli_srandnp2s( a ) \ { \ bli_drandnp2s( a ); \ } #if 0 #define bli_drandnp2s_prev( a ) \ { \ const double m_max = 3.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ if ( t == m_max2 ) t = t - 1.0; \ \ \ t = floor( t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_exp, s_val; \ \ \ PASTEMAC(d,rands)( s_exp ); \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \ else r_val = pow( 2.0, t - 1.0 ); \ \ \ if ( s_val < 0.0 ) r_val = -r_val; \ } \ \ \ r_val = r_val / pow( 2.0, m_max ); \ \ \ \ a = r_val; \ } #endif #define bli_drandnp2s( a ) \ { \ const double m_max = 6.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ do \ { \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ t = floor( t ); \ } \ \ while ( m_max2 <= t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_val; \ \ \ r_val = pow( 2.0, -(t - 1.0) ); \ \ \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_val < 0.0 ) r_val 
= -r_val; \ } \ \ \ \ a = r_val; \ } #define bli_crandnp2s( a ) \ { \ float ar, ai; \ \ bli_srandnp2s( ar ); \ bli_srandnp2s( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrandnp2s( a ) \ { \ double ar, ai; \ \ bli_drandnp2s( ar ); \ bli_drandnp2s( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_randnp2s.h // begin bli_scals.h #ifndef BLIS_SCALS_H #define BLIS_SCALS_H // scals // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), 
bli_zimag(y) ) #define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) { (y) *= (a); } #define bli_dcscals( a, y ) { (y) *= (a); } #define bli_ccscals( a, y ) { (y) *= (a); } #define bli_zcscals( a, y ) { (y) *= (a); } #define bli_szscals( a, y ) { (y) *= (a); } #define bli_dzscals( a, y ) { (y) *= (a); } #define bli_czscals( a, y ) { (y) *= (a); } #define bli_zzscals( a, y ) { (y) *= (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscals( a, y ) bli_ssscals( a, y ) #define bli_dscals( a, y ) bli_ddscals( a, y ) #define bli_cscals( a, y ) bli_ccscals( a, y ) #define bli_zscals( a, y ) bli_zzscals( a, y ) #endif // end bli_scals.h // begin bli_scaljs.h #ifndef BLIS_SCALJS_H #define BLIS_SCALJS_H // scaljs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscaljs( a, y ) 
bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) { (y) *= (a); } #define bli_dcscaljs( a, y ) { (y) *= (a); } #define bli_ccscaljs( a, y ) { (y) *= conjf(a); } #define bli_zcscaljs( a, y ) { (y) *= conj (a); } #define bli_szscaljs( a, y ) { (y) *= (a); } #define bli_dzscaljs( a, y ) { (y) *= (a); } #define bli_czscaljs( a, y ) { (y) *= conjf(a); } #define bli_zzscaljs( a, y ) { (y) *= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscaljs( a, y ) bli_ssscaljs( a, y ) #define bli_dscaljs( a, y ) bli_ddscaljs( a, y ) #define bli_cscaljs( a, y ) bli_ccscaljs( a, y ) #define bli_zscaljs( a, y ) bli_zzscaljs( a, y ) #endif // end bli_scaljs.h // begin bli_scalcjs.h #ifndef BLIS_SCALCJS_H #define BLIS_SCALCJS_H // scalcjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscalcjs( conjx, x, y ) { (y) *= (x); } 
#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); } #define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #define bli_szscalcjs( conjx, x, y ) { (y) *= (x); } #define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); } #define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y ) #define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y ) #define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y ) #define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y ) #endif // end bli_scalcjs.h // begin bli_scal2s.h #ifndef BLIS_SCAL2S_H #define BLIS_SCAL2S_H // scal2s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsscal2s( a, x, y ) 
bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_czcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y ) #define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y ) #define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y ) #define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y ) #endif // end bli_scal2s.h // begin bli_scal2js.h #ifndef BLIS_SCAL2JS_H #define BLIS_SCAL2JS_H // scal2js // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Mixed-datatype dispatch table for scal2js.
// Each bli_<a><x><y>scal2js( a, x, y ) macro splits its three operands into
// real/imaginary parts with the bli_?real()/bli_?imag() accessor that matches
// the operand's type char, and forwards the six parts to a bli_*scal2jris
// helper. The helper is chosen as follows:
//   - y real (??s, ??d): bli_rxscal2jris, except when BOTH a and x are
//     complex, in which case bli_roscal2jris is used.
//   - y complex (??c, ??z), non-C99 build only:
//       a real,    x real    -> bli_rxscal2jris
//       a complex, x real    -> bli_rcscal2jris
//       a real,    x complex -> bli_crscal2jris
//       a complex, x complex -> bli_cxscal2jris
// NOTE(review): the bli_*scal2jris helpers are defined elsewhere; judging by
// the C99 branch of this header they presumably compute y := a * conj(x) --
// confirm against their definitions.

// -- (axy) = (??s) ------------------------------------------------------------
#define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------
#define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// The complex-y variants have two implementations: the component-wise one
// below when BLIS_ENABLE_C99_COMPLEX is not defined, and a native C99
// complex-arithmetic one in the matching #else branch.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------
#define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
// (??z) section of the non-C99 scal2js table, continued: complex-x variants.
// A real a selects bli_crscal2jris; a complex a selects bli_cxscal2jris.
#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 complex build: the complex-y variants use native complex arithmetic.
// When x is real (second char s or d) the product is plain (a) * (x); when x
// is complex, x is conjugated first, using conjf() for scomplex x (second
// char c) and conj() for dcomplex x (second char z).
// Each macro expands to a braced statement block and evaluates a and x once.

// -- (axy) = (??c) ------------------------------------------------------------
#define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: a, x and y all share the single type char.
#define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y )
#define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y )
#define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y )
#define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y )

#endif
// end bli_scal2js.h
// begin bli_set0s.h


#ifndef BLIS_SET0S_H
#define BLIS_SET0S_H

// set0s
// Forwards the constant pair (0, 0) and the operand to bli_?sets. The
// constants are float literals for the s/c variants and double literals for
// the d/z variants.
// NOTE(review): bli_?sets is defined elsewhere; presumably the two constants
// are the (real, imaginary) values assigned to a -- confirm.
#define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) )
#define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) )
#define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) )
#define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) )

#endif
// end bli_set0s.h
// begin bli_set1s.h


#ifndef BLIS_SET1S_H
#define BLIS_SET1S_H

// set1s
// Same shape as set0s, but forwards the constant pair (1, 0) to bli_?sets.
#define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) )
#define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) )
#define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) )
#define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) )
#endif
// end bli_set1s.h
// begin bli_seti0s.h


#ifndef BLIS_SETI0S_H
#define BLIS_SETI0S_H

// seti0s
// Forwards a single zero constant and the operand to bli_?setis (float
// literal for s/c, double literal for d/z).
// NOTE(review): bli_?setis is defined elsewhere; by its name it presumably
// sets only the imaginary part of a -- confirm.
#define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) )
#define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) )
#define bli_cseti0s( a ) bli_csetis( 0.0F, (a) )
#define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) )

#endif
// end bli_seti0s.h
// begin bli_sqrt2s.h


#ifndef BLIS_SQRT2S_H
#define BLIS_SQRT2S_H

// sqrt2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Component-wise build: x and a are split with bli_?real()/bli_?imag() and
// passed to a bli_*sqrt2ris helper chosen from the type chars:
//   a is s                -> bli_ssqrt2ris   (any x)
//   a is d                -> bli_dsqrt2ris   (any x)
//   a is c, x real (s/d)  -> bli_scsqrt2ris
//   a is c, x complex     -> bli_csqrt2ris
//   a is z, x real (s/d)  -> bli_dzsqrt2ris
//   a is z, x complex     -> bli_zsqrt2ris
#define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) )
#define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) )
#define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) )
#define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) )
#define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) )
#define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )
#define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 complex build: the root function is picked from x's type (sqrtf for s,
// sqrt for d, csqrtf for c, csqrt for z) and the result is cast to a's type.
// When x is complex but a is real, only the real part of the complex root
// (taken with bli_creal/bli_zreal) is stored.
#define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; }
#define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; }
#define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); }
#define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); }
#define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; }
#define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; }
#define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); }
#define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); }
#define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; }
#define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; }
#define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; }
#define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; }
#define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; }
#define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; }
#define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; }
#define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: x and a share the single type char.
#define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a )
#define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a )
#define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a )
#define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a )

#endif
// end bli_sqrt2s.h
// begin bli_subs.h


#ifndef BLIS_SUBS_H
#define BLIS_SUBS_H

// subs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) { (y) -= (a); } #define bli_dcsubs( a, y ) { (y) -= (a); } #define bli_ccsubs( a, y ) { (y) -= (a); } #define bli_zcsubs( a, y ) { (y) -= (a); } #define bli_szsubs( a, y ) { (y) -= (a); } #define bli_dzsubs( a, y ) { (y) -= (a); } #define bli_czsubs( a, y ) { (y) -= (a); } #define bli_zzsubs( a, y ) { (y) -= (a); } #endif // 
BLIS_ENABLE_C99_COMPLEX #define bli_ssubs( a, y ) bli_sssubs( a, y ) #define bli_dsubs( a, y ) bli_ddsubs( a, y ) #define bli_csubs( a, y ) bli_ccsubs( a, y ) #define bli_zsubs( a, y ) bli_zzsubs( a, y ) #endif // end bli_subs.h // begin bli_subjs.h #ifndef BLIS_SUBJS_H #define BLIS_SUBJS_H // subjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubjs( a, y ) bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), 
bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) { (y) -= (a); } #define bli_dcsubjs( a, y ) { (y) -= (a); } #define bli_ccsubjs( a, y ) { (y) -= conjf(a); } #define bli_zcsubjs( a, y ) { (y) -= conj (a); } #define bli_szsubjs( a, y ) { (y) -= (a); } #define bli_dzsubjs( a, y ) { (y) -= (a); } #define bli_czsubjs( a, y ) { (y) -= conjf(a); } #define bli_zzsubjs( a, y ) { (y) -= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssubjs( a, y ) bli_sssubjs( a, y ) #define bli_dsubjs( a, y ) bli_ddsubjs( a, y ) #define bli_csubjs( a, y ) bli_ccsubjs( a, y ) #define bli_zsubjs( a, y ) bli_zzsubjs( a, y ) #endif // end bli_subjs.h // begin bli_swaps.h #ifndef BLIS_SWAPS_H #define BLIS_SWAPS_H // swaps // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_ssswaps( x, y ) \ { \ float w; \ bli_sscopys( (y), (w) ); \ bli_sscopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dsswaps( x, y ) \ { \ double w; \ bli_sdcopys( (y), (w) ); \ bli_dscopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_csswaps( x, y ) \ { \ scomplex w; \ bli_sccopys( (y), (w) ); \ bli_cscopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zsswaps( x, y ) \ { \ dcomplex w; \ bli_szcopys( (y), (w) ); \ bli_zscopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sdswaps( x, y ) \ { \ float w; \ bli_dscopys( (y), (w) ); \ bli_sdcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_ddswaps( x, y ) \ { \ double w; \ bli_ddcopys( (y), (w) ); \ bli_ddcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_cdswaps( x, y ) \ { \ scomplex w; \ bli_dccopys( (y), (w) ); \ bli_cdcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zdswaps( x, y ) \ { \ dcomplex w; \ bli_dzcopys( (y), (w) ); \ bli_zdcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_scswaps( x, y ) \ { \ float w; \ bli_cscopys( (y), (w) ); \ bli_sccopys( (x), (y) ); \ bli_sscopys( (w), 
(x) ); \ } #define bli_dcswaps( x, y ) \ { \ double w; \ bli_cdcopys( (y), (w) ); \ bli_dccopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_ccswaps( x, y ) \ { \ scomplex w; \ bli_cccopys( (y), (w) ); \ bli_cccopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zcswaps( x, y ) \ { \ dcomplex w; \ bli_czcopys( (y), (w) ); \ bli_zccopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_szswaps( x, y ) \ { \ float w; \ bli_zscopys( (y), (w) ); \ bli_szcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dzswaps( x, y ) \ { \ double w; \ bli_zdcopys( (y), (w) ); \ bli_dzcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_czswaps( x, y ) \ { \ scomplex w; \ bli_zccopys( (y), (w) ); \ bli_czcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zzswaps( x, y ) \ { \ dcomplex w; \ bli_zzcopys( (y), (w) ); \ bli_zzcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sswaps( x, y ) bli_ssswaps( x, y ) #define bli_dswaps( x, y ) bli_ddswaps( x, y ) #define bli_cswaps( x, y ) bli_ccswaps( x, y ) #define bli_zswaps( x, y ) bli_zzswaps( x, y ) #endif // end bli_swaps.h // begin bli_xpbys.h #ifndef BLIS_XPBYS_H #define BLIS_XPBYS_H // xpbys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// Real-y half of the mixed-datatype xpbys dispatch table.
// Each bli_<x><b><y>xpbys( x, b, y ) macro splits x, b and y into
// real/imaginary parts with the bli_?real()/bli_?imag() accessor matching the
// operand's type char. Every variant listed in this part of the table
// forwards the six parts to the same helper, bli_rxxpbyris, whatever the
// types of x and b.
// NOTE(review): bli_rxxpbyris is defined elsewhere; by the operation name it
// presumably computes y := x + b * y on the real parts -- confirm.

// -- (xby) = (??s) ------------------------------------------------------------
#define bli_sssxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dssxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cssxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zssxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_scsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (xby) = (??d) ------------------------------------------------------------
#define bli_ssdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sddxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dddxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cddxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zddxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_scdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_szdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbys( x, b, y ) 
bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbys( x, b, y ) { 
(y) = (x) + (b) * (y); } #define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y ) #define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y ) #define bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y ) #define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y ) #endif // end bli_xpbys.h // begin bli_xpbyjs.h #ifndef BLIS_XPBYJS_H #define BLIS_XPBYJS_H // xpbyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; 
++ii ) bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); 
} else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( 
dim_t jj = 0; jj < n; ++jj ) bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } 
else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( 
dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end bli_copys_mxn.h // begin bli_scal2s_mxn.h #ifndef BLIS_SCAL2S_MXN_H #define BLIS_SCAL2S_MXN_H // scal2s_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \ ctype* restrict y, const inc_t rs_y, const inc_t cs_y \ ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( scal2s_mxn ) #endif // end bli_scal2s_mxn.h // begin bli_xpbys_mxn.h #ifndef BLIS_XPBYS_MXN_H #define BLIS_XPBYS_MXN_H // xpbys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (?ss) ------------------------------------------------------------ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?dd) ------------------------------------------------------------ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?cc) ------------------------------------------------------------ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?zz) ------------------------------------------------------------ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } #endif // end bli_xpbys_mxn.h // begin bli_xpbys_mxn_uplo.h #ifndef BLIS_XPBYS_MXN_UPLO_H #define BLIS_XPBYS_MXN_UPLO_H // xpbys_mxn_u #define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 
0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } // xpbys_mxn_l #define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } 
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_l( diagoff, m, n, 
x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #endif // end bli_xpbys_mxn_uplo.h // -- "broadcast B" scalar macros -- // begin bli_bcastbbs_mxn.h #ifndef BLIS_BCASTBBS_MXN_H #define BLIS_BCASTBBS_MXN_H // bcastbbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = ldy; \ const dim_t ds_y = 1; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yi = y + i*incy; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yij = yi + j*ldy; \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( bcastbbs_mxn ) #endif // end bli_bcastbbs_mxn.h // begin bli_scal2bbs_mxn.h #ifndef BLIS_SCAL2BBS_MXN_H #define BLIS_SCAL2BBS_MXN_H // scal2bbs_mxn #undef GENTFUNCRO #define GENTFUNCRO( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( 
*yij, *yijd ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } \ } INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ const inc_t incx2 = 2 * incx; \ const inc_t ldx2 = 2 * ldx; \ \ const inc_t incy2 = 2 * incy; \ const inc_t ldy2 = 2 * ldy; \ \ ctype_r* restrict alpha_r = ( ctype_r* )alpha; \ ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \ ctype_r* restrict chi_r = ( ctype_r* )x; \ ctype_r* restrict chi_i = ( ctype_r* )x + 1; \ ctype_r* restrict psi_r = ( ctype_r* )y; \ ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ 
PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ } INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn ) #endif // end bli_scal2bbs_mxn.h // begin bli_set0bbs_mxn.h #ifndef BLIS_SET0BBS_MXN_H #define BLIS_SET0BBS_MXN_H // set0bbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yij = yj + i*incy; \ \ for ( dim_t p = 0; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,set0s)( *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( set0bbs_mxn ) #endif // end bli_set0bbs_mxn.h // -- 1m-specific scalar macros -- // 1e // begin bli_copy1es.h #ifndef BLIS_COPY1ES_H #define BLIS_COPY1ES_H // copy1es // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// In the macros below the source operand is the parameter a, and the
// destination is the pair bri / bir.

// Variants whose destination type is real (second char s or d) expand to an
// empty block.
#define bli_sscopy1es( a, bri, bir ) {}
#define bli_dscopy1es( a, bri, bir ) {}
#define bli_cscopy1es( a, bri, bir ) {}
#define bli_zscopy1es( a, bri, bir ) {}

#define bli_sdcopy1es( a, bri, bir ) {}
#define bli_ddcopy1es( a, bri, bir ) {}
#define bli_cdcopy1es( a, bri, bir ) {}
#define bli_zdcopy1es( a, bri, bir ) {}

// Complex destination, real source: also empty.
#define bli_sccopy1es( a, bri, bir ) {}
#define bli_dccopy1es( a, bri, bir ) {}
// Complex source and destination. Two copyris calls are issued:
//   bri receives (  real(a), imag(a) )
//   bir receives ( -imag(a), real(a) )
// (assuming copyris( ar, ai, br, bi ) assigns br = ar and bi = ai -- the helper
// is defined elsewhere; confirm there).
// NOTE: a is evaluated four times; avoid arguments with side effects.
#define bli_cccopy1es( a, bri, bir ) \
{ \
    bli_cccopyris(  bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopy1es( a, bri, bir ) \
{ \
    bli_zccopyris(  bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopy1es( a, bri, bir ) {}
#define bli_dzcopy1es( a, bri, bir ) {}
#define bli_czcopy1es( a, bri, bir ) \
{ \
    bli_czcopyris(  bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopy1es( a, bri, bir ) \
{ \
    bli_zzcopyris(  bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}


// Single-char shorthands: source and destination share one datatype.
#define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir )
#define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir )

#endif

// end bli_copy1es.h
// begin bli_copyj1es.h


#ifndef BLIS_COPYJ1ES_H
#define BLIS_COPYJ1ES_H

// copyj1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// In the macros below the source operand is the parameter a, and the
// destination is the pair bri / bir.

// Variants whose destination type is real (second char s or d) expand to an
// empty block.
#define bli_sscopyj1es( a, bri, bir ) {}
#define bli_dscopyj1es( a, bri, bir ) {}
#define bli_cscopyj1es( a, bri, bir ) {}
#define bli_zscopyj1es( a, bri, bir ) {}

#define bli_sdcopyj1es( a, bri, bir ) {}
#define bli_ddcopyj1es( a, bri, bir ) {}
#define bli_cdcopyj1es( a, bri, bir ) {}
#define bli_zdcopyj1es( a, bri, bir ) {}

// Complex destination, real source: also empty.
#define bli_sccopyj1es( a, bri, bir ) {}
#define bli_dccopyj1es( a, bri, bir ) {}
// Conjugating copy, complex source and destination. Two copyris calls:
//   bri receives ( real(a), -imag(a) )
//   bir receives ( imag(a),  real(a) )
// i.e. the same pair copy1es would produce for the conjugate of a (assuming
// copyris( ar, ai, br, bi ) assigns br = ar and bi = ai -- confirm at its
// definition).
// NOTE: a is evaluated four times; avoid arguments with side effects.
#define bli_cccopyj1es( a, bri, bir ) \
{ \
    bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_cccopyris( bli_cimag(a),  bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopyj1es( a, bri, bir ) \
{ \
    bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_zccopyris( bli_zimag(a),  bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopyj1es( a, bri, bir ) {}
#define bli_dzcopyj1es( a, bri, bir ) {}
#define bli_czcopyj1es( a, bri, bir ) \
{ \
    bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_czcopyris( bli_cimag(a),  bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopyj1es( a, bri, bir ) \
{ \
    bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_zzcopyris( bli_zimag(a),  bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}


// Single-char shorthands: source and destination share one datatype.
#define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir )
#define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir )

#endif

// end bli_copyj1es.h
// begin bli_invert1es.h


#ifndef BLIS_INVERT1ES_H
#define BLIS_INVERT1ES_H

// invert1es

// Inverts bri in place via invertris, then refreshes bir from the new bri.
// Note the swapped destination order in the copyris call (imag(bir) first,
// then real(bir)): with copyris( ar, ai, br, bi ) presumably assigning br = ar
// and bi = ai, this yields imag(bir) = real(bri) and real(bir) = -imag(bri).
#define bli_cinvert1es( bri, bir ) \
{ \
    bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \
    bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \
}

// dcomplex counterpart of bli_cinvert1es.
#define bli_zinvert1es( bri, bir ) \
{ \
    bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \
    bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \
}

#endif

// end bli_invert1es.h
// begin bli_scal1es.h


#ifndef BLIS_SCAL1ES_H
#define BLIS_SCAL1ES_H

// scal1es

// Scales yri in place by a via scalris, then rebuilds yir from the updated
// yri as ( -imag(yri), real(yri) ) via copyris. The second statement reads the
// result of the first, so the order of the two calls matters.
#define bli_cscal1es( a, yri, yir ) \
{ \
    bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \
    bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \
}

// dcomplex counterpart of bli_cscal1es.
#define bli_zscal1es( a, yri, yir ) \
{ \
    bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \
    bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \
}

#endif

// end bli_scal1es.h
// begin bli_scal21es.h


#ifndef BLIS_SCAL21ES_H
#define BLIS_SCAL21ES_H

// scal21es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
//
// Every non-empty variant issues two bli_cxscal2ris calls with the same a:
//   yri is computed from (  real(x), imag(x) )
//   yir is computed from ( -imag(x), real(x) )
// Variants with a real y, or with both a and x real, expand to an empty block.
// NOTE: arguments are evaluated multiple times; avoid side effects.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal21es( a, x, yri, yir ) {}
#define bli_sdsscal21es( a, x, yri, yir ) {}
#define bli_scsscal21es( a, x, yri, yir ) {}
#define bli_szsscal21es( a, x, yri, yir ) {}

#define bli_dssscal21es( a, x, yri, yir ) {}
#define bli_ddsscal21es( a, x, yri, yir ) {}
#define bli_dcsscal21es( a, x, yri, yir ) {}
#define bli_dzsscal21es( a, x, yri, yir ) {}

#define bli_cssscal21es( a, x, yri, yir ) {}
#define bli_cdsscal21es( a, x, yri, yir ) {}
#define bli_ccsscal21es( a, x, yri, yir ) {}
#define bli_czsscal21es( a, x, yri, yir ) {}

#define bli_zssscal21es( a, x, yri, yir ) {}
#define bli_zdsscal21es( a, x, yri, yir ) {}
#define bli_zcsscal21es( a, x, yri, yir ) {}
#define bli_zzsscal21es( a, x, yri, yir ) {}

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal21es( a, x, yri, yir ) {}
#define bli_sddscal21es( a, x, yri, yir ) {}
#define bli_scdscal21es( a, x, yri, yir ) {}
#define bli_szdscal21es( a, x, yri, yir ) {}

#define bli_dsdscal21es( a, x, yri, yir ) {}
#define bli_dddscal21es( a, x, yri, yir ) {}
#define bli_dcdscal21es( a, x, yri, yir ) {}
#define bli_dzdscal21es( a, x, yri, yir ) {}

#define bli_csdscal21es( a, x, yri, yir ) {}
#define bli_cddscal21es( a, x, yri, yir ) {}
#define bli_ccdscal21es( a, x, yri, yir ) {}
#define bli_czdscal21es( a, x, yri, yir ) {}

#define bli_zsdscal21es( a, x, yri, yir ) {}
#define bli_zddscal21es( a, x, yri, yir ) {}
#define bli_zcdscal21es( a, x, yri, yir ) {}
#define bli_zzdscal21es( a, x, yri, yir ) {}

// -- (axy) = (??c) ------------------------------------------------------------

// NOTE(review): the variants in this section read y through bli_zreal /
// bli_zimag even though y is scomplex here; presumably those accessors are
// plain member reads that do not depend on the precision -- confirm at their
// definitions.

#define bli_sscscal21es( a, x, yri, yir ) {}
#define bli_sdcscal21es( a, x, yri, yir ) {}
#define bli_sccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dscscal21es( a, x, yri, yir ) {}
#define bli_ddcscal21es( a, x, yri, yir ) {}
#define bli_dccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cscscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zscscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zccscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzcscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal21es( a, x, yri, yir ) {}
#define bli_sdzscal21es( a, x, yri, yir ) {}
#define bli_sczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dszscal21es( a, x, yri, yir ) {}
#define bli_ddzscal21es( a, x, yri, yir ) {}
#define bli_dczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cszscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zszscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zczscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzzscal21es( a, x, yri, yir ) \
{ \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
    bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}



// Single-char shorthands: a, x and y all share one datatype.
#define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir )
#define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir )

#endif

// end bli_scal21es.h
// begin bli_scal2j1es.h


#ifndef BLIS_SCAL2J1ES_H
#define BLIS_SCAL2J1ES_H

// scal2j1es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) #define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) #endif // end bli_scal2j1es.h // 1r // begin bli_copy1rs.h #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copy1rs.h // begin bli_copyj1rs.h #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copyj1rs.h // begin bli_invert1rs.h #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif // end bli_invert1rs.h // begin bli_scal1rs.h #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), 
bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif // end bli_scal1rs.h // begin bli_scal21rs.h #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif // end bli_scal21rs.h // begin bli_scal2j1rs.h #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif // end bli_scal2j1rs.h // 1m (1e or 1r) // begin bli_invert1ms_mxn_diag.h #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t 
i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_invert1ms_mxn_diag.h // begin bli_scal1ms_mxn.h #ifndef BLIS_SCAL1MS_MXN_H #define BLIS_SCAL1MS_MXN_H // scal1ms_mxn #define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; 
\ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #endif // end bli_scal1ms_mxn.h // begin bli_scal21ms_mxn.h #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } 
else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), 
*(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif // end bli_scal21ms_mxn.h // begin bli_scal21ms_mxn_diag.h #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_scal21ms_mxn_diag.h // begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING //thread communicators may be implementation dependent #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_single.h // begin bli_thrcomm_openmp.h #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H // Define thrcomm_t for situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #include // skipped // Define thrcomm_t for tree barriers and non-tree barriers. #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. 
//volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif // end bli_thrcomm_openmp.h // begin bli_thrcomm_pthreads.h #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H // Define thrcomm_t for situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS #ifdef BLIS_USE_PTHREAD_BARRIER struct thrcomm_s { void* sent_object; dim_t n_threads; bli_pthread_barrier_t barrier; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_pthreads.h // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. 
// These functions are only declared here; their definitions are outside
// this header section.
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads );
void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm );
void bli_thrcomm_cleanup( thrcomm_t* comm );

// The two entry points below carry BLIS_EXPORT_BLIS; the others do not.
BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );

void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );

#endif
// end bli_thrcomm.h

// Include thread info (thrinfo_t) object definitions and prototypes.
// begin bli_thrinfo.h
#ifndef BLIS_THRINFO_H
#define BLIS_THRINFO_H

// Thread info structure definition
struct thrinfo_s
{
  // The thread communicator for the other threads sharing the same work
  // at this level.
  thrcomm_t* ocomm;

  // Our thread id within the ocomm thread communicator.
  dim_t ocomm_id;

  // The number of distinct threads used to parallelize the loop.
  dim_t n_way;

  // What we're working on.
  dim_t work_id;

  // When freeing, should the communicators in this node be freed? Usually,
  // this field is true, but when nodes are created that share the same
  // communicators as other nodes (such as with packm nodes), this is set
  // to false.
  bool free_comm;

  // The bszid_t to help identify the node. This is mostly only useful when
  // debugging or tracing the allocation and release of thrinfo_t nodes.
  bszid_t bszid;

  // Links to other thrinfo_s nodes.
  // NOTE(review): presumably the child nodes of this node in the thrinfo_t
  // tree (prenode vs. regular sub-node) -- confirm against the code that
  // builds the tree.
  struct thrinfo_s* sub_prenode;
  struct thrinfo_s* sub_node;
};
typedef struct thrinfo_s thrinfo_t;

//
// thrinfo_t functions
// NOTE: The naming of these should be made consistent at some point.
// (ie: bli_thrinfo_ vs.
bli_thread_) // // thrinfo_t query (field only) BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) { return (t->ocomm)->n_threads; } BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) { return t->ocomm_id; } BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) { return t->n_way; } BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t ) { return t->work_id; } BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) { return t->ocomm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) { return t->free_comm; } BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) { return t->bszid; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) { return t->sub_node; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) { return t->ocomm_id == 0; } // thrinfo_t modification BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) { t->ocomm = ocomm; } BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) { t->ocomm_id = ocomm_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) { t->n_way = n_way; } BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t ) { t->work_id = work_id; } BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) { t->free_comm = free_comm; } BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) { t->bszid = bszid; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) { t->sub_node = sub_node; } BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) { t->sub_prenode = sub_prenode; } // other thrinfo_t-related functions BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } // // Prototypes 
// for level-3 thrinfo functions not specific to any operation.
//
// Everything below is declaration-only; the definitions are not part of
// this header section.

// NOTE(review): the parameters after rntm have the same names and types as
// the fields of struct thrinfo_s (sub_prenode has no counterpart here);
// presumably a node is allocated and populated from them -- confirm in the
// definition.
thrinfo_t* bli_thrinfo_create
     (
       rntm_t*    rntm,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

// Same parameter list as bli_thrinfo_create(), except that the node to work
// on is supplied by the caller (thread) and no rntm_t is taken.
void bli_thrinfo_init
     (
       thrinfo_t* thread,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init_single
     (
       thrinfo_t* thread
     );

void bli_thrinfo_free
     (
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// -----------------------------------------------------------------------------

// The grow/rgrow family takes control-tree nodes (cntl_t) alongside the
// thrinfo_t being extended. The *_prenode variants have the same signatures
// as their non-prenode counterparts.

void bli_thrinfo_grow
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_rgrow
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_rgrow_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

// -----------------------------------------------------------------------------

// Disabled declarations (compiled out with #if 0).
#if 0
void bli_thrinfo_grow_tree
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

void bli_thrinfo_grow_tree_ic
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );
#endif

#endif
// end bli_thrinfo.h
// begin bli_thrinfo_sup.h
#ifndef BLIS_THRINFO_SUP_H
#define BLIS_THRINFO_SUP_H

//
// Prototypes for level-3 thrinfo sup functions.
//
// These mirror the grow/rgrow/create_for_cntl prototypes of bli_thrinfo.h,
// but take bszid_t pointers where those take cntl_t pointers.

void bli_thrinfo_sup_grow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_sup_rgrow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_sup_create_for_cntl
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_chl,
       thrinfo_t* thread_par
     );

#endif
// end bli_thrinfo_sup.h

// Include some operation-specific thrinfo_t prototypes.
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
// Takes the level-3 internal function to run (func), the operation family,
// and the operands/context/runtime/control-tree arguments that match the
// first eight parameters of l3int_t.
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// Include definitions specific to the method of multithreading for the
// conventional code path.
// begin bli_l3_decor_single.h

#ifndef BLIS_L3_DECOR_SINGLE_H
#define BLIS_L3_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (This section is currently empty.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif

// end bli_l3_decor_single.h
// begin bli_l3_decor_openmp.h

#ifndef BLIS_L3_DECOR_OPENMP_H
#define BLIS_L3_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

void bli_l3_thread_decorator_thread_check
     (
       dim_t      n_threads,
       dim_t      tid,
       thrcomm_t* gl_comm,
       rntm_t*    rntm
     );

#endif

#endif

// end bli_l3_decor_openmp.h
// begin bli_l3_decor_pthreads.h

#ifndef BLIS_L3_DECOR_PTHREADS_H
#define BLIS_L3_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
void* bli_l3_thread_entry( void* data_void );

#endif

#endif

// end bli_l3_decor_pthreads.h

#endif

// end bli_l3_decor.h

// Include the level-3 thread decorator and related definitions and prototypes
// for the sup code path.
// begin bli_l3_sup_decor.h

#ifndef BLIS_L3_SUP_DECOR_H
#define BLIS_L3_SUP_DECOR_H

// -- sup definitions ----------------------------------------------------------

// Level-3 sup internal function type.
// Unlike l3int_t, this returns an err_t and has no cntl_t* parameter.
typedef err_t (*l3supint_t)
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// Level-3 sup thread decorator prototype.
err_t bli_l3_sup_thread_decorator
     (
       l3supint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     );

// Include definitions specific to the method of multithreading for the
// sup code path.
// begin bli_l3_sup_decor_single.h #ifndef BLIS_L3_SUP_DECOR_SINGLE_H #define BLIS_L3_SUP_DECOR_SINGLE_H // Definitions specific to situations when multithreading is disabled. #ifndef BLIS_ENABLE_MULTITHREADING #endif #endif // end bli_l3_sup_decor_single.h // begin bli_l3_sup_decor_openmp.h #ifndef BLIS_L3_SUP_DECOR_OPENMP_H #define BLIS_L3_SUP_DECOR_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #endif #endif // end bli_l3_sup_decor_openmp.h // begin bli_l3_sup_decor_pthreads.h #ifndef BLIS_L3_SUP_DECOR_PTHREADS_H #define BLIS_L3_SUP_DECOR_PTHREADS_H // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. void* bli_l3_sup_thread_entry( void* data_void ); #endif #endif // end bli_l3_sup_decor_pthreads.h #endif // end bli_l3_sup_decor.h // Initialization-related prototypes. void bli_thread_init( void ); void bli_thread_finalize( void ); // Thread range-related prototypes. 
BLIS_EXPORT_BLIS void bli_thread_range_sub ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end ); #undef GENPROT #define GENPROT( opname ) \ \ siz_t PASTEMAC0( opname ) \ ( \ dir_t direct, \ thrinfo_t* thr, \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntl_t* cntl, \ cntx_t* cntx, \ dim_t* start, \ dim_t* end \ ); GENPROT( thread_range_mdim ) GENPROT( thread_range_ndim ) #undef GENPROT #define GENPROT( opname ) \ \ siz_t PASTEMAC0( opname ) \ ( \ thrinfo_t* thr, \ obj_t* a, \ blksz_t* bmult, \ dim_t* start, \ dim_t* end \ ); GENPROT( thread_range_l2r ) GENPROT( thread_range_r2l ) GENPROT( thread_range_t2b ) GENPROT( thread_range_b2t ) GENPROT( thread_range_weighted_l2r ) GENPROT( thread_range_weighted_r2l ) GENPROT( thread_range_weighted_t2b ) GENPROT( thread_range_weighted_b2t ) dim_t bli_thread_range_width_l ( doff_t diagoff_j, dim_t m, dim_t n_j, dim_t j, dim_t n_way, dim_t bf, dim_t bf_left, double area_per_thr, bool handle_edge_low ); siz_t bli_find_area_trap_l ( dim_t m, dim_t n, doff_t diagoff ); siz_t bli_thread_range_weighted_sub ( thrinfo_t* restrict thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* restrict j_start_thr, dim_t* restrict j_end_thr ); // ----------------------------------------------------------------------------- // Factorization and partitioning prototypes typedef struct { dim_t n; dim_t sqrt_n; dim_t f; } bli_prime_factors_t; void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors); dim_t bli_next_prime_factor(bli_prime_factors_t* factors); bool bli_is_prime( dim_t n ); void bli_thread_partition_2x2 ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_slow ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_fast ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); // 
----------------------------------------------------------------------------- dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); // ----------------------------------------------------------------------------- BLIS_INLINE void bli_thread_range_jrir_rr ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; } BLIS_INLINE void bli_thread_range_jrir_sl ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use contiguous slab partitioning of jr/ir loops. bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); *inc = 1; } BLIS_INLINE void bli_thread_range_jrir ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Define a general-purpose version of bli_thread_range_jrir() whose // definition depends on whether slab or round-robin partitioning was // requested at configure-time. 
#ifdef BLIS_ENABLE_JRIR_SLAB bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); #else bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); #endif } #if 0 BLIS_INLINE void bli_thread_range_weighted_jrir ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { #ifdef BLIS_ENABLE_JRIR_SLAB // Use contiguous slab partitioning for jr/ir loops. bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, handle_edge_low, start, end ); *start = *start / bf; *inc = 1; if ( *end % bf ) *end = *end / bf + 1; else *end = *end / bf; #else // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; #endif } #endif #endif // end bli_thread.h // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t      bli_pthread_barrier_t;
typedef pthread_barrierattr_t  bli_pthread_barrierattr_t;

#endif

// -- pthreads macros --

#define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#define BLIS_PTHREAD_COND_INITIALIZER  PTHREAD_COND_INITIALIZER
#define BLIS_PTHREAD_ONCE_INIT         PTHREAD_ONCE_INIT

#endif

// -- Function definitions -----------------------------------------------------

// The prototypes below are declared once, outside the platform conditional
// above, so the same bli_pthread_*() signatures apply to every branch.

// -- pthread_create(), pthread_join() --

BLIS_EXPORT_BLIS int bli_pthread_create
     (
       bli_pthread_t*            thread,
       const bli_pthread_attr_t* attr,
       void*                   (*start_routine)(void*),
       void*                     arg
     );

BLIS_EXPORT_BLIS int bli_pthread_join
     (
       bli_pthread_t thread,
       void**        retval
     );

// -- pthread_mutex_*() --

BLIS_EXPORT_BLIS int bli_pthread_mutex_init
     (
       bli_pthread_mutex_t*           mutex,
       const bli_pthread_mutexattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_lock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock
     (
       bli_pthread_mutex_t* mutex
     );

// -- pthread_cond_*() --

BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t*           cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t*  cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void              (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//     __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t*           barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int                     count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h

// -- Constant definitions --

// begin bli_extern_defs.h

#ifndef BLIS_EXTERN_DEFS_H
#define BLIS_EXTERN_DEFS_H

// Exported obj_t objects; the two *_ONE_HALF declarations are commented out.
BLIS_EXPORT_BLIS extern obj_t BLIS_TWO;
BLIS_EXPORT_BLIS extern obj_t BLIS_ONE;
//BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF;
BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO;
//BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF;
BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE;
BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO;

// Exported communicator and thrinfo_t objects.
BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM;
BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED;
BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED;

#endif
// end bli_extern_defs.h

// -- BLIS architecture/kernel definitions --

// begin bli_l1v_ker_prot.h

//
// Define template prototypes for level-1v kernels.
//

// Each *_KER_PROT( ctype, ch, opname ) macro below expands to a prototype
// for a void function named PASTEMAC(ch,opname) whose pointer operands use
// the element type ctype.
//
// NOTE: several of these macros end with a line-continuation after the
// closing ");". The blank line that follows each of them terminates the
// macro definition, so those blank lines must not be removed.

#define ADDV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define AMAXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t* restrict cntx  \
     ); \

#define AXPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ); \

#define AXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ); \

#define COPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define DOTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     ); \

#define DOTXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     ); \

#define INVERTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     ); \

#define SCALV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     ); \

#define SCAL2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ); \

#define SETV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     ); \

#define SUBV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define SWAPV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ); \

#define XPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     ); \

// end bli_l1v_ker_prot.h
// begin bli_l1f_ker_prot.h

//
// Define template prototypes for level-1f kernels.
// #define AXPY2V_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alphax, \ ctype* restrict alphay, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define AXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define DOTAXPYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); // end bli_l1f_ker_prot.h // begin bli_l1m_ker_prot.h // // Define template prototypes for level-1m kernels. 
//

// Each *_PROT macro below expands to a prototype for a void function named
// PASTEMAC(ch,<name>) whose pointer operands use the given element type(s).

// native packm kernels

#define PACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     );

// native unpackm kernels

#define UNPACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       dim_t            n, \
       ctype*  restrict kappa, \
       ctype*  restrict p,             inc_t ldp, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       cntx_t* restrict cntx  \
     );

// 1e/1r packm kernels
// (The parameter list is identical to PACKM_KER_PROT.)

#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     );

// end bli_l1m_ker_prot.h
// begin bli_l3_ukr_prot.h

//
// Define template prototypes for level-3 micro-kernels.
//

// GEMM_UKR_PROT is the single-type form: it forwards to GEMM_UKR_PROT2 with
// ctype used for both the input and output element types.
#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)

// In GEMM_UKR_PROT2, a and b use ctype_in while alpha, beta and c use
// ctype_out.
#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype_out* restrict alpha, \
       ctype_in*  restrict a, \
       ctype_in*  restrict b, \
       ctype_out* restrict beta, \
       ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

#define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a1x, \
       ctype*     restrict a11, \
       ctype*     restrict bx1, \
       ctype*     restrict b11, \
       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

#define TRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

// end bli_l3_ukr_prot.h
// begin bli_l3_sup_ker_prot.h

//
// Define template prototypes for level-3 kernels on small/unpacked matrices.
//

// Unlike GEMM_UKR_PROT2, this prototype carries conj flags and explicit
// row/column strides for a and b as well as for c.
#define GEMMSUP_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

// end bli_l3_sup_ker_prot.h

// begin bli_arch_config_pre.h

#ifndef BLIS_ARCH_CONFIG_PRE_H
#define BLIS_ARCH_CONFIG_PRE_H

// -- Naming-related kernel definitions ----------------------------------------

// The default suffix appended to reference kernels.
#define BLIS_REF_SUFFIX  _ref

// A suffix used for labeling certain induced method aware functions.
#define BLIS_IND_SUFFIX  _ind

// Add an underscore to the BLIS kernel set string, if it was defined.
#ifdef  BLIS_CNAME
#define BLIS_CNAME_INFIX  PASTECH(_,BLIS_CNAME)
#endif

// Combine the CNAME and _ref for convenience to the code that defines
// reference kernels.
//#define BLIS_CNAME_REF_SUFFIX  PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX)

// -- Prototype-generating macro definitions -----------------------------------

// Prototype-generating macro for bli_cntx_init_*() functions.
#define CNTX_INIT_PROTS( archname ) \ \ void PASTEMAC(cntx_init_,archname) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \ ( \ ind_t method, \ cntx_t* cntx \ ); #endif // end bli_arch_config_pre.h // begin bli_arch_config.h #ifndef BLIS_ARCH_CONFIG_H #define BLIS_ARCH_CONFIG_H // // -- Context initialization prototypes ---------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_CONFIG_SKX CNTX_INIT_PROTS( skx ) #endif #ifdef BLIS_CONFIG_KNL CNTX_INIT_PROTS( knl ) #endif #ifdef BLIS_CONFIG_KNC CNTX_INIT_PROTS( knc ) #endif #ifdef BLIS_CONFIG_HASWELL CNTX_INIT_PROTS( haswell ) #endif #ifdef BLIS_CONFIG_SANDYBRIDGE CNTX_INIT_PROTS( sandybridge ) #endif #ifdef BLIS_CONFIG_PENRYN CNTX_INIT_PROTS( penryn ) #endif // -- AMD64 architectures -- #ifdef BLIS_CONFIG_ZEN3 CNTX_INIT_PROTS( zen3 ) #endif #ifdef BLIS_CONFIG_ZEN2 CNTX_INIT_PROTS( zen2 ) #endif #ifdef BLIS_CONFIG_ZEN CNTX_INIT_PROTS( zen ) #endif #ifdef BLIS_CONFIG_EXCAVATOR CNTX_INIT_PROTS( excavator ) #endif #ifdef BLIS_CONFIG_STEAMROLLER CNTX_INIT_PROTS( steamroller ) #endif #ifdef BLIS_CONFIG_PILEDRIVER CNTX_INIT_PROTS( piledriver ) #endif #ifdef BLIS_CONFIG_BULLDOZER CNTX_INIT_PROTS( bulldozer ) #endif // -- ARM architectures -- #ifdef BLIS_CONFIG_ARMSVE CNTX_INIT_PROTS( armsve ) #endif #ifdef BLIS_CONFIG_A64FX CNTX_INIT_PROTS( a64fx ) #endif #ifdef BLIS_CONFIG_FIRESTORM CNTX_INIT_PROTS( firestorm ) #endif #ifdef BLIS_CONFIG_THUNDERX2 CNTX_INIT_PROTS( thunderx2 ) #endif #ifdef BLIS_CONFIG_CORTEXA57 CNTX_INIT_PROTS( cortexa57 ) #endif #ifdef BLIS_CONFIG_CORTEXA53 CNTX_INIT_PROTS( cortexa53 ) #endif #ifdef BLIS_CONFIG_CORTEXA15 CNTX_INIT_PROTS( cortexa15 ) #endif #ifdef BLIS_CONFIG_CORTEXA9 CNTX_INIT_PROTS( cortexa9 ) #endif // -- IBM Power -- #ifdef BLIS_CONFIG_POWER10 CNTX_INIT_PROTS( power10 ) #endif #ifdef BLIS_CONFIG_POWER9 CNTX_INIT_PROTS( power9 ) #endif #ifdef 
BLIS_CONFIG_POWER7 CNTX_INIT_PROTS( power7 ) #endif // -- IBM BG/Q -- #ifdef BLIS_CONFIG_BGQ CNTX_INIT_PROTS( bgq ) #endif // -- Generic -- #ifdef BLIS_CONFIG_GENERIC CNTX_INIT_PROTS( generic ) #endif // // -- Architecture family-specific headers ------------------------------------- // // -- x86_64 families -- #ifdef BLIS_FAMILY_INTEL64 #include "bli_family_intel64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64 #include "bli_family_amd64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64_LEGACY #include "bli_family_amd64_legacy.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64 // begin bli_family_x86_64.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_x86_64.h #endif #ifdef BLIS_FAMILY_X86_64_NO_SKX #include "bli_family_x86_64_no_skx.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN2 #include "bli_family_x86_64_no_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN3 #include "bli_family_x86_64_no_zen3.h" // skipped #endif // -- Intel64 architectures -- #ifdef BLIS_FAMILY_SKX // begin bli_family_skx.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- THREADING PARAMETERS ----------------------------------------------------- #define BLIS_THREAD_RATIO_M 3 #define BLIS_THREAD_RATIO_N 2 #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 4 // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 64 #define BLIS_SIMD_MAX_SIZE 64 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 //#include //#define BLIS_MALLOC_POOL malloc //#define BLIS_FREE_POOL free #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- // -- Cache and register blocksizes -- // // Constraints: // // (1) MC must be a multiple of: // (a) MR (for zero-padding purposes) // (b) NR (for zero-padding purposes when MR and NR are "swapped") // (2) NC must be a multiple of // (a) NR (for zero-padding purposes) // (b) MR (for zero-padding purposes when MR and NR are "swapped") // #define 
BLIS_DGEMM_UKERNEL bli_dgemm_opt_16x12_l2 #define BLIS_DEFAULT_MC_D 144 #define BLIS_DEFAULT_KC_D 336 #define BLIS_DEFAULT_NC_D 5760 #define BLIS_DEFAULT_MR_D 16 #define BLIS_DEFAULT_NR_D 12 #define BLIS_PACKDIM_MR_D 16 #define BLIS_PACKDIM_NR_D 12 // NOTE: If the micro-kernel, which is typically unrolled to a factor // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. //#define BLIS_DEFAULT_KR_S 1 //#define BLIS_DEFAULT_KR_D 1 //#define BLIS_DEFAULT_KR_C 1 //#define BLIS_DEFAULT_KR_Z 1 // -- Maximum cache blocksizes (for optimizing edge cases) -- // NOTE: These cache blocksize "extensions" have the same constraints as // the corresponding default blocksizes above. When these values are // larger than the default blocksizes, blocksizes used at edge cases are // enlarged if such an extension would encompass the remaining portion of // the matrix dimension. #define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) #define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) #define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0) #define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) #define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) #define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0) //#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4) //#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4) //#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4) //#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4) //#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4) //#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4) #endif //#endif // end bli_family_skx.h #endif #ifdef BLIS_FAMILY_KNL // begin bli_family_knl.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- THREADING PARAMETERS ----------------------------------------------------- #define BLIS_THREAD_RATIO_M 4 
#define BLIS_THREAD_RATIO_N 1 #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 // -- MEMORY ALLOCATION -------------------------------------------------------- //#define BLIS_TREE_BARRIER //#define BLIS_TREE_BARRIER_ARITY 4 #define BLIS_SIMD_ALIGN_SIZE 64 #define BLIS_SIMD_MAX_SIZE 64 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 //#define BLIS_MALLOC_INTL hbw_malloc //#define BLIS_FREE_INTL hbw_free #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_SGEMM_UKERNEL bli_sgemm_opt_30x16_knc #define BLIS_DEFAULT_MC_S 240 #define BLIS_DEFAULT_KC_S 240 #define BLIS_DEFAULT_NC_S 14400 #define BLIS_DEFAULT_MR_S 30 #define BLIS_DEFAULT_NR_S 16 #define BLIS_PACKDIM_MR_S 32 #define BLIS_PACKDIM_NR_S 16 #if 0 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8_knc #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 240 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DEFAULT_MR_D 30 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 32 #define BLIS_PACKDIM_NR_D 8 #elif 0 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8 #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 240 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DEFAULT_MR_D 30 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 32 #define BLIS_PACKDIM_NR_D 8 #define BLIS_DPACKM_8XK_KERNEL bli_dpackm_8xk_opt #define BLIS_DPACKM_30XK_KERNEL bli_dpackm_30xk_opt #else #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_24x8 #define BLIS_DEFAULT_MR_D 24 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 24 #define BLIS_PACKDIM_NR_D 8 #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 336 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DPACKM_8XK_KERNEL bli_dpackm_8xk_opt #define BLIS_DPACKM_24XK_KERNEL bli_dpackm_24xk_opt #endif #define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) 
#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) #define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0) #define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) #define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) #define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0) #endif //#endif // end bli_family_knl.h #endif #ifdef BLIS_FAMILY_KNC #include "bli_family_knc.h" // skipped #endif #ifdef BLIS_FAMILY_HASWELL // begin bli_family_haswell.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- // -- sgemm micro-kernel -- #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24 #define BLIS_DEFAULT_MC_S 256 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 24 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 6 #define BLIS_DEFAULT_NR_S 16 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 6 #endif // -- dgemm micro-kernel -- #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12 #define BLIS_DEFAULT_MC_D 152 #define BLIS_DEFAULT_KC_D 160 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 12 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 
#define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 6 #endif // -- cgemm micro-kernel -- #if 1 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 3 #define BLIS_DEFAULT_NR_C 8 #define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x3 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 3 #endif // -- zgemm micro-kernel -- #if 1 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 3 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x3 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 3 #endif #endif //#endif // end bli_family_haswell.h #endif #ifdef BLIS_FAMILY_SANDYBRIDGE // begin bli_family_sandybridge.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x4 #define BLIS_DEFAULT_MC_D 96 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 4 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define 
BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_sandybridge.h #endif #ifdef BLIS_FAMILY_PENRYN // begin bli_family_penryn.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x4 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MC_S 768 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x4 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 4 #define BLIS_DEFAULT_MC_D 384 #define BLIS_DEFAULT_KC_D 384 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_asm_4x4 #define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_asm_4x4 // -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPY2V_KERNEL bli_daxpy2v_int_var1 #define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_int_var1 #define BLIS_DAXPYF_KERNEL bli_daxpyf_int_var1 #define BLIS_DDOTXF_KERNEL bli_ddotxf_int_var1 #define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_int_var1 // -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 #define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 #endif //#endif // end bli_family_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_FAMILY_ZEN3 // begin bli_family_zen3.h #ifndef BLI_FAMILY_ZEN3_ #define BLI_FAMILY_ZEN3_ // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops // to be not parallelized. 
// #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 // To enable framework optimizations for the zen3 platform. // All zen3-specific code should be guarded by this macro. #define BLIS_CONFIG_ZEN3 /* NOTE(review): the zen2 and zen family sections of this header wrap the small-matrix macros below in a disabled conditional ("Vanilla BLIS disables AMD's small matrix handling by default"); here they are active -- confirm this is intended. */ #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. #define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 // 128*(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 #define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 #define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 #define BLIS_ENABLE_SMALL_MATRIX_ROME #define BLIS_SMALL_MATRIX_THRES_ROME 400 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50 #endif // end bli_family_zen3.h #endif #ifdef 
BLIS_FAMILY_ZEN2 // begin bli_family_zen2.h // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops // to be not parallelized. #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 // Vanilla BLIS disables AMD's small matrix handling by default. #if 0 #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. #define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 // 128*(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 #define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 #define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 #define BLIS_ENABLE_SMALL_MATRIX_ROME #define BLIS_SMALL_MATRIX_THRES_ROME 400 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50 // When running HPL with pure MPI without DGEMM threading 
(Single-threaded // BLIS), defining this macro as 1 yields better performance. #define AOCL_BLIS_MULTIINSTANCE 0 #endif // end bli_family_zen2.h #endif #ifdef BLIS_FAMILY_ZEN // begin bli_family_zen.h // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops // to be not parallelized. #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 #define BLIS_ENABLE_ZEN_BLOCK_SIZES // Vanilla BLIS disables AMD's small matrix handling by default. #if 0 #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. #define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 // 128*(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 #define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 #define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 // This macro will enable BLIS DGEMM to choose block sizes for a single instance mode #define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 0 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES 250 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_NAPLES 90 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22 #endif #if 0 // Allow the sup implementation to combine some small edge case iterations in // the 2nd loop of the panel-block algorithm (MR) and/or the 2nd loop of the // block-panel algorithm (NR) with the last full iteration that precedes it. // NOTE: These cpp macros need to be explicitly set to an integer since they // are used at compile-time to create unconditional branches or dead code // regions. 
#define BLIS_ENABLE_SUP_MR_EXT 1 #define BLIS_ENABLE_SUP_NR_EXT 0 #endif // end bli_family_zen.h #endif #ifdef BLIS_FAMILY_EXCAVATOR // begin bli_family_excavator.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 3 #define BLIS_DEFAULT_MC_S 528 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_DEFAULT_MC_D 264 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_DEFAULT_MC_C 264 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #define BLIS_DEFAULT_MC_Z 100 #define BLIS_DEFAULT_KC_Z 320 #define BLIS_DEFAULT_NC_Z 8400 #endif //#endif // end bli_family_excavator.h #endif #ifdef BLIS_FAMILY_STEAMROLLER // begin bli_family_steamroller.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 //#endif // end bli_family_steamroller.h #endif #ifdef BLIS_FAMILY_PILEDRIVER // begin bli_family_piledriver.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MC_S 2016 #define BLIS_DEFAULT_KC_S 128 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DEFAULT_MR_S 16 #define 
BLIS_DEFAULT_NR_S 3 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MC_D 1008 #define BLIS_DEFAULT_KC_D 128 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MC_C 512 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MC_Z 400 #define BLIS_DEFAULT_KC_Z 160 #define BLIS_DEFAULT_NC_Z 8400 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #endif //#endif // end bli_family_piledriver.h #endif #ifdef BLIS_FAMILY_BULLDOZER // begin bli_family_bulldozer.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8_fma4 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x6_fma4 #define BLIS_DEFAULT_MC_D 1080 #define BLIS_DEFAULT_KC_D 120 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 6 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4_fma4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4_fma4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_bulldozer.h #endif // -- ARM families -- #ifdef BLIS_FAMILY_ARM64 #include "bli_family_arm64.h" // skipped #endif #ifdef BLIS_FAMILY_ARM32 #include "bli_family_arm32.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_FAMILY_ARMSVE #include "bli_family_armsve.h" // skipped #endif #ifdef BLIS_FAMILY_A64FX #include 
"bli_family_a64fx.h" // skipped #endif #ifdef BLIS_FAMILY_FIRESTORM #include "bli_family_firestorm.h" // skipped #endif #ifdef BLIS_FAMILY_THUNDERX2 #include "bli_family_thunderx2.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA57 #include "bli_family_cortexa57.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA53 #include "bli_family_cortexa53.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA15 #include "bli_family_cortexa15.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA9 #include "bli_family_cortexa9.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_FAMILY_POWER10 #include "bli_family_power10.h" // skipped #endif #ifdef BLIS_FAMILY_POWER9 #include "bli_family_power9.h" // skipped #endif #ifdef BLIS_FAMILY_POWER7 #include "bli_family_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_FAMILY_BGQ #include "bli_family_bgq.h" // skipped #endif // -- Generic -- #ifdef BLIS_FAMILY_GENERIC // begin bli_family_generic.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_generic.h #endif // // -- kernel set prototypes ---------------------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_KERNELS_SKX // begin bli_kernels_skx.h GEMM_UKR_PROT( float , s, gemm_skx_asm_32x12_l2 ) GEMM_UKR_PROT( float , s, gemm_skx_asm_12x32_l2 ) GEMM_UKR_PROT( double, d, gemm_skx_asm_16x12_l2 ) GEMM_UKR_PROT( double, d, gemm_skx_asm_16x14 ) // end bli_kernels_skx.h #endif #ifdef BLIS_KERNELS_KNL // begin bli_kernels_knl.h GEMM_UKR_PROT( float, s, gemm_knl_asm_24x16 ) GEMM_UKR_PROT( double, d, gemm_knl_asm_24x8 ) PACKM_KER_PROT( float, s, packm_knl_asm_24xk ) PACKM_KER_PROT( float, s, packm_knl_asm_16xk ) PACKM_KER_PROT( double, d, packm_knl_asm_24xk ) PACKM_KER_PROT( double, d, packm_knl_asm_8xk ) // unused: GEMM_UKR_PROT( double, d, gemm_knl_asm_12x16 ) GEMM_UKR_PROT( double, d, gemm_knl_asm_30x8 ) GEMM_UKR_PROT( double, d, gemm_knl_asm_8x24 ) PACKM_KER_PROT( double, d, packm_knl_asm_30xk ) // end bli_kernels_knl.h #endif #ifdef 
BLIS_KERNELS_KNC #include "bli_kernels_knc.h" // skipped #endif #ifdef BLIS_KERNELS_HASWELL // begin bli_kernels_haswell.h // -- level-1m ----------------------------------------------------------------- // packm (asm) PACKM_KER_PROT( float, s, packm_haswell_asm_6xk ) PACKM_KER_PROT( float, s, packm_haswell_asm_16xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_6xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_8xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_3xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_8xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_3xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_4xk ) // -- level-3 ------------------------------------------------------------------ // gemm (asm d6x8) GEMM_UKR_PROT( float, s, gemm_haswell_asm_6x16 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_6x8 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_3x8 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_3x4 ) // gemm (asm d8x6) GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // gemmtrsm_l (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_haswell_asm_6x8 ) // gemmtrsm_u (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 ) // gemm (asm d8x6) //GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) //GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) //GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) //GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // -- level-3 sup -------------------------------------------------------------- // -- single real -- // gemmsup_r GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( float, s, 
gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x2 ) 
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x1 ) 
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16n ) // -- double real -- // gemmsup_r GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( double, d, 
gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8n ) // gemmsup_rd GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( 
double, d, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8n ) // end bli_kernels_haswell.h #endif #ifdef BLIS_KERNELS_SANDYBRIDGE // begin bli_kernels_sandybridge.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_sandybridge_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_asm_4x4 ) // d8x4 (intrinsics) GEMM_UKR_PROT( float, s, gemm_sandybridge_int_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_int_4x4 ) // end bli_kernels_sandybridge.h #endif #ifdef BLIS_KERNELS_PENRYN // begin bli_kernels_penryn.h GEMM_UKR_PROT( float, s, gemm_penryn_asm_8x4 ) GEMM_UKR_PROT( double, d, gemm_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_l_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_u_penryn_asm_4x4 ) // end bli_kernels_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_KERNELS_ZEN2 // begin bli_kernels_zen2.h // -- level-1f -- AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) // end bli_kernels_zen2.h #endif #ifdef BLIS_KERNELS_ZEN // begin bli_kernels_zen.h // -- level-1m -- PACKM_KER_PROT(double, d, packm_8xk_gen_zen) PACKM_KER_PROT(double, d, packm_6xk_gen_zen) PACKM_KER_PROT(double, d, packm_8xk_nn_zen) PACKM_KER_PROT(double, d, 
packm_6xk_nn_zen) // -- level-1v -- // amaxv (intrinsics) AMAXV_KER_PROT( float, s, amaxv_zen_int ) AMAXV_KER_PROT( double, d, amaxv_zen_int ) // axpyv (intrinsics) AXPYV_KER_PROT( float, s, axpyv_zen_int ) AXPYV_KER_PROT( double, d, axpyv_zen_int ) // axpyv (intrinsics unrolled x10) AXPYV_KER_PROT( float, s, axpyv_zen_int10 ) AXPYV_KER_PROT( double, d, axpyv_zen_int10 ) // dotv (intrinsics) DOTV_KER_PROT( float, s, dotv_zen_int ) DOTV_KER_PROT( double, d, dotv_zen_int ) // dotv (intrinsics, unrolled x10) DOTV_KER_PROT( float, s, dotv_zen_int10 ) DOTV_KER_PROT( double, d, dotv_zen_int10 ) // dotxv (intrinsics) DOTXV_KER_PROT( float, s, dotxv_zen_int ) DOTXV_KER_PROT( double, d, dotxv_zen_int ) // scalv (intrinsics) SCALV_KER_PROT( float, s, scalv_zen_int ) SCALV_KER_PROT( double, d, scalv_zen_int ) // scalv (intrinsics unrolled x10) SCALV_KER_PROT( float, s, scalv_zen_int10 ) SCALV_KER_PROT( double, d, scalv_zen_int10 ) SCALV_KER_PROT( scomplex, c, scalv_zen_int10 ) // swapv (intrinsics) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // copyv (intrinsics) COPYV_KER_PROT( float, s, copyv_zen_int ) COPYV_KER_PROT( double, d, copyv_zen_int ) // setv (intrinsics) SETV_KER_PROT(float, s, setv_zen_int) SETV_KER_PROT(double, d, setv_zen_int) // -- level-1f -- // axpyf (intrinsics) AXPYF_KER_PROT( float, s, axpyf_zen_int_8 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_8 ) AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_16x4 ) AXPYF_KER_PROT( scomplex, c, axpyf_zen_int_4 ) // dotxf (intrinsics) DOTXF_KER_PROT( float, s, dotxf_zen_int_8 ) DOTXF_KER_PROT( double, d, dotxf_zen_int_8 ) // -- level-3 sup -------------------------------------------------------------- // gemmsup_rv //GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16 ) GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_zen_asm_5x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_4x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_1x1 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16n ) GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_zen_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x8m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16n) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x2 ) 
// gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x1 ) // end bli_kernels_zen.h #endif //#ifdef BLIS_KERNELS_EXCAVATOR //#include "bli_kernels_excavator.h" //#endif //#ifdef BLIS_KERNELS_STEAMROLLER //#include "bli_kernels_steamroller.h" //#endif #ifdef BLIS_KERNELS_PILEDRIVER // begin bli_kernels_piledriver.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_piledriver_asm_16x3 ) GEMM_UKR_PROT( double, d, gemm_piledriver_asm_8x3 ) GEMM_UKR_PROT( scomplex, c, gemm_piledriver_asm_4x2 ) GEMM_UKR_PROT( dcomplex, z, gemm_piledriver_asm_2x2 ) // end bli_kernels_piledriver.h #endif #ifdef BLIS_KERNELS_BULLDOZER // begin bli_kernels_bulldozer.h GEMM_UKR_PROT( float, s, gemm_bulldozer_asm_8x8_fma4 ) GEMM_UKR_PROT( double, d, gemm_bulldozer_asm_4x6_fma4 ) GEMM_UKR_PROT( scomplex, c, gemm_bulldozer_asm_8x4_fma4 ) GEMM_UKR_PROT( dcomplex, z, gemm_bulldozer_asm_4x4_fma4 ) // end bli_kernels_bulldozer.h #endif // -- ARM architectures -- #ifdef BLIS_KERNELS_ARMSVE #include "bli_kernels_armsve.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV8A #include "bli_kernels_armv8a.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV7A #include "bli_kernels_armv7a.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_KERNELS_POWER10 #include "bli_kernels_power10.h" // skipped #endif #ifdef BLIS_KERNELS_POWER9 #include "bli_kernels_power9.h" // skipped #endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef 
BLIS_KERNELS_BGQ #include "bli_kernels_bgq.h" // skipped #endif #endif // end bli_arch_config.h // begin bli_kernel_macro_defs.h #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H // -- Define default threading parameters -------------------------------------- // -- Conventional (large code path) values -- // These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n // dimensions for the purposes of factorizing the total number of threads into // ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these // macros are used. #ifndef BLIS_THREAD_RATIO_M #define BLIS_THREAD_RATIO_M 1 #endif #ifndef BLIS_THREAD_RATIO_N #define BLIS_THREAD_RATIO_N 1 #endif // These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of // parallelism allowed when performing automatic factorization. See bli_rntm.c // to see how these macros are used. #ifndef BLIS_THREAD_MAX_IR #define BLIS_THREAD_MAX_IR 1 #endif #ifndef BLIS_THREAD_MAX_JR #define BLIS_THREAD_MAX_JR 4 #endif #if 0 // -- Skinny/small possibly-unpacked (sup code path) values -- #ifndef BLIS_THREAD_SUP_RATIO_M #define BLIS_THREAD_SUP_RATIO_M 1 #endif #ifndef BLIS_THREAD_SUP_RATIO_N #define BLIS_THREAD_SUP_RATIO_N 2 #endif #ifndef BLIS_THREAD_SUP_MAX_IR #define BLIS_THREAD_SUP_MAX_IR 1 #endif #ifndef BLIS_THREAD_SUP_MAX_JR #define BLIS_THREAD_SUP_MAX_JR 8 #endif #endif // -- Memory allocation -------------------------------------------------------- // hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with // libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND // was explicitly defined. #ifdef BLIS_DISABLE_MEMKIND #undef BLIS_ENABLE_MEMKIND #endif #ifdef BLIS_ENABLE_MEMKIND #include // skipped #endif // Memory allocation functions. These macros define the three types of // malloc()-style functions, and their free() counterparts: one for each // type of memory to be allocated. 
// NOTE: Whatever replaces malloc()/free() in any of the three macro pairs
// below MUST have exactly the same prototype as malloc() and free():
//
//   void* malloc( size_t size );
//   void  free( void* p );
//

// Allocator used for blocks that live inside BLIS's internal memory pools.
#if !defined( BLIS_MALLOC_POOL )
  // When libmemkind was enabled at configure-time, pool blocks default to
  // hbw_malloc() rather than malloc().
  #if defined( BLIS_ENABLE_MEMKIND )
    #define BLIS_MALLOC_POOL hbw_malloc
  #else
    #define BLIS_MALLOC_POOL malloc
  #endif
#endif

// Matching deallocator for pool blocks.
#if !defined( BLIS_FREE_POOL )
  // When libmemkind was enabled at configure-time, pool blocks default to
  // hbw_free() rather than free().
  #if defined( BLIS_ENABLE_MEMKIND )
    #define BLIS_FREE_POOL hbw_free
  #else
    #define BLIS_FREE_POOL free
  #endif
#endif

// Allocator/deallocator for internally-used objects and structures, such as
// control tree nodes.
#if !defined( BLIS_MALLOC_INTL )
  #define BLIS_MALLOC_INTL malloc
#endif

#if !defined( BLIS_FREE_INTL )
  #define BLIS_FREE_INTL free
#endif

// Allocator/deallocator for objects created by user-level API functions,
// such as bli_obj_create().
#if !defined( BLIS_MALLOC_USER )
  #define BLIS_MALLOC_USER malloc
#endif

#if !defined( BLIS_FREE_USER )
  #define BLIS_FREE_USER free
#endif

// -- Other system-related definitions -----------------------------------------

// Size of a virtual memory page. This is used to align blocks within the
// memory pools.
#if !defined( BLIS_PAGE_SIZE )
  #define BLIS_PAGE_SIZE 4096
#endif

// The maximum number of named SIMD vector registers available for use.
// When configuring with umbrella configuration families, this should be
// set to the maximum number of registers across all sub-configurations in
// the family.
#if !defined( BLIS_SIMD_MAX_NUM_REGISTERS )
  #define BLIS_SIMD_MAX_NUM_REGISTERS 32
#endif

// Largest SIMD vector size, in bytes. For umbrella configuration families
// this should be the maximum SIMD size over all sub-configurations.
#if !defined( BLIS_SIMD_MAX_SIZE )
  #define BLIS_SIMD_MAX_SIZE 64
#endif

// Alignment (in bytes) required by the instruction set for aligned
// SIMD/vector instructions. Defaults to the maximum SIMD vector size.
#if !defined( BLIS_SIMD_ALIGN_SIZE )
  #define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_MAX_SIZE
#endif

// Upper bound, in bytes, on local stack buffers inside macro-kernel
// functions. Such a buffer usually holds a temporary copy of one microtile.
// The factor of 2 accommodates induced methods, which use real domain
// register blocksizes in units of complex elements: the macro-kernels need
// the larger micro-tile footprint even though the virtual micro-kernels only
// ever write half (real or imaginary part) at a time.
#if !defined( BLIS_STACK_BUF_MAX_SIZE )
  #define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_MAX_NUM_REGISTERS * BLIS_SIMD_MAX_SIZE * 2 )
#endif

// Alignment applied to those local stack buffers.
#if !defined( BLIS_STACK_BUF_ALIGN_SIZE )
  #define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Address alignment for memory obtained via BLIS_MALLOC_USER.
// To disable heap alignment, set this to 1.
#if !defined( BLIS_HEAP_ADDR_ALIGN_SIZE )
  #define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment used when sizing leading dimensions of memory obtained via
// BLIS_MALLOC_USER.
#if !defined( BLIS_HEAP_STRIDE_ALIGN_SIZE )
  #define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment sizes used when allocating blocks to the internal memory
// pool, via BLIS_MALLOC_POOL.
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A #define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B #define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C #define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN #define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE #endif // Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*. #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A #define BLIS_POOL_ADDR_OFFSET_SIZE_A 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B #define BLIS_POOL_ADDR_OFFSET_SIZE_B 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C #define BLIS_POOL_ADDR_OFFSET_SIZE_C 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif #endif // end bli_kernel_macro_defs.h // -- Base operation prototypes -- // begin bli_init.h BLIS_EXPORT_BLIS void bli_init( void ); BLIS_EXPORT_BLIS void bli_finalize( void ); void bli_init_auto( void ); void bli_finalize_auto( void ); void bli_init_apis( void ); void bli_finalize_apis( void ); void bli_init_once( void ); void bli_finalize_once( void ); // end bli_init.h // begin bli_malloc.h // Typedef function pointer types for malloc() and free() substitutes. 
//typedef void* (*malloc_ft) ( size_t size );
//typedef void (*free_ft) ( void* p );

// -----------------------------------------------------------------------------

// Pool allocation prototypes; currently compiled out.
#if 0
BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size );
BLIS_EXPORT_BLIS void bli_free_pool( void* p );
#endif

// Internal-use allocation routines. The allocating variants take an err_t*
// out-parameter (r_val) in addition to the requested size.
void* bli_malloc_intl( size_t size, err_t* r_val );
void* bli_calloc_intl( size_t size, err_t* r_val );
void bli_free_intl( void* p );

// User-level allocation routines (exported).
BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val );
BLIS_EXPORT_BLIS void bli_free_user( void* p );

// -----------------------------------------------------------------------------

// Allocation helpers parameterized on a malloc()/free()-style function
// pointer (malloc_ft / free_ft), with and without an alignment argument.
void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val );
void bli_ffree_align( free_ft f, void* p );

void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val );
void bli_ffree_noalign( free_ft f, void* p );

void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size );
void bli_fmalloc_post_check( void* p );

// end bli_malloc.h
// begin bli_const.h

void bli_const_init( void );
void bli_const_finalize( void );

// end bli_const.h
// begin bli_obj.h

// begin bli_obj_check.h

// Argument-checking counterparts of the bli_obj_*() / bli_dt_*() routines
// declared further below (same parameter lists, "_check" suffix).
void bli_obj_create_check( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj );
void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj );
void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj );
void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj );
void bli_obj_create_scalar_check( num_t dt, obj_t* obj );
void bli_obj_free_check( obj_t* obj );
void bli_obj_create_const_check( double value, obj_t* obj );
void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b );
void bli_dt_size_check( num_t dt );
void bli_dt_string_check( num_t dt );
void bli_dt_union_check( num_t dt1, num_t dt2 );
void bli_obj_print_check( char* label, obj_t* obj );

// end bli_obj_check.h

// Object creation, buffer management, and destruction (exported).
BLIS_EXPORT_BLIS void bli_obj_create ( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer ( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_create_without_buffer ( num_t dt, dim_t m, dim_t n, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_alloc_buffer ( inc_t rs, inc_t cs, inc_t is, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_attach_buffer ( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_create_1x1 ( num_t dt, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer ( num_t dt, void* p, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_create_conf_to ( obj_t* s, obj_t* d );
BLIS_EXPORT_BLIS void bli_obj_free ( obj_t* obj );

// Not exported. Takes rs, cs, and is by pointer.
void bli_adjust_strides ( dim_t m, dim_t n, siz_t elem_size, inc_t* rs, inc_t* cs, inc_t* is );

// Datatype queries and alignment helpers (exported).
BLIS_EXPORT_BLIS siz_t bli_dt_size ( num_t dt );
BLIS_EXPORT_BLIS char* bli_dt_string ( num_t dt );
BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult ( dim_t dim, dim_t dim_mult );
BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size ( dim_t dim, siz_t elem_size, siz_t align_size );
BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size ( void* p, size_t align_size );

BLIS_EXPORT_BLIS void bli_obj_print ( char* label, obj_t* obj );

// end bli_obj.h
// begin bli_obj_scalar.h

// Routines operating on the scalar associated with an obj_t (exported).
BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached ( num_t dt, obj_t* beta );
BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of ( num_t dt, conj_t conj, obj_t* alpha, obj_t* beta );
BLIS_EXPORT_BLIS void bli_obj_scalar_detach ( obj_t* a, obj_t* alpha );
BLIS_EXPORT_BLIS void bli_obj_scalar_attach ( conj_t conj, obj_t* alpha, obj_t* a );
BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to ( num_t dt, obj_t* a );
BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar ( obj_t* alpha, obj_t* a );
BLIS_EXPORT_BLIS void bli_obj_scalar_reset ( obj_t* a );
BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag ( obj_t* a );
BLIS_EXPORT_BLIS bool bli_obj_scalar_equals ( obj_t* a, obj_t* beta );

// end bli_obj_scalar.h
// begin bli_blksz.h

//
blksz_t query BLIS_INLINE dim_t bli_blksz_get_def ( num_t dt, blksz_t* b ) { return b->v[ dt ]; } BLIS_INLINE dim_t bli_blksz_get_max ( num_t dt, blksz_t* b ) { return b->e[ dt ]; } // blksz_t modification BLIS_INLINE void bli_blksz_set_def ( dim_t val, num_t dt, blksz_t* b ) { b->v[ dt ] = val; } BLIS_INLINE void bli_blksz_set_max ( dim_t val, num_t dt, blksz_t* b ) { b->e[ dt ] = val; } BLIS_INLINE void bli_blksz_copy ( blksz_t* b_src, blksz_t* b_dst ) { *b_dst = *b_src; } BLIS_INLINE void bli_blksz_copy_if_pos ( blksz_t* b_src, blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that // we can skip the ones that are non-positive. const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); } BLIS_INLINE void bli_blksz_copy_def_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); bli_blksz_set_def( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_max_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); 
bli_blksz_set_max( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_scale_def ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_def( dt, b ); bli_blksz_set_def( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_max( dt, b ); bli_blksz_set_max( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_def_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { bli_blksz_scale_def( num, den, dt, b ); bli_blksz_scale_max( num, den, dt, b ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS blksz_t* bli_blksz_create ( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_easy ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z ); BLIS_EXPORT_BLIS void bli_blksz_free ( blksz_t* b ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); #endif void bli_blksz_reduce_def_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); void bli_blksz_reduce_max_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); // 
----------------------------------------------------------------------------- dim_t bli_determine_blocksize ( dir_t direct, dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_b ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); dim_t bli_determine_blocksize_b_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); // end bli_blksz.h // begin bli_func.h // ----------------------------------------------------------------------------- // func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); // end bli_func.h // begin bli_mbool.h // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // 
// -----------------------------------------------------------------------------

// mbool_t creation, initialization, and destruction (declarations only).
mbool_t* bli_mbool_create ( bool b_s, bool b_d, bool b_c, bool b_z );

void bli_mbool_init ( mbool_t* b, bool b_s, bool b_d, bool b_c, bool b_z );

void bli_mbool_free( mbool_t* b );

// end bli_mbool.h
// begin bli_cntx.h

#ifndef BLIS_CNTX_H
#define BLIS_CNTX_H

// Context object type (defined in bli_type_defs.h)

// -----------------------------------------------------------------------------

//
// -- cntx_t query (fields only) -----------------------------------------------
//

// Each accessor below returns one field of the context unchanged; none of
// them index into the returned array.

// Returns cntx->blkszs.
BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx )
{
	return cntx->blkszs;
}

// Returns cntx->bmults.
BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx )
{
	return cntx->bmults;
}

// Returns cntx->l3_vir_ukrs.
BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx )
{
	return cntx->l3_vir_ukrs;
}

// Returns cntx->l3_nat_ukrs.
BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx )
{
	return cntx->l3_nat_ukrs;
}

// Returns cntx->l3_nat_ukrs_prefs.
BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx )
{
	return cntx->l3_nat_ukrs_prefs;
}

// Returns cntx->l3_sup_thresh.
BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx )
{
	return cntx->l3_sup_thresh;
}

// Returns cntx->l3_sup_handlers.
BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx )
{
	return cntx->l3_sup_handlers;
}

// Returns cntx->l3_sup_blkszs.
BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx )
{
	return cntx->l3_sup_blkszs;
}

// Returns cntx->l3_sup_kers.
BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx )
{
	return cntx->l3_sup_kers;
}

// Returns cntx->l3_sup_kers_prefs.
BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx )
{
	return cntx->l3_sup_kers_prefs;
}

// Returns cntx->l1f_kers.
BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx )
{
	return cntx->l1f_kers;
}

// Returns cntx->l1v_kers.
BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx )
{
	return cntx->l1v_kers;
}

// Returns cntx->packm_kers.
BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx )
{
	return cntx->packm_kers;
}

// Returns cntx->unpackm_kers.
BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx )
{
	return cntx->unpackm_kers;
}

// Returns cntx->method.
BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx )
{
	return cntx->method;
}

//
----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* 
bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* 
cntx ) { func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested packm func_t if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* funcs = bli_cntx_packm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the packm func_t (and then extract the // datatype-specific function pointer) if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested unpackm func_t if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the unpackm func_t (and then extract the // datatype-specific function pointer) if the unpackm kernel being // requested is one that is explicitly supported. 
if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. 
// NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. // NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. 
return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } #if 0 // NOTE: These static functions aren't needed yet. BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { const num_t dt = bli_obj_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); } #endif // ----------------------------------------------------------------------------- // // -- cntx_t modification (complex) -------------------------------------------- // // NOTE: The framework does not use any of the following functions. We provide // them in order to facilitate creating/modifying custom contexts. 
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif // end bli_rntm.h // begin bli_gks.h #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* 
bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif // end bli_gks.h // begin bli_ind.h #ifndef BLIS_IND_H #define BLIS_IND_H // level-3 induced method management // begin bli_l3_ind.h #ifndef BLIS_L3_IND_H #define BLIS_L3_IND_H // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- //bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ); void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ); void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ); bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ); #endif // end bli_l3_ind.h void bli_ind_init( void ); void bli_ind_finalize( void ); BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t 
method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); char* bli_ind_get_impl_string( ind_t method ); num_t bli_ind_map_cdt_to_index( num_t dt ); #endif // end bli_ind.h // begin bli_pba.h #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H // Packing block allocator (formerly memory broker) // pba init //BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ // bli_pthread_mutex_init( &(pba->mutex), NULL ); //} //BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ // bli_pthread_mutex_destroy( &(pba->mutex) ); //} // pba query BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { return &(pba->pools[ pool_index ]); } BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { return pba->align_size; } BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { return pba->malloc_fp; } BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { return pba->free_fp; } // pba modification BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { pba->align_size = align_size; } BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { pba->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { pba->free_fp = free_fp; } // pba action BLIS_INLINE void bli_pba_lock( pba_t* pba ) { bli_pthread_mutex_lock( &(pba->mutex) ); } BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( cntx_t* cntx ); void bli_pba_finalize ( void ); 
void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif // end bli_pba.h // begin bli_pool.h #ifndef BLIS_POOL_H #define BLIS_POOL_H // -- Pool block type -- // -- Pool type -- // Pool block query BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) { return pblk->buf; } BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) { return pblk->block_size; } // Pool block modification BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk ) { pblk->buf = buf; } BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) { pblk->block_size = block_size; } // // -- pool block initialization ------------------------------------------------ // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the pblk_t type definition. An alternative to the initializer is // calling bli_pblk_clear() at runtime. 
#define BLIS_PBLK_INITIALIZER \ { \ .buf = NULL, \ .block_size = 0, \ } \ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); bli_pblk_set_block_size( 0, pblk ); } // Pool entry query BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) { return pool->block_size; } BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) { return pool->align_size; } BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) { return pool->offset_size; } BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) { return pool->malloc_fp; } BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) { return pool->free_fp; } BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); } // Pool entry modification BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ { pool->block_size = block_size; } BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ { pool->align_size = align_size; } BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ { pool->offset_size = offset_size; } BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ { pool->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* 
pool ) \ { pool->free_fp = free_fp; } BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } // ----------------------------------------------------------------------------- void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ); void bli_pool_finalize ( pool_t* restrict pool ); void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ); void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ); void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ); void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ); void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ); void bli_pool_print ( pool_t* restrict pool ); void bli_pblk_print ( pblk_t* restrict pblk ); #endif // end bli_pool.h // begin bli_array.h #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // 
----------------------------------------------------------------------------- void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ); void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ); void bli_array_finalize ( array_t* restrict array ); void* bli_array_elem ( const siz_t index, array_t* restrict array ); void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ); #endif // end bli_array.h // begin bli_apool.h #ifndef BLIS_APOOL_H #define BLIS_APOOL_H // -- Locked pool-of-arrays type -- // apool entry query BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool ) { return &(apool->pool); } BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) { return &(apool->mutex); } BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) { return pool->def_array_len; } BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) { pool_t* restrict pool = bli_apool_pool( apool ); return bli_pool_is_exhausted( pool ); } // apool action BLIS_INLINE void bli_apool_lock( apool_t* apool ) { bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); } BLIS_INLINE void bli_apool_unlock( apool_t* apool ) { bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); } // apool entry modification BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ { pool->def_array_len = def_array_len; } // ----------------------------------------------------------------------------- void bli_apool_init ( apool_t* restrict apool ); void bli_apool_finalize ( apool_t* restrict apool ); array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ); void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ); pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ); void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool ); void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ); void bli_apool_free_block 
( array_t* restrict array ); #endif // end bli_apool.h // begin bli_sba.h #ifndef BLIS_SBA_H #define BLIS_SBA_H apool_t* bli_sba_query( void ); // ----------------------------------------------------------------------------- void bli_sba_init( void ); void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( const siz_t n_threads ); void bli_sba_checkin_array ( array_t* restrict array ); void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm ); void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size ); void bli_sba_release ( rntm_t* restrict rntm, void* restrict block ); #endif // end bli_sba.h // begin bli_memsys.h #ifndef BLIS_MEMSYS_H #define BLIS_MEMSYS_H // ----------------------------------------------------------------------------- void bli_memsys_init( void ); void bli_memsys_finalize( void ); #endif // end bli_memsys.h // begin bli_mem.h #ifndef BLIS_MEM_H #define BLIS_MEM_H // mem_t object type (defined in bli_type_defs.h) // // -- mem_t query -------------------------------------------------------------- // BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) { return &(mem->pblk); } BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) { return bli_pblk_buf( bli_mem_pblk( mem ) ); } BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) { return mem->buf_type; } BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) { return mem->pool; } BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) { return mem->size; } BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); } // // -- mem_t modification ------------------------------------------------------- // BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem ) { mem->pblk = *pblk; } BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem ) { bli_pblk_set_buf( buf, &(mem->pblk) ); } BLIS_INLINE void bli_mem_set_buf_type( packbuf_t 
buf_type, mem_t* mem ) { mem->buf_type = buf_type; } BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem ) { mem->pool = pool; } BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; } // // -- mem_t initialization ----------------------------------------------------- // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the mem_t type definition. An alternative to the initializer is // calling bli_mem_clear() at runtime. #define BLIS_MEM_INITIALIZER \ { \ .pblk = BLIS_PBLK_INITIALIZER, \ .buf_type = -1, \ .pool = NULL, \ .size = 0, \ } \ BLIS_INLINE void bli_mem_clear( mem_t* mem ) { bli_mem_set_buffer( NULL, mem ); #ifdef __cplusplus const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE; // When using C++, which is strongly typed, we avoid use of -1 as a // packbuf_t value since it will result in a compile-time error. bli_mem_set_buf_type( pb, mem ); #else bli_mem_set_buf_type( ( packbuf_t )-1, mem ); #endif bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); } #endif // end bli_mem.h // begin bli_part.h // begin bli_part_check.h void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); // end bli_part_check.h // -- Matrix partitioning ------------------------------------------------------ BLIS_EXPORT_BLIS void bli_acquire_mpart ( dim_t i, dim_t j, dim_t m, dim_t n, obj_t* obj, obj_t* sub_obj ); #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) GENPROT( acquire_mpart_b2t ) GENPROT( acquire_mpart_l2r ) GENPROT( acquire_mpart_r2l ) GENPROT( acquire_mpart_tl2br ) GENPROT( 
acquire_mpart_br2tl ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ dir_t direct, \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_mdim ) GENPROT( acquire_mpart_ndim ) GENPROT( acquire_mpart_mndim ) // -- Vector partitioning ------------------------------------------------------ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_vpart_f2b ) GENPROT( acquire_vpart_b2f ) // -- Scalar acquisition ------------------------------------------------------- BLIS_EXPORT_BLIS void bli_acquire_mij ( dim_t i, dim_t j, obj_t* obj, obj_t* sub_obj ); BLIS_EXPORT_BLIS void bli_acquire_vi ( dim_t i, obj_t* obj, obj_t* sub_obj ); // end bli_part.h // begin bli_prune.h void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ); // end bli_prune.h // begin bli_query.h BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a ); // end bli_query.h // begin bli_auxinfo.h #ifndef BLIS_AUXINFO_MACRO_DEFS_H #define BLIS_AUXINFO_MACRO_DEFS_H // auxinfo_t field query BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) { return ai->schema_a; } BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) { return ai->schema_b; } BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) { return ai->a_next; } BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) { return ai->b_next; } BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) { return ai->is_a; } BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) { return ai->is_b; } BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) { return ai->ps_a; } BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) { return ai->ps_b; } BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) { 
return ai->ukr; } BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) { return ai->params; } // auxinfo_t field modification BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai ) { ai->schema_a = schema; } BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) { ai->schema_b = schema; } BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) { ai->a_next = p; } BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) { ai->b_next = p; } BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; } BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai ) { ai->is_a = is; } BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) { ai->is_b = is; } BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai ) { ai->ps_a = ps; } BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) { ai->ps_b = ps; } BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { ai->ukr = ukr; } BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) { ai->params = params; } #endif // end bli_auxinfo.h // begin bli_param_map.h // --- BLIS to BLAS/LAPACK mappings -------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval ); // --- BLAS/LAPACK to BLIS mappings -------------------------------------------- // NOTE: These static functions were converted from regular functions in order // to reduce function call overhead within the BLAS compatibility layer. 
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( 
subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); err_t bli_check_valid_cntl( void* cntl ); err_t bli_check_packm_schema_on_unpack( obj_t* a ); err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); // end bli_check.h // begin bli_error.h BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); void bli_print_msg( char* str, char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); char* bli_error_string_for_code( gint_t code ); // end bli_error.h // begin bli_f2c.h // f2c.h -- Standard Fortran to C header file // barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." // - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) #ifndef BLIS_F2C_H #define BLIS_F2C_H typedef f77_int bla_integer; typedef f77_char bla_character; //typedef char *address; //typedef short int shortint; typedef float bla_real; typedef double bla_double; typedef scomplex bla_scomplex; typedef dcomplex bla_dcomplex; typedef f77_int bla_logical; //typedef short int shortlogical; //typedef char logical1; //typedef char integer1; #ifdef INTEGER_STAR_8 // Adjust for integer*8. 
// 64-bit integer aliases and bit helpers, compiled only when INTEGER_STAR_8
// is defined (the #ifdef is opened immediately before this span).
typedef long long longint; // system-dependent
typedef unsigned long long ulongint; // system-dependent
#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b)))
#define qbit_set(a,b) ((a) | ((ulongint)1 << (b)))
#endif

// Fortran-style logical constants; each is defined only if not already set.
#ifndef TRUE_
#define TRUE_ (1)
#endif

#ifndef FALSE_
#define FALSE_ (0)
#endif

// Extern is for use with -E
#ifndef Extern
#define Extern extern
#endif

// I/O stuff

// Both branches below end up with the same active typedef
// (ftnlen == bla_integer); only the commented-out alternatives differ.
#ifdef f2c_i2
// for -i2
//typedef short flag;
//typedef short ftnlen;
typedef bla_integer ftnlen;
//typedef short ftnint;
#else
//typedef long int flag;
//typedef long int ftnlen;
typedef bla_integer ftnlen;
//typedef long int ftnint;
#endif

#ifndef VOID
#define VOID void
#endif

// Function-like helper macros. Each argument is parenthesized but may be
// evaluated more than once, so avoid passing expressions with side effects.
#ifndef f2c_abs
#define f2c_abs(x) ((x) >= 0 ? (x) : -(x))
#endif
// NOTE(review): the f2c_d* macros cast to `doublereal`, which is not declared
// anywhere in the visible part of this header (the double typedef in
// bli_f2c.h is `bla_double`). Confirm `doublereal` is defined elsewhere
// before relying on f2c_dabs/f2c_dmin/f2c_dmax.
#ifndef f2c_dabs
#define f2c_dabs(x) (doublereal)f2c_abs(x)
#endif
#ifndef f2c_min
#define f2c_min(a,b) ((a) <= (b) ? (a) : (b))
#endif
#ifndef f2c_max
#define f2c_max(a,b) ((a) >= (b) ? (a) : (b))
#endif
#ifndef f2c_dmin
#define f2c_dmin(a,b) (doublereal)f2c_min(a,b)
#endif
#ifndef f2c_dmax
#define f2c_dmax(a,b) (doublereal)f2c_max(a,b)
#endif

// Single-bit test/clear/set helpers.
#ifndef bit_test
#define bit_test(a,b) ((a) >> (b) & 1)
#endif

// NOTE(review): bit_clear/bit_set cast to `uinteger`, which is likewise not
// declared in the visible part of this header -- confirm before use.
#ifndef bit_clear
#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b)))
#endif

#ifndef bit_set
#define bit_set(a,b) ((a) | ((uinteger)1 << (b)))
#endif

// undef any lower-case symbols that your C compiler predefines, e.g.:

// Define Skip_f2c_Undefs before including this header to keep these symbols.
#ifndef Skip_f2c_Undefs
#undef cray
#undef gcos
#undef mc68010
#undef mc68020
#undef mips
#undef pdp11
#undef sgi
#undef sparc
#undef sun
#undef sun2
#undef sun3
#undef sun4
#undef u370
#undef u3b
#undef u3b2
#undef u3b5
#undef unix
#undef vax
#endif

// Closes the BLIS_F2C_H include guard.
#endif
// end bli_f2c.h
// begin bli_machval.h

// begin bli_lsame.h

// The trailing ftnlen parameters follow the f2c convention of passing
// character-argument lengths after the regular arguments (presumably; the
// implementations are outside this span).
bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len );
// end bli_lsame.h
// begin bli_slamch.h

bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len );
// end bli_slamch.h
// begin bli_dlamch.h

bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len );
// end bli_dlamch.h


//
// Prototype object-based interface.
//

BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v );


//
// Prototype BLAS-like interfaces.
//

// GENTPROTR is (re)defined as a prototype template and then handed to
// INSERT_GENTPROTR_BASIC0, which is defined outside this span and presumably
// expands the template once per basic datatype.
#undef GENTPROTR
#define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \
     ( \
       machval_t mval, \
       void* v \
     );

INSERT_GENTPROTR_BASIC0( machval )

// end bli_machval.h
// begin bli_getopt.h

// State record passed to bli_getopt_init_state() and bli_getopt(). The field
// names mirror the POSIX getopt() globals (optarg, optind, opterr, optopt).
typedef struct getopt_s
{
    char* optarg;
    int optind;
    int opterr;
    int optopt;
} getopt_t;

BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state );

BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state );

// end bli_getopt.h
// begin bli_opid.h

// Return true when opid lies in the inclusive range [BLIS_GEMM, BLIS_TRSM].
// NOTE(review): this presumably relies on the level-3 opid_t enumerators being
// contiguous between those two values -- verify against the opid_t definition.
BLIS_INLINE bool bli_opid_is_level3( opid_t opid )
{
    return ( bool )
           ( BLIS_GEMM <= opid && opid <= BLIS_TRSM );
}

// end bli_opid.h
// begin bli_cntl.h

// -- Control tree prototypes --

BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node
     (
       rntm_t* rntm,
       opid_t family,
       bszid_t bszid,
       void_fp var_func,
       void* params,
       cntl_t* sub_node
     );

BLIS_EXPORT_BLIS void bli_cntl_free_node
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

BLIS_EXPORT_BLIS void bli_cntl_clear_node
     (
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_cntl_free
     (
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo
     (
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

BLIS_EXPORT_BLIS void bli_cntl_mark_family
     (
       opid_t family,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// Not exported (no BLIS_EXPORT_BLIS attribute).
dim_t bli_cntl_calc_num_threads_in
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// cntl_t query (fields only)

// Return the family field of the control tree node.
BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl )
{
    return cntl->family;
}
BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl ) { return cntl->bszid; } BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl ) { return cntl->var_func; } BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl ) { return cntl->sub_prenode; } BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl ) { return cntl->sub_node; } BLIS_INLINE void* bli_cntl_params( cntl_t* cntl ) { return cntl->params; } BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl ) { // The first 64 bytes is always the size of the params structure. return *( ( uint64_t* )(cntl->params) ); } BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) { return &(cntl->pack_mem); } // cntl_t query (complex) BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl ) { return ( bool ) ( cntl == NULL ); } BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl ) { return ( bool ) ( bli_cntl_sub_node( cntl ) == NULL ); } BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl ) { return ( bool ) ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); } // cntl_t modification BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl ) { cntl->family = family; } BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl ) { cntl->bszid = bszid; } BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl ) { cntl->var_func = var_func; } BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl ) { cntl->sub_prenode = sub_prenode; } BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl ) { cntl->sub_node = sub_node; } BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl ) { cntl->params = params; } BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) { cntl->pack_mem = *pack_mem; } // end bli_cntl.h // begin bli_env.h #ifndef BLIS_ENV_H #define BLIS_ENV_H gint_t bli_env_get_var( const char* env, gint_t fallback ); //void bli_env_set_var( const char* env, dim_t value ); #endif // end bli_env.h // begin bli_pack.h #ifndef BLIS_PACK_H #define BLIS_PACK_H 
// Prototypes from bli_pack.h. Only the pack_a/pack_b getters and setters are
// exported; the init/finalize functions and the rntm_t initializer carry no
// export attribute.
void bli_pack_init( void );
void bli_pack_finalize( void );

// The getters take a bool* destination; the setters take the new value.
BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a );
BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b );
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a );
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b );

void bli_pack_init_rntm_from_env( rntm_t* rntm );

// Closes the BLIS_PACK_H include guard opened immediately before this span.
#endif

// end bli_pack.h
// begin bli_info.h

// -- General library information ----------------------------------------------

// String-returning queries; each takes no arguments and returns a char*.
BLIS_EXPORT_BLIS char* bli_info_get_version_str( void );
BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void );


// -- General configuration-related --------------------------------------------

// Each query below takes no arguments and returns a gint_t.
BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void );
// Remaining configuration queries from bli_info.h; each takes no arguments
// and returns a gint_t.
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void );


// -- Kernel implementation-related --------------------------------------------


// -- Level-3 kernel definitions --

// Each query takes an ind_t method and a num_t datatype and returns a char*.
BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt );


// -- BLIS implementation query (level-3) --------------------------------------

// Each query takes a num_t datatype and returns a char*.
BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm3_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt );

// end bli_info.h
// begin bli_arch.h

#ifndef BLIS_ARCH_H
#define BLIS_ARCH_H

// Architecture-id prototypes. Only bli_arch_query_id() and bli_arch_string()
// are exported.
BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void );

void bli_arch_set_id_once( void );
void bli_arch_set_id( void );

BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id );

void bli_arch_set_logging( bool dolog );
bool bli_arch_get_logging( void );
// Variadic; the first (unnamed) parameter is presumably a printf-style format
// string -- confirm against the implementation.
void bli_arch_log( char*, ... );

#endif

// end bli_arch.h
// begin bli_cpuid.h

// The following block is compiled out (#if 0); it supplies stand-in
// definitions for building the cpuid code on its own.
#if 0
  // Used only during standalone testing of ARM support.
  #define FALSE 0
  #define TRUE 1
  typedef enum
  {
    BLIS_ARCH_CORTEXA57 = 10,
    BLIS_ARCH_CORTEXA15 = 11,
    BLIS_ARCH_CORTEXA9 = 12,
    BLIS_ARCH_GENERIC = 13
  } arch_t;
  typedef uint64_t bool;
  #define bli_abort abort
#endif

#ifndef BLIS_CPUID_H
#define BLIS_CPUID_H

arch_t bli_cpuid_query_id( void );

// Per-microarchitecture predicates. The x86 variants take
// (family, model, features); the ARM variants take (model, part, features).

// Intel
bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features );

// AMD
bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features );

// ARM
bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );

uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features );

// -----------------------------------------------------------------------------

//
// This section of the file was based off of cpuid.hpp from TBLIS [1].
//
// [1] https://github.com/devinamatthews/tblis
//

// Return true when every bit set in `want` is also set in `have`.
BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want )
{
    return ( have & want ) == want;
}

// -----------------------------------------------------------------------------

// x86 / x86-64 branch; the matching #elif/#endif follow this span.
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)

// cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393
// for more information why this move was made.
//#include "cpuid.h" void get_cpu_name( char *cpu_name ); int vpu_count( void ); enum { VENDOR_INTEL = 0, VENDOR_AMD, VENDOR_UNKNOWN }; enum { FEATURE_SSE3 = 0x0001, FEATURE_SSSE3 = 0x0002, FEATURE_SSE41 = 0x0004, FEATURE_SSE42 = 0x0008, FEATURE_AVX = 0x0010, FEATURE_AVX2 = 0x0020, FEATURE_FMA3 = 0x0040, FEATURE_FMA4 = 0x0080, FEATURE_AVX512F = 0x0100, FEATURE_AVX512DQ = 0x0200, FEATURE_AVX512PF = 0x0400, FEATURE_AVX512ER = 0x0800, FEATURE_AVX512CD = 0x1000, FEATURE_AVX512BW = 0x2000, FEATURE_AVX512VL = 0x4000 }; #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ); enum { VENDOR_ARM = 0, VENDOR_UNKNOWN }; enum { MODEL_ARMV7 = 0, MODEL_ARMV8, MODEL_UNKNOWN }; enum { FEATURE_NEON = 0x01, FEATURE_SVE = 0x02 }; #endif #endif // end bli_cpuid.h // begin bli_string.h void bli_string_mkupper( char* s ); // end bli_string.h // begin bli_setgetijm.h BLIS_EXPORT_BLIS err_t bli_setijm ( double ar, double ai, dim_t i, dim_t j, obj_t* b ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs \ ); INSERT_GENTPROT_BASIC0( setijm ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijm ( dim_t i, dim_t j, obj_t* b, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijm ) // end bli_setgetijm.h // begin bli_setgetijv.h BLIS_EXPORT_BLIS err_t bli_setijv ( double ar, double ai, dim_t i, obj_t* x ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ void* restrict x, inc_t incx \ ); INSERT_GENTPROT_BASIC0( 
setijv ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijv ( dim_t i, obj_t* x, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ void* restrict b, inc_t incx, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijv ) // end bli_setgetijv.h // begin bli_setri.h // -- setr --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setrm ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setrv ( obj_t* alpha, obj_t* x ); // -- seti --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setim ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setiv ( obj_t* alpha, obj_t* x ); // end bli_setri.h // begin bli_castm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castm ) INSERT_GENTPROT2_MIXDP0( castm ) // // Prototype object-based _check() function. // void bli_castm_check ( obj_t* a, obj_t* b ); // end bli_castm.h // begin bli_castnzm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castnzm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. 
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( addsc ) INSERT_GENTPROT_BASIC0( divsc ) INSERT_GENTPROT_BASIC0( mulsc ) INSERT_GENTPROT_BASIC0( subsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( invertsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTPROTR_BASIC0( absqsc ) INSERT_GENTPROTR_BASIC0( normfsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( sqrtsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTPROT_BASIC0( getsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( setsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTPROTR_BASIC0( unzipsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype_r* zeta_r, \ ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTPROTR_BASIC0( zipsc ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_igetsc ( dim_t* chi, double* zeta_r, double* zeta_i ); BLIS_EXPORT_BLIS void bli_isetsc ( double zeta_r, double zeta_i, dim_t* chi ); // end bli_l0_tapi.h // begin bli_l0_ft.h // // -- Level-0 function types 
--------------------------------------------------- // // addsc, divsc, subsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( addsc ) INSERT_GENTDEF( divsc ) INSERT_GENTDEF( subsc ) // invertsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTDEF( invertsc ) // mulsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( mulsc ) // absqsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTDEFR( absqsc ) // normfsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* norm \ ); INSERT_GENTDEFR( normfsc ) // sqrtsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( sqrtsc ) // getsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTDEF( getsc ) // setsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTDEF( setsc ) // unzipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTDEFR( unzipsc ) // zipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype_r* zeta_r, \ 
ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTDEFR( zipsc ) // end bli_l0_ft.h // Generate function pointer arrays for tapi functions. // begin bli_l0_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( absqsc ) GENPROT( normfsc ) GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( subsc ) GENPROT( invertsc ) GENPROT( sqrtsc ) GENPROT( unzipsc ) GENPROT( zipsc ) GENPROT( getsc ) GENPROT( setsc ) // end bli_l0_fpa.h // copysc // begin bli_copysc.h // // Prototype object-based interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENFRONT( copysc ) // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \ ( \ conj_t conjchi, \ void* chi, \ void* psi \ ); INSERT_GENTPROT2_BASIC0( copysc ) INSERT_GENTPROT2_MIX_D0( copysc ) INSERT_GENTPROT2_MIX_P0( copysc ) // end bli_copysc.h // end bli_l0.h // -- Level-1v operations -- // begin bli_l1v.h // begin bli_l1v_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h" // begin bli_l1v_ft_ker.h #ifndef BLIS_L1V_FT_KER_H #define BLIS_L1V_FT_KER_H // // -- Level-1v kernel function types ------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict index, \ cntx_t* cntx \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ cntx_t* cntx \ ); 
INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( xpbyv ) #endif // end bli_l1v_ft_ker.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1v_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyv ) // end 
bli_l1v_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1v_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyv ) // end bli_l1v_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed 
APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h
// begin bli_l1d_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Expert instantiation: with the definitions above, every prototype below is
// named with the BLIS_TAPI_EX_SUF suffix and ends with the cntx/rntm
// parameters. INSERT_GENTPROT_BASIC0 / INSERT_GENTPROTR_BASIC0 are defined
// elsewhere; presumably they expand GENTPROT / GENTPROTR once per datatype --
// TODO confirm.
//

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid is the only operation here generated through the "R" template: its
// alpha parameter has type ctype_r* while x keeps type ctype*.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//
// The typedefs below mirror the parameter lists of the prototypes above, as
// function-pointer types (names pasted by PASTECH3 from ch, opname, EX_SUF and
// tsuf).
//

// addd, copyd, subd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// This copy of bli_l1d_tapi.h follows bli_tapi_ba.h, where EX_SUF and
// BLIS_TAPI_EX_PARAMS are both defined as empty, so the prototypes and
// typedefs below are the basic (suffix-free, no cntx/rntm) variants.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid is the only operation here generated through the "R" template: its
// alpha parameter has type ctype_r* while x keeps type ctype*.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//

// addd, copyd, subd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1d_fpa.h

//
// Prototype function pointer query interface.
//
// For each level-1d operation, GENPROT declares one function that takes a
// num_t argument. Its return type is pasted from the operation name,
// BLIS_TAPI_EX_SUF and _vft; its name is pasted from the operation name,
// BLIS_TAPI_EX_SUF and _qfp.
//

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addd )
GENPROT( copyd )
GENPROT( subd )
GENPROT( axpyd )
GENPROT( scal2d )
GENPROT( invertd )
GENPROT( scald )
GENPROT( setd )
GENPROT( setid )
GENPROT( shiftd )
GENPROT( xpbyd )

// end bli_l1d_fpa.h

// end bli_l1d.h

// -- Level-1f operations --

// begin bli_l1f.h

// begin bli_l1f_check.h

//
// Prototype object-based check functions.
//

// Each GENTPROT( op ) below declares a void function named from the operation
// name and _check, taking only obj_t pointers. Every level-1f operation has
// its own operand list, so GENTPROT is redefined before each one.

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alphax, \
       obj_t* alphay, \
       obj_t* x, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( axpy2v )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* y \
     );

GENTPROT( axpyf )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* xt, \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho, \
       obj_t* z \
     );

GENTPROT( dotaxpyv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* at, \
       obj_t* a, \
       obj_t* w, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( dotxaxpyf )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
     );

GENTPROT( dotxf )

// end bli_l1f_check.h

// Define kernel function types.
// begin bli_l1f_ft_ker.h

// Unlike the API headers around it, this header has an include guard, so its
// typedefs are emitted only once.
#ifndef BLIS_L1F_FT_KER_H
#define BLIS_L1F_FT_KER_H

//
// -- Level-1f kernel function types -------------------------------------------
//
// Kernel types differ from the typed-API function types in three visible
// ways: the name suffix is the fixed token _ker instead of EX_SUF, every
// pointer operand is restrict-qualified, and the parameter list always ends
// with an explicit cntx_t* (no rntm_t*) rather than BLIS_TAPI_EX_PARAMS.
//

// axpy2v

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha1, \
       ctype* restrict alpha2, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict w, inc_t incw, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxaxpyf )

#endif

// end bli_l1f_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). 
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Expert instantiation: with the definitions above, every prototype below is
// named with the BLIS_TAPI_EX_SUF suffix and ends with the cntx/rntm
// parameters.
//

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// NOTE(review): two typedefs below use different parameter names than the
// matching prototypes above -- axpy2v (alpha1/alpha2 here vs. alphax/alphay)
// and dotaxpyv (dim_t m here vs. dim_t n). The parameter types and order are
// the same, so the declared function types still agree; the difference is in
// naming only.
//

// axpy2v

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
// (EX_SUF is deliberately defined as empty here.)
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// NOTE: BLIS_TAPI_EX_PARAMS was just defined as empty, so every prototype
// template below ends at its last listed operand in this pass. Each
// template is a GENTPROT definition that is immediately instantiated by
// INSERT_GENTPROT_BASIC0( opname ) (defined outside this section;
// presumably one expansion per supported datatype -- confirm there).
//

// axpy2v: two scalars (alphax, alphay), vectors x and y, output vector z.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf: matrix operand a (inca/lda strides), vectors x and y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv: vectors x, y, z plus the scalar output rho.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf: matrix operand a, vectors w, x, y, z, scalars alpha and beta.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf: matrix operand a, vectors x and y, scalars alpha and beta.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )
// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// Each GENTDEF below declares a function-pointer typedef whose parameter
// list mirrors the prototype of the same operation above. INSERT_GENTDEF
// is defined outside this section.
//

// axpy2v
// NOTE: the two scalars are named alpha1/alpha2 here but alphax/alphay in
// the axpy2v prototype above; parameter names are not part of the type,
// so the typedef and the prototype remain compatible.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
// NOTE: the length parameter is named m here but n in the dotaxpyv
// prototype above; only the name differs, the type (dim_t) is the same.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )
// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1f_fpa.h

//
// Prototype function pointer query interface.
//
// Each GENPROT( opname ) declares one query function taking a num_t
// datatype id. Both its return type and its name are pasted together from
// opname and BLIS_TAPI_EX_SUF, i.e. the expert suffix is used directly
// here rather than EX_SUF (which was un-defined just above).
//

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( axpy2v )
GENPROT( axpyf )
GENPROT( dotaxpyv )
GENPROT( dotxaxpyf )
GENPROT( dotxf )
// end bli_l1f_fpa.h
// end bli_l1f.h

// -- Level-1m operations --
// begin bli_l1m.h
// begin bli_l1m_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the replacement text begins with a comma on purpose: the macro is
// placed after the last regular parameter, which is written without a
// trailing comma in the templates below.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h
// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// First (expert) pass: EX_SUF is BLIS_TAPI_EX_SUF and every prototype ends
// in cntx_t* cntx, rntm_t* rntm.
//

// addm, copym, subm: matrices x and y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m: scalar alpha, matrices x and y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm: scalar alpha (with conjalpha) and a single matrix x; no
// transx parameter.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym: matrix x, scalar beta, matrix y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type variant; x uses ctype_x while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
// end bli_l1m_tapi.h
// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//

// addm, subm, copym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m (same parameter list as axpym)
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym, xpbym_md
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )
// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Second (basic) pass over the same templates: EX_SUF and
// BLIS_TAPI_EX_PARAMS are now empty, so the generated names carry no
// suffix and the parameter lists end at the last listed operand.
//

// addm, copym, subm: matrices x and y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m: scalar alpha, matrices x and y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm: scalar alpha (with conjalpha) and a single matrix x.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym: matrix x, scalar beta, matrix y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type variant; x uses ctype_x while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
// end bli_l1m_tapi.h
// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//

// addm, subm, copym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m (same parameter list as axpym)
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym, xpbym_md
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )
// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1m_fpa.h

//
// Prototype function pointer query interface.
//
// Both the return type and the function name are pasted from opname and
// BLIS_TAPI_EX_SUF directly (EX_SUF is un-defined at this point).
//

// Single-datatype query: one num_t argument.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
GENPROT( axpym )
GENPROT( scal2m )
GENPROT( scalm )
GENPROT( setm )
GENPROT( xpbym )

// Two-datatype query (_qfp2): separate num_t arguments for x and y.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );

GENPROT( xpbym_md )
// end bli_l1m_fpa.h

// Prototype level-1m implementations.
// begin bli_l1m_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( addm ) INSERT_GENTPROT_BASIC0( copym ) INSERT_GENTPROT_BASIC0( subm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( axpym ) INSERT_GENTPROT_BASIC0( scal2m ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( xpbym ) #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ void PASTEMAC3(chx,chy,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT2_BASIC0( xpbym_md ) INSERT_GENTPROT2_MIXDP0( xpbym_md ) // end bli_l1m_unb_var1.h // Pack-related // begin bli_packm.h // begin bli_packm_alloc.h BLIS_EXPORT_BLIS void* 
bli_packm_alloc ( siz_t size_needed, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void* bli_packm_alloc_ex ( siz_t size_needed, packbuf_t pack_buf_type, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_alloc.h // begin bli_packm_cntl.h struct packm_params_s { uint64_t size; // size field must be present and come first. bszid_t bmid_m; bszid_t bmid_n; bool does_invert_diag; bool rev_iter_if_upper; bool rev_iter_if_lower; pack_t pack_schema; packbuf_t pack_buf_type; }; typedef struct packm_params_s packm_params_t; BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m; } BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n; } BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower; } BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema; } BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type; } // ----------------------------------------------------------------------------- cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, bszid_t bmid_m, bszid_t bmid_n, bool does_invert_diag, bool rev_iter_if_upper, bool rev_iter_if_lower, pack_t pack_schema, packbuf_t pack_buf_type, cntl_t* sub_node ); // end bli_packm_cntl.h // begin 
bli_packm_check.h void bli_packm_init_check ( obj_t* a, obj_t* p, cntx_t* cntx ); void bli_packm_int_check ( obj_t* a, obj_t* p, cntx_t* cntx ); // end bli_packm_check.h // begin bli_packm_init.h BLIS_EXPORT_BLIS bool bli_packm_init ( obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_init.h // begin bli_packm_int.h void bli_packm_int ( obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_int.h // begin bli_packm_scalar.h BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p ); // end bli_packm_scalar.h // begin bli_packm_part.h // -- Matrix partitioning ------------------------------------------------------ void bli_packm_acquire_mpart_t2b( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_packm_acquire_mpart_l2r( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p ); // end bli_packm_part.h // begin bli_packm_struc_cxk.h #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_struc_cxk ) INSERT_GENTPROT_BASIC0( packm_herm_cxk ) INSERT_GENTPROT_BASIC0( packm_tri_cxk ) // end bli_packm_struc_cxk.h // begin bli_packm_struc_cxk_1er.h #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ 
bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er ) INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er ) INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er ) // end bli_packm_struc_cxk_1er.h // begin bli_packm_cxk.h #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t panel_dim, \ dim_t panel_dim_max, \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_cxk ) // end bli_packm_cxk.h // begin bli_packm_cxk_1er.h #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t panel_dim, \ dim_t panel_dim_max, \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ ); INSERT_GENTPROTCO_BASIC0( packm_cxk_1er ) // end bli_packm_cxk_1er.h // Mixed datatype support. 
#ifdef BLIS_ENABLE_GEMM_MD
// begin bli_packm_struc_cxk_md.h

// Mixed-datatype packm prototypes; only declared when BLIS_ENABLE_GEMM_MD is
// defined. In the first template the operand c is typed ctype_c while kappa
// and p are typed ctype_p.

#undef  GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype_p* restrict kappa, \
       ctype_c* restrict c, inc_t incc, inc_t ldc, \
       ctype_p* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )

// Second template: a (ctype_a) is the input operand, kappa and p are ctype_p.
// Instantiated for both the 1e and 1r variants.

#undef  GENTPROT2
#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t conja, \
       dim_t m, \
       dim_t n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p, inc_t ldp \
     );

INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )

INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )

// end bli_packm_struc_cxk_md.h
#endif

// begin bli_packm_blk_var1.h

//
// packm params types.
//

typedef struct
{
	// Kernel function table, indexed as [type of C][type of P].
	packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;

//
// Prototype object-based interfaces.
//

BLIS_EXPORT_BLIS void bli_packm_blk_var1
     (
       obj_t* c,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* t
     );

// end bli_packm_blk_var1.h

// end bli_packm.h
// begin bli_unpackm.h

// begin bli_unpackm_cntl.h

// Parameter record stored in a control-tree node's params field for unpackm.
struct unpackm_params_s
{
	uint64_t size; // size field must be present and come first.
	unpackm_var_oft var_func;
};
typedef struct unpackm_params_s unpackm_params_t;

// Accessor: reinterprets cntl->params as unpackm_params_t* and reads var_func.
#define bli_cntl_unpackm_params_var_func( cntl ) \
\
	( ( (unpackm_params_t*)(cntl)->params )->var_func )

// -----------------------------------------------------------------------------

cntl_t* bli_unpackm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       void_fp unpackm_var_func,
       cntl_t* sub_node
     );

// end bli_unpackm_cntl.h
// begin bli_unpackm_check.h

void bli_unpackm_int_check
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx
     );

// end bli_unpackm_check.h
// begin bli_unpackm_int.h

void bli_unpackm_int
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_unpackm_int.h

// begin bli_unpackm_blk_var1.h

void bli_unpackm_blk_var1
     (
       obj_t* p,
       obj_t* c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// Typed unpackm_blk_var1 prototypes. Note that p and c are passed as void*
// here, so ctype does not appear in the generated signature.

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       doff_t diagoffc, \
       diag_t diagc, \
       uplo_t uploc, \
       trans_t transc, \
       dim_t m, \
       dim_t n, \
       dim_t m_panel, \
       dim_t n_panel, \
       void* p, inc_t rs_p, inc_t cs_p, \
                dim_t pd_p, inc_t ps_p, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )

// end bli_unpackm_blk_var1.h

// begin bli_unpackm_cxk.h

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conjp, \
       dim_t panel_dim, \
       dim_t panel_len, \
       ctype* kappa, \
       ctype* p, inc_t ldp, \
       ctype* a, inc_t inca, inc_t lda, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_cxk )

// end bli_unpackm_cxk.h

// end bli_unpackm.h
// end bli_l1m.h

// -- Level-2 operations --

// begin bli_l2.h

// begin bli_l2_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// Expert instantiation of the level-2 typed API: EX_SUF and
// BLIS_TAPI_EX_PARAMS come from bli_tapi_ex.h, so every prototype and
// function type below ends with the cntx/rntm parameters.

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( gemv )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( ger )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )

// her is the only level-2 typed prototype whose alpha uses ctype_r.

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( her )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( syr )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )

// end bli_l2_tapi.h
// begin bli_l2_ft.h

//
// -- Level-2 function types ---------------------------------------------------
//

// Function-pointer typedefs whose parameter lists mirror the typed
// prototypes above, one per operation.

// gemv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( gemv )

// ger

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( ger )

// hemv, symv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( her )

// syr

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( syr )

// her2, syr2

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )

// end bli_l2_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS         cntx_t* cntx = NULL; ( void )cntx; \
                                   rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l2_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Basic instantiation of the level-2 typed API: EX_SUF and
// BLIS_TAPI_EX_PARAMS are defined empty above, so the prototypes and function
// types below have no suffix and no trailing cntx/rntm parameters.

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( gemv )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( ger )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )

// her is the only level-2 typed prototype whose alpha uses ctype_r.

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( her )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( syr )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )

// end bli_l2_tapi.h
// begin bli_l2_ft.h

//
// -- Level-2 function types ---------------------------------------------------
//

// Function-pointer typedefs whose parameter lists mirror the typed
// prototypes above, one per operation.

// gemv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( gemv )

// ger

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( ger )

// hemv, symv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( her )

// syr

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( syr )

// her2, syr2

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )

// end bli_l2_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
//

// Takes an existing control tree (cntl_orig) and reports the tree to use
// through the cntl_use out-parameter. NOTE(review): whether a new tree is
// actually allocated depends on the definition, which is not visible here.
void bli_l3_cntl_create_if
     (
       opid_t family,
       pack_t schema_a,
       pack_t schema_b,
       obj_t* a,
       obj_t* b,
       obj_t* c,
       rntm_t* rntm,
       cntl_t* cntl_orig,
       cntl_t** cntl_use
     );

// Counterpart of bli_l3_cntl_create_if for the tree returned in cntl_use.
void bli_l3_cntl_free
     (
       rntm_t* rntm,
       cntl_t* cntl_use,
       thrinfo_t* thread
     );

// end bli_l3_cntl.h
// begin bli_l3_check.h

//
// Prototype object-based check functions.
//

// Each GENPROT below declares "<opname>_check" (name built by
// PASTEMAC(opname,_check)). The four templates differ only in which operands
// the operation family takes.

// Operations with alpha, a, b, beta, c.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )

// Operations that additionally take a side argument.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )

// Operations with a single input matrix a (no b operand).
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( herk )
GENPROT( syrk )

// Operations with side, alpha, a, b and no beta/c operands.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       cntx_t* cntx \
     );

GENPROT( trmm )
GENPROT( trsm )

// -----------------------------------------------------------------------------

// Explicitly prototyped "basic" check functions. Note that the herk and
// her2k versions take additional operands (ah; bh and ah) compared with the
// macro-generated herk_check/her2k_check prototypes above.

void bli_gemm_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_gemmt_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_hemm_basic_check
     (
       side_t side,
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_herk_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_her2k_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* bh,
       obj_t* b,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_l3_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

// end bli_l3_check.h
// begin bli_l3_int.h

// Internal level-3 entry point: the operands of the basic interface plus a
// context, a runtime object, a control tree node and thread info.
void bli_l3_int
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_int.h
// begin bli_l3_packab.h

// bli_l3_packa and bli_l3_packb share one signature (a, b, c, cntx, rntm,
// cntl, thread).
void bli_l3_packa
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

void bli_l3_packb
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_packab.h

// Define function types.
//#include "bli_l3_ft_ex.h"
// begin bli_l3_ft_ukr.h

#ifndef BLIS_L3_FT_UKR_H
#define BLIS_L3_FT_UKR_H

//
// -- Level-3 micro-kernel function types --------------------------------------
//

// Each GENTDEF below typedefs a function-pointer type whose name is built by
// PASTECH3(ch,opname,_ukr,tsuf). INSERT_GENTDEF is defined elsewhere and
// presumably instantiates the typedef once per datatype -- confirm.

// gemm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict beta, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemm )

// gemmtrsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a1x, \
       ctype* restrict a11, \
       ctype* restrict bx1, \
       ctype* restrict b11, \
       ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemmtrsm )

// trsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( trsm )

#endif

// end bli_l3_ft_ukr.h
// begin bli_l3_oft.h

#ifndef BLIS_L3_OFT_H
#define BLIS_L3_OFT_H

//
// -- Level-3 object function types --------------------------------------------
//

// Each GENTDEF( opname ) below typedefs a function-pointer type named by
// PASTECH(opname,_oft); the parameter lists mirror the expert object
// interfaces (operands followed by cntx and rntm).

// gemm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( gemm )
GENTDEF( gemmt )
GENTDEF( her2k )
GENTDEF( syr2k )

// hemm, symm, trmm3

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( hemm )
GENTDEF( symm )
GENTDEF( trmm3 )

// herk, syrk

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( herk )
GENTDEF( syrk )

// trmm, trsm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( trmm )
GENTDEF( trsm )

#endif

// end bli_l3_oft.h
// begin bli_l3_oft_var.h

#ifndef BLIS_L3_OFT_VAR_H
#define BLIS_L3_OFT_VAR_H

//
// -- Level-3 variant function types -------------------------------------------
//

// Function-pointer type for level-3 variants; instantiated once, for the
// name "l3" (i.e. PASTECH(l3,_var_oft)).
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       cntl_t* cntl, \
       thrinfo_t* thread \
     );

GENTDEF( l3 )

#endif

// end bli_l3_oft_var.h

// begin bli_l3_blocksize.h

// Generic kc query; unlike the per-operation versions below it also takes
// the control tree node.
dim_t bli_l3_determine_kc
     (
       dir_t direct,
       dim_t i,
       dim_t dim,
       obj_t* a,
       obj_t* b,
       bszid_t bszid,
       cntx_t* cntx,
       cntl_t* cntl
     );

// Per-operation kc queries that take an explicit direction.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dir_t direct, \
       dim_t i, \
       dim_t dim, \
       obj_t* a, \
       obj_t* b, \
       bszid_t bszid, \
       cntx_t* cntx \
     );

GENPROT( gemm_determine_kc )
GENPROT( gemmt_determine_kc )
GENPROT( trmm_determine_kc )
GENPROT( trsm_determine_kc )

// Per-operation kc queries with the direction encoded in the name suffix
// (_f / _b; presumably forward / backward -- confirm).
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t i, \
       dim_t dim, \
       obj_t* a, \
       obj_t* b, \
       bszid_t bszid, \
       cntx_t* cntx \
     );

GENPROT( gemm_determine_kc_f )
GENPROT( gemm_determine_kc_b )

GENPROT( gemmt_determine_kc_f )
GENPROT( gemmt_determine_kc_b )

GENPROT( trmm_determine_kc_f )
GENPROT( trmm_determine_kc_b )

GENPROT( trsm_determine_kc_f )
GENPROT( trsm_determine_kc_b )

// end bli_l3_blocksize.h
// begin bli_l3_direct.h

// Generic direction query; takes the control tree node in addition to the
// three matrix operands.
dir_t bli_l3_direct
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// Per-operation direction queries (operands only).
#undef GENPROT
#define GENPROT( opname ) \
\
dir_t PASTEMAC0(opname) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm_direct )
GENPROT( gemmt_direct )
GENPROT( trmm_direct )
GENPROT( trsm_direct )

// end bli_l3_direct.h
// begin bli_l3_prune.h

// Generic pruning functions, one per dimension name (m, n, k); the name is
// built as l3_prune_unref_mparts_<dim>.
#undef GENPROT
#define GENPROT( dim ) \
\
void PASTEMAC(l3_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c, \
       cntl_t* cntl \
     );

GENPROT( m )
GENPROT( n )
GENPROT( k )

// -----------------------------------------------------------------------------

// Per-operation pruning functions, one per (operation, dimension) pair; the
// name is built as <opname>_prune_unref_mparts_<dim>.
#undef GENPROT
#define GENPROT( opname, dim ) \
\
void PASTEMAC2(opname,_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm, m )
GENPROT( gemm, n )
GENPROT( gemm, k )

GENPROT( gemmt, m )
GENPROT( gemmt, n )
GENPROT( gemmt, k )

GENPROT( trmm, m )
GENPROT( trmm, n )
GENPROT( trmm, k )

GENPROT( trsm, m )
GENPROT( trsm, n )
GENPROT( trsm, k )

// end bli_l3_prune.h
// begin bli_l3_schema.h

void bli_l3_set_schemas
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx
     );

// end bli_l3_schema.h

// Prototype object APIs (basic and expert).
// begin bli_l3_oapi.h

//
// Prototype object-based interfaces (basic).
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h

//
// Prototype BLAS-like interfaces with typed operands (basic).
//

// Each template below declares an exported void function named by
// PASTEMAC(ch,opname). Every matrix operand is passed as a pointer followed
// by its row stride and column stride (rs_*, cs_*).

// gemm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( gemm )

// hemm, symm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       conj_t conja, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )

// herk: both scalars (alpha and beta) use the ctype_r type.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype_r* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROTR_BASIC0( herk )

// her2k: alpha uses ctype, only beta uses ctype_r.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROTR_BASIC0( her2k )

// syrk
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( syrk )

// gemmt, syr2k
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )

// trmm3
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( trmm3 )

// trmm, trsm: no beta and no c operand.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi.h
// begin bli_l3_tapi_ex.h

//
// Prototype BLAS-like interfaces with typed operands (expert).
//

// The expert typed prototypes repeat the basic typed ones with two trailing
// parameters (cntx, rntm). Names are built by
// PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF); BLIS_TAPI_EX_SUF is defined
// elsewhere.

// gemm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemm )

// hemm, symm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       conj_t conja, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )

// herk: both scalars (alpha and beta) use the ctype_r type.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype_r* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( herk )

// her2k: alpha uses ctype, only beta uses ctype_r.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( her2k )

// syrk
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( syrk )

// gemmt, syr2k
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )

// trmm3
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm3 )

// trmm, trsm: no beta and no c operand.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi_ex.h

// Define function types for small/unpacked handlers/kernels.
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
#undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_gemm_md_c2r_ref.h // Define a local struct type that makes returning two values easier. typedef struct mddm_s { dom_t comp; dom_t exec; } mddm_t; void bli_gemm_md ( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_local, cntx_t** cntx ); mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); // ----------------------------------------------------------------------------- void bli_gemm_md_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); void bli_gemm_md_zgemm ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary 
if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crr is already unconditionally associated with an // execution domain of BLIS_REAL.) if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_REAL ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since ccr is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) if ( bli_obj_is_complex( c ) && bli_obj_is_complex( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crc is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) 
if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_complex( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemm_md_ker_var2_recast ( num_t* dt_comp, num_t dt_a, num_t dt_b, num_t* dt_c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, obj_t* c, inc_t* rs_c, inc_t* cs_c ) { if ( bli_is_real( *dt_c ) && bli_is_complex( dt_a ) && bli_is_complex( dt_b ) ) { // The rcc case is executed with a real macrokernel, so we need to // double the k dimension (because both A and B are packed to the 1r // schema), and also the panel strides of A and B since they were // packed as complex matrices and we now need to convert them to // units of real elements. *k *= 2; *ps_a *= 2; *ps_b *= 2; } else if ( bli_is_complex( *dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_row_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *n *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; } else #endif { // Generally speaking, the crc case is executed with a complex // macrokernel, so we need to halve the panel stride of A (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. 
*ps_a /= 2; } } else if ( bli_is_complex( *dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_col_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *m *= 2; *pd_a *= 2; *ps_a *= 2; *cs_c *= 2; } else #endif { // Generally speaking, the ccr case is executed with a complex // macrokernel, so we need to halve the panel stride of B (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. *ps_b /= 2; } } #if 0 else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. //printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k ); } else if ( bli_is_complex( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { // No action needed. 
} #endif } // end bli_gemm_md.h #endif // end bli_gemm.h // begin bli_hemm.h // begin bli_hemm_front.h void bli_hemm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_hemm_front.h // end bli_hemm.h // begin bli_symm.h // begin bli_symm_front.h void bli_symm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_symm_front.h // end bli_symm.h // begin bli_trmm.h // begin bli_trmm_front.h void bli_trmm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm_front.h // begin bli_trmm_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); //GENPROT( trmm_blk_var1 ) //GENPROT( trmm_blk_var2 ) //GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) GENPROT( trmm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
//

// Typed gemmt macro-kernel prototypes. Compared with the trmm/trsm typed
// prototypes, the diagonal offset is named diagoffc and a and b each carry
// an extra is_a / is_b stride argument.
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffc, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread  \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

// end bli_gemmt_var.h
// end bli_gemmt.h
// end bli_l3.h

// -- Utility operations --

// begin bli_util.h
// begin bli_util_check.h

//
// Prototype object-based check functions.
//

// Each GENPROT below is redefined for one signature shape and then
// instantiated for every operation that shares it. The generated function
// name is the operation name pasted with "_check".

#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  asum  \
     );

GENPROT( asumv )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x  \
     );

GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm  \
     );

GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )


// Same shape as the vector norms above, instantiated for the matrix norms.
#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm  \
     );

GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x  \
     );

GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  scale, \
       obj_t*  sumsq  \
     );

GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// NOTE(review): this one prototype is generated through GENTPROT rather than
// GENPROT, unlike every other check prototype in this section. It leaves
// GENTPROT defined as a one-argument macro until the next "#undef GENTPROT".
#undef  GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  chi, \
       obj_t*  psi, \
       bool*   is_eq  \
     );

GENTPROT( eqsc )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqv )
GENPROT( eqm )


#undef  GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// -----------------------------------------------------------------------------

// Hand-written check helpers. Their parameter lists match the generated
// *_check prototypes above.
// NOTE(review): presumably these are the shared implementations the
// generated checks forward to -- confirm against the definitions.

void bli_utilv_xi_check
     (
       obj_t*  x,
       obj_t*  index
     );

void bli_utilv_xa_check
     (
       obj_t*  x,
       obj_t*  asum
     );

void bli_utilm_mkhst_check
     (
       obj_t*  a
     );

void bli_utilv_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_fprint_check
     (
       FILE*  file,
       char*  s1,
       obj_t* x,
       char*  format,
       char*  s2
     );

void bli_utilm_rand_check
     (
       obj_t*  x
     );

void bli_utilv_sumsqv_check
     (
       obj_t*  x,
       obj_t*  scale,
       obj_t*  sumsq
     );

// end bli_util_check.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The expansion starts with a comma so it can be appended directly after the
// last regular parameter.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_util_oapi.h


//
// Prototype object-based interfaces.
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes in this guarded region take no expert parameters and use
// PASTEMAC0 (no suffix), so they are only emitted while BLIS_OAPI_BASIC is
// defined.
#ifdef BLIS_OAPI_BASIC

#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )


#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*   file, \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )


// Same as fprintv/fprintm but without the FILE* argument.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
// EX_SUF is defined as empty here, so pasted names carry no suffix.
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes in this guarded region take no expert parameters and use
// PASTEMAC0 (no suffix), so they are only emitted while BLIS_OAPI_BASIC is
// defined.
#ifdef BLIS_OAPI_BASIC

#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )


#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*   file, \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )


// Same as fprintv/fprintm but without the FILE* argument.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h


// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The expansion starts with a comma so it can be appended directly after the
// last regular parameter.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_util_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
//

// GENTPROT / GENTPROTR are redefined per signature shape and instantiated
// through the INSERT_* macros. The function name pastes ch, the operation
// name, and EX_SUF; BLIS_TAPI_EX_PARAMS is appended after the last regular
// parameter. The "R" forms additionally take ctype_r, used for outputs such
// as asum, norm, scale and sumsq.

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( asumv )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( mkherm )
INSERT_GENTPROT_BASIC0( mksymm )
INSERT_GENTPROT_BASIC0( mktrim )


#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( norm1v )
INSERT_GENTPROTR_BASIC0( normfv )
INSERT_GENTPROTR_BASIC0( normiv )


#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( norm1m )
INSERT_GENTPROTR_BASIC0( normfm )
INSERT_GENTPROTR_BASIC0( normim )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( randv )
INSERT_GENTPROT_BASIC0( randnv )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( randm )
INSERT_GENTPROT_BASIC0( randnm )


#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.

// These prototypes take no expert parameters and use PASTEMAC (no suffix),
// so they are only emitted while BLIS_TAPI_BASIC is defined.
#ifdef BLIS_TAPI_BASIC

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqsc )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqv )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqm )


// printv/printm take the operand as void* and are instantiated through the
// _I flavor of the INSERT macro.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  n, \
       void*  x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printv )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       void*  x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h


//
// -- Utility function types ---------------------------------------------------
//

// Each GENTDEF / GENTDEFR below defines a function-pointer typedef whose
// name pastes ch, the operation name, EX_SUF and tsuf. The parameter lists
// parallel the typed prototypes above.

// asumv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// NOTE(review): the fprintv/fprintm types paste EX_SUF into the type name
// but do not append BLIS_TAPI_EX_PARAMS, and they sit outside the
// BLIS_TAPI_BASIC guard below -- confirm this is intended.

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.

// These types use PASTECH2 (no EX_SUF in the name) and are only emitted
// while BLIS_TAPI_BASIC is defined.
#ifdef BLIS_TAPI_BASIC

// eqsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h
// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 
INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.
// These prototypes are named with PASTEMAC(ch,opname) -- no EX_SUF and no
// BLIS_TAPI_EX_PARAMS -- and are emitted only while BLIS_TAPI_BASIC is defined.

#ifdef BLIS_TAPI_BASIC

// eqsc
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )


// eqv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )


// eqm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )


// printv (x is passed as void*)
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t n, \
       void* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )


// printm (x is passed as void*)
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t m, \
       dim_t n, \
       void* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//
// Each group below generates function-pointer typedefs whose names are
// pasted together from ch, opname, EX_SUF and tsuf.
//

// asumv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// (the parameter list does not end in BLIS_TAPI_EX_PARAMS)

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm
// (the parameter list does not end in BLIS_TAPI_EX_PARAMS)

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.
// These typedef names are pasted from ch, opname and tsuf only (no EX_SUF).

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_util_fpa.h

//
// Prototype function pointer query interface.
//

// Each GENPROT( opname ) declares a function taking a num_t and returning
// the corresponding "_vft" type. This first form pastes BLIS_TAPI_EX_SUF
// into both the return-type name and the function name.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( asumv )
GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )
GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )
GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )
GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )
GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// Second form: same shape, but no suffix is pasted into the names.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )
GENPROT( fprintv )
GENPROT( fprintm )
//GENPROT( printv )
//GENPROT( printm )

// end bli_util_fpa.h

// Prototype level-1m implementations.
// begin bli_util_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Unlike the typed-API prototypes, the *_unb_var1 prototypes below spell out
// cntx_t* cntx and rntm_t* rntm as explicit trailing parameters (except the
// eqv/eqm variants, which take neither and return bool), and they carry no
// BLIS_EXPORT_BLIS annotation.
//

// asumv_unb_var1
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( asumv_unb_var1 )


// mkherm_unb_var1, mksymm_unb_var1, mktrim_unb_var1
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( mkherm_unb_var1 )
INSERT_GENTPROT_BASIC0( mksymm_unb_var1 )
INSERT_GENTPROT_BASIC0( mktrim_unb_var1 )


// norm1v_unb_var1, normfv_unb_var1, normiv_unb_var1
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfv_unb_var1 )
INSERT_GENTPROTR_BASIC0( normiv_unb_var1 )


// norm1m_unb_var1, normfm_unb_var1, normim_unb_var1
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfm_unb_var1 )
INSERT_GENTPROTR_BASIC0( normim_unb_var1 )


// randv_unb_var1, randnv_unb_var1
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randv_unb_var1 )
INSERT_GENTPROT_BASIC0( randnv_unb_var1 )


// randm_unb_var1, randnm_unb_var1
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randm_unb_var1 )
INSERT_GENTPROT_BASIC0( randnm_unb_var1 )


// sumsqv_unb_var1
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 )

// -----------------------------------------------------------------------------

// eqv_unb_var1 (returns bool; no cntx/rntm parameters)
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
     );

INSERT_GENTPROT_BASIC0( eqv_unb_var1 )


// eqm_unb_var1 (returns bool; no cntx/rntm parameters)
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
     );

INSERT_GENTPROT_BASIC0( eqm_unb_var1 )


// fprintv (exported; takes the destination FILE* as its first argument)
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintv )


// fprintm (exported; takes the destination FILE* as its first argument)
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintm )

// end bli_util_unb_var1.h
// end bli_util.h

// -- addon definitions --

// NOTE: These definitions should not be included much earlier since an addon
// may wish to utilize other types and definitions provided by BLIS.
// begin bli_addon.h

#ifndef BLIS_ADDON_H
#define BLIS_ADDON_H

// The condition is the literal 0, so BLIS_DISABLE_ADDONS is what gets
// defined in this copy of the header.
#if 0
#define BLIS_ENABLE_ADDONS
#else
#define BLIS_DISABLE_ADDONS
#endif

// Enabled addons


#endif
// end bli_addon.h

// -- sandbox implementation --

// begin bli_sbox.h

#ifndef BLIS_SBOX_H
#define BLIS_SBOX_H

// Each sandbox must have a bli_sandbox.h file present somewhere inside.
// If a sandbox was enabled at configure-time, we need to #include its
// header file here so that it will get pulled into blis.h when it is
// flattened into a monolithic header.
#ifdef BLIS_ENABLE_SANDBOX
#include "bli_sandbox.h" // skipped
#endif

#endif
// end bli_sbox.h

// -- BLAS compatibility layer --

// begin bli_blas.h

// If the CBLAS compatibility layer was enabled while the BLAS layer
// was not enabled, we must enable it here.
#ifdef BLIS_ENABLE_CBLAS
#ifndef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS
#endif
#endif // BLIS_ENABLE_CBLAS

// By default, if the BLAS compatibility layer is enabled, we define
// (include) all of the BLAS prototypes. However, if the user is
// #including "blis.h" and also #including another header that also
// declares the BLAS functions, then we provide an opportunity to
// #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below).
#ifdef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS_DEFS
#else
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the BLAS test drivers are being
// compiled.
#ifdef BLIS_VIA_BLASTEST
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the environment has defined the
// macro BLIS_DISABLE_BLAS_DEFS.
#ifdef BLIS_DISABLE_BLAS_DEFS
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Begin including all BLAS prototypes.
#ifdef BLIS_ENABLE_BLAS_DEFS


// -- System headers needed by BLAS compatibility layer --

// NOTE(review): the header name is missing from the #include below in this
// copy of the file -- confirm the intended system header against upstream
// bli_blas.h before relying on this section.
#include // skipped


// -- Constants --

// Size of the buffer used to build a routine name for xerbla: 7 characters
// plus one more byte (written as 7+1).
#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)


// -- Utility macros --

// The bla_* helpers below are declared without an export annotation.

// begin bla_r_sign.h

#ifdef BLIS_ENABLE_BLAS

double bla_r_sign(const bla_real *a, const bla_real *b);

#endif
// end bla_r_sign.h
// begin bla_d_sign.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_sign(const bla_double *a, const bla_double *b);

#endif
// end bla_d_sign.h

// begin bla_r_cnjg.h

#ifdef BLIS_ENABLE_BLAS

void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src);

#endif
// end bla_r_cnjg.h
// begin bla_d_cnjg.h

#ifdef BLIS_ENABLE_BLAS

void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src);

#endif
// end bla_d_cnjg.h

// begin bla_r_imag.h

#ifdef BLIS_ENABLE_BLAS

bla_real bla_r_imag(const bla_scomplex *z);

#endif
// end bla_r_imag.h
// begin bla_d_imag.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_imag(const bla_dcomplex *z);

#endif
// end bla_d_imag.h

// begin bla_c_div.h

#ifdef BLIS_ENABLE_BLAS

void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp);

#endif
// end bla_c_div.h
// begin bla_z_div.h

#ifdef BLIS_ENABLE_BLAS

void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp);

#endif
// end bla_z_div.h

// begin bla_f__cabs.h

#ifdef BLIS_ENABLE_BLAS

double bla_f__cabs(double real, double imag);

#endif
// end bla_f__cabs.h
// begin bla_r_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_r_abs(const bla_real *x);

#endif
// end bla_r_abs.h
// begin bla_d_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_abs(const bla_double *x);

#endif
// end bla_d_abs.h
// begin bla_c_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_c_abs(const bla_scomplex *z);

#endif
// end bla_c_abs.h
// begin bla_z_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_z_abs(const bla_dcomplex *z);

#endif
// end bla_z_abs.h

// begin bla_lsame.h

#ifdef BLIS_ENABLE_BLAS

// lsame is declared with long return/length types when LAPACK_ILP64 is
// defined and with int otherwise. Both variants carry BLIS_EXPORT_BLAS so
// that the declaration has the same export annotation in either
// configuration (the ILP64 variant previously had none).
#ifdef LAPACK_ILP64
BLIS_EXPORT_BLAS long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
#else
BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
#endif

#endif
// end bla_lsame.h
// begin bla_xerbla.h

#ifdef BLIS_ENABLE_BLAS

// xerbla additionally carries the BLIS_OVERRIDABLE annotation.
BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);

#endif
// end bla_xerbla.h
// begin bla_xerbla_array.h

#ifdef BLIS_ENABLE_BLAS

// Note the argument order differs from xerbla: the name length comes second
// and is passed by value.
BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);

#endif
// end bla_xerbla_array.h


// -- Level-0 BLAS prototypes --

// begin bla_cabs1.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS bla_real PASTEF77(s,cabs1)(bla_scomplex *z);
BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);

#endif
// end bla_cabs1.h


// -- Level-1 BLAS prototypes --

// begin bla_amax.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// amax: the generated function returns f77_int and its name is built with an
// "i" prefix followed by chx and blasname.
#undef GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end bla_amax.h
// begin bla_asum.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// asum: the generated function returns ftype_r and its name is built from
// chr, chx and blasname.
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end bla_asum.h
// begin bla_axpy.h


//
// Prototype BLAS-to-BLIS interfaces.
//
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
             ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif

// end bla_axpy.h
// begin bla_copy.h


//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( copy ) #endif // end bla_copy.h // begin bla_dot.h #ifdef BLIS_ENABLE_BLAS // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTR_BLAS( dot ) #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL INSERT_GENTPROTDOTC_BLAS( dot ) #else // For the "intel" complex return type, we use a hidden parameter (passed by // address) to return the result. #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \ ( \ ftype* rhop, \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTC_BLAS( dot ) #endif // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS float PASTEF77(sd,sdot) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); BLIS_EXPORT_BLAS double PASTEF77(d,sdot) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); #endif // end bla_dot.h // begin bla_nrm2.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end bla_nrm2.h // begin bla_rot.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s); #endif // end bla_rot.h // begin bla_rotg.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s); BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s); #endif // end bla_rotg.h // begin bla_rotm.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam); BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam); #endif // end bla_rotm.h // begin bla_rotmg.h 
#ifdef BLIS_ENABLE_BLAS

// rotmg: declared for the s and d variants only. The fourth argument
// (sy1/dy1) is the only const one.
BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam);
BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);

#endif
// end bla_rotmg.h

// begin bla_scal.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// scal: alpha and x have separate type parameters (ftype_a, ftype_x); the
// function name is pasted in the order chx, cha, blasname.
#undef GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
     ( \
       const f77_int* n, \
       const ftype_a* alpha, \
             ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif

// end bla_scal.h
// begin bla_swap.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// swap: both x and y are non-const.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       ftype* x, const f77_int* incx, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif

// end bla_swap.h

// begin f77_amax_sub.h


//
// Prototype CBLAS subroutine wrapper interfaces.
//
// The "sub" wrappers return void and write their result through a trailing
// rval pointer. They are instantiated only when BLIS_ENABLE_CBLAS is defined.
#undef GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
             f77_int* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end f77_amax_sub.h
// begin f77_asum_sub.h


//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
             ftype_r* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end f77_asum_sub.h

// begin f77_dot_sub.h


//
// Prototype CBLAS subroutine wrapper interfaces.
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTDOT_BLAS( dot ) // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval ); BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* rval ); #endif // end f77_dot_sub.h // begin f77_nrm2_sub.h // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end f77_nrm2_sub.h // -- Level-2 BLAS prototypes -- // dense // begin bla_gemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemv ) #endif // end bla_gemv.h // begin bla_ger.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, chxy, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \ ( \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTDOT_BLAS( ger ) #endif // end bla_ger.h // begin bla_hemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemv ) #endif // end bla_hemv.h // begin bla_her.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype_r* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her ) #endif // end bla_her.h // begin bla_her2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2 ) #endif // end bla_her2.h // begin bla_symv.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( symv ) #endif // end bla_symv.h // begin bla_syr.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr ) #endif // end bla_syr.h // begin bla_syr2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr2 ) #endif // end bla_syr2.h // begin bla_trmv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmv ) #endif // end bla_trmv.h // begin bla_trsv.h // // Prototype BLAS-to-BLIS interfaces. 
//

// Prototype template for trsv; the signature is identical to the trmv
// template (a is const, x is the only non-const operand).
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int* m, \
       const ftype* a, const f77_int* lda, \
       ftype* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif
// end bla_trsv.h

// begin bla_gemv_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for gemv, written as a statement macro. It records in
// info a code for the first failing test (invalid transa, negative m or n,
// lda smaller than max(1,*m), zero incx or incy). If info is nonzero it
// formats dt_str and op_str into an upper-cased name, calls xerbla with it,
// and executes `return;` -- so it may only be expanded inside a function
// returning void.
// NOTE(review): the info values look like the 1-based position of the
// offending argument in the BLAS routine's parameter list -- confirm
// against the reference BLAS before changing any of them.
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
    f77_int info = 0; \
    f77_int nota, ta, conja; \
\
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
    if ( !nota && !ta && !conja ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *n < 0 ) \
        info = 3; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 6; \
    else if ( *incx == 0 ) \
        info = 8; \
    else if ( *incy == 0 ) \
        info = 11; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemv_check.h
// begin bla_ger_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for ger. Same pattern as bla_gemv_check, except that the
// reported name is built from three pieces (dt_str, op_str, conj_str) and
// there are no character arguments to validate.
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
    f77_int info = 0; \
\
    if ( *m < 0 ) \
        info = 1; \
    else if ( *n < 0 ) \
        info = 2; \
    else if ( *incx == 0 ) \
        info = 5; \
    else if ( *incy == 0 ) \
        info = 7; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 9; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
\
        sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_ger_check.h
// begin bla_hemv_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for hemv (also reused for symv via bla_symv_check).
// uploa must compare equal to "L" or "U" according to lsame; on failure
// the macro reports through xerbla and returns from the enclosing function.
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 5; \
    else if ( *incx == 0 ) \
        info = 7; \
    else if ( *incy == 0 ) \
        info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_hemv_check.h
// begin bla_her_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for her (also reused for syr via bla_syr_check): validates
// uploa, m, incx and lda, in that order.
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *incx == 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 7; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her_check.h
// begin bla_her2_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for her2 (also reused for syr2 via bla_syr2_check):
// validates uploa, m, incx, incy and lda, in that order.
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *incx == 0 ) \
        info = 5; \
    else if ( *incy == 0 ) \
        info = 7; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 9; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her2_check.h
// begin bla_symv_check.h


#ifdef BLIS_ENABLE_BLAS

// symv shares the hemv argument check verbatim.
#define bla_symv_check bla_hemv_check

#endif
// end bla_symv_check.h
// begin bla_syr_check.h


#ifdef BLIS_ENABLE_BLAS

// syr shares the her argument check verbatim.
#define bla_syr_check bla_her_check

#endif
// end bla_syr_check.h
// begin bla_syr2_check.h


#ifdef BLIS_ENABLE_BLAS

// syr2 shares the her2 argument check verbatim.
#define bla_syr2_check bla_her2_check

#endif
// end bla_syr2_check.h
// begin bla_trmv_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for trmv (also reused for trsv via bla_trsv_check).
// Validates the three character arguments (uploa in {L,U}, transa in
// {N,T,C}, diaga in {U,N}, each compared with lsame), then m, lda and incx.
// On failure it reports through xerbla and returns from the enclosing
// function.
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
    f77_int nota, ta, conja; \
    f77_int unita, nonua; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    unita = PASTEF770(lsame)( diaga, "U", (ftnlen)1, (ftnlen)1 ); \
    nonua = PASTEF770(lsame)( diaga, "N", (ftnlen)1, (ftnlen)1 ); \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !ta && !conja ) \
        info = 2; \
    else if ( !unita && !nonua ) \
        info = 3; \
    else if ( *m < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 6; \
    else if ( *incx == 0 ) \
        info = 8; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_trmv_check.h
// begin bla_trsv_check.h


#ifdef BLIS_ENABLE_BLAS

// trsv shares the trmv argument check verbatim.
#define bla_trsv_check bla_trmv_check

#endif
// end bla_trsv_check.h

// packed
// begin bla_hpmv.h


#ifdef BLIS_ENABLE_BLAS

// Packed-storage routines. Unlike the templated prototypes above, these
// are spelled out once per datatype, use the bla_* typedefs, and are
// declared to return int rather than void.
BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *ap, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hpmv.h
// begin bla_hpr.h


#ifdef BLIS_ENABLE_BLAS

// hpr: alpha uses the real type (bla_real / bla_double) while x and ap use
// the corresponding complex type.
BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_scomplex *x, const bla_integer *incx, bla_scomplex *ap);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap);

#endif
// end bla_hpr.h
// begin bla_hpr2.h


#ifdef BLIS_ENABLE_BLAS

// hpr2: complex alpha, two const vector operands, ap is updated in place.
BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *y, const bla_integer *incy, bla_scomplex *ap);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap);

#endif
// end bla_hpr2.h
// begin bla_spmv.h


#ifdef BLIS_ENABLE_BLAS

// spmv: declared for the d and s datatypes only.
BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *ap, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_spmv.h
// begin bla_spr.h


#ifdef BLIS_ENABLE_BLAS

// spr: declared for the d and s datatypes only; ap is updated in place.
BLIS_EXPORT_BLAS int PASTEF77(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, bla_double *ap);
BLIS_EXPORT_BLAS int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap);

#endif
// end bla_spr.h
// begin bla_spr2.h


#ifdef BLIS_ENABLE_BLAS

// spr2: declared for the d and s datatypes only; ap is updated in place.
BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, const bla_double *y, const bla_integer *incy, bla_double *ap);
BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, const bla_real *y, const bla_integer *incy, bla_real *ap);

#endif
// end bla_spr2.h
// begin bla_tpmv.h


#ifdef BLIS_ENABLE_BLAS

// tpmv: declared for all four datatypes; ap is const and x is updated in
// place.
BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpmv.h
// begin bla_tpsv.h


#ifdef BLIS_ENABLE_BLAS

// tpsv: same argument list as tpmv.
BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpsv.h

// banded
// begin bla_gbmv.h


#ifdef BLIS_ENABLE_BLAS

// Banded-storage routines follow the same hand-written, int-returning
// style as the packed routines. gbmv takes two extra integer arguments,
// kl and ku, ahead of alpha.
BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer * incx, const bla_real *beta, bla_real *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex * y, const bla_integer *incy);

#endif
// end bla_gbmv.h
// begin bla_hbmv.h
#ifdef BLIS_ENABLE_BLAS

// hbmv: hand-written prototypes for the c and z datatypes, declared to
// return int. k is an extra integer argument ahead of alpha; y is the only
// non-const operand.
BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hbmv.h
// begin bla_sbmv.h


#ifdef BLIS_ENABLE_BLAS

// sbmv: same argument list as hbmv, declared for the d and s datatypes.
BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_sbmv.h
// begin bla_tbmv.h


#ifdef BLIS_ENABLE_BLAS

// tbmv: declared for all four datatypes; a is const and x is updated in
// place.
BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbmv.h
// begin bla_tbsv.h


#ifdef BLIS_ENABLE_BLAS

// tbsv: same argument list as tbmv.
BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbsv.h


// -- Level-3 BLAS prototypes --

// begin bla_gemm.h


//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemm ) #endif // end bla_gemm.h // begin bla_hemm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemm ) #endif // end bla_hemm.h // begin bla_herk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype_r* alpha, \ const ftype* a, const f77_int* lda, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( herk ) #endif // end bla_herk.h // begin bla_her2k.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2k ) #endif // end bla_her2k.h // begin bla_symm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( symm ) #endif // end bla_symm.h // begin bla_syrk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syrk ) #endif // end bla_syrk.h // begin bla_syr2k.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syr2k ) #endif // end bla_syr2k.h // begin bla_trmm.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ ftype* b, const f77_int* ldb \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmm ) #endif // end bla_trmm.h // begin bla_trsm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ ftype* b, const f77_int* ldb \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trsm ) #endif // end bla_trsm.h // begin bla_gemm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, notb; \ f77_int conja, conjb; \ f77_int ta, tb; \ f77_int nrowa, nrowb; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ if ( notb ) { nrowb = *k; } \ else { nrowb = *n; } \ \ if ( !nota && !conja && !ta ) \ info = 1; \ else if ( !notb && !conjb && !tb ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *k < 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 8; \ else if ( *ldb < bli_max( 1, nrowb ) ) \ info = 10; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 13; \ 
\ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_gemm_check.h // begin bla_hemm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int left, right; \ f77_int lower, upper; \ f77_int nrowa; \ \ left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \ right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( left ) { nrowa = *m; } \ else { nrowa = *n; } \ \ if ( !left && !right ) \ info = 1; \ else if ( !lower && !upper ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldb < bli_max( 1, *m ) ) \ info = 9; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 12; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_hemm_check.h // begin bla_herk_check.h #ifdef BLIS_ENABLE_BLAS #define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \ { \ f77_int info = 0; \ f77_int nota, conja; \ f77_int lower, upper; \ f77_int nrowa; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !conja ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ 
else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 10; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_herk_check.h // begin bla_her2k_check.h #ifdef BLIS_ENABLE_BLAS #define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, conja; \ f77_int lower, upper; \ f77_int nrowa; \ \ nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !conja ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldb < bli_max( 1, nrowa ) ) \ info = 9; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 12; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_her2k_check.h // begin bla_symm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_symm_check bla_hemm_check #endif // end bla_symm_check.h // begin bla_syrk_check.h #ifdef BLIS_ENABLE_BLAS #define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \ { \ f77_int info = 0; \ f77_int is_r; \ f77_int nota, ta, cta; \ f77_int lower, upper; \ f77_int nrowa; \ \ static char* dt_cst = dt_str; \ \ is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); 
\ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ cta = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !ta && (is_r ? !cta : 1) ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 10; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_syrk_check.h // begin bla_syr2k_check.h #ifdef BLIS_ENABLE_BLAS #define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int is_r; \ f77_int nota, ta, cta; \ f77_int lower, upper; \ f77_int nrowa; \ \ static char* dt_cst = dt_str; \ \ is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \ nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \ cta = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !ta && (is_r ? 
!cta : 1) ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldb < bli_max( 1, nrowa ) ) \ info = 9; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 12; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_syr2k_check.h // begin bla_trmm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \ { \ f77_int info = 0; \ f77_int left, right; \ f77_int lower, upper; \ f77_int nota, ta, conja; \ f77_int unita, nonua; \ f77_int nrowa; \ \ left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \ right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ unita = PASTEF770(lsame)( diaga, "U", (ftnlen)1, (ftnlen)1 ); \ nonua = PASTEF770(lsame)( diaga, "N", (ftnlen)1, (ftnlen)1 ); \ \ if ( left ) { nrowa = *m; } \ else { nrowa = *n; } \ \ if ( !left && !right ) \ info = 1; \ else if ( !lower && !upper ) \ info = 2; \ else if ( !nota && !ta && !conja ) \ info = 3; \ else if ( !unita && !nonua ) \ info = 4; \ else if ( *m < 0 ) \ info = 5; \ else if ( *n < 0 ) \ info = 6; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 9; \ else if ( *ldb < bli_max( 1, *m ) ) \ info = 11; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } 
#endif
// end bla_trmm_check.h
// begin bla_trsm_check.h


#ifdef BLIS_ENABLE_BLAS

// trsm shares the trmm argument check verbatim.
#define bla_trsm_check bla_trmm_check

#endif
// end bla_trsm_check.h


// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype template for axpby. Each expansion declares one exported void
// function named by PASTEF77(ch,blasname); y is the only non-const operand.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       const ftype* beta, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif
// end bla_axpby.h

// level-3
// begin bla_gemmt.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype template for gemmt. Compared with the gemm template it adds a
// leading uploc character and has no n argument.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int* m, \
       const f77_int* k, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* b, const f77_int* ldb, \
       const ftype* beta, \
       ftype* c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif
// end bla_gemmt.h
// begin bla_gemmt_check.h


#ifdef BLIS_ENABLE_BLAS

// Argument check for gemmt, written as a statement macro. It validates
// uploc and the two trans characters with lsame, derives the row counts
// expected for a and b (note that b falls back to *m, not *n, when transb
// is not "N"), and records in info a code for the first failing test. If
// info is nonzero it reports through xerbla and executes `return;` -- so
// it may only be expanded inside a function returning void.
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, notb; \
    f77_int conja, conjb; \
    f77_int ta, tb; \
    f77_int lower, upper; \
    f77_int nrowa, nrowb; \
\
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else { nrowb = *m; } \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja && !ta ) \
        info = 2; \
    else if ( !notb && !conjb && !tb ) \
        info = 3; \
    else if ( *m < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h


//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype template for gemm_batch. Every gemm argument becomes an array
// (the matrix operands are pointer-to-pointer), followed by group_count
// and group_size.
// NOTE(review): the array lengths and the relationship between group_count
// and group_size are not visible here -- check the implementation.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa_array, \
       const f77_char* transb_array, \
       const f77_int* m_array, \
       const f77_int* n_array, \
       const f77_int* k_array, \
       const ftype* alpha_array, \
       const ftype** a_array, const f77_int* lda_array, \
       const ftype** b_array, const f77_int* ldb_array, \
       const ftype* beta_array, \
       ftype** c_array, const f77_int* ldc_array, \
       const f77_int* group_count, \
       const f77_int* group_size \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif
// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h


//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( gemm3m ) #endif // end bla_gemm3m.h // begin bla_gemm3m_check.h #ifdef BLIS_ENABLE_BLAS #define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, notb; \ f77_int conja, conjb; \ f77_int ta, tb; \ f77_int nrowa, nrowb; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ if ( notb ) { nrowb = *k; } \ else { nrowb = *n; } \ \ if ( !nota && !conja && !ta ) \ info = 1; \ else if ( !notb && !conjb && !tb ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *k < 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 8; \ else if ( *ldb < bli_max( 1, nrowb ) ) \ info = 10; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 13; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_gemm3m_check.h // -- Fortran-compatible APIs to BLIS functions -- // begin b77_thread.h // // Prototype Fortran-compatible BLIS interfaces. 
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); // end b77_thread.h #endif // BLIS_ENABLE_BLAS // end bli_blas.h // -- CBLAS compatibility layer -- // begin bli_cblas.h #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. // begin cblas.h #ifndef CBLAS_H #define CBLAS_H #include // skipped // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. 
// Exactly one of BLIS_ARCH_64 / BLIS_ARCH_32 is defined: 64-bit is chosen
// when any of the listed 64-bit target macros is present, 32-bit otherwise.
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
    defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64)
#define BLIS_ARCH_64
#else
#define BLIS_ARCH_32
#endif

// Determine the target operating system.
//
// Every BLIS_OS_* macro is defined to 1 so that it can be tested uniformly
// with either `#ifdef BLIS_OS_x` or `#if BLIS_OS_x` (an undefined macro
// evaluates to 0 in #if). BLIS_OS_EMSCRIPTEN and BLIS_OS_NONE used to be
// defined with an empty replacement list, which made `#if BLIS_OS_NONE` a
// preprocessor error while `#if BLIS_OS_WINDOWS` etc. worked.
//
// The order of the tests matters: the first matching branch wins (for
// example, the Hurd test precedes the Apple/Mach test and the Android test
// precedes the Linux test).
#if defined(BLIS_ENABLE_SYSTEM)
#if defined(_WIN32) || defined(__CYGWIN__)
#define BLIS_OS_WINDOWS 1
#elif defined(__gnu_hurd__)
#define BLIS_OS_GNU 1
#elif defined(__APPLE__) || defined(__MACH__)
#define BLIS_OS_OSX 1
#elif defined(__ANDROID__)
#define BLIS_OS_ANDROID 1
#elif defined(__linux__)
#define BLIS_OS_LINUX 1
#elif defined(__bgq__)
#define BLIS_OS_BGQ 1
#elif defined(__bg__)
#define BLIS_OS_BGP 1
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
      defined(__bsdi__) || defined(__DragonFly__) || \
      defined(__FreeBSD_kernel__) || defined(__HAIKU__)
#define BLIS_OS_BSD 1
#elif defined(EMSCRIPTEN)
#define BLIS_OS_EMSCRIPTEN 1
#else
#error "Cannot determine operating system"
#endif
#else // #if defined(BLIS_DISABLE_SYSTEM)
#define BLIS_OS_NONE 1
#endif

// A few changes that may be necessary in Windows environments.
#if BLIS_OS_WINDOWS

// Include Windows header file.
// NOTE(review): the header name is missing from the #include below in this
// copy of the file (apparently lost in extraction) -- restore from upstream.
#define WIN32_LEAN_AND_MEAN
#define VC_EXTRALEAN
#include // skipped

#if !defined(__clang__) && !defined(__GNUC__)
// Undefine attribute specifiers in Windows.
#define __attribute__(x)

// Undefine restrict.
#define restrict
#endif

#endif

// time.h provides clock_gettime().
// NOTE(review): the header names are missing from the #include directives
// below in this copy of the file (apparently lost in extraction).
#if BLIS_OS_WINDOWS
#include // skipped
#elif BLIS_OS_OSX
#include // skipped
#else
//#include
#include // skipped
#endif


#endif
// end bli_system.h
// begin bli_config.h


#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H

// The literal `#if 0` / `#if 1` / `#if 64 == 64` conditions throughout this
// header are fixed values baked into this copy of the file; to change an
// option, this header has to be regenerated rather than edited by hand.

// Enabled configuration "family" (config_name)
#define BLIS_FAMILY_X86_64


// Enabled sub-configurations (config_list)
#define BLIS_CONFIG_SKX
#define BLIS_CONFIG_KNL
#define BLIS_CONFIG_HASWELL
#define BLIS_CONFIG_SANDYBRIDGE
#define BLIS_CONFIG_PENRYN
#define BLIS_CONFIG_ZEN3
#define BLIS_CONFIG_ZEN2
#define BLIS_CONFIG_ZEN
#define BLIS_CONFIG_EXCAVATOR
#define BLIS_CONFIG_STEAMROLLER
#define BLIS_CONFIG_PILEDRIVER
#define BLIS_CONFIG_BULLDOZER
#define BLIS_CONFIG_GENERIC


// Enabled kernel sets (kernel_list)
#define BLIS_KERNELS_SKX
#define BLIS_KERNELS_KNL
#define BLIS_KERNELS_SANDYBRIDGE
#define BLIS_KERNELS_PENRYN
#define BLIS_KERNELS_ZEN3
#define BLIS_KERNELS_ZEN2
#define BLIS_KERNELS_HASWELL
#define BLIS_KERNELS_ZEN
#define BLIS_KERNELS_PILEDRIVER
#define BLIS_KERNELS_BULLDOZER
#define BLIS_KERNELS_GENERIC


#if 1
#define BLIS_ENABLE_SYSTEM
#else
#define BLIS_DISABLE_SYSTEM
#endif

// Neither OpenMP nor pthreads is enabled in this configuration.
#if 0
#define BLIS_ENABLE_OPENMP
#endif

#if 0
#define BLIS_ENABLE_PTHREADS
#endif

#if 1
#define BLIS_ENABLE_JRIR_SLAB
#endif

#if 0
#define BLIS_ENABLE_JRIR_RR
#endif

#if 1
#define BLIS_ENABLE_PBA_POOLS
#else
#define BLIS_DISABLE_PBA_POOLS
#endif

#if 1
#define BLIS_ENABLE_SBA_POOLS
#else
#define BLIS_DISABLE_SBA_POOLS
#endif

#if 0
#define BLIS_ENABLE_MEM_TRACING
#else
#define BLIS_DISABLE_MEM_TRACING
#endif

// BLIS (internal) integer size: 64 in this configuration.
#if 64 == 64
#define BLIS_INT_TYPE_SIZE 64
#elif 64 == 32
#define BLIS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// BLAS-layer integer size: 32 in this configuration.
#if 32 == 64
#define BLIS_BLAS_INT_TYPE_SIZE 64
#elif 32 == 32
#define BLIS_BLAS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// The nested #ifndef pairs below let a definition made before this header is
// reached (of either the ENABLE or the DISABLE macro) take precedence over
// the baked-in choice.
#ifndef BLIS_ENABLE_BLAS
#ifndef BLIS_DISABLE_BLAS
#if 0
#define BLIS_ENABLE_BLAS
#else
#define BLIS_DISABLE_BLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_CBLAS
#ifndef BLIS_DISABLE_CBLAS
#if 0
#define BLIS_ENABLE_CBLAS
#else
#define BLIS_DISABLE_CBLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_MIXED_DT
#ifndef BLIS_DISABLE_MIXED_DT
#if 1
#define BLIS_ENABLE_MIXED_DT
#else
#define BLIS_DISABLE_MIXED_DT
#endif
#endif
#endif

#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#if 1
#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#else
#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#endif
#endif
#endif

#if 1
#define BLIS_ENABLE_SUP_HANDLING
#else
#define BLIS_DISABLE_SUP_HANDLING
#endif

#if 0
#define BLIS_ENABLE_MEMKIND
#else
#define BLIS_DISABLE_MEMKIND
#endif

#if 1
#define BLIS_ENABLE_TRSM_PREINVERSION
#else
#define BLIS_DISABLE_TRSM_PREINVERSION
#endif

#if 1
#define BLIS_ENABLE_PRAGMA_OMP_SIMD
#else
#define BLIS_DISABLE_PRAGMA_OMP_SIMD
#endif

#if 0
#define BLIS_ENABLE_SANDBOX
#else
#define BLIS_DISABLE_SANDBOX
#endif

#if 0
#define BLIS_ENABLE_SHARED
#else
#define BLIS_DISABLE_SHARED
#endif

#if 0
#define BLIS_ENABLE_COMPLEX_RETURN_INTEL
#else
#define BLIS_DISABLE_COMPLEX_RETURN_INTEL
#endif


#endif
// end bli_config.h
// begin bli_config_macro_defs.h


#ifndef BLIS_CONFIG_MACRO_DEFS_H
#define BLIS_CONFIG_MACRO_DEFS_H


// -- INTEGER PROPERTIES -------------------------------------------------------

// The bit size of the integer type used to track values such as dimensions,
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
// integers while 64 results in 64-bit integers. Any other value results in use
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
// (This fallback only applies when BLIS_INT_TYPE_SIZE was not already
// defined, e.g. by bli_config.h above.)
#ifndef BLIS_INT_TYPE_SIZE
  #ifdef BLIS_ARCH_64
    #define BLIS_INT_TYPE_SIZE 64
  #else
    #define BLIS_INT_TYPE_SIZE 32
  #endif
#endif


// -- FLOATING-POINT PROPERTIES ------------------------------------------------

// Enable use of built-in C99 "float complex" and "double complex" types and
// associated overloaded operations and functions? Disabling results in
// scomplex and dcomplex being defined in terms of simple structs.
// NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN.
#ifdef BLIS_ENABLE_C99_COMPLEX
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif


// -- MULTITHREADING -----------------------------------------------------------

// Enable multithreading via POSIX threads.
#ifdef BLIS_ENABLE_PTHREADS
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif

// Enable multithreading via OpenMP.
#ifdef BLIS_ENABLE_OPENMP
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif

// Perform a sanity check to make sure the user doesn't try to enable
// both OpenMP and pthreads.
#if defined ( BLIS_ENABLE_OPENMP ) && \
    defined ( BLIS_ENABLE_PTHREADS )
  #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined."
#endif

// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP
// or pthreads are enabled. This macro is useful in situations when
// we want to detect use of either OpenMP or pthreads (as opposed
// to neither being used).
#if defined ( BLIS_ENABLE_OPENMP ) || \
    defined ( BLIS_ENABLE_PTHREADS )
  #define BLIS_ENABLE_MULTITHREADING
#endif

// Enable the use of prime numbers of threads when requesting automatic thread
// factorization. When disabled, requesting a prime number of threads will
// result in a reduction (by one) of the number of threads, provided that the
// prime number exceeds a minimum threshold (see below).
// After this block exactly one of the ENABLE/DISABLE macros is defined; the
// ENABLE macro wins if both were supplied.
#ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS
  #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#else
  // Default behavior is disabled.
  #undef  BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled.
  #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#endif

// Set the maximum requested number of threads that BLIS will accept from the
// user that may be prime. If a larger prime number of threads is requested,
// it will be reduced by one to allow for more efficient thread factorizations.
// This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined.
#ifndef BLIS_NT_MAX_PRIME
  #define BLIS_NT_MAX_PRIME 11
#endif


// -- MIXED DATATYPE SUPPORT ---------------------------------------------------

// Enable mixed datatype support?
#ifdef BLIS_DISABLE_MIXED_DT
  #undef BLIS_ENABLE_GEMM_MD
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD
#endif

// Enable memory-intensive optimizations for mixed datatype support?
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
  #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#endif


// -- MISCELLANEOUS OPTIONS ----------------------------------------------------

// Do NOT require the cross-blocksize constraints. That is, do not enforce
// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY
// needed when implementing trsm_r by allowing the right-hand matrix B to
// be triangular.
#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
  #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#endif


// -- BLAS COMPATIBILITY LAYER -------------------------------------------------

// Enable the BLAS compatibility layer?
// BLIS_DISABLE_BLAS takes precedence: if it is defined, BLIS_ENABLE_BLAS is
// removed even when it was also defined.
#ifdef BLIS_DISABLE_BLAS
  #undef BLIS_ENABLE_BLAS
#else
  // Default behavior is enabled.
  #undef  BLIS_ENABLE_BLAS // In case user explicitly enabled.
  #define BLIS_ENABLE_BLAS
#endif

// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
// Fallback used only when BLIS_BLAS_INT_TYPE_SIZE has not been defined yet.
#ifndef BLIS_BLAS_INT_TYPE_SIZE
  #define BLIS_BLAS_INT_TYPE_SIZE 32
#endif

// By default, the level-3 BLAS routines are implemented by directly calling
// the BLIS object API. Alternatively, they may first call the typed BLIS
// API, which will then call the object API.
//#define BLIS_BLAS3_CALLS_TAPI
// After this block BLIS_BLAS3_CALLS_OAPI is defined if and only if
// BLIS_BLAS3_CALLS_TAPI is not.
#ifdef BLIS_BLAS3_CALLS_TAPI
  #undef  BLIS_BLAS3_CALLS_OAPI
#else
  // Default behavior is to call object API directly.
  #undef  BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled.
  #define BLIS_BLAS3_CALLS_OAPI
#endif


// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------

// Enable the CBLAS compatibility layer?
// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer
// regardless of whether or not it was explicitly enabled above. Furthermore,
// the CBLAS compatibility layer will use the integer type size definition
// specified above when defining the size of its own integers (regardless of
// whether the BLAS layer was enabled directly or indirectly).
#ifdef BLIS_ENABLE_CBLAS
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif


// -- SHARED LIBRARY SYMBOL EXPORT ---------------------------------------------

// When building shared libraries, we can control which symbols are exported for
// linking by external applications. BLIS annotates all function prototypes that
// are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing
// a similar role for BLAS compatibility routines). Which symbols are exported
// is controlled by the default symbol visibility, as specified by the gcc option
// -fvisibility=[default|hidden]. The default for this option is 'default', or,
// "public", which, if allowed to stand, causes all symbols in BLIS to be
// linkable from the outside. But when compiling with -fvisibility=hidden, all
// symbols start out hidden (that is, restricted only for internal use by BLIS),
// with that setting overridden only for function prototypes or variable
// declarations that are annotated with BLIS_EXPORT_BLIS.
//
// A BLIS_EXPORT defined before this point is left untouched. Otherwise it
// expands to nothing for non-shared builds, to dllexport/dllimport on
// Windows/Cygwin (dllexport only while BLIS_IS_BUILDING_LIBRARY is defined),
// to the "default" visibility attribute for GCC-compatible compilers
// (__GNUC__ >= 4), and to nothing in any other case.

#ifndef BLIS_EXPORT
  #if !defined(BLIS_ENABLE_SHARED)
    #define BLIS_EXPORT
  #else
    #if defined(_WIN32) || defined(__CYGWIN__)
      #ifdef BLIS_IS_BUILDING_LIBRARY
        #define BLIS_EXPORT __declspec(dllexport)
      #else
        #define BLIS_EXPORT __declspec(dllimport)
      #endif
    #elif defined(__GNUC__) && __GNUC__ >= 4
      #define BLIS_EXPORT __attribute__ ((visibility ("default")))
    #else
      #define BLIS_EXPORT
    #endif
  #endif
#endif

// All three public annotations currently expand to the same BLIS_EXPORT.
#define BLIS_EXPORT_BLIS  BLIS_EXPORT
#define BLIS_EXPORT_BLAS  BLIS_EXPORT
#define BLIS_EXPORT_ADDON BLIS_EXPORT


// -- OVERRIDABLE (WEAK) SYMBOLS -----------------------------------------------

// On Linux, functions called from a shared library can be overridden by the main
// program simply by providing a new definition. However, macOS uses a "two-level
// namespace" which causes calls to shared library functions to be tied to the
// library and not overridable. As a workaround, certain symbols can be defined
// as "weak" and are given lower preference during linking.
#ifndef BLIS_OVERRIDABLE
#if BLIS_OS_OSX
#define BLIS_OVERRIDABLE __attribute__((weak))
#else
#define BLIS_OVERRIDABLE
#endif
#endif


// -- STATIC INLINE FUNCTIONS --------------------------------------------------

// C and C++ have different semantics for defining "inline" functions. In C,
// the keyword phrase "static inline" accomplishes this, though the "inline"
// is optional. In C++, the "inline" keyword is required and obviates "static"
// altogether. Why does this matter? While BLIS is compiled in C99, blis.h may
// be #included by a source file that is compiled with C++.
#ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. // NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. 
// TRUE/FALSE are only defined when no earlier header has defined them.
#ifndef TRUE
  #define TRUE  true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
// These are literal constants, not computed from the types named in the
// trailing comments; they must be kept in sync by hand.
#define BLIS_SIZEOF_S      4  // sizeof(float)
#define BLIS_SIZEOF_D      8  // sizeof(double)
#define BLIS_SIZEOF_C      8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z      16 // sizeof(dcomplex)

// -- Complex types --

#ifdef BLIS_ENABLE_C99_COMPLEX

	#if __STDC_VERSION__ >= 199901L
		// NOTE(review): header name missing from this #include in this copy
		// of the file (apparently lost in extraction).
		#include // skipped

		// Typedef official complex types to BLIS complex type names.
		typedef  float complex scomplex;
		typedef double complex dcomplex;
	#else
		#error "Configuration requested C99 complex types, but C99 does not appear to be supported."
	#endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

	// This cpp guard provides a temporary hack to allow libflame
	// interoperability with BLIS.
	#ifndef _DEFINED_SCOMPLEX
	#define _DEFINED_SCOMPLEX
	typedef struct scomplex
	{
		float  real;
		float  imag;
	} scomplex;
	#endif

	// This cpp guard provides a temporary hack to allow libflame
	// interoperability with BLIS.
	#ifndef _DEFINED_DCOMPLEX
	#define _DEFINED_DCOMPLEX
	typedef struct dcomplex
	{
		double real;
		double imag;
	} dcomplex;
	#endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any BLIS_BLAS_INT_TYPE_SIZE other than 32 or 64 falls back to long int.
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
// (void_fp is currently a plain void*; the function-pointer form is kept
// commented out below.)

//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );


//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Bit positions of the fields packed into the "info" and "info2" words.
// Several names deliberately share an offset: a multi-bit field and its
// lowest sub-field start at the same bit (e.g. DATATYPE/DOMAIN at 0,
// CONJTRANS/TRANS at 3, UPLO/UPPER at 5, PACK_SCHEMA/PACK_RC at 16).

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT               0
#define   BLIS_SCALAR_DOMAIN_SHIFT         0
#define   BLIS_SCALAR_PREC_SHIFT           1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is a field-width constant shifted to the field's offset above.
// NOTE(review): BLIS_COMP_DT_BITS is 0x7 << 29, which does not fit in a
// 32-bit signed int (the literal has type int). Consider an unsigned literal
// after confirming how users of this mask depend on its type.

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )


//
// -- BLIS enumerated type value definitions -----------------------------------
//

// Field values, already shifted into position; the enums further below are
// defined directly in terms of these. Note that BLIS_BITVAL_PACKED_UNSPEC and
// BLIS_BITVAL_PACKED_ROWS are the same value (BLIS_PACK_BIT).

#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT                                         | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )


//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

typedef enum
{
	BLIS_NO_TRANSPOSE      = 0x0,
	BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
	BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
	BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

typedef enum
{
	BLIS_NO_CONJUGATE      = 0x0,
	BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

typedef enum
{
	BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
	BLIS_LOWER             = BLIS_BITVAL_LOWER,
	BLIS_UPPER             = BLIS_BITVAL_UPPER,
	BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

// side_t is the one parameter enum here that is not a bit-field value:
// BLIS_RIGHT takes the implicit value 1.
typedef enum
{
	BLIS_LEFT              = 0x0,
	BLIS_RIGHT
} side_t;

typedef enum
{
	BLIS_NONUNIT_DIAG      = 0x0,
	BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

typedef enum
{
	BLIS_NO_INVERT_DIAG    = 0x0,
	BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

typedef enum
{
	BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
	BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
	BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
	BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;


// -- Data type --

// BLIS_DT_LO / BLIS_DT_HI alias the first and last floating-point datatypes
// (BLIS_FLOAT and BLIS_DCOMPLEX); BLIS_INT and BLIS_CONSTANT lie outside that
// range.
typedef enum
{
	BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
	BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
	BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
	BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
	BLIS_INT               = BLIS_BITVAL_INT_TYPE,
	BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
	BLIS_DT_LO             = BLIS_FLOAT,
	BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

typedef enum
{
	BLIS_REAL              = BLIS_BITVAL_REAL,
	BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

typedef enum
{
	BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
	BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;


// -- Pack schema type --

// BLIS_PACKED_UNSPEC, BLIS_PACKED_VECTOR and BLIS_PACKED_ROWS all have the
// same numeric value, so they cannot be told apart at run time.
typedef enum
{
	BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
	BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
	BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
	BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
	BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
	BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
	BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
	BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
	BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3


// -- Pack order type --

// The two FWD values are both 0x0, so BLIS_PACK_FWD_IF_UPPER and
// BLIS_PACK_FWD_IF_LOWER are numerically identical.
typedef enum
{
	BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
	BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,

	BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
	BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;


// -- Pack buffer type --

typedef enum
{
	BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
	BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
	BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
	BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;


// -- Partitioning direction --

typedef enum
{
	BLIS_FWD,
	BLIS_BWD
} dir_t;


// -- Subpartition type --

typedef enum
{
	BLIS_SUBPART0,
	BLIS_SUBPART1,
	BLIS_SUBPART2,
	BLIS_SUBPART1AND0,
	BLIS_SUBPART1AND2,
	BLIS_SUBPART1A,
	BLIS_SUBPART1B,
	BLIS_SUBPART00,
	BLIS_SUBPART10,
	BLIS_SUBPART20,
	BLIS_SUBPART01,
	BLIS_SUBPART11,
	BLIS_SUBPART21,
	BLIS_SUBPART02,
	BLIS_SUBPART12,
	BLIS_SUBPART22
} subpart_t;


// -- Matrix dimension type --

typedef enum
{
	BLIS_M = 0,
	BLIS_N = 1
} mdim_t;


// -- Machine parameter types --

// BLIS_NUM_MACH_PARAMS below is a literal and must equal the number of
// enumerators in machval_t (currently 11).
typedef enum
{
	BLIS_MACH_EPS = 0,
	BLIS_MACH_SFMIN,
	BLIS_MACH_BASE,
	BLIS_MACH_PREC,
	BLIS_MACH_NDIGMANT,
	BLIS_MACH_RND,
	BLIS_MACH_EMIN,
	BLIS_MACH_RMIN,
	BLIS_MACH_EMAX,
	BLIS_MACH_RMAX,
	BLIS_MACH_EPS2
} machval_t;

#define BLIS_NUM_MACH_PARAMS   11
#define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2


// -- Induced method types --

// BLIS_IND_FIRST aliases BLIS_1M (both 0) and BLIS_IND_LAST aliases BLIS_NAT.
typedef enum
{
	BLIS_1M        = 0,
	BLIS_NAT,
	BLIS_IND_FIRST = 0,
	BLIS_IND_LAST  = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, 
BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // 
- keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. } bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. 
// Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! 
// -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: 
Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. 
//num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. typedef struct constdata_s { float s; double d; scomplex c; dcomplex z; gint_t i; } constdata_t; // // -- BLIS object type definitions --------------------------------------------- // // Forward declarations for function pointer types struct obj_s; struct cntx_s; struct rntm_s; struct thrinfo_s; typedef void (*obj_pack_fn_t) ( struct obj_s* a, struct obj_s* ap, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) ( struct obj_s* a, struct obj_s* b, struct obj_s* c, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef struct obj_s { // Basic fields struct obj_s* root; dim_t off[2]; dim_t dim[2]; doff_t diag_off; objbits_t info; objbits_t info2; siz_t elem_size; void* buffer; inc_t rs; inc_t cs; inc_t is; // Bufferless scalar storage atom_t scalar; // Pack-related fields dim_t m_padded; // m dimension of matrix, including any padding dim_t n_padded; // n dimension of matrix, including any padding inc_t ps; // panel stride (distance to next panel) inc_t pd; // panel dimension (the "width" of a panel: // usually MR or NR) dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel // User-customizable fields obj_pack_fn_t pack_fn; void* pack_params; obj_ker_fn_t ker_fn; void* ker_params; } obj_t; // Pre-initializors. 
// Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Brace-enclosed designated initializer for an obj_t whose two dimensions
// are both zero. The info field starts with only the dense-storage and
// general-structure bits set; the buffer, the root pointer and all four
// user-customizable pack/kernel hooks are NULL. elem_size is sizeof( float )
// here and is one of the fields listed above that must be set afterwards.
#define BLIS_OBJECT_INITIALIZER \
{ \
    .root        = NULL, \
\
    .off         = { 0, 0 }, \
    .dim         = { 0, 0 }, \
    .diag_off    = 0, \
\
    .info        = 0x0 | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL, \
    .info2       = 0x0, \
    .elem_size   = sizeof( float ), \
\
    .buffer      = NULL, \
    .rs          = 0, \
    .cs          = 0, \
    .is          = 1, \
\
    .scalar      = { 0.0, 0.0 }, \
\
    .m_padded    = 0, \
    .n_padded    = 0, \
    .ps          = 0, \
    .pd          = 0, \
    .m_panel     = 0, \
    .n_panel     = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

// Same initializer as BLIS_OBJECT_INITIALIZER except that both dimensions
// are one instead of zero; every other field is identical.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root        = NULL, \
\
    .off         = { 0, 0 }, \
    .dim         = { 1, 1 }, \
    .diag_off    = 0, \
\
    .info        = 0x0 | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL, \
    .info2       = 0x0, \
    .elem_size   = sizeof( float ), \
\
    .buffer      = NULL, \
    .rs          = 0, \
    .cs          = 0, \
    .is          = 1, \
\
    .scalar      = { 0.0, 0.0 }, \
\
    .m_padded    = 0, \
    .n_padded    = 0, \
    .ps          = 0, \
    .pd          = 0, \
    .m_panel     = 0, \
    .n_panel     = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

// Define the following inline functions here since they must be updated
// whenever the contents of obj_t change.
// Make b a complete shallow copy of a.
//
// Every field of obj_t is duplicated. Pointer fields (root, buffer,
// pack_params, ker_params) and the function-pointer hooks are copied as
// addresses, so a and b refer to the same underlying storage afterwards.
// Because no field is left out, a whole-struct assignment is used.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    *b = *a;
}

// Initialize b from a in preparation for b becoming a sub-partition of a.
//
// All fields are copied from a EXCEPT dim[0] and dim[1]: the m and n
// dimensions of b are deliberately left untouched because they will be
// overwritten afterwards. This function must therefore stay a
// field-by-field copy and be extended whenever obj_t gains a field.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    // Root object, offsets and diagonal offset (dimensions are skipped).
    b->root        = a->root;
    b->off[0]      = a->off[0];
    b->off[1]      = a->off[1];
    b->diag_off    = a->diag_off;

    // Property bitfields and element size.
    b->info        = a->info;
    b->info2       = a->info2;
    b->elem_size   = a->elem_size;

    // Buffer address and strides.
    b->buffer      = a->buffer;
    b->rs          = a->rs;
    b->cs          = a->cs;
    b->is          = a->is;

    // Bufferless scalar storage.
    b->scalar      = a->scalar;

    // Pack-related fields.
    b->m_padded    = a->m_padded;
    b->n_padded    = a->n_padded;
    b->ps          = a->ps;
    b->pd          = a->pd;
    b->m_panel     = a->m_panel;
    b->n_panel     = a->n_panel;

    // User-customizable pack and kernel hooks.
    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/include/linux-x86_64_no_skx/000077500000000000000000000000001427272030600227355ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/linux-x86_64_no_skx/blis.h000066400000000000000000046565451427272030600240700ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). 
// begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_X86_64_NO_SKX // Enabled sub-configurations (config_list) #define BLIS_CONFIG_HASWELL #define BLIS_CONFIG_SANDYBRIDGE #define BLIS_CONFIG_PENRYN #define BLIS_CONFIG_EXCAVATOR #define BLIS_CONFIG_STEAMROLLER #define BLIS_CONFIG_PILEDRIVER #define BLIS_CONFIG_BULLDOZER #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_ZEN #define BLIS_KERNELS_HASWELL #define BLIS_KERNELS_SANDYBRIDGE #define BLIS_KERNELS_PENRYN #define BLIS_KERNELS_PILEDRIVER #define BLIS_KERNELS_BULLDOZER #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define 
BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. 
#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). #if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_lang_defs.h #ifndef BLIS_LANG_DEFS_H #define BLIS_LANG_DEFS_H // -- Undefine restrict for C++ and C89/90 -- #ifdef __cplusplus // Language is C++; define restrict as nothing. #ifndef restrict #define restrict #endif #elif __STDC_VERSION__ >= 199901L // Language is C99 (or later); do nothing since restrict is recognized. #else // Language is pre-C99; define restrict as nothing. 
#ifndef restrict #define restrict #endif #endif // -- Define typeof() operator if using non-GNU compiler -- #ifndef __GNUC__ #define typeof __typeof__ #else #ifndef typeof #define typeof __typeof__ #endif #endif // -- BLIS Thread Local Storage Keyword -- // __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support __thread, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) #define BLIS_THREAD_LOCAL __thread #else #define BLIS_THREAD_LOCAL #endif // -- BLIS constructor/destructor function attribute -- // __attribute__((constructor/destructor)) is supported by GCC only. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support this, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__ICC) || defined(__INTEL_COMPILER) // ICC defines __GNUC__ but doesn't support this #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #elif defined(__clang__) // CLANG supports __attribute__, but its documentation doesn't // mention support for constructor/destructor. Compiling with // clang and testing shows that it does support. 
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif

// Perform a sanity check to make sure the user doesn't try to enable
// both OpenMP and pthreads.
#if defined ( BLIS_ENABLE_OPENMP ) && \
    defined ( BLIS_ENABLE_PTHREADS )
  #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined."
#endif

// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP
// or pthreads are enabled. This macro is useful in situations when
// we want to detect use of either OpenMP or pthreads (as opposed
// to neither being used).
#if defined ( BLIS_ENABLE_OPENMP ) || \
    defined ( BLIS_ENABLE_PTHREADS )
  #define BLIS_ENABLE_MULTITHREADING
#endif

// Enable the use of prime numbers of threads when requesting automatic thread
// factorization. When disabled, requesting a prime number of threads will
// result in a reduction (by one) of the number of threads, provided that the
// prime number exceeds a minimum threshold (see below).
// After this block exactly one state holds: BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
// is defined unless BLIS_ENABLE_AUTO_PRIME_NUM_THREADS was defined.
#ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS
  #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#else
  // Default behavior is disabled.
  #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled.
  #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#endif

// Set the maximum requested number of threads that BLIS will accept from the
// user that may be prime. If a larger prime number of threads is requested,
// it will be reduced by one to allow for more efficient thread factorizations.
// This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined.
#ifndef BLIS_NT_MAX_PRIME
  #define BLIS_NT_MAX_PRIME 11
#endif

// -- MIXED DATATYPE SUPPORT ---------------------------------------------------

// Enable mixed datatype support?
// BLIS_DISABLE_MIXED_DT is the opt-out switch; BLIS_ENABLE_GEMM_MD is the
// macro this block defines or undefines as a result.
#ifdef BLIS_DISABLE_MIXED_DT
  #undef BLIS_ENABLE_GEMM_MD
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD
#endif

// Enable memory-intensive optimizations for mixed datatype support?
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
  #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#else
  // Default behavior is enabled.
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. 
Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specifed by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. #ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overriden by the main // program simply by providing a new definition. 
However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // -- Common BLIS definitions -- // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// The guard must test BLIS_BLAS_INT_TYPE_SIZE, the macro that actually holds
// the BLAS integer width (it also selects f77_int). The previous spelling,
// BLIS_BLAS_INT_SIZE, is never defined, so the comparison evaluated as
// "0 == 64" and the promotion to 64-bit BLIS integers never happened.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
  #undef  BLIS_INT_TYPE_SIZE
  #define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested.
// Any size other than 32 or 64 (including an undefined one) falls back to
// "long int".
#if   BLIS_INT_TYPE_SIZE == 32
typedef           int32_t  gint_t;
typedef          uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
typedef           int64_t  gint_t;
typedef          uint64_t guint_t;
#else
typedef   signed long int  gint_t;
typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h.
#ifndef TRUE
  #define TRUE  true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
#define BLIS_SIZEOF_S 4  // sizeof(float)
#define BLIS_SIZEOF_D 8  // sizeof(double)
#define BLIS_SIZEOF_C 8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z 16 // sizeof(dcomplex)

// -- Complex types --

#ifdef BLIS_ENABLE_C99_COMPLEX

  #if __STDC_VERSION__ >= 199901L
    // The header name was missing from this directive (a bare "#include" is
    // ill-formed). The "float complex" / "double complex" typedefs below need
    // the "complex" macro, which C99 provides in complex.h.
    #include <complex.h>

    // Typedef official complex types to BLIS complex type names.
    typedef  float complex scomplex;
    typedef double complex dcomplex;
  #else
    #error "Configuration requested C99 complex types, but C99 does not appear to be supported."
  #endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_SCOMPLEX
  #define _DEFINED_SCOMPLEX
  typedef struct scomplex
  {
    float real;
    float imag;
  } scomplex;
  #endif

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_DCOMPLEX
  #define _DEFINED_DCOMPLEX
  typedef struct dcomplex
  {
    double real;
    double imag;
  } dcomplex;
  #endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any size other than 32 or 64 falls back to "long int".
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );

//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT value is the bit position at which the corresponding field
// (or single bit) starts. Names that share a value are different views of
// the same starting bit (e.g. a multi-bit field and its lowest bit).

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT               0
#define   BLIS_SCALAR_DOMAIN_SHIFT         0
#define   BLIS_SCALAR_PREC_SHIFT           1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )

//
// -- BLIS enumerated type value definitions -----------------------------------
//

// The BLIS_BITVAL_* macros give each enumerated value its encoding in terms
// of the masks above; the enums that follow are defined from them.

#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT                                         | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )

//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

// Transposition/conjugation of an operand.
typedef enum
{
	BLIS_NO_TRANSPOSE      = 0x0,
	BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
	BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
	BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

// Conjugation only.
typedef enum
{
	BLIS_NO_CONJUGATE      = 0x0,
	BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

// Stored region of a matrix.
typedef enum
{
	BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
	BLIS_LOWER             = BLIS_BITVAL_LOWER,
	BLIS_UPPER             = BLIS_BITVAL_UPPER,
	BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

// Operand side; BLIS_RIGHT takes the next implicit value (1).
typedef enum
{
	BLIS_LEFT              = 0x0,
	BLIS_RIGHT
} side_t;

typedef enum
{
	BLIS_NONUNIT_DIAG      = 0x0,
	BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

typedef enum
{
	BLIS_NO_INVERT_DIAG    = 0x0,
	BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

// Matrix structure.
typedef enum
{
	BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
	BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
	BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
	BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;

// -- Data type --

// BLIS_DT_LO and BLIS_DT_HI alias the first and last floating-point
// datatypes (BLIS_FLOAT and BLIS_DCOMPLEX).
typedef enum
{
	BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
	BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
	BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
	BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
	BLIS_INT               = BLIS_BITVAL_INT_TYPE,
	BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
	BLIS_DT_LO             = BLIS_FLOAT,
	BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

typedef enum
{
	BLIS_REAL              = BLIS_BITVAL_REAL,
	BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

typedef enum
{
	BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
	BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;

// -- Pack schema type --

typedef enum
{
	BLIS_NOT_PACKED =
BLIS_BITVAL_NOT_PACKED, // value of the pack_t enumerator named just above
	BLIS_PACKED_UNSPEC        = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_VECTOR        = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_ROWS          = BLIS_BITVAL_PACKED_ROWS,
	BLIS_PACKED_COLUMNS       = BLIS_BITVAL_PACKED_COLUMNS,
	BLIS_PACKED_ROW_PANELS    = BLIS_BITVAL_PACKED_ROW_PANELS,
	BLIS_PACKED_COL_PANELS    = BLIS_BITVAL_PACKED_COL_PANELS,
	BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
	BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E,
	BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
	BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3

// -- Pack order type --

// Note that both *_FWD_* enumerators have the value 0x0.
typedef enum
{
	BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
	BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,

	BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
	BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;

// -- Pack buffer type --

typedef enum
{
	BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
	BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
	BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
	BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;

// -- Partitioning direction --

typedef enum
{
	BLIS_FWD,
	BLIS_BWD
} dir_t;

// -- Subpartition type --

typedef enum
{
	BLIS_SUBPART0,
	BLIS_SUBPART1,
	BLIS_SUBPART2,
	BLIS_SUBPART1AND0,
	BLIS_SUBPART1AND2,
	BLIS_SUBPART1A,
	BLIS_SUBPART1B,
	BLIS_SUBPART00,
	BLIS_SUBPART10,
	BLIS_SUBPART20,
	BLIS_SUBPART01,
	BLIS_SUBPART11,
	BLIS_SUBPART21,
	BLIS_SUBPART02,
	BLIS_SUBPART12,
	BLIS_SUBPART22
} subpart_t;

// -- Matrix dimension type --

typedef enum
{
	BLIS_M = 0,
	BLIS_N = 1
} mdim_t;

// -- Machine parameter types --

// Enumerators are consecutive from 0; BLIS_NUM_MACH_PARAMS below must equal
// their count.
typedef enum
{
	BLIS_MACH_EPS = 0,
	BLIS_MACH_SFMIN,
	BLIS_MACH_BASE,
	BLIS_MACH_PREC,
	BLIS_MACH_NDIGMANT,
	BLIS_MACH_RND,
	BLIS_MACH_EMIN,
	BLIS_MACH_RMIN,
	BLIS_MACH_EMAX,
	BLIS_MACH_RMAX,
	BLIS_MACH_EPS2
}
machval_t;

#define BLIS_NUM_MACH_PARAMS   11
#define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2

// -- Induced method types --

// BLIS_IND_FIRST and BLIS_IND_LAST alias the first (BLIS_1M) and last
// (BLIS_NAT) methods.
typedef enum
{
	BLIS_1M        = 0,
	BLIS_NAT,
	BLIS_IND_FIRST = 0,
	BLIS_IND_LAST  = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_1m   BLIS_1M
#define bli_nat  BLIS_NAT

// -- Kernel ID types --

// Level-1v kernel ids; BLIS_NUM_LEVEL1V_KERS must equal their count.
typedef enum
{
	BLIS_ADDV_KER  = 0,
	BLIS_AMAXV_KER,
	BLIS_AXPBYV_KER,
	BLIS_AXPYV_KER,
	BLIS_COPYV_KER,
	BLIS_DOTV_KER,
	BLIS_DOTXV_KER,
	BLIS_INVERTV_KER,
	BLIS_SCALV_KER,
	BLIS_SCAL2V_KER,
	BLIS_SETV_KER,
	BLIS_SUBV_KER,
	BLIS_SWAPV_KER,
	BLIS_XPBYV_KER
} l1vkr_t;

#define BLIS_NUM_LEVEL1V_KERS 14

// Level-1f kernel ids; BLIS_NUM_LEVEL1F_KERS must equal their count.
typedef enum
{
	BLIS_AXPY2V_KER = 0,
	BLIS_DOTAXPYV_KER,
	BLIS_AXPYF_KER,
	BLIS_DOTXF_KER,
	BLIS_DOTXAXPYF_KER
} l1fkr_t;

#define BLIS_NUM_LEVEL1F_KERS 5

// Level-1m pack/unpack kernel ids. The PACKM and UNPACKM families each use
// the values 0..31, so an id is only meaningful together with its family.
typedef enum
{
	BLIS_PACKM_0XK_KER  = 0,
	BLIS_PACKM_1XK_KER  = 1,
	BLIS_PACKM_2XK_KER  = 2,
	BLIS_PACKM_3XK_KER  = 3,
	BLIS_PACKM_4XK_KER  = 4,
	BLIS_PACKM_5XK_KER  = 5,
	BLIS_PACKM_6XK_KER  = 6,
	BLIS_PACKM_7XK_KER  = 7,
	BLIS_PACKM_8XK_KER  = 8,
	BLIS_PACKM_9XK_KER  = 9,
	BLIS_PACKM_10XK_KER = 10,
	BLIS_PACKM_11XK_KER = 11,
	BLIS_PACKM_12XK_KER = 12,
	BLIS_PACKM_13XK_KER = 13,
	BLIS_PACKM_14XK_KER = 14,
	BLIS_PACKM_15XK_KER = 15,
	BLIS_PACKM_16XK_KER = 16,
	BLIS_PACKM_17XK_KER = 17,
	BLIS_PACKM_18XK_KER = 18,
	BLIS_PACKM_19XK_KER = 19,
	BLIS_PACKM_20XK_KER = 20,
	BLIS_PACKM_21XK_KER = 21,
	BLIS_PACKM_22XK_KER = 22,
	BLIS_PACKM_23XK_KER = 23,
	BLIS_PACKM_24XK_KER = 24,
	BLIS_PACKM_25XK_KER = 25,
	BLIS_PACKM_26XK_KER = 26,
	BLIS_PACKM_27XK_KER = 27,
	BLIS_PACKM_28XK_KER = 28,
	BLIS_PACKM_29XK_KER = 29,
	BLIS_PACKM_30XK_KER = 30,
	BLIS_PACKM_31XK_KER = 31,

	BLIS_UNPACKM_0XK_KER  = 0,
	BLIS_UNPACKM_1XK_KER  = 1,
	BLIS_UNPACKM_2XK_KER  = 2,
	BLIS_UNPACKM_3XK_KER  = 3,
	BLIS_UNPACKM_4XK_KER  = 4,
	BLIS_UNPACKM_5XK_KER  = 5,
	BLIS_UNPACKM_6XK_KER  = 6,
	BLIS_UNPACKM_7XK_KER  = 7,
	BLIS_UNPACKM_8XK_KER  = 8,
BLIS_UNPACKM_9XK_KER  = 9, // continues the enumerator list started above
	BLIS_UNPACKM_10XK_KER = 10,
	BLIS_UNPACKM_11XK_KER = 11,
	BLIS_UNPACKM_12XK_KER = 12,
	BLIS_UNPACKM_13XK_KER = 13,
	BLIS_UNPACKM_14XK_KER = 14,
	BLIS_UNPACKM_15XK_KER = 15,
	BLIS_UNPACKM_16XK_KER = 16,
	BLIS_UNPACKM_17XK_KER = 17,
	BLIS_UNPACKM_18XK_KER = 18,
	BLIS_UNPACKM_19XK_KER = 19,
	BLIS_UNPACKM_20XK_KER = 20,
	BLIS_UNPACKM_21XK_KER = 21,
	BLIS_UNPACKM_22XK_KER = 22,
	BLIS_UNPACKM_23XK_KER = 23,
	BLIS_UNPACKM_24XK_KER = 24,
	BLIS_UNPACKM_25XK_KER = 25,
	BLIS_UNPACKM_26XK_KER = 26,
	BLIS_UNPACKM_27XK_KER = 27,
	BLIS_UNPACKM_28XK_KER = 28,
	BLIS_UNPACKM_29XK_KER = 29,
	BLIS_UNPACKM_30XK_KER = 30,
	BLIS_UNPACKM_31XK_KER = 31
} l1mkr_t;

#define BLIS_NUM_PACKM_KERS   32
#define BLIS_NUM_UNPACKM_KERS 32

// Level-3 microkernel ids; BLIS_NUM_LEVEL3_UKRS must equal their count.
typedef enum
{
	BLIS_GEMM_UKR = 0,
	BLIS_GEMMTRSM_L_UKR,
	BLIS_GEMMTRSM_U_UKR,
	BLIS_TRSM_L_UKR,
	BLIS_TRSM_U_UKR
} l3ukr_t;

#define BLIS_NUM_LEVEL3_UKRS 5

// Microkernel implementation kinds; BLIS_NUM_UKR_IMPL_TYPES must equal
// their count.
typedef enum
{
	BLIS_REFERENCE_UKERNEL = 0,
	BLIS_VIRTUAL_UKERNEL,
	BLIS_OPTIMIZED_UKERNEL,
	BLIS_NOTAPPLIC_UKERNEL
} kimpl_t;

#define BLIS_NUM_UKR_IMPL_TYPES 4

// The l3sup_t enum below is compiled out.
#if 0
typedef enum
{
	// RV = row-stored, contiguous vector-loading
	// RG = row-stored, non-contiguous gather-loading
	// CV = column-stored, contiguous vector-loading
	// CG = column-stored, non-contiguous gather-loading

	// RD = row-stored, dot-based
	// CD = col-stored, dot-based

	// RC = row-stored, column-times-column
	// CR = column-stored, row-times-row

	// GX = general-stored generic implementation

	BLIS_GEMMSUP_RV_UKR = 0,
	BLIS_GEMMSUP_RG_UKR,
	BLIS_GEMMSUP_CV_UKR,
	BLIS_GEMMSUP_CG_UKR,

	BLIS_GEMMSUP_RD_UKR,
	BLIS_GEMMSUP_CD_UKR,

	BLIS_GEMMSUP_RC_UKR,
	BLIS_GEMMSUP_CR_UKR,

	BLIS_GEMMSUP_GX_UKR,
} l3sup_t;

#define BLIS_NUM_LEVEL3_SUP_UKRS 9
#endif

// Only the nine row/column combinations (plus BLIS_XXX) are active; the
// general-stride ("G") combinations are compiled out.
typedef enum
{
	// 3-operand storage combinations
	BLIS_RRR = 0,
	BLIS_RRC, // 1
	BLIS_RCR, // 2
	BLIS_RCC, // 3
	BLIS_CRR, // 4
	BLIS_CRC, // 5
	BLIS_CCR, // 6
	BLIS_CCC, // 7
	BLIS_XXX, // 8

#if 0
	BLIS_RRG,
	BLIS_RCG,
	BLIS_RGR,
	BLIS_RGC,
	BLIS_RGG,
	BLIS_CRG,
	BLIS_CCG,
	BLIS_CGR,
	BLIS_CGC,
	BLIS_CGG,
	BLIS_GRR,
	BLIS_GRC,
	BLIS_GRG,
	BLIS_GCR,
	BLIS_GCC,
	BLIS_GCG,
	BLIS_GGR,
	BLIS_GGC,
	BLIS_GGG,
#endif
} stor3_t;

#define BLIS_NUM_3OP_RC_COMBOS 9
//#define BLIS_NUM_3OP_RCG_COMBOS 27

// The thridx_t enum is compiled out, but BLIS_NUM_LOOPS (equal to the number
// of indices it lists) remains defined.
#if 0
typedef enum
{
	BLIS_JC_IDX = 0,
	BLIS_PC_IDX,
	BLIS_IC_IDX,
	BLIS_JR_IDX,
	BLIS_IR_IDX,
	BLIS_PR_IDX
} thridx_t;
#endif

#define BLIS_NUM_LOOPS 6

// -- Operation ID type --

typedef enum
{
//
// NOTE: If/when additional type values are added to this enum,
// you must either:
// - keep the level-3 values (starting with _GEMM) beginning at
//   index 0; or
// - if the value range is moved such that it does not begin at
//   index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START
//   value that can be subtracted from the opid_t value to map it
//   to a zero-based range.
// This is needed because these level-3 opid_t values are used in
// bli_l3_ind.c to index into arrays.
//
	BLIS_GEMM = 0,
	BLIS_GEMMT,
	BLIS_HEMM,
	BLIS_HERK,
	BLIS_HER2K,
	BLIS_SYMM,
	BLIS_SYRK,
	BLIS_SYR2K,
	BLIS_TRMM3,
	BLIS_TRMM,
	BLIS_TRSM,

	BLIS_NOID
} opid_t;

// Counts the level-3 operations only; BLIS_NOID is not included.
#define BLIS_NUM_LEVEL3_OPS 11

// -- Blocksize ID type --

typedef enum
{
	// NOTE: the level-3 blocksizes MUST be indexed starting at zero.
	// At one point, we made this assumption in bli_cntx_set_blkszs()
	// and friends.

	BLIS_KR = 0,
	BLIS_MR,
	BLIS_NR,
	BLIS_MC,
	BLIS_KC,
	BLIS_NC,

	BLIS_M2, // level-2 blocksize in m dimension
	BLIS_N2, // level-2 blocksize in n dimension

	BLIS_AF, // level-1f axpyf fusing factor
	BLIS_DF, // level-1f dotxf fusing factor
	BLIS_XF, // level-1f dotxaxpyf fusing factor

	BLIS_NO_PART // used as a placeholder when blocksizes are not applicable.
} bszid_t;

// Counts the real blocksize ids only; the BLIS_NO_PART placeholder is not
// included.
#define BLIS_NUM_BLKSZS 11

// -- Threshold ID type --

typedef enum
{
	BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension
	BLIS_NT,     // level-3 small/unpacked matrix threshold in n dimension
	BLIS_KT      // level-3 small/unpacked matrix threshold in k dimension

} threshid_t;

#define BLIS_NUM_THRESH 3

// -- Architecture ID type --

// NOTE: This typedef enum must be kept up-to-date with the arch_t
// string array in bli_arch.c. Whenever values are added/inserted
// OR if values are rearranged, be sure to update the string array
// in bli_arch.c.

typedef enum
{
	// NOTE: The C language standard guarantees that the first enum value
	// starts at 0.

	// Intel
	BLIS_ARCH_SKX,
	BLIS_ARCH_KNL,
	BLIS_ARCH_KNC,
	BLIS_ARCH_HASWELL,
	BLIS_ARCH_SANDYBRIDGE,
	BLIS_ARCH_PENRYN,

	// AMD
	BLIS_ARCH_ZEN3,
	BLIS_ARCH_ZEN2,
	BLIS_ARCH_ZEN,
	BLIS_ARCH_EXCAVATOR,
	BLIS_ARCH_STEAMROLLER,
	BLIS_ARCH_PILEDRIVER,
	BLIS_ARCH_BULLDOZER,

	// ARM
	BLIS_ARCH_ARMSVE,
	BLIS_ARCH_A64FX,
	BLIS_ARCH_FIRESTORM,
	BLIS_ARCH_THUNDERX2,
	BLIS_ARCH_CORTEXA57,
	BLIS_ARCH_CORTEXA53,
	BLIS_ARCH_CORTEXA15,
	BLIS_ARCH_CORTEXA9,

	// IBM/Power
	BLIS_ARCH_POWER10,
	BLIS_ARCH_POWER9,
	BLIS_ARCH_POWER7,
	BLIS_ARCH_BGQ,

	// Generic architecture/configuration
	BLIS_ARCH_GENERIC,

	// The total number of defined architectures. This must be last in the
	// list of enums since its definition assumes that the previous enum
	// value (BLIS_ARCH_GENERIC) is given index num_archs-1.
	BLIS_NUM_ARCHS

} arch_t;

//
// -- BLIS misc. structure types -----------------------------------------------
//

// This header must be included here (or earlier) because definitions it
// provides are needed in the pool_t and related structs.
// begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t     bli_pthread_barrier_t;
typedef pthread_barrierattr_t bli_pthread_barrierattr_t;

#endif

// -- pthreads macros --

#define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#define BLIS_PTHREAD_COND_INITIALIZER  PTHREAD_COND_INITIALIZER
#define BLIS_PTHREAD_ONCE_INIT         PTHREAD_ONCE_INIT

#endif

// -- Function prototypes ------------------------------------------------------

// Only declarations appear below; the implementations live outside this
// header. Each group is named after the pthread function(s) it stands in for.
// All functions return an int except bli_pthread_once(), which returns void.

// -- pthread_create(), pthread_join() --

BLIS_EXPORT_BLIS int bli_pthread_create
     (
       bli_pthread_t*            thread,
       const bli_pthread_attr_t* attr,
       void*                   (*start_routine)(void*),
       void*                     arg
     );

BLIS_EXPORT_BLIS int bli_pthread_join
     (
       bli_pthread_t thread,
       void**        retval
     );

// -- pthread_mutex_*() --

BLIS_EXPORT_BLIS int bli_pthread_mutex_init
     (
       bli_pthread_mutex_t*           mutex,
       const bli_pthread_mutexattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_lock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock
     (
       bli_pthread_mutex_t* mutex
     );

// -- pthread_cond_*() --

BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t*           cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t*  cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void              (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//    __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t*           barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int                     count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h

// -- Pool block type --

// A buffer pointer paired with a block size.
typedef struct
{
	void*     buf;
	siz_t     block_size;

} pblk_t;


// -- Pool type --

// Bookkeeping for a pool of blocks, including the malloc/free function
// pointers stored with it.
// NOTE(review): block_ptrs is an untyped pointer; presumably it addresses an
// array of pblk_t with block_ptrs_len entries -- confirm against the pool
// implementation.
typedef struct
{
	void*     block_ptrs;
	dim_t     block_ptrs_len;

	dim_t     top_index;
	dim_t     num_blocks;

	siz_t     block_size;
	siz_t     align_size;
	siz_t     offset_size;

	malloc_ft malloc_fp;
	free_ft   free_fp;

} pool_t;


// -- Array type --

// An untyped buffer together with an element count and an element size.
typedef struct
{
	void*     buf;

	siz_t     num_elem;
	siz_t     elem_size;

} array_t;


// -- Locked pool-of-arrays-of-pools type --

// A pool_t stored by value alongside the mutex associated with it.
typedef struct
{
	bli_pthread_mutex_t mutex;
	pool_t              pool;

	siz_t               def_array_len;

} apool_t;


// -- packing block allocator: Locked set of pools type --

typedef struct pba_s
{
	// NOTE(review): the meaning of the fixed count of 3 is not stated here;
	// presumably one pool per pool-backed packbuf_t value -- confirm.
	pool_t              pools[3];
	bli_pthread_mutex_t mutex;

	// These fields are used for general-purpose allocation.
	siz_t               align_size;
	malloc_ft           malloc_fp;
	free_ft             free_fp;

} pba_t;


// -- Memory object type --

// A pool block plus the buffer type, the pool it is associated with and a
// size.
typedef struct mem_s
{
	pblk_t    pblk;
	packbuf_t buf_type;
	pool_t*   pool;
	siz_t     size;
} mem_t;


// -- Control tree node type --

// A node links to up to two other nodes of the same type through sub_prenode
// and sub_node.
struct cntl_s
{
	// Basic fields (usually required).
	opid_t         family;
	bszid_t        bszid;
	void_fp        var_func;
	struct cntl_s* sub_prenode;
	struct cntl_s* sub_node;

	// Optional fields (needed only by some operations such as packm).
	// NOTE: first field of params must be a uint64_t containing the size
	// of the struct.
	void*          params;

	// Internal fields that track "cached" data.
	mem_t          pack_mem;
};
typedef struct cntl_s cntl_t;


// -- Blocksize object type --

// Holds one primary value and one extension value per floating-point type.
typedef struct blksz_s
{
	// Primary blocksize values.
	dim_t  v[BLIS_NUM_FP_TYPES];

	// Blocksize extensions.
	dim_t  e[BLIS_NUM_FP_TYPES];

} blksz_t;


// -- Function pointer object type --

// Holds one generic function pointer per floating-point type.
typedef struct func_s
{
	// Kernel function address.
	void_fp ptr[BLIS_NUM_FP_TYPES];

} func_t;


// -- Multi-boolean object type --

// Holds one boolean per floating-point type.
typedef struct mbool_s
{
	bool v[BLIS_NUM_FP_TYPES];

} mbool_t;


// -- Auxiliary kernel info type --

// Note: This struct is used by macro-kernels to package together extra
// parameter values that may be of use to the micro-kernel without
// cluttering up the micro-kernel interface itself.

typedef struct
{
	// The pack schemas of A and B.
	pack_t schema_a;
	pack_t schema_b;

	// Pointers to the micro-panels of A and B which will be used by the
	// next call to the micro-kernel.
	void*  a_next;
	void*  b_next;

	// The imaginary strides of A and B.
	inc_t  is_a;
	inc_t  is_b;

	// The panel strides of A and B.
	// NOTE: These are only used in situations where iteration over the
	// micropanels takes place in part within the kernel code (e.g. sup
	// millikernels).
	inc_t  ps_a;
	inc_t  ps_b;

	// The type to convert to on output.
	//num_t  dt_on_output;

	// (Virtual) microkernel address and additional parameters.
	void_fp ukr;
	void*   params;

} auxinfo_t;


// -- Global scalar constant data struct --

// Note: This struct is used only when statically initializing the
// global scalar constants in bli_const.c.
// Holds one value in each of five representations: float, double, scomplex,
// dcomplex and gint_t.
typedef struct constdata_s
{
	float    s;
	double   d;
	scomplex c;
	dcomplex z;
	gint_t   i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the pack function that can be stored in an object's pack_fn
// field. It takes two objects (a, ap) plus the context, runtime, control
// tree node and thread info.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of the kernel function that can be stored in an object's ker_fn
// field. It takes three objects (a, b, c) plus the context, runtime, control
// tree node and thread info.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// NOTE: The initializer macros and the copy functions defined after this
// struct name its fields one by one; update them whenever a field is added,
// removed or renamed.
typedef struct obj_s
{
	// Basic fields
	struct obj_s* root;

	dim_t         off[2];
	dim_t         dim[2];
	doff_t        diag_off;

	objbits_t     info;
	objbits_t     info2;
	siz_t         elem_size;

	void*         buffer;
	inc_t         rs;
	inc_t         cs;
	inc_t         is;

	// Bufferless scalar storage
	atom_t        scalar;

	// Pack-related fields
	dim_t         m_padded; // m dimension of matrix, including any padding
	dim_t         n_padded; // n dimension of matrix, including any padding
	inc_t         ps;       // panel stride (distance to next panel)
	inc_t         pd;       // panel dimension (the "width" of a panel:
	                        // usually MR or NR)
	dim_t         m_panel;  // m dimension of a "full" panel
	dim_t         n_panel;  // n dimension of a "full" panel

	// User-customizable fields
	obj_pack_fn_t pack_fn;
	void*         pack_params;
	obj_ker_fn_t  ker_fn;
	void*         ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)
//
// The two initializers below are identical except for .dim, which is
// { 0, 0 } in BLIS_OBJECT_INITIALIZER and { 1, 1 } in
// BLIS_OBJECT_INITIALIZER_1X1. Both expand to a brace-enclosed list of
// designated initializers, so they can only be used to initialize an obj_t.

#define BLIS_OBJECT_INITIALIZER \
{ \
	.root      = NULL, \
\
	.off       = { 0, 0 }, \
	.dim       = { 0, 0 }, \
	.diag_off  = 0, \
\
	.info      = 0x0 | BLIS_BITVAL_DENSE      | \
	                   BLIS_BITVAL_GENERAL, \
	.info2     = 0x0, \
	.elem_size = sizeof( float ), \
\
	.buffer    = NULL, \
	.rs        = 0, \
	.cs        = 0, \
	.is        = 1, \
\
	.scalar    = { 0.0, 0.0 }, \
\
	.m_padded  = 0, \
	.n_padded  = 0, \
	.ps        = 0, \
	.pd        = 0, \
	.m_panel   = 0, \
	.n_panel   = 0, \
\
	.pack_fn     = NULL, \
	.pack_params = NULL, \
	.ker_fn      = NULL, \
	.ker_params  = NULL  \
}

#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
	.root      = NULL, \
\
	.off       = { 0, 0 }, \
	.dim       = { 1, 1 }, \
	.diag_off  = 0, \
\
	.info      = 0x0 | BLIS_BITVAL_DENSE      | \
	                   BLIS_BITVAL_GENERAL, \
	.info2     = 0x0, \
	.elem_size = sizeof( float ), \
\
	.buffer    = NULL, \
	.rs        = 0, \
	.cs        = 0, \
	.is        = 1, \
\
	.scalar    = { 0.0, 0.0 }, \
\
	.m_padded  = 0, \
	.n_padded  = 0, \
	.ps        = 0, \
	.pd        = 0, \
	.m_panel   = 0, \
	.n_panel   = 0, \
\
	.pack_fn     = NULL, \
	.pack_params = NULL, \
	.ker_fn      = NULL, \
	.ker_params  = NULL  \
}

// Define the following copy helpers here since they must be updated if the
// contents of obj_t change.
// Make b a shallow copy of a: every field of obj_t is duplicated, and the
// data that a->buffer points to is shared rather than copied.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
	// All members of obj_t take part in the copy, so a single struct
	// assignment stands in for the member-by-member list. A field added
	// to obj_t later is picked up automatically.
	*b = *a;
}

// Initialize b as a sub-partition view of a: every field of a is copied
// except the dimensions dim[0] and dim[1], which the caller is expected to
// overwrite. b->dim is neither read nor written here.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
	const obj_t* src = a;
	obj_t*       dst = b;

	// Identity of the underlying matrix and its storage.
	dst->root      = src->root;
	dst->buffer    = src->buffer;
	dst->elem_size = src->elem_size;

	// Offsets and diagonal offset. The dimensions are skipped on purpose;
	// see the function comment.
	for ( int i = 0; i < 2; ++i )
		dst->off[ i ] = src->off[ i ];
	dst->diag_off  = src->diag_off;

	// Property bitfields.
	dst->info      = src->info;
	dst->info2     = src->info2;

	// Row, column and imaginary strides.
	dst->rs        = src->rs;
	dst->cs        = src->cs;
	dst->is        = src->is;

	// Bufferless scalar storage.
	dst->scalar    = src->scalar;

	// Pack-related fields.
	dst->m_padded  = src->m_padded;
	dst->n_padded  = src->n_padded;
	dst->ps        = src->ps;
	dst->pd        = src->pd;
	dst->m_panel   = src->m_panel;
	dst->n_panel   = src->n_panel;

	// User-customizable fields.
	dst->pack_fn     = src->pack_fn;
	dst->pack_params = src->pack_params;
	dst->ker_fn      = src->ker_fn;
	dst->ker_params  = src->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// Each public macro NAME(...) forwards to a helper NAME_(...) that performs
// the actual ## pasting. Arguments that are themselves macros are expanded
// during the forwarding step, before the helper pastes them.

// PASTEMACn: build a "bli_"-prefixed identifier from n characters/prefixes
// followed by the operation name, e.g. PASTEMAC(d,gemm) -> bli_dgemm.
#define PASTEMAC0_(op)                             bli_ ## op
#define PASTEMAC0(op)                              PASTEMAC0_(op)

#define PASTEMAC_(ch,op)                           bli_ ## ch  ## op
#define PASTEMAC(ch,op)                            PASTEMAC_(ch,op)

#define PASTEMAC2_(ch1,ch2,op)                     bli_ ## ch1 ## ch2 ## op
#define PASTEMAC2(ch1,ch2,op)                      PASTEMAC2_(ch1,ch2,op)

#define PASTEMAC3_(ch1,ch2,ch3,op)                 bli_ ## ch1 ## ch2 ## ch3 ## op
#define PASTEMAC3(ch1,ch2,ch3,op)                  PASTEMAC3_(ch1,ch2,ch3,op)

#define PASTEMAC4_(ch1,ch2,ch3,ch4,op)             bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
#define PASTEMAC4(ch1,ch2,ch3,ch4,op)              PASTEMAC4_(ch1,ch2,ch3,ch4,op)

#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)         bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op)          PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)

#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)     bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op)      PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)

// PASTEBLACHK: build the identifier bla_<op>_check.
#define PASTEBLACHK_(op)                           bla_ ## op ## _check
#define PASTEBLACHK(op)                            PASTEBLACHK_(op)

// PASTECHn: like PASTEMACn but without the "bli_" prefix; the arguments are
// simply joined in order.
#define PASTECH0_(op)                              op
#define PASTECH0(op)                               PASTECH0_(op)

#define PASTECH_(ch,op)                            ch ## op
#define PASTECH(ch,op)                             PASTECH_(ch,op)

#define PASTECH2_(ch1,ch2,op)                      ch1 ## ch2 ## op
#define PASTECH2(ch1,ch2,op)                       PASTECH2_(ch1,ch2,op)

#define PASTECH3_(ch1,ch2,ch3,op)                  ch1 ## ch2 ## ch3 ## op
#define PASTECH3(ch1,ch2,ch3,op)                   PASTECH3_(ch1,ch2,ch3,op)

// MKSTR stringifies its argument exactly as written. STRINGIFY_INT forwards
// to MKSTR so that a macro argument is expanded first and its value, rather
// than its name, is stringified.
#define MKSTR(s1)                                  #s1
#define STRINGIFY_INT( s )                         MKSTR( s )

// Fortran-77 name-mangling macros.
// Each macro joins its leading characters and the routine name and appends a
// trailing underscore, e.g. PASTEF77(d,gemm) -> dgemm_.
#define PASTEF770(name)                                      name ## _
#define PASTEF77(ch1,name)                     ch1        ## name ## _
#define PASTEF772(ch1,ch2,name)                ch1 ## ch2 ## name ## _
#define PASTEF773(ch1,ch2,ch3,name)     ch1 ## ch2 ## ch3 ## name ## _

// -- Include other groups of macros

// begin bli_genarray_macro_defs.h


#ifndef BLIS_GENARRAY_MACRO_DEFS_H
#define BLIS_GENARRAY_MACRO_DEFS_H


// -- Macros to generate function arrays ---------------------------------------

// In every array generated below, the entries along each dimension are listed
// in the order s, c, d, z.
// NOTE(review): presumably this matches the numeric order of the datatype
// ids used to index these arrays -- confirm against the num_t definition.
//
// The GENARRAY_FPA* ("smart") macros emit a complete static array definition
// named <opname>_fpa (or <op>_fpa2) and cast every entry to tname. The other
// GENARRAY* macros emit only the declarator and initializer; the caller
// writes the storage class and element type in front of the invocation.

// -- "Smart" one-operand macro --

#define GENARRAY_FPA(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \
{ \
	( tname )PASTEMAC(s,opname), \
	( tname )PASTEMAC(c,opname), \
	( tname )PASTEMAC(d,opname), \
	( tname )PASTEMAC(z,opname)  \
}

// -- "Smart" one-operand macro (with integer support) --

// Same as GENARRAY_FPA with one extra trailing entry for the integer (i)
// variant, hence the array length of BLIS_NUM_FP_TYPES+1.
#define GENARRAY_FPA_I(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \
{ \
	( tname )PASTEMAC(s,opname), \
	( tname )PASTEMAC(c,opname), \
	( tname )PASTEMAC(d,opname), \
	( tname )PASTEMAC(z,opname), \
	( tname )PASTEMAC(i,opname)  \
}

// -- "Smart" two-operand macro --

#define GENARRAY_FPA2(tname,op) \
\
static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \
	{ ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \
	{ ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \
	{ ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) }  \
}

// -- One-operand macro --

#define GENARRAY(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
	PASTEMAC(s,op), \
	PASTEMAC(c,op), \
	PASTEMAC(d,op), \
	PASTEMAC(z,op)  \
}

// Same as GENARRAY with one extra trailing entry for the integer (i) variant.
#define GENARRAY_I(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES+1] = \
{ \
	PASTEMAC(s,op), \
	PASTEMAC(c,op), \
	PASTEMAC(d,op), \
	PASTEMAC(z,op), \
	PASTEMAC(i,op)  \
}

// -- One-operand macro (with custom prefix) --

// Entries are named <prefix><ch><op> with no implicit "bli_" prefix.
#define GENARRAY_PREF(arrayname,prefix,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
	PASTECH2(prefix,s,op), \
	PASTECH2(prefix,c,op), \
	PASTECH2(prefix,d,op), \
	PASTECH2(prefix,z,op)  \
}

// -- Two-operand macros --

// _ALL fills every type combination. _EXT fills only the combinations in
// which both characters come from the same pair, {s,c} or {d,z}, and leaves
// the rest NULL. _MIN fills only the diagonal (both characters identical)
// and leaves the rest NULL.

#define GENARRAY2_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
	{ PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
	{ PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
	{ PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

#define GENARRAY2_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL,              NULL,             }, \
	{ PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL,              NULL,             }, \
	{ NULL,              NULL,              PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
	{ NULL,              NULL,              PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

#define GENARRAY2_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ PASTEMAC2(s,s,op), NULL,              NULL,              NULL,             }, \
	{ NULL,              PASTEMAC2(c,c,op), NULL,              NULL,             }, \
	{ NULL,              NULL,              PASTEMAC2(d,d,op), NULL,             }, \
	{ NULL,              NULL,              NULL,              PASTEMAC2(z,z,op) }  \
}

// -- Three-operand macros --

// Three-dimensional counterparts of the two-operand macros, with the same
// _ALL / _EXT / _MIN fill patterns.

#define GENARRAY3_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ \
	{ PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \
	{ PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \
	{ PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \
	{ PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) }  \
	}, \
	{ \
	{ PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \
	{ PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \
	{ PASTEMAC3(c,d,s,op), PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \
	{ PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) }  \
	}, \
	{ \
	{ PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \
	{ PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \
	{ PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
	{ PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
	}, \
	{ \
	{ PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \
	{ PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \
	{ PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
	{ PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
	} \
}

#define GENARRAY3_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ \
	{ PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL,                NULL,               }, \
	{ PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL,                NULL,               }, \
	{ PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
	{ NULL,                NULL,                PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
	{ NULL,                NULL,                PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
	} \
}

#define GENARRAY3_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
	{ \
	{ PASTEMAC3(s,s,s,op), NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                PASTEMAC3(d,d,d,op), NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }  \
	}, \
	{ \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                NULL,               }, \
	{ NULL,                NULL,                NULL,                PASTEMAC3(z,z,z,op) }  \
	} \
}


#endif
// end bli_genarray_macro_defs.h
// begin bli_gentdef_macro_defs.h


#ifndef BLIS_GENTDEF_MACRO_DEFS_H
#define BLIS_GENTDEF_MACRO_DEFS_H

//
// -- MACROS TO INSERT TYPEDEF-GENERATING MACROS -------------------------------
//

// Each INSERT_* macro below invokes a GENTDEF/GENTDEFR macro that the
// including code must define beforehand. It is invoked once per datatype with
// the concrete C type and the _ft suffix, once per datatype with void and the
// _vft suffix, and finally once with void and an empty type character.


// -- function typedef macro (both typed and void) --

#define INSERT_GENTDEF( opname ) \
\
GENTDEF( float,    s, opname, _ft ) \
GENTDEF( double,   d, opname, _ft ) \
GENTDEF( scomplex, c, opname, _ft ) \
GENTDEF( dcomplex, z, opname, _ft ) \
\
GENTDEF( void,     s, opname, _vft ) \
GENTDEF( void,     d, opname, _vft ) \
GENTDEF( void,     c, opname, _vft ) \
GENTDEF( void,     z, opname, _vft ) \
\
GENTDEF( void,      , opname, _vft )

// -- function typedef macro (both typed and void) with real projection --

// As INSERT_GENTDEF, but each invocation also passes the real projection of
// the type and its character (float/s for scomplex, double/d for dcomplex).
#define INSERT_GENTDEFR( opname ) \
\
GENTDEFR( float,    float,  s, s, opname, _ft ) \
GENTDEFR( double,   double, d, d, opname, _ft ) \
GENTDEFR( scomplex, float,  c, s, opname, _ft ) \
GENTDEFR( dcomplex, double, z, d, opname, _ft ) \
\
GENTDEFR( void,     void,   s, s, opname, _vft ) \
GENTDEFR( void,     void,   d, d, opname, _vft ) \
GENTDEFR( void,     void,   c, s, opname, _vft ) \
GENTDEFR( void,     void,   z, d, opname, _vft ) \
\
GENTDEFR( void,     void,    ,  , opname, _vft )


#endif
// end bli_gentdef_macro_defs.h
// begin bli_gentfunc_macro_defs.h



#ifndef BLIS_GENTFUNC_MACRO_DEFS_H
#define BLIS_GENTFUNC_MACRO_DEFS_H

//
// -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------
//



// -- Macros for generating BLAS routines --------------------------------------


// -- Basic one-operand macro --
// Expands to four GENTFUNC invocations, one per (C type, type character)
// pair: (float, s), (double, d), (scomplex, c), (dcomplex, z). blasname and
// blisname are forwarded unchanged to every invocation.
// NOTE(review): none of the generator macros used in this section (GENTFUNC,
// GENTFUNCRO, GENTFUNCCO, GENTFUNCDOT, GENTFUNCR) is defined here; presumably
// the code that invokes an INSERT_* macro defines the matching generator
// first -- confirm at the use sites.
#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \
    GENTFUNC( float,    s, blasname, blisname ) \
    GENTFUNC( double,   d, blasname, blisname ) \
    GENTFUNC( scomplex, c, blasname, blisname ) \
    GENTFUNC( dcomplex, z, blasname, blisname )


// -- Basic one-operand macro with real domain only --

// Only the two real instances are listed: (float, s) and (double, d).
#define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \
    GENTFUNCRO( float,  s, blasname, blisname ) \
    GENTFUNCRO( double, d, blasname, blisname )


// -- Basic one-operand macro with complex domain only and real projection --

// Only the two complex instances are listed; each one also carries its real
// counterpart: (scomplex, float, c, s) and (dcomplex, double, z, d).
#define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \
    GENTFUNCCO( scomplex, float,  c, s, blasname, blisname ) \
    GENTFUNCCO( dcomplex, double, z, d, blasname, blisname )


// -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) --

// Real instances: the third argument is intentionally left empty and the
// fourth is always BLIS_NO_CONJUGATE.
#define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
    GENTFUNCDOT( float,  s, /* none */, BLIS_NO_CONJUGATE, blasname, blisname ) \
    GENTFUNCDOT( double, d, /* none */, BLIS_NO_CONJUGATE, blasname, blisname )


// -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) --

// Complex instances: each type appears twice, once with third argument 'c'
// paired with BLIS_CONJUGATE and once with 'u' paired with BLIS_NO_CONJUGATE.
#define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \
    GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE,    blasname, blisname ) \
    GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \
    GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE,    blasname, blisname ) \
    GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname )


// -- Basic one-operand macro with conjugation (used only for dot, ger) --

// The real list followed by the complex list, in that order.
#define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \
    INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
    INSERT_GENTFUNCDOTC_BLAS( blasname, blisname )


// -- Basic one-operand macro with real projection --

// Takes two BLAS-side names: rblasname is forwarded to the float/double
// instances and cblasname to the scomplex/dcomplex instances. blisname is
// forwarded to all four.
#define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \
    GENTFUNCR( float,    float,  s, s, rblasname, blisname ) \
    GENTFUNCR( double,   double, d, d, rblasname, blisname ) \
    GENTFUNCR( scomplex, float,  c, s, cblasname, blisname ) \
    GENTFUNCR( dcomplex, double, z, d, cblasname, blisname )


// -- Alternate
// two-operand macro (one char for complex, one for real proj) --

// Four GENTFUNCR2 invocations. For the real instances the fourth argument is
// intentionally left empty; for the complex instances it is the type
// character of the real counterpart (s for c, d for z).
// NOTE(review): the generator macros used in this section (GENTFUNCR2,
// GENTFUNCSCAL, GENTFUNC, GENTFUNCR) are not defined here; presumably the
// code that invokes an INSERT_* macro defines them first -- confirm at the
// use sites.
#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
    GENTFUNCR2( float,    float,  s, /* none */, blasname, blisname ) \
    GENTFUNCR2( double,   double, d, /* none */, blasname, blisname ) \
    GENTFUNCR2( scomplex, float,  c, s,          blasname, blisname ) \
    GENTFUNCR2( dcomplex, double, z, d,          blasname, blisname )


// -- Extended two-operand macro (used only for scal) --

// Six GENTFUNCSCAL invocations: the four same-type instances (fourth argument
// left empty) followed by the two complex-with-real instances
// (scomplex, float, c, s) and (dcomplex, double, z, d).
#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
    GENTFUNCSCAL( float,    float,    s, /* none */, blasname, blisname ) \
    GENTFUNCSCAL( double,   double,   d, /* none */, blasname, blisname ) \
    GENTFUNCSCAL( scomplex, scomplex, c, /* none */, blasname, blisname ) \
    GENTFUNCSCAL( dcomplex, dcomplex, z, /* none */, blasname, blisname ) \
    GENTFUNCSCAL( scomplex, float,    c, s,          blasname, blisname ) \
    GENTFUNCSCAL( dcomplex, double,   z, d,          blasname, blisname )


// -- Macros for functions with one operand ------------------------------------


// -- Basic one-operand macro --

// The INSERT_GENTFUNC_BASIC* family differs only in how many extra names are
// forwarded after tfuncname (zero to four). Each member lists the same four
// instances: (float, s), (double, d), (scomplex, c), (dcomplex, z).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
    GENTFUNC( float,    s, tfuncname ) \
    GENTFUNC( double,   d, tfuncname ) \
    GENTFUNC( scomplex, c, tfuncname ) \
    GENTFUNC( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
    GENTFUNC( float,    s, tfuncname, varname ) \
    GENTFUNC( double,   d, tfuncname, varname ) \
    GENTFUNC( scomplex, c, tfuncname, varname ) \
    GENTFUNC( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNC( float,    s, tfuncname, varname1, varname2 ) \
    GENTFUNC( double,   d, tfuncname, varname1, varname2 ) \
    GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
    GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( float,    s, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( double,   d, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( float,    s, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( double,   d, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand with real projection --

// Same layout as the BASIC family, but every GENTFUNCR invocation carries a
// second type and type character: (float, float, s, s),
// (double, double, d, d), (scomplex, float, c, s), (dcomplex, double, z, d).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
    GENTFUNCR( float,    float,  s, s, tfuncname ) \
    GENTFUNCR( double,   double, d, d, tfuncname ) \
    GENTFUNCR( scomplex, float,  c, s, tfuncname ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
    GENTFUNCR( float,    float,  s, s, tfuncname, varname ) \
    GENTFUNCR( double,   double, d, d, tfuncname, varname ) \
    GENTFUNCR( scomplex, float,  c, s, tfuncname, varname ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2 ) \
    GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2 ) \
    GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2 ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
// arguments) --

// NOTE(review): the generator macros used in this section (GENTFUNC,
// GENTFUNCR, GENTFUNCRO, GENTFUNCCO, GENTFUNCI, GENTFUNCRI, GENTFUNC2,
// GENTFUNC2R, GENTFUNC3, GENTFUNC3U12) are not defined here; presumably the
// code that invokes an INSERT_* macro defines the matching generator first
// -- confirm at the use sites.
//
// Type characters pair with C types as follows throughout:
//   s = float, d = double, c = scomplex, z = dcomplex, i = gint_t.
//
// No definition below ends with a line-continuation backslash after its last
// entry, so none of them depends on the line that follows it.

#define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand macro with real domain only --

// Only the two real instances are listed: (float, s) and (double, d).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \
    GENTFUNCRO( float,  s, tfuncname ) \
    GENTFUNCRO( double, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \
    GENTFUNCRO( float,  s, tfuncname, varname ) \
    GENTFUNCRO( double, d, tfuncname, varname )


// -- Basic one-operand macro with complex domain only and real projection --

// Only the two complex instances are listed, each with its real counterpart:
// (scomplex, float, c, s) and (dcomplex, double, z, d).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \
    GENTFUNCCO( scomplex, float,  c, s, tfuncname ) \
    GENTFUNCCO( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \
    GENTFUNCCO( scomplex, float,  c, s, tfuncname, varname ) \
    GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNCCO( scomplex, float,  c, s, tfuncname, varname1, varname2 ) \
    GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCCO( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3 ) \
    GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )


// -- Basic one-operand macro with integer instance --

// The four floating-point instances plus a fifth one, (gint_t, i).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \
    GENTFUNC( float,    s, tfuncname ) \
    GENTFUNC( double,   d, tfuncname ) \
    GENTFUNC( scomplex, c, tfuncname ) \
    GENTFUNC( dcomplex, z, tfuncname ) \
    GENTFUNC( gint_t,   i, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \
    GENTFUNC( float,    s, tfuncname, varname ) \
    GENTFUNC( double,   d, tfuncname, varname ) \
    GENTFUNC( scomplex, c, tfuncname, varname ) \
    GENTFUNC( dcomplex, z, tfuncname, varname ) \
    GENTFUNC( gint_t,   i, tfuncname, varname )


// -- Basic one-operand with integer projection --

// Every floating-point instance is paired with (gint_t, i).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCI_BASIC0( tfuncname ) \
    GENTFUNCI( float,    gint_t, s, i, tfuncname ) \
    GENTFUNCI( double,   gint_t, d, i, tfuncname ) \
    GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \
    GENTFUNCI( dcomplex, gint_t, z, i, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \
    GENTFUNCI( float,    gint_t, s, i, tfuncname, varname ) \
    GENTFUNCI( double,   gint_t, d, i, tfuncname, varname ) \
    GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \
    GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname )


// -- Basic one-operand with real and integer projections --

// Every instance carries its real counterpart and (gint_t, i).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \
    GENTFUNCRI( float,    float,  gint_t, s, s, i, tfuncname ) \
    GENTFUNCRI( double,   double, gint_t, d, d, i, tfuncname ) \
    GENTFUNCRI( scomplex, float,  gint_t, c, s, i, tfuncname ) \
    GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname )


// -- Macros for functions with two primary operands ---------------------------


// -- Basic two-operand macro --

// Both operands have the same type in every instance.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_BASIC0( tfuncname ) \
    GENTFUNC2( float,    float,    s, s, tfuncname ) \
    GENTFUNC2( double,   double,   d, d, tfuncname ) \
    GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \
    GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \
    GENTFUNC2( float,    float,    s, s, tfuncname, varname ) \
    GENTFUNC2( double,   double,   d, d, tfuncname, varname ) \
    GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \
    GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname )


// -- Mixed domain two-operand macro --

// The listed pairs are (s,c), (c,s), (d,z) and (z,d).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \
    GENTFUNC2( float,    scomplex, s, c, tfuncname ) \
    GENTFUNC2( scomplex, float,    c, s, tfuncname ) \
    \
    GENTFUNC2( double,   dcomplex, d, z, tfuncname ) \
    GENTFUNC2( dcomplex, double,   z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \
    GENTFUNC2( float,    scomplex, s, c, tfuncname, varname ) \
    GENTFUNC2( scomplex, float,    c, s, tfuncname, varname ) \
    \
    GENTFUNC2( double,   dcomplex, d, z, tfuncname, varname ) \
    GENTFUNC2( dcomplex, double,   z, d, tfuncname, varname )


// -- Mixed precision two-operand macro --

// The listed pairs are those where one operand is s or c and the other is
// d or z (eight pairs).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \
    GENTFUNC2( float,    double,   s, d, tfuncname ) \
    GENTFUNC2( float,    dcomplex, s, z, tfuncname ) \
    \
    GENTFUNC2( double,   float,    d, s, tfuncname ) \
    GENTFUNC2( double,   scomplex, d, c, tfuncname ) \
    \
    GENTFUNC2( scomplex, double,   c, d, tfuncname ) \
    GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
    \
    GENTFUNC2( dcomplex, float,    z, s, tfuncname ) \
    GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \
    GENTFUNC2( float,    double,   s, d, tfuncname, varname ) \
    GENTFUNC2( float,    dcomplex, s, z, tfuncname, varname ) \
    \
    GENTFUNC2( double,   float,    d, s, tfuncname, varname ) \
    GENTFUNC2( double,   scomplex, d, c, tfuncname, varname ) \
    \
    GENTFUNC2( scomplex, double,   c, d, tfuncname, varname ) \
    GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
    \
    GENTFUNC2( dcomplex, float,    z, s, tfuncname, varname ) \
    GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro --

// All twelve pairs of two different type characters.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIXDP0( tfuncname ) \
    GENTFUNC2( float,    double,   s, d, tfuncname ) \
    GENTFUNC2( float,    scomplex, s, c, tfuncname ) \
    GENTFUNC2( float,    dcomplex, s, z, tfuncname ) \
    \
    GENTFUNC2( double,   float,    d, s, tfuncname ) \
    GENTFUNC2( double,   scomplex, d, c, tfuncname ) \
    GENTFUNC2( double,   dcomplex, d, z, tfuncname ) \
    \
    GENTFUNC2( scomplex, float,    c, s, tfuncname ) \
    GENTFUNC2( scomplex, double,   c, d, tfuncname ) \
    GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
    \
    GENTFUNC2( dcomplex, float,    z, s, tfuncname ) \
    GENTFUNC2( dcomplex, double,   z, d, tfuncname ) \
    GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// Alias spelled like the sibling names in this section (_MIX_D0, _MIX_P0,
// _MIX_DP). The original spelling above is kept for existing users.
#define INSERT_GENTFUNC2_MIX_DP0( tfuncname ) \
    INSERT_GENTFUNC2_MIXDP0( tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \
    GENTFUNC2( float,    double,   s, d, tfuncname, varname ) \
    GENTFUNC2( float,    scomplex, s, c, tfuncname, varname ) \
    GENTFUNC2( float,    dcomplex, s, z, tfuncname, varname ) \
    \
    GENTFUNC2( double,   float,    d, s, tfuncname, varname ) \
    GENTFUNC2( double,   scomplex, d, c, tfuncname, varname ) \
    GENTFUNC2( double,   dcomplex, d, z, tfuncname, varname ) \
    \
    GENTFUNC2( scomplex, float,    c, s, tfuncname, varname ) \
    GENTFUNC2( scomplex, double,   c, d, tfuncname, varname ) \
    GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
    \
    GENTFUNC2( dcomplex, float,    z, s, tfuncname, varname ) \
    GENTFUNC2( dcomplex, double,   z, d, tfuncname, varname ) \
    GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Basic two-operand with real projection of second operand --

// Each GENTFUNC2R invocation carries a third type/character: the real
// counterpart of the second operand (s for s and c, d for d and z).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \
    GENTFUNC2R( float,    float,    float,  s, s, s, tfuncname ) \
    GENTFUNC2R( double,   double,   double, d, d, d, tfuncname ) \
    GENTFUNC2R( scomplex, scomplex, float,  c, c, s, tfuncname ) \
    GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \
    GENTFUNC2R( float,    float,    float,  s, s, s, tfuncname, varname ) \
    GENTFUNC2R( double,   double,   double, d, d, d, tfuncname, varname ) \
    GENTFUNC2R( scomplex, scomplex, float,  c, c, s, tfuncname, varname ) \
    GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )


// -- Mixed domain two-operand with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \
    GENTFUNC2R( float,    scomplex, float,  s, c, s, tfuncname ) \
    GENTFUNC2R( scomplex, float,    float,  c, s, s, tfuncname ) \
    \
    GENTFUNC2R( double,   dcomplex, double, d, z, d, tfuncname ) \
    GENTFUNC2R( dcomplex, double,   double, z, d, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \
    GENTFUNC2R( float,    scomplex, float,  s, c, s, tfuncname, varname ) \
    GENTFUNC2R( scomplex, float,    float,  c, s, s, tfuncname, varname ) \
    \
    GENTFUNC2R( double,   dcomplex, double, d, z, d, tfuncname, varname ) \
    GENTFUNC2R( dcomplex, double,   double, z, d, d, tfuncname, varname )


// -- Mixed precision two-operand with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \
    GENTFUNC2R( float,    double,   double, s, d, d, tfuncname ) \
    GENTFUNC2R( float,    dcomplex, double, s, z, d, tfuncname ) \
    \
    GENTFUNC2R( double,   float,    float,  d, s, s, tfuncname ) \
    GENTFUNC2R( double,   scomplex, float,  d, c, s, tfuncname ) \
    \
    GENTFUNC2R( scomplex, double,   double, c, d, d, tfuncname ) \
    GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
    \
    GENTFUNC2R( dcomplex, float,    float,  z, s, s, tfuncname ) \
    GENTFUNC2R( dcomplex, scomplex, float,  z, c, s, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \
    GENTFUNC2R( float,    double,   double, s, d, d, tfuncname, varname ) \
    GENTFUNC2R( float,    dcomplex, double, s, z, d, tfuncname, varname ) \
    \
    GENTFUNC2R( double,   float,    float,  d, s, s, tfuncname, varname ) \
    GENTFUNC2R( double,   scomplex, float,  d, c, s, tfuncname, varname ) \
    \
    GENTFUNC2R( scomplex, double,   double, c, d, d, tfuncname, varname ) \
    GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
    \
    GENTFUNC2R( dcomplex, float,    float,  z, s, s, tfuncname, varname ) \
    GENTFUNC2R( dcomplex, scomplex, float,  z, c, s, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \
    GENTFUNC2R( float,    double,   double, s, d, d, tfuncname ) \
    GENTFUNC2R( float,    scomplex, float,  s, c, s, tfuncname ) \
    GENTFUNC2R( float,    dcomplex, double, s, z, d, tfuncname ) \
    \
    GENTFUNC2R( double,   float,    float,  d, s, s, tfuncname ) \
    GENTFUNC2R( double,   scomplex, float,  d, c, s, tfuncname ) \
    GENTFUNC2R( double,   dcomplex, double, d, z, d, tfuncname ) \
    \
    GENTFUNC2R( scomplex, float,    float,  c, s, s, tfuncname ) \
    GENTFUNC2R( scomplex, double,   double, c, d, d, tfuncname ) \
    GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
    \
    GENTFUNC2R( dcomplex, float,    float,  z, s, s, tfuncname ) \
    GENTFUNC2R( dcomplex, double,   double, z, d, d, tfuncname ) \
    GENTFUNC2R( dcomplex, scomplex, float,  z, c, s, tfuncname )

// Alias spelled like the sibling names in this section (_MIX_D0, _MIX_P0,
// _MIX_DP). The original spelling above is kept for existing users.
#define INSERT_GENTFUNC2R_MIX_DP0( tfuncname ) \
    INSERT_GENTFUNC2R_MIXDP0( tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \
    GENTFUNC2R( float,    double,   double, s, d, d, tfuncname, varname ) \
    GENTFUNC2R( float,    scomplex, float,  s, c, s, tfuncname, varname ) \
    GENTFUNC2R( float,    dcomplex, double, s, z, d, tfuncname, varname ) \
    \
    GENTFUNC2R( double,   float,    float,  d, s, s, tfuncname, varname ) \
    GENTFUNC2R( double,   scomplex, float,  d, c, s, tfuncname, varname ) \
    GENTFUNC2R( double,   dcomplex, double, d, z, d, tfuncname, varname ) \
    \
    GENTFUNC2R( scomplex, float,    float,  c, s, s, tfuncname, varname ) \
    GENTFUNC2R( scomplex, double,   double, c, d, d, tfuncname, varname ) \
    GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
    \
    GENTFUNC2R( dcomplex, float,    float,  z, s, s, tfuncname, varname ) \
    GENTFUNC2R( dcomplex, double,   double, z, d, d, tfuncname, varname ) \
    GENTFUNC2R( dcomplex, scomplex, float,  z, c, s, tfuncname, varname )


// -- Macros for functions with three primary operands -------------------------


// -- Basic three-operand macro --

// All three operands have the same type in every instance.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_BASIC0( tfuncname ) \
    GENTFUNC3( float,    float,    float,    s, s, s, tfuncname ) \
    GENTFUNC3( double,   double,   double,   d, d, d, tfuncname ) \
    GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \
    GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \
    GENTFUNC3( float,    float,    float,    s, s, s, tfuncname, varname ) \
    GENTFUNC3( double,   double,   double,   d, d, d, tfuncname, varname ) \
    GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \
    GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    float,    float,    s, s, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   double,   double,   d, d, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 )


// -- Mixed domain three-operand macro --

// Twelve instances: every non-uniform triple drawn from {s, c}, followed by
// every non-uniform triple drawn from {d, z}, grouped by first operand.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \
    GENTFUNC3( float,    float,    scomplex, s, s, c, tfuncname ) \
    GENTFUNC3( float,    scomplex, float,    s, c, s, tfuncname ) \
    GENTFUNC3( float,    scomplex, scomplex, s, c, c, tfuncname ) \
    \
    GENTFUNC3( double,   double,   dcomplex, d, d, z, tfuncname ) \
    GENTFUNC3( double,   dcomplex, double,   d, z, d, tfuncname ) \
    GENTFUNC3( double,   dcomplex, dcomplex, d, z, z, tfuncname ) \
    \
    GENTFUNC3( scomplex, float,    float,    c, s, s, tfuncname ) \
    GENTFUNC3( scomplex, float,    scomplex, c, s, c, tfuncname ) \
    GENTFUNC3( scomplex, scomplex, float,    c, c, s, tfuncname ) \
    \
    GENTFUNC3( dcomplex, double,   double,   z, d, d, tfuncname ) \
    GENTFUNC3( dcomplex, double,   dcomplex, z, d, z, tfuncname ) \
    GENTFUNC3( dcomplex, dcomplex, double,   z, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \
    GENTFUNC3( float,    float,    scomplex, s, s, c, tfuncname, varname ) \
    GENTFUNC3( float,    scomplex, float,    s, c, s, tfuncname, varname ) \
    GENTFUNC3( float,    scomplex, scomplex, s, c, c, tfuncname, varname ) \
    \
    GENTFUNC3( double,   double,   dcomplex, d, d, z, tfuncname, varname ) \
    GENTFUNC3( double,   dcomplex, double,   d, z, d, tfuncname, varname ) \
    GENTFUNC3( double,   dcomplex, dcomplex, d, z, z, tfuncname, varname ) \
    \
    GENTFUNC3( scomplex, float,    float,    c, s, s, tfuncname, varname ) \
    GENTFUNC3( scomplex, float,    scomplex, c, s, c, tfuncname, varname ) \
    GENTFUNC3( scomplex, scomplex, float,    c, c, s, tfuncname, varname ) \
    \
    GENTFUNC3( dcomplex, double,   double,   z, d, d, tfuncname, varname ) \
    GENTFUNC3( dcomplex, double,   dcomplex, z, d, z, tfuncname, varname ) \
    GENTFUNC3( dcomplex, dcomplex, double,   z, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    float,    scomplex, s, s, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    scomplex, float,    s, c, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( double,   double,   dcomplex, d, d, z, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   dcomplex, double,   d, z, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( scomplex, float,    float,    c, s, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, float,    scomplex, c, s, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, scomplex, float,    c, c, s, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( dcomplex, double,   double,   z, d, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, double,   dcomplex, z, d, z, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, dcomplex, double,   z, z, d, tfuncname, varname1, varname2 )


// -- Mixed precision three-operand macro --

// Forty-eight instances: every triple of type characters except those drawn
// entirely from {s, c} or entirely from {d, z}. Entries are grouped by first
// operand, then by second operand.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \
    GENTFUNC3( float,    float,    double,   s, s, d, tfuncname ) \
    GENTFUNC3( float,    float,    dcomplex, s, s, z, tfuncname ) \
    \
    GENTFUNC3( float,    double,   float,    s, d, s, tfuncname ) \
    GENTFUNC3( float,    double,   double,   s, d, d, tfuncname ) \
    GENTFUNC3( float,    double,   scomplex, s, d, c, tfuncname ) \
    GENTFUNC3( float,    double,   dcomplex, s, d, z, tfuncname ) \
    \
    GENTFUNC3( float,    scomplex, double,   s, c, d, tfuncname ) \
    GENTFUNC3( float,    scomplex, dcomplex, s, c, z, tfuncname ) \
    \
    GENTFUNC3( float,    dcomplex, float,    s, z, s, tfuncname ) \
    GENTFUNC3( float,    dcomplex, double,   s, z, d, tfuncname ) \
    GENTFUNC3( float,    dcomplex, scomplex, s, z, c, tfuncname ) \
    GENTFUNC3( float,    dcomplex, dcomplex, s, z, z, tfuncname ) \
    \
    \
    GENTFUNC3( double,   float,    float,    d, s, s, tfuncname ) \
    GENTFUNC3( double,   float,    double,   d, s, d, tfuncname ) \
    GENTFUNC3( double,   float,    scomplex, d, s, c, tfuncname ) \
    GENTFUNC3( double,   float,    dcomplex, d, s, z, tfuncname ) \
    \
    GENTFUNC3( double,   double,   float,    d, d, s, tfuncname ) \
    GENTFUNC3( double,   double,   scomplex, d, d, c, tfuncname ) \
    \
    GENTFUNC3( double,   scomplex, float,    d, c, s, tfuncname ) \
    GENTFUNC3( double,   scomplex, double,   d, c, d, tfuncname ) \
    GENTFUNC3( double,   scomplex, scomplex, d, c, c, tfuncname ) \
    GENTFUNC3( double,   scomplex, dcomplex, d, c, z, tfuncname ) \
    \
    GENTFUNC3( double,   dcomplex, float,    d, z, s, tfuncname ) \
    GENTFUNC3( double,   dcomplex, scomplex, d, z, c, tfuncname ) \
    \
    \
    GENTFUNC3( scomplex, float,    double,   c, s, d, tfuncname ) \
    GENTFUNC3( scomplex, float,    dcomplex, c, s, z, tfuncname ) \
    \
    GENTFUNC3( scomplex, double,   float,    c, d, s, tfuncname ) \
    GENTFUNC3( scomplex, double,   double,   c, d, d, tfuncname ) \
    GENTFUNC3( scomplex, double,   scomplex, c, d, c, tfuncname ) \
    GENTFUNC3( scomplex, double,   dcomplex, c, d, z, tfuncname ) \
    \
    GENTFUNC3( scomplex, scomplex, double,   c, c, d, tfuncname ) \
    GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \
    \
    GENTFUNC3( scomplex, dcomplex, float,    c, z, s, tfuncname ) \
    GENTFUNC3( scomplex, dcomplex, double,   c, z, d, tfuncname ) \
    GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \
    GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \
    \
    \
    GENTFUNC3( dcomplex, float,    float,    z, s, s, tfuncname ) \
    GENTFUNC3( dcomplex, float,    double,   z, s, d, tfuncname ) \
    GENTFUNC3( dcomplex, float,    scomplex, z, s, c, tfuncname ) \
    GENTFUNC3( dcomplex, float,    dcomplex, z, s, z, tfuncname ) \
    \
    GENTFUNC3( dcomplex, double,   float,    z, d, s, tfuncname ) \
    GENTFUNC3( dcomplex, double,   scomplex, z, d, c, tfuncname ) \
    \
    GENTFUNC3( dcomplex, scomplex, float,    z, c, s, tfuncname ) \
    GENTFUNC3( dcomplex, scomplex, double,   z, c, d, tfuncname ) \
    GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \
    GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \
    \
    GENTFUNC3( dcomplex, dcomplex, float,    z, z, s, tfuncname ) \
    GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \
    GENTFUNC3( float,    float,    double,   s, s, d, tfuncname, varname ) \
    GENTFUNC3( float,    float,    dcomplex, s, s, z, tfuncname, varname ) \
    \
    GENTFUNC3( float,    double,   float,    s, d, s, tfuncname, varname ) \
    GENTFUNC3( float,    double,   double,   s, d, d, tfuncname, varname ) \
    GENTFUNC3( float,    double,   scomplex, s, d, c, tfuncname, varname ) \
    GENTFUNC3( float,    double,   dcomplex, s, d, z, tfuncname, varname ) \
    \
    GENTFUNC3( float,    scomplex, double,   s, c, d, tfuncname, varname ) \
    GENTFUNC3( float,    scomplex, dcomplex, s, c, z, tfuncname, varname ) \
    \
    GENTFUNC3( float,    dcomplex, float,    s, z, s, tfuncname, varname ) \
    GENTFUNC3( float,    dcomplex, double,   s, z, d, tfuncname, varname ) \
    GENTFUNC3( float,    dcomplex, scomplex, s, z, c, tfuncname, varname ) \
    GENTFUNC3( float,    dcomplex, dcomplex, s, z, z, tfuncname, varname ) \
    \
    \
    GENTFUNC3( double,   float,    float,    d, s, s, tfuncname, varname ) \
    GENTFUNC3( double,   float,    double,   d, s, d, tfuncname, varname ) \
    GENTFUNC3( double,   float,    scomplex, d, s, c, tfuncname, varname ) \
    GENTFUNC3( double,   float,    dcomplex, d, s, z, tfuncname, varname ) \
    \
    GENTFUNC3( double,   double,   float,    d, d, s, tfuncname, varname ) \
    GENTFUNC3( double,   double,   scomplex, d, d, c, tfuncname, varname ) \
    \
    GENTFUNC3( double,   scomplex, float,    d, c, s, tfuncname, varname ) \
    GENTFUNC3( double,   scomplex, double,   d, c, d, tfuncname, varname ) \
    GENTFUNC3( double,   scomplex, scomplex, d, c, c, tfuncname, varname ) \
    GENTFUNC3( double,   scomplex, dcomplex, d, c, z, tfuncname, varname ) \
    \
    GENTFUNC3( double,   dcomplex, float,    d, z, s, tfuncname, varname ) \
    GENTFUNC3( double,   dcomplex, scomplex, d, z, c, tfuncname, varname ) \
    \
    \
    GENTFUNC3( scomplex, float,    double,   c, s, d, tfuncname, varname ) \
    GENTFUNC3( scomplex, float,    dcomplex, c, s, z, tfuncname, varname ) \
    \
    GENTFUNC3( scomplex, double,   float,    c, d, s, tfuncname, varname ) \
    GENTFUNC3( scomplex, double,   double,   c, d, d, tfuncname, varname ) \
    GENTFUNC3( scomplex, double,   scomplex, c, d, c, tfuncname, varname ) \
    GENTFUNC3( scomplex, double,   dcomplex, c, d, z, tfuncname, varname ) \
    \
    GENTFUNC3( scomplex, scomplex, double,   c, c, d, tfuncname, varname ) \
    GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \
    \
    GENTFUNC3( scomplex, dcomplex, float,    c, z, s, tfuncname, varname ) \
    GENTFUNC3( scomplex, dcomplex, double,   c, z, d, tfuncname, varname ) \
    GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \
    GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \
    \
    \
    GENTFUNC3( dcomplex, float,    float,    z, s, s, tfuncname, varname ) \
    GENTFUNC3( dcomplex, float,    double,   z, s, d, tfuncname, varname ) \
    GENTFUNC3( dcomplex, float,    scomplex, z, s, c, tfuncname, varname ) \
    GENTFUNC3( dcomplex, float,    dcomplex, z, s, z, tfuncname, varname ) \
    \
    GENTFUNC3( dcomplex, double,   float,    z, d, s, tfuncname, varname ) \
    GENTFUNC3( dcomplex, double,   scomplex, z, d, c, tfuncname, varname ) \
    \
    GENTFUNC3( dcomplex, scomplex, float,    z, c, s, tfuncname, varname ) \
    GENTFUNC3( dcomplex, scomplex, double,   z, c, d, tfuncname, varname ) \
    GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \
    GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \
    \
    GENTFUNC3( dcomplex, dcomplex, float,    z, z, s, tfuncname, varname ) \
    GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    float,    double,   s, s, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    float,    dcomplex, s, s, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( float,    double,   float,    s, d, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    double,   double,   s, d, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    double,   scomplex, s, d, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    double,   dcomplex, s, d, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( float,    scomplex, double,   s, c, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( float,    dcomplex, float,    s, z, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    dcomplex, double,   s, z, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( float,    dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \
    \
    \
    GENTFUNC3( double,   float,    float,    d, s, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   float,    double,   d, s, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   float,    scomplex, d, s, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   float,    dcomplex, d, s, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( double,   double,   float,    d, d, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   double,   scomplex, d, d, c, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( double,   scomplex, float,    d, c, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   scomplex, double,   d, c, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( double,   dcomplex, float,    d, z, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( double,   dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \
    \
    \
    GENTFUNC3( scomplex, float,    double,   c, s, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, float,    dcomplex, c, s, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( scomplex, double,   float,    c, d, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, double,   double,   c, d, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, double,   scomplex, c, d, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, double,   dcomplex, c, d, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( scomplex, scomplex, double,   c, c, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( scomplex, dcomplex, float,    c, z, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, dcomplex, double,   c, z, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \
    \
    \
    GENTFUNC3( dcomplex, float,    float,    z, s, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, float,    double,   z, s, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, float,    scomplex, z, s, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, float,    dcomplex, z, s, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( dcomplex, double,   float,    z, d, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, double,   scomplex, z, d, c, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( dcomplex, scomplex, float,    z, c, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, scomplex, double,   z, c, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \
    \
    GENTFUNC3( dcomplex, dcomplex, float,    z, z, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 )


// -- Basic three-operand with union of operands 1 and 2 --

// GENTFUNC3U12 takes a fourth type/character after the three operand ones;
// in these same-type instances it equals the operand type.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \
    GENTFUNC3U12( float,    float,    float,    float,    s, s, s, s, tfuncname ) \
    GENTFUNC3U12( double,   double,   double,   double,   d, d, d, d, tfuncname ) \
    GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \
    GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \
    GENTFUNC3U12( float,    float,    float,    float,    s, s, s, s, tfuncname, varname ) \
    GENTFUNC3U12( double,   double,   double,   double,   d, d, d, d, tfuncname, varname ) \
    GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \
    GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --

// NOTE(review): the last invocation of this definition is completed by the
// text that immediately follows this span.
#define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \
    GENTFUNC3U12( float,    float,    float,    float,    s, s, s, s, tfuncname, varname1, varname2 ) \
    GENTFUNC3U12( double,   double,   double,   double,   d, d, d, d, tfuncname, varname1, varname2 ) \
    GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \
    GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname,
varname1, varname2 ) // -- Mixed domain three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, 
d, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \ GENTFUNC3U12( float, 
double, dcomplex, double, s, d, z, d, tfuncname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, 
dcomplex, scomplex, c, c, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, 
dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, 
tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, 
double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( 
scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, 
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 ) #endif // end bli_gentfunc_macro_defs.h // begin bli_gentprot_macro_defs.h #ifndef BLIS_GENTPROT_MACRO_DEFS_H #define BLIS_GENTPROT_MACRO_DEFS_H // // -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS ----------------------------- // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- #define INSERT_GENTPROT_BLAS( blasname ) \ \ GENTPROT( float, s, blasname ) \ GENTPROT( double, d, blasname ) \ GENTPROT( scomplex, c, blasname ) \ GENTPROT( dcomplex, z, blasname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTPROTRO_BLAS( blasname ) \ \ GENTPROTRO( float, s, blasname ) \ GENTPROTRO( double, d, blasname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTPROTCO_BLAS( blasname ) \ \ GENTPROTCO( scomplex, float, c, s, blasname ) \ GENTPROTCO( dcomplex, double, z, d, blasname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTR_BLAS( blasname ) \ \ GENTPROTDOT( float, s, , blasname ) \ GENTPROTDOT( double, d, , blasname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTC_BLAS( blasname ) \ \ GENTPROTDOT( scomplex, c, c, blasname ) \ GENTPROTDOT( scomplex, c, u, blasname ) \ GENTPROTDOT( dcomplex, z, c, blasname ) \ GENTPROTDOT( dcomplex, z, u, blasname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTPROTDOT_BLAS( blasname ) \ \ INSERT_GENTPROTDOTR_BLAS( blasname ) \ INSERT_GENTPROTDOTC_BLAS( blasname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTPROTR_BLAS( rblasname, 
cblasname ) \
\
    GENTPROTR( float, float, s, s, rblasname ) \
    GENTPROTR( double, double, d, d, rblasname ) \
    GENTPROTR( scomplex, float, c, s, cblasname ) \
    GENTPROTR( dcomplex, double, z, d, cblasname )

// -- Alternate two-operand macro (one char for complex, one for real proj) --
// The third argument is empty for the two real types.
#define INSERT_GENTPROTR2_BLAS( blasname ) \
    GENTPROTR2( float, float, , s, blasname ) \
    GENTPROTR2( double, double, , d, blasname ) \
    GENTPROTR2( scomplex, float, c, s, blasname ) \
    GENTPROTR2( dcomplex, double, z, d, blasname )

// -- Extended two-operand macro (used only for scal) --
// Four same-type instances (third argument empty) followed by the two
// real-with-complex pairings (float/scomplex, double/dcomplex).
#define INSERT_GENTPROTSCAL_BLAS( blasname ) \
    GENTPROTSCAL( float, float, , s, blasname ) \
    GENTPROTSCAL( double, double, , d, blasname ) \
    GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \
    GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \
    GENTPROTSCAL( float, scomplex, s, c, blasname ) \
    GENTPROTSCAL( double, dcomplex, d, z, blasname )


// -- Macros for functions with one operand ------------------------------------

// -- Basic one-operand macro --
// The numeric suffix (0, none, 2, 3, 4) is the count of auxiliary arguments
// forwarded verbatim to every GENTPROT instance.

// -- (no auxiliary arguments) --
#define INSERT_GENTPROT_BASIC0( tfuncname ) \
    GENTPROT( float, s, tfuncname ) \
    GENTPROT( double, d, tfuncname ) \
    GENTPROT( scomplex, c, tfuncname ) \
    GENTPROT( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTPROT_BASIC( tfuncname, varname ) \
    GENTPROT( float, s, tfuncname, varname ) \
    GENTPROT( double, d, tfuncname, varname ) \
    GENTPROT( scomplex, c, tfuncname, varname ) \
    GENTPROT( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \
    GENTPROT( float, s, tfuncname, varname1, varname2 ) \
    GENTPROT( double, d, tfuncname, varname1, varname2 ) \
    GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \
    GENTPROT( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --
#define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \
    GENTPROT( float, s, tfuncname, varname1, varname2, varname3 ) \
    GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \
    GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
    GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --
#define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
    GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand with real projection --
// Each instance passes the operand type together with its real counterpart
// (float/float, double/double, scomplex/float, dcomplex/double).

// -- (no auxiliary arguments) --
#define INSERT_GENTPROTR_BASIC0( tfuncname ) \
    GENTPROTR( float, float, s, s, tfuncname ) \
    GENTPROTR( double, double, d, d, tfuncname ) \
    GENTPROTR( scomplex, float, c, s, tfuncname ) \
    GENTPROTR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \
    GENTPROTR( float, float, s, s, tfuncname, varname ) \
    GENTPROTR( double, double, d, d, tfuncname, varname ) \
    GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \
    GENTPROTR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \
    GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \
    GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \
    GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
    GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --
#define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
    GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
    GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
    GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3
) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC0( tfuncname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0_I( funcname ) \ \ GENTPROT( float, s, funcname ) \ GENTPROT( double, d, funcname ) \ GENTPROT( scomplex, c, funcname ) \ GENTPROT( dcomplex, z, funcname ) \ GENTPROT( gint_t, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) \ GENTPROT( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTPROTI_BASIC0( funcname ) \ \ GENTPROTI( float, gint_t, s, i, funcname ) \ GENTPROTI( double, gint_t, d, i, funcname ) \ GENTPROTI( scomplex, gint_t, c, i, funcname ) \ GENTPROTI( dcomplex, gint_t, z, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \ \ GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \ GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \ GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTRI_BASIC( funcname ) \ \ GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \ GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \ GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \ GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_BASIC0( funcname ) \ \ GENTPROT2( float, float, s, s, funcname ) \ GENTPROT2( double, double, d, d, funcname ) \ GENTPROT2( scomplex, scomplex, c, c, funcname ) \ GENTPROT2( dcomplex, dcomplex, z, z, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \ \ GENTPROT2( float, float, s, s, tfuncname, varname ) \ GENTPROT2( double, double, d, d, tfuncname, varname ) \ GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_D0( funcname ) \ \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( scomplex, float, c, s, funcname ) \ \ GENTPROT2( double, dcomplex, d, z, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_D( 
tfuncname, varname ) \
\
    GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
    GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
\
    GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
    GENTPROT2( dcomplex, double, z, d, tfuncname, varname )


// -- Mixed precision two-operand macro --
// The eight type pairs whose precisions differ (single paired with double),
// grouped by the first operand's type.
// NOTE: the last entry of each table must NOT end in a backslash; a dangling
// continuation splices whatever line follows into the macro body.

// -- (no auxiliary arguments) --
#define INSERT_GENTPROT2_MIX_P0( funcname ) \
    GENTPROT2( float, double, s, d, funcname ) \
    GENTPROT2( float, dcomplex, s, z, funcname ) \
\
    GENTPROT2( double, float, d, s, funcname ) \
    GENTPROT2( double, scomplex, d, c, funcname ) \
\
    GENTPROT2( scomplex, double, c, d, funcname ) \
    GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
    GENTPROT2( dcomplex, float, z, s, funcname ) \
    GENTPROT2( dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --
#define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \
    GENTPROT2( float, double, s, d, tfuncname, varname ) \
    GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
    GENTPROT2( double, float, d, s, tfuncname, varname ) \
    GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
\
    GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
    GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
    GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
    GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro --
// All twelve ordered pairs of two *different* types, grouped by the first
// operand's type.
// NOTE(review): the no-argument variant is spelled MIXDP0 while the
// one-argument variant is spelled MIX_DP; names are kept as-is for callers.

// -- (no auxiliary arguments) --
#define INSERT_GENTPROT2_MIXDP0( funcname ) \
    GENTPROT2( float, double, s, d, funcname ) \
    GENTPROT2( float, scomplex, s, c, funcname ) \
    GENTPROT2( float, dcomplex, s, z, funcname ) \
\
    GENTPROT2( double, float, d, s, funcname ) \
    GENTPROT2( double, scomplex, d, c, funcname ) \
    GENTPROT2( double, dcomplex, d, z, funcname ) \
\
    GENTPROT2( scomplex, float, c, s, funcname ) \
    GENTPROT2( scomplex, double, c, d, funcname ) \
    GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
    GENTPROT2( dcomplex, float, z, s, funcname ) \
    GENTPROT2( dcomplex, double, z, d, funcname ) \
    GENTPROT2( dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --
#define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \
    GENTPROT2( float, double, s, d, tfuncname, varname ) \
    GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
    GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
    GENTPROT2( double, float, d, s, tfuncname, varname ) \
    GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
    GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
\
    GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
    GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
    GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
    GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
    GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \
    GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Basic two-operand with real projection of first operand --
// The third type argument is the real counterpart of the first operand type.

// -- (no auxiliary arguments) --
#define INSERT_GENTPROT2R_BASIC0( funcname ) \
    GENTPROT2R( float, float, float, s, s, s, funcname ) \
    GENTPROT2R( double, double, double, d, d, d, funcname ) \
    GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \
    GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname )

// -- (one auxiliary argument) --
#define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \
    GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \
    GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \
    GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
    GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )


// -- Mixed domain two-operand with real projection of first operand --

// -- (no auxiliary arguments) --
#define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \
    GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \
    GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \
\
    GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \
    GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname )

// -- (one auxiliary argument)
// --
#define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \
    GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
    GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
    GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
    GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname )


// -- Mixed precision two-operand with real projection of first operand --
// Eight mixed-precision pairs; the third type argument is the real
// counterpart of the first operand type.

// -- (no auxiliary arguments) --
#define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \
    GENTPROT2R( float, double, float, s, d, s, tfuncname ) \
    GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \
\
    GENTPROT2R( double, float, double, d, s, d, tfuncname ) \
    GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \
\
    GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \
    GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \
\
    GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \
    GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \
    GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \
    GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \
\
    GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \
    GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \
\
    GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \
    GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
\
    GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \
    GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname )


// -- Macros for functions with three primary operands -------------------------

// -- Basic three-operand macro --
// All three operands share the same type (4 instances).
#define INSERT_GENTPROT3_BASIC( funcname ) \
    GENTPROT3( float, float, float, s, s, s, funcname ) \
    GENTPROT3( double, double, double, d, d, d, funcname ) \
    GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \
    GENTPROT3( dcomplex, dcomplex, dcomplex, z, z, z, funcname )

// -- Mixed domain three-operand macro --
// Same precision throughout, real and complex domains mixed (12 instances).
#define INSERT_GENTPROT3_MIX_D( funcname ) \
    GENTPROT3( float, float, scomplex, s, s, c, funcname ) \
    GENTPROT3( float, scomplex, float, s, c, s, funcname ) \
    GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \
\
    GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \
    GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \
    GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \
\
    GENTPROT3( scomplex, float, float, c, s, s, funcname ) \
    GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \
    GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \
\
    GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \
    GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \
    GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname )

// -- Mixed precision three-operand macro --
// The 48 type triples in which single and double precision are mixed,
// grouped by first operand type, then by second operand type.
// NOTE: the final entry deliberately has no trailing backslash, so the
// definition ends there and cannot absorb the line that follows it.
#define INSERT_GENTPROT3_MIX_P( funcname ) \
    GENTPROT3( float, float, double, s, s, d, funcname ) \
    GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \
\
    GENTPROT3( float, double, float, s, d, s, funcname ) \
    GENTPROT3( float, double, double, s, d, d, funcname ) \
    GENTPROT3( float, double, scomplex, s, d, c, funcname ) \
    GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \
\
    GENTPROT3( float, scomplex, double, s, c, d, funcname ) \
    GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \
\
    GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \
    GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \
    GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \
    GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \
\
\
    GENTPROT3( double, float, float, d, s, s, funcname ) \
    GENTPROT3( double, float, double, d, s, d, funcname ) \
    GENTPROT3( double, float, scomplex, d, s, c, funcname ) \
    GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \
\
    GENTPROT3( double, double, float, d, d, s, funcname ) \
    GENTPROT3( double, double, scomplex, d, d, c, funcname ) \
\
    GENTPROT3( double, scomplex, float, d, c, s, funcname ) \
    GENTPROT3( double, scomplex, double, d, c, d, funcname ) \
    GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \
    GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \
\
    GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \
    GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \
\
\
    GENTPROT3( scomplex, float, double, c, s, d, funcname ) \
    GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \
\
    GENTPROT3( scomplex, double, float, c, d, s, funcname ) \
    GENTPROT3( scomplex, double, double, c, d, d, funcname ) \
    GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \
    GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \
\
    GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \
    GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \
\
    GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \
    GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \
    GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \
    GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \
\
\
    GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \
    GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \
    GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \
    GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \
\
    GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \
    GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \
\
    GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \
    GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \
    GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \
    GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \
\
    GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \
    GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname )


// -- Basic three-operand with union of operands 1 and 2 --
#define INSERT_GENTPROT3U12_BASIC( funcname ) \
    GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \ GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname ) // -- Mixed domain three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_D( funcname ) \ \ GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \ GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \ GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \ \ GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \ GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \ GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \ \ GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \ GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \ \ GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \ GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname ) // -- Mixed precision three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_P( funcname ) \ \ GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \ GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \ \ GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \ GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \ GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \ GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \ \ GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \ GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \ \ GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, 
z, funcname ) \ GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \ GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \ GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \ \ \ GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \ GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \ GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \ GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \ \ GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \ GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \ \ GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \ GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \ GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \ GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \ \ GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \ GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \ \ \ GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \ GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \ \ GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \ GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \ GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \ GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \ \ GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \ \ GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \ GENTPROT3U12( 
scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \ \ \ GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \ GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \ GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \ GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \ GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \ \ GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname ) #endif // end bli_gentprot_macro_defs.h // begin bli_misc_macro_defs.h #ifndef BLIS_MISC_MACRO_DEFS_H #define BLIS_MISC_MACRO_DEFS_H // -- Miscellaneous macros -- // min, max, abs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) #define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) #define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) ) // fmin, fmax, fabs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_fmin( a, b ) bli_min( a, b ) #define bli_fmax( a, b ) bli_max( a, b ) #define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) ) // fminabs, fmaxabs // NOTE: These must remain macros since we don't know the types of a and b. 
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif // end bli_edge_case_macro_defs.h // begin bli_param_macro_defs.h #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ); } // pruning-related BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the left side of the matrix, // ignore the area above that intersection. if ( *diagoff < 0 ) { *m = *m + *diagoff; *offm_inc = - *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the bottom side of the matrix, // ignore the area to the right of that intersection. if ( *n > *diagoff + *m ) { *n = *diagoff + *m; } } BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the top side of the matrix, // ignore the area to the left of that intersection. if ( *diagoff > 0 ) { *n = *n - *diagoff; *offn_inc = + *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the right side of the matrix, // ignore the area below that intersection. 
if ( *m > -(*diagoff) + *n ) { *m = -(*diagoff) + *n; } } // thread range-related BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { *diagoff = *n - *diagoff - *m; bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { bli_swap_dims( m, n ); bli_negate_diag_offset( diagoff ); bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end ) { dim_t start2 = n - *start; dim_t end2 = n - *end; *start = end2; *end = start2; } // mdim_t-related BLIS_INLINE bool bli_is_m_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_M ); } BLIS_INLINE bool bli_is_n_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_N ); } BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim ) { return ( mdim_t ) ( mdim == BLIS_M ? BLIS_N : BLIS_M ); } BLIS_INLINE void bli_toggle_dim( mdim_t* mdim ) { *mdim = bli_dim_toggled( *mdim ); } // stor3_t-related BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b ) { // If any matrix is general-stored, return the stor3_t id for the // general-purpose sup microkernel. if ( bli_is_gen_stored( rs_c, cs_c ) || bli_is_gen_stored( rs_a, cs_a ) || bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX; // Otherwise, compute and return the stor3_t id as follows. 
const bool c_is_col = bli_is_col_stored( rs_c, cs_c ); const bool a_is_col = bli_is_col_stored( rs_a, cs_a ); const bool b_is_col = bli_is_col_stored( rs_b, cs_b ); return ( stor3_t )( 4 * c_is_col + 2 * a_is_col + 1 * b_is_col ); } BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id ) { #if 1 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )7, // BLIS_RRR = 0 -> BLIS_CCC = 7 ( stor3_t )5, // BLIS_RRC = 1 -> BLIS_CRC = 5 ( stor3_t )6, // BLIS_RCR = 2 -> BLIS_CCR = 6 ( stor3_t )4, // BLIS_RCC = 3 -> BLIS_CRR = 4 ( stor3_t )3, // BLIS_CRR = 4 -> BLIS_RCC = 3 ( stor3_t )1, // BLIS_CRC = 5 -> BLIS_RRC = 1 ( stor3_t )2, // BLIS_CCR = 6 -> BLIS_RCR = 2 ( stor3_t )0, // BLIS_CCC = 7 -> BLIS_RRR = 0 }; return map[id]; #else return ( ( id & 0x4 ) ^ 0x4 ) | // flip c bit ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 ); // flip a bit and move to b position #endif } BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )1, // BLIS_RRR = 0 -> BLIS_RRC = 1 ( stor3_t )0, // BLIS_RRC = 1 -> BLIS_RRR = 0 ( stor3_t )3, // BLIS_RCR = 2 -> BLIS_RCC = 3 ( stor3_t )2, // BLIS_RCC = 3 -> BLIS_RCR = 2 ( stor3_t )5, // BLIS_CRR = 4 -> BLIS_CRC = 5 ( stor3_t )4, // BLIS_CRC = 5 -> BLIS_CRR = 4 ( stor3_t )7, // BLIS_CCR = 6 -> BLIS_CCC = 7 ( stor3_t )6, // BLIS_CCC = 7 -> BLIS_CCR = 6 }; return map[id]; #else return ( stor3_t )( id ^ 0x1 ); #endif } BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )2, // BLIS_RRR = 0 -> BLIS_RCR = 2 ( stor3_t )3, // BLIS_RRC = 1 -> BLIS_RCC = 3 ( stor3_t )0, // BLIS_RCR = 2 -> BLIS_RRR = 0 ( stor3_t )1, // BLIS_RCC = 3 -> BLIS_RRC = 1 ( stor3_t )6, // BLIS_CRR = 4 -> BLIS_CCR = 6 ( stor3_t )7, // BLIS_CRC = 5 -> BLIS_CCC = 7 ( stor3_t )4, // BLIS_CCR = 6 -> BLIS_CRR = 4 ( stor3_t )5, // BLIS_CCC = 7 -> BLIS_CRC = 5 }; return map[id]; #else return ( stor3_t )( id ^ 0x2 ); #endif } // 
index-related BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == n_iter - 1 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != n_iter - 1 || n_left == 0 ); } BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == 0 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != 0 || n_left == 0 ); } BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 ); } BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) ); } BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { #ifdef BLIS_ENABLE_JRIR_SLAB return bli_is_last_iter_sl( i, end_iter, tid, nth ); #else // BLIS_ENABLE_JRIR_RR return bli_is_last_iter_rr( i, end_iter, tid, nth ); #endif } // packbuf_t-related BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type ) { return ( guint_t ) ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT ); } // pack_t-related BLIS_INLINE bool bli_is_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_is_row_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_is_col_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_is_panel_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE bool bli_is_1r_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R ); } BLIS_INLINE bool bli_is_1e_packed( pack_t schema ) { return ( bool ) ( ( schema & 
BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E ); } BLIS_INLINE bool bli_is_1m_packed( pack_t schema ) { return ( bool ) ( bli_is_1r_packed( schema ) || bli_is_1e_packed( schema ) ); } BLIS_INLINE bool bli_is_nat_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 ); } BLIS_INLINE bool bli_is_ind_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 ); } BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema ) { return ( guint_t ) ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT ); } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument. BLIS_INLINE void bli_set_dims_incs_uplo_1m ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument (without column-wise stride optimization). BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. 
if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions and increments for TWO matrix arguments. 
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); } BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); } BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); } // NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, // packbuf_t is stored/used from the context in order to support various // induced methods. (Though ideally the packbuf_t field would only be // present in the control tree). BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); } BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc ); } BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj ) { bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); } BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj ) { bli_obj_apply_conj( BLIS_CONJUGATE, obj ); } BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; } // Root matrix query BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) { return ( obj_t* )( obj->root ); } BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( 
bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); } // Root matrix modification BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) { obj->root = obj; } // Diagonal offset query BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) ); } // Diagonal offset modification BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off = ( doff_t )offset; } BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj ) { obj->diag_off = -(obj->diag_off); } BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off += ( doff_t )offset; } // Dimension query BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) { return ( obj->dim[ mdim ] ); } BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) : bli_obj_length( obj ) ); } BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) ); } BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 ); } // Stride/increment query BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) { return ( obj->rs ); } BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) { return ( obj->cs ); } BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) { return ( obj->is ); } BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); } // Note: The purpose of these functions is to obtain the length and width // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) ); } BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 ); } // Dimension modification BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj ) { obj->dim[ BLIS_M ] = m; } BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj ) { obj->dim[ BLIS_N ] = n; } BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) { obj->dim[ mdim ] = dim_val; } BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) { if ( bli_does_notrans( trans ) ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } else // if ( bli_does_trans( trans ) ) { bli_obj_set_length( n, obj ); bli_obj_set_width( m, obj ); } } // Stride/increment predicates // // NOTE: The following two macros differ from their non-obj counterparts // in that they do not identify m x 1 and 1 x n objects as row-stored and // column-stored, respectively, which is needed when considering packed // objects. But this is okay, since none of the invocations of these // "obj" macros are used on packed matrices. 
// BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); } // Stride/increment modification BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) { obj->rs = rs; } BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) { obj->cs = cs; } BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_row_stride( rs, obj ); bli_obj_set_col_stride( cs, obj ); } BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) { obj->is = is; } // Offset query BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) { return ( obj->off[ mdim ] ); } // Offset modification BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] = offset; } BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_set_off( BLIS_M, offm, obj ); bli_obj_set_off( BLIS_N, offn, obj ); } BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] += offset; } BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_inc_off( BLIS_M, offm, obj ); bli_obj_inc_off( BLIS_N, offn, obj ); } // Diagonal offset predicates BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) { return ( 
bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); } // Buffer address query BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) { return ( void* ) ( obj->buffer ); } // Buffer address modification BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) { obj->buffer = p; } // Bufferless scalar field query BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); } // Bufferless scalar field modification BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); } // Element size modification BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) { obj->elem_size = size; } // Packed matrix info query BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) { return ( obj->m_padded ); } BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) { return ( obj->n_padded ); } // Packed matrix info modification BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj ) { obj->m_padded = m; } BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj ) { obj->n_padded = n; } BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) { 
bli_obj_set_padded_length( m, obj ); bli_obj_set_padded_width( n, obj ); } // Packed panel info query BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) { return ( obj->m_panel ); } BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) { return ( obj->n_panel ); } BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) { return ( obj->pd ); } BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) { return ( obj->ps ); } // Packed panel info modification BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj ) { obj->m_panel = m; } BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj ) { obj->n_panel = n; } BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_panel_length( m, obj ); bli_obj_set_panel_width( n, obj ); } BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) { obj->pd = pd; } BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) { obj->ps = ps; } // stor3_t-related BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); inc_t rs_a, cs_a; inc_t rs_b, cs_b; if ( bli_obj_has_notrans( a ) ) { rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else { rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else { rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b ); } // -- User-provided information macros -- // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { return obj->pack_fn; } BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker_fn; } BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { return obj->ker_params; } // Function pointer modification 
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
// Both schemas are read before either object is written, so each object ends
// up with the schema the other one had.
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b )
{
	const pack_t schema_a = bli_obj_pack_schema( a );
	const pack_t schema_b = bli_obj_pack_schema( b );

	bli_obj_set_pack_schema( schema_b, a );
	bli_obj_set_pack_schema( schema_a, b );
}

// Induce a transposition on an object: swap dimensions, increments, and
// offsets, negate the diagonal offset, toggle uplo when the object is upper
// or lower, and swap the padded and panel dimensions. The trans bit itself is
// NOT modified here (see the note at the end of the function body).
BLIS_INLINE void bli_obj_induce_trans( obj_t* obj )
{
	// Induce transposition among basic fields.
	dim_t m = bli_obj_length( obj );
	dim_t n = bli_obj_width( obj );
	inc_t rs = bli_obj_row_stride( obj );
	inc_t cs = bli_obj_col_stride( obj );
	dim_t offm = bli_obj_row_off( obj );
	dim_t offn = bli_obj_col_off( obj );
	doff_t diag_off = bli_obj_diag_offset( obj );

	bli_obj_set_dims( n, m, obj );
	bli_obj_set_strides( cs, rs, obj );
	bli_obj_set_offs( offn, offm, obj );
	bli_obj_set_diag_offset( -diag_off, obj );

	if ( bli_obj_is_upper_or_lower( obj ) )
		bli_obj_toggle_uplo( obj );

	// Induce transposition among packed fields.
	dim_t m_padded = bli_obj_padded_length( obj );
	dim_t n_padded = bli_obj_padded_width( obj );
	dim_t m_panel = bli_obj_panel_length( obj );
	dim_t n_panel = bli_obj_panel_width( obj );

	bli_obj_set_padded_dims( n_padded, m_padded, obj );
	bli_obj_set_panel_dims( n_panel, m_panel, obj );

	// Note that this function DOES NOT touch the transposition bit! If
	// the calling code is using this function to handle an object whose
	// transposition bit is set prior to computation, that code needs
	// to manually clear or toggle the bit, via
	// bli_obj_set_onlytrans() or bli_obj_toggle_trans(),
	// respectively.
}

// Reduced form of bli_obj_induce_trans(): only the dimensions, strides and
// offsets are swapped. The diagonal offset, uplo, padded dimensions and panel
// dimensions are left as they are.
BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj )
{
	// NOTE: This function is only used in situations where the matrices
	// are guaranteed to not have structure or be packed.

	// Induce transposition among basic fields.
	dim_t m = bli_obj_length( obj );
	dim_t n = bli_obj_width( obj );
	inc_t rs = bli_obj_row_stride( obj );
	inc_t cs = bli_obj_col_stride( obj );
	dim_t offm = bli_obj_row_off( obj );
	dim_t offn = bli_obj_col_off( obj );

	bli_obj_set_dims( n, m, obj );
	bli_obj_set_strides( cs, rs, obj );
	bli_obj_set_offs( offn, offm, obj );

	// Note that this function DOES NOT touch the transposition bit! If
	// the calling code is using this function to handle an object whose
	// transposition bit is set prior to computation, that code needs
	// to manually clear or toggle the bit, via
	// bli_obj_set_onlytrans() or bli_obj_toggle_trans(),
	// respectively.
}

// Sometimes we need to "reflect" a partition because the data we want is
// actually stored on the other side of the diagonal. The nuts and bolts of
// this function look a lot like an induced transposition, except that the row
// and column strides are left unchanged (which, of course, drastically
// changes the effect of the function). Unlike the induce_trans functions,
// this one does toggle the trans bit.
BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj )
{
	dim_t m = bli_obj_length( obj );
	dim_t n = bli_obj_width( obj );
	dim_t offm = bli_obj_row_off( obj );
	dim_t offn = bli_obj_col_off( obj );
	doff_t diag_off = bli_obj_diag_offset( obj );

	bli_obj_set_dims( n, m, obj );
	bli_obj_set_offs( offn, offm, obj );
	bli_obj_set_diag_offset( -diag_off, obj );

	bli_obj_toggle_trans( obj );
}

#endif
// end bli_obj_macro_defs.h
// begin bli_complex_macro_defs.h

#ifndef BLIS_COMPLEX_MACRO_DEFS_H
#define BLIS_COMPLEX_MACRO_DEFS_H

// -- Real and imaginary accessor macros --

// For the real types (s, d) the real part is the value itself and the
// imaginary part is a literal zero of matching precision.
#define bli_sreal( x ) ( x )
#define bli_simag( x ) ( 0.0F )
#define bli_dreal( x ) ( x )
#define bli_dimag( x ) ( 0.0 )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct-based complex types: the parts are the .real and .imag members, so
// these expansions can also be used as assignment targets.
#define bli_creal( x ) ( (x).real )
#define bli_cimag( x ) ( (x).imag )
#define bli_zreal( x ) ( (x).real )
#define bli_zimag( x ) ( (x).imag )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Native C99 complex types: the parts are obtained through function calls,
// so these expansions are values only.
#define bli_creal( x ) ( crealf(x) )
#define bli_cimag( x ) ( cimagf(x) )
#define bli_zreal( x ) ( creal(x) )
#define bli_zimag( x ) ( cimag(x) )

#endif // BLIS_ENABLE_C99_COMPLEX

#endif
// end bli_complex_macro_defs.h
// begin bli_scalar_macro_defs.h

#ifndef BLIS_SCALAR_MACRO_DEFS_H
#define BLIS_SCALAR_MACRO_DEFS_H

// -- Assignment/Accessor macros --

// NOTE: This macro is defined first since some of the other scalar macros
// use it to abstract away the method used to assign complex values (ie:
// whether fields of a struct are set directly or whether native C99
// assignment is used).
// begin bli_sets.h

#ifndef BLIS_SETS_H
#define BLIS_SETS_H

// sets
// Assign the separated components (xr, xi) to the scalar y.

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

// Real y (s, d): only xr is assigned; xi is ignored.
#define bli_sssets( xr, xi, y ) { (y) = (xr); }
#define bli_dssets( xr, xi, y ) { (y) = (xr); }
#define bli_cssets( xr, xi, y ) { (y) = (xr); }
#define bli_zssets( xr, xi, y ) { (y) = (xr); }
#define bli_issets( xr, xi, y ) { (y) = (xr); }

#define bli_sdsets( xr, xi, y ) { (y) = (xr); }
#define bli_ddsets( xr, xi, y ) { (y) = (xr); }
#define bli_cdsets( xr, xi, y ) { (y) = (xr); }
#define bli_zdsets( xr, xi, y ) { (y) = (xr); }
#define bli_idsets( xr, xi, y ) { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex y (c, z), struct representation: assign each member separately.
#define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); }
#define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); }
#define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); }
#define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); }
#define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); }

#define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); }
#define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); }
#define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); }
#define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); }
#define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex y (c, z), native C99 representation: build the value with I.
// NOTE(review): unlike the struct branch above, this branch defines no
// bli_icsets/bli_izsets -- confirm whether that omission is intentional.
#define bli_scsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); }
#define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); }
#define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); }
#define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); }

#define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); }
#define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); }
#define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); }
#define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Integer y (i): only the real part of xr is assigned.
#define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); }
#define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); }
#define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); }
#define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); }
#define bli_iisets( xr, xi, y ) { (y) = (xr); }

// Single-character shorthands, keyed on the type of y.
#define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y )
#define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y )
#define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y )
#define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y )
#define bli_isets( xr, xi, y ) bli_disets( xr, xi, y )

#endif
// end bli_sets.h

// NOTE: These macros are not used by other scalar macros, but they are
// related to those defined in bli_sets.h, and so we #include them here.
// begin bli_setrs.h

#ifndef BLIS_SETRS_H
#define BLIS_SETRS_H

// setrs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Set only the real part of y to xr.

// Real y (s, d): plain assignment.
#define bli_sssetrs( xr, y ) { (y) = (xr); }
#define bli_dssetrs( xr, y ) { (y) = (xr); }

#define bli_sdsetrs( xr, y ) { (y) = (xr); }
#define bli_ddsetrs( xr, y ) { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex y, struct representation: write the real member directly.
#define bli_scsetrs( xr, y ) { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y ) { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y ) { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y ) { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex y, native C99 representation: rebuild y from xr and the current
// imaginary part of y.
#define bli_scsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands, keyed on the type of y.
#define bli_ssetrs( xr, y ) bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y ) bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y ) bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y ) bli_dzsetrs( xr, y )

#endif
// end bli_setrs.h
// begin bli_setis.h

#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Set only the imaginary part of y to xi.

// Real y (s, d): there is no imaginary part, so these expand to an empty
// statement and neither argument is evaluated.
#define bli_sssetis( xi, y ) { ; }
#define bli_dssetis( xi, y ) { ; }

#define bli_sdsetis( xi, y ) { ; }
#define bli_ddsetis( xi, y ) { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex y, struct representation: write the imaginary member directly.
#define bli_scsetis( xi, y ) { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y ) { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y ) { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y ) { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex y, native C99 representation: rebuild y from its current real part
// and xi.
#define bli_scsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands, keyed on the type of y.
#define bli_ssetis( xi, y ) bli_sssetis( xi, y )
#define bli_dsetis( xi, y ) bli_ddsetis( xi, y )
#define bli_csetis( xi, y ) bli_scsetis( xi, y )
#define bli_zsetis( xi, y ) bli_dzsetis( xi, y )

#endif
// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h

#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Split the scalar x into separated components: yr receives the real part
// and yi the imaginary part. The part is selected by the accessor matching
// the type of x; integer x is cast to the precision of y and paired with a
// zero imaginary part.

#define bli_ssgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; }

#define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; }

#define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; }

#define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; }

// Integer y: the imaginary output is always set to 0.
#define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; }
#define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; }
#define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; }
#define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; }
#define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; }

// Single-character shorthands, keyed on the type of x.
#define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi )
#define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi )
#define bli_cgets( x, yr, yi ) bli_csgets( x, yr, yi )
#define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi )
#define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi )

#endif
// end bli_gets.h

// -- Scalar constant initialization macros --

// begin bli_constants.h

#ifndef BLIS_CONSTANTS_H
#define BLIS_CONSTANTS_H

// return pointers to constants
// Each macro calls bli_obj_buffer_for_const() on one of the global constant
// objects (BLIS_ONE, BLIS_ZERO, BLIS_MINUS_ONE) and casts the result to a
// pointer of the named datatype.

// 1

#define bli_s1 \
\
( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ONE ) )

#define bli_d1 \
\
( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ONE ) )

#define bli_c1 \
\
( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) )

#define bli_z1 \
\
( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) )

#define bli_i1 \
\
( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ONE ) )

// 0

#define bli_s0 \
\
( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ZERO ) )

#define bli_d0 \
\
( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ZERO ) )

#define bli_c0 \
\
( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) )

#define bli_z0 \
\
( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) )

#define bli_i0 \
\
( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ZERO ) )

// -1

#define bli_sm1 \
\
( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_MINUS_ONE ) )

#define bli_dm1 \
\
( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_MINUS_ONE ) )

#define bli_cm1 \
\
( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) )

#define bli_zm1 \
\
( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) )

#define bli_im1 \
\
( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_MINUS_ONE ) )

#endif
// end bli_constants.h

// -- Separated scalar macros (separated real/imaginary values) --

// begin bli_absq2ris.h

#ifndef BLIS_ABSQ2RIS_H
#define BLIS_ABSQ2RIS_H

// absq2ris
// b := |a|^2 on separated components. The real variants write only br; the
// complex variants also set bi to zero.

#define bli_sabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar); \
}

#define bli_dabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar); \
}

#define bli_cabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar) + (ai) * (ai); \
	(bi) = 0.0F; \
}

#define bli_zabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar) + (ai) * (ai); \
	(bi) = 0.0; \
}

#endif
// end bli_absq2ris.h
// begin bli_abval2ris.h

#ifndef BLIS_ABVAL2RIS_H
#define BLIS_ABVAL2RIS_H

// abval2ris
// a := |x| on separated components. The real variants write only ar. The
// complex variants divide by s = bli_fmaxabs( xr, xi ) before squaring and
// multiply the result by sqrt( s ); s == 0 is handled separately so that no
// division by zero occurs. They also set ai to zero.
// NOTE: the complex variants declare locals named s and mag inside the
// expansion, so arguments spelled s or mag would be shadowed.

#define bli_sabval2ris( xr, xi, ar, ai ) \
{ \
	(ar) = fabsf(xr); \
}

#define bli_dabval2ris( xr, xi, ar, ai ) \
{ \
	(ar) = fabs(xr); \
}

#define bli_cabval2ris( xr, xi, ar, ai ) \
{ \
	float s = bli_fmaxabs( (xr), (xi) ); \
	float mag; \
	if ( s == 0.0F ) mag = 0.0F; \
	else \
	{ \
		mag = sqrtf( s ) * \
		      sqrtf( ( (xr) / s ) * (xr) + \
		             ( (xi) / s ) * (xi) ); \
	} \
	(ar) = mag; \
	(ai) = 0.0F; \
}

#define bli_zabval2ris( xr, xi, ar, ai ) \
{ \
	double s = bli_fmaxabs( (xr), (xi) ); \
	double mag; \
	if ( s == 0.0 ) mag = 0.0; \
	else \
	{ \
		mag = sqrt( s ) * \
		      sqrt( ( (xr) / s ) * (xr) + \
		            ( (xi) / s ) * (xi) ); \
	} \
	(ar) = mag; \
	(ai) = 0.0; \
}

#endif
// end bli_abval2ris.h
// begin bli_addris.h

#ifndef BLIS_ADDRIS_H
#define BLIS_ADDRIS_H

// addris
// x := x + a on separated components; the real variants touch only xr.

#define bli_saddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
}

#define bli_daddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
}

#define bli_caddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
	(xi) = (xi) + (ai); \
}

#define bli_zaddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
	(xi) = (xi) + (ai); \
}

#endif
// end bli_addris.h
// begin bli_addjris.h

#ifndef BLIS_ADDJRIS_H
#define BLIS_ADDJRIS_H

// addjris
// x := x + conj(a): forwards to addris with the imaginary part negated.

#define bli_saddjris( ar, ai, xr, xi ) bli_saddris( (ar), -(ai), (xr), (xi) )
#define bli_daddjris( ar, ai, xr, xi ) bli_daddris( (ar), -(ai), (xr), (xi) )
#define bli_caddjris( ar, ai, xr, xi ) bli_caddris( (ar), -(ai), (xr), (xi) )
#define bli_zaddjris( ar, ai, xr, xi ) bli_zaddris( (ar), -(ai), (xr), (xi) )

#endif
// end bli_addjris.h
// begin bli_add3ris.h

#ifndef BLIS_ADD3RIS_H
#define BLIS_ADD3RIS_H

// add3ris
// c := a + b on separated components; the real variants touch only cr.

#define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
}

#define bli_dadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
}

#define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
	(ci) = (ai) + (bi); \
}

#define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
	(ci) = (ai) + (bi); \
}

#endif
// end bli_add3ris.h
// begin bli_axpbyris.h

#ifndef BLIS_AXPBYRIS_H
#define BLIS_AXPBYRIS_H

// axpbyris
// y := a * x + b * y on separated components.

// Real kernel: only the real components are read and written.
#define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \
{ \
	(yr) = (ar) * (xr) + (br) * (yr); \
}

// Complex kernel: both results are computed into temporaries before either
// output is overwritten, because yr and yi are also inputs.
#define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \
{ \
	const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \
	const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \
	(yr) = yt_r; \
	(yi) = yt_i; \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of b.
// - The fourth char encodes the type of y.

// -- (axby) = (??ss) ----------------------------------------------------------

// NOTE(review): the names in this series are spelled "...xpbyris" (no 'a')
// and expand to bli_rxxpbyris, whereas the kernels defined above are named
// bli_rxaxpbyris/bli_cxaxpbyris. These look like misspellings -- confirm
// before relying on any of them.
#define bli_ssssxpbyris bli_rxxpbyris
#define bli_dsssxpbyris bli_rxxpbyris
#define bli_csssxpbyris bli_rxxpbyris
#define bli_zsssxpbyris bli_rxxpbyris

#define bli_sdssxpbyris bli_rxxpbyris
#define bli_ddssxpbyris bli_rxxpbyris
#define bli_cdssxpbyris bli_rxxpbyris
#define bli_zdssxpbyris bli_rxxpbyris

#define bli_scssxpbyris bli_rxxpbyris
#define bli_dcssxpbyris bli_rxxpbyris
#define bli_ccssxpbyris bli_rxxpbyris
#define bli_zcssxpbyris bli_rxxpbyris

#define bli_szssxpbyris bli_rxxpbyris
#define bli_dzssxpbyris bli_rxxpbyris
#define bli_czssxpbyris bli_rxxpbyris
#define bli_zzssxpbyris bli_rxxpbyris

// NOTE: This series needs to be finished for all other char values for (by), but
// not until something in BLIS actually needs mixed-datatype axpbyris.
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
// Same-type convenience names for axpbyjris.
#define bli_saxpbyjris bli_ssssaxpbyjris
#define bli_daxpbyjris bli_ddddaxpbyjris
#define bli_caxpbyjris bli_ccccaxpbyjris
#define bli_zaxpbyjris bli_zzzzaxpbyjris

#endif
// end bli_axpbyjris.h
// begin bli_axpyris.h
#ifndef BLIS_AXPYRIS_H
#define BLIS_AXPYRIS_H

// axpyris
//
// y += a * x on operands passed as separate real (r) and imaginary (i) parts.
// The two-letter prefix selects which parts are read and written:
//   rx - reads ar, xr; updates yr only.
//   cx - reads all four input parts; updates yr and yi.
//   ro - reads all four input parts; updates yr only.
//   cr - reads ar, xr, xi; updates yr and yi.
//   rc - reads ar, ai, xr; updates yr and yi.

#define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}

#define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
    (yi) += (ai) * (xr) + (ar) * (xi); \
}

#define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
}

#define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * (xi); \
}

#define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyris bli_rxaxpyris
#define bli_dssaxpyris bli_rxaxpyris
#define bli_cssaxpyris bli_rxaxpyris
#define bli_zssaxpyris bli_rxaxpyris

#define bli_sdsaxpyris bli_rxaxpyris
#define bli_ddsaxpyris bli_rxaxpyris
#define bli_cdsaxpyris bli_rxaxpyris
#define bli_zdsaxpyris bli_rxaxpyris

#define bli_scsaxpyris bli_rxaxpyris
#define bli_dcsaxpyris bli_rxaxpyris
#define bli_ccsaxpyris bli_roaxpyris
#define bli_zcsaxpyris bli_roaxpyris

#define bli_szsaxpyris bli_rxaxpyris
#define bli_dzsaxpyris bli_rxaxpyris
#define bli_czsaxpyris bli_roaxpyris
#define bli_zzsaxpyris bli_roaxpyris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyris bli_rxaxpyris
#define bli_dsdaxpyris bli_rxaxpyris
#define bli_csdaxpyris bli_rxaxpyris
#define bli_zsdaxpyris bli_rxaxpyris

#define bli_sddaxpyris bli_rxaxpyris
#define bli_dddaxpyris bli_rxaxpyris
#define bli_cddaxpyris bli_rxaxpyris
#define bli_zddaxpyris bli_rxaxpyris

#define bli_scdaxpyris bli_rxaxpyris
#define bli_dcdaxpyris bli_rxaxpyris
#define bli_ccdaxpyris bli_roaxpyris
#define bli_zcdaxpyris bli_roaxpyris

#define bli_szdaxpyris bli_rxaxpyris
#define bli_dzdaxpyris bli_rxaxpyris
#define bli_czdaxpyris bli_roaxpyris
#define bli_zzdaxpyris bli_roaxpyris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyris bli_rxaxpyris
#define bli_dscaxpyris bli_rxaxpyris
#define bli_cscaxpyris bli_rcaxpyris
#define bli_zscaxpyris bli_rcaxpyris

#define bli_sdcaxpyris bli_rxaxpyris
#define bli_ddcaxpyris bli_rxaxpyris
#define bli_cdcaxpyris bli_rcaxpyris
#define bli_zdcaxpyris bli_rcaxpyris

#define bli_sccaxpyris bli_craxpyris
#define bli_dccaxpyris bli_craxpyris
#define bli_cccaxpyris bli_cxaxpyris
#define bli_zccaxpyris bli_cxaxpyris

#define bli_szcaxpyris bli_craxpyris
#define bli_dzcaxpyris bli_craxpyris
#define bli_czcaxpyris bli_cxaxpyris
#define bli_zzcaxpyris bli_cxaxpyris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyris bli_rxaxpyris
#define bli_dszaxpyris bli_rxaxpyris
#define bli_cszaxpyris bli_rcaxpyris
#define bli_zszaxpyris bli_rcaxpyris

#define bli_sdzaxpyris bli_rxaxpyris
#define bli_ddzaxpyris bli_rxaxpyris
#define bli_cdzaxpyris bli_rcaxpyris
#define bli_zdzaxpyris bli_rcaxpyris

#define bli_sczaxpyris bli_craxpyris
#define bli_dczaxpyris bli_craxpyris
#define bli_cczaxpyris bli_cxaxpyris
#define bli_zczaxpyris bli_cxaxpyris

#define bli_szzaxpyris bli_craxpyris
#define bli_dzzaxpyris bli_craxpyris
#define bli_czzaxpyris bli_cxaxpyris
#define bli_zzzaxpyris bli_cxaxpyris

// Same-type convenience names.
#define bli_saxpyris bli_sssaxpyris
#define bli_daxpyris bli_dddaxpyris
#define bli_caxpyris bli_cccaxpyris
#define bli_zaxpyris bli_zzzaxpyris

#endif
// end bli_axpyris.h
// begin bli_axpyjris.h
#ifndef BLIS_AXPYJRIS_H
#define BLIS_AXPYJRIS_H

// axpyjris
//
// y += a * conj(x) on operands passed as separate real (r) and imaginary (i)
// parts. The prefixes select the parts read and written exactly as in
// axpyris; only the sign of every xi term differs.

#define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}

#define bli_cxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
    (yi) += (ai) * (xr) - (ar) * (xi); \
}

#define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
}

#define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * -(xi); \
}

#define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyjris bli_rxaxpyjris
#define bli_dssaxpyjris bli_rxaxpyjris
#define bli_cssaxpyjris bli_rxaxpyjris
#define bli_zssaxpyjris bli_rxaxpyjris

#define bli_sdsaxpyjris bli_rxaxpyjris
#define bli_ddsaxpyjris bli_rxaxpyjris
#define bli_cdsaxpyjris bli_rxaxpyjris
#define bli_zdsaxpyjris bli_rxaxpyjris

#define bli_scsaxpyjris bli_rxaxpyjris
#define bli_dcsaxpyjris bli_rxaxpyjris
#define bli_ccsaxpyjris bli_roaxpyjris
#define bli_zcsaxpyjris bli_roaxpyjris

#define bli_szsaxpyjris bli_rxaxpyjris
#define bli_dzsaxpyjris bli_rxaxpyjris
#define bli_czsaxpyjris bli_roaxpyjris
#define bli_zzsaxpyjris bli_roaxpyjris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyjris bli_rxaxpyjris
#define bli_dsdaxpyjris bli_rxaxpyjris
#define bli_csdaxpyjris bli_rxaxpyjris
#define bli_zsdaxpyjris bli_rxaxpyjris

#define bli_sddaxpyjris bli_rxaxpyjris
#define bli_dddaxpyjris bli_rxaxpyjris
#define bli_cddaxpyjris bli_rxaxpyjris
#define bli_zddaxpyjris bli_rxaxpyjris

#define bli_scdaxpyjris bli_rxaxpyjris
#define bli_dcdaxpyjris bli_rxaxpyjris
#define bli_ccdaxpyjris bli_roaxpyjris
#define bli_zcdaxpyjris bli_roaxpyjris

#define bli_szdaxpyjris bli_rxaxpyjris
#define bli_dzdaxpyjris bli_rxaxpyjris
#define bli_czdaxpyjris bli_roaxpyjris
#define bli_zzdaxpyjris bli_roaxpyjris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjris bli_rxaxpyjris
#define bli_dscaxpyjris bli_rxaxpyjris
#define bli_cscaxpyjris bli_rcaxpyjris
#define bli_zscaxpyjris bli_rcaxpyjris

#define bli_sdcaxpyjris bli_rxaxpyjris
#define bli_ddcaxpyjris bli_rxaxpyjris
#define bli_cdcaxpyjris bli_rcaxpyjris
#define bli_zdcaxpyjris bli_rcaxpyjris

#define bli_sccaxpyjris bli_craxpyjris
#define bli_dccaxpyjris bli_craxpyjris
#define bli_cccaxpyjris bli_cxaxpyjris
#define bli_zccaxpyjris bli_cxaxpyjris

#define bli_szcaxpyjris bli_craxpyjris
#define bli_dzcaxpyjris bli_craxpyjris
#define bli_czcaxpyjris bli_cxaxpyjris
#define bli_zzcaxpyjris bli_cxaxpyjris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjris bli_rxaxpyjris
#define bli_dszaxpyjris bli_rxaxpyjris
#define bli_cszaxpyjris bli_rcaxpyjris
#define bli_zszaxpyjris bli_rcaxpyjris

#define bli_sdzaxpyjris bli_rxaxpyjris
#define bli_ddzaxpyjris bli_rxaxpyjris
#define bli_cdzaxpyjris bli_rcaxpyjris
#define bli_zdzaxpyjris bli_rcaxpyjris

#define bli_sczaxpyjris bli_craxpyjris
#define bli_dczaxpyjris bli_craxpyjris
#define bli_cczaxpyjris bli_cxaxpyjris
#define bli_zczaxpyjris bli_cxaxpyjris

#define bli_szzaxpyjris bli_craxpyjris
#define bli_dzzaxpyjris bli_craxpyjris
#define bli_czzaxpyjris bli_cxaxpyjris
#define bli_zzzaxpyjris bli_cxaxpyjris

// Same-type convenience names.
#define bli_saxpyjris bli_sssaxpyjris
#define bli_daxpyjris bli_dddaxpyjris
#define bli_caxpyjris bli_cccaxpyjris
#define bli_zaxpyjris bli_zzzaxpyjris

#endif
// end bli_axpyjris.h
// begin bli_axmyris.h
#ifndef BLIS_AXMYRIS_H
#define BLIS_AXMYRIS_H

// axmyris
//
// y -= a * x on split real/imaginary operands. The s/d forms touch only the
// real parts; the c/z forms perform the full complex product; the sc/dz forms
// read only ar from a and scale both parts of x by it.

#define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}

#define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}

#define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}

#define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}

#define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}

#define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}

#endif
// end bli_axmyris.h
// begin bli_conjris.h
#ifndef BLIS_CONJRIS_H
#define BLIS_CONJRIS_H

// conjris
//
// In-place conjugation: the s/d forms are no-ops, the c/z forms negate xi.

#define bli_sconjris( xr, xi ) \
{ \
    ; \
}

#define bli_dconjris( xr, xi ) \
{ \
    ; \
}

#define bli_cconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}

#define bli_zconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}

#endif
// end bli_conjris.h
// begin bli_copyris.h
#ifndef BLIS_COPYRIS_H
#define BLIS_COPYRIS_H

// copyris
//
// b := a on split real/imaginary operands. The s/d forms assign only br; the
// c/z forms assign br and bi.

#define bli_scopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}

#define bli_dcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}

#define bli_ccopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}

#define bli_zcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}

// Two-letter variants: the first char names the source type, the second the
// destination type. The s/d-source entries substitute a literal zero for ai
// before forwarding to the destination-typed macro above.
#define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi )
#define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi )
#define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )
#define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )

#define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi )
#define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi )
#define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )
#define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )

#define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi )
#define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi )
#define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )
#define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )

#define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi )
#define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi )
#define bli_czcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )
#define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )

#endif
// end bli_copyris.h
// begin bli_copyjris.h
#ifndef BLIS_COPYJRIS_H
#define BLIS_COPYJRIS_H

// copyjris
//
// b := conj(a): forwards to copyris with the imaginary part negated.

#define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) )
#define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) )
#define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) )
#define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) )

// Two-letter variants: first char is the source type, second the destination
// type; s/d-source entries pass a literal zero as the imaginary part.
#define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi )
#define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi )
#define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )
#define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )

#define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi )
#define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi )
#define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )
#define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )

#define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi )
#define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi )
#define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )
#define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )

#define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi )
#define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi )
#define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )
#define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )

#endif
// end bli_copyjris.h
// begin bli_copycjris.h
#ifndef BLIS_COPYCJRIS_H
#define BLIS_COPYCJRIS_H

// copycjris
//
// y := x, optionally conjugated. The s/d/i forms ignore conj and forward to
// the matching copyris macro; the c/z forms negate xi when bli_is_conj( conj )
// is true.
// NOTE(review): bli_icopyris and bli_is_conj are not defined in this part of
// the header; they are assumed to be provided elsewhere -- confirm.

#define bli_scopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_scopyris( (xr), (xi), (yr), (yi) ); \
}

#define bli_dcopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_dcopyris( (xr), (xi), (yr), (yi) ); \
}

#define bli_ccopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                  : (xi) ); \
}

#define bli_zcopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                  : (xi) ); \
}

#define bli_icopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_icopyris( (xr), (xi), (yr), (yi) ); \
}

#endif
// end bli_copycjris.h
// begin bli_eqris.h
#ifndef BLIS_EQRIS_H
#define BLIS_EQRIS_H

// eqris (passed by value)
//
// Expression macros. The s/d/i forms compare only the real parts; the c/z
// forms require both the real and the imaginary parts to compare equal.

#define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) )

// eq1ris

#define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F )
#define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 )
#define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F )
#define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 )
#define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 )

// eq0ris

#define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F )
#define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 )
#define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F )
#define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 )
#define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 )

// eqm1ris

#define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F )
#define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 )
#define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F )
#define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 )
#define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 )

#endif
// end bli_eqris.h
// begin bli_invertris.h
#ifndef BLIS_INVERTRIS_H
#define BLIS_INVERTRIS_H

// invertris
//
// In-place reciprocal. The real form replaces xr with 1 / xr and ignores xi.

#define bli_sinvertris( xr, xi ) \
{ \
    (xr) = 1.0F / (xr); \
}
#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2ris bli_rxscal2ris #define bli_dssscal2ris bli_rxscal2ris #define bli_cssscal2ris bli_rxscal2ris #define bli_zssscal2ris bli_rxscal2ris #define bli_sdsscal2ris bli_rxscal2ris #define bli_ddsscal2ris bli_rxscal2ris #define bli_cdsscal2ris bli_rxscal2ris #define bli_zdsscal2ris bli_rxscal2ris #define bli_scsscal2ris bli_rxscal2ris #define bli_dcsscal2ris bli_rxscal2ris #define bli_ccsscal2ris bli_roscal2ris #define bli_zcsscal2ris bli_roscal2ris #define bli_szsscal2ris bli_rxscal2ris #define bli_dzsscal2ris bli_rxscal2ris #define bli_czsscal2ris bli_roscal2ris #define bli_zzsscal2ris bli_roscal2ris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2ris bli_rxscal2ris #define bli_dsdscal2ris bli_rxscal2ris #define bli_csdscal2ris bli_rxscal2ris #define bli_zsdscal2ris bli_rxscal2ris #define bli_sddscal2ris bli_rxscal2ris #define bli_dddscal2ris bli_rxscal2ris #define bli_cddscal2ris bli_rxscal2ris #define bli_zddscal2ris bli_rxscal2ris #define bli_scdscal2ris bli_rxscal2ris #define bli_dcdscal2ris bli_rxscal2ris #define bli_ccdscal2ris bli_roscal2ris #define bli_zcdscal2ris bli_roscal2ris #define bli_szdscal2ris bli_rxscal2ris #define bli_dzdscal2ris bli_rxscal2ris #define bli_czdscal2ris bli_roscal2ris #define bli_zzdscal2ris bli_roscal2ris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2ris bli_rxscal2ris #define bli_dscscal2ris bli_rxscal2ris #define bli_cscscal2ris bli_rcscal2ris #define bli_zscscal2ris bli_rcscal2ris #define bli_sdcscal2ris bli_rxscal2ris #define bli_ddcscal2ris bli_rxscal2ris #define bli_cdcscal2ris bli_rcscal2ris #define bli_zdcscal2ris bli_rcscal2ris #define bli_sccscal2ris bli_crscal2ris #define bli_dccscal2ris bli_crscal2ris #define bli_cccscal2ris bli_cxscal2ris #define bli_zccscal2ris bli_cxscal2ris #define bli_szcscal2ris bli_crscal2ris 
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Type-dispatch table for scal2jris: each three-letter name (types of a, x, y)
// forwards its arguments unchanged to one of the rx/cx/ro/cr/rc helper macros.
// Unlike the scal2ris table, these aliases are function-like macros.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \
{ \
    /* y := x + b * y with complex b and y. Both results are staged in */ \
    /* temporaries because each one reads the old yr and the old yi.   */ \
    const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \
    const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \
    (yr) = yt_r; \
    (yi) = yt_i; \
}

// y := x + b * y with real b and complex y: only br is used, bi is ignored.
#define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \
{ \
    const __typeof__(yr) yt_r = (xr) + (br) * (yr); \
    const __typeof__(yi) yt_i = (xi) + (br) * (yi); \
    (yr) = yt_r; \
    (yi) = yt_i; \
}

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// - Each three-letter name below is an alias for one of the three kernels
//   (rx: writes yr only, cr: real b, cx: complex b).
// - NOTE(review): for complex y (??c / ??z) with real x and real b the
//   alias is the rx kernel, which leaves yi untouched rather than scaling
//   it by b -- confirm this is the intended mixed-domain convention.

// -- (xby) = (??s) ------------------------------------------------------------

#define bli_sssxpbyris bli_rxxpbyris
#define bli_dssxpbyris bli_rxxpbyris
#define bli_cssxpbyris bli_rxxpbyris
#define bli_zssxpbyris bli_rxxpbyris

#define bli_sdsxpbyris bli_rxxpbyris
#define bli_ddsxpbyris bli_rxxpbyris
#define bli_cdsxpbyris bli_rxxpbyris
#define bli_zdsxpbyris bli_rxxpbyris

#define bli_scsxpbyris bli_rxxpbyris
#define bli_dcsxpbyris bli_rxxpbyris
#define bli_ccsxpbyris bli_rxxpbyris
#define bli_zcsxpbyris bli_rxxpbyris

#define bli_szsxpbyris bli_rxxpbyris
#define bli_dzsxpbyris bli_rxxpbyris
#define bli_czsxpbyris bli_rxxpbyris
#define bli_zzsxpbyris bli_rxxpbyris

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbyris bli_rxxpbyris
#define bli_dsdxpbyris bli_rxxpbyris
#define bli_csdxpbyris bli_rxxpbyris
#define bli_zsdxpbyris bli_rxxpbyris

#define bli_sddxpbyris bli_rxxpbyris
#define bli_dddxpbyris bli_rxxpbyris
#define bli_cddxpbyris bli_rxxpbyris
#define bli_zddxpbyris bli_rxxpbyris

#define bli_scdxpbyris bli_rxxpbyris
#define bli_dcdxpbyris bli_rxxpbyris
#define bli_ccdxpbyris bli_rxxpbyris
#define bli_zcdxpbyris bli_rxxpbyris

#define bli_szdxpbyris bli_rxxpbyris
#define bli_dzdxpbyris bli_rxxpbyris
#define bli_czdxpbyris bli_rxxpbyris
#define bli_zzdxpbyris bli_rxxpbyris

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbyris bli_rxxpbyris
#define bli_dscxpbyris bli_rxxpbyris
#define bli_cscxpbyris bli_crxpbyris
#define bli_zscxpbyris bli_crxpbyris

#define bli_sdcxpbyris bli_rxxpbyris
#define bli_ddcxpbyris bli_rxxpbyris
#define bli_cdcxpbyris bli_crxpbyris
#define bli_zdcxpbyris bli_crxpbyris

#define bli_sccxpbyris bli_cxxpbyris
#define bli_dccxpbyris bli_cxxpbyris
#define bli_cccxpbyris bli_cxxpbyris
#define bli_zccxpbyris bli_cxxpbyris

#define bli_szcxpbyris bli_cxxpbyris
#define bli_dzcxpbyris bli_cxxpbyris
#define bli_czcxpbyris bli_cxxpbyris
#define bli_zzcxpbyris bli_cxxpbyris

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbyris bli_rxxpbyris
#define bli_dszxpbyris bli_rxxpbyris
#define bli_cszxpbyris bli_crxpbyris
#define bli_zszxpbyris bli_crxpbyris

#define bli_sdzxpbyris bli_rxxpbyris
#define bli_ddzxpbyris bli_rxxpbyris
#define bli_cdzxpbyris bli_crxpbyris
#define bli_zdzxpbyris bli_crxpbyris

#define bli_sczxpbyris bli_cxxpbyris
#define bli_dczxpbyris bli_cxxpbyris
#define bli_cczxpbyris bli_cxxpbyris
#define bli_zczxpbyris bli_cxxpbyris

#define bli_szzxpbyris bli_cxxpbyris
#define bli_dzzxpbyris bli_cxxpbyris
#define bli_czzxpbyris bli_cxxpbyris
#define bli_zzzxpbyris bli_cxxpbyris

// Single-letter names select the homogeneous (all operands same type) case.

#define bli_sxpbyris bli_sssxpbyris
#define bli_dxpbyris bli_dddxpbyris
#define bli_cxpbyris bli_cccxpbyris
#define bli_zxpbyris bli_zzzxpbyris

#endif

// end bli_xpbyris.h
// begin bli_xpbyjris.h


#ifndef BLIS_XPBYJRIS_H
#define BLIS_XPBYJRIS_H

// xpbyjris
//
// Same as xpbyris but x enters conjugated: the imaginary part of x is
// negated in the complex kernels. The real kernel never reads xi.

#define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \
{ \
    (yr) = (xr) + (br) * (yr); \
}

#define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \
{ \
    /* Temporaries keep the old yr/yi available to both expressions. */ \
    const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \
    const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \
    (yr) = yt_r; \
    (yi) = yt_i; \
}

#define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \
{ \
    const __typeof__(yr) yt_r = (xr) + (br) * (yr); \
    const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \
    (yr) = yt_r; \
    (yi) = yt_i; \
}

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.

// -- (xby) = (??s) ------------------------------------------------------------

#define bli_sssxpbyjris bli_rxxpbyjris
#define bli_dssxpbyjris bli_rxxpbyjris
#define bli_cssxpbyjris bli_rxxpbyjris
#define bli_zssxpbyjris bli_rxxpbyjris

#define bli_sdsxpbyjris bli_rxxpbyjris
#define bli_ddsxpbyjris bli_rxxpbyjris
#define bli_cdsxpbyjris bli_rxxpbyjris
#define bli_zdsxpbyjris bli_rxxpbyjris

#define bli_scsxpbyjris bli_rxxpbyjris
#define bli_dcsxpbyjris bli_rxxpbyjris
#define bli_ccsxpbyjris bli_rxxpbyjris
#define bli_zcsxpbyjris bli_rxxpbyjris

#define bli_szsxpbyjris bli_rxxpbyjris
#define bli_dzsxpbyjris bli_rxxpbyjris
#define bli_czsxpbyjris bli_rxxpbyjris
#define bli_zzsxpbyjris bli_rxxpbyjris

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbyjris bli_rxxpbyjris
#define bli_dsdxpbyjris bli_rxxpbyjris
#define bli_csdxpbyjris bli_rxxpbyjris
#define bli_zsdxpbyjris bli_rxxpbyjris

#define bli_sddxpbyjris bli_rxxpbyjris
#define bli_dddxpbyjris bli_rxxpbyjris
#define bli_cddxpbyjris bli_rxxpbyjris
#define bli_zddxpbyjris bli_rxxpbyjris

#define bli_scdxpbyjris bli_rxxpbyjris
#define bli_dcdxpbyjris bli_rxxpbyjris
#define bli_ccdxpbyjris bli_rxxpbyjris
#define bli_zcdxpbyjris bli_rxxpbyjris

#define bli_szdxpbyjris bli_rxxpbyjris
#define bli_dzdxpbyjris bli_rxxpbyjris
#define bli_czdxpbyjris bli_rxxpbyjris
#define bli_zzdxpbyjris bli_rxxpbyjris

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbyjris bli_rxxpbyjris
#define bli_dscxpbyjris bli_rxxpbyjris
#define bli_cscxpbyjris bli_crxpbyjris
#define bli_zscxpbyjris bli_crxpbyjris

#define bli_sdcxpbyjris bli_rxxpbyjris
#define bli_ddcxpbyjris bli_rxxpbyjris
#define bli_cdcxpbyjris bli_crxpbyjris
#define bli_zdcxpbyjris bli_crxpbyjris

#define bli_sccxpbyjris bli_cxxpbyjris
#define bli_dccxpbyjris bli_cxxpbyjris
#define bli_cccxpbyjris 
bli_cxxpbyjris #define bli_zccxpbyjris bli_cxxpbyjris #define bli_szcxpbyjris bli_cxxpbyjris #define bli_dzcxpbyjris bli_cxxpbyjris #define bli_czcxpbyjris bli_cxxpbyjris #define bli_zzcxpbyjris bli_cxxpbyjris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjris bli_rxxpbyjris #define bli_dszxpbyjris bli_rxxpbyjris #define bli_cszxpbyjris bli_crxpbyjris #define bli_zszxpbyjris bli_crxpbyjris #define bli_sdzxpbyjris bli_rxxpbyjris #define bli_ddzxpbyjris bli_rxxpbyjris #define bli_cdzxpbyjris bli_crxpbyjris #define bli_zdzxpbyjris bli_crxpbyjris #define bli_sczxpbyjris bli_cxxpbyjris #define bli_dczxpbyjris bli_cxxpbyjris #define bli_cczxpbyjris bli_cxxpbyjris #define bli_zczxpbyjris bli_cxxpbyjris #define bli_szzxpbyjris bli_cxxpbyjris #define bli_dzzxpbyjris bli_cxxpbyjris #define bli_czzxpbyjris bli_cxxpbyjris #define bli_zzzxpbyjris bli_cxxpbyjris #define bli_sxpbyjris bli_sssxpbyjris #define bli_dxpbyjris bli_dddxpbyjris #define bli_cxpbyjris bli_cccxpbyjris #define bli_zxpbyjris bli_zzzxpbyjris #endif // end bli_xpbyjris.h // Inlined scalar macros in loops // begin bli_scal2ris_mxn.h #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j 
)*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif // end bli_scal2ris_mxn.h // begin bli_scalris_mxn_uplo.h #ifndef 
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
// Real-destination variants (a is s or d). The ris kernel takes an
// imaginary output argument as well; here it is given either a zero literal
// (real x) or a scratch variable "ti" that is then discarded (complex x).

#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F )
#define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F )
#define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; }
#define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; }

#define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 )
#define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 )
#define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; }
#define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; }

// Complex-destination variants (a is c or z) have two implementations,
// selected by BLIS_ENABLE_C99_COMPLEX.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex: forward the component accessors of x and a to the
// ris kernel for the type of x.

#define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex: the squared magnitude is computed inline (x*x for real
// x, re*re + im*im for complex x) and passed with a 0.0 second argument to
// the per-type "sets" macro (defined elsewhere).

#define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) )
#define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) )
#define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \
                                         bli_cimag(x) * bli_cimag(x), 0.0, (a) )
#define bli_zcabsq2s( x, a ) bli_zcsets( bli_zreal(x) * bli_zreal(x) + \
                                         bli_zimag(x) * bli_zimag(x), 0.0, (a) )

#define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) )
#define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) )
#define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \
                                         bli_cimag(x) * bli_cimag(x), 0.0, (a) )
#define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \
                                         bli_zimag(x) * bli_zimag(x), 0.0, (a) )

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter names select the homogeneous (x and a same type) case.

#define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a )
#define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a )
#define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a )
#define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a )


#endif
// end bli_absq2s.h
// begin bli_abval2s.h


#ifndef BLIS_ABVAL2S_H
#define BLIS_ABVAL2S_H

// abval2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.
// - Unlike absq2s above, every variant here (including the real-destination
//   ones) sits inside the BLIS_ENABLE_C99_COMPLEX conditional.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Real destinations: a zero literal or a discarded scratch "ti" stands in
// for the imaginary output argument of the ris kernel.

#define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F )
#define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F )
#define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; }
#define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; }

#define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 )
#define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 )
#define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; }
#define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; }

#define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccabval2s( x, a ) 
bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex: the absolute value comes from the <math.h> /
// <complex.h> function matching the type of x (fabsf, fabs, cabsf, cabs)
// and is passed with a 0.0 second argument to the per-type "sets" macro
// (defined elsewhere).

#define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) )
#define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) )
#define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) )
#define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) )

#define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) )
#define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) )
#define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) )
#define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) )

#define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) )
#define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) )
#define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) )
#define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) )

#define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) )
#define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) )
#define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) )
#define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) )

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter names select the homogeneous (x and a same type) case.

#define bli_sabval2s( x, a ) bli_ssabval2s( x, a )
#define bli_dabval2s( x, a ) bli_ddabval2s( x, a )
#define bli_cabval2s( x, a ) bli_ccabval2s( x, a )
#define bli_zabval2s( x, a ) bli_zzabval2s( x, a )


#endif
// end bli_abval2s.h
// begin bli_adds.h


#ifndef BLIS_ADDS_H
#define BLIS_ADDS_H

// adds

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - The ris kernel is chosen by the type of y (the destination); the
//   accessors applied to a follow the type of a.

#define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex the complex-destination cases are a plain compound
// addition on the native types.

#define bli_scadds( a, y ) { (y) += (a); }
#define bli_dcadds( a, y ) { (y) += (a); }
#define bli_ccadds( a, y ) { (y) += (a); }
#define bli_zcadds( a, y ) { (y) += (a); }

#define bli_szadds( a, y ) { (y) += (a); }
#define bli_dzadds( a, y ) { (y) += (a); }
#define bli_czadds( a, y ) { (y) += (a); }
#define bli_zzadds( a, y ) { (y) += (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter names select the homogeneous (a and y same type) case.

#define bli_sadds( a, y ) bli_ssadds( a, y )
#define bli_dadds( a, y ) bli_ddadds( a, y )
#define bli_cadds( a, y ) bli_ccadds( a, y )
#define bli_zadds( a, y ) bli_zzadds( a, y )


#endif

// end bli_adds.h
// begin bli_addjs.h


#ifndef BLIS_ADDJS_H
#define BLIS_ADDJS_H

// addjs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - Same dispatch shape as adds, but forwarding to the addjris kernels.

#define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzaddjs( a, y ) 
bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) { (y) += (a); } #define bli_dcaddjs( a, y ) { (y) += (a); } #define bli_ccaddjs( a, y ) { (y) += conjf(a); } #define bli_zcaddjs( a, y ) { (y) += conj (a); } #define bli_szaddjs( a, y ) { (y) += (a); } #define bli_dzaddjs( a, y ) { (y) += (a); } #define bli_czaddjs( a, y ) { (y) += conjf(a); } #define bli_zzaddjs( a, y ) { (y) += conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saddjs( a, y ) bli_ssaddjs( a, y ) #define bli_daddjs( a, y ) bli_ddaddjs( a, y ) #define bli_caddjs( a, y ) bli_ccaddjs( a, y ) #define bli_zaddjs( a, y ) bli_zzaddjs( a, y ) #endif // end bli_addjs.h // begin bli_add3s.h #ifndef BLIS_ADD3S_H #define BLIS_ADD3S_H // add3s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of b. // - The third char encodes the type of c. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), 
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )

#define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )

#define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )

// Variants whose destination c is accessed through the c/z accessors.
// When BLIS_ENABLE_C99_COMPLEX is not defined, each macro forwards the
// (real, imag) accessor pairs of a, b and c, in that order, to a
// bli_?add3ris kernel macro that is defined outside this section.
// When it is defined, the sum is written directly with the + operator.

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

// Kernel selection: bli_sadd3ris when a and b are both s/d types,
// bli_cadd3ris for every other combination.
// NOTE(review): presumably the s kernel touches only the real part of c;
// confirm against the bli_?add3ris definitions.

#define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )

#define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )

#define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )

#define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )

// -- (axy) = (??z) ------------------------------------------------------------

// Kernel selection mirrors the (??c) group: bli_dadd3ris when a and b are
// both s/d types, bli_zadd3ris otherwise.

#define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )

#define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )

#define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )

#define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// In this branch every variant has the same body: c is assigned a + b.
// Each macro expands to a brace-enclosed block, not an expression, so it
// can only be used where a statement is allowed.
// NOTE(review): a call written with a trailing ';' leaves an empty
// statement after the block, which breaks an unbraced if/else.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zscadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); }

#define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: the single-letter name forwards to the variant in
// which a, b and c all carry that type letter.

#define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c )
#define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c )
#define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c )
#define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c )

#endif
// end bli_add3s.h
// begin bli_axpbys.h

#ifndef BLIS_AXPBYS_H
#define BLIS_AXPBYS_H

// axpbys

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), 
bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), 
bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), 
bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- // y is accessed through bli_zreal/bli_zimag in this group; the first three type chars pick the bli_?real/bli_?imag accessors applied to a, x and b, and all eight parts are handed to bli_cxaxpbyris. #define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- // In this branch every variant expands to the same braced statement, { (y) = (a) * (x) + (b) * (y); }: the operands are used as passed, with no bli_?real/bli_?imag splitting, and y is overwritten in place. // The expansion is a block rather than an expression, so it can only be used where a statement is allowed. #define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccscaxpbys( 
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- // Expansion is identical to the (???c) group above; only the type chars in the macro names differ. #define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX // Same-type shorthands: bli_?axpbys forwards its arguments unchanged to the variant whose four type chars are all that letter. #define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y ) #define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y ) #define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y ) #define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y ) #endif // end bli_axpbys.h // begin bli_axpbyjs.h #ifndef BLIS_AXPBYJS_H #define BLIS_AXPBYJS_H // axpbyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // - Every macro in the (???s) group passes the real and imaginary parts of a, x, b and y, each taken with the bli_?real/bli_?imag accessor of its type char, to bli_rxaxpbyjris. // - NOTE(review): the trailing 'j' presumably marks the conjugated-x counterpart of axpbys -- confirm against the bli_rxaxpbyjris definition. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), 
bli_simag(y) ) #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- // The macros here read y through bli_dreal/bli_dimag and, like the (???s) group, expand to bli_rxaxpbyjris. #define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) #define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) #define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) #define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) #endif // end bli_axpbyjs.h // begin bli_axpys.h #ifndef BLIS_AXPYS_H #define BLIS_AXPYS_H // axpys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), 
bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } #define 
bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) #define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) #define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) #define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) #endif // end bli_axpys.h // begin bli_axpyjs.h #ifndef BLIS_AXPYJS_H #define BLIS_AXPYJS_H // axpyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( 
bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), 
bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) #define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpyjs( a, x, y ) 
bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y ) #define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y ) #define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y ) #define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y ) #endif // end bli_axpyjs.h // begin bli_axmys.h #ifndef BLIS_AXMYS_H #define BLIS_AXMYS_H // axmys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Typed axmys dispatch tables (body of bli_axmys.h).
//
// Every bli_<a><x><y>axmys( a, x, y ) below splits its three operands into
// real/imaginary parts with the bli_?real()/bli_?imag() accessors named by
// the three type characters, and forwards the six parts to a bli_*axmyris()
// macro that is defined outside this section.
// The BLIS_ENABLE_C99_COMPLEX variants further down spell the operation out
// directly as y -= a * x.

// -- (axy) = (??s) ------------------------------------------------------------
// y is of type s: every combination of a and x goes through bli_saxmyris().
#define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------
// y is of type d: every combination of a and x goes through bli_daxmyris().
#define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------
// y is of type c. The kernel is picked from the types of a and x:
//   a in {s,d}, x in {s,d}  ->  bli_saxmyris()
//   a in {s,d}, x in {c,z}  ->  bli_scaxmyris()
//   a in {c,z}, any x       ->  bli_caxmyris()
// NOTE(review): the *ris kernels are not visible here; presumably the
// saxmyris rows leave the imaginary part of y untouched -- confirm.
#define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------
// y is of type z. Same selection rule as the ??c table, with the d/z kernels:
//   a in {s,d}, x in {s,d}  ->  bli_daxmyris()
//   a in {s,d}, x in {c,z}  ->  bli_dzaxmyris()
//   a in {c,z}, any x       ->  bli_zaxmyris()
#define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With BLIS_ENABLE_C99_COMPLEX the complex-y variants are plain C
// expressions: y -= a * x, for every combination of a and x.
// NOTE(review): these expand to a brace block, so "macro(...);" leaves an
// extra empty statement after the block; keep that in mind inside
// unbraced if/else.

// -- (axy) = (??c) ------------------------------------------------------------
#define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: bli_?axmys() is the variant whose three operands all
// share one type.
#define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y )
#define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y )
#define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y )
#define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y )

#endif
// end bli_axmys.h
// begin bli_conjs.h


#ifndef BLIS_CONJS_H
#define BLIS_CONJS_H

// conjs
// Conjugates x. The s and d variants always forward the real/imaginary parts
// of x to bli_sconjris()/bli_dconjris(). The c and z variants do the same with
// bli_cconjris()/bli_zconjris() unless BLIS_ENABLE_C99_COMPLEX is defined, in
// which case x is overwritten with conjf(x) / conj(x).

#define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) )
#define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) )
#define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_cconjs( x ) { (x) = conjf(x); }
#define bli_zconjs( x ) { (x) = conj (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

#endif
// end bli_conjs.h
// begin bli_copys.h


#ifndef BLIS_COPYS_H
#define BLIS_COPYS_H

// copys

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Typed copys table, first part (destination y of type s, d and c).
// Each bli_<x><y>copys( x, y ) forwards the real/imaginary parts of x and y
// to the bli_?copyris() macro named after the type of y (s -> scopyris,
// d -> dcopyris, c -> ccopyris); those kernels are defined outside this
// section.

// y is of type s.
#define bli_sscopys( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopys( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopys( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopys( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is of type d.
#define bli_sdcopys( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopys( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopys( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopys( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero.
// NOTE(review): presumably this refers to the rows where x is real
// (sccopys/dccopys); the cc/zc rows pass x's own imaginary part -- confirm
// against ccopyris().
#define bli_sccopys( x, y ) bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopys( x, y ) bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopys( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopys( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
// copys table, last part: destination y of type z goes through bli_zcopyris().
#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer variant: plain assignment of x, cast to gint_t.
#define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); }

// Same-type shorthands.
#define bli_scopys( x, y ) bli_sscopys( x, y )
#define bli_dcopys( x, y ) bli_ddcopys( x, y )
#define bli_ccopys( x, y ) bli_cccopys( x, y )
#define bli_zcopys( x, y ) bli_zzcopys( x, y )
#define bli_icopys( x, y ) bli_iicopys( x, y )

#endif
// end bli_copys.h
// begin bli_copyjs.h


#ifndef BLIS_COPYJS_H
#define BLIS_COPYJS_H

// copyjs
// Same layout as the copys table, but dispatching to the bli_?copyjris()
// macros. In the C99 branch below, a complex source is copied as
// conjf(x) / conj(x) and a real source is copied unchanged.

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

// y is of type s.
#define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is of type d.
#define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// y is of type c, then z: forwarded to bli_ccopyjris() / bli_zcopyjris().
#define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex y as plain C assignments: conjf() for a c source, conj() for a z
// source, and a straight copy for an s or d source.
#define bli_sccopyjs( x, y ) { (y) = (x); }
#define bli_dccopyjs( x, y ) { (y) = (x); }
#define bli_cccopyjs( x, y ) { (y) = conjf(x); }
#define bli_zccopyjs( x, y ) { (y) = conj (x); }

#define bli_szcopyjs( x, y ) { (y) = (x); }
#define bli_dzcopyjs( x, y ) { (y) = (x); }
#define bli_czcopyjs( x, y ) { (y) = conjf(x); }
#define bli_zzcopyjs( x, y ) { (y) = conj (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Integer variant: plain assignment of x, cast to gint_t.
#define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); }

// Same-type shorthands.
#define bli_scopyjs( x, y ) bli_sscopyjs( x, y )
#define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y )
#define bli_ccopyjs( x, y ) bli_cccopyjs( x, y )
#define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y )
#define bli_icopyjs( x, y ) bli_iicopyjs( x, y )

#endif
// end bli_copyjs.h
// begin bli_copycjs.h


#ifndef BLIS_COPYCJS_H
#define BLIS_COPYCJS_H

// copycjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Typed copycjs table.
// Each bli_<x><y>copycjs( conjx, x, y ) passes conjx through unchanged,
// followed by the real/imaginary parts of x and y, to the bli_?copycjris()
// macro named after the type of y. In the C99 branch the complex-source
// variants test bli_is_conj( conjx ) to choose between conjf(x)/conj(x) and
// x itself.

// y is of type s.
#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is of type d.
#define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// y is of type c, then z: forwarded to bli_ccopycjris() / bli_zcopycjris().
#define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex y as plain C assignments. conjx is ignored when x is of type s or
// d; for a c or z source it selects the conjugated or the plain value.
#define bli_sccopycjs( conjx, x, y ) { (y) = (x); }
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); }
#define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#define bli_szcopycjs( conjx, x, y ) { (y) = (x); }
#define bli_dzcopycjs( conjx, x, y ) { (y) = (x); }
#define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Integer variant: conjx is unused; x is assigned to y, cast to gint_t.
#define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); }

// Same-type shorthands.
#define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y )
#define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y )
#define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y )
#define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y )
#define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y )

#endif
// end bli_copycjs.h
// begin bli_copynzs.h


#ifndef BLIS_COPYNZS_H
#define BLIS_COPYNZS_H

// copynzs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Typed copynzs table, first part (destination y of type s, d and c).
// The entries reuse the bli_?copyris() macros; the difference from a plain
// copy lies in which kernel is chosen when x is real and y is complex (see
// the NOTEs below).

// y is of type s.
#define bli_sscopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopynzs( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopynzs( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is of type d.
#define bli_sdcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopynzs( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopynzs( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of scopyris() is so we don't touch the imaginary part of y.
// (Applies to the sc/dc rows; the cc/zc rows use ccopyris().)
#define bli_sccopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopynzs( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopynzs( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
// copynzs table, last part: destination y of type z. The sz/dz rows use
// bli_dcopyris(); the cz/zz rows use bli_zcopyris().
#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer variant: plain assignment of x, cast to gint_t.
#define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); }

// Same-type shorthands.
#define bli_scopynzs( x, y ) bli_sscopynzs( x, y )
#define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y )
#define bli_ccopynzs( x, y ) bli_cccopynzs( x, y )
#define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y )
#define bli_icopynzs( x, y ) bli_iicopynzs( x, y )

#endif
// end bli_copynzs.h
// begin bli_copyjnzs.h


#ifndef BLIS_COPYJNZS_H
#define BLIS_COPYJNZS_H

// copyjnzs
// Same layout as the copynzs table, but dispatching to the bli_?copyjris()
// macros.

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

// y is of type s.
#define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is of type d.
#define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we
// don't touch the imaginary part of y.
// copyjnzs table, last part: destination y of type c, then z.
// The sc/dc rows use bli_scopyjris() and the cc/zc rows use bli_ccopyjris().
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we
// don't touch the imaginary part of y.
// (Applies to the sz/dz rows; the cz/zz rows use zcopyjris().)
#define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer variant: plain assignment of x, cast to gint_t.
#define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); }

// Same-type shorthands.
#define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y )
#define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y )
#define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y )
#define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y )
#define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y )

#endif
// end bli_copyjnzs.h
// begin bli_dots.h


#ifndef BLIS_DOTS_H
#define BLIS_DOTS_H

// dots

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - The third char encodes the type of rho.
// Typed dots table.
// Each bli_<x><y><rho>dots( x, y, a ) is an alias for the bli_*axpys() macro
// with the same three type characters, and passes its arguments through in
// the same order. The third parameter, named a here, is the operand the
// header's Notes call rho.
// NOTE(review): bli_*axpys() is defined outside this section; presumably it
// accumulates the product of its first two operands into the third --
// confirm against bli_axpys.h.

// rho is of type s.
#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a )
#define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a )
#define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a )
#define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a )

#define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a )
#define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a )
#define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a )
#define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a )

#define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a )
#define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a )
#define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a )
#define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a )

#define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a )
#define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a )
#define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a )
#define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a )

// rho is of type d.
#define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a )
#define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a )
#define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a )
#define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a )

#define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a )
#define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a )
#define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a )
#define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a )

#define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a )
#define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a )
#define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a )
#define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a )

#define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a )
#define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a )
#define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a )
#define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a )

// rho is of type c.
#define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a )
#define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a )
#define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a )
#define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a )

#define bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a )
#define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a )
#define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a )
#define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a )

#define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a )
#define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a )
#define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a )
#define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a )

#define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a )
#define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a )
#define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a )
#define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a )

// rho is of type z.
#define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a )
#define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a )
#define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a )
#define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a )

#define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a )
#define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a )
#define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a )
#define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a )

#define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a )
#define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a )
#define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a )
#define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a )

#define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a )
#define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a )
#define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a )
#define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a )

// Same-type shorthands.
#define bli_sdots( x, y, a ) bli_sssdots( x, y, a )
#define bli_ddots( x, y, a ) bli_ddddots( x, y, a )
#define bli_cdots( x, y, a ) bli_cccdots( x, y, a )
#define bli_zdots( x, y, a ) bli_zzzdots( x, y, a )

#endif
// end bli_dots.h
// begin bli_dotjs.h


#ifndef BLIS_DOTJS_H
#define BLIS_DOTJS_H

// dotjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - The third char encodes the type of rho.
// - x is used in conjugated form. #define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a ) #define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a ) #define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a ) #define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a ) #define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a ) #define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a ) #define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a ) #define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a ) #define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a ) #define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a ) #define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a ) #define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a ) #define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a ) #define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a ) #define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a ) #define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a ) #define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a ) #define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a ) #define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a ) #define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a ) #define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a ) #define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a ) #define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a ) #define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a ) #define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a ) #define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a ) #define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a ) #define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a ) #define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a ) #define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a ) #define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a ) #define bli_zzddotjs( x, y, a ) bli_zzdaxpyjs( y, x, a ) #define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a ) #define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a ) #define bli_cscdotjs( x, 
y, a ) bli_sccaxpyjs( y, x, a ) #define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a ) #define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a ) #define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a ) #define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a ) #define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a ) #define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a ) #define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a ) #define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a ) #define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a ) #define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a ) #define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a ) #define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a ) #define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a ) #define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a ) #define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a ) #define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a ) #define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a ) #define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a ) #define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a ) #define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a ) #define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a ) #define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a ) #define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a ) #define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a ) #define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a ) #define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a ) #define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a ) #define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a ) #define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a ) #define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a ) #define bli_ddotjs( x, y, a ) bli_ddddotjs( x, y, a ) #define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a ) #define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a ) #endif // end bli_dotjs.h // begin bli_eq.h #ifndef BLIS_EQ_H #define BLIS_EQ_H // eq (passed by 
value) #define bli_seq( a, b ) ( (a) == (b) ) #define bli_deq( a, b ) ( (a) == (b) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) ) #define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ceq( a, b ) ( (a) == (b) ) #define bli_zeq( a, b ) ( (a) == (b) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ieq( a, b ) ( (a) == (b) ) // eqtori (passed by value) #define bli_seqtori( a, br, bi ) ( (a) == (br) ) #define bli_deqtori( a, br, bi ) ( (a) == (br) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) ) #define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) ) #define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) ) #endif // BLIS_ENABLE_C99_COMPLEX // eqa (passed by address) #define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) ) #define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) ) #define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) ) #define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) ) #define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) ) // eq1 #define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F ) #define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 ) #define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F ) #define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 ) #define bli_ieq1( a ) bli_ieq ( (a), 1 ) // eq0 #define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F ) #define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 ) #define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F ) #define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 ) #define bli_ieq0( a ) bli_ieq ( (a), 0 ) // eqm1 #define bli_seqm1( a ) bli_seqtori( (a), 
-1.0F, 0.0F ) #define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 ) #define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F ) #define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 ) #define bli_ieqm1( a ) bli_ieq ( (a), -1 ) #endif // end bli_eq.h // begin bli_fprints.h #ifndef BLIS_FPRINTS_H #define BLIS_FPRINTS_H // prints #define bli_sfprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #define bli_dfprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #define bli_cfprints( file, spec, x ) \ { \ fprintf( file, spec, bli_creal(x) ); \ fprintf( file, " + " ); \ fprintf( file, spec, bli_cimag(x) ); \ fprintf( file, " " ); \ } #define bli_zfprints( file, spec, x ) \ { \ fprintf( file, spec, bli_zreal(x) ); \ fprintf( file, " + " ); \ fprintf( file, spec, bli_zimag(x) ); \ fprintf( file, " " ); \ } #define bli_ifprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #endif // end bli_fprints.h // begin bli_inverts.h #ifndef BLIS_INVERTS_H #define BLIS_INVERTS_H // inverts // Notes: // - The first char encodes the type of x. #define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) ) #define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) ) #define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cinverts( x ) { (x) = 1.0F / (x); } #define bli_zinverts( x ) { (x) = 1.0 / (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_inverts.h // begin bli_invscals.h #ifndef BLIS_INVSCALS_H #define BLIS_INVSCALS_H // invscals // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. 
// The helper is selected by the type of y (s -> bli_sinvscalris,
// d -> bli_dinvscalris); a is passed as its real/imaginary components read
// with the accessors for a's own type.

#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex y without C99 complex: a real-typed a (s or d) selects the
// mixed-domain helper (bli_scinvscalris / bli_dzinvscalris); a complex-typed
// a (c or z) selects bli_cinvscalris / bli_zinvscalris.

#define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscals( a, y ) bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex y with C99 complex: divide y by a in place using the native
// operator, regardless of the type of a.

#define bli_scinvscals( a, y ) { (y) /= (a); }
#define bli_dcinvscals( a, y ) { (y) /= (a); }
#define bli_ccinvscals( a, y ) { (y) /= (a); }
#define bli_zcinvscals( a, y ) { (y) /= (a); }

#define bli_szinvscals( a, y ) { (y) /= (a); }
#define bli_dzinvscals( a, y ) { (y) /= (a); }
#define bli_czinvscals( a, y ) { (y) /= (a); }
#define bli_zzinvscals( a, y ) { (y) /= (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a and y share one type.

#define bli_sinvscals( a, y ) bli_ssinvscals( a, y )
#define bli_dinvscals( a, y ) bli_ddinvscals( a, y )
#define bli_cinvscals( a, y ) bli_ccinvscals( a, y )
#define bli_zinvscals( a, y ) bli_zzinvscals( a, y )

#endif
// end bli_invscals.h
// begin bli_invscaljs.h

#ifndef BLIS_INVSCALJS_H
#define BLIS_INVSCALJS_H

// invscaljs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// - Same dispatch as invscals, but through the ?invscaljris helpers. In the
//   C99 complex branch below, a complex-typed a is conjugated (conjf for c,
//   conj for z) before y is divided by it; a real-typed a is used as is.

#define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_scinvscaljs( a, y ) { (y) /= (a); }
#define bli_dcinvscaljs( a, y ) { (y) /= (a); }
#define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zcinvscaljs( a, y ) { (y) /= conj (a); }

#define bli_szinvscaljs( a, y ) { (y) /= (a); }
#define bli_dzinvscaljs( a, y ) { (y) /= (a); }
#define bli_czinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zzinvscaljs( a, y ) { (y) /= conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a and y share one type.

#define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y )
#define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y )
#define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y )
#define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y )

#endif
// end bli_invscaljs.h
// begin bli_neg2s.h

#ifndef BLIS_NEG2S_H
#define BLIS_NEG2S_H

// neg2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) { (y) = -(x); } #define bli_dcneg2s( x, y ) { (y) = -(x); } #define bli_ccneg2s( x, y ) { (y) = -(x); } #define bli_zcneg2s( x, y ) { (y) = -(x); } #define bli_szneg2s( x, y ) { (y) = -(x); } #define bli_dzneg2s( x, y ) { (y) = -(x); } #define bli_czneg2s( x, y ) { (y) = -(x); } #define bli_zzneg2s( x, y ) { (y) = 
-(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sneg2s( x, y ) bli_ssneg2s( x, y ) #define bli_dneg2s( x, y ) bli_ddneg2s( x, y ) #define bli_cneg2s( x, y ) bli_ccneg2s( x, y ) #define bli_zneg2s( x, y ) bli_zzneg2s( x, y ) #endif // end bli_neg2s.h // begin bli_rands.h #ifndef BLIS_RANDS_H #define BLIS_RANDS_H // rands #define bli_srands( a ) \ { \ (a) = ( float ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0F; \ } #define bli_drands( a ) \ { \ (a) = ( double ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0; \ } #define bli_crands( a ) \ { \ float ar, ai; \ \ bli_srands( ar ); \ bli_srands( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrands( a ) \ { \ double ar, ai; \ \ bli_drands( ar ); \ bli_drands( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_rands.h // begin bli_randnp2s.h #ifndef BLIS_RANDNP2S_H #define BLIS_RANDNP2S_H // randnp2s #define bli_srandnp2s( a ) \ { \ bli_drandnp2s( a ); \ } #if 0 #define bli_drandnp2s_prev( a ) \ { \ const double m_max = 3.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ if ( t == m_max2 ) t = t - 1.0; \ \ \ t = floor( t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_exp, s_val; \ \ \ PASTEMAC(d,rands)( s_exp ); \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \ else r_val = pow( 2.0, t - 1.0 ); \ \ \ if ( s_val < 0.0 ) r_val = -r_val; \ } \ \ \ r_val = r_val / pow( 2.0, m_max ); \ \ \ \ a = r_val; \ } #endif #define bli_drandnp2s( a ) \ { \ const double m_max = 6.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ do \ { \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ t = floor( t ); \ } \ \ while ( m_max2 <= t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_val; \ \ \ r_val = pow( 2.0, -(t - 1.0) ); \ \ \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_val < 0.0 ) r_val 
= -r_val; \ } \ \ \ \ a = r_val; \ } #define bli_crandnp2s( a ) \ { \ float ar, ai; \ \ bli_srandnp2s( ar ); \ bli_srandnp2s( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrandnp2s( a ) \ { \ double ar, ai; \ \ bli_drandnp2s( ar ); \ bli_drandnp2s( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_randnp2s.h // begin bli_scals.h #ifndef BLIS_SCALS_H #define BLIS_SCALS_H // scals // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), 
bli_zimag(y) ) #define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) { (y) *= (a); } #define bli_dcscals( a, y ) { (y) *= (a); } #define bli_ccscals( a, y ) { (y) *= (a); } #define bli_zcscals( a, y ) { (y) *= (a); } #define bli_szscals( a, y ) { (y) *= (a); } #define bli_dzscals( a, y ) { (y) *= (a); } #define bli_czscals( a, y ) { (y) *= (a); } #define bli_zzscals( a, y ) { (y) *= (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscals( a, y ) bli_ssscals( a, y ) #define bli_dscals( a, y ) bli_ddscals( a, y ) #define bli_cscals( a, y ) bli_ccscals( a, y ) #define bli_zscals( a, y ) bli_zzscals( a, y ) #endif // end bli_scals.h // begin bli_scaljs.h #ifndef BLIS_SCALJS_H #define BLIS_SCALJS_H // scaljs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscaljs( a, y ) 
bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) { (y) *= (a); } #define bli_dcscaljs( a, y ) { (y) *= (a); } #define bli_ccscaljs( a, y ) { (y) *= conjf(a); } #define bli_zcscaljs( a, y ) { (y) *= conj (a); } #define bli_szscaljs( a, y ) { (y) *= (a); } #define bli_dzscaljs( a, y ) { (y) *= (a); } #define bli_czscaljs( a, y ) { (y) *= conjf(a); } #define bli_zzscaljs( a, y ) { (y) *= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscaljs( a, y ) bli_ssscaljs( a, y ) #define bli_dscaljs( a, y ) bli_ddscaljs( a, y ) #define bli_cscaljs( a, y ) bli_ccscaljs( a, y ) #define bli_zscaljs( a, y ) bli_zzscaljs( a, y ) #endif // end bli_scaljs.h // begin bli_scalcjs.h #ifndef BLIS_SCALCJS_H #define BLIS_SCALCJS_H // scalcjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// The helper is selected by the type of y; conjx is forwarded as the first
// argument and x is passed as its real/imaginary components read with the
// accessors for x's own type.

#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex y with C99 complex: y is multiplied in place by x. For a
// real-typed x (s or d) conjx is not consulted; for a complex-typed x the
// conjugate (conjf for c, conj for z) is used when bli_is_conj( conjx )
// holds, and x itself otherwise.

#define bli_scscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#define bli_szscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: x and y share one type.

#define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y )
#define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y )
#define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y )
#define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y )

#endif
// end bli_scalcjs.h
// begin bli_scal2s.h

#ifndef BLIS_SCAL2S_H
#define BLIS_SCAL2S_H

// scal2s

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// Helper selection in the tables below (a, x and y are always passed as
// real/imaginary components read with the accessors for their own types):
// - real y (s or d): bli_roscal2ris when both a and x are complex-typed,
//   bli_rxscal2ris otherwise.
// - complex y (c or z), without C99 complex:
//     a real,    x real    -> bli_rxscal2ris
//     a complex, x real    -> bli_rcscal2ris
//     a real,    x complex -> bli_crscal2ris
//     a complex, x complex -> bli_cxscal2ris
// - complex y with C99 complex: y is assigned the native product (a) * (x).

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sczscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a, x and y share one type.

#define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y )
#define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y )
#define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y )
#define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y )

#endif
// end bli_scal2s.h
// begin bli_scal2js.h

#ifndef BLIS_SCAL2JS_H
#define BLIS_SCAL2JS_H

// scal2js

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------

// scal2js dispatch table, single-precision real destination y.
// Every macro below splits a, x and y into (real, imag) pairs with the
// bli_?real()/bli_?imag() accessors and forwards the six components to a
// *scal2jris macro that is defined outside this section.
// Visible selection rule for a real y: bli_roscal2jris is chosen only when
// both a and x are complex (c or z); all other combinations use
// bli_rxscal2jris.
// NOTE(review): the C99 variants of the complex-destination macros compute
// y = a * conj(x), so the "j" presumably denotes conjugation of x -- confirm
// against the *scal2jris definitions.

#define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

// Same table for a double-precision real destination y; the kernel
// selection rule is identical to the (??s) group above.

#define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Complex destinations have two implementations: component-wise kernels when
// C99 complex support is disabled (this branch), and native complex
// arithmetic otherwise (the #else branch further down).
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

// Visible selection rule for a complex y:
//   a real,    x real    -> bli_rxscal2jris
//   a complex, x real    -> bli_rcscal2jris
//   a real,    x complex -> bli_crscal2jris
//   a complex, x complex -> bli_cxscal2jris

#define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

// Double-precision complex destination; same selection rule as (??c).

#define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
// Remaining (??z) entries of the non-C99 scal2js table: complex x with a
// double-precision complex destination y. A real a selects bli_crscal2jris,
// a complex a selects bli_cxscal2jris.
#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 branch: the operands are used directly in native arithmetic, so each
// macro is a single assignment y = a * x or y = a * conj(x).
// The conjugation call follows the type of x: none for real x (s, d),
// conjf() for single-precision complex x (c), conj() for double-precision
// complex x (z).

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); }

#define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); }

#define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); }

#define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type shorthands: a, x and y all share the type named by the prefix.
#define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y )
#define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y )
#define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y )
#define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y )

#endif
// end bli_scal2js.h
// begin bli_set0s.h

#ifndef BLIS_SET0S_H
#define BLIS_SET0S_H

// set0s: forwards a to bli_?sets with both leading constants equal to zero.
// Float literals (0.0F) are used for s/c and double literals for d/z.
// NOTE(review): the two constants are presumably the real and imaginary
// parts assigned to a -- confirm against the bli_?sets definitions.
#define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) )
#define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) )
#define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) )
#define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) )

#endif
// end bli_set0s.h
// begin bli_set1s.h

#ifndef BLIS_SET1S_H
#define BLIS_SET1S_H

// set1s: forwards a to bli_?sets with the constants (1.0, 0.0).
// NOTE(review): presumably sets a to one with a zero imaginary part --
// confirm against the bli_?sets definitions.
#define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) )
#define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) )
#define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) )
#define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) )
#endif
// end bli_set1s.h
// begin bli_seti0s.h

#ifndef BLIS_SETI0S_H
#define BLIS_SETI0S_H

// seti0s: forwards a to bli_?setis with a zero constant (float literal for
// s/c, double literal for d/z).
// NOTE(review): presumably zeroes only the imaginary part of a -- confirm
// against the bli_?setis definitions.
#define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) )
#define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) )
#define bli_cseti0s( a ) bli_csetis( 0.0F, (a) )
#define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) )

#endif
// end bli_seti0s.h
// begin bli_sqrt2s.h

#ifndef BLIS_SQRT2S_H
#define BLIS_SQRT2S_H

// sqrt2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Component-wise branch: x and a are split into (real, imag) pairs and
// handed to a *sqrt2ris macro defined outside this section. The target
// macro is chosen from the types as follows:
//   a is s            -> bli_ssqrt2ris
//   a is d            -> bli_dsqrt2ris
//   a is c, x real    -> bli_scsqrt2ris
//   a is c, x complex -> bli_csqrt2ris
//   a is z, x real    -> bli_dzsqrt2ris
//   a is z, x complex -> bli_zsqrt2ris

#define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) )
#define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) )
#define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) )
#define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) )

#define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) )
#define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) )

#define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 branch: the square root is taken with the libm routine matching the
// type of x (sqrtf / sqrt for real x, csqrtf / csqrt for complex x) and the
// result is cast to the type of a.
// For complex x with real a (cs, zs, cd, zd) the value stored is the real
// part of the complex square root.
// NOTE(review): in the non-C99 branch those same four macros go through the
// real-destination macros bli_ssqrt2ris / bli_dsqrt2ris; whether both
// branches yield the same value for x with a nonzero imaginary part depends
// on those definitions, which are not visible here -- confirm.

#define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; }
#define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; }
#define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); }
#define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); }

#define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; }
#define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; }
#define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); }
#define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); }

#define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; }
#define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; }
#define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; }
#define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; }

#define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; }
#define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; }
#define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; }
#define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type shorthands: x and a share the type named by the prefix.
#define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a )
#define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a )
#define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a )
#define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a )

#endif
// end bli_sqrt2s.h
// begin bli_subs.h

#ifndef BLIS_SUBS_H
#define BLIS_SUBS_H

// subs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) { (y) -= (a); } #define bli_dcsubs( a, y ) { (y) -= (a); } #define bli_ccsubs( a, y ) { (y) -= (a); } #define bli_zcsubs( a, y ) { (y) -= (a); } #define bli_szsubs( a, y ) { (y) -= (a); } #define bli_dzsubs( a, y ) { (y) -= (a); } #define bli_czsubs( a, y ) { (y) -= (a); } #define bli_zzsubs( a, y ) { (y) -= (a); } #endif // 
BLIS_ENABLE_C99_COMPLEX #define bli_ssubs( a, y ) bli_sssubs( a, y ) #define bli_dsubs( a, y ) bli_ddsubs( a, y ) #define bli_csubs( a, y ) bli_ccsubs( a, y ) #define bli_zsubs( a, y ) bli_zzsubs( a, y ) #endif // end bli_subs.h // begin bli_subjs.h #ifndef BLIS_SUBJS_H #define BLIS_SUBJS_H // subjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubjs( a, y ) bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), 
bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) { (y) -= (a); } #define bli_dcsubjs( a, y ) { (y) -= (a); } #define bli_ccsubjs( a, y ) { (y) -= conjf(a); } #define bli_zcsubjs( a, y ) { (y) -= conj (a); } #define bli_szsubjs( a, y ) { (y) -= (a); } #define bli_dzsubjs( a, y ) { (y) -= (a); } #define bli_czsubjs( a, y ) { (y) -= conjf(a); } #define bli_zzsubjs( a, y ) { (y) -= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssubjs( a, y ) bli_sssubjs( a, y ) #define bli_dsubjs( a, y ) bli_ddsubjs( a, y ) #define bli_csubjs( a, y ) bli_ccsubjs( a, y ) #define bli_zsubjs( a, y ) bli_zzsubjs( a, y ) #endif // end bli_subjs.h // begin bli_swaps.h #ifndef BLIS_SWAPS_H #define BLIS_SWAPS_H // swaps // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_ssswaps( x, y ) \ { \ float w; \ bli_sscopys( (y), (w) ); \ bli_sscopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dsswaps( x, y ) \ { \ double w; \ bli_sdcopys( (y), (w) ); \ bli_dscopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_csswaps( x, y ) \ { \ scomplex w; \ bli_sccopys( (y), (w) ); \ bli_cscopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zsswaps( x, y ) \ { \ dcomplex w; \ bli_szcopys( (y), (w) ); \ bli_zscopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sdswaps( x, y ) \ { \ float w; \ bli_dscopys( (y), (w) ); \ bli_sdcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_ddswaps( x, y ) \ { \ double w; \ bli_ddcopys( (y), (w) ); \ bli_ddcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_cdswaps( x, y ) \ { \ scomplex w; \ bli_dccopys( (y), (w) ); \ bli_cdcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zdswaps( x, y ) \ { \ dcomplex w; \ bli_dzcopys( (y), (w) ); \ bli_zdcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_scswaps( x, y ) \ { \ float w; \ bli_cscopys( (y), (w) ); \ bli_sccopys( (x), (y) ); \ bli_sscopys( (w), 
(x) ); \ } #define bli_dcswaps( x, y ) \ { \ double w; \ bli_cdcopys( (y), (w) ); \ bli_dccopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_ccswaps( x, y ) \ { \ scomplex w; \ bli_cccopys( (y), (w) ); \ bli_cccopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zcswaps( x, y ) \ { \ dcomplex w; \ bli_czcopys( (y), (w) ); \ bli_zccopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_szswaps( x, y ) \ { \ float w; \ bli_zscopys( (y), (w) ); \ bli_szcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dzswaps( x, y ) \ { \ double w; \ bli_zdcopys( (y), (w) ); \ bli_dzcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_czswaps( x, y ) \ { \ scomplex w; \ bli_zccopys( (y), (w) ); \ bli_czcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zzswaps( x, y ) \ { \ dcomplex w; \ bli_zzcopys( (y), (w) ); \ bli_zzcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sswaps( x, y ) bli_ssswaps( x, y ) #define bli_dswaps( x, y ) bli_ddswaps( x, y ) #define bli_cswaps( x, y ) bli_ccswaps( x, y ) #define bli_zswaps( x, y ) bli_zzswaps( x, y ) #endif // end bli_swaps.h // begin bli_xpbys.h #ifndef BLIS_XPBYS_H #define BLIS_XPBYS_H // xpbys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), 
bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) 
// Remaining entries of the (??d) group: dcomplex b, double-precision real y.
// As with every real-y entry visible here, they forward to bli_rxxpbyris.
#define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )

// The complex-y variants have two implementations. Without C99 complex
// support each macro splits its operands into real/imaginary parts and
// forwards them to one of three "ris" helpers (defined elsewhere):
//   - b real,    x real    -> bli_rxxpbyris
//   - b real,    x complex -> bli_crxpbyris
//   - b complex, any x     -> bli_cxxpbyris
// With C99 complex support the update is written directly as
// y = x + b * y using the native complex operators.

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )

#define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )

#define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zccxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )

#define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zszxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// All 32 variants share one body here; the C compiler's usual arithmetic
// conversions handle the mixed real/complex and mixed-precision operands.

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-char shorthands: x, b and y all share one datatype.
#define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y )
#define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y )
#define bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y )
#define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y )

#endif
// end bli_xpbys.h
// begin bli_xpbyjs.h

#ifndef BLIS_XPBYJS_H
#define BLIS_XPBYJS_H

// xpbyjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; 
++ii ) bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); 
} else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( 
dim_t jj = 0; jj < n; ++jj ) bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } 
else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( 
dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end bli_copys_mxn.h // begin bli_scal2s_mxn.h #ifndef BLIS_SCAL2S_MXN_H #define BLIS_SCAL2S_MXN_H // scal2s_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \ ctype* restrict y, const inc_t rs_y, const inc_t cs_y \ ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( scal2s_mxn ) #endif // end bli_scal2s_mxn.h // begin bli_xpbys_mxn.h #ifndef BLIS_XPBYS_MXN_H #define BLIS_XPBYS_MXN_H // xpbys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (?ss) ------------------------------------------------------------ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?dd) ------------------------------------------------------------ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?cc) ------------------------------------------------------------ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?zz) ------------------------------------------------------------ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } #endif // end bli_xpbys_mxn.h // begin bli_xpbys_mxn_uplo.h #ifndef BLIS_XPBYS_MXN_UPLO_H #define BLIS_XPBYS_MXN_UPLO_H // xpbys_mxn_u #define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 
0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } // xpbys_mxn_l #define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } 
// Lower-stored xpbys over an m x n block (ccc variant). Only elements whose
// column-minus-row offset (j - i) is <= diagoff are visited. x and y are
// addressed as base + i*rs + j*cs; beta is a pointer and is dereferenced.
// The expansion declares _i and _j itself and evaluates its arguments more
// than once, so arguments should be free of side effects.
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
	/* When *beta tests as zero, x is copied over y instead of being combined with it. */ \
	if ( bli_ceq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_cccopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

// Same as bli_cccxpbys_mxn_l, using the zzz element helpers
// (bli_zeq0 / bli_zzcopys / bli_zzzxpbys).
#define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
	/* When *beta tests as zero, x is copied over y instead of being combined with it. */ \
	if ( bli_zeq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

// Single-type-character shorthands: each forwards all of its arguments,
// unchanged, to the variant whose three type characters are all the same.

#define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}

#endif
// end bli_xpbys_mxn_uplo.h

// -- "broadcast B" scalar macros --

// begin bli_bcastbbs_mxn.h

#ifndef BLIS_BCASTBBS_MXN_H
#define BLIS_BCASTBBS_MXN_H

// bcastbbs_mxn
//
// Function template: for every element y(i,j) = *(y + i*incy + j*ldy) of an
// m x n block, copy that element into the d-1 unit-stride slots that follow
// it (p = 1 .. d-1). Slot 0, the element itself, is left untouched.

#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* Here the duplication count d is taken from ldy (scal2bbs_mxn and set0bbs_mxn take it from incy). */ \
	const dim_t d = ldy; \
	const dim_t ds_y = 1; \
\
	for ( dim_t i = 0; i < m; ++i ) \
	{ \
		ctype* restrict yi = y + i*incy; \
\
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype* restrict yij = yi + j*ldy; \
\
			for ( dim_t p = 1; p < d; ++p ) \
			{ \
				ctype* restrict yijd = yij + p*ds_y; \
\
				PASTEMAC(ch,copys)( *yij, *yijd ); \
			} \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( bcastbbs_mxn )

#endif
// end bli_bcastbbs_mxn.h
// begin bli_scal2bbs_mxn.h

#ifndef BLIS_SCAL2BBS_MXN_H
#define BLIS_SCAL2BBS_MXN_H

// scal2bbs_mxn
//
// Function template instantiated through INSERT_GENTFUNCRO_BASIC0: for each
// element of the m x n block, call scal2s (or scal2js when conjx is a
// conjugation) with *alpha, x(i,j) and y(i,j), then copy y(i,j) into the d-1
// unit-stride slots that follow it. x(i,j) is *(x + i*incx + j*ldx) and
// y(i,j) is *(y + i*incy + j*ldy).

#undef GENTFUNCRO
#define GENTFUNCRO( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t conjx, \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, const inc_t incx, const inc_t ldx, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* The duplication count d is taken from incy. */ \
	const dim_t d = incy; \
	const dim_t ds_y = 1; \
\
	if ( bli_is_conj( conjx ) ) \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype* restrict xj = x + j*ldx; \
			ctype* restrict yj = y + j*ldy; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype* restrict xij = xj + i*incx; \
				ctype* restrict yij = yj + i*incy; \
\
				PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \
\
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype* restrict yijd = yij + p*ds_y; \
\
					PASTEMAC(ch,copys)( *yij, *yijd ); \
				} \
			} \
		} \
	} \
	else \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype* restrict xj = x + j*ldx; \
			ctype* restrict yj = y + j*ldy; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype* restrict xij = xj + i*incx; \
				ctype* restrict yij = yj + i*incy; \
\
				PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \
\
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype* restrict yijd = yij + p*ds_y; \
\
					PASTEMAC(ch,copys)( *yij, *yijd ); \
				} \
			} \
		} \
	} \
}

INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn )

// Function template instantiated through INSERT_GENTFUNCCO_BASIC0. alpha, x
// and y are reinterpreted as arrays of ctype_r and the real and imaginary
// parts are addressed through separate pointers. In x and alpha the imaginary
// part is at offset 1 from the real part; in y it is at offset d from the
// real part. Each part is then copied into the d-1 unit-stride slots that
// follow it.

#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t conjx, \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, const inc_t incx, const inc_t ldx, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* The duplication count d is taken from incy. */ \
	const dim_t d = incy; \
	const dim_t ds_y = 1; \
\
	const inc_t incx2 = 2 * incx; \
	const inc_t ldx2 = 2 * ldx; \
\
	const inc_t incy2 = 2 * incy; \
	const inc_t ldy2 = 2 * ldy; \
\
	ctype_r* restrict alpha_r = ( ctype_r* )alpha; \
	ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \
	ctype_r* restrict chi_r = ( ctype_r* )x; \
	ctype_r* restrict chi_i = ( ctype_r* )x + 1; \
	ctype_r* restrict psi_r = ( ctype_r* )y; \
	ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \
\
	if ( bli_is_conj( conjx ) ) \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype_r* restrict chij_r = chi_r + j*ldx2; \
			ctype_r* restrict chij_i = chi_i + j*ldx2; \
			ctype_r* restrict psij_r = psi_r + j*ldy2; \
			ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype_r* restrict chiij_r = chij_r + i*incx2; \
				ctype_r* restrict chiij_i = chij_i + i*incx2; \
				ctype_r* restrict psiij_r = psij_r + i*incy2; \
				ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
				PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \
				                        *chiij_r, *chiij_i, \
				                        *psiij_r, *psiij_i ); \
\
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
					ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
					PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \
					                      *psiijd_r, *psiijd_i ); \
				} \
			} \
		} \
	} \
	else \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype_r* restrict chij_r = chi_r + j*ldx2; \
			ctype_r* restrict chij_i = chi_i + j*ldx2; \
			ctype_r* restrict psij_r = psi_r + j*ldy2; \
			ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype_r* restrict chiij_r = chij_r + i*incx2; \
				ctype_r* restrict chiij_i = chij_i + i*incx2; \
				ctype_r* restrict psiij_r = psij_r + i*incy2; \
				ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
				PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \
				                       *chiij_r, *chiij_i, \
				                       *psiij_r, *psiij_i ); \
\
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
					ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
					PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \
					                      *psiijd_r, *psiijd_i ); \
				} \
			} \
		} \
	} \
}

INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn )

#endif
// end bli_scal2bbs_mxn.h
// begin bli_set0bbs_mxn.h

#ifndef BLIS_SET0BBS_MXN_H
#define BLIS_SET0BBS_MXN_H

// set0bbs_mxn
//
// Function template: for every element y(i,j) = *(y + i*incy + j*ldy) of an
// m x n block, apply set0s to all d unit-stride slots starting at that
// element (p = 0 .. d-1, so the element itself is included).

#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* The duplication count d is taken from incy. */ \
	const dim_t d = incy; \
	const dim_t ds_y = 1; \
\
	for ( dim_t j = 0; j < n; ++j ) \
	{ \
		ctype* restrict yj = y + j*ldy; \
\
		for ( dim_t i = 0; i < m; ++i ) \
		{ \
			ctype* restrict yij = yj + i*incy; \
\
			for ( dim_t p = 0; p < d; ++p ) \
			{ \
				ctype* restrict yijd = yij + p*ds_y; \
\
				PASTEMAC(ch,set0s)( *yijd ); \
			} \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( set0bbs_mxn )

#endif
// end bli_set0bbs_mxn.h

// -- 1m-specific scalar macros --

// 1e
// begin bli_copy1es.h

#ifndef BLIS_COPY1ES_H
#define BLIS_COPY1ES_H

// copy1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// copy1es( a, bri, bir ): in the macros below the first type character
// selects the accessors applied to a, and the second selects those applied
// to bri and bir.
//
// Variants whose first or second type character is real (s or d) expand to
// an empty block and do nothing.

#define bli_sscopy1es( a, bri, bir ) {}
#define bli_dscopy1es( a, bri, bir ) {}
#define bli_cscopy1es( a, bri, bir ) {}
#define bli_zscopy1es( a, bri, bir ) {}

#define bli_sdcopy1es( a, bri, bir ) {}
#define bli_ddcopy1es( a, bri, bir ) {}
#define bli_cdcopy1es( a, bri, bir ) {}
#define bli_zdcopy1es( a, bri, bir ) {}

// Complex-to-complex variants issue two copyris calls:
//   first:  sources (  real(a), imag(a) ), destinations ( real(bri), imag(bri) )
//   second: sources ( -imag(a), real(a) ), destinations ( real(bir), imag(bir) )
// Assuming copyris( ar, ai, br, bi ) assigns br := ar and bi := ai (it is
// defined elsewhere), bri receives a and bir receives the components of i*a.

#define bli_sccopy1es( a, bri, bir ) {}
#define bli_dccopy1es( a, bri, bir ) {}
#define bli_cccopy1es( a, bri, bir ) \
{ \
	bli_cccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopy1es( a, bri, bir ) \
{ \
	bli_zccopyris( bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopy1es( a, bri, bir ) {}
#define bli_dzcopy1es( a, bri, bir ) {}
#define bli_czcopy1es( a, bri, bir ) \
{ \
	bli_czcopyris( bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopy1es( a, bri, bir ) \
{ \
	bli_zzcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-type-character shorthands for the same-type variants.
#define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir )
#define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir )

#endif
// end bli_copy1es.h
// begin bli_copyj1es.h

#ifndef BLIS_COPYJ1ES_H
#define BLIS_COPYJ1ES_H

// copyj1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// copyj1es( a, bri, bir ): in the macros below the first type character
// selects the accessors applied to a, and the second selects those applied
// to bri and bir.
//
// Variants whose first or second type character is real (s or d) expand to
// an empty block and do nothing.

#define bli_sscopyj1es( a, bri, bir ) {}
#define bli_dscopyj1es( a, bri, bir ) {}
#define bli_cscopyj1es( a, bri, bir ) {}
#define bli_zscopyj1es( a, bri, bir ) {}

#define bli_sdcopyj1es( a, bri, bir ) {}
#define bli_ddcopyj1es( a, bri, bir ) {}
#define bli_cdcopyj1es( a, bri, bir ) {}
#define bli_zdcopyj1es( a, bri, bir ) {}

// Complex-to-complex variants issue two copyris calls:
//   first:  sources ( real(a), -imag(a) ), destinations ( real(bri), imag(bri) )
//   second: sources ( imag(a),  real(a) ), destinations ( real(bir), imag(bir) )
// Assuming copyris( ar, ai, br, bi ) assigns br := ar and bi := ai (it is
// defined elsewhere), bri receives conj(a) and bir receives the components
// of i*conj(a).

#define bli_sccopyj1es( a, bri, bir ) {}
#define bli_dccopyj1es( a, bri, bir ) {}
#define bli_cccopyj1es( a, bri, bir ) \
{ \
	bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_cccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopyj1es( a, bri, bir ) \
{ \
	bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_zccopyris( bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopyj1es( a, bri, bir ) {}
#define bli_dzcopyj1es( a, bri, bir ) {}
#define bli_czcopyj1es( a, bri, bir ) \
{ \
	bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_czcopyris( bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopyj1es( a, bri, bir ) \
{ \
	bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_zzcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-type-character shorthands for the same-type variants.
#define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir )
#define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir )

#endif
// end bli_copyj1es.h
// begin bli_invert1es.h

#ifndef BLIS_INVERT1ES_H
#define BLIS_INVERT1ES_H

// invert1es
//
// invert1es( bri, bir ): calls invertris on the real/imag components of bri,
// then fills bir from the updated bri. Note the destination order in the
// copyris call: the first source, real(bri), is paired with imag(bir) and the
// second source, -imag(bri), is paired with real(bir).

#define bli_cinvert1es( bri, bir ) \
{ \
	bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \
	bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \
}

#define bli_zinvert1es( bri, bir ) \
{ \
	bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \
	bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \
}

#endif
// end bli_invert1es.h
// begin bli_scal1es.h

#ifndef BLIS_SCAL1ES_H
#define BLIS_SCAL1ES_H

// scal1es
//
// scal1es( a, yri, yir ): calls scalris with the components of a and of yri,
// then fills yir from yri using sources ( -imag(yri), real(yri) ) and
// destinations ( real(yir), imag(yir) ).

#define bli_cscal1es( a, yri, yir ) \
{ \
	bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \
	bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \
}

#define bli_zscal1es( a, yri, yir ) \
{ \
	bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \
	bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \
}

#endif
// end bli_scal1es.h
// begin bli_scal21es.h

#ifndef BLIS_SCAL21ES_H
#define BLIS_SCAL21ES_H

// scal21es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
//
// Every variant with a real third character (s or d) expands to an empty
// block, as does every variant in which both a and x are real. Each remaining
// variant issues two bli_cxscal2ris calls with the components of a:
//   first:  x passed as (  real(x), imag(x) ), output ( real(yri), imag(yri) )
//   second: x passed as ( -imag(x), real(x) ), output ( real(yir), imag(yir) )
// The second call therefore passes the components of i*x in place of x.
//
// NOTE(review): every non-empty variant calls bli_cxscal2ris and accesses
// yri/yir through bli_zreal/bli_zimag, including the (??c) variants --
// confirm those helpers are type-agnostic.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal21es( a, x, yri, yir ) {}
#define bli_sdsscal21es( a, x, yri, yir ) {}
#define bli_scsscal21es( a, x, yri, yir ) {}
#define bli_szsscal21es( a, x, yri, yir ) {}

#define bli_dssscal21es( a, x, yri, yir ) {}
#define bli_ddsscal21es( a, x, yri, yir ) {}
#define bli_dcsscal21es( a, x, yri, yir ) {}
#define bli_dzsscal21es( a, x, yri, yir ) {}

#define bli_cssscal21es( a, x, yri, yir ) {}
#define bli_cdsscal21es( a, x, yri, yir ) {}
#define bli_ccsscal21es( a, x, yri, yir ) {}
#define bli_czsscal21es( a, x, yri, yir ) {}

#define bli_zssscal21es( a, x, yri, yir ) {}
#define bli_zdsscal21es( a, x, yri, yir ) {}
#define bli_zcsscal21es( a, x, yri, yir ) {}
#define bli_zzsscal21es( a, x, yri, yir ) {}

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal21es( a, x, yri, yir ) {}
#define bli_sddscal21es( a, x, yri, yir ) {}
#define bli_scdscal21es( a, x, yri, yir ) {}
#define bli_szdscal21es( a, x, yri, yir ) {}

#define bli_dsdscal21es( a, x, yri, yir ) {}
#define bli_dddscal21es( a, x, yri, yir ) {}
#define bli_dcdscal21es( a, x, yri, yir ) {}
#define bli_dzdscal21es( a, x, yri, yir ) {}

#define bli_csdscal21es( a, x, yri, yir ) {}
#define bli_cddscal21es( a, x, yri, yir ) {}
#define bli_ccdscal21es( a, x, yri, yir ) {}
#define bli_czdscal21es( a, x, yri, yir ) {}

#define bli_zsdscal21es( a, x, yri, yir ) {}
#define bli_zddscal21es( a, x, yri, yir ) {}
#define bli_zcdscal21es( a, x, yri, yir ) {}
#define bli_zzdscal21es( a, x, yri, yir ) {}

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal21es( a, x, yri, yir ) {}
#define bli_sdcscal21es( a, x, yri, yir ) {}
#define bli_sccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dscscal21es( a, x, yri, yir ) {}
#define bli_ddcscal21es( a, x, yri, yir ) {}
#define bli_dccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cscscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zscscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal21es( a, x, yri, yir ) {}
#define bli_sdzscal21es( a, x, yri, yir ) {}
#define bli_sczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dszscal21es( a, x, yri, yir ) {}
#define bli_ddzscal21es( a, x, yri, yir ) {}
#define bli_dczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cszscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zszscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// Single-type-character shorthands for the same-type variants.
#define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir )
#define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir )

#endif
// end bli_scal21es.h
// begin bli_scal2j1es.h

#ifndef BLIS_SCAL2J1ES_H
#define BLIS_SCAL2J1ES_H

// scal2j1es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) #define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) #endif // end bli_scal2j1es.h // 1r // begin bli_copy1rs.h #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copy1rs.h // begin bli_copyj1rs.h #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copyj1rs.h // begin bli_invert1rs.h #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif // end bli_invert1rs.h // begin bli_scal1rs.h #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), 
bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif // end bli_scal1rs.h // begin bli_scal21rs.h #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif // end bli_scal21rs.h // begin bli_scal2j1rs.h #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif // end bli_scal2j1rs.h // 1m (1e or 1r) // begin bli_invert1ms_mxn_diag.h #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t 
i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_invert1ms_mxn_diag.h // begin bli_scal1ms_mxn.h #ifndef BLIS_SCAL1MS_MXN_H #define BLIS_SCAL1MS_MXN_H // scal1ms_mxn #define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; 
\ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #endif // end bli_scal1ms_mxn.h // begin bli_scal21ms_mxn.h #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } 
else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), 
*(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif // end bli_scal21ms_mxn.h // begin bli_scal21ms_mxn_diag.h #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_scal21ms_mxn_diag.h // begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING //thread communicators may be implementation dependent #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_single.h // begin bli_thrcomm_openmp.h #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H // Define thrcomm_t for situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #include // skipped // Define thrcomm_t for tree barriers and non-tree barriers. #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. 
//volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif // end bli_thrcomm_openmp.h // begin bli_thrcomm_pthreads.h #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H // Define thrcomm_t for situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS #ifdef BLIS_USE_PTHREAD_BARRIER struct thrcomm_s { void* sent_object; dim_t n_threads; bli_pthread_barrier_t barrier; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_pthreads.h // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. 
// Lifetime management of thrcomm_t objects. The definitions live outside
// this header; only the signatures are visible here.
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads );
void       bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
void       bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm );
void       bli_thrcomm_cleanup( thrcomm_t* comm );

// Barrier and broadcast over a communicator. These two are the only
// thrcomm functions declared with BLIS_EXPORT_BLIS.
BLIS_EXPORT_BLIS void  bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );

void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );

#endif
// end bli_thrcomm.h

// Include thread info (thrinfo_t) object definitions and prototypes.
// begin bli_thrinfo.h

#ifndef BLIS_THRINFO_H
#define BLIS_THRINFO_H

// Thread info structure definition
struct thrinfo_s
{
	// The thread communicator for the other threads sharing the same work
	// at this level.
	thrcomm_t* ocomm;

	// Our thread id within the ocomm thread communicator.
	dim_t ocomm_id;

	// The number of distinct threads used to parallelize the loop.
	dim_t n_way;

	// What we're working on.
	dim_t work_id;

	// When freeing, should the communicators in this node be freed? Usually,
	// this field is true, but when nodes are created that share the same
	// communicators as other nodes (such as with packm nodes), this is set
	// to false.
	bool free_comm;

	// The bszid_t to help identify the node. This is mostly only useful when
	// debugging or tracing the allocation and release of thrinfo_t nodes.
	bszid_t bszid;

	// Links to further thrinfo_t nodes; read and written through the
	// bli_thrinfo_sub_prenode()/bli_thrinfo_sub_node() accessors and their
	// setters. NOTE(review): presumably the child nodes of this node in the
	// thrinfo_t tree -- confirm against the code that builds the tree.
	struct thrinfo_s* sub_prenode;
	struct thrinfo_s* sub_node;
};
typedef struct thrinfo_s thrinfo_t;

//
// thrinfo_t functions
// NOTE: The naming of these should be made consistent at some point.
// (ie: bli_thrinfo_ vs.
bli_thread_) // // thrinfo_t query (field only) BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) { return (t->ocomm)->n_threads; } BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) { return t->ocomm_id; } BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) { return t->n_way; } BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t ) { return t->work_id; } BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) { return t->ocomm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) { return t->free_comm; } BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) { return t->bszid; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) { return t->sub_node; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) { return t->ocomm_id == 0; } // thrinfo_t modification BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) { t->ocomm = ocomm; } BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) { t->ocomm_id = ocomm_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) { t->n_way = n_way; } BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t ) { t->work_id = work_id; } BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) { t->free_comm = free_comm; } BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) { t->bszid = bszid; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) { t->sub_node = sub_node; } BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) { t->sub_prenode = sub_prenode; } // other thrinfo_t-related functions BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } // // Prototypes 
// for level-3 thrinfo functions not specific to any operation.
//

// Takes one value for every scalar field of thrinfo_t (ocomm, ocomm_id,
// n_way, work_id, free_comm, bszid) plus sub_node, and returns a thrinfo_t
// pointer. NOTE(review): presumably allocates the node via rntm -- the
// definition is not visible in this header.
thrinfo_t* bli_thrinfo_create
     (
       rntm_t*    rntm,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

// Same field arguments as bli_thrinfo_create(), but operating on a
// caller-supplied thrinfo_t instead of returning one.
void bli_thrinfo_init
     (
       thrinfo_t* thread,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init_single
     (
       thrinfo_t* thread
     );

void bli_thrinfo_free
     (
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// -----------------------------------------------------------------------------

// Functions that take control-tree (cntl_t) nodes alongside thrinfo_t nodes.
// The "_prenode" variants mirror the plain ones with identical signatures.
void bli_thrinfo_grow
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_rgrow
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_rgrow_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

// -----------------------------------------------------------------------------

// The following prototypes are compiled out.
#if 0
void bli_thrinfo_grow_tree
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

void bli_thrinfo_grow_tree_ic
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );
#endif

#endif
// end bli_thrinfo.h
// begin bli_thrinfo_sup.h

#ifndef BLIS_THRINFO_SUP_H
#define BLIS_THRINFO_SUP_H

//
// Prototypes for level-3 thrinfo sup functions.
//

// These parallel bli_thrinfo_grow(), bli_thrinfo_rgrow() and
// bli_thrinfo_create_for_cntl() above, but take bszid_t pointers where those
// take cntl_t pointers.
void bli_thrinfo_sup_grow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_sup_rgrow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_sup_create_for_cntl
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_chl,
       thrinfo_t* thread_par
     );

#endif
// end bli_thrinfo_sup.h

// Include some operation-specific thrinfo_t prototypes.
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// Include definitions specific to the method of multithreading for the
// conventional code path.
// begin bli_l3_decor_single.h


#ifndef BLIS_L3_DECOR_SINGLE_H
#define BLIS_L3_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (This conditional section currently contains no declarations.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif

// end bli_l3_decor_single.h
// begin bli_l3_decor_openmp.h


#ifndef BLIS_L3_DECOR_OPENMP_H
#define BLIS_L3_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

void bli_l3_thread_decorator_thread_check
     (
       dim_t      n_threads,
       dim_t      tid,
       thrcomm_t* gl_comm,
       rntm_t*    rntm
     );

#endif

#endif

// end bli_l3_decor_openmp.h
// begin bli_l3_decor_pthreads.h


#ifndef BLIS_L3_DECOR_PTHREADS_H
#define BLIS_L3_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
void* bli_l3_thread_entry( void* data_void );

#endif

#endif

// end bli_l3_decor_pthreads.h

#endif

// end bli_l3_decor.h

// Include the level-3 thread decorator and related definitions and prototypes
// for the sup code path.
// begin bli_l3_sup_decor.h


#ifndef BLIS_L3_SUP_DECOR_H
#define BLIS_L3_SUP_DECOR_H

// -- sup definitions ----------------------------------------------------------

// Level-3 sup internal function type.
// Unlike l3int_t, this type returns err_t and takes no cntl_t* argument.
typedef err_t (*l3supint_t)
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// Level-3 sup thread decorator prototype.
err_t bli_l3_sup_thread_decorator
     (
       l3supint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     );

// Include definitions specific to the method of multithreading for the
// sup code path.
// begin bli_l3_sup_decor_single.h


#ifndef BLIS_L3_SUP_DECOR_SINGLE_H
#define BLIS_L3_SUP_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (This conditional section currently contains no declarations.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif

// end bli_l3_sup_decor_single.h
// begin bli_l3_sup_decor_openmp.h


#ifndef BLIS_L3_SUP_DECOR_OPENMP_H
#define BLIS_L3_SUP_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
// (This conditional section currently contains no declarations.)
#ifdef BLIS_ENABLE_OPENMP

#endif

#endif

// end bli_l3_sup_decor_openmp.h
// begin bli_l3_sup_decor_pthreads.h


#ifndef BLIS_L3_SUP_DECOR_PTHREADS_H
#define BLIS_L3_SUP_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
void* bli_l3_sup_thread_entry( void* data_void );

#endif

#endif

// end bli_l3_sup_decor_pthreads.h

#endif

// end bli_l3_sup_decor.h

// Initialization-related prototypes.
void bli_thread_init( void );
void bli_thread_finalize( void );

// Thread range-related prototypes.
BLIS_EXPORT_BLIS
void bli_thread_range_sub
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end
     );

// GENPROT is redefined twice below; each definition stamps out one family of
// prototypes that share a parameter list. The function name is produced by
// PASTEMAC0( opname ), which is defined elsewhere in this header.
#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       dir_t      direct, \
       thrinfo_t* thr, \
       obj_t*     a, \
       obj_t*     b, \
       obj_t*     c, \
       cntl_t*    cntl, \
       cntx_t*    cntx, \
       dim_t*     start, \
       dim_t*     end \
     );

GENPROT( thread_range_mdim )
GENPROT( thread_range_ndim )

#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       thrinfo_t* thr, \
       obj_t*     a, \
       blksz_t*   bmult, \
       dim_t*     start, \
       dim_t*     end \
     );

GENPROT( thread_range_l2r )
GENPROT( thread_range_r2l )
GENPROT( thread_range_t2b )
GENPROT( thread_range_b2t )

GENPROT( thread_range_weighted_l2r )
GENPROT( thread_range_weighted_r2l )
GENPROT( thread_range_weighted_t2b )
GENPROT( thread_range_weighted_b2t )


dim_t bli_thread_range_width_l
     (
       doff_t diagoff_j,
       dim_t  m,
       dim_t  n_j,
       dim_t  j,
       dim_t  n_way,
       dim_t  bf,
       dim_t  bf_left,
       double area_per_thr,
       bool   handle_edge_low
     );

siz_t bli_find_area_trap_l
     (
       dim_t  m,
       dim_t  n,
       doff_t diagoff
     );

siz_t bli_thread_range_weighted_sub
     (
       thrinfo_t* restrict thread,
       doff_t              diagoff,
       uplo_t              uplo,
       dim_t               m,
       dim_t               n,
       dim_t               bf,
       bool                handle_edge_low,
       dim_t*     restrict j_start_thr,
       dim_t*     restrict j_end_thr
     );

// -----------------------------------------------------------------------------

// Factorization and partitioning prototypes

// State object passed to bli_prime_factorization() and
// bli_next_prime_factor().
typedef struct
{
	dim_t n;
	dim_t sqrt_n;
	dim_t f;
} bli_prime_factors_t;

void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors);

dim_t bli_next_prime_factor(bli_prime_factors_t* factors);

bool bli_is_prime( dim_t n );

void bli_thread_partition_2x2
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

void bli_thread_partition_2x2_slow
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

void bli_thread_partition_2x2_fast
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

//
----------------------------------------------------------------------------- dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); // ----------------------------------------------------------------------------- BLIS_INLINE void bli_thread_range_jrir_rr ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; } BLIS_INLINE void bli_thread_range_jrir_sl ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use contiguous slab partitioning of jr/ir loops. bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); *inc = 1; } BLIS_INLINE void bli_thread_range_jrir ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Define a general-purpose version of bli_thread_range_jrir() whose // definition depends on whether slab or round-robin partitioning was // requested at configure-time. 
#ifdef BLIS_ENABLE_JRIR_SLAB bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); #else bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); #endif } #if 0 BLIS_INLINE void bli_thread_range_weighted_jrir ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { #ifdef BLIS_ENABLE_JRIR_SLAB // Use contiguous slab partitioning for jr/ir loops. bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, handle_edge_low, start, end ); *start = *start / bf; *inc = 1; if ( *end % bf ) *end = *end / bf + 1; else *end = *end / bf; #else // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; #endif } #endif #endif // end bli_thread.h // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Constant definitions -- // begin bli_extern_defs.h #ifndef BLIS_EXTERN_DEFS_H #define BLIS_EXTERN_DEFS_H BLIS_EXPORT_BLIS extern obj_t BLIS_TWO; BLIS_EXPORT_BLIS extern obj_t BLIS_ONE; //BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO; //BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO; BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif // end bli_extern_defs.h // -- BLIS architecture/kernel definitions -- // begin bli_l1v_ker_prot.h // // Define template prototypes for level-1v kernels. 
//
// Each *_KER_PROT( ctype, ch, opname ) macro below expands to one kernel
// prototype whose name is PASTEMAC(ch,opname) and whose operand pointers have
// type ctype*. Every definition ends at its closing ");" -- no macro carries a
// trailing line continuation, so text following a definition can never be
// spliced into it.
//

#define ADDV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define AMAXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t* restrict cntx  \
     );


#define AXPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define AXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define COPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define DOTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );


#define DOTXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );


#define INVERTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );


#define SCALV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );


#define SCAL2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define SETV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );


#define SUBV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define SWAPV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define XPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// end bli_l1v_ker_prot.h
// begin bli_l1f_ker_prot.h


//
// Define template prototypes for level-1f kernels.
//

#define AXPY2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alphax, \
       ctype*  restrict alphay, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict z, inc_t incz, \
       cntx_t* restrict cntx  \
     );


#define AXPYF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conja, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define DOTAXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjxt, \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       ctype*  restrict z, inc_t incz, \
       cntx_t* restrict cntx  \
     );


#define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjat, \
       conj_t           conja, \
       conj_t           conjw, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict w, inc_t incw, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict z, inc_t incz, \
       cntx_t* restrict cntx  \
     );


#define DOTXF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjat, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// end bli_l1f_ker_prot.h
// begin bli_l1m_ker_prot.h


//
// Define template prototypes for level-1m kernels.
//

// native packm kernels

#define PACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     );


// native unpackm kernels

#define UNPACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       dim_t            n, \
       ctype*  restrict kappa, \
       ctype*  restrict p,             inc_t ldp, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       cntx_t* restrict cntx  \
     );


// 1e/1r packm kernels
// (same parameter list as PACKM_KER_PROT)

#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     );

// end bli_l1m_ker_prot.h
// begin bli_l3_ukr_prot.h


//
// Define template prototypes for level-3 micro-kernels.
//

// GEMM_UKR_PROT is the single-type form of GEMM_UKR_PROT2: it passes ctype as
// both ctype_in (used for a and b) and ctype_out (used for alpha, beta and c).
#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)

#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype_out* restrict alpha, \
       ctype_in*  restrict a, \
       ctype_in*  restrict b, \
       ctype_out* restrict beta, \
       ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );


#define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a1x, \
       ctype*     restrict a11, \
       ctype*     restrict bx1, \
       ctype*     restrict b11, \
       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );


#define TRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

// end bli_l3_ukr_prot.h
// begin bli_l3_sup_ker_prot.h


//
// Define template prototypes for level-3 kernels on small/unpacked matrices.
//

#define GEMMSUP_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

// end bli_l3_sup_ker_prot.h

// begin bli_arch_config_pre.h


#ifndef BLIS_ARCH_CONFIG_PRE_H
#define BLIS_ARCH_CONFIG_PRE_H


// -- Naming-related kernel definitions ----------------------------------------

// The default suffix appended to reference kernels.
#define BLIS_REF_SUFFIX  _ref

// A suffix used for labeling certain induced method aware functions.
#define BLIS_IND_SUFFIX  _ind

// Add an underscore to the BLIS kernel set string, if it was defined.
#ifdef  BLIS_CNAME
#define BLIS_CNAME_INFIX  PASTECH(_,BLIS_CNAME)
#endif

// Combine the CNAME and _ref for convenience to the code that defines
// reference kernels.
//#define BLIS_CNAME_REF_SUFFIX  PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX)

// -- Prototype-generating macro definitions -----------------------------------

// Prototype-generating macro for bli_cntx_init_*() functions.
// Expands to three prototypes for the given archname: the native
// initializer, the BLIS_REF_SUFFIX variant, and the BLIS_IND_SUFFIX variant
// (the last one additionally takes an ind_t method argument).
#define CNTX_INIT_PROTS( archname ) \
\
void PASTEMAC(cntx_init_,archname) \
     ( \
       cntx_t* cntx \
     ); \
void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \
     ( \
       cntx_t* cntx \
     ); \
void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \
     ( \
       ind_t   method, \
       cntx_t* cntx \
     );


#endif

// end bli_arch_config_pre.h
// begin bli_arch_config.h


#ifndef BLIS_ARCH_CONFIG_H
#define BLIS_ARCH_CONFIG_H

//
// -- Context initialization prototypes ----------------------------------------
//
// One CNTX_INIT_PROTS() instantiation per enabled BLIS_CONFIG_* macro.
//

// -- Intel64 architectures --
#ifdef BLIS_CONFIG_SKX
CNTX_INIT_PROTS( skx )
#endif
#ifdef BLIS_CONFIG_KNL
CNTX_INIT_PROTS( knl )
#endif
#ifdef BLIS_CONFIG_KNC
CNTX_INIT_PROTS( knc )
#endif
#ifdef BLIS_CONFIG_HASWELL
CNTX_INIT_PROTS( haswell )
#endif
#ifdef BLIS_CONFIG_SANDYBRIDGE
CNTX_INIT_PROTS( sandybridge )
#endif
#ifdef BLIS_CONFIG_PENRYN
CNTX_INIT_PROTS( penryn )
#endif

// -- AMD64 architectures --
#ifdef BLIS_CONFIG_ZEN3
CNTX_INIT_PROTS( zen3 )
#endif
#ifdef BLIS_CONFIG_ZEN2
CNTX_INIT_PROTS( zen2 )
#endif
#ifdef BLIS_CONFIG_ZEN
CNTX_INIT_PROTS( zen )
#endif
#ifdef BLIS_CONFIG_EXCAVATOR
CNTX_INIT_PROTS( excavator )
#endif
#ifdef BLIS_CONFIG_STEAMROLLER
CNTX_INIT_PROTS( steamroller )
#endif
#ifdef BLIS_CONFIG_PILEDRIVER
CNTX_INIT_PROTS( piledriver )
#endif
#ifdef BLIS_CONFIG_BULLDOZER
CNTX_INIT_PROTS( bulldozer )
#endif

// -- ARM architectures --
#ifdef BLIS_CONFIG_ARMSVE
CNTX_INIT_PROTS( armsve )
#endif
#ifdef BLIS_CONFIG_A64FX
CNTX_INIT_PROTS( a64fx )
#endif
#ifdef BLIS_CONFIG_FIRESTORM
CNTX_INIT_PROTS( firestorm )
#endif
#ifdef BLIS_CONFIG_THUNDERX2
CNTX_INIT_PROTS( thunderx2 )
#endif
#ifdef BLIS_CONFIG_CORTEXA57
CNTX_INIT_PROTS( cortexa57 )
#endif
#ifdef BLIS_CONFIG_CORTEXA53
CNTX_INIT_PROTS( cortexa53 )
#endif
#ifdef BLIS_CONFIG_CORTEXA15
CNTX_INIT_PROTS( cortexa15 )
#endif
#ifdef BLIS_CONFIG_CORTEXA9
CNTX_INIT_PROTS( cortexa9 )
#endif

// -- IBM Power --
#ifdef BLIS_CONFIG_POWER10
CNTX_INIT_PROTS( power10 )
#endif
#ifdef BLIS_CONFIG_POWER9
CNTX_INIT_PROTS( power9 )
#endif
#ifdef BLIS_CONFIG_POWER7
CNTX_INIT_PROTS( power7 )
#endif

// -- IBM BG/Q --
#ifdef BLIS_CONFIG_BGQ
CNTX_INIT_PROTS( bgq )
#endif

// -- Generic --
#ifdef BLIS_CONFIG_GENERIC
CNTX_INIT_PROTS( generic )
#endif


//
// -- Architecture family-specific headers -------------------------------------
//
// Family headers appear below either as an #include line or as inlined text
// between "begin"/"end" markers, each guarded by its BLIS_FAMILY_* macro.
//

// -- x86_64 families --

#ifdef BLIS_FAMILY_INTEL64
#include "bli_family_intel64.h" // skipped
#endif
#ifdef BLIS_FAMILY_AMD64
#include "bli_family_amd64.h" // skipped
#endif
#ifdef BLIS_FAMILY_AMD64_LEGACY
#include "bli_family_amd64_legacy.h" // skipped
#endif
#ifdef BLIS_FAMILY_X86_64
#include "bli_family_x86_64.h" // skipped
#endif

#ifdef BLIS_FAMILY_X86_64_NO_SKX
// begin bli_family_x86_64_no_skx.h


//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H


//#endif

// end bli_family_x86_64_no_skx.h
#endif
#ifdef BLIS_FAMILY_X86_64_NO_ZEN2
#include "bli_family_x86_64_no_zen2.h" // skipped
#endif
#ifdef BLIS_FAMILY_X86_64_NO_ZEN3
#include "bli_family_x86_64_no_zen3.h" // skipped
#endif

// -- Intel64 architectures --
#ifdef BLIS_FAMILY_SKX
#include "bli_family_skx.h" // skipped
#endif
#ifdef BLIS_FAMILY_KNL
#include "bli_family_knl.h" // skipped
#endif
#ifdef BLIS_FAMILY_KNC
#include "bli_family_knc.h" // skipped
#endif
#ifdef BLIS_FAMILY_HASWELL
// begin bli_family_haswell.h


//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H

// Everything from here to the matching #endif is compiled out by this
// "#if 0"; the nested "#if 0" / "#if 1" selections therefore have no effect.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS ---------------------------

// -- sgemm micro-kernel --

#if 0
#define BLIS_SGEMM_UKERNEL         bli_sgemm_asm_4x24
#define BLIS_DEFAULT_MC_S          256
#define BLIS_DEFAULT_KC_S          256
#define BLIS_DEFAULT_NC_S          4080
#define BLIS_DEFAULT_MR_S          4
#define BLIS_DEFAULT_NR_S          24

#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif

#if 1
#define BLIS_SGEMM_UKERNEL         bli_sgemm_asm_6x16
#define BLIS_DEFAULT_MC_S          144
#define BLIS_DEFAULT_KC_S          256
#define BLIS_DEFAULT_NC_S          4080
#define BLIS_DEFAULT_MR_S          6
#define BLIS_DEFAULT_NR_S          16

#define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif

#if 0
#define BLIS_SGEMM_UKERNEL         bli_sgemm_asm_16x6
#define BLIS_DEFAULT_MC_S          144
#define BLIS_DEFAULT_KC_S          256
#define BLIS_DEFAULT_NC_S          4080
#define BLIS_DEFAULT_MR_S          16
#define BLIS_DEFAULT_NR_S          6
#endif

// -- dgemm micro-kernel --

#if 0
#define BLIS_DGEMM_UKERNEL         bli_dgemm_asm_4x12
#define BLIS_DEFAULT_MC_D          152
#define BLIS_DEFAULT_KC_D          160
#define BLIS_DEFAULT_NC_D          4080
#define BLIS_DEFAULT_MR_D          4
#define BLIS_DEFAULT_NR_D          12

#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif

#if 1
#define BLIS_DGEMM_UKERNEL         bli_dgemm_asm_6x8
#define BLIS_DEFAULT_MC_D          72
#define BLIS_DEFAULT_KC_D          256
#define BLIS_DEFAULT_NC_D          4080
#define BLIS_DEFAULT_MR_D          6
#define BLIS_DEFAULT_NR_D          8

#define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif

#if 0
#define BLIS_DGEMM_UKERNEL         bli_dgemm_asm_8x6
#define BLIS_DEFAULT_MC_D          72
#define BLIS_DEFAULT_KC_D          256
#define BLIS_DEFAULT_NC_D          4080
#define BLIS_DEFAULT_MR_D          8
#define BLIS_DEFAULT_NR_D          6
#endif

// -- cgemm micro-kernel --

#if 1
#define BLIS_CGEMM_UKERNEL         bli_cgemm_asm_3x8
#define BLIS_DEFAULT_MC_C          144
#define BLIS_DEFAULT_KC_C          256
#define BLIS_DEFAULT_NC_C          4080
#define BLIS_DEFAULT_MR_C          3
#define BLIS_DEFAULT_NR_C          8

#define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif

#if 0
#define BLIS_CGEMM_UKERNEL         bli_cgemm_asm_8x3
#define BLIS_DEFAULT_MC_C          144
#define BLIS_DEFAULT_KC_C          256
#define BLIS_DEFAULT_NC_C          4080
#define BLIS_DEFAULT_MR_C          8
#define BLIS_DEFAULT_NR_C          3
#endif

// -- zgemm micro-kernel --

#if 1
#define BLIS_ZGEMM_UKERNEL         bli_zgemm_asm_3x4
#define BLIS_DEFAULT_MC_Z          72
#define BLIS_DEFAULT_KC_Z          256
#define BLIS_DEFAULT_NC_Z          4080
#define BLIS_DEFAULT_MR_Z          3
#define BLIS_DEFAULT_NR_Z          4

#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif

#if 0
#define BLIS_ZGEMM_UKERNEL         bli_zgemm_asm_4x3
#define BLIS_DEFAULT_MC_Z          72
#define BLIS_DEFAULT_KC_Z          256
#define BLIS_DEFAULT_NC_Z          4080
#define BLIS_DEFAULT_MR_Z          4
#define BLIS_DEFAULT_NR_Z          3
#endif

#endif


//#endif

// end bli_family_haswell.h
#endif
#ifdef BLIS_FAMILY_SANDYBRIDGE
// begin bli_family_sandybridge.h


//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x4 #define BLIS_DEFAULT_MC_D 96 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 4 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_sandybridge.h #endif #ifdef BLIS_FAMILY_PENRYN // begin bli_family_penryn.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x4 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MC_S 768 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x4 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 4 #define BLIS_DEFAULT_MC_D 384 #define BLIS_DEFAULT_KC_D 384 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_asm_4x4 #define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_asm_4x4 // -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPY2V_KERNEL bli_daxpy2v_int_var1 #define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_int_var1 #define BLIS_DAXPYF_KERNEL bli_daxpyf_int_var1 #define BLIS_DDOTXF_KERNEL 
bli_ddotxf_int_var1 #define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_int_var1 // -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 #define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 #endif //#endif // end bli_family_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_FAMILY_ZEN3 #include "bli_family_zen3.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN2 #include "bli_family_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN #include "bli_family_zen.h" // skipped #endif #ifdef BLIS_FAMILY_EXCAVATOR // begin bli_family_excavator.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 3 #define BLIS_DEFAULT_MC_S 528 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_DEFAULT_MC_D 264 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_DEFAULT_MC_C 264 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #define BLIS_DEFAULT_MC_Z 100 #define BLIS_DEFAULT_KC_Z 320 #define BLIS_DEFAULT_NC_Z 8400 #endif //#endif // end bli_family_excavator.h #endif #ifdef BLIS_FAMILY_STEAMROLLER // begin bli_family_steamroller.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 //#endif // end bli_family_steamroller.h #endif #ifdef BLIS_FAMILY_PILEDRIVER // begin 
bli_family_piledriver.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MC_S 2016 #define BLIS_DEFAULT_KC_S 128 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 3 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MC_D 1008 #define BLIS_DEFAULT_KC_D 128 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MC_C 512 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MC_Z 400 #define BLIS_DEFAULT_KC_Z 160 #define BLIS_DEFAULT_NC_Z 8400 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #endif //#endif // end bli_family_piledriver.h #endif #ifdef BLIS_FAMILY_BULLDOZER // begin bli_family_bulldozer.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8_fma4 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x6_fma4 #define BLIS_DEFAULT_MC_D 1080 #define BLIS_DEFAULT_KC_D 120 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 6 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4_fma4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4_fma4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 
#define BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_bulldozer.h #endif // -- ARM families -- #ifdef BLIS_FAMILY_ARM64 #include "bli_family_arm64.h" // skipped #endif #ifdef BLIS_FAMILY_ARM32 #include "bli_family_arm32.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_FAMILY_ARMSVE #include "bli_family_armsve.h" // skipped #endif #ifdef BLIS_FAMILY_A64FX #include "bli_family_a64fx.h" // skipped #endif #ifdef BLIS_FAMILY_FIRESTORM #include "bli_family_firestorm.h" // skipped #endif #ifdef BLIS_FAMILY_THUNDERX2 #include "bli_family_thunderx2.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA57 #include "bli_family_cortexa57.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA53 #include "bli_family_cortexa53.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA15 #include "bli_family_cortexa15.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA9 #include "bli_family_cortexa9.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_FAMILY_POWER10 #include "bli_family_power10.h" // skipped #endif #ifdef BLIS_FAMILY_POWER9 #include "bli_family_power9.h" // skipped #endif #ifdef BLIS_FAMILY_POWER7 #include "bli_family_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_FAMILY_BGQ #include "bli_family_bgq.h" // skipped #endif // -- Generic -- #ifdef BLIS_FAMILY_GENERIC // begin bli_family_generic.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_generic.h #endif // // -- kernel set prototypes ---------------------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_KERNELS_SKX #include "bli_kernels_skx.h" // skipped #endif #ifdef BLIS_KERNELS_KNL #include "bli_kernels_knl.h" // skipped #endif #ifdef BLIS_KERNELS_KNC #include "bli_kernels_knc.h" // skipped #endif #ifdef BLIS_KERNELS_HASWELL // begin bli_kernels_haswell.h // -- level-1m ----------------------------------------------------------------- // packm (asm) PACKM_KER_PROT( float, s, packm_haswell_asm_6xk 
) PACKM_KER_PROT( float, s, packm_haswell_asm_16xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_6xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_8xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_3xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_8xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_3xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_4xk ) // -- level-3 ------------------------------------------------------------------ // gemm (asm d6x8) GEMM_UKR_PROT( float, s, gemm_haswell_asm_6x16 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_6x8 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_3x8 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_3x4 ) // gemm (asm d8x6) GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // gemmtrsm_l (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_haswell_asm_6x8 ) // gemmtrsm_u (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 ) // gemm (asm d8x6) //GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) //GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) //GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) //GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // -- level-3 sup -------------------------------------------------------------- // -- single real -- // gemmsup_r GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16 ) GEMMSUP_KER_PROT( 
float, s, gemmsup_rv_haswell_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, 
gemmsup_rd_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16n ) // -- double real -- // gemmsup_r GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, 
gemmsup_rv_haswell_asm_5x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8n ) // gemmsup_rd GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) 
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8n ) // end bli_kernels_haswell.h #endif #ifdef BLIS_KERNELS_SANDYBRIDGE // begin bli_kernels_sandybridge.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_sandybridge_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_asm_4x4 ) // d8x4 (intrinsics) GEMM_UKR_PROT( float, s, gemm_sandybridge_int_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_int_4x4 ) // end bli_kernels_sandybridge.h #endif #ifdef BLIS_KERNELS_PENRYN // begin bli_kernels_penryn.h GEMM_UKR_PROT( float, s, gemm_penryn_asm_8x4 ) GEMM_UKR_PROT( double, d, gemm_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_l_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_u_penryn_asm_4x4 ) // end bli_kernels_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_KERNELS_ZEN2 #include "bli_kernels_zen2.h" // skipped #endif #ifdef BLIS_KERNELS_ZEN // begin bli_kernels_zen.h // -- level-1m -- PACKM_KER_PROT(double, d, packm_8xk_gen_zen) PACKM_KER_PROT(double, d, packm_6xk_gen_zen) PACKM_KER_PROT(double, d, packm_8xk_nn_zen) PACKM_KER_PROT(double, d, packm_6xk_nn_zen) // -- level-1v -- // amaxv (intrinsics) AMAXV_KER_PROT( float, s, amaxv_zen_int ) AMAXV_KER_PROT( double, d, amaxv_zen_int ) // axpyv (intrinsics) AXPYV_KER_PROT( float, s, axpyv_zen_int ) AXPYV_KER_PROT( double, d, axpyv_zen_int ) // axpyv (intrinsics unrolled x10) AXPYV_KER_PROT( float, s, axpyv_zen_int10 ) AXPYV_KER_PROT( double, d, axpyv_zen_int10 ) // dotv (intrinsics) DOTV_KER_PROT( float, 
s, dotv_zen_int ) DOTV_KER_PROT( double, d, dotv_zen_int ) // dotv (intrinsics, unrolled x10) DOTV_KER_PROT( float, s, dotv_zen_int10 ) DOTV_KER_PROT( double, d, dotv_zen_int10 ) // dotxv (intrinsics) DOTXV_KER_PROT( float, s, dotxv_zen_int ) DOTXV_KER_PROT( double, d, dotxv_zen_int ) // scalv (intrinsics) SCALV_KER_PROT( float, s, scalv_zen_int ) SCALV_KER_PROT( double, d, scalv_zen_int ) // scalv (intrinsics unrolled x10) SCALV_KER_PROT( float, s, scalv_zen_int10 ) SCALV_KER_PROT( double, d, scalv_zen_int10 ) SCALV_KER_PROT( scomplex, c, scalv_zen_int10 ) // swapv (intrinsics) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // copyv (intrinsics) COPYV_KER_PROT( float, s, copyv_zen_int ) COPYV_KER_PROT( double, d, copyv_zen_int ) // SETV_KER_PROT(float, s, setv_zen_int) SETV_KER_PROT(double, d, setv_zen_int) // swapv (intrinsics) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // -- level-1f -- // axpyf (intrinsics) AXPYF_KER_PROT( float, s, axpyf_zen_int_8 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_8 ) AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_16x4 ) AXPYF_KER_PROT( scomplex, c, axpyf_zen_int_4 ) // dotxf (intrinsics) DOTXF_KER_PROT( float, s, dotxf_zen_int_8 ) DOTXF_KER_PROT( double, d, dotxf_zen_int_8 ) // -- level-3 sup -------------------------------------------------------------- // semmsup_rv //GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x8 ) GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_zen_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_4x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_1x1 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4) GEMMSUP_KER_PROT( float, s, 
gemmsup_rd_zen_asm_2x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x8m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16n) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x2 ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4n ) 
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x1 ) // end bli_kernels_zen.h #endif //#ifdef BLIS_KERNELS_EXCAVATOR //#include "bli_kernels_excavator.h" //#endif //#ifdef BLIS_KERNELS_STEAMROLLER //#include "bli_kernels_steamroller.h" //#endif #ifdef BLIS_KERNELS_PILEDRIVER // begin bli_kernels_piledriver.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_piledriver_asm_16x3 ) GEMM_UKR_PROT( double, d, gemm_piledriver_asm_8x3 ) GEMM_UKR_PROT( scomplex, c, gemm_piledriver_asm_4x2 ) GEMM_UKR_PROT( dcomplex, z, gemm_piledriver_asm_2x2 ) // end bli_kernels_piledriver.h #endif #ifdef BLIS_KERNELS_BULLDOZER // begin bli_kernels_bulldozer.h GEMM_UKR_PROT( float, s, gemm_bulldozer_asm_8x8_fma4 ) GEMM_UKR_PROT( double, d, gemm_bulldozer_asm_4x6_fma4 ) GEMM_UKR_PROT( scomplex, c, gemm_bulldozer_asm_8x4_fma4 ) GEMM_UKR_PROT( dcomplex, z, gemm_bulldozer_asm_4x4_fma4 ) // end bli_kernels_bulldozer.h #endif // -- ARM architectures -- #ifdef BLIS_KERNELS_ARMSVE #include "bli_kernels_armsve.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV8A #include "bli_kernels_armv8a.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV7A #include "bli_kernels_armv7a.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_KERNELS_POWER10 #include "bli_kernels_power10.h" // skipped #endif #ifdef BLIS_KERNELS_POWER9 #include "bli_kernels_power9.h" // skipped #endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_KERNELS_BGQ #include "bli_kernels_bgq.h" // skipped #endif #endif // end bli_arch_config.h // begin bli_kernel_macro_defs.h #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H // -- Define default threading parameters -------------------------------------- // -- Conventional (large code path) values -- // These BLIS_THREAD_RATIO_? 
// macros distort the amount of work in the m and n
// dimensions for the purposes of factorizing the total number of threads into
// ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these
// macros are used.
// Each default below is wrapped in #ifndef so that a value defined earlier
// (e.g. by a configuration-specific header) takes precedence.
#ifndef BLIS_THREAD_RATIO_M
#define BLIS_THREAD_RATIO_M     1
#endif

#ifndef BLIS_THREAD_RATIO_N
#define BLIS_THREAD_RATIO_N     1
#endif

// These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of
// parallelism allowed when performing automatic factorization. See bli_rntm.c
// to see how these macros are used.
#ifndef BLIS_THREAD_MAX_IR
#define BLIS_THREAD_MAX_IR      1
#endif

#ifndef BLIS_THREAD_MAX_JR
#define BLIS_THREAD_MAX_JR      4
#endif

// NOTE: The sup threading defaults below are compiled out by the enclosing
// "#if 0", so none of the BLIS_THREAD_SUP_* macros are defined here.
#if 0
// -- Skinny/small possibly-unpacked (sup code path) values --

#ifndef BLIS_THREAD_SUP_RATIO_M
#define BLIS_THREAD_SUP_RATIO_M   1
#endif

#ifndef BLIS_THREAD_SUP_RATIO_N
#define BLIS_THREAD_SUP_RATIO_N   2
#endif

#ifndef BLIS_THREAD_SUP_MAX_IR
#define BLIS_THREAD_SUP_MAX_IR    1
#endif

#ifndef BLIS_THREAD_SUP_MAX_JR
#define BLIS_THREAD_SUP_MAX_JR    8
#endif
#endif


// -- Memory allocation --------------------------------------------------------

// hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with
// libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND
// was explicitly defined.
#ifdef BLIS_DISABLE_MEMKIND
  #undef BLIS_ENABLE_MEMKIND
#endif
// The header name is required here: a bare "#include" is ill-formed and
// would break every build that defines BLIS_ENABLE_MEMKIND.
#ifdef BLIS_ENABLE_MEMKIND
#include <hbwmalloc.h> // skipped
#endif

// Memory allocation functions. These macros define the three types of
// malloc()-style functions, and their free() counterparts: one for each
// type of memory to be allocated.
// NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING
// THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc()
// and free():
//
//   void* malloc( size_t size );
//   void  free( void* p );
//

// This allocation function is called to allocate memory for blocks within
// BLIS's internal memory pools.
#ifndef BLIS_MALLOC_POOL
  // If use of libmemkind was enabled at configure-time, the default
  // memory allocation function for memory pools should be hbw_malloc()
  // instead of malloc().
  #ifdef  BLIS_ENABLE_MEMKIND
  #define BLIS_MALLOC_POOL               hbw_malloc
  #else
  #define BLIS_MALLOC_POOL               malloc
  #endif
#endif

#ifndef BLIS_FREE_POOL
  // If use of libmemkind was enabled at configure-time, the default
  // memory deallocation function for memory pools should be hbw_free()
  // instead of free().
  #ifdef  BLIS_ENABLE_MEMKIND
  #define BLIS_FREE_POOL                 hbw_free
  #else
  #define BLIS_FREE_POOL                 free
  #endif
#endif

// This allocation function is called to allocate memory for internally-
// used objects and structures, such as control tree nodes.
#ifndef BLIS_MALLOC_INTL
#define BLIS_MALLOC_INTL                 malloc
#endif

#ifndef BLIS_FREE_INTL
#define BLIS_FREE_INTL                   free
#endif

// This allocation function is called to allocate memory for objects
// created by user-level API functions, such as bli_obj_create().
#ifndef BLIS_MALLOC_USER
#define BLIS_MALLOC_USER                 malloc
#endif

#ifndef BLIS_FREE_USER
#define BLIS_FREE_USER                   free
#endif


// -- Other system-related definitions -----------------------------------------

// Size of a virtual memory page. This is used to align blocks within the
// memory pools.
#ifndef BLIS_PAGE_SIZE
#define BLIS_PAGE_SIZE                   4096
#endif

// The maximum number of named SIMD vector registers available for use.
// When configuring with umbrella configuration families, this should be
// set to the maximum number of registers across all sub-configurations in
// the family.
#ifndef BLIS_SIMD_MAX_NUM_REGISTERS
#define BLIS_SIMD_MAX_NUM_REGISTERS      32
#endif

// The maximum size (in bytes) of each SIMD vector.
// When configuring with umbrella configuration families, this should be
// set to the maximum SIMD size across all sub-configurations in the family.
#ifndef BLIS_SIMD_MAX_SIZE
#define BLIS_SIMD_MAX_SIZE               64
#endif

// Alignment size (in bytes) needed by the instruction set for aligned
// SIMD/vector instructions.
// NOTE: This is only a fallback; a value defined earlier (some family
// headers define it as 16) is left untouched by the #ifndef guard.
#ifndef BLIS_SIMD_ALIGN_SIZE
#define BLIS_SIMD_ALIGN_SIZE             BLIS_SIMD_MAX_SIZE
#endif

// The maximum size in bytes of local stack buffers within macro-kernel
// functions. These buffers are usually used to store a temporary copy
// of a single microtile. The reason we multiply by 2 is to handle induced
// methods, where we use real domain register blocksizes in units of
// complex elements. Specifically, the macro-kernels will need this larger
// micro-tile footprint, even though the virtual micro-kernels will only
// ever be writing to half (real or imaginary part) at a time.
// NOTE: The backslash must be the last character on its line for the
// definition to continue onto the next one.
#ifndef BLIS_STACK_BUF_MAX_SIZE
#define BLIS_STACK_BUF_MAX_SIZE          ( BLIS_SIMD_MAX_NUM_REGISTERS * \
                                           BLIS_SIMD_MAX_SIZE * 2 )
#endif

// Alignment size used to align local stack buffers within macro-kernel
// functions.
#ifndef BLIS_STACK_BUF_ALIGN_SIZE
#define BLIS_STACK_BUF_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when allocating memory via BLIS_MALLOC_USER.
// To disable heap alignment, set this to 1.
#ifndef BLIS_HEAP_ADDR_ALIGN_SIZE
#define BLIS_HEAP_ADDR_ALIGN_SIZE        BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when sizing leading dimensions of memory allocated
// via BLIS_MALLOC_USER.
#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE
#define BLIS_HEAP_STRIDE_ALIGN_SIZE      BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment sizes used when allocating blocks to the internal memory
// pool, via BLIS_MALLOC_POOL.
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A #define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B #define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C #define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE #endif #ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN #define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE #endif // Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*. #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A #define BLIS_POOL_ADDR_OFFSET_SIZE_A 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B #define BLIS_POOL_ADDR_OFFSET_SIZE_B 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C #define BLIS_POOL_ADDR_OFFSET_SIZE_C 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif #endif // end bli_kernel_macro_defs.h // -- Base operation prototypes -- // begin bli_init.h BLIS_EXPORT_BLIS void bli_init( void ); BLIS_EXPORT_BLIS void bli_finalize( void ); void bli_init_auto( void ); void bli_finalize_auto( void ); void bli_init_apis( void ); void bli_finalize_apis( void ); void bli_init_once( void ); void bli_finalize_once( void ); // end bli_init.h // begin bli_malloc.h // Typedef function pointer types for malloc() and free() substitutes. 
//typedef void* (*malloc_ft) ( size_t size );
//typedef void  (*free_ft)   ( void*  p    );
// NOTE(review): the typedefs above are commented out here, yet malloc_ft and
// free_ft are used by the prototypes below; presumably they are defined
// elsewhere in the header -- confirm.

// -----------------------------------------------------------------------------

// Disabled: pool allocation wrappers are compiled out.
#if 0
BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size );
BLIS_EXPORT_BLIS void  bli_free_pool( void* p );
#endif

// Allocation routines for internally-used objects. The err_t* r_val
// parameter is an output argument for a status code.
void* bli_malloc_intl( size_t size, err_t* r_val );
void* bli_calloc_intl( size_t size, err_t* r_val );
void  bli_free_intl( void* p );

// Allocation routines for user-level objects (exported).
BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val );
BLIS_EXPORT_BLIS void  bli_free_user( void* p );

// -----------------------------------------------------------------------------

// Generic allocation helpers parameterized by the malloc()/free() substitute
// to call, with and without an explicit alignment.
void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val );
void  bli_ffree_align( free_ft f, void* p );

void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val );
void  bli_ffree_noalign( free_ft f, void* p );

void  bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size );
void  bli_fmalloc_post_check( void* p );
// end bli_malloc.h
// begin bli_const.h

void bli_const_init( void );
void bli_const_finalize( void );
// end bli_const.h
// begin bli_obj.h

// begin bli_obj_check.h

// Parameter-checking companions of the bli_obj_*() / bli_dt_*() functions
// declared further below; each takes the same arguments as the function it
// is named after.
void bli_obj_create_check( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj );
void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj );
void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj );
void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj );
void bli_obj_create_scalar_check( num_t dt, obj_t* obj );
void bli_obj_free_check( obj_t* obj );
void bli_obj_create_const_check( double value, obj_t* obj );
void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b );
void bli_dt_size_check( num_t dt );
void bli_dt_string_check( num_t dt );
void bli_dt_union_check( num_t dt1, num_t dt2 );
void bli_obj_print_check( char* label, obj_t* obj );
// end bli_obj_check.h

// Object creation. NOTE(review): rs/cs are presumably the row and column
// strides and m/n the matrix dimensions -- confirm against the definitions.
BLIS_EXPORT_BLIS void bli_obj_create
     (
       num_t  dt,
       dim_t  m,
       dim_t  n,
       inc_t  rs,
       inc_t  cs,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer
     (
       num_t  dt,
       dim_t  m,
       dim_t  n,
       void*  p,
       inc_t  rs,
       inc_t  cs,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_without_buffer
     (
       num_t  dt,
       dim_t  m,
       dim_t  n,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_alloc_buffer
     (
       inc_t  rs,
       inc_t  cs,
       inc_t  is,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_attach_buffer
     (
       void*  p,
       inc_t  rs,
       inc_t  cs,
       inc_t  is,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_1x1
     (
       num_t  dt,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer
     (
       num_t  dt,
       void*  p,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_conf_to
     (
       obj_t* s,
       obj_t* d
     );

BLIS_EXPORT_BLIS void bli_obj_free
     (
       obj_t* obj
     );

// Not exported. rs, cs and is are passed by pointer, so they may be
// modified by the callee.
void bli_adjust_strides
     (
       dim_t  m,
       dim_t  n,
       siz_t  elem_size,
       inc_t* rs,
       inc_t* cs,
       inc_t* is
     );

BLIS_EXPORT_BLIS siz_t bli_dt_size
     (
       num_t dt
     );

BLIS_EXPORT_BLIS char* bli_dt_string
     (
       num_t dt
     );

BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult
     (
       dim_t dim,
       dim_t dim_mult
     );

BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size
     (
       dim_t dim,
       siz_t elem_size,
       siz_t align_size
     );

// NOTE(review): takes a pointer but returns a dim_t; the meaning of the
// returned value is not visible from this prototype -- confirm against the
// definition.
BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size
     (
       void*  p,
       size_t align_size
     );

BLIS_EXPORT_BLIS void bli_obj_print
     (
       char*  label,
       obj_t* obj
     );
// end bli_obj.h
// begin bli_obj_scalar.h

// Operations on the scalar associated with an object (exported API).
BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached
     (
       num_t  dt,
       obj_t* beta
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of
     (
       num_t  dt,
       conj_t conj,
       obj_t* alpha,
       obj_t* beta
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_detach
     (
       obj_t* a,
       obj_t* alpha
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_attach
     (
       conj_t conj,
       obj_t* alpha,
       obj_t* a
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to
     (
       num_t  dt,
       obj_t* a
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar
     (
       obj_t* alpha,
       obj_t* a
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_reset
     (
       obj_t* a
     );

BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag
     (
       obj_t* a
     );

BLIS_EXPORT_BLIS bool bli_obj_scalar_equals
     (
       obj_t* a,
       obj_t* beta
     );
// end bli_obj_scalar.h
// begin bli_blksz.h

//
blksz_t query BLIS_INLINE dim_t bli_blksz_get_def ( num_t dt, blksz_t* b ) { return b->v[ dt ]; } BLIS_INLINE dim_t bli_blksz_get_max ( num_t dt, blksz_t* b ) { return b->e[ dt ]; } // blksz_t modification BLIS_INLINE void bli_blksz_set_def ( dim_t val, num_t dt, blksz_t* b ) { b->v[ dt ] = val; } BLIS_INLINE void bli_blksz_set_max ( dim_t val, num_t dt, blksz_t* b ) { b->e[ dt ] = val; } BLIS_INLINE void bli_blksz_copy ( blksz_t* b_src, blksz_t* b_dst ) { *b_dst = *b_src; } BLIS_INLINE void bli_blksz_copy_if_pos ( blksz_t* b_src, blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that // we can skip the ones that are non-positive. const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); } BLIS_INLINE void bli_blksz_copy_def_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); bli_blksz_set_def( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_max_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); 
bli_blksz_set_max( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_scale_def ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_def( dt, b ); bli_blksz_set_def( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_max( dt, b ); bli_blksz_set_max( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_def_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { bli_blksz_scale_def( num, den, dt, b ); bli_blksz_scale_max( num, den, dt, b ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS blksz_t* bli_blksz_create ( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_easy ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z ); BLIS_EXPORT_BLIS void bli_blksz_free ( blksz_t* b ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); #endif void bli_blksz_reduce_def_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); void bli_blksz_reduce_max_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); // 
----------------------------------------------------------------------------- dim_t bli_determine_blocksize ( dir_t direct, dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_b ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); dim_t bli_determine_blocksize_b_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); // end bli_blksz.h // begin bli_func.h // ----------------------------------------------------------------------------- // func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); // end bli_func.h // begin bli_mbool.h // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // 
// -----------------------------------------------------------------------------

mbool_t* bli_mbool_create
     (
       bool b_s,
       bool b_d,
       bool b_c,
       bool b_z
     );

void bli_mbool_init
     (
       mbool_t* b,
       bool     b_s,
       bool     b_d,
       bool     b_c,
       bool     b_z
     );

void bli_mbool_free( mbool_t* b );
// end bli_mbool.h
// begin bli_cntx.h

#ifndef BLIS_CNTX_H
#define BLIS_CNTX_H

// Context object type (defined in bli_type_defs.h)

// -----------------------------------------------------------------------------

//
// -- cntx_t query (fields only) -----------------------------------------------
//

// Each accessor below returns the named cntx_t field unchanged. The *_buf
// accessors return the field's pointer itself, i.e. the base of whatever
// array the context stores there; no indexing or copying takes place.

// Base pointer of the blksz_t array.
BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx )
{
	return cntx->blkszs;
}
// Base pointer of the blocksize-multiple id array.
BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx )
{
	return cntx->bmults;
}
// Base pointer of the level-3 virtual micro-kernel func_t array.
BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx )
{
	return cntx->l3_vir_ukrs;
}
// Base pointer of the level-3 native micro-kernel func_t array.
BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx )
{
	return cntx->l3_nat_ukrs;
}
// Base pointer of the level-3 native micro-kernel preference array.
BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx )
{
	return cntx->l3_nat_ukrs_prefs;
}
// Base pointer of the level-3 sup threshold array.
BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx )
{
	return cntx->l3_sup_thresh;
}
// Base pointer of the level-3 sup handler array (untyped pointers).
BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx )
{
	return cntx->l3_sup_handlers;
}
// Base pointer of the level-3 sup blocksize array.
BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx )
{
	return cntx->l3_sup_blkszs;
}
// Base pointer of the level-3 sup kernel func_t array.
BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx )
{
	return cntx->l3_sup_kers;
}
// Base pointer of the level-3 sup kernel preference array.
BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx )
{
	return cntx->l3_sup_kers_prefs;
}
// Base pointer of the level-1f kernel func_t array.
BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx )
{
	return cntx->l1f_kers;
}
// Base pointer of the level-1v kernel func_t array.
BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx )
{
	return cntx->l1v_kers;
}
// Base pointer of the packm kernel func_t array.
BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx )
{
	return cntx->packm_kers;
}
// Base pointer of the unpackm kernel func_t array.
BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx )
{
	return cntx->unpackm_kers;
}
// The ind_t value stored in the context's method field.
BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx )
{
	return cntx->method;
}

//
----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* 
bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* 
cntx ) { func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested packm func_t if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* funcs = bli_cntx_packm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the packm func_t (and then extract the // datatype-specific function pointer) if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested unpackm func_t if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the unpackm func_t (and then extract the // datatype-specific function pointer) if the unpackm kernel being // requested is one that is explicitly supported. 
if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. 
// NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. // NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. 
return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } #if 0 // NOTE: These static functions aren't needed yet. BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { const num_t dt = bli_obj_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); } #endif // ----------------------------------------------------------------------------- // // -- cntx_t modification (complex) -------------------------------------------- // // NOTE: The framework does not use any of the following functions. We provide // them in order to facilitate creating/modifying custom contexts. 
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif // end bli_rntm.h // begin bli_gks.h #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* 
bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif // end bli_gks.h // begin bli_ind.h #ifndef BLIS_IND_H #define BLIS_IND_H // level-3 induced method management // begin bli_l3_ind.h #ifndef BLIS_L3_IND_H #define BLIS_L3_IND_H // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- //bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ); void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ); void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ); bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ); #endif // end bli_l3_ind.h void bli_ind_init( void ); void bli_ind_finalize( void ); BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t 
method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); char* bli_ind_get_impl_string( ind_t method ); num_t bli_ind_map_cdt_to_index( num_t dt ); #endif // end bli_ind.h // begin bli_pba.h #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H // Packing block allocator (formerly memory broker) // pba init //BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ // bli_pthread_mutex_init( &(pba->mutex), NULL ); //} //BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ // bli_pthread_mutex_destroy( &(pba->mutex) ); //} // pba query BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { return &(pba->pools[ pool_index ]); } BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { return pba->align_size; } BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { return pba->malloc_fp; } BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { return pba->free_fp; } // pba modification BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { pba->align_size = align_size; } BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { pba->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { pba->free_fp = free_fp; } // pba action BLIS_INLINE void bli_pba_lock( pba_t* pba ) { bli_pthread_mutex_lock( &(pba->mutex) ); } BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( cntx_t* cntx ); void bli_pba_finalize ( void ); 
void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif // end bli_pba.h // begin bli_pool.h #ifndef BLIS_POOL_H #define BLIS_POOL_H // -- Pool block type -- // -- Pool type -- // Pool block query BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) { return pblk->buf; } BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) { return pblk->block_size; } // Pool block modification BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk ) { pblk->buf = buf; } BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) { pblk->block_size = block_size; } // // -- pool block initialization ------------------------------------------------ // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the pblk_t type definition. An alternative to the initializer is // calling bli_pblk_clear() at runtime. 
#define BLIS_PBLK_INITIALIZER \ { \ .buf = NULL, \ .block_size = 0, \ } \ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); bli_pblk_set_block_size( 0, pblk ); } // Pool entry query BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) { return pool->block_size; } BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) { return pool->align_size; } BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) { return pool->offset_size; } BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) { return pool->malloc_fp; } BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) { return pool->free_fp; } BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); } // Pool entry modification BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ { pool->block_size = block_size; } BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ { pool->align_size = align_size; } BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ { pool->offset_size = offset_size; } BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ { pool->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* 
pool ) \ { pool->free_fp = free_fp; } BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } // ----------------------------------------------------------------------------- void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ); void bli_pool_finalize ( pool_t* restrict pool ); void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ); void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ); void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ); void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ); void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ); void bli_pool_print ( pool_t* restrict pool ); void bli_pblk_print ( pblk_t* restrict pblk ); #endif // end bli_pool.h // begin bli_array.h #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // 
----------------------------------------------------------------------------- void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ); void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ); void bli_array_finalize ( array_t* restrict array ); void* bli_array_elem ( const siz_t index, array_t* restrict array ); void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ); #endif // end bli_array.h // begin bli_apool.h #ifndef BLIS_APOOL_H #define BLIS_APOOL_H // -- Locked pool-of-arrays type -- // apool entry query BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool ) { return &(apool->pool); } BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) { return &(apool->mutex); } BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) { return pool->def_array_len; } BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) { pool_t* restrict pool = bli_apool_pool( apool ); return bli_pool_is_exhausted( pool ); } // apool action BLIS_INLINE void bli_apool_lock( apool_t* apool ) { bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); } BLIS_INLINE void bli_apool_unlock( apool_t* apool ) { bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); } // apool entry modification BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ { pool->def_array_len = def_array_len; } // ----------------------------------------------------------------------------- void bli_apool_init ( apool_t* restrict apool ); void bli_apool_finalize ( apool_t* restrict apool ); array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ); void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ); pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ); void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool ); void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ); void bli_apool_free_block 
( array_t* restrict array ); #endif // end bli_apool.h // begin bli_sba.h #ifndef BLIS_SBA_H #define BLIS_SBA_H apool_t* bli_sba_query( void ); // ----------------------------------------------------------------------------- void bli_sba_init( void ); void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( const siz_t n_threads ); void bli_sba_checkin_array ( array_t* restrict array ); void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm ); void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size ); void bli_sba_release ( rntm_t* restrict rntm, void* restrict block ); #endif // end bli_sba.h // begin bli_memsys.h #ifndef BLIS_MEMSYS_H #define BLIS_MEMSYS_H // ----------------------------------------------------------------------------- void bli_memsys_init( void ); void bli_memsys_finalize( void ); #endif // end bli_memsys.h // begin bli_mem.h #ifndef BLIS_MEM_H #define BLIS_MEM_H // mem_t object type (defined in bli_type_defs.h) // // -- mem_t query -------------------------------------------------------------- // BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) { return &(mem->pblk); } BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) { return bli_pblk_buf( bli_mem_pblk( mem ) ); } BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) { return mem->buf_type; } BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) { return mem->pool; } BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) { return mem->size; } BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); } // // -- mem_t modification ------------------------------------------------------- // BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem ) { mem->pblk = *pblk; } BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem ) { bli_pblk_set_buf( buf, &(mem->pblk) ); } BLIS_INLINE void bli_mem_set_buf_type( packbuf_t 
buf_type, mem_t* mem ) { mem->buf_type = buf_type; } BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem ) { mem->pool = pool; } BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; } // // -- mem_t initialization ----------------------------------------------------- // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the mem_t type definition. An alternative to the initializer is // calling bli_mem_clear() at runtime. #define BLIS_MEM_INITIALIZER \ { \ .pblk = BLIS_PBLK_INITIALIZER, \ .buf_type = -1, \ .pool = NULL, \ .size = 0, \ } \ BLIS_INLINE void bli_mem_clear( mem_t* mem ) { bli_mem_set_buffer( NULL, mem ); #ifdef __cplusplus const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE; // When using C++, which is strongly typed, we avoid use of -1 as a // packbuf_t value since it will result in a compile-time error. bli_mem_set_buf_type( pb, mem ); #else bli_mem_set_buf_type( ( packbuf_t )-1, mem ); #endif bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); } #endif // end bli_mem.h // begin bli_part.h // begin bli_part_check.h void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); // end bli_part_check.h // -- Matrix partitioning ------------------------------------------------------ BLIS_EXPORT_BLIS void bli_acquire_mpart ( dim_t i, dim_t j, dim_t m, dim_t n, obj_t* obj, obj_t* sub_obj ); #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) GENPROT( acquire_mpart_b2t ) GENPROT( acquire_mpart_l2r ) GENPROT( acquire_mpart_r2l ) GENPROT( acquire_mpart_tl2br ) GENPROT( 
acquire_mpart_br2tl ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ dir_t direct, \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_mdim ) GENPROT( acquire_mpart_ndim ) GENPROT( acquire_mpart_mndim ) // -- Vector partitioning ------------------------------------------------------ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_vpart_f2b ) GENPROT( acquire_vpart_b2f ) // -- Scalar acquisition ------------------------------------------------------- BLIS_EXPORT_BLIS void bli_acquire_mij ( dim_t i, dim_t j, obj_t* obj, obj_t* sub_obj ); BLIS_EXPORT_BLIS void bli_acquire_vi ( dim_t i, obj_t* obj, obj_t* sub_obj ); // end bli_part.h // begin bli_prune.h void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ); // end bli_prune.h // begin bli_query.h BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a ); // end bli_query.h // begin bli_auxinfo.h #ifndef BLIS_AUXINFO_MACRO_DEFS_H #define BLIS_AUXINFO_MACRO_DEFS_H // auxinfo_t field query BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) { return ai->schema_a; } BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) { return ai->schema_b; } BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) { return ai->a_next; } BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) { return ai->b_next; } BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) { return ai->is_a; } BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) { return ai->is_b; } BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) { return ai->ps_a; } BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) { return ai->ps_b; } BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) { 
return ai->ukr; } BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) { return ai->params; } // auxinfo_t field modification BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai ) { ai->schema_a = schema; } BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) { ai->schema_b = schema; } BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) { ai->a_next = p; } BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) { ai->b_next = p; } BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; } BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai ) { ai->is_a = is; } BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) { ai->is_b = is; } BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai ) { ai->ps_a = ps; } BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) { ai->ps_b = ps; } BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { ai->ukr = ukr; } BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) { ai->params = params; } #endif // end bli_auxinfo.h // begin bli_param_map.h // --- BLIS to BLAS/LAPACK mappings -------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval ); // --- BLAS/LAPACK to BLIS mappings -------------------------------------------- // NOTE: These static functions were converted from regular functions in order // to reduce function call overhead within the BLAS compatibility layer. 
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( 
subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); err_t bli_check_valid_cntl( void* cntl ); err_t bli_check_packm_schema_on_unpack( obj_t* a ); err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); // end bli_check.h // begin bli_error.h BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); void bli_print_msg( char* str, char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); char* bli_error_string_for_code( gint_t code ); // end bli_error.h // begin bli_f2c.h // f2c.h -- Standard Fortran to C header file // barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." // - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) #ifndef BLIS_F2C_H #define BLIS_F2C_H typedef f77_int bla_integer; typedef f77_char bla_character; //typedef char *address; //typedef short int shortint; typedef float bla_real; typedef double bla_double; typedef scomplex bla_scomplex; typedef dcomplex bla_dcomplex; typedef f77_int bla_logical; //typedef short int shortlogical; //typedef char logical1; //typedef char integer1; #ifdef INTEGER_STAR_8 // Adjust for integer*8. 
typedef long long longint; // system-dependent typedef unsigned long long ulongint; // system-dependent #define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) #define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) #endif #ifndef TRUE_ #define TRUE_ (1) #endif #ifndef FALSE_ #define FALSE_ (0) #endif // Extern is for use with -E #ifndef Extern #define Extern extern #endif // I/O stuff #ifdef f2c_i2 // for -i2 //typedef short flag; //typedef short ftnlen; typedef bla_integer ftnlen; //typedef short ftnint; #else //typedef long int flag; //typedef long int ftnlen; typedef bla_integer ftnlen; //typedef long int ftnint; #endif #ifndef VOID #define VOID void #endif #ifndef f2c_abs #define f2c_abs(x) ((x) >= 0 ? (x) : -(x)) #endif #ifndef f2c_dabs #define f2c_dabs(x) (doublereal)f2c_abs(x) #endif #ifndef f2c_min #define f2c_min(a,b) ((a) <= (b) ? (a) : (b)) #endif #ifndef f2c_max #define f2c_max(a,b) ((a) >= (b) ? (a) : (b)) #endif #ifndef f2c_dmin #define f2c_dmin(a,b) (doublereal)f2c_min(a,b) #endif #ifndef f2c_dmax #define f2c_dmax(a,b) (doublereal)f2c_max(a,b) #endif #ifndef bit_test #define bit_test(a,b) ((a) >> (b) & 1) #endif #ifndef bit_clear #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) #endif #ifndef bit_set #define bit_set(a,b) ((a) | ((uinteger)1 << (b))) #endif // undef any lower-case symbols that your C compiler predefines, e.g.: #ifndef Skip_f2c_Undefs #undef cray #undef gcos #undef mc68010 #undef mc68020 #undef mips #undef pdp11 #undef sgi #undef sparc #undef sun #undef sun2 #undef sun3 #undef sun4 #undef u370 #undef u3b #undef u3b2 #undef u3b5 #undef unix #undef vax #endif #endif // end bli_f2c.h // begin bli_machval.h // begin bli_lsame.h bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len ); // end bli_lsame.h // begin bli_slamch.h bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len ); // end bli_slamch.h // begin bli_dlamch.h bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len ); // end 
bli_dlamch.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v ); // // Prototype BLAS-like interfaces. // #undef GENTPROTR #define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \ ( \ machval_t mval, \ void* v \ ); INSERT_GENTPROTR_BASIC0( machval ) // end bli_machval.h // begin bli_getopt.h typedef struct getopt_s { char* optarg; int optind; int opterr; int optopt; } getopt_t; BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state ); BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ); // end bli_getopt.h // begin bli_opid.h BLIS_INLINE bool bli_opid_is_level3( opid_t opid ) { return ( bool ) ( BLIS_GEMM <= opid && opid <= BLIS_TRSM ); } // end bli_opid.h // begin bli_cntl.h // -- Control tree prototypes -- BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, void* params, cntl_t* sub_node ); BLIS_EXPORT_BLIS void bli_cntl_free_node ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_clear_node ( cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_mark_family ( opid_t family, cntl_t* cntl ); // ----------------------------------------------------------------------------- dim_t bli_cntl_calc_num_threads_in ( rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- // cntl_t query (fields only) BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl ) { return cntl->family; } 
// Return the bszid field of the control tree node.
BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl )
{
	return cntl->bszid;
}

// Return the var_func function pointer stored in the node.
BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl )
{
	return cntl->var_func;
}

// Return the node's sub_prenode pointer.
BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl )
{
	return cntl->sub_prenode;
}

// Return the node's sub_node pointer.
BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl )
{
	return cntl->sub_node;
}

// Return the node's opaque params pointer.
BLIS_INLINE void* bli_cntl_params( cntl_t* cntl )
{
	return cntl->params;
}

// Return the size recorded at the start of the node's params structure.
// cntl->params is dereferenced unconditionally, so it must be non-NULL.
BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl )
{
	// The first 64 bits (a single uint64_t, i.e. 8 bytes) of the params
	// structure are expected to always hold the size of that structure.
	return *( ( uint64_t* )(cntl->params) );
}

// Return the ADDRESS of the pack_mem field embedded in the node (not a copy).
BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl )
{
	return &(cntl->pack_mem);
}

// cntl_t query (complex)

// Return whether the cntl pointer itself is NULL.
BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl )
{
	return ( bool )
	       ( cntl == NULL );
}

// Return whether the node has no sub_node. cntl itself must be non-NULL,
// since the field is read through bli_cntl_sub_node().
BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl )
{
	return ( bool )
	       ( bli_cntl_sub_node( cntl ) == NULL );
}

// Return whether the node's bszid is anything other than BLIS_NO_PART.
BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl )
{
	return ( bool )
	       ( bli_cntl_bszid( cntl ) != BLIS_NO_PART );
}

// cntl_t modification
//
// Each setter stores its first argument into the field of *cntl named in the
// function. The cntl_t* being modified is always the LAST parameter.

BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl )
{
	cntl->family = family;
}

BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl )
{
	cntl->bszid = bszid;
}

BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl )
{
	cntl->var_func = var_func;
}

BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl )
{
	cntl->sub_prenode = sub_prenode;
}

BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl )
{
	cntl->sub_node = sub_node;
}

BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl )
{
	cntl->params = params;
}

// Unlike the other setters, this copies the mem_t BY VALUE: *pack_mem is
// struct-assigned into the node's embedded pack_mem field, so pack_mem must
// be non-NULL and later changes to *pack_mem are not seen by the node.
BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl )
{
	cntl->pack_mem = *pack_mem;
}

// end bli_cntl.h
// begin bli_env.h

#ifndef BLIS_ENV_H
#define BLIS_ENV_H

gint_t bli_env_get_var( const char* env, gint_t fallback );
//void bli_env_set_var( const char* env, dim_t value );

#endif

// end bli_env.h
// begin bli_pack.h

#ifndef BLIS_PACK_H
#define BLIS_PACK_H
void bli_pack_init( void ); void bli_pack_finalize( void ); BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a ); BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b ); BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a ); BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b ); void bli_pack_init_rntm_from_env( rntm_t* rntm ); #endif // end bli_pack.h // begin bli_info.h // -- General library information ---------------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_version_str( void ); BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void ); // -- General configuration-related -------------------------------------------- BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void ); 
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void ); // -- Kernel implementation-related -------------------------------------------- // -- Level-3 kernel definitions -- BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ); // -- BLIS implementation query (level-3) -------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt ); BLIS_EXPORT_BLIS 
char* bli_info_get_trmm3_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt ); // end bli_info.h // begin bli_arch.h #ifndef BLIS_ARCH_H #define BLIS_ARCH_H BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void ); void bli_arch_set_id_once( void ); void bli_arch_set_id( void ); BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id ); void bli_arch_set_logging( bool dolog ); bool bli_arch_get_logging( void ); void bli_arch_log( char*, ... ); #endif // end bli_arch.h // begin bli_cpuid.h #if 0 // Used only during standalone testing of ARM support. #define FALSE 0 #define TRUE 1 typedef enum { BLIS_ARCH_CORTEXA57 = 10, BLIS_ARCH_CORTEXA15 = 11, BLIS_ARCH_CORTEXA9 = 12, BLIS_ARCH_GENERIC = 13 } arch_t; typedef uint64_t bool; #define bli_abort abort #endif #ifndef BLIS_CPUID_H #define BLIS_CPUID_H arch_t bli_cpuid_query_id( void ); // Intel bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features ); // AMD bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features ); // ARM bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, 
uint32_t features );
bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );

uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features );

// -----------------------------------------------------------------------------

//
// This section of the file was based off of cpuid.hpp from TBLIS [1].
//
// [1] https://github.com/devinamatthews/tblis
//

// Return true only when EVERY bit set in 'want' is also set in 'have',
// i.e. 'want' is a subset of the 'have' bitmask. A 'want' of 0 therefore
// always yields true.
BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want )
{
	return ( have & want ) == want;
}

// -----------------------------------------------------------------------------

#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)

// cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393
// for more information why this move was made.
//#include "cpuid.h" void get_cpu_name( char *cpu_name ); int vpu_count( void ); enum { VENDOR_INTEL = 0, VENDOR_AMD, VENDOR_UNKNOWN }; enum { FEATURE_SSE3 = 0x0001, FEATURE_SSSE3 = 0x0002, FEATURE_SSE41 = 0x0004, FEATURE_SSE42 = 0x0008, FEATURE_AVX = 0x0010, FEATURE_AVX2 = 0x0020, FEATURE_FMA3 = 0x0040, FEATURE_FMA4 = 0x0080, FEATURE_AVX512F = 0x0100, FEATURE_AVX512DQ = 0x0200, FEATURE_AVX512PF = 0x0400, FEATURE_AVX512ER = 0x0800, FEATURE_AVX512CD = 0x1000, FEATURE_AVX512BW = 0x2000, FEATURE_AVX512VL = 0x4000 }; #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ); enum { VENDOR_ARM = 0, VENDOR_UNKNOWN }; enum { MODEL_ARMV7 = 0, MODEL_ARMV8, MODEL_UNKNOWN }; enum { FEATURE_NEON = 0x01, FEATURE_SVE = 0x02 }; #endif #endif // end bli_cpuid.h // begin bli_string.h void bli_string_mkupper( char* s ); // end bli_string.h // begin bli_setgetijm.h BLIS_EXPORT_BLIS err_t bli_setijm ( double ar, double ai, dim_t i, dim_t j, obj_t* b ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs \ ); INSERT_GENTPROT_BASIC0( setijm ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijm ( dim_t i, dim_t j, obj_t* b, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijm ) // end bli_setgetijm.h // begin bli_setgetijv.h BLIS_EXPORT_BLIS err_t bli_setijv ( double ar, double ai, dim_t i, obj_t* x ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ void* restrict x, inc_t incx \ ); INSERT_GENTPROT_BASIC0( 
setijv ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijv ( dim_t i, obj_t* x, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ void* restrict b, inc_t incx, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijv ) // end bli_setgetijv.h // begin bli_setri.h // -- setr --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setrm ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setrv ( obj_t* alpha, obj_t* x ); // -- seti --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setim ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setiv ( obj_t* alpha, obj_t* x ); // end bli_setri.h // begin bli_castm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castm ) INSERT_GENTPROT2_MIXDP0( castm ) // // Prototype object-based _check() function. // void bli_castm_check ( obj_t* a, obj_t* b ); // end bli_castm.h // begin bli_castnzm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castnzm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. 
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( addsc ) INSERT_GENTPROT_BASIC0( divsc ) INSERT_GENTPROT_BASIC0( mulsc ) INSERT_GENTPROT_BASIC0( subsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( invertsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTPROTR_BASIC0( absqsc ) INSERT_GENTPROTR_BASIC0( normfsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( sqrtsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTPROT_BASIC0( getsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( setsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTPROTR_BASIC0( unzipsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype_r* zeta_r, \ ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTPROTR_BASIC0( zipsc ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_igetsc ( dim_t* chi, double* zeta_r, double* zeta_i ); BLIS_EXPORT_BLIS void bli_isetsc ( double zeta_r, double zeta_i, dim_t* chi ); // end bli_l0_tapi.h // begin bli_l0_ft.h // // -- Level-0 function types 
--------------------------------------------------- // // addsc, divsc, subsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( addsc ) INSERT_GENTDEF( divsc ) INSERT_GENTDEF( subsc ) // invertsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTDEF( invertsc ) // mulsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( mulsc ) // absqsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTDEFR( absqsc ) // normfsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* norm \ ); INSERT_GENTDEFR( normfsc ) // sqrtsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( sqrtsc ) // getsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTDEF( getsc ) // setsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTDEF( setsc ) // unzipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTDEFR( unzipsc ) // zipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype_r* zeta_r, \ 
ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTDEFR( zipsc ) // end bli_l0_ft.h // Generate function pointer arrays for tapi functions. // begin bli_l0_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( absqsc ) GENPROT( normfsc ) GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( subsc ) GENPROT( invertsc ) GENPROT( sqrtsc ) GENPROT( unzipsc ) GENPROT( zipsc ) GENPROT( getsc ) GENPROT( setsc ) // end bli_l0_fpa.h // copysc // begin bli_copysc.h // // Prototype object-based interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENFRONT( copysc ) // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \ ( \ conj_t conjchi, \ void* chi, \ void* psi \ ); INSERT_GENTPROT2_BASIC0( copysc ) INSERT_GENTPROT2_MIX_D0( copysc ) INSERT_GENTPROT2_MIX_P0( copysc ) // end bli_copysc.h // end bli_l0.h // -- Level-1v operations -- // begin bli_l1v.h // begin bli_l1v_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h" // begin bli_l1v_ft_ker.h #ifndef BLIS_L1V_FT_KER_H #define BLIS_L1V_FT_KER_H // // -- Level-1v kernel function types ------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict index, \ cntx_t* cntx \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ cntx_t* cntx \ ); 
INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( xpbyv ) #endif // end bli_l1v_ft_ker.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1v_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyv ) // end 
bli_l1v_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1v_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyv ) // end bli_l1v_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed 
APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF

// Expert arguments in function signatures and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h
// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Marker that lets the source code determine which interface (basic or
// expert) is being compiled.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// The basic interface adds no suffix to the function names.
#undef EX_SUF
#define EX_SUF

// The basic interface adds no expert arguments to signatures and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Local expert variables, initialized to NULL. The "( void )" statements
// prevent unused variable warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS \
    cntx_t* cntx = NULL; ( void )cntx; \
    rntm_t* rntm = NULL; ( void )rntm;

// end bli_tapi_ba.h
// begin bli_l1v_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h
// begin bli_oapi_ba.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Marker that lets the source code determine which interface (basic or
// expert) is being compiled.
#undef BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// The basic interface adds no suffix to the function names.
#undef EX_SUF
#define EX_SUF

// The basic interface adds no expert arguments to signatures and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS

// Local expert variables, initialized to NULL. The "( void )" statements
// prevent unused variable warnings by the compiler.
#undef BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS \
    cntx_t* cntx = NULL; ( void )cntx; \
    rntm_t* rntm = NULL; ( void )rntm;

// end bli_oapi_ba.h
// begin bli_l1d_oapi.h

//
// Prototype object-based interfaces.
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
// BLIS_TAPI_EX_SUF itself is defined outside this excerpt.
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The expansion starts with a comma because it is pasted directly after the
// last regular parameter of each prototype template.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h
// begin bli_l1d_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Each template declares an exported void function named by
// PASTEMAC2(ch,opname,EX_SUF); with the settings above, every prototype ends
// in the cntx_t* and rntm_t* parameters from BLIS_TAPI_EX_PARAMS.
// NOTE(review): INSERT_GENTPROT_BASIC0 / INSERT_GENTPROTR_BASIC0 are defined
// outside this excerpt; presumably they instantiate the template once per
// supported datatype -- confirm.
//

// addd, copyd, subd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d: as above plus a scalar alpha.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd: single operand x.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd: scalar alpha with a conjalpha argument.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid: alpha is typed ctype_r while x is typed ctype, hence the separate
// GENTPROTR template with the extra ctype_r/chr arguments.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd: scalar alpha, no conj argument.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd: operands x, beta, y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )
// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//
// Function-pointer typedefs named by PASTECH3(ch,opname,EX_SUF,tsuf). Their
// parameter lists repeat the typed prototypes above, including
// BLIS_TAPI_EX_PARAMS.
//

// addd, copyd, subd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )
// end bli_l1d_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//
// Each template declares an exported void function named by
// PASTEMAC2(ch,opname,EX_SUF), with BLIS_TAPI_EX_PARAMS pasted after the last
// regular parameter.
// NOTE(review): this copy follows bli_tapi_ba.h, which defines EX_SUF and
// BLIS_TAPI_EX_PARAMS as empty, so these are presumably the basic
// (unsuffixed) prototypes -- confirm against the PASTEMAC2 definition.

// addd, copyd, subd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d: as above plus a scalar alpha.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd: single operand x.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd: scalar alpha with a conjalpha argument.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid: alpha is typed ctype_r while x is typed ctype, hence the separate
// GENTPROTR template with the extra ctype_r/chr arguments.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd: scalar alpha, no conj argument.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd: operands x, beta, y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )
// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//
// Function-pointer typedefs named by PASTECH3(ch,opname,EX_SUF,tsuf). Their
// parameter lists repeat the typed prototypes above.
//

// addd, copyd, subd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )
// end bli_l1d_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1d_fpa.h

//
// Prototype function pointer query interface.
//
// GENPROT( op ) declares a function named
// PASTEMAC2(op,BLIS_TAPI_EX_SUF,_qfp) that takes a num_t and returns the type
// PASTECH2(op,BLIS_TAPI_EX_SUF,_vft). BLIS_TAPI_EX_SUF is spelled out here
// because EX_SUF has just been un-defined above.
//

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addd )
GENPROT( copyd )
GENPROT( subd )
GENPROT( axpyd )
GENPROT( scal2d )
GENPROT( invertd )
GENPROT( scald )
GENPROT( setd )
GENPROT( setid )
GENPROT( shiftd )
GENPROT( xpbyd )
// end bli_l1d_fpa.h
// end bli_l1d.h

// -- Level-1f operations --
// begin bli_l1f.h
// begin bli_l1f_check.h

//
// Prototype object-based check functions.
//
// GENTPROT( op ) declares void PASTEMAC(op,_check)( ... ) taking the obj_t*
// operands listed. These prototypes carry no BLIS_EXPORT_BLIS marker and no
// expert parameters.

// axpy2v
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alphax, \
       obj_t* alphay, \
       obj_t* x, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* y \
     );

GENTPROT( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* xt, \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho, \
       obj_t* z \
     );

GENTPROT( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* at, \
       obj_t* a, \
       obj_t* w, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
     );

GENTPROT( dotxf )
// end bli_l1f_check.h

// Define kernel function types.
// begin bli_l1f_ft_ker.h

#ifndef BLIS_L1F_FT_KER_H
#define BLIS_L1F_FT_KER_H

//
// -- Level-1f kernel function types -------------------------------------------
//
// Function-pointer typedefs named by PASTECH3(ch,opname,_ker,tsuf). Every
// pointer parameter is restrict-qualified except the trailing cntx_t*, and
// the only expert-style argument is that cntx_t* (there is no rntm_t*).
//

// axpy2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha1, \
       ctype* restrict alpha2, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict w, inc_t incw, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxaxpyf )

#endif
// end bli_l1f_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
// BLIS_OAPI_EX_SUF itself is defined outside this excerpt.
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The expansion starts with a comma because it is pasted directly after the
// last regular parameter of each prototype template.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). 
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
// BLIS_TAPI_EX_SUF itself is defined outside this excerpt.
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The expansion starts with a comma because it is pasted directly after the
// last regular parameter of each prototype template.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Each template declares an exported void function named by
// PASTEMAC2(ch,opname,EX_SUF); with the settings above, every prototype ends
// in the cntx_t* and rntm_t* parameters from BLIS_TAPI_EX_PARAMS.
// NOTE(review): INSERT_GENTPROT_BASIC0 is defined outside this excerpt;
// presumably it instantiates the template once per supported datatype --
// confirm.
//

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )
// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// Function-pointer typedefs named by PASTECH3(ch,opname,EX_SUF,tsuf), with
// the same parameter types as the typed prototypes above.
// NOTE(review): some parameter names differ from the prototypes (axpy2v uses
// alpha1/alpha2 instead of alphax/alphay; dotaxpyv names its length m instead
// of n). The types match, so this is cosmetic only.
//

// axpy2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )
// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h
// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
// The expansion is a sequence of statements, so it can only be used inside a
// function body.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Each template declares an exported void function named by
// PASTEMAC2(ch,opname,EX_SUF). BLIS_TAPI_EX_PARAMS is defined empty just
// above, so no parameters follow the listed operands in this copy.
// NOTE(review): EX_SUF is presumably empty here as well (bli_tapi_ba.h sets
// it before this point) -- confirm.
//

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )
// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// Function-pointer typedefs named by PASTECH3(ch,opname,EX_SUF,tsuf), with
// the same parameter types as the typed prototypes above.
// NOTE(review): some parameter names differ from the prototypes (axpy2v uses
// alpha1/alpha2 instead of alphax/alphay; dotaxpyv names its length m instead
// of n). The types match, so this is cosmetic only.
//

// axpy2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )
// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1f_fpa.h

//
// Prototype function pointer query interface.
//
// GENPROT( op ) declares a function named
// PASTEMAC2(op,BLIS_TAPI_EX_SUF,_qfp) that takes a num_t and returns the type
// PASTECH2(op,BLIS_TAPI_EX_SUF,_vft). BLIS_TAPI_EX_SUF is spelled out here
// because EX_SUF has just been un-defined above.
//

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( axpy2v )
GENPROT( axpyf )
GENPROT( dotaxpyv )
GENPROT( dotxaxpyf )
GENPROT( dotxf )
// end bli_l1f_fpa.h
// end bli_l1f.h

// -- Level-1m operations --
// begin bli_l1m.h
// begin bli_l1m_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The expansion starts with a comma so that it can be placed
// directly after the last regular parameter.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h

// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Expert pass: EX_SUF is BLIS_TAPI_EX_SUF and BLIS_TAPI_EX_PARAMS appends
// cntx_t* cntx and rntm_t* rntm to every prototype generated below.

// addm, copym, subm: operands (x, y).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m: operands (alpha, x, y).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm: operands (alpha, x). Takes conjalpha and has no transx.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym: operands (x, beta, y).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-datatype template. x uses ctype_x, while beta and y use
// ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
// end bli_l1m_tapi.h

// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//

// Each GENTDEF below is a typedef template for a pointer to a void function.
// The type name is pasted together from ch, opname, EX_SUF and tsuf, and the
// parameter list mirrors the typed prototype of the same operation above.

// addm, subm, copym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m (same parameter list as axpym)
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym (also instantiated for xpbym_md)
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )
// end bli_l1m_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions). EX_SUF is deliberately defined as empty here.
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes. BLIS_TAPI_EX_PARAMS is deliberately defined as empty here.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h

// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Basic pass: the same templates as the expert pass above. EX_SUF and
// BLIS_TAPI_EX_PARAMS are now empty, so the generated prototypes carry no
// suffix and no trailing cntx/rntm parameters.

// addm, copym, subm: operands (x, y).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m: operands (alpha, x, y).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm: operands (alpha, x). Takes conjalpha and has no transx.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym: operands (x, beta, y).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-datatype template. x uses ctype_x, while beta and y use
// ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
// end bli_l1m_tapi.h

// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//

// Basic pass of the function-pointer typedef templates. EX_SUF and
// BLIS_TAPI_EX_PARAMS are empty at this point.

// addm, subm, copym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m (same parameter list as axpym)
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym (also instantiated for xpbym_md)
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )
// end bli_l1m_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1m_fpa.h

//
// Prototype function pointer query interface.
//

// Single-datatype query: each GENPROT( op ) declares a function taking one
// num_t (dt). Its return type name is pasted from op, BLIS_TAPI_EX_SUF and
// _vft, and its function name from op, BLIS_TAPI_EX_SUF and _qfp.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
GENPROT( axpym )
GENPROT( scal2m )
GENPROT( scalm )
GENPROT( setm )
GENPROT( xpbym )

// Two-datatype query (_qfp2 suffix): takes a num_t for x and one for y.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );

GENPROT( xpbym_md )
// end bli_l1m_fpa.h

// Prototype level-1m implementations.
// begin bli_l1m_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( addm ) INSERT_GENTPROT_BASIC0( copym ) INSERT_GENTPROT_BASIC0( subm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( axpym ) INSERT_GENTPROT_BASIC0( scal2m ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( xpbym ) #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ void PASTEMAC3(chx,chy,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT2_BASIC0( xpbym_md ) INSERT_GENTPROT2_MIXDP0( xpbym_md ) // end bli_l1m_unb_var1.h // Pack-related // begin bli_packm.h // begin bli_packm_alloc.h BLIS_EXPORT_BLIS void* 
bli_packm_alloc ( siz_t size_needed, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void* bli_packm_alloc_ex ( siz_t size_needed, packbuf_t pack_buf_type, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_alloc.h // begin bli_packm_cntl.h struct packm_params_s { uint64_t size; // size field must be present and come first. bszid_t bmid_m; bszid_t bmid_n; bool does_invert_diag; bool rev_iter_if_upper; bool rev_iter_if_lower; pack_t pack_schema; packbuf_t pack_buf_type; }; typedef struct packm_params_s packm_params_t; BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m; } BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n; } BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower; } BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema; } BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type; } // ----------------------------------------------------------------------------- cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, bszid_t bmid_m, bszid_t bmid_n, bool does_invert_diag, bool rev_iter_if_upper, bool rev_iter_if_lower, pack_t pack_schema, packbuf_t pack_buf_type, cntl_t* sub_node ); // end bli_packm_cntl.h // begin 
bli_packm_check.h void bli_packm_init_check ( obj_t* a, obj_t* p, cntx_t* cntx ); void bli_packm_int_check ( obj_t* a, obj_t* p, cntx_t* cntx ); // end bli_packm_check.h // begin bli_packm_init.h BLIS_EXPORT_BLIS bool bli_packm_init ( obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_init.h // begin bli_packm_int.h void bli_packm_int ( obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_int.h // begin bli_packm_scalar.h BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p ); // end bli_packm_scalar.h // begin bli_packm_part.h // -- Matrix partitioning ------------------------------------------------------ void bli_packm_acquire_mpart_t2b( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_packm_acquire_mpart_l2r( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p ); // end bli_packm_part.h // begin bli_packm_struc_cxk.h #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_struc_cxk ) INSERT_GENTPROT_BASIC0( packm_herm_cxk ) INSERT_GENTPROT_BASIC0( packm_tri_cxk ) // end bli_packm_struc_cxk.h // begin bli_packm_struc_cxk_1er.h #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ 
bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er ) INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er ) INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er ) // end bli_packm_struc_cxk_1er.h // begin bli_packm_cxk.h #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t panel_dim, \ dim_t panel_dim_max, \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_cxk ) // end bli_packm_cxk.h // begin bli_packm_cxk_1er.h #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t panel_dim, \ dim_t panel_dim_max, \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ ); INSERT_GENTPROTCO_BASIC0( packm_cxk_1er ) // end bli_packm_cxk_1er.h // Mixed datatype support. 
// The mixed-datatype packing prototypes below are declared only when
// BLIS_ENABLE_GEMM_MD is defined.
#ifdef BLIS_ENABLE_GEMM_MD
// begin bli_packm_struc_cxk_md.h

// Two-datatype structure-aware packing: kappa and p use ctype_p, c uses
// ctype_c.
#undef GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype_p* restrict kappa, \
       ctype_c* restrict c, inc_t incc, inc_t ldc, \
       ctype_p* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )

// Two-datatype 1e/1r packing: kappa and p use ctype_p, a uses ctype_a.
#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t conja, \
       dim_t m, \
       dim_t n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p, inc_t ldp \
     );

INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )

INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )
// end bli_packm_struc_cxk_md.h
#endif

// begin bli_packm_blk_var1.h

//
// packm params types.
//

typedef struct
{
	// Table of packm kernel function pointers, indexed as
	// ukr_fn[type of C][type of P].
	packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;

//
// Prototype object-based interfaces.
//

BLIS_EXPORT_BLIS void bli_packm_blk_var1
     (
       obj_t* c,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* t
     );
// end bli_packm_blk_var1.h

// end bli_packm.h
// begin bli_unpackm.h
// begin bli_unpackm_cntl.h

// Parameter record read through cntl->params by the accessor macro below.
struct unpackm_params_s
{
	uint64_t size; // size field must be present and come first.
	unpackm_var_oft var_func;
};
typedef struct unpackm_params_s unpackm_params_t;

// Accessor macro: casts (cntl)->params to unpackm_params_t* and yields its
// var_func member.
#define bli_cntl_unpackm_params_var_func( cntl ) \
\
	( ( (unpackm_params_t*)(cntl)->params )->var_func )

// -----------------------------------------------------------------------------

cntl_t* bli_unpackm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       void_fp unpackm_var_func,
       cntl_t* sub_node
     );
// end bli_unpackm_cntl.h

// begin bli_unpackm_check.h
void bli_unpackm_int_check
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx
     );
// end bli_unpackm_check.h

// begin bli_unpackm_int.h
void bli_unpackm_int
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_unpackm_int.h

// begin bli_unpackm_blk_var1.h

// Object-based prototype.
void bli_unpackm_blk_var1
     (
       obj_t* p,
       obj_t* c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// Typed prototypes. p and c are declared as void* here, not ctype*.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       doff_t diagoffc, \
       diag_t diagc, \
       uplo_t uploc, \
       trans_t transc, \
       dim_t m, \
       dim_t n, \
       dim_t m_panel, \
       dim_t n_panel, \
       void* p, inc_t rs_p, inc_t cs_p, \
       dim_t pd_p, inc_t ps_p, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )
// end bli_unpackm_blk_var1.h

// begin bli_unpackm_cxk.h
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conjp, \
       dim_t panel_dim, \
       dim_t panel_len, \
       ctype* kappa, \
       ctype* p, inc_t ldp, \
       ctype* a, inc_t inca, inc_t lda, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_cxk )
// end bli_unpackm_cxk.h

// end bli_unpackm.h
// end bli_l1m.h

// -- Level-2 operations --
// begin bli_l2.h
// begin bli_l2_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// Every prototype in this section is named PASTEMAC2(ch,opname,EX_SUF) and
// ends its parameter list with BLIS_TAPI_EX_PARAMS, so the declarations
// produced depend on how those two macros are defined at this point.
// NOTE(review): INSERT_GENTPROT_BASIC0 / INSERT_GENTPROTR_BASIC0 are defined
// elsewhere; presumably they instantiate the template once per basic
// datatype -- confirm.

// gemv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( gemv )

// ger
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( ger )

// hemv, symv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )

// her: alpha is typed ctype_r here, unlike syr below whose alpha is ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( her )

// syr
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( syr )

// her2, syr2
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )

// trmv, trsv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )

// end bli_l2_tapi.h
// begin bli_l2_ft.h

//
// -- Level-2 function types ---------------------------------------------------
//

// Function-pointer typedefs with the same parameter lists as the prototypes
// above; each typedef is named PASTECH3(ch,opname,EX_SUF,tsuf).
// NOTE(review): INSERT_GENTDEF / INSERT_GENTDEFR are defined elsewhere;
// presumably they instantiate the template once per datatype -- confirm.

// gemv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( gemv )

// ger

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( ger )

// hemv, symv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( her )

// syr

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( syr )

// her2, syr2

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )

// end bli_l2_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h
// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l2_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// EX_SUF and BLIS_TAPI_EX_PARAMS are both defined empty above, so the
// prototypes generated in this section carry no name suffix and no trailing
// cntx/rntm parameters.
// NOTE(review): INSERT_GENTPROT_BASIC0 / INSERT_GENTPROTR_BASIC0 are defined
// elsewhere; presumably they instantiate the template once per basic
// datatype -- confirm.

// gemv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( gemv )

// ger
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( ger )

// hemv, symv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )

// her: alpha is typed ctype_r here, unlike syr below whose alpha is ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( her )

// syr
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( syr )

// her2, syr2
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )

// trmv, trsv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )

// end bli_l2_tapi.h
// begin bli_l2_ft.h

//
// -- Level-2 function types ---------------------------------------------------
//

// Function-pointer typedefs with the same parameter lists as the prototypes
// above; each typedef is named PASTECH3(ch,opname,EX_SUF,tsuf).
// NOTE(review): INSERT_GENTDEF / INSERT_GENTDEFR are defined elsewhere;
// presumably they instantiate the template once per datatype -- confirm.

// gemv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( gemv )

// ger

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( ger )

// hemv, symv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( her )

// syr

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( syr )

// her2, syr2

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )

// end bli_l2_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
//

// Takes the caller's control tree (cntl_orig) and reports the tree to use
// through the cntl_use output pointer.
void bli_l3_cntl_create_if
     (
       opid_t family,
       pack_t schema_a,
       pack_t schema_b,
       obj_t* a,
       obj_t* b,
       obj_t* c,
       rntm_t* rntm,
       cntl_t* cntl_orig,
       cntl_t** cntl_use
     );

void bli_l3_cntl_free
     (
       rntm_t* rntm,
       cntl_t* cntl_use,
       thrinfo_t* thread
     );

// end bli_l3_cntl.h
// begin bli_l3_check.h

//
// Prototype object-based check functions.
//

// GENPROT is redefined four times below, once per distinct argument list;
// each definition is followed by the operations that share that list.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( herk )
GENPROT( syrk )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       cntx_t* cntx \
     );

GENPROT( trmm )
GENPROT( trsm )

// -----------------------------------------------------------------------------

void bli_gemm_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_gemmt_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_hemm_basic_check
     (
       side_t side,
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_herk_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_her2k_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* bh,
       obj_t* b,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_l3_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

// end bli_l3_check.h
// begin bli_l3_int.h

void bli_l3_int
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_int.h
// begin bli_l3_packab.h

void bli_l3_packa
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

void bli_l3_packb
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_packab.h

// Define function types.
//#include "bli_l3_ft_ex.h"
// begin bli_l3_ft_ukr.h


#ifndef BLIS_L3_FT_UKR_H
#define BLIS_L3_FT_UKR_H


//
// -- Level-3 micro-kernel function types --------------------------------------
//

// Each GENTDEF below declares a function-pointer typedef whose name is
// pasted together from (ch, opname, _ukr, tsuf) by PASTECH3.

// gemm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict beta, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemm )


// gemmtrsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a1x, \
       ctype* restrict a11, \
       ctype* restrict bx1, \
       ctype* restrict b11, \
       ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemmtrsm )


// trsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( trsm )


#endif

// end bli_l3_ft_ukr.h
// begin bli_l3_oft.h


#ifndef BLIS_L3_OFT_H
#define BLIS_L3_OFT_H


//
// -- Level-3 object function types --------------------------------------------
//

// Here GENTDEF takes only the operation name and declares the typedef
// <opname>_oft for the expert object-API signature of that operation.

// gemm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* b, \
  obj_t* beta, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( gemm )
GENTDEF( gemmt )
GENTDEF( her2k )
GENTDEF( syr2k )


// hemm, symm, trmm3

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  side_t side, \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* b, \
  obj_t* beta, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( hemm )
GENTDEF( symm )
GENTDEF( trmm3 )


// herk, syrk

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* beta, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( herk )
GENTDEF( syrk )


// trmm, trsm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  side_t side, \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* b, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( trmm )
GENTDEF( trsm )


#endif

// end bli_l3_oft.h
// begin bli_l3_oft_var.h


#ifndef BLIS_L3_OFT_VAR_H
#define BLIS_L3_OFT_VAR_H


//
// -- Level-3 variant function types -------------------------------------------
//

// Declares l3_var_oft, whose argument list matches the bli_l3_packa /
// bli_l3_packb prototypes above.
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
( \
  obj_t* a, \
  obj_t* b, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm, \
  cntl_t* cntl, \
  thrinfo_t* thread \
);

GENTDEF( l3 )


#endif

// end bli_l3_oft_var.h

// begin bli_l3_blocksize.h

dim_t bli_l3_determine_kc
      (
        dir_t direct,
        dim_t i,
        dim_t dim,
        obj_t* a,
        obj_t* b,
        bszid_t bszid,
        cntx_t* cntx,
        cntl_t* cntl
      );


// Per-operation kc queries that take an explicit direction argument.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
      ( \
        dir_t direct, \
        dim_t i, \
        dim_t dim, \
        obj_t* a, \
        obj_t* b, \
        bszid_t bszid, \
        cntx_t* cntx \
      );

GENPROT( gemm_determine_kc )
GENPROT( gemmt_determine_kc )
GENPROT( trmm_determine_kc )
GENPROT( trsm_determine_kc )


// Same queries without the direction argument; the _f / _b name suffixes
// presumably stand for forward / backward -- confirm against bli_l3_blocksize.c.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
      ( \
        dim_t i, \
        dim_t dim, \
        obj_t* a, \
        obj_t* b, \
        bszid_t bszid, \
        cntx_t* cntx \
      );

GENPROT( gemm_determine_kc_f )
GENPROT( gemm_determine_kc_b )

GENPROT( gemmt_determine_kc_f )
GENPROT( gemmt_determine_kc_b )

GENPROT( trmm_determine_kc_f )
GENPROT( trmm_determine_kc_b )

GENPROT( trsm_determine_kc_f )
GENPROT( trsm_determine_kc_b )

// end bli_l3_blocksize.h
// begin bli_l3_direct.h

dir_t bli_l3_direct
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

#undef GENPROT
#define GENPROT( opname ) \
\
dir_t PASTEMAC0(opname) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm_direct )
GENPROT( gemmt_direct )
GENPROT( trmm_direct )
GENPROT( trsm_direct )

// end bli_l3_direct.h
// begin bli_l3_prune.h

// Generic pruning entry points, one per dimension (m, n, k); these take
// the control tree node in addition to the three operands.
#undef GENPROT
#define GENPROT( dim ) \
\
void PASTEMAC(l3_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c, \
       cntl_t* cntl \
     );

GENPROT( m )
GENPROT( n )
GENPROT( k )

// -----------------------------------------------------------------------------

// Operation-specific pruning functions, one per (operation, dimension) pair.
#undef GENPROT
#define GENPROT( opname, dim ) \
\
void PASTEMAC2(opname,_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm, m )
GENPROT( gemm, n )
GENPROT( gemm, k )

GENPROT( gemmt, m )
GENPROT( gemmt, n )
GENPROT( gemmt, k )

GENPROT( trmm, m )
GENPROT( trmm, n )
GENPROT( trmm, k )

GENPROT( trsm, m )
GENPROT( trsm, n )
GENPROT( trsm, k )

// end bli_l3_prune.h
// begin bli_l3_schema.h

void bli_l3_set_schemas
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx
     );

// end bli_l3_schema.h

// Prototype object APIs (basic and expert).
// begin bli_l3_oapi.h

//
// Prototype object-based interfaces (basic).
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h

//
// Prototype BLAS-like interfaces with typed operands (basic).
//

// Each matrix operand is passed as a pointer followed by its row stride
// (rs_*) and column stride (cs_*) parameters.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( gemm )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       conj_t conja, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )


// herk uses the GENTPROTR form, which adds a second type parameter
// (ctype_r); here alpha and beta are declared with ctype_r.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype_r* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROTR_BASIC0( herk )


// her2k: only beta is declared with ctype_r; alpha uses ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROTR_BASIC0( her2k )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( syrk )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( trmm3 )


// trmm / trsm have no beta or c parameters.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi.h
// begin bli_l3_tapi_ex.h

//
// Prototype BLAS-like interfaces with typed operands (expert).
//

// Expert typed API: each prototype takes the operand list of the
// corresponding basic typed prototype plus trailing cntx and rntm
// parameters, and has BLIS_TAPI_EX_SUF pasted onto its name via PASTEMAC2.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemm )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       conj_t conja, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )


// herk: alpha and beta are declared with ctype_r.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype_r* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( herk )


// her2k: only beta is declared with ctype_r.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( her2k )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( syrk )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm3 )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi_ex.h

// Define function types for small/unpacked handlers/kernels.
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
#undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_gemm_md_c2r_ref.h // Define a local struct type that makes returning two values easier. typedef struct mddm_s { dom_t comp; dom_t exec; } mddm_t; void bli_gemm_md ( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_local, cntx_t** cntx ); mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); // ----------------------------------------------------------------------------- void bli_gemm_md_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); void bli_gemm_md_zgemm ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary 
if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crr is already unconditionally associated with an // execution domain of BLIS_REAL.) if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_REAL ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since ccr is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) if ( bli_obj_is_complex( c ) && bli_obj_is_complex( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crc is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) 
if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_complex( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemm_md_ker_var2_recast ( num_t* dt_comp, num_t dt_a, num_t dt_b, num_t* dt_c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, obj_t* c, inc_t* rs_c, inc_t* cs_c ) { if ( bli_is_real( *dt_c ) && bli_is_complex( dt_a ) && bli_is_complex( dt_b ) ) { // The rcc case is executed with a real macrokernel, so we need to // double the k dimension (because both A and B are packed to the 1r // schema), and also the panel strides of A and B since they were // packed as complex matrices and we now need to convert them to // units of real elements. *k *= 2; *ps_a *= 2; *ps_b *= 2; } else if ( bli_is_complex( *dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_row_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *n *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; } else #endif { // Generally speaking, the crc case is executed with a complex // macrokernel, so we need to halve the panel stride of A (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. 
*ps_a /= 2; } } else if ( bli_is_complex( *dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_col_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *m *= 2; *pd_a *= 2; *ps_a *= 2; *cs_c *= 2; } else #endif { // Generally speaking, the ccr case is executed with a complex // macrokernel, so we need to halve the panel stride of B (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. *ps_b /= 2; } } #if 0 else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. //printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k ); } else if ( bli_is_complex( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { // No action needed. 
} #endif } // end bli_gemm_md.h #endif // end bli_gemm.h // begin bli_hemm.h // begin bli_hemm_front.h void bli_hemm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_hemm_front.h // end bli_hemm.h // begin bli_symm.h // begin bli_symm_front.h void bli_symm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_symm_front.h // end bli_symm.h // begin bli_trmm.h // begin bli_trmm_front.h void bli_trmm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm_front.h // begin bli_trmm_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); //GENPROT( trmm_blk_var1 ) //GENPROT( trmm_blk_var2 ) //GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) GENPROT( trmm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
//

// Typed gemmt macro-kernel template. Compared with the trmm/trsm templates,
// the diagonal offset is named diagoffc and A and B each carry an extra
// stride argument (is_a, is_b).
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffc, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

// end bli_gemmt_var.h
// end bli_gemmt.h
// end bli_l3.h

// -- Utility operations --

// begin bli_util.h
// begin bli_util_check.h

//
// Prototype object-based check functions.
//
// Each GENPROT redefinition below declares a function whose name PASTEMAC
// builds from opname and the "_check" suffix. The parameter list mirrors the
// object-based operation of the same name.

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  asum \
     );

GENPROT( asumv )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x \
     );

GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm \
     );

GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm \
     );

GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x \
     );

GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  scale, \
       obj_t*  sumsq \
     );

GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// NOTE(review): this one template is named GENTPROT rather than GENPROT,
// unlike its neighbors. It takes a single opname argument and is used
// immediately, so the naming difference has no effect on the expansion.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  chi, \
       obj_t*  psi, \
       bool*   is_eq \
     );

GENTPROT( eqsc )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq \
     );

GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// -----------------------------------------------------------------------------

// Shared check helpers, declared directly rather than through a template.

void bli_utilv_xi_check
     (
       obj_t*  x,
       obj_t*  index
     );

void bli_utilv_xa_check
     (
       obj_t*  x,
       obj_t*  asum
     );

void bli_utilm_mkhst_check
     (
       obj_t*  a
     );

void bli_utilv_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_fprint_check
     (
       FILE*  file,
       char*  s1,
       obj_t* x,
       char*  format,
       char*  s2
     );

void bli_utilm_rand_check
     (
       obj_t*  x
     );

void bli_utilv_sumsqv_check
     (
       obj_t*  x,
       obj_t*  scale,
       obj_t*  sumsq
     );

// end bli_util_check.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets it be appended directly after the
// last regular parameter.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_util_oapi.h

//
// Prototype object-based interfaces.
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// These prototypes use PASTEMAC0 (no EX_SUF, no expert parameters), so they
// are emitted only while BLIS_OAPI_BASIC is defined. That keeps them from
// being declared a second time when this header is included for the other
// interface flavor.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// Same as the fprint prototypes above, minus the FILE* argument.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h
// begin bli_oapi_ba.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// These prototypes use PASTEMAC0 (no EX_SUF, no expert parameters) and are
// emitted only while BLIS_OAPI_BASIC is defined.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// Same as the fprint prototypes above, minus the FILE* argument.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The leading comma lets this be appended directly after the last regular
// parameter of a prototype.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_util_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// GENTPROT templates take (ctype, ch, opname). GENTPROTR templates also take
// ctype_r and chr, for operations with a ctype_r* output argument. The
// function name is built by PASTEMAC2 from ch, opname and EX_SUF.

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( asumv )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( mkherm )
INSERT_GENTPROT_BASIC0( mksymm )
INSERT_GENTPROT_BASIC0( mktrim )

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( norm1v )
INSERT_GENTPROTR_BASIC0( normfv )
INSERT_GENTPROTR_BASIC0( normiv )

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( norm1m )
INSERT_GENTPROTR_BASIC0( normfm )
INSERT_GENTPROTR_BASIC0( normim )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( randv )
INSERT_GENTPROT_BASIC0( randnv )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( randm )
INSERT_GENTPROT_BASIC0( randnm )

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.

// These prototypes use PASTEMAC (no EX_SUF, no expert parameters) and are
// emitted only while BLIS_TAPI_BASIC is defined.
#ifdef BLIS_TAPI_BASIC

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t       conjchi, \
       ctype*       chi, \
       ctype*       psi, \
       bool*        is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )

// The print prototypes take x as void* and are instantiated through
// INSERT_GENTPROT_BASIC0_I rather than INSERT_GENTPROT_BASIC0.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  n, \
       void*  x, inc_t incx, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       void*  x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//
// Each template below typedefs a function-pointer type. The type name is
// built by PASTECH3 from ch, opname, EX_SUF and tsuf. The parameter lists
// mirror the typed prototypes of the same operations.

// asumv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
//
// NOTE(review): the fprintv and fprintm types do not append
// BLIS_TAPI_EX_PARAMS, although their names still include EX_SUF.

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.
//
// These types are named with PASTECH2 (no EX_SUF) and are emitted only
// while BLIS_TAPI_BASIC is defined.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t       conjchi, \
       ctype*       chi, \
       ctype*       psi, \
       bool*        is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 
INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.
// These prototypes use PASTEMAC(ch,opname) (no EX_SUF) and take no
// BLIS_TAPI_EX_PARAMS; they are only emitted when BLIS_TAPI_BASIC is defined.

#ifdef BLIS_TAPI_BASIC

// eqsc
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )

// eqv
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )

// eqm
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )

// printv (note: x is declared void*, and the _I insert macro is used)
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  n, \
       void*  x, inc_t incx, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )

// printm (note: x is declared void*, and the _I insert macro is used)
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       void*  x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//
// Each template below declares a function-pointer typedef named
// PASTECH3(ch,opname,EX_SUF,tsuf) (or PASTECH2(ch,opname,tsuf) for the
// basic-only operations) mirroring the corresponding typed prototype.

// asumv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// (no BLIS_TAPI_EX_PARAMS in this signature)

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm
// (no BLIS_TAPI_EX_PARAMS in this signature)

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_util_fpa.h

//
// Prototype function pointer query interface.
//

// Each GENPROT( opname ) declares a query function, named with the _qfp
// suffix, that takes a num_t and returns the type named with the _vft suffix.
// The first group includes BLIS_TAPI_EX_SUF in both names.

#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( asumv )
GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )
GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )
GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )
GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )
GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// Second group: same shape, but the names carry no expert suffix.

#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )
GENPROT( fprintv )
GENPROT( fprintm )
//GENPROT( printv )
//GENPROT( printm )

// end bli_util_fpa.h

// Prototype level-1m implementations.
// NOTE(review): the header that follows is bli_util_unb_var1.h (utility
// operations); the "level-1m" wording above may be stale -- confirm.
// begin bli_util_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Unlike the typed API prototypes, these variant prototypes spell out the
// trailing cntx_t* / rntm_t* parameters explicitly and carry no
// BLIS_EXPORT_BLIS annotation (except fprintv/fprintm at the end).

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum, \
       cntx_t*  cntx, \
       rntm_t*  rntm \
     );

INSERT_GENTPROTR_BASIC0( asumv_unb_var1 )

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( mkherm_unb_var1 )
INSERT_GENTPROT_BASIC0( mksymm_unb_var1 )
INSERT_GENTPROT_BASIC0( mktrim_unb_var1 )

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfv_unb_var1 )
INSERT_GENTPROTR_BASIC0( normiv_unb_var1 )

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfm_unb_var1 )
INSERT_GENTPROTR_BASIC0( normim_unb_var1 )

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randv_unb_var1 )
INSERT_GENTPROT_BASIC0( randnv_unb_var1 )

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randm_unb_var1 )
INSERT_GENTPROT_BASIC0( randnm_unb_var1 )

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq, \
       cntx_t*  cntx, \
       rntm_t*  rntm \
     );

INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 )

// -----------------------------------------------------------------------------

// The eqv/eqm variants return bool and take no cntx/rntm parameters.

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy \
     );

INSERT_GENTPROT_BASIC0( eqv_unb_var1 )

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y \
     );

INSERT_GENTPROT_BASIC0( eqm_unb_var1 )

// fprintv / fprintm are exported and write to a caller-supplied FILE*.

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintv )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintm )

// end bli_util_unb_var1.h

// end bli_util.h

// -- addon definitions --

// NOTE: These definitions should not be included much earlier since an addon
// may wish to utilize other types and definitions provided by BLIS.
// begin bli_addon.h

#ifndef BLIS_ADDON_H
#define BLIS_ADDON_H

// The "#if 0" selects the #else branch, so BLIS_DISABLE_ADDONS is defined.
#if 0
#define BLIS_ENABLE_ADDONS
#else
#define BLIS_DISABLE_ADDONS
#endif

// Enabled addons

#endif
// end bli_addon.h

// -- sandbox implementation --

// begin bli_sbox.h

#ifndef BLIS_SBOX_H
#define BLIS_SBOX_H

// Each sandbox must have a bli_sandbox.h file present somewhere inside.
// If a sandbox was enabled at configure-time, we need to #include its
// header file here so that it will get pulled into blis.h when it is
// flattened into a monolithic header.
#ifdef BLIS_ENABLE_SANDBOX
#include "bli_sandbox.h" // skipped
#endif

#endif
// end bli_sbox.h

// -- BLAS compatibility layer --

// begin bli_blas.h

// If the CBLAS compatibility layer was enabled while the BLAS layer
// was not enabled, we must enable it here.
#ifdef BLIS_ENABLE_CBLAS
#ifndef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS
#endif
#endif // BLIS_ENABLE_CBLAS

// By default, if the BLAS compatibility layer is enabled, we define
// (include) all of the BLAS prototypes. However, if the user is
// #including "blis.h" and also #including another header that also
// declares the BLAS functions, then we provide an opportunity to
// #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below).
#ifdef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS_DEFS
#else
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the BLAS test drivers are being
// compiled.
#ifdef BLIS_VIA_BLASTEST
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the environment has defined the
// macro BLIS_DISABLE_BLAS_DEFS.
#ifdef BLIS_DISABLE_BLAS_DEFS
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Begin including all BLAS prototypes.
#ifdef BLIS_ENABLE_BLAS_DEFS
// NOTE: this conditional stays open past the end of this section; its
// matching #endif appears later in the header.

// -- System headers needed by BLAS compatibility layer --

// The header name was missing from this directive (a bare "#include" is not
// a valid preprocessor directive). <ctype.h> is restored here.
// NOTE(review): <ctype.h> matches the upstream bli_blas.h include (toupper(),
// used by xerbla()) -- confirm against the BLIS source tree.
#include <ctype.h> // skipped

// -- Constants --

// Size of the buffer used to hold a BLAS routine name (7 characters plus one
// more byte, written as 7+1).
#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)

// -- Utility macros --

// Each helper below is only declared when BLIS_ENABLE_BLAS is defined.

// begin bla_r_sign.h

#ifdef BLIS_ENABLE_BLAS

double bla_r_sign(const bla_real *a, const bla_real *b);

#endif
// end bla_r_sign.h
// begin bla_d_sign.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_sign(const bla_double *a, const bla_double *b);

#endif
// end bla_d_sign.h

// begin bla_r_cnjg.h

#ifdef BLIS_ENABLE_BLAS

void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src);

#endif
// end bla_r_cnjg.h
// begin bla_d_cnjg.h

#ifdef BLIS_ENABLE_BLAS

void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src);

#endif
// end bla_d_cnjg.h

// begin bla_r_imag.h

#ifdef BLIS_ENABLE_BLAS

bla_real bla_r_imag(const bla_scomplex *z);

#endif
// end bla_r_imag.h
// begin bla_d_imag.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_imag(const bla_dcomplex *z);

#endif
// end bla_d_imag.h

// begin bla_c_div.h

#ifdef BLIS_ENABLE_BLAS

void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp);

#endif
// end bla_c_div.h
// begin bla_z_div.h

#ifdef BLIS_ENABLE_BLAS

void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp);

#endif
// end bla_z_div.h

// begin bla_f__cabs.h

#ifdef BLIS_ENABLE_BLAS

double bla_f__cabs(double real, double imag);

#endif
// end bla_f__cabs.h
// begin bla_r_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_r_abs(const bla_real *x);

#endif
// end bla_r_abs.h
// begin bla_d_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_abs(const bla_double *x);

#endif
// end bla_d_abs.h
// begin bla_c_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_c_abs(const bla_scomplex *z);

#endif
// end bla_c_abs.h
// begin bla_z_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_z_abs(const bla_dcomplex *z);

#endif
// end bla_z_abs.h

// begin bla_lsame.h

#ifdef BLIS_ENABLE_BLAS

// NOTE(review): the LAPACK_ILP64 variant uses long for the return value and
// length arguments and carries no BLIS_EXPORT_BLAS annotation, unlike the
// default variant -- confirm this asymmetry is intended.
#ifdef LAPACK_ILP64
long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
#else
BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
#endif

#endif
// end bla_lsame.h
// begin bla_xerbla.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);

#endif
// end bla_xerbla.h
// begin bla_xerbla_array.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);

#endif
// end bla_xerbla_array.h

// -- Level-0 BLAS prototypes --

// begin bla_cabs1.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS bla_real   PASTEF77(s,cabs1)(bla_scomplex *z);
BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);

#endif
// end bla_cabs1.h

// -- Level-1 BLAS prototypes --

// In the sections below each template is always (re)defined, but its
// INSERT_*_BLAS instantiation is guarded by BLIS_ENABLE_BLAS.

// begin bla_amax.h

//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end bla_amax.h
// begin bla_asum.h

//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end bla_asum.h
// begin bla_axpy.h

//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
             ftype*   y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif

// end bla_axpy.h
// begin bla_copy.h

//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( copy ) #endif // end bla_copy.h // begin bla_dot.h #ifdef BLIS_ENABLE_BLAS // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTR_BLAS( dot ) #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL INSERT_GENTPROTDOTC_BLAS( dot ) #else // For the "intel" complex return type, we use a hidden parameter (passed by // address) to return the result. #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \ ( \ ftype* rhop, \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTC_BLAS( dot ) #endif // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS float PASTEF77(sd,sdot) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); BLIS_EXPORT_BLAS double PASTEF77(d,sdot) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); #endif // end bla_dot.h // begin bla_nrm2.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end bla_nrm2.h // begin bla_rot.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s); #endif // end bla_rot.h // begin bla_rotg.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s); BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s); #endif // end bla_rotg.h // begin bla_rotm.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam); BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam); #endif // end bla_rotm.h // begin bla_rotmg.h 

#ifdef BLIS_ENABLE_BLAS

// rotmg: written out per datatype (real types only), returns int.
BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam);
BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);

#endif
// end bla_rotmg.h
// begin bla_scal.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// scal: alpha and x have separate types (ftype_a / ftype_x); the name is
// pasted in the order chx, cha, blasname.
#undef  GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
     ( \
       const f77_int* n, \
       const ftype_a* alpha, \
       ftype_x*       x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif

// end bla_scal.h
// begin bla_swap.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// swap: both x and y are non-const.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       ftype*         x, const f77_int* incx, \
       ftype*         y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif

// end bla_swap.h

// The "sub" wrappers below return void and take an extra trailing rval
// pointer argument; their instantiations are guarded by BLIS_ENABLE_CBLAS
// rather than BLIS_ENABLE_BLAS.

// begin f77_amax_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
             f77_int* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end f77_amax_sub.h
// begin f77_asum_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
             ftype_r* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end f77_asum_sub.h
// begin f77_dot_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTDOT_BLAS( dot ) // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval ); BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* rval ); #endif // end f77_dot_sub.h // begin f77_nrm2_sub.h // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end f77_nrm2_sub.h // -- Level-2 BLAS prototypes -- // dense // begin bla_gemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemv ) #endif // end bla_gemv.h // begin bla_ger.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, chxy, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \ ( \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTDOT_BLAS( ger ) #endif // end bla_ger.h // begin bla_hemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemv ) #endif // end bla_hemv.h // begin bla_her.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype_r* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her ) #endif // end bla_her.h // begin bla_her2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2 ) #endif // end bla_her2.h // begin bla_symv.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( symv ) #endif // end bla_symv.h // begin bla_syr.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr ) #endif // end bla_syr.h // begin bla_syr2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr2 ) #endif // end bla_syr2.h // begin bla_trmv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmv ) #endif // end bla_trmv.h // begin bla_trsv.h // // Prototype BLAS-to-BLIS interfaces. 
//
// GENTPROT (trsv): same argument list as the trmv prototype template.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int* m, \
       const ftype* a, const f77_int* lda, \
       ftype* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif
// end bla_trsv.h

// begin bla_gemv_check.h

#ifdef BLIS_ENABLE_BLAS

// bla_gemv_check: argument validation for gemv, expanded inline in the caller.
// transa must match "N", "T" or "C" (via lsame); m and n must be non-negative;
// lda must be at least max(1,*m); incx and incy must be non-zero. On the first
// failing test, info is set to a fixed code, the routine name is built from
// dt_str and op_str, upper-cased, reported through xerbla, and the macro
// executes `return;` -- so it can only be used inside a function returning
// void. The info codes presumably are the 1-based positions of the offending
// arguments in the BLAS routine (the gemv prototype is not in this excerpt).
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
    f77_int info = 0; \
    f77_int nota, ta, conja; \
\
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
    if ( !nota && !ta && !conja ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *n < 0 ) \
        info = 3; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 6; \
    else if ( *incx == 0 ) \
        info = 8; \
    else if ( *incy == 0 ) \
        info = 11; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemv_check.h
// begin bla_ger_check.h

#ifdef BLIS_ENABLE_BLAS

// bla_ger_check: argument validation for ger. Checks m, n >= 0, incx and incy
// non-zero, and lda >= max(1,*m). Unlike the other check macros, the reported
// name is built from three pieces: dt_str, op_str and conj_str. Executes
// `return;` in the caller after reporting through xerbla.
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
    f77_int info = 0; \
\
    if ( *m < 0 ) \
        info = 1; \
    else if ( *n < 0 ) \
        info = 2; \
    else if ( *incx == 0 ) \
        info = 5; \
    else if ( *incy == 0 ) \
        info = 7; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 9; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
\
        sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_ger_check.h
// begin bla_hemv_check.h

#ifdef BLIS_ENABLE_BLAS

// bla_hemv_check: argument validation for hemv (also reused for symv below).
// uploa must match "L" or "U"; m >= 0; lda >= max(1,*m); incx, incy non-zero.
// The info codes (1, 2, 5, 7, 10) match the argument positions in the hemv
// prototype template. Executes `return;` in the caller on failure.
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 5; \
    else if ( *incx == 0 ) \
        info = 7; \
    else if ( *incy == 0 ) \
        info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_hemv_check.h
// begin bla_her_check.h

#ifdef BLIS_ENABLE_BLAS

// bla_her_check: argument validation for her (also reused for syr below).
// uploa must match "L" or "U"; m >= 0; incx non-zero; lda >= max(1,*m).
// Executes `return;` in the caller on failure.
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *incx == 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 7; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her_check.h
// begin bla_her2_check.h

#ifdef BLIS_ENABLE_BLAS

// bla_her2_check: argument validation for her2 (also reused for syr2 below).
// uploa must match "L" or "U"; m >= 0; incx, incy non-zero;
// lda >= max(1,*m). Executes `return;` in the caller on failure.
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *incx == 0 ) \
        info = 5; \
    else if ( *incy == 0 ) \
        info = 7; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 9; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her2_check.h
// begin bla_symv_check.h

#ifdef BLIS_ENABLE_BLAS

// symv shares hemv's argument checks.
#define bla_symv_check bla_hemv_check

#endif
// end bla_symv_check.h
// begin bla_syr_check.h

#ifdef BLIS_ENABLE_BLAS

// syr shares her's argument checks.
#define bla_syr_check bla_her_check

#endif
// end bla_syr_check.h
// begin bla_syr2_check.h

#ifdef BLIS_ENABLE_BLAS

// syr2 shares her2's argument checks.
#define bla_syr2_check bla_her2_check

#endif
// end bla_syr2_check.h
// begin bla_trmv_check.h

#ifdef BLIS_ENABLE_BLAS

// bla_trmv_check: argument validation for trmv (also reused for trsv below).
// uploa must match "L"/"U", transa "N"/"T"/"C", diaga "U"/"N"; m >= 0;
// lda >= max(1,*m); incx non-zero. The info codes (1, 2, 3, 4, 6, 8) match
// the argument positions in the trmv prototype template. Executes `return;`
// in the caller on failure.
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
    f77_int nota, ta, conja; \
    f77_int unita, nonua; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    unita = PASTEF770(lsame)( diaga, "U", (ftnlen)1, (ftnlen)1 ); \
    nonua = PASTEF770(lsame)( diaga, "N", (ftnlen)1, (ftnlen)1 ); \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !ta && !conja ) \
        info = 2; \
    else if ( !unita && !nonua ) \
        info = 3; \
    else if ( *m < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 6; \
    else if ( *incx == 0 ) \
        info = 8; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_trmv_check.h
// begin bla_trsv_check.h

#ifdef BLIS_ENABLE_BLAS

// trsv shares trmv's argument checks.
#define bla_trsv_check bla_trmv_check

#endif
// end bla_trsv_check.h

// packed
// The packed and banded routines below are declared as explicit per-datatype
// prototypes returning int, using the bla_* types, rather than through the
// GENTPROT* templates used above.
// begin bla_hpmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *ap, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hpmv.h
// begin bla_hpr.h

#ifdef BLIS_ENABLE_BLAS

// Note: alpha is bla_real / bla_double here, while x and ap are complex.
BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_scomplex *x, const bla_integer *incx, bla_scomplex *ap);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap);

#endif
// end bla_hpr.h
// begin bla_hpr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *y, const bla_integer *incy, bla_scomplex *ap);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap);

#endif
// end bla_hpr2.h
// begin bla_spmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *ap, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_spmv.h
// begin bla_spr.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, bla_double *ap);
BLIS_EXPORT_BLAS int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap);

#endif
// end bla_spr.h
// begin bla_spr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, const bla_double *y, const bla_integer *incy, bla_double *ap);
BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, const bla_real *y, const bla_integer *incy, bla_real *ap);

#endif
// end bla_spr2.h
// begin bla_tpmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpmv.h
// begin bla_tpsv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpsv.h

// banded
// begin bla_gbmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer * incx, const bla_real *beta, bla_real *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex * y, const bla_integer *incy);

#endif
// end bla_gbmv.h
// begin bla_hbmv.h

#ifdef BLIS_ENABLE_BLAS

// hbmv: explicit prototypes for the two complex datatypes (c, z). Only y is
// passed through a non-const pointer.
BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hbmv.h
// begin bla_sbmv.h

#ifdef BLIS_ENABLE_BLAS

// sbmv: explicit prototypes for the two real datatypes (d, s); same argument
// layout as hbmv.
BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);

#endif
// end bla_sbmv.h
// begin bla_tbmv.h

#ifdef BLIS_ENABLE_BLAS

// tbmv: prototypes for all four datatypes. The matrix a is const; the vector
// x is passed through a non-const pointer.
BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbmv.h
// begin bla_tbsv.h

#ifdef BLIS_ENABLE_BLAS

// tbsv: same argument layout as tbmv, for all four datatypes.
BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbsv.h

// -- Level-3 BLAS prototypes --

// begin bla_gemm.h

//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemm ) #endif // end bla_gemm.h // begin bla_hemm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemm ) #endif // end bla_hemm.h // begin bla_herk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype_r* alpha, \ const ftype* a, const f77_int* lda, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( herk ) #endif // end bla_herk.h // begin bla_her2k.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2k ) #endif // end bla_her2k.h // begin bla_symm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( symm ) #endif // end bla_symm.h // begin bla_syrk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syrk ) #endif // end bla_syrk.h // begin bla_syr2k.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syr2k ) #endif // end bla_syr2k.h // begin bla_trmm.h // // Prototype BLAS-to-BLIS interfaces. 
//
// GENTPROT (trmm): prototype template whose symbol name is built by
// PASTEF77(ch,blasname); instantiated via INSERT_GENTPROT_BLAS (defined
// outside this excerpt) only when BLIS_ENABLE_BLAS is defined. The matrix a
// is const; b is passed through a non-const pointer.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int* m, \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       ftype* b, const f77_int* ldb \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmm )
#endif
// end bla_trmm.h
// begin bla_trsm.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT (trsm): same argument list as the trmm prototype template.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int* m, \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       ftype* b, const f77_int* ldb \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsm )
#endif
// end bla_trsm.h

// begin bla_gemm_check.h

#ifdef BLIS_ENABLE_BLAS

// General contract shared by every bla_*_check macro in this section:
//  - the macro expands to a brace-enclosed block inside the calling function;
//  - all size/stride arguments are pointers and are dereferenced;
//  - on the first failing test, info is set to a fixed code, the routine name
//    is formatted from dt_str and op_str, upper-cased, and reported through
//    xerbla, after which the macro executes `return;` -- so it may only be
//    used in functions returning void;
//  - the macro body refers only to its own parameters and locals.

// bla_gemm_check: transa/transb must each match "N", "C" or "T"; m, n, k must
// be non-negative; lda/ldb must be at least max(1, rows of a/b as stored),
// where the row count depends on the transposition flag; ldc >= max(1,*m).
#define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, notb; \
    f77_int conja, conjb; \
    f77_int ta, tb; \
    f77_int nrowa, nrowb; \
\
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else { nrowb = *n; } \
\
    if ( !nota && !conja && !ta ) \
        info = 1; \
    else if ( !notb && !conjb && !tb ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *n < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemm_check.h
// begin bla_hemm_check.h

#ifdef BLIS_ENABLE_BLAS

// bla_hemm_check (also reused for symm below): sidea must match "L" or "R",
// uploa "L" or "U"; m, n >= 0; lda >= max(1, *m if left else *n);
// ldb and ldc >= max(1,*m).
#define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int left, right; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
    right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( left ) { nrowa = *m; } \
    else { nrowa = *n; } \
\
    if ( !left && !right ) \
        info = 1; \
    else if ( !lower && !upper ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *n < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldb < bli_max( 1, *m ) ) \
        info = 9; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_hemm_check.h
// begin bla_herk_check.h

#ifdef BLIS_ENABLE_BLAS

// bla_herk_check: uploa must match "L" or "U"; transa must match "N" or "C"
// ("T" is rejected); m, k >= 0; lda >= max(1, *m if not transposed else *k);
// ldc >= max(1,*m).
// The triangle flag is read through the macro parameter uploa, so the check
// works regardless of what the caller's variable is called (the lookup does
// not rely on an identifier named uploc existing at the expansion site).
#define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, conja; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else { nrowa = *k; } \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_herk_check.h
// begin bla_her2k_check.h

#ifdef BLIS_ENABLE_BLAS

// bla_her2k_check: same flag rules as herk ("N" or "C" only for trans), with
// an additional ldb test that uses the same row count as lda.
#define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, conja; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else { nrowa = *k; } \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldb < bli_max( 1, nrowa ) ) \
        info = 9; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her2k_check.h
// begin bla_symm_check.h

#ifdef BLIS_ENABLE_BLAS

// symm shares hemm's argument checks.
#define bla_symm_check bla_hemm_check

#endif
// end bla_symm_check.h
// begin bla_syrk_check.h

#ifdef BLIS_ENABLE_BLAS

// bla_syrk_check: like herk's check, except for the trans rule: "N" and "T"
// are always accepted, and "C" is accepted only when the first character of
// dt_str is 's' or 'd' (is_r). dt_str is used to initialise a static char*,
// so it must be a constant expression such as a string literal.
// The triangle flag is read through the macro parameter uploa rather than
// through an identifier captured from the caller's scope.
#define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    f77_int info = 0; \
    f77_int is_r; \
    f77_int nota, ta, cta; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    static char* dt_cst = dt_str; \
\
    is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    cta = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else { nrowa = *k; } \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !ta && (is_r ? !cta : 1) ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_syrk_check.h
// begin bla_syr2k_check.h

#ifdef BLIS_ENABLE_BLAS

// bla_syr2k_check: same trans rule as syrk ("C" only accepted when dt_str
// starts with 's' or 'd'), plus an ldb test using the same row count as lda.
#define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int is_r; \
    f77_int nota, ta, cta; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    static char* dt_cst = dt_str; \
\
    is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \
    cta = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else { nrowa = *k; } \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !ta && (is_r ? !cta : 1) ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldb < bli_max( 1, nrowa ) ) \
        info = 9; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_syr2k_check.h
// begin bla_trmm_check.h

#ifdef BLIS_ENABLE_BLAS

// bla_trmm_check (also reused for trsm below): sidea "L"/"R", uploa "L"/"U",
// transa "N"/"T"/"C", diaga "U"/"N"; m, n >= 0;
// lda >= max(1, *m if left else *n); ldb >= max(1,*m). The info codes
// (1-6, 9, 11) match the argument positions in the trmm prototype template.
#define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \
{ \
    f77_int info = 0; \
    f77_int left, right; \
    f77_int lower, upper; \
    f77_int nota, ta, conja; \
    f77_int unita, nonua; \
    f77_int nrowa; \
\
    left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
    right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    unita = PASTEF770(lsame)( diaga, "U", (ftnlen)1, (ftnlen)1 ); \
    nonua = PASTEF770(lsame)( diaga, "N", (ftnlen)1, (ftnlen)1 ); \
\
    if ( left ) { nrowa = *m; } \
    else { nrowa = *n; } \
\
    if ( !left && !right ) \
        info = 1; \
    else if ( !lower && !upper ) \
        info = 2; \
    else if ( !nota && !ta && !conja ) \
        info = 3; \
    else if ( !unita && !nonua ) \
        info = 4; \
    else if ( *m < 0 ) \
        info = 5; \
    else if ( *n < 0 ) \
        info = 6; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 9; \
    else if ( *ldb < bli_max( 1, *m ) ) \
        info = 11; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_trmm_check.h
// begin bla_trsm_check.h

#ifdef BLIS_ENABLE_BLAS

// trsm shares trmm's argument checks.
#define bla_trsm_check bla_trmm_check

#endif
// end bla_trsm_check.h

// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT (axpby): two scalars (alpha, beta), const input vector x, and a
// vector y passed through a non-const pointer.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       const ftype* beta, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif
// end bla_axpby.h

// level-3
// begin bla_gemmt.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT (gemmt): gemm-like argument list with a leading uploc flag and a
// single dimension m in place of gemm's m and n.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int* m, \
       const f77_int* k, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* b, const f77_int* ldb, \
       const ftype* beta, \
       ftype* c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif
// end bla_gemmt.h
// begin bla_gemmt_check.h

#ifdef BLIS_ENABLE_BLAS

// bla_gemmt_check: uploc "L"/"U"; transa/transb each "N", "C" or "T";
// m, k >= 0; lda >= max(1, *m if a is not transposed else *k);
// ldb >= max(1, *k if b is not transposed else *m); ldc >= max(1,*m).
// The info codes (1-5, 8, 10, 13) match the argument positions in the gemmt
// prototype template.
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, notb; \
    f77_int conja, conjb; \
    f77_int ta, tb; \
    f77_int lower, upper; \
    f77_int nrowa, nrowb; \
\
    nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else { nrowb = *m; } \
\
    if ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja && !ta ) \
        info = 2; \
    else if ( !notb && !conjb && !tb ) \
        info = 3; \
    else if ( *m < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROT (gemm_batch): every gemm argument becomes an array (the matrices
// become arrays of pointers), followed by group_count and group_size.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa_array, \
       const f77_char* transb_array, \
       const f77_int* m_array, \
       const f77_int* n_array, \
       const f77_int* k_array, \
       const ftype* alpha_array, \
       const ftype** a_array, const f77_int* lda_array, \
       const ftype** b_array, const f77_int* ldb_array, \
       const ftype* beta_array, \
       ftype** c_array, const f77_int* ldc_array, \
       const f77_int* group_count, \
       const f77_int* group_size \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif
// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) expands to the prototype of
// the routine named by PASTEF77(ch,blasname); ftype_r and chr are accepted
// but not used by this prototype.
//
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  n, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( gemm3m )
#endif

// end bla_gemm3m.h
// begin bla_gemm3m_check.h

#ifdef BLIS_ENABLE_BLAS

//
// bla_gemm3m_check: argument validation for the gemm3m BLAS wrapper.
//
// The first failing test sets info to the value used below (1 transa,
// 2 transb, 3 m, 4 n, 5 k, 8 lda, 10 ldb, 13 ldc). If info is non-zero, the
// upper-cased string built from dt_str and op_str is passed to xerbla
// together with info, and the macro executes `return;` -- so it may only be
// expanded inside a function returning void.
//
#define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota,  notb; \
    f77_int conja, conjb; \
    f77_int ta,    tb; \
    f77_int nrowa, nrowb; \
\
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    /* Row counts of a and b as stored, used for the lda/ldb checks. */ \
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else        { nrowb = *n; } \
\
    if      ( !nota && !conja && !ta ) \
        info = 1; \
    else if ( !notb && !conjb && !tb ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *n < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m    ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemm3m_check.h

// -- Fortran-compatible APIs to BLIS functions --

// begin b77_thread.h


//
// Prototype Fortran-compatible BLIS interfaces.
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); // end b77_thread.h #endif // BLIS_ENABLE_BLAS // end bli_blas.h // -- CBLAS compatibility layer -- // begin bli_cblas.h #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. // begin cblas.h #ifndef CBLAS_H #define CBLAS_H #include // skipped // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. 
// Exactly one of BLIS_ARCH_64 / BLIS_ARCH_32 is defined by this test.
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
    defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64)
  #define BLIS_ARCH_64
#else
  #define BLIS_ARCH_32
#endif

// Determine the target operating system.
// Only performed when BLIS_ENABLE_SYSTEM is defined; otherwise BLIS_OS_NONE
// is defined instead.
#if defined(BLIS_ENABLE_SYSTEM)
  #if defined(_WIN32) || defined(__CYGWIN__)
    #define BLIS_OS_WINDOWS 1
  #elif defined(__gnu_hurd__)
    #define BLIS_OS_GNU 1
  #elif defined(__APPLE__) || defined(__MACH__)
    #define BLIS_OS_OSX 1
  #elif defined(__ANDROID__)
    #define BLIS_OS_ANDROID 1
  #elif defined(__linux__)
    #define BLIS_OS_LINUX 1
  #elif defined(__bgq__)
    #define BLIS_OS_BGQ 1
  #elif defined(__bg__)
    #define BLIS_OS_BGP 1
  #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
        defined(__bsdi__) || defined(__DragonFly__) || \
        defined(__FreeBSD_kernel__) || defined(__HAIKU__)
    #define BLIS_OS_BSD 1
  #elif defined(EMSCRIPTEN)
    #define BLIS_OS_EMSCRIPTEN
  #else
    #error "Cannot determine operating system"
  #endif
#else // #if defined(BLIS_DISABLE_SYSTEM)
  #define BLIS_OS_NONE
#endif

// A few changes that may be necessary in Windows environments.
#if BLIS_OS_WINDOWS

  // Include Windows header file.
  // NOTE(review): the header name was missing from this copy of the file and
  // has been restored from the upstream BLIS sources -- confirm.
  #define WIN32_LEAN_AND_MEAN
  #define VC_EXTRALEAN
  #include <windows.h> // skipped

  #if !defined(__clang__) && !defined(__GNUC__)
    // Undefine attribute specifiers in Windows.
    #define __attribute__(x)

    // Undefine restrict.
    #define restrict
  #endif

#endif

// time.h provides clock_gettime().
#if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_X86_64_NO_SKX // Enabled sub-configurations (config_list) #define BLIS_CONFIG_HASWELL #define BLIS_CONFIG_SANDYBRIDGE #define BLIS_CONFIG_PENRYN #define BLIS_CONFIG_EXCAVATOR #define BLIS_CONFIG_STEAMROLLER #define BLIS_CONFIG_PILEDRIVER #define BLIS_CONFIG_BULLDOZER #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_ZEN #define BLIS_KERNELS_HASWELL #define BLIS_KERNELS_SANDYBRIDGE #define BLIS_KERNELS_PENRYN #define BLIS_KERNELS_PILEDRIVER #define BLIS_KERNELS_BULLDOZER #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif 
#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. 
#endif


// -- MULTITHREADING -----------------------------------------------------------

// Enable multithreading via POSIX threads.
#ifdef BLIS_ENABLE_PTHREADS
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif

// Enable multithreading via OpenMP.
#ifdef BLIS_ENABLE_OPENMP
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif

// Perform a sanity check to make sure the user doesn't try to enable
// both OpenMP and pthreads.
#if defined ( BLIS_ENABLE_OPENMP ) && \
    defined ( BLIS_ENABLE_PTHREADS )
  #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined."
#endif

// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP
// or pthreads are enabled. This macro is useful in situations when
// we want to detect use of either OpenMP or pthreads (as opposed
// to neither being used).
#if defined ( BLIS_ENABLE_OPENMP ) || \
    defined ( BLIS_ENABLE_PTHREADS )
  #define BLIS_ENABLE_MULTITHREADING
#endif

// Enable the use of prime numbers of threads when requesting automatic thread
// factorization. When disabled, requesting a prime number of threads will
// result in a reduction (by one) of the number of threads, provided that the
// prime number exceeds a minimum threshold (see below).
#ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS
  #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#else
  // Default behavior is disabled.
  #undef  BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled.
  #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#endif

// Set the maximum requested number of threads that BLIS will accept from the
// user that may be prime. If a larger prime number of threads is requested,
// it will be reduced by one to allow for more efficient thread factorizations.
// This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined.
#ifndef BLIS_NT_MAX_PRIME
  #define BLIS_NT_MAX_PRIME 11
#endif


// -- MIXED DATATYPE SUPPORT ---------------------------------------------------

// Enable mixed datatype support?
#ifdef BLIS_DISABLE_MIXED_DT
  #undef BLIS_ENABLE_GEMM_MD
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD
#endif

// Enable memory-intensive optimizations for mixed datatype support?
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
  #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#endif


// -- MISCELLANEOUS OPTIONS ----------------------------------------------------

// Do NOT require the cross-blocksize constraints. That is, do not enforce
// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY
// needed when implementing trsm_r by allowing the right-hand matrix B to
// be triangular.
#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
  #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#endif


// -- BLAS COMPATIBILITY LAYER -------------------------------------------------

// Enable the BLAS compatibility layer?
#ifdef BLIS_DISABLE_BLAS
  #undef BLIS_ENABLE_BLAS
#else
  // Default behavior is enabled.
  #undef  BLIS_ENABLE_BLAS // In case user explicitly enabled.
  #define BLIS_ENABLE_BLAS
#endif

// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#ifndef BLIS_BLAS_INT_TYPE_SIZE
  #define BLIS_BLAS_INT_TYPE_SIZE 32
#endif

// By default, the level-3 BLAS routines are implemented by directly calling
// the BLIS object API. Alternatively, they may first call the typed BLIS
// API, which will then call the object API.
//#define BLIS_BLAS3_CALLS_TAPI
#ifdef BLIS_BLAS3_CALLS_TAPI
  #undef  BLIS_BLAS3_CALLS_OAPI
#else
  // Default behavior is to call object API directly.
  #undef  BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled.
  #define BLIS_BLAS3_CALLS_OAPI
#endif


// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------

// Enable the CBLAS compatibility layer?
// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer
// regardless of whether or not it was explicitly enabled above. Furthermore,
// the CBLAS compatibility layer will use the integer type size definition
// specified above when defining the size of its own integers (regardless of
// whether the BLAS layer was enabled directly or indirectly).
#ifdef BLIS_ENABLE_CBLAS
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif


// -- SHARED LIBRARY SYMBOL EXPORT ---------------------------------------------

// When building shared libraries, we can control which symbols are exported for
// linking by external applications. BLIS annotates all function prototypes that
// are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing
// a similar role for BLAS compatibility routines). Which symbols are exported
// is controlled by the default symbol visibility, as specified by the gcc option
// -fvisibility=[default|hidden]. The default for this option is 'default', or,
// "public", which, if allowed to stand, causes all symbols in BLIS to be
// linkable from the outside. But when compiling with -fvisibility=hidden, all
// symbols start out hidden (that is, restricted only for internal use by BLIS),
// with that setting overridden only for function prototypes or variable
// declarations that are annotated with BLIS_EXPORT_BLIS.
#ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overriden by the main // program simply by providing a new definition. However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. 
#include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. // NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. #ifndef TRUE #define TRUE true #endif #ifndef FALSE #define FALSE false #endif // -- Special-purpose integers -- // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. 
#ifndef _DEFINED_DIM_T #define _DEFINED_DIM_T typedef gint_t dim_t; // dimension type #endif typedef gint_t inc_t; // increment/stride type typedef gint_t doff_t; // diagonal offset type typedef guint_t siz_t; // byte size type typedef uint32_t objbits_t; // object information bit field // -- Real types -- // Define the number of floating-point types supported, and the size of the // largest type. #define BLIS_NUM_FP_TYPES 4 #define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) // There are some places where we need to use sizeof() inside of a C // preprocessor #if conditional, and so here we define the various sizes // for those purposes. #define BLIS_SIZEOF_S 4 // sizeof(float) #define BLIS_SIZEOF_D 8 // sizeof(double) #define BLIS_SIZEOF_C 8 // sizeof(scomplex) #define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) // -- Complex types -- #ifdef BLIS_ENABLE_C99_COMPLEX #if __STDC_VERSION__ >= 199901L #include // skipped // Typedef official complex types to BLIS complex type names. typedef float complex scomplex; typedef double complex dcomplex; #else #error "Configuration requested C99 complex types, but C99 does not appear to be supported." #endif #else // ifndef BLIS_ENABLE_C99_COMPLEX // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_SCOMPLEX #define _DEFINED_SCOMPLEX typedef struct scomplex { float real; float imag; } scomplex; #endif // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DCOMPLEX #define _DEFINED_DCOMPLEX typedef struct dcomplex { double real; double imag; } dcomplex; #endif #endif // BLIS_ENABLE_C99_COMPLEX // -- Atom type -- // Note: atom types are used to hold "bufferless" scalar object values. Note // that it needs to be as large as the largest possible scalar value we might // want to hold. Thus, for now, it is a dcomplex. 
typedef dcomplex atom_t; // -- Fortran-77 types -- // Note: These types are typically only used by BLAS compatibility layer, but // we must define them even when the compatibility layer isn't being built // because they also occur in bli_slamch() and bli_dlamch(). // Define f77_int depending on what size of integer was requested. #if BLIS_BLAS_INT_TYPE_SIZE == 32 typedef int32_t f77_int; #elif BLIS_BLAS_INT_TYPE_SIZE == 64 typedef int64_t f77_int; #else typedef long int f77_int; #endif typedef char f77_char; typedef float f77_float; typedef double f77_double; typedef scomplex f77_scomplex; typedef dcomplex f77_dcomplex; // -- Misc. function pointer types -- // Note: This type should be used in any situation where the address of a // *function* will be conveyed or stored prior to it being typecast back // to the correct function type. It does not need to be used when conveying // or storing the address of *data* (such as an array of float or double). //typedef void (*void_fp)( void ); typedef void* void_fp; // Typedef function pointer types for malloc() and free() substitutes. 
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );


//
// -- BLIS info bit field offsets ----------------------------------------------
//

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT                0
#define   BLIS_SCALAR_DOMAIN_SHIFT          0
#define   BLIS_SCALAR_PREC_SHIFT            1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )


//
// -- BLIS enumerated type value definitions -----------------------------------
//

#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )


//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

typedef enum
{
    BLIS_NO_TRANSPOSE      = 0x0,
    BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
    BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
    BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

typedef enum
{
    BLIS_NO_CONJUGATE      = 0x0,
    BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

typedef enum
{
    BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
    BLIS_LOWER             = BLIS_BITVAL_LOWER,
    BLIS_UPPER             = BLIS_BITVAL_UPPER,
    BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

typedef enum
{
    BLIS_LEFT              = 0x0,
    BLIS_RIGHT
} side_t;

typedef enum
{
    BLIS_NONUNIT_DIAG      = 0x0,
    BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

typedef enum
{
    BLIS_NO_INVERT_DIAG    = 0x0,
    BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

typedef enum
{
    BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
    BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
    BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
    BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;


// -- Data type --

typedef enum
{
    BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
    BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
    BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
    BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
    BLIS_INT               = BLIS_BITVAL_INT_TYPE,
    BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
    BLIS_DT_LO             = BLIS_FLOAT,
    BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

typedef enum
{
    BLIS_REAL              = BLIS_BITVAL_REAL,
    BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

typedef enum
{
    BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
    BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;


// -- Pack schema type --

typedef enum
{
    BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
    BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
    BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
    BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
    BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
    BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
    BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
    BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
    BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
    BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
    BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3


// -- Pack order type --

typedef enum
{
    BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
    BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,

    BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
    BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;


// -- Pack buffer type --

typedef enum
{
    BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
    BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
    BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
    BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;


// -- Partitioning direction --

typedef enum
{
    BLIS_FWD,
    BLIS_BWD
} dir_t;


// -- Subpartition type --

typedef enum
{
    BLIS_SUBPART0,
    BLIS_SUBPART1,
    BLIS_SUBPART2,
    BLIS_SUBPART1AND0,
    BLIS_SUBPART1AND2,
    BLIS_SUBPART1A,
    BLIS_SUBPART1B,
    BLIS_SUBPART00,
    BLIS_SUBPART10,
    BLIS_SUBPART20,
    BLIS_SUBPART01,
    BLIS_SUBPART11,
    BLIS_SUBPART21,
    BLIS_SUBPART02,
    BLIS_SUBPART12,
    BLIS_SUBPART22
} subpart_t;


// -- Matrix dimension type --

typedef enum
{
    BLIS_M = 0,
    BLIS_N = 1
} mdim_t;


// -- Machine parameter types --

typedef enum
{
    BLIS_MACH_EPS = 0,
    BLIS_MACH_SFMIN,
    BLIS_MACH_BASE,
    BLIS_MACH_PREC,
    BLIS_MACH_NDIGMANT,
    BLIS_MACH_RND,
    BLIS_MACH_EMIN,
    BLIS_MACH_RMIN,
    BLIS_MACH_EMAX,
    BLIS_MACH_RMAX,
    BLIS_MACH_EPS2
} machval_t;

#define BLIS_NUM_MACH_PARAMS   11
#define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2


// -- Induced method types --

typedef enum
{
    BLIS_1M        = 0,
    BLIS_NAT,
    BLIS_IND_FIRST = 0,
    BLIS_IND_LAST  = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, 
BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // 
- keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. } bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. 
// Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! 
// -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: 
Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. 
//num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. typedef struct constdata_s { float s; double d; scomplex c; dcomplex z; gint_t i; } constdata_t; // // -- BLIS object type definitions --------------------------------------------- // // Forward declarations for function pointer types struct obj_s; struct cntx_s; struct rntm_s; struct thrinfo_s; typedef void (*obj_pack_fn_t) ( struct obj_s* a, struct obj_s* ap, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) ( struct obj_s* a, struct obj_s* b, struct obj_s* c, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef struct obj_s { // Basic fields struct obj_s* root; dim_t off[2]; dim_t dim[2]; doff_t diag_off; objbits_t info; objbits_t info2; siz_t elem_size; void* buffer; inc_t rs; inc_t cs; inc_t is; // Bufferless scalar storage atom_t scalar; // Pack-related fields dim_t m_padded; // m dimension of matrix, including any padding dim_t n_padded; // n dimension of matrix, including any padding inc_t ps; // panel stride (distance to next panel) inc_t pd; // panel dimension (the "width" of a panel: // usually MR or NR) dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel // User-customizable fields obj_pack_fn_t pack_fn; void* pack_params; obj_ker_fn_t ker_fn; void* ker_params; } obj_t; // Pre-initializors. 
Things that must be set afterwards: // - root object pointer // - info bitfields: dt, target_dt, exec_dt, comp_dt // - info2 bitfields: scalar_dt // - elem_size // - dims, strides // - buffer // - internal scalar buffer (must always set imaginary component) #define BLIS_OBJECT_INITIALIZER \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 0, 0 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( float ), \ \ .buffer = NULL, \ .rs = 0, \ .cs = 0, \ .is = 1, \ \ .scalar = { 0.0, 0.0 }, \ \ .m_padded = 0, \ .n_padded = 0, \ .ps = 0, \ .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ \ .pack_fn = NULL, \ .pack_params = NULL, \ .ker_fn = NULL, \ .ker_params = NULL \ } #define BLIS_OBJECT_INITIALIZER_1X1 \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( float ), \ \ .buffer = NULL, \ .rs = 0, \ .cs = 0, \ .is = 1, \ \ .scalar = { 0.0, 0.0 }, \ \ .m_padded = 0, \ .n_padded = 0, \ .ps = 0, \ .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ \ .pack_fn = NULL, \ .pack_params = NULL, \ .ker_fn = NULL, \ .ker_params = NULL \ } // Define these macros here since they must be updated if contents of // obj_t changes. 
// Make b a member-for-member duplicate of a. The copy is shallow: pointer
// members (root, buffer, pack_params, ker_params and the two function
// pointers) are copied as pointers, so b refers to the same underlying data
// as a.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
	// Every member of obj_t is transferred, so a single struct assignment
	// covers root, off[], dim[], diag_off, info, info2, elem_size, buffer,
	// rs/cs/is, scalar, the padded/panel dimensions, ps, pd, and the
	// user-customizable pack/ker function and parameter pointers.
	*b = *a;
}

// Seed b from a in preparation for describing a subpartition of a. All
// members are copied except dim[0] and dim[1], which are left as they are in
// b because the caller is expected to overwrite them.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
	// Identity of the parent object and where its data lives.
	b->root      = a->root;
	b->buffer    = a->buffer;
	b->elem_size = a->elem_size;

	// Property bitfields.
	b->info  = a->info;
	b->info2 = a->info2;

	// Offsets into the root object. The dimensions (dim[0], dim[1]) are
	// intentionally not touched here.
	b->off[0]   = a->off[0];
	b->off[1]   = a->off[1];
	b->diag_off = a->diag_off;

	// Row, column and imaginary strides.
	b->rs = a->rs;
	b->cs = a->cs;
	b->is = a->is;

	// Bufferless scalar storage.
	b->scalar = a->scalar;

	// Pack-related geometry.
	// NOTE: a copy of a pack_mem member used to sit here and remains
	// disabled; obj_t no longer carries that member.
	b->m_padded = a->m_padded;
	b->n_padded = a->n_padded;
	b->m_panel  = a->m_panel;
	b->n_panel  = a->n_panel;
	b->ps       = a->ps;
	b->pd       = a->pd;

	// User-customizable hooks and their parameter blocks.
	b->pack_fn     = a->pack_fn;
	b->pack_params = a->pack_params;
	b->ker_fn      = a->ker_fn;
	b->ker_params  = a->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/include/linux-x86_64_no_zen2/000077500000000000000000000000001427272030600230065ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/linux-x86_64_no_zen2/blis.h000066400000000000000000046705641427272030600241360ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). 
// begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_X86_64_NO_ZEN2 // Enabled sub-configurations (config_list) #define BLIS_CONFIG_SKX #define BLIS_CONFIG_HASWELL #define BLIS_CONFIG_SANDYBRIDGE #define BLIS_CONFIG_PENRYN #define BLIS_CONFIG_ZEN #define BLIS_CONFIG_EXCAVATOR #define BLIS_CONFIG_STEAMROLLER #define BLIS_CONFIG_PILEDRIVER #define BLIS_CONFIG_BULLDOZER #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_SKX #define BLIS_KERNELS_SANDYBRIDGE #define BLIS_KERNELS_PENRYN #define BLIS_KERNELS_HASWELL #define BLIS_KERNELS_ZEN #define BLIS_KERNELS_PILEDRIVER #define BLIS_KERNELS_BULLDOZER #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef 
BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. 
#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). #if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_lang_defs.h #ifndef BLIS_LANG_DEFS_H #define BLIS_LANG_DEFS_H // -- Undefine restrict for C++ and C89/90 -- #ifdef __cplusplus // Language is C++; define restrict as nothing. #ifndef restrict #define restrict #endif #elif __STDC_VERSION__ >= 199901L // Language is C99 (or later); do nothing since restrict is recognized. #else // Language is pre-C99; define restrict as nothing. 
#ifndef restrict #define restrict #endif #endif // -- Define typeof() operator if using non-GNU compiler -- #ifndef __GNUC__ #define typeof __typeof__ #else #ifndef typeof #define typeof __typeof__ #endif #endif // -- BLIS Thread Local Storage Keyword -- // __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support __thread, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) #define BLIS_THREAD_LOCAL __thread #else #define BLIS_THREAD_LOCAL #endif // -- BLIS constructor/destructor function attribute -- // __attribute__((constructor/destructor)) is supported by GCC only. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support this, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__ICC) || defined(__INTEL_COMPILER) // ICC defines __GNUC__ but doesn't support this #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #elif defined(__clang__) // CLANG supports __attribute__, but its documentation doesn't // mention support for constructor/destructor. Compiling with // clang and testing shows that it does support. 
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. 
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. 
Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specified by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. #ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overridden by the main // program simply by providing a new definition.
However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // -- Common BLIS definitions -- // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// The guard tests BLIS_BLAS_INT_TYPE_SIZE, which is the macro this header
// uses for the BLAS integer width (it is also what selects f77_int). An
// identifier that is not defined evaluates to 0 inside #if, so testing any
// other spelling would silently turn this guard into a no-op.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
#undef  BLIS_INT_TYPE_SIZE
#define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested.
// gint_t is the signed general-purpose integer and guint_t its unsigned
// counterpart; any BLIS_INT_TYPE_SIZE other than 32 or 64 selects "long int".
#if   BLIS_INT_TYPE_SIZE == 32
typedef           int32_t  gint_t;
typedef          uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
typedef           int64_t  gint_t;
typedef          uint64_t guint_t;
#else
typedef   signed long int  gint_t;
typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h.
// Each definition is guarded so that a TRUE/FALSE already defined by an
// earlier header is left untouched.
#ifndef TRUE
#define TRUE  true
#endif

#ifndef FALSE
#define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
// NOTE: BLIS_MAX_TYPE_SIZE expands to sizeof(dcomplex), so it can only be
// used where the dcomplex type is already visible.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
// Byte sizes of the four floating-point datatypes as plain integer
// constants, usable where sizeof() is not (e.g. inside #if).
#define BLIS_SIZEOF_S      4  // sizeof(float)
#define BLIS_SIZEOF_D      8  // sizeof(double)
#define BLIS_SIZEOF_C      8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z      16 // sizeof(dcomplex)

// -- Complex types --

#ifdef BLIS_ENABLE_C99_COMPLEX

	#if __STDC_VERSION__ >= 199901L
		// The "complex" type specifier used below is provided by complex.h,
		// so the #include must name that header (a bare "#include" is a
		// preprocessing error). The "// skipped" marker is kept as found.
		#include <complex.h> // skipped

		// Typedef official complex types to BLIS complex type names.
		typedef  float complex scomplex;
		typedef double complex dcomplex;
	#else
		#error "Configuration requested C99 complex types, but C99 does not appear to be supported."
	#endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

	// This cpp guard provides a temporary hack to allow libflame
	// interoperability with BLIS.
	#ifndef _DEFINED_SCOMPLEX
	#define _DEFINED_SCOMPLEX
	// Single-precision complex number stored as a (real, imag) pair.
	typedef struct scomplex
	{
		float  real;
		float  imag;
	} scomplex;
	#endif

	// This cpp guard provides a temporary hack to allow libflame
	// interoperability with BLIS.
	#ifndef _DEFINED_DCOMPLEX
	#define _DEFINED_DCOMPLEX
	// Double-precision complex number stored as a (real, imag) pair.
	typedef struct dcomplex
	{
		double real;
		double imag;
	} dcomplex;
	#endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any BLIS_BLAS_INT_TYPE_SIZE other than 32 or 64 selects "long int".
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc.
// function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
// NOTE(review): the active definition below is a plain void*; the true
// function-pointer form is the one left commented out.
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
// malloc_ft takes a byte count and returns a pointer; free_ft takes the
// pointer to release and returns nothing.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );


//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Each *_SHIFT value is the position of the lowest bit of the corresponding
// field; the masks defined after this table are built by shifting a literal
// left by these amounts. Several names deliberately share a value: the
// "whole field" shift and the shift of the field's lowest sub-bit coincide
// (e.g. DATATYPE/DOMAIN at 0, CONJTRANS/TRANS at 3, UPLO/UPPER at 5).

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT               0
#define   BLIS_SCALAR_DOMAIN_SHIFT         0
#define   BLIS_SCALAR_PREC_SHIFT           1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// A *_BITS mask covers a multi-bit field; a *_BIT mask covers a single bit.

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
// Remaining "info" masks. Each one is a literal shifted left by the
// matching *_SHIFT offset; *_BITS covers a multi-bit field and *_BIT a
// single bit.
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
// NOTE(review): BLIS_COMP_DT_BITS is a three-bit literal shifted by
// BLIS_COMP_DT_SHIFT; if that shift is 29 and int is 32 bits wide, the
// result reaches the sign bit of a signed int. Confirm this is intended
// before touching the literal.
#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )


//
// -- BLIS enumerated type value definitions -----------------------------------
//

// The BLIS_BITVAL_* constants are the raw bit patterns that the enums
// further down take as their values; they are expressed in terms of the
// masks above so that an enum value can be OR-ed into / compared against
// the corresponding field directly.

// Domain and precision.
#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
// Datatypes: the four floating-point types are combinations of the domain
// and precision bits; INT and CONST use the literal values 0x04 and 0x05.
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05
// Transposition / conjugation.
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
// Stored region: UPPER and LOWER both include the diagonal bit; DENSE sets
// all three uplo bits.
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
// Diagonal handling.
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
// Pack schemas: every packed schema sets BLIS_PACK_BIT; the RC, PANEL and
// format (1E / 1R) bits then distinguish the variants. Note that
// PACKED_UNSPEC and PACKED_ROWS expand to the same value.
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT                                                            )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT                                         | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT                   | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT                    )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R  | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
// Pack ordering.
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
// Pack buffer kind (two-bit field).
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
// Matrix structure (two-bit field).
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )


//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

// Transposition and/or conjugation to apply to an operand.
typedef enum
{
	BLIS_NO_TRANSPOSE      = 0x0,
	BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
	BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
	BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

// Conjugation only.
typedef enum
{
	BLIS_NO_CONJUGATE      = 0x0,
	BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

// Which region of a matrix is stored/referenced.
typedef enum
{
	BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
	BLIS_LOWER             = BLIS_BITVAL_LOWER,
	BLIS_UPPER             = BLIS_BITVAL_UPPER,
	BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

// Operand side; BLIS_RIGHT takes the next value after BLIS_LEFT (i.e. 1).
typedef enum
{
	BLIS_LEFT              = 0x0,
	BLIS_RIGHT
} side_t;

// Whether the diagonal is unit.
typedef enum
{
	BLIS_NONUNIT_DIAG      = 0x0,
	BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

// Whether the diagonal is to be inverted.
typedef enum
{
	BLIS_NO_INVERT_DIAG    = 0x0,
	BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

// Matrix structure.
typedef enum
{
	BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
	BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
	BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
	BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;


// -- Data type --

// Numerical datatype. BLIS_DT_LO and BLIS_DT_HI alias the first and last
// floating-point datatypes (BLIS_FLOAT and BLIS_DCOMPLEX).
typedef enum
{
	BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
	BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
	BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
	BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
	BLIS_INT               = BLIS_BITVAL_INT_TYPE,
	BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
	BLIS_DT_LO             = BLIS_FLOAT,
	BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

// Domain component of a datatype.
typedef enum
{
	BLIS_REAL              = BLIS_BITVAL_REAL,
	BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

// Precision component of a datatype.
typedef enum
{
	BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
	BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;


// -- Pack schema type --

// BLIS_PACKED_UNSPEC and BLIS_PACKED_VECTOR are both defined from
// BLIS_BITVAL_PACKED_UNSPEC and therefore compare equal.
typedef enum
{
	BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
	BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
	BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
	BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
	BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
	BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
	BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
	BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
	BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
	BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3


// -- Pack order type --

// The two FWD values are both 0x0; only the REV values set a bit.
typedef enum
{
	BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
	BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,

	BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
	BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;


// -- Pack buffer type --

typedef enum
{
	BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
	BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
	BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
	BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;


// -- Partitioning direction --

// No explicit values: BLIS_FWD is 0 and BLIS_BWD is 1.
typedef enum
{
	BLIS_FWD,
	BLIS_BWD
} dir_t;


// -- Subpartition type --

// No explicit values: enumerators are numbered consecutively from 0 in the
// order listed.
typedef enum
{
	BLIS_SUBPART0,
	BLIS_SUBPART1,
	BLIS_SUBPART2,
	BLIS_SUBPART1AND0,
	BLIS_SUBPART1AND2,
	BLIS_SUBPART1A,
	BLIS_SUBPART1B,
	BLIS_SUBPART00,
	BLIS_SUBPART10,
	BLIS_SUBPART20,
	BLIS_SUBPART01,
	BLIS_SUBPART11,
	BLIS_SUBPART21,
	BLIS_SUBPART02,
	BLIS_SUBPART12,
	BLIS_SUBPART22
} subpart_t;


// -- Matrix dimension type --

typedef enum
{
	BLIS_M = 0,
	BLIS_N = 1
} mdim_t;


// -- Machine parameter types --

// Enumerators run consecutively from BLIS_MACH_EPS (0) to BLIS_MACH_EPS2
// (10); BLIS_NUM_MACH_PARAMS below must match that count.
typedef enum
{
	BLIS_MACH_EPS = 0,
	BLIS_MACH_SFMIN,
	BLIS_MACH_BASE,
	BLIS_MACH_PREC,
	BLIS_MACH_NDIGMANT,
	BLIS_MACH_RND,
	BLIS_MACH_EMIN,
	BLIS_MACH_RMIN,
	BLIS_MACH_EMAX,
	BLIS_MACH_RMAX,
	BLIS_MACH_EPS2
} machval_t;

#define BLIS_NUM_MACH_PARAMS   11
#define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2


// -- Induced method types --

// BLIS_IND_FIRST aliases BLIS_1M (both 0) and BLIS_IND_LAST aliases
// BLIS_NAT (both 1).
typedef enum
{
	BLIS_1M        = 0,
	BLIS_NAT,
	BLIS_IND_FIRST = 0,
	BLIS_IND_LAST  = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_1m   BLIS_1M
#define bli_nat  BLIS_NAT


// -- Kernel ID types --

// Level-1v kernel ids, numbered consecutively from 0;
// BLIS_NUM_LEVEL1V_KERS below must match the number of enumerators.
typedef enum
{
	BLIS_ADDV_KER  = 0,
	BLIS_AMAXV_KER,
	BLIS_AXPBYV_KER,
	BLIS_AXPYV_KER,
	BLIS_COPYV_KER,
	BLIS_DOTV_KER,
	BLIS_DOTXV_KER,
	BLIS_INVERTV_KER,
	BLIS_SCALV_KER,
	BLIS_SCAL2V_KER,
	BLIS_SETV_KER,
	BLIS_SUBV_KER,
	BLIS_SWAPV_KER,
	BLIS_XPBYV_KER
} l1vkr_t;

#define BLIS_NUM_LEVEL1V_KERS 14


// Level-1f kernel ids, numbered consecutively from 0.
typedef enum
{
	BLIS_AXPY2V_KER = 0,
	BLIS_DOTAXPYV_KER,
	BLIS_AXPYF_KER,
	BLIS_DOTXF_KER,
	BLIS_DOTXAXPYF_KER
} l1fkr_t;

#define BLIS_NUM_LEVEL1F_KERS 5


// Level-1m (packm/unpackm) kernel ids. The numeric value of each id equals
// the panel dimension in its name. The PACKM and UNPACKM families share the
// same value range 0..31, so e.g. BLIS_PACKM_4XK_KER and
// BLIS_UNPACKM_4XK_KER compare equal.
typedef enum
{
	BLIS_PACKM_0XK_KER  = 0,
	BLIS_PACKM_1XK_KER  = 1,
	BLIS_PACKM_2XK_KER  = 2,
	BLIS_PACKM_3XK_KER  = 3,
	BLIS_PACKM_4XK_KER  = 4,
	BLIS_PACKM_5XK_KER  = 5,
	BLIS_PACKM_6XK_KER  = 6,
	BLIS_PACKM_7XK_KER  = 7,
	BLIS_PACKM_8XK_KER  = 8,
	BLIS_PACKM_9XK_KER  = 9,
	BLIS_PACKM_10XK_KER = 10,
	BLIS_PACKM_11XK_KER = 11,
	BLIS_PACKM_12XK_KER = 12,
	BLIS_PACKM_13XK_KER = 13,
	BLIS_PACKM_14XK_KER = 14,
	BLIS_PACKM_15XK_KER = 15,
	BLIS_PACKM_16XK_KER = 16,
	BLIS_PACKM_17XK_KER = 17,
	BLIS_PACKM_18XK_KER = 18,
	BLIS_PACKM_19XK_KER = 19,
	BLIS_PACKM_20XK_KER = 20,
	BLIS_PACKM_21XK_KER = 21,
	BLIS_PACKM_22XK_KER = 22,
	BLIS_PACKM_23XK_KER = 23,
	BLIS_PACKM_24XK_KER = 24,
	BLIS_PACKM_25XK_KER = 25,
	BLIS_PACKM_26XK_KER = 26,
	BLIS_PACKM_27XK_KER = 27,
	BLIS_PACKM_28XK_KER = 28,
	BLIS_PACKM_29XK_KER = 29,
	BLIS_PACKM_30XK_KER = 30,
	BLIS_PACKM_31XK_KER = 31,

	BLIS_UNPACKM_0XK_KER  = 0,
	BLIS_UNPACKM_1XK_KER  = 1,
	BLIS_UNPACKM_2XK_KER  = 2,
	BLIS_UNPACKM_3XK_KER  = 3,
	BLIS_UNPACKM_4XK_KER  = 4,
	BLIS_UNPACKM_5XK_KER  = 5,
	BLIS_UNPACKM_6XK_KER  = 6,
	BLIS_UNPACKM_7XK_KER  = 7,
	BLIS_UNPACKM_8XK_KER  = 8,
	BLIS_UNPACKM_9XK_KER  = 9,
	BLIS_UNPACKM_10XK_KER = 10,
	BLIS_UNPACKM_11XK_KER = 11,
	BLIS_UNPACKM_12XK_KER = 12,
	BLIS_UNPACKM_13XK_KER = 13,
	BLIS_UNPACKM_14XK_KER = 14,
	BLIS_UNPACKM_15XK_KER = 15,
	BLIS_UNPACKM_16XK_KER = 16,
	BLIS_UNPACKM_17XK_KER = 17,
	BLIS_UNPACKM_18XK_KER = 18,
	BLIS_UNPACKM_19XK_KER = 19,
	BLIS_UNPACKM_20XK_KER = 20,
	BLIS_UNPACKM_21XK_KER = 21,
	BLIS_UNPACKM_22XK_KER = 22,
	BLIS_UNPACKM_23XK_KER = 23,
	BLIS_UNPACKM_24XK_KER = 24,
	BLIS_UNPACKM_25XK_KER = 25,
	BLIS_UNPACKM_26XK_KER = 26,
	BLIS_UNPACKM_27XK_KER = 27,
	BLIS_UNPACKM_28XK_KER = 28,
	BLIS_UNPACKM_29XK_KER = 29,
	BLIS_UNPACKM_30XK_KER = 30,
	BLIS_UNPACKM_31XK_KER = 31

} l1mkr_t;

#define BLIS_NUM_PACKM_KERS   32
#define BLIS_NUM_UNPACKM_KERS 32


// Level-3 microkernel ids, numbered consecutively from 0.
typedef enum
{
	BLIS_GEMM_UKR = 0,
	BLIS_GEMMTRSM_L_UKR,
	BLIS_GEMMTRSM_U_UKR,
	BLIS_TRSM_L_UKR,
	BLIS_TRSM_U_UKR
} l3ukr_t;

#define BLIS_NUM_LEVEL3_UKRS 5


// Microkernel implementation kind, numbered consecutively from 0.
typedef enum
{
	BLIS_REFERENCE_UKERNEL = 0,
	BLIS_VIRTUAL_UKERNEL,
	BLIS_OPTIMIZED_UKERNEL,
	BLIS_NOTAPPLIC_UKERNEL
} kimpl_t;

#define BLIS_NUM_UKR_IMPL_TYPES 4


// Disabled: the l3sup_t enum below is compiled out by the #if 0.
#if 0
typedef enum
{
	// RV = row-stored, contiguous vector-loading
	// RG = row-stored, non-contiguous gather-loading
	// CV = column-stored, contiguous vector-loading
	// CG = column-stored, non-contiguous gather-loading

	// RD = row-stored, dot-based
	// CD = col-stored, dot-based

	// RC = row-stored, column-times-column
	// CR = column-stored, row-times-row

	// GX = general-stored generic implementation

	BLIS_GEMMSUP_RV_UKR = 0,
	BLIS_GEMMSUP_RG_UKR,
	BLIS_GEMMSUP_CV_UKR,
	BLIS_GEMMSUP_CG_UKR,

	BLIS_GEMMSUP_RD_UKR,
	BLIS_GEMMSUP_CD_UKR,

	BLIS_GEMMSUP_RC_UKR,
	BLIS_GEMMSUP_CR_UKR,

	BLIS_GEMMSUP_GX_UKR,
} l3sup_t;

#define BLIS_NUM_LEVEL3_SUP_UKRS 9
#endif


// Only the nine enumerators before the #if 0 are compiled;
// BLIS_NUM_3OP_RC_COMBOS below matches that count.
typedef enum
{
	// 3-operand storage combinations
	BLIS_RRR = 0,
	BLIS_RRC, // 1
	BLIS_RCR, // 2
	BLIS_RCC, // 3
	BLIS_CRR, // 4
	BLIS_CRC, // 5
	BLIS_CCR, // 6
	BLIS_CCC, // 7
	BLIS_XXX, // 8

#if 0
	BLIS_RRG,
	BLIS_RCG,
	BLIS_RGR,
	BLIS_RGC,
	BLIS_RGG,
	BLIS_CRG,
	BLIS_CCG,
	BLIS_CGR,
	BLIS_CGC,
	BLIS_CGG,
	BLIS_GRR,
	BLIS_GRC,
	BLIS_GRG,
	BLIS_GCR,
	BLIS_GCC,
	BLIS_GCG,
	BLIS_GGR,
	BLIS_GGC,
	BLIS_GGG,
#endif
} stor3_t;

#define BLIS_NUM_3OP_RC_COMBOS 9
//#define BLIS_NUM_3OP_RCG_COMBOS 27


// Disabled: thridx_t is compiled out, but BLIS_NUM_LOOPS (which equals the
// number of enumerators it lists) is still defined.
#if 0
typedef enum
{
	BLIS_JC_IDX = 0,
	BLIS_PC_IDX,
	BLIS_IC_IDX,
	BLIS_JR_IDX,
	BLIS_IR_IDX,
	BLIS_PR_IDX
} thridx_t;
#endif

#define BLIS_NUM_LOOPS 6


// -- Operation ID type --

typedef enum
{
//
// NOTE: If/when additional type values are added to this enum,
// you must either:
// - keep the level-3 values (starting with _GEMM) beginning at
//   index 0; or
// - if the value range is moved such that it does not begin at
//   index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START
//   value that can be subtracted from the opid_t value to map it
//   to a zero-based range.
// This is needed because these level-3 opid_t values are used in
// bli_l3_ind.c to index into arrays.
//
	BLIS_GEMM = 0,
	BLIS_GEMMT,
	BLIS_HEMM,
	BLIS_HERK,
	BLIS_HER2K,
	BLIS_SYMM,
	BLIS_SYRK,
	BLIS_SYR2K,
	BLIS_TRMM3,
	BLIS_TRMM,
	BLIS_TRSM,

	BLIS_NOID
} opid_t;

// Counts the enumerators before BLIS_NOID, so BLIS_NOID has this value.
#define BLIS_NUM_LEVEL3_OPS 11


// -- Blocksize ID type --

typedef enum
{
	// NOTE: the level-3 blocksizes MUST be indexed starting at zero.
	// At one point, we made this assumption in bli_cntx_set_blkszs()
	// and friends.

	BLIS_KR = 0,
	BLIS_MR,
	BLIS_NR,
	BLIS_MC,
	BLIS_KC,
	BLIS_NC,

	BLIS_M2, // level-2 blocksize in m dimension
	BLIS_N2, // level-2 blocksize in n dimension

	BLIS_AF, // level-1f axpyf fusing factor
	BLIS_DF, // level-1f dotxf fusing factor
	BLIS_XF, // level-1f dotxaxpyf fusing factor

	BLIS_NO_PART  // used as a placeholder when blocksizes are not applicable.
} bszid_t;

// Counts the enumerators before BLIS_NO_PART, so BLIS_NO_PART has this value.
#define BLIS_NUM_BLKSZS 11


// -- Threshold ID type --

typedef enum
{
	BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension
	BLIS_NT,     // level-3 small/unpacked matrix threshold in n dimension
	BLIS_KT      // level-3 small/unpacked matrix threshold in k dimension

} threshid_t;

#define BLIS_NUM_THRESH 3


// -- Architecture ID type --

// NOTE: This typedef enum must be kept up-to-date with the arch_t
// string array in bli_arch.c. Whenever values are added/inserted
// OR if values are rearranged, be sure to update the string array
// in bli_arch.c.

typedef enum
{
	// NOTE: The C language standard guarantees that the first enum value
	// starts at 0.

	// Intel
	BLIS_ARCH_SKX,
	BLIS_ARCH_KNL,
	BLIS_ARCH_KNC,
	BLIS_ARCH_HASWELL,
	BLIS_ARCH_SANDYBRIDGE,
	BLIS_ARCH_PENRYN,

	// AMD
	BLIS_ARCH_ZEN3,
	BLIS_ARCH_ZEN2,
	BLIS_ARCH_ZEN,
	BLIS_ARCH_EXCAVATOR,
	BLIS_ARCH_STEAMROLLER,
	BLIS_ARCH_PILEDRIVER,
	BLIS_ARCH_BULLDOZER,

	// ARM
	BLIS_ARCH_ARMSVE,
	BLIS_ARCH_A64FX,
	BLIS_ARCH_FIRESTORM,
	BLIS_ARCH_THUNDERX2,
	BLIS_ARCH_CORTEXA57,
	BLIS_ARCH_CORTEXA53,
	BLIS_ARCH_CORTEXA15,
	BLIS_ARCH_CORTEXA9,

	// IBM/Power
	BLIS_ARCH_POWER10,
	BLIS_ARCH_POWER9,
	BLIS_ARCH_POWER7,
	BLIS_ARCH_BGQ,

	// Generic architecture/configuration
	BLIS_ARCH_GENERIC,

	// The total number of defined architectures. This must be last in the
	// list of enums since its definition assumes that the previous enum
	// value (BLIS_ARCH_GENERIC) is given index num_archs-1.
	BLIS_NUM_ARCHS

} arch_t;


//
// -- BLIS misc. structure types -----------------------------------------------
//

// This header must be included here (or earlier) because definitions it
// provides are needed in the pool_t and related structs.
// begin bli_pthread.h

#ifndef BLIS_PTHREAD_H
#define BLIS_PTHREAD_H

// This section declares a small pthread-like API, bli_pthread_*(). Exactly
// one of the three branches below supplies the underlying types and static
// initializer macros; the function prototypes that follow are shared by all
// three branches.

// -- Type and macro definitions -----------------------------------------------

#if defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of "dummy" code that doesn't depend on POSIX threads or any other
// threading mechanism. See issue #454 to see the use case that prompted this
// feature.
// NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE!

// -- pthread types --

// Every type is a plain int placeholder in this branch.
typedef int bli_pthread_t;
typedef int bli_pthread_attr_t;
typedef int bli_pthread_mutex_t;
typedef int bli_pthread_mutexattr_t;
typedef int bli_pthread_cond_t;
typedef int bli_pthread_condattr_t;
typedef int bli_pthread_once_t;
typedef int bli_pthread_barrier_t;
typedef int bli_pthread_barrierattr_t;

// -- pthreads macros --

#define BLIS_PTHREAD_MUTEX_INITIALIZER 0
#define BLIS_PTHREAD_COND_INITIALIZER  0
#define BLIS_PTHREAD_ONCE_INIT         0

#elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of Windows API calls.

// -- pthread types --

// A thread is represented by its Windows HANDLE plus a slot for the value
// returned by the thread's start routine.
typedef struct
{
    HANDLE handle;
    void*  retval;
} bli_pthread_t;
// The attribute types are void in this branch, so attribute objects cannot be
// instantiated; callers can only pass pointers (presumably NULL -- confirm
// against the implementation).
typedef void bli_pthread_attr_t;
typedef SRWLOCK bli_pthread_mutex_t;
typedef void bli_pthread_mutexattr_t;
typedef CONDITION_VARIABLE bli_pthread_cond_t;
typedef void bli_pthread_condattr_t;
typedef INIT_ONCE bli_pthread_once_t;
// Barrier state is built from a mutex, a condition variable and two counters.
// NOTE(review): count/tripCount look like "threads required" and "threads
// arrived so far", but which is which is not visible here -- confirm in
// bli_pthread.c.
typedef struct
{
    bli_pthread_mutex_t mutex;
    bli_pthread_cond_t  cond;
    int                 count;
    int                 tripCount;
} bli_pthread_barrier_t;
typedef void bli_pthread_barrierattr_t;

// -- pthreads macros --

#define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT
#define BLIS_PTHREAD_ONCE_INIT         INIT_ONCE_STATIC_INIT
#define BLIS_PTHREAD_COND_INITIALIZER  CONDITION_VARIABLE_INIT

#else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER)

// NOTE(review): the operand of the #include below is missing in this copy of
// the flattened header. The pthread_* types used right after it suggest it
// was <pthread.h> -- confirm against the upstream bli_pthread.h.
#include // skipped

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of the corresponding pthread_*() types, macros, and function calls.

// -- pthread types --

typedef pthread_t           bli_pthread_t;
typedef pthread_attr_t      bli_pthread_attr_t;
typedef pthread_mutex_t     bli_pthread_mutex_t;
typedef pthread_mutexattr_t bli_pthread_mutexattr_t;
typedef pthread_cond_t      bli_pthread_cond_t;
typedef pthread_condattr_t  bli_pthread_condattr_t;
typedef pthread_once_t      bli_pthread_once_t;

#if defined(__APPLE__)

// For OS X, we must define the barrier types ourselves since Apple does
// not implement barriers in their variant of pthreads.

typedef void bli_pthread_barrierattr_t;

// Same field layout as the barrier type declared in the _MSC_VER branch.
typedef struct
{
    bli_pthread_mutex_t mutex;
    bli_pthread_cond_t  cond;
    int                 count;
    int                 tripCount;
} bli_pthread_barrier_t;

#else

// For other non-Windows OSes (primarily Linux), we can define the barrier
// types in terms of existing pthreads barrier types since we expect they
// will be provided by the pthreads implementation.

typedef pthread_barrier_t     bli_pthread_barrier_t;
typedef pthread_barrierattr_t bli_pthread_barrierattr_t;

#endif

// -- pthreads macros --

#define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#define BLIS_PTHREAD_COND_INITIALIZER  PTHREAD_COND_INITIALIZER
#define BLIS_PTHREAD_ONCE_INIT         PTHREAD_ONCE_INIT

#endif

// -- Function definitions -----------------------------------------------------

// The prototypes below mirror the parameter lists of the pthread functions
// named in each sub-heading, with every pthread_* type replaced by its
// bli_pthread_* counterpart.

// -- pthread_create(), pthread_join() --

BLIS_EXPORT_BLIS int bli_pthread_create
     (
       bli_pthread_t*            thread,
       const bli_pthread_attr_t* attr,
       void*                   (*start_routine)(void*),
       void*                     arg
     );

BLIS_EXPORT_BLIS int bli_pthread_join
     (
       bli_pthread_t thread,
       void**        retval
     );

// -- pthread_mutex_*() --

BLIS_EXPORT_BLIS int bli_pthread_mutex_init
     (
       bli_pthread_mutex_t*           mutex,
       const bli_pthread_mutexattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_lock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock
     (
       bli_pthread_mutex_t* mutex
     );

// -- pthread_cond_*() --

BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t*           cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t*  cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

// Unlike the other wrappers in this section, this one is declared to return
// void rather than int.
BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void              (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//     __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t*           barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int                     count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h

// -- Pool block type --

// A buffer pointer paired with a block size.
typedef struct
{
    void*  buf;
    siz_t  block_size;
} pblk_t;

// -- Pool type --

// Bookkeeping for a pool of blocks, plus the malloc/free function pointers
// associated with the pool.
// NOTE(review): block_ptrs is an untyped void*; presumably it addresses an
// array of pblk_t of length block_ptrs_len -- confirm in bli_pool.c.
typedef struct
{
    void*     block_ptrs;
    dim_t     block_ptrs_len;

    dim_t     top_index;
    dim_t     num_blocks;

    siz_t     block_size;
    siz_t     align_size;
    siz_t     offset_size;

    malloc_ft malloc_fp;
    free_ft   free_fp;
} pool_t;

// -- Array type --

// An untyped buffer described by an element count and a per-element size.
typedef struct
{
    void*  buf;

    siz_t  num_elem;
    siz_t  elem_size;
} array_t;

// -- Locked pool-of-arrays-of-pools type --

// A pool_t bundled with the mutex that guards it.
typedef struct
{
    bli_pthread_mutex_t mutex;
    pool_t              pool;

    siz_t               def_array_len;
} apool_t;

// -- packing block allocator: Locked set of pools type --

// NOTE(review): the three pools presumably correspond to the pool-backed
// packbuf_t buffer types -- confirm against the packbuf_t definition.
typedef struct pba_s
{
    pool_t              pools[3];
    bli_pthread_mutex_t mutex;

    // These fields are used for general-purpose allocation.
    siz_t               align_size;
    malloc_ft           malloc_fp;
    free_ft             free_fp;
} pba_t;

// -- Memory object type --

// A block (pblk) together with its buffer type, a pointer to a pool, and a
// size.
typedef struct mem_s
{
    pblk_t    pblk;
    packbuf_t buf_type;
    pool_t*   pool;
    siz_t     size;
} mem_t;

// -- Control tree node type --

// Each node links to other nodes of the same type through sub_prenode and
// sub_node.
struct cntl_s
{
    // Basic fields (usually required).
    opid_t         family;
    bszid_t        bszid;
    void_fp        var_func;
    struct cntl_s* sub_prenode;
    struct cntl_s* sub_node;

    // Optional fields (needed only by some operations such as packm).
    // NOTE: first field of params must be a uint64_t containing the size
    // of the struct.
    void*          params;

    // Internal fields that track "cached" data.
    mem_t          pack_mem;
};
typedef struct cntl_s cntl_t;

// -- Blocksize object type --

// One primary value and one extension value per floating-point datatype
// (each array has BLIS_NUM_FP_TYPES entries).
typedef struct blksz_s
{
    // Primary blocksize values.
    dim_t  v[BLIS_NUM_FP_TYPES];

    // Blocksize extensions.
    dim_t  e[BLIS_NUM_FP_TYPES];
} blksz_t;

// -- Function pointer object type --

// One generic function pointer per floating-point datatype.
typedef struct func_s
{
    // Kernel function address.
    void_fp ptr[BLIS_NUM_FP_TYPES];
} func_t;

// -- Multi-boolean object type --

// One boolean per floating-point datatype.
typedef struct mbool_s
{
    bool v[BLIS_NUM_FP_TYPES];
} mbool_t;

// -- Auxiliary kernel info type --

// Note: This struct is used by macro-kernels to package together extra
// parameter values that may be of use to the micro-kernel without
// cluttering up the micro-kernel interface itself.
typedef struct
{
    // The pack schemas of A and B.
    pack_t schema_a;
    pack_t schema_b;

    // Pointers to the micro-panels of A and B which will be used by the
    // next call to the micro-kernel.
    void*  a_next;
    void*  b_next;

    // The imaginary strides of A and B.
    inc_t  is_a;
    inc_t  is_b;

    // The panel strides of A and B.
    // NOTE: These are only used in situations where iteration over the
    // micropanels takes place in part within the kernel code (e.g. sup
    // millikernels).
    inc_t  ps_a;
    inc_t  ps_b;

    // The type to convert to on output.
    //num_t  dt_on_output;

    // (Virtual) microkernel address and additional parameters.
    void_fp ukr;
    void*   params;
} auxinfo_t;

// -- Global scalar constant data struct --

// Note: This struct is used only when statically initializing the
// global scalar constants in bli_const.c.
// Holds one copy of a constant in each supported representation: float,
// double, single-precision complex, double-precision complex, and integer.
typedef struct constdata_s
{
    float    s;
    double   d;
    scomplex c;
    dcomplex z;
    gint_t   i;
} constdata_t;

//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
// (cntl_s is already fully defined above; the other three structs are only
// referenced through pointers in the two typedefs that follow.)
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of a user-customizable packing function: it receives two objects
// (a and ap) plus the context, runtime, control tree node and thread info.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of a user-customizable kernel function: it receives three objects
// (a, b and c) plus the context, runtime, control tree node and thread info.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// The BLIS object type. The two initializer macros and the two inline copy
// functions that follow this struct in the header name its fields explicitly,
// so they must be reviewed whenever a field is added, removed or renamed here.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t         off[2];
    dim_t         dim[2];
    doff_t        diag_off;

    objbits_t     info;
    objbits_t     info2;
    siz_t         elem_size;

    void*         buffer;
    inc_t         rs;
    inc_t         cs;
    inc_t         is;

    // Bufferless scalar storage
    atom_t        scalar;

    // Pack-related fields
    dim_t         m_padded; // m dimension of matrix, including any padding
    dim_t         n_padded; // n dimension of matrix, including any padding
    inc_t         ps;       // panel stride (distance to next panel)
    inc_t         pd;       // panel dimension (the "width" of a panel:
                            // usually MR or NR)
    dim_t         m_panel;  // m dimension of a "full" panel
    dim_t         n_panel;  // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void*         pack_params;
    obj_ker_fn_t  ker_fn;
    void*         ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Brace-enclosed designated initializer for an obj_t with dimensions 0 x 0,
// the DENSE and GENERAL info bits set, NULL pointers, and is = 1.
#define BLIS_OBJECT_INITIALIZER \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 0, 0 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn   = NULL, \
    .pack_params = NULL, \
    .ker_fn    = NULL, \
    .ker_params = NULL \
}

// Identical to BLIS_OBJECT_INITIALIZER except that .dim is { 1, 1 }.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 1, 1 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn   = NULL, \
    .pack_params = NULL, \
    .ker_fn    = NULL, \
    .ker_params = NULL \
}

// Define these functions here since they must be updated if contents of
// obj_t changes.
// Initialize b as a full shallow copy of a.
//
//   a: source object (read only).
//   b: destination object; every field is overwritten.
//
// "Shallow" means pointer members (root, buffer, pack_params, ker_params) and
// the function pointers are copied as-is; nothing they refer to is duplicated.
// A whole-struct assignment is used so that this function can never fall out
// of sync with the field list of obj_t.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
	*b = *a;
}

// Initialize b from a in preparation for b becoming a sub-partition of a.
//
//   a: source object (read only).
//   b: destination object; every field except dim[0] and dim[1] is
//      overwritten.
//
// The dimensions of b are deliberately left untouched because the caller
// overwrites them afterwards. For that reason the copy is done field by
// field, and this function must be updated whenever a field is added to
// obj_t.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
	// Identity, offsets and structure. dim[0] and dim[1] are skipped on
	// purpose (see above).
	b->root        = a->root;
	b->off[0]      = a->off[0];
	b->off[1]      = a->off[1];
	b->diag_off    = a->diag_off;

	// Datatype/property bits and element size.
	b->info        = a->info;
	b->info2       = a->info2;
	b->elem_size   = a->elem_size;

	// Buffer and strides.
	b->buffer      = a->buffer;
	b->rs          = a->rs;
	b->cs          = a->cs;
	b->is          = a->is;

	// Bufferless scalar storage.
	b->scalar      = a->scalar;

	// Pack-related fields.
	b->m_padded    = a->m_padded;
	b->n_padded    = a->n_padded;
	b->ps          = a->ps;
	b->pd          = a->pd;
	b->m_panel     = a->m_panel;
	b->n_panel     = a->n_panel;

	// User-customizable fields.
	b->pack_fn     = a->pack_fn;
	b->pack_params = a->pack_params;
	b->ker_fn      = a->ker_fn;
	b->ker_params  = a->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// Each public macro below (no trailing underscore) forwards to a helper with
// a trailing underscore that performs the actual ## pasting. Arguments that
// are operands of ## are not macro-expanded, so the forwarding layer is what
// lets an argument that is itself a macro be expanded before it is pasted.

// PASTEMAC family: build an identifier of the form bli_<ch...><op>, where the
// number in the macro name is the count of prefix-character arguments
// (PASTEMAC0 takes none, plain PASTEMAC takes one).
#define PASTEMAC0_(op)             bli_ ## op
#define PASTEMAC0(op)              PASTEMAC0_(op)

#define PASTEMAC_(ch,op)           bli_ ## ch  ## op
#define PASTEMAC(ch,op)            PASTEMAC_(ch,op)

#define PASTEMAC2_(ch1,ch2,op)     bli_ ## ch1 ## ch2 ## op
#define PASTEMAC2(ch1,ch2,op)      PASTEMAC2_(ch1,ch2,op)

#define PASTEMAC3_(ch1,ch2,ch3,op) bli_ ## ch1 ## ch2 ## ch3 ## op
#define PASTEMAC3(ch1,ch2,ch3,op)  PASTEMAC3_(ch1,ch2,ch3,op)

#define PASTEMAC4_(ch1,ch2,ch3,ch4,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
#define PASTEMAC4(ch1,ch2,ch3,ch4,op)  PASTEMAC4_(ch1,ch2,ch3,ch4,op)

#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op)  PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)

#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op)  PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)

// PASTEBLACHK: build an identifier of the form bla_<op>_check.
#define PASTEBLACHK_(op)           bla_ ## op ## _check
#define PASTEBLACHK(op)            PASTEBLACHK_(op)

// PASTECH family: like PASTEMAC, but without the bli_ prefix. PASTECH0
// yields its argument unchanged.
#define PASTECH0_(op)              op
#define PASTECH0(op)               PASTECH0_(op)

#define PASTECH_(ch,op)            ch ## op
#define PASTECH(ch,op)             PASTECH_(ch,op)

#define PASTECH2_(ch1,ch2,op)      ch1 ## ch2 ## op
#define PASTECH2(ch1,ch2,op)       PASTECH2_(ch1,ch2,op)

#define PASTECH3_(ch1,ch2,ch3,op)  ch1 ## ch2 ## ch3 ## op
#define PASTECH3(ch1,ch2,ch3,op)   PASTECH3_(ch1,ch2,ch3,op)

// MKSTR turns its argument into a string literal exactly as written.
// STRINGIFY_INT goes through MKSTR so that a macro argument is expanded to
// its value before being turned into a string.
#define MKSTR(s1)                  #s1
#define STRINGIFY_INT( s )         MKSTR( s )

// Fortran-77 name-mangling macros.
// Each macro pastes its prefix arguments (if any) onto name and appends a
// single trailing underscore. Unlike the PASTEMAC/PASTECH families, these
// paste directly, without a forwarding layer.
#define PASTEF770(name)                                      name ## _
#define PASTEF77(ch1,name)                     ch1        ## name ## _
#define PASTEF772(ch1,ch2,name)                ch1 ## ch2 ## name ## _
#define PASTEF773(ch1,ch2,ch3,name)     ch1 ## ch2 ## ch3 ## name ## _

// -- Include other groups of macros

// begin bli_genarray_macro_defs.h

#ifndef BLIS_GENARRAY_MACRO_DEFS_H
#define BLIS_GENARRAY_MACRO_DEFS_H

// -- Macros to generate function arrays ---------------------------------------

// All arrays generated below list their entries in s, c, d, z order.
// NOTE(review): presumably this matches the numeric values of the
// floating-point datatype IDs used to index the arrays -- confirm against
// the num_t definition.

// -- "Smart" one-operand macro --

// Defines a static array named <opname>_fpa with BLIS_NUM_FP_TYPES entries of
// type tname, holding bli_s<opname>, bli_c<opname>, bli_d<opname> and
// bli_z<opname>, each cast to tname.
#define GENARRAY_FPA(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname)  \
}

// -- "Smart" one-operand macro (with integer support) --

// Same as GENARRAY_FPA, with one extra trailing entry for bli_i<opname>
// (array length BLIS_NUM_FP_TYPES+1).
#define GENARRAY_FPA_I(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname), \
    ( tname )PASTEMAC(i,opname)  \
}

// -- "Smart" two-operand macro --

// Defines a static two-dimensional array named <op>_fpa2 whose entry
// [row][col] is bli_<row char><col char><op> cast to tname.
#define GENARRAY_FPA2(tname,op) \
\
static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \
    { ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \
    { ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \
    { ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) }  \
}

// -- "Smart" two-operand macro --

// The macros from here on expand to a declarator plus initializer only: the
// caller is expected to write the storage class and element type in front of
// the macro invocation, and no casts are applied to the entries.

// -- One-operand macro --

#define GENARRAY(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op)  \
}

// As GENARRAY, with an extra trailing bli_i<op> entry.
#define GENARRAY_I(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES+1] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op), \
    PASTEMAC(i,op)  \
}

// -- One-operand macro (with custom prefix) --

// As GENARRAY, but the entries are <prefix><char><op> instead of
// bli_<char><op>.
#define GENARRAY_PREF(arrayname,prefix,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTECH2(prefix,s,op), \
    PASTECH2(prefix,c,op), \
    PASTECH2(prefix,d,op), \
    PASTECH2(prefix,z,op)  \
}

// -- Two-operand macros --

// _ALL: every one of the 4x4 datatype combinations is populated.
#define GENARRAY2_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
    { PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// _EXT: only combinations within {s,c} or within {d,z} are populated; all
// other entries are NULL.
#define GENARRAY2_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL,              NULL,             }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { NULL,              NULL,              PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// _MIN: only the diagonal (both operands of the same datatype) is populated;
// all other entries are NULL.
#define GENARRAY2_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), NULL,              NULL,              NULL,             }, \
    { NULL,              PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), NULL,             }, \
    { NULL,              NULL,              NULL,              PASTEMAC2(z,z,op) }  \
}

// -- Three-operand macros --

// Three-dimensional counterparts of the two-operand macros above: entry
// [i][j][k] is bli_<i char><j char><k char><op> where populated.

#define GENARRAY3_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \
    { PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \
    { PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \
    { PASTEMAC3(c,d,s,op), PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \
    { PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \
    { PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \
    { PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \
    { PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \
    { PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

#define GENARRAY3_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

#define GENARRAY3_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                PASTEMAC3(z,z,z,op) }  \
    } \
}

#endif
// end bli_genarray_macro_defs.h
// begin bli_gentdef_macro_defs.h

#ifndef BLIS_GENTDEF_MACRO_DEFS_H
#define BLIS_GENTDEF_MACRO_DEFS_H

//
// -- MACROS TO INSERT TYPEDEF-GENERATING MACROS -------------------------------
//

// The INSERT_* macros below expand to a fixed series of GENTDEF/GENTDEFR
// invocations. They do not define GENTDEF or GENTDEFR themselves; the code
// that uses an INSERT_* macro must have defined the corresponding generator
// macro beforehand.

// -- function typedef macro (both typed and void) --

// Invokes GENTDEF once per datatype with the concrete C type and the _ft
// suffix, once per datatype with void and the _vft suffix, and finally once
// with void, an empty type character and the _vft suffix.
#define INSERT_GENTDEF( opname ) \
\
GENTDEF( float,    s, opname, _ft ) \
GENTDEF( double,   d, opname, _ft ) \
GENTDEF( scomplex, c, opname, _ft ) \
GENTDEF( dcomplex, z, opname, _ft ) \
\
GENTDEF( void,     s, opname, _vft ) \
GENTDEF( void,     d, opname, _vft ) \
GENTDEF( void,     c, opname, _vft ) \
GENTDEF( void,     z, opname, _vft ) \
\
GENTDEF( void,      , opname, _vft )

// -- function typedef macro (both typed and void) with real projection --

// As INSERT_GENTDEF, but each GENTDEFR invocation also receives the real
// projection of the datatype: float/s for s and c, double/d for d and z.
#define INSERT_GENTDEFR( opname ) \
\
GENTDEFR( float,    float,  s, s, opname, _ft ) \
GENTDEFR( double,   double, d, d, opname, _ft ) \
GENTDEFR( scomplex, float,  c, s, opname, _ft ) \
GENTDEFR( dcomplex, double, z, d, opname, _ft ) \
\
GENTDEFR( void,     void,   s, s, opname, _vft ) \
GENTDEFR( void,     void,   d, d, opname, _vft ) \
GENTDEFR( void,     void,   c, s, opname, _vft ) \
GENTDEFR( void,     void,   z, d, opname, _vft ) \
\
GENTDEFR( void,     void,    ,  , opname, _vft )

#endif
// end bli_gentdef_macro_defs.h
// begin bli_gentfunc_macro_defs.h

#ifndef BLIS_GENTFUNC_MACRO_DEFS_H
#define BLIS_GENTFUNC_MACRO_DEFS_H

//
// -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------
//

// -- Macros for generating BLAS routines --------------------------------------

// -- Basic one-operand macro --
// NOTE: None of the GENTFUNC* generator macros invoked below are defined in
// this section; whatever expands an INSERT_* macro has to define the matching
// generator first. Each INSERT_* macro simply stamps out one generator
// invocation per datatype, forwarding its own arguments unchanged.
//
// Every directive sits on its own line and every continuation backslash is
// the last character of its line; a `//` comment on the same line as a
// #define would otherwise hide the definition from the preprocessor.

// Invokes GENTFUNC four times, once per (ctype, char) pair:
// (float, s), (double, d), (scomplex, c), (dcomplex, z).
#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \
\
GENTFUNC( float,    s, blasname, blisname ) \
GENTFUNC( double,   d, blasname, blisname ) \
GENTFUNC( scomplex, c, blasname, blisname ) \
GENTFUNC( dcomplex, z, blasname, blisname )

// -- Basic one-operand macro with real domain only --

// Invokes GENTFUNCRO for (float, s) and (double, d) only.
#define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \
\
GENTFUNCRO( float,  s, blasname, blisname ) \
GENTFUNCRO( double, d, blasname, blisname )

// -- Basic one-operand macro with complex domain only and real projection --

// Invokes GENTFUNCCO for the two complex types; each invocation also carries
// the corresponding real type and char: (scomplex, float, c, s) and
// (dcomplex, double, z, d).
#define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \
\
GENTFUNCCO( scomplex, float,  c, s, blasname, blisname ) \
GENTFUNCCO( dcomplex, double, z, d, blasname, blisname )

// -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) --

// Invokes GENTFUNCDOT for (float, s) and (double, d). The third argument is
// intentionally empty and the conjugation argument is BLIS_NO_CONJUGATE.
#define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
\
GENTFUNCDOT( float,  s,  , BLIS_NO_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( double, d,  , BLIS_NO_CONJUGATE, blasname, blisname )

// -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) --

// Invokes GENTFUNCDOT twice per complex type: once with third argument `c`
// and BLIS_CONJUGATE, once with third argument `u` and BLIS_NO_CONJUGATE.
// NOTE(review): `c`/`u` presumably become the conjugated/unconjugated
// suffixes of the generated BLAS names -- confirm against GENTFUNCDOT.
#define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \
\
GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE,    blasname, blisname ) \
GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE,    blasname, blisname ) \
GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname )

// -- Basic one-operand macro with conjugation (used only for dot, ger) --

// Real instances followed by complex instances: expands
// INSERT_GENTFUNCDOTR_BLAS and then INSERT_GENTFUNCDOTC_BLAS.
#define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \
\
INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
INSERT_GENTFUNCDOTC_BLAS( blasname, blisname )

// -- Basic one-operand macro with real projection --

// Invokes GENTFUNCR once per datatype together with its real projection.
// The two real instances are passed rblasname, the two complex instances
// are passed cblasname; blisname is shared by all four.
#define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \
\
GENTFUNCR( float,    float,  s, s, rblasname, blisname ) \
GENTFUNCR( double,   double, d, d, rblasname, blisname ) \
GENTFUNCR( scomplex, float,  c, s, cblasname, blisname ) \
GENTFUNCR( dcomplex, double, z, d, cblasname, blisname )

// -- Alternate
// two-operand macro (one char for complex, one for real proj) --

// NOTE: GENTFUNC, GENTFUNCR, GENTFUNCR2 and GENTFUNCSCAL are not defined in
// this section; whatever expands an INSERT_* macro has to define the matching
// generator first. Each INSERT_* macro stamps out one generator invocation
// per datatype and forwards its own arguments unchanged.

// Invokes GENTFUNCR2 once per datatype. The fourth argument (the real
// projection char) is intentionally empty for the two real instances and is
// s / d for the scomplex / dcomplex instances.
#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
\
GENTFUNCR2( float,    float,  s,  , blasname, blisname ) \
GENTFUNCR2( double,   double, d,  , blasname, blisname ) \
GENTFUNCR2( scomplex, float,  c, s, blasname, blisname ) \
GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )

// -- Extended two-operand macro (used only for scal) --

// Invokes GENTFUNCSCAL six times: four instances whose two types are equal
// (fourth argument left empty), followed by two mixed instances that pair a
// complex first type with its real type, (scomplex, float, c, s) and
// (dcomplex, double, z, d).
#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
\
GENTFUNCSCAL( float,    float,    s,  , blasname, blisname ) \
GENTFUNCSCAL( double,   double,   d,  , blasname, blisname ) \
GENTFUNCSCAL( scomplex, scomplex, c,  , blasname, blisname ) \
GENTFUNCSCAL( dcomplex, dcomplex, z,  , blasname, blisname ) \
GENTFUNCSCAL( scomplex, float,    c, s, blasname, blisname ) \
GENTFUNCSCAL( dcomplex, double,   z, d, blasname, blisname )


// -- Macros for functions with one operand ------------------------------------

// -- Basic one-operand macro --

// The five macros below differ only in how many auxiliary arguments
// (varname, varname1..varname4) they forward after tfuncname. Each one
// invokes GENTFUNC for (float, s), (double, d), (scomplex, c), (dcomplex, z).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
\
GENTFUNC( float,    s, tfuncname ) \
GENTFUNC( double,   d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
\
GENTFUNC( float,    s, tfuncname, varname ) \
GENTFUNC( double,   d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC( float,    s, tfuncname, varname1, varname2 ) \
GENTFUNC( double,   d, tfuncname, varname1, varname2 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNC( float,    s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( double,   d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNC( float,    s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( double,   d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand with real projection --

// Same layout as the GENTFUNC family above, but each GENTFUNCR invocation
// carries the real type and char alongside the datatype: (float, float, s, s),
// (double, double, d, d), (scomplex, float, c, s), (dcomplex, double, z, d).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
\
GENTFUNCR( float,    float,  s, s, tfuncname ) \
GENTFUNCR( double,   double, d, d, tfuncname ) \
GENTFUNCR( scomplex, float,  c, s, tfuncname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
\
GENTFUNCR( float,    float,  s, s, tfuncname, varname ) \
GENTFUNCR( double,   double, d, d, tfuncname, varname ) \
GENTFUNCR( scomplex, float,  c, s, tfuncname, varname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2 ) \
GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCR( float,    float,  s, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( double,   double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( scomplex, float,  c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
arguments) -- #define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with real domain only -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \ \ GENTFUNCRO( float, s, tfuncname ) \ GENTFUNCRO( double, d, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \ \ GENTFUNCRO( float, s, tfuncname, varname ) \ GENTFUNCRO( double, d, tfuncname, varname ) \ // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \ \ GENTFUNC( float, s, tfuncname ) \ 
GENTFUNC( double, d, tfuncname ) \ GENTFUNC( scomplex, c, tfuncname ) \ GENTFUNC( dcomplex, z, tfuncname ) \ GENTFUNC( gint_t, i, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \ \ GENTFUNC( float, s, tfuncname, varname ) \ GENTFUNC( double, d, tfuncname, varname ) \ GENTFUNC( scomplex, c, tfuncname, varname ) \ GENTFUNC( dcomplex, z, tfuncname, varname ) \ GENTFUNC( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCI_BASIC0( tfuncname ) \ \ GENTFUNCI( float, gint_t, s, i, tfuncname ) \ GENTFUNCI( double, gint_t, d, i, tfuncname ) \ GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \ GENTFUNCI( dcomplex, gint_t, z, i, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \ \ GENTFUNCI( float, gint_t, s, i, tfuncname, varname ) \ GENTFUNCI( double, gint_t, d, i, tfuncname, varname ) \ GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \ \ GENTFUNCRI( float, float, gint_t, s, s, i, tfuncname ) \ GENTFUNCRI( double, double, gint_t, d, d, i, tfuncname ) \ GENTFUNCRI( scomplex, float, gint_t, c, s, i, tfuncname ) \ GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_BASIC0( tfuncname ) \ \ GENTFUNC2( float, float, s, s, tfuncname ) \ GENTFUNC2( double, double, d, d, tfuncname ) \ GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \ GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \ \ GENTFUNC2( float, float, s, s, tfuncname, 
varname ) \ GENTFUNC2( double, double, d, d, tfuncname, varname ) \ GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \ \ GENTFUNC2( float, scomplex, s, c, tfuncname ) \ GENTFUNC2( scomplex, float, c, s, tfuncname ) \ \ GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \ \ GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ \ GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) // -- Mixed precision two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \ \ GENTFUNC2( float, double, s, d, tfuncname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ \ GENTFUNC2( double, float, d, s, tfuncname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname ) \ \ GENTFUNC2( scomplex, double, c, d, tfuncname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \ \ GENTFUNC2( float, double, s, d, tfuncname, varname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTFUNC2( double, float, d, s, tfuncname, varname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ \ GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) \ // -- Mixed domain/precision (all) two-operand macro -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTFUNC2_MIXDP0( tfuncname ) \ \ GENTFUNC2( float, double, s, d, tfuncname ) \ GENTFUNC2( float, scomplex, s, c, tfuncname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ \ GENTFUNC2( double, float, d, s, tfuncname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname ) \ GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ \ GENTFUNC2( scomplex, float, c, s, tfuncname ) \ GENTFUNC2( scomplex, double, c, d, tfuncname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \ \ GENTFUNC2( float, double, s, d, tfuncname, varname ) \ GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTFUNC2( double, float, d, s, tfuncname, varname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \ \ GENTFUNC2R( float, float, float, s, s, s, tfuncname ) \ GENTFUNC2R( double, double, double, d, d, d, tfuncname ) \ GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname ) \ GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \ \ GENTFUNC2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTFUNC2R( double, double, double, d, d, 
d, tfuncname, varname ) \ GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \ \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \ \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) // -- Mixed precision two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ \ GENTFUNC2R( scomplex, double, double, c, d, d, 
tfuncname, varname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) // -- Mixed domain/precision (all) two-operand macro with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) \ 
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ // -- Macros for functions with three primary operands ------------------------- // -- Basic three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC0( tfuncname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 ) // -- Mixed domain three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname ) \ \ GENTFUNC3( dcomplex, double, 
double, z, d, d, tfuncname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( 
dcomplex, double, dcomplex, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC3( scomplex, double, scomplex, c, 
d, c, tfuncname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname ) \ GENTFUNC3( float, 
dcomplex, scomplex, s, z, c, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, 
tfuncname, varname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, dcomplex, d, s, 
z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname1, varname2 ) \ \ 
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 ) // -- Basic three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, 
varname1, varname2 )


// -- Mixed domain three-operand with union of operands 1 and 2 --
// Rows stay within one precision (float with scomplex, double with dcomplex)
// and mix real and complex operands. In every row the fourth type is the
// complex type whenever either of the first two types is complex, and the
// real type otherwise.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname )

// -- (one auxiliary argument) --
// Same type rows as the no-argument form; varname is forwarded unchanged.

#define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z,
d, d, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname )

// -- (two auxiliary arguments) --
// Twelve mixed-domain rows (three per leading type); varname1 and varname2
// are forwarded unchanged after tfuncname.

#define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 )


// -- Mixed precision three-operand with union of operands 1 and 2 --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \
\
GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \
GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \
\
GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \
GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \
GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \
GENTFUNC3U12( float,
double, dcomplex, double, s, d, z, d, tfuncname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, 
dcomplex, scomplex, c, c, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, 
dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, 
tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, 
double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( 
scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, 
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 )


#endif
// end bli_gentfunc_macro_defs.h
// begin bli_gentprot_macro_defs.h


// Include guard for the prototype-generating macro definitions.
#ifndef BLIS_GENTPROT_MACRO_DEFS_H
#define BLIS_GENTPROT_MACRO_DEFS_H

//
// -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS -----------------------------
//
// Every INSERT_* macro below expands to a fixed list of invocations of a
// GENTPROT* macro, one per row, passing C type names, their one-letter type
// characters (s, d, c, z) and the caller's arguments.
// NOTE(review): the GENTPROT* macros themselves are not defined in this part
// of the header; presumably each including file defines the one it needs
// before invoking an INSERT_* macro -- confirm against callers.



// -- Macros for generating BLAS routines --------------------------------------


// -- Basic one-operand macro --
// One GENTPROT row for each of float, double, scomplex and dcomplex.

#define INSERT_GENTPROT_BLAS( blasname ) \
\
GENTPROT( float, s, blasname ) \
GENTPROT( double, d, blasname ) \
GENTPROT( scomplex, c, blasname ) \
GENTPROT( dcomplex, z, blasname )


// -- Basic one-operand macro with real domain only --
// Rows for float and double only.

#define INSERT_GENTPROTRO_BLAS( blasname ) \
\
GENTPROTRO( float, s, blasname ) \
GENTPROTRO( double, d, blasname )


// -- Basic one-operand macro with complex domain only and real projection --
// Rows for scomplex and dcomplex only, each paired with its real type
// (float, double) and that type's character.

#define INSERT_GENTPROTCO_BLAS( blasname ) \
\
GENTPROTCO( scomplex, float, c, s, blasname ) \
GENTPROTCO( dcomplex, double, z, d, blasname )


// -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) --
// The third GENTPROTDOT argument is intentionally left empty in the real rows.

#define INSERT_GENTPROTDOTR_BLAS( blasname ) \
\
GENTPROTDOT( float, s, , blasname ) \
GENTPROTDOT( double, d, , blasname )


// -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) --
// Each complex type gets two rows whose third argument is c and u.
// NOTE(review): c/u presumably select the conjugated and unconjugated
// variants -- confirm against the GENTPROTDOT definitions.

#define INSERT_GENTPROTDOTC_BLAS( blasname ) \
\
GENTPROTDOT( scomplex, c, c, blasname ) \
GENTPROTDOT( scomplex, c, u, blasname ) \
GENTPROTDOT( dcomplex, z, c, blasname ) \
GENTPROTDOT( dcomplex, z, u, blasname )


// -- Basic one-operand macro with conjugation (used only for dot, ger) --
// Composition of the real-only and complex-only macros above.

#define INSERT_GENTPROTDOT_BLAS( blasname ) \
\
INSERT_GENTPROTDOTR_BLAS( blasname ) \
INSERT_GENTPROTDOTC_BLAS( blasname )


// -- Basic one-operand macro with real projection --

#define INSERT_GENTPROTR_BLAS( rblasname,
cblasname ) \
\
GENTPROTR( float, float, s, s, rblasname ) \
GENTPROTR( double, double, d, d, rblasname ) \
GENTPROTR( scomplex, float, c, s, cblasname ) \
GENTPROTR( dcomplex, double, z, d, cblasname )


// -- Alternate two-operand macro (one char for complex, one for real proj) --
// The third argument (the complex type character) is left empty in the two
// real rows and is c or z in the complex rows.

#define INSERT_GENTPROTR2_BLAS( blasname ) \
\
GENTPROTR2( float, float, , s, blasname ) \
GENTPROTR2( double, double, , d, blasname ) \
GENTPROTR2( scomplex, float, c, s, blasname ) \
GENTPROTR2( dcomplex, double, z, d, blasname )


// -- Extended two-operand macro (used only for scal) --
// Four same-type rows with an empty third argument, followed by two rows that
// pair a real first type with a complex second type (float/scomplex with
// characters s, c and double/dcomplex with characters d, z).

#define INSERT_GENTPROTSCAL_BLAS( blasname ) \
\
GENTPROTSCAL( float, float, , s, blasname ) \
GENTPROTSCAL( double, double, , d, blasname ) \
GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \
GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \
GENTPROTSCAL( float, scomplex, s, c, blasname ) \
GENTPROTSCAL( double, dcomplex, d, z, blasname )




// -- Macros for functions with one operand ------------------------------------


// -- Basic one-operand macro --
// Naming used by this family: suffix 0 takes no auxiliary argument, no suffix
// takes one, and suffixes 2, 3, 4 take that many. Auxiliary arguments are
// forwarded unchanged after tfuncname on every row.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT_BASIC0( tfuncname ) \
\
GENTPROT( float, s, tfuncname ) \
GENTPROT( double, d, tfuncname ) \
GENTPROT( scomplex, c, tfuncname ) \
GENTPROT( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT_BASIC( tfuncname, varname ) \
\
GENTPROT( float, s, tfuncname, varname ) \
GENTPROT( double, d, tfuncname, varname ) \
GENTPROT( scomplex, c, tfuncname, varname ) \
GENTPROT( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTPROT( float, s, tfuncname, varname1, varname2 ) \
GENTPROT( double, d, tfuncname, varname1, varname2 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTPROT( float, s, tfuncname,
varname1, varname2, varname3 ) \
GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --
// One GENTPROT row per type (s, d, c, z); varname1..varname4 are forwarded
// unchanged after tfuncname.

#define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )



// -- Basic one-operand with real projection --
// Each row passes a type together with its real counterpart: float and double
// map to themselves, scomplex maps to float and dcomplex maps to double. The
// two type names are followed by their two type characters.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROTR_BASIC0( tfuncname ) \
\
GENTPROTR( float, float, s, s, tfuncname ) \
GENTPROTR( double, double, d, d, tfuncname ) \
GENTPROTR( scomplex, float, c, s, tfuncname ) \
GENTPROTR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname ) \
GENTPROTR( double, double, d, d, tfuncname, varname ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3
) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --
// Real-projection rows for s, d, c, z; varname1..varname4 are forwarded
// unchanged after tfuncname.

#define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )



// -- Basic one-operand macro with complex domain only and real projection --
// Only the two complex types get a row, each paired with its real type.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROTCO_BASIC0( tfuncname ) \
\
GENTPROTCO( scomplex, float, c, s, tfuncname ) \
GENTPROTCO( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \
\
GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \
GENTPROTCO( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )



// -- Basic one-operand macro with integer instance --
// The four floating-point rows plus a fifth row for gint_t with type
// character i.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT_BASIC0_I( funcname ) \
\
GENTPROT( float, s, funcname ) \
GENTPROT( double, d, funcname ) \
GENTPROT( scomplex, c, funcname ) \
GENTPROT( dcomplex, z, funcname ) \
GENTPROT( gint_t, i, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \
\
GENTPROT( float, s, tfuncname, varname ) \
GENTPROT( double, d, tfuncname, varname ) \
GENTPROT( scomplex, c, tfuncname, varname ) \
GENTPROT( dcomplex, z, tfuncname, varname ) \
GENTPROT( gint_t, i, tfuncname, varname )



// -- Basic one-operand with integer projection --

// -- (no auxiliary arguments) --

#define
INSERT_GENTPROTI_BASIC0( funcname ) \
\
GENTPROTI( float, gint_t, s, i, funcname ) \
GENTPROTI( double, gint_t, d, i, funcname ) \
GENTPROTI( scomplex, gint_t, c, i, funcname ) \
GENTPROTI( dcomplex, gint_t, z, i, funcname )

// -- (one auxiliary argument) --
// Every floating-point type is paired with gint_t (type character i);
// varname is forwarded unchanged.

#define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \
\
GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \
GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \
GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \
GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname )



// -- Basic one-operand with real and integer projections --
// Each row passes the type, its real counterpart and gint_t, followed by the
// three matching type characters.
// NOTE(review): this macro takes no auxiliary argument but is named without
// the 0 suffix that the other no-argument macros in this header carry.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROTRI_BASIC( funcname ) \
\
GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \
GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \
GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \
GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname )




// -- Macros for functions with two primary operands ---------------------------


// -- Basic two-operand macro --
// Both operand types are the same on every row.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_BASIC0( funcname ) \
\
GENTPROT2( float, float, s, s, funcname ) \
GENTPROT2( double, double, d, d, funcname ) \
GENTPROT2( scomplex, scomplex, c, c, funcname ) \
GENTPROT2( dcomplex, dcomplex, z, z, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \
\
GENTPROT2( float, float, s, s, tfuncname, varname ) \
GENTPROT2( double, double, d, d, tfuncname, varname ) \
GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \
GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname )



// -- Mixed domain two-operand macro --
// Pairs a real type with the complex type of the same precision, in both
// orders.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_MIX_D0( funcname ) \
\
GENTPROT2( float, scomplex, s, c, funcname ) \
GENTPROT2( scomplex, float, c, s, funcname ) \
\
GENTPROT2( double, dcomplex, d, z, funcname ) \
GENTPROT2( dcomplex, double, z, d, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2_MIX_D(
tfuncname, varname ) \
\
GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
\
GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
GENTPROT2( dcomplex, double, z, d, tfuncname, varname )



// -- Mixed precision two-operand macro --
// Every row pairs one single-precision type (float or scomplex) with one
// double-precision type (double or dcomplex), in both orders: eight pairs.
// The last row of each definition carries no trailing backslash, so the
// definition ends on that row instead of depending on a blank line after it.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_MIX_P0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
\
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --
// Same eight pairs; varname is forwarded unchanged after tfuncname.

#define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \
\
GENTPROT2( float, double, s, d, tfuncname, varname ) \
GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double, float, d, s, tfuncname, varname ) \
GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
\
GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )



// -- Mixed domain/precision (all) two-operand macro --
// Pairs each of the four types with the other three.
// NOTE(review): the no-argument form is spelled MIXDP0 while the
// one-argument form is INSERT_GENTPROT2_MIX_DP; the spelling is kept as-is
// because callers reference it by this name.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_MIXDP0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, scomplex, s, c, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
GENTPROT2( double, dcomplex, d, z, funcname ) \
\
GENTPROT2( scomplex, float, c, s, funcname ) \
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, double, z, d, funcname ) \
GENTPROT2(
dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --
// Twelve rows pairing each of the four types with the other three; varname
// is forwarded unchanged after tfuncname.

#define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \
\
GENTPROT2( float, double, s, d, tfuncname, varname ) \
GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double, float, d, s, tfuncname, varname ) \
GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
\
GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )



// -- Basic two-operand with real projection of first operand --
// The third type on each row is the real counterpart of the first type
// (float for float/scomplex, double for double/dcomplex); the three type
// names are followed by their three type characters.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2R_BASIC0( funcname ) \
\
GENTPROT2R( float, float, float, s, s, s, funcname ) \
GENTPROT2R( double, double, double, d, d, d, funcname ) \
GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \
GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \
\
GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \
GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \
GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )



// -- Mixed domain two-operand with real projection of first operand --
// Real/complex pairs of the same precision, in both orders, with the real
// counterpart of the first type as third type.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \
\
GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \
\
GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \
GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \
\
GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname )



// -- Mixed precision two-operand with real projection of first operand --
// Pairs of differing precision, in both orders, with the real counterpart of
// the first type as third type.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname )




// -- Macros for functions with three primary operands -------------------------


// -- Basic three-operand macro --

#define INSERT_GENTPROT3_BASIC( funcname ) \
\
GENTPROT3( float, float, float, s, s, s, funcname ) \
GENTPROT3( double, double, double, d, d, d, funcname ) \
GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \
GENTPROT3( dcomplex, dcomplex,
dcomplex, z, z, z, funcname ) // -- Mixed domain three-operand macro -- #define INSERT_GENTPROT3_MIX_D( funcname ) \ \ GENTPROT3( float, float, scomplex, s, s, c, funcname ) \ GENTPROT3( float, scomplex, float, s, c, s, funcname ) \ GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \ \ GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \ GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \ GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \ \ GENTPROT3( scomplex, float, float, c, s, s, funcname ) \ GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \ GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \ \ GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \ GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \ GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname ) // -- Mixed precision three-operand macro -- #define INSERT_GENTPROT3_MIX_P( funcname ) \ \ GENTPROT3( float, float, double, s, s, d, funcname ) \ GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \ \ GENTPROT3( float, double, float, s, d, s, funcname ) \ GENTPROT3( float, double, double, s, d, d, funcname ) \ GENTPROT3( float, double, scomplex, s, d, c, funcname ) \ GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \ \ GENTPROT3( float, scomplex, double, s, c, d, funcname ) \ GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \ \ GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \ GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \ GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \ GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \ \ \ GENTPROT3( double, float, float, d, s, s, funcname ) \ GENTPROT3( double, float, double, d, s, d, funcname ) \ GENTPROT3( double, float, scomplex, d, s, c, funcname ) \ GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \ \ GENTPROT3( double, double, float, d, d, s, funcname ) \ GENTPROT3( double, double, scomplex, d, d, c, funcname ) \ \ GENTPROT3( double, 
scomplex, float, d, c, s, funcname ) \ GENTPROT3( double, scomplex, double, d, c, d, funcname ) \ GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \ GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \ \ GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \ GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \ \ \ GENTPROT3( scomplex, float, double, c, s, d, funcname ) \ GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \ \ GENTPROT3( scomplex, double, float, c, d, s, funcname ) \ GENTPROT3( scomplex, double, double, c, d, d, funcname ) \ GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \ GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \ \ GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \ GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \ \ GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \ GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \ GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \ GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \ \ \ GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \ GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \ GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \ GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \ \ GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \ GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \ \ GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \ GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \ GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \ GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \ \ GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \ GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname ) \ // -- Basic three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_BASIC( funcname ) \ \ GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \ 
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \ GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname ) // -- Mixed domain three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_D( funcname ) \ \ GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \ GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \ GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \ \ GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \ GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \ GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \ \ GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \ GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \ \ GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \ GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname ) // -- Mixed precision three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_P( funcname ) \ \ GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \ GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \ \ GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \ GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \ GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \ GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \ \ GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \ GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \ \ GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, 
z, funcname ) \ GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \ GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \ GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \ \ \ GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \ GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \ GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \ GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \ \ GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \ GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \ \ GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \ GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \ GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \ GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \ \ GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \ GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \ \ \ GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \ GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \ \ GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \ GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \ GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \ GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \ \ GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \ \ GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \ GENTPROT3U12( 
scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \ \ \ GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \ GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \ GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \ GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \ GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \ \ GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname ) #endif // end bli_gentprot_macro_defs.h // begin bli_misc_macro_defs.h #ifndef BLIS_MISC_MACRO_DEFS_H #define BLIS_MISC_MACRO_DEFS_H // -- Miscellaneous macros -- // min, max, abs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) #define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) #define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) ) // fmin, fmax, fabs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_fmin( a, b ) bli_min( a, b ) #define bli_fmax( a, b ) bli_max( a, b ) #define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) ) // fminabs, fmaxabs // NOTE: These must remain macros since we don't know the types of a and b. 
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif // end bli_edge_case_macro_defs.h // begin bli_param_macro_defs.h #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ||
( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) );
}

// Returns true when the (diagoff, m, n) region lies entirely on the side of
// the diagonal that is NOT stored for the given uplo: strictly below the
// diagonal for upper storage, strictly above it for lower storage.
BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n )
{
    return ( bool )
           ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ||
             ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) );
}

// pruning-related

// Each pruning helper first zeroes its offset-increment output and then, if
// its condition holds, shrinks one dimension in place. Only the "top_l" and
// "left_u" variants ever write a non-zero increment or reset *diagoff.

// Lower case, top edge: when *diagoff is negative, drop the first -(*diagoff)
// rows, report that count in *offm_inc, and reset *diagoff to 0.
// *n is not touched by this function.
BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc )
{
    *offm_inc = 0;

    // If the diagonal intersects the left side of the matrix,
    // ignore the area above that intersection.
    if ( *diagoff < 0 )
    {
        *m        = *m + *diagoff;
        *offm_inc =    - *diagoff;
        *diagoff  = 0;
    }
}

// Lower case, right edge: clamp *n to *diagoff + *m. *offn_inc is always
// left at 0 and *diagoff is not modified.
BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc )
{
    *offn_inc = 0;

    // If the diagonal intersects the bottom side of the matrix,
    // ignore the area to the right of that intersection.
    if ( *n > *diagoff + *m )
    {
        *n = *diagoff + *m;
    }
}

// Upper case, left edge: when *diagoff is positive, drop the first *diagoff
// columns, report that count in *offn_inc, and reset *diagoff to 0.
// *m is not touched by this function.
BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc )
{
    *offn_inc = 0;

    // If the diagonal intersects the top side of the matrix,
    // ignore the area to the left of that intersection.
    if ( *diagoff > 0 )
    {
        *n        = *n - *diagoff;
        *offn_inc =    + *diagoff;
        *diagoff  = 0;
    }
}

// Upper case, bottom edge: clamp *m to *n - *diagoff. *offm_inc is always
// left at 0 and *diagoff is not modified.
BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc )
{
    *offm_inc = 0;

    // If the diagonal intersects the right side of the matrix,
    // ignore the area below that intersection.
    if ( *m > -(*diagoff) + *n )
    {
        *m = -(*diagoff) + *n;
    }
}

// thread range-related

// Replaces *diagoff with *n - *diagoff - *m and toggles *uplo. The
// dimensions themselves are read but not modified.
BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n )
{
    *diagoff = *n - *diagoff - *m;
    bli_toggle_uplo( uplo );
}

// Swaps *m with *n, negates *diagoff and toggles *uplo.
BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n )
{
    bli_swap_dims( m, n );
    bli_negate_diag_offset( diagoff );
    bli_toggle_uplo( uplo );
}

// Maps the range [*start, *end) counted from one end of an n-long index
// space to the same range counted from the other end: the new start is
// n - *end and the new end is n - *start.
BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end )
{
    dim_t start2 = n - *start;
    dim_t end2   = n - *end;
    *start = end2;
    *end   = start2;
}

// mdim_t-related

// True when mdim equals BLIS_M.
BLIS_INLINE bool bli_is_m_dim( mdim_t mdim )
{
    return ( bool )
           ( mdim == BLIS_M );
}

// True when mdim equals BLIS_N.
BLIS_INLINE bool bli_is_n_dim( mdim_t mdim )
{
    return ( bool )
           ( mdim == BLIS_N );
}

// Returns BLIS_N for BLIS_M, and BLIS_M for any other value.
BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim )
{
    return ( mdim_t )
           ( mdim == BLIS_M ? BLIS_N : BLIS_M );
}

// In-place version of bli_dim_toggled().
BLIS_INLINE void bli_toggle_dim( mdim_t* mdim )
{
    *mdim = bli_dim_toggled( *mdim );
}

// stor3_t-related

// Encodes the storage of C, A and B as a 3-bit id: bit 2 (value 4) is set
// when C is column-stored, bit 1 (value 2) when A is, bit 0 (value 1) when
// B is. Returns BLIS_XXX instead if any operand is general-stored.
BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b )
{
    // If any matrix is general-stored, return the stor3_t id for the
    // general-purpose sup microkernel.
    if ( bli_is_gen_stored( rs_c, cs_c ) ||
         bli_is_gen_stored( rs_a, cs_a ) ||
         bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX;

    // Otherwise, compute and return the stor3_t id as follows.
    const bool c_is_col = bli_is_col_stored( rs_c, cs_c );
    const bool a_is_col = bli_is_col_stored( rs_a, cs_a );
    const bool b_is_col = bli_is_col_stored( rs_b, cs_b );

    return ( stor3_t )( 4 * c_is_col +
                        2 * a_is_col +
                        1 * b_is_col );
}

// Maps a stor3_t id to the id of the transposed operation via a lookup
// table (the bit-twiddling alternative is compiled out).
// NOTE(review): id is used as an index without a range check; BLIS_XXX is
// presumably outside [0, BLIS_NUM_3OP_RC_COMBOS) -- confirm that callers
// never pass it here.
BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id )
{
#if 1
    stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
    =
    {
      ( stor3_t )7,  // BLIS_RRR = 0  ->  BLIS_CCC = 7
      ( stor3_t )5,  // BLIS_RRC = 1  ->  BLIS_CRC = 5
      ( stor3_t )6,  // BLIS_RCR = 2  ->  BLIS_CCR = 6
      ( stor3_t )4,  // BLIS_RCC = 3  ->  BLIS_CRR = 4
      ( stor3_t )3,  // BLIS_CRR = 4  ->  BLIS_RCC = 3
      ( stor3_t )1,  // BLIS_CRC = 5  ->  BLIS_RRC = 1
      ( stor3_t )2,  // BLIS_CCR = 6  ->  BLIS_RCR = 2
      ( stor3_t )0,  // BLIS_CCC = 7  ->  BLIS_RRR = 0
    };

    return map[id];
#else
    return   ( ( id & 0x4 ) ^ 0x4 )        | // flip c bit
           ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position
           ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 );  // flip a bit and move to b position
#endif
}

// Flips bit 0 of the id (the table variant is compiled out).
// NOTE(review): bli_stor3_from_strides() assigns bit 0 to B and bit 1 to A,
// so despite its name this function toggles the B storage bit. Confirm
// whether callers rely on this before changing it.
BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id )
{
#if 0
    stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
    =
    {
      ( stor3_t )1,  // BLIS_RRR = 0  ->  BLIS_RRC = 1
      ( stor3_t )0,  // BLIS_RRC = 1  ->  BLIS_RRR = 0
      ( stor3_t )3,  // BLIS_RCR = 2  ->  BLIS_RCC = 3
      ( stor3_t )2,  // BLIS_RCC = 3  ->  BLIS_RCR = 2
      ( stor3_t )5,  // BLIS_CRR = 4  ->  BLIS_CRC = 5
      ( stor3_t )4,  // BLIS_CRC = 5  ->  BLIS_CRR = 4
      ( stor3_t )7,  // BLIS_CCR = 6  ->  BLIS_CCC = 7
      ( stor3_t )6,  // BLIS_CCC = 7  ->  BLIS_CCR = 6
    };

    return map[id];
#else
    return ( stor3_t )( id ^ 0x1 );
#endif
}

// Flips bit 1 of the id (the table variant is compiled out).
// NOTE(review): bli_stor3_from_strides() assigns bit 1 to A, so despite its
// name this function toggles the A storage bit. Confirm against callers.
BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id )
{
#if 0
    stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ]
    =
    {
      ( stor3_t )2,  // BLIS_RRR = 0  ->  BLIS_RCR = 2
      ( stor3_t )3,  // BLIS_RRC = 1  ->  BLIS_RCC = 3
      ( stor3_t )0,  // BLIS_RCR = 2  ->  BLIS_RRR = 0
      ( stor3_t )1,  // BLIS_RCC = 3  ->  BLIS_RRC = 1
      ( stor3_t )6,  // BLIS_CRR = 4  ->  BLIS_CCR = 6
      ( stor3_t )7,  // BLIS_CRC = 5  ->  BLIS_CCC = 7
      ( stor3_t )4,  // BLIS_CCR = 6  ->  BLIS_CRR = 4
      ( stor3_t )5,  // BLIS_CCC = 7  ->  BLIS_CRC = 5
    };

    return map[id];
#else
    return ( stor3_t )( id ^ 0x2 );
#endif
}

// index-related

// "_f" variants treat the LAST iteration (i == n_iter - 1) as the edge
// case; "_b" variants treat the FIRST iteration (i == 0) as the edge case.
// In both, an edge only exists when n_left is non-zero.

BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i == n_iter - 1 && n_left != 0 );
}

// Logical negation of bli_is_edge_f().
BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i != n_iter - 1 || n_left == 0 );
}

// n_iter is accepted for signature symmetry but is not read.
BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i == 0 && n_left != 0 );
}

// Logical negation of bli_is_edge_b(); n_iter is not read.
BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left )
{
    return ( bool )
           ( i != 0 || n_left == 0 );
}

// Last-iteration test that ignores tid and nth: true only for
// i == end_iter - 1.
BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
    return ( bool )
           ( i == end_iter - 1 );
}

// Last-iteration test that depends on tid and nth: true for the largest
// index below end_iter that is congruent to tid modulo nth.
// NOTE(review): assumes tid < end_iter so the modulo operand is
// non-negative -- confirm for signed dim_t.
BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
    return ( bool )
           ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) );
}

// Dispatches to the "_sl" or "_rr" variant depending on whether
// BLIS_ENABLE_JRIR_SLAB is defined at compile time.
BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth )
{
#ifdef BLIS_ENABLE_JRIR_SLAB
    return bli_is_last_iter_sl( i, end_iter, tid, nth );
#else // BLIS_ENABLE_JRIR_RR
    return bli_is_last_iter_rr( i, end_iter, tid, nth );
#endif
}

// packbuf_t-related

// Extracts the pack-buffer bitfield from buf_type and shifts it down so it
// starts at bit 0.
BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type )
{
    return ( guint_t )
           ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT );
}

// pack_t-related

// True when the BLIS_PACK_BIT of the schema is set.
BLIS_INLINE bool bli_is_packed( pack_t schema )
{
    return ( bool )
           ( schema & BLIS_PACK_BIT );
}

// Compares the row/column bit of the schema against the bits in which
// PACKED_ROWS differs from PACKED_UNSPEC.
BLIS_INLINE bool bli_is_row_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
                                                 BLIS_BITVAL_PACKED_ROWS ) );
}

// Same test as bli_is_row_packed(), but against PACKED_COLUMNS.
BLIS_INLINE bool bli_is_col_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^
                                                 BLIS_BITVAL_PACKED_COLUMNS ) );
}

// True when the BLIS_PACK_PANEL_BIT of the schema is set.
BLIS_INLINE bool bli_is_panel_packed( pack_t schema )
{
    return ( bool )
           ( schema & BLIS_PACK_PANEL_BIT );
}

// True when the format bitfield equals BLIS_BITVAL_1R.
BLIS_INLINE bool bli_is_1r_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R );
}

// True when the format bitfield equals BLIS_BITVAL_1E.
BLIS_INLINE bool bli_is_1e_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E );
}

// True for either of the 1r / 1e formats.
BLIS_INLINE bool bli_is_1m_packed( pack_t schema )
{
    return ( bool )
           ( bli_is_1r_packed( schema ) ||
             bli_is_1e_packed( schema ) );
}

// True when the format bitfield is all zeros.
BLIS_INLINE bool bli_is_nat_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 );
}

// True when any bit of the format bitfield is set.
BLIS_INLINE bool bli_is_ind_packed( pack_t schema )
{
    return ( bool )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 );
}

// Extracts the format bitfield and shifts it down so it starts at bit 0.
BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema )
{
    return ( guint_t )
           ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT );
}

// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument.
//
// Inputs:  diagoffa, diaga, uploa, m, n, rs_a, cs_a.
// Outputs: *uplo_eff, *n_elem_max, *n_iter, *inca, *lda, *ij0, *n_shift.
// When A is entirely unstored, only *uplo_eff (= BLIS_ZEROS), *ij0 and
// *n_shift (both 0) are written; the remaining outputs are left untouched.
// If the operand is row-tilted, the dimensions and increments are swapped
// and uplo/diagoff are reflected so the remaining logic runs on the swapped
// view.

BLIS_INLINE void bli_set_dims_incs_uplo_1m
     (
       doff_t  diagoffa, diag_t diaga, uplo_t uploa,
       dim_t   m,          dim_t  n,      inc_t  rs_a, inc_t  cs_a,
       uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t*  ij0,      dim_t* n_shift
     )
{
    // This is to prevent the compiler from warning about uninitialized
    // variables.
    *ij0     = 0;
    *n_shift = 0;

    // If matrix A is entirely "unstored", that is, if either:
    // - A is lower-stored and entirely above the diagonal, or
    // - A is upper-stored and entirely below the diagonal
    // then we mark the storage as implicitly zero.
    if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
    {
        *uplo_eff = BLIS_ZEROS;
    }
    else
    {
        doff_t diagoffa_use_ = diagoffa;
        doff_t diagoff_eff_;
        dim_t  n_iter_max_;

        // For a unit diagonal, adjust the working offset via the helper so
        // the diagonal itself is excluded from the stored region.
        if ( bli_is_unit_diag( diaga ) )
            bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

        // If matrix A is entirely "stored", that is, if either:
        // - A is upper-stored and entirely above the diagonal, or
        // - A is lower-stored and entirely below the diagonal
        // then we mark the storage as dense.
        if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
            uploa = BLIS_DENSE;

        n_iter_max_  = n;
        *n_elem_max   = m;
        *inca         = rs_a;
        *lda          = cs_a;
        *uplo_eff     = uploa;
        diagoff_eff_ = diagoffa_use_;

        if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) )
        {
            bli_swap_dims( &n_iter_max_, n_elem_max );
            bli_swap_incs( inca, lda );
            bli_toggle_uplo( uplo_eff );
            bli_negate_diag_offset( &diagoff_eff_ );
        }

        if ( bli_is_dense( *uplo_eff ) )
        {
            *n_iter = n_iter_max_;
        }
        else if ( bli_is_upper( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = 0;
                *n_shift    = -diagoff_eff_;
                *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
                *n_iter     = n_iter_max_;
            }
            else
            {
                *ij0        = diagoff_eff_;
                *n_shift    = 0;
                *n_iter     = n_iter_max_ - diagoff_eff_;
            }
        }
        else // if ( bli_is_lower( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = -diagoff_eff_;
                *n_shift    = 0;
                *n_elem_max = *n_elem_max + diagoff_eff_;
                *n_iter     = bli_min( *n_elem_max, bli_min( m, n ) );
            }
            else
            {
                *ij0        = 0;
                *n_shift    = diagoff_eff_;
                *n_iter     = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
            }
        }
    }
}

// Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix
// argument (without column-wise stride optimization).
//
// Identical to bli_set_dims_incs_uplo_1m() except that the row-tilted swap
// of dimensions/increments/uplo/diagoff is never performed.

BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap
     (
       doff_t  diagoffa, diag_t diaga, uplo_t uploa,
       dim_t   m,          dim_t  n,      inc_t  rs_a, inc_t  cs_a,
       uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda,
       dim_t*  ij0,      dim_t* n_shift
     )
{
    // This is to prevent the compiler from warning about uninitialized
    // variables.
    *ij0     = 0;
    *n_shift = 0;

    // If matrix A is entirely "unstored", that is, if either:
    // - A is lower-stored and entirely above the diagonal, or
    // - A is upper-stored and entirely below the diagonal
    // then we mark the storage as implicitly zero.
    if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) )
    {
        *uplo_eff = BLIS_ZEROS;
    }
    else
    {
        doff_t diagoffa_use_ = diagoffa;
        doff_t diagoff_eff_;
        dim_t  n_iter_max_;

        if ( bli_is_unit_diag( diaga ) )
            bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ );

        // If matrix A is entirely "stored", that is, if either:
        // - A is upper-stored and entirely above the diagonal, or
        // - A is lower-stored and entirely below the diagonal
        // then we mark the storage as dense.
        if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) )
            uploa = BLIS_DENSE;

        n_iter_max_  = n;
        *n_elem_max   = m;
        *inca         = rs_a;
        *lda          = cs_a;
        *uplo_eff     = uploa;
        diagoff_eff_ = diagoffa_use_;

        if ( bli_is_dense( *uplo_eff ) )
        {
            *n_iter = n_iter_max_;
        }
        else if ( bli_is_upper( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = 0;
                *n_shift    = -diagoff_eff_;
                *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) );
                *n_iter     = n_iter_max_;
            }
            else
            {
                *ij0        = diagoff_eff_;
                *n_shift    = 0;
                *n_iter     = n_iter_max_ - diagoff_eff_;
            }
        }
        else // if ( bli_is_lower( *uplo_eff ) )
        {
            if ( diagoff_eff_ < 0 )
            {
                *ij0        = -diagoff_eff_;
                *n_shift    = 0;
                *n_elem_max = *n_elem_max + diagoff_eff_;
                *n_iter     = bli_min( *n_elem_max, bli_min( m, n ) );
            }
            else
            {
                *ij0        = 0;
                *n_shift    = diagoff_eff_;
                *n_iter     = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) );
            }
        }
    }
}

// Set dimensions and increments for TWO matrix arguments.
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); } BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); } BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); } // NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, // packbuf_t is stored/used from the context in order to support various // induced methods. (Though ideally the packbuf_t field would only be // present in the control tree). BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); } BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc ); } BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj ) { bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); } BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj ) { bli_obj_apply_conj( BLIS_CONJUGATE, obj ); } BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; } // Root matrix query BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) { return ( obj_t* )( obj->root ); } BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( 
bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); } // Root matrix modification BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) { obj->root = obj; } // Diagonal offset query BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) ); } // Diagonal offset modification BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off = ( doff_t )offset; } BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj ) { obj->diag_off = -(obj->diag_off); } BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off += ( doff_t )offset; } // Dimension query BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) { return ( obj->dim[ mdim ] ); } BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) : bli_obj_length( obj ) ); } BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) ); } BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 ); } // Stride/increment query BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) { return ( obj->rs ); } BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) { return ( obj->cs ); } BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) { return ( obj->is ); } BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); } // Note: The purpose of these functions is to obtain the length and width // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) ); } BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 ); } // Dimension modification BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj ) { obj->dim[ BLIS_M ] = m; } BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj ) { obj->dim[ BLIS_N ] = n; } BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) { obj->dim[ mdim ] = dim_val; } BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) { if ( bli_does_notrans( trans ) ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } else // if ( bli_does_trans( trans ) ) { bli_obj_set_length( n, obj ); bli_obj_set_width( m, obj ); } } // Stride/increment predicates // // NOTE: The following two macros differ from their non-obj counterparts // in that they do not identify m x 1 and 1 x n objects as row-stored and // column-stored, respectively, which is needed when considering packed // objects. But this is okay, since none of the invocations of these // "obj" macros are used on packed matrices. 
// BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); } // Stride/increment modification BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) { obj->rs = rs; } BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) { obj->cs = cs; } BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_row_stride( rs, obj ); bli_obj_set_col_stride( cs, obj ); } BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) { obj->is = is; } // Offset query BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) { return ( obj->off[ mdim ] ); } // Offset modification BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] = offset; } BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_set_off( BLIS_M, offm, obj ); bli_obj_set_off( BLIS_N, offn, obj ); } BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] += offset; } BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_inc_off( BLIS_M, offm, obj ); bli_obj_inc_off( BLIS_N, offn, obj ); } // Diagonal offset predicates BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) { return ( 
bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); } // Buffer address query BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) { return ( void* ) ( obj->buffer ); } // Buffer address modification BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) { obj->buffer = p; } // Bufferless scalar field query BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); } // Bufferless scalar field modification BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); } // Element size modification BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) { obj->elem_size = size; } // Packed matrix info query BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) { return ( obj->m_padded ); } BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) { return ( obj->n_padded ); } // Packed matrix info modification BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj ) { obj->m_padded = m; } BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj ) { obj->n_padded = n; } BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) { 
bli_obj_set_padded_length( m, obj ); bli_obj_set_padded_width( n, obj ); } // Packed panel info query BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) { return ( obj->m_panel ); } BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) { return ( obj->n_panel ); } BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) { return ( obj->pd ); } BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) { return ( obj->ps ); } // Packed panel info modification BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj ) { obj->m_panel = m; } BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj ) { obj->n_panel = n; } BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_panel_length( m, obj ); bli_obj_set_panel_width( n, obj ); } BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) { obj->pd = pd; } BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) { obj->ps = ps; } // stor3_t-related BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); inc_t rs_a, cs_a; inc_t rs_b, cs_b; if ( bli_obj_has_notrans( a ) ) { rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else { rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else { rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b ); } // -- User-provided information macros -- // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { return obj->pack_fn; } BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker_fn; } BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { return obj->ker_params; } // Function pointer modification 
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) { const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); bli_obj_set_pack_schema( schema_b, a ); bli_obj_set_pack_schema( schema_a, b ); } // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. BLIS_INLINE void bli_obj_induce_trans( obj_t* obj ) { // Induce transposition among basic fields. dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); if ( bli_obj_is_upper_or_lower( obj ) ) bli_obj_toggle_uplo( obj ); // Induce transposition among packed fields. dim_t m_padded = bli_obj_padded_length( obj ); dim_t n_padded = bli_obj_padded_width( obj ); dim_t m_panel = bli_obj_panel_length( obj ); dim_t n_panel = bli_obj_panel_width( obj ); bli_obj_set_padded_dims( n_padded, m_padded, obj ); bli_obj_set_panel_dims( n_panel, m_panel, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj ) { // NOTE: This function is only used in situations where the matrices // are guaranteed to not have structure or be packed. // Induce transposition among basic fields. 
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif // end bli_obj_macro_defs.h // begin bli_complex_macro_defs.h #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define 
bli_zimag( x ) ( cimag(x) ) #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_complex_macro_defs.h // begin bli_scalar_macro_defs.h #ifndef BLIS_SCALAR_MACRO_DEFS_H #define BLIS_SCALAR_MACRO_DEFS_H // -- Assignment/Accessor macros -- // NOTE: This macro is defined first since some of the other scalar macros // use it to abstract away the method used to assign complex values (ie: // whether fields of a struct are set directly or whether native C99 // assignment is used). // begin bli_sets.h #ifndef BLIS_SETS_H #define BLIS_SETS_H // sets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sssets( xr, xi, y ) { (y) = (xr); } #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } #define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } #define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, 
xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif // end bli_sets.h // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. // begin bli_setrs.h #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// bli_??setrs( xr, y ) overwrites only the real part of y with xr.

// -- y is real: plain assignment --
#define bli_sssetrs( xr, y )  { (y) = (xr); }
#define bli_dssetrs( xr, y )  { (y) = (xr); }

#define bli_sdsetrs( xr, y )  { (y) = (xr); }
#define bli_ddsetrs( xr, y )  { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- y is a struct-based complex: assign the real field directly --
#define bli_scsetrs( xr, y )  { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y )  { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y )  { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y )  { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- y is a native C99 complex: rebuild y from xr and y's current
// imaginary part --
#define bli_scsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands, named after the type of y.
#define bli_ssetrs( xr, y )  bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y )  bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y )  bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y )  bli_dzsetrs( xr, y )

#endif
// end bli_setrs.h
// begin bli_setis.h

#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// bli_??setis( xi, y ) overwrites only the imaginary part of y with xi.

// -- y is real: there is no imaginary part, so these expand to an empty
// statement block --
#define bli_sssetis( xi, y )  { ; }
#define bli_dssetis( xi, y )  { ; }

#define bli_sdsetis( xi, y )  { ; }
#define bli_ddsetis( xi, y )  { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- y is a struct-based complex: assign the imaginary field directly --
#define bli_scsetis( xi, y )  { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y )  { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y )  { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y )  { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- y is a native C99 complex: rebuild y from y's current real part
// and xi --
#define bli_scsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands, named after the type of y.
#define bli_ssetis( xi, y )  bli_sssetis( xi, y )
#define bli_dsetis( xi, y )  bli_ddsetis( xi, y )
#define bli_csetis( xi, y )  bli_scsetis( xi, y )
#define bli_zsetis( xi, y )  bli_dzsetis( xi, y )

#endif
// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h

#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// bli_??gets( x, yr, yi ) splits x into a real part, stored in yr, and an
// imaginary part, stored in yi. Real sources yield a zero imaginary part
// (via bli_simag/bli_dimag); integer sources are cast to the floating-point
// type named by the second character and paired with a literal zero.

// -- y components are float --
#define bli_ssgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dsgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_csgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zsgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_isgets( x, yr, yi )  { (yr) = ( float )(x); (yi) = 0.0F; }

// -- y components are double --
#define bli_sdgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_ddgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_cdgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zdgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_idgets( x, yr, yi )  { (yr) = ( double )(x); (yi) = 0.0; }

// -- second char is c: same expansions as the float group --
#define bli_scgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dcgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_ccgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zcgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_icgets( x, yr, yi )  { (yr) = ( float )(x); (yi) = 0.0F; }

// -- second char is z: same expansions as the double group --
#define bli_szgets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dzgets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_czgets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zzgets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_izgets( x, yr, yi )  { (yr) = ( double )(x); (yi) = 0.0; }

// -- y components are integer: the imaginary part is always 0 --
#define bli_sigets( x, yr, yi )  { (yr) = bli_sreal(x); (yi) = 0; }
#define bli_digets( x, yr, yi )  { (yr) = bli_dreal(x); (yi) = 0; }
#define bli_cigets( x, yr, yi )  { (yr) = bli_creal(x); (yi) = 0; }
#define bli_zigets( x, yr, yi )  { (yr) = bli_zreal(x); (yi) = 0; }
#define bli_iigets( x, yr, yi )  { (yr) = (x); (yi) = 0; }

// Single-character shorthands, named after the type of x.
#define bli_sgets( x, yr, yi )  bli_ssgets( x, yr, yi )
#define bli_dgets( x, yr, yi )  bli_ddgets( x, yr, yi )
#define bli_cgets( x, yr, yi )  bli_csgets( x, yr, yi )
#define bli_zgets( x, yr, yi )  bli_zdgets( x, yr, yi )
#define bli_igets( x, yr, yi )  bli_idgets( x, yr, yi )

#endif
// end bli_gets.h

// -- Scalar constant initialization macros --

// begin bli_constants.h

#ifndef BLIS_CONSTANTS_H
#define BLIS_CONSTANTS_H

// return pointers to constants
//
// Each macro passes the address of one of the global constant objects
// (BLIS_ONE, BLIS_ZERO, BLIS_MINUS_ONE) to bli_obj_buffer_for_const() with
// the datatype named by the macro's type character, and casts the result to
// the matching C pointer type.

// 1

#define bli_s1 \
\
	( ( float*    ) bli_obj_buffer_for_const( BLIS_FLOAT,    &BLIS_ONE ) )

#define bli_d1 \
\
	( ( double*   ) bli_obj_buffer_for_const( BLIS_DOUBLE,   &BLIS_ONE ) )

#define bli_c1 \
\
	( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) )

#define bli_z1 \
\
	( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) )

#define bli_i1 \
\
	( ( gint_t*   ) bli_obj_buffer_for_const( BLIS_INT,      &BLIS_ONE ) )

// 0

#define bli_s0 \
\
	( ( float*    ) bli_obj_buffer_for_const( BLIS_FLOAT,    &BLIS_ZERO ) )

#define bli_d0 \
\
	( ( double*   ) bli_obj_buffer_for_const( BLIS_DOUBLE,   &BLIS_ZERO ) )

#define bli_c0 \
\
	( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) )

#define bli_z0 \
\
	( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) )

#define bli_i0 \
\
	( ( gint_t*   ) bli_obj_buffer_for_const( BLIS_INT,      &BLIS_ZERO ) )

// -1

#define bli_sm1 \
\
	( ( float*    ) bli_obj_buffer_for_const( BLIS_FLOAT,    &BLIS_MINUS_ONE ) )

#define bli_dm1 \
\
	( ( double*   ) bli_obj_buffer_for_const( BLIS_DOUBLE,   &BLIS_MINUS_ONE ) )

#define bli_cm1 \
\
	( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) )

#define bli_zm1 \
\
	( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) )

#define bli_im1 \
\
	( ( gint_t*   ) bli_obj_buffer_for_const( BLIS_INT,      &BLIS_MINUS_ONE ) )

#endif
// end bli_constants.h

// -- Separated scalar macros (separated real/imaginary values) --

// begin bli_absq2ris.h

#ifndef BLIS_ABSQ2RIS_H
#define BLIS_ABSQ2RIS_H

// absq2ris
//
// Store the squared magnitude of (ar, ai) in br. The real variants ignore ai
// and never write bi; the complex variants also set bi to zero.

#define bli_sabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar); \
}

#define bli_dabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar); \
}

#define bli_cabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar) + (ai) * (ai); \
	(bi) = 0.0F; \
}

#define bli_zabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar) + (ai) * (ai); \
	(bi) = 0.0; \
}

#endif
// end bli_absq2ris.h
// begin bli_abval2ris.h

#ifndef BLIS_ABVAL2RIS_H
#define BLIS_ABVAL2RIS_H

// abval2ris
//
// Store the absolute value (magnitude) of (xr, xi) in ar. The real variants
// ignore xi and never write ai. The complex variants scale by
// s = bli_fmaxabs( xr, xi ) before taking square roots, return zero outright
// when s is zero, and set ai to zero.
// NOTE: the complex variants declare block-local variables named s and mag;
// arguments that use those same identifiers would be shadowed.

#define bli_sabval2ris( xr, xi, ar, ai ) \
{ \
	(ar) = fabsf(xr); \
}

#define bli_dabval2ris( xr, xi, ar, ai ) \
{ \
	(ar) = fabs(xr); \
}

#define bli_cabval2ris( xr, xi, ar, ai ) \
{ \
	float  s   = bli_fmaxabs( (xr), (xi) ); \
	float  mag; \
	if ( s == 0.0F ) mag = 0.0F; \
	else \
	{ \
		mag = sqrtf( s ) * \
		      sqrtf( ( (xr) / s ) * (xr) + \
		             ( (xi) / s ) * (xi) ); \
	} \
	(ar)       = mag; \
	(ai)       = 0.0F; \
}

#define bli_zabval2ris( xr, xi, ar, ai ) \
{ \
	double s   = bli_fmaxabs( (xr), (xi) ); \
	double mag; \
	if ( s == 0.0 ) mag = 0.0; \
	else \
	{ \
		mag = sqrt( s ) * \
		      sqrt( ( (xr) / s ) * (xr) + \
		            ( (xi) / s ) * (xi) ); \
	} \
	(ar)       = mag; \
	(ai)       = 0.0; \
}

#endif
// end bli_abval2ris.h
// begin bli_addris.h

#ifndef BLIS_ADDRIS_H
#define BLIS_ADDRIS_H

// addris
//
// Add (ar, ai) to (xr, xi) in place. The real variants touch only xr.

#define bli_saddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
}

#define bli_daddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
}

#define bli_caddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
	(xi) = (xi) + (ai); \
}

#define bli_zaddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
	(xi) = (xi) + (ai); \
}

#endif
// end bli_addris.h
// begin bli_addjris.h

#ifndef BLIS_ADDJRIS_H
#define BLIS_ADDJRIS_H

// addjris
//
// Conjugating add: forwards to the matching addris macro with the imaginary
// part of a negated.

#define bli_saddjris( ar, ai, xr, xi )  bli_saddris( (ar), -(ai), (xr), (xi) )
#define bli_daddjris( ar, ai, xr, xi )  bli_daddris( (ar), -(ai), (xr), (xi) )
#define bli_caddjris( ar, ai, xr, xi )  bli_caddris( (ar), -(ai), (xr), (xi) )
#define bli_zaddjris( ar, ai, xr, xi )  bli_zaddris( (ar), -(ai), (xr), (xi) )

#endif
// end bli_addjris.h
// begin bli_add3ris.h

#ifndef BLIS_ADD3RIS_H
#define BLIS_ADD3RIS_H

// add3ris
//
// Three-operand add: (cr, ci) = (ar, ai) + (br, bi). The real variants write
// only cr.

#define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
}

#define bli_dadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
}

#define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
	(ci) = (ai) + (bi); \
}

#define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
	(ci) = (ai) + (bi); \
}

#endif
// end bli_add3ris.h
// begin bli_axpbyris.h

#ifndef BLIS_AXPBYRIS_H
#define BLIS_AXPBYRIS_H

// axpbyris
//
// y := a * x + b * y on separated real/imaginary components.

// Real-only form: uses and updates only the real components.
#define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \
{ \
	(yr) = (ar) * (xr) + (br) * (yr); \
}

// Full complex form: both results are computed into temporaries (typed via
// __typeof__ of the destination) before either destination is overwritten.
#define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \
{ \
	const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \
	const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \
	(yr) = yt_r; \
	(yi) = yt_i; \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of b.
// - The fourth char encodes the type of y.

// -- (axby) = (??ss) ----------------------------------------------------------

// NOTE(review): the names in this series end in "xpbyris" and they all
// expand to bli_rxxpbyris, but the base macros defined above are named
// bli_rxaxpbyris and bli_cxaxpbyris (with an 'a'). No bli_rxxpbyris is
// defined in the visible source, so these aliases look misspelled; confirm
// before relying on any of them.
#define bli_ssssxpbyris  bli_rxxpbyris
#define bli_dsssxpbyris  bli_rxxpbyris
#define bli_csssxpbyris  bli_rxxpbyris
#define bli_zsssxpbyris  bli_rxxpbyris

#define bli_sdssxpbyris  bli_rxxpbyris
#define bli_ddssxpbyris  bli_rxxpbyris
#define bli_cdssxpbyris  bli_rxxpbyris
#define bli_zdssxpbyris  bli_rxxpbyris

#define bli_scssxpbyris  bli_rxxpbyris
#define bli_dcssxpbyris  bli_rxxpbyris
#define bli_ccssxpbyris  bli_rxxpbyris
#define bli_zcssxpbyris  bli_rxxpbyris

#define bli_szssxpbyris  bli_rxxpbyris
#define bli_dzssxpbyris  bli_rxxpbyris
#define bli_czssxpbyris  bli_rxxpbyris
#define bli_zzssxpbyris  bli_rxxpbyris

// NOTE: This series needs to be finished for all other char values for (by), but
// not until something in BLIS actually needs mixed-datatype axpbyris.
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
#define bli_saxpbyjris bli_ssssaxpbyjris #define bli_daxpbyjris bli_ddddaxpbyjris #define bli_caxpbyjris bli_ccccaxpbyjris #define bli_zaxpbyjris bli_zzzzaxpbyjris #endif // end bli_axpbyjris.h // begin bli_axpyris.h #ifndef BLIS_AXPYRIS_H #define BLIS_AXPYRIS_H // axpyris #define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ } #define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr) - (ai) * (xi); \ (yi) += (ai) * (xr) + (ar) * (xi); \ } #define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr) - (ai) * (xi); \ } #define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ar) * (xi); \ } #define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyris bli_rxaxpyris #define bli_dssaxpyris bli_rxaxpyris #define bli_cssaxpyris bli_rxaxpyris #define bli_zssaxpyris bli_rxaxpyris #define bli_sdsaxpyris bli_rxaxpyris #define bli_ddsaxpyris bli_rxaxpyris #define bli_cdsaxpyris bli_rxaxpyris #define bli_zdsaxpyris bli_rxaxpyris #define bli_scsaxpyris bli_rxaxpyris #define bli_dcsaxpyris bli_rxaxpyris #define bli_ccsaxpyris bli_roaxpyris #define bli_zcsaxpyris bli_roaxpyris #define bli_szsaxpyris bli_rxaxpyris #define bli_dzsaxpyris bli_rxaxpyris #define bli_czsaxpyris bli_roaxpyris #define bli_zzsaxpyris bli_roaxpyris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyris bli_rxaxpyris #define bli_dsdaxpyris bli_rxaxpyris #define bli_csdaxpyris bli_rxaxpyris #define bli_zsdaxpyris bli_rxaxpyris #define bli_sddaxpyris bli_rxaxpyris #define bli_dddaxpyris bli_rxaxpyris #define bli_cddaxpyris bli_rxaxpyris #define bli_zddaxpyris bli_rxaxpyris #define bli_scdaxpyris 
bli_rxaxpyris #define bli_dcdaxpyris bli_rxaxpyris #define bli_ccdaxpyris bli_roaxpyris #define bli_zcdaxpyris bli_roaxpyris #define bli_szdaxpyris bli_rxaxpyris #define bli_dzdaxpyris bli_rxaxpyris #define bli_czdaxpyris bli_roaxpyris #define bli_zzdaxpyris bli_roaxpyris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyris bli_rxaxpyris #define bli_dscaxpyris bli_rxaxpyris #define bli_cscaxpyris bli_rcaxpyris #define bli_zscaxpyris bli_rcaxpyris #define bli_sdcaxpyris bli_rxaxpyris #define bli_ddcaxpyris bli_rxaxpyris #define bli_cdcaxpyris bli_rcaxpyris #define bli_zdcaxpyris bli_rcaxpyris #define bli_sccaxpyris bli_craxpyris #define bli_dccaxpyris bli_craxpyris #define bli_cccaxpyris bli_cxaxpyris #define bli_zccaxpyris bli_cxaxpyris #define bli_szcaxpyris bli_craxpyris #define bli_dzcaxpyris bli_craxpyris #define bli_czcaxpyris bli_cxaxpyris #define bli_zzcaxpyris bli_cxaxpyris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyris bli_rxaxpyris #define bli_dszaxpyris bli_rxaxpyris #define bli_cszaxpyris bli_rcaxpyris #define bli_zszaxpyris bli_rcaxpyris #define bli_sdzaxpyris bli_rxaxpyris #define bli_ddzaxpyris bli_rxaxpyris #define bli_cdzaxpyris bli_rcaxpyris #define bli_zdzaxpyris bli_rcaxpyris #define bli_sczaxpyris bli_craxpyris #define bli_dczaxpyris bli_craxpyris #define bli_cczaxpyris bli_cxaxpyris #define bli_zczaxpyris bli_cxaxpyris #define bli_szzaxpyris bli_craxpyris #define bli_dzzaxpyris bli_craxpyris #define bli_czzaxpyris bli_cxaxpyris #define bli_zzzaxpyris bli_cxaxpyris #define bli_saxpyris bli_sssaxpyris #define bli_daxpyris bli_dddaxpyris #define bli_caxpyris bli_cccaxpyris #define bli_zaxpyris bli_zzzaxpyris #endif // end bli_axpyris.h // begin bli_axpyjris.h #ifndef BLIS_AXPYJRIS_H #define BLIS_AXPYJRIS_H // axpyjris #define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ } #define bli_cxaxpyjris( ar, ai, xr, xi, yr, 
yi ) \ { \ (yr) += (ar) * (xr) + (ai) * (xi); \ (yi) += (ai) * (xr) - (ar) * (xi); \ } #define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr) + (ai) * (xi); \ } #define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ar) * -(xi); \ } #define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) += (ar) * (xr); \ (yi) += (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyjris bli_rxaxpyjris #define bli_dssaxpyjris bli_rxaxpyjris #define bli_cssaxpyjris bli_rxaxpyjris #define bli_zssaxpyjris bli_rxaxpyjris #define bli_sdsaxpyjris bli_rxaxpyjris #define bli_ddsaxpyjris bli_rxaxpyjris #define bli_cdsaxpyjris bli_rxaxpyjris #define bli_zdsaxpyjris bli_rxaxpyjris #define bli_scsaxpyjris bli_rxaxpyjris #define bli_dcsaxpyjris bli_rxaxpyjris #define bli_ccsaxpyjris bli_roaxpyjris #define bli_zcsaxpyjris bli_roaxpyjris #define bli_szsaxpyjris bli_rxaxpyjris #define bli_dzsaxpyjris bli_rxaxpyjris #define bli_czsaxpyjris bli_roaxpyjris #define bli_zzsaxpyjris bli_roaxpyjris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyjris bli_rxaxpyjris #define bli_dsdaxpyjris bli_rxaxpyjris #define bli_csdaxpyjris bli_rxaxpyjris #define bli_zsdaxpyjris bli_rxaxpyjris #define bli_sddaxpyjris bli_rxaxpyjris #define bli_dddaxpyjris bli_rxaxpyjris #define bli_cddaxpyjris bli_rxaxpyjris #define bli_zddaxpyjris bli_rxaxpyjris #define bli_scdaxpyjris bli_rxaxpyjris #define bli_dcdaxpyjris bli_rxaxpyjris #define bli_ccdaxpyjris bli_roaxpyjris #define bli_zcdaxpyjris bli_roaxpyjris #define bli_szdaxpyjris bli_rxaxpyjris #define bli_dzdaxpyjris bli_rxaxpyjris #define bli_czdaxpyjris bli_roaxpyjris #define bli_zzdaxpyjris bli_roaxpyjris // -- (axy) = (??c) 
------------------------------------------------------------ #define bli_sscaxpyjris bli_rxaxpyjris #define bli_dscaxpyjris bli_rxaxpyjris #define bli_cscaxpyjris bli_rcaxpyjris #define bli_zscaxpyjris bli_rcaxpyjris #define bli_sdcaxpyjris bli_rxaxpyjris #define bli_ddcaxpyjris bli_rxaxpyjris #define bli_cdcaxpyjris bli_rcaxpyjris #define bli_zdcaxpyjris bli_rcaxpyjris #define bli_sccaxpyjris bli_craxpyjris #define bli_dccaxpyjris bli_craxpyjris #define bli_cccaxpyjris bli_cxaxpyjris #define bli_zccaxpyjris bli_cxaxpyjris #define bli_szcaxpyjris bli_craxpyjris #define bli_dzcaxpyjris bli_craxpyjris #define bli_czcaxpyjris bli_cxaxpyjris #define bli_zzcaxpyjris bli_cxaxpyjris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyjris bli_rxaxpyjris #define bli_dszaxpyjris bli_rxaxpyjris #define bli_cszaxpyjris bli_rcaxpyjris #define bli_zszaxpyjris bli_rcaxpyjris #define bli_sdzaxpyjris bli_rxaxpyjris #define bli_ddzaxpyjris bli_rxaxpyjris #define bli_cdzaxpyjris bli_rcaxpyjris #define bli_zdzaxpyjris bli_rcaxpyjris #define bli_sczaxpyjris bli_craxpyjris #define bli_dczaxpyjris bli_craxpyjris #define bli_cczaxpyjris bli_cxaxpyjris #define bli_zczaxpyjris bli_cxaxpyjris #define bli_szzaxpyjris bli_craxpyjris #define bli_dzzaxpyjris bli_craxpyjris #define bli_czzaxpyjris bli_cxaxpyjris #define bli_zzzaxpyjris bli_cxaxpyjris #define bli_saxpyjris bli_sssaxpyjris #define bli_daxpyjris bli_dddaxpyjris #define bli_caxpyjris bli_cccaxpyjris #define bli_zaxpyjris bli_zzzaxpyjris #endif // end bli_axpyjris.h // begin bli_axmyris.h #ifndef BLIS_AXMYRIS_H #define BLIS_AXMYRIS_H // axmyris #define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ } #define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ } #define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr) - (ai) * (xi); \ (yi) -= (ai) * (xr) + (ar) * (xi); \ } #define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= 
(ar) * (xr) - (ai) * (xi); \ (yi) -= (ai) * (xr) + (ar) * (xi); \ } #define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ (yi) -= (ar) * (xi); \ } #define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) -= (ar) * (xr); \ (yi) -= (ar) * (xi); \ } #endif // end bli_axmyris.h // begin bli_conjris.h #ifndef BLIS_CONJRIS_H #define BLIS_CONJRIS_H // conjris #define bli_sconjris( xr, xi ) \ { \ ; \ } #define bli_dconjris( xr, xi ) \ { \ ; \ } #define bli_cconjris( xr, xi ) \ { \ (xi) = -(xi); \ } #define bli_zconjris( xr, xi ) \ { \ (xi) = -(xi); \ } #endif // end bli_conjris.h // begin bli_copyris.h #ifndef BLIS_COPYRIS_H #define BLIS_COPYRIS_H // copyris #define bli_scopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ } #define bli_dcopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ } #define bli_ccopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ (bi) = (ai); \ } #define bli_zcopyris( ar, ai, br, bi ) \ { \ (br) = (ar); \ (bi) = (ai); \ } #define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi ) #define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi ) #define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) #define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi ) #define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi ) #define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi ) #define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) #define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi ) #define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi ) #define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi ) #define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) #define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi ) #define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi ) #define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi ) #define bli_czcopyris( ar, ai, 
br, bi ) bli_zcopyris( ar, ai, br, bi ) #define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi ) #endif // end bli_copyris.h // begin bli_copyjris.h #ifndef BLIS_COPYJRIS_H #define BLIS_COPYJRIS_H // copyjris #define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) ) #define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) ) #define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) ) #define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) ) #define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi ) #define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi ) #define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) #define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi ) #define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi ) #define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi ) #define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) #define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi ) #define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi ) #define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi ) #define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) #define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi ) #define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi ) #define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi ) #define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) #define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi ) #endif // end bli_copyjris.h // begin bli_copycjris.h #ifndef BLIS_COPYCJRIS_H #define BLIS_COPYCJRIS_H // copycjris #define bli_scopycjris( conj, xr, xi, yr, yi ) \ { \ bli_scopyris( (xr), (xi), (yr), (yi) ); \ } #define bli_dcopycjris( conj, xr, xi, yr, yi ) \ { \ 
bli_dcopyris( (xr), (xi), (yr), (yi) ); \ } #define bli_ccopycjris( conj, xr, xi, yr, yi ) \ { \ (yr) = (xr); \ (yi) = ( bli_is_conj( conj ) ? -(xi) \ : (xi) ); \ } #define bli_zcopycjris( conj, xr, xi, yr, yi ) \ { \ (yr) = (xr); \ (yi) = ( bli_is_conj( conj ) ? -(xi) \ : (xi) ); \ } #define bli_icopycjris( conj, xr, xi, yr, yi ) \ { \ bli_icopyris( (xr), (xi), (yr), (yi) ); \ } #endif // end bli_copycjris.h // begin bli_eqris.h #ifndef BLIS_EQRIS_H #define BLIS_EQRIS_H // eqris (passed by value) #define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) ) #define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) ) #define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) ) #define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) ) #define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) ) // eq1ris #define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F ) #define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 ) #define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F ) #define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 ) #define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 ) // eq0ris #define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F ) #define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 ) #define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F ) #define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 ) #define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 ) // eqm1ris #define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F ) #define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 ) #define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F ) #define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 ) #define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 ) #endif // end bli_eqris.h // begin bli_invertris.h #ifndef BLIS_INVERTRIS_H #define BLIS_INVERTRIS_H // invertris #define bli_sinvertris( xr, xi ) \ { \ (xr) = 1.0F / (xr); \ } 
#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Mixed-datatype dispatch table for scal2ris. Each three-letter prefix
// (types of a, x, y in that order) is bound to one of the generic kernels
// bli_{rx,ro,rc,cr,cx}scal2ris.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2ris  bli_rxscal2ris
#define bli_dssscal2ris  bli_rxscal2ris
#define bli_cssscal2ris  bli_rxscal2ris
#define bli_zssscal2ris  bli_rxscal2ris

#define bli_sdsscal2ris  bli_rxscal2ris
#define bli_ddsscal2ris  bli_rxscal2ris
#define bli_cdsscal2ris  bli_rxscal2ris
#define bli_zdsscal2ris  bli_rxscal2ris

#define bli_scsscal2ris  bli_rxscal2ris
#define bli_dcsscal2ris  bli_rxscal2ris
#define bli_ccsscal2ris  bli_roscal2ris
#define bli_zcsscal2ris  bli_roscal2ris

#define bli_szsscal2ris  bli_rxscal2ris
#define bli_dzsscal2ris  bli_rxscal2ris
#define bli_czsscal2ris  bli_roscal2ris
#define bli_zzsscal2ris  bli_roscal2ris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2ris  bli_rxscal2ris
#define bli_dsdscal2ris  bli_rxscal2ris
#define bli_csdscal2ris  bli_rxscal2ris
#define bli_zsdscal2ris  bli_rxscal2ris

#define bli_sddscal2ris  bli_rxscal2ris
#define bli_dddscal2ris  bli_rxscal2ris
#define bli_cddscal2ris  bli_rxscal2ris
#define bli_zddscal2ris  bli_rxscal2ris

#define bli_scdscal2ris  bli_rxscal2ris
#define bli_dcdscal2ris  bli_rxscal2ris
#define bli_ccdscal2ris  bli_roscal2ris
#define bli_zcdscal2ris  bli_roscal2ris

#define bli_szdscal2ris  bli_rxscal2ris
#define bli_dzdscal2ris  bli_rxscal2ris
#define bli_czdscal2ris  bli_roscal2ris
#define bli_zzdscal2ris  bli_roscal2ris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2ris  bli_rxscal2ris
#define bli_dscscal2ris  bli_rxscal2ris
#define bli_cscscal2ris  bli_rcscal2ris
#define bli_zscscal2ris  bli_rcscal2ris

#define bli_sdcscal2ris  bli_rxscal2ris
#define bli_ddcscal2ris  bli_rxscal2ris
#define bli_cdcscal2ris  bli_rcscal2ris
#define bli_zdcscal2ris  bli_rcscal2ris

#define bli_sccscal2ris  bli_crscal2ris
#define bli_dccscal2ris  bli_crscal2ris
#define bli_cccscal2ris  bli_cxscal2ris
#define bli_zccscal2ris  bli_cxscal2ris

#define bli_szcscal2ris  bli_crscal2ris
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Mixed-datatype dispatch table for scal2jris. Each three-letter prefix
// (types of a, x, y in that order) forwards its six arguments unchanged to
// one of the generic kernels bli_{rx,ro,rc,cr,cx}scal2jris.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi )  bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi )  bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi )  bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi )  bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \
{ \
	const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \
	const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \
	(yr) = yt_r; \
	(yi) = yt_i; \
}

// Complex x and y, real b: y := x + br * y. (bi) is not referenced. Both
// results are computed into temporaries before (yr)/(yi) are overwritten.
#define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \
{ \
	const __typeof__(yr) yt_r = (xr) + (br) * (yr); \
	const __typeof__(yi) yt_i = (xi) + (br) * (yi); \
	(yr) = yt_r; \
	(yi) = yt_i; \
}

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// - Each name below is an alias for one of the three kernels:
//     rx = real-only update of (yr),
//     cr = complex y with real b,
//     cx = complex y with complex b.

// -- (xby) = (??s) ------------------------------------------------------------

#define bli_sssxpbyris bli_rxxpbyris
#define bli_dssxpbyris bli_rxxpbyris
#define bli_cssxpbyris bli_rxxpbyris
#define bli_zssxpbyris bli_rxxpbyris

#define bli_sdsxpbyris bli_rxxpbyris
#define bli_ddsxpbyris bli_rxxpbyris
#define bli_cdsxpbyris bli_rxxpbyris
#define bli_zdsxpbyris bli_rxxpbyris

#define bli_scsxpbyris bli_rxxpbyris
#define bli_dcsxpbyris bli_rxxpbyris
#define bli_ccsxpbyris bli_rxxpbyris
#define bli_zcsxpbyris bli_rxxpbyris

#define bli_szsxpbyris bli_rxxpbyris
#define bli_dzsxpbyris bli_rxxpbyris
#define bli_czsxpbyris bli_rxxpbyris
#define bli_zzsxpbyris bli_rxxpbyris

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbyris bli_rxxpbyris
#define bli_dsdxpbyris bli_rxxpbyris
#define bli_csdxpbyris bli_rxxpbyris
#define bli_zsdxpbyris bli_rxxpbyris

#define bli_sddxpbyris bli_rxxpbyris
#define bli_dddxpbyris bli_rxxpbyris
#define bli_cddxpbyris bli_rxxpbyris
#define bli_zddxpbyris bli_rxxpbyris

#define bli_scdxpbyris bli_rxxpbyris
#define bli_dcdxpbyris bli_rxxpbyris
#define bli_ccdxpbyris bli_rxxpbyris
#define bli_zcdxpbyris bli_rxxpbyris

#define bli_szdxpbyris bli_rxxpbyris
#define bli_dzdxpbyris bli_rxxpbyris
#define bli_czdxpbyris bli_rxxpbyris
#define bli_zzdxpbyris bli_rxxpbyris

// -- (xby) = (??c) ------------------------------------------------------------

// NOTE(review): for real x and real b (ss?, ds?, sd?, dd?) with complex y,
// the rx kernel writes only (yr); (yi) is left as-is rather than scaled by
// (br). Confirm that this is the intended mixed-domain behavior.

#define bli_sscxpbyris bli_rxxpbyris
#define bli_dscxpbyris bli_rxxpbyris
#define bli_cscxpbyris bli_crxpbyris
#define bli_zscxpbyris bli_crxpbyris

#define bli_sdcxpbyris bli_rxxpbyris
#define bli_ddcxpbyris bli_rxxpbyris
#define bli_cdcxpbyris bli_crxpbyris
#define bli_zdcxpbyris bli_crxpbyris

#define bli_sccxpbyris bli_cxxpbyris
#define bli_dccxpbyris bli_cxxpbyris
#define bli_cccxpbyris bli_cxxpbyris
#define bli_zccxpbyris bli_cxxpbyris

#define bli_szcxpbyris bli_cxxpbyris
#define bli_dzcxpbyris bli_cxxpbyris
#define bli_czcxpbyris bli_cxxpbyris
#define bli_zzcxpbyris bli_cxxpbyris

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbyris bli_rxxpbyris
#define bli_dszxpbyris bli_rxxpbyris
#define bli_cszxpbyris bli_crxpbyris
#define bli_zszxpbyris bli_crxpbyris

#define bli_sdzxpbyris bli_rxxpbyris
#define bli_ddzxpbyris bli_rxxpbyris
#define bli_cdzxpbyris bli_crxpbyris
#define bli_zdzxpbyris bli_crxpbyris

#define bli_sczxpbyris bli_cxxpbyris
#define bli_dczxpbyris bli_cxxpbyris
#define bli_cczxpbyris bli_cxxpbyris
#define bli_zczxpbyris bli_cxxpbyris

#define bli_szzxpbyris bli_cxxpbyris
#define bli_dzzxpbyris bli_cxxpbyris
#define bli_czzxpbyris bli_cxxpbyris
#define bli_zzzxpbyris bli_cxxpbyris


// Homogeneous-datatype shorthands.
#define bli_sxpbyris bli_sssxpbyris
#define bli_dxpbyris bli_dddxpbyris
#define bli_cxpbyris bli_cccxpbyris
#define bli_zxpbyris bli_zzzxpbyris

#endif
// end bli_xpbyris.h
// begin bli_xpbyjris.h


#ifndef BLIS_XPBYJRIS_H
#define BLIS_XPBYJRIS_H

// xpbyjris
// Same three kernels as xpbyris, but x enters conjugated: the complex
// kernels use -(xi) where xpbyris uses (xi).

// Real-only kernel: (yr) := (xr) + (br) * (yr). The imaginary arguments are
// not referenced, so conjugation has no effect here.
#define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \
{ \
	(yr) = (xr) + (br) * (yr); \
}

// Complex x, b and y: y := conj(x) + b * y. Results go through temporaries
// so that both components are computed from the old value of y.
#define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \
{ \
	const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \
	const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \
	(yr) = yt_r; \
	(yi) = yt_i; \
}

// Complex x and y, real b: y := conj(x) + br * y. (bi) is not referenced.
#define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \
{ \
	const __typeof__(yr) yt_r = (xr) + (br) * (yr); \
	const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \
	(yr) = yt_r; \
	(yi) = yt_i; \
}

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.

// -- (xby) = (??s) ------------------------------------------------------------

#define bli_sssxpbyjris bli_rxxpbyjris
#define bli_dssxpbyjris bli_rxxpbyjris
#define bli_cssxpbyjris bli_rxxpbyjris
#define bli_zssxpbyjris bli_rxxpbyjris

#define bli_sdsxpbyjris bli_rxxpbyjris
#define bli_ddsxpbyjris bli_rxxpbyjris
#define bli_cdsxpbyjris bli_rxxpbyjris
#define bli_zdsxpbyjris bli_rxxpbyjris

#define bli_scsxpbyjris bli_rxxpbyjris
#define bli_dcsxpbyjris bli_rxxpbyjris
#define bli_ccsxpbyjris bli_rxxpbyjris
#define bli_zcsxpbyjris bli_rxxpbyjris

#define bli_szsxpbyjris bli_rxxpbyjris
#define bli_dzsxpbyjris bli_rxxpbyjris
#define bli_czsxpbyjris bli_rxxpbyjris
#define bli_zzsxpbyjris bli_rxxpbyjris

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbyjris bli_rxxpbyjris
#define bli_dsdxpbyjris bli_rxxpbyjris
#define bli_csdxpbyjris bli_rxxpbyjris
#define bli_zsdxpbyjris bli_rxxpbyjris

#define bli_sddxpbyjris bli_rxxpbyjris
#define bli_dddxpbyjris bli_rxxpbyjris
#define bli_cddxpbyjris bli_rxxpbyjris
#define bli_zddxpbyjris bli_rxxpbyjris

#define bli_scdxpbyjris bli_rxxpbyjris
#define bli_dcdxpbyjris bli_rxxpbyjris
#define bli_ccdxpbyjris bli_rxxpbyjris
#define bli_zcdxpbyjris bli_rxxpbyjris

#define bli_szdxpbyjris bli_rxxpbyjris
#define bli_dzdxpbyjris bli_rxxpbyjris
#define bli_czdxpbyjris bli_rxxpbyjris
#define bli_zzdxpbyjris bli_rxxpbyjris

// -- (xby) = (??c) ------------------------------------------------------------

// NOTE(review): as in xpbyris, the real-x/real-b entries use the rx kernel,
// which leaves (yi) untouched. Confirm that this is intended.

#define bli_sscxpbyjris bli_rxxpbyjris
#define bli_dscxpbyjris bli_rxxpbyjris
#define bli_cscxpbyjris bli_crxpbyjris
#define bli_zscxpbyjris bli_crxpbyjris

#define bli_sdcxpbyjris bli_rxxpbyjris
#define bli_ddcxpbyjris bli_rxxpbyjris
#define bli_cdcxpbyjris bli_crxpbyjris
#define bli_zdcxpbyjris bli_crxpbyjris

#define bli_sccxpbyjris bli_cxxpbyjris
#define bli_dccxpbyjris bli_cxxpbyjris
#define bli_cccxpbyjris 
bli_cxxpbyjris #define bli_zccxpbyjris bli_cxxpbyjris #define bli_szcxpbyjris bli_cxxpbyjris #define bli_dzcxpbyjris bli_cxxpbyjris #define bli_czcxpbyjris bli_cxxpbyjris #define bli_zzcxpbyjris bli_cxxpbyjris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjris bli_rxxpbyjris #define bli_dszxpbyjris bli_rxxpbyjris #define bli_cszxpbyjris bli_crxpbyjris #define bli_zszxpbyjris bli_crxpbyjris #define bli_sdzxpbyjris bli_rxxpbyjris #define bli_ddzxpbyjris bli_rxxpbyjris #define bli_cdzxpbyjris bli_crxpbyjris #define bli_zdzxpbyjris bli_crxpbyjris #define bli_sczxpbyjris bli_cxxpbyjris #define bli_dczxpbyjris bli_cxxpbyjris #define bli_cczxpbyjris bli_cxxpbyjris #define bli_zczxpbyjris bli_cxxpbyjris #define bli_szzxpbyjris bli_cxxpbyjris #define bli_dzzxpbyjris bli_cxxpbyjris #define bli_czzxpbyjris bli_cxxpbyjris #define bli_zzzxpbyjris bli_cxxpbyjris #define bli_sxpbyjris bli_sssxpbyjris #define bli_dxpbyjris bli_dddxpbyjris #define bli_cxpbyjris bli_cccxpbyjris #define bli_zxpbyjris bli_zzzxpbyjris #endif // end bli_xpbyjris.h // Inlined scalar macros in loops // begin bli_scal2ris_mxn.h #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j 
)*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif // end bli_scal2ris_mxn.h // begin bli_scalris_mxn_uplo.h #ifndef 
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) ) #define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) ) #define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zcabsq2s( x, a ) 
bli_zcsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) ) #define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) ) #define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a ) #define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a ) #define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a ) #define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a ) #endif // end bli_absq2s.h // begin bli_abval2s.h #ifndef BLIS_ABVAL2S_H #define BLIS_ABVAL2S_H // abval2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabval2s( x, a ) 
bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) ) #define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) ) #define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) ) #define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) ) #define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) ) #define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) ) #define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) ) #define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) ) #define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) ) #define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) ) #define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) ) #define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) ) #define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) ) #define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) ) #define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) ) #define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabval2s( x, a ) bli_ssabval2s( x, a ) #define bli_dabval2s( x, a ) bli_ddabval2s( x, a ) #define bli_cabval2s( x, a ) bli_ccabval2s( x, a ) #define bli_zabval2s( x, a ) bli_zzabval2s( x, a ) #endif // end bli_abval2s.h // begin bli_adds.h #ifndef BLIS_ADDS_H #define BLIS_ADDS_H // adds // Notes: // - The first char 
encodes the type of a. // - The second char encodes the type of y. #define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) { (y) += (a); } #define bli_dcadds( a, y ) { (y) += (a); } #define bli_ccadds( a, y ) { (y) += (a); } #define bli_zcadds( a, y ) { (y) += (a); } #define bli_szadds( a, y ) { (y) += (a); } #define bli_dzadds( a, y ) { (y) += (a); } #define bli_czadds( a, y ) { (y) += (a); } #define 
bli_zzadds( a, y ) { (y) += (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadds( a, y ) bli_ssadds( a, y ) #define bli_dadds( a, y ) bli_ddadds( a, y ) #define bli_cadds( a, y ) bli_ccadds( a, y ) #define bli_zadds( a, y ) bli_zzadds( a, y ) #endif // end bli_adds.h // begin bli_addjs.h #ifndef BLIS_ADDJS_H #define BLIS_ADDJS_H // addjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzaddjs( a, y ) 
bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) { (y) += (a); } #define bli_dcaddjs( a, y ) { (y) += (a); } #define bli_ccaddjs( a, y ) { (y) += conjf(a); } #define bli_zcaddjs( a, y ) { (y) += conj (a); } #define bli_szaddjs( a, y ) { (y) += (a); } #define bli_dzaddjs( a, y ) { (y) += (a); } #define bli_czaddjs( a, y ) { (y) += conjf(a); } #define bli_zzaddjs( a, y ) { (y) += conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saddjs( a, y ) bli_ssaddjs( a, y ) #define bli_daddjs( a, y ) bli_ddaddjs( a, y ) #define bli_caddjs( a, y ) bli_ccaddjs( a, y ) #define bli_zaddjs( a, y ) bli_zzaddjs( a, y ) #endif // end bli_addjs.h // begin bli_add3s.h #ifndef BLIS_ADD3S_H #define BLIS_ADD3S_H // add3s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of b. // - The third char encodes the type of c. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), 
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), 
bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define 
bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (abc) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zscadd3s( a, 
b, c ) { (c) = (a) + (b); } #define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); } // -- (abc) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c ) #define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c ) #define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c ) #define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c ) #endif // end bli_add3s.h // begin bli_axpbys.h #ifndef BLIS_AXPBYS_H #define BLIS_AXPBYS_H // axpbys // Notes: // - The first char encodes the type of a. 
// - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), 
bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), 
bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), 
bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccscaxpbys( 
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y ) #define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y ) #define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y ) #define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y ) #endif // end bli_axpbys.h // begin bli_axpbyjs.h #ifndef BLIS_AXPBYJS_H #define BLIS_AXPBYJS_H // axpbyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), 
bli_simag(y) ) #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) /* (???d) group: each bli_<a><x><b>daxpbyjs( a, x, b, y ) below forwards to bli_rxaxpbyjris(), splitting a, x, b and y into real/imaginary parts with the bli_[sdcz]real()/bli_[sdcz]imag() accessors selected by the four type letters in the macro name (in a, x, b, y order); y always uses the 'd' accessors here. bli_rxaxpbyjris() itself is defined outside this group. */ // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) /* The (???c) and (???z) groups are defined component-wise only when BLIS_ENABLE_C99_COMPLEX is not defined: each macro forwards to bli_cxaxpbyjris() with real/imaginary parts extracted by the bli_[sdcz]real()/bli_[sdcz]imag() accessors matching the type letters of a, x, b, y in the macro name. The #else branch supplies alternative definitions for the same names. */ #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) /* (???z) group: each bli_<a><x><b>zaxpbyjs( a, x, b, y ) below forwards to bli_cxaxpbyjris() with a, x and b split by the bli_[sdcz]real()/bli_[sdcz]imag() accessors named by the first three type letters, and y split by bli_zreal()/bli_zimag(). */ // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) /* C99-complex variants: when BLIS_ENABLE_C99_COMPLEX is defined the macros are written as direct expressions of the form (y) = (a) * X + (b) * (y), where X is (x) when the x type letter is s or d, conjf(x) when it is c, and conj(x) when it is z. Each expands to a braced statement block and names y twice, so the y argument is evaluated twice. */ #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) #define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) #define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) #define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) #endif // end bli_axpbyjs.h // begin bli_axpys.h #ifndef BLIS_AXPYS_H #define BLIS_AXPYS_H // axpys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), 
bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } #define 
bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) #define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) #define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) #define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) #endif // end bli_axpys.h // begin bli_axpyjs.h #ifndef BLIS_AXPYJS_H #define BLIS_AXPYJS_H // axpyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( 
bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), 
bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) #define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpyjs( a, x, y ) 
bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y ) #define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y ) #define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y ) #define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y ) #endif // end bli_axpyjs.h // begin bli_axmys.h #ifndef BLIS_AXMYS_H #define BLIS_AXMYS_H // axmys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), 
bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxmys( a, x, 
y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), 
bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxmys( 
a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); } 
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y ) #define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y ) #define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y ) #define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y ) #endif // end bli_axmys.h // begin bli_conjs.h #ifndef BLIS_CONJS_H #define BLIS_CONJS_H // conjs #define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) ) #define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) ) #define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) { (x) = conjf(x); } #define bli_zconjs( x ) { (x) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_conjs.h // begin bli_copys.h #ifndef BLIS_COPYS_H #define BLIS_COPYS_H // copys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Copy x into y for every (x, y) type pair. The real and imaginary parts of
// both operands are passed to the ?copyris kernel named after the type of y.

// y is s: scopyris.
#define bli_sscopys( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopys( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopys( x, y )  bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopys( x, y )  bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is d: dcopyris.
#define bli_sdcopys( x, y )  bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopys( x, y )  bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopys( x, y )  bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopys( x, y )  bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero.
#define bli_sccopys( x, y )  bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopys( x, y )  bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopys( x, y )  bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopys( x, y )  bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopys( x, y ) bli_sscopys( x, y ) #define bli_dcopys( x, y ) bli_ddcopys( x, y ) #define bli_ccopys( x, y ) bli_cccopys( x, y ) #define bli_zcopys( x, y ) bli_zzcopys( x, y ) #define bli_icopys( x, y ) bli_iicopys( x, y ) #endif // end bli_copys.h // begin bli_copyjs.h #ifndef BLIS_COPYJS_H #define BLIS_COPYJS_H // copyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), 
bli_creal(y), bli_cimag(y) ) #define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) { (y) = (x); } #define bli_dccopyjs( x, y ) { (y) = (x); } #define bli_cccopyjs( x, y ) { (y) = conjf(x); } #define bli_zccopyjs( x, y ) { (y) = conj (x); } #define bli_szcopyjs( x, y ) { (y) = (x); } #define bli_dzcopyjs( x, y ) { (y) = (x); } #define bli_czcopyjs( x, y ) { (y) = conjf(x); } #define bli_zzcopyjs( x, y ) { (y) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjs( x, y ) bli_sscopyjs( x, y ) #define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y ) #define bli_ccopyjs( x, y ) bli_cccopyjs( x, y ) #define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y ) #define bli_icopyjs( x, y ) bli_iicopyjs( x, y ) #endif // end bli_copyjs.h // begin bli_copycjs.h #ifndef BLIS_COPYCJS_H #define BLIS_COPYCJS_H // copycjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) { (y) = (x); } 
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); } #define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #define bli_szcopycjs( conjx, x, y ) { (y) = (x); } #define bli_dzcopycjs( conjx, x, y ) { (y) = (x); } #define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); } #define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y ) #define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y ) #define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y ) #define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y ) #define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y ) #endif // end bli_copycjs.h // begin bli_copynzs.h #ifndef BLIS_COPYNZS_H #define BLIS_COPYNZS_H // copynzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Copy x into y for every (x, y) type pair, using the same ?copyris kernels as
// copys except where x is real and y is complex (see the NOTEs below).

// y is s: scopyris.
#define bli_sscopynzs( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopynzs( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopynzs( x, y )  bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopynzs( x, y )  bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is d: dcopyris.
#define bli_sdcopynzs( x, y )  bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopynzs( x, y )  bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopynzs( x, y )  bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopynzs( x, y )  bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of scopyris() is so we don't touch the imaginary part of y.
#define bli_sccopynzs( x, y )  bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopynzs( x, y )  bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopynzs( x, y )  bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopynzs( x, y )  bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopynzs( x, y ) bli_sscopynzs( x, y ) #define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y ) #define bli_ccopynzs( x, y ) bli_cccopynzs( x, y ) #define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y ) #define bli_icopynzs( x, y ) bli_iicopynzs( x, y ) #endif // end bli_copynzs.h // begin bli_copyjnzs.h #ifndef BLIS_COPYJNZS_H #define BLIS_COPYJNZS_H // copyjnzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we // don't touch the imaginary part of y. 
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we // don't touch the imaginary part of y. #define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y ) #define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y ) #define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y ) #define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y ) #define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y ) #endif // end bli_copyjnzs.h // begin bli_dots.h #ifndef BLIS_DOTS_H #define BLIS_DOTS_H // dots // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a ) #define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a ) #define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a ) #define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a ) #define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a ) #define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a ) #define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a ) #define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a ) #define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a ) #define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a ) #define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a ) #define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a ) #define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a ) #define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a ) #define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a ) #define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a ) #define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a ) #define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a ) #define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a ) #define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a ) #define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a ) #define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a ) #define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a ) #define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a ) #define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a ) #define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a ) #define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a ) #define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a ) #define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a ) #define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a ) #define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a ) #define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a ) #define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a ) #define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a ) #define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a ) #define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a ) #define 
bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a ) #define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a ) #define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a ) #define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a ) #define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a ) #define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a ) #define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a ) #define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a ) #define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a ) #define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a ) #define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a ) #define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a ) #define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a ) #define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a ) #define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a ) #define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a ) #define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a ) #define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a ) #define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a ) #define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a ) #define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a ) #define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a ) #define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a ) #define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a ) #define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a ) #define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a ) #define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a ) #define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a ) #define bli_sdots( x, y, a ) bli_sssdots( x, y, a ) #define bli_ddots( x, y, a ) bli_ddddots( x, y, a ) #define bli_cdots( x, y, a ) bli_cccdots( x, y, a ) #define bli_zdots( x, y, a ) bli_zzzdots( x, y, a ) #endif // end bli_dots.h // begin bli_dotjs.h #ifndef BLIS_DOTJS_H #define BLIS_DOTJS_H // dotjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
// - x is used in conjugated form.
// Each three-character variant forwards to the axpyjs macro whose first two
// type characters are exchanged (e.g. dss -> sds), passing the operands as
// ( y, x, a ); the third character and the third operand are unchanged.
#define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a )
#define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a )
#define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a )
#define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a )
#define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a )
#define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a )
#define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a )
#define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a )
#define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a )
#define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a )
#define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a )
#define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a )
#define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a )
#define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a )
#define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a )
#define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a )
#define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a )
#define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a )
#define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a )
#define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a )
#define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a )
#define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a )
#define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a )
#define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a )
#define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a )
#define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a )
#define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a )
#define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a )
#define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a )
#define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a )
#define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a )
#define bli_zzddotjs( x, y, a ) bli_zzdaxpyjs( y, x, a )
#define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a )
#define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a )
#define bli_cscdotjs( x, y, a ) bli_sccaxpyjs( y, x, a )
#define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a )
#define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a )
#define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a )
#define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a )
#define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a )
#define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a )
#define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a )
#define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a )
#define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a )
#define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a )
#define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a )
#define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a )
#define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a )
#define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a )
#define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a )
#define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a )
#define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a )
#define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a )
#define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a )
#define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a )
#define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a )
#define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a )
#define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a )
#define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a )
#define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a )
#define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a )
#define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a )
#define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a )
#define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a )
// Single-character shorthands: all three operands share one type.
#define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a )
#define bli_ddotjs( x, y, a ) bli_ddddotjs( x, y, a )
#define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a )
#define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a )
#endif
// end bli_dotjs.h
// begin bli_eq.h
#ifndef BLIS_EQ_H
#define BLIS_EQ_H
// eq (passed by value)
// Each macro expands to a parenthesized expression comparing a and b with ==.
#define bli_seq( a, b ) ( (a) == (b) )
#define bli_deq( a, b ) ( (a) == (b) )
#ifndef BLIS_ENABLE_C99_COMPLEX
// Without C99 complex, real and imaginary parts are compared separately.
#define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) )
#define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_ceq( a, b ) ( (a) == (b) )
#define bli_zeq( a, b ) ( (a) == (b) )
#endif // BLIS_ENABLE_C99_COMPLEX
#define bli_ieq( a, b ) ( (a) == (b) )
// eqtori (passed by value)
// Compares a against the pair ( br, bi ). The s and d variants compare
// against br only and never use bi.
#define bli_seqtori( a, br, bi ) ( (a) == (br) )
#define bli_deqtori( a, br, bi ) ( (a) == (br) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) )
#define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex, a is compared against the value (br) + (bi) * (I).
#define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )
#define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )
#endif // BLIS_ENABLE_C99_COMPLEX
// eqa (passed by address)
// a and b are cast to a pointer of the variant's type and dereferenced
// before being handed to the by-value comparison.
#define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) )
#define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) )
#define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) )
#define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) )
#define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) )
// eq1
// Tests a against 1 (real part 1, imaginary part 0 for the eqtori variants).
#define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F )
#define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 )
#define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F )
#define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 )
#define bli_ieq1( a ) bli_ieq ( (a), 1 )
// eq0
// Tests a against 0.
#define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F )
#define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 )
#define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F )
#define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 )
#define bli_ieq0( a ) bli_ieq ( (a), 0 )
// eqm1
// Tests a against -1.
#define bli_seqm1( a ) bli_seqtori( (a), -1.0F, 0.0F )
#define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 )
#define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F )
#define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 )
#define bli_ieqm1( a ) bli_ieq ( (a), -1 )
#endif
// end bli_eq.h
// begin bli_fprints.h
#ifndef BLIS_FPRINTS_H
#define BLIS_FPRINTS_H
// prints
// Each macro expands to a brace-enclosed block (a statement, not an
// expression) that writes x to file with fprintf() using the format spec.
#define bli_sfprints( file, spec, x ) \
{ \
	fprintf( file, spec, (x) ); \
}
#define bli_dfprints( file, spec, x ) \
{ \
	fprintf( file, spec, (x) ); \
}
// Complex variants print the real part, the literal " + ", the imaginary
// part, and a trailing " "; spec is applied to each part separately.
#define bli_cfprints( file, spec, x ) \
{ \
	fprintf( file, spec, bli_creal(x) ); \
	fprintf( file, " + " ); \
	fprintf( file, spec, bli_cimag(x) ); \
	fprintf( file, " " ); \
}
#define bli_zfprints( file, spec, x ) \
{ \
	fprintf( file, spec, bli_zreal(x) ); \
	fprintf( file, " + " ); \
	fprintf( file, spec, bli_zimag(x) ); \
	fprintf( file, " " ); \
}
#define bli_ifprints( file, spec, x ) \
{ \
	fprintf( file, spec, (x) ); \
}
#endif
// end bli_fprints.h
// begin bli_inverts.h
#ifndef BLIS_INVERTS_H
#define BLIS_INVERTS_H
// inverts
// Notes:
// - The first char encodes the type of x.
// The s/d variants (and c/z without C99 complex) pass the real and imaginary
// components of x to the matching *invertris macro.
#define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) )
#define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) )
#define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex, x is overwritten in place with 1 / x.
#define bli_cinverts( x ) { (x) = 1.0F / (x); }
#define bli_zinverts( x ) { (x) = 1.0 / (x); }
#endif // BLIS_ENABLE_C99_COMPLEX
#endif
// end bli_inverts.h
// begin bli_invscals.h
#ifndef BLIS_INVSCALS_H
#define BLIS_INVSCALS_H
// invscals
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Real-y variants: the components of a and y are forwarded to the
// s/d *invscalris macro selected by the type of y.
#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
// Complex-y variants without C99 complex: real a (s/d) goes to the
// sc/dz helper, complex a (c/z) goes to the c/z helper.
#define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscals( a, y ) bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex, y is divided by a in place.
#define bli_scinvscals( a, y ) { (y) /= (a); }
#define bli_dcinvscals( a, y ) { (y) /= (a); }
#define bli_ccinvscals( a, y ) { (y) /= (a); }
#define bli_zcinvscals( a, y ) { (y) /= (a); }
#define bli_szinvscals( a, y ) { (y) /= (a); }
#define bli_dzinvscals( a, y ) { (y) /= (a); }
#define bli_czinvscals( a, y ) { (y) /= (a); }
#define bli_zzinvscals( a, y ) { (y) /= (a); }
#endif // BLIS_ENABLE_C99_COMPLEX
// Single-character shorthands: a and y share one type.
#define bli_sinvscals( a, y ) bli_ssinvscals( a, y )
#define bli_dinvscals( a, y ) bli_ddinvscals( a, y )
#define bli_cinvscals( a, y ) bli_ccinvscals( a, y )
#define bli_zinvscals( a, y ) bli_zzinvscals( a, y )
#endif
// end bli_invscals.h
// begin bli_invscaljs.h
#ifndef BLIS_INVSCALJS_H
#define BLIS_INVSCALJS_H
// invscaljs
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Same layout as invscals, but forwarding to the *invscaljris helpers.
#define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex, y is divided in place by a when a is real (s/d) and by
// conjf(a) / conj(a) when a is complex (c/z).
#define bli_scinvscaljs( a, y ) { (y) /= (a); }
#define bli_dcinvscaljs( a, y ) { (y) /= (a); }
#define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zcinvscaljs( a, y ) { (y) /= conj (a); }
#define bli_szinvscaljs( a, y ) { (y) /= (a); }
#define bli_dzinvscaljs( a, y ) { (y) /= (a); }
#define bli_czinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zzinvscaljs( a, y ) { (y) /= conj (a); }
#endif // BLIS_ENABLE_C99_COMPLEX
// Single-character shorthands: a and y share one type.
#define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y )
#define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y )
#define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y )
#define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y )
#endif
// end bli_invscaljs.h
// begin bli_neg2s.h
#ifndef BLIS_NEG2S_H
#define BLIS_NEG2S_H
// neg2s
// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) { (y) = -(x); } #define bli_dcneg2s( x, y ) { (y) = -(x); } #define bli_ccneg2s( x, y ) { (y) = -(x); } #define bli_zcneg2s( x, y ) { (y) = -(x); } #define bli_szneg2s( x, y ) { (y) = -(x); } #define bli_dzneg2s( x, y ) { (y) = -(x); } #define bli_czneg2s( x, y ) { (y) = -(x); } #define bli_zzneg2s( x, y ) { (y) = 
-(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sneg2s( x, y ) bli_ssneg2s( x, y ) #define bli_dneg2s( x, y ) bli_ddneg2s( x, y ) #define bli_cneg2s( x, y ) bli_ccneg2s( x, y ) #define bli_zneg2s( x, y ) bli_zzneg2s( x, y ) #endif // end bli_neg2s.h // begin bli_rands.h #ifndef BLIS_RANDS_H #define BLIS_RANDS_H // rands #define bli_srands( a ) \ { \ (a) = ( float ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0F; \ } #define bli_drands( a ) \ { \ (a) = ( double ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0; \ } #define bli_crands( a ) \ { \ float ar, ai; \ \ bli_srands( ar ); \ bli_srands( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrands( a ) \ { \ double ar, ai; \ \ bli_drands( ar ); \ bli_drands( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_rands.h // begin bli_randnp2s.h #ifndef BLIS_RANDNP2S_H #define BLIS_RANDNP2S_H // randnp2s #define bli_srandnp2s( a ) \ { \ bli_drandnp2s( a ); \ } #if 0 #define bli_drandnp2s_prev( a ) \ { \ const double m_max = 3.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ if ( t == m_max2 ) t = t - 1.0; \ \ \ t = floor( t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_exp, s_val; \ \ \ PASTEMAC(d,rands)( s_exp ); \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \ else r_val = pow( 2.0, t - 1.0 ); \ \ \ if ( s_val < 0.0 ) r_val = -r_val; \ } \ \ \ r_val = r_val / pow( 2.0, m_max ); \ \ \ \ a = r_val; \ } #endif #define bli_drandnp2s( a ) \ { \ const double m_max = 6.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ do \ { \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ t = floor( t ); \ } \ \ while ( m_max2 <= t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_val; \ \ \ r_val = pow( 2.0, -(t - 1.0) ); \ \ \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_val < 0.0 ) r_val 
= -r_val; \ } \ \ \ \ a = r_val; \ } #define bli_crandnp2s( a ) \ { \ float ar, ai; \ \ bli_srandnp2s( ar ); \ bli_srandnp2s( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrandnp2s( a ) \ { \ double ar, ai; \ \ bli_drandnp2s( ar ); \ bli_drandnp2s( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_randnp2s.h // begin bli_scals.h #ifndef BLIS_SCALS_H #define BLIS_SCALS_H // scals // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), 
bli_zimag(y) ) #define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) { (y) *= (a); } #define bli_dcscals( a, y ) { (y) *= (a); } #define bli_ccscals( a, y ) { (y) *= (a); } #define bli_zcscals( a, y ) { (y) *= (a); } #define bli_szscals( a, y ) { (y) *= (a); } #define bli_dzscals( a, y ) { (y) *= (a); } #define bli_czscals( a, y ) { (y) *= (a); } #define bli_zzscals( a, y ) { (y) *= (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscals( a, y ) bli_ssscals( a, y ) #define bli_dscals( a, y ) bli_ddscals( a, y ) #define bli_cscals( a, y ) bli_ccscals( a, y ) #define bli_zscals( a, y ) bli_zzscals( a, y ) #endif // end bli_scals.h // begin bli_scaljs.h #ifndef BLIS_SCALJS_H #define BLIS_SCALJS_H // scaljs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscaljs( a, y ) 
bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) { (y) *= (a); } #define bli_dcscaljs( a, y ) { (y) *= (a); } #define bli_ccscaljs( a, y ) { (y) *= conjf(a); } #define bli_zcscaljs( a, y ) { (y) *= conj (a); } #define bli_szscaljs( a, y ) { (y) *= (a); } #define bli_dzscaljs( a, y ) { (y) *= (a); } #define bli_czscaljs( a, y ) { (y) *= conjf(a); } #define bli_zzscaljs( a, y ) { (y) *= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscaljs( a, y ) bli_ssscaljs( a, y ) #define bli_dscaljs( a, y ) bli_ddscaljs( a, y ) #define bli_cscaljs( a, y ) bli_ccscaljs( a, y ) #define bli_zscaljs( a, y ) bli_zzscaljs( a, y ) #endif // end bli_scaljs.h // begin bli_scalcjs.h #ifndef BLIS_SCALCJS_H #define BLIS_SCALCJS_H // scalcjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// conjx is passed through unchanged as the first argument of the
// *scalcjris helper, followed by the components of x and y. The helper is
// selected by the type of y (and, for complex y, by whether x is real).
#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex, y is multiplied in place by x. For real x (s/d) conjx
// is ignored; for complex x (c/z) the conjugate of x is used when
// bli_is_conj( conjx ) is true.
#define bli_scscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }
#define bli_szscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }
#endif // BLIS_ENABLE_C99_COMPLEX
// Single-character shorthands: x and y share one type.
#define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y )
#define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y )
#define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y )
#define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y )
#endif
// end bli_scalcjs.h
// begin bli_scal2s.h
#ifndef BLIS_SCAL2S_H
#define BLIS_SCAL2S_H
// scal2s
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------
// For real y (s/d): when both a and x are complex the components go to
// bli_roscal2ris; every other combination goes to bli_rxscal2ris.
#define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
// -- (axy) = (??d) ------------------------------------------------------------
#define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#ifndef BLIS_ENABLE_C99_COMPLEX
// -- (axy) = (??c) ------------------------------------------------------------
// For complex y (c/z) without C99 complex, the helper is selected by which
// of a and x are complex: both real -> rx, only a complex -> rc,
// only x complex -> cr, both complex -> cx.
#define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sczscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// -- (axy) = (??c) ------------------------------------------------------------
// With C99 complex, every complex-y variant assigns the product a * x to y.
#define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); }
// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); }
#endif // BLIS_ENABLE_C99_COMPLEX
// Single-character shorthands: a, x and y share one type.
#define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y )
#define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y )
#define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y )
#define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y )
#endif
// end bli_scal2s.h
// begin bli_scal2js.h
#ifndef BLIS_SCAL2JS_H
#define BLIS_SCAL2JS_H
// scal2js
// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define 
bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) #define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) 
#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_dzcscal2js( a, x, y ) { (y) = (a) * 
conj(x); } #define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y ) #define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y ) #define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y ) #define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y ) #endif // end bli_scal2js.h // begin bli_set0s.h #ifndef BLIS_SET0S_H #define BLIS_SET0S_H #define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) ) #define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) ) #define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) ) #define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) ) #endif // end bli_set0s.h // begin bli_set1s.h #ifndef BLIS_SET1S_H #define BLIS_SET1S_H #define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) ) #define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) ) #define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) ) #define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) ) 
#endif // end bli_set1s.h // begin bli_seti0s.h #ifndef BLIS_SETI0S_H #define BLIS_SETI0S_H #define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) ) #define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) ) #define bli_cseti0s( a ) bli_csetis( 0.0F, (a) ) #define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) ) #endif // end bli_seti0s.h // begin bli_sqrt2s.h #ifndef BLIS_SQRT2S_H #define BLIS_SQRT2S_H // sqrt2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) ) #define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) ) #define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) ) #define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) ) #define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) ) #define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) ) #define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) ) #define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) ) #define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), 
bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; } #define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; } #define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); } #define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); } #define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; } #define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; } #define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); } #define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); } #define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; } #define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; } #define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; } #define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; } #define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; } #define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; } #define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; } #define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a ) #define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a ) #define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a ) #define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a ) #endif // end bli_sqrt2s.h // begin bli_subs.h #ifndef BLIS_SUBS_H #define BLIS_SUBS_H // subs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. 
#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) { (y) -= (a); } #define bli_dcsubs( a, y ) { (y) -= (a); } #define bli_ccsubs( a, y ) { (y) -= (a); } #define bli_zcsubs( a, y ) { (y) -= (a); } #define bli_szsubs( a, y ) { (y) -= (a); } #define bli_dzsubs( a, y ) { (y) -= (a); } #define bli_czsubs( a, y ) { (y) -= (a); } #define bli_zzsubs( a, y ) { (y) -= (a); } #endif // 
BLIS_ENABLE_C99_COMPLEX #define bli_ssubs( a, y ) bli_sssubs( a, y ) #define bli_dsubs( a, y ) bli_ddsubs( a, y ) #define bli_csubs( a, y ) bli_ccsubs( a, y ) #define bli_zsubs( a, y ) bli_zzsubs( a, y ) #endif // end bli_subs.h // begin bli_subjs.h #ifndef BLIS_SUBJS_H #define BLIS_SUBJS_H // subjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubjs( a, y ) bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), 
bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) { (y) -= (a); } #define bli_dcsubjs( a, y ) { (y) -= (a); } #define bli_ccsubjs( a, y ) { (y) -= conjf(a); } #define bli_zcsubjs( a, y ) { (y) -= conj (a); } #define bli_szsubjs( a, y ) { (y) -= (a); } #define bli_dzsubjs( a, y ) { (y) -= (a); } #define bli_czsubjs( a, y ) { (y) -= conjf(a); } #define bli_zzsubjs( a, y ) { (y) -= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssubjs( a, y ) bli_sssubjs( a, y ) #define bli_dsubjs( a, y ) bli_ddsubjs( a, y ) #define bli_csubjs( a, y ) bli_ccsubjs( a, y ) #define bli_zsubjs( a, y ) bli_zzsubjs( a, y ) #endif // end bli_subjs.h // begin bli_swaps.h #ifndef BLIS_SWAPS_H #define BLIS_SWAPS_H // swaps // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_ssswaps( x, y ) \ { \ float w; \ bli_sscopys( (y), (w) ); \ bli_sscopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dsswaps( x, y ) \ { \ double w; \ bli_sdcopys( (y), (w) ); \ bli_dscopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_csswaps( x, y ) \ { \ scomplex w; \ bli_sccopys( (y), (w) ); \ bli_cscopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zsswaps( x, y ) \ { \ dcomplex w; \ bli_szcopys( (y), (w) ); \ bli_zscopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sdswaps( x, y ) \ { \ float w; \ bli_dscopys( (y), (w) ); \ bli_sdcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_ddswaps( x, y ) \ { \ double w; \ bli_ddcopys( (y), (w) ); \ bli_ddcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_cdswaps( x, y ) \ { \ scomplex w; \ bli_dccopys( (y), (w) ); \ bli_cdcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zdswaps( x, y ) \ { \ dcomplex w; \ bli_dzcopys( (y), (w) ); \ bli_zdcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_scswaps( x, y ) \ { \ float w; \ bli_cscopys( (y), (w) ); \ bli_sccopys( (x), (y) ); \ bli_sscopys( (w), 
(x) ); \ } #define bli_dcswaps( x, y ) \ { \ double w; \ bli_cdcopys( (y), (w) ); \ bli_dccopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_ccswaps( x, y ) \ { \ scomplex w; \ bli_cccopys( (y), (w) ); \ bli_cccopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zcswaps( x, y ) \ { \ dcomplex w; \ bli_czcopys( (y), (w) ); \ bli_zccopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_szswaps( x, y ) \ { \ float w; \ bli_zscopys( (y), (w) ); \ bli_szcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dzswaps( x, y ) \ { \ double w; \ bli_zdcopys( (y), (w) ); \ bli_dzcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_czswaps( x, y ) \ { \ scomplex w; \ bli_zccopys( (y), (w) ); \ bli_czcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zzswaps( x, y ) \ { \ dcomplex w; \ bli_zzcopys( (y), (w) ); \ bli_zzcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sswaps( x, y ) bli_ssswaps( x, y ) #define bli_dswaps( x, y ) bli_ddswaps( x, y ) #define bli_cswaps( x, y ) bli_ccswaps( x, y ) #define bli_zswaps( x, y ) bli_zzswaps( x, y ) #endif // end bli_swaps.h // begin bli_xpbys.h #ifndef BLIS_XPBYS_H #define BLIS_XPBYS_H // xpbys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// xpbys with a real destination y (float in the ??s group, double in ??d).
// Every variant in these two groups splits x, b and y into real/imaginary
// components with the bli_?real()/bli_?imag() accessors matching each
// operand's type character and forwards all six to bli_rxxpbyris.
// NOTE(review): bli_rxxpbyris is defined elsewhere; from the operation name
// it presumably computes y = x + b * y on the real parts -- confirm.

// -- (xby) = (??s) ------------------------------------------------------------

#define bli_sssxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dssxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cssxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zssxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_scsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sddxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dddxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cddxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zddxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_scdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_szdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbys( x, b, y ) 
bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbys( x, b, y ) { 
(y) = (x) + (b) * (y); } #define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y ) #define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y ) #define bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y ) #define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y ) #endif // end bli_xpbys.h // begin bli_xpbyjs.h #ifndef BLIS_XPBYJS_H #define BLIS_XPBYJS_H // xpbyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; 
++ii ) bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); 
} else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( 
dim_t jj = 0; jj < n; ++jj ) bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } 
else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( 
dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end bli_copys_mxn.h // begin bli_scal2s_mxn.h #ifndef BLIS_SCAL2S_MXN_H #define BLIS_SCAL2S_MXN_H // scal2s_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \ ctype* restrict y, const inc_t rs_y, const inc_t cs_y \ ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( scal2s_mxn ) #endif // end bli_scal2s_mxn.h // begin bli_xpbys_mxn.h #ifndef BLIS_XPBYS_MXN_H #define BLIS_XPBYS_MXN_H // xpbys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (?ss) ------------------------------------------------------------ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?dd) ------------------------------------------------------------ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?cc) ------------------------------------------------------------ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?zz) ------------------------------------------------------------ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } #endif // end bli_xpbys_mxn.h // begin bli_xpbys_mxn_uplo.h #ifndef BLIS_XPBYS_MXN_UPLO_H #define BLIS_XPBYS_MXN_UPLO_H // xpbys_mxn_u #define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 
0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } // xpbys_mxn_l #define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } 
// bli_cccxpbys_mxn_l / bli_zzzxpbys_mxn_l:
// Walk an m x n region column by column and update only the elements whose
// (column - row) offset is <= diagoff. If *beta tests as zero (bli_?eq0) the
// element of x is copied into y (bli_??copys); otherwise bli_???xpbys is
// applied with x, *beta and y (presumably y := x + beta * y; the helper is
// defined outside this section).
// NOTE: these are statement macros. The locals _i and _j are declared inside
// the expansion, and the arguments are pasted textually (beta is dereferenced
// as "*beta" in the zero test), so pass simple identifiers.
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
    dim_t _i, _j; \
\
    /* beta == 0: overwrite y with x instead of scaling y. */ \
    if ( bli_ceq0( *beta ) ) \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_cccopys( *(x + _i*rs_x + _j*cs_x), \
                         *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
    else \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \
                          *(beta), \
                          *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
}

#define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
    dim_t _i, _j; \
\
    /* beta == 0: overwrite y with x instead of scaling y. */ \
    if ( bli_zeq0( *beta ) ) \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \
                         *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
    else \
    { \
        for ( _j = 0; _j < n; ++_j ) \
        for ( _i = 0; _i < m; ++_i ) \
        if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
        { \
            bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \
                          *(beta), \
                          *(y + _i*rs_y + _j*cs_y) ); \
        } \
    } \
}

// Single-letter convenience names: each forwards, argument for argument, to
// the variant in which x, beta and y all have the same type.
#define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
    bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}

#endif
// end bli_xpbys_mxn_uplo.h

// -- "broadcast B" scalar macros --

// begin bli_bcastbbs_mxn.h


#ifndef BLIS_BCASTBBS_MXN_H
#define BLIS_BCASTBBS_MXN_H

// bcastbbs_mxn
//
// Function template, instantiated by INSERT_GENTFUNC_BASIC0 below (the
// instantiation macro is defined elsewhere). For every element y(i,j) of an
// m x n region it copies that element into the d-1 unit-stride slots that
// immediately follow it.

#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t       m, \
       const dim_t       n, \
       ctype*   restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* The number of copies per element, d, is taken from ldy here. */ \
    const dim_t d    = ldy; \
    const dim_t ds_y = 1; \
\
    for ( dim_t i = 0; i < m; ++i ) \
    { \
        ctype* restrict yi = y + i*incy; \
\
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict yij = yi + j*ldy; \
\
            for ( dim_t p = 1; p < d; ++p ) \
            { \
                ctype* restrict yijd = yij + p*ds_y; \
\
                PASTEMAC(ch,copys)( *yij, *yijd ); \
            } \
        } \
    } \
}

INSERT_GENTFUNC_BASIC0( bcastbbs_mxn )

#endif
// end bli_bcastbbs_mxn.h
// begin bli_scal2bbs_mxn.h


#ifndef BLIS_SCAL2BBS_MXN_H
#define BLIS_SCAL2BBS_MXN_H

// scal2bbs_mxn
//
// Real-domain template. For every element of the m x n region it applies
// scal2s (or scal2js when conjx requests conjugation) with *alpha and x(i,j),
// storing into y(i,j), then copies y(i,j) into the d-1 slots that follow it.
// scal2s/scal2js are defined elsewhere; presumably y := alpha * x and
// y := alpha * conj(x).

#undef  GENTFUNCRO
#define GENTFUNCRO( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t       conjx, \
       const dim_t        m, \
       const dim_t        n, \
       ctype*    restrict alpha, \
       ctype*    restrict x, const inc_t incx, const inc_t ldx, \
       ctype*    restrict y, const inc_t incy, const inc_t ldy  \
     ) \
{ \
    /* The number of copies per element, d, is taken from incy here. */ \
    const dim_t d    = incy; \
    const dim_t ds_y = 1; \
\
    if ( bli_is_conj( conjx ) ) \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict xj = x + j*ldx; \
            ctype* restrict yj = y + j*ldy; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype* restrict xij = xj + i*incx; \
                ctype* restrict yij = yj + i*incy; \
\
                PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \
\
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype* restrict yijd = yij + p*ds_y; \
\
                    PASTEMAC(ch,copys)( *yij, *yijd ); \
                } \
            } \
        } \
    } \
    else \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype* restrict xj = x + j*ldx; \
            ctype* restrict yj = y + j*ldy; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype* restrict xij = xj + i*incx; \
                ctype* restrict yij = yj + i*incy; \
\
                PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \
\
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype* restrict yijd = yij + p*ds_y; \
\
                    PASTEMAC(ch,copys)( *yij, *yijd ); \
                } \
            } \
        } \
    } \
}

INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn )


// Complex-domain template. alpha, x and y are re-read through pointers to the
// real type ctype_r, so all strides are doubled. For x the imaginary part is
// the real slot + 1; for y the imaginary part is placed d real slots after
// the real part (psi_i = (ctype_r*)y + 1*d). After each scal2ris/scal2jris
// the real and imaginary results are each copied into the d-1 slots that
// follow them.

#undef  GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t       conjx, \
       const dim_t        m, \
       const dim_t        n, \
       ctype*    restrict alpha, \
       ctype*    restrict x, const inc_t incx, const inc_t ldx, \
       ctype*    restrict y, const inc_t incy, const inc_t ldy  \
     ) \
{ \
    /* The number of copies per element, d, is taken from incy here. */ \
    const dim_t       d          = incy; \
    const dim_t       ds_y       = 1; \
\
    const inc_t       incx2      = 2 * incx; \
    const inc_t       ldx2       = 2 * ldx; \
\
    const inc_t       incy2      = 2 * incy; \
    const inc_t       ldy2       = 2 * ldy; \
\
    ctype_r* restrict alpha_r    = ( ctype_r* )alpha; \
    ctype_r* restrict alpha_i    = ( ctype_r* )alpha + 1; \
    ctype_r* restrict chi_r      = ( ctype_r* )x; \
    ctype_r* restrict chi_i      = ( ctype_r* )x + 1; \
    ctype_r* restrict psi_r      = ( ctype_r* )y; \
    ctype_r* restrict psi_i      = ( ctype_r* )y + 1*d; \
\
    if ( bli_is_conj( conjx ) ) \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype_r* restrict chij_r = chi_r + j*ldx2; \
            ctype_r* restrict chij_i = chi_i + j*ldx2; \
            ctype_r* restrict psij_r = psi_r + j*ldy2; \
            ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype_r* restrict chiij_r = chij_r + i*incx2; \
                ctype_r* restrict chiij_i = chij_i + i*incx2; \
                ctype_r* restrict psiij_r = psij_r + i*incy2; \
                ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
                PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \
                                        *chiij_r, *chiij_i, \
                                        *psiij_r, *psiij_i ); \
\
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
                    ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
                    PASTEMAC(ch,copyris)( *psiij_r,  *psiij_i, \
                                          *psiijd_r, *psiijd_i ); \
                } \
            } \
        } \
    } \
    else \
    { \
        for ( dim_t j = 0; j < n; ++j ) \
        { \
            ctype_r* restrict chij_r = chi_r + j*ldx2; \
            ctype_r* restrict chij_i = chi_i + j*ldx2; \
            ctype_r* restrict psij_r = psi_r + j*ldy2; \
            ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
            for ( dim_t i = 0; i < m; ++i ) \
            { \
                ctype_r* restrict chiij_r = chij_r + i*incx2; \
                ctype_r* restrict chiij_i = chij_i + i*incx2; \
                ctype_r* restrict psiij_r = psij_r + i*incy2; \
                ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
                PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \
                                       *chiij_r, *chiij_i, \
                                       *psiij_r, *psiij_i ); \
\
                for ( dim_t p = 1; p < d; ++p ) \
                { \
                    ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
                    ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
                    PASTEMAC(ch,copyris)( *psiij_r,  *psiij_i, \
                                          *psiijd_r, *psiijd_i ); \
                } \
            } \
        } \
    } \
}

INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn )

#endif
// end bli_scal2bbs_mxn.h
// begin bli_set0bbs_mxn.h


#ifndef BLIS_SET0BBS_MXN_H
#define BLIS_SET0BBS_MXN_H

// set0bbs_mxn
//
// Function template. For every element y(i,j) of the m x n region it applies
// set0s to the element itself and to the d-1 slots that follow it (the inner
// loop starts at p = 0, unlike the copy loops above which start at p = 1).

#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t       m, \
       const dim_t       n, \
       ctype*   restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
    /* The number of slots per element, d, is taken from incy here. */ \
    const dim_t d    = incy; \
    const dim_t ds_y = 1; \
\
    for ( dim_t j = 0; j < n; ++j ) \
    { \
        ctype* restrict yj = y + j*ldy; \
\
        for ( dim_t i = 0; i < m; ++i ) \
        { \
            ctype* restrict yij = yj + i*incy; \
\
            for ( dim_t p = 0; p < d; ++p ) \
            { \
                ctype* restrict yijd = yij + p*ds_y; \
\
                PASTEMAC(ch,set0s)( *yijd ); \
            } \
        } \
    } \
}

INSERT_GENTFUNC_BASIC0( set0bbs_mxn )

#endif
// end bli_set0bbs_mxn.h


// -- 1m-specific scalar macros --

// 1e
// begin bli_copy1es.h


#ifndef BLIS_COPY1ES_H
#define BLIS_COPY1ES_H

// copy1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - Here "x" is the macro parameter a, and "y" is the pair (bri, bir).
// - Every variant with a real source or a real destination is an empty block.
// - The four complex-to-complex variants store a twice, assuming
//   bli_??copyris( ar, ai, br, bi ) assigns (ar, ai) to (br, bi); that helper
//   is defined outside this section:
//     bri receives (  real(a), imag(a) )
//     bir receives ( -imag(a), real(a) )

#define bli_sscopy1es( a, bri, bir ) {}
#define bli_dscopy1es( a, bri, bir ) {}
#define bli_cscopy1es( a, bri, bir ) {}
#define bli_zscopy1es( a, bri, bir ) {}

#define bli_sdcopy1es( a, bri, bir ) {}
#define bli_ddcopy1es( a, bri, bir ) {}
#define bli_cdcopy1es( a, bri, bir ) {}
#define bli_zdcopy1es( a, bri, bir ) {}

#define bli_sccopy1es( a, bri, bir ) {}
#define bli_dccopy1es( a, bri, bir ) {}
#define bli_cccopy1es( a, bri, bir ) \
{ \
    bli_cccopyris(  bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopy1es( a, bri, bir ) \
{ \
    bli_zccopyris(  bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopy1es( a, bri, bir ) {}
#define bli_dzcopy1es( a, bri, bir ) {}
#define bli_czcopy1es( a, bri, bir ) \
{ \
    bli_czcopyris(  bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopy1es( a, bri, bir ) \
{ \
    bli_zzcopyris(  bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Same-type shorthands.
#define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir )
#define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir )

#endif

// end bli_copy1es.h
// begin bli_copyj1es.h


#ifndef BLIS_COPYJ1ES_H
#define BLIS_COPYJ1ES_H

// copyj1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - Here "x" is the macro parameter a, and "y" is the pair (bri, bir).
// - Every variant with a real source or a real destination is an empty block.
// - The four complex-to-complex variants differ from the copy1es macros only
//   in the sign applied to imag(a). Assuming bli_??copyris( ar, ai, br, bi )
//   assigns (ar, ai) to (br, bi) (helper defined outside this section):
//     bri receives ( real(a), -imag(a) )
//     bir receives ( imag(a),  real(a) )

#define bli_sscopyj1es( a, bri, bir ) {}
#define bli_dscopyj1es( a, bri, bir ) {}
#define bli_cscopyj1es( a, bri, bir ) {}
#define bli_zscopyj1es( a, bri, bir ) {}

#define bli_sdcopyj1es( a, bri, bir ) {}
#define bli_ddcopyj1es( a, bri, bir ) {}
#define bli_cdcopyj1es( a, bri, bir ) {}
#define bli_zdcopyj1es( a, bri, bir ) {}

#define bli_sccopyj1es( a, bri, bir ) {}
#define bli_dccopyj1es( a, bri, bir ) {}
#define bli_cccopyj1es( a, bri, bir ) \
{ \
    bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_cccopyris( bli_cimag(a),  bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopyj1es( a, bri, bir ) \
{ \
    bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
    bli_zccopyris( bli_zimag(a),  bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

#define bli_szcopyj1es( a, bri, bir ) {}
#define bli_dzcopyj1es( a, bri, bir ) {}
#define bli_czcopyj1es( a, bri, bir ) \
{ \
    bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_czcopyris( bli_cimag(a),  bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopyj1es( a, bri, bir ) \
{ \
    bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
    bli_zzcopyris( bli_zimag(a),  bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Same-type shorthands.
#define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir )
#define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir )

#endif

// end bli_copyj1es.h
// begin bli_invert1es.h


#ifndef BLIS_INVERT1ES_H
#define BLIS_INVERT1ES_H

// invert1es
//
// Updates bri in place through bli_?invertris (defined elsewhere), then
// refreshes bir from the NEW bri, so the two statements must stay in this
// order. The copy passes its destinations as ( imag(bir), real(bir) ), i.e.
// swapped; assuming copyris assigns its first two arguments to its last two,
// bir ends up as ( -imag(bri), real(bri) ).

#define bli_cinvert1es( bri, bir ) \
{ \
    bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \
    bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \
}

#define bli_zinvert1es( bri, bir ) \
{ \
    bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \
    bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \
}

#endif

// end bli_invert1es.h
// begin bli_scal1es.h


#ifndef BLIS_SCAL1ES_H
#define BLIS_SCAL1ES_H // scal1es #define bli_cscal1es( a, yri, yir ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \ bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \ } #define bli_zscal1es( a, yri, yir ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \ bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \ } #endif // end bli_scal1es.h // begin bli_scal21es.h #ifndef BLIS_SCAL21ES_H #define BLIS_SCAL21ES_H // scal21es // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal21es( a, x, yri, yir ) {} #define bli_sdsscal21es( a, x, yri, yir ) {} #define bli_scsscal21es( a, x, yri, yir ) {} #define bli_szsscal21es( a, x, yri, yir ) {} #define bli_dssscal21es( a, x, yri, yir ) {} #define bli_ddsscal21es( a, x, yri, yir ) {} #define bli_dcsscal21es( a, x, yri, yir ) {} #define bli_dzsscal21es( a, x, yri, yir ) {} #define bli_cssscal21es( a, x, yri, yir ) {} #define bli_cdsscal21es( a, x, yri, yir ) {} #define bli_ccsscal21es( a, x, yri, yir ) {} #define bli_czsscal21es( a, x, yri, yir ) {} #define bli_zssscal21es( a, x, yri, yir ) {} #define bli_zdsscal21es( a, x, yri, yir ) {} #define bli_zcsscal21es( a, x, yri, yir ) {} #define bli_zzsscal21es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal21es( a, x, yri, yir ) {} #define bli_sddscal21es( a, x, yri, yir ) {} #define bli_scdscal21es( a, x, yri, yir ) {} #define bli_szdscal21es( a, x, yri, yir ) {} #define bli_dsdscal21es( a, x, yri, yir ) {} #define bli_dddscal21es( a, x, yri, yir ) {} #define bli_dcdscal21es( a, x, yri, yir ) {} #define bli_dzdscal21es( a, x, yri, yir ) {} #define bli_csdscal21es( a, x, yri, yir ) {} #define 
bli_cddscal21es( a, x, yri, yir ) {} #define bli_ccdscal21es( a, x, yri, yir ) {} #define bli_czdscal21es( a, x, yri, yir ) {} #define bli_zsdscal21es( a, x, yri, yir ) {} #define bli_zddscal21es( a, x, yri, yir ) {} #define bli_zcdscal21es( a, x, yri, yir ) {} #define bli_zzdscal21es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal21es( a, x, yri, yir ) {} #define bli_sdcscal21es( a, x, yri, yir ) {} #define bli_sccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal21es( a, x, yri, yir ) {} #define bli_ddcscal21es( a, x, yri, yir ) {} #define bli_dccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), 
bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal21es( a, x, yri, yir ) {} #define bli_sdzscal21es( a, x, yri, yir ) {} #define bli_sczscal21es( a, x, yri, yir ) \ { \ 
bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal21es( a, x, yri, yir ) {} #define bli_ddzscal21es( a, x, yri, yir ) {} #define bli_dczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), 
bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir ) #define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir ) #endif // end bli_scal21es.h // begin bli_scal2j1es.h #ifndef BLIS_SCAL2J1ES_H #define BLIS_SCAL2J1ES_H // scal2j1es // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) #define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) #endif // end bli_scal2j1es.h // 1r // begin bli_copy1rs.h #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copy1rs.h // begin bli_copyj1rs.h #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copyj1rs.h // begin bli_invert1rs.h #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif // end bli_invert1rs.h // begin bli_scal1rs.h #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), 
bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif // end bli_scal1rs.h // begin bli_scal21rs.h #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif // end bli_scal21rs.h // begin bli_scal2j1rs.h #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif // end bli_scal2j1rs.h // 1m (1e or 1r) // begin bli_invert1ms_mxn_diag.h #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t 
i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_invert1ms_mxn_diag.h // begin bli_scal1ms_mxn.h #ifndef BLIS_SCAL1MS_MXN_H #define BLIS_SCAL1MS_MXN_H // scal1ms_mxn #define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; 
\ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #endif // end bli_scal1ms_mxn.h // begin bli_scal21ms_mxn.h #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } 
else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), 
*(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif // end bli_scal21ms_mxn.h // begin bli_scal21ms_mxn_diag.h #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_scal21ms_mxn_diag.h // begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING //thread communicators may be implementation dependent #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_single.h // begin bli_thrcomm_openmp.h #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H // Define thrcomm_t for situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #include // skipped // Define thrcomm_t for tree barriers and non-tree barriers. #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. 
//volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif // end bli_thrcomm_openmp.h // begin bli_thrcomm_pthreads.h #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H // Define thrcomm_t for situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS #ifdef BLIS_USE_PTHREAD_BARRIER struct thrcomm_s { void* sent_object; dim_t n_threads; bli_pthread_barrier_t barrier; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_pthreads.h // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. 
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ); void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ); void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ); void bli_thrcomm_cleanup( thrcomm_t* comm ); BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm ); BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm ); void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm ); #endif // end bli_thrcomm.h // Include thread info (thrinfo_t) object definitions and prototypes. // begin bli_thrinfo.h #ifndef BLIS_THRINFO_H #define BLIS_THRINFO_H // Thread info structure definition struct thrinfo_s { // The thread communicator for the other threads sharing the same work // at this level. thrcomm_t* ocomm; // Our thread id within the ocomm thread communicator. dim_t ocomm_id; // The number of distinct threads used to parallelize the loop. dim_t n_way; // What we're working on. dim_t work_id; // When freeing, should the communicators in this node be freed? Usually, // this is field is true, but when nodes are created that share the same // communicators as other nodes (such as with packm nodes), this is set // to false. bool free_comm; // The bszid_t to help identify the node. This is mostly only useful when // debugging or tracing the allocation and release of thrinfo_t nodes. bszid_t bszid; struct thrinfo_s* sub_prenode; struct thrinfo_s* sub_node; }; typedef struct thrinfo_s thrinfo_t; // // thrinfo_t functions // NOTE: The naming of these should be made consistent at some point. // (ie: bli_thrinfo_ vs. 
bli_thread_) // // thrinfo_t query (field only) BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) { return (t->ocomm)->n_threads; } BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) { return t->ocomm_id; } BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) { return t->n_way; } BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t ) { return t->work_id; } BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) { return t->ocomm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) { return t->free_comm; } BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) { return t->bszid; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) { return t->sub_node; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) { return t->ocomm_id == 0; } // thrinfo_t modification BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) { t->ocomm = ocomm; } BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) { t->ocomm_id = ocomm_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) { t->n_way = n_way; } BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t ) { t->work_id = work_id; } BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) { t->free_comm = free_comm; } BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) { t->bszid = bszid; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) { t->sub_node = sub_node; } BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) { t->sub_prenode = sub_prenode; } // other thrinfo_t-related functions BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } // // Prototypes 
for level-3 thrinfo functions not specific to any operation. // thrinfo_t* bli_thrinfo_create ( rntm_t* rntm, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node ); void bli_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node ); void bli_thrinfo_init_single ( thrinfo_t* thread ); void bli_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_thrinfo_grow ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); thrinfo_t* bli_thrinfo_rgrow ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_cur, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_create_for_cntl ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_chl, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_rgrow_prenode ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_cur, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_create_for_cntl_prenode ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_chl, thrinfo_t* thread_par ); // ----------------------------------------------------------------------------- #if 0 void bli_thrinfo_grow_tree ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); void bli_thrinfo_grow_tree_ic ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); #endif #endif // end bli_thrinfo.h // begin bli_thrinfo_sup.h #ifndef BLIS_THRINFO_SUP_H #define BLIS_THRINFO_SUP_H // // Prototypes for level-3 thrinfo sup functions. // void bli_thrinfo_sup_grow ( rntm_t* rntm, bszid_t* bszid_par, thrinfo_t* thread ); thrinfo_t* bli_thrinfo_sup_rgrow ( rntm_t* rntm, bszid_t* bszid_par, bszid_t* bszid_cur, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_sup_create_for_cntl ( rntm_t* rntm, bszid_t* bszid_par, bszid_t* bszid_chl, thrinfo_t* thread_par ); #endif // end bli_thrinfo_sup.h // Include some operation-specific thrinfo_t prototypes. 
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
// Takes the level-3 function to run (func, of type l3int_t), the operation
// family, and the operands/context/runtime/control-tree arguments listed
// below. Only the signature is visible in this header.
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// Include definitions specific to the method of multithreading for the
// conventional code path.
// begin bli_l3_decor_single.h

#ifndef BLIS_L3_DECOR_SINGLE_H
#define BLIS_L3_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (The conditional below is intentionally empty: it declares nothing.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif

// end bli_l3_decor_single.h
// begin bli_l3_decor_openmp.h

#ifndef BLIS_L3_DECOR_OPENMP_H
#define BLIS_L3_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

void bli_l3_thread_decorator_thread_check
     (
       dim_t      n_threads,
       dim_t      tid,
       thrcomm_t* gl_comm,
       rntm_t*    rntm
     );

#endif

#endif

// end bli_l3_decor_openmp.h
// begin bli_l3_decor_pthreads.h

#ifndef BLIS_L3_DECOR_PTHREADS_H
#define BLIS_L3_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
// The void* -> void* shape matches the start_routine parameter type used by
// thread-creation APIs such as pthread_create().
void* bli_l3_thread_entry( void* data_void );

#endif

#endif

// end bli_l3_decor_pthreads.h

#endif

// end bli_l3_decor.h

// Include the level-3 thread decorator and related definitions and prototypes
// for the sup code path.
// begin bli_l3_sup_decor.h

#ifndef BLIS_L3_SUP_DECOR_H
#define BLIS_L3_SUP_DECOR_H

// -- sup definitions ----------------------------------------------------------

// Level-3 sup internal function type.
// Unlike the conventional l3int_t, this type returns err_t and takes no
// cntl_t* argument.
typedef err_t (*l3supint_t)
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// Level-3 sup thread decorator prototype.
err_t bli_l3_sup_thread_decorator
     (
       l3supint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     );

// Include definitions specific to the method of multithreading for the
// sup code path.
// begin bli_l3_sup_decor_single.h

#ifndef BLIS_L3_SUP_DECOR_SINGLE_H
#define BLIS_L3_SUP_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (Intentionally empty.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif

// end bli_l3_sup_decor_single.h
// begin bli_l3_sup_decor_openmp.h

#ifndef BLIS_L3_SUP_DECOR_OPENMP_H
#define BLIS_L3_SUP_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
// (Intentionally empty.)
#ifdef BLIS_ENABLE_OPENMP

#endif

#endif

// end bli_l3_sup_decor_openmp.h
// begin bli_l3_sup_decor_pthreads.h

#ifndef BLIS_L3_SUP_DECOR_PTHREADS_H
#define BLIS_L3_SUP_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
void* bli_l3_sup_thread_entry( void* data_void );

#endif

#endif

// end bli_l3_sup_decor_pthreads.h

#endif

// end bli_l3_sup_decor.h

// Initialization-related prototypes.
void bli_thread_init( void );
void bli_thread_finalize( void );

// Thread range-related prototypes.
// Writes a range into *start and *end for the given thread; the inline
// bli_thread_range_jrir_sl() in this header is one caller.
BLIS_EXPORT_BLIS
void bli_thread_range_sub
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end
     );

// GENPROT is a scratch macro: it is (re)defined to stamp out one prototype
// per operation name and then redefined for the next family of signatures.
#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       dir_t      direct, \
       thrinfo_t* thr, \
       obj_t*     a, \
       obj_t*     b, \
       obj_t*     c, \
       cntl_t*    cntl, \
       cntx_t*    cntx, \
       dim_t*     start, \
       dim_t*     end \
     );

GENPROT( thread_range_mdim )
GENPROT( thread_range_ndim )

#undef  GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       thrinfo_t* thr, \
       obj_t*     a, \
       blksz_t*   bmult, \
       dim_t*     start, \
       dim_t*     end \
     );

GENPROT( thread_range_l2r )
GENPROT( thread_range_r2l )
GENPROT( thread_range_t2b )
GENPROT( thread_range_b2t )

GENPROT( thread_range_weighted_l2r )
GENPROT( thread_range_weighted_r2l )
GENPROT( thread_range_weighted_t2b )
GENPROT( thread_range_weighted_b2t )

dim_t bli_thread_range_width_l
     (
       doff_t diagoff_j,
       dim_t  m,
       dim_t  n_j,
       dim_t  j,
       dim_t  n_way,
       dim_t  bf,
       dim_t  bf_left,
       double area_per_thr,
       bool   handle_edge_low
     );

siz_t bli_find_area_trap_l
     (
       dim_t  m,
       dim_t  n,
       doff_t diagoff
     );

siz_t bli_thread_range_weighted_sub
     (
       thrinfo_t* restrict thread,
       doff_t              diagoff,
       uplo_t              uplo,
       dim_t               m,
       dim_t               n,
       dim_t               bf,
       bool                handle_edge_low,
       dim_t*     restrict j_start_thr,
       dim_t*     restrict j_end_thr
     );

// -----------------------------------------------------------------------------

// Factorization and partitioning prototypes

// State object passed to bli_prime_factorization() and
// bli_next_prime_factor().
// NOTE(review): the meaning of the individual fields is not visible in this
// header -- see the implementation before relying on them.
typedef struct
{
	dim_t n;
	dim_t sqrt_n;
	dim_t f;
} bli_prime_factors_t;

void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors);

dim_t bli_next_prime_factor(bli_prime_factors_t* factors);
bool  bli_is_prime( dim_t n );

// The three partition_2x2 variants share one signature: results are written
// through nt1 and nt2.
void bli_thread_partition_2x2
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

void bli_thread_partition_2x2_slow
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

void bli_thread_partition_2x2_fast
     (
       dim_t           n_thread,
       dim_t           work1,
       dim_t           work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

// -----------------------------------------------------------------------------

dim_t bli_gcd( dim_t x, dim_t y );
dim_t bli_lcm( dim_t x, dim_t y );
dim_t bli_ipow( dim_t base, dim_t power );

// -----------------------------------------------------------------------------

// Exported getters/setters for the jc/pc/ic/jr/ir thread counts and the
// overall thread count.
BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void );

BLIS_EXPORT_BLIS void  bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir );
BLIS_EXPORT_BLIS void  bli_thread_set_num_threads( dim_t value );

void bli_thread_init_rntm_from_env( rntm_t* rntm );

// -----------------------------------------------------------------------------

// Round-robin variant: writes the thread's work id to *start, its n_way to
// *inc, and n to *end. bf and handle_edge_low are accepted so the signature
// matches the slab variant, but they are not used here.
BLIS_INLINE void bli_thread_range_jrir_rr
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
	// Use interleaved partitioning of jr/ir loops.
	*start = bli_thread_work_id( thread );
	*inc   = bli_thread_n_way( thread );
	*end   = n;
}

// Slab variant: lets bli_thread_range_sub() fill *start and *end, then sets
// *inc to 1.
BLIS_INLINE void bli_thread_range_jrir_sl
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
	// Use contiguous slab partitioning of jr/ir loops.
	bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
	*inc = 1;
}

// Dispatcher: forwards all arguments unchanged to the slab or round-robin
// variant, selected at compile time by BLIS_ENABLE_JRIR_SLAB.
BLIS_INLINE void bli_thread_range_jrir
     (
       thrinfo_t* thread,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
	// Define a general-purpose version of bli_thread_range_jrir() whose
	// definition depends on whether slab or round-robin partitioning was
	// requested at configure-time.
#ifdef BLIS_ENABLE_JRIR_SLAB
	bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc );
#else
	bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc );
#endif
}

// Disabled code (compiled out by #if 0).
#if 0
BLIS_INLINE void bli_thread_range_weighted_jrir
     (
       thrinfo_t* thread,
       doff_t     diagoff,
       uplo_t     uplo,
       dim_t      m,
       dim_t      n,
       dim_t      bf,
       bool       handle_edge_low,
       dim_t*     start,
       dim_t*     end,
       dim_t*     inc
     )
{
#ifdef BLIS_ENABLE_JRIR_SLAB

	// Use contiguous slab partitioning for jr/ir loops.
	bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf,
	                               handle_edge_low, start, end );

	*start = *start / bf; *inc = 1;

	// Round *end up to a whole number of bf-sized units.
	if ( *end % bf ) *end = *end / bf + 1;
	else             *end = *end / bf;

#else
	// Use interleaved partitioning of jr/ir loops.
	*start = bli_thread_work_id( thread );
	*inc   = bli_thread_n_way( thread );
	*end   = n;

#endif
}
#endif

#endif
// end bli_thread.h
// begin bli_pthread.h

#ifndef BLIS_PTHREAD_H
#define BLIS_PTHREAD_H

// -- Type and macro definitions -----------------------------------------------

#if defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of "dummy" code that doesn't depend on POSIX threads or any other
// threading mechanism. See issue #454 to see the use case that prompted this
// feature.
// NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE!

// -- pthread types --
// Every type is a plain int placeholder in this branch.

typedef int bli_pthread_t;
typedef int bli_pthread_attr_t;
typedef int bli_pthread_mutex_t;
typedef int bli_pthread_mutexattr_t;
typedef int bli_pthread_cond_t;
typedef int bli_pthread_condattr_t;
typedef int bli_pthread_once_t;
typedef int bli_pthread_barrier_t;
typedef int bli_pthread_barrierattr_t;

// -- pthreads macros --

#define BLIS_PTHREAD_MUTEX_INITIALIZER 0
#define BLIS_PTHREAD_COND_INITIALIZER  0
#define BLIS_PTHREAD_ONCE_INIT         0

#elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of Windows API calls.
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t     bli_pthread_barrier_t;
typedef pthread_barrierattr_t bli_pthread_barrierattr_t;

#endif

// -- pthreads macros --
// Static initializers, forwarded to the system pthreads macros.

#define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#define BLIS_PTHREAD_COND_INITIALIZER  PTHREAD_COND_INITIALIZER
#define BLIS_PTHREAD_ONCE_INIT         PTHREAD_ONCE_INIT

#endif

// -- Function definitions -----------------------------------------------------
// The prototypes below are shared by all branches above; each one mirrors
// the signature of the pthreads function named in its section comment, with
// bli_pthread_* types substituted.

// -- pthread_create(), pthread_join() --

BLIS_EXPORT_BLIS int bli_pthread_create
     (
       bli_pthread_t*            thread,
       const bli_pthread_attr_t* attr,
       void*                   (*start_routine)(void*),
       void*                     arg
     );

BLIS_EXPORT_BLIS int bli_pthread_join
     (
       bli_pthread_t thread,
       void**        retval
     );

// -- pthread_mutex_*() --

BLIS_EXPORT_BLIS int bli_pthread_mutex_init
     (
       bli_pthread_mutex_t*           mutex,
       const bli_pthread_mutexattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_lock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock
     (
       bli_pthread_mutex_t* mutex
     );

// -- pthread_cond_*() --

BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t*           cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t*  cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void              (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//    __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t*           barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int                     count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h

// -- Constant definitions --

// begin bli_extern_defs.h

#ifndef BLIS_EXTERN_DEFS_H
#define BLIS_EXTERN_DEFS_H

// Exported global objects; only their declarations are visible here.
// The two commented-out declarations (ONE_HALF, MINUS_ONE_HALF) are disabled.
BLIS_EXPORT_BLIS extern obj_t BLIS_TWO;
BLIS_EXPORT_BLIS extern obj_t BLIS_ONE;
//BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF;
BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO;
//BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF;
BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE;
BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO;

BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM;
BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED;
BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED;

#endif
// end bli_extern_defs.h

// -- BLIS architecture/kernel definitions --

// begin bli_l1v_ker_prot.h

//
// Define template prototypes for level-1v kernels.
//

// Each *_KER_PROT( ctype, ch, opname ) macro below expands to one function
// prototype: the function name is produced by PASTEMAC(ch,opname) and ctype
// is the element type of the vector/scalar pointer arguments.
//
// No macro ends with a line-continuation backslash after its closing ");".
// A trailing backslash there would splice whatever line follows into the
// macro body, so the definitions are terminated explicitly instead.

#define ADDV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define AMAXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t* restrict cntx  \
     );

#define AXPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define AXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define COPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define DOTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );

#define DOTXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );

#define INVERTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );

#define SCALV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );

#define SCAL2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define SETV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );

#define SUBV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define SWAPV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

#define XPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// end bli_l1v_ker_prot.h
// begin bli_l1f_ker_prot.h

//
// Define template prototypes for level-1f kernels.
// #define AXPY2V_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alphax, \ ctype* restrict alphay, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define AXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define DOTAXPYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); // end bli_l1f_ker_prot.h // begin bli_l1m_ker_prot.h // // Define template prototypes for level-1m kernels. 
// // native packm kernels #define PACKM_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); // native unpackm kernels #define UNPACKM_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); // 1e/1r packm kernels #define PACKM_1ER_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); // end bli_l1m_ker_prot.h // begin bli_l3_ukr_prot.h // // Define template prototypes for level-3 micro-kernels. // #define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname) #define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype_out* restrict alpha, \ ctype_in* restrict a, \ ctype_in* restrict b, \ ctype_out* restrict beta, \ ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); #define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ ctype* restrict a11, \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); #define TRSM_UKR_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); // end bli_l3_ukr_prot.h // 
begin bli_l3_sup_ker_prot.h // // Define template prototypes for level-3 kernels on small/unpacked matrices. // #define GEMMSUP_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); // end bli_l3_sup_ker_prot.h // begin bli_arch_config_pre.h #ifndef BLIS_ARCH_CONFIG_PRE_H #define BLIS_ARCH_CONFIG_PRE_H // -- Naming-related kernel definitions ---------------------------------------- // The default suffix appended to reference kernels. #define BLIS_REF_SUFFIX _ref // A suffix used for labeling certain induced method aware functions. #define BLIS_IND_SUFFIX _ind // Add an underscore to the BLIS kernel set string, if it was defined. #ifdef BLIS_CNAME #define BLIS_CNAME_INFIX PASTECH(_,BLIS_CNAME) #endif // Combine the CNAME and _ref for convenience to the code that defines // reference kernels. //#define BLIS_CNAME_REF_SUFFIX PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX) // -- Prototype-generating macro definitions ----------------------------------- // Prototype-generating macro for bli_cntx_init_*() functions. 
#define CNTX_INIT_PROTS( archname ) \ \ void PASTEMAC(cntx_init_,archname) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \ ( \ ind_t method, \ cntx_t* cntx \ ); #endif // end bli_arch_config_pre.h // begin bli_arch_config.h #ifndef BLIS_ARCH_CONFIG_H #define BLIS_ARCH_CONFIG_H // // -- Context initialization prototypes ---------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_CONFIG_SKX CNTX_INIT_PROTS( skx ) #endif #ifdef BLIS_CONFIG_KNL CNTX_INIT_PROTS( knl ) #endif #ifdef BLIS_CONFIG_KNC CNTX_INIT_PROTS( knc ) #endif #ifdef BLIS_CONFIG_HASWELL CNTX_INIT_PROTS( haswell ) #endif #ifdef BLIS_CONFIG_SANDYBRIDGE CNTX_INIT_PROTS( sandybridge ) #endif #ifdef BLIS_CONFIG_PENRYN CNTX_INIT_PROTS( penryn ) #endif // -- AMD64 architectures -- #ifdef BLIS_CONFIG_ZEN3 CNTX_INIT_PROTS( zen3 ) #endif #ifdef BLIS_CONFIG_ZEN2 CNTX_INIT_PROTS( zen2 ) #endif #ifdef BLIS_CONFIG_ZEN CNTX_INIT_PROTS( zen ) #endif #ifdef BLIS_CONFIG_EXCAVATOR CNTX_INIT_PROTS( excavator ) #endif #ifdef BLIS_CONFIG_STEAMROLLER CNTX_INIT_PROTS( steamroller ) #endif #ifdef BLIS_CONFIG_PILEDRIVER CNTX_INIT_PROTS( piledriver ) #endif #ifdef BLIS_CONFIG_BULLDOZER CNTX_INIT_PROTS( bulldozer ) #endif // -- ARM architectures -- #ifdef BLIS_CONFIG_ARMSVE CNTX_INIT_PROTS( armsve ) #endif #ifdef BLIS_CONFIG_A64FX CNTX_INIT_PROTS( a64fx ) #endif #ifdef BLIS_CONFIG_FIRESTORM CNTX_INIT_PROTS( firestorm ) #endif #ifdef BLIS_CONFIG_THUNDERX2 CNTX_INIT_PROTS( thunderx2 ) #endif #ifdef BLIS_CONFIG_CORTEXA57 CNTX_INIT_PROTS( cortexa57 ) #endif #ifdef BLIS_CONFIG_CORTEXA53 CNTX_INIT_PROTS( cortexa53 ) #endif #ifdef BLIS_CONFIG_CORTEXA15 CNTX_INIT_PROTS( cortexa15 ) #endif #ifdef BLIS_CONFIG_CORTEXA9 CNTX_INIT_PROTS( cortexa9 ) #endif // -- IBM Power -- #ifdef BLIS_CONFIG_POWER10 CNTX_INIT_PROTS( power10 ) #endif #ifdef BLIS_CONFIG_POWER9 CNTX_INIT_PROTS( power9 ) #endif #ifdef 
BLIS_CONFIG_POWER7 CNTX_INIT_PROTS( power7 ) #endif // -- IBM BG/Q -- #ifdef BLIS_CONFIG_BGQ CNTX_INIT_PROTS( bgq ) #endif // -- Generic -- #ifdef BLIS_CONFIG_GENERIC CNTX_INIT_PROTS( generic ) #endif // // -- Architecture family-specific headers ------------------------------------- // // -- x86_64 families -- #ifdef BLIS_FAMILY_INTEL64 #include "bli_family_intel64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64 #include "bli_family_amd64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64_LEGACY #include "bli_family_amd64_legacy.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64 #include "bli_family_x86_64.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_SKX #include "bli_family_x86_64_no_skx.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN2 // begin bli_family_x86_64_no_zen2.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_x86_64_no_zen2.h #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN3 #include "bli_family_x86_64_no_zen3.h" // skipped #endif // -- Intel64 architectures -- #ifdef BLIS_FAMILY_SKX // begin bli_family_skx.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- THREADING PARAMETERS ----------------------------------------------------- #define BLIS_THREAD_RATIO_M 3 #define BLIS_THREAD_RATIO_N 2 #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 4 // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 64 #define BLIS_SIMD_MAX_SIZE 64 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 //#include //#define BLIS_MALLOC_POOL malloc //#define BLIS_FREE_POOL free #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- // -- Cache and register blocksizes -- // // Constraints: // // (1) MC must be a multiple of: // (a) MR (for zero-padding purposes) // (b) NR (for zero-padding purposes when MR and NR are "swapped") // (2) NC must be a multiple of // (a) NR (for zero-padding purposes) // (b) MR (for zero-padding purposes when MR and NR are "swapped") // #define 
BLIS_DGEMM_UKERNEL bli_dgemm_opt_16x12_l2 #define BLIS_DEFAULT_MC_D 144 #define BLIS_DEFAULT_KC_D 336 #define BLIS_DEFAULT_NC_D 5760 #define BLIS_DEFAULT_MR_D 16 #define BLIS_DEFAULT_NR_D 12 #define BLIS_PACKDIM_MR_D 16 #define BLIS_PACKDIM_NR_D 12 // NOTE: If the micro-kernel, which is typically unrolled to a factor // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. //#define BLIS_DEFAULT_KR_S 1 //#define BLIS_DEFAULT_KR_D 1 //#define BLIS_DEFAULT_KR_C 1 //#define BLIS_DEFAULT_KR_Z 1 // -- Maximum cache blocksizes (for optimizing edge cases) -- // NOTE: These cache blocksize "extensions" have the same constraints as // the corresponding default blocksizes above. When these values are // larger than the default blocksizes, blocksizes used at edge cases are // enlarged if such an extension would encompass the remaining portion of // the matrix dimension. #define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) #define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) #define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0) #define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) #define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) #define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0) //#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4) //#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4) //#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4) //#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4) //#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4) //#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4) #endif //#endif // end bli_family_skx.h #endif #ifdef BLIS_FAMILY_KNL #include "bli_family_knl.h" // skipped #endif #ifdef BLIS_FAMILY_KNC #include "bli_family_knc.h" // skipped #endif #ifdef BLIS_FAMILY_HASWELL // begin bli_family_haswell.h //#ifndef 
BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- // -- sgemm micro-kernel -- #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24 #define BLIS_DEFAULT_MC_S 256 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 24 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 6 #define BLIS_DEFAULT_NR_S 16 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 6 #endif // -- dgemm micro-kernel -- #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12 #define BLIS_DEFAULT_MC_D 152 #define BLIS_DEFAULT_KC_D 160 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 12 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 6 #endif // -- cgemm micro-kernel -- #if 1 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 3 #define BLIS_DEFAULT_NR_C 8 #define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x3 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 
#define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 3 #endif // -- zgemm micro-kernel -- #if 1 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 3 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x3 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 3 #endif #endif //#endif // end bli_family_haswell.h #endif #ifdef BLIS_FAMILY_SANDYBRIDGE // begin bli_family_sandybridge.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x4 #define BLIS_DEFAULT_MC_D 96 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 4 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_sandybridge.h #endif #ifdef BLIS_FAMILY_PENRYN // begin bli_family_penryn.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x4 
#define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MC_S 768 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x4 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 4 #define BLIS_DEFAULT_MC_D 384 #define BLIS_DEFAULT_KC_D 384 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_asm_4x4 #define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_asm_4x4 // -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPY2V_KERNEL bli_daxpy2v_int_var1 #define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_int_var1 #define BLIS_DAXPYF_KERNEL bli_daxpyf_int_var1 #define BLIS_DDOTXF_KERNEL bli_ddotxf_int_var1 #define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_int_var1 // -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 #define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 #endif //#endif // end bli_family_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_FAMILY_ZEN3 #include "bli_family_zen3.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN2 #include "bli_family_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN // begin bli_family_zen.h // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops // to be not paralleized. #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 #define BLIS_ENABLE_ZEN_BLOCK_SIZES // Vanilla BLIS disables AMD's small matrix handling by default. #if 0 #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. 
#define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 #define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 #define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 //This macro will enable BLIS DGEMM to choose block sizes for a single instance mode #define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 0 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES 250 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_NAPLES 90 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22 #endif #if 0 // Allow the sup implementation to combine some small edge case iterations in // the 2nd loop of the panel-block algorithm (MR) and/or the 2nd loop of the // block-panel algorithm (NR) with the last full iteration that precedes it. // NOTE: These cpp macros need to be explicitly set to an integer since they // are used at compile-time to create unconditional branches or dead code // regions. 
#define BLIS_ENABLE_SUP_MR_EXT 1 #define BLIS_ENABLE_SUP_NR_EXT 0 #endif // end bli_family_zen.h #endif #ifdef BLIS_FAMILY_EXCAVATOR // begin bli_family_excavator.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 3 #define BLIS_DEFAULT_MC_S 528 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_DEFAULT_MC_D 264 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_DEFAULT_MC_C 264 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #define BLIS_DEFAULT_MC_Z 100 #define BLIS_DEFAULT_KC_Z 320 #define BLIS_DEFAULT_NC_Z 8400 #endif //#endif // end bli_family_excavator.h #endif #ifdef BLIS_FAMILY_STEAMROLLER // begin bli_family_steamroller.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 //#endif // end bli_family_steamroller.h #endif #ifdef BLIS_FAMILY_PILEDRIVER // begin bli_family_piledriver.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MC_S 2016 #define BLIS_DEFAULT_KC_S 128 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DEFAULT_MR_S 16 #define 
BLIS_DEFAULT_NR_S 3 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MC_D 1008 #define BLIS_DEFAULT_KC_D 128 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MC_C 512 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MC_Z 400 #define BLIS_DEFAULT_KC_Z 160 #define BLIS_DEFAULT_NC_Z 8400 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #endif //#endif // end bli_family_piledriver.h #endif #ifdef BLIS_FAMILY_BULLDOZER // begin bli_family_bulldozer.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8_fma4 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x6_fma4 #define BLIS_DEFAULT_MC_D 1080 #define BLIS_DEFAULT_KC_D 120 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 6 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4_fma4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4_fma4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_bulldozer.h #endif // -- ARM families -- #ifdef BLIS_FAMILY_ARM64 #include "bli_family_arm64.h" // skipped #endif #ifdef BLIS_FAMILY_ARM32 #include "bli_family_arm32.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_FAMILY_ARMSVE #include "bli_family_armsve.h" // skipped #endif #ifdef BLIS_FAMILY_A64FX #include 
"bli_family_a64fx.h" // skipped #endif #ifdef BLIS_FAMILY_FIRESTORM #include "bli_family_firestorm.h" // skipped #endif #ifdef BLIS_FAMILY_THUNDERX2 #include "bli_family_thunderx2.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA57 #include "bli_family_cortexa57.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA53 #include "bli_family_cortexa53.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA15 #include "bli_family_cortexa15.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA9 #include "bli_family_cortexa9.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_FAMILY_POWER10 #include "bli_family_power10.h" // skipped #endif #ifdef BLIS_FAMILY_POWER9 #include "bli_family_power9.h" // skipped #endif #ifdef BLIS_FAMILY_POWER7 #include "bli_family_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_FAMILY_BGQ #include "bli_family_bgq.h" // skipped #endif // -- Generic -- #ifdef BLIS_FAMILY_GENERIC // begin bli_family_generic.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_generic.h #endif // // -- kernel set prototypes ---------------------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_KERNELS_SKX // begin bli_kernels_skx.h GEMM_UKR_PROT( float , s, gemm_skx_asm_32x12_l2 ) GEMM_UKR_PROT( float , s, gemm_skx_asm_12x32_l2 ) GEMM_UKR_PROT( double, d, gemm_skx_asm_16x12_l2 ) GEMM_UKR_PROT( double, d, gemm_skx_asm_16x14 ) // end bli_kernels_skx.h #endif #ifdef BLIS_KERNELS_KNL #include "bli_kernels_knl.h" // skipped #endif #ifdef BLIS_KERNELS_KNC #include "bli_kernels_knc.h" // skipped #endif #ifdef BLIS_KERNELS_HASWELL // begin bli_kernels_haswell.h // -- level-1m ----------------------------------------------------------------- // packm (asm) PACKM_KER_PROT( float, s, packm_haswell_asm_6xk ) PACKM_KER_PROT( float, s, packm_haswell_asm_16xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_6xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_8xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_3xk ) PACKM_KER_PROT( scomplex, 
c, packm_haswell_asm_8xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_3xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_4xk ) // -- level-3 ------------------------------------------------------------------ // gemm (asm d6x8) GEMM_UKR_PROT( float, s, gemm_haswell_asm_6x16 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_6x8 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_3x8 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_3x4 ) // gemm (asm d8x6) GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // gemmtrsm_l (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_haswell_asm_6x8 ) // gemmtrsm_u (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 ) // gemm (asm d8x6) //GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) //GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) //GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) //GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // -- level-3 sup -------------------------------------------------------------- // -- single real -- // gemmsup_r GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16 ) GEMMSUP_KER_PROT( 
float, s, gemmsup_rv_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) 
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16n ) // -- double real -- // gemmsup_r GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, 
gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8n ) // gemmsup_rd GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8n ) // end 
bli_kernels_haswell.h #endif #ifdef BLIS_KERNELS_SANDYBRIDGE // begin bli_kernels_sandybridge.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_sandybridge_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_asm_4x4 ) // d8x4 (intrinsics) GEMM_UKR_PROT( float, s, gemm_sandybridge_int_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_int_4x4 ) // end bli_kernels_sandybridge.h #endif #ifdef BLIS_KERNELS_PENRYN // begin bli_kernels_penryn.h GEMM_UKR_PROT( float, s, gemm_penryn_asm_8x4 ) GEMM_UKR_PROT( double, d, gemm_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_l_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_u_penryn_asm_4x4 ) // end bli_kernels_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_KERNELS_ZEN2 #include "bli_kernels_zen2.h" // skipped #endif #ifdef BLIS_KERNELS_ZEN // begin bli_kernels_zen.h // -- level-1m -- PACKM_KER_PROT(double, d, packm_8xk_gen_zen) PACKM_KER_PROT(double, d, packm_6xk_gen_zen) PACKM_KER_PROT(double, d, packm_8xk_nn_zen) PACKM_KER_PROT(double, d, packm_6xk_nn_zen) // -- level-1v -- // amaxv (intrinsics) AMAXV_KER_PROT( float, s, amaxv_zen_int ) AMAXV_KER_PROT( double, d, amaxv_zen_int ) // axpyv (intrinsics) AXPYV_KER_PROT( float, s, axpyv_zen_int ) AXPYV_KER_PROT( double, d, axpyv_zen_int ) // axpyv (intrinsics unrolled x10) AXPYV_KER_PROT( float, s, axpyv_zen_int10 ) AXPYV_KER_PROT( double, d, axpyv_zen_int10 ) // dotv (intrinsics) DOTV_KER_PROT( float, s, dotv_zen_int ) DOTV_KER_PROT( double, d, dotv_zen_int ) // dotv (intrinsics, unrolled x10) DOTV_KER_PROT( float, s, dotv_zen_int10 ) DOTV_KER_PROT( double, d, dotv_zen_int10 ) // dotxv (intrinsics) DOTXV_KER_PROT( float, s, dotxv_zen_int ) 
DOTXV_KER_PROT( double, d, dotxv_zen_int )

// scalv (intrinsics)
SCALV_KER_PROT( float,  s, scalv_zen_int )
SCALV_KER_PROT( double, d, scalv_zen_int )

// scalv (intrinsics unrolled x10)
SCALV_KER_PROT( float,    s, scalv_zen_int10 )
SCALV_KER_PROT( double,   d, scalv_zen_int10 )
SCALV_KER_PROT( scomplex, c, scalv_zen_int10 )

// swapv (intrinsics)
// NOTE: this pair used to be listed a second time after the setv group;
// the repeat was an identical redeclaration and has been dropped.
SWAPV_KER_PROT( float,  s, swapv_zen_int8 )
SWAPV_KER_PROT( double, d, swapv_zen_int8 )

// copyv (intrinsics)
COPYV_KER_PROT( float,  s, copyv_zen_int )
COPYV_KER_PROT( double, d, copyv_zen_int )

// setv (intrinsics)
SETV_KER_PROT( float,  s, setv_zen_int )
SETV_KER_PROT( double, d, setv_zen_int )

// -- level-1f --

// axpyf (intrinsics)
AXPYF_KER_PROT( float,    s, axpyf_zen_int_8 )
AXPYF_KER_PROT( double,   d, axpyf_zen_int_8 )
AXPYF_KER_PROT( float,    s, axpyf_zen_int_5 )
AXPYF_KER_PROT( double,   d, axpyf_zen_int_5 )
AXPYF_KER_PROT( double,   d, axpyf_zen_int_16x4 )
AXPYF_KER_PROT( scomplex, c, axpyf_zen_int_4 )

// dotxf (intrinsics)
DOTXF_KER_PROT( float,  s, dotxf_zen_int_8 )
DOTXF_KER_PROT( double, d, dotxf_zen_int_8 )

// -- level-3 sup --------------------------------------------------------------

// gemmsup_rv (single real); the 6x16 prototype is intentionally left
// commented out, as in the original listing.
//GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16 )

GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x8 )

GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x4 )

GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x2 )

// gemmsup_r (reference kernels, single column)
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_6x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_5x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_4x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_3x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_2x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_1x1 )

// gemmsup_rv (mkernel in m dim)
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m )

// gemmsup_rv (mkernel in n dim)
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16n )

// gemmsup_rd
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x8m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16n) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x2 ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x1 ) // end bli_kernels_zen.h #endif //#ifdef BLIS_KERNELS_EXCAVATOR //#include 
"bli_kernels_excavator.h" //#endif //#ifdef BLIS_KERNELS_STEAMROLLER //#include "bli_kernels_steamroller.h" //#endif #ifdef BLIS_KERNELS_PILEDRIVER // begin bli_kernels_piledriver.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_piledriver_asm_16x3 ) GEMM_UKR_PROT( double, d, gemm_piledriver_asm_8x3 ) GEMM_UKR_PROT( scomplex, c, gemm_piledriver_asm_4x2 ) GEMM_UKR_PROT( dcomplex, z, gemm_piledriver_asm_2x2 ) // end bli_kernels_piledriver.h #endif #ifdef BLIS_KERNELS_BULLDOZER // begin bli_kernels_bulldozer.h GEMM_UKR_PROT( float, s, gemm_bulldozer_asm_8x8_fma4 ) GEMM_UKR_PROT( double, d, gemm_bulldozer_asm_4x6_fma4 ) GEMM_UKR_PROT( scomplex, c, gemm_bulldozer_asm_8x4_fma4 ) GEMM_UKR_PROT( dcomplex, z, gemm_bulldozer_asm_4x4_fma4 ) // end bli_kernels_bulldozer.h #endif // -- ARM architectures -- #ifdef BLIS_KERNELS_ARMSVE #include "bli_kernels_armsve.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV8A #include "bli_kernels_armv8a.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV7A #include "bli_kernels_armv7a.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_KERNELS_POWER10 #include "bli_kernels_power10.h" // skipped #endif #ifdef BLIS_KERNELS_POWER9 #include "bli_kernels_power9.h" // skipped #endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_KERNELS_BGQ #include "bli_kernels_bgq.h" // skipped #endif #endif // end bli_arch_config.h // begin bli_kernel_macro_defs.h #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H // -- Define default threading parameters -------------------------------------- // -- Conventional (large code path) values -- // These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n // dimensions for the purposes of factorizing the total number of threads into // ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these // macros are used. 
#ifndef BLIS_THREAD_RATIO_M
#define BLIS_THREAD_RATIO_M     1
#endif

#ifndef BLIS_THREAD_RATIO_N
#define BLIS_THREAD_RATIO_N     1
#endif

// These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of
// parallelism allowed when performing automatic factorization. See bli_rntm.c
// to see how these macros are used.
#ifndef BLIS_THREAD_MAX_IR
#define BLIS_THREAD_MAX_IR      1
#endif

#ifndef BLIS_THREAD_MAX_JR
#define BLIS_THREAD_MAX_JR      4
#endif

#if 0
// -- Skinny/small possibly-unpacked (sup code path) values --

#ifndef BLIS_THREAD_SUP_RATIO_M
#define BLIS_THREAD_SUP_RATIO_M 1
#endif

#ifndef BLIS_THREAD_SUP_RATIO_N
#define BLIS_THREAD_SUP_RATIO_N 2
#endif

#ifndef BLIS_THREAD_SUP_MAX_IR
#define BLIS_THREAD_SUP_MAX_IR  1
#endif

#ifndef BLIS_THREAD_SUP_MAX_JR
#define BLIS_THREAD_SUP_MAX_JR  8
#endif
#endif


// -- Memory allocation --------------------------------------------------------

// hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with
// libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND
// was explicitly defined.
#ifdef BLIS_DISABLE_MEMKIND
#undef BLIS_ENABLE_MEMKIND
#endif

// The header name must be spelled out here: without it the hbw_malloc() and
// hbw_free() defaults selected below would be used without a declaration.
#ifdef BLIS_ENABLE_MEMKIND
#include <hbwmalloc.h> // skipped
#endif

// Memory allocation functions. These macros define the three types of
// malloc()-style functions, and their free() counterparts: one for each
// type of memory to be allocated.
// NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING
// THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc()
// and free():
//
//   void* malloc( size_t size );
//   void  free( void* p );
//

// This allocation function is called to allocate memory for blocks within
// BLIS's internal memory pools.
#ifndef BLIS_MALLOC_POOL
  // If use of libmemkind was enabled at configure-time, the default
  // memory allocation function for memory pools should be hbw_malloc()
  // instead of malloc().
  #ifdef BLIS_ENABLE_MEMKIND
  #define BLIS_MALLOC_POOL      hbw_malloc
  #else
  #define BLIS_MALLOC_POOL      malloc
  #endif
#endif

#ifndef BLIS_FREE_POOL
  // If use of libmemkind was enabled at configure-time, the default
  // memory deallocation function for memory pools should be hbw_free()
  // instead of free().
  #ifdef BLIS_ENABLE_MEMKIND
  #define BLIS_FREE_POOL        hbw_free
  #else
  #define BLIS_FREE_POOL        free
  #endif
#endif

// This allocation function is called to allocate memory for internally-
// used objects and structures, such as control tree nodes.
#ifndef BLIS_MALLOC_INTL
#define BLIS_MALLOC_INTL        malloc
#endif

#ifndef BLIS_FREE_INTL
#define BLIS_FREE_INTL          free
#endif

// This allocation function is called to allocate memory for objects
// created by user-level API functions, such as bli_obj_create().
#ifndef BLIS_MALLOC_USER
#define BLIS_MALLOC_USER        malloc
#endif

#ifndef BLIS_FREE_USER
#define BLIS_FREE_USER          free
#endif


// -- Other system-related definitions -----------------------------------------

// Size of a virtual memory page. This is used to align blocks within the
// memory pools.
#ifndef BLIS_PAGE_SIZE
#define BLIS_PAGE_SIZE          4096
#endif

// The maximum number of named SIMD vector registers available for use.
// When configuring with umbrella configuration families, this should be
// set to the maximum number of registers across all sub-configurations in
// the family.
#ifndef BLIS_SIMD_MAX_NUM_REGISTERS
#define BLIS_SIMD_MAX_NUM_REGISTERS 32
#endif

// The maximum size (in bytes) of each SIMD vector.
// When configuring with umbrella configuration families, this should be
// set to the maximum SIMD size across all sub-configurations in the family.
#ifndef BLIS_SIMD_MAX_SIZE
#define BLIS_SIMD_MAX_SIZE      64
#endif

// Alignment size (in bytes) needed by the instruction set for aligned
// SIMD/vector instructions.
#ifndef BLIS_SIMD_ALIGN_SIZE
#define BLIS_SIMD_ALIGN_SIZE    BLIS_SIMD_MAX_SIZE
#endif

// The maximum size in bytes of local stack buffers within macro-kernel
// functions. These buffers are usually used to store a temporary copy
// of a single microtile. The reason we multiply by 2 is to handle induced
// methods, where we use real domain register blocksizes in units of
// complex elements. Specifically, the macro-kernels will need this larger
// micro-tile footprint, even though the virtual micro-kernels will only
// ever be writing to half (real or imaginary part) at a time.
#ifndef BLIS_STACK_BUF_MAX_SIZE
#define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_MAX_NUM_REGISTERS * \
                                  BLIS_SIMD_MAX_SIZE * 2 )
#endif

// Alignment size used to align local stack buffers within macro-kernel
// functions.
#ifndef BLIS_STACK_BUF_ALIGN_SIZE
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when allocating memory via BLIS_MALLOC_USER.
// To disable heap alignment, set this to 1.
#ifndef BLIS_HEAP_ADDR_ALIGN_SIZE
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when sizing leading dimensions of memory allocated
// via BLIS_MALLOC_USER.
#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment sizes used when allocating blocks to the internal memory
// pool, via BLIS_MALLOC_POOL.
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A
#define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B
#define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C
#define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN
#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE
#endif

// Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*.
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A #define BLIS_POOL_ADDR_OFFSET_SIZE_A 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B #define BLIS_POOL_ADDR_OFFSET_SIZE_B 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C #define BLIS_POOL_ADDR_OFFSET_SIZE_C 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif #endif // end bli_kernel_macro_defs.h // -- Base operation prototypes -- // begin bli_init.h BLIS_EXPORT_BLIS void bli_init( void ); BLIS_EXPORT_BLIS void bli_finalize( void ); void bli_init_auto( void ); void bli_finalize_auto( void ); void bli_init_apis( void ); void bli_finalize_apis( void ); void bli_init_once( void ); void bli_finalize_once( void ); // end bli_init.h // begin bli_malloc.h // Typedef function pointer types for malloc() and free() substitutes. //typedef void* (*malloc_ft) ( size_t size ); //typedef void (*free_ft) ( void* p ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size ); BLIS_EXPORT_BLIS void bli_free_pool( void* p ); #endif void* bli_malloc_intl( size_t size, err_t* r_val ); void* bli_calloc_intl( size_t size, err_t* r_val ); void bli_free_intl( void* p ); BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val ); BLIS_EXPORT_BLIS void bli_free_user( void* p ); // ----------------------------------------------------------------------------- void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val ); void bli_ffree_align( free_ft f, void* p ); void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val ); void bli_ffree_noalign( free_ft f, void* p ); void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size ); void bli_fmalloc_post_check( void* p ); // end bli_malloc.h // begin bli_const.h void bli_const_init( void ); void bli_const_finalize( void ); // end bli_const.h // begin bli_obj.h // begin bli_obj_check.h void bli_obj_create_check( num_t dt, dim_t m, dim_t 
n, inc_t rs, inc_t cs, obj_t* obj ); void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj ); void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_create_scalar_check( num_t dt, obj_t* obj ); void bli_obj_free_check( obj_t* obj ); void bli_obj_create_const_check( double value, obj_t* obj ); void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b ); void bli_dt_size_check( num_t dt ); void bli_dt_string_check( num_t dt ); void bli_dt_union_check( num_t dt1, num_t dt2 ); void bli_obj_print_check( char* label, obj_t* obj ); // end bli_obj_check.h BLIS_EXPORT_BLIS void bli_obj_create ( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer ( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_without_buffer ( num_t dt, dim_t m, dim_t n, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_alloc_buffer ( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_attach_buffer ( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1 ( num_t dt, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer ( num_t dt, void* p, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_conf_to ( obj_t* s, obj_t* d ); BLIS_EXPORT_BLIS void bli_obj_free ( obj_t* obj ); void bli_adjust_strides ( dim_t m, dim_t n, siz_t elem_size, inc_t* rs, inc_t* cs, inc_t* is ); BLIS_EXPORT_BLIS siz_t bli_dt_size ( num_t dt ); BLIS_EXPORT_BLIS char* bli_dt_string ( num_t dt ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult ( dim_t dim, dim_t dim_mult ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size ( dim_t dim, siz_t elem_size, siz_t align_size ); BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size ( void* p, size_t align_size ); BLIS_EXPORT_BLIS void bli_obj_print ( char* label, 
obj_t* obj ); // end bli_obj.h // begin bli_obj_scalar.h BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached ( num_t dt, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of ( num_t dt, conj_t conj, obj_t* alpha, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_detach ( obj_t* a, obj_t* alpha ); BLIS_EXPORT_BLIS void bli_obj_scalar_attach ( conj_t conj, obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to ( num_t dt, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar ( obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_reset ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_equals ( obj_t* a, obj_t* beta ); // end bli_obj_scalar.h // begin bli_blksz.h // blksz_t query BLIS_INLINE dim_t bli_blksz_get_def ( num_t dt, blksz_t* b ) { return b->v[ dt ]; } BLIS_INLINE dim_t bli_blksz_get_max ( num_t dt, blksz_t* b ) { return b->e[ dt ]; } // blksz_t modification BLIS_INLINE void bli_blksz_set_def ( dim_t val, num_t dt, blksz_t* b ) { b->v[ dt ] = val; } BLIS_INLINE void bli_blksz_set_max ( dim_t val, num_t dt, blksz_t* b ) { b->e[ dt ] = val; } BLIS_INLINE void bli_blksz_copy ( blksz_t* b_src, blksz_t* b_dst ) { *b_dst = *b_src; } BLIS_INLINE void bli_blksz_copy_if_pos ( blksz_t* b_src, blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that // we can skip the ones that are non-positive. 
const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); } BLIS_INLINE void bli_blksz_copy_def_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); bli_blksz_set_def( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_max_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); bli_blksz_set_max( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_scale_def ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_def( dt, b ); bli_blksz_set_def( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_max( dt, b ); bli_blksz_set_max( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_def_max ( 
dim_t num, dim_t den, num_t dt, blksz_t* b ) { bli_blksz_scale_def( num, den, dt, b ); bli_blksz_scale_max( num, den, dt, b ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS blksz_t* bli_blksz_create ( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_easy ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z ); BLIS_EXPORT_BLIS void bli_blksz_free ( blksz_t* b ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); #endif void bli_blksz_reduce_def_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); void bli_blksz_reduce_max_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); // ----------------------------------------------------------------------------- dim_t bli_determine_blocksize ( dir_t direct, dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_b ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); dim_t bli_determine_blocksize_b_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); // end bli_blksz.h // begin bli_func.h // ----------------------------------------------------------------------------- 
// func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); // end bli_func.h // begin bli_mbool.h // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // ----------------------------------------------------------------------------- mbool_t* bli_mbool_create ( bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_init ( mbool_t* b, bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_free( mbool_t* b ); // end bli_mbool.h // begin bli_cntx.h #ifndef BLIS_CNTX_H #define BLIS_CNTX_H // Context object type (defined in bli_type_defs.h) // ----------------------------------------------------------------------------- // // -- cntx_t query (fields only) ----------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx ) { return cntx->blkszs; } BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) { return cntx->bmults; } 
BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) { return cntx->l3_vir_ukrs; } BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs; } BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs_prefs; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) { return cntx->l3_sup_thresh; } BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) { return cntx->l3_sup_blkszs; } BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) { return cntx->l3_sup_kers; } BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) { return cntx->l3_sup_kers_prefs; } BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) { return cntx->l1f_kers; } BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) { return cntx->l1v_kers; } BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) { return cntx->packm_kers; } BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) { return cntx->unpackm_kers; } BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) { return cntx->method; } // ----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. 
return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // 
----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. 
return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only 
index to the requested packm func_t if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* funcs = bli_cntx_packm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the packm func_t (and then extract the // datatype-specific function pointer) if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested unpackm func_t if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the unpackm func_t (and then extract the // datatype-specific function pointer) if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. 
return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. // NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. 
// NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } #if 0 // NOTE: These static functions aren't needed yet. 
BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { const num_t dt = bli_obj_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); } #endif // ----------------------------------------------------------------------------- // // -- cntx_t modification (complex) -------------------------------------------- // // NOTE: The framework does not use any of the following functions. We provide // them in order to facilitate creating/modifying custom contexts. 
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif // end bli_rntm.h // begin bli_gks.h #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* 
bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif // end bli_gks.h // begin bli_ind.h #ifndef BLIS_IND_H #define BLIS_IND_H // level-3 induced method management // begin bli_l3_ind.h #ifndef BLIS_L3_IND_H #define BLIS_L3_IND_H // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- //bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ); void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ); void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ); bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ); #endif // end bli_l3_ind.h void bli_ind_init( void ); void bli_ind_finalize( void ); BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t 
method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); char* bli_ind_get_impl_string( ind_t method ); num_t bli_ind_map_cdt_to_index( num_t dt ); #endif // end bli_ind.h // begin bli_pba.h #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H // Packing block allocator (formerly memory broker) // pba init //BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ // bli_pthread_mutex_init( &(pba->mutex), NULL ); //} //BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ // bli_pthread_mutex_destroy( &(pba->mutex) ); //} // pba query BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { return &(pba->pools[ pool_index ]); } BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { return pba->align_size; } BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { return pba->malloc_fp; } BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { return pba->free_fp; } // pba modification BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { pba->align_size = align_size; } BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { pba->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { pba->free_fp = free_fp; } // pba action BLIS_INLINE void bli_pba_lock( pba_t* pba ) { bli_pthread_mutex_lock( &(pba->mutex) ); } BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( cntx_t* cntx ); void bli_pba_finalize ( void ); 
void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif // end bli_pba.h // begin bli_pool.h #ifndef BLIS_POOL_H #define BLIS_POOL_H // -- Pool block type -- // -- Pool type -- // Pool block query BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) { return pblk->buf; } BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) { return pblk->block_size; } // Pool block modification BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk ) { pblk->buf = buf; } BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) { pblk->block_size = block_size; } // // -- pool block initialization ------------------------------------------------ // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the pblk_t type definition. An alternative to the initializer is // calling bli_pblk_clear() at runtime. 
#define BLIS_PBLK_INITIALIZER \ { \ .buf = NULL, \ .block_size = 0, \ } \ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); bli_pblk_set_block_size( 0, pblk ); } // Pool entry query BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) { return pool->block_size; } BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) { return pool->align_size; } BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) { return pool->offset_size; } BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) { return pool->malloc_fp; } BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) { return pool->free_fp; } BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); } // Pool entry modification BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ { pool->block_size = block_size; } BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ { pool->align_size = align_size; } BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ { pool->offset_size = offset_size; } BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ { pool->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* 
pool ) \ { pool->free_fp = free_fp; } BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } // ----------------------------------------------------------------------------- void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ); void bli_pool_finalize ( pool_t* restrict pool ); void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ); void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ); void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ); void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ); void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ); void bli_pool_print ( pool_t* restrict pool ); void bli_pblk_print ( pblk_t* restrict pblk ); #endif // end bli_pool.h // begin bli_array.h #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // 
----------------------------------------------------------------------------- void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ); void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ); void bli_array_finalize ( array_t* restrict array ); void* bli_array_elem ( const siz_t index, array_t* restrict array ); void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ); #endif // end bli_array.h // begin bli_apool.h #ifndef BLIS_APOOL_H #define BLIS_APOOL_H // -- Locked pool-of-arrays type -- // apool entry query BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool ) { return &(apool->pool); } BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) { return &(apool->mutex); } BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) { return pool->def_array_len; } BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) { pool_t* restrict pool = bli_apool_pool( apool ); return bli_pool_is_exhausted( pool ); } // apool action BLIS_INLINE void bli_apool_lock( apool_t* apool ) { bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); } BLIS_INLINE void bli_apool_unlock( apool_t* apool ) { bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); } // apool entry modification BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ { pool->def_array_len = def_array_len; } // ----------------------------------------------------------------------------- void bli_apool_init ( apool_t* restrict apool ); void bli_apool_finalize ( apool_t* restrict apool ); array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ); void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ); pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ); void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool ); void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ); void bli_apool_free_block 
( array_t* restrict array ); #endif // end bli_apool.h // begin bli_sba.h #ifndef BLIS_SBA_H #define BLIS_SBA_H apool_t* bli_sba_query( void ); // ----------------------------------------------------------------------------- void bli_sba_init( void ); void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( const siz_t n_threads ); void bli_sba_checkin_array ( array_t* restrict array ); void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm ); void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size ); void bli_sba_release ( rntm_t* restrict rntm, void* restrict block ); #endif // end bli_sba.h // begin bli_memsys.h #ifndef BLIS_MEMSYS_H #define BLIS_MEMSYS_H // ----------------------------------------------------------------------------- void bli_memsys_init( void ); void bli_memsys_finalize( void ); #endif // end bli_memsys.h // begin bli_mem.h #ifndef BLIS_MEM_H #define BLIS_MEM_H // mem_t object type (defined in bli_type_defs.h) // // -- mem_t query -------------------------------------------------------------- // BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) { return &(mem->pblk); } BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) { return bli_pblk_buf( bli_mem_pblk( mem ) ); } BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) { return mem->buf_type; } BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) { return mem->pool; } BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) { return mem->size; } BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); } // // -- mem_t modification ------------------------------------------------------- // BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem ) { mem->pblk = *pblk; } BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem ) { bli_pblk_set_buf( buf, &(mem->pblk) ); } BLIS_INLINE void bli_mem_set_buf_type( packbuf_t 
buf_type, mem_t* mem ) { mem->buf_type = buf_type; } BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem ) { mem->pool = pool; } BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; } // // -- mem_t initialization ----------------------------------------------------- // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the mem_t type definition. An alternative to the initializer is // calling bli_mem_clear() at runtime. #define BLIS_MEM_INITIALIZER \ { \ .pblk = BLIS_PBLK_INITIALIZER, \ .buf_type = -1, \ .pool = NULL, \ .size = 0, \ } \ BLIS_INLINE void bli_mem_clear( mem_t* mem ) { bli_mem_set_buffer( NULL, mem ); #ifdef __cplusplus const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE; // When using C++, which is strongly typed, we avoid use of -1 as a // packbuf_t value since it will result in a compile-time error. bli_mem_set_buf_type( pb, mem ); #else bli_mem_set_buf_type( ( packbuf_t )-1, mem ); #endif bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); } #endif // end bli_mem.h // begin bli_part.h // begin bli_part_check.h void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); // end bli_part_check.h // -- Matrix partitioning ------------------------------------------------------ BLIS_EXPORT_BLIS void bli_acquire_mpart ( dim_t i, dim_t j, dim_t m, dim_t n, obj_t* obj, obj_t* sub_obj ); #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) GENPROT( acquire_mpart_b2t ) GENPROT( acquire_mpart_l2r ) GENPROT( acquire_mpart_r2l ) GENPROT( acquire_mpart_tl2br ) GENPROT( 
acquire_mpart_br2tl ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ dir_t direct, \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_mdim ) GENPROT( acquire_mpart_ndim ) GENPROT( acquire_mpart_mndim ) // -- Vector partitioning ------------------------------------------------------ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_vpart_f2b ) GENPROT( acquire_vpart_b2f ) // -- Scalar acquisition ------------------------------------------------------- BLIS_EXPORT_BLIS void bli_acquire_mij ( dim_t i, dim_t j, obj_t* obj, obj_t* sub_obj ); BLIS_EXPORT_BLIS void bli_acquire_vi ( dim_t i, obj_t* obj, obj_t* sub_obj ); // end bli_part.h // begin bli_prune.h void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ); // end bli_prune.h // begin bli_query.h BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a ); // end bli_query.h // begin bli_auxinfo.h #ifndef BLIS_AUXINFO_MACRO_DEFS_H #define BLIS_AUXINFO_MACRO_DEFS_H // auxinfo_t field query BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) { return ai->schema_a; } BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) { return ai->schema_b; } BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) { return ai->a_next; } BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) { return ai->b_next; } BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) { return ai->is_a; } BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) { return ai->is_b; } BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) { return ai->ps_a; } BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) { return ai->ps_b; } BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) { 
return ai->ukr; } BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) { return ai->params; } // auxinfo_t field modification BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai ) { ai->schema_a = schema; } BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) { ai->schema_b = schema; } BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) { ai->a_next = p; } BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) { ai->b_next = p; } BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; } BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai ) { ai->is_a = is; } BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) { ai->is_b = is; } BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai ) { ai->ps_a = ps; } BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) { ai->ps_b = ps; } BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { ai->ukr = ukr; } BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) { ai->params = params; } #endif // end bli_auxinfo.h // begin bli_param_map.h // --- BLIS to BLAS/LAPACK mappings -------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval ); // --- BLAS/LAPACK to BLIS mappings -------------------------------------------- // NOTE: These static functions were converted from regular functions in order // to reduce function call overhead within the BLAS compatibility layer. 
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( 
subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); err_t bli_check_valid_cntl( void* cntl ); err_t bli_check_packm_schema_on_unpack( obj_t* a ); err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); // end bli_check.h // begin bli_error.h BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); void bli_print_msg( char* str, char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); char* bli_error_string_for_code( gint_t code ); // end bli_error.h // begin bli_f2c.h // f2c.h -- Standard Fortran to C header file // barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." // - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) #ifndef BLIS_F2C_H #define BLIS_F2C_H typedef f77_int bla_integer; typedef f77_char bla_character; //typedef char *address; //typedef short int shortint; typedef float bla_real; typedef double bla_double; typedef scomplex bla_scomplex; typedef dcomplex bla_dcomplex; typedef f77_int bla_logical; //typedef short int shortlogical; //typedef char logical1; //typedef char integer1; #ifdef INTEGER_STAR_8 // Adjust for integer*8. 
typedef long long longint; // system-dependent typedef unsigned long long ulongint; // system-dependent #define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) #define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) #endif #ifndef TRUE_ #define TRUE_ (1) #endif #ifndef FALSE_ #define FALSE_ (0) #endif // Extern is for use with -E #ifndef Extern #define Extern extern #endif // I/O stuff #ifdef f2c_i2 // for -i2 //typedef short flag; //typedef short ftnlen; typedef bla_integer ftnlen; //typedef short ftnint; #else //typedef long int flag; //typedef long int ftnlen; typedef bla_integer ftnlen; //typedef long int ftnint; #endif #ifndef VOID #define VOID void #endif #ifndef f2c_abs #define f2c_abs(x) ((x) >= 0 ? (x) : -(x)) #endif #ifndef f2c_dabs #define f2c_dabs(x) (doublereal)f2c_abs(x) #endif #ifndef f2c_min #define f2c_min(a,b) ((a) <= (b) ? (a) : (b)) #endif #ifndef f2c_max #define f2c_max(a,b) ((a) >= (b) ? (a) : (b)) #endif #ifndef f2c_dmin #define f2c_dmin(a,b) (doublereal)f2c_min(a,b) #endif #ifndef f2c_dmax #define f2c_dmax(a,b) (doublereal)f2c_max(a,b) #endif #ifndef bit_test #define bit_test(a,b) ((a) >> (b) & 1) #endif #ifndef bit_clear #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) #endif #ifndef bit_set #define bit_set(a,b) ((a) | ((uinteger)1 << (b))) #endif // undef any lower-case symbols that your C compiler predefines, e.g.: #ifndef Skip_f2c_Undefs #undef cray #undef gcos #undef mc68010 #undef mc68020 #undef mips #undef pdp11 #undef sgi #undef sparc #undef sun #undef sun2 #undef sun3 #undef sun4 #undef u370 #undef u3b #undef u3b2 #undef u3b5 #undef unix #undef vax #endif #endif // end bli_f2c.h // begin bli_machval.h // begin bli_lsame.h bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len ); // end bli_lsame.h // begin bli_slamch.h bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len ); // end bli_slamch.h // begin bli_dlamch.h bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len ); // end 
bli_dlamch.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v ); // // Prototype BLAS-like interfaces. // #undef GENTPROTR #define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \ ( \ machval_t mval, \ void* v \ ); INSERT_GENTPROTR_BASIC0( machval ) // end bli_machval.h // begin bli_getopt.h typedef struct getopt_s { char* optarg; int optind; int opterr; int optopt; } getopt_t; BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state ); BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ); // end bli_getopt.h // begin bli_opid.h BLIS_INLINE bool bli_opid_is_level3( opid_t opid ) { return ( bool ) ( BLIS_GEMM <= opid && opid <= BLIS_TRSM ); } // end bli_opid.h // begin bli_cntl.h // -- Control tree prototypes -- BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, void* params, cntl_t* sub_node ); BLIS_EXPORT_BLIS void bli_cntl_free_node ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_clear_node ( cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_mark_family ( opid_t family, cntl_t* cntl ); // ----------------------------------------------------------------------------- dim_t bli_cntl_calc_num_threads_in ( rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- // cntl_t query (fields only) BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl ) { return cntl->family; } 
BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl ) { return cntl->bszid; } BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl ) { return cntl->var_func; } BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl ) { return cntl->sub_prenode; } BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl ) { return cntl->sub_node; } BLIS_INLINE void* bli_cntl_params( cntl_t* cntl ) { return cntl->params; } BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl ) { // The first 64 bytes is always the size of the params structure. return *( ( uint64_t* )(cntl->params) ); } BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) { return &(cntl->pack_mem); } // cntl_t query (complex) BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl ) { return ( bool ) ( cntl == NULL ); } BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl ) { return ( bool ) ( bli_cntl_sub_node( cntl ) == NULL ); } BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl ) { return ( bool ) ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); } // cntl_t modification BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl ) { cntl->family = family; } BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl ) { cntl->bszid = bszid; } BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl ) { cntl->var_func = var_func; } BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl ) { cntl->sub_prenode = sub_prenode; } BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl ) { cntl->sub_node = sub_node; } BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl ) { cntl->params = params; } BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) { cntl->pack_mem = *pack_mem; } // end bli_cntl.h // begin bli_env.h #ifndef BLIS_ENV_H #define BLIS_ENV_H gint_t bli_env_get_var( const char* env, gint_t fallback ); //void bli_env_set_var( const char* env, dim_t value ); #endif // end bli_env.h // begin bli_pack.h #ifndef BLIS_PACK_H #define BLIS_PACK_H 
void bli_pack_init( void ); void bli_pack_finalize( void ); BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a ); BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b ); BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a ); BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b ); void bli_pack_init_rntm_from_env( rntm_t* rntm ); #endif // end bli_pack.h // begin bli_info.h // -- General library information ---------------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_version_str( void ); BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void ); // -- General configuration-related -------------------------------------------- BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void ); 
// Remaining general configuration queries; each returns a gint_t.
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void );


// -- Kernel implementation-related --------------------------------------------


// -- Level-3 kernel definitions --

// Each query takes an ind_t method and a num_t datatype and returns a char*.
BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt );


// -- BLIS implementation query (level-3) --------------------------------------

// Each query takes only a num_t datatype and returns a char*.
BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm3_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt );

// end bli_info.h
// begin bli_arch.h

#ifndef BLIS_ARCH_H
#define BLIS_ARCH_H

// Architecture id query/selection and logging controls. Only
// bli_arch_query_id() and bli_arch_string() carry the export attribute.
BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void );

void   bli_arch_set_id_once( void );
void   bli_arch_set_id( void );

BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id );

void   bli_arch_set_logging( bool dolog );
bool   bli_arch_get_logging( void );
// Variadic; the first argument is presumably a printf-style format string
// -- confirm against the definition.
void   bli_arch_log( char*, ... );

#endif

// end bli_arch.h
// begin bli_cpuid.h

// The #if 0 region below is never compiled; it is kept as scaffolding for
// building the cpuid code on its own.
#if 0
  // Used only during standalone testing of ARM support.
  #define FALSE 0
  #define TRUE 1
  typedef enum
  {
	BLIS_ARCH_CORTEXA57 = 10,
	BLIS_ARCH_CORTEXA15 = 11,
	BLIS_ARCH_CORTEXA9 = 12,
	BLIS_ARCH_GENERIC = 13
  } arch_t;
  typedef uint64_t bool;
  #define bli_abort abort
#endif

#ifndef BLIS_CPUID_H
#define BLIS_CPUID_H

arch_t   bli_cpuid_query_id( void );

// Intel
bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features );

// AMD
bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features );

// ARM
// NOTE: the ARM predicates take (model, part, features) rather than the
// (family, model, features) triple used by the Intel/AMD predicates above.
bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );

uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features );

// -----------------------------------------------------------------------------

//
// This section of the file was based off of cpuid.hpp from TBLIS [1].
//
// [1] https://github.com/devinamatthews/tblis
//

// Return true only when every bit set in `want` is also set in `have`.
BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want )
{
	return ( have & want ) == want;
}

// -----------------------------------------------------------------------------

#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)

// cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393
// for more information why this move was made.
//#include "cpuid.h"

void get_cpu_name( char *cpu_name );
int  vpu_count( void );

enum
{
	VENDOR_INTEL = 0,
	VENDOR_AMD,
	VENDOR_UNKNOWN
};
// Each FEATURE_* value is a distinct single bit, so values can be OR-ed
// into one mask and tested with bli_cpuid_has_features().
enum
{
	FEATURE_SSE3     = 0x0001,
	FEATURE_SSSE3    = 0x0002,
	FEATURE_SSE41    = 0x0004,
	FEATURE_SSE42    = 0x0008,
	FEATURE_AVX      = 0x0010,
	FEATURE_AVX2     = 0x0020,
	FEATURE_FMA3     = 0x0040,
	FEATURE_FMA4     = 0x0080,
	FEATURE_AVX512F  = 0x0100,
	FEATURE_AVX512DQ = 0x0200,
	FEATURE_AVX512PF = 0x0400,
	FEATURE_AVX512ER = 0x0800,
	FEATURE_AVX512CD = 0x1000,
	FEATURE_AVX512BW = 0x2000,
	FEATURE_AVX512VL = 0x4000
};

#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)

char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath );

enum
{
	VENDOR_ARM = 0,
	VENDOR_UNKNOWN
};
enum
{
	MODEL_ARMV7 = 0,
	MODEL_ARMV8,
	MODEL_UNKNOWN
};
// Single-bit feature flags, as on the x86 side.
enum
{
	FEATURE_NEON = 0x01,
	FEATURE_SVE  = 0x02
};

#endif

#endif

// end bli_cpuid.h
// begin bli_string.h

void bli_string_mkupper( char* s );

// end bli_string.h
// begin bli_setgetijm.h

// Object-based setter: takes the real (ar) and imaginary (ai) parts as
// doubles plus the element indices (i, j) and the target object b.
BLIS_EXPORT_BLIS err_t bli_setijm
     (
       double  ar,
       double  ai,
       dim_t   i,
       dim_t   j,
       obj_t*  b
     );

// Typed setter template: b is passed as an untyped buffer with row/column
// strides rs and cs. GENTPROT is redefined before each use, so the #undef /
// #define / INSERT_* order below must be preserved.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double         ar, \
       double         ai, \
       dim_t          i, \
       dim_t          j, \
       void* restrict b, inc_t rs, inc_t cs  \
     );

INSERT_GENTPROT_BASIC0( setijm )

// -----------------------------------------------------------------------------

// Object-based getter: the real and imaginary parts are returned through
// ar and ai.
BLIS_EXPORT_BLIS err_t bli_getijm
      (
        dim_t   i,
        dim_t   j,
        obj_t*  b,
        double* ar,
        double* ai
      );

// Typed getter template (same buffer/stride convention as the setter).
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       dim_t          i, \
       dim_t          j, \
       void* restrict b, inc_t rs, inc_t cs, \
       double*        ar, \
       double*        ai  \
     );

INSERT_GENTPROT_BASIC0( getijm )

// end bli_setgetijm.h
// begin bli_setgetijv.h

// Vector counterparts of setijm/getijm: a single index i and a single
// increment incx.
BLIS_EXPORT_BLIS err_t bli_setijv
     (
       double  ar,
       double  ai,
       dim_t   i,
       obj_t*  x
     );

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double         ar, \
       double         ai, \
       dim_t          i, \
       void* restrict x, inc_t incx  \
     );

INSERT_GENTPROT_BASIC0( setijv )

// -----------------------------------------------------------------------------

BLIS_EXPORT_BLIS err_t bli_getijv
      (
        dim_t   i,
        obj_t*  x,
        double* ar,
        double* ai
      );

// NOTE: the buffer parameter of this typed template is named b even though
// the object-based prototype above names the vector x.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       dim_t          i, \
       void* restrict b, inc_t incx, \
       double*        ar, \
       double*        ai  \
     );

INSERT_GENTPROT_BASIC0( getijv )

// end bli_setgetijv.h
// begin bli_setri.h

// -- setr ---------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_setrm
     (
       obj_t* alpha,
       obj_t* b
     );

BLIS_EXPORT_BLIS void bli_setrv
     (
       obj_t* alpha,
       obj_t* x
     );

// -- seti ---------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_setim
     (
       obj_t* alpha,
       obj_t* b
     );

BLIS_EXPORT_BLIS void bli_setiv
     (
       obj_t* alpha,
       obj_t* x
     );

// end bli_setri.h
// begin bli_castm.h

//
// Prototype object-based interface.
//

BLIS_EXPORT_BLIS void bli_castm
     (
       obj_t* a,
       obj_t* b
     );

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//

// Two-datatype template: the generated name is built from both type
// characters (cha, chb) and opname via PASTEMAC2.
#undef  GENTPROT2
#define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \
     ( \
       trans_t transa, \
       dim_t   m, \
       dim_t   n, \
       void*   a, inc_t rs_a, inc_t cs_a, \
       void*   b, inc_t rs_b, inc_t cs_b  \
     );

INSERT_GENTPROT2_BASIC0( castm )
INSERT_GENTPROT2_MIXDP0( castm )

//
// Prototype object-based _check() function.
//

void bli_castm_check
     (
       obj_t* a,
       obj_t* b
     );

// end bli_castm.h
// begin bli_castnzm.h

//
// Prototype object-based interface.
//

BLIS_EXPORT_BLIS void bli_castnzm
     (
       obj_t* a,
       obj_t* b
     );

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// Typed prototype templates. GENTPROT takes one C type (ctype) and its type
// character (ch); GENTPROTR additionally takes a second type (ctype_r, chr)
// that is used for the absq / zeta_r / zeta_i operands below. Each template
// is redefined before the operations that share its argument shape, so the
// #undef / #define / INSERT_* order must be preserved.

// Shape: ( conjchi, chi, psi )
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi  \
     );

INSERT_GENTPROT_BASIC0( addsc )
INSERT_GENTPROT_BASIC0( divsc )
INSERT_GENTPROT_BASIC0( mulsc )
INSERT_GENTPROT_BASIC0( subsc )


// Shape: ( conjchi, chi )
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi  \
     );

INSERT_GENTPROT_BASIC0( invertsc )


// Shape: ( chi, absq ) where absq has the second type ctype_r
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype*   chi, \
       ctype_r* absq  \
     );

INSERT_GENTPROTR_BASIC0( absqsc )
INSERT_GENTPROTR_BASIC0( normfsc )


// Shape: ( chi, psi ) with no conj argument
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype*  chi, \
       ctype*  psi  \
     );

INSERT_GENTPROT_BASIC0( sqrtsc )


// Shape: ( chi, double* zeta_r, double* zeta_i )
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype*  chi, \
       double* zeta_r, \
       double* zeta_i  \
     );

INSERT_GENTPROT_BASIC0( getsc )


// Shape: ( double zeta_r, double zeta_i, chi )
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double  zeta_r, \
       double  zeta_i, \
       ctype*  chi  \
     );

INSERT_GENTPROT_BASIC0( setsc )


// Shape: ( chi, zeta_r, zeta_i ) where zeta_* have the second type ctype_r
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype*   chi, \
       ctype_r* zeta_r, \
       ctype_r* zeta_i  \
     );

INSERT_GENTPROTR_BASIC0( unzipsc )


// Shape: ( zeta_r, zeta_i, chi ) where zeta_* have the second type ctype_r
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype_r* zeta_r, \
       ctype_r* zeta_i, \
       ctype*   chi  \
     );

INSERT_GENTPROTR_BASIC0( zipsc )

// -----------------------------------------------------------------------------

// Hand-written getsc/setsc prototypes whose scalar is a dim_t (the i prefix
// presumably denotes the integer datatype -- confirm).
BLIS_EXPORT_BLIS void bli_igetsc
     (
       dim_t*  chi,
       double* zeta_r,
       double* zeta_i
     );

BLIS_EXPORT_BLIS void bli_isetsc
     (
       double  zeta_r,
       double  zeta_i,
       dim_t*  chi
     );

// end bli_l0_tapi.h
// begin bli_l0_ft.h


//
// -- Level-0 function types ---------------------------------------------------
//

// Each GENTDEF / GENTDEFR below is a template for a function-pointer typedef
// whose name is built from ch, opname and tsuf via PASTECH2. The argument
// lists mirror the typed prototypes of the same operations above.

// addsc, divsc, subsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi  \
     );

INSERT_GENTDEF( addsc )
INSERT_GENTDEF( divsc )
INSERT_GENTDEF( subsc )

// invertsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi  \
     );

INSERT_GENTDEF( invertsc )

// mulsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi  \
     );

INSERT_GENTDEF( mulsc )

// absqsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype*   chi, \
       ctype_r* absq  \
     );

INSERT_GENTDEFR( absqsc )

// normfsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype*   chi, \
       ctype_r* norm  \
     );

INSERT_GENTDEFR( normfsc )

// sqrtsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype*  chi, \
       ctype*  psi  \
     );

INSERT_GENTDEF( sqrtsc )

// getsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype*  chi, \
       double* zeta_r, \
       double* zeta_i  \
     );

INSERT_GENTDEF( getsc )

// setsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       double  zeta_r, \
       double  zeta_i, \
       ctype*  chi  \
     );

INSERT_GENTDEF( setsc )

// unzipsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype*   chi, \
       ctype_r* zeta_r, \
       ctype_r* zeta_i  \
     );

INSERT_GENTDEFR( unzipsc )

// zipsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype_r* zeta_r, \
       ctype_r* zeta_i, \
       ctype*   chi  \
     );

INSERT_GENTDEFR( zipsc )


// end bli_l0_ft.h

// Generate function pointer arrays for tapi functions.
// begin bli_l0_fpa.h


//
// Prototype function pointer query interface.
//

// Template for a query function taking a num_t datatype. The return type is
// built from opname and the _vft suffix via PASTECH, and the function name
// from opname and the _qfp suffix via PASTEMAC.
#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( absqsc )
GENPROT( normfsc )
GENPROT( addsc )
GENPROT( divsc )
GENPROT( mulsc )
GENPROT( subsc )
GENPROT( invertsc )
GENPROT( sqrtsc )
GENPROT( unzipsc )
GENPROT( zipsc )

GENPROT( getsc )
GENPROT( setsc )

// end bli_l0_fpa.h

// copysc
// begin bli_copysc.h


//
// Prototype object-based interfaces.
//

#undef  GENFRONT
#define GENFRONT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  chi, \
       obj_t*  psi  \
     );

GENFRONT( copysc )


//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//

// Two-datatype template: both operands are passed as untyped pointers and
// the name is built from both type characters via PASTEMAC2.
#undef  GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \
     ( \
       conj_t conjchi, \
       void*  chi, \
       void*  psi  \
     );

INSERT_GENTPROT2_BASIC0( copysc )
INSERT_GENTPROT2_MIX_D0( copysc )
INSERT_GENTPROT2_MIX_P0( copysc )

// end bli_copysc.h
// end bli_l0.h


// -- Level-1v operations --

// begin bli_l1v.h

// begin bli_l1v_check.h


//
// Prototype object-based check functions.
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h" // begin bli_l1v_ft_ker.h #ifndef BLIS_L1V_FT_KER_H #define BLIS_L1V_FT_KER_H // // -- Level-1v kernel function types ------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict index, \ cntx_t* cntx \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ cntx_t* cntx \ ); 
INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( xpbyv ) #endif // end bli_l1v_ft_ker.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1v_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyv ) // end 
bli_l1v_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1v_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyv ) // end bli_l1v_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed 
APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1d_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Expert pass: with the definitions above, every template below uses
// EX_SUF = BLIS_TAPI_EX_SUF and ends its parameter list with
// ", cntx_t* cntx, rntm_t* rntm".
// NOTE(review): INSERT_GENTPROT_BASIC0 / INSERT_GENTPROTR_BASIC0 are defined
// outside this excerpt; presumably they instantiate GENTPROT / GENTPROTR once
// per supported datatype -- confirm against their definitions.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid is the only operation here whose alpha uses the separate ctype_r
// template parameter instead of ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//

// The typedefs below declare function-pointer types whose parameter lists
// mirror the prototypes above, one template per operand list.

// addd, copyd, subd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// Basic (non-expert) pass: EX_SUF and BLIS_TAPI_EX_PARAMS are defined as
// empty by bli_tapi_ba.h immediately before this header, so the generated
// names carry no suffix and no cntx/rntm arguments are appended.
// NOTE(review): INSERT_GENTPROT_BASIC0 / INSERT_GENTPROTR_BASIC0 are defined
// outside this excerpt; presumably they instantiate GENTPROT / GENTPROTR once
// per supported datatype -- confirm against their definitions.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid is the only operation here whose alpha uses the separate ctype_r
// template parameter instead of ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//

// The typedefs below declare function-pointer types whose parameter lists
// mirror the prototypes above, one template per operand list.

// addd, copyd, subd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1d_fpa.h

//
// Prototype function pointer query interface.
//

// One query prototype per operation, taking a num_t and returning the
// operation's "_vft" type. BLIS_TAPI_EX_SUF is used directly here rather than
// EX_SUF, which was un-defined just above.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addd )
GENPROT( copyd )
GENPROT( subd )
GENPROT( axpyd )
GENPROT( scal2d )
GENPROT( invertd )
GENPROT( scald )
GENPROT( setd )
GENPROT( setid )
GENPROT( shiftd )
GENPROT( xpbyd )

// end bli_l1d_fpa.h
// end bli_l1d.h

// -- Level-1f operations --

// begin bli_l1f.h
// begin bli_l1f_check.h

//
// Prototype object-based check functions.
//

// Each GENTPROT below declares a "<opname>_check" function taking only the
// obj_t operands of the operation. Unlike the API prototypes, these are not
// marked BLIS_EXPORT_BLIS and take no expert parameters.

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alphax, \
       obj_t* alphay, \
       obj_t* x, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( axpy2v )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* y \
     );

GENTPROT( axpyf )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* xt, \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho, \
       obj_t* z \
     );

GENTPROT( dotaxpyv )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* at, \
       obj_t* a, \
       obj_t* w, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( dotxaxpyf )

#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
     );

GENTPROT( dotxf )

// end bli_l1f_check.h

// Define kernel function types.
// begin bli_l1f_ft_ker.h

#ifndef BLIS_L1F_FT_KER_H
#define BLIS_L1F_FT_KER_H

//
// -- Level-1f kernel function types -------------------------------------------
//

// Kernel function-pointer types: every pointer operand is restrict-qualified
// and the only trailing argument is a cntx_t* (there is no rntm_t* here).
// NOTE(review): INSERT_GENTDEF is defined outside this excerpt; presumably it
// instantiates GENTDEF once per datatype -- confirm.

// axpy2v

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha1, \
       ctype* restrict alpha2, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict w, inc_t incw, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxaxpyf )

#endif

// end bli_l1f_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). 
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Expert pass: with the definitions above, every template below uses
// EX_SUF = BLIS_TAPI_EX_SUF and ends its parameter list with
// ", cntx_t* cntx, rntm_t* rntm".
// NOTE(review): INSERT_GENTPROT_BASIC0 is defined outside this excerpt;
// presumably it instantiates GENTPROT once per supported datatype -- confirm.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//

// Function-pointer types matching the prototypes above.
// NOTE(review): some parameter names differ from the prototypes (axpy2v uses
// alpha1/alpha2 here vs alphax/alphay above; dotaxpyv uses "dim_t m" here vs
// "dim_t n" above). The parameter types and order are the same.

// axpy2v

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h
// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;

// end bli_tapi_ba.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Basic (non-expert) pass: BLIS_TAPI_EX_PARAMS is defined as empty above, so
// no cntx/rntm arguments are appended to the parameter lists below.
// NOTE(review): INSERT_GENTPROT_BASIC0 is defined outside this excerpt;
// presumably it instantiates GENTPROT once per supported datatype -- confirm.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//

// Function-pointer types matching the prototypes above.
// NOTE(review): some parameter names differ from the prototypes (axpy2v uses
// alpha1/alpha2 here vs alphax/alphay above; dotaxpyv uses "dim_t m" here vs
// "dim_t n" above). The parameter types and order are the same.

// axpy2v

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1f_fpa.h

//
// Prototype function pointer query interface.
//

// One query prototype per operation, taking a num_t and returning the
// operation's "_vft" type. BLIS_TAPI_EX_SUF is used directly here rather than
// EX_SUF, which was un-defined just above.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( axpy2v )
GENPROT( axpyf )
GENPROT( dotaxpyv )
GENPROT( dotxaxpyf )
GENPROT( dotxf )

// end bli_l1f_fpa.h
// end bli_l1f.h

// -- Level-1m operations --

// begin bli_l1m.h
// begin bli_l1m_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The expansion begins with a comma so that it can be placed directly after
// the last regular parameter, with no separator written at the use site.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1m_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
//

// Expert pass: EX_SUF is BLIS_TAPI_EX_SUF and BLIS_TAPI_EX_PARAMS appends
// ",cntx_t* cntx, rntm_t* rntm" to every prototype below. GENTPROT is
// re-defined once per argument shape; keep each #undef / #define /
// INSERT_GENTPROT_BASIC0(...) group together and in order.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )


// Two-datatype template: x has element type ctype_x, while beta and y share
// ctype_y. The function name is formed from both type characters (chx, chy).
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h


//
// -- Level-1m function types --------------------------------------------------
//

// The typedef names below are formed by PASTECH3(ch,opname,EX_SUF,tsuf), so
// this expert pass and the later basic pass (where EX_SUF is empty) produce
// differently named function-pointer types from the same templates.

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m
// (Same argument list as the axpym template above.)

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym
// NOTE(review): xpbym_md is instantiated from this single-ctype template,
// whereas its prototype uses separate ctype_x / ctype_y; confirm this is
// intended.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )

// end bli_l1m_ft.h
// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_tapi_ba.h


// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;

// end bli_tapi_ba.h
// begin bli_l1m_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
//

// Basic pass: the same templates as in the expert pass, but EX_SUF and
// BLIS_TAPI_EX_PARAMS are now empty, so the prototypes carry no suffix and
// no trailing cntx/rntm parameters.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )


#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h


//
// -- Level-1m function types --------------------------------------------------
//

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )

// end bli_l1m_ft.h
// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1m_fpa.h


//
// Prototype function pointer query interface.
//

// One-datatype query: declares PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp), which
// takes a single num_t and whose return type is
// PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft).
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
GENPROT( axpym )
GENPROT( scal2m )
GENPROT( scalm )
GENPROT( setm )
GENPROT( xpbym )

// Two-datatype query (_qfp2): takes one num_t for x and one for y.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );

GENPROT( xpbym_md )

// end bli_l1m_fpa.h

// Prototype level-1m implementations.
// begin bli_l1m_unb_var1.h


//
// Prototype BLAS-like interfaces with typed operands.
//

// The _unb_var1 prototypes below always take explicit "cntx_t* cntx" and
// "rntm_t* rntm" parameters written out in the template, rather than through
// an *_EX_PARAMS macro, and are not marked BLIS_EXPORT_BLIS. GENTPROT is
// re-defined once per argument shape, so the groups must stay in this order.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( xpbym )


// Two-datatype template: x uses ctype_x, while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
void PASTEMAC3(chx,chy,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_unb_var1.h

// Pack-related
// begin bli_packm.h


// begin bli_packm_alloc.h

// Both allocation prototypes return void*. The _ex form additionally takes an
// explicit packbuf_t pack_buf_type.
// NOTE(review): presumably bli_packm_alloc obtains the buffer type from cntl;
// confirm against the definition, which is not visible here.

BLIS_EXPORT_BLIS void* bli_packm_alloc
     (
       siz_t size_needed,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void* bli_packm_alloc_ex
     (
       siz_t size_needed,
       packbuf_t pack_buf_type,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_packm_alloc.h
// begin bli_packm_cntl.h


// Parameter record for packm control-tree nodes. The accessor functions below
// read it through cntl->params. Its fields mirror, in the same order, the
// bmid_m ... pack_buf_type arguments of bli_packm_cntl_create_node().
struct packm_params_s
{
	uint64_t size; // size field must be present and come first.
	bszid_t bmid_m;
	bszid_t bmid_n;
	bool does_invert_diag;
	bool rev_iter_if_upper;
	bool rev_iter_if_lower;
	pack_t pack_schema;
	packbuf_t pack_buf_type;
};
typedef struct packm_params_s packm_params_t;

// Field accessors: each one casts cntl->params to packm_params_t* and returns
// the field named in the function. No NULL or type check is performed, so the
// caller must pass a node whose params really is a packm_params_t.

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m;
}

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n;
}

BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower;
}

BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema;
}

BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type;
}

// -----------------------------------------------------------------------------

// Node constructor prototype; returns a cntl_t*.
cntl_t* bli_packm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       bszid_t bmid_m,
       bszid_t bmid_n,
       bool does_invert_diag,
       bool rev_iter_if_upper,
       bool rev_iter_if_lower,
       pack_t pack_schema,
       packbuf_t pack_buf_type,
       cntl_t* sub_node
     );

// end bli_packm_cntl.h
// begin bli_packm_check.h


void bli_packm_init_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

void bli_packm_int_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

// end bli_packm_check.h
// begin bli_packm_init.h

// bli_packm_init returns a bool.
// NOTE(review): the meaning of the return value is not documented here;
// confirm against the definition.

BLIS_EXPORT_BLIS bool bli_packm_init
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_packm_init.h
// begin bli_packm_int.h


void bli_packm_int
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_int.h
// begin bli_packm_scalar.h


BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p );

// end bli_packm_scalar.h

// begin bli_packm_part.h


// -- Matrix partitioning ------------------------------------------------------

// The three acquire prototypes share one shape and differ only in the name
// suffix (t2b, l2r, tl2br) and in the name of the index argument (i, j, ij).

void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
                                  dim_t i,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
                                  dim_t j,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_tl2br( subpart_t requested_part,
                                    dim_t ij,
                                    dim_t b,
                                    obj_t* obj,
                                    obj_t* sub_obj );

dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p );

// end bli_packm_part.h

// begin bli_packm_struc_cxk.h

// NOTE(review): this prototype ends at "cntx_t* cntx", whereas the packm
// kernel function type and the _1er / _md structure-aware prototypes in this
// header all end with an additional "void* params". Confirm this matches the
// function definitions.

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_struc_cxk )
INSERT_GENTPROT_BASIC0( packm_herm_cxk )
INSERT_GENTPROT_BASIC0( packm_tri_cxk )

// end bli_packm_struc_cxk.h
// begin bli_packm_struc_cxk_1er.h

// GENTPROTCO receives ctype_r and chr in addition to ctype and ch, but the
// prototype body below uses only ctype and ch.

#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er )

// end bli_packm_struc_cxk_1er.h

// begin bli_packm_cxk.h

// Here the argument order is panel_dim, panel_dim_max, panel_len,
// panel_len_max, which differs from the structure-aware prototypes above
// (panel_dim, panel_len, panel_dim_max, panel_len_max).

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_cxk )

// end bli_packm_cxk.h
// begin bli_packm_cxk_1er.h


#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROTCO_BASIC0( packm_cxk_1er )

// end bli_packm_cxk_1er.h

// Mixed datatype support.
#ifdef BLIS_ENABLE_GEMM_MD
// Everything up to the matching #endif is compiled only when
// BLIS_ENABLE_GEMM_MD is defined.
// begin bli_packm_struc_cxk_md.h

// Two-datatype structure-aware packing prototype: the source matrix c uses
// ctype_c, while kappa and the packed panel p use ctype_p.

#undef GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype_p* restrict kappa, \
       ctype_c* restrict c, inc_t incc, inc_t ldc, \
       ctype_p* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )


// Two-datatype 1e / 1r packing prototypes: a uses ctype_a, while kappa and p
// use ctype_p. These take no cntx argument.

#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t conja, \
       dim_t m, \
       dim_t n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p, inc_t ldp \
     );

INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )

INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )

// end bli_packm_struc_cxk_md.h
#endif

// begin bli_packm_blk_var1.h


//
// packm params types.
//

// Holds a BLIS_NUM_FP_TYPES x BLIS_NUM_FP_TYPES table of packm_ker_vft
// entries. Per the column labels below, the first index is the type of C and
// the second is the type of P.
typedef struct
{
	// Type of C  Type of P
	packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;

//
// Prototype object-based interfaces.
//

BLIS_EXPORT_BLIS void bli_packm_blk_var1
     (
       obj_t* c,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* t
     );

// end bli_packm_blk_var1.h

// end bli_packm.h
// begin bli_unpackm.h


// begin bli_unpackm_cntl.h


// Parameter record for unpackm control-tree nodes; it carries only the
// variant function pointer (besides the leading size field).
struct unpackm_params_s
{
	uint64_t size; // size field must be present and come first.
	unpackm_var_oft var_func;
};
typedef struct unpackm_params_s unpackm_params_t;

// Accessor macro: casts (cntl)->params to unpackm_params_t* and yields its
// var_func member. The argument is parenthesized, so any expression that
// evaluates to a cntl_t* may be passed.
#define bli_cntl_unpackm_params_var_func( cntl ) \
\
	( ( (unpackm_params_t*)(cntl)->params )->var_func )

// -----------------------------------------------------------------------------

// Node constructor prototype. It takes two function pointers, both typed as
// the generic void_fp: var_func and unpackm_var_func.
cntl_t* bli_unpackm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       void_fp unpackm_var_func,
       cntl_t* sub_node
     );

// end bli_unpackm_cntl.h
// begin bli_unpackm_check.h


void bli_unpackm_int_check
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx
     );

// end bli_unpackm_check.h
// begin bli_unpackm_int.h


void bli_unpackm_int
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_unpackm_int.h

// begin bli_unpackm_blk_var1.h


void bli_unpackm_blk_var1
     (
       obj_t* p,
       obj_t* c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// Typed variant prototype. The p and c operands are declared as void* here,
// so the ctype template parameter is not used in the argument list.

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       doff_t diagoffc, \
       diag_t diagc, \
       uplo_t uploc, \
       trans_t transc, \
       dim_t m, \
       dim_t n, \
       dim_t m_panel, \
       dim_t n_panel, \
       void* p, inc_t rs_p, inc_t cs_p, \
       dim_t pd_p, inc_t ps_p, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )

// end bli_unpackm_blk_var1.h

// begin bli_unpackm_cxk.h


#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conjp, \
       dim_t panel_dim, \
       dim_t panel_len, \
       ctype* kappa, \
       ctype* p, inc_t ldp, \
       ctype* a, inc_t inca, inc_t lda, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_cxk )

// end bli_unpackm_cxk.h

// end bli_unpackm.h
// end bli_l1m.h

// -- Level-2 operations --

// begin bli_l2.h


// begin bli_l2_check.h


//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// Each template declares an exported typed level-2 API whose name is pasted
// from ch, opname and EX_SUF, and whose parameter list ends with whatever
// BLIS_TAPI_EX_PARAMS currently expands to.
// NOTE(review): PASTEMAC2 and INSERT_GENTPROT_BASIC0/INSERT_GENTPROTR_BASIC0
// are defined outside this section.

// gemv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( gemv )

// ger
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( ger )

// hemv, symv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )

// her: alpha is typed ctype_r* here, unlike the other operations.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( her )

// syr
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( syr )

// her2, syr2
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )

// trmv, trsv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )

// end bli_l2_tapi.h
// begin bli_l2_ft.h

//
// -- Level-2 function types ---------------------------------------------------
//

// Function-pointer typedefs whose parameter lists mirror the typed API
// prototypes above; type names are pasted from ch, opname, EX_SUF and tsuf.
// NOTE(review): PASTECH3 and INSERT_GENTDEF/INSERT_GENTDEFR are defined
// outside this section.

// gemv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( gemv )

// ger
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( ger )

// hemv, symv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( her )

// syr
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( syr )

// her2, syr2
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )

// end bli_l2_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h
// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;

// end bli_tapi_ba.h
// begin bli_l2_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// With EX_SUF and BLIS_TAPI_EX_PARAMS both defined empty just above, the
// templates below declare the un-suffixed typed level-2 APIs with no trailing
// expert arguments.
// NOTE(review): PASTEMAC2 and INSERT_GENTPROT_BASIC0/INSERT_GENTPROTR_BASIC0
// are defined outside this section.

// gemv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( gemv )

// ger
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( ger )

// hemv, symv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )

// her: alpha is typed ctype_r* here, unlike the other operations.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( her )

// syr
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( syr )

// her2, syr2
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )

// trmv, trsv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )

// end bli_l2_tapi.h
// begin bli_l2_ft.h

//
// -- Level-2 function types ---------------------------------------------------
//

// Function-pointer typedefs whose parameter lists mirror the typed API
// prototypes above; type names are pasted from ch, opname, EX_SUF and tsuf.
// NOTE(review): PASTECH3 and INSERT_GENTDEF/INSERT_GENTDEFR are defined
// outside this section.

// gemv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( gemv )

// ger
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( ger )

// hemv, symv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( her )

// syr
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( syr )

// her2, syr2
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )

// end bli_l2_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
//

// cntl_use is an output parameter (pointer to pointer); bli_l3_cntl_free()
// takes the cntl_use value back together with the rntm and thread info.
void bli_l3_cntl_create_if
  (
    opid_t family,
    pack_t schema_a,
    pack_t schema_b,
    obj_t* a,
    obj_t* b,
    obj_t* c,
    rntm_t* rntm,
    cntl_t* cntl_orig,
    cntl_t** cntl_use
  );

void bli_l3_cntl_free
  (
    rntm_t* rntm,
    cntl_t* cntl_use,
    thrinfo_t* thread
  );

// end bli_l3_cntl.h
// begin bli_l3_check.h

//
// Prototype object-based check functions.
//

// Four parameter-list shapes follow, each applied to the operations named in
// the GENPROT() lines beneath it. The function name is opname pasted with
// _check via PASTEMAC (defined outside this section).

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
  ( \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* b, \
    obj_t* beta, \
    obj_t* c, \
    cntx_t* cntx \
  );

GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
  ( \
    side_t side, \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* b, \
    obj_t* beta, \
    obj_t* c, \
    cntx_t* cntx \
  );

GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
  ( \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* beta, \
    obj_t* c, \
    cntx_t* cntx \
  );

GENPROT( herk )
GENPROT( syrk )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
  ( \
    side_t side, \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* b, \
    cntx_t* cntx \
  );

GENPROT( trmm )
GENPROT( trsm )

// -----------------------------------------------------------------------------

// Explicitly prototyped (non-macro) basic check functions.

void bli_gemm_basic_check
  (
    obj_t* alpha,
    obj_t* a,
    obj_t* b,
    obj_t* beta,
    obj_t* c,
    cntx_t* cntx
  );

void bli_gemmt_basic_check
  (
    obj_t* alpha,
    obj_t* a,
    obj_t* b,
    obj_t* beta,
    obj_t* c,
    cntx_t* cntx
  );

void bli_hemm_basic_check
  (
    side_t side,
    obj_t* alpha,
    obj_t* a,
    obj_t* b,
    obj_t* beta,
    obj_t* c,
    cntx_t* cntx
  );

void bli_herk_basic_check
  (
    obj_t* alpha,
    obj_t* a,
    obj_t* ah,
    obj_t* beta,
    obj_t* c,
    cntx_t* cntx
  );

void bli_her2k_basic_check
  (
    obj_t* alpha,
    obj_t* a,
    obj_t* bh,
    obj_t* b,
    obj_t* ah,
    obj_t* beta,
    obj_t* c,
    cntx_t* cntx
  );

void bli_l3_basic_check
  (
    obj_t* alpha,
    obj_t* a,
    obj_t* b,
    obj_t* beta,
    obj_t* c,
    cntx_t* cntx
  );

// end bli_l3_check.h
// begin bli_l3_int.h

void bli_l3_int
  (
    obj_t* alpha,
    obj_t* a,
    obj_t* b,
    obj_t* beta,
    obj_t* c,
    cntx_t* cntx,
    rntm_t* rntm,
    cntl_t* cntl,
    thrinfo_t* thread
  );

// end bli_l3_int.h
// begin bli_l3_packab.h

// bli_l3_packa and bli_l3_packb share one parameter list.

void bli_l3_packa
  (
    obj_t* a,
    obj_t* b,
    obj_t* c,
    cntx_t* cntx,
    rntm_t* rntm,
    cntl_t* cntl,
    thrinfo_t* thread
  );

void bli_l3_packb
  (
    obj_t* a,
    obj_t* b,
    obj_t* c,
    cntx_t* cntx,
    rntm_t* rntm,
    cntl_t* cntl,
    thrinfo_t* thread
  );

// end bli_l3_packab.h

// Define function types.
//#include "bli_l3_ft_ex.h"
// begin bli_l3_ft_ukr.h


#ifndef BLIS_L3_FT_UKR_H
#define BLIS_L3_FT_UKR_H


//
// -- Level-3 micro-kernel function types --------------------------------------
//

// Each GENTDEF below defines a function-pointer typedef whose name is pasted
// from (ch, opname, _ukr, tsuf) by PASTECH3.
// NOTE(review): INSERT_GENTDEF is defined outside this section; presumably
// it instantiates GENTDEF once per datatype.

// gemm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
  ( \
    dim_t m, \
    dim_t n, \
    dim_t k, \
    ctype* restrict alpha, \
    ctype* restrict a, \
    ctype* restrict b, \
    ctype* restrict beta, \
    ctype* restrict c, inc_t rs_c, inc_t cs_c, \
    auxinfo_t* restrict data, \
    cntx_t* restrict cntx \
  );

INSERT_GENTDEF( gemm )


// gemmtrsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
  ( \
    dim_t m, \
    dim_t n, \
    dim_t k, \
    ctype* restrict alpha, \
    ctype* restrict a1x, \
    ctype* restrict a11, \
    ctype* restrict bx1, \
    ctype* restrict b11, \
    ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
    auxinfo_t* restrict data, \
    cntx_t* restrict cntx \
  );

INSERT_GENTDEF( gemmtrsm )


// trsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
  ( \
    ctype* restrict a, \
    ctype* restrict b, \
    ctype* restrict c, inc_t rs_c, inc_t cs_c, \
    auxinfo_t* restrict data, \
    cntx_t* restrict cntx \
  );

INSERT_GENTDEF( trsm )


#endif

// end bli_l3_ft_ukr.h
// begin bli_l3_oft.h


#ifndef BLIS_L3_OFT_H
#define BLIS_L3_OFT_H


//
// -- Level-3 object function types --------------------------------------------
//

// gemm, gemmt, her2k, syr2k

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
  ( \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* b, \
    obj_t* beta, \
    obj_t* c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );

GENTDEF( gemm )
GENTDEF( gemmt )
GENTDEF( her2k )
GENTDEF( syr2k )


// hemm, symm, trmm3

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
  ( \
    side_t side, \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* b, \
    obj_t* beta, \
    obj_t* c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );

GENTDEF( hemm )
GENTDEF( symm )
GENTDEF( trmm3 )


// herk, syrk

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
  ( \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* beta, \
    obj_t* c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );

GENTDEF( herk )
GENTDEF( syrk )


// trmm, trsm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
  ( \
    side_t side, \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* b, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );

GENTDEF( trmm )
GENTDEF( trsm )


#endif

// end bli_l3_oft.h
// begin bli_l3_oft_var.h


#ifndef BLIS_L3_OFT_VAR_H
#define BLIS_L3_OFT_VAR_H


//
// -- Level-3 variant function types -------------------------------------------
//

// Single instantiation: GENTDEF( l3 ) names the type by pasting l3 with
// _var_oft.
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
  ( \
    obj_t* a, \
    obj_t* b, \
    obj_t* c, \
    cntx_t* cntx, \
    rntm_t* rntm, \
    cntl_t* cntl, \
    thrinfo_t* thread \
  );

GENTDEF( l3 )


#endif

// end bli_l3_oft_var.h

// begin bli_l3_blocksize.h

dim_t bli_l3_determine_kc
  (
    dir_t direct,
    dim_t i,
    dim_t dim,
    obj_t* a,
    obj_t* b,
    bszid_t bszid,
    cntx_t* cntx,
    cntl_t* cntl
  );

// Per-operation determine_kc prototypes: same as above minus the cntl
// argument.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
  ( \
    dir_t direct, \
    dim_t i, \
    dim_t dim, \
    obj_t* a, \
    obj_t* b, \
    bszid_t bszid, \
    cntx_t* cntx \
  );

GENPROT( gemm_determine_kc )
GENPROT( gemmt_determine_kc )
GENPROT( trmm_determine_kc )
GENPROT( trsm_determine_kc )

// The _f / _b variants take no dir_t argument.
// NOTE(review): the suffixes presumably mean forward / backward -- confirm.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
  ( \
    dim_t i, \
    dim_t dim, \
    obj_t* a, \
    obj_t* b, \
    bszid_t bszid, \
    cntx_t* cntx \
  );

GENPROT( gemm_determine_kc_f )
GENPROT( gemm_determine_kc_b )

GENPROT( gemmt_determine_kc_f )
GENPROT( gemmt_determine_kc_b )

GENPROT( trmm_determine_kc_f )
GENPROT( trmm_determine_kc_b )

GENPROT( trsm_determine_kc_f )
GENPROT( trsm_determine_kc_b )

// end bli_l3_blocksize.h
// begin bli_l3_direct.h

dir_t bli_l3_direct
  (
    obj_t* a,
    obj_t* b,
    obj_t* c,
    cntl_t* cntl
  );

// -----------------------------------------------------------------------------

// Per-operation direct prototypes: same as bli_l3_direct minus cntl.
#undef GENPROT
#define GENPROT( opname ) \
\
dir_t PASTEMAC0(opname) \
  ( \
    obj_t* a, \
    obj_t* b, \
    obj_t* c \
  );

GENPROT( gemm_direct )
GENPROT( gemmt_direct )
GENPROT( trmm_direct )
GENPROT( trsm_direct )

// end bli_l3_direct.h
// begin bli_l3_prune.h

// Generic prune prototypes, one per dimension (m, n, k); these take a cntl.
#undef GENPROT
#define GENPROT( dim ) \
\
void PASTEMAC(l3_prune_unref_mparts_,dim) \
  ( \
    obj_t* a, \
    obj_t* b, \
    obj_t* c, \
    cntl_t* cntl \
  );

GENPROT( m )
GENPROT( n )
GENPROT( k )

// -----------------------------------------------------------------------------

// Per-operation prune prototypes, one per (operation, dimension) pair;
// these take no cntl.
#undef GENPROT
#define GENPROT( opname, dim ) \
\
void PASTEMAC2(opname,_prune_unref_mparts_,dim) \
  ( \
    obj_t* a, \
    obj_t* b, \
    obj_t* c \
  );

GENPROT( gemm, m )
GENPROT( gemm, n )
GENPROT( gemm, k )

GENPROT( gemmt, m )
GENPROT( gemmt, n )
GENPROT( gemmt, k )

GENPROT( trmm, m )
GENPROT( trmm, n )
GENPROT( trmm, k )

GENPROT( trsm, m )
GENPROT( trsm, n )
GENPROT( trsm, k )

// end bli_l3_prune.h
// begin bli_l3_schema.h

void bli_l3_set_schemas
  (
    obj_t* a,
    obj_t* b,
    obj_t* c,
    cntx_t* cntx
  );

// end bli_l3_schema.h

// Prototype object APIs (basic and expert).
// begin bli_l3_oapi.h

//
// Prototype object-based interfaces (basic).
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h

//
// Prototype BLAS-like interfaces with typed operands (basic).
//

// Each template below is instantiated for the operations named in the
// INSERT_* lines beneath it.
// NOTE(review): the INSERT_* macros are defined outside this section;
// presumably they expand the template once per datatype.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    trans_t transa, \
    trans_t transb, \
    dim_t m, \
    dim_t n, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c \
  );

INSERT_GENTPROT_BASIC0( gemm )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    side_t side, \
    uplo_t uploa, \
    conj_t conja, \
    trans_t transb, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c \
  );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )


// herk: alpha and beta use ctype_r rather than ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    dim_t m, \
    dim_t k, \
    ctype_r* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype_r* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c \
  );

INSERT_GENTPROTR_BASIC0( herk )


// her2k: only beta uses ctype_r.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    trans_t transb, \
    dim_t m, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype_r* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c \
  );

INSERT_GENTPROTR_BASIC0( her2k )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    dim_t m, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c \
  );

INSERT_GENTPROT_BASIC0( syrk )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    trans_t transb, \
    dim_t m, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c \
  );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    side_t side, \
    uplo_t uploa, \
    trans_t transa, \
    diag_t diaga, \
    trans_t transb, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c \
  );

INSERT_GENTPROT_BASIC0( trmm3 )


// trmm / trsm: no beta and no c operand in the parameter list.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    side_t side, \
    uplo_t uploa, \
    trans_t transa, \
    diag_t diaga, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b \
  );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi.h
// begin bli_l3_tapi_ex.h

//
// Prototype BLAS-like interfaces with typed operands (expert).
//

// Expert typed API: every template ends with trailing cntx and rntm
// arguments, and the function name is pasted from
// (ch, opname, BLIS_TAPI_EX_SUF) by PASTEMAC2.
// NOTE(review): the INSERT_* macros and BLIS_TAPI_EX_SUF are defined outside
// this section.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    trans_t transa, \
    trans_t transb, \
    dim_t m, \
    dim_t n, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );

INSERT_GENTPROT_BASIC0( gemm )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    side_t side, \
    uplo_t uploa, \
    conj_t conja, \
    trans_t transb, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )


// herk: alpha and beta use ctype_r rather than ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    dim_t m, \
    dim_t k, \
    ctype_r* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype_r* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );

INSERT_GENTPROTR_BASIC0( herk )


// her2k: only beta uses ctype_r.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    trans_t transb, \
    dim_t m, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype_r* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );

INSERT_GENTPROTR_BASIC0( her2k )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    dim_t m, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );

INSERT_GENTPROT_BASIC0( syrk )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    trans_t transb, \
    dim_t m, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    side_t side, \
    uplo_t uploa, \
    trans_t transa, \
    diag_t diaga, \
    trans_t transb, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );

INSERT_GENTPROT_BASIC0( trmm3 )


// trmm / trsm: no beta and no c operand in the parameter list.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    side_t side, \
    uplo_t uploa, \
    trans_t transa, \
    diag_t diaga, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi_ex.h

// Define function types for small/unpacked handlers/kernels.
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
#undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_gemm_md_c2r_ref.h // Define a local struct type that makes returning two values easier. typedef struct mddm_s { dom_t comp; dom_t exec; } mddm_t; void bli_gemm_md ( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_local, cntx_t** cntx ); mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); // ----------------------------------------------------------------------------- void bli_gemm_md_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); void bli_gemm_md_zgemm ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary 
if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crr is already unconditionally associated with an // execution domain of BLIS_REAL.) if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_REAL ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since ccr is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) if ( bli_obj_is_complex( c ) && bli_obj_is_complex( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crc is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) 
if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_complex( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemm_md_ker_var2_recast ( num_t* dt_comp, num_t dt_a, num_t dt_b, num_t* dt_c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, obj_t* c, inc_t* rs_c, inc_t* cs_c ) { if ( bli_is_real( *dt_c ) && bli_is_complex( dt_a ) && bli_is_complex( dt_b ) ) { // The rcc case is executed with a real macrokernel, so we need to // double the k dimension (because both A and B are packed to the 1r // schema), and also the panel strides of A and B since they were // packed as complex matrices and we now need to convert them to // units of real elements. *k *= 2; *ps_a *= 2; *ps_b *= 2; } else if ( bli_is_complex( *dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_row_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *n *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; } else #endif { // Generally speaking, the crc case is executed with a complex // macrokernel, so we need to halve the panel stride of A (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. 
*ps_a /= 2; } } else if ( bli_is_complex( *dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_col_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *m *= 2; *pd_a *= 2; *ps_a *= 2; *cs_c *= 2; } else #endif { // Generally speaking, the ccr case is executed with a complex // macrokernel, so we need to halve the panel stride of B (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. *ps_b /= 2; } } #if 0 else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. //printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k ); } else if ( bli_is_complex( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { // No action needed. 
} #endif } // end bli_gemm_md.h #endif // end bli_gemm.h // begin bli_hemm.h // begin bli_hemm_front.h void bli_hemm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_hemm_front.h // end bli_hemm.h // begin bli_symm.h // begin bli_symm_front.h void bli_symm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_symm_front.h // end bli_symm.h // begin bli_trmm.h // begin bli_trmm_front.h void bli_trmm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm_front.h // begin bli_trmm_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); //GENPROT( trmm_blk_var1 ) //GENPROT( trmm_blk_var2 ) //GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) GENPROT( trmm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
//

// Typed gemmt macrokernel template. Compared with the trmm/trsm templates it
// takes the diagonal offset of C (diagoffc) and an extra is_a / is_b stride
// for A and B.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffc, \
       pack_t schema_a, \
       pack_t schema_b, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       void* alpha, \
       void* a, inc_t cs_a, inc_t is_a, \
       dim_t pd_a, inc_t ps_a, \
       void* b, inc_t rs_b, inc_t is_b, \
       dim_t pd_b, inc_t ps_b, \
       void* beta, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

// end bli_gemmt_var.h
// end bli_gemmt.h
// end bli_l3.h

// -- Utility operations --

// begin bli_util.h
// begin bli_util_check.h

//
// Prototype object-based check functions.
//

// Each template below declares a "<opname>_check" function taking the same
// object operands as the operation it is instantiated for.

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* asum \
     );

GENPROT( asumv )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x \
     );

GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* norm \
     );

GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )

// Identical signature to the vector-norm template above; redefined here for
// the matrix norms.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* norm \
     );

GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x \
     );

GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* scale, \
       obj_t* sumsq \
     );

GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// NOTE(review): this one template is named GENTPROT rather than GENPROT,
// unlike its neighbours, so it replaces whatever GENTPROT was defined as
// before this point.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* chi, \
       obj_t* psi, \
       bool* is_eq \
     );

GENTPROT( eqsc )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// -----------------------------------------------------------------------------

// Non-templated check helpers (presumably shared by the per-operation check
// functions above -- confirm in bli_util_check.c).

void bli_utilv_xi_check
     (
       obj_t* x,
       obj_t* index
     );

void bli_utilv_xa_check
     (
       obj_t* x,
       obj_t* asum
     );

void bli_utilm_mkhst_check
     (
       obj_t* a
     );

void bli_utilv_norm_check
     (
       obj_t* x,
       obj_t* norm
     );

void bli_utilm_norm_check
     (
       obj_t* x,
       obj_t* norm
     );

void bli_utilm_fprint_check
     (
       FILE* file,
       char* s1,
       obj_t* x,
       char* format,
       char* s2
     );

void bli_utilm_rand_check
     (
       obj_t* x
     );

void bli_utilv_sumsqv_check
     (
       obj_t* x,
       obj_t* scale,
       obj_t* sumsq
     );

// end bli_util_check.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_util_oapi.h

//
// Prototype object-based interfaces.
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes in this guarded region use PASTEMAC0 (no EX_SUF) and take no
// BLIS_OAPI_EX_PARAMS, so they are emitted only while BLIS_OAPI_BASIC is
// defined.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// Same as the fprint* template above minus the leading FILE* argument.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes in this guarded region use PASTEMAC0 (no EX_SUF) and take no
// BLIS_OAPI_EX_PARAMS, so they are emitted only while BLIS_OAPI_BASIC is
// defined.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// Same as the fprint* template above minus the leading FILE* argument.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_util_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Every template in this section appends EX_SUF to the operation name and
// BLIS_TAPI_EX_PARAMS to the parameter list. The GENTPROTR templates take a
// second type parameter, ctype_r, used for the output arguments.

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( asumv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( mkherm )
INSERT_GENTPROT_BASIC0( mksymm )
INSERT_GENTPROT_BASIC0( mktrim )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1v )
INSERT_GENTPROTR_BASIC0( normfv )
INSERT_GENTPROTR_BASIC0( normiv )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1m )
INSERT_GENTPROTR_BASIC0( normfm )
INSERT_GENTPROTR_BASIC0( normim )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randv )
INSERT_GENTPROT_BASIC0( randnv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randm )
INSERT_GENTPROT_BASIC0( randnm )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.

// These use PASTEMAC (no EX_SUF) and take no BLIS_TAPI_EX_PARAMS, so they are
// emitted only while BLIS_TAPI_BASIC is defined.
#ifdef BLIS_TAPI_BASIC

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )

// The print templates take x as void* (ctype is unused in the signature) and
// are instantiated through INSERT_GENTPROT_BASIC0_I rather than
// INSERT_GENTPROT_BASIC0.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t n, \
       void* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t m, \
       dim_t n, \
       void* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//

// Each template below typedefs a function-pointer type whose name is pasted
// together from ch, opname, EX_SUF and tsuf, with the same parameter list as
// the corresponding typed prototype.

// asumv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv

// NOTE(review): the fprintv/fprintm types include EX_SUF in their name but,
// unlike their neighbours, do not append BLIS_TAPI_EX_PARAMS and are not
// guarded by BLIS_TAPI_BASIC.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 
INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.
// The prototypes in this guarded region use PASTEMAC(ch,opname) -- i.e. no
// EX_SUF in the name -- and do not append BLIS_TAPI_EX_PARAMS.

#ifdef BLIS_TAPI_BASIC

// eqsc
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )


// eqv
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )


// eqm
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )


// printv
// NOTE: the operand is declared `void* x` here; the template's `ctype`
// parameter is not used in the prototype body.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  n, \
       void*  x, inc_t incx, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )


// printm
// NOTE: as with printv, the operand is declared `void* x`.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       void*  x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h


//
// -- Utility function types ---------------------------------------------------
//
// Each template below produces a function-pointer typedef whose name is pasted
// together from the type character, the operation name, EX_SUF and `tsuf`.
// The parameter lists mirror the corresponding typed prototypes.

// asumv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// NOTE(review): this typedef name includes EX_SUF but the parameter list does
// not append BLIS_TAPI_EX_PARAMS, unlike its neighbours -- confirm intended.

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm
// NOTE(review): same remark as fprintv regarding EX_SUF without
// BLIS_TAPI_EX_PARAMS.

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.
// These typedef names are pasted with PASTECH2(ch,opname,tsuf), i.e. without
// EX_SUF, and take no expert parameters.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_util_fpa.h


//
// Prototype function pointer query interface.
//

// Each GENPROT( opname ) declares one query function taking a `num_t dt`; its
// return type and its name are both produced by token pasting on `opname`.
// NOTE(review): this first template pastes BLIS_TAPI_EX_SUF (not EX_SUF, which
// was #undef'd just above); BLIS_TAPI_EX_SUF is defined outside this section --
// confirm its value where it is defined.

#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( asumv )
GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )
GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )
GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )
GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )
GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// Second template: same shape, but no suffix is pasted into either the
// return-type name or the function name.

#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )
GENPROT( fprintv )
GENPROT( fprintm )
//GENPROT( printv )
//GENPROT( printm )

// end bli_util_fpa.h

// Prototype level-1m implementations.
// begin bli_util_unb_var1.h


//
// Prototype BLAS-like interfaces with typed operands.
//
// Unlike the typed API prototypes, these variants spell out the trailing
// `cntx_t* cntx, rntm_t* rntm` parameters explicitly and are not marked with
// an export macro (except fprintv/fprintm at the end of this section).

// asumv_unb_var1
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum, \
       cntx_t*  cntx, \
       rntm_t*  rntm \
     );

INSERT_GENTPROTR_BASIC0( asumv_unb_var1 )


// mkherm_unb_var1, mksymm_unb_var1, mktrim_unb_var1
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( mkherm_unb_var1 )
INSERT_GENTPROT_BASIC0( mksymm_unb_var1 )
INSERT_GENTPROT_BASIC0( mktrim_unb_var1 )


// norm1v_unb_var1, normfv_unb_var1, normiv_unb_var1
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfv_unb_var1 )
INSERT_GENTPROTR_BASIC0( normiv_unb_var1 )


// norm1m_unb_var1, normfm_unb_var1, normim_unb_var1
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfm_unb_var1 )
INSERT_GENTPROTR_BASIC0( normim_unb_var1 )


// randv_unb_var1, randnv_unb_var1
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randv_unb_var1 )
INSERT_GENTPROT_BASIC0( randnv_unb_var1 )


// randm_unb_var1, randnm_unb_var1
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randm_unb_var1 )
INSERT_GENTPROT_BASIC0( randnm_unb_var1 )


// sumsqv_unb_var1
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq, \
       cntx_t*  cntx, \
       rntm_t*  rntm \
     );

INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 )

// -----------------------------------------------------------------------------

// eqv_unb_var1
// Returns `bool` directly (the typed-API eqv reports through a `bool* is_eq`
// out-parameter instead) and takes no cntx/rntm arguments.
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy \
     );

INSERT_GENTPROT_BASIC0( eqv_unb_var1 )


// eqm_unb_var1
// Returns `bool` directly and takes no cntx/rntm arguments.
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y \
     );

INSERT_GENTPROT_BASIC0( eqm_unb_var1 )


// fprintv
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintv )


// fprintm
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintm )

// end bli_util_unb_var1.h
// end bli_util.h


// -- addon definitions --

// NOTE: These definitions should not be included much earlier since an addon
// may wish to utilize other types and definitions provided by BLIS.
// begin bli_addon.h


#ifndef BLIS_ADDON_H
#define BLIS_ADDON_H

// The literal `#if 0` below selects the #else branch unconditionally, so this
// copy of the header always defines BLIS_DISABLE_ADDONS.
#if 0
#define BLIS_ENABLE_ADDONS
#else
#define BLIS_DISABLE_ADDONS
#endif

// Enabled addons
// (none are listed in this copy of the header)


#endif
// end bli_addon.h


// -- sandbox implementation --

// begin bli_sbox.h


#ifndef BLIS_SBOX_H
#define BLIS_SBOX_H

// Each sandbox must have a bli_sandbox.h file present somewhere inside.
// If a sandbox was enabled at configure-time, we need to #include its
// header file here so that it will get pulled into blis.h when it is
// flattened into a monolithic header.
#ifdef BLIS_ENABLE_SANDBOX
#include "bli_sandbox.h" // skipped
#endif

#endif
// end bli_sbox.h


// -- BLAS compatibility layer --

// begin bli_blas.h


// If the CBLAS compatibility layer was enabled while the BLAS layer
// was not enabled, we must enable it here.
#ifdef BLIS_ENABLE_CBLAS
#ifndef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS
#endif
#endif // BLIS_ENABLE_CBLAS

// By default, if the BLAS compatibility layer is enabled, we define
// (include) all of the BLAS prototypes. However, if the user is
// #including "blis.h" and also #including another header that also
// declares the BLAS functions, then we provide an opportunity to
// #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below).
#ifdef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS_DEFS
#else
#undef  BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the BLAS test drivers are being
// compiled.
#ifdef BLIS_VIA_BLASTEST
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the environment has defined the
// macro BLIS_DISABLE_BLAS_DEFS.
#ifdef BLIS_DISABLE_BLAS_DEFS
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Begin including all BLAS prototypes.
#ifdef BLIS_ENABLE_BLAS_DEFS


// -- System headers needed by BLAS compatibility layer --

// The directive had lost its header name ("#include // skipped"), which is not
// a valid #include. The BLAS compatibility layer upper-cases routine names
// before reporting them, so the character-classification header is restored.
#include <ctype.h> // skipped

// -- Constants --

// Size of the buffer used to hold a BLAS routine name; written as (7+1),
// presumably seven characters plus the terminating NUL.
#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)

// -- Utility macros --

// Every prototype in the bla_*.h sections below is additionally wrapped in
// its own `#ifdef BLIS_ENABLE_BLAS` guard.

// begin bla_r_sign.h


#ifdef BLIS_ENABLE_BLAS

double bla_r_sign(const bla_real *a, const bla_real *b);

#endif
// end bla_r_sign.h
// begin bla_d_sign.h


#ifdef BLIS_ENABLE_BLAS

double bla_d_sign(const bla_double *a, const bla_double *b);

#endif
// end bla_d_sign.h

// begin bla_r_cnjg.h


#ifdef BLIS_ENABLE_BLAS

void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src);

#endif
// end bla_r_cnjg.h
// begin bla_d_cnjg.h


#ifdef BLIS_ENABLE_BLAS

void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src);

#endif
// end bla_d_cnjg.h

// begin bla_r_imag.h


#ifdef BLIS_ENABLE_BLAS

bla_real bla_r_imag(const bla_scomplex *z);

#endif
// end bla_r_imag.h
// begin bla_d_imag.h


#ifdef BLIS_ENABLE_BLAS

double bla_d_imag(const bla_dcomplex *z);

#endif
// end bla_d_imag.h

// begin bla_c_div.h


#ifdef BLIS_ENABLE_BLAS

void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp);

#endif
// end bla_c_div.h
// begin bla_z_div.h


#ifdef BLIS_ENABLE_BLAS

void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp);

#endif
// end bla_z_div.h

// begin bla_f__cabs.h


#ifdef BLIS_ENABLE_BLAS

double bla_f__cabs(double real, double imag);

#endif
// end bla_f__cabs.h
// begin bla_r_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_r_abs(const bla_real *x);

#endif
// end bla_r_abs.h
// begin bla_d_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_d_abs(const bla_double *x);

#endif
// end bla_d_abs.h
// begin bla_c_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_c_abs(const bla_scomplex *z);

#endif
// end bla_c_abs.h
// begin bla_z_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_z_abs(const bla_dcomplex *z);

#endif
// end bla_z_abs.h

// begin bla_lsame.h


#ifdef BLIS_ENABLE_BLAS

// Two mutually exclusive prototypes: with LAPACK_ILP64 the return type and the
// two length arguments are `long`, otherwise `int`.
// NOTE(review): only the non-ILP64 prototype carries BLIS_EXPORT_BLAS --
// confirm whether the ILP64 variant is meant to be exported as well.
#ifdef LAPACK_ILP64
long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
#else
BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
#endif

#endif
// end bla_lsame.h
// begin bla_xerbla.h


#ifdef BLIS_ENABLE_BLAS

// xerbla is declared with BLIS_OVERRIDABLE in addition to the export macro.
BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);

#endif
// end bla_xerbla.h
// begin bla_xerbla_array.h


#ifdef BLIS_ENABLE_BLAS

// Note the argument order differs from xerbla: the name length comes second
// and is passed by value.
BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);

#endif
// end bla_xerbla_array.h


// -- Level-0 BLAS prototypes --

// begin bla_cabs1.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS bla_real   PASTEF77(s,cabs1)(bla_scomplex *z);
BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);

#endif
// end bla_cabs1.h


// -- Level-1 BLAS prototypes --

// begin bla_amax.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// The template is (re)defined unconditionally; it is only instantiated when
// BLIS_ENABLE_BLAS is defined. The function returns an f77_int and its name is
// pasted from the literal `i`, the type character and the routine name.
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end bla_amax.h
// begin bla_asum.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// The return type is `ftype_r` while the operand type is `ftype_x`; the name
// is pasted in the order (chr, chx, blasname).
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end bla_asum.h
// begin bla_axpy.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// All arguments are passed by pointer; only `y` is non-const.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
             ftype*   y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif

// end bla_axpy.h
// begin bla_copy.h


//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( copy ) #endif // end bla_copy.h // begin bla_dot.h #ifdef BLIS_ENABLE_BLAS // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTR_BLAS( dot ) #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL INSERT_GENTPROTDOTC_BLAS( dot ) #else // For the "intel" complex return type, we use a hidden parameter (passed by // address) to return the result. #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \ ( \ ftype* rhop, \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTC_BLAS( dot ) #endif // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS float PASTEF77(sd,sdot) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); BLIS_EXPORT_BLAS double PASTEF77(d,sdot) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); #endif // end bla_dot.h // begin bla_nrm2.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end bla_nrm2.h // begin bla_rot.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s); #endif // end bla_rot.h // begin bla_rotg.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s); BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s); #endif // end bla_rotg.h // begin bla_rotm.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam); BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam); #endif // end bla_rotm.h // begin bla_rotmg.h 
#ifdef BLIS_ENABLE_BLAS

// rotmg: real-typed variants only. Every argument except `sy1`/`dy1` is a
// non-const pointer, including the parameter block.
BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam);
BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);

#endif
// end bla_rotmg.h
// begin bla_scal.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// scal: the scalar and the vector have separate type parameters
// (`ftype_a`/`ftype_x`). Note that the name is pasted as (chx, cha, blasname),
// i.e. vector character first, which is the reverse of the template's
// parameter order.
#undef  GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
     ( \
       const f77_int* n, \
       const ftype_a* alpha, \
       ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif

// end bla_scal.h
// begin bla_swap.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// swap: both vectors are non-const.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       ftype*   x, const f77_int* incx, \
       ftype*   y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif

// end bla_swap.h

// begin f77_amax_sub.h


//
// Prototype CBLAS subroutine wrapper interfaces.
//
// The *sub wrappers return `void` and deliver their result through the
// trailing `rval` pointer. They are instantiated under BLIS_ENABLE_CBLAS
// (not BLIS_ENABLE_BLAS), and `sub` is pasted as the last name component.
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       f77_int* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end f77_amax_sub.h
// begin f77_asum_sub.h


//
// Prototype CBLAS subroutine wrapper interfaces.
//
// asum wrapper: result is written to `rval`, typed `ftype_r`.
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       ftype_r* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end f77_asum_sub.h
// begin f77_dot_sub.h


//
// Prototype CBLAS subroutine wrapper interfaces.
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTDOT_BLAS( dot ) // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval ); BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* rval ); #endif // end f77_dot_sub.h // begin f77_nrm2_sub.h // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end f77_nrm2_sub.h // -- Level-2 BLAS prototypes -- // dense // begin bla_gemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemv ) #endif // end bla_gemv.h // begin bla_ger.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, chxy, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \ ( \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTDOT_BLAS( ger ) #endif // end bla_ger.h // begin bla_hemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemv ) #endif // end bla_hemv.h // begin bla_her.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype_r* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her ) #endif // end bla_her.h // begin bla_her2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2 ) #endif // end bla_her2.h // begin bla_symv.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( symv ) #endif // end bla_symv.h // begin bla_syr.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr ) #endif // end bla_syr.h // begin bla_syr2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr2 ) #endif // end bla_syr2.h // begin bla_trmv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmv ) #endif // end bla_trmv.h // begin bla_trsv.h // // Prototype BLAS-to-BLIS interfaces. 
//

// trsv prototype template; the argument list is identical to trmv's.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int* m, \
       const ftype* a, const f77_int* lda, \
       ftype* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif
// end bla_trsv.h

// begin bla_gemv_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument checker for gemv. Expands to a block that sets info to the code
// of the first failing test (trans, m, n, lda, incx, incy, in that order).
// On failure it builds the upper-cased routine name from dt_str/op_str,
// calls xerbla with it, and returns from the enclosing function.
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
    f77_int info  = 0; \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !nota && !ta && !conja )    { info = 1;  } \
    else if ( *m < 0 )                    { info = 2;  } \
    else if ( *n < 0 )                    { info = 3;  } \
    else if ( *lda < bli_max( 1, *m ) )   { info = 6;  } \
    else if ( *incx == 0 )                { info = 8;  } \
    else if ( *incy == 0 )                { info = 11; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemv_check.h

// begin bla_ger_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument checker for ger. Tests m, n, incx, incy and lda in that order.
// The reported routine name is dt_str, op_str and conj_str concatenated
// (conj_str padded to two characters) and upper-cased.
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
    f77_int info = 0; \
\
    if      ( *m < 0 )                    { info = 1; } \
    else if ( *n < 0 )                    { info = 2; } \
    else if ( *incx == 0 )                { info = 5; } \
    else if ( *incy == 0 )                { info = 7; } \
    else if ( *lda < bli_max( 1, *m ) )   { info = 9; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_ger_check.h

// begin bla_hemv_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument checker for hemv (also used for symv via bla_symv_check).
// Tests uplo ("L" or "U"), m, lda, incx and incy in that order.
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper )          { info = 1;  } \
    else if ( *m < 0 )                    { info = 2;  } \
    else if ( *lda < bli_max( 1, *m ) )   { info = 5;  } \
    else if ( *incx == 0 )                { info = 7;  } \
    else if ( *incy == 0 )                { info = 10; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_hemv_check.h

// begin bla_her_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument checker for her (also used for syr via bla_syr_check).
// Tests uplo, m, incx and lda in that order.
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper )          { info = 1; } \
    else if ( *m < 0 )                    { info = 2; } \
    else if ( *incx == 0 )                { info = 5; } \
    else if ( *lda < bli_max( 1, *m ) )   { info = 7; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her_check.h

// begin bla_her2_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument checker for her2 (also used for syr2 via bla_syr2_check).
// Tests uplo, m, incx, incy and lda in that order.
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper )          { info = 1; } \
    else if ( *m < 0 )                    { info = 2; } \
    else if ( *incx == 0 )                { info = 5; } \
    else if ( *incy == 0 )                { info = 7; } \
    else if ( *lda < bli_max( 1, *m ) )   { info = 9; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her2_check.h

// begin bla_symv_check.h

// symv reuses the hemv checker unchanged.
#ifdef BLIS_ENABLE_BLAS
#define bla_symv_check bla_hemv_check
#endif
// end bla_symv_check.h

// begin bla_syr_check.h

// syr reuses the her checker unchanged.
#ifdef BLIS_ENABLE_BLAS
#define bla_syr_check bla_her_check
#endif
// end bla_syr_check.h

// begin bla_syr2_check.h

// syr2 reuses the her2 checker unchanged.
#ifdef BLIS_ENABLE_BLAS
#define bla_syr2_check bla_her2_check
#endif
// end bla_syr2_check.h

// begin bla_trmv_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument checker for trmv (also used for trsv via bla_trsv_check).
// Tests uplo ("L"/"U"), trans ("N"/"T"/"C"), diag ("U"/"N"), m, lda and
// incx in that order; the info codes 1,2,3,4,6,8 coincide with the
// positions of those arguments in the trmv prototype.
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper )          { info = 1; } \
    else if ( !nota && !ta && !conja )    { info = 2; } \
    else if ( !unita && !nonua )          { info = 3; } \
    else if ( *m < 0 )                    { info = 4; } \
    else if ( *lda < bli_max( 1, *m ) )   { info = 6; } \
    else if ( *incx == 0 )                { info = 8; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_trmv_check.h

// begin bla_trsv_check.h

// trsv reuses the trmv checker unchanged.
#ifdef BLIS_ENABLE_BLAS
#define bla_trsv_check bla_trmv_check
#endif
// end bla_trsv_check.h

// packed
// begin bla_hpmv.h

// Packed-storage prototypes. Unlike the macro-generated prototypes above,
// these are written out per datatype and return int.
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(
    const bla_character *uplo, const bla_integer *n,
    const bla_scomplex *alpha, const bla_scomplex *ap,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *beta,
    bla_scomplex *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(
    const bla_character *uplo, const bla_integer *n,
    const bla_dcomplex *alpha, const bla_dcomplex *ap,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *beta,
    bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hpmv.h
// begin bla_hpr.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha,
    const bla_scomplex *x, const bla_integer *incx,
    bla_scomplex *ap);

BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha,
    const bla_dcomplex *x, const bla_integer *incx,
    bla_dcomplex *ap);

#endif
// end bla_hpr.h
// begin bla_hpr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(
    const bla_character *uplo, const bla_integer *n,
    const bla_scomplex *alpha,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *y, const bla_integer *incy,
    bla_scomplex *ap);

BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(
    const bla_character *uplo, const bla_integer *n,
    const bla_dcomplex *alpha,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *y, const bla_integer *incy,
    bla_dcomplex *ap);

#endif
// end bla_hpr2.h
// begin bla_spmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha, const bla_double *ap,
    const bla_double *x, const bla_integer *incx,
    const bla_double *beta,
    bla_double *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha, const bla_real *ap,
    const bla_real *x, const bla_integer *incx,
    const bla_real *beta,
    bla_real *y, const bla_integer *incy);

#endif
// end bla_spmv.h
// begin bla_spr.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr)(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha,
    const bla_double *x, const bla_integer *incx,
    bla_double *ap);

BLIS_EXPORT_BLAS int PASTEF77(s,spr)(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha,
    const bla_real *x, const bla_integer *incx,
    bla_real *ap);

#endif
// end bla_spr.h
// begin bla_spr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha,
    const bla_double *x, const bla_integer *incx,
    const bla_double *y, const bla_integer *incy,
    bla_double *ap);

BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha,
    const bla_real *x, const bla_integer *incx,
    const bla_real *y, const bla_integer *incy,
    bla_real *ap);

#endif
// end bla_spr2.h
// begin bla_tpmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_scomplex *ap,
    bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_double *ap,
    bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_real *ap,
    bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_dcomplex *ap,
    bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpmv.h
// begin bla_tpsv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_scomplex *ap,
    bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_double *ap,
    bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_real *ap,
    bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_dcomplex *ap,
    bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpsv.h

// banded
// begin bla_gbmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_scomplex *alpha,
    const bla_scomplex *a, const bla_integer *lda,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *beta,
    bla_scomplex *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_double *alpha,
    const bla_double *a, const bla_integer *lda,
    const bla_double *x, const bla_integer *incx,
    const bla_double *beta,
    bla_double *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_real *alpha,
    const bla_real *a, const bla_integer *lda,
    const bla_real *x, const bla_integer *incx,
    const bla_real *beta,
    bla_real *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_dcomplex *alpha,
    const bla_dcomplex *a, const bla_integer *lda,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *beta,
    bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_gbmv.h
// begin bla_hbmv.h
// Banded Hermitian matrix-vector product prototypes (complex types only).
// All prototypes in this group are declared only when BLIS_ENABLE_BLAS is
// defined, return int, and take every argument by pointer.
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_scomplex *alpha,
    const bla_scomplex *a, const bla_integer *lda,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *beta,
    bla_scomplex *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_dcomplex *alpha,
    const bla_dcomplex *a, const bla_integer *lda,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *beta,
    bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hbmv.h
// begin bla_sbmv.h

// sbmv prototypes (real types only); same argument layout as hbmv.
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_double *alpha,
    const bla_double *a, const bla_integer *lda,
    const bla_double *x, const bla_integer *incx,
    const bla_double *beta,
    bla_double *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_real *alpha,
    const bla_real *a, const bla_integer *lda,
    const bla_real *x, const bla_integer *incx,
    const bla_real *beta,
    bla_real *y, const bla_integer *incy);

#endif
// end bla_sbmv.h
// begin bla_tbmv.h

// tbmv prototypes for all four datatypes; x is the only non-const operand.
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_scomplex *a, const bla_integer *lda,
    bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_double *a, const bla_integer *lda,
    bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_real *a, const bla_integer *lda,
    bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_dcomplex *a, const bla_integer *lda,
    bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbmv.h
// begin bla_tbsv.h

// tbsv prototypes; the argument lists are identical to tbmv's.
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_scomplex *a, const bla_integer *lda,
    bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_double *a, const bla_integer *lda,
    bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_real *a, const bla_integer *lda,
    bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_dcomplex *a, const bla_integer *lda,
    bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbsv.h

// -- Level-3 BLAS prototypes --

// begin bla_gemm.h

//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemm ) #endif // end bla_gemm.h // begin bla_hemm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemm ) #endif // end bla_hemm.h // begin bla_herk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype_r* alpha, \ const ftype* a, const f77_int* lda, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( herk ) #endif // end bla_herk.h // begin bla_her2k.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2k ) #endif // end bla_her2k.h // begin bla_symm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( symm ) #endif // end bla_symm.h // begin bla_syrk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syrk ) #endif // end bla_syrk.h // begin bla_syr2k.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syr2k ) #endif // end bla_syr2k.h // begin bla_trmm.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ ftype* b, const f77_int* ldb \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmm ) #endif // end bla_trmm.h // begin bla_trsm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ ftype* b, const f77_int* ldb \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trsm ) #endif // end bla_trsm.h // begin bla_gemm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, notb; \ f77_int conja, conjb; \ f77_int ta, tb; \ f77_int nrowa, nrowb; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ if ( notb ) { nrowb = *k; } \ else { nrowb = *n; } \ \ if ( !nota && !conja && !ta ) \ info = 1; \ else if ( !notb && !conjb && !tb ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *k < 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 8; \ else if ( *ldb < bli_max( 1, nrowb ) ) \ info = 10; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 13; \ 
\ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_gemm_check.h // begin bla_hemm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int left, right; \ f77_int lower, upper; \ f77_int nrowa; \ \ left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \ right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( left ) { nrowa = *m; } \ else { nrowa = *n; } \ \ if ( !left && !right ) \ info = 1; \ else if ( !lower && !upper ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldb < bli_max( 1, *m ) ) \ info = 9; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 12; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_hemm_check.h // begin bla_herk_check.h #ifdef BLIS_ENABLE_BLAS #define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \ { \ f77_int info = 0; \ f77_int nota, conja; \ f77_int lower, upper; \ f77_int nrowa; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !conja ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ 
else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 10; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_herk_check.h // begin bla_her2k_check.h #ifdef BLIS_ENABLE_BLAS #define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, conja; \ f77_int lower, upper; \ f77_int nrowa; \ \ nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !conja ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldb < bli_max( 1, nrowa ) ) \ info = 9; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 12; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_her2k_check.h // begin bla_symm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_symm_check bla_hemm_check #endif // end bla_symm_check.h // begin bla_syrk_check.h #ifdef BLIS_ENABLE_BLAS #define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \ { \ f77_int info = 0; \ f77_int is_r; \ f77_int nota, ta, cta; \ f77_int lower, upper; \ f77_int nrowa; \ \ static char* dt_cst = dt_str; \ \ is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); 
\ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ cta = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploc, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploc, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !ta && (is_r ? !cta : 1) ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 10; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_syrk_check.h // begin bla_syr2k_check.h #ifdef BLIS_ENABLE_BLAS #define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int is_r; \ f77_int nota, ta, cta; \ f77_int lower, upper; \ f77_int nrowa; \ \ static char* dt_cst = dt_str; \ \ is_r = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \ nota = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \ cta = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ \ if ( !lower && !upper ) \ info = 1; \ else if ( !nota && !ta && (is_r ? 
!cta : 1) ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *k < 0 ) \ info = 4; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 7; \ else if ( *ldb < bli_max( 1, nrowa ) ) \ info = 9; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 12; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_syr2k_check.h // begin bla_trmm_check.h #ifdef BLIS_ENABLE_BLAS #define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \ { \ f77_int info = 0; \ f77_int left, right; \ f77_int lower, upper; \ f77_int nota, ta, conja; \ f77_int unita, nonua; \ f77_int nrowa; \ \ left = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \ right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \ lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \ upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ unita = PASTEF770(lsame)( diaga, "U", (ftnlen)1, (ftnlen)1 ); \ nonua = PASTEF770(lsame)( diaga, "N", (ftnlen)1, (ftnlen)1 ); \ \ if ( left ) { nrowa = *m; } \ else { nrowa = *n; } \ \ if ( !left && !right ) \ info = 1; \ else if ( !lower && !upper ) \ info = 2; \ else if ( !nota && !ta && !conja ) \ info = 3; \ else if ( !unita && !nonua ) \ info = 4; \ else if ( *m < 0 ) \ info = 5; \ else if ( *n < 0 ) \ info = 6; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 9; \ else if ( *ldb < bli_max( 1, *m ) ) \ info = 11; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } 
#endif
// end bla_trmm_check.h

// begin bla_trsm_check.h

// trsm reuses the trmm checker unchanged.
#ifdef BLIS_ENABLE_BLAS
#define bla_trsm_check bla_trmm_check
#endif
// end bla_trsm_check.h

// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// axpby prototype template: n, alpha, x/incx, beta and the writable
// y/incy, all passed by pointer.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype* alpha, \
       const ftype* x, const f77_int* incx, \
       const ftype* beta, \
       ftype* y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif
// end bla_axpby.h

// level-3
// begin bla_gemmt.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// gemmt prototype template: like gemm, but with a leading uploc argument
// and no n dimension.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int* m, \
       const f77_int* k, \
       const ftype* alpha, \
       const ftype* a, const f77_int* lda, \
       const ftype* b, const f77_int* ldb, \
       const ftype* beta, \
       ftype* c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif
// end bla_gemmt.h
// begin bla_gemmt_check.h

#ifdef BLIS_ENABLE_BLAS

// Argument checker for gemmt. a has m rows when transa is "N" and k rows
// otherwise; b has k rows when transb is "N" and m rows otherwise. Tests
// uplo, transa, transb, m, k, lda, ldb and ldc in that order (codes
// 1,2,3,4,5,8,10,13, matching the gemmt prototype positions). On failure
// it reports the upper-cased routine name through xerbla and returns from
// the enclosing function.
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
    f77_int info  = 0; \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( nota ? *m : *k ); \
    f77_int nrowb = ( notb ? *k : *m ); \
\
    if      ( !lower && !upper )              { info = 1;  } \
    else if ( !nota && !conja && !ta )        { info = 2;  } \
    else if ( !notb && !conjb && !tb )        { info = 3;  } \
    else if ( *m < 0 )                        { info = 4;  } \
    else if ( *k < 0 )                        { info = 5;  } \
    else if ( *lda < bli_max( 1, nrowa ) )    { info = 8;  } \
    else if ( *ldb < bli_max( 1, nrowb ) )    { info = 10; } \
    else if ( *ldc < bli_max( 1, *m    ) )    { info = 13; } \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// gemm_batch prototype template. The operands a_array, b_array and
// c_array are pointer-to-pointer; the remaining arguments carry an _array
// suffix except group_count and group_size.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa_array, \
       const f77_char* transb_array, \
       const f77_int* m_array, \
       const f77_int* n_array, \
       const f77_int* k_array, \
       const ftype* alpha_array, \
       const ftype** a_array, const f77_int* lda_array, \
       const ftype** b_array, const f77_int* ldb_array, \
       const ftype* beta_array, \
       ftype** c_array, const f77_int* ldc_array, \
       const f77_int* group_count, \
       const f77_int* group_size \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif
// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h

//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( gemm3m ) #endif // end bla_gemm3m.h // begin bla_gemm3m_check.h #ifdef BLIS_ENABLE_BLAS #define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, notb; \ f77_int conja, conjb; \ f77_int ta, tb; \ f77_int nrowa, nrowb; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ if ( notb ) { nrowb = *k; } \ else { nrowb = *n; } \ \ if ( !nota && !conja && !ta ) \ info = 1; \ else if ( !notb && !conjb && !tb ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *k < 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 8; \ else if ( *ldb < bli_max( 1, nrowb ) ) \ info = 10; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 13; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_gemm3m_check.h // -- Fortran-compatible APIs to BLIS functions -- // begin b77_thread.h // // Prototype Fortran-compatible BLIS interfaces. 
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); // end b77_thread.h #endif // BLIS_ENABLE_BLAS // end bli_blas.h // -- CBLAS compatibility layer -- // begin bli_cblas.h #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. // begin cblas.h #ifndef CBLAS_H #define CBLAS_H #include // skipped // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. 
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. #if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). 
#if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_X86_64_NO_ZEN2 // Enabled sub-configurations (config_list) #define BLIS_CONFIG_SKX #define BLIS_CONFIG_HASWELL #define BLIS_CONFIG_SANDYBRIDGE #define BLIS_CONFIG_PENRYN #define BLIS_CONFIG_ZEN #define BLIS_CONFIG_EXCAVATOR #define BLIS_CONFIG_STEAMROLLER #define BLIS_CONFIG_PILEDRIVER #define BLIS_CONFIG_BULLDOZER #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_SKX #define BLIS_KERNELS_SANDYBRIDGE #define BLIS_KERNELS_PENRYN #define BLIS_KERNELS_HASWELL #define BLIS_KERNELS_ZEN #define BLIS_KERNELS_PILEDRIVER #define BLIS_KERNELS_BULLDOZER #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define 
BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. 
// This conditional is intentionally empty in both branches: it defines
// nothing and only records that BLIS_ENABLE_C99_COMPLEX is a recognized
// option.
#ifdef BLIS_ENABLE_C99_COMPLEX
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif


// -- MULTITHREADING -----------------------------------------------------------

// Enable multithreading via POSIX threads.
// (Empty conditional; defines nothing.)
#ifdef BLIS_ENABLE_PTHREADS
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif

// Enable multithreading via OpenMP.
// (Empty conditional; defines nothing.)
#ifdef BLIS_ENABLE_OPENMP
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif

// Perform a sanity check to make sure the user doesn't try to enable
// both OpenMP and pthreads.
#if defined ( BLIS_ENABLE_OPENMP ) && \
    defined ( BLIS_ENABLE_PTHREADS )
  #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined."
#endif

// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP
// or pthreads are enabled. This macro is useful in situations when
// we want to detect use of either OpenMP or pthreads (as opposed
// to neither being used).
#if defined ( BLIS_ENABLE_OPENMP ) || \
    defined ( BLIS_ENABLE_PTHREADS )
  #define BLIS_ENABLE_MULTITHREADING
#endif

// Enable the use of prime numbers of threads when requesting automatic thread
// factorization. When disabled, requesting a prime number of threads will
// result in a reduction (by one) of the number of threads, provided that the
// prime number exceeds a minimum threshold (see below).
//
// After this conditional, BLIS_DISABLE_AUTO_PRIME_NUM_THREADS is defined if
// and only if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is not defined. The #undef
// in the #else branch precedes the #define so that a pre-existing definition
// is not redefined.
#ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS
  #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#else
  // Default behavior is disabled.
  #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled.
  #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#endif

// Set the maximum requested number of threads that BLIS will accept from the
// user that may be prime. If a larger prime number of threads is requested,
// it will be reduced by one to allow for more efficient thread factorizations.
// This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined.
// Default threshold is 11; a definition supplied before this point wins.
#ifndef BLIS_NT_MAX_PRIME
  #define BLIS_NT_MAX_PRIME 11
#endif


// -- MIXED DATATYPE SUPPORT ---------------------------------------------------

// Enable mixed datatype support?
// BLIS_ENABLE_GEMM_MD ends up defined unless BLIS_DISABLE_MIXED_DT is defined.
#ifdef BLIS_DISABLE_MIXED_DT
  #undef BLIS_ENABLE_GEMM_MD
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD
#endif

// Enable memory-intensive optimizations for mixed datatype support?
// BLIS_ENABLE_GEMM_MD_EXTRA_MEM ends up defined unless
// BLIS_DISABLE_MIXED_DT_EXTRA_MEM is defined.
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
  #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#endif


// -- MISCELLANEOUS OPTIONS ----------------------------------------------------

// Do NOT require the cross-blocksize constraints. That is, do not enforce
// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY
// needed when implementing trsm_r by allowing the right-hand matrix B to
// be triangular.
#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
  #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#endif


// -- BLAS COMPATIBILITY LAYER -------------------------------------------------

// Enable the BLAS compatibility layer?
// BLIS_ENABLE_BLAS ends up defined unless BLIS_DISABLE_BLAS is defined. The
// #undef before the #define avoids redefining a macro that is already set.
#ifdef BLIS_DISABLE_BLAS
  #undef BLIS_ENABLE_BLAS
#else
  // Default behavior is enabled.
  #undef BLIS_ENABLE_BLAS // In case user explicitly enabled.
  #define BLIS_ENABLE_BLAS
#endif

// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#ifndef BLIS_BLAS_INT_TYPE_SIZE
  #define BLIS_BLAS_INT_TYPE_SIZE 32
#endif

// By default, the level-3 BLAS routines are implemented by directly calling
// the BLIS object API. Alternatively, they may first call the typed BLIS
// API, which will then call the object API.
//#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specifed by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. 
// Select the export annotation unless the includer already defined one:
//   - not a shared build: empty;
//   - Windows/Cygwin: dllexport while building the library, else dllimport;
//   - GCC-compatible compilers with __GNUC__ >= 4: default visibility;
//   - anything else: empty.
#ifndef BLIS_EXPORT
  #if !defined(BLIS_ENABLE_SHARED)
    #define BLIS_EXPORT
  #else
    #if defined(_WIN32) || defined(__CYGWIN__)
      #ifdef BLIS_IS_BUILDING_LIBRARY
        #define BLIS_EXPORT __declspec(dllexport)
      #else
        #define BLIS_EXPORT __declspec(dllimport)
      #endif
    #elif defined(__GNUC__) && __GNUC__ >= 4
      #define BLIS_EXPORT __attribute__ ((visibility ("default")))
    #else
      #define BLIS_EXPORT
    #endif
  #endif
#endif

// All three API families share the same export annotation.
#define BLIS_EXPORT_BLIS BLIS_EXPORT
#define BLIS_EXPORT_BLAS BLIS_EXPORT
#define BLIS_EXPORT_ADDON BLIS_EXPORT


// -- OVERRIDABLE (WEAK) SYMBOLS -----------------------------------------------

// On Linux, functions called from a shared library can be overridden by the main
// program simply by providing a new definition. However, macOS uses a "two-level
// namespace" which causes calls to shared library functions to be tied to the
// library and not overridable. As a workaround, certain symbols can be defined
// as "weak" and are given lower preference during linking.
// BLIS_OS_OSX is tested with #if, so an undefined macro is treated as 0 and
// selects the empty definition.
#ifndef BLIS_OVERRIDABLE
  #if BLIS_OS_OSX
    #define BLIS_OVERRIDABLE __attribute__((weak))
  #else
    #define BLIS_OVERRIDABLE
  #endif
#endif


// -- STATIC INLINE FUNCTIONS --------------------------------------------------

// C and C++ have different semantics for defining "inline" functions. In C,
// the keyword phrase "static inline" accomplishes this, though the "inline"
// is optional. In C++, the "inline" keyword is required and obviates "static"
// altogether. Why does this matter? While BLIS is compiled in C99, blis.h may
// be #included by a source file that is compiled with C++.
#ifdef __cplusplus
  #define BLIS_INLINE inline
#else
  //#define BLIS_INLINE static inline
  #define BLIS_INLINE static
#endif


#endif
// end bli_config_macro_defs.h
// begin bli_type_defs.h


#ifndef BLIS_TYPE_DEFS_H
#define BLIS_TYPE_DEFS_H


//
// -- BLIS basic types ---------------------------------------------------------
//

#ifdef __cplusplus
  // For C++, include stdint.h.
#include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. // NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. #ifndef TRUE #define TRUE true #endif #ifndef FALSE #define FALSE false #endif // -- Special-purpose integers -- // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. 
#ifndef _DEFINED_DIM_T #define _DEFINED_DIM_T typedef gint_t dim_t; // dimension type #endif typedef gint_t inc_t; // increment/stride type typedef gint_t doff_t; // diagonal offset type typedef guint_t siz_t; // byte size type typedef uint32_t objbits_t; // object information bit field // -- Real types -- // Define the number of floating-point types supported, and the size of the // largest type. #define BLIS_NUM_FP_TYPES 4 #define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) // There are some places where we need to use sizeof() inside of a C // preprocessor #if conditional, and so here we define the various sizes // for those purposes. #define BLIS_SIZEOF_S 4 // sizeof(float) #define BLIS_SIZEOF_D 8 // sizeof(double) #define BLIS_SIZEOF_C 8 // sizeof(scomplex) #define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) // -- Complex types -- #ifdef BLIS_ENABLE_C99_COMPLEX #if __STDC_VERSION__ >= 199901L #include // skipped // Typedef official complex types to BLIS complex type names. typedef float complex scomplex; typedef double complex dcomplex; #else #error "Configuration requested C99 complex types, but C99 does not appear to be supported." #endif #else // ifndef BLIS_ENABLE_C99_COMPLEX // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_SCOMPLEX #define _DEFINED_SCOMPLEX typedef struct scomplex { float real; float imag; } scomplex; #endif // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DCOMPLEX #define _DEFINED_DCOMPLEX typedef struct dcomplex { double real; double imag; } dcomplex; #endif #endif // BLIS_ENABLE_C99_COMPLEX // -- Atom type -- // Note: atom types are used to hold "bufferless" scalar object values. Note // that it needs to be as large as the largest possible scalar value we might // want to hold. Thus, for now, it is a dcomplex. 
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any BLIS_BLAS_INT_TYPE_SIZE other than 32 or 64 falls back to "long int".
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

// The remaining Fortran-77 types are plain aliases of the C / BLIS types.
typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
// NOTE(review): void_fp is an object pointer (void*), not a function pointer;
// ISO C does not guarantee round-tripping a function address through void*,
// so this presumably relies on the supported platforms allowing it - confirm
// before porting to a new target.
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size ); typedef void (*free_ft) ( void* p ); // // -- BLIS info bit field offsets ---------------------------------------------- // // info #define BLIS_DATATYPE_SHIFT 0 #define BLIS_DOMAIN_SHIFT 0 #define BLIS_PRECISION_SHIFT 1 #define BLIS_CONJTRANS_SHIFT 3 #define BLIS_TRANS_SHIFT 3 #define BLIS_CONJ_SHIFT 4 #define BLIS_UPLO_SHIFT 5 #define BLIS_UPPER_SHIFT 5 #define BLIS_DIAG_SHIFT 6 #define BLIS_LOWER_SHIFT 7 #define BLIS_UNIT_DIAG_SHIFT 8 #define BLIS_INVERT_DIAG_SHIFT 9 #define BLIS_TARGET_DT_SHIFT 10 #define BLIS_TARGET_DOMAIN_SHIFT 10 #define BLIS_TARGET_PREC_SHIFT 11 #define BLIS_EXEC_DT_SHIFT 13 #define BLIS_EXEC_DOMAIN_SHIFT 13 #define BLIS_EXEC_PREC_SHIFT 14 #define BLIS_PACK_SCHEMA_SHIFT 16 #define BLIS_PACK_RC_SHIFT 16 #define BLIS_PACK_PANEL_SHIFT 17 #define BLIS_PACK_FORMAT_SHIFT 18 #define BLIS_PACK_SHIFT 22 #define BLIS_PACK_REV_IF_UPPER_SHIFT 23 #define BLIS_PACK_REV_IF_LOWER_SHIFT 24 #define BLIS_PACK_BUFFER_SHIFT 25 #define BLIS_STRUC_SHIFT 27 #define BLIS_COMP_DT_SHIFT 29 #define BLIS_COMP_DOMAIN_SHIFT 29 #define BLIS_COMP_PREC_SHIFT 30 // info2 #define BLIS_SCALAR_DT_SHIFT 0 #define BLIS_SCALAR_DOMAIN_SHIFT 0 #define BLIS_SCALAR_PREC_SHIFT 1 // // -- BLIS info bit field masks ------------------------------------------------ // // info #define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT ) #define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT ) #define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT ) #define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT ) #define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 
0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | 
BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type 
definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = 
BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. 
#define bli_1m  BLIS_1M
#define bli_nat BLIS_NAT


// -- Kernel ID types --

// Level-1v kernel IDs. Only the first enumerator is given an explicit value
// (0); the rest follow consecutively.
typedef enum
{
    BLIS_ADDV_KER = 0,
    BLIS_AMAXV_KER,
    BLIS_AXPBYV_KER,
    BLIS_AXPYV_KER,
    BLIS_COPYV_KER,
    BLIS_DOTV_KER,
    BLIS_DOTXV_KER,
    BLIS_INVERTV_KER,
    BLIS_SCALV_KER,
    BLIS_SCAL2V_KER,
    BLIS_SETV_KER,
    BLIS_SUBV_KER,
    BLIS_SWAPV_KER,
    BLIS_XPBYV_KER
} l1vkr_t;

// Number of enumerators in l1vkr_t. This is a hand-maintained literal, so it
// must be updated whenever the enum above gains or loses a value.
#define BLIS_NUM_LEVEL1V_KERS 14


// Level-1f kernel IDs (consecutive from 0).
typedef enum
{
    BLIS_AXPY2V_KER = 0,
    BLIS_DOTAXPYV_KER,
    BLIS_AXPYF_KER,
    BLIS_DOTXF_KER,
    BLIS_DOTXAXPYF_KER
} l1fkr_t;

// Number of enumerators in l1fkr_t (hand-maintained).
#define BLIS_NUM_LEVEL1F_KERS 5


// Level-1m packing kernel IDs. Every enumerator carries an explicit value
// equal to the number embedded in its name (the <n> in <n>XK). Note that the
// PACKM and UNPACKM families live in the same enum and reuse the values
// 0..31, so a PACKM id and the UNPACKM id with the same <n> compare equal.
typedef enum
{
    BLIS_PACKM_0XK_KER  = 0,
    BLIS_PACKM_1XK_KER  = 1,
    BLIS_PACKM_2XK_KER  = 2,
    BLIS_PACKM_3XK_KER  = 3,
    BLIS_PACKM_4XK_KER  = 4,
    BLIS_PACKM_5XK_KER  = 5,
    BLIS_PACKM_6XK_KER  = 6,
    BLIS_PACKM_7XK_KER  = 7,
    BLIS_PACKM_8XK_KER  = 8,
    BLIS_PACKM_9XK_KER  = 9,
    BLIS_PACKM_10XK_KER = 10,
    BLIS_PACKM_11XK_KER = 11,
    BLIS_PACKM_12XK_KER = 12,
    BLIS_PACKM_13XK_KER = 13,
    BLIS_PACKM_14XK_KER = 14,
    BLIS_PACKM_15XK_KER = 15,
    BLIS_PACKM_16XK_KER = 16,
    BLIS_PACKM_17XK_KER = 17,
    BLIS_PACKM_18XK_KER = 18,
    BLIS_PACKM_19XK_KER = 19,
    BLIS_PACKM_20XK_KER = 20,
    BLIS_PACKM_21XK_KER = 21,
    BLIS_PACKM_22XK_KER = 22,
    BLIS_PACKM_23XK_KER = 23,
    BLIS_PACKM_24XK_KER = 24,
    BLIS_PACKM_25XK_KER = 25,
    BLIS_PACKM_26XK_KER = 26,
    BLIS_PACKM_27XK_KER = 27,
    BLIS_PACKM_28XK_KER = 28,
    BLIS_PACKM_29XK_KER = 29,
    BLIS_PACKM_30XK_KER = 30,
    BLIS_PACKM_31XK_KER = 31,

    BLIS_UNPACKM_0XK_KER  = 0,
    BLIS_UNPACKM_1XK_KER  = 1,
    BLIS_UNPACKM_2XK_KER  = 2,
    BLIS_UNPACKM_3XK_KER  = 3,
    BLIS_UNPACKM_4XK_KER  = 4,
    BLIS_UNPACKM_5XK_KER  = 5,
    BLIS_UNPACKM_6XK_KER  = 6,
    BLIS_UNPACKM_7XK_KER  = 7,
    BLIS_UNPACKM_8XK_KER  = 8,
    BLIS_UNPACKM_9XK_KER  = 9,
    BLIS_UNPACKM_10XK_KER = 10,
    BLIS_UNPACKM_11XK_KER = 11,
    BLIS_UNPACKM_12XK_KER = 12,
    BLIS_UNPACKM_13XK_KER = 13,
    BLIS_UNPACKM_14XK_KER = 14,
    BLIS_UNPACKM_15XK_KER = 15,
    BLIS_UNPACKM_16XK_KER = 16,
    BLIS_UNPACKM_17XK_KER = 17,
    BLIS_UNPACKM_18XK_KER = 18,
    BLIS_UNPACKM_19XK_KER = 19,
    BLIS_UNPACKM_20XK_KER = 20,
    BLIS_UNPACKM_21XK_KER = 21,
    BLIS_UNPACKM_22XK_KER = 22,
    BLIS_UNPACKM_23XK_KER = 23,
    BLIS_UNPACKM_24XK_KER = 24,
    BLIS_UNPACKM_25XK_KER = 25,
    BLIS_UNPACKM_26XK_KER = 26,
    BLIS_UNPACKM_27XK_KER = 27,
    BLIS_UNPACKM_28XK_KER = 28,
    BLIS_UNPACKM_29XK_KER = 29,
    BLIS_UNPACKM_30XK_KER = 30,
    BLIS_UNPACKM_31XK_KER = 31
} l1mkr_t;

// Per-family counts for l1mkr_t: 32 PACKM ids and 32 UNPACKM ids (0..31 each).
#define BLIS_NUM_PACKM_KERS   32
#define BLIS_NUM_UNPACKM_KERS 32


// Level-3 microkernel IDs (consecutive from 0).
typedef enum
{
    BLIS_GEMM_UKR = 0,
    BLIS_GEMMTRSM_L_UKR,
    BLIS_GEMMTRSM_U_UKR,
    BLIS_TRSM_L_UKR,
    BLIS_TRSM_U_UKR
} l3ukr_t;

// Number of enumerators in l3ukr_t (hand-maintained).
#define BLIS_NUM_LEVEL3_UKRS 5


// Microkernel implementation kinds (consecutive from 0).
typedef enum
{
    BLIS_REFERENCE_UKERNEL = 0,
    BLIS_VIRTUAL_UKERNEL,
    BLIS_OPTIMIZED_UKERNEL,
    BLIS_NOTAPPLIC_UKERNEL
} kimpl_t;

// Number of enumerators in kimpl_t (hand-maintained).
#define BLIS_NUM_UKR_IMPL_TYPES 4


// The l3sup_t enum and its count below are compiled out by "#if 0".
#if 0
typedef enum
{
    // RV = row-stored, contiguous vector-loading
    // RG = row-stored, non-contiguous gather-loading
    // CV = column-stored, contiguous vector-loading
    // CG = column-stored, non-contiguous gather-loading

    // RD = row-stored, dot-based
    // CD = col-stored, dot-based

    // RC = row-stored, column-times-column
    // CR = column-stored, row-times-row

    // GX = general-stored generic implementation

    BLIS_GEMMSUP_RV_UKR = 0,
    BLIS_GEMMSUP_RG_UKR,
    BLIS_GEMMSUP_CV_UKR,
    BLIS_GEMMSUP_CG_UKR,

    BLIS_GEMMSUP_RD_UKR,
    BLIS_GEMMSUP_CD_UKR,

    BLIS_GEMMSUP_RC_UKR,
    BLIS_GEMMSUP_CR_UKR,

    BLIS_GEMMSUP_GX_UKR,
} l3sup_t;

#define BLIS_NUM_LEVEL3_SUP_UKRS 9
#endif


// Storage-combination IDs for three operands. The trailing numeric comments
// give each enumerator's value. The values inside the nested "#if 0" are
// compiled out, so BLIS_XXX (8) is the last active enumerator.
typedef enum
{
    // 3-operand storage combinations
    BLIS_RRR = 0,
    BLIS_RRC, // 1
    BLIS_RCR, // 2
    BLIS_RCC, // 3
    BLIS_CRR, // 4
    BLIS_CRC, // 5
    BLIS_CCR, // 6
    BLIS_CCC, // 7
    BLIS_XXX, // 8

#if 0
    BLIS_RRG,
    BLIS_RCG,
    BLIS_RGR,
    BLIS_RGC,
    BLIS_RGG,
    BLIS_CRG,
    BLIS_CCG,
    BLIS_CGR,
    BLIS_CGC,
    BLIS_CGG,
    BLIS_GRR,
    BLIS_GRC,
    BLIS_GRG,
    BLIS_GCR,
    BLIS_GCC,
    BLIS_GCG,
    BLIS_GGR,
    BLIS_GGC,
    BLIS_GGG,
#endif
} stor3_t;

// Number of active enumerators in stor3_t (BLIS_RRR..BLIS_XXX).
#define BLIS_NUM_3OP_RC_COMBOS 9
//#define BLIS_NUM_3OP_RCG_COMBOS 27


// The thridx_t enum is compiled out; only the loop count below is active.
#if 0
typedef enum
{
    BLIS_JC_IDX = 0,
    BLIS_PC_IDX,
    BLIS_IC_IDX,
    BLIS_JR_IDX,
    BLIS_IR_IDX,
    BLIS_PR_IDX
} thridx_t;
#endif

// Equals the number of entries in the disabled thridx_t enum above.
#define BLIS_NUM_LOOPS 6


// -- Operation ID type --

typedef enum
{
//
// NOTE: If/when additional type values are added to this enum,
// you must either:
// - keep the level-3 values (starting with _GEMM) beginning at
//   index 0; or
// - if the value range is moved such that it does not begin at
//   index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START
//   value that can be subtracted from the opid_t value to map it
//   to a zero-based range.
// This is needed because these level-3 opid_t values are used in
// bli_l3_ind.c to index into arrays.
//
    BLIS_GEMM = 0,
    BLIS_GEMMT,
    BLIS_HEMM,
    BLIS_HERK,
    BLIS_HER2K,
    BLIS_SYMM,
    BLIS_SYRK,
    BLIS_SYR2K,
    BLIS_TRMM3,
    BLIS_TRMM,
    BLIS_TRSM,

    BLIS_NOID
} opid_t;

// Counts BLIS_GEMM..BLIS_TRSM; BLIS_NOID (which has this same value, 11) is
// not included in the count.
#define BLIS_NUM_LEVEL3_OPS 11


// -- Blocksize ID type --

typedef enum
{
    // NOTE: the level-3 blocksizes MUST be indexed starting at zero.
    // At one point, we made this assumption in bli_cntx_set_blkszs()
    // and friends.

    BLIS_KR = 0,
    BLIS_MR,
    BLIS_NR,
    BLIS_MC,
    BLIS_KC,
    BLIS_NC,

    BLIS_M2, // level-2 blocksize in m dimension
    BLIS_N2, // level-2 blocksize in n dimension

    BLIS_AF, // level-1f axpyf fusing factor
    BLIS_DF, // level-1f dotxf fusing factor
    BLIS_XF, // level-1f dotxaxpyf fusing factor

    BLIS_NO_PART // used as a placeholder when blocksizes are not applicable.
} bszid_t;

// Counts BLIS_KR..BLIS_XF; the BLIS_NO_PART placeholder (value 11) is not
// included in the count.
#define BLIS_NUM_BLKSZS 11


// -- Threshold ID type --

typedef enum
{
    BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension
    BLIS_NT,     // level-3 small/unpacked matrix threshold in n dimension
    BLIS_KT      // level-3 small/unpacked matrix threshold in k dimension

} threshid_t;

// Number of enumerators in threshid_t (hand-maintained).
#define BLIS_NUM_THRESH 3


// -- Architecture ID type --

// NOTE: This typedef enum must be kept up-to-date with the arch_t
// string array in bli_arch.c. Whenever values are added/inserted
// OR if values are rearranged, be sure to update the string array
// in bli_arch.c.

typedef enum
{
    // NOTE: The C language standard guarantees that the first enum value
    // starts at 0.

    // Intel
    BLIS_ARCH_SKX,
    BLIS_ARCH_KNL,
    BLIS_ARCH_KNC,
    BLIS_ARCH_HASWELL,
    BLIS_ARCH_SANDYBRIDGE,
    BLIS_ARCH_PENRYN,

    // AMD
    BLIS_ARCH_ZEN3,
    BLIS_ARCH_ZEN2,
    BLIS_ARCH_ZEN,
    BLIS_ARCH_EXCAVATOR,
    BLIS_ARCH_STEAMROLLER,
    BLIS_ARCH_PILEDRIVER,
    BLIS_ARCH_BULLDOZER,

    // ARM
    BLIS_ARCH_ARMSVE,
    BLIS_ARCH_A64FX,
    BLIS_ARCH_FIRESTORM,
    BLIS_ARCH_THUNDERX2,
    BLIS_ARCH_CORTEXA57,
    BLIS_ARCH_CORTEXA53,
    BLIS_ARCH_CORTEXA15,
    BLIS_ARCH_CORTEXA9,

    // IBM/Power
    BLIS_ARCH_POWER10,
    BLIS_ARCH_POWER9,
    BLIS_ARCH_POWER7,
    BLIS_ARCH_BGQ,

    // Generic architecture/configuration
    BLIS_ARCH_GENERIC,

    // The total number of defined architectures. This must be last in the
    // list of enums since its definition assumes that the previous enum
    // value (BLIS_ARCH_GENERIC) is given index num_archs-1.
    BLIS_NUM_ARCHS

} arch_t;


//
// -- BLIS misc. structure types -----------------------------------------------
//

// This header must be included here (or earlier) because definitions it
// provides are needed in the pool_t and related structs.
// begin bli_pthread.h


#ifndef BLIS_PTHREAD_H
#define BLIS_PTHREAD_H

// -- Type and macro definitions -----------------------------------------------

#if defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of "dummy" code that doesn't depend on POSIX threads or any other
// threading mechanism. See issue #454 to see the use case that prompted this
// feature.
// NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE!
// -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t*           cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t*  cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void              (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//     __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t*           barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int                     count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h


// -- Pool block type --

// A buffer pointer paired with the size of the block.
typedef struct
{
    void*  buf;
    siz_t  block_size;

} pblk_t;


// -- Pool type --

// Pool bookkeeping plus the malloc/free function pointers stored with it.
typedef struct
{
    void*     block_ptrs;
    dim_t     block_ptrs_len;

    dim_t     top_index;
    dim_t     num_blocks;

    siz_t     block_size;
    siz_t     align_size;
    siz_t     offset_size;

    malloc_ft malloc_fp;
    free_ft   free_fp;

} pool_t;


// -- Array type --

typedef struct
{
    void*     buf;

    siz_t     num_elem;
    siz_t     elem_size;

} array_t;


// -- Locked pool-of-arrays-of-pools type --

// A pool_t stored together with a bli_pthread_mutex_t.
typedef struct
{
    bli_pthread_mutex_t mutex;
    pool_t              pool;

    siz_t               def_array_len;

} apool_t;


// -- packing block allocator: Locked set of pools type --

typedef struct pba_s
{
    // NOTE(review): the array length 3 is a bare literal; presumably one pool
    // per non-general packbuf_t buffer type (A block, B panel, C panel) --
    // confirm against the allocator implementation.
    pool_t              pools[3];
    bli_pthread_mutex_t mutex;

    // These fields are used for general-purpose allocation.
    siz_t               align_size;
    malloc_ft           malloc_fp;
    free_ft             free_fp;

} pba_t;


// -- Memory object type --

typedef struct mem_s
{
    pblk_t    pblk;
    packbuf_t buf_type;
    pool_t*   pool;
    siz_t     size;
} mem_t;


// -- Control tree node type --

// Each node points to two further nodes of the same type (sub_prenode and
// sub_node), which is why the struct tag is declared before the typedef.
struct cntl_s
{
    // Basic fields (usually required).
    opid_t         family;
    bszid_t        bszid;
    void_fp        var_func;
    struct cntl_s* sub_prenode;
    struct cntl_s* sub_node;

    // Optional fields (needed only by some operations such as packm).
    // NOTE: first field of params must be a uint64_t containing the size
    // of the struct.
    void*          params;

    // Internal fields that track "cached" data.
    mem_t          pack_mem;
};
typedef struct cntl_s cntl_t;


// -- Blocksize object type --

// Both arrays hold BLIS_NUM_FP_TYPES entries.
typedef struct blksz_s
{
    // Primary blocksize values.
    dim_t  v[BLIS_NUM_FP_TYPES];

    // Blocksize extensions.
    dim_t  e[BLIS_NUM_FP_TYPES];

} blksz_t;


// -- Function pointer object type --

typedef struct func_s
{
    // Kernel function address.
    void_fp ptr[BLIS_NUM_FP_TYPES];

} func_t;


// -- Multi-boolean object type --

typedef struct mbool_s
{
    bool v[BLIS_NUM_FP_TYPES];

} mbool_t;


// -- Auxiliary kernel info type --

// Note: This struct is used by macro-kernels to package together extra
// parameter values that may be of use to the micro-kernel without
// cluttering up the micro-kernel interface itself.

typedef struct
{
    // The pack schemas of A and B.
    pack_t schema_a;
    pack_t schema_b;

    // Pointers to the micro-panels of A and B which will be used by the
    // next call to the micro-kernel.
    void*  a_next;
    void*  b_next;

    // The imaginary strides of A and B.
    inc_t  is_a;
    inc_t  is_b;

    // The panel strides of A and B.
    // NOTE: These are only used in situations where iteration over the
    // micropanels takes place in part within the kernel code (e.g. sup
    // millikernels).
    inc_t  ps_a;
    inc_t  ps_b;

    // The type to convert to on output.
    //num_t  dt_on_output;

    // (Virtual) microkernel address and additional parameters.
    void_fp ukr;
    void*   params;

} auxinfo_t;


// -- Global scalar constant data struct --

// Note: This struct is used only when statically initializing the
// global scalar constants in bli_const.c.
// It holds one member per representation: float, double, scomplex, dcomplex
// and gint_t.
typedef struct constdata_s
{
    float    s;
    double   d;
    scomplex c;
    dcomplex z;
    gint_t   i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the function stored in obj_t.pack_fn.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of the function stored in obj_t.ker_fn.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// NOTE: When a field is added to or removed from obj_t, the two
// BLIS_OBJECT_INITIALIZER* macros and the field-by-field bli_obj_init_*()
// copy helpers that follow this struct must be updated to match.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t         off[2];
    dim_t         dim[2];
    doff_t        diag_off;

    objbits_t     info;
    objbits_t     info2;
    siz_t         elem_size;

    void*         buffer;
    inc_t         rs;
    inc_t         cs;
    inc_t         is;

    // Bufferless scalar storage
    atom_t        scalar;

    // Pack-related fields
    dim_t         m_padded; // m dimension of matrix, including any padding
    dim_t         n_padded; // n dimension of matrix, including any padding
    inc_t         ps;       // panel stride (distance to next panel)
    inc_t         pd;       // panel dimension (the "width" of a panel:
                            // usually MR or NR)
    dim_t         m_panel;  // m dimension of a "full" panel
    dim_t         n_panel;  // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void*         pack_params;
    obj_ker_fn_t  ker_fn;
    void*         ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)
//
// Both macros expand to a brace-enclosed initializer that uses designated
// initializers for every obj_t field. They are identical except for .dim,
// which is { 0, 0 } in BLIS_OBJECT_INITIALIZER and { 1, 1 } in
// BLIS_OBJECT_INITIALIZER_1X1.

#define BLIS_OBJECT_INITIALIZER \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 0, 0 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 1, 1 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
// (NOTE: the two definitions that follow are written as BLIS_INLINE
// functions rather than cpp macros.)
// Initialize b as a sub-partition view of a: every obj_t field is copied from
// a to b EXCEPT the dimensions (dim[0], dim[1]), which are left untouched
// because the caller overwrites m and n afterwards.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    const obj_t* src = a;
    obj_t*       dst = b;

    // Root pointer, offsets and diagonal offset. The dim[] pair is
    // deliberately skipped here.
    dst->root     = src->root;
    dst->off[0]   = src->off[0];
    dst->off[1]   = src->off[1];
    dst->diag_off = src->diag_off;

    // Info bitfields and element size.
    dst->info      = src->info;
    dst->info2     = src->info2;
    dst->elem_size = src->elem_size;

    // Buffer address and strides.
    dst->buffer = src->buffer;
    dst->rs     = src->rs;
    dst->cs     = src->cs;
    dst->is     = src->is;

    // Internal (bufferless) scalar.
    dst->scalar = src->scalar;

    // No pack_mem entry is copied: obj_t has no such member here. The
    // original code kept a commented-out copy of it, with a note (FGVZ)
    // asking that this be double-checked.

    // Pack-related fields.
    dst->m_padded = src->m_padded;
    dst->n_padded = src->n_padded;
    dst->ps       = src->ps;
    dst->pd       = src->pd;
    dst->m_panel  = src->m_panel;
    dst->n_panel  = src->n_panel;

    // User-customizable pack/kernel hooks and their parameters.
    dst->pack_fn     = src->pack_fn;
    dst->pack_params = src->pack_params;
    dst->ker_fn      = src->ker_fn;
    dst->ker_params  = src->ker_params;
}

// Make b a full shallow copy of a: every obj_t field, including both
// dimensions, is copied by value. Pointer members (root, buffer and the
// *_params pointers) are copied as pointers, not duplicated.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    // Everything except the dimensions...
    bli_obj_init_subpart_from( a, b );

    // ...followed by the dimensions themselves.
    b->dim[0] = a->dim[0];
    b->dim[1] = a->dim[1];
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/include/linux-x86_64_no_zen3/000077500000000000000000000000001427272030600230075ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/linux-x86_64_no_zen3/blis.h000066400000000000000000047045031427272030600241260ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). 
// begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_X86_64_NO_ZEN3 // Enabled sub-configurations (config_list) #define BLIS_CONFIG_SKX #define BLIS_CONFIG_KNL #define BLIS_CONFIG_HASWELL #define BLIS_CONFIG_SANDYBRIDGE #define BLIS_CONFIG_PENRYN #define BLIS_CONFIG_ZEN #define BLIS_CONFIG_ZEN2 #define BLIS_CONFIG_EXCAVATOR #define BLIS_CONFIG_STEAMROLLER #define BLIS_CONFIG_PILEDRIVER #define BLIS_CONFIG_BULLDOZER #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_SKX #define BLIS_KERNELS_KNL #define BLIS_KERNELS_SANDYBRIDGE #define BLIS_KERNELS_PENRYN #define BLIS_KERNELS_ZEN2 #define BLIS_KERNELS_HASWELL #define BLIS_KERNELS_ZEN #define BLIS_KERNELS_PILEDRIVER #define BLIS_KERNELS_BULLDOZER #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define 
BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. 
#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). #if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_lang_defs.h #ifndef BLIS_LANG_DEFS_H #define BLIS_LANG_DEFS_H // -- Undefine restrict for C++ and C89/90 -- #ifdef __cplusplus // Language is C++; define restrict as nothing. #ifndef restrict #define restrict #endif #elif __STDC_VERSION__ >= 199901L // Language is C99 (or later); do nothing since restrict is recognized. #else // Language is pre-C99; define restrict as nothing. 
#ifndef restrict #define restrict #endif #endif // -- Define typeof() operator if using non-GNU compiler -- #ifndef __GNUC__ #define typeof __typeof__ #else #ifndef typeof #define typeof __typeof__ #endif #endif // -- BLIS Thread Local Storage Keyword -- // __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support __thread, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) #define BLIS_THREAD_LOCAL __thread #else #define BLIS_THREAD_LOCAL #endif // -- BLIS constructor/destructor function attribute -- // __attribute__((constructor/destructor)) is supported by GCC only. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support this, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__ICC) || defined(__INTEL_COMPILER) // ICC defines __GNUC__ but doesn't support this #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #elif defined(__clang__) // CLANG supports __attribute__, but its documentation doesn't // mention support for constructor/destructor. Compiling with // clang and testing shows that it does support. 
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. 
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. 
Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specifed by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. #ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overriden by the main // program simply by providing a new definition. 
However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // -- Common BLIS definitions -- // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// The macro tested must be BLIS_BLAS_INT_TYPE_SIZE, i.e. the same macro that
// is used to choose the width of f77_int. The previous spelling
// (BLIS_BLAS_INT_SIZE) is not the name used for the BLAS integer size; an
// identifier that is not defined evaluates to 0 inside #if, so the guard
// could never fire and BLIS integers were never promoted to 64 bits.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
  #undef  BLIS_INT_TYPE_SIZE
  #define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested:
// 32 -> fixed 32-bit types, 64 -> fixed 64-bit types, anything else ->
// the platform's "long int".
#if   BLIS_INT_TYPE_SIZE == 32
typedef           int32_t  gint_t;
typedef          uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
typedef           int64_t  gint_t;
typedef          uint64_t guint_t;
#else
typedef   signed long int  gint_t;
typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h. The #ifndef guards leave any pre-existing
// TRUE/FALSE definitions untouched.
#ifndef TRUE
  #define TRUE  true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
// Sizes of the four floating-point datatypes as plain integer literals, so
// that they can be compared inside preprocessor #if conditionals (where
// sizeof() cannot be evaluated).
#define BLIS_SIZEOF_S      4  // sizeof(float)
#define BLIS_SIZEOF_D      8  // sizeof(double)
#define BLIS_SIZEOF_C      8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z      16 // sizeof(dcomplex)

// -- Complex types --

#ifdef BLIS_ENABLE_C99_COMPLEX

  #if __STDC_VERSION__ >= 199901L
    // The "complex" type specifier macro used in the typedefs below is
    // provided by the standard header complex.h. The directive previously
    // had no header name, which is ill-formed whenever this branch is
    // actually compiled.
    #include <complex.h>

    // Typedef official complex types to BLIS complex type names.
    typedef  float complex scomplex;
    typedef double complex dcomplex;
  #else
    #error "Configuration requested C99 complex types, but C99 does not appear to be supported."
  #endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_SCOMPLEX
  #define _DEFINED_SCOMPLEX
  typedef struct scomplex
  {
    float real;
    float imag;
  } scomplex;
  #endif

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_DCOMPLEX
  #define _DEFINED_DCOMPLEX
  typedef struct dcomplex
  {
    double real;
    double imag;
  } dcomplex;
  #endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );

//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Bit offsets of the fields packed into the "info" word. Several names
// intentionally share an offset: a multi-bit field (e.g. DATATYPE) and the
// first single-bit sub-field inside it (e.g. DOMAIN) start at the same bit.

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT               0
#define   BLIS_SCALAR_DOMAIN_SHIFT         0
#define   BLIS_SCALAR_PREC_SHIFT           1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is a field-width constant shifted to the field's offset.

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define 
BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 
0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = 
BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } 
machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. #define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, 
BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, 
BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // - keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
} bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. // Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. 
// begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. 
mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. //num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. 
typedef struct constdata_s
{
    // One member per representation of the same constant.
    float    s;
    double   d;
    scomplex c;
    dcomplex z;
    gint_t   i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the user-replaceable packing function stored in
// obj_t.pack_fn.
typedef void (*obj_pack_fn_t)
     (
       struct obj_s*     a,
       struct obj_s*     ap,
       struct cntx_s*    cntx,
       struct rntm_s*    rntm,
       struct cntl_s*    cntl,
       struct thrinfo_s* thread
     );

// Signature of the user-replaceable kernel function stored in obj_t.ker_fn.
typedef void (*obj_ker_fn_t)
     (
       struct obj_s*     a,
       struct obj_s*     b,
       struct obj_s*     c,
       struct cntx_s*    cntx,
       struct rntm_s*    rntm,
       struct cntl_s*    cntl,
       struct thrinfo_s* thread
     );

// NOTE: The object initializer macros and the bli_obj_init_*() copy helpers
// that follow this struct in the header name its members individually, so any
// member added, removed or renamed here has to be reflected in them as well.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t         off[2];
    dim_t         dim[2];
    doff_t        diag_off;

    objbits_t     info;
    objbits_t     info2;
    siz_t         elem_size;

    void*         buffer;
    inc_t         rs;
    inc_t         cs;
    inc_t         is;

    // Bufferless scalar storage
    atom_t        scalar;

    // Pack-related fields
    dim_t         m_padded; // m dimension of matrix, including any padding
    dim_t         n_padded; // n dimension of matrix, including any padding
    inc_t         ps;       // panel stride (distance to next panel)
    inc_t         pd;       // panel dimension (the "width" of a panel:
                            // usually MR or NR)
    dim_t         m_panel;  // m dimension of a "full" panel
    dim_t         n_panel;  // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void*         pack_params;
    obj_ker_fn_t  ker_fn;
    void*         ker_params;

} obj_t;

// Pre-initializors. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// The two initializers below are identical except for .dim, which is
// { 0, 0 } in BLIS_OBJECT_INITIALIZER and { 1, 1 } in
// BLIS_OBJECT_INITIALIZER_1X1.

#define BLIS_OBJECT_INITIALIZER \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 0, 0 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 1, 1 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
// Make *b a full shallow copy of *a.
//
//   a: source object (read only by this function).
//   b: destination object; every member is overwritten.
//
// "Shallow" means pointer members (root, buffer, pack_params, ker_params and
// the two function pointers) are copied as pointers; nothing they refer to is
// duplicated.
//
// A struct assignment is used rather than one statement per member so the
// copy cannot fall out of step with obj_t when a member is added. Calling
// this with a == b is fine: C permits assignment between exactly overlapping
// objects.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    *b = *a;
}

// Initialize *b from *a in preparation for turning b into a subpartition
// of a.
//
//   a: source object (read only by this function).
//   b: destination object; every member except dim[0] and dim[1] is
//      overwritten.
//
// This stays member-by-member on purpose: b->dim[] must be left untouched
// here, so a whole-struct assignment is not equivalent. Keep the member list
// in sync with obj_t.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    b->root      = a->root;

    b->off[0]    = a->off[0];
    b->off[1]    = a->off[1];
    // Avoid copying m and n since they will be overwritten.
    //b->dim[0]    = a->dim[0];
    //b->dim[1]    = a->dim[1];
    b->diag_off  = a->diag_off;

    b->info      = a->info;
    b->info2     = a->info2;
    b->elem_size = a->elem_size;

    b->buffer    = a->buffer;
    b->rs        = a->rs;
    b->cs        = a->cs;
    b->is        = a->is;

    b->scalar    = a->scalar;

    // Avoid copying pack_mem entry.
    // FGVZ: You should probably make sure this is right.
    //b->pack_mem  = a->pack_mem;

    b->m_padded  = a->m_padded;
    b->n_padded  = a->n_padded;
    b->ps        = a->ps;
    b->pd        = a->pd;
    b->m_panel   = a->m_panel;
    b->n_panel   = a->n_panel;

    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;
}

// Initializors for global scalar constants.
// NOTE: These must remain cpp macros since they are initializor
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// Each public macro forwards to a helper with a trailing underscore. The
// helper performs the ## pasting; the forwarding step lets the preprocessor
// expand arguments that are themselves macros before they are pasted.

// PASTEMAC*: paste "bli_", zero or more type-character arguments, and op into
// a single identifier.
#define PASTEMAC0_(op)             bli_ ## op
#define PASTEMAC0(op)              PASTEMAC0_(op)

#define PASTEMAC_(ch,op)           bli_ ## ch  ## op
#define PASTEMAC(ch,op)            PASTEMAC_(ch,op)

#define PASTEMAC2_(ch1,ch2,op)     bli_ ## ch1 ## ch2 ## op
#define PASTEMAC2(ch1,ch2,op)      PASTEMAC2_(ch1,ch2,op)

#define PASTEMAC3_(ch1,ch2,ch3,op) bli_ ## ch1 ## ch2 ## ch3 ## op
#define PASTEMAC3(ch1,ch2,ch3,op)  PASTEMAC3_(ch1,ch2,ch3,op)

#define PASTEMAC4_(ch1,ch2,ch3,ch4,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
#define PASTEMAC4(ch1,ch2,ch3,ch4,op)  PASTEMAC4_(ch1,ch2,ch3,ch4,op)

#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op)  PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)

#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op) bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op)  PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)

// PASTEBLACHK: builds the identifier bla_<op>_check.
#define PASTEBLACHK_(op)           bla_ ## op ## _check
#define PASTEBLACHK(op)            PASTEBLACHK_(op)

// PASTECH*: same pasting as PASTEMAC* but without the "bli_" prefix.
#define PASTECH0_(op)              op
#define PASTECH0(op)               PASTECH0_(op)

#define PASTECH_(ch,op)            ch ## op
#define PASTECH(ch,op)             PASTECH_(ch,op)

#define PASTECH2_(ch1,ch2,op)      ch1 ## ch2 ## op
#define PASTECH2(ch1,ch2,op)       PASTECH2_(ch1,ch2,op)

#define PASTECH3_(ch1,ch2,ch3,op)  ch1 ## ch2 ## ch3 ## op
#define PASTECH3(ch1,ch2,ch3,op)   PASTECH3_(ch1,ch2,ch3,op)

// MKSTR stringizes its argument exactly as written; STRINGIFY_INT goes
// through MKSTR so that a macro argument is expanded first and its value,
// not its name, is stringized.
#define MKSTR(s1)                  #s1
#define STRINGIFY_INT( s )         MKSTR( s )

// Fortran-77 name-mangling macros.
// Each macro pastes its type-character arguments in front of name and
// appends a trailing underscore.
#define PASTEF770(name)                                      name ## _
#define PASTEF77(ch1,name)       ch1 ## name ## _
#define PASTEF772(ch1,ch2,name)  ch1 ## ch2 ## name ## _
#define PASTEF773(ch1,ch2,ch3,name)  ch1 ## ch2 ## ch3 ## name ## _

// -- Include other groups of macros

// begin bli_genarray_macro_defs.h


#ifndef BLIS_GENARRAY_MACRO_DEFS_H
#define BLIS_GENARRAY_MACRO_DEFS_H


// -- Macros to generate function arrays ---------------------------------------

// Throughout this section the array entries are listed in the order
// s, c, d, z (with i appended in the *_I variants).
// NOTE(review): that order presumably mirrors the numeric values of the
// datatype enum used to index these arrays; confirm before reordering.

// -- "Smart" one-operand macro --

// Defines a static array named <opname>_fpa whose entries are
// bli_<ch><opname>, each cast to tname.
#define GENARRAY_FPA(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname)  \
}

// -- "Smart" one-operand macro (with integer support) --

#define GENARRAY_FPA_I(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname), \
    ( tname )PASTEMAC(i,opname)  \
}

// -- "Smart" two-operand macro --

// Defines a static two-dimensional array named <op>_fpa2 covering every
// pair of type characters.
#define GENARRAY_FPA2(tname,op) \
\
static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \
    { ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \
    { ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \
    { ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) }  \
}

// -- "Smart" two-operand macro --


// -- One-operand macro --

// Unlike the GENARRAY_FPA* macros above, the macros from here on expand to
// "arrayname[...] = { ... }" only: the caller writes the storage class and
// element type in front of the invocation, and no casts are applied.
#define GENARRAY(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op)  \
}

#define GENARRAY_I(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES+1] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op), \
    PASTEMAC(i,op)  \
}

// -- One-operand macro (with custom prefix) --

#define GENARRAY_PREF(arrayname,prefix,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTECH2(prefix,s,op), \
    PASTECH2(prefix,c,op), \
    PASTECH2(prefix,d,op), \
    PASTECH2(prefix,z,op)  \
}

// -- Two-operand macros --

// _ALL fills every type combination. _EXT fills only combinations whose type
// characters all come from {s,c} or all come from {d,z}. _MIN fills only the
// combinations where all type characters are the same. Unfilled entries are
// NULL.
#define GENARRAY2_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
    { PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

#define GENARRAY2_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL,              NULL,             }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { NULL,              NULL,              PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

#define GENARRAY2_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), NULL,              NULL,              NULL,             }, \
    { NULL,              PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), NULL,             }, \
    { NULL,              NULL,              NULL,              PASTEMAC2(z,z,op) }  \
}


// -- Three-operand macros --


#define GENARRAY3_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \
    { PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \
    { PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \
    { PASTEMAC3(c,d,s,op), PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \
    { PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \
    { PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \
    { PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \
    { PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \
    { PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}


#define GENARRAY3_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}


#define GENARRAY3_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                PASTEMAC3(z,z,z,op) }  \
    } \
}


#endif
// end bli_genarray_macro_defs.h
// begin bli_gentdef_macro_defs.h


#ifndef BLIS_GENTDEF_MACRO_DEFS_H
#define BLIS_GENTDEF_MACRO_DEFS_H

//
// -- MACROS TO INSERT TYPEDEF-GENERATING MACROS -------------------------------
//


// -- function typedef macro (both typed and void) --

// Expands to nine GENTDEF invocations: four typed (_ft), four void (_vft),
// and one void invocation with an empty type-character argument. GENTDEF
// itself is not defined in this section.
#define INSERT_GENTDEF( opname ) \
\
GENTDEF( float,    s, opname, _ft ) \
GENTDEF( double,   d, opname, _ft ) \
GENTDEF( scomplex, c, opname, _ft ) \
GENTDEF( dcomplex, z, opname, _ft ) \
\
GENTDEF( void,     s, opname, _vft ) \
GENTDEF( void,     d, opname, _vft ) \
GENTDEF( void,     c, opname, _vft ) \
GENTDEF( void,     z, opname, _vft ) \
\
GENTDEF( void,      , opname, _vft )

// -- function typedef macro (both typed and void) with real projection --

// Same shape as INSERT_GENTDEF, with the real-projection type and its type
// character passed as additional arguments.
#define INSERT_GENTDEFR( opname ) \
\
GENTDEFR( float,    float,  s, s, opname, _ft ) \
GENTDEFR( double,   double, d, d, opname, _ft ) \
GENTDEFR( scomplex, float,  c, s, opname, _ft ) \
GENTDEFR( dcomplex, double, z, d, opname, _ft ) \
\
GENTDEFR( void,     void,   s, s, opname, _vft ) \
GENTDEFR( void,     void,   d, d, opname, _vft ) \
GENTDEFR( void,     void,   c, s, opname, _vft ) \
GENTDEFR( void,     void,   z, d, opname, _vft ) \
\
GENTDEFR( void,     void,    ,  , opname, _vft )


#endif
// end bli_gentdef_macro_defs.h
// begin bli_gentfunc_macro_defs.h



#ifndef BLIS_GENTFUNC_MACRO_DEFS_H
#define BLIS_GENTFUNC_MACRO_DEFS_H

//
// -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------
//



// -- Macros for generating BLAS routines --------------------------------------


// -- Basic one-operand macro --
// INSERT_GENTFUNC_BLAS expands to one GENTFUNC() invocation per
// floating-point datatype character (s, d, c, z), forwarding both name
// arguments unchanged. The GENTFUNC* generator macros used throughout this
// section are expected to be defined at the point of expansion (none is
// defined in this part of the header).

#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \
\
GENTFUNC( float, s, blasname, blisname ) \
GENTFUNC( double, d, blasname, blisname ) \
GENTFUNC( scomplex, c, blasname, blisname ) \
GENTFUNC( dcomplex, z, blasname, blisname )


// -- Basic one-operand macro with real domain only --
// Instantiates GENTFUNCRO() for the real datatypes (s, d) only.

#define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \
\
GENTFUNCRO( float, s, blasname, blisname ) \
GENTFUNCRO( double, d, blasname, blisname )


// -- Basic one-operand macro with complex domain only and real projection --
// Instantiates GENTFUNCCO() for the complex datatypes (c, z); each invocation
// also receives the matching real type and character (float/s, double/d).

#define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \
\
GENTFUNCCO( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCCO( dcomplex, double, z, d, blasname, blisname )


// -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) --
// The third GENTFUNCDOT() argument is intentionally empty for the real types
// and the conjugation argument is always BLIS_NO_CONJUGATE.

#define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
\
GENTFUNCDOT( float, s, , BLIS_NO_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( double, d, , BLIS_NO_CONJUGATE, blasname, blisname )


// -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) --
// Each complex type is instantiated twice: once with suffix character 'c'
// paired with BLIS_CONJUGATE and once with 'u' paired with BLIS_NO_CONJUGATE.

#define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \
\
GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE, blasname, blisname ) \
GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname )


// -- Basic one-operand macro with conjugation (used only for dot, ger) --
// Union of the real-only and complex-only inserters above (2 + 4 invocations).

#define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \
\
INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \
INSERT_GENTFUNCDOTC_BLAS( blasname, blisname )


// -- Basic one-operand macro with real projection --
// The real instances (s, d) receive rblasname while the complex instances
// (c, z) receive cblasname; every instance also gets its real-projection
// type and character.

#define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \
\
GENTFUNCR( float, float, s, s, rblasname, blisname ) \
GENTFUNCR( double, double, d, d, rblasname, blisname ) \
GENTFUNCR( scomplex, float, c, s, cblasname, blisname ) \
GENTFUNCR( dcomplex, double, z, d, cblasname, blisname )


// -- Alternate
// two-operand macro (one char for complex, one for real proj) --
// For the real types the fourth GENTFUNCR2() argument (the real-projection
// character) is intentionally left empty; for complex types it is s or d.

#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
\
GENTFUNCR2( float, float, s, , blasname, blisname ) \
GENTFUNCR2( double, double, d, , blasname, blisname ) \
GENTFUNCR2( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )


// -- Extended two-operand macro (used only for scal) --
// Six instances: four where both type arguments match (fourth argument
// empty) and two mixed ones pairing a complex first type with its real
// counterpart (c with s, z with d).

#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
\
GENTFUNCSCAL( float, float, s, , blasname, blisname ) \
GENTFUNCSCAL( double, double, d, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \
GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname )


// -- Macros for functions with one operand ------------------------------------


// -- Basic one-operand macro --
// Each INSERT_GENTFUNC_BASIC* variant invokes GENTFUNC() once per datatype
// (s, d, c, z); the variants differ only in how many auxiliary arguments are
// forwarded after tfuncname.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
\
GENTFUNC( float, s, tfuncname ) \
GENTFUNC( double, d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
\
GENTFUNC( float, s, tfuncname, varname ) \
GENTFUNC( double, d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand with real projection --
// Each invocation passes the datatype together with its real-projection type
// and character: s/s, d/d, c/s, z/d.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
\
GENTFUNCR( float, float, s, s, tfuncname ) \
GENTFUNCR( double, double, d, d, tfuncname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname ) \
GENTFUNCR( double, double, d, d, tfuncname, varname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
// arguments) --

#define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand macro with real domain only --
// Only the real datatypes (s, d) are instantiated. The last entry of each
// macro deliberately carries no trailing backslash, so the definition ends
// on that line regardless of what follows it.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \
\
GENTFUNCRO( float, s, tfuncname ) \
GENTFUNCRO( double, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \
\
GENTFUNCRO( float, s, tfuncname, varname ) \
GENTFUNCRO( double, d, tfuncname, varname )


// -- Basic one-operand macro with complex domain only and real projection --
// Only the complex datatypes (c, z) are instantiated, each with its real
// counterpart (float/s, double/d).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )


// -- Basic one-operand macro with integer instance --
// Same as the basic one-operand inserters plus a fifth GENTFUNC() instance
// for gint_t with type character i.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \
\
GENTFUNC( float, s, tfuncname ) \
GENTFUNC( double, d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname ) \
GENTFUNC( gint_t, i, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \
\
GENTFUNC( float, s, tfuncname, varname ) \
GENTFUNC( double, d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname ) \
GENTFUNC( gint_t, i, tfuncname, varname )


// -- Basic one-operand with integer projection --
// Every datatype is paired with gint_t / i as its second type and character.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCI_BASIC0( tfuncname ) \
\
GENTFUNCI( float, gint_t, s, i, tfuncname ) \
GENTFUNCI( double, gint_t, d, i, tfuncname ) \
GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \
GENTFUNCI( dcomplex, gint_t, z, i, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \
\
GENTFUNCI( float, gint_t, s, i, tfuncname, varname ) \
GENTFUNCI( double, gint_t, d, i, tfuncname, varname ) \
GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \
GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname )


// -- Basic one-operand with real and integer projections --
// Each instance passes the datatype, its real-projection type, and gint_t,
// followed by the three matching characters.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \
\
GENTFUNCRI( float, float, gint_t, s, s, i, tfuncname ) \
GENTFUNCRI( double, double, gint_t, d, d, i, tfuncname ) \
GENTFUNCRI( scomplex, float, gint_t, c, s, i, tfuncname ) \
GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname )


// -- Macros for functions with two primary operands ---------------------------


// -- Basic two-operand macro --
// Both operands share the same datatype (ss, dd, cc, zz).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_BASIC0( tfuncname ) \
\
GENTFUNC2( float, float, s, s, tfuncname ) \
GENTFUNC2( double, double, d, d, tfuncname ) \
GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \
GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \
\
GENTFUNC2( float, float, s, s, tfuncname, varname ) \
GENTFUNC2( double, double, d, d, tfuncname, varname ) \
GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \
GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname )


// -- Mixed domain two-operand macro --
// Operand pairs that differ in domain but not precision (sc, cs, dz, zd).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \
\
GENTFUNC2( float, scomplex, s, c, tfuncname ) \
GENTFUNC2( scomplex, float, c, s, tfuncname ) \
\
GENTFUNC2( double, dcomplex, d, z, tfuncname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \
\
GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \
GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \
\
GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname, varname )


// -- Mixed precision two-operand macro --
// The eight operand pairs whose precisions differ. The final entry of each
// macro carries no trailing backslash so the definition cannot absorb the
// line that follows it.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \
\
GENTFUNC2( float, double, s, d, tfuncname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname ) \
\
GENTFUNC2( double, float, d, s, tfuncname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname ) \
\
GENTFUNC2( scomplex, double, c, d, tfuncname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \
\
GENTFUNC2( float, double, s, d, tfuncname, varname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTFUNC2( double, float, d, s, tfuncname, varname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \
\
GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro --
// All twelve operand pairs whose two datatypes differ.
// NOTE: the zero-argument form is spelled MIXDP0 (no underscore before DP),
// unlike the one-argument form MIX_DP below.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIXDP0( tfuncname ) \
\
GENTFUNC2( float, double, s, d, tfuncname ) \
GENTFUNC2( float, scomplex, s, c, tfuncname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname ) \
\
GENTFUNC2( double, float, d, s, tfuncname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname ) \
GENTFUNC2( double, dcomplex, d, z, tfuncname ) \
\
GENTFUNC2( scomplex, float, c, s, tfuncname ) \
GENTFUNC2( scomplex, double, c, d, tfuncname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \
\
GENTFUNC2( float, double, s, d, tfuncname, varname ) \
GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTFUNC2( double, float, d, s, tfuncname, varname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \
GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \
\
GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Basic two-operand with real projection of second operand --
// The third type/character argument is the real counterpart of the second
// operand's datatype (s->s, d->d, c->s, z->d).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \
\
GENTFUNC2R( float, float, float, s, s, s, tfuncname ) \
GENTFUNC2R( double, double, double, d, d, d, tfuncname ) \
GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname ) \
GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \
\
GENTFUNC2R( float, float, float, s, s, s, tfuncname, varname ) \
GENTFUNC2R( double, double, double, d, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )


// -- Mixed domain two-operand with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \
\
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \
\
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \
\
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname )


// -- Mixed precision two-operand with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \
\
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \
\
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro with real projection of second operand --
// NOTE: as with GENTFUNC2, the zero-argument form is spelled MIXDP0. The
// final entry of each macro carries no trailing backslash.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \
\
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
\
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname )


// -- Macros for functions with three primary operands -------------------------


// -- Basic three-operand macro --
// All three operands share the same datatype (sss, ddd, ccc, zzz).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_BASIC0( tfuncname ) \
\
GENTFUNC3( float, float, float, s, s, s, tfuncname ) \
GENTFUNC3( double, double, double, d, d, d, tfuncname ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \
\
GENTFUNC3( float, float, float, s, s, s, tfuncname, varname ) \
GENTFUNC3( double, double, double, d, d, d, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, float, float, s, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, double, double, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 )


// -- Mixed domain three-operand macro --
// The twelve operand triples of uniform precision whose domains are not all
// the same (six single-precision, six double-precision).

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \
\
GENTFUNC3( float, float, scomplex, s, s, c, tfuncname ) \
GENTFUNC3( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname ) \
\
GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname ) \
GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname ) \
GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname ) \
\
GENTFUNC3( scomplex, float, float, c, s, s, tfuncname ) \
GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname ) \
GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname ) \
\
GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname ) \
GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \
\
GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname ) \
GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname ) \
\
GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname ) \
GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname ) \
GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname1, varname2 )


// -- Mixed precision three-operand macro --
// The 48 operand triples in which at least two operands differ in precision.
// Together with the 4 basic and 12 mixed-domain triples above these cover
// all 64 combinations of (s, d, c, z) over three operands.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \
\
GENTFUNC3( float, float, double, s, s, d, tfuncname ) \
GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname ) \
\
GENTFUNC3( float, double, float, s, d, s, tfuncname ) \
GENTFUNC3( float, double, double, s, d, d, tfuncname ) \
GENTFUNC3( float, double, scomplex, s, d, c, tfuncname ) \
GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname ) \
\
GENTFUNC3( float, scomplex, double, s, c, d, tfuncname ) \
GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname ) \
\
GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname ) \
GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname ) \
GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname ) \
GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname ) \
\
\
GENTFUNC3( double, float, float, d, s, s, tfuncname ) \
GENTFUNC3( double, float, double, d, s, d, tfuncname ) \
GENTFUNC3( double, float, scomplex, d, s, c, tfuncname ) \
GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname ) \
\
GENTFUNC3( double, double, float, d, d, s, tfuncname ) \
GENTFUNC3( double, double, scomplex, d, d, c, tfuncname ) \
\
GENTFUNC3( double, scomplex, float, d, c, s, tfuncname ) \
GENTFUNC3( double, scomplex, double, d, c, d, tfuncname ) \
GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname ) \
GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname ) \
\
GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname ) \
GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname ) \
\
\
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname ) \
GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname ) \
\
GENTFUNC3( scomplex, double, float, c, d, s, tfuncname ) \
GENTFUNC3( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname ) \
GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname ) \
\
GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \
\
GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \
\
\
GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname ) \
GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname ) \
GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname ) \
\
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \
\
GENTFUNC3( float, float, double, s, s, d, tfuncname, varname ) \
GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname ) \
\
GENTFUNC3( float, double, float, s, d, s, tfuncname, varname ) \
GENTFUNC3( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname ) \
GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname ) \
\
GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname ) \
GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname ) \
\
GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname ) \
GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname ) \
GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname ) \
GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname ) \
\
\
GENTFUNC3( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC3( double, float, double, d, s, d, tfuncname, varname ) \
GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname ) \
GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname ) \
\
GENTFUNC3( double, double, float, d, d, s, tfuncname, varname ) \
GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname ) \
\
GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname ) \
GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname ) \
GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname ) \
GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname ) \
\
GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname ) \
GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname ) \
\
\
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname ) \
GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname ) \
GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname ) \
GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \
\
\
GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, float, double, s, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, double, float, s, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, double, double, s, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( double, float, float, d, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, float, double, d, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, double, float, d, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 )


// -- Basic three-operand with union of operands 1 and 2 --
// GENTFUNC3U12() receives a fourth type/character in addition to the three
// operand datatypes; in the basic case all four are identical.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname1, varname2 )


// -- Mixed domain three-operand with union of operands 1 and 2 --
// In every entry below the fourth type is complex whenever either of the
// first two operand types is complex, and real otherwise.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z,
d, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \ GENTFUNC3U12( float, 
double, dcomplex, double, s, d, z, d, tfuncname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, 
dcomplex, scomplex, c, c, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, 
dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, 
tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, 
double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( 
scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, 
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 ) #endif // end bli_gentfunc_macro_defs.h // begin bli_gentprot_macro_defs.h #ifndef BLIS_GENTPROT_MACRO_DEFS_H #define BLIS_GENTPROT_MACRO_DEFS_H // // -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS ----------------------------- // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- #define INSERT_GENTPROT_BLAS( blasname ) \ \ GENTPROT( float, s, blasname ) \ GENTPROT( double, d, blasname ) \ GENTPROT( scomplex, c, blasname ) \ GENTPROT( dcomplex, z, blasname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTPROTRO_BLAS( blasname ) \ \ GENTPROTRO( float, s, blasname ) \ GENTPROTRO( double, d, blasname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTPROTCO_BLAS( blasname ) \ \ GENTPROTCO( scomplex, float, c, s, blasname ) \ GENTPROTCO( dcomplex, double, z, d, blasname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTR_BLAS( blasname ) \ \ GENTPROTDOT( float, s, , blasname ) \ GENTPROTDOT( double, d, , blasname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTC_BLAS( blasname ) \ \ GENTPROTDOT( scomplex, c, c, blasname ) \ GENTPROTDOT( scomplex, c, u, blasname ) \ GENTPROTDOT( dcomplex, z, c, blasname ) \ GENTPROTDOT( dcomplex, z, u, blasname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTPROTDOT_BLAS( blasname ) \ \ INSERT_GENTPROTDOTR_BLAS( blasname ) \ INSERT_GENTPROTDOTC_BLAS( blasname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTPROTR_BLAS( rblasname, 
cblasname ) \ \ GENTPROTR( float, float, s, s, rblasname ) \ GENTPROTR( double, double, d, d, rblasname ) \ GENTPROTR( scomplex, float, c, s, cblasname ) \ GENTPROTR( dcomplex, double, z, d, cblasname ) // -- Alternate two-operand macro (one char for complex, one for real proj) -- #define INSERT_GENTPROTR2_BLAS( blasname ) \ \ GENTPROTR2( float, float, , s, blasname ) \ GENTPROTR2( double, double, , d, blasname ) \ GENTPROTR2( scomplex, float, c, s, blasname ) \ GENTPROTR2( dcomplex, double, z, d, blasname ) // -- Extended two-operand macro (used only for scal) -- #define INSERT_GENTPROTSCAL_BLAS( blasname ) \ \ GENTPROTSCAL( float, float, , s, blasname ) \ GENTPROTSCAL( double, double, , d, blasname ) \ GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \ GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \ GENTPROTSCAL( float, scomplex, s, c, blasname ) \ GENTPROTSCAL( double, dcomplex, d, z, blasname ) // -- Macros for functions with one operand ------------------------------------ // -- Basic one-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0( tfuncname ) \ \ GENTPROT( float, s, tfuncname ) \ GENTPROT( double, d, tfuncname ) \ GENTPROT( scomplex, c, tfuncname ) \ GENTPROT( dcomplex, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2 ) \ GENTPROT( double, d, tfuncname, varname1, varname2 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROT( float, s, tfuncname, 
varname1, varname2, varname3 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand with real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC0( tfuncname ) \ \ GENTPROTR( float, float, s, s, tfuncname ) \ GENTPROTR( double, double, d, d, tfuncname ) \ GENTPROTR( scomplex, float, c, s, tfuncname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname ) \ GENTPROTR( double, double, d, d, tfuncname, varname ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 
) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC0( tfuncname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0_I( funcname ) \ \ GENTPROT( float, s, funcname ) \ GENTPROT( double, d, funcname ) \ GENTPROT( scomplex, c, funcname ) \ GENTPROT( dcomplex, z, funcname ) \ GENTPROT( gint_t, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) \ GENTPROT( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTPROTI_BASIC0( funcname ) \ \ GENTPROTI( float, gint_t, s, i, funcname ) \ GENTPROTI( double, gint_t, d, i, funcname ) \ GENTPROTI( scomplex, gint_t, c, i, funcname ) \ GENTPROTI( dcomplex, gint_t, z, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \ \ GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \ GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \ GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTRI_BASIC( funcname ) \ \ GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \ GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \ GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \ GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_BASIC0( funcname ) \ \ GENTPROT2( float, float, s, s, funcname ) \ GENTPROT2( double, double, d, d, funcname ) \ GENTPROT2( scomplex, scomplex, c, c, funcname ) \ GENTPROT2( dcomplex, dcomplex, z, z, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \ \ GENTPROT2( float, float, s, s, tfuncname, varname ) \ GENTPROT2( double, double, d, d, tfuncname, varname ) \ GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_D0( funcname ) \ \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( scomplex, float, c, s, funcname ) \ \ GENTPROT2( double, dcomplex, d, z, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_D( 
tfuncname, varname ) \
\
GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
\
GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
GENTPROT2( dcomplex, double, z, d, tfuncname, varname )


// -- Mixed precision two-operand macro --

// Each entry of the two macros below passes GENTPROT2 two C type names, the
// two matching one-letter type codes (s = float, d = double, c = scomplex,
// z = dcomplex) and then the caller's own arguments. The eight pairs listed
// are exactly the ordered pairs whose operands differ in precision (one
// single-precision type, one double-precision type).
// GENTPROT2 is not defined in this part of the file; presumably the including
// code defines it before invoking these macros -- confirm at the use sites.
// NOTE: the last entry of each list must not end in a backslash, otherwise
// the definition is spliced onto whatever line follows it.

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_MIX_P0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
\
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --

#define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \
\
GENTPROT2( float, double, s, d, tfuncname, varname ) \
GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double, float, d, s, tfuncname, varname ) \
GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
\
GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTPROT2_MIXDP0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, scomplex, s, c, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
GENTPROT2( double, dcomplex, d, z, funcname ) \
\
GENTPROT2( scomplex, float, c, s, funcname ) \
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, double, z, d, funcname ) \
GENTPROT2( 
dcomplex, scomplex, z, c, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_BASIC0( funcname ) \ \ GENTPROT2R( float, float, float, s, s, s, funcname ) \ GENTPROT2R( double, double, double, d, d, d, funcname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \ \ GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) 
// --

// Expands to one GENTPROT2R invocation (with an extra varname argument) for
// each float/scomplex and double/dcomplex pairing listed below.
#define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \
\
GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname )

// -- Mixed precision two-operand with real projection of first operand --

// -- (no auxiliary arguments) --

// Expands to one GENTPROT2R invocation per listed pairing; the third type
// argument is float or double in every row.
#define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname )

// -- (one auxiliary argument) --

// Same type pairings as INSERT_GENTPROT2R_MIX_P0, with varname forwarded as
// an additional argument.
#define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname )

// -- Macros for functions with three primary operands -------------------------

// -- Basic three-operand macro --

// Expands GENTPROT3 once per type, with all three operands sharing that type.
#define INSERT_GENTPROT3_BASIC( funcname ) \
\
GENTPROT3( float, float, float, s, s, s, funcname ) \
GENTPROT3( double, double, double, d, d, d, funcname ) \
GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \
GENTPROT3( dcomplex, dcomplex, dcomplex, z, z, z, funcname )

// -- Mixed domain three-operand macro --

// Expands GENTPROT3 for the listed combinations that mix float with scomplex,
// or double with dcomplex (the uniform combinations are in the BASIC macro).
#define INSERT_GENTPROT3_MIX_D( funcname ) \
\
GENTPROT3( float, float, scomplex, s, s, c, funcname ) \
GENTPROT3( float, scomplex, float, s, c, s, funcname ) \
GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \
\
GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \
GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \
GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \
\
GENTPROT3( scomplex, float, float, c, s, s, funcname ) \
GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \
GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \
\
GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \
GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \
GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname )

// -- Mixed precision three-operand macro --

// Expands GENTPROT3 for the listed combinations that mix s/c types with d/z
// types.
// NOTE: the final row below ends with a line-continuation backslash, so the
// (blank) line that follows it is part of this macro definition and must stay
// blank.
#define INSERT_GENTPROT3_MIX_P( funcname ) \
\
GENTPROT3( float, float, double, s, s, d, funcname ) \
GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \
\
GENTPROT3( float, double, float, s, d, s, funcname ) \
GENTPROT3( float, double, double, s, d, d, funcname ) \
GENTPROT3( float, double, scomplex, s, d, c, funcname ) \
GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \
\
GENTPROT3( float, scomplex, double, s, c, d, funcname ) \
GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \
\
GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \
GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \
GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \
GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \
\
\
GENTPROT3( double, float, float, d, s, s, funcname ) \
GENTPROT3( double, float, double, d, s, d, funcname ) \
GENTPROT3( double, float, scomplex, d, s, c, funcname ) \
GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \
\
GENTPROT3( double, double, float, d, d, s, funcname ) \
GENTPROT3( double, double, scomplex, d, d, c, funcname ) \
\
GENTPROT3( double, scomplex, float, d, c, s, funcname ) \
GENTPROT3( double, scomplex, double, d, c, d, funcname ) \
GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \
GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \
\
GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \
GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \
\
\
GENTPROT3( scomplex, float, double, c, s, d, funcname ) \
GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \
\
GENTPROT3( scomplex, double, float, c, d, s, funcname ) \
GENTPROT3( scomplex, double, double, c, d, d, funcname ) \
GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \
GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \
\
GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \
GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \
\
GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \
GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \
GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \
GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \
\
\
GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \
GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \
GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \
GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \
\
GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \
GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \
\
GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \
GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \
GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \
GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \
\
GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \
GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname ) \

// -- Basic three-operand with union of operands 1 and 2 --

// Expands GENTPROT3U12 once per type; the fourth type/char argument is the
// "union" type of the first two operands (identical to them in this table).
#define INSERT_GENTPROT3U12_BASIC( funcname ) \
\
GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \
GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \
GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname )

// -- Mixed domain three-operand with union of operands 1 and 2 --

// Same type combinations as INSERT_GENTPROT3_MIX_D, each extended with a
// fourth (union) type argument.
#define INSERT_GENTPROT3U12_MIX_D( funcname ) \
\
GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \
GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \
GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \
\
GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \
GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \
GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \
\
GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \
GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \
GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \
\
GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \
GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \
GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname )

// -- Mixed precision three-operand with union of operands 1 and 2 --

// Same type combinations as INSERT_GENTPROT3_MIX_P, each extended with a
// fourth (union) type argument.
#define INSERT_GENTPROT3U12_MIX_P( funcname ) \
\
GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \
GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \
\
GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \
GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \
GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \
GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \
\
GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \
GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \
\
GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, z, funcname ) \
GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \
GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \
GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \
\
\
GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \
GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \
GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \
GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \
\
GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \
GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \
\
GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \
GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \
GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \
GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \
\
GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \
GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \
\
\
GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \
GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \
\
GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \
GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \
GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \
GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \
\
GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \
GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \
\
GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \
GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \
GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \
GENTPROT3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \
\
\
GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \
GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \
GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \
GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \
\
GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \
GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \
\
GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \
GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \
GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \
GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \
\
GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \
GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname )


#endif
// end bli_gentprot_macro_defs.h
// begin bli_misc_macro_defs.h


#ifndef BLIS_MISC_MACRO_DEFS_H
#define BLIS_MISC_MACRO_DEFS_H


// -- Miscellaneous macros --

// min, max, abs
// NOTE: These must remain macros since we don't know the types of a and b.
// Each argument appears more than once in the expansion, so arguments with
// side effects (e.g. i++) are evaluated more than once.
// bli_abs takes the negating branch when a compares equal to zero.

#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) )
#define bli_max( a, b ) ( (a) > (b) ? (a) : (b) )
#define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) )

// fmin, fmax, fabs
// NOTE: These must remain macros since we don't know the types of a and b.
// bli_fmin/bli_fmax simply forward to bli_min/bli_max; bli_fabs differs from
// bli_abs only in comparing against the floating-point literal 0.0.

#define bli_fmin( a, b ) bli_min( a, b )
#define bli_fmax( a, b ) bli_max( a, b )
#define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) )

// fminabs, fmaxabs
// NOTE: These must remain macros since we don't know the types of a and b.
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif // end bli_edge_case_macro_defs.h // begin bli_param_macro_defs.h #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ); } // pruning-related BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the left side of the matrix, // ignore the area above that intersection. if ( *diagoff < 0 ) { *m = *m + *diagoff; *offm_inc = - *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the bottom side of the matrix, // ignore the area to the right of that intersection. if ( *n > *diagoff + *m ) { *n = *diagoff + *m; } } BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the top side of the matrix, // ignore the area to the left of that intersection. if ( *diagoff > 0 ) { *n = *n - *diagoff; *offn_inc = + *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the right side of the matrix, // ignore the area below that intersection. 
if ( *m > -(*diagoff) + *n ) { *m = -(*diagoff) + *n; } } // thread range-related BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { *diagoff = *n - *diagoff - *m; bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { bli_swap_dims( m, n ); bli_negate_diag_offset( diagoff ); bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end ) { dim_t start2 = n - *start; dim_t end2 = n - *end; *start = end2; *end = start2; } // mdim_t-related BLIS_INLINE bool bli_is_m_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_M ); } BLIS_INLINE bool bli_is_n_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_N ); } BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim ) { return ( mdim_t ) ( mdim == BLIS_M ? BLIS_N : BLIS_M ); } BLIS_INLINE void bli_toggle_dim( mdim_t* mdim ) { *mdim = bli_dim_toggled( *mdim ); } // stor3_t-related BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b ) { // If any matrix is general-stored, return the stor3_t id for the // general-purpose sup microkernel. if ( bli_is_gen_stored( rs_c, cs_c ) || bli_is_gen_stored( rs_a, cs_a ) || bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX; // Otherwise, compute and return the stor3_t id as follows. 
const bool c_is_col = bli_is_col_stored( rs_c, cs_c ); const bool a_is_col = bli_is_col_stored( rs_a, cs_a ); const bool b_is_col = bli_is_col_stored( rs_b, cs_b ); return ( stor3_t )( 4 * c_is_col + 2 * a_is_col + 1 * b_is_col ); } BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id ) { #if 1 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )7, // BLIS_RRR = 0 -> BLIS_CCC = 7 ( stor3_t )5, // BLIS_RRC = 1 -> BLIS_CRC = 5 ( stor3_t )6, // BLIS_RCR = 2 -> BLIS_CCR = 6 ( stor3_t )4, // BLIS_RCC = 3 -> BLIS_CRR = 4 ( stor3_t )3, // BLIS_CRR = 4 -> BLIS_RCC = 3 ( stor3_t )1, // BLIS_CRC = 5 -> BLIS_RRC = 1 ( stor3_t )2, // BLIS_CCR = 6 -> BLIS_RCR = 2 ( stor3_t )0, // BLIS_CCC = 7 -> BLIS_RRR = 0 }; return map[id]; #else return ( ( id & 0x4 ) ^ 0x4 ) | // flip c bit ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 ); // flip a bit and move to b position #endif } BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )1, // BLIS_RRR = 0 -> BLIS_RRC = 1 ( stor3_t )0, // BLIS_RRC = 1 -> BLIS_RRR = 0 ( stor3_t )3, // BLIS_RCR = 2 -> BLIS_RCC = 3 ( stor3_t )2, // BLIS_RCC = 3 -> BLIS_RCR = 2 ( stor3_t )5, // BLIS_CRR = 4 -> BLIS_CRC = 5 ( stor3_t )4, // BLIS_CRC = 5 -> BLIS_CRR = 4 ( stor3_t )7, // BLIS_CCR = 6 -> BLIS_CCC = 7 ( stor3_t )6, // BLIS_CCC = 7 -> BLIS_CCR = 6 }; return map[id]; #else return ( stor3_t )( id ^ 0x1 ); #endif } BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )2, // BLIS_RRR = 0 -> BLIS_RCR = 2 ( stor3_t )3, // BLIS_RRC = 1 -> BLIS_RCC = 3 ( stor3_t )0, // BLIS_RCR = 2 -> BLIS_RRR = 0 ( stor3_t )1, // BLIS_RCC = 3 -> BLIS_RRC = 1 ( stor3_t )6, // BLIS_CRR = 4 -> BLIS_CCR = 6 ( stor3_t )7, // BLIS_CRC = 5 -> BLIS_CCC = 7 ( stor3_t )4, // BLIS_CCR = 6 -> BLIS_CRR = 4 ( stor3_t )5, // BLIS_CCC = 7 -> BLIS_CRC = 5 }; return map[id]; #else return ( stor3_t )( id ^ 0x2 ); #endif } // 
index-related BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == n_iter - 1 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != n_iter - 1 || n_left == 0 ); } BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == 0 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != 0 || n_left == 0 ); } BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 ); } BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) ); } BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { #ifdef BLIS_ENABLE_JRIR_SLAB return bli_is_last_iter_sl( i, end_iter, tid, nth ); #else // BLIS_ENABLE_JRIR_RR return bli_is_last_iter_rr( i, end_iter, tid, nth ); #endif } // packbuf_t-related BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type ) { return ( guint_t ) ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT ); } // pack_t-related BLIS_INLINE bool bli_is_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_is_row_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_is_col_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_is_panel_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE bool bli_is_1r_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R ); } BLIS_INLINE bool bli_is_1e_packed( pack_t schema ) { return ( bool ) ( ( schema & 
BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E ); } BLIS_INLINE bool bli_is_1m_packed( pack_t schema ) { return ( bool ) ( bli_is_1r_packed( schema ) || bli_is_1e_packed( schema ) ); } BLIS_INLINE bool bli_is_nat_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 ); } BLIS_INLINE bool bli_is_ind_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 ); } BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema ) { return ( guint_t ) ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT ); } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument. BLIS_INLINE void bli_set_dims_incs_uplo_1m ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument (without column-wise stride optimization). BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. 
if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions and increments for TWO matrix arguments. 
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); } BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); } BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); } // NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, // packbuf_t is stored/used from the context in order to support various // induced methods. (Though ideally the packbuf_t field would only be // present in the control tree). BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); } BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc ); } BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj ) { bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); } BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj ) { bli_obj_apply_conj( BLIS_CONJUGATE, obj ); } BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; } // Root matrix query BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) { return ( obj_t* )( obj->root ); } BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( 
bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); } // Root matrix modification BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) { obj->root = obj; } // Diagonal offset query BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) ); } // Diagonal offset modification BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off = ( doff_t )offset; } BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj ) { obj->diag_off = -(obj->diag_off); } BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off += ( doff_t )offset; } // Dimension query BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) { return ( obj->dim[ mdim ] ); } BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) : bli_obj_length( obj ) ); } BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) ); } BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 ); } // Stride/increment query BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) { return ( obj->rs ); } BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) { return ( obj->cs ); } BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) { return ( obj->is ); } BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); } // Note: The purpose of these functions is to obtain the length and width // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) ); } BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 ); } // Dimension modification BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj ) { obj->dim[ BLIS_M ] = m; } BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj ) { obj->dim[ BLIS_N ] = n; } BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) { obj->dim[ mdim ] = dim_val; } BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) { if ( bli_does_notrans( trans ) ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } else // if ( bli_does_trans( trans ) ) { bli_obj_set_length( n, obj ); bli_obj_set_width( m, obj ); } } // Stride/increment predicates // // NOTE: The following two macros differ from their non-obj counterparts // in that they do not identify m x 1 and 1 x n objects as row-stored and // column-stored, respectively, which is needed when considering packed // objects. But this is okay, since none of the invocations of these // "obj" macros are used on packed matrices. 
// BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); } // Stride/increment modification BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) { obj->rs = rs; } BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) { obj->cs = cs; } BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_row_stride( rs, obj ); bli_obj_set_col_stride( cs, obj ); } BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) { obj->is = is; } // Offset query BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) { return ( obj->off[ mdim ] ); } // Offset modification BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] = offset; } BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_set_off( BLIS_M, offm, obj ); bli_obj_set_off( BLIS_N, offn, obj ); } BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] += offset; } BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_inc_off( BLIS_M, offm, obj ); bli_obj_inc_off( BLIS_N, offn, obj ); } // Diagonal offset predicates BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) { return ( 
bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); } // Buffer address query BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) { return ( void* ) ( obj->buffer ); } // Buffer address modification BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) { obj->buffer = p; } // Bufferless scalar field query BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); } // Bufferless scalar field modification BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); } // Element size modification BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) { obj->elem_size = size; } // Packed matrix info query BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) { return ( obj->m_padded ); } BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) { return ( obj->n_padded ); } // Packed matrix info modification BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj ) { obj->m_padded = m; } BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj ) { obj->n_padded = n; } BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) { 
bli_obj_set_padded_length( m, obj ); bli_obj_set_padded_width( n, obj ); } // Packed panel info query BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) { return ( obj->m_panel ); } BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) { return ( obj->n_panel ); } BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) { return ( obj->pd ); } BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) { return ( obj->ps ); } // Packed panel info modification BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj ) { obj->m_panel = m; } BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj ) { obj->n_panel = n; } BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_panel_length( m, obj ); bli_obj_set_panel_width( n, obj ); } BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) { obj->pd = pd; } BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) { obj->ps = ps; } // stor3_t-related BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); inc_t rs_a, cs_a; inc_t rs_b, cs_b; if ( bli_obj_has_notrans( a ) ) { rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else { rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else { rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b ); } // -- User-provided information macros -- // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { return obj->pack_fn; } BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker_fn; } BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { return obj->ker_params; } // Function pointer modification 
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) { const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); bli_obj_set_pack_schema( schema_b, a ); bli_obj_set_pack_schema( schema_a, b ); } // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. BLIS_INLINE void bli_obj_induce_trans( obj_t* obj ) { // Induce transposition among basic fields. dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); if ( bli_obj_is_upper_or_lower( obj ) ) bli_obj_toggle_uplo( obj ); // Induce transposition among packed fields. dim_t m_padded = bli_obj_padded_length( obj ); dim_t n_padded = bli_obj_padded_width( obj ); dim_t m_panel = bli_obj_panel_length( obj ); dim_t n_panel = bli_obj_panel_width( obj ); bli_obj_set_padded_dims( n_padded, m_padded, obj ); bli_obj_set_panel_dims( n_panel, m_panel, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj ) { // NOTE: This function is only used in situations where the matrices // are guaranteed to not have structure or be packed. // Induce transposition among basic fields. 
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif // end bli_obj_macro_defs.h // begin bli_complex_macro_defs.h #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define 
bli_zimag( x ) ( cimag(x) ) #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_complex_macro_defs.h // begin bli_scalar_macro_defs.h #ifndef BLIS_SCALAR_MACRO_DEFS_H #define BLIS_SCALAR_MACRO_DEFS_H // -- Assignment/Accessor macros -- // NOTE: This macro is defined first since some of the other scalar macros // use it to abstract away the method used to assign complex values (ie: // whether fields of a struct are set directly or whether native C99 // assignment is used). // begin bli_sets.h #ifndef BLIS_SETS_H #define BLIS_SETS_H // sets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sssets( xr, xi, y ) { (y) = (xr); } #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } #define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } #define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, 
xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif // end bli_sets.h // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. // begin bli_setrs.h #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Real destinations: y is a real scalar, so xr is assigned to it directly.
#define bli_sssetrs( xr, y ) { (y) = (xr); }
#define bli_dssetrs( xr, y ) { (y) = (xr); }

#define bli_sdsetrs( xr, y ) { (y) = (xr); }
#define bli_ddsetrs( xr, y ) { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, struct representation: only the real member of y is
// written.
#define bli_scsetrs( xr, y ) { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y ) { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y ) { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y ) { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, C99 representation: y is rebuilt from the new real
// part and its own current imaginary part.
#define bli_scsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands, named after the type of y.
#define bli_ssetrs( xr, y ) bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y ) bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y ) bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y ) bli_dzsetrs( xr, y )

#endif
// end bli_setrs.h
// begin bli_setis.h


#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Real destinations: a real y has no imaginary part to set, so these expand
// to an empty statement and neither xi nor y is evaluated.
#define bli_sssetis( xi, y ) { ; }
#define bli_dssetis( xi, y ) { ; }

#define bli_sdsetis( xi, y ) { ; }
#define bli_ddsetis( xi, y ) { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, struct representation: only the imaginary member of
// y is written.
#define bli_scsetis( xi, y ) { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y ) { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y ) { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y ) { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations, C99 representation: y is rebuilt from its own
// current real part and the new imaginary part.
#define bli_scsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands, named after the type of y.
#define bli_ssetis( xi, y ) bli_sssetis( xi, y )
#define bli_dsetis( xi, y ) bli_ddsetis( xi, y )
#define bli_csetis( xi, y ) bli_scsetis( xi, y )
#define bli_zsetis( xi, y ) bli_dzsetis( xi, y )

#endif
// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h


#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_ssgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; } #define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; } #define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; } #define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; } #define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; } #define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi ) #define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi ) #define bli_cgets( x, yr, yi ) 
bli_csgets( x, yr, yi ) #define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi ) #define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi ) #endif // end bli_gets.h // -- Scalar constant initialization macros -- // begin bli_constants.h #ifndef BLIS_CONSTANTS_H #define BLIS_CONSTANTS_H // return pointers to constants // 1 #define bli_s1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ONE ) ) #define bli_d1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ONE ) ) #define bli_c1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) ) #define bli_z1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) ) #define bli_i1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ONE ) ) // 0 #define bli_s0 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ZERO ) ) #define bli_d0 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ZERO ) ) #define bli_c0 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) ) #define bli_z0 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) ) #define bli_i0 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ZERO ) ) // -1 #define bli_sm1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_MINUS_ONE ) ) #define bli_dm1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_MINUS_ONE ) ) #define bli_cm1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_zm1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_im1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_MINUS_ONE ) ) #endif // end bli_constants.h // -- Separated scalar macros (separated real/imaginary values) -- // begin bli_absq2ris.h #ifndef BLIS_ABSQ2RIS_H #define BLIS_ABSQ2RIS_H // absq2ris #define bli_sabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define bli_dabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define 
bli_cabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0F; \ } #define bli_zabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0; \ } #endif // end bli_absq2ris.h // begin bli_abval2ris.h #ifndef BLIS_ABVAL2RIS_H #define BLIS_ABVAL2RIS_H // abval2ris #define bli_sabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabsf(xr); \ } #define bli_dabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabs(xr); \ } #define bli_cabval2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0F; \ } #define bli_zabval2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0; \ } #endif // end bli_abval2ris.h // begin bli_addris.h #ifndef BLIS_ADDRIS_H #define BLIS_ADDRIS_H // addris #define bli_saddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_daddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_caddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #define bli_zaddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #endif // end bli_addris.h // begin bli_addjris.h #ifndef BLIS_ADDJRIS_H #define BLIS_ADDJRIS_H // addjris #define bli_saddjris( ar, ai, xr, xi ) bli_saddris( (ar), -(ai), (xr), (xi) ) #define bli_daddjris( ar, ai, xr, xi ) bli_daddris( (ar), -(ai), (xr), (xi) ) #define bli_caddjris( ar, ai, xr, xi ) bli_caddris( (ar), -(ai), (xr), (xi) ) #define bli_zaddjris( ar, ai, xr, xi ) bli_zaddris( (ar), -(ai), (xr), (xi) ) #endif // end bli_addjris.h // begin bli_add3ris.h #ifndef BLIS_ADD3RIS_H #define BLIS_ADD3RIS_H // add3ris #define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_dadd3ris( ar, 
ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #endif // end bli_add3ris.h // begin bli_axpbyris.h #ifndef BLIS_AXPBYRIS_H #define BLIS_AXPBYRIS_H // axpbyris #define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyris bli_rxxpbyris #define bli_dsssxpbyris bli_rxxpbyris #define bli_csssxpbyris bli_rxxpbyris #define bli_zsssxpbyris bli_rxxpbyris #define bli_sdssxpbyris bli_rxxpbyris #define bli_ddssxpbyris bli_rxxpbyris #define bli_cdssxpbyris bli_rxxpbyris #define bli_zdssxpbyris bli_rxxpbyris #define bli_scssxpbyris bli_rxxpbyris #define bli_dcssxpbyris bli_rxxpbyris #define bli_ccssxpbyris bli_rxxpbyris #define bli_zcssxpbyris bli_rxxpbyris #define bli_szssxpbyris bli_rxxpbyris #define bli_dzssxpbyris bli_rxxpbyris #define bli_czssxpbyris bli_rxxpbyris #define bli_zzssxpbyris bli_rxxpbyris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyris. 
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
// Same-datatype convenience aliases for axpbyjris.
#define bli_saxpbyjris bli_ssssaxpbyjris
#define bli_daxpbyjris bli_ddddaxpbyjris
#define bli_caxpbyjris bli_ccccaxpbyjris
#define bli_zaxpbyjris bli_zzzzaxpbyjris

#endif
// end bli_axpbyjris.h
// begin bli_axpyris.h
#ifndef BLIS_AXPYRIS_H
#define BLIS_AXPYRIS_H

// axpyris
//
// y += a * x on separate real/imaginary parts. The two-letter prefix selects
// which parts take part in the product and which parts of y are updated.

// rx: real parts only; yi is untouched.
#define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
}

// cx: full complex product accumulated into both yr and yi.
#define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) - (ai) * (xi); \
	(yi) += (ai) * (xr) + (ar) * (xi); \
}

// ro: real part of the complex product accumulated into yr only.
#define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) - (ai) * (xi); \
}

// cr: only ar is used from a; it scales both parts of x.
#define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ar) * (xi); \
}

// rc: only xr is used from x; it is scaled by both parts of a.
#define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyris bli_rxaxpyris
#define bli_dssaxpyris bli_rxaxpyris
#define bli_cssaxpyris bli_rxaxpyris
#define bli_zssaxpyris bli_rxaxpyris

#define bli_sdsaxpyris bli_rxaxpyris
#define bli_ddsaxpyris bli_rxaxpyris
#define bli_cdsaxpyris bli_rxaxpyris
#define bli_zdsaxpyris bli_rxaxpyris

#define bli_scsaxpyris bli_rxaxpyris
#define bli_dcsaxpyris bli_rxaxpyris
#define bli_ccsaxpyris bli_roaxpyris
#define bli_zcsaxpyris bli_roaxpyris

#define bli_szsaxpyris bli_rxaxpyris
#define bli_dzsaxpyris bli_rxaxpyris
#define bli_czsaxpyris bli_roaxpyris
#define bli_zzsaxpyris bli_roaxpyris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyris bli_rxaxpyris
#define bli_dsdaxpyris bli_rxaxpyris
#define bli_csdaxpyris bli_rxaxpyris
#define bli_zsdaxpyris bli_rxaxpyris

#define bli_sddaxpyris bli_rxaxpyris
#define bli_dddaxpyris bli_rxaxpyris
#define bli_cddaxpyris bli_rxaxpyris
#define bli_zddaxpyris bli_rxaxpyris

#define bli_scdaxpyris bli_rxaxpyris
#define bli_dcdaxpyris bli_rxaxpyris
#define bli_ccdaxpyris bli_roaxpyris
#define bli_zcdaxpyris bli_roaxpyris

#define bli_szdaxpyris bli_rxaxpyris
#define bli_dzdaxpyris bli_rxaxpyris
#define bli_czdaxpyris bli_roaxpyris
#define bli_zzdaxpyris bli_roaxpyris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyris bli_rxaxpyris
#define bli_dscaxpyris bli_rxaxpyris
#define bli_cscaxpyris bli_rcaxpyris
#define bli_zscaxpyris bli_rcaxpyris

#define bli_sdcaxpyris bli_rxaxpyris
#define bli_ddcaxpyris bli_rxaxpyris
#define bli_cdcaxpyris bli_rcaxpyris
#define bli_zdcaxpyris bli_rcaxpyris

#define bli_sccaxpyris bli_craxpyris
#define bli_dccaxpyris bli_craxpyris
#define bli_cccaxpyris bli_cxaxpyris
#define bli_zccaxpyris bli_cxaxpyris

#define bli_szcaxpyris bli_craxpyris
#define bli_dzcaxpyris bli_craxpyris
#define bli_czcaxpyris bli_cxaxpyris
#define bli_zzcaxpyris bli_cxaxpyris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyris bli_rxaxpyris
#define bli_dszaxpyris bli_rxaxpyris
#define bli_cszaxpyris bli_rcaxpyris
#define bli_zszaxpyris bli_rcaxpyris

#define bli_sdzaxpyris bli_rxaxpyris
#define bli_ddzaxpyris bli_rxaxpyris
#define bli_cdzaxpyris bli_rcaxpyris
#define bli_zdzaxpyris bli_rcaxpyris

#define bli_sczaxpyris bli_craxpyris
#define bli_dczaxpyris bli_craxpyris
#define bli_cczaxpyris bli_cxaxpyris
#define bli_zczaxpyris bli_cxaxpyris

#define bli_szzaxpyris bli_craxpyris
#define bli_dzzaxpyris bli_craxpyris
#define bli_czzaxpyris bli_cxaxpyris
#define bli_zzzaxpyris bli_cxaxpyris

// Same-datatype convenience aliases.
#define bli_saxpyris bli_sssaxpyris
#define bli_daxpyris bli_dddaxpyris
#define bli_caxpyris bli_cccaxpyris
#define bli_zaxpyris bli_zzzaxpyris

#endif
// end bli_axpyris.h
// begin bli_axpyjris.h
#ifndef BLIS_AXPYJRIS_H
#define BLIS_AXPYJRIS_H

// axpyjris
//
// y += a * conj(x) on separate real/imaginary parts. Same prefix scheme as
// axpyris; the xi terms carry the opposite sign.

// rx: real parts only; yi is untouched.
#define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
}

// cx: full complex product with conjugated x.
#define bli_cxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) + (ai) * (xi); \
	(yi) += (ai) * (xr) - (ar) * (xi); \
}

// ro: real part of the conjugated product accumulated into yr only.
#define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) + (ai) * (xi); \
}

// cr: only ar is used from a; the imaginary part of x is negated.
#define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ar) * -(xi); \
}

// rc: only xr is used from x, so this body matches the non-conjugating form.
#define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyjris bli_rxaxpyjris
#define bli_dssaxpyjris bli_rxaxpyjris
#define bli_cssaxpyjris bli_rxaxpyjris
#define bli_zssaxpyjris bli_rxaxpyjris

#define bli_sdsaxpyjris bli_rxaxpyjris
#define bli_ddsaxpyjris bli_rxaxpyjris
#define bli_cdsaxpyjris bli_rxaxpyjris
#define bli_zdsaxpyjris bli_rxaxpyjris

#define bli_scsaxpyjris bli_rxaxpyjris
#define bli_dcsaxpyjris bli_rxaxpyjris
#define bli_ccsaxpyjris bli_roaxpyjris
#define bli_zcsaxpyjris bli_roaxpyjris

#define bli_szsaxpyjris bli_rxaxpyjris
#define bli_dzsaxpyjris bli_rxaxpyjris
#define bli_czsaxpyjris bli_roaxpyjris
#define bli_zzsaxpyjris bli_roaxpyjris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyjris bli_rxaxpyjris
#define bli_dsdaxpyjris bli_rxaxpyjris
#define bli_csdaxpyjris bli_rxaxpyjris
#define bli_zsdaxpyjris bli_rxaxpyjris

#define bli_sddaxpyjris bli_rxaxpyjris
#define bli_dddaxpyjris bli_rxaxpyjris
#define bli_cddaxpyjris bli_rxaxpyjris
#define bli_zddaxpyjris bli_rxaxpyjris

#define bli_scdaxpyjris bli_rxaxpyjris
#define bli_dcdaxpyjris bli_rxaxpyjris
#define bli_ccdaxpyjris bli_roaxpyjris
#define bli_zcdaxpyjris bli_roaxpyjris

#define bli_szdaxpyjris bli_rxaxpyjris
#define bli_dzdaxpyjris bli_rxaxpyjris
#define bli_czdaxpyjris bli_roaxpyjris
#define bli_zzdaxpyjris bli_roaxpyjris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjris bli_rxaxpyjris
#define bli_dscaxpyjris bli_rxaxpyjris
#define bli_cscaxpyjris bli_rcaxpyjris
#define bli_zscaxpyjris bli_rcaxpyjris

#define bli_sdcaxpyjris bli_rxaxpyjris
#define bli_ddcaxpyjris bli_rxaxpyjris
#define bli_cdcaxpyjris bli_rcaxpyjris
#define bli_zdcaxpyjris bli_rcaxpyjris

#define bli_sccaxpyjris bli_craxpyjris
#define bli_dccaxpyjris bli_craxpyjris
#define bli_cccaxpyjris bli_cxaxpyjris
#define bli_zccaxpyjris bli_cxaxpyjris

#define bli_szcaxpyjris bli_craxpyjris
#define bli_dzcaxpyjris bli_craxpyjris
#define bli_czcaxpyjris bli_cxaxpyjris
#define bli_zzcaxpyjris bli_cxaxpyjris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjris bli_rxaxpyjris
#define bli_dszaxpyjris bli_rxaxpyjris
#define bli_cszaxpyjris bli_rcaxpyjris
#define bli_zszaxpyjris bli_rcaxpyjris

#define bli_sdzaxpyjris bli_rxaxpyjris
#define bli_ddzaxpyjris bli_rxaxpyjris
#define bli_cdzaxpyjris bli_rcaxpyjris
#define bli_zdzaxpyjris bli_rcaxpyjris

#define bli_sczaxpyjris bli_craxpyjris
#define bli_dczaxpyjris bli_craxpyjris
#define bli_cczaxpyjris bli_cxaxpyjris
#define bli_zczaxpyjris bli_cxaxpyjris

#define bli_szzaxpyjris bli_craxpyjris
#define bli_dzzaxpyjris bli_craxpyjris
#define bli_czzaxpyjris bli_cxaxpyjris
#define bli_zzzaxpyjris bli_cxaxpyjris

// Same-datatype convenience aliases.
#define bli_saxpyjris bli_sssaxpyjris
#define bli_daxpyjris bli_dddaxpyjris
#define bli_caxpyjris bli_cccaxpyjris
#define bli_zaxpyjris bli_zzzaxpyjris

#endif
// end bli_axpyjris.h
// begin bli_axmyris.h
#ifndef BLIS_AXMYRIS_H
#define BLIS_AXMYRIS_H

// axmyris
//
// y -= a * x on separate real/imaginary parts.

// Real cases: only the real parts are used.
#define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
}
#define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
}

// Complex cases: the full complex product is subtracted.
#define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr) - (ai) * (xi); \
	(yi) -= (ai) * (xr) + (ar) * (xi); \
}
#define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr) - (ai) * (xi); \
	(yi) -= (ai) * (xr) + (ar) * (xi); \
}

// Mixed cases: only ar is used from a; it scales both parts of x.
#define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
	(yi) -= (ar) * (xi); \
}
#define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
	(yi) -= (ar) * (xi); \
}

#endif
// end bli_axmyris.h
// begin bli_conjris.h
#ifndef BLIS_CONJRIS_H
#define BLIS_CONJRIS_H

// conjris
//
// In-place conjugation: a no-op for the real types, negates xi for the
// complex types.

#define bli_sconjris( xr, xi ) \
{ \
	; \
}
#define bli_dconjris( xr, xi ) \
{ \
	; \
}
#define bli_cconjris( xr, xi ) \
{ \
	(xi) = -(xi); \
}
#define bli_zconjris( xr, xi ) \
{ \
	(xi) = -(xi); \
}

#endif
// end bli_conjris.h
// begin bli_copyris.h
#ifndef BLIS_COPYRIS_H
#define BLIS_COPYRIS_H

// copyris
//
// b := a on separate real/imaginary parts. The real variants write br only;
// the complex variants write br and bi.

#define bli_scopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
}
#define bli_dcopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
}
#define bli_ccopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
	(bi) = (ai); \
}
#define bli_zcopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
	(bi) = (ai); \
}

// Two-letter variants: the first letter is the source type and the second
// (which selects the macro invoked) the destination type. For a real source
// the caller's ai is replaced by a literal zero.
#define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi )
#define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi )
#define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )
#define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )

#define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi )
#define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi )
#define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )
#define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )

#define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi )
#define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi )
#define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )
#define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )

#define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi )
#define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi )
#define bli_czcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )
#define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )

#endif
// end bli_copyris.h
// begin bli_copyjris.h
#ifndef BLIS_COPYJRIS_H
#define BLIS_COPYJRIS_H

// copyjris
//
// b := conj(a), implemented by forwarding to copyris with ai negated.

#define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) )
#define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) )
#define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) )
#define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) )

// Two-letter variants, same scheme as copyris. For a real source the literal
// zero is negated by the forwarded macro, so a complex destination receives
// -0.0 as its imaginary part.
#define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi )
#define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi )
#define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )
#define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )

#define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi )
#define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi )
#define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )
#define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )

#define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi )
#define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi )
#define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )
#define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )

#define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi )
#define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi )
#define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )
#define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )

#endif
// end bli_copyjris.h
// begin bli_copycjris.h
#ifndef BLIS_COPYCJRIS_H
#define BLIS_COPYCJRIS_H

// copycjris
//
// y := x, or y := conj(x) when bli_is_conj( conj ) holds. The real and
// integer variants ignore conj and forward to the plain copy.
// NOTE(review): bli_is_conj and bli_icopyris are not defined in this part of
// the file; presumably they are provided elsewhere.

#define bli_scopycjris( conj, xr, xi, yr, yi ) \
{ \
	bli_scopyris( (xr), (xi), (yr), (yi) ); \
}
#define bli_dcopycjris( conj, xr, xi, yr, yi ) \
{ \
	bli_dcopyris( (xr), (xi), (yr), (yi) ); \
}
#define bli_ccopycjris( conj, xr, xi, yr, yi ) \
{ \
	(yr) = (xr); \
	(yi) = ( bli_is_conj( conj ) ? -(xi) \
	                             : (xi) ); \
}
#define bli_zcopycjris( conj, xr, xi, yr, yi ) \
{ \
	(yr) = (xr); \
	(yi) = ( bli_is_conj( conj ) ? -(xi) \
	                             : (xi) ); \
}
#define bli_icopycjris( conj, xr, xi, yr, yi ) \
{ \
	bli_icopyris( (xr), (xi), (yr), (yi) ); \
}

#endif
// end bli_copycjris.h
// begin bli_eqris.h
#ifndef BLIS_EQRIS_H
#define BLIS_EQRIS_H

// eqris (passed by value)
//
// Expression macros that compare a and b for exact equality. The real and
// integer variants compare only the real parts; the complex variants compare
// both parts.

#define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) )

// eq1ris

#define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F )
#define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 )
#define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F )
#define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 )
#define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 )

// eq0ris

#define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F )
#define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 )
#define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F )
#define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 )
#define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 )

// eqm1ris

#define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F )
#define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 )
#define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F )
#define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 )
#define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 )

#endif
// end bli_eqris.h
// begin bli_invertris.h
#ifndef BLIS_INVERTRIS_H
#define BLIS_INVERTRIS_H

// invertris
//
// In-place reciprocal x := 1 / x. No check for a zero input is made.

#define bli_sinvertris( xr, xi ) \
{ \
	(xr) = 1.0F / (xr); \
}
#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2ris bli_rxscal2ris #define bli_dssscal2ris bli_rxscal2ris #define bli_cssscal2ris bli_rxscal2ris #define bli_zssscal2ris bli_rxscal2ris #define bli_sdsscal2ris bli_rxscal2ris #define bli_ddsscal2ris bli_rxscal2ris #define bli_cdsscal2ris bli_rxscal2ris #define bli_zdsscal2ris bli_rxscal2ris #define bli_scsscal2ris bli_rxscal2ris #define bli_dcsscal2ris bli_rxscal2ris #define bli_ccsscal2ris bli_roscal2ris #define bli_zcsscal2ris bli_roscal2ris #define bli_szsscal2ris bli_rxscal2ris #define bli_dzsscal2ris bli_rxscal2ris #define bli_czsscal2ris bli_roscal2ris #define bli_zzsscal2ris bli_roscal2ris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2ris bli_rxscal2ris #define bli_dsdscal2ris bli_rxscal2ris #define bli_csdscal2ris bli_rxscal2ris #define bli_zsdscal2ris bli_rxscal2ris #define bli_sddscal2ris bli_rxscal2ris #define bli_dddscal2ris bli_rxscal2ris #define bli_cddscal2ris bli_rxscal2ris #define bli_zddscal2ris bli_rxscal2ris #define bli_scdscal2ris bli_rxscal2ris #define bli_dcdscal2ris bli_rxscal2ris #define bli_ccdscal2ris bli_roscal2ris #define bli_zcdscal2ris bli_roscal2ris #define bli_szdscal2ris bli_rxscal2ris #define bli_dzdscal2ris bli_rxscal2ris #define bli_czdscal2ris bli_roscal2ris #define bli_zzdscal2ris bli_roscal2ris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2ris bli_rxscal2ris #define bli_dscscal2ris bli_rxscal2ris #define bli_cscscal2ris bli_rcscal2ris #define bli_zscscal2ris bli_rcscal2ris #define bli_sdcscal2ris bli_rxscal2ris #define bli_ddcscal2ris bli_rxscal2ris #define bli_cdcscal2ris bli_rcscal2ris #define bli_zdcscal2ris bli_rcscal2ris #define bli_sccscal2ris bli_crscal2ris #define bli_dccscal2ris bli_crscal2ris #define bli_cccscal2ris bli_cxscal2ris #define bli_zccscal2ris bli_cxscal2ris #define bli_szcscal2ris bli_crscal2ris 
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Type-combination table for scal2jris. Each three-letter name (types of a,
// x and y, in that order) is a plain alias for one of the rx/ro/rc/cr/cx
// scal2jris base macros; invoking an alias with the six ( ar, ai, xr, xi,
// yr, yi ) arguments expands to the base macro applied to those arguments.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2jris bli_rxscal2jris
#define bli_dssscal2jris bli_rxscal2jris
#define bli_cssscal2jris bli_rxscal2jris
#define bli_zssscal2jris bli_rxscal2jris

#define bli_sdsscal2jris bli_rxscal2jris
#define bli_ddsscal2jris bli_rxscal2jris
#define bli_cdsscal2jris bli_rxscal2jris
#define bli_zdsscal2jris bli_rxscal2jris

#define bli_scsscal2jris bli_rxscal2jris
#define bli_dcsscal2jris bli_rxscal2jris
#define bli_ccsscal2jris bli_roscal2jris
#define bli_zcsscal2jris bli_roscal2jris

#define bli_szsscal2jris bli_rxscal2jris
#define bli_dzsscal2jris bli_rxscal2jris
#define bli_czsscal2jris bli_roscal2jris
#define bli_zzsscal2jris bli_roscal2jris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2jris bli_rxscal2jris
#define bli_dsdscal2jris bli_rxscal2jris
#define bli_csdscal2jris bli_rxscal2jris
#define bli_zsdscal2jris bli_rxscal2jris

#define bli_sddscal2jris bli_rxscal2jris
#define bli_dddscal2jris bli_rxscal2jris
#define bli_cddscal2jris bli_rxscal2jris
#define bli_zddscal2jris bli_rxscal2jris

#define bli_scdscal2jris bli_rxscal2jris
#define bli_dcdscal2jris bli_rxscal2jris
#define bli_ccdscal2jris bli_roscal2jris
#define bli_zcdscal2jris bli_roscal2jris

#define bli_szdscal2jris bli_rxscal2jris
#define bli_dzdscal2jris bli_rxscal2jris
#define bli_czdscal2jris bli_roscal2jris
#define bli_zzdscal2jris bli_roscal2jris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2jris bli_rxscal2jris
#define bli_dscscal2jris bli_rxscal2jris
#define bli_cscscal2jris bli_rcscal2jris
#define bli_zscscal2jris bli_rcscal2jris

#define bli_sdcscal2jris bli_rxscal2jris
#define bli_ddcscal2jris bli_rxscal2jris
#define bli_cdcscal2jris bli_rcscal2jris
#define bli_zdcscal2jris bli_rcscal2jris

#define bli_sccscal2jris bli_crscal2jris
#define bli_dccscal2jris bli_crscal2jris
#define bli_cccscal2jris bli_cxscal2jris
#define bli_zccscal2jris bli_cxscal2jris

#define bli_szcscal2jris bli_crscal2jris
#define bli_dzcscal2jris bli_crscal2jris
#define bli_czcscal2jris bli_cxscal2jris
#define bli_zzcscal2jris bli_cxscal2jris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2jris bli_rxscal2jris
#define bli_dszscal2jris bli_rxscal2jris
#define bli_cszscal2jris bli_rcscal2jris
#define bli_zszscal2jris bli_rcscal2jris

#define bli_sdzscal2jris bli_rxscal2jris
#define bli_ddzscal2jris bli_rxscal2jris
#define bli_cdzscal2jris bli_rcscal2jris
#define bli_zdzscal2jris bli_rcscal2jris

#define bli_sczscal2jris bli_crscal2jris
#define bli_dczscal2jris bli_crscal2jris
#define bli_cczscal2jris bli_cxscal2jris
#define bli_zczscal2jris bli_cxscal2jris

#define bli_szzscal2jris bli_crscal2jris
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = (xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyris bli_rxxpbyris #define bli_dssxpbyris bli_rxxpbyris #define bli_cssxpbyris bli_rxxpbyris #define bli_zssxpbyris bli_rxxpbyris #define bli_sdsxpbyris bli_rxxpbyris #define bli_ddsxpbyris bli_rxxpbyris #define bli_cdsxpbyris bli_rxxpbyris #define bli_zdsxpbyris bli_rxxpbyris #define bli_scsxpbyris bli_rxxpbyris #define bli_dcsxpbyris bli_rxxpbyris #define bli_ccsxpbyris bli_rxxpbyris #define bli_zcsxpbyris bli_rxxpbyris #define bli_szsxpbyris bli_rxxpbyris #define bli_dzsxpbyris bli_rxxpbyris #define bli_czsxpbyris bli_rxxpbyris #define bli_zzsxpbyris bli_rxxpbyris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyris bli_rxxpbyris #define bli_dsdxpbyris bli_rxxpbyris #define bli_csdxpbyris bli_rxxpbyris #define bli_zsdxpbyris bli_rxxpbyris #define bli_sddxpbyris bli_rxxpbyris #define bli_dddxpbyris bli_rxxpbyris #define bli_cddxpbyris bli_rxxpbyris #define bli_zddxpbyris bli_rxxpbyris #define bli_scdxpbyris bli_rxxpbyris #define bli_dcdxpbyris bli_rxxpbyris #define bli_ccdxpbyris bli_rxxpbyris #define bli_zcdxpbyris bli_rxxpbyris #define bli_szdxpbyris bli_rxxpbyris #define bli_dzdxpbyris bli_rxxpbyris #define bli_czdxpbyris bli_rxxpbyris #define bli_zzdxpbyris bli_rxxpbyris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyris bli_rxxpbyris #define bli_dscxpbyris 
bli_rxxpbyris #define bli_cscxpbyris bli_crxpbyris #define bli_zscxpbyris bli_crxpbyris #define bli_sdcxpbyris bli_rxxpbyris #define bli_ddcxpbyris bli_rxxpbyris #define bli_cdcxpbyris bli_crxpbyris #define bli_zdcxpbyris bli_crxpbyris #define bli_sccxpbyris bli_cxxpbyris #define bli_dccxpbyris bli_cxxpbyris #define bli_cccxpbyris bli_cxxpbyris #define bli_zccxpbyris bli_cxxpbyris #define bli_szcxpbyris bli_cxxpbyris #define bli_dzcxpbyris bli_cxxpbyris #define bli_czcxpbyris bli_cxxpbyris #define bli_zzcxpbyris bli_cxxpbyris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyris bli_rxxpbyris #define bli_dszxpbyris bli_rxxpbyris #define bli_cszxpbyris bli_crxpbyris #define bli_zszxpbyris bli_crxpbyris #define bli_sdzxpbyris bli_rxxpbyris #define bli_ddzxpbyris bli_rxxpbyris #define bli_cdzxpbyris bli_crxpbyris #define bli_zdzxpbyris bli_crxpbyris #define bli_sczxpbyris bli_cxxpbyris #define bli_dczxpbyris bli_cxxpbyris #define bli_cczxpbyris bli_cxxpbyris #define bli_zczxpbyris bli_cxxpbyris #define bli_szzxpbyris bli_cxxpbyris #define bli_dzzxpbyris bli_cxxpbyris #define bli_czzxpbyris bli_cxxpbyris #define bli_zzzxpbyris bli_cxxpbyris #define bli_sxpbyris bli_sssxpbyris #define bli_dxpbyris bli_dddxpbyris #define bli_cxpbyris bli_cccxpbyris #define bli_zxpbyris bli_zzzxpbyris #endif // end bli_xpbyris.h // begin bli_xpbyjris.h #ifndef BLIS_XPBYJRIS_H #define BLIS_XPBYJRIS_H // xpbyjris #define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } #define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (xr) + (br) * (yr); \ const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type 
of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjris bli_rxxpbyjris #define bli_dssxpbyjris bli_rxxpbyjris #define bli_cssxpbyjris bli_rxxpbyjris #define bli_zssxpbyjris bli_rxxpbyjris #define bli_sdsxpbyjris bli_rxxpbyjris #define bli_ddsxpbyjris bli_rxxpbyjris #define bli_cdsxpbyjris bli_rxxpbyjris #define bli_zdsxpbyjris bli_rxxpbyjris #define bli_scsxpbyjris bli_rxxpbyjris #define bli_dcsxpbyjris bli_rxxpbyjris #define bli_ccsxpbyjris bli_rxxpbyjris #define bli_zcsxpbyjris bli_rxxpbyjris #define bli_szsxpbyjris bli_rxxpbyjris #define bli_dzsxpbyjris bli_rxxpbyjris #define bli_czsxpbyjris bli_rxxpbyjris #define bli_zzsxpbyjris bli_rxxpbyjris // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjris bli_rxxpbyjris #define bli_dsdxpbyjris bli_rxxpbyjris #define bli_csdxpbyjris bli_rxxpbyjris #define bli_zsdxpbyjris bli_rxxpbyjris #define bli_sddxpbyjris bli_rxxpbyjris #define bli_dddxpbyjris bli_rxxpbyjris #define bli_cddxpbyjris bli_rxxpbyjris #define bli_zddxpbyjris bli_rxxpbyjris #define bli_scdxpbyjris bli_rxxpbyjris #define bli_dcdxpbyjris bli_rxxpbyjris #define bli_ccdxpbyjris bli_rxxpbyjris #define bli_zcdxpbyjris bli_rxxpbyjris #define bli_szdxpbyjris bli_rxxpbyjris #define bli_dzdxpbyjris bli_rxxpbyjris #define bli_czdxpbyjris bli_rxxpbyjris #define bli_zzdxpbyjris bli_rxxpbyjris // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjris bli_rxxpbyjris #define bli_dscxpbyjris bli_rxxpbyjris #define bli_cscxpbyjris bli_crxpbyjris #define bli_zscxpbyjris bli_crxpbyjris #define bli_sdcxpbyjris bli_rxxpbyjris #define bli_ddcxpbyjris bli_rxxpbyjris #define bli_cdcxpbyjris bli_crxpbyjris #define bli_zdcxpbyjris bli_crxpbyjris #define bli_sccxpbyjris bli_cxxpbyjris #define bli_dccxpbyjris bli_cxxpbyjris #define bli_cccxpbyjris 
bli_cxxpbyjris #define bli_zccxpbyjris bli_cxxpbyjris #define bli_szcxpbyjris bli_cxxpbyjris #define bli_dzcxpbyjris bli_cxxpbyjris #define bli_czcxpbyjris bli_cxxpbyjris #define bli_zzcxpbyjris bli_cxxpbyjris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjris bli_rxxpbyjris #define bli_dszxpbyjris bli_rxxpbyjris #define bli_cszxpbyjris bli_crxpbyjris #define bli_zszxpbyjris bli_crxpbyjris #define bli_sdzxpbyjris bli_rxxpbyjris #define bli_ddzxpbyjris bli_rxxpbyjris #define bli_cdzxpbyjris bli_crxpbyjris #define bli_zdzxpbyjris bli_crxpbyjris #define bli_sczxpbyjris bli_cxxpbyjris #define bli_dczxpbyjris bli_cxxpbyjris #define bli_cczxpbyjris bli_cxxpbyjris #define bli_zczxpbyjris bli_cxxpbyjris #define bli_szzxpbyjris bli_cxxpbyjris #define bli_dzzxpbyjris bli_cxxpbyjris #define bli_czzxpbyjris bli_cxxpbyjris #define bli_zzzxpbyjris bli_cxxpbyjris #define bli_sxpbyjris bli_sssxpbyjris #define bli_dxpbyjris bli_dddxpbyjris #define bli_cxpbyjris bli_cccxpbyjris #define bli_zxpbyjris bli_zzzxpbyjris #endif // end bli_xpbyjris.h // Inlined scalar macros in loops // begin bli_scal2ris_mxn.h #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j 
)*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif // end bli_scal2ris_mxn.h // begin bli_scalris_mxn_uplo.h #ifndef 
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
// Real-valued output (a is s or d). Each macro forwards x's real/imaginary
// parts to the absq2ris kernel selected by x's type. For real x a zero
// literal is passed in the imaginary-output slot; for complex x a throwaway
// local `ti` receives the imaginary output and is then voided so it does not
// trigger an unused-variable warning.

#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F )
#define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F )
#define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; }
#define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; }

#define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 )
#define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 )
#define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; }
#define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; }

// Complex-valued output (a is c or z). Two implementations are selected by
// BLIS_ENABLE_C99_COMPLEX.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex: forward both parts of a to the absq2ris kernel.

#define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex: compute the squared magnitude inline (x*x for real x,
// re*re + im*im for complex x) and store it in a through the matching sets
// macro with a zero imaginary part.

#define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) )
#define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) )
#define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \
                                         bli_cimag(x) * bli_cimag(x), 0.0, (a) )
#define bli_zcabsq2s( x, a ) bli_zcsets( bli_zreal(x) * bli_zreal(x) + \
                                         bli_zimag(x) * bli_zimag(x), 0.0, (a) )

#define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) )
#define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) )
#define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \
                                         bli_cimag(x) * bli_cimag(x), 0.0, (a) )
#define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \
                                         bli_zimag(x) * bli_zimag(x), 0.0, (a) )

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: x and a share one type.
#define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a )
#define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a )
#define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a )
#define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a )

#endif
// end bli_absq2s.h
// begin bli_abval2s.h

#ifndef BLIS_ABVAL2S_H
#define BLIS_ABVAL2S_H

// abval2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex: forward to the abval2ris kernel selected by x's type.
// As in absq2s, real outputs get a zero literal (real x) or a voided local
// `ti` (complex x) in the imaginary-output slot.

#define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F )
#define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F )
#define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; }
#define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; }

#define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 )
#define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 )
#define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; }
#define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; }

#define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex: take the magnitude with the <math.h>/<complex.h>
// function matching x's type (fabsf, fabs, cabsf, cabs) and store it in a
// through the matching sets macro with a zero imaginary part.

#define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) )
#define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) )
#define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) )
#define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) )

#define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) )
#define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) )
#define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) )
#define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) )

#define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) )
#define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) )
#define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) )
#define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) )

#define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) )
#define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) )
#define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) )
#define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) )

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: x and a share one type.
#define bli_sabval2s( x, a ) bli_ssabval2s( x, a )
#define bli_dabval2s( x, a ) bli_ddabval2s( x, a )
#define bli_cabval2s( x, a ) bli_ccabval2s( x, a )
#define bli_zabval2s( x, a ) bli_zzabval2s( x, a )

#endif
// end bli_abval2s.h
// begin bli_adds.h

#ifndef BLIS_ADDS_H
#define BLIS_ADDS_H

// adds

// Notes:
// - The first char
encodes the type of a. // - The second char encodes the type of y. #define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) { (y) += (a); } #define bli_dcadds( a, y ) { (y) += (a); } #define bli_ccadds( a, y ) { (y) += (a); } #define bli_zcadds( a, y ) { (y) += (a); } #define bli_szadds( a, y ) { (y) += (a); } #define bli_dzadds( a, y ) { (y) += (a); } #define bli_czadds( a, y ) { (y) += (a); } #define 
bli_zzadds( a, y ) { (y) += (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadds( a, y ) bli_ssadds( a, y ) #define bli_dadds( a, y ) bli_ddadds( a, y ) #define bli_cadds( a, y ) bli_ccadds( a, y ) #define bli_zadds( a, y ) bli_zzadds( a, y ) #endif // end bli_adds.h // begin bli_addjs.h #ifndef BLIS_ADDJS_H #define BLIS_ADDJS_H // addjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzaddjs( a, y ) 
bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) { (y) += (a); } #define bli_dcaddjs( a, y ) { (y) += (a); } #define bli_ccaddjs( a, y ) { (y) += conjf(a); } #define bli_zcaddjs( a, y ) { (y) += conj (a); } #define bli_szaddjs( a, y ) { (y) += (a); } #define bli_dzaddjs( a, y ) { (y) += (a); } #define bli_czaddjs( a, y ) { (y) += conjf(a); } #define bli_zzaddjs( a, y ) { (y) += conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saddjs( a, y ) bli_ssaddjs( a, y ) #define bli_daddjs( a, y ) bli_ddaddjs( a, y ) #define bli_caddjs( a, y ) bli_ccaddjs( a, y ) #define bli_zaddjs( a, y ) bli_zzaddjs( a, y ) #endif // end bli_addjs.h // begin bli_add3s.h #ifndef BLIS_ADD3S_H #define BLIS_ADD3S_H // add3s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of b. // - The third char encodes the type of c. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), 
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), 
bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define 
bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zscadd3s( a, 
b, c ) { (c) = (a) + (b); } #define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c ) #define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c ) #define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c ) #define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c ) #endif // end bli_add3s.h // begin bli_axpbys.h #ifndef BLIS_AXPBYS_H #define BLIS_AXPBYS_H // axpbys // Notes: // - The first char encodes the type of a. 
// - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), 
bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), 
bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), 
bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccscaxpbys( 
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y ) #define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y ) #define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y ) #define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y ) #endif // end bli_axpbys.h // begin bli_axpbyjs.h #ifndef BLIS_AXPBYJS_H #define BLIS_AXPBYJS_H // axpbyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), 
bli_simag(y) ) #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) #define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) #define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) #define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) #endif // end bli_axpbyjs.h // begin bli_axpys.h #ifndef BLIS_AXPYS_H #define BLIS_AXPYS_H // axpys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), 
bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } #define 
bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) #define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) #define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) #define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) #endif // end bli_axpys.h // begin bli_axpyjs.h #ifndef BLIS_AXPYJS_H #define BLIS_AXPYJS_H // axpyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( 
bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), 
bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) #define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpyjs( a, x, y ) 
bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y ) #define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y ) #define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y ) #define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y ) #endif // end bli_axpyjs.h // begin bli_axmys.h #ifndef BLIS_AXMYS_H #define BLIS_AXMYS_H // axmys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), 
bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxmys( a, x, 
y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), 
bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxmys( 
a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); } 
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y ) #define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y ) #define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y ) #define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y ) #endif // end bli_axmys.h // begin bli_conjs.h #ifndef BLIS_CONJS_H #define BLIS_CONJS_H // conjs #define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) ) #define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) ) #define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) { (x) = conjf(x); } #define bli_zconjs( x ) { (x) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_conjs.h // begin bli_copys.h #ifndef BLIS_COPYS_H #define BLIS_COPYS_H // copys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopys( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopys( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopys( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopys( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopys( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopys( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopys( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopys( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero. #define bli_sccopys( x, y ) bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopys( x, y ) bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopys( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopys( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero. 
#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopys( x, y ) bli_sscopys( x, y ) #define bli_dcopys( x, y ) bli_ddcopys( x, y ) #define bli_ccopys( x, y ) bli_cccopys( x, y ) #define bli_zcopys( x, y ) bli_zzcopys( x, y ) #define bli_icopys( x, y ) bli_iicopys( x, y ) #endif // end bli_copys.h // begin bli_copyjs.h #ifndef BLIS_COPYJS_H #define BLIS_COPYJS_H // copyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), 
bli_creal(y), bli_cimag(y) ) #define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) { (y) = (x); } #define bli_dccopyjs( x, y ) { (y) = (x); } #define bli_cccopyjs( x, y ) { (y) = conjf(x); } #define bli_zccopyjs( x, y ) { (y) = conj (x); } #define bli_szcopyjs( x, y ) { (y) = (x); } #define bli_dzcopyjs( x, y ) { (y) = (x); } #define bli_czcopyjs( x, y ) { (y) = conjf(x); } #define bli_zzcopyjs( x, y ) { (y) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjs( x, y ) bli_sscopyjs( x, y ) #define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y ) #define bli_ccopyjs( x, y ) bli_cccopyjs( x, y ) #define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y ) #define bli_icopyjs( x, y ) bli_iicopyjs( x, y ) #endif // end bli_copyjs.h // begin bli_copycjs.h #ifndef BLIS_COPYCJS_H #define BLIS_COPYCJS_H // copycjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) { (y) = (x); } 
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); } #define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #define bli_szcopycjs( conjx, x, y ) { (y) = (x); } #define bli_dzcopycjs( conjx, x, y ) { (y) = (x); } #define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); } #define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y ) #define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y ) #define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y ) #define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y ) #define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y ) #endif // end bli_copycjs.h // begin bli_copynzs.h #ifndef BLIS_COPYNZS_H #define BLIS_COPYNZS_H // copynzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopynzs( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopynzs( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopynzs( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopynzs( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of scopyris() is so we don't touch the imaginary part of y. #define bli_sccopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopynzs( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopynzs( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of dcopyris() is so we don't touch the imaginary part of y. 
#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopynzs( x, y ) bli_sscopynzs( x, y ) #define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y ) #define bli_ccopynzs( x, y ) bli_cccopynzs( x, y ) #define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y ) #define bli_icopynzs( x, y ) bli_iicopynzs( x, y ) #endif // end bli_copynzs.h // begin bli_copyjnzs.h #ifndef BLIS_COPYJNZS_H #define BLIS_COPYJNZS_H // copyjnzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we // don't touch the imaginary part of y. 
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we // don't touch the imaginary part of y. #define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y ) #define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y ) #define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y ) #define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y ) #define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y ) #endif // end bli_copyjnzs.h // begin bli_dots.h #ifndef BLIS_DOTS_H #define BLIS_DOTS_H // dots // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a ) #define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a ) #define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a ) #define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a ) #define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a ) #define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a ) #define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a ) #define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a ) #define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a ) #define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a ) #define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a ) #define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a ) #define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a ) #define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a ) #define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a ) #define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a ) #define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a ) #define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a ) #define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a ) #define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a ) #define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a ) #define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a ) #define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a ) #define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a ) #define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a ) #define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a ) #define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a ) #define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a ) #define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a ) #define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a ) #define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a ) #define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a ) #define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a ) #define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a ) #define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a ) #define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a ) #define 
bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a ) #define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a ) #define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a ) #define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a ) #define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a ) #define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a ) #define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a ) #define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a ) #define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a ) #define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a ) #define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a ) #define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a ) #define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a ) #define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a ) #define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a ) #define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a ) #define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a ) #define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a ) #define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a ) #define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a ) #define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a ) #define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a ) #define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a ) #define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a ) #define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a ) #define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a ) #define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a ) #define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a ) #define bli_sdots( x, y, a ) bli_sssdots( x, y, a ) #define bli_ddots( x, y, a ) bli_ddddots( x, y, a ) #define bli_cdots( x, y, a ) bli_cccdots( x, y, a ) #define bli_zdots( x, y, a ) bli_zzzdots( x, y, a ) #endif // end bli_dots.h // begin bli_dotjs.h #ifndef BLIS_DOTJS_H #define BLIS_DOTJS_H // dotjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
// - x is used in conjugated form.

// Each dotjs variant expands to an axpyjs macro with x and y exchanged in the
// argument list ( y, x, a ); the first two type characters of the axpyjs name
// are exchanged accordingly, the third is kept.

// -- third type char: s --
#define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a )
#define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a )
#define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a )
#define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a )
#define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a )
#define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a )
#define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a )
#define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a )
#define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a )
#define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a )
#define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a )
#define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a )
#define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a )
#define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a )
#define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a )
#define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a )

// -- third type char: d --
#define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a )
#define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a )
#define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a )
#define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a )
#define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a )
#define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a )
#define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a )
#define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a )
#define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a )
#define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a )
#define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a )
#define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a )
#define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a )
#define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a )
#define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a )
#define bli_zzddotjs( x, y, a ) bli_zzdaxpyjs( y, x, a )

// -- third type char: c --
#define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a )
#define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a )
#define bli_cscdotjs( x, y, a ) bli_sccaxpyjs( y, x, a )
#define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a )
#define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a )
#define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a )
#define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a )
#define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a )
#define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a )
#define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a )
#define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a )
#define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a )
#define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a )
#define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a )
#define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a )
#define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a )

// -- third type char: z --
#define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a )
#define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a )
#define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a )
#define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a )
#define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a )
#define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a )
#define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a )
#define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a )
#define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a )
#define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a )
#define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a )
#define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a )
#define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a )
#define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a )
#define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a )
#define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a )

// Single-character shorthands: all three operands share one type.
#define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a )
#define bli_ddotjs( x, y, a ) bli_ddddotjs( x, y, a )
#define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a )
#define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a )

#endif
// end bli_dotjs.h
// begin bli_eq.h

#ifndef BLIS_EQ_H
#define BLIS_EQ_H

// eq (passed by value)
// Expression macros: each yields the result of an == comparison. The complex
// variants compare real and imaginary parts separately unless
// BLIS_ENABLE_C99_COMPLEX is defined, in which case == is applied directly.

#define bli_seq( a, b ) ( (a) == (b) )
#define bli_deq( a, b ) ( (a) == (b) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) )
#define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_ceq( a, b ) ( (a) == (b) )
#define bli_zeq( a, b ) ( (a) == (b) )

#endif // BLIS_ENABLE_C99_COMPLEX

#define bli_ieq( a, b ) ( (a) == (b) )

// eqtori (passed by value)
// Compares a against the value given as separate real (br) and imaginary (bi)
// parts. The real-typed variants ignore bi.

#define bli_seqtori( a, br, bi ) ( (a) == (br) )
#define bli_deqtori( a, br, bi ) ( (a) == (br) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) )
#define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )
#define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )

#endif // BLIS_ENABLE_C99_COMPLEX

// eqa (passed by address)
// a and b are cast to a pointer of the variant's type and dereferenced before
// being handed to the by-value macro.

#define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) )
#define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) )
#define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) )
#define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) )
#define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) )

// eq1 (comparison against one)

#define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F )
#define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 )
#define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F )
#define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 )
#define bli_ieq1( a ) bli_ieq ( (a), 1 )

// eq0 (comparison against zero)

#define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F )
#define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 )
#define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F )
#define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 )
#define bli_ieq0( a ) bli_ieq ( (a), 0 )

// eqm1 (comparison against minus one)

#define bli_seqm1( a ) bli_seqtori( (a), -1.0F, 0.0F )
#define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 )
#define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F )
#define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 )
#define bli_ieqm1( a ) bli_ieq ( (a), -1 )

#endif
// end bli_eq.h
// begin bli_fprints.h

#ifndef BLIS_FPRINTS_H
#define BLIS_FPRINTS_H

// prints
// Statement macros that write x to file with fprintf using the caller's
// format spec. The complex variants print the real part, the literal " + ",
// the imaginary part, and then a single space, each part formatted with spec.

#define bli_sfprints( file, spec, x ) \
{ \
	fprintf( file, spec, (x) ); \
}
#define bli_dfprints( file, spec, x ) \
{ \
	fprintf( file, spec, (x) ); \
}
#define bli_cfprints( file, spec, x ) \
{ \
	fprintf( file, spec, bli_creal(x) ); \
	fprintf( file, " + " ); \
	fprintf( file, spec, bli_cimag(x) ); \
	fprintf( file, " " ); \
}
#define bli_zfprints( file, spec, x ) \
{ \
	fprintf( file, spec, bli_zreal(x) ); \
	fprintf( file, " + " ); \
	fprintf( file, spec, bli_zimag(x) ); \
	fprintf( file, " " ); \
}
#define bli_ifprints( file, spec, x ) \
{ \
	fprintf( file, spec, (x) ); \
}

#endif
// end bli_fprints.h
// begin bli_inverts.h

#ifndef BLIS_INVERTS_H
#define BLIS_INVERTS_H

// inverts

// Notes:
// - The first char encodes the type of x.

// The real variants, and the complex variants when BLIS_ENABLE_C99_COMPLEX is
// not defined, pass the real and imaginary parts of x to the matching
// invertris macro. With C99 complex enabled, x is overwritten with 1 / x.

#define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) )
#define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) )
#define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_cinverts( x ) { (x) = 1.0F / (x); }
#define bli_zinverts( x ) { (x) = 1.0 / (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

#endif
// end bli_inverts.h
// begin bli_invscals.h

#ifndef BLIS_INVSCALS_H
#define BLIS_INVSCALS_H

// invscals

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Variants with a real y (second char s or d) always expand to the matching
// invscalris macro on the real/imaginary parts of a and y.

#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

// Variants with a complex y (second char c or z) have two implementations,
// selected by BLIS_ENABLE_C99_COMPLEX.

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscals( a, y ) bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex enabled, y is divided by a in place.
#define bli_scinvscals( a, y ) { (y) /= (a); }
#define bli_dcinvscals( a, y ) { (y) /= (a); }
#define bli_ccinvscals( a, y ) { (y) /= (a); }
#define bli_zcinvscals( a, y ) { (y) /= (a); }

#define bli_szinvscals( a, y ) { (y) /= (a); }
#define bli_dzinvscals( a, y ) { (y) /= (a); }
#define bli_czinvscals( a, y ) { (y) /= (a); }
#define bli_zzinvscals( a, y ) { (y) /= (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a and y share one type.
#define bli_sinvscals( a, y ) bli_ssinvscals( a, y )
#define bli_dinvscals( a, y ) bli_ddinvscals( a, y )
#define bli_cinvscals( a, y ) bli_ccinvscals( a, y )
#define bli_zinvscals( a, y ) bli_zzinvscals( a, y )

#endif
// end bli_invscals.h
// begin bli_invscaljs.h

#ifndef BLIS_INVSCALJS_H
#define BLIS_INVSCALJS_H

// invscaljs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// Variants with a real y always expand to the matching invscaljris macro.

#define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

// Variants with a complex y have two implementations, selected by
// BLIS_ENABLE_C99_COMPLEX.

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex enabled, y is divided in place by a; a complex a is
// conjugated first (conjf for c, conj for z), a real a is used as is.
#define bli_scinvscaljs( a, y ) { (y) /= (a); }
#define bli_dcinvscaljs( a, y ) { (y) /= (a); }
#define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zcinvscaljs( a, y ) { (y) /= conj (a); }

#define bli_szinvscaljs( a, y ) { (y) /= (a); }
#define bli_dzinvscaljs( a, y ) { (y) /= (a); }
#define bli_czinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zzinvscaljs( a, y ) { (y) /= conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a and y share one type.
#define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y )
#define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y )
#define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y )
#define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y )

#endif
// end bli_invscaljs.h
// begin bli_neg2s.h

#ifndef BLIS_NEG2S_H
#define BLIS_NEG2S_H

// neg2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) { (y) = -(x); } #define bli_dcneg2s( x, y ) { (y) = -(x); } #define bli_ccneg2s( x, y ) { (y) = -(x); } #define bli_zcneg2s( x, y ) { (y) = -(x); } #define bli_szneg2s( x, y ) { (y) = -(x); } #define bli_dzneg2s( x, y ) { (y) = -(x); } #define bli_czneg2s( x, y ) { (y) = -(x); } #define bli_zzneg2s( x, y ) { (y) = 
-(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sneg2s( x, y ) bli_ssneg2s( x, y ) #define bli_dneg2s( x, y ) bli_ddneg2s( x, y ) #define bli_cneg2s( x, y ) bli_ccneg2s( x, y ) #define bli_zneg2s( x, y ) bli_zzneg2s( x, y ) #endif // end bli_neg2s.h // begin bli_rands.h #ifndef BLIS_RANDS_H #define BLIS_RANDS_H // rands #define bli_srands( a ) \ { \ (a) = ( float ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0F; \ } #define bli_drands( a ) \ { \ (a) = ( double ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0; \ } #define bli_crands( a ) \ { \ float ar, ai; \ \ bli_srands( ar ); \ bli_srands( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrands( a ) \ { \ double ar, ai; \ \ bli_drands( ar ); \ bli_drands( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_rands.h // begin bli_randnp2s.h #ifndef BLIS_RANDNP2S_H #define BLIS_RANDNP2S_H // randnp2s #define bli_srandnp2s( a ) \ { \ bli_drandnp2s( a ); \ } #if 0 #define bli_drandnp2s_prev( a ) \ { \ const double m_max = 3.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ if ( t == m_max2 ) t = t - 1.0; \ \ \ t = floor( t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_exp, s_val; \ \ \ PASTEMAC(d,rands)( s_exp ); \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \ else r_val = pow( 2.0, t - 1.0 ); \ \ \ if ( s_val < 0.0 ) r_val = -r_val; \ } \ \ \ r_val = r_val / pow( 2.0, m_max ); \ \ \ \ a = r_val; \ } #endif #define bli_drandnp2s( a ) \ { \ const double m_max = 6.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ do \ { \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ t = floor( t ); \ } \ \ while ( m_max2 <= t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_val; \ \ \ r_val = pow( 2.0, -(t - 1.0) ); \ \ \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_val < 0.0 ) r_val 
= -r_val; \ } \ \ \ \ a = r_val; \ } #define bli_crandnp2s( a ) \ { \ float ar, ai; \ \ bli_srandnp2s( ar ); \ bli_srandnp2s( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrandnp2s( a ) \ { \ double ar, ai; \ \ bli_drandnp2s( ar ); \ bli_drandnp2s( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_randnp2s.h // begin bli_scals.h #ifndef BLIS_SCALS_H #define BLIS_SCALS_H // scals // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), 
bli_zimag(y) ) #define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) { (y) *= (a); } #define bli_dcscals( a, y ) { (y) *= (a); } #define bli_ccscals( a, y ) { (y) *= (a); } #define bli_zcscals( a, y ) { (y) *= (a); } #define bli_szscals( a, y ) { (y) *= (a); } #define bli_dzscals( a, y ) { (y) *= (a); } #define bli_czscals( a, y ) { (y) *= (a); } #define bli_zzscals( a, y ) { (y) *= (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscals( a, y ) bli_ssscals( a, y ) #define bli_dscals( a, y ) bli_ddscals( a, y ) #define bli_cscals( a, y ) bli_ccscals( a, y ) #define bli_zscals( a, y ) bli_zzscals( a, y ) #endif // end bli_scals.h // begin bli_scaljs.h #ifndef BLIS_SCALJS_H #define BLIS_SCALJS_H // scaljs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscaljs( a, y ) 
bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) { (y) *= (a); } #define bli_dcscaljs( a, y ) { (y) *= (a); } #define bli_ccscaljs( a, y ) { (y) *= conjf(a); } #define bli_zcscaljs( a, y ) { (y) *= conj (a); } #define bli_szscaljs( a, y ) { (y) *= (a); } #define bli_dzscaljs( a, y ) { (y) *= (a); } #define bli_czscaljs( a, y ) { (y) *= conjf(a); } #define bli_zzscaljs( a, y ) { (y) *= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscaljs( a, y ) bli_ssscaljs( a, y ) #define bli_dscaljs( a, y ) bli_ddscaljs( a, y ) #define bli_cscaljs( a, y ) bli_ccscaljs( a, y ) #define bli_zscaljs( a, y ) bli_zzscaljs( a, y ) #endif // end bli_scaljs.h // begin bli_scalcjs.h #ifndef BLIS_SCALCJS_H #define BLIS_SCALCJS_H // scalcjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Every variant takes a conjx selector ahead of x and y. Variants with a real
// y (second char s or d) always expand to the matching scalcjris macro,
// passing conjx through together with the real/imaginary parts of x and y.

#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Variants with a complex y have two implementations, selected by
// BLIS_ENABLE_C99_COMPLEX.

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex enabled, y is multiplied in place by x. For a complex x
// the factor is conjf(x) / conj(x) when bli_is_conj( conjx ) holds and x
// otherwise; for a real x, conjx is not consulted.
#define bli_scscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#define bli_szscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: x and y share one type.
#define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y )
#define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y )
#define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y )
#define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y )

#endif
// end bli_scalcjs.h
// begin bli_scal2s.h

#ifndef BLIS_SCAL2S_H
#define BLIS_SCAL2S_H

// scal2s

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// For a real y (third char s or d) the variants are defined unconditionally.
// They expand to bli_roscal2ris when both a and x are complex (c or z) and to
// bli_rxscal2ris otherwise, always passing the real/imaginary parts of a, x
// and y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// For a complex y (third char c or z) there are two implementations, selected
// by BLIS_ENABLE_C99_COMPLEX. Without C99 complex, the helper is chosen by
// whether a and x are real or complex: rx (both real), rc (a complex, x real),
// cr (a real, x complex), cx (both complex).

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sczscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex enabled, every complex-y variant assigns a * x to y.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); }

#define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character shorthands: a, x and y share one type.
#define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y )
#define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y )
#define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y )
#define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y )

#endif
// end bli_scal2s.h
// begin bli_scal2js.h

#ifndef BLIS_SCAL2JS_H
#define BLIS_SCAL2JS_H

// scal2js

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// Mixed-type scal2js dispatch table.
// Every macro below splits a, x and y into real/imaginary parts using the
// bli_?real()/bli_?imag() accessor that matches each operand's type char and
// forwards the six pieces to one of the bli_*scal2jris helpers, which are
// defined elsewhere.
// NOTE(review): presumably the helpers compute y = a * conj(x) -- confirm
// against the bli_*scal2jris definitions.

// -- (axy) = (??s) ------------------------------------------------------------

// Real (float) destination: the "rx" helper is used for every combination
// except those where both a and x are complex (cc, zc, cz, zz), which use "ro".
#define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

// Real (double) destination: same helper selection as the (??s) group.
#define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Complex destinations use the component-wise helpers only when C99 complex
// support is disabled.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

// Helper selection for a complex destination, keyed on the domains of (a, x):
// both real -> rx, a complex / x real -> rc, a real / x complex -> cr,
// both complex -> cx.
#define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
// Remaining (??z) entries of the component-wise scal2js table (the branch that
// precedes the "#else" below): real a with complex x -> "cr" helper, complex a
// with complex x -> "cx" helper.
#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex types the operation is written directly as y = a * x, with
// x conjugated when it is complex: conjf() for a single-precision complex x
// (second type char 'c'), conj() for a double-precision complex x ('z').
// A real x (s/d) is used as-is.
// Each body is a bare brace block, so an invocation followed by ';' expands to
// a block plus an empty statement (matters in an unbraced if/else).

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); }
#define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); }
#define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); }
#define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type shorthands: all three operands share the same type.
#define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y )
#define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y )
#define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y )
#define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y )

#endif
// end bli_scal2js.h
// begin bli_set0s.h


#ifndef BLIS_SET0S_H
#define BLIS_SET0S_H

// set0s: invoke the type-matched bli_?sets macro with the two literals
// (0, 0) and the target a; float-based types get 'F'-suffixed literals.
// NOTE(review): presumably the two literals are the real and imaginary parts
// to store -- confirm against bli_?sets.
#define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) )
#define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) )
#define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) )
#define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) )

#endif
// end bli_set0s.h
// begin bli_set1s.h


#ifndef BLIS_SET1S_H
#define BLIS_SET1S_H

// set1s: same forwarding pattern as set0s, but with the literals (1, 0).
#define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) )
#define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) )
#define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) )
#define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) )
#endif
// end bli_set1s.h
// begin bli_seti0s.h


#ifndef BLIS_SETI0S_H
#define BLIS_SETI0S_H

// seti0s: invoke the type-matched bli_?setis macro with a zero literal and the
// target a.
// NOTE(review): presumably this zeroes only the imaginary part of a -- confirm
// against bli_?setis.
#define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) )
#define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) )
#define bli_cseti0s( a ) bli_csetis( 0.0F, (a) )
#define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) )

#endif
// end bli_seti0s.h
// begin bli_sqrt2s.h


#ifndef BLIS_SQRT2S_H
#define BLIS_SQRT2S_H

// sqrt2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Component-wise variants: x and a are split with the bli_?real()/bli_?imag()
// accessors and passed to a bli_*sqrt2ris helper (defined elsewhere).
// The helper is picked from the type of a: s -> "s", d -> "d"; for a complex a
// it additionally depends on the domain of x: real x -> "sc" (a is c) or "dz"
// (a is z), complex x -> "c" or "z".
#define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) )
#define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) )
#define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) )
#define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) )

#define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) )
#define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) )

#define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 variants: the square-root routine is chosen from the type of x
// (sqrtf, sqrt, csqrtf, csqrt) and the result is cast to the type of a.
// When x is complex and a is real, only the real part of the complex root
// (taken with bli_creal/bli_zreal) is stored.
#define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; }
#define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; }
#define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); }
#define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); }

#define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; }
#define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; }
#define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); }
#define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); }

#define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; }
#define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; }
#define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; }
#define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; }

#define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; }
#define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; }
#define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; }
#define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type shorthands: x and a share the same type.
#define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a )
#define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a )
#define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a )
#define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a )

#endif
// end bli_sqrt2s.h
// begin bli_subs.h


#ifndef BLIS_SUBS_H
#define BLIS_SUBS_H

// subs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Real destinations (y of type s or d): a and y are split with the
// bli_?real()/bli_?imag() accessors and passed to the bli_ssubris or
// bli_dsubris helper (defined elsewhere), chosen by the type of y.
#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations without C99 complex: same forwarding pattern, using
// bli_csubris / bli_zsubris.
#define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations with C99 complex: plain in-place subtraction y -= a.
#define bli_scsubs( a, y ) { (y) -= (a); }
#define bli_dcsubs( a, y ) { (y) -= (a); }
#define bli_ccsubs( a, y ) { (y) -= (a); }
#define bli_zcsubs( a, y ) { (y) -= (a); }

#define bli_szsubs( a, y ) { (y) -= (a); }
#define bli_dzsubs( a, y ) { (y) -= (a); }
#define bli_czsubs( a, y ) { (y) -= (a); }
#define bli_zzsubs( a, y ) { (y) -= (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type shorthands: a and y share the same type.
#define bli_ssubs( a, y ) bli_sssubs( a, y )
#define bli_dsubs( a, y ) bli_ddsubs( a, y )
#define bli_csubs( a, y ) bli_ccsubs( a, y )
#define bli_zsubs( a, y ) bli_zzsubs( a, y )

#endif
// end bli_subs.h
// begin bli_subjs.h


#ifndef BLIS_SUBJS_H
#define BLIS_SUBJS_H

// subjs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// Real destinations: forwarded component-wise to bli_ssubjris / bli_dsubjris
// (defined elsewhere), chosen by the type of y.
#define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destinations without C99 complex: bli_csubjris / bli_zsubjris.
#define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czsubjs( a, y ) bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destinations with C99 complex: y -= conj(a). The conjugate is
// applied only when a is complex (conjf for type c, conj for type z); a real
// a is subtracted unchanged.
#define bli_scsubjs( a, y ) { (y) -= (a); }
#define bli_dcsubjs( a, y ) { (y) -= (a); }
#define bli_ccsubjs( a, y ) { (y) -= conjf(a); }
#define bli_zcsubjs( a, y ) { (y) -= conj (a); }

#define bli_szsubjs( a, y ) { (y) -= (a); }
#define bli_dzsubjs( a, y ) { (y) -= (a); }
#define bli_czsubjs( a, y ) { (y) -= conjf(a); }
#define bli_zzsubjs( a, y ) { (y) -= conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type shorthands: a and y share the same type.
#define bli_ssubjs( a, y ) bli_sssubjs( a, y )
#define bli_dsubjs( a, y ) bli_ddsubjs( a, y )
#define bli_csubjs( a, y ) bli_ccsubjs( a, y )
#define bli_zsubjs( a, y ) bli_zzsubjs( a, y )

#endif
// end bli_subjs.h
// begin bli_swaps.h


#ifndef BLIS_SWAPS_H
#define BLIS_SWAPS_H

// swaps

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

// Each macro exchanges x and y through a temporary w declared with the type
// of x: y is copied into w, x is copied into y, then w is copied into x, each
// step using the bli_??copys macro whose type chars match (source, dest).
// The temporary is a plain local named w inside a bare brace block, so an
// argument expression that itself refers to a variable called w would bind to
// the temporary instead of the caller's variable.

#define bli_ssswaps( x, y ) \
{ \
	float w; \
	bli_sscopys( (y), (w) ); \
	bli_sscopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_dsswaps( x, y ) \
{ \
	double w; \
	bli_sdcopys( (y), (w) ); \
	bli_dscopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_csswaps( x, y ) \
{ \
	scomplex w; \
	bli_sccopys( (y), (w) ); \
	bli_cscopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zsswaps( x, y ) \
{ \
	dcomplex w; \
	bli_szcopys( (y), (w) ); \
	bli_zscopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

#define bli_sdswaps( x, y ) \
{ \
	float w; \
	bli_dscopys( (y), (w) ); \
	bli_sdcopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_ddswaps( x, y ) \
{ \
	double w; \
	bli_ddcopys( (y), (w) ); \
	bli_ddcopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_cdswaps( x, y ) \
{ \
	scomplex w; \
	bli_dccopys( (y), (w) ); \
	bli_cdcopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zdswaps( x, y ) \
{ \
	dcomplex w; \
	bli_dzcopys( (y), (w) ); \
	bli_zdcopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

#define bli_scswaps( x, y ) \
{ \
	float w; \
	bli_cscopys( (y), (w) ); \
	bli_sccopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_dcswaps( x, y ) \
{ \
	double w; \
	bli_cdcopys( (y), (w) ); \
	bli_dccopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_ccswaps( x, y ) \
{ \
	scomplex w; \
	bli_cccopys( (y), (w) ); \
	bli_cccopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zcswaps( x, y ) \
{ \
	dcomplex w; \
	bli_czcopys( (y), (w) ); \
	bli_zccopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

#define bli_szswaps( x, y ) \
{ \
	float w; \
	bli_zscopys( (y), (w) ); \
	bli_szcopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_dzswaps( x, y ) \
{ \
	double w; \
	bli_zdcopys( (y), (w) ); \
	bli_dzcopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_czswaps( x, y ) \
{ \
	scomplex w; \
	bli_zccopys( (y), (w) ); \
	bli_czcopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zzswaps( x, y ) \
{ \
	dcomplex w; \
	bli_zzcopys( (y), (w) ); \
	bli_zzcopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

// Single-type shorthands: x and y share the same type.
#define bli_sswaps( x, y ) bli_ssswaps( x, y )
#define bli_dswaps( x, y ) bli_ddswaps( x, y )
#define bli_cswaps( x, y ) bli_ccswaps( x, y )
#define bli_zswaps( x, y ) bli_zzswaps( x, y )

#endif
// end bli_swaps.h
// begin bli_xpbys.h


#ifndef BLIS_XPBYS_H
#define BLIS_XPBYS_H

// xpbys

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// Mixed-type xpbys dispatch table, real destinations (y of type s or d).
// Every macro splits x, b and y into real/imaginary parts using the
// bli_?real()/bli_?imag() accessor that matches each operand's type char and
// forwards the six pieces to bli_rxxpbyris (defined elsewhere); the same helper
// is used for every type combination in these groups.
// NOTE(review): presumably the helper computes y = x + b * y -- confirm
// against the bli_rxxpbyris definition.

// -- (xby) = (??s) ------------------------------------------------------------

#define bli_sssxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dssxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cssxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zssxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_scsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szsxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzsxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czsxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzsxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sddxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dddxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cddxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zddxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_scdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_szdxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbys( x, b, y ) 
bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbys( x, b, y ) { 
(y) = (x) + (b) * (y); } #define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y ) #define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y ) #define bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y ) #define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y ) #endif // end bli_xpbys.h // begin bli_xpbyjs.h #ifndef BLIS_XPBYJS_H #define BLIS_XPBYJS_H // xpbyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Each bli_XYcopys_mxn() below visits every element of an m x n matrix x
// (row stride rs_x, column stride cs_x) together with the element at the
// same (i,j) position of y (strides rs_y, cs_y), and hands the pair to the
// element-wise bli_XYcopys(), which is defined outside this header.
//
// When BLIS_ENABLE_CR_CASES is defined, two special cases are tried first:
//   - rs_x == 1 && rs_y == 1: the row-stride terms are dropped from the
//     address computation and the row index is the inner loop;
//   - cs_x == 1 && cs_y == 1: the column-stride terms are dropped and the
//     column index is the inner loop.
// Otherwise (or when neither case matches) the general-stride loop runs
// with the row index innermost.

// xy = ?s

// x: float, y: float.
BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n,
	float* restrict x, const inc_t rs_x, const inc_t cs_x,
	float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	// Both matrices have unit row stride.
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	// Both matrices have unit column stride.
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	// This 'else' sits inside the #ifdef so that the block below is the
	// unconditional body when the special cases are compiled out.
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: double, y: float. Same three-way traversal as bli_sscopys_mxn.
BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n,
	double* restrict x, const inc_t rs_x, const inc_t cs_x,
	float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: scomplex, y: float.
BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n,
	scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: dcomplex, y: float.
BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n,
	dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// xy = ?d

// x: float, y: double.
BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n,
	float* restrict x, const inc_t rs_x, const inc_t cs_x,
	double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: double, y: double.
BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n,
	double* restrict x, const inc_t rs_x, const inc_t cs_x,
	double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: scomplex, y: double.
BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n,
	scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: dcomplex, y: double.
BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n,
	dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// xy = ?c

// x: float, y: scomplex.
BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n,
	float* restrict x, const inc_t rs_x, const inc_t cs_x,
	scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: double, y: scomplex.
BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n,
	double* restrict x, const inc_t rs_x, const inc_t cs_x,
	scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: scomplex, y: scomplex.
BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n,
	scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: dcomplex, y: scomplex.
BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n,
	dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// xy = ?z

// x: float, y: dcomplex.
BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n,
	float* restrict x, const inc_t rs_x, const inc_t cs_x,
	dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: double, y: dcomplex.
BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n,
	double* restrict x, const inc_t rs_x, const inc_t cs_x,
	dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: scomplex, y: dcomplex.
BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n,
	scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: dcomplex, y: dcomplex.
BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n,
	dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) );
	}
}

// Single-type-char aliases: bli_?copys_mxn forwards to the same-type
// bli_??copys_mxn (x and y share one type).
BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n,
	float* restrict x, const inc_t rs_x, const inc_t cs_x,
	float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
}

BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n,
	double* restrict x, const inc_t rs_x, const inc_t cs_x,
	double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
}

BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n,
	scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
}

BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n,
	dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
}

#endif
// end bli_copys_mxn.h

// begin bli_scal2s_mxn.h

#ifndef BLIS_SCAL2S_MXN_H
#define BLIS_SCAL2S_MXN_H

// scal2s_mxn

// GENTFUNC template for PASTEMAC(ch,scal2s_mxn). For every (i,j) of an
// m x n matrix it invokes PASTEMAC(ch,scal2js)( *alpha, x(i,j), y(i,j) )
// when bli_is_conj( conjx ) holds, and PASTEMAC(ch,scal2s)( ... ) otherwise.
// x and y are addressed with independent row/column strides; columns form
// the outer loop. The conjugation test is made once, outside both loops.
// NOTE(review): scal2s/scal2js are defined elsewhere; presumably they
// compute y := alpha * x and y := alpha * conj(x) -- confirm.
// NOTE(review): INSERT_GENTFUNC_BASIC0 presumably instantiates the
// template once per s/d/c/z type -- confirm against its definition.
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t conjx, \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \
       ctype* restrict y, const inc_t rs_y, const inc_t cs_y \
     ) \
{ \
	if ( bli_is_conj( conjx ) ) \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype* restrict xj = x + j*cs_x; \
			ctype* restrict yj = y + j*cs_y; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype* restrict xij = xj + i*rs_x; \
				ctype* restrict yij = yj + i*rs_y; \
\
				PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \
			} \
		} \
	} \
	else \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype* restrict xj = x + j*cs_x; \
			ctype* restrict yj = y + j*cs_y; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype* restrict xij = xj + i*rs_x; \
				ctype* restrict yij = yj + i*rs_y; \
\
				PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \
			} \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( scal2s_mxn )

#endif
// end bli_scal2s_mxn.h

// begin bli_xpbys_mxn.h

#ifndef BLIS_XPBYS_MXN_H
#define BLIS_XPBYS_MXN_H

// xpbys_mxn

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.

// Each bli_XBYxpbys_mxn() below first tests *beta with the bli_?eq0()
// matching beta's type. If that test holds, it delegates to the matching
// bli_XYcopys_mxn() and returns, so the element-wise xpbys is never applied
// to y in that case. Otherwise it applies the element-wise
// bli_XBYxpbys( x(i,j), *beta, y(i,j) ) to every element of the m x n
// region, using the same three-way traversal as the copys_mxn functions
// (two unit-stride special cases under BLIS_ENABLE_CR_CASES, then the
// general-stride loop).
// NOTE(review): the element-wise xpbys is defined elsewhere; presumably
// y := x + beta * y -- confirm.

// -- (xby) = (?ss) ------------------------------------------------------------

// x: float, beta: float, y: float.
BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n,
	float* restrict x, const inc_t rs_x, const inc_t cs_x,
	float* restrict beta,
	float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_seq0( *beta ) )
	{
		bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	// Both matrices have unit row stride.
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	// Both matrices have unit column stride.
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	// General strides; also the only path when the special cases are
	// compiled out.
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: double, beta: float, y: float.
BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n,
	double* restrict x, const inc_t rs_x, const inc_t cs_x,
	float* restrict beta,
	float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_seq0( *beta ) )
	{
		bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: scomplex, beta: float, y: float.
BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n,
	scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	float* restrict beta,
	float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_seq0( *beta ) )
	{
		bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: dcomplex, beta: float, y: float.
BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n,
	dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	float* restrict beta,
	float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_seq0( *beta ) )
	{
		bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// -- (xby) = (?dd) ------------------------------------------------------------

// x: float, beta: double, y: double.
BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n,
	float* restrict x, const inc_t rs_x, const inc_t cs_x,
	double* restrict beta,
	double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_deq0( *beta ) )
	{
		bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: double, beta: double, y: double.
BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n,
	double* restrict x, const inc_t rs_x, const inc_t cs_x,
	double* restrict beta,
	double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_deq0( *beta ) )
	{
		bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: scomplex, beta: double, y: double.
BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n,
	scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	double* restrict beta,
	double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_deq0( *beta ) )
	{
		bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: dcomplex, beta: double, y: double.
BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n,
	dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	double* restrict beta,
	double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_deq0( *beta ) )
	{
		bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// -- (xby) = (?cc) ------------------------------------------------------------

// x: float, beta: scomplex, y: scomplex.
BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n,
	float* restrict x, const inc_t rs_x, const inc_t cs_x,
	scomplex* restrict beta,
	scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_ceq0( *beta ) )
	{
		bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: double, beta: scomplex, y: scomplex.
BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n,
	double* restrict x, const inc_t rs_x, const inc_t cs_x,
	scomplex* restrict beta,
	scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_ceq0( *beta ) )
	{
		bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: scomplex, beta: scomplex, y: scomplex.
BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n,
	scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	scomplex* restrict beta,
	scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_ceq0( *beta ) )
	{
		bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: dcomplex, beta: scomplex, y: scomplex.
BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n,
	dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	scomplex* restrict beta,
	scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_ceq0( *beta ) )
	{
		bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// -- (xby) = (?zz) ------------------------------------------------------------

// x: float, beta: dcomplex, y: dcomplex.
BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n,
	float* restrict x, const inc_t rs_x, const inc_t cs_x,
	dcomplex* restrict beta,
	dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_zeq0( *beta ) )
	{
		bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: double, beta: dcomplex, y: dcomplex.
BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n,
	double* restrict x, const inc_t rs_x, const inc_t cs_x,
	dcomplex* restrict beta,
	dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_zeq0( *beta ) )
	{
		bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: scomplex, beta: dcomplex, y: dcomplex.
BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n,
	scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	dcomplex* restrict beta,
	dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_zeq0( *beta ) )
	{
		bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// x: dcomplex, beta: dcomplex, y: dcomplex.
BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n,
	dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	dcomplex* restrict beta,
	dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	// If beta is zero, overwrite y with x (in case y has infs or NaNs).
	if ( bli_zeq0( *beta ) )
	{
		bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y );
		return;
	}

#ifdef BLIS_ENABLE_CR_CASES
	if ( rs_x == 1 && rs_y == 1 )
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) );
	}
	else if ( cs_x == 1 && cs_y == 1 )
	{
		for ( dim_t ii = 0; ii < m; ++ii )
		for ( dim_t jj = 0; jj < n; ++jj )
		bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) );
	}
	else
#endif
	{
		for ( dim_t jj = 0; jj < n; ++jj )
		for ( dim_t ii = 0; ii < m; ++ii )
		bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) );
	}
}

// Single-type-char aliases: bli_?xpbys_mxn forwards to the bli_???xpbys_mxn
// in which x, beta and y all share that one type.
BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n,
	float* restrict x, const inc_t rs_x, const inc_t cs_x,
	float* restrict beta,
	float* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
}

BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n,
	double* restrict x, const inc_t rs_x, const inc_t cs_x,
	double* restrict beta,
	double* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
}

BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n,
	scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	scomplex* restrict beta,
	scomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
}

BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n,
	dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
	dcomplex* restrict beta,
	dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y )
{
	bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y );
}

#endif
// end bli_xpbys_mxn.h

// begin bli_xpbys_mxn_uplo.h

#ifndef BLIS_XPBYS_MXN_UPLO_H
#define BLIS_XPBYS_MXN_UPLO_H

// xpbys_mxn_u

// The _u macros restrict the xpbys_mxn operation to the elements (i,j) with
// j - i >= diagoff. The beta test is made once, up front: when the matching
// bli_?eq0( *beta ) holds, the element-wise copys is used for the selected
// elements; otherwise the element-wise xpbys is used.
// These are statement macros: each declares block-local _i and _j and
// expands its arguments unparenthesized and repeatedly, so callers should
// pass simple identifiers that do not use those names. beta must be a
// pointer (it is dereferenced in the expansion).

#define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
\
	if ( bli_seq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_sscopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

#define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
\
	if ( bli_deq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

#define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
\
	if ( bli_ceq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_cccopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

#define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
\
	if ( bli_zeq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i >= diagoff ) \
		{ \
			bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

// xpbys_mxn_l

// The _l macros are identical to the _u macros except that they select the
// elements (i,j) with j - i <= diagoff.

#define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
\
	if ( bli_seq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_sscopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

#define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
\
	if ( bli_deq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}
/*
 * bli_cccxpbys_mxn_l: scomplex update of the part of an m x n matrix y
 * selected by (j - i) <= diagoff, using the matching elements of x.
 *
 *   diagoff      - offset compared against (column index - row index).
 *   m, n         - loop bounds for rows (_i) and columns (_j).
 *   x, rs_x, cs_x - source pointer and its row/column strides.
 *   beta         - pointer to the scalar handed to bli_cccxpbys.
 *   y, rs_y, cs_y - destination pointer and its row/column strides.
 *
 * The macro expands to a brace block that declares locals _i and _j, so
 * arguments must not be spelled _i or _j, and every argument is expanded
 * textually each time it is used.
 * NOTE(review): bli_cccxpbys is defined elsewhere; by its name it
 * presumably computes y := x + beta * y -- confirm.
 */
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
	/* When *beta tests as zero, copy x into y instead of scaling y (presumably so stale NaN/Inf values in y are not propagated -- confirm). */ \
	if ( bli_ceq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_cccopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

/*
 * bli_zzzxpbys_mxn_l: dcomplex counterpart of bli_cccxpbys_mxn_l. Same
 * element selection, (j - i) <= diagoff, with the zero test done by
 * bli_zeq0 and the element operations by bli_zzcopys / bli_zzzxpbys.
 */
#define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
	/* When *beta tests as zero, copy x into y instead of scaling y. */ \
	if ( bli_zeq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

/*
 * Single-letter convenience names. Each one forwards all of its arguments,
 * unchanged and in the same order, to the three-letter macro in which all
 * three type characters are the same (s -> sss, d -> ddd, c -> ccc,
 * z -> zzz). The _u forms forward to the *_mxn_u macros and the _l forms
 * to the *_mxn_l macros.
 */
#define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}

#endif
// end bli_xpbys_mxn_uplo.h

// -- "broadcast B" scalar macros --

// begin bli_bcastbbs_mxn.h


#ifndef BLIS_BCASTBBS_MXN_H
#define BLIS_BCASTBBS_MXN_H

// bcastbbs_mxn

/*
 * GENTFUNC template for <ch>bcastbbs_mxn( m, n, y, incy, ldy ).
 *
 * For every (i, j) with 0 <= i < m and 0 <= j < n, the element at
 * y + i*incy + j*ldy is copied (via <ch>copys) into the d-1 unit-stride
 * slots that immediately follow it, where d is taken from ldy.
 * Nothing is written when d <= 1.
 * NOTE(review): this generator derives d from ldy, whereas scal2bbs_mxn and
 * set0bbs_mxn below derive it from incy; this looks intentional (the loop
 * order is also swapped) but should be confirmed against the callers.
 */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* d: number of consecutive slots per element (from ldy); ds_y: stride between those slots. */ \
	const dim_t d = ldy; \
	const dim_t ds_y = 1; \
\
	for ( dim_t i = 0; i < m; ++i ) \
	{ \
		ctype* restrict yi = y + i*incy; \
\
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype* restrict yij = yi + j*ldy; \
\
			for ( dim_t p = 1; p < d; ++p ) \
			{ \
				ctype* restrict yijd = yij + p*ds_y; \
\
				PASTEMAC(ch,copys)( *yij, *yijd ); \
			} \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( bcastbbs_mxn )

#endif
// end bli_bcastbbs_mxn.h
// begin bli_scal2bbs_mxn.h


#ifndef BLIS_SCAL2BBS_MXN_H
#define BLIS_SCAL2BBS_MXN_H

// scal2bbs_mxn

/*
 * GENTFUNCRO template for <ch>scal2bbs_mxn( conjx, m, n, alpha,
 *                                           x, incx, ldx, y, incy, ldy ).
 *
 * For every (i, j), y(i,j) at y + i*incy + j*ldy is produced from *alpha
 * and x(i,j) at x + i*incx + j*ldx by <ch>scal2js when conjx is flagged as
 * conjugate by bli_is_conj(), or by <ch>scal2s otherwise. The result is
 * then copied into the d-1 unit-stride slots following y(i,j), with d
 * taken from incy.
 * NOTE(review): scal2s / scal2js are defined elsewhere; presumably
 * y := alpha * x and y := alpha * conj(x) -- confirm.
 */
#undef GENTFUNCRO
#define GENTFUNCRO( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t conjx, \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, const inc_t incx, const inc_t ldx, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* d: number of consecutive slots per element (from incy); ds_y: stride between those slots. */ \
	const dim_t d = incy; \
	const dim_t ds_y = 1; \
\
	if ( bli_is_conj( conjx ) ) \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype* restrict xj = x + j*ldx; \
			ctype* restrict yj = y + j*ldy; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype* restrict xij = xj + i*incx; \
				ctype* restrict yij = yj + i*incy; \
\
				PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \
\
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype* restrict yijd = yij + p*ds_y; \
\
					PASTEMAC(ch,copys)( *yij, *yijd ); \
				} \
			} \
		} \
	} \
	else \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype* restrict xj = x + j*ldx; \
			ctype* restrict yj = y + j*ldy; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype* restrict xij = xj + i*incx; \
				ctype* restrict yij = yj + i*incy; \
\
				PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \
\
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype* restrict yijd = yij + p*ds_y; \
\
					PASTEMAC(ch,copys)( *yij, *yijd ); \
				} \
			} \
		} \
	} \
}

INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn )


/*
 * GENTFUNCCO template for <ch>scal2bbs_mxn with the same parameter list as
 * the GENTFUNCRO version above, but operating on separate real and
 * imaginary parts through ctype_r pointers.
 *
 * - alpha and x are read as adjacent (real, imaginary) pairs of ctype_r.
 * - In y, the imaginary part of an element is addressed d ctype_r slots
 *   after its real part (psi_i = (ctype_r*)y + 1*d), with d taken from incy.
 * - All strides are doubled because the pointer arithmetic is done in
 *   units of ctype_r rather than ctype.
 * - Each element is computed by <ch>scal2jris (conjx flagged as conjugate)
 *   or <ch>scal2ris, and then both parts are copied into the d-1
 *   unit-stride slots that follow them.
 */
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t conjx, \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, const inc_t incx, const inc_t ldx, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* d: number of consecutive slots per element part (from incy); ds_y: stride between those slots. */ \
	const dim_t d = incy; \
	const dim_t ds_y = 1; \
\
	const inc_t incx2 = 2 * incx; \
	const inc_t ldx2 = 2 * ldx; \
\
	const inc_t incy2 = 2 * incy; \
	const inc_t ldy2 = 2 * ldy; \
\
	ctype_r* restrict alpha_r = ( ctype_r* )alpha; \
	ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \
	ctype_r* restrict chi_r = ( ctype_r* )x; \
	ctype_r* restrict chi_i = ( ctype_r* )x + 1; \
	ctype_r* restrict psi_r = ( ctype_r* )y; \
	ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \
\
	if ( bli_is_conj( conjx ) ) \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype_r* restrict chij_r = chi_r + j*ldx2; \
			ctype_r* restrict chij_i = chi_i + j*ldx2; \
			ctype_r* restrict psij_r = psi_r + j*ldy2; \
			ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype_r* restrict chiij_r = chij_r + i*incx2; \
				ctype_r* restrict chiij_i = chij_i + i*incx2; \
				ctype_r* restrict psiij_r = psij_r + i*incy2; \
				ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
				PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \
				                        *chiij_r, *chiij_i, \
				                        *psiij_r, *psiij_i ); \
\
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
					ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
					PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \
					                      *psiijd_r, *psiijd_i ); \
				} \
			} \
		} \
	} \
	else \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype_r* restrict chij_r = chi_r + j*ldx2; \
			ctype_r* restrict chij_i = chi_i + j*ldx2; \
			ctype_r* restrict psij_r = psi_r + j*ldy2; \
			ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype_r* restrict chiij_r = chij_r + i*incx2; \
				ctype_r* restrict chiij_i = chij_i + i*incx2; \
				ctype_r* restrict psiij_r = psij_r + i*incy2; \
				ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
				PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \
				                       *chiij_r, *chiij_i, \
				                       *psiij_r, *psiij_i ); \
\
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
					ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
					PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \
					                      *psiijd_r, *psiijd_i ); \
				} \
			} \
		} \
	} \
}

INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn )

#endif
// end bli_scal2bbs_mxn.h
// begin bli_set0bbs_mxn.h


#ifndef BLIS_SET0BBS_MXN_H
#define BLIS_SET0BBS_MXN_H

// set0bbs_mxn

/*
 * GENTFUNC template for <ch>set0bbs_mxn( m, n, y, incy, ldy ).
 *
 * For every (i, j), applies <ch>set0s to the element at
 * y + i*incy + j*ldy and to the d-1 unit-stride slots that follow it
 * (the inner loop starts at p = 0, so the element itself is included),
 * with d taken from incy.
 */
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t m, \
       const dim_t n, \
       ctype* restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* d: number of consecutive slots per element (from incy); ds_y: stride between those slots. */ \
	const dim_t d = incy; \
	const dim_t ds_y = 1; \
\
	for ( dim_t j = 0; j < n; ++j ) \
	{ \
		ctype* restrict yj = y + j*ldy; \
\
		for ( dim_t i = 0; i < m; ++i ) \
		{ \
			ctype* restrict yij = yj + i*incy; \
\
			for ( dim_t p = 0; p < d; ++p ) \
			{ \
				ctype* restrict yijd = yij + p*ds_y; \
\
				PASTEMAC(ch,set0s)( *yijd ); \
			} \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( set0bbs_mxn )

#endif
// end bli_set0bbs_mxn.h

// -- 1m-specific scalar macros --

// 1e
// begin bli_copy1es.h


#ifndef BLIS_COPY1ES_H
#define BLIS_COPY1ES_H

// copy1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
/*
 * bli_<x><y>copy1es( a, bri, bir )
 *
 *   a   - source scalar (type char <x>).
 *   bri - first destination scalar (type char <y>).
 *   bir - second destination scalar (type char <y>).
 *
 * When the destination type is real (s or d), or the source type is real,
 * the macro expands to an empty block and does nothing.
 * When both are complex, the macro makes two <x><y>copyris calls:
 *   bri is given ( real(a), imag(a) ) and
 *   bir is given ( -imag(a), real(a) ).
 * NOTE(review): copyris is defined elsewhere; if it is a component-wise
 * copy then bri = a and bir = i * a -- confirm.
 * Each `a` is expanded several times, so it should be free of side effects.
 */

// Real destination (s): no-ops.
#define bli_sscopy1es( a, bri, bir ) {}
#define bli_dscopy1es( a, bri, bir ) {}
#define bli_cscopy1es( a, bri, bir ) {}
#define bli_zscopy1es( a, bri, bir ) {}

// Real destination (d): no-ops.
#define bli_sdcopy1es( a, bri, bir ) {}
#define bli_ddcopy1es( a, bri, bir ) {}
#define bli_cdcopy1es( a, bri, bir ) {}
#define bli_zdcopy1es( a, bri, bir ) {}

// scomplex destination: no-ops for real sources, two copies for complex ones.
#define bli_sccopy1es( a, bri, bir ) {}
#define bli_dccopy1es( a, bri, bir ) {}
#define bli_cccopy1es( a, bri, bir ) \
{ \
	bli_cccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopy1es( a, bri, bir ) \
{ \
	bli_zccopyris( bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

// dcomplex destination: no-ops for real sources, two copies for complex ones.
#define bli_szcopy1es( a, bri, bir ) {}
#define bli_dzcopy1es( a, bri, bir ) {}
#define bli_czcopy1es( a, bri, bir ) \
{ \
	bli_czcopyris( bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopy1es( a, bri, bir ) \
{ \
	bli_zzcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-letter names forward to the same-type (cc / zz) variants.
#define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir )
#define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir )

#endif
// end bli_copy1es.h
// begin bli_copyj1es.h


#ifndef BLIS_COPYJ1ES_H
#define BLIS_COPYJ1ES_H

// copyj1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
/*
 * bli_<x><y>copyj1es( a, bri, bir )
 *
 *   a   - source scalar (type char <x>).
 *   bri - first destination scalar (type char <y>).
 *   bir - second destination scalar (type char <y>).
 *
 * When the destination type is real (s or d), or the source type is real,
 * the macro expands to an empty block and does nothing.
 * When both are complex, the macro makes two <x><y>copyris calls:
 *   bri is given ( real(a), -imag(a) ) and
 *   bir is given ( imag(a),  real(a) ).
 * NOTE(review): copyris is defined elsewhere; if it is a component-wise
 * copy then bri = conj(a) and bir = i * conj(a) -- confirm.
 * Each `a` is expanded several times, so it should be free of side effects.
 */

// Real destination (s): no-ops.
#define bli_sscopyj1es( a, bri, bir ) {}
#define bli_dscopyj1es( a, bri, bir ) {}
#define bli_cscopyj1es( a, bri, bir ) {}
#define bli_zscopyj1es( a, bri, bir ) {}

// Real destination (d): no-ops.
#define bli_sdcopyj1es( a, bri, bir ) {}
#define bli_ddcopyj1es( a, bri, bir ) {}
#define bli_cdcopyj1es( a, bri, bir ) {}
#define bli_zdcopyj1es( a, bri, bir ) {}

// scomplex destination: no-ops for real sources, two copies for complex ones.
#define bli_sccopyj1es( a, bri, bir ) {}
#define bli_dccopyj1es( a, bri, bir ) {}
#define bli_cccopyj1es( a, bri, bir ) \
{ \
	bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_cccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopyj1es( a, bri, bir ) \
{ \
	bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_zccopyris( bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

// dcomplex destination: no-ops for real sources, two copies for complex ones.
#define bli_szcopyj1es( a, bri, bir ) {}
#define bli_dzcopyj1es( a, bri, bir ) {}
#define bli_czcopyj1es( a, bri, bir ) \
{ \
	bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_czcopyris( bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopyj1es( a, bri, bir ) \
{ \
	bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_zzcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-letter names forward to the same-type (cc / zz) variants.
#define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir )
#define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir )

#endif
// end bli_copyj1es.h
// begin bli_invert1es.h


#ifndef BLIS_INVERT1ES_H
#define BLIS_INVERT1ES_H

// invert1es

/*
 * bli_?invert1es( bri, bir )
 *
 * First applies ?invertris to the real and imaginary parts of bri (in
 * place), then calls ?copyris with sources ( real(bri), -imag(bri) ) and
 * destinations ( imag(bir), real(bir) ). Note that the destination
 * arguments are passed imaginary-first.
 * NOTE(review): if copyris copies component-wise, bir ends up as
 * ( -imag(bri), real(bri) ), i.e. i * bri after inversion -- confirm.
 */
#define bli_cinvert1es( bri, bir ) \
{ \
	bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \
	bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \
}

#define bli_zinvert1es( bri, bir ) \
{ \
	bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \
	bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \
}

#endif
// end bli_invert1es.h
// begin bli_scal1es.h


#ifndef BLIS_SCAL1ES_H
#define BLIS_SCAL1ES_H

// scal1es

/*
 * bli_?scal1es( a, yri, yir )
 *
 * Applies ?scalris to yri in place, using the real and imaginary parts of
 * a, and then calls ?copyris with sources ( -imag(yri), real(yri) ) and
 * destinations ( real(yir), imag(yir) ).
 * NOTE(review): scalris / copyris are defined elsewhere; presumably
 * yri := a * yri followed by yir := i * yri -- confirm.
 */
#define bli_cscal1es( a, yri, yir ) \
{ \
	bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \
	bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \
}

#define bli_zscal1es( a, yri, yir ) \
{ \
	bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \
	bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \
}

#endif
// end bli_scal1es.h
// begin bli_scal21es.h


#ifndef BLIS_SCAL21ES_H
#define BLIS_SCAL21ES_H

// scal21es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

/*
 * bli_<a><x><y>scal21es( a, x, yri, yir )
 *
 * Variants with a real destination (y = s or d), and variants where both
 * a and x are real, expand to an empty block and do nothing.
 * Every other variant makes two bli_cxscal2ris calls that share the same
 * scalar ( real(a), imag(a) ):
 *   yri is produced from ( real(x), imag(x) ) and
 *   yir is produced from ( -imag(x), real(x) ).
 * NOTE(review): bli_cxscal2ris is defined elsewhere; presumably it computes
 * y := a * x on split real/imaginary parts, giving yri = a * x and
 * yir = a * (i * x) -- confirm.
 * NOTE(review): the (??c) variants access the destination through
 * bli_zreal / bli_zimag rather than bli_creal / bli_cimag; the code is left
 * as found -- confirm that the accessors are interchangeable for scomplex.
 * Arguments are expanded several times, so they should be free of side
 * effects.
 */

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal21es( a, x, yri, yir ) {}
#define bli_sdsscal21es( a, x, yri, yir ) {}
#define bli_scsscal21es( a, x, yri, yir ) {}
#define bli_szsscal21es( a, x, yri, yir ) {}

#define bli_dssscal21es( a, x, yri, yir ) {}
#define bli_ddsscal21es( a, x, yri, yir ) {}
#define bli_dcsscal21es( a, x, yri, yir ) {}
#define bli_dzsscal21es( a, x, yri, yir ) {}

#define bli_cssscal21es( a, x, yri, yir ) {}
#define bli_cdsscal21es( a, x, yri, yir ) {}
#define bli_ccsscal21es( a, x, yri, yir ) {}
#define bli_czsscal21es( a, x, yri, yir ) {}

#define bli_zssscal21es( a, x, yri, yir ) {}
#define bli_zdsscal21es( a, x, yri, yir ) {}
#define bli_zcsscal21es( a, x, yri, yir ) {}
#define bli_zzsscal21es( a, x, yri, yir ) {}

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal21es( a, x, yri, yir ) {}
#define bli_sddscal21es( a, x, yri, yir ) {}
#define bli_scdscal21es( a, x, yri, yir ) {}
#define bli_szdscal21es( a, x, yri, yir ) {}

#define bli_dsdscal21es( a, x, yri, yir ) {}
#define bli_dddscal21es( a, x, yri, yir ) {}
#define bli_dcdscal21es( a, x, yri, yir ) {}
#define bli_dzdscal21es( a, x, yri, yir ) {}

#define bli_csdscal21es( a, x, yri, yir ) {}
#define bli_cddscal21es( a, x, yri, yir ) {}
#define bli_ccdscal21es( a, x, yri, yir ) {}
#define bli_czdscal21es( a, x, yri, yir ) {}

#define bli_zsdscal21es( a, x, yri, yir ) {}
#define bli_zddscal21es( a, x, yri, yir ) {}
#define bli_zcdscal21es( a, x, yri, yir ) {}
#define bli_zzdscal21es( a, x, yri, yir ) {}

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal21es( a, x, yri, yir ) {}
#define bli_sdcscal21es( a, x, yri, yir ) {}
#define bli_sccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dscscal21es( a, x, yri, yir ) {}
#define bli_ddcscal21es( a, x, yri, yir ) {}
#define bli_dccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cscscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zscscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal21es( a, x, yri, yir ) {}
#define bli_sdzscal21es( a, x, yri, yir ) {}
#define bli_sczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dszscal21es( a, x, yri, yir ) {}
#define bli_ddzscal21es( a, x, yri, yir ) {}
#define bli_dczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cszscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zszscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// Single-letter names forward to the same-type (ccc / zzz) variants.
#define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir )
#define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir )

#endif
// end bli_scal21es.h
// begin bli_scal2j1es.h


#ifndef BLIS_SCAL2J1ES_H
#define BLIS_SCAL2J1ES_H

// scal2j1es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) #define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) #endif // end bli_scal2j1es.h // 1r // begin bli_copy1rs.h #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copy1rs.h // begin bli_copyj1rs.h #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copyj1rs.h // begin bli_invert1rs.h #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif // end bli_invert1rs.h // begin bli_scal1rs.h #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), 
bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif // end bli_scal1rs.h // begin bli_scal21rs.h #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif // end bli_scal21rs.h // begin bli_scal2j1rs.h #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif // end bli_scal2j1rs.h // 1m (1e or 1r) // begin bli_invert1ms_mxn_diag.h #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t 
i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_invert1ms_mxn_diag.h // begin bli_scal1ms_mxn.h #ifndef BLIS_SCAL1MS_MXN_H #define BLIS_SCAL1MS_MXN_H // scal1ms_mxn #define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; 
\ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #endif // end bli_scal1ms_mxn.h // begin bli_scal21ms_mxn.h #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } 
else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), 
*(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif // end bli_scal21ms_mxn.h // begin bli_scal21ms_mxn_diag.h #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_scal21ms_mxn_diag.h // begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING //thread communicators may be implementation dependent #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_single.h // begin bli_thrcomm_openmp.h #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H // Define thrcomm_t for situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #include // skipped // Define thrcomm_t for tree barriers and non-tree barriers. #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. 
//volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif // end bli_thrcomm_openmp.h // begin bli_thrcomm_pthreads.h #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H // Define thrcomm_t for situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS #ifdef BLIS_USE_PTHREAD_BARRIER struct thrcomm_s { void* sent_object; dim_t n_threads; bli_pthread_barrier_t barrier; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_pthreads.h // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. 
// Declarations only; none of these functions is defined in this section.
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads );
void       bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
void       bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm );
void       bli_thrcomm_cleanup( thrcomm_t* comm );

BLIS_EXPORT_BLIS void  bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );

void       bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );

#endif
// end bli_thrcomm.h

// Include thread info (thrinfo_t) object definitions and prototypes.
// begin bli_thrinfo.h

#ifndef BLIS_THRINFO_H
#define BLIS_THRINFO_H

// Thread info structure definition
struct thrinfo_s
{
    // The thread communicator for the other threads sharing the same work
    // at this level.
    thrcomm_t*         ocomm;

    // Our thread id within the ocomm thread communicator.
    dim_t              ocomm_id;

    // The number of distinct threads used to parallelize the loop.
    dim_t              n_way;

    // What we're working on.
    dim_t              work_id;

    // When freeing, should the communicators in this node be freed? Usually,
    // this field is true, but when nodes are created that share the same
    // communicators as other nodes (such as with packm nodes), this is set
    // to false.
    bool               free_comm;

    // The bszid_t to help identify the node. This is mostly only useful when
    // debugging or tracing the allocation and release of thrinfo_t nodes.
    bszid_t            bszid;

    // Links to other thrinfo_t nodes (a "prenode" and a regular sub-node).
    struct thrinfo_s*  sub_prenode;
    struct thrinfo_s*  sub_node;
};
typedef struct thrinfo_s thrinfo_t;

//
// thrinfo_t functions
// NOTE: The naming of these should be made consistent at some point.
// (ie: bli_thrinfo_ vs.
bli_thread_) // // thrinfo_t query (field only) BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) { return (t->ocomm)->n_threads; } BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) { return t->ocomm_id; } BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) { return t->n_way; } BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t ) { return t->work_id; } BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) { return t->ocomm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) { return t->free_comm; } BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) { return t->bszid; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) { return t->sub_node; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) { return t->ocomm_id == 0; } // thrinfo_t modification BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) { t->ocomm = ocomm; } BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) { t->ocomm_id = ocomm_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) { t->n_way = n_way; } BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t ) { t->work_id = work_id; } BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) { t->free_comm = free_comm; } BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) { t->bszid = bszid; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) { t->sub_node = sub_node; } BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) { t->sub_prenode = sub_prenode; } // other thrinfo_t-related functions BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } // // Prototypes 
// for level-3 thrinfo functions not specific to any operation.
//
// Declarations only; none of these functions is defined in this section.

// Takes a rntm_t plus the same field values as bli_thrinfo_init() and returns
// a thrinfo_t pointer.
thrinfo_t* bli_thrinfo_create
     (
       rntm_t*    rntm,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

// Takes an existing thrinfo_t ('thread') and one value per thrinfo_t field
// (except sub_prenode).
void bli_thrinfo_init
     (
       thrinfo_t* thread,
       thrcomm_t* ocomm,
       dim_t      ocomm_id,
       dim_t      n_way,
       dim_t      work_id,
       bool       free_comm,
       bszid_t    bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init_single
     (
       thrinfo_t* thread
     );

void bli_thrinfo_free
     (
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// -----------------------------------------------------------------------------

void bli_thrinfo_grow
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_rgrow
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_rgrow_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl_prenode
     (
       rntm_t*    rntm,
       cntl_t*    cntl_par,
       cntl_t*    cntl_chl,
       thrinfo_t* thread_par
     );

// -----------------------------------------------------------------------------

// Disabled declarations, excluded from compilation by "#if 0".
#if 0
void bli_thrinfo_grow_tree
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

void bli_thrinfo_grow_tree_ic
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );
#endif

#endif
// end bli_thrinfo.h
// begin bli_thrinfo_sup.h

#ifndef BLIS_THRINFO_SUP_H
#define BLIS_THRINFO_SUP_H

//
// Prototypes for level-3 thrinfo sup functions.
//
// These mirror bli_thrinfo_grow / _rgrow / _create_for_cntl above, but take
// bszid_t pointers where those take cntl_t pointers.

void bli_thrinfo_sup_grow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_sup_rgrow
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_sup_create_for_cntl
     (
       rntm_t*    rntm,
       bszid_t*   bszid_par,
       bszid_t*   bszid_chl,
       thrinfo_t* thread_par
     );

#endif
// end bli_thrinfo_sup.h

// Include some operation-specific thrinfo_t prototypes.
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
// Takes a level-3 internal function pointer (l3int_t), an operation family
// id, the operand objects, and the context / runtime / control-tree pointers.
// NOTE(review): presumably invokes func once per thread; the definition is
// not visible here -- confirm against the threading-specific source files.
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t  family,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// Include definitions specific to the method of multithreading for the
// conventional code path.
// begin bli_l3_decor_single.h

#ifndef BLIS_L3_DECOR_SINGLE_H
#define BLIS_L3_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (This section is currently empty.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif
// end bli_l3_decor_single.h
// begin bli_l3_decor_openmp.h

#ifndef BLIS_L3_DECOR_OPENMP_H
#define BLIS_L3_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

// Only declared when BLIS_ENABLE_OPENMP is defined.
void bli_l3_thread_decorator_thread_check
     (
       dim_t      n_threads,
       dim_t      tid,
       thrcomm_t* gl_comm,
       rntm_t*    rntm
     );

#endif

#endif
// end bli_l3_decor_openmp.h
// begin bli_l3_decor_pthreads.h

#ifndef BLIS_L3_DECOR_PTHREADS_H
#define BLIS_L3_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
// The void* (void*) signature matches the start_routine parameter of
// bli_pthread_create().
void* bli_l3_thread_entry( void* data_void );

#endif

#endif
// end bli_l3_decor_pthreads.h

#endif
// end bli_l3_decor.h

// Include the level-3 thread decorator and related definitions and prototypes
// for the sup code path.
// begin bli_l3_sup_decor.h

#ifndef BLIS_L3_SUP_DECOR_H
#define BLIS_L3_SUP_DECOR_H

// -- sup definitions ----------------------------------------------------------

// Level-3 sup internal function type.
// Unlike l3int_t, this type returns err_t and takes no cntl_t* argument.
typedef err_t (*l3supint_t)
     (
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm,
       thrinfo_t* thread
     );

// Level-3 sup thread decorator prototype.
err_t bli_l3_sup_thread_decorator
     (
       l3supint_t func,
       opid_t     family,
       obj_t*     alpha,
       obj_t*     a,
       obj_t*     b,
       obj_t*     beta,
       obj_t*     c,
       cntx_t*    cntx,
       rntm_t*    rntm
     );

// Include definitions specific to the method of multithreading for the
// sup code path.
// begin bli_l3_sup_decor_single.h

#ifndef BLIS_L3_SUP_DECOR_SINGLE_H
#define BLIS_L3_SUP_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (This section is currently empty.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif
// end bli_l3_sup_decor_single.h
// begin bli_l3_sup_decor_openmp.h

#ifndef BLIS_L3_SUP_DECOR_OPENMP_H
#define BLIS_L3_SUP_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
// (This section is currently empty.)
#ifdef BLIS_ENABLE_OPENMP

#endif

#endif
// end bli_l3_sup_decor_openmp.h
// begin bli_l3_sup_decor_pthreads.h

#ifndef BLIS_L3_SUP_DECOR_PTHREADS_H
#define BLIS_L3_SUP_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
void* bli_l3_sup_thread_entry( void* data_void );

#endif

#endif
// end bli_l3_sup_decor_pthreads.h

#endif
// end bli_l3_sup_decor.h

// Initialization-related prototypes.
void bli_thread_init( void );
void bli_thread_finalize( void );

// Thread range-related prototypes.
BLIS_EXPORT_BLIS void bli_thread_range_sub ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end ); #undef GENPROT #define GENPROT( opname ) \ \ siz_t PASTEMAC0( opname ) \ ( \ dir_t direct, \ thrinfo_t* thr, \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntl_t* cntl, \ cntx_t* cntx, \ dim_t* start, \ dim_t* end \ ); GENPROT( thread_range_mdim ) GENPROT( thread_range_ndim ) #undef GENPROT #define GENPROT( opname ) \ \ siz_t PASTEMAC0( opname ) \ ( \ thrinfo_t* thr, \ obj_t* a, \ blksz_t* bmult, \ dim_t* start, \ dim_t* end \ ); GENPROT( thread_range_l2r ) GENPROT( thread_range_r2l ) GENPROT( thread_range_t2b ) GENPROT( thread_range_b2t ) GENPROT( thread_range_weighted_l2r ) GENPROT( thread_range_weighted_r2l ) GENPROT( thread_range_weighted_t2b ) GENPROT( thread_range_weighted_b2t ) dim_t bli_thread_range_width_l ( doff_t diagoff_j, dim_t m, dim_t n_j, dim_t j, dim_t n_way, dim_t bf, dim_t bf_left, double area_per_thr, bool handle_edge_low ); siz_t bli_find_area_trap_l ( dim_t m, dim_t n, doff_t diagoff ); siz_t bli_thread_range_weighted_sub ( thrinfo_t* restrict thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* restrict j_start_thr, dim_t* restrict j_end_thr ); // ----------------------------------------------------------------------------- // Factorization and partitioning prototypes typedef struct { dim_t n; dim_t sqrt_n; dim_t f; } bli_prime_factors_t; void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors); dim_t bli_next_prime_factor(bli_prime_factors_t* factors); bool bli_is_prime( dim_t n ); void bli_thread_partition_2x2 ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_slow ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_fast ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); // 
----------------------------------------------------------------------------- dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); // ----------------------------------------------------------------------------- BLIS_INLINE void bli_thread_range_jrir_rr ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; } BLIS_INLINE void bli_thread_range_jrir_sl ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use contiguous slab partitioning of jr/ir loops. bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); *inc = 1; } BLIS_INLINE void bli_thread_range_jrir ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Define a general-purpose version of bli_thread_range_jrir() whose // definition depends on whether slab or round-robin partitioning was // requested at configure-time. 
#ifdef BLIS_ENABLE_JRIR_SLAB bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); #else bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); #endif } #if 0 BLIS_INLINE void bli_thread_range_weighted_jrir ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { #ifdef BLIS_ENABLE_JRIR_SLAB // Use contiguous slab partitioning for jr/ir loops. bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, handle_edge_low, start, end ); *start = *start / bf; *inc = 1; if ( *end % bf ) *end = *end / bf + 1; else *end = *end / bf; #else // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; #endif } #endif #endif // end bli_thread.h // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t     bli_pthread_barrier_t;
typedef pthread_barrierattr_t bli_pthread_barrierattr_t;

#endif

// -- pthreads macros --

// Static initializers, mapped directly onto the pthreads ones.
#define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#define BLIS_PTHREAD_COND_INITIALIZER  PTHREAD_COND_INITIALIZER
#define BLIS_PTHREAD_ONCE_INIT         PTHREAD_ONCE_INIT

#endif

// -- Function definitions -----------------------------------------------------

// The prototypes below are declared once, outside the platform conditional
// above, and use only the bli_pthread_* types.
// NOTE(review): the int return values presumably follow the pthreads
// convention of 0 on success -- confirm against the implementation.

// -- pthread_create(), pthread_join() --

BLIS_EXPORT_BLIS int bli_pthread_create
     (
       bli_pthread_t*            thread,
       const bli_pthread_attr_t* attr,
       void*                   (*start_routine)(void*),
       void*                     arg
     );

BLIS_EXPORT_BLIS int bli_pthread_join
     (
       bli_pthread_t thread,
       void**        retval
     );

// -- pthread_mutex_*() --

BLIS_EXPORT_BLIS int bli_pthread_mutex_init
     (
       bli_pthread_mutex_t*           mutex,
       const bli_pthread_mutexattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_lock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock
     (
       bli_pthread_mutex_t* mutex
     );

// -- pthread_cond_*() --

BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t*           cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t*  cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void              (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//     __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t*           barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int                     count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h

// -- Constant definitions --

// begin bli_extern_defs.h

#ifndef BLIS_EXTERN_DEFS_H
#define BLIS_EXTERN_DEFS_H

// Exported global objects; only declared here, defined elsewhere.
// NOTE(review): the names suggest scalar constants (2, 1, 0, -1, -2) and
// single-threaded thrcomm/thrinfo instances -- confirm against their
// definitions.
BLIS_EXPORT_BLIS extern obj_t BLIS_TWO;
BLIS_EXPORT_BLIS extern obj_t BLIS_ONE;
//BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF;
BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO;
//BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF;
BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE;
BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO;

BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM;
BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED;
BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED;

#endif
// end bli_extern_defs.h

// -- BLIS architecture/kernel definitions --

// begin bli_l1v_ker_prot.h

//
// Define template prototypes for level-1v kernels.
//

// Each macro below expands to a single prototype,
//
//   void PASTEMAC(ch,opname)( ... );
//
// whose pointer parameters use the element type given by ctype.
//
// NOTE: Every definition ends at the closing ");" with no trailing
// line-continuation backslash. A trailing backslash would splice whatever
// line follows into the macro body, so the definition would only be correct
// while a blank line happened to follow it.

#define ADDV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define AMAXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t* restrict cntx \
     );

#define AXPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define AXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define COPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define DOTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx \
     );

#define DOTXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx \
     );

#define INVERTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx \
     );

#define SCALV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx \
     );

#define SCAL2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define SETV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx \
     );

#define SUBV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define SWAPV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define XPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

// end bli_l1v_ker_prot.h
// begin bli_l1f_ker_prot.h

//
// Define template prototypes for level-1f kernels.
// #define AXPY2V_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alphax, \ ctype* restrict alphay, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define AXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define DOTAXPYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); // end bli_l1f_ker_prot.h // begin bli_l1m_ker_prot.h // // Define template prototypes for level-1m kernels. 
//

// Each *_PROT macro in this section expands to a single prototype,
//
//   void PASTEMAC(ch,<name>)( ... );
//
// whose pointer parameters are restrict-qualified.

// native packm kernels

#define PACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx \
     );

// native unpackm kernels

// Compared with PACKM_KER_PROT, this template has no schema, cdim or n_max
// parameters, and p is listed before a.
#define UNPACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       dim_t            n, \
       ctype*  restrict kappa, \
       ctype*  restrict p,             inc_t ldp, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       cntx_t* restrict cntx \
     );

// 1e/1r packm kernels

// Same parameter list as PACKM_KER_PROT.
#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx \
     );

// end bli_l1m_ker_prot.h
// begin bli_l3_ukr_prot.h

//
// Define template prototypes for level-3 micro-kernels.
//

// Single-type convenience form: forwards to GEMM_UKR_PROT2 with the same
// ctype used for both the input and the output type.
#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)

// Two-type form: a and b use ctype_in; alpha, beta and c use ctype_out.
#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype_out* restrict alpha, \
       ctype_in*  restrict a, \
       ctype_in*  restrict b, \
       ctype_out* restrict beta, \
       ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     );

#define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a1x, \
       ctype*     restrict a11, \
       ctype*     restrict bx1, \
       ctype*     restrict b11, \
       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     );

// Unlike the gemm and gemmtrsm templates, this one takes no m, n or k
// dimension parameters.
#define TRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     );

// end bli_l3_ukr_prot.h
// begin bli_l3_sup_ker_prot.h

//
// Define template prototypes for level-3 kernels on small/unpacked matrices.
//

// Each of a, b and c is followed by its own pair of strides (rs_*, cs_*).
#define GEMMSUP_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx \
     );

// end bli_l3_sup_ker_prot.h

// begin bli_arch_config_pre.h

#ifndef BLIS_ARCH_CONFIG_PRE_H
#define BLIS_ARCH_CONFIG_PRE_H

// -- Naming-related kernel definitions ----------------------------------------

// The default suffix appended to reference kernels.
#define BLIS_REF_SUFFIX  _ref

// A suffix used for labeling certain induced method aware functions.
#define BLIS_IND_SUFFIX  _ind

// Add an underscore to the BLIS kernel set string, if it was defined.
// BLIS_CNAME_INFIX is left undefined when BLIS_CNAME is not defined.
#ifdef  BLIS_CNAME
#define BLIS_CNAME_INFIX  PASTECH(_,BLIS_CNAME)
#endif

// Combine the CNAME and _ref for convenience to the code that defines
// reference kernels.
//#define BLIS_CNAME_REF_SUFFIX  PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX)

// -- Prototype-generating macro definitions -----------------------------------

// Prototype-generating macro for bli_cntx_init_*() functions.
#define CNTX_INIT_PROTS( archname ) \ \ void PASTEMAC(cntx_init_,archname) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \ ( \ ind_t method, \ cntx_t* cntx \ ); #endif // end bli_arch_config_pre.h // begin bli_arch_config.h #ifndef BLIS_ARCH_CONFIG_H #define BLIS_ARCH_CONFIG_H // // -- Context initialization prototypes ---------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_CONFIG_SKX CNTX_INIT_PROTS( skx ) #endif #ifdef BLIS_CONFIG_KNL CNTX_INIT_PROTS( knl ) #endif #ifdef BLIS_CONFIG_KNC CNTX_INIT_PROTS( knc ) #endif #ifdef BLIS_CONFIG_HASWELL CNTX_INIT_PROTS( haswell ) #endif #ifdef BLIS_CONFIG_SANDYBRIDGE CNTX_INIT_PROTS( sandybridge ) #endif #ifdef BLIS_CONFIG_PENRYN CNTX_INIT_PROTS( penryn ) #endif // -- AMD64 architectures -- #ifdef BLIS_CONFIG_ZEN3 CNTX_INIT_PROTS( zen3 ) #endif #ifdef BLIS_CONFIG_ZEN2 CNTX_INIT_PROTS( zen2 ) #endif #ifdef BLIS_CONFIG_ZEN CNTX_INIT_PROTS( zen ) #endif #ifdef BLIS_CONFIG_EXCAVATOR CNTX_INIT_PROTS( excavator ) #endif #ifdef BLIS_CONFIG_STEAMROLLER CNTX_INIT_PROTS( steamroller ) #endif #ifdef BLIS_CONFIG_PILEDRIVER CNTX_INIT_PROTS( piledriver ) #endif #ifdef BLIS_CONFIG_BULLDOZER CNTX_INIT_PROTS( bulldozer ) #endif // -- ARM architectures -- #ifdef BLIS_CONFIG_ARMSVE CNTX_INIT_PROTS( armsve ) #endif #ifdef BLIS_CONFIG_A64FX CNTX_INIT_PROTS( a64fx ) #endif #ifdef BLIS_CONFIG_FIRESTORM CNTX_INIT_PROTS( firestorm ) #endif #ifdef BLIS_CONFIG_THUNDERX2 CNTX_INIT_PROTS( thunderx2 ) #endif #ifdef BLIS_CONFIG_CORTEXA57 CNTX_INIT_PROTS( cortexa57 ) #endif #ifdef BLIS_CONFIG_CORTEXA53 CNTX_INIT_PROTS( cortexa53 ) #endif #ifdef BLIS_CONFIG_CORTEXA15 CNTX_INIT_PROTS( cortexa15 ) #endif #ifdef BLIS_CONFIG_CORTEXA9 CNTX_INIT_PROTS( cortexa9 ) #endif // -- IBM Power -- #ifdef BLIS_CONFIG_POWER10 CNTX_INIT_PROTS( power10 ) #endif #ifdef BLIS_CONFIG_POWER9 CNTX_INIT_PROTS( power9 ) #endif #ifdef 
BLIS_CONFIG_POWER7 CNTX_INIT_PROTS( power7 ) #endif // -- IBM BG/Q -- #ifdef BLIS_CONFIG_BGQ CNTX_INIT_PROTS( bgq ) #endif // -- Generic -- #ifdef BLIS_CONFIG_GENERIC CNTX_INIT_PROTS( generic ) #endif // // -- Architecture family-specific headers ------------------------------------- // // -- x86_64 families -- #ifdef BLIS_FAMILY_INTEL64 #include "bli_family_intel64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64 #include "bli_family_amd64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64_LEGACY #include "bli_family_amd64_legacy.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64 #include "bli_family_x86_64.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_SKX #include "bli_family_x86_64_no_skx.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN2 #include "bli_family_x86_64_no_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN3 // begin bli_family_x86_64_no_zen3.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_x86_64_no_zen3.h #endif // -- Intel64 architectures -- #ifdef BLIS_FAMILY_SKX // begin bli_family_skx.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- THREADING PARAMETERS ----------------------------------------------------- #define BLIS_THREAD_RATIO_M 3 #define BLIS_THREAD_RATIO_N 2 #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 4 // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 64 #define BLIS_SIMD_MAX_SIZE 64 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 //#include //#define BLIS_MALLOC_POOL malloc //#define BLIS_FREE_POOL free #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- // -- Cache and register blocksizes -- // // Constraints: // // (1) MC must be a multiple of: // (a) MR (for zero-padding purposes) // (b) NR (for zero-padding purposes when MR and NR are "swapped") // (2) NC must be a multiple of // (a) NR (for zero-padding purposes) // (b) MR (for zero-padding purposes when MR and NR are "swapped") // #define 
BLIS_DGEMM_UKERNEL bli_dgemm_opt_16x12_l2 #define BLIS_DEFAULT_MC_D 144 #define BLIS_DEFAULT_KC_D 336 #define BLIS_DEFAULT_NC_D 5760 #define BLIS_DEFAULT_MR_D 16 #define BLIS_DEFAULT_NR_D 12 #define BLIS_PACKDIM_MR_D 16 #define BLIS_PACKDIM_NR_D 12 // NOTE: If the micro-kernel, which is typically unrolled to a factor // of f, handles leftover edge cases (ie: when k % f > 0) then these // register blocksizes in the k dimension can be defined to 1. //#define BLIS_DEFAULT_KR_S 1 //#define BLIS_DEFAULT_KR_D 1 //#define BLIS_DEFAULT_KR_C 1 //#define BLIS_DEFAULT_KR_Z 1 // -- Maximum cache blocksizes (for optimizing edge cases) -- // NOTE: These cache blocksize "extensions" have the same constraints as // the corresponding default blocksizes above. When these values are // larger than the default blocksizes, blocksizes used at edge cases are // enlarged if such an extension would encompass the remaining portion of // the matrix dimension. #define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) #define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) #define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0) #define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) #define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) #define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0) //#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4) //#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4) //#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4) //#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4) //#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4) //#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4) #endif //#endif // end bli_family_skx.h #endif #ifdef BLIS_FAMILY_KNL // begin bli_family_knl.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- THREADING PARAMETERS ----------------------------------------------------- #define BLIS_THREAD_RATIO_M 4 
#define BLIS_THREAD_RATIO_N 1 #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 // -- MEMORY ALLOCATION -------------------------------------------------------- //#define BLIS_TREE_BARRIER //#define BLIS_TREE_BARRIER_ARITY 4 #define BLIS_SIMD_ALIGN_SIZE 64 #define BLIS_SIMD_MAX_SIZE 64 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 //#define BLIS_MALLOC_INTL hbw_malloc //#define BLIS_FREE_INTL hbw_free #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_SGEMM_UKERNEL bli_sgemm_opt_30x16_knc #define BLIS_DEFAULT_MC_S 240 #define BLIS_DEFAULT_KC_S 240 #define BLIS_DEFAULT_NC_S 14400 #define BLIS_DEFAULT_MR_S 30 #define BLIS_DEFAULT_NR_S 16 #define BLIS_PACKDIM_MR_S 32 #define BLIS_PACKDIM_NR_S 16 #if 0 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8_knc #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 240 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DEFAULT_MR_D 30 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 32 #define BLIS_PACKDIM_NR_D 8 #elif 0 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8 #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 240 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DEFAULT_MR_D 30 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 32 #define BLIS_PACKDIM_NR_D 8 #define BLIS_DPACKM_8XK_KERNEL bli_dpackm_8xk_opt #define BLIS_DPACKM_30XK_KERNEL bli_dpackm_30xk_opt #else #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_24x8 #define BLIS_DEFAULT_MR_D 24 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 24 #define BLIS_PACKDIM_NR_D 8 #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 336 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DPACKM_8XK_KERNEL bli_dpackm_8xk_opt #define BLIS_DPACKM_24XK_KERNEL bli_dpackm_24xk_opt #endif #define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) 
#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) #define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0) #define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) #define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) #define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0) #endif //#endif // end bli_family_knl.h #endif #ifdef BLIS_FAMILY_KNC #include "bli_family_knc.h" // skipped #endif #ifdef BLIS_FAMILY_HASWELL // begin bli_family_haswell.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- // -- sgemm micro-kernel -- #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24 #define BLIS_DEFAULT_MC_S 256 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 24 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 6 #define BLIS_DEFAULT_NR_S 16 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 6 #endif // -- dgemm micro-kernel -- #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12 #define BLIS_DEFAULT_MC_D 152 #define BLIS_DEFAULT_KC_D 160 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 12 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 
// NOTE(review): the first three definitions finish a disabled ("#if 0")
// dgemm alternative that was opened above this point.
#define BLIS_DEFAULT_NC_D 4080
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 6
#endif

// -- cgemm micro-kernel --

#if 1
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8
#define BLIS_DEFAULT_MC_C 144
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4080
#define BLIS_DEFAULT_MR_C 3
#define BLIS_DEFAULT_NR_C 8
#define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif

#if 0
#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x3
#define BLIS_DEFAULT_MC_C 144
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4080
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 3
#endif

// -- zgemm micro-kernel --

#if 1
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4
#define BLIS_DEFAULT_MC_Z 72
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 4080
#define BLIS_DEFAULT_MR_Z 3
#define BLIS_DEFAULT_NR_Z 4
#define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS
#endif

#if 0
#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x3
#define BLIS_DEFAULT_MC_Z 72
#define BLIS_DEFAULT_KC_Z 256
#define BLIS_DEFAULT_NC_Z 4080
#define BLIS_DEFAULT_MR_Z 4
#define BLIS_DEFAULT_NR_Z 3
#endif

#endif

//#endif
// end bli_family_haswell.h
#endif

#ifdef BLIS_FAMILY_SANDYBRIDGE
// begin bli_family_sandybridge.h

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H

// The entire Sandy Bridge micro-kernel table is disabled with "#if 0", so
// this family section defines nothing when selected.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS ---------------------------

#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8
#define BLIS_DEFAULT_MC_S 128
#define BLIS_DEFAULT_KC_S 384
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 8

#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x4
#define BLIS_DEFAULT_MC_D 96
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 4096
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 4

#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4
#define BLIS_DEFAULT_MC_C 96
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4

#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 192
#define BLIS_DEFAULT_NC_Z 4096
#define BLIS_DEFAULT_MR_Z 4
#define BLIS_DEFAULT_NR_Z 4
#endif

//#endif
// end bli_family_sandybridge.h
#endif

#ifdef BLIS_FAMILY_PENRYN
// begin bli_family_penryn.h

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H

// -- MEMORY ALLOCATION --------------------------------------------------------

// The only live setting in the Penryn section: SIMD alignment of 16.
#define BLIS_SIMD_ALIGN_SIZE 16

// Kernel selections below are disabled with "#if 0".
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------

#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x4
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 4
#define BLIS_DEFAULT_MC_S 768
#define BLIS_DEFAULT_KC_S 384
#define BLIS_DEFAULT_NC_S 4096

#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x4
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 4
#define BLIS_DEFAULT_MC_D 384
#define BLIS_DEFAULT_KC_D 384
#define BLIS_DEFAULT_NC_D 4096

#define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_asm_4x4
#define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_asm_4x4

// -- LEVEL-1F KERNEL DEFINITIONS ----------------------------------------------

#define BLIS_DAXPY2V_KERNEL bli_daxpy2v_int_var1
#define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_int_var1
#define BLIS_DAXPYF_KERNEL bli_daxpyf_int_var1
#define BLIS_DDOTXF_KERNEL bli_ddotxf_int_var1
#define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_int_var1

// -- LEVEL-1V KERNEL DEFINITIONS ----------------------------------------------

#define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1
#define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1

#endif

//#endif
// end bli_family_penryn.h
#endif

// -- AMD64 architectures --

#ifdef BLIS_FAMILY_ZEN3
#include "bli_family_zen3.h" // skipped
#endif

#ifdef BLIS_FAMILY_ZEN2
// begin bli_family_zen2.h

// By default, it is effective to parallelize the outer loops.
// Setting these macros to 1 will force JR and IR inner loops
// to be not parallelized.
#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 1

// Vanilla BLIS disables AMD's small matrix handling by default.
#if 0
// Nothing inside this "#if 0" group is compiled; the small-matrix switches
// and thresholds below are therefore inactive as written.
#define BLIS_ENABLE_SMALL_MATRIX
#define BLIS_ENABLE_SMALL_MATRIX_TRSM

// This will select the threshold below which small matrix code will be called.
#define BLIS_SMALL_MATRIX_THRES 700
#define BLIS_SMALL_M_RECT_MATRIX_THRES 160
#define BLIS_SMALL_K_RECT_MATRIX_THRES 128

#define BLIS_SMALL_MATRIX_THRES_TRSM 32768 // 128*(128+128), i.e. m*(m+n)
#define BLIS_SMALL_MATRIX_A_THRES_TRSM 128
#define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96
#define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128

#define BLIS_ENABLE_SMALL_MATRIX_ROME
#define BLIS_SMALL_MATRIX_THRES_ROME 400

// Per-variant TRSM thresholds. NOTE(review): the ALXB/XAUB/XALTB/XALB/XAUTB
// suffixes presumably name the TRSM side/uplo/transpose case; confirm against
// the code that reads these macros.
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10

#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130

#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100

#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30

#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120
#define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50

// When running HPL with pure MPI without DGEMM threading (Single-threaded
// BLIS), defining this macro as 1 yields better performance.
#define AOCL_BLIS_MULTIINSTANCE 0

#endif

// end bli_family_zen2.h
#endif

#ifdef BLIS_FAMILY_ZEN
// begin bli_family_zen.h

// By default, it is effective to parallelize the outer loops.
// Setting these macros to 1 will force JR and IR inner loops // to be not paralleized. #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 #define BLIS_ENABLE_ZEN_BLOCK_SIZES // Vanilla BLIS disables AMD's small matrix handling by default. #if 0 #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. #define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 #define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 #define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 //This macro will enable BLIS DGEMM to choose block sizes for a single instance mode #define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 0 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES 250 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_NAPLES 90 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22 #endif #if 0 // Allow the sup implementation to combine some small edge case iterations in // the 2nd loop of the panel-block algorithm (MR) and/or the 2nd loop of the // block-panel algorithm (NR) with the last full iteration that precedes it. // NOTE: These cpp macros need to be explicitly set to an integer since they // are used at compile-time to create unconditional branches or dead code // regions. 
#define BLIS_ENABLE_SUP_MR_EXT 1
#define BLIS_ENABLE_SUP_NR_EXT 0
#endif

// end bli_family_zen.h
#endif

// In the AMD family sections that follow, the only live setting is
// BLIS_SIMD_ALIGN_SIZE (where present); every micro-kernel table is wrapped
// in "#if 0" and is not compiled.
#ifdef BLIS_FAMILY_EXCAVATOR
// begin bli_family_excavator.h

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H

// -- MEMORY ALLOCATION --------------------------------------------------------

#define BLIS_SIMD_ALIGN_SIZE 16

#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------

#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3
#define BLIS_DEFAULT_MR_S 16
#define BLIS_DEFAULT_NR_S 3
#define BLIS_DEFAULT_MC_S 528
#define BLIS_DEFAULT_KC_S 256
#define BLIS_DEFAULT_NC_S 8400

#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 3
#define BLIS_DEFAULT_MC_D 264
#define BLIS_DEFAULT_KC_D 256
#define BLIS_DEFAULT_NC_D 8400

#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2
#define BLIS_DEFAULT_MR_C 4
#define BLIS_DEFAULT_NR_C 2
#define BLIS_DEFAULT_MC_C 264
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 8400

#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2
#define BLIS_DEFAULT_MR_Z 2
#define BLIS_DEFAULT_NR_Z 2
#define BLIS_DEFAULT_MC_Z 100
#define BLIS_DEFAULT_KC_Z 320
#define BLIS_DEFAULT_NC_Z 8400
#endif

//#endif
// end bli_family_excavator.h
#endif

#ifdef BLIS_FAMILY_STEAMROLLER
// begin bli_family_steamroller.h

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H

// -- MEMORY ALLOCATION --------------------------------------------------------

#define BLIS_SIMD_ALIGN_SIZE 16

//#endif
// end bli_family_steamroller.h
#endif

#ifdef BLIS_FAMILY_PILEDRIVER
// begin bli_family_piledriver.h

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H

// -- MEMORY ALLOCATION --------------------------------------------------------

#define BLIS_SIMD_ALIGN_SIZE 16

#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------

#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3
#define BLIS_DEFAULT_MC_S 2016
#define BLIS_DEFAULT_KC_S 128
#define BLIS_DEFAULT_NC_S 8400
#define BLIS_DEFAULT_MR_S 16
#define BLIS_DEFAULT_NR_S 3

#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3
#define BLIS_DEFAULT_MC_D 1008
#define BLIS_DEFAULT_KC_D 128
#define BLIS_DEFAULT_NC_D 8400
#define BLIS_DEFAULT_MR_D 8
#define BLIS_DEFAULT_NR_D 3

#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2
#define BLIS_DEFAULT_MC_C 512
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 8400
#define BLIS_DEFAULT_MR_C 4
#define BLIS_DEFAULT_NR_C 2

#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2
#define BLIS_DEFAULT_MC_Z 400
#define BLIS_DEFAULT_KC_Z 160
#define BLIS_DEFAULT_NC_Z 8400
#define BLIS_DEFAULT_MR_Z 2
#define BLIS_DEFAULT_NR_Z 2
#endif

//#endif
// end bli_family_piledriver.h
#endif

#ifdef BLIS_FAMILY_BULLDOZER
// begin bli_family_bulldozer.h

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H

#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------

#define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8_fma4
#define BLIS_DEFAULT_MC_S 128
#define BLIS_DEFAULT_KC_S 384
#define BLIS_DEFAULT_NC_S 4096
#define BLIS_DEFAULT_MR_S 8
#define BLIS_DEFAULT_NR_S 8

#define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x6_fma4
#define BLIS_DEFAULT_MC_D 1080
#define BLIS_DEFAULT_KC_D 120
#define BLIS_DEFAULT_NC_D 8400
#define BLIS_DEFAULT_MR_D 4
#define BLIS_DEFAULT_NR_D 6

#define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4_fma4
#define BLIS_DEFAULT_MC_C 96
#define BLIS_DEFAULT_KC_C 256
#define BLIS_DEFAULT_NC_C 4096
#define BLIS_DEFAULT_MR_C 8
#define BLIS_DEFAULT_NR_C 4

#define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4_fma4
#define BLIS_DEFAULT_MC_Z 64
#define BLIS_DEFAULT_KC_Z 192
#define BLIS_DEFAULT_NC_Z 4096
#define BLIS_DEFAULT_MR_Z 4
#define BLIS_DEFAULT_NR_Z 4
#endif

//#endif
// end bli_family_bulldozer.h
#endif

// Family headers marked "// skipped" below are referenced by #include rather
// than pasted inline (presumably by the tool that flattened this header;
// confirm). Selecting one of these families requires that header on the
// include path.

// -- ARM families --

#ifdef BLIS_FAMILY_ARM64
#include "bli_family_arm64.h" // skipped
#endif
#ifdef BLIS_FAMILY_ARM32
#include "bli_family_arm32.h" // skipped
#endif

// -- ARM architectures --

#ifdef BLIS_FAMILY_ARMSVE
#include "bli_family_armsve.h" // skipped
#endif
#ifdef BLIS_FAMILY_A64FX
#include "bli_family_a64fx.h" // skipped
#endif
#ifdef BLIS_FAMILY_FIRESTORM
#include "bli_family_firestorm.h" // skipped
#endif
#ifdef BLIS_FAMILY_THUNDERX2
#include "bli_family_thunderx2.h" // skipped
#endif
#ifdef BLIS_FAMILY_CORTEXA57
#include "bli_family_cortexa57.h" // skipped
#endif
#ifdef BLIS_FAMILY_CORTEXA53
#include "bli_family_cortexa53.h" // skipped
#endif
#ifdef BLIS_FAMILY_CORTEXA15
#include "bli_family_cortexa15.h" // skipped
#endif
#ifdef BLIS_FAMILY_CORTEXA9
#include "bli_family_cortexa9.h" // skipped
#endif

// -- IBM Power --

#ifdef BLIS_FAMILY_POWER10
#include "bli_family_power10.h" // skipped
#endif
#ifdef BLIS_FAMILY_POWER9
#include "bli_family_power9.h" // skipped
#endif
#ifdef BLIS_FAMILY_POWER7
#include "bli_family_power7.h" // skipped
#endif

// -- IBM BG/Q --

#ifdef BLIS_FAMILY_BGQ
#include "bli_family_bgq.h" // skipped
#endif

// -- Generic --

// The generic family section contains only commented-out guard lines, so it
// defines nothing.
#ifdef BLIS_FAMILY_GENERIC
// begin bli_family_generic.h

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H

//#endif
// end bli_family_generic.h
#endif

//
// -- kernel set prototypes ----------------------------------------------------
//

// Each *_PROT( ctype, ch, name ) line below is a macro invocation; the *_PROT
// macros are defined outside this span. A kernel set is only declared when
// its BLIS_KERNELS_* macro is defined.

// -- Intel64 architectures --
#ifdef BLIS_KERNELS_SKX
// begin bli_kernels_skx.h

GEMM_UKR_PROT( float , s, gemm_skx_asm_32x12_l2 )
GEMM_UKR_PROT( float , s, gemm_skx_asm_12x32_l2 )

GEMM_UKR_PROT( double, d, gemm_skx_asm_16x12_l2 )
GEMM_UKR_PROT( double, d, gemm_skx_asm_16x14 )

// end bli_kernels_skx.h
#endif

#ifdef BLIS_KERNELS_KNL
// begin bli_kernels_knl.h

GEMM_UKR_PROT( float, s, gemm_knl_asm_24x16 )
GEMM_UKR_PROT( double, d, gemm_knl_asm_24x8 )

PACKM_KER_PROT( float, s, packm_knl_asm_24xk )
PACKM_KER_PROT( float, s, packm_knl_asm_16xk )

PACKM_KER_PROT( double, d, packm_knl_asm_24xk )
PACKM_KER_PROT( double, d, packm_knl_asm_8xk )

// unused:
// (the declarations under this label are still live, not commented out)
GEMM_UKR_PROT( double, d, gemm_knl_asm_12x16 )
GEMM_UKR_PROT( double, d, gemm_knl_asm_30x8 )
GEMM_UKR_PROT( double, d, gemm_knl_asm_8x24 )

PACKM_KER_PROT( double, d, packm_knl_asm_30xk )

// end bli_kernels_knl.h
#endif

#ifdef BLIS_KERNELS_KNC
#include "bli_kernels_knc.h" // skipped
#endif

#ifdef BLIS_KERNELS_HASWELL
// begin bli_kernels_haswell.h

// -- level-1m -----------------------------------------------------------------

// packm (asm)
PACKM_KER_PROT( float, s, packm_haswell_asm_6xk )
PACKM_KER_PROT( float, s, packm_haswell_asm_16xk )

PACKM_KER_PROT( double, d, packm_haswell_asm_6xk )
PACKM_KER_PROT( double, d, packm_haswell_asm_8xk )

PACKM_KER_PROT( scomplex, c, packm_haswell_asm_3xk )
PACKM_KER_PROT( scomplex, c, packm_haswell_asm_8xk )

PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_3xk )
PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_4xk )

// -- level-3 ------------------------------------------------------------------

// gemm (asm d6x8)
GEMM_UKR_PROT( float, s, gemm_haswell_asm_6x16 )
GEMM_UKR_PROT( double, d, gemm_haswell_asm_6x8 )
GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_3x8 )
GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_3x4 )

// gemm (asm d8x6)
GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 )
GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 )
GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 )
GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 )

// gemmtrsm_l (asm d6x8)
GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_haswell_asm_6x16 )
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_haswell_asm_6x8 )

// gemmtrsm_u (asm d6x8)
GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 )
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 )

// gemm (asm d8x6)
// (commented-out repeat of the d8x6 declarations already made above)
//GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 )
//GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 )
//GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 )
//GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 )

// -- level-3 sup --------------------------------------------------------------

// -- single real --

// gemmsup_r

GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_6x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_5x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_4x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_3x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_2x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_1x1 )

// gemmsup_rv

GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16 )

GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x12 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x12 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x12 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x12 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x12 )

GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x8 )

GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x6 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x6 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x6 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x6 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x6 )

GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x4 )

GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x2 )
// NOTE(review): this span continues the single-precision (float, s) Haswell
// gemmsup declarations; the GEMMSUP_KER_PROT macro itself is defined outside
// this span. Kernel names end in <rows>x<cols>; an "m" or "n" suffix marks the
// "mkernel in m dim" / "mkernel in n dim" variants named in the section labels.
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x2 )

// gemmsup_rv (mkernel in m dim)

GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2m )

// gemmsup_rv (mkernel in n dim)

GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16n )

// gemmsup_rd

GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16 )

GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x12 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x12 )

GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x8 )

GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x4 )

GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x2 )

GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x1 )
// NOTE(review): this span continues the Haswell gemmsup_rd (float)
// declarations; the *_PROT macros used throughout are defined outside this
// span.
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x1 )

// gemmsup_rd (mkernel in m dim)

GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16m )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12m )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8m )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4m )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2m )

// gemmsup_rd (mkernel in n dim)

GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16n )

// -- double real --

// gemmsup_r

GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_6x1 )
GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_5x1 )
GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_4x1 )
GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_3x1 )
GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_2x1 )
GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_1x1 )

// gemmsup_rv

GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8 )

GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x6 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x6 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x6 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x6 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x6 )

GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x4 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x4 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x4 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x4 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x4 )

GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x2 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x2 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x2 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x2 )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x2 )

// gemmsup_rv (mkernel in m dim)

GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8m )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6m )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4m )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2m )

// gemmsup_rv (mkernel in n dim)

GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8n )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8n )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8n )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8n )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8n )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8n )

// gemmsup_rd

GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8 )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8 )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8 )

GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4 )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x4 )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x4 )

GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2 )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x2 )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x2 )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x2 )

GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x1 )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x1 )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x1 )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x1 )

// gemmsup_rd (mkernel in m dim)

GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8m )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4m )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2m )

// gemmsup_rd (mkernel in n dim)

GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8n )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x8n )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8n )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8n )

// end bli_kernels_haswell.h
#endif

#ifdef BLIS_KERNELS_SANDYBRIDGE
// begin bli_kernels_sandybridge.h

// d8x4 (assembly)
GEMM_UKR_PROT( float, s, gemm_sandybridge_asm_8x8 )
GEMM_UKR_PROT( double, d, gemm_sandybridge_asm_8x4 )
GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_asm_8x4 )
GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_asm_4x4 )

// d8x4 (intrinsics)
GEMM_UKR_PROT( float, s, gemm_sandybridge_int_8x8 )
GEMM_UKR_PROT( double, d, gemm_sandybridge_int_8x4 )
GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_int_8x4 )
GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_int_4x4 )

// end bli_kernels_sandybridge.h
#endif

#ifdef BLIS_KERNELS_PENRYN
// begin bli_kernels_penryn.h

GEMM_UKR_PROT( float, s, gemm_penryn_asm_8x4 )
GEMM_UKR_PROT( double, d, gemm_penryn_asm_4x4 )

GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_penryn_asm_4x4 )
GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_penryn_asm_4x4 )

TRSM_UKR_PROT( double, d, trsm_l_penryn_asm_4x4 )
TRSM_UKR_PROT( double, d, trsm_u_penryn_asm_4x4 )

// end bli_kernels_penryn.h
#endif

// -- AMD64 architectures --

#ifdef BLIS_KERNELS_ZEN2
// begin bli_kernels_zen2.h

// -- level-1f --
// NOTE: the same two axpyf_zen_int_5 declarations also appear in the
// BLIS_KERNELS_ZEN section below.
AXPYF_KER_PROT( float, s, axpyf_zen_int_5 )
AXPYF_KER_PROT( double, d, axpyf_zen_int_5 )

// end bli_kernels_zen2.h
#endif

#ifdef BLIS_KERNELS_ZEN
// begin bli_kernels_zen.h

// -- level-1m --
PACKM_KER_PROT(double, d, packm_8xk_gen_zen)
PACKM_KER_PROT(double, d, packm_6xk_gen_zen)
PACKM_KER_PROT(double, d, packm_8xk_nn_zen)
PACKM_KER_PROT(double, d, packm_6xk_nn_zen)

// -- level-1v --

// amaxv (intrinsics)
AMAXV_KER_PROT( float, s, amaxv_zen_int )
AMAXV_KER_PROT( double, d, amaxv_zen_int )

// axpyv (intrinsics)
AXPYV_KER_PROT( float, s, axpyv_zen_int )
AXPYV_KER_PROT( double, d, axpyv_zen_int )

// axpyv (intrinsics unrolled x10)
AXPYV_KER_PROT( float, s, axpyv_zen_int10 )
AXPYV_KER_PROT( double, d, axpyv_zen_int10 )

// dotv (intrinsics)
DOTV_KER_PROT( float, s, dotv_zen_int )
DOTV_KER_PROT( double, d, dotv_zen_int )

// dotv (intrinsics, unrolled x10)
DOTV_KER_PROT( float, s, dotv_zen_int10 )
DOTV_KER_PROT( double, d, dotv_zen_int10 )

// dotxv (intrinsics)
DOTXV_KER_PROT( float, s, dotxv_zen_int )
DOTXV_KER_PROT( double, d, dotxv_zen_int )

// scalv (intrinsics)
SCALV_KER_PROT( float, s, scalv_zen_int )
SCALV_KER_PROT( double, d, scalv_zen_int )

// scalv (intrinsics unrolled x10)
SCALV_KER_PROT( float, s, scalv_zen_int10 )
SCALV_KER_PROT( double, d, scalv_zen_int10 )
SCALV_KER_PROT( scomplex, c, scalv_zen_int10 )

// swapv (intrinsics)
SWAPV_KER_PROT(float, s, swapv_zen_int8 )
SWAPV_KER_PROT(double, d, swapv_zen_int8 )

// copyv (intrinsics)
COPYV_KER_PROT( float, s, copyv_zen_int )
COPYV_KER_PROT( double, d, copyv_zen_int )

// setv (intrinsics)
SETV_KER_PROT(float, s, setv_zen_int)
SETV_KER_PROT(double, d, setv_zen_int)

// swapv (intrinsics)
// NOTE(review): these two lines repeat the swapv_zen_int8 declarations made
// a few lines above.
SWAPV_KER_PROT(float, s, swapv_zen_int8 )
SWAPV_KER_PROT(double, d, swapv_zen_int8 )

// -- level-1f --

// axpyf (intrinsics)
AXPYF_KER_PROT( float, s, axpyf_zen_int_8 )
AXPYF_KER_PROT( double, d, axpyf_zen_int_8 )

AXPYF_KER_PROT( float, s, axpyf_zen_int_5 )
AXPYF_KER_PROT( double, d, axpyf_zen_int_5 )

AXPYF_KER_PROT( double, d, axpyf_zen_int_16x4 )
AXPYF_KER_PROT( scomplex, c, axpyf_zen_int_4 )

// dotxf (intrinsics)
DOTXF_KER_PROT( float, s, dotxf_zen_int_8 )
DOTXF_KER_PROT( double, d, dotxf_zen_int_8 )

// -- level-3 sup --------------------------------------------------------------
// gemmsup_rv

// The 6x16 variant is commented out here; the 6x16m and 6x16n variants are
// declared further down.
//GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16 )

GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x8 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x8 )

GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x4 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x4 )

GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x2 )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x2 )

GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_6x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_5x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_4x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_3x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_2x1 )
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_1x1 )

// gemmsup_rv (mkernel in m dim)
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m )

// gemmsup_rv (mkernel in n dim)
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16n )
GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16n )

// gemmsup_rd
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x8)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x8)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x4)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x4)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x2)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x2)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x2)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16m)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x8m)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4m)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2m)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16n)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x16n)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16n)
GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16n)

// Complex gemmsup_rv kernels: scomplex (c), then dcomplex (z).
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8m )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4m )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2m )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x4 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x4 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x2 )
GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x2 )

GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4m )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2m )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4 )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4 )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x2 )
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x2 )
// gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x1 ) // end bli_kernels_zen.h #endif //#ifdef BLIS_KERNELS_EXCAVATOR //#include "bli_kernels_excavator.h" //#endif //#ifdef BLIS_KERNELS_STEAMROLLER //#include "bli_kernels_steamroller.h" //#endif #ifdef BLIS_KERNELS_PILEDRIVER // begin bli_kernels_piledriver.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_piledriver_asm_16x3 ) GEMM_UKR_PROT( double, d, gemm_piledriver_asm_8x3 ) GEMM_UKR_PROT( scomplex, c, gemm_piledriver_asm_4x2 ) GEMM_UKR_PROT( dcomplex, z, gemm_piledriver_asm_2x2 ) // end bli_kernels_piledriver.h #endif #ifdef BLIS_KERNELS_BULLDOZER // begin bli_kernels_bulldozer.h GEMM_UKR_PROT( float, s, gemm_bulldozer_asm_8x8_fma4 ) GEMM_UKR_PROT( double, d, gemm_bulldozer_asm_4x6_fma4 ) GEMM_UKR_PROT( scomplex, c, gemm_bulldozer_asm_8x4_fma4 ) GEMM_UKR_PROT( dcomplex, z, gemm_bulldozer_asm_4x4_fma4 ) // end bli_kernels_bulldozer.h #endif // -- ARM architectures -- #ifdef BLIS_KERNELS_ARMSVE #include "bli_kernels_armsve.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV8A #include "bli_kernels_armv8a.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV7A #include "bli_kernels_armv7a.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_KERNELS_POWER10 #include "bli_kernels_power10.h" // skipped #endif #ifdef BLIS_KERNELS_POWER9 #include "bli_kernels_power9.h" // skipped #endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef 
BLIS_KERNELS_BGQ #include "bli_kernels_bgq.h" // skipped #endif #endif // end bli_arch_config.h // begin bli_kernel_macro_defs.h #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H // -- Define default threading parameters -------------------------------------- // -- Conventional (large code path) values -- // These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n // dimensions for the purposes of factorizing the total number of threads into // ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these // macros are used. #ifndef BLIS_THREAD_RATIO_M #define BLIS_THREAD_RATIO_M 1 #endif #ifndef BLIS_THREAD_RATIO_N #define BLIS_THREAD_RATIO_N 1 #endif // These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of // parallelism allowed when performing automatic factorization. See bli_rntm.c // to see how these macros are used. #ifndef BLIS_THREAD_MAX_IR #define BLIS_THREAD_MAX_IR 1 #endif #ifndef BLIS_THREAD_MAX_JR #define BLIS_THREAD_MAX_JR 4 #endif #if 0 // -- Skinny/small possibly-unpacked (sup code path) values -- #ifndef BLIS_THREAD_SUP_RATIO_M #define BLIS_THREAD_SUP_RATIO_M 1 #endif #ifndef BLIS_THREAD_SUP_RATIO_N #define BLIS_THREAD_SUP_RATIO_N 2 #endif #ifndef BLIS_THREAD_SUP_MAX_IR #define BLIS_THREAD_SUP_MAX_IR 1 #endif #ifndef BLIS_THREAD_SUP_MAX_JR #define BLIS_THREAD_SUP_MAX_JR 8 #endif #endif // -- Memory allocation -------------------------------------------------------- // hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with // libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND // was explicitly defined. #ifdef BLIS_DISABLE_MEMKIND #undef BLIS_ENABLE_MEMKIND #endif #ifdef BLIS_ENABLE_MEMKIND #include // skipped #endif // Memory allocation functions. These macros define the three types of // malloc()-style functions, and their free() counterparts: one for each // type of memory to be allocated. 
// NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING // THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc() // and free(): // // void* malloc( size_t size ); // void free( void* p ); // // This allocation function is called to allocate memory for blocks within // BLIS's internal memory pools. #ifndef BLIS_MALLOC_POOL // If use of libmemkind was enabled at configure-time, the default // memory allocation function for memory pools should be hbw_malloc() // instead of malloc(). #ifdef BLIS_ENABLE_MEMKIND #define BLIS_MALLOC_POOL hbw_malloc #else #define BLIS_MALLOC_POOL malloc #endif #endif #ifndef BLIS_FREE_POOL // If use of libmemkind was enabled at configure-time, the default // memory deallocation function for memory pools should be hbw_free() // instead of free(). #ifdef BLIS_ENABLE_MEMKIND #define BLIS_FREE_POOL hbw_free #else #define BLIS_FREE_POOL free #endif #endif // This allocation function is called to allocate memory for internally- // used objects and structures, such as control tree nodes. #ifndef BLIS_MALLOC_INTL #define BLIS_MALLOC_INTL malloc #endif #ifndef BLIS_FREE_INTL #define BLIS_FREE_INTL free #endif // This allocation function is called to allocate memory for objects // created by user-level API functions, such as bli_obj_create(). #ifndef BLIS_MALLOC_USER #define BLIS_MALLOC_USER malloc #endif #ifndef BLIS_FREE_USER #define BLIS_FREE_USER free #endif // -- Other system-related definitions ----------------------------------------- // Size of a virtual memory page. This is used to align blocks within the // memory pools. #ifndef BLIS_PAGE_SIZE #define BLIS_PAGE_SIZE 4096 #endif // The maximum number of named SIMD vector registers available for use. // When configuring with umbrella configuration families, this should be // set to the maximum number of registers across all sub-configurations in // the family. 
#ifndef BLIS_SIMD_MAX_NUM_REGISTERS
#define BLIS_SIMD_MAX_NUM_REGISTERS 32
#endif

// The maximum size (in bytes) of each SIMD vector.
// When configuring with umbrella configuration families, this should be
// set to the maximum SIMD size across all sub-configurations in the family.
#ifndef BLIS_SIMD_MAX_SIZE
#define BLIS_SIMD_MAX_SIZE 64
#endif

// Alignment size (in bytes) needed by the instruction set for aligned
// SIMD/vector instructions.
// Defaults to the maximum SIMD vector size defined above.
#ifndef BLIS_SIMD_ALIGN_SIZE
#define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_MAX_SIZE
#endif

// The maximum size in bytes of local stack buffers within macro-kernel
// functions. These buffers are usually used to store a temporary copy
// of a single microtile. The reason we multiply by 2 is to handle induced
// methods, where we use real domain register blocksizes in units of
// complex elements. Specifically, the macro-kernels will need this larger
// micro-tile footprint, even though the virtual micro-kernels will only
// ever be writing to half (real or imaginary part) at a time.
// With the defaults in this section (32 registers of 64 bytes each) the
// expression evaluates to 32 * 64 * 2 = 4096 bytes.
#ifndef BLIS_STACK_BUF_MAX_SIZE
#define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_MAX_NUM_REGISTERS * \
                                  BLIS_SIMD_MAX_SIZE * 2 )
#endif

// Alignment size used to align local stack buffers within macro-kernel
// functions.
#ifndef BLIS_STACK_BUF_ALIGN_SIZE
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when allocating memory via BLIS_MALLOC_USER.
// To disable heap alignment, set this to 1.
#ifndef BLIS_HEAP_ADDR_ALIGN_SIZE
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when sizing leading dimensions of memory allocated
// via BLIS_MALLOC_USER.
#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment sizes used when allocating blocks to the internal memory
// pool, via BLIS_MALLOC_POOL.
// Each pool alignment below defaults to BLIS_PAGE_SIZE unless it was already
// defined before this point.
// NOTE(review): the _A/_B/_C/_GEN suffixes presumably name the pool's buffer
// kind (A, B, C, general) -- confirm against the memory-pool code.
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A
#define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B
#define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C
#define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN
#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE
#endif

// Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*.
// All four offsets default to 0.
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A
#define BLIS_POOL_ADDR_OFFSET_SIZE_A 0
#endif

#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B
#define BLIS_POOL_ADDR_OFFSET_SIZE_B 0
#endif

#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C
#define BLIS_POOL_ADDR_OFFSET_SIZE_C 0
#endif

#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN
#define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0
#endif

#endif // closes the BLIS_KERNEL_MACRO_DEFS_H include guard
// end bli_kernel_macro_defs.h

// -- Base operation prototypes --

// begin bli_init.h

// Library initialization/finalization entry points. Only bli_init() and
// bli_finalize() carry the BLIS_EXPORT_BLIS annotation.
// NOTE(review): BLIS_EXPORT_BLIS is defined outside this section; presumably
// it marks symbols exported from the shared library -- confirm.
BLIS_EXPORT_BLIS void bli_init( void );
BLIS_EXPORT_BLIS void bli_finalize( void );

void bli_init_auto( void );
void bli_finalize_auto( void );

void bli_init_apis( void );
void bli_finalize_apis( void );

void bli_init_once( void );
void bli_finalize_once( void );
// end bli_init.h
// begin bli_malloc.h

// Typedef function pointer types for malloc() and free() substitutes.
// NOTE(review): these typedefs are commented out here, yet malloc_ft and
// free_ft are used by the prototypes below -- presumably they are defined
// elsewhere (e.g. with the other type definitions); confirm.
//typedef void* (*malloc_ft) ( size_t size );
//typedef void (*free_ft) ( void* p );

// -----------------------------------------------------------------------------

// The pool allocator prototypes are compiled out.
#if 0
BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size );
BLIS_EXPORT_BLIS void bli_free_pool( void* p );
#endif

// Allocation of internally-used objects; the allocating variants take an
// err_t* out-parameter (r_val).
void* bli_malloc_intl( size_t size, err_t* r_val );
void* bli_calloc_intl( size_t size, err_t* r_val );
void bli_free_intl( void* p );

// Allocation on behalf of user-level API functions.
BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val );
BLIS_EXPORT_BLIS void bli_free_user( void* p );

// -----------------------------------------------------------------------------

// Generic allocators parameterized by the malloc()/free() substitute to call
// (f), in aligned and non-aligned flavors, plus their checking helpers.
void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val );
void bli_ffree_align( free_ft f, void* p );

void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val );
void bli_ffree_noalign( free_ft f, void* p );

void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size );
void bli_fmalloc_post_check( void* p );
// end bli_malloc.h
// begin bli_const.h

void bli_const_init( void );
void bli_const_finalize( void );
// end bli_const.h
// begin bli_obj.h

// begin bli_obj_check.h

// Argument-checking counterparts of the bli_obj_* / bli_dt_* functions
// declared further below (same names with a _check suffix).
void bli_obj_create_check( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj );
void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj );
void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj );
void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj );
void bli_obj_create_scalar_check( num_t dt, obj_t* obj );
void bli_obj_free_check( obj_t* obj );
void bli_obj_create_const_check( double value, obj_t* obj );
void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b );
void bli_dt_size_check( num_t dt );
void bli_dt_string_check( num_t dt );
void bli_dt_union_check( num_t dt1, num_t dt2 );
void bli_obj_print_check( char* label, obj_t* obj );
// end bli_obj_check.h

// Object creation, buffer management and destruction.
// NOTE(review): rs/cs/is are presumably the row, column and imaginary
// strides -- confirm against the implementation.
BLIS_EXPORT_BLIS void bli_obj_create ( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer ( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_create_without_buffer ( num_t dt, dim_t m, dim_t n, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_alloc_buffer ( inc_t rs, inc_t cs, inc_t is, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_attach_buffer ( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_create_1x1 ( num_t dt, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer ( num_t dt, void* p, obj_t* obj );
BLIS_EXPORT_BLIS void bli_obj_create_conf_to ( obj_t* s, obj_t* d );
BLIS_EXPORT_BLIS void bli_obj_free ( obj_t* obj );

// Not exported: takes the strides by pointer (rs, cs, is), so it may update
// them in place.
void bli_adjust_strides ( dim_t m, dim_t n, siz_t elem_size, inc_t* rs, inc_t* cs, inc_t* is );

// Datatype queries and alignment helpers.
BLIS_EXPORT_BLIS siz_t bli_dt_size ( num_t dt );
BLIS_EXPORT_BLIS char* bli_dt_string ( num_t dt );
BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult ( dim_t dim, dim_t dim_mult );
BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size ( dim_t dim, siz_t elem_size, siz_t align_size );
BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size ( void* p, size_t align_size );

BLIS_EXPORT_BLIS void bli_obj_print ( char* label, obj_t* obj );
// end bli_obj.h
// begin bli_obj_scalar.h

// Operations on the scalar associated with an object.
BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached ( num_t dt, obj_t* beta );
BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of ( num_t dt, conj_t conj, obj_t* alpha, obj_t* beta );
BLIS_EXPORT_BLIS void bli_obj_scalar_detach ( obj_t* a, obj_t* alpha );
BLIS_EXPORT_BLIS void bli_obj_scalar_attach ( conj_t conj, obj_t* alpha, obj_t* a );
BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to ( num_t dt, obj_t* a );
BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar ( obj_t* alpha, obj_t* a );
BLIS_EXPORT_BLIS void bli_obj_scalar_reset ( obj_t* a );
BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag ( obj_t* a );
BLIS_EXPORT_BLIS bool bli_obj_scalar_equals ( obj_t* a, obj_t* beta );
// end bli_obj_scalar.h
// begin bli_blksz.h

//
blksz_t query BLIS_INLINE dim_t bli_blksz_get_def ( num_t dt, blksz_t* b ) { return b->v[ dt ]; } BLIS_INLINE dim_t bli_blksz_get_max ( num_t dt, blksz_t* b ) { return b->e[ dt ]; } // blksz_t modification BLIS_INLINE void bli_blksz_set_def ( dim_t val, num_t dt, blksz_t* b ) { b->v[ dt ] = val; } BLIS_INLINE void bli_blksz_set_max ( dim_t val, num_t dt, blksz_t* b ) { b->e[ dt ] = val; } BLIS_INLINE void bli_blksz_copy ( blksz_t* b_src, blksz_t* b_dst ) { *b_dst = *b_src; } BLIS_INLINE void bli_blksz_copy_if_pos ( blksz_t* b_src, blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that // we can skip the ones that are non-positive. const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); } BLIS_INLINE void bli_blksz_copy_def_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); bli_blksz_set_def( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_max_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); 
bli_blksz_set_max( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_scale_def ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_def( dt, b ); bli_blksz_set_def( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_max( dt, b ); bli_blksz_set_max( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_def_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { bli_blksz_scale_def( num, den, dt, b ); bli_blksz_scale_max( num, den, dt, b ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS blksz_t* bli_blksz_create ( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_easy ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z ); BLIS_EXPORT_BLIS void bli_blksz_free ( blksz_t* b ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); #endif void bli_blksz_reduce_def_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); void bli_blksz_reduce_max_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); // 
----------------------------------------------------------------------------- dim_t bli_determine_blocksize ( dir_t direct, dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_b ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); dim_t bli_determine_blocksize_b_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); // end bli_blksz.h // begin bli_func.h // ----------------------------------------------------------------------------- // func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); // end bli_func.h // begin bli_mbool.h // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // 
----------------------------------------------------------------------------- mbool_t* bli_mbool_create ( bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_init ( mbool_t* b, bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_free( mbool_t* b ); // end bli_mbool.h // begin bli_cntx.h #ifndef BLIS_CNTX_H #define BLIS_CNTX_H // Context object type (defined in bli_type_defs.h) // ----------------------------------------------------------------------------- // // -- cntx_t query (fields only) ----------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx ) { return cntx->blkszs; } BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) { return cntx->bmults; } BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) { return cntx->l3_vir_ukrs; } BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs; } BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs_prefs; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) { return cntx->l3_sup_thresh; } BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) { return cntx->l3_sup_blkszs; } BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) { return cntx->l3_sup_kers; } BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) { return cntx->l3_sup_kers_prefs; } BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) { return cntx->l1f_kers; } BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) { return cntx->l1v_kers; } BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) { return cntx->packm_kers; } BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) { return cntx->unpackm_kers; } BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) { return cntx->method; } // 
----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* 
bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* 
cntx )
{
    func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx );

    return bli_func_get_dt( dt, func );
}

// -----------------------------------------------------------------------------

// Return the address of the packm func_t stored for ker_id, or NULL when
// ker_id lies outside [0, BLIS_NUM_PACKM_KERS).
BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx )
{
    func_t* func = NULL;

    // Only index to the requested packm func_t if the packm kernel being
    // requested is one that is explicitly supported.
    if ( 0 <= ( gint_t )ker_id &&
              ( gint_t )ker_id < BLIS_NUM_PACKM_KERS )
    {
        func_t* funcs = bli_cntx_packm_kers_buf( cntx );

        func = &funcs[ ker_id ];
    }

    return func;
}

// Return the packm function pointer for datatype dt and kernel ker_id, or
// NULL when ker_id lies outside [0, BLIS_NUM_PACKM_KERS).
BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx )
{
    void_fp fp = NULL;

    // Only query the context for the packm func_t (and then extract the
    // datatype-specific function pointer) if the packm kernel being
    // requested is one that is explicitly supported.
    // (The range test is repeated here so that bli_func_get_dt() is never
    // handed the NULL that bli_cntx_get_packm_kers() returns otherwise.)
    if ( 0 <= ( gint_t )ker_id &&
              ( gint_t )ker_id < BLIS_NUM_PACKM_KERS )
    {
        func_t* func = bli_cntx_get_packm_kers( ker_id, cntx );

        fp = bli_func_get_dt( dt, func );
    }

    return fp;
}

// Return the address of the unpackm func_t stored for ker_id, or NULL when
// ker_id lies outside [0, BLIS_NUM_UNPACKM_KERS).
BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx )
{
    func_t* func = NULL;

    // Only index to the requested unpackm func_t if the unpackm kernel being
    // requested is one that is explicitly supported.
    if ( 0 <= ( gint_t )ker_id &&
              ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS )
    {
        func_t* funcs = bli_cntx_unpackm_kers_buf( cntx );

        func = &funcs[ ker_id ];
    }

    return func;
}

// Return the unpackm function pointer for datatype dt and kernel ker_id, or
// NULL when ker_id lies outside [0, BLIS_NUM_UNPACKM_KERS).
BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx )
{
    void_fp fp = NULL;

    // Only query the context for the unpackm func_t (and then extract the
    // datatype-specific function pointer) if the unpackm kernel being
    // requested is one that is explicitly supported.
    if ( 0 <= ( gint_t )ker_id &&
              ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS )
    {
        func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx );

        fp = bli_func_get_dt( dt, func );
    }

    return fp;
}

// -----------------------------------------------------------------------------

// TRUE when the native level-3 ukernel ukr_id for datatype dt has a stored
// preference equal to TRUE (i.e. row storage).
BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
    const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx );

    // A ukernel preference of TRUE means the ukernel prefers row storage.
    return ( bool )
           ( prefs == TRUE );
}

// TRUE when the native level-3 ukernel ukr_id for datatype dt has a stored
// preference equal to FALSE (i.e. column storage).
BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
    const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx );

    // A ukernel preference of FALSE means the ukernel prefers column storage.
    return ( bool )
           ( prefs == FALSE );
}

// TRUE when obj is row-stored and the native ukernel prefers rows, or obj is
// column-stored and the ukernel prefers columns; FALSE in every other case.
BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
{
    // Note that we use the computation datatype, which may differ from the
    // storage datatype of C (when performing a mixed datatype operation).
    const num_t dt    = bli_obj_comp_dt( obj );
    const bool  ukr_prefers_rows
                      = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx );
    const bool  ukr_prefers_cols
                      = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx );
    bool        r_val = FALSE;

    if      ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE;
    else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE;

    return r_val;
}

// Logical negation of bli_cntx_l3_nat_ukr_prefers_storage_of().
BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
{
    return ( bool )
           !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx );
}

// -----------------------------------------------------------------------------

// Row-preference query for a virtual ukernel: when the context's method is
// not BLIS_NAT, dt is first projected to the real domain, then the native
// query is used.
BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
    // For induced methods, return the ukernel storage preferences of the
    // corresponding real micro-kernel.
    // NOTE: This projection to real domain becomes unnecessary if you
    // set the exec_dt for 1m to the real projection of the storage
    // datatype.
    if ( bli_cntx_method( cntx ) != BLIS_NAT )
        dt = bli_dt_proj_to_real( dt );

    return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx );
}

// Column-preference counterpart of bli_cntx_l3_vir_ukr_prefers_rows_dt().
BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx )
{
    // For induced methods, return the ukernel storage preferences of the
    // corresponding real micro-kernel.
    // NOTE: This projection to real domain becomes unnecessary if you
    // set the exec_dt for 1m to the real projection of the storage
    // datatype.
    if ( bli_cntx_method( cntx ) != BLIS_NAT )
        dt = bli_dt_proj_to_real( dt );

    return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx );
}

// TRUE when obj's storage (row or column) matches the preference reported by
// the virtual-ukernel queries above; FALSE otherwise.
BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
{
    // Note that we use the computation datatype, which may differ from the
    // storage datatype of C (when performing a mixed datatype operation).
    const num_t dt    = bli_obj_comp_dt( obj );
    const bool  ukr_prefers_rows
                      = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx );
    const bool  ukr_prefers_cols
                      = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx );
    bool        r_val = FALSE;

    if      ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE;
    else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE;

    return r_val;
}

// Logical negation of bli_cntx_l3_vir_ukr_prefers_storage_of().
BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx )
{
    return ( bool )
           !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx );
}

// -----------------------------------------------------------------------------

// TRUE when the sup kernel registered for (dt, stor_id) has a stored
// preference equal to TRUE (row storage).
BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
{
    const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx );

    // A ukernel preference of TRUE means the ukernel prefers row storage.
    return ( bool )
           ( prefs == TRUE );
}

// TRUE when the sup kernel registered for (dt, stor_id) has a stored
// preference equal to FALSE (column storage).
BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
{
    const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx );

    // A ukernel preference of FALSE means the ukernel prefers column storage.
    return ( bool )
           ( prefs == FALSE );
}

#if 0
// NOTE: These static functions aren't needed yet.

BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx )
{
    const num_t dt    = bli_obj_dt( obj );
    const bool  ukr_prefers_rows
                      = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
    const bool  ukr_prefers_cols
                      = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx );
    bool        r_val = FALSE;

    if      ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE;
    else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE;

    return r_val;
}

BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx )
{
    return ( bool )
           !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx );
}
#endif

// -----------------------------------------------------------------------------

//
// -- cntx_t modification (complex) --------------------------------------------
//

// NOTE: The framework does not use any of the following functions. We provide
// them in order to facilitate creating/modifying custom contexts.
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
// Static initializer for a rntm_t: automatic factorization on, no thread
// count or per-loop ways chosen (-1), packing of A/B off, level-3 sup
// handling on, and no allocator pointers attached.
// NOTE(review): bli_rntm_init() goes through bli_rntm_clear_ways_only(), which
// stores 1 (not -1) for the BLIS_KR entry, so the two initialization paths
// appear to differ in that one thrloop slot -- confirm whether that matters.
#define BLIS_RNTM_INITIALIZER \
        { \
          .auto_factor = TRUE, \
          .num_threads = -1, \
          .thrloop     = { -1, -1, -1, -1, -1, -1 }, \
          .pack_a      = FALSE, \
          .pack_b      = FALSE, \
          .l3_sup      = TRUE, \
          .sba_pool    = NULL, \
          .pba         = NULL, \
        }  \

// Runtime counterpart of BLIS_RNTM_INITIALIZER: puts every field of *rntm
// into its default state by calling the individual set/clear helpers.
BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
{
    bli_rntm_set_auto_factor_only( TRUE, rntm );

    bli_rntm_clear_num_threads_only( rntm );
    bli_rntm_clear_ways_only( rntm );
    bli_rntm_clear_pack_a( rntm );
    bli_rntm_clear_pack_b( rntm );
    bli_rntm_clear_l3_sup( rntm );

    bli_rntm_clear_sba_pool( rntm );
    bli_rntm_clear_pba( rntm );
}

// -- rntm_t total thread calculation ------------------------------------------

// Return the product of the ways recorded for the NC, KC, MC, NR and MR
// loops. The BLIS_KR entry is not part of the product, and the values are
// multiplied as stored (no check for the -1 "unset" marker is made here).
BLIS_INLINE dim_t bli_rntm_calc_num_threads
     (
       rntm_t*  restrict rntm
     )
{
    dim_t n_threads;

    n_threads  = bli_rntm_ways_for( BLIS_NC, rntm );
    n_threads *= bli_rntm_ways_for( BLIS_KC, rntm );
    n_threads *= bli_rntm_ways_for( BLIS_MC, rntm );
    n_threads *= bli_rntm_ways_for( BLIS_NR, rntm );
    n_threads *= bli_rntm_ways_for( BLIS_MR, rntm );

    return n_threads;
}

// -----------------------------------------------------------------------------

// Function prototypes

BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm );

BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op
     (
       opid_t  l3_op,
       side_t  side,
       dim_t   m,
       dim_t   n,
       dim_t   k,
       rntm_t* rntm
     );

void bli_rntm_set_ways_from_rntm
     (
       dim_t   m,
       dim_t   n,
       dim_t   k,
       rntm_t* rntm
     );

void bli_rntm_set_ways_from_rntm_sup
     (
       dim_t   m,
       dim_t   n,
       dim_t   k,
       rntm_t* rntm
     );

void bli_rntm_print
     (
       rntm_t* rntm
     );

dim_t bli_rntm_calc_num_threads_in
     (
       bszid_t* restrict bszid_cur,
       rntm_t*  restrict rntm
     );

#endif

// end bli_rntm.h
// begin bli_gks.h


#ifndef BLIS_GKS_H
#define BLIS_GKS_H

// Prototypes for the "gks" context registry. Only the entry points marked
// BLIS_EXPORT_BLIS carry the library's export annotation.

void                       bli_gks_init( void );
void                       bli_gks_finalize( void );

void                       bli_gks_init_index( void );

cntx_t*                    bli_gks_lookup_nat_cntx( arch_t id );
cntx_t*                    bli_gks_lookup_ind_cntx( arch_t id, ind_t ind );
cntx_t**                   bli_gks_lookup_id( arch_t id );
void                       bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp );

BLIS_EXPORT_BLIS cntx_t*   bli_gks_query_cntx( void );
BLIS_EXPORT_BLIS cntx_t*   bli_gks_query_nat_cntx( void );

cntx_t*                    bli_gks_query_cntx_noinit( void );

BLIS_EXPORT_BLIS cntx_t*   bli_gks_query_ind_cntx( ind_t ind, num_t dt );

BLIS_EXPORT_BLIS void      bli_gks_init_ref_cntx( cntx_t* cntx );

bool                       bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx );

BLIS_EXPORT_BLIS char*     bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt );
BLIS_EXPORT_BLIS kimpl_t   bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt );

//char*                      bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt );

#endif

// end bli_gks.h
// begin bli_ind.h


#ifndef BLIS_IND_H
#define BLIS_IND_H

// level-3 induced method management
// begin bli_l3_ind.h


#ifndef BLIS_L3_IND_H
#define BLIS_L3_IND_H

// -----------------------------------------------------------------------------

// GENPROT declares one "<opname>ind_find_avail" prototype (name assembled by
// PASTEMAC) for each level-3 operation listed below.
#undef  GENPROT
#define GENPROT( opname ) \
\
ind_t PASTEMAC(opname,ind_find_avail)( num_t dt );

GENPROT( gemm )
GENPROT( gemmt )
GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )
GENPROT( trmm )
GENPROT( trsm )

// -----------------------------------------------------------------------------

//bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt );

ind_t   bli_l3_ind_oper_find_avail( opid_t oper, num_t dt );

void    bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status );

void    bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt );
void    bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status );

void    bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status );
bool    bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt );

bool    bli_l3_ind_oper_is_impl( opid_t oper, ind_t method );


#endif

// end bli_l3_ind.h


void                   bli_ind_init( void );
void                   bli_ind_finalize( void );

BLIS_EXPORT_BLIS void  bli_ind_enable( ind_t method );
BLIS_EXPORT_BLIS void  bli_ind_disable( ind_t method );
BLIS_EXPORT_BLIS void  bli_ind_disable_all( void );

BLIS_EXPORT_BLIS void  bli_ind_enable_dt( ind_t method, num_t dt );
BLIS_EXPORT_BLIS void  bli_ind_disable_dt( ind_t method, num_t dt );
BLIS_EXPORT_BLIS void  bli_ind_disable_all_dt( num_t dt );

BLIS_EXPORT_BLIS void  bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt );

BLIS_EXPORT_BLIS bool  bli_ind_oper_is_impl( opid_t oper, ind_t method );
BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt );
BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt );

char*                  bli_ind_get_impl_string( ind_t method );
num_t                  bli_ind_map_cdt_to_index( num_t dt );


#endif

// end bli_ind.h
// begin bli_pba.h


// NOTE: the include guard still uses the former "membrk" name.
#ifndef BLIS_MEMBRK_H
#define BLIS_MEMBRK_H

// Packing block allocator (formerly memory broker)


// pba init

//BLIS_INLINE void bli_pba_init_mutex( pba_t* pba )
//{
//    bli_pthread_mutex_init( &(pba->mutex), NULL );
//}

//BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba )
//{
//    bli_pthread_mutex_destroy( &(pba->mutex) );
//}

// pba query

// Return the address of entry pool_index in the allocator's pools array
// (no bounds check is performed here).
BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba )
{
    return &(pba->pools[ pool_index ]);
}

BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba )
{
    return pba->align_size;
}

BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba )
{
    return pba->malloc_fp;
}

BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba )
{
    return pba->free_fp;
}

// pba modification

BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba )
{
    pba->align_size = align_size;
}

BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba )
{
    pba->malloc_fp = malloc_fp;
}

BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba )
{
    pba->free_fp = free_fp;
}

// pba action

// Acquire / release the mutex embedded in the pba_t.

BLIS_INLINE void bli_pba_lock( pba_t* pba )
{
    bli_pthread_mutex_lock( &(pba->mutex) );
}

BLIS_INLINE void bli_pba_unlock( pba_t* pba )
{
    bli_pthread_mutex_unlock( &(pba->mutex) );
}

// -----------------------------------------------------------------------------

BLIS_EXPORT_BLIS pba_t* bli_pba_query( void );

void bli_pba_init
     (
       cntx_t* cntx
     );
void bli_pba_finalize
     (
       void
     );
void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif // end bli_pba.h // begin bli_pool.h #ifndef BLIS_POOL_H #define BLIS_POOL_H // -- Pool block type -- // -- Pool type -- // Pool block query BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) { return pblk->buf; } BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) { return pblk->block_size; } // Pool block modification BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk ) { pblk->buf = buf; } BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) { pblk->block_size = block_size; } // // -- pool block initialization ------------------------------------------------ // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the pblk_t type definition. An alternative to the initializer is // calling bli_pblk_clear() at runtime. 
// Static initializer for a pblk_t: no buffer, zero block size.
#define BLIS_PBLK_INITIALIZER \
        { \
          .buf        = NULL, \
          .block_size = 0, \
        }  \

// Runtime equivalent of BLIS_PBLK_INITIALIZER: NULL buffer, block size 0.
BLIS_INLINE void bli_pblk_clear( pblk_t* pblk )
{
    bli_pblk_set_buf( NULL, pblk );
    bli_pblk_set_block_size( 0, pblk );
}

// Pool entry query
// (each accessor returns the named pool_t field unchanged)

BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool )
{
    return pool->block_ptrs;
}

BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool )
{
    return pool->block_ptrs_len;
}

BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool )
{
    return pool->num_blocks;
}

BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool )
{
    return pool->block_size;
}

BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool )
{
    return pool->align_size;
}

BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool )
{
    return pool->offset_size;
}

BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool )
{
    return pool->malloc_fp;
}

BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool )
{
    return pool->free_fp;
}

BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool )
{
    return pool->top_index;
}

// TRUE when top_index equals num_blocks.
BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool )
{
    return ( bool )
           ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) );
}

// Pool entry modification
// (each setter writes the named pool_t field; the backslash after each
// signature merely splices the signature line onto the opening brace)

BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \
{
    pool->block_ptrs = block_ptrs;
}

BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \
{
    pool->block_ptrs_len = block_ptrs_len;
}

BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \
{
    pool->num_blocks = num_blocks;
}

BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \
{
    pool->block_size = block_size;
}

BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \
{
    pool->align_size = align_size;
}

BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \
{
    pool->offset_size = offset_size;
}

BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \
{
    pool->malloc_fp = malloc_fp;
}

BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* pool ) \
{
    pool->free_fp = free_fp;
}

BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \
{
    pool->top_index = top_index;
}

// -----------------------------------------------------------------------------

void bli_pool_init
     (
       siz_t            num_blocks,
       siz_t            block_ptrs_len,
       siz_t            block_size,
       siz_t            align_size,
       siz_t            offset_size,
       malloc_ft        malloc_fp,
       free_ft          free_fp,
       pool_t* restrict pool
     );
void bli_pool_finalize
     (
       pool_t* restrict pool
     );
void bli_pool_reinit
     (
       siz_t            num_blocks_new,
       siz_t            block_ptrs_len_new,
       siz_t            block_size_new,
       siz_t            align_size_new,
       siz_t            offset_size_new,
       pool_t* restrict pool
     );

void bli_pool_checkout_block
     (
       siz_t            req_size,
       pblk_t* restrict block,
       pool_t* restrict pool
     );
void bli_pool_checkin_block
     (
       pblk_t* restrict block,
       pool_t* restrict pool
     );

void bli_pool_grow
     (
       siz_t            num_blocks_add,
       pool_t* restrict pool
     );
void bli_pool_shrink
     (
       siz_t            num_blocks_sub,
       pool_t* restrict pool
     );

void bli_pool_alloc_block
     (
       siz_t            block_size,
       siz_t            align_size,
       siz_t            offset_size,
       malloc_ft        malloc_fp,
       pblk_t* restrict block
     );
void bli_pool_free_block
     (
       siz_t            offset_size,
       free_ft          free_fp,
       pblk_t* restrict block
     );

void bli_pool_print
     (
       pool_t* restrict pool
     );
void bli_pblk_print
     (
       pblk_t* restrict pblk
     );

#endif

// end bli_pool.h
// begin bli_array.h


#ifndef BLIS_ARRAY_H
#define BLIS_ARRAY_H

// -- Array type --


// Array entry query

BLIS_INLINE void* bli_array_buf( array_t* array )
{
    return array->buf;
}

BLIS_INLINE siz_t bli_array_num_elem( array_t* array )
{
    return array->num_elem;
}

BLIS_INLINE siz_t bli_array_elem_size( array_t* array )
{
    return array->elem_size;
}

// Array entry modification

BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \
{
    array->buf = buf;
}

BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \
{
    array->num_elem = num_elem;
}

BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \
{
    array->elem_size = elem_size;
}

// -----------------------------------------------------------------------------

void bli_array_init
     (
       const siz_t       num_elem,
       const siz_t       elem_size,
       array_t* restrict array
     );
void bli_array_resize
     (
       const siz_t       num_elem_new,
       array_t* restrict array
     );
void bli_array_finalize
     (
       array_t* restrict array
     );

void* bli_array_elem
     (
       const siz_t       index,
       array_t* restrict array
     );
void bli_array_set_elem
     (
       void*    restrict elem,
       const siz_t       index,
       array_t* restrict array
     );

#endif

// end bli_array.h
// begin bli_apool.h


#ifndef BLIS_APOOL_H
#define BLIS_APOOL_H

// -- Locked pool-of-arrays type --


// apool entry query

// Return the address of the pool_t embedded in the apool_t.
BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool )
{
    return &(apool->pool);
}

// Return the address of the mutex embedded in the apool_t.
BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool )
{
    return &(apool->mutex);
}

BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool )
{
    return pool->def_array_len;
}

// Forward the exhaustion test to the embedded pool_t.
BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool )
{
    pool_t* restrict pool = bli_apool_pool( apool );

    return bli_pool_is_exhausted( pool );
}

// apool action

BLIS_INLINE void bli_apool_lock( apool_t* apool )
{
    bli_pthread_mutex_lock( bli_apool_mutex( apool ) );
}

BLIS_INLINE void bli_apool_unlock( apool_t* apool )
{
    bli_pthread_mutex_unlock( bli_apool_mutex( apool ) );
}

// apool entry modification

BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \
{
    pool->def_array_len = def_array_len;
}

// -----------------------------------------------------------------------------

void bli_apool_init
     (
       apool_t* restrict apool
     );
void bli_apool_finalize
     (
       apool_t* restrict apool
     );

array_t* bli_apool_checkout_array
     (
       siz_t             n_threads,
       apool_t* restrict apool
     );
void bli_apool_checkin_array
     (
       array_t* restrict array,
       apool_t* restrict apool
     );

pool_t* bli_apool_array_elem
     (
       siz_t             index,
       array_t* restrict array
     );

void bli_apool_grow
     (
       siz_t             num_blocks_add,
       apool_t* restrict apool
     );

void bli_apool_alloc_block
     (
       siz_t              num_elem,
       array_t** restrict array_p
     );
void bli_apool_free_block
     (
       array_t* restrict array
     );


#endif

// end bli_apool.h
// begin bli_sba.h


#ifndef BLIS_SBA_H
#define BLIS_SBA_H

apool_t* bli_sba_query( void );

// -----------------------------------------------------------------------------

void bli_sba_init( void );
void bli_sba_finalize( void );

array_t* bli_sba_checkout_array
     (
       const siz_t n_threads
     );

void bli_sba_checkin_array
     (
       array_t* restrict array
     );

void bli_sba_rntm_set_pool
     (
       siz_t             index,
       array_t* restrict array,
       rntm_t*  restrict rntm
     );

void* bli_sba_acquire
     (
       rntm_t* restrict rntm,
       siz_t            req_size
     );
void bli_sba_release
     (
       rntm_t* restrict rntm,
       void*   restrict block
     );


#endif

// end bli_sba.h
// begin bli_memsys.h


#ifndef BLIS_MEMSYS_H
#define BLIS_MEMSYS_H

// -----------------------------------------------------------------------------

void bli_memsys_init( void );
void bli_memsys_finalize( void );


#endif

// end bli_memsys.h
// begin bli_mem.h



#ifndef BLIS_MEM_H
#define BLIS_MEM_H


// mem_t object type (defined in bli_type_defs.h)

//
// -- mem_t query --------------------------------------------------------------
//

// Return the address of the pblk_t embedded in the mem_t.
BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem )
{
    return &(mem->pblk);
}

// Return the buffer pointer held by the embedded pblk_t.
BLIS_INLINE void* bli_mem_buffer( mem_t* mem )
{
    return bli_pblk_buf( bli_mem_pblk( mem ) );
}

BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem )
{
    return mem->buf_type;
}

BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem )
{
    return mem->pool;
}

BLIS_INLINE siz_t bli_mem_size( mem_t* mem )
{
    return mem->size;
}

// A mem_t counts as allocated exactly when its buffer pointer is non-NULL.
BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem )
{
    return ( bool )
           ( bli_mem_buffer( mem ) != NULL );
}

BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem )
{
    return ( bool )
           ( bli_mem_buffer( mem ) == NULL );
}


//
// -- mem_t modification -------------------------------------------------------
//

// Copy *pblk (by value) into the mem_t.
BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem )
{
    mem->pblk = *pblk;
}

BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem )
{
    bli_pblk_set_buf( buf, &(mem->pblk) );
}

BLIS_INLINE void bli_mem_set_buf_type( packbuf_t buf_type, mem_t* mem )
{
    mem->buf_type = buf_type;
}

BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem )
{
    mem->pool = pool;
}

BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem )
{
    mem->size = size;
}

//
// -- mem_t initialization -----------------------------------------------------
//

// NOTE: This initializer macro must be updated whenever fields are added or
// removed from the mem_t type definition. An alternative to the initializer is
// calling bli_mem_clear() at runtime.

#define BLIS_MEM_INITIALIZER \
        { \
          .pblk        = BLIS_PBLK_INITIALIZER, \
          .buf_type    = -1, \
          .pool        = NULL, \
          .size        = 0, \
        }  \

// Reset a mem_t: NULL buffer, NULL pool, size 0. The buf_type written differs
// by language: -1 when compiled as C (matching BLIS_MEM_INITIALIZER), and
// BLIS_BUFFER_FOR_GEN_USE when compiled as C++. The block size inside the
// embedded pblk_t is not touched.
BLIS_INLINE void bli_mem_clear( mem_t* mem )
{
    bli_mem_set_buffer( NULL, mem );
#ifdef __cplusplus
    const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE;
    // When using C++, which is strongly typed, we avoid use of -1 as a
    // packbuf_t value since it will result in a compile-time error.
    bli_mem_set_buf_type( pb, mem );
#else
    bli_mem_set_buf_type( ( packbuf_t )-1, mem );
#endif
    bli_mem_set_pool( NULL, mem );
    bli_mem_set_size( 0, mem );
}


#endif
// end bli_mem.h
// begin bli_part.h


// begin bli_part_check.h


void bli_acquire_mpart_t2b_check( subpart_t  requested_part,
                                  dim_t  i,
                                  dim_t  b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_acquire_mpart_l2r_check( subpart_t  requested_part,
                                  dim_t  j,
                                  dim_t  b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_acquire_mpart_tl2br_check( subpart_t  requested_part,
                                    dim_t  ij,
                                    dim_t  b,
                                    obj_t* obj,
                                    obj_t* sub_obj );

// end bli_part_check.h

// -- Matrix partitioning ------------------------------------------------------

BLIS_EXPORT_BLIS void bli_acquire_mpart
     (
       dim_t     i,
       dim_t     j,
       dim_t     m,
       dim_t     n,
       obj_t*    obj,
       obj_t*    sub_obj
     );

// GENPROT here declares one exported prototype per directional matrix
// partitioning routine; all share the (req_part, i, b, obj, sub_obj)
// parameter list.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
     ( \
       subpart_t req_part, \
       dim_t     i, \
       dim_t     b, \
       obj_t*    obj, \
       obj_t*    sub_obj \
     );

GENPROT( acquire_mpart_t2b )
GENPROT( acquire_mpart_b2t )
GENPROT( acquire_mpart_l2r )
GENPROT( acquire_mpart_r2l )
GENPROT( acquire_mpart_tl2br )
GENPROT( acquire_mpart_br2tl )


// Same as above, with a leading dir_t argument.
#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
     ( \
       dir_t     direct, \
       subpart_t req_part, \
       dim_t     i, \
       dim_t     b, \
       obj_t*    obj, \
       obj_t*    sub_obj \
     );

GENPROT( acquire_mpart_mdim )
GENPROT( acquire_mpart_ndim )
GENPROT( acquire_mpart_mndim )


// -- Vector partitioning ------------------------------------------------------

#undef  GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
     ( \
       subpart_t req_part, \
       dim_t     i, \
       dim_t     b, \
       obj_t*    obj, \
       obj_t*    sub_obj \
     );

GENPROT( acquire_vpart_f2b )
GENPROT( acquire_vpart_b2f )

// -- Scalar acquisition -------------------------------------------------------

BLIS_EXPORT_BLIS void bli_acquire_mij
     (
       dim_t     i,
       dim_t     j,
       obj_t*    obj,
       obj_t*    sub_obj
     );

BLIS_EXPORT_BLIS void bli_acquire_vi
     (
       dim_t     i,
       obj_t*    obj,
       obj_t*    sub_obj
     );

// end bli_part.h
// begin bli_prune.h


void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p,
                             obj_t* s, mdim_t mdim_s );
// end bli_prune.h
// begin bli_query.h


BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b );

BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b );

BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a );
// end bli_query.h
// begin bli_auxinfo.h


#ifndef BLIS_AUXINFO_MACRO_DEFS_H
#define BLIS_AUXINFO_MACRO_DEFS_H


// auxinfo_t field query
// (each accessor returns the named auxinfo_t field unchanged; note that
// next_a/next_b read the fields named a_next/b_next)

BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai )
{
    return ai->schema_a;
}
BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai )
{
    return ai->schema_b;
}

BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai )
{
    return ai->a_next;
}
BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai )
{
    return ai->b_next;
}

BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai )
{
    return ai->is_a;
}
BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai )
{
    return ai->is_b;
}

BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai )
{
    return ai->ps_a;
}
BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai )
{
    return ai->ps_b;
}

BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai )
{
    return ai->ukr;
}
BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai )
{
    return ai->params;
}


// auxinfo_t field modification

BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai )
{
    ai->schema_a = schema;
}
BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai )
{
    ai->schema_b = schema;
}

BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai )
{
    ai->a_next = p;
}
BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai )
{
    ai->b_next = p;
}
// Set both "next" pointers in one call.
BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai )
{
    ai->a_next = ap;
    ai->b_next = bp;
}

BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai )
{
    ai->is_a = is;
}
BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai )
{
    ai->is_b = is;
}

BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai )
{
    ai->ps_a = ps;
}
BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai )
{
    ai->ps_b = ps;
}

BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai )
{
    ai->ukr = ukr;
}
BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai )
{
    ai->params = params;
}

#endif

// end bli_auxinfo.h
// begin bli_param_map.h



// --- BLIS to BLAS/LAPACK mappings --------------------------------------------

BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval );


// --- BLAS/LAPACK to BLIS mappings --------------------------------------------

// NOTE: These static functions were converted from regular functions in order
// to reduce function call overhead within the BLAS compatibility layer.
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( 
subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); err_t bli_check_valid_cntl( void* cntl ); err_t bli_check_packm_schema_on_unpack( obj_t* a ); err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); // end bli_check.h // begin bli_error.h BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); void bli_print_msg( char* str, char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); char* bli_error_string_for_code( gint_t code ); // end bli_error.h // begin bli_f2c.h // f2c.h -- Standard Fortran to C header file // barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." // - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) #ifndef BLIS_F2C_H #define BLIS_F2C_H typedef f77_int bla_integer; typedef f77_char bla_character; //typedef char *address; //typedef short int shortint; typedef float bla_real; typedef double bla_double; typedef scomplex bla_scomplex; typedef dcomplex bla_dcomplex; typedef f77_int bla_logical; //typedef short int shortlogical; //typedef char logical1; //typedef char integer1; #ifdef INTEGER_STAR_8 // Adjust for integer*8. 
typedef long long longint; // system-dependent typedef unsigned long long ulongint; // system-dependent #define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b))) #define qbit_set(a,b) ((a) | ((ulongint)1 << (b))) #endif #ifndef TRUE_ #define TRUE_ (1) #endif #ifndef FALSE_ #define FALSE_ (0) #endif // Extern is for use with -E #ifndef Extern #define Extern extern #endif // I/O stuff #ifdef f2c_i2 // for -i2 //typedef short flag; //typedef short ftnlen; typedef bla_integer ftnlen; //typedef short ftnint; #else //typedef long int flag; //typedef long int ftnlen; typedef bla_integer ftnlen; //typedef long int ftnint; #endif #ifndef VOID #define VOID void #endif #ifndef f2c_abs #define f2c_abs(x) ((x) >= 0 ? (x) : -(x)) #endif #ifndef f2c_dabs #define f2c_dabs(x) (doublereal)f2c_abs(x) #endif #ifndef f2c_min #define f2c_min(a,b) ((a) <= (b) ? (a) : (b)) #endif #ifndef f2c_max #define f2c_max(a,b) ((a) >= (b) ? (a) : (b)) #endif #ifndef f2c_dmin #define f2c_dmin(a,b) (doublereal)f2c_min(a,b) #endif #ifndef f2c_dmax #define f2c_dmax(a,b) (doublereal)f2c_max(a,b) #endif #ifndef bit_test #define bit_test(a,b) ((a) >> (b) & 1) #endif #ifndef bit_clear #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) #endif #ifndef bit_set #define bit_set(a,b) ((a) | ((uinteger)1 << (b))) #endif // undef any lower-case symbols that your C compiler predefines, e.g.: #ifndef Skip_f2c_Undefs #undef cray #undef gcos #undef mc68010 #undef mc68020 #undef mips #undef pdp11 #undef sgi #undef sparc #undef sun #undef sun2 #undef sun3 #undef sun4 #undef u370 #undef u3b #undef u3b2 #undef u3b5 #undef unix #undef vax #endif #endif // end bli_f2c.h // begin bli_machval.h // begin bli_lsame.h bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len ); // end bli_lsame.h // begin bli_slamch.h bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len ); // end bli_slamch.h // begin bli_dlamch.h bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len ); // end 
bli_dlamch.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v ); // // Prototype BLAS-like interfaces. // #undef GENTPROTR #define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \ ( \ machval_t mval, \ void* v \ ); INSERT_GENTPROTR_BASIC0( machval ) // end bli_machval.h // begin bli_getopt.h typedef struct getopt_s { char* optarg; int optind; int opterr; int optopt; } getopt_t; BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state ); BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ); // end bli_getopt.h // begin bli_opid.h BLIS_INLINE bool bli_opid_is_level3( opid_t opid ) { return ( bool ) ( BLIS_GEMM <= opid && opid <= BLIS_TRSM ); } // end bli_opid.h // begin bli_cntl.h // -- Control tree prototypes -- BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, void* params, cntl_t* sub_node ); BLIS_EXPORT_BLIS void bli_cntl_free_node ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_clear_node ( cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy ( rntm_t* rntm, cntl_t* cntl ); BLIS_EXPORT_BLIS void bli_cntl_mark_family ( opid_t family, cntl_t* cntl ); // ----------------------------------------------------------------------------- dim_t bli_cntl_calc_num_threads_in ( rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- // cntl_t query (fields only) BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl ) { return cntl->family; } 
BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl ) { return cntl->bszid; } BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl ) { return cntl->var_func; } BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl ) { return cntl->sub_prenode; } BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl ) { return cntl->sub_node; } BLIS_INLINE void* bli_cntl_params( cntl_t* cntl ) { return cntl->params; } BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl ) { // The first 64 bytes is always the size of the params structure. return *( ( uint64_t* )(cntl->params) ); } BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) { return &(cntl->pack_mem); } // cntl_t query (complex) BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl ) { return ( bool ) ( cntl == NULL ); } BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl ) { return ( bool ) ( bli_cntl_sub_node( cntl ) == NULL ); } BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl ) { return ( bool ) ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); } // cntl_t modification BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl ) { cntl->family = family; } BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl ) { cntl->bszid = bszid; } BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl ) { cntl->var_func = var_func; } BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl ) { cntl->sub_prenode = sub_prenode; } BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl ) { cntl->sub_node = sub_node; } BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl ) { cntl->params = params; } BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) { cntl->pack_mem = *pack_mem; } // end bli_cntl.h // begin bli_env.h #ifndef BLIS_ENV_H #define BLIS_ENV_H gint_t bli_env_get_var( const char* env, gint_t fallback ); //void bli_env_set_var( const char* env, dim_t value ); #endif // end bli_env.h // begin bli_pack.h #ifndef BLIS_PACK_H #define BLIS_PACK_H 
// Not exported (no BLIS_EXPORT_BLIS), unlike the pack_a/pack_b accessors below.
void bli_pack_init( void );
void bli_pack_finalize( void );

// The getters write their result through a bool* output argument; the setters
// take the new value directly.
BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a );
BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b );
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a );
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b );

void bli_pack_init_rntm_from_env( rntm_t* rntm );

#endif

// end bli_pack.h
// begin bli_info.h

// -- General library information ----------------------------------------------

BLIS_EXPORT_BLIS char* bli_info_get_version_str( void );
BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void );


// -- General configuration-related --------------------------------------------
// Every query in this group takes no arguments and returns a gint_t.

BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void ); // -- Kernel implementation-related -------------------------------------------- // -- Level-3 kernel definitions -- BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ); // -- BLIS implementation query (level-3) -------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt ); BLIS_EXPORT_BLIS 
char* bli_info_get_trmm3_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt ); // end bli_info.h // begin bli_arch.h #ifndef BLIS_ARCH_H #define BLIS_ARCH_H BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void ); void bli_arch_set_id_once( void ); void bli_arch_set_id( void ); BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id ); void bli_arch_set_logging( bool dolog ); bool bli_arch_get_logging( void ); void bli_arch_log( char*, ... ); #endif // end bli_arch.h // begin bli_cpuid.h #if 0 // Used only during standalone testing of ARM support. #define FALSE 0 #define TRUE 1 typedef enum { BLIS_ARCH_CORTEXA57 = 10, BLIS_ARCH_CORTEXA15 = 11, BLIS_ARCH_CORTEXA9 = 12, BLIS_ARCH_GENERIC = 13 } arch_t; typedef uint64_t bool; #define bli_abort abort #endif #ifndef BLIS_CPUID_H #define BLIS_CPUID_H arch_t bli_cpuid_query_id( void ); // Intel bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features ); // AMD bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features ); bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features ); // ARM bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, 
uint32_t features ); bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features ); bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features ); uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features ); // ----------------------------------------------------------------------------- // // This section of the file was based off of cpuid.hpp from TBLIS [1]. // // [1] https://github.com/devinamatthews/tblis // BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want ) { return ( have & want ) == want; } // ----------------------------------------------------------------------------- #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) // cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393 // for more information why this move was made. 
//#include "cpuid.h" void get_cpu_name( char *cpu_name ); int vpu_count( void ); enum { VENDOR_INTEL = 0, VENDOR_AMD, VENDOR_UNKNOWN }; enum { FEATURE_SSE3 = 0x0001, FEATURE_SSSE3 = 0x0002, FEATURE_SSE41 = 0x0004, FEATURE_SSE42 = 0x0008, FEATURE_AVX = 0x0010, FEATURE_AVX2 = 0x0020, FEATURE_FMA3 = 0x0040, FEATURE_FMA4 = 0x0080, FEATURE_AVX512F = 0x0100, FEATURE_AVX512DQ = 0x0200, FEATURE_AVX512PF = 0x0400, FEATURE_AVX512ER = 0x0800, FEATURE_AVX512CD = 0x1000, FEATURE_AVX512BW = 0x2000, FEATURE_AVX512VL = 0x4000 }; #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ); enum { VENDOR_ARM = 0, VENDOR_UNKNOWN }; enum { MODEL_ARMV7 = 0, MODEL_ARMV8, MODEL_UNKNOWN }; enum { FEATURE_NEON = 0x01, FEATURE_SVE = 0x02 }; #endif #endif // end bli_cpuid.h // begin bli_string.h void bli_string_mkupper( char* s ); // end bli_string.h // begin bli_setgetijm.h BLIS_EXPORT_BLIS err_t bli_setijm ( double ar, double ai, dim_t i, dim_t j, obj_t* b ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs \ ); INSERT_GENTPROT_BASIC0( setijm ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijm ( dim_t i, dim_t j, obj_t* b, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijm ) // end bli_setgetijm.h // begin bli_setgetijv.h BLIS_EXPORT_BLIS err_t bli_setijv ( double ar, double ai, dim_t i, obj_t* x ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ void* restrict x, inc_t incx \ ); INSERT_GENTPROT_BASIC0( 
setijv ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijv ( dim_t i, obj_t* x, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ void* restrict b, inc_t incx, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijv ) // end bli_setgetijv.h // begin bli_setri.h // -- setr --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setrm ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setrv ( obj_t* alpha, obj_t* x ); // -- seti --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setim ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setiv ( obj_t* alpha, obj_t* x ); // end bli_setri.h // begin bli_castm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castm ) INSERT_GENTPROT2_MIXDP0( castm ) // // Prototype object-based _check() function. // void bli_castm_check ( obj_t* a, obj_t* b ); // end bli_castm.h // begin bli_castnzm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castnzm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. 
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( addsc ) INSERT_GENTPROT_BASIC0( divsc ) INSERT_GENTPROT_BASIC0( mulsc ) INSERT_GENTPROT_BASIC0( subsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( invertsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTPROTR_BASIC0( absqsc ) INSERT_GENTPROTR_BASIC0( normfsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTPROT_BASIC0( sqrtsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTPROT_BASIC0( getsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTPROT_BASIC0( setsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTPROTR_BASIC0( unzipsc ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ ctype_r* zeta_r, \ ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTPROTR_BASIC0( zipsc ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_igetsc ( dim_t* chi, double* zeta_r, double* zeta_i ); BLIS_EXPORT_BLIS void bli_isetsc ( double zeta_r, double zeta_i, dim_t* chi ); // end bli_l0_tapi.h // begin bli_l0_ft.h // // -- Level-0 function types 
--------------------------------------------------- // // addsc, divsc, subsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( addsc ) INSERT_GENTDEF( divsc ) INSERT_GENTDEF( subsc ) // invertsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi \ ); INSERT_GENTDEF( invertsc ) // mulsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( mulsc ) // absqsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* absq \ ); INSERT_GENTDEFR( absqsc ) // normfsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* norm \ ); INSERT_GENTDEFR( normfsc ) // sqrtsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype* psi \ ); INSERT_GENTDEF( sqrtsc ) // getsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ double* zeta_r, \ double* zeta_i \ ); INSERT_GENTDEF( getsc ) // setsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ double zeta_r, \ double zeta_i, \ ctype* chi \ ); INSERT_GENTDEF( setsc ) // unzipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype* chi, \ ctype_r* zeta_r, \ ctype_r* zeta_i \ ); INSERT_GENTDEFR( unzipsc ) // zipsc #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ ctype_r* zeta_r, \ 
ctype_r* zeta_i, \ ctype* chi \ ); INSERT_GENTDEFR( zipsc ) // end bli_l0_ft.h // Generate function pointer arrays for tapi functions. // begin bli_l0_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( absqsc ) GENPROT( normfsc ) GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( subsc ) GENPROT( invertsc ) GENPROT( sqrtsc ) GENPROT( unzipsc ) GENPROT( zipsc ) GENPROT( getsc ) GENPROT( setsc ) // end bli_l0_fpa.h // copysc // begin bli_copysc.h // // Prototype object-based interfaces. // #undef GENFRONT #define GENFRONT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENFRONT( copysc ) // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \ ( \ conj_t conjchi, \ void* chi, \ void* psi \ ); INSERT_GENTPROT2_BASIC0( copysc ) INSERT_GENTPROT2_MIX_D0( copysc ) INSERT_GENTPROT2_MIX_P0( copysc ) // end bli_copysc.h // end bli_l0.h // -- Level-1v operations -- // begin bli_l1v.h // begin bli_l1v_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h" // begin bli_l1v_ft_ker.h #ifndef BLIS_L1V_FT_KER_H #define BLIS_L1V_FT_KER_H // // -- Level-1v kernel function types ------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict index, \ cntx_t* cntx \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ cntx_t* cntx \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ cntx_t* cntx \ ); 
INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( xpbyv ) #endif // end bli_l1v_ft_ker.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1v_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyv ) // end 
bli_l1v_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1v_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyv ) // end bli_l1v_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed 
APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The expansion starts with a comma because it is appended directly after the
// last ordinary parameter of each prototype and typedef below.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h
// begin bli_l1d_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// With the settings above, every prototype in this pass carries the
// BLIS_TAPI_EX_SUF suffix and the trailing cntx/rntm parameters.
// NOTE(review): INSERT_GENTPROT_BASIC0 / INSERT_GENTPROTR_BASIC0 are defined
// outside this excerpt; presumably they instantiate GENTPROT / GENTPROTR once
// per supported datatype -- confirm.

// addd, copyd, subd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid
// The only prototype here generated through GENTPROTR: alpha is a ctype_r*
// while x is a ctype*.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )
// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//

// Each typedef below mirrors, parameter for parameter, the prototype of the
// same operation declared above, as a pointer-to-function type whose name is
// formed by PASTECH3(ch,opname,EX_SUF,tsuf).

// addd, copyd, subd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )
// end bli_l1d_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// Second pass over the bli_l1d_tapi.h / bli_l1d_ft.h text. It follows
// bli_tapi_ba.h, which leaves EX_SUF and BLIS_TAPI_EX_PARAMS empty, so the
// names produced here carry no suffix and no trailing expert parameters.

// addd, copyd, subd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid
// Generated through GENTPROTR: alpha is a ctype_r* while x is a ctype*.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )
// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//

// addd, copyd, subd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )
// end bli_l1d_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1d_fpa.h

//
// Prototype function pointer query interface.
//

// One query function per level-1d operation: it takes a num_t and returns
// the type named by PASTECH2(op,BLIS_TAPI_EX_SUF,_vft). The names use
// BLIS_TAPI_EX_SUF directly because EX_SUF was un-defined just above.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addd )
GENPROT( copyd )
GENPROT( subd )
GENPROT( axpyd )
GENPROT( scal2d )
GENPROT( invertd )
GENPROT( scald )
GENPROT( setd )
GENPROT( setid )
GENPROT( shiftd )
GENPROT( xpbyd )
// end bli_l1d_fpa.h
// end bli_l1d.h

// -- Level-1f operations --
// begin bli_l1f.h
// begin bli_l1f_check.h

//
// Prototype object-based check functions.
//

// One void-returning checker per level-1f operation, named by
// PASTEMAC(opname,_check). Every operand is passed as an obj_t*.

// axpy2v
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alphax, \
       obj_t* alphay, \
       obj_t* x, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* y \
     );

GENTPROT( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* xt, \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho, \
       obj_t* z \
     );

GENTPROT( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* at, \
       obj_t* a, \
       obj_t* w, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
     );

GENTPROT( dotxf )
// end bli_l1f_check.h

// Define kernel function types.
// begin bli_l1f_ft_ker.h

// The BLIS_L1F_FT_KER_H guard makes these typedefs expand at most once per
// translation unit.
#ifndef BLIS_L1F_FT_KER_H
#define BLIS_L1F_FT_KER_H

//
// -- Level-1f kernel function types -------------------------------------------
//

// The kernel types use the fixed suffix _ker (not EX_SUF), qualify every
// operand pointer with restrict, and always end with an explicit cntx_t*
// parameter instead of BLIS_TAPI_EX_PARAMS.

// axpy2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha1, \
       ctype* restrict alpha2, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict w, inc_t incw, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxaxpyf )

#endif
// end bli_l1f_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). 
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The expansion starts with a comma because it is appended directly after the
// last ordinary parameter of each prototype and typedef below.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )
// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//

// The typedefs below have the same parameter types, in the same order, as the
// prototypes above. Two parameter names differ between the two sets, which
// does not change the types:
// NOTE(review): axpy2v uses alphax/alphay in the prototype but alpha1/alpha2
// here, and dotaxpyv names its length n in the prototype but m here --
// consider unifying upstream.

// axpy2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )
// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h
// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Each GENTPROT below is a prototype template. BLIS_TAPI_EX_PARAMS was just
// defined with an empty expansion, so no trailing expert arguments are added
// to the parameter lists in this pass. INSERT_GENTPROT_BASIC0 is defined
// outside this section; presumably it instantiates GENTPROT once per
// supported datatype (TODO confirm).

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )
// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//

// Each GENTDEF below is a template for a function-pointer typedef whose name
// is pasted together from ch, opname, EX_SUF and tsuf. INSERT_GENTDEF is
// defined outside this section.

// axpy2v
// NOTE(review): the scalars are named alpha1/alpha2 here but alphax/alphay in
// the axpy2v prototype above. Parameter names in a function-pointer typedef
// do not affect the resulting type.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
// NOTE(review): the length parameter is named m here but n in the dotaxpyv
// prototype above; the position and type are the same.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )
// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1f_fpa.h

//
// Prototype function pointer query interface.
//

// For each operation, declares a query function that takes a num_t and whose
// return type is the name pasted from opname, BLIS_TAPI_EX_SUF and _vft.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( axpy2v )
GENPROT( axpyf )
GENPROT( dotaxpyv )
GENPROT( dotxaxpyf )
GENPROT( dotxf )
// end bli_l1f_fpa.h
// end bli_l1f.h

// -- Level-1m operations --
// begin bli_l1m.h
// begin bli_l1m_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The leading comma is intentional: the macro is appended directly after the
// last regular parameter in each prototype below.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h
// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// In this pass EX_SUF is BLIS_TAPI_EX_SUF and BLIS_TAPI_EX_PARAMS appends
// cntx/rntm, so these are the expert typed-API prototypes.

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type template in which x uses ctype_x while beta and y use
// ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
// end bli_l1m_tapi.h
// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//

// addm, subm, copym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym
// NOTE(review): xpbym_md is instantiated from this single-ctype template,
// whereas its prototype above uses separate ctype_x/ctype_y types.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )
// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h
// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL.
// The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Same templates as the other inclusion of bli_l1m_tapi.h in this header;
// the variant they produce depends on the EX_SUF and BLIS_TAPI_EX_PARAMS
// definitions in effect when this section is reached.

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type template in which x uses ctype_x while beta and y use
// ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
// end bli_l1m_tapi.h
// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//

// addm, subm, copym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym
// NOTE(review): xpbym_md is instantiated from this single-ctype template,
// whereas its prototype above uses separate ctype_x/ctype_y types.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )
// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1m_fpa.h

//
// Prototype function pointer query interface.
//

// For each operation, declares a query function that takes a num_t and whose
// return type is the name pasted from opname, BLIS_TAPI_EX_SUF and _vft.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
GENPROT( axpym )
GENPROT( scal2m )
GENPROT( scalm )
GENPROT( setm )
GENPROT( xpbym )

// Two-datatype query (_qfp2) taking one num_t for x and one for y.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );

GENPROT( xpbym_md )
// end bli_l1m_fpa.h

// Prototype level-1m implementations.
// begin bli_l1m_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( addm ) INSERT_GENTPROT_BASIC0( copym ) INSERT_GENTPROT_BASIC0( subm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( axpym ) INSERT_GENTPROT_BASIC0( scal2m ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( xpbym ) #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ void PASTEMAC3(chx,chy,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT2_BASIC0( xpbym_md ) INSERT_GENTPROT2_MIXDP0( xpbym_md ) // end bli_l1m_unb_var1.h // Pack-related // begin bli_packm.h // begin bli_packm_alloc.h BLIS_EXPORT_BLIS void* 
bli_packm_alloc ( siz_t size_needed, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void* bli_packm_alloc_ex ( siz_t size_needed, packbuf_t pack_buf_type, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_alloc.h // begin bli_packm_cntl.h struct packm_params_s { uint64_t size; // size field must be present and come first. bszid_t bmid_m; bszid_t bmid_n; bool does_invert_diag; bool rev_iter_if_upper; bool rev_iter_if_lower; pack_t pack_schema; packbuf_t pack_buf_type; }; typedef struct packm_params_s packm_params_t; BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m; } BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n; } BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower; } BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema; } BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type; } // ----------------------------------------------------------------------------- cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, bszid_t bmid_m, bszid_t bmid_n, bool does_invert_diag, bool rev_iter_if_upper, bool rev_iter_if_lower, pack_t pack_schema, packbuf_t pack_buf_type, cntl_t* sub_node ); // end bli_packm_cntl.h // begin 
// bli_packm_check.h

// Check routines for the packm init and int entry points (same operand list).
void bli_packm_init_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

void bli_packm_int_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );
// end bli_packm_check.h
// begin bli_packm_init.h

// NOTE(review): the meaning of the bool return value is not visible from
// this prototype; confirm against the implementation.
BLIS_EXPORT_BLIS bool bli_packm_init
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_init.h
// begin bli_packm_int.h

void bli_packm_int
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_int.h
// begin bli_packm_scalar.h

BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p );
// end bli_packm_scalar.h
// begin bli_packm_part.h

// -- Matrix partitioning ------------------------------------------------------

void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
                                  dim_t i,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
                                  dim_t j,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_tl2br( subpart_t requested_part,
                                    dim_t ij,
                                    dim_t b,
                                    obj_t* obj,
                                    obj_t* sub_obj );

dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p );
// end bli_packm_part.h
// begin bli_packm_struc_cxk.h

// Prototype template for the structure-aware packing routines. This list has
// no trailing "void* params" argument, unlike the _1er template further down.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_struc_cxk )
INSERT_GENTPROT_BASIC0( packm_herm_cxk )
INSERT_GENTPROT_BASIC0( packm_tri_cxk )
// end bli_packm_struc_cxk.h
// begin bli_packm_struc_cxk_1er.h

// 1er variants. GENTPROTCO also receives ctype_r and chr, which the template
// body does not use.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er )
// end bli_packm_struc_cxk_1er.h
// begin bli_packm_cxk.h

// Prototype template for packm_cxk.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_cxk )
// end bli_packm_cxk.h
// begin bli_packm_cxk_1er.h

// Prototype template for packm_cxk_1er; same parameter list as packm_cxk.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROTCO_BASIC0( packm_cxk_1er )
// end bli_packm_cxk_1er.h

// Mixed datatype support.
#ifdef BLIS_ENABLE_GEMM_MD
// begin bli_packm_struc_cxk_md.h

// Everything up to the matching #endif is only declared when
// BLIS_ENABLE_GEMM_MD is defined.

// Prototype template for the structure-aware, mixed-datatype packm function.
// Per the parameter types below, ctype_c is the type of the source matrix c
// and ctype_p is the type of kappa and of the packed destination p.
// NOTE(review): the INSERT_GENTPROT2_* macros that expand this template are
// defined outside this chunk; presumably one prototype per datatype pair.
#undef GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype_p* restrict kappa, \
       ctype_c* restrict c, inc_t incc, inc_t ldc, \
       ctype_p* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )

// Prototype template for the mixed-datatype 1e/1r packm helpers: an m x n
// source a of type ctype_a is packed into p of type ctype_p. Unlike the
// template above, these take no context or params argument.
#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t conja, \
       dim_t m, \
       dim_t n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p, inc_t ldp \
     );

INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )
INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )

// end bli_packm_struc_cxk_md.h
#endif

// begin bli_packm_blk_var1.h

//
// packm params types.
//

// Two-dimensional table of packm kernel function pointers. Both dimensions
// have BLIS_NUM_FP_TYPES entries; per the column comment below, the first
// index is the type of C and the second the type of P.
typedef struct
{
	// Type of C Type of P
	packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;

//
// Prototype object-based interfaces.
//

// Exported object-based entry point for packm block variant 1.
BLIS_EXPORT_BLIS void bli_packm_blk_var1
     (
       obj_t* c,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* t
     );
// end bli_packm_blk_var1.h

// end bli_packm.h
// begin bli_unpackm.h

// begin bli_unpackm_cntl.h

// Parameter block stored in an unpackm control-tree node.
struct unpackm_params_s
{
	uint64_t size; // size field must be present and come first.
	unpackm_var_oft var_func;
};
typedef struct unpackm_params_s unpackm_params_t;

// Accessor: treats (cntl)->params as an unpackm_params_t* and yields its
// var_func member.
#define bli_cntl_unpackm_params_var_func( cntl ) \
\
	( ( (unpackm_params_t*)(cntl)->params )->var_func )

// -----------------------------------------------------------------------------

// Creates an unpackm control-tree node.
// NOTE(review): allocation and ownership of the returned node are not visible
// from this prototype; check the implementation before relying on them.
cntl_t* bli_unpackm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       void_fp unpackm_var_func,
       cntl_t* sub_node
     );

// end bli_unpackm_cntl.h
// begin bli_unpackm_check.h

void bli_unpackm_int_check
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx
     );

// end bli_unpackm_check.h
// begin bli_unpackm_int.h

void bli_unpackm_int
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_unpackm_int.h
// begin bli_unpackm_blk_var1.h

// Object-based interface for unpackm block variant 1.
void bli_unpackm_blk_var1
     (
       obj_t* p,
       obj_t* c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// Typed prototype template for unpackm_blk_var1. The p and c buffers are
// passed as void* here rather than ctype*.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       doff_t diagoffc, \
       diag_t diagc, \
       uplo_t uploc, \
       trans_t transc, \
       dim_t m, \
       dim_t n, \
       dim_t m_panel, \
       dim_t n_panel, \
       void* p, inc_t rs_p, inc_t cs_p, \
       dim_t pd_p, inc_t ps_p, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )

// end bli_unpackm_blk_var1.h
// begin bli_unpackm_cxk.h

// Typed prototype template for unpackm_cxk: packed panel p is the input and
// a (with inca/lda strides) is the destination operand in the signature.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conjp, \
       dim_t panel_dim, \
       dim_t panel_len, \
       ctype* kappa, \
       ctype* p, inc_t ldp, \
       ctype* a, inc_t inca, inc_t lda, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_cxk )

// end bli_unpackm_cxk.h

// end bli_unpackm.h

// end bli_l1m.h

// -- Level-2 operations --

// begin bli_l2.h

// begin bli_l2_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// Typed level-2 prototypes. Each template names its function via
// PASTEMAC2(ch,opname,EX_SUF) and appends BLIS_TAPI_EX_PARAMS after the last
// regular parameter, so the result depends on the current definitions of
// EX_SUF and BLIS_TAPI_EX_PARAMS.
// NOTE(review): the INSERT_GENTPROT*_BASIC0 macros are defined outside this
// chunk; presumably they expand the template once per datatype.

// gemv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( gemv )

// ger
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( ger )

// hemv, symv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )

// her: the only template in this header whose alpha is ctype_r* rather than
// ctype*.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( her )

// syr
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( syr )

// her2, syr2
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )

// trmv, trsv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )

// end bli_l2_tapi.h
// begin bli_l2_ft.h

//
// -- Level-2 function types ---------------------------------------------------
//

// Function-pointer typedefs whose parameter lists mirror the typed prototypes
// above. Names are built with PASTECH3(ch,opname,EX_SUF,tsuf).

// gemv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( gemv )

// ger

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( ger )

// hemv, symv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( her )

// syr

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( syr )

// her2, syr2

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )

// end bli_l2_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h

// begin bli_l2_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Basic (non-expert) pass over the typed templates: EX_SUF and
// BLIS_TAPI_EX_PARAMS are both empty at this point (defined just above), so
// PASTEMAC2(ch,opname,EX_SUF) receives an empty suffix and no trailing
// cntx/rntm parameters are appended.

// gemv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( gemv )

// ger
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( ger )

// hemv, symv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( hemv )
INSERT_GENTPROT_BASIC0( symv )

// her: the only template in this header whose alpha is ctype_r* rather than
// ctype*.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( her )

// syr
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( syr )

// her2, syr2
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( her2 )
INSERT_GENTPROT_BASIC0( syr2 )

// trmv, trsv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( trmv )
INSERT_GENTPROT_BASIC0( trsv )

// end bli_l2_tapi.h
// begin bli_l2_ft.h

//
// -- Level-2 function types ---------------------------------------------------
//

// Function-pointer typedefs whose parameter lists mirror the typed prototypes
// above. Names are built with PASTECH3(ch,opname,EX_SUF,tsuf).

// gemv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       trans_t transa, \
       conj_t conjx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( gemv )

// ger

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( ger )

// hemv, symv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( hemv )
INSERT_GENTDEF( symv )

// her

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype_r* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( her )

// syr

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( syr )

// her2, syr2

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( her2 )
INSERT_GENTDEF( syr2 )

// trmv, trsv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( trmv )
INSERT_GENTDEF( trsv )

// end bli_l2_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
//
// Takes an original control tree (cntl_orig) and reports the tree to use
// through the cntl_use output pointer.
void bli_l3_cntl_create_if
  (
    opid_t family, pack_t schema_a, pack_t schema_b,
    obj_t* a, obj_t* b, obj_t* c,
    rntm_t* rntm, cntl_t* cntl_orig, cntl_t** cntl_use
  );
void bli_l3_cntl_free
  (
    rntm_t* rntm, cntl_t* cntl_use, thrinfo_t* thread
  );
// end bli_l3_cntl.h
// begin bli_l3_check.h
//
// Prototype object-based check functions.
//
// Each GENPROT( op ) below declares an "<op>_check" style function (the
// exact name is produced by PASTEMAC). Four operand layouts follow.
// Layout 1: alpha, a, b, beta, c.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
  ( \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* b, \
    obj_t* beta, \
    obj_t* c, \
    cntx_t* cntx \
  );
GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )
// Layout 2: as layout 1, with a leading side argument.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
  ( \
    side_t side, \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* b, \
    obj_t* beta, \
    obj_t* c, \
    cntx_t* cntx \
  );
GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )
// Layout 3: no b operand.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
  ( \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* beta, \
    obj_t* c, \
    cntx_t* cntx \
  );
GENPROT( herk )
GENPROT( syrk )
// Layout 4: side, alpha, a, b (no beta or c).
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
  ( \
    side_t side, \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* b, \
    cntx_t* cntx \
  );
GENPROT( trmm )
GENPROT( trsm )
// -----------------------------------------------------------------------------
// Explicitly named "basic" check prototypes.
void bli_gemm_basic_check
  (
    obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx
  );
void bli_gemmt_basic_check
  (
    obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx
  );
void bli_hemm_basic_check
  (
    side_t side,
    obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx
  );
void bli_herk_basic_check
  (
    obj_t* alpha, obj_t* a, obj_t* ah, obj_t* beta, obj_t* c, cntx_t* cntx
  );
void bli_her2k_basic_check
  (
    obj_t* alpha, obj_t* a, obj_t* bh, obj_t* b, obj_t* ah,
    obj_t* beta, obj_t* c, cntx_t* cntx
  );
void bli_l3_basic_check
  (
    obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx
  );
// end bli_l3_check.h
// begin bli_l3_int.h
void bli_l3_int
  (
    obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c,
    cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread
  );
// end bli_l3_int.h
// begin bli_l3_packab.h
void bli_l3_packa
  (
    obj_t* a, obj_t* b, obj_t* c,
    cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread
  );
void bli_l3_packb
  (
    obj_t* a, obj_t* b, obj_t* c,
    cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread
  );
// end bli_l3_packab.h
// Define function types.
//#include "bli_l3_ft_ex.h"
// begin bli_l3_ft_ukr.h
#ifndef BLIS_L3_FT_UKR_H
#define BLIS_L3_FT_UKR_H
//
// -- Level-3 micro-kernel function types --------------------------------------
//
// Each GENTDEF below defines a function-pointer typedef; the typedef name
// is assembled by PASTECH3 from ch, opname, "_ukr" and tsuf.
// gemm
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
  ( \
    dim_t m, \
    dim_t n, \
    dim_t k, \
    ctype* restrict alpha, \
    ctype* restrict a, \
    ctype* restrict b, \
    ctype* restrict beta, \
    ctype* restrict c, inc_t rs_c, inc_t cs_c, \
    auxinfo_t* restrict data, \
    cntx_t* restrict cntx \
  );
INSERT_GENTDEF( gemm )
// gemmtrsm_[lu]
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
  ( \
    dim_t m, \
    dim_t n, \
    dim_t k, \
    ctype* restrict alpha, \
    ctype* restrict a1x, \
    ctype* restrict a11, \
    ctype* restrict bx1, \
    ctype* restrict b11, \
    ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
    auxinfo_t* restrict data, \
    cntx_t* restrict cntx \
  );
INSERT_GENTDEF( gemmtrsm )
// trsm_[lu]
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
  ( \
    ctype* restrict a, \
    ctype* restrict b, \
    ctype* restrict c, inc_t rs_c, inc_t cs_c, \
    auxinfo_t* restrict data, \
    cntx_t* restrict cntx \
  );
INSERT_GENTDEF( trsm )
#endif
// end bli_l3_ft_ukr.h
// begin bli_l3_oft.h
#ifndef BLIS_L3_OFT_H
#define BLIS_L3_OFT_H
//
// -- Level-3 object function types --------------------------------------------
//
// Each GENTDEF( op ) below defines a function-pointer typedef whose name is
// assembled by PASTECH from op and "_oft".
// gemm
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
  ( \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* b, \
    obj_t* beta, \
    obj_t* c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );
GENTDEF( gemm )
GENTDEF( gemmt )
GENTDEF( her2k )
GENTDEF( syr2k )
// hemm, symm, trmm3
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
  ( \
    side_t side, \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* b, \
    obj_t* beta, \
    obj_t* c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );
GENTDEF( hemm )
GENTDEF( symm )
GENTDEF( trmm3 )
// herk, syrk
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
  ( \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* beta, \
    obj_t* c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );
GENTDEF( herk )
GENTDEF( syrk )
// trmm, trsm
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
  ( \
    side_t side, \
    obj_t* alpha, \
    obj_t* a, \
    obj_t* b, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );
GENTDEF( trmm )
GENTDEF( trsm )
#endif
// end bli_l3_oft.h
// begin bli_l3_oft_var.h
#ifndef BLIS_L3_OFT_VAR_H
#define BLIS_L3_OFT_VAR_H
//
// -- Level-3 variant function types -------------------------------------------
//
// Function-pointer typedef for a level-3 variant; instantiated once, for l3.
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
  ( \
    obj_t* a, \
    obj_t* b, \
    obj_t* c, \
    cntx_t* cntx, \
    rntm_t* rntm, \
    cntl_t* cntl, \
    thrinfo_t* thread \
  );
GENTDEF( l3 )
#endif
// end bli_l3_oft_var.h
// begin bli_l3_blocksize.h
dim_t bli_l3_determine_kc
  (
    dir_t direct, dim_t i, dim_t dim,
    obj_t* a, obj_t* b,
    bszid_t bszid, cntx_t* cntx, cntl_t* cntl
  );
// Per-operation determine_kc prototypes that take an explicit direction.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
  ( \
    dir_t direct, \
    dim_t i, \
    dim_t dim, \
    obj_t* a, \
    obj_t* b, \
    bszid_t bszid, \
    cntx_t* cntx \
  );
GENPROT( gemm_determine_kc )
GENPROT( gemmt_determine_kc )
GENPROT( trmm_determine_kc )
GENPROT( trsm_determine_kc )
// The _f / _b suffixed prototypes take no direction argument.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
  ( \
    dim_t i, \
    dim_t dim, \
    obj_t* a, \
    obj_t* b, \
    bszid_t bszid, \
    cntx_t* cntx \
  );
GENPROT( gemm_determine_kc_f )
GENPROT( gemm_determine_kc_b )
GENPROT( gemmt_determine_kc_f )
GENPROT( gemmt_determine_kc_b )
GENPROT( trmm_determine_kc_f )
GENPROT( trmm_determine_kc_b )
GENPROT( trsm_determine_kc_f )
GENPROT( trsm_determine_kc_b )
// end bli_l3_blocksize.h
// begin bli_l3_direct.h
dir_t bli_l3_direct
  (
    obj_t* a, obj_t* b, obj_t* c, cntl_t* cntl
  );
// -----------------------------------------------------------------------------
// Per-operation direction queries; unlike bli_l3_direct they take no cntl.
#undef GENPROT
#define GENPROT( opname ) \
\
dir_t PASTEMAC0(opname) \
  ( \
    obj_t* a, \
    obj_t* b, \
    obj_t* c \
  );
GENPROT( gemm_direct )
GENPROT( gemmt_direct )
GENPROT( trmm_direct )
GENPROT( trsm_direct )
// end bli_l3_direct.h
// begin bli_l3_prune.h
// One l3_prune_unref_mparts_<dim> prototype per dimension (m, n, k).
#undef GENPROT
#define GENPROT( dim ) \
\
void PASTEMAC(l3_prune_unref_mparts_,dim) \
  ( \
    obj_t* a, \
    obj_t* b, \
    obj_t* c, \
    cntl_t* cntl \
  );
GENPROT( m )
GENPROT( n )
GENPROT( k )
// -----------------------------------------------------------------------------
// One <op>_prune_unref_mparts_<dim> prototype per operation and dimension.
#undef GENPROT
#define GENPROT( opname, dim ) \
\
void PASTEMAC2(opname,_prune_unref_mparts_,dim) \
  ( \
    obj_t* a, \
    obj_t* b, \
    obj_t* c \
  );
GENPROT( gemm, m )
GENPROT( gemm, n )
GENPROT( gemm, k )
GENPROT( gemmt, m )
GENPROT( gemmt, n )
GENPROT( gemmt, k )
GENPROT( trmm, m )
GENPROT( trmm, n )
GENPROT( trmm, k )
GENPROT( trsm, m )
GENPROT( trsm, n )
GENPROT( trsm, k )
// end bli_l3_prune.h
// begin bli_l3_schema.h
void bli_l3_set_schemas
  (
    obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx
  );
// end bli_l3_schema.h
// Prototype object APIs (basic and expert).
// begin bli_l3_oapi.h
//
// Prototype object-based interfaces (basic).
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h
//
// Prototype BLAS-like interfaces with typed operands (basic).
//
// Each template below is instantiated by the INSERT_* line(s) that follow
// it. Every matrix operand is passed as a pointer followed by two inc_t
// values named rs_* and cs_*.
// gemm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    trans_t transa, \
    trans_t transb, \
    dim_t m, \
    dim_t n, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c \
  );
INSERT_GENTPROT_BASIC0( gemm )
// hemm, symm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    side_t side, \
    uplo_t uploa, \
    conj_t conja, \
    trans_t transb, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c \
  );
INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )
// herk: alpha and beta use ctype_r rather than ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    dim_t m, \
    dim_t k, \
    ctype_r* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype_r* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c \
  );
INSERT_GENTPROTR_BASIC0( herk )
// her2k: only beta uses ctype_r.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    trans_t transb, \
    dim_t m, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype_r* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c \
  );
INSERT_GENTPROTR_BASIC0( her2k )
// syrk
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    dim_t m, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c \
  );
INSERT_GENTPROT_BASIC0( syrk )
// gemmt, syr2k
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    trans_t transb, \
    dim_t m, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c \
  );
INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )
// trmm3
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    side_t side, \
    uplo_t uploa, \
    trans_t transa, \
    diag_t diaga, \
    trans_t transb, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c \
  );
INSERT_GENTPROT_BASIC0( trmm3 )
// trmm, trsm: no beta and no separate c operand.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
  ( \
    side_t side, \
    uplo_t uploa, \
    trans_t transa, \
    diag_t diaga, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b \
  );
INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )
// end bli_l3_tapi.h
// begin bli_l3_tapi_ex.h
//
// Prototype BLAS-like interfaces with typed operands (expert).
//
// Expert typed API: every template below ends with cntx and rntm arguments,
// and the generated name carries the BLIS_TAPI_EX_SUF suffix (defined
// elsewhere).
// gemm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    trans_t transa, \
    trans_t transb, \
    dim_t m, \
    dim_t n, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );
INSERT_GENTPROT_BASIC0( gemm )
// hemm, symm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    side_t side, \
    uplo_t uploa, \
    conj_t conja, \
    trans_t transb, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );
INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )
// herk: alpha and beta use ctype_r rather than ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    dim_t m, \
    dim_t k, \
    ctype_r* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype_r* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );
INSERT_GENTPROTR_BASIC0( herk )
// her2k: only beta uses ctype_r.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    trans_t transb, \
    dim_t m, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype_r* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );
INSERT_GENTPROTR_BASIC0( her2k )
// syrk
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    dim_t m, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );
INSERT_GENTPROT_BASIC0( syrk )
// gemmt, syr2k
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    uplo_t uploc, \
    trans_t transa, \
    trans_t transb, \
    dim_t m, \
    dim_t k, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );
INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )
// trmm3
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    side_t side, \
    uplo_t uploa, \
    trans_t transa, \
    diag_t diaga, \
    trans_t transb, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    ctype* beta, \
    ctype* c, inc_t rs_c, inc_t cs_c, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );
INSERT_GENTPROT_BASIC0( trmm3 )
// trmm, trsm: no beta and no separate c operand.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
  ( \
    side_t side, \
    uplo_t uploa, \
    trans_t transa, \
    diag_t diaga, \
    dim_t m, \
    dim_t n, \
    ctype* alpha, \
    ctype* a, inc_t rs_a, inc_t cs_a, \
    ctype* b, inc_t rs_b, inc_t cs_b, \
    cntx_t* cntx, \
    rntm_t* rntm \
  );
INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )
// end bli_l3_tapi_ex.h
// Define function types for small/unpacked handlers/kernels.
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
// Each GENTPROT alias below is instantiated through INSERT_GENTPROT_BASIC0
// for the given micro-kernel name macro.
// NOTE(review): only gemm_ukr_name is redefined immediately before this
// template; the gemmtrsm_*/trsm_* name macros keep whatever definition (if
// any) they had earlier in the header -- confirm that this is intended.

#undef GENTPROT
#define GENTPROT GEMM_UKR_PROT

INSERT_GENTPROT_BASIC0( gemm_ukr_name )


#undef GENTPROT
#define GENTPROT GEMMTRSM_UKR_PROT

INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name )
INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name )


#undef GENTPROT
#define GENTPROT TRSM_UKR_PROT

INSERT_GENTPROT_BASIC0( trsm_l_ukr_name )
INSERT_GENTPROT_BASIC0( trsm_u_ukr_name )

// end bli_l3_ukr.h

// end bli_gemm_md_c2r_ref.h

// Define a local struct type that makes returning two values easier.
typedef struct mddm_s
{
	dom_t comp;
	dom_t exec;
} mddm_t;

void bli_gemm_md
     (
       obj_t*   a,
       obj_t*   b,
       obj_t*   beta,
       obj_t*   c,
       cntx_t*  cntx_local,
       cntx_t** cntx
     );

// One function per three-letter combination of 'c' and 'r', each returning
// an mddm_t. The letters presumably give the domains (complex/real) of C, A
// and B in that order, as in the bli_gemm_md_is_*() predicates further
// below -- TODO confirm against the definitions.
mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );

// -----------------------------------------------------------------------------

// Both prototypes below share the parameter list of bli_gemm_front().

void bli_gemm_md_front
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

void bli_gemm_md_zgemm
     (
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// Returns TRUE when c is complex, a and b are both real, and the execution
// domain of c is BLIS_REAL; returns FALSE otherwise.
BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c )
{
	bool r_val = FALSE;

	// NOTE: The last conditional subexpression is necessary if/when we
	// allow the user to specify the computation domain. (The computation
	// domain is currently ignored, but once it is honored as a user-
	// settable value, it will affect the execution domain, which is what
	// is checked below. Until then, the last expression is not actually
	// necessary since crr is already unconditionally associated with an
	// execution domain of BLIS_REAL.)
	if ( bli_obj_is_complex( c ) &&
	     bli_obj_is_real( a )    &&
	     bli_obj_is_real( b )    &&
	     bli_obj_exec_domain( c ) == BLIS_REAL )
		r_val = TRUE;

	return r_val;
}

// Returns TRUE when c and a are complex, b is real, and the execution domain
// of c is BLIS_COMPLEX; returns FALSE otherwise.
BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c )
{
	bool r_val = FALSE;

	// NOTE: The last conditional subexpression is necessary if/when we
	// allow the user to specify the computation domain. (The computation
	// domain is currently ignored, but once it is honored as a user-
	// settable value, it will affect the execution domain, which is what
	// is checked below. Until then, the last expression is not actually
	// necessary since ccr is already unconditionally associated with an
	// execution domain of BLIS_COMPLEX.)
	if ( bli_obj_is_complex( c ) &&
	     bli_obj_is_complex( a ) &&
	     bli_obj_is_real( b )    &&
	     bli_obj_exec_domain( c ) == BLIS_COMPLEX )
		r_val = TRUE;

	return r_val;
}

// Returns TRUE when c and b are complex, a is real, and the execution domain
// of c is BLIS_COMPLEX; returns FALSE otherwise.
BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c )
{
	bool r_val = FALSE;

	// NOTE: The last conditional subexpression is necessary if/when we
	// allow the user to specify the computation domain. (The computation
	// domain is currently ignored, but once it is honored as a user-
	// settable value, it will affect the execution domain, which is what
	// is checked below. Until then, the last expression is not actually
	// necessary since crc is already unconditionally associated with an
	// execution domain of BLIS_COMPLEX.)
	if ( bli_obj_is_complex( c ) &&
	     bli_obj_is_real( a )    &&
	     bli_obj_is_complex( b ) &&
	     bli_obj_exec_domain( c ) == BLIS_COMPLEX )
		r_val = TRUE;

	return r_val;
}

// -----------------------------------------------------------------------------

// Adjusts, in place, the values passed by pointer according to the domains
// of *dt_c, dt_a and dt_b. Summary of the live branches:
//   - C real, A complex, B complex: *k, *ps_a and *ps_b are doubled.
//   - C complex, A real, B complex: if the scalar detached from c has a zero
//     imaginary part, C is row-stored, and bli_obj_prec( c ) equals
//     bli_obj_comp_prec( c ), then *dt_comp and *dt_c are projected to real
//     and *n, *pd_b, *ps_b and *rs_c are doubled; otherwise *ps_a is halved.
//   - C complex, A complex, B real: the mirror image, requiring C to be
//     column-stored; *m, *pd_a, *ps_a and *cs_c are doubled, or else *ps_b
//     is halved.
//   - Any other combination leaves every argument unchanged.
BLIS_INLINE void bli_gemm_md_ker_var2_recast
     (
       num_t* dt_comp,
       num_t  dt_a,
       num_t  dt_b,
       num_t* dt_c,
       dim_t* m,
       dim_t* n,
       dim_t* k,
       inc_t* pd_a, inc_t* ps_a,
       inc_t* pd_b, inc_t* ps_b,
       obj_t* c,
       inc_t* rs_c, inc_t* cs_c
     )
{
	if      ( bli_is_real( *dt_c )    &&
	          bli_is_complex( dt_a )  &&
	          bli_is_complex( dt_b )  )
	{
		// The rcc case is executed with a real macrokernel, so we need to
		// double the k dimension (because both A and B are packed to the 1r
		// schema), and also the panel strides of A and B since they were
		// packed as complex matrices and we now need to convert them to
		// units of real elements.
		*k *= 2;
		*ps_a *= 2;
		*ps_b *= 2;
	}
	else if ( bli_is_complex( *dt_c ) &&
	          bli_is_real( dt_a )     &&
	          bli_is_complex( dt_b )  )
	{
		// NOTE: The "#if 1" below spans the local declaration, the fast-path
		// test and the dangling "else". Were it switched to "#if 0", only the
		// brace-enclosed fallback after the "#endif" would remain.
#if 1
		obj_t beta;

		bli_obj_scalar_detach( c, &beta );

		if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&
		     bli_obj_imag_is_zero( &beta ) &&
		     bli_is_row_stored( *rs_c, *cs_c ) &&
		     bli_obj_prec( c ) == bli_obj_comp_prec( c ) )
		{
			// If beta is real (zero imaginary part), and C is row-stored,
			// and the computation precision is equal to the storage
			// precision of C, we can use the real macrokernel (and real
			// microkernel, which is already stored to the real virtual
			// microkernel slots of the context) instead of the complex
			// macrokernel and c2r virtual microkernel.
			*dt_comp = bli_dt_proj_to_real( *dt_comp );
			*dt_c    = bli_dt_proj_to_real( *dt_c );
			*n *= 2;
			*pd_b *= 2; *ps_b *= 2;
			*rs_c *= 2;
		}
		else
#endif
		{
			// Generally speaking, the crc case is executed with a complex
			// macrokernel, so we need to halve the panel stride of A (which
			// is real) since the macrokernel will perform the pointer
			// arithmetic in units of complex elements.
			*ps_a /= 2;
		}
	}
	else if ( bli_is_complex( *dt_c ) &&
	          bli_is_complex( dt_a )  &&
	          bli_is_real( dt_b )     )
	{
		// NOTE: Same "#if 1 ... else #endif { ... }" layout as in the crc
		// branch above.
#if 1
		obj_t beta;

		bli_obj_scalar_detach( c, &beta );

		if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&
		     bli_obj_imag_is_zero( &beta ) &&
		     bli_is_col_stored( *rs_c, *cs_c ) &&
		     bli_obj_prec( c ) == bli_obj_comp_prec( c ) )
		{
			// If beta is real (zero imaginary part), and C is column-stored,
			// and the computation precision is equal to the storage
			// precision of C, we can use the real macrokernel (and real
			// microkernel, which is already stored to the real virtual
			// microkernel slots of the context) instead of the complex
			// macrokernel and c2r virtual microkernel.
			*dt_comp = bli_dt_proj_to_real( *dt_comp );
			*dt_c    = bli_dt_proj_to_real( *dt_c );
			*m *= 2;
			*pd_a *= 2; *ps_a *= 2;
			*cs_c *= 2;
		}
		else
#endif
		{
			// Generally speaking, the ccr case is executed with a complex
			// macrokernel, so we need to halve the panel stride of B (which
			// is real) since the macrokernel will perform the pointer
			// arithmetic in units of complex elements.
			*ps_b /= 2;
		}
	}
	// NOTE: The branches below are compiled out and perform no action. They
	// pass dt_c (a num_t*) to bli_is_real()/bli_is_complex(), whereas the live
	// branches above pass *dt_c; that would need fixing before enabling them.
#if 0
	else if ( bli_is_real( dt_c ) &&
	          bli_is_real( dt_a ) &&
	          bli_is_real( dt_b ) )
	{
		// No action needed.
//printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k );
	}
	else if ( bli_is_complex( dt_c ) &&
	          bli_is_real( dt_a )    &&
	          bli_is_real( dt_b )    )
	{
		// No action needed.
	}
	else if ( bli_is_real( dt_c )    &&
	          bli_is_complex( dt_a ) &&
	          bli_is_real( dt_b )    )
	{
		// No action needed.
	}
	else if ( bli_is_real( dt_c )    &&
	          bli_is_real( dt_a )    &&
	          bli_is_complex( dt_b ) )
	{
		// No action needed.
	}
#endif
}

// end bli_gemm_md.h
#endif

// end bli_gemm.h
// begin bli_hemm.h


// begin bli_hemm_front.h


// The hemm and symm prototypes below share one parameter list: that of
// bli_gemm_front() with a leading side_t argument.

void bli_hemm_front
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );
// end bli_hemm_front.h

// end bli_hemm.h
// begin bli_symm.h


// begin bli_symm_front.h


void bli_symm_front
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       obj_t*  beta,
       obj_t*  c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );
// end bli_symm_front.h

// end bli_symm.h
// begin bli_trmm.h


// begin bli_trmm_front.h


// Unlike the hemm/symm prototypes, this one has no beta or c parameter.
void bli_trmm_front
     (
       side_t  side,
       obj_t*  alpha,
       obj_t*  a,
       obj_t*  b,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// end bli_trmm_front.h

// begin bli_trmm_var.h



//
// Prototype object-based interfaces.
//

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t*  a, \
       obj_t*  b, \
       obj_t*  c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       cntl_t* cntl, \
       thrinfo_t* thread  \
     );

// The blocked-variant prototypes are commented out; only the *_ker_var2
// prototypes are generated for trmm.
//GENPROT( trmm_blk_var1 )
//GENPROT( trmm_blk_var2 )
//GENPROT( trmm_blk_var3 )

GENPROT( trmm_xx_ker_var2 )

GENPROT( trmm_ll_ker_var2 )
GENPROT( trmm_lu_ker_var2 )
GENPROT( trmm_rl_ker_var2 )
GENPROT( trmm_ru_ker_var2 )


//
// Prototype BLAS-like interfaces with void pointer operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
//

// Template for the gemmt macrokernel prototypes. Compared with the trmm and
// trsm templates, it adds the is_a and is_b stride parameters and names the
// diagonal offset diagoffc.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffc, \
       pack_t  schema_a, \
       pack_t  schema_b, \
       dim_t   m, \
       dim_t   n, \
       dim_t   k, \
       void*   alpha, \
       void*   a, inc_t cs_a, inc_t is_a, \
                  dim_t pd_a, inc_t ps_a, \
       void*   b, inc_t rs_b, inc_t is_b, \
                  dim_t pd_b, inc_t ps_b, \
       void*   beta, \
       void*   c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread  \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

// end bli_gemmt_var.h

// end bli_gemmt.h

// end bli_l3.h

// -- Utility operations --

// begin bli_util.h


// begin bli_util_check.h



//
// Prototype object-based check functions.
//

// Each GENPROT( op ) below declares a function whose name is formed by
// PASTEMAC from op and the suffix _check.

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  asum  \
     );

GENPROT( asumv )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x  \
     );

GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm  \
     );

GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )


// This template is identical to the previous one; it is redefined for the
// matrix norms.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  norm  \
     );

GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x  \
     );

GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  scale, \
       obj_t*  sumsq  \
     );

GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// NOTE: Unlike its neighbours, the eqsc check prototype is generated through
// a one-argument GENTPROT rather than GENPROT. This replaces the three-
// argument GENTPROT defined above until GENTPROT is next redefined.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  chi, \
       obj_t*  psi, \
       bool*   is_eq  \
     );

GENTPROT( eqsc )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqv )
GENPROT( eqm )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       FILE*  file, \
       char*  s1, \
       obj_t* x, \
       char*  format, \
       char*  s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// -----------------------------------------------------------------------------

// Explicitly declared helper check functions. NOTE(review): these are
// presumably the shared checks called by the generated *_check functions
// above -- confirm against the definitions.

void bli_utilv_xi_check
     (
       obj_t*  x,
       obj_t*  index
     );

void bli_utilv_xa_check
     (
       obj_t*  x,
       obj_t*  asum
     );

void bli_utilm_mkhst_check
     (
       obj_t*  a
     );

void bli_utilv_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_norm_check
     (
       obj_t*  x,
       obj_t*  norm
     );

void bli_utilm_fprint_check
     (
       FILE*  file,
       char*  s1,
       obj_t* x,
       char*  format,
       char*  s2
     );

void bli_utilm_rand_check
     (
       obj_t*  x
     );

void bli_utilv_sumsqv_check
     (
       obj_t*  x,
       obj_t*  scale,
       obj_t*  sumsq
     );

// end bli_util_check.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: The expansion begins with a comma, so it is meant to be placed
// directly after the last regular parameter.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_util_oapi.h



//
// Prototype object-based interfaces.
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// These prototypes use PASTEMAC0 with neither EX_SUF nor BLIS_OAPI_EX_PARAMS,
// and are emitted only while BLIS_OAPI_BASIC is defined.
// NOTE(review): in this first copy of bli_util_oapi.h, BLIS_OAPI_BASIC is
// presumably not defined, so the guarded section would be skipped -- confirm
// against the code preceding the bli_oapi_ex.h section.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )


#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*   file, \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )


// Same as the fprint* template above, minus the leading FILE* parameter.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
// NOTE: EX_SUF is defined with an empty expansion here.
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// These prototypes use PASTEMAC0 with neither EX_SUF nor BLIS_OAPI_EX_PARAMS.
// BLIS_OAPI_BASIC was defined by the preceding bli_oapi_ba.h section, so the
// guarded section is active in this copy of bli_util_oapi.h.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       bool*   is_eq  \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )


#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE*   file, \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( fprintv )
GENPROT( fprintm )


// Same as the fprint* template above, minus the leading FILE* parameter.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char*   s1, \
       obj_t*  x, \
       char*   format, \
       char*   s2  \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h


// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: The expansion begins with a comma, so it is meant to be placed
// directly after the last regular parameter.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_util_tapi.h



//
// Prototype BLAS-like interfaces with typed operands.
//

// This copy of bli_util_tapi.h follows the bli_tapi_ex.h macros, so in each
// template below EX_SUF expands to BLIS_TAPI_EX_SUF and BLIS_TAPI_EX_PARAMS
// appends ",cntx_t* cntx, rntm_t* rntm" to the parameter list. The GENTPROTR
// templates take an extra type parameter, ctype_r, used for the output
// pointers.

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( asumv )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( mkherm )
INSERT_GENTPROT_BASIC0( mksymm )
INSERT_GENTPROT_BASIC0( mktrim )


#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( norm1v )
INSERT_GENTPROTR_BASIC0( normfv )
INSERT_GENTPROTR_BASIC0( normiv )


#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( norm1m )
INSERT_GENTPROTR_BASIC0( normfm )
INSERT_GENTPROTR_BASIC0( normim )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( randv )
INSERT_GENTPROT_BASIC0( randnv )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROT_BASIC0( randm )
INSERT_GENTPROT_BASIC0( randnm )


#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.

// These prototypes use PASTEMAC(ch,opname) with neither EX_SUF nor
// BLIS_TAPI_EX_PARAMS, and are emitted only while BLIS_TAPI_BASIC is defined.
// The bli_xapi_undef.h section preceding bli_tapi_ex.h un-defines
// BLIS_TAPI_BASIC, so the guarded section is skipped in this copy.
#ifdef BLIS_TAPI_BASIC

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqsc )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqv )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqm )


// NOTE: printv and printm take x as void* and are instantiated with
// INSERT_GENTPROT_BASIC0_I rather than INSERT_GENTPROT_BASIC0.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  n, \
       void*  x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printv )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       void*  x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h



//
// -- Utility function types ---------------------------------------------------
//

// Each template below typedefs a function-pointer type whose name is pasted
// together from ch, opname, EX_SUF and tsuf. The parameter lists mirror the
// corresponding typed prototypes above.

// asumv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// NOTE: The fprintv and fprintm types include EX_SUF in their names but do
// not append BLIS_TAPI_EX_PARAMS to their parameter lists.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.

// These types use PASTECH2(ch,opname,tsuf), i.e. without EX_SUF, and are
// only defined while BLIS_TAPI_BASIC is defined.
#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h
// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 
INSERT_GENTPROTR_BASIC0( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. #ifdef BLIS_TAPI_BASIC #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqsc ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ char* s1, \ dim_t n, \ void* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ char* s1, \ dim_t m, \ dim_t n, \ void* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printm ) #endif // #ifdef BLIS_TAPI_BASIC // end bli_util_tapi.h // begin bli_util_ft.h // // -- Utility function types --------------------------------------------------- // // asumv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( asumv ) // mkherm, mksymm, mktrim #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \
BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( mkherm ) INSERT_GENTDEF( mksymm ) INSERT_GENTDEF( mktrim ) // norm1v, normfv, normiv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( norm1v ) INSERT_GENTDEFR( normfv ) INSERT_GENTDEFR( normiv ) // norm1m, normfm, normim #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( norm1m ) INSERT_GENTDEFR( normfm ) INSERT_GENTDEFR( normim ) // fprintv (signature carries no BLIS_TAPI_EX_PARAMS, unlike the typedefs around it) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ FILE* file, \ char* s1, \ dim_t n, \ ctype* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTDEF( fprintv ) // fprintm (signature carries no BLIS_TAPI_EX_PARAMS, unlike the typedefs around it) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ FILE* file, \ char* s1, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTDEF( fprintm ) // randv, randnv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( randv ) INSERT_GENTDEF( randnv ) // randm, randnm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( randm ) INSERT_GENTDEF( randnm ) // sumsqv #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n,
\ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( sumsqv ) // ----------------------------------------------------------------------------- // Operations with only basic interfaces. #ifdef BLIS_TAPI_BASIC // eqsc #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjchi, \ ctype* chi, \ ctype* psi, \ bool* is_eq \ ); INSERT_GENTDEF( eqsc ) // eqv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ bool* is_eq \ ); INSERT_GENTDEF( eqv ) // eqm #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ bool* is_eq \ ); INSERT_GENTDEF( eqm ) #endif // #ifdef BLIS_TAPI_BASIC // end bli_util_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_util_fpa.h // // Prototype function pointer query interface.
// #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( asumv ) GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) GENPROT( randv ) GENPROT( randnv ) GENPROT( randm ) GENPROT( randnm ) GENPROT( sumsqv ) // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ PASTECH(opname,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( eqsc ) GENPROT( eqv ) GENPROT( eqm ) GENPROT( fprintv ) GENPROT( fprintm ) //GENPROT( printv ) //GENPROT( printm ) // end bli_util_fpa.h // Prototype utility operation implementations. // begin bli_util_unb_var1.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( asumv_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( mkherm_unb_var1 ) INSERT_GENTPROT_BASIC0( mksymm_unb_var1 ) INSERT_GENTPROT_BASIC0( mktrim_unb_var1 ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 ) INSERT_GENTPROTR_BASIC0( normfv_unb_var1 ) INSERT_GENTPROTR_BASIC0( normiv_unb_var1 ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm, \ cntx_t* cntx, \ rntm_t*
rntm \ ); INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 ) INSERT_GENTPROTR_BASIC0( normfm_unb_var1 ) INSERT_GENTPROTR_BASIC0( normim_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( randv_unb_var1 ) INSERT_GENTPROT_BASIC0( randnv_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( randm_unb_var1 ) INSERT_GENTPROT_BASIC0( randnm_unb_var1 ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 ) // ----------------------------------------------------------------------------- #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ bool PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ ); INSERT_GENTPROT_BASIC0( eqv_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ bool PASTEMAC(ch,varname) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ ); INSERT_GENTPROT_BASIC0( eqm_unb_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ FILE* file, \ char* s1, \ dim_t n, \ ctype* x, inc_t incx, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I( fprintv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ FILE* file, \ char* s1, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ char* format, \ char* s2 \ ); INSERT_GENTPROT_BASIC0_I(
fprintm ) // end bli_util_unb_var1.h // end bli_util.h // -- addon definitions -- // NOTE: These definitions should not be included much earlier since an addon // may wish to utilize other types and definitions provided by BLIS. // begin bli_addon.h #ifndef BLIS_ADDON_H #define BLIS_ADDON_H #if 0 #define BLIS_ENABLE_ADDONS #else #define BLIS_DISABLE_ADDONS #endif // Enabled addons #endif // end bli_addon.h // -- sandbox implementation -- // begin bli_sbox.h #ifndef BLIS_SBOX_H #define BLIS_SBOX_H // Each sandbox must have a bli_sandbox.h file present somewhere inside. // If a sandbox was enabled at configure-time, we need to #include its // header file here so that it will get pulled into blis.h when it is // flattened into a monolithic header. #ifdef BLIS_ENABLE_SANDBOX #include "bli_sandbox.h" // skipped #endif #endif // end bli_sbox.h // -- BLAS compatibility layer -- // begin bli_blas.h // If the CBLAS compatibility layer was enabled while the BLAS layer // was not enabled, we must enable it here. #ifdef BLIS_ENABLE_CBLAS #ifndef BLIS_ENABLE_BLAS #define BLIS_ENABLE_BLAS #endif #endif // BLIS_ENABLE_CBLAS // By default, if the BLAS compatibility layer is enabled, we define // (include) all of the BLAS prototypes. However, if the user is // #including "blis.h" and also #including another header that also // declares the BLAS functions, then we provide an opportunity to // #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below). #ifdef BLIS_ENABLE_BLAS #define BLIS_ENABLE_BLAS_DEFS #else #undef BLIS_ENABLE_BLAS_DEFS #endif // Skip prototyping all of the BLAS if the BLAS test drivers are being // compiled. #ifdef BLIS_VIA_BLASTEST #undef BLIS_ENABLE_BLAS_DEFS #endif // Skip prototyping all of the BLAS if the environment has defined the // macro BLIS_DISABLE_BLAS_DEFS. #ifdef BLIS_DISABLE_BLAS_DEFS #undef BLIS_ENABLE_BLAS_DEFS #endif // Begin including all BLAS prototypes.
#ifdef BLIS_ENABLE_BLAS_DEFS // -- System headers needed by BLAS compatibility layer -- #include // skipped // -- Constants -- #define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1) // -- Utility macros -- // begin bla_r_sign.h #ifdef BLIS_ENABLE_BLAS double bla_r_sign(const bla_real *a, const bla_real *b); #endif // end bla_r_sign.h // begin bla_d_sign.h #ifdef BLIS_ENABLE_BLAS double bla_d_sign(const bla_double *a, const bla_double *b); #endif // end bla_d_sign.h // begin bla_r_cnjg.h #ifdef BLIS_ENABLE_BLAS void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src); #endif // end bla_r_cnjg.h // begin bla_d_cnjg.h #ifdef BLIS_ENABLE_BLAS void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src); #endif // end bla_d_cnjg.h // begin bla_r_imag.h #ifdef BLIS_ENABLE_BLAS bla_real bla_r_imag(const bla_scomplex *z); #endif // end bla_r_imag.h // begin bla_d_imag.h #ifdef BLIS_ENABLE_BLAS double bla_d_imag(const bla_dcomplex *z); #endif // end bla_d_imag.h // begin bla_c_div.h #ifdef BLIS_ENABLE_BLAS void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp); #endif // end bla_c_div.h // begin bla_z_div.h #ifdef BLIS_ENABLE_BLAS void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp); #endif // end bla_z_div.h // begin bla_f__cabs.h #ifdef BLIS_ENABLE_BLAS double bla_f__cabs(double real, double imag); #endif // end bla_f__cabs.h // begin bla_r_abs.h #ifdef BLIS_ENABLE_BLAS double bla_r_abs(const bla_real *x); #endif // end bla_r_abs.h // begin bla_d_abs.h #ifdef BLIS_ENABLE_BLAS double bla_d_abs(const bla_double *x); #endif // end bla_d_abs.h // begin bla_c_abs.h #ifdef BLIS_ENABLE_BLAS double bla_c_abs(const bla_scomplex *z); #endif // end bla_c_abs.h // begin bla_z_abs.h #ifdef BLIS_ENABLE_BLAS double bla_z_abs(const bla_dcomplex *z); #endif // end bla_z_abs.h // begin bla_lsame.h #ifdef BLIS_ENABLE_BLAS #ifdef LAPACK_ILP64 long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len); #else BLIS_EXPORT_BLAS 
int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len); #endif #endif // end bla_lsame.h // begin bla_xerbla.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len); #endif // end bla_xerbla.h // begin bla_xerbla_array.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info); #endif // end bla_xerbla_array.h // -- Level-0 BLAS prototypes -- // begin bla_cabs1.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS bla_real PASTEF77(s,cabs1)(bla_scomplex *z); BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z); #endif // end bla_cabs1.h // -- Level-1 BLAS prototypes -- // begin bla_amax.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype_x, chx, blasname ) \ \ BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( amax ) #endif // end bla_amax.h // begin bla_asum.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( asum ) #endif // end bla_asum.h // begin bla_axpy.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( axpy ) #endif // end bla_axpy.h // begin bla_copy.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( copy ) #endif // end bla_copy.h // begin bla_dot.h #ifdef BLIS_ENABLE_BLAS // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTR_BLAS( dot ) #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL INSERT_GENTPROTDOTC_BLAS( dot ) #else // For the "intel" complex return type, we use a hidden parameter (passed by // address) to return the result. #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \ ( \ ftype* rhop, \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTC_BLAS( dot ) #endif // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS float PASTEF77(sd,sdot) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); BLIS_EXPORT_BLAS double PASTEF77(d,sdot) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); #endif // end bla_dot.h // begin bla_nrm2.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end bla_nrm2.h // begin bla_rot.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s); #endif // end bla_rot.h // begin bla_rotg.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s); BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s); #endif // end bla_rotg.h // begin bla_rotm.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam); BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam); #endif // end bla_rotm.h // begin bla_rotmg.h 
#ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam); BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam); #endif // end bla_rotmg.h // begin bla_scal.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTSCAL #define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \ ( \ const f77_int* n, \ const ftype_a* alpha, \ ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTSCAL_BLAS( scal ) #endif // end bla_scal.h // begin bla_swap.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( swap ) #endif // end bla_swap.h // begin f77_amax_sub.h // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROT #define GENTPROT( ftype_x, chx, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ f77_int* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROT_BLAS( amax ) #endif // end f77_amax_sub.h // begin f77_asum_sub.h // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTR2_BLAS( asum ) #endif // end f77_asum_sub.h // begin f77_dot_sub.h // // Prototype CBLAS subroutine wrapper interfaces. 
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTDOT_BLAS( dot ) // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval ); BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* rval ); #endif // end f77_dot_sub.h // begin f77_nrm2_sub.h // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end f77_nrm2_sub.h // -- Level-2 BLAS prototypes -- // dense // begin bla_gemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemv ) #endif // end bla_gemv.h // begin bla_ger.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, chxy, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \ ( \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTDOT_BLAS( ger ) #endif // end bla_ger.h // begin bla_hemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemv ) #endif // end bla_hemv.h // begin bla_her.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype_r* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her ) #endif // end bla_her.h // begin bla_her2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2 ) #endif // end bla_her2.h // begin bla_symv.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( symv ) #endif // end bla_symv.h // begin bla_syr.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr ) #endif // end bla_syr.h // begin bla_syr2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr2 ) #endif // end bla_syr2.h // begin bla_trmv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmv ) #endif // end bla_trmv.h // begin bla_trsv.h // // Prototype BLAS-to-BLIS interfaces. 
//

// Prototype template for the BLAS-compatible `trsv` entry points.
// Expanding GENTPROT( ftype, ch, blasname ) declares one routine named by
// PASTEF77(ch,blasname); every argument is passed by pointer and x is the
// only non-const operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const ftype*    a, const f77_int* lda, \
             ftype*    x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif
// end bla_trsv.h

//
// Level-2 argument-check macros.
//
// Each bla_<op>_check macro expands to a brace-enclosed statement block that
// must be placed inside a function returning void. The arguments are
// pointers (they are dereferenced as *m, *lda, ...), and character options
// are compared with lsame. The conditions are tested in order and the first
// failure selects `info`. On failure the macro builds the routine name from
// dt_str and op_str, upper-cases it, reports it through xerbla together
// with `info`, and then executes `return` in the ENCLOSING function.
//
// The name is formatted with snprintf bounded by sizeof( func_str ): the
// "%-5s"/"%-2s" conversions only specify a MINIMUM width, so a longer name
// fragment would otherwise overrun the fixed-size buffer. Output that fits
// is byte-identical to what sprintf produced.
//

// begin bla_gemv_check.h

#ifdef BLIS_ENABLE_BLAS

// gemv: info = 1 (transa not N/T/C), 2 (*m < 0), 3 (*n < 0),
// 6 (*lda < max(1,*m)), 8 (*incx == 0), 11 (*incy == 0).
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
	f77_int info = 0; \
	f77_int nota, ta, conja; \
\
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
	if      ( !nota && !ta && !conja ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *n < 0 ) \
		info = 3; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 6; \
	else if ( *incx == 0 ) \
		info = 8; \
	else if ( *incy == 0 ) \
		info = 11; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_gemv_check.h
// begin bla_ger_check.h

#ifdef BLIS_ENABLE_BLAS

// ger: info = 1 (*m < 0), 2 (*n < 0), 5 (*incx == 0), 7 (*incy == 0),
// 9 (*lda < max(1,*m)). The reported name is dt_str + op_str + conj_str.
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
	f77_int info = 0; \
\
	if      ( *m < 0 ) \
		info = 1; \
	else if ( *n < 0 ) \
		info = 2; \
	else if ( *incx == 0 ) \
		info = 5; \
	else if ( *incy == 0 ) \
		info = 7; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 9; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%s%-2s", dt_str, op_str, conj_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_ger_check.h
// begin bla_hemv_check.h

#ifdef BLIS_ENABLE_BLAS

// hemv (also used for symv): info = 1 (uploa not L/U), 2 (*m < 0),
// 5 (*lda < max(1,*m)), 7 (*incx == 0), 10 (*incy == 0).
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
\
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 5; \
	else if ( *incx == 0 ) \
		info = 7; \
	else if ( *incy == 0 ) \
		info = 10; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_hemv_check.h
// begin bla_her_check.h

#ifdef BLIS_ENABLE_BLAS

// her (also used for syr): info = 1 (uploa not L/U), 2 (*m < 0),
// 5 (*incx == 0), 7 (*lda < max(1,*m)).
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
\
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *incx == 0 ) \
		info = 5; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 7; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_her_check.h
// begin bla_her2_check.h

#ifdef BLIS_ENABLE_BLAS

// her2 (also used for syr2): info = 1 (uploa not L/U), 2 (*m < 0),
// 5 (*incx == 0), 7 (*incy == 0), 9 (*lda < max(1,*m)).
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
\
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( *m < 0 ) \
		info = 2; \
	else if ( *incx == 0 ) \
		info = 5; \
	else if ( *incy == 0 ) \
		info = 7; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 9; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_her2_check.h
// begin bla_symv_check.h

#ifdef BLIS_ENABLE_BLAS

// symv reuses the hemv argument checks unchanged.
#define bla_symv_check bla_hemv_check

#endif
// end bla_symv_check.h
// begin bla_syr_check.h

#ifdef BLIS_ENABLE_BLAS

// syr reuses the her argument checks unchanged.
#define bla_syr_check bla_her_check

#endif
// end bla_syr_check.h
// begin bla_syr2_check.h

#ifdef BLIS_ENABLE_BLAS

// syr2 reuses the her2 argument checks unchanged.
#define bla_syr2_check bla_her2_check

#endif
// end bla_syr2_check.h
// begin bla_trmv_check.h

#ifdef BLIS_ENABLE_BLAS

// trmv (also used for trsv): info = 1 (uploa not L/U), 2 (transa not N/T/C),
// 3 (diaga not U/N), 4 (*m < 0), 6 (*lda < max(1,*m)), 8 (*incx == 0).
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
	f77_int info = 0; \
	f77_int lower, upper; \
	f77_int nota, ta, conja; \
	f77_int unita, nonua; \
\
	lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
	nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !ta && !conja ) \
		info = 2; \
	else if ( !unita && !nonua ) \
		info = 3; \
	else if ( *m < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, *m ) ) \
		info = 6; \
	else if ( *incx == 0 ) \
		info = 8; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_trmv_check.h
// begin bla_trsv_check.h

#ifdef BLIS_ENABLE_BLAS

// trsv reuses the trmv argument checks unchanged.
#define bla_trsv_check bla_trmv_check

#endif
// end bla_trsv_check.h

// packed
//
// The packed and banded routines below are declared explicitly per datatype
// prefix (c/d/s/z) instead of through a GENTPROT template, and they return
// int rather than void.

// begin bla_hpmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integer *n,
                                      const bla_scomplex *alpha, const bla_scomplex *ap,
                                      const bla_scomplex *x, const bla_integer *incx,
                                      const bla_scomplex *beta,
                                      bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n,
                                      const bla_dcomplex *alpha, const bla_dcomplex *ap,
                                      const bla_dcomplex *x, const bla_integer *incx,
                                      const bla_dcomplex *beta,
                                      bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hpmv.h
// begin bla_hpr.h

#ifdef BLIS_ENABLE_BLAS

// Note: alpha is a real scalar (bla_real / bla_double) for hpr.
BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(const bla_character *uplo, const bla_integer *n,
                                     const bla_real *alpha,
                                     const bla_scomplex *x, const bla_integer *incx,
                                     bla_scomplex *ap);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n,
                                     const bla_double *alpha,
                                     const bla_dcomplex *x, const bla_integer *incx,
                                     bla_dcomplex *ap);

#endif
// end bla_hpr.h
// begin bla_hpr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integer *n,
                                      const bla_scomplex *alpha,
                                      const bla_scomplex *x, const bla_integer *incx,
                                      const bla_scomplex *y, const bla_integer *incy,
                                      bla_scomplex *ap);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n,
                                      const bla_dcomplex *alpha,
                                      const bla_dcomplex *x, const bla_integer *incx,
                                      const bla_dcomplex *y, const bla_integer *incy,
                                      bla_dcomplex *ap);

#endif
// end bla_hpr2.h
// begin bla_spmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integer *n,
                                      const bla_double *alpha, const bla_double *ap,
                                      const bla_double *x, const bla_integer *incx,
                                      const bla_double *beta,
                                      bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n,
                                      const bla_real *alpha, const bla_real *ap,
                                      const bla_real *x, const bla_integer *incx,
                                      const bla_real *beta,
                                      bla_real *y, const bla_integer *incy);

#endif
// end bla_spmv.h
// begin bla_spr.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr)(const bla_character *uplo, const bla_integer *n,
                                     const bla_double *alpha,
                                     const bla_double *x, const bla_integer *incx,
                                     bla_double *ap);
BLIS_EXPORT_BLAS int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n,
                                     const bla_real *alpha,
                                     const bla_real *x, const bla_integer *incx,
                                     bla_real *ap);

#endif
// end bla_spr.h
// begin bla_spr2.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integer *n,
                                      const bla_double *alpha,
                                      const bla_double *x, const bla_integer *incx,
                                      const bla_double *y, const bla_integer *incy,
                                      bla_double *ap);
BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n,
                                      const bla_real *alpha,
                                      const bla_real *x, const bla_integer *incx,
                                      const bla_real *y, const bla_integer *incy,
                                      bla_real *ap);

#endif
// end bla_spr2.h
// begin bla_tpmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)(const bla_character *uplo, const bla_character *trans,
                                      const bla_character *diag, const bla_integer *n,
                                      const bla_scomplex *ap,
                                      bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)(const bla_character *uplo, const bla_character *trans,
                                      const bla_character *diag, const bla_integer *n,
                                      const bla_double *ap,
                                      bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)(const bla_character *uplo, const bla_character *trans,
                                      const bla_character *diag, const bla_integer *n,
                                      const bla_real *ap,
                                      bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)(const bla_character *uplo, const bla_character *trans,
                                      const bla_character *diag, const bla_integer *n,
                                      const bla_dcomplex *ap,
                                      bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpmv.h
// begin bla_tpsv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)(const bla_character *uplo, const bla_character *trans,
                                      const bla_character *diag, const bla_integer *n,
                                      const bla_scomplex *ap,
                                      bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)(const bla_character *uplo, const bla_character *trans,
                                      const bla_character *diag, const bla_integer *n,
                                      const bla_double *ap,
                                      bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)(const bla_character *uplo, const bla_character *trans,
                                      const bla_character *diag, const bla_integer *n,
                                      const bla_real *ap,
                                      bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)(const bla_character *uplo, const bla_character *trans,
                                      const bla_character *diag, const bla_integer *n,
                                      const bla_dcomplex *ap,
                                      bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpsv.h

// banded
// begin bla_gbmv.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)(const bla_character *trans,
                                      const bla_integer *m, const bla_integer *n,
                                      const bla_integer *kl, const bla_integer *ku,
                                      const bla_scomplex *alpha,
                                      const bla_scomplex *a, const bla_integer *lda,
                                      const bla_scomplex *x, const bla_integer *incx,
                                      const bla_scomplex *beta,
                                      bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)(const bla_character *trans,
                                      const bla_integer *m, const bla_integer *n,
                                      const bla_integer *kl, const bla_integer *ku,
                                      const bla_double *alpha,
                                      const bla_double *a, const bla_integer *lda,
                                      const bla_double *x, const bla_integer *incx,
                                      const bla_double *beta,
                                      bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)(const bla_character *trans,
                                      const bla_integer *m, const bla_integer *n,
                                      const bla_integer *kl, const bla_integer *ku,
                                      const bla_real *alpha,
                                      const bla_real *a, const bla_integer *lda,
                                      const bla_real *x, const bla_integer *incx,
                                      const bla_real *beta,
                                      bla_real *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)(const bla_character *trans,
                                      const bla_integer *m, const bla_integer *n,
                                      const bla_integer *kl, const bla_integer *ku,
                                      const bla_dcomplex *alpha,
                                      const bla_dcomplex *a, const bla_integer *lda,
                                      const bla_dcomplex *x, const bla_integer *incx,
                                      const bla_dcomplex *beta,
                                      bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_gbmv.h
// begin bla_hbmv.h

#ifdef BLIS_ENABLE_BLAS

// hbmv prototypes, declared explicitly for the c and z prefixes. All
// arguments are passed by pointer; y is the only non-const operand.
BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k,
                                      const bla_scomplex *alpha,
                                      const bla_scomplex *a, const bla_integer *lda,
                                      const bla_scomplex *x, const bla_integer *incx,
                                      const bla_scomplex *beta,
                                      bla_scomplex *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k,
                                      const bla_dcomplex *alpha,
                                      const bla_dcomplex *a, const bla_integer *lda,
                                      const bla_dcomplex *x, const bla_integer *incx,
                                      const bla_dcomplex *beta,
                                      bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hbmv.h
// begin bla_sbmv.h

#ifdef BLIS_ENABLE_BLAS

// sbmv prototypes, declared explicitly for the d and s prefixes; same
// argument layout as hbmv above.
BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k,
                                      const bla_double *alpha,
                                      const bla_double *a, const bla_integer *lda,
                                      const bla_double *x, const bla_integer *incx,
                                      const bla_double *beta,
                                      bla_double *y, const bla_integer *incy);
BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k,
                                      const bla_real *alpha,
                                      const bla_real *a, const bla_integer *lda,
                                      const bla_real *x, const bla_integer *incx,
                                      const bla_real *beta,
                                      bla_real *y, const bla_integer *incy);

#endif
// end bla_sbmv.h
// begin bla_tbmv.h

#ifdef BLIS_ENABLE_BLAS

// tbmv prototypes for the c, d, s and z prefixes; x is the only non-const
// operand.
BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag,
                                      const bla_integer *n, const bla_integer *k,
                                      const bla_scomplex *a, const bla_integer *lda,
                                      bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag,
                                      const bla_integer *n, const bla_integer *k,
                                      const bla_double *a, const bla_integer *lda,
                                      bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag,
                                      const bla_integer *n, const bla_integer *k,
                                      const bla_real *a, const bla_integer *lda,
                                      bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag,
                                      const bla_integer *n, const bla_integer *k,
                                      const bla_dcomplex *a, const bla_integer *lda,
                                      bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbmv.h
// begin bla_tbsv.h

#ifdef BLIS_ENABLE_BLAS

// tbsv prototypes for the c, d, s and z prefixes; identical argument layout
// to tbmv above.
BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag,
                                      const bla_integer *n, const bla_integer *k,
                                      const bla_scomplex *a, const bla_integer *lda,
                                      bla_scomplex *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag,
                                      const bla_integer *n, const bla_integer *k,
                                      const bla_double *a, const bla_integer *lda,
                                      bla_double *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag,
                                      const bla_integer *n, const bla_integer *k,
                                      const bla_real *a, const bla_integer *lda,
                                      bla_real *x, const bla_integer *incx);
BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag,
                                      const bla_integer *n, const bla_integer *k,
                                      const bla_dcomplex *a, const bla_integer *lda,
                                      bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbsv.h

// -- Level-3 BLAS prototypes --

// begin bla_gemm.h

//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemm ) #endif // end bla_gemm.h // begin bla_hemm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemm ) #endif // end bla_hemm.h // begin bla_herk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype_r* alpha, \ const ftype* a, const f77_int* lda, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( herk ) #endif // end bla_herk.h // begin bla_her2k.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype_r* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2k ) #endif // end bla_her2k.h // begin bla_symm.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* side, \ const f77_char* uploa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( symm ) #endif // end bla_symm.h // begin bla_syrk.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syrk ) #endif // end bla_syrk.h // begin bla_syr2k.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploc, \ const f77_char* transa, \ const f77_int* m, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( syr2k ) #endif // end bla_syr2k.h // begin bla_trmm.h // // Prototype BLAS-to-BLIS interfaces. 
//

// Prototype template for the BLAS-compatible `trmm` entry points.
// Expanding GENTPROT( ftype, ch, blasname ) declares one routine named by
// PASTEF77(ch,blasname); every argument is passed by pointer and b is the
// only non-const operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
             ftype*    b, const f77_int* ldb \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmm )
#endif
// end bla_trmm.h
// begin bla_trsm.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype template for `trsm`; same argument list as the trmm template.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
             ftype*    b, const f77_int* ldb \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsm )
#endif
// end bla_trsm.h

//
// Level-3 argument-check macros.
//
// Each bla_<op>_check macro expands to a brace-enclosed statement block that
// must be placed inside a function returning void. The arguments are
// pointers (they are dereferenced as *m, *lda, ...), and character options
// are compared with lsame. The conditions are tested in order and the first
// failure selects `info`; for the routines whose prototypes appear in this
// header the value equals the 1-based position of the offending argument.
// On failure the macro builds the routine name from dt_str and op_str,
// upper-cases it, reports it through xerbla together with `info`, and then
// executes `return` in the ENCLOSING function.
//
// The name is formatted with snprintf bounded by sizeof( func_str ): "%-5s"
// only specifies a MINIMUM width, so a longer op_str would otherwise overrun
// the fixed-size buffer. Output that fits is byte-identical to what sprintf
// produced.
//

// begin bla_gemm_check.h

#ifdef BLIS_ENABLE_BLAS

// gemm: info = 1 (transa), 2 (transb), 3 (*m < 0), 4 (*n < 0), 5 (*k < 0),
// 8 (lda), 10 (ldb), 13 (ldc). The required leading dimension of a is *m
// when transa is "N" and *k otherwise; for b it is *k when transb is "N" and
// *n otherwise.
#define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int nota,  notb; \
	f77_int conja, conjb; \
	f77_int ta,    tb; \
	f77_int nrowa, nrowb; \
\
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else        { nrowa = *k; } \
	if ( notb ) { nrowb = *k; } \
	else        { nrowb = *n; } \
\
	if      ( !nota && !conja && !ta ) \
		info = 1; \
	else if ( !notb && !conjb && !tb ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *n < 0 ) \
		info = 4; \
	else if ( *k < 0 ) \
		info = 5; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 8; \
	else if ( *ldb < bli_max( 1, nrowb ) ) \
		info = 10; \
	else if ( *ldc < bli_max( 1, *m    ) ) \
		info = 13; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_gemm_check.h
// begin bla_hemm_check.h

#ifdef BLIS_ENABLE_BLAS

// hemm (also used for symm): info = 1 (sidea not L/R), 2 (uploa not L/U),
// 3 (*m < 0), 4 (*n < 0), 7 (lda), 9 (ldb), 12 (ldc). The required leading
// dimension of a is *m when sidea is "L" and *n otherwise.
#define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int left, right; \
	f77_int lower, upper; \
	f77_int nrowa; \
\
	left  = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
	right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( left ) { nrowa = *m; } \
	else        { nrowa = *n; } \
\
	if      ( !left && !right ) \
		info = 1; \
	else if ( !lower && !upper ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *n < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 7; \
	else if ( *ldb < bli_max( 1, *m    ) ) \
		info = 9; \
	else if ( *ldc < bli_max( 1, *m    ) ) \
		info = 12; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_hemm_check.h
// begin bla_herk_check.h

#ifdef BLIS_ENABLE_BLAS

// herk: info = 1 (uploa not L/U), 2 (transa not N/C), 3 (*m < 0),
// 4 (*k < 0), 7 (lda), 10 (ldc). The required leading dimension of a is *m
// when transa is "N" and *k otherwise.
//
// The uplo tests read the `uploa` macro parameter. Previously the body
// referenced an identifier named `uploc`, which is not a parameter of this
// macro and only resolved when the expansion site happened to have a
// variable of that exact name in scope.
#define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
	f77_int info = 0; \
	f77_int nota, conja; \
	f77_int lower, upper; \
	f77_int nrowa; \
\
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else        { nrowa = *k; } \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !conja ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *k < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 7; \
	else if ( *ldc < bli_max( 1, *m    ) ) \
		info = 10; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_herk_check.h
// begin bla_her2k_check.h

#ifdef BLIS_ENABLE_BLAS

// her2k: info = 1 (uploa not L/U), 2 (trans not N/C), 3 (*m < 0),
// 4 (*k < 0), 7 (lda), 9 (ldb), 12 (ldc). Both lda and ldb are checked
// against the same row count (*m when trans is "N", *k otherwise).
#define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int nota, conja; \
	f77_int lower, upper; \
	f77_int nrowa; \
\
	nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else        { nrowa = *k; } \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !conja ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *k < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 7; \
	else if ( *ldb < bli_max( 1, nrowa ) ) \
		info = 9; \
	else if ( *ldc < bli_max( 1, *m    ) ) \
		info = 12; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_her2k_check.h
// begin bla_symm_check.h

#ifdef BLIS_ENABLE_BLAS

// symm reuses the hemm argument checks unchanged.
#define bla_symm_check bla_hemm_check

#endif
// end bla_symm_check.h
// begin bla_syrk_check.h

#ifdef BLIS_ENABLE_BLAS

// syrk: info = 1 (uploa not L/U), 2 (transa invalid), 3 (*m < 0),
// 4 (*k < 0), 7 (lda), 10 (ldc). transa "N" and "T" are always accepted;
// "C" is accepted only when the first character of dt_str is 's' or 'd'.
//
// dt_str must be usable as a static initializer (e.g. a string literal); it
// is only read, so the pointer is const-qualified. As in herk above, the
// uplo tests read the `uploa` macro parameter rather than an identifier
// `uploc` captured from the expansion site.
#define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
	f77_int info = 0; \
	f77_int is_r; \
	f77_int nota, ta, cta; \
	f77_int lower, upper; \
	f77_int nrowa; \
\
	static const char* dt_cst = dt_str; \
\
	is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	cta   = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else        { nrowa = *k; } \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !ta && (is_r ? !cta : 1) ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *k < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 7; \
	else if ( *ldc < bli_max( 1, *m    ) ) \
		info = 10; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_syrk_check.h
// begin bla_syr2k_check.h

#ifdef BLIS_ENABLE_BLAS

// syr2k: info = 1 (uploa not L/U), 2 (trans invalid), 3 (*m < 0),
// 4 (*k < 0), 7 (lda), 9 (ldb), 12 (ldc). The trans rule matches syrk:
// "N"/"T" always, "C" only when dt_str starts with 's' or 'd'.
#define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int is_r; \
	f77_int nota, ta, cta; \
	f77_int lower, upper; \
	f77_int nrowa; \
\
	static const char* dt_cst = dt_str; \
\
	is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
	nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \
	cta   = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else        { nrowa = *k; } \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !ta && (is_r ? !cta : 1) ) \
		info = 2; \
	else if ( *m < 0 ) \
		info = 3; \
	else if ( *k < 0 ) \
		info = 4; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 7; \
	else if ( *ldb < bli_max( 1, nrowa ) ) \
		info = 9; \
	else if ( *ldc < bli_max( 1, *m    ) ) \
		info = 12; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_syr2k_check.h
// begin bla_trmm_check.h

#ifdef BLIS_ENABLE_BLAS

// trmm (also used for trsm): info = 1 (sidea not L/R), 2 (uploa not L/U),
// 3 (transa not N/T/C), 4 (diaga not U/N), 5 (*m < 0), 6 (*n < 0), 9 (lda),
// 11 (ldb). The required leading dimension of a is *m when sidea is "L" and
// *n otherwise.
#define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \
{ \
	f77_int info = 0; \
	f77_int left, right; \
	f77_int lower, upper; \
	f77_int nota, ta, conja; \
	f77_int unita, nonua; \
	f77_int nrowa; \
\
	left  = PASTEF770(lsame)( sidea,  "L", (ftnlen)1, (ftnlen)1 ); \
	right = PASTEF770(lsame)( sidea,  "R", (ftnlen)1, (ftnlen)1 ); \
	lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
	nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
	if ( left ) { nrowa = *m; } \
	else        { nrowa = *n; } \
\
	if      ( !left && !right ) \
		info = 1; \
	else if ( !lower && !upper ) \
		info = 2; \
	else if ( !nota && !ta && !conja ) \
		info = 3; \
	else if ( !unita && !nonua ) \
		info = 4; \
	else if ( *m < 0 ) \
		info = 5; \
	else if ( *n < 0 ) \
		info = 6; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 9; \
	else if ( *ldb < bli_max( 1, *m    ) ) \
		info = 11; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_trmm_check.h
// begin bla_trsm_check.h

#ifdef BLIS_ENABLE_BLAS

// trsm reuses the trmm argument checks unchanged.
#define bla_trsm_check bla_trmm_check

#endif
// end bla_trsm_check.h

// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype template for `axpby`: const vector x, two scalars (alpha, beta)
// and the updated vector y.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
       const ftype*   beta, \
             ftype*   y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif
// end bla_axpby.h

// level-3
// begin bla_gemmt.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype template for `gemmt`. Compared with the gemm template it adds a
// leading uploc argument and has no separate n dimension.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif
// end bla_gemmt.h
// begin bla_gemmt_check.h

#ifdef BLIS_ENABLE_BLAS

// gemmt: info = 1 (uploc not L/U), 2 (transa), 3 (transb), 4 (*m < 0),
// 5 (*k < 0), 8 (lda), 10 (ldb), 13 (ldc). The required leading dimension
// of a is *m when transa is "N" and *k otherwise; for b it is *k when transb
// is "N" and *m otherwise.
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
	f77_int info = 0; \
	f77_int nota,  notb; \
	f77_int conja, conjb; \
	f77_int ta,    tb; \
	f77_int lower, upper; \
	f77_int nrowa, nrowb; \
\
	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
	notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
	tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
	lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
	upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
\
	if ( nota ) { nrowa = *m; } \
	else        { nrowa = *k; } \
	if ( notb ) { nrowb = *k; } \
	else        { nrowb = *m; } \
\
	if      ( !lower && !upper ) \
		info = 1; \
	else if ( !nota && !conja && !ta ) \
		info = 2; \
	else if ( !notb && !conjb && !tb ) \
		info = 3; \
	else if ( *m < 0 ) \
		info = 4; \
	else if ( *k < 0 ) \
		info = 5; \
	else if ( *lda < bli_max( 1, nrowa ) ) \
		info = 8; \
	else if ( *ldb < bli_max( 1, nrowb ) ) \
		info = 10; \
	else if ( *ldc < bli_max( 1, *m    ) ) \
		info = 13; \
\
	if ( info != 0 ) \
	{ \
		char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
		snprintf( func_str, sizeof( func_str ), "%s%-5s", dt_str, op_str ); \
\
		bli_string_mkupper( func_str ); \
\
		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
		return; \
	} \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h

//
// Prototype BLAS-to-BLIS interfaces.
//

// Prototype template for `gemm_batch`. Every gemm argument becomes an array
// (the matrix operands are pointer-to-pointer), followed by group_count and
// group_size.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa_array, \
       const f77_char* transb_array, \
       const f77_int*  m_array, \
       const f77_int*  n_array, \
       const f77_int*  k_array, \
       const ftype*    alpha_array, \
       const ftype**   a_array, const f77_int* lda_array, \
       const ftype**   b_array, const f77_int* ldb_array, \
       const ftype*    beta_array, \
             ftype**   c_array, const f77_int* ldc_array, \
       const f77_int*  group_count, \
       const f77_int*  group_size \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif
// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h

//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( gemm3m ) #endif // end bla_gemm3m.h // begin bla_gemm3m_check.h #ifdef BLIS_ENABLE_BLAS #define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, notb; \ f77_int conja, conjb; \ f77_int ta, tb; \ f77_int nrowa, nrowb; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ if ( notb ) { nrowb = *k; } \ else { nrowb = *n; } \ \ if ( !nota && !conja && !ta ) \ info = 1; \ else if ( !notb && !conjb && !tb ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *k < 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 8; \ else if ( *ldb < bli_max( 1, nrowb ) ) \ info = 10; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 13; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_gemm3m_check.h // -- Fortran-compatible APIs to BLIS functions -- // begin b77_thread.h // // Prototype Fortran-compatible BLIS interfaces. 
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); // end b77_thread.h #endif // BLIS_ENABLE_BLAS // end bli_blas.h // -- CBLAS compatibility layer -- // begin bli_cblas.h #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. // begin cblas.h #ifndef CBLAS_H #define CBLAS_H #include // skipped // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. 
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. #if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). 
#if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_X86_64_NO_ZEN3 // Enabled sub-configurations (config_list) #define BLIS_CONFIG_SKX #define BLIS_CONFIG_KNL #define BLIS_CONFIG_HASWELL #define BLIS_CONFIG_SANDYBRIDGE #define BLIS_CONFIG_PENRYN #define BLIS_CONFIG_ZEN #define BLIS_CONFIG_ZEN2 #define BLIS_CONFIG_EXCAVATOR #define BLIS_CONFIG_STEAMROLLER #define BLIS_CONFIG_PILEDRIVER #define BLIS_CONFIG_BULLDOZER #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_SKX #define BLIS_KERNELS_KNL #define BLIS_KERNELS_SANDYBRIDGE #define BLIS_KERNELS_PENRYN #define BLIS_KERNELS_ZEN2 #define BLIS_KERNELS_HASWELL #define BLIS_KERNELS_ZEN #define BLIS_KERNELS_PILEDRIVER #define BLIS_KERNELS_BULLDOZER #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS 
#endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. 
// NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. #endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. 
// This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined.
#ifndef BLIS_NT_MAX_PRIME
  #define BLIS_NT_MAX_PRIME 11
#endif

// -- MIXED DATATYPE SUPPORT ---------------------------------------------------

// Enable mixed datatype support?
// The configure-level switch (BLIS_DISABLE_MIXED_DT) is translated here into
// the presence or absence of BLIS_ENABLE_GEMM_MD.
#ifdef BLIS_DISABLE_MIXED_DT
  #undef BLIS_ENABLE_GEMM_MD
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD
#endif

// Enable memory-intensive optimizations for mixed datatype support?
// As above: BLIS_DISABLE_MIXED_DT_EXTRA_MEM is translated into the presence
// or absence of BLIS_ENABLE_GEMM_MD_EXTRA_MEM.
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
  #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#else
  // Default behavior is enabled.
  #define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#endif

// -- MISCELLANEOUS OPTIONS ----------------------------------------------------

// Do NOT require the cross-blocksize constraints. That is, do not enforce
// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY
// needed when implementing trsm_r by allowing the right-hand matrix B to
// be triangular.
#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
  #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#endif

// -- BLAS COMPATIBILITY LAYER -------------------------------------------------

// Enable the BLAS compatibility layer?
// After this block BLIS_ENABLE_BLAS is defined if and only if
// BLIS_DISABLE_BLAS is not defined.
#ifdef BLIS_DISABLE_BLAS
  #undef BLIS_ENABLE_BLAS
#else
  // Default behavior is enabled.
  #undef  BLIS_ENABLE_BLAS // In case user explicitly enabled.
  #define BLIS_ENABLE_BLAS
#endif

// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (i.e., column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#ifndef BLIS_BLAS_INT_TYPE_SIZE
  #define BLIS_BLAS_INT_TYPE_SIZE 32
#endif

// By default, the level-3 BLAS routines are implemented by directly calling
// the BLIS object API. Alternatively, they may first call the typed BLIS
// API, which will then call the object API.
// Exactly one of BLIS_BLAS3_CALLS_TAPI / BLIS_BLAS3_CALLS_OAPI is left defined
// after this block.
//#define BLIS_BLAS3_CALLS_TAPI
#ifdef BLIS_BLAS3_CALLS_TAPI
  #undef  BLIS_BLAS3_CALLS_OAPI
#else
  // Default behavior is to call object API directly.
  #undef  BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled.
  #define BLIS_BLAS3_CALLS_OAPI
#endif

// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------

// Enable the CBLAS compatibility layer?
// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer
// regardless of whether or not it was explicitly enabled above. Furthermore,
// the CBLAS compatibility layer will use the integer type size definition
// specified above when defining the size of its own integers (regardless of
// whether the BLAS layer was enabled directly or indirectly).
#ifdef BLIS_ENABLE_CBLAS
  // No additional definitions needed.
#else
  // Default behavior is disabled.
#endif

// -- SHARED LIBRARY SYMBOL EXPORT ---------------------------------------------

// When building shared libraries, we can control which symbols are exported for
// linking by external applications. BLIS annotates all function prototypes that
// are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing
// a similar role for BLAS compatibility routines). Which symbols are exported
// is controlled by the default symbol visibility, as specified by the gcc option
// -fvisibility=[default|hidden]. The default for this option is 'default', or,
// "public", which, if allowed to stand, causes all symbols in BLIS to be
// linkable from the outside. But when compiling with -fvisibility=hidden, all
// symbols start out hidden (that is, restricted only for internal use by BLIS),
// with that setting overridden only for function prototypes or variable
// declarations that are annotated with BLIS_EXPORT_BLIS.
// BLIS_EXPORT may be predefined by the user, in which case it is left alone.
// Otherwise it expands to:
//   - nothing, when BLIS_ENABLE_SHARED is not defined;
//   - __declspec(dllexport) on Windows/Cygwin when BLIS_IS_BUILDING_LIBRARY is
//     defined, and __declspec(dllimport) on Windows/Cygwin otherwise;
//   - the "default" visibility attribute for GNU-compatible compilers that
//     report __GNUC__ >= 4;
//   - nothing, for any other compiler.
#ifndef BLIS_EXPORT
  #if !defined(BLIS_ENABLE_SHARED)
    #define BLIS_EXPORT
  #else
    #if defined(_WIN32) || defined(__CYGWIN__)
      #ifdef BLIS_IS_BUILDING_LIBRARY
        #define BLIS_EXPORT __declspec(dllexport)
      #else
        #define BLIS_EXPORT __declspec(dllimport)
      #endif
    #elif defined(__GNUC__) && __GNUC__ >= 4
      #define BLIS_EXPORT __attribute__ ((visibility ("default")))
    #else
      #define BLIS_EXPORT
    #endif
  #endif
#endif

// The per-API export annotations all share the single BLIS_EXPORT definition.
#define BLIS_EXPORT_BLIS  BLIS_EXPORT
#define BLIS_EXPORT_BLAS  BLIS_EXPORT
#define BLIS_EXPORT_ADDON BLIS_EXPORT

// -- OVERRIDABLE (WEAK) SYMBOLS -----------------------------------------------

// On Linux, functions called from a shared library can be overridden by the main
// program simply by providing a new definition. However, macOS uses a "two-level
// namespace" which causes calls to shared library functions to be tied to the
// library and not overridable. As a workaround, certain symbols can be defined
// as "weak" and are given lower preference during linking.
// NOTE: BLIS_OS_OSX is tested with #if rather than #ifdef; when it is not
// defined the preprocessor evaluates it as 0 and the empty definition is used.
#ifndef BLIS_OVERRIDABLE
  #if BLIS_OS_OSX
    #define BLIS_OVERRIDABLE __attribute__((weak))
  #else
    #define BLIS_OVERRIDABLE
  #endif
#endif

// -- STATIC INLINE FUNCTIONS --------------------------------------------------

// C and C++ have different semantics for defining "inline" functions. In C,
// the keyword phrase "static inline" accomplishes this, though the "inline"
// is optional. In C++, the "inline" keyword is required and obviates "static"
// altogether. Why does this matter? While BLIS is compiled in C99, blis.h may
// be #included by a source file that is compiled with C++.
#ifdef __cplusplus
  #define BLIS_INLINE inline
#else
  //#define BLIS_INLINE static inline
  #define BLIS_INLINE static
#endif

#endif
// end bli_config_macro_defs.h
// begin bli_type_defs.h

#ifndef BLIS_TYPE_DEFS_H
#define BLIS_TYPE_DEFS_H

//
// -- BLIS basic types ---------------------------------------------------------
//

#ifdef __cplusplus
  // For C++, include stdint.h.
#include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. // NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. #ifndef TRUE #define TRUE true #endif #ifndef FALSE #define FALSE false #endif // -- Special-purpose integers -- // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. 
#ifndef _DEFINED_DIM_T #define _DEFINED_DIM_T typedef gint_t dim_t; // dimension type #endif typedef gint_t inc_t; // increment/stride type typedef gint_t doff_t; // diagonal offset type typedef guint_t siz_t; // byte size type typedef uint32_t objbits_t; // object information bit field // -- Real types -- // Define the number of floating-point types supported, and the size of the // largest type. #define BLIS_NUM_FP_TYPES 4 #define BLIS_MAX_TYPE_SIZE sizeof(dcomplex) // There are some places where we need to use sizeof() inside of a C // preprocessor #if conditional, and so here we define the various sizes // for those purposes. #define BLIS_SIZEOF_S 4 // sizeof(float) #define BLIS_SIZEOF_D 8 // sizeof(double) #define BLIS_SIZEOF_C 8 // sizeof(scomplex) #define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) // -- Complex types -- #ifdef BLIS_ENABLE_C99_COMPLEX #if __STDC_VERSION__ >= 199901L #include // skipped // Typedef official complex types to BLIS complex type names. typedef float complex scomplex; typedef double complex dcomplex; #else #error "Configuration requested C99 complex types, but C99 does not appear to be supported." #endif #else // ifndef BLIS_ENABLE_C99_COMPLEX // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_SCOMPLEX #define _DEFINED_SCOMPLEX typedef struct scomplex { float real; float imag; } scomplex; #endif // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DCOMPLEX #define _DEFINED_DCOMPLEX typedef struct dcomplex { double real; double imag; } dcomplex; #endif #endif // BLIS_ENABLE_C99_COMPLEX // -- Atom type -- // Note: atom types are used to hold "bufferless" scalar object values. Note // that it needs to be as large as the largest possible scalar value we might // want to hold. Thus, for now, it is a dcomplex. 
typedef dcomplex atom_t; // -- Fortran-77 types -- // Note: These types are typically only used by BLAS compatibility layer, but // we must define them even when the compatibility layer isn't being built // because they also occur in bli_slamch() and bli_dlamch(). // Define f77_int depending on what size of integer was requested. #if BLIS_BLAS_INT_TYPE_SIZE == 32 typedef int32_t f77_int; #elif BLIS_BLAS_INT_TYPE_SIZE == 64 typedef int64_t f77_int; #else typedef long int f77_int; #endif typedef char f77_char; typedef float f77_float; typedef double f77_double; typedef scomplex f77_scomplex; typedef dcomplex f77_dcomplex; // -- Misc. function pointer types -- // Note: This type should be used in any situation where the address of a // *function* will be conveyed or stored prior to it being typecast back // to the correct function type. It does not need to be used when conveying // or storing the address of *data* (such as an array of float or double). //typedef void (*void_fp)( void ); typedef void* void_fp; // Typedef function pointer types for malloc() and free() substitutes. 
typedef void* (*malloc_ft)( size_t size ); typedef void (*free_ft) ( void* p ); // // -- BLIS info bit field offsets ---------------------------------------------- // // info #define BLIS_DATATYPE_SHIFT 0 #define BLIS_DOMAIN_SHIFT 0 #define BLIS_PRECISION_SHIFT 1 #define BLIS_CONJTRANS_SHIFT 3 #define BLIS_TRANS_SHIFT 3 #define BLIS_CONJ_SHIFT 4 #define BLIS_UPLO_SHIFT 5 #define BLIS_UPPER_SHIFT 5 #define BLIS_DIAG_SHIFT 6 #define BLIS_LOWER_SHIFT 7 #define BLIS_UNIT_DIAG_SHIFT 8 #define BLIS_INVERT_DIAG_SHIFT 9 #define BLIS_TARGET_DT_SHIFT 10 #define BLIS_TARGET_DOMAIN_SHIFT 10 #define BLIS_TARGET_PREC_SHIFT 11 #define BLIS_EXEC_DT_SHIFT 13 #define BLIS_EXEC_DOMAIN_SHIFT 13 #define BLIS_EXEC_PREC_SHIFT 14 #define BLIS_PACK_SCHEMA_SHIFT 16 #define BLIS_PACK_RC_SHIFT 16 #define BLIS_PACK_PANEL_SHIFT 17 #define BLIS_PACK_FORMAT_SHIFT 18 #define BLIS_PACK_SHIFT 22 #define BLIS_PACK_REV_IF_UPPER_SHIFT 23 #define BLIS_PACK_REV_IF_LOWER_SHIFT 24 #define BLIS_PACK_BUFFER_SHIFT 25 #define BLIS_STRUC_SHIFT 27 #define BLIS_COMP_DT_SHIFT 29 #define BLIS_COMP_DOMAIN_SHIFT 29 #define BLIS_COMP_PREC_SHIFT 30 // info2 #define BLIS_SCALAR_DT_SHIFT 0 #define BLIS_SCALAR_DOMAIN_SHIFT 0 #define BLIS_SCALAR_PREC_SHIFT 1 // // -- BLIS info bit field masks ------------------------------------------------ // // info #define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT ) #define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT ) #define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT ) #define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT ) #define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 
0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | 
BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type 
definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = 
BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. 
#define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, 
BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // 
- keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. } bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. 
// Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! 
// -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: 
Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. 
//num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. typedef struct constdata_s { float s; double d; scomplex c; dcomplex z; gint_t i; } constdata_t; // // -- BLIS object type definitions --------------------------------------------- // // Forward declarations for function pointer types struct obj_s; struct cntx_s; struct rntm_s; struct thrinfo_s; typedef void (*obj_pack_fn_t) ( struct obj_s* a, struct obj_s* ap, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) ( struct obj_s* a, struct obj_s* b, struct obj_s* c, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef struct obj_s { // Basic fields struct obj_s* root; dim_t off[2]; dim_t dim[2]; doff_t diag_off; objbits_t info; objbits_t info2; siz_t elem_size; void* buffer; inc_t rs; inc_t cs; inc_t is; // Bufferless scalar storage atom_t scalar; // Pack-related fields dim_t m_padded; // m dimension of matrix, including any padding dim_t n_padded; // n dimension of matrix, including any padding inc_t ps; // panel stride (distance to next panel) inc_t pd; // panel dimension (the "width" of a panel: // usually MR or NR) dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel // User-customizable fields obj_pack_fn_t pack_fn; void* pack_params; obj_ker_fn_t ker_fn; void* ker_params; } obj_t; // Pre-initializors. 
// Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Static initializer for an obj_t with zero dimensions.
#define BLIS_OBJECT_INITIALIZER \
{ \
    /* basic fields */ \
    .root        = NULL, \
    .off         = { 0, 0 }, \
    .dim         = { 0, 0 }, \
    .diag_off    = 0, \
    .info        = 0x0 | BLIS_BITVAL_DENSE | \
                         BLIS_BITVAL_GENERAL, \
    .info2       = 0x0, \
    .elem_size   = sizeof( float ), \
    .buffer      = NULL, \
    .rs          = 0, \
    .cs          = 0, \
    .is          = 1, \
    /* bufferless scalar storage */ \
    .scalar      = { 0.0, 0.0 }, \
    /* pack-related fields */ \
    .m_padded    = 0, \
    .n_padded    = 0, \
    .ps          = 0, \
    .pd          = 0, \
    .m_panel     = 0, \
    .n_panel     = 0, \
    /* user-customizable fields */ \
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

// Same as BLIS_OBJECT_INITIALIZER, except that both dimensions are 1.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    /* basic fields */ \
    .root        = NULL, \
    .off         = { 0, 0 }, \
    .dim         = { 1, 1 }, \
    .diag_off    = 0, \
    .info        = 0x0 | BLIS_BITVAL_DENSE | \
                         BLIS_BITVAL_GENERAL, \
    .info2       = 0x0, \
    .elem_size   = sizeof( float ), \
    .buffer      = NULL, \
    .rs          = 0, \
    .cs          = 0, \
    .is          = 1, \
    /* bufferless scalar storage */ \
    .scalar      = { 0.0, 0.0 }, \
    /* pack-related fields */ \
    .m_padded    = 0, \
    .n_padded    = 0, \
    .ps          = 0, \
    .pd          = 0, \
    .m_panel     = 0, \
    .n_panel     = 0, \
    /* user-customizable fields */ \
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
// Copy every field of a into b except the two dimensions.
//
// a: object to copy from.
// b: object to initialize; b->dim[0] and b->dim[1] are left untouched
//    because they will be overwritten afterwards.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    // Root pointer, offsets and diagonal offset (dimensions are skipped).
    b->root        = a->root;
    b->off[0]      = a->off[0];
    b->off[1]      = a->off[1];
    b->diag_off    = a->diag_off;

    // Info bitfields and element size.
    b->info        = a->info;
    b->info2       = a->info2;
    b->elem_size   = a->elem_size;

    // Buffer pointer and strides.
    b->buffer      = a->buffer;
    b->rs          = a->rs;
    b->cs          = a->cs;
    b->is          = a->is;

    // Bufferless scalar storage.
    b->scalar      = a->scalar;

    // No pack_mem entry is copied.
    // FGVZ: You should probably make sure this is right.

    // Pack-related fields.
    b->m_padded    = a->m_padded;
    b->n_padded    = a->n_padded;
    b->ps          = a->ps;
    b->pd          = a->pd;
    b->m_panel     = a->m_panel;
    b->n_panel     = a->n_panel;

    // User-customizable fields.
    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;
}

// Make b a full shallow copy of a: every field is copied, dimensions
// included.
//
// a: object to copy from.
// b: object to initialize.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    // A full copy is the subpart copy plus the two dimensions, so the field
    // list only has to be maintained in one place.
    bli_obj_init_subpart_from( a, b );

    b->dim[0] = a->dim[0];
    b->dim[1] = a->dim[1];
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/include/windows-generic/000077500000000000000000000000001427272030600223655ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/windows-generic/blis.h000066400000000000000000047071251427272030600235070ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). 
// begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_GENERIC // Enabled sub-configurations (config_list) #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 
0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. 
#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). #if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_lang_defs.h #ifndef BLIS_LANG_DEFS_H #define BLIS_LANG_DEFS_H // -- Undefine restrict for C++ and C89/90 -- #ifdef __cplusplus // Language is C++; define restrict as nothing. #ifndef restrict #define restrict #endif #elif __STDC_VERSION__ >= 199901L // Language is C99 (or later); do nothing since restrict is recognized. #else // Language is pre-C99; define restrict as nothing. 
#ifndef restrict #define restrict #endif #endif // -- Define typeof() operator if using non-GNU compiler -- #ifndef __GNUC__ #define typeof __typeof__ #else #ifndef typeof #define typeof __typeof__ #endif #endif // -- BLIS Thread Local Storage Keyword -- // __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support __thread, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) #define BLIS_THREAD_LOCAL __thread #else #define BLIS_THREAD_LOCAL #endif // -- BLIS constructor/destructor function attribute -- // __attribute__((constructor/destructor)) is supported by GCC only. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support this, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such non-main-stream compiler // for building BLIS is low. #if defined(__ICC) || defined(__INTEL_COMPILER) // ICC defines __GNUC__ but doesn't support this #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #elif defined(__clang__) // CLANG supports __attribute__, but its documentation doesn't // mention support for constructor/destructor. Compiling with // clang and testing shows that it does support. 
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. 
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. 
Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specified by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. #ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overridden by the main // program simply by providing a new definition.
However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // -- Common BLIS definitions -- // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// NOTE: The guard must test BLIS_BLAS_INT_TYPE_SIZE, the macro that actually
// carries the BLAS integer size. An undefined identifier evaluates to 0
// inside #if, so a misspelled macro name silently disables the guard.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
  #undef  BLIS_INT_TYPE_SIZE
  #define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested.
// Any value other than 32 or 64 falls back to the C "long int" types.
#if   BLIS_INT_TYPE_SIZE == 32
typedef           int32_t  gint_t;
typedef          uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
typedef           int64_t  gint_t;
typedef          uint64_t guint_t;
#else
typedef   signed long int  gint_t;
typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h.
#ifndef TRUE
  #define TRUE  true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
#define BLIS_SIZEOF_S      4  // sizeof(float)
#define BLIS_SIZEOF_D      8  // sizeof(double)
#define BLIS_SIZEOF_C      8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z      16 // sizeof(dcomplex)

// -- Complex types --

#ifdef BLIS_ENABLE_C99_COMPLEX

	#if __STDC_VERSION__ >= 199901L
		// The "complex" type specifier used below is provided by complex.h,
		// so the header must be named explicitly here.
		#include <complex.h> // skipped

		// Typedef official complex types to BLIS complex type names.
		typedef  float complex scomplex;
		typedef double complex dcomplex;
	#else
		#error "Configuration requested C99 complex types, but C99 does not appear to be supported."
	#endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

	// This cpp guard provides a temporary hack to allow libflame
	// interoperability with BLIS.
	#ifndef _DEFINED_SCOMPLEX
	#define _DEFINED_SCOMPLEX
	typedef struct scomplex
	{
		float  real;
		float  imag;
	} scomplex;
	#endif

	// This cpp guard provides a temporary hack to allow libflame
	// interoperability with BLIS.
	#ifndef _DEFINED_DCOMPLEX
	#define _DEFINED_DCOMPLEX
	typedef struct dcomplex
	{
		double real;
		double imag;
	} dcomplex;
	#endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any value other than 32 or 64 falls back to "long int".
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc.
function pointer types -- // Note: This type should be used in any situation where the address of a // *function* will be conveyed or stored prior to it being typecast back // to the correct function type. It does not need to be used when conveying // or storing the address of *data* (such as an array of float or double). //typedef void (*void_fp)( void ); typedef void* void_fp; // Typedef function pointer types for malloc() and free() substitutes. typedef void* (*malloc_ft)( size_t size ); typedef void (*free_ft) ( void* p ); // // -- BLIS info bit field offsets ---------------------------------------------- // // info #define BLIS_DATATYPE_SHIFT 0 #define BLIS_DOMAIN_SHIFT 0 #define BLIS_PRECISION_SHIFT 1 #define BLIS_CONJTRANS_SHIFT 3 #define BLIS_TRANS_SHIFT 3 #define BLIS_CONJ_SHIFT 4 #define BLIS_UPLO_SHIFT 5 #define BLIS_UPPER_SHIFT 5 #define BLIS_DIAG_SHIFT 6 #define BLIS_LOWER_SHIFT 7 #define BLIS_UNIT_DIAG_SHIFT 8 #define BLIS_INVERT_DIAG_SHIFT 9 #define BLIS_TARGET_DT_SHIFT 10 #define BLIS_TARGET_DOMAIN_SHIFT 10 #define BLIS_TARGET_PREC_SHIFT 11 #define BLIS_EXEC_DT_SHIFT 13 #define BLIS_EXEC_DOMAIN_SHIFT 13 #define BLIS_EXEC_PREC_SHIFT 14 #define BLIS_PACK_SCHEMA_SHIFT 16 #define BLIS_PACK_RC_SHIFT 16 #define BLIS_PACK_PANEL_SHIFT 17 #define BLIS_PACK_FORMAT_SHIFT 18 #define BLIS_PACK_SHIFT 22 #define BLIS_PACK_REV_IF_UPPER_SHIFT 23 #define BLIS_PACK_REV_IF_LOWER_SHIFT 24 #define BLIS_PACK_BUFFER_SHIFT 25 #define BLIS_STRUC_SHIFT 27 #define BLIS_COMP_DT_SHIFT 29 #define BLIS_COMP_DOMAIN_SHIFT 29 #define BLIS_COMP_PREC_SHIFT 30 // info2 #define BLIS_SCALAR_DT_SHIFT 0 #define BLIS_SCALAR_DOMAIN_SHIFT 0 #define BLIS_SCALAR_PREC_SHIFT 1 // // -- BLIS info bit field masks ------------------------------------------------ // // info #define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT ) #define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT ) #define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT ) #define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT ) 
#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define 
BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 
0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = 
BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } 
machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. #define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, 
BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, 
BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // - keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
} bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. // Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. 
// begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. 
mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. //num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. 
// Holds one value in every supported representation: single and double
// precision real (s, d), single and double precision complex (c, z), and
// integer (i).
typedef struct constdata_s
{
    float    s;
    double   d;
    scomplex c;
    dcomplex z;
    gint_t   i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the user-customizable packing function stored in
// obj_t.pack_fn. It receives two objects (a, ap) plus the context, runtime,
// control-tree node and thread info.
// NOTE(review): ap is presumably the packed destination for a -- confirm.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of the user-customizable kernel function stored in
// obj_t.ker_fn. It receives three objects (a, b, c) plus the context,
// runtime, control-tree node and thread info.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// The BLIS object. The initializer macros and copy functions that follow this
// struct name its members explicitly, so they must be revisited whenever a
// member is added, removed or renamed here.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t         off[2];
    dim_t         dim[2];
    doff_t        diag_off;

    objbits_t     info;
    objbits_t     info2;
    siz_t         elem_size;

    void*         buffer;
    inc_t         rs;
    inc_t         cs;
    inc_t         is;

    // Bufferless scalar storage
    atom_t        scalar;

    // Pack-related fields
    dim_t         m_padded; // m dimension of matrix, including any padding
    dim_t         n_padded; // n dimension of matrix, including any padding
    inc_t         ps;       // panel stride (distance to next panel)
    inc_t         pd;       // panel dimension (the "width" of a panel:
                            // usually MR or NR)
    dim_t         m_panel;  // m dimension of a "full" panel
    dim_t         n_panel;  // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void*         pack_params;
    obj_ker_fn_t  ker_fn;
    void*         ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Brace-enclosed designated initializer for an obj_t with 0 x 0 dimensions:
// NULL root/buffer/function pointers, DENSE | GENERAL info bits, zero row
// and column strides, and an imaginary stride of 1.
#define BLIS_OBJECT_INITIALIZER \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 0, 0 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn   = NULL, \
    .pack_params = NULL, \
    .ker_fn    = NULL, \
    .ker_params = NULL \
}

// Same as BLIS_OBJECT_INITIALIZER except that the dimensions are 1 x 1.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 1, 1 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn   = NULL, \
    .pack_params = NULL, \
    .ker_fn    = NULL, \
    .ker_params = NULL \
}

// Define these functions here since they must be updated if contents of
// obj_t changes.
// Make *b a complete shallow copy of *a.
//
// Every member of obj_t, from root through ker_params, is duplicated.
// Pointer members (root, buffer, pack_params, ker_params and the function
// pointers) are copied as pointers, so b ends up referring to the same
// underlying storage as a.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    // No member of obj_t is excluded from the copy, so assigning the whole
    // struct transfers exactly the same set of members as copying them one
    // at a time.
    *b = *a;
}

// Prepare *b as a sub-partition of *a: copy every member of *a into *b
// except the dimensions.
//
// b->dim[0] and b->dim[1] are intentionally neither read nor written here;
// the caller is expected to fill them in afterwards.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    const obj_t* src = a;
    obj_t*       dst = b;
    int          k;

    // Root pointer, offsets and diagonal offset. The dims are skipped since
    // they will be overwritten by the caller.
    dst->root = src->root;
    for ( k = 0; k < 2; ++k )
        dst->off[ k ] = src->off[ k ];
    dst->diag_off = src->diag_off;

    // Info bitfields and element size.
    dst->info      = src->info;
    dst->info2     = src->info2;
    dst->elem_size = src->elem_size;

    // Buffer address, strides and the internal scalar.
    dst->buffer = src->buffer;
    dst->rs     = src->rs;
    dst->cs     = src->cs;
    dst->is     = src->is;
    dst->scalar = src->scalar;

    // Pack-related members.
    dst->m_padded = src->m_padded;
    dst->n_padded = src->n_padded;
    dst->ps       = src->ps;
    dst->pd       = src->pd;
    dst->m_panel  = src->m_panel;
    dst->n_panel  = src->n_panel;

    // User-customizable members.
    dst->pack_fn     = src->pack_fn;
    dst->pack_params = src->pack_params;
    dst->ker_fn      = src->ker_fn;
    dst->ker_params  = src->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// Each pasting macro comes as a pair: NAME(...) forwards to NAME_(...), and
// only NAME_ applies the ## operator. The preprocessor does not expand an
// argument that is an operand of ##, so the forwarding layer is what allows
// an argument that is itself a macro to be expanded before it is pasted.

// PASTEMACn: paste "bli_", n type characters, and the operation name into a
// single identifier, e.g. PASTEMAC(d,axpyv) -> bli_daxpyv.
#define PASTEMAC0_(op)                                         bli_ ## op
#define PASTEMAC0(op)                                          PASTEMAC0_(op)

#define PASTEMAC_(ch,op)                                       bli_ ## ch  ## op
#define PASTEMAC(ch,op)                                        PASTEMAC_(ch,op)

#define PASTEMAC2_(ch1,ch2,op)                                 bli_ ## ch1 ## ch2 ## op
#define PASTEMAC2(ch1,ch2,op)                                  PASTEMAC2_(ch1,ch2,op)

#define PASTEMAC3_(ch1,ch2,ch3,op)                             bli_ ## ch1 ## ch2 ## ch3 ## op
#define PASTEMAC3(ch1,ch2,ch3,op)                              PASTEMAC3_(ch1,ch2,ch3,op)

#define PASTEMAC4_(ch1,ch2,ch3,ch4,op)                         bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## op
#define PASTEMAC4(ch1,ch2,ch3,ch4,op)                          PASTEMAC4_(ch1,ch2,ch3,ch4,op)

#define PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)                     bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## op
#define PASTEMAC5(ch1,ch2,ch3,ch4,ch5,op)                      PASTEMAC5_(ch1,ch2,ch3,ch4,ch5,op)

#define PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)                 bli_ ## ch1 ## ch2 ## ch3 ## ch4 ## ch5 ## ch6 ## op
#define PASTEMAC6(ch1,ch2,ch3,ch4,ch5,ch6,op)                  PASTEMAC6_(ch1,ch2,ch3,ch4,ch5,ch6,op)

// PASTEBLACHK: wrap the operation name as bla_<op>_check.
#define PASTEBLACHK_(op)                                       bla_ ## op ## _check
#define PASTEBLACHK(op)                                        PASTEBLACHK_(op)

// PASTECHn: same pasting as PASTEMACn but without the "bli_" prefix; the
// result is just the characters followed by the name.
#define PASTECH0_(op)                                          op
#define PASTECH0(op)                                           PASTECH0_(op)

#define PASTECH_(ch,op)                                        ch ## op
#define PASTECH(ch,op)                                         PASTECH_(ch,op)

#define PASTECH2_(ch1,ch2,op)                                  ch1 ## ch2 ## op
#define PASTECH2(ch1,ch2,op)                                   PASTECH2_(ch1,ch2,op)

#define PASTECH3_(ch1,ch2,ch3,op)                              ch1 ## ch2 ## ch3 ## op
#define PASTECH3(ch1,ch2,ch3,op)                               PASTECH3_(ch1,ch2,ch3,op)

// MKSTR turns its argument into a string literal exactly as written.
// STRINGIFY_INT adds a forwarding layer so that an argument that is a macro
// (for example an integer constant) is expanded first and its value, rather
// than its name, is what gets stringified.
#define MKSTR(s1)                                              #s1
#define STRINGIFY_INT( s )                                     MKSTR( s )

// Fortran-77 name-mangling macros.
// Each macro pastes zero to three type characters, the routine name, and a
// single trailing underscore, e.g. PASTEF77(d,gemm) -> dgemm_.
// NOTE: unlike the PASTEMAC/PASTECH families these apply ## directly, with no
// forwarding layer, so arguments that are macros are not expanded first.
#define PASTEF770(name)                                        name ## _
#define PASTEF77(ch1,name)                       ch1        ## name ## _
#define PASTEF772(ch1,ch2,name)                  ch1 ## ch2 ## name ## _
#define PASTEF773(ch1,ch2,ch3,name)       ch1 ## ch2 ## ch3 ## name ## _

// -- Include other groups of macros

// begin bli_genarray_macro_defs.h


#ifndef BLIS_GENARRAY_MACRO_DEFS_H
#define BLIS_GENARRAY_MACRO_DEFS_H


// -- Macros to generate function arrays ---------------------------------------

// Every table below lists its entries in the datatype order s, c, d, z along
// each dimension. Note that this is NOT the s, d, c, z order used by the
// INSERT_GENT* macros further down.
// NOTE(review): the s, c, d, z order presumably mirrors the numeric values
// of the num_t datatype ids used to index these arrays -- confirm.

// -- "Smart" one-operand macro --

// Defines a static array named <opname>_fpa, of element type tname, holding
// bli_?<opname> for each of the four datatypes; each entry is cast to tname.
#define GENARRAY_FPA(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname)  \
}

// -- "Smart" one-operand macro (with integer support) --

// As GENARRAY_FPA, with one extra trailing entry for the integer ('i')
// variant.
#define GENARRAY_FPA_I(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname), \
    ( tname )PASTEMAC(i,opname)  \
}

// -- "Smart" two-operand macro --

// Defines a static 4 x 4 array named <op>_fpa2 holding bli_??<op> for every
// pair of datatypes; the first index selects the first type character.
#define GENARRAY_FPA2(tname,op) \
\
static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \
    { ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \
    { ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \
    { ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) }  \
}

// -- "Smart" two-operand macro --



// -- One-operand macro --

// The macros from here on expand only to "<arrayname>[dims] = { ... }"; the
// element type and any storage class must be written by the caller in front
// of the macro invocation. Entries are not cast.
#define GENARRAY(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op)  \
}

// As GENARRAY, with one extra trailing entry for the integer ('i') variant.
#define GENARRAY_I(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES+1] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op), \
    PASTEMAC(i,op)  \
}

// -- One-operand macro (with custom prefix) --

// As GENARRAY, but each entry is <prefix><ch><op> rather than bli_<ch><op>.
#define GENARRAY_PREF(arrayname,prefix,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTECH2(prefix,s,op), \
    PASTECH2(prefix,c,op), \
    PASTECH2(prefix,d,op), \
    PASTECH2(prefix,z,op)  \
}



// -- Two-operand macros --

// _ALL: every one of the 4 x 4 type combinations is populated.
#define GENARRAY2_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
    { PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// _EXT: only combinations within the same precision are populated (s with c,
// d with z); combinations that mix single and double precision are NULL.
#define GENARRAY2_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL,              NULL,             }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { NULL,              NULL,              PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

// _MIN: only the homogeneous combinations on the diagonal are populated;
// every other entry is NULL.
#define GENARRAY2_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), NULL,              NULL,              NULL,             }, \
    { NULL,              PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), NULL,             }, \
    { NULL,              NULL,              NULL,              PASTEMAC2(z,z,op) }  \
}


// -- Three-operand macros --

// Three-dimensional analogues of the two-operand tables above. The first
// index selects the first type character, the second index the second, and
// the third index the third.

// _ALL: all 4 x 4 x 4 combinations are populated.
#define GENARRAY3_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \
    { PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \
    { PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \
    { PASTEMAC3(c,d,s,op), PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \
    { PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \
    { PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \
    { PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \
    { PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \
    { PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

// _EXT: populated only where all three types share one precision (all drawn
// from {s,c} or all drawn from {d,z}); everything else is NULL.
#define GENARRAY3_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

// _MIN: populated only where all three types are identical (sss, ccc, ddd,
// zzz); everything else is NULL.
#define GENARRAY3_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                PASTEMAC3(z,z,z,op) }  \
    } \
}


#endif
// end bli_genarray_macro_defs.h
// begin bli_gentdef_macro_defs.h


#ifndef BLIS_GENTDEF_MACRO_DEFS_H
#define BLIS_GENTDEF_MACRO_DEFS_H

//
// -- MACROS TO INSERT TYPEDEF-GENERATING MACROS -------------------------------
//

// The INSERT_GENTDEF* macros invoke GENTDEF / GENTDEFR repeatedly; those two
// macros are not defined in this section.
// NOTE(review): presumably the including file defines GENTDEF / GENTDEFR
// before invoking these -- confirm.

// -- function typedef macro (both typed and void) --

// Invokes GENTDEF nine times: once per datatype with its concrete C type and
// the _ft suffix, once per datatype with void and the _vft suffix, and once
// with void and an empty type character.
#define INSERT_GENTDEF( opname ) \
\
GENTDEF( float,    s, opname, _ft ) \
GENTDEF( double,   d, opname, _ft ) \
GENTDEF( scomplex, c, opname, _ft ) \
GENTDEF( dcomplex, z, opname, _ft ) \
\
GENTDEF( void,     s, opname, _vft ) \
GENTDEF( void,     d, opname, _vft ) \
GENTDEF( void,     c, opname, _vft ) \
GENTDEF( void,     z, opname, _vft ) \
\
GENTDEF( void,      , opname, _vft )

// -- function typedef macro (both typed and void) with real projection --

// As INSERT_GENTDEF, but each GENTDEFR invocation also receives the real
// projection of the type (float for s and c, double for d and z) and the
// matching real type character.
#define INSERT_GENTDEFR( opname ) \
\
GENTDEFR( float,    float,  s, s, opname, _ft ) \
GENTDEFR( double,   double, d, d, opname, _ft ) \
GENTDEFR( scomplex, float,  c, s, opname, _ft ) \
GENTDEFR( dcomplex, double, z, d, opname, _ft ) \
\
GENTDEFR( void,     void,   s, s, opname, _vft ) \
GENTDEFR( void,     void,   d, d, opname, _vft ) \
GENTDEFR( void,     void,   c, s, opname, _vft ) \
GENTDEFR( void,     void,   z, d, opname, _vft ) \
\
GENTDEFR( void,     void,    ,  , opname, _vft )


#endif
// end bli_gentdef_macro_defs.h
// begin bli_gentfunc_macro_defs.h


#ifndef BLIS_GENTFUNC_MACRO_DEFS_H
#define BLIS_GENTFUNC_MACRO_DEFS_H

//
// -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------
//



// -- Macros for generating BLAS routines --------------------------------------


// -- Basic one-operand macro --
#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \ \ GENTFUNC( float, s, blasname, blisname ) \ GENTFUNC( double, d, blasname, blisname ) \ GENTFUNC( scomplex, c, blasname, blisname ) \ GENTFUNC( dcomplex, z, blasname, blisname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \ \ GENTFUNCRO( float, s, blasname, blisname ) \ GENTFUNCRO( double, d, blasname, blisname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \ \ GENTFUNCCO( scomplex, float, c, s, blasname, blisname ) \ GENTFUNCCO( dcomplex, double, z, d, blasname, blisname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \ \ GENTFUNCDOT( float, s, , BLIS_NO_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( double, d, , BLIS_NO_CONJUGATE, blasname, blisname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \ \ GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \ \ INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \ INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \ \ GENTFUNCR( float, float, s, s, rblasname, blisname ) \ GENTFUNCR( double, double, d, d, rblasname, blisname ) \ GENTFUNCR( scomplex, float, c, s, cblasname, blisname ) \ GENTFUNCR( dcomplex, double, z, d, cblasname, blisname ) // -- Alternate 
// two-operand macro (one char for complex, one for real proj) --
// The real rows pass an empty fourth argument; the complex rows pass the
// type code of the matching real type (s or d).

#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
\
GENTFUNCR2( float, float, s, , blasname, blisname ) \
GENTFUNCR2( double, double, d, , blasname, blisname ) \
GENTFUNCR2( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )


// -- Extended two-operand macro (used only for scal) --
// Four same-type rows with an empty fourth argument, followed by two rows
// pairing a complex first type with its real type (scomplex/float,
// dcomplex/double).

#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
\
GENTFUNCSCAL( float, float, s, , blasname, blisname ) \
GENTFUNCSCAL( double, double, d, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \
GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname )




// -- Macros for functions with one operand ------------------------------------
//
// The _BASIC0, _BASIC, _BASIC2, _BASIC3 and _BASIC4 variants generate the
// same four rows (float/s, double/d, scomplex/c, dcomplex/z) and differ
// only in how many extra caller arguments (none to four) are forwarded
// after tfuncname.
// NOTE(review): GENTFUNC and GENTFUNCR are not defined in this section;
// presumably each user defines them before invoking an INSERT_* macro --
// confirm.


// -- Basic one-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
\
GENTFUNC( float, s, tfuncname ) \
GENTFUNC( double, d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
\
GENTFUNC( float, s, tfuncname, varname ) \
GENTFUNC( double, d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )



// -- Basic one-operand with real projection --
// Each row passes the operand type plus its real counterpart (float for
// float and scomplex, double for double and dcomplex) with both type codes.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
\
GENTFUNCR( float, float, s, s, tfuncname ) \
GENTFUNCR( double, double, d, d, tfuncname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname ) \
GENTFUNCR( double, double, d, d, tfuncname, varname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
// arguments) --

#define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )



// -- Basic one-operand macro with real domain only --
// Only the float and double rows are generated.
// The last row of each definition carries no line continuation, so the
// definition ends on that row and does not depend on the next line being
// blank.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \
\
GENTFUNCRO( float, s, tfuncname ) \
GENTFUNCRO( double, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \
\
GENTFUNCRO( float, s, tfuncname, varname ) \
GENTFUNCRO( double, d, tfuncname, varname )



// -- Basic one-operand macro with complex domain only and real projection --
// Only the scomplex and dcomplex rows are generated; each row also passes
// the matching real type and type code (float/s, double/d).
// NOTE(review): GENTFUNCR, GENTFUNCRO, GENTFUNCCO and GENTFUNC are not
// defined in this section; presumably each user defines them before
// invoking an INSERT_* macro -- confirm.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )



// -- Basic one-operand macro with integer instance --
// The four floating-point rows plus a fifth gint_t/i row.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \
\
GENTFUNC( float, s, tfuncname ) \
GENTFUNC( double, d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname ) \
GENTFUNC( gint_t, i, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \
\
GENTFUNC( float, s, tfuncname, varname ) \
GENTFUNC( double, d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname ) \
GENTFUNC( gint_t, i, tfuncname, varname )



// -- Basic one-operand with integer projection --
// Every row pairs the operand type with gint_t and type code i.
// NOTE(review): the GENTFUNC* generator macros used below are not defined
// in this section; presumably each user defines them before invoking an
// INSERT_* macro -- confirm.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCI_BASIC0( tfuncname ) \
\
GENTFUNCI( float, gint_t, s, i, tfuncname ) \
GENTFUNCI( double, gint_t, d, i, tfuncname ) \
GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \
GENTFUNCI( dcomplex, gint_t, z, i, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \
\
GENTFUNCI( float, gint_t, s, i, tfuncname, varname ) \
GENTFUNCI( double, gint_t, d, i, tfuncname, varname ) \
GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \
GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname )



// -- Basic one-operand with real and integer projections --
// Every row passes the operand type, its real counterpart and gint_t.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \
\
GENTFUNCRI( float, float, gint_t, s, s, i, tfuncname ) \
GENTFUNCRI( double, double, gint_t, d, d, i, tfuncname ) \
GENTFUNCRI( scomplex, float, gint_t, c, s, i, tfuncname ) \
GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname )




// -- Macros for functions with two primary operands ---------------------------
//
// Row sets used by the two-operand families below:
//   BASIC  - both operands have the same type (4 rows).
//   MIX_D  - same precision, different domain (4 rows).
//   MIX_P  - different precision, any domain (8 rows).
//   MIX_DP - every pair of different types (12 rows).
// NOTE(review): the no-auxiliary-argument form of the MIX_DP family is
// spelled MIXDP0 (no underscore before DP), unlike MIX_D0 and MIX_P0. The
// name is kept as-is because callers reference it.


// -- Basic two-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_BASIC0( tfuncname ) \
\
GENTFUNC2( float, float, s, s, tfuncname ) \
GENTFUNC2( double, double, d, d, tfuncname ) \
GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \
GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \
\
GENTFUNC2( float, float, s, s, tfuncname, varname ) \
GENTFUNC2( double, double, d, d, tfuncname, varname ) \
GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \
GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname )



// -- Mixed domain two-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \
\
GENTFUNC2( float, scomplex, s, c, tfuncname ) \
GENTFUNC2( scomplex, float, c, s, tfuncname ) \
\
GENTFUNC2( double, dcomplex, d, z, tfuncname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \
\
GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \
GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \
\
GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname, varname )



// -- Mixed precision two-operand macro --
// The last row of each definition carries no line continuation, so the
// definition ends on that row and does not depend on the next line being
// blank.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \
\
GENTFUNC2( float, double, s, d, tfuncname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname ) \
\
GENTFUNC2( double, float, d, s, tfuncname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname ) \
\
GENTFUNC2( scomplex, double, c, d, tfuncname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \
\
GENTFUNC2( float, double, s, d, tfuncname, varname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTFUNC2( double, float, d, s, tfuncname, varname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \
\
GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )



// -- Mixed domain/precision (all) two-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2_MIXDP0( tfuncname ) \
\
GENTFUNC2( float, double, s, d, tfuncname ) \
GENTFUNC2( float, scomplex, s, c, tfuncname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname ) \
\
GENTFUNC2( double, float, d, s, tfuncname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname ) \
GENTFUNC2( double, dcomplex, d, z, tfuncname ) \
\
GENTFUNC2( scomplex, float, c, s, tfuncname ) \
GENTFUNC2( scomplex, double, c, d, tfuncname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \
\
GENTFUNC2( float, double, s, d, tfuncname, varname ) \
GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \
GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTFUNC2( double, float, d, s, tfuncname, varname ) \
GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \
GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \
\
GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \
GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \
GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \
GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname )



// -- Basic two-operand with real projection of second operand --
// In every GENTFUNC2R row the third type (and third type code) is the real
// counterpart of the second operand type.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \
\
GENTFUNC2R( float, float, float, s, s, s, tfuncname ) \
GENTFUNC2R( double, double, double, d, d, d, tfuncname ) \
GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname ) \
GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \
\
GENTFUNC2R( float, float, float, s, s, s, tfuncname, varname ) \
GENTFUNC2R( double, double, double, d, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )



// -- Mixed domain two-operand with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \
\
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \
\
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \
\
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname )



// -- Mixed precision two-operand with real projection of second operand --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \
\
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \
\
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname )



// -- Mixed domain/precision (all) two-operand macro with real projection of second operand --
// The MIXDP0 definition ends on its last row, with no trailing line
// continuation.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \
\
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \
\
GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \
\
GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \
GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
\
GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
\
GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) \
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname )

// The row above is the last one of the macro definition that precedes it.
// It carries no line continuation, so that definition ends there and does
// not depend on the next line being blank.




// -- Macros for functions with three primary operands -------------------------
//
// Row sets used by the three-operand families below:
//   BASIC - all three operands have the same type (4 rows).
//   MIX_D - all operands share a precision but not a domain (12 rows).
//   MIX_P - the operands do not all share a precision (48 rows).
// The 0 / (none) / 2 suffixes forward zero, one or two extra caller
// arguments after tfuncname.
// NOTE(review): GENTFUNC3 is not defined in this section; presumably each
// user defines it before invoking an INSERT_* macro -- confirm.


// -- Basic three-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_BASIC0( tfuncname ) \
\
GENTFUNC3( float, float, float, s, s, s, tfuncname ) \
GENTFUNC3( double, double, double, d, d, d, tfuncname ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \
\
GENTFUNC3( float, float, float, s, s, s, tfuncname, varname ) \
GENTFUNC3( double, double, double, d, d, d, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, float, float, s, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, double, double, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 )



// -- Mixed domain three-operand macro --

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \
\
GENTFUNC3( float, float, scomplex, s, s, c, tfuncname ) \
GENTFUNC3( float, scomplex, float, s, c, s, tfuncname ) \
GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname ) \
\
GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname ) \
GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname ) \
GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname ) \
\
GENTFUNC3( scomplex, float, float, c, s, s, tfuncname ) \
GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname ) \
GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname ) \
\
GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname ) \
GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \
\
GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname ) \
GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname ) \
\
GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname ) \
GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname ) \
GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname1, varname2 )



// -- Mixed precision three-operand macro --
// Rows are grouped by first operand type, then by second operand type.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \
\
GENTFUNC3( float, float, double, s, s, d, tfuncname ) \
GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname ) \
\
GENTFUNC3( float, double, float, s, d, s, tfuncname ) \
GENTFUNC3( float, double, double, s, d, d, tfuncname ) \
GENTFUNC3( float, double, scomplex, s, d, c, tfuncname ) \
GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname ) \
\
GENTFUNC3( float, scomplex, double, s, c, d, tfuncname ) \
GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname ) \
\
GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname ) \
GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname ) \
GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname ) \
GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname ) \
\
\
GENTFUNC3( double, float, float, d, s, s, tfuncname ) \
GENTFUNC3( double, float, double, d, s, d, tfuncname ) \
GENTFUNC3( double, float, scomplex, d, s, c, tfuncname ) \
GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname ) \
\
GENTFUNC3( double, double, float, d, d, s, tfuncname ) \
GENTFUNC3( double, double, scomplex, d, d, c, tfuncname ) \
\
GENTFUNC3( double, scomplex, float, d, c, s, tfuncname ) \
GENTFUNC3( double, scomplex, double, d, c, d, tfuncname ) \
GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname ) \
GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname ) \
\
GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname ) \
GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname ) \
\
\
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname ) \
GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname ) \
\
GENTFUNC3( scomplex, double, float, c, d, s, tfuncname ) \
GENTFUNC3( scomplex, double, double, c, d, d, tfuncname ) \
GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname ) \
GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname ) \
\
GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \
\
GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \
\
\
GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname ) \
GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname ) \
GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname ) \
GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname ) \
\
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \
\
GENTFUNC3( float, float, double, s, s, d, tfuncname, varname ) \
GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname ) \
\
GENTFUNC3( float, double, float, s, d, s, tfuncname, varname ) \
GENTFUNC3( float, double, double, s, d, d, tfuncname, varname ) \
GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname ) \
GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname ) \
\
GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname ) \
GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname ) \
\
GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname ) \
GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname ) \
GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname ) \
GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname ) \
\
\
GENTFUNC3( double, float, float, d, s, s, tfuncname, varname ) \
GENTFUNC3( double, float, double, d, s, d, tfuncname, varname ) \
GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname ) \
GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname ) \
\
GENTFUNC3( double, double, float, d, d, s, tfuncname, varname ) \
GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname ) \
\
GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname ) \
GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname ) \
GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname ) \
GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname ) \
\
GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname ) \
GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname ) \
\
\
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname ) \
GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname ) \
GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname ) \
GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname ) \
GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \
\
GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \
\
\
GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, float, double, s, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, double, float, s, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, double, double, s, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( double, float, float, d, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, float, double, d, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, double, float, d, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 )



// -- Basic three-operand with union of operands 1 and 2 --
// Each GENTFUNC3U12 row passes four type names followed by four type codes:
// the three operand types, then a fourth "union" type. In the rows below
// the fourth type is complex whenever the first or second operand type is
// complex, and otherwise equals the first operand type.
// NOTE(review): GENTFUNC3U12 is not defined in this section; presumably
// each user defines it before invoking an INSERT_* macro -- confirm.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname1, varname2 )



// -- Mixed domain three-operand with union of operands 1 and 2 --
// Same twelve operand-type combinations as the mixed-domain GENTFUNC3
// tables, each extended with the fourth union type and type code.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, 
d, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand with union of operands 1 and 2 -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \ GENTFUNC3U12( float, 
double, dcomplex, double, s, d, z, d, tfuncname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \ GENTFUNC3U12( scomplex, scomplex, 
dcomplex, scomplex, c, c, z, c, tfuncname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \ GENTFUNC3U12( float, scomplex, 
dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \ GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, 
tfuncname, varname ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, 
double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( 
scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, scomplex, dcomplex, 
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 ) #endif // end bli_gentfunc_macro_defs.h // begin bli_gentprot_macro_defs.h #ifndef BLIS_GENTPROT_MACRO_DEFS_H #define BLIS_GENTPROT_MACRO_DEFS_H // // -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS ----------------------------- // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- #define INSERT_GENTPROT_BLAS( blasname ) \ \ GENTPROT( float, s, blasname ) \ GENTPROT( double, d, blasname ) \ GENTPROT( scomplex, c, blasname ) \ GENTPROT( dcomplex, z, blasname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTPROTRO_BLAS( blasname ) \ \ GENTPROTRO( float, s, blasname ) \ GENTPROTRO( double, d, blasname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTPROTCO_BLAS( blasname ) \ \ GENTPROTCO( scomplex, float, c, s, blasname ) \ GENTPROTCO( dcomplex, double, z, d, blasname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTR_BLAS( blasname ) \ \ GENTPROTDOT( float, s, , blasname ) \ GENTPROTDOT( double, d, , blasname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTC_BLAS( blasname ) \ \ GENTPROTDOT( scomplex, c, c, blasname ) \ GENTPROTDOT( scomplex, c, u, blasname ) \ GENTPROTDOT( dcomplex, z, c, blasname ) \ GENTPROTDOT( dcomplex, z, u, blasname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTPROTDOT_BLAS( blasname ) \ \ INSERT_GENTPROTDOTR_BLAS( blasname ) \ INSERT_GENTPROTDOTC_BLAS( blasname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTPROTR_BLAS( rblasname, 
cblasname ) \ \ GENTPROTR( float, float, s, s, rblasname ) \ GENTPROTR( double, double, d, d, rblasname ) \ GENTPROTR( scomplex, float, c, s, cblasname ) \ GENTPROTR( dcomplex, double, z, d, cblasname ) // -- Alternate two-operand macro (one char for complex, one for real proj) -- #define INSERT_GENTPROTR2_BLAS( blasname ) \ \ GENTPROTR2( float, float, , s, blasname ) \ GENTPROTR2( double, double, , d, blasname ) \ GENTPROTR2( scomplex, float, c, s, blasname ) \ GENTPROTR2( dcomplex, double, z, d, blasname ) // -- Extended two-operand macro (used only for scal) -- #define INSERT_GENTPROTSCAL_BLAS( blasname ) \ \ GENTPROTSCAL( float, float, , s, blasname ) \ GENTPROTSCAL( double, double, , d, blasname ) \ GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \ GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \ GENTPROTSCAL( float, scomplex, s, c, blasname ) \ GENTPROTSCAL( double, dcomplex, d, z, blasname ) // -- Macros for functions with one operand ------------------------------------ // -- Basic one-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0( tfuncname ) \ \ GENTPROT( float, s, tfuncname ) \ GENTPROT( double, d, tfuncname ) \ GENTPROT( scomplex, c, tfuncname ) \ GENTPROT( dcomplex, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2 ) \ GENTPROT( double, d, tfuncname, varname1, varname2 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROT( float, s, tfuncname, 
varname1, varname2, varname3 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand with real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC0( tfuncname ) \ \ GENTPROTR( float, float, s, s, tfuncname ) \ GENTPROTR( double, double, d, d, tfuncname ) \ GENTPROTR( scomplex, float, c, s, tfuncname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname ) \ GENTPROTR( double, double, d, d, tfuncname, varname ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 
) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- (four auxiliary arguments) -- #define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC0( tfuncname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0_I( funcname ) \ \ GENTPROT( float, s, funcname ) \ GENTPROT( double, d, funcname ) \ GENTPROT( scomplex, c, funcname ) \ GENTPROT( dcomplex, z, funcname ) \ GENTPROT( gint_t, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) \ GENTPROT( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTPROTI_BASIC0( funcname ) \ \ GENTPROTI( float, gint_t, s, i, funcname ) \ GENTPROTI( double, gint_t, d, i, funcname ) \ GENTPROTI( scomplex, gint_t, c, i, funcname ) \ GENTPROTI( dcomplex, gint_t, z, i, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \ \ GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \ GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \ GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROTRI_BASIC( funcname ) \ \ GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \ GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \ GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \ GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_BASIC0( funcname ) \ \ GENTPROT2( float, float, s, s, funcname ) \ GENTPROT2( double, double, d, d, funcname ) \ GENTPROT2( scomplex, scomplex, c, c, funcname ) \ GENTPROT2( dcomplex, dcomplex, z, z, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \ \ GENTPROT2( float, float, s, s, tfuncname, varname ) \ GENTPROT2( double, double, d, d, tfuncname, varname ) \ GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_D0( funcname ) \ \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( scomplex, float, c, s, funcname ) \ \ GENTPROT2( double, dcomplex, d, z, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_D( 
tfuncname, varname ) \ \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) // -- Mixed precision two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIX_P0( funcname ) \ \ GENTPROT2( float, double, s, d, funcname ) \ GENTPROT2( float, dcomplex, s, z, funcname ) \ \ GENTPROT2( double, float, d, s, funcname ) \ GENTPROT2( double, scomplex, d, c, funcname ) \ \ GENTPROT2( scomplex, double, c, d, funcname ) \ GENTPROT2( scomplex, dcomplex, c, z, funcname ) \ \ GENTPROT2( dcomplex, float, z, s, funcname ) \ GENTPROT2( dcomplex, scomplex, z, c, funcname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) \ // -- Mixed domain/precision (all) two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2_MIXDP0( funcname ) \ \ GENTPROT2( float, double, s, d, funcname ) \ GENTPROT2( float, scomplex, s, c, funcname ) \ GENTPROT2( float, dcomplex, s, z, funcname ) \ \ GENTPROT2( double, float, d, s, funcname ) \ GENTPROT2( double, scomplex, d, c, funcname ) \ GENTPROT2( double, dcomplex, d, z, funcname ) \ \ GENTPROT2( scomplex, float, c, s, funcname ) \ GENTPROT2( scomplex, double, c, d, funcname ) \ GENTPROT2( scomplex, dcomplex, c, z, funcname ) \ \ GENTPROT2( dcomplex, float, z, s, funcname ) \ GENTPROT2( dcomplex, double, z, d, funcname ) \ GENTPROT2( 
dcomplex, scomplex, z, c, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_BASIC0( funcname ) \ \ GENTPROT2R( float, float, float, s, s, s, funcname ) \ GENTPROT2R( double, double, double, d, d, d, funcname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \ \ GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) 
--

// Invokes GENTPROT2R once per same-precision real/complex pairing listed
// below, forwarding tfuncname and one auxiliary argument (varname) to each.
#define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \
\
GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \
GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \
\
GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname )


// -- Mixed precision two-operand with real projection of first operand --

// -- (no auxiliary arguments) --

// Invokes GENTPROT2R once per pairing in which the two operands differ in
// precision. The third type/char in each row is the real-valued type with the
// first operand's precision.
#define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname )

// -- (one auxiliary argument) --

// Same rows as INSERT_GENTPROT2R_MIX_P0, with varname appended to each
// GENTPROT2R invocation.
#define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \
\
GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \
GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \
\
GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \
GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \
\
GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \
GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \
\
GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \
GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname )



// -- Macros for functions with three primary operands -------------------------


// -- Basic three-operand macro --

// Invokes GENTPROT3 once per datatype with all three operands sharing the
// same type (s, d, c, z).
#define INSERT_GENTPROT3_BASIC( funcname ) \
\
GENTPROT3( float, float, float, s, s, s, funcname ) \
GENTPROT3( double, double, double, d, d, d, funcname ) \
GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \
GENTPROT3( dcomplex, dcomplex, dcomplex, z, z, z, funcname )


// -- Mixed domain three-operand macro --

// Invokes GENTPROT3 for every same-precision combination in which the three
// operands do not all share one domain (six rows per precision).
#define INSERT_GENTPROT3_MIX_D( funcname ) \
\
GENTPROT3( float, float, scomplex, s, s, c, funcname ) \
GENTPROT3( float, scomplex, float, s, c, s, funcname ) \
GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \
\
GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \
GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \
GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \
\
GENTPROT3( scomplex, float, float, c, s, s, funcname ) \
GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \
GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \
\
GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \
GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \
GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname )


// -- Mixed precision three-operand macro --

// Invokes GENTPROT3 for the combinations in which at least two operands
// differ in precision. Rows are grouped by the first operand's type, then by
// the second operand's type.
// NOTE: The final row ends with a line continuation, so the blank line that
// follows the table is part of the macro definition and must be kept.
#define INSERT_GENTPROT3_MIX_P( funcname ) \
\
GENTPROT3( float, float, double, s, s, d, funcname ) \
GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \
\
GENTPROT3( float, double, float, s, d, s, funcname ) \
GENTPROT3( float, double, double, s, d, d, funcname ) \
GENTPROT3( float, double, scomplex, s, d, c, funcname ) \
GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \
\
GENTPROT3( float, scomplex, double, s, c, d, funcname ) \
GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \
\
GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \
GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \
GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \
GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \
\
\
GENTPROT3( double, float, float, d, s, s, funcname ) \
GENTPROT3( double, float, double, d, s, d, funcname ) \
GENTPROT3( double, float, scomplex, d, s, c, funcname ) \
GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \
\
GENTPROT3( double, double, float, d, d, s, funcname ) \
GENTPROT3( double, double, scomplex, d, d, c, funcname ) \
\
GENTPROT3( double, scomplex, float, d, c, s, funcname ) \
GENTPROT3( double, scomplex, double, d, c, d, funcname ) \
GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \
GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \
\
GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \
GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \
\
\
GENTPROT3( scomplex, float, double, c, s, d, funcname ) \
GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \
\
GENTPROT3( scomplex, double, float, c, d, s, funcname ) \
GENTPROT3( scomplex, double, double, c, d, d, funcname ) \
GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \
GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \
\
GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \
GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \
\
GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \
GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \
GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \
GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \
\
\
GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \
GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \
GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \
GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \
\
GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \
GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \
\
GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \
GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \
GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \
GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \
\
GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \
GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname ) \



// -- Basic three-operand with union of operands 1 and 2 --

// Invokes GENTPROT3U12 once per datatype. Each row carries a fourth
// type/char pair in addition to the three operand types.
#define INSERT_GENTPROT3U12_BASIC( funcname ) \
\
GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \
GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \
GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname )


// -- Mixed domain three-operand with union of operands 1 and 2 --

// Invokes GENTPROT3U12 for every same-precision, mixed-domain combination.
// In each row the fourth type is complex whenever either of the first two
// operand types is complex, and real otherwise.
#define INSERT_GENTPROT3U12_MIX_D( funcname ) \
\
GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \
GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \
GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \
\
GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \
GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \
GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \
\
GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \
GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \
GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \
\
GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \
GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \
GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname )


// -- Mixed precision three-operand with union of operands 1 and 2 --

// Invokes GENTPROT3U12 for the mixed-precision combinations. In each row the
// fourth type is double precision if either of the first two operand types is
// double precision, and complex if either of them is complex.
#define INSERT_GENTPROT3U12_MIX_P( funcname ) \
\
GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \
GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \
\
GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \
GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \
GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \
GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \
\
GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \
GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \
\
GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, z, funcname ) \
GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \
GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \
GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \
\
\
GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \
GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \
GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \
GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \
\
GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \
GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \
\
GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \
GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \
GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \
GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \
\
GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \
GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \
\
\
GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \
GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \
\
GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \
GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \
GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \
GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \
\
GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \
GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \
\
GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \
GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \
GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \
GENTPROT3U12(
scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \
\
\
GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \
GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \
GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \
GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \
\
GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \
GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \
\
GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \
GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \
GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \
GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \
\
GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \
GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname )


#endif
// end bli_gentprot_macro_defs.h
// begin bli_misc_macro_defs.h


#ifndef BLIS_MISC_MACRO_DEFS_H
#define BLIS_MISC_MACRO_DEFS_H

// -- Miscellaneous macros --

// min, max, abs
// NOTE: These must remain macros since we don't know the types of a and b.
// NOTE: Each argument appears more than once in the expansion, so an argument
// with side effects (e.g. i++) may be evaluated more than once.

#define bli_min( a, b ) ( (a) < (b) ? (a) : (b) )
#define bli_max( a, b ) ( (a) > (b) ? (a) : (b) )
#define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) )

// fmin, fmax, fabs
// NOTE: These must remain macros since we don't know the types of a and b.
// bli_fmin and bli_fmax simply forward to bli_min and bli_max; bli_fabs
// differs from bli_abs only in comparing against the constant 0.0.

#define bli_fmin( a, b ) bli_min( a, b )
#define bli_fmax( a, b ) bli_max( a, b )
#define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) )

// fminabs, fmaxabs
// NOTE: These must remain macros since we don't know the types of a and b.
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \
	GEMM_UKR_SETUP_CT_POST(ch);

// -- Flush macros --

// If the temporary tile was used, accumulates it back into the caller's
// matrix via the xpbys_mxn operation, passing the saved _beta together with
// the saved pointer and strides of C.
#define GEMM_UKR_FLUSH_CT(ch) \
\
	\
	if ( _use_ct ) \
	{ \
		PASTEMAC(ch,xpbys_mxn) \
		( \
		  m, n, \
		  _ct, _rs_ct, _cs_ct, \
		  _beta, \
		  _c, _rs_c, _cs_c \
		); \
	} \


//
// Macros for edge-case handling within gemmtrsm microkernels.
//

// -- Setup helper macros --

// Saves the caller's c11, rs_c and cs_c into underscore-prefixed locals and
// declares a stack tile _ct with strides _rs_ct/_cs_ct chosen by row_major.
// Expects c11, rs_c and cs_c to be in scope at the expansion site.
// NOTE: The element count of _ct is computed with the C element type
// (PASTEMAC(ch,ctype)). PASTEMAC(ch,type) selects the bli_?type macros, which
// this file defines as datatype id constants such as ( BLIS_FLOAT ); taking
// sizeof of one of those gives the size of the constant, not of an element,
// and makes the buffer larger than BLIS_STACK_BUF_MAX_SIZE bytes whenever the
// element type is wider than the constant.
#define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \
\
	PASTEMAC(ch,ctype)* restrict _c = c11; \
	const inc_t _rs_c = rs_c; \
	const inc_t _cs_c = cs_c; \
	PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,ctype) ) ] \
	__attribute__((aligned(alignment))); \
	const inc_t _rs_ct = row_major ? nr : 1; \
	const inc_t _cs_ct = row_major ? 1 : mr;

// When _use_ct is set, redirects c11, rs_c and cs_c to the temporary tile.
#define GEMMTRSM_UKR_SETUP_CT_POST(ch) \
\
	if ( _use_ct ) \
	{ \
		c11 = _ct; \
		rs_c = _rs_ct; \
		cs_c = _cs_ct; \
	}

// -- Setup macros --

// Uses the temporary tile when C lacks unit stride in the preferred
// direction (cs_c for row_major, rs_c otherwise) or when m x n is not exactly
// mr x nr.
#define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \
\
	\
	GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \
	const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \
	                     m != mr || n != nr; \
	GEMMTRSM_UKR_SETUP_CT_POST(ch);

// Uses the temporary tile when neither stride of C is one, or when m x n is
// not exactly mr x nr.
#define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \
\
	\
	GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \
	const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \
	                     m != mr || n != nr; \
	GEMMTRSM_UKR_SETUP_CT_POST(ch);

// Uses the temporary tile only when m x n is not exactly mr x nr.
#define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \
\
	\
	GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \
	const bool _use_ct = ( m != mr || n != nr ); \
	GEMMTRSM_UKR_SETUP_CT_POST(ch);

// As GEMMTRSM_UKR_SETUP_CT, but additionally falls back to the temporary
// tile when the address of C or its leading stride in bytes is not a multiple
// of alignment.
#define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \
\
	\
	GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \
	const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \
	                     m != mr || n != nr || \
	                     ( (uintptr_t)_c % alignment ) || \
	                     ( ( ( row_major ?
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \
	GEMMTRSM_UKR_SETUP_CT_POST(ch);

// -- Flush macros --

// If the temporary tile was used, copies it back into the caller's matrix
// via the copys_mxn operation, using the saved pointer and strides of C.
// NOTE: The closing brace is followed by a line continuation, so the blank
// line after it is part of the macro and must precede the #endif.
#define GEMMTRSM_UKR_FLUSH_CT(ch) \
\
	\
	if ( _use_ct ) \
	{ \
		PASTEMAC(ch,copys_mxn) \
		( \
		  m, n, \
		  _ct, _rs_ct, _cs_ct, \
		  _c, _rs_c, _cs_c \
		); \
	} \


#endif
// end bli_edge_case_macro_defs.h
// begin bli_param_macro_defs.h


#ifndef BLIS_PARAM_MACRO_DEFS_H
#define BLIS_PARAM_MACRO_DEFS_H


// -- Parameter query macros --

// buffer

// Returns whether p is an exact multiple of size. size must be nonzero
// because it is used as a divisor.
BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size )
{
	return ( bool )
	       ( p % size == 0 );
}

// Logical complement of bli_is_aligned_to().
BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size )
{
	return ( bool )
	       ( p % size != 0 );
}

// Returns the remainder of p divided by size.
BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size )
{
	return ( siz_t )
	       ( p % size );
}


// datatype

// The six predicates below compare dt against a single num_t constant.

BLIS_INLINE bool bli_is_float( num_t dt )
{
	return ( bool )
	       ( dt == BLIS_FLOAT );
}

BLIS_INLINE bool bli_is_double( num_t dt )
{
	return ( bool )
	       ( dt == BLIS_DOUBLE );
}

BLIS_INLINE bool bli_is_scomplex( num_t dt )
{
	return ( bool )
	       ( dt == BLIS_SCOMPLEX );
}

BLIS_INLINE bool bli_is_dcomplex( num_t dt )
{
	return ( bool )
	       ( dt == BLIS_DCOMPLEX );
}

BLIS_INLINE bool bli_is_constant( num_t dt )
{
	return ( bool )
	       ( dt == BLIS_CONSTANT );
}

BLIS_INLINE bool bli_is_int( num_t dt )
{
	return ( bool )
	       ( dt == BLIS_INT );
}

// True for BLIS_FLOAT or BLIS_DOUBLE.
BLIS_INLINE bool bli_is_real( num_t dt )
{
	return ( bool )
	       ( bli_is_float( dt ) ||
	         bli_is_double( dt ) );
}

// True for BLIS_SCOMPLEX or BLIS_DCOMPLEX.
BLIS_INLINE bool bli_is_complex( num_t dt )
{
	return ( bool )
	       ( bli_is_scomplex( dt ) ||
	         bli_is_dcomplex( dt ) );
}

// True for BLIS_FLOAT or BLIS_SCOMPLEX.
BLIS_INLINE bool bli_is_single_prec( num_t dt )
{
	return ( bool )
	       ( bli_is_float( dt ) ||
	         bli_is_scomplex( dt ) );
}

// True for BLIS_DOUBLE or BLIS_DCOMPLEX.
BLIS_INLINE bool bli_is_double_prec( num_t dt )
{
	return ( bool )
	       ( bli_is_double( dt ) ||
	         bli_is_dcomplex( dt ) );
}

// Extracts the domain bit of dt by masking with BLIS_DOMAIN_BIT.
BLIS_INLINE dom_t bli_dt_domain( num_t dt )
{
	return ( dom_t )
	       ( dt & BLIS_DOMAIN_BIT );
}

// Unlike bli_is_real(), this inspects only the domain bit of dt.
BLIS_INLINE bool bli_dt_dom_is_real( num_t dt )
{
	return ( bool )
	       ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL );
}

BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt )
{
	return (
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ||
	         ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) );
}

// Pairs upper with "strictly below" and lower with "strictly above", using
// the non-transposed (_n) diagonal tests.
BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n )
{
	return ( bool )
	       ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ||
	         ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) );
}


// pruning-related

// All four pruning helpers take in/out pointers and update them in place.
// Each one begins by zeroing its offset output.

// For a negative diagonal offset: shrinks *m by |*diagoff|, reports that
// amount in *offm_inc, and resets *diagoff to zero. *n is not modified.
BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc )
{
	*offm_inc = 0;

	// If the diagonal intersects the left side of the matrix,
	// ignore the area above that intersection.
	if ( *diagoff < 0 )
	{
		*m        = *m + *diagoff;
		*offm_inc =    - *diagoff;
		*diagoff  = 0;
	}
}

// Clamps *n to *diagoff + *m when it exceeds that value. *offn_inc is always
// left at zero, and *diagoff and *m are not modified.
BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc )
{
	*offn_inc = 0;

	// If the diagonal intersects the bottom side of the matrix,
	// ignore the area to the right of that intersection.
	if ( *n > *diagoff + *m )
	{
		*n = *diagoff + *m;
	}
}

// For a positive diagonal offset: shrinks *n by *diagoff, reports that
// amount in *offn_inc, and resets *diagoff to zero. *m is not modified.
BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc )
{
	*offn_inc = 0;

	// If the diagonal intersects the top side of the matrix,
	// ignore the area to the left of that intersection.
	if ( *diagoff > 0 )
	{
		*n        = *n - *diagoff;
		*offn_inc =    + *diagoff;
		*diagoff  = 0;
	}
}

BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc )
{
	*offm_inc = 0;

	// If the diagonal intersects the right side of the matrix,
	// ignore the area below that intersection.
if ( *m > -(*diagoff) + *n )
	{
		// Clamp the row count to -(*diagoff) + *n.
		*m = -(*diagoff) + *n;
	}
}


// thread range-related

// Replaces *diagoff with *n - *diagoff - *m and toggles *uplo. *m and *n are
// read but not modified.
BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n )
{
	*diagoff = *n - *diagoff - *m;
	bli_toggle_uplo( uplo );
}

// Swaps *m with *n, negates *diagoff, and toggles *uplo.
BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n )
{
	bli_swap_dims( m, n );
	bli_negate_diag_offset( diagoff );
	bli_toggle_uplo( uplo );
}

// Maps the range [*start, *end) to [n - *end, n - *start). Both new bounds
// are computed from the original values before either output is written.
BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end )
{
	dim_t start2 = n - *start;
	dim_t end2   = n - *end;
	*start = end2;
	*end   = start2;
}


// mdim_t-related

BLIS_INLINE bool bli_is_m_dim( mdim_t mdim )
{
	return ( bool )
	       ( mdim == BLIS_M );
}

BLIS_INLINE bool bli_is_n_dim( mdim_t mdim )
{
	return ( bool )
	       ( mdim == BLIS_N );
}

// BLIS_M maps to BLIS_N; every other value maps to BLIS_M.
BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim )
{
	return ( mdim_t )
	       ( mdim == BLIS_M ? BLIS_N : BLIS_M );
}

// In-place form of bli_dim_toggled().
BLIS_INLINE void bli_toggle_dim( mdim_t* mdim )
{
	*mdim = bli_dim_toggled( *mdim );
}


// stor3_t-related

// Derives a stor3_t id from the row/column strides of c, a and b (in that
// argument order).
BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c,
                                            inc_t rs_a, inc_t cs_a,
                                            inc_t rs_b, inc_t cs_b )
{
	// If any matrix is general-stored, return the stor3_t id for the
	// general-purpose sup microkernel.
	if ( bli_is_gen_stored( rs_c, cs_c ) ||
	     bli_is_gen_stored( rs_a, cs_a ) ||
	     bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX;

	// Otherwise, compute and return the stor3_t id as follows.
const bool c_is_col = bli_is_col_stored( rs_c, cs_c ); const bool a_is_col = bli_is_col_stored( rs_a, cs_a ); const bool b_is_col = bli_is_col_stored( rs_b, cs_b ); return ( stor3_t )( 4 * c_is_col + 2 * a_is_col + 1 * b_is_col ); } BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id ) { #if 1 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )7, // BLIS_RRR = 0 -> BLIS_CCC = 7 ( stor3_t )5, // BLIS_RRC = 1 -> BLIS_CRC = 5 ( stor3_t )6, // BLIS_RCR = 2 -> BLIS_CCR = 6 ( stor3_t )4, // BLIS_RCC = 3 -> BLIS_CRR = 4 ( stor3_t )3, // BLIS_CRR = 4 -> BLIS_RCC = 3 ( stor3_t )1, // BLIS_CRC = 5 -> BLIS_RRC = 1 ( stor3_t )2, // BLIS_CCR = 6 -> BLIS_RCR = 2 ( stor3_t )0, // BLIS_CCC = 7 -> BLIS_RRR = 0 }; return map[id]; #else return ( ( id & 0x4 ) ^ 0x4 ) | // flip c bit ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 ); // flip a bit and move to b position #endif } BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )1, // BLIS_RRR = 0 -> BLIS_RRC = 1 ( stor3_t )0, // BLIS_RRC = 1 -> BLIS_RRR = 0 ( stor3_t )3, // BLIS_RCR = 2 -> BLIS_RCC = 3 ( stor3_t )2, // BLIS_RCC = 3 -> BLIS_RCR = 2 ( stor3_t )5, // BLIS_CRR = 4 -> BLIS_CRC = 5 ( stor3_t )4, // BLIS_CRC = 5 -> BLIS_CRR = 4 ( stor3_t )7, // BLIS_CCR = 6 -> BLIS_CCC = 7 ( stor3_t )6, // BLIS_CCC = 7 -> BLIS_CCR = 6 }; return map[id]; #else return ( stor3_t )( id ^ 0x1 ); #endif } BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )2, // BLIS_RRR = 0 -> BLIS_RCR = 2 ( stor3_t )3, // BLIS_RRC = 1 -> BLIS_RCC = 3 ( stor3_t )0, // BLIS_RCR = 2 -> BLIS_RRR = 0 ( stor3_t )1, // BLIS_RCC = 3 -> BLIS_RRC = 1 ( stor3_t )6, // BLIS_CRR = 4 -> BLIS_CCR = 6 ( stor3_t )7, // BLIS_CRC = 5 -> BLIS_CCC = 7 ( stor3_t )4, // BLIS_CCR = 6 -> BLIS_CRR = 4 ( stor3_t )5, // BLIS_CCC = 7 -> BLIS_CRC = 5 }; return map[id]; #else return ( stor3_t )( id ^ 0x2 ); #endif } // 
index-related BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == n_iter - 1 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != n_iter - 1 || n_left == 0 ); } BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == 0 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != 0 || n_left == 0 ); } BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 ); } BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) ); } BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { #ifdef BLIS_ENABLE_JRIR_SLAB return bli_is_last_iter_sl( i, end_iter, tid, nth ); #else // BLIS_ENABLE_JRIR_RR return bli_is_last_iter_rr( i, end_iter, tid, nth ); #endif } // packbuf_t-related BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type ) { return ( guint_t ) ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT ); } // pack_t-related BLIS_INLINE bool bli_is_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_is_row_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_is_col_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_is_panel_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE bool bli_is_1r_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R ); } BLIS_INLINE bool bli_is_1e_packed( pack_t schema ) { return ( bool ) ( ( schema & 
BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E ); } BLIS_INLINE bool bli_is_1m_packed( pack_t schema ) { return ( bool ) ( bli_is_1r_packed( schema ) || bli_is_1e_packed( schema ) ); } BLIS_INLINE bool bli_is_nat_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 ); } BLIS_INLINE bool bli_is_ind_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 ); } BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema ) { return ( guint_t ) ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT ); } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument. BLIS_INLINE void bli_set_dims_incs_uplo_1m ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument (without column-wise stride optimization). BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. 
if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions and increments for TWO matrix arguments. 
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); } BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); } BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); } // NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, // packbuf_t is stored/used from the context in order to support various // induced methods. (Though ideally the packbuf_t field would only be // present in the control tree). BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); } BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc ); } BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj ) { bli_obj_apply_trans( BLIS_TRANSPOSE, obj ); } BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj ) { bli_obj_apply_conj( BLIS_CONJUGATE, obj ); } BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT; } // Root matrix query BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) { return ( obj_t* )( obj->root ); } BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( 
bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); } // Root matrix modification BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) { obj->root = obj; } // Diagonal offset query BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) ); } // Diagonal offset modification BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off = ( doff_t )offset; } BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj ) { obj->diag_off = -(obj->diag_off); } BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) { obj->diag_off += ( doff_t )offset; } // Dimension query BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) { return ( obj->dim[ mdim ] ); } BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_width( obj ) : bli_obj_length( obj ) ); } BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) ); } BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 ); } // Stride/increment query BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) { return ( obj->rs ); } BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) { return ( obj->cs ); } BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) { return ( obj->is ); } BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); } // Note: The purpose of these functions is to obtain the length and width // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) ); } BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) ); } BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 ); } // Dimension modification BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj ) { obj->dim[ BLIS_M ] = m; } BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj ) { obj->dim[ BLIS_N ] = n; } BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj ) { obj->dim[ mdim ] = dim_val; } BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj ) { if ( bli_does_notrans( trans ) ) { bli_obj_set_length( m, obj ); bli_obj_set_width( n, obj ); } else // if ( bli_does_trans( trans ) ) { bli_obj_set_length( n, obj ); bli_obj_set_width( m, obj ); } } // Stride/increment predicates // // NOTE: The following two macros differ from their non-obj counterparts // in that they do not identify m x 1 and 1 x n objects as row-stored and // column-stored, respectively, which is needed when considering packed // objects. But this is okay, since none of the invocations of these // "obj" macros are used on packed matrices. 
// BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); } // Stride/increment modification BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj ) { obj->rs = rs; } BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj ) { obj->cs = cs; } BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_row_stride( rs, obj ); bli_obj_set_col_stride( cs, obj ); } BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) { obj->is = is; } // Offset query BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) { return ( obj->off[ mdim ] ); } // Offset modification BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] = offset; } BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_set_off( BLIS_M, offm, obj ); bli_obj_set_off( BLIS_N, offn, obj ); } BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj ) { obj->off[ mdim ] += offset; } BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) { bli_obj_inc_off( BLIS_M, offm, obj ); bli_obj_inc_off( BLIS_N, offn, obj ); } // Diagonal offset predicates BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) { return ( 
bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) ); } // Buffer address query BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) { return ( void* ) ( obj->buffer ); } // Buffer address modification BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) { obj->buffer = p; } // Bufferless scalar field query BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); } // Bufferless scalar field modification BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); } // Element size modification BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) { obj->elem_size = size; } // Packed matrix info query BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) { return ( obj->m_padded ); } BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) { return ( obj->n_padded ); } // Packed matrix info modification BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj ) { obj->m_padded = m; } BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj ) { obj->n_padded = n; } BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) { 
bli_obj_set_padded_length( m, obj ); bli_obj_set_padded_width( n, obj ); } // Packed panel info query BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) { return ( obj->m_panel ); } BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) { return ( obj->n_panel ); } BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) { return ( obj->pd ); } BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) { return ( obj->ps ); } // Packed panel info modification BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj ) { obj->m_panel = m; } BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj ) { obj->n_panel = n; } BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj ) { bli_obj_set_panel_length( m, obj ); bli_obj_set_panel_width( n, obj ); } BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj ) { obj->pd = pd; } BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) { obj->ps = ps; } // stor3_t-related BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); inc_t rs_a, cs_a; inc_t rs_b, cs_b; if ( bli_obj_has_notrans( a ) ) { rs_a = bli_obj_row_stride( a ); cs_a = bli_obj_col_stride( a ); } else { rs_a = bli_obj_col_stride( a ); cs_a = bli_obj_row_stride( a ); } if ( bli_obj_has_notrans( b ) ) { rs_b = bli_obj_row_stride( b ); cs_b = bli_obj_col_stride( b ); } else { rs_b = bli_obj_col_stride( b ); cs_b = bli_obj_row_stride( b ); } return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b ); } // -- User-provided information macros -- // Function pointer query BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) { return obj->pack_fn; } BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) { return obj->pack_params; } BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) { return obj->ker_fn; } BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) { return obj->ker_params; } // Function pointer modification 
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) { const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); bli_obj_set_pack_schema( schema_b, a ); bli_obj_set_pack_schema( schema_a, b ); } // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. BLIS_INLINE void bli_obj_induce_trans( obj_t* obj ) { // Induce transposition among basic fields. dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); if ( bli_obj_is_upper_or_lower( obj ) ) bli_obj_toggle_uplo( obj ); // Induce transposition among packed fields. dim_t m_padded = bli_obj_padded_length( obj ); dim_t n_padded = bli_obj_padded_width( obj ); dim_t m_panel = bli_obj_panel_length( obj ); dim_t n_panel = bli_obj_panel_width( obj ); bli_obj_set_padded_dims( n_padded, m_padded, obj ); bli_obj_set_panel_dims( n_panel, m_panel, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj ) { // NOTE: This function is only used in situations where the matrices // are guaranteed to not have structure or be packed. // Induce transposition among basic fields. 
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif // end bli_obj_macro_defs.h // begin bli_complex_macro_defs.h #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define 
bli_zimag( x ) ( cimag(x) ) #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_complex_macro_defs.h // begin bli_scalar_macro_defs.h #ifndef BLIS_SCALAR_MACRO_DEFS_H #define BLIS_SCALAR_MACRO_DEFS_H // -- Assignment/Accessor macros -- // NOTE: This macro is defined first since some of the other scalar macros // use it to abstract away the method used to assign complex values (ie: // whether fields of a struct are set directly or whether native C99 // assignment is used). // begin bli_sets.h #ifndef BLIS_SETS_H #define BLIS_SETS_H // sets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sssets( xr, xi, y ) { (y) = (xr); } #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } #define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } #define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, 
xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif // end bli_sets.h // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. // begin bli_setrs.h #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Each macro stores xr as the real component of y. For real y this is a
// plain assignment; for complex y the imaginary component keeps its value.

#define bli_sssetrs( xr, y ) { (y) = (xr); }
#define bli_dssetrs( xr, y ) { (y) = (xr); }

#define bli_sdsetrs( xr, y ) { (y) = (xr); }
#define bli_ddsetrs( xr, y ) { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct representation: write the real member directly.

#define bli_scsetrs( xr, y ) { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y ) { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y ) { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y ) { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Native C99 representation: rebuild y from the new real component and the
// imaginary component it already holds.

#define bli_scsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y ) { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y ) { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter shorthands, named after the destination type.

#define bli_ssetrs( xr, y ) bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y ) bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y ) bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y ) bli_dzsetrs( xr, y )

#endif
// end bli_setrs.h

// begin bli_setis.h


#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Each macro stores xi as the imaginary component of y. Real destinations
// have no imaginary component, so their variants expand to an empty
// statement and xi is not evaluated.

#define bli_sssetis( xi, y ) { ; }
#define bli_dssetis( xi, y ) { ; }

#define bli_sdsetis( xi, y ) { ; }
#define bli_ddsetis( xi, y ) { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct representation: write the imaginary member directly.

#define bli_scsetis( xi, y ) { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y ) { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y ) { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y ) { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Native C99 representation: rebuild y from the real component it already
// holds and the new imaginary component.

#define bli_scsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y ) { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y ) { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter shorthands, named after the destination type.

#define bli_ssetis( xi, y ) bli_sssetis( xi, y )
#define bli_dsetis( xi, y ) bli_ddsetis( xi, y )
#define bli_csetis( xi, y ) bli_scsetis( xi, y )
#define bli_zsetis( xi, y ) bli_dzsetis( xi, y )

#endif
// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h


#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Each macro splits the scalar x into a real component stored in yr and an
// imaginary component stored in yi. Real sources yield a zero imaginary
// component through bli_simag()/bli_dimag(); integer sources are cast to
// the destination precision and paired with a literal zero.

#define bli_ssgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; }

#define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; }

#define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; }

#define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); }
#define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); }
#define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); }
#define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); }
#define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; }

// Integer destinations: the imaginary output is always an integer zero,
// even when x is complex.

#define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; }
#define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; }
#define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; }
#define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; }
#define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; }

// Single-letter shorthands, named after the source type.

#define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi )
#define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi )
#define bli_cgets( x, yr, yi ) bli_csgets( x, yr, yi )
#define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi )
#define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi )

#endif
// end bli_gets.h


// -- Scalar constant initialization macros --

// begin bli_constants.h


#ifndef BLIS_CONSTANTS_H
#define BLIS_CONSTANTS_H

// return pointers to constants

// Each macro below expands to a typed pointer obtained from
// bli_obj_buffer_for_const() for one of the global constant objects
// BLIS_ONE, BLIS_ZERO, or BLIS_MINUS_ONE.

// 1

#define bli_s1 \
\
	( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ONE ) )

#define bli_d1 \
\
	( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ONE ) )

#define bli_c1 \
\
	( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) )

#define bli_z1 \
\
	( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) )

#define bli_i1 \
\
	( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ONE ) )

// 0

#define bli_s0 \
\
	( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ZERO ) )

#define bli_d0 \
\
	( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ZERO ) )

#define bli_c0 \
\
	( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) )

#define bli_z0 \
\
	( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) )

#define bli_i0 \
\
	( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ZERO ) )

// -1

#define bli_sm1 \
\
	( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_MINUS_ONE ) )

#define bli_dm1 \
\
	( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_MINUS_ONE ) )

#define bli_cm1 \
\
	( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) )

#define bli_zm1 \
\
	( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) )

#define bli_im1 \
\
	( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_MINUS_ONE ) )


#endif

// end bli_constants.h


// -- Separated scalar macros (separated real/imaginary values) --

// begin bli_absq2ris.h


#ifndef BLIS_ABSQ2RIS_H
#define BLIS_ABSQ2RIS_H

// absq2ris

// Store the squared magnitude of (ar, ai) in br. The real variants never
// write bi; the complex variants set bi to zero.

#define bli_sabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar); \
}

#define bli_dabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar); \
}

#define bli_cabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar) + (ai) * (ai); \
	(bi) = 0.0F; \
}

#define bli_zabsq2ris( ar, ai, br, bi ) \
{ \
	(br) = (ar) * (ar) + (ai) * (ai); \
	(bi) = 0.0; \
}

#endif

// end bli_absq2ris.h
// begin bli_abval2ris.h


#ifndef BLIS_ABVAL2RIS_H
#define BLIS_ABVAL2RIS_H

// abval2ris

// Store the absolute value of (xr, xi) in ar. The real variants never write
// ai; the complex variants set ai to zero.
//
// The complex variants compute
//   sqrt( s ) * sqrt( ( xr / s ) * xr + ( xi / s ) * xi )
// with s = bli_fmaxabs( xr, xi ), and yield zero when s is zero (which also
// avoids the division by zero).
// NOTE(review): presumably the scaling by s is there to limit overflow in
// the intermediate squares -- confirm.
// NOTE(review): the complex variants declare block-local variables named
// "s" and "mag". An argument expression that itself mentions an identifier
// with one of those names would bind to the macro's local instead of the
// caller's variable.

#define bli_sabval2ris( xr, xi, ar, ai ) \
{ \
	(ar) = fabsf(xr); \
}

#define bli_dabval2ris( xr, xi, ar, ai ) \
{ \
	(ar) = fabs(xr); \
}

#define bli_cabval2ris( xr, xi, ar, ai ) \
{ \
	float s = bli_fmaxabs( (xr), (xi) ); \
	float mag; \
	if ( s == 0.0F ) mag = 0.0F; \
	else \
	{ \
		mag = sqrtf( s ) * \
		      sqrtf( ( (xr) / s ) * (xr) + \
		             ( (xi) / s ) * (xi) ); \
	} \
	(ar) = mag; \
	(ai) = 0.0F; \
}

#define bli_zabval2ris( xr, xi, ar, ai ) \
{ \
	double s = bli_fmaxabs( (xr), (xi) ); \
	double mag; \
	if ( s == 0.0 ) mag = 0.0; \
	else \
	{ \
		mag = sqrt( s ) * \
		      sqrt( ( (xr) / s ) * (xr) + \
		            ( (xi) / s ) * (xi) ); \
	} \
	(ar) = mag; \
	(ai) = 0.0; \
}

#endif

// end bli_abval2ris.h
// begin bli_addris.h


#ifndef BLIS_ADDRIS_H
#define BLIS_ADDRIS_H

// addris

// x := x + a, component by component. The real variants touch only xr.

#define bli_saddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
}

#define bli_daddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
}

#define bli_caddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
	(xi) = (xi) + (ai); \
}

#define bli_zaddris( ar, ai, xr, xi ) \
{ \
	(xr) = (xr) + (ar); \
	(xi) = (xi) + (ai); \
}

#endif

// end bli_addris.h
// begin bli_addjris.h


#ifndef BLIS_ADDJRIS_H
#define BLIS_ADDJRIS_H

// addjris

// x := x + conj(a), implemented by forwarding to addris with the imaginary
// component of a negated.

#define bli_saddjris( ar, ai, xr, xi ) bli_saddris( (ar), -(ai), (xr), (xi) )
#define bli_daddjris( ar, ai, xr, xi ) bli_daddris( (ar), -(ai), (xr), (xi) )
#define bli_caddjris( ar, ai, xr, xi ) bli_caddris( (ar), -(ai), (xr), (xi) )
#define bli_zaddjris( ar, ai, xr, xi ) bli_zaddris( (ar), -(ai), (xr), (xi) )

#endif

// end bli_addjris.h

// begin bli_add3ris.h


#ifndef BLIS_ADD3RIS_H
#define BLIS_ADD3RIS_H

// add3ris

// c := a + b, component by component. The real variants touch only cr.

#define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
}

#define bli_dadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
}

#define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
	(ci) = (ai) + (bi); \
}

#define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \
{ \
	(cr) = (ar) + (br); \
	(ci) = (ai) + (bi); \
}

#endif

// end bli_add3ris.h

// begin bli_axpbyris.h


#ifndef BLIS_AXPBYRIS_H
#define BLIS_AXPBYRIS_H

// axpbyris

// y := a * x + b * y on separated components.
// - The "rx" form uses only the real components.
// - The "cx" form performs full complex arithmetic. Both results are first
//   computed into temporaries because yr and yi each appear in both
//   right-hand sides. __typeof__ is a compiler extension.

#define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \
{ \
    (yr) = (ar) * (xr) + (br) * (yr); \
}

#define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \
{ \
    const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \
    const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \
    (yr) = yt_r; \
    (yi) = yt_i; \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of b.
// - The fourth char encodes the type of y.

// -- (axby) = (??ss) ----------------------------------------------------------

// NOTE(review): the names in this series end in "xpbyris" rather than
// "axpbyris", and they expand to bli_rxxpbyris rather than to the
// bli_rxaxpbyris macro defined above. The short aliases that follow this
// series refer to bli_ssssaxpbyris and friends, which this series does not
// provide -- confirm whether these lines were meant to define them.

#define bli_ssssxpbyris bli_rxxpbyris
#define bli_dsssxpbyris bli_rxxpbyris
#define bli_csssxpbyris bli_rxxpbyris
#define bli_zsssxpbyris bli_rxxpbyris

#define bli_sdssxpbyris bli_rxxpbyris
#define bli_ddssxpbyris bli_rxxpbyris
#define bli_cdssxpbyris bli_rxxpbyris
#define bli_zdssxpbyris bli_rxxpbyris

#define bli_scssxpbyris bli_rxxpbyris
#define bli_dcssxpbyris bli_rxxpbyris
#define bli_ccssxpbyris bli_rxxpbyris
#define bli_zcssxpbyris bli_rxxpbyris

#define bli_szssxpbyris bli_rxxpbyris
#define bli_dzssxpbyris bli_rxxpbyris
#define bli_czssxpbyris bli_rxxpbyris
#define bli_zzssxpbyris bli_rxxpbyris

// NOTE: This series needs to be finished for all other char values for (by), but
// not until something in BLIS actually needs mixed-datatype axpbyris.
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
// Same-datatype convenience aliases for axpbyjris.
#define bli_saxpbyjris bli_ssssaxpbyjris
#define bli_daxpbyjris bli_ddddaxpbyjris
#define bli_caxpbyjris bli_ccccaxpbyjris
#define bli_zaxpbyjris bli_zzzzaxpbyjris

#endif
// end bli_axpbyjris.h
// begin bli_axpyris.h


#ifndef BLIS_AXPYRIS_H
#define BLIS_AXPYRIS_H

// axpyris
//
// y += a * x on separate real/imaginary components. Five kernels cover the
// real/complex combinations selected by the type tables below.

// rx: only the real parts participate, yr += ar * xr.
#define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}

// cx: full complex product accumulated into both components of y.
#define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
    (yi) += (ai) * (xr) + (ar) * (xi); \
}

// ro: real part of the complex product only; yi is not touched.
#define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) - (ai) * (xi); \
}

// cr: only ar is used from a; both components of x are scaled by it.
#define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * (xi); \
}

// rc: only xr is used from x; both components of a are scaled by it.
#define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyris bli_rxaxpyris
#define bli_dssaxpyris bli_rxaxpyris
#define bli_cssaxpyris bli_rxaxpyris
#define bli_zssaxpyris bli_rxaxpyris

#define bli_sdsaxpyris bli_rxaxpyris
#define bli_ddsaxpyris bli_rxaxpyris
#define bli_cdsaxpyris bli_rxaxpyris
#define bli_zdsaxpyris bli_rxaxpyris

#define bli_scsaxpyris bli_rxaxpyris
#define bli_dcsaxpyris bli_rxaxpyris
#define bli_ccsaxpyris bli_roaxpyris
#define bli_zcsaxpyris bli_roaxpyris

#define bli_szsaxpyris bli_rxaxpyris
#define bli_dzsaxpyris bli_rxaxpyris
#define bli_czsaxpyris bli_roaxpyris
#define bli_zzsaxpyris bli_roaxpyris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyris bli_rxaxpyris
#define bli_dsdaxpyris bli_rxaxpyris
#define bli_csdaxpyris bli_rxaxpyris
#define bli_zsdaxpyris bli_rxaxpyris

#define bli_sddaxpyris bli_rxaxpyris
#define bli_dddaxpyris bli_rxaxpyris
#define bli_cddaxpyris bli_rxaxpyris
#define bli_zddaxpyris bli_rxaxpyris

#define bli_scdaxpyris bli_rxaxpyris
#define bli_dcdaxpyris bli_rxaxpyris
#define bli_ccdaxpyris bli_roaxpyris
#define bli_zcdaxpyris bli_roaxpyris

#define bli_szdaxpyris bli_rxaxpyris
#define bli_dzdaxpyris bli_rxaxpyris
#define bli_czdaxpyris bli_roaxpyris
#define bli_zzdaxpyris bli_roaxpyris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyris bli_rxaxpyris
#define bli_dscaxpyris bli_rxaxpyris
#define bli_cscaxpyris bli_rcaxpyris
#define bli_zscaxpyris bli_rcaxpyris

#define bli_sdcaxpyris bli_rxaxpyris
#define bli_ddcaxpyris bli_rxaxpyris
#define bli_cdcaxpyris bli_rcaxpyris
#define bli_zdcaxpyris bli_rcaxpyris

#define bli_sccaxpyris bli_craxpyris
#define bli_dccaxpyris bli_craxpyris
#define bli_cccaxpyris bli_cxaxpyris
#define bli_zccaxpyris bli_cxaxpyris

#define bli_szcaxpyris bli_craxpyris
#define bli_dzcaxpyris bli_craxpyris
#define bli_czcaxpyris bli_cxaxpyris
#define bli_zzcaxpyris bli_cxaxpyris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyris bli_rxaxpyris
#define bli_dszaxpyris bli_rxaxpyris
#define bli_cszaxpyris bli_rcaxpyris
#define bli_zszaxpyris bli_rcaxpyris

#define bli_sdzaxpyris bli_rxaxpyris
#define bli_ddzaxpyris bli_rxaxpyris
#define bli_cdzaxpyris bli_rcaxpyris
#define bli_zdzaxpyris bli_rcaxpyris

#define bli_sczaxpyris bli_craxpyris
#define bli_dczaxpyris bli_craxpyris
#define bli_cczaxpyris bli_cxaxpyris
#define bli_zczaxpyris bli_cxaxpyris

#define bli_szzaxpyris bli_craxpyris
#define bli_dzzaxpyris bli_craxpyris
#define bli_czzaxpyris bli_cxaxpyris
#define bli_zzzaxpyris bli_cxaxpyris

// Same-datatype convenience aliases.
#define bli_saxpyris bli_sssaxpyris
#define bli_daxpyris bli_dddaxpyris
#define bli_caxpyris bli_cccaxpyris
#define bli_zaxpyris bli_zzzaxpyris

#endif
// end bli_axpyris.h
// begin bli_axpyjris.h


#ifndef BLIS_AXPYJRIS_H
#define BLIS_AXPYJRIS_H

// axpyjris
//
// Conjugating variant of axpyris: y += a * conj(x).

// rx: only the real parts participate, yr += ar * xr.
#define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
}

// cx: full complex product with conj(x) = xr - i*xi.
#define bli_cxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
    (yi) += (ai) * (xr) - (ar) * (xi); \
}

// ro: real part of a * conj(x) only; yi is not touched.
#define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr) + (ai) * (xi); \
}

// cr: only ar is used from a; the imaginary part of x enters negated.
#define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ar) * -(xi); \
}

// rc: only xr is used from x, so the body matches the non-conjugating rc
// kernel.
#define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) += (ar) * (xr); \
    (yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyjris bli_rxaxpyjris
#define bli_dssaxpyjris bli_rxaxpyjris
#define bli_cssaxpyjris bli_rxaxpyjris
#define bli_zssaxpyjris bli_rxaxpyjris

#define bli_sdsaxpyjris bli_rxaxpyjris
#define bli_ddsaxpyjris bli_rxaxpyjris
#define bli_cdsaxpyjris bli_rxaxpyjris
#define bli_zdsaxpyjris bli_rxaxpyjris

#define bli_scsaxpyjris bli_rxaxpyjris
#define bli_dcsaxpyjris bli_rxaxpyjris
#define bli_ccsaxpyjris bli_roaxpyjris
#define bli_zcsaxpyjris bli_roaxpyjris

#define bli_szsaxpyjris bli_rxaxpyjris
#define bli_dzsaxpyjris bli_rxaxpyjris
#define bli_czsaxpyjris bli_roaxpyjris
#define bli_zzsaxpyjris bli_roaxpyjris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyjris bli_rxaxpyjris
#define bli_dsdaxpyjris bli_rxaxpyjris
#define bli_csdaxpyjris bli_rxaxpyjris
#define bli_zsdaxpyjris bli_rxaxpyjris

#define bli_sddaxpyjris bli_rxaxpyjris
#define bli_dddaxpyjris bli_rxaxpyjris
#define bli_cddaxpyjris bli_rxaxpyjris
#define bli_zddaxpyjris bli_rxaxpyjris

#define bli_scdaxpyjris bli_rxaxpyjris
#define bli_dcdaxpyjris bli_rxaxpyjris
#define bli_ccdaxpyjris bli_roaxpyjris
#define bli_zcdaxpyjris bli_roaxpyjris

#define bli_szdaxpyjris bli_rxaxpyjris
#define bli_dzdaxpyjris bli_rxaxpyjris
#define bli_czdaxpyjris bli_roaxpyjris
#define bli_zzdaxpyjris bli_roaxpyjris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjris bli_rxaxpyjris
#define bli_dscaxpyjris bli_rxaxpyjris
#define bli_cscaxpyjris bli_rcaxpyjris
#define bli_zscaxpyjris bli_rcaxpyjris

#define bli_sdcaxpyjris bli_rxaxpyjris
#define bli_ddcaxpyjris bli_rxaxpyjris
#define bli_cdcaxpyjris bli_rcaxpyjris
#define bli_zdcaxpyjris bli_rcaxpyjris

#define bli_sccaxpyjris bli_craxpyjris
#define bli_dccaxpyjris bli_craxpyjris
#define bli_cccaxpyjris bli_cxaxpyjris
#define bli_zccaxpyjris bli_cxaxpyjris

#define bli_szcaxpyjris bli_craxpyjris
#define bli_dzcaxpyjris bli_craxpyjris
#define bli_czcaxpyjris bli_cxaxpyjris
#define bli_zzcaxpyjris bli_cxaxpyjris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjris bli_rxaxpyjris
#define bli_dszaxpyjris bli_rxaxpyjris
#define bli_cszaxpyjris bli_rcaxpyjris
#define bli_zszaxpyjris bli_rcaxpyjris

#define bli_sdzaxpyjris bli_rxaxpyjris
#define bli_ddzaxpyjris bli_rxaxpyjris
#define bli_cdzaxpyjris bli_rcaxpyjris
#define bli_zdzaxpyjris bli_rcaxpyjris

#define bli_sczaxpyjris bli_craxpyjris
#define bli_dczaxpyjris bli_craxpyjris
#define bli_cczaxpyjris bli_cxaxpyjris
#define bli_zczaxpyjris bli_cxaxpyjris

#define bli_szzaxpyjris bli_craxpyjris
#define bli_dzzaxpyjris bli_craxpyjris
#define bli_czzaxpyjris bli_cxaxpyjris
#define bli_zzzaxpyjris bli_cxaxpyjris

// Same-datatype convenience aliases.
#define bli_saxpyjris bli_sssaxpyjris
#define bli_daxpyjris bli_dddaxpyjris
#define bli_caxpyjris bli_cccaxpyjris
#define bli_zaxpyjris bli_zzzaxpyjris

#endif
// end bli_axpyjris.h
// begin bli_axmyris.h


#ifndef BLIS_AXMYRIS_H
#define BLIS_AXMYRIS_H

// axmyris
//
// y -= a * x on separate real/imaginary components.

// s/d: real-only update; the imaginary arguments are never referenced.
#define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}

#define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
}

// c/z: full complex product subtracted from both components of y.
#define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}

#define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr) - (ai) * (xi); \
    (yi) -= (ai) * (xr) + (ar) * (xi); \
}

// sc/dz: only ar is used from a; both components of x are scaled by it.
#define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}

#define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
    (yr) -= (ar) * (xr); \
    (yi) -= (ar) * (xi); \
}

#endif
// end bli_axmyris.h
// begin bli_conjris.h


#ifndef BLIS_CONJRIS_H
#define BLIS_CONJRIS_H

// conjris
//
// In-place conjugation. The s/d variants expand to an empty statement; the
// c/z variants negate the imaginary component.

#define bli_sconjris( xr, xi ) \
{ \
    ; \
}

#define bli_dconjris( xr, xi ) \
{ \
    ; \
}

#define bli_cconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}

#define bli_zconjris( xr, xi ) \
{ \
    (xi) = -(xi); \
}

#endif
// end bli_conjris.h
// begin bli_copyris.h


#ifndef BLIS_COPYRIS_H
#define BLIS_COPYRIS_H

// copyris
//
// b = a on separate components. The s/d variants assign only the real part
// (ai and bi are never referenced); the c/z variants assign both parts.

#define bli_scopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}

#define bli_dcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
}

#define bli_ccopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}

#define bli_zcopyris( ar, ai, br, bi ) \
{ \
    (br) = (ar); \
    (bi) = (ai); \
}

// Two-character variants: the first char names the source, the second the
// destination, and the macro chosen follows the destination. When the source
// char is s or d, the caller's ai is discarded and a literal zero is passed
// as the imaginary part instead.
#define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi )
#define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi )
#define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )
#define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )

#define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi )
#define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi )
#define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )
#define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )

#define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi )
#define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi )
#define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )
#define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )

#define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi )
#define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi )
#define bli_czcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )
#define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )

#endif
// end bli_copyris.h
// begin bli_copyjris.h


#ifndef BLIS_COPYJRIS_H
#define BLIS_COPYJRIS_H

// copyjris
//
// b = conj(a): forwards to the matching copyris macro with the imaginary
// argument negated.

#define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) )
#define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) )
#define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) )
#define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) )

// Two-character variants, laid out like the copyris ones: a literal zero
// replaces ai when the source char is s or d.
#define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi )
#define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi )
#define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )
#define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )

#define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi )
#define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi )
#define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )
#define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )

#define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi )
#define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi )
#define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )
#define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )

#define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi )
#define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi )
#define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )
#define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )

#endif
// end bli_copyjris.h
// begin bli_copycjris.h


#ifndef BLIS_COPYCJRIS_H
#define BLIS_COPYCJRIS_H

// copycjris
//
// y = x, or y = conj(x) when bli_is_conj( conj ) is true. The s/d variants
// ignore conj and forward to the plain copy.

#define bli_scopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_scopyris( (xr), (xi), (yr), (yi) ); \
}

#define bli_dcopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_dcopyris( (xr), (xi), (yr), (yi) ); \
}

#define bli_ccopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                 :  (xi) ); \
}

#define bli_zcopycjris( conj, xr, xi, yr, yi ) \
{ \
    (yr) = (xr); \
    (yi) = ( bli_is_conj( conj ) ? -(xi) \
                                 :  (xi) ); \
}

// NOTE(review): bli_icopyris is not among the copyris macros defined in
// bli_copyris.h above (only s/d/c/z and their two-character forms are) --
// confirm it is provided elsewhere before using this macro.
#define bli_icopycjris( conj, xr, xi, yr, yi ) \
{ \
    bli_icopyris( (xr), (xi), (yr), (yi) ); \
}

#endif
// end bli_copycjris.h
// begin bli_eqris.h


#ifndef BLIS_EQRIS_H
#define BLIS_EQRIS_H

// eqris (passed by value)
//
// Expression macros. The s/d/i variants compare only the real parts; the c/z
// variants also require the imaginary parts to be equal.

#define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) )

// eq1ris

#define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F )
#define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 )
#define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F )
#define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 )
#define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 )

// eq0ris

#define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F )
#define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 )
#define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F )
#define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 )
#define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 )

// eqm1ris

#define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F )
#define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 )
#define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F )
#define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 )
#define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 )

#endif
// end bli_eqris.h
// begin bli_invertris.h


#ifndef BLIS_INVERTRIS_H
#define BLIS_INVERTRIS_H

// invertris
//
// In-place reciprocal, x = 1 / x.

// s: real reciprocal; xi is never referenced. No zero check is performed.
#define bli_sinvertris( xr, xi ) \
{ \
    (xr) = 1.0F / (xr); \
}
#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2ris bli_rxscal2ris #define bli_dssscal2ris bli_rxscal2ris #define bli_cssscal2ris bli_rxscal2ris #define bli_zssscal2ris bli_rxscal2ris #define bli_sdsscal2ris bli_rxscal2ris #define bli_ddsscal2ris bli_rxscal2ris #define bli_cdsscal2ris bli_rxscal2ris #define bli_zdsscal2ris bli_rxscal2ris #define bli_scsscal2ris bli_rxscal2ris #define bli_dcsscal2ris bli_rxscal2ris #define bli_ccsscal2ris bli_roscal2ris #define bli_zcsscal2ris bli_roscal2ris #define bli_szsscal2ris bli_rxscal2ris #define bli_dzsscal2ris bli_rxscal2ris #define bli_czsscal2ris bli_roscal2ris #define bli_zzsscal2ris bli_roscal2ris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2ris bli_rxscal2ris #define bli_dsdscal2ris bli_rxscal2ris #define bli_csdscal2ris bli_rxscal2ris #define bli_zsdscal2ris bli_rxscal2ris #define bli_sddscal2ris bli_rxscal2ris #define bli_dddscal2ris bli_rxscal2ris #define bli_cddscal2ris bli_rxscal2ris #define bli_zddscal2ris bli_rxscal2ris #define bli_scdscal2ris bli_rxscal2ris #define bli_dcdscal2ris bli_rxscal2ris #define bli_ccdscal2ris bli_roscal2ris #define bli_zcdscal2ris bli_roscal2ris #define bli_szdscal2ris bli_rxscal2ris #define bli_dzdscal2ris bli_rxscal2ris #define bli_czdscal2ris bli_roscal2ris #define bli_zzdscal2ris bli_roscal2ris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2ris bli_rxscal2ris #define bli_dscscal2ris bli_rxscal2ris #define bli_cscscal2ris bli_rcscal2ris #define bli_zscscal2ris bli_rcscal2ris #define bli_sdcscal2ris bli_rxscal2ris #define bli_ddcscal2ris bli_rxscal2ris #define bli_cdcscal2ris bli_rcscal2ris #define bli_zdcscal2ris bli_rcscal2ris #define bli_sccscal2ris bli_crscal2ris #define bli_dccscal2ris bli_crscal2ris #define bli_cccscal2ris bli_cxscal2ris #define bli_zccscal2ris bli_cxscal2ris #define bli_szcscal2ris bli_crscal2ris 
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Type-dispatch table for scal2jris. Unlike the scal2ris table, these entries
// are function-like macros that forward all six arguments to one of the
// bli_{rx,cx,ro,cr,rc}scal2jris kernels.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \
{ \
    const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \
    const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \
    (yr) = yt_r; \
    (yi) = yt_i; \
}
// (cxxpbyris, above) Full complex update y := x + b * y. Both new parts are
// computed into temporaries from the OLD (yr, yi) before either is stored,
// because each result depends on both old parts.

// crxpbyris: complex x and y with a real scalar b; bi is not read, so each
// part of y is scaled by br only.
#define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \
{ \
    const __typeof__(yr) yt_r = (xr) + (br) * (yr); \
    const __typeof__(yi) yt_i = (xi) + (br) * (yi); \
    (yr) = yt_r; \
    (yi) = yt_i; \
}

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// - The tables below map every (x, b, y) type combination onto one of the
//   three kernels: rxx (real parts only), crx (complex x and y, real b), or
//   cxx (full complex arithmetic).

// -- (xby) = (??s) ------------------------------------------------------------

// y is real (single precision): every combination uses the real-only kernel.
#define bli_sssxpbyris bli_rxxpbyris
#define bli_dssxpbyris bli_rxxpbyris
#define bli_cssxpbyris bli_rxxpbyris
#define bli_zssxpbyris bli_rxxpbyris

#define bli_sdsxpbyris bli_rxxpbyris
#define bli_ddsxpbyris bli_rxxpbyris
#define bli_cdsxpbyris bli_rxxpbyris
#define bli_zdsxpbyris bli_rxxpbyris

#define bli_scsxpbyris bli_rxxpbyris
#define bli_dcsxpbyris bli_rxxpbyris
#define bli_ccsxpbyris bli_rxxpbyris
#define bli_zcsxpbyris bli_rxxpbyris

#define bli_szsxpbyris bli_rxxpbyris
#define bli_dzsxpbyris bli_rxxpbyris
#define bli_czsxpbyris bli_rxxpbyris
#define bli_zzsxpbyris bli_rxxpbyris

// -- (xby) = (??d) ------------------------------------------------------------

// y is real (double precision): every combination uses the real-only kernel.
#define bli_ssdxpbyris bli_rxxpbyris
#define bli_dsdxpbyris bli_rxxpbyris
#define bli_csdxpbyris bli_rxxpbyris
#define bli_zsdxpbyris bli_rxxpbyris

#define bli_sddxpbyris bli_rxxpbyris
#define bli_dddxpbyris bli_rxxpbyris
#define bli_cddxpbyris bli_rxxpbyris
#define bli_zddxpbyris bli_rxxpbyris

#define bli_scdxpbyris bli_rxxpbyris
#define bli_dcdxpbyris bli_rxxpbyris
#define bli_ccdxpbyris bli_rxxpbyris
#define bli_zcdxpbyris bli_rxxpbyris

#define bli_szdxpbyris bli_rxxpbyris
#define bli_dzdxpbyris bli_rxxpbyris
#define bli_czdxpbyris bli_rxxpbyris
#define bli_zzdxpbyris bli_rxxpbyris

// -- (xby) = (??c) ------------------------------------------------------------

// NOTE(review): when x and b are both real but y is complex (ssc, dsc, sdc,
// ddc) the real-only kernel is selected, so the imaginary part of y is left
// as-is rather than scaled by b -- confirm this is the intended convention.
#define bli_sscxpbyris bli_rxxpbyris
#define bli_dscxpbyris bli_rxxpbyris
#define bli_cscxpbyris bli_crxpbyris
#define bli_zscxpbyris bli_crxpbyris

#define bli_sdcxpbyris bli_rxxpbyris
#define bli_ddcxpbyris bli_rxxpbyris
#define bli_cdcxpbyris bli_crxpbyris
#define bli_zdcxpbyris bli_crxpbyris

#define bli_sccxpbyris bli_cxxpbyris
#define bli_dccxpbyris bli_cxxpbyris
#define bli_cccxpbyris bli_cxxpbyris
#define bli_zccxpbyris bli_cxxpbyris

#define bli_szcxpbyris bli_cxxpbyris
#define bli_dzcxpbyris bli_cxxpbyris
#define bli_czcxpbyris bli_cxxpbyris
#define bli_zzcxpbyris bli_cxxpbyris

// -- (xby) = (??z) ------------------------------------------------------------

// Same kernel selection as the (??c) table, for double-precision complex y.
#define bli_sszxpbyris bli_rxxpbyris
#define bli_dszxpbyris bli_rxxpbyris
#define bli_cszxpbyris bli_crxpbyris
#define bli_zszxpbyris bli_crxpbyris

#define bli_sdzxpbyris bli_rxxpbyris
#define bli_ddzxpbyris bli_rxxpbyris
#define bli_cdzxpbyris bli_crxpbyris
#define bli_zdzxpbyris bli_crxpbyris

#define bli_sczxpbyris bli_cxxpbyris
#define bli_dczxpbyris bli_cxxpbyris
#define bli_cczxpbyris bli_cxxpbyris
#define bli_zczxpbyris bli_cxxpbyris

#define bli_szzxpbyris bli_cxxpbyris
#define bli_dzzxpbyris bli_cxxpbyris
#define bli_czzxpbyris bli_cxxpbyris
#define bli_zzzxpbyris bli_cxxpbyris

// Same-type shorthands: x, b and y all use one datatype.
#define bli_sxpbyris bli_sssxpbyris
#define bli_dxpbyris bli_dddxpbyris
#define bli_cxpbyris bli_cccxpbyris
#define bli_zxpbyris bli_zzzxpbyris

#endif
// end bli_xpbyris.h
// begin bli_xpbyjris.h

#ifndef BLIS_XPBYJRIS_H
#define BLIS_XPBYJRIS_H

// xpbyjris
//
// y := conj( x ) + b * y on separate real and imaginary parts. Identical to
// the xpbyris kernels except that xi enters negated in the complex kernels.

// Real-domain kernel: xi is never read, so conjugating x changes nothing.
#define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \
{ \
    (yr) = (xr) + (br) * (yr); \
}

// Full complex kernel; temporaries keep the old (yr, yi) intact until both
// new parts have been computed.
#define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \
{ \
    const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \
    const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \
    (yr) = yt_r; \
    (yi) = yt_i; \
}

// Complex x and y with a real scalar b; bi is not read.
#define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \
{ \
    const __typeof__(yr) yt_r = (xr) + (br) * (yr); \
    const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \
    (yr) = yt_r; \
    (yi) = yt_i; \
}

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.

// -- (xby) = (??s) ------------------------------------------------------------

// y is real (single precision): every combination uses the real-only kernel.
#define bli_sssxpbyjris bli_rxxpbyjris
#define bli_dssxpbyjris bli_rxxpbyjris
#define bli_cssxpbyjris bli_rxxpbyjris
#define bli_zssxpbyjris bli_rxxpbyjris

#define bli_sdsxpbyjris bli_rxxpbyjris
#define bli_ddsxpbyjris bli_rxxpbyjris
#define bli_cdsxpbyjris bli_rxxpbyjris
#define bli_zdsxpbyjris bli_rxxpbyjris

#define bli_scsxpbyjris bli_rxxpbyjris
#define bli_dcsxpbyjris bli_rxxpbyjris
#define bli_ccsxpbyjris bli_rxxpbyjris
#define bli_zcsxpbyjris bli_rxxpbyjris

#define bli_szsxpbyjris bli_rxxpbyjris
#define bli_dzsxpbyjris bli_rxxpbyjris
#define bli_czsxpbyjris bli_rxxpbyjris
#define bli_zzsxpbyjris bli_rxxpbyjris

// -- (xby) = (??d) ------------------------------------------------------------

// y is real (double precision): every combination uses the real-only kernel.
#define bli_ssdxpbyjris bli_rxxpbyjris
#define bli_dsdxpbyjris bli_rxxpbyjris
#define bli_csdxpbyjris bli_rxxpbyjris
#define bli_zsdxpbyjris bli_rxxpbyjris

#define bli_sddxpbyjris bli_rxxpbyjris
#define bli_dddxpbyjris bli_rxxpbyjris
#define bli_cddxpbyjris bli_rxxpbyjris
#define bli_zddxpbyjris bli_rxxpbyjris

#define bli_scdxpbyjris bli_rxxpbyjris
#define bli_dcdxpbyjris bli_rxxpbyjris
#define bli_ccdxpbyjris bli_rxxpbyjris
#define bli_zcdxpbyjris bli_rxxpbyjris

#define bli_szdxpbyjris bli_rxxpbyjris
#define bli_dzdxpbyjris bli_rxxpbyjris
#define bli_czdxpbyjris bli_rxxpbyjris
#define bli_zzdxpbyjris bli_rxxpbyjris

// -- (xby) = (??c) ------------------------------------------------------------

// NOTE(review): as in xpbyris, real x with real b and complex y (ssc, dsc,
// sdc, ddc) selects the real-only kernel and leaves the imaginary part of y
// as-is -- confirm this is intended.
#define bli_sscxpbyjris bli_rxxpbyjris
#define bli_dscxpbyjris bli_rxxpbyjris
#define bli_cscxpbyjris bli_crxpbyjris
#define bli_zscxpbyjris bli_crxpbyjris

#define bli_sdcxpbyjris bli_rxxpbyjris
#define bli_ddcxpbyjris bli_rxxpbyjris
#define bli_cdcxpbyjris bli_crxpbyjris
#define bli_zdcxpbyjris bli_crxpbyjris

#define bli_sccxpbyjris bli_cxxpbyjris
#define bli_dccxpbyjris bli_cxxpbyjris
#define bli_cccxpbyjris
bli_cxxpbyjris #define bli_zccxpbyjris bli_cxxpbyjris #define bli_szcxpbyjris bli_cxxpbyjris #define bli_dzcxpbyjris bli_cxxpbyjris #define bli_czcxpbyjris bli_cxxpbyjris #define bli_zzcxpbyjris bli_cxxpbyjris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjris bli_rxxpbyjris #define bli_dszxpbyjris bli_rxxpbyjris #define bli_cszxpbyjris bli_crxpbyjris #define bli_zszxpbyjris bli_crxpbyjris #define bli_sdzxpbyjris bli_rxxpbyjris #define bli_ddzxpbyjris bli_rxxpbyjris #define bli_cdzxpbyjris bli_crxpbyjris #define bli_zdzxpbyjris bli_crxpbyjris #define bli_sczxpbyjris bli_cxxpbyjris #define bli_dczxpbyjris bli_cxxpbyjris #define bli_cczxpbyjris bli_cxxpbyjris #define bli_zczxpbyjris bli_cxxpbyjris #define bli_szzxpbyjris bli_cxxpbyjris #define bli_dzzxpbyjris bli_cxxpbyjris #define bli_czzxpbyjris bli_cxxpbyjris #define bli_zzzxpbyjris bli_cxxpbyjris #define bli_sxpbyjris bli_sssxpbyjris #define bli_dxpbyjris bli_dddxpbyjris #define bli_cxpbyjris bli_cccxpbyjris #define bli_zxpbyjris bli_zzzxpbyjris #endif // end bli_xpbyjris.h // Inlined scalar macros in loops // begin bli_scal2ris_mxn.h #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j 
)*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif // end bli_scal2ris_mxn.h // begin bli_scalris_mxn_uplo.h #ifndef 
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) ) #define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) ) #define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zcabsq2s( x, a ) 
bli_zcsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) ) #define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) ) #define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a ) #define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a ) #define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a ) #define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a ) #endif // end bli_absq2s.h // begin bli_abval2s.h #ifndef BLIS_ABVAL2S_H #define BLIS_ABVAL2S_H // abval2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabval2s( x, a ) 
bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) ) #define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) ) #define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) ) #define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) ) #define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) ) #define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) ) #define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) ) #define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) ) #define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) ) #define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) ) #define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) ) #define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) ) #define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) ) #define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) ) #define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) ) #define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabval2s( x, a ) bli_ssabval2s( x, a ) #define bli_dabval2s( x, a ) bli_ddabval2s( x, a ) #define bli_cabval2s( x, a ) bli_ccabval2s( x, a ) #define bli_zabval2s( x, a ) bli_zzabval2s( x, a ) #endif // end bli_abval2s.h // begin bli_adds.h #ifndef BLIS_ADDS_H #define BLIS_ADDS_H // adds // Notes: // - The first char 
encodes the type of a. // - The second char encodes the type of y. #define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) { (y) += (a); } #define bli_dcadds( a, y ) { (y) += (a); } #define bli_ccadds( a, y ) { (y) += (a); } #define bli_zcadds( a, y ) { (y) += (a); } #define bli_szadds( a, y ) { (y) += (a); } #define bli_dzadds( a, y ) { (y) += (a); } #define bli_czadds( a, y ) { (y) += (a); } #define 
bli_zzadds( a, y ) { (y) += (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadds( a, y ) bli_ssadds( a, y ) #define bli_dadds( a, y ) bli_ddadds( a, y ) #define bli_cadds( a, y ) bli_ccadds( a, y ) #define bli_zadds( a, y ) bli_zzadds( a, y ) #endif // end bli_adds.h // begin bli_addjs.h #ifndef BLIS_ADDJS_H #define BLIS_ADDJS_H // addjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzaddjs( a, y ) 
bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) { (y) += (a); } #define bli_dcaddjs( a, y ) { (y) += (a); } #define bli_ccaddjs( a, y ) { (y) += conjf(a); } #define bli_zcaddjs( a, y ) { (y) += conj (a); } #define bli_szaddjs( a, y ) { (y) += (a); } #define bli_dzaddjs( a, y ) { (y) += (a); } #define bli_czaddjs( a, y ) { (y) += conjf(a); } #define bli_zzaddjs( a, y ) { (y) += conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saddjs( a, y ) bli_ssaddjs( a, y ) #define bli_daddjs( a, y ) bli_ddaddjs( a, y ) #define bli_caddjs( a, y ) bli_ccaddjs( a, y ) #define bli_zaddjs( a, y ) bli_zzaddjs( a, y ) #endif // end bli_addjs.h // begin bli_add3s.h #ifndef BLIS_ADD3S_H #define BLIS_ADD3S_H // add3s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of b. // - The third char encodes the type of c. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), 
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ // Without C99 complex support, each variant passes the real and imaginary parts of a, b and c to a bli_?add3ris helper (defined outside this section). // Helper used in this table: bli_sadd3ris when a and b are both real (s or d), bli_cadd3ris otherwise. #define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) ) #define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), 
bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) ) #define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) ) #define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) #define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) ) // -- (axy) = (??z) ------------------------------------------------------------ // Same layout for a double-complex c: bli_dadd3ris when a and b are both real (s or d), bli_zadd3ris otherwise. #define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define 
bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ // With C99 complex types every variant expands to the same braced statement: c is assigned a + b. #define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zscadd3s( a, 
b, c ) { (c) = (a) + (b); } #define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); } #define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); } #endif // BLIS_ENABLE_C99_COMPLEX // Same-type shorthands: each bli_Xadd3s forwards its arguments unchanged to bli_XXXadd3s. #define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c ) #define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c ) #define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c ) #define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c ) #endif // end bli_add3s.h // begin bli_axpbys.h #ifndef BLIS_AXPBYS_H #define BLIS_AXPBYS_H // axpbys // Notes: // - The first char encodes the type of a. 
// - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), 
bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), 
bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbys( a, x, b, y ) 
bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), 
bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
// Mixed-type axpbys wrappers whose y operand is read through the c-typed
// accessors (bli_creal/bli_cimag). The four letters before "axpbys" select,
// in order, the accessor family (s, d, c or z) applied to a, x, b and y.
// Each wrapper only splits its operands into real/imaginary parts and
// forwards them to bli_cxaxpbyris, which is defined outside this excerpt.
#define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
// b read through the c-typed accessors.
#define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
// b read through the z-typed accessors.
#define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
// Last two wrappers of the group whose y operand is read as c.
#define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )

// -- (axby) = (???z) ----------------------------------------------------------

// From here on y is read through bli_zreal/bli_zimag; the first three name
// letters still select the accessors used for a, x and b. All wrappers
// forward to bli_cxaxpbyris (defined outside this excerpt).
#define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define 
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
// Mixed-type axpbys wrappers whose y operand is read through the z-typed
// accessors (bli_zreal/bli_zimag). Name letters give, in order, the accessor
// family used for a, x, b and y; each wrapper forwards the real/imaginary
// parts of all four operands to bli_cxaxpbyris (defined outside this excerpt).
#define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
// b read through the d-typed accessors.
#define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
// b read through the c-typed accessors.
#define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
// b read through the z-typed accessors.
#define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
// Remaining wrappers of the group whose b and y operands are both read as z.
#define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )

// NOTE(review): the conditional this #else belongs to opens above this
// excerpt; per the trailing comment the branch below is the one used when
// BLIS_ENABLE_C99_COMPLEX is defined -- confirm against the matching #if.
#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- (axby) = (???c) ----------------------------------------------------------

// In this branch every variant has the same body: a brace-enclosed statement
// that assigns (a) * (x) + (b) * (y) to y using the language's own arithmetic
// operators. The type letters only distinguish the macro names.
#define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ccscaxpbys( 
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
// Direct-arithmetic axpbys variants (names ending in "c"): each expands to a
// brace-enclosed statement assigning (a) * (x) + (b) * (y) to y. Because the
// expansion is a bare { ... } block and y appears twice, y must be an lvalue
// and is evaluated more than once.
#define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); }
// Remaining direct-arithmetic variants with names ending in "c"; every body
// is the same { (y) = (a) * (x) + (b) * (y); } statement block.
#define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }

// -- (axby) = (???z) ----------------------------------------------------------

// Same statement-block body as above, now for names ending in "z".
#define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define 
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
// Direct-arithmetic axpbys variants with names ending in "z". Each expands to
// a brace-enclosed statement assigning (a) * (x) + (b) * (y) to y; the type
// letters only distinguish the macro names, the bodies are identical.
#define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_czdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
// Last direct-arithmetic variants with names ending in "z".
#define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }
#define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: bli_?axpbys forwards to the variant whose four type
// letters are all "?". Note that bli_ssssaxpbys and bli_ddddaxpbys are not
// defined in this excerpt; they are expected to come from earlier in the header.
#define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y )
#define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y )
#define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y )
#define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y )

#endif
// end bli_axpbys.h
// begin bli_axpbyjs.h

#ifndef BLIS_AXPBYJS_H
#define BLIS_AXPBYJS_H

// axpbyjs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of b.
// - The fourth char encodes the type of y.
// - Every variant in this group forwards the real/imaginary parts of its four
//   operands to bli_rxaxpbyjris, which is defined outside this excerpt.
//   NOTE(review): presumably the "j" marks the conjugate-x form of axpbys --
//   confirm against the definition of bli_rxaxpbyjris.

// -- (axby) = (???s) ----------------------------------------------------------

#define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), 
bli_simag(y) )
// Mixed-type axpbyjs wrappers whose y operand is read through the s-typed
// accessors (bli_sreal/bli_simag). Name letters give, in order, the accessor
// family (s, d, c or z) used for a, x, b and y; each wrapper forwards the
// real/imaginary parts of all four operands to bli_rxaxpbyjris, which is
// defined outside this excerpt.
#define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
// b read through the d-typed accessors.
#define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
// b read through the c-typed accessors.
#define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
// b read through the z-typed accessors.
#define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
// Remaining axpbyjs wrappers whose b operand is read as z and y as s; all
// forward to bli_rxaxpbyjris (defined outside this excerpt).
#define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (axby) = (???d) ----------------------------------------------------------

// From here on y is read through bli_dreal/bli_dimag.
#define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) #define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) #define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) #define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) #endif // end bli_axpbyjs.h // begin bli_axpys.h #ifndef BLIS_AXPYS_H #define BLIS_AXPYS_H // axpys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), 
bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } #define 
bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) #define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) #define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) #define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) #endif // end bli_axpys.h // begin bli_axpyjs.h #ifndef BLIS_AXPYJS_H #define BLIS_AXPYJS_H // axpyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// axpyjs macro family (body of bli_axpyjs.h).
//
// Every macro below is named bli_<a><x><y>axpyjs( a, x, y ), where the three
// type characters (s, d, c, z) give the types of a, x and y respectively.
// In the C99-complex branch further down the operation is spelled out
// directly as
//
//     y += a * conj(x)      (conjf/conj applied only when x is c or z;
//                            a real-typed x is used unchanged)
//
// In every other case the macro only splits each operand into its real and
// imaginary parts with bli_?real()/bli_?imag() and forwards the six
// components to a bli_?axpyjris helper. Those helpers and accessors are
// defined elsewhere in this header and are not visible from here.
// NOTE(review): presumably the helpers perform the same update
// component-wise -- confirm against the bli_?axpyjris definitions.

// -- (axy) = (??s) ------------------------------------------------------------
// y is of type s: all sixteen (a, x) type combinations forward to
// bli_saxpyjris. Defined regardless of BLIS_ENABLE_C99_COMPLEX.
#define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------
// y is of type d: all sixteen (a, x) type combinations forward to
// bli_daxpyjris. Defined regardless of BLIS_ENABLE_C99_COMPLEX.
#define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// The variants whose y is of type c or z have two alternative definitions,
// selected by BLIS_ENABLE_C99_COMPLEX.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------
// y is of type c. The helper is picked from the types of a and x:
//   a in {s,d}, x in {s,d}  ->  bli_saxpyjris
//   a in {s,d}, x in {c,z}  ->  bli_scaxpyjris
//   a in {c,z}, any x       ->  bli_caxpyjris
#define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------
// y is of type z. Same selection rule as the (??c) table, with the
// d/dz/z-prefixed helpers:
//   a in {s,d}, x in {s,d}  ->  bli_daxpyjris
//   a in {s,d}, x in {c,z}  ->  bli_dzaxpyjris
//   a in {c,z}, any x       ->  bli_zaxpyjris
#define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With BLIS_ENABLE_C99_COMPLEX defined, the complex-target variants are
// written as plain arithmetic on the operands themselves. Each macro expands
// to a brace-enclosed compound statement, and y is used as the left-hand
// side of '+='.

// -- (axy) = (??c) ------------------------------------------------------------
// x of type s or d: used as-is.
#define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); }
// x of type c: conjugated with conjf().
#define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
// x of type z: conjugated with conj().
#define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------
// x of type s or d: used as-is.
#define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
#define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); }
// x of type c: conjugated with conjf().
#define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
#define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); }
// x of type z: conjugated with conj().
#define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }
#define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type shorthands: bli_?axpyjs is the variant in which a, x and y all
// share the same type character.
#define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y )
#define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y )
#define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y )
#define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y )

// Closes the BLIS_AXPYJS_H include guard that is opened just before this
// macro table.
#endif
// end bli_axpyjs.h

// begin bli_axmys.h
// Include guard for the axmys macro family; its matching #endif follows the
// axmys tables further down in the file.
#ifndef BLIS_AXMYS_H
#define BLIS_AXMYS_H

// axmys

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// axmys macro tables: real-typed y, the non-C99 complex-typed y path, and the
// first part of the C99 complex-typed y path.
// Each bli_<a><x><y>axmys( a, x, y ) in the non-C99 tables splits a, x and y
// into real/imaginary parts using the bli_?real()/bli_?imag() accessors that
// match each operand's type letter, then forwards the six parts to a
// bli_*axmyris() macro (not defined in this section).

// -- (axy) = (??s) ------------------------------------------------------------
// y is type s: every combination forwards to bli_saxmyris().
#define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------
// y is type d: every combination forwards to bli_daxmyris().
#define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------
// y is type c. The forwarded macro depends on the domains of a and x:
//   a real (s/d),    x real (s/d)    -> bli_saxmyris()
//   a real (s/d),    x complex (c/z) -> bli_scaxmyris()
//   a complex (c/z), any x           -> bli_caxmyris()
#define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------
// y is type z. Same selection rule as the (??c) table, with the
// double-precision macros:
//   a real, x real    -> bli_daxmyris()
//   a real, x complex -> bli_dzaxmyris()
//   a complex, any x  -> bli_zaxmyris()
#define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With BLIS_ENABLE_C99_COMPLEX defined, every complex-typed-y variant is the
// same direct statement on the operands: y -= a * x.

// -- (axy) = (??c) ------------------------------------------------------------
#define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------
#define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); }
// Remaining (??z) entries of the C99-complex axmys table; each one is the
// direct statement y -= a * x.
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); }
#define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-letter shorthands: forward to the variant in which a, x and y all
// have the same type.
#define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y )
#define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y )
#define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y )
#define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y )

#endif
// end bli_axmys.h
// begin bli_conjs.h

#ifndef BLIS_CONJS_H
#define BLIS_CONJS_H

// conjs
// Notes:
// - The s and d variants, and the c and z variants when
//   BLIS_ENABLE_C99_COMPLEX is not defined, forward the real/imaginary parts
//   of x to bli_?conjris() (not defined in this section).
// - With BLIS_ENABLE_C99_COMPLEX defined, the c and z variants overwrite x
//   with conjf(x) and conj(x) respectively.

#define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) )
#define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) )
#define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_cconjs( x ) { (x) = conjf(x); }
#define bli_zconjs( x ) { (x) = conj (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

#endif
// end bli_conjs.h
// begin bli_copys.h

#ifndef BLIS_COPYS_H
#define BLIS_COPYS_H

// copys

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// copys variants with s-, d- and c-typed y.
// Each bli_<x><y>copys( x, y ) forwards the real/imaginary parts of x and y
// to bli_<y>copyris(), i.e. the copyris macro is chosen by the type of y.

// y is type s.
#define bli_sscopys( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopys( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopys( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopys( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is type d.
#define bli_sdcopys( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopys( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopys( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopys( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// y is type c.
// NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero.
#define bli_sccopys( x, y ) bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopys( x, y ) bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopys( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopys( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer variant: assigns x to y through a cast to gint_t.
#define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); }

// Single-letter shorthands: x and y have the same type.
#define bli_scopys( x, y ) bli_sscopys( x, y )
#define bli_dcopys( x, y ) bli_ddcopys( x, y )
#define bli_ccopys( x, y ) bli_cccopys( x, y )
#define bli_zcopys( x, y ) bli_zzcopys( x, y )
#define bli_icopys( x, y ) bli_iicopys( x, y )

#endif
// end bli_copys.h
// begin bli_copyjs.h

#ifndef BLIS_COPYJS_H
#define BLIS_COPYJS_H

// copyjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - Variants with s- or d-typed y, and all variants when
//   BLIS_ENABLE_C99_COMPLEX is not defined, forward the real/imaginary parts
//   of x and y to bli_<y>copyjris() (not defined in this section).
// - With BLIS_ENABLE_C99_COMPLEX defined, the complex-typed-y variants assign
//   directly: y = x when x is s or d, y = conjf(x) when x is c, and
//   y = conj(x) when x is z.

#define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

#define bli_sccopyjs( x, y ) { (y) = (x); }
#define bli_dccopyjs( x, y ) { (y) = (x); }
#define bli_cccopyjs( x, y ) { (y) = conjf(x); }
#define bli_zccopyjs( x, y ) { (y) = conj (x); }

#define bli_szcopyjs( x, y ) { (y) = (x); }
#define bli_dzcopyjs( x, y ) { (y) = (x); }
#define bli_czcopyjs( x, y ) { (y) = conjf(x); }
#define bli_zzcopyjs( x, y ) { (y) = conj (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Integer variant: plain assignment through a cast to gint_t.
#define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); }

// Single-letter shorthands: x and y have the same type.
#define bli_scopyjs( x, y ) bli_sscopyjs( x, y )
#define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y )
#define bli_ccopyjs( x, y ) bli_cccopyjs( x, y )
#define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y )
#define bli_icopyjs( x, y ) bli_iicopyjs( x, y )

#endif
// end bli_copyjs.h
// begin bli_copycjs.h

#ifndef BLIS_COPYCJS_H
#define BLIS_COPYCJS_H

// copycjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// copycjs variants with real-typed y, plus the complex-typed-y variants used
// when BLIS_ENABLE_C99_COMPLEX is not defined.
// Each bli_<x><y>copycjs( conjx, x, y ) passes conjx through unchanged,
// followed by the real/imaginary parts of x and y, to bli_<y>copycjris()
// (not defined in this section).

// y is type s.
#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is type d.
#define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// y is type c.
#define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// y is type z.
#define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Real-typed x into complex-typed y: plain assignment; conjx is not used.
#define bli_sccopycjs( conjx, x, y ) { (y) = (x); }
// C99-complex copycjs variants (the branch taken when
// BLIS_ENABLE_C99_COMPLEX is defined).
// - Real-typed x (s/d): plain assignment y = x; conjx is not used.
// - Complex-typed x (c/z): y receives conjf(x)/conj(x) when
//   bli_is_conj( conjx ) is true, and x unchanged otherwise.
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); }
#define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#define bli_szcopycjs( conjx, x, y ) { (y) = (x); }
#define bli_dzcopycjs( conjx, x, y ) { (y) = (x); }
#define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Integer variant: assignment through a cast to gint_t; conjx is not used.
#define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); }

// Single-letter shorthands: x and y have the same type.
#define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y )
#define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y )
#define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y )
#define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y )
#define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y )

#endif
// end bli_copycjs.h
// begin bli_copynzs.h

#ifndef BLIS_COPYNZS_H
#define BLIS_COPYNZS_H

// copynzs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// copynzs variants with s-, d- and c-typed y.
// These forward to the same bli_?copyris() macros that the copys family
// uses. The difference visible here is for real-typed x with complex-typed
// y, where the real-domain copyris macro is used (see the NOTEs below).

// y is type s.
#define bli_sscopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopynzs( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopynzs( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// y is type d.
#define bli_sdcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopynzs( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopynzs( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// y is type c.
// NOTE: Use of scopyris() is so we don't touch the imaginary part of y.
#define bli_sccopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopynzs( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopynzs( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer variant: assigns x to y through a cast to gint_t.
#define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); }

// Single-letter shorthands: x and y have the same type.
#define bli_scopynzs( x, y ) bli_sscopynzs( x, y )
#define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y )
#define bli_ccopynzs( x, y ) bli_cccopynzs( x, y )
#define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y )
#define bli_icopynzs( x, y ) bli_iicopynzs( x, y )

#endif
// end bli_copynzs.h
// begin bli_copyjnzs.h

#ifndef BLIS_COPYJNZS_H
#define BLIS_COPYJNZS_H

// copyjnzs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - The variants below (s- and d-typed y) forward the real/imaginary parts
//   of x and y to bli_<y>copyjris() (not defined in this section).

#define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// NOTE: Use of scopyjris() (implemented in terms of scopyris()) is so we
// don't touch the imaginary part of y.
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
// Complex-typed x into c-typed y goes through the complex-domain macro.
#define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyjris() (implemented in terms of dcopyris()) is so we
// don't touch the imaginary part of y.
#define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
// Complex-typed x into z-typed y goes through the complex-domain macro.
#define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

// Integer variant: assigns x to y through a cast to gint_t.
#define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); }

// Single-letter shorthands: x and y have the same type.
#define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y )
#define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y )
#define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y )
#define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y )
#define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y )

#endif
// end bli_copyjnzs.h
// begin bli_dots.h

#ifndef BLIS_DOTS_H
#define BLIS_DOTS_H

// dots

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - The third char encodes the type of rho.
// dots is a pure renaming layer over axpys: bli_<x><y><r>dots( x, y, a )
// expands to bli_<x><y><r>axpys( x, y, a ) with the same three type letters
// and the arguments in the same order. The third macro parameter is spelled
// `a` here; it is the operand the notes call rho.
// NOTE(review): bli_*axpys() is not defined in this section; presumably it
// accumulates the product of its first two arguments into the third --
// confirm against bli_axpys.h.

// rho is type s.
#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a )
#define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a )
#define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a )
#define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a )
#define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a )
#define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a )
#define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a )
#define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a )
#define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a )
#define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a )
#define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a )
#define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a )
#define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a )
#define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a )
#define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a )
#define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a )

// rho is type d.
#define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a )
#define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a )
#define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a )
#define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a )
#define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a )
#define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a )
#define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a )
#define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a )
#define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a )
#define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a )
#define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a )
#define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a )
#define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a )
#define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a )
#define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a )
#define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a )

// rho is type c.
#define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a )
#define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a )
#define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a )
#define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a )
#define bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a )
#define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a )
#define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a )
#define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a )
#define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a )
#define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a )
#define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a )
#define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a )
#define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a )
#define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a )
#define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a )
#define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a )

// rho is type z.
#define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a )
#define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a )
#define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a )
#define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a )
#define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a )
#define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a )
#define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a )
#define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a )
#define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a )
#define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a )
#define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a )
#define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a )
#define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a )
#define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a )
#define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a )
#define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a )

// Single-letter shorthands: x, y and rho all have the same type.
#define bli_sdots( x, y, a ) bli_sssdots( x, y, a )
#define bli_ddots( x, y, a ) bli_ddddots( x, y, a )
#define bli_cdots( x, y, a ) bli_cccdots( x, y, a )
#define bli_zdots( x, y, a ) bli_zzzdots( x, y, a )

#endif
// end bli_dots.h
// begin bli_dotjs.h

#ifndef BLIS_DOTJS_H
#define BLIS_DOTJS_H

// dotjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// - The third char encodes the type of rho.
// - x is used in conjugated form.

// Each bli_???dotjs( x, y, a ) macro forwards to a bli_???axpyjs macro with
// x and y exchanged in the argument list. The first two type characters of
// the target name are exchanged to match, so bli_XYRdotjs( x, y, a ) expands
// to bli_YXRaxpyjs( y, x, a ). The bli_???axpyjs macros are defined elsewhere.

// -- third type character = s --
#define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a )
#define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a )
#define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a )
#define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a )
#define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a )
#define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a )
#define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a )
#define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a )
#define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a )
#define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a )
#define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a )
#define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a )
#define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a )
#define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a )
#define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a )
#define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a )

// -- third type character = d --
#define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a )
#define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a )
#define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a )
#define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a )
#define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a )
#define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a )
#define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a )
#define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a )
#define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a )
#define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a )
#define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a )
#define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a )
#define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a )
#define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a )
#define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a )
#define bli_zzddotjs( x, y, a ) bli_zzdaxpyjs( y, x, a )

// -- third type character = c --
#define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a )
#define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a )
#define bli_cscdotjs( x, y, a ) bli_sccaxpyjs( y, x, a )
#define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a )
#define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a )
#define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a )
#define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a )
#define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a )
#define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a )
#define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a )
#define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a )
#define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a )
#define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a )
#define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a )
#define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a )
#define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a )

// -- third type character = z --
#define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a )
#define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a )
#define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a )
#define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a )
#define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a )
#define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a )
#define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a )
#define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a )
#define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a )
#define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a )
#define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a )
#define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a )
#define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a )
#define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a )
#define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a )
#define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a )

// Single-character aliases: all three operands share one type.
#define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a )
#define bli_ddotjs( x, y, a ) bli_ddddotjs( x, y, a )
#define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a )
#define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a )

#endif
// end bli_dotjs.h
// begin bli_eq.h

#ifndef BLIS_EQ_H
#define BLIS_EQ_H

// eq (passed by value)

// Each macro expands to an expression that is true when a and b compare
// equal with ==. Without C99 complex support the complex variants compare
// the real and imaginary parts separately via the bli_?real / bli_?imag
// accessors; with C99 complex support they compare the values directly.
#define bli_seq( a, b ) ( (a) == (b) )
#define bli_deq( a, b ) ( (a) == (b) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) )
#define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_ceq( a, b ) ( (a) == (b) )
#define bli_zeq( a, b ) ( (a) == (b) )
#endif // BLIS_ENABLE_C99_COMPLEX
#define bli_ieq( a, b ) ( (a) == (b) )

// eqtori (passed by value)

// Compare a against a value supplied as separate real (br) and imaginary
// (bi) parts. The s and d variants compare against br only; bi is unused in
// their expansions.
#define bli_seqtori( a, br, bi ) ( (a) == (br) )
#define bli_deqtori( a, br, bi ) ( (a) == (br) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) )
#define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )
#define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) )
#endif // BLIS_ENABLE_C99_COMPLEX

// eqa (passed by address)

// Cast both pointers to the variant's element type, dereference them, and
// compare the pointed-to values with the corresponding by-value macro.
#define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) )
#define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) )
#define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) )
#define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) )
#define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) )

// eq1

// True when a equals one (real part 1, imaginary part 0).
#define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F )
#define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 )
#define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F )
#define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 )
#define bli_ieq1( a ) bli_ieq ( (a), 1 )

// eq0

// True when a equals zero (real part 0, imaginary part 0).
#define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F )
#define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 )
#define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F )
#define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 )
#define bli_ieq0( a ) bli_ieq ( (a), 0 )

// eqm1

// True when a equals minus one (real part -1, imaginary part 0).
#define bli_seqm1( a ) bli_seqtori( (a), -1.0F, 0.0F )
#define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 )
#define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F )
#define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 )
#define bli_ieqm1( a ) bli_ieq ( (a), -1 )

#endif
// end bli_eq.h
// begin bli_fprints.h

#ifndef BLIS_FPRINTS_H
#define BLIS_FPRINTS_H

// prints

// Write x to the stream `file` with fprintf, using `spec` as the format
// string. The complex variants print the real part, the literal " + ", the
// imaginary part, and a trailing " ", each component formatted with `spec`.
// Every macro expands to a brace-enclosed compound statement.
#define bli_sfprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}
#define bli_dfprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}
#define bli_cfprints( file, spec, x ) \
{ \
    fprintf( file, spec, bli_creal(x) ); \
    fprintf( file, " + " ); \
    fprintf( file, spec, bli_cimag(x) ); \
    fprintf( file, " " ); \
}
#define bli_zfprints( file, spec, x ) \
{ \
    fprintf( file, spec, bli_zreal(x) ); \
    fprintf( file, " + " ); \
    fprintf( file, spec, bli_zimag(x) ); \
    fprintf( file, " " ); \
}
#define bli_ifprints( file, spec, x ) \
{ \
    fprintf( file, spec, (x) ); \
}

#endif
// end bli_fprints.h
// begin bli_inverts.h

#ifndef BLIS_INVERTS_H
#define BLIS_INVERTS_H

// inverts

// Notes:
// - The first char encodes the type of x.

// The s and d variants, and the complex variants when C99 complex is not
// enabled, forward the real and imaginary parts of x to the matching
// bli_?invertris macro (defined elsewhere). With C99 complex enabled, the
// complex variants assign 1/x back to x directly.
#define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) )
#define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) )
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) )
#define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
#define bli_cinverts( x ) { (x) = 1.0F / (x); }
#define bli_zinverts( x ) { (x) = 1.0 / (x); }
#endif // BLIS_ENABLE_C99_COMPLEX

#endif
// end bli_inverts.h
// begin bli_invscals.h

#ifndef BLIS_INVSCALS_H
#define BLIS_INVSCALS_H

// invscals

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Mixed-type invscals macros. Each one passes the real and imaginary parts
// of a and y, in that order, to a bli_*invscalris helper defined elsewhere.
// The helper is chosen from the type of y and, for complex y, from whether a
// is real (s/d) or complex (c/z).

// -- y is real (s or d) --
#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

// -- y is complex (c or z) --
#ifndef BLIS_ENABLE_C99_COMPLEX
// Real a uses the bli_sc / bli_dz helpers; complex a uses the bli_c / bli_z
// helpers.
#define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscals( a, y ) bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex, y is divided by a in place using the /= operator.
#define bli_scinvscals( a, y ) { (y) /= (a); }
#define bli_dcinvscals( a, y ) { (y) /= (a); }
#define bli_ccinvscals( a, y ) { (y) /= (a); }
#define bli_zcinvscals( a, y ) { (y) /= (a); }
#define bli_szinvscals( a, y ) { (y) /= (a); }
#define bli_dzinvscals( a, y ) { (y) /= (a); }
#define bli_czinvscals( a, y ) { (y) /= (a); }
#define bli_zzinvscals( a, y ) { (y) /= (a); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character aliases: a and y share one type.
#define bli_sinvscals( a, y ) bli_ssinvscals( a, y )
#define bli_dinvscals( a, y ) bli_ddinvscals( a, y )
#define bli_cinvscals( a, y ) bli_ccinvscals( a, y )
#define bli_zinvscals( a, y ) bli_zzinvscals( a, y )

#endif
// end bli_invscals.h
// begin bli_invscaljs.h

#ifndef BLIS_INVSCALJS_H
#define BLIS_INVSCALJS_H

// invscaljs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// Same table layout as the invscals macros, but dispatching to the
// bli_*invscaljris helpers (defined elsewhere).

// -- y is real (s or d) --
#define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

// -- y is complex (c or z) --
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex, y is divided in place. A real a (s/d) is used as is;
// a complex a is conjugated first, with conjf() for c and conj() for z.
#define bli_scinvscaljs( a, y ) { (y) /= (a); }
#define bli_dcinvscaljs( a, y ) { (y) /= (a); }
#define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zcinvscaljs( a, y ) { (y) /= conj (a); }
#define bli_szinvscaljs( a, y ) { (y) /= (a); }
#define bli_dzinvscaljs( a, y ) { (y) /= (a); }
#define bli_czinvscaljs( a, y ) { (y) /= conjf(a); }
#define bli_zzinvscaljs( a, y ) { (y) /= conj (a); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character aliases: a and y share one type.
#define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y )
#define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y )
#define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y )
#define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y )

#endif
// end bli_invscaljs.h
// begin bli_neg2s.h

#ifndef BLIS_NEG2S_H
#define BLIS_NEG2S_H

// neg2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Mixed-type neg2s macros. Each one passes the real and imaginary parts of
// x and y, in that order, to the bli_?neg2ris helper (defined elsewhere)
// selected by the type of y.

// -- y is real (s or d) --
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// -- y is complex (c or z) --
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex, y is assigned the negation of x directly.
#define bli_scneg2s( x, y ) { (y) = -(x); }
#define bli_dcneg2s( x, y ) { (y) = -(x); }
#define bli_ccneg2s( x, y ) { (y) = -(x); }
#define bli_zcneg2s( x, y ) { (y) = -(x); }
#define bli_szneg2s( x, y ) { (y) = -(x); }
#define bli_dzneg2s( x, y ) { (y) = -(x); }
#define bli_czneg2s( x, y ) { (y) = -(x); }
#define bli_zzneg2s( x, y ) { (y) = -(x); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character aliases: x and y share one type.
#define bli_sneg2s( x, y ) bli_ssneg2s( x, y )
#define bli_dneg2s( x, y ) bli_ddneg2s( x, y )
#define bli_cneg2s( x, y ) bli_ccneg2s( x, y )
#define bli_zneg2s( x, y ) bli_zzneg2s( x, y )

#endif
// end bli_neg2s.h
// begin bli_rands.h

#ifndef BLIS_RANDS_H
#define BLIS_RANDS_H

// rands

// Assign a pseudo-random value to a. The real variants compute
// rand() / ( RAND_MAX / 2 ) - 1 in double precision, which maps rand()'s
// range [0, RAND_MAX] onto [-1, 1]. The complex variants draw the real and
// imaginary parts independently and store them with bli_csets / bli_zsets
// (defined elsewhere). The complex variants declare locals named ar and ai
// inside their braces.
#define bli_srands( a ) \
{ \
    (a) = ( float ) ( ( double ) rand() / \
                      ( ( double ) RAND_MAX / 2.0 ) \
                    ) - 1.0F; \
}
#define bli_drands( a ) \
{ \
    (a) = ( double ) ( ( double ) rand() / \
                       ( ( double ) RAND_MAX / 2.0 ) \
                     ) - 1.0; \
}
#define bli_crands( a ) \
{ \
    float ar, ai; \
    \
    bli_srands( ar ); \
    bli_srands( ai ); \
    \
    bli_csets( ar, ai, (a) ); \
}
#define bli_zrands( a ) \
{ \
    double ar, ai; \
    \
    bli_drands( ar ); \
    bli_drands( ai ); \
    \
    bli_zsets( ar, ai, (a) ); \
}

#endif
// end bli_rands.h
// begin bli_randnp2s.h

#ifndef BLIS_RANDNP2S_H
#define BLIS_RANDNP2S_H

// randnp2s

// The single-precision variant reuses the double-precision macro; the
// result is converted when it is assigned to a.
#define bli_srandnp2s( a ) \
{ \
    bli_drandnp2s( a ); \
}

// Earlier implementation, compiled out by the #if 0 guard and retained for
// reference only.
#if 0
#define bli_drandnp2s_prev( a ) \
{ \
    const double m_max  = 3.0; \
    const double m_max2 = m_max + 2.0; \
    double       t; \
    double       r_val; \
    \
    \
    \
    \
    t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \
    \
    \
    if ( t == m_max2 ) t = t - 1.0; \
    \
    \
    t = floor( t ); \
    \
    \
    if ( t == 0.0 ) r_val = 0.0; \
    else \
    { \
        \
        \
        double s_exp, s_val; \
        \
        \
        PASTEMAC(d,rands)( s_exp ); \
        PASTEMAC(d,rands)( s_val ); \
        \
        \
        if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \
        else               r_val = pow( 2.0,   t - 1.0  ); \
        \
        \
        if ( s_val < 0.0 ) r_val = -r_val; \
    } \
    \
    \
    r_val = r_val / pow( 2.0, m_max ); \
    \
    \
    \
    a = r_val; \
}
#endif

// Assign to a either 0 or a signed negative power of two. An integer t is
// drawn from 0 .. m_max+1 (0 .. 7 here); t == 0 yields 0, otherwise the
// magnitude is 2^-(t-1), i.e. one of 1, 1/2, ..., 1/64. The sign is negated
// when a second random draw is negative. PASTEMAC(d,rands) is presumably
// bli_drands -- confirm against the PASTEMAC definition.
#define bli_drandnp2s( a ) \
{ \
    const double m_max  = 6.0; \
    const double m_max2 = m_max + 2.0; \
    double       t; \
    double       r_val; \
    \
    \
    /* Draw t, retrying when floor() lands on m_max2 itself. */ \
    do \
    { \
        \
        t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \
        \
        \
        t = floor( t ); \
    } \
    \
    while ( m_max2 <= t ); \
    \
    \
    if ( t == 0.0 ) r_val = 0.0; \
    else \
    { \
        \
        \
        double s_val; \
        \
        /* Magnitude: 2 raised to the non-positive exponent -(t - 1). */ \
        r_val = pow( 2.0, -(t - 1.0) ); \
        \
        /* Sign: taken from a second random draw. */ \
        PASTEMAC(d,rands)( s_val ); \
        \
        \
        if ( s_val < 0.0 ) r_val = -r_val; \
    } \
    \
    \
    \
    a = r_val; \
}
// The complex variants draw the real and imaginary parts independently with
// the real-typed macros above and store them with bli_csets / bli_zsets.
#define bli_crandnp2s( a ) \
{ \
    float ar, ai; \
    \
    bli_srandnp2s( ar ); \
    bli_srandnp2s( ai ); \
    \
    bli_csets( ar, ai, (a) ); \
}
#define bli_zrandnp2s( a ) \
{ \
    double ar, ai; \
    \
    bli_drandnp2s( ar ); \
    bli_drandnp2s( ai ); \
    \
    bli_zsets( ar, ai, (a) ); \
}

#endif
// end bli_randnp2s.h
// begin bli_scals.h

#ifndef BLIS_SCALS_H
#define BLIS_SCALS_H

// scals

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// Each macro passes the real and imaginary parts of a and y, in that order,
// to a bli_*scalris helper defined elsewhere. The helper is chosen from the
// type of y and, for complex y, from whether a is real (s/d) or complex
// (c/z).

// -- y is real (s or d) --
#define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

// -- y is complex (c or z) --
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex, y is multiplied by a in place using the *= operator.
#define bli_scscals( a, y ) { (y) *= (a); }
#define bli_dcscals( a, y ) { (y) *= (a); }
#define bli_ccscals( a, y ) { (y) *= (a); }
#define bli_zcscals( a, y ) { (y) *= (a); }
#define bli_szscals( a, y ) { (y) *= (a); }
#define bli_dzscals( a, y ) { (y) *= (a); }
#define bli_czscals( a, y ) { (y) *= (a); }
#define bli_zzscals( a, y ) { (y) *= (a); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character aliases: a and y share one type.
#define bli_sscals( a, y ) bli_ssscals( a, y )
#define bli_dscals( a, y ) bli_ddscals( a, y )
#define bli_cscals( a, y ) bli_ccscals( a, y )
#define bli_zscals( a, y ) bli_zzscals( a, y )

#endif
// end bli_scals.h
// begin bli_scaljs.h

#ifndef BLIS_SCALJS_H
#define BLIS_SCALJS_H

// scaljs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// Same table layout as the scals macros, but dispatching to the
// bli_*scaljris helpers (defined elsewhere).

// -- y is real (s or d) --
#define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )
#define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

// -- y is complex (c or z) --
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccscaljs( a, y ) bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )
#define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex, y is multiplied in place. A real a (s/d) is used as is;
// a complex a is conjugated first, with conjf() for c and conj() for z.
#define bli_scscaljs( a, y ) { (y) *= (a); }
#define bli_dcscaljs( a, y ) { (y) *= (a); }
#define bli_ccscaljs( a, y ) { (y) *= conjf(a); }
#define bli_zcscaljs( a, y ) { (y) *= conj (a); }
#define bli_szscaljs( a, y ) { (y) *= (a); }
#define bli_dzscaljs( a, y ) { (y) *= (a); }
#define bli_czscaljs( a, y ) { (y) *= conjf(a); }
#define bli_zzscaljs( a, y ) { (y) *= conj (a); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character aliases: a and y share one type.
#define bli_sscaljs( a, y ) bli_ssscaljs( a, y )
#define bli_dscaljs( a, y ) bli_ddscaljs( a, y )
#define bli_cscaljs( a, y ) bli_ccscaljs( a, y )
#define bli_zscaljs( a, y ) bli_zzscaljs( a, y )

#endif
// end bli_scaljs.h
// begin bli_scalcjs.h

#ifndef BLIS_SCALCJS_H
#define BLIS_SCALCJS_H

// scalcjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Mixed-type scalcjs macros. Each one passes conjx followed by the real and
// imaginary parts of x and y to a bli_*scalcjris helper defined elsewhere.
// The helper is chosen from the type of y and, for complex y, from whether x
// is real (s/d) or complex (c/z).

// -- y is real (s or d) --
#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// -- y is complex (c or z) --
#ifndef BLIS_ENABLE_C99_COMPLEX
#define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#else // ifdef BLIS_ENABLE_C99_COMPLEX
// With C99 complex, y is multiplied in place. For real x (s/d) the conjx
// argument is not used in the expansion; for complex x, x is conjugated
// (conjf() for c, conj() for z) only when bli_is_conj( conjx ) is true.
#define bli_scscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }
#define bli_szscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); }
#define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); }
#define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); }
#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character aliases: x and y share one type.
#define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y )
#define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y )
#define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y )
#define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y )

#endif
// end bli_scalcjs.h
// begin bli_scal2s.h

#ifndef BLIS_SCAL2S_H
#define BLIS_SCAL2S_H

// scal2s

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// Each bli_???scal2s macro passes the real and imaginary parts of a, x and
// y, in that order, to one of the bli_??scal2ris helpers (defined
// elsewhere). The helper is selected as follows in the tables below:
// - y real (s/d): bli_roscal2ris when a and x are both complex, otherwise
//   bli_rxscal2ris.
// - y complex (c/z), without C99 complex:
//     a real,    x real    -> bli_rxscal2ris
//     a complex, x real    -> bli_rcscal2ris
//     a real,    x complex -> bli_crscal2ris
//     a complex, x complex -> bli_cxscal2ris
// - y complex (c/z), with C99 complex: y is assigned (a) * (x) directly.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_szdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_sczscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czcscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); }
#define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-character aliases: a, x and y share one type.
#define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y )
#define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y )
#define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y )
#define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y )

#endif
// end bli_scal2s.h
// begin bli_scal2js.h

#ifndef BLIS_SCAL2JS_H
#define BLIS_SCAL2JS_H

// scal2js

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define 
bli_czsscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddscal2js( a, x, y ) bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddscal2js( a, x, y ) bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_szdscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdscal2js( a, x, y ) bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdscal2js( a, x, y ) bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) #define bli_cccscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzscal2js( a, x, y ) bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzscal2js( a, x, y ) bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzscal2js( a, x, y ) bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzscal2js( a, x, y ) bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) 
#define bli_sczscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzscal2js( a, x, y ) bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzscal2js( a, x, y ) bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzscal2js( a, x, y ) bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzscal2js( a, x, y ) bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_dscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zscscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sdcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_ddcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cdcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zdcscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_dccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_cccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_zccscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_szcscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_dzcscal2js( a, x, y ) { (y) = (a) * 
conj(x); } #define bli_czcscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_zzcscal2js( a, x, y ) { (y) = (a) * conj(x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_dszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zszscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sdzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_ddzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_cdzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_zdzscal2js( a, x, y ) { (y) = (a) * (x); } #define bli_sczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_dczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_cczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_zczscal2js( a, x, y ) { (y) = (a) * conjf(x); } #define bli_szzscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_dzzscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_czzscal2js( a, x, y ) { (y) = (a) * conj(x); } #define bli_zzzscal2js( a, x, y ) { (y) = (a) * conj(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscal2js( a, x, y ) bli_sssscal2js( a, x, y ) #define bli_dscal2js( a, x, y ) bli_dddscal2js( a, x, y ) #define bli_cscal2js( a, x, y ) bli_cccscal2js( a, x, y ) #define bli_zscal2js( a, x, y ) bli_zzzscal2js( a, x, y ) #endif // end bli_scal2js.h // begin bli_set0s.h #ifndef BLIS_SET0S_H #define BLIS_SET0S_H #define bli_sset0s( a ) bli_ssets( 0.0F, 0.0F, (a) ) #define bli_dset0s( a ) bli_dsets( 0.0 , 0.0 , (a) ) #define bli_cset0s( a ) bli_csets( 0.0F, 0.0F, (a) ) #define bli_zset0s( a ) bli_zsets( 0.0 , 0.0 , (a) ) #endif // end bli_set0s.h // begin bli_set1s.h #ifndef BLIS_SET1S_H #define BLIS_SET1S_H #define bli_sset1s( a ) bli_ssets( 1.0F, 0.0F, (a) ) #define bli_dset1s( a ) bli_dsets( 1.0 , 0.0 , (a) ) #define bli_cset1s( a ) bli_csets( 1.0F, 0.0F, (a) ) #define bli_zset1s( a ) bli_zsets( 1.0 , 0.0 , (a) ) 
#endif // end bli_set1s.h // begin bli_seti0s.h #ifndef BLIS_SETI0S_H #define BLIS_SETI0S_H #define bli_sseti0s( a ) bli_ssetis( 0.0F, (a) ) #define bli_dseti0s( a ) bli_dsetis( 0.0 , (a) ) #define bli_cseti0s( a ) bli_csetis( 0.0F, (a) ) #define bli_zseti0s( a ) bli_zsetis( 0.0 , (a) ) #endif // end bli_seti0s.h // begin bli_sqrt2s.h #ifndef BLIS_SQRT2S_H #define BLIS_SQRT2S_H // sqrt2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sssqrt2s( x, a ) bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) ) #define bli_dssqrt2s( x, a ) bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) ) #define bli_cssqrt2s( x, a ) bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) ) #define bli_zssqrt2s( x, a ) bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) ) #define bli_sdsqrt2s( x, a ) bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) ) #define bli_ddsqrt2s( x, a ) bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) ) #define bli_cdsqrt2s( x, a ) bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) ) #define bli_zdsqrt2s( x, a ) bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) ) #define bli_scsqrt2s( x, a ) bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcsqrt2s( x, a ) bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccsqrt2s( x, a ) bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcsqrt2s( x, a ) bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szsqrt2s( x, a ) bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzsqrt2s( x, a ) bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czsqrt2s( x, a ) bli_zsqrt2ris( bli_creal(x), 
bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzsqrt2s( x, a ) bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sssqrt2s( x, a ) { (a) = ( float ) sqrtf( (x) ) ; } #define bli_dssqrt2s( x, a ) { (a) = ( float ) sqrt ( (x) ) ; } #define bli_cssqrt2s( x, a ) { (a) = ( float )bli_creal( csqrtf( (x) ) ); } #define bli_zssqrt2s( x, a ) { (a) = ( float )bli_zreal( csqrt ( (x) ) ); } #define bli_sdsqrt2s( x, a ) { (a) = ( double ) sqrtf( (x) ) ; } #define bli_ddsqrt2s( x, a ) { (a) = ( double ) sqrt ( (x) ) ; } #define bli_cdsqrt2s( x, a ) { (a) = ( double )bli_creal( csqrtf( (x) ) ); } #define bli_zdsqrt2s( x, a ) { (a) = ( double )bli_zreal( csqrt ( (x) ) ); } #define bli_scsqrt2s( x, a ) { (a) = ( scomplex ) sqrtf( (x) ) ; } #define bli_dcsqrt2s( x, a ) { (a) = ( scomplex ) sqrt ( (x) ) ; } #define bli_ccsqrt2s( x, a ) { (a) = ( scomplex ) csqrtf( (x) ) ; } #define bli_zcsqrt2s( x, a ) { (a) = ( scomplex ) csqrt ( (x) ) ; } #define bli_szsqrt2s( x, a ) { (a) = ( dcomplex ) sqrtf( (x) ) ; } #define bli_dzsqrt2s( x, a ) { (a) = ( dcomplex ) sqrt ( (x) ) ; } #define bli_czsqrt2s( x, a ) { (a) = ( dcomplex ) csqrtf( (x) ) ; } #define bli_zzsqrt2s( x, a ) { (a) = ( dcomplex ) csqrt ( (x) ) ; } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssqrt2s( x, a ) bli_sssqrt2s( x, a ) #define bli_dsqrt2s( x, a ) bli_ddsqrt2s( x, a ) #define bli_csqrt2s( x, a ) bli_ccsqrt2s( x, a ) #define bli_zsqrt2s( x, a ) bli_zzsqrt2s( x, a ) #endif // end bli_sqrt2s.h // begin bli_subs.h #ifndef BLIS_SUBS_H #define BLIS_SUBS_H // subs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. 
#define bli_sssubs( a, y ) bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubs( a, y ) bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubs( a, y ) bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubs( a, y ) bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubs( a, y ) bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubs( a, y ) bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubs( a, y ) bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubs( a, y ) bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubs( a, y ) bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubs( a, y ) bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubs( a, y ) bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubs( a, y ) bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubs( a, y ) bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubs( a, y ) bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubs( a, y ) bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubs( a, y ) { (y) -= (a); } #define bli_dcsubs( a, y ) { (y) -= (a); } #define bli_ccsubs( a, y ) { (y) -= (a); } #define bli_zcsubs( a, y ) { (y) -= (a); } #define bli_szsubs( a, y ) { (y) -= (a); } #define bli_dzsubs( a, y ) { (y) -= (a); } #define bli_czsubs( a, y ) { (y) -= (a); } #define bli_zzsubs( a, y ) { (y) -= (a); } #endif // 
BLIS_ENABLE_C99_COMPLEX #define bli_ssubs( a, y ) bli_sssubs( a, y ) #define bli_dsubs( a, y ) bli_ddsubs( a, y ) #define bli_csubs( a, y ) bli_ccsubs( a, y ) #define bli_zsubs( a, y ) bli_zzsubs( a, y ) #endif // end bli_subs.h // begin bli_subjs.h #ifndef BLIS_SUBJS_H #define BLIS_SUBJS_H // subjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_sssubjs( a, y ) bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dssubjs( a, y ) bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_cssubjs( a, y ) bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zssubjs( a, y ) bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdsubjs( a, y ) bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddsubjs( a, y ) bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdsubjs( a, y ) bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdsubjs( a, y ) bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcsubjs( a, y ) bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccsubjs( a, y ) bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcsubjs( a, y ) bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szsubjs( a, y ) bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzsubjs( a, y ) bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czsubjs( a, y ) bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzsubjs( a, y ) bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), 
bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsubjs( a, y ) { (y) -= (a); } #define bli_dcsubjs( a, y ) { (y) -= (a); } #define bli_ccsubjs( a, y ) { (y) -= conjf(a); } #define bli_zcsubjs( a, y ) { (y) -= conj (a); } #define bli_szsubjs( a, y ) { (y) -= (a); } #define bli_dzsubjs( a, y ) { (y) -= (a); } #define bli_czsubjs( a, y ) { (y) -= conjf(a); } #define bli_zzsubjs( a, y ) { (y) -= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ssubjs( a, y ) bli_sssubjs( a, y ) #define bli_dsubjs( a, y ) bli_ddsubjs( a, y ) #define bli_csubjs( a, y ) bli_ccsubjs( a, y ) #define bli_zsubjs( a, y ) bli_zzsubjs( a, y ) #endif // end bli_subjs.h // begin bli_swaps.h #ifndef BLIS_SWAPS_H #define BLIS_SWAPS_H // swaps // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_ssswaps( x, y ) \ { \ float w; \ bli_sscopys( (y), (w) ); \ bli_sscopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dsswaps( x, y ) \ { \ double w; \ bli_sdcopys( (y), (w) ); \ bli_dscopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_csswaps( x, y ) \ { \ scomplex w; \ bli_sccopys( (y), (w) ); \ bli_cscopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zsswaps( x, y ) \ { \ dcomplex w; \ bli_szcopys( (y), (w) ); \ bli_zscopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sdswaps( x, y ) \ { \ float w; \ bli_dscopys( (y), (w) ); \ bli_sdcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_ddswaps( x, y ) \ { \ double w; \ bli_ddcopys( (y), (w) ); \ bli_ddcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_cdswaps( x, y ) \ { \ scomplex w; \ bli_dccopys( (y), (w) ); \ bli_cdcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zdswaps( x, y ) \ { \ dcomplex w; \ bli_dzcopys( (y), (w) ); \ bli_zdcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_scswaps( x, y ) \ { \ float w; \ bli_cscopys( (y), (w) ); \ bli_sccopys( (x), (y) ); \ bli_sscopys( (w), 
(x) ); \ } #define bli_dcswaps( x, y ) \ { \ double w; \ bli_cdcopys( (y), (w) ); \ bli_dccopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_ccswaps( x, y ) \ { \ scomplex w; \ bli_cccopys( (y), (w) ); \ bli_cccopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zcswaps( x, y ) \ { \ dcomplex w; \ bli_czcopys( (y), (w) ); \ bli_zccopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_szswaps( x, y ) \ { \ float w; \ bli_zscopys( (y), (w) ); \ bli_szcopys( (x), (y) ); \ bli_sscopys( (w), (x) ); \ } #define bli_dzswaps( x, y ) \ { \ double w; \ bli_zdcopys( (y), (w) ); \ bli_dzcopys( (x), (y) ); \ bli_ddcopys( (w), (x) ); \ } #define bli_czswaps( x, y ) \ { \ scomplex w; \ bli_zccopys( (y), (w) ); \ bli_czcopys( (x), (y) ); \ bli_cccopys( (w), (x) ); \ } #define bli_zzswaps( x, y ) \ { \ dcomplex w; \ bli_zzcopys( (y), (w) ); \ bli_zzcopys( (x), (y) ); \ bli_zzcopys( (w), (x) ); \ } #define bli_sswaps( x, y ) bli_ssswaps( x, y ) #define bli_dswaps( x, y ) bli_ddswaps( x, y ) #define bli_cswaps( x, y ) bli_ccswaps( x, y ) #define bli_zswaps( x, y ) bli_zzswaps( x, y ) #endif // end bli_swaps.h // begin bli_xpbys.h #ifndef BLIS_XPBYS_H #define BLIS_XPBYS_H // xpbys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. 
// Real-destination xpbys variants. Whatever the types of x and b, every macro
// in the two groups below forwards to the same bli_rxxpbyris kernel, passing
// the real/imaginary parts of x, b and y obtained with the bli_?real() /
// bli_?imag() accessor that matches each operand's type letter.
// NOTE(review): by its name the operation is presumably y := x + b * y --
// confirm against the definition of bli_rxxpbyris.

// -- (xby) = (??s) ------------------------------------------------------------

#define bli_sssxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dssxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cssxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zssxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )

#define bli_sdsxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddsxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdsxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdsxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )

#define bli_scsxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcsxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccsxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcsxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )

#define bli_szsxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzsxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czsxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzsxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsdxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csdxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsdxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )

#define bli_sddxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dddxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cddxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zddxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_scdxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcdxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccdxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcdxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_szdxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
// Last three (xby) = (??d) variants (b of type z). Each forwards the real and
// imaginary components of x, b and y to bli_rxxpbyris.
#define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )

// Complex-output variants. Without BLIS_ENABLE_C99_COMPLEX each variant passes
// the components extracted with bli_?real()/bli_?imag() to one of three
// kernels (defined elsewhere), selected as follows in the tables below:
//   - b complex (c or z)         -> bli_cxxpbyris
//   - b real, x complex (c or z) -> bli_crxpbyris
//   - b real, x real             -> bli_rxxpbyris
// With BLIS_ENABLE_C99_COMPLEX every variant is the plain C expression
// y = x + b * y.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )

#define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )

#define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zccxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )

#define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zszxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Single-type aliases: x, b and y all share the named type.
#define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y )
#define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y )
#define bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y )
#define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y )

#endif
// end bli_xpbys.h
// begin bli_xpbyjs.h


#ifndef BLIS_XPBYJS_H
#define BLIS_XPBYJS_H

// xpbyjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; 
++ii ) bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); 
} else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( 
dim_t jj = 0; jj < n; ++jj ) bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } 
else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( 
dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end bli_copys_mxn.h // begin bli_scal2s_mxn.h #ifndef BLIS_SCAL2S_MXN_H #define BLIS_SCAL2S_MXN_H // scal2s_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \ ctype* restrict y, const inc_t rs_y, const inc_t cs_y \ ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( scal2s_mxn ) #endif // end bli_scal2s_mxn.h // begin bli_xpbys_mxn.h #ifndef BLIS_XPBYS_MXN_H #define BLIS_XPBYS_MXN_H // xpbys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (?ss) ------------------------------------------------------------ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?dd) ------------------------------------------------------------ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?cc) ------------------------------------------------------------ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?zz) ------------------------------------------------------------ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } #endif // end bli_xpbys_mxn.h // begin bli_xpbys_mxn_uplo.h #ifndef BLIS_XPBYS_MXN_UPLO_H #define BLIS_XPBYS_MXN_UPLO_H // xpbys_mxn_u #define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 
0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } // xpbys_mxn_l #define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } 
// xpbys_mxn_l, scomplex operands.
// Visits every element (_i,_j) of the m x n matrices x and y for which
// (_j - _i) <= diagoff, i.e. the elements on or below the diagoff-th
// diagonal; all other elements of y are left untouched.
// x, y and beta are used as pointers; rs_*/cs_* are the strides multiplied
// by _i/_j when forming element addresses.
// NOTE: the expansion declares its own _i and _j, so argument expressions
// must not mention those names.
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
	/* When bli_ceq0( *beta ) holds, each selected pair goes to bli_cccopys instead of bli_cccxpbys (presumably so stale values in y are overwritten rather than scaled by zero -- confirm against those macros). */ \
	if ( bli_ceq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_cccopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

// xpbys_mxn_l, dcomplex operands. Same traversal and same (_j - _i) <= diagoff
// selection as the scomplex variant, using the z-typed eq0/copys/xpbys helpers.
#define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{ \
	dim_t _i, _j; \
\
	/* Zero beta selects the copys path; any other value selects the xpbys path. */ \
	if ( bli_zeq0( *beta ) ) \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \
			             *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
	else \
	{ \
		for ( _j = 0; _j < n; ++_j ) \
		for ( _i = 0; _i < m; ++_i ) \
		if ( (doff_t)_j - (doff_t)_i <= diagoff ) \
		{ \
			bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \
			              *(beta), \
			              *(y + _i*rs_y + _j*cs_y) ); \
		} \
	} \
}

// Single-type-character front ends for the _u (upper) macros. Each one
// forwards all of its arguments, unchanged and in the same order, to the
// variant whose three type characters are identical (sss/ddd/ccc/zzz).
#define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}

// Single-type-character front ends for the _l (lower) macros; same pure
// forwarding as the _u front ends.
#define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}
#define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \
{\
	bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \
}

#endif
// end bli_xpbys_mxn_uplo.h

// -- "broadcast B" scalar macros --

// begin bli_bcastbbs_mxn.h

#ifndef BLIS_BCASTBBS_MXN_H
#define BLIS_BCASTBBS_MXN_H

// bcastbbs_mxn
//
// Function template: for every element (i,j) of the m x n matrix y, located
// at y + i*incy + j*ldy, copy that element into the d-1 slots that directly
// follow it in memory (unit stride ds_y). The element itself is not rewritten
// because the p loop starts at 1.
// INSERT_GENTFUNC_BASIC0 instantiates the template; its expansion is defined
// elsewhere (presumably one function per basic datatype -- confirm).

#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t       m, \
       const dim_t       n, \
       ctype*   restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* The duplication count d is taken from ldy here, which is also the stride of the inner (j) loop. */ \
	const dim_t       d    = ldy; \
	const dim_t       ds_y = 1; \
\
	for ( dim_t i = 0; i < m; ++i ) \
	{ \
		ctype* restrict yi = y + i*incy; \
\
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype* restrict yij = yi + j*ldy; \
\
			for ( dim_t p = 1; p < d; ++p ) \
			{ \
				ctype* restrict yijd = yij + p*ds_y; \
\
				PASTEMAC(ch,copys)( *yij, *yijd ); \
			} \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( bcastbbs_mxn )

#endif
// end bli_bcastbbs_mxn.h
// begin bli_scal2bbs_mxn.h

#ifndef BLIS_SCAL2BBS_MXN_H
#define BLIS_SCAL2BBS_MXN_H

// scal2bbs_mxn
//
// First template (GENTFUNCRO): for every element (i,j), compute y(i,j) from
// *alpha and x(i,j) with scal2js when conjx requests conjugation and with
// scal2s otherwise, then copy the result into the d-1 slots that follow it.
// x(i,j) is at x + i*incx + j*ldx and y(i,j) at y + i*incy + j*ldy.
// NOTE(review): the RO suffix suggests this instantiation covers the real
// datatypes only -- confirm against INSERT_GENTFUNCRO_BASIC0.

#undef  GENTFUNCRO
#define GENTFUNCRO( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t       conjx, \
       const dim_t        m, \
       const dim_t        n, \
       ctype*    restrict alpha, \
       ctype*    restrict x, const inc_t incx, const inc_t ldx, \
       ctype*    restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* The duplication count d is taken from incy, the stride of the inner (i) loop. */ \
	const dim_t d    = incy; \
	const dim_t ds_y = 1; \
\
	if ( bli_is_conj( conjx ) ) \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype* restrict xj = x + j*ldx; \
			ctype* restrict yj = y + j*ldy; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype* restrict xij = xj + i*incx; \
				ctype* restrict yij = yj + i*incy; \
\
				PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \
\
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype* restrict yijd = yij + p*ds_y; \
\
					PASTEMAC(ch,copys)( *yij, *yijd ); \
				} \
			} \
		} \
	} \
	else \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype* restrict xj = x + j*ldx; \
			ctype* restrict yj = y + j*ldy; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype* restrict xij = xj + i*incx; \
				ctype* restrict yij = yj + i*incy; \
\
				PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \
\
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype* restrict yijd = yij + p*ds_y; \
\
					PASTEMAC(ch,copys)( *yij, *yijd ); \
				} \
			} \
		} \
	} \
}

INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn )

// Second template (GENTFUNCCO): the same operation expressed on separate real
// and imaginary components. alpha, x and y are reinterpreted as arrays of
// ctype_r, so every x/y stride is doubled. For alpha and x the imaginary
// component is the ctype_r element right after the real one (+1); for y the
// imaginary component starts d ctype_r elements after the real one
// (psi_i = y + 1*d), so each y element ends up as d copies of the real part
// followed by d copies of the imaginary part.
// NOTE(review): the CO suffix suggests this instantiation covers the complex
// datatypes only -- confirm against INSERT_GENTFUNCCO_BASIC0.

#undef  GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const conj_t       conjx, \
       const dim_t        m, \
       const dim_t        n, \
       ctype*    restrict alpha, \
       ctype*    restrict x, const inc_t incx, const inc_t ldx, \
       ctype*    restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* The duplication count d is taken from incy, measured before the strides are doubled below. */ \
	const dim_t       d          = incy; \
	const dim_t       ds_y       = 1; \
\
	const inc_t       incx2      = 2 * incx; \
	const inc_t       ldx2       = 2 * ldx; \
\
	const inc_t       incy2      = 2 * incy; \
	const inc_t       ldy2       = 2 * ldy; \
\
	ctype_r* restrict alpha_r    = ( ctype_r* )alpha; \
	ctype_r* restrict alpha_i    = ( ctype_r* )alpha + 1; \
	ctype_r* restrict chi_r      = ( ctype_r* )x; \
	ctype_r* restrict chi_i      = ( ctype_r* )x + 1; \
	ctype_r* restrict psi_r      = ( ctype_r* )y; \
	ctype_r* restrict psi_i      = ( ctype_r* )y + 1*d; \
\
	if ( bli_is_conj( conjx ) ) \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype_r* restrict chij_r = chi_r + j*ldx2; \
			ctype_r* restrict chij_i = chi_i + j*ldx2; \
			ctype_r* restrict psij_r = psi_r + j*ldy2; \
			ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype_r* restrict chiij_r = chij_r + i*incx2; \
				ctype_r* restrict chiij_i = chij_i + i*incx2; \
				ctype_r* restrict psiij_r = psij_r + i*incy2; \
				ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
				PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \
				                        *chiij_r, *chiij_i, \
				                        *psiij_r, *psiij_i ); \
\
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
					ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
					PASTEMAC(ch,copyris)( *psiij_r,  *psiij_i, \
					                      *psiijd_r, *psiijd_i ); \
				} \
			} \
		} \
	} \
	else \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			ctype_r* restrict chij_r = chi_r + j*ldx2; \
			ctype_r* restrict chij_i = chi_i + j*ldx2; \
			ctype_r* restrict psij_r = psi_r + j*ldy2; \
			ctype_r* restrict psij_i = psi_i + j*ldy2; \
\
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				ctype_r* restrict chiij_r = chij_r + i*incx2; \
				ctype_r* restrict chiij_i = chij_i + i*incx2; \
				ctype_r* restrict psiij_r = psij_r + i*incy2; \
				ctype_r* restrict psiij_i = psij_i + i*incy2; \
\
				PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \
				                       *chiij_r, *chiij_i, \
				                       *psiij_r, *psiij_i ); \
\
				for ( dim_t p = 1; p < d; ++p ) \
				{ \
					ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \
					ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \
\
					PASTEMAC(ch,copyris)( *psiij_r,  *psiij_i, \
					                      *psiijd_r, *psiijd_i ); \
				} \
			} \
		} \
	} \
}

INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn )

#endif
// end bli_scal2bbs_mxn.h
// begin bli_set0bbs_mxn.h

#ifndef BLIS_SET0BBS_MXN_H
#define BLIS_SET0BBS_MXN_H

// set0bbs_mxn
//
// Function template: for every element (i,j) of the m x n matrix y, located
// at y + i*incy + j*ldy, apply set0s to the element itself and to the d-1
// slots that directly follow it. The p loop starts at 0, so the leading slot
// is included.

#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
BLIS_INLINE void PASTEMAC(ch,opname) \
     ( \
       const dim_t       m, \
       const dim_t       n, \
       ctype*   restrict y, const inc_t incy, const inc_t ldy \
     ) \
{ \
	/* The slot count d is taken from incy, the stride of the inner (i) loop. */ \
	const dim_t       d    = incy; \
	const dim_t       ds_y = 1; \
\
	for ( dim_t j = 0; j < n; ++j ) \
	{ \
		ctype* restrict yj = y + j*ldy; \
\
		for ( dim_t i = 0; i < m; ++i ) \
		{ \
			ctype* restrict yij = yj + i*incy; \
\
			for ( dim_t p = 0; p < d; ++p ) \
			{ \
				ctype* restrict yijd = yij + p*ds_y; \
\
				PASTEMAC(ch,set0s)( *yijd ); \
			} \
		} \
	} \
}

INSERT_GENTFUNC_BASIC0( set0bbs_mxn )

#endif
// end bli_set0bbs_mxn.h

// -- 1m-specific scalar macros --

// 1e
// begin bli_copy1es.h

#ifndef BLIS_COPY1ES_H
#define BLIS_COPY1ES_H

// copy1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
//   (In the macro signatures x is the parameter named a, and y is the
//   pair of destinations named bri and bir.)
// Real destination (second char s): every source type expands to an empty
// block, so nothing is stored.
#define bli_sscopy1es( a, bri, bir ) {}
#define bli_dscopy1es( a, bri, bir ) {}
#define bli_cscopy1es( a, bri, bir ) {}
#define bli_zscopy1es( a, bri, bir ) {}

// Real destination (second char d): likewise empty for every source type.
#define bli_sdcopy1es( a, bri, bir ) {}
#define bli_ddcopy1es( a, bri, bir ) {}
#define bli_cdcopy1es( a, bri, bir ) {}
#define bli_zdcopy1es( a, bri, bir ) {}

// scomplex destination. Real sources (s, d) are empty; complex sources issue
// two copyris calls:
//   bri is paired with (  real(a), imag(a) )
//   bir is paired with ( -imag(a), real(a) )
// (Assuming copyris( ar, ai, br, bi ) stores ar into br and ai into bi; it is
// defined elsewhere.) The argument a is expanded four times, so it should be
// free of side effects.
#define bli_sccopy1es( a, bri, bir ) {}
#define bli_dccopy1es( a, bri, bir ) {}
#define bli_cccopy1es( a, bri, bir ) \
{ \
	bli_cccopyris(  bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopy1es( a, bri, bir ) \
{ \
	bli_zccopyris(  bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

// dcomplex destination. Same pattern as the scomplex destination above, with
// the z accessors used on bri and bir.
#define bli_szcopy1es( a, bri, bir ) {}
#define bli_dzcopy1es( a, bri, bir ) {}
#define bli_czcopy1es( a, bri, bir ) \
{ \
	bli_czcopyris(  bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopy1es( a, bri, bir ) \
{ \
	bli_zzcopyris(  bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-type-character aliases for the same-type variants.
#define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir )
#define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir )

#endif
// end bli_copy1es.h
// begin bli_copyj1es.h

#ifndef BLIS_COPYJ1ES_H
#define BLIS_COPYJ1ES_H

// copyj1es

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
//   (In the macro signatures x is the parameter named a, and y is the
//   pair of destinations named bri and bir.)
// Real destination (second char s): every source type expands to an empty
// block, so nothing is stored.
#define bli_sscopyj1es( a, bri, bir ) {}
#define bli_dscopyj1es( a, bri, bir ) {}
#define bli_cscopyj1es( a, bri, bir ) {}
#define bli_zscopyj1es( a, bri, bir ) {}

// Real destination (second char d): likewise empty for every source type.
#define bli_sdcopyj1es( a, bri, bir ) {}
#define bli_ddcopyj1es( a, bri, bir ) {}
#define bli_cdcopyj1es( a, bri, bir ) {}
#define bli_zdcopyj1es( a, bri, bir ) {}

// scomplex destination. Real sources (s, d) are empty; complex sources issue
// two copyris calls in which the imaginary part of a is negated relative to
// the non-j (copy1es) form:
//   bri is paired with ( real(a), -imag(a) )
//   bir is paired with ( imag(a),  real(a) )
// (Assuming copyris( ar, ai, br, bi ) stores ar into br and ai into bi; it is
// defined elsewhere.) The argument a is expanded four times, so it should be
// free of side effects.
#define bli_sccopyj1es( a, bri, bir ) {}
#define bli_dccopyj1es( a, bri, bir ) {}
#define bli_cccopyj1es( a, bri, bir ) \
{ \
	bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_cccopyris( bli_cimag(a),  bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \
}
#define bli_zccopyj1es( a, bri, bir ) \
{ \
	bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \
	bli_zccopyris( bli_zimag(a),  bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \
}

// dcomplex destination. Same pattern as the scomplex destination above, with
// the z accessors used on bri and bir.
#define bli_szcopyj1es( a, bri, bir ) {}
#define bli_dzcopyj1es( a, bri, bir ) {}
#define bli_czcopyj1es( a, bri, bir ) \
{ \
	bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_czcopyris( bli_cimag(a),  bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \
}
#define bli_zzcopyj1es( a, bri, bir ) \
{ \
	bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \
	bli_zzcopyris( bli_zimag(a),  bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \
}

// Single-type-character aliases for the same-type variants.
#define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir )
#define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir )

#endif
// end bli_copyj1es.h
// begin bli_invert1es.h

#ifndef BLIS_INVERT1ES_H
#define BLIS_INVERT1ES_H

// invert1es

// Two steps, in this order:
//   1. invertris is applied to the real and imaginary parts of bri.
//   2. copyris is called with sources ( real(bri), -imag(bri) ) and
//      destinations ( imag(bir), real(bir) ) -- note the destination pair is
//      given imaginary-first, so real(bri) is paired with imag(bir) and
//      -imag(bri) with real(bir).
// Step 2 reads bri after step 1, so the statements must not be reordered.
// (Assuming invertris updates its arguments in place and copyris stores its
// first two arguments into its last two -- both are defined elsewhere.)
#define bli_cinvert1es( bri, bir ) \
{ \
	bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \
	bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \
}

// dcomplex counterpart of bli_cinvert1es; identical structure with the z
// helpers and accessors.
#define bli_zinvert1es( bri, bir ) \
{ \
	bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \
	bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \
}

#endif
// end bli_invert1es.h
// begin bli_scal1es.h

#ifndef BLIS_SCAL1ES_H
#define BLIS_SCAL1ES_H

// scal1es

// Two steps, in this order:
//   1. scalris is called with the parts of a and the parts of yri.
//   2. copyris is called with sources ( -imag(yri), real(yri) ) and
//      destinations ( real(yir), imag(yir) ).
// Step 2 reads yri after step 1, so the statements must not be reordered.
// (Assuming scalris scales yri in place by a and copyris stores its first
// two arguments into its last two -- both are defined elsewhere.)
#define bli_cscal1es( a, yri, yir ) \
{ \
	bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \
	bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \
}

// dcomplex counterpart of bli_cscal1es; identical structure with the z
// helpers and accessors.
#define bli_zscal1es( a, yri, yir ) \
{ \
	bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \
	bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \
}

#endif
// end bli_scal1es.h
// begin bli_scal21es.h

#ifndef BLIS_SCAL21ES_H
#define BLIS_SCAL21ES_H

// scal21es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
//   (y is the pair of destinations named yri and yir.)
//
// Shape of every non-empty variant below: two bli_cxscal2ris calls that share
// the same (real(a), imag(a)) scalar arguments.
//   call 1: x passed as (  real(x), imag(x) ), destination yri
//   call 2: x passed as ( -imag(x), real(x) ), destination yir
// a and x are each expanded four times, so they should be free of side
// effects. The destination accessors are bli_zreal/bli_zimag in both the
// (??c) and (??z) sections.
// NOTE(review): that is only equivalent to using bli_creal/bli_cimag if the
// c and z accessors expand identically -- confirm against their definitions.

// -- (axy) = (??s) ------------------------------------------------------------

// Real destination: every combination is an empty block.
#define bli_sssscal21es( a, x, yri, yir ) {}
#define bli_sdsscal21es( a, x, yri, yir ) {}
#define bli_scsscal21es( a, x, yri, yir ) {}
#define bli_szsscal21es( a, x, yri, yir ) {}

#define bli_dssscal21es( a, x, yri, yir ) {}
#define bli_ddsscal21es( a, x, yri, yir ) {}
#define bli_dcsscal21es( a, x, yri, yir ) {}
#define bli_dzsscal21es( a, x, yri, yir ) {}

#define bli_cssscal21es( a, x, yri, yir ) {}
#define bli_cdsscal21es( a, x, yri, yir ) {}
#define bli_ccsscal21es( a, x, yri, yir ) {}
#define bli_czsscal21es( a, x, yri, yir ) {}

#define bli_zssscal21es( a, x, yri, yir ) {}
#define bli_zdsscal21es( a, x, yri, yir ) {}
#define bli_zcsscal21es( a, x, yri, yir ) {}
#define bli_zzsscal21es( a, x, yri, yir ) {}

// -- (axy) = (??d) ------------------------------------------------------------

// Real destination: every combination is an empty block.
#define bli_ssdscal21es( a, x, yri, yir ) {}
#define bli_sddscal21es( a, x, yri, yir ) {}
#define bli_scdscal21es( a, x, yri, yir ) {}
#define bli_szdscal21es( a, x, yri, yir ) {}

#define bli_dsdscal21es( a, x, yri, yir ) {}
#define bli_dddscal21es( a, x, yri, yir ) {}
#define bli_dcdscal21es( a, x, yri, yir ) {}
#define bli_dzdscal21es( a, x, yri, yir ) {}

#define bli_csdscal21es( a, x, yri, yir ) {}
#define bli_cddscal21es( a, x, yri, yir ) {}
#define bli_ccdscal21es( a, x, yri, yir ) {}
#define bli_czdscal21es( a, x, yri, yir ) {}

#define bli_zsdscal21es( a, x, yri, yir ) {}
#define bli_zddscal21es( a, x, yri, yir ) {}
#define bli_zcdscal21es( a, x, yri, yir ) {}
#define bli_zzdscal21es( a, x, yri, yir ) {}

// -- (axy) = (??c) ------------------------------------------------------------

// scomplex destination: only the four combinations in which both a and x are
// real (ss, sd, ds, dd) are empty.
#define bli_sscscal21es( a, x, yri, yir ) {}
#define bli_sdcscal21es( a, x, yri, yir ) {}
#define bli_sccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dscscal21es( a, x, yri, yir ) {}
#define bli_ddcscal21es( a, x, yri, yir ) {}
#define bli_dccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cscscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zscscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zccscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzcscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// -- (axy) = (??z) ------------------------------------------------------------

// dcomplex destination: only the four combinations in which both a and x are
// real (ss, sd, ds, dd) are empty.
#define bli_sszscal21es( a, x, yri, yir ) {}
#define bli_sdzscal21es( a, x, yri, yir ) {}
#define bli_sczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_szzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_dszscal21es( a, x, yri, yir ) {}
#define bli_ddzscal21es( a, x, yri, yir ) {}
#define bli_dczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_dzzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_cszscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cdzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_cczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_czzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

#define bli_zszscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zdzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zczscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzzscal21es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a),  bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}

// Single-type-character aliases for the same-type variants.
#define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir )
#define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir )

#endif
// end bli_scal21es.h
// begin bli_scal2j1es.h

#ifndef BLIS_SCAL2J1ES_H
#define BLIS_SCAL2J1ES_H

// scal2j1es

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir ) #define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir ) #endif // end bli_scal2j1es.h // 1r // begin bli_copy1rs.h #ifndef BLIS_COPY1RS_H #define BLIS_COPY1RS_H // copy1rs #define bli_ccopy1rs( a, br, bi ) \ { \ bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopy1rs( a, br, bi ) \ { \ bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copy1rs.h // begin bli_copyj1rs.h #ifndef BLIS_COPYJ1RS_H #define BLIS_COPYJ1RS_H // copyj1rs #define bli_ccopyj1rs( a, br, bi ) \ { \ bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \ } #define bli_zcopyj1rs( a, br, bi ) \ { \ bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \ } #endif // end bli_copyj1rs.h // begin bli_invert1rs.h #ifndef BLIS_INVERT1RS_H #define BLIS_INVERT1RS_H // invert1rs #define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi ) #define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi ) #endif // end bli_invert1rs.h // begin bli_scal1rs.h #ifndef BLIS_SCAL1RS_H #define BLIS_SCAL1RS_H // scal1rs #define bli_cscal1rs( a, yr, yi ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \ } #define bli_zscal1rs( a, yr, yi ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \ } #define bli_scscal1rs( a, yr, yi ) \ { \ bli_scscalris( bli_sreal(a), 
bli_simag(a), yr, yi ); \ } #define bli_dzscal1rs( a, yr, yi ) \ { \ bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \ } #endif // end bli_scal1rs.h // begin bli_scal21rs.h #ifndef BLIS_SCAL21RS_H #define BLIS_SCAL21RS_H // scal21rs #define bli_cscscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal21rs( a, x, yr, yi ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi ) #define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi ) #endif // end bli_scal21rs.h // begin bli_scal2j1rs.h #ifndef BLIS_SCAL2J1RS_H #define BLIS_SCAL2J1RS_H // scal2j1rs #define bli_cscscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \ } #define bli_cccscal2j1rs( a, x, yr, yi ) \ { \ bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \ } #define bli_zdzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \ } #define bli_zzzscal2j1rs( a, x, yr, yi ) \ { \ bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \ } #define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi ) #define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi ) #endif // end bli_scal2j1rs.h // 1m (1e or 1r) // begin bli_invert1ms_mxn_diag.h #ifndef BLIS_INVERT1MS_MXN_DIAG_H #define BLIS_INVERT1MS_MXN_DIAG_H // invert1ms_mxn_diag #define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t 
i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_invert1ms_mxn_diag.h // begin bli_scal1ms_mxn.h #ifndef BLIS_SCAL1MS_MXN_H #define BLIS_SCAL1MS_MXN_H // scal1ms_mxn #define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; 
\ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_cscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ bli_zscal1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } #endif // end bli_scal1ms_mxn.h // begin bli_scal21ms_mxn.h #ifndef BLIS_SCAL21MS_MXN_H #define BLIS_SCAL21MS_MXN_H // scal21ms_mxn BLIS_INLINE void bli_cscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_ri = y; scomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } 
else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_r = y_cast; float* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_cscal21rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } BLIS_INLINE void bli_zscal21ms_mxn ( const pack_t schema, const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { dim_t i, j; if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_ri = y; dcomplex* restrict y_ir = y + ld_y/2; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21es( *(alpha), *(x + i*rs_x + j*cs_x), *(y_ri + i*rs_y + j*cs_y), *(y_ir + i*rs_y + j*cs_y) ); } } } else { inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_r = y_cast; double* restrict y_i = y_cast + ld_y; if ( bli_is_conj( conjx ) ) { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal2j1rs( *(alpha), *(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } else { for ( j = 0; j < n; ++j ) for ( i = 0; i < m; ++i ) { bli_zscal21rs( *(alpha), 
*(x + i*rs_x + j*cs_x ), *(y_r + i*rs_y2 + j*cs_y2), *(y_i + i*rs_y2 + j*cs_y2) ); } } } } #endif // end bli_scal21ms_mxn.h // begin bli_scal21ms_mxn_diag.h #ifndef BLIS_SCAL21MS_MXN_DIAG_H #define BLIS_SCAL21MS_MXN_DIAG_H // scal21ms_mxn_diag #define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cscscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21es( *(a), \ *(x + i*rs_x + i*cs_x), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast; \ double* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zdzscal21rs( *(a), \ *(x + i*rs_x + i*cs_x ), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_scal21ms_mxn_diag.h // begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING //thread communicators may be implementation dependent #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_single.h // begin bli_thrcomm_openmp.h #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H // Define thrcomm_t for situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #include // skipped // Define thrcomm_t for tree barriers and non-tree barriers. #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. 
//volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif // end bli_thrcomm_openmp.h // begin bli_thrcomm_pthreads.h #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H // Define thrcomm_t for situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS #ifdef BLIS_USE_PTHREAD_BARRIER struct thrcomm_s { void* sent_object; dim_t n_threads; bli_pthread_barrier_t barrier; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_pthreads.h // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. 
// Lifecycle: create/free take a rntm_t in addition to the communicator,
// while init/cleanup operate on a caller-provided thrcomm_t.
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads );
void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm );
void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm );
void bli_thrcomm_cleanup( thrcomm_t* comm );

// Exported collective operations; these are the two entry points wrapped by
// bli_thread_barrier() and bli_thread_broadcast() further below.
BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm );
BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm );

void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm );

#endif
// end bli_thrcomm.h

// Include thread info (thrinfo_t) object definitions and prototypes.
// begin bli_thrinfo.h

#ifndef BLIS_THRINFO_H
#define BLIS_THRINFO_H

// Thread info structure definition
struct thrinfo_s
{
	// The thread communicator for the other threads sharing the same work
	// at this level.
	thrcomm_t* ocomm;

	// Our thread id within the ocomm thread communicator.
	dim_t ocomm_id;

	// The number of distinct threads used to parallelize the loop.
	dim_t n_way;

	// What we're working on.
	dim_t work_id;

	// When freeing, should the communicators in this node be freed? Usually,
	// this field is true, but when nodes are created that share the same
	// communicators as other nodes (such as with packm nodes), this is set
	// to false.
	bool free_comm;

	// The bszid_t to help identify the node. This is mostly only useful when
	// debugging or tracing the allocation and release of thrinfo_t nodes.
	bszid_t bszid;

	// Links to other thrinfo_t nodes.
	// NOTE(review): presumably the child "prenode" and the child node of this
	// node in the thrinfo_t tree -- confirm against bli_thrinfo_rgrow*().
	struct thrinfo_s* sub_prenode;
	struct thrinfo_s* sub_node;
};
typedef struct thrinfo_s thrinfo_t;

//
// thrinfo_t functions
// NOTE: The naming of these should be made consistent at some point.
// (ie: bli_thrinfo_ vs. bli_thread_)
//

// thrinfo_t query (field only)

// Number of threads in this node's communicator (ocomm->n_threads).
BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t )
{
	return (t->ocomm)->n_threads;
}

// This thread's id within its ocomm communicator.
BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t )
{
	return t->ocomm_id;
}

// The n_way field of the node.
BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t )
{
	return t->n_way;
}

// The work_id field of the node.
BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t )
{
	return t->work_id;
}

// The communicator stored in the node.
BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t )
{
	return t->ocomm;
}

// The free_comm flag of the node.
BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t )
{
	return t->free_comm;
}

// The bszid field of the node.
// NOTE(review): the field is a bszid_t but the declared return type is
// dim_t, so the value is implicitly converted on return.
BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t )
{
	return t->bszid;
}

// The sub_node link of the node.
BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t )
{
	return t->sub_node;
}

// The sub_prenode link of the node.
BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t )
{
	return t->sub_prenode;
}

// thrinfo_t query (complex)

// True when this thread has id 0 within its ocomm communicator.
BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t )
{
	return t->ocomm_id == 0;
}

// thrinfo_t modification
// Each setter stores its first argument into the same-named field of t.
// Note the argument order: the new value comes first, the node second.

BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t )
{
	t->ocomm = ocomm;
}

BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t )
{
	t->ocomm_id = ocomm_id;
}

BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t )
{
	t->n_way = n_way;
}

BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t )
{
	t->work_id = work_id;
}

BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t )
{
	t->free_comm = free_comm;
}

BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t )
{
	t->bszid = bszid;
}

BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t )
{
	t->sub_node = sub_node;
}

BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t )
{
	t->sub_prenode = sub_prenode;
}

// other thrinfo_t-related functions

// Forwards to bli_thrcomm_bcast() using this node's ocomm_id and ocomm, and
// returns whatever that call returns.
BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p )
{
	return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm );
}

// Forwards to bli_thrcomm_barrier() using this node's ocomm_id and ocomm.
BLIS_INLINE void bli_thread_barrier( thrinfo_t* t )
{
	bli_thrcomm_barrier( t->ocomm_id, t->ocomm );
}

//
// Prototypes for level-3 thrinfo functions not specific to any operation.
//

// The parameters of create/init mirror the fields of struct thrinfo_s
// (sub_prenode has no corresponding parameter).
thrinfo_t* bli_thrinfo_create
     (
       rntm_t* rntm,
       thrcomm_t* ocomm,
       dim_t ocomm_id,
       dim_t n_way,
       dim_t work_id,
       bool free_comm,
       bszid_t bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init
     (
       thrinfo_t* thread,
       thrcomm_t* ocomm,
       dim_t ocomm_id,
       dim_t n_way,
       dim_t work_id,
       bool free_comm,
       bszid_t bszid,
       thrinfo_t* sub_node
     );

void bli_thrinfo_init_single
     (
       thrinfo_t* thread
     );

void bli_thrinfo_free
     (
       rntm_t* rntm,
       thrinfo_t* thread
     );

// -----------------------------------------------------------------------------

void bli_thrinfo_grow
     (
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_rgrow
     (
       rntm_t* rntm,
       cntl_t* cntl_par,
       cntl_t* cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl
     (
       rntm_t* rntm,
       cntl_t* cntl_par,
       cntl_t* cntl_chl,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_rgrow_prenode
     (
       rntm_t* rntm,
       cntl_t* cntl_par,
       cntl_t* cntl_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_create_for_cntl_prenode
     (
       rntm_t* rntm,
       cntl_t* cntl_par,
       cntl_t* cntl_chl,
       thrinfo_t* thread_par
     );

// -----------------------------------------------------------------------------

// Disabled prototypes, kept as found.
#if 0
void bli_thrinfo_grow_tree
     (
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

void bli_thrinfo_grow_tree_ic
     (
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
#endif

#endif
// end bli_thrinfo.h
// begin bli_thrinfo_sup.h

#ifndef BLIS_THRINFO_SUP_H
#define BLIS_THRINFO_SUP_H

//
// Prototypes for level-3 thrinfo sup functions.
//

// These parallel bli_thrinfo_grow/rgrow/create_for_cntl above, but take
// bszid_t pointers where those take cntl_t pointers.
void bli_thrinfo_sup_grow
     (
       rntm_t* rntm,
       bszid_t* bszid_par,
       thrinfo_t* thread
     );

thrinfo_t* bli_thrinfo_sup_rgrow
     (
       rntm_t* rntm,
       bszid_t* bszid_par,
       bszid_t* bszid_cur,
       thrinfo_t* thread_par
     );

thrinfo_t* bli_thrinfo_sup_create_for_cntl
     (
       rntm_t* rntm,
       bszid_t* bszid_par,
       bszid_t* bszid_chl,
       thrinfo_t* thread_par
     );

#endif
// end bli_thrinfo_sup.h

// Include some operation-specific thrinfo_t prototypes.
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
void bli_l3_thread_decorator ( l3int_t func, opid_t family, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // Include definitions specific to the method of multithreading for the // conventional code path. // begin bli_l3_decor_single.h #ifndef BLIS_L3_DECOR_SINGLE_H #define BLIS_L3_DECOR_SINGLE_H // Definitions specific to situations when multithreading is disabled. #ifndef BLIS_ENABLE_MULTITHREADING #endif #endif // end bli_l3_decor_single.h // begin bli_l3_decor_openmp.h #ifndef BLIS_L3_DECOR_OPENMP_H #define BLIS_L3_DECOR_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP void bli_l3_thread_decorator_thread_check ( dim_t n_threads, dim_t tid, thrcomm_t* gl_comm, rntm_t* rntm ); #endif #endif // end bli_l3_decor_openmp.h // begin bli_l3_decor_pthreads.h #ifndef BLIS_L3_DECOR_PTHREADS_H #define BLIS_L3_DECOR_PTHREADS_H // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. void* bli_l3_thread_entry( void* data_void ); #endif #endif // end bli_l3_decor_pthreads.h #endif // end bli_l3_decor.h // Include the level-3 thread decorator and related definitions and prototypes // for the sup code path. // begin bli_l3_sup_decor.h #ifndef BLIS_L3_SUP_DECOR_H #define BLIS_L3_SUP_DECOR_H // -- sup definitions ---------------------------------------------------------- // Level-3 sup internal function type. typedef err_t (*l3supint_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // Level-3 sup thread decorator prototype. err_t bli_l3_sup_thread_decorator ( l3supint_t func, opid_t family, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // Include definitions specific to the method of multithreading for the // sup code path. 
// begin bli_l3_sup_decor_single.h #ifndef BLIS_L3_SUP_DECOR_SINGLE_H #define BLIS_L3_SUP_DECOR_SINGLE_H // Definitions specific to situations when multithreading is disabled. #ifndef BLIS_ENABLE_MULTITHREADING #endif #endif // end bli_l3_sup_decor_single.h // begin bli_l3_sup_decor_openmp.h #ifndef BLIS_L3_SUP_DECOR_OPENMP_H #define BLIS_L3_SUP_DECOR_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #endif #endif // end bli_l3_sup_decor_openmp.h // begin bli_l3_sup_decor_pthreads.h #ifndef BLIS_L3_SUP_DECOR_PTHREADS_H #define BLIS_L3_SUP_DECOR_PTHREADS_H // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. void* bli_l3_sup_thread_entry( void* data_void ); #endif #endif // end bli_l3_sup_decor_pthreads.h #endif // end bli_l3_sup_decor.h // Initialization-related prototypes. void bli_thread_init( void ); void bli_thread_finalize( void ); // Thread range-related prototypes. 
BLIS_EXPORT_BLIS void bli_thread_range_sub ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end ); #undef GENPROT #define GENPROT( opname ) \ \ siz_t PASTEMAC0( opname ) \ ( \ dir_t direct, \ thrinfo_t* thr, \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntl_t* cntl, \ cntx_t* cntx, \ dim_t* start, \ dim_t* end \ ); GENPROT( thread_range_mdim ) GENPROT( thread_range_ndim ) #undef GENPROT #define GENPROT( opname ) \ \ siz_t PASTEMAC0( opname ) \ ( \ thrinfo_t* thr, \ obj_t* a, \ blksz_t* bmult, \ dim_t* start, \ dim_t* end \ ); GENPROT( thread_range_l2r ) GENPROT( thread_range_r2l ) GENPROT( thread_range_t2b ) GENPROT( thread_range_b2t ) GENPROT( thread_range_weighted_l2r ) GENPROT( thread_range_weighted_r2l ) GENPROT( thread_range_weighted_t2b ) GENPROT( thread_range_weighted_b2t ) dim_t bli_thread_range_width_l ( doff_t diagoff_j, dim_t m, dim_t n_j, dim_t j, dim_t n_way, dim_t bf, dim_t bf_left, double area_per_thr, bool handle_edge_low ); siz_t bli_find_area_trap_l ( dim_t m, dim_t n, doff_t diagoff ); siz_t bli_thread_range_weighted_sub ( thrinfo_t* restrict thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* restrict j_start_thr, dim_t* restrict j_end_thr ); // ----------------------------------------------------------------------------- // Factorization and partitioning prototypes typedef struct { dim_t n; dim_t sqrt_n; dim_t f; } bli_prime_factors_t; void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors); dim_t bli_next_prime_factor(bli_prime_factors_t* factors); bool bli_is_prime( dim_t n ); void bli_thread_partition_2x2 ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_slow ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); void bli_thread_partition_2x2_fast ( dim_t n_thread, dim_t work1, dim_t work2, dim_t* restrict nt1, dim_t* restrict nt2 ); // 
----------------------------------------------------------------------------- dim_t bli_gcd( dim_t x, dim_t y ); dim_t bli_lcm( dim_t x, dim_t y ); dim_t bli_ipow( dim_t base, dim_t power ); // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void ); BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void ); BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir ); BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); // ----------------------------------------------------------------------------- BLIS_INLINE void bli_thread_range_jrir_rr ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; } BLIS_INLINE void bli_thread_range_jrir_sl ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Use contiguous slab partitioning of jr/ir loops. bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); *inc = 1; } BLIS_INLINE void bli_thread_range_jrir ( thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { // Define a general-purpose version of bli_thread_range_jrir() whose // definition depends on whether slab or round-robin partitioning was // requested at configure-time. 
#ifdef BLIS_ENABLE_JRIR_SLAB bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); #else bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); #endif } #if 0 BLIS_INLINE void bli_thread_range_weighted_jrir ( thrinfo_t* thread, doff_t diagoff, uplo_t uplo, dim_t m, dim_t n, dim_t bf, bool handle_edge_low, dim_t* start, dim_t* end, dim_t* inc ) { #ifdef BLIS_ENABLE_JRIR_SLAB // Use contiguous slab partitioning for jr/ir loops. bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, handle_edge_low, start, end ); *start = *start / bf; *inc = 1; if ( *end % bf ) *end = *end / bf + 1; else *end = *end / bf; #else // Use interleaved partitioning of jr/ir loops. *start = bli_thread_work_id( thread ); *inc = bli_thread_n_way( thread ); *end = n; #endif } #endif #endif // end bli_thread.h // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function 
bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Constant definitions -- // begin bli_extern_defs.h #ifndef BLIS_EXTERN_DEFS_H #define BLIS_EXTERN_DEFS_H BLIS_EXPORT_BLIS extern obj_t BLIS_TWO; BLIS_EXPORT_BLIS extern obj_t BLIS_ONE; //BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO; //BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE; BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO; BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif // end bli_extern_defs.h // -- BLIS architecture/kernel definitions -- // begin bli_l1v_ker_prot.h // // Define template prototypes for level-1v kernels. 
//
// Each *_KER_PROT( ctype, ch, opname ) macro expands to one function
// prototype whose name is PASTEMAC(ch,opname) and whose data pointers have
// type ctype*. Every macro ends at its closing ");" -- no trailing line
// continuation -- so the definition cannot splice into whatever line
// happens to follow it.

#define ADDV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define AMAXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t* restrict cntx  \
     );


#define AXPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define AXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define COPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define DOTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );


#define DOTXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t* restrict cntx  \
     );


#define INVERTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );


#define SCALV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );


#define SCAL2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define SETV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t* restrict cntx  \
     );


#define SUBV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define SWAPV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define XPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// end bli_l1v_ker_prot.h
// begin bli_l1f_ker_prot.h


//
// Define template prototypes for level-1f kernels.
//
// Each macro expands to one prototype named PASTEMAC(ch,opname), with data
// pointers of type ctype*.

#define AXPY2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alphax, \
       ctype*  restrict alphay, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict z, inc_t incz, \
       cntx_t* restrict cntx  \
     );


#define AXPYF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conja, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );


#define DOTAXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjxt, \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       ctype*  restrict z, inc_t incz, \
       cntx_t* restrict cntx  \
     );


#define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjat, \
       conj_t           conja, \
       conj_t           conjw, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict w, inc_t incw, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict z, inc_t incz, \
       cntx_t* restrict cntx  \
     );


#define DOTXF_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t           conjat, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx  \
     );

// end bli_l1f_ker_prot.h
// begin bli_l1m_ker_prot.h


//
// Define template prototypes for level-1m kernels.
//

// native packm kernels

#define PACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     );


// native unpackm kernels

#define UNPACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       dim_t            n, \
       ctype*  restrict kappa, \
       ctype*  restrict p,             inc_t ldp, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       cntx_t* restrict cntx  \
     );


// 1e/1r packm kernels
// (this prototype has the same parameter list as PACKM_KER_PROT)

#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     );

// end bli_l1m_ker_prot.h
// begin bli_l3_ukr_prot.h


//
// Define template prototypes for level-3 micro-kernels.
//

// GEMM_UKR_PROT is the common case of GEMM_UKR_PROT2 in which the input
// type (a, b) and the output type (alpha, beta, c) are the same.
#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)

#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype_out* restrict alpha, \
       ctype_in*  restrict a, \
       ctype_in*  restrict b, \
       ctype_out* restrict beta, \
       ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );


#define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a1x, \
       ctype*     restrict a11, \
       ctype*     restrict bx1, \
       ctype*     restrict b11, \
       ctype*     restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );


#define TRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       ctype*     restrict a, \
       ctype*     restrict b, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     );

// end bli_l3_ukr_prot.h
//
begin bli_l3_sup_ker_prot.h // // Define template prototypes for level-3 kernels on small/unpacked matrices. // #define GEMMSUP_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); // end bli_l3_sup_ker_prot.h // begin bli_arch_config_pre.h #ifndef BLIS_ARCH_CONFIG_PRE_H #define BLIS_ARCH_CONFIG_PRE_H // -- Naming-related kernel definitions ---------------------------------------- // The default suffix appended to reference kernels. #define BLIS_REF_SUFFIX _ref // A suffix used for labeling certain induced method aware functions. #define BLIS_IND_SUFFIX _ind // Add an underscore to the BLIS kernel set string, if it was defined. #ifdef BLIS_CNAME #define BLIS_CNAME_INFIX PASTECH(_,BLIS_CNAME) #endif // Combine the CNAME and _ref for convenience to the code that defines // reference kernels. //#define BLIS_CNAME_REF_SUFFIX PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX) // -- Prototype-generating macro definitions ----------------------------------- // Prototype-generating macro for bli_cntx_init_*() functions. 
#define CNTX_INIT_PROTS( archname ) \ \ void PASTEMAC(cntx_init_,archname) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \ ( \ cntx_t* cntx \ ); \ void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \ ( \ ind_t method, \ cntx_t* cntx \ ); #endif // end bli_arch_config_pre.h // begin bli_arch_config.h #ifndef BLIS_ARCH_CONFIG_H #define BLIS_ARCH_CONFIG_H // // -- Context initialization prototypes ---------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_CONFIG_SKX CNTX_INIT_PROTS( skx ) #endif #ifdef BLIS_CONFIG_KNL CNTX_INIT_PROTS( knl ) #endif #ifdef BLIS_CONFIG_KNC CNTX_INIT_PROTS( knc ) #endif #ifdef BLIS_CONFIG_HASWELL CNTX_INIT_PROTS( haswell ) #endif #ifdef BLIS_CONFIG_SANDYBRIDGE CNTX_INIT_PROTS( sandybridge ) #endif #ifdef BLIS_CONFIG_PENRYN CNTX_INIT_PROTS( penryn ) #endif // -- AMD64 architectures -- #ifdef BLIS_CONFIG_ZEN3 CNTX_INIT_PROTS( zen3 ) #endif #ifdef BLIS_CONFIG_ZEN2 CNTX_INIT_PROTS( zen2 ) #endif #ifdef BLIS_CONFIG_ZEN CNTX_INIT_PROTS( zen ) #endif #ifdef BLIS_CONFIG_EXCAVATOR CNTX_INIT_PROTS( excavator ) #endif #ifdef BLIS_CONFIG_STEAMROLLER CNTX_INIT_PROTS( steamroller ) #endif #ifdef BLIS_CONFIG_PILEDRIVER CNTX_INIT_PROTS( piledriver ) #endif #ifdef BLIS_CONFIG_BULLDOZER CNTX_INIT_PROTS( bulldozer ) #endif // -- ARM architectures -- #ifdef BLIS_CONFIG_ARMSVE CNTX_INIT_PROTS( armsve ) #endif #ifdef BLIS_CONFIG_A64FX CNTX_INIT_PROTS( a64fx ) #endif #ifdef BLIS_CONFIG_FIRESTORM CNTX_INIT_PROTS( firestorm ) #endif #ifdef BLIS_CONFIG_THUNDERX2 CNTX_INIT_PROTS( thunderx2 ) #endif #ifdef BLIS_CONFIG_CORTEXA57 CNTX_INIT_PROTS( cortexa57 ) #endif #ifdef BLIS_CONFIG_CORTEXA53 CNTX_INIT_PROTS( cortexa53 ) #endif #ifdef BLIS_CONFIG_CORTEXA15 CNTX_INIT_PROTS( cortexa15 ) #endif #ifdef BLIS_CONFIG_CORTEXA9 CNTX_INIT_PROTS( cortexa9 ) #endif // -- IBM Power -- #ifdef BLIS_CONFIG_POWER10 CNTX_INIT_PROTS( power10 ) #endif #ifdef BLIS_CONFIG_POWER9 CNTX_INIT_PROTS( power9 ) #endif #ifdef 
BLIS_CONFIG_POWER7 CNTX_INIT_PROTS( power7 ) #endif // -- IBM BG/Q -- #ifdef BLIS_CONFIG_BGQ CNTX_INIT_PROTS( bgq ) #endif // -- Generic -- #ifdef BLIS_CONFIG_GENERIC CNTX_INIT_PROTS( generic ) #endif // // -- Architecture family-specific headers ------------------------------------- // // -- x86_64 families -- #ifdef BLIS_FAMILY_INTEL64 #include "bli_family_intel64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64 #include "bli_family_amd64.h" // skipped #endif #ifdef BLIS_FAMILY_AMD64_LEGACY #include "bli_family_amd64_legacy.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64 #include "bli_family_x86_64.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_SKX #include "bli_family_x86_64_no_skx.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN2 #include "bli_family_x86_64_no_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_X86_64_NO_ZEN3 #include "bli_family_x86_64_no_zen3.h" // skipped #endif // -- Intel64 architectures -- #ifdef BLIS_FAMILY_SKX #include "bli_family_skx.h" // skipped #endif #ifdef BLIS_FAMILY_KNL #include "bli_family_knl.h" // skipped #endif #ifdef BLIS_FAMILY_KNC #include "bli_family_knc.h" // skipped #endif #ifdef BLIS_FAMILY_HASWELL #include "bli_family_haswell.h" // skipped #endif #ifdef BLIS_FAMILY_SANDYBRIDGE #include "bli_family_sandybridge.h" // skipped #endif #ifdef BLIS_FAMILY_PENRYN #include "bli_family_penryn.h" // skipped #endif // -- AMD64 architectures -- #ifdef BLIS_FAMILY_ZEN3 #include "bli_family_zen3.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN2 #include "bli_family_zen2.h" // skipped #endif #ifdef BLIS_FAMILY_ZEN #include "bli_family_zen.h" // skipped #endif #ifdef BLIS_FAMILY_EXCAVATOR #include "bli_family_excavator.h" // skipped #endif #ifdef BLIS_FAMILY_STEAMROLLER #include "bli_family_steamroller.h" // skipped #endif #ifdef BLIS_FAMILY_PILEDRIVER #include "bli_family_piledriver.h" // skipped #endif #ifdef BLIS_FAMILY_BULLDOZER #include "bli_family_bulldozer.h" // skipped #endif // -- ARM families -- #ifdef BLIS_FAMILY_ARM64 #include 
"bli_family_arm64.h" // skipped #endif #ifdef BLIS_FAMILY_ARM32 #include "bli_family_arm32.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_FAMILY_ARMSVE #include "bli_family_armsve.h" // skipped #endif #ifdef BLIS_FAMILY_A64FX #include "bli_family_a64fx.h" // skipped #endif #ifdef BLIS_FAMILY_FIRESTORM #include "bli_family_firestorm.h" // skipped #endif #ifdef BLIS_FAMILY_THUNDERX2 #include "bli_family_thunderx2.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA57 #include "bli_family_cortexa57.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA53 #include "bli_family_cortexa53.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA15 #include "bli_family_cortexa15.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA9 #include "bli_family_cortexa9.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_FAMILY_POWER10 #include "bli_family_power10.h" // skipped #endif #ifdef BLIS_FAMILY_POWER9 #include "bli_family_power9.h" // skipped #endif #ifdef BLIS_FAMILY_POWER7 #include "bli_family_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_FAMILY_BGQ #include "bli_family_bgq.h" // skipped #endif // -- Generic -- #ifdef BLIS_FAMILY_GENERIC // begin bli_family_generic.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_generic.h #endif // // -- kernel set prototypes ---------------------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_KERNELS_SKX #include "bli_kernels_skx.h" // skipped #endif #ifdef BLIS_KERNELS_KNL #include "bli_kernels_knl.h" // skipped #endif #ifdef BLIS_KERNELS_KNC #include "bli_kernels_knc.h" // skipped #endif #ifdef BLIS_KERNELS_HASWELL #include "bli_kernels_haswell.h" // skipped #endif #ifdef BLIS_KERNELS_SANDYBRIDGE #include "bli_kernels_sandybridge.h" // skipped #endif #ifdef BLIS_KERNELS_PENRYN #include "bli_kernels_penryn.h" // skipped #endif // -- AMD64 architectures -- #ifdef BLIS_KERNELS_ZEN2 #include "bli_kernels_zen2.h" // skipped #endif #ifdef BLIS_KERNELS_ZEN #include "bli_kernels_zen.h" 
// skipped #endif //#ifdef BLIS_KERNELS_EXCAVATOR //#include "bli_kernels_excavator.h" //#endif //#ifdef BLIS_KERNELS_STEAMROLLER //#include "bli_kernels_steamroller.h" //#endif #ifdef BLIS_KERNELS_PILEDRIVER #include "bli_kernels_piledriver.h" // skipped #endif #ifdef BLIS_KERNELS_BULLDOZER #include "bli_kernels_bulldozer.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_KERNELS_ARMSVE #include "bli_kernels_armsve.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV8A #include "bli_kernels_armv8a.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV7A #include "bli_kernels_armv7a.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_KERNELS_POWER10 #include "bli_kernels_power10.h" // skipped #endif #ifdef BLIS_KERNELS_POWER9 #include "bli_kernels_power9.h" // skipped #endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_KERNELS_BGQ #include "bli_kernels_bgq.h" // skipped #endif #endif // end bli_arch_config.h // begin bli_kernel_macro_defs.h #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H // -- Define default threading parameters -------------------------------------- // -- Conventional (large code path) values -- // These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n // dimensions for the purposes of factorizing the total number of threads into // ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these // macros are used. #ifndef BLIS_THREAD_RATIO_M #define BLIS_THREAD_RATIO_M 1 #endif #ifndef BLIS_THREAD_RATIO_N #define BLIS_THREAD_RATIO_N 1 #endif // These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of // parallelism allowed when performing automatic factorization. See bli_rntm.c // to see how these macros are used. 
#ifndef BLIS_THREAD_MAX_IR
#define BLIS_THREAD_MAX_IR 1
#endif

#ifndef BLIS_THREAD_MAX_JR
#define BLIS_THREAD_MAX_JR 4
#endif

#if 0
// -- Skinny/small possibly-unpacked (sup code path) values --

#ifndef BLIS_THREAD_SUP_RATIO_M
#define BLIS_THREAD_SUP_RATIO_M 1
#endif

#ifndef BLIS_THREAD_SUP_RATIO_N
#define BLIS_THREAD_SUP_RATIO_N 2
#endif

#ifndef BLIS_THREAD_SUP_MAX_IR
#define BLIS_THREAD_SUP_MAX_IR 1
#endif

#ifndef BLIS_THREAD_SUP_MAX_JR
#define BLIS_THREAD_SUP_MAX_JR 8
#endif
#endif

// -- Memory allocation --------------------------------------------------------

// hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with
// libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND
// was explicitly defined.
#ifdef BLIS_DISABLE_MEMKIND
#undef BLIS_ENABLE_MEMKIND
#endif

// NOTE: The header name must be present here; a bare "#include" is a
// preprocessing error whenever BLIS_ENABLE_MEMKIND is defined, and the
// hbw_malloc()/hbw_free() defaults selected below would be undeclared.
#ifdef BLIS_ENABLE_MEMKIND
#include <hbwmalloc.h> // skipped
#endif

// Memory allocation functions. These macros define the three types of
// malloc()-style functions, and their free() counterparts: one for each
// type of memory to be allocated.
// NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING
// THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc()
// and free():
//
//   void* malloc( size_t size );
//   void  free( void* p );
//

// This allocation function is called to allocate memory for blocks within
// BLIS's internal memory pools.
#ifndef BLIS_MALLOC_POOL
  // If use of libmemkind was enabled at configure-time, the default
  // memory allocation function for memory pools should be hbw_malloc()
  // instead of malloc().
  #ifdef BLIS_ENABLE_MEMKIND
  #define BLIS_MALLOC_POOL hbw_malloc
  #else
  #define BLIS_MALLOC_POOL malloc
  #endif
#endif

#ifndef BLIS_FREE_POOL
  // If use of libmemkind was enabled at configure-time, the default
  // memory deallocation function for memory pools should be hbw_free()
  // instead of free().
  #ifdef BLIS_ENABLE_MEMKIND
  #define BLIS_FREE_POOL hbw_free
  #else
  #define BLIS_FREE_POOL free
  #endif
#endif

// This allocation function is called to allocate memory for internally-
// used objects and structures, such as control tree nodes.
#ifndef BLIS_MALLOC_INTL
#define BLIS_MALLOC_INTL malloc
#endif

#ifndef BLIS_FREE_INTL
#define BLIS_FREE_INTL free
#endif

// This allocation function is called to allocate memory for objects
// created by user-level API functions, such as bli_obj_create().
#ifndef BLIS_MALLOC_USER
#define BLIS_MALLOC_USER malloc
#endif

#ifndef BLIS_FREE_USER
#define BLIS_FREE_USER free
#endif

// -- Other system-related definitions -----------------------------------------

// Size of a virtual memory page. This is used to align blocks within the
// memory pools.
#ifndef BLIS_PAGE_SIZE
#define BLIS_PAGE_SIZE 4096
#endif

// The maximum number of named SIMD vector registers available for use.
// When configuring with umbrella configuration families, this should be
// set to the maximum number of registers across all sub-configurations in
// the family.
#ifndef BLIS_SIMD_MAX_NUM_REGISTERS
#define BLIS_SIMD_MAX_NUM_REGISTERS 32
#endif

// The maximum size (in bytes) of each SIMD vector.
// When configuring with umbrella configuration families, this should be
// set to the maximum SIMD size across all sub-configurations in the family.
#ifndef BLIS_SIMD_MAX_SIZE
#define BLIS_SIMD_MAX_SIZE 64
#endif

// Alignment size (in bytes) needed by the instruction set for aligned
// SIMD/vector instructions.
#ifndef BLIS_SIMD_ALIGN_SIZE
#define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_MAX_SIZE
#endif

// The maximum size in bytes of local stack buffers within macro-kernel
// functions. These buffers are usually used to store a temporary copy
// of a single microtile. The reason we multiply by 2 is to handle induced
// methods, where we use real domain register blocksizes in units of
// complex elements. Specifically, the macro-kernels will need this larger
// micro-tile footprint, even though the virtual micro-kernels will only
// ever be writing to half (real or imaginary part) at a time.
#ifndef BLIS_STACK_BUF_MAX_SIZE
#define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_MAX_NUM_REGISTERS * \
                                  BLIS_SIMD_MAX_SIZE * 2 )
#endif

// Alignment size used to align local stack buffers within macro-kernel
// functions.
#ifndef BLIS_STACK_BUF_ALIGN_SIZE
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when allocating memory via BLIS_MALLOC_USER.
// To disable heap alignment, set this to 1.
#ifndef BLIS_HEAP_ADDR_ALIGN_SIZE
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when sizing leading dimensions of memory allocated
// via BLIS_MALLOC_USER.
#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment sizes used when allocating blocks to the internal memory
// pool, via BLIS_MALLOC_POOL.
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A
#define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B
#define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C
#define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN
#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE
#endif

// Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*.
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A #define BLIS_POOL_ADDR_OFFSET_SIZE_A 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B #define BLIS_POOL_ADDR_OFFSET_SIZE_B 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C #define BLIS_POOL_ADDR_OFFSET_SIZE_C 0 #endif #ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif #endif // end bli_kernel_macro_defs.h // -- Base operation prototypes -- // begin bli_init.h BLIS_EXPORT_BLIS void bli_init( void ); BLIS_EXPORT_BLIS void bli_finalize( void ); void bli_init_auto( void ); void bli_finalize_auto( void ); void bli_init_apis( void ); void bli_finalize_apis( void ); void bli_init_once( void ); void bli_finalize_once( void ); // end bli_init.h // begin bli_malloc.h // Typedef function pointer types for malloc() and free() substitutes. //typedef void* (*malloc_ft) ( size_t size ); //typedef void (*free_ft) ( void* p ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size ); BLIS_EXPORT_BLIS void bli_free_pool( void* p ); #endif void* bli_malloc_intl( size_t size, err_t* r_val ); void* bli_calloc_intl( size_t size, err_t* r_val ); void bli_free_intl( void* p ); BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val ); BLIS_EXPORT_BLIS void bli_free_user( void* p ); // ----------------------------------------------------------------------------- void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val ); void bli_ffree_align( free_ft f, void* p ); void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val ); void bli_ffree_noalign( free_ft f, void* p ); void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size ); void bli_fmalloc_post_check( void* p ); // end bli_malloc.h // begin bli_const.h void bli_const_init( void ); void bli_const_finalize( void ); // end bli_const.h // begin bli_obj.h // begin bli_obj_check.h void bli_obj_create_check( num_t dt, dim_t m, dim_t 
n, inc_t rs, inc_t cs, obj_t* obj ); void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj ); void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); void bli_obj_create_scalar_check( num_t dt, obj_t* obj ); void bli_obj_free_check( obj_t* obj ); void bli_obj_create_const_check( double value, obj_t* obj ); void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b ); void bli_dt_size_check( num_t dt ); void bli_dt_string_check( num_t dt ); void bli_dt_union_check( num_t dt1, num_t dt2 ); void bli_obj_print_check( char* label, obj_t* obj ); // end bli_obj_check.h BLIS_EXPORT_BLIS void bli_obj_create ( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer ( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_without_buffer ( num_t dt, dim_t m, dim_t n, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_alloc_buffer ( inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_attach_buffer ( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1 ( num_t dt, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer ( num_t dt, void* p, obj_t* obj ); BLIS_EXPORT_BLIS void bli_obj_create_conf_to ( obj_t* s, obj_t* d ); BLIS_EXPORT_BLIS void bli_obj_free ( obj_t* obj ); void bli_adjust_strides ( dim_t m, dim_t n, siz_t elem_size, inc_t* rs, inc_t* cs, inc_t* is ); BLIS_EXPORT_BLIS siz_t bli_dt_size ( num_t dt ); BLIS_EXPORT_BLIS char* bli_dt_string ( num_t dt ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult ( dim_t dim, dim_t dim_mult ); BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size ( dim_t dim, siz_t elem_size, siz_t align_size ); BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size ( void* p, size_t align_size ); BLIS_EXPORT_BLIS void bli_obj_print ( char* label, 
obj_t* obj ); // end bli_obj.h // begin bli_obj_scalar.h BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached ( num_t dt, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of ( num_t dt, conj_t conj, obj_t* alpha, obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_detach ( obj_t* a, obj_t* alpha ); BLIS_EXPORT_BLIS void bli_obj_scalar_attach ( conj_t conj, obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to ( num_t dt, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar ( obj_t* alpha, obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_reset ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag ( obj_t* a ); BLIS_EXPORT_BLIS bool bli_obj_scalar_equals ( obj_t* a, obj_t* beta ); // end bli_obj_scalar.h // begin bli_blksz.h // blksz_t query BLIS_INLINE dim_t bli_blksz_get_def ( num_t dt, blksz_t* b ) { return b->v[ dt ]; } BLIS_INLINE dim_t bli_blksz_get_max ( num_t dt, blksz_t* b ) { return b->e[ dt ]; } // blksz_t modification BLIS_INLINE void bli_blksz_set_def ( dim_t val, num_t dt, blksz_t* b ) { b->v[ dt ] = val; } BLIS_INLINE void bli_blksz_set_max ( dim_t val, num_t dt, blksz_t* b ) { b->e[ dt ] = val; } BLIS_INLINE void bli_blksz_copy ( blksz_t* b_src, blksz_t* b_dst ) { *b_dst = *b_src; } BLIS_INLINE void bli_blksz_copy_if_pos ( blksz_t* b_src, blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that // we can skip the ones that are non-positive. 
const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); } BLIS_INLINE void bli_blksz_copy_def_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); bli_blksz_set_def( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_max_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); bli_blksz_set_max( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_scale_def ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_def( dt, b ); bli_blksz_set_def( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_max( dt, b ); bli_blksz_set_max( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_def_max ( 
dim_t num, dim_t den, num_t dt, blksz_t* b ) { bli_blksz_scale_def( num, den, dt, b ); bli_blksz_scale_max( num, den, dt, b ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS blksz_t* bli_blksz_create ( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_easy ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z ); BLIS_EXPORT_BLIS void bli_blksz_free ( blksz_t* b ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); #endif void bli_blksz_reduce_def_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); void bli_blksz_reduce_max_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); // ----------------------------------------------------------------------------- dim_t bli_determine_blocksize ( dir_t direct, dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_b ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); dim_t bli_determine_blocksize_b_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); // end bli_blksz.h // begin bli_func.h // ----------------------------------------------------------------------------- 
// func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); // end bli_func.h // begin bli_mbool.h // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // ----------------------------------------------------------------------------- mbool_t* bli_mbool_create ( bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_init ( mbool_t* b, bool b_s, bool b_d, bool b_c, bool b_z ); void bli_mbool_free( mbool_t* b ); // end bli_mbool.h // begin bli_cntx.h #ifndef BLIS_CNTX_H #define BLIS_CNTX_H // Context object type (defined in bli_type_defs.h) // ----------------------------------------------------------------------------- // // -- cntx_t query (fields only) ----------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx ) { return cntx->blkszs; } BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) { return cntx->bmults; } 
BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) { return cntx->l3_vir_ukrs; } BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs; } BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) { return cntx->l3_nat_ukrs_prefs; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) { return cntx->l3_sup_thresh; } BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx ) { return cntx->l3_sup_blkszs; } BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) { return cntx->l3_sup_kers; } BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) { return cntx->l3_sup_kers_prefs; } BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) { return cntx->l1f_kers; } BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) { return cntx->l1v_kers; } BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) { return cntx->packm_kers; } BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) { return cntx->unpackm_kers; } BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) { return cntx->method; } // ----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. 
return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // 
----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. 
return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only 
index to the requested packm func_t if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* funcs = bli_cntx_packm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the packm func_t (and then extract the // datatype-specific function pointer) if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested unpackm func_t if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the unpackm func_t (and then extract the // datatype-specific function pointer) if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. 
return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. // NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. 
// NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } #if 0 // NOTE: These static functions aren't needed yet. 
BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { const num_t dt = bli_obj_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); } #endif // ----------------------------------------------------------------------------- // // -- cntx_t modification (complex) -------------------------------------------- // // NOTE: The framework does not use any of the following functions. We provide // them in order to facilitate creating/modifying custom contexts. 
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif // end bli_rntm.h // begin bli_gks.h #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* 
bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif // end bli_gks.h // begin bli_ind.h #ifndef BLIS_IND_H #define BLIS_IND_H // level-3 induced method management // begin bli_l3_ind.h #ifndef BLIS_L3_IND_H #define BLIS_L3_IND_H // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- //bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ); void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ); void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ); bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ); #endif // end bli_l3_ind.h void bli_ind_init( void ); void bli_ind_finalize( void ); BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t 
method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); char* bli_ind_get_impl_string( ind_t method ); num_t bli_ind_map_cdt_to_index( num_t dt ); #endif // end bli_ind.h // begin bli_pba.h #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H // Packing block allocator (formerly memory broker) // pba init //BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ // bli_pthread_mutex_init( &(pba->mutex), NULL ); //} //BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ // bli_pthread_mutex_destroy( &(pba->mutex) ); //} // pba query BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { return &(pba->pools[ pool_index ]); } BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { return pba->align_size; } BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { return pba->malloc_fp; } BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { return pba->free_fp; } // pba modification BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { pba->align_size = align_size; } BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { pba->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { pba->free_fp = free_fp; } // pba action BLIS_INLINE void bli_pba_lock( pba_t* pba ) { bli_pthread_mutex_lock( &(pba->mutex) ); } BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( cntx_t* cntx ); void bli_pba_finalize ( void ); 
void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif // end bli_pba.h // begin bli_pool.h #ifndef BLIS_POOL_H #define BLIS_POOL_H // -- Pool block type -- // -- Pool type -- // Pool block query BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) { return pblk->buf; } BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) { return pblk->block_size; } // Pool block modification BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk ) { pblk->buf = buf; } BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) { pblk->block_size = block_size; } // // -- pool block initialization ------------------------------------------------ // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the pblk_t type definition. An alternative to the initializer is // calling bli_pblk_clear() at runtime. 
#define BLIS_PBLK_INITIALIZER \ { \ .buf = NULL, \ .block_size = 0, \ } \ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); bli_pblk_set_block_size( 0, pblk ); } // Pool entry query BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) { return pool->block_size; } BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) { return pool->align_size; } BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) { return pool->offset_size; } BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) { return pool->malloc_fp; } BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) { return pool->free_fp; } BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); } // Pool entry modification BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ { pool->block_size = block_size; } BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ { pool->align_size = align_size; } BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ { pool->offset_size = offset_size; } BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ { pool->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* 
pool ) \ { pool->free_fp = free_fp; } BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } // ----------------------------------------------------------------------------- void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ); void bli_pool_finalize ( pool_t* restrict pool ); void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ); void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ); void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ); void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ); void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ); void bli_pool_print ( pool_t* restrict pool ); void bli_pblk_print ( pblk_t* restrict pblk ); #endif // end bli_pool.h // begin bli_array.h #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // 
----------------------------------------------------------------------------- void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ); void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ); void bli_array_finalize ( array_t* restrict array ); void* bli_array_elem ( const siz_t index, array_t* restrict array ); void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ); #endif // end bli_array.h // begin bli_apool.h #ifndef BLIS_APOOL_H #define BLIS_APOOL_H // -- Locked pool-of-arrays type -- // apool entry query BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool ) { return &(apool->pool); } BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) { return &(apool->mutex); } BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) { return pool->def_array_len; } BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) { pool_t* restrict pool = bli_apool_pool( apool ); return bli_pool_is_exhausted( pool ); } // apool action BLIS_INLINE void bli_apool_lock( apool_t* apool ) { bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); } BLIS_INLINE void bli_apool_unlock( apool_t* apool ) { bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); } // apool entry modification BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ { pool->def_array_len = def_array_len; } // ----------------------------------------------------------------------------- void bli_apool_init ( apool_t* restrict apool ); void bli_apool_finalize ( apool_t* restrict apool ); array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ); void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ); pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ); void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool ); void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ); void bli_apool_free_block 
( array_t* restrict array ); #endif // end bli_apool.h // begin bli_sba.h #ifndef BLIS_SBA_H #define BLIS_SBA_H apool_t* bli_sba_query( void ); // ----------------------------------------------------------------------------- void bli_sba_init( void ); void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( const siz_t n_threads ); void bli_sba_checkin_array ( array_t* restrict array ); void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm ); void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size ); void bli_sba_release ( rntm_t* restrict rntm, void* restrict block ); #endif // end bli_sba.h // begin bli_memsys.h #ifndef BLIS_MEMSYS_H #define BLIS_MEMSYS_H // ----------------------------------------------------------------------------- void bli_memsys_init( void ); void bli_memsys_finalize( void ); #endif // end bli_memsys.h // begin bli_mem.h #ifndef BLIS_MEM_H #define BLIS_MEM_H // mem_t object type (defined in bli_type_defs.h) // // -- mem_t query -------------------------------------------------------------- // BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) { return &(mem->pblk); } BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) { return bli_pblk_buf( bli_mem_pblk( mem ) ); } BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) { return mem->buf_type; } BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) { return mem->pool; } BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) { return mem->size; } BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); } // // -- mem_t modification ------------------------------------------------------- // BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem ) { mem->pblk = *pblk; } BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem ) { bli_pblk_set_buf( buf, &(mem->pblk) ); } BLIS_INLINE void bli_mem_set_buf_type( packbuf_t 
buf_type, mem_t* mem ) { mem->buf_type = buf_type; } BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem ) { mem->pool = pool; } BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; } // // -- mem_t initialization ----------------------------------------------------- // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the mem_t type definition. An alternative to the initializer is // calling bli_mem_clear() at runtime. #define BLIS_MEM_INITIALIZER \ { \ .pblk = BLIS_PBLK_INITIALIZER, \ .buf_type = -1, \ .pool = NULL, \ .size = 0, \ } \ BLIS_INLINE void bli_mem_clear( mem_t* mem ) { bli_mem_set_buffer( NULL, mem ); #ifdef __cplusplus const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE; // When using C++, which is strongly typed, we avoid use of -1 as a // packbuf_t value since it will result in a compile-time error. bli_mem_set_buf_type( pb, mem ); #else bli_mem_set_buf_type( ( packbuf_t )-1, mem ); #endif bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); } #endif // end bli_mem.h // begin bli_part.h // begin bli_part_check.h void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); // end bli_part_check.h // -- Matrix partitioning ------------------------------------------------------ BLIS_EXPORT_BLIS void bli_acquire_mpart ( dim_t i, dim_t j, dim_t m, dim_t n, obj_t* obj, obj_t* sub_obj ); #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) GENPROT( acquire_mpart_b2t ) GENPROT( acquire_mpart_l2r ) GENPROT( acquire_mpart_r2l ) GENPROT( acquire_mpart_tl2br ) GENPROT( 
acquire_mpart_br2tl ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ dir_t direct, \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_mdim ) GENPROT( acquire_mpart_ndim ) GENPROT( acquire_mpart_mndim ) // -- Vector partitioning ------------------------------------------------------ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_vpart_f2b ) GENPROT( acquire_vpart_b2f ) // -- Scalar acquisition ------------------------------------------------------- BLIS_EXPORT_BLIS void bli_acquire_mij ( dim_t i, dim_t j, obj_t* obj, obj_t* sub_obj ); BLIS_EXPORT_BLIS void bli_acquire_vi ( dim_t i, obj_t* obj, obj_t* sub_obj ); // end bli_part.h // begin bli_prune.h void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ); // end bli_prune.h // begin bli_query.h BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b ); BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a ); // end bli_query.h // begin bli_auxinfo.h #ifndef BLIS_AUXINFO_MACRO_DEFS_H #define BLIS_AUXINFO_MACRO_DEFS_H // auxinfo_t field query BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) { return ai->schema_a; } BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) { return ai->schema_b; } BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) { return ai->a_next; } BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) { return ai->b_next; } BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) { return ai->is_a; } BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) { return ai->is_b; } BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) { return ai->ps_a; } BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) { return ai->ps_b; } BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) { 
return ai->ukr; } BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) { return ai->params; } // auxinfo_t field modification BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai ) { ai->schema_a = schema; } BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) { ai->schema_b = schema; } BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) { ai->a_next = p; } BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) { ai->b_next = p; } BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; } BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai ) { ai->is_a = is; } BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai ) { ai->is_b = is; } BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai ) { ai->ps_a = ps; } BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai ) { ai->ps_b = ps; } BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { ai->ukr = ukr; } BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) { ai->params = params; } #endif // end bli_auxinfo.h // begin bli_param_map.h // --- BLIS to BLAS/LAPACK mappings -------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval ); // --- BLAS/LAPACK to BLIS mappings -------------------------------------------- // NOTE: These static functions were converted from regular functions in order // to reduce function call overhead within the BLAS compatibility layer. 
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( 
subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); err_t bli_check_valid_cntl( void* cntl ); err_t bli_check_packm_schema_on_unpack( obj_t* a ); err_t bli_check_packv_schema_on_unpack( obj_t* a ); err_t bli_check_object_buffer( obj_t* a ); err_t bli_check_valid_malloc_buf( void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); err_t bli_check_if_exhausted_pool( pool_t* pool ); err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); // end bli_check.h // begin bli_error.h BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void ); BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); void bli_print_msg( char* str, char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); char* bli_error_string_for_code( gint_t code ); // end bli_error.h // begin bli_f2c.h // f2c.h -- Standard Fortran to C header file // barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." // - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) #ifndef BLIS_F2C_H #define BLIS_F2C_H typedef f77_int bla_integer; typedef f77_char bla_character; //typedef char *address; //typedef short int shortint; typedef float bla_real; typedef double bla_double; typedef scomplex bla_scomplex; typedef dcomplex bla_dcomplex; typedef f77_int bla_logical; //typedef short int shortlogical; //typedef char logical1; //typedef char integer1; #ifdef INTEGER_STAR_8 // Adjust for integer*8. 
typedef long long longint; // system-dependent
typedef unsigned long long ulongint; // system-dependent
// Clear / set bit `b` of `a`; the mask is built in ulongint width.
#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b)))
#define qbit_set(a,b) ((a) | ((ulongint)1 << (b)))
#endif

// Boolean constants; only defined if not already provided.
#ifndef TRUE_
#define TRUE_ (1)
#endif

#ifndef FALSE_
#define FALSE_ (0)
#endif

// Extern is for use with -E
#ifndef Extern
#define Extern extern
#endif

// I/O stuff

// Both branches below define ftnlen as bla_integer; the original f2c
// alternatives are kept only as comments.
#ifdef f2c_i2
// for -i2
//typedef short flag;
//typedef short ftnlen;
typedef bla_integer ftnlen;
//typedef short ftnint;
#else
//typedef long int flag;
//typedef long int ftnlen;
typedef bla_integer ftnlen;
//typedef long int ftnint;
#endif

#ifndef VOID
#define VOID void
#endif

// NOTE: the function-like macros below evaluate their arguments more than
// once, so arguments with side effects should be avoided.

#ifndef f2c_abs
#define f2c_abs(x) ((x) >= 0 ? (x) : -(x))
#endif
// NOTE(review): f2c_dabs, f2c_dmin and f2c_dmax cast to `doublereal`, which
// is not declared in this section of the file (the typedefs above use the
// bla_ prefix, e.g. bla_double). Confirm a definition exists before using
// these macros. Their expansions are also not wrapped in outer parentheses.
#ifndef f2c_dabs
#define f2c_dabs(x) (doublereal)f2c_abs(x)
#endif
#ifndef f2c_min
#define f2c_min(a,b) ((a) <= (b) ? (a) : (b))
#endif
#ifndef f2c_max
#define f2c_max(a,b) ((a) >= (b) ? (a) : (b))
#endif
#ifndef f2c_dmin
#define f2c_dmin(a,b) (doublereal)f2c_min(a,b)
#endif
#ifndef f2c_dmax
#define f2c_dmax(a,b) (doublereal)f2c_max(a,b)
#endif

// bit_test yields bit `b` of `a` (0 or 1).
#ifndef bit_test
#define bit_test(a,b) ((a) >> (b) & 1)
#endif

// NOTE(review): bit_clear and bit_set cast to `uinteger`, which is likewise
// not declared in this section of the file -- confirm before use.
#ifndef bit_clear
#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b)))
#endif

#ifndef bit_set
#define bit_set(a,b) ((a) | ((uinteger)1 << (b)))
#endif

// undef any lower-case symbols that your C compiler predefines, e.g.:

#ifndef Skip_f2c_Undefs
#undef cray
#undef gcos
#undef mc68010
#undef mc68020
#undef mips
#undef pdp11
#undef sgi
#undef sparc
#undef sun
#undef sun2
#undef sun3
#undef sun4
#undef u370
#undef u3b
#undef u3b2
#undef u3b5
#undef unix
#undef vax
#endif

#endif
// end bli_f2c.h

// begin bli_machval.h

// begin bli_lsame.h
bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len );
// end bli_lsame.h
// begin bli_slamch.h
bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len );
// end bli_slamch.h
// begin bli_dlamch.h
bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len );
// end bli_dlamch.h

//
// Prototype object-based interface.
//
BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v );


//
// Prototype BLAS-like interfaces.
//
// Template for the typed machval prototypes; it is instantiated by the
// INSERT_GENTPROTR_BASIC0 invocation below (that macro is defined elsewhere).
#undef  GENTPROTR
#define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \
     ( \
       machval_t mval, \
       void*     v \
     );

INSERT_GENTPROTR_BASIC0( machval )

// end bli_machval.h
// begin bli_getopt.h

// State record passed to bli_getopt_init_state() and bli_getopt().
// NOTE(review): the field names match the globals of POSIX getopt();
// presumably they carry the same meaning -- confirm against the
// implementation.
typedef struct getopt_s
{
	char* optarg;
	int   optind;
	int   opterr;
	int   optopt;
} getopt_t;

BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state );

BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state );

// end bli_getopt.h
// begin bli_opid.h

// Return true when `opid` lies in the inclusive range [BLIS_GEMM, BLIS_TRSM].
BLIS_INLINE bool bli_opid_is_level3( opid_t opid )
{
	return ( bool )
	       ( BLIS_GEMM <= opid && opid <= BLIS_TRSM );
}

// end bli_opid.h
// begin bli_cntl.h


// -- Control tree prototypes --

BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node
     (
       rntm_t* rntm,
       opid_t  family,
       bszid_t bszid,
       void_fp var_func,
       void*   params,
       cntl_t* sub_node
     );

BLIS_EXPORT_BLIS void bli_cntl_free_node
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

BLIS_EXPORT_BLIS void bli_cntl_clear_node
     (
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_cntl_free
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo
     (
       rntm_t*    rntm,
       cntl_t*    cntl
     );

BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

BLIS_EXPORT_BLIS void bli_cntl_mark_family
     (
       opid_t  family,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

dim_t bli_cntl_calc_num_threads_in
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// cntl_t query (fields only)

// Return the `family` field of the control tree node.
BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl )
{
	return cntl->family;
}
BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl ) { return cntl->bszid; } BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl ) { return cntl->var_func; } BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl ) { return cntl->sub_prenode; } BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl ) { return cntl->sub_node; } BLIS_INLINE void* bli_cntl_params( cntl_t* cntl ) { return cntl->params; } BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl ) { // The first 64 bytes is always the size of the params structure. return *( ( uint64_t* )(cntl->params) ); } BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) { return &(cntl->pack_mem); } // cntl_t query (complex) BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl ) { return ( bool ) ( cntl == NULL ); } BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl ) { return ( bool ) ( bli_cntl_sub_node( cntl ) == NULL ); } BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl ) { return ( bool ) ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); } // cntl_t modification BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl ) { cntl->family = family; } BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl ) { cntl->bszid = bszid; } BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl ) { cntl->var_func = var_func; } BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl ) { cntl->sub_prenode = sub_prenode; } BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl ) { cntl->sub_node = sub_node; } BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl ) { cntl->params = params; } BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) { cntl->pack_mem = *pack_mem; } // end bli_cntl.h // begin bli_env.h #ifndef BLIS_ENV_H #define BLIS_ENV_H gint_t bli_env_get_var( const char* env, gint_t fallback ); //void bli_env_set_var( const char* env, dim_t value ); #endif // end bli_env.h // begin bli_pack.h #ifndef BLIS_PACK_H #define BLIS_PACK_H 
// Prototypes for the pack_a / pack_b settings. The getters write their result
// through a bool pointer; the setters take the new value directly.
void bli_pack_init( void );
void bli_pack_finalize( void );

BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a );
BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b );
BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a );
BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b );

void bli_pack_init_rntm_from_env( rntm_t* rntm );

#endif

// end bli_pack.h
// begin bli_info.h


// -- General library information ----------------------------------------------

BLIS_EXPORT_BLIS char* bli_info_get_version_str( void );
BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void );


// -- General configuration-related --------------------------------------------

// Each query below takes no arguments and returns a gint_t.

BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void );
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void );


// -- Kernel implementation-related --------------------------------------------


// -- Level-3 kernel definitions --

// Each query takes an ind_t method and a num_t datatype and returns a char*.

BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt );


// -- BLIS implementation query (level-3) --------------------------------------

// Each query takes a num_t datatype and returns a char*.

BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trmm3_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt );

// end bli_info.h
// begin bli_arch.h

#ifndef BLIS_ARCH_H
#define BLIS_ARCH_H

// Architecture-id query/set prototypes and logging helpers.
BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void );

void   bli_arch_set_id_once( void );
void   bli_arch_set_id( void );

BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id );

void   bli_arch_set_logging( bool dolog );
bool   bli_arch_get_logging( void );
// Variadic; the first argument is a char* followed by a variable number of
// arguments.
void   bli_arch_log( char*, ... );

#endif

// end bli_arch.h
// begin bli_cpuid.h

// The block below is compiled out (#if 0).
#if 0
  // Used only during standalone testing of ARM support.
  #define FALSE 0
  #define TRUE  1
  typedef enum
  {
	BLIS_ARCH_CORTEXA57 = 10,
	BLIS_ARCH_CORTEXA15 = 11,
	BLIS_ARCH_CORTEXA9  = 12,
	BLIS_ARCH_GENERIC   = 13
  } arch_t;
  typedef uint64_t bool;
  #define bli_abort abort
#endif

#ifndef BLIS_CPUID_H
#define BLIS_CPUID_H

arch_t   bli_cpuid_query_id( void );

// Intel
bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features );

// AMD
bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features );
bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features );

// ARM
bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );

uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features );

// -----------------------------------------------------------------------------

//
// This section of the file was based off of cpuid.hpp from TBLIS [1].
//
// [1] https://github.com/devinamatthews/tblis
//

// Return true only when every bit set in `want` is also set in `have`.
BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want )
{
	return ( have & want ) == want;
}

// -----------------------------------------------------------------------------

#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)

// cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393
// for more information why this move was made.
//#include "cpuid.h" void get_cpu_name( char *cpu_name ); int vpu_count( void ); enum { VENDOR_INTEL = 0, VENDOR_AMD, VENDOR_UNKNOWN }; enum { FEATURE_SSE3 = 0x0001, FEATURE_SSSE3 = 0x0002, FEATURE_SSE41 = 0x0004, FEATURE_SSE42 = 0x0008, FEATURE_AVX = 0x0010, FEATURE_AVX2 = 0x0020, FEATURE_FMA3 = 0x0040, FEATURE_FMA4 = 0x0080, FEATURE_AVX512F = 0x0100, FEATURE_AVX512DQ = 0x0200, FEATURE_AVX512PF = 0x0400, FEATURE_AVX512ER = 0x0800, FEATURE_AVX512CD = 0x1000, FEATURE_AVX512BW = 0x2000, FEATURE_AVX512VL = 0x4000 }; #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ); enum { VENDOR_ARM = 0, VENDOR_UNKNOWN }; enum { MODEL_ARMV7 = 0, MODEL_ARMV8, MODEL_UNKNOWN }; enum { FEATURE_NEON = 0x01, FEATURE_SVE = 0x02 }; #endif #endif // end bli_cpuid.h // begin bli_string.h void bli_string_mkupper( char* s ); // end bli_string.h // begin bli_setgetijm.h BLIS_EXPORT_BLIS err_t bli_setijm ( double ar, double ai, dim_t i, dim_t j, obj_t* b ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs \ ); INSERT_GENTPROT_BASIC0( setijm ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijm ( dim_t i, dim_t j, obj_t* b, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijm ) // end bli_setgetijm.h // begin bli_setgetijv.h BLIS_EXPORT_BLIS err_t bli_setijv ( double ar, double ai, dim_t i, obj_t* x ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ void* restrict x, inc_t incx \ ); INSERT_GENTPROT_BASIC0( 
setijv ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijv ( dim_t i, obj_t* x, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ void* restrict b, inc_t incx, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijv ) // end bli_setgetijv.h // begin bli_setri.h // -- setr --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setrm ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setrv ( obj_t* alpha, obj_t* x ); // -- seti --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setim ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setiv ( obj_t* alpha, obj_t* x ); // end bli_setri.h // begin bli_castm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castm ) INSERT_GENTPROT2_MIXDP0( castm ) // // Prototype object-based _check() function. // void bli_castm_check ( obj_t* a, obj_t* b ); // end bli_castm.h // begin bli_castnzm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castnzm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. 
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// GENTPROT / GENTPROTR are redefined before each group below and are
// instantiated by the INSERT_GENTPROT*_BASIC0 invocations (defined
// elsewhere). In the GENTPROTR templates, operands declared with `ctype_r`
// use the second type parameter of the template.

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi  \
     );

INSERT_GENTPROT_BASIC0( addsc )
INSERT_GENTPROT_BASIC0( divsc )
INSERT_GENTPROT_BASIC0( mulsc )
INSERT_GENTPROT_BASIC0( subsc )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi  \
     );

INSERT_GENTPROT_BASIC0( invertsc )


#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype*   chi, \
       ctype_r* absq  \
     );

INSERT_GENTPROTR_BASIC0( absqsc )
INSERT_GENTPROTR_BASIC0( normfsc )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype*  chi, \
       ctype*  psi  \
     );

INSERT_GENTPROT_BASIC0( sqrtsc )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype*  chi, \
       double* zeta_r, \
       double* zeta_i  \
     );

INSERT_GENTPROT_BASIC0( getsc )


#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double  zeta_r, \
       double  zeta_i, \
       ctype*  chi  \
     );

INSERT_GENTPROT_BASIC0( setsc )


#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype*   chi, \
       ctype_r* zeta_r, \
       ctype_r* zeta_i  \
     );

INSERT_GENTPROTR_BASIC0( unzipsc )


#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype_r* zeta_r, \
       ctype_r* zeta_i, \
       ctype*   chi  \
     );

INSERT_GENTPROTR_BASIC0( zipsc )

// -----------------------------------------------------------------------------

// Non-templated getsc/setsc prototypes whose scalar operand is a dim_t.

BLIS_EXPORT_BLIS void bli_igetsc
     (
       dim_t*  chi,
       double* zeta_r,
       double* zeta_i
     );

BLIS_EXPORT_BLIS void bli_isetsc
     (
       double  zeta_r,
       double  zeta_i,
       dim_t*  chi
     );

// end bli_l0_tapi.h
// begin bli_l0_ft.h


//
// -- Level-0 function types ---------------------------------------------------
//

// Each GENTDEF / GENTDEFR template below produces a function-pointer typedef
// (the typedef name is assembled by PASTECH2, defined elsewhere) whose
// parameter list mirrors the typed prototype of the same operation above.

// addsc, divsc, subsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi  \
     );

INSERT_GENTDEF( addsc )
INSERT_GENTDEF( divsc )
INSERT_GENTDEF( subsc )

// invertsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi  \
     );

INSERT_GENTDEF( invertsc )

// mulsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi  \
     );

INSERT_GENTDEF( mulsc )

// absqsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype*   chi, \
       ctype_r* absq  \
     );

INSERT_GENTDEFR( absqsc )

// normfsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype*   chi, \
       ctype_r* norm  \
     );

INSERT_GENTDEFR( normfsc )

// sqrtsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype*  chi, \
       ctype*  psi  \
     );

INSERT_GENTDEF( sqrtsc )

// getsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype*  chi, \
       double* zeta_r, \
       double* zeta_i  \
     );

INSERT_GENTDEF( getsc )

// setsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       double  zeta_r, \
       double  zeta_i, \
       ctype*  chi  \
     );

INSERT_GENTDEF( setsc )

// unzipsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype*   chi, \
       ctype_r* zeta_r, \
       ctype_r* zeta_i  \
     );

INSERT_GENTDEFR( unzipsc )

// zipsc

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype_r* zeta_r, \
       ctype_r* zeta_i, \
       ctype*   chi  \
     );

INSERT_GENTDEFR( zipsc )


// end bli_l0_ft.h

// Generate function pointer arrays for tapi functions.
// begin bli_l0_fpa.h


//
// Prototype function pointer query interface.
//

// Each GENPROT( opname ) declares a `<opname>_qfp` query that takes a num_t
// datatype and returns the `<opname>_vft` type (names assembled by
// PASTEMAC / PASTECH, defined elsewhere).
#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( absqsc )
GENPROT( normfsc )
GENPROT( addsc )
GENPROT( divsc )
GENPROT( mulsc )
GENPROT( subsc )
GENPROT( invertsc )
GENPROT( sqrtsc )
GENPROT( unzipsc )
GENPROT( zipsc )

GENPROT( getsc )
GENPROT( setsc )

// end bli_l0_fpa.h

// copysc
// begin bli_copysc.h


//
// Prototype object-based interfaces.
//

#undef  GENFRONT
#define GENFRONT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t*  chi, \
       obj_t*  psi  \
     );

GENFRONT( copysc )


//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//

// Both scalar operands are passed as void* in the typed copysc prototypes.
#undef  GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \
     ( \
       conj_t  conjchi, \
       void*   chi, \
       void*   psi  \
     );

INSERT_GENTPROT2_BASIC0( copysc )
INSERT_GENTPROT2_MIX_D0( copysc )
INSERT_GENTPROT2_MIX_P0( copysc )

// end bli_copysc.h
// end bli_l0.h


// -- Level-1v operations --

// begin bli_l1v.h

// begin bli_l1v_check.h


//
// Prototype object-based check functions.
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h"
// begin bli_l1v_ft_ker.h


#ifndef BLIS_L1V_FT_KER_H
#define BLIS_L1V_FT_KER_H


//
// -- Level-1v kernel function types -------------------------------------------
//

// NOTE: each GENTDEF below defines a pointer-to-function typedef whose name is
// pasted together by PASTECH3 from ch, opname, _ker and tsuf (PASTECH3 and
// INSERT_GENTDEF are defined elsewhere). All vector and scalar pointers are
// restrict-qualified, and every kernel type takes a trailing cntx_t* argument.
// GENTDEF is redefined per group, so the INSERT_GENTDEF lines must stay
// directly after the definition they are meant to use.

// addv, copyv, subv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( addv )
INSERT_GENTDEF( copyv )
INSERT_GENTDEF( subv )

// amaxv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       dim_t*  restrict index, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( amaxv )

// axpbyv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( axpbyv )

// axpyv, scal2v

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( axpyv )
INSERT_GENTDEF( scal2v )

// dotv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict rho, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( dotv )

// dotxv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       conj_t           conjy, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict beta, \
       ctype*  restrict rho, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( dotxv )

// invertv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( invertv )

// scalv, setv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjalpha, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( scalv )
INSERT_GENTDEF( setv )

// swapv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( swapv )

// xpbyv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       cntx_t*          cntx  \
     );

INSERT_GENTDEF( xpbyv )



#endif

// end bli_l1v_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion starts with a comma, so it is meant to be placed after
// the last regular parameter, which must itself have no trailing comma.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_l1v_oapi.h



//
// Prototype object-based interfaces.
//

// NOTE: the prototypes below are generated while EX_SUF is BLIS_OAPI_EX_SUF and
// BLIS_OAPI_EX_PARAMS is ",cntx_t* cntx, rntm_t* rntm" (both set just above by
// bli_oapi_ex.h), i.e. this is the expert pass. The last regular parameter in
// each list deliberately has no trailing comma because BLIS_OAPI_EX_PARAMS
// supplies its own leading comma.

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  index  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( amaxv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( axpbyv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  rho  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( dotv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  beta, \
       obj_t*  rho  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( dotxv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( invertv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( scalv )
GENTPROT( setv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( swapv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( xpbyv )

// end bli_l1v_oapi.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
// NOTE: the expansion consists of declarations and expression statements, so
// it is only usable inside a function body.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS   cntx_t* cntx = NULL; ( void )cntx; \
                             rntm_t* rntm = NULL; ( void )rntm;

// end bli_oapi_ba.h
// begin bli_l1v_oapi.h



//
// Prototype object-based interfaces.
//

// NOTE: this is the second expansion of bli_l1v_oapi.h. EX_SUF and
// BLIS_OAPI_EX_PARAMS were both defined to nothing by bli_oapi_ba.h just above,
// so these prototypes carry no name suffix and no extra trailing parameters
// (the basic pass).

#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  index  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( amaxv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( axpbyv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  rho  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( dotv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x, \
       obj_t*  y, \
       obj_t*  beta, \
       obj_t*  rho  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( dotxv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( invertv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  alpha, \
       obj_t*  x  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( scalv )
GENTPROT( setv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( swapv )


#undef  GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t*  x, \
       obj_t*  beta, \
       obj_t*  y  \
       BLIS_OAPI_EX_PARAMS  \
     );

GENTPROT( xpbyv )

// end bli_l1v_oapi.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h


// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion starts with a comma, so it is meant to follow a final
// regular parameter that has no trailing comma of its own.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1v_tapi.h



//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets the expansion follow the last
// regular parameter directly.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h

// begin bli_l1d_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Each GENTPROT/GENTPROTR below is a prototype template taking a C type, a
// type character and an operation name. The INSERT_GENTPROT*_BASIC0 macros
// that instantiate them are defined outside this section.
// NOTE(review): presumably they expand the template once per supported
// datatype -- confirm against their definition.
//

// addd, copyd, subd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid: alpha uses the second type parameter (ctype_r), x uses ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//
// Function-pointer typedefs whose parameter lists mirror the typed
// prototypes above. The typedef name is formed by PASTECH3 from ch, opname,
// EX_SUF and tsuf.
//

// addd, copyd, subd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//
// Each GENTPROT/GENTPROTR below is a prototype template taking a C type, a
// type character and an operation name. The function name is formed by
// PASTEMAC2 from ch, opname and EX_SUF, and BLIS_TAPI_EX_PARAMS follows the
// last regular parameter, so the result depends on how those macros are
// defined when this section is reached. The INSERT_GENTPROT*_BASIC0 macros
// that instantiate the templates are defined outside this section.
//

// addd, copyd, subd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid: alpha uses the second type parameter (ctype_r), x uses ctype.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//
// Function-pointer typedefs whose parameter lists mirror the typed
// prototypes above. The typedef name is formed by PASTECH3 from ch, opname,
// EX_SUF and tsuf.
//

// addd, copyd, subd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1d_fpa.h

//
// Prototype function pointer query interface.
//
// GENPROT declares, per operation, a function taking a num_t and returning
// the type named by PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft). The names use
// BLIS_TAPI_EX_SUF directly rather than EX_SUF, which is undefined here.
//

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addd )
GENPROT( copyd )
GENPROT( subd )
GENPROT( axpyd )
GENPROT( scal2d )
GENPROT( invertd )
GENPROT( scald )
GENPROT( setd )
GENPROT( setid )
GENPROT( shiftd )
GENPROT( xpbyd )

// end bli_l1d_fpa.h

// end bli_l1d.h

// -- Level-1f operations --

// begin bli_l1f.h

// begin bli_l1f_check.h

//
// Prototype object-based check functions.
//
// Each GENTPROT below declares <opname>_check (name formed by PASTEMAC)
// taking the object operands of the corresponding level-1f operation.
// These prototypes carry no export attribute and no expert parameters.
//

// axpy2v
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alphax, \
       obj_t* alphay, \
       obj_t* x, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* y \
     );

GENTPROT( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* xt, \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho, \
       obj_t* z \
     );

GENTPROT( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* at, \
       obj_t* a, \
       obj_t* w, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
     );

GENTPROT( dotxf )

// end bli_l1f_check.h

// Define kernel function types.
// begin bli_l1f_ft_ker.h

#ifndef BLIS_L1F_FT_KER_H
#define BLIS_L1F_FT_KER_H

//
// -- Level-1f kernel function types -------------------------------------------
//
// Function-pointer typedefs for the level-1f kernels. The typedef name is
// formed by PASTECH3 from ch, opname, _ker and tsuf. All pointer operands
// are restrict-qualified and every kernel takes a trailing cntx_t*.
//

// axpy2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha1, \
       ctype* restrict alpha2, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
// NOTE(review): the vector length is named m here; parameter names in a
// function-pointer typedef do not affect the type.
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict w, inc_t incw, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxaxpyf )

#endif

// end bli_l1f_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets the expansion follow the last
// regular parameter directly.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). 
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets the expansion follow the last
// regular parameter directly.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h

// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Each GENTPROT below is a prototype template taking a C type, a type
// character and an operation name. INSERT_GENTPROT_BASIC0, which
// instantiates it, is defined outside this section.
//

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// Function-pointer typedefs matching the typed prototypes above; the
// typedef name is formed by PASTECH3 from ch, opname, EX_SUF and tsuf.
// NOTE(review): some parameter names differ from the prototypes (alpha1 and
// alpha2 vs alphax and alphay for axpy2v; m vs n for dotaxpyv). Names in a
// function-pointer typedef do not affect the type.
//

// axpy2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions). EX_SUF expands to nothing in the basic pass.
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
// The expansion declares the locals cntx and rntm.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;

// end bli_tapi_ba.h

// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Each GENTPROT below is a prototype template taking a C type, a type
// character and an operation name. INSERT_GENTPROT_BASIC0, which
// instantiates it, is defined outside this section.
//

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// Function-pointer typedefs matching the typed prototypes above; the
// typedef name is formed by PASTECH3 from ch, opname, EX_SUF and tsuf.
// NOTE(review): some parameter names differ from the prototypes (alpha1 and
// alpha2 vs alphax and alphay for axpy2v; m vs n for dotaxpyv). Names in a
// function-pointer typedef do not affect the type.
//

// axpy2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1f_fpa.h

//
// Prototype function pointer query interface.
//
// GENPROT declares, per operation, a function taking a num_t and returning
// the type named by PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft). The names use
// BLIS_TAPI_EX_SUF directly rather than EX_SUF, which is undefined here.
//

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( axpy2v )
GENPROT( axpyf )
GENPROT( dotaxpyv )
GENPROT( dotxaxpyf )
GENPROT( dotxf )

// end bli_l1f_fpa.h

// end bli_l1f.h

// -- Level-1m operations --

// begin bli_l1m.h

// begin bli_l1m_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l1m_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addm ) INSERT_GENTPROT_BASIC0( copym ) INSERT_GENTPROT_BASIC0( subm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpym ) INSERT_GENTPROT_BASIC0( scal2m ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, 
inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )


#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h


//
// -- Level-1m function types --------------------------------------------------
//

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m
// (same parameter list as the axpym type above)

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalm ) INSERT_GENTDEF( setm ) // xpbym #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbym ) INSERT_GENTDEF( xpbym_md ) // end bli_l1m_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. 
The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1m_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addm ) INSERT_GENTPROT_BASIC0( copym ) INSERT_GENTPROT_BASIC0( subm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( axpym ) INSERT_GENTPROT_BASIC0( scal2m ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( xpbym ) #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ 
dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )

// end bli_l1m_tapi.h
// begin bli_l1m_ft.h


//
// -- Level-1m function types --------------------------------------------------
//

// addm, subm, copym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m
// (same parameter list as the axpym type above)

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbym ) INSERT_GENTDEF( xpbym_md ) // end bli_l1m_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1m_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) GENPROT( axpym ) GENPROT( scal2m ) GENPROT( scalm ) GENPROT( setm ) GENPROT( xpbym ) #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty ); GENPROT( xpbym_md ) // end bli_l1m_fpa.h // Prototype level-1m implementations. // begin bli_l1m_unb_var1.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( addm ) INSERT_GENTPROT_BASIC0( copym ) INSERT_GENTPROT_BASIC0( subm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( axpym ) INSERT_GENTPROT_BASIC0( scal2m ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ conj_t conjalpha, \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( scalm ) INSERT_GENTPROT_BASIC0( setm ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC2(ch,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( xpbym ) #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ void PASTEMAC3(chx,chy,opname,_unb_var1) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ trans_t transx, \ dim_t m, \ dim_t n, \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y, \ cntx_t* cntx, \ rntm_t* rntm \ ); INSERT_GENTPROT2_BASIC0( xpbym_md ) INSERT_GENTPROT2_MIXDP0( xpbym_md ) // end bli_l1m_unb_var1.h // Pack-related // begin bli_packm.h // begin bli_packm_alloc.h BLIS_EXPORT_BLIS void* 
bli_packm_alloc ( siz_t size_needed, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); BLIS_EXPORT_BLIS void* bli_packm_alloc_ex ( siz_t size_needed, packbuf_t pack_buf_type, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_alloc.h // begin bli_packm_cntl.h struct packm_params_s { uint64_t size; // size field must be present and come first. bszid_t bmid_m; bszid_t bmid_n; bool does_invert_diag; bool rev_iter_if_upper; bool rev_iter_if_lower; pack_t pack_schema; packbuf_t pack_buf_type; }; typedef struct packm_params_s packm_params_t; BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m; } BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n; } BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper; } BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower; } BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema; } BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type; } // ----------------------------------------------------------------------------- cntl_t* bli_packm_cntl_create_node ( rntm_t* rntm, void_fp var_func, bszid_t bmid_m, bszid_t bmid_n, bool does_invert_diag, bool rev_iter_if_upper, bool rev_iter_if_lower, pack_t pack_schema, packbuf_t pack_buf_type, cntl_t* sub_node ); // end bli_packm_cntl.h // begin 
bli_packm_check.h void bli_packm_init_check ( obj_t* a, obj_t* p, cntx_t* cntx ); void bli_packm_int_check ( obj_t* a, obj_t* p, cntx_t* cntx ); // end bli_packm_check.h // begin bli_packm_init.h BLIS_EXPORT_BLIS bool bli_packm_init ( obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_init.h // begin bli_packm_int.h void bli_packm_int ( obj_t* a, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // end bli_packm_int.h // begin bli_packm_scalar.h BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p ); // end bli_packm_scalar.h // begin bli_packm_part.h // -- Matrix partitioning ------------------------------------------------------ void bli_packm_acquire_mpart_t2b( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_packm_acquire_mpart_l2r( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p ); // end bli_packm_part.h // begin bli_packm_struc_cxk.h #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_struc_cxk ) INSERT_GENTPROT_BASIC0( packm_herm_cxk ) INSERT_GENTPROT_BASIC0( packm_tri_cxk ) // end bli_packm_struc_cxk.h // begin bli_packm_struc_cxk_1er.h #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ 
bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er ) INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er ) INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er ) // end bli_packm_struc_cxk_1er.h // begin bli_packm_cxk.h #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t panel_dim, \ dim_t panel_dim_max, \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_cxk ) // end bli_packm_cxk.h // begin bli_packm_cxk_1er.h #undef GENTPROTCO #define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ pack_t schema, \ dim_t panel_dim, \ dim_t panel_dim_max, \ dim_t panel_len, \ dim_t panel_len_max, \ ctype* kappa, \ ctype* a, inc_t inca, inc_t lda, \ ctype* p, inc_t ldp, \ cntx_t* cntx \ ); INSERT_GENTPROTCO_BASIC0( packm_cxk_1er ) // end bli_packm_cxk_1er.h // Mixed datatype support. 
#ifdef BLIS_ENABLE_GEMM_MD
// begin bli_packm_struc_cxk_md.h


// Mixed-datatype prototype: kappa and p use ctype_p, while c uses ctype_c.
#undef GENTPROT2
#define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \
\
void PASTEMAC2(chc,chp,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype_p* restrict kappa, \
       ctype_c* restrict c, inc_t incc, inc_t ldc, \
       ctype_p* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md )
INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md )


// Mixed-datatype prototypes: kappa and p use ctype_p, while a uses ctype_a.
#undef GENTPROT2
#define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \
\
void PASTEMAC2(cha,chp,opname) \
     ( \
       conj_t conja, \
       dim_t m, \
       dim_t n, \
       ctype_p* restrict kappa, \
       ctype_a* restrict a, inc_t inca, inc_t lda, \
       ctype_p* restrict p, inc_t ldp \
     );

INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md )

INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md )
INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md )

// end bli_packm_struc_cxk_md.h
#endif
// begin bli_packm_blk_var1.h


//
// packm params types.
//

typedef struct
{
    // Indexed as ukr_fn[ Type of C ][ Type of P ].
    packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES];
} packm_blk_var1_params_t;

//
// Prototype object-based interfaces.
//

BLIS_EXPORT_BLIS void bli_packm_blk_var1
     (
       obj_t* c,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* t
     );

// end bli_packm_blk_var1.h

// end bli_packm.h
// begin bli_unpackm.h

// begin bli_unpackm_cntl.h


struct unpackm_params_s
{
    uint64_t size; // size field must be present and come first.
    unpackm_var_oft var_func;
};
typedef struct unpackm_params_s unpackm_params_t;

// Reads the var_func field after casting (cntl)->params to unpackm_params_t*.
#define bli_cntl_unpackm_params_var_func( cntl ) \
\
    ( ( (unpackm_params_t*)(cntl)->params )->var_func )

// -----------------------------------------------------------------------------

// NOTE(review): unpackm_var_func arrives as a generic void_fp while the
// struct field above is typed unpackm_var_oft -- presumably it is stored
// there by the implementation; confirm.
cntl_t* bli_unpackm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       void_fp unpackm_var_func,
       cntl_t* sub_node
     );

// end bli_unpackm_cntl.h
// begin bli_unpackm_check.h


void bli_unpackm_int_check
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx
     );

// end bli_unpackm_check.h
// begin bli_unpackm_int.h


void bli_unpackm_int
     (
       obj_t* p,
       obj_t* a,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_unpackm_int.h

// begin bli_unpackm_blk_var1.h


void bli_unpackm_blk_var1
     (
       obj_t* p,
       obj_t* c,
       cntx_t* cntx,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// Typed prototypes for unpackm_blk_var1; p and c are passed as void* here
// rather than ctype*.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       doff_t diagoffc, \
       diag_t diagc, \
       uplo_t uploc, \
       trans_t transc, \
       dim_t m, \
       dim_t n, \
       dim_t m_panel, \
       dim_t n_panel, \
       void* p, inc_t rs_p, inc_t cs_p, \
       dim_t pd_p, inc_t ps_p, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_blk_var1 )

// end bli_unpackm_blk_var1.h

// begin bli_unpackm_cxk.h


#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conjp, \
       dim_t panel_dim, \
       dim_t panel_len, \
       ctype* kappa, \
       ctype* p, inc_t ldp, \
       ctype* a, inc_t inca, inc_t lda, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( unpackm_cxk )

// end bli_unpackm_cxk.h

// end bli_unpackm.h

// end bli_l1m.h

// -- Level-2 operations --

// begin bli_l2.h


// begin bli_l2_check.h


//
// Prototype object-based check functions.
// /* Four GENPROT templates follow, one per operand list: (alpha,a,x,beta,y) for gemv, hemv, symv; (alpha,x,y,a) for ger, her2, syr2; (alpha,x,a) for her, syr; and (alpha,a,x) for trmv, trsv. Each expansion declares a void function named PASTEMAC(opname,_check) that takes only obj_t* arguments. */ #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // /* Each GENTDEF or GENTDEFR block below typedefs a function-pointer type whose name is built by PASTECH3(ch,opname,_unb,tsuf); every signature in this header ends with a cntx_t* cntx argument. INSERT_GENTDEF and INSERT_GENTDEFR are defined outside this excerpt. */ // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) /* NOTE(review): GENTDEFR receives ctype_r and chr, but this signature uses neither; alpha is declared as ctype* here. Confirm this is intended for the _unb implementation type. */ #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). /* The object-API prototypes are emitted twice: first with bli_oapi_ex.h in effect (EX_SUF is BLIS_OAPI_EX_SUF and BLIS_OAPI_EX_PARAMS adds cntx_t* and rntm_t* parameters), then with bli_oapi_ba.h in effect (both macros empty). */ // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // /* Expert pass: bli_oapi_ex.h is in effect, so EX_SUF is BLIS_OAPI_EX_SUF and BLIS_OAPI_EX_PARAMS appends ",cntx_t* cntx, rntm_t* rntm" to every parameter list below. The operand lists are (alpha,a,x,beta,y), (alpha,x,y,a), (alpha,x,a) and (alpha,a,x). */ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h /* Resets every basic/expert selector, suffix, parameter and declaration macro so the next pass starts from a clean state. */ // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // /* Basic pass: EX_SUF and BLIS_OAPI_EX_PARAMS were just defined as empty, so the templates below expand to prototypes without a name suffix and without cntx/rntm parameters. */ #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). /* Same two-pass scheme as the object API: bli_tapi_ex.h (next) configures the expert pass, bli_tapi_ba.h configures the basic pass later on, and bli_xapi_undef.h resets the macros in between. */ // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// /* Expert-pass typed prototypes. Each template declares a BLIS_EXPORT_BLIS void function named PASTEMAC2(ch,opname,EX_SUF), with BLIS_TAPI_EX_PARAMS appended to the parameter list. The INSERT_GENTPROT_BASIC0 and INSERT_GENTPROTR_BASIC0 instantiation macros are defined outside this excerpt (presumably one expansion per supported datatype; confirm there). */ #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( gemv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( ger ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( hemv ) INSERT_GENTPROT_BASIC0( symv ) /* her is the only template in this header whose alpha is ctype_r* rather than ctype*. */ #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( her ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( syr ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( her2 ) INSERT_GENTPROT_BASIC0( syr2 ) #undef 
GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( trmv ) INSERT_GENTPROT_BASIC0( trsv ) // end bli_l2_tapi.h // begin bli_l2_ft.h // // -- Level-2 function types --------------------------------------------------- // /* These typedefs repeat the parameter lists of the typed prototypes above; the function-pointer type name is built by PASTECH3(ch,opname,EX_SUF,tsuf). */ // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( ger ) // hemv, symv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) // her #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( her ) // syr #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, 
inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( syr ) // her2, syr2 #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) // trmv, trsv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) // end bli_l2_ft.h /* End of the expert pass: the macros are reset next, and bli_l2_tapi.h and bli_l2_ft.h are then expanded a second time with the basic-mode definitions from bli_tapi_ba.h. */ // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). 
#undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. // /* Basic pass: EX_SUF and BLIS_TAPI_EX_PARAMS are defined as empty above, so the prototypes below carry no name suffix and no cntx/rntm parameters. */ #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( gemv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( ger ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( hemv ) INSERT_GENTPROT_BASIC0( symv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( her ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( syr ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( her2 ) INSERT_GENTPROT_BASIC0( syr2 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( trmv ) INSERT_GENTPROT_BASIC0( trsv ) // end bli_l2_tapi.h // begin bli_l2_ft.h // // -- Level-2 function types --------------------------------------------------- // /* Basic-pass function-pointer typedefs: EX_SUF and BLIS_TAPI_EX_PARAMS are still empty here, so the type names carry no suffix and the signatures have no cntx/rntm parameters. */ // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( ger ) // hemv, symv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ 
BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) // her /* As in the typed prototype, alpha is ctype_r* for her only. */ #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( her ) // syr #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( syr ) // her2, syr2 #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) // trmv, trsv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) // end bli_l2_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. 
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // /* One-argument GENPROT: declares, per operation, a function named PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp) that takes a num_t dt and returns the type PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft). */ #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // /* Two-argument GENPROT: the return type is PASTECH2(opname,_unb,_vft) and the declared function is PASTEMAC(varname,_qfp)( num_t dt ). Both the unb and unf variants below use the _unb return type. */ #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // /* Object-based gemv variants take a cntl_t* cntl after cntx; the typed variants further below end at cntx_t* cntx. PASTEMAC0(opname) builds the function name from the single opname token. */ #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // /* Object-based ger variants: operands (alpha, x, y, a) followed by cntx and cntl. */ #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// /* Typed ger variants (unb_var1, unb_var2): same operand order as the object-based form, ending with cntx_t* cntx. */ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // /* hemv variants carry a conj_t conjh argument: it is the first parameter of the object-based form and follows conjx in the typed form below. */ #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // /* Object-based her variants: conjh first, then (alpha, x, c), cntx and cntl. */ #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // /* NOTE(review): this template is GENTPROTR, yet ctype_r and chr are unused in the signature and alpha is declared as ctype*; the typed-API her prototype declares alpha as ctype_r*. Confirm the difference is intended. */ #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // /* Object-based her2 variants receive two scalar objects, alpha and alpha_conj, in addition to x, y and c. */ #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// /* Typed her2 variants take a single ctype* alpha (there is no alpha_conj argument in this form) and a conjh flag after conjy. */ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h /* bli_symv.h, bli_syr.h and bli_syr2.h contribute no declarations here: the only include in each is commented out. */ // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // /* Object-based trmv variants: operands (alpha, a, x) followed by cntx and cntl. The blk variants are declared separately for the _l_ and _u_ name infixes. */ #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// /* Typed trmv variants: (uploa, transa, diaga, m, alpha, a with rs_a and cs_a, x with incx, cntx). */ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // /* The trsv prototypes below use the same parameter lists as the trmv prototypes: (alpha, a, x, cntx, cntl) for the object-based form and (uploa, transa, diaga, m, alpha, a, x, cntx) for the typed form. */ #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
//

// Takes an existing control tree (cntl_orig) and reports the tree to use
// through the cntl_use out-parameter. NOTE(review): the "create if" name
// suggests a tree is only built when cntl_orig is NULL -- confirm in the
// implementation.
void bli_l3_cntl_create_if
     (
       opid_t family,
       pack_t schema_a,
       pack_t schema_b,
       obj_t* a,
       obj_t* b,
       obj_t* c,
       rntm_t* rntm,
       cntl_t* cntl_orig,
       cntl_t** cntl_use
     );

// Counterpart of bli_l3_cntl_create_if() for releasing cntl_use.
void bli_l3_cntl_free
     (
       rntm_t* rntm,
       cntl_t* cntl_use,
       thrinfo_t* thread
     );

// end bli_l3_cntl.h
// begin bli_l3_check.h

//
// Prototype object-based check functions.
//

// Each GENPROT( op ) below declares a function named from `op` and the
// `_check` suffix (via PASTEMAC, defined elsewhere). The template is
// redefined before each group so the parameter list matches that group's
// operations.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( herk )
GENPROT( syrk )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       cntx_t* cntx \
     );

GENPROT( trmm )
GENPROT( trsm )

// -----------------------------------------------------------------------------

// Non-templated "basic" check helpers. Note that the herk and her2k
// versions take extra operands (ah / bh) in addition to a / b.
void bli_gemm_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_gemmt_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_hemm_basic_check
     (
       side_t side,
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_herk_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_her2k_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* bh,
       obj_t* b,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_l3_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

// end bli_l3_check.h
// begin bli_l3_int.h

// Internal level-3 entry point; unlike the public object APIs it also takes
// a control tree node and the calling thread's thrinfo_t.
void bli_l3_int
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_int.h
// begin bli_l3_packab.h

// bli_l3_packa / bli_l3_packb share one signature.
void bli_l3_packa
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

void bli_l3_packb
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// end bli_l3_packab.h

// Define function types.
//#include "bli_l3_ft_ex.h"
// begin bli_l3_ft_ukr.h


#ifndef BLIS_L3_FT_UKR_H
#define BLIS_L3_FT_UKR_H


//
// -- Level-3 micro-kernel function types --------------------------------------
//

// Each GENTDEF below is a function-pointer typedef template whose type name
// is pasted from (ch, opname, _ukr, tsuf) by PASTECH3; INSERT_GENTDEF( op )
// instantiates it (presumably per datatype -- macro defined elsewhere).

// gemm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict beta, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemm )


// gemmtrsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a1x, \
       ctype* restrict a11, \
       ctype* restrict bx1, \
       ctype* restrict b11, \
       ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemmtrsm )


// trsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( trsm )


#endif

// end bli_l3_ft_ukr.h
// begin bli_l3_oft.h


#ifndef BLIS_L3_OFT_H
#define BLIS_L3_OFT_H


//
// -- Level-3 object function types --------------------------------------------
//

// Function-pointer typedefs named <opname>_oft (pasted by PASTECH); one
// template per distinct object-API signature.

// gemm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* b, \
  obj_t* beta, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( gemm )
GENTDEF( gemmt )
GENTDEF( her2k )
GENTDEF( syr2k )


// hemm, symm, trmm3

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  side_t side, \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* b, \
  obj_t* beta, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( hemm )
GENTDEF( symm )
GENTDEF( trmm3 )


// herk, syrk

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* beta, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( herk )
GENTDEF( syrk )


// trmm, trsm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
( \
  side_t side, \
  obj_t* alpha, \
  obj_t* a, \
  obj_t* b, \
  cntx_t* cntx, \
  rntm_t* rntm \
);

GENTDEF( trmm )
GENTDEF( trsm )



#endif

// end bli_l3_oft.h
// begin bli_l3_oft_var.h


#ifndef BLIS_L3_OFT_VAR_H
#define BLIS_L3_OFT_VAR_H


//
// -- Level-3 variant function types -------------------------------------------
//

// Single instantiation: GENTDEF( l3 ) yields the typedef pasted from
// (l3, _var_oft).
#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
( \
  obj_t* a, \
  obj_t* b, \
  obj_t* c, \
  cntx_t* cntx, \
  rntm_t* rntm, \
  cntl_t* cntl, \
  thrinfo_t* thread \
);

GENTDEF( l3 )



#endif

// end bli_l3_oft_var.h

// begin bli_l3_blocksize.h

// Generic KC query; takes a control tree node in addition to the parameters
// of the per-operation versions declared below.
dim_t bli_l3_determine_kc
      (
        dir_t direct,
        dim_t i,
        dim_t dim,
        obj_t* a,
        obj_t* b,
        bszid_t bszid,
        cntx_t* cntx,
        cntl_t* cntl
      );


// Per-operation versions that take the direction as a parameter.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dir_t direct, \
       dim_t i, \
       dim_t dim, \
       obj_t* a, \
       obj_t* b, \
       bszid_t bszid, \
       cntx_t* cntx \
     );

GENPROT( gemm_determine_kc )
GENPROT( gemmt_determine_kc )
GENPROT( trmm_determine_kc )
GENPROT( trsm_determine_kc )


// Per-operation, per-direction versions (_f / _b suffixes; presumably
// forward / backward -- confirm in the implementation).
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t i, \
       dim_t dim, \
       obj_t* a, \
       obj_t* b, \
       bszid_t bszid, \
       cntx_t* cntx \
     );

GENPROT( gemm_determine_kc_f )
GENPROT( gemm_determine_kc_b )

GENPROT( gemmt_determine_kc_f )
GENPROT( gemmt_determine_kc_b )

GENPROT( trmm_determine_kc_f )
GENPROT( trmm_determine_kc_b )

GENPROT( trsm_determine_kc_f )
GENPROT( trsm_determine_kc_b )

// end bli_l3_blocksize.h
// begin bli_l3_direct.h

// Generic direction query (takes a control tree node); the per-operation
// versions below take only the three operands.
dir_t bli_l3_direct
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

#undef GENPROT
#define GENPROT( opname ) \
\
dir_t PASTEMAC0(opname) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm_direct )
GENPROT( gemmt_direct )
GENPROT( trmm_direct )
GENPROT( trsm_direct )

// end bli_l3_direct.h
// begin bli_l3_prune.h


// Generic pruning entry points, one per dimension (m, n, k); the function
// name is pasted from l3_prune_unref_mparts_ and the dimension letter.
#undef GENPROT
#define GENPROT( dim ) \
\
void PASTEMAC(l3_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c, \
       cntl_t* cntl \
     );

GENPROT( m )
GENPROT( n )
GENPROT( k )

// -----------------------------------------------------------------------------

// Per-operation, per-dimension pruning functions (no control tree argument).
#undef GENPROT
#define GENPROT( opname, dim ) \
\
void PASTEMAC2(opname,_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm, m )
GENPROT( gemm, n )
GENPROT( gemm, k )

GENPROT( gemmt, m )
GENPROT( gemmt, n )
GENPROT( gemmt, k )

GENPROT( trmm, m )
GENPROT( trmm, n )
GENPROT( trmm, k )

GENPROT( trsm, m )
GENPROT( trsm, n )
GENPROT( trsm, k )

// end bli_l3_prune.h

// begin bli_l3_schema.h

void bli_l3_set_schemas
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx
     );

// end bli_l3_schema.h

// Prototype object APIs (basic and expert).
// begin bli_l3_oapi.h


//
// Prototype object-based interfaces (basic).
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h


//
// Prototype BLAS-like interfaces with typed operands (basic).
//

// Exported typed level-3 APIs. Matrices are passed as a pointer plus row and
// column strides (rs_*, cs_*). GENTPROTR templates additionally receive the
// real projection type (ctype_r), which herk uses for alpha/beta and her2k
// uses for beta.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( gemm )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       conj_t conja, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )


#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype_r* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROTR_BASIC0( herk )


#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROTR_BASIC0( her2k )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( syrk )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( trmm3 )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi.h
// begin bli_l3_tapi_ex.h


//
// Prototype BLAS-like interfaces with typed operands (expert).
//

// Expert typed level-3 APIs: the same operand lists as the basic typed APIs
// with trailing cntx_t* and rntm_t* parameters. The function name is pasted
// from the type character, the operation name and BLIS_TAPI_EX_SUF (defined
// elsewhere).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemm )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       conj_t conja, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )


#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype_r* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( herk )


#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( her2k )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( syrk )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm3 )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )

// end bli_l3_tapi_ex.h

// Define function types for small/unpacked handlers/kernels.
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
#undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_gemm_md_c2r_ref.h // Define a local struct type that makes returning two values easier. typedef struct mddm_s { dom_t comp; dom_t exec; } mddm_t; void bli_gemm_md ( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_local, cntx_t** cntx ); mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); // ----------------------------------------------------------------------------- void bli_gemm_md_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); void bli_gemm_md_zgemm ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary 
if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crr is already unconditionally associated with an // execution domain of BLIS_REAL.) if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_REAL ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since ccr is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) if ( bli_obj_is_complex( c ) && bli_obj_is_complex( a ) && bli_obj_is_real( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) { bool r_val = FALSE; // NOTE: The last conditional subexpression is necessary if/when we // allow the user to specify the computation domain. (The computation // domain is currently ignored, but once it is honored as a user- // settable value, it will affect the execution domain, which is what // is checked below. Until then, the last expression is not actually // necessary since crc is already unconditionally associated with an // execution domain of BLIS_COMPLEX.) 
if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && bli_obj_is_complex( b ) && bli_obj_exec_domain( c ) == BLIS_COMPLEX ) r_val = TRUE; return r_val; } // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemm_md_ker_var2_recast ( num_t* dt_comp, num_t dt_a, num_t dt_b, num_t* dt_c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, obj_t* c, inc_t* rs_c, inc_t* cs_c ) { if ( bli_is_real( *dt_c ) && bli_is_complex( dt_a ) && bli_is_complex( dt_b ) ) { // The rcc case is executed with a real macrokernel, so we need to // double the k dimension (because both A and B are packed to the 1r // schema), and also the panel strides of A and B since they were // packed as complex matrices and we now need to convert them to // units of real elements. *k *= 2; *ps_a *= 2; *ps_b *= 2; } else if ( bli_is_complex( *dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_row_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *n *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; } else #endif { // Generally speaking, the crc case is executed with a complex // macrokernel, so we need to halve the panel stride of A (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. 
*ps_a /= 2; } } else if ( bli_is_complex( *dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { #if 1 obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) && bli_obj_imag_is_zero( &beta ) && bli_is_col_stored( *rs_c, *cs_c ) && bli_obj_prec( c ) == bli_obj_comp_prec( c ) ) { // If beta is real, and C is not general-stored, and the computation // precision is equal to the storage precision of C, we can use the // real macrokernel (and real microkernel, which is already stored // to the real virtual microkernel slots of the context) instead of // the complex macrokernel and c2r virtual microkernel. *dt_comp = bli_dt_proj_to_real( *dt_comp ); *dt_c = bli_dt_proj_to_real( *dt_c ); *m *= 2; *pd_a *= 2; *ps_a *= 2; *cs_c *= 2; } else #endif { // Generally speaking, the ccr case is executed with a complex // macrokernel, so we need to halve the panel stride of B (which // is real) since the macrokernel will perform the pointer // arithmetic in units of complex elements. *ps_b /= 2; } } #if 0 else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. //printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k ); } else if ( bli_is_complex( dt_c ) && bli_is_real( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_complex( dt_a ) && bli_is_real( dt_b ) ) { // No action needed. } else if ( bli_is_real( dt_c ) && bli_is_real( dt_a ) && bli_is_complex( dt_b ) ) { // No action needed. 
} #endif } // end bli_gemm_md.h #endif // end bli_gemm.h // begin bli_hemm.h // begin bli_hemm_front.h void bli_hemm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_hemm_front.h // end bli_hemm.h // begin bli_symm.h // begin bli_symm_front.h void bli_symm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_symm_front.h // end bli_symm.h // begin bli_trmm.h // begin bli_trmm_front.h void bli_trmm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm_front.h // begin bli_trmm_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); //GENPROT( trmm_blk_var1 ) //GENPROT( trmm_blk_var2 ) //GENPROT( trmm_blk_var3 ) GENPROT( trmm_xx_ker_var2 ) GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) GENPROT( trmm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
//

// Typed gemmt macrokernel template. Compared with the trmm/trsm templates it
// takes diagoffc and carries an additional inc_t (is_a, is_b) for each of
// a and b.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffc, \
       pack_t schema_a, \
       pack_t schema_b, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       void* alpha, \
       void* a, inc_t cs_a, inc_t is_a, \
       dim_t pd_a, inc_t ps_a, \
       void* b, inc_t rs_b, inc_t is_b, \
       dim_t pd_b, inc_t ps_b, \
       void* beta, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

// end bli_gemmt_var.h

// end bli_gemmt.h

// end bli_l3.h

// -- Utility operations --

// begin bli_util.h

// begin bli_util_check.h


//
// Prototype object-based check functions.
//

// Each template below is redefined per signature and then instantiated once
// per operation; the declared function name is PASTEMAC(opname,_check).

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* asum \
     );

GENPROT( asumv )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x \
     );

GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* norm \
     );

GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )


// Same signature as the vector norms above; redefined for the matrix norms.
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* norm \
     );

GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x \
     );

GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* scale, \
       obj_t* sumsq \
     );

GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// NOTE(review): this one template is named GENTPROT although, like the
// surrounding GENPROT templates, it takes only an opname. It leaves GENTPROT
// defined with a single parameter until the next #undef GENTPROT.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* chi, \
       obj_t* psi, \
       bool* is_eq \
     );

GENTPROT( eqsc )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqv )
GENPROT( eqm )


#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// -----------------------------------------------------------------------------

// Non-templated check helpers (bli_utilv_* / bli_utilm_*).

void bli_utilv_xi_check
     (
       obj_t* x,
       obj_t* index
     );

void bli_utilv_xa_check
     (
       obj_t* x,
       obj_t* asum
     );

void bli_utilm_mkhst_check
     (
       obj_t* a
     );

void bli_utilv_norm_check
     (
       obj_t* x,
       obj_t* norm
     );

void bli_utilm_norm_check
     (
       obj_t* x,
       obj_t* norm
     );

void bli_utilm_fprint_check
     (
       FILE* file,
       char* s1,
       obj_t* x,
       char* format,
       char* s2
     );

void bli_utilm_rand_check
     (
       obj_t* x
     );

void bli_utilv_sumsqv_check
     (
       obj_t* x,
       obj_t* scale,
       obj_t* sumsq
     );

// end bli_util_check.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes. The leading comma lets the macro be appended directly
// after the last regular parameter.
#undef  BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_util_oapi.h


//
// Prototype object-based interfaces.
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 

// The prototypes in this guarded region use PASTEMAC0(opname) (no EX_SUF) and
// take no BLIS_OAPI_EX_PARAMS, so they are only emitted while
// BLIS_OAPI_BASIC is defined.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )


#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )


// Same as fprintv/fprintm minus the leading FILE* argument.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 

// The prototypes in this guarded region use PASTEMAC0(opname) (no EX_SUF) and
// take no BLIS_OAPI_EX_PARAMS, so they are only emitted while
// BLIS_OAPI_BASIC is defined.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )


#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )


// Same as fprintv/fprintm minus the leading FILE* argument.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h


// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h


// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef  BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef  EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The leading comma lets the macro be appended directly after the last
// regular parameter.
#undef  BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS   ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef  BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_util_tapi.h


//
// Prototype BLAS-like interfaces with typed operands.
//

// Each template declares an exported typed function named
// PASTEMAC2(ch,opname,EX_SUF) whose parameter list ends with
// BLIS_TAPI_EX_PARAMS. The GENTPROTR templates additionally take a second
// type parameter (ctype_r) used for the output arguments.

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( asumv )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( mkherm )
INSERT_GENTPROT_BASIC0( mksymm )
INSERT_GENTPROT_BASIC0( mktrim )


#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1v )
INSERT_GENTPROTR_BASIC0( normfv )
INSERT_GENTPROTR_BASIC0( normiv )


#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1m )
INSERT_GENTPROTR_BASIC0( normfm )
INSERT_GENTPROTR_BASIC0( normim )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randv )
INSERT_GENTPROT_BASIC0( randnv )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randm )
INSERT_GENTPROT_BASIC0( randnm )


#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.

// These prototypes use PASTEMAC(ch,opname) (no EX_SUF) and take no
// BLIS_TAPI_EX_PARAMS; they are only emitted while BLIS_TAPI_BASIC is
// defined.
#ifdef BLIS_TAPI_BASIC

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )


// printv/printm take x as void* and are instantiated with the _I flavor of
// the insertion macro (INSERT_GENTPROT_BASIC0_I).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t n, \
       void* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )


#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t m, \
       dim_t n, \
       void* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h


//
// -- Utility function types ---------------------------------------------------
//

// Each template below typedefs a function-pointer type named
// PASTECH3(ch,opname,EX_SUF,tsuf) whose parameter list mirrors the
// corresponding typed prototype above.

// asumv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// (This type and fprintm below do not append BLIS_TAPI_EX_PARAMS.)

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.

// These function-pointer types are named with PASTECH2(ch,opname,tsuf)
// (no EX_SUF) and are only emitted while BLIS_TAPI_BASIC is defined.
#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 
INSERT_GENTPROTR_BASIC0( sumsqv )


// -----------------------------------------------------------------------------

// Operations with basic interfaces only.
// These prototypes use PASTEMAC (no EX_SUF) and take no BLIS_TAPI_EX_PARAMS,
// and are emitted only while BLIS_TAPI_BASIC is defined.

#ifdef BLIS_TAPI_BASIC

// eqsc
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )


// eqv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )


// eqm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )


// printv (note: x is declared as void*, not ctype*)
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  n, \
       void*  x, inc_t incx, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )


// printm (note: x is declared as void*, not ctype*)
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       void*  x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h


//
// -- Utility function types ---------------------------------------------------
//

// Each template below typedefs a function-pointer type whose name is built by
// PASTECH3(ch,opname,EX_SUF,tsuf) and whose parameter list matches the
// corresponding prototype template in bli_util_tapi.h. INSERT_GENTDEF /
// INSERT_GENTDEFR are defined elsewhere; presumably they expand the template
// per datatype -- TODO confirm.

// asumv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv (no BLIS_TAPI_EX_PARAMS in this signature)

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm (no BLIS_TAPI_EX_PARAMS in this signature)

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.
// These typedef names are built with PASTECH2 (no EX_SUF component).

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_util_fpa.h


//
// Prototype function pointer query interface.
//

// Each GENPROT( op ) line declares one query function taking a num_t and
// returning a function-pointer type. Both the return type name and the
// function name are assembled by token pasting from the operation name.
// BLIS_TAPI_EX_SUF is not defined in this part of the header; it must be
// provided elsewhere -- TODO confirm where.

#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( asumv )
GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )
GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )
GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )
GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )
GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// Same query prototype, but for names built without the expert-suffix
// component (PASTECH / PASTEMAC instead of PASTECH2 / PASTEMAC2).

#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )
GENPROT( fprintv )
GENPROT( fprintm )
//GENPROT( printv )
//GENPROT( printm )

// end bli_util_fpa.h

// Prototype level-1m implementations.
// begin bli_util_unb_var1.h


//
// Prototype BLAS-like interfaces with typed operands.
//

// Unlike the typed-API templates, the *_unb_var1 prototypes below spell out
// cntx_t* cntx and rntm_t* rntm as explicit trailing parameters and carry no
// BLIS_EXPORT_BLIS qualifier (the fprintv/fprintm entries at the end are the
// exception on both counts).

// asumv_unb_var1
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum, \
       cntx_t*  cntx, \
       rntm_t*  rntm \
     );

INSERT_GENTPROTR_BASIC0( asumv_unb_var1 )


// mkherm_unb_var1, mksymm_unb_var1, mktrim_unb_var1
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( mkherm_unb_var1 )
INSERT_GENTPROT_BASIC0( mksymm_unb_var1 )
INSERT_GENTPROT_BASIC0( mktrim_unb_var1 )


// norm1v_unb_var1, normfv_unb_var1, normiv_unb_var1
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfv_unb_var1 )
INSERT_GENTPROTR_BASIC0( normiv_unb_var1 )


// norm1m_unb_var1, normfm_unb_var1, normim_unb_var1
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm \
     );

INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfm_unb_var1 )
INSERT_GENTPROTR_BASIC0( normim_unb_var1 )


// randv_unb_var1, randnv_unb_var1
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randv_unb_var1 )
INSERT_GENTPROT_BASIC0( randnv_unb_var1 )


// randm_unb_var1, randnm_unb_var1
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( randm_unb_var1 )
INSERT_GENTPROT_BASIC0( randnm_unb_var1 )


// sumsqv_unb_var1
#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq, \
       cntx_t*  cntx, \
       rntm_t*  rntm \
     );

INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 )

// -----------------------------------------------------------------------------

// eqv_unb_var1: returns bool and takes neither cntx nor rntm.
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy \
     );

INSERT_GENTPROT_BASIC0( eqv_unb_var1 )


// eqm_unb_var1: returns bool and takes neither cntx nor rntm.
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y \
     );

INSERT_GENTPROT_BASIC0( eqm_unb_var1 )


// fprintv (exported; first parameter is the FILE* to print to)
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintv )


// fprintm (exported; first parameter is the FILE* to print to)
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2 \
     );

INSERT_GENTPROT_BASIC0_I( fprintm )

// end bli_util_unb_var1.h
// end bli_util.h

// -- addon definitions --

// NOTE: These definitions should not be included much earlier since an addon
// may wish to utilize other types and definitions provided by BLIS.
// begin bli_addon.h


#ifndef BLIS_ADDON_H
#define BLIS_ADDON_H

// The literal "#if 0" hard-wires the BLIS_DISABLE_ADDONS branch in this copy
// of the header.
#if 0
#define BLIS_ENABLE_ADDONS
#else
#define BLIS_DISABLE_ADDONS
#endif

// Enabled addons


#endif
// end bli_addon.h

// -- sandbox implementation --

// begin bli_sbox.h


#ifndef BLIS_SBOX_H
#define BLIS_SBOX_H

// Each sandbox must have a bli_sandbox.h file present somewhere inside.
// If a sandbox was enabled at configure-time, we need to #include its
// header file here so that it will get pulled into blis.h when it is
// flattened into a monolithic header.
#ifdef BLIS_ENABLE_SANDBOX
#include "bli_sandbox.h" // skipped
#endif

#endif
// end bli_sbox.h

// -- BLAS compatibility layer --

// begin bli_blas.h


// If the CBLAS compatibility layer was enabled while the BLAS layer
// was not enabled, we must enable it here.
#ifdef BLIS_ENABLE_CBLAS
#ifndef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS
#endif
#endif // BLIS_ENABLE_CBLAS

// By default, if the BLAS compatibility layer is enabled, we define
// (include) all of the BLAS prototypes. However, if the user is
// #including "blis.h" and also #including another header that also
// declares the BLAS functions, then we provide an opportunity to
// #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below).
#ifdef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS_DEFS
#else
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the BLAS test drivers are being
// compiled.
#ifdef BLIS_VIA_BLASTEST
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the environment has defined the
// macro BLIS_DISABLE_BLAS_DEFS.
#ifdef BLIS_DISABLE_BLAS_DEFS
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Begin including all BLAS prototypes.
// Everything from here on is only visible when BLIS_ENABLE_BLAS_DEFS is
// defined; the matching #endif lies further down in this header.
#ifdef BLIS_ENABLE_BLAS_DEFS


// -- System headers needed by BLAS compatibility layer --

// NOTE(review): the header name was missing from this directive (a bare
// "#include" is ill-formed). <ctype.h> is restored here on the understanding
// that it is the system header this layer needs -- confirm against the
// pristine bli_blas.h. "skipped" means the header was left as an #include
// rather than being inlined into this monolithic file.
#include <ctype.h> // skipped


// -- Constants --

// Size of the buffer used to hold a BLAS routine name (7 characters plus one
// more byte).
#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)


// -- Utility macros --

// The bla_* helper prototypes below are each wrapped in their own
// "#ifdef BLIS_ENABLE_BLAS" guard and are not marked for export.

// begin bla_r_sign.h


#ifdef BLIS_ENABLE_BLAS

double bla_r_sign(const bla_real *a, const bla_real *b);

#endif
// end bla_r_sign.h
// begin bla_d_sign.h


#ifdef BLIS_ENABLE_BLAS

double bla_d_sign(const bla_double *a, const bla_double *b);

#endif
// end bla_d_sign.h

// begin bla_r_cnjg.h


#ifdef BLIS_ENABLE_BLAS

void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src);

#endif
// end bla_r_cnjg.h
// begin bla_d_cnjg.h


#ifdef BLIS_ENABLE_BLAS

void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src);

#endif
// end bla_d_cnjg.h

// begin bla_r_imag.h


#ifdef BLIS_ENABLE_BLAS

bla_real bla_r_imag(const bla_scomplex *z);

#endif
// end bla_r_imag.h
// begin bla_d_imag.h


#ifdef BLIS_ENABLE_BLAS

double bla_d_imag(const bla_dcomplex *z);

#endif
// end bla_d_imag.h

// begin bla_c_div.h


#ifdef BLIS_ENABLE_BLAS

void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp);

#endif
// end bla_c_div.h
// begin bla_z_div.h


#ifdef BLIS_ENABLE_BLAS

void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp);

#endif
// end bla_z_div.h

// begin bla_f__cabs.h


#ifdef BLIS_ENABLE_BLAS

double bla_f__cabs(double real, double imag);

#endif
// end bla_f__cabs.h
// begin bla_r_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_r_abs(const bla_real *x);

#endif
// end bla_r_abs.h
// begin bla_d_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_d_abs(const bla_double *x);

#endif
// end bla_d_abs.h
// begin bla_c_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_c_abs(const bla_scomplex *z);

#endif
// end bla_c_abs.h
// begin bla_z_abs.h


#ifdef BLIS_ENABLE_BLAS

double bla_z_abs(const bla_dcomplex *z);

#endif
// end bla_z_abs.h

// begin bla_lsame.h


#ifdef BLIS_ENABLE_BLAS

// Two alternative lsame signatures: with LAPACK_ILP64 the return type and the
// two length arguments are long; otherwise they are int. Only the non-ILP64
// variant carries BLIS_EXPORT_BLAS.
#ifdef LAPACK_ILP64
long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
#else
BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
#endif

#endif
// end bla_lsame.h
// begin bla_xerbla.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);

#endif
// end bla_xerbla.h
// begin bla_xerbla_array.h


#ifdef BLIS_ENABLE_BLAS

// Note: srname_len is passed by value here, whereas info is passed by pointer.
BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);

#endif
// end bla_xerbla_array.h


// -- Level-0 BLAS prototypes --

// begin bla_cabs1.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS bla_real   PASTEF77(s,cabs1)(bla_scomplex *z);
BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);

#endif
// end bla_cabs1.h


// -- Level-1 BLAS prototypes --

// For each routine below the prototype template is always (re)defined, but it
// is only instantiated, via an INSERT_*_BLAS macro defined elsewhere, when
// BLIS_ENABLE_BLAS is defined. All arguments are passed by pointer.

// begin bla_amax.h



//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end bla_amax.h
// begin bla_asum.h



//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end bla_asum.h
// begin bla_axpy.h



//
// Prototype BLAS-to-BLIS interfaces.
//
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
             ftype*   y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif

// end bla_axpy.h
// begin bla_copy.h



//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( copy ) #endif // end bla_copy.h // begin bla_dot.h #ifdef BLIS_ENABLE_BLAS // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTR_BLAS( dot ) #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL INSERT_GENTPROTDOTC_BLAS( dot ) #else // For the "intel" complex return type, we use a hidden parameter (passed by // address) to return the result. #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \ ( \ ftype* rhop, \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTC_BLAS( dot ) #endif // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS float PASTEF77(sd,sdot) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); BLIS_EXPORT_BLAS double PASTEF77(d,sdot) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); #endif // end bla_dot.h // begin bla_nrm2.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end bla_nrm2.h // begin bla_rot.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s); #endif // end bla_rot.h // begin bla_rotg.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s); BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s); #endif // end bla_rotg.h // begin bla_rotm.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam); BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam); #endif // end bla_rotm.h // begin bla_rotmg.h 
// rotmg: written out per datatype; only the fourth argument (sy1 / dy1) is
// const.
#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam);
BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);

#endif
// end bla_rotmg.h

// begin bla_scal.h



//
// Prototype BLAS-to-BLIS interfaces.
//
// scal: alpha and x have separate type parameters (ftype_a, ftype_x), and the
// routine name pastes chx before cha.
#undef  GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
     ( \
       const f77_int* n, \
       const ftype_a* alpha, \
       ftype_x* x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif

// end bla_scal.h
// begin bla_swap.h



//
// Prototype BLAS-to-BLIS interfaces.
//
// swap: both x and y are non-const.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       ftype*   x, const f77_int* incx, \
       ftype*   y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif

// end bla_swap.h

// begin f77_amax_sub.h



//
// Prototype CBLAS subroutine wrapper interfaces.
//
// The *_sub wrappers return void and take an extra trailing rval pointer.
// They are instantiated under BLIS_ENABLE_CBLAS rather than BLIS_ENABLE_BLAS.
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       f77_int* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end f77_amax_sub.h
// begin f77_asum_sub.h



//
// Prototype CBLAS subroutine wrapper interfaces.
//
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
       ftype_r* rval \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end f77_asum_sub.h
// begin f77_dot_sub.h



//
// Prototype CBLAS subroutine wrapper interfaces.
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTDOT_BLAS( dot ) // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval ); BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* rval ); #endif // end f77_dot_sub.h // begin f77_nrm2_sub.h // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end f77_nrm2_sub.h // -- Level-2 BLAS prototypes -- // dense // begin bla_gemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemv ) #endif // end bla_gemv.h // begin bla_ger.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, chxy, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \ ( \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTDOT_BLAS( ger ) #endif // end bla_ger.h // begin bla_hemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemv ) #endif // end bla_hemv.h // begin bla_her.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype_r* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her ) #endif // end bla_her.h // begin bla_her2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2 ) #endif // end bla_her2.h // begin bla_symv.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( symv ) #endif // end bla_symv.h // begin bla_syr.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr ) #endif // end bla_syr.h // begin bla_syr2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr2 ) #endif // end bla_syr2.h // begin bla_trmv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmv ) #endif // end bla_trmv.h // begin bla_trsv.h // // Prototype BLAS-to-BLIS interfaces. 
//
// trsv: same parameter list as the trmv prototype.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
    const f77_char* uploa, \
    const f77_char* transa, \
    const f77_char* diaga, \
    const f77_int*  m, \
    const ftype*    a, \
    const f77_int*  lda, \
          ftype*    x, \
    const f77_int*  incx \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif
// end bla_trsv.h

// begin bla_gemv_check.h

#ifdef BLIS_ENABLE_BLAS

// Parameter check for gemv. The first failing test selects a nonzero info
// code; the routine name is then formatted from dt_str and op_str, passed
// through bli_string_mkupper(), reported via xerbla, and the *enclosing*
// function returns. Must therefore be expanded inside a void function.
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
    f77_int info  = 0; \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !nota && !ta && !conja )  info = 1; \
    else if ( *m < 0 )                  info = 2; \
    else if ( *n < 0 )                  info = 3; \
    else if ( *lda < bli_max( 1, *m ) ) info = 6; \
    else if ( *incx == 0 )              info = 8; \
    else if ( *incy == 0 )              info = 11; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemv_check.h
// begin bla_ger_check.h

#ifdef BLIS_ENABLE_BLAS

// Parameter check for ger. The reported name is dt_str, op_str and conj_str
// concatenated (conj_str left-justified in a two-character field). On failure
// xerbla is called and the enclosing function returns.
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
    f77_int info = 0; \
\
    if      ( *m < 0 )                  info = 1; \
    else if ( *n < 0 )                  info = 2; \
    else if ( *incx == 0 )              info = 5; \
    else if ( *incy == 0 )              info = 7; \
    else if ( *lda < bli_max( 1, *m ) ) info = 9; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_ger_check.h
// begin bla_hemv_check.h

#ifdef BLIS_ENABLE_BLAS

// Parameter check for hemv (also reused for symv via bla_symv_check).
// uploa must compare equal to "L" or "U" under lsame. On failure xerbla is
// called and the enclosing function returns.
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper )        info = 1; \
    else if ( *m < 0 )                  info = 2; \
    else if ( *lda < bli_max( 1, *m ) ) info = 5; \
    else if ( *incx == 0 )              info = 7; \
    else if ( *incy == 0 )              info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_hemv_check.h
// begin bla_her_check.h

#ifdef BLIS_ENABLE_BLAS

// Parameter check for her (also reused for syr via bla_syr_check). On
// failure xerbla is called and the enclosing function returns.
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper )        info = 1; \
    else if ( *m < 0 )                  info = 2; \
    else if ( *incx == 0 )              info = 5; \
    else if ( *lda < bli_max( 1, *m ) ) info = 7; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her_check.h
// begin bla_her2_check.h

#ifdef BLIS_ENABLE_BLAS

// Parameter check for her2 (also reused for syr2 via bla_syr2_check). On
// failure xerbla is called and the enclosing function returns.
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper )        info = 1; \
    else if ( *m < 0 )                  info = 2; \
    else if ( *incx == 0 )              info = 5; \
    else if ( *incy == 0 )              info = 7; \
    else if ( *lda < bli_max( 1, *m ) ) info = 9; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her2_check.h
// begin bla_symv_check.h

#ifdef BLIS_ENABLE_BLAS

// symv shares the hemv parameter check.
#define bla_symv_check bla_hemv_check

#endif
// end bla_symv_check.h
// begin bla_syr_check.h

#ifdef BLIS_ENABLE_BLAS

// syr shares the her parameter check.
#define bla_syr_check bla_her_check

#endif
// end bla_syr_check.h
// begin bla_syr2_check.h

#ifdef BLIS_ENABLE_BLAS

// syr2 shares the her2 parameter check.
#define bla_syr2_check bla_her2_check

#endif
// end bla_syr2_check.h
// begin bla_trmv_check.h

#ifdef BLIS_ENABLE_BLAS

// Parameter check for trmv (also reused for trsv via bla_trsv_check).
// uploa must match "L"/"U", transa "N"/"T"/"C" and diaga "U"/"N" under lsame.
// On failure xerbla is called and the enclosing function returns.
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
    f77_int info  = 0; \
    f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper )        info = 1; \
    else if ( !nota && !ta && !conja )  info = 2; \
    else if ( !unita && !nonua )        info = 3; \
    else if ( *m < 0 )                  info = 4; \
    else if ( *lda < bli_max( 1, *m ) ) info = 6; \
    else if ( *incx == 0 )              info = 8; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_trmv_check.h
// begin bla_trsv_check.h

#ifdef BLIS_ENABLE_BLAS

// trsv shares the trmv parameter check.
#define bla_trsv_check bla_trmv_check

#endif
// end bla_trsv_check.h

// packed
// begin bla_hpmv.h

#ifdef BLIS_ENABLE_BLAS

// hpmv prototypes (c and z variants); ap is passed without a leading
// dimension argument.
BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)
(
    const bla_character *uplo, const bla_integer *n,
    const bla_scomplex *alpha, const bla_scomplex *ap,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *beta,
    bla_scomplex *y, const bla_integer *incy
);
BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)
(
    const bla_character *uplo, const bla_integer *n,
    const bla_dcomplex *alpha, const bla_dcomplex *ap,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *beta,
    bla_dcomplex *y, const bla_integer *incy
);

#endif
// end bla_hpmv.h
// begin bla_hpr.h

#ifdef BLIS_ENABLE_BLAS

// hpr prototypes; alpha is bla_real / bla_double while x and ap are complex.
BLIS_EXPORT_BLAS int PASTEF77(c,hpr)
(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha,
    const bla_scomplex *x, const bla_integer *incx,
    bla_scomplex *ap
);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr)
(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha,
    const bla_dcomplex *x, const bla_integer *incx,
    bla_dcomplex *ap
);

#endif
// end bla_hpr.h
// begin bla_hpr2.h

#ifdef BLIS_ENABLE_BLAS

// hpr2 prototypes (c and z variants).
BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)
(
    const bla_character *uplo, const bla_integer *n,
    const bla_scomplex *alpha,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *y, const bla_integer *incy,
    bla_scomplex *ap
);
BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)
(
    const bla_character *uplo, const bla_integer *n,
    const bla_dcomplex *alpha,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *y, const bla_integer *incy,
    bla_dcomplex *ap
);

#endif
// end bla_hpr2.h
// begin bla_spmv.h

#ifdef BLIS_ENABLE_BLAS

// spmv prototypes (d and s variants).
BLIS_EXPORT_BLAS int PASTEF77(d,spmv)
(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha, const bla_double *ap,
    const bla_double *x, const bla_integer *incx,
    const bla_double *beta,
    bla_double *y, const bla_integer *incy
);
BLIS_EXPORT_BLAS int PASTEF77(s,spmv)
(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha, const bla_real *ap,
    const bla_real *x, const bla_integer *incx,
    const bla_real *beta,
    bla_real *y, const bla_integer *incy
);

#endif
// end bla_spmv.h
// begin bla_spr.h

#ifdef BLIS_ENABLE_BLAS

// spr prototypes (d and s variants).
BLIS_EXPORT_BLAS int PASTEF77(d,spr)
(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha,
    const bla_double *x, const bla_integer *incx,
    bla_double *ap
);
BLIS_EXPORT_BLAS int PASTEF77(s,spr)
(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha,
    const bla_real *x, const bla_integer *incx,
    bla_real *ap
);

#endif
// end bla_spr.h
// begin bla_spr2.h

#ifdef BLIS_ENABLE_BLAS

// spr2 prototypes (d and s variants).
BLIS_EXPORT_BLAS int PASTEF77(d,spr2)
(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha,
    const bla_double *x, const bla_integer *incx,
    const bla_double *y, const bla_integer *incy,
    bla_double *ap
);
BLIS_EXPORT_BLAS int PASTEF77(s,spr2)
(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha,
    const bla_real *x, const bla_integer *incx,
    const bla_real *y, const bla_integer *incy,
    bla_real *ap
);

#endif
// end bla_spr2.h
// begin bla_tpmv.h

#ifdef BLIS_ENABLE_BLAS

// tpmv prototypes (c, d, s, z); ap is read-only and x is writable.
BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_scomplex *ap,
    bla_scomplex *x, const bla_integer *incx
);
BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_double *ap,
    bla_double *x, const bla_integer *incx
);
BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_real *ap,
    bla_real *x, const bla_integer *incx
);
BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_dcomplex *ap,
    bla_dcomplex *x, const bla_integer *incx
);

#endif
// end bla_tpmv.h
// begin bla_tpsv.h

#ifdef BLIS_ENABLE_BLAS

// tpsv prototypes (c, d, s, z); same parameter list as tpmv.
BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_scomplex *ap,
    bla_scomplex *x, const bla_integer *incx
);
BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_double *ap,
    bla_double *x, const bla_integer *incx
);
BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_real *ap,
    bla_real *x, const bla_integer *incx
);
BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_dcomplex *ap,
    bla_dcomplex *x, const bla_integer *incx
);

#endif
// end bla_tpsv.h

// banded
// begin bla_gbmv.h

#ifdef BLIS_ENABLE_BLAS

// gbmv prototypes (c, d, s, z); kl and ku are passed in addition to m and n.
BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)
(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_scomplex *alpha,
    const bla_scomplex *a, const bla_integer *lda,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *beta,
    bla_scomplex *y, const bla_integer *incy
);
BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)
(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_double *alpha,
    const bla_double *a, const bla_integer *lda,
    const bla_double *x, const bla_integer *incx,
    const bla_double *beta,
    bla_double *y, const bla_integer *incy
);
BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)
(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_real *alpha,
    const bla_real *a, const bla_integer *lda,
    const bla_real *x, const bla_integer *incx,
    const bla_real *beta,
    bla_real *y, const bla_integer *incy
);
BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)
(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_dcomplex *alpha,
    const bla_dcomplex *a, const bla_integer *lda,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *beta,
    bla_dcomplex *y, const bla_integer *incy
);

#endif
// end bla_gbmv.h
// begin bla_hbmv.h
#ifdef BLIS_ENABLE_BLAS

// hbmv prototypes (c and z variants); k is passed in addition to n, and y is
// the only writable operand.
BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)
(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_scomplex *alpha,
    const bla_scomplex *a, const bla_integer *lda,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *beta,
    bla_scomplex *y, const bla_integer *incy
);
BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)
(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_dcomplex *alpha,
    const bla_dcomplex *a, const bla_integer *lda,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *beta,
    bla_dcomplex *y, const bla_integer *incy
);

#endif
// end bla_hbmv.h
// begin bla_sbmv.h

#ifdef BLIS_ENABLE_BLAS

// sbmv prototypes (d and s variants); same parameter list as hbmv.
BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)
(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_double *alpha,
    const bla_double *a, const bla_integer *lda,
    const bla_double *x, const bla_integer *incx,
    const bla_double *beta,
    bla_double *y, const bla_integer *incy
);
BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)
(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_real *alpha,
    const bla_real *a, const bla_integer *lda,
    const bla_real *x, const bla_integer *incx,
    const bla_real *beta,
    bla_real *y, const bla_integer *incy
);

#endif
// end bla_sbmv.h
// begin bla_tbmv.h

#ifdef BLIS_ENABLE_BLAS

// tbmv prototypes (c, d, s, z); a is read-only and x is writable.
BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_scomplex *a, const bla_integer *lda,
    bla_scomplex *x, const bla_integer *incx
);
BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_double *a, const bla_integer *lda,
    bla_double *x, const bla_integer *incx
);
BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_real *a, const bla_integer *lda,
    bla_real *x, const bla_integer *incx
);
BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_dcomplex *a, const bla_integer *lda,
    bla_dcomplex *x, const bla_integer *incx
);

#endif
// end bla_tbmv.h
// begin bla_tbsv.h

#ifdef BLIS_ENABLE_BLAS

// tbsv prototypes (c, d, s, z); same parameter list as tbmv.
BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_scomplex *a, const bla_integer *lda,
    bla_scomplex *x, const bla_integer *incx
);
BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_double *a, const bla_integer *lda,
    bla_double *x, const bla_integer *incx
);
BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_real *a, const bla_integer *lda,
    bla_real *x, const bla_integer *incx
);
BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)
(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_dcomplex *a, const bla_integer *lda,
    bla_dcomplex *x, const bla_integer *incx
);

#endif
// end bla_tbsv.h


// -- Level-3 BLAS prototypes --

// begin bla_gemm.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// gemm: c is the only writable operand; everything is passed by pointer.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
    const f77_char* transa, \
    const f77_char* transb, \
    const f77_int*  m, \
    const f77_int*  n, \
    const f77_int*  k, \
    const ftype*    alpha, \
    const ftype*    a, \
    const f77_int*  lda, \
    const ftype*    b, \
    const f77_int*  ldb, \
    const ftype*    beta, \
          ftype*    c, \
    const f77_int*  ldc \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm )
#endif
// end bla_gemm.h
// begin bla_hemm.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// hemm: the ftype_r and chr macro parameters are accepted but not used by
// this prototype.
#undef GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
    const f77_char* side, \
    const f77_char* uploa, \
    const f77_int*  m, \
    const f77_int*  n, \
    const ftype*    alpha, \
    const ftype*    a, \
    const f77_int*  lda, \
    const ftype*    b, \
    const f77_int*  ldb, \
    const ftype*    beta, \
          ftype*    c, \
    const f77_int*  ldc \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hemm )
#endif
// end bla_hemm.h
// begin bla_herk.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// herk: alpha and beta use the ftype_r type while a and c use ftype.
#undef GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
    const f77_char* uploc, \
    const f77_char* transa, \
    const f77_int*  m, \
    const f77_int*  k, \
    const ftype_r*  alpha, \
    const ftype*    a, \
    const f77_int*  lda, \
    const ftype_r*  beta, \
          ftype*    c, \
    const f77_int*  ldc \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( herk )
#endif
// end bla_herk.h
// begin bla_her2k.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// her2k: alpha uses ftype while beta uses ftype_r.
#undef GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
    const f77_char* uploc, \
    const f77_char* transa, \
    const f77_int*  m, \
    const f77_int*  k, \
    const ftype*    alpha, \
    const ftype*    a, \
    const f77_int*  lda, \
    const ftype*    b, \
    const f77_int*  ldb, \
    const ftype_r*  beta, \
          ftype*    c, \
    const f77_int*  ldc \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( her2k )
#endif
// end bla_her2k.h
// begin bla_symm.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// symm: c is the only writable operand.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
    const f77_char* side, \
    const f77_char* uploa, \
    const f77_int*  m, \
    const f77_int*  n, \
    const ftype*    alpha, \
    const ftype*    a, \
    const f77_int*  lda, \
    const ftype*    b, \
    const f77_int*  ldb, \
    const ftype*    beta, \
          ftype*    c, \
    const f77_int*  ldc \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( symm )
#endif
// end bla_symm.h
// begin bla_syrk.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// syrk: alpha and beta share the element type ftype.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
    const f77_char* uploc, \
    const f77_char* transa, \
    const f77_int*  m, \
    const f77_int*  k, \
    const ftype*    alpha, \
    const ftype*    a, \
    const f77_int*  lda, \
    const ftype*    beta, \
          ftype*    c, \
    const f77_int*  ldc \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( syrk )
#endif
// end bla_syrk.h
// begin bla_syr2k.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// syr2k: same as syrk with an additional (b, ldb) matrix operand.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
    const f77_char* uploc, \
    const f77_char* transa, \
    const f77_int*  m, \
    const f77_int*  k, \
    const ftype*    alpha, \
    const ftype*    a, \
    const f77_int*  lda, \
    const ftype*    b, \
    const f77_int*  ldb, \
    const ftype*    beta, \
          ftype*    c, \
    const f77_int*  ldc \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( syr2k )
#endif
// end bla_syr2k.h
// begin bla_trmm.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// trmm: a is read-only, b is the writable operand.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
    const f77_char* side, \
    const f77_char* uploa, \
    const f77_char* transa, \
    const f77_char* diaga, \
    const f77_int*  m, \
    const f77_int*  n, \
    const ftype*    alpha, \
    const ftype*    a, \
    const f77_int*  lda, \
          ftype*    b, \
    const f77_int*  ldb \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmm )
#endif
// end bla_trmm.h
// begin bla_trsm.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// trsm: same parameter list as the trmm prototype.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
    const f77_char* side, \
    const f77_char* uploa, \
    const f77_char* transa, \
    const f77_char* diaga, \
    const f77_int*  m, \
    const f77_int*  n, \
    const ftype*    alpha, \
    const ftype*    a, \
    const f77_int*  lda, \
          ftype*    b, \
    const f77_int*  ldb \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsm )
#endif
// end bla_trsm.h

// begin bla_gemm_check.h

#ifdef BLIS_ENABLE_BLAS

// Parameter check for gemm. The first failing test selects a nonzero info
// code; the routine name is then formatted from dt_str and op_str, passed
// through bli_string_mkupper(), reported via xerbla, and the *enclosing*
// function returns. Must therefore be expanded inside a void function.
// nrowa/nrowb are the minimum leading dimensions implied by transa/transb.
#define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
    f77_int info  = 0; \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( nota ? *m : *k ); \
    f77_int nrowb = ( notb ? *k : *n ); \
\
    if      ( !nota && !conja && !ta )     info = 1; \
    else if ( !notb && !conjb && !tb )     info = 2; \
    else if ( *m < 0 )                     info = 3; \
    else if ( *n < 0 )                     info = 4; \
    else if ( *k < 0 )                     info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) info = 10; \
    else if ( *ldc < bli_max( 1, *m ) )    info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemm_check.h
// begin bla_hemm_check.h

#ifdef BLIS_ENABLE_BLAS

// Parameter check for hemm (also reused for symm via bla_symm_check).
// nrowa is *m when sidea matches "L" and *n otherwise. On failure xerbla is
// called and the enclosing function returns.
#define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \
{ \
    f77_int info  = 0; \
    f77_int left  = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( left ? *m : *n ); \
\
    if      ( !left && !right )            info = 1; \
    else if ( !lower && !upper )           info = 2; \
    else if ( *m < 0 )                     info = 3; \
    else if ( *n < 0 )                     info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) info = 7; \
    else if ( *ldb < bli_max( 1, *m ) )    info = 9; \
    else if ( *ldc < bli_max( 1, *m ) )    info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_hemm_check.h
// begin bla_herk_check.h

#ifdef BLIS_ENABLE_BLAS

// Parameter check for herk. transa must match "N" or "C" under lsame.
// The uplo tests read the macro's own uploa parameter; an earlier revision
// referenced an identifier named uploc, which was not a macro parameter and
// only resolved when the calling function happened to have such a variable.
// On failure xerbla is called and the enclosing function returns.
#define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    f77_int info  = 0; \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( nota ? *m : *k ); \
\
    if      ( !lower && !upper )           info = 1; \
    else if ( !nota && !conja )            info = 2; \
    else if ( *m < 0 )                     info = 3; \
    else if ( *k < 0 )                     info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) info = 7; \
    else if ( *ldc < bli_max( 1, *m ) )    info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_herk_check.h
// begin bla_her2k_check.h

#ifdef BLIS_ENABLE_BLAS

// Parameter check for her2k. trans must match "N" or "C" under lsame; lda
// and ldb are both tested against nrowa. On failure xerbla is called and the
// enclosing function returns.
#define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int info  = 0; \
    f77_int nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( nota ? *m : *k ); \
\
    if      ( !lower && !upper )           info = 1; \
    else if ( !nota && !conja )            info = 2; \
    else if ( *m < 0 )                     info = 3; \
    else if ( *k < 0 )                     info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) info = 7; \
    else if ( *ldb < bli_max( 1, nrowa ) ) info = 9; \
    else if ( *ldc < bli_max( 1, *m ) )    info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her2k_check.h
// begin bla_symm_check.h

#ifdef BLIS_ENABLE_BLAS

// symm shares the hemm parameter check.
#define bla_symm_check bla_hemm_check

#endif
// end bla_symm_check.h
// begin bla_syrk_check.h

#ifdef BLIS_ENABLE_BLAS

// Parameter check for syrk. is_r is set when dt_str starts with 's' or 'd';
// only then is a transa matching "C" accepted in addition to "N" and "T".
// The uplo tests read the macro's own uploa parameter (previously an
// out-of-scope identifier named uploc was referenced). dt_cst is a pointer to
// const because dt_str is expected to be a string literal (the static
// initializer requires a constant expression).
// On failure xerbla is called and the enclosing function returns.
#define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    static const char* dt_cst = dt_str; \
\
    f77_int info  = 0; \
    f77_int is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int cta   = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( nota ? *m : *k ); \
\
    if      ( !lower && !upper )                    info = 1; \
    else if ( !nota && !ta && (is_r ? !cta : 1) )   info = 2; \
    else if ( *m < 0 )                              info = 3; \
    else if ( *k < 0 )                              info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) )          info = 7; \
    else if ( *ldc < bli_max( 1, *m ) )             info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_syrk_check.h
// begin bla_syr2k_check.h

#ifdef BLIS_ENABLE_BLAS

// Parameter check for syr2k. Same trans rule as bla_syrk_check (a "C" match
// is accepted only when dt_str starts with 's' or 'd'); lda and ldb are both
// tested against nrowa. On failure xerbla is called and the enclosing
// function returns.
#define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    static const char* dt_cst = dt_str; \
\
    f77_int info  = 0; \
    f77_int is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    f77_int nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int cta   = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( nota ? *m : *k ); \
\
    if      ( !lower && !upper )                    info = 1; \
    else if ( !nota && !ta && (is_r ? !cta : 1) )   info = 2; \
    else if ( *m < 0 )                              info = 3; \
    else if ( *k < 0 )                              info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) )          info = 7; \
    else if ( *ldb < bli_max( 1, nrowa ) )          info = 9; \
    else if ( *ldc < bli_max( 1, *m ) )             info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_syr2k_check.h
// begin bla_trmm_check.h

#ifdef BLIS_ENABLE_BLAS

// Parameter check for trmm (also reused for trsm via bla_trsm_check).
// nrowa is *m when sidea matches "L" and *n otherwise. On failure xerbla is
// called and the enclosing function returns.
#define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \
{ \
    f77_int info  = 0; \
    f77_int left  = PASTEF770(lsame)( sidea,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int right = PASTEF770(lsame)( sidea,  "R", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( left ? *m : *n ); \
\
    if      ( !left && !right )            info = 1; \
    else if ( !lower && !upper )           info = 2; \
    else if ( !nota && !ta && !conja )     info = 3; \
    else if ( !unita && !nonua )           info = 4; \
    else if ( *m < 0 )                     info = 5; \
    else if ( *n < 0 )                     info = 6; \
    else if ( *lda < bli_max( 1, nrowa ) ) info = 9; \
    else if ( *ldb < bli_max( 1, *m ) )    info = 11; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_trmm_check.h
// begin bla_trsm_check.h

#ifdef BLIS_ENABLE_BLAS

// trsm shares the trmm parameter check.
#define bla_trsm_check bla_trmm_check

#endif
// end bla_trsm_check.h

// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// axpby: y is the only writable operand.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
    const f77_int* n, \
    const ftype*   alpha, \
    const ftype*   x, \
    const f77_int* incx, \
    const ftype*   beta, \
          ftype*   y, \
    const f77_int* incy \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif
// end bla_axpby.h

// level-3
// begin bla_gemmt.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// gemmt: takes uploc plus m and k (there is no separate n argument).
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
    const f77_char* uploc, \
    const f77_char* transa, \
    const f77_char* transb, \
    const f77_int*  m, \
    const f77_int*  k, \
    const ftype*    alpha, \
    const ftype*    a, \
    const f77_int*  lda, \
    const ftype*    b, \
    const f77_int*  ldb, \
    const ftype*    beta, \
          ftype*    c, \
    const f77_int*  ldc \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif
// end bla_gemmt.h
// begin bla_gemmt_check.h

#ifdef BLIS_ENABLE_BLAS

// Parameter check for gemmt. nrowb falls back to *m (not a separate n) when
// transb does not match "N". On failure xerbla is called and the enclosing
// function returns.
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
    f77_int info  = 0; \
    f77_int nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    f77_int conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    f77_int ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
    f77_int lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
    f77_int upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
    f77_int nrowa = ( nota ? *m : *k ); \
    f77_int nrowb = ( notb ? *k : *m ); \
\
    if      ( !lower && !upper )           info = 1; \
    else if ( !nota && !conja && !ta )     info = 2; \
    else if ( !notb && !conjb && !tb )     info = 3; \
    else if ( *m < 0 )                     info = 4; \
    else if ( *k < 0 )                     info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) info = 10; \
    else if ( *ldc < bli_max( 1, *m ) )    info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
        bli_string_mkupper( func_str ); \
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// gemm_batch: every gemm argument becomes an array argument (a, b and c are
// pointer-to-pointer), followed by group_count and group_size.
#undef GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
( \
    const f77_char* transa_array, \
    const f77_char* transb_array, \
    const f77_int*  m_array, \
    const f77_int*  n_array, \
    const f77_int*  k_array, \
    const ftype*    alpha_array, \
    const ftype**   a_array, \
    const f77_int*  lda_array, \
    const ftype**   b_array, \
    const f77_int*  ldb_array, \
    const ftype*    beta_array, \
          ftype**   c_array, \
    const f77_int*  ldc_array, \
    const f77_int*  group_count, \
    const f77_int*  group_size \
);

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif
// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// GENTPROTCO is the prototype template for the gemm3m routines: each
// expansion declares one exported routine named PASTEF77(ch,blasname).
//
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  n, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( gemm3m )
#endif
// end bla_gemm3m.h
// begin bla_gemm3m_check.h

#ifdef BLIS_ENABLE_BLAS

//
// Argument check for gemm3m. This is a statement macro that expands inside
// the calling routine: on the first invalid argument it sets 'info' to that
// argument's 1-based position in the gemm3m prototype above, hands the
// upper-cased routine name and 'info' to xerbla, and then executes 'return;'
// in the *enclosing* function. When every argument is valid it does nothing.
//
#define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota,  notb; \
    f77_int conja, conjb; \
    f77_int ta,    tb; \
    f77_int nrowa, nrowb; \
\
    /* Classify the transposition characters. */ \
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    /* Row counts used as lower bounds for lda and ldb. */ \
    nrowa = ( nota ? *m : *k ); \
    nrowb = ( notb ? *k : *n ); \
\
    /* First failing test wins. */ \
    if      ( !nota && !conja && !ta )     info = 1; \
    else if ( !notb && !conjb && !tb )     info = 2; \
    else if ( *m < 0 )                     info = 3; \
    else if ( *n < 0 )                     info = 4; \
    else if ( *k < 0 )                     info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) info = 10; \
    else if ( *ldc < bli_max( 1, *m ) )    info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        /* Build "<dt><op>", left-justifying op in a 5-character field. */ \
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemm3m_check.h

// -- Fortran-compatible APIs to BLIS functions --
// begin b77_thread.h

//
// Prototype Fortran-compatible BLIS interfaces.
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); // end b77_thread.h #endif // BLIS_ENABLE_BLAS // end bli_blas.h // -- CBLAS compatibility layer -- // begin bli_cblas.h #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. // begin cblas.h #ifndef CBLAS_H #define CBLAS_H #include // skipped // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. 
// Exactly one of BLIS_ARCH_64 / BLIS_ARCH_32 is defined.
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
    defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64)
#define BLIS_ARCH_64
#else
#define BLIS_ARCH_32
#endif

// Determine the target operating system.
// NOTE: Most BLIS_OS_* macros are defined to 1 so that they can be tested
// with a plain '#if BLIS_OS_xxx'. BLIS_OS_EMSCRIPTEN and BLIS_OS_NONE are
// defined without a value and can therefore only be tested with #ifdef /
// defined().
#if defined(BLIS_ENABLE_SYSTEM)
#if defined(_WIN32) || defined(__CYGWIN__)
#define BLIS_OS_WINDOWS 1
#elif defined(__gnu_hurd__)
#define BLIS_OS_GNU 1
#elif defined(__APPLE__) || defined(__MACH__)
#define BLIS_OS_OSX 1
#elif defined(__ANDROID__)
#define BLIS_OS_ANDROID 1
#elif defined(__linux__)
#define BLIS_OS_LINUX 1
#elif defined(__bgq__)
#define BLIS_OS_BGQ 1
#elif defined(__bg__)
#define BLIS_OS_BGP 1
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
      defined(__bsdi__) || defined(__DragonFly__) || \
      defined(__FreeBSD_kernel__) || defined(__HAIKU__)
#define BLIS_OS_BSD 1
#elif defined(EMSCRIPTEN)
#define BLIS_OS_EMSCRIPTEN
#else
#error "Cannot determine operating system"
#endif
#else // #if defined(BLIS_DISABLE_SYSTEM)
#define BLIS_OS_NONE
#endif

// A few changes that may be necessary in Windows environments.
#if BLIS_OS_WINDOWS

// Include Windows header file.
#define WIN32_LEAN_AND_MEAN
#define VC_EXTRALEAN
// NOTE(review): the header name is missing after '#include' below (apparently
// lost in extraction); restore it from the original header before compiling.
#include // skipped

#if !defined(__clang__) && !defined(__GNUC__)
// Undefine attribute specifiers in Windows.
// (Any __attribute__(...) annotation expands to nothing.)
#define __attribute__(x)

// Undefine restrict.
#define restrict
#endif

#endif

// time.h provides clock_gettime().
// NOTE(review): the header names are missing after each '#include' below
// (apparently lost in extraction); restore them from the original header
// before compiling.
#if BLIS_OS_WINDOWS
#include // skipped
#elif BLIS_OS_OSX
#include // skipped
#else
//#include
#include // skipped
#endif

#endif
// end bli_system.h
// begin bli_config.h

#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H

// NOTE(review): the literal '#if 0' / '#if 1' / '#if 64 == 64' conditions in
// this header look like values substituted into a template when the library
// was configured -- presumably by the configure step; confirm before editing
// them by hand.

// Enabled configuration "family" (config_name)
#define BLIS_FAMILY_GENERIC

// Enabled sub-configurations (config_list)
#define BLIS_CONFIG_GENERIC

// Enabled kernel sets (kernel_list)
#define BLIS_KERNELS_GENERIC

#if 1
#define BLIS_ENABLE_SYSTEM
#else
#define BLIS_DISABLE_SYSTEM
#endif

#if 0
#define BLIS_ENABLE_OPENMP
#endif

#if 0
#define BLIS_ENABLE_PTHREADS
#endif

#if 1
#define BLIS_ENABLE_JRIR_SLAB
#endif

#if 0
#define BLIS_ENABLE_JRIR_RR
#endif

#if 1
#define BLIS_ENABLE_PBA_POOLS
#else
#define BLIS_DISABLE_PBA_POOLS
#endif

#if 1
#define BLIS_ENABLE_SBA_POOLS
#else
#define BLIS_DISABLE_SBA_POOLS
#endif

#if 0
#define BLIS_ENABLE_MEM_TRACING
#else
#define BLIS_DISABLE_MEM_TRACING
#endif

// BLIS-internal integer size: 64 in this configuration.
#if 64 == 64
#define BLIS_INT_TYPE_SIZE 64
#elif 64 == 32
#define BLIS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// BLAS-layer integer size: 32 in this configuration.
#if 32 == 64
#define BLIS_BLAS_INT_TYPE_SIZE 64
#elif 32 == 32
#define BLIS_BLAS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// The nested #ifndef guards below let a definition of either the ENABLE or
// the DISABLE macro made before this point take precedence over the
// configured default.
#ifndef BLIS_ENABLE_BLAS
#ifndef BLIS_DISABLE_BLAS
#if 0
#define BLIS_ENABLE_BLAS
#else
#define BLIS_DISABLE_BLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_CBLAS
#ifndef BLIS_DISABLE_CBLAS
#if 0
#define BLIS_ENABLE_CBLAS
#else
#define BLIS_DISABLE_CBLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_MIXED_DT
#ifndef BLIS_DISABLE_MIXED_DT
#if 1
#define BLIS_ENABLE_MIXED_DT
#else
#define BLIS_DISABLE_MIXED_DT
#endif
#endif
#endif

#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#if 1
#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#else
#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#endif
#endif
#endif

#if 1
#define BLIS_ENABLE_SUP_HANDLING
#else
#define BLIS_DISABLE_SUP_HANDLING
#endif

#if 0
#define BLIS_ENABLE_MEMKIND
#else
#define BLIS_DISABLE_MEMKIND
#endif

#if 1
#define BLIS_ENABLE_TRSM_PREINVERSION
#else
#define BLIS_DISABLE_TRSM_PREINVERSION
#endif

#if 1
#define BLIS_ENABLE_PRAGMA_OMP_SIMD
#else
#define BLIS_DISABLE_PRAGMA_OMP_SIMD
#endif

#if 0
#define BLIS_ENABLE_SANDBOX
#else
#define BLIS_DISABLE_SANDBOX
#endif

#if 0
#define BLIS_ENABLE_SHARED
#else
#define BLIS_DISABLE_SHARED
#endif

#if 0
#define BLIS_ENABLE_COMPLEX_RETURN_INTEL
#else
#define BLIS_DISABLE_COMPLEX_RETURN_INTEL
#endif

#endif
// end bli_config.h
// begin bli_config_macro_defs.h

#ifndef BLIS_CONFIG_MACRO_DEFS_H
#define BLIS_CONFIG_MACRO_DEFS_H

// -- INTEGER PROPERTIES -------------------------------------------------------

// The bit size of the integer type used to track values such as dimensions,
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
// integers while 64 results in 64-bit integers. Any other value results in use
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
#ifndef BLIS_INT_TYPE_SIZE
#ifdef BLIS_ARCH_64
#define BLIS_INT_TYPE_SIZE 64
#else
#define BLIS_INT_TYPE_SIZE 32
#endif
#endif

// -- FLOATING-POINT PROPERTIES ------------------------------------------------

// Enable use of built-in C99 "float complex" and "double complex" types and
// associated overloaded operations and functions? Disabling results in
// scomplex and dcomplex being defined in terms of simple structs.
// NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN.
#ifdef BLIS_ENABLE_C99_COMPLEX
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif

// -- MULTITHREADING -----------------------------------------------------------

// Enable multithreading via POSIX threads.
#ifdef BLIS_ENABLE_PTHREADS
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif

// Enable multithreading via OpenMP.
#ifdef BLIS_ENABLE_OPENMP
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif

// Perform a sanity check to make sure the user doesn't try to enable
// both OpenMP and pthreads.
#if defined ( BLIS_ENABLE_OPENMP ) && \
    defined ( BLIS_ENABLE_PTHREADS )
#error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined."
#endif

// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP
// or pthreads are enabled. This macro is useful in situations when
// we want to detect use of either OpenMP or pthreads (as opposed
// to neither being used).
#if defined ( BLIS_ENABLE_OPENMP ) || \
    defined ( BLIS_ENABLE_PTHREADS )
#define BLIS_ENABLE_MULTITHREADING
#endif

// Enable the use of prime numbers of threads when requesting automatic thread
// factorization. When disabled, requesting a prime number of threads will
// result in a reduction (by one) of the number of threads, provided that the
// prime number exceeds a minimum threshold (see below).
#ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS
#undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#else
// Default behavior is disabled.
#undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled.
#define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#endif

// Set the maximum requested number of threads that BLIS will accept from the
// user that may be prime. If a larger prime number of threads is requested,
// it will be reduced by one to allow for more efficient thread factorizations.
// This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined.
#ifndef BLIS_NT_MAX_PRIME
#define BLIS_NT_MAX_PRIME 11
#endif

// -- MIXED DATATYPE SUPPORT ---------------------------------------------------

// Enable mixed datatype support?
#ifdef BLIS_DISABLE_MIXED_DT
#undef BLIS_ENABLE_GEMM_MD
#else
// Default behavior is enabled.
#define BLIS_ENABLE_GEMM_MD
#endif

// Enable memory-intensive optimizations for mixed datatype support?
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#else
// Default behavior is enabled.
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#endif

// -- MISCELLANEOUS OPTIONS ----------------------------------------------------

// Do NOT require the cross-blocksize constraints. That is, do not enforce
// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY
// needed when implementing trsm_r by allowing the right-hand matrix B to
// be triangular.
#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#endif

// -- BLAS COMPATIBILITY LAYER -------------------------------------------------

// Enable the BLAS compatibility layer?
// BLIS_DISABLE_BLAS wins: when it is defined, BLIS_ENABLE_BLAS is removed
// even if it was defined earlier; otherwise BLIS_ENABLE_BLAS is (re)defined.
#ifdef BLIS_DISABLE_BLAS
#undef BLIS_ENABLE_BLAS
#else
// Default behavior is enabled.
#undef BLIS_ENABLE_BLAS // In case user explicitly enabled.
#define BLIS_ENABLE_BLAS
#endif

// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
#ifndef BLIS_BLAS_INT_TYPE_SIZE
#define BLIS_BLAS_INT_TYPE_SIZE 32
#endif

// By default, the level-3 BLAS routines are implemented by directly calling
// the BLIS object API. Alternatively, they may first call the typed BLIS
// API, which will then call the object API.
//#define BLIS_BLAS3_CALLS_TAPI

// Exactly one of BLIS_BLAS3_CALLS_TAPI / BLIS_BLAS3_CALLS_OAPI remains
// defined after this block.
#ifdef BLIS_BLAS3_CALLS_TAPI
#undef BLIS_BLAS3_CALLS_OAPI
#else
// Default behavior is to call object API directly.
#undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled.
#define BLIS_BLAS3_CALLS_OAPI
#endif

// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------

// Enable the CBLAS compatibility layer?
// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specifed by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. 
#ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overriden by the main // program simply by providing a new definition. However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. 
#include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. // NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. #ifndef TRUE #define TRUE true #endif #ifndef FALSE #define FALSE false #endif // -- Special-purpose integers -- // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. 
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
#define BLIS_SIZEOF_S      4  // sizeof(float)
#define BLIS_SIZEOF_D      8  // sizeof(double)
#define BLIS_SIZEOF_C      8  // sizeof(scomplex)
#define BLIS_SIZEOF_Z      16 // sizeof(dcomplex)

// -- Complex types --

#ifdef BLIS_ENABLE_C99_COMPLEX

#if __STDC_VERSION__ >= 199901L
// NOTE(review): the header name is missing after '#include' below (apparently
// lost in extraction); restore it from the original header before compiling.
#include // skipped

// Typedef official complex types to BLIS complex type names.
typedef  float complex scomplex;
typedef double complex dcomplex;
#else
#error "Configuration requested C99 complex types, but C99 does not appear to be supported."
#endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_SCOMPLEX
#define _DEFINED_SCOMPLEX
// Single-precision complex number stored as a (real, imag) pair of floats.
typedef struct scomplex
{
    float  real;
    float  imag;
} scomplex;
#endif

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DCOMPLEX
#define _DEFINED_DCOMPLEX
// Double-precision complex number stored as a (real, imag) pair of doubles.
typedef struct dcomplex
{
    double real;
    double imag;
} dcomplex;
#endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any value of BLIS_BLAS_INT_TYPE_SIZE other than 32 or 64 selects long int.
#if   BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t   f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t   f77_int;
#else
typedef long int  f77_int;
#endif

typedef char      f77_char;
typedef float     f77_float;
typedef double    f77_double;
typedef scomplex  f77_scomplex;
typedef dcomplex  f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
// (The true function-pointer form is kept below, commented out; the active
// definition is a plain void*.)
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
typedef void* (*malloc_ft)( size_t size );
typedef void  (*free_ft)  ( void*  p    );

//
// -- BLIS info bit field offsets ----------------------------------------------
//
// Each *_SHIFT is the position of the least-significant bit of a field. Several
// fields deliberately share an offset: a multi-bit field and the first
// single-bit sub-field inside it start at the same bit.
//

// info
#define BLIS_DATATYPE_SHIFT                0
#define   BLIS_DOMAIN_SHIFT                0
#define   BLIS_PRECISION_SHIFT             1
#define BLIS_CONJTRANS_SHIFT               3
#define   BLIS_TRANS_SHIFT                 3
#define   BLIS_CONJ_SHIFT                  4
#define BLIS_UPLO_SHIFT                    5
#define   BLIS_UPPER_SHIFT                 5
#define   BLIS_DIAG_SHIFT                  6
#define   BLIS_LOWER_SHIFT                 7
#define BLIS_UNIT_DIAG_SHIFT               8
#define BLIS_INVERT_DIAG_SHIFT             9
#define BLIS_TARGET_DT_SHIFT               10
#define   BLIS_TARGET_DOMAIN_SHIFT         10
#define   BLIS_TARGET_PREC_SHIFT           11
#define BLIS_EXEC_DT_SHIFT                 13
#define   BLIS_EXEC_DOMAIN_SHIFT           13
#define   BLIS_EXEC_PREC_SHIFT             14
#define BLIS_PACK_SCHEMA_SHIFT             16
#define   BLIS_PACK_RC_SHIFT               16
#define   BLIS_PACK_PANEL_SHIFT            17
#define   BLIS_PACK_FORMAT_SHIFT           18
#define   BLIS_PACK_SHIFT                  22
#define BLIS_PACK_REV_IF_UPPER_SHIFT       23
#define BLIS_PACK_REV_IF_LOWER_SHIFT       24
#define BLIS_PACK_BUFFER_SHIFT             25
#define BLIS_STRUC_SHIFT                   27
#define BLIS_COMP_DT_SHIFT                 29
#define   BLIS_COMP_DOMAIN_SHIFT           29
#define   BLIS_COMP_PREC_SHIFT             30

// info2
#define BLIS_SCALAR_DT_SHIFT               0
#define   BLIS_SCALAR_DOMAIN_SHIFT         0
#define   BLIS_SCALAR_PREC_SHIFT           1

//
// -- BLIS info bit field masks ------------------------------------------------
//
// Each mask is the field's all-ones pattern moved to the field's offset.
//
// NOTE(review): BLIS_COMP_DT_BITS shifts the int constant 0x7 left by 29,
// which does not fit in a 32-bit signed int. Consider whether an unsigned
// constant is wanted here -- confirm against the code that uses this mask.
//

// info
#define BLIS_DATATYPE_BITS                 ( 0x7  << BLIS_DATATYPE_SHIFT )
#define   BLIS_DOMAIN_BIT                  ( 0x1  << BLIS_DOMAIN_SHIFT )
#define   BLIS_PRECISION_BIT               ( 0x1  << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS                ( 0x3  << BLIS_CONJTRANS_SHIFT )
#define   BLIS_TRANS_BIT                   ( 0x1  << BLIS_TRANS_SHIFT )
#define   BLIS_CONJ_BIT                    ( 0x1  << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS                     ( 0x7  << BLIS_UPLO_SHIFT )
#define   BLIS_UPPER_BIT                   ( 0x1  << BLIS_UPPER_SHIFT )
#define   BLIS_DIAG_BIT                    ( 0x1  << BLIS_DIAG_SHIFT )
#define   BLIS_LOWER_BIT                   ( 0x1  << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT                 ( 0x1  << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT               ( 0x1  << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS                ( 0x7  << BLIS_TARGET_DT_SHIFT )
#define   BLIS_TARGET_DOMAIN_BIT           ( 0x1  << BLIS_TARGET_DOMAIN_SHIFT )
#define   BLIS_TARGET_PREC_BIT             ( 0x1  << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS                  ( 0x7  << BLIS_EXEC_DT_SHIFT )
#define   BLIS_EXEC_DOMAIN_BIT             ( 0x1  << BLIS_EXEC_DOMAIN_SHIFT )
#define   BLIS_EXEC_PREC_BIT               ( 0x1  << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS              ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define   BLIS_PACK_RC_BIT                 ( 0x1  << BLIS_PACK_RC_SHIFT )
#define   BLIS_PACK_PANEL_BIT              ( 0x1  << BLIS_PACK_PANEL_SHIFT )
#define   BLIS_PACK_FORMAT_BITS            ( 0xF  << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_PACK_BIT                    ( 0x1  << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT         ( 0x1  << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT         ( 0x1  << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS              ( 0x3  << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS                    ( 0x3  << BLIS_STRUC_SHIFT )
#define BLIS_COMP_DT_BITS                  ( 0x7  << BLIS_COMP_DT_SHIFT )
#define   BLIS_COMP_DOMAIN_BIT             ( 0x1  << BLIS_COMP_DOMAIN_SHIFT )
#define   BLIS_COMP_PREC_BIT               ( 0x1  << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS                ( 0x7  << BLIS_SCALAR_DT_SHIFT )
#define   BLIS_SCALAR_DOMAIN_BIT           ( 0x1  << BLIS_SCALAR_DOMAIN_SHIFT )
#define   BLIS_SCALAR_PREC_BIT             ( 0x1  << BLIS_SCALAR_PREC_SHIFT )

//
// -- BLIS enumerated type value definitions -----------------------------------
//
// Named bit patterns, built from the masks above, that the enums below use as
// their enumerator values.
//

#define BLIS_BITVAL_REAL                      0x0
#define BLIS_BITVAL_COMPLEX                   BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC               0x0
#define BLIS_BITVAL_DOUBLE_PREC               BLIS_PRECISION_BIT
#define   BLIS_BITVAL_FLOAT_TYPE              0x0
#define   BLIS_BITVAL_SCOMPLEX_TYPE           BLIS_DOMAIN_BIT
#define   BLIS_BITVAL_DOUBLE_TYPE             BLIS_PRECISION_BIT
#define   BLIS_BITVAL_DCOMPLEX_TYPE         ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define   BLIS_BITVAL_INT_TYPE                0x04
#define   BLIS_BITVAL_CONST_TYPE              0x05
#define BLIS_BITVAL_NO_TRANS                  0x0
#define BLIS_BITVAL_TRANS                     BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ                   0x0
#define BLIS_BITVAL_CONJ                      BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS              ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
#define BLIS_BITVAL_ZEROS                     0x0
#define BLIS_BITVAL_UPPER                   ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER                   ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE                     BLIS_UPLO_BITS
#define BLIS_BITVAL_NONUNIT_DIAG              0x0
#define BLIS_BITVAL_UNIT_DIAG                 BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG               BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED                0x0
#define   BLIS_BITVAL_1E                    ( 0x1 << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_1R                    ( 0x2 << BLIS_PACK_FORMAT_SHIFT )
#define   BLIS_BITVAL_PACKED_UNSPEC         ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_ROWS           ( BLIS_PACK_BIT )
#define   BLIS_BITVAL_PACKED_COLUMNS        ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS     ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1E  ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define   BLIS_BITVAL_PACKED_ROW_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT )
#define   BLIS_BITVAL_PACKED_COL_PANELS_1R  ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER         0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER         BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER         0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER         BLIS_PACK_REV_IF_LOWER_BIT
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK        0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL      ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL      ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE      ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_GENERAL                   0x0
#define BLIS_BITVAL_HERMITIAN               ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC               ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR              ( 0x3 << BLIS_STRUC_SHIFT )

//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

typedef enum
{
    BLIS_NO_TRANSPOSE      = 0x0,
    BLIS_TRANSPOSE         = BLIS_BITVAL_TRANS,
    BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
    BLIS_CONJ_TRANSPOSE    = BLIS_BITVAL_CONJ_TRANS
} trans_t;

typedef enum
{
    BLIS_NO_CONJUGATE      = 0x0,
    BLIS_CONJUGATE         = BLIS_BITVAL_CONJ
} conj_t;

typedef enum
{
    BLIS_ZEROS             = BLIS_BITVAL_ZEROS,
    BLIS_LOWER             = BLIS_BITVAL_LOWER,
    BLIS_UPPER             = BLIS_BITVAL_UPPER,
    BLIS_DENSE             = BLIS_BITVAL_DENSE
} uplo_t;

typedef enum
{
    BLIS_LEFT              = 0x0,
    BLIS_RIGHT
} side_t;

typedef enum
{
    BLIS_NONUNIT_DIAG      = 0x0,
    BLIS_UNIT_DIAG         = BLIS_BITVAL_UNIT_DIAG
} diag_t;

typedef enum
{
    BLIS_NO_INVERT_DIAG    = 0x0,
    BLIS_INVERT_DIAG       = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

typedef enum
{
    BLIS_GENERAL           = BLIS_BITVAL_GENERAL,
    BLIS_HERMITIAN         = BLIS_BITVAL_HERMITIAN,
    BLIS_SYMMETRIC         = BLIS_BITVAL_SYMMETRIC,
    BLIS_TRIANGULAR        = BLIS_BITVAL_TRIANGULAR
} struc_t;

// -- Data type --

// BLIS_DT_LO and BLIS_DT_HI alias the first and last of the four
// floating-point enumerators.
typedef enum
{
    BLIS_FLOAT             = BLIS_BITVAL_FLOAT_TYPE,
    BLIS_DOUBLE            = BLIS_BITVAL_DOUBLE_TYPE,
    BLIS_SCOMPLEX          = BLIS_BITVAL_SCOMPLEX_TYPE,
    BLIS_DCOMPLEX          = BLIS_BITVAL_DCOMPLEX_TYPE,
    BLIS_INT               = BLIS_BITVAL_INT_TYPE,
    BLIS_CONSTANT          = BLIS_BITVAL_CONST_TYPE,
    BLIS_DT_LO             = BLIS_FLOAT,
    BLIS_DT_HI             = BLIS_DCOMPLEX
} num_t;

typedef enum
{
    BLIS_REAL              = BLIS_BITVAL_REAL,
    BLIS_COMPLEX           = BLIS_BITVAL_COMPLEX
} dom_t;

typedef enum
{
    BLIS_SINGLE_PREC       = BLIS_BITVAL_SINGLE_PREC,
    BLIS_DOUBLE_PREC       = BLIS_BITVAL_DOUBLE_PREC
} prec_t;

// -- Pack schema type --

// NOTE: BLIS_PACKED_UNSPEC, BLIS_PACKED_VECTOR and BLIS_PACKED_ROWS all have
// the same numeric value (BLIS_PACK_BIT alone).
typedef enum
{
    BLIS_NOT_PACKED            = BLIS_BITVAL_NOT_PACKED,
    BLIS_PACKED_UNSPEC         = BLIS_BITVAL_PACKED_UNSPEC,
    BLIS_PACKED_VECTOR         = BLIS_BITVAL_PACKED_UNSPEC,
    BLIS_PACKED_ROWS           = BLIS_BITVAL_PACKED_ROWS,
    BLIS_PACKED_COLUMNS        = BLIS_BITVAL_PACKED_COLUMNS,
    BLIS_PACKED_ROW_PANELS     = BLIS_BITVAL_PACKED_ROW_PANELS,
    BLIS_PACKED_COL_PANELS     = BLIS_BITVAL_PACKED_COL_PANELS,
    BLIS_PACKED_ROW_PANELS_1E  = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
    BLIS_PACKED_COL_PANELS_1E  = BLIS_BITVAL_PACKED_COL_PANELS_1E,
    BLIS_PACKED_ROW_PANELS_1R  = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
    BLIS_PACKED_COL_PANELS_1R  = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3

// -- Pack order type --

typedef enum
{
    BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
    BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,

    BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
    BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;

// -- Pack buffer type --

typedef enum
{
    BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
    BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
    BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
    BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;

// -- Partitioning direction --

typedef enum
{
    BLIS_FWD,
    BLIS_BWD
} dir_t;

// -- Subpartition type --

typedef enum
{
    BLIS_SUBPART0,
    BLIS_SUBPART1,
    BLIS_SUBPART2,
    BLIS_SUBPART1AND0,
    BLIS_SUBPART1AND2,
    BLIS_SUBPART1A,
    BLIS_SUBPART1B,
    BLIS_SUBPART00,
    BLIS_SUBPART10,
    BLIS_SUBPART20,
    BLIS_SUBPART01,
    BLIS_SUBPART11,
    BLIS_SUBPART21,
    BLIS_SUBPART02,
    BLIS_SUBPART12,
    BLIS_SUBPART22
} subpart_t;

// -- Matrix dimension type --

typedef enum
{
    BLIS_M = 0,
    BLIS_N = 1
} mdim_t;

// -- Machine parameter types --

typedef enum
{
    BLIS_MACH_EPS = 0,
    BLIS_MACH_SFMIN,
    BLIS_MACH_BASE,
    BLIS_MACH_PREC,
    BLIS_MACH_NDIGMANT,
    BLIS_MACH_RND,
    BLIS_MACH_EMIN,
    BLIS_MACH_RMIN,
    BLIS_MACH_EMAX,
    BLIS_MACH_RMAX,
    BLIS_MACH_EPS2
} machval_t;

#define BLIS_NUM_MACH_PARAMS   11
#define BLIS_MACH_PARAM_FIRST  BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST   BLIS_MACH_EPS2

// -- Induced method types --

// BLIS_IND_FIRST and BLIS_IND_LAST alias the first and last methods.
typedef enum
{
    BLIS_1M        = 0,
    BLIS_NAT,
    BLIS_IND_FIRST = 0,
    BLIS_IND_LAST  = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, 
BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // 
- keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. } bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. 
// Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! 
// -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: 
Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. 
//num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. typedef struct constdata_s { float s; double d; scomplex c; dcomplex z; gint_t i; } constdata_t; // // -- BLIS object type definitions --------------------------------------------- // // Forward declarations for function pointer types struct obj_s; struct cntx_s; struct rntm_s; struct thrinfo_s; typedef void (*obj_pack_fn_t) ( struct obj_s* a, struct obj_s* ap, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) ( struct obj_s* a, struct obj_s* b, struct obj_s* c, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef struct obj_s { // Basic fields struct obj_s* root; dim_t off[2]; dim_t dim[2]; doff_t diag_off; objbits_t info; objbits_t info2; siz_t elem_size; void* buffer; inc_t rs; inc_t cs; inc_t is; // Bufferless scalar storage atom_t scalar; // Pack-related fields dim_t m_padded; // m dimension of matrix, including any padding dim_t n_padded; // n dimension of matrix, including any padding inc_t ps; // panel stride (distance to next panel) inc_t pd; // panel dimension (the "width" of a panel: // usually MR or NR) dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel // User-customizable fields obj_pack_fn_t pack_fn; void* pack_params; obj_ker_fn_t ker_fn; void* ker_params; } obj_t; // Pre-initializors. 
// Things that must be set afterwards:
// - root object pointer
// - info bitfields: dt, target_dt, exec_dt, comp_dt
// - info2 bitfields: scalar_dt
// - elem_size
// - dims, strides
// - buffer
// - internal scalar buffer (must always set imaginary component)

// Static initializer for an obj_t with dim = { 0, 0 }.
#define BLIS_OBJECT_INITIALIZER \
{ \
    .root        = NULL, \
\
    .off         = { 0, 0 }, \
    .dim         = { 0, 0 }, \
    .diag_off    = 0, \
\
    .info        = 0x0 | BLIS_BITVAL_DENSE | \
                         BLIS_BITVAL_GENERAL, \
    .info2       = 0x0, \
    .elem_size   = sizeof( float ), \
\
    .buffer      = NULL, \
    .rs          = 0, \
    .cs          = 0, \
    .is          = 1, \
\
    .scalar      = { 0.0, 0.0 }, \
\
    .m_padded    = 0, \
    .n_padded    = 0, \
    .ps          = 0, \
    .pd          = 0, \
    .m_panel     = 0, \
    .n_panel     = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

// Same as BLIS_OBJECT_INITIALIZER except that dim = { 1, 1 }.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root        = NULL, \
\
    .off         = { 0, 0 }, \
    .dim         = { 1, 1 }, \
    .diag_off    = 0, \
\
    .info        = 0x0 | BLIS_BITVAL_DENSE | \
                         BLIS_BITVAL_GENERAL, \
    .info2       = 0x0, \
    .elem_size   = sizeof( float ), \
\
    .buffer      = NULL, \
    .rs          = 0, \
    .cs          = 0, \
    .is          = 1, \
\
    .scalar      = { 0.0, 0.0 }, \
\
    .m_padded    = 0, \
    .n_padded    = 0, \
    .ps          = 0, \
    .pd          = 0, \
    .m_panel     = 0, \
    .n_panel     = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
// Copy every field of obj_t from a into b, one field at a time (pointers
// are copied, not what they point to).
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    int i;

    b->root = a->root;

    // Offsets and dimensions are two-element arrays.
    for ( i = 0; i < 2; ++i ) b->off[i] = a->off[i];
    for ( i = 0; i < 2; ++i ) b->dim[i] = a->dim[i];
    b->diag_off = a->diag_off;

    b->info      = a->info;
    b->info2     = a->info2;
    b->elem_size = a->elem_size;

    b->buffer = a->buffer;
    b->rs     = a->rs;
    b->cs     = a->cs;
    b->is     = a->is;

    b->scalar = a->scalar;

    //b->pack_mem = a->pack_mem;

    // Pack-related fields.
    b->m_padded = a->m_padded;
    b->n_padded = a->n_padded;
    b->ps       = a->ps;
    b->pd       = a->pd;
    b->m_panel  = a->m_panel;
    b->n_panel  = a->n_panel;

    // User-customizable fields.
    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;
}

// Copy every field of obj_t from a into b except dim[0] and dim[1], which
// are left untouched in b.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    int i;

    b->root = a->root;

    for ( i = 0; i < 2; ++i ) b->off[i] = a->off[i];
    // Avoid copying m and n since they will be overwritten.
    //b->dim[0] = a->dim[0];
    //b->dim[1] = a->dim[1];
    b->diag_off = a->diag_off;

    b->info      = a->info;
    b->info2     = a->info2;
    b->elem_size = a->elem_size;

    b->buffer = a->buffer;
    b->rs     = a->rs;
    b->cs     = a->cs;
    b->is     = a->is;

    b->scalar = a->scalar;

    // Avoid copying pack_mem entry.
    // FGVZ: You should probably make sure this is right.
    //b->pack_mem = a->pack_mem;

    // Pack-related fields.
    b->m_padded = a->m_padded;
    b->n_padded = a->n_padded;
    b->ps       = a->ps;
    b->pd       = a->pd;
    b->m_panel  = a->m_panel;
    b->n_panel  = a->n_panel;

    // User-customizable fields.
    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/include/windows-x86_64/000077500000000000000000000000001427272030600217075ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/include/windows-x86_64/blis.h000066400000000000000000050366711427272030600230330ustar00rootroot00000000000000 #ifndef BLIS_H #define BLIS_H // Allow C++ users to include this header file in their source code. However, // we make the extern "C" conditional on whether we're using a C++ compiler, // since regular C compilers don't understand the extern "C" construct. #ifdef __cplusplus extern "C" { #endif // NOTE: PLEASE DON'T CHANGE THE ORDER IN WHICH HEADERS ARE INCLUDED UNLESS // YOU ARE SURE THAT IT DOESN'T BREAK INTER-HEADER MACRO DEPENDENCIES. // -- configure definitions -- // NOTE: bli_config.h header must be included before any BLIS header. // It is bootstrapped by ./configure and does not depend on later // headers. Moreover, these configuration variables are necessary to change // some default behaviors (e.g. disable OS-detection in bli_system.h in case // of --disable-system). 
// begin bli_config.h #ifndef BLIS_CONFIG_H #define BLIS_CONFIG_H // Enabled configuration "family" (config_name) #define BLIS_FAMILY_X86_64 // Enabled sub-configurations (config_list) #define BLIS_CONFIG_SKX #define BLIS_CONFIG_KNL #define BLIS_CONFIG_HASWELL #define BLIS_CONFIG_SANDYBRIDGE #define BLIS_CONFIG_PENRYN #define BLIS_CONFIG_ZEN3 #define BLIS_CONFIG_ZEN2 #define BLIS_CONFIG_ZEN #define BLIS_CONFIG_EXCAVATOR #define BLIS_CONFIG_STEAMROLLER #define BLIS_CONFIG_PILEDRIVER #define BLIS_CONFIG_BULLDOZER #define BLIS_CONFIG_GENERIC // Enabled kernel sets (kernel_list) #define BLIS_KERNELS_SKX #define BLIS_KERNELS_KNL #define BLIS_KERNELS_SANDYBRIDGE #define BLIS_KERNELS_PENRYN #define BLIS_KERNELS_ZEN3 #define BLIS_KERNELS_ZEN2 #define BLIS_KERNELS_HASWELL #define BLIS_KERNELS_ZEN #define BLIS_KERNELS_PILEDRIVER #define BLIS_KERNELS_BULLDOZER #define BLIS_KERNELS_GENERIC #if 1 #define BLIS_ENABLE_SYSTEM #else #define BLIS_DISABLE_SYSTEM #endif #if 0 #define BLIS_ENABLE_OPENMP #endif #if 0 #define BLIS_ENABLE_PTHREADS #endif #if 1 #define BLIS_ENABLE_JRIR_SLAB #endif #if 0 #define BLIS_ENABLE_JRIR_RR #endif #if 1 #define BLIS_ENABLE_PBA_POOLS #else #define BLIS_DISABLE_PBA_POOLS #endif #if 1 #define BLIS_ENABLE_SBA_POOLS #else #define BLIS_DISABLE_SBA_POOLS #endif #if 0 #define BLIS_ENABLE_MEM_TRACING #else #define BLIS_DISABLE_MEM_TRACING #endif #if 64 == 64 #define BLIS_INT_TYPE_SIZE 64 #elif 64 == 32 #define BLIS_INT_TYPE_SIZE 32 #else // determine automatically #endif #if 32 == 64 #define BLIS_BLAS_INT_TYPE_SIZE 64 #elif 32 == 32 #define BLIS_BLAS_INT_TYPE_SIZE 32 #else // determine automatically #endif #ifndef BLIS_ENABLE_BLAS #ifndef BLIS_DISABLE_BLAS #if 0 #define BLIS_ENABLE_BLAS #else #define BLIS_DISABLE_BLAS #endif #endif #endif #ifndef BLIS_ENABLE_CBLAS #ifndef BLIS_DISABLE_CBLAS #if 0 #define BLIS_ENABLE_CBLAS #else #define BLIS_DISABLE_CBLAS #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT #ifndef BLIS_DISABLE_MIXED_DT #if 1 #define 
BLIS_ENABLE_MIXED_DT #else #define BLIS_DISABLE_MIXED_DT #endif #endif #endif #ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM #ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #if 1 #define BLIS_ENABLE_MIXED_DT_EXTRA_MEM #else #define BLIS_DISABLE_MIXED_DT_EXTRA_MEM #endif #endif #endif #if 1 #define BLIS_ENABLE_SUP_HANDLING #else #define BLIS_DISABLE_SUP_HANDLING #endif #if 0 #define BLIS_ENABLE_MEMKIND #else #define BLIS_DISABLE_MEMKIND #endif #if 1 #define BLIS_ENABLE_TRSM_PREINVERSION #else #define BLIS_DISABLE_TRSM_PREINVERSION #endif #if 1 #define BLIS_ENABLE_PRAGMA_OMP_SIMD #else #define BLIS_DISABLE_PRAGMA_OMP_SIMD #endif #if 0 #define BLIS_ENABLE_SANDBOX #else #define BLIS_DISABLE_SANDBOX #endif #if 0 #define BLIS_ENABLE_SHARED #else #define BLIS_DISABLE_SHARED #endif #if 0 #define BLIS_ENABLE_COMPLEX_RETURN_INTEL #else #define BLIS_DISABLE_COMPLEX_RETURN_INTEL #endif #endif // end bli_config.h // -- System and language-related headers -- // NOTE: bli_system.h header must be included before bli_config_macro_defs.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. #if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \ defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64) #define BLIS_ARCH_64 #else #define BLIS_ARCH_32 #endif // Determine the target operating system. 
#if defined(BLIS_ENABLE_SYSTEM) #if defined(_WIN32) || defined(__CYGWIN__) #define BLIS_OS_WINDOWS 1 #elif defined(__gnu_hurd__) #define BLIS_OS_GNU 1 #elif defined(__APPLE__) || defined(__MACH__) #define BLIS_OS_OSX 1 #elif defined(__ANDROID__) #define BLIS_OS_ANDROID 1 #elif defined(__linux__) #define BLIS_OS_LINUX 1 #elif defined(__bgq__) #define BLIS_OS_BGQ 1 #elif defined(__bg__) #define BLIS_OS_BGP 1 #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__bsdi__) || defined(__DragonFly__) || \ defined(__FreeBSD_kernel__) || defined(__HAIKU__) #define BLIS_OS_BSD 1 #elif defined(EMSCRIPTEN) #define BLIS_OS_EMSCRIPTEN #else #error "Cannot determine operating system" #endif #else // #if defined(BLIS_DISABLE_SYSTEM) #define BLIS_OS_NONE #endif // A few changes that may be necessary in Windows environments. #if BLIS_OS_WINDOWS // Include Windows header file. #define WIN32_LEAN_AND_MEAN #define VC_EXTRALEAN #include // skipped #if !defined(__clang__) && !defined(__GNUC__) // Undefine attribute specifiers in Windows. #define __attribute__(x) // Undefine restrict. #define restrict #endif #endif // time.h provides clock_gettime(). #if BLIS_OS_WINDOWS #include // skipped #elif BLIS_OS_OSX #include // skipped #else //#include #include // skipped #endif #endif // end bli_system.h // begin bli_lang_defs.h #ifndef BLIS_LANG_DEFS_H #define BLIS_LANG_DEFS_H // -- Undefine restrict for C++ and C89/90 -- #ifdef __cplusplus // Language is C++; define restrict as nothing. #ifndef restrict #define restrict #endif #elif __STDC_VERSION__ >= 199901L // Language is C99 (or later); do nothing since restrict is recognized. #else // Language is pre-C99; define restrict as nothing. 
#ifndef restrict #define restrict #endif #endif // -- Define typeof() operator if using a non-GNU compiler -- #ifndef __GNUC__ #define typeof __typeof__ #else #ifndef typeof #define typeof __typeof__ #endif #endif // -- BLIS Thread Local Storage Keyword -- // __thread for TLS is supported by GCC, CLANG, ICC, and IBMC. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support __thread, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such a non-mainstream compiler // for building BLIS is low. #if defined(__GNUC__) || defined(__clang__) || defined(__ICC) || defined(__IBMC__) #define BLIS_THREAD_LOCAL __thread #else #define BLIS_THREAD_LOCAL #endif // -- BLIS constructor/destructor function attribute -- // __attribute__((constructor/destructor)) is supported by GCC and CLANG, but not ICC. // There is a small risk here as __GNUC__ can also be defined by some other // compiler (other than ICC and CLANG which we know define it) that // doesn't support this, as __GNUC__ is not quite unique to GCC. // But the possibility of someone using such a non-mainstream compiler // for building BLIS is low. #if defined(__ICC) || defined(__INTEL_COMPILER) // ICC defines __GNUC__ but doesn't support this #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #elif defined(__clang__) // CLANG supports __attribute__, but its documentation doesn't // mention support for constructor/destructor. Compiling with // clang and testing shows that it does support them. 
#define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #elif defined(__GNUC__) #define BLIS_ATTRIB_CTOR __attribute__((constructor)) #define BLIS_ATTRIB_DTOR __attribute__((destructor)) #else #define BLIS_ATTRIB_CTOR #define BLIS_ATTRIB_DTOR #endif #endif // end bli_lang_defs.h // -- configure default definitions -- // begin bli_config_macro_defs.h #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H // -- INTEGER PROPERTIES ------------------------------------------------------- // The bit size of the integer type used to track values such as dimensions, // strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed // integers while 64 results in 64-bit integers. Any other value results in use // of the C99 type "long int". Note that this ONLY affects integers used // internally within BLIS as well as those exposed in the native BLAS-like BLIS // interface. #ifndef BLIS_INT_TYPE_SIZE #ifdef BLIS_ARCH_64 #define BLIS_INT_TYPE_SIZE 64 #else #define BLIS_INT_TYPE_SIZE 32 #endif #endif // -- FLOATING-POINT PROPERTIES ------------------------------------------------ // Enable use of built-in C99 "float complex" and "double complex" types and // associated overloaded operations and functions? Disabling results in // scomplex and dcomplex being defined in terms of simple structs. // NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN. #ifdef BLIS_ENABLE_C99_COMPLEX // No additional definitions needed. #else // Default behavior is disabled. #endif // -- MULTITHREADING ----------------------------------------------------------- // Enable multithreading via POSIX threads. #ifdef BLIS_ENABLE_PTHREADS // No additional definitions needed. #else // Default behavior is disabled. #endif // Enable multithreading via OpenMP. #ifdef BLIS_ENABLE_OPENMP // No additional definitions needed. #else // Default behavior is disabled. 
#endif // Perform a sanity check to make sure the user doesn't try to enable // both OpenMP and pthreads. #if defined ( BLIS_ENABLE_OPENMP ) && \ defined ( BLIS_ENABLE_PTHREADS ) #error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined." #endif // Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP // or pthreads are enabled. This macro is useful in situations when // we want to detect use of either OpenMP or pthreads (as opposed // to neither being used). #if defined ( BLIS_ENABLE_OPENMP ) || \ defined ( BLIS_ENABLE_PTHREADS ) #define BLIS_ENABLE_MULTITHREADING #endif // Enable the use of prime numbers of threads when requesting automatic thread // factorization. When disabled, requesting a prime number of threads will // result in a reduction (by one) of the number of threads, provided that the // prime number exceeds a minimum threshold (see below). #ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #else // Default behavior is disabled. #undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled. #define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS #endif // Set the maximum requested number of threads that BLIS will accept from the // user that may be prime. If a larger prime number of threads is requested, // it will be reduced by one to allow for more efficient thread factorizations. // This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined. #ifndef BLIS_NT_MAX_PRIME #define BLIS_NT_MAX_PRIME 11 #endif // -- MIXED DATATYPE SUPPORT --------------------------------------------------- // Enable mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT #undef BLIS_ENABLE_GEMM_MD #else // Default behavior is enabled. #define BLIS_ENABLE_GEMM_MD #endif // Enable memory-intensive optimizations for mixed datatype support? #ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM #undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM #else // Default behavior is enabled. 
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM #endif // -- MISCELLANEOUS OPTIONS ---------------------------------------------------- // Do NOT require the cross-blocksize constraints. That is, do not enforce // MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY // needed when implementing trsm_r by allowing the right-hand matrix B to // be triangular. #ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS #endif // -- BLAS COMPATIBILITY LAYER ------------------------------------------------- // Enable the BLAS compatibility layer? #ifdef BLIS_DISABLE_BLAS #undef BLIS_ENABLE_BLAS #else // Default behavior is enabled. #undef BLIS_ENABLE_BLAS // In case user explicitly enabled. #define BLIS_ENABLE_BLAS #endif // The bit size of the integer type used to track values such as dimensions and // leading dimensions (ie: column strides) within the BLAS compatibility layer. // A value of 32 results in the compatibility layer using 32-bit signed integers // while 64 results in 64-bit integers. Any other value results in use of the // C99 type "long int". Note that this ONLY affects integers used within the // BLAS compatibility layer. #ifndef BLIS_BLAS_INT_TYPE_SIZE #define BLIS_BLAS_INT_TYPE_SIZE 32 #endif // By default, the level-3 BLAS routines are implemented by directly calling // the BLIS object API. Alternatively, they may first call the typed BLIS // API, which will then call the object API. //#define BLIS_BLAS3_CALLS_TAPI #ifdef BLIS_BLAS3_CALLS_TAPI #undef BLIS_BLAS3_CALLS_OAPI #else // Default behavior is to call object API directly. #undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled. #define BLIS_BLAS3_CALLS_OAPI #endif // -- CBLAS COMPATIBILITY LAYER ------------------------------------------------ // Enable the CBLAS compatibility layer? // NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer // regardless of whether or not it was explicitly enabled above. 
Furthermore, // the CBLAS compatibility layer will use the integer type size definition // specified above when defining the size of its own integers (regardless of // whether the BLAS layer was enabled directly or indirectly). #ifdef BLIS_ENABLE_CBLAS // No additional definitions needed. #else // Default behavior is disabled. #endif // -- SHARED LIBRARY SYMBOL EXPORT --------------------------------------------- // When building shared libraries, we can control which symbols are exported for // linking by external applications. BLIS annotates all function prototypes that // are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing // a similar role for BLAS compatibility routines). Which symbols are exported // is controlled by the default symbol visibility, as specified by the gcc option // -fvisibility=[default|hidden]. The default for this option is 'default', or, // "public", which, if allowed to stand, causes all symbols in BLIS to be // linkable from the outside. But when compiling with -fvisibility=hidden, all // symbols start out hidden (that is, restricted only for internal use by BLIS), // with that setting overridden only for function prototypes or variable // declarations that are annotated with BLIS_EXPORT_BLIS. #ifndef BLIS_EXPORT #if !defined(BLIS_ENABLE_SHARED) #define BLIS_EXPORT #else #if defined(_WIN32) || defined(__CYGWIN__) #ifdef BLIS_IS_BUILDING_LIBRARY #define BLIS_EXPORT __declspec(dllexport) #else #define BLIS_EXPORT __declspec(dllimport) #endif #elif defined(__GNUC__) && __GNUC__ >= 4 #define BLIS_EXPORT __attribute__ ((visibility ("default"))) #else #define BLIS_EXPORT #endif #endif #endif #define BLIS_EXPORT_BLIS BLIS_EXPORT #define BLIS_EXPORT_BLAS BLIS_EXPORT #define BLIS_EXPORT_ADDON BLIS_EXPORT // -- OVERRIDABLE (WEAK) SYMBOLS ----------------------------------------------- // On Linux, functions called from a shared library can be overridden by the main // program simply by providing a new definition. 
However, macOS uses a "two-level // namespace" which causes calls to shared library functions to be tied to the // library and not overridable. As a workaround, certain symbols can be defined // as "weak" and are given lower preference during linking. #ifndef BLIS_OVERRIDABLE #if BLIS_OS_OSX #define BLIS_OVERRIDABLE __attribute__((weak)) #else #define BLIS_OVERRIDABLE #endif #endif // -- STATIC INLINE FUNCTIONS -------------------------------------------------- // C and C++ have different semantics for defining "inline" functions. In C, // the keyword phrase "static inline" accomplishes this, though the "inline" // is optional. In C++, the "inline" keyword is required and obviates "static" // altogether. Why does this matter? While BLIS is compiled in C99, blis.h may // be #included by a source file that is compiled with C++. #ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // -- Common BLIS definitions -- // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. 
// NOTE: This cpp guard will only meaningfully change BLIS's behavior on
// systems where the BLIS integer size would have been automatically selected
// to be 32 bits, since explicit selection of 32 bits is prohibited at
// configure-time (and explicit or automatic selection of 64 bits is fine
// and would have had the same result).
// NOTE: The macro tested here must be BLIS_BLAS_INT_TYPE_SIZE, i.e. the same
// macro that selects the width of f77_int. Testing a misspelled/undefined
// macro would silently evaluate to 0 in the #if and disable this promotion.
#if BLIS_BLAS_INT_TYPE_SIZE == 64
  #undef  BLIS_INT_TYPE_SIZE
  #define BLIS_INT_TYPE_SIZE 64
#endif

// Define integer types depending on what size integer was requested.
// A value of 32 or 64 selects the corresponding fixed-width type; any other
// value (including an undefined BLIS_INT_TYPE_SIZE) falls back to "long int".
#if   BLIS_INT_TYPE_SIZE == 32
typedef           int32_t  gint_t;
typedef          uint32_t guint_t;
#elif BLIS_INT_TYPE_SIZE == 64
typedef           int64_t  gint_t;
typedef          uint64_t guint_t;
#else
typedef   signed long int  gint_t;
typedef unsigned long int guint_t;
#endif

// -- Boolean type --

// NOTE: bool_t is no longer used and has been replaced with C99's bool type.
//typedef bool bool_t;

// BLIS uses TRUE and FALSE macro constants as possible boolean values, but we
// define these macros in terms of true and false, respectively, which are
// defined by C99 in stdbool.h.
#ifndef TRUE
  #define TRUE  true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef   gint_t dim_t;      // dimension type
#endif
typedef   gint_t inc_t;      // increment/stride type
typedef   gint_t doff_t;     // diagonal offset type
typedef  guint_t siz_t;      // byte size type
typedef uint32_t objbits_t;  // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES   4
#define BLIS_MAX_TYPE_SIZE  sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
#define BLIS_SIZEOF_S 4 // sizeof(float) #define BLIS_SIZEOF_D 8 // sizeof(double) #define BLIS_SIZEOF_C 8 // sizeof(scomplex) #define BLIS_SIZEOF_Z 16 // sizeof(dcomplex) // -- Complex types -- #ifdef BLIS_ENABLE_C99_COMPLEX #if __STDC_VERSION__ >= 199901L #include // skipped // Typedef official complex types to BLIS complex type names. typedef float complex scomplex; typedef double complex dcomplex; #else #error "Configuration requested C99 complex types, but C99 does not appear to be supported." #endif #else // ifndef BLIS_ENABLE_C99_COMPLEX // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_SCOMPLEX #define _DEFINED_SCOMPLEX typedef struct scomplex { float real; float imag; } scomplex; #endif // This cpp guard provides a temporary hack to allow libflame // interoperability with BLIS. #ifndef _DEFINED_DCOMPLEX #define _DEFINED_DCOMPLEX typedef struct dcomplex { double real; double imag; } dcomplex; #endif #endif // BLIS_ENABLE_C99_COMPLEX // -- Atom type -- // Note: atom types are used to hold "bufferless" scalar object values. Note // that it needs to be as large as the largest possible scalar value we might // want to hold. Thus, for now, it is a dcomplex. typedef dcomplex atom_t; // -- Fortran-77 types -- // Note: These types are typically only used by BLAS compatibility layer, but // we must define them even when the compatibility layer isn't being built // because they also occur in bli_slamch() and bli_dlamch(). // Define f77_int depending on what size of integer was requested. #if BLIS_BLAS_INT_TYPE_SIZE == 32 typedef int32_t f77_int; #elif BLIS_BLAS_INT_TYPE_SIZE == 64 typedef int64_t f77_int; #else typedef long int f77_int; #endif typedef char f77_char; typedef float f77_float; typedef double f77_double; typedef scomplex f77_scomplex; typedef dcomplex f77_dcomplex; // -- Misc. 
function pointer types -- // Note: This type should be used in any situation where the address of a // *function* will be conveyed or stored prior to it being typecast back // to the correct function type. It does not need to be used when conveying // or storing the address of *data* (such as an array of float or double). //typedef void (*void_fp)( void ); typedef void* void_fp; // Typedef function pointer types for malloc() and free() substitutes. typedef void* (*malloc_ft)( size_t size ); typedef void (*free_ft) ( void* p ); // // -- BLIS info bit field offsets ---------------------------------------------- // // info #define BLIS_DATATYPE_SHIFT 0 #define BLIS_DOMAIN_SHIFT 0 #define BLIS_PRECISION_SHIFT 1 #define BLIS_CONJTRANS_SHIFT 3 #define BLIS_TRANS_SHIFT 3 #define BLIS_CONJ_SHIFT 4 #define BLIS_UPLO_SHIFT 5 #define BLIS_UPPER_SHIFT 5 #define BLIS_DIAG_SHIFT 6 #define BLIS_LOWER_SHIFT 7 #define BLIS_UNIT_DIAG_SHIFT 8 #define BLIS_INVERT_DIAG_SHIFT 9 #define BLIS_TARGET_DT_SHIFT 10 #define BLIS_TARGET_DOMAIN_SHIFT 10 #define BLIS_TARGET_PREC_SHIFT 11 #define BLIS_EXEC_DT_SHIFT 13 #define BLIS_EXEC_DOMAIN_SHIFT 13 #define BLIS_EXEC_PREC_SHIFT 14 #define BLIS_PACK_SCHEMA_SHIFT 16 #define BLIS_PACK_RC_SHIFT 16 #define BLIS_PACK_PANEL_SHIFT 17 #define BLIS_PACK_FORMAT_SHIFT 18 #define BLIS_PACK_SHIFT 22 #define BLIS_PACK_REV_IF_UPPER_SHIFT 23 #define BLIS_PACK_REV_IF_LOWER_SHIFT 24 #define BLIS_PACK_BUFFER_SHIFT 25 #define BLIS_STRUC_SHIFT 27 #define BLIS_COMP_DT_SHIFT 29 #define BLIS_COMP_DOMAIN_SHIFT 29 #define BLIS_COMP_PREC_SHIFT 30 // info2 #define BLIS_SCALAR_DT_SHIFT 0 #define BLIS_SCALAR_DOMAIN_SHIFT 0 #define BLIS_SCALAR_PREC_SHIFT 1 // // -- BLIS info bit field masks ------------------------------------------------ // // info #define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT ) #define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT ) #define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT ) #define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT ) 
#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT ) #define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT ) #define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT ) #define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT ) #define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT ) #define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT ) #define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT ) #define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT ) #define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT ) #define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT ) #define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT ) #define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT ) #define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT ) #define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT ) #define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT ) #define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT ) #define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT ) #define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT ) #define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT ) #define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT ) #define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT ) #define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT ) #define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT ) #define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT ) // info2 #define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT ) #define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT ) #define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT ) // // -- BLIS enumerated type value definitions ----------------------------------- // #define BLIS_BITVAL_REAL 0x0 #define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT #define BLIS_BITVAL_SINGLE_PREC 0x0 #define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT #define 
BLIS_BITVAL_FLOAT_TYPE 0x0 #define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT #define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT #define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT ) #define BLIS_BITVAL_INT_TYPE 0x04 #define BLIS_BITVAL_CONST_TYPE 0x05 #define BLIS_BITVAL_NO_TRANS 0x0 #define BLIS_BITVAL_TRANS BLIS_TRANS_BIT #define BLIS_BITVAL_NO_CONJ 0x0 #define BLIS_BITVAL_CONJ BLIS_CONJ_BIT #define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) #define BLIS_BITVAL_ZEROS 0x0 #define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT ) #define BLIS_BITVAL_DENSE BLIS_UPLO_BITS #define BLIS_BITVAL_NONUNIT_DIAG 0x0 #define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT #define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT #define BLIS_BITVAL_NOT_PACKED 0x0 #define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT ) #define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT ) #define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT ) #define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT ) #define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0 #define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT #define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0 #define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT #define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 
0x0 #define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT ) #define BLIS_BITVAL_GENERAL 0x0 #define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT ) #define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT ) // // -- BLIS enumerated type definitions ----------------------------------------- // // -- Operational parameter types -- typedef enum { BLIS_NO_TRANSPOSE = 0x0, BLIS_TRANSPOSE = BLIS_BITVAL_TRANS, BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ, BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS } trans_t; typedef enum { BLIS_NO_CONJUGATE = 0x0, BLIS_CONJUGATE = BLIS_BITVAL_CONJ } conj_t; typedef enum { BLIS_ZEROS = BLIS_BITVAL_ZEROS, BLIS_LOWER = BLIS_BITVAL_LOWER, BLIS_UPPER = BLIS_BITVAL_UPPER, BLIS_DENSE = BLIS_BITVAL_DENSE } uplo_t; typedef enum { BLIS_LEFT = 0x0, BLIS_RIGHT } side_t; typedef enum { BLIS_NONUNIT_DIAG = 0x0, BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG } diag_t; typedef enum { BLIS_NO_INVERT_DIAG = 0x0, BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG } invdiag_t; typedef enum { BLIS_GENERAL = BLIS_BITVAL_GENERAL, BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN, BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC, BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR } struc_t; // -- Data type -- typedef enum { BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE, BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE, BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE, BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE, BLIS_INT = BLIS_BITVAL_INT_TYPE, BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE, BLIS_DT_LO = BLIS_FLOAT, BLIS_DT_HI = BLIS_DCOMPLEX } num_t; typedef enum { BLIS_REAL = BLIS_BITVAL_REAL, BLIS_COMPLEX = BLIS_BITVAL_COMPLEX } dom_t; typedef enum { BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC, BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC } prec_t; // -- Pack schema type -- typedef enum { BLIS_NOT_PACKED = 
BLIS_BITVAL_NOT_PACKED, BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC, BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS, BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS, BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS, BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS, BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E, BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E, BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R, BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R } pack_t; // We combine row and column packing into one "type", and we start // with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS. #define BLIS_NUM_PACK_SCHEMA_TYPES 3 // -- Pack order type -- typedef enum { BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER, BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER, BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER, BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER } packord_t; // -- Pack buffer type -- typedef enum { BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK, BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL, BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL, BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE } packbuf_t; // -- Partitioning direction -- typedef enum { BLIS_FWD, BLIS_BWD } dir_t; // -- Subpartition type -- typedef enum { BLIS_SUBPART0, BLIS_SUBPART1, BLIS_SUBPART2, BLIS_SUBPART1AND0, BLIS_SUBPART1AND2, BLIS_SUBPART1A, BLIS_SUBPART1B, BLIS_SUBPART00, BLIS_SUBPART10, BLIS_SUBPART20, BLIS_SUBPART01, BLIS_SUBPART11, BLIS_SUBPART21, BLIS_SUBPART02, BLIS_SUBPART12, BLIS_SUBPART22 } subpart_t; // -- Matrix dimension type -- typedef enum { BLIS_M = 0, BLIS_N = 1 } mdim_t; // -- Machine parameter types -- typedef enum { BLIS_MACH_EPS = 0, BLIS_MACH_SFMIN, BLIS_MACH_BASE, BLIS_MACH_PREC, BLIS_MACH_NDIGMANT, BLIS_MACH_RND, BLIS_MACH_EMIN, BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, BLIS_MACH_EPS2 } 
machval_t; #define BLIS_NUM_MACH_PARAMS 11 #define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS #define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 // -- Induced method types -- typedef enum { BLIS_1M = 0, BLIS_NAT, BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; #define BLIS_NUM_IND_METHODS (BLIS_NAT+1) // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. #define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, 
BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, 
BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // - keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. 
} bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. // Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. 
// begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! // -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. 
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t     bli_pthread_barrier_t;
typedef pthread_barrierattr_t bli_pthread_barrierattr_t;

#endif

// -- pthreads macros --

#define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#define BLIS_PTHREAD_COND_INITIALIZER  PTHREAD_COND_INITIALIZER
#define BLIS_PTHREAD_ONCE_INIT         PTHREAD_ONCE_INIT

#endif

// -- Function definitions -----------------------------------------------------

// The prototypes below are declarations only; they are shared by all three
// type branches above. Each group is labelled with the pthread call(s) it is
// named after.

// -- pthread_create(), pthread_join() --

BLIS_EXPORT_BLIS int bli_pthread_create
     (
       bli_pthread_t*            thread,
       const bli_pthread_attr_t* attr,
       void*                   (*start_routine)(void*),
       void*                     arg
     );

BLIS_EXPORT_BLIS int bli_pthread_join
     (
       bli_pthread_t thread,
       void**        retval
     );

// -- pthread_mutex_*() --

BLIS_EXPORT_BLIS int bli_pthread_mutex_init
     (
       bli_pthread_mutex_t*           mutex,
       const bli_pthread_mutexattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_lock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock
     (
       bli_pthread_mutex_t* mutex
     );

// -- pthread_cond_*() --

BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t*           cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t*  cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void              (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//    __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t*           barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int                     count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h

// -- Pool block type --

// One block: a buffer pointer together with a size.
typedef struct
{
    void*  buf;
    siz_t  block_size;

} pblk_t;


// -- Pool type --

// A pool of blocks plus the allocation callbacks attached to it.
// NOTE(review): block_ptrs is untyped here; presumably it addresses an array
// of pblk_t of length block_ptrs_len -- confirm in the pool implementation.
typedef struct
{
    void*     block_ptrs;
    dim_t     block_ptrs_len;

    dim_t     top_index;
    dim_t     num_blocks;

    siz_t     block_size;
    siz_t     align_size;
    siz_t     offset_size;

    malloc_ft malloc_fp;
    free_ft   free_fp;

} pool_t;


// -- Array type --

// Untyped array descriptor: buffer, element count, and per-element size.
typedef struct
{
    void*     buf;

    siz_t     num_elem;
    siz_t     elem_size;

} array_t;


// -- Locked pool-of-arrays-of-pools type --

// A pool_t bundled with the mutex that accompanies it.
typedef struct
{
    bli_pthread_mutex_t mutex;
    pool_t              pool;

    siz_t               def_array_len;

} apool_t;


// -- packing block allocator: Locked set of pools type --

// Three pools sharing one mutex.
// NOTE(review): the three slots presumably correspond to packbuf_t buffer
// kinds -- confirm against the packbuf_t definition.
typedef struct pba_s
{
    pool_t              pools[3];
    bli_pthread_mutex_t mutex;

    // These fields are used for general-purpose allocation.
    siz_t               align_size;
    malloc_ft           malloc_fp;
    free_ft             free_fp;

} pba_t;


// -- Memory object type --

// Records a block (pblk), the buffer type it was requested as, the pool it is
// associated with, and a size.
typedef struct mem_s
{
    pblk_t    pblk;
    packbuf_t buf_type;
    pool_t*   pool;
    siz_t     size;
} mem_t;


// -- Control tree node type --

// Nodes link to other nodes through sub_prenode and sub_node, which is what
// makes this a tree; the struct is self-referential, hence the separate
// typedef after the definition.
struct cntl_s
{
    // Basic fields (usually required).
    opid_t         family;
    bszid_t        bszid;
    void_fp        var_func;
    struct cntl_s* sub_prenode;
    struct cntl_s* sub_node;

    // Optional fields (needed only by some operations such as packm).
    // NOTE: first field of params must be a uint64_t containing the size
    // of the struct.
    void*          params;

    // Internal fields that track "cached" data.
    mem_t          pack_mem;
};
typedef struct cntl_s cntl_t;


// -- Blocksize object type --

// One pair of values per floating-point datatype (BLIS_NUM_FP_TYPES entries).
typedef struct blksz_s
{
    // Primary blocksize values.
    dim_t  v[BLIS_NUM_FP_TYPES];

    // Blocksize extensions.
    dim_t  e[BLIS_NUM_FP_TYPES];

} blksz_t;


// -- Function pointer object type --

// One generic function pointer per floating-point datatype.
typedef struct func_s
{
    // Kernel function address.
    void_fp ptr[BLIS_NUM_FP_TYPES];

} func_t;


// -- Multi-boolean object type --

// One boolean per floating-point datatype.
typedef struct mbool_s
{
    bool v[BLIS_NUM_FP_TYPES];

} mbool_t;


// -- Auxiliary kernel info type --

// Note: This struct is used by macro-kernels to package together extra
// parameter values that may be of use to the micro-kernel without
// cluttering up the micro-kernel interface itself.

typedef struct
{
    // The pack schemas of A and B.
    pack_t schema_a;
    pack_t schema_b;

    // Pointers to the micro-panels of A and B which will be used by the
    // next call to the micro-kernel.
    void*  a_next;
    void*  b_next;

    // The imaginary strides of A and B.
    inc_t  is_a;
    inc_t  is_b;

    // The panel strides of A and B.
    // NOTE: These are only used in situations where iteration over the
    // micropanels takes place in part within the kernel code (e.g. sup
    // millikernels).
    inc_t  ps_a;
    inc_t  ps_b;

    // The type to convert to on output.
    //num_t  dt_on_output;

    // (Virtual) microkernel address and additional parameters.
    void_fp ukr;
    void*   params;

} auxinfo_t;


// -- Global scalar constant data struct --

// Note: This struct is used only when statically initializing the
// global scalar constants in bli_const.c.
// Holds one value in each of five representations: float, double, scomplex,
// dcomplex, and gint_t.
typedef struct constdata_s
{
    float    s;
    double   d;
    scomplex c;
    dcomplex z;
    gint_t   i;

} constdata_t;


//
// -- BLIS object type definitions ---------------------------------------------
//

// Forward declarations for function pointer types
struct obj_s;
struct cntx_s;
struct rntm_s;
struct thrinfo_s;

// Signature of the user-customizable packing callback stored in obj_t.pack_fn.
typedef void (*obj_pack_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     ap,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// Signature of the user-customizable kernel callback stored in obj_t.ker_fn.
typedef void (*obj_ker_fn_t)
    (
      struct obj_s*     a,
      struct obj_s*     b,
      struct obj_s*     c,
      struct cntx_s*    cntx,
      struct rntm_s*    rntm,
      struct cntl_s*    cntl,
      struct thrinfo_s* thread
    );

// The BLIS object. NOTE: the two BLIS_OBJECT_INITIALIZER* macros and the
// inline copy helpers that follow name these members explicitly and must be
// revisited whenever a member is added, removed, or renamed.
typedef struct obj_s
{
    // Basic fields
    struct obj_s* root;

    dim_t         off[2];
    dim_t         dim[2];
    doff_t        diag_off;

    objbits_t     info;
    objbits_t     info2;
    siz_t         elem_size;

    void*         buffer;
    inc_t         rs;
    inc_t         cs;
    inc_t         is;

    // Bufferless scalar storage
    atom_t        scalar;

    // Pack-related fields
    dim_t         m_padded; // m dimension of matrix, including any padding
    dim_t         n_padded; // n dimension of matrix, including any padding
    inc_t         ps;       // panel stride (distance to next panel)
    inc_t         pd;       // panel dimension (the "width" of a panel:
                            // usually MR or NR)
    dim_t         m_panel;  // m dimension of a "full" panel
    dim_t         n_panel;  // n dimension of a "full" panel

    // User-customizable fields
    obj_pack_fn_t pack_fn;
    void*         pack_params;
    obj_ker_fn_t  ker_fn;
    void*         ker_params;

} obj_t;

// Pre-initializers. Things that must be set afterwards:
//  - root object pointer
//  - info bitfields: dt, target_dt, exec_dt, comp_dt
//  - info2 bitfields: scalar_dt
//  - elem_size
//  - dims, strides
//  - buffer
//  - internal scalar buffer (must always set imaginary component)
//
// Both macros expand to a brace-enclosed designated initializer for obj_t.
// BLIS_OBJECT_INITIALIZER starts with dim = { 0, 0 }.

#define BLIS_OBJECT_INITIALIZER \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 0, 0 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

// Identical to BLIS_OBJECT_INITIALIZER except that dim starts as { 1, 1 }.
#define BLIS_OBJECT_INITIALIZER_1X1 \
{ \
    .root      = NULL, \
\
    .off       = { 0, 0 }, \
    .dim       = { 1, 1 }, \
    .diag_off  = 0, \
\
    .info      = 0x0 | BLIS_BITVAL_DENSE      | \
                       BLIS_BITVAL_GENERAL, \
    .info2     = 0x0, \
    .elem_size = sizeof( float ), \
\
    .buffer    = NULL, \
    .rs        = 0, \
    .cs        = 0, \
    .is        = 1, \
\
    .scalar    = { 0.0, 0.0 }, \
\
    .m_padded  = 0, \
    .n_padded  = 0, \
    .ps        = 0, \
    .pd        = 0, \
    .m_panel   = 0, \
    .n_panel   = 0, \
\
    .pack_fn     = NULL, \
    .pack_params = NULL, \
    .ker_fn      = NULL, \
    .ker_params  = NULL \
}

// Define these macros here since they must be updated if contents of
// obj_t changes.
// Make *b a member-for-member duplicate of *a. The copy is shallow: pointer
// members (root, buffer, pack_params, ker_params and the two callbacks) are
// duplicated as pointers, so b refers to the same storage as a afterwards.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    // Every member of obj_t takes part in the copy, so the whole object is
    // assigned in one statement.
    *b = *a;
}

// Seed *b from *a ahead of describing a sub-partition of a: every member is
// duplicated except dim[0] and dim[1]. Those two are neither read nor written
// here because they will be overwritten afterwards.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    // Data location and strides.
    b->buffer      = a->buffer;
    b->rs          = a->rs;
    b->cs          = a->cs;
    b->is          = a->is;

    // Root object and placement within it (dimensions deliberately omitted).
    b->root        = a->root;
    b->off[0]      = a->off[0];
    b->off[1]      = a->off[1];
    b->diag_off    = a->diag_off;

    // Property bitfields and element size.
    b->info        = a->info;
    b->info2       = a->info2;
    b->elem_size   = a->elem_size;

    // Attached scalar.
    b->scalar      = a->scalar;

    // Packing geometry. (obj_t has no pack_mem member, so there is nothing
    // further to carry over on the packing side.)
    b->m_padded    = a->m_padded;
    b->n_padded    = a->n_padded;
    b->ps          = a->ps;
    b->pd          = a->pd;
    b->m_panel     = a->m_panel;
    b->n_panel     = a->n_panel;

    // User-customizable callbacks and their parameter blocks.
    b->pack_fn     = a->pack_fn;
    b->pack_params = a->pack_params;
    b->ker_fn      = a->ker_fn;
    b->ker_params  = a->ker_params;
}

// Initializers for global scalar constants.
// NOTE: These must remain cpp macros since they are initializer
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h // begin bli_macro_defs.h #ifndef BLIS_MACRO_DEFS_H #define BLIS_MACRO_DEFS_H // -- Concatenation macros -- #define BLIS_FUNC_PREFIX_STR "bli" // We add an extra layer the definitions of these string-pasting macros // because sometimes it is needed if, for example, one of the PASTE // macros is invoked with an "op" argument that is itself a macro. 
// Each public macro NAME(...) simply forwards to a private NAME_(...), and it
// is the private one that applies the ## operator. Going through the extra
// level means an argument that is itself a macro gets expanded before it is
// pasted.

// bli_-prefixed names: "bli_", then zero to six type characters, then the
// operation name. Example: PASTEMAC(s,axpyv) yields bli_saxpyv.
#define PASTEMAC0_(op)                      bli_ ## op
#define PASTEMAC0(op)                       PASTEMAC0_(op)

#define PASTEMAC_(c1,op)                    bli_ ## c1 ## op
#define PASTEMAC(c1,op)                     PASTEMAC_(c1,op)

#define PASTEMAC2_(c1,c2,op)                bli_ ## c1 ## c2 ## op
#define PASTEMAC2(c1,c2,op)                 PASTEMAC2_(c1,c2,op)

#define PASTEMAC3_(c1,c2,c3,op)             bli_ ## c1 ## c2 ## c3 ## op
#define PASTEMAC3(c1,c2,c3,op)              PASTEMAC3_(c1,c2,c3,op)

#define PASTEMAC4_(c1,c2,c3,c4,op)          bli_ ## c1 ## c2 ## c3 ## c4 ## op
#define PASTEMAC4(c1,c2,c3,c4,op)           PASTEMAC4_(c1,c2,c3,c4,op)

#define PASTEMAC5_(c1,c2,c3,c4,c5,op)       bli_ ## c1 ## c2 ## c3 ## c4 ## c5 ## op
#define PASTEMAC5(c1,c2,c3,c4,c5,op)        PASTEMAC5_(c1,c2,c3,c4,c5,op)

#define PASTEMAC6_(c1,c2,c3,c4,c5,c6,op)    bli_ ## c1 ## c2 ## c3 ## c4 ## c5 ## c6 ## op
#define PASTEMAC6(c1,c2,c3,c4,c5,c6,op)     PASTEMAC6_(c1,c2,c3,c4,c5,c6,op)

// bla_<op>_check names.
#define PASTEBLACHK_(op)                    bla_ ## op ## _check
#define PASTEBLACHK(op)                     PASTEBLACHK_(op)

// Unprefixed names: zero to three leading pieces followed by the operation
// name. PASTECH0 yields its (expanded) argument unchanged.
#define PASTECH0_(op)                       op
#define PASTECH0(op)                        PASTECH0_(op)

#define PASTECH_(c1,op)                     c1 ## op
#define PASTECH(c1,op)                      PASTECH_(c1,op)

#define PASTECH2_(c1,c2,op)                 c1 ## c2 ## op
#define PASTECH2(c1,c2,op)                  PASTECH2_(c1,c2,op)

#define PASTECH3_(c1,c2,c3,op)              c1 ## c2 ## c3 ## op
#define PASTECH3(c1,c2,c3,op)               PASTECH3_(c1,c2,c3,op)

// MKSTR turns its argument into a string literal exactly as written;
// STRINGIFY_INT expands its argument first and then stringifies the result.
#define MKSTR(text)                         #text
#define STRINGIFY_INT( value )              MKSTR( value )

// Fortran-77 name-mangling macros.
// Each macro pastes zero to three leading pieces, the routine name, and a
// trailing underscore, e.g. PASTEF77(d,gemm) yields dgemm_.
#define PASTEF770(name)                                      name ## _
#define PASTEF77(ch1,name)                     ch1        ## name ## _
#define PASTEF772(ch1,ch2,name)                ch1 ## ch2 ## name ## _
#define PASTEF773(ch1,ch2,ch3,name)     ch1 ## ch2 ## ch3 ## name ## _

// -- Include other groups of macros

// begin bli_genarray_macro_defs.h


#ifndef BLIS_GENARRAY_MACRO_DEFS_H
#define BLIS_GENARRAY_MACRO_DEFS_H


// -- Macros to generate function arrays ---------------------------------------

// All tables in this header list their entries in the order s, c, d, z.
// NOTE(review): that order presumably mirrors the numeric values of the
// datatype IDs used to index the arrays -- confirm against the num_t enum
// before reordering anything.

// -- "Smart" one-operand macro --

// Defines a static array named <opname>_fpa holding bli_?<opname> for each
// datatype, each entry cast to tname.
#define GENARRAY_FPA(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname)  \
}

// -- "Smart" one-operand macro (with integer support) --

// As GENARRAY_FPA, with a fifth entry for the integer variant bli_i<opname>.
#define GENARRAY_FPA_I(tname,opname) \
\
static tname PASTECH(opname,_fpa)[BLIS_NUM_FP_TYPES+1] = \
{ \
    ( tname )PASTEMAC(s,opname), \
    ( tname )PASTEMAC(c,opname), \
    ( tname )PASTEMAC(d,opname), \
    ( tname )PASTEMAC(z,opname), \
    ( tname )PASTEMAC(i,opname)  \
}

// -- "Smart" two-operand macro --

// Defines a static 2-D array named <op>_fpa2 covering every ordered pair of
// datatypes; the row selects the first type character, the column the second.
#define GENARRAY_FPA2(tname,op) \
\
static tname PASTECH(op,_fpa2)[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { ( tname )PASTEMAC2(s,s,op), ( tname )PASTEMAC2(s,c,op), ( tname )PASTEMAC2(s,d,op), ( tname )PASTEMAC2(s,z,op) }, \
    { ( tname )PASTEMAC2(c,s,op), ( tname )PASTEMAC2(c,c,op), ( tname )PASTEMAC2(c,d,op), ( tname )PASTEMAC2(c,z,op) }, \
    { ( tname )PASTEMAC2(d,s,op), ( tname )PASTEMAC2(d,c,op), ( tname )PASTEMAC2(d,d,op), ( tname )PASTEMAC2(d,z,op) }, \
    { ( tname )PASTEMAC2(z,s,op), ( tname )PASTEMAC2(z,c,op), ( tname )PASTEMAC2(z,d,op), ( tname )PASTEMAC2(z,z,op) }  \
}

// -- "Smart" two-operand macro --


// The macros from here on expand only to "<arrayname>[...] = { ... }"; the
// caller writes the element type (and any storage class) in front of the
// invocation.

// -- One-operand macro --

#define GENARRAY(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op)  \
}

#define GENARRAY_I(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES+1] = \
{ \
    PASTEMAC(s,op), \
    PASTEMAC(c,op), \
    PASTEMAC(d,op), \
    PASTEMAC(z,op), \
    PASTEMAC(i,op)  \
}

// -- One-operand macro (with custom prefix) --

// Entries are <prefix>?<op> rather than bli_?<op>.
#define GENARRAY_PREF(arrayname,prefix,op) \
\
arrayname[BLIS_NUM_FP_TYPES] = \
{ \
    PASTECH2(prefix,s,op), \
    PASTECH2(prefix,c,op), \
    PASTECH2(prefix,d,op), \
    PASTECH2(prefix,z,op)  \
}



// -- Two-operand macros --

// _ALL fills every type pair. _EXT fills only the pairs drawn from {s,c} or
// from {d,z} and leaves the rest NULL. _MIN fills only the diagonal (both
// operands of the same type).

#define GENARRAY2_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), PASTEMAC2(s,d,op), PASTEMAC2(s,z,op) }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), PASTEMAC2(c,d,op), PASTEMAC2(c,z,op) }, \
    { PASTEMAC2(d,s,op), PASTEMAC2(d,c,op), PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { PASTEMAC2(z,s,op), PASTEMAC2(z,c,op), PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

#define GENARRAY2_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), PASTEMAC2(s,c,op), NULL,              NULL,             }, \
    { PASTEMAC2(c,s,op), PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), PASTEMAC2(d,z,op) }, \
    { NULL,              NULL,              PASTEMAC2(z,d,op), PASTEMAC2(z,z,op) }  \
}

#define GENARRAY2_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { PASTEMAC2(s,s,op), NULL,              NULL,              NULL,             }, \
    { NULL,              PASTEMAC2(c,c,op), NULL,              NULL,             }, \
    { NULL,              NULL,              PASTEMAC2(d,d,op), NULL,             }, \
    { NULL,              NULL,              NULL,              PASTEMAC2(z,z,op) }  \
}


// -- Three-operand macros --

// Same three flavours as the two-operand macros, over type triples. The
// outermost index selects the first type character, the middle index the
// second, and the innermost index the third.

#define GENARRAY3_ALL(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), PASTEMAC3(s,s,d,op), PASTEMAC3(s,s,z,op) }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), PASTEMAC3(s,c,d,op), PASTEMAC3(s,c,z,op) }, \
    { PASTEMAC3(s,d,s,op), PASTEMAC3(s,d,c,op), PASTEMAC3(s,d,d,op), PASTEMAC3(s,d,z,op) }, \
    { PASTEMAC3(s,z,s,op), PASTEMAC3(s,z,c,op), PASTEMAC3(s,z,d,op), PASTEMAC3(s,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), PASTEMAC3(c,s,d,op), PASTEMAC3(c,s,z,op) }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), PASTEMAC3(c,c,d,op), PASTEMAC3(c,c,z,op) }, \
    { PASTEMAC3(c,d,s,op), PASTEMAC3(c,d,c,op), PASTEMAC3(c,d,d,op), PASTEMAC3(c,d,z,op) }, \
    { PASTEMAC3(c,z,s,op), PASTEMAC3(c,z,c,op), PASTEMAC3(c,z,d,op), PASTEMAC3(c,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(d,s,s,op), PASTEMAC3(d,s,c,op), PASTEMAC3(d,s,d,op), PASTEMAC3(d,s,z,op) }, \
    { PASTEMAC3(d,c,s,op), PASTEMAC3(d,c,c,op), PASTEMAC3(d,c,d,op), PASTEMAC3(d,c,z,op) }, \
    { PASTEMAC3(d,d,s,op), PASTEMAC3(d,d,c,op), PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { PASTEMAC3(d,z,s,op), PASTEMAC3(d,z,c,op), PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { PASTEMAC3(z,s,s,op), PASTEMAC3(z,s,c,op), PASTEMAC3(z,s,d,op), PASTEMAC3(z,s,z,op) }, \
    { PASTEMAC3(z,c,s,op), PASTEMAC3(z,c,c,op), PASTEMAC3(z,c,d,op), PASTEMAC3(z,c,z,op) }, \
    { PASTEMAC3(z,d,s,op), PASTEMAC3(z,d,c,op), PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { PASTEMAC3(z,z,s,op), PASTEMAC3(z,z,c,op), PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

#define GENARRAY3_EXT(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), PASTEMAC3(s,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(s,c,s,op), PASTEMAC3(s,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { PASTEMAC3(c,s,s,op), PASTEMAC3(c,s,c,op), NULL,                NULL,               }, \
    { PASTEMAC3(c,c,s,op), PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), PASTEMAC3(d,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(d,z,d,op), PASTEMAC3(d,z,z,op) }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(z,d,d,op), PASTEMAC3(z,d,z,op) }, \
    { NULL,                NULL,                PASTEMAC3(z,z,d,op), PASTEMAC3(z,z,z,op) }  \
    } \
}

#define GENARRAY3_MIN(arrayname,op) \
\
arrayname[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES] = \
{ \
    { \
    { PASTEMAC3(s,s,s,op), NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                PASTEMAC3(c,c,c,op), NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                PASTEMAC3(d,d,d,op), NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }  \
    }, \
    { \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                NULL,               }, \
    { NULL,                NULL,                NULL,                PASTEMAC3(z,z,z,op) }  \
    } \
}


#endif
// end bli_genarray_macro_defs.h
// begin bli_gentdef_macro_defs.h


#ifndef BLIS_GENTDEF_MACRO_DEFS_H
#define BLIS_GENTDEF_MACRO_DEFS_H

//
// -- MACROS TO INSERT TYPEDEF-GENERATING MACROS -------------------------------
//

// These macros invoke GENTDEF / GENTDEFR once per datatype. GENTDEF and
// GENTDEFR are not defined in this section; the code that uses an INSERT_*
// macro must define them beforehand.

// -- function typedef macro (both typed and void) --

// Nine invocations: four typed (_ft), four void-typed (_vft), and one final
// void-typed invocation with an empty type character.
#define INSERT_GENTDEF( opname ) \
\
GENTDEF( float,    s, opname, _ft ) \
GENTDEF( double,   d, opname, _ft ) \
GENTDEF( scomplex, c, opname, _ft ) \
GENTDEF( dcomplex, z, opname, _ft ) \
\
GENTDEF( void,     s, opname, _vft ) \
GENTDEF( void,     d, opname, _vft ) \
GENTDEF( void,     c, opname, _vft ) \
GENTDEF( void,     z, opname, _vft ) \
\
GENTDEF( void,      , opname, _vft )

// -- function typedef macro (both typed and void) with real projection --

// As INSERT_GENTDEF, but every invocation also carries the type and character
// of the real projection (float/s for s and c, double/d for d and z).
#define INSERT_GENTDEFR( opname ) \
\
GENTDEFR( float,    float,    s, s, opname, _ft ) \
GENTDEFR( double,   double,   d, d, opname, _ft ) \
GENTDEFR( scomplex, float,    c, s, opname, _ft ) \
GENTDEFR( dcomplex, double,   z, d, opname, _ft ) \
\
GENTDEFR( void,     void,     s, s, opname, _vft ) \
GENTDEFR( void,     void,     d, d, opname, _vft ) \
GENTDEFR( void,     void,     c, s, opname, _vft ) \
GENTDEFR( void,     void,     z, d, opname, _vft ) \
\
GENTDEFR( void,     void,      ,  , opname, _vft )


#endif
// end bli_gentdef_macro_defs.h
// begin bli_gentfunc_macro_defs.h



#ifndef BLIS_GENTFUNC_MACRO_DEFS_H
#define BLIS_GENTFUNC_MACRO_DEFS_H

//
// -- MACROS TO INSERT FUNCTION-GENERATING MACROS ------------------------------
//



// -- Macros for generating BLAS routines --------------------------------------


// -- Basic one-operand macro --
#define INSERT_GENTFUNC_BLAS( blasname, blisname ) \ \ GENTFUNC( float, s, blasname, blisname ) \ GENTFUNC( double, d, blasname, blisname ) \ GENTFUNC( scomplex, c, blasname, blisname ) \ GENTFUNC( dcomplex, z, blasname, blisname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTFUNCRO_BLAS( blasname, blisname ) \ \ GENTFUNCRO( float, s, blasname, blisname ) \ GENTFUNCRO( double, d, blasname, blisname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTFUNCCO_BLAS( blasname, blisname ) \ \ GENTFUNCCO( scomplex, float, c, s, blasname, blisname ) \ GENTFUNCCO( dcomplex, double, z, d, blasname, blisname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \ \ GENTFUNCDOT( float, s, , BLIS_NO_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( double, d, , BLIS_NO_CONJUGATE, blasname, blisname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) \ \ GENTFUNCDOT( scomplex, c, c, BLIS_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( scomplex, c, u, BLIS_NO_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( dcomplex, z, c, BLIS_CONJUGATE, blasname, blisname ) \ GENTFUNCDOT( dcomplex, z, u, BLIS_NO_CONJUGATE, blasname, blisname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTFUNCDOT_BLAS( blasname, blisname ) \ \ INSERT_GENTFUNCDOTR_BLAS( blasname, blisname ) \ INSERT_GENTFUNCDOTC_BLAS( blasname, blisname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTFUNCR_BLAS( rblasname, cblasname, blisname ) \ \ GENTFUNCR( float, float, s, s, rblasname, blisname ) \ GENTFUNCR( double, double, d, d, rblasname, blisname ) \ GENTFUNCR( scomplex, float, c, s, cblasname, blisname ) \ GENTFUNCR( dcomplex, double, z, d, cblasname, blisname ) // -- Alternate 
// two-operand macro (one char for complex, one for real proj) --
//
// INSERT_GENTFUNCR2_BLAS( blasname, blisname ) expands to four GENTFUNCR2
// invocations. The real entries (float/float/s and double/double/d) pass an
// EMPTY fourth argument; the complex entries pass scomplex/float/c/s and
// dcomplex/double/z/d. blasname and blisname are forwarded unchanged.
// None of the GENTFUNC* macros used below is defined in this section;
// presumably the invoking code defines the one it needs beforehand (TODO
// confirm against the call sites).

#define INSERT_GENTFUNCR2_BLAS( blasname, blisname ) \
\
GENTFUNCR2( float, float, s, , blasname, blisname ) \
GENTFUNCR2( double, double, d, , blasname, blisname ) \
GENTFUNCR2( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )


// -- Extended two-operand macro (used only for scal) --
//
// Six GENTFUNCSCAL invocations: four where both types are identical (fourth
// argument EMPTY), followed by two complex entries whose second type is the
// matching real type (scomplex/float/c/s and dcomplex/double/z/d).

#define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
\
GENTFUNCSCAL( float, float, s, , blasname, blisname ) \
GENTFUNCSCAL( double, double, d, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, scomplex, c, , blasname, blisname ) \
GENTFUNCSCAL( dcomplex, dcomplex, z, , blasname, blisname ) \
GENTFUNCSCAL( scomplex, float, c, s, blasname, blisname ) \
GENTFUNCSCAL( dcomplex, double, z, d, blasname, blisname )




// -- Macros for functions with one operand ------------------------------------


// -- Basic one-operand macro --
//
// The INSERT_GENTFUNC_BASIC* family emits one GENTFUNC invocation for each of
// float/s, double/d, scomplex/c and dcomplex/z. The variants differ only in
// how many auxiliary arguments (zero to four) are forwarded after tfuncname.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC0( tfuncname ) \
\
GENTFUNC( float, s, tfuncname ) \
GENTFUNC( double, d, tfuncname ) \
GENTFUNC( scomplex, c, tfuncname ) \
GENTFUNC( dcomplex, z, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNC_BASIC( tfuncname, varname ) \
\
GENTFUNC( float, s, tfuncname, varname ) \
GENTFUNC( double, d, tfuncname, varname ) \
GENTFUNC( scomplex, c, tfuncname, varname ) \
GENTFUNC( dcomplex, z, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --

#define INSERT_GENTFUNC_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTFUNC( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTFUNC( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )



// -- Basic one-operand with real projection --
//
// The INSERT_GENTFUNCR_BASIC* family emits four GENTFUNCR invocations, each
// with two types and two characters: float/float/s/s, double/double/d/d,
// scomplex/float/c/s and dcomplex/double/z/d. The variants again differ only
// in the number of auxiliary arguments forwarded after tfuncname.

// -- (no auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC0( tfuncname ) \
\
GENTFUNCR( float, float, s, s, tfuncname ) \
GENTFUNCR( double, double, d, d, tfuncname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --

#define INSERT_GENTFUNCR_BASIC( tfuncname, varname ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname ) \
GENTFUNCR( double, double, d, d, tfuncname, varname ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --

#define INSERT_GENTFUNCR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \
GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary
arguments) -- #define INSERT_GENTFUNCR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ \ GENTFUNCR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ GENTFUNCR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) // -- Basic one-operand macro with real domain only -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCRO_BASIC0( tfuncname ) \ \ GENTFUNCRO( float, s, tfuncname ) \ GENTFUNCRO( double, d, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNCRO_BASIC( tfuncname, varname ) \ \ GENTFUNCRO( float, s, tfuncname, varname ) \ GENTFUNCRO( double, d, tfuncname, varname ) \ // -- Basic one-operand macro with complex domain only and real projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC0( tfuncname ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNCCO_BASIC( tfuncname, varname ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTFUNCCO_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) // -- Basic one-operand macro with integer instance -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC_BASIC0_I( tfuncname ) \ \ GENTFUNC( float, s, tfuncname ) \ 
GENTFUNC( double, d, tfuncname ) \ GENTFUNC( scomplex, c, tfuncname ) \ GENTFUNC( dcomplex, z, tfuncname ) \ GENTFUNC( gint_t, i, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC_BASIC_I( tfuncname, varname ) \ \ GENTFUNC( float, s, tfuncname, varname ) \ GENTFUNC( double, d, tfuncname, varname ) \ GENTFUNC( scomplex, c, tfuncname, varname ) \ GENTFUNC( dcomplex, z, tfuncname, varname ) \ GENTFUNC( gint_t, i, tfuncname, varname ) // -- Basic one-operand with integer projection -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCI_BASIC0( tfuncname ) \ \ GENTFUNCI( float, gint_t, s, i, tfuncname ) \ GENTFUNCI( double, gint_t, d, i, tfuncname ) \ GENTFUNCI( scomplex, gint_t, c, i, tfuncname ) \ GENTFUNCI( dcomplex, gint_t, z, i, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNCI_BASIC( tfuncname, varname ) \ \ GENTFUNCI( float, gint_t, s, i, tfuncname, varname ) \ GENTFUNCI( double, gint_t, d, i, tfuncname, varname ) \ GENTFUNCI( scomplex, gint_t, c, i, tfuncname, varname ) \ GENTFUNCI( dcomplex, gint_t, z, i, tfuncname, varname ) // -- Basic one-operand with real and integer projections -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNCRI_BASIC0( tfuncname ) \ \ GENTFUNCRI( float, float, gint_t, s, s, i, tfuncname ) \ GENTFUNCRI( double, double, gint_t, d, d, i, tfuncname ) \ GENTFUNCRI( scomplex, float, gint_t, c, s, i, tfuncname ) \ GENTFUNCRI( dcomplex, double, gint_t, z, d, i, tfuncname ) // -- Macros for functions with two primary operands --------------------------- // -- Basic two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_BASIC0( tfuncname ) \ \ GENTFUNC2( float, float, s, s, tfuncname ) \ GENTFUNC2( double, double, d, d, tfuncname ) \ GENTFUNC2( scomplex, scomplex, c, c, tfuncname ) \ GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_BASIC( tfuncname, varname ) \ \ GENTFUNC2( float, float, s, s, tfuncname, 
varname ) \ GENTFUNC2( double, double, d, d, tfuncname, varname ) \ GENTFUNC2( scomplex, scomplex, c, c, tfuncname, varname ) \ GENTFUNC2( dcomplex, dcomplex, z, z, tfuncname, varname ) // -- Mixed domain two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_MIX_D0( tfuncname ) \ \ GENTFUNC2( float, scomplex, s, c, tfuncname ) \ GENTFUNC2( scomplex, float, c, s, tfuncname ) \ \ GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_D( tfuncname, varname ) \ \ GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ \ GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) // -- Mixed precision two-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2_MIX_P0( tfuncname ) \ \ GENTFUNC2( float, double, s, d, tfuncname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ \ GENTFUNC2( double, float, d, s, tfuncname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname ) \ \ GENTFUNC2( scomplex, double, c, d, tfuncname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_P( tfuncname, varname ) \ \ GENTFUNC2( float, double, s, d, tfuncname, varname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTFUNC2( double, float, d, s, tfuncname, varname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ \ GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) \ // -- Mixed domain/precision (all) two-operand macro -- // -- (no auxiliary arguments) -- #define 
INSERT_GENTFUNC2_MIXDP0( tfuncname ) \ \ GENTFUNC2( float, double, s, d, tfuncname ) \ GENTFUNC2( float, scomplex, s, c, tfuncname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname ) \ \ GENTFUNC2( double, float, d, s, tfuncname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname ) \ GENTFUNC2( double, dcomplex, d, z, tfuncname ) \ \ GENTFUNC2( scomplex, float, c, s, tfuncname ) \ GENTFUNC2( scomplex, double, c, d, tfuncname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2_MIX_DP( tfuncname, varname ) \ \ GENTFUNC2( float, double, s, d, tfuncname, varname ) \ GENTFUNC2( float, scomplex, s, c, tfuncname, varname ) \ GENTFUNC2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTFUNC2( double, float, d, s, tfuncname, varname ) \ GENTFUNC2( double, scomplex, d, c, tfuncname, varname ) \ GENTFUNC2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTFUNC2( scomplex, float, c, s, tfuncname, varname ) \ GENTFUNC2( scomplex, double, c, d, tfuncname, varname ) \ GENTFUNC2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTFUNC2( dcomplex, float, z, s, tfuncname, varname ) \ GENTFUNC2( dcomplex, double, z, d, tfuncname, varname ) \ GENTFUNC2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_BASIC0( tfuncname ) \ \ GENTFUNC2R( float, float, float, s, s, s, tfuncname ) \ GENTFUNC2R( double, double, double, d, d, d, tfuncname ) \ GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname ) \ GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_BASIC( tfuncname, varname ) \ \ GENTFUNC2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTFUNC2R( double, double, double, d, d, 
d, tfuncname, varname ) \ GENTFUNC2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIX_D0( tfuncname ) \ \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_D( tfuncname, varname ) \ \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) // -- Mixed precision two-operand with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIX_P0( tfuncname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_P( tfuncname, varname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ \ GENTFUNC2R( scomplex, double, double, c, d, d, 
tfuncname, varname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) // -- Mixed domain/precision (all) two-operand macro with real projection of second operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC2R_MIXDP0( tfuncname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname ) \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname ) \ \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname ) \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname ) \ GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname ) \ // -- (one auxiliary argument) -- #define INSERT_GENTFUNC2R_MIX_DP( tfuncname, varname ) \ \ GENTFUNC2R( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC2R( float, dcomplex, double, s, z, d, tfuncname, varname ) \ \ GENTFUNC2R( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC2R( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ \ GENTFUNC2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ GENTFUNC2R( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC2R( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ \ GENTFUNC2R( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC2R( dcomplex, double, double, z, d, d, tfuncname, varname ) \ 
GENTFUNC2R( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ // -- Macros for functions with three primary operands ------------------------- // -- Basic three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC0( tfuncname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_BASIC( tfuncname, varname ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, float, s, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, double, d, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, scomplex, c, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, dcomplex, z, z, z, tfuncname, varname1, varname2 ) // -- Mixed domain three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D0( tfuncname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname ) \ \ GENTFUNC3( dcomplex, double, 
double, z, d, d, tfuncname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_D( tfuncname, varname ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, dcomplex, z, d, z, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_D2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, scomplex, s, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, float, s, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, scomplex, s, c, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, dcomplex, d, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, double, d, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, dcomplex, d, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, float, float, c, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, scomplex, c, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, float, c, c, s, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( dcomplex, double, double, z, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( 
dcomplex, double, dcomplex, z, d, z, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, dcomplex, double, z, z, d, tfuncname, varname1, varname2 ) // -- Mixed precision three-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P0( tfuncname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname ) \ GENTFUNC3( scomplex, double, scomplex, c, 
d, c, tfuncname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTFUNC3_MIX_P( tfuncname, varname ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname ) \ GENTFUNC3( float, 
dcomplex, scomplex, s, z, c, tfuncname, varname ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname ) \ GENTFUNC3( double, float, dcomplex, d, s, z, tfuncname, varname ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, 
tfuncname, varname ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname ) \ GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname ) \ \ GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname ) \ GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTFUNC3_MIX_P2( tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, float, double, s, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, float, dcomplex, s, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, double, float, s, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, double, s, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, scomplex, s, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, double, dcomplex, s, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, scomplex, double, s, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, scomplex, dcomplex, s, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( float, dcomplex, float, s, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, double, s, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, scomplex, s, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( float, dcomplex, dcomplex, s, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( double, float, float, d, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, double, d, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, scomplex, d, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, float, dcomplex, d, s, 
z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, double, float, d, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, double, scomplex, d, d, c, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, scomplex, float, d, c, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, double, d, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, scomplex, d, c, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, scomplex, dcomplex, d, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( double, dcomplex, float, d, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( double, dcomplex, scomplex, d, z, c, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( scomplex, float, double, c, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, float, dcomplex, c, s, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, double, float, c, d, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, double, c, d, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, scomplex, c, d, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, double, dcomplex, c, d, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, scomplex, double, c, c, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, scomplex, dcomplex, c, c, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3( scomplex, dcomplex, float, c, z, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, double, c, z, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, scomplex, c, z, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( scomplex, dcomplex, dcomplex, c, z, z, tfuncname, varname1, varname2 ) \ \ \ GENTFUNC3( dcomplex, float, float, z, s, s, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, double, z, s, d, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, scomplex, z, s, c, tfuncname, varname1, varname2 ) \ GENTFUNC3( dcomplex, float, dcomplex, z, s, z, tfuncname, varname1, varname2 ) \ \ 
GENTFUNC3( dcomplex, double, float, z, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, double, scomplex, z, d, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, scomplex, float, z, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, double, z, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, scomplex, z, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, scomplex, dcomplex, z, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3( dcomplex, dcomplex, float, z, z, s, tfuncname, varname1, varname2 ) \
GENTFUNC3( dcomplex, dcomplex, scomplex, z, z, c, tfuncname, varname1, varname2 )


// -- Basic three-operand with union of operands 1 and 2 --
//
// Every INSERT_GENTFUNC3U12_* macro expands to one GENTFUNC3U12() invocation
// per row. A row lists four type names, then the four matching one-letter
// codes (s, d, c, z), then the caller's arguments, forwarded unchanged.
// In every row the fourth type/code combines the first two: it is complex if
// either of them is complex and double precision if either of them is double
// precision. In the basic set all four entries are the same type.
// NOTE(review): GENTFUNC3U12 is not defined in this part of the file;
// presumably the code using these macros defines it first -- confirm.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC3U12_BASIC0( tfuncname ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC3U12_BASIC( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNC3U12_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, float, float, float, s, s, s, s, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, double, double, double, d, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, tfuncname, varname1, varname2 )


// -- Mixed domain three-operand with union of operands 1 and 2 --
//
// Twelve rows per macro: within one precision (s/c or d/z), every combination
// of the three operands that mixes the real and the complex type.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC3U12_MIX_D0( tfuncname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTFUNC3U12_MIX_D( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTFUNC3U12_MIX_D2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, float, scomplex, float, s, s, c, s, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, scomplex, float, scomplex, s, c, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, scomplex, scomplex, scomplex, s, c, c, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( double, double, dcomplex, double, d, d, z, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, dcomplex, double, dcomplex, d, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( scomplex, float, float, scomplex, c, s, s, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, float, scomplex, scomplex, c, s, c, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, scomplex, float, scomplex, c, c, s, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( dcomplex, double, double, dcomplex, z, d, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, tfuncname, varname1, varname2 )


// -- Mixed precision three-operand with union of operands 1 and 2 --
//
// Rows are grouped by the type of the first operand, then by the type of the
// second operand.

// -- (no auxiliary arguments) --
#define INSERT_GENTFUNC3U12_MIX_P0( tfuncname ) \
\
GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname ) \
GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname ) \
\
GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname ) \
GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname ) \
GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname ) \
GENTFUNC3U12( float, \
double, dcomplex, double, s, d, z, d, tfuncname ) \
\
GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname ) \
GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname ) \
\
GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname ) \
GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname ) \
GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname ) \
GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname ) \
\
\
GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname ) \
GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname ) \
GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname ) \
GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname ) \
\
GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname ) \
GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname ) \
\
GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname ) \
GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname ) \
GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname ) \
GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname ) \
\
GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname ) \
GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname ) \
\
\
GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname ) \
GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname ) \
\
GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname ) \
GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname ) \
GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname ) \
GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname ) \
\
GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname ) \
GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname ) \
\
GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname ) \
GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname ) \
GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname ) \
GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname ) \
\
\
GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname ) \
GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname ) \
GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname ) \
GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname ) \
\
GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname ) \
GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname ) \
\
GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname ) \
GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname ) \
GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname ) \
GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname ) \
\
GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname ) \
GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname )

// -- (one auxiliary argument) --
//
// Expands to one GENTFUNC3U12() invocation per row, passing tfuncname and
// varname through unchanged. Each row names three operand types plus a fourth
// type that combines the first two (complex if either is complex, double
// precision if either is double precision), followed by the matching
// one-letter codes. Rows are grouped by first operand, then by second operand.
// NOTE(review): GENTFUNC3U12 is not defined in this part of the file;
// presumably the code using this macro defines it first -- confirm.
#define INSERT_GENTFUNC3U12_MIX_P( tfuncname, varname ) \
\
GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname ) \
GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname ) \
\
GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname ) \
GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname ) \
GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname ) \
GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname ) \
\
GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname ) \
GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname ) \
\
GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname ) \
GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname ) \
GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname ) \
\
\
GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname ) \
GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname ) \
GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname ) \
GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname ) \
\
GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname ) \
GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname ) \
\
GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname ) \
GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname ) \
GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname ) \
GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname ) \
GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname ) \
\
\
GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname ) \
GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, \
tfuncname, varname ) \
GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname ) \
\
GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname ) \
GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname ) \
\
\
GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, tfuncname, varname ) \
\
GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname ) \
GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname )

// -- (two auxiliary arguments) --
//
// Expands to one GENTFUNC3U12() invocation per row, passing tfuncname,
// varname1 and varname2 through unchanged. Each row names three operand types
// plus a fourth type that combines the first two (complex if either is
// complex, double precision if either is double precision), followed by the
// matching one-letter codes. Rows are grouped by first operand, then by
// second operand.
// NOTE(review): GENTFUNC3U12 is not defined in this part of the file;
// presumably the code using this macro defines it first -- confirm.
#define INSERT_GENTFUNC3U12_MIX_P2( tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, float, double, float, s, s, d, s, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, float, dcomplex, float, s, s, z, s, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, double, float, double, s, d, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, double, double, double, s, d, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, double, scomplex, double, s, d, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, double, dcomplex, double, s, d, z, d, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, scomplex, double, scomplex, s, c, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( float, dcomplex, float, dcomplex, s, z, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, dcomplex, double, dcomplex, s, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3U12( double, float, float, double, d, s, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, float, double, double, d, s, d, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, float, scomplex, double, d, s, c, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, float, dcomplex, double, d, s, z, d, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( double, double, float, double, d, d, s, d, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, double, scomplex, double, d, d, c, d, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( double, scomplex, float, dcomplex, d, c, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, scomplex, double, dcomplex, d, c, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( double, dcomplex, float, dcomplex, d, z, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3U12( scomplex, float, double, scomplex, c, s, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( scomplex, double, float, dcomplex, c, d, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, double, double, dcomplex, c, d, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( scomplex, scomplex, double, scomplex, c, c, d, c, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, tfuncname, varname1, varname2 ) \
\
\
GENTFUNC3U12( dcomplex, float, float, dcomplex, z, s, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, float, double, dcomplex, z, s, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( dcomplex, double, float, dcomplex, z, d, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, tfuncname, varname1, varname2 ) \
\
GENTFUNC3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, tfuncname, varname1, varname2 ) \
GENTFUNC3U12( dcomplex, scomplex, dcomplex, \
dcomplex, z, c, z, z, tfuncname, varname1, varname2 ) \ \ GENTFUNC3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, tfuncname, varname1, varname2 ) \ GENTFUNC3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, tfuncname, varname1, varname2 ) #endif // end bli_gentfunc_macro_defs.h // begin bli_gentprot_macro_defs.h #ifndef BLIS_GENTPROT_MACRO_DEFS_H #define BLIS_GENTPROT_MACRO_DEFS_H // // -- MACROS TO INSERT PROTOTYPE-GENERATING MACROS ----------------------------- // // -- Macros for generating BLAS routines -------------------------------------- // -- Basic one-operand macro -- #define INSERT_GENTPROT_BLAS( blasname ) \ \ GENTPROT( float, s, blasname ) \ GENTPROT( double, d, blasname ) \ GENTPROT( scomplex, c, blasname ) \ GENTPROT( dcomplex, z, blasname ) // -- Basic one-operand macro with real domain only -- #define INSERT_GENTPROTRO_BLAS( blasname ) \ \ GENTPROTRO( float, s, blasname ) \ GENTPROTRO( double, d, blasname ) // -- Basic one-operand macro with complex domain only and real projection -- #define INSERT_GENTPROTCO_BLAS( blasname ) \ \ GENTPROTCO( scomplex, float, c, s, blasname ) \ GENTPROTCO( dcomplex, double, z, d, blasname ) // -- Basic one-operand macro with conjugation (real funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTR_BLAS( blasname ) \ \ GENTPROTDOT( float, s, , blasname ) \ GENTPROTDOT( double, d, , blasname ) // -- Basic one-operand macro with conjugation (complex funcs only, used only for dot, ger) -- #define INSERT_GENTPROTDOTC_BLAS( blasname ) \ \ GENTPROTDOT( scomplex, c, c, blasname ) \ GENTPROTDOT( scomplex, c, u, blasname ) \ GENTPROTDOT( dcomplex, z, c, blasname ) \ GENTPROTDOT( dcomplex, z, u, blasname ) // -- Basic one-operand macro with conjugation (used only for dot, ger) -- #define INSERT_GENTPROTDOT_BLAS( blasname ) \ \ INSERT_GENTPROTDOTR_BLAS( blasname ) \ INSERT_GENTPROTDOTC_BLAS( blasname ) // -- Basic one-operand macro with real projection -- #define INSERT_GENTPROTR_BLAS( rblasname, 
cblasname ) \ \ GENTPROTR( float, float, s, s, rblasname ) \ GENTPROTR( double, double, d, d, rblasname ) \ GENTPROTR( scomplex, float, c, s, cblasname ) \ GENTPROTR( dcomplex, double, z, d, cblasname ) // -- Alternate two-operand macro (one char for complex, one for real proj) -- #define INSERT_GENTPROTR2_BLAS( blasname ) \ \ GENTPROTR2( float, float, , s, blasname ) \ GENTPROTR2( double, double, , d, blasname ) \ GENTPROTR2( scomplex, float, c, s, blasname ) \ GENTPROTR2( dcomplex, double, z, d, blasname ) // -- Extended two-operand macro (used only for scal) -- #define INSERT_GENTPROTSCAL_BLAS( blasname ) \ \ GENTPROTSCAL( float, float, , s, blasname ) \ GENTPROTSCAL( double, double, , d, blasname ) \ GENTPROTSCAL( scomplex, scomplex, , c, blasname ) \ GENTPROTSCAL( dcomplex, dcomplex, , z, blasname ) \ GENTPROTSCAL( float, scomplex, s, c, blasname ) \ GENTPROTSCAL( double, dcomplex, d, z, blasname ) // -- Macros for functions with one operand ------------------------------------ // -- Basic one-operand macro -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT_BASIC0( tfuncname ) \ \ GENTPROT( float, s, tfuncname ) \ GENTPROT( double, d, tfuncname ) \ GENTPROT( scomplex, c, tfuncname ) \ GENTPROT( dcomplex, z, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT_BASIC( tfuncname, varname ) \ \ GENTPROT( float, s, tfuncname, varname ) \ GENTPROT( double, d, tfuncname, varname ) \ GENTPROT( scomplex, c, tfuncname, varname ) \ GENTPROT( dcomplex, z, tfuncname, varname ) // -- (two auxiliary arguments) -- #define INSERT_GENTPROT_BASIC2( tfuncname, varname1, varname2 ) \ \ GENTPROT( float, s, tfuncname, varname1, varname2 ) \ GENTPROT( double, d, tfuncname, varname1, varname2 ) \ GENTPROT( scomplex, c, tfuncname, varname1, varname2 ) \ GENTPROT( dcomplex, z, tfuncname, varname1, varname2 ) // -- (three auxiliary arguments) -- #define INSERT_GENTPROT_BASIC3( tfuncname, varname1, varname2, varname3 ) \ \ GENTPROT( float, s, tfuncname, 
varname1, varname2, varname3 ) \
GENTPROT( double, d, tfuncname, varname1, varname2, varname3 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --
// One GENTPROT() row for each of float, double, scomplex and dcomplex; all
// five macro arguments are forwarded unchanged.
// NOTE(review): GENTPROT is not defined in this part of the file; presumably
// the code using this macro defines it first -- confirm.
#define INSERT_GENTPROT_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTPROT( float, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( double, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( scomplex, c, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROT( dcomplex, z, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand with real projection --
//
// Each row gives a type and its real projection: float and double project to
// themselves, scomplex to float, dcomplex to double. The two type names are
// followed by the two matching one-letter codes. The numeric suffix of the
// macro name is the number of auxiliary arguments forwarded after tfuncname
// (no suffix means one).

// -- (no auxiliary arguments) --
#define INSERT_GENTPROTR_BASIC0( tfuncname ) \
\
GENTPROTR( float, float, s, s, tfuncname ) \
GENTPROTR( double, double, d, d, tfuncname ) \
GENTPROTR( scomplex, float, c, s, tfuncname ) \
GENTPROTR( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTPROTR_BASIC( tfuncname, varname ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname ) \
GENTPROTR( double, double, d, d, tfuncname, varname ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTPROTR_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2 )

// -- (three auxiliary arguments) --
#define INSERT_GENTPROTR_BASIC3( tfuncname, varname1, varname2, varname3 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 \
) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 )

// -- (four auxiliary arguments) --
// Each row gives a type and its real projection (scomplex -> float,
// dcomplex -> double; real types project to themselves).
#define INSERT_GENTPROTR_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \
\
GENTPROTR( float, float, s, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROTR( double, double, d, d, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROTR( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \
GENTPROTR( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 )


// -- Basic one-operand macro with complex domain only and real projection --
// Only the two complex types are instantiated, each paired with the real type
// of the same precision.

// -- (no auxiliary arguments) --
#define INSERT_GENTPROTCO_BASIC0( tfuncname ) \
\
GENTPROTCO( scomplex, float, c, s, tfuncname ) \
GENTPROTCO( dcomplex, double, z, d, tfuncname )

// -- (one auxiliary argument) --
#define INSERT_GENTPROTCO_BASIC( tfuncname, varname ) \
\
GENTPROTCO( scomplex, float, c, s, tfuncname, varname ) \
GENTPROTCO( dcomplex, double, z, d, tfuncname, varname )

// -- (two auxiliary arguments) --
#define INSERT_GENTPROTCO_BASIC2( tfuncname, varname1, varname2 ) \
\
GENTPROTCO( scomplex, float, c, s, tfuncname, varname1, varname2 ) \
GENTPROTCO( dcomplex, double, z, d, tfuncname, varname1, varname2 )


// -- Basic one-operand macro with integer instance --
// The four floating-point rows plus a fifth row for gint_t (code i).

// -- (no auxiliary arguments) --
#define INSERT_GENTPROT_BASIC0_I( funcname ) \
\
GENTPROT( float, s, funcname ) \
GENTPROT( double, d, funcname ) \
GENTPROT( scomplex, c, funcname ) \
GENTPROT( dcomplex, z, funcname ) \
GENTPROT( gint_t, i, funcname )

// -- (one auxiliary argument) --
#define INSERT_GENTPROT_BASIC_I( tfuncname, varname ) \
\
GENTPROT( float, s, tfuncname, varname ) \
GENTPROT( double, d, tfuncname, varname ) \
GENTPROT( scomplex, c, tfuncname, varname ) \
GENTPROT( dcomplex, z, tfuncname, varname ) \
GENTPROT( gint_t, i, tfuncname, varname )


// -- Basic one-operand with integer projection --
// Every floating-point type is paired with gint_t (code i).

// -- (no auxiliary arguments) --
#define INSERT_GENTPROTI_BASIC0( funcname ) \
\
GENTPROTI( float, gint_t, s, i, funcname ) \
GENTPROTI( double, gint_t, d, i, funcname ) \
GENTPROTI( scomplex, gint_t, c, i, funcname ) \
GENTPROTI( dcomplex, gint_t, z, i, funcname )

// -- (one auxiliary argument) --
#define INSERT_GENTPROTI_BASIC( tfuncname, varname ) \
\
GENTPROTI( float, gint_t, s, i, tfuncname, varname ) \
GENTPROTI( double, gint_t, d, i, tfuncname, varname ) \
GENTPROTI( scomplex, gint_t, c, i, tfuncname, varname ) \
GENTPROTI( dcomplex, gint_t, z, i, tfuncname, varname )


// -- Basic one-operand with real and integer projections --
// Each row gives the type, its real projection and gint_t.
// NOTE: this macro forwards no auxiliary argument although its name carries
// no "0" suffix; the name is kept as is for existing users.

// -- (no auxiliary arguments) --
#define INSERT_GENTPROTRI_BASIC( funcname ) \
\
GENTPROTRI( float, float, gint_t, s, s, i, funcname ) \
GENTPROTRI( double, double, gint_t, d, d, i, funcname ) \
GENTPROTRI( scomplex, float, gint_t, c, s, i, funcname ) \
GENTPROTRI( dcomplex, double, gint_t, z, d, i, funcname )




// -- Macros for functions with two primary operands ---------------------------


// -- Basic two-operand macro --
// Both operands have the same type in every row.

// -- (no auxiliary arguments) --
#define INSERT_GENTPROT2_BASIC0( funcname ) \
\
GENTPROT2( float, float, s, s, funcname ) \
GENTPROT2( double, double, d, d, funcname ) \
GENTPROT2( scomplex, scomplex, c, c, funcname ) \
GENTPROT2( dcomplex, dcomplex, z, z, funcname )

// -- (one auxiliary argument) --
#define INSERT_GENTPROT2_BASIC( tfuncname, varname ) \
\
GENTPROT2( float, float, s, s, tfuncname, varname ) \
GENTPROT2( double, double, d, d, tfuncname, varname ) \
GENTPROT2( scomplex, scomplex, c, c, tfuncname, varname ) \
GENTPROT2( dcomplex, dcomplex, z, z, tfuncname, varname )


// -- Mixed domain two-operand macro --
// Same precision, different domain: s with c and d with z, in both orders.

// -- (no auxiliary arguments) --
#define INSERT_GENTPROT2_MIX_D0( funcname ) \
\
GENTPROT2( float, scomplex, s, c, funcname ) \
GENTPROT2( scomplex, float, c, s, funcname ) \
\
GENTPROT2( double, dcomplex, d, z, funcname ) \
GENTPROT2( dcomplex, double, z, d, funcname )

// -- (one auxiliary argument) --
#define INSERT_GENTPROT2_MIX_D( tfuncname, varname ) \
\
GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \
GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \
\
GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \
GENTPROT2( dcomplex, double, z, d, tfuncname, varname )


// -- Mixed precision two-operand macro --
// The eight ordered pairs whose operands differ in precision.
// The last row of each definition must not end in a line continuation:
// a trailing backslash there would make the definition absorb whatever
// source line happens to follow it.

// -- (no auxiliary arguments) --
#define INSERT_GENTPROT2_MIX_P0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
\
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, scomplex, z, c, funcname )

// -- (one auxiliary argument) --
#define INSERT_GENTPROT2_MIX_P( tfuncname, varname ) \
\
GENTPROT2( float, double, s, d, tfuncname, varname ) \
GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \
\
GENTPROT2( double, float, d, s, tfuncname, varname ) \
GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \
\
GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \
GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \
\
GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \
GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname )


// -- Mixed domain/precision (all) two-operand macro --
// All twelve ordered pairs of two different types.
// NOTE: the no-argument variant is spelled MIXDP0 (no underscore before DP),
// unlike the one-argument variant INSERT_GENTPROT2_MIX_DP; the spelling is
// kept as is for existing users.

// -- (no auxiliary arguments) --
#define INSERT_GENTPROT2_MIXDP0( funcname ) \
\
GENTPROT2( float, double, s, d, funcname ) \
GENTPROT2( float, scomplex, s, c, funcname ) \
GENTPROT2( float, dcomplex, s, z, funcname ) \
\
GENTPROT2( double, float, d, s, funcname ) \
GENTPROT2( double, scomplex, d, c, funcname ) \
GENTPROT2( double, dcomplex, d, z, funcname ) \
\
GENTPROT2( scomplex, float, c, s, funcname ) \
GENTPROT2( scomplex, double, c, d, funcname ) \
GENTPROT2( scomplex, dcomplex, c, z, funcname ) \
\
GENTPROT2( dcomplex, float, z, s, funcname ) \
GENTPROT2( dcomplex, double, z, d, funcname ) \
GENTPROT2( \
dcomplex, scomplex, z, c, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2_MIX_DP( tfuncname, varname ) \ \ GENTPROT2( float, double, s, d, tfuncname, varname ) \ GENTPROT2( float, scomplex, s, c, tfuncname, varname ) \ GENTPROT2( float, dcomplex, s, z, tfuncname, varname ) \ \ GENTPROT2( double, float, d, s, tfuncname, varname ) \ GENTPROT2( double, scomplex, d, c, tfuncname, varname ) \ GENTPROT2( double, dcomplex, d, z, tfuncname, varname ) \ \ GENTPROT2( scomplex, float, c, s, tfuncname, varname ) \ GENTPROT2( scomplex, double, c, d, tfuncname, varname ) \ GENTPROT2( scomplex, dcomplex, c, z, tfuncname, varname ) \ \ GENTPROT2( dcomplex, float, z, s, tfuncname, varname ) \ GENTPROT2( dcomplex, double, z, d, tfuncname, varname ) \ GENTPROT2( dcomplex, scomplex, z, c, tfuncname, varname ) // -- Basic two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_BASIC0( funcname ) \ \ GENTPROT2R( float, float, float, s, s, s, funcname ) \ GENTPROT2R( double, double, double, d, d, d, funcname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, funcname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, funcname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_BASIC( tfuncname, varname ) \ \ GENTPROT2R( float, float, float, s, s, s, tfuncname, varname ) \ GENTPROT2R( double, double, double, d, d, d, tfuncname, varname ) \ GENTPROT2R( scomplex, scomplex, float, c, c, s, tfuncname, varname ) \ GENTPROT2R( dcomplex, dcomplex, double, z, z, d, tfuncname, varname ) // -- Mixed domain two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_D0( tfuncname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname ) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname ) // -- (one auxiliary argument) 
-- #define INSERT_GENTPROT2R_MIX_D( tfuncname, varname ) \ \ GENTPROT2R( float, scomplex, float, s, c, s, tfuncname, varname ) \ GENTPROT2R( scomplex, float, float, c, s, s, tfuncname, varname ) \ \ GENTPROT2R( double, dcomplex, double, d, z, d, tfuncname, varname ) \ GENTPROT2R( dcomplex, double, double, z, d, d, tfuncname, varname ) // -- Mixed precision two-operand with real projection of first operand -- // -- (no auxiliary arguments) -- #define INSERT_GENTPROT2R_MIX_P0( tfuncname ) \ \ GENTPROT2R( float, double, float, s, d, s, tfuncname ) \ GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname ) \ \ GENTPROT2R( double, float, double, d, s, d, tfuncname ) \ GENTPROT2R( double, scomplex, double, d, c, d, tfuncname ) \ \ GENTPROT2R( scomplex, double, float, c, d, s, tfuncname ) \ GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname ) \ \ GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname ) \ GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname ) // -- (one auxiliary argument) -- #define INSERT_GENTPROT2R_MIX_P( tfuncname, varname ) \ \ GENTPROT2R( float, double, float, s, d, s, tfuncname, varname ) \ GENTPROT2R( float, dcomplex, float, s, z, s, tfuncname, varname ) \ \ GENTPROT2R( double, float, double, d, s, d, tfuncname, varname ) \ GENTPROT2R( double, scomplex, double, d, c, d, tfuncname, varname ) \ \ GENTPROT2R( scomplex, double, float, c, d, s, tfuncname, varname ) \ GENTPROT2R( scomplex, dcomplex, float, c, z, s, tfuncname, varname ) \ \ GENTPROT2R( dcomplex, float, double, z, s, d, tfuncname, varname ) \ GENTPROT2R( dcomplex, scomplex, double, z, c, d, tfuncname, varname ) // -- Macros for functions with three primary operands ------------------------- // -- Basic three-operand macro -- #define INSERT_GENTPROT3_BASIC( funcname ) \ \ GENTPROT3( float, float, float, s, s, s, funcname ) \ GENTPROT3( double, double, double, d, d, d, funcname ) \ GENTPROT3( scomplex, scomplex, scomplex, c, c, c, funcname ) \ GENTPROT3( dcomplex, dcomplex, 
dcomplex, z, z, z, funcname ) // -- Mixed domain three-operand macro -- #define INSERT_GENTPROT3_MIX_D( funcname ) \ \ GENTPROT3( float, float, scomplex, s, s, c, funcname ) \ GENTPROT3( float, scomplex, float, s, c, s, funcname ) \ GENTPROT3( float, scomplex, scomplex, s, c, c, funcname ) \ \ GENTPROT3( double, double, dcomplex, d, d, z, funcname ) \ GENTPROT3( double, dcomplex, double, d, z, d, funcname ) \ GENTPROT3( double, dcomplex, dcomplex, d, z, z, funcname ) \ \ GENTPROT3( scomplex, float, float, c, s, s, funcname ) \ GENTPROT3( scomplex, float, scomplex, c, s, c, funcname ) \ GENTPROT3( scomplex, scomplex, float, c, c, s, funcname ) \ \ GENTPROT3( dcomplex, double, double, z, d, d, funcname ) \ GENTPROT3( dcomplex, double, dcomplex, z, d, z, funcname ) \ GENTPROT3( dcomplex, dcomplex, double, z, z, d, funcname ) // -- Mixed precision three-operand macro -- #define INSERT_GENTPROT3_MIX_P( funcname ) \ \ GENTPROT3( float, float, double, s, s, d, funcname ) \ GENTPROT3( float, float, dcomplex, s, s, z, funcname ) \ \ GENTPROT3( float, double, float, s, d, s, funcname ) \ GENTPROT3( float, double, double, s, d, d, funcname ) \ GENTPROT3( float, double, scomplex, s, d, c, funcname ) \ GENTPROT3( float, double, dcomplex, s, d, z, funcname ) \ \ GENTPROT3( float, scomplex, double, s, c, d, funcname ) \ GENTPROT3( float, scomplex, dcomplex, s, c, z, funcname ) \ \ GENTPROT3( float, dcomplex, float, s, z, s, funcname ) \ GENTPROT3( float, dcomplex, double, s, z, d, funcname ) \ GENTPROT3( float, dcomplex, scomplex, s, z, c, funcname ) \ GENTPROT3( float, dcomplex, dcomplex, s, z, z, funcname ) \ \ \ GENTPROT3( double, float, float, d, s, s, funcname ) \ GENTPROT3( double, float, double, d, s, d, funcname ) \ GENTPROT3( double, float, scomplex, d, s, c, funcname ) \ GENTPROT3( double, float, dcomplex, d, s, z, funcname ) \ \ GENTPROT3( double, double, float, d, d, s, funcname ) \ GENTPROT3( double, double, scomplex, d, d, c, funcname ) \ \ GENTPROT3( double, 
scomplex, float, d, c, s, funcname ) \ GENTPROT3( double, scomplex, double, d, c, d, funcname ) \ GENTPROT3( double, scomplex, scomplex, d, c, c, funcname ) \ GENTPROT3( double, scomplex, dcomplex, d, c, z, funcname ) \ \ GENTPROT3( double, dcomplex, float, d, z, s, funcname ) \ GENTPROT3( double, dcomplex, scomplex, d, z, c, funcname ) \ \ \ GENTPROT3( scomplex, float, double, c, s, d, funcname ) \ GENTPROT3( scomplex, float, dcomplex, c, s, z, funcname ) \ \ GENTPROT3( scomplex, double, float, c, d, s, funcname ) \ GENTPROT3( scomplex, double, double, c, d, d, funcname ) \ GENTPROT3( scomplex, double, scomplex, c, d, c, funcname ) \ GENTPROT3( scomplex, double, dcomplex, c, d, z, funcname ) \ \ GENTPROT3( scomplex, scomplex, double, c, c, d, funcname ) \ GENTPROT3( scomplex, scomplex, dcomplex, c, c, z, funcname ) \ \ GENTPROT3( scomplex, dcomplex, float, c, z, s, funcname ) \ GENTPROT3( scomplex, dcomplex, double, c, z, d, funcname ) \ GENTPROT3( scomplex, dcomplex, scomplex, c, z, c, funcname ) \ GENTPROT3( scomplex, dcomplex, dcomplex, c, z, z, funcname ) \ \ \ GENTPROT3( dcomplex, float, float, z, s, s, funcname ) \ GENTPROT3( dcomplex, float, double, z, s, d, funcname ) \ GENTPROT3( dcomplex, float, scomplex, z, s, c, funcname ) \ GENTPROT3( dcomplex, float, dcomplex, z, s, z, funcname ) \ \ GENTPROT3( dcomplex, double, float, z, d, s, funcname ) \ GENTPROT3( dcomplex, double, scomplex, z, d, c, funcname ) \ \ GENTPROT3( dcomplex, scomplex, float, z, c, s, funcname ) \ GENTPROT3( dcomplex, scomplex, double, z, c, d, funcname ) \ GENTPROT3( dcomplex, scomplex, scomplex, z, c, c, funcname ) \ GENTPROT3( dcomplex, scomplex, dcomplex, z, c, z, funcname ) \ \ GENTPROT3( dcomplex, dcomplex, float, z, z, s, funcname ) \ GENTPROT3( dcomplex, dcomplex, scomplex, z, z, c, funcname ) \ // -- Basic three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_BASIC( funcname ) \ \ GENTPROT3U12( float, float, float, float, s, s, s, s, funcname ) \ 
GENTPROT3U12( double, double, double, double, d, d, d, d, funcname ) \ GENTPROT3U12( scomplex, scomplex, scomplex, scomplex, c, c, c, c, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, dcomplex, dcomplex, z, z, z, z, funcname ) // -- Mixed domain three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_D( funcname ) \ \ GENTPROT3U12( float, float, scomplex, float, s, s, c, s, funcname ) \ GENTPROT3U12( float, scomplex, float, scomplex, s, c, s, c, funcname ) \ GENTPROT3U12( float, scomplex, scomplex, scomplex, s, c, c, c, funcname ) \ \ GENTPROT3U12( double, double, dcomplex, double, d, d, z, d, funcname ) \ GENTPROT3U12( double, dcomplex, double, dcomplex, d, z, d, z, funcname ) \ GENTPROT3U12( double, dcomplex, dcomplex, dcomplex, d, z, z, z, funcname ) \ \ GENTPROT3U12( scomplex, float, float, scomplex, c, s, s, c, funcname ) \ GENTPROT3U12( scomplex, float, scomplex, scomplex, c, s, c, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, float, scomplex, c, c, s, c, funcname ) \ \ GENTPROT3U12( dcomplex, double, double, dcomplex, z, d, d, z, funcname ) \ GENTPROT3U12( dcomplex, double, dcomplex, dcomplex, z, d, z, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, double, dcomplex, z, z, d, z, funcname ) // -- Mixed precision three-operand with union of operands 1 and 2 -- #define INSERT_GENTPROT3U12_MIX_P( funcname ) \ \ GENTPROT3U12( float, float, double, float, s, s, d, s, funcname ) \ GENTPROT3U12( float, float, dcomplex, float, s, s, z, s, funcname ) \ \ GENTPROT3U12( float, double, float, double, s, d, s, d, funcname ) \ GENTPROT3U12( float, double, double, double, s, d, d, d, funcname ) \ GENTPROT3U12( float, double, scomplex, double, s, d, c, d, funcname ) \ GENTPROT3U12( float, double, dcomplex, double, s, d, z, d, funcname ) \ \ GENTPROT3U12( float, scomplex, double, scomplex, s, c, d, c, funcname ) \ GENTPROT3U12( float, scomplex, dcomplex, scomplex, s, c, z, c, funcname ) \ \ GENTPROT3U12( float, dcomplex, float, dcomplex, s, z, s, 
z, funcname ) \ GENTPROT3U12( float, dcomplex, double, dcomplex, s, z, d, z, funcname ) \ GENTPROT3U12( float, dcomplex, scomplex, dcomplex, s, z, c, z, funcname ) \ GENTPROT3U12( float, dcomplex, dcomplex, dcomplex, s, z, z, z, funcname ) \ \ \ GENTPROT3U12( double, float, float, double, d, s, s, d, funcname ) \ GENTPROT3U12( double, float, double, double, d, s, d, d, funcname ) \ GENTPROT3U12( double, float, scomplex, double, d, s, c, d, funcname ) \ GENTPROT3U12( double, float, dcomplex, double, d, s, z, d, funcname ) \ \ GENTPROT3U12( double, double, float, double, d, d, s, d, funcname ) \ GENTPROT3U12( double, double, scomplex, double, d, d, c, d, funcname ) \ \ GENTPROT3U12( double, scomplex, float, dcomplex, d, c, s, z, funcname ) \ GENTPROT3U12( double, scomplex, double, dcomplex, d, c, d, z, funcname ) \ GENTPROT3U12( double, scomplex, scomplex, dcomplex, d, c, c, z, funcname ) \ GENTPROT3U12( double, scomplex, dcomplex, dcomplex, d, c, z, z, funcname ) \ \ GENTPROT3U12( double, dcomplex, float, dcomplex, d, z, s, z, funcname ) \ GENTPROT3U12( double, dcomplex, scomplex, dcomplex, d, z, c, z, funcname ) \ \ \ GENTPROT3U12( scomplex, float, double, scomplex, c, s, d, c, funcname ) \ GENTPROT3U12( scomplex, float, dcomplex, scomplex, c, s, z, c, funcname ) \ \ GENTPROT3U12( scomplex, double, float, dcomplex, c, d, s, z, funcname ) \ GENTPROT3U12( scomplex, double, double, dcomplex, c, d, d, z, funcname ) \ GENTPROT3U12( scomplex, double, scomplex, dcomplex, c, d, c, z, funcname ) \ GENTPROT3U12( scomplex, double, dcomplex, dcomplex, c, d, z, z, funcname ) \ \ GENTPROT3U12( scomplex, scomplex, double, scomplex, c, c, d, c, funcname ) \ GENTPROT3U12( scomplex, scomplex, dcomplex, scomplex, c, c, z, c, funcname ) \ \ GENTPROT3U12( scomplex, dcomplex, float, dcomplex, c, z, s, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, double, dcomplex, c, z, d, z, funcname ) \ GENTPROT3U12( scomplex, dcomplex, scomplex, dcomplex, c, z, c, z, funcname ) \ GENTPROT3U12( 
scomplex, dcomplex, dcomplex, dcomplex, c, z, z, z, funcname ) \ \ \ GENTPROT3U12( dcomplex, float, float, dcomplex, z, s, s, z, funcname ) \ GENTPROT3U12( dcomplex, float, double, dcomplex, z, s, d, z, funcname ) \ GENTPROT3U12( dcomplex, float, scomplex, dcomplex, z, s, c, z, funcname ) \ GENTPROT3U12( dcomplex, float, dcomplex, dcomplex, z, s, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, double, float, dcomplex, z, d, s, z, funcname ) \ GENTPROT3U12( dcomplex, double, scomplex, dcomplex, z, d, c, z, funcname ) \ \ GENTPROT3U12( dcomplex, scomplex, float, dcomplex, z, c, s, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, double, dcomplex, z, c, d, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, scomplex, dcomplex, z, c, c, z, funcname ) \ GENTPROT3U12( dcomplex, scomplex, dcomplex, dcomplex, z, c, z, z, funcname ) \ \ GENTPROT3U12( dcomplex, dcomplex, float, dcomplex, z, z, s, z, funcname ) \ GENTPROT3U12( dcomplex, dcomplex, scomplex, dcomplex, z, z, c, z, funcname ) #endif // end bli_gentprot_macro_defs.h // begin bli_misc_macro_defs.h #ifndef BLIS_MISC_MACRO_DEFS_H #define BLIS_MISC_MACRO_DEFS_H // -- Miscellaneous macros -- // min, max, abs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_min( a, b ) ( (a) < (b) ? (a) : (b) ) #define bli_max( a, b ) ( (a) > (b) ? (a) : (b) ) #define bli_abs( a ) ( (a) <= 0 ? -(a) : (a) ) // fmin, fmax, fabs // NOTE: These must remain macros since we don't know the types of a and b. #define bli_fmin( a, b ) bli_min( a, b ) #define bli_fmax( a, b ) bli_max( a, b ) #define bli_fabs( a ) ( (a) <= 0.0 ? -(a) : (a) ) // fminabs, fmaxabs // NOTE: These must remain macros since we don't know the types of a and b. 
#define bli_fminabs( a, b ) \ \ bli_fmin( bli_fabs( a ), \ bli_fabs( b ) ) #define bli_fmaxabs( a, b ) \ \ bli_fmax( bli_fabs( a ), \ bli_fabs( b ) ) // round BLIS_INLINE double bli_round( double a ) { return round( a ); } // round_to_mult BLIS_INLINE guint_t bli_round_to_mult( guint_t val, guint_t mult ) { return ( guint_t ) ( ( ( ( guint_t )val + ( guint_t )mult / 2 ) / mult ) * mult ); } // isnan, isinf // NOTE: These must remain macros, since isinf() and isnan() are macros // (defined in math.h) that likely depend on the type of the argument 'a' // below. #define bli_isinf( a ) isinf( a ) #define bli_isnan( a ) isnan( a ) // is_odd, is_even BLIS_INLINE bool bli_is_odd( gint_t a ) { return ( bool )( a % 2 == 1 ); } BLIS_INLINE bool bli_is_even( gint_t a ) { return ( bool )( a % 2 == 0 ); } // swap_dims BLIS_INLINE void bli_swap_dims( dim_t* dim1, dim_t* dim2 ) { dim_t temp = *dim1; *dim1 = *dim2; *dim2 = temp; } // swap_incs BLIS_INLINE void bli_swap_incs( inc_t* inc1, inc_t* inc2 ) { inc_t temp = *inc1; *inc1 = *inc2; *inc2 = temp; } // toggle_bool BLIS_INLINE void bli_toggle_bool( bool* b ) { if ( *b == TRUE ) *b = FALSE; else *b = TRUE; } // return datatype for char #define bli_stype ( BLIS_FLOAT ) #define bli_dtype ( BLIS_DOUBLE ) #define bli_ctype ( BLIS_SCOMPLEX ) #define bli_ztype ( BLIS_DCOMPLEX ) // return C type for char #define bli_sctype float #define bli_dctype double #define bli_cctype scomplex #define bli_zctype dcomplex // return real proj of C type for char #define bli_sctyper float #define bli_dctyper double #define bli_cctyper float #define bli_zctyper double // return default format specifier for char // NOTE: These must remain macros due to the way they are used to initialize // local char arrays. 
#define bli_sformatspec() "%9.2e" #define bli_dformatspec() "%9.2e" #define bli_cformatspec() "%9.2e + %9.2e " #define bli_zformatspec() "%9.2e + %9.2e " #define bli_iformatspec() "%6d" #endif // end bli_misc_macro_defs.h // begin bli_edge_case_macro_defs.h #ifndef BLIS_EDGE_CASE_MACRO_DEFS_H #define BLIS_EDGE_CASE_MACRO_DEFS_H // // Macros for edge-case handling within gemm microkernels. // // -- Setup helper macros -- #define GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _beta = beta; \ PASTEMAC(ch,ctype)* restrict _c = c; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMM_UKR_SETUP_CT_POST(ch) \ \ PASTEMAC(ch,ctype) _zero; \ PASTEMAC(ch,set0s)( _zero ); \ \ if ( _use_ct ) \ { \ c = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ beta = &_zero; \ } // -- Setup macros -- #define GEMM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMM_UKR_SETUP_CT_POST(ch); #define GEMM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,xpbys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _beta, \ _c, _rs_c, _cs_c \ ); \ } \ // // Macros for edge-case handling within gemmtrsm microkernels. // // -- Setup helper macros -- #define GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment) \ \ PASTEMAC(ch,ctype)* restrict _c = c11; \ const inc_t _rs_c = rs_c; \ const inc_t _cs_c = cs_c; \ PASTEMAC(ch,ctype) _ct[ BLIS_STACK_BUF_MAX_SIZE / sizeof( PASTEMAC(ch,type) ) ] \ __attribute__((aligned(alignment))); \ const inc_t _rs_ct = row_major ? nr : 1; \ const inc_t _cs_ct = row_major ? 1 : mr; #define GEMMTRSM_UKR_SETUP_CT_POST(ch) \ \ if ( _use_ct ) \ { \ c11 = _ct; \ rs_c = _rs_ct; \ cs_c = _cs_ct; \ } // -- Setup macros -- #define GEMMTRSM_UKR_SETUP_CT(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_AMBI(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( cs_c != 1 && rs_c != 1 ) || \ m != mr || n != nr; \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ANY(ch,mr,nr,row_major) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,1); \ const bool _use_ct = ( m != mr || n != nr ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); #define GEMMTRSM_UKR_SETUP_CT_ALIGNED(ch,mr,nr,row_major,alignment) \ \ \ GEMMTRSM_UKR_SETUP_CT_PRE(ch,mr,nr,row_major,alignment); \ const bool _use_ct = ( row_major ? cs_c != 1 : rs_c != 1 ) || \ m != mr || n != nr || \ ( (uintptr_t)_c % alignment ) || \ ( ( ( row_major ? 
_rs_c : _cs_c )*sizeof( PASTEMAC(ch,ctype) ) ) % alignment ); \ GEMMTRSM_UKR_SETUP_CT_POST(ch); // -- Flush macros -- #define GEMMTRSM_UKR_FLUSH_CT(ch) \ \ \ if ( _use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ _ct, _rs_ct, _cs_ct, \ _c, _rs_c, _cs_c \ ); \ } \ #endif // end bli_edge_case_macro_defs.h // begin bli_param_macro_defs.h #ifndef BLIS_PARAM_MACRO_DEFS_H #define BLIS_PARAM_MACRO_DEFS_H // -- Parameter query macros -- // buffer BLIS_INLINE bool bli_is_aligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size == 0 ); } BLIS_INLINE bool bli_is_unaligned_to( siz_t p, siz_t size ) { return ( bool ) ( p % size != 0 ); } BLIS_INLINE siz_t bli_offset_past_alignment( siz_t p, siz_t size ) { return ( siz_t ) ( p % size ); } // datatype BLIS_INLINE bool bli_is_float( num_t dt ) { return ( bool ) ( dt == BLIS_FLOAT ); } BLIS_INLINE bool bli_is_double( num_t dt ) { return ( bool ) ( dt == BLIS_DOUBLE ); } BLIS_INLINE bool bli_is_scomplex( num_t dt ) { return ( bool ) ( dt == BLIS_SCOMPLEX ); } BLIS_INLINE bool bli_is_dcomplex( num_t dt ) { return ( bool ) ( dt == BLIS_DCOMPLEX ); } BLIS_INLINE bool bli_is_constant( num_t dt ) { return ( bool ) ( dt == BLIS_CONSTANT ); } BLIS_INLINE bool bli_is_int( num_t dt ) { return ( bool ) ( dt == BLIS_INT ); } BLIS_INLINE bool bli_is_real( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_double( dt ) ); } BLIS_INLINE bool bli_is_complex( num_t dt ) { return ( bool ) ( bli_is_scomplex( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE bool bli_is_single_prec( num_t dt ) { return ( bool ) ( bli_is_float( dt ) || bli_is_scomplex( dt ) ); } BLIS_INLINE bool bli_is_double_prec( num_t dt ) { return ( bool ) ( bli_is_double( dt ) || bli_is_dcomplex( dt ) ); } BLIS_INLINE dom_t bli_dt_domain( num_t dt ) { return ( dom_t ) ( dt & BLIS_DOMAIN_BIT ); } BLIS_INLINE bool bli_dt_dom_is_real( num_t dt ) { return ( bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_REAL ); } BLIS_INLINE bool bli_dt_dom_is_complex( num_t dt ) { return ( 
bool ) ( ( dt & BLIS_DOMAIN_BIT ) == BLIS_COMPLEX ); } BLIS_INLINE prec_t bli_dt_prec( num_t dt ) { return ( prec_t ) ( dt & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_dt_prec_is_single( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_SINGLE_PREC ); } BLIS_INLINE bool bli_dt_prec_is_double( num_t dt ) { return ( bool ) ( ( dt & BLIS_PRECISION_BIT ) == BLIS_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_real( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_complex( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_dt_proj_to_single_prec( num_t dt ) { return ( num_t ) ( dt & ~BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_dt_proj_to_double_prec( num_t dt ) { return ( num_t ) ( dt | BLIS_BITVAL_DOUBLE_PREC ); } // trans BLIS_INLINE bool bli_is_notrans( trans_t trans ) { return ( bool ) ( trans == BLIS_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_trans( trans_t trans ) { return ( bool ) ( trans == BLIS_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjnotrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_NO_TRANSPOSE ); } BLIS_INLINE bool bli_is_conjtrans( trans_t trans ) { return ( bool ) ( trans == BLIS_CONJ_TRANSPOSE ); } BLIS_INLINE bool bli_does_notrans( trans_t trans ) { return ( bool ) ( (~trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_trans( trans_t trans ) { return ( bool ) ( ( trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_does_noconj( trans_t trans ) { return ( bool ) ( (~trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_does_conj( trans_t trans ) { return ( bool ) ( ( trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE trans_t bli_extract_trans( trans_t trans ) { return ( trans_t ) ( trans & BLIS_TRANS_BIT ); } BLIS_INLINE conj_t bli_extract_conj( trans_t trans ) { return ( conj_t ) ( trans & BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_trans_toggled( trans_t trans ) { 
return ( trans_t ) ( trans ^ BLIS_TRANS_BIT ); } BLIS_INLINE trans_t bli_trans_toggled_conj( trans_t trans ) { return ( trans_t ) ( trans ^ BLIS_CONJ_BIT ); } BLIS_INLINE trans_t bli_apply_trans( trans_t transapp, trans_t trans ) { return ( trans_t ) ( trans ^ transapp ); } BLIS_INLINE void bli_toggle_trans( trans_t* trans ) { *trans = bli_trans_toggled( *trans ); } // side BLIS_INLINE bool bli_is_left( side_t side ) { return ( bool ) ( side == BLIS_LEFT ); } BLIS_INLINE bool bli_is_right( side_t side ) { return ( bool ) ( side == BLIS_RIGHT ); } BLIS_INLINE side_t bli_side_toggled( side_t side ) { return ( bli_is_left( side ) ? BLIS_RIGHT : BLIS_LEFT ); } BLIS_INLINE void bli_toggle_side( side_t* side ) { *side = bli_side_toggled( *side ); } // uplo BLIS_INLINE bool bli_is_lower( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_LOWER ); } BLIS_INLINE bool bli_is_upper( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_UPPER ); } BLIS_INLINE bool bli_is_upper_or_lower( uplo_t uplo ) { return ( bool ) ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ); } BLIS_INLINE bool bli_is_dense( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_DENSE ); } BLIS_INLINE bool bli_is_zeros( uplo_t uplo ) { return ( bool ) ( uplo == BLIS_ZEROS ); } BLIS_INLINE uplo_t bli_uplo_toggled( uplo_t uplo ) { return ( uplo_t ) ( bli_is_upper_or_lower( uplo ) ? 
( ( uplo ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT ) : uplo ); } BLIS_INLINE void bli_toggle_uplo( uplo_t* uplo ) { *uplo = bli_uplo_toggled( *uplo ); } // structure BLIS_INLINE bool bli_is_general( struc_t struc ) { return ( bool ) ( struc == BLIS_GENERAL ); } BLIS_INLINE bool bli_is_hermitian( struc_t struc ) { return ( bool ) ( struc == BLIS_HERMITIAN ); } BLIS_INLINE bool bli_is_symmetric( struc_t struc ) { return ( bool ) ( struc == BLIS_SYMMETRIC ); } BLIS_INLINE bool bli_is_triangular( struc_t struc ) { return ( bool ) ( struc == BLIS_TRIANGULAR ); } BLIS_INLINE bool bli_is_herm_or_symm( struc_t struc ) { return ( bool ) ( bli_is_hermitian( struc ) || bli_is_symmetric( struc ) ); } // conj BLIS_INLINE bool bli_is_noconj( conj_t conj ) { return ( bool ) ( conj == BLIS_NO_CONJUGATE ); } BLIS_INLINE bool bli_is_conj( conj_t conj ) { return ( bool ) ( conj == BLIS_CONJUGATE ); } BLIS_INLINE conj_t bli_conj_toggled( conj_t conj ) { return ( conj_t ) ( conj ^ BLIS_CONJ_BIT ); } BLIS_INLINE conj_t bli_apply_conj( conj_t conjapp, conj_t conj ) { return ( conj_t ) ( conj ^ conjapp ); } BLIS_INLINE void bli_toggle_conj( conj_t* conj ) { *conj = bli_conj_toggled( *conj ); } // diag BLIS_INLINE bool bli_is_nonunit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_NONUNIT_DIAG ); } BLIS_INLINE bool bli_is_unit_diag( diag_t diag ) { return ( bool ) ( diag == BLIS_UNIT_DIAG ); } // err_t-related BLIS_INLINE bool bli_is_success( err_t err ) { return ( bool ) ( err == BLIS_SUCCESS ); } BLIS_INLINE bool bli_is_failure( err_t err ) { return ( bool ) ( err != BLIS_SUCCESS ); } // dimension-related BLIS_INLINE bool bli_zero_dim1( dim_t m ) { return ( bool ) ( m == 0 ); } BLIS_INLINE bool bli_zero_dim2( dim_t m, dim_t n ) { return ( bool ) ( m == 0 || n == 0 ); } BLIS_INLINE bool bli_zero_dim3( dim_t m, dim_t n, dim_t k ) { return ( bool ) ( m == 0 || n == 0 || k == 0 ); } BLIS_INLINE bool bli_nonzero_dim( dim_t m ) { return ( bool ) ( m > 0 ); } BLIS_INLINE bool bli_vector_dim( 
dim_t m, dim_t n ) { return ( bool ) ( m == 1 ? n : m ); } BLIS_INLINE bool bli_is_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 || n == 1 ); } BLIS_INLINE bool bli_is_row_vector( dim_t m, dim_t n ) { return ( bool ) ( m == 1 ); } BLIS_INLINE bool bli_is_col_vector( dim_t m, dim_t n ) { return ( bool ) ( n == 1 ); } BLIS_INLINE void bli_set_dim_with_side( side_t side, dim_t m, dim_t n, dim_t* dim ) { if ( bli_is_left( side ) ) *dim = m; else *dim = n; } BLIS_INLINE void bli_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, dim_t* mt, dim_t* nt ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; } else { *mt = n; *nt = m; } } BLIS_INLINE void bli_set_dims_incs_with_trans( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs, dim_t* mt, dim_t* nt, inc_t* rst, inc_t* cst ) { if ( bli_does_notrans( trans ) ) { *mt = m; *nt = n; *rst = rs; *cst = cs; } else { *mt = n; *nt = m; *rst = cs; *cst = rs; } } // blocksize-related BLIS_INLINE dim_t bli_determine_blocksize_dim_f( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( bli_min( b_alg, dim - i ) ); } BLIS_INLINE dim_t bli_determine_blocksize_dim_b( dim_t i, dim_t dim, dim_t b_alg ) { return ( dim_t ) ( i == 0 && dim % b_alg != 0 ? dim % b_alg : b_alg ); } // stride-related BLIS_INLINE inc_t bli_vector_inc( trans_t trans, dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( inc_t ) ( bli_does_notrans( trans ) ? ( m == 1 ? cs : rs ) : ( m == 1 ? 
rs : cs ) ); } BLIS_INLINE bool bli_is_row_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == 1 ); } BLIS_INLINE bool bli_is_col_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == 1 ); } BLIS_INLINE bool bli_is_row_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( cs == 1 && ( rs > 1 || n == 1 ) ); } BLIS_INLINE bool bli_is_col_stored_f( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( rs == 1 && ( cs > 1 || m == 1 ) ); } BLIS_INLINE bool bli_is_gen_stored( inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) != 1 && bli_abs( cs ) != 1 ); } BLIS_INLINE bool bli_is_row_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( cs ) == bli_abs( rs ) ? n < m : bli_abs( cs ) < bli_abs( rs ) ); } BLIS_INLINE bool bli_is_col_tilted( dim_t m, dim_t n, inc_t rs, inc_t cs ) { return ( bool ) ( bli_abs( rs ) == bli_abs( cs ) ? m < n : bli_abs( rs ) < bli_abs( cs ) ); } BLIS_INLINE bool bli_has_nonunit_inc1( inc_t s1 ) { return ( bool ) ( s1 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc2( inc_t s1, inc_t s2 ) { return ( bool ) ( s1 != 1 || s2 != 1 ); } BLIS_INLINE bool bli_has_nonunit_inc3( inc_t s1, inc_t s2, inc_t s3 ) { return ( bool ) ( s1 != 1 || s2 != 1 || s3 != 1 ); } // diag offset-related BLIS_INLINE void bli_negate_diag_offset( doff_t* diagoff ) { *diagoff = -(*diagoff); } BLIS_INLINE void bli_shift_diag_offset_to_grow_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff -= 1; else if ( bli_is_lower( uplo ) ) *diagoff += 1; } BLIS_INLINE void bli_shift_diag_offset_to_shrink_uplo( uplo_t uplo, doff_t* diagoff ) { if ( bli_is_upper( uplo ) ) *diagoff += 1; else if ( bli_is_lower( uplo ) ) *diagoff -= 1; } BLIS_INLINE doff_t bli_diag_offset_with_trans( trans_t trans, doff_t diagoff ) { return ( doff_t ) ( bli_does_trans( trans ) ? 
-diagoff : diagoff ); } BLIS_INLINE bool bli_is_strictly_above_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )n <= -diagoff ) : ( ( doff_t )m <= -diagoff ) ); } BLIS_INLINE bool bli_is_strictly_below_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_does_trans( trans ) ? ( ( doff_t )m <= diagoff ) : ( ( doff_t )n <= diagoff ) ); } BLIS_INLINE bool bli_is_outside_diag( doff_t diagoff, trans_t trans, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag( diagoff, trans, m, n ) || bli_is_strictly_below_diag( diagoff, trans, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart( doff_t diagoff, trans_t trans, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag( diagoff, trans, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag( diagoff, trans, m, n ) ) ); } BLIS_INLINE bool bli_is_strictly_above_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )m <= -diagoff ); } BLIS_INLINE bool bli_is_strictly_below_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( ( doff_t )n <= diagoff ); } BLIS_INLINE bool bli_intersects_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( !bli_is_strictly_above_diag_n( diagoff, m, n ) && !bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_outside_diag_n( doff_t diagoff, dim_t m, dim_t n ) { return ( bool ) ( bli_is_strictly_above_diag_n( diagoff, m, n ) || bli_is_strictly_below_diag_n( diagoff, m, n ) ); } BLIS_INLINE bool bli_is_stored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( 
bli_is_upper( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) ); } BLIS_INLINE bool bli_is_unstored_subpart_n( doff_t diagoff, uplo_t uplo, dim_t m, dim_t n ) { return ( bool ) ( ( bli_is_upper( uplo ) && bli_is_strictly_below_diag_n( diagoff, m, n ) ) || ( bli_is_lower( uplo ) && bli_is_strictly_above_diag_n( diagoff, m, n ) ) ); } // pruning-related BLIS_INLINE void bli_prune_unstored_region_top_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the left side of the matrix, // ignore the area above that intersection. if ( *diagoff < 0 ) { *m = *m + *diagoff; *offm_inc = - *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_right_l( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the bottom side of the matrix, // ignore the area to the right of that intersection. if ( *n > *diagoff + *m ) { *n = *diagoff + *m; } } BLIS_INLINE void bli_prune_unstored_region_left_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offn_inc ) { *offn_inc = 0; // If the diagonal intersects the top side of the matrix, // ignore the area to the left of that intersection. if ( *diagoff > 0 ) { *n = *n - *diagoff; *offn_inc = + *diagoff; *diagoff = 0; } } BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, dim_t* n, dim_t* offm_inc ) { *offm_inc = 0; // If the diagonal intersects the right side of the matrix, // ignore the area below that intersection. 
if ( *m > -(*diagoff) + *n ) { *m = -(*diagoff) + *n; } } // thread range-related BLIS_INLINE void bli_rotate180_trapezoid( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { *diagoff = *n - *diagoff - *m; bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m, dim_t* n ) { bli_swap_dims( m, n ); bli_negate_diag_offset( diagoff ); bli_toggle_uplo( uplo ); } BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end ) { dim_t start2 = n - *start; dim_t end2 = n - *end; *start = end2; *end = start2; } // mdim_t-related BLIS_INLINE bool bli_is_m_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_M ); } BLIS_INLINE bool bli_is_n_dim( mdim_t mdim ) { return ( bool ) ( mdim == BLIS_N ); } BLIS_INLINE mdim_t bli_dim_toggled( mdim_t mdim ) { return ( mdim_t ) ( mdim == BLIS_M ? BLIS_N : BLIS_M ); } BLIS_INLINE void bli_toggle_dim( mdim_t* mdim ) { *mdim = bli_dim_toggled( *mdim ); } // stor3_t-related BLIS_INLINE stor3_t bli_stor3_from_strides( inc_t rs_c, inc_t cs_c, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b ) { // If any matrix is general-stored, return the stor3_t id for the // general-purpose sup microkernel. if ( bli_is_gen_stored( rs_c, cs_c ) || bli_is_gen_stored( rs_a, cs_a ) || bli_is_gen_stored( rs_b, cs_b ) ) return BLIS_XXX; // Otherwise, compute and return the stor3_t id as follows. 
const bool c_is_col = bli_is_col_stored( rs_c, cs_c ); const bool a_is_col = bli_is_col_stored( rs_a, cs_a ); const bool b_is_col = bli_is_col_stored( rs_b, cs_b ); return ( stor3_t )( 4 * c_is_col + 2 * a_is_col + 1 * b_is_col ); } BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id ) { #if 1 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )7, // BLIS_RRR = 0 -> BLIS_CCC = 7 ( stor3_t )5, // BLIS_RRC = 1 -> BLIS_CRC = 5 ( stor3_t )6, // BLIS_RCR = 2 -> BLIS_CCR = 6 ( stor3_t )4, // BLIS_RCC = 3 -> BLIS_CRR = 4 ( stor3_t )3, // BLIS_CRR = 4 -> BLIS_RCC = 3 ( stor3_t )1, // BLIS_CRC = 5 -> BLIS_RRC = 1 ( stor3_t )2, // BLIS_CCR = 6 -> BLIS_RCR = 2 ( stor3_t )0, // BLIS_CCC = 7 -> BLIS_RRR = 0 }; return map[id]; #else return ( ( id & 0x4 ) ^ 0x4 ) | // flip c bit ( ( ( id & 0x1 ) ^ 0x1 ) << 1 ) | // flip b bit and move to a position ( ( ( id & 0x2 ) ^ 0x2 ) >> 1 ); // flip a bit and move to b position #endif } BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )1, // BLIS_RRR = 0 -> BLIS_RRC = 1 ( stor3_t )0, // BLIS_RRC = 1 -> BLIS_RRR = 0 ( stor3_t )3, // BLIS_RCR = 2 -> BLIS_RCC = 3 ( stor3_t )2, // BLIS_RCC = 3 -> BLIS_RCR = 2 ( stor3_t )5, // BLIS_CRR = 4 -> BLIS_CRC = 5 ( stor3_t )4, // BLIS_CRC = 5 -> BLIS_CRR = 4 ( stor3_t )7, // BLIS_CCR = 6 -> BLIS_CCC = 7 ( stor3_t )6, // BLIS_CCC = 7 -> BLIS_CCR = 6 }; return map[id]; #else return ( stor3_t )( id ^ 0x1 ); #endif } BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id ) { #if 0 stor3_t map[ BLIS_NUM_3OP_RC_COMBOS ] = { ( stor3_t )2, // BLIS_RRR = 0 -> BLIS_RCR = 2 ( stor3_t )3, // BLIS_RRC = 1 -> BLIS_RCC = 3 ( stor3_t )0, // BLIS_RCR = 2 -> BLIS_RRR = 0 ( stor3_t )1, // BLIS_RCC = 3 -> BLIS_RRC = 1 ( stor3_t )6, // BLIS_CRR = 4 -> BLIS_CCR = 6 ( stor3_t )7, // BLIS_CRC = 5 -> BLIS_CCC = 7 ( stor3_t )4, // BLIS_CCR = 6 -> BLIS_CRR = 4 ( stor3_t )5, // BLIS_CCC = 7 -> BLIS_CRC = 5 }; return map[id]; #else return ( stor3_t )( id ^ 0x2 ); #endif } // 
index-related BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == n_iter - 1 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != n_iter - 1 || n_left == 0 ); } BLIS_INLINE bool bli_is_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i == 0 && n_left != 0 ); } BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) { return ( bool ) ( i != 0 || n_left == 0 ); } BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 ); } BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { return ( bool ) ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) ); } BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { #ifdef BLIS_ENABLE_JRIR_SLAB return bli_is_last_iter_sl( i, end_iter, tid, nth ); #else // BLIS_ENABLE_JRIR_RR return bli_is_last_iter_rr( i, end_iter, tid, nth ); #endif } // packbuf_t-related BLIS_INLINE guint_t bli_packbuf_index( packbuf_t buf_type ) { return ( guint_t ) ( ( buf_type & BLIS_PACK_BUFFER_BITS ) >> BLIS_PACK_BUFFER_SHIFT ); } // pack_t-related BLIS_INLINE bool bli_is_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_is_row_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_is_col_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_is_panel_packed( pack_t schema ) { return ( bool ) ( schema & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE bool bli_is_1r_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1R ); } BLIS_INLINE bool bli_is_1e_packed( pack_t schema ) { return ( bool ) ( ( schema & 
BLIS_PACK_FORMAT_BITS ) == BLIS_BITVAL_1E ); } BLIS_INLINE bool bli_is_1m_packed( pack_t schema ) { return ( bool ) ( bli_is_1r_packed( schema ) || bli_is_1e_packed( schema ) ); } BLIS_INLINE bool bli_is_nat_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) == 0 ); } BLIS_INLINE bool bli_is_ind_packed( pack_t schema ) { return ( bool ) ( ( schema & BLIS_PACK_FORMAT_BITS ) != 0 ); } BLIS_INLINE guint_t bli_pack_schema_index( pack_t schema ) { return ( guint_t ) ( ( schema & BLIS_PACK_FORMAT_BITS ) >> BLIS_PACK_FORMAT_SHIFT ); } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument. BLIS_INLINE void bli_set_dims_incs_uplo_1m ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, effective uplo/diagoff, etc for ONE matrix // argument (without column-wise stride optimization). BLIS_INLINE void bli_set_dims_incs_uplo_1m_noswap ( doff_t diagoffa, diag_t diaga, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. 
if ( bli_is_unstored_subpart( diagoffa, BLIS_NO_TRANSPOSE, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. if ( bli_is_stored_subpart( diagoffa_use_, BLIS_NO_TRANSPOSE, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions and increments for TWO matrix arguments. 
BLIS_INLINE void bli_set_dims_incs_2m ( trans_t transa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, dim_t* n_elem, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb ) { { *n_iter = n; *n_elem = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); } if ( bli_is_row_tilted( *n_elem, *n_iter, *incb, *ldb ) && bli_is_row_tilted( *n_elem, *n_iter, *inca, *lda ) ) { bli_swap_dims( n_iter, n_elem ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); } } } // Set dimensions, increments, effective uplo/diagoff, etc for TWO matrix // arguments. BLIS_INLINE void bli_set_dims_incs_uplo_2m ( doff_t diagoffa, diag_t diaga, trans_t transa, uplo_t uploa, dim_t m, dim_t n, inc_t rs_a, inc_t cs_a, inc_t rs_b, inc_t cs_b, uplo_t* uplo_eff, dim_t* n_elem_max, dim_t* n_iter, inc_t* inca, inc_t* lda, inc_t* incb, inc_t* ldb, dim_t* ij0, dim_t* n_shift ) { // This is to prevent the compiler from warning about uninitialized // variables. *ij0 = 0; *n_shift = 0; // If matrix A is entirely "unstored", that is, if either: // - A is lower-stored and entirely above the diagonal, or // - A is upper-stored and entirely below the diagonal // then we mark the storage as implicitly zero. if ( bli_is_unstored_subpart( diagoffa, transa, uploa, m, n ) ) { *uplo_eff = BLIS_ZEROS; } else { doff_t diagoffa_use_ = diagoffa; doff_t diagoff_eff_; dim_t n_iter_max_; if ( bli_is_unit_diag( diaga ) ) bli_shift_diag_offset_to_shrink_uplo( uploa, &diagoffa_use_ ); // If matrix A is entirely "stored", that is, if either: // - A is upper-stored and entirely above the diagonal, or // - A is lower-stored and entirely below the diagonal // then we mark the storage as dense. 
if ( bli_is_stored_subpart( diagoffa_use_, transa, uploa, m, n ) ) uploa = BLIS_DENSE; n_iter_max_ = n; *n_elem_max = m; *inca = rs_a; *lda = cs_a; *incb = rs_b; *ldb = cs_b; *uplo_eff = uploa; diagoff_eff_ = diagoffa_use_; if ( bli_does_trans( transa ) ) { bli_swap_incs( inca, lda ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_row_tilted( *n_elem_max, n_iter_max_, *incb, *ldb ) && bli_is_row_tilted( *n_elem_max, n_iter_max_, *inca, *lda ) ) { bli_swap_dims( &n_iter_max_, n_elem_max ); bli_swap_incs( inca, lda ); bli_swap_incs( incb, ldb ); bli_toggle_uplo( uplo_eff ); bli_negate_diag_offset( &diagoff_eff_ ); } if ( bli_is_dense( *uplo_eff ) ) { *n_iter = n_iter_max_; } else if ( bli_is_upper( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = 0; *n_shift = -diagoff_eff_; *n_elem_max = bli_min( *n_elem_max, *n_shift + bli_min( m, n ) ); *n_iter = n_iter_max_; } else { *ij0 = diagoff_eff_; *n_shift = 0; *n_iter = n_iter_max_ - diagoff_eff_; } } else // if ( bli_is_lower( *uplo_eff ) ) { if ( diagoff_eff_ < 0 ) { *ij0 = -diagoff_eff_; *n_shift = 0; *n_elem_max = *n_elem_max + diagoff_eff_; *n_iter = bli_min( *n_elem_max, bli_min( m, n ) ); } else { *ij0 = 0; *n_shift = diagoff_eff_; *n_iter = bli_min( n_iter_max_, *n_shift + bli_min( m, n ) ); } } } } // Set dimensions, increments, etc for ONE matrix argument when operating // on the diagonal. BLIS_INLINE void bli_set_dims_incs_1d ( doff_t diagoffx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, dim_t* offx, dim_t* n_elem, inc_t* incx ) { if ( diagoffx < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffx), n ); *offx = ( dim_t )(-diagoffx) * rs_x; } else { *n_elem = bli_min( n - ( dim_t )( diagoffx), m ); *offx = ( dim_t )( diagoffx) * cs_x; } *incx = rs_x + cs_x; \ } // Set dimensions, increments, etc for TWO matrix arguments when operating // on diagonals. 
BLIS_INLINE void bli_set_dims_incs_2d ( doff_t diagoffx, trans_t transx, dim_t m, dim_t n, inc_t rs_x, inc_t cs_x, inc_t rs_y, inc_t cs_y, dim_t* offx, dim_t* offy, dim_t* n_elem, inc_t* incx, inc_t* incy ) { doff_t diagoffy_ = bli_diag_offset_with_trans( transx, diagoffx ); if ( diagoffx < 0 ) *offx = -diagoffx * rs_x; else *offx = diagoffx * cs_x; if ( diagoffy_ < 0 ) { *n_elem = bli_min( m - ( dim_t )(-diagoffy_), n ); *offy = -diagoffy_ * rs_y; } else { *n_elem = bli_min( n - ( dim_t )( diagoffy_), m ); *offy = diagoffy_ * cs_y; } *incx = rs_x + cs_x; *incy = rs_y + cs_y; } #endif // end bli_param_macro_defs.h // begin bli_obj_macro_defs.h #ifndef BLIS_OBJ_MACRO_DEFS_H #define BLIS_OBJ_MACRO_DEFS_H // -- Object query/modification macros -- // Info query BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } BLIS_INLINE bool bli_obj_is_const( obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) { return ( bool ) ( bli_obj_prec( 
obj ) == BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) { return ( 
prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj 
) == BLIS_BITVAL_DENSE ); } BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } BLIS_INLINE bool 
bli_obj_is_hermitian( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); } // Info modification BLIS_INLINE void bli_obj_apply_trans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ trans ); } BLIS_INLINE void bli_obj_apply_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( obj->info ^ conj ); } BLIS_INLINE void bli_obj_set_conjtrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJTRANS_BITS ) | trans ); } BLIS_INLINE void bli_obj_set_onlytrans( trans_t trans, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TRANS_BIT ) | trans ); } BLIS_INLINE void bli_obj_set_conj( conj_t conj, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_CONJ_BIT ) | conj ); } BLIS_INLINE void bli_obj_set_uplo( uplo_t uplo, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UPLO_BITS ) | uplo ); } BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); } BLIS_INLINE void bli_obj_set_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* 
obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | ( dt << BLIS_TARGET_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | ( dt << BLIS_EXEC_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DT_BITS ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | ( dt << BLIS_COMP_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DT_BITS ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. BLIS_INLINE void bli_obj_set_scalar_domain( dom_t dt, obj_t* obj ) { obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_DOMAIN_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) ); } // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj )
{
    // Clear the scalar precision bit of info2, then OR in the new value
    // shifted up by BLIS_SCALAR_DT_SHIFT.
    obj->info2 = ( objbits_t ) ( ( obj->info2 & ~BLIS_SCALAR_PREC_BIT ) | ( dt << BLIS_SCALAR_DT_SHIFT ) );
}

// Replace the pack schema bits of info with schema.
BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj )
{
    obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema );
}

// Replace the "pack in reverse if upper" bit of info with ordif.
BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj )
{
    obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif );
}

// Replace the "pack in reverse if lower" bit of info with ordif.
BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj )
{
    obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif );
}

// NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead,
// packbuf_t is stored/used from the context in order to support various
// induced methods. (Though ideally the packbuf_t field would only be
// present in the control tree).
BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj )
{
    obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type );
}

// Replace the structure bits of info with struc.
BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj )
{
    obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_STRUC_BITS ) | struc );
}

// Flip the transposition bit.
BLIS_INLINE void bli_obj_toggle_trans( obj_t* obj )
{
    bli_obj_apply_trans( BLIS_TRANSPOSE, obj );
}

// Flip the conjugation bit.
BLIS_INLINE void bli_obj_toggle_conj( obj_t* obj )
{
    bli_obj_apply_conj( BLIS_CONJUGATE, obj );
}

// Flip both the lower and upper bits of info.
BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj )
{
    // NOTE: The objbits_t cast binds to the first XOR only; the second XOR
    // is applied to the result of that cast.
    obj->info = ( objbits_t ) ( obj->info ^ BLIS_LOWER_BIT ) ^ BLIS_UPPER_BIT;
}

// Root matrix query

// Return the root pointer stored in obj.
BLIS_INLINE obj_t* bli_obj_root( obj_t* obj )
{
    return ( obj_t* )( obj->root );
}

// The predicates below apply the corresponding bli_obj_is_*() query to the
// root of obj rather than to obj itself.

BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj )
{
    return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) );
}

BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj )
{
    return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) );
}

BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj )
{
    return ( bool ) ( bli_obj_is_symmetric( bli_obj_root( obj ) ) );
}

BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj )
{
    return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) );
}

BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj )
{
    return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) );
}

BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj )
{
    return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) );
}

BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj )
{
    return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) );
}

// Root matrix modification

// Make obj its own root.
BLIS_INLINE void bli_obj_set_as_root( obj_t* obj )
{
    obj->root = obj;
}

// Diagonal offset query

BLIS_INLINE doff_t bli_obj_diag_offset( obj_t* obj )
{
    return ( doff_t ) ( obj->diag_off );
}

// Return the diagonal offset, negated if obj is marked for transposition.
BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj )
{
    return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) : bli_obj_diag_offset( obj ) );
}

// Diagonal offset modification

BLIS_INLINE void bli_obj_set_diag_offset( doff_t offset, obj_t* obj )
{
    obj->diag_off = ( doff_t )offset;
}

BLIS_INLINE void bli_obj_negate_diag_offset( obj_t* obj )
{
    obj->diag_off = -(obj->diag_off);
}

// Add offset to the current diagonal offset.
BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj )
{
    obj->diag_off += ( doff_t )offset;
}

// Dimension query

// Length is the BLIS_M entry of dim[]; width is the BLIS_N entry.
BLIS_INLINE dim_t bli_obj_length( obj_t* obj )
{
    return ( obj->dim[ BLIS_M ] );
}

BLIS_INLINE dim_t bli_obj_width( obj_t* obj )
{
    return ( obj->dim[ BLIS_N ] );
}

// Return the dimension selected by mdim.
BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj )
{
    return ( obj->dim[ mdim ] );
}

BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj )
{
    return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) );
}

BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj )
{
    return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) );
}

// The "_after_trans" queries swap length and width when obj is marked for
// transposition.
BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj )
{
    return ( bli_obj_has_trans( obj ) ? bli_obj_width( obj ) : bli_obj_length( obj ) );
}

BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj )
{
    return ( bli_obj_has_trans( obj ) ? bli_obj_length( obj ) : bli_obj_width( obj ) );
}

BLIS_INLINE bool bli_obj_is_1x1( obj_t* x )
{
    return ( bool ) ( bli_obj_length( x ) == 1 && bli_obj_width( x ) == 1 );
}

// Stride/increment query

BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj )
{
    return ( obj->rs );
}

BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj )
{
    return ( obj->cs );
}

BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj )
{
    return ( obj->is );
}

// The "_mag" queries return the absolute value of the stride.
BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj )
{
    return ( inc_t ) ( bli_abs( obj->rs ) );
}

BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj )
{
    return ( inc_t ) ( bli_abs( obj->cs ) );
}

BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj )
{
    return ( inc_t ) ( bli_abs( obj->is ) );
}

// Note: The purpose of these functions is to obtain the length and width
// of the smallest submatrices of an object that could still encompass
// the stored data above (if obj is upper) or below (if obj is lower)
// the diagonal.
BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj )
{
    // NOTE: Any obj that is not upper takes the second (lower) formula.
    return ( dim_t ) ( bli_obj_is_upper( obj ) ? bli_min( bli_obj_length( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_length( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) );
}

BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj )
{
    // NOTE: Any obj that is not lower takes the second (upper) formula.
    return ( dim_t ) ( bli_obj_is_lower( obj ) ? bli_min( bli_obj_width( obj ), bli_obj_length( obj ) + bli_obj_diag_offset( obj ) ) : bli_min( bli_obj_width( obj ), bli_obj_width( obj ) - bli_obj_diag_offset( obj ) ) );
}

BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj )
{
    return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) );
}

BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj )
{
    return ( bli_obj_has_trans( obj ) ? bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) );
}

// Return the width if x has length 1, and the length otherwise.
BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x )
{
    return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) );
}

// Return 1 for a 1x1 object; otherwise the column stride if x has length 1,
// and the row stride in all other cases.
BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x )
{
    return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? bli_obj_col_stride( x ) : bli_obj_row_stride( x ) ) );
}

BLIS_INLINE bool bli_obj_is_vector( obj_t* x )
{
    return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 );
}

BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x )
{
    return ( bool ) ( bli_obj_length( x ) == 1 );
}

BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x )
{
    return ( bool ) ( bli_obj_width( x ) == 1 );
}

BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x )
{
    return ( bool ) ( bli_obj_length( x ) == 0 || bli_obj_width( x ) == 0 );
}

// Dimension modification

BLIS_INLINE void bli_obj_set_length( dim_t m, obj_t* obj )
{
    obj->dim[ BLIS_M ] = m;
}

BLIS_INLINE void bli_obj_set_width( dim_t n, obj_t* obj )
{
    obj->dim[ BLIS_N ] = n;
}

BLIS_INLINE void bli_obj_set_dim( mdim_t mdim, dim_t dim_val, obj_t* obj )
{
    obj->dim[ mdim ] = dim_val;
}

BLIS_INLINE void bli_obj_set_dims( dim_t m, dim_t n, obj_t* obj )
{
    bli_obj_set_length( m, obj );
    bli_obj_set_width( n, obj );
}

// Set the dimensions of obj to m x n, or to n x m if trans implies a
// transposition.
BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, obj_t* obj )
{
    if ( bli_does_notrans( trans ) )
    {
        bli_obj_set_length( m, obj );
        bli_obj_set_width( n, obj );
    }
    else // if ( bli_does_trans( trans ) )
    {
        bli_obj_set_length( n, obj );
        bli_obj_set_width( m, obj );
    }
}

// Stride/increment predicates
//
// NOTE: The following two functions differ from their non-obj counterparts
// in that they do not identify m x 1 and 1 x n objects as row-stored and
// column-stored, respectively, which is needed when considering packed
// objects. But this is okay, since none of the invocations of these
// "obj" functions are used on packed matrices.
//

// Row-stored: the column stride has magnitude 1.
BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj )
{
    return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 );
}

// Column-stored: the row stride has magnitude 1.
BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj )
{
    return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 );
}

// General-stored: neither stride has magnitude 1.
BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj )
{
    return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 );
}

// Row-tilted: the column stride magnitude is strictly smaller than the row
// stride magnitude.
BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj )
{
    return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) );
}

// Column-tilted: the row stride magnitude is strictly smaller than the
// column stride magnitude. (Equal magnitudes are neither row- nor
// column-tilted.)
BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj )
{
    return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) );
}

// Stride/increment modification

BLIS_INLINE void bli_obj_set_row_stride( inc_t rs, obj_t* obj )
{
    obj->rs = rs;
}

BLIS_INLINE void bli_obj_set_col_stride( inc_t cs, obj_t* obj )
{
    obj->cs = cs;
}

BLIS_INLINE void bli_obj_set_strides( inc_t rs, inc_t cs, obj_t* obj )
{
    bli_obj_set_row_stride( rs, obj );
    bli_obj_set_col_stride( cs, obj );
}

BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj )
{
    obj->is = is;
}

// Offset query

// The row offset is the BLIS_M entry of off[]; the column offset is the
// BLIS_N entry.
BLIS_INLINE dim_t bli_obj_row_off( obj_t* obj )
{
    return ( obj->off[ BLIS_M ] );
}

BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj )
{
    return ( obj->off[ BLIS_N ] );
}

// Return the offset selected by mdim.
BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj )
{
    return ( obj->off[ mdim ] );
}

// Offset modification

BLIS_INLINE void bli_obj_set_off( mdim_t mdim, dim_t offset, obj_t* obj )
{
    obj->off[ mdim ] = offset;
}

BLIS_INLINE void bli_obj_set_offs( dim_t offm, dim_t offn, obj_t* obj )
{
    bli_obj_set_off( BLIS_M, offm, obj );
    bli_obj_set_off( BLIS_N, offn, obj );
}

// Add offset to the current offset in dimension mdim.
BLIS_INLINE void bli_obj_inc_off( mdim_t mdim, dim_t offset, obj_t* obj )
{
    obj->off[ mdim ] += offset;
}

BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj )
{
    bli_obj_inc_off( BLIS_M, offm, obj );
    bli_obj_inc_off( BLIS_N, offn, obj );
}

// Diagonal offset predicates

// True if length <= -diag_off. The length is cast to the signed doff_t
// before the comparison.
BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj )
{
    return ( bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) );
}

// True if width <= diag_off. The width is cast to the signed doff_t before
// the comparison.
BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj )
{
    return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) );
}

BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj )
{
    return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) );
}

BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj )
{
    return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) );
}

// True if obj lies entirely in the unstored triangle of its root: strictly
// above the diagonal of a lower-stored root, or strictly below the diagonal
// of an upper-stored root. Note that the uplo of the ROOT is consulted,
// while the diagonal test uses obj itself.
BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj )
{
    return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || ( bli_obj_root_is_upper( obj ) && bli_obj_is_strictly_below_diag( obj ) ) );
}

// Buffer address query

BLIS_INLINE void* bli_obj_buffer( obj_t* obj )
{
    return ( void* ) ( obj->buffer );
}

// Buffer address modification

BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj )
{
    obj->buffer = p;
}

// Bufferless scalar field query

// Return the address of the scalar field embedded in obj itself.
BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj )
{
    return ( void* ) ( &( obj->scalar ) );
}

// Bufferless scalar field modification

// Copy the embedded scalar field from a into b.
BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b )
{
    b->scalar = a->scalar;
}

// Element size query

BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj )
{
    return ( siz_t ) ( obj->elem_size );
}

// Element size modification

BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj )
{
    obj->elem_size = size;
}

// Packed matrix info query

BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj )
{
    return ( obj->m_padded );
}

BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj )
{
    return ( obj->n_padded );
}

// Packed matrix info modification

BLIS_INLINE void bli_obj_set_padded_length( dim_t m, obj_t* obj )
{
    obj->m_padded = m;
}

BLIS_INLINE void bli_obj_set_padded_width( dim_t n, obj_t* obj )
{
    obj->n_padded = n;
}

BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj )
{
    bli_obj_set_padded_length( m, obj );
    bli_obj_set_padded_width( n, obj );
}

// Packed panel info query

BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj )
{
    return ( obj->m_panel );
}

BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj )
{
    return ( obj->n_panel );
}

BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj )
{
    return ( obj->pd );
}

BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj )
{
    return ( obj->ps );
}

// Packed panel info modification

BLIS_INLINE void bli_obj_set_panel_length( dim_t m, obj_t* obj )
{
    obj->m_panel = m;
}

BLIS_INLINE void bli_obj_set_panel_width( dim_t n, obj_t* obj )
{
    obj->n_panel = n;
}

BLIS_INLINE void bli_obj_set_panel_dims( dim_t m, dim_t n, obj_t* obj )
{
    bli_obj_set_panel_length( m, obj );
    bli_obj_set_panel_width( n, obj );
}

BLIS_INLINE void bli_obj_set_panel_dim( inc_t pd, obj_t* obj )
{
    obj->pd = pd;
}

BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj )
{
    obj->ps = ps;
}

// stor3_t-related

// Compute the stor3_t id for the operand triple (c, a, b) from their
// strides. The strides of c are used as-is; for a and b, the row and column
// strides are exchanged whenever the object is marked for transposition, so
// that the id reflects the operands as they will be used.
BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b )
{
    const inc_t rs_c = bli_obj_row_stride( c );
    const inc_t cs_c = bli_obj_col_stride( c );

    inc_t rs_a, cs_a;
    inc_t rs_b, cs_b;

    if ( bli_obj_has_notrans( a ) )
    {
        rs_a = bli_obj_row_stride( a );
        cs_a = bli_obj_col_stride( a );
    }
    else
    {
        rs_a = bli_obj_col_stride( a );
        cs_a = bli_obj_row_stride( a );
    }

    if ( bli_obj_has_notrans( b ) )
    {
        rs_b = bli_obj_row_stride( b );
        cs_b = bli_obj_col_stride( b );
    }
    else
    {
        rs_b = bli_obj_col_stride( b );
        cs_b = bli_obj_row_stride( b );
    }

    return bli_stor3_from_strides( rs_c, cs_c, rs_a, cs_a, rs_b, cs_b );
}

// -- User-provided information macros --

// Function pointer query

// Return the pack function pointer stored in obj.
BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj )
{
    return obj->pack_fn;
}

// Return the pack_params pointer stored in obj.
BLIS_INLINE void* bli_obj_pack_params( obj_t* obj )
{
    return obj->pack_params;
}

// Return the kernel function pointer stored in obj.
BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj )
{
    return obj->ker_fn;
}

// Return the ker_params pointer stored in obj.
BLIS_INLINE void* bli_obj_ker_params( obj_t* obj )
{
    return obj->ker_params;
}

// Function pointer modification
BLIS_INLINE void bli_obj_set_pack_fn( obj_pack_fn_t pack_fn, obj_t* obj ) { obj->pack_fn = pack_fn; } BLIS_INLINE void bli_obj_set_pack_params( void* params, obj_t* obj ) { obj->pack_params = params; } BLIS_INLINE void bli_obj_set_ker_fn( obj_ker_fn_t ker_fn, obj_t* obj ) { obj->ker_fn = ker_fn; } BLIS_INLINE void bli_obj_set_ker_params( void* params, obj_t* obj ) { obj->ker_params = params; } // -- Initialization-related macros -- // Finish the initialization started by the matrix-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t rs, inc_t cs, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_target_dt( dt, obj ); bli_obj_set_exec_dt( dt, obj ); bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); siz_t elem_size = sizeof( float ); if ( bli_dt_prec_is_double( dt ) ) elem_size *= 2; if ( bli_dt_dom_is_complex( dt ) ) elem_size *= 2; bli_obj_set_elem_size( elem_size, obj ); bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); void* restrict s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } else if ( bli_dt_prec_is_double( dt ) ) { (( dcomplex* )s)->real = 1.0; (( dcomplex* )s)->imag = 0.0; } } // Finish the initialization started by the 1x1-specific static initializer // (e.g. BLIS_OBJECT_INITIALIZER_1X1) // NOTE: This is intended only for use in the BLAS compatibility API and typed // BLIS API. BLIS_INLINE void bli_obj_init_finish_1x1( num_t dt, void* p, obj_t* obj ) { bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); bli_obj_set_buffer( p, obj ); } // -- Miscellaneous object macros -- // Toggle the region referenced (or "stored"). 
BLIS_INLINE void bli_obj_toggle_region_ref( obj_t* obj ) { if ( bli_obj_is_upper( obj ) ) bli_obj_inc_diag_offset( -1, obj ); else if ( bli_obj_is_lower( obj ) ) bli_obj_inc_diag_offset( 1, obj ); bli_obj_toggle_uplo( obj ); } BLIS_INLINE void bli_obj_toggle_uplo_if_trans( trans_t trans, obj_t* obj ) { if ( bli_does_trans( trans ) && bli_obj_is_upper_or_lower( obj ) ) { bli_obj_toggle_uplo( obj ); bli_obj_negate_diag_offset( obj ); } } // Initialize object with default properties (info field). BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) { obj->info = 0x0; obj->info = obj->info | BLIS_BITVAL_DENSE | BLIS_BITVAL_GENERAL; } // Acquire buffer at object's submatrix offset (offset-aware buffer query). BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) { return ( void* ) ( ( ( char* )( bli_obj_buffer ( obj ) ) + ( dim_t )( bli_obj_elem_size( obj ) ) * ( bli_obj_col_off( obj ) * bli_obj_col_stride( obj ) + bli_obj_row_off( obj ) * bli_obj_row_stride( obj ) ) ) ); } // Acquire buffer from BLIS_CONSTANT object. BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) { void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); else if ( dt == BLIS_SCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->c); else if ( dt == BLIS_DCOMPLEX ) p = &((( constdata_t* )bli_obj_buffer( obj ))->z); else p = &((( constdata_t* )bli_obj_buffer( obj ))->i); return p; } // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? bli_obj_buffer_for_const( dt, obj ) : bli_obj_buffer_at_off( obj ) ); } // Adjust the pointer based on current offsets, zero the offsets, and then // set the current object as the root. 
For obj_t's with at least one non-zero // offset, this effectively makes the obj_t "forget" that it was ever a view // into a larger matrix. BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) { bli_obj_set_buffer( bli_obj_buffer_at_off( obj ), obj ); bli_obj_set_offs( 0, 0, obj ); bli_obj_set_as_root( obj ); } // Make a full alias (shallow copy). BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); } // Create an alias with a trans value applied. // (Note: trans may include a conj component.) BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); } // Create an alias with a conj value applied. BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); } // Alias only the real part. BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); if ( bli_obj_is_complex( c ) ) { // Change the datatypes. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, r ); bli_obj_set_target_dt( dt_targ_r, r ); bli_obj_set_exec_dt( dt_exec_r, r ); bli_obj_set_comp_dt( dt_comp_r, r ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, r ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, r ); // Buffer is left unchanged. } } // Alias only the imaginary part. 
BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { bli_obj_alias_to( c, i ); // Change the datatype. const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); bli_obj_set_dt( dt_stor_r, i ); bli_obj_set_target_dt( dt_targ_r, i ); bli_obj_set_exec_dt( dt_exec_r, i ); bli_obj_set_comp_dt( dt_comp_r, i ); // Don't touch the attached scalar datatype. // Update the element size. siz_t es_c = bli_obj_elem_size( c ); bli_obj_set_elem_size( es_c/2, i ); // Update the strides. inc_t rs_c = bli_obj_row_stride( c ); inc_t cs_c = bli_obj_col_stride( c ); bli_obj_set_strides( 2*rs_c, 2*cs_c, i ); // Update the buffer. inc_t is_c = bli_obj_imag_stride( c ); char* p = ( char* )bli_obj_buffer_at_off( c ); bli_obj_set_buffer( p + is_c * es_c/2, i ); } } // Given a 1x1 object, acquire an address to the buffer depending on whether // the object is a BLIS_CONSTANT, and also set a datatype associated with the // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { *dt = dt_aux; *buf = bli_obj_buffer_for_1x1( dt_aux, obj ); } else { *dt = bli_obj_dt( obj ); *buf = bli_obj_buffer_at_off( obj ); } } // Swap all object fields (metadata/properties). BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) { bool a_root_is_self = ( bli_obj_root( a ) == a ); bool b_root_is_self = ( bli_obj_root( b ) == b ); obj_t t = *b; *b = *a; *a = t; if ( a_root_is_self ) bli_obj_set_as_root( b ); if ( b_root_is_self ) bli_obj_set_as_root( a ); } // Swap object pack schemas. 
BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) { const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); bli_obj_set_pack_schema( schema_b, a ); bli_obj_set_pack_schema( schema_a, b ); } // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. BLIS_INLINE void bli_obj_induce_trans( obj_t* obj ) { // Induce transposition among basic fields. dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); if ( bli_obj_is_upper_or_lower( obj ) ) bli_obj_toggle_uplo( obj ); // Induce transposition among packed fields. dim_t m_padded = bli_obj_padded_length( obj ); dim_t n_padded = bli_obj_padded_width( obj ); dim_t m_panel = bli_obj_panel_length( obj ); dim_t n_panel = bli_obj_panel_width( obj ); bli_obj_set_padded_dims( n_padded, m_padded, obj ); bli_obj_set_panel_dims( n_panel, m_panel, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } BLIS_INLINE void bli_obj_induce_fast_trans( obj_t* obj ) { // NOTE: This function is only used in situations where the matrices // are guaranteed to not have structure or be packed. // Induce transposition among basic fields. 
dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); inc_t rs = bli_obj_row_stride( obj ); inc_t cs = bli_obj_col_stride( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_strides( cs, rs, obj ); bli_obj_set_offs( offn, offm, obj ); // Note that this macro DOES NOT touch the transposition bit! If // the calling code is using this function to handle an object whose // transposition bit is set prior to computation, that code needs // to manually clear or toggle the bit, via // bli_obj_set_onlytrans() or bli_obj_toggle_trans(), // respectively. } // Sometimes we need to "reflect" a partition because the data we want is // actually stored on the other side of the diagonal. The nuts and bolts of // this macro look a lot like an induced transposition, except that the row // and column strides are left unchanged (which, of course, drastically // changes the effect of the macro). BLIS_INLINE void bli_obj_reflect_about_diag( obj_t* obj ) { dim_t m = bli_obj_length( obj ); dim_t n = bli_obj_width( obj ); dim_t offm = bli_obj_row_off( obj ); dim_t offn = bli_obj_col_off( obj ); doff_t diag_off = bli_obj_diag_offset( obj ); bli_obj_set_dims( n, m, obj ); bli_obj_set_offs( offn, offm, obj ); bli_obj_set_diag_offset( -diag_off, obj ); bli_obj_toggle_trans( obj ); } #endif // end bli_obj_macro_defs.h // begin bli_complex_macro_defs.h #ifndef BLIS_COMPLEX_MACRO_DEFS_H #define BLIS_COMPLEX_MACRO_DEFS_H // -- Real and imaginary accessor macros -- #define bli_sreal( x ) ( x ) #define bli_simag( x ) ( 0.0F ) #define bli_dreal( x ) ( x ) #define bli_dimag( x ) ( 0.0 ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( (x).real ) #define bli_cimag( x ) ( (x).imag ) #define bli_zreal( x ) ( (x).real ) #define bli_zimag( x ) ( (x).imag ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_creal( x ) ( crealf(x) ) #define bli_cimag( x ) ( cimagf(x) ) #define bli_zreal( x ) ( creal(x) ) #define 
bli_zimag( x ) ( cimag(x) ) #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_complex_macro_defs.h // begin bli_scalar_macro_defs.h #ifndef BLIS_SCALAR_MACRO_DEFS_H #define BLIS_SCALAR_MACRO_DEFS_H // -- Assignment/Accessor macros -- // NOTE: This macro is defined first since some of the other scalar macros // use it to abstract away the method used to assign complex values (ie: // whether fields of a struct are set directly or whether native C99 // assignment is used). // begin bli_sets.h #ifndef BLIS_SETS_H #define BLIS_SETS_H // sets // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sssets( xr, xi, y ) { (y) = (xr); } #define bli_dssets( xr, xi, y ) { (y) = (xr); } #define bli_cssets( xr, xi, y ) { (y) = (xr); } #define bli_zssets( xr, xi, y ) { (y) = (xr); } #define bli_issets( xr, xi, y ) { (y) = (xr); } #define bli_sdsets( xr, xi, y ) { (y) = (xr); } #define bli_ddsets( xr, xi, y ) { (y) = (xr); } #define bli_cdsets( xr, xi, y ) { (y) = (xr); } #define bli_zdsets( xr, xi, y ) { (y) = (xr); } #define bli_idsets( xr, xi, y ) { (y) = (xr); } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_dcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_ccsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_zcsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_icsets( xr, xi, y ) { bli_creal(y) = (xr); bli_cimag(y) = (xi); } #define bli_szsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_dzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_czsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_zzsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #define bli_izsets( xr, xi, y ) { bli_zreal(y) = (xr); bli_zimag(y) = (xi); } #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scsets( xr, 
xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_ccsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zcsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_szsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_dzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_czsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #define bli_zzsets( xr, xi, y ) { (y) = (xr) + (xi) * (I); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sisets( xr, xi, y ) { (y) = bli_sreal(xr); } #define bli_disets( xr, xi, y ) { (y) = bli_dreal(xr); } #define bli_cisets( xr, xi, y ) { (y) = bli_creal(xr); } #define bli_zisets( xr, xi, y ) { (y) = bli_zreal(xr); } #define bli_iisets( xr, xi, y ) { (y) = (xr); } #define bli_ssets( xr, xi, y ) bli_sssets( xr, xi, y ) #define bli_dsets( xr, xi, y ) bli_ddsets( xr, xi, y ) #define bli_csets( xr, xi, y ) bli_scsets( xr, xi, y ) #define bli_zsets( xr, xi, y ) bli_dzsets( xr, xi, y ) #define bli_isets( xr, xi, y ) bli_disets( xr, xi, y ) #endif // end bli_sets.h // NOTE: These macros are not used by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. // begin bli_setrs.h #ifndef BLIS_SETRS_H #define BLIS_SETRS_H // setrs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Each macro overwrites the real component of y with xr. For real y that is
// a plain assignment; for complex y the imaginary component is preserved.

#define bli_sssetrs( xr, y )  { (y) = (xr); }
#define bli_dssetrs( xr, y )  { (y) = (xr); }

#define bli_sdsetrs( xr, y )  { (y) = (xr); }
#define bli_ddsetrs( xr, y )  { (y) = (xr); }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct-based complex: write the real field in place.
#define bli_scsetrs( xr, y )  { bli_creal(y) = (xr); }
#define bli_dcsetrs( xr, y )  { bli_creal(y) = (xr); }

#define bli_szsetrs( xr, y )  { bli_zreal(y) = (xr); }
#define bli_dzsetrs( xr, y )  { bli_zreal(y) = (xr); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Native C99 complex: rebuild y from xr and the current imaginary part of y.
#define bli_scsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }
#define bli_dcsetrs( xr, y )  { (y) = (xr) + bli_cimag(y) * (I); }

#define bli_szsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }
#define bli_dzsetrs( xr, y )  { (y) = (xr) + bli_zimag(y) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX


// Single-char shorthands; the complex ones take a real-typed xr of matching
// precision.
#define bli_ssetrs( xr, y )  bli_sssetrs( xr, y )
#define bli_dsetrs( xr, y )  bli_ddsetrs( xr, y )
#define bli_csetrs( xr, y )  bli_scsetrs( xr, y )
#define bli_zsetrs( xr, y )  bli_dzsetrs( xr, y )


#endif
// end bli_setrs.h
// begin bli_setis.h


#ifndef BLIS_SETIS_H
#define BLIS_SETIS_H

// setis

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
// Each macro overwrites the imaginary component of y with xi. For real y
// the macros expand to an empty statement block.

#define bli_sssetis( xi, y )  { ; }
#define bli_dssetis( xi, y )  { ; }

#define bli_sdsetis( xi, y )  { ; }
#define bli_ddsetis( xi, y )  { ; }

#ifndef BLIS_ENABLE_C99_COMPLEX

// Struct-based complex: write the imaginary field in place.
#define bli_scsetis( xi, y )  { bli_cimag(y) = (xi); }
#define bli_dcsetis( xi, y )  { bli_cimag(y) = (xi); }

#define bli_szsetis( xi, y )  { bli_zimag(y) = (xi); }
#define bli_dzsetis( xi, y )  { bli_zimag(y) = (xi); }

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Native C99 complex: rebuild y from the current real part of y and xi.
#define bli_scsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }
#define bli_dcsetis( xi, y )  { (y) = bli_creal(y) + (xi) * (I); }

#define bli_szsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }
#define bli_dzsetis( xi, y )  { (y) = bli_zreal(y) + (xi) * (I); }

#endif // BLIS_ENABLE_C99_COMPLEX


// Single-char shorthands; the complex ones take a real-typed xi of matching
// precision.
#define bli_ssetis( xi, y )  bli_sssetis( xi, y )
#define bli_dsetis( xi, y )  bli_ddsetis( xi, y )
#define bli_csetis( xi, y )  bli_scsetis( xi, y )
#define bli_zsetis( xi, y )  bli_dzsetis( xi, y )


#endif
// end bli_setis.h

// NOTE: This macro also needs to be defined early on since it determines
// how real and imaginary components are accessed (ie: whether the fields
// of a struct are read directly or whether native C99 functions are used.)
// begin bli_gets.h


#ifndef BLIS_GETS_H
#define BLIS_GETS_H

// gets

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.
#define bli_ssgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dsgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_csgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zsgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_isgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_sdgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_ddgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_cdgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zdgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_idgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_scgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dcgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_ccgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zcgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_icgets( x, yr, yi ) { (yr) = ( float )(x); (yi) = 0.0F; } #define bli_szgets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = bli_simag(x); } #define bli_dzgets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = bli_dimag(x); } #define bli_czgets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = bli_cimag(x); } #define bli_zzgets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = bli_zimag(x); } #define bli_izgets( x, yr, yi ) { (yr) = ( double )(x); (yi) = 0.0; } #define bli_sigets( x, yr, yi ) { (yr) = bli_sreal(x); (yi) = 0; } #define bli_digets( x, yr, yi ) { (yr) = bli_dreal(x); (yi) = 0; } #define bli_cigets( x, yr, yi ) { (yr) = bli_creal(x); (yi) = 0; } #define bli_zigets( x, yr, yi ) { (yr) = bli_zreal(x); (yi) = 0; } #define bli_iigets( x, yr, yi ) { (yr) = (x); (yi) = 0; } #define bli_sgets( x, yr, yi ) bli_ssgets( x, yr, yi ) #define bli_dgets( x, yr, yi ) bli_ddgets( x, yr, yi ) #define bli_cgets( x, yr, yi ) 
bli_csgets( x, yr, yi ) #define bli_zgets( x, yr, yi ) bli_zdgets( x, yr, yi ) #define bli_igets( x, yr, yi ) bli_idgets( x, yr, yi ) #endif // end bli_gets.h // -- Scalar constant initialization macros -- // begin bli_constants.h #ifndef BLIS_CONSTANTS_H #define BLIS_CONSTANTS_H // return pointers to constants // 1 #define bli_s1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ONE ) ) #define bli_d1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ONE ) ) #define bli_c1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ONE ) ) #define bli_z1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ONE ) ) #define bli_i1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ONE ) ) // 0 #define bli_s0 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_ZERO ) ) #define bli_d0 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_ZERO ) ) #define bli_c0 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_ZERO ) ) #define bli_z0 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_ZERO ) ) #define bli_i0 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_ZERO ) ) // -1 #define bli_sm1 \ \ ( ( float* ) bli_obj_buffer_for_const( BLIS_FLOAT, &BLIS_MINUS_ONE ) ) #define bli_dm1 \ \ ( ( double* ) bli_obj_buffer_for_const( BLIS_DOUBLE, &BLIS_MINUS_ONE ) ) #define bli_cm1 \ \ ( ( scomplex* ) bli_obj_buffer_for_const( BLIS_SCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_zm1 \ \ ( ( dcomplex* ) bli_obj_buffer_for_const( BLIS_DCOMPLEX, &BLIS_MINUS_ONE ) ) #define bli_im1 \ \ ( ( gint_t* ) bli_obj_buffer_for_const( BLIS_INT, &BLIS_MINUS_ONE ) ) #endif // end bli_constants.h // -- Separated scalar macros (separated real/imaginary values) -- // begin bli_absq2ris.h #ifndef BLIS_ABSQ2RIS_H #define BLIS_ABSQ2RIS_H // absq2ris #define bli_sabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define bli_dabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar); \ } #define 
bli_cabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0F; \ } #define bli_zabsq2ris( ar, ai, br, bi ) \ { \ (br) = (ar) * (ar) + (ai) * (ai); \ (bi) = 0.0; \ } #endif // end bli_absq2ris.h // begin bli_abval2ris.h #ifndef BLIS_ABVAL2RIS_H #define BLIS_ABVAL2RIS_H // abval2ris #define bli_sabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabsf(xr); \ } #define bli_dabval2ris( xr, xi, ar, ai ) \ { \ (ar) = fabs(xr); \ } #define bli_cabval2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0F; \ } #define bli_zabval2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ (ar) = mag; \ (ai) = 0.0; \ } #endif // end bli_abval2ris.h // begin bli_addris.h #ifndef BLIS_ADDRIS_H #define BLIS_ADDRIS_H // addris #define bli_saddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_daddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ } #define bli_caddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #define bli_zaddris( ar, ai, xr, xi ) \ { \ (xr) = (xr) + (ar); \ (xi) = (xi) + (ai); \ } #endif // end bli_addris.h // begin bli_addjris.h #ifndef BLIS_ADDJRIS_H #define BLIS_ADDJRIS_H // addjris #define bli_saddjris( ar, ai, xr, xi ) bli_saddris( (ar), -(ai), (xr), (xi) ) #define bli_daddjris( ar, ai, xr, xi ) bli_daddris( (ar), -(ai), (xr), (xi) ) #define bli_caddjris( ar, ai, xr, xi ) bli_caddris( (ar), -(ai), (xr), (xi) ) #define bli_zaddjris( ar, ai, xr, xi ) bli_zaddris( (ar), -(ai), (xr), (xi) ) #endif // end bli_addjris.h // begin bli_add3ris.h #ifndef BLIS_ADD3RIS_H #define BLIS_ADD3RIS_H // add3ris #define bli_sadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_dadd3ris( ar, 
ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ } #define bli_cadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #define bli_zadd3ris( ar, ai, br, bi, cr, ci ) \ { \ (cr) = (ar) + (br); \ (ci) = (ai) + (bi); \ } #endif // end bli_add3ris.h // begin bli_axpbyris.h #ifndef BLIS_AXPBYRIS_H #define BLIS_AXPBYRIS_H // axpbyris #define bli_rxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) - (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) + (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyris bli_rxxpbyris #define bli_dsssxpbyris bli_rxxpbyris #define bli_csssxpbyris bli_rxxpbyris #define bli_zsssxpbyris bli_rxxpbyris #define bli_sdssxpbyris bli_rxxpbyris #define bli_ddssxpbyris bli_rxxpbyris #define bli_cdssxpbyris bli_rxxpbyris #define bli_zdssxpbyris bli_rxxpbyris #define bli_scssxpbyris bli_rxxpbyris #define bli_dcssxpbyris bli_rxxpbyris #define bli_ccssxpbyris bli_rxxpbyris #define bli_zcssxpbyris bli_rxxpbyris #define bli_szssxpbyris bli_rxxpbyris #define bli_dzssxpbyris bli_rxxpbyris #define bli_czssxpbyris bli_rxxpbyris #define bli_zzssxpbyris bli_rxxpbyris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyris. 
#define bli_saxpbyris bli_ssssaxpbyris #define bli_daxpbyris bli_ddddaxpbyris #define bli_caxpbyris bli_ccccaxpbyris #define bli_zaxpbyris bli_zzzzaxpbyris #endif // end bli_axpbyris.h // begin bli_axpbyjris.h #ifndef BLIS_AXPBYJRIS_H #define BLIS_AXPBYJRIS_H // axpbyjris #define bli_rxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (br) * (yr); \ } #define bli_cxaxpbyjris( ar, ai, xr, xi, br, bi, yr, yi ) \ { \ const __typeof__(yr) yt_r = (ar) * (xr) + (ai) * (xi) + (br) * (yr) - (bi) * (yi); \ const __typeof__(yi) yt_i = (ai) * (xr) - (ar) * (xi) + (bi) * (yr) + (br) * (yi); \ (yr) = yt_r; \ (yi) = yt_i; \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (??ss) ---------------------------------------------------------- #define bli_ssssxpbyjris bli_rxxpbyjris #define bli_dsssxpbyjris bli_rxxpbyjris #define bli_csssxpbyjris bli_rxxpbyjris #define bli_zsssxpbyjris bli_rxxpbyjris #define bli_sdssxpbyjris bli_rxxpbyjris #define bli_ddssxpbyjris bli_rxxpbyjris #define bli_cdssxpbyjris bli_rxxpbyjris #define bli_zdssxpbyjris bli_rxxpbyjris #define bli_scssxpbyjris bli_rxxpbyjris #define bli_dcssxpbyjris bli_rxxpbyjris #define bli_ccssxpbyjris bli_rxxpbyjris #define bli_zcssxpbyjris bli_rxxpbyjris #define bli_szssxpbyjris bli_rxxpbyjris #define bli_dzssxpbyjris bli_rxxpbyjris #define bli_czssxpbyjris bli_rxxpbyjris #define bli_zzssxpbyjris bli_rxxpbyjris // NOTE: This series needs to be finished for all other char values for (by), but // not until something in BLIS actually needs mixed-datatype axpbyjris. 
// Same-type short names for axpbyjris; each forwards to the four-character
// (a, x, b, y) alias.
#define bli_saxpbyjris bli_ssssaxpbyjris
#define bli_daxpbyjris bli_ddddaxpbyjris
#define bli_caxpbyjris bli_ccccaxpbyjris
#define bli_zaxpbyjris bli_zzzzaxpbyjris

#endif
// end bli_axpbyjris.h
// begin bli_axpyris.h

#ifndef BLIS_AXPYRIS_H
#define BLIS_AXPYRIS_H

// axpyris
//
// y := y + a * x on split real/imaginary operands. Five kernels cover the
// real/complex combinations; the alias tables below pick one per
// (a, x, y) type triple. All kernels accumulate directly into (yr)/(yi).

// All-real update: only ar, xr and yr are referenced.
#define bli_rxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
}
// Fully complex update.
#define bli_cxaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) - (ai) * (xi); \
	(yi) += (ai) * (xr) + (ar) * (xi); \
}
// Complex a and x, real-only output: only the real part of the product is
// accumulated; (yi) is not touched.
#define bli_roaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) - (ai) * (xi); \
}
// Real a (ai unused), complex x and y.
#define bli_craxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ar) * (xi); \
}
// Complex a, real x (xi unused), complex y.
#define bli_rcaxpyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyris bli_rxaxpyris
#define bli_dssaxpyris bli_rxaxpyris
#define bli_cssaxpyris bli_rxaxpyris
#define bli_zssaxpyris bli_rxaxpyris

#define bli_sdsaxpyris bli_rxaxpyris
#define bli_ddsaxpyris bli_rxaxpyris
#define bli_cdsaxpyris bli_rxaxpyris
#define bli_zdsaxpyris bli_rxaxpyris

#define bli_scsaxpyris bli_rxaxpyris
#define bli_dcsaxpyris bli_rxaxpyris
#define bli_ccsaxpyris bli_roaxpyris
#define bli_zcsaxpyris bli_roaxpyris

#define bli_szsaxpyris bli_rxaxpyris
#define bli_dzsaxpyris bli_rxaxpyris
#define bli_czsaxpyris bli_roaxpyris
#define bli_zzsaxpyris bli_roaxpyris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyris bli_rxaxpyris
#define bli_dsdaxpyris bli_rxaxpyris
#define bli_csdaxpyris bli_rxaxpyris
#define bli_zsdaxpyris bli_rxaxpyris

#define bli_sddaxpyris bli_rxaxpyris
#define bli_dddaxpyris bli_rxaxpyris
#define bli_cddaxpyris bli_rxaxpyris
#define bli_zddaxpyris bli_rxaxpyris

#define bli_scdaxpyris bli_rxaxpyris
#define bli_dcdaxpyris bli_rxaxpyris
#define bli_ccdaxpyris bli_roaxpyris
#define bli_zcdaxpyris bli_roaxpyris

#define bli_szdaxpyris bli_rxaxpyris
#define bli_dzdaxpyris bli_rxaxpyris
#define bli_czdaxpyris bli_roaxpyris
#define bli_zzdaxpyris bli_roaxpyris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyris bli_rxaxpyris
#define bli_dscaxpyris bli_rxaxpyris
#define bli_cscaxpyris bli_rcaxpyris
#define bli_zscaxpyris bli_rcaxpyris

#define bli_sdcaxpyris bli_rxaxpyris
#define bli_ddcaxpyris bli_rxaxpyris
#define bli_cdcaxpyris bli_rcaxpyris
#define bli_zdcaxpyris bli_rcaxpyris

#define bli_sccaxpyris bli_craxpyris
#define bli_dccaxpyris bli_craxpyris
#define bli_cccaxpyris bli_cxaxpyris
#define bli_zccaxpyris bli_cxaxpyris

#define bli_szcaxpyris bli_craxpyris
#define bli_dzcaxpyris bli_craxpyris
#define bli_czcaxpyris bli_cxaxpyris
#define bli_zzcaxpyris bli_cxaxpyris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyris bli_rxaxpyris
#define bli_dszaxpyris bli_rxaxpyris
#define bli_cszaxpyris bli_rcaxpyris
#define bli_zszaxpyris bli_rcaxpyris

#define bli_sdzaxpyris bli_rxaxpyris
#define bli_ddzaxpyris bli_rxaxpyris
#define bli_cdzaxpyris bli_rcaxpyris
#define bli_zdzaxpyris bli_rcaxpyris

#define bli_sczaxpyris bli_craxpyris
#define bli_dczaxpyris bli_craxpyris
#define bli_cczaxpyris bli_cxaxpyris
#define bli_zczaxpyris bli_cxaxpyris

#define bli_szzaxpyris bli_craxpyris
#define bli_dzzaxpyris bli_craxpyris
#define bli_czzaxpyris bli_cxaxpyris
#define bli_zzzaxpyris bli_cxaxpyris

// Same-type short names.
#define bli_saxpyris bli_sssaxpyris
#define bli_daxpyris bli_dddaxpyris
#define bli_caxpyris bli_cccaxpyris
#define bli_zaxpyris bli_zzzaxpyris

#endif
// end bli_axpyris.h
// begin bli_axpyjris.h

#ifndef BLIS_AXPYJRIS_H
#define BLIS_AXPYJRIS_H

// axpyjris
//
// y := y + a * conj(x) on split real/imaginary operands. Same kernel layout
// as axpyris; every term involving (xi) has its sign flipped.

// All-real update: only ar, xr and yr are referenced.
#define bli_rxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
}
// Fully complex update with conjugated x.
#define bli_cxaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) + (ai) * (xi); \
	(yi) += (ai) * (xr) - (ar) * (xi); \
}
// Complex a and x, real-only output; (yi) is not touched.
#define bli_roaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr) + (ai) * (xi); \
}
// Real a (ai unused), complex x and y; the conjugate appears as -(xi).
#define bli_craxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ar) * -(xi); \
}
// Complex a, real x (xi unused, so conjugation changes nothing), complex y.
#define bli_rcaxpyjris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) += (ar) * (xr); \
	(yi) += (ai) * (xr); \
}

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of y.

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssaxpyjris bli_rxaxpyjris
#define bli_dssaxpyjris bli_rxaxpyjris
#define bli_cssaxpyjris bli_rxaxpyjris
#define bli_zssaxpyjris bli_rxaxpyjris

#define bli_sdsaxpyjris bli_rxaxpyjris
#define bli_ddsaxpyjris bli_rxaxpyjris
#define bli_cdsaxpyjris bli_rxaxpyjris
#define bli_zdsaxpyjris bli_rxaxpyjris

#define bli_scsaxpyjris bli_rxaxpyjris
#define bli_dcsaxpyjris bli_rxaxpyjris
#define bli_ccsaxpyjris bli_roaxpyjris
#define bli_zcsaxpyjris bli_roaxpyjris

#define bli_szsaxpyjris bli_rxaxpyjris
#define bli_dzsaxpyjris bli_rxaxpyjris
#define bli_czsaxpyjris bli_roaxpyjris
#define bli_zzsaxpyjris bli_roaxpyjris

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdaxpyjris bli_rxaxpyjris
#define bli_dsdaxpyjris bli_rxaxpyjris
#define bli_csdaxpyjris bli_rxaxpyjris
#define bli_zsdaxpyjris bli_rxaxpyjris

#define bli_sddaxpyjris bli_rxaxpyjris
#define bli_dddaxpyjris bli_rxaxpyjris
#define bli_cddaxpyjris bli_rxaxpyjris
#define bli_zddaxpyjris bli_rxaxpyjris

#define bli_scdaxpyjris bli_rxaxpyjris
#define bli_dcdaxpyjris bli_rxaxpyjris
#define bli_ccdaxpyjris bli_roaxpyjris
#define bli_zcdaxpyjris bli_roaxpyjris

#define bli_szdaxpyjris bli_rxaxpyjris
#define bli_dzdaxpyjris bli_rxaxpyjris
#define bli_czdaxpyjris bli_roaxpyjris
#define bli_zzdaxpyjris bli_roaxpyjris

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscaxpyjris bli_rxaxpyjris
#define bli_dscaxpyjris bli_rxaxpyjris
#define bli_cscaxpyjris bli_rcaxpyjris
#define bli_zscaxpyjris bli_rcaxpyjris

#define bli_sdcaxpyjris bli_rxaxpyjris
#define bli_ddcaxpyjris bli_rxaxpyjris
#define bli_cdcaxpyjris bli_rcaxpyjris
#define bli_zdcaxpyjris bli_rcaxpyjris

#define bli_sccaxpyjris bli_craxpyjris
#define bli_dccaxpyjris bli_craxpyjris
#define bli_cccaxpyjris bli_cxaxpyjris
#define bli_zccaxpyjris bli_cxaxpyjris

#define bli_szcaxpyjris bli_craxpyjris
#define bli_dzcaxpyjris bli_craxpyjris
#define bli_czcaxpyjris bli_cxaxpyjris
#define bli_zzcaxpyjris bli_cxaxpyjris

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszaxpyjris bli_rxaxpyjris
#define bli_dszaxpyjris bli_rxaxpyjris
#define bli_cszaxpyjris bli_rcaxpyjris
#define bli_zszaxpyjris bli_rcaxpyjris

#define bli_sdzaxpyjris bli_rxaxpyjris
#define bli_ddzaxpyjris bli_rxaxpyjris
#define bli_cdzaxpyjris bli_rcaxpyjris
#define bli_zdzaxpyjris bli_rcaxpyjris

#define bli_sczaxpyjris bli_craxpyjris
#define bli_dczaxpyjris bli_craxpyjris
#define bli_cczaxpyjris bli_cxaxpyjris
#define bli_zczaxpyjris bli_cxaxpyjris

#define bli_szzaxpyjris bli_craxpyjris
#define bli_dzzaxpyjris bli_craxpyjris
#define bli_czzaxpyjris bli_cxaxpyjris
#define bli_zzzaxpyjris bli_cxaxpyjris

// Same-type short names.
#define bli_saxpyjris bli_sssaxpyjris
#define bli_daxpyjris bli_dddaxpyjris
#define bli_caxpyjris bli_cccaxpyjris
#define bli_zaxpyjris bli_zzzaxpyjris

#endif
// end bli_axpyjris.h
// begin bli_axmyris.h

#ifndef BLIS_AXMYRIS_H
#define BLIS_AXMYRIS_H

// axmyris
//
// y := y - a * x on split real/imaginary operands. Because the whole
// right-hand side of "-=" is evaluated first, the complex forms subtract the
// complete product component.

// Real single / double precision: only the real parts are referenced.
#define bli_saxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
}
#define bli_daxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
}
// Complex single / double precision.
#define bli_caxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr) - (ai) * (xi); \
	(yi) -= (ai) * (xr) + (ar) * (xi); \
}
#define bli_zaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr) - (ai) * (xi); \
	(yi) -= (ai) * (xr) + (ar) * (xi); \
}
// Real a applied to complex x and y (ai is never referenced).
#define bli_scaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
	(yi) -= (ar) * (xi); \
}
#define bli_dzaxmyris( ar, ai, xr, xi, yr, yi ) \
{ \
	(yr) -= (ar) * (xr); \
	(yi) -= (ar) * (xi); \
}

#endif
// end bli_axmyris.h
// begin bli_conjris.h

#ifndef BLIS_CONJRIS_H
#define BLIS_CONJRIS_H

// conjris
//
// In-place conjugation. The real variants expand to an empty statement; the
// complex variants negate the imaginary part.

#define bli_sconjris( xr, xi ) \
{ \
	; \
}
#define bli_dconjris( xr, xi ) \
{ \
	; \
}
#define bli_cconjris( xr, xi ) \
{ \
	(xi) = -(xi); \
}
#define bli_zconjris( xr, xi ) \
{ \
	(xi) = -(xi); \
}

#endif
// end bli_conjris.h
// begin bli_copyris.h

#ifndef BLIS_COPYRIS_H
#define BLIS_COPYRIS_H

// copyris
//
// b := a on split real/imaginary operands. The leading type char names the
// destination: the s/d forms write only (br), the c/z forms write both
// (br) and (bi).

#define bli_scopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
}
#define bli_dcopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
}
#define bli_ccopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
	(bi) = (ai); \
}
#define bli_zcopyris( ar, ai, br, bi ) \
{ \
	(br) = (ar); \
	(bi) = (ai); \
}

// Two-character forms: the first char is the source type, the second the
// destination type (it selects the kernel above). When the source is real
// (s or d) the caller's ai is discarded and a literal zero is passed instead.

#define bli_sscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0F, br, bi )
#define bli_dscopyris( ar, ai, br, bi ) bli_scopyris( ar, 0.0, br, bi )
#define bli_cscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )
#define bli_zscopyris( ar, ai, br, bi ) bli_scopyris( ar, ai, br, bi )

#define bli_sdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0F, br, bi )
#define bli_ddcopyris( ar, ai, br, bi ) bli_dcopyris( ar, 0.0, br, bi )
#define bli_cdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )
#define bli_zdcopyris( ar, ai, br, bi ) bli_dcopyris( ar, ai, br, bi )

#define bli_sccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0F, br, bi )
#define bli_dccopyris( ar, ai, br, bi ) bli_ccopyris( ar, 0.0, br, bi )
#define bli_cccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )
#define bli_zccopyris( ar, ai, br, bi ) bli_ccopyris( ar, ai, br, bi )

#define bli_szcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0F, br, bi )
#define bli_dzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, 0.0, br, bi )
#define bli_czcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )
#define bli_zzcopyris( ar, ai, br, bi ) bli_zcopyris( ar, ai, br, bi )

#endif
// end bli_copyris.h
// begin bli_copyjris.h

#ifndef BLIS_COPYJRIS_H
#define BLIS_COPYJRIS_H

// copyjris
//
// b := conj(a): forwards to the matching copyris macro with the imaginary
// argument negated.

#define bli_scopyjris( ar, ai, br, bi ) bli_scopyris( (ar), -(ai), (br), (bi) )
#define bli_dcopyjris( ar, ai, br, bi ) bli_dcopyris( (ar), -(ai), (br), (bi) )
#define bli_ccopyjris( ar, ai, br, bi ) bli_ccopyris( (ar), -(ai), (br), (bi) )
#define bli_zcopyjris( ar, ai, br, bi ) bli_zcopyris( (ar), -(ai), (br), (bi) )

// Two-character (source, destination) forms; a real source passes a literal
// zero as the imaginary part.

#define bli_sscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0F, br, bi )
#define bli_dscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, 0.0, br, bi )
#define bli_cscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )
#define bli_zscopyjris( ar, ai, br, bi ) bli_scopyjris( ar, ai, br, bi )

#define bli_sdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0F, br, bi )
#define bli_ddcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, 0.0, br, bi )
#define bli_cdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )
#define bli_zdcopyjris( ar, ai, br, bi ) bli_dcopyjris( ar, ai, br, bi )

#define bli_sccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0F, br, bi )
#define bli_dccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, 0.0, br, bi )
#define bli_cccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )
#define bli_zccopyjris( ar, ai, br, bi ) bli_ccopyjris( ar, ai, br, bi )

#define bli_szcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0F, br, bi )
#define bli_dzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, 0.0, br, bi )
#define bli_czcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )
#define bli_zzcopyjris( ar, ai, br, bi ) bli_zcopyjris( ar, ai, br, bi )

#endif
// end bli_copyjris.h
// begin bli_copycjris.h

#ifndef BLIS_COPYCJRIS_H
#define BLIS_COPYCJRIS_H

// copycjris
//
// y := x or y := conj(x), selected at run time by the conj argument via
// bli_is_conj() (defined elsewhere). The real variants ignore conj and
// forward to the plain copy.

#define bli_scopycjris( conj, xr, xi, yr, yi ) \
{ \
	bli_scopyris( (xr), (xi), (yr), (yi) ); \
}
#define bli_dcopycjris( conj, xr, xi, yr, yi ) \
{ \
	bli_dcopyris( (xr), (xi), (yr), (yi) ); \
}
#define bli_ccopycjris( conj, xr, xi, yr, yi ) \
{ \
	(yr) = (xr); \
	(yi) = ( bli_is_conj( conj ) ? -(xi) \
	                             :  (xi) ); \
}
#define bli_zcopycjris( conj, xr, xi, yr, yi ) \
{ \
	(yr) = (xr); \
	(yi) = ( bli_is_conj( conj ) ? -(xi) \
	                             :  (xi) ); \
}
// NOTE(review): bli_icopyris is not defined in the copyris section above;
// confirm it is provided elsewhere before using this macro.
#define bli_icopycjris( conj, xr, xi, yr, yi ) \
{ \
	bli_icopyris( (xr), (xi), (yr), (yi) ); \
}

#endif
// end bli_copycjris.h
// begin bli_eqris.h

#ifndef BLIS_EQRIS_H
#define BLIS_EQRIS_H

// eqris (passed by value)
//
// Expression macros (not statement blocks) that yield the result of an exact
// == comparison. The real and integer forms compare real parts only; the
// complex forms require both components to match.

#define bli_seqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_deqris( ar, ai, br, bi ) ( (ar) == (br) )
#define bli_ceqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_zeqris( ar, ai, br, bi ) ( (ar) == (br) && (ai) == (bi) )
#define bli_ieqris( ar, ai, br, bi ) ( (ar) == (br) )

// eq1ris: compare against 1 + 0i.

#define bli_seq1ris( ar, ai ) bli_seqris( (ar), (ai), 1.0F, 0.0F )
#define bli_deq1ris( ar, ai ) bli_deqris( (ar), (ai), 1.0, 0.0 )
#define bli_ceq1ris( ar, ai ) bli_ceqris( (ar), (ai), 1.0F, 0.0F )
#define bli_zeq1ris( ar, ai ) bli_zeqris( (ar), (ai), 1.0, 0.0 )
#define bli_ieq1ris( ar, ai ) bli_ieqris( (ar), (ai), 1, 0 )

// eq0ris: compare against 0 + 0i.

#define bli_seq0ris( ar, ai ) bli_seqris( (ar), (ai), 0.0F, 0.0F )
#define bli_deq0ris( ar, ai ) bli_deqris( (ar), (ai), 0.0, 0.0 )
#define bli_ceq0ris( ar, ai ) bli_ceqris( (ar), (ai), 0.0F, 0.0F )
#define bli_zeq0ris( ar, ai ) bli_zeqris( (ar), (ai), 0.0, 0.0 )
#define bli_ieq0ris( ar, ai ) bli_ieqris( (ar), (ai), 0, 0 )

// eqm1ris: compare against -1 + 0i.

#define bli_seqm1ris( ar, ai ) bli_seqris( (ar), (ai), -1.0F, 0.0F )
#define bli_deqm1ris( ar, ai ) bli_deqris( (ar), (ai), -1.0, 0.0 )
#define bli_ceqm1ris( ar, ai ) bli_ceqris( (ar), (ai), -1.0F, 0.0F )
#define bli_zeqm1ris( ar, ai ) bli_zeqris( (ar), (ai), -1.0, 0.0 )
#define bli_ieqm1ris( ar, ai ) bli_ieqris( (ar), (ai), -1, 0 )

#endif
// end bli_eqris.h
// begin bli_invertris.h

#ifndef BLIS_INVERTRIS_H
#define BLIS_INVERTRIS_H

// invertris
//
// In-place reciprocal x := 1 / x on split real/imaginary operands.

// Real single precision: (xi) is never referenced.
#define bli_sinvertris( xr, xi ) \
{ \
	(xr) = 1.0F / (xr); \
}
#define bli_dinvertris( xr, xi ) \ { \ (xr) = 1.0 / (xr); \ } #define bli_cinvertris( xr, xi ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float xr_s = (xr) / s; \ float xi_s = (xi) / s; \ float temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #define bli_zinvertris( xr, xi ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double xr_s = (xr) / s; \ double xi_s = (xi) / s; \ double temp = ( xr_s * (xr) + xi_s * (xi) ); \ (xr) = xr_s / temp; \ (xi) = -xi_s / temp; \ } #endif // end bli_invertris.h // begin bli_invscalris.h #ifndef BLIS_INVSCALRIS_H #define BLIS_INVSCALRIS_H // invscalris #define bli_sinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_dinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ } #define bli_cinvscalris( ar, ai, xr, xi ) \ { \ float s = bli_fmaxabs( (ar), (ai) ); \ float ar_s = (ar) / s; \ float ai_s = (ai) / s; \ float xrt = (xr); \ float temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_zinvscalris( ar, ai, xr, xi ) \ { \ double s = bli_fmaxabs( (ar), (ai) ); \ double ar_s = (ar) / s; \ double ai_s = (ai) / s; \ double xrt = (xr); \ double temp = ( ar_s * (ar) + ai_s * (ai) ); \ (xr) = ( (xrt) * ar_s + (xi) * ai_s ) / temp; \ (xi) = ( (xi) * ar_s - (xrt) * ai_s ) / temp; \ } #define bli_scinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #define bli_dzinvscalris( ar, ai, xr, xi ) \ { \ (xr) /= (ar); \ (xi) /= (ar); \ } #endif // end bli_invscalris.h // begin bli_invscaljris.h #ifndef BLIS_INVSCALJRIS_H #define BLIS_INVSCALJRIS_H // invscaljris #define bli_sinvscaljris( ar, ai, xr, xi ) bli_sinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dinvscaljris( ar, ai, xr, xi ) bli_dinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_cinvscaljris( ar, ai, xr, xi ) bli_cinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_zinvscaljris( ar, ai, xr, xi ) bli_zinvscalris( (ar), -(ai), (xr), 
(xi) ) #define bli_scinvscaljris( ar, ai, xr, xi ) bli_scinvscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzinvscaljris( ar, ai, xr, xi ) bli_dzinvscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_invscaljris.h // begin bli_neg2ris.h #ifndef BLIS_NEG2RIS_H #define BLIS_NEG2RIS_H // neg2ris #define bli_sneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_dneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ } #define bli_cneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #define bli_zneg2ris( ar, ai, br, bi ) \ { \ (br) = -(ar); \ (bi) = -(ai); \ } #endif // end bli_neg2ris.h // begin bli_scalris.h #ifndef BLIS_SCALRIS_H #define BLIS_SCALRIS_H // scalris #define bli_sscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_dscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ } #define bli_cscalris( ar, ai, xr, xi ) \ { \ float yr = (ar) * (xr) - (ai) * (xi); \ float yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_zscalris( ar, ai, xr, xi ) \ { \ double yr = (ar) * (xr) - (ai) * (xi); \ double yi = (ai) * (xr) + (ar) * (xi); \ (xr) = yr; \ (xi) = yi; \ } #define bli_scscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #define bli_dzscalris( ar, ai, xr, xi ) \ { \ (xr) = (ar) * (xr); \ (xi) = (ar) * (xi); \ } #endif // end bli_scalris.h // begin bli_scaljris.h #ifndef BLIS_SCALJRIS_H #define BLIS_SCALJRIS_H // scaljris #define bli_sscaljris( ar, ai, xr, xi ) bli_sscalris( (ar), -(ai), (xr), (xi) ) #define bli_dscaljris( ar, ai, xr, xi ) bli_dscalris( (ar), -(ai), (xr), (xi) ) #define bli_cscaljris( ar, ai, xr, xi ) bli_cscalris( (ar), -(ai), (xr), (xi) ) #define bli_zscaljris( ar, ai, xr, xi ) bli_zscalris( (ar), -(ai), (xr), (xi) ) #define bli_scscaljris( ar, ai, xr, xi ) bli_scscalris( (ar), -(ai), (xr), (xi) ) #define bli_dzscaljris( ar, ai, xr, xi ) bli_dzscalris( (ar), -(ai), (xr), (xi) ) #endif // end bli_scaljris.h // begin bli_scalcjris.h #ifndef BLIS_SCALCJRIS_H #define 
BLIS_SCALCJRIS_H // scalcjris #define bli_sscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_sscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_cscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_cscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_cscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_zscalcjris( conj, ar, ai, xr, xi ) \ { \ if ( bli_is_conj( conj ) ) { bli_zscaljris( (ar), (ai), (xr), (xi) ); } \ else { bli_zscalris( (ar), (ai), (xr), (xi) ); } \ } #define bli_iscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_iscalris( (ar), (xi), (xr), (xi) ); \ } #define bli_scscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_scscalris( (ar), (ai), (xr), (xi) ); \ } #define bli_dzscalcjris( conj, ar, ai, xr, xi ) \ { \ bli_dzscalris( (ar), (ai), (xr), (xi) ); \ } #endif // end bli_scalcjris.h // begin bli_scal2ris.h #ifndef BLIS_SCAL2RIS_H #define BLIS_SCAL2RIS_H // scal2ris #define bli_rxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ (yi) = (ai) * (xr) + (ar) * (xi); \ } #define bli_roscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) - (ai) * (xi); \ } #define bli_crscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * (xi); \ } #define bli_rcscal2ris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2ris bli_rxscal2ris #define bli_dssscal2ris bli_rxscal2ris #define bli_cssscal2ris bli_rxscal2ris #define bli_zssscal2ris bli_rxscal2ris #define bli_sdsscal2ris bli_rxscal2ris #define bli_ddsscal2ris bli_rxscal2ris #define bli_cdsscal2ris bli_rxscal2ris #define bli_zdsscal2ris bli_rxscal2ris #define bli_scsscal2ris bli_rxscal2ris #define bli_dcsscal2ris bli_rxscal2ris #define bli_ccsscal2ris bli_roscal2ris #define bli_zcsscal2ris bli_roscal2ris #define bli_szsscal2ris bli_rxscal2ris #define bli_dzsscal2ris bli_rxscal2ris #define bli_czsscal2ris bli_roscal2ris #define bli_zzsscal2ris bli_roscal2ris // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2ris bli_rxscal2ris #define bli_dsdscal2ris bli_rxscal2ris #define bli_csdscal2ris bli_rxscal2ris #define bli_zsdscal2ris bli_rxscal2ris #define bli_sddscal2ris bli_rxscal2ris #define bli_dddscal2ris bli_rxscal2ris #define bli_cddscal2ris bli_rxscal2ris #define bli_zddscal2ris bli_rxscal2ris #define bli_scdscal2ris bli_rxscal2ris #define bli_dcdscal2ris bli_rxscal2ris #define bli_ccdscal2ris bli_roscal2ris #define bli_zcdscal2ris bli_roscal2ris #define bli_szdscal2ris bli_rxscal2ris #define bli_dzdscal2ris bli_rxscal2ris #define bli_czdscal2ris bli_roscal2ris #define bli_zzdscal2ris bli_roscal2ris // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2ris bli_rxscal2ris #define bli_dscscal2ris bli_rxscal2ris #define bli_cscscal2ris bli_rcscal2ris #define bli_zscscal2ris bli_rcscal2ris #define bli_sdcscal2ris bli_rxscal2ris #define bli_ddcscal2ris bli_rxscal2ris #define bli_cdcscal2ris bli_rcscal2ris #define bli_zdcscal2ris bli_rcscal2ris #define bli_sccscal2ris bli_crscal2ris #define bli_dccscal2ris bli_crscal2ris #define bli_cccscal2ris bli_cxscal2ris #define bli_zccscal2ris bli_cxscal2ris #define bli_szcscal2ris bli_crscal2ris 
#define bli_dzcscal2ris bli_crscal2ris #define bli_czcscal2ris bli_cxscal2ris #define bli_zzcscal2ris bli_cxscal2ris // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2ris bli_rxscal2ris #define bli_dszscal2ris bli_rxscal2ris #define bli_cszscal2ris bli_rcscal2ris #define bli_zszscal2ris bli_rcscal2ris #define bli_sdzscal2ris bli_rxscal2ris #define bli_ddzscal2ris bli_rxscal2ris #define bli_cdzscal2ris bli_rcscal2ris #define bli_zdzscal2ris bli_rcscal2ris #define bli_sczscal2ris bli_crscal2ris #define bli_dczscal2ris bli_crscal2ris #define bli_cczscal2ris bli_cxscal2ris #define bli_zczscal2ris bli_cxscal2ris #define bli_szzscal2ris bli_crscal2ris #define bli_dzzscal2ris bli_crscal2ris #define bli_czzscal2ris bli_cxscal2ris #define bli_zzzscal2ris bli_cxscal2ris #define bli_sscal2ris bli_sssscal2ris #define bli_dscal2ris bli_dddscal2ris #define bli_cscal2ris bli_cccscal2ris #define bli_zscal2ris bli_zzzscal2ris #endif // end bli_scal2ris.h // begin bli_scal2jris.h #ifndef BLIS_SCAL2JRIS_H #define BLIS_SCAL2JRIS_H // scal2jris #define bli_rxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ } #define bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ (yi) = (ai) * (xr) - (ar) * (xi); \ } #define bli_roscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr) + (ai) * (xi); \ } #define bli_crscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ar) * -(xi); \ } #define bli_rcscal2jris( ar, ai, xr, xi, yr, yi ) \ { \ (yr) = (ar) * (xr); \ (yi) = (ai) * (xr); \ } // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// Mixed-datatype dispatch table for scal2jris. Unlike the scal2ris table,
// these aliases are function-like macros that forward all six arguments
// unchanged to one of the kernels bli_{rx,cx,ro,cr,rc}scal2jris:
//   rx - real-only product
//   ro - a and x complex, y real (real part of the product only)
//   rc - a complex, x real, y complex
//   cr - a real, x complex, y complex
//   cx - a, x and y all complex

// -- (axy) = (??s) ------------------------------------------------------------

#define bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zssscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzsscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??d) ------------------------------------------------------------

#define bli_ssdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_csdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zsdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zddscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_scdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ccdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zcdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzdscal2jris( ar, ai, xr, xi, yr, yi ) bli_roscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zscscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdcscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dccscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zccscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_czcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zzcscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zszscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_ddzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zdzscal2jris( ar, ai, xr, xi, yr, yi ) bli_rcscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_sczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dczscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_cczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_zczscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi )

#define bli_szzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi )
#define bli_dzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_crscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_czzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) bli_cxscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_sscal2jris( ar, ai, xr, xi, yr, yi ) bli_sssscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_dscal2jris( ar, ai, xr, xi, yr, yi ) bli_dddscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_cscal2jris( ar, ai, xr, xi, yr, yi ) bli_cccscal2jris( ar, ai, xr, xi, yr, yi ) #define bli_zscal2jris( ar, ai, xr, xi, yr, yi ) bli_zzzscal2jris( ar, ai, xr, xi, yr, yi ) #endif // end bli_scal2jris.h // begin bli_set0ris.h #ifndef BLIS_SET0RIS_H #define BLIS_SET0RIS_H // set0ris #define bli_sset0ris( xr, xi ) bli_scopyris( 0.0F, 0.0F, xr, xi ) #define bli_dset0ris( xr, xi ) bli_dcopyris( 0.0 , 0.0 , xr, xi ) #define bli_cset0ris( xr, xi ) bli_ccopyris( 0.0F, 0.0F, xr, xi ) #define bli_zset0ris( xr, xi ) bli_zcopyris( 0.0 , 0.0 , xr, xi ) #endif // end bli_set0ris.h // begin bli_sqrt2ris.h #ifndef BLIS_SQRT2RIS_H #define BLIS_SQRT2RIS_H // sqrt2ris #define bli_ssqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ } #define bli_dsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ } #define bli_csqrt2ris( xr, xi, ar, ai ) \ { \ float s = bli_fmaxabs( (xr), (xi) ); \ float mag; \ if ( s == 0.0F ) mag = 0.0F; \ else \ { \ mag = sqrtf( s ) * \ sqrtf( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrtf( ( mag + (xr) ) / 2.0F ); \ (ai) = sqrtf( ( mag - (xi) ) / 2.0F ); \ } #define bli_zsqrt2ris( xr, xi, ar, ai ) \ { \ double s = bli_fmaxabs( (xr), (xi) ); \ double mag; \ if ( s == 0.0 ) mag = 0.0; \ else \ { \ mag = sqrt( s ) * \ sqrt( ( (xr) / s ) * (xr) + \ ( (xi) / s ) * (xi) ); \ } \ \ (ar) = sqrt( ( mag + (xr) ) / 2.0 ); \ (ai) = sqrt( ( mag - (xi) ) / 2.0 ); \ } #define bli_scsqrt2ris( xr, xi, ar, ai ) \ { \ (ar) = sqrtf( (xr) ); \ (ai) = 0.0F; \ } #define bli_dzsqrt2ris( xr, 
xi, ar, ai ) \ { \ (ar) = sqrt( (xr) ); \ (ai) = 0.0; \ } #endif // end bli_sqrt2ris.h // begin bli_subris.h #ifndef BLIS_SUBRIS_H #define BLIS_SUBRIS_H // subris #define bli_ssubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_dsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ } #define bli_csubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #define bli_zsubris( ar, ai, xr, xi ) \ { \ (xr) = (xr) - (ar); \ (xi) = (xi) - (ai); \ } #endif // end bli_subris.h // begin bli_subjris.h #ifndef BLIS_SUBJRIS_H #define BLIS_SUBJRIS_H // subjris #define bli_ssubjris( ar, ai, xr, xi ) bli_ssubris( (ar), -(ai), (xr), (xi) ) #define bli_dsubjris( ar, ai, xr, xi ) bli_dsubris( (ar), -(ai), (xr), (xi) ) #define bli_csubjris( ar, ai, xr, xi ) bli_csubris( (ar), -(ai), (xr), (xi) ) #define bli_zsubjris( ar, ai, xr, xi ) bli_zsubris( (ar), -(ai), (xr), (xi) ) #endif // end bli_subjris.h // begin bli_swapris.h #ifndef BLIS_SWAPRIS_H #define BLIS_SWAPRIS_H // swapris #define bli_sswapris( ar, ai, br, bi ) \ { \ float tr, ti; \ \ bli_scopyris( (br), (bi), (tr), (ti) ); \ bli_scopyris( (ar), (ai), (br), (bi) ); \ bli_scopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_dswapris( ar, ai, br, bi ) \ { \ double tr, ti; \ \ bli_dcopyris( (br), (bi), (tr), (ti) ); \ bli_dcopyris( (ar), (ai), (br), (bi) ); \ bli_dcopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_cswapris( ar, ai, br, bi ) \ { \ scomplex tr, ti; \ \ bli_ccopyris( (br), (bi), (tr), (ti) ); \ bli_ccopyris( (ar), (ai), (br), (bi) ); \ bli_ccopyris( (tr), (ti), (ar), (ai) ); \ } #define bli_zswapris( ar, ai, br, bi ) \ { \ dcomplex tr, ti; \ \ bli_zcopyris( (br), (bi), (tr), (ti) ); \ bli_zcopyris( (ar), (ai), (br), (bi) ); \ bli_zcopyris( (tr), (ti), (ar), (ai) ); \ } #endif // end bli_swapris.h // begin bli_xpbyris.h #ifndef BLIS_XPBYRIS_H #define BLIS_XPBYRIS_H // xpbyris #define bli_rxxpbyris( xr, xi, br, bi, yr, yi ) \ { \ (yr) = (xr) + (br) * (yr); \ } #define bli_cxxpbyris( xr, 
xi, br, bi, yr, yi ) \
{ \
	const __typeof__(yr) yt_r = (xr) + (br) * (yr) - (bi) * (yi); \
	const __typeof__(yi) yt_i = (xi) + (bi) * (yr) + (br) * (yi); \
	(yr) = yt_r; \
	(yi) = yt_i; \
}

// cr kernel: complex x and y with a real beta (bi is not referenced), so each
// component of y is scaled by br and offset by the matching component of x.
// As in the cx kernel, the new values are staged in yt_r/yt_i and stored last.
#define bli_crxpbyris( xr, xi, br, bi, yr, yi ) \
{ \
	const __typeof__(yr) yt_r = (xr) + (br) * (yr); \
	const __typeof__(yi) yt_i = (xi) + (br) * (yi); \
	(yr) = yt_r; \
	(yi) = yt_i; \
}

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
//
// Kernel selection used by the tables below:
// - y real (??s, ??d):                  rx kernel in every case.
// - y complex, b real,    x real:       rx kernel.
// - y complex, b real,    x complex:    cr kernel.
// - y complex, b complex:               cx kernel.
//
// NOTE(review): the rx kernel writes only yr, so for the ssc/dsc/sdc/ddc and
// ssz/dsz/sdz/ddz entries the imaginary part of a complex y is left unscaled
// by beta -- confirm that callers expect this.

// -- (xby) = (??s) ------------------------------------------------------------

#define bli_sssxpbyris bli_rxxpbyris
#define bli_dssxpbyris bli_rxxpbyris
#define bli_cssxpbyris bli_rxxpbyris
#define bli_zssxpbyris bli_rxxpbyris

#define bli_sdsxpbyris bli_rxxpbyris
#define bli_ddsxpbyris bli_rxxpbyris
#define bli_cdsxpbyris bli_rxxpbyris
#define bli_zdsxpbyris bli_rxxpbyris

#define bli_scsxpbyris bli_rxxpbyris
#define bli_dcsxpbyris bli_rxxpbyris
#define bli_ccsxpbyris bli_rxxpbyris
#define bli_zcsxpbyris bli_rxxpbyris

#define bli_szsxpbyris bli_rxxpbyris
#define bli_dzsxpbyris bli_rxxpbyris
#define bli_czsxpbyris bli_rxxpbyris
#define bli_zzsxpbyris bli_rxxpbyris

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbyris bli_rxxpbyris
#define bli_dsdxpbyris bli_rxxpbyris
#define bli_csdxpbyris bli_rxxpbyris
#define bli_zsdxpbyris bli_rxxpbyris

#define bli_sddxpbyris bli_rxxpbyris
#define bli_dddxpbyris bli_rxxpbyris
#define bli_cddxpbyris bli_rxxpbyris
#define bli_zddxpbyris bli_rxxpbyris

#define bli_scdxpbyris bli_rxxpbyris
#define bli_dcdxpbyris bli_rxxpbyris
#define bli_ccdxpbyris bli_rxxpbyris
#define bli_zcdxpbyris bli_rxxpbyris

#define bli_szdxpbyris bli_rxxpbyris
#define bli_dzdxpbyris bli_rxxpbyris
#define bli_czdxpbyris bli_rxxpbyris
#define bli_zzdxpbyris bli_rxxpbyris

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbyris bli_rxxpbyris
#define bli_dscxpbyris bli_rxxpbyris
#define bli_cscxpbyris bli_crxpbyris
#define bli_zscxpbyris bli_crxpbyris

#define bli_sdcxpbyris bli_rxxpbyris
#define bli_ddcxpbyris bli_rxxpbyris
#define bli_cdcxpbyris bli_crxpbyris
#define bli_zdcxpbyris bli_crxpbyris

#define bli_sccxpbyris bli_cxxpbyris
#define bli_dccxpbyris bli_cxxpbyris
#define bli_cccxpbyris bli_cxxpbyris
#define bli_zccxpbyris bli_cxxpbyris

#define bli_szcxpbyris bli_cxxpbyris
#define bli_dzcxpbyris bli_cxxpbyris
#define bli_czcxpbyris bli_cxxpbyris
#define bli_zzcxpbyris bli_cxxpbyris

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbyris bli_rxxpbyris
#define bli_dszxpbyris bli_rxxpbyris
#define bli_cszxpbyris bli_crxpbyris
#define bli_zszxpbyris bli_crxpbyris

#define bli_sdzxpbyris bli_rxxpbyris
#define bli_ddzxpbyris bli_rxxpbyris
#define bli_cdzxpbyris bli_crxpbyris
#define bli_zdzxpbyris bli_crxpbyris

#define bli_sczxpbyris bli_cxxpbyris
#define bli_dczxpbyris bli_cxxpbyris
#define bli_cczxpbyris bli_cxxpbyris
#define bli_zczxpbyris bli_cxxpbyris

#define bli_szzxpbyris bli_cxxpbyris
#define bli_dzzxpbyris bli_cxxpbyris
#define bli_czzxpbyris bli_cxxpbyris
#define bli_zzzxpbyris bli_cxxpbyris

// Homogeneous-datatype shorthands: x, b and y share one type.
#define bli_sxpbyris bli_sssxpbyris
#define bli_dxpbyris bli_dddxpbyris
#define bli_cxpbyris bli_cccxpbyris
#define bli_zxpbyris bli_zzzxpbyris

#endif
// end bli_xpbyris.h
// begin bli_xpbyjris.h

#ifndef BLIS_XPBYJRIS_H
#define BLIS_XPBYJRIS_H

// xpbyjris
//
// Conjugating counterpart of xpbyris: y := conj( x ) + beta * y. The kernels
// mirror the xpbyris ones except that xi enters the imaginary update negated.

// rx kernel: touches only the real component of y (conjugation has no effect
// on the real part, so the body matches the non-conjugating rx kernel).
#define bli_rxxpbyjris( xr, xi, br, bi, yr, yi ) \
{ \
	(yr) = (xr) + (br) * (yr); \
}

// cx kernel: complex beta. Both new components are computed from the OLD
// (yr, yi) into yt_r/yt_i before either is stored, because the imaginary
// update reads yr.
#define bli_cxxpbyjris( xr, xi, br, bi, yr, yi ) \
{ \
	const __typeof__(yr) yt_r =  (xr) + (br) * (yr) - (bi) * (yi); \
	const __typeof__(yi) yt_i = -(xi) + (bi) * (yr) + (br) * (yi); \
	(yr) = yt_r; \
	(yi) = yt_i; \
}

// cr kernel: real beta (bi is not referenced).
#define bli_crxpbyjris( xr, xi, br, bi, yr, yi ) \
{ \
	const __typeof__(yr) yt_r =  (xr) + (br) * (yr); \
	const __typeof__(yi) yt_i = -(xi) + (br) * (yi); \
	(yr) = yt_r; \
	(yi) = yt_i; \
}

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.

// -- (xby) = (??s) ------------------------------------------------------------

#define bli_sssxpbyjris bli_rxxpbyjris
#define bli_dssxpbyjris bli_rxxpbyjris
#define bli_cssxpbyjris bli_rxxpbyjris
#define bli_zssxpbyjris bli_rxxpbyjris

#define bli_sdsxpbyjris bli_rxxpbyjris
#define bli_ddsxpbyjris bli_rxxpbyjris
#define bli_cdsxpbyjris bli_rxxpbyjris
#define bli_zdsxpbyjris bli_rxxpbyjris

#define bli_scsxpbyjris bli_rxxpbyjris
#define bli_dcsxpbyjris bli_rxxpbyjris
#define bli_ccsxpbyjris bli_rxxpbyjris
#define bli_zcsxpbyjris bli_rxxpbyjris

#define bli_szsxpbyjris bli_rxxpbyjris
#define bli_dzsxpbyjris bli_rxxpbyjris
#define bli_czsxpbyjris bli_rxxpbyjris
#define bli_zzsxpbyjris bli_rxxpbyjris

// -- (xby) = (??d) ------------------------------------------------------------

#define bli_ssdxpbyjris bli_rxxpbyjris
#define bli_dsdxpbyjris bli_rxxpbyjris
#define bli_csdxpbyjris bli_rxxpbyjris
#define bli_zsdxpbyjris bli_rxxpbyjris

#define bli_sddxpbyjris bli_rxxpbyjris
#define bli_dddxpbyjris bli_rxxpbyjris
#define bli_cddxpbyjris bli_rxxpbyjris
#define bli_zddxpbyjris bli_rxxpbyjris

#define bli_scdxpbyjris bli_rxxpbyjris
#define bli_dcdxpbyjris bli_rxxpbyjris
#define bli_ccdxpbyjris bli_rxxpbyjris
#define bli_zcdxpbyjris bli_rxxpbyjris

#define bli_szdxpbyjris bli_rxxpbyjris
#define bli_dzdxpbyjris bli_rxxpbyjris
#define bli_czdxpbyjris bli_rxxpbyjris
#define bli_zzdxpbyjris bli_rxxpbyjris

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbyjris bli_rxxpbyjris
#define bli_dscxpbyjris bli_rxxpbyjris
#define bli_cscxpbyjris bli_crxpbyjris
#define bli_zscxpbyjris bli_crxpbyjris

#define bli_sdcxpbyjris bli_rxxpbyjris
#define bli_ddcxpbyjris bli_rxxpbyjris
#define bli_cdcxpbyjris bli_crxpbyjris
#define bli_zdcxpbyjris bli_crxpbyjris

#define bli_sccxpbyjris bli_cxxpbyjris
#define bli_dccxpbyjris bli_cxxpbyjris
#define bli_cccxpbyjris 
bli_cxxpbyjris #define bli_zccxpbyjris bli_cxxpbyjris #define bli_szcxpbyjris bli_cxxpbyjris #define bli_dzcxpbyjris bli_cxxpbyjris #define bli_czcxpbyjris bli_cxxpbyjris #define bli_zzcxpbyjris bli_cxxpbyjris // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjris bli_rxxpbyjris #define bli_dszxpbyjris bli_rxxpbyjris #define bli_cszxpbyjris bli_crxpbyjris #define bli_zszxpbyjris bli_crxpbyjris #define bli_sdzxpbyjris bli_rxxpbyjris #define bli_ddzxpbyjris bli_rxxpbyjris #define bli_cdzxpbyjris bli_crxpbyjris #define bli_zdzxpbyjris bli_crxpbyjris #define bli_sczxpbyjris bli_cxxpbyjris #define bli_dczxpbyjris bli_cxxpbyjris #define bli_cczxpbyjris bli_cxxpbyjris #define bli_zczxpbyjris bli_cxxpbyjris #define bli_szzxpbyjris bli_cxxpbyjris #define bli_dzzxpbyjris bli_cxxpbyjris #define bli_czzxpbyjris bli_cxxpbyjris #define bli_zzzxpbyjris bli_cxxpbyjris #define bli_sxpbyjris bli_sssxpbyjris #define bli_dxpbyjris bli_dddxpbyjris #define bli_cxpbyjris bli_cccxpbyjris #define bli_zxpbyjris bli_zzzxpbyjris #endif // end bli_xpbyjris.h // Inlined scalar macros in loops // begin bli_scal2ris_mxn.h #ifndef BLIS_SCAL2RIS_MXN_H #define BLIS_SCAL2RIS_MXN_H // scal2ris_mxn BLIS_INLINE void bli_cscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { float* restrict alpha_r = ( float* )alpha; \ float* restrict alpha_i = ( float* )alpha + 1; \ float* restrict x_r = ( float* )x; \ float* restrict x_i = ( float* )x + 1; \ float* restrict y_r = ( float* )y; \ float* restrict y_i = ( float* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j 
)*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { float* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; float* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; float* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; float* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_cscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } BLIS_INLINE void bli_zscal2ris_mxn ( const conj_t conjx, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t is_y ) { double* restrict alpha_r = ( double* )alpha; \ double* restrict alpha_i = ( double* )alpha + 1; \ double* restrict x_r = ( double* )x; \ double* restrict x_i = ( double* )x + 1; \ double* restrict y_r = ( double* )y; \ double* restrict y_i = ( double* )y + is_y; \ const dim_t incx2 = 2*rs_x; \ const dim_t ldx2 = 2*cs_x; \ \ if ( bli_is_conj( conjx ) ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2jris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } else { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) { double* restrict chi11_r = x_r + (i )*incx2 + (j )*ldx2; double* restrict chi11_i = x_i + (i )*incx2 + (j )*ldx2; double* restrict psi11_r = y_r + (i )*1 + (j )*cs_y; double* restrict psi11_i = y_i + (i )*1 + (j )*cs_y; bli_zscal2ris ( *alpha_r, *alpha_i, *chi11_r, *chi11_i, *psi11_r, *psi11_i ); } } } #endif // end bli_scal2ris_mxn.h // begin bli_scalris_mxn_uplo.h #ifndef 
BLIS_SCALRIS_MXN_UPLO_H #define BLIS_SCALRIS_MXN_UPLO_H // scalris_mxn_u #define bli_cscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_u( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } // scalris_mxn_l #define bli_cscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #define bli_zscalris_mxn_l( diagoff, m, n, ar, ai, xr, xi, rs_x, cs_x ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zscalris( *(ar), \ *(ai), \ *((xr) + _i*rs_x + _j*cs_x), \ *((xi) + _i*rs_x + _j*cs_x) ); \ } \ } \ } #endif // end bli_scalris_mxn_uplo.h // -- Conventional scalar macros (paired real/imaginary values) -- // begin bli_absq2s.h #ifndef BLIS_ABSQR2_H #define BLIS_ABSQR2_H // absq2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. 
#define bli_ssabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabsq2s( x, a ) { float ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabsq2s( x, a ) { float ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabsq2s( x, a ) { double ti; bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabsq2s( x, a ) { double ti; bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabsq2s( x, a ) bli_sabsq2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabsq2s( x, a ) bli_dabsq2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabsq2s( x, a ) bli_cabsq2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabsq2s( x, a ) bli_zabsq2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scabsq2s( x, a ) bli_scsets( (x) * (x), 0.0, (a) ) #define bli_dcabsq2s( x, a ) bli_dcsets( (x) * (x), 0.0, (a) ) #define bli_ccabsq2s( x, a ) bli_ccsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zcabsq2s( x, a ) 
bli_zcsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #define bli_szabsq2s( x, a ) bli_szsets( (x) * (x), 0.0, (a) ) #define bli_dzabsq2s( x, a ) bli_dzsets( (x) * (x), 0.0, (a) ) #define bli_czabsq2s( x, a ) bli_czsets( bli_creal(x) * bli_creal(x) + \ bli_cimag(x) * bli_cimag(x), 0.0, (a) ) #define bli_zzabsq2s( x, a ) bli_zzsets( bli_zreal(x) * bli_zreal(x) + \ bli_zimag(x) * bli_zimag(x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabsq2s( x, a ) bli_ssabsq2s( x, a ) #define bli_dabsq2s( x, a ) bli_ddabsq2s( x, a ) #define bli_cabsq2s( x, a ) bli_ccabsq2s( x, a ) #define bli_zabsq2s( x, a ) bli_zzabsq2s( x, a ) #endif // end bli_absq2s.h // begin bli_abval2s.h #ifndef BLIS_ABVAL2S_H #define BLIS_ABVAL2S_H // abval2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of a. #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), 0.0F ) #define bli_dsabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), 0.0F ) #define bli_csabval2s( x, a ) { float ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_zsabval2s( x, a ) { float ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), ti ); ( void )ti; } #define bli_sdabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), 0.0 ) #define bli_ddabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), 0.0 ) #define bli_cdabval2s( x, a ) { double ti; bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_zdabval2s( x, a ) { double ti; bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), ti ); ( void )ti; } #define bli_scabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) ) #define bli_dcabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) ) #define bli_ccabval2s( x, a ) 
bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) ) #define bli_zcabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) ) #define bli_szabval2s( x, a ) bli_sabval2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) ) #define bli_dzabval2s( x, a ) bli_dabval2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_czabval2s( x, a ) bli_cabval2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) ) #define bli_zzabval2s( x, a ) bli_zabval2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ssabval2s( x, a ) bli_sssets( fabsf(x), 0.0, (a) ) #define bli_dsabval2s( x, a ) bli_dssets( fabs (x), 0.0, (a) ) #define bli_csabval2s( x, a ) bli_cssets( cabsf(x), 0.0, (a) ) #define bli_zsabval2s( x, a ) bli_zssets( cabs (x), 0.0, (a) ) #define bli_sdabval2s( x, a ) bli_sdsets( fabsf(x), 0.0, (a) ) #define bli_ddabval2s( x, a ) bli_ddsets( fabs (x), 0.0, (a) ) #define bli_cdabval2s( x, a ) bli_cdsets( cabsf(x), 0.0, (a) ) #define bli_zdabval2s( x, a ) bli_zdsets( cabs (x), 0.0, (a) ) #define bli_scabval2s( x, a ) bli_scsets( fabsf(x), 0.0, (a) ) #define bli_dcabval2s( x, a ) bli_dcsets( fabs (x), 0.0, (a) ) #define bli_ccabval2s( x, a ) bli_ccsets( cabsf(x), 0.0, (a) ) #define bli_zcabval2s( x, a ) bli_zcsets( cabs (x), 0.0, (a) ) #define bli_szabval2s( x, a ) bli_szsets( fabsf(x), 0.0, (a) ) #define bli_dzabval2s( x, a ) bli_dzsets( fabs (x), 0.0, (a) ) #define bli_czabval2s( x, a ) bli_czsets( cabsf(x), 0.0, (a) ) #define bli_zzabval2s( x, a ) bli_zzsets( cabs (x), 0.0, (a) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sabval2s( x, a ) bli_ssabval2s( x, a ) #define bli_dabval2s( x, a ) bli_ddabval2s( x, a ) #define bli_cabval2s( x, a ) bli_ccabval2s( x, a ) #define bli_zabval2s( x, a ) bli_zzabval2s( x, a ) #endif // end bli_abval2s.h // begin bli_adds.h #ifndef BLIS_ADDS_H #define BLIS_ADDS_H // adds // Notes: // - The first char 
encodes the type of a. // - The second char encodes the type of y. #define bli_ssadds( a, y ) bli_saddris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsadds( a, y ) bli_saddris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csadds( a, y ) bli_saddris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsadds( a, y ) bli_saddris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdadds( a, y ) bli_daddris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddadds( a, y ) bli_daddris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdadds( a, y ) bli_daddris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdadds( a, y ) bli_daddris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) bli_caddris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcadds( a, y ) bli_caddris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccadds( a, y ) bli_caddris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcadds( a, y ) bli_caddris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szadds( a, y ) bli_zaddris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzadds( a, y ) bli_zaddris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czadds( a, y ) bli_zaddris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzadds( a, y ) bli_zaddris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scadds( a, y ) { (y) += (a); } #define bli_dcadds( a, y ) { (y) += (a); } #define bli_ccadds( a, y ) { (y) += (a); } #define bli_zcadds( a, y ) { (y) += (a); } #define bli_szadds( a, y ) { (y) += (a); } #define bli_dzadds( a, y ) { (y) += (a); } #define bli_czadds( a, y ) { (y) += (a); } #define 
bli_zzadds( a, y ) { (y) += (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sadds( a, y ) bli_ssadds( a, y ) #define bli_dadds( a, y ) bli_ddadds( a, y ) #define bli_cadds( a, y ) bli_ccadds( a, y ) #define bli_zadds( a, y ) bli_zzadds( a, y ) #endif // end bli_adds.h // begin bli_addjs.h #ifndef BLIS_ADDJS_H #define BLIS_ADDJS_H // addjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssaddjs( a, y ) bli_saddjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsaddjs( a, y ) bli_saddjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csaddjs( a, y ) bli_saddjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsaddjs( a, y ) bli_saddjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdaddjs( a, y ) bli_daddjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddaddjs( a, y ) bli_daddjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdaddjs( a, y ) bli_daddjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdaddjs( a, y ) bli_daddjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) bli_caddjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcaddjs( a, y ) bli_caddjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccaddjs( a, y ) bli_caddjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcaddjs( a, y ) bli_caddjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szaddjs( a, y ) bli_zaddjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzaddjs( a, y ) bli_zaddjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czaddjs( a, y ) bli_zaddjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzaddjs( a, y ) 
bli_zaddjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scaddjs( a, y ) { (y) += (a); } #define bli_dcaddjs( a, y ) { (y) += (a); } #define bli_ccaddjs( a, y ) { (y) += conjf(a); } #define bli_zcaddjs( a, y ) { (y) += conj (a); } #define bli_szaddjs( a, y ) { (y) += (a); } #define bli_dzaddjs( a, y ) { (y) += (a); } #define bli_czaddjs( a, y ) { (y) += conjf(a); } #define bli_zzaddjs( a, y ) { (y) += conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saddjs( a, y ) bli_ssaddjs( a, y ) #define bli_daddjs( a, y ) bli_ddaddjs( a, y ) #define bli_caddjs( a, y ) bli_ccaddjs( a, y ) #define bli_zaddjs( a, y ) bli_zzaddjs( a, y ) #endif // end bli_addjs.h // begin bli_add3s.h #ifndef BLIS_ADD3S_H #define BLIS_ADD3S_H // add3s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of b. // - The third char encodes the type of c. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_dssadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_cssadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_zssadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_sreal(c), bli_simag(c) ) #define bli_sdsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ddsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_cdsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zdsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), 
bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_sreal(c), bli_simag(c) ) #define bli_scsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dcsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_ccsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zcsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_sreal(c), bli_simag(c) ) #define bli_szsadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_dzsadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_czsadd3s( a, b, c ) bli_sadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) #define bli_zzsadd3s( a, b, c ) bli_sadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_sreal(c), bli_simag(c) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dsdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_csdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_zsdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_dreal(c), bli_dimag(c) ) #define bli_sddadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define bli_dddadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) ) #define 
bli_cddadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zddadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_scdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_dcdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_ccdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zcdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_szdadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_dzdadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_czdadd3s( a, b, c ) bli_dadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )
#define bli_zzdadd3s( a, b, c ) bli_dadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_dreal(c), bli_dimag(c) )

// Complex-destination variants. Two implementations are provided and selected
// at preprocessing time:
// - BLIS_ENABLE_C99_COMPLEX not defined: every operand is split into its
//   real/imaginary parts with the bli_?real()/bli_?imag() accessors and
//   forwarded to a bli_?add3ris() helper (helpers are defined elsewhere).
// - BLIS_ENABLE_C99_COMPLEX defined: the sum is assigned directly with the
//   built-in '+' operator.
// In every macro name the three type characters encode, in order, the types
// of a, b and c.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (abc) = (??c) ------------------------------------------------------------
// When both a and b are real (s or d) the real-domain helper bli_sadd3ris is
// used; as soon as either input is complex (c or z) bli_cadd3ris is used.
// NOTE(review): presumably the real-domain helper leaves the imaginary part of
// c untouched -- confirm against the bli_sadd3ris definition.

#define bli_sscadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_dscadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_cscadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_zscadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_creal(c), bli_cimag(c) )
#define bli_sdcadd3s( a, b, c ) bli_sadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_ddcadd3s( a, b, c ) bli_sadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_cdcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zdcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_creal(c), bli_cimag(c) )
#define bli_sccadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_dccadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_cccadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zccadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_creal(c), bli_cimag(c) )
#define bli_szcadd3s( a, b, c ) bli_cadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_dzcadd3s( a, b, c ) bli_cadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_czcadd3s( a, b, c ) bli_cadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )
#define bli_zzcadd3s( a, b, c ) bli_cadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_creal(c), bli_cimag(c) )

// -- (abc) = (??z) ------------------------------------------------------------
// Same scheme as the (??c) group with the double-precision helpers:
// bli_dadd3ris when a and b are both real, bli_zadd3ris otherwise.

#define bli_sszadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dszadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cszadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zszadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_sreal(b), bli_simag(b), bli_zreal(c), bli_zimag(c) )
#define bli_sdzadd3s( a, b, c ) bli_dadd3ris( bli_sreal(a), bli_simag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_ddzadd3s( a, b, c ) bli_dadd3ris( bli_dreal(a), bli_dimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cdzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zdzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_dreal(b), bli_dimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_sczadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dczadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_cczadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zczadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_creal(b), bli_cimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_szzadd3s( a, b, c ) bli_zadd3ris( bli_sreal(a), bli_simag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_dzzadd3s( a, b, c ) bli_zadd3ris( bli_dreal(a), bli_dimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_czzadd3s( a, b, c ) bli_zadd3ris( bli_creal(a), bli_cimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )
#define bli_zzzadd3s( a, b, c ) bli_zadd3ris( bli_zreal(a), bli_zimag(a), bli_zreal(b), bli_zimag(b), bli_zreal(c), bli_zimag(c) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex enabled, all variants expand to the same statement: the
// sum (a) + (b) is assigned to (c), with each argument parenthesized.
// NOTE(review): the expansion is a brace-enclosed block, so an invocation
// followed by ';' in an unbraced if/else body leaves an extra empty statement
// before the 'else'. Switching to do { } while (0) would break any call site
// written without a trailing semicolon, so the form is left as is.

// -- (abc) = (??c) ------------------------------------------------------------

#define bli_sscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zscadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_sdcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_ddcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cdcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zdcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_sccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zccadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_szcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dzcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_czcadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zzcadd3s( a, b, c ) { (c) = (a) + (b); }

// -- (abc) = (??z) ------------------------------------------------------------

#define bli_sszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zszadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_sdzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_ddzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cdzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zdzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_sczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_cczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zczadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_szzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_dzzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_czzadd3s( a, b, c ) { (c) = (a) + (b); }
#define bli_zzzadd3s( a, b, c ) { (c) = (a) + (b); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: a single type character means a, b and c all share
// that type, so each alias forwards to the matching three-character variant.
#define bli_sadd3s( a, b, c ) bli_sssadd3s( a, b, c )
#define bli_dadd3s( a, b, c ) bli_dddadd3s( a, b, c )
#define bli_cadd3s( a, b, c ) bli_cccadd3s( a, b, c )
#define bli_zadd3s( a, b, c ) bli_zzzadd3s( a, b, c )

#endif
// end bli_add3s.h
// begin bli_axpbys.h

#ifndef BLIS_AXPBYS_H
#define BLIS_AXPBYS_H

// axpbys

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of x.
// - The third char encodes the type of b.
// - The fourth char encodes the type of y.
//
// Every macro below has the same shape: each of the four operands is split
// into its real and imaginary parts with the bli_?real()/bli_?imag() accessor
// that matches its type character, and the eight resulting values are
// forwarded to one helper:
// - bli_rxaxpbyris when the destination y is real (s or d),
// - bli_cxaxpbyris when the destination y is complex (c or z).
// Both helpers are defined elsewhere.
// NOTE(review): going by the name, the operation is presumably
// y := a * x + b * y -- confirm against the helper definitions.
// Within each destination group the table is enumerated with the type of a
// varying fastest, then x, then b.

// -- (axby) = (???s) ----------------------------------------------------------

#define bli_ssssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_csssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zsssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_scssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_szssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_czssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzssaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )

#define bli_ssdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_csdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zsdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zddsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_scdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzdsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )

#define bli_sscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zscsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zccsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzcsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )

#define bli_sszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zszsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_sczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zczsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_szzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzzsaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (axby) = (???d) ----------------------------------------------------------

#define bli_sssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zssdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ddsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zdsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_scsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_szsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzsdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )

#define bli_ssddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ddddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zdddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_scddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_szddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzddaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_sscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zscdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ddcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zdcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zccdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_szcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzcdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_sszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zszdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ddzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zdzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_sczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zczdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_szzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzzdaxpbys( a, x, b, y ) bli_rxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )

// The complex-destination variants that follow are only defined when
// BLIS_ENABLE_C99_COMPLEX is not defined; they forward to bli_cxaxpbyris for
// both single (c) and double (z) precision destinations.
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axby) = (???c) ----------------------------------------------------------

#define bli_ssscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_csscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zsscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_sdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_scscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_ccscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zcscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_szscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_czscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzscaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )

#define bli_ssdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_csdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zsdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_sddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zddcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_scdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ccdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zcdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_szdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzdcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )

#define bli_ssccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_csccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zsccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_sdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_scccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ccccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zcccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_szccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzccaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )

#define bli_sszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zszcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_sdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_sczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zczcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_szzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzzcaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )

// -- (axby) = (???z) ----------------------------------------------------------

#define bli_ssszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_csszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zsszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_sdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_scszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define
// NOTE(review): re-wrapped macro table; the "#define" keyword of the first entry
// sits at the end of the preceding line and the last entry is cut off.
// Remaining "(???z)" bli_<a><x><b><y>axpbys( a, x, b, y ) entries: y is always
// read through bli_zreal()/bli_zimag(), while a, x and b use the accessor pair
// named by the first, second and third type letter. Every entry forwards the
// eight values to bli_cxaxpbyris() (defined outside this view).
// After "#else // ifdef BLIS_ENABLE_C99_COMPLEX" the same macro names are
// defined again, this time with the common body { (y) = (a) * (x) + (b) * (y); }.
// NOTE(review): the conditional this #else pairs with opens outside this view;
// presumably the accessor-based half applies when BLIS_ENABLE_C99_COMPLEX is
// not defined -- confirm against the opening directive.
bli_dcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_dczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbys( a, x, b, y ) bli_cxaxpbyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccscaxpbys( 
// NOTE(review): re-wrapped macro table; this text starts inside the parameter
// list of one definition and ends with the "#define" of the next.
// "(???c)" and first "(???z)" bli_<a><x><b><y>axpbys entries of the branch that
// follows "#else // ifdef BLIS_ENABLE_C99_COMPLEX". Every entry has the same
// body, { (y) = (a) * (x) + (b) * (y); }, so the type letters only distinguish
// the macro names here; no bli_?real/bli_?imag accessors are involved.
// The expansion is a brace-enclosed statement that assigns to y, and y appears
// twice in it, so y must be an lvalue and should not carry side effects.
a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzscaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzdcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zdccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzccaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zczcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzcaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
// NOTE(review): re-wrapped macro table; the "#define" keyword of the first entry
// sits at the end of the preceding line and the last entry is cut off.
// 1. Remaining "(???z)" bli_<a><x><b><y>axpbys entries of the branch closed by
//    "#endif // BLIS_ENABLE_C99_COMPLEX"; each has the body
//    { (y) = (a) * (x) + (b) * (y); }.
// 2. bli_saxpbys, bli_daxpbys, bli_caxpbys and bli_zaxpbys are shorthands for
//    the same-type entries bli_ssssaxpbys, bli_ddddaxpbys, bli_ccccaxpbys and
//    bli_zzzzaxpbys (the s and d ones are defined outside this view).
// 3. bli_axpbys.h ends and bli_axpbyjs.h begins, guarded by BLIS_AXPBYJS_H.
//    Its bli_<a><x><b><y>axpbyjs( a, x, b, y ) entries pass the
//    bli_?real()/bli_?imag() values of a, x, b and y, chosen by the four type
//    letters in that order, to bli_rxaxpbyjris() (defined outside this view).
bli_csszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzszaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czdzaxpbys( a, x, b, y ) { (y) = 
(a) * (x) + (b) * (y); } #define bli_zzdzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ssczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ccczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zcczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzczaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_zczzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_szzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_czzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zzzzaxpbys( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbys( a, x, b, y ) bli_ssssaxpbys( a, x, b, y ) #define bli_daxpbys( a, x, b, y ) bli_ddddaxpbys( a, x, b, y ) #define bli_caxpbys( a, x, b, y ) bli_ccccaxpbys( a, x, b, y ) #define bli_zaxpbys( a, x, b, y ) bli_zzzzaxpbys( a, x, b, y ) #endif // end bli_axpbys.h // begin bli_axpbyjs.h #ifndef BLIS_AXPBYJS_H #define BLIS_AXPBYJS_H // axpbyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of b. // - The fourth char encodes the type of y. // -- (axby) = (???s) ---------------------------------------------------------- #define bli_ssssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_csssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), 
// NOTE(review): re-wrapped macro table; this text starts and ends part-way
// through a definition.
// bli_<a><x><b><y>axpbyjs( a, x, b, y ): the four type letters (s, d, c, z)
// select the bli_?real()/bli_?imag() accessor pair applied to a, x, b and y, in
// that order; the eight results are forwarded unchanged to bli_rxaxpbyjris(),
// which is defined outside this view.
// Entries up to bli_zzzsaxpbyjs end in 's' (y is read via bli_sreal/bli_simag);
// the entry after the "(???d)" banner reads y via bli_dreal/bli_dimag.
bli_simag(y) ) #define bli_cdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_scssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_szssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_czssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzssaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_ssdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_csdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zsdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zddsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzdsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zscsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zccsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzcsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zszsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddzsaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_sczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zczsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzzsaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (axby) = (???d) ---------------------------------------------------------- #define bli_sssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zssdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzsdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ssddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scddaxpbyjs( a, x, b, y ) 
bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzddaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zscdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define 
bli_sdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zccdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzcdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), 
bli_dimag(y) ) #define bli_sszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zszdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ddzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zdzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zczdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), 
bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzzdaxpbyjs( a, x, b, y ) bli_rxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_csscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), 
bli_cimag(y) ) #define bli_zdscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_scscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_szscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_czscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzscaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), 
bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zddcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), 
bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzdcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ssccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_csccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zsccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_scccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ccccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zcccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzccaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zszcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdzcaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zczcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzzcaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), 
bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), 
bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzszaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zddzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcdzaxpbyjs( a, x, b, y ) 
bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzdzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ssczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_csczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zsczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_scczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ccczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zcczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzczaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), 
bli_zimag(y) ) #define bli_dszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), 
bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzzaxpbyjs( a, x, b, y ) bli_cxaxpbyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axby) = (???c) ---------------------------------------------------------- #define bli_ssscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdscaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcscaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzscaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define 
bli_csdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdccaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcccaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzccaxpbyjs( a, x, 
b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzccaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzcaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczcaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzcaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } // -- (axby) = (???z) ---------------------------------------------------------- #define bli_ssszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdszaxpbyjs( a, x, b, y ) { (y) = (a) * (x) 
+ (b) * (y); } #define bli_scszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcszaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzszaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zsdzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zddzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcdzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzdzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_ssczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_csczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * 
(y); } #define bli_zsczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdczaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_scczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_ccczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zcczaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_czczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzczaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_sszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_dszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zszzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_ddzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_cdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_zdzzaxpbyjs( a, x, b, y ) { (y) = (a) * (x) + (b) * (y); } #define bli_sczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_dczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_cczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_zczzaxpbyjs( a, x, b, y ) { (y) = (a) * conjf(x) + (b) * (y); } #define bli_szzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_dzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define 
bli_czzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #define bli_zzzzaxpbyjs( a, x, b, y ) { (y) = (a) * conj(x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpbyjs( a, x, b, y ) bli_ssssaxpbyjs( a, x, b, y ) #define bli_daxpbyjs( a, x, b, y ) bli_ddddaxpbyjs( a, x, b, y ) #define bli_caxpbyjs( a, x, b, y ) bli_ccccaxpbyjs( a, x, b, y ) #define bli_zaxpbyjs( a, x, b, y ) bli_zzzzaxpbyjs( a, x, b, y ) #endif // end bli_axpbyjs.h // begin bli_axpys.h #ifndef BLIS_AXPYS_H #define BLIS_AXPYS_H // axpys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), 
bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpys( a, x, y ) bli_saxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpys( a, x, y ) bli_saxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define 
bli_scdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpys( a, x, y ) bli_daxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpys( a, x, y ) bli_daxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpys( a, x, y ) bli_saxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpys( a, x, y ) bli_saxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), 
bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpys( a, x, y ) bli_scaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpys( a, x, y ) bli_scaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpys( a, x, y ) bli_caxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpys( a, x, y ) bli_caxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpys( a, x, y ) bli_daxpyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define 
bli_ddzaxpys( a, x, y ) bli_daxpyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpys( a, x, y ) bli_dzaxpyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpys( a, x, y ) bli_dzaxpyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpys( a, x, y ) bli_zaxpyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpys( a, x, y ) bli_zaxpyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpys( a, x, y ) { (y) += (a) * (x); } #define 
bli_sccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zccaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czcaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzcaxpys( a, x, y ) { (y) += (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_cczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zczaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_szzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_dzzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_czzaxpys( a, x, y ) { (y) += (a) * (x); } #define bli_zzzaxpys( a, x, y ) { (y) += (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpys( a, x, y ) bli_sssaxpys( a, x, y ) #define bli_daxpys( a, x, y ) bli_dddaxpys( a, x, y ) #define bli_caxpys( a, x, y ) bli_cccaxpys( a, x, y ) #define bli_zaxpys( a, x, y ) bli_zzzaxpys( a, x, y ) #endif // end bli_axpys.h // begin bli_axpyjs.h #ifndef BLIS_AXPYJS_H #define BLIS_AXPYJS_H // axpyjs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// axpyjs: mixed-datatype scalar macros of the form bli_<a><x><y>axpyjs( a, x, y ),
// where each of the three type characters is one of s, d, c, z.
//
// What the definitions below show:
// - y of type s or d: every (a,x) combination forwards to bli_saxpyjris /
//   bli_daxpyjris respectively, passing the real and imaginary parts of a, x
//   and y as six separate arguments (extracted with bli_?real / bli_?imag).
// - y of type c or z, BLIS_ENABLE_C99_COMPLEX not defined: the helper is
//   chosen from the types of a and x:
//     a real,    x real    -> bli_saxpyjris  (y is c) / bli_daxpyjris  (y is z)
//     a real,    x complex -> bli_scaxpyjris (y is c) / bli_dzaxpyjris (y is z)
//     a complex, x any     -> bli_caxpyjris  (y is c) / bli_zaxpyjris  (y is z)
// - y of type c or z, BLIS_ENABLE_C99_COMPLEX defined: the macro expands
//   in place to the braced statement { (y) += (a) * X; } where X is (x) for
//   real x, conjf(x) for x of type c, and conj(x) for x of type z.
// - bli_saxpyjs / bli_daxpyjs / bli_caxpyjs / bli_zaxpyjs are shorthands for
//   the same-type variants sss / ddd / ccc / zzz.
//
// NOTE(review): the bli_?axpyjris helpers and the bli_?real / bli_?imag
// accessors are not defined in this part of the file; presumably they perform
// the same y += a * conj(x) update on split real/imaginary parts -- confirm
// against their definitions.
// NOTE(review): conjf / conj are presumably the <complex.h> functions; the
// include is not visible here -- confirm.
// NOTE(review): the C99 variants expand to a braced block, so a call written
// as `bli_cccaxpyjs( a, x, y );` leaves an extra empty statement after the
// block; take care when using them as the body of an unbraced if/else.
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxpyjs( a, x, y ) bli_saxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxpyjs( a, x, y ) bli_saxpyjris( 
bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxpyjs( a, x, y ) bli_saxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), 
bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxpyjs( a, x, y ) bli_daxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxpyjs( a, x, y ) bli_daxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxpyjs( a, x, y ) bli_saxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxpyjs( a, x, y ) bli_saxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), 
bli_cimag(y) ) #define bli_zccaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxpyjs( a, x, y ) bli_scaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxpyjs( a, x, y ) bli_scaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxpyjs( a, x, y ) bli_caxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxpyjs( a, x, y ) bli_caxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxpyjs( a, x, y ) bli_daxpyjris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxpyjs( a, x, y ) bli_daxpyjris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxpyjs( a, x, y ) 
bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxpyjs( a, x, y ) bli_dzaxpyjris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxpyjs( a, x, y ) bli_zaxpyjris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxpyjs( a, x, y ) bli_zaxpyjris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zscaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdcaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zccaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzcaxpyjs( a, x, y ) { (y) += (a) * conj(x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_dszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zszaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_ddzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_cdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_zdzaxpyjs( a, x, y ) { (y) += (a) * (x); } #define bli_sczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_dczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_cczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_zczaxpyjs( a, x, y ) { (y) += (a) * conjf(x); } #define bli_szzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_dzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_czzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #define bli_zzzaxpyjs( a, x, y ) { (y) += (a) * conj(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxpyjs( a, x, y ) bli_sssaxpyjs( a, x, y ) #define bli_daxpyjs( a, x, y ) bli_dddaxpyjs( a, x, y ) #define bli_caxpyjs( a, x, y ) bli_cccaxpyjs( a, x, y ) #define bli_zaxpyjs( a, x, y ) bli_zzzaxpyjs( a, x, y ) #endif // end bli_axpyjs.h // begin bli_axmys.h #ifndef BLIS_AXMYS_H #define BLIS_AXMYS_H // axmys // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsaxmys( a, x, y ) bli_saxmyris( bli_creal(a), bli_cimag(a), 
bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsaxmys( a, x, y ) bli_saxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdaxmys( a, x, 
y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdaxmys( a, x, y ) bli_daxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdaxmys( a, x, y ) bli_daxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcaxmys( a, x, y ) bli_saxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcaxmys( a, x, y ) bli_saxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), 
bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcaxmys( a, x, y ) bli_scaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcaxmys( a, x, y ) bli_scaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcaxmys( a, x, y ) bli_caxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcaxmys( a, x, y ) bli_caxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzaxmys( a, x, y ) bli_daxmyris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzaxmys( a, x, y ) bli_daxmyris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dczaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczaxmys( 
a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzaxmys( a, x, y ) bli_dzaxmyris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzaxmys( a, x, y ) bli_dzaxmyris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzaxmys( a, x, y ) bli_zaxmyris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzaxmys( a, x, y ) bli_zaxmyris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zscaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zccaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czcaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzcaxmys( a, x, y ) { (y) -= (a) * (x); } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cszaxmys( a, x, y ) { (y) -= (a) * (x); } 
#define bli_zszaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_ddzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zdzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_sczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_cczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zczaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_szzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_dzzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_czzaxmys( a, x, y ) { (y) -= (a) * (x); } #define bli_zzzaxmys( a, x, y ) { (y) -= (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_saxmys( a, x, y ) bli_sssaxmys( a, x, y ) #define bli_daxmys( a, x, y ) bli_dddaxmys( a, x, y ) #define bli_caxmys( a, x, y ) bli_cccaxmys( a, x, y ) #define bli_zaxmys( a, x, y ) bli_zzzaxmys( a, x, y ) #endif // end bli_axmys.h // begin bli_conjs.h #ifndef BLIS_CONJS_H #define BLIS_CONJS_H // conjs #define bli_sconjs( x ) bli_sconjris( bli_sreal(x), bli_simag(x) ) #define bli_dconjs( x ) bli_dconjris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) bli_cconjris( bli_creal(x), bli_cimag(x) ) #define bli_zconjs( x ) bli_zconjris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cconjs( x ) { (x) = conjf(x); } #define bli_zconjs( x ) { (x) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_conjs.h // begin bli_copys.h #ifndef BLIS_COPYS_H #define BLIS_COPYS_H // copys // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// Mixed-type copy of x into y, grouped by the type of the destination y.
// Every entry hands the real/imaginary components of x and y to the
// bli_?copyris helper named after the destination type.

// Destination y is real single precision.
#define bli_sscopys( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopys( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopys( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopys( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// Destination y is real double precision.
#define bli_sdcopys( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopys( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopys( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopys( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Destination y is single-precision complex.
// NOTE: Use of ccopyris() means the imaginary part of y will be overwritten with zero.
#define bli_sccopys( x, y ) bli_ccopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopys( x, y ) bli_ccopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopys( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopys( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of zcopyris() means the imaginary part of y will be overwritten with zero.
#define bli_szcopys( x, y ) bli_zcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopys( x, y ) bli_zcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopys( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopys( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopys( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopys( x, y ) bli_sscopys( x, y ) #define bli_dcopys( x, y ) bli_ddcopys( x, y ) #define bli_ccopys( x, y ) bli_cccopys( x, y ) #define bli_zcopys( x, y ) bli_zzcopys( x, y ) #define bli_icopys( x, y ) bli_iicopys( x, y ) #endif // end bli_copys.h // begin bli_copyjs.h #ifndef BLIS_COPYJS_H #define BLIS_COPYJS_H // copyjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) bli_ccopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjs( x, y ) bli_ccopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), 
bli_creal(y), bli_cimag(y) ) #define bli_zccopyjs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopyjs( x, y ) bli_zcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjs( x, y ) bli_zcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopyjs( x, y ) { (y) = (x); } #define bli_dccopyjs( x, y ) { (y) = (x); } #define bli_cccopyjs( x, y ) { (y) = conjf(x); } #define bli_zccopyjs( x, y ) { (y) = conj (x); } #define bli_szcopyjs( x, y ) { (y) = (x); } #define bli_dzcopyjs( x, y ) { (y) = (x); } #define bli_czcopyjs( x, y ) { (y) = conjf(x); } #define bli_zzcopyjs( x, y ) { (y) = conj (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopyjs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjs( x, y ) bli_sscopyjs( x, y ) #define bli_dcopyjs( x, y ) bli_ddcopyjs( x, y ) #define bli_ccopyjs( x, y ) bli_cccopyjs( x, y ) #define bli_zcopyjs( x, y ) bli_zzcopyjs( x, y ) #define bli_icopyjs( x, y ) bli_iicopyjs( x, y ) #endif // end bli_copyjs.h // begin bli_copycjs.h #ifndef BLIS_COPYCJS_H #define BLIS_COPYCJS_H // copycjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopycjs( conjx, x, y ) bli_scopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopycjs( conjx, x, y ) bli_dcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopycjs( conjx, x, y ) bli_ccopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopycjs( conjx, x, y ) bli_zcopycjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_sccopycjs( conjx, x, y ) { (y) = (x); } 
#define bli_dccopycjs( conjx, x, y ) { (y) = (x); } #define bli_cccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zccopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #define bli_szcopycjs( conjx, x, y ) { (y) = (x); } #define bli_dzcopycjs( conjx, x, y ) { (y) = (x); } #define bli_czcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zzcopycjs( conjx, x, y ) { (y) = ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_iicopycjs( conjx, x, y ) { (y) = ( gint_t ) (x); } #define bli_scopycjs( conjx, x, y ) bli_sscopycjs( conjx, x, y ) #define bli_dcopycjs( conjx, x, y ) bli_ddcopycjs( conjx, x, y ) #define bli_ccopycjs( conjx, x, y ) bli_cccopycjs( conjx, x, y ) #define bli_zcopycjs( conjx, x, y ) bli_zzcopycjs( conjx, x, y ) #define bli_icopycjs( conjx, x, y ) bli_iicopycjs( conjx, x, y ) #endif // end bli_copycjs.h // begin bli_copynzs.h #ifndef BLIS_COPYNZS_H #define BLIS_COPYNZS_H // copynzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// copynzs entries, grouped by the type of the destination y. These reuse the
// bli_?copyris helpers; for a complex destination with a real source the
// real-domain helper is chosen deliberately (see the NOTE lines below).

// Destination y is real single precision.
#define bli_sscopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dscopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cscopynzs( x, y ) bli_scopyris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zscopynzs( x, y ) bli_scopyris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// Destination y is real double precision.
#define bli_sdcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ddcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cdcopynzs( x, y ) bli_dcopyris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zdcopynzs( x, y ) bli_dcopyris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// Destination y is single-precision complex.
// NOTE: Use of scopyris() is so we don't touch the imaginary part of y.
#define bli_sccopynzs( x, y ) bli_scopyris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccopynzs( x, y ) bli_scopyris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccopynzs( x, y ) bli_ccopyris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccopynzs( x, y ) bli_ccopyris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// NOTE: Use of dcopyris() is so we don't touch the imaginary part of y.
#define bli_szcopynzs( x, y ) bli_dcopyris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopynzs( x, y ) bli_dcopyris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopynzs( x, y ) bli_zcopyris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopynzs( x, y ) bli_zcopyris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopynzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopynzs( x, y ) bli_sscopynzs( x, y ) #define bli_dcopynzs( x, y ) bli_ddcopynzs( x, y ) #define bli_ccopynzs( x, y ) bli_cccopynzs( x, y ) #define bli_zcopynzs( x, y ) bli_zzcopynzs( x, y ) #define bli_icopynzs( x, y ) bli_iicopynzs( x, y ) #endif // end bli_copynzs.h // begin bli_copyjnzs.h #ifndef BLIS_COPYJNZS_H #define BLIS_COPYJNZS_H // copyjnzs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. #define bli_sscopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dscopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cscopyjnzs( x, y ) bli_scopyjris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zscopyjnzs( x, y ) bli_scopyjris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdcopyjnzs( x, y ) bli_dcopyjris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdcopyjnzs( x, y ) bli_dcopyjris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) // NOTE: Use of scopyjris() (implemented in terms of scopyris()), is so we // don't touch the imaginary part of y. 
#define bli_sccopyjnzs( x, y ) bli_scopyjris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccopyjnzs( x, y ) bli_scopyjris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccopyjnzs( x, y ) bli_ccopyjris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccopyjnzs( x, y ) bli_ccopyjris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // NOTE: Use of dcopyjris() (implemented in terms of dcopyris()), is so we // don't touch the imaginary part of y. #define bli_szcopyjnzs( x, y ) bli_dcopyjris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzcopyjnzs( x, y ) bli_dcopyjris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czcopyjnzs( x, y ) bli_zcopyjris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzcopyjnzs( x, y ) bli_zcopyjris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_iicopyjnzs( x, y ) { (y) = ( gint_t ) (x); } #define bli_scopyjnzs( x, y ) bli_sscopyjnzs( x, y ) #define bli_dcopyjnzs( x, y ) bli_ddcopyjnzs( x, y ) #define bli_ccopyjnzs( x, y ) bli_cccopyjnzs( x, y ) #define bli_zcopyjnzs( x, y ) bli_zzcopyjnzs( x, y ) #define bli_icopyjnzs( x, y ) bli_iicopyjnzs( x, y ) #endif // end bli_copyjnzs.h // begin bli_dots.h #ifndef BLIS_DOTS_H #define BLIS_DOTS_H // dots // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
#define bli_sssdots( x, y, a ) bli_sssaxpys( x, y, a ) #define bli_dssdots( x, y, a ) bli_dssaxpys( x, y, a ) #define bli_cssdots( x, y, a ) bli_cssaxpys( x, y, a ) #define bli_zssdots( x, y, a ) bli_zssaxpys( x, y, a ) #define bli_sdsdots( x, y, a ) bli_sdsaxpys( x, y, a ) #define bli_ddsdots( x, y, a ) bli_ddsaxpys( x, y, a ) #define bli_cdsdots( x, y, a ) bli_cdsaxpys( x, y, a ) #define bli_zdsdots( x, y, a ) bli_zdsaxpys( x, y, a ) #define bli_scsdots( x, y, a ) bli_scsaxpys( x, y, a ) #define bli_dcsdots( x, y, a ) bli_dcsaxpys( x, y, a ) #define bli_ccsdots( x, y, a ) bli_ccsaxpys( x, y, a ) #define bli_zcsdots( x, y, a ) bli_zcsaxpys( x, y, a ) #define bli_szsdots( x, y, a ) bli_szsaxpys( x, y, a ) #define bli_dzsdots( x, y, a ) bli_dzsaxpys( x, y, a ) #define bli_czsdots( x, y, a ) bli_czsaxpys( x, y, a ) #define bli_zzsdots( x, y, a ) bli_zzsaxpys( x, y, a ) #define bli_ssddots( x, y, a ) bli_ssdaxpys( x, y, a ) #define bli_dsddots( x, y, a ) bli_dsdaxpys( x, y, a ) #define bli_csddots( x, y, a ) bli_csdaxpys( x, y, a ) #define bli_zsddots( x, y, a ) bli_zsdaxpys( x, y, a ) #define bli_sdddots( x, y, a ) bli_sddaxpys( x, y, a ) #define bli_ddddots( x, y, a ) bli_dddaxpys( x, y, a ) #define bli_cdddots( x, y, a ) bli_cddaxpys( x, y, a ) #define bli_zdddots( x, y, a ) bli_zddaxpys( x, y, a ) #define bli_scddots( x, y, a ) bli_scdaxpys( x, y, a ) #define bli_dcddots( x, y, a ) bli_dcdaxpys( x, y, a ) #define bli_ccddots( x, y, a ) bli_ccdaxpys( x, y, a ) #define bli_zcddots( x, y, a ) bli_zcdaxpys( x, y, a ) #define bli_szddots( x, y, a ) bli_szdaxpys( x, y, a ) #define bli_dzddots( x, y, a ) bli_dzdaxpys( x, y, a ) #define bli_czddots( x, y, a ) bli_czdaxpys( x, y, a ) #define bli_zzddots( x, y, a ) bli_zzdaxpys( x, y, a ) #define bli_sscdots( x, y, a ) bli_sscaxpys( x, y, a ) #define bli_dscdots( x, y, a ) bli_dscaxpys( x, y, a ) #define bli_cscdots( x, y, a ) bli_cscaxpys( x, y, a ) #define bli_zscdots( x, y, a ) bli_zscaxpys( x, y, a ) #define 
bli_sdcdots( x, y, a ) bli_sdcaxpys( x, y, a ) #define bli_ddcdots( x, y, a ) bli_ddcaxpys( x, y, a ) #define bli_cdcdots( x, y, a ) bli_cdcaxpys( x, y, a ) #define bli_zdcdots( x, y, a ) bli_zdcaxpys( x, y, a ) #define bli_sccdots( x, y, a ) bli_sccaxpys( x, y, a ) #define bli_dccdots( x, y, a ) bli_dccaxpys( x, y, a ) #define bli_cccdots( x, y, a ) bli_cccaxpys( x, y, a ) #define bli_zccdots( x, y, a ) bli_zccaxpys( x, y, a ) #define bli_szcdots( x, y, a ) bli_szcaxpys( x, y, a ) #define bli_dzcdots( x, y, a ) bli_dzcaxpys( x, y, a ) #define bli_czcdots( x, y, a ) bli_czcaxpys( x, y, a ) #define bli_zzcdots( x, y, a ) bli_zzcaxpys( x, y, a ) #define bli_sszdots( x, y, a ) bli_sszaxpys( x, y, a ) #define bli_dszdots( x, y, a ) bli_dszaxpys( x, y, a ) #define bli_cszdots( x, y, a ) bli_cszaxpys( x, y, a ) #define bli_zszdots( x, y, a ) bli_zszaxpys( x, y, a ) #define bli_sdzdots( x, y, a ) bli_sdzaxpys( x, y, a ) #define bli_ddzdots( x, y, a ) bli_ddzaxpys( x, y, a ) #define bli_cdzdots( x, y, a ) bli_cdzaxpys( x, y, a ) #define bli_zdzdots( x, y, a ) bli_zdzaxpys( x, y, a ) #define bli_sczdots( x, y, a ) bli_sczaxpys( x, y, a ) #define bli_dczdots( x, y, a ) bli_dczaxpys( x, y, a ) #define bli_cczdots( x, y, a ) bli_cczaxpys( x, y, a ) #define bli_zczdots( x, y, a ) bli_zczaxpys( x, y, a ) #define bli_szzdots( x, y, a ) bli_szzaxpys( x, y, a ) #define bli_dzzdots( x, y, a ) bli_dzzaxpys( x, y, a ) #define bli_czzdots( x, y, a ) bli_czzaxpys( x, y, a ) #define bli_zzzdots( x, y, a ) bli_zzzaxpys( x, y, a ) #define bli_sdots( x, y, a ) bli_sssdots( x, y, a ) #define bli_ddots( x, y, a ) bli_ddddots( x, y, a ) #define bli_cdots( x, y, a ) bli_cccdots( x, y, a ) #define bli_zdots( x, y, a ) bli_zzzdots( x, y, a ) #endif // end bli_dots.h // begin bli_dotjs.h #ifndef BLIS_DOTJS_H #define BLIS_DOTJS_H // dotjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. // - The third char encodes the type of rho. 
// - x is used in conjugated form. #define bli_sssdotjs( x, y, a ) bli_sssaxpyjs( y, x, a ) #define bli_dssdotjs( x, y, a ) bli_sdsaxpyjs( y, x, a ) #define bli_cssdotjs( x, y, a ) bli_scsaxpyjs( y, x, a ) #define bli_zssdotjs( x, y, a ) bli_szsaxpyjs( y, x, a ) #define bli_sdsdotjs( x, y, a ) bli_dssaxpyjs( y, x, a ) #define bli_ddsdotjs( x, y, a ) bli_ddsaxpyjs( y, x, a ) #define bli_cdsdotjs( x, y, a ) bli_dcsaxpyjs( y, x, a ) #define bli_zdsdotjs( x, y, a ) bli_dzsaxpyjs( y, x, a ) #define bli_scsdotjs( x, y, a ) bli_cssaxpyjs( y, x, a ) #define bli_dcsdotjs( x, y, a ) bli_cdsaxpyjs( y, x, a ) #define bli_ccsdotjs( x, y, a ) bli_ccsaxpyjs( y, x, a ) #define bli_zcsdotjs( x, y, a ) bli_czsaxpyjs( y, x, a ) #define bli_szsdotjs( x, y, a ) bli_zssaxpyjs( y, x, a ) #define bli_dzsdotjs( x, y, a ) bli_zdsaxpyjs( y, x, a ) #define bli_czsdotjs( x, y, a ) bli_zcsaxpyjs( y, x, a ) #define bli_zzsdotjs( x, y, a ) bli_zzsaxpyjs( y, x, a ) #define bli_ssddotjs( x, y, a ) bli_ssdaxpyjs( y, x, a ) #define bli_dsddotjs( x, y, a ) bli_sddaxpyjs( y, x, a ) #define bli_csddotjs( x, y, a ) bli_scdaxpyjs( y, x, a ) #define bli_zsddotjs( x, y, a ) bli_szdaxpyjs( y, x, a ) #define bli_sdddotjs( x, y, a ) bli_dsdaxpyjs( y, x, a ) #define bli_ddddotjs( x, y, a ) bli_dddaxpyjs( y, x, a ) #define bli_cdddotjs( x, y, a ) bli_dcdaxpyjs( y, x, a ) #define bli_zdddotjs( x, y, a ) bli_dzdaxpyjs( y, x, a ) #define bli_scddotjs( x, y, a ) bli_csdaxpyjs( y, x, a ) #define bli_dcddotjs( x, y, a ) bli_cddaxpyjs( y, x, a ) #define bli_ccddotjs( x, y, a ) bli_ccdaxpyjs( y, x, a ) #define bli_zcddotjs( x, y, a ) bli_czdaxpyjs( y, x, a ) #define bli_szddotjs( x, y, a ) bli_zsdaxpyjs( y, x, a ) #define bli_dzddotjs( x, y, a ) bli_zddaxpyjs( y, x, a ) #define bli_czddotjs( x, y, a ) bli_zcdaxpyjs( y, x, a ) #define bli_zzddotjs( x, y, a ) bli_zzdaxpyjs( y, x, a ) #define bli_sscdotjs( x, y, a ) bli_sscaxpyjs( y, x, a ) #define bli_dscdotjs( x, y, a ) bli_sdcaxpyjs( y, x, a ) #define bli_cscdotjs( x, 
y, a ) bli_sccaxpyjs( y, x, a ) #define bli_zscdotjs( x, y, a ) bli_szcaxpyjs( y, x, a ) #define bli_sdcdotjs( x, y, a ) bli_dscaxpyjs( y, x, a ) #define bli_ddcdotjs( x, y, a ) bli_ddcaxpyjs( y, x, a ) #define bli_cdcdotjs( x, y, a ) bli_dccaxpyjs( y, x, a ) #define bli_zdcdotjs( x, y, a ) bli_dzcaxpyjs( y, x, a ) #define bli_sccdotjs( x, y, a ) bli_cscaxpyjs( y, x, a ) #define bli_dccdotjs( x, y, a ) bli_cdcaxpyjs( y, x, a ) #define bli_cccdotjs( x, y, a ) bli_cccaxpyjs( y, x, a ) #define bli_zccdotjs( x, y, a ) bli_czcaxpyjs( y, x, a ) #define bli_szcdotjs( x, y, a ) bli_zscaxpyjs( y, x, a ) #define bli_dzcdotjs( x, y, a ) bli_zdcaxpyjs( y, x, a ) #define bli_czcdotjs( x, y, a ) bli_zccaxpyjs( y, x, a ) #define bli_zzcdotjs( x, y, a ) bli_zzcaxpyjs( y, x, a ) #define bli_sszdotjs( x, y, a ) bli_sszaxpyjs( y, x, a ) #define bli_dszdotjs( x, y, a ) bli_sdzaxpyjs( y, x, a ) #define bli_cszdotjs( x, y, a ) bli_sczaxpyjs( y, x, a ) #define bli_zszdotjs( x, y, a ) bli_szzaxpyjs( y, x, a ) #define bli_sdzdotjs( x, y, a ) bli_dszaxpyjs( y, x, a ) #define bli_ddzdotjs( x, y, a ) bli_ddzaxpyjs( y, x, a ) #define bli_cdzdotjs( x, y, a ) bli_dczaxpyjs( y, x, a ) #define bli_zdzdotjs( x, y, a ) bli_dzzaxpyjs( y, x, a ) #define bli_sczdotjs( x, y, a ) bli_cszaxpyjs( y, x, a ) #define bli_dczdotjs( x, y, a ) bli_cdzaxpyjs( y, x, a ) #define bli_cczdotjs( x, y, a ) bli_cczaxpyjs( y, x, a ) #define bli_zczdotjs( x, y, a ) bli_czzaxpyjs( y, x, a ) #define bli_szzdotjs( x, y, a ) bli_zszaxpyjs( y, x, a ) #define bli_dzzdotjs( x, y, a ) bli_zdzaxpyjs( y, x, a ) #define bli_czzdotjs( x, y, a ) bli_zczaxpyjs( y, x, a ) #define bli_zzzdotjs( x, y, a ) bli_zzzaxpyjs( y, x, a ) #define bli_sdotjs( x, y, a ) bli_sssdotjs( x, y, a ) #define bli_ddotjs( x, y, a ) bli_ddddotjs( x, y, a ) #define bli_cdotjs( x, y, a ) bli_cccdotjs( x, y, a ) #define bli_zdotjs( x, y, a ) bli_zzzdotjs( x, y, a ) #endif // end bli_dotjs.h // begin bli_eq.h #ifndef BLIS_EQ_H #define BLIS_EQ_H // eq (passed by 
value) #define bli_seq( a, b ) ( (a) == (b) ) #define bli_deq( a, b ) ( (a) == (b) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ceq( a, b ) ( ( bli_creal(a) == bli_creal(b) ) && ( bli_cimag(a) == bli_cimag(b) ) ) #define bli_zeq( a, b ) ( ( bli_zreal(a) == bli_zreal(b) ) && ( bli_zimag(a) == bli_zimag(b) ) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ceq( a, b ) ( (a) == (b) ) #define bli_zeq( a, b ) ( (a) == (b) ) #endif // BLIS_ENABLE_C99_COMPLEX #define bli_ieq( a, b ) ( (a) == (b) ) // eqtori (passed by value) #define bli_seqtori( a, br, bi ) ( (a) == (br) ) #define bli_deqtori( a, br, bi ) ( (a) == (br) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_ceqtori( a, br, bi ) ( ( bli_creal(a) == (br) ) && ( bli_cimag(a) == (bi) ) ) #define bli_zeqtori( a, br, bi ) ( ( bli_zreal(a) == (br) ) && ( bli_zimag(a) == (bi) ) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_ceqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) ) #define bli_zeqtori( a, br, bi ) ( (a) == (br) + (bi) * (I) ) #endif // BLIS_ENABLE_C99_COMPLEX // eqa (passed by address) #define bli_seqa( a, b ) bli_seq( *(( float* )(a)), *(( float* )(b)) ) #define bli_deqa( a, b ) bli_deq( *(( double* )(a)), *(( double* )(b)) ) #define bli_ceqa( a, b ) bli_ceq( *(( scomplex* )(a)), *(( scomplex* )(b)) ) #define bli_zeqa( a, b ) bli_zeq( *(( dcomplex* )(a)), *(( dcomplex* )(b)) ) #define bli_ieqa( a, b ) bli_ieq( *(( gint_t* )(a)), *(( gint_t* )(b)) ) // eq1 #define bli_seq1( a ) bli_seqtori( (a), 1.0F, 0.0F ) #define bli_deq1( a ) bli_deqtori( (a), 1.0, 0.0 ) #define bli_ceq1( a ) bli_ceqtori( (a), 1.0F, 0.0F ) #define bli_zeq1( a ) bli_zeqtori( (a), 1.0, 0.0 ) #define bli_ieq1( a ) bli_ieq ( (a), 1 ) // eq0 #define bli_seq0( a ) bli_seqtori( (a), 0.0F, 0.0F ) #define bli_deq0( a ) bli_deqtori( (a), 0.0, 0.0 ) #define bli_ceq0( a ) bli_ceqtori( (a), 0.0F, 0.0F ) #define bli_zeq0( a ) bli_zeqtori( (a), 0.0, 0.0 ) #define bli_ieq0( a ) bli_ieq ( (a), 0 ) // eqm1 #define bli_seqm1( a ) bli_seqtori( (a), 
-1.0F, 0.0F ) #define bli_deqm1( a ) bli_deqtori( (a), -1.0, 0.0 ) #define bli_ceqm1( a ) bli_ceqtori( (a), -1.0F, 0.0F ) #define bli_zeqm1( a ) bli_zeqtori( (a), -1.0, 0.0 ) #define bli_ieqm1( a ) bli_ieq ( (a), -1 ) #endif // end bli_eq.h // begin bli_fprints.h #ifndef BLIS_FPRINTS_H #define BLIS_FPRINTS_H // prints #define bli_sfprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #define bli_dfprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #define bli_cfprints( file, spec, x ) \ { \ fprintf( file, spec, bli_creal(x) ); \ fprintf( file, " + " ); \ fprintf( file, spec, bli_cimag(x) ); \ fprintf( file, " " ); \ } #define bli_zfprints( file, spec, x ) \ { \ fprintf( file, spec, bli_zreal(x) ); \ fprintf( file, " + " ); \ fprintf( file, spec, bli_zimag(x) ); \ fprintf( file, " " ); \ } #define bli_ifprints( file, spec, x ) \ { \ fprintf( file, spec, (x) ); \ } #endif // end bli_fprints.h // begin bli_inverts.h #ifndef BLIS_INVERTS_H #define BLIS_INVERTS_H // inverts // Notes: // - The first char encodes the type of x. #define bli_sinverts( x ) bli_sinvertris( bli_sreal(x), bli_simag(x) ) #define bli_dinverts( x ) bli_dinvertris( bli_dreal(x), bli_dimag(x) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_cinverts( x ) bli_cinvertris( bli_creal(x), bli_cimag(x) ) #define bli_zinverts( x ) bli_zinvertris( bli_zreal(x), bli_zimag(x) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_cinverts( x ) { (x) = 1.0F / (x); } #define bli_zinverts( x ) { (x) = 1.0 / (x); } #endif // BLIS_ENABLE_C99_COMPLEX #endif // end bli_inverts.h // begin bli_invscals.h #ifndef BLIS_INVSCALS_H #define BLIS_INVSCALS_H // invscals // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. 
#define bli_ssinvscals( a, y ) bli_sinvscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsinvscals( a, y ) bli_sinvscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csinvscals( a, y ) bli_sinvscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsinvscals( a, y ) bli_sinvscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdinvscals( a, y ) bli_dinvscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddinvscals( a, y ) bli_dinvscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdinvscals( a, y ) bli_dinvscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdinvscals( a, y ) bli_dinvscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscals( a, y ) bli_scinvscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcinvscals( a, y ) bli_scinvscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccinvscals( a, y ) bli_cinvscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcinvscals( a, y ) bli_cinvscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szinvscals( a, y ) bli_dzinvscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzinvscals( a, y ) bli_dzinvscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czinvscals( a, y ) bli_zinvscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzinvscals( a, y ) bli_zinvscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscals( a, y ) { (y) /= (a); } #define bli_dcinvscals( a, y ) { (y) /= (a); } #define bli_ccinvscals( a, y ) { (y) /= (a); } #define bli_zcinvscals( a, y ) { (y) /= (a); } #define bli_szinvscals( a, y ) { (y) /= (a); } #define 
bli_dzinvscals( a, y ) { (y) /= (a); } #define bli_czinvscals( a, y ) { (y) /= (a); } #define bli_zzinvscals( a, y ) { (y) /= (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sinvscals( a, y ) bli_ssinvscals( a, y ) #define bli_dinvscals( a, y ) bli_ddinvscals( a, y ) #define bli_cinvscals( a, y ) bli_ccinvscals( a, y ) #define bli_zinvscals( a, y ) bli_zzinvscals( a, y ) #endif // end bli_invscals.h // begin bli_invscaljs.h #ifndef BLIS_INVSCALJS_H #define BLIS_INVSCALJS_H // invscaljs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssinvscaljs( a, y ) bli_sinvscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsinvscaljs( a, y ) bli_sinvscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csinvscaljs( a, y ) bli_sinvscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsinvscaljs( a, y ) bli_sinvscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdinvscaljs( a, y ) bli_dinvscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddinvscaljs( a, y ) bli_dinvscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdinvscaljs( a, y ) bli_dinvscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdinvscaljs( a, y ) bli_dinvscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscaljs( a, y ) bli_scinvscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcinvscaljs( a, y ) bli_scinvscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccinvscaljs( a, y ) bli_cinvscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcinvscaljs( a, y ) bli_cinvscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szinvscaljs( a, y ) bli_dzinvscaljris( bli_sreal(a), bli_simag(a), 
bli_zreal(y), bli_zimag(y) ) #define bli_dzinvscaljs( a, y ) bli_dzinvscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czinvscaljs( a, y ) bli_zinvscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzinvscaljs( a, y ) bli_zinvscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scinvscaljs( a, y ) { (y) /= (a); } #define bli_dcinvscaljs( a, y ) { (y) /= (a); } #define bli_ccinvscaljs( a, y ) { (y) /= conjf(a); } #define bli_zcinvscaljs( a, y ) { (y) /= conj (a); } #define bli_szinvscaljs( a, y ) { (y) /= (a); } #define bli_dzinvscaljs( a, y ) { (y) /= (a); } #define bli_czinvscaljs( a, y ) { (y) /= conjf(a); } #define bli_zzinvscaljs( a, y ) { (y) /= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sinvscaljs( a, y ) bli_ssinvscaljs( a, y ) #define bli_dinvscaljs( a, y ) bli_ddinvscaljs( a, y ) #define bli_cinvscaljs( a, y ) bli_ccinvscaljs( a, y ) #define bli_zinvscaljs( a, y ) bli_zzinvscaljs( a, y ) #endif // end bli_invscaljs.h // begin bli_neg2s.h #ifndef BLIS_NEG2S_H #define BLIS_NEG2S_H // neg2s // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_ssneg2s( x, y ) bli_sneg2ris( bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dsneg2s( x, y ) bli_sneg2ris( bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_csneg2s( x, y ) bli_sneg2ris( bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zsneg2s( x, y ) bli_sneg2ris( bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdneg2s( x, y ) bli_dneg2ris( bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddneg2s( x, y ) bli_dneg2ris( bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdneg2s( x, y ) bli_dneg2ris( bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdneg2s( x, y ) bli_dneg2ris( bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) bli_cneg2ris( bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dcneg2s( x, y ) bli_cneg2ris( bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ccneg2s( x, y ) bli_cneg2ris( bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zcneg2s( x, y ) bli_cneg2ris( bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szneg2s( x, y ) bli_zneg2ris( bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzneg2s( x, y ) bli_zneg2ris( bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czneg2s( x, y ) bli_zneg2ris( bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzneg2s( x, y ) bli_zneg2ris( bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scneg2s( x, y ) { (y) = -(x); } #define bli_dcneg2s( x, y ) { (y) = -(x); } #define bli_ccneg2s( x, y ) { (y) = -(x); } #define bli_zcneg2s( x, y ) { (y) = -(x); } #define bli_szneg2s( x, y ) { (y) = -(x); } #define bli_dzneg2s( x, y ) { (y) = -(x); } #define bli_czneg2s( x, y ) { (y) = -(x); } #define bli_zzneg2s( x, y ) { (y) = 
-(x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sneg2s( x, y ) bli_ssneg2s( x, y ) #define bli_dneg2s( x, y ) bli_ddneg2s( x, y ) #define bli_cneg2s( x, y ) bli_ccneg2s( x, y ) #define bli_zneg2s( x, y ) bli_zzneg2s( x, y ) #endif // end bli_neg2s.h // begin bli_rands.h #ifndef BLIS_RANDS_H #define BLIS_RANDS_H // rands #define bli_srands( a ) \ { \ (a) = ( float ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0F; \ } #define bli_drands( a ) \ { \ (a) = ( double ) ( ( double ) rand() / \ ( ( double ) RAND_MAX / 2.0 ) \ ) - 1.0; \ } #define bli_crands( a ) \ { \ float ar, ai; \ \ bli_srands( ar ); \ bli_srands( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrands( a ) \ { \ double ar, ai; \ \ bli_drands( ar ); \ bli_drands( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_rands.h // begin bli_randnp2s.h #ifndef BLIS_RANDNP2S_H #define BLIS_RANDNP2S_H // randnp2s #define bli_srandnp2s( a ) \ { \ bli_drandnp2s( a ); \ } #if 0 #define bli_drandnp2s_prev( a ) \ { \ const double m_max = 3.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ if ( t == m_max2 ) t = t - 1.0; \ \ \ t = floor( t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_exp, s_val; \ \ \ PASTEMAC(d,rands)( s_exp ); \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_exp < 0.0 ) r_val = pow( 2.0, -(t - 1.0) ); \ else r_val = pow( 2.0, t - 1.0 ); \ \ \ if ( s_val < 0.0 ) r_val = -r_val; \ } \ \ \ r_val = r_val / pow( 2.0, m_max ); \ \ \ \ a = r_val; \ } #endif #define bli_drandnp2s( a ) \ { \ const double m_max = 6.0; \ const double m_max2 = m_max + 2.0; \ double t; \ double r_val; \ \ \ \ do \ { \ \ t = ( ( double ) rand() / ( double ) RAND_MAX ) * m_max2; \ \ \ t = floor( t ); \ } \ \ while ( m_max2 <= t ); \ \ \ if ( t == 0.0 ) r_val = 0.0; \ else \ { \ \ \ double s_val; \ \ \ r_val = pow( 2.0, -(t - 1.0) ); \ \ \ PASTEMAC(d,rands)( s_val ); \ \ \ if ( s_val < 0.0 ) r_val 
= -r_val; \ } \ \ \ \ a = r_val; \ } #define bli_crandnp2s( a ) \ { \ float ar, ai; \ \ bli_srandnp2s( ar ); \ bli_srandnp2s( ai ); \ \ bli_csets( ar, ai, (a) ); \ } #define bli_zrandnp2s( a ) \ { \ double ar, ai; \ \ bli_drandnp2s( ar ); \ bli_drandnp2s( ai ); \ \ bli_zsets( ar, ai, (a) ); \ } #endif // end bli_randnp2s.h // begin bli_scals.h #ifndef BLIS_SCALS_H #define BLIS_SCALS_H // scals // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscals( a, y ) bli_sscalris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscals( a, y ) bli_sscalris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscals( a, y ) bli_sscalris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscals( a, y ) bli_sscalris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscals( a, y ) bli_dscalris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscals( a, y ) bli_dscalris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscals( a, y ) bli_dscalris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscals( a, y ) bli_dscalris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) bli_scscalris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscals( a, y ) bli_scscalris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscals( a, y ) bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscals( a, y ) bli_cscalris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscals( a, y ) bli_dzscalris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscals( a, y ) bli_dzscalris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscals( a, y ) bli_zscalris( bli_creal(a), bli_cimag(a), bli_zreal(y), 
bli_zimag(y) ) #define bli_zzscals( a, y ) bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscals( a, y ) { (y) *= (a); } #define bli_dcscals( a, y ) { (y) *= (a); } #define bli_ccscals( a, y ) { (y) *= (a); } #define bli_zcscals( a, y ) { (y) *= (a); } #define bli_szscals( a, y ) { (y) *= (a); } #define bli_dzscals( a, y ) { (y) *= (a); } #define bli_czscals( a, y ) { (y) *= (a); } #define bli_zzscals( a, y ) { (y) *= (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscals( a, y ) bli_ssscals( a, y ) #define bli_dscals( a, y ) bli_ddscals( a, y ) #define bli_cscals( a, y ) bli_ccscals( a, y ) #define bli_zscals( a, y ) bli_zzscals( a, y ) #endif // end bli_scals.h // begin bli_scaljs.h #ifndef BLIS_SCALJS_H #define BLIS_SCALJS_H // scaljs // Notes: // - The first char encodes the type of a. // - The second char encodes the type of y. #define bli_ssscaljs( a, y ) bli_sscaljris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) ) #define bli_dsscaljs( a, y ) bli_sscaljris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) ) #define bli_csscaljs( a, y ) bli_sscaljris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) ) #define bli_zsscaljs( a, y ) bli_sscaljris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) ) #define bli_sdscaljs( a, y ) bli_dscaljris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) ) #define bli_ddscaljs( a, y ) bli_dscaljris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_cdscaljs( a, y ) bli_dscaljris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) ) #define bli_zdscaljs( a, y ) bli_dscaljris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) bli_scscaljris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) ) #define bli_dcscaljs( a, y ) bli_scscaljris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) ) #define bli_ccscaljs( a, y ) 
bli_cscaljris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) ) #define bli_zcscaljs( a, y ) bli_cscaljris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) ) #define bli_szscaljs( a, y ) bli_dzscaljris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) ) #define bli_dzscaljs( a, y ) bli_dzscaljris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_czscaljs( a, y ) bli_zscaljris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) ) #define bli_zzscaljs( a, y ) bli_zscaljris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscaljs( a, y ) { (y) *= (a); } #define bli_dcscaljs( a, y ) { (y) *= (a); } #define bli_ccscaljs( a, y ) { (y) *= conjf(a); } #define bli_zcscaljs( a, y ) { (y) *= conj (a); } #define bli_szscaljs( a, y ) { (y) *= (a); } #define bli_dzscaljs( a, y ) { (y) *= (a); } #define bli_czscaljs( a, y ) { (y) *= conjf(a); } #define bli_zzscaljs( a, y ) { (y) *= conj (a); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscaljs( a, y ) bli_ssscaljs( a, y ) #define bli_dscaljs( a, y ) bli_ddscaljs( a, y ) #define bli_cscaljs( a, y ) bli_ccscaljs( a, y ) #define bli_zscaljs( a, y ) bli_zzscaljs( a, y ) #endif // end bli_scaljs.h // begin bli_scalcjs.h #ifndef BLIS_SCALCJS_H #define BLIS_SCALCJS_H // scalcjs // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_ssscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_csscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zsscalcjs( conjx, x, y ) bli_sscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ddscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zdscalcjs( conjx, x, y ) bli_dscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX #define bli_scscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dcscalcjs( conjx, x, y ) bli_scscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ccscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zcscalcjs( conjx, x, y ) bli_cscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzscalcjs( conjx, x, y ) bli_dzscalcjris( conjx, bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzscalcjs( conjx, x, y ) bli_zscalcjris( conjx, bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX #define bli_scscalcjs( conjx, x, y ) { (y) *= (x); } 
#define bli_dcscalcjs( conjx, x, y ) { (y) *= (x); } #define bli_ccscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zcscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #define bli_szscalcjs( conjx, x, y ) { (y) *= (x); } #define bli_dzscalcjs( conjx, x, y ) { (y) *= (x); } #define bli_czscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conjf(x) : (x) ); } #define bli_zzscalcjs( conjx, x, y ) { (y) *= ( bli_is_conj( conjx ) ? conj (x) : (x) ); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscalcjs( conjx, x, y ) bli_ssscalcjs( conjx, x, y ) #define bli_dscalcjs( conjx, x, y ) bli_ddscalcjs( conjx, x, y ) #define bli_cscalcjs( conjx, x, y ) bli_ccscalcjs( conjx, x, y ) #define bli_zscalcjs( conjx, x, y ) bli_zzscalcjs( conjx, x, y ) #endif // end bli_scalcjs.h // begin bli_scal2s.h #ifndef BLIS_SCAL2S_H #define BLIS_SCAL2S_H // scal2s // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_dssscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_cssscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_zssscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) ) #define bli_sdsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ddsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_cdsscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zdsscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) ) #define bli_scsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dcsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_ccsscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zcsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) ) #define bli_szsscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_dzsscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_czsscal2s( a, x, y ) 
bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) #define bli_zzsscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) ) // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dsdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_csdscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zsdscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) ) #define bli_sddscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dddscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_cddscal2s( a, x, y ) bli_rxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zddscal2s( a, x, y ) bli_rxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_scdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dcdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_ccdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zcdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_szdscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), 
bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_dzdscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_czdscal2s( a, x, y ) bli_roscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #define bli_zzdscal2s( a, x, y ) bli_roscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_dscscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_cscscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_zscscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) ) #define bli_sdcscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_ddcscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cdcscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zdcscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) ) #define bli_sccscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dccscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_cccscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), 
bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zccscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) ) #define bli_szcscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_dzcscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_czcscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) #define bli_zzcscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) ) // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dszscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cszscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zszscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sdzscal2s( a, x, y ) bli_rxscal2ris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_ddzscal2s( a, x, y ) bli_rxscal2ris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cdzscal2s( a, x, y ) bli_rcscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zdzscal2s( a, x, y ) bli_rcscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_sczscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_cczscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zczscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_szzscal2s( a, x, y ) bli_crscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_dzzscal2s( a, x, y ) bli_crscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_czzscal2s( a, x, y ) bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #define bli_zzzscal2s( a, x, y ) bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zscscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_ddcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zdcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zccscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_szcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dzcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_czcscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zzcscal2s( a, x, y ) { (y) = (a) * (x); } // -- (axy) = (??z) 
------------------------------------------------------------ #define bli_sszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zszscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_ddzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zdzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_sczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_cczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zczscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_szzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_dzzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_czzscal2s( a, x, y ) { (y) = (a) * (x); } #define bli_zzzscal2s( a, x, y ) { (y) = (a) * (x); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sscal2s( a, x, y ) bli_sssscal2s( a, x, y ) #define bli_dscal2s( a, x, y ) bli_dddscal2s( a, x, y ) #define bli_cscal2s( a, x, y ) bli_cccscal2s( a, x, y ) #define bli_zscal2s( a, x, y ) bli_zzzscal2s( a, x, y ) #endif // end bli_scal2s.h // begin bli_scal2js.h #ifndef BLIS_SCAL2JS_H #define BLIS_SCAL2JS_H // scal2js // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------

// Every macro in the scal2js tables below is a thin dispatcher. It splits each
// operand into real and imaginary parts using the accessor pair that matches
// the operand's type character (bli_?real / bli_?imag) and forwards the six
// parts, in the order a, x, y, to a bli_??scal2jris helper that is not defined
// in this section. For reference, the C99 definitions further down this header
// compute (y) = (a) * conj(x).
//
// Real destination y (type chars s and d): the helper is bli_rxscal2jris,
// except when both a and x are complex (c or z), where bli_roscal2jris is
// used instead.

#define bli_sssscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_dssscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_cssscal2js( a, x, y )  bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )
#define bli_zssscal2js( a, x, y )  bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_sreal(y), bli_simag(y) )

#define bli_sdsscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ddsscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_cdsscal2js( a, x, y )  bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zdsscal2js( a, x, y )  bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_sreal(y), bli_simag(y) )

#define bli_scsscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dcsscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_ccsscal2js( a, x, y )  bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zcsscal2js( a, x, y )  bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_sreal(y), bli_simag(y) )

#define bli_szsscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_dzsscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_czsscal2js( a, x, y )  bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )
#define bli_zzsscal2js( a, x, y )  bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_sreal(y), bli_simag(y) )

// -- (axy) = (??d) ------------------------------------------------------------

// Same helper selection as the (??s) table; only the destination accessors
// change to bli_dreal / bli_dimag.

#define bli_ssdscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dsdscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_csdscal2js( a, x, y )  bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zsdscal2js( a, x, y )  bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_dreal(y), bli_dimag(y) )

#define bli_sddscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dddscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_cddscal2js( a, x, y )  bli_rxscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zddscal2js( a, x, y )  bli_rxscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_scdscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dcdscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_ccdscal2js( a, x, y )  bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zcdscal2js( a, x, y )  bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_dreal(y), bli_dimag(y) )

#define bli_szdscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_dzdscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_czdscal2js( a, x, y )  bli_roscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )
#define bli_zzdscal2js( a, x, y )  bli_roscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_dreal(y), bli_dimag(y) )

// The complex-destination variants exist in two flavours. This branch is used
// when BLIS_ENABLE_C99_COMPLEX is NOT defined and keeps going through the
// real/imaginary helpers. The helper is selected by the domains of a and x:
//   a real    (s,d), x real    (s,d)  ->  bli_rxscal2jris
//   a complex (c,z), x real    (s,d)  ->  bli_rcscal2jris
//   a real    (s,d), x complex (c,z)  ->  bli_crscal2jris
//   a complex (c,z), x complex (c,z)  ->  bli_cxscal2jris
#ifndef BLIS_ENABLE_C99_COMPLEX

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_dscscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_cscscal2js( a, x, y )  bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )
#define bli_zscscal2js( a, x, y )  bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_creal(y), bli_cimag(y) )

#define bli_sdcscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_ddcscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cdcscal2js( a, x, y )  bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zdcscal2js( a, x, y )  bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_creal(y), bli_cimag(y) )

#define bli_sccscal2js( a, x, y )  bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dccscal2js( a, x, y )  bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_cccscal2js( a, x, y )  bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zccscal2js( a, x, y )  bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_creal(y), bli_cimag(y) )

#define bli_szcscal2js( a, x, y )  bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_dzcscal2js( a, x, y )  bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_czcscal2js( a, x, y )  bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )
#define bli_zzcscal2js( a, x, y )  bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_creal(y), bli_cimag(y) )

// -- (axy) = (??z) ------------------------------------------------------------

// Variants with a real x (second char s or d); the variants with a complex x
// follow immediately after this group.

#define bli_sszscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dszscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cszscal2js( a, x, y )  bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zszscal2js( a, x, y )  bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(y), bli_zimag(y) )

#define bli_sdzscal2js( a, x, y )  bli_rxscal2jris( bli_sreal(a), bli_simag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_ddzscal2js( a, x, y )  bli_rxscal2jris( bli_dreal(a), bli_dimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cdzscal2js( a, x, y )  bli_rcscal2jris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zdzscal2js( a, x, y )  bli_rcscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(y), bli_zimag(y) )
// Remaining non-C99 (??z) scal2js variants: x is complex (second char c or z).
// A real a (s,d) dispatches to bli_crscal2jris, a complex a (c,z) to
// bli_cxscal2jris.

#define bli_sczscal2js( a, x, y )  bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dczscal2js( a, x, y )  bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_cczscal2js( a, x, y )  bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zczscal2js( a, x, y )  bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(y), bli_zimag(y) )

#define bli_szzscal2js( a, x, y )  bli_crscal2jris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_dzzscal2js( a, x, y )  bli_crscal2jris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_czzscal2js( a, x, y )  bli_cxscal2jris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )
#define bli_zzzscal2js( a, x, y )  bli_cxscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex enabled, the complex-destination variants are written
// directly as an assignment inside a braced block. x is multiplied as-is when
// its type char is s or d, and is conjugated when it is complex: conjf() for
// type char c, conj() for type char z. The type of a does not affect which
// form is used.

// -- (axy) = (??c) ------------------------------------------------------------

#define bli_sscscal2js( a, x, y )  { (y) = (a) * (x); }
#define bli_dscscal2js( a, x, y )  { (y) = (a) * (x); }
#define bli_cscscal2js( a, x, y )  { (y) = (a) * (x); }
#define bli_zscscal2js( a, x, y )  { (y) = (a) * (x); }

#define bli_sdcscal2js( a, x, y )  { (y) = (a) * (x); }
#define bli_ddcscal2js( a, x, y )  { (y) = (a) * (x); }
#define bli_cdcscal2js( a, x, y )  { (y) = (a) * (x); }
#define bli_zdcscal2js( a, x, y )  { (y) = (a) * (x); }

#define bli_sccscal2js( a, x, y )  { (y) = (a) * conjf(x); }
#define bli_dccscal2js( a, x, y )  { (y) = (a) * conjf(x); }
#define bli_cccscal2js( a, x, y )  { (y) = (a) * conjf(x); }
#define bli_zccscal2js( a, x, y )  { (y) = (a) * conjf(x); }

#define bli_szcscal2js( a, x, y )  { (y) = (a) * conj(x); }
#define bli_dzcscal2js( a, x, y )  { (y) = (a) * conj(x); }
#define bli_czcscal2js( a, x, y )  { (y) = (a) * conj(x); }
#define bli_zzcscal2js( a, x, y )  { (y) = (a) * conj(x); }

// -- (axy) = (??z) ------------------------------------------------------------

#define bli_sszscal2js( a, x, y )  { (y) = (a) * (x); }
#define bli_dszscal2js( a, x, y )  { (y) = (a) * (x); }
#define bli_cszscal2js( a, x, y )  { (y) = (a) * (x); }
#define bli_zszscal2js( a, x, y )  { (y) = (a) * (x); }

#define bli_sdzscal2js( a, x, y )  { (y) = (a) * (x); }
#define bli_ddzscal2js( a, x, y )  { (y) = (a) * (x); }
#define bli_cdzscal2js( a, x, y )  { (y) = (a) * (x); }
#define bli_zdzscal2js( a, x, y )  { (y) = (a) * (x); }

#define bli_sczscal2js( a, x, y )  { (y) = (a) * conjf(x); }
#define bli_dczscal2js( a, x, y )  { (y) = (a) * conjf(x); }
#define bli_cczscal2js( a, x, y )  { (y) = (a) * conjf(x); }
#define bli_zczscal2js( a, x, y )  { (y) = (a) * conjf(x); }

#define bli_szzscal2js( a, x, y )  { (y) = (a) * conj(x); }
#define bli_dzzscal2js( a, x, y )  { (y) = (a) * conj(x); }
#define bli_czzscal2js( a, x, y )  { (y) = (a) * conj(x); }
#define bli_zzzscal2js( a, x, y )  { (y) = (a) * conj(x); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: a single type character selects the variant in which
// a, x and y all share that type.

#define bli_sscal2js( a, x, y )  bli_sssscal2js( a, x, y )
#define bli_dscal2js( a, x, y )  bli_dddscal2js( a, x, y )
#define bli_cscal2js( a, x, y )  bli_cccscal2js( a, x, y )
#define bli_zscal2js( a, x, y )  bli_zzzscal2js( a, x, y )

#endif
// end bli_scal2js.h
// begin bli_set0s.h

#ifndef BLIS_SET0S_H
#define BLIS_SET0S_H

// set0s: forward two zero literals and the destination a to bli_?sets.
// Single-precision types (s, c) pass float literals (0.0F); double-precision
// types (d, z) pass double literals (0.0).
// NOTE(review): the two leading arguments are presumably the real and
// imaginary parts to store; bli_?sets is not visible in this section.

#define bli_sset0s( a )  bli_ssets( 0.0F, 0.0F, (a) )
#define bli_dset0s( a )  bli_dsets( 0.0 , 0.0 , (a) )
#define bli_cset0s( a )  bli_csets( 0.0F, 0.0F, (a) )
#define bli_zset0s( a )  bli_zsets( 0.0 , 0.0 , (a) )

#endif
// end bli_set0s.h
// begin bli_set1s.h

#ifndef BLIS_SET1S_H
#define BLIS_SET1S_H

// set1s: same shape as set0s, but the first literal forwarded to bli_?sets is
// one (1.0F / 1.0) and the second is zero.

#define bli_sset1s( a )  bli_ssets( 1.0F, 0.0F, (a) )
#define bli_dset1s( a )  bli_dsets( 1.0 , 0.0 , (a) )
#define bli_cset1s( a )  bli_csets( 1.0F, 0.0F, (a) )
#define bli_zset1s( a )  bli_zsets( 1.0 , 0.0 , (a) )
#endif
// end bli_set1s.h
// begin bli_seti0s.h

#ifndef BLIS_SETI0S_H
#define BLIS_SETI0S_H

// seti0s: forward a single zero literal (0.0F for s/c, 0.0 for d/z) and the
// destination a to bli_?setis.
// NOTE(review): presumably this zeroes only the imaginary part of a;
// bli_?setis is not visible in this section, so confirm against its
// definition.

#define bli_sseti0s( a )  bli_ssetis( 0.0F, (a) )
#define bli_dseti0s( a )  bli_dsetis( 0.0 , (a) )
#define bli_cseti0s( a )  bli_csetis( 0.0F, (a) )
#define bli_zseti0s( a )  bli_zsetis( 0.0 , (a) )

#endif
// end bli_seti0s.h
// begin bli_sqrt2s.h

#ifndef BLIS_SQRT2S_H
#define BLIS_SQRT2S_H

// sqrt2s

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of a.

// Unlike the neighbouring headers, the whole table (real destinations
// included) is switched on BLIS_ENABLE_C99_COMPLEX.

#ifndef BLIS_ENABLE_C99_COMPLEX

// Non-C99 branch: forward the real/imaginary parts of x and then of a to a
// bli_*sqrt2ris helper (not defined in this section). The helper is selected
// by the destination type, and for complex destinations also by whether the
// source is real:
//   a is s                 ->  bli_ssqrt2ris
//   a is d                 ->  bli_dsqrt2ris
//   a is c, x real (s,d)   ->  bli_scsqrt2ris
//   a is c, x complex      ->  bli_csqrt2ris
//   a is z, x real (s,d)   ->  bli_dzsqrt2ris
//   a is z, x complex      ->  bli_zsqrt2ris

#define bli_sssqrt2s( x, a )  bli_ssqrt2ris( bli_sreal(x), bli_simag(x), bli_sreal(a), bli_simag(a) )
#define bli_dssqrt2s( x, a )  bli_ssqrt2ris( bli_dreal(x), bli_dimag(x), bli_sreal(a), bli_simag(a) )
#define bli_cssqrt2s( x, a )  bli_ssqrt2ris( bli_creal(x), bli_cimag(x), bli_sreal(a), bli_simag(a) )
#define bli_zssqrt2s( x, a )  bli_ssqrt2ris( bli_zreal(x), bli_zimag(x), bli_sreal(a), bli_simag(a) )

#define bli_sdsqrt2s( x, a )  bli_dsqrt2ris( bli_sreal(x), bli_simag(x), bli_dreal(a), bli_dimag(a) )
#define bli_ddsqrt2s( x, a )  bli_dsqrt2ris( bli_dreal(x), bli_dimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_cdsqrt2s( x, a )  bli_dsqrt2ris( bli_creal(x), bli_cimag(x), bli_dreal(a), bli_dimag(a) )
#define bli_zdsqrt2s( x, a )  bli_dsqrt2ris( bli_zreal(x), bli_zimag(x), bli_dreal(a), bli_dimag(a) )

#define bli_scsqrt2s( x, a )  bli_scsqrt2ris( bli_sreal(x), bli_simag(x), bli_creal(a), bli_cimag(a) )
#define bli_dcsqrt2s( x, a )  bli_scsqrt2ris( bli_dreal(x), bli_dimag(x), bli_creal(a), bli_cimag(a) )
#define bli_ccsqrt2s( x, a )  bli_csqrt2ris( bli_creal(x), bli_cimag(x), bli_creal(a), bli_cimag(a) )
#define bli_zcsqrt2s( x, a )  bli_csqrt2ris( bli_zreal(x), bli_zimag(x), bli_creal(a), bli_cimag(a) )

#define bli_szsqrt2s( x, a )  bli_dzsqrt2ris( bli_sreal(x), bli_simag(x), bli_zreal(a), bli_zimag(a) )
#define bli_dzsqrt2s( x, a )  bli_dzsqrt2ris( bli_dreal(x), bli_dimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_czsqrt2s( x, a )  bli_zsqrt2ris( bli_creal(x), bli_cimag(x), bli_zreal(a), bli_zimag(a) )
#define bli_zzsqrt2s( x, a )  bli_zsqrt2ris( bli_zreal(x), bli_zimag(x), bli_zreal(a), bli_zimag(a) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// C99 branch: call the square-root function matching the type of x
// (sqrtf for s, sqrt for d, csqrtf for c, csqrt for z) and cast the result to
// the type of a (float, double, scomplex, dcomplex). When x is complex and a
// is real, only the real part of the complex root is kept, extracted with
// bli_creal / bli_zreal before the cast.

#define bli_sssqrt2s( x, a )  { (a) = ( float ) sqrtf( (x) ) ; }
#define bli_dssqrt2s( x, a )  { (a) = ( float ) sqrt ( (x) ) ; }
#define bli_cssqrt2s( x, a )  { (a) = ( float )bli_creal( csqrtf( (x) ) ); }
#define bli_zssqrt2s( x, a )  { (a) = ( float )bli_zreal( csqrt ( (x) ) ); }

#define bli_sdsqrt2s( x, a )  { (a) = ( double ) sqrtf( (x) ) ; }
#define bli_ddsqrt2s( x, a )  { (a) = ( double ) sqrt ( (x) ) ; }
#define bli_cdsqrt2s( x, a )  { (a) = ( double )bli_creal( csqrtf( (x) ) ); }
#define bli_zdsqrt2s( x, a )  { (a) = ( double )bli_zreal( csqrt ( (x) ) ); }

#define bli_scsqrt2s( x, a )  { (a) = ( scomplex ) sqrtf( (x) ) ; }
#define bli_dcsqrt2s( x, a )  { (a) = ( scomplex ) sqrt ( (x) ) ; }
#define bli_ccsqrt2s( x, a )  { (a) = ( scomplex ) csqrtf( (x) ) ; }
#define bli_zcsqrt2s( x, a )  { (a) = ( scomplex ) csqrt ( (x) ) ; }

#define bli_szsqrt2s( x, a )  { (a) = ( dcomplex ) sqrtf( (x) ) ; }
#define bli_dzsqrt2s( x, a )  { (a) = ( dcomplex ) sqrt ( (x) ) ; }
#define bli_czsqrt2s( x, a )  { (a) = ( dcomplex ) csqrtf( (x) ) ; }
#define bli_zzsqrt2s( x, a )  { (a) = ( dcomplex ) csqrt ( (x) ) ; }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: a single type character selects the variant in which
// x and a share that type.

#define bli_ssqrt2s( x, a )  bli_sssqrt2s( x, a )
#define bli_dsqrt2s( x, a )  bli_ddsqrt2s( x, a )
#define bli_csqrt2s( x, a )  bli_ccsqrt2s( x, a )
#define bli_zsqrt2s( x, a )  bli_zzsqrt2s( x, a )

#endif
// end bli_sqrt2s.h
// begin bli_subs.h

#ifndef BLIS_SUBS_H
#define BLIS_SUBS_H

// subs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.
// Real destination y (s or d): always routed through the real/imaginary
// helper named after the destination type (bli_ssubris / bli_dsubris),
// whatever the type of a. These definitions are not conditional on
// BLIS_ENABLE_C99_COMPLEX.

#define bli_sssubs( a, y )  bli_ssubris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dssubs( a, y )  bli_ssubris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_cssubs( a, y )  bli_ssubris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zssubs( a, y )  bli_ssubris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdsubs( a, y )  bli_dsubris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddsubs( a, y )  bli_dsubris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdsubs( a, y )  bli_dsubris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdsubs( a, y )  bli_dsubris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Complex destination y, non-C99 branch: same scheme, using bli_csubris /
// bli_zsubris.

#define bli_scsubs( a, y )  bli_csubris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcsubs( a, y )  bli_csubris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccsubs( a, y )  bli_csubris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcsubs( a, y )  bli_csubris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szsubs( a, y )  bli_zsubris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzsubs( a, y )  bli_zsubris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czsubs( a, y )  bli_zsubris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzsubs( a, y )  bli_zsubris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// Complex destination y, C99 branch: a compound subtraction inside a braced
// block; identical text for every type of a.

#define bli_scsubs( a, y )  { (y) -= (a); }
#define bli_dcsubs( a, y )  { (y) -= (a); }
#define bli_ccsubs( a, y )  { (y) -= (a); }
#define bli_zcsubs( a, y )  { (y) -= (a); }

#define bli_szsubs( a, y )  { (y) -= (a); }
#define bli_dzsubs( a, y )  { (y) -= (a); }
#define bli_czsubs( a, y )  { (y) -= (a); }
#define bli_zzsubs( a, y )  { (y) -= (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: a single type character selects the variant in which
// a and y share that type.

#define bli_ssubs( a, y )  bli_sssubs( a, y )
#define bli_dsubs( a, y )  bli_ddsubs( a, y )
#define bli_csubs( a, y )  bli_ccsubs( a, y )
#define bli_zsubs( a, y )  bli_zzsubs( a, y )

#endif
// end bli_subs.h
// begin bli_subjs.h

#ifndef BLIS_SUBJS_H
#define BLIS_SUBJS_H

// subjs

// Notes:
// - The first char encodes the type of a.
// - The second char encodes the type of y.

// Mirrors the subs table, but dispatches to the bli_?subjris helpers. In the
// C99 branch further down, the only difference from subs is that a complex a
// is wrapped in conjf()/conj() before being subtracted.

#define bli_sssubjs( a, y )  bli_ssubjris( bli_sreal(a), bli_simag(a), bli_sreal(y), bli_simag(y) )
#define bli_dssubjs( a, y )  bli_ssubjris( bli_dreal(a), bli_dimag(a), bli_sreal(y), bli_simag(y) )
#define bli_cssubjs( a, y )  bli_ssubjris( bli_creal(a), bli_cimag(a), bli_sreal(y), bli_simag(y) )
#define bli_zssubjs( a, y )  bli_ssubjris( bli_zreal(a), bli_zimag(a), bli_sreal(y), bli_simag(y) )

#define bli_sdsubjs( a, y )  bli_dsubjris( bli_sreal(a), bli_simag(a), bli_dreal(y), bli_dimag(y) )
#define bli_ddsubjs( a, y )  bli_dsubjris( bli_dreal(a), bli_dimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_cdsubjs( a, y )  bli_dsubjris( bli_creal(a), bli_cimag(a), bli_dreal(y), bli_dimag(y) )
#define bli_zdsubjs( a, y )  bli_dsubjris( bli_zreal(a), bli_zimag(a), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

#define bli_scsubjs( a, y )  bli_csubjris( bli_sreal(a), bli_simag(a), bli_creal(y), bli_cimag(y) )
#define bli_dcsubjs( a, y )  bli_csubjris( bli_dreal(a), bli_dimag(a), bli_creal(y), bli_cimag(y) )
#define bli_ccsubjs( a, y )  bli_csubjris( bli_creal(a), bli_cimag(a), bli_creal(y), bli_cimag(y) )
#define bli_zcsubjs( a, y )  bli_csubjris( bli_zreal(a), bli_zimag(a), bli_creal(y), bli_cimag(y) )

#define bli_szsubjs( a, y )  bli_zsubjris( bli_sreal(a), bli_simag(a), bli_zreal(y), bli_zimag(y) )
#define bli_dzsubjs( a, y )  bli_zsubjris( bli_dreal(a), bli_dimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_czsubjs( a, y )  bli_zsubjris( bli_creal(a), bli_cimag(a), bli_zreal(y), bli_zimag(y) )
#define bli_zzsubjs( a, y )  bli_zsubjris( bli_zreal(a), bli_zimag(a), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// A real a (s,d) is subtracted as-is; an scomplex a uses conjf(), a dcomplex a
// uses conj().

#define bli_scsubjs( a, y )  { (y) -=      (a); }
#define bli_dcsubjs( a, y )  { (y) -=      (a); }
#define bli_ccsubjs( a, y )  { (y) -= conjf(a); }
#define bli_zcsubjs( a, y )  { (y) -= conj (a); }

#define bli_szsubjs( a, y )  { (y) -=      (a); }
#define bli_dzsubjs( a, y )  { (y) -=      (a); }
#define bli_czsubjs( a, y )  { (y) -= conjf(a); }
#define bli_zzsubjs( a, y )  { (y) -= conj (a); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: a single type character selects the variant in which
// a and y share that type.

#define bli_ssubjs( a, y )  bli_sssubjs( a, y )
#define bli_dsubjs( a, y )  bli_ddsubjs( a, y )
#define bli_csubjs( a, y )  bli_ccsubjs( a, y )
#define bli_zsubjs( a, y )  bli_zzsubjs( a, y )

#endif
// end bli_subjs.h
// begin bli_swaps.h

#ifndef BLIS_SWAPS_H
#define BLIS_SWAPS_H

// swaps

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of y.

// Each macro exchanges x and y in three steps through a temporary w whose C
// type matches the type char of x (float, double, scomplex or dcomplex):
//   1. y -> w   2. x -> y   3. w -> x
// Every step goes through the bli_??copys macro whose two type characters
// match the source and destination of that step, so mixed-type pairs are
// handled by the copy macros rather than by plain assignment.
//
// NOTE: the expansion is a bare braced block that declares a local literally
// named w. An argument expression that itself mentions an identifier w would
// therefore bind to the macro's temporary instead of the caller's variable.

#define bli_ssswaps( x, y ) \
{ \
	float    w; \
	bli_sscopys( (y), (w) ); \
	bli_sscopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_dsswaps( x, y ) \
{ \
	double   w; \
	bli_sdcopys( (y), (w) ); \
	bli_dscopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_csswaps( x, y ) \
{ \
	scomplex w; \
	bli_sccopys( (y), (w) ); \
	bli_cscopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zsswaps( x, y ) \
{ \
	dcomplex w; \
	bli_szcopys( (y), (w) ); \
	bli_zscopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

#define bli_sdswaps( x, y ) \
{ \
	float    w; \
	bli_dscopys( (y), (w) ); \
	bli_sdcopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_ddswaps( x, y ) \
{ \
	double   w; \
	bli_ddcopys( (y), (w) ); \
	bli_ddcopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_cdswaps( x, y ) \
{ \
	scomplex w; \
	bli_dccopys( (y), (w) ); \
	bli_cdcopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zdswaps( x, y ) \
{ \
	dcomplex w; \
	bli_dzcopys( (y), (w) ); \
	bli_zdcopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

#define bli_scswaps( x, y ) \
{ \
	float    w; \
	bli_cscopys( (y), (w) ); \
	bli_sccopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_dcswaps( x, y ) \
{ \
	double   w; \
	bli_cdcopys( (y), (w) ); \
	bli_dccopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_ccswaps( x, y ) \
{ \
	scomplex w; \
	bli_cccopys( (y), (w) ); \
	bli_cccopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zcswaps( x, y ) \
{ \
	dcomplex w; \
	bli_czcopys( (y), (w) ); \
	bli_zccopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

#define bli_szswaps( x, y ) \
{ \
	float    w; \
	bli_zscopys( (y), (w) ); \
	bli_szcopys( (x), (y) ); \
	bli_sscopys( (w), (x) ); \
}
#define bli_dzswaps( x, y ) \
{ \
	double   w; \
	bli_zdcopys( (y), (w) ); \
	bli_dzcopys( (x), (y) ); \
	bli_ddcopys( (w), (x) ); \
}
#define bli_czswaps( x, y ) \
{ \
	scomplex w; \
	bli_zccopys( (y), (w) ); \
	bli_czcopys( (x), (y) ); \
	bli_cccopys( (w), (x) ); \
}
#define bli_zzswaps( x, y ) \
{ \
	dcomplex w; \
	bli_zzcopys( (y), (w) ); \
	bli_zzcopys( (x), (y) ); \
	bli_zzcopys( (w), (x) ); \
}

// Same-type shorthands: a single type character selects the variant in which
// x and y share that type.

#define bli_sswaps( x, y )  bli_ssswaps( x, y )
#define bli_dswaps( x, y )  bli_ddswaps( x, y )
#define bli_cswaps( x, y )  bli_ccswaps( x, y )
#define bli_zswaps( x, y )  bli_zzswaps( x, y )

#endif
// end bli_swaps.h
// begin bli_xpbys.h

#ifndef BLIS_XPBYS_H
#define BLIS_XPBYS_H

// xpbys

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// -- (xby) = (??s) ------------------------------------------------------------

// Every macro in this table splits x, b and y into real and imaginary parts
// with the accessor pair matching each operand's type character
// (bli_?real / bli_?imag) and forwards the six parts, in the order x, b, y,
// to a helper that is not defined in this section.
// For a real destination y (type chars s and d) that helper is always
// bli_rxxpbyris, regardless of whether x or b is complex.
// NOTE(review): the name suggests y := x + b * y, but the arithmetic lives in
// bli_rxxpbyris; confirm against its definition.

#define bli_sssxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_dssxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_cssxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )
#define bli_zssxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) )

#define bli_sdsxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ddsxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_cdsxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zdsxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) )

#define bli_scsxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dcsxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_ccsxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zcsxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) )

#define bli_szsxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_dzsxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_czsxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )
#define bli_zzsxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) )

// -- (xby) = (??d) ------------------------------------------------------------

// Same as the (??s) table with the destination accessors switched to
// bli_dreal / bli_dimag. The last three entries of this table (dzd, czd, zzd)
// follow directly after this group.

#define bli_ssdxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dsdxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_csdxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zsdxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) )

#define bli_sddxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dddxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_cddxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zddxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_scdxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_dcdxpbys( x, b, y )  bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_ccdxpbys( x, b, y )  bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zcdxpbys( x, b, y )  bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) )

#define bli_szdxpbys( x, b, y )  bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
// Tail of the (xby) = (??d) group. y is real (double) here, so every
// combination of x/b types goes through the same bli_rxxpbyris kernel; only
// the real/imag accessor macros applied to x and b change with the type chars.
#define bli_dzdxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_czdxpbys( x, b, y ) bli_rxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )
#define bli_zzdxpbys( x, b, y ) bli_rxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) )

#ifndef BLIS_ENABLE_C99_COMPLEX

// Without C99 complex support each macro splits its operands into real and
// imaginary parts and forwards them to a "ris" kernel defined elsewhere.
// Kernel selection for complex y, as laid out in the tables below:
//   - x real,    b real    -> bli_rxxpbyris
//   - x complex, b real    -> bli_crxpbyris
//   - b complex (any x)    -> bli_cxxpbyris
// NOTE(review): the arithmetic performed by those kernels is not visible in
// this section; presumably it matches the C99 form y = x + b*y used below.

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_dscxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_cscxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )
#define bli_zscxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) )

#define bli_sdcxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_ddcxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cdcxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zdcxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) )

#define bli_sccxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dccxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_cccxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zccxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) )

#define bli_szcxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_dzcxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_czcxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )
#define bli_zzcxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) )

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dszxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cszxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zszxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sdzxpbys( x, b, y ) bli_rxxpbyris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_ddzxpbys( x, b, y ) bli_rxxpbyris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cdzxpbys( x, b, y ) bli_crxpbyris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zdzxpbys( x, b, y ) bli_crxpbyris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_sczxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dczxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_cczxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zczxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) )

#define bli_szzxpbys( x, b, y ) bli_cxxpbyris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_dzzxpbys( x, b, y ) bli_cxxpbyris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_czzxpbys( x, b, y ) bli_cxxpbyris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )
#define bli_zzzxpbys( x, b, y ) bli_cxxpbyris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) )

#else // ifdef BLIS_ENABLE_C99_COMPLEX

// With C99 complex support the operands are native arithmetic types, so every
// type combination reduces to the same expression: y = x + b * y. Each macro
// expands to a brace-enclosed statement and evaluates y twice.

// -- (xby) = (??c) ------------------------------------------------------------

#define bli_sscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zscxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zccxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_szcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzcxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

// -- (xby) = (??z) ------------------------------------------------------------

#define bli_sszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zszxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_ddzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zdzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_sczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_cczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zczxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#define bli_szzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_dzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_czzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }
#define bli_zzzxpbys( x, b, y ) { (y) = (x) + (b) * (y); }

#endif // BLIS_ENABLE_C99_COMPLEX

// Same-type shorthands: a single type char means x, b and y all share it.
#define bli_sxpbys( x, b, y ) bli_sssxpbys( x, b, y )
#define bli_dxpbys( x, b, y ) bli_dddxpbys( x, b, y )
#define bli_cxpbys( x, b, y ) bli_cccxpbys( x, b, y )
#define bli_zxpbys( x, b, y ) bli_zzzxpbys( x, b, y )

#endif
// end bli_xpbys.h
// begin bli_xpbyjs.h


#ifndef BLIS_XPBYJS_H
#define BLIS_XPBYJS_H

// xpbyjs

// Notes:
// - The first char encodes the type of x.
// - The second char encodes the type of b.
// - The third char encodes the type of y.
// -- (xby) = (??s) ------------------------------------------------------------ #define bli_sssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_dssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_cssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_zssxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_sreal(y), bli_simag(y) ) #define bli_sdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ddsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_cdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zdsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_sreal(y), bli_simag(y) ) #define bli_scsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_ccsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zcsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_sreal(y), bli_simag(y) ) #define bli_szsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_dzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_czsxpbyjs( x, b, y ) 
bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) #define bli_zzsxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_sreal(y), bli_simag(y) ) // -- (xby) = (??d) ------------------------------------------------------------ #define bli_ssdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_csdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zsdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_dreal(y), bli_dimag(y) ) #define bli_sddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_cddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zddxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_scdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_ccdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zcdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_szdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), 
bli_simag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_dzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_czdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #define bli_zzdxpbyjs( x, b, y ) bli_rxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_dreal(y), bli_dimag(y) ) #ifndef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_dscxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_cscxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_zscxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_creal(y), bli_cimag(y) ) #define bli_sdcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_ddcxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zdcxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_creal(y), bli_cimag(y) ) #define bli_sccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_cccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), 
bli_cimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zccxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_creal(y), bli_cimag(y) ) #define bli_szcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_dzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_czcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) #define bli_zzcxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_creal(y), bli_cimag(y) ) // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dszxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cszxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zszxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_sreal(b), bli_simag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sdzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_sreal(x), bli_simag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_ddzxpbyjs( x, b, y ) bli_rxxpbyjris( bli_dreal(x), bli_dimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_creal(x), bli_cimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zdzxpbyjs( x, b, y ) bli_crxpbyjris( bli_zreal(x), bli_zimag(x), bli_dreal(b), bli_dimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_sczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_creal(b), bli_cimag(b), 
bli_zreal(y), bli_zimag(y) ) #define bli_dczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_cczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zczxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_creal(b), bli_cimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_szzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_sreal(x), bli_simag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_dzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_dreal(x), bli_dimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_czzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_creal(x), bli_cimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #define bli_zzzxpbyjs( x, b, y ) bli_cxxpbyjris( bli_zreal(x), bli_zimag(x), bli_zreal(b), bli_zimag(b), bli_zreal(y), bli_zimag(y) ) #else // ifdef BLIS_ENABLE_C99_COMPLEX // -- (xby) = (??c) ------------------------------------------------------------ #define bli_sscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zscxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zccxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define 
bli_zzcxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } // -- (xby) = (??z) ------------------------------------------------------------ #define bli_sszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zszxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_ddzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zdzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_sczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_cczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zczxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_szzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_dzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_czzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #define bli_zzzxpbyjs( x, b, y ) { (y) = (x) + (b) * (y); } #endif // BLIS_ENABLE_C99_COMPLEX #define bli_sxpbyjs( x, b, y ) bli_sssxpbyjs( x, b, y ) #define bli_dxpbyjs( x, b, y ) bli_dddxpbyjs( x, b, y ) #define bli_cxpbyjs( x, b, y ) bli_cccxpbyjs( x, b, y ) #define bli_zxpbyjs( x, b, y ) bli_zzzxpbyjs( x, b, y ) #endif // end bli_xpbyjs.h // Inlined scalar macros in loops // begin bli_adds_mxn.h #ifndef BLIS_ADDS_MXN_H #define BLIS_ADDS_MXN_H // adds_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_ssadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ssadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ssadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dsadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_csadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_csadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_csadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) 
bli_csadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zsadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zsadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zsadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for 
( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_scadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) 
bli_scadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_scadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dcadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ccadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ccadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ccadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zcadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for 
( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zcadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zcadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?z BLIS_INLINE void bli_szadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii + 
jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzadds( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzadds( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sadds_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ssadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dadds_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_cadds_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ccadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zadds_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzadds_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end 
bli_adds_mxn.h // begin bli_adds_mxn_uplo.h #ifndef BLIS_ADDS_MXN_UPLO_H #define BLIS_ADDS_MXN_UPLO_H // adds_mxn_u #define bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } // adds_mxn_l #define bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ssadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ 
dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ccadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ for ( _j = 0; _j < n; ++_j ) \ { \ for ( _i = 0; _i < m; ++_i ) \ { \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzadds( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } \ } #define bli_sadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_u( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_sadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ssadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_dadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ddadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_cadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_ccadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #define bli_zadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ) \ { \ bli_zzadds_mxn_l( diagoff, m, n, x, rs_x, cs_x, y, rs_y, cs_y ); \ } #endif // end bli_adds_mxn_uplo.h // begin bli_set0s_mxn.h #ifndef BLIS_SET0S_MXN_H #define BLIS_SET0S_MXN_H // set0s_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
BLIS_INLINE void bli_sset0s_mxn( const dim_t m, const dim_t n, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_sset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_dset0s_mxn( const dim_t m, const dim_t n, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_dset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_cset0s_mxn( const dim_t m, const dim_t n, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_cset0s( *(y + i*rs_y + j*cs_y) ); } BLIS_INLINE void bli_zset0s_mxn( const dim_t m, const dim_t n, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { for ( dim_t j = 0; j < n; ++j ) for ( dim_t i = 0; i < m; ++i ) bli_zset0s( *(y + i*rs_y + j*cs_y) ); } #endif // end bli_set0s_mxn.h // begin bli_copys_mxn.h #ifndef BLIS_COPYS_MXN_H #define BLIS_COPYS_MXN_H // copys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
// xy = ?s BLIS_INLINE void bli_sscopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dscopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cscopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; 
++ii ) bli_cscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zscopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zscopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zscopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?d BLIS_INLINE void bli_sdcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_ddcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_ddcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); 
} else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_ddcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cdcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zdcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zdcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zdcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_sccopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( 
dim_t jj = 0; jj < n; ++jj ) bli_sccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } 
else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } // xy = ?c BLIS_INLINE void bli_szcopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czcopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( 
dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii + jj*cs_x), *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzcopys( *(x + ii*rs_x + jj), *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzcopys( *(x + ii*rs_x + jj*cs_x), *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_scopys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_dcopys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_ccopys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } BLIS_INLINE void bli_zcopys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); } #endif // end bli_copys_mxn.h // begin bli_scal2s_mxn.h #ifndef BLIS_SCAL2S_MXN_H #define BLIS_SCAL2S_MXN_H // scal2s_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t rs_x, const inc_t cs_x, \ ctype* restrict y, const inc_t rs_y, const inc_t cs_y \ ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*cs_x; \ ctype* restrict yj = y + j*cs_y; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*rs_x; \ ctype* restrict yij = yj + i*rs_y; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( scal2s_mxn ) #endif // end bli_scal2s_mxn.h // begin bli_xpbys_mxn.h #ifndef BLIS_XPBYS_MXN_H #define BLIS_XPBYS_MXN_H // xpbys_mxn // Notes: // - The first char encodes the type of x. // - The second char encodes the type of b. // - The third char encodes the type of y. // -- (xby) = (?ss) ------------------------------------------------------------ BLIS_INLINE void bli_sssxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_sscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dssxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_dscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cssxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_seq0( *beta ) ) { bli_cscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zssxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_seq0( *beta ) ) { bli_zscopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zssxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zssxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?dd) ------------------------------------------------------------ BLIS_INLINE void bli_sddxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_sdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dddxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_ddcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cddxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_deq0( *beta ) ) { bli_cdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zddxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_deq0( *beta ) ) { bli_zdcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zddxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zddxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?cc) ------------------------------------------------------------ BLIS_INLINE void bli_sccxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_sccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_sccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_sccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dccxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_dccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_cccxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_ceq0( *beta ) ) { bli_cccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_cccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_cccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zccxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_ceq0( *beta ) ) { bli_zccopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zccxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zccxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } // -- (xby) = (?zz) ------------------------------------------------------------ BLIS_INLINE void bli_szzxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_szcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_szzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_szzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_dzzxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_dzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_dzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_dzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_czzxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). 
if ( bli_zeq0( *beta ) ) { bli_czcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_czzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_czzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_zzzxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { // If beta is zero, overwrite y with x (in case y has infs or NaNs). if ( bli_zeq0( *beta ) ) { bli_zzcopys_mxn( m, n, x, rs_x, cs_x, y, rs_y, cs_y ); return; } #ifdef BLIS_ENABLE_CR_CASES if ( rs_x == 1 && rs_y == 1 ) { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii + jj*cs_x), *beta, *(y + ii + jj*cs_y) ); } else if ( cs_x == 1 && cs_y == 1 ) { for ( dim_t ii = 0; ii < m; ++ii ) for ( dim_t jj = 0; jj < n; ++jj ) bli_zzzxpbys( *(x + ii*rs_x + jj), *beta, *(y + ii*rs_y + jj) ); } else #endif { for ( dim_t jj = 0; jj < n; ++jj ) for ( dim_t ii = 0; ii < m; ++ii ) bli_zzzxpbys( *(x + ii*rs_x + jj*cs_x), *beta, *(y + ii*rs_y + jj*cs_y) ); } } BLIS_INLINE void bli_sxpbys_mxn( const dim_t m, const dim_t n, float* restrict x, const inc_t rs_x, const inc_t cs_x, float* restrict beta, float* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_sssxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_dxpbys_mxn( const dim_t m, const dim_t n, double* restrict x, const inc_t rs_x, const inc_t cs_x, double* restrict beta, double* restrict y, const inc_t rs_y, const inc_t cs_y ) { 
bli_dddxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_cxpbys_mxn( const dim_t m, const dim_t n, scomplex* restrict x, const inc_t rs_x, const inc_t cs_x, scomplex* restrict beta, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_cccxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } BLIS_INLINE void bli_zxpbys_mxn( const dim_t m, const dim_t n, dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x, dcomplex* restrict beta, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y ) { bli_zzzxpbys_mxn( m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); } #endif // end bli_xpbys_mxn.h // begin bli_xpbys_mxn_uplo.h #ifndef BLIS_XPBYS_MXN_UPLO_H #define BLIS_XPBYS_MXN_UPLO_H // xpbys_mxn_u #define bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 
0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i >= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } // xpbys_mxn_l #define bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_seq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sscopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_sssxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_dddxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_deq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_ddcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_dddxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } 
#define bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_ceq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_cccxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ { \ dim_t _i, _j; \ \ \ if ( bli_zeq0( *beta ) ) \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzcopys( *(x + _i*rs_x + _j*cs_x), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ else \ { \ for ( _j = 0; _j < n; ++_j ) \ for ( _i = 0; _i < m; ++_i ) \ if ( (doff_t)_j - (doff_t)_i <= diagoff ) \ { \ bli_zzzxpbys( *(x + _i*rs_x + _j*cs_x), \ *(beta), \ *(y + _i*rs_y + _j*cs_y) ); \ } \ } \ } #define bli_sxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_u( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_sxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_sssxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_dxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_dddxpbys_mxn_l( diagoff, m, n, 
x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_cxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_cccxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #define bli_zxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ) \ {\ bli_zzzxpbys_mxn_l( diagoff, m, n, x, rs_x, cs_x, beta, y, rs_y, cs_y ); \ } #endif // end bli_xpbys_mxn_uplo.h // -- "broadcast B" scalar macros -- // begin bli_bcastbbs_mxn.h #ifndef BLIS_BCASTBBS_MXN_H #define BLIS_BCASTBBS_MXN_H // bcastbbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = ldy; \ const dim_t ds_y = 1; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yi = y + i*incy; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yij = yi + j*ldy; \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( bcastbbs_mxn ) #endif // end bli_bcastbbs_mxn.h // begin bli_scal2bbs_mxn.h #ifndef BLIS_SCAL2BBS_MXN_H #define BLIS_SCAL2BBS_MXN_H // scal2bbs_mxn #undef GENTFUNCRO #define GENTFUNCRO( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2js)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( 
*yij, *yijd ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict xj = x + j*ldx; \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict xij = xj + i*incx; \ ctype* restrict yij = yj + i*incy; \ \ PASTEMAC(ch,scal2s)( *alpha, *xij, *yij ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,copys)( *yij, *yijd ); \ } \ } \ } \ } \ } INSERT_GENTFUNCRO_BASIC0( scal2bbs_mxn ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const conj_t conjx, \ const dim_t m, \ const dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, const inc_t incx, const inc_t ldx, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ const inc_t incx2 = 2 * incx; \ const inc_t ldx2 = 2 * ldx; \ \ const inc_t incy2 = 2 * incy; \ const inc_t ldy2 = 2 * ldy; \ \ ctype_r* restrict alpha_r = ( ctype_r* )alpha; \ ctype_r* restrict alpha_i = ( ctype_r* )alpha + 1; \ ctype_r* restrict chi_r = ( ctype_r* )x; \ ctype_r* restrict chi_i = ( ctype_r* )x + 1; \ ctype_r* restrict psi_r = ( ctype_r* )y; \ ctype_r* restrict psi_i = ( ctype_r* )y + 1*d; \ \ if ( bli_is_conj( conjx ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2jris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ 
PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype_r* restrict chij_r = chi_r + j*ldx2; \ ctype_r* restrict chij_i = chi_i + j*ldx2; \ ctype_r* restrict psij_r = psi_r + j*ldy2; \ ctype_r* restrict psij_i = psi_i + j*ldy2; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype_r* restrict chiij_r = chij_r + i*incx2; \ ctype_r* restrict chiij_i = chij_i + i*incx2; \ ctype_r* restrict psiij_r = psij_r + i*incy2; \ ctype_r* restrict psiij_i = psij_i + i*incy2; \ \ PASTEMAC(ch,scal2ris)( *alpha_r, *alpha_i, \ *chiij_r, *chiij_i, \ *psiij_r, *psiij_i ); \ \ for ( dim_t p = 1; p < d; ++p ) \ { \ ctype_r* restrict psiijd_r = psiij_r + p*ds_y; \ ctype_r* restrict psiijd_i = psiij_i + p*ds_y; \ \ PASTEMAC(ch,copyris)( *psiij_r, *psiij_i, \ *psiijd_r, *psiijd_i ); \ } \ } \ } \ } \ } INSERT_GENTFUNCCO_BASIC0( scal2bbs_mxn ) #endif // end bli_scal2bbs_mxn.h // begin bli_set0bbs_mxn.h #ifndef BLIS_SET0BBS_MXN_H #define BLIS_SET0BBS_MXN_H // set0bbs_mxn #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ BLIS_INLINE void PASTEMAC(ch,opname) \ ( \ const dim_t m, \ const dim_t n, \ ctype* restrict y, const inc_t incy, const inc_t ldy \ ) \ { \ \ const dim_t d = incy; \ const dim_t ds_y = 1; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict yj = y + j*ldy; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict yij = yj + i*incy; \ \ for ( dim_t p = 0; p < d; ++p ) \ { \ ctype* restrict yijd = yij + p*ds_y; \ \ PASTEMAC(ch,set0s)( *yijd ); \ } \ } \ } \ } INSERT_GENTFUNC_BASIC0( set0bbs_mxn ) #endif // end bli_set0bbs_mxn.h // -- 1m-specific scalar macros -- // 1e // begin bli_copy1es.h #ifndef BLIS_COPY1ES_H #define BLIS_COPY1ES_H // copy1es // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopy1es( a, bri, bir ) {} #define bli_dscopy1es( a, bri, bir ) {} #define bli_cscopy1es( a, bri, bir ) {} #define bli_zscopy1es( a, bri, bir ) {} #define bli_sdcopy1es( a, bri, bir ) {} #define bli_ddcopy1es( a, bri, bir ) {} #define bli_cdcopy1es( a, bri, bir ) {} #define bli_zdcopy1es( a, bri, bir ) {} #define bli_sccopy1es( a, bri, bir ) {} #define bli_dccopy1es( a, bri, bir ) {} #define bli_cccopy1es( a, bri, bir ) \ { \ bli_cccopyris( bli_creal(a), bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_cccopyris( -bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_zccopy1es( a, bri, bir ) \ { \ bli_zccopyris( bli_zreal(a), bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_zccopyris( -bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_szcopy1es( a, bri, bir ) {} #define bli_dzcopy1es( a, bri, bir ) {} #define bli_czcopy1es( a, bri, bir ) \ { \ bli_czcopyris( bli_creal(a), bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_czcopyris( -bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_zzcopy1es( a, bri, bir ) \ { \ bli_zzcopyris( bli_zreal(a), bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_zzcopyris( -bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_ccopy1es( a, bri, bir ) bli_cccopy1es( a, bri, bir ) #define bli_zcopy1es( a, bri, bir ) bli_zzcopy1es( a, bri, bir ) #endif // end bli_copy1es.h // begin bli_copyj1es.h #ifndef BLIS_COPYJ1ES_H #define BLIS_COPYJ1ES_H // copyj1es // Notes: // - The first char encodes the type of x. // - The second char encodes the type of y. 
#define bli_sscopyj1es( a, bri, bir ) {} #define bli_dscopyj1es( a, bri, bir ) {} #define bli_cscopyj1es( a, bri, bir ) {} #define bli_zscopyj1es( a, bri, bir ) {} #define bli_sdcopyj1es( a, bri, bir ) {} #define bli_ddcopyj1es( a, bri, bir ) {} #define bli_cdcopyj1es( a, bri, bir ) {} #define bli_zdcopyj1es( a, bri, bir ) {} #define bli_sccopyj1es( a, bri, bir ) {} #define bli_dccopyj1es( a, bri, bir ) {} #define bli_cccopyj1es( a, bri, bir ) \ { \ bli_cccopyris( bli_creal(a), -bli_cimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_cccopyris( bli_cimag(a), bli_creal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_zccopyj1es( a, bri, bir ) \ { \ bli_zccopyris( bli_zreal(a), -bli_zimag(a), bli_creal(bri), bli_cimag(bri) ); \ bli_zccopyris( bli_zimag(a), bli_zreal(a), bli_creal(bir), bli_cimag(bir) ); \ } #define bli_szcopyj1es( a, bri, bir ) {} #define bli_dzcopyj1es( a, bri, bir ) {} #define bli_czcopyj1es( a, bri, bir ) \ { \ bli_czcopyris( bli_creal(a), -bli_cimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_czcopyris( bli_cimag(a), bli_creal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_zzcopyj1es( a, bri, bir ) \ { \ bli_zzcopyris( bli_zreal(a), -bli_zimag(a), bli_zreal(bri), bli_zimag(bri) ); \ bli_zzcopyris( bli_zimag(a), bli_zreal(a), bli_zreal(bir), bli_zimag(bir) ); \ } #define bli_ccopyj1es( a, bri, bir ) bli_cccopyj1es( a, bri, bir ) #define bli_zcopyj1es( a, bri, bir ) bli_zzcopyj1es( a, bri, bir ) #endif // end bli_copyj1es.h // begin bli_invert1es.h #ifndef BLIS_INVERT1ES_H #define BLIS_INVERT1ES_H // invert1es #define bli_cinvert1es( bri, bir ) \ { \ bli_cinvertris( bli_creal(bri), bli_cimag(bri) ); \ bli_ccopyris( bli_creal(bri), -bli_cimag(bri), bli_cimag(bir), bli_creal(bir) ); \ } #define bli_zinvert1es( bri, bir ) \ { \ bli_zinvertris( bli_zreal(bri), bli_zimag(bri) ); \ bli_zcopyris( bli_zreal(bri), -bli_zimag(bri), bli_zimag(bir), bli_zreal(bir) ); \ } #endif // end bli_invert1es.h // begin bli_scal1es.h #ifndef BLIS_SCAL1ES_H 
#define BLIS_SCAL1ES_H // scal1es #define bli_cscal1es( a, yri, yir ) \ { \ bli_cscalris( bli_creal(a), bli_cimag(a), bli_creal(yri), bli_cimag(yri) ); \ bli_ccopyris( -bli_cimag(yri), bli_creal(yri), bli_creal(yir), bli_cimag(yir) ); \ } #define bli_zscal1es( a, yri, yir ) \ { \ bli_zscalris( bli_zreal(a), bli_zimag(a), bli_zreal(yri), bli_zimag(yri) ); \ bli_zcopyris( -bli_zimag(yri), bli_zreal(yri), bli_zreal(yir), bli_zimag(yir) ); \ } #endif // end bli_scal1es.h // begin bli_scal21es.h #ifndef BLIS_SCAL21ES_H #define BLIS_SCAL21ES_H // scal21es // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. // -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal21es( a, x, yri, yir ) {} #define bli_sdsscal21es( a, x, yri, yir ) {} #define bli_scsscal21es( a, x, yri, yir ) {} #define bli_szsscal21es( a, x, yri, yir ) {} #define bli_dssscal21es( a, x, yri, yir ) {} #define bli_ddsscal21es( a, x, yri, yir ) {} #define bli_dcsscal21es( a, x, yri, yir ) {} #define bli_dzsscal21es( a, x, yri, yir ) {} #define bli_cssscal21es( a, x, yri, yir ) {} #define bli_cdsscal21es( a, x, yri, yir ) {} #define bli_ccsscal21es( a, x, yri, yir ) {} #define bli_czsscal21es( a, x, yri, yir ) {} #define bli_zssscal21es( a, x, yri, yir ) {} #define bli_zdsscal21es( a, x, yri, yir ) {} #define bli_zcsscal21es( a, x, yri, yir ) {} #define bli_zzsscal21es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal21es( a, x, yri, yir ) {} #define bli_sddscal21es( a, x, yri, yir ) {} #define bli_scdscal21es( a, x, yri, yir ) {} #define bli_szdscal21es( a, x, yri, yir ) {} #define bli_dsdscal21es( a, x, yri, yir ) {} #define bli_dddscal21es( a, x, yri, yir ) {} #define bli_dcdscal21es( a, x, yri, yir ) {} #define bli_dzdscal21es( a, x, yri, yir ) {} #define bli_csdscal21es( a, x, yri, yir ) {} #define 
bli_cddscal21es( a, x, yri, yir ) {} #define bli_ccdscal21es( a, x, yri, yir ) {} #define bli_czdscal21es( a, x, yri, yir ) {} #define bli_zsdscal21es( a, x, yri, yir ) {} #define bli_zddscal21es( a, x, yri, yir ) {} #define bli_zcdscal21es( a, x, yri, yir ) {} #define bli_zzdscal21es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal21es( a, x, yri, yir ) {} #define bli_sdcscal21es( a, x, yri, yir ) {} #define bli_sccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal21es( a, x, yri, yir ) {} #define bli_ddcscal21es( a, x, yri, yir ) {} #define bli_dccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), 
bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal21es( a, x, yri, yir ) {} #define bli_sdzscal21es( a, x, yri, yir ) {} #define bli_sczscal21es( a, x, yri, yir ) \ { \ 
bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal21es( a, x, yri, yir ) {} #define bli_ddzscal21es( a, x, yri, yir ) {} #define bli_dczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), 
bli_cimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zczscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzzscal21es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), -bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscal21es( a, x, yri, yir ) bli_cccscal21es( a, x, yri, yir ) #define bli_zscal21es( a, x, yri, yir ) bli_zzzscal21es( a, x, yri, yir ) #endif // end bli_scal21es.h // begin bli_scal2j1es.h #ifndef BLIS_SCAL2J1ES_H #define BLIS_SCAL2J1ES_H // scal2j1es // Notes: // - The first char encodes the type of a. // - The second char encodes the type of x. // - The third char encodes the type of y. 
// -- (axy) = (??s) ------------------------------------------------------------ #define bli_sssscal2j1es( a, x, yri, yir ) {} #define bli_sdsscal2j1es( a, x, yri, yir ) {} #define bli_scsscal2j1es( a, x, yri, yir ) {} #define bli_szsscal2j1es( a, x, yri, yir ) {} #define bli_dssscal2j1es( a, x, yri, yir ) {} #define bli_ddsscal2j1es( a, x, yri, yir ) {} #define bli_dcsscal2j1es( a, x, yri, yir ) {} #define bli_dzsscal2j1es( a, x, yri, yir ) {} #define bli_cssscal2j1es( a, x, yri, yir ) {} #define bli_cdsscal2j1es( a, x, yri, yir ) {} #define bli_ccsscal2j1es( a, x, yri, yir ) {} #define bli_czsscal2j1es( a, x, yri, yir ) {} #define bli_zssscal2j1es( a, x, yri, yir ) {} #define bli_zdsscal2j1es( a, x, yri, yir ) {} #define bli_zcsscal2j1es( a, x, yri, yir ) {} #define bli_zzsscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??d) ------------------------------------------------------------ #define bli_ssdscal2j1es( a, x, yri, yir ) {} #define bli_sddscal2j1es( a, x, yri, yir ) {} #define bli_scdscal2j1es( a, x, yri, yir ) {} #define bli_szdscal2j1es( a, x, yri, yir ) {} #define bli_dsdscal2j1es( a, x, yri, yir ) {} #define bli_dddscal2j1es( a, x, yri, yir ) {} #define bli_dcdscal2j1es( a, x, yri, yir ) {} #define bli_dzdscal2j1es( a, x, yri, yir ) {} #define bli_csdscal2j1es( a, x, yri, yir ) {} #define bli_cddscal2j1es( a, x, yri, yir ) {} #define bli_ccdscal2j1es( a, x, yri, yir ) {} #define bli_czdscal2j1es( a, x, yri, yir ) {} #define bli_zsdscal2j1es( a, x, yri, yir ) {} #define bli_zddscal2j1es( a, x, yri, yir ) {} #define bli_zcdscal2j1es( a, x, yri, yir ) {} #define bli_zzdscal2j1es( a, x, yri, yir ) {} // -- (axy) = (??c) ------------------------------------------------------------ #define bli_sscscal2j1es( a, x, yri, yir ) {} #define bli_sdcscal2j1es( a, x, yri, yir ) {} #define bli_sccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), 
bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dscscal2j1es( a, x, yri, yir ) {} #define bli_ddcscal2j1es( a, x, yri, yir ) {} #define bli_dccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), 
bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zscscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zccscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zzcscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } // -- (axy) = (??z) ------------------------------------------------------------ #define bli_sszscal2j1es( a, x, yri, yir ) {} #define bli_sdzscal2j1es( a, x, yri, yir ) {} #define bli_sczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_szzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_sreal(a), bli_simag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dszscal2j1es( a, x, yri, yir ) {} #define bli_ddzscal2j1es( a, x, yri, yir ) {} #define bli_dczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( 
bli_dreal(a), bli_dimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_dzzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_dreal(a), bli_dimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_cczscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_czzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zszscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_sreal(x), -bli_simag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_simag(x), bli_sreal(x), bli_zreal(yir), bli_zimag(yir) ); \ } #define bli_zdzscal2j1es( a, x, yri, yir ) \ { \ bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), -bli_dimag(x), bli_zreal(yri), bli_zimag(yri) ); \ bli_cxscal2ris( bli_zreal(a), 
bli_zimag(a), bli_dimag(x), bli_dreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
// zczscal2j1es / zzzscal2j1es: a is dcomplex; x is scomplex (zcz) or dcomplex
// (zzz). The first call passes ( real(x), -imag(x) ) and writes the parts of
// yri; the second passes ( imag(x), real(x) ) and writes the parts of yir.
// NOTE(review): bli_cxscal2ris is defined outside this span; presumably it
// stores the product of its first two operand pairs into the last pair.
#define bli_zczscal2j1es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_creal(x), -bli_cimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_cimag(x), bli_creal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
#define bli_zzzscal2j1es( a, x, yri, yir ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), -bli_zimag(x), bli_zreal(yri), bli_zimag(yri) ); \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zimag(x), bli_zreal(x), bli_zreal(yir), bli_zimag(yir) ); \
}
// Short names: the single-type-letter forms map to the homogeneous
// (ccc / zzz) variants.
#define bli_cscal2j1es( a, x, yri, yir ) bli_cccscal2j1es( a, x, yri, yir )
#define bli_zscal2j1es( a, x, yri, yir ) bli_zzzscal2j1es( a, x, yri, yir )
#endif
// end bli_scal2j1es.h

// 1r

// begin bli_copy1rs.h
#ifndef BLIS_COPY1RS_H
#define BLIS_COPY1RS_H
// copy1rs
// Forwards the real and imaginary parts of complex a, together with the
// separate real-typed destinations br and bi, to bli_?copyris.
#define bli_ccopy1rs( a, br, bi ) \
{ \
	bli_ccopyris( bli_creal(a), bli_cimag(a), br, bi ); \
}
#define bli_zcopy1rs( a, br, bi ) \
{ \
	bli_zcopyris( bli_zreal(a), bli_zimag(a), br, bi ); \
}
#endif
// end bli_copy1rs.h

// begin bli_copyj1rs.h
#ifndef BLIS_COPYJ1RS_H
#define BLIS_COPYJ1RS_H
// copyj1rs
// Same operand layout as copy1rs, but forwards to the bli_?copyjris helpers
// (presumably the conjugating form -- the helper is defined outside this span).
#define bli_ccopyj1rs( a, br, bi ) \
{ \
	bli_ccopyjris( bli_creal(a), bli_cimag(a), br, bi ); \
}
#define bli_zcopyj1rs( a, br, bi ) \
{ \
	bli_zcopyjris( bli_zreal(a), bli_zimag(a), br, bi ); \
}
#endif
// end bli_copyj1rs.h

// begin bli_invert1rs.h
#ifndef BLIS_INVERT1RS_H
#define BLIS_INVERT1RS_H
// invert1rs
// Pure aliases: the 1r form takes already-separated parts (xr, xi), so it
// maps directly onto bli_?invertris.
#define bli_cinvert1rs( xr, xi ) bli_cinvertris( xr, xi )
#define bli_zinvert1rs( xr, xi ) bli_zinvertris( xr, xi )
#endif
// end bli_invert1rs.h

// begin bli_scal1rs.h
#ifndef BLIS_SCAL1RS_H
#define BLIS_SCAL1RS_H
// scal1rs
// Forwards the parts of scalar a and the separated parts (yr, yi) to
// bli_?scalris. The sc / dz variants take a real-typed a and read its parts
// through bli_sreal/bli_simag or bli_dreal/bli_dimag.
#define bli_cscal1rs( a, yr, yi ) \
{ \
	bli_cscalris( bli_creal(a), bli_cimag(a), yr, yi ); \
}
#define bli_zscal1rs( a, yr, yi ) \
{ \
	bli_zscalris( bli_zreal(a), bli_zimag(a), yr, yi ); \
}
#define bli_scscal1rs( a, yr, yi ) \
{ \
	bli_scscalris( bli_sreal(a), bli_simag(a), yr, yi ); \
}
#define bli_dzscal1rs( a, yr, yi ) \
{ \
	bli_dzscalris( bli_dreal(a), bli_dimag(a), yr, yi ); \
}
#endif
// end bli_scal1rs.h

// begin bli_scal21rs.h
#ifndef BLIS_SCAL21RS_H
#define BLIS_SCAL21RS_H
// scal21rs
// Forwards the parts of a and x plus the separated destinations (yr, yi) to
// bli_cxscal2ris. The three type letters are the types of a, x and y.
#define bli_cscscal21rs( a, x, yr, yi ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \
}
#define bli_cccscal21rs( a, x, yr, yi ) \
{ \
	bli_cxscal2ris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \
}
#define bli_zdzscal21rs( a, x, yr, yi ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \
}
#define bli_zzzscal21rs( a, x, yr, yi ) \
{ \
	bli_cxscal2ris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \
}
#define bli_cscal21rs( a, x, yr, yi ) bli_cccscal21rs( a, x, yr, yi )
#define bli_zscal21rs( a, x, yr, yi ) bli_zzzscal21rs( a, x, yr, yi )
#endif
// end bli_scal21rs.h

// begin bli_scal2j1rs.h
#ifndef BLIS_SCAL2J1RS_H
#define BLIS_SCAL2J1RS_H
// scal2j1rs
// Same operand layout as scal21rs, but forwards to bli_cscal2jris /
// bli_zscal2jris instead of bli_cxscal2ris.
#define bli_cscscal2j1rs( a, x, yr, yi ) \
{ \
	bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_sreal(x), bli_simag(x), yr, yi ); \
}
#define bli_cccscal2j1rs( a, x, yr, yi ) \
{ \
	bli_cscal2jris( bli_creal(a), bli_cimag(a), bli_creal(x), bli_cimag(x), yr, yi ); \
}
#define bli_zdzscal2j1rs( a, x, yr, yi ) \
{ \
	bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_dreal(x), bli_dimag(x), yr, yi ); \
}
#define bli_zzzscal2j1rs( a, x, yr, yi ) \
{ \
	bli_zscal2jris( bli_zreal(a), bli_zimag(a), bli_zreal(x), bli_zimag(x), yr, yi ); \
}
#define bli_cscal2j1rs( a, x, yr, yi ) bli_cccscal2j1rs( a, x, yr, yi )
#define bli_zscal2j1rs( a, x, yr, yi ) bli_zzzscal2j1rs( a, x, yr, yi )
#endif
// end bli_scal2j1rs.h

// 1m (1e or 1r)
//
// Every *1ms* routine below has the same two-branch shape:
//  - if bli_is_1e_packed( schema ): y is addressed as complex elements, with
//    a second ("ir") element located ld_y/2 complex elements after the first
//    ("ri") one;
//  - otherwise (the 1r case, per the heading above): y is addressed through a
//    real-typed pointer, whichever of rs_y/cs_y is not 1 is doubled, and the
//    imaginary part lives ld_y real elements after the real part.

// begin bli_invert1ms_mxn_diag.h
#ifndef BLIS_INVERT1MS_MXN_DIAG_H
#define BLIS_INVERT1MS_MXN_DIAG_H
// invert1ms_mxn_diag
// Applies bli_?invert1es / bli_?invert1rs to the min(m,n) diagonal elements
// of y, starting at row offset offm and column offset offn.
#define bli_cinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t min_m_n = bli_min( m, n ); \
	dim_t i; \
	\
	/* 1e: visit the (ri, ir) pair of each diagonal element. */ \
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		scomplex* restrict y_off_ri = y + (offm )*rs_y \
		                                + (offn )*cs_y; \
		scomplex* restrict y_off_ir = y + (offm )*rs_y \
		                                + (offn )*cs_y + ld_y/2; \
		\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_cinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \
			                *(y_off_ir + i*rs_y + i*cs_y) ); \
		} \
	} \
	else \
	{ \
		inc_t rs_y2 = rs_y; \
		inc_t cs_y2 = cs_y; \
		\
		/* Double the stride that is not 1: y is indexed in float units below. */ \
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else { rs_y2 *= 2; } \
		\
		float* restrict y_cast = ( float* )y; \
		float* restrict y_off_r = y_cast + (offm )*rs_y2 \
		                                 + (offn )*cs_y2; \
		float* restrict y_off_i = y_cast + (offm )*rs_y2 \
		                                 + (offn )*cs_y2 + ld_y; \
		\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_cinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \
			                *(y_off_i + i*rs_y2 + i*cs_y2) ); \
		} \
	} \
}
// Double-precision twin of bli_cinvert1ms_mxn_diag (dcomplex / double).
#define bli_zinvert1ms_mxn_diag( schema, offm, offn, m, n, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t min_m_n = bli_min( m, n ); \
	dim_t i; \
	\
	\
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		dcomplex* restrict y_off_ri = y + (offm )*rs_y \
		                                + (offn )*cs_y; \
		dcomplex* restrict y_off_ir = y + (offm )*rs_y \
		                                + (offn )*cs_y + ld_y/2; \
		\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_zinvert1es( *(y_off_ri + i*rs_y + i*cs_y), \
			                *(y_off_ir + i*rs_y + i*cs_y) ); \
		} \
	} \
	else \
	{ \
		inc_t rs_y2 = rs_y; \
		inc_t cs_y2 = cs_y; \
		\
		\
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else { rs_y2 *= 2; } \
		\
		double* restrict y_cast = ( double* )y; \
		double* restrict y_off_r = y_cast + (offm )*rs_y2 \
		                                  + (offn )*cs_y2; \
		double* restrict y_off_i = y_cast + (offm )*rs_y2 \
		                                  + (offn )*cs_y2 + ld_y; \
		\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_zinvert1rs( *(y_off_r + i*rs_y2 + i*cs_y2), \
			                *(y_off_i + i*rs_y2 + i*cs_y2) ); \
		} \
	} \
}
#endif
// end bli_invert1ms_mxn_diag.h

// begin bli_scal1ms_mxn.h
#ifndef BLIS_SCAL1MS_MXN_H
#define BLIS_SCAL1MS_MXN_H
// scal1ms_mxn
// Applies bli_?scal1es / bli_?scal1rs with scalar *a to every element of the
// m x n block y. a is dereferenced, so it must be a pointer expression.
#define bli_cscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t i, j; \
	\
	\
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		scomplex* restrict y_ri = y; \
		scomplex* restrict y_ir = y + ld_y/2; \
		\
		for ( j = 0; j < n; ++j ) \
		for ( i = 0; i < m; ++i ) \
		{ \
			bli_cscal1es( *(a), \
			              *(y_ri + i*rs_y + j*cs_y), \
			              *(y_ir + i*rs_y + j*cs_y) ); \
		} \
	} \
	else \
	{ \
		inc_t rs_y2 = rs_y; \
		inc_t cs_y2 = cs_y; \
		\
		\
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else { rs_y2 *= 2; } \
		\
		float* restrict y_cast = ( float* )y; \
		float* restrict y_r = y_cast; \
		float* restrict y_i = y_cast + ld_y; \
		\
		for ( j = 0; j < n; ++j ) \
		for ( i = 0; i < m; ++i ) \
		{ \
			bli_cscal1rs( *(a), \
			              *(y_r + i*rs_y2 + j*cs_y2), \
			              *(y_i + i*rs_y2 + j*cs_y2) ); \
		} \
	} \
}
// Double-precision twin of bli_cscal1ms_mxn (dcomplex / double).
#define bli_zscal1ms_mxn( schema, m, n, a, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t i, j; \
	\
	\
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		dcomplex* restrict y_ri = y; \
		dcomplex* restrict y_ir = y + ld_y/2; \
		\
		for ( j = 0; j < n; ++j ) \
		for ( i = 0; i < m; ++i ) \
		{ \
			bli_zscal1es( *(a), \
			              *(y_ri + i*rs_y + j*cs_y), \
			              *(y_ir + i*rs_y + j*cs_y) ); \
		} \
	} \
	else \
	{ \
		inc_t rs_y2 = rs_y; \
		inc_t cs_y2 = cs_y; \
		\
		\
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else { rs_y2 *= 2; } \
		\
		double* restrict y_cast = ( double* )y; \
		double* restrict y_r = y_cast; \
		double* restrict y_i = y_cast + ld_y; \
		\
		for ( j = 0; j < n; ++j ) \
		for ( i = 0; i < m; ++i ) \
		{ \
			bli_zscal1rs( *(a), \
			              *(y_r + i*rs_y2 + j*cs_y2), \
			              *(y_i + i*rs_y2 + j*cs_y2) ); \
		} \
	} \
}
#endif
// end bli_scal1ms_mxn.h

// begin bli_scal21ms_mxn.h
#ifndef BLIS_SCAL21MS_MXN_H
#define BLIS_SCAL21MS_MXN_H
// scal21ms_mxn

// For every (i,j) in the m x n block, combines *alpha with element
// x[i*rs_x + j*cs_x] and writes the result into the 1e- or 1r-format buffer y.
//   schema           selects the 1e branch when bli_is_1e_packed() is true,
//                    otherwise the real-pointer (1r) branch
//   conjx            when bli_is_conj() is true the scal2j* helpers are used,
//                    otherwise the scal21* helpers
//   x, rs_x, cs_x    source matrix and its row/column strides
//   y, rs_y, cs_y    destination and its strides, in complex elements
//   ld_y             distance to the second copy: ld_y/2 complex elements in
//                    the 1e branch, ld_y real elements in the 1r branch
BLIS_INLINE void bli_cscal21ms_mxn
     (
       const pack_t       schema,
       const conj_t       conjx,
       const dim_t        m,
       const dim_t        n,
       scomplex* restrict alpha,
       scomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
       scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y
     )
{
	dim_t i, j;

	if ( bli_is_1e_packed( schema ) )
	{
		scomplex* restrict y_ri = y;
		scomplex* restrict y_ir = y + ld_y/2;

		if ( bli_is_conj( conjx ) )
		{
			for ( j = 0; j < n; ++j )
			for ( i = 0; i < m; ++i )
			{
				bli_cscal2j1es( *(alpha),
				                *(x    + i*rs_x + j*cs_x),
				                *(y_ri + i*rs_y + j*cs_y),
				                *(y_ir + i*rs_y + j*cs_y) );
			}
		}
		else
		{
			for ( j = 0; j < n; ++j )
			for ( i = 0; i < m; ++i )
			{
				bli_cscal21es( *(alpha),
				               *(x    + i*rs_x + j*cs_x),
				               *(y_ri + i*rs_y + j*cs_y),
				               *(y_ir + i*rs_y + j*cs_y) );
			}
		}
	}
	else
	{
		inc_t rs_y2 = rs_y;
		inc_t cs_y2 = cs_y;

		// y is indexed in float units below, so the stride that is not 1
		// is doubled.
		if ( rs_y2 == 1 ) { cs_y2 *= 2; }
		else { rs_y2 *= 2; }

		float* restrict y_cast = ( float* )y;
		float* restrict y_r = y_cast;
		float* restrict y_i = y_cast + ld_y;

		if ( bli_is_conj( conjx ) )
		{
			for ( j = 0; j < n; ++j )
			for ( i = 0; i < m; ++i )
			{
				bli_cscal2j1rs( *(alpha),
				                *(x   + i*rs_x  + j*cs_x ),
				                *(y_r + i*rs_y2 + j*cs_y2),
				                *(y_i + i*rs_y2 + j*cs_y2) );
			}
		}
		else
		{
			for ( j = 0; j < n; ++j )
			for ( i = 0; i < m; ++i )
			{
				bli_cscal21rs( *(alpha),
				               *(x   + i*rs_x  + j*cs_x ),
				               *(y_r + i*rs_y2 + j*cs_y2),
				               *(y_i + i*rs_y2 + j*cs_y2) );
			}
		}
	}
}

// Double-precision twin of bli_cscal21ms_mxn (dcomplex / double); the
// parameters have the same meaning.
BLIS_INLINE void bli_zscal21ms_mxn
     (
       const pack_t       schema,
       const conj_t       conjx,
       const dim_t        m,
       const dim_t        n,
       dcomplex* restrict alpha,
       dcomplex* restrict x, const inc_t rs_x, const inc_t cs_x,
       dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y
     )
{
	dim_t i, j;

	if ( bli_is_1e_packed( schema ) )
	{
		dcomplex* restrict y_ri = y;
		dcomplex* restrict y_ir = y + ld_y/2;

		if ( bli_is_conj( conjx ) )
		{
			for ( j = 0; j < n; ++j )
			for ( i = 0; i < m; ++i )
			{
				bli_zscal2j1es( *(alpha),
				                *(x    + i*rs_x + j*cs_x),
				                *(y_ri + i*rs_y + j*cs_y),
				                *(y_ir + i*rs_y + j*cs_y) );
			}
		}
		else
		{
			for ( j = 0; j < n; ++j )
			for ( i = 0; i < m; ++i )
			{
				bli_zscal21es( *(alpha),
				               *(x    + i*rs_x + j*cs_x),
				               *(y_ri + i*rs_y + j*cs_y),
				               *(y_ir + i*rs_y + j*cs_y) );
			}
		}
	}
	else
	{
		inc_t rs_y2 = rs_y;
		inc_t cs_y2 = cs_y;

		// y is indexed in double units below, so the stride that is not 1
		// is doubled.
		if ( rs_y2 == 1 ) { cs_y2 *= 2; }
		else { rs_y2 *= 2; }

		double* restrict y_cast = ( double* )y;
		double* restrict y_r = y_cast;
		double* restrict y_i = y_cast + ld_y;

		if ( bli_is_conj( conjx ) )
		{
			for ( j = 0; j < n; ++j )
			for ( i = 0; i < m; ++i )
			{
				bli_zscal2j1rs( *(alpha),
				                *(x   + i*rs_x  + j*cs_x ),
				                *(y_r + i*rs_y2 + j*cs_y2),
				                *(y_i + i*rs_y2 + j*cs_y2) );
			}
		}
		else
		{
			for ( j = 0; j < n; ++j )
			for ( i = 0; i < m; ++i )
			{
				bli_zscal21rs( *(alpha),
				               *(x   + i*rs_x  + j*cs_x ),
				               *(y_r + i*rs_y2 + j*cs_y2),
				               *(y_i + i*rs_y2 + j*cs_y2) );
			}
		}
	}
}
#endif
// end bli_scal21ms_mxn.h

// begin bli_scal21ms_mxn_diag.h
#ifndef BLIS_SCAL21MS_MXN_DIAG_H
#define BLIS_SCAL21MS_MXN_DIAG_H
// scal21ms_mxn_diag
// Mixed-type diagonal update: for i in [0, min(m,n)), combines *a with the
// diagonal element x[i*rs_x + i*cs_x] (read through the bli_s* / bli_d*
// accessors by the helpers) and writes diagonal element i of y. No
// conjugation option is offered here.
#define bli_cscscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t min_m_n = bli_min( m, n ); \
	dim_t i; \
	\
	\
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		scomplex* restrict y_off_ri = y; \
		scomplex* restrict y_off_ir = y + ld_y/2; \
		\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_cscscal21es( *(a), \
			                 *(x + i*rs_x + i*cs_x), \
			                 *(y_off_ri + i*rs_y + i*cs_y), \
			                 *(y_off_ir + i*rs_y + i*cs_y) ); \
		} \
	} \
	else \
	{ \
		inc_t rs_y2 = rs_y; \
		inc_t cs_y2 = cs_y; \
		\
		\
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else { rs_y2 *= 2; } \
		\
		float* restrict y_cast = ( float* )y; \
		float* restrict y_off_r = y_cast; \
		float* restrict y_off_i = y_cast + ld_y; \
		\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_cscscal21rs( *(a), \
			                 *(x + i*rs_x + i*cs_x ), \
			                 *(y_off_r + i*rs_y2 + i*cs_y2), \
			                 *(y_off_i + i*rs_y2 + i*cs_y2) ); \
		} \
	} \
}
// Double-precision twin of bli_cscscal21ms_mxn_diag (dcomplex / double).
#define bli_zdzscal21ms_mxn_diag( schema, m, n, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \
{ \
	dim_t min_m_n = bli_min( m, n ); \
	dim_t i; \
	\
	\
	if ( bli_is_1e_packed( schema ) ) \
	{ \
		dcomplex* restrict y_off_ri = y; \
		dcomplex* restrict y_off_ir = y + ld_y/2; \
		\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_zdzscal21es( *(a), \
			                 *(x + i*rs_x + i*cs_x), \
			                 *(y_off_ri + i*rs_y + i*cs_y), \
			                 *(y_off_ir + i*rs_y + i*cs_y) ); \
		} \
	} \
	else \
	{ \
		inc_t rs_y2 = rs_y; \
		inc_t cs_y2 = cs_y; \
		\
		\
		if ( rs_y2 == 1 ) { cs_y2 *= 2; } \
		else { rs_y2 *= 2; } \
		\
		double* restrict y_cast = ( double* )y; \
		double* restrict y_off_r = y_cast; \
		double* restrict y_off_i = y_cast + ld_y; \
		\
		for ( i = 0; i < min_m_n; ++i ) \
		{ \
			bli_zdzscal21rs( *(a), \
			                 *(x + i*rs_x + i*cs_x ), \
			                 *(y_off_r + i*rs_y2 + i*cs_y2), \
			                 *(y_off_i + i*rs_y2 + i*cs_y2) ); \
		} \
	} \
}
#endif
// end bli_scal21ms_mxn_diag.h

// begin 
bli_scal21ms_mxn_uplo.h #ifndef BLIS_SCAL21MS_MXN_UPLO_H #define BLIS_SCAL21MS_MXN_UPLO_H // scal21ms_mxn_uplo #define bli_cscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_ri = y; \ scomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_r = y_cast; \ float* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ 
bli_cscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_cscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #define bli_zscal21ms_mxn_uplo( schema, uplo, conjx, m, a, x, rs_x, cs_x, y, rs_y, cs_y, ld_y ) \ { \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_ri = y; \ dcomplex* restrict y_ir = y + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21es( *(a), \ *(x + i*rs_x + j*cs_x), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_r = y_cast; \ double* restrict y_i = y_cast + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zscal21rs( 
*(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjx ) ) \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal2j1rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < m; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zscal21rs( *(a), \ *(x + i*rs_x + j*cs_x ), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } \ } #endif // end bli_scal21ms_mxn_uplo.h // begin bli_set1ms_mxn.h #ifndef BLIS_SET1MS_MXN_H #define BLIS_SET1MS_MXN_H // set1ms_mxn #define bli_sset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } #define bli_dset1ms_mxn( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ \ } BLIS_INLINE void bli_cset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, scomplex* restrict alpha, scomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { scomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; scomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } float* restrict y_cast = ( float* )y; float* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; float* restrict y_off_i = y_cast + 
(offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_ccopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } BLIS_INLINE void bli_zset1ms_mxn ( const pack_t schema, const dim_t offm, const dim_t offn, const dim_t m, const dim_t n, dcomplex* restrict alpha, dcomplex* restrict y, const inc_t rs_y, const inc_t cs_y, const inc_t ld_y ) { inc_t offm_local = offm; inc_t offn_local = offn; dim_t m_local = m; dim_t n_local = n; inc_t rs_y1 = rs_y; inc_t cs_y1 = cs_y; inc_t rs_y2 = rs_y; inc_t cs_y2 = cs_y; dim_t i, j; if ( cs_y == 1 ) { bli_swap_incs( &offm_local, &offn_local ); bli_swap_dims( &m_local, &n_local ); bli_swap_incs( &rs_y1, &cs_y1 ); bli_swap_incs( &rs_y2, &cs_y2 ); } if ( bli_is_1e_packed( schema ) ) { dcomplex* restrict y_off_ri = y + (offm_local )*rs_y1 + (offn_local )*cs_y1; dcomplex* restrict y_off_ir = y + (offm_local )*rs_y1 + (offn_local )*cs_y1 + ld_y/2; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1es( *(alpha), *(y_off_ri + i*rs_y1 + j*cs_y1), *(y_off_ir + i*rs_y1 + j*cs_y1) ); } } else { if ( rs_y2 == 1 ) { cs_y2 *= 2; } else { rs_y2 *= 2; } double* restrict y_cast = ( double* )y; double* restrict y_off_r = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2; double* restrict y_off_i = y_cast + (offm_local )*rs_y2 + (offn_local )*cs_y2 + ld_y; for ( j = 0; j < n_local; ++j ) for ( i = 0; i < m_local; ++i ) { bli_zcopy1rs( *(alpha), *(y_off_r + i*rs_y2 + j*cs_y2), *(y_off_i + i*rs_y2 + j*cs_y2) ); } } } #endif // end bli_set1ms_mxn.h // begin bli_set1ms_mxn_diag.h #ifndef BLIS_SET1MS_MXN_DIAG_H #define BLIS_SET1MS_MXN_DIAG_H // set1ms_mxn_diag #define bli_cset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ scomplex* restrict y_off_ir = y + 
(offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ float* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zset1ms_mxn_diag( schema, offm, offn, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y + (offm )*rs_y \ + (offn )*cs_y; \ dcomplex* restrict y_off_ir = y + (offm )*rs_y \ + (offn )*cs_y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_off_ri + i*rs_y + i*cs_y), \ *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* restrict y_off_r = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2; \ double* restrict y_off_i = y_cast + (offm )*rs_y2 \ + (offn )*cs_y2 + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_off_r + i*rs_y2 + i*cs_y2), \ *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_set1ms_mxn_diag.h // begin bli_set1ms_mxn_uplo.h #ifndef BLIS_SET1MS_MXN_UPLO_H #define BLIS_SET1MS_MXN_UPLO_H // set1ms_mxn_uplo #define bli_cset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ scomplex* restrict y0 = y + 
(diagoff_abs )*offdiag_inc; \ scomplex* restrict y_ri = y0; \ scomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ float* restrict y0 = ( float* )y + (diagoff_abs )*offdiag_inc; \ float* restrict y_r = y0; \ float* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_ccopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #define bli_zset1ms_mxn_uplo( schema, diagoff, uplo, m, n, a, y, rs_y, cs_y, ld_y ) \ { \ doff_t diagoff_abs = bli_abs( diagoff ); \ inc_t offdiag_inc; \ dim_t i, j; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ \ if ( diagoff > 0 ) offdiag_inc = cs_y; \ else offdiag_inc = rs_y; \ \ dcomplex* restrict y0 = y + (diagoff_abs )*offdiag_inc; \ dcomplex* restrict y_ri = y0; \ dcomplex* restrict y_ir = y0 + ld_y/2; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1es( *(a), \ *(y_ri + i*rs_y + j*cs_y), \ *(y_ir + i*rs_y + j*cs_y) ); \ } \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 
2; } \ else { rs_y2 *= 2; } \ \ \ if ( diagoff > 0 ) offdiag_inc = cs_y2; \ else offdiag_inc = rs_y2; \ \ double* restrict y0 = ( double* )y + (diagoff_abs )*offdiag_inc; \ double* restrict y_r = y0; \ double* restrict y_i = y0 + ld_y; \ \ if ( bli_is_lower( uplo ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = j; i < m; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < j + 1; ++i ) \ { \ bli_zcopy1rs( *(a), \ *(y_r + i*rs_y2 + j*cs_y2), \ *(y_i + i*rs_y2 + j*cs_y2) ); \ } \ } \ } \ } #endif // end bli_set1ms_mxn_uplo.h // begin bli_seti01ms_mxn_diag.h #ifndef BLIS_SETI01MS_MXN_DIAG_H #define BLIS_SETI01MS_MXN_DIAG_H // seti01ms_mxn_diag #define bli_cseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ scomplex* restrict y_off_ri = y; \ scomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_cseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_csetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ float* restrict y_cast = ( float* )y; \ float* restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_sset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #define bli_zseti01ms_mxn_diag( schema, m, n, y, rs_y, cs_y, ld_y ) \ { \ dim_t min_m_n = bli_min( m, n ); \ dim_t i; \ \ \ if ( bli_is_1e_packed( schema ) ) \ { \ dcomplex* restrict y_off_ri = y; \ dcomplex* restrict y_off_ir = y + ld_y/2; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_zseti0s( *(y_off_ri + i*rs_y + i*cs_y) ); \ bli_zsetr0s( *(y_off_ir + i*rs_y + i*cs_y) ); \ } \ } \ else \ { \ inc_t rs_y2 = rs_y; \ inc_t cs_y2 = cs_y; \ \ \ if ( rs_y2 == 1 ) { cs_y2 *= 2; } \ else { rs_y2 *= 2; } \ \ double* restrict y_cast = ( double* )y; \ double* 
restrict y_off_i = y_cast + ld_y; \ \ for ( i = 0; i < min_m_n; ++i ) \ { \ bli_dset0s( *(y_off_i + i*rs_y2 + i*cs_y2) ); \ } \ } \ } #endif // end bli_seti01ms_mxn_diag.h #endif // end bli_scalar_macro_defs.h // begin bli_error_macro_defs.h #ifndef BLIS_ERROR_MACRO_DEFS_H #define BLIS_ERROR_MACRO_DEFS_H // Used to insert filenames and line numbers into error-checking code. #define bli_check_error_code( code ) \ bli_check_error_code_helper( code, __FILE__, __LINE__ ) #endif // end bli_error_macro_defs.h // begin bli_blas_macro_defs.h #ifndef BLIS_BLAS_MACRO_DEFS_H #define BLIS_BLAS_MACRO_DEFS_H // -- Various Fortran compatibility macros -- // Macro to treat negative dimensions as zero. #define bli_convert_blas_dim1( n_blas, n_blis )\ { \ if ( n_blas < 0 ) n_blis = ( dim_t )0; \ else n_blis = ( dim_t )n_blas; \ } // Macro to flip signs of increments if input increments are negative. #define bli_convert_blas_incv( n, x_blas, incx_blas, \ x_blis, incx_blis ) \ { \ if ( incx_blas < 0 ) \ { \ \ x_blis = (x_blas) + (n-1)*(-incx_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ else \ { \ x_blis = (x_blas); \ incx_blis = ( inc_t )(incx_blas); \ } \ } #endif // end bli_blas_macro_defs.h // begin bli_builtin_macro_defs.h #ifndef BLIS_BUILTIN_MACRO_DEFS_H #define BLIS_BUILTIN_MACRO_DEFS_H #if defined(__ICC) || defined(__INTEL_COMPILER) // icc #define bli_prefetch( addr, rw, loc ) #elif defined(__clang__) // clang #define bli_prefetch( addr, rw, loc ) #elif defined(__GNUC__) // gcc #define bli_prefetch( addr, rw, loc ) __builtin_prefetch( addr, rw, loc ); #endif #endif // end bli_builtin_macro_defs.h // begin bli_oapi_macro_defs.h // Define the suffix to add to object API function names that include // additional "expert" parameters. #define BLIS_OAPI_EX_SUF _ex // end bli_oapi_macro_defs.h // begin bli_tapi_macro_defs.h // Define the suffix to add to typed API function names that include // additional "expert" parameters. 
#define BLIS_TAPI_EX_SUF _ex // end bli_tapi_macro_defs.h #endif // end bli_macro_defs.h // -- pragma definitions -- // begin bli_pragma_macro_defs.h #ifndef BLIS_PRAGMA_MACRO_DEFS_H #define BLIS_PRAGMA_MACRO_DEFS_H // Generally speaking, if BLIS_ENABLE_PRAGMA_OMP_SIMD is set, then we define // all instances of PRAGMA_SIMD as _Pragma("omp simd"). #ifdef BLIS_ENABLE_PRAGMA_OMP_SIMD #define PRAGMA_OMP_SIMD _Pragma("omp simd") #else #define PRAGMA_OMP_SIMD #endif // Require ISO C99 or later for SIMD-related pragmas. #if (( __STDC_VERSION__ >= 199901L )) #define GEN_PRAGMA(x) _Pragma(#x) #if defined(__ICC) || defined(__INTEL_COMPILER) // Intel icc. //#define PRAGMA_SIMD GEN_PRAGMA(simd) #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__clang__) // clang/llvm. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #elif defined(__GNUC__) // GNU gcc. #define PRAGMA_SIMD PRAGMA_OMP_SIMD #else // Unknown compiler. #define PRAGMA_SIMD #endif #endif #endif // end bli_pragma_macro_defs.h // -- Threading definitions -- // begin bli_thread.h #ifndef BLIS_THREAD_H #define BLIS_THREAD_H // Include thread communicator (thrcomm_t) object definitions and prototypes. // begin bli_thrcomm.h #ifndef BLIS_THRCOMM_H #define BLIS_THRCOMM_H // Include definitions (mostly thrcomm_t) specific to the method of // multithreading. // begin bli_thrcomm_single.h #ifndef BLIS_THRCOMM_SINGLE_H #define BLIS_THRCOMM_SINGLE_H // Define thrcomm_t for situations when multithreading is disabled. 
#ifndef BLIS_ENABLE_MULTITHREADING //thread communicators may be implementation dependent #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_single.h // begin bli_thrcomm_openmp.h #ifndef BLIS_THRCOMM_OPENMP_H #define BLIS_THRCOMM_OPENMP_H // Define thrcomm_t for situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #include // skipped // Define thrcomm_t for tree barriers and non-tree barriers. #ifdef BLIS_TREE_BARRIER struct barrier_s { int arity; int count; struct barrier_s* dad; volatile int signal; }; typedef struct barrier_s barrier_t; struct thrcomm_s { void* sent_object; dim_t n_threads; barrier_t** barriers; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. 
//volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; // Prototypes specific to tree barriers. #ifdef BLIS_TREE_BARRIER barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_t** leaves, int leaf_index ); void bli_thrcomm_tree_barrier_free( barrier_t* barrier ); void bli_thrcomm_tree_barrier( barrier_t* barack ); #endif #endif #endif // end bli_thrcomm_openmp.h // begin bli_thrcomm_pthreads.h #ifndef BLIS_THRCOMM_PTHREADS_H #define BLIS_THRCOMM_PTHREADS_H // Define thrcomm_t for situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS #ifdef BLIS_USE_PTHREAD_BARRIER struct thrcomm_s { void* sent_object; dim_t n_threads; bli_pthread_barrier_t barrier; }; #else struct thrcomm_s { void* sent_object; dim_t n_threads; // NOTE: barrier_sense was originally a gint_t-based bool_t, but upon // redefining bool_t as bool we discovered that some gcc __atomic built-ins // don't allow the use of bool for the variables being operated upon. // (Specifically, this was observed of __atomic_fetch_xor(), but it likely // applies to all other related built-ins.) Thus, we get around this by // redefining barrier_sense as a gint_t. //volatile gint_t barrier_sense; gint_t barrier_sense; dim_t barrier_threads_arrived; }; #endif typedef struct thrcomm_s thrcomm_t; #endif #endif // end bli_thrcomm_pthreads.h // thrcomm_t query (field only) BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) { return comm->n_threads; } // Thread communicator prototypes. 
thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ); void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ); void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ); void bli_thrcomm_cleanup( thrcomm_t* comm ); BLIS_EXPORT_BLIS void bli_thrcomm_barrier( dim_t thread_id, thrcomm_t* comm ); BLIS_EXPORT_BLIS void* bli_thrcomm_bcast( dim_t inside_id, void* to_send, thrcomm_t* comm ); void bli_thrcomm_barrier_atomic( dim_t thread_id, thrcomm_t* comm ); #endif // end bli_thrcomm.h // Include thread info (thrinfo_t) object definitions and prototypes. // begin bli_thrinfo.h #ifndef BLIS_THRINFO_H #define BLIS_THRINFO_H // Thread info structure definition struct thrinfo_s { // The thread communicator for the other threads sharing the same work // at this level. thrcomm_t* ocomm; // Our thread id within the ocomm thread communicator. dim_t ocomm_id; // The number of distinct threads used to parallelize the loop. dim_t n_way; // What we're working on. dim_t work_id; // When freeing, should the communicators in this node be freed? Usually, // this is field is true, but when nodes are created that share the same // communicators as other nodes (such as with packm nodes), this is set // to false. bool free_comm; // The bszid_t to help identify the node. This is mostly only useful when // debugging or tracing the allocation and release of thrinfo_t nodes. bszid_t bszid; struct thrinfo_s* sub_prenode; struct thrinfo_s* sub_node; }; typedef struct thrinfo_s thrinfo_t; // // thrinfo_t functions // NOTE: The naming of these should be made consistent at some point. // (ie: bli_thrinfo_ vs. 
bli_thread_) // // thrinfo_t query (field only) BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) { return (t->ocomm)->n_threads; } BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) { return t->ocomm_id; } BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) { return t->n_way; } BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t ) { return t->work_id; } BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) { return t->ocomm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) { return t->free_comm; } BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) { return t->bszid; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) { return t->sub_node; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) { return t->ocomm_id == 0; } // thrinfo_t modification BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) { t->ocomm = ocomm; } BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) { t->ocomm_id = ocomm_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) { t->n_way = n_way; } BLIS_INLINE void bli_thrinfo_set_work_id( dim_t work_id, thrinfo_t* t ) { t->work_id = work_id; } BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) { t->free_comm = free_comm; } BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) { t->bszid = bszid; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) { t->sub_node = sub_node; } BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* t ) { t->sub_prenode = sub_prenode; } // other thrinfo_t-related functions BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } // // Prototypes 
for level-3 thrinfo functions not specific to any operation. // thrinfo_t* bli_thrinfo_create ( rntm_t* rntm, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node ); void bli_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node ); void bli_thrinfo_init_single ( thrinfo_t* thread ); void bli_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_thrinfo_grow ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); thrinfo_t* bli_thrinfo_rgrow ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_cur, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_create_for_cntl ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_chl, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_rgrow_prenode ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_cur, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_create_for_cntl_prenode ( rntm_t* rntm, cntl_t* cntl_par, cntl_t* cntl_chl, thrinfo_t* thread_par ); // ----------------------------------------------------------------------------- #if 0 void bli_thrinfo_grow_tree ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); void bli_thrinfo_grow_tree_ic ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); #endif #endif // end bli_thrinfo.h // begin bli_thrinfo_sup.h #ifndef BLIS_THRINFO_SUP_H #define BLIS_THRINFO_SUP_H // // Prototypes for level-3 thrinfo sup functions. // void bli_thrinfo_sup_grow ( rntm_t* rntm, bszid_t* bszid_par, thrinfo_t* thread ); thrinfo_t* bli_thrinfo_sup_rgrow ( rntm_t* rntm, bszid_t* bszid_par, bszid_t* bszid_cur, thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_sup_create_for_cntl ( rntm_t* rntm, bszid_t* bszid_par, bszid_t* bszid_chl, thrinfo_t* thread_par ); #endif // end bli_thrinfo_sup.h // Include some operation-specific thrinfo_t prototypes. 
// Note that the bli_packm_thrinfo.h must be included before the others! // begin bli_packm_thrinfo.h // // thrinfo_t macros specific to packm. // #define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ \ ( i % n_way == work_id % n_way ) #define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ \ ( start <= i && i < end ) // Define a general-purpose version of bli_packm_my_iter() whose definition // depends on whether slab or round-robin partitioning was requested at // configure-time. #ifdef BLIS_ENABLE_JRIR_SLAB #define bli_packm_my_iter bli_packm_my_iter_sl #else // BLIS_ENABLE_JRIR_RR #define bli_packm_my_iter bli_packm_my_iter_rr #endif // // thrinfo_t APIs specific to packm. // #if 0 thrinfo_t* bli_packm_thrinfo_create ( thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); #endif void bli_packm_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, bszid_t bszid, thrinfo_t* sub_node ); void bli_packm_thrinfo_init_single ( thrinfo_t* thread ); #if 0 void bli_packm_thrinfo_free ( thrinfo_t* thread ); #endif // end bli_packm_thrinfo.h // begin bli_l3_thrinfo.h // // thrinfo_t macros specific to various level-3 operations. // // gemm // NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt // NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // trmm // NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to // change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) #define bli_trmm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // trsm #define bli_trsm_my_iter_rr( index, thread ) \ \ ( index % thread->n_way == thread->work_id % thread->n_way ) // // thrinfo_t APIs specific to level-3 operations. // void bli_l3_thrinfo_init ( thrinfo_t* thread, thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, dim_t work_id, thrinfo_t* sub_node ); void bli_l3_thrinfo_init_single ( thrinfo_t* thread ); void bli_l3_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_sup_thrinfo_free ( rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, cntl_t* cntl, thrinfo_t** thread ); void bli_l3_sup_thrinfo_create_root ( dim_t id, thrcomm_t* gl_comm, rntm_t* rntm, thrinfo_t** thread ); void bli_l3_sup_thrinfo_update_root ( rntm_t* rntm, thrinfo_t* thread ); void bli_l3_thrinfo_print_gemm_paths ( thrinfo_t** threads ); void bli_l3_thrinfo_print_trsm_paths ( thrinfo_t** threads ); // ----------------------------------------------------------------------------- void bli_l3_thrinfo_free_paths ( rntm_t* rntm, thrinfo_t** threads ); // end bli_l3_thrinfo.h // Include the level-3 thread decorator and related definitions and prototypes // for the conventional code path. // begin bli_l3_decor.h #ifndef BLIS_L3_DECOR_H #define BLIS_L3_DECOR_H // -- conventional definitions ------------------------------------------------- // Level-3 internal function type. typedef void (*l3int_t) ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // Level-3 thread decorator prototype. 
// Takes the l3int_t function to run, an operation family id, and the same
// operand/context/runtime/control-tree arguments that an l3int_t receives
// (all except the thrinfo_t).
void bli_l3_thread_decorator
     (
       l3int_t func,
       opid_t family,
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// Include definitions specific to the method of multithreading for the
// conventional code path.
// begin bli_l3_decor_single.h

#ifndef BLIS_L3_DECOR_SINGLE_H
#define BLIS_L3_DECOR_SINGLE_H

// Definitions specific to situations when multithreading is disabled.
// (The conditional below is intentionally empty in this header.)
#ifndef BLIS_ENABLE_MULTITHREADING

#endif

#endif
// end bli_l3_decor_single.h
// begin bli_l3_decor_openmp.h

#ifndef BLIS_L3_DECOR_OPENMP_H
#define BLIS_L3_DECOR_OPENMP_H

// Definitions specific to situations when OpenMP multithreading is enabled.
#ifdef BLIS_ENABLE_OPENMP

void bli_l3_thread_decorator_thread_check
     (
       dim_t n_threads,
       dim_t tid,
       thrcomm_t* gl_comm,
       rntm_t* rntm
     );

#endif

#endif
// end bli_l3_decor_openmp.h
// begin bli_l3_decor_pthreads.h

#ifndef BLIS_L3_DECOR_PTHREADS_H
#define BLIS_L3_DECOR_PTHREADS_H

// Definitions specific to situations when POSIX multithreading is enabled.
#ifdef BLIS_ENABLE_PTHREADS

// Thread entry point prototype.
// Has the void* (*)(void*) shape expected of a thread start routine.
void* bli_l3_thread_entry( void* data_void );

#endif

#endif
// end bli_l3_decor_pthreads.h

#endif
// end bli_l3_decor.h

// Include the level-3 thread decorator and related definitions and prototypes
// for the sup code path.
// begin bli_l3_sup_decor.h

#ifndef BLIS_L3_SUP_DECOR_H
#define BLIS_L3_SUP_DECOR_H

// -- sup definitions ----------------------------------------------------------

// Level-3 sup internal function type.
// Unlike l3int_t, this type returns an err_t and takes no cntl_t argument.
typedef err_t (*l3supint_t)
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       thrinfo_t* thread
     );

// Level-3 sup thread decorator prototype.
// Returns an err_t; takes the l3supint_t to run, an operation family id, and
// the l3supint_t arguments other than the thrinfo_t.
err_t bli_l3_sup_thread_decorator
     (
       l3supint_t func,
       opid_t family,
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm
     );

// Include definitions specific to the method of multithreading for the
// sup code path.
// begin bli_l3_sup_decor_single.h #ifndef BLIS_L3_SUP_DECOR_SINGLE_H #define BLIS_L3_SUP_DECOR_SINGLE_H // Definitions specific to situations when multithreading is disabled. #ifndef BLIS_ENABLE_MULTITHREADING #endif #endif // end bli_l3_sup_decor_single.h // begin bli_l3_sup_decor_openmp.h #ifndef BLIS_L3_SUP_DECOR_OPENMP_H #define BLIS_L3_SUP_DECOR_OPENMP_H // Definitions specific to situations when OpenMP multithreading is enabled. #ifdef BLIS_ENABLE_OPENMP #endif #endif // end bli_l3_sup_decor_openmp.h // begin bli_l3_sup_decor_pthreads.h #ifndef BLIS_L3_SUP_DECOR_PTHREADS_H #define BLIS_L3_SUP_DECOR_PTHREADS_H // Definitions specific to situations when POSIX multithreading is enabled. #ifdef BLIS_ENABLE_PTHREADS // Thread entry point prototype. void* bli_l3_sup_thread_entry( void* data_void ); #endif #endif // end bli_l3_sup_decor_pthreads.h #endif // end bli_l3_sup_decor.h // Initialization-related prototypes. void bli_thread_init( void ); void bli_thread_finalize( void ); // Thread range-related prototypes. 
// Writes a sub-range through the start and end output pointers.
BLIS_EXPORT_BLIS
void bli_thread_range_sub
     (
       thrinfo_t* thread,
       dim_t n,
       dim_t bf,
       bool handle_edge_low,
       dim_t* start,
       dim_t* end
     );

// GENPROT is (re)defined twice below; each definition stamps out one
// prototype per GENPROT( opname ) use, with the function name produced by
// PASTEMAC0( opname ).

#undef GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       dir_t direct, \
       thrinfo_t* thr, \
       obj_t* a, \
       obj_t* b, \
       obj_t* c, \
       cntl_t* cntl, \
       cntx_t* cntx, \
       dim_t* start, \
       dim_t* end \
     );

GENPROT( thread_range_mdim )
GENPROT( thread_range_ndim )

#undef GENPROT
#define GENPROT( opname ) \
\
siz_t PASTEMAC0( opname ) \
     ( \
       thrinfo_t* thr, \
       obj_t* a, \
       blksz_t* bmult, \
       dim_t* start, \
       dim_t* end \
     );

GENPROT( thread_range_l2r )
GENPROT( thread_range_r2l )
GENPROT( thread_range_t2b )
GENPROT( thread_range_b2t )

GENPROT( thread_range_weighted_l2r )
GENPROT( thread_range_weighted_r2l )
GENPROT( thread_range_weighted_t2b )
GENPROT( thread_range_weighted_b2t )

dim_t bli_thread_range_width_l
     (
       doff_t diagoff_j,
       dim_t m,
       dim_t n_j,
       dim_t j,
       dim_t n_way,
       dim_t bf,
       dim_t bf_left,
       double area_per_thr,
       bool handle_edge_low
     );

siz_t bli_find_area_trap_l
     (
       dim_t m,
       dim_t n,
       doff_t diagoff
     );

siz_t bli_thread_range_weighted_sub
     (
       thrinfo_t* restrict thread,
       doff_t diagoff,
       uplo_t uplo,
       dim_t m,
       dim_t n,
       dim_t bf,
       bool handle_edge_low,
       dim_t* restrict j_start_thr,
       dim_t* restrict j_end_thr
     );

// -----------------------------------------------------------------------------

// Factorization and partitioning prototypes

// State passed to bli_prime_factorization() / bli_next_prime_factor().
typedef struct
{
    dim_t n;
    dim_t sqrt_n;
    dim_t f;
} bli_prime_factors_t;

void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors);

dim_t bli_next_prime_factor(bli_prime_factors_t* factors);

bool bli_is_prime( dim_t n );

// The three partition_2x2 variants share one signature: a thread count, two
// work amounts, and two output pointers (nt1, nt2).
void bli_thread_partition_2x2
     (
       dim_t n_thread,
       dim_t work1,
       dim_t work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

void bli_thread_partition_2x2_slow
     (
       dim_t n_thread,
       dim_t work1,
       dim_t work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

void bli_thread_partition_2x2_fast
     (
       dim_t n_thread,
       dim_t work1,
       dim_t work2,
       dim_t* restrict nt1,
       dim_t* restrict nt2
     );

// -----------------------------------------------------------------------------

dim_t bli_gcd( dim_t x, dim_t y );
dim_t bli_lcm( dim_t x, dim_t y );
dim_t bli_ipow( dim_t base, dim_t power );

// -----------------------------------------------------------------------------

BLIS_EXPORT_BLIS dim_t bli_thread_get_jc_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_pc_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_ic_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_jr_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_ir_nt( void );
BLIS_EXPORT_BLIS dim_t bli_thread_get_num_threads( void );

BLIS_EXPORT_BLIS void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir );
BLIS_EXPORT_BLIS void bli_thread_set_num_threads( dim_t value );

void bli_thread_init_rntm_from_env( rntm_t* rntm );

// -----------------------------------------------------------------------------

// Round-robin jr/ir range: *start is this thread's work id, *inc is the
// node's n_way, and *end is n. The bf and handle_edge_low parameters are
// not used by this variant.
BLIS_INLINE void bli_thread_range_jrir_rr
     (
       thrinfo_t* thread,
       dim_t n,
       dim_t bf,
       bool handle_edge_low,
       dim_t* start,
       dim_t* end,
       dim_t* inc
     )
{
    // Use interleaved partitioning of jr/ir loops.
    *start = bli_thread_work_id( thread );
    *inc = bli_thread_n_way( thread );
    *end = n;
}

// Slab jr/ir range: *start and *end are filled in by bli_thread_range_sub()
// and *inc is set to 1.
BLIS_INLINE void bli_thread_range_jrir_sl
     (
       thrinfo_t* thread,
       dim_t n,
       dim_t bf,
       bool handle_edge_low,
       dim_t* start,
       dim_t* end,
       dim_t* inc
     )
{
    // Use contiguous slab partitioning of jr/ir loops.
    bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end );
    *inc = 1;
}

// Forwards all arguments to the slab variant when BLIS_ENABLE_JRIR_SLAB is
// defined, and to the round-robin variant otherwise.
BLIS_INLINE void bli_thread_range_jrir
     (
       thrinfo_t* thread,
       dim_t n,
       dim_t bf,
       bool handle_edge_low,
       dim_t* start,
       dim_t* end,
       dim_t* inc
     )
{
    // Define a general-purpose version of bli_thread_range_jrir() whose
    // definition depends on whether slab or round-robin partitioning was
    // requested at configure-time.
#ifdef BLIS_ENABLE_JRIR_SLAB
    bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc );
#else
    bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc );
#endif
}

// The weighted variant below is compiled out.
#if 0
BLIS_INLINE void bli_thread_range_weighted_jrir
     (
       thrinfo_t* thread,
       doff_t diagoff,
       uplo_t uplo,
       dim_t m,
       dim_t n,
       dim_t bf,
       bool handle_edge_low,
       dim_t* start,
       dim_t* end,
       dim_t* inc
     )
{
#ifdef BLIS_ENABLE_JRIR_SLAB

    // Use contiguous slab partitioning for jr/ir loops.
    bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf,
                                   handle_edge_low, start, end );

    *start = *start / bf; *inc = 1;

    if ( *end % bf ) *end = *end / bf + 1;
    else *end = *end / bf;

#else
    // Use interleaved partitioning of jr/ir loops.
    *start = bli_thread_work_id( thread );
    *inc = bli_thread_n_way( thread );
    *end = n;
#endif
}
#endif

#endif
// end bli_thread.h
// begin bli_pthread.h

#ifndef BLIS_PTHREAD_H
#define BLIS_PTHREAD_H

// -- Type and macro definitions -----------------------------------------------

#if defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of "dummy" code that doesn't depend on POSIX threads or any other
// threading mechanism. See issue #454 to see the use case that prompted this
// feature.
// NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE!

// -- pthread types --
// Every type in this branch is a plain int placeholder.

typedef int bli_pthread_t;
typedef int bli_pthread_attr_t;
typedef int bli_pthread_mutex_t;
typedef int bli_pthread_mutexattr_t;
typedef int bli_pthread_cond_t;
typedef int bli_pthread_condattr_t;
typedef int bli_pthread_once_t;
typedef int bli_pthread_barrier_t;
typedef int bli_pthread_barrierattr_t;

// -- pthreads macros --

#define BLIS_PTHREAD_MUTEX_INITIALIZER 0
#define BLIS_PTHREAD_COND_INITIALIZER 0
#define BLIS_PTHREAD_ONCE_INIT 0

#elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM)

// This branch defines a pthread-like API, bli_pthread_*(), and implements it
// in terms of Windows API calls.
// -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. // -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. 
typedef pthread_barrier_t bli_pthread_barrier_t;
typedef pthread_barrierattr_t bli_pthread_barrierattr_t;

#endif

// -- pthreads macros --

#define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER
#define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT

#endif

// -- Function definitions -----------------------------------------------------
// The prototypes below are declared once, outside the platform conditional
// above, so they use whichever bli_pthread_* types that conditional selected.
// Their parameter lists parallel the pthread_*() functions named in each
// section heading.

// -- pthread_create(), pthread_join() --

BLIS_EXPORT_BLIS int bli_pthread_create
     (
       bli_pthread_t* thread,
       const bli_pthread_attr_t* attr,
       void* (*start_routine)(void*),
       void* arg
     );

BLIS_EXPORT_BLIS int bli_pthread_join
     (
       bli_pthread_t thread,
       void** retval
     );

// -- pthread_mutex_*() --

BLIS_EXPORT_BLIS int bli_pthread_mutex_init
     (
       bli_pthread_mutex_t* mutex,
       const bli_pthread_mutexattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_lock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock
     (
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock
     (
       bli_pthread_mutex_t* mutex
     );

// -- pthread_cond_*() --

BLIS_EXPORT_BLIS int bli_pthread_cond_init
     (
       bli_pthread_cond_t* cond,
       const bli_pthread_condattr_t* attr
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_destroy
     (
       bli_pthread_cond_t* cond
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_wait
     (
       bli_pthread_cond_t* cond,
       bli_pthread_mutex_t* mutex
     );

BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast
     (
       bli_pthread_cond_t* cond
     );

// -- pthread_once() --

BLIS_EXPORT_BLIS void bli_pthread_once
     (
       bli_pthread_once_t* once,
       void (*init)(void)
     );

#if 0
// NOTE: This part of the API is disabled because (1) we don't actually need
// _self() or _equal() yet, and (2) when we do try to include these functions,
// AppVeyor for some reason fails on all the Windows/clang builds with the
// error:
//    libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol
//     __imp_CompareObjectHandles referenced in function bli_pthread_equal

// -- pthread_self() --

BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self
     (
       void
     );

// -- pthread_equal() --

BLIS_EXPORT_BLIS int bli_pthread_equal
     (
       bli_pthread_t t1,
       bli_pthread_t t2
     );
#endif

// -- pthread_barrier_*() --

BLIS_EXPORT_BLIS int bli_pthread_barrier_init
     (
       bli_pthread_barrier_t* barrier,
       const bli_pthread_barrierattr_t* attr,
       unsigned int count
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy
     (
       bli_pthread_barrier_t* barrier
     );

BLIS_EXPORT_BLIS int bli_pthread_barrier_wait
     (
       bli_pthread_barrier_t* barrier
     );

#endif // BLIS_PTHREAD_H
// end bli_pthread.h

// -- Constant definitions --

// begin bli_extern_defs.h

#ifndef BLIS_EXTERN_DEFS_H
#define BLIS_EXTERN_DEFS_H

// Exported global objects. Only declared here; the ONE_HALF and
// MINUS_ONE_HALF declarations are commented out.
BLIS_EXPORT_BLIS extern obj_t BLIS_TWO;
BLIS_EXPORT_BLIS extern obj_t BLIS_ONE;
//BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF;
BLIS_EXPORT_BLIS extern obj_t BLIS_ZERO;
//BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF;
BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE;
BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO;

BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM;
BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED;
BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED;

#endif
// end bli_extern_defs.h

// -- BLIS architecture/kernel definitions --

// begin bli_l1v_ker_prot.h

//
// Define template prototypes for level-1v kernels.
//

// Each *_KER_PROT( ctype, ch, opname ) macro below expands to one function
// prototype: a void function named PASTEMAC(ch,opname) whose vector and
// scalar operands are pointers to ctype.
//
// Every definition ends at its closing ");". A trailing line-continuation
// backslash after the last line would splice the following source line into
// the macro, so none is used.

#define ADDV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define AMAXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       dim_t* restrict index, \
       cntx_t* restrict cntx \
     );

#define AXPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define AXPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define COPYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define DOTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       cntx_t* restrict cntx \
     );

#define DOTXV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict beta, \
       ctype* restrict rho, \
       cntx_t* restrict cntx \
     );

#define INVERTV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       cntx_t* restrict cntx \
     );

#define SCALV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t conjalpha, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       cntx_t* restrict cntx \
     );

#define SCAL2V_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define SETV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t conjalpha, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       cntx_t* restrict cntx \
     );

#define SUBV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define SWAPV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

#define XPBYV_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     );

// end bli_l1v_ker_prot.h
// begin bli_l1f_ker_prot.h

//
// Define template prototypes for level-1f kernels.
// #define AXPY2V_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alphax, \ ctype* restrict alphay, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define AXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); #define DOTAXPYV_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXAXPYF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ); #define DOTXF_KER_PROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ); // end bli_l1f_ker_prot.h // begin bli_l1m_ker_prot.h // // Define template prototypes for level-1m kernels. 
//

// Each macro below takes ( ctype, ch, varname ) and expands to one void
// function prototype named PASTEMAC(ch,varname).

// native packm kernels

#define PACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t cdim, \
       dim_t n, \
       dim_t n_max, \
       ctype* restrict kappa, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict p, inc_t ldp, \
       cntx_t* restrict cntx \
     );

// native unpackm kernels

#define UNPACKM_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       dim_t n, \
       ctype* restrict kappa, \
       ctype* restrict p, inc_t ldp, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       cntx_t* restrict cntx \
     );

// 1e/1r packm kernels
// The parameter list is identical to PACKM_KER_PROT above.

#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t cdim, \
       dim_t n, \
       dim_t n_max, \
       ctype* restrict kappa, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict p, inc_t ldp, \
       cntx_t* restrict cntx \
     );

// end bli_l1m_ker_prot.h
// begin bli_l3_ukr_prot.h

//
// Define template prototypes for level-3 micro-kernels.
//

// GEMM_UKR_PROT is the single-type form: it forwards to GEMM_UKR_PROT2 with
// ctype used for both the input and the output type.
#define GEMM_UKR_PROT( ctype, ch, opname ) GEMM_UKR_PROT2(ctype, ctype, ch, opname)

// In GEMM_UKR_PROT2, a and b use ctype_in while alpha, beta and c use
// ctype_out.
#define GEMM_UKR_PROT2( ctype_in, ctype_out, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype_out* restrict alpha, \
       ctype_in* restrict a, \
       ctype_in* restrict b, \
       ctype_out* restrict beta, \
       ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

#define GEMMTRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a1x, \
       ctype* restrict a11, \
       ctype* restrict bx1, \
       ctype* restrict b11, \
       ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

#define TRSM_UKR_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

// end bli_l3_ukr_prot.h
// begin bli_l3_sup_ker_prot.h

//
// Define template prototypes for level-3 kernels on small/unpacked matrices.
//

// Unlike the micro-kernel templates above, a and b each carry their own row
// and column strides here.
#define GEMMSUP_KER_PROT( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
       conj_t conja, \
       conj_t conjb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t rs_a, inc_t cs_a, \
       ctype* restrict b, inc_t rs_b, inc_t cs_b, \
       ctype* restrict beta, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

// end bli_l3_sup_ker_prot.h

// begin bli_arch_config_pre.h

#ifndef BLIS_ARCH_CONFIG_PRE_H
#define BLIS_ARCH_CONFIG_PRE_H

// -- Naming-related kernel definitions ----------------------------------------

// The default suffix appended to reference kernels.
#define BLIS_REF_SUFFIX _ref

// A suffix used for labeling certain induced method aware functions.
#define BLIS_IND_SUFFIX _ind

// Add an underscore to the BLIS kernel set string, if it was defined.
#ifdef BLIS_CNAME
#define BLIS_CNAME_INFIX PASTECH(_,BLIS_CNAME)
#endif

// Combine the CNAME and _ref for convenience to the code that defines
// reference kernels.
//#define BLIS_CNAME_REF_SUFFIX PASTECH2(_,BLIS_CNAME,BLIS_REF_SUFFIX)

// -- Prototype-generating macro definitions -----------------------------------

// Prototype-generating macro for bli_cntx_init_*() functions.
// Expands to three prototypes per archname: the plain initializer, a variant
// whose name carries BLIS_REF_SUFFIX, and a variant whose name carries
// BLIS_IND_SUFFIX and takes an additional ind_t method argument.
#define CNTX_INIT_PROTS( archname ) \
\
void PASTEMAC(cntx_init_,archname) \
     ( \
       cntx_t* cntx \
     ); \
void PASTEMAC2(cntx_init_,archname,BLIS_REF_SUFFIX) \
     ( \
       cntx_t* cntx \
     ); \
void PASTEMAC2(cntx_init_,archname,BLIS_IND_SUFFIX) \
     ( \
       ind_t method, \
       cntx_t* cntx \
     );

#endif
// end bli_arch_config_pre.h
// begin bli_arch_config.h

#ifndef BLIS_ARCH_CONFIG_H
#define BLIS_ARCH_CONFIG_H

//
// -- Context initialization prototypes ----------------------------------------
//
// One CNTX_INIT_PROTS() expansion is emitted for each BLIS_CONFIG_* macro
// that is defined.

// -- Intel64 architectures --
#ifdef BLIS_CONFIG_SKX
CNTX_INIT_PROTS( skx )
#endif
#ifdef BLIS_CONFIG_KNL
CNTX_INIT_PROTS( knl )
#endif
#ifdef BLIS_CONFIG_KNC
CNTX_INIT_PROTS( knc )
#endif
#ifdef BLIS_CONFIG_HASWELL
CNTX_INIT_PROTS( haswell )
#endif
#ifdef BLIS_CONFIG_SANDYBRIDGE
CNTX_INIT_PROTS( sandybridge )
#endif
#ifdef BLIS_CONFIG_PENRYN
CNTX_INIT_PROTS( penryn )
#endif

// -- AMD64 architectures --
#ifdef BLIS_CONFIG_ZEN3
CNTX_INIT_PROTS( zen3 )
#endif
#ifdef BLIS_CONFIG_ZEN2
CNTX_INIT_PROTS( zen2 )
#endif
#ifdef BLIS_CONFIG_ZEN
CNTX_INIT_PROTS( zen )
#endif
#ifdef BLIS_CONFIG_EXCAVATOR
CNTX_INIT_PROTS( excavator )
#endif
#ifdef BLIS_CONFIG_STEAMROLLER
CNTX_INIT_PROTS( steamroller )
#endif
#ifdef BLIS_CONFIG_PILEDRIVER
CNTX_INIT_PROTS( piledriver )
#endif
#ifdef BLIS_CONFIG_BULLDOZER
CNTX_INIT_PROTS( bulldozer )
#endif

// -- ARM architectures --
#ifdef BLIS_CONFIG_ARMSVE
CNTX_INIT_PROTS( armsve )
#endif
#ifdef BLIS_CONFIG_A64FX
CNTX_INIT_PROTS( a64fx )
#endif
#ifdef BLIS_CONFIG_FIRESTORM
CNTX_INIT_PROTS( firestorm )
#endif
#ifdef BLIS_CONFIG_THUNDERX2
CNTX_INIT_PROTS( thunderx2 )
#endif
#ifdef BLIS_CONFIG_CORTEXA57
CNTX_INIT_PROTS( cortexa57 )
#endif
#ifdef BLIS_CONFIG_CORTEXA53
CNTX_INIT_PROTS( cortexa53 )
#endif
#ifdef BLIS_CONFIG_CORTEXA15
CNTX_INIT_PROTS( cortexa15 )
#endif
#ifdef BLIS_CONFIG_CORTEXA9
CNTX_INIT_PROTS( cortexa9 )
#endif

// -- IBM Power --
#ifdef BLIS_CONFIG_POWER10
CNTX_INIT_PROTS( power10 )
#endif
#ifdef BLIS_CONFIG_POWER9
CNTX_INIT_PROTS( power9 )
#endif
#ifdef BLIS_CONFIG_POWER7
CNTX_INIT_PROTS( power7 )
#endif

// -- IBM BG/Q --
#ifdef BLIS_CONFIG_BGQ
CNTX_INIT_PROTS( bgq )
#endif

// -- Generic --
#ifdef BLIS_CONFIG_GENERIC
CNTX_INIT_PROTS( generic )
#endif

//
// -- Architecture family-specific headers -------------------------------------
//
// Includes marked "// skipped" are left as #include directives; the family
// headers bracketed by "begin"/"end" comments appear inline instead.

// -- x86_64 families --

#ifdef BLIS_FAMILY_INTEL64
#include "bli_family_intel64.h" // skipped
#endif
#ifdef BLIS_FAMILY_AMD64
#include "bli_family_amd64.h" // skipped
#endif
#ifdef BLIS_FAMILY_AMD64_LEGACY
#include "bli_family_amd64_legacy.h" // skipped
#endif
#ifdef BLIS_FAMILY_X86_64
// begin bli_family_x86_64.h

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H

//#endif

// end bli_family_x86_64.h
#endif
#ifdef BLIS_FAMILY_X86_64_NO_SKX
#include "bli_family_x86_64_no_skx.h" // skipped
#endif
#ifdef BLIS_FAMILY_X86_64_NO_ZEN2
#include "bli_family_x86_64_no_zen2.h" // skipped
#endif
#ifdef BLIS_FAMILY_X86_64_NO_ZEN3
#include "bli_family_x86_64_no_zen3.h" // skipped
#endif

// -- Intel64 architectures --

#ifdef BLIS_FAMILY_SKX
// begin bli_family_skx.h

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H

// -- THREADING PARAMETERS -----------------------------------------------------

#define BLIS_THREAD_RATIO_M 3
#define BLIS_THREAD_RATIO_N 2

#define BLIS_THREAD_MAX_IR 1
#define BLIS_THREAD_MAX_JR 4

// -- MEMORY ALLOCATION --------------------------------------------------------

#define BLIS_SIMD_ALIGN_SIZE 64

#define BLIS_SIMD_MAX_SIZE 64
#define BLIS_SIMD_MAX_NUM_REGISTERS 32

//#include
//#define BLIS_MALLOC_POOL malloc
//#define BLIS_FREE_POOL free

// Everything from here to the matching #endif is compiled out.
#if 0
// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------

// -- Cache and register blocksizes --

//
// Constraints:
//
// (1) MC must be a multiple of:
//     (a) MR (for zero-padding purposes)
//     (b) NR (for zero-padding purposes when MR and NR are "swapped")
// (2) NC must be a multiple of
//     (a) NR (for zero-padding purposes)
//     (b) MR (for zero-padding purposes when MR and NR are "swapped")
//

#define BLIS_DGEMM_UKERNEL bli_dgemm_opt_16x12_l2
#define BLIS_DEFAULT_MC_D 144
#define BLIS_DEFAULT_KC_D 336
#define BLIS_DEFAULT_NC_D 5760
#define BLIS_DEFAULT_MR_D 16
#define BLIS_DEFAULT_NR_D 12
#define BLIS_PACKDIM_MR_D 16
#define BLIS_PACKDIM_NR_D 12

// NOTE: If the micro-kernel, which is typically unrolled to a factor
// of f, handles leftover edge cases (ie: when k % f > 0) then these
// register blocksizes in the k dimension can be defined to 1.

//#define BLIS_DEFAULT_KR_S 1
//#define BLIS_DEFAULT_KR_D 1
//#define BLIS_DEFAULT_KR_C 1
//#define BLIS_DEFAULT_KR_Z 1

// -- Maximum cache blocksizes (for optimizing edge cases) --

// NOTE: These cache blocksize "extensions" have the same constraints as
// the corresponding default blocksizes above. When these values are
// larger than the default blocksizes, blocksizes used at edge cases are
// enlarged if such an extension would encompass the remaining portion of
// the matrix dimension.

#define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4)
#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4)
#define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0)

#define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4)
#define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4)
#define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0)

//#define BLIS_MAXIMUM_MC_C (BLIS_DEFAULT_MC_C + BLIS_DEFAULT_MC_C/4)
//#define BLIS_MAXIMUM_KC_C (BLIS_DEFAULT_KC_C + BLIS_DEFAULT_KC_C/4)
//#define BLIS_MAXIMUM_NC_C (BLIS_DEFAULT_NC_C + BLIS_DEFAULT_NC_C/4)

//#define BLIS_MAXIMUM_MC_Z (BLIS_DEFAULT_MC_Z + BLIS_DEFAULT_MC_Z/4)
//#define BLIS_MAXIMUM_KC_Z (BLIS_DEFAULT_KC_Z + BLIS_DEFAULT_KC_Z/4)
//#define BLIS_MAXIMUM_NC_Z (BLIS_DEFAULT_NC_Z + BLIS_DEFAULT_NC_Z/4)

#endif

//#endif
// end bli_family_skx.h
#endif
#ifdef BLIS_FAMILY_KNL
// begin bli_family_knl.h

//#ifndef BLIS_FAMILY_H
//#define BLIS_FAMILY_H

// -- THREADING PARAMETERS -----------------------------------------------------

#define BLIS_THREAD_RATIO_M 4
#define BLIS_THREAD_RATIO_N 1 #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 // -- MEMORY ALLOCATION -------------------------------------------------------- //#define BLIS_TREE_BARRIER //#define BLIS_TREE_BARRIER_ARITY 4 #define BLIS_SIMD_ALIGN_SIZE 64 #define BLIS_SIMD_MAX_SIZE 64 #define BLIS_SIMD_MAX_NUM_REGISTERS 32 //#define BLIS_MALLOC_INTL hbw_malloc //#define BLIS_FREE_INTL hbw_free #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_SGEMM_UKERNEL bli_sgemm_opt_30x16_knc #define BLIS_DEFAULT_MC_S 240 #define BLIS_DEFAULT_KC_S 240 #define BLIS_DEFAULT_NC_S 14400 #define BLIS_DEFAULT_MR_S 30 #define BLIS_DEFAULT_NR_S 16 #define BLIS_PACKDIM_MR_S 32 #define BLIS_PACKDIM_NR_S 16 #if 0 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8_knc #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 240 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DEFAULT_MR_D 30 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 32 #define BLIS_PACKDIM_NR_D 8 #elif 0 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_30x8 #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 240 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DEFAULT_MR_D 30 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 32 #define BLIS_PACKDIM_NR_D 8 #define BLIS_DPACKM_8XK_KERNEL bli_dpackm_8xk_opt #define BLIS_DPACKM_30XK_KERNEL bli_dpackm_30xk_opt #else #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #define BLIS_DGEMM_UKERNEL bli_dgemm_opt_24x8 #define BLIS_DEFAULT_MR_D 24 #define BLIS_DEFAULT_NR_D 8 #define BLIS_PACKDIM_MR_D 24 #define BLIS_PACKDIM_NR_D 8 #define BLIS_DEFAULT_MC_D 120 #define BLIS_DEFAULT_KC_D 336 #define BLIS_DEFAULT_NC_D 14400 #define BLIS_DPACKM_8XK_KERNEL bli_dpackm_8xk_opt #define BLIS_DPACKM_24XK_KERNEL bli_dpackm_24xk_opt #endif #define BLIS_MAXIMUM_MC_S (BLIS_DEFAULT_MC_S + BLIS_DEFAULT_MC_S/4) 
#define BLIS_MAXIMUM_KC_S (BLIS_DEFAULT_KC_S + BLIS_DEFAULT_KC_S/4) #define BLIS_MAXIMUM_NC_S (BLIS_DEFAULT_NC_S + 0) #define BLIS_MAXIMUM_MC_D (BLIS_DEFAULT_MC_D + BLIS_DEFAULT_MC_D/4) #define BLIS_MAXIMUM_KC_D (BLIS_DEFAULT_KC_D + BLIS_DEFAULT_KC_D/4) #define BLIS_MAXIMUM_NC_D (BLIS_DEFAULT_NC_D + 0) #endif //#endif // end bli_family_knl.h #endif #ifdef BLIS_FAMILY_KNC #include "bli_family_knc.h" // skipped #endif #ifdef BLIS_FAMILY_HASWELL // begin bli_family_haswell.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- // -- sgemm micro-kernel -- #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_4x24 #define BLIS_DEFAULT_MC_S 256 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 4 #define BLIS_DEFAULT_NR_S 24 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_6x16 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 6 #define BLIS_DEFAULT_NR_S 16 #define BLIS_SGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x6 #define BLIS_DEFAULT_MC_S 144 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 4080 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 6 #endif // -- dgemm micro-kernel -- #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x12 #define BLIS_DEFAULT_MC_D 152 #define BLIS_DEFAULT_KC_D 160 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 12 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 1 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_6x8 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 6 #define BLIS_DEFAULT_NR_D 8 #define BLIS_DGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x6 #define BLIS_DEFAULT_MC_D 72 #define BLIS_DEFAULT_KC_D 256 
#define BLIS_DEFAULT_NC_D 4080 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 6 #endif // -- cgemm micro-kernel -- #if 1 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_3x8 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 3 #define BLIS_DEFAULT_NR_C 8 #define BLIS_CGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x3 #define BLIS_DEFAULT_MC_C 144 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4080 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 3 #endif // -- zgemm micro-kernel -- #if 1 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_3x4 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 3 #define BLIS_DEFAULT_NR_Z 4 #define BLIS_ZGEMM_UKERNEL_PREFERS_CONTIG_ROWS #endif #if 0 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x3 #define BLIS_DEFAULT_MC_Z 72 #define BLIS_DEFAULT_KC_Z 256 #define BLIS_DEFAULT_NC_Z 4080 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 3 #endif #endif //#endif // end bli_family_haswell.h #endif #ifdef BLIS_FAMILY_SANDYBRIDGE // begin bli_family_sandybridge.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS AND DEFINITIONS --------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x4 #define BLIS_DEFAULT_MC_D 96 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 4 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define 
BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_sandybridge.h #endif #ifdef BLIS_FAMILY_PENRYN // begin bli_family_penryn.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x4 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MC_S 768 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x4 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 4 #define BLIS_DEFAULT_MC_D 384 #define BLIS_DEFAULT_KC_D 384 #define BLIS_DEFAULT_NC_D 4096 #define BLIS_DGEMMTRSM_L_UKERNEL bli_dgemmtrsm_l_asm_4x4 #define BLIS_DGEMMTRSM_U_UKERNEL bli_dgemmtrsm_u_asm_4x4 // -- LEVEL-1F KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPY2V_KERNEL bli_daxpy2v_int_var1 #define BLIS_DDOTAXPYV_KERNEL bli_ddotaxpyv_int_var1 #define BLIS_DAXPYF_KERNEL bli_daxpyf_int_var1 #define BLIS_DDOTXF_KERNEL bli_ddotxf_int_var1 #define BLIS_DDOTXAXPYF_KERNEL bli_ddotxaxpyf_int_var1 // -- LEVEL-1V KERNEL DEFINITIONS ---------------------------------------------- #define BLIS_DAXPYV_KERNEL bli_daxpyv_opt_var1 #define BLIS_DDOTV_KERNEL bli_ddotv_opt_var1 #endif //#endif // end bli_family_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_FAMILY_ZEN3 // begin bli_family_zen3.h #ifndef BLI_FAMILY_ZEN3_ #define BLI_FAMILY_ZEN3_ // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops // to not be parallelized.
// #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 // To enable framework optimizations for zen3 platform // All zen3 specific code should be included in this macro #define BLIS_CONFIG_ZEN3 // To enable framework optimizations for zen3 platform // All zen3 specific code should be included in this macro #define BLIS_CONFIG_ZEN3 #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. #define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 //128(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 #define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 #define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 #define BLIS_ENABLE_SMALL_MATRIX_ROME #define BLIS_SMALL_MATRIX_THRES_ROME 400 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50 #endif // end bli_family_zen3.h #endif #ifdef 
BLIS_FAMILY_ZEN2 // begin bli_family_zen2.h // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops // to not be parallelized. #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 // Vanilla BLIS disables AMD's small matrix handling by default. #if 0 #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. #define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 // 128*(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 #define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 #define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 #define BLIS_ENABLE_SMALL_MATRIX_ROME #define BLIS_SMALL_MATRIX_THRES_ROME 400 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME 80 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M 40 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M 1000 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME 150 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M 5 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N 130 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M 10 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N 1200 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N 280 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N 100 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME 110 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N 30 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME 120 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME_COL_PANEL_N 50 // When running HPL with pure MPI without DGEMM threading
(Single-threaded // BLIS), defining this macro as 1 yields better performance. #define AOCL_BLIS_MULTIINSTANCE 0 #endif // end bli_family_zen2.h #endif #ifdef BLIS_FAMILY_ZEN // begin bli_family_zen.h // By default, it is effective to parallelize the outer loops. // Setting these macros to 1 will force JR and IR inner loops // to not be parallelized. #define BLIS_THREAD_MAX_IR 1 #define BLIS_THREAD_MAX_JR 1 #define BLIS_ENABLE_ZEN_BLOCK_SIZES // Vanilla BLIS disables AMD's small matrix handling by default. #if 0 #define BLIS_ENABLE_SMALL_MATRIX #define BLIS_ENABLE_SMALL_MATRIX_TRSM // This will select the threshold below which small matrix code will be called. #define BLIS_SMALL_MATRIX_THRES 700 #define BLIS_SMALL_M_RECT_MATRIX_THRES 160 #define BLIS_SMALL_K_RECT_MATRIX_THRES 128 #define BLIS_SMALL_MATRIX_THRES_TRSM 32768 // 128*(128+128) => m*(m+n) #define BLIS_SMALL_MATRIX_A_THRES_TRSM 128 #define BLIS_SMALL_MATRIX_A_THRES_M_GEMMT 96 #define BLIS_SMALL_MATRIX_A_THRES_N_GEMMT 128 // This macro enables BLIS DGEMM to choose block sizes for single-instance mode #define BLIS_ENABLE_SINGLE_INSTANCE_BLOCK_SIZES 0 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES 250 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_NAPLES 90 #define D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO 22 #endif #if 0 // Allow the sup implementation to combine some small edge case iterations in // the 2nd loop of the panel-block algorithm (MR) and/or the 2nd loop of the // block-panel algorithm (NR) with the last full iteration that precedes it. // NOTE: These cpp macros need to be explicitly set to an integer since they // are used at compile-time to create unconditional branches or dead code // regions.
#define BLIS_ENABLE_SUP_MR_EXT 1 #define BLIS_ENABLE_SUP_NR_EXT 0 #endif // end bli_family_zen.h #endif #ifdef BLIS_FAMILY_EXCAVATOR // begin bli_family_excavator.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MR_S 16 #define BLIS_DEFAULT_NR_S 3 #define BLIS_DEFAULT_MC_S 528 #define BLIS_DEFAULT_KC_S 256 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_DEFAULT_MC_D 264 #define BLIS_DEFAULT_KC_D 256 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_DEFAULT_MC_C 264 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #define BLIS_DEFAULT_MC_Z 100 #define BLIS_DEFAULT_KC_Z 320 #define BLIS_DEFAULT_NC_Z 8400 #endif //#endif // end bli_family_excavator.h #endif #ifdef BLIS_FAMILY_STEAMROLLER // begin bli_family_steamroller.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 //#endif // end bli_family_steamroller.h #endif #ifdef BLIS_FAMILY_PILEDRIVER // begin bli_family_piledriver.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H // -- MEMORY ALLOCATION -------------------------------------------------------- #define BLIS_SIMD_ALIGN_SIZE 16 #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_16x3 #define BLIS_DEFAULT_MC_S 2016 #define BLIS_DEFAULT_KC_S 128 #define BLIS_DEFAULT_NC_S 8400 #define BLIS_DEFAULT_MR_S 16 #define 
BLIS_DEFAULT_NR_S 3 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_8x3 #define BLIS_DEFAULT_MC_D 1008 #define BLIS_DEFAULT_KC_D 128 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 3 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_4x2 #define BLIS_DEFAULT_MC_C 512 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 8400 #define BLIS_DEFAULT_MR_C 4 #define BLIS_DEFAULT_NR_C 2 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_2x2 #define BLIS_DEFAULT_MC_Z 400 #define BLIS_DEFAULT_KC_Z 160 #define BLIS_DEFAULT_NC_Z 8400 #define BLIS_DEFAULT_MR_Z 2 #define BLIS_DEFAULT_NR_Z 2 #endif //#endif // end bli_family_piledriver.h #endif #ifdef BLIS_FAMILY_BULLDOZER // begin bli_family_bulldozer.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H #if 0 // -- LEVEL-3 MICRO-KERNEL CONSTANTS ------------------------------------------- #define BLIS_SGEMM_UKERNEL bli_sgemm_asm_8x8_fma4 #define BLIS_DEFAULT_MC_S 128 #define BLIS_DEFAULT_KC_S 384 #define BLIS_DEFAULT_NC_S 4096 #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 8 #define BLIS_DGEMM_UKERNEL bli_dgemm_asm_4x6_fma4 #define BLIS_DEFAULT_MC_D 1080 #define BLIS_DEFAULT_KC_D 120 #define BLIS_DEFAULT_NC_D 8400 #define BLIS_DEFAULT_MR_D 4 #define BLIS_DEFAULT_NR_D 6 #define BLIS_CGEMM_UKERNEL bli_cgemm_asm_8x4_fma4 #define BLIS_DEFAULT_MC_C 96 #define BLIS_DEFAULT_KC_C 256 #define BLIS_DEFAULT_NC_C 4096 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_ZGEMM_UKERNEL bli_zgemm_asm_4x4_fma4 #define BLIS_DEFAULT_MC_Z 64 #define BLIS_DEFAULT_KC_Z 192 #define BLIS_DEFAULT_NC_Z 4096 #define BLIS_DEFAULT_MR_Z 4 #define BLIS_DEFAULT_NR_Z 4 #endif //#endif // end bli_family_bulldozer.h #endif // -- ARM families -- #ifdef BLIS_FAMILY_ARM64 #include "bli_family_arm64.h" // skipped #endif #ifdef BLIS_FAMILY_ARM32 #include "bli_family_arm32.h" // skipped #endif // -- ARM architectures -- #ifdef BLIS_FAMILY_ARMSVE #include "bli_family_armsve.h" // skipped #endif #ifdef BLIS_FAMILY_A64FX #include 
"bli_family_a64fx.h" // skipped #endif #ifdef BLIS_FAMILY_FIRESTORM #include "bli_family_firestorm.h" // skipped #endif #ifdef BLIS_FAMILY_THUNDERX2 #include "bli_family_thunderx2.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA57 #include "bli_family_cortexa57.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA53 #include "bli_family_cortexa53.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA15 #include "bli_family_cortexa15.h" // skipped #endif #ifdef BLIS_FAMILY_CORTEXA9 #include "bli_family_cortexa9.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_FAMILY_POWER10 #include "bli_family_power10.h" // skipped #endif #ifdef BLIS_FAMILY_POWER9 #include "bli_family_power9.h" // skipped #endif #ifdef BLIS_FAMILY_POWER7 #include "bli_family_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef BLIS_FAMILY_BGQ #include "bli_family_bgq.h" // skipped #endif // -- Generic -- #ifdef BLIS_FAMILY_GENERIC // begin bli_family_generic.h //#ifndef BLIS_FAMILY_H //#define BLIS_FAMILY_H //#endif // end bli_family_generic.h #endif // // -- kernel set prototypes ---------------------------------------------------- // // -- Intel64 architectures -- #ifdef BLIS_KERNELS_SKX // begin bli_kernels_skx.h GEMM_UKR_PROT( float , s, gemm_skx_asm_32x12_l2 ) GEMM_UKR_PROT( float , s, gemm_skx_asm_12x32_l2 ) GEMM_UKR_PROT( double, d, gemm_skx_asm_16x12_l2 ) GEMM_UKR_PROT( double, d, gemm_skx_asm_16x14 ) // end bli_kernels_skx.h #endif #ifdef BLIS_KERNELS_KNL // begin bli_kernels_knl.h GEMM_UKR_PROT( float, s, gemm_knl_asm_24x16 ) GEMM_UKR_PROT( double, d, gemm_knl_asm_24x8 ) PACKM_KER_PROT( float, s, packm_knl_asm_24xk ) PACKM_KER_PROT( float, s, packm_knl_asm_16xk ) PACKM_KER_PROT( double, d, packm_knl_asm_24xk ) PACKM_KER_PROT( double, d, packm_knl_asm_8xk ) // unused: GEMM_UKR_PROT( double, d, gemm_knl_asm_12x16 ) GEMM_UKR_PROT( double, d, gemm_knl_asm_30x8 ) GEMM_UKR_PROT( double, d, gemm_knl_asm_8x24 ) PACKM_KER_PROT( double, d, packm_knl_asm_30xk ) // end bli_kernels_knl.h #endif #ifdef 
BLIS_KERNELS_KNC #include "bli_kernels_knc.h" // skipped #endif #ifdef BLIS_KERNELS_HASWELL // begin bli_kernels_haswell.h // -- level-1m ----------------------------------------------------------------- // packm (asm) PACKM_KER_PROT( float, s, packm_haswell_asm_6xk ) PACKM_KER_PROT( float, s, packm_haswell_asm_16xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_6xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_8xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_3xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_8xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_3xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_4xk ) // -- level-3 ------------------------------------------------------------------ // gemm (asm d6x8) GEMM_UKR_PROT( float, s, gemm_haswell_asm_6x16 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_6x8 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_3x8 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_3x4 ) // gemm (asm d8x6) GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // gemmtrsm_l (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_haswell_asm_6x8 ) // gemmtrsm_u (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 ) // gemm (asm d8x6) //GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) //GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) //GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) //GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // -- level-3 sup -------------------------------------------------------------- // -- single real -- // gemmsup_r GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( float, s, 
gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x2 ) 
GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x1 ) 
GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16n ) // -- double real -- // gemmsup_r GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( double, d, 
gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8n ) // gemmsup_rd GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( 
double, d, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8n ) // end bli_kernels_haswell.h #endif #ifdef BLIS_KERNELS_SANDYBRIDGE // begin bli_kernels_sandybridge.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_sandybridge_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_asm_4x4 ) // d8x4 (intrinsics) GEMM_UKR_PROT( float, s, gemm_sandybridge_int_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_int_4x4 ) // end bli_kernels_sandybridge.h #endif #ifdef BLIS_KERNELS_PENRYN // begin bli_kernels_penryn.h GEMM_UKR_PROT( float, s, gemm_penryn_asm_8x4 ) GEMM_UKR_PROT( double, d, gemm_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_l_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_u_penryn_asm_4x4 ) // end bli_kernels_penryn.h #endif // -- AMD64 architectures -- #ifdef BLIS_KERNELS_ZEN2 // begin bli_kernels_zen2.h // -- level-1f -- AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) // end bli_kernels_zen2.h #endif #ifdef BLIS_KERNELS_ZEN // begin bli_kernels_zen.h // -- level-1m -- PACKM_KER_PROT(double, d, packm_8xk_gen_zen) PACKM_KER_PROT(double, d, packm_6xk_gen_zen) PACKM_KER_PROT(double, d, packm_8xk_nn_zen) PACKM_KER_PROT(double, d, 
packm_6xk_nn_zen) // -- level-1v -- // amaxv (intrinsics) AMAXV_KER_PROT( float, s, amaxv_zen_int ) AMAXV_KER_PROT( double, d, amaxv_zen_int ) // axpyv (intrinsics) AXPYV_KER_PROT( float, s, axpyv_zen_int ) AXPYV_KER_PROT( double, d, axpyv_zen_int ) // axpyv (intrinsics unrolled x10) AXPYV_KER_PROT( float, s, axpyv_zen_int10 ) AXPYV_KER_PROT( double, d, axpyv_zen_int10 ) // dotv (intrinsics) DOTV_KER_PROT( float, s, dotv_zen_int ) DOTV_KER_PROT( double, d, dotv_zen_int ) // dotv (intrinsics, unrolled x10) DOTV_KER_PROT( float, s, dotv_zen_int10 ) DOTV_KER_PROT( double, d, dotv_zen_int10 ) // dotxv (intrinsics) DOTXV_KER_PROT( float, s, dotxv_zen_int ) DOTXV_KER_PROT( double, d, dotxv_zen_int ) // scalv (intrinsics) SCALV_KER_PROT( float, s, scalv_zen_int ) SCALV_KER_PROT( double, d, scalv_zen_int ) // scalv (intrinsics unrolled x10) SCALV_KER_PROT( float, s, scalv_zen_int10 ) SCALV_KER_PROT( double, d, scalv_zen_int10 ) SCALV_KER_PROT( scomplex, c, scalv_zen_int10 ) // swapv (intrinsics) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // copyv (intrinsics) COPYV_KER_PROT( float, s, copyv_zen_int ) COPYV_KER_PROT( double, d, copyv_zen_int ) // setv (intrinsics) SETV_KER_PROT(float, s, setv_zen_int) SETV_KER_PROT(double, d, setv_zen_int) // swapv (intrinsics) -- NOTE(review): repeats the swapv_zen_int8 prototypes declared earlier in this header SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // -- level-1f -- // axpyf (intrinsics) AXPYF_KER_PROT( float, s, axpyf_zen_int_8 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_8 ) AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_16x4 ) AXPYF_KER_PROT( scomplex, c, axpyf_zen_int_4 ) // dotxf (intrinsics) DOTXF_KER_PROT( float, s, dotxf_zen_int_8 ) DOTXF_KER_PROT( double, d, dotxf_zen_int_8 ) // -- level-3 sup -------------------------------------------------------------- // gemmsup_rv //GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16 ) GEMMSUP_KER_PROT( float, s,
gemmsup_rv_zen_asm_5x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_4x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_1x1 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16n ) GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_zen_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x8m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16n) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x2 ) 
// gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x1 ) // end bli_kernels_zen.h #endif //#ifdef BLIS_KERNELS_EXCAVATOR //#include "bli_kernels_excavator.h" //#endif //#ifdef BLIS_KERNELS_STEAMROLLER //#include "bli_kernels_steamroller.h" //#endif #ifdef BLIS_KERNELS_PILEDRIVER // begin bli_kernels_piledriver.h // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_piledriver_asm_16x3 ) GEMM_UKR_PROT( double, d, gemm_piledriver_asm_8x3 ) GEMM_UKR_PROT( scomplex, c, gemm_piledriver_asm_4x2 ) GEMM_UKR_PROT( dcomplex, z, gemm_piledriver_asm_2x2 ) // end bli_kernels_piledriver.h #endif #ifdef BLIS_KERNELS_BULLDOZER // begin bli_kernels_bulldozer.h GEMM_UKR_PROT( float, s, gemm_bulldozer_asm_8x8_fma4 ) GEMM_UKR_PROT( double, d, gemm_bulldozer_asm_4x6_fma4 ) GEMM_UKR_PROT( scomplex, c, gemm_bulldozer_asm_8x4_fma4 ) GEMM_UKR_PROT( dcomplex, z, gemm_bulldozer_asm_4x4_fma4 ) // end bli_kernels_bulldozer.h #endif // -- ARM architectures -- #ifdef BLIS_KERNELS_ARMSVE #include "bli_kernels_armsve.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV8A #include "bli_kernels_armv8a.h" // skipped #endif #ifdef BLIS_KERNELS_ARMV7A #include "bli_kernels_armv7a.h" // skipped #endif // -- IBM Power -- #ifdef BLIS_KERNELS_POWER10 #include "bli_kernels_power10.h" // skipped #endif #ifdef BLIS_KERNELS_POWER9 #include "bli_kernels_power9.h" // skipped #endif #ifdef BLIS_KERNELS_POWER7 #include "bli_kernels_power7.h" // skipped #endif // -- IBM BG/Q -- #ifdef 
BLIS_KERNELS_BGQ #include "bli_kernels_bgq.h" // skipped #endif #endif // end bli_arch_config.h // begin bli_kernel_macro_defs.h #ifndef BLIS_KERNEL_MACRO_DEFS_H #define BLIS_KERNEL_MACRO_DEFS_H // -- Define default threading parameters -------------------------------------- // -- Conventional (large code path) values -- // These BLIS_THREAD_RATIO_? macros distort the amount of work in the m and n // dimensions for the purposes of factorizing the total number of threads into // ways of parallelism in the ic and jc loops. See bli_rntm.c to see how these // macros are used. #ifndef BLIS_THREAD_RATIO_M #define BLIS_THREAD_RATIO_M 1 #endif #ifndef BLIS_THREAD_RATIO_N #define BLIS_THREAD_RATIO_N 1 #endif // These BLIS_THREAD_MAX_?R macros place a ceiling on the maximum amount of // parallelism allowed when performing automatic factorization. See bli_rntm.c // to see how these macros are used. #ifndef BLIS_THREAD_MAX_IR #define BLIS_THREAD_MAX_IR 1 #endif #ifndef BLIS_THREAD_MAX_JR #define BLIS_THREAD_MAX_JR 4 #endif #if 0 // -- Skinny/small possibly-unpacked (sup code path) values -- #ifndef BLIS_THREAD_SUP_RATIO_M #define BLIS_THREAD_SUP_RATIO_M 1 #endif #ifndef BLIS_THREAD_SUP_RATIO_N #define BLIS_THREAD_SUP_RATIO_N 2 #endif #ifndef BLIS_THREAD_SUP_MAX_IR #define BLIS_THREAD_SUP_MAX_IR 1 #endif #ifndef BLIS_THREAD_SUP_MAX_JR #define BLIS_THREAD_SUP_MAX_JR 8 #endif #endif // -- Memory allocation -------------------------------------------------------- // hbwmalloc.h provides hbw_malloc() and hbw_free() on systems with // libmemkind. But disable use of libmemkind if BLIS_DISABLE_MEMKIND // was explicitly defined. #ifdef BLIS_DISABLE_MEMKIND #undef BLIS_ENABLE_MEMKIND #endif #ifdef BLIS_ENABLE_MEMKIND #include // skipped #endif // Memory allocation functions. These macros define the three types of // malloc()-style functions, and their free() counterparts: one for each // type of memory to be allocated. 
// NOTE: ANY ALTERNATIVE TO malloc()/free() USED FOR ANY OF THE FOLLOWING // THREE PAIRS OF MACROS MUST USE THE SAME FUNCTION PROTOTYPE AS malloc() // and free(): // // void* malloc( size_t size ); // void free( void* p ); // // This allocation function is called to allocate memory for blocks within // BLIS's internal memory pools. #ifndef BLIS_MALLOC_POOL // If use of libmemkind was enabled at configure-time, the default // memory allocation function for memory pools should be hbw_malloc() // instead of malloc(). #ifdef BLIS_ENABLE_MEMKIND #define BLIS_MALLOC_POOL hbw_malloc #else #define BLIS_MALLOC_POOL malloc #endif #endif #ifndef BLIS_FREE_POOL // If use of libmemkind was enabled at configure-time, the default // memory deallocation function for memory pools should be hbw_free() // instead of free(). #ifdef BLIS_ENABLE_MEMKIND #define BLIS_FREE_POOL hbw_free #else #define BLIS_FREE_POOL free #endif #endif // This allocation function is called to allocate memory for internally- // used objects and structures, such as control tree nodes. #ifndef BLIS_MALLOC_INTL #define BLIS_MALLOC_INTL malloc #endif #ifndef BLIS_FREE_INTL #define BLIS_FREE_INTL free #endif // This allocation function is called to allocate memory for objects // created by user-level API functions, such as bli_obj_create(). #ifndef BLIS_MALLOC_USER #define BLIS_MALLOC_USER malloc #endif #ifndef BLIS_FREE_USER #define BLIS_FREE_USER free #endif // -- Other system-related definitions ----------------------------------------- // Size of a virtual memory page. This is used to align blocks within the // memory pools. #ifndef BLIS_PAGE_SIZE #define BLIS_PAGE_SIZE 4096 #endif // The maximum number of named SIMD vector registers available for use. // When configuring with umbrella configuration families, this should be // set to the maximum number of registers across all sub-configurations in // the family. 
#ifndef BLIS_SIMD_MAX_NUM_REGISTERS
#define BLIS_SIMD_MAX_NUM_REGISTERS 32
#endif

// The maximum size (in bytes) of each SIMD vector.
// When configuring with umbrella configuration families, this should be
// set to the maximum SIMD size across all sub-configurations in the family.
#ifndef BLIS_SIMD_MAX_SIZE
#define BLIS_SIMD_MAX_SIZE 64
#endif

// Alignment size (in bytes) needed by the instruction set for aligned
// SIMD/vector instructions.
#ifndef BLIS_SIMD_ALIGN_SIZE
#define BLIS_SIMD_ALIGN_SIZE BLIS_SIMD_MAX_SIZE
#endif

// The maximum size in bytes of local stack buffers within macro-kernel
// functions. These buffers are usually used to store a temporary copy
// of a single microtile. The reason we multiply by 2 is to handle induced
// methods, where we use real domain register blocksizes in units of
// complex elements. Specifically, the macro-kernels will need this larger
// micro-tile footprint, even though the virtual micro-kernels will only
// ever be writing to half (real or imaginary part) at a time.
// With the defaults defined above (32 registers of 64 bytes each), this
// evaluates to 32 * 64 * 2 = 4096 bytes.
#ifndef BLIS_STACK_BUF_MAX_SIZE
#define BLIS_STACK_BUF_MAX_SIZE ( BLIS_SIMD_MAX_NUM_REGISTERS * \
                                  BLIS_SIMD_MAX_SIZE * 2 )
#endif

// Alignment size used to align local stack buffers within macro-kernel
// functions.
#ifndef BLIS_STACK_BUF_ALIGN_SIZE
#define BLIS_STACK_BUF_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when allocating memory via BLIS_MALLOC_USER.
// To disable heap alignment, set this to 1.
#ifndef BLIS_HEAP_ADDR_ALIGN_SIZE
#define BLIS_HEAP_ADDR_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment size used when sizing leading dimensions of memory allocated
// via BLIS_MALLOC_USER.
#ifndef BLIS_HEAP_STRIDE_ALIGN_SIZE
#define BLIS_HEAP_STRIDE_ALIGN_SIZE BLIS_SIMD_ALIGN_SIZE
#endif

// Alignment sizes used when allocating blocks to the internal memory
// pool, via BLIS_MALLOC_POOL.
// Each of the four pool alignment defaults below is BLIS_PAGE_SIZE.
#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_A
#define BLIS_POOL_ADDR_ALIGN_SIZE_A BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_B
#define BLIS_POOL_ADDR_ALIGN_SIZE_B BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_C
#define BLIS_POOL_ADDR_ALIGN_SIZE_C BLIS_PAGE_SIZE
#endif

#ifndef BLIS_POOL_ADDR_ALIGN_SIZE_GEN
#define BLIS_POOL_ADDR_ALIGN_SIZE_GEN BLIS_PAGE_SIZE
#endif

// Offsets from alignment specified by BLIS_POOL_ADDR_ALIGN_SIZE_*.
// All four offsets default to 0.
#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_A
#define BLIS_POOL_ADDR_OFFSET_SIZE_A 0
#endif

#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_B
#define BLIS_POOL_ADDR_OFFSET_SIZE_B 0
#endif

#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_C
#define BLIS_POOL_ADDR_OFFSET_SIZE_C 0
#endif

#ifndef BLIS_POOL_ADDR_OFFSET_SIZE_GEN
#define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0
#endif


#endif

// end bli_kernel_macro_defs.h


// -- Base operation prototypes --

// begin bli_init.h

// Only bli_init() and bli_finalize() carry the BLIS_EXPORT_BLIS export
// annotation; the remaining init/finalize entry points are declared
// without it.
BLIS_EXPORT_BLIS void bli_init( void );
BLIS_EXPORT_BLIS void bli_finalize( void );

void bli_init_auto( void );
void bli_finalize_auto( void );

void bli_init_apis( void );
void bli_finalize_apis( void );

void bli_init_once( void );
void bli_finalize_once( void );

// end bli_init.h
// begin bli_malloc.h

// Typedef function pointer types for malloc() and free() substitutes.
// NOTE(review): these typedefs are commented out here, yet malloc_ft and
// free_ft are used by the prototypes below -- presumably they are declared
// earlier in this header; confirm before re-enabling.
//typedef void* (*malloc_ft) ( size_t size );
//typedef void (*free_ft) ( void* p );

// -----------------------------------------------------------------------------

// Pool allocation prototypes are compiled out.
#if 0
BLIS_EXPORT_BLIS void* bli_malloc_pool( size_t size );
BLIS_EXPORT_BLIS void bli_free_pool( void* p );
#endif

void* bli_malloc_intl( size_t size, err_t* r_val );
void* bli_calloc_intl( size_t size, err_t* r_val );
void bli_free_intl( void* p );

BLIS_EXPORT_BLIS void* bli_malloc_user( size_t size, err_t* r_val );
BLIS_EXPORT_BLIS void bli_free_user( void* p );

// -----------------------------------------------------------------------------

// The bli_f*() variants take the underlying allocator/deallocator as their
// first argument (malloc_ft / free_ft).
void* bli_fmalloc_align( malloc_ft f, size_t size, size_t align_size, err_t* r_val );
void bli_ffree_align( free_ft f, void* p );

void* bli_fmalloc_noalign( malloc_ft f, size_t size, err_t* r_val );
void bli_ffree_noalign( free_ft f, void* p );

void bli_fmalloc_align_check( malloc_ft f, size_t size, size_t align_size );
void bli_fmalloc_post_check( void* p );

// end bli_malloc.h
// begin bli_const.h

void bli_const_init( void );
void bli_const_finalize( void );

// end bli_const.h
// begin bli_obj.h

// begin bli_obj_check.h

// Each *_check() prototype below takes the same arguments as the
// correspondingly named bli_obj_*() / bli_dt_*() function, where one is
// declared in this header.
void bli_obj_create_check( num_t dt, dim_t m, dim_t n, inc_t rs, inc_t cs, obj_t* obj );

void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, obj_t* obj );

void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, obj_t* obj );

void bli_obj_attach_buffer_check( void* p, inc_t rs, inc_t cs, inc_t is, obj_t* obj );

void bli_obj_create_scalar_check( num_t dt, obj_t* obj );

void bli_obj_free_check( obj_t* obj );

void bli_obj_create_const_check( double value, obj_t* obj );

void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b );

void bli_dt_size_check( num_t dt );

void bli_dt_string_check( num_t dt );

void bli_dt_union_check( num_t dt1, num_t dt2 );

void bli_obj_print_check( char* label, obj_t* obj );

// end bli_obj_check.h

BLIS_EXPORT_BLIS void bli_obj_create
     (
       num_t dt,
       dim_t m,
       dim_t n,
       inc_t rs,
       inc_t cs,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_with_attached_buffer
     (
       num_t dt,
       dim_t m,
       dim_t n,
       void* p,
       inc_t rs,
       inc_t cs,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_without_buffer
     (
       num_t dt,
       dim_t m,
       dim_t n,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_alloc_buffer
     (
       inc_t rs,
       inc_t cs,
       inc_t is,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_attach_buffer
     (
       void* p,
       inc_t rs,
       inc_t cs,
       inc_t is,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_1x1
     (
       num_t dt,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer
     (
       num_t dt,
       void* p,
       obj_t* obj
     );

BLIS_EXPORT_BLIS void bli_obj_create_conf_to
     (
       obj_t* s,
       obj_t* d
     );

BLIS_EXPORT_BLIS void bli_obj_free
     (
       obj_t* obj
     );

// Not exported: declared without BLIS_EXPORT_BLIS.
void bli_adjust_strides
     (
       dim_t m,
       dim_t n,
       siz_t elem_size,
       inc_t* rs,
       inc_t* cs,
       inc_t* is
     );

BLIS_EXPORT_BLIS siz_t bli_dt_size
     (
       num_t dt
     );

BLIS_EXPORT_BLIS char* bli_dt_string
     (
       num_t dt
     );

BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult
     (
       dim_t dim,
       dim_t dim_mult
     );

BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size
     (
       dim_t dim,
       siz_t elem_size,
       siz_t align_size
     );

BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size
     (
       void* p,
       size_t align_size
     );

BLIS_EXPORT_BLIS void bli_obj_print
     (
       char* label,
       obj_t* obj
     );

// end bli_obj.h
// begin bli_obj_scalar.h

BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached
     (
       num_t dt,
       obj_t* beta
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of
     (
       num_t dt,
       conj_t conj,
       obj_t* alpha,
       obj_t* beta
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_detach
     (
       obj_t* a,
       obj_t* alpha
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_attach
     (
       conj_t conj,
       obj_t* alpha,
       obj_t* a
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to
     (
       num_t dt,
       obj_t* a
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar
     (
       obj_t* alpha,
       obj_t* a
     );

BLIS_EXPORT_BLIS void bli_obj_scalar_reset
     (
       obj_t* a
     );

BLIS_EXPORT_BLIS bool bli_obj_scalar_has_nonzero_imag
     (
       obj_t* a
     );

BLIS_EXPORT_BLIS bool bli_obj_scalar_equals
     (
       obj_t* a,
       obj_t* beta
     );

// end bli_obj_scalar.h
// begin bli_blksz.h

//
blksz_t query BLIS_INLINE dim_t bli_blksz_get_def ( num_t dt, blksz_t* b ) { return b->v[ dt ]; } BLIS_INLINE dim_t bli_blksz_get_max ( num_t dt, blksz_t* b ) { return b->e[ dt ]; } // blksz_t modification BLIS_INLINE void bli_blksz_set_def ( dim_t val, num_t dt, blksz_t* b ) { b->v[ dt ] = val; } BLIS_INLINE void bli_blksz_set_max ( dim_t val, num_t dt, blksz_t* b ) { b->e[ dt ] = val; } BLIS_INLINE void bli_blksz_copy ( blksz_t* b_src, blksz_t* b_dst ) { *b_dst = *b_src; } BLIS_INLINE void bli_blksz_copy_if_pos ( blksz_t* b_src, blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that // we can skip the ones that are non-positive. const dim_t v_s = bli_blksz_get_def( BLIS_FLOAT, b_src ); const dim_t v_d = bli_blksz_get_def( BLIS_DOUBLE, b_src ); const dim_t v_c = bli_blksz_get_def( BLIS_SCOMPLEX, b_src ); const dim_t v_z = bli_blksz_get_def( BLIS_DCOMPLEX, b_src ); const dim_t e_s = bli_blksz_get_max( BLIS_FLOAT, b_src ); const dim_t e_d = bli_blksz_get_max( BLIS_DOUBLE, b_src ); const dim_t e_c = bli_blksz_get_max( BLIS_SCOMPLEX, b_src ); const dim_t e_z = bli_blksz_get_max( BLIS_DCOMPLEX, b_src ); if ( v_s > 0 ) bli_blksz_set_def( v_s, BLIS_FLOAT, b_dst ); if ( v_d > 0 ) bli_blksz_set_def( v_d, BLIS_DOUBLE, b_dst ); if ( v_c > 0 ) bli_blksz_set_def( v_c, BLIS_SCOMPLEX, b_dst ); if ( v_z > 0 ) bli_blksz_set_def( v_z, BLIS_DCOMPLEX, b_dst ); if ( e_s > 0 ) bli_blksz_set_max( e_s, BLIS_FLOAT, b_dst ); if ( e_d > 0 ) bli_blksz_set_max( e_d, BLIS_DOUBLE, b_dst ); if ( e_c > 0 ) bli_blksz_set_max( e_c, BLIS_SCOMPLEX, b_dst ); if ( e_z > 0 ) bli_blksz_set_max( e_z, BLIS_DCOMPLEX, b_dst ); } BLIS_INLINE void bli_blksz_copy_def_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); bli_blksz_set_def( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_max_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); 
bli_blksz_set_max( val, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_copy_dt ( num_t dt_src, blksz_t* b_src, num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); bli_blksz_copy_max_dt( dt_src, b_src, dt_dst, b_dst ); } BLIS_INLINE void bli_blksz_scale_def ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_def( dt, b ); bli_blksz_set_def( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { const dim_t val = bli_blksz_get_max( dt, b ); bli_blksz_set_max( ( val * num ) / den, dt, b ); } BLIS_INLINE void bli_blksz_scale_def_max ( dim_t num, dim_t den, num_t dt, blksz_t* b ) { bli_blksz_scale_def( num, den, dt, b ); bli_blksz_scale_max( num, den, dt, b ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS blksz_t* bli_blksz_create_ed ( dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS blksz_t* bli_blksz_create ( dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_ed ( blksz_t* b, dim_t b_s, dim_t be_s, dim_t b_d, dim_t be_d, dim_t b_c, dim_t be_c, dim_t b_z, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z, dim_t be_s, dim_t be_d, dim_t be_c, dim_t be_z ); BLIS_EXPORT_BLIS void bli_blksz_init_easy ( blksz_t* b, dim_t b_s, dim_t b_d, dim_t b_c, dim_t b_z ); BLIS_EXPORT_BLIS void bli_blksz_free ( blksz_t* b ); // ----------------------------------------------------------------------------- #if 0 BLIS_EXPORT_BLIS void bli_blksz_reduce_dt_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); #endif void bli_blksz_reduce_def_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); void bli_blksz_reduce_max_to ( num_t dt_bm, blksz_t* bmult, num_t dt_bs, blksz_t* blksz ); // 
----------------------------------------------------------------------------- dim_t bli_determine_blocksize ( dir_t direct, dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_b ( dim_t i, dim_t dim, obj_t* obj, bszid_t bszid, cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); dim_t bli_determine_blocksize_b_sub ( dim_t i, dim_t dim, dim_t b_alg, dim_t b_max ); // end bli_blksz.h // begin bli_func.h // ----------------------------------------------------------------------------- // func_t query BLIS_INLINE void_fp bli_func_get_dt ( num_t dt, func_t* func ) { return func->ptr[ dt ]; } // func_t modification BLIS_INLINE void bli_func_set_dt ( void_fp fp, num_t dt, func_t* func ) { func->ptr[ dt ] = fp; } BLIS_INLINE void bli_func_copy_dt ( num_t dt_src, func_t* func_src, num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); bli_func_set_dt( fp, dt_dst, func_dst ); } // ----------------------------------------------------------------------------- func_t* bli_func_create ( void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init ( func_t* f, void_fp ptr_s, void_fp ptr_d, void_fp ptr_c, void_fp ptr_z ); void bli_func_init_null ( func_t* f ); void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, func_t* f ); bool bli_func_is_null( func_t* f ); // end bli_func.h // begin bli_mbool.h // ----------------------------------------------------------------------------- // mbool_t query BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } // mbool_t modification BLIS_INLINE void bli_mbool_set_dt( bool val, num_t dt, mbool_t* mb ) { mb->v[ dt ] = val; } // 
// -----------------------------------------------------------------------------

mbool_t* bli_mbool_create
     (
       bool b_s,
       bool b_d,
       bool b_c,
       bool b_z
     );

void bli_mbool_init
     (
       mbool_t* b,
       bool b_s,
       bool b_d,
       bool b_c,
       bool b_z
     );

void bli_mbool_free( mbool_t* b );

// end bli_mbool.h
// begin bli_cntx.h


#ifndef BLIS_CNTX_H
#define BLIS_CNTX_H


// Context object type (defined in bli_type_defs.h)

// -----------------------------------------------------------------------------

//
// -- cntx_t query (fields only) -----------------------------------------------
//

// Each accessor in this section returns the correspondingly named field of
// *cntx unchanged; none of them checks cntx for NULL.

BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx )
{
	return cntx->blkszs;
}
BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx )
{
	return cntx->bmults;
}
BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx )
{
	return cntx->l3_vir_ukrs;
}
BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx )
{
	return cntx->l3_nat_ukrs;
}
BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx )
{
	return cntx->l3_nat_ukrs_prefs;
}
BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx )
{
	return cntx->l3_sup_thresh;
}
BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx )
{
	return cntx->l3_sup_handlers;
}
BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx )
{
	return cntx->l3_sup_blkszs;
}
BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx )
{
	return cntx->l3_sup_kers;
}
BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx )
{
	return cntx->l3_sup_kers_prefs;
}
BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx )
{
	return cntx->l1f_kers;
}
BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx )
{
	return cntx->l1v_kers;
}
BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx )
{
	return cntx->packm_kers;
}
BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx )
{
	return cntx->unpackm_kers;
}
// Returns the ind_t value stored in the context's method field.
BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx )
{
	return cntx->method;
}

//
----------------------------------------------------------------------------- // // -- cntx_t modification (fields only) ---------------------------------------- // BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) { cntx->method = method; } // ----------------------------------------------------------------------------- // // -- cntx_t query (complex) --------------------------------------------------- // BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) { bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); bszid_t bm_id = bmults[ bs_id ]; return bm_id; } BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) { bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* 
bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); blksz_t* thresh = &threshs[ thresh_id ]; // Return the address of the blksz_t identified by thresh_id. return thresh; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) { blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); // Return the main (default) threshold value for the datatype given. return thresh_dt; } BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } // ----------------------------------------------------------------------------- BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); void* func = funcs[ op ]; return func; } // ----------------------------------------------------------------------------- BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; // Return the address of the blksz_t identified by bs_id. return blksz; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. 
return bs_dt; } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. return bs_dt; } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); func_t* func = &funcs[ stor_id ]; return func; } BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); mbool_t* mbool = &mbools[ stor_id ]; return mbool; } BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) { func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); func_t* func = &funcs[ ker_id ]; return func; } BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* 
cntx ) { func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); return bli_func_get_dt( dt, func ); } // ----------------------------------------------------------------------------- BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested packm func_t if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* funcs = bli_cntx_packm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the packm func_t (and then extract the // datatype-specific function pointer) if the packm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) { func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = NULL; // Only index to the requested unpackm func_t if the unpackm kernel being // requested is one that is explicitly supported. if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); func = &funcs[ ker_id ]; } return func; } BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { void_fp fp = NULL; // Only query the context for the unpackm func_t (and then extract the // datatype-specific function pointer) if the unpackm kernel being // requested is one that is explicitly supported. 
if ( 0 <= ( gint_t )ker_id && ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) { func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); fp = bli_func_get_dt( dt, func ); } return fp; } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. 
// NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) { // For induced methods, return the ukernel storage preferences of the // corresponding real micro-kernel. // NOTE: This projection to real domain becomes unnecessary if you // set the exec_dt for 1m to the real projection of the storage // datatype. if ( bli_cntx_method( cntx ) != BLIS_NAT ) dt = bli_dt_proj_to_real( dt ); return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); } BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { // Note that we use the computation datatype, which may differ from the // storage datatype of C (when performing a mixed datatype operation). const num_t dt = bli_obj_comp_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); } // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of TRUE means the ukernel prefers row storage. 
return ( bool ) ( prefs == TRUE ); } BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); // A ukernel preference of FALSE means the ukernel prefers column storage. return ( bool ) ( prefs == FALSE ); } #if 0 // NOTE: These static functions aren't needed yet. BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { const num_t dt = bli_obj_dt( obj ); const bool ukr_prefers_rows = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); const bool ukr_prefers_cols = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); bool r_val = FALSE; if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; return r_val; } BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) { return ( bool ) !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); } #endif // ----------------------------------------------------------------------------- // // -- cntx_t modification (complex) -------------------------------------------- // // NOTE: The framework does not use any of the following functions. We provide // them in order to facilitate creating/modifying custom contexts. 
BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* bmults = bli_cntx_bmults_buf( cntx ); blkszs[ bs_id ] = *blksz; bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_def( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); blksz_t* blksz = &blkszs[ bs_id ]; bli_blksz_set_max( bs, dt, blksz ); } BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) { func_t* funcs = 
bli_cntx_get_unpackm_kers( ker_id, cntx ); funcs[ ker_id ] = *func; } BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) { func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); bli_func_set_dt( fp, dt, func ); } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... 
); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); #endif // end bli_cntx.h // begin bli_rntm.h #ifndef BLIS_RNTM_H #define BLIS_RNTM_H // Runtime object type (defined in bli_type_defs.h) // // -- rntm_t query (public API) ------------------------------------------------ // BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) { return rntm->auto_factor; } BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) { return rntm->num_threads; } BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) { return rntm->l3_sup; } // // -- rntm_t query (internal use only) ----------------------------------------- // BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) { return rntm->sba_pool; } BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) { return rntm->pba; } #if 0 BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) { const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); const 
bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; else return FALSE; } #endif // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_set_auto_factor_only( bool auto_factor, rntm_t* rntm ) { rntm->auto_factor = auto_factor; } BLIS_INLINE void bli_rntm_set_num_threads_only( dim_t nt, rntm_t* rntm ) { rntm->num_threads = nt; } BLIS_INLINE void bli_rntm_set_ways_for_only( bszid_t loop, dim_t n_ways, rntm_t* rntm ) { rntm->thrloop[ loop ] = n_ways; } BLIS_INLINE void bli_rntm_set_jc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pc_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ic_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MC, ways, rntm ); } BLIS_INLINE void bli_rntm_set_jr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_NR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ir_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_MR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_pr_ways_only( dim_t ways, rntm_t* rntm ) { bli_rntm_set_ways_for_only( BLIS_KR, ways, rntm ); } BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. 
bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); } BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) { rntm->sba_pool = sba_pool; } BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) { rntm->pba = pba; } BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); } BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, -1, -1, -1, rntm ); } BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) { bli_rntm_set_sba_pool( NULL, rntm ); } BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) { bli_rntm_set_pba( NULL, rntm ); } // // -- rntm_t modification (public API) ----------------------------------------- // BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm ) { // Record the total number of threads to use. bli_rntm_set_num_threads_only( nt, rntm ); // Set the individual ways of parallelism to default states. bli_rntm_clear_ways_only( rntm ); } BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm ) { // Record the number of ways of parallelism per loop. bli_rntm_set_jc_ways_only( jc, rntm ); bli_rntm_set_pc_ways_only( pc, rntm ); bli_rntm_set_ic_ways_only( ic, rntm ); bli_rntm_set_jr_ways_only( jr, rntm ); bli_rntm_set_ir_ways_only( ir, rntm ); bli_rntm_set_pr_ways_only( 1, rntm ); // Set the num_threads field to a default state. bli_rntm_clear_num_threads_only( rntm ); } BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm ) { // Set the bool indicating whether matrix A should be packed. rntm->pack_a = pack_a; } BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) { // Set the bool indicating whether matrix B should be packed. 
rntm->pack_b = pack_b; } BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } // // -- rntm_t modification (internal use only) ---------------------------------- // BLIS_INLINE void bli_rntm_clear_pack_a( rntm_t* rntm ) { bli_rntm_set_pack_a( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_pack_b( rntm_t* rntm ) { bli_rntm_set_pack_b( FALSE, rntm ); } BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } // // -- rntm_t initialization ---------------------------------------------------- // // NOTE: Initialization is not necessary as long the user calls at least ONE // of the public "set" accessors, each of which guarantees that the rntm_t // will be in a good state upon return. 
#define BLIS_RNTM_INITIALIZER \ { \ .auto_factor = TRUE, \ .num_threads = -1, \ .thrloop = { -1, -1, -1, -1, -1, -1 }, \ .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ .sba_pool = NULL, \ .pba = NULL, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) { bli_rntm_set_auto_factor_only( TRUE, rntm ); bli_rntm_clear_num_threads_only( rntm ); bli_rntm_clear_ways_only( rntm ); bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); bli_rntm_clear_sba_pool( rntm ); bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ BLIS_INLINE dim_t bli_rntm_calc_num_threads ( rntm_t* restrict rntm ) { dim_t n_threads; n_threads = bli_rntm_ways_for( BLIS_NC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_KC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MC, rntm ); n_threads *= bli_rntm_ways_for( BLIS_NR, rntm ); n_threads *= bli_rntm_ways_for( BLIS_MR, rntm ); return n_threads; } // ----------------------------------------------------------------------------- // Function prototypes BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm ); BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op ( opid_t l3_op, side_t side, dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_set_ways_from_rntm_sup ( dim_t m, dim_t n, dim_t k, rntm_t* rntm ); void bli_rntm_print ( rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( bszid_t* restrict bszid_cur, rntm_t* restrict rntm ); #endif // end bli_rntm.h // begin bli_gks.h #ifndef BLIS_GKS_H #define BLIS_GKS_H void bli_gks_init( void ); void bli_gks_finalize( void ); void bli_gks_init_index( void ); cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); cntx_t** bli_gks_lookup_id( arch_t id ); void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); BLIS_EXPORT_BLIS cntx_t* 
bli_gks_query_cntx( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); cntx_t* bli_gks_query_cntx_noinit( void ); BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); #endif // end bli_gks.h // begin bli_ind.h #ifndef BLIS_IND_H #define BLIS_IND_H // level-3 induced method management // begin bli_l3_ind.h #ifndef BLIS_L3_IND_H #define BLIS_L3_IND_H // ----------------------------------------------------------------------------- #undef GENPROT #define GENPROT( opname ) \ \ ind_t PASTEMAC(opname,ind_find_avail)( num_t dt ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) GENPROT( trmm ) GENPROT( trsm ) // ----------------------------------------------------------------------------- //bool bli_l3_ind_oper_is_avail( opid_t oper, ind_t method, num_t dt ); ind_t bli_l3_ind_oper_find_avail( opid_t oper, num_t dt ); void bli_l3_ind_set_enable_dt( ind_t method, num_t dt, bool status ); void bli_l3_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); void bli_l3_ind_oper_set_enable_all( opid_t oper, num_t dt, bool status ); void bli_l3_ind_oper_set_enable( opid_t oper, ind_t method, num_t dt, bool status ); bool bli_l3_ind_oper_get_enable( opid_t oper, ind_t method, num_t dt ); bool bli_l3_ind_oper_is_impl( opid_t oper, ind_t method ); #endif // end bli_l3_ind.h void bli_ind_init( void ); void bli_ind_finalize( void ); BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t 
method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); char* bli_ind_get_impl_string( ind_t method ); num_t bli_ind_map_cdt_to_index( num_t dt ); #endif // end bli_ind.h // begin bli_pba.h #ifndef BLIS_MEMBRK_H #define BLIS_MEMBRK_H // Packing block allocator (formerly memory broker) // pba init //BLIS_INLINE void bli_pba_init_mutex( pba_t* pba ) //{ // bli_pthread_mutex_init( &(pba->mutex), NULL ); //} //BLIS_INLINE void bli_pba_finalize_mutex( pba_t* pba ) //{ // bli_pthread_mutex_destroy( &(pba->mutex) ); //} // pba query BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) { return &(pba->pools[ pool_index ]); } BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) { return pba->align_size; } BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) { return pba->malloc_fp; } BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) { return pba->free_fp; } // pba modification BLIS_INLINE void bli_pba_set_align_size( siz_t align_size, pba_t* pba ) { pba->align_size = align_size; } BLIS_INLINE void bli_pba_set_malloc_fp( malloc_ft malloc_fp, pba_t* pba ) { pba->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pba_set_free_fp( free_ft free_fp, pba_t* pba ) { pba->free_fp = free_fp; } // pba action BLIS_INLINE void bli_pba_lock( pba_t* pba ) { bli_pthread_mutex_lock( &(pba->mutex) ); } BLIS_INLINE void bli_pba_unlock( pba_t* pba ) { bli_pthread_mutex_unlock( &(pba->mutex) ); } // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( cntx_t* cntx ); void bli_pba_finalize ( void ); 
void bli_pba_acquire_m ( rntm_t* rntm, siz_t req_size, packbuf_t buf_type, mem_t* mem ); void bli_pba_release ( rntm_t* rntm, mem_t* mem ); BLIS_INLINE void bli_pba_rntm_set_pba ( rntm_t* rntm ) { pba_t* pba = bli_pba_query(); bli_rntm_set_pba( pba, rntm ); } siz_t bli_pba_pool_size ( pba_t* pba, packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( cntx_t* cntx, pba_t* pba ); void bli_pba_finalize_pools ( pba_t* pba ); void bli_pba_compute_pool_block_sizes ( siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( num_t dt, siz_t* bs_a, siz_t* bs_b, siz_t* bs_c, cntx_t* cntx ); #endif // end bli_pba.h // begin bli_pool.h #ifndef BLIS_POOL_H #define BLIS_POOL_H // -- Pool block type -- // -- Pool type -- // Pool block query BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) { return pblk->buf; } BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) { return pblk->block_size; } // Pool block modification BLIS_INLINE void bli_pblk_set_buf( void* buf, pblk_t* pblk ) { pblk->buf = buf; } BLIS_INLINE void bli_pblk_set_block_size( siz_t block_size, pblk_t* pblk ) { pblk->block_size = block_size; } // // -- pool block initialization ------------------------------------------------ // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the pblk_t type definition. An alternative to the initializer is // calling bli_pblk_clear() at runtime. 
#define BLIS_PBLK_INITIALIZER \ { \ .buf = NULL, \ .block_size = 0, \ } \ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) { bli_pblk_set_buf( NULL, pblk ); bli_pblk_set_block_size( 0, pblk ); } // Pool entry query BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) { return pool->block_ptrs; } BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) { return pool->block_ptrs_len; } BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) { return pool->num_blocks; } BLIS_INLINE siz_t bli_pool_block_size( pool_t* pool ) { return pool->block_size; } BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) { return pool->align_size; } BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) { return pool->offset_size; } BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) { return pool->malloc_fp; } BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) { return pool->free_fp; } BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) { return pool->top_index; } BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); } // Pool entry modification BLIS_INLINE void bli_pool_set_block_ptrs( void* block_ptrs, pool_t* pool ) \ { pool->block_ptrs = block_ptrs; } BLIS_INLINE void bli_pool_set_block_ptrs_len( siz_t block_ptrs_len, pool_t* pool ) \ { pool->block_ptrs_len = block_ptrs_len; } BLIS_INLINE void bli_pool_set_num_blocks( siz_t num_blocks, pool_t* pool ) \ { pool->num_blocks = num_blocks; } BLIS_INLINE void bli_pool_set_block_size( siz_t block_size, pool_t* pool ) \ { pool->block_size = block_size; } BLIS_INLINE void bli_pool_set_align_size( siz_t align_size, pool_t* pool ) \ { pool->align_size = align_size; } BLIS_INLINE void bli_pool_set_offset_size( siz_t offset_size, pool_t* pool ) \ { pool->offset_size = offset_size; } BLIS_INLINE void bli_pool_set_malloc_fp( malloc_ft malloc_fp, pool_t* pool ) \ { pool->malloc_fp = malloc_fp; } BLIS_INLINE void bli_pool_set_free_fp( free_ft free_fp, pool_t* 
pool ) \ { pool->free_fp = free_fp; } BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ { pool->top_index = top_index; } // ----------------------------------------------------------------------------- void bli_pool_init ( siz_t num_blocks, siz_t block_ptrs_len, siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, free_ft free_fp, pool_t* restrict pool ); void bli_pool_finalize ( pool_t* restrict pool ); void bli_pool_reinit ( siz_t num_blocks_new, siz_t block_ptrs_len_new, siz_t block_size_new, siz_t align_size_new, siz_t offset_size_new, pool_t* restrict pool ); void bli_pool_checkout_block ( siz_t req_size, pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_checkin_block ( pblk_t* restrict block, pool_t* restrict pool ); void bli_pool_grow ( siz_t num_blocks_add, pool_t* restrict pool ); void bli_pool_shrink ( siz_t num_blocks_sub, pool_t* restrict pool ); void bli_pool_alloc_block ( siz_t block_size, siz_t align_size, siz_t offset_size, malloc_ft malloc_fp, pblk_t* restrict block ); void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, pblk_t* restrict block ); void bli_pool_print ( pool_t* restrict pool ); void bli_pblk_print ( pblk_t* restrict pblk ); #endif // end bli_pool.h // begin bli_array.h #ifndef BLIS_ARRAY_H #define BLIS_ARRAY_H // -- Array type -- // Array entry query BLIS_INLINE void* bli_array_buf( array_t* array ) { return array->buf; } BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) { return array->num_elem; } BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) { return array->elem_size; } // Array entry modification BLIS_INLINE void bli_array_set_buf( void* buf, array_t* array ) \ { array->buf = buf; } BLIS_INLINE void bli_array_set_num_elem( siz_t num_elem, array_t* array ) \ { array->num_elem = num_elem; } BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ { array->elem_size = elem_size; } // 
----------------------------------------------------------------------------- void bli_array_init ( const siz_t num_elem, const siz_t elem_size, array_t* restrict array ); void bli_array_resize ( const siz_t num_elem_new, array_t* restrict array ); void bli_array_finalize ( array_t* restrict array ); void* bli_array_elem ( const siz_t index, array_t* restrict array ); void bli_array_set_elem ( void* restrict elem, const siz_t index, array_t* restrict array ); #endif // end bli_array.h // begin bli_apool.h #ifndef BLIS_APOOL_H #define BLIS_APOOL_H // -- Locked pool-of-arrays type -- // apool entry query BLIS_INLINE pool_t* bli_apool_pool( apool_t* apool ) { return &(apool->pool); } BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) { return &(apool->mutex); } BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) { return pool->def_array_len; } BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) { pool_t* restrict pool = bli_apool_pool( apool ); return bli_pool_is_exhausted( pool ); } // apool action BLIS_INLINE void bli_apool_lock( apool_t* apool ) { bli_pthread_mutex_lock( bli_apool_mutex( apool ) ); } BLIS_INLINE void bli_apool_unlock( apool_t* apool ) { bli_pthread_mutex_unlock( bli_apool_mutex( apool ) ); } // apool entry modification BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool ) \ { pool->def_array_len = def_array_len; } // ----------------------------------------------------------------------------- void bli_apool_init ( apool_t* restrict apool ); void bli_apool_finalize ( apool_t* restrict apool ); array_t* bli_apool_checkout_array ( siz_t n_threads, apool_t* restrict apool ); void bli_apool_checkin_array ( array_t* restrict array, apool_t* restrict apool ); pool_t* bli_apool_array_elem ( siz_t index, array_t* restrict array ); void bli_apool_grow ( siz_t num_blocks_add, apool_t* restrict apool ); void bli_apool_alloc_block ( siz_t num_elem, array_t** restrict array_p ); void bli_apool_free_block 
( array_t* restrict array ); #endif // end bli_apool.h // begin bli_sba.h #ifndef BLIS_SBA_H #define BLIS_SBA_H apool_t* bli_sba_query( void ); // ----------------------------------------------------------------------------- void bli_sba_init( void ); void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( const siz_t n_threads ); void bli_sba_checkin_array ( array_t* restrict array ); void bli_sba_rntm_set_pool ( siz_t index, array_t* restrict array, rntm_t* restrict rntm ); void* bli_sba_acquire ( rntm_t* restrict rntm, siz_t req_size ); void bli_sba_release ( rntm_t* restrict rntm, void* restrict block ); #endif // end bli_sba.h // begin bli_memsys.h #ifndef BLIS_MEMSYS_H #define BLIS_MEMSYS_H // ----------------------------------------------------------------------------- void bli_memsys_init( void ); void bli_memsys_finalize( void ); #endif // end bli_memsys.h // begin bli_mem.h #ifndef BLIS_MEM_H #define BLIS_MEM_H // mem_t object type (defined in bli_type_defs.h) // // -- mem_t query -------------------------------------------------------------- // BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) { return &(mem->pblk); } BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) { return bli_pblk_buf( bli_mem_pblk( mem ) ); } BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) { return mem->buf_type; } BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) { return mem->pool; } BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) { return mem->size; } BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); } // // -- mem_t modification ------------------------------------------------------- // BLIS_INLINE void bli_mem_set_pblk( pblk_t* pblk, mem_t* mem ) { mem->pblk = *pblk; } BLIS_INLINE void bli_mem_set_buffer( void* buf, mem_t* mem ) { bli_pblk_set_buf( buf, &(mem->pblk) ); } BLIS_INLINE void bli_mem_set_buf_type( packbuf_t 
buf_type, mem_t* mem ) { mem->buf_type = buf_type; } BLIS_INLINE void bli_mem_set_pool( pool_t* pool, mem_t* mem ) { mem->pool = pool; } BLIS_INLINE void bli_mem_set_size( siz_t size, mem_t* mem ) { mem->size = size; } // // -- mem_t initialization ----------------------------------------------------- // // NOTE: This initializer macro must be updated whenever fields are added or // removed from the mem_t type definition. An alternative to the initializer is // calling bli_mem_clear() at runtime. #define BLIS_MEM_INITIALIZER \ { \ .pblk = BLIS_PBLK_INITIALIZER, \ .buf_type = -1, \ .pool = NULL, \ .size = 0, \ } \ BLIS_INLINE void bli_mem_clear( mem_t* mem ) { bli_mem_set_buffer( NULL, mem ); #ifdef __cplusplus const packbuf_t pb = BLIS_BUFFER_FOR_GEN_USE; // When using C++, which is strongly typed, we avoid use of -1 as a // packbuf_t value since it will result in a compile-time error. bli_mem_set_buf_type( pb, mem ); #else bli_mem_set_buf_type( ( packbuf_t )-1, mem ); #endif bli_mem_set_pool( NULL, mem ); bli_mem_set_size( 0, mem ); } #endif // end bli_mem.h // begin bli_part.h // begin bli_part_check.h void bli_acquire_mpart_t2b_check( subpart_t requested_part, dim_t i, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_l2r_check( subpart_t requested_part, dim_t j, dim_t b, obj_t* obj, obj_t* sub_obj ); void bli_acquire_mpart_tl2br_check( subpart_t requested_part, dim_t ij, dim_t b, obj_t* obj, obj_t* sub_obj ); // end bli_part_check.h // -- Matrix partitioning ------------------------------------------------------ BLIS_EXPORT_BLIS void bli_acquire_mpart ( dim_t i, dim_t j, dim_t m, dim_t n, obj_t* obj, obj_t* sub_obj ); #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ subpart_t req_part, \ dim_t i, \ dim_t b, \ obj_t* obj, \ obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) GENPROT( acquire_mpart_b2t ) GENPROT( acquire_mpart_l2r ) GENPROT( acquire_mpart_r2l ) GENPROT( acquire_mpart_tl2br ) GENPROT( 
acquire_mpart_br2tl )

// Prototype generator for the dimension-oriented matrix partitioning
// functions. Unlike the generator used for the directional variants, these
// prototypes take a dir_t argument ahead of the requested subpartition.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
     ( \
       dir_t     direct, \
       subpart_t req_part, \
       dim_t     i, \
       dim_t     b, \
       obj_t*    obj, \
       obj_t*    sub_obj \
     );

GENPROT( acquire_mpart_mdim )
GENPROT( acquire_mpart_ndim )
GENPROT( acquire_mpart_mndim )

// -- Vector partitioning ------------------------------------------------------

// Prototype generator for the vector partitioning functions (no dir_t
// argument; the direction is part of the function name).
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \
     ( \
       subpart_t req_part, \
       dim_t     i, \
       dim_t     b, \
       obj_t*    obj, \
       obj_t*    sub_obj \
     );

GENPROT( acquire_vpart_f2b )
GENPROT( acquire_vpart_b2f )

// -- Scalar acquisition -------------------------------------------------------

BLIS_EXPORT_BLIS void bli_acquire_mij
     (
       dim_t     i,
       dim_t     j,
       obj_t*    obj,
       obj_t*    sub_obj
     );

BLIS_EXPORT_BLIS void bli_acquire_vi
     (
       dim_t     i,
       obj_t*    obj,
       obj_t*    sub_obj
     );

// end bli_part.h
// begin bli_prune.h

void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p,
                             obj_t* s, mdim_t mdim_s );
// end bli_prune.h
// begin bli_query.h

BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b );

BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b );

BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a );
// end bli_query.h
// begin bli_auxinfo.h

#ifndef BLIS_AUXINFO_MACRO_DEFS_H
#define BLIS_AUXINFO_MACRO_DEFS_H

// auxinfo_t field query
//
// Each getter below returns the correspondingly named field of the
// auxinfo_t that ai points to. None of them checks ai for NULL.

BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai )
{
    return ai->schema_a;
}
BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai )
{
    return ai->schema_b;
}

BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai )
{
    return ai->a_next;
}
BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai )
{
    return ai->b_next;
}

BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai )
{
    return ai->is_a;
}
BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai )
{
    return ai->is_b;
}

BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai )
{
    return ai->ps_a;
}
BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai )
{
    return ai->ps_b;
}

BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai )
{
    return ai->ukr;
}
BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai )
{
    return ai->params;
}

// auxinfo_t field modification
//
// Each setter below stores its first argument into the correspondingly named
// field of the auxinfo_t passed as the last argument.

BLIS_INLINE void bli_auxinfo_set_schema_a( pack_t schema, auxinfo_t* ai )
{
    ai->schema_a = schema;
}
BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai )
{
    ai->schema_b = schema;
}

BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai )
{
    ai->a_next = p;
}
BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai )
{
    ai->b_next = p;
}
// Convenience setter that updates both a_next and b_next in one call.
BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai )
{
    ai->a_next = ap;
    ai->b_next = bp;
}

BLIS_INLINE void bli_auxinfo_set_is_a( inc_t is, auxinfo_t* ai )
{
    ai->is_a = is;
}
BLIS_INLINE void bli_auxinfo_set_is_b( inc_t is, auxinfo_t* ai )
{
    ai->is_b = is;
}

BLIS_INLINE void bli_auxinfo_set_ps_a( inc_t ps, auxinfo_t* ai )
{
    ai->ps_a = ps;
}
BLIS_INLINE void bli_auxinfo_set_ps_b( inc_t ps, auxinfo_t* ai )
{
    ai->ps_b = ps;
}

BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai )
{
    ai->ukr = ukr;
}
BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai )
{
    ai->params = params;
}

#endif

// end bli_auxinfo.h
// begin bli_param_map.h

// --- BLIS to BLAS/LAPACK mappings --------------------------------------------

// Each function takes a BLIS parameter value and writes a single character
// through the char* output argument.
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_side( side_t side, char* blas_side );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_uplo( uplo_t uplo, char* blas_uplo );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_trans( trans_t trans, char* blas_trans );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_diag( diag_t diag, char* blas_diag );
BLIS_EXPORT_BLIS void bli_param_map_blis_to_netlib_machval( machval_t machval, char* blas_machval );

// --- BLAS/LAPACK to BLIS mappings --------------------------------------------

// NOTE: These static functions were converted from regular functions in order
// to reduce function call overhead within the BLAS compatibility layer.
BLIS_INLINE void bli_param_map_netlib_to_blis_side( char side, side_t* blis_side ) { if ( side == 'l' || side == 'L' ) *blis_side = BLIS_LEFT; else if ( side == 'r' || side == 'R' ) *blis_side = BLIS_RIGHT; else { // Instead of reporting an error to the framework, default to // an arbitrary value. This is needed because this function is // called by the BLAS compatibility layer AFTER it has already // checked errors and called xerbla(). If the application wants // to override the BLAS compatibility layer's xerbla--which // responds to errors with abort()--we need to also NOT call // abort() here, since either way it has already been dealt // with. //bli_check_error_code( BLIS_INVALID_SIDE ); *blis_side = BLIS_LEFT; } } BLIS_INLINE void bli_param_map_netlib_to_blis_uplo( char uplo, uplo_t* blis_uplo ) { if ( uplo == 'l' || uplo == 'L' ) *blis_uplo = BLIS_LOWER; else if ( uplo == 'u' || uplo == 'U' ) *blis_uplo = BLIS_UPPER; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_UPLO ); *blis_uplo = BLIS_LOWER; } } BLIS_INLINE void bli_param_map_netlib_to_blis_trans( char trans, trans_t* blis_trans ) { if ( trans == 'n' || trans == 'N' ) *blis_trans = BLIS_NO_TRANSPOSE; else if ( trans == 't' || trans == 'T' ) *blis_trans = BLIS_TRANSPOSE; else if ( trans == 'c' || trans == 'C' ) *blis_trans = BLIS_CONJ_TRANSPOSE; else { // See comment for bli_param_map_netlib_to_blis_side() above. //bli_check_error_code( BLIS_INVALID_TRANS ); *blis_trans = BLIS_NO_TRANSPOSE; } } BLIS_INLINE void bli_param_map_netlib_to_blis_diag( char diag, diag_t* blis_diag ) { if ( diag == 'n' || diag == 'N' ) *blis_diag = BLIS_NONUNIT_DIAG; else if ( diag == 'u' || diag == 'U' ) *blis_diag = BLIS_UNIT_DIAG; else { // See comment for bli_param_map_netlib_to_blis_side() above. 
//bli_check_error_code( BLIS_INVALID_DIAG ); *blis_diag = BLIS_NONUNIT_DIAG; } } // --- BLIS char to BLIS mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_side( char side, side_t* blis_side ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_uplo( char uplo, uplo_t* blis_uplo ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_trans( char trans, trans_t* blis_trans ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_conj( char conj, conj_t* blis_conj ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_diag( char diag, diag_t* blis_diag ); BLIS_EXPORT_BLIS void bli_param_map_char_to_blis_dt( char dt, num_t* blis_dt ); // --- BLIS to BLIS char mappings ---------------------------------------------- BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_side( side_t blis_side, char* side ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_uplo( uplo_t blis_uplo, char* uplo ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_trans( trans_t blis_trans, char* trans ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_conj( conj_t blis_conj, char* conj ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_diag( diag_t blis_diag, char* diag ); BLIS_EXPORT_BLIS void bli_param_map_blis_to_char_dt( num_t blis_dt, char* dt ); // end bli_param_map.h // begin bli_clock.h BLIS_EXPORT_BLIS double bli_clock( void ); BLIS_EXPORT_BLIS double bli_clock_min_diff( double time_min, double time_start ); double bli_clock_helper( void ); // end bli_clock.h // begin bli_check.h BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); err_t bli_check_null_pointer( void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); err_t bli_check_nonunit_diag( obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); err_t 
bli_check_object_valid_datatype( obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); err_t bli_check_noninteger_object( obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); err_t bli_check_nonconstant_object( obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); err_t bli_check_floating_object( obj_t* a ); err_t bli_check_real_datatype( num_t dt ); err_t bli_check_real_object( obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); err_t bli_check_integer_object( obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); err_t bli_check_real_valued_object( obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); err_t bli_check_scalar_object( obj_t* a ); err_t bli_check_vector_object( obj_t* a ); err_t bli_check_matrix_object( obj_t* a ); err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); err_t bli_check_square_object( obj_t* a ); err_t bli_check_object_length_equals( obj_t* a, dim_t m ); err_t bli_check_object_width_equals( obj_t* a, dim_t n ); err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); err_t bli_check_general_object( obj_t* a ); err_t bli_check_hermitian_object( obj_t* a ); err_t bli_check_symmetric_object( obj_t* a ); err_t bli_check_triangular_object( obj_t* a ); err_t bli_check_object_struc( obj_t* a, struc_t struc ); err_t bli_check_upper_or_lower_object( obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( 
subpart_t part );
err_t bli_check_valid_3x3_subpart( subpart_t part );

err_t bli_check_valid_cntl( void* cntl );

err_t bli_check_packm_schema_on_unpack( obj_t* a );
err_t bli_check_packv_schema_on_unpack( obj_t* a );

err_t bli_check_object_buffer( obj_t* a );

err_t bli_check_valid_malloc_buf( void* ptr );

err_t bli_check_valid_packbuf( packbuf_t buf_type );
err_t bli_check_if_exhausted_pool( pool_t* pool );
err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx );
err_t bli_check_alignment_is_power_of_two( size_t align_size );
err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size );

err_t bli_check_object_alias_of( obj_t* a, obj_t* b );

err_t bli_check_valid_arch_id( arch_t id );
err_t bli_check_initialized_gks_cntx( cntx_t** cntx );

err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr );
err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr );
err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr );

// end bli_check.h
// begin bli_error.h

BLIS_EXPORT_BLIS errlev_t bli_error_checking_level( void );
BLIS_EXPORT_BLIS void     bli_error_checking_level_set( errlev_t new_level );

BLIS_EXPORT_BLIS bool     bli_error_checking_is_enabled( void );

void                      bli_print_msg( char* str, char* file, guint_t line );
BLIS_EXPORT_BLIS void     bli_abort( void );

char*                     bli_error_string_for_code( gint_t code );

// end bli_error.h
// begin bli_f2c.h
// f2c.h  --  Standard Fortran to C header file

//  barf  [ba:rf]  2.  "He suggested using FORTRAN, and everybody barfed."

//  - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition)

#ifndef BLIS_F2C_H
#define BLIS_F2C_H

// The classic f2c type names are replaced here by bla_-prefixed typedefs;
// the integer and logical types are both aliases of f77_int.
typedef f77_int bla_integer;
typedef f77_char bla_character;
//typedef char *address;
//typedef short int shortint;
typedef float bla_real;
typedef double bla_double;
typedef scomplex bla_scomplex;
typedef dcomplex bla_dcomplex;
typedef f77_int bla_logical;
//typedef short int shortlogical;
//typedef char logical1;
//typedef char integer1;
#ifdef INTEGER_STAR_8 // Adjust for integer*8.
typedef long long longint; // system-dependent
typedef unsigned long long ulongint; // system-dependent
#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b)))
#define qbit_set(a,b) ((a) | ((ulongint)1 << (b)))
#endif

#ifndef TRUE_
#define TRUE_ (1)
#endif

#ifndef FALSE_
#define FALSE_ (0)
#endif

// Extern is for use with -E
#ifndef Extern
#define Extern extern
#endif

// I/O stuff

// Both branches below define ftnlen identically (as bla_integer); only the
// commented-out legacy typedefs differ.
#ifdef f2c_i2
// for -i2
//typedef short flag;
//typedef short ftnlen;
typedef bla_integer ftnlen;
//typedef short ftnint;
#else
//typedef long int flag;
//typedef long int ftnlen;
typedef bla_integer ftnlen;
//typedef long int ftnint;
#endif

#ifndef VOID
#define VOID void
#endif

// NOTE: the min/max/abs macros below evaluate their arguments more than once,
// so arguments with side effects (e.g. i++) must not be passed to them.
#ifndef f2c_abs
  #define f2c_abs(x) ((x) >= 0 ? (x) : -(x))
#endif
// NOTE(review): f2c_dabs, f2c_dmin and f2c_dmax cast to `doublereal`, which is
// not typedef'd in this section (the double type declared above is
// bla_double). Confirm that a definition exists before using these macros.
#ifndef f2c_dabs
  #define f2c_dabs(x) (doublereal)f2c_abs(x)
#endif
#ifndef f2c_min
  #define f2c_min(a,b) ((a) <= (b) ? (a) : (b))
#endif
#ifndef f2c_max
  #define f2c_max(a,b) ((a) >= (b) ? (a) : (b))
#endif
#ifndef f2c_dmin
  #define f2c_dmin(a,b) (doublereal)f2c_min(a,b)
#endif
#ifndef f2c_dmax
  #define f2c_dmax(a,b) (doublereal)f2c_max(a,b)
#endif

#ifndef bit_test
  #define bit_test(a,b) ((a) >> (b) & 1)
#endif

// NOTE(review): bit_clear and bit_set cast to `uinteger`, which likewise is
// not typedef'd in this section. Confirm before use.
#ifndef bit_clear
  #define bit_clear(a,b) ((a) & ~((uinteger)1 << (b)))
#endif

#ifndef bit_set
  #define bit_set(a,b) ((a) | ((uinteger)1 << (b)))
#endif

// undef any lower-case symbols that your C compiler predefines, e.g.:

#ifndef Skip_f2c_Undefs
#undef cray
#undef gcos
#undef mc68010
#undef mc68020
#undef mips
#undef pdp11
#undef sgi
#undef sparc
#undef sun
#undef sun2
#undef sun3
#undef sun4
#undef u370
#undef u3b
#undef u3b2
#undef u3b5
#undef unix
#undef vax
#endif

#endif
// end bli_f2c.h

// begin bli_machval.h

// begin bli_lsame.h

bla_logical bli_lsame( bla_character* ca, bla_character* cb, ftnlen ca_len, ftnlen cb_len );
// end bli_lsame.h
// begin bli_slamch.h

bla_real bli_slamch( bla_character* cmach, ftnlen cmach_len );
// end bli_slamch.h
// begin bli_dlamch.h

bla_double bli_dlamch( bla_character* cmach, ftnlen cmach_len );
// end bli_dlamch.h

//
// Prototype object-based interface.
//
BLIS_EXPORT_BLIS void bli_machval( machval_t mval, obj_t* v );

//
// Prototype BLAS-like interfaces.
//
#undef  GENTPROTR
#define GENTPROTR( ctype_v, ctype_vr, chv, chvr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(chv,opname) \
     ( \
       machval_t mval, \
       void*     v     \
     );

INSERT_GENTPROTR_BASIC0( machval )

// end bli_machval.h
// begin bli_getopt.h

// Per-caller option-parsing state passed to bli_getopt(). The field names
// mirror the POSIX getopt() globals -- presumably with the same roles; confirm
// against the implementation.
typedef struct getopt_s
{
    char* optarg;
    int   optind;
    int   opterr;
    int   optopt;
} getopt_t;

BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state );

BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state );

// end bli_getopt.h
// begin bli_opid.h

// Returns true when opid lies in the closed range [BLIS_GEMM, BLIS_TRSM].
// This relies on the level-3 operation ids being contiguous in opid_t.
BLIS_INLINE bool bli_opid_is_level3( opid_t opid )
{
    return ( bool )
           ( BLIS_GEMM <= opid && opid <= BLIS_TRSM );
}

// end bli_opid.h
// begin bli_cntl.h

// -- Control tree prototypes --

BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node
     (
       rntm_t* rntm,
       opid_t  family,
       bszid_t bszid,
       void_fp var_func,
       void*   params,
       cntl_t* sub_node
     );

BLIS_EXPORT_BLIS void bli_cntl_free_node
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

BLIS_EXPORT_BLIS void bli_cntl_clear_node
     (
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

BLIS_EXPORT_BLIS void bli_cntl_free
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo
     (
       rntm_t*    rntm,
       cntl_t*    cntl,
       thrinfo_t* thread
     );

BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo
     (
       rntm_t*    rntm,
       cntl_t*    cntl
     );

BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

BLIS_EXPORT_BLIS void bli_cntl_mark_family
     (
       opid_t  family,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

dim_t bli_cntl_calc_num_threads_in
     (
       rntm_t* rntm,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// cntl_t query (fields only)

BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl )
{
    return cntl->family;
}
// Plain field getters: each returns the correspondingly named member of the
// control-tree node. None of them checks cntl for NULL.
BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl )
{
    return cntl->bszid;
}

BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl )
{
    return cntl->var_func;
}

BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl )
{
    return cntl->sub_prenode;
}

BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl )
{
    return cntl->sub_node;
}

BLIS_INLINE void* bli_cntl_params( cntl_t* cntl )
{
    return cntl->params;
}

// Returns the size recorded at the start of the node's params structure.
// cntl->params must be non-NULL: it is dereferenced unconditionally.
BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl )
{
    // The first 64 bits (one uint64_t) of the params structure always hold
    // the size of that structure.
    return *( ( uint64_t* )(cntl->params) );
}

// Returns the address of the mem_t embedded in the node (not a copy).
BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl )
{
    return &(cntl->pack_mem);
}

// cntl_t query (complex)

BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl )
{
    return ( bool )
           ( cntl == NULL );
}

// A node is a leaf when it has no sub_node. The sub_prenode field is not
// consulted.
BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl )
{
    return ( bool )
           ( bli_cntl_sub_node( cntl ) == NULL );
}

// A node partitions when its blocksize id is anything other than
// BLIS_NO_PART.
BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl )
{
    return ( bool )
           ( bli_cntl_bszid( cntl ) != BLIS_NO_PART );
}

// cntl_t modification
//
// Each setter stores its first argument into the correspondingly named field
// of the node passed as the last argument.

BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl )
{
    cntl->family = family;
}

BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl )
{
    cntl->bszid = bszid;
}

BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl )
{
    cntl->var_func = var_func;
}

BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl )
{
    cntl->sub_prenode = sub_prenode;
}

BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl )
{
    cntl->sub_node = sub_node;
}

BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl )
{
    cntl->params = params;
}

// Copies the mem_t pointed to by pack_mem into the node by value (struct
// assignment); the node does not retain the pointer itself.
BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl )
{
    cntl->pack_mem = *pack_mem;
}

// end bli_cntl.h
// begin bli_env.h

#ifndef BLIS_ENV_H
#define BLIS_ENV_H

gint_t bli_env_get_var( const char* env, gint_t fallback );
//void bli_env_set_var( const char* env, dim_t value );

#endif

// end bli_env.h
// begin bli_pack.h

#ifndef BLIS_PACK_H
#define BLIS_PACK_H
void bli_pack_init( void ); void bli_pack_finalize( void ); BLIS_EXPORT_BLIS void bli_pack_get_pack_a( bool* pack_a ); BLIS_EXPORT_BLIS void bli_pack_get_pack_b( bool* pack_b ); BLIS_EXPORT_BLIS void bli_pack_set_pack_a( bool pack_a ); BLIS_EXPORT_BLIS void bli_pack_set_pack_b( bool pack_b ); void bli_pack_init_rntm_from_env( rntm_t* rntm ); #endif // end bli_pack.h // begin bli_info.h // -- General library information ---------------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_version_str( void ); BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void ); // -- General configuration-related -------------------------------------------- BLIS_EXPORT_BLIS gint_t bli_info_get_int_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_num_fp_types( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_max_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_page_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_num_registers( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_simd_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_max_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_stack_buf_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_heap_addr_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_heap_stride_align_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_a( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_b( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_c( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_align_size_gen( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_a( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_b( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_c( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_pool_addr_offset_size_gen( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_stay_auto_init( void ); 
BLIS_EXPORT_BLIS gint_t bli_info_get_enable_blas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_cblas( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_blas_int_type_size( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sba_pools( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_threading( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void ); // -- Kernel implementation-related -------------------------------------------- // -- Level-3 kernel definitions -- BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ); // -- BLIS implementation query (level-3) -------------------------------------- BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt ); BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt ); BLIS_EXPORT_BLIS 
char* bli_info_get_trmm3_impl_string( num_t dt );
BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt );

// end bli_info.h
// begin bli_arch.h

#ifndef BLIS_ARCH_H
#define BLIS_ARCH_H

BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void );

void   bli_arch_set_id_once( void );
void   bli_arch_set_id( void );

BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id );

void   bli_arch_set_logging( bool dolog );
bool   bli_arch_get_logging( void );
void   bli_arch_log( char*, ... );

#endif

// end bli_arch.h
// begin bli_cpuid.h

// The block below is compiled out; it supplies stand-in definitions so the
// cpuid code can be built on its own.
#if 0
  // Used only during standalone testing of ARM support.
  #define FALSE 0
  #define TRUE 1
  typedef enum
  {
    BLIS_ARCH_CORTEXA57 = 10,
    BLIS_ARCH_CORTEXA15 = 11,
    BLIS_ARCH_CORTEXA9  = 12,
    BLIS_ARCH_GENERIC   = 13
  } arch_t;
  typedef uint64_t bool;
  #define bli_abort abort
#endif

#ifndef BLIS_CPUID_H
#define BLIS_CPUID_H

arch_t   bli_cpuid_query_id( void );

// Intel
bool     bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features );
bool     bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features );
bool     bli_cpuid_is_haswell( uint32_t family, uint32_t model, uint32_t features );
bool     bli_cpuid_is_sandybridge( uint32_t family, uint32_t model, uint32_t features );
bool     bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features );

// AMD
bool     bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features );
bool     bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features );
bool     bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features );
bool     bli_cpuid_is_excavator( uint32_t family, uint32_t model, uint32_t features );
bool     bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t features );
bool     bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features );
bool     bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features );

// ARM
// NOTE: the ARM predicates take (model, part, features) rather than the
// (family, model, features) triple used by the x86 predicates above.
bool     bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
bool     bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
bool     bli_cpuid_is_cortexa53( uint32_t model, uint32_t part, uint32_t features );
bool     bli_cpuid_is_armsve( uint32_t model, uint32_t part, uint32_t features );
bool     bli_cpuid_is_a64fx( uint32_t model, uint32_t part, uint32_t features );
bool     bli_cpuid_is_cortexa15( uint32_t model, uint32_t part, uint32_t features );
bool     bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );

uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features );

// -----------------------------------------------------------------------------

//
// This section of the file was based off of cpuid.hpp from TBLIS [1].
//
// [1] https://github.com/devinamatthews/tblis
//

// Returns true only when every bit set in `want` is also set in `have`.
// When `want` is 0 the result is always true.
BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want )
{
    return ( have & want ) == want;
}

// -----------------------------------------------------------------------------

#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)

// cpuid.h is now #included in bli_cpuid.c instead of here. See issue #393
// for more information why this move was made.
//#include "cpuid.h" void get_cpu_name( char *cpu_name ); int vpu_count( void ); enum { VENDOR_INTEL = 0, VENDOR_AMD, VENDOR_UNKNOWN }; enum { FEATURE_SSE3 = 0x0001, FEATURE_SSSE3 = 0x0002, FEATURE_SSE41 = 0x0004, FEATURE_SSE42 = 0x0008, FEATURE_AVX = 0x0010, FEATURE_AVX2 = 0x0020, FEATURE_FMA3 = 0x0040, FEATURE_FMA4 = 0x0080, FEATURE_AVX512F = 0x0100, FEATURE_AVX512DQ = 0x0200, FEATURE_AVX512PF = 0x0400, FEATURE_AVX512ER = 0x0800, FEATURE_AVX512CD = 0x1000, FEATURE_AVX512BW = 0x2000, FEATURE_AVX512VL = 0x4000 }; #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) char* find_string_in( char* target, char* buffer, size_t buf_len, char* filepath ); enum { VENDOR_ARM = 0, VENDOR_UNKNOWN }; enum { MODEL_ARMV7 = 0, MODEL_ARMV8, MODEL_UNKNOWN }; enum { FEATURE_NEON = 0x01, FEATURE_SVE = 0x02 }; #endif #endif // end bli_cpuid.h // begin bli_string.h void bli_string_mkupper( char* s ); // end bli_string.h // begin bli_setgetijm.h BLIS_EXPORT_BLIS err_t bli_setijm ( double ar, double ai, dim_t i, dim_t j, obj_t* b ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs \ ); INSERT_GENTPROT_BASIC0( setijm ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijm ( dim_t i, dim_t j, obj_t* b, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ dim_t j, \ void* restrict b, inc_t rs, inc_t cs, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijm ) // end bli_setgetijm.h // begin bli_setgetijv.h BLIS_EXPORT_BLIS err_t bli_setijv ( double ar, double ai, dim_t i, obj_t* x ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ double ar, \ double ai, \ dim_t i, \ void* restrict x, inc_t incx \ ); INSERT_GENTPROT_BASIC0( 
setijv ) // ----------------------------------------------------------------------------- BLIS_EXPORT_BLIS err_t bli_getijv ( dim_t i, obj_t* x, double* ar, double* ai ); #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ dim_t i, \ void* restrict b, inc_t incx, \ double* ar, \ double* ai \ ); INSERT_GENTPROT_BASIC0( getijv ) // end bli_setgetijv.h // begin bli_setri.h // -- setr --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setrm ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setrv ( obj_t* alpha, obj_t* x ); // -- seti --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setim ( obj_t* alpha, obj_t* b ); BLIS_EXPORT_BLIS void bli_setiv ( obj_t* alpha, obj_t* x ); // end bli_setri.h // begin bli_castm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castm ) INSERT_GENTPROT2_MIXDP0( castm ) // // Prototype object-based _check() function. // void bli_castm_check ( obj_t* a, obj_t* b ); // end bli_castm.h // begin bli_castnzm.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castnzm ( obj_t* a, obj_t* b ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. 
// #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_b, cha, chb, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ trans_t transa, \ dim_t m, \ dim_t n, \ void* a, inc_t rs_a, inc_t cs_a, \ void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) INSERT_GENTPROT2_MIXDP0( castnzm ) // // Prototype object-based _check() function. // void bli_castnzm_check ( obj_t* a, obj_t* b ); // end bli_castnzm.h // begin bli_castv.h // // Prototype object-based interface. // BLIS_EXPORT_BLIS void bli_castv ( obj_t* x, obj_t* y ); // // Prototype BLAS-like interfaces with heterogeneous-typed operands. // #undef GENTPROT2 #define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ conj_t conjx, \ dim_t n, \ void* x, inc_t incx, \ void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) INSERT_GENTPROT2_MIXDP0( castv ) // // Prototype object-based _check() function. // void bli_castv_check ( obj_t* x, obj_t* y ); // end bli_castv.h // begin bli_projm.h BLIS_EXPORT_BLIS void bli_projm ( obj_t* a, obj_t* b ); void bli_projm_check ( obj_t* a, obj_t* b ); // end bli_projm.h // begin bli_projv.h BLIS_EXPORT_BLIS void bli_projv ( obj_t* x, obj_t* y ); void bli_projv_check ( obj_t* x, obj_t* y ); // end bli_projv.h // -- Level-0 operations -- // begin bli_l0.h // begin bli_l0_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENTPROT( addsc ) GENTPROT( copysc ) GENTPROT( divsc ) GENTPROT( mulsc ) GENTPROT( sqrtsc ) GENTPROT( subsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi \ ); GENTPROT( invertsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENTPROT( absqsc ) GENTPROT( normfsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENTPROT( getsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENTPROT( setsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENTPROT( unzipsc ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENTPROT( zipsc ) // ----------------------------------------------------------------------------- void bli_l0_xsc_check ( obj_t* chi ); void bli_l0_xxsc_check ( obj_t* chi, obj_t* psi ); void bli_l0_xx2sc_check ( obj_t* chi, obj_t* norm ); void bli_l0_xxbsc_check ( obj_t* chi, obj_t* psi, bool* is_eq ); // end bli_l0_check.h // begin bli_l0_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* absq \ ); GENPROT( absqsc ) GENPROT( normfsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* psi \ ); GENPROT( addsc ) GENPROT( divsc ) GENPROT( mulsc ) GENPROT( sqrtsc ) GENPROT( subsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi \ ); GENPROT( invertsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ double* zeta_r, \ double* zeta_i \ ); GENPROT( getsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ obj_t* chi \ ); GENPROT( setsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* chi, \ obj_t* zeta_r, \ obj_t* zeta_i \ ); GENPROT( unzipsc ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* zeta_r, \ obj_t* zeta_i, \ obj_t* chi \ ); GENPROT( zipsc ) // end bli_l0_oapi.h // begin bli_l0_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// NOTE(review): the INSERT_GENTPROT*_BASIC0 / INSERT_GENTDEF* macros used
// below are defined outside this section; presumably each one invokes the
// currently defined GENTPROT / GENTPROTR / GENTDEF / GENTDEFR once per
// supported datatype (ctype / ch pair) -- confirm against their definitions.
// Because the generator macros are redefined before every group, the
// #undef / #define / INSERT_* order must be preserved.

// Operands: conjchi, chi, psi.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTPROT_BASIC0( addsc )
INSERT_GENTPROT_BASIC0( divsc )
INSERT_GENTPROT_BASIC0( mulsc )
INSERT_GENTPROT_BASIC0( subsc )

// Operands: conjchi, chi.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi \
     );

INSERT_GENTPROT_BASIC0( invertsc )

// Operands: chi (ctype), absq (ctype_r).
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype* chi, \
       ctype_r* absq \
     );

INSERT_GENTPROTR_BASIC0( absqsc )
INSERT_GENTPROTR_BASIC0( normfsc )

// Operands: chi, psi (no conj parameter).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTPROT_BASIC0( sqrtsc )

// Operands: chi plus two double pointers (zeta_r, zeta_i).
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype* chi, \
       double* zeta_r, \
       double* zeta_i \
     );

INSERT_GENTPROT_BASIC0( getsc )

// Operands: two doubles passed by value (zeta_r, zeta_i) plus chi.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       double zeta_r, \
       double zeta_i, \
       ctype* chi \
     );

INSERT_GENTPROT_BASIC0( setsc )

// Operands: chi (ctype), zeta_r and zeta_i (ctype_r).
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype* chi, \
       ctype_r* zeta_r, \
       ctype_r* zeta_i \
     );

INSERT_GENTPROTR_BASIC0( unzipsc )

// Operands: zeta_r and zeta_i (ctype_r), chi (ctype).
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       ctype_r* zeta_r, \
       ctype_r* zeta_i, \
       ctype* chi \
     );

INSERT_GENTPROTR_BASIC0( zipsc )

// -----------------------------------------------------------------------------

// Hand-written getsc/setsc prototypes whose chi operand is a dim_t pointer.

BLIS_EXPORT_BLIS void bli_igetsc
     (
       dim_t* chi,
       double* zeta_r,
       double* zeta_i
     );

BLIS_EXPORT_BLIS void bli_isetsc
     (
       double zeta_r,
       double zeta_i,
       dim_t* chi
     );
// end bli_l0_tapi.h

// begin bli_l0_ft.h

//
// -- Level-0 function types ---------------------------------------------------
//

// Each GENTDEF / GENTDEFR below declares a function-pointer typedef whose
// name is pasted together by PASTECH2(ch,opname,tsuf).

// addsc, divsc, subsc
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTDEF( addsc )
INSERT_GENTDEF( divsc )
INSERT_GENTDEF( subsc )

// invertsc
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi \
     );

INSERT_GENTDEF( invertsc )

// mulsc
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTDEF( mulsc )

// absqsc
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype_r* absq \
     );

INSERT_GENTDEFR( absqsc )

// normfsc
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype_r* norm \
     );

INSERT_GENTDEFR( normfsc )

// sqrtsc
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype* psi \
     );

INSERT_GENTDEF( sqrtsc )

// getsc
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       double* zeta_r, \
       double* zeta_i \
     );

INSERT_GENTDEF( getsc )

// setsc
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       double zeta_r, \
       double zeta_i, \
       ctype* chi \
     );

INSERT_GENTDEF( setsc )

// unzipsc
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype* chi, \
       ctype_r* zeta_r, \
       ctype_r* zeta_i \
     );

INSERT_GENTDEFR( unzipsc )

// zipsc
#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       ctype_r* zeta_r, \
       ctype_r* zeta_i, \
       ctype* chi \
     );

INSERT_GENTDEFR( zipsc )
// end bli_l0_ft.h

// Generate function pointer arrays for tapi functions.
// begin bli_l0_fpa.h

//
// Prototype function pointer query interface.
//

// Each GENPROT( op ) declares a query function taking a num_t datatype; its
// return type is the name pasted by PASTECH(op,_vft) and its own name is
// pasted by PASTEMAC(op,_qfp).
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( absqsc )
GENPROT( normfsc )
GENPROT( addsc )
GENPROT( divsc )
GENPROT( mulsc )
GENPROT( subsc )
GENPROT( invertsc )
GENPROT( sqrtsc )
GENPROT( unzipsc )
GENPROT( zipsc )

GENPROT( getsc )
GENPROT( setsc )
// end bli_l0_fpa.h

// copysc
// begin bli_copysc.h

//
// Prototype object-based interfaces.
//

#undef GENFRONT
#define GENFRONT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* chi, \
       obj_t* psi \
     );

GENFRONT( copysc )

//
// Prototype BLAS-like interfaces with heterogeneous-typed operands.
//

// The operands are declared void* here; ctype_x / ctype_y are accepted by the
// macro but do not appear in the generated prototype.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, varname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \
     ( \
       conj_t conjchi, \
       void* chi, \
       void* psi \
     );

INSERT_GENTPROT2_BASIC0( copysc )
INSERT_GENTPROT2_MIX_D0( copysc )
INSERT_GENTPROT2_MIX_P0( copysc )
// end bli_copysc.h
// end bli_l0.h

// -- Level-1v operations --

// begin bli_l1v.h

// begin bli_l1v_check.h

//
// Prototype object-based check functions.
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addv ) GENTPROT( copyv ) GENTPROT( subv ) GENTPROT( swapv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* index \ ); GENTPROT( amaxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( axpbyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyv ) GENTPROT( scal2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y, \ obj_t* rho \ ); GENTPROT( dotv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* beta, \ obj_t* rho \ ); GENTPROT( dotxv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scalv ) GENTPROT( setv ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyv ) // ----------------------------------------------------------------------------- void bli_l1v_xy_check ( obj_t* x, obj_t* y ); void bli_l1v_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1v_xby_check ( obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_axby_check ( obj_t* alpha, obj_t* x, obj_t* beta, obj_t* y ); void bli_l1v_dot_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* beta, obj_t* rho ); void bli_l1v_x_check ( obj_t* x ); void bli_l1v_ax_check ( obj_t* alpha, obj_t* x ); void bli_l1v_xi_check ( obj_t* x, obj_t* index ); // end bli_l1v_check.h // Define kernel function types. 
//#include "bli_l1v_ft_ex.h"
// begin bli_l1v_ft_ker.h

#ifndef BLIS_L1V_FT_KER_H
#define BLIS_L1V_FT_KER_H


//
// -- Level-1v kernel function types -------------------------------------------
//

// Each GENTDEF below declares a function-pointer typedef whose name is pasted
// together by PASTECH3(ch,opname,_ker,tsuf). Every kernel signature ends with
// a cntx_t* parameter and its pointer operands are restrict-qualified.
// NOTE(review): INSERT_GENTDEF is defined outside this section; presumably it
// invokes the current GENTDEF once per supported datatype -- confirm. GENTDEF
// is redefined per group, so keep the #undef / #define / INSERT order.

// addv, copyv, subv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( addv )
INSERT_GENTDEF( copyv )
INSERT_GENTDEF( subv )

// amaxv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       dim_t* restrict index, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( amaxv )

// axpbyv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpbyv )

// axpyv, scal2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyv )
INSERT_GENTDEF( scal2v )

// dotv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotv )

// dotxv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict beta, \
       ctype* restrict rho, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxv )

// invertv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( invertv )

// scalv, setv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjalpha, \
       dim_t n, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( scalv )
INSERT_GENTDEF( setv )

// swapv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( swapv )

// xpbyv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( xpbyv )


#endif

// end bli_l1v_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// (The expansion starts with a comma because it is appended directly after
// the last regular parameter in each prototype.)
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS
// end bli_oapi_ex.h

// begin bli_l1v_oapi.h

//
// Prototype object-based interfaces.
//

// This pass is emitted while EX_SUF is BLIS_OAPI_EX_SUF and
// BLIS_OAPI_EX_PARAMS is ",cntx_t* cntx, rntm_t* rntm", so every prototype
// carries the suffix and the two expert parameters.
// GENTPROT is redefined per group; keep the #undef / #define / invocation
// order intact.

// Operands: x, y.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )

// Operands: x, index.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* index \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( amaxv )

// Operands: alpha, x, beta, y.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpbyv )

// Operands: alpha, x, y.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )

// Operands: x, y, rho.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotv )

// Operands: alpha, x, y, beta, rho.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y, \
       obj_t* beta, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotxv )

// Operand: x only.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( invertv )

// Operands: alpha, x.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( scalv )
GENTPROT( setv )

// Operands: x, y (swapv gets its own definition here).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( swapv )

// Operands: x, beta, y.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( xpbyv )
// end bli_l1v_oapi.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h


// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                            rntm_t* rntm = NULL; ( void )rntm;

// end bli_oapi_ba.h

// begin bli_l1v_oapi.h

//
// Prototype object-based interfaces.
//

// NOTE(review): this is a repeat expansion of bli_l1v_oapi.h. The macros that
// parameterize it (EX_SUF, BLIS_OAPI_EX_PARAMS) are set by the section that
// precedes this one in the flattened header, so which variant of the
// prototypes is produced here depends on that preceding section.
// GENTPROT is redefined per group; keep the #undef / #define / invocation
// order intact.

// Operands: x, y.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( addv )
GENTPROT( copyv )
GENTPROT( subv )

// Operands: x, index.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* index \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( amaxv )

// Operands: alpha, x, beta, y.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpbyv )

// Operands: alpha, x, y.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( axpyv )
GENTPROT( scal2v )

// Operands: x, y, rho.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotv )

// Operands: alpha, x, y, beta, rho.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x, \
       obj_t* y, \
       obj_t* beta, \
       obj_t* rho \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( dotxv )

// Operand: x only.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( invertv )

// Operands: alpha, x.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* alpha, \
       obj_t* x \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( scalv )
GENTPROT( setv )

// Operands: x, y (swapv gets its own definition here).
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( swapv )

// Operands: x, beta, y.
#undef GENTPROT
#define GENTPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \
     ( \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
       BLIS_OAPI_EX_PARAMS \
     );

GENTPROT( xpbyv )
// end bli_l1v_oapi.h

// begin bli_xapi_undef.h


// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h


// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// (The expansion starts with a comma because it is appended directly after
// the last regular parameter in each prototype.)
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h

// begin bli_l1v_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1v_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( addv ) INSERT_GENTPROT_BASIC0( copyv ) INSERT_GENTPROT_BASIC0( subv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( amaxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpbyv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( axpyv ) INSERT_GENTPROT_BASIC0( scal2v ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( dotxv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( invertv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( scalv ) INSERT_GENTPROT_BASIC0( setv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( swapv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ INSERT_GENTPROT_BASIC0( xpbyv ) // end bli_l1v_tapi.h // begin bli_l1v_ft.h // // -- Level-1v function types -------------------------------------------------- // // addv, copyv, subv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( addv ) INSERT_GENTDEF( copyv ) INSERT_GENTDEF( subv ) // amaxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( amaxv ) // axpbyv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpbyv ) // axpyv, scal2v #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( axpyv ) INSERT_GENTDEF( scal2v ) // dotv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ 
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotv ) // dotxv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* beta, \ ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( dotxv ) // invertv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( invertv ) // scalv, setv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( scalv ) INSERT_GENTDEF( setv ) // swapv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( swapv ) // xpybv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ dim_t n, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( xpbyv ) // end bli_l1v_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). 
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l1v_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( addv ) GENPROT( copyv ) GENPROT( subv ) GENPROT( amaxv ) GENPROT( axpbyv ) GENPROT( axpyv ) GENPROT( scal2v ) GENPROT( dotv ) GENPROT( dotxv ) GENPROT( invertv ) GENPROT( scalv ) GENPROT( setv ) GENPROT( swapv ) GENPROT( xpbyv ) // end bli_l1v_fpa.h // Pack-related // NOTE: packv and unpackv are temporarily disabled. //#include "bli_packv.h" //#include "bli_unpackv.h" // Other // NOTE: scalv control tree code is temporarily disabled. //#include "bli_scalv_cntl.h" //#include "bli_scalv_int.h" // end bli_l1v.h // -- Level-1d operations -- // begin bli_l1d.h // begin bli_l1d_check.h // // Prototype object-based check functions. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENTPROT( xpbyd ) // ----------------------------------------------------------------------------- void bli_l1d_xy_check ( obj_t* x, obj_t* y ); void bli_l1d_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1d_x_check ( obj_t* x ); void bli_l1d_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1d_check.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1d_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( addd ) GENTPROT( copyd ) GENTPROT( subd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyd ) GENTPROT( scal2d ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( invertd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( scald ) GENTPROT( setd ) GENTPROT( setid ) GENTPROT( shiftd ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( xpbyd ) // end bli_l1d_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The leading comma lets the macro be appended directly after the last
// regular parameter of a prototype or typedef.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1d_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Expert pass: EX_SUF and BLIS_TAPI_EX_PARAMS were just defined above, so
// each prototype gets the expert suffix and trailing cntx/rntm parameters.
// NOTE(review): INSERT_GENTPROT_BASIC0 / INSERT_GENTPROTR_BASIC0 are defined
// elsewhere; presumably they expand GENTPROT / GENTPROTR once per supported
// datatype -- confirm against their definitions.

// addd, copyd, subd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d: as above plus a scalar alpha.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd: single matrix operand x.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd: scalar alpha with a conjugation flag, applied to x.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid: uses the two-type GENTPROTR form; alpha has type ctype_r while x
// has type ctype. The chr parameter is accepted but unused in this template.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd: scalar alpha (no conjugation flag) and matrix x.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd: x, scalar beta, y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//

// The typedefs below are function-pointer types whose parameter lists match
// the prototypes above one-for-one. Their names are pasted by PASTECH3 from
// ch, opname, EX_SUF and tsuf (PASTECH3 and the INSERT_GENTDEF helpers are
// defined elsewhere).

// addd, copyd, subd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l1d_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
//

// Basic pass: the preceding bli_tapi_ba.h section defined EX_SUF and
// BLIS_TAPI_EX_PARAMS as empty, so these templates yield un-suffixed
// prototypes without trailing cntx/rntm parameters.
// NOTE(review): INSERT_GENTPROT_BASIC0 / INSERT_GENTPROTR_BASIC0 are defined
// elsewhere; presumably they expand GENTPROT / GENTPROTR once per supported
// datatype -- confirm against their definitions.

// addd, copyd, subd
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addd )
INSERT_GENTPROT_BASIC0( copyd )
INSERT_GENTPROT_BASIC0( subd )

// axpyd, scal2d: as above plus a scalar alpha.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyd )
INSERT_GENTPROT_BASIC0( scal2d )

// invertd: single matrix operand x.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( invertd )

// scald, setd: scalar alpha with a conjugation flag, applied to x.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scald )
INSERT_GENTPROT_BASIC0( setd )

// setid: uses the two-type GENTPROTR form; alpha has type ctype_r while x
// has type ctype. The chr parameter is accepted but unused in this template.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( setid )

// shiftd: scalar alpha (no conjugation flag) and matrix x.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( shiftd )

// xpbyd: x, scalar beta, y.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbyd )

// end bli_l1d_tapi.h
// begin bli_l1d_ft.h

//
// -- Level-1d function types --------------------------------------------------
//

// The typedefs below are function-pointer types whose parameter lists match
// the prototypes above one-for-one. Their names are pasted by PASTECH3 from
// ch, opname, EX_SUF and tsuf (PASTECH3 and the INSERT_GENTDEF helpers are
// defined elsewhere).

// addd, copyd, subd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addd )
INSERT_GENTDEF( copyd )
INSERT_GENTDEF( subd )

// axpyd, scal2d

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyd )
INSERT_GENTDEF( scal2d )

// invertd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( invertd )

// scald, setd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scald )
INSERT_GENTDEF( setd )

// setid

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype_r* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( setid )

// shiftd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( shiftd )

// xpbyd

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbyd )

// end bli_l1d_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1d_fpa.h

//
// Prototype function pointer query interface.
//

// Each GENPROT( op ) line below declares one query function that takes a
// single num_t argument. Both the return type and the function name are
// token-pasted from the operation name, BLIS_TAPI_EX_SUF and the _vft / _qfp
// suffixes. BLIS_TAPI_EX_SUF is spelled out because EX_SUF was un-defined
// just above.
// NOTE(review): PASTECH2 and PASTEMAC2 are defined elsewhere in this header;
// the exact identifiers they produce are not visible here -- confirm there.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

// One query prototype per level-1d operation.
GENPROT( addd )
GENPROT( copyd )
GENPROT( subd )
GENPROT( axpyd )
GENPROT( scal2d )
GENPROT( invertd )
GENPROT( scald )
GENPROT( setd )
GENPROT( setid )
GENPROT( shiftd )
GENPROT( xpbyd )

// end bli_l1d_fpa.h

// end bli_l1d.h

// -- Level-1f operations --

// begin bli_l1f.h
// begin bli_l1f_check.h

//
// Prototype object-based check functions.
//

// GENTPROT is re-defined once per level-1f operation. Each GENTPROT( op )
// expands to the prototype of a void "<op>_check" function (name built by
// PASTEMAC, which is defined elsewhere) taking only obj_t* operands.

// axpy2v: ( alphax, alphay, x, y, z )
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alphax, \
       obj_t* alphay, \
       obj_t* x, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( axpy2v )

// axpyf: ( alpha, a, x, y )
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* y \
     );

GENTPROT( axpyf )

// dotaxpyv: ( alpha, xt, x, y, rho, z )
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* xt, \
       obj_t* x, \
       obj_t* y, \
       obj_t* rho, \
       obj_t* z \
     );

GENTPROT( dotaxpyv )

// dotxaxpyf: ( alpha, at, a, w, x, beta, y, z )
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* at, \
       obj_t* a, \
       obj_t* w, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y, \
       obj_t* z \
     );

GENTPROT( dotxaxpyf )

// dotxf: ( alpha, a, x, beta, y )
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* x, \
       obj_t* beta, \
       obj_t* y \
     );

GENTPROT( dotxf )

// end bli_l1f_check.h

// Define kernel function types.
// begin bli_l1f_ft_ker.h

// Unlike the surrounding template sections, this one is wrapped in an
// include guard, so its typedefs are emitted only once.
#ifndef BLIS_L1F_FT_KER_H
#define BLIS_L1F_FT_KER_H

//
// -- Level-1f kernel function types -------------------------------------------
//

// Kernel function-pointer types. Compared with the typed-API function types,
// the pointer parameters here are restrict-qualified, the type name uses the
// fixed _ker infix instead of EX_SUF, and the only trailing context argument
// is cntx_t* cntx (there is no rntm_t* parameter).

// axpy2v

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* restrict alpha1, \
       ctype* restrict alpha2, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
// NOTE(review): the length parameter is named m here, whereas the typed-API
// dotaxpyv prototype in this header names it n. Parameter names do not
// affect a function-pointer type, so this is cosmetic only.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* restrict alpha, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict rho, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* restrict alpha, \
       ctype* restrict a, inc_t inca, inc_t lda, \
       ctype* restrict w, inc_t incw, \
       ctype* restrict x, inc_t incx, \
       ctype* restrict beta, \
       ctype* restrict y, inc_t incy, \
       ctype* restrict z, inc_t incz, \
       cntx_t* cntx \
     );

INSERT_GENTDEF( dotxaxpyf )

#endif

// end bli_l1f_ft_ker.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The leading comma lets the macro be appended directly after the last
// regular parameter of a prototype.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. // #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. 
#undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1f_oapi.h // // Prototype object-based interfaces. 
// #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alphax, \ obj_t* alphay, \ obj_t* x, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpy2v ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( axpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* xt, \ obj_t* x, \ obj_t* y, \ obj_t* rho, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotaxpyv ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* at, \ obj_t* a, \ obj_t* w, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxaxpyf ) #undef GENTPROT #define GENTPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENTPROT( dotxf ) // end bli_l1f_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). 
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// The leading comma lets the macro be appended directly after the last
// regular parameter of a prototype or typedef.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//

// Expert pass: EX_SUF and BLIS_TAPI_EX_PARAMS were just defined above, so
// each prototype gets the expert suffix and trailing cntx/rntm parameters.
// NOTE(review): INSERT_GENTPROT_BASIC0 is defined elsewhere; presumably it
// expands GENTPROT once per supported datatype -- confirm against its
// definition.

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )

// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//

// Function-pointer types mirroring the prototypes above. Their names are
// pasted by PASTECH3 from ch, opname, EX_SUF and tsuf (PASTECH3 and
// INSERT_GENTDEF are defined elsewhere).

// axpy2v
// NOTE(review): the scalars are named alpha1/alpha2 here but alphax/alphay
// in the axpy2v prototype above; names do not affect the pointer type.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
// NOTE(review): the length parameter is named m here but n in the dotaxpyv
// prototype above; names do not affect the pointer type.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )

// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h
// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l1f_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// NOTE: BLIS_TAPI_EX_PARAMS was just defined as empty, so every prototype
// generated in this pass ends at its last operand (no cntx/rntm arguments).
// Each GENTPROT is redefined immediately before the INSERT_GENTPROT_BASIC0
// line(s) that use it; the order of these directives must be preserved.
//

// axpy2v
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alphax, \
       ctype* alphay, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpy2v )

// axpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpyf )

// dotaxpyv
// NOTE: the length parameter is named n in this prototype but m in the
// matching dotaxpyv function type in bli_l1f_ft.h below. Parameter names in
// a prototype do not affect the declared type, so the two still agree.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotaxpyv )

// dotxaxpyf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxaxpyf )

// dotxf
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( dotxf )
// end bli_l1f_tapi.h
// begin bli_l1f_ft.h

//
// -- Level-1f function types --------------------------------------------------
//
// Each GENTDEF below declares a function-pointer typedef whose name is built
// by PASTECH3(ch,opname,EX_SUF,tsuf) and whose parameter list mirrors the
// typed prototype of the same operation.
//

// axpy2v
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjx, \
       conj_t conjy, \
       dim_t n, \
       ctype* alpha1, \
       ctype* alpha2, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpy2v )

// axpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conja, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpyf )

// dotaxpyv
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjxt, \
       conj_t conjx, \
       conj_t conjy, \
       dim_t m, \
       ctype* alpha, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       ctype* rho, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotaxpyv )

// dotxf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxf )

// dotxaxpyf
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjat, \
       conj_t conja, \
       conj_t conjw, \
       conj_t conjx, \
       dim_t m, \
       dim_t b_n, \
       ctype* alpha, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* w, inc_t incw, \
       ctype* x, inc_t incx, \
       ctype* beta, \
       ctype* y, inc_t incy, \
       ctype* z, inc_t incz \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( dotxaxpyf )
// end bli_l1f_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1f_fpa.h

//
// Prototype function pointer query interface.
//
// Each GENPROT( op ) declares a function, named by
// PASTEMAC2(op,BLIS_TAPI_EX_SUF,_qfp), that takes a num_t and returns the
// type named by PASTECH2(op,BLIS_TAPI_EX_SUF,_vft).
//

#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( axpy2v )
GENPROT( axpyf )
GENPROT( dotaxpyv )
GENPROT( dotxaxpyf )
GENPROT( dotxf )
// end bli_l1f_fpa.h
// end bli_l1f.h

// -- Level-1m operations --

// begin bli_l1m.h
// begin bli_l1m_check.h

//
// Prototype object-based check functions.
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* y \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( xpbym ) // ----------------------------------------------------------------------------- void bli_l1m_xy_check ( obj_t* x, obj_t* y ); void bli_l1m_axy_check ( obj_t* alpha, obj_t* x, obj_t* y ); void bli_l1m_ax_check ( obj_t* alpha, obj_t* x ); // end bli_l1m_check.h // Define kernel function types. // begin bli_l1m_ft_ker.h #ifndef BLIS_L1M_FT_KER_H #define BLIS_L1M_FT_KER_H // // -- Level-1m kernel function types ------------------------------------------- // // packm // NOTE: This is the function type for the structure-aware "kernel". #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype* restrict kappa, \ ctype* restrict c, inc_t incc, inc_t ldc, \ ctype* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTDEF( packm ) // NOTE: the following macros generate packm kernel function type definitions // that are "ctyped" and void-typed, for each of the floating-point datatypes. 
// packm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk ) // unpackm_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conjp, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) // packm_1er_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( packm_cxk_1er ) #endif // end bli_l1m_ft_ker.h // Define object function types for variants. // begin bli_l1m_oft_var.h #ifndef BLIS_L1M_OFT_VAR_H #define BLIS_L1M_OFT_VAR_H // // -- Level-3 variant function types ------------------------------------------- // #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* a, \ obj_t* p, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( packm ) #undef GENTDEF #define GENTDEF( opname ) \ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ obj_t* p, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENTDEF( unpackm ) #endif // end bli_l1m_oft_var.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l1m_oapi.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( addm ) GENPROT( copym ) GENPROT( subm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( axpym ) GENPROT( scal2m ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( scalm ) GENPROT( setm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( xpbym ) GENPROT( xpbym_md ) // end bli_l1m_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion begins with a comma because it is placed directly
// after the last regular parameter in each prototype.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS
// end bli_tapi_ex.h
// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Expert pass: with the definitions above, each prototype is named with the
// BLIS_TAPI_EX_SUF suffix and ends with the cntx and rntm parameters.
//

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type variant; x uses ctype_x while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
// end bli_l1m_tapi.h
// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//

// addm, subm, copym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym (the same parameter list is also instantiated for xpbym_md)
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )
// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h
// begin bli_tapi_ba.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_BASIC
#define BLIS_TAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS

// Define the macro to add local expert variables that are initialized
// to NULL. The "( void )" statements are to prevent unused variable
// warnings by the compiler.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \
                           rntm_t* rntm = NULL; ( void )rntm;
// end bli_tapi_ba.h
// begin bli_l1m_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Basic pass: EX_SUF and BLIS_TAPI_EX_PARAMS are now empty, so the same
// generator text as in the expert pass yields unsuffixed prototypes without
// the cntx and rntm parameters.
//

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type variant; x uses ctype_x while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
// end bli_l1m_tapi.h
// begin bli_l1m_ft.h

//
// -- Level-1m function types --------------------------------------------------
//

// addm, subm, copym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( addm )
INSERT_GENTDEF( subm )
INSERT_GENTDEF( copym )

// axpym
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( axpym )

// scal2m
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scal2m )

// scalm, setm
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( scalm )
INSERT_GENTDEF( setm )

// xpbym (the same parameter list is also instantiated for xpbym_md)
#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( xpbym )
INSERT_GENTDEF( xpbym_md )
// end bli_l1m_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS
// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_l1m_fpa.h

//
// Prototype function pointer query interface.
//
// Each GENPROT( op ) declares a query function whose return type is named
// by PASTECH2(op,BLIS_TAPI_EX_SUF,_vft).
//

// Single-datatype queries: one num_t argument.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( addm )
GENPROT( copym )
GENPROT( subm )
GENPROT( axpym )
GENPROT( scal2m )
GENPROT( scalm )
GENPROT( setm )
GENPROT( xpbym )

// Two-datatype query (_qfp2): separate num_t arguments for x and y.
#undef GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp2)( num_t dtx, num_t dty );

GENPROT( xpbym_md )
// end bli_l1m_fpa.h

// Prototype level-1m implementations.
// begin bli_l1m_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// The _unb_var1 prototypes below always take explicit cntx and rntm
// parameters; they do not use the BLIS_TAPI_EX_PARAMS switch.
//

// addm, copym, subm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( addm )
INSERT_GENTPROT_BASIC0( copym )
INSERT_GENTPROT_BASIC0( subm )

// axpym, scal2m
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( axpym )
INSERT_GENTPROT_BASIC0( scal2m )

// scalm, setm
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       conj_t conjalpha, \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( scalm )
INSERT_GENTPROT_BASIC0( setm )

// xpbym
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
void PASTEMAC2(ch,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* beta, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( xpbym )

// xpbym_md: two-type variant; x uses ctype_x while beta and y use ctype_y.
#undef GENTPROT2
#define GENTPROT2( ctype_x, ctype_y, chx, chy, opname ) \
\
void PASTEMAC3(chx,chy,opname,_unb_var1) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype_x* x, inc_t rs_x, inc_t cs_x, \
       ctype_y* beta, \
       ctype_y* y, inc_t rs_y, inc_t cs_y, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT2_BASIC0( xpbym_md )
INSERT_GENTPROT2_MIXDP0( xpbym_md )
// end bli_l1m_unb_var1.h

// Pack-related
// begin bli_packm.h
// begin bli_packm_alloc.h

BLIS_EXPORT_BLIS void* bli_packm_alloc
     (
       siz_t size_needed,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

// Same as bli_packm_alloc but with an explicit packbuf_t argument.
BLIS_EXPORT_BLIS void* bli_packm_alloc_ex
     (
       siz_t size_needed,
       packbuf_t pack_buf_type,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_alloc.h
// begin bli_packm_cntl.h

// Parameter record read back by the bli_cntl_packm_params_*() accessors below.
struct packm_params_s
{
	uint64_t size; // size field must be present and come first.
	bszid_t bmid_m;
	bszid_t bmid_n;
	bool does_invert_diag;
	bool rev_iter_if_upper;
	bool rev_iter_if_lower;
	pack_t pack_schema;
	packbuf_t pack_buf_type;
};
typedef struct packm_params_s packm_params_t;

// Accessors: each casts cntl->params to packm_params_t* and returns the
// field named in the function's suffix. None of them checks cntl or
// cntl->params for NULL.

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m;
}

BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n;
}

BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper;
}

BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower;
}

BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema;
}

BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl )
{
	packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type;
}

// -----------------------------------------------------------------------------

// The parameters from bmid_m through pack_buf_type have the same names and
// types as the fields of packm_params_t, in the same order.
cntl_t* bli_packm_cntl_create_node
     (
       rntm_t* rntm,
       void_fp var_func,
       bszid_t bmid_m,
       bszid_t bmid_n,
       bool does_invert_diag,
       bool rev_iter_if_upper,
       bool rev_iter_if_lower,
       pack_t pack_schema,
       packbuf_t pack_buf_type,
       cntl_t* sub_node
     );
// end bli_packm_cntl.h
// begin bli_packm_check.h

void bli_packm_init_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );

void bli_packm_int_check
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx
     );
// end bli_packm_check.h
// begin bli_packm_init.h

BLIS_EXPORT_BLIS bool bli_packm_init
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_init.h
// begin bli_packm_int.h

void bli_packm_int
     (
       obj_t* a,
       obj_t* p,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_packm_int.h
// begin bli_packm_scalar.h

BLIS_EXPORT_BLIS void* bli_packm_scalar( obj_t* kappa, obj_t* p );
// end bli_packm_scalar.h
// begin bli_packm_part.h

// -- Matrix partitioning ------------------------------------------------------

void bli_packm_acquire_mpart_t2b( subpart_t requested_part,
                                  dim_t i,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_l2r( subpart_t requested_part,
                                  dim_t j,
                                  dim_t b,
                                  obj_t* obj,
                                  obj_t* sub_obj );

void bli_packm_acquire_mpart_tl2br( subpart_t requested_part,
                                    dim_t ij,
                                    dim_t b,
                                    obj_t* obj,
                                    obj_t* sub_obj );

dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p );
// end bli_packm_part.h
// begin bli_packm_struc_cxk.h

// NOTE: this parameter list ends at cntx; the _1er prototypes in
// bli_packm_struc_cxk_1er.h below have the same list plus a trailing
// void* params argument.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_struc_cxk )
INSERT_GENTPROT_BASIC0( packm_herm_cxk )
INSERT_GENTPROT_BASIC0( packm_tri_cxk )
// end bli_packm_struc_cxk.h
// begin bli_packm_struc_cxk_1er.h

#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       struc_t strucc, \
       diag_t diagc, \
       uplo_t uploc, \
       conj_t conjc, \
       pack_t schema, \
       bool invdiag, \
       dim_t panel_dim, \
       dim_t panel_len, \
       dim_t panel_dim_max, \
       dim_t panel_len_max, \
       dim_t panel_dim_off, \
       dim_t panel_len_off, \
       ctype* restrict kappa, \
       ctype* restrict c, inc_t incc, inc_t ldc, \
       ctype* restrict p, inc_t ldp, \
       inc_t is_p, \
       cntx_t* cntx, \
       void* params \
     );

INSERT_GENTPROTCO_BASIC0( packm_struc_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_herm_cxk_1er )
INSERT_GENTPROTCO_BASIC0( packm_tri_cxk_1er )
// end bli_packm_struc_cxk_1er.h
// begin bli_packm_cxk.h

#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROT_BASIC0( packm_cxk )
// end bli_packm_cxk.h
// begin bli_packm_cxk_1er.h

// Same parameter list as packm_cxk above, generated through GENTPROTCO.
#undef GENTPROTCO
#define GENTPROTCO( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       conj_t conja, \
       pack_t schema, \
       dim_t panel_dim, \
       dim_t panel_dim_max, \
       dim_t panel_len, \
       dim_t panel_len_max, \
       ctype* kappa, \
       ctype* a, inc_t inca, inc_t lda, \
       ctype* p, inc_t ldp, \
       cntx_t* cntx \
     );

INSERT_GENTPROTCO_BASIC0( packm_cxk_1er )
// end bli_packm_cxk_1er.h

// Mixed datatype support.
#ifdef BLIS_ENABLE_GEMM_MD // begin bli_packm_struc_cxk_md.h #undef GENTPROT2 #define GENTPROT2( ctype_c, ctype_p, chc, chp, varname ) \ \ void PASTEMAC2(chc,chp,varname) \ ( \ struc_t strucc, \ diag_t diagc, \ uplo_t uploc, \ conj_t conjc, \ pack_t schema, \ bool invdiag, \ dim_t panel_dim, \ dim_t panel_len, \ dim_t panel_dim_max, \ dim_t panel_len_max, \ dim_t panel_dim_off, \ dim_t panel_len_off, \ ctype_p* restrict kappa, \ ctype_c* restrict c, inc_t incc, inc_t ldc, \ ctype_p* restrict p, inc_t ldp, \ inc_t is_p, \ cntx_t* cntx, \ void* params \ ); INSERT_GENTPROT2_BASIC0( packm_struc_cxk_md ) INSERT_GENTPROT2_MIXDP0( packm_struc_cxk_md ) #undef GENTPROT2 #define GENTPROT2( ctype_a, ctype_p, cha, chp, opname ) \ \ void PASTEMAC2(cha,chp,opname) \ ( \ conj_t conja, \ dim_t m, \ dim_t n, \ ctype_p* restrict kappa, \ ctype_a* restrict a, inc_t inca, inc_t lda, \ ctype_p* restrict p, inc_t ldp \ ); INSERT_GENTPROT2_BASIC0( packm_cxk_1e_md ) INSERT_GENTPROT2_MIXDP0( packm_cxk_1e_md ) INSERT_GENTPROT2_BASIC0( packm_cxk_1r_md ) INSERT_GENTPROT2_MIXDP0( packm_cxk_1r_md ) // end bli_packm_struc_cxk_md.h #endif // begin bli_packm_blk_var1.h // // packm params types. // typedef struct { // Type of C Type of P packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES]; } packm_blk_var1_params_t; // // Prototype object-based interfaces. // BLIS_EXPORT_BLIS void bli_packm_blk_var1 ( obj_t* c, obj_t* p, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* t ); // end bli_packm_blk_var1.h // end bli_packm.h // begin bli_unpackm.h // begin bli_unpackm_cntl.h struct unpackm_params_s { uint64_t size; // size field must be present and come first. 
unpackm_var_oft var_func; }; typedef struct unpackm_params_s unpackm_params_t; #define bli_cntl_unpackm_params_var_func( cntl ) \ \ ( ( (unpackm_params_t*)(cntl)->params )->var_func ) // ----------------------------------------------------------------------------- cntl_t* bli_unpackm_cntl_create_node ( rntm_t* rntm, void_fp var_func, void_fp unpackm_var_func, cntl_t* sub_node ); // end bli_unpackm_cntl.h // begin bli_unpackm_check.h void bli_unpackm_int_check ( obj_t* p, obj_t* a, cntx_t* cntx ); // end bli_unpackm_check.h // begin bli_unpackm_int.h void bli_unpackm_int ( obj_t* p, obj_t* a, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ); // end bli_unpackm_int.h // begin bli_unpackm_blk_var1.h void bli_unpackm_blk_var1 ( obj_t* p, obj_t* c, cntx_t* cntx, cntl_t* cntl, thrinfo_t* thread ); #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ struc_t strucc, \ doff_t diagoffc, \ diag_t diagc, \ uplo_t uploc, \ trans_t transc, \ dim_t m, \ dim_t n, \ dim_t m_panel, \ dim_t n_panel, \ void* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( unpackm_blk_var1 ) // end bli_unpackm_blk_var1.h // begin bli_unpackm_cxk.h #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjp, \ dim_t panel_dim, \ dim_t panel_len, \ ctype* kappa, \ ctype* p, inc_t ldp, \ ctype* a, inc_t inca, inc_t lda, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( unpackm_cxk ) // end bli_unpackm_cxk.h // end bli_unpackm.h // end bli_l1m.h // -- Level-2 operations -- // begin bli_l2.h // begin bli_l2_check.h // // Prototype object-based check functions. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC(opname,_check) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ ); GENPROT( trmv ) GENPROT( trsv ) // ----------------------------------------------------------------------------- void bli_xxmv_check ( obj_t* alpha, obj_t* a, obj_t* x, obj_t* beta, obj_t* y ); void bli_xxr_check ( obj_t* alpha, obj_t* x, obj_t* y, obj_t* a ); // end bli_l2_check.h // Define function types. // begin bli_l2_ft_unb.h #ifndef BLIS_L2_FT_UNB_H #define BLIS_L2_FT_UNB_H // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( ger ) // hemv (and symv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ 
ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTDEF( hemv ) // her (and syr) #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEFR( her ) // her2 (and syr2) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTDEF( her2 ) // trmv (and trsv) #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_unb,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) #endif // end bli_l2_ft_unb.h // Prototype object APIs (expert and non-expert). // begin bli_oapi_ex.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_EXPERT #define BLIS_OAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_OAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
#undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS // end bli_oapi_ex.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_oapi_ba.h // This file defines macros used to allow the _oapi.c files to produce // object APIs that omit expert parameters. 
// Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #define BLIS_OAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_l2_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( gemv ) GENPROT( hemv ) GENPROT( symv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( ger ) GENPROT( her2 ) GENPROT( syr2 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( her ) GENPROT( syr ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( trmv ) GENPROT( trsv ) // end bli_l2_oapi.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. 
#undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Prototype typed APIs (expert and non-expert). // begin bli_tapi_ex.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that contain context parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_EXPERT #define BLIS_TAPI_EXPERT // Define the macro to add a suffix to the function names (in function // definitions). #undef EX_SUF #define EX_SUF BLIS_TAPI_EX_SUF // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS // end bli_tapi_ex.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( gemv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( ger ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( hemv ) INSERT_GENTPROT_BASIC0( symv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( her ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( syr ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( her2 ) INSERT_GENTPROT_BASIC0( syr2 ) #undef 
GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( trmv ) INSERT_GENTPROT_BASIC0( trsv ) // end bli_l2_tapi.h // begin bli_l2_ft.h // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( ger ) // hemv, symv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) // her #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( her ) // syr #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, 
inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( syr ) // her2, syr2 #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) // trmv, trsv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) // end bli_l2_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). 
#undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_l2_tapi.h // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( gemv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( ger ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( hemv ) INSERT_GENTPROT_BASIC0( symv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( her ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( syr ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( her2 ) INSERT_GENTPROT_BASIC0( syr2 ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( trmv ) INSERT_GENTPROT_BASIC0( trsv ) // end bli_l2_tapi.h // begin bli_l2_ft.h // // -- Level-2 function types --------------------------------------------------- // // gemv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( gemv ) // ger #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( ger ) // hemv, symv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conja, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy \ 
BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( hemv ) INSERT_GENTDEF( symv ) // her #undef GENTDEFR #define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype_r* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEFR( her ) // syr #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( syr ) // her2, syr2 #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( her2 ) INSERT_GENTDEF( syr2 ) // trmv, trsv #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTDEF( trmv ) INSERT_GENTDEF( trsv ) // end bli_l2_ft.h // begin bli_xapi_undef.h // This file un-defines macros used to allow the _oapi.c and _tapi.c files to // produce object and typed APIs that omit or contain expert parameters. // Un-define all macros that allow the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_OAPI_BASIC #undef BLIS_OAPI_EXPERT #undef BLIS_TAPI_BASIC #undef BLIS_TAPI_EXPERT // Un-define the macro to omit or add the function name suffix (in function // definitions). #undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. 
#undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // Generate function pointer arrays for tapi functions (expert only). // begin bli_l2_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( opname ) \ \ PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \ PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt ); GENPROT( gemv ) GENPROT( ger ) GENPROT( hemv ) GENPROT( symv ) GENPROT( her ) GENPROT( syr ) GENPROT( her2 ) GENPROT( syr2 ) GENPROT( trmv ) GENPROT( trsv ) // // Prototype function pointer query interfaces for level-2 implementations. // #undef GENPROT #define GENPROT( opname, varname ) \ \ PASTECH2(opname,_unb,_vft) \ PASTEMAC(varname,_qfp)( num_t dt ); GENPROT( gemv, gemv_unb_var1 ) GENPROT( gemv, gemv_unb_var2 ) GENPROT( gemv, gemv_unf_var1 ) GENPROT( gemv, gemv_unf_var2 ) GENPROT( ger, ger_unb_var1 ) GENPROT( ger, ger_unb_var2 ) GENPROT( hemv, hemv_unb_var1 ) GENPROT( hemv, hemv_unb_var2 ) GENPROT( hemv, hemv_unb_var3 ) GENPROT( hemv, hemv_unb_var4 ) GENPROT( hemv, hemv_unf_var1 ) GENPROT( hemv, hemv_unf_var3 ) GENPROT( hemv, hemv_unf_var1a ) GENPROT( hemv, hemv_unf_var3a ) GENPROT( her, her_unb_var1 ) GENPROT( her, her_unb_var2 ) GENPROT( her2, her2_unb_var1 ) GENPROT( her2, her2_unb_var2 ) GENPROT( her2, her2_unb_var3 ) GENPROT( her2, her2_unb_var4 ) GENPROT( her2, her2_unf_var1 ) GENPROT( her2, her2_unf_var4 ) GENPROT( trmv, trmv_unb_var1 ) GENPROT( trmv, trmv_unb_var2 ) GENPROT( trmv, trmv_unf_var1 ) GENPROT( trmv, trmv_unf_var2 ) GENPROT( trsv, trsv_unb_var1 ) GENPROT( trsv, trsv_unb_var2 ) GENPROT( trsv, trsv_unf_var1 ) GENPROT( trsv, trsv_unf_var2 ) // end bli_l2_fpa.h // Operation-specific headers // begin bli_gemv.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_gemv_cntl.h" //#include "bli_gemv_front.h" //#include "bli_gemv_int.h" // begin bli_gemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( gemv_blk_var1 ) GENPROT( gemv_blk_var2 ) GENPROT( gemv_unb_var1 ) GENPROT( gemv_unb_var2 ) GENPROT( gemv_unf_var1 ) GENPROT( gemv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transa, \ conj_t conjx, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemv_unb_var1 ) INSERT_GENTPROT_BASIC0( gemv_unb_var2 ) INSERT_GENTPROT_BASIC0( gemv_unf_var1 ) INSERT_GENTPROT_BASIC0( gemv_unf_var2 ) // end bli_gemv_var.h // end bli_gemv.h // begin bli_ger.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_ger_cntl.h" //#include "bli_ger_front.h" //#include "bli_ger_int.h" // begin bli_ger_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* x, \ obj_t* y, \ obj_t* a, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( ger_blk_var1 ) GENPROT( ger_blk_var2 ) GENPROT( ger_unb_var1 ) GENPROT( ger_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* a, inc_t rs_a, inc_t cs_a, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( ger_unb_var1 ) INSERT_GENTPROT_BASIC0( ger_unb_var2 ) // end bli_ger_var.h // end bli_ger.h // begin bli_hemv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_hemv_cntl.h" //#include "bli_hemv_front.h" //#include "bli_hemv_int.h" // begin bli_hemv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ obj_t* beta, \ obj_t* y, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( hemv_blk_var1 ) GENPROT( hemv_blk_var2 ) GENPROT( hemv_blk_var3 ) GENPROT( hemv_blk_var4 ) GENPROT( hemv_unb_var1 ) GENPROT( hemv_unb_var2 ) GENPROT( hemv_unb_var3 ) GENPROT( hemv_unb_var4 ) GENPROT( hemv_unf_var1 ) GENPROT( hemv_unf_var3 ) GENPROT( hemv_unf_var1a ) GENPROT( hemv_unf_var3a ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conja, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ ctype* beta, \ ctype* y, inc_t incy, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( hemv_unb_var1 ) INSERT_GENTPROT_BASIC0( hemv_unb_var2 ) INSERT_GENTPROT_BASIC0( hemv_unb_var3 ) INSERT_GENTPROT_BASIC0( hemv_unb_var4 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1 ) INSERT_GENTPROT_BASIC0( hemv_unf_var3 ) INSERT_GENTPROT_BASIC0( hemv_unf_var1a ) INSERT_GENTPROT_BASIC0( hemv_unf_var3a ) // end bli_hemv_var.h // end bli_hemv.h // begin bli_her.h // NOTE: level-2 control tree code is temporarily disabled. 
//#include "bli_her_cntl.h" //#include "bli_her_front.h" //#include "bli_her_int.h" // begin bli_her_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* x, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her_blk_var1 ) GENPROT( her_blk_var2 ) GENPROT( her_unb_var1 ) GENPROT( her_unb_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROTR_BASIC0( her_unb_var1 ) INSERT_GENTPROTR_BASIC0( her_unb_var2 ) // end bli_her_var.h // end bli_her.h // begin bli_her2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_her2_cntl.h" //#include "bli_her2_front.h" //#include "bli_her2_int.h" // begin bli_her2_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ conj_t conjh, \ obj_t* alpha, \ obj_t* alpha_conj, \ obj_t* x, \ obj_t* y, \ obj_t* c, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( her2_blk_var1 ) GENPROT( her2_blk_var2 ) GENPROT( her2_blk_var3 ) GENPROT( her2_blk_var4 ) GENPROT( her2_unb_var1 ) GENPROT( her2_unb_var2 ) GENPROT( her2_unb_var3 ) GENPROT( her2_unb_var4 ) GENPROT( her2_unf_var1 ) GENPROT( her2_unf_var4 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uplo, \ conj_t conjx, \ conj_t conjy, \ conj_t conjh, \ dim_t m, \ ctype* alpha, \ ctype* x, inc_t incx, \ ctype* y, inc_t incy, \ ctype* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( her2_unb_var1 ) INSERT_GENTPROT_BASIC0( her2_unb_var2 ) INSERT_GENTPROT_BASIC0( her2_unb_var3 ) INSERT_GENTPROT_BASIC0( her2_unb_var4 ) INSERT_GENTPROT_BASIC0( her2_unf_var1 ) INSERT_GENTPROT_BASIC0( her2_unf_var4 ) // end bli_her2_var.h // end bli_her2.h // begin bli_symv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_symv_front.h" // end bli_symv.h // begin bli_syr.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr_front.h" // end bli_syr.h // begin bli_syr2.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_syr2_front.h" // end bli_syr2.h // begin bli_trmv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trmv_cntl.h" //#include "bli_trmv_front.h" //#include "bli_trmv_int.h" // begin bli_trmv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trmv_l_blk_var1 ) GENPROT( trmv_l_blk_var2 ) GENPROT( trmv_u_blk_var1 ) GENPROT( trmv_u_blk_var2 ) GENPROT( trmv_unb_var1 ) GENPROT( trmv_unb_var2 ) GENPROT( trmv_unf_var1 ) GENPROT( trmv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trmv_unb_var1 ) INSERT_GENTPROT_BASIC0( trmv_unb_var2 ) INSERT_GENTPROT_BASIC0( trmv_unf_var1 ) INSERT_GENTPROT_BASIC0( trmv_unf_var2 ) // end bli_trmv_var.h // end bli_trmv.h // begin bli_trsv.h // NOTE: level-2 control tree code is temporarily disabled. //#include "bli_trsv_cntl.h" //#include "bli_trsv_front.h" //#include "bli_trsv_int.h" // begin bli_trsv_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* x, \ cntx_t* cntx, \ cntl_t* cntl \ ); GENPROT( trsv_l_blk_var1 ) GENPROT( trsv_l_blk_var2 ) GENPROT( trsv_u_blk_var1 ) GENPROT( trsv_u_blk_var2 ) GENPROT( trsv_unb_var1 ) GENPROT( trsv_unb_var2 ) GENPROT( trsv_unf_var1 ) GENPROT( trsv_unf_var2 ) // // Prototype BLAS-like interfaces with typed operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ uplo_t uploa, \ trans_t transa, \ diag_t diaga, \ dim_t m, \ ctype* alpha, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx, \ cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsv_unb_var1 ) INSERT_GENTPROT_BASIC0( trsv_unb_var2 ) INSERT_GENTPROT_BASIC0( trsv_unf_var1 ) INSERT_GENTPROT_BASIC0( trsv_unf_var2 ) // end bli_trsv_var.h // end bli_trsv.h // end bli_l2.h // -- Level-3 operations -- // begin bli_l3.h // begin bli_l3_cntl.h // // Prototype conditional control tree creation functions. 
//

// Takes an original control tree (cntl_orig) and reports the tree to use
// through the cntl_use out-parameter.
void bli_l3_cntl_create_if
     (
       opid_t family,
       pack_t schema_a,
       pack_t schema_b,
       obj_t* a,
       obj_t* b,
       obj_t* c,
       rntm_t* rntm,
       cntl_t* cntl_orig,
       cntl_t** cntl_use
     );

void bli_l3_cntl_free
     (
       rntm_t* rntm,
       cntl_t* cntl_use,
       thrinfo_t* thread
     );
// end bli_l3_cntl.h
// begin bli_l3_check.h

//
// Prototype object-based check functions.
//

// GENPROT is redefined four times below, once per operand signature; each
// GENPROT( op ) declares a void function named via PASTEMAC( op, _check ).
#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( gemm )
GENPROT( gemmt )
GENPROT( her2k )
GENPROT( syr2k )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( hemm )
GENPROT( symm )
GENPROT( trmm3 )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx \
     );

GENPROT( herk )
GENPROT( syrk )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       cntx_t* cntx \
     );

GENPROT( trmm )
GENPROT( trsm )

// -----------------------------------------------------------------------------

void bli_gemm_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_gemmt_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_hemm_basic_check
     (
       side_t side,
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_herk_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_her2k_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* bh,
       obj_t* b,
       obj_t* ah,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );

void bli_l3_basic_check
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx
     );
// end bli_l3_check.h
// begin bli_l3_int.h

void bli_l3_int
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_l3_int.h
// begin bli_l3_packab.h

void bli_l3_packa
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );

void bli_l3_packb
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl,
       thrinfo_t* thread
     );
// end bli_l3_packab.h

// Define function types.
//#include "bli_l3_ft_ex.h"
// begin bli_l3_ft_ukr.h

#ifndef BLIS_L3_FT_UKR_H
#define BLIS_L3_FT_UKR_H

//
// -- Level-3 micro-kernel function types --------------------------------------
//

// Each GENTDEF below defines a function-pointer typedef whose name is built
// by PASTECH3( ch, opname, _ukr, tsuf ). INSERT_GENTDEF is defined elsewhere.

// gemm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict beta, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemm )

// gemmtrsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a1x, \
       ctype* restrict a11, \
       ctype* restrict bx1, \
       ctype* restrict b11, \
       ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( gemmtrsm )

// trsm_[lu]

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \
     ( \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     );

INSERT_GENTDEF( trsm )

#endif
// end bli_l3_ft_ukr.h
// begin bli_l3_oft.h

#ifndef BLIS_L3_OFT_H
#define BLIS_L3_OFT_H

//
// -- Level-3 object function types --------------------------------------------
//

// Each GENTDEF( op ) below defines a function-pointer typedef named via
// PASTECH( op, _oft ) for the operand signature given in the template.

// gemm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( gemm )
GENTDEF( gemmt )
GENTDEF( her2k )
GENTDEF( syr2k )

// hemm, symm, trmm3

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( hemm )
GENTDEF( symm )
GENTDEF( trmm3 )

// herk, syrk

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* beta, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( herk )
GENTDEF( syrk )

// trmm, trsm

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_oft)) \
     ( \
       side_t side, \
       obj_t* alpha, \
       obj_t* a, \
       obj_t* b, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

GENTDEF( trmm )
GENTDEF( trsm )

#endif
// end bli_l3_oft.h
// begin bli_l3_oft_var.h

#ifndef BLIS_L3_OFT_VAR_H
#define BLIS_L3_OFT_VAR_H

//
// -- Level-3 variant function types -------------------------------------------
//

#undef GENTDEF
#define GENTDEF( opname ) \
\
typedef void (*PASTECH(opname,_var_oft)) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       cntl_t* cntl, \
       thrinfo_t* thread \
     );

GENTDEF( l3 )

#endif
// end bli_l3_oft_var.h
// begin bli_l3_blocksize.h

dim_t bli_l3_determine_kc
     (
       dir_t direct,
       dim_t i,
       dim_t dim,
       obj_t* a,
       obj_t* b,
       bszid_t bszid,
       cntx_t* cntx,
       cntl_t* cntl
     );

// Per-operation variants that take the direction as an argument.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dir_t direct, \
       dim_t i, \
       dim_t dim, \
       obj_t* a, \
       obj_t* b, \
       bszid_t bszid, \
       cntx_t* cntx \
     );

GENPROT( gemm_determine_kc )
GENPROT( gemmt_determine_kc )
GENPROT( trmm_determine_kc )
GENPROT( trsm_determine_kc )

// Per-operation variants with no direction argument; the _f/_b suffix is
// presumably forward/backward -- confirm against the implementations.
#undef GENPROT
#define GENPROT( opname ) \
\
dim_t PASTEMAC0(opname) \
     ( \
       dim_t i, \
       dim_t dim, \
       obj_t* a, \
       obj_t* b, \
       bszid_t bszid, \
       cntx_t* cntx \
     );

GENPROT( gemm_determine_kc_f )
GENPROT( gemm_determine_kc_b )
GENPROT( gemmt_determine_kc_f )
GENPROT( gemmt_determine_kc_b )
GENPROT( trmm_determine_kc_f )
GENPROT( trmm_determine_kc_b )
GENPROT( trsm_determine_kc_f )
GENPROT( trsm_determine_kc_b )
// end bli_l3_blocksize.h
// begin bli_l3_direct.h

dir_t bli_l3_direct
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

#undef GENPROT
#define GENPROT( opname ) \
\
dir_t PASTEMAC0(opname) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm_direct )
GENPROT( gemmt_direct )
GENPROT( trmm_direct )
GENPROT( trsm_direct )
// end bli_l3_direct.h
// begin bli_l3_prune.h

// One generic prune prototype per dimension (m, n, k), taking a control tree.
#undef GENPROT
#define GENPROT( dim ) \
\
void PASTEMAC(l3_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c, \
       cntl_t* cntl \
     );

GENPROT( m )
GENPROT( n )
GENPROT( k )

// -----------------------------------------------------------------------------

// One prune prototype per (operation, dimension) pair, without a control tree.
#undef GENPROT
#define GENPROT( opname, dim ) \
\
void PASTEMAC2(opname,_prune_unref_mparts_,dim) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c \
     );

GENPROT( gemm, m )
GENPROT( gemm, n )
GENPROT( gemm, k )
GENPROT( gemmt, m )
GENPROT( gemmt, n )
GENPROT( gemmt, k )
GENPROT( trmm, m )
GENPROT( trmm, n )
GENPROT( trmm, k )
GENPROT( trsm, m )
GENPROT( trsm, n )
GENPROT( trsm, k )
// end bli_l3_prune.h
// begin bli_l3_schema.h

void bli_l3_set_schemas
     (
       obj_t* a,
       obj_t* b,
       obj_t* c,
       cntx_t* cntx
     );
// end bli_l3_schema.h

// Prototype object APIs (basic and expert).
// begin bli_l3_oapi.h

//
// Prototype object-based interfaces (basic).
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi.h // begin bli_l3_oapi_ex.h // // Prototype object-based interfaces (expert). // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( gemm ) GENPROT( gemmt ) GENPROT( her2k ) GENPROT( syr2k ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( hemm ) GENPROT( symm ) GENPROT( trmm3 ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( herk ) GENPROT( syrk ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENPROT( trmm ) GENPROT( trsm ) // end bli_l3_oapi_ex.h // Prototype typed APIs (basic and expert). 
// begin bli_l3_tapi.h

//
// Prototype BLAS-like interfaces with typed operands (basic).
//

// Each template below declares an exported void function named via
// PASTEMAC( ch, opname ). The GENTPROTR templates (herk, her2k) additionally
// use ctype_r for some scalar arguments.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( gemm )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       conj_t conja, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )

// herk: alpha and beta are both of the ctype_r type.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype_r* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROTR_BASIC0( herk )

// her2k: only beta is of the ctype_r type.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROTR_BASIC0( her2k )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( syrk )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c \
     );

INSERT_GENTPROT_BASIC0( trmm3 )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )
// end bli_l3_tapi.h
// begin bli_l3_tapi_ex.h

//
// Prototype BLAS-like interfaces with typed operands (expert).
//

// Expert typed API: names are built by PASTEMAC2( ch, opname,
// BLIS_TAPI_EX_SUF ) and every parameter list ends with cntx and rntm.

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemm )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       conj_t conja, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( hemm )
INSERT_GENTPROT_BASIC0( symm )

// herk: alpha and beta are both of the ctype_r type.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype_r* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( herk )

// her2k: only beta is of the ctype_r type.
#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype_r* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROTR_BASIC0( her2k )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( syrk )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       uplo_t uploc, \
       trans_t transa, \
       trans_t transb, \
       dim_t m, \
       dim_t k, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( gemmt )
INSERT_GENTPROT_BASIC0( syr2k )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       trans_t transb, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       ctype* beta, \
       ctype* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm3 )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \
     ( \
       side_t side, \
       uplo_t uploa, \
       trans_t transa, \
       diag_t diaga, \
       dim_t m, \
       dim_t n, \
       ctype* alpha, \
       ctype* a, inc_t rs_a, inc_t cs_a, \
       ctype* b, inc_t rs_b, inc_t cs_b, \
       cntx_t* cntx, \
       rntm_t* rntm \
     );

INSERT_GENTPROT_BASIC0( trmm )
INSERT_GENTPROT_BASIC0( trsm )
// end bli_l3_tapi_ex.h

// Define function types for small/unpacked handlers/kernels.
// begin bli_l3_sup_oft.h #ifndef BLIS_L3_SUP_OFT_H #define BLIS_L3_SUP_OFT_H // // -- Level-3 small/unpacked object function types ----------------------------- // // gemm #undef GENTDEF #define GENTDEF( opname ) \ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); GENTDEF( gemmsup ) GENTDEF( gemmtsup ) #endif // end bli_l3_sup_oft.h // begin bli_l3_sup_ft_ker.h #ifndef BLIS_L3_SUP_FT_KER_H #define BLIS_L3_SUP_FT_KER_H // // -- Level-3 small/unpacked kernel function types ----------------------------- // // gemmsup #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ); INSERT_GENTDEF( gemmsup ) #endif // end bli_l3_sup_ft_ker.h // Define static edge case logic for use in small/unpacked kernels. //#include "bli_l3_sup_edge.h" // Prototype object API to small/unpacked matrix dispatcher. // begin bli_l3_sup.h err_t bli_gemmsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup.h // Prototype reference implementation of small/unpacked matrix handler. 
// begin bli_l3_sup_ref.h err_t bli_gemmsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); err_t bli_gemmtsup_ref ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm ); // end bli_l3_sup_ref.h // begin bli_l3_sup_int.h err_t bli_gemmsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); // end bli_l3_sup_int.h // begin bli_l3_sup_vars.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ trans_t trans, \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ stor3_t eff_id, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) GENPROT( gemmsup_ref_var2 ) GENPROT( gemmsup_ref_var1n ) GENPROT( gemmsup_ref_var2m ) // // Prototype BLAS-like interfaces with void pointer operands. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ bool packa, \ bool packb, \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ void* restrict alpha, \ void* restrict a, inc_t rs_a, inc_t cs_a, \ void* restrict b, inc_t rs_b, inc_t cs_b, \ void* restrict beta, \ void* restrict c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( num_t dt, trans_t* trans, bool packa, bool packb, stor3_t* eff_id, cntx_t* cntx ) { const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) { if ( packa && packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } else if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). 
// This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCC; // BLIS_RRR when transposed below. } } else if ( packb ) { if ( *eff_id == BLIS_RRC ) { // Since C is already row-stored, we can use BLIS_RRR kernel instead. *eff_id = BLIS_RRR; } else if ( *eff_id == BLIS_CRC ) { // BLIS_RRC when transposed below (with packa instead of packb). // No transformation is beneficial here. } else if ( *eff_id == BLIS_RCC ) { // C is already row-stored; cancel transposition and use BLIS_RCR // kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_RCR; } #if 0 // This transformation performs poorly. Theory: packing A (formerly B) // when eff_id == BLIS_RCC (formerly BLIS_CRR) to row storage is slow // and kills the performance? else if ( eff_id == BLIS_CRR ) { trans = bli_trans_toggled( trans ); eff_id = BLIS_CRC; // BLIS_RRC when transposed below. } #endif } else if ( packa ) { if ( *eff_id == BLIS_CRR ) { // Induce a transpose to make C row-stored. // BLIS_RCC when transposed below (both matrices still packed). // This allows us to use the BLIS_RRR kernel instead. *trans = bli_trans_toggled( *trans ); *eff_id = BLIS_CCR; // BLIS_RCR when transposed below. 
} } } else { //bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" ); bli_abort(); } } // end bli_l3_sup_vars.h // begin bli_l3_sup_packm_a.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ dim_t m, \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ dim_t* restrict m_max, \ dim_t* restrict k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t m_alloc, \ dim_t k_alloc, \ dim_t m, \ dim_t k, \ dim_t mr, \ ctype* restrict kappa, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_a ) // end bli_l3_sup_packm_a.h // begin bli_l3_sup_packm_b.h #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool 
will_pack, \ packbuf_t pack_buf_type, \ dim_t k, \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ pack_t* restrict schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ dim_t* restrict k_max, \ dim_t* restrict n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ dim_t* restrict pd_p, inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ packbuf_t pack_buf_type, \ stor3_t stor_id, \ trans_t transc, \ dim_t k_alloc, \ dim_t n_alloc, \ dim_t k, \ dim_t n, \ dim_t nr, \ ctype* restrict kappa, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ rntm_t* restrict rntm, \ mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) // end bli_l3_sup_packm_b.h // begin bli_l3_sup_packm_var.h // // Prototype BLAS-like interfaces to the variants. 
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ trans_t transc, \ pack_t schema, \ dim_t m, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ ctype* restrict p, inc_t rs_p, inc_t cs_p, \ cntx_t* restrict cntx, \ thrinfo_t* restrict thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) // end bli_l3_sup_packm_var.h // Prototype microkernel wrapper APIs. // begin bli_l3_ukr_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a, \ obj_t* b, \ obj_t* beta, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( gemm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* alpha, \ obj_t* a1x, \ obj_t* a11, \ obj_t* bx1, \ obj_t* b11, \ obj_t* c11, \ cntx_t* cntx \ ); GENPROT( gemmtrsm_ukernel ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx \ ); GENPROT( trsm_ukernel ) // end bli_l3_ukr_oapi.h // begin bli_l3_ukr_tapi.h // // Generate prototypes for level-3 micro-kernel wrappers. 
// #undef gemm_ukr_name #define gemm_ukr_name gemm_ukernel #undef gemmtrsm_l_ukr_name #define gemmtrsm_l_ukr_name gemmtrsm_l_ukernel #undef gemmtrsm_u_ukr_name #define gemmtrsm_u_ukr_name gemmtrsm_u_ukernel #undef trsm_l_ukr_name #define trsm_l_ukr_name trsm_l_ukernel #undef trsm_u_ukr_name #define trsm_u_ukr_name trsm_u_ukernel // Include the level-3 micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. #undef GENTPROT #define GENTPROT GEMM_UKR_PROT INSERT_GENTPROT_BASIC0( gemm_ukr_name ) #undef GENTPROT #define GENTPROT GEMMTRSM_UKR_PROT INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name ) #undef GENTPROT #define GENTPROT TRSM_UKR_PROT INSERT_GENTPROT_BASIC0( trsm_l_ukr_name ) INSERT_GENTPROT_BASIC0( trsm_u_ukr_name ) // end bli_l3_ukr.h // end bli_l3_ukr_tapi.h // Generate function pointer arrays for tapi microkernel functions. // begin bli_l3_ukr_fpa.h // // Prototype function pointer query interface. // #undef GENPROT #define GENPROT( tname, opname ) \ \ PASTECH2(tname,_ukr,_vft) \ PASTEMAC(opname,_qfp)( num_t dt ); GENPROT( gemm, gemm_ukernel ) GENPROT( gemmtrsm, gemmtrsm_l_ukernel ) GENPROT( gemmtrsm, gemmtrsm_u_ukernel ) GENPROT( trsm, trsm_l_ukernel ) GENPROT( trsm, trsm_u_ukernel ) // end bli_l3_ukr_fpa.h // Operation-specific headers. 
// begin bli_gemm.h // begin bli_gemm_cntl.h cntl_t* bli_gemm_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); // ----------------------------------------------------------------------------- cntl_t* bli_gemmbp_cntl_create ( rntm_t* rntm, opid_t family, pack_t schema_a, pack_t schema_b, void_fp ker ); #if 0 cntl_t* bli_gemmpb_cntl_create ( opid_t family, ); #endif // ----------------------------------------------------------------------------- void bli_gemm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_gemm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_gemm_cntl.h // begin bli_gemm_front.h void bli_gemm_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_gemm_front.h // begin bli_gemm_var.h // // gemm kernel parameter struct. // typedef struct { gemm_ukr_vft ukr; } gemm_ker_params_t; // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) // end bli_gemm_var.h // begin bli_gemm_ind_opt.h BLIS_INLINE void bli_gemm_ind_recast_1m_params ( num_t* dt_exec, num_t* dt_c, pack_t schema_a, obj_t* c, dim_t* m, dim_t* n, dim_t* k, inc_t* pd_a, inc_t* ps_a, inc_t* pd_b, inc_t* ps_b, inc_t* rs_c, inc_t* cs_c ) { obj_t beta; bli_obj_scalar_detach( c, &beta ); if ( bli_obj_imag_is_zero( &beta ) && !bli_is_gen_stored( *rs_c, *cs_c ) ) { *dt_exec = bli_dt_proj_to_real( *dt_exec ); *dt_c = bli_dt_proj_to_real( *dt_c ); if ( bli_is_1e_packed( schema_a ) ) { *m *= 2; *n *= 1; *k *= 2; *pd_a *= 2; *ps_a *= 2; *pd_b *= 1; *ps_b *= 2; *rs_c *= 1; *cs_c *= 2; } else { *m *= 1; *n *= 2; *k *= 2; *pd_a *= 1; *ps_a *= 2; *pd_b *= 2; *ps_b *= 2; *rs_c *= 2; *cs_c *= 1; } } } // end bli_gemm_ind_opt.h // Mixed datatype support. #ifdef BLIS_ENABLE_GEMM_MD // begin bli_gemm_md.h // begin bli_gemm_md_c2r_ref.h // -- Level-3 native micro-kernel prototype redefinitions ---------------------- #undef gemm_ukr_name #define gemm_ukr_name gemm_md_c2r_ref // Include the native micro-kernel API template. // begin bli_l3_ukr.h // // Define template prototypes for level-3 micro-kernels. // // Note: Instead of defining function prototype macro templates and then // instantiating those macros to define the individual function prototypes, // we simply alias the official operations' prototypes as defined in // bli_l3_ukr_prot.h. 
// Each group below aliases GENTPROT to one of the *_UKR_PROT macros and then
// instantiates it via INSERT_GENTPROT_BASIC0 for the named micro-kernel(s).
#undef GENTPROT
#define GENTPROT GEMM_UKR_PROT

INSERT_GENTPROT_BASIC0( gemm_ukr_name )

#undef GENTPROT
#define GENTPROT GEMMTRSM_UKR_PROT

INSERT_GENTPROT_BASIC0( gemmtrsm_l_ukr_name )
INSERT_GENTPROT_BASIC0( gemmtrsm_u_ukr_name )

#undef GENTPROT
#define GENTPROT TRSM_UKR_PROT

INSERT_GENTPROT_BASIC0( trsm_l_ukr_name )
INSERT_GENTPROT_BASIC0( trsm_u_ukr_name )

// end bli_l3_ukr.h
// end bli_gemm_md_c2r_ref.h

// Define a local struct type that makes returning two values easier.
// The two members are both of type dom_t: comp and exec.
typedef struct mddm_s
{
    dom_t comp;
    dom_t exec;
} mddm_t;

void bli_gemm_md
     (
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx_local,
       cntx_t** cntx
     );

// One prototype per three-letter suffix, each returning an mddm_t. Judging by
// the bli_gemm_md_is_crr/ccr/crc predicates further down, the letters give the
// domains (c = complex, r = real) of C, A and B, in that order.
mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );
mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx );

// -----------------------------------------------------------------------------

// Both prototypes below share the parameter list of bli_gemm_front().
void bli_gemm_md_front
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

void bli_gemm_md_zgemm
     (
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// -----------------------------------------------------------------------------

// Returns TRUE only when c is complex, a and b are both real, and the
// execution domain of c is BLIS_REAL; returns FALSE otherwise.
BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c )
{
    bool r_val = FALSE;

    // NOTE: The last conditional subexpression is necessary if/when we
    // allow the user to specify the computation domain. (The computation
    // domain is currently ignored, but once it is honored as a user-
    // settable value, it will affect the execution domain, which is what
    // is checked below. Until then, the last expression is not actually
    // necessary since crr is already unconditionally associated with an
    // execution domain of BLIS_REAL.)
    if ( bli_obj_is_complex( c ) &&
         bli_obj_is_real( a ) &&
         bli_obj_is_real( b ) &&
         bli_obj_exec_domain( c ) == BLIS_REAL )
        r_val = TRUE;

    return r_val;
}

// Returns TRUE only when c and a are complex, b is real, and the execution
// domain of c is BLIS_COMPLEX; returns FALSE otherwise.
BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c )
{
    bool r_val = FALSE;

    // NOTE: The last conditional subexpression is necessary if/when we
    // allow the user to specify the computation domain. (The computation
    // domain is currently ignored, but once it is honored as a user-
    // settable value, it will affect the execution domain, which is what
    // is checked below. Until then, the last expression is not actually
    // necessary since ccr is already unconditionally associated with an
    // execution domain of BLIS_COMPLEX.)
    if ( bli_obj_is_complex( c ) &&
         bli_obj_is_complex( a ) &&
         bli_obj_is_real( b ) &&
         bli_obj_exec_domain( c ) == BLIS_COMPLEX )
        r_val = TRUE;

    return r_val;
}

// Returns TRUE only when c and b are complex, a is real, and the execution
// domain of c is BLIS_COMPLEX; returns FALSE otherwise.
BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c )
{
    bool r_val = FALSE;

    // NOTE: The last conditional subexpression is necessary if/when we
    // allow the user to specify the computation domain. (The computation
    // domain is currently ignored, but once it is honored as a user-
    // settable value, it will affect the execution domain, which is what
    // is checked below. Until then, the last expression is not actually
    // necessary since crc is already unconditionally associated with an
    // execution domain of BLIS_COMPLEX.)
    if ( bli_obj_is_complex( c ) &&
         bli_obj_is_real( a ) &&
         bli_obj_is_complex( b ) &&
         bli_obj_exec_domain( c ) == BLIS_COMPLEX )
        r_val = TRUE;

    return r_val;
}

// -----------------------------------------------------------------------------

// Adjusts, in place, the datatypes, dimensions and strides that will be used
// by the macrokernel for three mixed-domain combinations of (C, A, B):
//   - real C, complex A, complex B (rcc);
//   - complex C, real A, complex B (crc);
//   - complex C, complex A, real B (ccr).
// Any other combination falls through without modification. dt_a and dt_b
// are passed by value and only inspected; everything passed by pointer may
// be updated. The obj_t* c is used only to read the attached scalar and the
// storage/computation precisions.
BLIS_INLINE void bli_gemm_md_ker_var2_recast
     (
       num_t* dt_comp,
       num_t dt_a,
       num_t dt_b,
       num_t* dt_c,
       dim_t* m,
       dim_t* n,
       dim_t* k,
       inc_t* pd_a, inc_t* ps_a,
       inc_t* pd_b, inc_t* ps_b,
       obj_t* c,
       inc_t* rs_c, inc_t* cs_c
     )
{
    if ( bli_is_real( *dt_c ) &&
         bli_is_complex( dt_a ) &&
         bli_is_complex( dt_b ) )
    {
        // The rcc case is executed with a real macrokernel, so we need to
        // double the k dimension (because both A and B are packed to the 1r
        // schema), and also the panel strides of A and B since they were
        // packed as complex matrices and we now need to convert them to
        // units of real elements.
        *k *= 2;
        *ps_a *= 2;
        *ps_b *= 2;
    }
    else if ( bli_is_complex( *dt_c ) &&
              bli_is_real( dt_a ) &&
              bli_is_complex( dt_b ) )
    {
        // NOTE: the "#if 1" below guards the optimized path together with its
        // trailing "else"; if it were switched to "#if 0", only the braced
        // fallback block after "#endif" would remain.
#if 1
        obj_t beta;

        bli_obj_scalar_detach( c, &beta );

        if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&
             bli_obj_imag_is_zero( &beta ) &&
             bli_is_row_stored( *rs_c, *cs_c ) &&
             bli_obj_prec( c ) == bli_obj_comp_prec( c ) )
        {
            // If beta is real, and C is row-stored, and the computation
            // precision is equal to the storage precision of C, we can use the
            // real macrokernel (and real microkernel, which is already stored
            // to the real virtual microkernel slots of the context) instead of
            // the complex macrokernel and c2r virtual microkernel.
            *dt_comp = bli_dt_proj_to_real( *dt_comp );
            *dt_c = bli_dt_proj_to_real( *dt_c );
            *n *= 2;
            *pd_b *= 2; *ps_b *= 2;
            *rs_c *= 2;
        }
        else
#endif
        {
            // Generally speaking, the crc case is executed with a complex
            // macrokernel, so we need to halve the panel stride of A (which
            // is real) since the macrokernel will perform the pointer
            // arithmetic in units of complex elements.
            *ps_a /= 2;
        }
    }
    else if ( bli_is_complex( *dt_c ) &&
              bli_is_complex( dt_a ) &&
              bli_is_real( dt_b ) )
    {
        // Mirror image of the crc branch above: same "#if 1 ... else #endif"
        // structure, but testing for column storage and scaling the m/A side.
#if 1
        obj_t beta;

        bli_obj_scalar_detach( c, &beta );

        if ( //bli_obj_imag_equals( &beta, &BLIS_ZERO ) &&
             bli_obj_imag_is_zero( &beta ) &&
             bli_is_col_stored( *rs_c, *cs_c ) &&
             bli_obj_prec( c ) == bli_obj_comp_prec( c ) )
        {
            // If beta is real, and C is column-stored, and the computation
            // precision is equal to the storage precision of C, we can use the
            // real macrokernel (and real microkernel, which is already stored
            // to the real virtual microkernel slots of the context) instead of
            // the complex macrokernel and c2r virtual microkernel.
            *dt_comp = bli_dt_proj_to_real( *dt_comp );
            *dt_c = bli_dt_proj_to_real( *dt_c );
            *m *= 2;
            *pd_a *= 2; *ps_a *= 2;
            *cs_c *= 2;
        }
        else
#endif
        {
            // Generally speaking, the ccr case is executed with a complex
            // macrokernel, so we need to halve the panel stride of B (which
            // is real) since the macrokernel will perform the pointer
            // arithmetic in units of complex elements.
            *ps_b /= 2;
        }
    }
    // NOTE(review): the disabled branches below pass dt_c (a num_t*) to
    // bli_is_real()/bli_is_complex() without dereferencing it, unlike the
    // live branches above; this would need fixing if the code were enabled.
#if 0
    else if ( bli_is_real( dt_c ) &&
              bli_is_real( dt_a ) &&
              bli_is_real( dt_b ) )
    {
        // No action needed.
//printf( "gemm_md.h: rrr: m n k are now %d %d %d\n", (int)*m, (int)*n, (int)*k );
    }
    else if ( bli_is_complex( dt_c ) &&
              bli_is_real( dt_a ) &&
              bli_is_real( dt_b ) )
    {
        // No action needed.
    }
    else if ( bli_is_real( dt_c ) &&
              bli_is_complex( dt_a ) &&
              bli_is_real( dt_b ) )
    {
        // No action needed.
    }
    else if ( bli_is_real( dt_c ) &&
              bli_is_real( dt_a ) &&
              bli_is_complex( dt_b ) )
    {
        // No action needed.
    }
#endif
}

// end bli_gemm_md.h
// Closes the "#ifdef BLIS_ENABLE_GEMM_MD" opened before the bli_gemm_md.h
// section.
#endif

// end bli_gemm.h
// begin bli_hemm.h
// begin bli_hemm_front.h

// hemm front-end prototype: like bli_gemm_front() with a leading side_t.
void bli_hemm_front
     (
       side_t side,
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// end bli_hemm_front.h
// end bli_hemm.h
// begin bli_symm.h
// begin bli_symm_front.h

// symm front-end prototype: identical parameter list to bli_hemm_front().
void bli_symm_front
     (
       side_t side,
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       obj_t* beta,
       obj_t* c,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// end bli_symm_front.h
// end bli_symm.h
// begin bli_trmm.h
// begin bli_trmm_front.h

// trmm front-end prototype: takes no beta or c operand.
void bli_trmm_front
     (
       side_t side,
       obj_t* alpha,
       obj_t* a,
       obj_t* b,
       cntx_t* cntx,
       rntm_t* rntm,
       cntl_t* cntl
     );

// end bli_trmm_front.h
// begin bli_trmm_var.h

//
// Prototype object-based interfaces.
//

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC0(opname) \
     ( \
       obj_t* a, \
       obj_t* b, \
       obj_t* c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       cntl_t* cntl, \
       thrinfo_t* thread \
     );

// The three blocked-variant prototypes are commented out; only the kernel
// variants are declared.
//GENPROT( trmm_blk_var1 )
//GENPROT( trmm_blk_var2 )
//GENPROT( trmm_blk_var3 )

GENPROT( trmm_xx_ker_var2 )

GENPROT( trmm_ll_ker_var2 )
GENPROT( trmm_lu_ker_var2 )
GENPROT( trmm_rl_ker_var2 )
GENPROT( trmm_ru_ker_var2 )

//
// Prototype BLAS-like interfaces with void pointer operands.
// #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) // end bli_trmm_var.h // end bli_trmm.h // begin bli_trmm3.h // begin bli_trmm3_front.h void bli_trmm3_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_trmm3_front.h // end bli_trmm3.h // begin bli_trsm.h // begin bli_trsm_cntl.h cntl_t* bli_trsm_cntl_create ( rntm_t* rntm, side_t side, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_l_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); cntl_t* bli_trsm_r_cntl_create ( rntm_t* rntm, pack_t schema_a, pack_t schema_b, void_fp ker ); void bli_trsm_cntl_free ( rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread ); // ----------------------------------------------------------------------------- cntl_t* bli_trsm_cntl_create_node ( rntm_t* rntm, opid_t family, bszid_t bszid, void_fp var_func, cntl_t* sub_node ); // end bli_trsm_cntl.h // begin bli_trsm_front.h void bli_trsm_front ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); #endif // end bli_trsm_front.h // begin bli_trsm_var.h // // Prototype object-based interfaces. 
// #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* b, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) GENPROT( trsm_blk_var2 ) GENPROT( trsm_blk_var3 ) GENPROT( trsm_xx_ker_var2 ) GENPROT( trsm_ll_ker_var2 ) GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. // #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ doff_t diagoff, \ pack_t schema_a, \ pack_t schema_b, \ dim_t m, \ dim_t n, \ dim_t k, \ void* alpha1, \ void* a, inc_t cs_a, \ dim_t pd_a, inc_t ps_a, \ void* b, inc_t rs_b, \ dim_t pd_b, inc_t ps_b, \ void* alpha2, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ rntm_t* rntm, \ thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) // end bli_trsm_var.h // end bli_trsm.h // begin bli_gemmt.h // begin bli_gemmt_front.h void bli_gemmt_front ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, rntm_t* rntm, cntl_t* cntl ); // end bli_gemmt_front.h // begin bli_gemmt_var.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ void PASTEMAC0(opname) \ ( \ obj_t* a, \ obj_t* ah, \ obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) // // Prototype BLAS-like interfaces with void pointer operands. 
//

// gemmt kernel prototype template. Compared with the trmm/trsm templates it
// names the diagonal offset diagoffc and adds is_a / is_b stride parameters.
#undef GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t diagoffc, \
       pack_t schema_a, \
       pack_t schema_b, \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       void* alpha, \
       void* a, inc_t cs_a, inc_t is_a, \
                dim_t pd_a, inc_t ps_a, \
       void* b, inc_t rs_b, inc_t is_b, \
                dim_t pd_b, inc_t ps_b, \
       void* beta, \
       void* c, inc_t rs_c, inc_t cs_c, \
       cntx_t* cntx, \
       rntm_t* rntm, \
       thrinfo_t* thread \
     );

INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 )
INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 )

// end bli_gemmt_var.h
// end bli_gemmt.h
// end bli_l3.h

// -- Utility operations --

// begin bli_util.h
// begin bli_util_check.h

//
// Prototype object-based check functions.
//
// Each GENPROT below is redefined with a different parameter list and then
// instantiated; every instantiation declares a void function whose name is
// formed by PASTEMAC(opname,_check).

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* asum \
     );

GENPROT( asumv )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x \
     );

GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* norm \
     );

GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* norm \
     );

GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x \
     );

GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* scale, \
       obj_t* sumsq \
     );

GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// NOTE: this one prototype is generated through GENTPROT (redefined here
// with a single opname parameter) rather than GENPROT like its neighbours.
#undef GENTPROT
#define GENTPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* chi, \
       obj_t* psi, \
       bool* is_eq \
     );

GENTPROT( eqsc )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
void PASTEMAC(opname,_check) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// -----------------------------------------------------------------------------

// Explicitly named check prototypes (not macro-generated).
void bli_utilv_xi_check
     (
       obj_t* x,
       obj_t* index
     );

void bli_utilv_xa_check
     (
       obj_t* x,
       obj_t* asum
     );

void bli_utilm_mkhst_check
     (
       obj_t* a
     );

void bli_utilv_norm_check
     (
       obj_t* x,
       obj_t* norm
     );

void bli_utilm_norm_check
     (
       obj_t* x,
       obj_t* norm
     );

void bli_utilm_fprint_check
     (
       FILE* file,
       char* s1,
       obj_t* x,
       char* format,
       char* s2
     );

void bli_utilm_rand_check
     (
       obj_t* x
     );

void bli_utilv_sumsqv_check
     (
       obj_t* x,
       obj_t* scale,
       obj_t* sumsq
     );

// end bli_util_check.h

// Prototype object APIs (expert and non-expert).
// begin bli_oapi_ex.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_EXPERT
#define BLIS_OAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_OAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion begins with a comma so that it can be appended
// directly after the last regular parameter.
#undef BLIS_OAPI_EX_PARAMS
#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_OAPI_EX_DECLS
#define BLIS_OAPI_EX_DECLS

// end bli_oapi_ex.h
// begin bli_util_oapi.h

//
// Prototype object-based interfaces.
// #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes in this conditional are emitted only while BLIS_OAPI_BASIC
// is defined. They use PASTEMAC0(opname), i.e. no EX_SUF suffix, and take no
// expert parameters.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// Same as the fprint prototypes above, minus the leading FILE* parameter.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// begin bli_oapi_ba.h

// This file defines macros used to allow the _oapi.c files to produce
// object APIs that omit expert parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#define BLIS_OAPI_BASIC

// Define the macro to omit a suffix from the function names (in function
// definitions).
// NOTE: EX_SUF is deliberately defined as empty here.
#undef EX_SUF
#define EX_SUF

// Define the macro to omit expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS #define BLIS_OAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS #define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_oapi_ba.h // begin bli_util_oapi.h // // Prototype object-based interfaces. // #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( asumv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( mkherm ) GENPROT( mksymm ) GENPROT( mktrim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1v ) GENPROT( normfv ) GENPROT( normiv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( norm1m ) GENPROT( normfm ) GENPROT( normim ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randv ) GENPROT( randnv ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( randm ) GENPROT( randnm ) #undef GENPROT #define GENPROT( opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ obj_t* x, \ obj_t* scale, \ obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); GENPROT( sumsqv ) // ----------------------------------------------------------------------------- // Operations with basic interfaces only. 
// The prototypes in this conditional are emitted only while BLIS_OAPI_BASIC
// is defined. They use PASTEMAC0(opname), i.e. no EX_SUF suffix, and take no
// expert parameters.
#ifdef BLIS_OAPI_BASIC

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       obj_t* x, \
       obj_t* y, \
       bool* is_eq \
     );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )

#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       FILE* file, \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( fprintv )
GENPROT( fprintm )

// Same as the fprint prototypes above, minus the leading FILE* parameter.
#undef GENPROT
#define GENPROT( opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC0(opname) \
     ( \
       char* s1, \
       obj_t* x, \
       char* format, \
       char* s2 \
     );

GENPROT( printv )
GENPROT( printm )

#endif // #ifdef BLIS_OAPI_BASIC

// end bli_util_oapi.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Prototype typed APIs (expert and non-expert).
// begin bli_tapi_ex.h

// This file defines macros used to allow the _tapi.c files to produce
// typed APIs that contain context parameters.

// Define a macro that allows the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_TAPI_EXPERT
#define BLIS_TAPI_EXPERT

// Define the macro to add a suffix to the function names (in function
// definitions).
#undef EX_SUF
#define EX_SUF BLIS_TAPI_EX_SUF

// Define the macro to add expert arguments to function signatures
// and prototypes.
// NOTE: the expansion begins with a comma so that it can be appended
// directly after the last regular parameter.
#undef BLIS_TAPI_EX_PARAMS
#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm

// Define the macro to omit the expert variable declaration block, since
// it is not needed when expert parameters are passed in through the API.
#undef BLIS_TAPI_EX_DECLS
#define BLIS_TAPI_EX_DECLS

// end bli_tapi_ex.h
// begin bli_util_tapi.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// GENTPROT / GENTPROTR are redefined before each group and instantiated via
// INSERT_GENTPROT_BASIC0 / INSERT_GENTPROTR_BASIC0. Each declares an exported
// void function named PASTEMAC2(ch,opname,EX_SUF) whose parameter list ends
// with BLIS_TAPI_EX_PARAMS. The GENTPROTR templates additionally use ctype_r
// for their output parameter(s).

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( asumv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( mkherm )
INSERT_GENTPROT_BASIC0( mksymm )
INSERT_GENTPROT_BASIC0( mktrim )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1v )
INSERT_GENTPROTR_BASIC0( normfv )
INSERT_GENTPROTR_BASIC0( normiv )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( norm1m )
INSERT_GENTPROTR_BASIC0( normfm )
INSERT_GENTPROTR_BASIC0( normim )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randv )
INSERT_GENTPROT_BASIC0( randnv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROT_BASIC0( randm )
INSERT_GENTPROT_BASIC0( randnm )

#undef GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.
// The prototypes in this conditional are emitted only while BLIS_TAPI_BASIC
// is defined. They use PASTEMAC(ch,opname), i.e. no EX_SUF suffix, and take
// no expert parameters.

#ifdef BLIS_TAPI_BASIC

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqsc )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTPROT_BASIC0( eqm )

// NOTE: the two print prototypes take their operand as void* rather than
// ctype*, and are instantiated with INSERT_GENTPROT_BASIC0_I rather than
// INSERT_GENTPROT_BASIC0.
#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t n, \
       void* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printv )

#undef GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char* s1, \
       dim_t m, \
       dim_t n, \
       void* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//
// Each GENTDEF / GENTDEFR below typedefs a pointer-to-function type returning
// void. The type name is built with PASTECH3(ch,opname,EX_SUF,tsuf), and the
// parameter lists mirror the corresponding typed prototypes.

// asumv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* asum \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t uploa, \
       dim_t m, \
       ctype* a, inc_t rs_a, inc_t cs_a \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// NOTE: the fprintv/fprintm types do not append BLIS_TAPI_EX_PARAMS, although
// their names are still built with EX_SUF.

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t n, \
       ctype* x, inc_t incx, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE* file, \
       char* s1, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char* format, \
       char* s2 \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t diagoffx, \
       uplo_t uplox, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq \
       BLIS_TAPI_EX_PARAMS \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.
// The types in this conditional are emitted only while BLIS_TAPI_BASIC is
// defined; their names are built with PASTECH2(ch,opname,tsuf), i.e. without
// EX_SUF.

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjchi, \
       ctype* chi, \
       ctype* psi, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t conjx, \
       dim_t n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t diagoffx, \
       diag_t diagx, \
       uplo_t uplox, \
       trans_t transx, \
       dim_t m, \
       dim_t n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       ctype* y, inc_t rs_y, inc_t cs_y, \
       bool* is_eq \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h
// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF // Un-define the macro to omit or add expert arguments from function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS #undef BLIS_TAPI_EX_PARAMS // Un-define the macro to omit or add local expert variables. #undef BLIS_OAPI_EX_DECLS #undef BLIS_TAPI_EX_DECLS // end bli_xapi_undef.h // begin bli_tapi_ba.h // This file defines macros used to allow the _tapi.c files to produce // typed APIs that omit expert parameters. // Define a macro that allows the source code to determine which interface // (basic or expert) we are compiling. #undef BLIS_TAPI_BASIC #define BLIS_TAPI_BASIC // Define the macro to omit a suffix from the function names (in function // definitions). #undef EX_SUF #define EX_SUF // Define the macro to omit expert arguments from function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS #define BLIS_TAPI_EX_PARAMS // Define the macro to add local expert variables that are initialized // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS #define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ rntm_t* rntm = NULL; ( void )rntm; // end bli_tapi_ba.h // begin bli_util_tapi.h // // Prototype BLAS-like interfaces with typed operands. 
// #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( asumv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ uplo_t uploa, \ dim_t m, \ ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( mkherm ) INSERT_GENTPROT_BASIC0( mksymm ) INSERT_GENTPROT_BASIC0( mktrim ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1v ) INSERT_GENTPROTR_BASIC0( normfv ) INSERT_GENTPROTR_BASIC0( normiv ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ diag_t diagx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROTR_BASIC0( norm1m ) INSERT_GENTPROTR_BASIC0( normfm ) INSERT_GENTPROTR_BASIC0( normim ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randv ) INSERT_GENTPROT_BASIC0( randnv ) #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ doff_t diagoffx, \ uplo_t uplox, \ dim_t m, \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); INSERT_GENTPROT_BASIC0( randm ) INSERT_GENTPROT_BASIC0( randnm ) #undef GENTPROTR #define GENTPROTR( ctype, ctype_r, ch, chr, opname ) \ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ ctype_r* scale, \ ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); 
// Instantiate the sumsqv prototype from the GENTPROTR template defined
// immediately above.
INSERT_GENTPROTR_BASIC0( sumsqv )

// -----------------------------------------------------------------------------

// Operations with basic interfaces only.
// These prototypes use PASTEMAC(ch,opname) (no EX_SUF) and take no
// BLIS_TAPI_EX_PARAMS, and they are only emitted when BLIS_TAPI_BASIC is
// defined.

#ifdef BLIS_TAPI_BASIC

// eqsc: compares the scalars chi and psi; the outcome is written to *is_eq.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqsc )

// eqv: vector comparison of x and y (length n); outcome written to *is_eq.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqv )

// eqm: matrix comparison of x and y (m x n); outcome written to *is_eq.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTPROT_BASIC0( eqm )

// printv: note that x is declared void* here, whereas the fprintv prototype
// later in this header declares it as ctype*.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  n, \
       void*  x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printv )

// printm: matrix counterpart of printv (x is likewise void*).
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       void*  x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( printm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_tapi.h
// begin bli_util_ft.h

//
// -- Utility function types ---------------------------------------------------
//
// The typedefs below mirror the prototypes of bli_util_tapi.h as
// function-pointer types. Type names are pasted from ch, opname, EX_SUF and
// tsuf via PASTECH3 (defined elsewhere).

// asumv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( asumv )

// mkherm, mksymm, mktrim

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( mkherm )
INSERT_GENTDEF( mksymm )
INSERT_GENTDEF( mktrim )

// norm1v, normfv, normiv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1v )
INSERT_GENTDEFR( normfv )
INSERT_GENTDEFR( normiv )

// norm1m, normfm, normim

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( norm1m )
INSERT_GENTDEFR( normfm )
INSERT_GENTDEFR( normim )

// fprintv
// NOTE(review): this typedef (and fprintm below) pastes EX_SUF into the type
// name but takes no BLIS_TAPI_EX_PARAMS, and it sits outside the
// BLIS_TAPI_BASIC guard -- confirm that is intended.

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintv )

// fprintm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTDEF( fprintm )

// randv, randnv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randv )
INSERT_GENTDEF( randnv )

// randm, randnm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEF( randm )
INSERT_GENTDEF( randnm )

// sumsqv

#undef  GENTDEFR
#define GENTDEFR( ctype, ctype_r, ch, chr, opname, tsuf ) \
\
typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq  \
       BLIS_TAPI_EX_PARAMS  \
     );

INSERT_GENTDEFR( sumsqv )

// -----------------------------------------------------------------------------

// Operations with only basic interfaces.
// These type names are pasted with PASTECH2 (no EX_SUF component).

#ifdef BLIS_TAPI_BASIC

// eqsc

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjchi, \
       ctype*  chi, \
       ctype*  psi, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqsc )

// eqv

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqv )

// eqm

#undef  GENTDEF
#define GENTDEF( ctype, ch, opname, tsuf ) \
\
typedef void (*PASTECH2(ch,opname,tsuf)) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y, \
       bool*   is_eq  \
     );

INSERT_GENTDEF( eqm )

#endif // #ifdef BLIS_TAPI_BASIC

// end bli_util_ft.h

// begin bli_xapi_undef.h

// This file un-defines macros used to allow the _oapi.c and _tapi.c files to
// produce object and typed APIs that omit or contain expert parameters.

// Un-define all macros that allow the source code to determine which interface
// (basic or expert) we are compiling.
#undef BLIS_OAPI_BASIC
#undef BLIS_OAPI_EXPERT
#undef BLIS_TAPI_BASIC
#undef BLIS_TAPI_EXPERT

// Un-define the macro to omit or add the function name suffix (in function
// definitions).
#undef EX_SUF

// Un-define the macro to omit or add expert arguments from function signatures
// and prototypes.
#undef BLIS_OAPI_EX_PARAMS
#undef BLIS_TAPI_EX_PARAMS

// Un-define the macro to omit or add local expert variables.
#undef BLIS_OAPI_EX_DECLS
#undef BLIS_TAPI_EX_DECLS

// end bli_xapi_undef.h

// Generate function pointer arrays for tapi functions (expert only).
// begin bli_util_fpa.h

//
// Prototype function pointer query interface.
//

// Each GENPROT( op ) declares a query function taking a num_t datatype. Its
// name is pasted from op, BLIS_TAPI_EX_SUF and "_qfp", and its return type
// from op, BLIS_TAPI_EX_SUF and "_vft". Both PASTE* helpers and
// BLIS_TAPI_EX_SUF are defined elsewhere.
#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH2(opname,BLIS_TAPI_EX_SUF,_vft) \
PASTEMAC2(opname,BLIS_TAPI_EX_SUF,_qfp)( num_t dt );

GENPROT( asumv )
GENPROT( mkherm )
GENPROT( mksymm )
GENPROT( mktrim )
GENPROT( norm1v )
GENPROT( normfv )
GENPROT( normiv )
GENPROT( norm1m )
GENPROT( normfm )
GENPROT( normim )
GENPROT( randv )
GENPROT( randnv )
GENPROT( randm )
GENPROT( randnm )
GENPROT( sumsqv )

// -----------------------------------------------------------------------------

// Same query interface for the operations below, but with no suffix component
// in the pasted names.
#undef  GENPROT
#define GENPROT( opname ) \
\
PASTECH(opname,_vft) \
PASTEMAC(opname,_qfp)( num_t dt );

GENPROT( eqsc )
GENPROT( eqv )
GENPROT( eqm )
GENPROT( fprintv )
GENPROT( fprintm )
//GENPROT( printv )
//GENPROT( printm )

// end bli_util_fpa.h

// Prototype level-1m implementations.
// begin bli_util_unb_var1.h

//
// Prototype BLAS-like interfaces with typed operands.
//
// Unlike the typed-API prototypes, the *_unb_var1 prototypes below always take
// explicit cntx_t* / rntm_t* trailing parameters (except eqv/eqm, which take
// neither and return bool) and are not marked BLIS_EXPORT_BLIS.

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( asumv_unb_var1 )

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( mkherm_unb_var1 )
INSERT_GENTPROT_BASIC0( mksymm_unb_var1 )
INSERT_GENTPROT_BASIC0( mktrim_unb_var1 )

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfv_unb_var1 )
INSERT_GENTPROTR_BASIC0( normiv_unb_var1 )

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 )
INSERT_GENTPROTR_BASIC0( normfm_unb_var1 )
INSERT_GENTPROTR_BASIC0( normim_unb_var1 )

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( randv_unb_var1 )
INSERT_GENTPROT_BASIC0( randnv_unb_var1 )

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     );

INSERT_GENTPROT_BASIC0( randm_unb_var1 )
INSERT_GENTPROT_BASIC0( randnm_unb_var1 )

#undef  GENTPROTR
#define GENTPROTR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     );

INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 )

// -----------------------------------------------------------------------------

// eqv_unb_var1 / eqm_unb_var1 return the comparison result directly as bool,
// whereas the typed-API eqv/eqm prototypes report it through a bool* is_eq
// out-parameter.
#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       conj_t  conjx, \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       ctype*  y, inc_t incy  \
     );

INSERT_GENTPROT_BASIC0( eqv_unb_var1 )

#undef  GENTPROT
#define GENTPROT( ctype, ch, varname ) \
\
bool PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
     );

INSERT_GENTPROT_BASIC0( eqm_unb_var1 )

// fprintv / fprintm are exported (BLIS_EXPORT_BLIS) and print to an explicit
// FILE*; they take no cntx/rntm parameters.
#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( fprintv )

#undef  GENTPROT
#define GENTPROT( ctype, ch, opname ) \
\
BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \
     ( \
       FILE*  file, \
       char*  s1, \
       dim_t  m, \
       dim_t  n, \
       ctype* x, inc_t rs_x, inc_t cs_x, \
       char*  format, \
       char*  s2  \
     );

INSERT_GENTPROT_BASIC0_I( fprintm )

// end bli_util_unb_var1.h

// end bli_util.h

// -- addon definitions --

// NOTE: These definitions should not be included much earlier since an addon
// may wish to utilize other types and definitions provided by BLIS.
// begin bli_addon.h

#ifndef BLIS_ADDON_H
#define BLIS_ADDON_H

// The "#if 0" selects the #else branch, so in this header addons are disabled
// (BLIS_DISABLE_ADDONS is defined and BLIS_ENABLE_ADDONS is not).
#if 0
#define BLIS_ENABLE_ADDONS
#else
#define BLIS_DISABLE_ADDONS
#endif

// Enabled addons
// (none are listed in this header)

#endif
// end bli_addon.h

// -- sandbox implementation --

// begin bli_sbox.h

#ifndef BLIS_SBOX_H
#define BLIS_SBOX_H

// Each sandbox must have a bli_sandbox.h file present somewhere inside.
// If a sandbox was enabled at configure-time, we need to #include its
// header file here so that it will get pulled into blis.h when it is
// flattened into a monolithic header.
#ifdef BLIS_ENABLE_SANDBOX
#include "bli_sandbox.h" // skipped
#endif

#endif
// end bli_sbox.h

// -- BLAS compatibility layer --

// begin bli_blas.h

// If the CBLAS compatibility layer was enabled while the BLAS layer
// was not enabled, we must enable it here.
#ifdef BLIS_ENABLE_CBLAS
#ifndef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS
#endif
#endif // BLIS_ENABLE_CBLAS

// By default, if the BLAS compatibility layer is enabled, we define
// (include) all of the BLAS prototypes. However, if the user is
// #including "blis.h" and also #including another header that also
// declares the BLAS functions, then we provide an opportunity to
// #undefine the BLIS_ENABLE_BLAS_DEFS macro (see below).
#ifdef BLIS_ENABLE_BLAS
#define BLIS_ENABLE_BLAS_DEFS
#else
#undef  BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the BLAS test drivers are being
// compiled.
#ifdef BLIS_VIA_BLASTEST
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Skip prototyping all of the BLAS if the environment has defined the
// macro BLIS_DISABLE_BLAS_DEFS.
#ifdef BLIS_DISABLE_BLAS_DEFS
#undef BLIS_ENABLE_BLAS_DEFS
#endif

// Begin including all BLAS prototypes.
// Everything from here on is guarded by BLIS_ENABLE_BLAS_DEFS. The matching
// #endif lies beyond this section of the header.
#ifdef BLIS_ENABLE_BLAS_DEFS

// -- System headers needed by BLAS compatibility layer --

// NOTE(review): the header name is missing from the #include below; this looks
// like an artifact of how the text was extracted -- confirm against the
// original bli_blas.h before relying on it.
#include // skipped

// -- Constants --

// Presumably 7 characters plus a terminating NUL; the bla_*_check macros later
// in this header use it to size their func_str buffers.
#define BLIS_MAX_BLAS_FUNC_STR_LENGTH (7+1)

// -- Utility macros --

// The bla_* helpers below are plain C prototypes (no name-mangling macro and
// no export attribute), each guarded by BLIS_ENABLE_BLAS.

// begin bla_r_sign.h

#ifdef BLIS_ENABLE_BLAS

double bla_r_sign(const bla_real *a, const bla_real *b);

#endif
// end bla_r_sign.h
// begin bla_d_sign.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_sign(const bla_double *a, const bla_double *b);

#endif
// end bla_d_sign.h

// begin bla_r_cnjg.h

#ifdef BLIS_ENABLE_BLAS

void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src);

#endif
// end bla_r_cnjg.h
// begin bla_d_cnjg.h

#ifdef BLIS_ENABLE_BLAS

void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src);

#endif
// end bla_d_cnjg.h

// begin bla_r_imag.h

#ifdef BLIS_ENABLE_BLAS

bla_real bla_r_imag(const bla_scomplex *z);

#endif
// end bla_r_imag.h
// begin bla_d_imag.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_imag(const bla_dcomplex *z);

#endif
// end bla_d_imag.h

// begin bla_c_div.h

#ifdef BLIS_ENABLE_BLAS

void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp);

#endif
// end bla_c_div.h
// begin bla_z_div.h

#ifdef BLIS_ENABLE_BLAS

void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp);

#endif
// end bla_z_div.h

// begin bla_f__cabs.h

#ifdef BLIS_ENABLE_BLAS

double bla_f__cabs(double real, double imag);

#endif
// end bla_f__cabs.h
// begin bla_r_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_r_abs(const bla_real *x);

#endif
// end bla_r_abs.h
// begin bla_d_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_d_abs(const bla_double *x);

#endif
// end bla_d_abs.h
// begin bla_c_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_c_abs(const bla_scomplex *z);

#endif
// end bla_c_abs.h
// begin bla_z_abs.h

#ifdef BLIS_ENABLE_BLAS

double bla_z_abs(const bla_dcomplex *z);

#endif
// end bla_z_abs.h

// begin bla_lsame.h

#ifdef BLIS_ENABLE_BLAS

// lsame is declared with long arguments/return when LAPACK_ILP64 is defined
// and with int otherwise.
// NOTE(review): the LAPACK_ILP64 variant is declared without BLIS_EXPORT_BLAS,
// unlike the default variant -- confirm this is intentional.
#ifdef LAPACK_ILP64
long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len);
#else
BLIS_EXPORT_BLAS int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len);
#endif

#endif
// end bla_lsame.h
// begin bla_xerbla.h

#ifdef BLIS_ENABLE_BLAS

// xerbla is additionally tagged BLIS_OVERRIDABLE (defined elsewhere).
BLIS_EXPORT_BLAS BLIS_OVERRIDABLE int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);

#endif
// end bla_xerbla.h
// begin bla_xerbla_array.h

#ifdef BLIS_ENABLE_BLAS

// Note the differing argument order from xerbla: the name length is passed by
// value as the second argument, before info.
BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);

#endif
// end bla_xerbla_array.h

// -- Level-0 BLAS prototypes --

// begin bla_cabs1.h

#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS bla_real   PASTEF77(s,cabs1)(bla_scomplex *z);
BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);

#endif
// end bla_cabs1.h

// -- Level-1 BLAS prototypes --

// In the BLAS-compatibility templates that follow, scalar, dimension and
// increment arguments are passed by pointer. The template #define itself is
// unconditional; only the INSERT_* instantiation is guarded by
// BLIS_ENABLE_BLAS.

// begin bla_amax.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// amax returns an f77_int; the function name is pasted from "i", chx and
// blasname.
#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end bla_amax.h
// begin bla_asum.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// asum returns ftype_r; the function name is pasted from chr, chx and
// blasname.
#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end bla_asum.h
// begin bla_axpy.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// axpy: alpha and x are const inputs; y is the only non-const operand.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
             ftype*   y, const f77_int* incy  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpy )
#endif

// end bla_axpy.h
// begin bla_copy.h

//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( copy ) #endif // end bla_copy.h // begin bla_dot.h #ifdef BLIS_ENABLE_BLAS // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTR_BLAS( dot ) #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL INSERT_GENTPROTDOTC_BLAS( dot ) #else // For the "intel" complex return type, we use a hidden parameter (passed by // address) to return the result. #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \ ( \ ftype* rhop, \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy \ ); INSERT_GENTPROTDOTC_BLAS( dot ) #endif // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS float PASTEF77(sd,sdot) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); BLIS_EXPORT_BLAS double PASTEF77(d,sdot) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy ); #endif // end bla_dot.h // begin bla_nrm2.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end bla_nrm2.h // begin bla_rot.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s); #endif // end bla_rot.h // begin bla_rotg.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s); BLIS_EXPORT_BLAS int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s); BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s); BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s); #endif // end bla_rotg.h // begin bla_rotm.h #ifdef BLIS_ENABLE_BLAS BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam); BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam); #endif // end bla_rotm.h // begin bla_rotmg.h 
#ifdef BLIS_ENABLE_BLAS

// rotmg: only the fourth argument (sy1/dy1) is const; the parameter array
// (sparam/dparam) is non-const here, unlike in rotm.
BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam);
BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);

#endif
// end bla_rotmg.h
// begin bla_scal.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// scal: alpha and x have separate type parameters (ftype_a / ftype_x); the
// function name is pasted in the order chx, cha, blasname.
#undef  GENTPROTSCAL
#define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
     ( \
       const f77_int* n, \
       const ftype_a* alpha, \
       ftype_x* x, const f77_int* incx  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTSCAL_BLAS( scal )
#endif

// end bla_scal.h
// begin bla_swap.h

//
// Prototype BLAS-to-BLIS interfaces.
//
// swap: both x and y are non-const.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       ftype*   x, const f77_int* incx, \
       ftype*   y, const f77_int* incy  \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( swap )
#endif

// end bla_swap.h

// The f77_*_sub.h sections declare "sub" variants: void functions whose name
// has "sub" pasted on as an extra component and which take a trailing rval
// pointer. Their instantiations are guarded by BLIS_ENABLE_CBLAS rather than
// BLIS_ENABLE_BLAS.

// begin f77_amax_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//

#undef  GENTPROT
#define GENTPROT( ftype_x, chx, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
             f77_int* rval  \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROT_BLAS( amax )
#endif

// end f77_amax_sub.h
// begin f77_asum_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
//

#undef  GENTPROTR2
#define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
     ( \
       const f77_int* n, \
       const ftype_x* x, const f77_int* incx, \
             ftype_r* rval  \
     );

#ifdef BLIS_ENABLE_CBLAS
INSERT_GENTPROTR2_BLAS( asum )
#endif

// end f77_asum_sub.h
// begin f77_dot_sub.h

//
// Prototype CBLAS subroutine wrapper interfaces.
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, ch, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \ ( \ const f77_int* n, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTDOT_BLAS( dot ) // -- "Black sheep" dot product function prototypes -- BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub) ( const f77_int* n, const float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval ); BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub) ( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* rval ); #endif // end f77_dot_sub.h // begin f77_nrm2_sub.h // // Prototype CBLAS subroutine wrapper interfaces. // #undef GENTPROTR2 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \ ( \ const f77_int* n, \ const ftype_x* x, const f77_int* incx, \ ftype_r* rval \ ); #ifdef BLIS_ENABLE_CBLAS INSERT_GENTPROTR2_BLAS( nrm2 ) #endif // end f77_nrm2_sub.h // -- Level-2 BLAS prototypes -- // dense // begin bla_gemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( gemv ) #endif // end bla_gemv.h // begin bla_ger.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTDOT #define GENTPROTDOT( ftype, chxy, chc, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \ ( \ const f77_int* m, \ const f77_int* n, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTDOT_BLAS( ger ) #endif // end bla_ger.h // begin bla_hemv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( hemv ) #endif // end bla_hemv.h // begin bla_her.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype_r* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her ) #endif // end bla_her.h // begin bla_her2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( her2 ) #endif // end bla_her2.h // begin bla_symv.h // // Prototype BLAS-to-BLIS interfaces. 
// #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* x, const f77_int* incx, \ const ftype* beta, \ ftype* y, const f77_int* incy \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( symv ) #endif // end bla_symv.h // begin bla_syr.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr ) #endif // end bla_syr.h // begin bla_syr2.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROTRO #define GENTPROTRO( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_int* m, \ const ftype* alpha, \ const ftype* x, const f77_int* incx, \ const ftype* y, const f77_int* incy, \ ftype* a, const f77_int* lda \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTRO_BLAS( syr2 ) #endif // end bla_syr2.h // begin bla_trmv.h // // Prototype BLAS-to-BLIS interfaces. // #undef GENTPROT #define GENTPROT( ftype, ch, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* uploa, \ const f77_char* transa, \ const f77_char* diaga, \ const f77_int* m, \ const ftype* a, const f77_int* lda, \ ftype* x, const f77_int* incx \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROT_BLAS( trmv ) #endif // end bla_trmv.h // begin bla_trsv.h // // Prototype BLAS-to-BLIS interfaces. 
//
// trsv. Same parameter list as the trmv prototype generator.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const ftype*    a, const f77_int* lda, \
             ftype*    x, const f77_int* incx \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsv )
#endif
// end bla_trsv.h

// NOTE on every bla_*_check macro below: each expands to a brace-enclosed
// block that evaluates its tests in order and stores a code in a local
// 'info' for the first test that fails. If info is nonzero, the routine name
// is formatted from dt_str/op_str into a local buffer, passed through
// bli_string_mkupper, handed to xerbla together with &info, and the macro
// then executes 'return;' -- so it must be expanded inside a function
// returning void. All integer arguments are dereferenced (pointers expected).

// begin bla_gemv_check.h


#ifdef BLIS_ENABLE_BLAS

// gemv: info = 1 (transa not "N"/"T"/"C"), 2 (*m < 0), 3 (*n < 0),
// 6 (*lda < max(1,*m)), 8 (*incx == 0), 11 (*incy == 0).
#define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
{ \
    f77_int info = 0; \
    f77_int nota, ta, conja; \
\
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !nota && !ta && !conja ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *n < 0 ) \
        info = 3; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 6; \
    else if ( *incx == 0 ) \
        info = 8; \
    else if ( *incy == 0 ) \
        info = 11; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemv_check.h
// begin bla_ger_check.h


#ifdef BLIS_ENABLE_BLAS

// ger: info = 1 (*m < 0), 2 (*n < 0), 5 (*incx == 0), 7 (*incy == 0),
// 9 (*lda < max(1,*m)). The reported name also appends conj_str.
#define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
{ \
    f77_int info = 0; \
\
    if      ( *m < 0 ) \
        info = 1; \
    else if ( *n < 0 ) \
        info = 2; \
    else if ( *incx == 0 ) \
        info = 5; \
    else if ( *incy == 0 ) \
        info = 7; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 9; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
\
        sprintf( func_str, "%s%s%-2s", dt_str, op_str, conj_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_ger_check.h
// begin bla_hemv_check.h


#ifdef BLIS_ENABLE_BLAS

// hemv: info = 1 (uploa not "L"/"U"), 2 (*m < 0), 5 (*lda < max(1,*m)),
// 7 (*incx == 0), 10 (*incy == 0).
#define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 5; \
    else if ( *incx == 0 ) \
        info = 7; \
    else if ( *incy == 0 ) \
        info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_hemv_check.h
// begin bla_her_check.h


#ifdef BLIS_ENABLE_BLAS

// her: info = 1 (uploa not "L"/"U"), 2 (*m < 0), 5 (*incx == 0),
// 7 (*lda < max(1,*m)).
#define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *incx == 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 7; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her_check.h
// begin bla_her2_check.h


#ifdef BLIS_ENABLE_BLAS

// her2: info = 1 (uploa not "L"/"U"), 2 (*m < 0), 5 (*incx == 0),
// 7 (*incy == 0), 9 (*lda < max(1,*m)).
#define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
\
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( *m < 0 ) \
        info = 2; \
    else if ( *incx == 0 ) \
        info = 5; \
    else if ( *incy == 0 ) \
        info = 7; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 9; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her2_check.h
// begin bla_symv_check.h


#ifdef BLIS_ENABLE_BLAS

// symv reuses the hemv argument checks unchanged.
#define bla_symv_check bla_hemv_check

#endif
// end bla_symv_check.h
// begin bla_syr_check.h


#ifdef BLIS_ENABLE_BLAS

// syr reuses the her argument checks unchanged.
#define bla_syr_check bla_her_check

#endif
// end bla_syr_check.h
// begin bla_syr2_check.h


#ifdef BLIS_ENABLE_BLAS

// syr2 reuses the her2 argument checks unchanged.
#define bla_syr2_check bla_her2_check

#endif
// end bla_syr2_check.h
// begin bla_trmv_check.h


#ifdef BLIS_ENABLE_BLAS

// trmv: info = 1 (uploa not "L"/"U"), 2 (transa not "N"/"T"/"C"),
// 3 (diaga not "U"/"N"), 4 (*m < 0), 6 (*lda < max(1,*m)), 8 (*incx == 0).
#define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
{ \
    f77_int info = 0; \
    f77_int lower, upper; \
    f77_int nota, ta, conja; \
    f77_int unita, nonua; \
\
    lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !ta && !conja ) \
        info = 2; \
    else if ( !unita && !nonua ) \
        info = 3; \
    else if ( *m < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, *m ) ) \
        info = 6; \
    else if ( *incx == 0 ) \
        info = 8; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_trmv_check.h
// begin bla_trsv_check.h


#ifdef BLIS_ENABLE_BLAS

// trsv reuses the trmv argument checks unchanged.
#define bla_trsv_check bla_trmv_check

#endif
// end bla_trsv_check.h

// packed
// The packed and banded routines below are declared explicitly (one
// prototype per datatype letter) rather than through a GENTPROT macro, and
// they are declared as returning int.
// begin bla_hpmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(
    const bla_character *uplo, const bla_integer *n,
    const bla_scomplex *alpha, const bla_scomplex *ap,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *beta,
    bla_scomplex *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(
    const bla_character *uplo, const bla_integer *n,
    const bla_dcomplex *alpha, const bla_dcomplex *ap,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *beta,
    bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hpmv.h
// begin bla_hpr.h


#ifdef BLIS_ENABLE_BLAS

// alpha is declared with the real type (bla_real / bla_double) for hpr.
BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha,
    const bla_scomplex *x, const bla_integer *incx,
    bla_scomplex *ap);

BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha,
    const bla_dcomplex *x, const bla_integer *incx,
    bla_dcomplex *ap);

#endif
// end bla_hpr.h
// begin bla_hpr2.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(
    const bla_character *uplo, const bla_integer *n,
    const bla_scomplex *alpha,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *y, const bla_integer *incy,
    bla_scomplex *ap);

BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(
    const bla_character *uplo, const bla_integer *n,
    const bla_dcomplex *alpha,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *y, const bla_integer *incy,
    bla_dcomplex *ap);

#endif
// end bla_hpr2.h
// begin bla_spmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha, const bla_double *ap,
    const bla_double *x, const bla_integer *incx,
    const bla_double *beta,
    bla_double *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha, const bla_real *ap,
    const bla_real *x, const bla_integer *incx,
    const bla_real *beta,
    bla_real *y, const bla_integer *incy);

#endif
// end bla_spmv.h
// begin bla_spr.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr)(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha,
    const bla_double *x, const bla_integer *incx,
    bla_double *ap);

BLIS_EXPORT_BLAS int PASTEF77(s,spr)(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha,
    const bla_real *x, const bla_integer *incx,
    bla_real *ap);

#endif
// end bla_spr.h
// begin bla_spr2.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(
    const bla_character *uplo, const bla_integer *n,
    const bla_double *alpha,
    const bla_double *x, const bla_integer *incx,
    const bla_double *y, const bla_integer *incy,
    bla_double *ap);

BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(
    const bla_character *uplo, const bla_integer *n,
    const bla_real *alpha,
    const bla_real *x, const bla_integer *incx,
    const bla_real *y, const bla_integer *incy,
    bla_real *ap);

#endif
// end bla_spr2.h
// begin bla_tpmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_scomplex *ap,
    bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tpmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_double *ap,
    bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_real *ap,
    bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_dcomplex *ap,
    bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpmv.h
// begin bla_tpsv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,tpsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_scomplex *ap,
    bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tpsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_double *ap,
    bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_real *ap,
    bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag, const bla_integer *n,
    const bla_dcomplex *ap,
    bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tpsv.h

// banded
// begin bla_gbmv.h


#ifdef BLIS_ENABLE_BLAS

BLIS_EXPORT_BLAS int PASTEF77(c,gbmv)(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_scomplex *alpha,
    const bla_scomplex *a, const bla_integer *lda,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *beta,
    bla_scomplex *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(d,gbmv)(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_double *alpha,
    const bla_double *a, const bla_integer *lda,
    const bla_double *x, const bla_integer *incx,
    const bla_double *beta,
    bla_double *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_real *alpha,
    const bla_real *a, const bla_integer *lda,
    const bla_real *x, const bla_integer *incx,
    const bla_real *beta,
    bla_real *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)(
    const bla_character *trans,
    const bla_integer *m, const bla_integer *n,
    const bla_integer *kl, const bla_integer *ku,
    const bla_dcomplex *alpha,
    const bla_dcomplex *a, const bla_integer *lda,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *beta,
    bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_gbmv.h
// begin bla_hbmv.h


#ifdef BLIS_ENABLE_BLAS

// hbmv: explicit prototypes for the c and z datatype letters only.
BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_scomplex *alpha,
    const bla_scomplex *a, const bla_integer *lda,
    const bla_scomplex *x, const bla_integer *incx,
    const bla_scomplex *beta,
    bla_scomplex *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_dcomplex *alpha,
    const bla_dcomplex *a, const bla_integer *lda,
    const bla_dcomplex *x, const bla_integer *incx,
    const bla_dcomplex *beta,
    bla_dcomplex *y, const bla_integer *incy);

#endif
// end bla_hbmv.h
// begin bla_sbmv.h


#ifdef BLIS_ENABLE_BLAS

// sbmv: explicit prototypes for the d and s datatype letters only.
BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_double *alpha,
    const bla_double *a, const bla_integer *lda,
    const bla_double *x, const bla_integer *incx,
    const bla_double *beta,
    bla_double *y, const bla_integer *incy);

BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(
    const bla_character *uplo,
    const bla_integer *n, const bla_integer *k,
    const bla_real *alpha,
    const bla_real *a, const bla_integer *lda,
    const bla_real *x, const bla_integer *incx,
    const bla_real *beta,
    bla_real *y, const bla_integer *incy);

#endif
// end bla_sbmv.h
// begin bla_tbmv.h


#ifdef BLIS_ENABLE_BLAS

// tbmv: x is the only non-const pointer argument.
BLIS_EXPORT_BLAS int PASTEF77(c,tbmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_scomplex *a, const bla_integer *lda,
    bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tbmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_double *a, const bla_integer *lda,
    bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_real *a, const bla_integer *lda,
    bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_dcomplex *a, const bla_integer *lda,
    bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbmv.h
// begin bla_tbsv.h


#ifdef BLIS_ENABLE_BLAS

// tbsv: same parameter lists as the tbmv prototypes.
BLIS_EXPORT_BLAS int PASTEF77(c,tbsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_scomplex *a, const bla_integer *lda,
    bla_scomplex *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(d,tbsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_double *a, const bla_integer *lda,
    bla_double *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_real *a, const bla_integer *lda,
    bla_real *x, const bla_integer *incx);

BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(
    const bla_character *uplo, const bla_character *trans,
    const bla_character *diag,
    const bla_integer *n, const bla_integer *k,
    const bla_dcomplex *a, const bla_integer *lda,
    bla_dcomplex *x, const bla_integer *incx);

#endif
// end bla_tbsv.h

// -- Level-3 BLAS prototypes --

// begin bla_gemm.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// gemm. GENTPROT expands to one exported prototype named by
// PASTEF77(ch,blasname). INSERT_GENTPROT_BLAS is defined outside this
// section and is only invoked when BLIS_ENABLE_BLAS is defined.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  n, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm )
#endif
// end bla_gemm.h
// begin bla_hemm.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// hemm. ftype_r and chr are accepted but unused by this prototype.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( hemm )
#endif
// end bla_hemm.h
// begin bla_herk.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// herk. Both alpha and beta are declared with the ftype_r type here.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype_r*  alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype_r*  beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( herk )
#endif
// end bla_herk.h
// begin bla_her2k.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// her2k. alpha is declared as ftype while beta is declared as ftype_r.
#undef  GENTPROTCO
#define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype_r*  beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROTCO_BLAS( her2k )
#endif
// end bla_her2k.h
// begin bla_symm.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// symm.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( symm )
#endif
// end bla_symm.h
// begin bla_syrk.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// syrk.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( syrk )
#endif
// end bla_syrk.h
// begin bla_syr2k.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// syr2k.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( syr2k )
#endif
// end bla_syr2k.h
// begin bla_trmm.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// trmm. GENTPROT expands to one exported prototype named by
// PASTEF77(ch,blasname); b is the only non-const pointer argument.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
             ftype*    b, const f77_int* ldb \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trmm )
#endif
// end bla_trmm.h
// begin bla_trsm.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// trsm. Same parameter list as the trmm prototype generator.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* side, \
       const f77_char* uploa, \
       const f77_char* transa, \
       const f77_char* diaga, \
       const f77_int*  m, \
       const f77_int*  n, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
             ftype*    b, const f77_int* ldb \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( trsm )
#endif
// end bla_trsm.h

// NOTE on every bla_*_check macro below: each expands to a brace-enclosed
// block that evaluates its tests in order and stores a code in a local
// 'info' for the first test that fails. If info is nonzero, the routine name
// is formatted from dt_str/op_str into a local buffer, passed through
// bli_string_mkupper, handed to xerbla together with &info, and the macro
// then executes 'return;' -- so it must be expanded inside a function
// returning void. All integer arguments are dereferenced (pointers expected).

// begin bla_gemm_check.h


#ifdef BLIS_ENABLE_BLAS

// gemm: info = 1 (transa invalid), 2 (transb invalid), 3 (*m < 0),
// 4 (*n < 0), 5 (*k < 0), 8 (lda), 10 (ldb), 13 (ldc).
// nrowa is *m when transa is "N", else *k; nrowb is *k when transb is "N",
// else *n.
#define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota,  notb; \
    f77_int conja, conjb; \
    f77_int ta,    tb; \
    f77_int nrowa, nrowb; \
\
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else        { nrowb = *n; } \
\
    if      ( !nota && !conja && !ta ) \
        info = 1; \
    else if ( !notb && !conjb && !tb ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *n < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemm_check.h
// begin bla_hemm_check.h


#ifdef BLIS_ENABLE_BLAS

// hemm: info = 1 (sidea not "L"/"R"), 2 (uploa not "L"/"U"), 3 (*m < 0),
// 4 (*n < 0), 7 (lda), 9 (ldb), 12 (ldc).
// nrowa is *m when sidea is "L", else *n.
#define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int left, right; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    left  = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
    right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( left ) { nrowa = *m; } \
    else        { nrowa = *n; } \
\
    if      ( !left && !right ) \
        info = 1; \
    else if ( !lower && !upper ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *n < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldb < bli_max( 1, *m ) ) \
        info = 9; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_hemm_check.h
// begin bla_herk_check.h


#ifdef BLIS_ENABLE_BLAS

// herk: info = 1 (uploa not "L"/"U"), 2 (transa not "N"/"C"), 3 (*m < 0),
// 4 (*k < 0), 7 (lda), 10 (ldc). nrowa is *m when transa is "N", else *k.
//
// The uplo tests read the macro's own 'uploa' parameter. Previously the body
// referenced 'uploc', which is not a parameter of this macro: the argument
// was ignored and the macro only compiled when the caller happened to have
// a variable with that exact name in scope. Callers that pass 'uploc' as the
// argument get an identical expansion.
#define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, conja; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_herk_check.h
// begin bla_her2k_check.h


#ifdef BLIS_ENABLE_BLAS

// her2k: info = 1 (uploa not "L"/"U"), 2 (trans not "N"/"C"), 3 (*m < 0),
// 4 (*k < 0), 7 (lda), 9 (ldb), 12 (ldc). lda and ldb are both compared
// against nrowa (*m when trans is "N", else *k).
#define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota, conja; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldb < bli_max( 1, nrowa ) ) \
        info = 9; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_her2k_check.h
// begin bla_symm_check.h


#ifdef BLIS_ENABLE_BLAS

// symm reuses the hemm argument checks unchanged.
#define bla_symm_check bla_hemm_check

#endif
// end bla_symm_check.h
// begin bla_syrk_check.h


#ifdef BLIS_ENABLE_BLAS

// syrk: info = 1 (uploa not "L"/"U"), 2 (transa invalid), 3 (*m < 0),
// 4 (*k < 0), 7 (lda), 10 (ldc). nrowa is *m when transa is "N", else *k.
// transa "C" is accepted only when dt_str begins with 's' or 'd' (is_r);
// for any other dt_str only "N" and "T" pass.
//
// The uplo tests read the macro's own 'uploa' parameter. Previously the body
// referenced 'uploc', which is not a parameter of this macro: the argument
// was ignored and the macro only compiled when the caller happened to have
// a variable with that exact name in scope. Callers that pass 'uploc' as the
// argument get an identical expansion.
#define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
{ \
    f77_int info = 0; \
    f77_int is_r; \
    f77_int nota, ta, cta; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    static char* dt_cst = dt_str; \
\
    is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    cta   = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !ta && (is_r ? !cta : 1) ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 10; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_syrk_check.h
// begin bla_syr2k_check.h


#ifdef BLIS_ENABLE_BLAS

// syr2k: info = 1 (uploa not "L"/"U"), 2 (trans invalid), 3 (*m < 0),
// 4 (*k < 0), 7 (lda), 9 (ldb), 12 (ldc). trans "C" is accepted only when
// dt_str begins with 's' or 'd' (is_r).
#define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int is_r; \
    f77_int nota, ta, cta; \
    f77_int lower, upper; \
    f77_int nrowa; \
\
    static char* dt_cst = dt_str; \
\
    is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
    nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \
    cta   = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !ta && (is_r ? !cta : 1) ) \
        info = 2; \
    else if ( *m < 0 ) \
        info = 3; \
    else if ( *k < 0 ) \
        info = 4; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 7; \
    else if ( *ldb < bli_max( 1, nrowa ) ) \
        info = 9; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 12; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_syr2k_check.h
// begin bla_trmm_check.h


#ifdef BLIS_ENABLE_BLAS

// trmm: info = 1 (sidea not "L"/"R"), 2 (uploa not "L"/"U"),
// 3 (transa not "N"/"T"/"C"), 4 (diaga not "U"/"N"), 5 (*m < 0), 6 (*n < 0),
// 9 (lda), 11 (ldb). nrowa is *m when sidea is "L", else *n.
#define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \
{ \
    f77_int info = 0; \
    f77_int left, right; \
    f77_int lower, upper; \
    f77_int nota, ta, conja; \
    f77_int unita, nonua; \
    f77_int nrowa; \
\
    left  = PASTEF770(lsame)( sidea,  "L", (ftnlen)1, (ftnlen)1 ); \
    right = PASTEF770(lsame)( sidea,  "R", (ftnlen)1, (ftnlen)1 ); \
    lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
    nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
\
    if ( left ) { nrowa = *m; } \
    else        { nrowa = *n; } \
\
    if      ( !left && !right ) \
        info = 1; \
    else if ( !lower && !upper ) \
        info = 2; \
    else if ( !nota && !ta && !conja ) \
        info = 3; \
    else if ( !unita && !nonua ) \
        info = 4; \
    else if ( *m < 0 ) \
        info = 5; \
    else if ( *n < 0 ) \
        info = 6; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 9; \
    else if ( *ldb < bli_max( 1, *m ) ) \
        info = 11; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_trmm_check.h
// begin bla_trsm_check.h


#ifdef BLIS_ENABLE_BLAS

// trsm reuses the trmm argument checks unchanged.
#define bla_trsm_check bla_trmm_check

#endif
// end bla_trsm_check.h

// -- BLAS extension prototypes --

// unique to BLIS
// begin bla_axpby.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// axpby.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_int* n, \
       const ftype*   alpha, \
       const ftype*   x, const f77_int* incx, \
       const ftype*   beta, \
             ftype*   y, const f77_int* incy \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( axpby )
#endif
// end bla_axpby.h

// level-3
// begin bla_gemmt.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// gemmt. Takes uploc plus m and k; there is no separate n argument.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* uploc, \
       const f77_char* transa, \
       const f77_char* transb, \
       const f77_int*  m, \
       const f77_int*  k, \
       const ftype*    alpha, \
       const ftype*    a, const f77_int* lda, \
       const ftype*    b, const f77_int* ldb, \
       const ftype*    beta, \
             ftype*    c, const f77_int* ldc \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemmt )
#endif
// end bla_gemmt.h
// begin bla_gemmt_check.h


#ifdef BLIS_ENABLE_BLAS

// gemmt: info = 1 (uploc not "L"/"U"), 2 (transa invalid),
// 3 (transb invalid), 4 (*m < 0), 5 (*k < 0), 8 (lda), 10 (ldb), 13 (ldc).
// nrowa is *m when transa is "N", else *k; nrowb is *k when transb is "N",
// else *m.
#define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, m, k, lda, ldb, ldc ) \
{ \
    f77_int info = 0; \
    f77_int nota,  notb; \
    f77_int conja, conjb; \
    f77_int ta,    tb; \
    f77_int lower, upper; \
    f77_int nrowa, nrowb; \
\
    nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
    notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
    conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
    conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
    ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
    tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
\
    lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
    upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
\
    if ( nota ) { nrowa = *m; } \
    else        { nrowa = *k; } \
    if ( notb ) { nrowb = *k; } \
    else        { nrowb = *m; } \
\
    if      ( !lower && !upper ) \
        info = 1; \
    else if ( !nota && !conja && !ta ) \
        info = 2; \
    else if ( !notb && !conjb && !tb ) \
        info = 3; \
    else if ( *m < 0 ) \
        info = 4; \
    else if ( *k < 0 ) \
        info = 5; \
    else if ( *lda < bli_max( 1, nrowa ) ) \
        info = 8; \
    else if ( *ldb < bli_max( 1, nrowb ) ) \
        info = 10; \
    else if ( *ldc < bli_max( 1, *m ) ) \
        info = 13; \
\
    if ( info != 0 ) \
    { \
        char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \
\
        sprintf( func_str, "%s%-5s", dt_str, op_str ); \
\
        bli_string_mkupper( func_str ); \
\
        PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
\
        return; \
    } \
}

#endif
// end bla_gemmt_check.h

// batch
// begin bla_gemm_batch.h


//
// Prototype BLAS-to-BLIS interfaces.
//
// gemm_batch. The matrix arguments are pointer-to-pointer (ftype**); every
// other argument keeps the single-pointer form with an "_array" suffix,
// followed by group_count and group_size.
#undef  GENTPROT
#define GENTPROT( ftype, ch, blasname ) \
\
BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
     ( \
       const f77_char* transa_array, \
       const f77_char* transb_array, \
       const f77_int*  m_array, \
       const f77_int*  n_array, \
       const f77_int*  k_array, \
       const ftype*    alpha_array, \
       const ftype**   a_array, const f77_int* lda_array, \
       const ftype**   b_array, const f77_int* ldb_array, \
       const ftype*    beta_array, \
             ftype**   c_array, const f77_int* ldc_array, \
       const f77_int*  group_count, \
       const f77_int*  group_size \
     );

#ifdef BLIS_ENABLE_BLAS
INSERT_GENTPROT_BLAS( gemm_batch )
#endif
// end bla_gemm_batch.h

// 3m
// begin bla_gemm3m.h


//
// Prototype BLAS-to-BLIS interfaces.
// #undef GENTPROTCO #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \ \ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \ ( \ const f77_char* transa, \ const f77_char* transb, \ const f77_int* m, \ const f77_int* n, \ const f77_int* k, \ const ftype* alpha, \ const ftype* a, const f77_int* lda, \ const ftype* b, const f77_int* ldb, \ const ftype* beta, \ ftype* c, const f77_int* ldc \ ); #ifdef BLIS_ENABLE_BLAS INSERT_GENTPROTCO_BLAS( gemm3m ) #endif // end bla_gemm3m.h // begin bla_gemm3m_check.h #ifdef BLIS_ENABLE_BLAS #define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \ { \ f77_int info = 0; \ f77_int nota, notb; \ f77_int conja, conjb; \ f77_int ta, tb; \ f77_int nrowa, nrowb; \ \ nota = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \ notb = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \ conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \ conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \ ta = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \ tb = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \ \ if ( nota ) { nrowa = *m; } \ else { nrowa = *k; } \ if ( notb ) { nrowb = *k; } \ else { nrowb = *n; } \ \ if ( !nota && !conja && !ta ) \ info = 1; \ else if ( !notb && !conjb && !tb ) \ info = 2; \ else if ( *m < 0 ) \ info = 3; \ else if ( *n < 0 ) \ info = 4; \ else if ( *k < 0 ) \ info = 5; \ else if ( *lda < bli_max( 1, nrowa ) ) \ info = 8; \ else if ( *ldb < bli_max( 1, nrowb ) ) \ info = 10; \ else if ( *ldc < bli_max( 1, *m ) ) \ info = 13; \ \ if ( info != 0 ) \ { \ char func_str[ BLIS_MAX_BLAS_FUNC_STR_LENGTH ]; \ \ sprintf( func_str, "%s%-5s", dt_str, op_str ); \ \ bli_string_mkupper( func_str ); \ \ PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \ \ return; \ } \ } #endif // end bla_gemm3m_check.h // -- Fortran-compatible APIs to BLIS functions -- // begin b77_thread.h // // Prototype Fortran-compatible BLIS interfaces. 
// BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_ways) ( const f77_int* jc, const f77_int* pc, const f77_int* ic, const f77_int* jr, const f77_int* ir ); BLIS_EXPORT_BLAS void PASTEF770(bli_thread_set_num_threads) ( const f77_int* nt ); // end b77_thread.h #endif // BLIS_ENABLE_BLAS // end bli_blas.h // -- CBLAS compatibility layer -- // begin bli_cblas.h #ifndef BLIS_CBLAS_H #define BLIS_CBLAS_H #ifdef BLIS_ENABLE_CBLAS // Undefine these macros so that no internal conversion is done by CBLAS. // The function signatures have been modified to use the proper integer types // directly. #undef F77_INT #undef F77_CHAR // Include the main CBLAS header so that including this header file // (probably via blis.h) allows applications to access CBLAS // prototypes and definitions. // begin cblas.h #ifndef CBLAS_H #define CBLAS_H #include // skipped // We need to #include "bli_type_defs.h" in order to pull in the // definition of f77_int. But in order to #include that header, we // also need to pull in the headers that precede it in blis.h. // begin bli_system.h #ifndef BLIS_SYSTEM_H #define BLIS_SYSTEM_H // NOTE: If not yet defined, we define _POSIX_C_SOURCE to make sure that // various parts of POSIX are defined and made available. #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200809L #endif #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped #include // skipped // Determine the compiler (hopefully) and define conveniently named macros // accordingly. #if defined(__ICC) || defined(__INTEL_COMPILER) #define BLIS_ICC #elif defined(__clang__) #define BLIS_CLANG #elif defined(__GNUC__) #define BLIS_GCC #endif // Determine if we are on a 64-bit or 32-bit architecture. 
// Exactly one of BLIS_ARCH_64 / BLIS_ARCH_32 is defined: BLIS_ARCH_64 when
// any of the listed 64-bit target macros is present, BLIS_ARCH_32 otherwise.
#if defined(_M_X64) || defined(__x86_64) || defined(__aarch64__) || \
    defined(_ARCH_PPC64) || defined(__s390x__) || defined(_LP64)
#define BLIS_ARCH_64
#else
#define BLIS_ARCH_32
#endif

// Determine the target operating system.
// The tests form a single #if/#elif chain, so the first match wins and the
// order of the tests is significant. When BLIS_ENABLE_SYSTEM is not defined,
// no OS is probed and BLIS_OS_NONE is defined instead.
// NOTE(review): most BLIS_OS_* macros are defined to 1 and are tested below
// with "#if BLIS_OS_...", whereas BLIS_OS_EMSCRIPTEN and BLIS_OS_NONE are
// defined empty and so can only be tested with #ifdef / defined().
#if defined(BLIS_ENABLE_SYSTEM)
#if defined(_WIN32) || defined(__CYGWIN__)
#define BLIS_OS_WINDOWS 1
#elif defined(__gnu_hurd__)
#define BLIS_OS_GNU 1
#elif defined(__APPLE__) || defined(__MACH__)
#define BLIS_OS_OSX 1
#elif defined(__ANDROID__)
#define BLIS_OS_ANDROID 1
#elif defined(__linux__)
#define BLIS_OS_LINUX 1
#elif defined(__bgq__)
#define BLIS_OS_BGQ 1
#elif defined(__bg__)
#define BLIS_OS_BGP 1
#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
      defined(__bsdi__) || defined(__DragonFly__) || \
      defined(__FreeBSD_kernel__) || defined(__HAIKU__)
#define BLIS_OS_BSD 1
#elif defined(EMSCRIPTEN)
#define BLIS_OS_EMSCRIPTEN
#else
#error "Cannot determine operating system"
#endif
#else // #if defined(BLIS_DISABLE_SYSTEM)
#define BLIS_OS_NONE
#endif

// A few changes that may be necessary in Windows environments.
#if BLIS_OS_WINDOWS

// Include Windows header file.
// NOTE(review): the header name is missing from the #include below; it looks
// like the <...> operand was stripped when this file was produced.
#define WIN32_LEAN_AND_MEAN
#define VC_EXTRALEAN
#include // skipped

#if !defined(__clang__) && !defined(__GNUC__)
// Undefine attribute specifiers in Windows.
// (Implemented by defining __attribute__(x) to expand to nothing.)
#define __attribute__(x)

// Undefine restrict.
// (Implemented by defining restrict to expand to nothing.)
#define restrict
#endif

#endif

// time.h provides clock_gettime().
// NOTE(review): the header names are missing from the #include directives
// below; it looks like the <...> operands were stripped when this file was
// produced. One header is selected for Windows, one for OS X, and one for
// every other system.
#if BLIS_OS_WINDOWS
#include // skipped
#elif BLIS_OS_OSX
#include // skipped
#else
//#include
#include // skipped
#endif

#endif
// end bli_system.h
// begin bli_config.h

// This section records the build-time configuration as literal "#if 0" /
// "#if 1" (and "#if 64 == 64") tests, i.e. the conditions were substituted
// with constants when the header was generated.
#ifndef BLIS_CONFIG_H
#define BLIS_CONFIG_H

// Enabled configuration "family" (config_name)
#define BLIS_FAMILY_X86_64

// Enabled sub-configurations (config_list)
#define BLIS_CONFIG_SKX
#define BLIS_CONFIG_KNL
#define BLIS_CONFIG_HASWELL
#define BLIS_CONFIG_SANDYBRIDGE
#define BLIS_CONFIG_PENRYN
#define BLIS_CONFIG_ZEN3
#define BLIS_CONFIG_ZEN2
#define BLIS_CONFIG_ZEN
#define BLIS_CONFIG_EXCAVATOR
#define BLIS_CONFIG_STEAMROLLER
#define BLIS_CONFIG_PILEDRIVER
#define BLIS_CONFIG_BULLDOZER
#define BLIS_CONFIG_GENERIC

// Enabled kernel sets (kernel_list)
#define BLIS_KERNELS_SKX
#define BLIS_KERNELS_KNL
#define BLIS_KERNELS_SANDYBRIDGE
#define BLIS_KERNELS_PENRYN
#define BLIS_KERNELS_ZEN3
#define BLIS_KERNELS_ZEN2
#define BLIS_KERNELS_HASWELL
#define BLIS_KERNELS_ZEN
#define BLIS_KERNELS_PILEDRIVER
#define BLIS_KERNELS_BULLDOZER
#define BLIS_KERNELS_GENERIC

#if 1
#define BLIS_ENABLE_SYSTEM
#else
#define BLIS_DISABLE_SYSTEM
#endif

// Neither threading backend is enabled in this configuration.
#if 0
#define BLIS_ENABLE_OPENMP
#endif

#if 0
#define BLIS_ENABLE_PTHREADS
#endif

#if 1
#define BLIS_ENABLE_JRIR_SLAB
#endif

#if 0
#define BLIS_ENABLE_JRIR_RR
#endif

#if 1
#define BLIS_ENABLE_PBA_POOLS
#else
#define BLIS_DISABLE_PBA_POOLS
#endif

#if 1
#define BLIS_ENABLE_SBA_POOLS
#else
#define BLIS_DISABLE_SBA_POOLS
#endif

#if 0
#define BLIS_ENABLE_MEM_TRACING
#else
#define BLIS_DISABLE_MEM_TRACING
#endif

// BLIS-internal integer size: resolves to 64 here.
#if 64 == 64
#define BLIS_INT_TYPE_SIZE 64
#elif 64 == 32
#define BLIS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// BLAS-layer integer size: resolves to 32 here.
#if 32 == 64
#define BLIS_BLAS_INT_TYPE_SIZE 64
#elif 32 == 32
#define BLIS_BLAS_INT_TYPE_SIZE 32
#else
// determine automatically
#endif

// For each of the next four options, a value supplied from outside (either
// the ENABLE or the DISABLE macro already being defined) takes precedence
// over the configured default.
#ifndef BLIS_ENABLE_BLAS
#ifndef BLIS_DISABLE_BLAS
#if 0
#define BLIS_ENABLE_BLAS
#else
#define BLIS_DISABLE_BLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_CBLAS
#ifndef BLIS_DISABLE_CBLAS
#if 0
#define BLIS_ENABLE_CBLAS
#else
#define BLIS_DISABLE_CBLAS
#endif
#endif
#endif

#ifndef BLIS_ENABLE_MIXED_DT
#ifndef BLIS_DISABLE_MIXED_DT
#if 1
#define BLIS_ENABLE_MIXED_DT
#else
#define BLIS_DISABLE_MIXED_DT
#endif
#endif
#endif

#ifndef BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#ifndef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#if 1
#define BLIS_ENABLE_MIXED_DT_EXTRA_MEM
#else
#define BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#endif
#endif
#endif

#if 1
#define BLIS_ENABLE_SUP_HANDLING
#else
#define BLIS_DISABLE_SUP_HANDLING
#endif

#if 0
#define BLIS_ENABLE_MEMKIND
#else
#define BLIS_DISABLE_MEMKIND
#endif

#if 1
#define BLIS_ENABLE_TRSM_PREINVERSION
#else
#define BLIS_DISABLE_TRSM_PREINVERSION
#endif

#if 1
#define BLIS_ENABLE_PRAGMA_OMP_SIMD
#else
#define BLIS_DISABLE_PRAGMA_OMP_SIMD
#endif

#if 0
#define BLIS_ENABLE_SANDBOX
#else
#define BLIS_DISABLE_SANDBOX
#endif

#if 0
#define BLIS_ENABLE_SHARED
#else
#define BLIS_DISABLE_SHARED
#endif

#if 0
#define BLIS_ENABLE_COMPLEX_RETURN_INTEL
#else
#define BLIS_DISABLE_COMPLEX_RETURN_INTEL
#endif

#endif
// end bli_config.h
// begin bli_config_macro_defs.h

#ifndef BLIS_CONFIG_MACRO_DEFS_H
#define BLIS_CONFIG_MACRO_DEFS_H

// -- INTEGER PROPERTIES -------------------------------------------------------

// The bit size of the integer type used to track values such as dimensions,
// strides, diagonal offsets. A value of 32 results in BLIS using 32-bit signed
// integers while 64 results in 64-bit integers. Any other value results in use
// of the C99 type "long int". Note that this ONLY affects integers used
// internally within BLIS as well as those exposed in the native BLAS-like BLIS
// interface.
// This default applies only when BLIS_INT_TYPE_SIZE was not already defined.
#ifndef BLIS_INT_TYPE_SIZE
#ifdef BLIS_ARCH_64
#define BLIS_INT_TYPE_SIZE 64
#else
#define BLIS_INT_TYPE_SIZE 32
#endif
#endif

// -- FLOATING-POINT PROPERTIES ------------------------------------------------

// Enable use of built-in C99 "float complex" and "double complex" types and
// associated overloaded operations and functions? Disabling results in
// scomplex and dcomplex being defined in terms of simple structs.
// NOTE: AVOID USING THIS FEATURE. IT IS PROBABLY BROKEN.
#ifdef BLIS_ENABLE_C99_COMPLEX
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif

// -- MULTITHREADING -----------------------------------------------------------

// Enable multithreading via POSIX threads.
#ifdef BLIS_ENABLE_PTHREADS
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif

// Enable multithreading via OpenMP.
#ifdef BLIS_ENABLE_OPENMP
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif

// Perform a sanity check to make sure the user doesn't try to enable
// both OpenMP and pthreads.
#if defined ( BLIS_ENABLE_OPENMP ) && \
    defined ( BLIS_ENABLE_PTHREADS )
#error "BLIS_ENABLE_OPENMP and BLIS_ENABLE_PTHREADS may not be simultaneously defined."
#endif

// Here, we define BLIS_ENABLE_MULTITHREADING if either OpenMP
// or pthreads are enabled. This macro is useful in situations when
// we want to detect use of either OpenMP or pthreads (as opposed
// to neither being used).
#if defined ( BLIS_ENABLE_OPENMP ) || \
    defined ( BLIS_ENABLE_PTHREADS )
#define BLIS_ENABLE_MULTITHREADING
#endif

// Enable the use of prime numbers of threads when requesting automatic thread
// factorization. When disabled, requesting a prime number of threads will
// result in a reduction (by one) of the number of threads, provided that the
// prime number exceeds a minimum threshold (see below).
// After this block, BLIS_DISABLE_AUTO_PRIME_NUM_THREADS is defined if and
// only if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is not.
#ifdef BLIS_ENABLE_AUTO_PRIME_NUM_THREADS
#undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#else
// Default behavior is disabled.
#undef BLIS_DISABLE_AUTO_PRIME_NUM_THREADS // In case user explicitly disabled.
#define BLIS_DISABLE_AUTO_PRIME_NUM_THREADS
#endif

// Set the maximum requested number of threads that BLIS will accept from the
// user that may be prime. If a larger prime number of threads is requested,
// it will be reduced by one to allow for more efficient thread factorizations.
// This value will only be used if BLIS_ENABLE_AUTO_PRIME_NUM_THREADS is defined.
#ifndef BLIS_NT_MAX_PRIME
#define BLIS_NT_MAX_PRIME 11
#endif

// -- MIXED DATATYPE SUPPORT ---------------------------------------------------

// Enable mixed datatype support?
// (BLIS_ENABLE_GEMM_MD is the macro derived from the BLIS_*_MIXED_DT choice.)
#ifdef BLIS_DISABLE_MIXED_DT
#undef BLIS_ENABLE_GEMM_MD
#else
// Default behavior is enabled.
#define BLIS_ENABLE_GEMM_MD
#endif

// Enable memory-intensive optimizations for mixed datatype support?
#ifdef BLIS_DISABLE_MIXED_DT_EXTRA_MEM
#undef BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#else
// Default behavior is enabled.
#define BLIS_ENABLE_GEMM_MD_EXTRA_MEM
#endif

// -- MISCELLANEOUS OPTIONS ----------------------------------------------------

// Do NOT require the cross-blocksize constraints. That is, do not enforce
// MC % NR = 0 and NC % MR = 0 in bli_kernel_macro_defs.h. These are ONLY
// needed when implementing trsm_r by allowing the right-hand matrix B to
// be triangular.
#ifndef BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#define BLIS_RELAX_MCNR_NCMR_CONSTRAINTS
#endif

// -- BLAS COMPATIBILITY LAYER -------------------------------------------------

// Enable the BLAS compatibility layer?
// After this block, BLIS_ENABLE_BLAS is defined if and only if
// BLIS_DISABLE_BLAS is not.
#ifdef BLIS_DISABLE_BLAS
#undef BLIS_ENABLE_BLAS
#else
// Default behavior is enabled.
#undef BLIS_ENABLE_BLAS // In case user explicitly enabled.
#define BLIS_ENABLE_BLAS
#endif

// The bit size of the integer type used to track values such as dimensions and
// leading dimensions (ie: column strides) within the BLAS compatibility layer.
// A value of 32 results in the compatibility layer using 32-bit signed integers
// while 64 results in 64-bit integers. Any other value results in use of the
// C99 type "long int". Note that this ONLY affects integers used within the
// BLAS compatibility layer.
// This default (32) applies only when BLIS_BLAS_INT_TYPE_SIZE was not already
// defined.
#ifndef BLIS_BLAS_INT_TYPE_SIZE
#define BLIS_BLAS_INT_TYPE_SIZE 32
#endif

// By default, the level-3 BLAS routines are implemented by directly calling
// the BLIS object API. Alternatively, they may first call the typed BLIS
// API, which will then call the object API.
// After this block, BLIS_BLAS3_CALLS_OAPI is defined if and only if
// BLIS_BLAS3_CALLS_TAPI is not.
//#define BLIS_BLAS3_CALLS_TAPI
#ifdef BLIS_BLAS3_CALLS_TAPI
#undef BLIS_BLAS3_CALLS_OAPI
#else
// Default behavior is to call object API directly.
#undef BLIS_BLAS3_CALLS_OAPI // In case user explicitly enabled.
#define BLIS_BLAS3_CALLS_OAPI
#endif

// -- CBLAS COMPATIBILITY LAYER ------------------------------------------------

// Enable the CBLAS compatibility layer?
// NOTE: Enabling CBLAS will automatically enable the BLAS compatibility layer
// regardless of whether or not it was explicitly enabled above. Furthermore,
// the CBLAS compatibility layer will use the integer type size definition
// specified above when defining the size of its own integers (regardless of
// whether the BLAS layer was enabled directly or indirectly).
#ifdef BLIS_ENABLE_CBLAS
// No additional definitions needed.
#else
// Default behavior is disabled.
#endif

// -- SHARED LIBRARY SYMBOL EXPORT ---------------------------------------------

// When building shared libraries, we can control which symbols are exported for
// linking by external applications. BLIS annotates all function prototypes that
// are meant to be "public" with BLIS_EXPORT_BLIS (with BLIS_EXPORT_BLAS playing
// a similar role for BLAS compatibility routines). Which symbols are exported
// is controlled by the default symbol visibility, as specified by the gcc option
// -fvisibility=[default|hidden]. The default for this option is 'default', or,
// "public", which, if allowed to stand, causes all symbols in BLIS to be
// linkable from the outside. But when compiling with -fvisibility=hidden, all
// symbols start out hidden (that is, restricted only for internal use by BLIS),
// with that setting overridden only for function prototypes or variable
// declarations that are annotated with BLIS_EXPORT_BLIS.
//
// Resolution of BLIS_EXPORT (skipped entirely if it is already defined):
// - BLIS_ENABLE_SHARED not defined: empty.
// - Windows/Cygwin: dllexport when BLIS_IS_BUILDING_LIBRARY is defined,
//   dllimport otherwise.
// - GCC-compatible compilers with __GNUC__ >= 4: default visibility.
// - anything else: empty.
#ifndef BLIS_EXPORT
  #if !defined(BLIS_ENABLE_SHARED)
    #define BLIS_EXPORT
  #else
    #if defined(_WIN32) || defined(__CYGWIN__)
      #ifdef BLIS_IS_BUILDING_LIBRARY
        #define BLIS_EXPORT __declspec(dllexport)
      #else
        #define BLIS_EXPORT __declspec(dllimport)
      #endif
    #elif defined(__GNUC__) && __GNUC__ >= 4
      #define BLIS_EXPORT __attribute__ ((visibility ("default")))
    #else
      #define BLIS_EXPORT
    #endif
  #endif
#endif

// All three annotations currently expand to the same BLIS_EXPORT definition.
#define BLIS_EXPORT_BLIS BLIS_EXPORT
#define BLIS_EXPORT_BLAS BLIS_EXPORT
#define BLIS_EXPORT_ADDON BLIS_EXPORT

// -- OVERRIDABLE (WEAK) SYMBOLS -----------------------------------------------

// On Linux, functions called from a shared library can be overridden by the main
// program simply by providing a new definition. However, macOS uses a "two-level
// namespace" which causes calls to shared library functions to be tied to the
// library and not overridable. As a workaround, certain symbols can be defined
// as "weak" and are given lower preference during linking.
#ifndef BLIS_OVERRIDABLE
#if BLIS_OS_OSX
#define BLIS_OVERRIDABLE __attribute__((weak))
#else
#define BLIS_OVERRIDABLE
#endif
#endif

// -- STATIC INLINE FUNCTIONS --------------------------------------------------

// C and C++ have different semantics for defining "inline" functions. In C,
// the keyword phrase "static inline" accomplishes this, though the "inline"
// is optional. In C++, the "inline" keyword is required and obviates "static"
// altogether. Why does this matter? While BLIS is compiled in C99, blis.h may
// be #included by a source file that is compiled with C++.
#ifdef __cplusplus #define BLIS_INLINE inline #else //#define BLIS_INLINE static inline #define BLIS_INLINE static #endif #endif // end bli_config_macro_defs.h // begin bli_type_defs.h #ifndef BLIS_TYPE_DEFS_H #define BLIS_TYPE_DEFS_H // // -- BLIS basic types --------------------------------------------------------- // #ifdef __cplusplus // For C++, include stdint.h. #include // skipped #elif __STDC_VERSION__ >= 199901L // For C99 (or later), include stdint.h. #include // skipped #include // skipped #else // When stdint.h is not available, manually typedef the types we will use. #ifdef _WIN32 typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; #else #error "Attempting to compile on pre-C99 system without stdint.h." #endif #endif // -- General-purpose integers -- // If BLAS integers are 64 bits, mandate that BLIS integers also be 64 bits. // NOTE: This cpp guard will only meaningfully change BLIS's behavior on // systems where the BLIS integer size would have been automatically selected // to be 32 bits, since explicit selection of 32 bits is prohibited at // configure-time (and explicit or automatic selection of 64 bits is fine // and would have had the same result). #if BLIS_BLAS_INT_SIZE == 64 #undef BLIS_INT_TYPE_SIZE #define BLIS_INT_TYPE_SIZE 64 #endif // Define integer types depending on what size integer was requested. #if BLIS_INT_TYPE_SIZE == 32 typedef int32_t gint_t; typedef uint32_t guint_t; #elif BLIS_INT_TYPE_SIZE == 64 typedef int64_t gint_t; typedef uint64_t guint_t; #else typedef signed long int gint_t; typedef unsigned long int guint_t; #endif // -- Boolean type -- // NOTE: bool_t is no longer used and has been replaced with C99's bool type. //typedef bool bool_t; // BLIS uses TRUE and FALSE macro constants as possible boolean values, but we // define these macros in terms of true and false, respectively, which are // defined by C99 in stdbool.h. 
// TRUE / FALSE are only defined here if no earlier definition exists.
#ifndef TRUE
  #define TRUE true
#endif

#ifndef FALSE
  #define FALSE false
#endif

// -- Special-purpose integers --

// This cpp guard provides a temporary hack to allow libflame
// interoperability with BLIS.
#ifndef _DEFINED_DIM_T
#define _DEFINED_DIM_T
typedef gint_t dim_t; // dimension type
#endif
typedef gint_t inc_t; // increment/stride type
typedef gint_t doff_t; // diagonal offset type
typedef guint_t siz_t; // byte size type
typedef uint32_t objbits_t; // object information bit field

// -- Real types --

// Define the number of floating-point types supported, and the size of the
// largest type.
#define BLIS_NUM_FP_TYPES 4
#define BLIS_MAX_TYPE_SIZE sizeof(dcomplex)

// There are some places where we need to use sizeof() inside of a C
// preprocessor #if conditional, and so here we define the various sizes
// for those purposes.
// NOTE(review): these are hard-coded byte counts, not computed; they must be
// kept in agreement with the actual sizes of the types named alongside them.
#define BLIS_SIZEOF_S 4 // sizeof(float)
#define BLIS_SIZEOF_D 8 // sizeof(double)
#define BLIS_SIZEOF_C 8 // sizeof(scomplex)
#define BLIS_SIZEOF_Z 16 // sizeof(dcomplex)

// -- Complex types --

// scomplex/dcomplex are either the C99 complex types (when
// BLIS_ENABLE_C99_COMPLEX is defined) or plain structs with 'real' and
// 'imag' members (the default).
#ifdef BLIS_ENABLE_C99_COMPLEX

  #if __STDC_VERSION__ >= 199901L
    // NOTE(review): the header name is missing from the directive below; it
    // looks like the <...> operand was stripped when this file was produced.
    #include // skipped

    // Typedef official complex types to BLIS complex type names.
    typedef float complex scomplex;
    typedef double complex dcomplex;
  #else
    #error "Configuration requested C99 complex types, but C99 does not appear to be supported."
  #endif

#else // ifndef BLIS_ENABLE_C99_COMPLEX

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_SCOMPLEX
  #define _DEFINED_SCOMPLEX
  typedef struct scomplex
  {
    float real;
    float imag;
  } scomplex;
  #endif

  // This cpp guard provides a temporary hack to allow libflame
  // interoperability with BLIS.
  #ifndef _DEFINED_DCOMPLEX
  #define _DEFINED_DCOMPLEX
  typedef struct dcomplex
  {
    double real;
    double imag;
  } dcomplex;
  #endif

#endif // BLIS_ENABLE_C99_COMPLEX

// -- Atom type --

// Note: atom types are used to hold "bufferless" scalar object values. Note
// that it needs to be as large as the largest possible scalar value we might
// want to hold. Thus, for now, it is a dcomplex.
typedef dcomplex atom_t;

// -- Fortran-77 types --

// Note: These types are typically only used by BLAS compatibility layer, but
// we must define them even when the compatibility layer isn't being built
// because they also occur in bli_slamch() and bli_dlamch().

// Define f77_int depending on what size of integer was requested.
// Any value other than 32 or 64 falls back to long int.
#if BLIS_BLAS_INT_TYPE_SIZE == 32
typedef int32_t f77_int;
#elif BLIS_BLAS_INT_TYPE_SIZE == 64
typedef int64_t f77_int;
#else
typedef long int f77_int;
#endif

typedef char f77_char;
typedef float f77_float;
typedef double f77_double;
typedef scomplex f77_scomplex;
typedef dcomplex f77_dcomplex;

// -- Misc. function pointer types --

// Note: This type should be used in any situation where the address of a
// *function* will be conveyed or stored prior to it being typecast back
// to the correct function type. It does not need to be used when conveying
// or storing the address of *data* (such as an array of float or double).
// (void_fp is currently a plain void*; the function-pointer form is kept
// commented out.)
//typedef void (*void_fp)( void );
typedef void* void_fp;

// Typedef function pointer types for malloc() and free() substitutes.
// malloc_ft: takes a byte count and returns void*.
// free_ft:   takes a void* and returns nothing.
typedef void* (*malloc_ft)( size_t size );
typedef void (*free_ft) ( void* p );

//
// -- BLIS info bit field offsets ----------------------------------------------
//

// Bit offsets of each field. Several names deliberately share an offset: a
// *_DOMAIN_SHIFT equals the offset of the datatype field it belongs to (its
// lowest bit), with the matching *_PREC(ISION)_SHIFT one bit above it;
// TRANS/CONJ, UPPER/DIAG/LOWER and PACK_RC/PACK_PANEL/PACK_FORMAT/PACK are
// likewise sub-fields of CONJTRANS, UPLO and PACK_SCHEMA respectively.

// info
#define BLIS_DATATYPE_SHIFT 0
#define BLIS_DOMAIN_SHIFT 0
#define BLIS_PRECISION_SHIFT 1
#define BLIS_CONJTRANS_SHIFT 3
#define BLIS_TRANS_SHIFT 3
#define BLIS_CONJ_SHIFT 4
#define BLIS_UPLO_SHIFT 5
#define BLIS_UPPER_SHIFT 5
#define BLIS_DIAG_SHIFT 6
#define BLIS_LOWER_SHIFT 7
#define BLIS_UNIT_DIAG_SHIFT 8
#define BLIS_INVERT_DIAG_SHIFT 9
#define BLIS_TARGET_DT_SHIFT 10
#define BLIS_TARGET_DOMAIN_SHIFT 10
#define BLIS_TARGET_PREC_SHIFT 11
#define BLIS_EXEC_DT_SHIFT 13
#define BLIS_EXEC_DOMAIN_SHIFT 13
#define BLIS_EXEC_PREC_SHIFT 14
#define BLIS_PACK_SCHEMA_SHIFT 16
#define BLIS_PACK_RC_SHIFT 16
#define BLIS_PACK_PANEL_SHIFT 17
#define BLIS_PACK_FORMAT_SHIFT 18
#define BLIS_PACK_SHIFT 22
#define BLIS_PACK_REV_IF_UPPER_SHIFT 23
#define BLIS_PACK_REV_IF_LOWER_SHIFT 24
#define BLIS_PACK_BUFFER_SHIFT 25
#define BLIS_STRUC_SHIFT 27
#define BLIS_COMP_DT_SHIFT 29
#define BLIS_COMP_DOMAIN_SHIFT 29
#define BLIS_COMP_PREC_SHIFT 30

// info2
#define BLIS_SCALAR_DT_SHIFT 0
#define BLIS_SCALAR_DOMAIN_SHIFT 0
#define BLIS_SCALAR_PREC_SHIFT 1

//
// -- BLIS info bit field masks ------------------------------------------------
//

// Each mask is a field-width constant (0x1, 0x3, 0x7, 0xF or 0x7F) shifted to
// the field's offset.
// NOTE(review): the constants are plain (signed) int literals, and
// BLIS_COMP_DT_BITS shifts 0x7 left by 29, which reaches bit 31; confirm
// this is acceptable for the compilers in use.

// info
#define BLIS_DATATYPE_BITS ( 0x7 << BLIS_DATATYPE_SHIFT )
#define BLIS_DOMAIN_BIT ( 0x1 << BLIS_DOMAIN_SHIFT )
#define BLIS_PRECISION_BIT ( 0x1 << BLIS_PRECISION_SHIFT )
#define BLIS_CONJTRANS_BITS ( 0x3 << BLIS_CONJTRANS_SHIFT )
#define BLIS_TRANS_BIT ( 0x1 << BLIS_TRANS_SHIFT )
#define BLIS_CONJ_BIT ( 0x1 << BLIS_CONJ_SHIFT )
#define BLIS_UPLO_BITS ( 0x7 << BLIS_UPLO_SHIFT )
#define BLIS_UPPER_BIT ( 0x1 << BLIS_UPPER_SHIFT )
#define BLIS_DIAG_BIT ( 0x1 << BLIS_DIAG_SHIFT )
#define BLIS_LOWER_BIT ( 0x1 << BLIS_LOWER_SHIFT )
#define BLIS_UNIT_DIAG_BIT ( 0x1 << BLIS_UNIT_DIAG_SHIFT )
#define BLIS_INVERT_DIAG_BIT ( 0x1 << BLIS_INVERT_DIAG_SHIFT )
#define BLIS_TARGET_DT_BITS ( 0x7 << BLIS_TARGET_DT_SHIFT )
#define BLIS_TARGET_DOMAIN_BIT ( 0x1 << BLIS_TARGET_DOMAIN_SHIFT )
#define BLIS_TARGET_PREC_BIT ( 0x1 << BLIS_TARGET_PREC_SHIFT )
#define BLIS_EXEC_DT_BITS ( 0x7 << BLIS_EXEC_DT_SHIFT )
#define BLIS_EXEC_DOMAIN_BIT ( 0x1 << BLIS_EXEC_DOMAIN_SHIFT )
#define BLIS_EXEC_PREC_BIT ( 0x1 << BLIS_EXEC_PREC_SHIFT )
#define BLIS_PACK_SCHEMA_BITS ( 0x7F << BLIS_PACK_SCHEMA_SHIFT )
#define BLIS_PACK_RC_BIT ( 0x1 << BLIS_PACK_RC_SHIFT )
#define BLIS_PACK_PANEL_BIT ( 0x1 << BLIS_PACK_PANEL_SHIFT )
#define BLIS_PACK_FORMAT_BITS ( 0xF << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_PACK_BIT ( 0x1 << BLIS_PACK_SHIFT )
#define BLIS_PACK_REV_IF_UPPER_BIT ( 0x1 << BLIS_PACK_REV_IF_UPPER_SHIFT )
#define BLIS_PACK_REV_IF_LOWER_BIT ( 0x1 << BLIS_PACK_REV_IF_LOWER_SHIFT )
#define BLIS_PACK_BUFFER_BITS ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_STRUC_BITS ( 0x3 << BLIS_STRUC_SHIFT )
#define BLIS_COMP_DT_BITS ( 0x7 << BLIS_COMP_DT_SHIFT )
#define BLIS_COMP_DOMAIN_BIT ( 0x1 << BLIS_COMP_DOMAIN_SHIFT )
#define BLIS_COMP_PREC_BIT ( 0x1 << BLIS_COMP_PREC_SHIFT )

// info2
#define BLIS_SCALAR_DT_BITS ( 0x7 << BLIS_SCALAR_DT_SHIFT )
#define BLIS_SCALAR_DOMAIN_BIT ( 0x1 << BLIS_SCALAR_DOMAIN_SHIFT )
#define BLIS_SCALAR_PREC_BIT ( 0x1 << BLIS_SCALAR_PREC_SHIFT )

//
// -- BLIS enumerated type value definitions -----------------------------------
//

// The BLIS_BITVAL_* constants are the encoded field values, expressed in
// terms of the masks above. The enums further below take their enumerator
// values from these constants.

#define BLIS_BITVAL_REAL 0x0
#define BLIS_BITVAL_COMPLEX BLIS_DOMAIN_BIT
#define BLIS_BITVAL_SINGLE_PREC 0x0
#define BLIS_BITVAL_DOUBLE_PREC BLIS_PRECISION_BIT
#define BLIS_BITVAL_FLOAT_TYPE 0x0
#define BLIS_BITVAL_SCOMPLEX_TYPE BLIS_DOMAIN_BIT
#define BLIS_BITVAL_DOUBLE_TYPE BLIS_PRECISION_BIT
#define BLIS_BITVAL_DCOMPLEX_TYPE ( BLIS_DOMAIN_BIT | BLIS_PRECISION_BIT )
#define BLIS_BITVAL_INT_TYPE 0x04
#define BLIS_BITVAL_CONST_TYPE 0x05
#define BLIS_BITVAL_NO_TRANS 0x0
#define BLIS_BITVAL_TRANS BLIS_TRANS_BIT
#define BLIS_BITVAL_NO_CONJ 0x0
#define BLIS_BITVAL_CONJ BLIS_CONJ_BIT
#define BLIS_BITVAL_CONJ_TRANS ( BLIS_CONJ_BIT | BLIS_TRANS_BIT )
#define BLIS_BITVAL_ZEROS 0x0
#define BLIS_BITVAL_UPPER ( BLIS_UPPER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_LOWER ( BLIS_LOWER_BIT | BLIS_DIAG_BIT )
#define BLIS_BITVAL_DENSE BLIS_UPLO_BITS
#define BLIS_BITVAL_NONUNIT_DIAG 0x0
#define BLIS_BITVAL_UNIT_DIAG BLIS_UNIT_DIAG_BIT
#define BLIS_BITVAL_INVERT_DIAG BLIS_INVERT_DIAG_BIT
#define BLIS_BITVAL_NOT_PACKED 0x0
#define BLIS_BITVAL_1E ( 0x1 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_1R ( 0x2 << BLIS_PACK_FORMAT_SHIFT )
#define BLIS_BITVAL_PACKED_UNSPEC ( BLIS_PACK_BIT )
#define BLIS_BITVAL_PACKED_ROWS ( BLIS_PACK_BIT )
#define BLIS_BITVAL_PACKED_COLUMNS ( BLIS_PACK_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS ( BLIS_PACK_BIT | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_1E ( BLIS_PACK_BIT | BLIS_BITVAL_1E | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACKED_ROW_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT )
#define BLIS_BITVAL_PACKED_COL_PANELS_1R ( BLIS_PACK_BIT | BLIS_BITVAL_1R | BLIS_PACK_PANEL_BIT | BLIS_PACK_RC_BIT )
#define BLIS_BITVAL_PACK_FWD_IF_UPPER 0x0
#define BLIS_BITVAL_PACK_REV_IF_UPPER BLIS_PACK_REV_IF_UPPER_BIT
#define BLIS_BITVAL_PACK_FWD_IF_LOWER 0x0
#define BLIS_BITVAL_PACK_REV_IF_LOWER BLIS_PACK_REV_IF_LOWER_BIT
#define BLIS_BITVAL_BUFFER_FOR_A_BLOCK 0x0
#define BLIS_BITVAL_BUFFER_FOR_B_PANEL ( 0x1 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_C_PANEL ( 0x2 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_BUFFER_FOR_GEN_USE ( 0x3 << BLIS_PACK_BUFFER_SHIFT )
#define BLIS_BITVAL_GENERAL 0x0
#define BLIS_BITVAL_HERMITIAN ( 0x1 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_SYMMETRIC ( 0x2 << BLIS_STRUC_SHIFT )
#define BLIS_BITVAL_TRIANGULAR ( 0x3 << BLIS_STRUC_SHIFT )

//
// -- BLIS enumerated type definitions -----------------------------------------
//

// -- Operational parameter types --

typedef enum
{
    BLIS_NO_TRANSPOSE = 0x0,
    BLIS_TRANSPOSE = BLIS_BITVAL_TRANS,
    BLIS_CONJ_NO_TRANSPOSE = BLIS_BITVAL_CONJ,
    BLIS_CONJ_TRANSPOSE = BLIS_BITVAL_CONJ_TRANS
} trans_t;

typedef enum
{
    BLIS_NO_CONJUGATE = 0x0,
    BLIS_CONJUGATE = BLIS_BITVAL_CONJ
} conj_t;

typedef enum
{
    BLIS_ZEROS = BLIS_BITVAL_ZEROS,
    BLIS_LOWER = BLIS_BITVAL_LOWER,
    BLIS_UPPER = BLIS_BITVAL_UPPER,
    BLIS_DENSE = BLIS_BITVAL_DENSE
} uplo_t;

// side_t is not bit-encoded: BLIS_RIGHT simply takes the next value after
// BLIS_LEFT.
typedef enum
{
    BLIS_LEFT = 0x0,
    BLIS_RIGHT
} side_t;

typedef enum
{
    BLIS_NONUNIT_DIAG = 0x0,
    BLIS_UNIT_DIAG = BLIS_BITVAL_UNIT_DIAG
} diag_t;

typedef enum
{
    BLIS_NO_INVERT_DIAG = 0x0,
    BLIS_INVERT_DIAG = BLIS_BITVAL_INVERT_DIAG
} invdiag_t;

typedef enum
{
    BLIS_GENERAL = BLIS_BITVAL_GENERAL,
    BLIS_HERMITIAN = BLIS_BITVAL_HERMITIAN,
    BLIS_SYMMETRIC = BLIS_BITVAL_SYMMETRIC,
    BLIS_TRIANGULAR = BLIS_BITVAL_TRIANGULAR
} struc_t;

// -- Data type --

// BLIS_DT_LO / BLIS_DT_HI alias BLIS_FLOAT and BLIS_DCOMPLEX respectively;
// BLIS_INT and BLIS_CONSTANT have larger values than BLIS_DT_HI.
typedef enum
{
    BLIS_FLOAT = BLIS_BITVAL_FLOAT_TYPE,
    BLIS_DOUBLE = BLIS_BITVAL_DOUBLE_TYPE,
    BLIS_SCOMPLEX = BLIS_BITVAL_SCOMPLEX_TYPE,
    BLIS_DCOMPLEX = BLIS_BITVAL_DCOMPLEX_TYPE,
    BLIS_INT = BLIS_BITVAL_INT_TYPE,
    BLIS_CONSTANT = BLIS_BITVAL_CONST_TYPE,
    BLIS_DT_LO = BLIS_FLOAT,
    BLIS_DT_HI = BLIS_DCOMPLEX
} num_t;

typedef enum
{
    BLIS_REAL = BLIS_BITVAL_REAL,
    BLIS_COMPLEX = BLIS_BITVAL_COMPLEX
} dom_t;

typedef enum
{
    BLIS_SINGLE_PREC = BLIS_BITVAL_SINGLE_PREC,
    BLIS_DOUBLE_PREC = BLIS_BITVAL_DOUBLE_PREC
} prec_t;

// -- Pack schema type --

// Note that BLIS_PACKED_UNSPEC, BLIS_PACKED_VECTOR and BLIS_PACKED_ROWS all
// have the same value (BLIS_PACK_BIT alone).
typedef enum
{
    BLIS_NOT_PACKED = BLIS_BITVAL_NOT_PACKED,
    BLIS_PACKED_UNSPEC = BLIS_BITVAL_PACKED_UNSPEC,
    BLIS_PACKED_VECTOR = BLIS_BITVAL_PACKED_UNSPEC,
    BLIS_PACKED_ROWS = BLIS_BITVAL_PACKED_ROWS,
    BLIS_PACKED_COLUMNS = BLIS_BITVAL_PACKED_COLUMNS,
    BLIS_PACKED_ROW_PANELS = BLIS_BITVAL_PACKED_ROW_PANELS,
    BLIS_PACKED_COL_PANELS = BLIS_BITVAL_PACKED_COL_PANELS,
    BLIS_PACKED_ROW_PANELS_1E = BLIS_BITVAL_PACKED_ROW_PANELS_1E,
    BLIS_PACKED_COL_PANELS_1E = BLIS_BITVAL_PACKED_COL_PANELS_1E,
    BLIS_PACKED_ROW_PANELS_1R = BLIS_BITVAL_PACKED_ROW_PANELS_1R,
    BLIS_PACKED_COL_PANELS_1R = BLIS_BITVAL_PACKED_COL_PANELS_1R
} pack_t;

// We combine row and column packing into one "type", and we start
// with BLIS_PACKED_ROW_PANELS, _COLUMN_PANELS.
#define BLIS_NUM_PACK_SCHEMA_TYPES 3

// -- Pack order type --

// Both BLIS_PACK_FWD_* enumerators have the value 0x0.
typedef enum
{
    BLIS_PACK_FWD_IF_UPPER = BLIS_BITVAL_PACK_FWD_IF_UPPER,
    BLIS_PACK_REV_IF_UPPER = BLIS_BITVAL_PACK_REV_IF_UPPER,
    BLIS_PACK_FWD_IF_LOWER = BLIS_BITVAL_PACK_FWD_IF_LOWER,
    BLIS_PACK_REV_IF_LOWER = BLIS_BITVAL_PACK_REV_IF_LOWER
} packord_t;

// -- Pack buffer type --

typedef enum
{
    BLIS_BUFFER_FOR_A_BLOCK = BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
    BLIS_BUFFER_FOR_B_PANEL = BLIS_BITVAL_BUFFER_FOR_B_PANEL,
    BLIS_BUFFER_FOR_C_PANEL = BLIS_BITVAL_BUFFER_FOR_C_PANEL,
    BLIS_BUFFER_FOR_GEN_USE = BLIS_BITVAL_BUFFER_FOR_GEN_USE
} packbuf_t;

// -- Partitioning direction --

typedef enum
{
    BLIS_FWD,
    BLIS_BWD
} dir_t;

// -- Subpartition type --

typedef enum
{
    BLIS_SUBPART0,
    BLIS_SUBPART1,
    BLIS_SUBPART2,
    BLIS_SUBPART1AND0,
    BLIS_SUBPART1AND2,
    BLIS_SUBPART1A,
    BLIS_SUBPART1B,
    BLIS_SUBPART00,
    BLIS_SUBPART10,
    BLIS_SUBPART20,
    BLIS_SUBPART01,
    BLIS_SUBPART11,
    BLIS_SUBPART21,
    BLIS_SUBPART02,
    BLIS_SUBPART12,
    BLIS_SUBPART22
} subpart_t;

// -- Matrix dimension type --

typedef enum
{
    BLIS_M = 0,
    BLIS_N = 1
} mdim_t;

// -- Machine parameter types --

// BLIS_NUM_MACH_PARAMS below must equal the number of enumerators here
// (currently 11, BLIS_MACH_EPS through BLIS_MACH_EPS2).
typedef enum
{
    BLIS_MACH_EPS = 0,
    BLIS_MACH_SFMIN,
    BLIS_MACH_BASE,
    BLIS_MACH_PREC,
    BLIS_MACH_NDIGMANT,
    BLIS_MACH_RND,
    BLIS_MACH_EMIN,
    BLIS_MACH_RMIN,
    BLIS_MACH_EMAX,
    BLIS_MACH_RMAX,
    BLIS_MACH_EPS2
} machval_t;

#define BLIS_NUM_MACH_PARAMS 11
#define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS
#define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2

// -- Induced method types --

// BLIS_IND_FIRST aliases BLIS_1M (both 0) and BLIS_IND_LAST aliases BLIS_NAT.
typedef enum
{
    BLIS_1M = 0,
    BLIS_NAT,
    BLIS_IND_FIRST = 0,
    BLIS_IND_LAST = BLIS_NAT
} ind_t;

#define BLIS_NUM_IND_METHODS (BLIS_NAT+1)

// These are used in bli_l3_*_oapi.c to construct the ind_t values from
// the induced method substrings that go into function names.
#define bli_1m BLIS_1M #define bli_nat BLIS_NAT // -- Kernel ID types -- typedef enum { BLIS_ADDV_KER = 0, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, BLIS_COPYV_KER, BLIS_DOTV_KER, BLIS_DOTXV_KER, BLIS_INVERTV_KER, BLIS_SCALV_KER, BLIS_SCAL2V_KER, BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, BLIS_XPBYV_KER } l1vkr_t; #define BLIS_NUM_LEVEL1V_KERS 14 typedef enum { BLIS_AXPY2V_KER = 0, BLIS_DOTAXPYV_KER, BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER } l1fkr_t; #define BLIS_NUM_LEVEL1F_KERS 5 typedef enum { BLIS_PACKM_0XK_KER = 0, BLIS_PACKM_1XK_KER = 1, BLIS_PACKM_2XK_KER = 2, BLIS_PACKM_3XK_KER = 3, BLIS_PACKM_4XK_KER = 4, BLIS_PACKM_5XK_KER = 5, BLIS_PACKM_6XK_KER = 6, BLIS_PACKM_7XK_KER = 7, BLIS_PACKM_8XK_KER = 8, BLIS_PACKM_9XK_KER = 9, BLIS_PACKM_10XK_KER = 10, BLIS_PACKM_11XK_KER = 11, BLIS_PACKM_12XK_KER = 12, BLIS_PACKM_13XK_KER = 13, BLIS_PACKM_14XK_KER = 14, BLIS_PACKM_15XK_KER = 15, BLIS_PACKM_16XK_KER = 16, BLIS_PACKM_17XK_KER = 17, BLIS_PACKM_18XK_KER = 18, BLIS_PACKM_19XK_KER = 19, BLIS_PACKM_20XK_KER = 20, BLIS_PACKM_21XK_KER = 21, BLIS_PACKM_22XK_KER = 22, BLIS_PACKM_23XK_KER = 23, BLIS_PACKM_24XK_KER = 24, BLIS_PACKM_25XK_KER = 25, BLIS_PACKM_26XK_KER = 26, BLIS_PACKM_27XK_KER = 27, BLIS_PACKM_28XK_KER = 28, BLIS_PACKM_29XK_KER = 29, BLIS_PACKM_30XK_KER = 30, BLIS_PACKM_31XK_KER = 31, BLIS_UNPACKM_0XK_KER = 0, BLIS_UNPACKM_1XK_KER = 1, BLIS_UNPACKM_2XK_KER = 2, BLIS_UNPACKM_3XK_KER = 3, BLIS_UNPACKM_4XK_KER = 4, BLIS_UNPACKM_5XK_KER = 5, BLIS_UNPACKM_6XK_KER = 6, BLIS_UNPACKM_7XK_KER = 7, BLIS_UNPACKM_8XK_KER = 8, BLIS_UNPACKM_9XK_KER = 9, BLIS_UNPACKM_10XK_KER = 10, BLIS_UNPACKM_11XK_KER = 11, BLIS_UNPACKM_12XK_KER = 12, BLIS_UNPACKM_13XK_KER = 13, BLIS_UNPACKM_14XK_KER = 14, BLIS_UNPACKM_15XK_KER = 15, BLIS_UNPACKM_16XK_KER = 16, BLIS_UNPACKM_17XK_KER = 17, BLIS_UNPACKM_18XK_KER = 18, BLIS_UNPACKM_19XK_KER = 19, BLIS_UNPACKM_20XK_KER = 20, BLIS_UNPACKM_21XK_KER = 21, BLIS_UNPACKM_22XK_KER = 22, BLIS_UNPACKM_23XK_KER = 23, 
BLIS_UNPACKM_24XK_KER = 24, BLIS_UNPACKM_25XK_KER = 25, BLIS_UNPACKM_26XK_KER = 26, BLIS_UNPACKM_27XK_KER = 27, BLIS_UNPACKM_28XK_KER = 28, BLIS_UNPACKM_29XK_KER = 29, BLIS_UNPACKM_30XK_KER = 30, BLIS_UNPACKM_31XK_KER = 31 } l1mkr_t; #define BLIS_NUM_PACKM_KERS 32 #define BLIS_NUM_UNPACKM_KERS 32 typedef enum { BLIS_GEMM_UKR = 0, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR } l3ukr_t; #define BLIS_NUM_LEVEL3_UKRS 5 typedef enum { BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, BLIS_NOTAPPLIC_UKERNEL } kimpl_t; #define BLIS_NUM_UKR_IMPL_TYPES 4 #if 0 typedef enum { // RV = row-stored, contiguous vector-loading // RG = row-stored, non-contiguous gather-loading // CV = column-stored, contiguous vector-loading // CG = column-stored, non-contiguous gather-loading // RD = row-stored, dot-based // CD = col-stored, dot-based // RC = row-stored, column-times-column // CR = column-stored, row-times-row // GX = general-stored generic implementation BLIS_GEMMSUP_RV_UKR = 0, BLIS_GEMMSUP_RG_UKR, BLIS_GEMMSUP_CV_UKR, BLIS_GEMMSUP_CG_UKR, BLIS_GEMMSUP_RD_UKR, BLIS_GEMMSUP_CD_UKR, BLIS_GEMMSUP_RC_UKR, BLIS_GEMMSUP_CR_UKR, BLIS_GEMMSUP_GX_UKR, } l3sup_t; #define BLIS_NUM_LEVEL3_SUP_UKRS 9 #endif typedef enum { // 3-operand storage combinations BLIS_RRR = 0, BLIS_RRC, // 1 BLIS_RCR, // 2 BLIS_RCC, // 3 BLIS_CRR, // 4 BLIS_CRC, // 5 BLIS_CCR, // 6 BLIS_CCC, // 7 BLIS_XXX, // 8 #if 0 BLIS_RRG, BLIS_RCG, BLIS_RGR, BLIS_RGC, BLIS_RGG, BLIS_CRG, BLIS_CCG, BLIS_CGR, BLIS_CGC, BLIS_CGG, BLIS_GRR, BLIS_GRC, BLIS_GRG, BLIS_GCR, BLIS_GCC, BLIS_GCG, BLIS_GGR, BLIS_GGC, BLIS_GGG, #endif } stor3_t; #define BLIS_NUM_3OP_RC_COMBOS 9 //#define BLIS_NUM_3OP_RCG_COMBOS 27 #if 0 typedef enum { BLIS_JC_IDX = 0, BLIS_PC_IDX, BLIS_IC_IDX, BLIS_JR_IDX, BLIS_IR_IDX, BLIS_PR_IDX } thridx_t; #endif #define BLIS_NUM_LOOPS 6 // -- Operation ID type -- typedef enum { // // NOTE: If/when additional type values are added to this enum, // you must either: // 
- keep the level-3 values (starting with _GEMM) beginning at // index 0; or // - if the value range is moved such that it does not begin at // index 0, implement something like a BLIS_OPID_LEVEL3_RANGE_START // value that can be subtracted from the opid_t value to map it // to a zero-based range. // This is needed because these level-3 opid_t values are used in // bli_l3_ind.c to index into arrays. // BLIS_GEMM = 0, BLIS_GEMMT, BLIS_HEMM, BLIS_HERK, BLIS_HER2K, BLIS_SYMM, BLIS_SYRK, BLIS_SYR2K, BLIS_TRMM3, BLIS_TRMM, BLIS_TRSM, BLIS_NOID } opid_t; #define BLIS_NUM_LEVEL3_OPS 11 // -- Blocksize ID type -- typedef enum { // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. BLIS_KR = 0, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. } bszid_t; #define BLIS_NUM_BLKSZS 11 // -- Threshold ID type -- typedef enum { BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT // level-3 small/unpacked matrix threshold in k dimension } threshid_t; #define BLIS_NUM_THRESH 3 // -- Architecture ID type -- // NOTE: This typedef enum must be kept up-to-date with the arch_t // string array in bli_arch.c. Whenever values are added/inserted // OR if values are rearranged, be sure to update the string array // in bli_arch.c. typedef enum { // NOTE: The C language standard guarantees that the first enum value // starts at 0. 
// Intel BLIS_ARCH_SKX, BLIS_ARCH_KNL, BLIS_ARCH_KNC, BLIS_ARCH_HASWELL, BLIS_ARCH_SANDYBRIDGE, BLIS_ARCH_PENRYN, // AMD BLIS_ARCH_ZEN3, BLIS_ARCH_ZEN2, BLIS_ARCH_ZEN, BLIS_ARCH_EXCAVATOR, BLIS_ARCH_STEAMROLLER, BLIS_ARCH_PILEDRIVER, BLIS_ARCH_BULLDOZER, // ARM BLIS_ARCH_ARMSVE, BLIS_ARCH_A64FX, BLIS_ARCH_FIRESTORM, BLIS_ARCH_THUNDERX2, BLIS_ARCH_CORTEXA57, BLIS_ARCH_CORTEXA53, BLIS_ARCH_CORTEXA15, BLIS_ARCH_CORTEXA9, // IBM/Power BLIS_ARCH_POWER10, BLIS_ARCH_POWER9, BLIS_ARCH_POWER7, BLIS_ARCH_BGQ, // Generic architecture/configuration BLIS_ARCH_GENERIC, // The total number of defined architectures. This must be last in the // list of enums since its definition assumes that the previous enum // value (BLIS_ARCH_GENERIC) is given index num_archs-1. BLIS_NUM_ARCHS } arch_t; // // -- BLIS misc. structure types ----------------------------------------------- // // This header must be included here (or earlier) because definitions it // provides are needed in the pool_t and related structs. // begin bli_pthread.h #ifndef BLIS_PTHREAD_H #define BLIS_PTHREAD_H // -- Type and macro definitions ----------------------------------------------- #if defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of "dummy" code that doesn't depend on POSIX threads or any other // threading mechanism. See issue #454 to see the use case that prompted this // feature. // NOTE: THIS CODE DOES NOT IMPLEMENT THREADING AND IS NOT THREAD-SAFE! 
// -- pthread types -- typedef int bli_pthread_t; typedef int bli_pthread_attr_t; typedef int bli_pthread_mutex_t; typedef int bli_pthread_mutexattr_t; typedef int bli_pthread_cond_t; typedef int bli_pthread_condattr_t; typedef int bli_pthread_once_t; typedef int bli_pthread_barrier_t; typedef int bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER 0 #define BLIS_PTHREAD_COND_INITIALIZER 0 #define BLIS_PTHREAD_ONCE_INIT 0 #elif defined(_MSC_VER) // !defined(BLIS_DISABLE_SYSTEM) // This branch defines a pthread-like API, bli_pthread_*(), and implements it // in terms of Windows API calls. // -- pthread types -- typedef struct { HANDLE handle; void* retval; } bli_pthread_t; typedef void bli_pthread_attr_t; typedef SRWLOCK bli_pthread_mutex_t; typedef void bli_pthread_mutexattr_t; typedef CONDITION_VARIABLE bli_pthread_cond_t; typedef void bli_pthread_condattr_t; typedef INIT_ONCE bli_pthread_once_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; typedef void bli_pthread_barrierattr_t; // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT #define BLIS_PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT #define BLIS_PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT #else // !defined(BLIS_DISABLE_SYSTEM) && !defined(_MSC_VER) #include // skipped // This branch defines a pthreads-like API, bli_pthreads_*(), and implements it // in terms of the corresponding pthreads_*() types, macros, and function calls. 
// -- pthread types -- typedef pthread_t bli_pthread_t; typedef pthread_attr_t bli_pthread_attr_t; typedef pthread_mutex_t bli_pthread_mutex_t; typedef pthread_mutexattr_t bli_pthread_mutexattr_t; typedef pthread_cond_t bli_pthread_cond_t; typedef pthread_condattr_t bli_pthread_condattr_t; typedef pthread_once_t bli_pthread_once_t; #if defined(__APPLE__) // For OS X, we must define the barrier types ourselves since Apple does // not implement barriers in their variant of pthreads. typedef void bli_pthread_barrierattr_t; typedef struct { bli_pthread_mutex_t mutex; bli_pthread_cond_t cond; int count; int tripCount; } bli_pthread_barrier_t; #else // For other non-Windows OSes (primarily Linux), we can define the barrier // types in terms of existing pthreads barrier types since we expect they // will be provided by the pthreads implementation. typedef pthread_barrier_t bli_pthread_barrier_t; typedef pthread_barrierattr_t bli_pthread_barrierattr_t; #endif // -- pthreads macros -- #define BLIS_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define BLIS_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER #define BLIS_PTHREAD_ONCE_INIT PTHREAD_ONCE_INIT #endif // -- Function definitions ----------------------------------------------------- // -- pthread_create(), pthread_join() -- BLIS_EXPORT_BLIS int bli_pthread_create ( bli_pthread_t* thread, const bli_pthread_attr_t* attr, void* (*start_routine)(void*), void* arg ); BLIS_EXPORT_BLIS int bli_pthread_join ( bli_pthread_t thread, void** retval ); // -- pthread_mutex_*() -- BLIS_EXPORT_BLIS int bli_pthread_mutex_init ( bli_pthread_mutex_t* mutex, const bli_pthread_mutexattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_mutex_destroy ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_lock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_trylock ( bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_mutex_unlock ( bli_pthread_mutex_t* mutex ); // -- pthread_cond_*() -- 
BLIS_EXPORT_BLIS int bli_pthread_cond_init ( bli_pthread_cond_t* cond, const bli_pthread_condattr_t* attr ); BLIS_EXPORT_BLIS int bli_pthread_cond_destroy ( bli_pthread_cond_t* cond ); BLIS_EXPORT_BLIS int bli_pthread_cond_wait ( bli_pthread_cond_t* cond, bli_pthread_mutex_t* mutex ); BLIS_EXPORT_BLIS int bli_pthread_cond_broadcast ( bli_pthread_cond_t* cond ); // -- pthread_once() -- BLIS_EXPORT_BLIS void bli_pthread_once ( bli_pthread_once_t* once, void (*init)(void) ); #if 0 // NOTE: This part of the API is disabled because (1) we don't actually need // _self() or _equal() yet, and (2) when we do try to include these functions, // AppVeyor for some reason fails on all the Windows/clang builds with the // error: // libblis.a(bli_pthread.o) : error LNK2019: unresolved external symbol // __imp_CompareObjectHandles referenced in function bli_pthread_equal // -- pthread_self() -- BLIS_EXPORT_BLIS bli_pthread_t bli_pthread_self ( void ); // -- pthread_equal() -- BLIS_EXPORT_BLIS int bli_pthread_equal ( bli_pthread_t t1, bli_pthread_t t2 ); #endif // -- pthread_barrier_*() -- BLIS_EXPORT_BLIS int bli_pthread_barrier_init ( bli_pthread_barrier_t* barrier, const bli_pthread_barrierattr_t* attr, unsigned int count ); BLIS_EXPORT_BLIS int bli_pthread_barrier_destroy ( bli_pthread_barrier_t* barrier ); BLIS_EXPORT_BLIS int bli_pthread_barrier_wait ( bli_pthread_barrier_t* barrier ); #endif // BLIS_PTHREAD_H // end bli_pthread.h // -- Pool block type -- typedef struct { void* buf; siz_t block_size; } pblk_t; // -- Pool type -- typedef struct { void* block_ptrs; dim_t block_ptrs_len; dim_t top_index; dim_t num_blocks; siz_t block_size; siz_t align_size; siz_t offset_size; malloc_ft malloc_fp; free_ft free_fp; } pool_t; // -- Array type -- typedef struct { void* buf; siz_t num_elem; siz_t elem_size; } array_t; // -- Locked pool-of-arrays-of-pools type -- typedef struct { bli_pthread_mutex_t mutex; pool_t pool; siz_t def_array_len; } apool_t; // -- packing block allocator: 
Locked set of pools type -- typedef struct pba_s { pool_t pools[3]; bli_pthread_mutex_t mutex; // These fields are used for general-purpose allocation. siz_t align_size; malloc_ft malloc_fp; free_ft free_fp; } pba_t; // -- Memory object type -- typedef struct mem_s { pblk_t pblk; packbuf_t buf_type; pool_t* pool; siz_t size; } mem_t; // -- Control tree node type -- struct cntl_s { // Basic fields (usually required). opid_t family; bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; // Optional fields (needed only by some operations such as packm). // NOTE: first field of params must be a uint64_t containing the size // of the struct. void* params; // Internal fields that track "cached" data. mem_t pack_mem; }; typedef struct cntl_s cntl_t; // -- Blocksize object type -- typedef struct blksz_s { // Primary blocksize values. dim_t v[BLIS_NUM_FP_TYPES]; // Blocksize extensions. dim_t e[BLIS_NUM_FP_TYPES]; } blksz_t; // -- Function pointer object type -- typedef struct func_s { // Kernel function address. void_fp ptr[BLIS_NUM_FP_TYPES]; } func_t; // -- Multi-boolean object type -- typedef struct mbool_s { bool v[BLIS_NUM_FP_TYPES]; } mbool_t; // -- Auxiliary kernel info type -- // Note: This struct is used by macro-kernels to package together extra // parameter values that may be of use to the micro-kernel without // cluttering up the micro-kernel interface itself. typedef struct { // The pack schemas of A and B. pack_t schema_a; pack_t schema_b; // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. void* a_next; void* b_next; // The imaginary strides of A and B. inc_t is_a; inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). inc_t ps_a; inc_t ps_b; // The type to convert to on output. 
//num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; void* params; } auxinfo_t; // -- Global scalar constant data struct -- // Note: This struct is used only when statically initializing the // global scalar constants in bli_const.c. typedef struct constdata_s { float s; double d; scomplex c; dcomplex z; gint_t i; } constdata_t; // // -- BLIS object type definitions --------------------------------------------- // // Forward declarations for function pointer types struct obj_s; struct cntx_s; struct rntm_s; struct thrinfo_s; typedef void (*obj_pack_fn_t) ( struct obj_s* a, struct obj_s* ap, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) ( struct obj_s* a, struct obj_s* b, struct obj_s* c, struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, struct thrinfo_s* thread ); typedef struct obj_s { // Basic fields struct obj_s* root; dim_t off[2]; dim_t dim[2]; doff_t diag_off; objbits_t info; objbits_t info2; siz_t elem_size; void* buffer; inc_t rs; inc_t cs; inc_t is; // Bufferless scalar storage atom_t scalar; // Pack-related fields dim_t m_padded; // m dimension of matrix, including any padding dim_t n_padded; // n dimension of matrix, including any padding inc_t ps; // panel stride (distance to next panel) inc_t pd; // panel dimension (the "width" of a panel: // usually MR or NR) dim_t m_panel; // m dimension of a "full" panel dim_t n_panel; // n dimension of a "full" panel // User-customizable fields obj_pack_fn_t pack_fn; void* pack_params; obj_ker_fn_t ker_fn; void* ker_params; } obj_t; // Pre-initializors. 
Things that must be set afterwards: // - root object pointer // - info bitfields: dt, target_dt, exec_dt, comp_dt // - info2 bitfields: scalar_dt // - elem_size // - dims, strides // - buffer // - internal scalar buffer (must always set imaginary component) #define BLIS_OBJECT_INITIALIZER \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 0, 0 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( float ), \ \ .buffer = NULL, \ .rs = 0, \ .cs = 0, \ .is = 1, \ \ .scalar = { 0.0, 0.0 }, \ \ .m_padded = 0, \ .n_padded = 0, \ .ps = 0, \ .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ \ .pack_fn = NULL, \ .pack_params = NULL, \ .ker_fn = NULL, \ .ker_params = NULL \ } #define BLIS_OBJECT_INITIALIZER_1X1 \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( float ), \ \ .buffer = NULL, \ .rs = 0, \ .cs = 0, \ .is = 1, \ \ .scalar = { 0.0, 0.0 }, \ \ .m_padded = 0, \ .n_padded = 0, \ .ps = 0, \ .pd = 0, \ .m_panel = 0, \ .n_panel = 0, \ \ .pack_fn = NULL, \ .pack_params = NULL, \ .ker_fn = NULL, \ .ker_params = NULL \ } // Define these macros here since they must be updated if contents of // obj_t changes. 
// Shallow-copy every field of obj_t from a into b. Pointer members (root,
// buffer, pack_params, ker_params) and the pack/kernel function pointers are
// copied by value, so afterwards b refers to the same storage as a.
BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b )
{
    // A whole-struct assignment transfers all members of obj_t at once,
    // including the off[] and dim[] arrays and the embedded scalar.
    *b = *a;
}

// Initialize b from a for use as a subpartition: every field of a is copied
// into b except the dimensions. b->dim[] is deliberately neither read nor
// written here, since m and n will be overwritten afterwards.
BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b )
{
    const obj_t* src = a;
    obj_t*       dst = b;

    // Root object and the offsets into it.
    dst->root        = src->root;
    dst->off[0]      = src->off[0];
    dst->off[1]      = src->off[1];
    dst->diag_off    = src->diag_off;

    // Property bitfields and element size.
    dst->info        = src->info;
    dst->info2       = src->info2;
    dst->elem_size   = src->elem_size;

    // Underlying buffer, its strides, and the bufferless scalar storage.
    dst->buffer      = src->buffer;
    dst->rs          = src->rs;
    dst->cs          = src->cs;
    dst->is          = src->is;
    dst->scalar      = src->scalar;

    // Pack-related geometry.
    dst->m_padded    = src->m_padded;
    dst->n_padded    = src->n_padded;
    dst->ps          = src->ps;
    dst->pd          = src->pd;
    dst->m_panel     = src->m_panel;
    dst->n_panel     = src->n_panel;

    // User-customizable pack/kernel hooks and their parameter blocks.
    dst->pack_fn     = src->pack_fn;
    dst->pack_params = src->pack_params;
    dst->ker_fn      = src->ker_fn;
    dst->ker_params  = src->ker_params;
}

// Initializors for global scalar constants.
// NOTE: These must remain cpp macros since they are initializor
// expressions, not functions.
#define bli_obj_init_const( buffer0 ) \ { \ .root = NULL, \ \ .off = { 0, 0 }, \ .dim = { 1, 1 }, \ .diag_off = 0, \ \ .info = 0x0 | BLIS_BITVAL_CONST_TYPE | \ BLIS_BITVAL_DENSE | \ BLIS_BITVAL_GENERAL, \ .info2 = 0x0, \ .elem_size = sizeof( constdata_t ), \ \ .buffer = buffer0, \ .rs = 1, \ .cs = 1, \ .is = 1 \ } #define bli_obj_init_constdata( val ) \ { \ .s = ( float )val, \ .d = ( double )val, \ .c = { .real = ( float )val, .imag = 0.0f }, \ .z = { .real = ( double )val, .imag = 0.0 }, \ .i = ( gint_t )val, \ } // -- Context type -- typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; } cntx_t; // -- Runtime type -- // NOTE: The order of these fields must be kept consistent with the definition // of the BLIS_RNTM_INITIALIZER macro in bli_rntm.h. typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. // "Internal" fields: these should not be exposed to the end-user. // The small block pool, which is attached in the l3 thread decorator. pool_t* sba_pool; // The packing block allocator, which is attached in the l3 thread decorator. 
pba_t* pba; } rntm_t; // -- Error types -- typedef enum { BLIS_NO_ERROR_CHECKING = 0, BLIS_FULL_ERROR_CHECKING } errlev_t; typedef enum { // Generic error codes BLIS_SUCCESS = ( -1), BLIS_FAILURE = ( -2), BLIS_ERROR_CODE_MIN = ( -9), // General errors BLIS_INVALID_ERROR_CHECKING_LEVEL = ( -10), BLIS_UNDEFINED_ERROR_CODE = ( -11), BLIS_NULL_POINTER = ( -12), BLIS_NOT_YET_IMPLEMENTED = ( -13), // Parameter-specific errors BLIS_INVALID_SIDE = ( -20), BLIS_INVALID_UPLO = ( -21), BLIS_INVALID_TRANS = ( -22), BLIS_INVALID_CONJ = ( -23), BLIS_INVALID_DIAG = ( -24), BLIS_INVALID_MACHVAL = ( -25), BLIS_EXPECTED_NONUNIT_DIAG = ( -26), // Datatype-specific errors BLIS_INVALID_DATATYPE = ( -30), BLIS_EXPECTED_FLOATING_POINT_DATATYPE = ( -31), BLIS_EXPECTED_NONINTEGER_DATATYPE = ( -32), BLIS_EXPECTED_NONCONSTANT_DATATYPE = ( -33), BLIS_EXPECTED_REAL_DATATYPE = ( -34), BLIS_EXPECTED_INTEGER_DATATYPE = ( -35), BLIS_INCONSISTENT_DATATYPES = ( -36), BLIS_EXPECTED_REAL_PROJ_OF = ( -37), BLIS_EXPECTED_REAL_VALUED_OBJECT = ( -38), BLIS_INCONSISTENT_PRECISIONS = ( -39), // Dimension-specific errors BLIS_NONCONFORMAL_DIMENSIONS = ( -40), BLIS_EXPECTED_SCALAR_OBJECT = ( -41), BLIS_EXPECTED_VECTOR_OBJECT = ( -42), BLIS_UNEQUAL_VECTOR_LENGTHS = ( -43), BLIS_EXPECTED_SQUARE_OBJECT = ( -44), BLIS_UNEXPECTED_OBJECT_LENGTH = ( -45), BLIS_UNEXPECTED_OBJECT_WIDTH = ( -46), BLIS_UNEXPECTED_VECTOR_DIM = ( -47), BLIS_UNEXPECTED_DIAG_OFFSET = ( -48), BLIS_NEGATIVE_DIMENSION = ( -49), // Stride-specific errors BLIS_INVALID_ROW_STRIDE = ( -50), BLIS_INVALID_COL_STRIDE = ( -51), BLIS_INVALID_DIM_STRIDE_COMBINATION = ( -52), // Structure-specific errors BLIS_EXPECTED_GENERAL_OBJECT = ( -60), BLIS_EXPECTED_HERMITIAN_OBJECT = ( -61), BLIS_EXPECTED_SYMMETRIC_OBJECT = ( -62), BLIS_EXPECTED_TRIANGULAR_OBJECT = ( -63), // Storage-specific errors BLIS_EXPECTED_UPPER_OR_LOWER_OBJECT = ( -70), // Partitioning-specific errors BLIS_INVALID_3x1_SUBPART = ( -80), BLIS_INVALID_1x3_SUBPART = ( -81), 
BLIS_INVALID_3x3_SUBPART = ( -82), // Control tree-specific errors BLIS_UNEXPECTED_NULL_CONTROL_TREE = ( -90), // Packing-specific errors BLIS_PACK_SCHEMA_NOT_SUPPORTED_FOR_UNPACK = (-100), // Buffer-specific errors BLIS_EXPECTED_NONNULL_OBJECT_BUFFER = (-110), // Memory errors BLIS_MALLOC_RETURNED_NULL = (-120), // Internal memory pool errors BLIS_INVALID_PACKBUF = (-130), BLIS_EXHAUSTED_CONTIG_MEMORY_POOL = (-131), BLIS_INSUFFICIENT_STACK_BUF_SIZE = (-132), BLIS_ALIGNMENT_NOT_POWER_OF_TWO = (-133), BLIS_ALIGNMENT_NOT_MULT_OF_PTR_SIZE = (-134), // Object-related errors BLIS_EXPECTED_OBJECT_ALIAS = (-140), // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), BLIS_MC_MAX_NONMULTIPLE_OF_MR = (-161), BLIS_NC_DEF_NONMULTIPLE_OF_NR = (-162), BLIS_NC_MAX_NONMULTIPLE_OF_NR = (-163), BLIS_KC_DEF_NONMULTIPLE_OF_KR = (-164), BLIS_KC_MAX_NONMULTIPLE_OF_KR = (-165), BLIS_ERROR_CODE_MAX = (-170) } err_t; #endif // end bli_type_defs.h enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113}; enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; #ifdef __cplusplus extern "C" { #endif BLIS_EXPORT_BLAS float cblas_sdsdot(f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_dsdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS float cblas_sdot(f77_int N, const float *X, f77_int incX, const float *Y, f77_int incY); BLIS_EXPORT_BLAS double cblas_ddot(f77_int N, const double *X, f77_int incX, const double *Y, f77_int incY); BLIS_EXPORT_BLAS void cblas_cdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_cdotc_sub(f77_int N, const void 
*X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS void cblas_zdotu_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotu); BLIS_EXPORT_BLAS void cblas_zdotc_sub(f77_int N, const void *X, f77_int incX, const void *Y, f77_int incY, void *dotc); BLIS_EXPORT_BLAS float cblas_snrm2(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_sasum(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dnrm2(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dasum(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scnrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS float cblas_scasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dznrm2(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS double cblas_dzasum(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_isamax(f77_int N, const float *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_idamax(f77_int N, const double *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_icamax(f77_int N, const void *X, f77_int incX); BLIS_EXPORT_BLAS f77_int cblas_izamax(f77_int N, const void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sswap(f77_int N, float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_scopy(f77_int N, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_saxpy(f77_int N, float alpha, const float *X, f77_int incX, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dswap(f77_int N, double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dcopy(f77_int N, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpy(f77_int N, double alpha, const double *X, f77_int incX, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ccopy(f77_int N, 
const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zswap(f77_int N, void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zcopy(f77_int N, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpy(f77_int N, const void *alpha, const void *X, f77_int incX, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_srotg(float *a, float *b, float *c, float *s); void BLIS_EXPORT_BLAS cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); void BLIS_EXPORT_BLAS cblas_srot(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float c, const float s); void BLIS_EXPORT_BLAS cblas_srotm(f77_int N, float *X, f77_int incX, float *Y, f77_int incY, const float *P); void BLIS_EXPORT_BLAS cblas_drotg(double *a, double *b, double *c, double *s); void BLIS_EXPORT_BLAS cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); void BLIS_EXPORT_BLAS cblas_drot(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double c, const double s); void BLIS_EXPORT_BLAS cblas_drotm(f77_int N, double *X, f77_int incX, double *Y, f77_int incY, const double *P); void BLIS_EXPORT_BLAS cblas_sscal(f77_int N, float alpha, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dscal(f77_int N, double alpha, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zscal(f77_int N, const void *alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_csscal(f77_int N, float alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zdscal(f77_int N, double alpha, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float 
beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const float *A, f77_int lda, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const float *Ap, float *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const double *A, f77_int lda, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const double *Ap, double *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void 
BLIS_EXPORT_BLAS cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, f77_int M, f77_int N, f77_int KL, f77_int KU, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *A, f77_int lda, void *X, 
f77_int incX); void BLIS_EXPORT_BLAS cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, f77_int K, const void *A, f77_int lda, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int N, const void *Ap, void *X, f77_int incX); void BLIS_EXPORT_BLAS cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *Ap, const float *X, f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_sger(enum CBLAS_ORDER order, f77_int M, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, float *Ap); void BLIS_EXPORT_BLAS cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const float *X, f77_int incX, const float *Y, f77_int incY, float *A); void BLIS_EXPORT_BLAS cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *A, f77_int lda, const double *X, 
f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *Ap, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_dger(enum CBLAS_ORDER order, f77_int M, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, double *Ap); void BLIS_EXPORT_BLAS cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const double *X, f77_int incX, const double *Y, f77_int incY, double *A); void BLIS_EXPORT_BLAS cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_cgeru(enum CBLAS_ORDER 
order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, float alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *Ap, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zgeru(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zgerc(enum CBLAS_ORDER order, f77_int M, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zher(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, double alpha, const void *X, f77_int incX, void *A); void BLIS_EXPORT_BLAS cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *A, f77_int lda); void BLIS_EXPORT_BLAS cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, f77_int N, const void *alpha, const void *X, f77_int incX, const void *Y, f77_int incY, void *Ap); void BLIS_EXPORT_BLAS cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, float alpha, const 
float *A, f77_int lda, float *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, double alpha, const double *A, f77_int lda, double *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void 
BLIS_EXPORT_BLAS cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, void *B, f77_int ldb); void BLIS_EXPORT_BLAS cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, float alpha, const void *A, f77_int lda, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, float beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, f77_int M, f77_int N, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, double alpha, const void *A, f77_int lda, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, double beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_xerbla(f77_int p, const char *rout, const char *form, ...); // -- APIs to operations unique to BLIS -- void BLIS_EXPORT_BLAS cblas_saxpby(f77_int N, float alpha, const float *X, 
f77_int incX, float beta, float *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_daxpby(f77_int N, double alpha, const double *X, f77_int incX, double beta, double *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_caxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void* beta, void *Y, f77_int incY); void BLIS_EXPORT_BLAS cblas_zaxpby(f77_int N, const void *alpha, const void *X, f77_int incX, const void *beta, void *Y, f77_int incY); // -- APIs to level-3-like operations -- void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, float alpha, const float *A, f77_int lda, const float *B, f77_int ldb, float beta, float *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, double alpha, const double *A, f77_int lda, const double *B, f77_int ldb, double beta, double *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); // -- Batch APIs -- void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const float *alpha_array, const float **A, f77_int *lda_array, const float **B, f77_int *ldb_array, const float *beta_array, float **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); 
void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const double *alpha_array, const double **A,f77_int *lda_array, const double **B, f77_int *ldb_array, const double *beta_array, double **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE *TransA_array, enum CBLAS_TRANSPOSE *TransB_array, f77_int *M_array, f77_int *N_array, f77_int *K_array, const void *alpha_array, const void **A, f77_int *lda_array, const void **B, f77_int *ldb_array, const void *beta_array, void **C, f77_int *ldc_array, f77_int group_count, f77_int *group_size); // -- 3m APIs -- void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); void BLIS_EXPORT_BLAS cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N, f77_int K, const void *alpha, const void *A, f77_int lda, const void *B, f77_int ldb, const void *beta, void *C, f77_int ldc); #ifdef __cplusplus } #endif #endif // end cblas.h #endif // BLIS_ENABLE_CBLAS #endif // end bli_cblas.h // -- Windows definitions // begin bli_winsys.h //int bli_setenv( const char *name, const char *value, int overwrite ); BLIS_EXPORT_BLIS void bli_sleep( unsigned int secs ); // end 
bli_winsys.h // End extern "C" construct block. #ifdef __cplusplus } #endif #endif cython-blis-0.9.1/blis/_src/kernels/000077500000000000000000000000001427272030600173015ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/armsve/000077500000000000000000000000001427272030600205765ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/armsve/1m/000077500000000000000000000000001427272030600211135ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/armsve/1m/armsve512_asm_transpose_d8x2.h000066400000000000000000000043241427272030600266170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2021, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define SVE512_IN_REG_TRANSPOSE_d8x2(DST0,DST1,DST2,DST3,DST4,DST5,DST6SRC0,DST7SRC1,PT,P2C,P4C,P6C) \ "trn1 " #DST0".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \ "trn2 " #DST1".d, " #DST6SRC0".d, " #DST7SRC1".d \n\t" \ "compact " #DST2".d, " #P2C", " #DST0".d \n\t" \ "compact " #DST3".d, " #P2C", " #DST1".d \n\t" \ "compact " #DST4".d, " #P4C", " #DST0".d \n\t" \ "compact " #DST5".d, " #P4C", " #DST1".d \n\t" \ "compact " #DST6SRC0".d, " #P6C", " #DST0".d \n\t" \ "compact " #DST7SRC1".d, " #P6C", " #DST1".d \n\t" cython-blis-0.9.1/blis/_src/kernels/armsve/1m/armsve512_asm_transpose_d8x8.h000066400000000000000000000113651427272030600266300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2021, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(XTMP,PT,P2C,P4C,P6C,PTFTF,P4,P6) \ "ptrue " #PT".d \n\t" \ "mov " #XTMP", #2 \n\t" \ "whilelo " #P2C".d, xzr, " #XTMP" \n\t" \ "mov " #XTMP", #4 \n\t" \ "whilelo " #P4".d, xzr, " #XTMP" \n\t" \ "mov " #XTMP", #6 \n\t" \ "whilelo " #P6".d, xzr, " #XTMP" \n\t" \ \ "eor " #PTFTF".b, " #PT"/z, " #P6".b, " #P4".b \n\t" /***** o o | o */ \ "orr " #PTFTF".b, " #PT"/z, " #PTFTF".b, " #P2C".b \n\t" /* | o | o */ \ \ "not " #P2C".b, " #PT"/z, " #P2C".b \n\t" \ "not " #P4C".b, " #PT"/z, " #P4".b \n\t" \ "not " #P6C".b, " #PT"/z, " #P6".b \n\t" \ #define SVE512_IN_REG_TRANSPOSE_d8x8(DST0,DST1,DST2,DST3,DST4,DST5,DST6,DST7,SRC0,SRC1,SRC2,SRC3,SRC4,SRC5,SRC6,SRC7,PT,P2C,P4C,P6C,PTFTF,P4,P6) \ "trn1 " #DST0".d, " #SRC0".d, " #SRC1".d \n\t" \ "trn2 " #DST1".d, " #SRC0".d, " #SRC1".d \n\t" \ "trn1 " #DST2".d, " #SRC2".d, " #SRC3".d \n\t" \ "trn2 " #DST3".d, " #SRC2".d, " #SRC3".d \n\t" \ "trn1 " #DST4".d, " #SRC4".d, " #SRC5".d \n\t" \ "trn2 " #DST5".d, " #SRC4".d, " #SRC5".d \n\t" \ "trn1 " #DST6".d, 
" #SRC6".d, " #SRC7".d \n\t" \ "trn2 " #DST7".d, " #SRC6".d, " #SRC7".d \n\t" \ \ "compact " #SRC0".d, " #P2C", " #DST0".d \n\t" \ "compact " #SRC2".d, " #P2C", " #DST1".d \n\t" \ "ext " #SRC1".b, " #SRC1".b, " #DST2".b, #48 \n\t" \ "ext " #SRC3".b, " #SRC3".b, " #DST3".b, #48 \n\t" \ "compact " #SRC4".d, " #P2C", " #DST4".d \n\t" \ "compact " #SRC6".d, " #P2C", " #DST5".d \n\t" \ "ext " #SRC5".b, " #SRC5".b, " #DST6".b, #48 \n\t" \ "ext " #SRC7".b, " #SRC7".b, " #DST7".b, #48 \n\t" \ \ "sel " #DST0".d, " #PTFTF", " #DST0".d, " #SRC1".d \n\t" \ "sel " #DST2".d, " #PTFTF", " #SRC0".d, " #DST2".d \n\t" \ "sel " #DST1".d, " #PTFTF", " #DST1".d, " #SRC3".d \n\t" \ "sel " #DST3".d, " #PTFTF", " #SRC2".d, " #DST3".d \n\t" \ "sel " #DST4".d, " #PTFTF", " #DST4".d, " #SRC5".d \n\t" \ "sel " #DST6".d, " #PTFTF", " #SRC4".d, " #DST6".d \n\t" \ "sel " #DST5".d, " #PTFTF", " #DST5".d, " #SRC7".d \n\t" \ "sel " #DST7".d, " #PTFTF", " #SRC6".d, " #DST7".d \n\t" \ \ "compact " #SRC0".d, " #P4C", " #DST0".d \n\t" \ "compact " #SRC1".d, " #P4C", " #DST1".d \n\t" \ "compact " #SRC2".d, " #P4C", " #DST2".d \n\t" \ "compact " #SRC3".d, " #P4C", " #DST3".d \n\t" \ "ext " #SRC4".b, " #SRC4".b, " #DST4".b, #32 \n\t" \ "ext " #SRC5".b, " #SRC5".b, " #DST5".b, #32 \n\t" \ "ext " #SRC6".b, " #SRC6".b, " #DST6".b, #32 \n\t" \ "ext " #SRC7".b, " #SRC7".b, " #DST7".b, #32 \n\t" \ \ "sel " #DST0".d, " #P4", " #DST0".d, " #SRC4".d \n\t" \ "sel " #DST1".d, " #P4", " #DST1".d, " #SRC5".d \n\t" \ "sel " #DST2".d, " #P4", " #DST2".d, " #SRC6".d \n\t" \ "sel " #DST3".d, " #P4", " #DST3".d, " #SRC7".d \n\t" \ "sel " #DST4".d, " #P4", " #SRC0".d, " #DST4".d \n\t" \ "sel " #DST5".d, " #P4", " #SRC1".d, " #DST5".d \n\t" \ "sel " #DST6".d, " #P4", " #SRC2".d, " #DST6".d \n\t" \ "sel " #DST7".d, " #P4", " #SRC3".d, " #DST7".d \n\t" cython-blis-0.9.1/blis/_src/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c000066400000000000000000000174531427272030600266340ustar00rootroot00000000000000/* BLIS An 
object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Linaro Limited Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #if !defined(BLIS_FAMILY_A64FX) #include // assumption: // SVE vector length = 256 bits. 
//
/*
 * Pack an 8 x n micro-panel of doubles from `a` into `p`, scaling by *kappa.
 *
 *   conja   conjugation flag; only forwarded (cast to trans_t) to
 *           bli_dscal2m_ex on the fallback path.
 *   schema  not read by this kernel.
 *   cdim_   number of rows actually present in `a` for this panel.
 *   n_      number of columns to pack.
 *   n_max_  number of columns in the packed panel; columns n_..n_max_-1 are
 *           zero-filled.
 *   kappa   scalar applied to every packed element.
 *   a       source; element (i,j) is read from a[ i*inca_ + j*lda_ ].
 *   p       destination; column j starts at p + j*ldp_, rows are contiguous.
 *   cntx    forwarded to bli_dscal2m_ex.
 *
 * The vector path assumes a 256-bit SVE vector (4 doubles), so one column of
 * the panel is exactly two vectors.
 */
void bli_dpackm_armsve256_int_8xk
     (
       conj_t           conja,
       pack_t           schema,
       dim_t            cdim_,
       dim_t            n_,
       dim_t            n_max_,
       double* restrict kappa,
       double* restrict a, inc_t inca_, inc_t lda_,
       double* restrict p,              inc_t ldp_,
       cntx_t* restrict cntx
     )
{
    const int64_t mnr   = 8;
    const int64_t cdim  = cdim_;
    const int64_t n     = n_;
    const int64_t n_max = n_max_;
    const int64_t inca  = inca_;
    const int64_t lda   = lda_;
    const int64_t ldp   = ldp_;

    if ( cdim != mnr )
    {
        // Partial panel: let the reference scal2m copy and scale the
        // cdim x n block, then clear the rows that are missing.
        bli_dscal2m_ex
        (
          0,
          BLIS_NONUNIT_DIAG,
          BLIS_DENSE,
          ( trans_t )conja,
          cdim,
          n,
          kappa,
          a, inca, lda,
          p, 1,    ldp,
          cntx,
          NULL
        );

        const dim_t      missing_rows = mnr - cdim;
        double* restrict first_gap    = p + cdim;

        bli_dset0s_mxn( missing_rows, n_max, first_gap, 1, ldp );
    }
    else
    {
        const svbool_t   pg        = svptrue_b64();

        // Byte offsets 0, 1*inca, 2*inca, 3*inca (times sizeof(double)) for
        // the gather loads.
        const svuint64_t byte_offs = svindex_u64( 0, inca * sizeof( double ) );

        double* src = a;
        double* dst = p;

        // The original source notes that plain svld1_f64 loads returned all
        // zeros under qemu-aarch64/gcc, so gather loads are used for the
        // unit-stride case as well as for the strided one. Both cases
        // therefore share the same loop. Stores into the packed panel are
        // ordinary contiguous stores.
        if ( bli_deq1( *kappa ) )
        {
            for ( dim_t left = n; left != 0; --left )
            {
                const svfloat64_t rows_0_3 =
                    svld1_gather_u64offset_f64( pg, src, byte_offs );
                const svfloat64_t rows_4_7 =
                    svld1_gather_u64offset_f64( pg, src + 4 * inca, byte_offs );

                svst1_f64( pg, dst, rows_0_3 );
                svst1_vnum_f64( pg, dst, 1, rows_4_7 );

                src += lda;
                dst += ldp;
            }
        }
        else
        {
            // Broadcast kappa once; lane 0 is used as the multiplier.
            const svfloat64_t z_kappa = svdup_f64( *kappa );

            for ( dim_t left = n; left != 0; --left )
            {
                svfloat64_t rows_0_3 =
                    svld1_gather_u64offset_f64( pg, src, byte_offs );
                svfloat64_t rows_4_7 =
                    svld1_gather_u64offset_f64( pg, src + 4 * inca, byte_offs );

                rows_0_3 = svmul_lane_f64( rows_0_3, z_kappa, 0 );
                rows_4_7 = svmul_lane_f64( rows_4_7, z_kappa, 0 );

                svst1_f64( pg, dst, rows_0_3 );
                svst1_vnum_f64( pg, dst, 1, rows_4_7 );

                src += lda;
                dst += ldp;
            }
        }
    }

    if ( n < n_max )
    {
        // Zero the trailing columns of the packed panel.
        const dim_t      missing_cols = n_max - n;
        double* restrict first_gap    = p + n * ldp;

        bli_dset0s_mxn( mnr, missing_cols, first_gap, 1, ldp );
    }
}

#endif // !defined(BLIS_FAMILY_A64FX)
cython-blis-0.9.1/blis/_src/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c000066400000000000000000000321461427272030600266620ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2021, The University of Tokyo

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "armsve512_asm_transpose_d8x8.h" #include "armsve512_asm_transpose_d8x2.h" #include "../3/armsve_asm_macros.h" // assumption: // SVE vector length = 512 bits. void bli_dpackm_armsve512_asm_10xk ( conj_t conja, pack_t schema, dim_t cdim_, dim_t n_, dim_t n_max_, double* restrict kappa, double* restrict a, inc_t inca_, inc_t lda_, double* restrict p, inc_t ldp_, cntx_t* restrict cntx ) { const int64_t cdim = cdim_; const int64_t mnr = 10; const int64_t n = n_; const int64_t n_max = n_max_; const int64_t inca = inca_; const int64_t lda = lda_; const int64_t ldp = ldp_; const bool gs = inca != 1 && lda != 1; const bool unitk = bli_deq1( *kappa ); #ifdef _A64FX { // Infer whether A or B is being packed. 
if ( schema == BLIS_PACKED_ROWS ) p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p; if ( schema == BLIS_PACKED_COLUMNS ) p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p; } #endif if ( cdim == mnr && !gs && unitk ) { uint64_t n_mker = n / 8; uint64_t n_left = n % 8; __asm__ volatile ( "mov x0, %[a] \n\t" "mov x1, %[p] \n\t" "mov x2, %[ldp] \n\t" "mov x3, %[lda] \n\t" "mov x4, %[inca] \n\t" "cmp x4, #1 \n\t" // Skips by sizeof(double). "mov x8, #8 \n\t" "madd x2, x2, x8, xzr \n\t" "madd x3, x3, x8, xzr \n\t" "madd x4, x4, x8, xzr \n\t" // Loop constants. "mov x8, %[n_mker] \n\t" "mov x9, %[n_left] \n\t" "ptrue p0.d \n\t" BNE(AROWSTOR) // A stored in columns. LABEL(ACOLSTOR) // Prefetch distance. "mov x17, #8 \n\t" "madd x17, x17, x3, xzr \n\t" #ifdef _A64FX // Disable hardware prefetch for A. "mov x16, 0x6 \n\t" "lsl x16, x16, #60 \n\t" "orr x0, x0, x16 \n\t" #endif LABEL(ACOLSTORMKER) "cmp x8, xzr \n\t" BEQ(ACOLSTORMKEREND) "add x5, x0, x3 \n\t" "add x6, x5, x3 \n\t" "add x7, x6, x3 \n\t" "ld1d z0.d, p0/z, [x0] \n\t" "ldr q1, [x0, #64] \n\t" "ld1d z2.d, p0/z, [x5] \n\t" "ldr q3, [x5, #64] \n\t" "ld1d z4.d, p0/z, [x6] \n\t" "ldr q5, [x6, #64] \n\t" "ld1d z6.d, p0/z, [x7] \n\t" "ldr q7, [x7, #64] \n\t" "add x18, x17, x0 \n\t" "prfm PLDL1STRM, [x18] \n\t" "add x18, x17, x5 \n\t" "prfm PLDL1STRM, [x18] \n\t" "add x18, x17, x6 \n\t" "prfm PLDL1STRM, [x18] \n\t" "add x18, x17, x7 \n\t" "prfm PLDL1STRM, [x18] \n\t" "add x0, x7, x3 \n\t" "add x5, x0, x3 \n\t" "add x6, x5, x3 \n\t" "add x7, x6, x3 \n\t" "ld1d z8.d, p0/z, [x0] \n\t" "ldr q9, [x0, #64] \n\t" "ld1d z10.d, p0/z, [x5] \n\t" "ldr q11, [x5, #64] \n\t" "ld1d z12.d, p0/z, [x6] \n\t" "ldr q13, [x6, #64] \n\t" "ld1d z14.d, p0/z, [x7] \n\t" "ldr q15, [x7, #64] \n\t" "add x18, x17, x0 \n\t" "prfm PLDL1STRM, [x18] \n\t" "add x18, x17, x5 \n\t" "prfm PLDL1STRM, [x18] \n\t" "add x18, x17, x6 \n\t" "prfm PLDL1STRM, [x18] \n\t" "add x18, x17, x7 \n\t" "prfm PLDL1STRM, [x18] \n\t" // Plain storage "add x10, x1, x2 \n\t" "add x11, x10, x2 
\n\t" "add x12, x11, x2 \n\t" "add x13, x12, x2 \n\t" "add x14, x13, x2 \n\t" "add x15, x14, x2 \n\t" "add x16, x15, x2 \n\t" "st1d z0.d, p0, [x1] \n\t" "str q1, [x1, #64] \n\t" "st1d z2.d, p0, [x10] \n\t" "str q3, [x10, #64] \n\t" "st1d z4.d, p0, [x11] \n\t" "str q5, [x11, #64] \n\t" "st1d z6.d, p0, [x12] \n\t" "str q7, [x12, #64] \n\t" "st1d z8.d, p0, [x13] \n\t" "str q9, [x13, #64] \n\t" "st1d z10.d, p0, [x14] \n\t" "str q11, [x14, #64] \n\t" "st1d z12.d, p0, [x15] \n\t" "str q13, [x15, #64] \n\t" "st1d z14.d, p0, [x16] \n\t" "str q15, [x16, #64] \n\t" "add x1, x16, x2 \n\t" // Realign and store. // "ext z1.b, z1.b, z1.b, #16 \n\t" // "ext z1.b, z1.b, z2.b, #48 \n\t" // "ext z2.b, z2.b, z3.b, #16 \n\t" // "ext z2.b, z2.b, z4.b, #32 \n\t" // "ext z4.b, z4.b, z5.b, #16 \n\t" // "ext z4.b, z4.b, z6.b, #16 \n\t" // "ext z6.b, z6.b, z7.b, #16 \n\t" // "ext z9.b, z9.b, z9.b, #16 \n\t" // "ext z9.b, z9.b, z10.b, #48 \n\t" // "ext z10.b, z10.b, z11.b, #16 \n\t" // "ext z10.b, z10.b, z12.b, #32 \n\t" // "ext z12.b, z12.b, z13.b, #16 \n\t" // "ext z12.b, z12.b, z14.b, #16 \n\t" // "ext z14.b, z14.b, z15.b, #16 \n\t" // "st1d z0.d, p0, [x1] \n\t" // "st1d z1.d, p0, [x1, #1, mul vl] \n\t" // "st1d z2.d, p0, [x1, #2, mul vl] \n\t" // "st1d z4.d, p0, [x1, #3, mul vl] \n\t" // "st1d z6.d, p0, [x1, #4, mul vl] \n\t" // "add x1, x1, #320 \n\t" // "st1d z8.d, p0, [x1] \n\t" // "st1d z9.d, p0, [x1, #1, mul vl] \n\t" // "st1d z10.d, p0, [x1, #2, mul vl] \n\t" // "st1d z12.d, p0, [x1, #3, mul vl] \n\t" // "st1d z14.d, p0, [x1, #4, mul vl] \n\t" // "add x1, x1, #320 \n\t" "add x0, x7, x3 \n\t" "sub x8, x8, #1 \n\t" BRANCH(ACOLSTORMKER) LABEL(ACOLSTORMKEREND) LABEL(ACOLSTORLEFT) "cmp x9, xzr \n\t" BEQ(UNITKDONE) "ld1d z0.d, p0/z, [x0] \n\t" "ldr q1, [x0, #64] \n\t" "st1d z0.d, p0, [x1] \n\t" "str q1, [x1, #64] \n\t" "add x0, x0, x3 \n\t" "add x1, x1, x2 \n\t" "sub x9, x9, #1 \n\t" BRANCH(ACOLSTORLEFT) // A stored in rows. LABEL(AROWSTOR) // Prepare predicates for in-reg transpose. 
SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6) LABEL(AROWSTORMKER) // X[10-16] for A here not P. Be careful. "cmp x8, xzr \n\t" BEQ(AROWSTORMKEREND) "add x10, x0, x4 \n\t" "add x11, x10, x4 \n\t" "add x12, x11, x4 \n\t" "add x13, x12, x4 \n\t" "add x14, x13, x4 \n\t" "add x15, x14, x4 \n\t" "add x16, x15, x4 \n\t" "add x17, x16, x4 \n\t" "add x18, x17, x4 \n\t" "ld1d z0.d, p0/z, [x0] \n\t" "ld1d z1.d, p0/z, [x10] \n\t" "ld1d z2.d, p0/z, [x11] \n\t" "ld1d z3.d, p0/z, [x12] \n\t" "ld1d z4.d, p0/z, [x13] \n\t" "ld1d z5.d, p0/z, [x14] \n\t" "ld1d z6.d, p0/z, [x15] \n\t" "ld1d z7.d, p0/z, [x16] \n\t" "ld1d z22.d, p0/z, [x17] \n\t" "ld1d z23.d, p0/z, [x18] \n\t" // Transpose first 8 rows. SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6) // Transpose last 2 rows. SVE512_IN_REG_TRANSPOSE_d8x2(z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3) // Plain storage. "add x10, x1, x2 \n\t" "add x11, x10, x2 \n\t" "add x12, x11, x2 \n\t" "add x13, x12, x2 \n\t" "add x14, x13, x2 \n\t" "add x15, x14, x2 \n\t" "add x16, x15, x2 \n\t" "st1d z8.d, p0, [x1] \n\t" "str q16, [x1, #64] \n\t" "st1d z9.d, p0, [x10] \n\t" "str q17, [x10, #64] \n\t" "st1d z10.d, p0, [x11] \n\t" "str q18, [x11, #64] \n\t" "st1d z11.d, p0, [x12] \n\t" "str q19, [x12, #64] \n\t" "st1d z12.d, p0, [x13] \n\t" "str q20, [x13, #64] \n\t" "st1d z13.d, p0, [x14] \n\t" "str q21, [x14, #64] \n\t" "st1d z14.d, p0, [x15] \n\t" "str q22, [x15, #64] \n\t" "st1d z15.d, p0, [x16] \n\t" "str q23, [x16, #64] \n\t" "add x1, x16, x2 \n\t" "add x0, x0, #64 \n\t" "sub x8, x8, #1 \n\t" BRANCH(AROWSTORMKER) LABEL(AROWSTORMKEREND) "mov x4, %[inca] \n\t" // Restore unshifted inca. "index z30.d, xzr, x4 \n\t" // Generate index. "lsl x4, x4, #3 \n\t" // Shift again. "lsl x5, x4, #3 \n\t" // Virtual column vl. 
LABEL(AROWSTORLEFT) "cmp x9, xzr \n\t" BEQ(UNITKDONE) "add x6, x0, x5 \n\t" "add x7, x6, x4 \n\t" "ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t" "ldr d1, [x6] \n\t" "ldr d2, [x7] \n\t" "trn1 v1.2d, v1.2d, v2.2d \n\t" "st1d z0.d, p0, [x1] \n\t" "str q1, [x1, #64] \n\t" "add x1, x1, x2 \n\t" "add x0, x0, #8 \n\t" "sub x9, x9, #1 \n\t" BRANCH(AROWSTORLEFT) LABEL(UNITKDONE) "mov x0, #0 \n\t" : : [a] "r" (a), [p] "r" (p), [lda] "r" (lda), [ldp] "r" (ldp), [inca] "r" (inca), [n_mker] "r" (n_mker), [n_left] "r" (n_left) : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10","x11","x12","x13","x14","x15", "x16","x17","x18", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10","z11","z12","z13","z14","z15", "z16","z17","z18","z19","z20","z21","z22","z23", // "z24","z25","z26","z27","z28","z29", "z30","z31", "p0", "p1", "p2", "p3", "p4", // "p5", "p6", "p7", "p8" ); } else // if ( cdim < mnr ) { bli_dscal2m_ex ( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, ( trans_t )conja, cdim, n, kappa, a, inca, lda, p, 1, ldp, cntx, NULL ); // if ( cdim < mnr ) { const dim_t i = cdim; const dim_t m_edge = mnr - i; const dim_t n_edge = n_max; double* restrict p_edge = p + (i )*1; bli_dset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } if ( n < n_max ) { const dim_t j = n; const dim_t m_edge = mnr; const dim_t n_edge = n_max - j; double* restrict p_edge = p + (j )*ldp; bli_dset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } cython-blis-0.9.1/blis/_src/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c000066400000000000000000000325511427272030600266700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2021, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "armsve512_asm_transpose_d8x8.h" #include "../3/armsve_asm_macros.h" // assumption: // SVE vector length = 512 bits. 
void bli_dpackm_armsve512_asm_16xk ( conj_t conja, pack_t schema, dim_t cdim_, dim_t n_, dim_t n_max_, double* restrict kappa, double* restrict a, inc_t inca_, inc_t lda_, double* restrict p, inc_t ldp_, cntx_t* restrict cntx ) { const int64_t cdim = cdim_; const int64_t mnr = 16; const int64_t n = n_; const int64_t n_max = n_max_; const int64_t inca = inca_; const int64_t lda = lda_; const int64_t ldp = ldp_; const bool gs = inca != 1 && lda != 1; const bool unitk = bli_deq1( *kappa ); #ifdef _A64FX { // Infer whether A or B is being packed. if ( schema == BLIS_PACKED_ROWS ) p = ( (uint64_t)0x1 << 56 ) | (uint64_t)p; if ( schema == BLIS_PACKED_COLUMNS ) p = ( (uint64_t)0x2 << 56 ) | (uint64_t)p; } #endif if ( cdim == mnr && !gs && unitk ) { uint64_t n_mker = n / 8; uint64_t n_left = n % 8; __asm__ volatile ( "mov x0, %[a] \n\t" "mov x1, %[p] \n\t" "mov x2, %[ldp] \n\t" "mov x3, %[lda] \n\t" "mov x4, %[inca] \n\t" "cmp x4, #1 \n\t" // Skips by sizeof(double). "mov x8, #8 \n\t" "madd x2, x2, x8, xzr \n\t" "madd x3, x3, x8, xzr \n\t" "madd x4, x4, x8, xzr \n\t" // "mov x8, 0x8 \n\t" // Control#0 for A address. // "mov x8, 0x24 \n\t" // Higher 6bit for Control#0: // "lsl x8, x8, #58 \n\t" // Valid|Strong|Strong|Alloc|Load|Strong // "orr x8, x8, x3 \n\t" // Stride. // "msr S3_3_C11_C6_0, x8 \n\t" // Write system register. // Loop constants. "mov x8, %[n_mker] \n\t" "mov x9, %[n_left] \n\t" "ptrue p0.d \n\t" BNE(AROWSTOR) // A stored in columns. LABEL(ACOLSTOR) // Prefetch distance. "mov x17, #8 \n\t" "madd x17, x17, x3, xzr \n\t" #ifdef _A64FX "mov x16, 0x6 \n\t" // Disable hardware prefetch for A. 
"lsl x16, x16, #60 \n\t" "orr x0, x0, x16 \n\t" #endif // "add x5, x0, x3 \n\t" // "add x6, x5, x3 \n\t" // "add x7, x6, x3 \n\t" // "prfm PLDL1STRM, [x0] \n\t" // "prfm PLDL1STRM, [x5] \n\t" // "prfm PLDL1STRM, [x6] \n\t" // "prfm PLDL1STRM, [x7] \n\t" // "add x18, x7, x3 \n\t" // "add x5, x18, x3 \n\t" // "add x6, x5, x3 \n\t" // "add x7, x6, x3 \n\t" // "prfm PLDL1STRM, [x18] \n\t" // "prfm PLDL1STRM, [x5] \n\t" // "prfm PLDL1STRM, [x6] \n\t" // "prfm PLDL1STRM, [x7] \n\t" LABEL(ACOLSTORMKER) "cmp x8, xzr \n\t" BEQ(ACOLSTORMKEREND) "add x5, x0, x3 \n\t" "add x6, x5, x3 \n\t" "add x7, x6, x3 \n\t" "add x10, x1, x2 \n\t" "add x11, x10, x2 \n\t" "add x12, x11, x2 \n\t" "add x13, x12, x2 \n\t" "add x14, x13, x2 \n\t" "add x15, x14, x2 \n\t" "add x16, x15, x2 \n\t" "ld1d z0.d, p0/z, [x0] \n\t" "ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t" "ld1d z2.d, p0/z, [x5] \n\t" "ld1d z3.d, p0/z, [x5, #1, mul vl] \n\t" "ld1d z4.d, p0/z, [x6] \n\t" "ld1d z5.d, p0/z, [x6, #1, mul vl] \n\t" "ld1d z6.d, p0/z, [x7] \n\t" "ld1d z7.d, p0/z, [x7, #1, mul vl] \n\t" "add x18, x17, x0 \n\t" "prfm PLDL1STRM, [x18] \n\t" "add x18, x17, x5 \n\t" "prfm PLDL1STRM, [x18] \n\t" "add x18, x17, x6 \n\t" "prfm PLDL1STRM, [x18] \n\t" "add x18, x17, x7 \n\t" "prfm PLDL1STRM, [x18] \n\t" "add x0, x7, x3 \n\t" "add x5, x0, x3 \n\t" "add x6, x5, x3 \n\t" "add x7, x6, x3 \n\t" "ld1d z8.d, p0/z, [x0] \n\t" "ld1d z9.d, p0/z, [x0, #1, mul vl] \n\t" "ld1d z10.d, p0/z, [x5] \n\t" "ld1d z11.d, p0/z, [x5, #1, mul vl] \n\t" "ld1d z12.d, p0/z, [x6] \n\t" "ld1d z13.d, p0/z, [x6, #1, mul vl] \n\t" "ld1d z14.d, p0/z, [x7] \n\t" "ld1d z15.d, p0/z, [x7, #1, mul vl] \n\t" "add x18, x17, x0 \n\t" "prfm PLDL1STRM, [x18] \n\t" "add x18, x17, x5 \n\t" "prfm PLDL1STRM, [x18] \n\t" "add x18, x17, x6 \n\t" "prfm PLDL1STRM, [x18] \n\t" "add x18, x17, x7 \n\t" "prfm PLDL1STRM, [x18] \n\t" "st1d z0.d, p0, [x1] \n\t" "st1d z1.d, p0, [x1, #1, mul vl] \n\t" "st1d z2.d, p0, [x10] \n\t" "st1d z3.d, p0, [x10, #1, mul vl] \n\t" "st1d z4.d, 
p0, [x11] \n\t" "st1d z5.d, p0, [x11, #1, mul vl] \n\t" "st1d z6.d, p0, [x12] \n\t" "st1d z7.d, p0, [x12, #1, mul vl] \n\t" "st1d z8.d, p0, [x13] \n\t" "st1d z9.d, p0, [x13, #1, mul vl] \n\t" "st1d z10.d, p0, [x14] \n\t" "st1d z11.d, p0, [x14, #1, mul vl] \n\t" "st1d z12.d, p0, [x15] \n\t" "st1d z13.d, p0, [x15, #1, mul vl] \n\t" "st1d z14.d, p0, [x16] \n\t" "st1d z15.d, p0, [x16, #1, mul vl] \n\t" "add x0, x7, x3 \n\t" "add x1, x16, x2 \n\t" "sub x8, x8, #1 \n\t" BRANCH(ACOLSTORMKER) LABEL(ACOLSTORMKEREND) LABEL(ACOLSTORLEFT) "cmp x9, xzr \n\t" BEQ(UNITKDONE) "ld1d z0.d, p0/z, [x0] \n\t" "ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t" "st1d z0.d, p0, [x1] \n\t" "st1d z1.d, p0, [x1, #1, mul vl] \n\t" "add x0, x0, x3 \n\t" "add x1, x1, x2 \n\t" "sub x9, x9, #1 \n\t" BRANCH(ACOLSTORLEFT) // A stored in rows. LABEL(AROWSTOR) // Prepare predicates for in-reg transpose. SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6) LABEL(AROWSTORMKER) // X[10-16] for A here not P. Be careful. "cmp x8, xzr \n\t" BEQ(AROWSTORMKEREND) "add x10, x0, x4 \n\t" "add x11, x10, x4 \n\t" "add x12, x11, x4 \n\t" "add x13, x12, x4 \n\t" "add x14, x13, x4 \n\t" "add x15, x14, x4 \n\t" "add x16, x15, x4 \n\t" "ld1d z0.d, p0/z, [x0] \n\t" "ld1d z1.d, p0/z, [x10] \n\t" "ld1d z2.d, p0/z, [x11] \n\t" "ld1d z3.d, p0/z, [x12] \n\t" "ld1d z4.d, p0/z, [x13] \n\t" "ld1d z5.d, p0/z, [x14] \n\t" "ld1d z6.d, p0/z, [x15] \n\t" "ld1d z7.d, p0/z, [x16] \n\t" "add x5, x16, x4 \n\t" "add x10, x5, x4 \n\t" "add x11, x10, x4 \n\t" "add x12, x11, x4 \n\t" "add x13, x12, x4 \n\t" "add x14, x13, x4 \n\t" "add x15, x14, x4 \n\t" "add x16, x15, x4 \n\t" "ld1d z16.d, p0/z, [x5] \n\t" "ld1d z17.d, p0/z, [x10] \n\t" "ld1d z18.d, p0/z, [x11] \n\t" "ld1d z19.d, p0/z, [x12] \n\t" "ld1d z20.d, p0/z, [x13] \n\t" "ld1d z21.d, p0/z, [x14] \n\t" "ld1d z22.d, p0/z, [x15] \n\t" "ld1d z23.d, p0/z, [x16] \n\t" // Transpose first 8 rows. 
SVE512_IN_REG_TRANSPOSE_d8x8(z8,z9,z10,z11,z12,z13,z14,z15,z0,z1,z2,z3,z4,z5,z6,z7,p0,p1,p2,p3,p8,p4,p6) // Transpose last 8 rows. SVE512_IN_REG_TRANSPOSE_d8x8(z24,z25,z26,z27,z28,z29,z30,z31,z16,z17,z18,z19,z20,z21,z22,z23,p0,p1,p2,p3,p8,p4,p6) "add x10, x1, x2 \n\t" "add x11, x10, x2 \n\t" "add x12, x11, x2 \n\t" "add x13, x12, x2 \n\t" "add x14, x13, x2 \n\t" "add x15, x14, x2 \n\t" "add x16, x15, x2 \n\t" "st1d z8.d, p0, [x1] \n\t" "st1d z24.d, p0, [x1, #1, mul vl] \n\t" "st1d z9.d, p0, [x10] \n\t" "st1d z25.d, p0, [x10, #1, mul vl] \n\t" "st1d z10.d, p0, [x11] \n\t" "st1d z26.d, p0, [x11, #1, mul vl] \n\t" "st1d z11.d, p0, [x12] \n\t" "st1d z27.d, p0, [x12, #1, mul vl] \n\t" "st1d z12.d, p0, [x13] \n\t" "st1d z28.d, p0, [x13, #1, mul vl] \n\t" "st1d z13.d, p0, [x14] \n\t" "st1d z29.d, p0, [x14, #1, mul vl] \n\t" "st1d z14.d, p0, [x15] \n\t" "st1d z30.d, p0, [x15, #1, mul vl] \n\t" "st1d z15.d, p0, [x16] \n\t" "st1d z31.d, p0, [x16, #1, mul vl] \n\t" "add x0, x0, #64 \n\t" "add x1, x16, x2 \n\t" "sub x8, x8, #1 \n\t" BRANCH(AROWSTORMKER) LABEL(AROWSTORMKEREND) "mov x4, %[inca] \n\t" // Restore unshifted inca. "index z30.d, xzr, x4 \n\t" // Generate index. "lsl x4, x4, #3 \n\t" // Shift again. "lsl x5, x4, #3 \n\t" // Virtual column vl. 
LABEL(AROWSTORLEFT) "cmp x9, xzr \n\t" BEQ(UNITKDONE) "add x6, x0, x5 \n\t" "ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t" "ld1d z1.d, p0/z, [x6, z30.d, lsl #3] \n\t" "st1d z0.d, p0, [x1] \n\t" "st1d z1.d, p0, [x1, #1, mul vl] \n\t" "add x1, x1, x2 \n\t" "add x0, x0, #8 \n\t" "sub x9, x9, #1 \n\t" BRANCH(AROWSTORLEFT) LABEL(UNITKDONE) "mov x0, #0 \n\t" : : [a] "r" (a), [p] "r" (p), [lda] "r" (lda), [ldp] "r" (ldp), [inca] "r" (inca), [n_mker] "r" (n_mker), [n_left] "r" (n_left) : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10","x11","x12","x13","x14","x15", "x16","x17","x18", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10","z11","z12","z13","z14","z15", // "z16","z17","z18","z19","z20","z21","z22","z23", // "z24","z25","z26","z27","z28","z29","z30","z31", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7" ); } else // if ( cdim < mnr ) { bli_dscal2m_ex ( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, ( trans_t )conja, cdim, n, kappa, a, inca, lda, p, 1, ldp, cntx, NULL ); // if ( cdim < mnr ) { const dim_t i = cdim; const dim_t m_edge = mnr - i; const dim_t n_edge = n_max; double* restrict p_edge = p + (i )*1; bli_dset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } if ( n < n_max ) { const dim_t j = n; const dim_t m_edge = mnr; const dim_t n_edge = n_max - j; double* restrict p_edge = p + (j )*ldp; bli_dset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } cython-blis-0.9.1/blis/_src/kernels/armsve/1m/old/000077500000000000000000000000001427272030600216715ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/armsve/1m/old/bli_dpackm_armsve512_int_12xk.c000066400000000000000000000326531427272030600274570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Linaro Limited Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include #if !defined(BLIS_FAMILY_A64FX) #include // assumption: // SVE vector length = 512 bits. // TODO: // 2-rows -> 3 vectors packing and use predicator only in odd num of rows to be packed. // prefetching is needed. 
// Load up to 8 doubles under predicate pg, starting at `base`.
// With unit_stride the lanes come from consecutive addresses; otherwise lane
// i is gathered from the byte offset byte_offsets[i] relative to `base`.
static inline svfloat64_t bli_dpackm_armsve512_int_12xk_load
     (
       svbool_t      pg,
       const double* base,
       bool          unit_stride,
       svuint64_t    byte_offsets
     )
{
    return unit_stride ? svld1_f64( pg, base )
                       : svld1_gather_u64offset_f64( pg, base, byte_offsets );
}

// Multiply every lane of v by lane 0 of z_kappa unless kappa is one.
static inline svfloat64_t bli_dpackm_armsve512_int_12xk_scale
     (
       svfloat64_t v,
       svfloat64_t z_kappa,
       bool        unit_kappa
     )
{
    return unit_kappa ? v : svmul_lane_f64( v, z_kappa, 0 );
}

/*
 * Pack a 12 x n micro-panel of doubles from `a` into `p`, scaling by *kappa.
 *
 *   conja   conjugation flag; only forwarded (cast to trans_t) to
 *           bli_dscal2m_ex on the fallback path.
 *   schema  not read by this kernel.
 *   cdim_   number of rows actually present in `a` for this panel.
 *   n_      number of columns to pack.
 *   n_max_  number of columns in the packed panel; columns n_..n_max_-1 are
 *           zero-filled.
 *   kappa   scalar applied to every packed element.
 *   a       source; element (i,j) is read from a[ i*inca_ + j*lda_ ].
 *   p       destination; column j starts at p + j*ldp_, rows are contiguous.
 *   cntx    forwarded to bli_dscal2m_ex.
 *
 * The vector path assumes a 512-bit SVE vector (8 doubles). When ldp_ == 12
 * two packed columns are exactly three vectors, so columns are processed in
 * pairs; otherwise, and for a leftover odd column, each column is written
 * as one full vector plus one half-active vector.
 */
void bli_dpackm_armsve512_int_12xk
     (
       conj_t           conja,
       pack_t           schema,
       dim_t            cdim_,
       dim_t            n_,
       dim_t            n_max_,
       double* restrict kappa,
       double* restrict a, inc_t inca_, inc_t lda_,
       double* restrict p,              inc_t ldp_,
       cntx_t* restrict cntx
     )
{
    const int64_t cdim  = cdim_;
    const int64_t mnr   = 12;
    const int64_t n     = n_;
    const int64_t n_max = n_max_;
    const int64_t inca  = inca_;
    const int64_t lda   = lda_;
    const int64_t ldp   = ldp_;

    if ( cdim == mnr )
    {
        const svbool_t all_active        = svptrue_b64();
        const svbool_t first_half_active = svwhilelt_b64(0, 4);
        const svbool_t last_half_active  = svnot_z(all_active, first_half_active);

        // Byte offsets 0, 1*inca, 2*inca, ... for the gather loads.
        const svuint64_t z_index = svindex_u64( 0, inca * sizeof( double ) );

        const bool        unit_stride = ( inca == 1 );
        const bool        unit_kappa  = bli_deq1( *kappa );
        const svfloat64_t z_kappa     = svdup_f64( *kappa );

        double* alpha1 = a;
        double* pi1    = p;
        dim_t   k      = n;

        // Two columns packed into three vectors.
        if ( ldp == mnr )
        {
            for ( ; k > 1; k -= 2 )
            {
                double* col0 = alpha1;
                double* col1 = alpha1 + lda;

                // Vector 0: rows 0-7 of column 0.
                svfloat64_t z_a0 = bli_dpackm_armsve512_int_12xk_load
                    ( all_active, col0, unit_stride, z_index );
                // Vector 1, lanes 0-3: rows 8-11 of column 0.
                svfloat64_t z_a8 = bli_dpackm_armsve512_int_12xk_load
                    ( first_half_active, col0 + 8 * inca, unit_stride, z_index );
                // Vector 1, lanes 4-7: rows 0-3 of column 1. The base is
                // moved back by four rows so that lane i reads row i-4.
                svfloat64_t z_a8_lh = bli_dpackm_armsve512_int_12xk_load
                    ( last_half_active, col1 - 4 * inca, unit_stride, z_index );
                // Vector 2: rows 4-11 of column 1.
                svfloat64_t z_a16 = bli_dpackm_armsve512_int_12xk_load
                    ( all_active, col1 + 4 * inca, unit_stride, z_index );

                // Merge the two halves lane-wise. A select copies the bits
                // unchanged, whereas adding the zero-filled halves would
                // turn -0.0 into +0.0.
                z_a8 = svsel_f64( first_half_active, z_a8, z_a8_lh );

                z_a0  = bli_dpackm_armsve512_int_12xk_scale( z_a0,  z_kappa, unit_kappa );
                z_a8  = bli_dpackm_armsve512_int_12xk_scale( z_a8,  z_kappa, unit_kappa );
                z_a16 = bli_dpackm_armsve512_int_12xk_scale( z_a16, z_kappa, unit_kappa );

                svst1_f64( all_active, pi1, z_a0 );
                svst1_vnum_f64( all_active, pi1, 1, z_a8 );
                svst1_vnum_f64( all_active, pi1, 2, z_a16 );

                alpha1 += 2 * lda;
                pi1    += 2 * ldp;
            }
        }

        // Column-by-column packing (all columns when ldp != 12, otherwise
        // at most one leftover column).
        for ( ; k != 0; --k )
        {
            svfloat64_t z_a0 = bli_dpackm_armsve512_int_12xk_load
                ( all_active, alpha1, unit_stride, z_index );
            svfloat64_t z_a8 = bli_dpackm_armsve512_int_12xk_load
                ( first_half_active, alpha1 + 8 * inca, unit_stride, z_index );

            z_a0 = bli_dpackm_armsve512_int_12xk_scale( z_a0, z_kappa, unit_kappa );
            z_a8 = bli_dpackm_armsve512_int_12xk_scale( z_a8, z_kappa, unit_kappa );

            // Only rows 8-11 are stored from the second vector.
            svst1_f64( all_active, pi1, z_a0 );
            svst1_vnum_f64( first_half_active, pi1, 1, z_a8 );

            alpha1 += lda;
            pi1    += ldp;
        }
    }
    else // if ( cdim < mnr )
    {
        bli_dscal2m_ex
        (
          0,
          BLIS_NONUNIT_DIAG,
          BLIS_DENSE,
          ( trans_t )conja,
          cdim,
          n,
          kappa,
          a, inca, lda,
          p, 1,    ldp,
          cntx,
          NULL
        );

        // Zero rows cdim..mnr-1.
        {
            const dim_t      i      = cdim;
            const dim_t      m_edge = mnr - i;
            const dim_t      n_edge = n_max;
            double* restrict p_edge = p + (i  )*1;

            bli_dset0s_mxn
            (
              m_edge,
              n_edge,
              p_edge, 1, ldp
            );
        }
    }

    if ( n < n_max )
    {
        // Zero the trailing columns of the packed panel.
        const dim_t      j      = n;
        const dim_t      m_edge = mnr;
        const dim_t      n_edge = n_max - j;
        double* restrict p_edge = p + (j  )*ldp;

        bli_dset0s_mxn
        (
          m_edge,
          n_edge,
          p_edge, 1, ldp
        );
    }
}

#endif // !defined(BLIS_FAMILY_A64FX)
cython-blis-0.9.1/blis/_src/kernels/armsve/3/000077500000000000000000000000001427272030600207405ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/armsve/3/armsve_asm_2vx10.h000066400000000000000000000314141427272030600242110ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing
high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #define GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \ GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BADDR,8) \ GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BADDR,9) \ " add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \ GEMM_FMLA2_LD1R(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BADDR,0) \ GEMM_FMLA2_LD1R(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BADDR,1) \ GEMM_FMLA2_LD1R(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BADDR,2) \ GEMM_FMLA2_LD1R(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BADDR,3) \ GEMM_FMLA2_LD1R(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BADDR,4) \ GEMM_FMLA2_LD1R(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BADDR,5) \ \ GEMM_FMLA2_LD1R(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BADDR,6) \ GEMM_FMLA2_LD1R(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BADDR,7) // Second through forth microkernels are the first one with B vectors rotated. #define GEMM_2VX10_MKER_LOOP_PLAIN_C_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \ GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BRSBIT) #define GEMM_2VX10_MKER_LOOP_PLAIN_C_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \ GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BRSBIT) #define GEMM_2VX10_MKER_LOOP_PLAIN_C_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \ 
GEMM_2VX10_MKER_LOOP_PLAIN_C_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BRSBIT) // NOTE: // The microkernel (PLAIN_1-4 as a whole) satisfies on entry/exit // (sth. akin to loop-invariant): // - BV[0-7] holds B[0:7, 4*k_cur] // - B's address stops at B[0, 4*k_cur+1] // Final loop inside K=4 microkernels. #define GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BRSBIT) \ GEMM_FMLA2_LD1R(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BADDR,8) \ GEMM_FMLA2_LD1R(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BADDR,9) \ " add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \ GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \ GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \ GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \ GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \ GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \ GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \ GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \ GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7) // K=4 MKer loop with B memory scattered. 
#define GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \ GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \ GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT) \ " add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \ " mov "#BELMADDR", "#BADDR" \n\t" \ GEMM_FMLA2_LD1R_G_ELMFWD(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV2,BELMADDR,BCSBIT) \ GEMM_FMLA2_LD1R_G_ELMFWD(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV3,BELMADDR,BCSBIT) \ GEMM_FMLA2_LD1R_G_ELMFWD(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV4,BELMADDR,BCSBIT) \ GEMM_FMLA2_LD1R_G_ELMFWD(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV5,BELMADDR,BCSBIT) \ GEMM_FMLA2_LD1R_G_ELMFWD(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \ GEMM_FMLA2_LD1R_G_ELMFWD(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \ \ GEMM_FMLA2_LD1R_G_ELMFWD(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV0,BELMADDR,BCSBIT) \ GEMM_FMLA2_LD1R_G_ELMFWD(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV1,BELMADDR,BCSBIT) #define GEMM_2VX10_MKER_LOOP_PLAIN_G_2(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \ GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV2,BV3,BV4,BV5,BV6,BV7,BV0,BV1,BADDR,BELMADDR,BRSBIT,BCSBIT) #define GEMM_2VX10_MKER_LOOP_PLAIN_G_3(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \ GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BADDR,BELMADDR,BRSBIT,BCSBIT) #define 
GEMM_2VX10_MKER_LOOP_PLAIN_G_4(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \ GEMM_2VX10_MKER_LOOP_PLAIN_G_1(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV6,BV7,BV0,BV1,BV2,BV3,BV4,BV5,BADDR,BELMADDR,BRSBIT,BCSBIT) #define GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(C0FH,C1FH,C2FH,C3FH,C4FH,C5FH,C6FH,C7FH,C8FH,C9FH,C0LH,C1LH,C2LH,C3LH,C4LH,C5LH,C6LH,C7LH,C8LH,C9LH,PT,ACOLFH,ACOLLH,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BADDR,BELMADDR,BRSBIT,BCSBIT) \ GEMM_FMLA2_LD1R_G_ELMFWD(C0FH,C0LH,PT,ACOLFH,ACOLLH,BV6,BELMADDR,BCSBIT) \ GEMM_FMLA2_LD1R_G_ELMFWD(C1FH,C1LH,PT,ACOLFH,ACOLLH,BV7,BELMADDR,BCSBIT) \ " add "#BADDR", "#BRSBIT", "#BADDR" \n\t" /* B address forward */ \ " mov "#BELMADDR", "#BADDR" \n\t" \ GEMM_FMLA2(C2FH,C2LH,PT,ACOLFH,ACOLLH,BV0) \ GEMM_FMLA2(C3FH,C3LH,PT,ACOLFH,ACOLLH,BV1) \ GEMM_FMLA2(C4FH,C4LH,PT,ACOLFH,ACOLLH,BV2) \ GEMM_FMLA2(C5FH,C5LH,PT,ACOLFH,ACOLLH,BV3) \ GEMM_FMLA2(C6FH,C6LH,PT,ACOLFH,ACOLLH,BV4) \ GEMM_FMLA2(C7FH,C7LH,PT,ACOLFH,ACOLLH,BV5) \ GEMM_FMLA2(C8FH,C8LH,PT,ACOLFH,ACOLLH,BV6) \ GEMM_FMLA2(C9FH,C9LH,PT,ACOLFH,ACOLLH,BV7) #define CLEAR_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19) \ CLEAR_COL4(Z00,Z01,Z02,Z03) \ CLEAR_COL4(Z04,Z05,Z06,Z07) \ CLEAR_COL4(Z08,Z09,Z10,Z11) \ CLEAR_COL4(Z12,Z13,Z14,Z15) \ CLEAR_COL4(Z16,Z17,Z18,Z19) #define SCALE_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19,ZFACTOR) \ SCALE_COL4(Z00,Z01,Z02,Z03,ZFACTOR) \ SCALE_COL4(Z04,Z05,Z06,Z07,ZFACTOR) \ SCALE_COL4(Z08,Z09,Z10,Z11,ZFACTOR) \ SCALE_COL4(Z12,Z13,Z14,Z15,ZFACTOR) \ SCALE_COL4(Z16,Z17,Z18,Z19,ZFACTOR) #define GEMM_C_FMLA_UKER(C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,PT,Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZSCALE) \ GEMM_FMLA2(C0FH,C0LH,PT,Z0FH,Z0LH,ZSCALE) \ 
GEMM_FMLA2(C1FH,C1LH,PT,Z1FH,Z1LH,ZSCALE) \ GEMM_FMLA2(C2FH,C2LH,PT,Z2FH,Z2LH,ZSCALE) \ GEMM_FMLA2(C3FH,C3LH,PT,Z3FH,Z3LH,ZSCALE) \ GEMM_FMLA2(C4FH,C4LH,PT,Z4FH,Z4LH,ZSCALE) #define GEMM_C_FMAD_UKER(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE) \ GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \ GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \ GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \ GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \ GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) #define GEMM_C_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \ GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \ GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \ GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \ GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \ GEMM_CCOL_CONTIGUOUS_LOAD_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS) #define GEMM_C_STORE_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,CADDR,CCS) \ GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z0FH,Z0LH,PFH,PLH,CADDR,CCS) \ GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z1FH,Z1LH,PFH,PLH,CADDR,CCS) \ GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z2FH,Z2LH,PFH,PLH,CADDR,CCS) \ GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z3FH,Z3LH,PFH,PLH,CADDR,CCS) \ GEMM_CCOL_CONTIGUOUS_STORE_FWD(Z4FH,Z4LH,PFH,PLH,CADDR,CCS) #define GEMM_C_FMAD_LOAD_UKER_C(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,CADDR,CCS) \ GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \ GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C0FH,C0LH,PFH,PLH,CADDR,CCS) \ GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \ GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C1FH,C1LH,PFH,PLH,CADDR,CCS) \ GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \ GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C2FH,C2LH,PFH,PLH,CADDR,CCS) \ GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \ 
GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C3FH,C3LH,PFH,PLH,CADDR,CCS) \ GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \ GEMM_CCOL_CONTIGUOUS_LOAD_FWD(C4FH,C4LH,PFH,PLH,CADDR,CCS) #define GEMM_C_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_CCOL_GATHER_LOAD_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_CCOL_GATHER_LOAD_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_CCOL_GATHER_LOAD_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_CCOL_GATHER_LOAD_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_CCOL_GATHER_LOAD_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) #define GEMM_C_STORE_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_CCOL_SCATTER_STORE_FWD(Z0FH,Z0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_CCOL_SCATTER_STORE_FWD(Z1FH,Z1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_CCOL_SCATTER_STORE_FWD(Z2FH,Z2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_CCOL_SCATTER_STORE_FWD(Z3FH,Z3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_CCOL_SCATTER_STORE_FWD(Z4FH,Z4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) #define GEMM_C_FMAD_LOAD_UKER_G(Z0FH,Z1FH,Z2FH,Z3FH,Z4FH,Z0LH,Z1LH,Z2LH,Z3LH,Z4LH,PFH,PLH,C0FH,C1FH,C2FH,C3FH,C4FH,C0LH,C1LH,C2LH,C3LH,C4LH,ZSCALE,ZIDX,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_CCOL_FMAD(Z0FH,Z0LH,PFH,PLH,C0FH,C0LH,ZSCALE) \ GEMM_CCOL_GATHER_LOAD_FWD(C0FH,C0LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_CCOL_FMAD(Z1FH,Z1LH,PFH,PLH,C1FH,C1LH,ZSCALE) \ GEMM_CCOL_GATHER_LOAD_FWD(C1FH,C1LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_CCOL_FMAD(Z2FH,Z2LH,PFH,PLH,C2FH,C2LH,ZSCALE) \ GEMM_CCOL_GATHER_LOAD_FWD(C2FH,C2LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_CCOL_FMAD(Z3FH,Z3LH,PFH,PLH,C3FH,C3LH,ZSCALE) \ GEMM_CCOL_GATHER_LOAD_FWD(C3FH,C3LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_CCOL_FMAD(Z4FH,Z4LH,PFH,PLH,C4FH,C4LH,ZSCALE) \ GEMM_CCOL_GATHER_LOAD_FWD(C4FH,C4LH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) 
cython-blis-0.9.1/blis/_src/kernels/armsve/3/armsve_asm_2vx10cmplx.h000066400000000000000000000163001427272030600252520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #define GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,16) \ GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,18) \ GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,1) \ GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,3) \ GEMM_FMLA2_LD1R(C4Re,C4Im,PT,AColRe,AColIm,BV4,BAddr,5) \ GEMM_FMLA2_LD1R(C5Re,C5Im,PT,AColRe,AColIm,BV5,BAddr,7) \ GEMM_FMLA2_LD1R(C6Re,C6Im,PT,AColRe,AColIm,BV6,BAddr,9) \ GEMM_FMLA2_LD1R(C7Re,C7Im,PT,AColRe,AColIm,BV7,BAddr,11) \ GEMM_FMLA2_LD1R(C8Re,C8Im,PT,AColRe,AColIm,BV0,BAddr,13) \ GEMM_FMLA2_LD1R(C9Re,C9Im,PT,AColRe,AColIm,BV1,BAddr,15) \ \ GEMM_FMLX2_LD1R(C0Im,C0Re,PT,AColRe,AColIm,BV2,BAddr,17) \ GEMM_FMLX2_LD1R(C1Im,C1Re,PT,AColRe,AColIm,BV3,BAddr,19) \ " add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \ GEMM_FMLX2_LD1R(C2Im,C2Re,PT,AColRe,AColIm,BV4,BAddr,0) \ GEMM_FMLX2_LD1R(C3Im,C3Re,PT,AColRe,AColIm,BV5,BAddr,2) \ GEMM_FMLX2_LD1R(C4Im,C4Re,PT,AColRe,AColIm,BV6,BAddr,4) \ GEMM_FMLX2_LD1R(C5Im,C5Re,PT,AColRe,AColIm,BV7,BAddr,6) \ GEMM_FMLX2_LD1R(C6Im,C6Re,PT,AColRe,AColIm,BV0,BAddr,8) \ GEMM_FMLX2_LD1R(C7Im,C7Re,PT,AColRe,AColIm,BV1,BAddr,10) \ GEMM_FMLX2_LD1R(C8Im,C8Re,PT,AColRe,AColIm,BV2,BAddr,12) \ GEMM_FMLX2_LD1R(C9Im,C9Re,PT,AColRe,AColIm,BV3,BAddr,14) #define GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BAddr,BRSBit) #define 
GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,16) \ GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,18) \ GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,1) \ GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,3) \ GEMM_FMLA2_LD1R(C4Re,C4Im,PT,AColRe,AColIm,BV4,BAddr,5) \ GEMM_FMLA2_LD1R(C5Re,C5Im,PT,AColRe,AColIm,BV5,BAddr,7) \ GEMM_FMLA2_LD1R(C6Re,C6Im,PT,AColRe,AColIm,BV6,BAddr,9) \ GEMM_FMLA2_LD1R(C7Re,C7Im,PT,AColRe,AColIm,BV7,BAddr,11) \ GEMM_FMLA2_LD1R(C8Re,C8Im,PT,AColRe,AColIm,BV0,BAddr,13) \ GEMM_FMLA2_LD1R(C9Re,C9Im,PT,AColRe,AColIm,BV1,BAddr,15) \ \ GEMM_FMLX2_LD1R(C0Im,C0Re,PT,AColRe,AColIm,BV2,BAddr,17) \ GEMM_FMLX2_LD1R(C1Im,C1Re,PT,AColRe,AColIm,BV3,BAddr,19) \ " add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \ GEMM_FMLX2(C2Im,C2Re,PT,AColRe,AColIm,BV4) \ GEMM_FMLX2(C3Im,C3Re,PT,AColRe,AColIm,BV5) \ GEMM_FMLX2(C4Im,C4Re,PT,AColRe,AColIm,BV6) \ GEMM_FMLX2(C5Im,C5Re,PT,AColRe,AColIm,BV7) \ GEMM_FMLX2(C6Im,C6Re,PT,AColRe,AColIm,BV0) \ GEMM_FMLX2(C7Im,C7Re,PT,AColRe,AColIm,BV1) \ GEMM_FMLX2(C8Im,C8Re,PT,AColRe,AColIm,BV2) \ GEMM_FMLX2(C9Im,C9Re,PT,AColRe,AColIm,BV3) #define GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit) \ GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C8Re,C9Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,C8Im,C9Im,PT,AColRe,AColIm,BV4,BV5,BV6,BV7,BV0,BV1,BV2,BV3,BAddr,BRSBit) #define CLEAR_COL20(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15,Z16,Z17,Z18,Z19) \ CLEAR_COL4(Z00,Z01,Z02,Z03) \ CLEAR_COL4(Z04,Z05,Z06,Z07) \ CLEAR_COL4(Z08,Z09,Z10,Z11) \ CLEAR_COL4(Z12,Z13,Z14,Z15) \ 
CLEAR_COL4(Z16,Z17,Z18,Z19) // Moving is always .d. // Never use .DT here! #define MOV_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,Z0Re,Z0Im,Z1Re,Z1Im) \ " mov "#ZD0Re".d, "#Z0Re".d \n\t" \ " mov "#ZD0Im".d, "#Z0Im".d \n\t" \ " mov "#ZD1Re".d, "#Z1Re".d \n\t" \ " mov "#ZD1Im".d, "#Z1Im".d \n\t" #define GEMM_FMULCMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \ FMUL_COL2(ZD0Re,ZD0Im,Z0Re,Z0Im,ZFactorRe) \ FMUL_COL2(ZD1Re,ZD1Im,Z1Re,Z1Im,ZFactorRe) \ GEMM_FMLX2(ZD0Im,ZD0Re,PT,Z0Re,Z0Im,ZFactorIm) \ GEMM_FMLX2(ZD1Im,ZD1Re,PT,Z1Re,Z1Im,ZFactorIm) #define GEMM_FMLACMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \ GEMM_FMLACMPLX(ZD0Re,ZD0Im,PT,Z0Re,Z0Im,ZFactorRe,ZFactorIm) \ GEMM_FMLACMPLX(ZD1Re,ZD1Im,PT,Z1Re,Z1Im,ZFactorRe,ZFactorIm) #define GEMM_CCMPLX_LOAD_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \ GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \ GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z1Re,Z1Im,PT,CAddr,CCS) #define GEMM_CCMPLX_STORE_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \ GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \ GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z1Re,Z1Im,PT,CAddr,CCS) #define GEMM_CCMPLX_LOAD_COL2_G(Z0Re,Z0Im,Z1Re,Z1Im,PT,ZIndex,CAddr,CCS,CTemp) \ GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) #define GEMM_CCMPLX_STORE_COL2_G(Z0Re,Z0Im,Z1Re,Z1Im,PT,ZIndex,CAddr,CCS,CTemp) \ GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \ GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) cython-blis-0.9.1/blis/_src/kernels/armsve/3/armsve_asm_macros.h000066400000000000000000000141341427272030600246150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Clang's label requirements. #if defined(__clang__) #define LABEL(str) " L" #str"%=: \n\t" #define BEQ(str) "b.eq L" #str"%= \n\t" #define BNE(str) "b.ne L" #str"%= \n\t" #define BRANCH(str) "b L" #str"%= \n\t" #else #define LABEL(str) " ." #str": \n\t" #define BEQ(str) "b.eq ." #str" \n\t" #define BNE(str) "b.ne ." #str" \n\t" #define BRANCH(str) "b ." 
#str" \n\t" #endif #define CLEAR_COL2(Z0,Z1) \ " dup "#Z0"."DT", #0 \n\t" \ " dup "#Z1"."DT", #0 \n\t" #define CLEAR_COL4(Z0,Z1,Z2,Z3) \ CLEAR_COL2(Z0,Z1) \ CLEAR_COL2(Z2,Z3) #define SCALE_COL2(Z0,Z1,ZFACTOR) \ " fmul "#Z0"."DT", "#Z0"."DT", "#ZFACTOR"."DT" \n\t" \ " fmul "#Z1"."DT", "#Z1"."DT", "#ZFACTOR"."DT" \n\t" \ #define SCALE_COL4(Z0,Z1,Z2,Z3,ZFACTOR) \ SCALE_COL2(Z0,Z1,ZFACTOR) \ SCALE_COL2(Z2,Z3,ZFACTOR) // Prefetch or not. #define PREFETCH_CONTIGUOUS_noprfm(LV,PROP,ADDR,SHIFT) #define PREFETCH_CONTIGUOUS_prfm(LV,PROP,ADDR,SHIFT) \ " prfm PLD"#LV""#PROP", ["#ADDR", "#SHIFT"] \n\t" #define GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \ " fmla "#CCOLFH"."DT", "#PT"/m, "#ACOLFH"."DT", "#BV"."DT" \n\t" /* A Row 0 :VL */ \ " fmla "#CCOLLH"."DT", "#PT"/m, "#ACOLLH"."DT", "#BV"."DT" \n\t" /* A Row VL:2VL */ #define GEMM_FMLA2_LD1R(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BADDR,NSHIFT) \ GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \ " "LD1R" "#BV"."DT", "#PT"/z, ["#BADDR", #"#NSHIFT"*"SZ"]\n\t" #define GEMM_FMLA2_LD1R_G_ELMFWD(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BELMADDR,BCSBIT) \ GEMM_FMLA2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \ " "LD1R" "#BV"."DT", "#PT"/z, ["#BELMADDR"] \n\t" /* Load B */ \ " add "#BELMADDR", "#BELMADDR", "#BCSBIT" \n\t" /* Forward B element */ #define GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \ " "LD1" "#ZFH"."DT", "#PFH"/z, ["#AADDR"] \n\t" \ " "LD1" "#ZLH"."DT", "#PLH"/z, ["#AADDR", #1, mul vl]\n\t" #define GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \ " "LD1" "#ZFH"."DT", "#PFH"/z, ["#AADDR", "#ZIDX"."DT", "OFFS"]\n\t" \ " add "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \ " "LD1" "#ZLH"."DT", "#PLH"/z, ["#ATEMP", "#ZIDX"."DT", "OFFS"]\n\t" // Prefetch or not. 
#define GEMM_ACOL_GATHER_noprfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) #define GEMM_ACOL_GATHER_prfm(LV,PROP,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) \ " "PRFG" PLD"#LV""#PROP", "#PFH", ["#AADDR", "#ZIDX"."DT", "OFFS"] \n\t" \ " add "#ATEMP", "#AADDR", "#AVSKIP" \n\t" \ " "PRFG" PLD"#LV""#PROP", "#PLH", ["#ATEMP", "#ZIDX"."DT", "OFFS"] \n\t" #define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(ZFH,ZLH,PFH,PLH,AADDR,A4KS,ACS,ATEMP,PREFMODE) \ " add "#ATEMP", "#AADDR", "#A4KS" \n\t" \ " add "#AADDR", "#AADDR", "#ACS" \n\t" /* Forward A's address to the next column. */ \ GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,AADDR) \ PREFETCH_CONTIGUOUS_ ##PREFMODE(L1,STRM,ATEMP,0) #define GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,A4KS,APS,ACS,AVSKIP,ATEMP,PREFMODEL1,PREFMODEL2) \ " add "#ATEMP", "#AADDR", "#A4KS" \n\t" \ GEMM_ACOL_GATHER_ ##PREFMODEL1(L1,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \ " add "#ATEMP", "#AADDR", "#APS" \n\t" \ GEMM_ACOL_GATHER_ ##PREFMODEL2(L2,STRM,ZIDX,PFH,PLH,ATEMP,AVSKIP,ATEMP) \ " add "#AADDR", "#AADDR", "#ACS" \n\t" /* Forward A's address to the next column. */ \ GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,AADDR,AVSKIP,ATEMP) #define GEMM_CCOL_CONTIGUOUS_LOAD_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \ GEMM_ACOL_CONTIGUOUS_LOAD(ZFH,ZLH,PFH,PLH,CADDR) \ " add "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (load) to next column. */ #define GEMM_CCOL_CONTIGUOUS_STORE_FWD(ZFH,ZLH,PFH,PLH,CADDR,CCS) \ " "ST1" "#ZFH"."DT", "#PFH", ["#CADDR"] \n\t" \ " "ST1" "#ZLH"."DT", "#PLH", ["#CADDR", #1, mul vl] \n\t" \ " add "#CADDR", "#CADDR", "#CCS" \n\t" /* Forward C address (store) to next column. 
*/ #define GEMM_CCOL_FMAD(ZFH,ZLH,PFH,PLH,CFH,CLH,ZSCALE) \ " fmad "#ZFH"."DT", "#PFH"/m, "#ZSCALE"."DT", "#CFH"."DT" \n\t" \ " fmad "#ZLH"."DT", "#PLH"/m, "#ZSCALE"."DT", "#CLH"."DT" \n\t" #define GEMM_CCOL_GATHER_LOAD_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ GEMM_ACOL_GATHER_LOAD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CVSKIP,CTEMP) \ " add "#CADDR", "#CADDR", "#CCS" \n\t" #define GEMM_CCOL_SCATTER_STORE_FWD(ZFH,ZLH,ZIDX,PFH,PLH,CADDR,CCS,CVSKIP,CTEMP) \ " "ST1" "#ZFH"."DT", "#PFH", ["#CADDR", "#ZIDX"."DT", "OFFS"]\n\t" \ " add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \ " "ST1" "#ZLH"."DT", "#PLH", ["#CTEMP", "#ZIDX"."DT", "OFFS"]\n\t" \ " add "#CADDR", "#CADDR", "#CCS" \n\t" cython-blis-0.9.1/blis/_src/kernels/armsve/3/armsve_asm_macros_cmplx.h000066400000000000000000000100731427272030600260160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "armsve_asm_macros.h" #define FMUL_COL2(ZD0,ZD1,Z0,Z1,ZFACTOR) \ " fmul "#ZD0"."DT", "#Z0"."DT", "#ZFACTOR"."DT" \n\t" \ " fmul "#ZD1"."DT", "#Z1"."DT", "#ZFACTOR"."DT" \n\t" \ #define GEMM_FMLX2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \ " fmla "#CCOLFH"."DT", "#PT"/m, "#ACOLFH"."DT", "#BV"."DT" \n\t" \ " fmls "#CCOLLH"."DT", "#PT"/m, "#ACOLLH"."DT", "#BV"."DT" \n\t" #define GEMM_FMLX2_LD1R(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV,BADDR,NSHIFT) \ GEMM_FMLX2(CCOLFH,CCOLLH,PT,ACOLFH,ACOLLH,BV) \ " "LD1R" "#BV"."DT", "#PT"/z, ["#BADDR", #"#NSHIFT"*"SZ"]\n\t" #define GEMM_FMULCMPLX(ZDRe,ZDIm,PT,Z0Re,Z0Im,Z1Re,Z1Im) \ FMUL_COL2(ZDRe,ZDIm,Z0Re,Z0Im,Z1Re) \ GEMM_FMLX2(ZDIm,ZDRe,PT,Z0Re,Z0Im,Z1Im) #define GEMM_FMLACMPLX(ZDRe,ZDIm,PT,Z0Re,Z0Im,Z1Re,Z1Im) \ GEMM_FMLA2(ZDRe,ZDIm,PT,Z0Re,Z0Im,Z1Re) \ GEMM_FMLX2(ZDIm,ZDRe,PT,Z0Re,Z0Im,Z1Im) #define GEMM_ACOLCMPLX_CONTIGUOUS_LOAD(ZRe,ZIm,PT,AAddr) \ " "LD2" {"#ZRe"."DT", "#ZIm"."DT"}, "#PT"/z, ["#AAddr"] \n\t" #define GEMM_ACOLCMPLX_CONTIGUOUS_STORE(ZRe,ZIm,PT,AAddr) \ " "ST2" {"#ZRe"."DT", "#ZIm"."DT"}, "#PT", ["#AAddr"] \n\t" #define GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(ZRe,ZIm,PT,AAddr,ACS) \ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD(ZRe,ZIm,PT,AAddr) \ " add "#AAddr", "#AAddr", "#ACS" \n\t" /* Forward A address (load) to next column. 
*/ #define GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(ZRe,ZIm,PT,CAddr,CCS) \ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(ZRe,ZIm,PT,CAddr,CCS) #define GEMM_ACOLCMPLX_CONTIGUOUS_STORE_FWD(ZRe,ZIm,PT,AAddr,ACS) \ GEMM_ACOLCMPLX_CONTIGUOUS_STORE(ZRe,ZIm,PT,AAddr) \ " add "#AAddr", "#AAddr", "#ACS" \n\t" /* Forward A address (load) to next column. */ #define GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(ZRe,ZIm,PT,CAddr,CCS) \ GEMM_ACOLCMPLX_CONTIGUOUS_STORE_FWD(ZRe,ZIm,PT,CAddr,CCS) #define GEMM_CCOLCMPLX_GATHER_LOAD_FWD(ZRe,ZIm,ZIndex,PRe,PIm,CAddr,CCS,CTemp) \ " add "#CTemp", "#CAddr", #"SZ" \n\t" /* Imaginary skip */ \ " "LD1" "#ZRe"."DT", "#PRe"/z, ["#CAddr", "#ZIndex"."DT", "OFFS"]\n\t" \ " "LD1" "#ZIm"."DT", "#PRe"/z, ["#CTemp", "#ZIndex"."DT", "OFFS"]\n\t" \ " add "#CAddr", "#CAddr", "#CCS" \n\t" #define GEMM_CCOLCMPLX_SCATTER_STORE_FWD(ZRe,ZIm,ZIndex,PRe,PIm,CAddr,CCS,CTemp) \ " add "#CTemp", "#CAddr", #"SZ" \n\t" /* Imaginary skip */ \ " "ST1" "#ZRe"."DT", "#PRe", ["#CAddr", "#ZIndex"."DT", "OFFS"]\n\t" \ " "ST1" "#ZIm"."DT", "#PRe", ["#CTemp", "#ZIndex"."DT", "OFFS"]\n\t" \ " add "#CAddr", "#CAddr", "#CCS" \n\t" cython-blis-0.9.1/blis/_src/kernels/armsve/3/armsve_asm_macros_dcomplex.h000066400000000000000000000037171427272030600265150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Specify to use double precision. #define DT "d" #define LD1 "ld1d" #define ST1 "st1d" #define LD2 "ld2d" #define ST2 "st2d" #define LD1R "ld1rd" #define PRFG "prfd" #define SZ "8" #define OFFS "lsl #3" // Include macros. #include "armsve_asm_macros_cmplx.h" cython-blis-0.9.1/blis/_src/kernels/armsve/3/armsve_asm_macros_double.h000066400000000000000000000036371427272030600261550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
   - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
   - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// Specify to use double precision.
//
// Every macro below expands to a string literal meant to be pasted into
// inline-assembly text by the macro header included at the bottom.
// NOTE(review): the consumers are in "armsve_asm_macros.h", which is not part
// of this file; the per-macro notes describe the literal values only.

// Element-size suffix for vector register names.
#define DT "d"
// Vector load / store mnemonics.
#define LD1 "ld1d"
#define ST1 "st1d"
// Load-and-replicate mnemonic.
#define LD1R "ld1rd"
// Prefetch mnemonic.
#define PRFG "prfd"
// Element size in bytes, as assembly immediate text.
#define SZ "8"
// Index modifier text for vector-indexed addressing.
#define OFFS "lsl #3"

// Include macros.
#include "armsve_asm_macros.h"
cython-blis-0.9.1/blis/_src/kernels/armsve/3/armsve_asm_macros_scomplex.h000066400000000000000000000037201427272030600265260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2020, The University of Tokyo
   Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
   - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
   - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
   - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// Specify to use single precision (complex variant).
//
// Every macro below expands to a string literal that the macro header
// included at the bottom of this file pastes into inline-assembly text, so
// the same instruction templates can be instantiated per precision.

// Element-size suffix appended to vector register names ("z0." DT).
#define DT "s"
// Mnemonics used for vector loads / stores (e.g. by the gather-load and
// scatter-store macros).
#define LD1 "ld1w"
#define ST1 "st1w"
// Two-register load / store mnemonics.
// NOTE(review): presumably used to (de)interleave real and imaginary parts;
// the users live in the included header, not in this file -- confirm there.
#define LD2 "ld2w"
#define ST2 "st2w"
// Load-and-replicate mnemonic.
#define LD1R "ld1rw"
// Prefetch mnemonic.
#define PRFG "prfw"
// Size in bytes of one real element; the complex macros add it to an address
// to step from the real to the imaginary part.
#define SZ "4"
// Index modifier used in vector-indexed addressing ([base, Zindex.DT, OFFS]).
#define OFFS "uxtw #2"

// Include macros.
#include "armsve_asm_macros_cmplx.h"
cython-blis-0.9.1/blis/_src/kernels/armsve/3/armsve_asm_macros_single.h000066400000000000000000000036401427272030600261560ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2020, The University of Tokyo
   Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
   - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
   - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
   - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// Specify to use single precision.
//
// Every macro below expands to a string literal meant to be pasted into
// inline-assembly text by the macro header included at the bottom.
// NOTE(review): the consumers are in "armsve_asm_macros.h", which is not part
// of this file; the per-macro notes describe the literal values only.

// Element-size suffix for vector register names.
#define DT "s"
// Vector load / store mnemonics.
#define LD1 "ld1w"
#define ST1 "st1w"
// Load-and-replicate mnemonic.
#define LD1R "ld1rw"
// Prefetch mnemonic.
#define PRFG "prfw"
// Element size in bytes, as assembly immediate text.
#define SZ "4"
// Index modifier text for vector-indexed addressing.
#define OFFS "uxtw #2"

// Include macros.
#include "armsve_asm_macros.h"
cython-blis-0.9.1/blis/_src/kernels/armsve/3/bli_armsve_utils.c000066400000000000000000000072601427272030600244540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Forschunszentrum Juelich Copyright (C) 2020, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" dim_t bli_vl_bytes_armsve(void) { \ uint64_t vl = 0; __asm__ ( " mov x0, xzr \n\t" " incb x0 \n\t" " mov %[vl], x0 \n\t" : [vl] "=r" (vl) : : "x0" ); return vl; } #define EXPANDMAC_BLKSZ_ARMSVE(ch, S_Data) \ void PASTEMAC(ch, _blksz_armsve) (dim_t *m_r_, dim_t *n_r_, \ dim_t *k_c_, dim_t *m_c_, dim_t *n_c_) \ { \ dim_t W_L1 = bli_env_get_var("BLIS_SVE_W_L1", W_L1_SVE_DEFAULT); \ dim_t N_L1 = bli_env_get_var("BLIS_SVE_N_L1", N_L1_SVE_DEFAULT); \ dim_t C_L1 = bli_env_get_var("BLIS_SVE_C_L1", C_L1_SVE_DEFAULT); \ dim_t W_L2 = bli_env_get_var("BLIS_SVE_W_L2", W_L2_SVE_DEFAULT); \ dim_t N_L2 = bli_env_get_var("BLIS_SVE_N_L2", N_L2_SVE_DEFAULT); \ dim_t C_L2 = bli_env_get_var("BLIS_SVE_C_L2", C_L2_SVE_DEFAULT); \ dim_t W_L3 = bli_env_get_var("BLIS_SVE_W_L3", W_L3_SVE_DEFAULT); \ dim_t N_L3 = bli_env_get_var("BLIS_SVE_N_L3", N_L3_SVE_DEFAULT); \ dim_t C_L3 = bli_env_get_var("BLIS_SVE_C_L3", C_L3_SVE_DEFAULT); \ \ dim_t vl_b = bli_vl_bytes_armsve(); \ dim_t vl = vl_b / S_Data; \ dim_t m_r = 2 * vl; \ dim_t n_r = 10; \ \ dim_t k_c = (dim_t)( floor((W_L1 - 1.0)/(1.0 + (double)n_r/m_r)) * N_L1 * C_L1 ) \ / (n_r * S_Data); \ \ dim_t C_Ac = W_L2 - 1 - ceil( (2.0 * k_c * n_r * S_Data)/(C_L2 * N_L2) ); \ dim_t m_c = C_Ac * (N_L2 * C_L2)/(k_c * S_Data); \ m_c -= m_c % m_r; \ \ dim_t C_Bc = W_L3 - 1 - ceil( (2.0 * k_c * m_c * S_Data)/(C_L3 * N_L3) ); \ dim_t n_c = C_Bc * (N_L3 * C_L3)/(k_c * S_Data); \ n_c -= n_c % n_r; \ \ /* Ensure non-zero block sizes. 
*/ \ m_c = bli_max(m_c, m_r); \ n_c = bli_max(n_c, n_r); \ k_c = bli_max(k_c, 128); \ \ *m_r_ = m_r; \ *n_r_ = n_r; \ *k_c_ = k_c; \ *m_c_ = m_c; \ *n_c_ = n_c; \ } EXPANDMAC_BLKSZ_ARMSVE( s, 4 ) EXPANDMAC_BLKSZ_ARMSVE( d, 8 ) EXPANDMAC_BLKSZ_ARMSVE( c, 8 ) EXPANDMAC_BLKSZ_ARMSVE( z, 16 ) cython-blis-0.9.1/blis/_src/kernels/armsve/3/bli_armsve_utils.h000066400000000000000000000042051427272030600244550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Forschunszentrum Juelich Copyright (C) 2020, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
   IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "blis.h"

// Return the SVE vector length of the executing core, in bytes.
dim_t bli_vl_bytes_armsve(void);

// Block-size queries, one per datatype (s, d, c, z). Each call writes the
// register block sizes to *m_r_ / *n_r_ and the cache block sizes to
// *k_c_ / *m_c_ / *n_c_. All five pointers are written unconditionally, so
// none of them may be NULL.
void bli_s_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
void bli_d_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
void bli_c_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
void bli_z_blksz_armsve(dim_t *m_r_, dim_t *n_r_, dim_t *k_c_, dim_t *m_c_, dim_t *n_c_);
cython-blis-0.9.1/blis/_src/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c000066400000000000000000000374571427272030600302620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2019, Forschunszentrum Juelich
   Copyright (C) 2020, The University of Tokyo
   Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
   - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
   - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
   - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Single-precision composite instructions. #include "armsve_asm_macros_scomplex.h" // 2vx10 microkernels. #include "armsve_asm_2vx10cmplx.h" void bli_cgemm_armsve_asm_2vx10_unindexed ( dim_t m, dim_t n, dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_mker = k / 4; uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; uint64_t info = 0; GEMM_UKR_SETUP_CT( c, m, 10, false ); __asm__ volatile ( " whilelo p0.s, xzr, %12 \n\t" // " ldr x0, %[a] \n\t" // " ldr x1, %[b] \n\t" " mov x2, xzr \n\t" " incw x2, ALL, MUL #1 \n\t" // Column-skip of A. " mov x3, #10 \n\t" // Row-skip of B. " \n\t" // " ldr x2, %[c] \n\t" // " ldr x3, %[rs_c] \n\t" // Row-skip of C. // " ldr x4, %[cs_c] \n\t" // Column-skip of C. #ifdef _A64FX " mov x16, 0x1 \n\t" // Tag A address. 
" lsl x16, x16, #56 \n\t" " orr %0, %0, x16 \n\t" " mov x16, 0x2 \n\t" // Tag B address. " lsl x16, x16, #56 \n\t" " orr %1, %1, x16 \n\t" " mov x16, 0x3 \n\t" // Tag C address. " lsl x16, x16, #56 \n\t" " orr %2, %2, x16 \n\t" #endif " \n\t" " mov x16, #8 \n\t" // Multiply some address skips by sizeof(scomplex). " madd x2, x16, x2, xzr \n\t" // cs_a " madd x3, x16, x3, xzr \n\t" // rs_b " madd %4, x16, %4, xzr \n\t" // cs_c " \n\t" // " ldr x5, %[k_mker] \n\t" // Number of loops. // " ldr x6, %[k_left] \n\t" " \n\t" LABEL(LOAD_ABC) " cmp %5, #0 \n\t" // Don't preload if no microkernel there. BEQ(END_CCOL_PRFM) " \n\t" " ld1rw z20.s, p0/z, [%1, 4*0] \n\t" // Load B's real 8/10, no imaginary. " ld1rw z21.s, p0/z, [%1, 4*2] \n\t" " ld1rw z22.s, p0/z, [%1, 4*4] \n\t" " ld1rw z23.s, p0/z, [%1, 4*6] \n\t" " ld1rw z24.s, p0/z, [%1, 4*8] \n\t" " ld1rw z25.s, p0/z, [%1, 4*10] \n\t" " ld1rw z26.s, p0/z, [%1, 4*12] \n\t" " ld1rw z27.s, p0/z, [%1, 4*14] \n\t" " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " \n\t" LABEL(CCOL_PRFM) // " cmp %3, #1 \n\t" // BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage. " mov x16, %2 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" LABEL(END_CCOL_PRFM) " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" " cmp %5, #0 \n\t" // If no 4-microkernel can be applied. 
BEQ(K_LEFT_LOOP) " \n\t" LABEL(K_MKER_LOOP) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" " subs %5, %5, #1 \n\t" // Decrease counter before final replica. BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem. " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) BRANCH(K_MKER_LOOP) " \n\t" LABEL(FIN_MKER_LOOP) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" LABEL(K_LEFT_LOOP) " cmp %6, #0 \n\t" // End of execution. BEQ(WRITE_MEM_PREP) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " ld1rw z20.s, p0/z, [%1, 4*0] \n\t" // Load B's real 8/10, no imaginary. 
" ld1rw z21.s, p0/z, [%1, 4*2] \n\t" " ld1rw z22.s, p0/z, [%1, 4*4] \n\t" " ld1rw z23.s, p0/z, [%1, 4*6] \n\t" " ld1rw z24.s, p0/z, [%1, 4*8] \n\t" " ld1rw z25.s, p0/z, [%1, 4*10] \n\t" " ld1rw z26.s, p0/z, [%1, 4*12] \n\t" " ld1rw z27.s, p0/z, [%1, 4*14] \n\t" GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " sub %6, %6, #1 \n\t" BRANCH(K_LEFT_LOOP) " \n\t" LABEL(WRITE_MEM_PREP) " \n\t" // " ldr x7, %[alpha] \n\t" // Load alpha & beta (address). // " ldr x8, %[beta] \n\t" " ld1rw z28.s, p0/z, [%7] \n\t" // Real(alpha). " ld1rw z29.s, p0/z, [%7, 4] \n\t" // Imag(alpha). " ld1rw z30.s, p0/z, [%8] \n\t" // Real(beta). " ld1rw z31.s, p0/z, [%8, 4] \n\t" // Imag(beta). " \n\t" LABEL(PREFETCH_ABNEXT) // " ldr x9, %[a_next] \n\t" // " ldr x10, %[b_next] \n\t" #ifdef _A64FX " mov x16, 0x1 \n\t" // Tag A address. " lsl x16, x16, #56 \n\t" " orr %9, %9, x16 \n\t" " mov x16, 0x2 \n\t" // Tag B address. " lsl x16, x16, #56 \n\t" " orr %10, %10, x16 \n\t" #endif " prfm PLDL1STRM, [%9] \n\t" " prfm PLDL1STRM, [%9, 256*1] \n\t" " prfm PLDL1STRM, [%10] \n\t" " prfm PLDL1STRM, [%10, 256*1] \n\t" " \n\t" LABEL(WRITE_MEM) " fmov s27, #1.0 \n\t" " fcmp s29, #0.0 \n\t" // Whether Imag(alpha) == 0. " fccmp s28, s27, 0, eq \n\t" // Whether Real(alpha) == 1. 
BEQ(UNIT_ALPHA) " \n\t" GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z28,z29) GEMM_FMULCMPLX_COL2(z24,z25,z26,z27,p0,z4 ,z5 ,z6 ,z7 ,z28,z29) GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z8, z9, z10,z11,z28,z29) GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z28,z29) GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z16,z17,z18,z19,z28,z29) BRANCH(WRITE_MEM_EXEC) " \n\t" LABEL(UNIT_ALPHA) MOV_COL2(z20,z21,z22,z23,z0 ,z1 ,z2 ,z3 ) MOV_COL2(z24,z25,z26,z27,z4 ,z5 ,z6 ,z7 ) MOV_COL2(z0 ,z1 ,z2 ,z3 ,z8, z9, z10,z11) MOV_COL2(z4 ,z5 ,z6 ,z7 ,z12,z13,z14,z15) MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19) " \n\t" LABEL(WRITE_MEM_EXEC) " mov x9, %2 \n\t" // C address for loading. " \n\t" // C address for storing is %2 itself. // " cmp %3, #1 \n\t" // BNE(WRITE_MEM_G) " \n\t" LABEL(WRITE_MEM_C) " fmov s29, wzr \n\t" " fcmp s31, #0.0 \n\t" // Whether Imag(beta) == 0. " fccmp s30, s29, 0, eq \n\t" // Whether Real(beta) == 0. BEQ(ZERO_BETA_C_0_1_2_3) GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) LABEL(ZERO_BETA_C_0_1_2_3) GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z24,z25,z26,z27,p0,%2,%4) " \n\t" BEQ(ZERO_BETA_C_4_5_6_7_8_9) GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z20,z21,z22,z23,p0,x9,%4) GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) LABEL(ZERO_BETA_C_4_5_6_7_8_9) GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) // BRANCH(END_WRITE_MEM) // " \n\t" // LABEL(WRITE_MEM_G) // " add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, 
// " mov x3, %3 \n\t" // s.t. 2*sizeof(float) = 2*4 = 8. // " index z28.s, wzr, w3 \n\t" // " fmov s29, wzr \n\t" // " fcmp s31, #0.0 \n\t" // Whether Imag(beta) == 0. // " fccmp s30, s29, 0, eq \n\t" // Whether Real(beta) == 0. // BEQ(ZERO_BETA_G_0_1_2_3) // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) // GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) // GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) // LABEL(ZERO_BETA_G_0_1_2_3) // GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) // " \n\t" // BEQ(ZERO_BETA_G_4_5_6_7_8_9) // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) // GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) // GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) // GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) // LABEL(ZERO_BETA_G_4_5_6_7_8_9) // GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) // " \n\t" // LABEL(END_WRITE_MEM) // BRANCH(END_EXEC) " \n\t" LABEL(END_EXEC) " mov %11, #0 \n\t" // Return normal. 
: "+r" (a), // %0 "+r" (b), // %1 "+r" (c), // %2 "+r" (rs_c), // %3 "+r" (cs_c), // %4 "+r" (k_mker), // %5 "+r" (k_left), // %6 "+r" (alpha), // %7 "+r" (beta), // %8 "+r" (a_next), // %9 "+r" (b_next), // %10 "=r" (info) // %11 : "r" (m) // %12 : "x2","x3","x9","x16", "z0","z1","z2","z3","z4","z5","z6","z7", "z8","z9","z10","z11","z12","z13","z14","z15", "z16","z17","z18","z19", "z20","z21","z22","z23", "z24","z25","z26","z27", "z28","z29","z30","z31" ); GEMM_UKR_FLUSH_CT( c ); } cython-blis-0.9.1/blis/_src/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c000066400000000000000000000376051427272030600302560ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Forschunszentrum Juelich Copyright (C) 2020, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Double-precision composite instructions. #include "armsve_asm_macros_double.h" // 2vx10 microkernels. #include "armsve_asm_2vx10.h" void bli_dgemm_armsve_asm_2vx10_unindexed ( dim_t m, dim_t n, dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_mker = k / 4; uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT( d, m, 10, false ); __asm__ volatile ( " mov x0, xzr \n\t" " ldr x1, %[m] \n\t" " whilelo p0.d, x0, x1 \n\t" " incd x0 \n\t" " whilelo p1.d, x0, x1 \n\t" " \n\t" " ldr x0, %[a] \n\t" " ldr x1, %[b] \n\t" " mov x2, xzr \n\t" " incd x2, ALL, MUL #2 \n\t" // Column-skip of A. " mov x3, #10 \n\t" // Row-skip of B. " \n\t" " ldr x5, %[c] \n\t" // " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. #ifdef _A64FX " mov x8, 0x3 \n\t" // Tag C address. " lsl x8, x8, #56 \n\t" " orr x5, x5, x8 \n\t" " mov x8, 0x2 \n\t" // Tag B address. " lsl x8, x8, #56 \n\t" " orr x1, x1, x8 \n\t" " mov x8, 0x1 \n\t" // Tag A address. 
" lsl x8, x8, #56 \n\t" " orr x0, x0, x8 \n\t" #endif " \n\t" " mov x8, #8 \n\t" // Multiply some address skips by sizeof(double). " madd x2, x8, x2, xzr \n\t" // cs_a " madd x3, x8, x3, xzr \n\t" // rs_b " madd x7, x8, x7, xzr \n\t" // cs_c " \n\t" " ldr x4, %[k_mker] \n\t" // Number of loops. " ldr x8, %[k_left] \n\t" " \n\t" LABEL(LOAD_ABC) " cmp x4, #0 \n\t" // Don't preload if no microkernel there. BEQ(END_CCOL_PRFM) " ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row. " ld1rd z21.d, p0/z, [x1, 8] \n\t" " ld1rd z22.d, p0/z, [x1, 16] \n\t" " ld1rd z23.d, p0/z, [x1, 24] \n\t" " ld1rd z24.d, p0/z, [x1, 32] \n\t" " ld1rd z25.d, p0/z, [x1, 40] \n\t" " ld1rd z26.d, p0/z, [x1, 48] \n\t" " ld1rd z27.d, p0/z, [x1, 56] \n\t" " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) " \n\t" LABEL(CCOL_PRFM) // " cmp x6, #1 \n\t" // BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage. " mov x16, x5 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1KEEP, [x16] \n\t" LABEL(END_CCOL_PRFM) " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" " cmp x4, #0 \n\t" // If no 4-microkernel can be applied BEQ(K_LEFT_LOOP) " \n\t" LABEL(K_MKER_LOOP) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. 
GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " subs x4, x4, #1 \n\t" // Decrease counter before final replica. BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem. " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) BRANCH(K_MKER_LOOP) " \n\t" LABEL(FIN_MKER_LOOP) GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " add x0, x0, x2 \n\t" // Forward A to fill the blank. " \n\t" LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of execution. BEQ(WRITE_MEM_PREP) " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) " ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row. 
" ld1rd z21.d, p0/z, [x1, 8] \n\t" " ld1rd z22.d, p0/z, [x1, 16] \n\t" " ld1rd z23.d, p0/z, [x1, 24] \n\t" " ld1rd z24.d, p0/z, [x1, 32] \n\t" " ld1rd z25.d, p0/z, [x1, 40] \n\t" " ld1rd z26.d, p0/z, [x1, 48] \n\t" " ld1rd z27.d, p0/z, [x1, 56] \n\t" " ld1rd z28.d, p0/z, [x1, 64] \n\t" " ld1rd z29.d, p0/z, [x1, 72] \n\t" GEMM_FMLA2(z0,z1,p0,z30,z31,z20) GEMM_FMLA2(z2,z3,p0,z30,z31,z21) GEMM_FMLA2(z4,z5,p0,z30,z31,z22) GEMM_FMLA2(z6,z7,p0,z30,z31,z23) GEMM_FMLA2(z8,z9,p0,z30,z31,z24) GEMM_FMLA2(z10,z11,p0,z30,z31,z25) GEMM_FMLA2(z12,z13,p0,z30,z31,z26) GEMM_FMLA2(z14,z15,p0,z30,z31,z27) GEMM_FMLA2(z16,z17,p0,z30,z31,z28) GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " add x0, x0, x2 \n\t" // Forward A. " add x1, x1, x3 \n\t" // Forward B. " sub x8, x8, #1 \n\t" BRANCH(K_LEFT_LOOP) " \n\t" LABEL(WRITE_MEM_PREP) " \n\t" " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" " ldr x4, [x4] \n\t" // Load alpha & beta (value). " ldr x8, [x8] \n\t" " dup z30.d, x4 \n\t" // Broadcast alpha & beta into vectors. " dup z31.d, x8 \n\t" " fmov d28, #1.0 \n\t" // Prepare FP 1.0. " fmov x16, d28 \n\t" " \n\t" LABEL(PREFETCH_ABNEXT) " ldr x0, %[a_next] \n\t" " ldr x1, %[b_next] \n\t" #ifdef _A64FX " mov x8, 0x2 \n\t" // Tag B address. " lsl x8, x8, #56 \n\t" " orr x1, x1, x8 \n\t" " mov x8, 0x1 \n\t" // Tag A address. 
" lsl x8, x8, #56 \n\t" " orr x0, x0, x8 \n\t" #endif " prfm PLDL1STRM, [x0] \n\t" " prfm PLDL1STRM, [x0, 256*1] \n\t" // " prfm PLDL2KEEP, [x0, 256*2] \n\t" // " prfm PLDL2KEEP, [x0, 256*3] \n\t" // " prfm PLDL2KEEP, [x0, 256*4] \n\t" // " prfm PLDL2KEEP, [x0, 256*5] \n\t" // " prfm PLDL2KEEP, [x0, 256*6] \n\t" // " prfm PLDL2KEEP, [x0, 256*7] \n\t" // " prfm PLDL2KEEP, [x0, 256*8] \n\t" // " prfm PLDL2KEEP, [x0, 256*9] \n\t" // " prfm PLDL2KEEP, [x0, 256*10] \n\t" // " prfm PLDL2KEEP, [x0, 256*11] \n\t" // " prfm PLDL2KEEP, [x0, 256*12] \n\t" // " prfm PLDL2KEEP, [x0, 256*13] \n\t" // " prfm PLDL2KEEP, [x0, 256*14] \n\t" // " prfm PLDL2KEEP, [x0, 256*15] \n\t" " prfm PLDL1STRM, [x1] \n\t" " prfm PLDL1STRM, [x1, 256*1] \n\t" // " prfm PLDL2KEEP, [x1, 256*2] \n\t" // " prfm PLDL2KEEP, [x1, 256*3] \n\t" // " prfm PLDL2KEEP, [x1, 256*4] \n\t" // " prfm PLDL2KEEP, [x1, 256*5] \n\t" // " prfm PLDL2KEEP, [x1, 256*6] \n\t" // " prfm PLDL2KEEP, [x1, 256*7] \n\t" // " prfm PLDL2KEEP, [x1, 256*8] \n\t" // " prfm PLDL2KEEP, [x1, 256*9] \n\t" " \n\t" " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. // " cmp x6, #1 \n\t" // Preload first half of C for contiguous case. // BNE(WRITE_MEM) GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) " \n\t" LABEL(WRITE_MEM) " \n\t" " cmp x16, x4 \n\t" BEQ(UNIT_ALPHA) " \n\t" SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) " \n\t" LABEL(UNIT_ALPHA) // " cmp x6, #1 \n\t" // BNE(WRITE_MEM_G) " \n\t" LABEL(WRITE_MEM_C) " \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. " fcmp d31, #0.0 \n\t" // Skip loading if *beta == 0 to override NaN. BEQ(BETA_ZERO_C) // First half of C is already loaded in this case. 
// GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7) GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) " \n\t" LABEL(BETA_ZERO_C) GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7) GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7) // BRANCH(END_WRITE_MEM) // " \n\t" // LABEL(END_WRITE_MEM) // BRANCH(END_EXEC) // " \n\t" // LABEL(END_ERROR) // " mov x0, #1 \n\t" // Return error. LABEL(END_EXEC) " mov x0, #0 \n\t" // Return normal. : : [m] "m" (m), [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta), [a_next] "m" (a_next), [b_next] "m" (b_next) : "x0","x1","x2","x3","x4","x5","x6","x7","x8", "x9","x16", "z0","z1","z2","z3","z4","z5","z6","z7", "z8","z9","z10","z11","z12","z13","z14","z15", "z16","z17","z18","z19", "z20","z21","z22","z23", "z24","z25","z26","z27", "z28","z29","z30","z31" ); GEMM_UKR_FLUSH_CT( d ); } cython-blis-0.9.1/blis/_src/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c000066400000000000000000000360361427272030600302720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, The University of Tokyo Copyright (C) 2019, Forschunszentrum Juelich Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Single-precision composite instructions. #include "armsve_asm_macros_single.h" // 2vx10 microkernels. #include "armsve_asm_2vx10.h" void bli_sgemm_armsve_asm_2vx10_unindexed ( dim_t m, dim_t n, dim_t k, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_mker = k / 4; uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT( s, m, 10, false ); __asm__ volatile ( " mov x0, xzr \n\t" " ldr x1, %[m] \n\t" " whilelo p0.s, x0, x1 \n\t" " incw x0 \n\t" " whilelo p1.s, x0, x1 \n\t" " \n\t" " ldr x0, %[a] \n\t" " ldr x1, %[b] \n\t" " mov x2, xzr \n\t" " incw x2, ALL, MUL #2 \n\t" // Column-skip of A. " mov x3, #10 \n\t" // Row-skip of B. " \n\t" " ldr x5, %[c] \n\t" // " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. #ifdef _A64FX " mov x8, 0x3 \n\t" // Tag C address. " lsl x8, x8, #56 \n\t" " orr x5, x5, x8 \n\t" " mov x8, 0x2 \n\t" // Tag B address. " lsl x8, x8, #56 \n\t" " orr x1, x1, x8 \n\t" " mov x8, 0x1 \n\t" // Tag A address. " lsl x8, x8, #56 \n\t" " orr x0, x0, x8 \n\t" #endif " \n\t" " mov x8, #4 \n\t" // Multiply some address skips by sizeof(float). " madd x2, x8, x2, xzr \n\t" // cs_a " madd x3, x8, x3, xzr \n\t" // rs_b " madd x7, x8, x7, xzr \n\t" // cs_c " \n\t" " ldr x4, %[k_mker] \n\t" // Number of loops. " ldr x8, %[k_left] \n\t" " \n\t" LABEL(LOAD_ABC) " cmp x4, #0 \n\t" // Don't preload if no microkernel there. BEQ(END_CCOL_PRFM) " ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row. " ld1rw z21.s, p0/z, [x1, 4] \n\t" " ld1rw z22.s, p0/z, [x1, 8] \n\t" " ld1rw z23.s, p0/z, [x1, 12] \n\t" " ld1rw z24.s, p0/z, [x1, 16] \n\t" " ld1rw z25.s, p0/z, [x1, 20] \n\t" " ld1rw z26.s, p0/z, [x1, 24] \n\t" " ld1rw z27.s, p0/z, [x1, 28] \n\t" " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) " \n\t" LABEL(CCOL_PRFM) // " cmp x6, #1 \n\t" // BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage. 
" mov x16, x5 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" LABEL(END_CCOL_PRFM) " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" " cmp x4, #0 \n\t" // If no 4-microkernel can be applied BEQ(K_LEFT_LOOP) " \n\t" LABEL(K_MKER_LOOP) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " subs x4, x4, #1 \n\t" // Decrease counter before final replica. BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem. " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. 
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) BRANCH(K_MKER_LOOP) " \n\t" LABEL(FIN_MKER_LOOP) GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " add x0, x0, x2 \n\t" // Forward A to fill the blank. " \n\t" LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of execution. BEQ(WRITE_MEM_PREP) " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) " ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row. " ld1rw z21.s, p0/z, [x1, 4] \n\t" " ld1rw z22.s, p0/z, [x1, 8] \n\t" " ld1rw z23.s, p0/z, [x1, 12] \n\t" " ld1rw z24.s, p0/z, [x1, 16] \n\t" " ld1rw z25.s, p0/z, [x1, 20] \n\t" " ld1rw z26.s, p0/z, [x1, 24] \n\t" " ld1rw z27.s, p0/z, [x1, 28] \n\t" " ld1rw z28.s, p0/z, [x1, 32] \n\t" " ld1rw z29.s, p0/z, [x1, 36] \n\t" GEMM_FMLA2(z0,z1,p0,z30,z31,z20) GEMM_FMLA2(z2,z3,p0,z30,z31,z21) GEMM_FMLA2(z4,z5,p0,z30,z31,z22) GEMM_FMLA2(z6,z7,p0,z30,z31,z23) GEMM_FMLA2(z8,z9,p0,z30,z31,z24) GEMM_FMLA2(z10,z11,p0,z30,z31,z25) GEMM_FMLA2(z12,z13,p0,z30,z31,z26) GEMM_FMLA2(z14,z15,p0,z30,z31,z27) GEMM_FMLA2(z16,z17,p0,z30,z31,z28) GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " add x0, x0, x2 \n\t" // Forward A. " add x1, x1, x3 \n\t" // Forward B. " sub x8, x8, #1 \n\t" BRANCH(K_LEFT_LOOP) " \n\t" LABEL(WRITE_MEM_PREP) " \n\t" " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" " ldr w4, [x4] \n\t" // Load alpha & beta (value). " ldr w8, [x8] \n\t" " dup z30.s, w4 \n\t" // Broadcast alpha & beta into vectors. 
" dup z31.s, w8 \n\t" " \n\t" LABEL(PREFETCH_ABNEXT) " ldr x0, %[a_next] \n\t" " ldr x1, %[b_next] \n\t" " prfm PLDL2KEEP, [x0] \n\t" " prfm PLDL2KEEP, [x0, 256*1] \n\t" " prfm PLDL2KEEP, [x0, 256*2] \n\t" " prfm PLDL2KEEP, [x0, 256*3] \n\t" " prfm PLDL2KEEP, [x0, 256*4] \n\t" " prfm PLDL2KEEP, [x0, 256*5] \n\t" " prfm PLDL2KEEP, [x0, 256*6] \n\t" " prfm PLDL2KEEP, [x0, 256*7] \n\t" " prfm PLDL2KEEP, [x0, 256*8] \n\t" " prfm PLDL2KEEP, [x0, 256*9] \n\t" " prfm PLDL2KEEP, [x0, 256*10] \n\t" " prfm PLDL2KEEP, [x0, 256*11] \n\t" " prfm PLDL2KEEP, [x0, 256*12] \n\t" " prfm PLDL2KEEP, [x0, 256*13] \n\t" " prfm PLDL2KEEP, [x0, 256*14] \n\t" " prfm PLDL2KEEP, [x0, 256*15] \n\t" " prfm PLDL2KEEP, [x1] \n\t" " prfm PLDL2KEEP, [x1, 256*1] \n\t" " prfm PLDL2KEEP, [x1, 256*2] \n\t" " prfm PLDL2KEEP, [x1, 256*3] \n\t" " prfm PLDL2KEEP, [x1, 256*4] \n\t" " prfm PLDL2KEEP, [x1, 256*5] \n\t" " prfm PLDL2KEEP, [x1, 256*6] \n\t" " prfm PLDL2KEEP, [x1, 256*7] \n\t" " prfm PLDL2KEEP, [x1, 256*8] \n\t" " prfm PLDL2KEEP, [x1, 256*9] \n\t" " \n\t" LABEL(WRITE_MEM) " \n\t" " fmov s28, #1.0 \n\t" " fmov w16, s28 \n\t" " cmp w16, w4 \n\t" BEQ(UNIT_ALPHA) " \n\t" SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) " \n\t" LABEL(UNIT_ALPHA) " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. // " cmp x6, #1 \n\t" // BNE(WRITE_MEM_G) " \n\t" LABEL(WRITE_MEM_C) " \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. 
" fcmp s31, #0.0 \n\t" BEQ(BETA_ZERO_C) GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) " \n\t" LABEL(BETA_ZERO_C) GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7) GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7) // BRANCH(END_WRITE_MEM) // " \n\t" // LABEL(END_WRITE_MEM) // BRANCH(END_EXEC) // " \n\t" // LABEL(END_ERROR) // " mov x0, #1 \n\t" // Return error. LABEL(END_EXEC) " mov x0, #0 \n\t" // Return normal. : : [m] "m" (m), [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta), [a_next] "m" (a_next), [b_next] "m" (b_next) : "x0","x1","x2","x3","x4","x5","x6","x7","x8", "x9","x16", "z0","z1","z2","z3","z4","z5","z6","z7", "z8","z9","z10","z11","z12","z13","z14","z15", "z16","z17","z18","z19", "z20","z21","z22","z23", "z24","z25","z26","z27", "z28","z29","z30","z31" ); GEMM_UKR_FLUSH_CT( s ); } cython-blis-0.9.1/blis/_src/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c000066400000000000000000000373711427272030600303040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Forschunszentrum Juelich Copyright (C) 2020, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Double-precision composite instructions. #include "armsve_asm_macros_dcomplex.h" // 2vx10 microkernels. #include "armsve_asm_2vx10cmplx.h" void bli_zgemm_armsve_asm_2vx10_unindexed ( dim_t m, dim_t n, dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_mker = k / 4; uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; uint64_t info = 0; GEMM_UKR_SETUP_CT( z, m, 10, false ); __asm__ volatile ( " whilelo p0.d, xzr, %12 \n\t" // " ldr x0, %[a] \n\t" // " ldr x1, %[b] \n\t" " mov x2, xzr \n\t" " incd x2, ALL, MUL #1 \n\t" // Column-skip of A. " mov x3, #10 \n\t" // Row-skip of B. " \n\t" // " ldr x2, %[c] \n\t" // " ldr x3, %[rs_c] \n\t" // Row-skip of C. // " ldr x4, %[cs_c] \n\t" // Column-skip of C. #ifdef _A64FX " mov x16, 0x1 \n\t" // Tag A address. " lsl x16, x16, #56 \n\t" " orr %0, %0, x16 \n\t" " mov x16, 0x2 \n\t" // Tag B address. " lsl x16, x16, #56 \n\t" " orr %1, %1, x16 \n\t" " mov x16, 0x3 \n\t" // Tag C address. " lsl x16, x16, #56 \n\t" " orr %2, %2, x16 \n\t" #endif " \n\t" " mov x16, #16 \n\t" // Multiply some address skips by sizeof(dcomplex). " madd x2, x16, x2, xzr \n\t" // cs_a " madd x3, x16, x3, xzr \n\t" // rs_b " madd %4, x16, %4, xzr \n\t" // cs_c " \n\t" // " ldr x5, %[k_mker] \n\t" // Number of loops. // " ldr x6, %[k_left] \n\t" " \n\t" LABEL(LOAD_ABC) " cmp %5, #0 \n\t" // Don't preload if no microkernel there. BEQ(END_CCOL_PRFM) " \n\t" " ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real 8/10, no imaginary. " ld1rd z21.d, p0/z, [%1, 8*2] \n\t" " ld1rd z22.d, p0/z, [%1, 8*4] \n\t" " ld1rd z23.d, p0/z, [%1, 8*6] \n\t" " ld1rd z24.d, p0/z, [%1, 8*8] \n\t" " ld1rd z25.d, p0/z, [%1, 8*10] \n\t" " ld1rd z26.d, p0/z, [%1, 8*12] \n\t" " ld1rd z27.d, p0/z, [%1, 8*14] \n\t" " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " \n\t" LABEL(CCOL_PRFM) // " cmp %3, #1 \n\t" // BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage. 
" mov x16, %2 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" LABEL(END_CCOL_PRFM) " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" " cmp %5, #0 \n\t" // If no 4-microkernel can be applied. BEQ(K_LEFT_LOOP) " \n\t" LABEL(K_MKER_LOOP) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" " subs %5, %5, #1 \n\t" // Decrease counter before final replica. BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem. 
" \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) BRANCH(K_MKER_LOOP) " \n\t" LABEL(FIN_MKER_LOOP) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" LABEL(K_LEFT_LOOP) " cmp %6, #0 \n\t" // End of execution. BEQ(WRITE_MEM_PREP) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real 8/10, no imaginary. " ld1rd z21.d, p0/z, [%1, 8*2] \n\t" " ld1rd z22.d, p0/z, [%1, 8*4] \n\t" " ld1rd z23.d, p0/z, [%1, 8*6] \n\t" " ld1rd z24.d, p0/z, [%1, 8*8] \n\t" " ld1rd z25.d, p0/z, [%1, 8*10] \n\t" " ld1rd z26.d, p0/z, [%1, 8*12] \n\t" " ld1rd z27.d, p0/z, [%1, 8*14] \n\t" GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " sub %6, %6, #1 \n\t" BRANCH(K_LEFT_LOOP) " \n\t" LABEL(WRITE_MEM_PREP) " \n\t" // " ldr x7, %[alpha] \n\t" // Load alpha & beta (address). // " ldr x8, %[beta] \n\t" " ld1rd z28.d, p0/z, [%7] \n\t" // Real(alpha). " ld1rd z29.d, p0/z, [%7, 8] \n\t" // Imag(alpha). " ld1rd z30.d, p0/z, [%8] \n\t" // Real(beta). " ld1rd z31.d, p0/z, [%8, 8] \n\t" // Imag(beta). " \n\t" LABEL(PREFETCH_ABNEXT) // " ldr x9, %[a_next] \n\t" // " ldr x10, %[b_next] \n\t" #ifdef _A64FX " mov x16, 0x1 \n\t" // Tag A address. " lsl x16, x16, #56 \n\t" " orr %9, %9, x16 \n\t" " mov x16, 0x2 \n\t" // Tag B address. " lsl x16, x16, #56 \n\t" " orr %10, %10, x16 \n\t" #endif " prfm PLDL1STRM, [%9] \n\t" " prfm PLDL1STRM, [%9, 256*1] \n\t" " prfm PLDL1STRM, [%10] \n\t" " prfm PLDL1STRM, [%10, 256*1] \n\t" " \n\t" LABEL(WRITE_MEM) " fmov d27, #1.0 \n\t" " fcmp d29, #0.0 \n\t" // Whether Imag(alpha) == 0. 
" fccmp d28, d27, 0, eq \n\t" // Whether Real(alpha) == 1. BEQ(UNIT_ALPHA) " \n\t" GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z28,z29) GEMM_FMULCMPLX_COL2(z24,z25,z26,z27,p0,z4 ,z5 ,z6 ,z7 ,z28,z29) GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z8, z9, z10,z11,z28,z29) GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z28,z29) GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z16,z17,z18,z19,z28,z29) BRANCH(WRITE_MEM_EXEC) " \n\t" LABEL(UNIT_ALPHA) MOV_COL2(z20,z21,z22,z23,z0 ,z1 ,z2 ,z3 ) MOV_COL2(z24,z25,z26,z27,z4 ,z5 ,z6 ,z7 ) MOV_COL2(z0 ,z1 ,z2 ,z3 ,z8, z9, z10,z11) MOV_COL2(z4 ,z5 ,z6 ,z7 ,z12,z13,z14,z15) MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19) " \n\t" LABEL(WRITE_MEM_EXEC) " mov x9, %2 \n\t" // C address for loading. " \n\t" // C address for storing is %2 itself. // " cmp %3, #1 \n\t" // BNE(WRITE_MEM_G) " \n\t" LABEL(WRITE_MEM_C) " fmov d29, xzr \n\t" " fcmp d31, #0.0 \n\t" // Whether Imag(beta) == 0. " fccmp d30, d29, 0, eq \n\t" // Whether Real(beta) == 0. BEQ(ZERO_BETA_C_0_1_2_3) GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) LABEL(ZERO_BETA_C_0_1_2_3) GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z24,z25,z26,z27,p0,%2,%4) " \n\t" BEQ(ZERO_BETA_C_4_5_6_7_8_9) GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z20,z21,z22,z23,p0,x9,%4) GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) LABEL(ZERO_BETA_C_4_5_6_7_8_9) GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) // BRANCH(END_WRITE_MEM) // " \n\t" // LABEL(WRITE_MEM_G) // " add %3, 
%3, %3 \n\t" // Skips passed to index is multiplied by 2, // " index z28.d, xzr, %3 \n\t" // s.t. 2*sizeof(double) = 2*8 = 16. // " fmov d29, xzr \n\t" // " fcmp d31, #0.0 \n\t" // Whether Imag(beta) == 0. // " fccmp d30, d29, 0, eq \n\t" // Whether Real(beta) == 0. // BEQ(ZERO_BETA_G_0_1_2_3) // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) // GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) // GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) // LABEL(ZERO_BETA_G_0_1_2_3) // GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) // " \n\t" // BEQ(ZERO_BETA_G_4_5_6_7_8_9) // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) // GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) // GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) // GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) // LABEL(ZERO_BETA_G_4_5_6_7_8_9) // GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) // " \n\t" // LABEL(END_WRITE_MEM) // BRANCH(END_EXEC) // " \n\t" LABEL(END_EXEC) " mov %11, #0 \n\t" // Return normal. 
: "+r" (a), // %0 "+r" (b), // %1 "+r" (c), // %2 "+r" (rs_c), // %3 "+r" (cs_c), // %4 "+r" (k_mker), // %5 "+r" (k_left), // %6 "+r" (alpha), // %7 "+r" (beta), // %8 "+r" (a_next), // %9 "+r" (b_next), // %10 "=r" (info) // %11 : "r" (m) // %12 : "x2","x3","x9","x16", "z0","z1","z2","z3","z4","z5","z6","z7", "z8","z9","z10","z11","z12","z13","z14","z15", "z16","z17","z18","z19", "z20","z21","z22","z23", "z24","z25","z26","z27", "z28","z29","z30","z31" ); GEMM_UKR_FLUSH_CT( z ); } cython-blis-0.9.1/blis/_src/kernels/armsve/3/old/000077500000000000000000000000001427272030600215165ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/armsve/3/old/armsve_asm_2vx7cmplx.h000066400000000000000000000174111427272030600257620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

/*
 * All macros in this header expand to adjacent string literals (inline
 * assembly text) built from helper macros that are defined elsewhere
 * (GEMM_FMLA2*, GEMM_FMLX2*, FMUL_COL2, CLEAR_COL*, GEMM_FMLACMPLX,
 * GEMM_CCOLCMPLX_*). Arguments are register or operand names; they are
 * stringified, not evaluated.
 */

/*
 * One step of the 7-column complex kernel body.
 *
 * Expands to, in order:
 *   - seven GEMM_FMLA2_LD1R invocations, one per column pair (CnRe,CnIm),
 *     using BnRe and the even indices 0,2,...,12 relative to BAddr;
 *   - seven GEMM_FMLX2_LD1R invocations with the (CnIm,CnRe) order swapped,
 *     using BnIm and the odd indices 1,3,...,13;
 *   - an `add` instruction computing BAddr = BRSBit + BAddr.
 *
 * NOTE(review): the even/odd indices presumably select the real/imaginary
 * parts of interleaved complex B elements - confirm against the
 * GEMM_FMLA2_LD1R / GEMM_FMLX2_LD1R definitions.
 */
#define GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,PT,AColRe,AColIm,B0Re,B1Re,B2Re,B3Re,B4Re,B5Re,B6Re,B0Im,B1Im,B2Im,B3Im,B4Im,B5Im,B6Im,BAddr,BRSBit) \
 GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,B0Re,BAddr,0) \
 GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,B1Re,BAddr,2) \
 GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,B2Re,BAddr,4) \
 GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,B3Re,BAddr,6) \
 GEMM_FMLA2_LD1R(C4Re,C4Im,PT,AColRe,AColIm,B4Re,BAddr,8) \
 GEMM_FMLA2_LD1R(C5Re,C5Im,PT,AColRe,AColIm,B5Re,BAddr,10) \
 GEMM_FMLA2_LD1R(C6Re,C6Im,PT,AColRe,AColIm,B6Re,BAddr,12) \
 GEMM_FMLX2_LD1R(C0Im,C0Re,PT,AColRe,AColIm,B0Im,BAddr,1) \
 GEMM_FMLX2_LD1R(C1Im,C1Re,PT,AColRe,AColIm,B1Im,BAddr,3) \
 GEMM_FMLX2_LD1R(C2Im,C2Re,PT,AColRe,AColIm,B2Im,BAddr,5) \
 GEMM_FMLX2_LD1R(C3Im,C3Re,PT,AColRe,AColIm,B3Im,BAddr,7) \
 GEMM_FMLX2_LD1R(C4Im,C4Re,PT,AColRe,AColIm,B4Im,BAddr,9) \
 GEMM_FMLX2_LD1R(C5Im,C5Re,PT,AColRe,AColIm,B5Im,BAddr,11) \
 GEMM_FMLX2_LD1R(C6Im,C6Re,PT,AColRe,AColIm,B6Im,BAddr,13) \
" add "#BAddr", "#BRSBit", "#BAddr" \n\t"

/*
 * Same parameter list and same FMLA2/FMLX2 ordering as
 * GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C, but built from GEMM_FMLA2 / GEMM_FMLX2
 * (no _LD1R suffix, no BAddr/index arguments) and without the trailing
 * `add`. BAddr and BRSBit are accepted but unused.
 */
#define GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,PT,AColRe,AColIm,B0Re,B1Re,B2Re,B3Re,B4Re,B5Re,B6Re,B0Im,B1Im,B2Im,B3Im,B4Im,B5Im,B6Im,BAddr,BRSBit) \
 GEMM_FMLA2(C0Re,C0Im,PT,AColRe,AColIm,B0Re) \
 GEMM_FMLA2(C1Re,C1Im,PT,AColRe,AColIm,B1Re) \
 GEMM_FMLA2(C2Re,C2Im,PT,AColRe,AColIm,B2Re) \
 GEMM_FMLA2(C3Re,C3Im,PT,AColRe,AColIm,B3Re) \
 GEMM_FMLA2(C4Re,C4Im,PT,AColRe,AColIm,B4Re) \
 GEMM_FMLA2(C5Re,C5Im,PT,AColRe,AColIm,B5Re) \
 GEMM_FMLA2(C6Re,C6Im,PT,AColRe,AColIm,B6Re) \
 GEMM_FMLX2(C0Im,C0Re,PT,AColRe,AColIm,B0Im) \
 GEMM_FMLX2(C1Im,C1Re,PT,AColRe,AColIm,B1Im) \
 GEMM_FMLX2(C2Im,C2Re,PT,AColRe,AColIm,B2Im) \
 GEMM_FMLX2(C3Im,C3Re,PT,AColRe,AColIm,B3Im) \
 GEMM_FMLX2(C4Im,C4Re,PT,AColRe,AColIm,B4Im) \
 GEMM_FMLX2(C5Im,C5Re,PT,AColRe,AColIm,B5Im) \
 GEMM_FMLX2(C6Im,C6Re,PT,AColRe,AColIm,B6Im)

/*
 * Applies CLEAR_COL4 to the first twelve registers (three groups of four)
 * and CLEAR_COL2 to the last two, covering all fourteen arguments.
 */
#define CLEAR_COL14(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13) \
 CLEAR_COL4(Z00,Z01,Z02,Z03) \
 CLEAR_COL4(Z04,Z05,Z06,Z07) \
 CLEAR_COL4(Z08,Z09,Z10,Z11) \
 CLEAR_COL2(Z12,Z13)

/*
 * For each of the seven (ZDnRe,ZDnIm) / (ZnRe,ZnIm) pairs, expands to
 * FMUL_COL2 with ZFactorRe, followed (after all seven FMUL_COL2) by
 * GEMM_FMLX2 with ZFactorIm and the destination order swapped
 * (ZDnIm,ZDnRe).
 *
 * NOTE(review): this looks like a complex multiply ZD = Z * ZFactor split
 * into a real-factor pass and an imaginary-factor pass - confirm against
 * the FMUL_COL2 / GEMM_FMLX2 definitions.
 */
#define GEMM_FMULCMPLX_COL7(ZD0Re,ZD0Im,ZD1Re,ZD1Im,ZD2Re,ZD2Im,ZD3Re,ZD3Im,ZD4Re,ZD4Im,ZD5Re,ZD5Im,ZD6Re,ZD6Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,ZFactorRe,ZFactorIm) \
 FMUL_COL2(ZD0Re,ZD0Im,Z0Re,Z0Im,ZFactorRe) \
 FMUL_COL2(ZD1Re,ZD1Im,Z1Re,Z1Im,ZFactorRe) \
 FMUL_COL2(ZD2Re,ZD2Im,Z2Re,Z2Im,ZFactorRe) \
 FMUL_COL2(ZD3Re,ZD3Im,Z3Re,Z3Im,ZFactorRe) \
 FMUL_COL2(ZD4Re,ZD4Im,Z4Re,Z4Im,ZFactorRe) \
 FMUL_COL2(ZD5Re,ZD5Im,Z5Re,Z5Im,ZFactorRe) \
 FMUL_COL2(ZD6Re,ZD6Im,Z6Re,Z6Im,ZFactorRe) \
 GEMM_FMLX2(ZD0Im,ZD0Re,PT,Z0Re,Z0Im,ZFactorIm) \
 GEMM_FMLX2(ZD1Im,ZD1Re,PT,Z1Re,Z1Im,ZFactorIm) \
 GEMM_FMLX2(ZD2Im,ZD2Re,PT,Z2Re,Z2Im,ZFactorIm) \
 GEMM_FMLX2(ZD3Im,ZD3Re,PT,Z3Re,Z3Im,ZFactorIm) \
 GEMM_FMLX2(ZD4Im,ZD4Re,PT,Z4Re,Z4Im,ZFactorIm) \
 GEMM_FMLX2(ZD5Im,ZD5Re,PT,Z5Re,Z5Im,ZFactorIm) \
 GEMM_FMLX2(ZD6Im,ZD6Re,PT,Z6Re,Z6Im,ZFactorIm)

/*
 * Applies GEMM_FMLACMPLX once per column (seven times), each with the
 * column's destination pair, source pair, the shared predicate PT and the
 * shared (ZFactorRe,ZFactorIm) pair.
 */
#define GEMM_FMLACMPLX_COL7(ZD0Re,ZD0Im,ZD1Re,ZD1Im,ZD2Re,ZD2Im,ZD3Re,ZD3Im,ZD4Re,ZD4Im,ZD5Re,ZD5Im,ZD6Re,ZD6Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,ZFactorRe,ZFactorIm) \
 GEMM_FMLACMPLX(ZD0Re,ZD0Im,PT,Z0Re,Z0Im,ZFactorRe,ZFactorIm) \
 GEMM_FMLACMPLX(ZD1Re,ZD1Im,PT,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \
 GEMM_FMLACMPLX(ZD2Re,ZD2Im,PT,Z2Re,Z2Im,ZFactorRe,ZFactorIm) \
 GEMM_FMLACMPLX(ZD3Re,ZD3Im,PT,Z3Re,Z3Im,ZFactorRe,ZFactorIm) \
 GEMM_FMLACMPLX(ZD4Re,ZD4Im,PT,Z4Re,Z4Im,ZFactorRe,ZFactorIm) \
 GEMM_FMLACMPLX(ZD5Re,ZD5Im,PT,Z5Re,Z5Im,ZFactorRe,ZFactorIm) \
 GEMM_FMLACMPLX(ZD6Re,ZD6Im,PT,Z6Re,Z6Im,ZFactorRe,ZFactorIm)

/*
 * Seven consecutive GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD invocations, one per
 * (ZnRe,ZnIm) pair, all sharing PT, CAddr and CCS.
 * NOTE(review): the _FWD suffix suggests each invocation advances CAddr by
 * CCS - confirm against the helper's definition.
 */
#define GEMM_CCMPLX_LOAD_COL7_C(Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,PT,CAddr,CCS) \
 GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \
 GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z1Re,Z1Im,PT,CAddr,CCS) \
 GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z2Re,Z2Im,PT,CAddr,CCS) \
 GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z3Re,Z3Im,PT,CAddr,CCS) \
 GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z4Re,Z4Im,PT,CAddr,CCS) \
 GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z5Re,Z5Im,PT,CAddr,CCS) \
 GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z6Re,Z6Im,PT,CAddr,CCS)

/*
 * Store counterpart of GEMM_CCMPLX_LOAD_COL7_C: seven consecutive
 * GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD invocations with the same argument
 * layout.
 */
#define GEMM_CCMPLX_STORE_COL7_C(Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,PT,CAddr,CCS) \
 GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \
 GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z1Re,Z1Im,PT,CAddr,CCS) \
 GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z2Re,Z2Im,PT,CAddr,CCS) \
 GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z3Re,Z3Im,PT,CAddr,CCS) \
 GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z4Re,Z4Im,PT,CAddr,CCS) \
 GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z5Re,Z5Im,PT,CAddr,CCS) \
 GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z6Re,Z6Im,PT,CAddr,CCS)

/*
 * Gather variant of the 7-column load: seven consecutive
 * GEMM_CCOLCMPLX_GATHER_LOAD_FWD invocations. PT is passed twice to the
 * helper; ZIndex, CAddr, CCS and CTemp are shared by all seven.
 */
#define GEMM_CCMPLX_LOAD_COL7_G(Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,PT,ZIndex,CAddr,CCS,CTemp) \
 GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \
 GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \
 GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z2Re,Z2Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \
 GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z3Re,Z3Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \
 GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z4Re,Z4Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \
 GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z5Re,Z5Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \
 GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z6Re,Z6Im,ZIndex,PT,PT,CAddr,CCS,CTemp)

/*
 * Scatter variant of the 7-column store: seven consecutive
 * GEMM_CCOLCMPLX_SCATTER_STORE_FWD invocations with the same argument
 * layout as GEMM_CCMPLX_LOAD_COL7_G.
 */
#define GEMM_CCMPLX_STORE_COL7_G(Z0Re,Z0Im,Z1Re,Z1Im,Z2Re,Z2Im,Z3Re,Z3Im,Z4Re,Z4Im,Z5Re,Z5Im,Z6Re,Z6Im,PT,ZIndex,CAddr,CCS,CTemp) \
 GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \
 GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \
 GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z2Re,Z2Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \
 GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z3Re,Z3Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \
 GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z4Re,Z4Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \
 GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z5Re,Z5Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \
 GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z6Re,Z6Im,ZIndex,PT,PT,CAddr,CCS,CTemp)
cython-blis-0.9.1/blis/_src/kernels/armsve/3/old/armsve_asm_2vx8cmplx.h000066400000000000000000000153751427272030600257720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.

Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, The University of Tokyo

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
  contributors may be used to endorse or promote products derived
  from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1: one step of the 2Vx8 complex
 * micro-kernel loop body.
 *
 * The expansion is, in this exact order:
 *   1. GEMM_FMLA2_LD1R for columns 0-3, using BV0..BV3 and trailing index
 *      arguments 9, 11, 13, 15;
 *   2. an "add BAddr, BRSBit, BAddr" instruction (B address forward);
 *   3. GEMM_FMLA2_LD1R for columns 4-7, using BV4..BV7 and trailing index
 *      arguments 0, 2, 4, 6;
 *   4. GEMM_FMLX2_LD1R for columns 0-7 with the accumulator pair passed as
 *      (Im, Re), i.e. swapped relative to steps 1 and 3, using BV8..BV11
 *      followed by BV0..BV3 and trailing index arguments
 *      8, 10, 12, 14, 1, 3, 5, 7.
 *
 * Parameters:
 *   C<n>Re, C<n>Im   accumulator pair for column n (n = 0..7)
 *   PT               forwarded unchanged to every helper
 *   AColRe, AColIm   forwarded unchanged to every helper
 *   BV0 .. BV11      twelve B-side registers; BV0..BV3 are used twice in
 *                    one step (steps 1 and 4)
 *   BAddr            B address register, also advanced by step 2
 *   BRSBit           register added to BAddr in step 2
 *
 * NOTE(review): GEMM_FMLA2_LD1R / GEMM_FMLX2_LD1R are defined outside this
 * view.  Presumably each performs the multiply-accumulate and then reloads
 * its BV register from BAddr at the given element index, which would make
 * the statement order part of the register-reuse schedule -- confirm before
 * reordering anything here.
 */
#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \
  GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,9) \
  GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,11) \
  GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,13) \
  GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,15) \
  " add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \
  GEMM_FMLA2_LD1R(C4Re,C4Im,PT,AColRe,AColIm,BV4,BAddr,0) \
  GEMM_FMLA2_LD1R(C5Re,C5Im,PT,AColRe,AColIm,BV5,BAddr,2) \
  GEMM_FMLA2_LD1R(C6Re,C6Im,PT,AColRe,AColIm,BV6,BAddr,4) \
  GEMM_FMLA2_LD1R(C7Re,C7Im,PT,AColRe,AColIm,BV7,BAddr,6) \
  /* second half: accumulator pair passed as (Im, Re) */ \
  GEMM_FMLX2_LD1R(C0Im,C0Re,PT,AColRe,AColIm,BV8,BAddr,8) \
  GEMM_FMLX2_LD1R(C1Im,C1Re,PT,AColRe,AColIm,BV9,BAddr,10) \
  GEMM_FMLX2_LD1R(C2Im,C2Re,PT,AColRe,AColIm,BV10,BAddr,12) \
  GEMM_FMLX2_LD1R(C3Im,C3Re,PT,AColRe,AColIm,BV11,BAddr,14) \
  GEMM_FMLX2_LD1R(C4Im,C4Re,PT,AColRe,AColIm,BV0,BAddr,1) \
  GEMM_FMLX2_LD1R(C5Im,C5Re,PT,AColRe,AColIm,BV1,BAddr,3) \
  GEMM_FMLX2_LD1R(C6Im,C6Re,PT,AColRe,AColIm,BV2,BAddr,5) \
  GEMM_FMLX2_LD1R(C7Im,C7Re,PT,AColRe,AColIm,BV3,BAddr,7)

/*
 * GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_2: same step as ..._C_1 with the twelve
 * BV arguments rotated by four positions (BV4..BV11, BV0..BV3).  All other
 * arguments are forwarded unchanged.
 */
#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_2(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \
  GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BV0,BV1,BV2,BV3,BAddr,BRSBit)

/*
 * GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3: same step as ..._C_1 with the twelve
 * BV arguments rotated by eight positions (BV8..BV11, BV0..BV7).
 */
#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \
  GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV8,BV9,BV10,BV11,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit)

/*
 * GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL: variant of ..._C_1 that stops
 * issuing the *_LD1R helpers after the first four columns.
 *
 * Columns 0-3 and the "add BAddr, BRSBit, BAddr" are identical to ..._C_1.
 * Everything after that uses GEMM_FMLA2 / GEMM_FMLX2, which take no BAddr or
 * index argument.  Same parameter list as ..._C_1.
 *
 * NOTE(review): presumably this is the final-iteration form that avoids
 * reading B beyond what is still needed -- confirm against the call sites.
 */
#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \
  GEMM_FMLA2_LD1R(C0Re,C0Im,PT,AColRe,AColIm,BV0,BAddr,9) \
  GEMM_FMLA2_LD1R(C1Re,C1Im,PT,AColRe,AColIm,BV1,BAddr,11) \
  GEMM_FMLA2_LD1R(C2Re,C2Im,PT,AColRe,AColIm,BV2,BAddr,13) \
  GEMM_FMLA2_LD1R(C3Re,C3Im,PT,AColRe,AColIm,BV3,BAddr,15) \
  " add "#BAddr", "#BRSBit", "#BAddr" \n\t" /* B address forward */ \
  GEMM_FMLA2(C4Re,C4Im,PT,AColRe,AColIm,BV4) \
  GEMM_FMLA2(C5Re,C5Im,PT,AColRe,AColIm,BV5) \
  GEMM_FMLA2(C6Re,C6Im,PT,AColRe,AColIm,BV6) \
  GEMM_FMLA2(C7Re,C7Im,PT,AColRe,AColIm,BV7) \
  /* second half: accumulator pair passed as (Im, Re), no reloads */ \
  GEMM_FMLX2(C0Im,C0Re,PT,AColRe,AColIm,BV8) \
  GEMM_FMLX2(C1Im,C1Re,PT,AColRe,AColIm,BV9) \
  GEMM_FMLX2(C2Im,C2Re,PT,AColRe,AColIm,BV10) \
  GEMM_FMLX2(C3Im,C3Re,PT,AColRe,AColIm,BV11) \
  GEMM_FMLX2(C4Im,C4Re,PT,AColRe,AColIm,BV0) \
  GEMM_FMLX2(C5Im,C5Re,PT,AColRe,AColIm,BV1) \
  GEMM_FMLX2(C6Im,C6Re,PT,AColRe,AColIm,BV2) \
  GEMM_FMLX2(C7Im,C7Re,PT,AColRe,AColIm,BV3)

/*
 * GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3_RESIDUAL: ..._C_1_RESIDUAL with the
 * twelve BV arguments rotated by eight positions (BV8..BV11, BV0..BV7),
 * mirroring how ..._C_3 relates to ..._C_1.
 */
#define GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BV8,BV9,BV10,BV11,BAddr,BRSBit) \
  GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(C0Re,C1Re,C2Re,C3Re,C4Re,C5Re,C6Re,C7Re,C0Im,C1Im,C2Im,C3Im,C4Im,C5Im,C6Im,C7Im,PT,AColRe,AColIm,BV8,BV9,BV10,BV11,BV0,BV1,BV2,BV3,BV4,BV5,BV6,BV7,BAddr,BRSBit)

/*
 * CLEAR_COL16: apply CLEAR_COL4 to sixteen registers, four at a time, in
 * argument order (Z00..Z03, Z04..Z07, Z08..Z11, Z12..Z15).
 */
#define CLEAR_COL16(Z00,Z01,Z02,Z03,Z04,Z05,Z06,Z07,Z08,Z09,Z10,Z11,Z12,Z13,Z14,Z15) \
  CLEAR_COL4(Z00,Z01,Z02,Z03) \
  CLEAR_COL4(Z04,Z05,Z06,Z07) \
  CLEAR_COL4(Z08,Z09,Z10,Z11) \
  CLEAR_COL4(Z12,Z13,Z14,Z15)

/*
 * GEMM_FMULCMPLX_COL2: two-column complex scale, destination distinct from
 * source.
 *
 * For each of the two columns: FMUL_COL2 is applied to (ZD<n>Re, ZD<n>Im)
 * from (Z<n>Re, Z<n>Im) with ZFactorRe; afterwards GEMM_FMLX2 is applied
 * with the destination pair passed as (Im, Re) and ZFactorIm as the factor.
 * Both FMUL_COL2 invocations are emitted before either GEMM_FMLX2.
 *
 * NOTE(review): presumably this forms ZD = Z * (ZFactorRe + i*ZFactorIm) --
 * confirm against the definitions of FMUL_COL2 and GEMM_FMLX2.
 */
#define GEMM_FMULCMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \
  FMUL_COL2(ZD0Re,ZD0Im,Z0Re,Z0Im,ZFactorRe) \
  FMUL_COL2(ZD1Re,ZD1Im,Z1Re,Z1Im,ZFactorRe) \
  GEMM_FMLX2(ZD0Im,ZD0Re,PT,Z0Re,Z0Im,ZFactorIm) \
  GEMM_FMLX2(ZD1Im,ZD1Re,PT,Z1Re,Z1Im,ZFactorIm)

/*
 * GEMM_FMLACMPLX_COL2: apply GEMM_FMLACMPLX to two columns in order
 * (column 0, then column 1), sharing PT, ZFactorRe and ZFactorIm.
 */
#define GEMM_FMLACMPLX_COL2(ZD0Re,ZD0Im,ZD1Re,ZD1Im,PT,Z0Re,Z0Im,Z1Re,Z1Im,ZFactorRe,ZFactorIm) \
  GEMM_FMLACMPLX(ZD0Re,ZD0Im,PT,Z0Re,Z0Im,ZFactorRe,ZFactorIm) \
  GEMM_FMLACMPLX(ZD1Re,ZD1Im,PT,Z1Re,Z1Im,ZFactorRe,ZFactorIm)

/*
 * GEMM_CCMPLX_LOAD_COL2_C: contiguous load of two complex columns via
 * GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD, column 0 then column 1, with the same
 * PT, CAddr and CCS forwarded to both invocations.
 */
#define GEMM_CCMPLX_LOAD_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \
  GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \
  GEMM_CCOLCMPLX_CONTIGUOUS_LOAD_FWD(Z1Re,Z1Im,PT,CAddr,CCS)

/*
 * GEMM_CCMPLX_STORE_COL2_C: contiguous store counterpart of
 * GEMM_CCMPLX_LOAD_COL2_C, using GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD.
 */
#define GEMM_CCMPLX_STORE_COL2_C(Z0Re,Z0Im,Z1Re,Z1Im,PT,CAddr,CCS) \
  GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z0Re,Z0Im,PT,CAddr,CCS) \
  GEMM_CCOLCMPLX_CONTIGUOUS_STORE_FWD(Z1Re,Z1Im,PT,CAddr,CCS)

/*
 * GEMM_CCMPLX_LOAD_COL2_G: gather load of two complex columns via
 * GEMM_CCOLCMPLX_GATHER_LOAD_FWD.  Takes ZIndex and CTemp in addition to the
 * _C arguments and forwards PT twice to the helper.
 */
#define GEMM_CCMPLX_LOAD_COL2_G(Z0Re,Z0Im,Z1Re,Z1Im,PT,ZIndex,CAddr,CCS,CTemp) \
  GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \
  GEMM_CCOLCMPLX_GATHER_LOAD_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp)

/*
 * GEMM_CCMPLX_STORE_COL2_G: scatter store counterpart of
 * GEMM_CCMPLX_LOAD_COL2_G, using GEMM_CCOLCMPLX_SCATTER_STORE_FWD.
 */
#define GEMM_CCMPLX_STORE_COL2_G(Z0Re,Z0Im,Z1Re,Z1Im,PT,ZIndex,CAddr,CCS,CTemp) \
  GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z0Re,Z0Im,ZIndex,PT,PT,CAddr,CCS,CTemp) \
  GEMM_CCOLCMPLX_SCATTER_STORE_FWD(Z1Re,Z1Im,ZIndex,PT,PT,CAddr,CCS,CTemp)
cython-blis-0.9.1/blis/_src/kernels/armsve/3/old/armsve_asm_macros_half.h000066400000000000000000000036421427272030600263670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

// Specify to use half precision.
//
// This header only selects string constants and then includes the generic
// macro header, so the defines below must stay ahead of the #include.
// NOTE(review): presumably armsve_asm_macros.h splices these strings into its
// inline-assembly templates -- confirm there before renaming any of them.
#define DT "h"        // data-type tag string for half precision
#define LD1 "ld1h"    // mnemonic string selected for LD1
#define ST1 "st1h"    // mnemonic string selected for ST1
#define LD1R "ld1rh"  // mnemonic string selected for LD1R
#define PRFG "prfh"   // mnemonic string selected for PRFG
#define SZ "2"        // presumably the element size in bytes, as a string -- TODO confirm
// OFFS is deliberately left undefined for half precision; the original
// author marked it as unsupported:
// #define OFFS UNSUPPORTED

// Include macros.
#include "armsve_asm_macros.h"
cython-blis-0.9.1/blis/_src/kernels/armsve/3/old/bli_gemm_armsve256_asm_d8x8.c000066400000000000000000001471201427272030600267670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Linaro Limited Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" /* o 8x8 Double precision micro-kernel o Runnable on ARMv8a with SVE 256 feature, compiled with aarch64 GCC. o Tested on qemu-aarch64 and armie for SVE. Preconditions: - to use this kernel, SVE with vector length of 256 bits is a must. April 2020. 
*/ void bli_dgemm_armsve256_asm_8x8 ( dim_t k0, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; __asm__ volatile ( " \n\t" " ldr x0,%[aaddr] \n\t" // Load address of A " ldr x1,%[baddr] \n\t" // Load address of B " ldr x2,%[caddr] \n\t" // Load address of C " \n\t" " ldr x3,%[a_next] \n\t" // Move pointer " ldr x4,%[b_next] \n\t" // Move pointer " \n\t" " ldr x5,%[k_iter] \n\t" // Init guard (k_iter) " ldr x6,%[k_left] \n\t" // Init guard (k_iter) " \n\t" " ldr x7,%[alpha] \n\t" // Alpha address " ldr x8,%[beta] \n\t" // Beta address " \n\t" " ldr x9,%[cs_c] \n\t" // Load cs_c " lsl x10,x9,#3 \n\t" // cs_c * sizeof(double) " \n\t" " ldr x13,%[rs_c] \n\t" // Load rs_c. " lsl x14,x13,#3 \n\t" // rs_c * sizeof(double). " \n\t" " add x20,x2,x10 \n\t" //Load address Column 1 of C " add x21,x20,x10 \n\t" //Load address Column 2 of C " add x22,x21,x10 \n\t" //Load address Column 3 of C " add x23,x22,x10 \n\t" //Load address Column 4 of C " add x24,x23,x10 \n\t" //Load address Column 5 of C " add x25,x24,x10 \n\t" //Load address Column 6 of C " add x26,x25,x10 \n\t" //Load address Column 7 of C " \n\t" " prfm pldl1keep,[x2] \n\t" // Prefetch c. " prfm pldl1keep,[x20] \n\t" // Prefetch c. " prfm pldl1keep,[x21] \n\t" // Prefetch c. " prfm pldl1keep,[x22] \n\t" // Prefetch c. " prfm pldl1keep,[x23] \n\t" // Prefetch c. " prfm pldl1keep,[x24] \n\t" // Prefetch c. " prfm pldl1keep,[x25] \n\t" // Prefetch c. " prfm pldl1keep,[x26] \n\t" // Prefetch c. 
" \n\t" " ldr z0, [x0] \n\t" // Load a " ldr z1, [x0, #1, MUL VL] \n\t" " \n\t" " ptrue p0.d, all \n\t" " ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) " ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) " ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) " ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" // PRFM, the following prefetch on [x1] and [x0] " \n\t" // is for b rows 4..7 and a columns 4..7. " \n\t" // both of them will be used in next iteration " \n\t" // of k_iter (unrolled per 4 loops) " \n\t" " dup z16.d, #0 \n\t" // Vector for accummulating column 0 " prfm PLDL1KEEP, [x1, #256] \n\t" // prefetch b row no.4 " dup z17.d, #0 \n\t" // Vector for accummulating column 0 " prfm PLDL1KEEP, [x1, #320] \n\t" // prefetch b row no.5 " dup z18.d, #0 \n\t" // Vector for accummulating column 1 " prfm PLDL1KEEP, [x1, #384] \n\t" // prefetch b row no.6 " dup z19.d, #0 \n\t" // Vector for accummulating column 1 " prfm PLDL1KEEP, [x1, #448] \n\t" // preftech b row no.7 " dup z20.d, #0 \n\t" // Vector for accummulating column 2 " dup z21.d, #0 \n\t" // Vector for accummulating column 2 " \n\t" " dup z22.d, #0 \n\t" // Vector for accummulating column 3 " prfm PLDL1KEEP, [x0, #256] \n\t" // prefetch a col. no.4 " dup z23.d, #0 \n\t" // Vector for accummulating column 3 " prfm PLDL1KEEP, [x0, #320] \n\t" // prefetch a col. no.5 " dup z24.d, #0 \n\t" // Vector for accummulating column 4 " prfm PLDL1KEEP, [x0, #384] \n\t" // prefetch a col. no.6 " dup z25.d, #0 \n\t" // Vector for accummulating column 4 " prfm PLDL1KEEP, [x0, #448] \n\t" // prefetch a col. 
no.7 " dup z26.d, #0 \n\t" // Vector for accummulating column 5 " dup z27.d, #0 \n\t" // Vector for accummulating column 5 " \n\t" " dup z28.d, #0 \n\t" // Vector for accummulating column 6 " dup z29.d, #0 \n\t" // Vector for accummulating column 6 " dup z30.d, #0 \n\t" // Vector for accummulating column 7 " dup z31.d, #0 \n\t" // Vector for accummulating column 7 " \n\t" " \n\t" " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. " beq .DCONSIDERKLEFT \n\t" " \n\t" " add x0, x0, #64 \n\t" //update address of A " add x1, x1, #64 \n\t" //update address of B " \n\t" " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. " beq .DLASTITER \n\t" // (as loop is do-while-like). " \n\t" " DLOOP: \n\t" // Body " \n\t" " fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " prfm PLDL1KEEP, [x1, #448] \n\t" // prefetch b row no.8, 512-64=448 " fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " prfm PLDL1KEEP, [x1, #512] \n\t" // prefetch b row no.9 " fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) " prfm PLDL1KEEP, [x1, #576] \n\t" // prefetch b row no.10 " \n\t" " fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " ldr z6, [x0] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) " fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) " ldr z7, [x0, #1, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) " ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) " 
ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) " ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) " ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" // End it 1 " \n\t" " fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " prfm PLDL1KEEP, [x1, #640] \n\t" // prefetch b row no.11 " fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " prfm PLDL1KEEP, [x0, #448] \n\t" // prefetch a col. no.8 " fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) " prfm PLDL1KEEP, [x0, #512] \n\t" // prefetch a col. no.9 " \n\t" " fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " ldr z0, [x0, #2, MUL VL] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) " fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) " ldr z1, [x0, #3, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) " ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) " ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate 
c(4:7,6)+=a(0:3,l)*b(l,6) " ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) " ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" " \n\t" //End it 2 " \n\t" " fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " prfm PLDL1KEEP, [x0, #576] \n\t" // prefetch a col. no.10 " fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " prfm PLDL1KEEP, [x0, #640] \n\t" // prefetch a col. no.11 " \n\t" " fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) " \n\t" " add x1, x1, #128 \n\t" // because immediate in 'ldr1rqd' must be " \n\t" // in range -128 to 112 " \n\t" " fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " ldr z6, [x0, #4, MUL VL] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) " fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) " ldr z7, [x0, #5, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) " ld1rqd {z2.d}, p0/z, [x1, #0] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) " ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) " ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z0.d, z5.d[1] \n\t" // 
Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) " ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" // End it 3 " \n\t" " fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) " ldr z0, [x0, #6, MUL VL] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) " ldr z1, [x0, #7, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) " fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) " ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) " ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) " ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) " ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" //End it 4 " add x0, x0, #256 \n\t" " add x1, x1, #128 \n\t" " \n\t" " sub x5,x5,1 \n\t" // i-=1 " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. 
" bne DLOOP \n\t" " \n\t" ".DLASTITER: \n\t" " \n\t" " fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) " ldr z6, [x0] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) " ldr z7, [x0, #1, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) " fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) " ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) " ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) " ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) " ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" // End it 1 " \n\t" " fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) " ldr z0, [x0, #2, MUL VL] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate 
c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) " ldr z1, [x0, #3, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) " fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) " ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) " ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) " ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) " ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" " \n\t" //End it 2 " \n\t" " fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) " ldr z6, [x0, #4, MUL VL] \n\t" // Load a( 0:3,l ) " \n\t" " fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) " ldr z7, [x0, #5, MUL VL] \n\t" // load a( 4:7,l ) " \n\t" " fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) " add x1, x1, #128 \n\t" // because immediate in 'ldr1rqd' must be " 
\n\t" // in range -128 to 112 " fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) " ld1rqd {z2.d}, p0/z, [x1, #0] \n\t" // load b( l,0:1 ) " \n\t" " fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) " ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) " \n\t" " fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) " ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) " \n\t" " fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) " ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) " \n\t" " \n\t" // End it 3 " \n\t" " fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " \n\t" " fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) " fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " \n\t" " fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) " \n\t" " fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) " fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " \n\t" " fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) " fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " \n\t" " fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) " add x1, x1, #64 \n\t" " \n\t" " fmla z28.d, z6.d, z5.d[0] 
\n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) " \n\t" " fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) " \n\t" " \n\t" //End it 4 " add x0, x0, #192 \n\t" " \n\t" " .DCONSIDERKLEFT: \n\t" " cmp x6,0 \n\t" // If k_left == 0, we are done. " beq .DPOSTACCUM \n\t" // else, we enter the k_left loop. " \n\t" ".DLOOPKLEFT: \n\t" " \n\t" " ldr z0, [x0] \n\t" // Load a " ldr z1, [x0, #1, MUL VL] \n\t" " add x0, x0, #64 \n\t" " \n\t" " ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) " ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) " ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) " ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) " add x1, x1, #64 \n\t" " \n\t" " sub x6,x6,1 \n\t" " \n\t" " fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) " fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) " \n\t" " fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) " fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) " \n\t" " fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) " fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) " \n\t" " fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) " fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) " \n\t" " fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) " fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) " \n\t" " fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) " fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) " \n\t" " fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) " fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) " \n\t" " 
fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) " fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) " \n\t" " cmp x6,0 \n\t" // Iterate again. " bne .DLOOPKLEFT \n\t" // if i!=0. " \n\t" " .DPOSTACCUM: \n\t" " \n\t" " ld1rd {z6.d}, p0/z, [x7] \n\t" // Load alpha. " ld1rd {z7.d}, p0/z, [x8] \n\t" // Load beta " \n\t" " cmp x13,#1 \n\t" // If rs_c != 1 (column-major) " bne .DGENSTORED \n\t" " \n\t" " .DCOLSTORED: \n\t" // C is column-major. " \n\t" " dup z0.d, #0 \n\t" " dup z1.d, #0 \n\t" " dup z2.d, #0 \n\t" " dup z3.d, #0 \n\t" " \n\t" " fcmp d7,#0.0 \n\t" " beq .DBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. " \n\t" " ldr z0, [x2] \n\t" //Load column 0 of C " ldr z1, [x2, #1, MUL VL] \n\t" " \n\t" " ldr z2, [x20] \n\t" //Load column 1 of C " ldr z3, [x20, #1, MUL VL] \n\t" " \n\t" " fmul z0.d, z0.d, z7.d \n\t" // Scale by beta " fmul z1.d, z1.d, z7.d \n\t" // Scale by beta " fmul z2.d, z2.d, z7.d \n\t" // Scale by beta " fmul z3.d, z3.d, z7.d \n\t" // Scale by beta " \n\t" " .DBETAZEROCOLSTOREDS1: \n\t" " \n\t" " fmla z0.d, z16.d, z6.d[0] \n\t" // Scale by alpha " fmla z1.d, z17.d, z6.d[0] \n\t" // Scale by alpha " fmla z2.d, z18.d, z6.d[0] \n\t" // Scale by alpha " fmla z3.d, z19.d, z6.d[0] \n\t" // Scale by alpha " \n\t" " str z0, [x2] \n\t" //Store column 0 of C " str z1, [x2, #1, MUL VL] \n\t" " \n\t" " str z2, [x20] \n\t" //Store column 1 of C " str z3, [x20, #1, MUL VL] \n\t" " \n\t" " dup z8.d, #0 \n\t" " dup z9.d, #0 \n\t" " dup z10.d, #0 \n\t" " dup z11.d, #0 \n\t" " \n\t" " fcmp d7,#0.0 \n\t" " beq .DBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case. 
" \n\t" " ldr z8, [x21] \n\t" //Load column 2 of C " ldr z9, [x21, #1, MUL VL] \n\t" " \n\t" " ldr z10, [x22] \n\t" //Load column 3 of C " ldr z11, [x22, #1, MUL VL] \n\t" " \n\t" " fmul z8.d, z8.d, z7.d \n\t" // Scale by beta " fmul z9.d, z9.d, z7.d \n\t" // Scale by beta " fmul z10.d, z10.d, z7.d \n\t" // Scale by beta " fmul z11.d, z11.d, z7.d \n\t" // Scale by beta " \n\t" " .DBETAZEROCOLSTOREDS2: \n\t" " \n\t" " fmla z8.d, z20.d, z6.d[0] \n\t" // Scale by alpha " fmla z9.d, z21.d, z6.d[0] \n\t" // Scale by alpha " fmla z10.d, z22.d, z6.d[0] \n\t" // Scale by alpha " fmla z11.d, z23.d, z6.d[0] \n\t" // Scale by alpha " \n\t" " str z8, [x21] \n\t" //Store column 2 of C " str z9, [x21, #1, MUL VL] \n\t" " \n\t" " str z10, [x22] \n\t" //Store column 3 of C " str z11, [x22, #1, MUL VL] \n\t" " \n\t" " dup z0.d, #0 \n\t" " dup z1.d, #0 \n\t" " dup z2.d, #0 \n\t" " dup z3.d, #0 \n\t" " \n\t" " fcmp d7,#0.0 \n\t" " beq .DBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case. " \n\t" " ldr z0, [x23] \n\t" //Load column 4 of C " ldr z1, [x23, #1, MUL VL] \n\t" " \n\t" " ldr z2, [x24] \n\t" //Load column 5 of C " ldr z3, [x24, #1, MUL VL] \n\t" " \n\t" " fmul z0.d, z0.d, z7.d \n\t" // Scale by beta " fmul z1.d, z1.d, z7.d \n\t" // Scale by beta " fmul z2.d, z2.d, z7.d \n\t" // Scale by beta " fmul z3.d, z3.d, z7.d \n\t" // Scale by beta " \n\t" " .DBETAZEROCOLSTOREDS3: \n\t" " \n\t" " fmla z0.d, z24.d, z6.d[0] \n\t" // Scale by alpha " fmla z1.d, z25.d, z6.d[0] \n\t" // Scale by alpha " fmla z2.d, z26.d, z6.d[0] \n\t" // Scale by alpha " fmla z3.d, z27.d, z6.d[0] \n\t" // Scale by alpha " \n\t" " str z0, [x23] \n\t" //Store column 4 of C " str z1, [x23, #1, MUL VL] \n\t" " \n\t" " str z2, [x24] \n\t" //Store column 5 of C " str z3, [x24, #1, MUL VL] \n\t" " \n\t" " dup z8.d, #0 \n\t" " dup z9.d, #0 \n\t" " dup z10.d, #0 \n\t" " dup z11.d, #0 \n\t" " \n\t" " fcmp d7,#0.0 \n\t" " beq .DBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case. 
" \n\t" " ldr z8, [x25] \n\t" //Load column 6 of C " ldr z9, [x25, #1, MUL VL] \n\t" " \n\t" " ldr z10, [x26] \n\t" //Load column 7 of C " ldr z11, [x26, #1, MUL VL] \n\t" " \n\t" " fmul z8.d, z8.d, z7.d \n\t" // Scale by beta " fmul z9.d, z9.d, z7.d \n\t" // Scale by beta " fmul z10.d, z10.d, z7.d \n\t" // Scale by beta " fmul z11.d, z11.d, z7.d \n\t" // Scale by beta " \n\t" " .DBETAZEROCOLSTOREDS4: \n\t" " \n\t" " prfm pldl2keep,[x3] \n\t" " prfm pldl2keep,[x4] \n\t" " \n\t" " fmla z8.d, z28.d, z6.d[0] \n\t" // Scale by alpha " fmla z9.d, z29.d, z6.d[0] \n\t" // Scale by alpha " fmla z10.d, z30.d, z6.d[0] \n\t" // Scale by alpha " fmla z11.d, z31.d, z6.d[0] \n\t" // Scale by alpha " \n\t" " str z8, [x25] \n\t" //Store column 6 of C " str z9, [x25, #1, MUL VL] \n\t" " \n\t" " str z10, [x26] \n\t" //Store column 7 of C " str z11, [x26, #1, MUL VL] \n\t" " \n\t" " b .DEND \n\t" " \n\t" " .DGENSTORED: \n\t" // C is general-stride stored. " \n\t" " \n\t" // x14 is row-stride in number of bytes. " lsl x15,x14,#2 \n\t" // x15 is 4-row-stride, which is the address offset " \n\t" // btw c(4,*) and c(0,*) " index z4.d, xzr, x14 \n\t" // z4 is address offsets of four contiguous elements " \n\t" // in a column. such as c( 0:3,* ). 
" \n\t" // z4 is used as vector index for gather/scatter " \n\t" // loading/storing from column of *c " \n\t" " \n\t" // C's each column's address: " \n\t" // x2, x20, x21, x22, x23, x24, x25, x26: are addresses of c(0,0:7) " \n\t" // x5, x6, x7, x8, x16, x17, x18, x19: are addresses of c(4,0:7) " add x5, x15, x2 \n\t" // x5 is address of c(4,0) " add x6, x15, x20 \n\t" // x6 is address of c(4,1) " add x7, x15, x21 \n\t" // x7 is address of c(4,2) " add x8, x15, x22 \n\t" // x8 is address of c(4,3) " add x16, x15, x23 \n\t" // x16 is address of c(4,4) " add x17, x15, x24 \n\t" // x17 is address of c(4,5) " add x18, x15, x25 \n\t" // x18 is address of c(4,6) " add x19, x15, x26 \n\t" // x19 is address of c(4,7) " \n\t" " dup z0.d, #0 \n\t" // C column 0, 1 " dup z1.d, #0 \n\t" " dup z2.d, #0 \n\t" " dup z3.d, #0 \n\t" " \n\t" " fcmp d7,#0.0 \n\t" " beq .DBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. " \n\t" " \n\t" // x2 is address of c(0,0) " \n\t" // x5 is address of c(4,0) " \n\t" // x20 is address of c(0,1) " \n\t" // x6 is address of c(4,1) " ld1d {z0.d}, p0/z, [x2, z4.d] \n\t" // Load c( 0:3,0 ) into z0 " ld1d {z1.d}, p0/z, [x5, z4.d] \n\t" // Load c( 4:7,0 ) into z1 " ld1d {z2.d}, p0/z, [x20, z4.d] \n\t" // Load c( 0:3,1 ) into z2 " ld1d {z3.d}, p0/z, [x6 , z4.d] \n\t" // Load c( 4:7,1 ) into z3 " \n\t" " fmul z0.d, z0.d, z7.d \n\t" // Scale by beta " fmul z1.d, z1.d, z7.d \n\t" // Scale by beta " fmul z2.d, z2.d, z7.d \n\t" // Scale by beta " fmul z3.d, z3.d, z7.d \n\t" // Scale by beta " \n\t" " .DBETAZEROGENSTOREDS1: \n\t" " \n\t" " fmla z0.d, z16.d, z6.d[0] \n\t" // Scale by alpha " fmla z1.d, z17.d, z6.d[0] \n\t" // Scale by alpha " fmla z2.d, z18.d, z6.d[0] \n\t" // Scale by alpha " fmla z3.d, z19.d, z6.d[0] \n\t" // Scale by alpha " \n\t" " st1d {z0.d}, p0, [x2 , z4.d] \n\t" // Store c( 0:3,0 ) <- z0 " st1d {z1.d}, p0, [x5 , z4.d] \n\t" // Store c( 4:7,0 ) <- z1 " st1d {z2.d}, p0, [x20, z4.d] \n\t" // Store c( 0:3,1 ) <- z2 " st1d 
{z3.d}, p0, [x6 , z4.d] \n\t" // Store c( 4:7,1 ) <- z3 " \n\t" " \n\t" " \n\t" " dup z8.d, #0 \n\t" // C column 2, 3 " dup z9.d, #0 \n\t" " dup z10.d, #0 \n\t" " dup z11.d, #0 \n\t" " \n\t" " fcmp d7,#0.0 \n\t" " beq .DBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. " \n\t" " \n\t" // x21 is address of c(0,2) " \n\t" // x7 is address of c(4,2) " \n\t" // x22 is address of c(0,3) " \n\t" // x8 is address of c(4,3) " ld1d {z8.d}, p0/z, [x21, z4.d] \n\t" // Load c( 0:3,2 ) into z8 " ld1d {z9.d}, p0/z, [x7 , z4.d] \n\t" // Load c( 4:7,2 ) into z9 " ld1d {z10.d}, p0/z, [x22, z4.d] \n\t" // Load c( 0:3,3 ) into z10 " ld1d {z11.d}, p0/z, [x8 , z4.d] \n\t" // Load c( 4:7,3 ) into z11 " \n\t" " fmul z8.d, z8.d, z7.d \n\t" // Scale by beta " fmul z9.d, z9.d, z7.d \n\t" // Scale by beta " fmul z10.d, z10.d, z7.d \n\t" // Scale by beta " fmul z11.d, z11.d, z7.d \n\t" // Scale by beta " \n\t" " .DBETAZEROGENSTOREDS2: \n\t" " \n\t" " fmla z8.d, z20.d, z6.d[0] \n\t" // Scale by alpha " fmla z9.d, z21.d, z6.d[0] \n\t" // Scale by alpha " fmla z10.d, z22.d, z6.d[0] \n\t" // Scale by alpha " fmla z11.d, z23.d, z6.d[0] \n\t" // Scale by alpha " \n\t" " st1d {z8.d}, p0, [x21, z4.d] \n\t" // Store c( 0:3,2 ) <- z8 " st1d {z9.d}, p0, [x7 , z4.d] \n\t" // Store c( 4:7,2 ) <- z9 " st1d {z10.d}, p0, [x22, z4.d] \n\t" // Store c( 0:3,3 ) <- z10 " st1d {z11.d}, p0, [x8 , z4.d] \n\t" // Store c( 4:7,3 ) <- z11 " \n\t" " dup z0.d, #0 \n\t" // C column 4, 5 " dup z1.d, #0 \n\t" " dup z2.d, #0 \n\t" " dup z3.d, #0 \n\t" " \n\t" " fcmp d7,#0.0 \n\t" " beq .DBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case. 
" \n\t" " \n\t" // x23 is address of c(0,4) " \n\t" // x16 is address of c(4,4) " \n\t" // x24 is address of c(0,5) " \n\t" // x17 is address of c(4,5) " ld1d {z0.d}, p0/z, [x23, z4.d] \n\t" // Load c( 0:3,4 ) into z0 " ld1d {z1.d}, p0/z, [x16, z4.d] \n\t" // Load c( 4:7,4 ) into z1 " ld1d {z2.d}, p0/z, [x24, z4.d] \n\t" // Load c( 0:3,5 ) into z2 " ld1d {z3.d}, p0/z, [x17, z4.d] \n\t" // Load c( 4:7,5 ) into z3 " \n\t" " fmul z0.d, z0.d, z7.d \n\t" // Scale by beta " fmul z1.d, z1.d, z7.d \n\t" // Scale by beta " fmul z2.d, z2.d, z7.d \n\t" // Scale by beta " fmul z3.d, z3.d, z7.d \n\t" // Scale by beta " \n\t" " .DBETAZEROGENSTOREDS3: \n\t" " \n\t" " fmla z0.d, z24.d, z6.d[0] \n\t" // Scale by alpha " fmla z1.d, z25.d, z6.d[0] \n\t" // Scale by alpha " fmla z2.d, z26.d, z6.d[0] \n\t" // Scale by alpha " fmla z3.d, z27.d, z6.d[0] \n\t" // Scale by alpha " \n\t" " st1d {z0.d}, p0, [x23, z4.d] \n\t" // Store c( 0:3,4 ) <- z0 " st1d {z1.d}, p0, [x16, z4.d] \n\t" // Store c( 4:7,4 ) <- z1 " st1d {z2.d}, p0, [x24, z4.d] \n\t" // Store c( 0:3,5 ) <- z2 " st1d {z3.d}, p0, [x17, z4.d] \n\t" // Store c( 4:7,5 ) <- z3 " \n\t" " dup z8.d, #0 \n\t" // C column 6, 7 " dup z9.d, #0 \n\t" " dup z10.d, #0 \n\t" " dup z11.d, #0 \n\t" " \n\t" " fcmp d7,#0.0 \n\t" " beq .DBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case. 
" \n\t" " \n\t" // x25 is address of c(0,6) " \n\t" // x18 is address of c(4,6) " \n\t" // x26 is address of c(0,7) " \n\t" // x19 is address of c(4,7) " ld1d {z8.d}, p0/z, [x25, z4.d] \n\t" // Load c( 0:3,6 ) into z8 " ld1d {z9.d}, p0/z, [x18, z4.d] \n\t" // Load c( 4:7,6 ) into z9 " ld1d {z10.d}, p0/z, [x26, z4.d] \n\t" // Load c( 0:3,7 ) into z10 " ld1d {z11.d}, p0/z, [x19, z4.d] \n\t" // Load c( 4:7,7 ) into z11 " \n\t" " fmul z8.d, z8.d, z7.d \n\t" // Scale by beta " fmul z9.d, z9.d, z7.d \n\t" // Scale by beta " fmul z10.d, z10.d, z7.d \n\t" // Scale by beta " fmul z11.d, z11.d, z7.d \n\t" // Scale by beta " \n\t" " .DBETAZEROGENSTOREDS4: \n\t" " \n\t" " fmla z8.d, z28.d, z6.d[0] \n\t" // Scale by alpha " fmla z9.d, z29.d, z6.d[0] \n\t" // Scale by alpha " fmla z10.d, z30.d, z6.d[0] \n\t" // Scale by alpha " fmla z11.d, z31.d, z6.d[0] \n\t" // Scale by alpha " \n\t" " st1d {z8.d}, p0, [x25, z4.d] \n\t" // Store c( 0:3,6 ) <- z8 " st1d {z9.d}, p0, [x18, z4.d] \n\t" // Store c( 4:7,6 ) <- z9 " st1d {z10.d}, p0, [x26, z4.d] \n\t" // Store c( 0:3,7 ) <- z10 " st1d {z11.d}, p0, [x19, z4.d] \n\t" // Store c( 4:7,7 ) <- z11 " \n\t" " .DEND: \n\t" // Done! 
" \n\t" :// output operands (none) :// input operands [aaddr] "m" (a), // 0 [baddr] "m" (b), // 1 [caddr] "m" (c), // 2 [k_iter] "m" (k_iter), // 3 [k_left] "m" (k_left), // 4 [alpha] "m" (alpha), // 5 [beta] "m" (beta), // 6 [rs_c] "m" (rs_c), // 6 [cs_c] "m" (cs_c), // 7 [a_next] "m" (a_next), // 8 [b_next] "m" (b_next) // 9 :// Register clobber list "x0","x1","x2","x3", "x4","x5","x6", "x7","x8","x9", "x10","x11","x12","x13","x14","x15","x16","x17","x18","x19", "x20","x21","x22","x23","x24","x25","x26", "x27", "v0","v1","v2", "v3","v4","v5", "v6","v7","v8", "v9","v10","v11", "v12","v13","v14", "v15","v16","v17","v18","v19", "v20","v21","v22","v23", "v24","v25","v26","v27", "v28","v29","v30","v31" ); } cython-blis-0.9.1/blis/_src/kernels/armsve/3/old/bli_gemm_armsve_asm_sh2vx10_unindexed.c000066400000000000000000000445441427272030600312230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, The University of Tokyo Copyright (C) 2019, Forschunszentrum Juelich Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Half-precision composite instructions. #include "armsve_asm_macros_half.h" // 2vx10 microkernels. #include "armsve_asm_2vx10.h" // Gather-load / scatter-store instruction for half-precision // needs being defined separately. 
#undef GEMM_CCOL_GATHER_LOAD_FWD #undef GEMM_CCOL_SCATTER_STORE_FWD #define GEMM_CCOL_GATHER_LOAD_FWD(ZFH,ZLH,ZIDX2,PT,CRS2,CADDR,CCS,CVSKIP,CTEMP) \ " add x28, "#CADDR", "#CRS2" \n\t" \ " ld1h z31.s, "#PT"/z, ["#CADDR", "#ZIDX2".s, uxtw #1] \n\t" \ " ld1h "#ZFH".s, "#PT"/z, [x28, "#ZIDX2".s, uxtw #1] \n\t" \ " revh "#ZFH".s, "#PT"/m, "#ZFH".s \n\t" \ " fadd "#ZFH".h, "#ZFH".h, z31.h \n\t" \ " add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \ " add x28, "#CTEMP", "#CRS2" \n\t" \ " ld1h z31.s, "#PT"/z, ["#CTEMP", "#ZIDX2".s, uxtw #1] \n\t" \ " ld1h "#ZLH".s, "#PT"/z, [x28, "#ZIDX2".s, uxtw #1] \n\t" \ " revh "#ZLH".s, "#PT"/m, "#ZLH".s \n\t" \ " fadd "#ZLH".h, "#ZLH".h, z31.h \n\t" \ " add "#CADDR", "#CADDR", "#CCS" \n\t" #define GEMM_CCOL_SCATTER_STORE_FWD(ZFH,ZLH,ZIDX2,PT,CRS2,CADDR,CCS,CVSKIP,CTEMP) \ " add x28, "#CADDR", "#CRS2" \n\t" \ " st1h "#ZFH".s, "#PT", ["#CADDR", "#ZIDX2".s, uxtw #1] \n\t" \ " revh "#ZFH".s, "#PT"/m, "#ZFH".s \n\t" \ " st1h "#ZFH".s, "#PT", [x28, "#ZIDX2".s, uxtw #1] \n\t" \ " add "#CTEMP", "#CADDR", "#CVSKIP" \n\t" \ " add x28, "#CTEMP", "#CRS2" \n\t" \ " st1h "#ZLH".s, "#PT", ["#CTEMP", "#ZIDX2".s, uxtw #1] \n\t" \ " revh "#ZLH".s, "#PT"/m, "#ZLH".s \n\t" \ " st1h "#ZLH".s, "#PT", [x28, "#ZIDX2".s, uxtw #1] \n\t" \ " add "#CADDR", "#CADDR", "#CCS" \n\t" void bli_shgemm_armsve_asm_2vx10_unindexed ( dim_t k0, void* restrict alpha, void* restrict a, void* restrict b, void* restrict beta, void* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_mker = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; __asm__ volatile ( " ldr x0, %[a] \n\t" " ldr x1, %[b] \n\t" " mov x2, xzr \n\t" " inch x2, ALL, MUL #2 \n\t" // Column-skip of A. 
" mov x3, #10 \n\t" // Row-skip of B. " \n\t" " ldr x5, %[c] \n\t" " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. #ifdef _A64FX " mov x8, 0x3 \n\t" // Tag C address. " lsl x8, x8, #56 \n\t" " orr x5, x5, x8 \n\t" " mov x8, 0x2 \n\t" // Tag B address. " lsl x8, x8, #56 \n\t" " orr x1, x1, x8 \n\t" " mov x8, 0x1 \n\t" // Tag A address. " lsl x8, x8, #56 \n\t" " orr x0, x0, x8 \n\t" #endif " \n\t" " mov x8, #2 \n\t" // Multiply some address skips by sizeof(float16_t). " madd x2, x8, x2, xzr \n\t" // cs_a " madd x3, x8, x3, xzr \n\t" // rs_b " madd x7, x8, x7, xzr \n\t" // cs_c " ptrue p0.b \n\t" " \n\t" " ldr x4, %[k_mker] \n\t" // Number of loops. " ldr x8, %[k_left] \n\t" " \n\t" " LOAD_ABC: \n\t" " cmp x4, #0 \n\t" // Don't preload if no microkernel there. " b.eq END_CCOL_PRFM \n\t" " ld1rh z20.h, p0/z, [x1] \n\t" // Load 8/10 of first B row. " ld1rh z21.h, p0/z, [x1, 2] \n\t" " ld1rh z22.h, p0/z, [x1, 4] \n\t" " ld1rh z23.h, p0/z, [x1, 6] \n\t" " ld1rh z24.h, p0/z, [x1, 8] \n\t" " ld1rh z25.h, p0/z, [x1, 10] \n\t" " ld1rh z26.h, p0/z, [x1, 12] \n\t" " ld1rh z27.h, p0/z, [x1, 14] \n\t" " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) " \n\t" " CCOL_PRFM: \n\t" " cmp x6, #1 \n\t" " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. 
" mov x16, x5 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" " cmp x4, #0 \n\t" // If no 4-microkernel can be applied " b.eq K_LEFT_LOOP \n\t" " \n\t" " K_MKER_LOOP: \n\t" " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " subs x4, x4, #1 \n\t" // Decrease counter before final replica. " b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. 
GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " b K_MKER_LOOP \n\t" " \n\t" " FIN_MKER_LOOP: \n\t" GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " add x0, x0, x2 \n\t" // Forward A to fill the blank. " \n\t" " K_LEFT_LOOP: \n\t" " cmp x8, #0 \n\t" // End of execution. " b.eq WRITE_MEM_PREP \n\t" " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) " ld1rh z20.h, p0/z, [x1] \n\t" // Load 8/10 of first B row. " ld1rh z21.h, p0/z, [x1, 2] \n\t" " ld1rh z22.h, p0/z, [x1, 4] \n\t" " ld1rh z23.h, p0/z, [x1, 6] \n\t" " ld1rh z24.h, p0/z, [x1, 8] \n\t" " ld1rh z25.h, p0/z, [x1, 10] \n\t" " ld1rh z26.h, p0/z, [x1, 12] \n\t" " ld1rh z27.h, p0/z, [x1, 14] \n\t" " ld1rh z28.h, p0/z, [x1, 16] \n\t" " ld1rh z29.h, p0/z, [x1, 18] \n\t" GEMM_FMLA2(z0,z1,p0,z30,z31,z20) GEMM_FMLA2(z2,z3,p0,z30,z31,z21) GEMM_FMLA2(z4,z5,p0,z30,z31,z22) GEMM_FMLA2(z6,z7,p0,z30,z31,z23) GEMM_FMLA2(z8,z9,p0,z30,z31,z24) GEMM_FMLA2(z10,z11,p0,z30,z31,z25) GEMM_FMLA2(z12,z13,p0,z30,z31,z26) GEMM_FMLA2(z14,z15,p0,z30,z31,z27) GEMM_FMLA2(z16,z17,p0,z30,z31,z28) GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " add x0, x0, x2 \n\t" // Forward A. " add x1, x1, x3 \n\t" // Forward B. " sub x8, x8, #1 \n\t" " b K_LEFT_LOOP \n\t" // Next column / row. " \n\t" " WRITE_MEM_PREP: \n\t" " \n\t" " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" " ld1rh z30.h, p0/z, [x4] \n\t" // Load alpha & beta into vectors. " ld1rh z31.h, p0/z, [x8] \n\t" " fmov w4, h28 \n\t" // Copy alpha & beta to GP registers. 
" fmov w8, h29 \n\t" " \n\t" " PREFETCH_ABNEXT: \n\t" " ldr x0, %[a_next] \n\t" " ldr x1, %[b_next] \n\t" " prfm PLDL2KEEP, [x0] \n\t" " prfm PLDL2KEEP, [x0, 256*1] \n\t" " prfm PLDL2KEEP, [x0, 256*2] \n\t" " prfm PLDL2KEEP, [x0, 256*3] \n\t" " prfm PLDL2KEEP, [x0, 256*4] \n\t" " prfm PLDL2KEEP, [x0, 256*5] \n\t" " prfm PLDL2KEEP, [x0, 256*6] \n\t" " prfm PLDL2KEEP, [x0, 256*7] \n\t" " prfm PLDL2KEEP, [x0, 256*8] \n\t" " prfm PLDL2KEEP, [x0, 256*9] \n\t" " prfm PLDL2KEEP, [x0, 256*10] \n\t" " prfm PLDL2KEEP, [x0, 256*11] \n\t" " prfm PLDL2KEEP, [x0, 256*12] \n\t" " prfm PLDL2KEEP, [x0, 256*13] \n\t" " prfm PLDL2KEEP, [x0, 256*14] \n\t" " prfm PLDL2KEEP, [x0, 256*15] \n\t" " prfm PLDL2KEEP, [x1] \n\t" " prfm PLDL2KEEP, [x1, 256*1] \n\t" " prfm PLDL2KEEP, [x1, 256*2] \n\t" " prfm PLDL2KEEP, [x1, 256*3] \n\t" " prfm PLDL2KEEP, [x1, 256*4] \n\t" " prfm PLDL2KEEP, [x1, 256*5] \n\t" " prfm PLDL2KEEP, [x1, 256*6] \n\t" " prfm PLDL2KEEP, [x1, 256*7] \n\t" " prfm PLDL2KEEP, [x1, 256*8] \n\t" " prfm PLDL2KEEP, [x1, 256*9] \n\t" " \n\t" " WRITE_MEM: \n\t" " \n\t" " fmov h28, #1.0 \n\t" " fmov w16, h28 \n\t" " cmp w16, w4 \n\t" " b.eq UNIT_ALPHA \n\t" " \n\t" SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) " \n\t" " UNIT_ALPHA: \n\t" " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. " cmp x6, #1 \n\t" " b.ne WRITE_MEM_G \n\t" " \n\t" " WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. 
GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x9,x7) " \n\t" GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x5,x7) GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) " b END_WRITE_MEM \n\t" " \n\t" " WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-30] - Z30 as index. " mov x10, xzr \n\t" " incb x10 \n\t" " madd x10, x10, x6, xzr \n\t" // C-column's logical 1-vector skip. " mov x28, #2 \n\t" " madd x6, x28, x6, xzr \n\t" // Double index skip for half-precision case. " index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8. GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x9,x7,x10,x16) " dup z31.h, w8 \n\t" // Restore beta destroyed by loading. GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x9,x7,x10,x16) " \n\t" " dup z31.h, w8 \n\t" // Restore beta destroyed by loading. GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,x6,x5,x7,x10,x16) GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,x6,x5,x7,x10,x16) " \n\t" " END_WRITE_MEM: \n\t" " b END_EXEC \n\t" " \n\t" " END_ERROR: \n\t" " mov x0, #1 \n\t" // Return error. " END_EXEC: \n\t" " mov x0, #0 \n\t" // Return normal. 
: : [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta), [a_next] "m" (a_next), [b_next] "m" (b_next) : "x0","x1","x2","x3","x4","x5","x6","x7","x8", "x9","x16","x10","x28", "z0","z1","z2","z3","z4","z5","z6","z7", "z8","z9","z10","z11","z12","z13","z14","z15", "z16","z17","z18","z19", "z20","z21","z22","z23", "z24","z25","z26","z27", "z28","z29","z30","z31" ); } cython-blis-0.9.1/blis/_src/kernels/armsve/3/old/bli_gemm_armsve_asm_z2vx7_unindexed.c000066400000000000000000000345271427272030600310100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Forschunszentrum Juelich Copyright (C) 2020, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
#include "blis.h"

// Double-precision composite instructions.
#include "armsve_asm_macros_dcomplex.h"

// 2vx7 microkernels.
#include "armsve_asm_2vx7cmplx.h"

// Double-complex GEMM microkernel for Arm SVE working on a
// (2 SVE vectors) x 7 tile of C.
//
//   m, n    not referenced directly in this body.
//           NOTE(review): presumably consumed by GEMM_UKR_SETUP_CT /
//           GEMM_UKR_FLUSH_CT for edge handling - confirm in their definition.
//   k       number of rank-1 updates; split into k / 4 unrolled iterations
//           plus k % 4 leftover iterations.
//   alpha   pointer to the complex scalar; its real / imaginary parts are
//           broadcast into z28 / z29 before the write-back.
//   a, b    packed A and B panels; both pointers are advanced by the assembly.
//   beta    pointer to the complex scalar; broadcast into z30 / z31.
//   c       output tile.
//   rs_c0   row stride of C in elements. rs_c0 == 1 selects the contiguous
//           path (WRITE_MEM_C), anything else the indexed path (WRITE_MEM_G).
//   cs_c0   column stride of C in elements (scaled by 16 inside the assembly).
//   data    auxinfo; used here to obtain the next A / B panels to prefetch.
//   cntx    not referenced directly in this body.
//
// All operands are passed as read-write registers ("+r"), so the local copies
// k_mker, k_left, rs_c, cs_c and the pointer parameters are modified by the
// assembly.
//
// NOTE(review): the clobber list names neither "cc", "memory" nor p0 although
// the assembly uses cmp/subs, contains macros that store to C and executes
// ptrue p0 - confirm whether that is intentional.
void bli_zgemm_armsve_asm_2vx7_unindexed
     (
       dim_t               m,
       dim_t               n,
       dim_t               k,
       dcomplex*  restrict alpha,
       dcomplex*  restrict a,
       dcomplex*  restrict b,
       dcomplex*  restrict beta,
       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
  void* a_next = bli_auxinfo_next_a( data );
  void* b_next = bli_auxinfo_next_b( data );

  // Typecast local copies of integers in case dim_t and inc_t are a
  // different size than is expected by load instructions.
  uint64_t k_mker = k / 4;
  uint64_t k_left = k % 4;
  uint64_t rs_c   = rs_c0;
  uint64_t cs_c   = cs_c0;
  // Written by the final "mov %11, #0" of the assembly; not read afterwards
  // in this function.
  uint64_t info   = 0;

  // Tile height in dcomplex elements: two vectors' worth of bytes divided by
  // the 16 bytes of one dcomplex.
  uint64_t mr = bli_vl_bytes_armsve() * 2 / 16;
  GEMM_UKR_SETUP_CT( z, mr, 7, false );

  __asm__ volatile (
// " ldr x0, %[a] \n\t"
// " ldr x1, %[b] \n\t"
" mov x2, xzr \n\t"
" incd x2, ALL, MUL #1 \n\t" // Column-skip of A.
" mov x3, #7 \n\t" // Row-skip of B.
" \n\t"
// " ldr x2, %[c] \n\t"
// " ldr x3, %[rs_c] \n\t" // Row-skip of C.
// " ldr x4, %[cs_c] \n\t" // Column-skip of C.
#ifdef _A64FX
" mov x16, 0x1 \n\t" // Tag A address.
" lsl x16, x16, #56 \n\t"
" orr %0, %0, x16 \n\t"
" mov x16, 0x2 \n\t" // Tag B address.
" lsl x16, x16, #56 \n\t"
" orr %1, %1, x16 \n\t"
" mov x16, 0x3 \n\t" // Tag C address.
" lsl x16, x16, #56 \n\t"
" orr %2, %2, x16 \n\t"
#endif
" \n\t"
" mov x16, #16 \n\t" // Multiply some address skips by sizeof(dcomplex).
" madd x2, x16, x2, xzr \n\t" // cs_a
" madd x3, x16, x3, xzr \n\t" // rs_b
" madd %4, x16, %4, xzr \n\t" // cs_c
" ptrue p0.d \n\t"
" \n\t"
// " ldr x5, %[k_mker] \n\t" // Number of loops.
// " ldr x6, %[k_left] \n\t"
" \n\t"
" LOAD_ABC: \n\t"
" cmp %5, #0 \n\t" // Don't preload if no microkernel there.
" b.eq END_CCOL_PRFM \n\t"
" \n\t"
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
" \n\t"
// Even 8-byte offsets (real parts) go to z14-z20, odd offsets (imaginary
// parts) to z21-z27: one broadcast register per part of each of the 7 B values.
" ld1rd z14.d, p0/z, [%1, 8*0] \n\t" // Load B's real & imaginary.
" ld1rd z15.d, p0/z, [%1, 8*2] \n\t"
" ld1rd z16.d, p0/z, [%1, 8*4] \n\t"
" ld1rd z17.d, p0/z, [%1, 8*6] \n\t"
" ld1rd z18.d, p0/z, [%1, 8*8] \n\t"
" ld1rd z19.d, p0/z, [%1, 8*10] \n\t"
" ld1rd z20.d, p0/z, [%1, 8*12] \n\t"
" ld1rd z21.d, p0/z, [%1, 8*1] \n\t"
" ld1rd z22.d, p0/z, [%1, 8*3] \n\t"
" ld1rd z23.d, p0/z, [%1, 8*5] \n\t"
" ld1rd z24.d, p0/z, [%1, 8*7] \n\t"
" ld1rd z25.d, p0/z, [%1, 8*9] \n\t"
" ld1rd z26.d, p0/z, [%1, 8*11] \n\t"
" ld1rd z27.d, p0/z, [%1, 8*13] \n\t"
" add %1, %1, x3 \n\t"
" \n\t"
" CCOL_PRFM: \n\t"
" cmp %3, #1 \n\t"
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
// Prefetch the first address of each of the 7 columns of C.
" mov x16, %2 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, %4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, %4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, %4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, %4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, %4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, %4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" END_CCOL_PRFM: \n\t"
" \n\t"
CLEAR_COL14(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13)
" \n\t"
" cmp %5, #0 \n\t" // If no 4-microkernel can be applied
" b.eq K_LEFT_LOOP \n\t"
" \n\t"
// Main loop, unrolled four times. The A column alternates between the
// register pairs z28/z29 and z30/z31: each step loads the next column into
// one pair while passing the other pair to the microkernel macro.
" K_MKER_LOOP: \n\t"
" \n\t"
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2)
GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z28,z29,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
" \n\t"
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z30,z31,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
" \n\t"
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2)
GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z28,z29,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
" \n\t"
" subs %5, %5, #1 \n\t" // Decrease counter before final replica.
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
" \n\t"
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z30,z31,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
" b K_MKER_LOOP \n\t"
" \n\t"
" FIN_MKER_LOOP: \n\t"
GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z30,z31,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
" \n\t"
// Leftover iterations (k % 4): reload A and B explicitly for every step.
" K_LEFT_LOOP: \n\t"
" cmp %6, #0 \n\t" // End of execution.
" b.eq WRITE_MEM_PREP \n\t"
" \n\t"
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2)
" ld1rd z14.d, p0/z, [%1, 8*0] \n\t"
" ld1rd z15.d, p0/z, [%1, 8*2] \n\t"
" ld1rd z16.d, p0/z, [%1, 8*4] \n\t"
" ld1rd z17.d, p0/z, [%1, 8*6] \n\t"
" ld1rd z18.d, p0/z, [%1, 8*8] \n\t"
" ld1rd z19.d, p0/z, [%1, 8*10] \n\t"
" ld1rd z20.d, p0/z, [%1, 8*12] \n\t"
" ld1rd z21.d, p0/z, [%1, 8*1] \n\t"
" ld1rd z22.d, p0/z, [%1, 8*3] \n\t"
" ld1rd z23.d, p0/z, [%1, 8*5] \n\t"
" ld1rd z24.d, p0/z, [%1, 8*7] \n\t"
" ld1rd z25.d, p0/z, [%1, 8*9] \n\t"
" ld1rd z26.d, p0/z, [%1, 8*11] \n\t"
" ld1rd z27.d, p0/z, [%1, 8*13] \n\t"
" add %1, %1, x3 \n\t"
GEMM_2VX7CMPLX_MKER_LOOP_PLAIN_C_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z1,z3,z5,z7,z9,z11,z13,p0,z28,z29,z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3)
" sub %6, %6, #1 \n\t"
" b K_LEFT_LOOP \n\t" // Next column / row.
" \n\t"
" WRITE_MEM_PREP: \n\t"
" \n\t"
// " ldr x7, %[alpha] \n\t" // Load alpha & beta (address).
// " ldr x8, %[beta] \n\t"
" ld1rd z28.d, p0/z, [%7] \n\t" // Real(alpha).
" ld1rd z29.d, p0/z, [%7, 8] \n\t" // Imag(alpha).
" ld1rd z30.d, p0/z, [%8] \n\t" // Real(beta).
" ld1rd z31.d, p0/z, [%8, 8] \n\t" // Imag(beta).
" \n\t"
" PREFETCH_ABNEXT: \n\t"
// " ldr x9, %[a_next] \n\t"
// " ldr x10, %[b_next] \n\t"
#ifdef _A64FX
" mov x16, 0x1 \n\t" // Tag A address.
" lsl x16, x16, #56 \n\t"
" orr %9, %9, x16 \n\t"
" mov x16, 0x2 \n\t" // Tag B address.
" lsl x16, x16, #56 \n\t"
" orr %10, %10, x16 \n\t"
#endif
" prfm PLDL1STRM, [%9] \n\t"
" prfm PLDL1STRM, [%9, 256*1] \n\t"
" prfm PLDL1STRM, [%10] \n\t"
" prfm PLDL1STRM, [%10, 256*1] \n\t"
" \n\t"
" WRITE_MEM: \n\t"
" \n\t"
// The accumulators z0-z13 and alpha (z28/z29) are the inputs here and z14-z27
// the destination; B's broadcast registers are no longer needed.
// NOTE(review): presumably a complex multiply - confirm in the macro definition.
GEMM_FMULCMPLX_COL7(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,p0,z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z28,z29)
" \n\t"
" UNIT_ALPHA: \n\t" // Label is not branched to anywhere in this kernel.
" mov x9, %2 \n\t" // C address for loading.
" \n\t" // C address for storing is %2 itself.
" cmp %3, #1 \n\t"
" b.ne WRITE_MEM_G \n\t"
" \n\t"
// Contiguous C: C is loaded into z0-z13, combined with beta (z30/z31) into
// z14-z27, and z14-z27 are stored back.
" WRITE_MEM_C: \n\t"
GEMM_CCMPLX_LOAD_COL7_C(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,p0,x9,%4)
GEMM_FMLACMPLX_COL7(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,p0,z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z30,z31)
GEMM_CCMPLX_STORE_COL7_C(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,p0,%2,%4)
" b END_WRITE_MEM \n\t"
" \n\t"
// General-stride C: same sequence, with z28 (alpha is no longer needed)
// reused as the index vector for the _G load / store macros.
" WRITE_MEM_G: \n\t"
" add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2,
" index z28.d, xzr, %3 \n\t" // s.t. 2*sizeof(double) = 2*8 = 16.
GEMM_CCMPLX_LOAD_COL7_G(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,p0,z28,x9,%4,x16)
GEMM_FMLACMPLX_COL7(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,p0,z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z30,z31)
GEMM_CCMPLX_STORE_COL7_G(z14,z15,z16,z17,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,p0,z28,%2,%4,x16)
" \n\t"
" END_WRITE_MEM: \n\t"
" b END_EXEC \n\t"
" \n\t"
" END_EXEC: \n\t"
" mov %11, #0 \n\t" // Return normal.
: "+r" (a),      // %0
  "+r" (b),      // %1
  "+r" (c),      // %2
  "+r" (rs_c),   // %3
  "+r" (cs_c),   // %4
  "+r" (k_mker), // %5
  "+r" (k_left), // %6
  "+r" (alpha),  // %7
  "+r" (beta),   // %8
  "+r" (a_next), // %9
  "+r" (b_next), // %10
  "=r" (info)    // %11
:
: "x2","x3","x9","x16",
  "z0","z1","z2","z3","z4","z5","z6","z7",
  "z8","z9","z10","z11","z12","z13","z14","z15",
  "z16","z17","z18","z19",
  "z20","z21","z22","z23",
  "z24","z25","z26","z27",
  "z28","z29","z30","z31" );

  GEMM_UKR_FLUSH_CT( z );
}
cython-blis-0.9.1/blis/_src/kernels/armsve/3/old/bli_gemm_armsve_asm_z2vx8_unindexed.c000066400000000000000000000372511427272030600310060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Forschunszentrum Juelich Copyright (C) 2020, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Double-precision composite instructions. #include "armsve_asm_macros_dcomplex.h" // 2vx8 microkernels. 
#include "armsve_asm_2vx8cmplx.h"

/*
 * Double-complex GEMM microkernel for Arm SVE, written as a single inline
 * assembly statement.
 *
 * Tile shape: the row count is derived from the hardware vector length
 * (mr = bli_vl_bytes_armsve() * 2 / 16, i.e. two SVE vectors' worth of
 * 16-byte dcomplex elements) and the column count is fixed at 8 (see the
 * GEMM_UKR_SETUP_CT call). The k dimension is processed in groups of six
 * rank-1 updates (k_mker) followed by a one-at-a-time remainder loop
 * (k_left).
 *
 * Parameters:
 *   m, n, k       - tile dimensions; m and n are only consumed by the
 *                   GEMM_UKR_SETUP_CT / GEMM_UKR_FLUSH_CT helper macros.
 *   alpha, beta   - pointers to the complex scalars; real and imaginary
 *                   parts are broadcast from [ptr] and [ptr, 8].
 *   a, b          - packed micro-panels (advanced in place by the assembly).
 *   c, rs_c0,
 *   cs_c0         - output tile and its row/column strides in elements.
 *                   rs_c == 1 selects the contiguous load/store path,
 *                   anything else the gather/scatter path.
 *   data          - auxinfo; only the next-A / next-B addresses are read,
 *                   and only to issue prefetches.
 *   cntx          - not referenced directly in this function body.
 *
 * NOTE(review): the accumulators are first multiplied by alpha
 * (GEMM_FMULCMPLX_COL2) and then combined with beta and the loaded C values
 * (GEMM_FMLACMPLX_COL2); the exact arithmetic lives in the macro headers,
 * which are not part of this file.
 *
 * NOTE(review): the assembly uses plain (non-local) labels, so this function
 * must not be instantiated more than once per translation unit.
 */
void bli_zgemm_armsve_asm_2vx8_unindexed
     (
       dim_t               m,
       dim_t               n,
       dim_t               k,
       dcomplex*  restrict alpha,
       dcomplex*  restrict a,
       dcomplex*  restrict b,
       dcomplex*  restrict beta,
       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
  void* a_next = bli_auxinfo_next_a( data );
  void* b_next = bli_auxinfo_next_b( data );

  // Typecast local copies of integers in case dim_t and inc_t are a
  // different size than is expected by load instructions.
  uint64_t k_mker = k / 6;
  uint64_t k_left = k % 6;
  uint64_t rs_c   = rs_c0;
  uint64_t cs_c   = cs_c0;
  uint64_t info   = 0;

  // Number of dcomplex rows covered by two SVE vectors.
  uint64_t mr = bli_vl_bytes_armsve() * 2 / 16;
  GEMM_UKR_SETUP_CT( z, mr, 8, false );

  __asm__ volatile (
// " ldr x0, %[a] \n\t"
// " ldr x1, %[b] \n\t"
" mov x2, xzr \n\t"
" incd x2, ALL, MUL #1 \n\t" // Column-skip of A.
" mov x3, #8 \n\t" // Row-skip of B.
" \n\t"
// " ldr x2, %[c] \n\t"
// " ldr x3, %[rs_c] \n\t" // Row-skip of C.
// " ldr x4, %[cs_c] \n\t" // Column-skip of C.
#ifdef _A64FX
" mov x16, 0x1 \n\t" // Tag A address.
" lsl x16, x16, #56 \n\t"
" orr %0, %0, x16 \n\t"
" mov x16, 0x2 \n\t" // Tag B address.
" lsl x16, x16, #56 \n\t"
" orr %1, %1, x16 \n\t"
" mov x16, 0x3 \n\t" // Tag C address.
" lsl x16, x16, #56 \n\t"
" orr %2, %2, x16 \n\t"
#endif
" \n\t"
" mov x16, #16 \n\t" // Multiply some address skips by sizeof(dcomplex).
" madd x2, x16, x2, xzr \n\t" // cs_a
" madd x3, x16, x3, xzr \n\t" // rs_b
" madd %4, x16, %4, xzr \n\t" // cs_c
" ptrue p0.d \n\t"
" \n\t"
// " ldr x5, %[k_mker] \n\t" // Number of loops.
// " ldr x6, %[k_left] \n\t"
" \n\t"
" LOAD_ABC: \n\t"
" cmp %5, #0 \n\t" // Don't preload if no microkernel there.
" b.eq END_CCOL_PRFM \n\t"
" \n\t"
" ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real & half of imaginary.
" ld1rd z21.d, p0/z, [%1, 8*2] \n\t"
" ld1rd z22.d, p0/z, [%1, 8*4] \n\t"
" ld1rd z23.d, p0/z, [%1, 8*6] \n\t"
" ld1rd z24.d, p0/z, [%1, 8*8] \n\t"
" ld1rd z25.d, p0/z, [%1, 8*10] \n\t"
" ld1rd z26.d, p0/z, [%1, 8*12] \n\t"
" ld1rd z27.d, p0/z, [%1, 8*14] \n\t"
" ld1rd z28.d, p0/z, [%1, 8*1] \n\t"
" ld1rd z29.d, p0/z, [%1, 8*3] \n\t"
" ld1rd z30.d, p0/z, [%1, 8*5] \n\t"
" ld1rd z31.d, p0/z, [%1, 8*7] \n\t"
" \n\t"
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2)
" \n\t"
" CCOL_PRFM: \n\t"
" cmp %3, #1 \n\t"
" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage.
" mov x16, %2 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, %4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, %4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, %4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, %4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, %4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, %4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" add x16, x16, %4 \n\t"
" prfm PLDL1KEEP, [x16] \n\t"
" END_CCOL_PRFM: \n\t"
" \n\t"
CLEAR_COL16(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15)
" \n\t"
" cmp %5, #0 \n\t" // If no 6-microkernel can be applied
" b.eq K_LEFT_LOOP \n\t"
" \n\t"
" K_MKER_LOOP: \n\t"
" \n\t"
// Six rank-1 updates per iteration; the A column registers alternate
// between (z16,z17) and (z18,z19) so the next column loads during the update.
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z18,z19,p0,%0,x2)
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
" \n\t"
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2)
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
" \n\t"
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z18,z19,p0,%0,x2)
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
" \n\t"
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2)
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
" \n\t"
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z18,z19,p0,%0,x2)
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
" \n\t"
" subs %5, %5, #1 \n\t" // Decrease counter before final replica.
" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem.
" \n\t"
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2)
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
" b K_MKER_LOOP \n\t"
" \n\t"
" FIN_MKER_LOOP: \n\t"
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_3_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z18,z19,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
" \n\t"
" K_LEFT_LOOP: \n\t"
" cmp %6, #0 \n\t" // End of execution.
" b.eq WRITE_MEM_PREP \n\t"
" \n\t"
GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z16,z17,p0,%0,x2)
" ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Reload B's real & half of imaginary.
" ld1rd z21.d, p0/z, [%1, 8*2] \n\t"
" ld1rd z22.d, p0/z, [%1, 8*4] \n\t"
" ld1rd z23.d, p0/z, [%1, 8*6] \n\t"
" ld1rd z24.d, p0/z, [%1, 8*8] \n\t"
" ld1rd z25.d, p0/z, [%1, 8*10] \n\t"
" ld1rd z26.d, p0/z, [%1, 8*12] \n\t"
" ld1rd z27.d, p0/z, [%1, 8*14] \n\t"
" ld1rd z28.d, p0/z, [%1, 8*1] \n\t"
" ld1rd z29.d, p0/z, [%1, 8*3] \n\t"
" ld1rd z30.d, p0/z, [%1, 8*5] \n\t"
" ld1rd z31.d, p0/z, [%1, 8*7] \n\t"
GEMM_2VX8CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z1,z3,z5,z7,z9,z11,z13,z15,p0,z16,z17,z20,z21,z22,z23,z24,z25,z26,z27,z28,z29,z30,z31,%1,x3)
" sub %6, %6, #1 \n\t"
" b K_LEFT_LOOP \n\t" // Next column / row.
" \n\t"
" WRITE_MEM_PREP: \n\t"
" \n\t"
// " ldr x7, %[alpha] \n\t" // Load alpha & beta (address).
// " ldr x8, %[beta] \n\t"
" ld1rd z16.d, p0/z, [%7] \n\t" // Real(alpha).
" ld1rd z17.d, p0/z, [%7, 8] \n\t" // Imag(alpha).
" ld1rd z18.d, p0/z, [%8] \n\t" // Real(beta).
" ld1rd z19.d, p0/z, [%8, 8] \n\t" // Imag(beta).
" \n\t"
" PREFETCH_ABNEXT: \n\t"
// " ldr x9, %[a_next] \n\t"
// " ldr x10, %[b_next] \n\t"
#ifdef _A64FX
" mov x16, 0x1 \n\t" // Tag A address.
" lsl x16, x16, #56 \n\t"
" orr %9, %9, x16 \n\t"
" mov x16, 0x2 \n\t" // Tag B address.
" lsl x16, x16, #56 \n\t"
" orr %10, %10, x16 \n\t"
#endif
" prfm PLDL1STRM, [%9] \n\t"
" prfm PLDL1STRM, [%9, 256*1] \n\t"
" prfm PLDL1STRM, [%10] \n\t"
" prfm PLDL1STRM, [%10, 256*1] \n\t"
" \n\t"
" WRITE_MEM: \n\t"
" \n\t"
// Scale by alpha while rotating the accumulators: results for columns 0-1
// land in z20-z23, 2-3 in z0-z3, 4-5 in z4-z7 and 6-7 in z8-z11.
GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z16,z17)
GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z4 ,z5 ,z6 ,z7 ,z16,z17)
GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z8 ,z9 ,z10,z11,z16,z17)
GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z12,z13,z14,z15,z16,z17)
" \n\t"
" UNIT_ALPHA: \n\t"
" mov x9, %2 \n\t" // C address for loading.
" \n\t" // C address for storing is %2 itself.
" cmp %3, #1 \n\t"
" b.ne WRITE_MEM_G \n\t"
" \n\t"
" WRITE_MEM_C: \n\t"
GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4)
GEMM_CCMPLX_LOAD_COL2_C(z24,z25,z26,z27,p0,x9,%4)
GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z18,z19)
GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z24,z25,z26,z27,z18,z19)
GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4)
GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4)
" \n\t"
GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4)
GEMM_CCMPLX_LOAD_COL2_C(z24,z25,z26,z27,p0,x9,%4)
GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z18,z19)
GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z24,z25,z26,z27,z18,z19)
GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4)
GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4)
" b END_WRITE_MEM \n\t"
" \n\t"
" WRITE_MEM_G: \n\t"
" add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2,
" index z16.d, xzr, %3 \n\t" // s.t. 2*sizeof(double) = 2*8 = 16.
GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z16,x9,%4,x16)
GEMM_CCMPLX_LOAD_COL2_G(z24,z25,z26,z27,p0,z16,x9,%4,x16)
GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z18,z19)
GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z24,z25,z26,z27,z18,z19)
GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z16,%2,%4,x16)
GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z16,%2,%4,x16)
" \n\t"
GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z16,x9,%4,x16)
GEMM_CCMPLX_LOAD_COL2_G(z24,z25,z26,z27,p0,z16,x9,%4,x16)
GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z18,z19)
GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z24,z25,z26,z27,z18,z19)
GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z16,%2,%4,x16)
GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z16,%2,%4,x16)
" \n\t"
" END_WRITE_MEM: \n\t"
" b END_EXEC \n\t"
" \n\t"
" END_EXEC: \n\t"
" mov %11, #0 \n\t" // Return normal.
: "+r" (a),      // %0
  "+r" (b),      // %1
  "+r" (c),      // %2
  "+r" (rs_c),   // %3
  "+r" (cs_c),   // %4
  "+r" (k_mker), // %5
  "+r" (k_left), // %6
  "+r" (alpha),  // %7
  "+r" (beta),   // %8
  "+r" (a_next), // %9
  "+r" (b_next), // %10
  "=r" (info)    // %11
:
: "x2","x3","x9","x16",
  "z0","z1","z2","z3","z4","z5","z6","z7",
  "z8","z9","z10","z11","z12","z13","z14","z15",
  "z16","z17","z18","z19",
  "z20","z21","z22","z23",
  "z24","z25","z26","z27",
  "z28","z29","z30","z31",
  // The template rewrites p0 (ptrue), sets the condition flags (cmp/subs)
  // and reads/writes memory through the pointer registers, none of which
  // the operand list alone conveys to the compiler.
  "p0","cc","memory"
   );

  GEMM_UKR_FLUSH_CT( z );
}
cython-blis-0.9.1/blis/_src/kernels/armsve/3/old/sup/000077500000000000000000000000001427272030600223255ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/armsve/3/old/sup/bli_gemmsup_armsve_ref.c000066400000000000000000000311441427272030600272100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Separate instantiation for ArmSVE reference kernels. // Temporary workaround. Will be removed after upstream has switched to a better way // of exposing gemmsup interface. 
// // -- Row storage case --------------------------------------------------------- // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ /* NOTE: This microkernel can actually handle arbitrarily large values of m, n, and k. */ \ \ if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ { \ /* Traverse c by rows. */ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cij = &ci[ j*cs_c ]; \ ctype* restrict bj = &b [ j*cs_b ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ { \ /* Traverse c by rows. 
*/ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cij = &ci[ j*cs_c ]; \ ctype* restrict bj = &b [ j*cs_b ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ { \ /* Traverse c by rows. */ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cij = &ci[ j*cs_c ]; \ ctype* restrict bj = &b [ j*cs_b ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ { \ /* Traverse c by rows. 
*/ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cij = &ci[ j*cs_c ]; \ ctype* restrict bj = &b [ j*cs_b ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ PASTEMAC(ch,conjs)( ab ); \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( gemmsup_r, _armsve, _ref2 ) // // -- Column storage case ------------------------------------------------------ // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ /* NOTE: This microkernel can actually handle arbitrarily large values of m, n, and k. */ \ \ if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ { \ /* Traverse c by columns. 
*/ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cj = &c[ j*cs_c ]; \ ctype* restrict bj = &b[ j*cs_b ]; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict cij = &cj[ i*rs_c ]; \ ctype* restrict ai = &a [ i*rs_a ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ { \ /* Traverse c by columns. */ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cj = &c[ j*cs_c ]; \ ctype* restrict bj = &b[ j*cs_b ]; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict cij = &cj[ i*rs_c ]; \ ctype* restrict ai = &a [ i*rs_a ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ { \ /* Traverse c by columns. 
*/ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cj = &c[ j*cs_c ]; \ ctype* restrict bj = &b[ j*cs_b ]; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict cij = &cj[ i*rs_c ]; \ ctype* restrict ai = &a [ i*rs_a ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ { \ /* Traverse c by columns. */ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cj = &c[ j*cs_c ]; \ ctype* restrict bj = &b[ j*cs_b ]; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict cij = &cj[ i*rs_c ]; \ ctype* restrict ai = &a [ i*rs_a ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ PASTEMAC(ch,conjs)( ab ); \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. 
*/ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( gemmsup_c, _armsve, _ref2 ) cython-blis-0.9.1/blis/_src/kernels/armsve/3/old/sup/bli_gemmsup_cv_armsve_asm_d2vx10_unindexed.c000066400000000000000000000624301427272030600330550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include // Double-precision composite instructions. #include "../armsve_asm_macros_double.h" // 2vx10 microkernels. #include "../armsve_asm_2vx10.h" // Prototype reference kernel. GEMMSUP_KER_PROT( double, d, gemmsup_c_armsve_ref2 ) void __attribute__ ((noinline,optimize(0))) bli_dgemmsup_cv_armsve_2vx10_unindexed ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { static int called = 0; if ( !called ) { fprintf(stderr, "rv called.\n"); called = 1; } // c*c requires A to be stored in columns. assert( rs_a0 == 1 ); dim_t n0_mker = n0 / 10; dim_t n0_left = n0 % 10; if ( n0_left ) { // A[:, ::] // B[::, n0_mker*10:n0] // C[: , n0_mker*10:n0] double *ai = a; double *bi = b + n0_mker * 10 * cs_b0; double *ci = c + n0_mker * 10 * cs_c0; bli_dgemmsup_c_armsve_ref2 ( conja, conjb, m0, n0_left, k0, alpha, ai, rs_a0, cs_a0, bi, rs_b0, cs_b0, beta, ci, rs_c0, cs_c0, data, cntx ); } // Return if it's a pure edge case. if ( !n0_mker ) return; // Determine VL. 
uint64_t vlen2; __asm__ ( " mov x0, xzr \n\t" " incd x0, ALL, MUL #2 \n\t" " mov %[vlen2], x0 \n\t" : [vlen2] "=r" (vlen2) : : "x0" ); uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // uint64_t rs_a = 1; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t k_mker = k0 / 4; uint64_t k_left = k0 % 4; uint64_t n_mker = n0_mker; dim_t m0_mker = m0 / vlen2; dim_t m0_left = m0 % vlen2; if ( m0_left ) { // Edge case on A side can be handled with one more (predicated) loop. m0_mker++; } else m0_left = vlen2; // uint64_t ps_a = bli_auxinfo_ps_a( data ); uint64_t ps_b = bli_auxinfo_ps_b( data ); for ( dim_t im0_mker = 0; im0_mker < m0_mker; ++im0_mker ) { uint64_t m_curr = vlen2; if ( im0_mker == m0_mker - 1 ) { // Last m-loop. Maybe unnecessary. m_curr = m0_left; } double *ai = a + im0_mker * vlen2 * rs_a0; double *bi = b; double *ci = c + im0_mker * vlen2 * rs_c0; void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); __asm__ volatile ( " ldr x0, %[bi] \n\t" " ldr x1, %[rs_b] \n\t" // Row-skip of B. " ldr x2, %[cs_b] \n\t" // Column-skip of B (element skip of B[l, :]). " ldr x3, %[ps_b] \n\t" // Panel-skip (10*k) of B. " ldr x4, %[cs_a] \n\t" // Column-Skip of A. " \n\t" // Element skip of A[:, l] is guaranteed to be 1. " ldr x5, %[ci] \n\t" " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. #ifdef _A64FX " mov x16, 0x1 \n\t" // Tag C address. " lsl x16, x16, #56 \n\t" " orr x5, x5, x16 \n\t" " mov x16, 0x2 \n\t" // Tag B address. " lsl x16, x16, #56 \n\t" " orr x0, x0, x16 \n\t" #endif " \n\t" " mov x8, #8 \n\t" // Multiply some address skips by sizeof(double). " madd x1, x8, x1, xzr \n\t" // rs_b " madd x2, x8, x2, xzr \n\t" // cs_b " madd x3, x8, x3, xzr \n\t" // ps_b " madd x4, x8, x4, xzr \n\t" // cs_a " madd x7, x8, x7, xzr \n\t" // cs_c " mov x8, #4 \n\t" " madd x15, x8, x4, xzr \n\t" // Logical K=4 microkernel skip for A. 
" \n\t" #ifdef _A64FX " mov x16, 0x20 \n\t" // Higher 6bit for Control#2: " lsl x16, x16, #58 \n\t" // Valid|Strong|Strong|NoAlloc|Load|Strong " orr x16, x16, x4 \n\t" // Stride. " msr S3_3_C11_C6_2, x16 \n\t" // Write system register. #endif " \n\t" " ldr x8, %[m_curr] \n\t" // Size of first dimension. " mov x9, xzr \n\t" " incd x9 \n\t" " ptrue p0.d \n\t" " whilelo p1.d, xzr, x8 \n\t" " whilelo p2.d, x9, x8 \n\t" " \n\t" " ldr x8, %[n_mker] \n\t" // Number of N-loops. " \n\t" " ldr x20, %[ai] \n\t" // Parameters to be reloaded " ldr x21, %[k_mker] \n\t" // within each millikernel loop. " ldr x22, %[k_left] \n\t" " ldr x23, %[alpha] \n\t" " ldr x24, %[beta] \n\t" " ldr x25, %[a_next] \n\t" " ldr x26, %[b_next] \n\t" " ldr x23, [x23] \n\t" // Directly load alpha and beta. " ldr x24, [x24] \n\t" " \n\t" " MILLIKER_MLOOP: \n\t" " \n\t" " mov x11, x0 \n\t" // B's address. // " ldr x10, %[ai] \n\t" // A's address. " mov x10, x20 \n\t" // " ldr x12, %[k_mker] \n\t" " mov x12, x21 \n\t" // " ldr x13, %[k_left] \n\t" " mov x13, x22 \n\t" #ifdef _A64FX " mov x16, 0x3 \n\t" // Tag A address. " lsl x16, x16, #56 \n\t" " orr x10, x10, x16 \n\t" " mov x16, 0xa \n\t" // Control#2 for A address. " lsl x16, x16, #60 \n\t" " orr x10, x10, x16 \n\t" #endif " \n\t" " cmp x12, #0 \n\t" // Don't preload if no microkernel there. " b.eq END_CCOL_PRFM \n\t" " \n\t" " mov x14, x11 \n\t" " ld1rd z20.d, p0/z, [x14] \n\t" // Load 8/10 of first B row. " add x14, x14, x2 \n\t" " ld1rd z21.d, p0/z, [x14] \n\t" " add x14, x14, x2 \n\t" " ld1rd z22.d, p0/z, [x14] \n\t" " add x14, x14, x2 \n\t" " ld1rd z23.d, p0/z, [x14] \n\t" " add x14, x14, x2 \n\t" " ld1rd z24.d, p0/z, [x14] \n\t" " add x14, x14, x2 \n\t" " ld1rd z25.d, p0/z, [x14] \n\t" " add x14, x14, x2 \n\t" " ld1rd z26.d, p0/z, [x14] \n\t" " add x14, x14, x2 \n\t" " ld1rd z27.d, p0/z, [x14] \n\t" " add x14, x14, x2 \n\t" " prfm PLDL1KEEP, [x14] \n\t" // And prefetch the 2/10 left. 
" add x14, x14, x2 \n\t" " prfm PLDL1KEEP, [x14] \n\t" " sub x14, x14, x2 \n\t" // Restore x14 to load edge. " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p1,p2,x10) " add x16, x10, x4 \n\t" " prfm PLDL1STRM, [x16] \n\t" // Prefetch 3/4 of A. " add x16, x10, x4 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x10, x4 \n\t" " prfm PLDL1STRM, [x16] \n\t" " \n\t" " CCOL_PRFM: \n\t" " cmp x6, #1 \n\t" " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. " mov x16, x5 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" " END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" " cmp x12, #0 \n\t" // If no 4-microkernel can be applied " b.eq K_LEFT_LOOP \n\t" " \n\t" " K_MKER_LOOP: \n\t" " \n\t" GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm) GEMM_2VX10_MKER_LOOP_PLAIN_G_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2) " \n\t" GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm) GEMM_2VX10_MKER_LOOP_PLAIN_G_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2) " \n\t" GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z30,z31,p1,p2,x10,x15,x4,x16,noprfm) GEMM_2VX10_MKER_LOOP_PLAIN_G_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2) " \n\t" " subs x12, x12, #1 \n\t" // 
Decrease counter before final replica. " b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. " \n\t" GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_C(z28,z29,p1,p2,x10,x15,x4,x16,noprfm) GEMM_2VX10_MKER_LOOP_PLAIN_G_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2) " b K_MKER_LOOP \n\t" " \n\t" " FIN_MKER_LOOP: \n\t" GEMM_2VX10_MKER_LOOP_PLAIN_G_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x14,x1,x2) " add x10, x10, x4 \n\t" // Forward A to fill the blank. " \n\t" " K_LEFT_LOOP: \n\t" " cmp x13, #0 \n\t" // End of execution. " b.eq WRITE_MEM_PREP \n\t" " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p1,p2,x10) " mov x14, x11 \n\t" " ld1rd z20.d, p0/z, [x14] \n\t" // Load 10/10 B. " add x14, x14, x2 \n\t" " ld1rd z21.d, p0/z, [x14] \n\t" " add x14, x14, x2 \n\t" " ld1rd z22.d, p0/z, [x14] \n\t" " add x14, x14, x2 \n\t" " ld1rd z23.d, p0/z, [x14] \n\t" " add x14, x14, x2 \n\t" " ld1rd z24.d, p0/z, [x14] \n\t" " add x14, x14, x2 \n\t" " ld1rd z25.d, p0/z, [x14] \n\t" " add x14, x14, x2 \n\t" " ld1rd z26.d, p0/z, [x14] \n\t" " add x14, x14, x2 \n\t" " ld1rd z27.d, p0/z, [x14] \n\t" " add x14, x14, x2 \n\t" " ld1rd z28.d, p0/z, [x14] \n\t" " add x14, x14, x2 \n\t" " ld1rd z29.d, p0/z, [x14] \n\t" GEMM_FMLA2(z0,z1,p0,z30,z31,z20) GEMM_FMLA2(z2,z3,p0,z30,z31,z21) GEMM_FMLA2(z4,z5,p0,z30,z31,z22) GEMM_FMLA2(z6,z7,p0,z30,z31,z23) GEMM_FMLA2(z8,z9,p0,z30,z31,z24) GEMM_FMLA2(z10,z11,p0,z30,z31,z25) GEMM_FMLA2(z12,z13,p0,z30,z31,z26) GEMM_FMLA2(z14,z15,p0,z30,z31,z27) GEMM_FMLA2(z16,z17,p0,z30,z31,z28) GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " add x10, x10, x4 \n\t" // Forward A. " add x11, x11, x1 \n\t" // Forward B. " sub x13, x13, #1 \n\t" " b K_LEFT_LOOP \n\t" // Next column / row. 
" \n\t" " WRITE_MEM_PREP: \n\t" " \n\t" // " ldr x10, %[ai] \n\t" " mov x10, x20 \n\t" " add x11, x0, x3 \n\t" " dup z30.d, x23 \n\t" // Broadcast alpha & beta into vectors. " dup z31.d, x24 \n\t" " \n\t" " cmp x8, #1 \n\t" " b.eq PREFETCH_ABNEXT \n\t" " prfm PLDL1STRM, [x10] \n\t" " prfm PLDL1KEEP, [x11] \n\t" " add x11, x11, x2 \n\t" " prfm PLDL1KEEP, [x11] \n\t" " add x11, x11, x2 \n\t" " prfm PLDL1KEEP, [x11] \n\t" " add x11, x11, x2 \n\t" " prfm PLDL1KEEP, [x11] \n\t" " add x11, x11, x2 \n\t" " prfm PLDL1KEEP, [x11] \n\t" " add x11, x11, x2 \n\t" " prfm PLDL1KEEP, [x11] \n\t" " add x11, x11, x2 \n\t" " prfm PLDL1KEEP, [x11] \n\t" " add x11, x11, x2 \n\t" " prfm PLDL1KEEP, [x11] \n\t" " add x11, x11, x2 \n\t" " prfm PLDL1KEEP, [x11] \n\t" " add x11, x11, x2 \n\t" " prfm PLDL1KEEP, [x11] \n\t" " b WRITE_MEM \n\t" " \n\t" " PREFETCH_ABNEXT: \n\t" // " ldr x1, %[a_next] \n\t" // Final Millikernel loop, x1 and x2 not needed. " mov x1, x25 \n\t" // " ldr x2, %[b_next] \n\t" " mov x2, x26 \n\t" " prfm PLDL2KEEP, [x1] \n\t" " prfm PLDL2KEEP, [x1, 256*1] \n\t" " prfm PLDL2KEEP, [x1, 256*2] \n\t" " prfm PLDL2KEEP, [x1, 256*3] \n\t" " prfm PLDL2KEEP, [x1, 256*4] \n\t" " prfm PLDL2KEEP, [x1, 256*5] \n\t" " prfm PLDL2KEEP, [x1, 256*6] \n\t" " prfm PLDL2KEEP, [x1, 256*7] \n\t" " prfm PLDL2KEEP, [x1, 256*8] \n\t" " prfm PLDL2KEEP, [x1, 256*9] \n\t" " prfm PLDL2KEEP, [x1, 256*10] \n\t" " prfm PLDL2KEEP, [x1, 256*11] \n\t" " prfm PLDL2KEEP, [x1, 256*12] \n\t" " prfm PLDL2KEEP, [x1, 256*13] \n\t" " prfm PLDL2KEEP, [x1, 256*14] \n\t" " prfm PLDL2KEEP, [x1, 256*15] \n\t" " prfm PLDL2KEEP, [x2] \n\t" " prfm PLDL2KEEP, [x2, 256*1] \n\t" " prfm PLDL2KEEP, [x2, 256*2] \n\t" " prfm PLDL2KEEP, [x2, 256*3] \n\t" " prfm PLDL2KEEP, [x2, 256*4] \n\t" " prfm PLDL2KEEP, [x2, 256*5] \n\t" " prfm PLDL2KEEP, [x2, 256*6] \n\t" " prfm PLDL2KEEP, [x2, 256*7] \n\t" " prfm PLDL2KEEP, [x2, 256*8] \n\t" " prfm PLDL2KEEP, [x2, 256*9] \n\t" " \n\t" " WRITE_MEM: \n\t" " \n\t" " fmov d28, #1.0 \n\t" " 
fmov x16, d28 \n\t" " cmp x16, x23 \n\t" " b.eq UNIT_ALPHA \n\t" " \n\t" SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) " \n\t" " UNIT_ALPHA: \n\t" " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. " cmp x6, #1 \n\t" " b.ne WRITE_MEM_G \n\t" " \n\t" " WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. " mov x13, xzr \n\t" // C-column's physical 1-vector skip. " incb x13 \n\t" GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7) GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7) " \n\t" GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x5,x7) GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x5,x7) " b END_WRITE_MEM \n\t" " \n\t" " WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-30] - Z30 as index. " mov x12, xzr \n\t" " incb x12 \n\t" " madd x13, x12, x6, xzr \n\t" // C-column's logical 1-vector skip. " index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8. GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16) GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16) " \n\t" GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x5,x7,x13,x16) GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x5,x7,x13,x16) " \n\t" " END_WRITE_MEM: \n\t" " subs x8, x8, #1 \n\t" " b.eq END_EXEC \n\t" " \n\t" // Address of C already forwarded to next column. 
" add x0, x0, x3 \n\t" // Forward B's base address to the next logic panel. " b MILLIKER_MLOOP \n\t" " \n\t" " END_ERROR: \n\t" " mov x0, #1 \n\t" // Return error. " END_EXEC: \n\t" " mov x0, #0 \n\t" // Return normal. : : [bi] "m" (bi), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [ps_b] "m" (ps_b), [cs_a] "m" (cs_a), [ci] "m" (ci), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), [m_curr] "m" (m_curr), [n_mker] "m" (n_mker), [ai] "m" (ai), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta), [a_next] "m" (a_next), [b_next] "m" (b_next) : "x0","x1","x2","x3","x4","x5","x6","x7","x8", "x9","x10","x11","x12","x13","x14","x15","x16","x17", "x20","x21","x22","x23","x24","x25","x26", "z0","z1","z2","z3","z4","z5","z6","z7", "z8","z9","z10","z11","z12","z13","z14","z15", "z16","z17","z18","z19", "z20","z21","z22","z23", "z24","z25","z26","z27", "z28","z29","z30","z31" ); } } void bli_dgemmsup_rv_armsve_10x2v_unindexed ( conj_t conjat, conj_t conjbt, dim_t m0t, dim_t n0t, dim_t k0, double* restrict alpha, double* restrict at, inc_t rs_at0, inc_t cs_at0, double* restrict bt, inc_t rs_bt0, inc_t cs_bt0, double* restrict beta, double* restrict ct, inc_t rs_ct0, inc_t cs_ct0, auxinfo_t* restrict datat, cntx_t* restrict cntx ) { auxinfo_t data; bli_auxinfo_set_next_a( bli_auxinfo_next_b( datat ), &data ); bli_auxinfo_set_next_b( bli_auxinfo_next_a( datat ), &data ); bli_auxinfo_set_ps_a( bli_auxinfo_ps_b( datat ), &data ); bli_auxinfo_set_ps_b( bli_auxinfo_ps_a( datat ), &data ); bli_dgemmsup_cv_armsve_2vx10_unindexed ( conjbt, conjat, n0t, m0t, k0, alpha, bt, cs_bt0, rs_bt0, at, cs_at0, rs_at0, beta, ct, cs_ct0, rs_ct0, &data, cntx ); } cython-blis-0.9.1/blis/_src/kernels/armsve/3/old/sup/bli_gemmsup_rv_armsve_asm_d2vx10_unindexed.c000066400000000000000000000500221427272030600330660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2020, The University of Tokyo

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/
#include "blis.h"
#include <assert.h>

// Double-precision composite instructions.
#include "../armsve_asm_macros_double.h"

// 2vx10 microkernels.
#include "../armsve_asm_2vx10.h"

// Prototype reference kernel.
GEMMSUP_KER_PROT( double, d, gemmsup_r_armsve_ref2 ) void __attribute__ ((optimize(0))) bli_dgemmsup_rv_armsve_2vx10_unindexed ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { static int called = 0; if ( !called ) { fprintf(stderr, "rv called.\n"); called = 1; } // r*r requires B to be stored in rows. assert(cs_b0 == 1); dim_t n0_mker = n0 / 10; dim_t n0_left = n0 % 10; if ( n0_left ) { // A[:, ::] // B[::, n0_mker*10:n0] // C[: , n0_mker*10:n0] double *ai = a; double *bi = b + n0_mker * 10 * cs_b0; double *ci = c + n0_mker * 10 * cs_c0; bli_dgemmsup_r_armsve_ref2 ( conja, conjb, m0, n0_left, k0, alpha, ai, rs_a0, cs_a0, bi, rs_b0, cs_b0, beta, ci, rs_c0, cs_c0, data, cntx ); } // Return if it's a pure edge case. if ( !n0_mker ) return; // Determine VL. uint64_t vlen2; __asm__ ( " mov x0, xzr \n\t" " incd x0, ALL, MUL #2 \n\t" " mov %[vlen2], x0 \n\t" : [vlen2] "=r" (vlen2) : : "x0" ); uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; // uint64_t cs_b = 1; uint64_t k_mker = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_mker = m0 / vlen2; uint64_t m_left = m0 % vlen2; if ( m_left ) { // Edge case on A side can be handled with one more (predicated) loop. m_mker++; } else m_left = vlen2; uint64_t ps_a = bli_auxinfo_ps_a( data ); // uint64_t ps_b = bli_auxinfo_ps_b( data ); for ( dim_t in0_mker = 0; in0_mker < n0_mker; ++in0_mker ) { double *ai = a; double *bi = b + in0_mker * 10 * cs_b0; double *ci = c + in0_mker * 10 * cs_c0; void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); __asm__ volatile ( " ldr x0, %[ai] \n\t" " ldr x1, %[rs_a] \n\t" // Row-skip of A (element skip of A[:, l]). " ldr x2, %[cs_a] \n\t" // Column-skip of A. 
" ldr x3, %[ps_a] \n\t" // Panel-skip (vlen2*k) of A. " ldr x4, %[rs_b] \n\t" // Row-Skip of B. " \n\t" // Element skip of B[l, :] is guaranteed to be 1. " ldr x5, %[ci] \n\t" " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. #ifdef _A64FX " mov x16, 0x1 \n\t" // Tag C address. " lsl x16, x16, #56 \n\t" " orr x5, x5, x16 \n\t" " mov x16, 0x2 \n\t" // Tag A address. " lsl x16, x16, #56 \n\t" " orr x0, x0, x16 \n\t" #endif " \n\t" " mov x8, #8 \n\t" // Multiply some address skips by sizeof(double). " madd x2, x8, x2, xzr \n\t" // cs_a " madd x3, x8, x3, xzr \n\t" // ps_a " madd x4, x8, x4, xzr \n\t" // rs_b " madd x7, x8, x7, xzr \n\t" // cs_c " mov x8, xzr \n\t" " incb x8 \n\t" " madd x14, x8, x1, xzr \n\t" // A-column's logical 1-vector skip. " mov x8, #4 \n\t" " madd x15, x8, x2, xzr \n\t" // Logical K=4 microkernel skip for A. // " mov x8, #4 \n\t" // " madd x17, x8, x4, xzr \n\t" // Logical K=4 microkernel skip for B. " \n\t" " ldr x8, %[m_mker] \n\t" // Number of M-loops. " ptrue p0.d \n\t" " ptrue p1.d \n\t" " ptrue p2.d \n\t" " \n\t" " MILLIKER_MLOOP: \n\t" " \n\t" " cmp x8, #1 \n\t" " b.ne UKER_BEGIN \n\t" " \n\t" " ldr x10, %[m_left] \n\t" // Final (incomplete) millikernel loop. " mov x11, xzr \n\t" " incd x11 \n\t" " whilelo p1.d, xzr, x10 \n\t" // Overwrite p1/p2. " whilelo p2.d, x11, x10 \n\t" " \n\t" " UKER_BEGIN: \n\t" " mov x10, x0 \n\t" // A's address. " ldr x11, %[bi] \n\t" // B's address. " ldr x12, %[k_mker] \n\t" " ldr x13, %[k_left] \n\t" #ifdef _A64FX " mov x16, 0x3 \n\t" // Tag B address. " lsl x16, x16, #56 \n\t" " orr x11, x11, x16 \n\t" #endif " \n\t" " mov x16, x11 \n\t" // Prefetch first kernel of B. " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " \n\t" " ld1rd z20.d, p0/z, [x11] \n\t" // (Partial) first B row. 
" ld1rd z21.d, p0/z, [x11, #8] \n\t" " ld1rd z22.d, p0/z, [x11, #16] \n\t" " ld1rd z23.d, p0/z, [x11, #24] \n\t" " ld1rd z24.d, p0/z, [x11, #32] \n\t" " ld1rd z25.d, p0/z, [x11, #40] \n\t" " ld1rd z26.d, p0/z, [x11, #48] \n\t" " ld1rd z27.d, p0/z, [x11, #56] \n\t" " \n\t" " index z29.d, xzr, x1 \n\t" // First A column. " \n\t" // Skips passed to index is not multiplied by 8. GEMM_ACOL_GATHER_LOAD(z28,z29,z29,p1,p2,x10,x14,x16) " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" " cmp x12, #0 \n\t" // If no 4-microkernel can be applied " b.eq K_LEFT_LOOP \n\t" " \n\t" " K_MKER_LOOP: \n\t" // Unroll the 4-loop. " \n\t" " index z31.d, xzr, x1 \n\t" GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm) GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4) " \n\t" " index z29.d, xzr, x1 \n\t" GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm) GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4) " \n\t" " index z31.d, xzr, x1 \n\t" GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z30,z31,z31,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm) GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4) " \n\t" " subs x12, x12, #1 \n\t" // Decrease counter before final replica. " b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. 
" \n\t" " index z29.d, xzr, x1 \n\t" GEMMSUP_ACOL_PREFETCH_NEXT_LOAD_G(z28,z29,z29,p1,p2,x10,x15,x3,x2,x14,x16,noprfm,noprfm) GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4) " b K_MKER_LOOP \n\t" " \n\t" " FIN_MKER_LOOP: \n\t" GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x11,x4) " add x10, x10, x2 \n\t" // Forward A to fill the blank. " \n\t" " K_LEFT_LOOP: \n\t" " cmp x13, #0 \n\t" " b.eq WRITE_MEM_PREP \n\t" " \n\t" " index z31.d, xzr, x1 \n\t" GEMM_ACOL_GATHER_LOAD(z30,z31,z31,p1,p2,x10,x14,x16) " ld1rd z20.d, p0/z, [x11] \n\t" " ld1rd z21.d, p0/z, [x11, #8] \n\t" " ld1rd z22.d, p0/z, [x11, #16] \n\t" " ld1rd z23.d, p0/z, [x11, #24] \n\t" " ld1rd z24.d, p0/z, [x11, #32] \n\t" " ld1rd z25.d, p0/z, [x11, #40] \n\t" " ld1rd z26.d, p0/z, [x11, #48] \n\t" " ld1rd z27.d, p0/z, [x11, #56] \n\t" " ld1rd z28.d, p0/z, [x11, #64] \n\t" " ld1rd z29.d, p0/z, [x11, #72] \n\t" GEMM_FMLA2(z0,z1,p0,z30,z31,z20) GEMM_FMLA2(z2,z3,p0,z30,z31,z21) GEMM_FMLA2(z4,z5,p0,z30,z31,z22) GEMM_FMLA2(z6,z7,p0,z30,z31,z23) GEMM_FMLA2(z8,z9,p0,z30,z31,z24) GEMM_FMLA2(z10,z11,p0,z30,z31,z25) GEMM_FMLA2(z12,z13,p0,z30,z31,z26) GEMM_FMLA2(z14,z15,p0,z30,z31,z27) GEMM_FMLA2(z16,z17,p0,z30,z31,z28) GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " add x10, x10, x2 \n\t" // Forward A. " add x11, x11, x4 \n\t" // Forward B. " sub x13, x13, #1 \n\t" " b K_LEFT_LOOP \n\t" // Next column / row. " \n\t" " WRITE_MEM_PREP: \n\t" " \n\t" " ldr x11, %[bi] \n\t" " ldr x12, %[alpha] \n\t" // Load alpha & beta. 
" ldr x13, %[beta] \n\t" " ld1rd z30.d, p0/z, [x12] \n\t" " ld1rd z31.d, p0/z, [x13] \n\t" " ldr x12, [x12] \n\t" " \n\t" " cmp x8, #1 \n\t" " b.eq PREFETCH_ABNEXT \n\t" " prfm PLDL2STRM, [x11] \n\t" " b WRITE_MEM \n\t" " \n\t" " PREFETCH_ABNEXT: \n\t" " ldr x1, %[a_next] \n\t" // Final Millikernel loop, x1 and x2 not needed. " ldr x2, %[b_next] \n\t" " prfm PLDL2KEEP, [x1] \n\t" " prfm PLDL2KEEP, [x1, 256*1] \n\t" " prfm PLDL2KEEP, [x1, 256*2] \n\t" " prfm PLDL2KEEP, [x1, 256*3] \n\t" " prfm PLDL2KEEP, [x1, 256*4] \n\t" " prfm PLDL2KEEP, [x1, 256*5] \n\t" " prfm PLDL2KEEP, [x1, 256*6] \n\t" " prfm PLDL2KEEP, [x1, 256*7] \n\t" " prfm PLDL2KEEP, [x1, 256*8] \n\t" " prfm PLDL2KEEP, [x1, 256*9] \n\t" " prfm PLDL2KEEP, [x1, 256*10] \n\t" " prfm PLDL2KEEP, [x1, 256*11] \n\t" " prfm PLDL2KEEP, [x1, 256*12] \n\t" " prfm PLDL2KEEP, [x1, 256*13] \n\t" " prfm PLDL2KEEP, [x1, 256*14] \n\t" " prfm PLDL2KEEP, [x1, 256*15] \n\t" " prfm PLDL2KEEP, [x2] \n\t" " prfm PLDL2KEEP, [x2, 256*1] \n\t" " prfm PLDL2KEEP, [x2, 256*2] \n\t" " prfm PLDL2KEEP, [x2, 256*3] \n\t" " prfm PLDL2KEEP, [x2, 256*4] \n\t" " prfm PLDL2KEEP, [x2, 256*5] \n\t" " prfm PLDL2KEEP, [x2, 256*6] \n\t" " prfm PLDL2KEEP, [x2, 256*7] \n\t" " prfm PLDL2KEEP, [x2, 256*8] \n\t" " prfm PLDL2KEEP, [x2, 256*9] \n\t" " \n\t" " WRITE_MEM: \n\t" " \n\t" " fmov d28, #1.0 \n\t" " fmov x16, d28 \n\t" " cmp x16, x12 \n\t" " b.eq UNIT_ALPHA \n\t" " \n\t" SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) " \n\t" " UNIT_ALPHA: \n\t" " mov x9, x5 \n\t" // C address for loading. " mov x10, x5 \n\t" // C address for storing. " cmp x6, #1 \n\t" " b.ne WRITE_MEM_G \n\t" " \n\t" " WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. " mov x13, xzr \n\t" // C-column's physical 1-vector skip. 
" incb x13 \n\t" GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x9,x7) GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) GEMM_C_LOAD_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x9,x7) " \n\t" GEMM_C_STORE_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,x10,x7) GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,x10,x7) " b END_WRITE_MEM \n\t" " \n\t" " WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-30] - Z30 as index. " mov x12, xzr \n\t" " incb x12 \n\t" " madd x13, x12, x6, xzr \n\t" // C-column's logical 1-vector skip. " index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8. GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x9,x7,x13,x16) GEMM_C_FMAD_UKER(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p1,p2,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31) GEMM_C_LOAD_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x9,x7,x13,x16) " \n\t" GEMM_C_STORE_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p1,p2,x10,x7,x13,x16) GEMM_C_FMAD_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p1,p2,z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z31) GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p1,p2,x10,x7,x13,x16) " \n\t" " END_WRITE_MEM: \n\t" " subs x8, x8, #1 \n\t" " b.eq END_EXEC \n\t" " \n\t" " add x0, x0, x3 \n\t" // Forward A's base address to the next logic panel. " add x5, x5, x13 \n\t" // Forward C's base address to the next logic panel. " add x5, x5, x13 \n\t" " b MILLIKER_MLOOP \n\t" " \n\t" " END_ERROR: \n\t" " mov x0, #1 \n\t" // Return error. " END_EXEC: \n\t" " mov x0, #0 \n\t" // Return normal. 
: : [ai] "m" (ai), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [ps_a] "m" (ps_a), [rs_b] "m" (rs_b), [ci] "m" (ci), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), [m_mker] "m" (m_mker), [m_left] "m" (m_left), [bi] "m" (bi), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta), [a_next] "m" (a_next), [b_next] "m" (b_next) : "x0","x1","x2","x3","x4","x5","x6","x7","x8", "x9","x10","x11","x12","x13","x14","x15","x16",//"x17", "z0","z1","z2","z3","z4","z5","z6","z7", "z8","z9","z10","z11","z12","z13","z14","z15", "z16","z17","z18","z19", "z20","z21","z22","z23", "z24","z25","z26","z27", "z28","z29","z30","z31" ); } } cython-blis-0.9.1/blis/_src/kernels/armsve/bli_kernels_armsve.h000066400000000000000000000051401427272030600246150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "./3/bli_armsve_utils.h"

// GEMM_UKR_PROT( double,   d, gemm_armsve256_asm_8x8 )
GEMM_UKR_PROT( double, d, gemm_armsve_asm_2vx10_unindexed )
GEMM_UKR_PROT( float, s, gemm_armsve_asm_2vx10_unindexed )
GEMM_UKR_PROT( scomplex, c, gemm_armsve_asm_2vx10_unindexed )
GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx10_unindexed )
// GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx8_unindexed )
// GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx7_unindexed )
//GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_2vx10_unindexed )
//GEMMSUP_KER_PROT( double, d, gemmsup_cv_armsve_2vx10_unindexed )
//GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_10x2v_unindexed )

// Use SVE intrinsics only for referred cases.
#if !defined(BLIS_FAMILY_A64FX)
PACKM_KER_PROT( double, d, packm_armsve256_int_8xk )
PACKM_KER_PROT( double, d, packm_armsve512_int_12xk )
#endif
PACKM_KER_PROT( double, d, packm_armsve512_asm_16xk )
PACKM_KER_PROT( double, d, packm_armsve512_asm_10xk )
cython-blis-0.9.1/blis/_src/kernels/armv7a/000077500000000000000000000000001427272030600204765ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/armv7a/3/000077500000000000000000000000001427272030600206405ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/armv7a/3/bli_cgemm_armv7a_asm_2x2.S000066400000000000000000000233231427272030600255150ustar00rootroot00000000000000
// Single-precision complex 2x2 GEMM micro-kernel for ARMv7-A (VFP).
// Register arguments: r0 = K, r1 = pointer to alpha, r2 = A, r3 = B.
// Stack arguments, addressed through fp after the prologue: pointer to
// beta, C, row stride of C, column stride of C, aux pointer (AUX is
// defined but not referenced in this file).

#define REALNAME bli_cgemm_armv7a_ker_2x2

#define STACKSIZE 256

#define	K		r0
#define	PTR_ALPHA	r1
#define	OLD_A		r2
#define	OLD_B		r3
#define PTR_BETA	[fp, #0 ]
#define OLD_C		[fp, #4 ]
#define OLD_RSC		[fp, #8 ]
#define OLD_CSC		[fp, #12 ]
#define AUX		[fp, #16 ]

/******************************************************
* [fp, #-128] - [fp, #-33] is reserved
* for store and restore of floating point
* registers s8 - s31 (24 registers, 4 bytes each;
* see the vstm/vldm at [fp, #-128] in REALNAME)
*******************************************************/

// L reuses r2: OLD_A is copied to AO before L is first written.
#define L	r2

#define	AO	r5
#define	BO	r6

#define	CO1	r7
#define	CO2	r8

// Prefetch distances in bytes.
#define A_PRE	96
#define B_PRE	96
#define C_PRE	0

/**************************************************************************************
* Macro definitions
**************************************************************************************/

#define FMAC_BR	fnmacs
#define FMAC_BI	fmacs

// NN is hard-wired, so the first branch below is always the one selected.
#define NN 1

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)

	#define	FADD_R	fsubs
	#define	FADD_I	fadds

	#define	FMAC_R1	fnmacs
	#define	FMAC_R2	fnmacs
	#define	FMAC_I1	fmacs
	#define	FMAC_I2	fnmacs

#elif defined(CN) || defined(CT)

	#define	FADD_R	fadds
	#define	FADD_I	fsubs

	#define	FMAC_R1	fmacs
	#define	FMAC_R2	fmacs
	#define	FMAC_I1	fnmacs
	#define	FMAC_I2	fmacs

#elif defined(NC) || defined(TC)

	#define	FADD_R	fadds
	#define	FADD_I	fsubs

	#define	FMAC_R1	fmacs
	#define	FMAC_R2	fnmacs
	#define	FMAC_I1	fmacs
	#define	FMAC_I2	fmacs

#else

	#define	FADD_R	fsubs
	#define	FADD_I	fadds

	#define	FMAC_R1	fnmacs
	#define	FMAC_R2	fmacs
	#define	FMAC_I1	fnmacs
	#define	FMAC_I2	fnmacs

#endif


// Zero the 16 accumulators s16-s31.
// The zero is built in a core register instead of "vsub.f32 s16, s16, s16":
// x - x is NaN when the stale register content is NaN or +/-Inf, which would
// poison every result on the K < 8 path. r4 is used as scratch; it is saved
// by the prologue and is dead at the only expansion site (SAVE2x2 reloads it
// before use).
.macro INIT2x2

	mov			r4 , #0
	vmov			s16 , r4
	vmov.f32		s17, s16
	vmov.f32		s18, s16
	vmov.f32		s19, s16
	vmov.f32		s20, s16
	vmov.f32		s21, s16
	vmov.f32		s22, s16
	vmov.f32		s23, s16
	vmov.f32		s24, s16
	vmov.f32		s25, s16
	vmov.f32		s26, s16
	vmov.f32		s27, s16
	vmov.f32		s28, s16
	vmov.f32		s29, s16
	vmov.f32		s30, s16
	vmov.f32		s31, s16

.endm

// First k-step of an unrolled run: initialises s16-s31 with products
// (fmuls, no accumulate) of the values loaded into s0-s3 / s8-s11, advances
// AO and BO by 32 bytes in total and preloads the next step's operands into
// s4-s7 / s12-s15.
.macro KERNEL2x2_I

	pld	[ AO , #A_PRE ]
	pld	[ BO , #B_PRE ]
	flds	s0 , [ AO ]
	flds	s1 , [ AO, #4 ]
	flds	s8 , [ BO ]
	flds	s9 , [ BO, #4 ]

	fmuls	s16  , s0,  s8
	flds	s2 , [ AO, #8 ]
	fmuls	s24  , s1,  s9
	flds	s3 , [ AO, #12 ]
	fmuls	s17  , s0,  s9
	flds	s10, [ BO, #8 ]
	fmuls	s25  , s1,  s8

	flds	s11, [ BO, #12 ]
	fmuls	s18  , s2,  s8
	add	BO , BO, #16
	fmuls	s26  , s3,  s9
	add	AO , AO, #16
	fmuls	s19  , s2,  s9
	pld	[ BO , #B_PRE ]
	fmuls	s27  , s3,  s8

	pld	[ AO , #A_PRE ]
	fmuls	s20  , s0,  s10
	flds	s4 , [ AO, #0 ]
	fmuls	s28  , s1,  s11
	flds	s5 , [ AO, #4 ]
	fmuls	s21  , s0,  s11
	flds	s12, [ BO ]
	fmuls	s29  , s1,  s10

	flds	s13, [ BO, #4 ]
	fmuls	s22  , s2,  s10
	flds	s6 , [ AO, #8 ]
	fmuls	s30  , s3,  s11
	flds	s7 , [ AO, #12 ]
	fmuls	s23  , s2,  s11
	flds	s14, [ BO, #8 ]
	fmuls	s31  , s3,  s10
	flds	s15, [ BO, #12 ]

	add	BO , BO, #16
	add	AO , AO, #16
.endm

// Accumulating k-step on s0-s3 / s8-s11 that loads the following step's
// operands into s4-s7 / s12-s15; advances AO and BO by 16 bytes.
.macro KERNEL2x2_M1

	pld	[ AO , #A_PRE ]
	fmacs	s16  , s0,  s8
	pld	[ BO , #B_PRE ]
	fmacs	s24  , s1,  s9
	flds	s4 , [ AO, #0 ]
	fmacs	s17  , s0,  s9
	flds	s5 , [ AO, #4 ]
	fmacs	s25  , s1,  s8

	flds	s12, [ BO ]
	fmacs	s18  , s2,  s8
	flds	s13, [ BO, #4 ]
	fmacs	s26  , s3,  s9
	flds	s6 , [ AO, #8 ]
	fmacs	s19  , s2,  s9
	flds	s7 , [ AO, #12 ]
	fmacs	s27  , s3,  s8

	fmacs	s20  , s0,  s10
	flds	s14, [ BO, #8 ]
	fmacs	s28  , s1,  s11
	fmacs	s21  , s0,  s11
	flds	s15, [ BO, #12 ]
	fmacs	s29  , s1,  s10

	fmacs	s22  , s2,  s10
	add	BO , BO, #16
	fmacs	s30  , s3,  s11
	fmacs	s23  , s2,  s11
	add	AO , AO, #16
	fmacs	s31  , s3,  s10

.endm

// Accumulating k-step on s4-s7 / s12-s15 that loads the following step's
// operands into s0-s3 / s8-s11; advances AO and BO by 16 bytes.
.macro KERNEL2x2_M2

	fmacs	s16  , s4,  s12
	fmacs	s24  , s5,  s13
	flds	s0 , [ AO, #0 ]
	fmacs	s17  , s4,  s13
	flds	s1 , [ AO, #4 ]
	fmacs	s25  , s5,  s12

	fmacs	s18  , s6,  s12
	flds	s8 , [ BO ]
	fmacs	s26  , s7,  s13
	flds	s9 , [ BO, #4 ]
	fmacs	s19  , s6,  s13
	fmacs	s27  , s7,  s12

	flds	s2 , [ AO, #8 ]
	fmacs	s20  , s4,  s14
	flds	s3 , [ AO, #12 ]
	fmacs	s28  , s5,  s15
	fmacs	s21  , s4,  s15
	flds	s10, [ BO, #8 ]
	fmacs	s29  , s5,  s14

	flds	s11, [ BO, #12 ]
	fmacs	s22  , s6,  s14
	fmacs	s30  , s7,  s15
	add	BO , BO, #16
	fmacs	s23  , s6,  s15
	add	AO , AO, #16
	fmacs	s31  , s7,  s14

.endm

// Last k-step of an unrolled run: consumes s4-s7 / s12-s15 without loading
// anything and without moving AO / BO.
.macro KERNEL2x2_E

	fmacs	s16  , s4,  s12
	fmacs	s24  , s5,  s13
	fmacs	s17  , s4,  s13
	fmacs	s25  , s5,  s12

	fmacs	s18  , s6,  s12
	fmacs	s26  , s7,  s13
	fmacs	s19  , s6,  s13
	fmacs	s27  , s7,  s12

	fmacs	s20  , s4,  s14
	fmacs	s28  , s5,  s15
	fmacs	s21  , s4,  s15
	fmacs	s29  , s5,  s14

	fmacs	s22  , s6,  s14
	fmacs	s30  , s7,  s15
	fmacs	s23  , s6,  s15
	fmacs	s31  , s7,  s14

.endm

// Self-contained k-step for the K % 8 remainder: loads its own operands,
// accumulates into s16-s31 and advances AO and BO by 16 bytes.
.macro KERNEL2x2_SUB

	flds	s0 , [ AO ]
	flds	s1 , [ AO, #4 ]
	flds	s8 , [ BO ]
	flds	s9 , [ BO, #4 ]

	fmacs	s16  , s0,  s8
	flds	s2 , [ AO, #8 ]
	fmacs	s24  , s1,  s9
	flds	s3 , [ AO, #12 ]
	fmacs	s17  , s0,  s9
	flds	s10, [ BO, #8 ]
	fmacs	s25  , s1,  s8

	flds	s11, [ BO, #12 ]
	fmacs	s18  , s2,  s8
	fmacs	s26  , s3,  s9
	fmacs	s19  , s2,  s9
	fmacs	s27  , s3,  s8

	fmacs	s20  , s0,  s10
	fmacs	s28  , s1,  s11
	fmacs	s21  , s0,  s11
	fmacs	s29  , s1,  s10

	fmacs	s22  , s2,  s10
	add	BO , BO, #16
	fmacs	s30  , s3,  s11
	fmacs	s23  , s2,  s11
	add	AO , AO, #16
	fmacs	s31  , s3,  s10

.endm

// Combine the partial sums, then for each of the four C elements compute
// beta*C, add alpha times the accumulated product, and store it back.
// Clobbers r2 (also L), r3 and r4.
.macro SAVE2x2

	ldr	r3, OLD_RSC				// Row stride size
	lsl	r3, r3, #3				// multiply with size of complex float

	flds	s0, [ PTR_ALPHA ]			// load real part of alpha
	flds	s1, [ PTR_ALPHA, #4 ]			// load imag part of alpha
	ldr	r4, PTR_BETA
	flds	s2, [ r4 ]				// load real part of beta
	flds	s3, [ r4, #4 ]				// load imag part of beta

	// Add/Sub the real and the imag parts
	FADD_R	s16, s24 , s16
	FADD_I	s17, s25 , s17
	FADD_R	s18, s26 , s18
	FADD_I	s19, s27 , s19
	FADD_R	s20, s28 , s20
	FADD_I	s21, s29 , s21
	FADD_R	s22, s30 , s22
	FADD_I	s23, s31 , s23

	mov	r4, CO1					// save pointer
	fldmias CO1, { s4 - s5 }			// read real and imag part from C
	add	CO1, CO1, r3

	mov	r2, CO2					// save pointer
	fldmias CO2, { s8 - s9 }			// read real and imag part from C
	add	CO2, CO2, r3

	fmuls	s24, s4, s2				// multiply Beta-real with C-real
	fmuls	s25, s5, s2				// multiply Beta-real with C-imag
	fmuls	s28, s8, s2				// multiply Beta-real with C-real
	fmuls	s29, s9, s2				// multiply Beta-real with C-imag

	FMAC_BR	s24, s3, s5				// multiply beta-imag with C-imag and add
	FMAC_BI	s25, s3, s4				// multiply beta-imag with C-real and add
	FMAC_BR	s28, s3, s9				// multiply beta-imag with C-imag and add
	FMAC_BI	s29, s3, s8				// multiply beta-imag with C-real and add

	FMAC_R1 s24 , s0 , s16
	FMAC_I1 s25 , s0 , s17
	FMAC_R2 s24 , s1 , s17
	FMAC_I2	s25 , s1 , s16

	FMAC_R1 s28 , s0 , s20
	FMAC_I1 s29 , s0 , s21
	FMAC_R2 s28 , s1 , s21
	FMAC_I2	s29 , s1 , s20

	fldmias CO1, { s4 - s5 }			// read real and imag part from C
	fldmias CO2, { s8 - s9 }			// read real and imag part from C

	fmuls	s26, s4, s2				// multiply Beta-real with C-real
	fmuls	s27, s5, s2				// multiply Beta-real with C-imag
	fmuls	s30, s8, s2				// multiply Beta-real with C-real
	fmuls	s31, s9, s2				// multiply Beta-real with C-imag

	FMAC_BR	s26, s3, s5				// multiply beta-imag with C-imag and add
	FMAC_BI	s27, s3, s4				// multiply beta-imag with C-real and add
	FMAC_BR	s30, s3, s9				// multiply beta-imag with C-imag and add
	FMAC_BI	s31, s3, s8				// multiply beta-imag with C-real and add

	FMAC_R1 s26 , s0 , s18
	FMAC_I1 s27 , s0 , s19
	FMAC_R2 s26 , s1 , s19
	FMAC_I2	s27 , s1 , s18

	FMAC_R1 s30, s0 , s22
	FMAC_I1 s31, s0 , s23
	FMAC_R2 s30, s1 , s23
	FMAC_I2	s31, s1 , s22

	mov	CO1, r4					// restore pointer
	mov	CO2, r2					// restore pointer
	fstmias CO1, { s24 - s25 }
	fstmias CO2, { s28 - s29 }
	add	CO1, CO1, r3
	add	CO2, CO2, r3

	fstmias CO1, { s26 - s27 }
	fstmias CO2, { s30 - s31 }

.endm


/**************************************************************************************
* End of macro definitions
**************************************************************************************/

	.arm
	.global REALNAME
	.func   REALNAME

REALNAME:

	push	{r4 - r9, fp}				// save register
	add	fp, sp, #28				// add number of saved register multiplied by size of int
	sub	sp, sp, #STACKSIZE			// reserve stack

	mov	AO, OLD_A				// pointer matrix A
	mov	BO, OLD_B				// pointer matrix B

	sub	r3, fp, #128
	vstm	r3, { s8 - s31} 			// store floating point registers

	ldr	r2, OLD_C				// pointer matrix C
	ldr	r3, OLD_CSC				// Col stride size of C
	lsl	r3, r3, #3				// multiply with size of complex float

	mov	CO1, r2					// first line of C
	add	CO2, CO1, r3				// second line of C

	pld	[ CO1, #C_PRE ]				// prefetch the lines of C
	pld	[ CO2, #C_PRE ]				// prefetch the lines of C

cgemm_kernel_L2_M2_20:

	asrs	L , K, #3				// L = K / 8
	cmp	L , #2
	blt	cgemm_kernel_L2_M2_32			// fewer than two blocks of 8

	// First block of 8 k-steps.
	KERNEL2x2_I
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	subs	L, L, #2
	ble	cgemm_kernel_L2_M2_22a
	.align 5

cgemm_kernel_L2_M2_22:					// middle blocks of 8 k-steps

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	subs	L, L, #1
	bgt	cgemm_kernel_L2_M2_22

cgemm_kernel_L2_M2_22a:					// last block of 8 k-steps

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_E

	b	 cgemm_kernel_L2_M2_44

cgemm_kernel_L2_M2_32:					// L is 0 or 1 here

	tst	L, #1
	ble	cgemm_kernel_L2_M2_40

	// Exactly one block of 8 k-steps.
	KERNEL2x2_I
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_E

	b	 cgemm_kernel_L2_M2_44

cgemm_kernel_L2_M2_40:					// no full block: start from zero

	INIT2x2

cgemm_kernel_L2_M2_44:

	ands	L , K, #7				// L = K % 8
	ble	cgemm_kernel_L2_M2_100

cgemm_kernel_L2_M2_46:

	KERNEL2x2_SUB

	subs	L, L, #1
	bne	cgemm_kernel_L2_M2_46

cgemm_kernel_L2_M2_100:

	SAVE2x2

cgemm_kernel_L999:

	sub	r3, fp, #128
	vldm	r3, { s8 - s31}				// restore floating point registers

	sub	sp, fp, #28
	pop	{r4 - r9, fp}
	bx	lr

cython-blis-0.9.1/blis/_src/kernels/armv7a/3/bli_dgemm_armv7a_asm_4x4.S000066400000000000000000000270251427272030600255250ustar00rootroot00000000000000
#define REALNAME bli_dgemm_armv7a_ker_4x4

#define STACKSIZE 256

#define	K		r0
#define	PTR_ALPHA	r1
#define	OLD_A		r2
#define	OLD_B		r3
#define PTR_BETA	[fp, #0 ]
#define OLD_C		[fp, #4 ]
#define OLD_RSC		[fp, #8 ]
#define OLD_CSC		[fp, #12 ]
#define AUX		[fp, #16 ]

/****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * register *******************************************************/ #define L r2 #define AO r5 #define BO r6 #define CO1 r7 #define CO2 r8 #define CO3 r9 #define CO4 r12 #define A_PRE 96 #define B_PRE 96 #define C_PRE 0 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro INIT4x4 vsub.f64 d16 , d16 , d16 vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 vmov.f64 d22, d16 vmov.f64 d23, d16 vmov.f64 d24, d16 vmov.f64 d25, d16 vmov.f64 d26, d16 vmov.f64 d27, d16 vmov.f64 d28, d16 vmov.f64 d29, d16 vmov.f64 d30, d16 vmov.f64 d31, d16 .endm .macro KERNEL4x4_I pld [ BO , #B_PRE ] fldd d8 , [ BO ] fldd d0 , [ AO ] pld [ AO , #A_PRE ] fldd d1 , [ AO, #8 ] fmuld d16 , d0, d8 fldd d2 , [ AO, #16 ] fmuld d17 , d1, d8 fldd d3 , [ AO, #24 ] fmuld d18 , d2, d8 fldd d9 , [ BO, #8 ] fmuld d19 , d3, d8 fldd d10, [ BO, #16 ] fmuld d20 , d0, d9 fldd d11, [ BO, #24 ] fmuld d21 , d1, d9 add BO , BO, #32 add AO , AO, #32 fmuld d22 , d2, d9 pld [ BO , #B_PRE ] fldd d12, [ BO ] fmuld d23 , d3, d9 pld [ AO , #A_PRE ] fldd d4 , [ AO, #0 ] fmuld d24 , d0, d10 fldd d5 , [ AO, #8 ] fmuld d25 , d1, d10 fldd d6 , [ AO, #16 ] fmuld d26 , d2, d10 fldd d7 , [ AO, #24 ] fmuld d27 , d3, d10 fldd d13, [ BO, #8 ] fmuld d28 , d0, d11 fldd d14, [ BO, #16 ] fmuld d29 , d1, d11 fldd d15, [ BO, #24 ] fmuld d30 , d2, d11 fmuld d31 , d3, d11 .endm .macro KERNEL4x4_M2 fmacd d16 , d4, d12 pld [ AO , #A_PRE+32 ] fmacd d17 , d5, d12 fldd d0 , [ AO , #32 ] fmacd d18 , d6, d12 pld [ BO , #B_PRE+32 ] fmacd d19 , d7, d12 fldd d8 , [ BO , #32 ] fmacd d20 , d4, d13 fldd d1 , [ AO, #40 ] fmacd d21 , d5, d13 fldd d2 , [ AO, #48 ] fmacd d22 , d6, d13 fldd d3 , [ AO, #56 ] fmacd d23 , d7, d13 fmacd d24 
, d4, d14 fmacd d25 , d5, d14 fldd d9 , [ BO, #40 ] fmacd d26 , d6, d14 fldd d10, [ BO, #48 ] fmacd d27 , d7, d14 fldd d11, [ BO, #56 ] fmacd d28 , d4, d15 fmacd d29 , d5, d15 add AO , AO, #64 fmacd d30 , d6, d15 add BO , BO, #64 fmacd d31 , d7, d15 .endm .macro KERNEL4x4_M1 fmacd d16 , d0, d8 pld [ AO , #A_PRE ] fmacd d17 , d1, d8 fldd d4 , [ AO ] fmacd d18 , d2, d8 pld [ BO , #B_PRE ] fmacd d19 , d3, d8 fldd d12, [ BO ] fmacd d20 , d0, d9 fldd d5 , [ AO, #8 ] fmacd d21 , d1, d9 fldd d6 , [ AO, #16 ] fmacd d22 , d2, d9 fldd d7 , [ AO, #24 ] fmacd d23 , d3, d9 fmacd d24 , d0, d10 fmacd d25 , d1, d10 fldd d13, [ BO, #8 ] fmacd d26 , d2, d10 fldd d14, [ BO, #16 ] fmacd d27 , d3, d10 fldd d15, [ BO, #24 ] fmacd d28 , d0, d11 fmacd d29 , d1, d11 fmacd d30 , d2, d11 fmacd d31 , d3, d11 .endm .macro KERNEL4x4_E fmacd d16 , d4, d12 fmacd d17 , d5, d12 add BO , BO, #32 fmacd d18 , d6, d12 add AO , AO, #32 fmacd d19 , d7, d12 fmacd d20 , d4, d13 fmacd d21 , d5, d13 fmacd d22 , d6, d13 fmacd d23 , d7, d13 fmacd d24 , d4, d14 fmacd d25 , d5, d14 fmacd d26 , d6, d14 fmacd d27 , d7, d14 fmacd d28 , d4, d15 fmacd d29 , d5, d15 fmacd d30 , d6, d15 fmacd d31 , d7, d15 .endm .macro KERNEL4x4_SUB fldd d8 , [ BO ] pld [ BO , #B_PRE ] fldd d0 , [ AO ] pld [ AO , #A_PRE ] fldd d1 , [ AO, #8 ] fmacd d16 , d0, d8 fldd d2 , [ AO, #16 ] fmacd d17 , d1, d8 fldd d3 , [ AO, #24 ] fmacd d18 , d2, d8 fldd d9 , [ BO, #8 ] fmacd d19 , d3, d8 fldd d10, [ BO, #16 ] fmacd d20 , d0, d9 fldd d11, [ BO, #24 ] fmacd d21 , d1, d9 fmacd d22 , d2, d9 fmacd d23 , d3, d9 fmacd d24 , d0, d10 fmacd d25 , d1, d10 fmacd d26 , d2, d10 fmacd d27 , d3, d10 fmacd d28 , d0, d11 fmacd d29 , d1, d11 add AO , AO, #32 fmacd d30 , d2, d11 add BO , BO, #32 fmacd d31 , d3, d11 .endm .macro SAVE4x4 ldr r3, OLD_RSC // Row stride size lsl r3, r3, #3 // multiply with size of double fldd d0, [ PTR_ALPHA ] // load alpha ldr r4, PTR_BETA fldd d1, [ r4 ] // load beta //----------------------------------------------------------- mov 
r2, CO1 // save pointer mov r4, CO2 // save pointer fldd d8, [ CO1 ] // load value from C fldd d12, [ CO2 ] // load value from C fmuld d8, d8, d1 // multiply with beta add CO1, CO1, r3 // compute next pointer fmacd d8, d0, d16 // multiply sum with alpha and add to value of C add CO2, CO2, r3 // compute next pointer fldd d9, [ CO1 ] // load value from C fldd d13, [ CO2 ] // load value from C fmuld d9, d9, d1 // multiply with beta add CO1, CO1, r3 // compute next pointer fmacd d9, d0, d17 // multiply sum with alpha and add to value of C add CO2, CO2, r3 // compute next pointer fldd d10, [ CO1 ] // load value from C fldd d14, [ CO2 ] // load value from C fmuld d10, d10, d1 // multiply with beta add CO1, CO1, r3 // compute next pointer fmacd d10, d0, d18 // multiply sum with alpha and add to value of C add CO2, CO2, r3 // compute next pointer fldd d11, [ CO1 ] // load value from C fldd d15, [ CO2 ] // load value from C fmuld d11, d11, d1 // multiply with beta mov CO1, r2 // restore pointer fmacd d11, d0, d19 // multiply sum with alpha and add to value of C mov CO2, r4 // restore pointer fstd d8, [ CO1 ] // store value in C add CO1 , CO1, r3 // compute next pointer fstd d9, [ CO1 ] // store value in C add CO1 , CO1, r3 // compute next pointer fstd d10, [ CO1 ] // store value in C add CO1 , CO1, r3 // compute next pointer fstd d11, [ CO1 ] // store value in C //----------------------------------------------------------- mov r2, CO3 // save pointer fldd d8, [ CO3 ] // load value from C fmuld d12, d12, d1 // multiply with beta add CO3, CO3, r3 // compute next pointer fmacd d12, d0, d20 // multiply sum with alpha and add to value of C fldd d9, [ CO3 ] // load value from C fmuld d13, d13, d1 // multiply with beta add CO3, CO3, r3 // compute next pointer fmacd d13, d0, d21 // multiply sum with alpha and add to value of C fldd d10, [ CO3 ] // load value from C fmuld d14, d14, d1 // multiply with beta add CO3, CO3, r3 // compute next pointer fmacd d14, d0, d22 // multiply sum 
with alpha and add to value of C fldd d11, [ CO3 ] // load value from C fmuld d15, d15, d1 // multiply with beta mov CO3, r2 // restore pointer fmacd d15, d0, d23 // multiply sum with alpha and add to value of C fstd d12, [ CO2 ] // store value in C add CO2 , CO2, r3 // compute next pointer fstd d13, [ CO2 ] // store value in C add CO2 , CO2, r3 // compute next pointer fstd d14, [ CO2 ] // store value in C add CO2 , CO2, r3 // compute next pointer fstd d15, [ CO2 ] // store value in C //----------------------------------------------------------- mov r4, CO4 // save pointer fldd d12, [ CO4 ] // load value from C fmuld d8, d8, d1 // multiply with beta add CO4, CO4, r3 // compute next pointer fmacd d8, d0, d24 // multiply sum with alpha and add to value of C fldd d13, [ CO4 ] // load value from C fmuld d9, d9, d1 // multiply with beta add CO4, CO4, r3 // compute next pointer fmacd d9, d0, d25 // multiply sum with alpha and add to value of C fldd d14, [ CO4 ] // load value from C fmuld d10, d10, d1 // multiply with beta add CO4, CO4, r3 // compute next pointer fmacd d10, d0, d26 // multiply sum with alpha and add to value of C fldd d15, [ CO4 ] // load value from C fmuld d11, d11, d1 // multiply with beta mov CO4, r4 // restore pointer fmacd d11, d0, d27 // multiply sum with alpha and add to value of C //----------------------------------------------------------- fstd d8, [ CO3 ] // store value in C fmuld d12, d12, d1 // multiply with beta add CO3 , CO3, r3 // compute next pointer fmacd d12, d0, d28 // multiply sum with alpha and add to value of C fstd d9, [ CO3 ] // store value in C fmuld d13, d13, d1 // multiply with beta add CO3 , CO3, r3 // compute next pointer fmacd d13, d0, d29 // multiply sum with alpha and add to value of C fstd d10, [ CO3 ] // store value in C fmuld d14, d14, d1 // multiply with beta add CO3 , CO3, r3 // compute next pointer fmacd d14, d0, d30 // multiply sum with alpha and add to value of C fstd d11, [ CO3 ] // store value in C fmuld d15, 
d15, d1				// multiply with beta
	fstd	d12, [ CO4 ]			// store value in C
	fmacd	d15, d0, d31			// multiply sum with alpha and add to value of C
	add	CO4 , CO4, r3			// compute next pointer
	fstd	d13, [ CO4 ]			// store value in C
	add	CO4 , CO4, r3			// compute next pointer
	fstd	d14, [ CO4 ]			// store value in C
	add	CO4 , CO4, r3			// compute next pointer
	fstd	d15, [ CO4 ]			// store value in C

.endm

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

// ----------------------------------------------------------------------------------
// REALNAME: entry point of the double-precision 4x4 micro-kernel.
//
// Flow, as implemented below:
//   1. Save r4-r9 and fp, point fp just above the 7 saved words (sp + 28),
//      reserve STACKSIZE bytes and spill d8-d15 at [fp, #-128].
//   2. Copy the A/B pointers into AO/BO, load the C pointer and the column
//      stride (scaled by 8, the size of a double) and derive the four line
//      pointers CO1..CO4; issue one prefetch hint per line.
//   3. L = K / 8: run blocks of eight KERNEL4x4_* steps. The first block starts
//      with KERNEL4x4_I and the last block ends with KERNEL4x4_E. When K < 8
//      no block runs and INIT4x4 is expanded instead.
//   4. L = K % 8: run the remaining steps one at a time with KERNEL4x4_SUB.
//   5. SAVE4x4 combines the accumulators with alpha, beta and C and stores the
//      result; then d8-d15, sp and the saved core registers are restored.
// ----------------------------------------------------------------------------------

	.arm
	.global	REALNAME
	.func	REALNAME

REALNAME:

	push	{r4 - r9, fp}			// save register
	add	fp, sp, #28			// add number of saved register multiplied by size of int
	sub	sp, sp, #STACKSIZE		// reserve stack

	mov	AO, OLD_A			// pointer matrix A
	mov	BO, OLD_B			// pointer matrix B

	sub	r3, fp, #128
	vstm	r3, { d8 - d15}			// store floating point registers

	ldr	r2, OLD_C			// pointer matrix C
	ldr	r3, OLD_CSC			// Col stride size of C
	lsl	r3, r3, #3			// multiply with size of double

	mov	CO1, r2				// first line of C
	add	CO2, CO1, r3			// second line of C
	add	CO3, CO2, r3			// third line of C
	add	CO4, CO3, r3			// fourth line of C

	pld	[ CO1, #C_PRE ]			// prefetch the first line of C
	pld	[ CO2, #C_PRE ]			// prefetch the second line of C
	pld	[ CO3, #C_PRE ]			// prefetch the third line of C
	pld	[ CO4, #C_PRE ]			// prefetch the fourth line of C (CO3 used to be hinted twice)

dgemm_kernel_L4_M4_20:

	asrs	L , K, #3			// L = K / 8
	cmp	L , #2
	blt	dgemm_kernel_L4_M4_32		// fewer than two blocks of 8: take the short path

	// First block of 8 steps: KERNEL4x4_I opens the software pipeline.
	KERNEL4x4_I
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_M2

	KERNEL4x4_M1
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_M2

	subs	L, L, #2			// first and last block are handled outside the loop
	ble	dgemm_kernel_L4_M4_22a
	.align 5

dgemm_kernel_L4_M4_22:

	// Steady-state block of 8 steps, executed once per remaining count in L.
	KERNEL4x4_M1
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_M2

	KERNEL4x4_M1
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_M2

	subs	L, L, #1
	bgt	dgemm_kernel_L4_M4_22

dgemm_kernel_L4_M4_22a:

	// Last block of 8 steps: KERNEL4x4_E ends it without loading further operands.
	KERNEL4x4_M1
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_M2

	KERNEL4x4_M1
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_E

	b	dgemm_kernel_L4_M4_44

dgemm_kernel_L4_M4_32:

	tst	L, #1				// L is 0 or 1 here
	ble	dgemm_kernel_L4_M4_40		// L == 0: no full block of 8

	// Exactly one block of 8 steps: opened by _I and closed by _E.
	KERNEL4x4_I
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_M2

	KERNEL4x4_M1
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_E

	b	dgemm_kernel_L4_M4_44

dgemm_kernel_L4_M4_40:

	INIT4x4					// no block ran; INIT4x4 is defined earlier in this file

dgemm_kernel_L4_M4_44:

	ands	L , K, #7			// L = K % 8
	ble	dgemm_kernel_L4_M4_100

dgemm_kernel_L4_M4_46:

	KERNEL4x4_SUB				// one remaining step per iteration

	subs	L, L, #1
	bne	dgemm_kernel_L4_M4_46

dgemm_kernel_L4_M4_100:

	SAVE4x4

dgemm_kernel_L999:

	sub	r3, fp, #128
	vldm	r3, { d8 - d15}			// restore floating point registers

	sub	sp, fp, #28			// back to the stack pointer right after the push
	pop	{r4 - r9, fp}
	bx	lr

cython-blis-0.9.1/blis/_src/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c000066400000000000000000000126021427272030600255400ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" extern void bli_sgemm_armv7a_ker_4x4 ( uint32_t k, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, uint32_t rs_c, uint32_t cs_c, auxinfo_t* restrict data ); void bli_sgemm_armv7a_asm_4x4 ( dim_t m, dim_t n, dim_t k, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Single-precision 4x4 wrapper: calls the assembly kernel declared above between the // SETUP_CT/FLUSH_CT macros; k, rs_c and cs_c convert implicitly to its uint32_t parameters (no local copies). GEMM_UKR_SETUP_CT_ANY( s, 4, 4, false ); bli_sgemm_armv7a_ker_4x4( k, alpha, a, b, beta, c, rs_c, cs_c, data ); GEMM_UKR_FLUSH_CT( s ); } extern void bli_dgemm_armv7a_ker_4x4 ( uint32_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, uint32_t rs_c, uint32_t cs_c, auxinfo_t* restrict data ); void bli_dgemm_armv7a_asm_4x4 ( dim_t m, dim_t n, dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Double-precision 4x4 wrapper: calls the assembly kernel declared above between the // SETUP_CT/FLUSH_CT macros; k, rs_c and cs_c convert implicitly to its uint32_t parameters (no local copies).
GEMM_UKR_SETUP_CT_ANY( d, 4, 4, false ); bli_dgemm_armv7a_ker_4x4( k, alpha, a, b, beta, c, rs_c, cs_c, data ); GEMM_UKR_FLUSH_CT( d ); } extern void bli_cgemm_armv7a_ker_2x2 ( uint32_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, uint32_t rs_c, uint32_t cs_c, auxinfo_t* restrict data ); void bli_cgemm_armv7a_asm_2x2 ( dim_t m, dim_t n, dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Single-precision complex 2x2 wrapper: calls the assembly kernel declared above between the // SETUP_CT/FLUSH_CT macros; k, rs_c and cs_c convert implicitly to its uint32_t parameters (no local copies). GEMM_UKR_SETUP_CT_ANY( c, 2, 2, false ); bli_cgemm_armv7a_ker_2x2( k, alpha, a, b, beta, c, rs_c, cs_c, data ); GEMM_UKR_FLUSH_CT( c ); } extern void bli_zgemm_armv7a_ker_2x2 ( uint32_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, uint32_t rs_c, uint32_t cs_c, auxinfo_t* restrict data ); void bli_zgemm_armv7a_asm_2x2 ( dim_t m, dim_t n, dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Double-precision complex 2x2 wrapper: calls the assembly kernel declared above between the // SETUP_CT/FLUSH_CT macros; k, rs_c and cs_c convert implicitly to its uint32_t parameters (no local copies). GEMM_UKR_SETUP_CT_ANY( z, 2, 2, false ); bli_zgemm_armv7a_ker_2x2( k, alpha, a, b, beta, c, rs_c, cs_c, data ); GEMM_UKR_FLUSH_CT( z ); } cython-blis-0.9.1/blis/_src/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c000066400000000000000000000302561427272030600255570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "arm_neon.h" void bli_sgemm_armv7a_int_4x4 ( dim_t m, dim_t n, dim_t k, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint32_t k_iter = k / 4; uint32_t k_left = k % 4; uint32_t rs_c = rs_c0; uint32_t cs_c = cs_c0; uint32_t i; GEMM_UKR_SETUP_CT( s, 4, 4, false ); void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); float32x4_t alphav; alphav = vmovq_n_f32( *alpha ); float32x4_t av1; float32x4_t av2; float32x4_t av3; float32x4_t av4; float32x4_t bv1; float32x4_t bv2; float32x4_t bv3; float32x4_t bv4; // Vector for column 0 float32x4_t cv0; // Vector for column 1 float32x4_t cv1; // Vector for column 2 float32x4_t cv2; // Vector for column 3 float32x4_t cv3; if ( *beta != 0.0F ) { // Load column 0 cv0 = vld1q_f32( c + 0*cs_c ); // Load column 1 cv1 = vld1q_f32( c + 1*cs_c ); // Load column 2 cv2 = vld1q_f32( c + 2*cs_c ); // Load column 3 cv3 = vld1q_f32( c + 3*cs_c ); } else { cv0 = vmovq_n_f32( 0.0 ); cv1 = vmovq_n_f32( 0.0 ); cv2 = vmovq_n_f32( 0.0 ); cv3 = vmovq_n_f32( 0.0 ); } // Vector for accummulating column 0 float32x4_t abv0; // Initialize vector to 0.0 abv0 = vmovq_n_f32( 0.0 ); // Vector for accummulating column 1 float32x4_t abv1; // Initialize vector to 0.0 abv1 = vmovq_n_f32( 0.0 ); // Vector for accummulating column 2 float32x4_t abv2; // Initialize vector to 0.0 abv2 = vmovq_n_f32( 0.0 ); // Vector for accummulating column 3 float32x4_t abv3; // Initialize vector to 0.0 abv3 = vmovq_n_f32( 0.0 ); for ( i = 0; i < k_iter; ++i ) { // Begin iter 0 av1 = vld1q_f32( a ); __builtin_prefetch( a + 224 ); __builtin_prefetch( b + 224 ); bv1 = vld1q_f32( b ); abv0 = vmlaq_lane_f32( abv0, av1, vget_low_f32(bv1), 0 ); abv1 = vmlaq_lane_f32( abv1, av1, vget_low_f32(bv1), 1 ); abv2 = vmlaq_lane_f32( abv2, av1, vget_high_f32(bv1), 0 ); abv3 = vmlaq_lane_f32( abv3, av1, vget_high_f32(bv1), 1 ); av2 = vld1q_f32( a+4 ); //__builtin_prefetch( a + 116 ); //__builtin_prefetch( b + 116 ); bv2 = vld1q_f32( b+4 ); abv0 = vmlaq_lane_f32( abv0, av2, vget_low_f32(bv2), 0 ); abv1 = vmlaq_lane_f32( abv1, av2, vget_low_f32(bv2), 1 ); abv2 = vmlaq_lane_f32( abv2, 
av2, vget_high_f32(bv2), 0 ); abv3 = vmlaq_lane_f32( abv3, av2, vget_high_f32(bv2), 1 ); av3 = vld1q_f32( a+8 ); //__builtin_prefetch( a + 120 ); //__builtin_prefetch( b + 120 ); bv3 = vld1q_f32( b+8 ); abv0 = vmlaq_lane_f32( abv0, av3, vget_low_f32(bv3), 0 ); abv1 = vmlaq_lane_f32( abv1, av3, vget_low_f32(bv3), 1 ); abv2 = vmlaq_lane_f32( abv2, av3, vget_high_f32(bv3), 0 ); abv3 = vmlaq_lane_f32( abv3, av3, vget_high_f32(bv3), 1 ); av4 = vld1q_f32( a+12); //__builtin_prefetch( a + 124 ); //__builtin_prefetch( b + 124 ); bv4 = vld1q_f32( b+12); abv0 = vmlaq_lane_f32( abv0, av4, vget_low_f32(bv4), 0 ); abv1 = vmlaq_lane_f32( abv1, av4, vget_low_f32(bv4), 1 ); abv2 = vmlaq_lane_f32( abv2, av4, vget_high_f32(bv4), 0 ); abv3 = vmlaq_lane_f32( abv3, av4, vget_high_f32(bv4), 1 ); a += 16; b += 16; } for ( i = 0; i < k_left; ++i ) { av1 = vld1q_f32( a ); __builtin_prefetch( a + 112 ); __builtin_prefetch( b + 112 ); bv1 = vld1q_f32( b ); abv0 = vmlaq_lane_f32( abv0, av1, vget_low_f32(bv1), 0 ); abv1 = vmlaq_lane_f32( abv1, av1, vget_low_f32(bv1), 1 ); abv2 = vmlaq_lane_f32( abv2, av1, vget_high_f32(bv1), 0 ); abv3 = vmlaq_lane_f32( abv3, av1, vget_high_f32(bv1), 1 ); a += 4; b += 4; } __builtin_prefetch( a_next ); __builtin_prefetch( b_next ); if ( *beta != 0.0F ) { // Multiply C by beta and then accumulate alpha * A * B. cv0 = vmulq_n_f32( cv0, *beta ); cv1 = vmulq_n_f32( cv1, *beta ); cv2 = vmulq_n_f32( cv2, *beta ); cv3 = vmulq_n_f32( cv3, *beta ); cv0 = vmlaq_f32( cv0, abv0, alphav ); cv1 = vmlaq_f32( cv1, abv1, alphav ); cv2 = vmlaq_f32( cv2, abv2, alphav ); cv3 = vmlaq_f32( cv3, abv3, alphav ); } else { // Since beta = 0, skip straight to accumulating alpha * A * B. // Note: C (cv?) was initialized to zero above. 
cv0 = vmlaq_f32( cv0, abv0, alphav ); cv1 = vmlaq_f32( cv1, abv1, alphav ); cv2 = vmlaq_f32( cv2, abv2, alphav ); cv3 = vmlaq_f32( cv3, abv3, alphav ); } // Store column 0 vst1q_f32( c + 0*cs_c, cv0 ); // Store column 1 vst1q_f32( c + 1*cs_c, cv1 ); // Store column 2 vst1q_f32( c + 2*cs_c, cv2 ); // Store column 3 vst1q_f32( c + 3*cs_c, cv3 ); GEMM_UKR_FLUSH_CT( s ); } void bli_dgemm_armv7a_int_4x4 ( dim_t m, dim_t n, dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. //uint32_t k_iter = k0 / 4; uint32_t k_left = k % 4; uint32_t rs_c = rs_c0; uint32_t cs_c = cs_c0; uint32_t i; GEMM_UKR_SETUP_CT_ANY( d, 4, 4, false ); //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); register double a0; register double a1; register double a2; register double a3; register double A0; register double A1; register double A2; register double A3; double b0, b1, b2, b3; double B0, B1, B2, B3; double ab00, ab01, ab02, ab03; double ab10, ab11, ab12, ab13; double ab20, ab21, ab22, ab23; double ab30, ab31, ab32, ab33; double* restrict c00, * restrict c01, * restrict c02, * restrict c03; double* restrict c10, * restrict c11, * restrict c12, * restrict c13; double* restrict c20, * restrict c21, * restrict c22, * restrict c23; double* restrict c30, * restrict c31, * restrict c32, * restrict c33; double* restrict ap = a; double* restrict bp = b; double* restrict Ap = a + 4; double* restrict Bp = b + 4; c00 = (c + 0*rs_c + 0*cs_c); c10 = (c + 1*rs_c + 0*cs_c); c20 = (c + 2*rs_c + 0*cs_c); c30 = (c + 3*rs_c + 0*cs_c); c01 = (c + 0*rs_c + 1*cs_c); c11 = (c + 1*rs_c + 1*cs_c); c21 = (c + 2*rs_c + 1*cs_c); c31 = (c + 3*rs_c + 1*cs_c); c02 = (c + 0*rs_c + 2*cs_c); c12 = (c + 1*rs_c + 2*cs_c); c22 = (c 
+ 2*rs_c + 2*cs_c); c32 = (c + 3*rs_c + 2*cs_c); c03 = (c + 0*rs_c + 3*cs_c); c13 = (c + 1*rs_c + 3*cs_c); c23 = (c + 2*rs_c + 3*cs_c); c33 = (c + 3*rs_c + 3*cs_c); ab00 = 0.0; ab10 = 0.0; ab20 = 0.0; ab30 = 0.0; ab01 = 0.0; ab11 = 0.0; ab21 = 0.0; ab31 = 0.0; ab02 = 0.0; ab12 = 0.0; ab22 = 0.0; ab32 = 0.0; ab03 = 0.0; ab13 = 0.0; ab23 = 0.0; ab33 = 0.0; A0 = *(Ap + 0); A1 = *(Ap + 1); A2 = *(Ap + 2); A3 = *(Ap + 3); a0 = *(ap + 0); a1 = *(ap + 1); a2 = *(ap + 2); B0 = *(Bp + 0); B1 = *(Bp + 1); B2 = *(Bp + 2); B3 = *(Bp + 3); b0 = *(bp + 0); b1 = *(bp + 1); b2 = *(bp + 2); double *Aplast = (Ap + 4*(k-k_left)); //for ( i = 0; i < k_iter; ++i ) // Unroll by factor 4. for ( ; Ap != Aplast ; ) // Unroll by factor 4. { /* Prefetch */ //__asm__ ("pld\t[%0],#100\n\t" : :"r"(Ap) : ); __builtin_prefetch( ap + 112 ); __builtin_prefetch( Ap + 112 ); __builtin_prefetch( bp + 112 ); __builtin_prefetch( Bp + 112 ); // Iteration 0. ab00 += A0 * B0; a3 = *(ap + 3); ab10 += A1 * B0; b3 = *(bp + 3); ab20 += A2 * B0; ab30 += A3 * B0; ab01 += A0 * B1; ab11 += A1 * B1; B0 = *(Bp + 8); // Prefetch. ab21 += A2 * B1; ab31 += A3 * B1; ab02 += A0 * B2; B1 = *(Bp + 9); ab12 += A1 * B2; ab22 += A2 * B2; ab32 += A3 * B2; B2 = *(Bp + 10); ab03 += A0 * B3; A0 = *(Ap + 8); // Prefetch. ab13 += A1 * B3; A1 = *(Ap + 9); // Prefetch. ab23 += A2 * B3; ab33 += A3 * B3; A2 = *(Ap + 10); // Prefetch. // Iteration 1. //__asm__ ("pld\t[%0],#200\n\t" : :"r"(Ap) : ); ab00 += a0 * b0; ab10 += a1 * b0; A3 = *(Ap + 11); // Prefetch. 
ab20 += a2 * b0; ab30 += a3 * b0; B3 = *(Bp + 11); ab01 += a0 * b1; b0 = *(bp + 8); ab11 += a1 * b1; ab21 += a2 * b1; ab31 += a3 * b1; b1 = *(bp + 9); ab02 += a0 * b2; ab12 += a1 * b2; ab22 += a2 * b2; ab32 += a3 * b2; b2 = *(bp + 10); ab03 += a0 * b3; a0 = *(ap + 8); ab13 += a1 * b3; a1 = *(ap + 9); ab23 += a2 * b3; a2 = *(ap + 10); ab33 += a3 * b3; //a3 = *(ap + 11); ap += 8; Ap += 8; bp += 8; Bp += 8; } for ( i = 0; i < k_left; ++i ) { a0 = *(ap + 0); a1 = *(ap + 1); a2 = *(ap + 2); a3 = *(ap + 3); b0 = *(bp + 0); b1 = *(bp + 1); b2 = *(bp + 2); b3 = *(bp + 3); ab00 += a0 * b0; ab10 += a1 * b0; ab20 += a2 * b0; ab30 += a3 * b0; ab01 += a0 * b1; ab11 += a1 * b1; ab21 += a2 * b1; ab31 += a3 * b1; ab02 += a0 * b2; ab12 += a1 * b2; ab22 += a2 * b2; ab32 += a3 * b2; ab03 += a0 * b3; ab13 += a1 * b3; ab23 += a2 * b3; ab33 += a3 * b3; ap += 4; bp += 4; } if ( *beta == 0.0 ) { *c00 = ab00 * *alpha; *c10 = ab10 * *alpha; *c20 = ab20 * *alpha; *c30 = ab30 * *alpha; *c01 = ab01 * *alpha; *c11 = ab11 * *alpha; *c21 = ab21 * *alpha; *c31 = ab31 * *alpha; *c02 = ab02 * *alpha; *c12 = ab12 * *alpha; *c22 = ab22 * *alpha; *c32 = ab32 * *alpha; *c03 = ab03 * *alpha; *c13 = ab13 * *alpha; *c23 = ab23 * *alpha; *c33 = ab33 * *alpha; } else { *c00 = *c00 * *beta; *c10 = *c10 * *beta; *c20 = *c20 * *beta; *c30 = *c30 * *beta; *c01 = *c01 * *beta; *c11 = *c11 * *beta; *c21 = *c21 * *beta; *c31 = *c31 * *beta; *c02 = *c02 * *beta; *c12 = *c12 * *beta; *c22 = *c22 * *beta; *c32 = *c32 * *beta; *c03 = *c03 * *beta; *c13 = *c13 * *beta; *c23 = *c23 * *beta; *c33 = *c33 * *beta; *c00 += ab00 * *alpha; *c10 += ab10 * *alpha; *c20 += ab20 * *alpha; *c30 += ab30 * *alpha; *c01 += ab01 * *alpha; *c11 += ab11 * *alpha; *c21 += ab21 * *alpha; *c31 += ab31 * *alpha; *c02 += ab02 * *alpha; *c12 += ab12 * *alpha; *c22 += ab22 * *alpha; *c32 += ab32 * *alpha; *c03 += ab03 * *alpha; *c13 += ab13 * *alpha; *c23 += ab23 * *alpha; *c33 += ab33 * *alpha; } GEMM_UKR_FLUSH_CT( d ); } 
cython-blis-0.9.1/blis/_src/kernels/armv7a/3/bli_sgemm_armv7a_asm_4x4.S000066400000000000000000000260351427272030600255440ustar00rootroot00000000000000 #define REALNAME bli_sgemm_armv7a_ker_4x4 #define STACKSIZE 256 #define K r0 #define PTR_ALPHA r1 #define OLD_A r2 #define OLD_B r3 #define PTR_BETA [fp, #0 ] #define OLD_C [fp, #4 ] #define OLD_RSC [fp, #8 ] #define OLD_CSC [fp, #12 ] #define AUX [fp, #16 ] /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * register *******************************************************/ #define L r2 #define AO r5 #define BO r6 #define CO1 r7 #define CO2 r8 #define CO3 r9 #define CO4 r12 #define A_PRE 96 #define B_PRE 96 #define C_PRE 0 /************************************************************************************** * Macro definitions **************************************************************************************/ .macro INIT4x4 vsub.f32 s16 , s16 , s16 vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 vmov.f32 s20, s16 vmov.f32 s21, s16 vmov.f32 s22, s16 vmov.f32 s23, s16 vmov.f32 s24, s16 vmov.f32 s25, s16 vmov.f32 s26, s16 vmov.f32 s27, s16 vmov.f32 s28, s16 vmov.f32 s29, s16 vmov.f32 s30, s16 vmov.f32 s31, s16 .endm .macro KERNEL4x4_I pld [ AO , #A_PRE ] fldmias AO!, { s0 - s1 } pld [ BO , #B_PRE ] fldmias BO!, { s8 - s9 } fmuls s16 , s0, s8 fldmias AO!, { s2 - s3 } fmuls s17 , s1, s8 fmuls s18 , s2, s8 fldmias BO!, { s10 - s11 } fmuls s19 , s3, s8 fmuls s20 , s0, s9 fldmias AO!, { s4 - s5 } fmuls s21 , s1, s9 fmuls s22 , s2, s9 fldmias AO!, { s6 - s7 } fmuls s23 , s3, s9 fmuls s24 , s0, s10 fldmias BO!, { s12 - s13 } fmuls s25 , s1, s10 fmuls s26 , s2, s10 fldmias BO!, { s14 - s15 } fmuls s27 , s3, s10 fmuls s28 , s0, s11 fmuls s29 , s1, s11 fmuls s30 , s2, s11 fmuls s31 , s3, s11 .endm .macro KERNEL4x4_M2 pld [ AO , #A_PRE ] fmacs s16 , s4, s12 fmacs s17 , s5, s12 fldmias AO!, { s0 - s3 } fmacs s18 , s6, s12 pld [ BO , 
#B_PRE ] fmacs s19 , s7, s12 fmacs s20 , s4, s13 fldmias BO!, { s8 - s11 } fmacs s21 , s5, s13 fmacs s22 , s6, s13 //fldmias AO!, { s2 - s3 } fmacs s23 , s7, s13 fmacs s24 , s4, s14 //fldmias BO!, { s10 - s11 } fmacs s25 , s5, s14 fmacs s26 , s6, s14 fmacs s27 , s7, s14 fmacs s28 , s4, s15 fmacs s29 , s5, s15 fmacs s30 , s6, s15 fmacs s31 , s7, s15 .endm .macro KERNEL4x4_M1 fmacs s16 , s0, s8 fldmias AO!, { s4 - s7 } fmacs s17 , s1, s8 fmacs s18 , s2, s8 fldmias BO!, { s12 - s15 } //fldmias AO!, { s6 - s7 } fmacs s19 , s3, s8 fmacs s20 , s0, s9 fmacs s21 , s1, s9 fmacs s22 , s2, s9 //fldmias BO!, { s14 - s15 } fmacs s23 , s3, s9 fmacs s24 , s0, s10 fmacs s25 , s1, s10 fmacs s26 , s2, s10 fmacs s27 , s3, s10 fmacs s28 , s0, s11 fmacs s29 , s1, s11 fmacs s30 , s2, s11 fmacs s31 , s3, s11 .endm .macro KERNEL4x4_E fmacs s16 , s4, s12 fmacs s17 , s5, s12 fmacs s18 , s6, s12 fmacs s19 , s7, s12 fmacs s20 , s4, s13 fmacs s21 , s5, s13 fmacs s22 , s6, s13 fmacs s23 , s7, s13 fmacs s24 , s4, s14 fmacs s25 , s5, s14 fmacs s26 , s6, s14 fmacs s27 , s7, s14 fmacs s28 , s4, s15 fmacs s29 , s5, s15 fmacs s30 , s6, s15 fmacs s31 , s7, s15 .endm .macro KERNEL4x4_SUB flds s8 , [ BO ] flds s0 , [ AO ] flds s1 , [ AO, #4 ] fmacs s16 , s0, s8 flds s2 , [ AO, #8 ] fmacs s17 , s1, s8 flds s3 , [ AO, #12 ] fmacs s18 , s2, s8 flds s9 , [ BO, #4 ] fmacs s19 , s3, s8 flds s10, [ BO, #8 ] fmacs s20 , s0, s9 flds s11, [ BO, #12 ] fmacs s21 , s1, s9 fmacs s22 , s2, s9 fmacs s23 , s3, s9 fmacs s24 , s0, s10 fmacs s25 , s1, s10 fmacs s26 , s2, s10 fmacs s27 , s3, s10 fmacs s28 , s0, s11 fmacs s29 , s1, s11 add AO , AO, #16 fmacs s30 , s2, s11 add BO , BO, #16 fmacs s31 , s3, s11 .endm .macro SAVE4x4 ldr r3, OLD_RSC // Row stride size lsl r3, r3, #2 // multiply with size of float flds s0, [ PTR_ALPHA ] // load alpha ldr r4, PTR_BETA flds s1, [ r4 ] // load beta //----------------------------------------------------------- mov r2, CO1 // save pointer mov r4, CO2 // save pointer flds s8, [ CO1 ] // 
load value from C flds s12, [ CO2 ] // load value from C fmuls s8, s8, s1 // multiply with beta add CO1, CO1, r3 // compute next pointer fmacs s8, s0, s16 // multiply sum with alpha and add to value of C add CO2, CO2, r3 // compute next pointer flds s9, [ CO1 ] // load value from C flds s13, [ CO2 ] // load value from C fmuls s9, s9, s1 // multiply with beta add CO1, CO1, r3 // compute next pointer fmacs s9, s0, s17 // multiply sum with alpha and add to value of C add CO2, CO2, r3 // compute next pointer flds s10, [ CO1 ] // load value from C flds s14, [ CO2 ] // load value from C fmuls s10, s10, s1 // multiply with beta add CO1, CO1, r3 // compute next pointer fmacs s10, s0, s18 // multiply sum with alpha and add to value of C add CO2, CO2, r3 // compute next pointer flds s11, [ CO1 ] // load value from C flds s15, [ CO2 ] // load value from C fmuls s11, s11, s1 // multiply with beta mov CO1, r2 // restore pointer fmacs s11, s0, s19 // multiply sum with alpha and add to value of C mov CO2, r4 // restore pointer fsts s8, [ CO1 ] // store value in C add CO1 , CO1, r3 // compute next pointer fsts s9, [ CO1 ] // store value in C add CO1 , CO1, r3 // compute next pointer fsts s10, [ CO1 ] // store value in C add CO1 , CO1, r3 // compute next pointer fsts s11, [ CO1 ] // store value in C //----------------------------------------------------------- mov r2, CO3 // save pointer flds s8, [ CO3 ] // load value from C fmuls s12, s12, s1 // multiply with beta add CO3, CO3, r3 // compute next pointer fmacs s12, s0, s20 // multiply sum with alpha and add to value of C flds s9, [ CO3 ] // load value from C fmuls s13, s13, s1 // multiply with beta add CO3, CO3, r3 // compute next pointer fmacs s13, s0, s21 // multiply sum with alpha and add to value of C flds s10, [ CO3 ] // load value from C fmuls s14, s14, s1 // multiply with beta add CO3, CO3, r3 // compute next pointer fmacs s14, s0, s22 // multiply sum with alpha and add to value of C flds s11, [ CO3 ] // load value from C 
fmuls s15, s15, s1 // multiply with beta mov CO3, r2 // restore pointer fmacs s15, s0, s23 // multiply sum with alpha and add to value of C fsts s12, [ CO2 ] // store value in C add CO2 , CO2, r3 // compute next pointer fsts s13, [ CO2 ] // store value in C add CO2 , CO2, r3 // compute next pointer fsts s14, [ CO2 ] // store value in C add CO2 , CO2, r3 // compute next pointer fsts s15, [ CO2 ] // store value in C //----------------------------------------------------------- mov r4, CO4 // save pointer flds s12, [ CO4 ] // load value from C fmuls s8, s8, s1 // multiply with beta add CO4, CO4, r3 // compute next pointer fmacs s8, s0, s24 // multiply sum with alpha and add to value of C flds s13, [ CO4 ] // load value from C fmuls s9, s9, s1 // multiply with beta add CO4, CO4, r3 // compute next pointer fmacs s9, s0, s25 // multiply sum with alpha and add to value of C flds s14, [ CO4 ] // load value from C fmuls s10, s10, s1 // multiply with beta add CO4, CO4, r3 // compute next pointer fmacs s10, s0, s26 // multiply sum with alpha and add to value of C flds s15, [ CO4 ] // load value from C fmuls s11, s11, s1 // multiply with beta mov CO4, r4 // restore pointer fmacs s11, s0, s27 // multiply sum with alpha and add to value of C //----------------------------------------------------------- fsts s8, [ CO3 ] // store value in C fmuls s12, s12, s1 // multiply with beta add CO3 , CO3, r3 // compute next pointer fmacs s12, s0, s28 // multiply sum with alpha and add to value of C fsts s9, [ CO3 ] // store value in C fmuls s13, s13, s1 // multiply with beta add CO3 , CO3, r3 // compute next pointer fmacs s13, s0, s29 // multiply sum with alpha and add to value of C fsts s10, [ CO3 ] // store value in C fmuls s14, s14, s1 // multiply with beta add CO3 , CO3, r3 // compute next pointer fmacs s14, s0, s30 // multiply sum with alpha and add to value of C fsts s11, [ CO3 ] // store value in C fmuls s15, s15, s1 // multiply with beta fsts s12, [ CO4 ] // store value in C fmacs 
s15, s0, s31			// multiply sum with alpha and add to value of C
	add	CO4 , CO4, r3			// compute next pointer
	fsts	s13, [ CO4 ]			// store value in C
	add	CO4 , CO4, r3			// compute next pointer
	fsts	s14, [ CO4 ]			// store value in C
	add	CO4 , CO4, r3			// compute next pointer
	fsts	s15, [ CO4 ]			// store value in C

.endm

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

// ----------------------------------------------------------------------------------
// REALNAME (bli_sgemm_armv7a_ker_4x4): entry point of the single-precision 4x4
// micro-kernel.
//
// Flow, as implemented below:
//   1. Save r4-r9 and fp, point fp just above the 7 saved words (sp + 28),
//      reserve STACKSIZE bytes and spill s8-s31 starting at [fp, #-128].
//   2. Copy the A/B pointers into AO/BO (OLD_A shares r2 with L, so this happens
//      before r2 is reused), load the C pointer and the column stride (scaled by
//      4, the size of a float) and derive the four line pointers CO1..CO4;
//      issue one prefetch hint per line.
//   3. L = K / 8: run blocks of eight KERNEL4x4_* steps. The first block starts
//      with KERNEL4x4_I and the last block ends with KERNEL4x4_E. When K < 8
//      no block runs and INIT4x4 zeroes the accumulators s16-s31 instead.
//   4. L = K % 8: run the remaining steps one at a time with KERNEL4x4_SUB.
//   5. SAVE4x4 scales C by beta, adds alpha times the accumulators and stores
//      the result; then s8-s31, sp and the saved core registers are restored.
// ----------------------------------------------------------------------------------

	.arm
	.global	REALNAME
	.func	REALNAME

REALNAME:

	push	{r4 - r9, fp}			// save register
	add	fp, sp, #28			// add number of saved register multiplied by size of int
	sub	sp, sp, #STACKSIZE		// reserve stack

	mov	AO, OLD_A			// pointer matrix A
	mov	BO, OLD_B			// pointer matrix B

	sub	r3, fp, #128
	vstm	r3, { s8 - s31 }		// store floating point registers

	ldr	r2, OLD_C			// pointer matrix C
	ldr	r3, OLD_CSC			// Col stride size of C
	lsl	r3, r3, #2			// multiply with size of float

	mov	CO1, r2				// first line of C
	add	CO2, CO1, r3			// second line of C
	add	CO3, CO2, r3			// third line of C
	add	CO4, CO3, r3			// fourth line of C

	pld	[ CO1, #C_PRE ]			// prefetch the first line of C
	pld	[ CO2, #C_PRE ]			// prefetch the second line of C
	pld	[ CO3, #C_PRE ]			// prefetch the third line of C
	pld	[ CO4, #C_PRE ]			// prefetch the fourth line of C (CO3 used to be hinted twice)

sgemm_kernel_L4_M4_20:

	asrs	L , K, #3			// L = K / 8
	cmp	L , #2
	blt	sgemm_kernel_L4_M4_32		// fewer than two blocks of 8: take the short path

	// First block of 8 steps: KERNEL4x4_I opens the software pipeline.
	KERNEL4x4_I
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_M2

	KERNEL4x4_M1
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_M2

	subs	L, L, #2			// first and last block are handled outside the loop
	ble	sgemm_kernel_L4_M4_22a
	.align 5

sgemm_kernel_L4_M4_22:

	// Steady-state block of 8 steps, executed once per remaining count in L.
	KERNEL4x4_M1
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_M2

	KERNEL4x4_M1
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_M2

	subs	L, L, #1
	bgt	sgemm_kernel_L4_M4_22

sgemm_kernel_L4_M4_22a:

	// Last block of 8 steps: KERNEL4x4_E only consumes operands already loaded.
	KERNEL4x4_M1
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_M2

	KERNEL4x4_M1
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_E

	b	sgemm_kernel_L4_M4_44

sgemm_kernel_L4_M4_32:

	tst	L, #1				// L is 0 or 1 here
	ble	sgemm_kernel_L4_M4_40		// L == 0: no full block of 8

	// Exactly one block of 8 steps: opened by _I and closed by _E.
	KERNEL4x4_I
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_M2

	KERNEL4x4_M1
	KERNEL4x4_M2
	KERNEL4x4_M1
	KERNEL4x4_E

	b	sgemm_kernel_L4_M4_44

sgemm_kernel_L4_M4_40:

	INIT4x4					// no block ran: clear the accumulators

sgemm_kernel_L4_M4_44:

	ands	L , K, #7			// L = K % 8
	ble	sgemm_kernel_L4_M4_100

sgemm_kernel_L4_M4_46:

	KERNEL4x4_SUB				// one remaining step per iteration

	subs	L, L, #1
	bne	sgemm_kernel_L4_M4_46

sgemm_kernel_L4_M4_100:

	SAVE4x4

sgemm_kernel_L999:

	sub	r3, fp, #128
	vldm	r3, { s8 - s31 }		// restore floating point registers

	sub	sp, fp, #28			// back to the stack pointer right after the push
	pop	{r4 - r9, fp}
	bx	lr

cython-blis-0.9.1/blis/_src/kernels/armv7a/3/bli_zgemm_armv7a_asm_2x2.S000066400000000000000000000234631427272030600255510ustar00rootroot00000000000000 #define REALNAME bli_zgemm_armv7a_ker_2x2 #define STACKSIZE 256 #define K r0 #define PTR_ALPHA r1 #define OLD_A r2 #define OLD_B r3 #define PTR_BETA [fp, #0 ] #define OLD_C [fp, #4 ] #define OLD_RSC [fp, #8 ] #define OLD_CSC [fp, #12 ] #define AUX [fp, #16 ] /****************************************************** * [fp, #-128] - [fp, #-64] is reserved * for store and restore of floating point * register *******************************************************/ #define L r2 #define AO r5 #define BO r6 #define CO1 r7 #define CO2 r8 #define A_PRE 96 #define B_PRE 96 #define C_PRE 0 /************************************************************************************** * Macro definitions **************************************************************************************/ #define FMAC_BR fnmacd #define FMAC_BI fmacd #define NN 1 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define FADD_R fsubd #define FADD_I faddd #define FMAC_R1 fnmacd #define FMAC_R2 fnmacd #define FMAC_I1 fmacd #define FMAC_I2 fnmacd #elif defined(CN) || defined(CT) #define FADD_R faddd #define FADD_I fsubd #define FMAC_R1 fmacd #define FMAC_R2 fmacd #define FMAC_I1 fnmacd #define FMAC_I2 fmacd #elif defined(NC) || defined(TC) #define FADD_R faddd #define FADD_I fsubd #define FMAC_R1 fmacd #define FMAC_R2 fnmacd #define FMAC_I1 fmacd #define FMAC_I2 fmacd #else #define FADD_R fsubd #define FADD_I faddd #define FMAC_R1 fnmacd #define
FMAC_R2	fmacd
	#define	FMAC_I1	fnmacd
	#define	FMAC_I2	fnmacd

#endif


// INIT2x2: set the sixteen accumulators d16-d31 to +0.0.
//
// The zero is built from a core register instead of "vsub d16, d16, d16":
// x - x is NaN whenever the stale register content x is NaN or Inf, and that
// NaN would then be copied to every accumulator and end up in C.
// Clobbers r3. That is safe at the only expansion site in this file
// (zgemm_kernel_L2_M2_40): r3 is not read again before SAVE2x2 reloads it.
.macro INIT2x2

	mov		r3 , #0
	vmov		d16 , r3 , r3
	vmov.f64	d17, d16
	vmov.f64	d18, d16
	vmov.f64	d19, d16
	vmov.f64	d20, d16
	vmov.f64	d21, d16
	vmov.f64	d22, d16
	vmov.f64	d23, d16
	vmov.f64	d24, d16
	vmov.f64	d25, d16
	vmov.f64	d26, d16
	vmov.f64	d27, d16
	vmov.f64	d28, d16
	vmov.f64	d29, d16
	vmov.f64	d30, d16
	vmov.f64	d31, d16

.endm

// KERNEL2x2_I: first k-iteration of a pipelined pass.
// Loads d0-d3 from AO and d8-d11 from BO, initialises d16-d31 with their
// products (fmuld, so no prior INIT2x2 is needed), and preloads the operands
// of the next iteration into d4-d7 / d12-d15. AO and BO each advance by 64
// bytes in total (two iterations' worth of data has been fetched).
.macro KERNEL2x2_I
	pld	[ AO , #A_PRE ]
	pld	[ BO , #B_PRE ]
	fldd	d0 , [ AO ]
	fldd	d1 , [ AO, #8 ]
	fldd	d8 , [ BO ]
	fldd	d9 , [ BO, #8 ]

	fmuld	d16  , d0,  d8
	fldd	d2 , [ AO, #16 ]
	fmuld	d24  , d1,  d9
	fldd	d3 , [ AO, #24 ]
	fmuld	d17  , d0,  d9
	fldd	d10, [ BO, #16 ]
	fmuld	d25  , d1,  d8

	fldd	d11, [ BO, #24 ]
	fmuld	d18  , d2,  d8
	add	BO , BO, #32
	fmuld	d26  , d3,  d9
	add	AO , AO, #32
	fmuld	d19  , d2,  d9
	pld	[ BO , #B_PRE ]
	fmuld	d27  , d3,  d8

	pld	[ AO , #A_PRE ]
	fmuld	d20  , d0,  d10
	fldd	d4 , [ AO, #0 ]
	fmuld	d28  , d1,  d11
	fldd	d5 , [ AO, #8 ]
	fmuld	d21  , d0,  d11
	fldd	d12, [ BO ]
	fmuld	d29  , d1,  d10

	fldd	d13, [ BO, #8 ]
	fmuld	d22  , d2,  d10
	fldd	d6 , [ AO, #16 ]
	fmuld	d30  , d3,  d11
	fldd	d7 , [ AO, #24 ]
	fmuld	d23  , d2,  d11
	fldd	d14, [ BO, #16 ]
	fmuld	d31  , d3,  d10
	fldd	d15, [ BO, #24 ]

	add	BO , BO, #32
	add	AO , AO, #32
.endm


// KERNEL2x2_M1: accumulate the products of d0-d3 x d8-d11 into d16-d31 while
// loading the next operands into d4-d7 / d12-d15; AO and BO advance by 32.
.macro KERNEL2x2_M1
	pld	[ AO , #A_PRE ]

	fmacd	d16  , d0,  d8
	pld	[ BO , #B_PRE ]
	fmacd	d24  , d1,  d9
	fldd	d4 , [ AO, #0 ]
	fmacd	d17  , d0,  d9
	fldd	d5 , [ AO, #8 ]
	fmacd	d25  , d1,  d8

	fldd	d12, [ BO ]
	fmacd	d18  , d2,  d8
	fldd	d13, [ BO, #8 ]
	fmacd	d26  , d3,  d9
	fldd	d6 , [ AO, #16 ]
	fmacd	d19  , d2,  d9
	fldd	d7 , [ AO, #24 ]
	fmacd	d27  , d3,  d8

	fmacd	d20  , d0,  d10
	fldd	d14, [ BO, #16 ]
	fmacd	d28  , d1,  d11
	fmacd	d21  , d0,  d11
	fldd	d15, [ BO, #24 ]
	fmacd	d29  , d1,  d10

	fmacd	d22  , d2,  d10
	add	BO , BO, #32
	fmacd	d30  , d3,  d11
	fmacd	d23  , d2,  d11
	add	AO , AO, #32
	fmacd	d31  , d3,  d10

.endm

// KERNEL2x2_M2: mirror image of KERNEL2x2_M1. Accumulates the products of
// d4-d7 x d12-d15 while loading the next operands into d0-d3 / d8-d11.
.macro KERNEL2x2_M2
	pld	[ AO , #A_PRE ]

	fmacd	d16  , d4,  d12
	pld	[ BO , #B_PRE ]
	fmacd	d24  , d5,  d13
	fldd	d0 , [ AO, #0 ]
	fmacd	d17  , d4,  d13
	fldd	d1 , [ AO, #8 ]
	fmacd	d25  , d5,  d12

	fmacd	d18  , d6,  d12
	fldd	d8 , [ BO ]
	fmacd	d26  , d7,  d13
	fldd	d9 , [ BO, #8 ]
	fmacd	d19  , d6,  d13
	fmacd	d27  , d7,  d12

	fldd	d2 , [ AO, #16 ]
	fmacd	d20  , d4,  d14
	fldd	d3 , [ AO, #24 ]
	fmacd	d28  , d5,  d15
	fmacd	d21  , d4,  d15
	fldd	d10, [ BO, #16 ]
	fmacd	d29  , d5,  d14

	fldd	d11, [ BO, #24 ]
	fmacd	d22  , d6,  d14
	fmacd	d30  , d7,  d15
	add	BO , BO, #32
	fmacd	d23  , d6,  d15
	add	AO , AO, #32
	fmacd	d31  , d7,  d14

.endm


// KERNEL2x2_E: last k-iteration of a pipelined pass. Accumulates the products
// of the already loaded d4-d7 x d12-d15; loads nothing, moves no pointer.
.macro KERNEL2x2_E

	fmacd	d16  , d4,  d12
	fmacd	d24  , d5,  d13
	fmacd	d17  , d4,  d13
	fmacd	d25  , d5,  d12

	fmacd	d18  , d6,  d12
	fmacd	d26  , d7,  d13
	fmacd	d19  , d6,  d13
	fmacd	d27  , d7,  d12

	fmacd	d20  , d4,  d14
	fmacd	d28  , d5,  d15
	fmacd	d21  , d4,  d15
	fmacd	d29  , d5,  d14

	fmacd	d22  , d6,  d14
	fmacd	d30  , d7,  d15
	fmacd	d23  , d6,  d15
	fmacd	d31  , d7,  d14

.endm

// KERNEL2x2_SUB: one self-contained (non-pipelined) k-iteration. Loads d0-d3
// from AO and d8-d11 from BO, accumulates into d16-d31 and advances AO and BO
// by 32 bytes. The accumulators must already hold valid values.
.macro KERNEL2x2_SUB

	pld	[ AO , #A_PRE ]
	pld	[ BO , #B_PRE ]
	fldd	d0 , [ AO ]
	fldd	d1 , [ AO, #8 ]
	fldd	d8 , [ BO ]
	fldd	d9 , [ BO, #8 ]

	fmacd	d16  , d0,  d8
	fldd	d2 , [ AO, #16 ]
	fmacd	d24  , d1,  d9
	fldd	d3 , [ AO, #24 ]
	fmacd	d17  , d0,  d9
	fldd	d10, [ BO, #16 ]
	fmacd	d25  , d1,  d8

	fldd	d11, [ BO, #24 ]
	fmacd	d18  , d2,  d8
	fmacd	d26  , d3,  d9
	fmacd	d19  , d2,  d9
	fmacd	d27  , d3,  d8

	fmacd	d20  , d0,  d10
	fmacd	d28  , d1,  d11
	fmacd	d21  , d0,  d11
	fmacd	d29  , d1,  d10

	fmacd	d22  , d2,  d10
	add	BO , BO, #32
	fmacd	d30  , d3,  d11
	fmacd	d23  , d2,  d11
	add	AO , AO, #32
	fmacd	d31  , d3,  d10

.endm


// SAVE2x2: fold the accumulators into the 2x2 complex block of C.
// Loads alpha (d0/d1) and beta (d2/d3), merges the two banks of partial
// products (d16-d23 with d24-d31) via FADD_R / FADD_I, then for each of the
// four elements computes beta * C (complex multiply) plus alpha times the
// merged sum and stores it back. CO1/CO2 walk two elements each, stepping by
// the row stride scaled to bytes (r3). r4 and r2 hold the original CO1/CO2
// so the pointers can be rewound before the stores.
.macro SAVE2x2

	ldr	r3, OLD_RSC				// Row stride size
	lsl	r3, r3, #4				// multiply with size of complex double

	fldd	d0, [ PTR_ALPHA ]			// load real part of alpha
	fldd	d1, [ PTR_ALPHA, #8 ]			// load imag part of alpha
	ldr	r4, PTR_BETA
	fldd	d2, [ r4 ]				// load real part of beta
	fldd	d3, [ r4, #8 ]				// load imag part of beta

	// Add/Sub the real and the imag parts
	FADD_R	d16, d24 , d16
	FADD_I  d17, d25 , d17
	FADD_R	d18, d26 , d18
	FADD_I  d19, d27 , d19
	FADD_R	d20, d28 , d20
	FADD_I  d21, d29 , d21
	FADD_R	d22, d30 , d22
	FADD_I  d23, d31 , d23

	mov	r4, CO1					// save pointer
	fldmiad	CO1, { d4 - d5 }			// read real and imag part from C
	add	CO1, CO1, r3

	mov	r2, CO2					// save pointer
	fldmiad	CO2, { d8 - d9 }			// read real and imag part from C
	add	CO2, CO2, r3

	fmuld	d24, d4, d2				// multiply Beta-real with C-real
	fmuld	d25, d5, d2				// multiply Beta-real with C-imag
	fmuld	d28, d8, d2				// multiply Beta-real with C-real
	fmuld	d29, d9, d2				// multiply Beta-real with C-imag

	FMAC_BR	d24, d3, d5				// multiply beta-imag with C-imag and add
	FMAC_BI	d25, d3, d4				// multiply beta-imag with C-real and add
	FMAC_BR	d28, d3, d9				// multiply beta-imag with C-imag and add
	FMAC_BI	d29, d3, d8				// multiply beta-imag with C-real and add

	FMAC_R1 d24 , d0 , d16
	FMAC_I1 d25 , d0 , d17
	FMAC_R2 d24 , d1 , d17
	FMAC_I2	d25 , d1 , d16

	FMAC_R1 d28 , d0 , d20
	FMAC_I1 d29 , d0 , d21
	FMAC_R2 d28 , d1 , d21
	FMAC_I2	d29 , d1 , d20

	fldmiad	CO1, { d4 - d5 }			// read real and imag part from C
	fldmiad	CO2, { d8 - d9 }			// read real and imag part from C

	fmuld	d26, d4, d2				// multiply Beta-real with C-real
	fmuld	d27, d5, d2				// multiply Beta-real with C-imag
	fmuld	d30, d8, d2				// multiply Beta-real with C-real
	fmuld	d31, d9, d2				// multiply Beta-real with C-imag

	FMAC_BR	d26, d3, d5				// multiply beta-imag with C-imag and add
	FMAC_BI	d27, d3, d4				// multiply beta-imag with C-real and add
	FMAC_BR	d30, d3, d9				// multiply beta-imag with C-imag and add
	FMAC_BI	d31, d3, d8				// multiply beta-imag with C-real and add

	FMAC_R1 d26 , d0 , d18
	FMAC_I1 d27 , d0 , d19
	FMAC_R2 d26 , d1 , d19
	FMAC_I2	d27 , d1 , d18

	FMAC_R1 d30, d0 , d22
	FMAC_I1 d31, d0 , d23
	FMAC_R2 d30, d1 , d23
	FMAC_I2	d31, d1 , d22

	mov	CO1, r4					// restore pointer
	mov	CO2, r2					// restore pointer
	fstmiad CO1, { d24 - d25 }
	fstmiad CO2, { d28 - d29 }
	add	CO1, CO1, r3
	add	CO2, CO2, r3
	fstmiad CO1, { d26 - d27 }
	fstmiad CO2, { d30 - d31 }

.endm


/**************************************************************************************
* End of macro definitions
**************************************************************************************/

// Entry point of the double-complex 2x2 micro-kernel.
//
// Flow:
//   1. Save r4-r9/fp and d8-d15, reserve STACKSIZE bytes of stack.
//   2. Derive CO1/CO2 from OLD_C and the column stride OLD_CSC (scaled by 16,
//      the size of a complex double) and prefetch them.
//   3. Run L = K / 8 passes of eight software-pipelined k-iterations each
//      (KERNEL2x2_I / _M1 / _M2 / _E), or INIT2x2 when K / 8 is zero.
//   4. Run the K % 8 leftover iterations with KERNEL2x2_SUB.
//   5. SAVE2x2 writes the result to C, then registers are restored.

	.arm
	.global	REALNAME
	.func	REALNAME

REALNAME:

	push	{r4 - r9, fp}				// save register
	add	fp, sp, #28				// add number of saved register multiplied by size of int
	sub	sp, sp, #STACKSIZE			// reserve stack

	mov	AO, OLD_A				// pointer matrix A
	mov	BO, OLD_B				// pointer matrix B

	sub	r3, fp, #128
	vstm	r3, { d8 - d15}				// store floating point registers

	ldr	r2, OLD_C				// pointer matrix C
	ldr	r3, OLD_CSC				// Col stride size of C
	lsl	r3, r3, #4				// multiply with size of complex double

	mov	CO1, r2					// first line of C
	add	CO2, CO1, r3				// second line of C

	pld	[ CO1, #C_PRE ]				// prefetch the lines of C
	pld	[ CO2, #C_PRE ]				// prefetch the lines of C

zgemm_kernel_L2_M2_20:

	asrs	L , K, #3				// L = K / 8
	cmp	L , #2
	blt	zgemm_kernel_L2_M2_32			// fewer than two full passes: handled separately

	// First pass: _I primes the pipeline, so it replaces the leading _M1.
	KERNEL2x2_I
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	subs	L, L, #2				// first and last pass are peeled off the loop
	ble	zgemm_kernel_L2_M2_22a
	.align 5

zgemm_kernel_L2_M2_22:

	// Steady-state pass, repeated L - 2 times.
	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	subs	L, L, #1
	bgt	zgemm_kernel_L2_M2_22

zgemm_kernel_L2_M2_22a:

	// Last pass: _E ends the pipeline in place of the trailing _M2.
	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_E

	b	zgemm_kernel_L2_M2_44

zgemm_kernel_L2_M2_32:

	// Reached with L < 2: one complete pass when bit 0 of L is set,
	// otherwise no full pass at all and the accumulators must be zeroed.
	tst	L, #1
	ble	zgemm_kernel_L2_M2_40

	KERNEL2x2_I
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_E

	b	zgemm_kernel_L2_M2_44

zgemm_kernel_L2_M2_40:

	INIT2x2

zgemm_kernel_L2_M2_44:

	ands	L , K, #7				// L = K % 8
	ble	zgemm_kernel_L2_M2_100

zgemm_kernel_L2_M2_46:

	KERNEL2x2_SUB

	subs	L, L, #1
	bne	zgemm_kernel_L2_M2_46

zgemm_kernel_L2_M2_100:

	SAVE2x2

zgemm_kernel_L999:

	sub	r3, fp, #128
	vldm	r3, { d8 - d15}				// restore floating point registers

	sub	sp, fp, #28
	pop	{r4 - r9, fp}
	bx	lr

cython-blis-0.9.1/blis/_src/kernels/armv7a/bli_kernels_armv7a.h000066400000000000000000000036661427272030600244300ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ GEMM_UKR_PROT( float, s, gemm_armv7a_asm_4x4 ) GEMM_UKR_PROT( double, d, gemm_armv7a_asm_4x4 ) GEMM_UKR_PROT( scomplex, c, gemm_armv7a_asm_2x2 ) GEMM_UKR_PROT( dcomplex, z, gemm_armv7a_asm_2x2 ) GEMM_UKR_PROT( float, s, gemm_armv7a_int_4x4 ) GEMM_UKR_PROT( double, d, gemm_armv7a_int_4x4 ) cython-blis-0.9.1/blis/_src/kernels/armv8a/000077500000000000000000000000001427272030600204775ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/armv8a/1m/000077500000000000000000000000001427272030600210145ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c000066400000000000000000000246211427272030600261720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Linaro Limited Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include #if defined(__clang__) #define PRAGMA_NOUNROLL _Pragma("nounroll") #define PRAGMA_UNROLL_2 _Pragma("unroll 2") #elif defined(__GNUC__) #define PRAGMA_NOUNROLL _Pragma("GCC unroll 1") #define PRAGMA_UNROLL_2 _Pragma("GCC unroll 2") #else #define PRAGMA_NOUNROLL #define PRAGMA_UNROLL_2 #endif void bli_dpackm_armv8a_int_6xk ( conj_t conja, pack_t schema, dim_t cdim0, dim_t k0, dim_t k0_max, double* restrict kappa, double* restrict a, inc_t inca0, inc_t lda0, double* restrict p, inc_t ldp0, cntx_t* restrict cntx ) { // This is the panel dimension assumed by the packm kernel. const dim_t mnr = 6; // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 2; uint64_t k_left = k0 % 2; double* a_loc = a; double* p_loc = p; // NOTE: For the purposes of the comments in this packm kernel, we // interpret inca and lda as rs_a and cs_a, respectively, and similarly // interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading // this packm kernel, you should think of the operation as packing an // m x n micropanel, where m and n are tiny and large, respectively, and // where elements of each column of the packed matrix P are contiguous. // (This packm kernel can still be used to pack micropanels of matrix B // in a gemm operation.) 
const uint64_t inca = inca0; const uint64_t lda = lda0; const uint64_t ldp = ldp0; const bool gs = ( inca0 != 1 && lda0 != 1 ); // NOTE: If/when this kernel ever supports scaling by kappa within the // assembly region, this constraint should be lifted. const bool unitk = bli_deq1( *kappa ); // ------------------------------------------------------------------------- if ( cdim0 == mnr && !gs ) { if ( unitk ) { if ( inca == 1 ) { // No need to use k-loops here. // Simply let compiler to expand loops. PRAGMA_UNROLL_2 for ( dim_t ik = k_iter * 2 + k_left; ik > 0; --ik ) { float64x2_t v0 = vld1q_f64( a_loc + 0 ); float64x2_t v1 = vld1q_f64( a_loc + 2 ); float64x2_t v2 = vld1q_f64( a_loc + 4 ); vst1q_f64( p_loc + 0, v0 ); vst1q_f64( p_loc + 2, v1 ); vst1q_f64( p_loc + 4, v2 ); a_loc += lda; p_loc += ldp; } } else // if ( lda == 1 ) { float64x2_t v0 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v1 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v2 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v3 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v4 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v5 = (float64x2_t)vdupq_n_u64( 0 ); PRAGMA_NOUNROLL for ( ; k_iter > 0; --k_iter ) { v0 = vld1q_f64( a_loc + inca * 0 ); v1 = vld1q_f64( a_loc + inca * 1 ); v2 = vld1q_f64( a_loc + inca * 2 ); v3 = vld1q_f64( a_loc + inca * 3 ); v4 = vld1q_f64( a_loc + inca * 4 ); v5 = vld1q_f64( a_loc + inca * 5 ); // In-register transpose. 
float64x2_t vd0_1 = vtrn1q_f64( v0, v1 ); float64x2_t vd1_1 = vtrn1q_f64( v2, v3 ); float64x2_t vd2_1 = vtrn1q_f64( v4, v5 ); float64x2_t vd0_2 = vtrn2q_f64( v0, v1 ); float64x2_t vd1_2 = vtrn2q_f64( v2, v3 ); float64x2_t vd2_2 = vtrn2q_f64( v4, v5 ); vst1q_f64( p_loc + 0, vd0_1 ); vst1q_f64( p_loc + 2, vd1_1 ); vst1q_f64( p_loc + 4, vd2_1 ); p_loc += ldp; vst1q_f64( p_loc + 0, vd0_2 ); vst1q_f64( p_loc + 2, vd1_2 ); vst1q_f64( p_loc + 4, vd2_2 ); p_loc += ldp; a_loc += 2 * lda; // 2; } for ( ; k_left > 0; --k_left ) { v0 = vld1q_lane_f64( a_loc + inca * 0, v0, 0 ); v0 = vld1q_lane_f64( a_loc + inca * 1, v0, 1 ); v1 = vld1q_lane_f64( a_loc + inca * 2, v1, 0 ); v1 = vld1q_lane_f64( a_loc + inca * 3, v1, 1 ); v2 = vld1q_lane_f64( a_loc + inca * 4, v2, 0 ); v2 = vld1q_lane_f64( a_loc + inca * 5, v2, 1 ); vst1q_f64( p_loc + 0, v0 ); vst1q_f64( p_loc + 2, v1 ); vst1q_f64( p_loc + 4, v2 ); p_loc += ldp; a_loc += lda; // 1; } } } else // if ( !unitk ) { float64x2_t vkappa = vld1q_dup_f64( kappa ); if ( inca == 1 ) { // No need to use k-loops here. // Simply let compiler to expand loops. PRAGMA_UNROLL_2 for ( dim_t ik = k_iter * 2 + k_left; ik > 0; --ik ) { float64x2_t v0 = vld1q_f64( a_loc + 0 ); float64x2_t v1 = vld1q_f64( a_loc + 2 ); float64x2_t v2 = vld1q_f64( a_loc + 4 ); // Scale by kappa. 
v0 = vmulq_f64( v0, vkappa ); v1 = vmulq_f64( v1, vkappa ); v2 = vmulq_f64( v2, vkappa ); vst1q_f64( p_loc + 0, v0 ); vst1q_f64( p_loc + 2, v1 ); vst1q_f64( p_loc + 4, v2 ); a_loc += lda; p_loc += ldp; } } else // if ( lda == 1 ) { float64x2_t v0 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v1 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v2 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v3 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v4 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v5 = (float64x2_t)vdupq_n_u64( 0 ); PRAGMA_NOUNROLL for ( ; k_iter > 0; --k_iter ) { v0 = vld1q_f64( a_loc + inca * 0 ); v1 = vld1q_f64( a_loc + inca * 1 ); v2 = vld1q_f64( a_loc + inca * 2 ); v3 = vld1q_f64( a_loc + inca * 3 ); v4 = vld1q_f64( a_loc + inca * 4 ); v5 = vld1q_f64( a_loc + inca * 5 ); // Scale by kappa. v0 = vmulq_f64( v0, vkappa ); v1 = vmulq_f64( v1, vkappa ); v2 = vmulq_f64( v2, vkappa ); v3 = vmulq_f64( v3, vkappa ); v4 = vmulq_f64( v4, vkappa ); v5 = vmulq_f64( v5, vkappa ); // In-register transpose. float64x2_t vd0_1 = vtrn1q_f64( v0, v1 ); float64x2_t vd1_1 = vtrn1q_f64( v2, v3 ); float64x2_t vd2_1 = vtrn1q_f64( v4, v5 ); float64x2_t vd0_2 = vtrn2q_f64( v0, v1 ); float64x2_t vd1_2 = vtrn2q_f64( v2, v3 ); float64x2_t vd2_2 = vtrn2q_f64( v4, v5 ); vst1q_f64( p_loc + 0, vd0_1 ); vst1q_f64( p_loc + 2, vd1_1 ); vst1q_f64( p_loc + 4, vd2_1 ); p_loc += ldp; vst1q_f64( p_loc + 0, vd0_2 ); vst1q_f64( p_loc + 2, vd1_2 ); vst1q_f64( p_loc + 4, vd2_2 ); p_loc += ldp; a_loc += 2 * lda; // 2; } for ( ; k_left > 0; --k_left ) { v0 = vld1q_lane_f64( a_loc + inca * 0, v0, 0 ); v0 = vld1q_lane_f64( a_loc + inca * 1, v0, 1 ); v1 = vld1q_lane_f64( a_loc + inca * 2, v1, 0 ); v1 = vld1q_lane_f64( a_loc + inca * 3, v1, 1 ); v2 = vld1q_lane_f64( a_loc + inca * 4, v2, 0 ); v2 = vld1q_lane_f64( a_loc + inca * 5, v2, 1 ); // Scale by kappa. 
v0 = vmulq_f64( v0, vkappa ); v1 = vmulq_f64( v1, vkappa ); v2 = vmulq_f64( v2, vkappa ); vst1q_f64( p_loc + 0, v0 ); vst1q_f64( p_loc + 2, v1 ); vst1q_f64( p_loc + 4, v2 ); p_loc += ldp; a_loc += lda; // 1; } } } } else // if ( cdim0 < mnr || gs ) { PASTEMAC(dscal2m,BLIS_TAPI_EX_SUF) ( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, ( trans_t )conja, cdim0, k0, kappa, a, inca0, lda0, p, 1, ldp0, cntx, NULL ); if ( cdim0 < mnr ) { // Handle zero-filling along the "long" edge of the micropanel. const dim_t i = cdim0; const dim_t m_edge = mnr - cdim0; const dim_t n_edge = k0_max; double* restrict p_edge = p + (i )*1; bli_dset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } //bli_dfprintm( stdout, "packm 6xk ker: a_packed", cdim0, k0_max, p, 1, ldp0, "%5.2f", "" ); if ( k0 < k0_max ) { // Handle zero-filling along the "short" (far) edge of the micropanel. const dim_t j = k0; const dim_t m_edge = mnr; const dim_t n_edge = k0_max - k0; double* restrict p_edge = p + (j )*ldp; bli_dset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } cython-blis-0.9.1/blis/_src/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c000066400000000000000000000274431427272030600262010ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Linaro Limited Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include #if defined(__clang__) #define PRAGMA_NOUNROLL _Pragma("nounroll") #define PRAGMA_UNROLL_2 _Pragma("unroll 2") #elif defined(__GNUC__) #define PRAGMA_NOUNROLL _Pragma("GCC unroll 1") #define PRAGMA_UNROLL_2 _Pragma("GCC unroll 2") #else #define PRAGMA_NOUNROLL #define PRAGMA_UNROLL_2 #endif void bli_dpackm_armv8a_int_8xk ( conj_t conja, pack_t schema, dim_t cdim0, dim_t k0, dim_t k0_max, double* restrict kappa, double* restrict a, inc_t inca0, inc_t lda0, double* restrict p, inc_t ldp0, cntx_t* restrict cntx ) { // This is the panel dimension assumed by the packm kernel. const dim_t mnr = 8; // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 2; uint64_t k_left = k0 % 2; double* a_loc = a; double* p_loc = p; // NOTE: For the purposes of the comments in this packm kernel, we // interpret inca and lda as rs_a and cs_a, respectively, and similarly // interpret ldp as cs_p (with rs_p implicitly unit). 
Thus, when reading // this packm kernel, you should think of the operation as packing an // m x n micropanel, where m and n are tiny and large, respectively, and // where elements of each column of the packed matrix P are contiguous. // (This packm kernel can still be used to pack micropanels of matrix B // in a gemm operation.) const uint64_t inca = inca0; const uint64_t lda = lda0; const uint64_t ldp = ldp0; const bool gs = ( inca0 != 1 && lda0 != 1 ); // NOTE: If/when this kernel ever supports scaling by kappa within the // assembly region, this constraint should be lifted. const bool unitk = bli_deq1( *kappa ); // ------------------------------------------------------------------------- if ( cdim0 == mnr && !gs ) { if ( unitk ) { if ( inca == 1 ) { // No need to use k-loops here. // Simply let compiler to expand loops. PRAGMA_UNROLL_2 for ( dim_t ik = k_iter * 2 + k_left; ik > 0; --ik ) { float64x2_t v0 = vld1q_f64( a_loc + 0 ); float64x2_t v1 = vld1q_f64( a_loc + 2 ); float64x2_t v2 = vld1q_f64( a_loc + 4 ); float64x2_t v3 = vld1q_f64( a_loc + 6 ); vst1q_f64( p_loc + 0, v0 ); vst1q_f64( p_loc + 2, v1 ); vst1q_f64( p_loc + 4, v2 ); vst1q_f64( p_loc + 6, v3 ); a_loc += lda; p_loc += ldp; } } else // if ( lda == 1 ) { float64x2_t v0 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v1 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v2 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v3 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v4 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v5 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v6 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v7 = (float64x2_t)vdupq_n_u64( 0 ); PRAGMA_NOUNROLL for ( ; k_iter > 0; --k_iter ) { v0 = vld1q_f64( a_loc + inca * 0 ); v1 = vld1q_f64( a_loc + inca * 1 ); v2 = vld1q_f64( a_loc + inca * 2 ); v3 = vld1q_f64( a_loc + inca * 3 ); v4 = vld1q_f64( a_loc + inca * 4 ); v5 = vld1q_f64( a_loc + inca * 5 ); v6 = vld1q_f64( a_loc + inca * 6 ); v7 = vld1q_f64( a_loc + inca * 7 ); // In-register transpose. 
float64x2_t vd0_1 = vtrn1q_f64( v0, v1 ); float64x2_t vd1_1 = vtrn1q_f64( v2, v3 ); float64x2_t vd2_1 = vtrn1q_f64( v4, v5 ); float64x2_t vd3_1 = vtrn1q_f64( v6, v7 ); float64x2_t vd0_2 = vtrn2q_f64( v0, v1 ); float64x2_t vd1_2 = vtrn2q_f64( v2, v3 ); float64x2_t vd2_2 = vtrn2q_f64( v4, v5 ); float64x2_t vd3_2 = vtrn2q_f64( v6, v7 ); vst1q_f64( p_loc + 0, vd0_1 ); vst1q_f64( p_loc + 2, vd1_1 ); vst1q_f64( p_loc + 4, vd2_1 ); vst1q_f64( p_loc + 6, vd3_1 ); p_loc += ldp; vst1q_f64( p_loc + 0, vd0_2 ); vst1q_f64( p_loc + 2, vd1_2 ); vst1q_f64( p_loc + 4, vd2_2 ); vst1q_f64( p_loc + 6, vd3_2 ); p_loc += ldp; a_loc += 2 * lda; // 2; } for ( ; k_left > 0; --k_left ) { v0 = vld1q_lane_f64( a_loc + inca * 0, v0, 0 ); v0 = vld1q_lane_f64( a_loc + inca * 1, v0, 1 ); v1 = vld1q_lane_f64( a_loc + inca * 2, v1, 0 ); v1 = vld1q_lane_f64( a_loc + inca * 3, v1, 1 ); v2 = vld1q_lane_f64( a_loc + inca * 4, v2, 0 ); v2 = vld1q_lane_f64( a_loc + inca * 5, v2, 1 ); v3 = vld1q_lane_f64( a_loc + inca * 6, v3, 0 ); v3 = vld1q_lane_f64( a_loc + inca * 7, v3, 1 ); vst1q_f64( p_loc + 0, v0 ); vst1q_f64( p_loc + 2, v1 ); vst1q_f64( p_loc + 4, v2 ); vst1q_f64( p_loc + 6, v3 ); p_loc += ldp; a_loc += lda; // 1; } } } else // if ( !unitk ) { float64x2_t vkappa = vld1q_dup_f64( kappa ); if ( inca == 1 ) { // No need to use k-loops here. // Simply let compiler to expand loops. PRAGMA_UNROLL_2 for ( dim_t ik = k_iter * 2 + k_left; ik > 0; --ik ) { float64x2_t v0 = vld1q_f64( a_loc + 0 ); float64x2_t v1 = vld1q_f64( a_loc + 2 ); float64x2_t v2 = vld1q_f64( a_loc + 4 ); float64x2_t v3 = vld1q_f64( a_loc + 6 ); // Scale by kappa. 
v0 = vmulq_f64( v0, vkappa ); v1 = vmulq_f64( v1, vkappa ); v2 = vmulq_f64( v2, vkappa ); v3 = vmulq_f64( v3, vkappa ); vst1q_f64( p_loc + 0, v0 ); vst1q_f64( p_loc + 2, v1 ); vst1q_f64( p_loc + 4, v2 ); vst1q_f64( p_loc + 6, v3 ); a_loc += lda; p_loc += ldp; } } else // if ( lda == 1 ) { float64x2_t v0 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v1 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v2 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v3 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v4 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v5 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v6 = (float64x2_t)vdupq_n_u64( 0 ); float64x2_t v7 = (float64x2_t)vdupq_n_u64( 0 ); PRAGMA_NOUNROLL for ( ; k_iter > 0; --k_iter ) { v0 = vld1q_f64( a_loc + inca * 0 ); v1 = vld1q_f64( a_loc + inca * 1 ); v2 = vld1q_f64( a_loc + inca * 2 ); v3 = vld1q_f64( a_loc + inca * 3 ); v4 = vld1q_f64( a_loc + inca * 4 ); v5 = vld1q_f64( a_loc + inca * 5 ); v6 = vld1q_f64( a_loc + inca * 6 ); v7 = vld1q_f64( a_loc + inca * 7 ); // Scale by kappa. v0 = vmulq_f64( v0, vkappa ); v1 = vmulq_f64( v1, vkappa ); v2 = vmulq_f64( v2, vkappa ); v3 = vmulq_f64( v3, vkappa ); v4 = vmulq_f64( v4, vkappa ); v5 = vmulq_f64( v5, vkappa ); v6 = vmulq_f64( v6, vkappa ); v7 = vmulq_f64( v7, vkappa ); // In-register transpose. 
float64x2_t vd0_1 = vtrn1q_f64( v0, v1 ); float64x2_t vd1_1 = vtrn1q_f64( v2, v3 ); float64x2_t vd2_1 = vtrn1q_f64( v4, v5 ); float64x2_t vd3_1 = vtrn1q_f64( v6, v7 ); float64x2_t vd0_2 = vtrn2q_f64( v0, v1 ); float64x2_t vd1_2 = vtrn2q_f64( v2, v3 ); float64x2_t vd2_2 = vtrn2q_f64( v4, v5 ); float64x2_t vd3_2 = vtrn2q_f64( v6, v7 ); vst1q_f64( p_loc + 0, vd0_1 ); vst1q_f64( p_loc + 2, vd1_1 ); vst1q_f64( p_loc + 4, vd2_1 ); vst1q_f64( p_loc + 6, vd3_1 ); p_loc += ldp; vst1q_f64( p_loc + 0, vd0_2 ); vst1q_f64( p_loc + 2, vd1_2 ); vst1q_f64( p_loc + 4, vd2_2 ); vst1q_f64( p_loc + 6, vd3_2 ); p_loc += ldp; a_loc += 2 * lda; // 2; } for ( ; k_left > 0; --k_left ) { v0 = vld1q_lane_f64( a_loc + inca * 0, v0, 0 ); v0 = vld1q_lane_f64( a_loc + inca * 1, v0, 1 ); v1 = vld1q_lane_f64( a_loc + inca * 2, v1, 0 ); v1 = vld1q_lane_f64( a_loc + inca * 3, v1, 1 ); v2 = vld1q_lane_f64( a_loc + inca * 4, v2, 0 ); v2 = vld1q_lane_f64( a_loc + inca * 5, v2, 1 ); v3 = vld1q_lane_f64( a_loc + inca * 6, v3, 0 ); v3 = vld1q_lane_f64( a_loc + inca * 7, v3, 1 ); // Scale by kappa. v0 = vmulq_f64( v0, vkappa ); v1 = vmulq_f64( v1, vkappa ); v2 = vmulq_f64( v2, vkappa ); v3 = vmulq_f64( v3, vkappa ); vst1q_f64( p_loc + 0, v0 ); vst1q_f64( p_loc + 2, v1 ); vst1q_f64( p_loc + 4, v2 ); vst1q_f64( p_loc + 6, v3 ); p_loc += ldp; a_loc += lda; // 1; } } } } else // if ( cdim0 < mnr || gs ) { PASTEMAC(dscal2m,BLIS_TAPI_EX_SUF) ( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, ( trans_t )conja, cdim0, k0, kappa, a, inca0, lda0, p, 1, ldp0, cntx, NULL ); if ( cdim0 < mnr ) { // Handle zero-filling along the "long" edge of the micropanel. const dim_t i = cdim0; const dim_t m_edge = mnr - cdim0; const dim_t n_edge = k0_max; double* restrict p_edge = p + (i )*1; bli_dset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } //bli_dfprintm( stdout, "packm 8xk ker: a_packed", cdim0, k0_max, p, 1, ldp0, "%5.2f", "" ); if ( k0 < k0_max ) { // Handle zero-filling along the "short" (far) edge of the micropanel. 
const dim_t j = k0; const dim_t m_edge = mnr; const dim_t n_edge = k0_max - k0; double* restrict p_edge = p + (j )*ldp; bli_dset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } cython-blis-0.9.1/blis/_src/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c000066400000000000000000000371251427272030600262710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Linaro Limited Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #include #if defined(__clang__) #define PRAGMA_NOUNROLL _Pragma("nounroll") #define PRAGMA_UNROLL_2 _Pragma("unroll 2") #elif defined(__GNUC__) #define PRAGMA_NOUNROLL _Pragma("GCC unroll 1") #define PRAGMA_UNROLL_2 _Pragma("GCC unroll 2") #else #define PRAGMA_NOUNROLL #define PRAGMA_UNROLL_2 #endif void bli_spackm_armv8a_int_12xk ( conj_t conja, pack_t schema, dim_t cdim0, dim_t k0, dim_t k0_max, float* restrict kappa, float* restrict a, inc_t inca0, inc_t lda0, float* restrict p, inc_t ldp0, cntx_t* restrict cntx ) { // This is the panel dimension assumed by the packm kernel. const dim_t mnr = 12; // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; float* a_loc = a; float* p_loc = p; // NOTE: For the purposes of the comments in this packm kernel, we // interpret inca and lda as rs_a and cs_a, respectively, and similarly // interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading // this packm kernel, you should think of the operation as packing an // m x n micropanel, where m and n are tiny and large, respectively, and // where elements of each column of the packed matrix P are contiguous. // (This packm kernel can still be used to pack micropanels of matrix B // in a gemm operation.) const uint64_t inca = inca0; const uint64_t lda = lda0; const uint64_t ldp = ldp0; const bool gs = ( inca0 != 1 && lda0 != 1 ); // NOTE: If/when this kernel ever supports scaling by kappa within the // assembly region, this constraint should be lifted. const bool unitk = bli_seq1( *kappa ); // ------------------------------------------------------------------------- if ( cdim0 == mnr && !gs ) { if ( unitk ) { if ( inca == 1 ) { // No need to use k-loops here. // Simply let compiler to expand loops. 
PRAGMA_UNROLL_2 for ( dim_t ik = k_iter * 4 + k_left; ik > 0; --ik ) { float32x4_t v0 = vld1q_f32( a_loc + 0 ); float32x4_t v1 = vld1q_f32( a_loc + 4 ); float32x4_t v2 = vld1q_f32( a_loc + 8 ); vst1q_f32( p_loc + 0, v0 ); vst1q_f32( p_loc + 4, v1 ); vst1q_f32( p_loc + 8, v2 ); a_loc += lda; p_loc += ldp; } } else // if ( lda == 1 ) { float32x4_t v0 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v1 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v2 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v3 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v4 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v5 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v6 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v7 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v8 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v9 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v10 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v11 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t vt0; float32x4_t vt1; float32x4_t vt2; float32x4_t vt3; PRAGMA_NOUNROLL for ( ; k_iter > 0; --k_iter ) { v0 = vld1q_f32( a_loc + inca * 0 ); v1 = vld1q_f32( a_loc + inca * 1 ); v2 = vld1q_f32( a_loc + inca * 2 ); v3 = vld1q_f32( a_loc + inca * 3 ); v4 = vld1q_f32( a_loc + inca * 4 ); v5 = vld1q_f32( a_loc + inca * 5 ); v6 = vld1q_f32( a_loc + inca * 6 ); v7 = vld1q_f32( a_loc + inca * 7 ); v8 = vld1q_f32( a_loc + inca * 8 ); v9 = vld1q_f32( a_loc + inca * 9 ); v10 = vld1q_f32( a_loc + inca * 10 ); v11 = vld1q_f32( a_loc + inca * 11 ); // In-register transpose. 
// // Column 0-3 vt0 = vtrn1q_f32( v0, v1 ); vt1 = vtrn2q_f32( v0, v1 ); vt2 = vtrn1q_f32( v2, v3 ); vt3 = vtrn2q_f32( v2, v3 ); v0 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v1 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); v2 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v3 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); // Column 4-7 vt0 = vtrn1q_f32( v4, v5 ); vt1 = vtrn2q_f32( v4, v5 ); vt2 = vtrn1q_f32( v6, v7 ); vt3 = vtrn2q_f32( v6, v7 ); v4 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v5 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); v6 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v7 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); // Column 8-11 vt0 = vtrn1q_f32( v8, v9 ); vt1 = vtrn2q_f32( v8, v9 ); vt2 = vtrn1q_f32( v10, v11 ); vt3 = vtrn2q_f32( v10, v11 ); v8 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v9 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); v10 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v11 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); vst1q_f32( p_loc + 0, v0 ); vst1q_f32( p_loc + 4, v4 ); vst1q_f32( p_loc + 8, v8 ); p_loc += ldp; vst1q_f32( p_loc + 0, v1 ); vst1q_f32( p_loc + 4, v5 ); vst1q_f32( p_loc + 8, v9 ); p_loc += ldp; vst1q_f32( p_loc + 0, v2 ); vst1q_f32( p_loc + 4, v6 ); vst1q_f32( p_loc + 8, v10 ); p_loc += ldp; vst1q_f32( p_loc + 0, v3 ); vst1q_f32( p_loc + 4, v7 ); vst1q_f32( p_loc + 8, v11 ); p_loc += ldp; a_loc += 4 * lda; // 4; } for ( ; k_left > 0; --k_left ) { v0 = vld1q_lane_f32( a_loc + inca * 0 , v0, 0 ); v0 = vld1q_lane_f32( a_loc + inca * 1 , v0, 1 ); v0 = vld1q_lane_f32( a_loc + inca * 2 , v0, 2 ); v0 = vld1q_lane_f32( a_loc + inca * 3 , v0, 3 ); v1 = vld1q_lane_f32( a_loc + inca * 4 , v1, 0 ); v1 = vld1q_lane_f32( a_loc + inca * 5 , v1, 1 ); v1 = vld1q_lane_f32( a_loc + inca * 6 , v1, 2 ); 
v1 = vld1q_lane_f32( a_loc + inca * 7 , v1, 3 ); v2 = vld1q_lane_f32( a_loc + inca * 8 , v2, 0 ); v2 = vld1q_lane_f32( a_loc + inca * 9 , v2, 1 ); v2 = vld1q_lane_f32( a_loc + inca * 10, v2, 2 ); v2 = vld1q_lane_f32( a_loc + inca * 11, v2, 3 ); vst1q_f32( p_loc + 0, v0 ); vst1q_f32( p_loc + 4, v1 ); vst1q_f32( p_loc + 8, v2 ); p_loc += ldp; a_loc += lda; // 1; } } } else // if ( !unitk ) { float32x4_t vkappa = vld1q_dup_f32( kappa ); if ( inca == 1 ) { // No need to use k-loops here. // Simply let compiler to expand loops. PRAGMA_UNROLL_2 for ( dim_t ik = k_iter * 4 + k_left; ik > 0; --ik ) { float32x4_t v0 = vld1q_f32( a_loc + 0 ); float32x4_t v1 = vld1q_f32( a_loc + 4 ); float32x4_t v2 = vld1q_f32( a_loc + 8 ); // Scale by kappa. v0 = vmulq_f32( v0, vkappa ); v1 = vmulq_f32( v1, vkappa ); v2 = vmulq_f32( v2, vkappa ); vst1q_f32( p_loc + 0, v0 ); vst1q_f32( p_loc + 4, v1 ); vst1q_f32( p_loc + 8, v2 ); a_loc += lda; p_loc += ldp; } } else // if ( lda == 1 ) { float32x4_t v0 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v1 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v2 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v3 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v4 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v5 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v6 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v7 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v8 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v9 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v10 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v11 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t vt0; float32x4_t vt1; float32x4_t vt2; float32x4_t vt3; PRAGMA_NOUNROLL for ( ; k_iter > 0; --k_iter ) { v0 = vld1q_f32( a_loc + inca * 0 ); v1 = vld1q_f32( a_loc + inca * 1 ); v2 = vld1q_f32( a_loc + inca * 2 ); v3 = vld1q_f32( a_loc + inca * 3 ); v4 = vld1q_f32( a_loc + inca * 4 ); v5 = vld1q_f32( a_loc + inca * 5 ); v6 = vld1q_f32( a_loc + inca * 6 ); v7 = vld1q_f32( a_loc + inca * 7 ); v8 = vld1q_f32( a_loc + inca * 8 ); v9 = 
vld1q_f32( a_loc + inca * 9 ); v10 = vld1q_f32( a_loc + inca * 10 ); v11 = vld1q_f32( a_loc + inca * 11 ); // Scale by kappa. v0 = vmulq_f32( v0, vkappa ); v1 = vmulq_f32( v1, vkappa ); v2 = vmulq_f32( v2, vkappa ); v3 = vmulq_f32( v3, vkappa ); v4 = vmulq_f32( v4, vkappa ); v5 = vmulq_f32( v5, vkappa ); v6 = vmulq_f32( v6, vkappa ); v7 = vmulq_f32( v7, vkappa ); v8 = vmulq_f32( v8, vkappa ); v9 = vmulq_f32( v9, vkappa ); v10 = vmulq_f32( v10, vkappa ); v11 = vmulq_f32( v11, vkappa ); // In-register transpose. // // Column 0-3 vt0 = vtrn1q_f32( v0, v1 ); vt1 = vtrn2q_f32( v0, v1 ); vt2 = vtrn1q_f32( v2, v3 ); vt3 = vtrn2q_f32( v2, v3 ); v0 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v1 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); v2 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v3 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); // Column 4-7 vt0 = vtrn1q_f32( v4, v5 ); vt1 = vtrn2q_f32( v4, v5 ); vt2 = vtrn1q_f32( v6, v7 ); vt3 = vtrn2q_f32( v6, v7 ); v4 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v5 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); v6 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v7 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); // Column 8-11 vt0 = vtrn1q_f32( v8, v9 ); vt1 = vtrn2q_f32( v8, v9 ); vt2 = vtrn1q_f32( v10, v11 ); vt3 = vtrn2q_f32( v10, v11 ); v8 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v9 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); v10 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v11 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); vst1q_f32( p_loc + 0, v0 ); vst1q_f32( p_loc + 4, v4 ); vst1q_f32( p_loc + 8, v8 ); p_loc += ldp; vst1q_f32( p_loc + 0, v1 ); vst1q_f32( p_loc + 4, v5 ); vst1q_f32( p_loc + 8, v9 ); p_loc += ldp; vst1q_f32( p_loc + 0, v2 ); vst1q_f32( p_loc + 4, v6 ); vst1q_f32( p_loc + 8, v10 
); p_loc += ldp; vst1q_f32( p_loc + 0, v3 ); vst1q_f32( p_loc + 4, v7 ); vst1q_f32( p_loc + 8, v11 ); p_loc += ldp; a_loc += 4 * lda; // 4; } for ( ; k_left > 0; --k_left ) { v0 = vld1q_lane_f32( a_loc + inca * 0 , v0, 0 ); v0 = vld1q_lane_f32( a_loc + inca * 1 , v0, 1 ); v0 = vld1q_lane_f32( a_loc + inca * 2 , v0, 2 ); v0 = vld1q_lane_f32( a_loc + inca * 3 , v0, 3 ); v1 = vld1q_lane_f32( a_loc + inca * 4 , v1, 0 ); v1 = vld1q_lane_f32( a_loc + inca * 5 , v1, 1 ); v1 = vld1q_lane_f32( a_loc + inca * 6 , v1, 2 ); v1 = vld1q_lane_f32( a_loc + inca * 7 , v1, 3 ); v2 = vld1q_lane_f32( a_loc + inca * 8 , v2, 0 ); v2 = vld1q_lane_f32( a_loc + inca * 9 , v2, 1 ); v2 = vld1q_lane_f32( a_loc + inca * 10, v2, 2 ); v2 = vld1q_lane_f32( a_loc + inca * 11, v2, 3 ); // Scale by kappa. v0 = vmulq_f32( v0, vkappa ); v1 = vmulq_f32( v1, vkappa ); v2 = vmulq_f32( v2, vkappa ); vst1q_f32( p_loc + 0, v0 ); vst1q_f32( p_loc + 4, v1 ); vst1q_f32( p_loc + 8, v2 ); p_loc += ldp; a_loc += lda; // 1; } } } } else // if ( cdim0 < mnr || gs ) { PASTEMAC(sscal2m,BLIS_TAPI_EX_SUF) ( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, ( trans_t )conja, cdim0, k0, kappa, a, inca0, lda0, p, 1, ldp0, cntx, NULL ); if ( cdim0 < mnr ) { // Handle zero-filling along the "long" edge of the micropanel. const dim_t i = cdim0; const dim_t m_edge = mnr - cdim0; const dim_t n_edge = k0_max; float* restrict p_edge = p + (i )*1; bli_sset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } if ( k0 < k0_max ) { // Handle zero-filling along the "short" (far) edge of the micropanel. const dim_t j = k0; const dim_t m_edge = mnr; const dim_t n_edge = k0_max - k0; float* restrict p_edge = p + (j )*ldp; bli_sset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } cython-blis-0.9.1/blis/_src/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c000066400000000000000000000307771427272030600262240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Linaro Limited Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #include #if defined(__clang__) #define PRAGMA_NOUNROLL _Pragma("nounroll") #define PRAGMA_UNROLL_4 _Pragma("unroll 4") #elif defined(__GNUC__) #define PRAGMA_NOUNROLL _Pragma("GCC unroll 1") #define PRAGMA_UNROLL_4 _Pragma("GCC unroll 4") #else #define PRAGMA_NOUNROLL #define PRAGMA_UNROLL_4 #endif void bli_spackm_armv8a_int_8xk ( conj_t conja, pack_t schema, dim_t cdim0, dim_t k0, dim_t k0_max, float* restrict kappa, float* restrict a, inc_t inca0, inc_t lda0, float* restrict p, inc_t ldp0, cntx_t* restrict cntx ) { // This is the panel dimension assumed by the packm kernel. const dim_t mnr = 8; // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; float* a_loc = a; float* p_loc = p; // NOTE: For the purposes of the comments in this packm kernel, we // interpret inca and lda as rs_a and cs_a, respectively, and similarly // interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading // this packm kernel, you should think of the operation as packing an // m x n micropanel, where m and n are tiny and large, respectively, and // where elements of each column of the packed matrix P are contiguous. // (This packm kernel can still be used to pack micropanels of matrix B // in a gemm operation.) const uint64_t inca = inca0; const uint64_t lda = lda0; const uint64_t ldp = ldp0; const bool gs = ( inca0 != 1 && lda0 != 1 ); // NOTE: If/when this kernel ever supports scaling by kappa within the // assembly region, this constraint should be lifted. const bool unitk = bli_seq1( *kappa ); // ------------------------------------------------------------------------- if ( cdim0 == mnr && !gs ) { if ( unitk ) { if ( inca == 1 ) { // No need to use k-loops here. // Simply let compiler to expand loops. 
PRAGMA_UNROLL_4 for ( dim_t ik = k_iter * 4 + k_left; ik > 0; --ik ) { float32x4_t v0 = vld1q_f32( a_loc + 0 ); float32x4_t v1 = vld1q_f32( a_loc + 4 ); vst1q_f32( p_loc + 0, v0 ); vst1q_f32( p_loc + 4, v1 ); a_loc += lda; p_loc += ldp; } } else // if ( lda == 1 ) { float32x4_t v0 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v1 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v2 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v3 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v4 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v5 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v6 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v7 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t vt0; float32x4_t vt1; float32x4_t vt2; float32x4_t vt3; PRAGMA_NOUNROLL for ( ; k_iter > 0; --k_iter ) { v0 = vld1q_f32( a_loc + inca * 0 ); v1 = vld1q_f32( a_loc + inca * 1 ); v2 = vld1q_f32( a_loc + inca * 2 ); v3 = vld1q_f32( a_loc + inca * 3 ); v4 = vld1q_f32( a_loc + inca * 4 ); v5 = vld1q_f32( a_loc + inca * 5 ); v6 = vld1q_f32( a_loc + inca * 6 ); v7 = vld1q_f32( a_loc + inca * 7 ); // In-register transpose. 
// // Column 0-3 vt0 = vtrn1q_f32( v0, v1 ); vt1 = vtrn2q_f32( v0, v1 ); vt2 = vtrn1q_f32( v2, v3 ); vt3 = vtrn2q_f32( v2, v3 ); v0 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v1 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); v2 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v3 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); // Column 4-7 vt0 = vtrn1q_f32( v4, v5 ); vt1 = vtrn2q_f32( v4, v5 ); vt2 = vtrn1q_f32( v6, v7 ); vt3 = vtrn2q_f32( v6, v7 ); v4 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v5 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); v6 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v7 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); vst1q_f32( p_loc + 0, v0 ); vst1q_f32( p_loc + 4, v4 ); p_loc += ldp; vst1q_f32( p_loc + 0, v1 ); vst1q_f32( p_loc + 4, v5 ); p_loc += ldp; vst1q_f32( p_loc + 0, v2 ); vst1q_f32( p_loc + 4, v6 ); p_loc += ldp; vst1q_f32( p_loc + 0, v3 ); vst1q_f32( p_loc + 4, v7 ); p_loc += ldp; a_loc += 4 * lda; // 4; } for ( ; k_left > 0; --k_left ) { v0 = vld1q_lane_f32( a_loc + inca * 0 , v0, 0 ); v0 = vld1q_lane_f32( a_loc + inca * 1 , v0, 1 ); v0 = vld1q_lane_f32( a_loc + inca * 2 , v0, 2 ); v0 = vld1q_lane_f32( a_loc + inca * 3 , v0, 3 ); v1 = vld1q_lane_f32( a_loc + inca * 4 , v1, 0 ); v1 = vld1q_lane_f32( a_loc + inca * 5 , v1, 1 ); v1 = vld1q_lane_f32( a_loc + inca * 6 , v1, 2 ); v1 = vld1q_lane_f32( a_loc + inca * 7 , v1, 3 ); vst1q_f32( p_loc + 0, v0 ); vst1q_f32( p_loc + 4, v1 ); p_loc += ldp; a_loc += lda; // 1; } } } else // if ( !unitk ) { float32x4_t vkappa = vld1q_dup_f32( kappa ); if ( inca == 1 ) { // No need to use k-loops here. // Simply let compiler to expand loops. PRAGMA_UNROLL_4 for ( dim_t ik = k_iter * 4 + k_left; ik > 0; --ik ) { float32x4_t v0 = vld1q_f32( a_loc + 0 ); float32x4_t v1 = vld1q_f32( a_loc + 4 ); // Scale by kappa. 
v0 = vmulq_f32( v0, vkappa ); v1 = vmulq_f32( v1, vkappa ); vst1q_f32( p_loc + 0, v0 ); vst1q_f32( p_loc + 4, v1 ); a_loc += lda; p_loc += ldp; } } else // if ( lda == 1 ) { float32x4_t v0 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v1 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v2 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v3 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v4 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v5 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v6 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t v7 = (float32x4_t)vdupq_n_u32( 0 ); float32x4_t vt0; float32x4_t vt1; float32x4_t vt2; float32x4_t vt3; PRAGMA_NOUNROLL for ( ; k_iter > 0; --k_iter ) { v0 = vld1q_f32( a_loc + inca * 0 ); v1 = vld1q_f32( a_loc + inca * 1 ); v2 = vld1q_f32( a_loc + inca * 2 ); v3 = vld1q_f32( a_loc + inca * 3 ); v4 = vld1q_f32( a_loc + inca * 4 ); v5 = vld1q_f32( a_loc + inca * 5 ); v6 = vld1q_f32( a_loc + inca * 6 ); v7 = vld1q_f32( a_loc + inca * 7 ); // Scale by kappa. v0 = vmulq_f32( v0, vkappa ); v1 = vmulq_f32( v1, vkappa ); v2 = vmulq_f32( v2, vkappa ); v3 = vmulq_f32( v3, vkappa ); v4 = vmulq_f32( v4, vkappa ); v5 = vmulq_f32( v5, vkappa ); v6 = vmulq_f32( v6, vkappa ); v7 = vmulq_f32( v7, vkappa ); // In-register transpose. 
// // Column 0-3 vt0 = vtrn1q_f32( v0, v1 ); vt1 = vtrn2q_f32( v0, v1 ); vt2 = vtrn1q_f32( v2, v3 ); vt3 = vtrn2q_f32( v2, v3 ); v0 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v1 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); v2 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v3 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); // Column 4-7 vt0 = vtrn1q_f32( v4, v5 ); vt1 = vtrn2q_f32( v4, v5 ); vt2 = vtrn1q_f32( v6, v7 ); vt3 = vtrn2q_f32( v6, v7 ); v4 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v5 = (float32x4_t)vtrn1q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); v6 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt0, (float64x2_t)vt2 ); v7 = (float32x4_t)vtrn2q_f64( (float64x2_t)vt1, (float64x2_t)vt3 ); vst1q_f32( p_loc + 0, v0 ); vst1q_f32( p_loc + 4, v4 ); p_loc += ldp; vst1q_f32( p_loc + 0, v1 ); vst1q_f32( p_loc + 4, v5 ); p_loc += ldp; vst1q_f32( p_loc + 0, v2 ); vst1q_f32( p_loc + 4, v6 ); p_loc += ldp; vst1q_f32( p_loc + 0, v3 ); vst1q_f32( p_loc + 4, v7 ); p_loc += ldp; a_loc += 4 * lda; // 4; } for ( ; k_left > 0; --k_left ) { v0 = vld1q_lane_f32( a_loc + inca * 0 , v0, 0 ); v0 = vld1q_lane_f32( a_loc + inca * 1 , v0, 1 ); v0 = vld1q_lane_f32( a_loc + inca * 2 , v0, 2 ); v0 = vld1q_lane_f32( a_loc + inca * 3 , v0, 3 ); v1 = vld1q_lane_f32( a_loc + inca * 4 , v1, 0 ); v1 = vld1q_lane_f32( a_loc + inca * 5 , v1, 1 ); v1 = vld1q_lane_f32( a_loc + inca * 6 , v1, 2 ); v1 = vld1q_lane_f32( a_loc + inca * 7 , v1, 3 ); // Scale by kappa. v0 = vmulq_f32( v0, vkappa ); v1 = vmulq_f32( v1, vkappa ); vst1q_f32( p_loc + 0, v0 ); vst1q_f32( p_loc + 4, v1 ); p_loc += ldp; a_loc += lda; // 1; } } } } else // if ( cdim0 < mnr || gs ) { PASTEMAC(sscal2m,BLIS_TAPI_EX_SUF) ( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, ( trans_t )conja, cdim0, k0, kappa, a, inca0, lda0, p, 1, ldp0, cntx, NULL ); if ( cdim0 < mnr ) { // Handle zero-filling along the "long" edge of the micropanel. 
const dim_t i = cdim0; const dim_t m_edge = mnr - cdim0; const dim_t n_edge = k0_max; float* restrict p_edge = p + (i )*1; bli_sset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } if ( k0 < k0_max ) { // Handle zero-filling along the "short" (far) edge of the micropanel. const dim_t j = k0; const dim_t m_edge = mnr; const dim_t n_edge = k0_max - k0; float* restrict p_edge = p + (j )*ldp; bli_sset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } cython-blis-0.9.1/blis/_src/kernels/armv8a/3/000077500000000000000000000000001427272030600206415ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/armv8a/3/armv8a_asm_d2x2.h000066400000000000000000000042231427272030600237100ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2021, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Orientation of the register tiles updated by the nanokernels below
 * (layout of this diagram reconstructed from a whitespace-collapsed copy;
 * NOTE(review): confirm against the upstream header):
 *
 * C     A    B
 * || <- | * --
 * ||    |
 *
 * or:
 * C     B * A
 * -- <- | --
 * --    |
 */

/* Inline-asm fragment for a 2x2 double-precision rank-1 update.
 * C0, C1, A and B are SIMD register NUMBERS; they are stringified and
 * pasted into the instruction text. Emits two by-element FMLAs:
 *   v<C0>.2d += v<A>.2d * v<B>.d[0]
 *   v<C1>.2d += v<A>.2d * v<B>.d[1]
 */
#define DGEMM_2X2_NANOKERNEL(C0,C1,A,B) \
" fmla v"#C0".2d, v"#A".2d, v"#B".d[0] \n\t" \
" fmla v"#C1".2d, v"#A".2d, v"#B".d[1] \n\t"

/* Single-precision counterpart for a 4x4 tile: four by-element FMLAs,
 *   v<Cj>.4s += v<A>.4s * v<B>.s[j]   for j = 0..3.
 */
#define SGEMM_4X4_NANOKERNEL(C0,C1,C2,C3,A,B) \
" fmla v"#C0".4s, v"#A".4s, v"#B".s[0] \n\t" \
" fmla v"#C1".4s, v"#A".4s, v"#B".s[1] \n\t" \
" fmla v"#C2".4s, v"#A".4s, v"#B".s[2] \n\t" \
" fmla v"#C3".4s, v"#A".4s, v"#B".s[3] \n\t"

 cython-blis-0.9.1/blis/_src/kernels/armv8a/3/armv8a_asm_utils.h000066400000000000000000000101331427272030600242660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2021, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Apple/Clang's local label requirements. #if defined(__APPLE__) || defined(__clang__) #define LABEL(str) " L" #str"%=: \n\t" #define BEQ(str) "b.eq L" #str"%= \n\t" #define BNE(str) "b.ne L" #str"%= \n\t" #define BRANCH(str) "b L" #str"%= \n\t" #else #define LABEL(str) " ." #str": \n\t" #define BEQ(str) "b.eq ." #str" \n\t" #define BNE(str) "b.ne ." #str" \n\t" #define BRANCH(str) "b ." #str" \n\t" #endif // Clear vectors. #define CLEAR1V(V) \ " dup v"#V".2d, xzr \n\t" #define CLEAR2V(V0,V1) \ CLEAR1V(V0) \ CLEAR1V(V1) #define CLEAR4V(V0,V1,V2,V3) \ CLEAR2V(V0,V1) \ CLEAR2V(V2,V3) #define CLEAR8V(V0,V1,V2,V3,V4,V5,V6,V7) \ CLEAR4V(V0,V1,V2,V3) \ CLEAR4V(V4,V5,V6,V7) // Scale vectors. #define DSCALE1V(V,A,IDX) \ " fmul v"#V".2d, v"#V".2d, v"#A".d["#IDX"] \n\t" #define DSCALE2V(V0,V1,A,IDX) \ DSCALE1V(V0,A,IDX) \ DSCALE1V(V1,A,IDX) #define DSCALE4V(V0,V1,V2,V3,A,IDX) \ DSCALE2V(V0,V1,A,IDX) \ DSCALE2V(V2,V3,A,IDX) #define DSCALE8V(V0,V1,V2,V3,V4,V5,V6,V7,A,IDX) \ DSCALE4V(V0,V1,V2,V3,A,IDX) \ DSCALE4V(V4,V5,V6,V7,A,IDX) // Scale-accumulate. 
// DSCALEA1V: vD.2d += vS.2d * vA.d[IDX] (fused multiply-add of both
// double lanes of vS, scaled by one element of vA, into vD). The 2/4/8
// variants pair each destination Di with the source Si and use the same
// scaling element for all of them. All register arguments are bare
// register numbers.
#define DSCALEA1V(D,S,A,IDX) \
" fmla v"#D".2d, v"#S".2d, v"#A".d["#IDX"] \n\t"
#define DSCALEA2V(D0,D1,S0,S1,A,IDX) \
DSCALEA1V(D0,S0,A,IDX) \
DSCALEA1V(D1,S1,A,IDX)
#define DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \
DSCALEA2V(D0,D1,S0,S1,A,IDX) \
DSCALEA2V(D2,D3,S2,S3,A,IDX)
#define DSCALEA8V(D0,D1,D2,D3,D4,D5,D6,D7,S0,S1,S2,S3,S4,S5,S6,S7,A,IDX) \
DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \
DSCALEA4V(D4,D5,D6,D7,S4,S5,S6,S7,A,IDX)

// Load one line.
// DLOAD1V loads the 128-bit register qV from [ADDR + SHIFT]. ADDR is a
// general-purpose register name (e.g. x0) and SHIFT a byte offset; ADDR is
// not modified. DLOAD2V/DLOAD4V load consecutive registers from
// consecutive 16-byte slots. The offset is passed down as the unevaluated
// text "SHIFT+16" / "SHIFT+32", so the final sum is left to the assembler.
#define DLOAD1V(V,ADDR,SHIFT) \
" ldr q"#V", ["#ADDR", #"#SHIFT"] \n\t"
#define DLOAD2V(V0,V1,ADDR,SHIFT) \
DLOAD1V(V0,ADDR,SHIFT) \
DLOAD1V(V1,ADDR,SHIFT+16)
#define DLOAD4V(V0,V1,V2,V3,ADDR,SHIFT) \
DLOAD2V(V0,V1,ADDR,SHIFT) \
DLOAD2V(V2,V3,ADDR,SHIFT+32)

// Generic: load one line.
// Fills the two double lanes of vV with two separate element loads in
// post-indexed form: after each load, ADDR is advanced by INC. Unlike
// DLOAD1V this modifies ADDR.
#define DLOAD1V_GATHER_ELMFWD(V,ADDR,INC) \
" ld1 {v"#V".d}[0], ["#ADDR"], "#INC" \n\t" \
" ld1 {v"#V".d}[1], ["#ADDR"], "#INC" \n\t"

// Store one line.
// Mirror of DLOAD1V/2V/4V: stores qV to [ADDR + SHIFT]; ADDR is not
// modified, consecutive registers go to consecutive 16-byte slots.
#define DSTORE1V(V,ADDR,SHIFT) \
" str q"#V", ["#ADDR", #"#SHIFT"] \n\t"
#define DSTORE2V(V0,V1,ADDR,SHIFT) \
DSTORE1V(V0,ADDR,SHIFT) \
DSTORE1V(V1,ADDR,SHIFT+16)
#define DSTORE4V(V0,V1,V2,V3,ADDR,SHIFT) \
DSTORE2V(V0,V1,ADDR,SHIFT) \
DSTORE2V(V2,V3,ADDR,SHIFT+32)

// Generic: store one line.
// Mirror of DLOAD1V_GATHER_ELMFWD: writes the two double lanes of vV with
// two post-indexed element stores, advancing ADDR by INC after each.
#define DSTORE1V_SCATTER_ELMFWD(V,ADDR,INC) \
" st1 {v"#V".d}[0], ["#ADDR"], "#INC" \n\t" \
" st1 {v"#V".d}[1], ["#ADDR"], "#INC" \n\t"
cython-blis-0.9.1/blis/_src/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c000066400000000000000000002474251427272030600255650ustar00rootroot00000000000000
/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "armv8a_asm_utils.h" /* o 4x4 Single precision micro-kernel fully functional. o Runnable on ARMv8, compiled with aarch64 GCC. o Use it together with the armv8 BLIS configuration. o Tested on Juno board. Around 7.3 GFLOPS @ 1.1 GHz. December 2014. * UPDATE NOVEMBER 2015 * Micro-kernel changed to 8x12 * Tested on Juno Board. Around 8.1 GFLOPS, 1 x A57 core @ 1.1 GHz. * Tested on Juno Board. Around 15.9 GFLOPS, 2 x A57 cores @ 1.1 GHz. * Tested on Juno board. Around 3.1 GFLOPS, 1 x A53 core @ 850 MHz. * Tested on Juno board. Around 12 GFLOPS, 4 x A53 cores @ 850 MHz. 
*/ void bli_sgemm_armv8a_asm_8x12 ( dim_t m, dim_t n, dim_t k, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k / 4; uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT( s, 8, 12, false ); __asm__ volatile ( " \n\t" " \n\t" " ldr x0,%[aaddr] \n\t" // Load address of A. " ldr x1,%[baddr] \n\t" // Load address of B. " ldr x2,%[caddr] \n\t" // Load address of C. " \n\t" " ldr x5,%[k_iter] \n\t" // Number of unrolled iterations (k_iter). " ldr x6,%[k_left] \n\t" // Number of remaining iterations (k_left). " \n\t" " ldr x10,%[cs_c] \n\t" // Load cs_c. " lsl x10,x10,#2 \n\t" // cs_c * sizeof(float) -- AUX. " \n\t" // " ldr x14,%[rs_c] \n\t" // Load rs_c. // " lsl x14,x14,#2 \n\t" // rs_c * sizeof(float). " \n\t" " add x16,x2,x10 \n\t" //Load address Column 1 of C " add x17,x16,x10 \n\t" //Load address Column 2 of C " add x19,x17,x10 \n\t" //Load address Column 3 of C " add x20,x19,x10 \n\t" //Load address Column 4 of C " add x21,x20,x10 \n\t" //Load address Column 5 of C " add x22,x21,x10 \n\t" //Load address Column 6 of C " add x23,x22,x10 \n\t" //Load address Column 7 of C " add x24,x23,x10 \n\t" //Load address Column 8 of C " add x25,x24,x10 \n\t" //Load address Column 9 of C " add x26,x25,x10 \n\t" //Load address Column 10 of C " add x27,x26,x10 \n\t" //Load address Column 11 of C " \n\t" " prfm pldl1keep,[x2] \n\t" // Prefetch c. " prfm pldl1keep,[x16] \n\t" // Prefetch c. " prfm pldl1keep,[x17] \n\t" // Prefetch c. " prfm pldl1keep,[x19] \n\t" // Prefetch c. " prfm pldl1keep,[x20] \n\t" // Prefetch c. " prfm pldl1keep,[x21] \n\t" // Prefetch c. 
" prfm pldl1keep,[x22] \n\t" // Prefetch c. " prfm pldl1keep,[x23] \n\t" // Prefetch c. " prfm pldl1keep,[x24] \n\t" // Prefetch c. " prfm pldl1keep,[x25] \n\t" // Prefetch c. " prfm pldl1keep,[x26] \n\t" // Prefetch c. " prfm pldl1keep,[x27] \n\t" // Prefetch c. " \n\t" " dup v8.4s, wzr \n\t" // Vector for accummulating column 0 " prfm PLDL1KEEP, [x1, #192] \n\t" " dup v9.4s, wzr \n\t" // Vector for accummulating column 0 " prfm PLDL1KEEP, [x1, #256] \n\t" " dup v10.4s, wzr \n\t" // Vector for accummulating column 1 " prfm PLDL1KEEP, [x1, #320] \n\t" " dup v11.4s, wzr \n\t" // Vector for accummulating column 1 " dup v12.4s, wzr \n\t" // Vector for accummulating column 2 " dup v13.4s, wzr \n\t" // Vector for accummulating column 2 " \n\t" " dup v14.4s, wzr \n\t" // Vector for accummulating column 3 " prfm PLDL1KEEP, [x0, #128] \n\t" " dup v15.4s, wzr \n\t" // Vector for accummulating column 3 " prfm PLDL1KEEP, [x0, #192] \n\t" " dup v16.4s, wzr \n\t" // Vector for accummulating column 4 " dup v17.4s, wzr \n\t" // Vector for accummulating column 4 " dup v18.4s, wzr \n\t" // Vector for accummulating column 5 " dup v19.4s, wzr \n\t" // Vector for accummulating column 5 " \n\t" " dup v20.4s, wzr \n\t" // Vector for accummulating column 6 " dup v21.4s, wzr \n\t" // Vector for accummulating column 6 " dup v22.4s, wzr \n\t" // Vector for accummulating column 7 " dup v23.4s, wzr \n\t" // Vector for accummulating column 7 " dup v24.4s, wzr \n\t" // Vector for accummulating column 8 " dup v25.4s, wzr \n\t" // Vector for accummulating column 8 " \n\t" " dup v26.4s, wzr \n\t" // Vector for accummulating column 9 " dup v27.4s, wzr \n\t" // Vector for accummulating column 9 " dup v28.4s, wzr \n\t" // Vector for accummulating column 10 " dup v29.4s, wzr \n\t" // Vector for accummulating column 10 " dup v30.4s, wzr \n\t" // Vector for accummulating column 11 " dup v31.4s, wzr \n\t" // Vector for accummulating column 11 " \n\t" " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. 
BEQ(SCONSIDERKLEFT) " \n\t" " ldr q0, [x0] \n\t" " ldr q1, [x0, #16] \n\t" // Load a " \n\t" " ldr q2, [x1] \n\t" // Load b " ldr q3, [x1, #16] \n\t" " ldr q4, [x1, #32] \n\t" " \n\t" " add x0, x0, #32 \n\t" //update address of A " add x1, x1, #48 \n\t" //update address of B " \n\t" " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. BEQ(SLASTITER) // (as loop is do-while-like). " \n\t" LABEL(SLOOPKITER) // Body of the k_iter loop. " \n\t" " ldr q5, [x0] \n\t" " fmla v8.4s, v0.4s,v2.s[0] \n\t" // Accummulate. " fmla v9.4s, v1.4s,v2.s[0] \n\t" // Accummulate. " ldr q6, [x0, #16] \n\t" " fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. " fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. " fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. " fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. " fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. " fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. " ldr q2, [x1] \n\t" " \n\t" " fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. " prfm PLDL1KEEP, [x1, #336] \n\t" " fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. " prfm PLDL1KEEP, [x1, #400] \n\t" " fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. " fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. " prfm PLDL1KEEP, [x1, #464] \n\t" " fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. " fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. " fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. " fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. " \n\t" " fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. " fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. " fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. " fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. " ldr q3, [x1, #16] \n\t" " \n\t" " fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. " fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. " fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. 
" ldr q4, [x1, #32] \n\t" " \n\t" //End It 1 " \n\t" " ldr q0, [x0, #32] \n\t" " fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. " fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. " ldr q1, [x0, #48] \n\t" " fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. " fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. " fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. " fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. " fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. " fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. " ldr q2, [x1, #48] \n\t" " \n\t" " fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. " prfm PLDL1KEEP, [x0, #224] \n\t" " fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. " prfm PLDL1KEEP, [x0, #288] \n\t" " fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. " fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. " fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. " fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. " fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. " fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. " \n\t" " fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. " fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. " fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. " fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. " ldr q3, [x1, #64] \n\t" " \n\t" " fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. " fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. " fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. " fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. " ldr q4, [x1, #80] \n\t" " \n\t" //End It 2 " \n\t" " ldr q5, [x0, #64] \n\t" " fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. " fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. " ldr q6, [x0, #80] \n\t" " fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. " fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. " fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. " fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. " fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. " fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. 
" ldr q2, [x1, #96] \n\t" " \n\t" " fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. " fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. " fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. " fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. " fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. " fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. " fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. " fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. " \n\t" " fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. " fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. " fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. " fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. " ldr q3, [x1, #112] \n\t" " \n\t" " fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. " fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. " fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. " ldr q4, [x1, #128] \n\t" " \n\t" //End It 3 " \n\t" " ldr q0, [x0, #96] \n\t" " fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. " fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. " ldr q1, [x0, #112] \n\t" " fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. " fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. " fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. " fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. " fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. " fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. " ldr q2, [x1, #144] \n\t" " \n\t" " fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. " fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. " fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. " fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. " fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. " fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. " fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. " fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. " \n\t" " fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. " fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. " fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. 
" fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. " ldr q3, [x1, #160] \n\t" " \n\t" " fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. " fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. " fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. " fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. " ldr q4, [x1, #176] \n\t" " add x1, x1, #192 \n\t" " add x0, x0, #128 \n\t" " \n\t" //End It 4 " sub x5,x5,1 \n\t" // i-=1. " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. BNE(SLOOPKITER) " \n\t" LABEL(SLASTITER) // Last iteration of k_iter loop. " \n\t" " \n\t" " ldr q5, [x0] \n\t" " fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. " fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. " ldr q6, [x0, #16] \n\t" " fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. " fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. " fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. " fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. " fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. " fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. " ldr q2, [x1] \n\t" " \n\t" " fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. " fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. " fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. " fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. " fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. " fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. " fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. " fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. " \n\t" " fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. " fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. " fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. " fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. " ldr q3, [x1, #16] \n\t" " \n\t" " fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. " fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. " fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. 
" ldr q4, [x1, #32] \n\t" " \n\t" //End It 1 " \n\t" " ldr q0, [x0, #32] \n\t" " fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. " fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. " ldr q1, [x0, #48] \n\t" " fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. " fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. " fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. " fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. " fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. " fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. " ldr q2, [x1, #48] \n\t" " \n\t" " fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. " fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. " fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. " fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. " fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. " fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. " fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. " fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. " \n\t" " fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. " fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. " fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. " fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. " ldr q3, [x1, #64] \n\t" " \n\t" " fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. " fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. " fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. " fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. " ldr q4, [x1, #80] \n\t" " \n\t" //End It 2 " \n\t" " ldr q5, [x0, #64] \n\t" " fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. " fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. " ldr q6, [x0, #80] \n\t" " fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. " fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. " fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. " fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. " fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. " fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. " ldr q2, [x1, #96] \n\t" " \n\t" " fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. 
" fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. " fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. " fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. " fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. " fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. " fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. " fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. " \n\t" " fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. " fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. " fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. " fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. " ldr q3, [x1, #112] \n\t" " \n\t" " fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. " fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. " fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. " ldr q4, [x1, #128] \n\t" " \n\t" //End It 3 " \n\t" " fmla v8.4s,v5.4s,v2.s[0] \n\t" // Accummulate. " fmla v9.4s,v6.4s,v2.s[0] \n\t" // Accummulate. " fmla v10.4s,v5.4s,v2.s[1] \n\t" // Accummulate. " fmla v11.4s,v6.4s,v2.s[1] \n\t" // Accummulate. " fmla v12.4s,v5.4s,v2.s[2] \n\t" // Accummulate. " fmla v13.4s,v6.4s,v2.s[2] \n\t" // Accummulate. " fmla v14.4s,v5.4s,v2.s[3] \n\t" // Accummulate. " fmla v15.4s,v6.4s,v2.s[3] \n\t" // Accummulate. " \n\t" " fmla v16.4s,v5.4s,v3.s[0] \n\t" // Accummulate. " fmla v17.4s,v6.4s,v3.s[0] \n\t" // Accummulate. " fmla v18.4s,v5.4s,v3.s[1] \n\t" // Accummulate. " fmla v19.4s,v6.4s,v3.s[1] \n\t" // Accummulate. " fmla v20.4s,v5.4s,v3.s[2] \n\t" // Accummulate. " fmla v21.4s,v6.4s,v3.s[2] \n\t" // Accummulate. " fmla v22.4s,v5.4s,v3.s[3] \n\t" // Accummulate. " fmla v23.4s,v6.4s,v3.s[3] \n\t" // Accummulate. " \n\t" " fmla v24.4s,v5.4s,v4.s[0] \n\t" // Accummulate. " fmla v26.4s,v5.4s,v4.s[1] \n\t" // Accummulate. " fmla v28.4s,v5.4s,v4.s[2] \n\t" // Accummulate. " fmla v30.4s,v5.4s,v4.s[3] \n\t" // Accummulate. " \n\t" " fmla v25.4s,v6.4s,v4.s[0] \n\t" // Accummulate. " fmla v27.4s,v6.4s,v4.s[1] \n\t" // Accummulate. 
" fmla v29.4s,v6.4s,v4.s[2] \n\t" // Accummulate. " fmla v31.4s,v6.4s,v4.s[3] \n\t" // Accummulate. " add x1, x1, #144 \n\t" " add x0, x0, #96 \n\t" " \n\t" //End It 4 " \n\t" LABEL(SCONSIDERKLEFT) " cmp x6,0 \n\t" // If k_left == 0, we are done. BEQ(SPOSTACCUM) // else, we enter the k_left loop. " \n\t" LABEL(SLOOPKLEFT) // Body of the left iterations " \n\t" " ldr q0, [x0],#16 \n\t" " ldr q1, [x0],#16 \n\t" // Load a " \n\t" " ldr q2, [x1],#16 \n\t" // Load b " ldr q3, [x1],#16 \n\t" " ldr q4, [x1],#16 \n\t" " \n\t" " sub x6,x6,1 \n\t" // i = i-1. " \n\t" " fmla v8.4s,v0.4s,v2.s[0] \n\t" // Accummulate. " fmla v9.4s,v1.4s,v2.s[0] \n\t" // Accummulate. " fmla v10.4s,v0.4s,v2.s[1] \n\t" // Accummulate. " fmla v11.4s,v1.4s,v2.s[1] \n\t" // Accummulate. " fmla v12.4s,v0.4s,v2.s[2] \n\t" // Accummulate. " fmla v13.4s,v1.4s,v2.s[2] \n\t" // Accummulate. " fmla v14.4s,v0.4s,v2.s[3] \n\t" // Accummulate. " fmla v15.4s,v1.4s,v2.s[3] \n\t" // Accummulate. " \n\t" " fmla v16.4s,v0.4s,v3.s[0] \n\t" // Accummulate. " fmla v17.4s,v1.4s,v3.s[0] \n\t" // Accummulate. " fmla v18.4s,v0.4s,v3.s[1] \n\t" // Accummulate. " fmla v19.4s,v1.4s,v3.s[1] \n\t" // Accummulate. " fmla v20.4s,v0.4s,v3.s[2] \n\t" // Accummulate. " fmla v21.4s,v1.4s,v3.s[2] \n\t" // Accummulate. " fmla v22.4s,v0.4s,v3.s[3] \n\t" // Accummulate. " fmla v23.4s,v1.4s,v3.s[3] \n\t" // Accummulate. " \n\t" " fmla v24.4s,v0.4s,v4.s[0] \n\t" // Accummulate. " fmla v26.4s,v0.4s,v4.s[1] \n\t" // Accummulate. " fmla v28.4s,v0.4s,v4.s[2] \n\t" // Accummulate. " fmla v30.4s,v0.4s,v4.s[3] \n\t" // Accummulate. " fmla v25.4s,v1.4s,v4.s[0] \n\t" // Accummulate. " fmla v27.4s,v1.4s,v4.s[1] \n\t" // Accummulate. " fmla v29.4s,v1.4s,v4.s[2] \n\t" // Accummulate. " fmla v31.4s,v1.4s,v4.s[3] \n\t" // Accummulate. " \n\t" " cmp x6,0 \n\t" // Iterate again. BNE(SLOOPKLEFT) // if i!=0. " \n\t" LABEL(SPOSTACCUM) " \n\t" " ldr x0,%[alpha] \n\t" // Alpha address. " ldr x1,%[beta] \n\t" // Beta address. 
" \n\t" " ld1r {v6.4s},[x0] \n\t" // Load alpha. " ld1r {v7.4s},[x1] \n\t" // Load beta " \n\t" " ldr x0,%[a_next] \n\t" // Pointer to next block of A. " ldr x1,%[b_next] \n\t" // Pointer to next pointer of B. " \n\t" LABEL(SCOLSTORED) // C is column-major. " \n\t" " dup v0.4s, wzr \n\t" " dup v1.4s, wzr \n\t" " dup v2.4s, wzr \n\t" " dup v3.4s, wzr \n\t" " dup v4.4s, wzr \n\t" " dup v5.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case. " \n\t" " ldr q0, [x2] \n\t" //Load column 0 of C " ldr q1, [x2, #16] \n\t" " ldr q2, [x16] \n\t" //Load column 1 of C " ldr q3, [x16, #16] \n\t" " ldr q4, [x17] \n\t" //Load column 2 of C " ldr q5, [x17, #16] \n\t" " \n\t" " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta " fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta " fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" LABEL(SBETAZEROCOLSTOREDS1) " \n\t" " fmla v0.4s,v8.4s,v6.s[0] \n\t" // Scale by alpha " fmla v1.4s,v9.4s,v6.s[0] \n\t" // Scale by alpha " fmla v2.4s,v10.4s,v6.s[0] \n\t" // Scale by alpha " fmla v3.4s,v11.4s,v6.s[0] \n\t" // Scale by alpha " fmla v4.4s,v12.4s,v6.s[0] \n\t" // Scale by alpha " fmla v5.4s,v13.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" " str q0, [x2] \n\t" //Store column 0 of C " str q1, [x2, #16] \n\t" " str q2, [x16] \n\t" //Store column 1 of C " str q3, [x16, #16] \n\t" " str q4, [x17] \n\t" //Store column 2 of C " str q5, [x17, #16] \n\t" " \n\t" " dup v8.4s, wzr \n\t" " dup v9.4s, wzr \n\t" " dup v10.4s, wzr \n\t" " dup v11.4s, wzr \n\t" " dup v12.4s, wzr \n\t" " dup v13.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROCOLSTOREDS2) // Taking care of the beta==0 case. 
" \n\t" " ldr q8, [x19] \n\t" //Load column 3 of C " ldr q9, [x19, #16] \n\t" " ldr q10, [x20] \n\t" //Load column 4 of C " ldr q11, [x20, #16] \n\t" " ldr q12, [x21] \n\t" //Load column 5 of C " ldr q13, [x21, #16] \n\t" " \n\t" " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta " fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta " fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta " \n\t" LABEL(SBETAZEROCOLSTOREDS2) " \n\t" " fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha " fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha " fmla v10.4s,v16.4s,v6.s[0] \n\t" // Scale by alpha " fmla v11.4s,v17.4s,v6.s[0] \n\t" // Scale by alpha " fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha " fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" " str q8, [x19] \n\t" //Store column 3 of C " str q9, [x19, #16] \n\t" " str q10, [x20] \n\t" //Store column 4 of C " str q11, [x20, #16] \n\t" " str q12, [x21] \n\t" //Store column 5 of C " str q13, [x21, #16] \n\t" " \n\t" " dup v0.4s, wzr \n\t" " dup v1.4s, wzr \n\t" " dup v2.4s, wzr \n\t" " dup v3.4s, wzr \n\t" " dup v4.4s, wzr \n\t" " dup v5.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. 
" \n\t" " ldr q0, [x22] \n\t" //Load column 6 of C " ldr q1, [x22, #16] \n\t" " ldr q2, [x23] \n\t" //Load column 7 of C " ldr q3, [x23, #16] \n\t" " ldr q4, [x24] \n\t" //Load column 8 of C " ldr q5, [x24, #16] \n\t" " \n\t" " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta " fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta " fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta " \n\t" LABEL(SBETAZEROCOLSTOREDS3) " \n\t" " fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha " fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha " fmla v2.4s,v22.4s,v6.s[0] \n\t" // Scale by alpha " fmla v3.4s,v23.4s,v6.s[0] \n\t" // Scale by alpha " fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha " fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" " str q0, [x22] \n\t" //Store column 6 of C " str q1, [x22, #16] \n\t" " str q2, [x23] \n\t" //Store column 7 of C " str q3, [x23, #16] \n\t" " str q4, [x24] \n\t" //Store column 8 of C " str q5, [x24, #16] \n\t" " \n\t" " dup v8.4s, wzr \n\t" " dup v9.4s, wzr \n\t" " dup v10.4s, wzr \n\t" " dup v11.4s, wzr \n\t" " dup v12.4s, wzr \n\t" " dup v13.4s, wzr \n\t" " \n\t" " fcmp s7,#0.0 \n\t" BEQ(SBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. 
" \n\t" " ldr q8, [x25] \n\t" //Load column 9 of C " ldr q9, [x25, #16] \n\t" " ldr q10, [x26] \n\t" //Load column 10 of C " ldr q11, [x26, #16] \n\t" " ldr q12, [x27] \n\t" //Load column 11 of C " ldr q13, [x27, #16] \n\t" " \n\t" " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta " fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta " fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta " \n\t" LABEL(SBETAZEROCOLSTOREDS4) " \n\t" " prfm pldl2keep,[x0] \n\t" " prfm pldl2keep,[x1] \n\t" " \n\t" " fmla v8.4s, v26.4s,v6.s[0] \n\t" // Scale by alpha " fmla v9.4s, v27.4s,v6.s[0] \n\t" // Scale by alpha " fmla v10.4s,v28.4s,v6.s[0] \n\t" // Scale by alpha " fmla v11.4s,v29.4s,v6.s[0] \n\t" // Scale by alpha " fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha " fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha " \n\t" " str q8, [x25] \n\t" //Store column 9 of C " str q9, [x25, #16] \n\t" " str q10, [x26] \n\t" //Store column 10 of C " str q11, [x26, #16] \n\t" " str q12, [x27] \n\t" //Store column 11 of C " str q13, [x27, #16] \n\t" " \n\t" " \n\t" // BRANCH(SEND) // Done. // LABEL(SEND) // Done! 
" \n\t" :// output operands (none) :// input operands [aaddr] "m" (a), // 0 [baddr] "m" (b), // 1 [caddr] "m" (c), // 2 [k_iter] "m" (k_iter), // 3 [k_left] "m" (k_left), // 4 [alpha] "m" (alpha), // 5 [beta] "m" (beta), // 6 [rs_c] "m" (rs_c), // 7 [cs_c] "m" (cs_c), // 8 [a_next] "m" (a_next), // 9 [b_next] "m" (b_next) // 10 :// Register clobber list "x0", "x1", "x2", "x5", "x6", "x10", "x16","x17","x19","x20", "x21","x22","x23","x24", "x25","x26","x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11", "v12","v13","v14","v15", "v16","v17","v18","v19", "v20","v21","v22","v23", "v24","v25","v26","v27", "v28","v29","v30","v31" ); GEMM_UKR_FLUSH_CT( s ); } /* o 4x4 Double precision micro-kernel NOT fully functional yet. o Runnable on ARMv8, compiled with aarch64 GCC. o Use it together with the armv8 BLIS configuration. o Tested on Juno board. Around 3 GFLOPS @ 1.1 GHz. December 2014. * UPDATE OCTOBER 2015: Now is fully functional. * Tested on Juno board. Around 5.6 GFLOPS, 2 A57 cores @ 1.1 GHz. * Tested on Juno board. Around 4 GFLOPS, 4 A53 cores @ 850 MHz. * UPDATE NOVEMBER 2015 * Micro-kernel changed to 6x8 * Tested on Juno Board. Around 4 GFLOPS, 1 x A57 core @ 1.1 GHz. * Tested on Juno Board. Around 7.6 GFLOPS, 2 x A57 cores @ 1.1 GHz. * Tested on Juno board. Around 1.5 GFLOPS, 1 x A53 core @ 850 MHz. * Tested on Juno board. Around 5.5 GFLOPS, 4 x A53 cores @ 850 MHz. */ void bli_dgemm_armv8a_asm_6x8 ( dim_t m, dim_t n, dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k / 4; uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT( d, 6, 8, false ); __asm__ volatile ( " \n\t" " ldr x0,%[aaddr] \n\t" // Load address of A " ldr x1,%[baddr] \n\t" // Load address of B " ldr x2,%[caddr] \n\t" // Load address of C " \n\t" " ldr x5,%[k_iter] \n\t" // Init guard (k_iter) " ldr x6,%[k_left] \n\t" // Init guard (k_iter) " \n\t" " ldr x10,%[cs_c] \n\t" // Load cs_c " lsl x10,x10,#3 \n\t" // cs_c * sizeof(double) " \n\t" // " ldr x14,%[rs_c] \n\t" // Load rs_c. // " lsl x14,x14,#3 \n\t" // rs_c * sizeof(double). " \n\t" " add x20,x2,x10 \n\t" //Load address Column 1 of C " add x21,x20,x10 \n\t" //Load address Column 2 of C " add x22,x21,x10 \n\t" //Load address Column 3 of C " add x23,x22,x10 \n\t" //Load address Column 4 of C " add x24,x23,x10 \n\t" //Load address Column 5 of C " add x25,x24,x10 \n\t" //Load address Column 6 of C " add x26,x25,x10 \n\t" //Load address Column 7 of C " \n\t" " prfm pldl1keep,[x2] \n\t" // Prefetch c. " prfm pldl1keep,[x20] \n\t" // Prefetch c. " prfm pldl1keep,[x21] \n\t" // Prefetch c. " prfm pldl1keep,[x22] \n\t" // Prefetch c. " prfm pldl1keep,[x23] \n\t" // Prefetch c. " prfm pldl1keep,[x24] \n\t" // Prefetch c. " prfm pldl1keep,[x25] \n\t" // Prefetch c. " prfm pldl1keep,[x26] \n\t" // Prefetch c. 
" \n\t" " dup v8.2d, xzr \n\t" // Vector for accummulating column 0 " prfm PLDL1KEEP, [x1, #256] \n\t" " dup v9.2d, xzr \n\t" // Vector for accummulating column 0 " prfm PLDL1KEEP, [x1, #320] \n\t" " dup v10.2d, xzr \n\t" // Vector for accummulating column 0 " prfm PLDL1KEEP, [x1, #384] \n\t" " dup v11.2d, xzr \n\t" // Vector for accummulating column 1 " prfm PLDL1KEEP, [x1, #448] \n\t" " dup v12.2d, xzr \n\t" // Vector for accummulating column 1 " dup v13.2d, xzr \n\t" // Vector for accummulating column 1 " \n\t" " dup v14.2d, xzr \n\t" // Vector for accummulating column 2 " prfm PLDL1KEEP, [x0, #192] \n\t" " dup v15.2d, xzr \n\t" // Vector for accummulating column 2 " prfm PLDL1KEEP, [x0, #256] \n\t" " dup v16.2d, xzr \n\t" // Vector for accummulating column 2 " prfm PLDL1KEEP, [x0, #320] \n\t" " dup v17.2d, xzr \n\t" // Vector for accummulating column 3 " dup v18.2d, xzr \n\t" // Vector for accummulating column 3 " dup v19.2d, xzr \n\t" // Vector for accummulating column 3 " \n\t" " dup v20.2d, xzr \n\t" // Vector for accummulating column 4 " dup v21.2d, xzr \n\t" // Vector for accummulating column 4 " dup v22.2d, xzr \n\t" // Vector for accummulating column 4 " dup v23.2d, xzr \n\t" // Vector for accummulating column 5 " dup v24.2d, xzr \n\t" // Vector for accummulating column 5 " dup v25.2d, xzr \n\t" // Vector for accummulating column 5 " \n\t" " dup v26.2d, xzr \n\t" // Vector for accummulating column 6 " dup v27.2d, xzr \n\t" // Vector for accummulating column 6 " dup v28.2d, xzr \n\t" // Vector for accummulating column 6 " dup v29.2d, xzr \n\t" // Vector for accummulating column 7 " dup v30.2d, xzr \n\t" // Vector for accummulating column 7 " dup v31.2d, xzr \n\t" // Vector for accummulating column 7 " \n\t" " \n\t" " cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. 
BEQ(DCONSIDERKLEFT) " \n\t" " ldr q0, [x0] \n\t" // Load a " ldr q1, [x0, #16] \n\t" " ldr q2, [x0, #32] \n\t" " \n\t" " ldr q3, [x1] \n\t" // Load b " ldr q4, [x1, #16] \n\t" " ldr q5, [x1, #32] \n\t" " ldr q6, [x1, #48] \n\t" " \n\t" " add x0, x0, #48 \n\t" //update address of A " add x1, x1, #64 \n\t" //update address of B " \n\t" " cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. BEQ(DLASTITER) // (as loop is do-while-like). " \n\t" LABEL(DLOOP) // Body " \n\t" " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " prfm PLDL1KEEP, [x1, #448] \n\t" //512-64=448 " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate " prfm PLDL1KEEP, [x1, #512] \n\t" " fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate " prfm PLDL1KEEP, [x1, #576] \n\t" " \n\t" " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate " fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate " \n\t" " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate " fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate " fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate " ldr q3, [x1] \n\t" " \n\t" " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate " fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate " ldr q7, [x0, #32] \n\t" " \n\t" " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate " fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate " ldr q4, [x1, #16] \n\t" " \n\t" " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate " fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate " ldr q5, [x1, #32] \n\t" " \n\t" " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate " ldr q0, [x0] \n\t" " \n\t" " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate " ldr q1, [x0, #16] \n\t" " \n\t" " fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate " fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate " ldr q6, 
[x1, #48] \n\t" " \n\t" // End it 1 " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " prfm PLDL1KEEP, [x1, #640] \n\t" " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate " prfm PLDL1KEEP, [x0, #336] \n\t" " fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate " prfm PLDL1KEEP, [x0, #400] \n\t" " \n\t" " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate " fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate " \n\t" " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate " fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate " fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate " ldr q3, [x1, #64] \n\t" " \n\t" " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate " fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate " ldr q2, [x0, #80] \n\t" " \n\t" " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate " fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate " ldr q4, [x1, #80] \n\t" " \n\t" " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate " fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate " ldr q5, [x1, #96] \n\t" " \n\t" " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate " ldr q0, [x0, #48] \n\t" " \n\t" " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate " ldr q1, [x0, #64] \n\t" " \n\t" " fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate " fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate " ldr q6, [x1, #112] \n\t" " \n\t" //End it 2 " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " prfm PLDL1KEEP, [x0, #464] \n\t" " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate " fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate " \n\t" " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate " fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate " \n\t" " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate " fmla 
v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate " fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate " ldr q3, [x1, #128] \n\t" " \n\t" " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate " fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate " ldr q7, [x0, #128] \n\t" " \n\t" " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate " fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate " ldr q4, [x1, #144] \n\t" " \n\t" " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate " fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate " ldr q5, [x1, #160] \n\t" " \n\t" " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate " ldr q0, [x0, #96] \n\t" " \n\t" " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate " ldr q1, [x0, #112] \n\t" " \n\t" " fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate " fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate " ldr q6, [x1, #176] \n\t" " \n\t" // End it 3 " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate " fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate " \n\t" " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate " fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate " ldr q3, [x1, #192] \n\t" " \n\t" " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate " fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate " fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate " ldr q2, [x0, #176] \n\t" " \n\t" " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate " fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate " ldr q4, [x1, #208] \n\t" " \n\t" " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate " fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate " \n\t" " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate " fmla 
v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate " fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate " ldr q5, [x1, #224] \n\t" " \n\t" " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate " ldr q0, [x0, #144] \n\t" " \n\t" " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate " ldr q1, [x0, #160] \n\t" " \n\t" " fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate " fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate " ldr q6, [x1, #240] \n\t" " \n\t" //End it 4 " add x0, x0, #192 \n\t" " add x1, x1, #256 \n\t" " \n\t" " sub x5,x5,1 \n\t" // i-=1 " cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. BNE(DLOOP) " \n\t" LABEL(DLASTITER) " \n\t" " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate " fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate " \n\t" " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate " fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate " ldr q3, [x1] \n\t" " \n\t" " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate " fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate " fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate " ldr q7, [x0, #32] \n\t" " \n\t" " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate " fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate " ldr q4, [x1, #16] \n\t" " \n\t" " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate " fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate " \n\t" " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate " fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate " ldr q5, [x1, #32] \n\t" " \n\t" " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate " ldr q0, [x0] \n\t" " \n\t" " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate " ldr q1, [x0, 
#16] \n\t" " \n\t" " fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate " fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate " ldr q6, [x1, #48] \n\t" " \n\t" // End it 1 " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate " fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate " \n\t" " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate " fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate " ldr q3, [x1, #64] \n\t" " \n\t" " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate " fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate " fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate " ldr q2, [x0, #80] \n\t" " \n\t" " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate " fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate " ldr q4, [x1, #80] \n\t" " \n\t" " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate " fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate " \n\t" " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate " fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate " ldr q5, [x1, #96] \n\t" " \n\t" " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate " ldr q0, [x0, #48] \n\t" " \n\t" " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate " ldr q1, [x0, #64] \n\t" " \n\t" " fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate " fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate " ldr q6, [x1, #112] \n\t" " \n\t" //End it 2 " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate " fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate " \n\t" " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate " fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate " ldr q3, [x1, #128] \n\t" " \n\t" " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate " fmla 
v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate " fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate " ldr q7, [x0, #128] \n\t" " \n\t" " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate " fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate " ldr q4, [x1, #144] \n\t" " \n\t" " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate " fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate " \n\t" " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate " fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate " ldr q5, [x1, #160] \n\t" " \n\t" " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate " ldr q0, [x0, #96] \n\t" " \n\t" " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate " ldr q1, [x0, #112] \n\t" " \n\t" " fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate " fmla v31.2d,v2.2d,v6.d[1] \n\t" // Accummulate " ldr q6, [x1, #176] \n\t" " \n\t" // End it 3 " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate " fmla v10.2d,v7.2d,v3.d[0] \n\t" // Accummulate " \n\t" " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate " fmla v13.2d,v7.2d,v3.d[1] \n\t" // Accummulate " \n\t" " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate " fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate " fmla v16.2d,v7.2d,v4.d[0] \n\t" // Accummulate " \n\t" " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate " fmla v19.2d,v7.2d,v4.d[1] \n\t" // Accummulate " \n\t" " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate " fmla v22.2d,v7.2d,v5.d[0] \n\t" // Accummulate " \n\t" " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate " fmla v25.2d,v7.2d,v5.d[1] \n\t" // Accummulate " \n\t" " fmla 
v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate " add x1, x1, #192 \n\t" " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate " \n\t" " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate " \n\t" " fmla v28.2d,v7.2d,v6.d[0] \n\t" // Accummulate " fmla v31.2d,v7.2d,v6.d[1] \n\t" // Accummulate " \n\t" //End it 4 " add x0, x0, #144 \n\t" " \n\t" LABEL(DCONSIDERKLEFT) " cmp x6,0 \n\t" // If k_left == 0, we are done. BEQ(DPOSTACCUM) // else, we enter the k_left loop. " \n\t" LABEL(DLOOPKLEFT) " \n\t" " ldr q0, [x0],#16 \n\t" " ldr q1, [x0],#16 \n\t" // Load a " ldr q2, [x0],#16 \n\t" " \n\t" " ldr q3, [x1],#16 \n\t" // Load b " ldr q4, [x1],#16 \n\t" " ldr q5, [x1],#16 \n\t" " ldr q6, [x1],#16 \n\t" " \n\t" " sub x6,x6,1 \n\t" " \n\t" " fmla v8.2d ,v0.2d,v3.d[0] \n\t" // Accummulate " fmla v9.2d ,v1.2d,v3.d[0] \n\t" // Accummulate " fmla v10.2d,v2.2d,v3.d[0] \n\t" // Accummulate " \n\t" " fmla v11.2d,v0.2d,v3.d[1] \n\t" // Accummulate " fmla v12.2d,v1.2d,v3.d[1] \n\t" // Accummulate " fmla v13.2d,v2.2d,v3.d[1] \n\t" // Accummulate " \n\t" " fmla v14.2d,v0.2d,v4.d[0] \n\t" // Accummulate " fmla v15.2d,v1.2d,v4.d[0] \n\t" // Accummulate " fmla v16.2d,v2.2d,v4.d[0] \n\t" // Accummulate " \n\t" " fmla v17.2d,v0.2d,v4.d[1] \n\t" // Accummulate " fmla v18.2d,v1.2d,v4.d[1] \n\t" // Accummulate " fmla v19.2d,v2.2d,v4.d[1] \n\t" // Accummulate " \n\t" " fmla v20.2d,v0.2d,v5.d[0] \n\t" // Accummulate " fmla v21.2d,v1.2d,v5.d[0] \n\t" // Accummulate " fmla v22.2d,v2.2d,v5.d[0] \n\t" // Accummulate " \n\t" " fmla v23.2d,v0.2d,v5.d[1] \n\t" // Accummulate " fmla v24.2d,v1.2d,v5.d[1] \n\t" // Accummulate " fmla v25.2d,v2.2d,v5.d[1] \n\t" // Accummulate " \n\t" " fmla v26.2d,v0.2d,v6.d[0] \n\t" // Accummulate " fmla v29.2d,v0.2d,v6.d[1] \n\t" // Accummulate " \n\t" " fmla v27.2d,v1.2d,v6.d[0] \n\t" // Accummulate " fmla v30.2d,v1.2d,v6.d[1] \n\t" // Accummulate " \n\t" " fmla v28.2d,v2.2d,v6.d[0] \n\t" // Accummulate " fmla v31.2d,v2.2d,v6.d[1] \n\t" 
// Accummulate " \n\t" " cmp x6,0 \n\t" // Iterate again. BNE(DLOOPKLEFT) // if i!=0. " \n\t" LABEL(DPOSTACCUM) " \n\t" " ldr x0,%[alpha] \n\t" // Alpha address " ldr x1,%[beta] \n\t" // Beta address " \n\t" " ld1r {v6.2d},[x0] \n\t" // Load alpha. " ld1r {v7.2d},[x1] \n\t" // Load beta " \n\t" " ldr x0,%[a_next] \n\t" // Next A address for later use. " ldr x1,%[b_next] \n\t" // Next B address for later use. " \n\t" LABEL(DCOLSTORED) // C is column-major. " \n\t" " dup v0.2d, xzr \n\t" " dup v1.2d, xzr \n\t" " dup v2.2d, xzr \n\t" " dup v3.2d, xzr \n\t" " dup v4.2d, xzr \n\t" " dup v5.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" BEQ(DBETAZEROCOLSTOREDS1) // Taking care of the beta==0 case. " \n\t" " ldr q0, [x2] \n\t" //Load column 0 of C " ldr q1, [x2, #16] \n\t" " ldr q2, [x2, #32] \n\t" " \n\t" " ldr q3, [x20] \n\t" //Load column 1 of C " ldr q4, [x20, #16] \n\t" " ldr q5, [x20, #32] \n\t" " \n\t" " fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta " fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta " fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta " fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" LABEL(DBETAZEROCOLSTOREDS1) " \n\t" " fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha " fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha " fmla v2.2d,v10.2d,v6.d[0] \n\t" // Scale by alpha " fmla v3.2d,v11.2d,v6.d[0] \n\t" // Scale by alpha " fmla v4.2d,v12.2d,v6.d[0] \n\t" // Scale by alpha " fmla v5.2d,v13.2d,v6.d[0] \n\t" // Scale by alpha " \n\t" " str q0, [x2] \n\t" //Store column 0 of C " str q1, [x2, #16] \n\t" " str q2, [x2, #32] \n\t" " \n\t" " str q3, [x20] \n\t" //Store column 1 of C " str q4, [x20, #16] \n\t" " str q5, [x20, #32] \n\t" " \n\t" " dup v8.2d, xzr \n\t" " dup v9.2d, xzr \n\t" " dup v10.2d, xzr \n\t" " dup v11.2d, xzr \n\t" " dup v12.2d, xzr \n\t" " dup v13.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" BEQ(DBETAZEROCOLSTOREDS2) // Taking care of 
the beta==0 case. " \n\t" " ldr q8, [x21] \n\t" //Load column 2 of C " ldr q9, [x21, #16] \n\t" " ldr q10, [x21, #32] \n\t" " \n\t" " ldr q11, [x22] \n\t" //Load column 3 of C " ldr q12, [x22, #16] \n\t" " ldr q13, [x22, #32] \n\t" " \n\t" " fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta " fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta " fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta " fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" LABEL(DBETAZEROCOLSTOREDS2) " \n\t" " fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha " fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha " fmla v10.2d,v16.2d,v6.d[0] \n\t" // Scale by alpha " fmla v11.2d,v17.2d,v6.d[0] \n\t" // Scale by alpha " fmla v12.2d,v18.2d,v6.d[0] \n\t" // Scale by alpha " fmla v13.2d,v19.2d,v6.d[0] \n\t" // Scale by alpha " \n\t" " str q8, [x21] \n\t" //Store column 2 of C " str q9, [x21, #16] \n\t" " str q10, [x21, #32] \n\t" " \n\t" " str q11, [x22] \n\t" //Store column 3 of C " str q12, [x22, #16] \n\t" " str q13, [x22, #32] \n\t" " \n\t" " dup v0.2d, xzr \n\t" " dup v1.2d, xzr \n\t" " dup v2.2d, xzr \n\t" " dup v3.2d, xzr \n\t" " dup v4.2d, xzr \n\t" " dup v5.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" BEQ(DBETAZEROCOLSTOREDS3) // Taking care of the beta==0 case. 
" \n\t" " ldr q0, [x23] \n\t" //Load column 4 of C " ldr q1, [x23, #16] \n\t" " ldr q2, [x23, #32] \n\t" " \n\t" " ldr q3, [x24] \n\t" //Load column 5 of C " ldr q4, [x24, #16] \n\t" " ldr q5, [x24, #32] \n\t" " \n\t" " fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta " fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta " fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta " fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta " \n\t" LABEL(DBETAZEROCOLSTOREDS3) " \n\t" " fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha " fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha " fmla v2.2d,v22.2d,v6.d[0] \n\t" // Scale by alpha " fmla v3.2d,v23.2d,v6.d[0] \n\t" // Scale by alpha " fmla v4.2d,v24.2d,v6.d[0] \n\t" // Scale by alpha " fmla v5.2d,v25.2d,v6.d[0] \n\t" // Scale by alpha " \n\t" " str q0, [x23] \n\t" //Store column 4 of C " str q1, [x23, #16] \n\t" " str q2, [x23, #32] \n\t" " \n\t" " str q3, [x24] \n\t" //Store column 5 of C " str q4, [x24, #16] \n\t" " str q5, [x24, #32] \n\t" " \n\t" " dup v8.2d, xzr \n\t" " dup v9.2d, xzr \n\t" " dup v10.2d, xzr \n\t" " dup v11.2d, xzr \n\t" " dup v12.2d, xzr \n\t" " dup v13.2d, xzr \n\t" " \n\t" " fcmp d7,#0.0 \n\t" BEQ(DBETAZEROCOLSTOREDS4) // Taking care of the beta==0 case. 
" \n\t" " ldr q8, [x25] \n\t" //Load column 6 of C " ldr q9, [x25, #16] \n\t" " ldr q10, [x25, #32] \n\t" " \n\t" " ldr q11, [x26] \n\t" //Load column 7 of C " ldr q12, [x26, #16] \n\t" " ldr q13, [x26, #32] \n\t" " \n\t" " fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta " fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta " fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta " fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta " \n\t" LABEL(DBETAZEROCOLSTOREDS4) " \n\t" " prfm pldl2keep,[x0] \n\t" " prfm pldl2keep,[x1] \n\t" " \n\t" " fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha " fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha " fmla v10.2d,v28.2d,v6.d[0] \n\t" // Scale by alpha " fmla v11.2d,v29.2d,v6.d[0] \n\t" // Scale by alpha " fmla v12.2d,v30.2d,v6.d[0] \n\t" // Scale by alpha " fmla v13.2d,v31.2d,v6.d[0] \n\t" // Scale by alpha " \n\t" " str q8, [x25] \n\t" //Store column 6 of C " str q9, [x25, #16] \n\t" " str q10, [x25, #32] \n\t" " \n\t" " str q11, [x26] \n\t" //Store column 7 of C " str q12, [x26, #16] \n\t" " str q13, [x26, #32] \n\t" " \n\t" // BRANCH(DEND) // LABEL(DEND) // Done! 
" \n\t" :// output operands (none) :// input operands [aaddr] "m" (a), // 0 [baddr] "m" (b), // 1 [caddr] "m" (c), // 2 [k_iter] "m" (k_iter), // 3 [k_left] "m" (k_left), // 4 [alpha] "m" (alpha), // 5 [beta] "m" (beta), // 6 [rs_c] "m" (rs_c), // 6 [cs_c] "m" (cs_c), // 7 [a_next] "m" (a_next), // 8 [b_next] "m" (b_next) // 9 :// Register clobber list "x0","x1","x2", "x5","x6","x10", "x16","x17","x20", "x21","x22","x23", "x24","x25","x26","x27", "v0","v1","v2", "v3","v4","v5", "v6","v7","v8", "v9","v10","v11", "v12","v13","v14", "v15","v16","v17","v18","v19", "v20","v21","v22","v23", "v24","v25","v26","v27", "v28","v29","v30","v31" ); GEMM_UKR_FLUSH_CT( d ); } #if 0 void bli_cgemm_armv8a_opt_4x4 ( dim_t m, dim_t n, dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { } void bli_zgemm_armv8a_opt_4x4 ( dim_t m, dim_t n, dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { } #endif cython-blis-0.9.1/blis/_src/kernels/armv8a/3/old/000077500000000000000000000000001427272030600214175ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/armv8a/3/old/bli_gemm_armv8a_asm_d4x4.c000066400000000000000000000260331427272030600263230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2021, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "blis.h"
#include "assert.h"

// Label locality & misc.
#include "armv8a_asm_utils.h"

// Nanokernel operations.
#include "armv8a_asm_d2x2.h"

// One k-step of the 4x4 double-precision microkernel, expressed as four
// invocations of DGEMM_2X2_NANOKERNEL (defined in armv8a_asm_d2x2.h, not
// visible here; presumably a 2x2 rank-1 update -- confirm against that header).
//
// All arguments are SIMD register *numbers* that get stringified into the
// assembly text:
//   C00,C10  accumulators paired with (A0,B0) and (A1,B0) as first operand.
//   C01,C11  accumulators paired with (A0,B0) and (A1,B0) as second operand.
//   C02,C12  accumulators paired with (A0,B1) and (A1,B1) as first operand.
//   C03,C13  accumulators paired with (A0,B1) and (A1,B1) as second operand.
//   A0,A1    the two vectors holding the current 4-element slice of A.
//   B0,B1    the two vectors holding the current 4-element slice of B.
// The kernel below stores the pairs (C00,C10), (C01,C11), (C02,C12), (C03,C13)
// as the four consecutive columns of the C tile.
#define DGEMM_4X4_MKER_LOOP_PLAIN(C00,C10,C01,C11,C02,C12,C03,C13,A0,A1,B0,B1) \
  DGEMM_2X2_NANOKERNEL(C00,C01,A0,B0) \
  DGEMM_2X2_NANOKERNEL(C10,C11,A1,B0) \
  DGEMM_2X2_NANOKERNEL(C02,C03,A0,B1) \
  DGEMM_2X2_NANOKERNEL(C12,C13,A1,B1)

// For contiguous storage of C.
#define DLOADC_2V_C_FWD(C0,C1,CADDR,CSHIFT,LDC) \ DLOAD2V(C0,C1,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#LDC" \n\t" #define DSTOREC_2V_C_FWD(C0,C1,CADDR,CSHIFT,LDC) \ DSTORE2V(C0,C1,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#LDC" \n\t" void bli_dgemm_armv8a_asm_4x4 ( dim_t k0, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_mker = k0 / 6; uint64_t k_left = k0 % 6; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; __asm__ volatile ( " ldr x0, %[a] \n\t" " ldr x1, %[b] \n\t" " mov x2, #4 \n\t" // Column-skip of A. " mov x3, #4 \n\t" // Row-skip of B. " \n\t" " ldr x5, %[c] \n\t" " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. " \n\t" " mov x8, #8 \n\t" // Multiply some address skips by sizeof(double). " madd x2, x8, x2, xzr \n\t" // cs_a " madd x3, x8, x3, xzr \n\t" // rs_b " madd x7, x8, x7, xzr \n\t" // cs_c " \n\t" " ldr x4, %[k_mker] \n\t" // Number of loops. " ldr x8, %[k_left] \n\t" " \n\t" // Storage scheme: // V[ 0:7 ] <- C // V[ 8:19] <- B // V[20:31] <- A // Under this scheme, the following is defined: #define DGEMM_4X4_MKER_LOOP_PLAIN_LOC(A0,A1,B0,B1) \ DGEMM_4X4_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,A0,A1,B0,B1) // TODO: Prefetch C. // Load from memory. LABEL(LOAD_ABC) " \n\t" // No-microkernel early return is a must " cmp x4, #0 \n\t" // to avoid out-of-boundary read. 
BEQ(CLEAR_CCOLS) " \n\t" " ldr q20, [x0, #16*0] \n\t" " ldr q21, [x0, #16*1] \n\t" " ldr q22, [x0, #16*2] \n\t" " ldr q23, [x0, #16*3] \n\t" " ldr q24, [x0, #16*4] \n\t" " ldr q25, [x0, #16*5] \n\t" " add x0, x0, x2 \n\t" " add x0, x0, x2 \n\t" " add x0, x0, x2 \n\t" " ldr q26, [x0, #16*0] \n\t" " ldr q27, [x0, #16*1] \n\t" " ldr q28, [x0, #16*2] \n\t" " ldr q29, [x0, #16*3] \n\t" " ldr q30, [x0, #16*4] \n\t" " ldr q31, [x0, #16*5] \n\t" " add x0, x0, x2 \n\t" " add x0, x0, x2 \n\t" " add x0, x0, x2 \n\t" " \n\t" " ldr q8, [x1, #16*0] \n\t" " ldr q9, [x1, #16*1] \n\t" " ldr q10, [x1, #16*2] \n\t" " ldr q11, [x1, #16*3] \n\t" " ldr q12, [x1, #16*4] \n\t" " ldr q13, [x1, #16*5] \n\t" " add x1, x1, x3 \n\t" " add x1, x1, x3 \n\t" " add x1, x1, x3 \n\t" " ldr q14, [x1, #16*0] \n\t" " ldr q15, [x1, #16*1] \n\t" " ldr q16, [x1, #16*2] \n\t" " ldr q17, [x1, #16*3] \n\t" " ldr q18, [x1, #16*4] \n\t" " ldr q19, [x1, #16*5] \n\t" " add x1, x1, x3 \n\t" " add x1, x1, x3 \n\t" " add x1, x1, x3 \n\t" " \n\t" LABEL(CLEAR_CCOLS) CLEAR8V(0,1,2,3,4,5,6,7) // No-microkernel early return, once again. BEQ(K_LEFT_LOOP) // // Microkernel is defined here as: #define DGEMM_4X4_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,B0,B1) \ DGEMM_4X4_MKER_LOOP_PLAIN_LOC(A0,A1,B0,B1) \ "ldr q"#A0", [x0, #16*0] \n\t" \ "ldr q"#A1", [x0, #16*1] \n\t" \ "add x0, x0, x2 \n\t" \ "ldr q"#B0", [x1, #16*0] \n\t" \ "ldr q"#B1", [x1, #16*1] \n\t" \ "add x1, x1, x3 \n\t" // Start microkernel loop. LABEL(K_MKER_LOOP) " \n\t" // Decrease counter before final replica. " subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. BEQ(FIN_MKER_LOOP) DGEMM_4X4_MKER_LOOP_PLAIN_LOC_FWD(20,21,8,9) DGEMM_4X4_MKER_LOOP_PLAIN_LOC_FWD(22,23,10,11) DGEMM_4X4_MKER_LOOP_PLAIN_LOC_FWD(24,25,12,13) DGEMM_4X4_MKER_LOOP_PLAIN_LOC_FWD(26,27,14,15) DGEMM_4X4_MKER_LOOP_PLAIN_LOC_FWD(28,29,16,17) DGEMM_4X4_MKER_LOOP_PLAIN_LOC_FWD(30,31,18,19) BRANCH(K_MKER_LOOP) // // Final microkernel loop. 
LABEL(FIN_MKER_LOOP) DGEMM_4X4_MKER_LOOP_PLAIN_LOC(20,21,8,9) DGEMM_4X4_MKER_LOOP_PLAIN_LOC(22,23,10,11) DGEMM_4X4_MKER_LOOP_PLAIN_LOC(24,25,12,13) DGEMM_4X4_MKER_LOOP_PLAIN_LOC(26,27,14,15) DGEMM_4X4_MKER_LOOP_PLAIN_LOC(28,29,16,17) DGEMM_4X4_MKER_LOOP_PLAIN_LOC(30,31,18,19) // // Loops left behind microkernels. LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of exec. BEQ(WRITE_MEM_PREP) " ldr q20, [x0, #16*0] \n\t" " ldr q21, [x0, #16*1] \n\t" " add x0, x0, x2 \n\t" " ldr q8, [x1, #16*0] \n\t" " ldr q9, [x1, #16*1] \n\t" " add x1, x1, x3 \n\t" " sub x8, x8, #1 \n\t" DGEMM_4X4_MKER_LOOP_PLAIN_LOC(20,21,8,9) BRANCH(K_LEFT_LOOP) // // Scale and write to memory. LABEL(WRITE_MEM_PREP) " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" " ldr d8, [x4] \n\t" // Load alpha & beta (value). " ldr d9, [x8] \n\t" " \n\t" LABEL(PREFETCH_ABNEXT) " ldr x0, %[a_next] \n\t" " ldr x1, %[b_next] \n\t" " prfm PLDL1STRM, [x0, 64*0] \n\t" // Do not know cache line size, " prfm PLDL1STRM, [x0, 64*1] \n\t" // issue some number of prfm instructions " prfm PLDL1STRM, [x0, 64*2] \n\t" // to try to activate hardware prefetcher. " prfm PLDL1STRM, [x1, 64*0] \n\t" " prfm PLDL1STRM, [x1, 64*1] \n\t" " prfm PLDL1STRM, [x1, 64*3] \n\t" " \n\t" " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. " cmp x6, #1 \n\t" // Check for generic storage. BNE(WRITE_MEM_G) // // Contiguous C-storage. LABEL(WRITE_MEM_C) DLOADC_2V_C_FWD(10,11,x9,0,x7) DLOADC_2V_C_FWD(12,13,x9,0,x7) DLOADC_2V_C_FWD(14,15,x9,0,x7) DLOADC_2V_C_FWD(16,17,x9,0,x7) DSCALE8V(10,11,12,13,14,15,16,17,9,0) DSCALEA8V(10,11,12,13,14,15,16,17,0,1,2,3,4,5,6,7,8,0) DSTOREC_2V_C_FWD(10,11,x5,0,x7) DSTOREC_2V_C_FWD(12,13,x5,0,x7) DSTOREC_2V_C_FWD(14,15,x5,0,x7) DSTOREC_2V_C_FWD(16,17,x5,0,x7) BRANCH(END_WRITE_MEM) // // Generic-strided C-storage. LABEL(WRITE_MEM_G) // TODO: Implement. 
LABEL(END_WRITE_MEM) : : [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta), [a_next] "m" (a_next), [b_next] "m" (b_next) : "x0","x1","x2","x3","x4","x5","x6","x7","x8", "x9","x16", "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11","v12","v13","v14","v15", "v16","v17","v18","v19", "v20","v21","v22","v23", "v24","v25","v26","v27", "v28","v29","v30","v31" ); } cython-blis-0.9.1/blis/_src/kernels/armv8a/3/old/bli_gemm_armv8a_asm_d6x8r.c000066400000000000000000000346021427272030600265140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2021, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "blis.h"

// Label locality & misc.
#include "armv8a_asm_utils.h"

// Nanokernel operations.
#include "armv8a_asm_d2x2.h"

/* Order of row-major DGEMM_6x8's execution in 2x2 blocks:
 *
 * +---+ +---+ +---+ +---+
 * | 0 | | 1 | | 6 | | 7 |
 * +---+ +---+ +---+ +---+
 * +---+ +---+ +---+ +---+
 * | 2 | | 3 | | 8 | | 9 |
 * +---+ +---+ +---+ +---+
 * +---+ +---+ +---+ +---+
 * | 4 | | 5 | | 10| | 11|
 * +---+ +---+ +---+ +---+
 *
 */
// One k-step of the row-major 6x8 double-precision microkernel: twelve
// DGEMM_2X2_NANOKERNEL invocations issued in the block order drawn above,
// with optional loads of the next operands interleaved between them.
// DGEMM_2X2_NANOKERNEL comes from armv8a_asm_d2x2.h and is not visible here;
// presumably a 2x2 rank-1 update -- confirm against that header.
//
// All register arguments are SIMD register *numbers*:
//   Cij           accumulator for row i (0-5), column pair j (0-3).
//   A0,A1,A2      vectors holding the current slice of A (row pairs 0-1,
//                 2-3, 4-5, following the Cij arguments they are paired with).
//   B0..B3        vectors holding the current slice of B (column pairs 0-3).
//   AADDR,ASHIFT  base register and byte offset used to reload A0 (and A1 at
//                 ASHIFT+16) once their last use in this step has been issued.
//   BADDR,BSHIFT  base register and byte offset used to reload B0,B1 once
//                 the left half of the tile (blocks 0-5) has been issued.
//   LOADNEXT      `load` or `noload`; token-pasted onto DGEMM_LOAD1V_ /
//                 DGEMM_LOAD2V_ to select whether those reloads are emitted.
// A2, B2 and B3 are never reloaded by this macro; the caller must refresh them.
#define DGEMM_6X8_MKER_LOOP_PLAIN(C00,C01,C02,C03,C10,C11,C12,C13,C20,C21,C22,C23,C30,C31,C32,C33,C40,C41,C42,C43,C50,C51,C52,C53,A0,A1,A2,B0,B1,B2,B3,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \
  DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \
  DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \
  DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \
  DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \
  DGEMM_2X2_NANOKERNEL(C40,C50,B0,A2) \
  DGEMM_2X2_NANOKERNEL(C41,C51,B1,A2) \
  DGEMM_LOAD2V_ ##LOADNEXT (B0,B1,BADDR,BSHIFT) \
  DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \
  DGEMM_2X2_NANOKERNEL(C03,C13,B3,A0) \
  DGEMM_LOAD1V_ ##LOADNEXT (A0,AADDR,ASHIFT) \
  DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \
  DGEMM_2X2_NANOKERNEL(C23,C33,B3,A1) \
  DGEMM_LOAD1V_ ##LOADNEXT (A1,AADDR,ASHIFT+16) \
  DGEMM_2X2_NANOKERNEL(C42,C52,B2,A2) \
  DGEMM_2X2_NANOKERNEL(C43,C53,B3,A2)

// Interleaving load or not.
#define DGEMM_LOAD1V_noload(V1,ADDR,IMM) #define DGEMM_LOAD1V_load(V1,ADDR,IMM) \ " ldr q"#V1", ["#ADDR", #"#IMM"] \n\t" #define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM) #define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \ DGEMM_LOAD1V_load(V1,ADDR,IMM) \ DGEMM_LOAD1V_load(V2,ADDR,IMM+16) // For contiguous storage of C. #define DLOADC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \ DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#RSC" \n\t" #define DSTOREC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \ DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#RSC" \n\t" #define DPRFMC_FWD(CADDR,RSC) \ " prfm PLDL1KEEP, ["#CADDR"] \n\t" \ " add "#CADDR", "#CADDR", "#RSC" \n\t" // For scattered storage of C. #define DLOADC_GATHER_4V_R_FWD(C0,C1,C2,C3,CADDR,CELEM,CSC,RSC) \ " mov "#CELEM", "#CADDR" \n\t" \ DLOAD1V_GATHER_ELMFWD(C0,CELEM,CSC) \ DLOAD1V_GATHER_ELMFWD(C1,CELEM,CSC) \ DLOAD1V_GATHER_ELMFWD(C2,CELEM,CSC) \ DLOAD1V_GATHER_ELMFWD(C3,CELEM,CSC) \ " add "#CADDR", "#CADDR", "#RSC" \n\t" #define DSTOREC_SCATTER_4V_R_FWD(C0,C1,C2,C3,CADDR,CELEM,CSC,RSC) \ " mov "#CELEM", "#CADDR" \n\t" \ DSTORE1V_SCATTER_ELMFWD(C0,CELEM,CSC) \ DSTORE1V_SCATTER_ELMFWD(C1,CELEM,CSC) \ DSTORE1V_SCATTER_ELMFWD(C2,CELEM,CSC) \ DSTORE1V_SCATTER_ELMFWD(C3,CELEM,CSC) \ " add "#CADDR", "#CADDR", "#RSC" \n\t" void bli_dgemm_armv8a_asm_6x8r ( dim_t k0, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_mker = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; __asm__ volatile ( " ldr x0, %[a] \n\t" " ldr x1, %[b] \n\t" " mov x2, #6 \n\t" // Column-skip of A. " mov x3, #8 \n\t" // Row-skip of B. 
" \n\t" " ldr x5, %[c] \n\t" " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. " \n\t" " \n\t" // Multiply some address skips by sizeof(double). " lsl x2, x2, #3 \n\t" // cs_a " lsl x3, x3, #3 \n\t" // rs_b " lsl x6, x6, #3 \n\t" // rs_c " lsl x7, x7, #3 \n\t" // cs_c " \n\t" " mov x9, x5 \n\t" " cmp x7, #8 \n\t" // Do not prefetch C for generic strided. BNE(C_PREFETCH_END) DPRFMC_FWD(x9,x6) DPRFMC_FWD(x9,x6) DPRFMC_FWD(x9,x6) DPRFMC_FWD(x9,x6) DPRFMC_FWD(x9,x6) DPRFMC_FWD(x9,x6) LABEL(C_PREFETCH_END) " \n\t" " ldr x4, %[k_mker] \n\t" // Number of loops. " ldr x8, %[k_left] \n\t" " \n\t" // Storage scheme: // V[ 0:23] <- C // V[24:27] <- A // V[28:31] <- B // Under this scheme, the following is defined: #define DGEMM_6X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \ DGEMM_6X8_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,B0,B1,B2,B3,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) // Load from memory. LABEL(LOAD_ABC) " \n\t" // No-microkernel early return is a must " cmp x4, #0 \n\t" // to avoid out-of-boundary read. BEQ(CLEAR_CCOLS) " \n\t" " ldr q24, [x0, #16*0] \n\t" // Load A. " ldr q25, [x0, #16*1] \n\t" " ldr q26, [x0, #16*2] \n\t" " add x0, x0, x2 \n\t" " ldr q27, [x0, #16*0] \n\t" " \n\t" " ldr q28, [x1, #16*0] \n\t" // Load B. " ldr q29, [x1, #16*1] \n\t" " ldr q30, [x1, #16*2] \n\t" " ldr q31, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" LABEL(CLEAR_CCOLS) CLEAR8V(0,1,2,3,4,5,6,7) CLEAR8V(8,9,10,11,12,13,14,15) CLEAR8V(16,17,18,19,20,21,22,23) // No-microkernel early return, once again. BEQ(K_LEFT_LOOP) // // Microkernel is defined here as: #define DGEMM_6X8_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,B3) \ DGEMM_6X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,x0,1*16,x1,0,load) \ "add x0, x0, x2 \n\t" \ "ldr q"#A2", [x0, #16*0] \n\t" \ "ldr q"#B2", [x1, #16*2] \n\t" \ "ldr q"#B3", [x1, #16*3] \n\t" \ "add x1, x1, x3 \n\t" // Start microkernel loop. 
LABEL(K_MKER_LOOP) DGEMM_6X8_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,28,29,30,31) DGEMM_6X8_MKER_LOOP_PLAIN_LOC_FWD(27,24,25,28,29,30,31) " \n\t" // Decrease counter before final replica. " subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. BEQ(FIN_MKER_LOOP) DGEMM_6X8_MKER_LOOP_PLAIN_LOC_FWD(26,27,24,28,29,30,31) DGEMM_6X8_MKER_LOOP_PLAIN_LOC_FWD(25,26,27,28,29,30,31) BRANCH(K_MKER_LOOP) // // Final microkernel loop. LABEL(FIN_MKER_LOOP) DGEMM_6X8_MKER_LOOP_PLAIN_LOC(26,27,24,28,29,30,31,x0,1*16,x1,0,load) " add x0, x0, x2 \n\t" " ldr q30, [x1, #16*2] \n\t" " ldr q31, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" DGEMM_6X8_MKER_LOOP_PLAIN_LOC(25,26,27,28,29,30,31,xzr,-1,xzr,-1,noload) // // Loops left behind microkernels. LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of exec. BEQ(WRITE_MEM_PREP) " ldr q24, [x0, #16*0] \n\t" // Load A col. " ldr q25, [x0, #16*1] \n\t" " ldr q26, [x0, #16*2] \n\t" " add x0, x0, x2 \n\t" " ldr q28, [x1, #16*0] \n\t" // Load B row. " ldr q29, [x1, #16*1] \n\t" " ldr q30, [x1, #16*2] \n\t" " ldr q31, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" " sub x8, x8, #1 \n\t" DGEMM_6X8_MKER_LOOP_PLAIN_LOC(24,25,26,28,29,30,31,xzr,-1,xzr,-1,noload) BRANCH(K_LEFT_LOOP) // // Scale and write to memory. LABEL(WRITE_MEM_PREP) " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" " ld1r {v24.2d}, [x4] \n\t" // Load alpha & beta. " ld1r {v25.2d}, [x8] \n\t" " \n\t" LABEL(PREFETCH_ABNEXT) " ldr x0, %[a_next] \n\t" " ldr x1, %[b_next] \n\t" " prfm PLDL1STRM, [x0, 64*0] \n\t" // Do not know cache line size, " prfm PLDL1STRM, [x0, 64*1] \n\t" // issue some number of prfm instructions " prfm PLDL1STRM, [x0, 64*2] \n\t" // to try to activate hardware prefetcher. 
" prfm PLDL1STRM, [x1, 64*0] \n\t" " prfm PLDL1STRM, [x1, 64*1] \n\t" " prfm PLDL1STRM, [x1, 64*3] \n\t" " \n\t" " fmov d26, #1.0 \n\t" " fcmp d24, d26 \n\t" BEQ(UNIT_ALPHA) DSCALE8V(0,1,2,3,4,5,6,7,24,0) DSCALE8V(8,9,10,11,12,13,14,15,24,0) DSCALE8V(16,17,18,19,20,21,22,23,24,0) LABEL(UNIT_ALPHA) " \n\t" " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. " cmp x7, #8 \n\t" // Check for generic storage. BNE(WRITE_MEM_G) // // Contiguous C-storage. LABEL(WRITE_MEM_R) " fcmp d25, #0.0 \n\t" // Sets conditional flag whether *beta == 0. " \n\t" // This conditional flag will be used " \n\t" // multiple times for skipping load. // Row 0: BEQ(ZERO_BETA_R_0) DLOADC_4V_R_FWD(26,27,28,29,x9,0,x6) DSCALEA4V(0,1,2,3,26,27,28,29,25,0) LABEL(ZERO_BETA_R_0) DSTOREC_4V_R_FWD(0,1,2,3,x5,0,x6) // Row 1 & 2: BEQ(ZERO_BETA_R_1_2) DLOADC_4V_R_FWD(26,27,28,29,x9,0,x6) DLOADC_4V_R_FWD(0,1,2,3,x9,0,x6) DSCALEA8V(4,5,6,7,8,9,10,11,26,27,28,29,0,1,2,3,25,0) LABEL(ZERO_BETA_R_1_2) DSTOREC_4V_R_FWD(4,5,6,7,x5,0,x6) DSTOREC_4V_R_FWD(8,9,10,11,x5,0,x6) // Row 3 & 4 & 5: BEQ(ZERO_BETA_R_3_4_5) DLOADC_4V_R_FWD(0,1,2,3,x9,0,x6) DLOADC_4V_R_FWD(4,5,6,7,x9,0,x6) DLOADC_4V_R_FWD(8,9,10,11,x9,0,x6) DSCALEA8V(12,13,14,15,16,17,18,19,0,1,2,3,4,5,6,7,25,0) DSCALEA4V(20,21,22,23,8,9,10,11,25,0) LABEL(ZERO_BETA_R_3_4_5) DSTOREC_4V_R_FWD(12,13,14,15,x5,0,x6) DSTOREC_4V_R_FWD(16,17,18,19,x5,0,x6) DSTOREC_4V_R_FWD(20,21,22,23,x5,0,x6) BRANCH(END_WRITE_MEM) // // Generic-strided C-storage. LABEL(WRITE_MEM_G) " fcmp d25, #0.0 \n\t" // Sets conditional flag whether *beta == 0. 
" \n\t" // Row 0: BEQ(ZERO_BETA_G_0) DLOADC_GATHER_4V_R_FWD(26,27,28,29,x9,x0,x7,x6) DSCALEA4V(0,1,2,3,26,27,28,29,25,0) LABEL(ZERO_BETA_G_0) DSTOREC_SCATTER_4V_R_FWD(0,1,2,3,x5,x1,x7,x6) // Row 1 & 2: BEQ(ZERO_BETA_G_1_2) DLOADC_GATHER_4V_R_FWD(26,27,28,29,x9,x0,x7,x6) DLOADC_GATHER_4V_R_FWD(0,1,2,3,x9,x0,x7,x6) DSCALEA8V(4,5,6,7,8,9,10,11,26,27,28,29,0,1,2,3,25,0) LABEL(ZERO_BETA_G_1_2) DSTOREC_SCATTER_4V_R_FWD(4,5,6,7,x5,x1,x7,x6) DSTOREC_SCATTER_4V_R_FWD(8,9,10,11,x5,x1,x7,x6) // Row 3 & 4 & 5: BEQ(ZERO_BETA_G_3_4_5) DLOADC_GATHER_4V_R_FWD(0,1,2,3,x9,x0,x7,x6) DLOADC_GATHER_4V_R_FWD(4,5,6,7,x9,x0,x7,x6) DLOADC_GATHER_4V_R_FWD(8,9,10,11,x9,x0,x7,x6) DSCALEA8V(12,13,14,15,16,17,18,19,0,1,2,3,4,5,6,7,25,0) DSCALEA4V(20,21,22,23,8,9,10,11,25,0) LABEL(ZERO_BETA_G_3_4_5) DSTOREC_SCATTER_4V_R_FWD(12,13,14,15,x5,x1,x7,x6) DSTOREC_SCATTER_4V_R_FWD(16,17,18,19,x5,x1,x7,x6) DSTOREC_SCATTER_4V_R_FWD(20,21,22,23,x5,x1,x7,x6) LABEL(END_WRITE_MEM) : : [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta), [a_next] "m" (a_next), [b_next] "m" (b_next) : "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9", "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11","v12","v13","v14","v15", "v16","v17","v18","v19", "v20","v21","v22","v23", "v24","v25","v26","v27", "v28","v29","v30","v31" ); } cython-blis-0.9.1/blis/_src/kernels/armv8a/3/old/bli_gemm_armv8a_asm_d8x4.c000066400000000000000000000274231427272030600263330ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2021, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "assert.h" // Label locality & misc. #include "armv8a_asm_utils.h" // Nanokernel operations. 
#include "armv8a_asm_d2x2.h"

/* Order of DGEMM_8x4's execution in 2x2 blocks:
 *
 * +---+ +---+
 * | 0 | | 2 |
 * +---+ +---+
 * +---+ +---+
 * | 1 | | 3 |
 * +---+ +---+
 * +---+ +---+
 * | 4 | | 6 |
 * +---+ +---+
 * +---+ +---+
 * | 5 | | 7 |
 * +---+ +---+
 *
 */
// Emits one k-step of the 8x4 update as eight DGEMM_2X2_NANOKERNEL
// invocations in the order drawn above. LOADNEXT is pasted onto
// DGEMM_LOAD1V_ / DGEMM_LOAD2V_ to select the `load` or `noload` variant:
// A0/A1 are reloaded from AADDR (ASHIFT, ASHIFT+16) and B0 from BADDR
// (BSHIFT) once their last consumer has been issued. A2, A3 and B1 are not
// reloaded here.
#define DGEMM_8X4_MKER_LOOP_PLAIN(C00,C10,C20,C30,C01,C11,C21,C31,C02,C12,C22,C32,C03,C13,C23,C33,A0,A1,A2,A3,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \
  DGEMM_2X2_NANOKERNEL(C00,C01,A0,B0) \
  DGEMM_2X2_NANOKERNEL(C10,C11,A1,B0) \
  DGEMM_2X2_NANOKERNEL(C02,C03,A0,B1) \
  DGEMM_2X2_NANOKERNEL(C12,C13,A1,B1) \
  DGEMM_LOAD2V_ ##LOADNEXT (A0,A1,AADDR,ASHIFT) \
  DGEMM_2X2_NANOKERNEL(C20,C21,A2,B0) \
  DGEMM_2X2_NANOKERNEL(C30,C31,A3,B0) \
  DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \
  DGEMM_2X2_NANOKERNEL(C22,C23,A2,B1) \
  DGEMM_2X2_NANOKERNEL(C32,C33,A3,B1)

// Interleaving load or not.
// The `noload` variants expand to nothing; the `load` variants emit one
// `ldr q<V>, [<ADDR>, #<IMM>]` per vector (the 2V form loads V2 from IMM+16).
#define DGEMM_LOAD1V_noload(V1,ADDR,IMM)
#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t"

#define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM)
#define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \
  DGEMM_LOAD1V_load(V1,ADDR,IMM) \
  DGEMM_LOAD1V_load(V2,ADDR,IMM+16)

// For contiguous storage of C.
// Load/store four vectors through DLOAD4V/DSTORE4V (armv8a_asm_utils.h), then
// advance CADDR by LDC so the next use addresses the following column of C.
#define DLOADC_4V_C_FWD(C0,C1,C2,C3,CADDR,CSHIFT,LDC) \
  DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#LDC" \n\t"
#define DSTOREC_4V_C_FWD(C0,C1,C2,C3,CADDR,CSHIFT,LDC) \
  DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#LDC" \n\t"

// Double-precision 8x4 GEMM microkernel for Armv8-A (work in progress):
//   C := beta * C + alpha * A * B
//
//   k0            Inner dimension. It is split into k0 / 6 six-way unrolled
//                 microkernel iterations plus k0 % 6 single-step iterations.
//   alpha, beta   Pointers to the scalars.
//   a, b          Operand panels, walked with a fixed column-skip of 8
//                 doubles (A) and row-skip of 4 doubles (B).
//   c             Output block; only rs_c0 == 1 (column storage) is
//                 implemented.
//   data          Source of the next A/B panel addresses, used only for
//                 prefetch hints.
//   cntx          Not referenced by this kernel.
//
// Known limitations, all kept from the original implementation:
//  - rs_c0 != 1 is rejected only by assert(). With NDEBUG defined, the asm
//    branches to the empty WRITE_MEM_G path and C is left unmodified.
//  - C is always loaded and multiplied by beta, even when *beta == 0.
void bli_dgemm_armv8a_asm_8x4
     (
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
  void* a_next = bli_auxinfo_next_a( data );
  void* b_next = bli_auxinfo_next_b( data );

  // This kernel is a WIP.
  // I have no generic stride support at this moment.
  assert( rs_c0 == 1 );
  // if ( rs_c0 != 1 ) return ;

  // Typecast local copies of integers in case dim_t and inc_t are a
  // different size than is expected by load instructions.
  uint64_t k_mker = k0 / 6;
  uint64_t k_left = k0 % 6;
  uint64_t rs_c   = rs_c0;
  uint64_t cs_c   = cs_c0;

  __asm__ volatile
  (
" ldr x0, %[a] \n\t"
" ldr x1, %[b] \n\t"
" mov x2, #8 \n\t" // Column-skip of A.
" mov x3, #4 \n\t" // Row-skip of B.
" \n\t"
" ldr x5, %[c] \n\t"
" ldr x6, %[rs_c] \n\t" // Row-skip of C.
" ldr x7, %[cs_c] \n\t" // Column-skip of C.
" \n\t"
" mov x8, #8 \n\t" // Multiply some address skips by sizeof(double).
" madd x2, x8, x2, xzr \n\t" // cs_a
" madd x3, x8, x3, xzr \n\t" // rs_b
" madd x7, x8, x7, xzr \n\t" // cs_c
" \n\t"
" ldr x4, %[k_mker] \n\t" // Number of loops.
" ldr x8, %[k_left] \n\t"
" \n\t"
// Storage scheme:
//  V[ 0:15] <- C
//  V[16:21] <- B
//  V[22:29] <- A
// Under this scheme, the following is defined:
#define DGEMM_8X4_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT) \
  DGEMM_8X4_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,A0,A1,A2,A3,B0,B1,AADDR,ASHIFT,BADDR,BSHIFT,LOADNEXT)
// TODO: Prefetch C.
// Load from memory.
LABEL(LOAD_ABC)
" \n\t" // No-microkernel early return is a must
" cmp x4, #0 \n\t" //  to avoid out-of-boundary read.
BEQ(CLEAR_CCOLS)
" \n\t"
" ldr q22, [x0, #16*0] \n\t"
" ldr q23, [x0, #16*1] \n\t"
" ldr q24, [x0, #16*2] \n\t"
" ldr q25, [x0, #16*3] \n\t"
" add x0, x0, x2 \n\t"
" ldr q26, [x0, #16*0] \n\t"
" ldr q27, [x0, #16*1] \n\t"
" ldr q28, [x0, #16*2] \n\t"
" ldr q29, [x0, #16*3] \n\t"
" add x0, x0, x2 \n\t"
" \n\t"
" ldr q16, [x1, #16*0] \n\t"
" ldr q17, [x1, #16*1] \n\t"
" add x1, x1, x3 \n\t"
" ldr q18, [x1, #16*0] \n\t"
" ldr q19, [x1, #16*1] \n\t"
" add x1, x1, x3 \n\t"
" ldr q20, [x1, #16*0] \n\t"
" ldr q21, [x1, #16*1] \n\t"
" add x1, x1, x3 \n\t"
" \n\t"
LABEL(CLEAR_CCOLS)
CLEAR8V(0,1,2,3,4,5,6,7)
CLEAR8V(8,9,10,11,12,13,14,15)
// No-microkernel early return, once again.
// (Reuses the flags of the `cmp x4, #0` above.)
BEQ(K_LEFT_LOOP)
//
// Microkernel is defined here as:
// one k-step with interleaved reloads, followed by fetching the operands
// (B1, A2, A3) that the PLAIN macro does not reload and advancing x1/x0.
#define DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,A3,B0,B1) \
  DGEMM_8X4_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,x0,0,x1,0,load) \
 "ldr q"#B1", [x1, #16*1] \n\t" \
 "ldr q"#A2", [x0, #16*2] \n\t" \
 "ldr q"#A3", [x0, #16*3] \n\t" \
 "add x1, x1, x3 \n\t" \
 "add x0, x0, x2 \n\t"
// Start microkernel loop.
// Two A register sets (q22-25, q26-29) and three B pairs are cycled.
LABEL(K_MKER_LOOP)
DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(22,23,24,25,16,17)
DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(26,27,28,29,18,19)
DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(22,23,24,25,20,21)
" \n\t" // Decrease counter before final replica.
" subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem.
BEQ(FIN_MKER_LOOP)
DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(26,27,28,29,16,17)
DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(22,23,24,25,18,19)
DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(26,27,28,29,20,21)
BRANCH(K_MKER_LOOP)
//
// Final microkernel loop.
LABEL(FIN_MKER_LOOP)
DGEMM_8X4_MKER_LOOP_PLAIN_LOC(26,27,28,29,16,17,x0,0,x1,0,noload)
" ldr q26, [x0, #16*0] \n\t"
" ldr q27, [x0, #16*1] \n\t"
" ldr q28, [x0, #16*2] \n\t"
" ldr q29, [x0, #16*3] \n\t"
" add x0, x0, x2 \n\t"
DGEMM_8X4_MKER_LOOP_PLAIN_LOC(22,23,24,25,18,19,xzr,-1,xzr,-1,noload)
DGEMM_8X4_MKER_LOOP_PLAIN_LOC(26,27,28,29,20,21,xzr,-1,xzr,-1,noload)
//
// Loops left behind microkernels.
LABEL(K_LEFT_LOOP)
" cmp x8, #0 \n\t" // End of exec.
BEQ(WRITE_MEM_PREP)
" ldr q22, [x0, #16*0] \n\t" // Load A col.
" ldr q23, [x0, #16*1] \n\t"
" ldr q24, [x0, #16*2] \n\t"
" ldr q25, [x0, #16*3] \n\t"
" add x0, x0, x2 \n\t"
" ldr q16, [x1, #16*0] \n\t" // Load B col.
" ldr q17, [x1, #16*1] \n\t"
" add x1, x1, x3 \n\t"
" sub x8, x8, #1 \n\t"
DGEMM_8X4_MKER_LOOP_PLAIN_LOC(22,23,24,25,16,17,xzr,-1,xzr,-1,noload)
BRANCH(K_LEFT_LOOP)
//
// Scale and write to memory.
LABEL(WRITE_MEM_PREP)
" ldr x4, %[alpha] \n\t" // Load alpha & beta (address).
" ldr x8, %[beta] \n\t"
" ldr d16, [x4] \n\t" // Load alpha & beta (value).
" ldr d17, [x8] \n\t"
" \n\t"
LABEL(PREFETCH_ABNEXT)
" ldr x0, %[a_next] \n\t"
" ldr x1, %[b_next] \n\t"
" prfm PLDL1STRM, [x0, 64*0] \n\t" // Do not know cache line size,
" prfm PLDL1STRM, [x0, 64*1] \n\t" //  issue some number of prfm instructions
" prfm PLDL1STRM, [x0, 64*2] \n\t" //  to try to activate hardware prefetcher.
" prfm PLDL1STRM, [x1, 64*0] \n\t"
" prfm PLDL1STRM, [x1, 64*1] \n\t"
" prfm PLDL1STRM, [x1, 64*3] \n\t" // NOTE(review): a_next uses 64*2 here; possibly a typo, left as is (hint only).
" \n\t"
" mov x9, x5 \n\t" // C address for loading.
" \n\t" // C address for storing is x5 itself.
" cmp x6, #1 \n\t" // Check for generic storage.
BNE(WRITE_MEM_G)
//
// Contiguous C-storage.
// Columns 0-1 are combined in v20-v27; columns 2-3 reuse v0-v7 once the
// accumulators held there have been consumed.
LABEL(WRITE_MEM_C)
DLOADC_4V_C_FWD(20,21,22,23,x9,0,x7)
DLOADC_4V_C_FWD(24,25,26,27,x9,0,x7)
DSCALE8V(20,21,22,23,24,25,26,27,17,0)
DSCALEA8V(20,21,22,23,24,25,26,27,0,1,2,3,4,5,6,7,16,0)
//
DLOADC_4V_C_FWD(0,1,2,3,x9,0,x7)
DLOADC_4V_C_FWD(4,5,6,7,x9,0,x7)
DSCALE8V(0,1,2,3,4,5,6,7,17,0)
DSCALEA8V(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,0)
//
DSTOREC_4V_C_FWD(20,21,22,23,x5,0,x7)
DSTOREC_4V_C_FWD(24,25,26,27,x5,0,x7)
DSTOREC_4V_C_FWD(0,1,2,3,x5,0,x7)
DSTOREC_4V_C_FWD(4,5,6,7,x5,0,x7)
BRANCH(END_WRITE_MEM)
//
// Generic-strided C-storage.
LABEL(WRITE_MEM_G)
// TODO: Implement.
LABEL(END_WRITE_MEM)
:
: [a]      "m" (a),
  [b]      "m" (b),
  [c]      "m" (c),
  [rs_c]   "m" (rs_c),
  [cs_c]   "m" (cs_c),
  [k_mker] "m" (k_mker),
  [k_left] "m" (k_left),
  [alpha]  "m" (alpha),
  [beta]   "m" (beta),
  [a_next] "m" (a_next),
  [b_next] "m" (b_next)
: // "cc": cmp/subs above modify the condition flags.
  // "memory": A, B, C, alpha and beta are accessed through pointers that
  // are not themselves asm operands, and C is written.
  "cc", "memory",
  "x0","x1","x2","x3","x4","x5","x6","x7","x8",
  "x9","x16",
  "v0","v1","v2","v3","v4","v5","v6","v7",
  "v8","v9","v10","v11","v12","v13","v14","v15",
  "v16","v17","v18","v19",
  "v20","v21","v22","v23",
  "v24","v25","v26","v27",
  "v28","v29","v30","v31"
  );

}

cython-blis-0.9.1/blis/_src/kernels/armv8a/3/sup/000077500000000000000000000000001427272030600214505ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c000066400000000000000311451427272030600262350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // Separate instantiation for Armv8-A reference kernels. // Temporary workaround. Will be removed after upstream has switched to a better way // of exposing gemmsup interface. // // -- Row storage case --------------------------------------------------------- // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ /* NOTE: This microkernel can actually handle arbitrarily large values of m, n, and k. */ \ \ if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ { \ /* Traverse c by rows. */ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cij = &ci[ j*cs_c ]; \ ctype* restrict bj = &b [ j*cs_b ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. 
Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ { \ /* Traverse c by rows. */ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cij = &ci[ j*cs_c ]; \ ctype* restrict bj = &b [ j*cs_b ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ { \ /* Traverse c by rows. */ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cij = &ci[ j*cs_c ]; \ ctype* restrict bj = &b [ j*cs_b ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. 
Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ { \ /* Traverse c by rows. */ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cij = &ci[ j*cs_c ]; \ ctype* restrict bj = &b [ j*cs_b ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ PASTEMAC(ch,conjs)( ab ); \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. 
*/ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( gemmsup_r, _armv8a, _ref2 ) // // -- Column storage case ------------------------------------------------------ // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ /* NOTE: This microkernel can actually handle arbitrarily large values of m, n, and k. */ \ \ if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ { \ /* Traverse c by columns. */ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cj = &c[ j*cs_c ]; \ ctype* restrict bj = &b[ j*cs_b ]; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict cij = &cj[ i*rs_c ]; \ ctype* restrict ai = &a [ i*rs_a ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ { \ /* Traverse c by columns. 
*/ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cj = &c[ j*cs_c ]; \ ctype* restrict bj = &b[ j*cs_b ]; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict cij = &cj[ i*rs_c ]; \ ctype* restrict ai = &a [ i*rs_a ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ { \ /* Traverse c by columns. */ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cj = &c[ j*cs_c ]; \ ctype* restrict bj = &b[ j*cs_b ]; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict cij = &cj[ i*rs_c ]; \ ctype* restrict ai = &a [ i*rs_a ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ { \ /* Traverse c by columns. 
*/ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cj = &c[ j*cs_c ]; \ ctype* restrict bj = &b[ j*cs_b ]; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict cij = &cj[ i*rs_c ]; \ ctype* restrict ai = &a [ i*rs_a ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ PASTEMAC(ch,conjs)( ab ); \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( gemmsup_c, _armv8a, _ref2 ) cython-blis-0.9.1/blis/_src/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c000066400000000000000000000471731427272030600277640ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2021, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"
#include "assert.h"

GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )

// Label locality & misc.
#include "../armv8a_asm_utils.h"

// Emits three `fmla` instructions on .2d vectors that all share the same B
// operand: C0 += A0 * B, C1 += A1 * B, C2 += A2 * B (lane-wise).
#define DGEMM_3X1X2_NKER_SUBLOOP(C0,C1,C2,A0,A1,A2,B) \
" fmla v"#C0".2d, v"#A0".2d, v"#B".2d \n\t" \
" fmla v"#C1".2d, v"#A1".2d, v"#B".2d \n\t" \
" fmla v"#C2".2d, v"#A2".2d, v"#B".2d \n\t"

// Emits eight DGEMM_3X1X2_NKER_SUBLOOPs, one per accumulator column 0..7,
// cycling through B0..B3 twice. Each subloop is followed by a reload of the
// B register it has just consumed:
//  - after columns 0..3 the reload is unconditional (DGEMM_LOAD1V_K_load);
//  - BADDR is then advanced by 16 bytes and BELEMADDR is reset to BADDR;
//  - after columns 4..7 the reload is emitted only when LOADNEXT is `load`
//    (it is pasted onto DGEMM_LOAD1V_K_).
// BELEMST is the increment applied to BELEMADDR after every load.
// NOTE(review): presumably each accumulator holds two partial sums along k
// that are reduced later by the caller; that code is not part of this macro.
#define DGEMM_3X8X2_K_MKER_LOOP_PLAIN(C00,C01,C02,C03,C04,C05,C06,C07,C10,C11,C12,C13,C14,C15,C16,C17,C20,C21,C22,C23,C24,C25,C26,C27,A0,A1,A2,B0,B1,B2,B3,BADDR,BELEMADDR,BELEMST,LOADNEXT) \
  /* Always load before forwarding to the next line. */ \
  DGEMM_3X1X2_NKER_SUBLOOP(C00,C10,C20,A0,A1,A2,B0) \
  DGEMM_LOAD1V_K_load(B0,BELEMADDR,BELEMST) \
  DGEMM_3X1X2_NKER_SUBLOOP(C01,C11,C21,A0,A1,A2,B1) \
  DGEMM_LOAD1V_K_load(B1,BELEMADDR,BELEMST) \
  DGEMM_3X1X2_NKER_SUBLOOP(C02,C12,C22,A0,A1,A2,B2) \
  DGEMM_LOAD1V_K_load(B2,BELEMADDR,BELEMST) \
  DGEMM_3X1X2_NKER_SUBLOOP(C03,C13,C23,A0,A1,A2,B3) \
  DGEMM_LOAD1V_K_load(B3,BELEMADDR,BELEMST) \
  \
" add "#BADDR", "#BADDR", #16 \n\t" \
" mov "#BELEMADDR", "#BADDR" \n\t" \
  DGEMM_3X1X2_NKER_SUBLOOP(C04,C14,C24,A0,A1,A2,B0) \
  DGEMM_LOAD1V_K_ ##LOADNEXT (B0,BELEMADDR,BELEMST) \
  DGEMM_3X1X2_NKER_SUBLOOP(C05,C15,C25,A0,A1,A2,B1) \
  DGEMM_LOAD1V_K_ ##LOADNEXT (B1,BELEMADDR,BELEMST) \
  DGEMM_3X1X2_NKER_SUBLOOP(C06,C16,C26,A0,A1,A2,B2) \
  DGEMM_LOAD1V_K_ ##LOADNEXT (B2,BELEMADDR,BELEMST) \
  DGEMM_3X1X2_NKER_SUBLOOP(C07,C17,C27,A0,A1,A2,B3) \
  DGEMM_LOAD1V_K_ ##LOADNEXT (B3,BELEMADDR,BELEMST)

// `noload` expands to nothing; `load` reads one 128-bit vector from
// [ELEMADDR] and then advances ELEMADDR by ELEMST.
#define DGEMM_LOAD1V_K_noload(V,ELEMADDR,ELEMST)
#define DGEMM_LOAD1V_K_load(V,ELEMADDR,ELEMST) \
" ldr q"#V", [ "#ELEMADDR" ] \n\t" \
" add "#ELEMADDR", "#ELEMADDR", "#ELEMST" \n\t"

// For row-storage of C.
// Load/store four vectors through DLOAD4V/DSTORE4V (armv8a_asm_utils.h), then
// advance CADDR by RSC so the next use addresses the following row of C.
#define DLOADC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \
  DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#RSC" \n\t"
#define DSTOREC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \
  DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#RSC" \n\t"

// For column-storage of C.
#define DLOADC_1V_1ELM_C_FWD(C0,CSCALAR,CIDX,CADDR,CSHIFT,CSC) \ DLOAD1V(C0,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", #"#CSHIFT"+16 \n\t" \ " ld1 {v"#CSCALAR".d}["#CIDX"], ["#CADDR"] \n\t" \ " sub "#CADDR", "#CADDR", #"#CSHIFT"+16 \n\t" \ " add "#CADDR", "#CADDR", "#CSC" \n\t" #define DSTOREC_1V_1ELM_C_FWD(C0,CSCALAR,CIDX,CADDR,CSHIFT,CSC) \ DSTORE1V(C0,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", #"#CSHIFT"+16 \n\t" \ " st1 {v"#CSCALAR".d}["#CIDX"], ["#CADDR"] \n\t" \ " sub "#CADDR", "#CADDR", #"#CSHIFT"+16 \n\t" \ " add "#CADDR", "#CADDR", "#CSC" \n\t" #define DSCALE12V(V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,A,IDX) \ DSCALE4V(V0,V1,V2,V3,A,IDX) \ DSCALE4V(V4,V5,V6,V7,A,IDX) \ DSCALE4V(V8,V9,V10,V11,A,IDX) #define DSCALEA12V(D0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,S0,S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,A,IDX) \ DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ DSCALEA4V(D4,D5,D6,D7,S4,S5,S6,S7,A,IDX) \ DSCALEA4V(D8,D9,D10,D11,S8,S9,S10,S11,A,IDX) #define DPRFMC_FWD(CADDR,DLONGC) \ " prfm PLDL1KEEP, ["#CADDR"] \n\t" \ " add "#CADDR", "#CADDR", "#DLONGC" \n\t" void bli_dgemmsup_rd_armv8a_asm_6x8m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { if ( n0 != 8 ) { if ( n0 < 8 ) { for ( ; n0 >= 4; n0 -= 4 ) { dim_t m = m0; double *a_loc = a; double *c_loc = c; for ( ; m >= 3; m -= 3 ) { bli_dgemmsup_rd_armv8a_asm_3x4 ( conja, conjb, 3, 4, k0, alpha, a_loc, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c_loc, rs_c0, cs_c0, data, cntx ); a_loc += 3 * rs_a0; c_loc += 3 * rs_c0; } if ( m > 0 ) { bli_dgemmsup_rd_armv8a_int_3x4 ( conja, conjb, m, 4, k0, alpha, a_loc, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c_loc, rs_c0, cs_c0, data, cntx ); } b += 4 * cs_b0; c += 4 * cs_c0; } for ( ; m0 > 0; m0 -= 3 ) { dim_t m_loc = ( m0 < 3 ) ? 
m0 : 3; bli_dgemmsup_rd_armv8a_int_3x4 ( conja, conjb, m_loc, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); a += 3 * rs_a0; c += 3 * rs_c0; } } else { assert( FALSE ); } return; } // LLVM has very bad routing ability for inline asm. // Limit number of registers in case of Clang compilation. #ifndef __clang__ void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); #endif // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_mker = k0 / 4; uint64_t k_left = k0 % 4; int64_t m_iter = m0 / 3; int64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; assert( cs_a0 == 1 ); assert( rs_b0 == 1 ); if ( m_iter == 0 ) goto consider_edge_cases; __asm__ volatile ( " ldr x10, %[a] \n\t" " ldr x13, %[c] \n\t" " ldr x12, %[m_iter] \n\t" " ldr x2, %[rs_a] \n\t" // Row-skip of A. " ldr x3, %[cs_b] \n\t" // Column-skip of B. " \n\t" " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. " \n\t" " \n\t" // Multiply some address skips by sizeof(double). " lsl x2, x2, #3 \n\t" // rs_a " lsl x3, x3, #3 \n\t" // cs_b " lsl x6, x6, #3 \n\t" // rs_c " lsl x7, x7, #3 \n\t" // cs_c " \n\t" " mov x1, x5 \n\t" " cmp x7, #8 \n\t" // Prefetch column-strided C. BEQ(C_PREFETCH_COLS) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) BRANCH(C_PREFETCH_END) LABEL(C_PREFETCH_COLS) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) LABEL(C_PREFETCH_END) // // Millikernel. LABEL(MILLIKER_MLOOP) " \n\t" " mov x0, x10 \n\t" // Parameters to be reloaded " mov x5, x13 \n\t" // within each millikernel loop. 
" ldr x1, %[b] \n\t" " ldr x4, %[k_mker] \n\t" " ldr x8, %[k_left] \n\t" " \n\t" // Storage scheme: // V[ 0:23] <- C // V[24:26] <- A // V[28:31] <- B // V[ 27 ] <- Not used. // Under this scheme, the following is defined: #define DGEMM_3X8X2_K_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,BADDR,BELEMADDR,BELEMST,LOADNEXT) \ DGEMM_3X8X2_K_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,B0,B1,B2,B3,BADDR,BELEMADDR,BELEMST,LOADNEXT) // Load from memory. LABEL(LOAD_ABC) " \n\t" // No-microkernel early return is a must " cmp x4, #0 \n\t" // to avoid out-of-boundary read. BEQ(CLEAR_CCOLS) " \n\t" " mov x11, x1 \n\t" // Load B. " ldr q28, [x11] \n\t" " add x11, x11, x3 \n\t" " ldr q29, [x11] \n\t" " add x11, x11, x3 \n\t" " ldr q30, [x11] \n\t" " add x11, x11, x3 \n\t" " ldr q31, [x11] \n\t" " add x11, x11, x3 \n\t" " \n\t" " mov x14, x0 \n\t" // Load A. " ldr q24, [x14] \n\t" " add x14, x14, x2 \n\t" " ldr q25, [x14] \n\t" " add x14, x14, x2 \n\t" " ldr q26, [x14] \n\t" // " add x14, x14, x2 \n\t" " add x0, x0, #16 \n\t" LABEL(CLEAR_CCOLS) CLEAR8V(0,1,2,3,4,5,6,7) CLEAR8V(8,9,10,11,12,13,14,15) CLEAR8V(16,17,18,19,20,21,22,23) // No-microkernel early return, once again. BEQ(K_LEFT_LOOP) // // Microkernel is defined here as: #define DGEMM_3X8X2_K_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,B3) \ DGEMM_3X8X2_K_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,x1,x11,x3,load) \ "mov x14, x0 \n\t" \ "ldr q24, [x14] \n\t" \ "add x14, x14, x2 \n\t" \ "ldr q25, [x14] \n\t" \ "add x14, x14, x2 \n\t" \ "ldr q26, [x14] \n\t" \ /*"add x14, x14, x2 \n\t"*/ \ "add x0, x0, #16 \n\t" // Start microkernel loop. LABEL(K_MKER_LOOP) DGEMM_3X8X2_K_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,28,29,30,31) " \n\t" // Decrease counter before final replica. " subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. BEQ(FIN_MKER_LOOP) DGEMM_3X8X2_K_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,28,29,30,31) BRANCH(K_MKER_LOOP) // // Final microkernel loop. 
LABEL(FIN_MKER_LOOP) DGEMM_3X8X2_K_MKER_LOOP_PLAIN_LOC(24,25,26,28,29,30,31,x1,x11,x3,noload) // // If major kernel is executed, // an additional depth-summation is required. " faddp v0.2d, v0.2d, v1.2d \n\t" // Line 0. " faddp v1.2d, v2.2d, v3.2d \n\t" " faddp v2.2d, v4.2d, v5.2d \n\t" " faddp v3.2d, v6.2d, v7.2d \n\t" " faddp v4.2d, v8.2d, v9.2d \n\t" // Line 1. " faddp v5.2d, v10.2d, v11.2d \n\t" " faddp v6.2d, v12.2d, v13.2d \n\t" " faddp v7.2d, v14.2d, v15.2d \n\t" " faddp v8.2d, v16.2d, v17.2d \n\t" // Line 2. " faddp v9.2d, v18.2d, v19.2d \n\t" " faddp v10.2d, v20.2d, v21.2d \n\t" " faddp v11.2d, v22.2d, v23.2d \n\t" " \n\t" // Loops left behind microkernels. LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of exec. BEQ(WRITE_MEM_PREP) " mov x11, x1 \n\t" // Load B row. " ld1 {v28.d}[0], [x11], x3 \n\t" " ld1 {v28.d}[1], [x11], x3 \n\t" " ld1 {v29.d}[0], [x11], x3 \n\t" " ld1 {v29.d}[1], [x11], x3 \n\t" " ld1 {v30.d}[0], [x11], x3 \n\t" " ld1 {v30.d}[1], [x11], x3 \n\t" " ld1 {v31.d}[0], [x11], x3 \n\t" " ld1 {v31.d}[1], [x11], x3 \n\t" " add x1, x1, #8 \n\t" " mov x14, x0 \n\t" // Load A column. " ld1 {v24.d}[0], [x14], x2 \n\t" " ld1 {v24.d}[1], [x14], x2 \n\t" " ld1 {v25.d}[0], [x14], x2 \n\t" " add x0, x0, #8 \n\t" " fmla v0.2d, v28.2d, v24.d[0] \n\t" " fmla v1.2d, v29.2d, v24.d[0] \n\t" " fmla v2.2d, v30.2d, v24.d[0] \n\t" " fmla v3.2d, v31.2d, v24.d[0] \n\t" " fmla v4.2d, v28.2d, v24.d[1] \n\t" " fmla v5.2d, v29.2d, v24.d[1] \n\t" " fmla v6.2d, v30.2d, v24.d[1] \n\t" " fmla v7.2d, v31.2d, v24.d[1] \n\t" " fmla v8.2d, v28.2d, v25.d[0] \n\t" " fmla v9.2d, v29.2d, v25.d[0] \n\t" " fmla v10.2d, v30.2d, v25.d[0] \n\t" " fmla v11.2d, v31.2d, v25.d[0] \n\t" " sub x8, x8, #1 \n\t" BRANCH(K_LEFT_LOOP) // // Scale and write to memory. LABEL(WRITE_MEM_PREP) " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" " ld1r {v30.2d}, [x4] \n\t" // Load alpha & beta (value). 
" ld1r {v31.2d}, [x8] \n\t" " \n\t" " fmov d28, #1.0 \n\t" // Don't scale for unit alpha. " fcmp d30, d28 \n\t" BEQ(UNIT_ALPHA) DSCALE12V(0,1,2,3,4,5,6,7,8,9,10,11,30,0) LABEL(UNIT_ALPHA) " \n\t" " mov x1, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. " cmp x7, #8 \n\t" // Check for column-storage. BNE(WRITE_MEM_C) // // C storage in rows. LABEL(WRITE_MEM_R) " fcmp d31, #0.0 \n\t" // Don't load for zero beta. BEQ(ZERO_BETA_R) DLOADC_4V_R_FWD(12,13,14,15,x1,0,x6) DLOADC_4V_R_FWD(16,17,18,19,x1,0,x6) DLOADC_4V_R_FWD(20,21,22,23,x1,0,x6) DSCALEA12V(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,31,0) LABEL(ZERO_BETA_R) #ifndef __clang__ " cmp x12, #1 \n\t" BRANCH(PRFM_END_R) " prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" " prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" " prfm PLDL1STRM, [%[b_next], #16*0] \n\t" " prfm PLDL1STRM, [%[b_next], #16*1] \n\t" LABEL(PRFM_END_R) #endif DSTOREC_4V_R_FWD(0,1,2,3,x5,0,x6) DSTOREC_4V_R_FWD(4,5,6,7,x5,0,x6) DSTOREC_4V_R_FWD(8,9,10,11,x5,0,x6) BRANCH(END_WRITE_MEM) // // C storage in columns. LABEL(WRITE_MEM_C) " trn1 v12.2d, v0.2d, v4.2d \n\t" " trn2 v13.2d, v0.2d, v4.2d \n\t" " trn1 v14.2d, v1.2d, v5.2d \n\t" " trn2 v15.2d, v1.2d, v5.2d \n\t" " trn1 v16.2d, v2.2d, v6.2d \n\t" " trn2 v17.2d, v2.2d, v6.2d \n\t" " trn1 v18.2d, v3.2d, v7.2d \n\t" " trn2 v19.2d, v3.2d, v7.2d \n\t" " fcmp d31, #0.0 \n\t" // Don't load for zero beta. 
BEQ(ZERO_BETA_C) DLOADC_1V_1ELM_C_FWD(0,20,0,x1,0,x7) DLOADC_1V_1ELM_C_FWD(1,20,1,x1,0,x7) DLOADC_1V_1ELM_C_FWD(2,21,0,x1,0,x7) DLOADC_1V_1ELM_C_FWD(3,21,1,x1,0,x7) DLOADC_1V_1ELM_C_FWD(4,22,0,x1,0,x7) DLOADC_1V_1ELM_C_FWD(5,22,1,x1,0,x7) DLOADC_1V_1ELM_C_FWD(6,23,0,x1,0,x7) DLOADC_1V_1ELM_C_FWD(7,23,1,x1,0,x7) DSCALEA12V(12,13,14,15,16,17,18,19,8,9,10,11,0,1,2,3,4,5,6,7,20,21,22,23,31,0) LABEL(ZERO_BETA_C) #ifndef __clang__ " cmp x12, #1 \n\t" BRANCH(PRFM_END_C) " prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" " prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" " prfm PLDL1STRM, [%[b_next], #16*0] \n\t" " prfm PLDL1STRM, [%[b_next], #16*1] \n\t" LABEL(PRFM_END_C) #endif DSTOREC_1V_1ELM_C_FWD(12,8,0,x5,0,x7) DSTOREC_1V_1ELM_C_FWD(13,8,1,x5,0,x7) DSTOREC_1V_1ELM_C_FWD(14,9,0,x5,0,x7) DSTOREC_1V_1ELM_C_FWD(15,9,1,x5,0,x7) DSTOREC_1V_1ELM_C_FWD(16,10,0,x5,0,x7) DSTOREC_1V_1ELM_C_FWD(17,10,1,x5,0,x7) DSTOREC_1V_1ELM_C_FWD(18,11,0,x5,0,x7) DSTOREC_1V_1ELM_C_FWD(19,11,1,x5,0,x7) // // End of this microkernel. LABEL(END_WRITE_MEM) " \n\t" " subs x12, x12, #1 \n\t" BEQ(END_EXEC) " \n\t" " mov x8, #3 \n\t" " madd x13, x6, x8, x13 \n\t" // Forward C's base address to the next logic panel. " madd x10, x2, x8, x10 \n\t" // Forward A's base address to the next logic panel. BRANCH(MILLIKER_MLOOP) // // End of execution. LABEL(END_EXEC) : : [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_a] "m" (rs_a), [cs_b] "m" (cs_b), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), // In Clang, even "m"-passed parameter takes 1 register. // Have to disable prefetching to pass compilation. 
#ifndef __clang__ [a_next] "r" (a_next), [b_next] "r" (b_next), #endif [m_iter] "m" (m_iter), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta) : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10","x11","x12","x13","x14", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15", "v16","v17","v18","v19","v20","v21","v22","v23", "v24","v25","v26","v27","v28","v29","v30","v31" ); consider_edge_cases: // TODO: Implement optimized kernel for this. // // Forward address. a = a + m_iter * 3 * rs_a; c = c + m_iter * 3 * rs_c; for ( ; m_left > 0; m_left -= 2 ) { dim_t m_loc = ( m_left < 2 ) ? m_left : 2; bli_dgemmsup_rd_armv8a_int_2x8 ( conja, conjb, m_loc, 8, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); a += 2 * rs_a0; c += 2 * rs_c0; } } cython-blis-0.9.1/blis/_src/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c000066400000000000000000000530571427272030600277630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2021, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"
#include "assert.h"

GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )

// Label locality & misc.
#include "../armv8a_asm_utils.h"

// Emits four FMLA instructions on 2-double vectors that share one A
// register:
//   v<C0> += v<A> * v<B0>, ..., v<C3> += v<A> * v<B3>.
// All arguments are bare vector register numbers pasted into the asm text.
#define DGEMM_1X4X2_NKER_SUBLOOP(C0,C1,C2,C3,A,B0,B1,B2,B3) \
" fmla v"#C0".2d, v"#A".2d, v"#B0".2d \n\t" \
" fmla v"#C1".2d, v"#A".2d, v"#B1".2d \n\t" \
" fmla v"#C2".2d, v"#A".2d, v"#B2".2d \n\t" \
" fmla v"#C3".2d, v"#A".2d, v"#B3".2d \n\t"

// Emits six DGEMM_1X4X2_NKER_SUBLOOP steps, one per accumulator row
// (C0x..C5x), against the same four B registers.
//   - Rows 0-3 consume A0..A3; each A register is reloaded from
//     [AELEMADDR] right after use, and AELEMADDR is advanced by AELEMST.
//   - After row 1, AADDR is advanced by 16 bytes and AELEMADDR is reset
//     to AADDR.
//   - Rows 4-5 reuse A0 and A1, which were reloaded after rows 0 and 1.
//     Their trailing reload is emitted only when LOADNEXT is `load`.
// C**/A*/B* are vector register numbers; AADDR/AELEMADDR/AELEMST are
// general-purpose register names.
#define DGEMM_6X4X2_K_MKER_LOOP_PLAIN(C00,C01,C02,C03,C10,C11,C12,C13,C20,C21,C22,C23,C30,C31,C32,C33,C40,C41,C42,C43,C50,C51,C52,C53,A0,A1,A2,A3,B0,B1,B2,B3,AADDR,AELEMADDR,AELEMST,LOADNEXT) \
 /* Always load before forwarding to the next line. */ \
 DGEMM_1X4X2_NKER_SUBLOOP(C00,C01,C02,C03,A0,B0,B1,B2,B3) \
 DGEMM_LOAD1V_K_load(A0,AELEMADDR,AELEMST) \
 DGEMM_1X4X2_NKER_SUBLOOP(C10,C11,C12,C13,A1,B0,B1,B2,B3) \
 DGEMM_LOAD1V_K_load(A1,AELEMADDR,AELEMST) \
" add "#AADDR", "#AADDR", #16 \n\t" \
" mov "#AELEMADDR", "#AADDR" \n\t" \
 DGEMM_1X4X2_NKER_SUBLOOP(C20,C21,C22,C23,A2,B0,B1,B2,B3) \
 DGEMM_LOAD1V_K_load(A2,AELEMADDR,AELEMST) \
 DGEMM_1X4X2_NKER_SUBLOOP(C30,C31,C32,C33,A3,B0,B1,B2,B3) \
 DGEMM_LOAD1V_K_load(A3,AELEMADDR,AELEMST) \
 \
 DGEMM_1X4X2_NKER_SUBLOOP(C40,C41,C42,C43,A0,B0,B1,B2,B3) \
 DGEMM_LOAD1V_K_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \
 DGEMM_1X4X2_NKER_SUBLOOP(C50,C51,C52,C53,A1,B0,B1,B2,B3) \
 DGEMM_LOAD1V_K_ ##LOADNEXT (A1,AELEMADDR,AELEMST)

// LOADNEXT selectors for the macro above.
// `noload` expands to nothing; `load` reads 16 bytes into q<V> from
// [ELEMADDR] and then advances ELEMADDR by ELEMST.
#define DGEMM_LOAD1V_K_noload(V,ELEMADDR,ELEMST)
#define DGEMM_LOAD1V_K_load(V,ELEMADDR,ELEMST) \
" ldr q"#V", [ "#ELEMADDR" ] \n\t" \
" add "#ELEMADDR", "#ELEMADDR", "#ELEMST" \n\t"

// For row-storage of C.
// Each macro invokes DLOAD2V / DSTORE2V (defined outside this file) on
// C0, C1 at CADDR with offset CSHIFT, then advances CADDR by RSC.
#define DLOADC_2V_R_FWD(C0,C1,CADDR,CSHIFT,RSC) \
 DLOAD2V(C0,C1,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#RSC" \n\t"
#define DSTOREC_2V_R_FWD(C0,C1,CADDR,CSHIFT,RSC) \
 DSTORE2V(C0,C1,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#RSC" \n\t"

// For column-storage of C.
#define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ DLOAD2V(C0,C1,CADDR,CSHIFT) \ DLOAD1V(C2,CADDR,CSHIFT+32) \ " add "#CADDR", "#CADDR", "#CSC" \n\t" #define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ DSTORE2V(C0,C1,CADDR,CSHIFT) \ DSTORE1V(C2,CADDR,CSHIFT+32) \ " add "#CADDR", "#CADDR", "#CSC" \n\t" #define DSCALE12V(V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,A,IDX) \ DSCALE4V(V0,V1,V2,V3,A,IDX) \ DSCALE4V(V4,V5,V6,V7,A,IDX) \ DSCALE4V(V8,V9,V10,V11,A,IDX) #define DSCALEA12V(D0,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,S0,S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,A,IDX) \ DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ DSCALEA4V(D4,D5,D6,D7,S4,S5,S6,S7,A,IDX) \ DSCALEA4V(D8,D9,D10,D11,S8,S9,S10,S11,A,IDX) #define DPRFMC_FWD(CADDR,DLONGC) \ " prfm PLDL1KEEP, ["#CADDR"] \n\t" \ " add "#CADDR", "#CADDR", "#DLONGC" \n\t" void bli_dgemmsup_rd_armv8a_asm_6x8n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { if ( m0 != 6 ) { if ( m0 < 6 ) { if ( m0 == 5 ) { // 3xk calls. dim_t n = n0; double *b_loc = b; double *c_loc = c; for ( ; n >= 4; n -= 4 ) { bli_dgemmsup_rd_armv8a_asm_3x4 ( conja, conjb, 3, 4, k0, alpha, a, rs_a0, cs_a0, b_loc, rs_b0, cs_b0, beta, c_loc, rs_c0, cs_c0, data, cntx ); b_loc += 4 * cs_b0; c_loc += 4 * cs_c0; } if ( n > 0 ) { bli_dgemmsup_rd_armv8a_int_3x4 ( conja, conjb, 3, n, k0, alpha, a, rs_a0, cs_a0, b_loc, rs_b0, cs_b0, beta, c_loc, rs_c0, cs_c0, data, cntx ); } a += 3 * rs_a0; c += 3 * rs_c0; // 2xk calls. for ( ; n0 > 0; n0 -= 8 ) { dim_t n_loc = ( n0 < 8 ) ? 
n0 : 8; bli_dgemmsup_rd_armv8a_int_2x8 ( conja, conjb, 2, n_loc, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); b += 8 * cs_b0; c += 8 * cs_c0; } return; } else if ( m0 == 4 ) { for ( ; n0 > 0; n0 -= 8 ) { dim_t n_loc = ( n0 < 8 ) ? n0 : 8; bli_dgemmsup_rd_armv8a_int_2x8 ( conja, conjb, 2, n_loc, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); bli_dgemmsup_rd_armv8a_int_2x8 ( conja, conjb, 2, n_loc, k0, alpha, a + 2 * rs_a0, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c + 2 * rs_c0, rs_c0, cs_c0, data, cntx ); b += 8 * cs_b0; c += 8 * cs_c0; } } else if ( m0 == 3 ) { for ( ; n0 >= 4; n0 -= 4 ) { bli_dgemmsup_rd_armv8a_asm_3x4 ( conja, conjb, 3, 4, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); b += 4 * cs_b0; c += 4 * cs_c0; } if ( n0 > 0 ) { bli_dgemmsup_rd_armv8a_int_3x4 ( conja, conjb, 3, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); } } else // m0 == 2 or 1. { for ( ; n0 > 0; n0 -= 8 ) { dim_t n_loc = ( n0 < 8 ) ? n0 : 8; bli_dgemmsup_rd_armv8a_int_2x8 ( conja, conjb, m0, n_loc, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); b += 8 * cs_b0; c += 8 * cs_c0; } } } else { assert( FALSE ); } return; } // LLVM has very bad routing ability for inline asm. // Limit number of registers in case of Clang compilation. #ifndef __clang__ void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); #endif // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_mker = k0 / 4; uint64_t k_left = k0 % 4; int64_t n_iter = n0 / 4; int64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; assert( cs_a0 == 1 ); assert( rs_b0 == 1 ); if ( n_iter == 0 ) goto consider_edge_cases; __asm__ volatile ( " ldr x10, %[b] \n\t" " ldr x13, %[c] \n\t" " ldr x12, %[n_iter] \n\t" " ldr x2, %[rs_a] \n\t" // Row-skip of A. " ldr x3, %[cs_b] \n\t" // Column-skip of B. " \n\t" " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. " \n\t" " \n\t" // Multiply some address skips by sizeof(double). " lsl x2, x2, #3 \n\t" // rs_a " lsl x3, x3, #3 \n\t" // cs_b " lsl x6, x6, #3 \n\t" // rs_c " lsl x7, x7, #3 \n\t" // cs_c " \n\t" " mov x1, x5 \n\t" " cmp x7, #8 \n\t" // Prefetch column-strided C. BEQ(C_PREFETCH_COLS) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) BRANCH(C_PREFETCH_END) LABEL(C_PREFETCH_COLS) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) LABEL(C_PREFETCH_END) // // Millikernel. LABEL(MILLIKER_MLOOP) " \n\t" " mov x1, x10 \n\t" // Parameters to be reloaded " mov x5, x13 \n\t" // within each millikernel loop. " ldr x0, %[a] \n\t" " ldr x4, %[k_mker] \n\t" " ldr x8, %[k_left] \n\t" " \n\t" // Storage scheme: // V[ 0:23] <- C // V[24:27] <- A // V[28:31] <- B // Under this scheme, the following is defined: #define DGEMM_6X4X2_K_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,B3,AADDR,AELEMADDR,AELEMST,LOADNEXT) \ DGEMM_6X4X2_K_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,A3,B0,B1,B2,B3,AADDR,AELEMADDR,AELEMST,LOADNEXT) // Load from memory. LABEL(LOAD_ABC) " \n\t" // No-microkernel early return is a must " cmp x4, #0 \n\t" // to avoid out-of-boundary read. BEQ(CLEAR_CCOLS) " \n\t" " mov x11, x1 \n\t" // Load B. 
" ldr q28, [x11] \n\t" " add x11, x11, x3 \n\t" " ldr q29, [x11] \n\t" " add x11, x11, x3 \n\t" " ldr q30, [x11] \n\t" " add x11, x11, x3 \n\t" " ldr q31, [x11] \n\t" // " add x11, x11, x3 \n\t" " add x1, x1, #16 \n\t" " \n\t" " mov x14, x0 \n\t" // Load A. " ldr q24, [x14] \n\t" " add x14, x14, x2 \n\t" " ldr q25, [x14] \n\t" " add x14, x14, x2 \n\t" " ldr q26, [x14] \n\t" " add x14, x14, x2 \n\t" " ldr q27, [x14] \n\t" " add x14, x14, x2 \n\t" LABEL(CLEAR_CCOLS) CLEAR8V(0,1,2,3,4,5,6,7) CLEAR8V(8,9,10,11,12,13,14,15) CLEAR8V(16,17,18,19,20,21,22,23) // No-microkernel early return, once again. BEQ(K_LEFT_LOOP) // // Microkernel is defined here as: #define DGEMM_6X4X2_K_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,A3,B0,B1,B2,B3) \ DGEMM_6X4X2_K_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,B2,B3,x0,x14,x2,load) \ /* A already loaded and forwarded. Process B only. */ \ "mov x11, x1 \n\t" \ "ldr q28, [x11] \n\t" \ "add x11, x11, x3 \n\t" \ "ldr q29, [x11] \n\t" \ "add x11, x11, x3 \n\t" \ "ldr q30, [x11] \n\t" \ "add x11, x11, x3 \n\t" \ "ldr q31, [x11] \n\t" \ /*"add x11, x11, x3 \n\t"*/ \ "add x1, x1, #16 \n\t" // Start microkernel loop. LABEL(K_MKER_LOOP) DGEMM_6X4X2_K_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,28,29,30,31) " \n\t" // Decrease counter before final replica. " subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. BEQ(FIN_MKER_LOOP) DGEMM_6X4X2_K_MKER_LOOP_PLAIN_LOC_FWD(26,27,24,25,28,29,30,31) BRANCH(K_MKER_LOOP) // // Final microkernel loop. LABEL(FIN_MKER_LOOP) DGEMM_6X4X2_K_MKER_LOOP_PLAIN_LOC(26,27,24,25,28,29,30,31,x0,x14,x2,noload) // // If major kernel is executed, // an additional depth-summation is required. " faddp v0.2d, v0.2d, v1.2d \n\t" // Line 0. " faddp v1.2d, v2.2d, v3.2d \n\t" " faddp v2.2d, v4.2d, v5.2d \n\t" // Line 1. " faddp v3.2d, v6.2d, v7.2d \n\t" " faddp v4.2d, v8.2d, v9.2d \n\t" // Line 2. " faddp v5.2d, v10.2d, v11.2d \n\t" " faddp v6.2d, v12.2d, v13.2d \n\t" // Line 3. 
" faddp v7.2d, v14.2d, v15.2d \n\t" " faddp v8.2d, v16.2d, v17.2d \n\t" // Line 4. " faddp v9.2d, v18.2d, v19.2d \n\t" " faddp v10.2d, v20.2d, v21.2d \n\t" // Line 5. " faddp v11.2d, v22.2d, v23.2d \n\t" " \n\t" // Loops left behind microkernels. LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of exec. BEQ(WRITE_MEM_PREP) " mov x11, x1 \n\t" // Load B row. " ld1 {v28.d}[0], [x11], x3 \n\t" " ld1 {v28.d}[1], [x11], x3 \n\t" " ld1 {v29.d}[0], [x11], x3 \n\t" " ld1 {v29.d}[1], [x11], x3 \n\t" " add x1, x1, #8 \n\t" " mov x14, x0 \n\t" // Load A column. " ld1 {v24.d}[0], [x14], x2 \n\t" " ld1 {v24.d}[1], [x14], x2 \n\t" " ld1 {v25.d}[0], [x14], x2 \n\t" " ld1 {v25.d}[1], [x14], x2 \n\t" " ld1 {v26.d}[0], [x14], x2 \n\t" " ld1 {v26.d}[1], [x14], x2 \n\t" " add x0, x0, #8 \n\t" " fmla v0.2d, v28.2d, v24.d[0] \n\t" " fmla v1.2d, v29.2d, v24.d[0] \n\t" " fmla v2.2d, v28.2d, v24.d[1] \n\t" " fmla v3.2d, v29.2d, v24.d[1] \n\t" " fmla v4.2d, v28.2d, v25.d[0] \n\t" " fmla v5.2d, v29.2d, v25.d[0] \n\t" " fmla v6.2d, v28.2d, v25.d[1] \n\t" " fmla v7.2d, v29.2d, v25.d[1] \n\t" " fmla v8.2d, v28.2d, v26.d[0] \n\t" " fmla v9.2d, v29.2d, v26.d[0] \n\t" " fmla v10.2d, v28.2d, v26.d[1] \n\t" " fmla v11.2d, v29.2d, v26.d[1] \n\t" " sub x8, x8, #1 \n\t" BRANCH(K_LEFT_LOOP) // // Scale and write to memory. LABEL(WRITE_MEM_PREP) " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" " ld1r {v30.2d}, [x4] \n\t" // Load alpha & beta (value). " ld1r {v31.2d}, [x8] \n\t" " \n\t" " fmov d28, #1.0 \n\t" // Don't scale for unit alpha. " fcmp d30, d28 \n\t" BEQ(UNIT_ALPHA) DSCALE12V(0,1,2,3,4,5,6,7,8,9,10,11,30,0) LABEL(UNIT_ALPHA) " \n\t" " mov x1, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. " cmp x7, #8 \n\t" // Check for column-storage. BNE(WRITE_MEM_C) // // C storage in rows. LABEL(WRITE_MEM_R) " fcmp d31, #0.0 \n\t" // Don't load for zero beta. 
BEQ(ZERO_BETA_R) DLOADC_2V_R_FWD(12,13,x1,0,x6) DLOADC_2V_R_FWD(14,15,x1,0,x6) DLOADC_2V_R_FWD(16,17,x1,0,x6) DLOADC_2V_R_FWD(18,19,x1,0,x6) DLOADC_2V_R_FWD(20,21,x1,0,x6) DLOADC_2V_R_FWD(22,23,x1,0,x6) DSCALEA12V(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,31,0) LABEL(ZERO_BETA_R) #ifndef __clang__ " cmp x12, #1 \n\t" BRANCH(PRFM_END_R) " prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" " prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" " prfm PLDL1STRM, [%[b_next], #16*0] \n\t" " prfm PLDL1STRM, [%[b_next], #16*1] \n\t" LABEL(PRFM_END_R) #endif DSTOREC_2V_R_FWD(0,1,x5,0,x6) DSTOREC_2V_R_FWD(2,3,x5,0,x6) DSTOREC_2V_R_FWD(4,5,x5,0,x6) DSTOREC_2V_R_FWD(6,7,x5,0,x6) DSTOREC_2V_R_FWD(8,9,x5,0,x6) DSTOREC_2V_R_FWD(10,11,x5,0,x6) BRANCH(END_WRITE_MEM) // // C storage in columns. LABEL(WRITE_MEM_C) " trn1 v12.2d, v0.2d, v2.2d \n\t" " trn1 v13.2d, v4.2d, v6.2d \n\t" " trn1 v14.2d, v8.2d, v10.2d \n\t" " trn2 v15.2d, v0.2d, v2.2d \n\t" " trn2 v16.2d, v4.2d, v6.2d \n\t" " trn2 v17.2d, v8.2d, v10.2d \n\t" " trn1 v18.2d, v1.2d, v3.2d \n\t" " trn1 v19.2d, v5.2d, v7.2d \n\t" " trn1 v20.2d, v9.2d, v11.2d \n\t" " trn2 v21.2d, v1.2d, v3.2d \n\t" " trn2 v22.2d, v5.2d, v7.2d \n\t" " trn2 v23.2d, v9.2d, v11.2d \n\t" " fcmp d31, #0.0 \n\t" // Don't load for zero beta. BEQ(ZERO_BETA_C) DLOADC_3V_C_FWD(0,1,2,x1,0,x7) DLOADC_3V_C_FWD(3,4,5,x1,0,x7) DLOADC_3V_C_FWD(6,7,8,x1,0,x7) DLOADC_3V_C_FWD(9,10,11,x1,0,x7) DSCALEA12V(12,13,14,15,16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,8,9,10,11,31,0) LABEL(ZERO_BETA_C) #ifndef __clang__ " cmp x12, #1 \n\t" BRANCH(PRFM_END_C) " prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" " prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" " prfm PLDL1STRM, [%[b_next], #16*0] \n\t" " prfm PLDL1STRM, [%[b_next], #16*1] \n\t" LABEL(PRFM_END_C) #endif DSTOREC_3V_C_FWD(12,13,14,x5,0,x7) DSTOREC_3V_C_FWD(15,16,17,x5,0,x7) DSTOREC_3V_C_FWD(18,19,20,x5,0,x7) DSTOREC_3V_C_FWD(21,22,23,x5,0,x7) // // End of this microkernel. 
LABEL(END_WRITE_MEM) " \n\t" " subs x12, x12, #1 \n\t" BEQ(END_EXEC) " \n\t" " mov x8, #4 \n\t" " madd x13, x7, x8, x13 \n\t" // Forward C's base address to the next logic panel. " madd x10, x3, x8, x10 \n\t" // Forward B's base address to the next logic panel. BRANCH(MILLIKER_MLOOP) // // End of execution. LABEL(END_EXEC) : : [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_a] "m" (rs_a), [cs_b] "m" (cs_b), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), // In Clang, even "m"-passed parameter takes 1 register. // Have to disable prefetching to pass compilation. #ifndef __clang__ [a_next] "r" (a_next), [b_next] "r" (b_next), #endif [n_iter] "m" (n_iter), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta) : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10","x11","x12","x13","x14", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15", "v16","v17","v18","v19","v20","v21","v22","v23", "v24","v25","v26","v27","v28","v29","v30","v31" ); consider_edge_cases: // TODO: Implement optimized kernel for this. // // Forward address. b = b + n_iter * 4 * cs_b; c = c + n_iter * 4 * cs_c; if ( n_left >= 3 ) { bli_dgemmsup_rd_armv8a_asm_6x3 ( conja, conjb, 6, 3, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); b = b + 3 * cs_b; c = c + 3 * cs_c; n_left -= 3; } if ( n_left ) { // n_left < 3; // // Slice in rows. bli_dgemmsup_rd_armv8a_int_3x4 ( conja, conjb, 3, n_left, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); a = a + 3 * rs_a; c = c + 3 * rs_c; bli_dgemmsup_rd_armv8a_int_3x4 ( conja, conjb, 3, n_left, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); } } cython-blis-0.9.1/blis/_src/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c000066400000000000000000000441511427272030600277750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "assert.h" GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) // Label locality & misc. #include "../armv8a_asm_utils.h" // Nanokernel operations. 
#include "../armv8a_asm_d2x2.h"

/*
 * Order in which DGEMM_4X8_MKER_LOOP_PLAIN issues its eight 2x2 blocks
 * (two block-rows by four block-columns):
 *
 * +---+ +---+ +---+ +---+
 * | 0 | | 2 | | 4 | | 6 |
 * +---+ +---+ +---+ +---+
 * +---+ +---+ +---+ +---+
 * | 1 | | 3 | | 5 | | 7 |
 * +---+ +---+ +---+ +---+
 */
// One k-step of the 4x8 update, written as eight DGEMM_2X2_NANOKERNEL
// invocations (that macro is defined in armv8a_asm_d2x2.h, not here).
//   Cij       : register numbers of the 16 accumulators.
//   A0, A1    : register numbers holding the A operands of this step.
//   B0..B3    : register numbers holding the B operands of this step.
//   BADDR, BSHIFT0..2 : base register and offsets used to reload B0..B2.
//   LOADNEXT  : `load` or `noload`; it is token-pasted onto DGEMM_LOAD1V_
//               to select whether B0, B1 and B2 are reloaded between the
//               nanokernels. B3 is never reloaded by this macro.
#define DGEMM_4X8_MKER_LOOP_PLAIN(C00,C01,C02,C03,C10,C11,C12,C13,C20,C21,C22,C23,C30,C31,C32,C33,A0,A1,B0,B1,B2,B3,BADDR,BSHIFT0,BSHIFT1,BSHIFT2,LOADNEXT) \
  DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \
  DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \
  DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT0) \
  DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \
  DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \
  DGEMM_LOAD1V_ ##LOADNEXT (B1,BADDR,BSHIFT1) \
  DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \
  DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \
  DGEMM_LOAD1V_ ##LOADNEXT (B2,BADDR,BSHIFT2) \
  DGEMM_2X2_NANOKERNEL(C03,C13,B3,A0) \
  DGEMM_2X2_NANOKERNEL(C23,C33,B3,A1)

// Interleaving load or not.
// The `noload` variant expands to nothing; the `load` variant emits a single
// 128-bit `ldr q<V1>, [<ADDR>, #<IMM>]`.
#define DGEMM_LOAD1V_noload(V1,ADDR,IMM)
#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t"

// Prefetch C in the long direction.
// Emits one PLDL1KEEP prefetch at CADDR, then advances CADDR by DLONGC.
#define DPRFMC_FWD(CADDR,DLONGC) \
" prfm PLDL1KEEP, ["#CADDR"] \n\t" \
" add "#CADDR", "#CADDR", "#DLONGC" \n\t"

// Row-storage helpers: move four vectors at CADDR (+CSHIFT) through
// DLOAD4V / DSTORE4V (defined elsewhere), then advance CADDR by RSC.
#define DLOADC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \
  DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#RSC" \n\t"
#define DSTOREC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \
  DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#RSC" \n\t"

// Column-storage helpers: move two vectors (C00,C10), advance CADDR by CSC,
// move two more vectors (C01,C11), and advance CADDR by CSC again.
#define DLOADC_4V_C_FWD(C00,C10,C01,C11,CADDR,CSHIFT,CSC) \
  DLOAD2V(C00,C10,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#CSC" \n\t" \
  DLOAD2V(C01,C11,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#CSC" \n\t"
#define DSTOREC_4V_C_FWD(C00,C10,C01,C11,CADDR,CSHIFT,CSC) \
  DSTORE2V(C00,C10,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#CSC" \n\t" \
  DSTORE2V(C01,C11,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#CSC" \n\t"

/*
 * 4x8 dgemmsup kernel with extending 1st dimension.
 *
 * Recommended usage case:
 *   o 16 < (L1 cache latency) * (Num. FPU) < 25.
 *   o L1 cache has a bandwidth not too low (true in most cases).
 *   o (FMLA latency) * (Num.
FPU) < 32 cycles (true in almost all cases). */ void bli_dgemmsup_rv_armv8a_asm_4x8m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Fixme: This uker has no dispatching for unalighed sizes. // Currently it only serves as a dispatch target for other kernels // and cannot be registered in configurations. assert( n0 == 8 ); // LLVM has very bad routing ability for inline asm. // Limit number of registers in case of Clang compilation. #ifndef __clang__ void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); #endif uint64_t ps_a = bli_auxinfo_ps_a( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_mker = k0 / 4; uint64_t k_left = k0 % 4; int64_t m_iter = m0 / 4; int64_t m_left = m0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // uint64_t cs_b = cs_b0; assert( cs_b0 == 1 ); if ( m_iter == 0 ) goto consider_edge_cases; __asm__ volatile ( " ldr x10, %[a] \n\t" " ldr x13, %[c] \n\t" " ldr x12, %[m_iter] \n\t" " ldr x11, %[ps_a] \n\t" // Panel-skip of A. " ldr x9, %[rs_a] \n\t" // Row-skip of A. " ldr x2, %[cs_a] \n\t" // Column-skip of A. " ldr x3, %[rs_b] \n\t" // Row-skip of B. " \n\t" " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. " \n\t" " \n\t" // Multiply some address skips by sizeof(double). " lsl x11, x11, #3 \n\t" // ps_a " lsl x9, x9, #3 \n\t" // rs_a " lsl x2, x2, #3 \n\t" // cs_a " lsl x3, x3, #3 \n\t" // rs_b " lsl x6, x6, #3 \n\t" // rs_c " lsl x7, x7, #3 \n\t" // cs_c " \n\t" " mov x1, x5 \n\t" " cmp x7, #8 \n\t" // Prefetch column-strided C. 
BEQ(C_PREFETCH_COLS) // This prefetch will not cover further mker perts. Skip. // // DPRFMC_FWD(x1,x6) // DPRFMC_FWD(x1,x6) // DPRFMC_FWD(x1,x6) // DPRFMC_FWD(x1,x6) BRANCH(C_PREFETCH_END) LABEL(C_PREFETCH_COLS) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) LABEL(C_PREFETCH_END) // // Millikernel. LABEL(MILLIKER_MLOOP) " \n\t" " mov x0, x10 \n\t" // Parameters to be reloaded " mov x5, x13 \n\t" // within each millikernel loop. " ldr x1, %[b] \n\t" " ldr x4, %[k_mker] \n\t" " ldr x8, %[k_left] \n\t" " \n\t" // Storage scheme: // V[ 0:15] <- C // V[16:23] <- A; Allowed latency: 48 cycles / # of FPUs. // V[24:31] <- B; Allowed latency: 28 cycles / # of FPUs. // Under this scheme, the following is defined: #define DGEMM_4X8_MKER_LOOP_PLAIN_LOC(A0,A1,B0,B1,B2,B3,BADDR,BSHIFT0,BSHIFT1,BSHIFT2,LOADNEXT) \ DGEMM_4X8_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,A0,A1,B0,B1,B2,B3,BADDR,BSHIFT0,BSHIFT1,BSHIFT2,LOADNEXT) LABEL(LOAD_ABC) " \n\t" // No-microkernel early return is a must " cmp x4, #0 \n\t" // to avoid out-of-boundary read. BEQ(CLEAR_CCOLS) " \n\t" " mov x14, x0 \n\t" // Load A. " ld1 {v16.d}[0], [x14], x9 \n\t" " ld1 {v16.d}[1], [x14], x9 \n\t" " ld1 {v17.d}[0], [x14], x9 \n\t" " ld1 {v17.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " mov x14, x0 \n\t" " ld1 {v18.d}[0], [x14], x9 \n\t" " ld1 {v18.d}[1], [x14], x9 \n\t" " ld1 {v19.d}[0], [x14], x9 \n\t" " ld1 {v19.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " mov x14, x0 \n\t" " ld1 {v20.d}[0], [x14], x9 \n\t" " ld1 {v20.d}[1], [x14], x9 \n\t" " ld1 {v21.d}[0], [x14], x9 \n\t" " ld1 {v21.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " mov x14, x0 \n\t" " ld1 {v22.d}[0], [x14], x9 \n\t" " ld1 {v22.d}[1], [x14], x9 \n\t" " ld1 {v23.d}[0], [x14], x9 \n\t" " ld1 {v23.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " \n\t" " ldr q24, [x1, #16*0] \n\t" // Load B. 
" ldr q25, [x1, #16*1] \n\t" " ldr q26, [x1, #16*2] \n\t" " ldr q27, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" " ldr q28, [x1, #16*0] \n\t" " ldr q29, [x1, #16*1] \n\t" " ldr q30, [x1, #16*2] \n\t" " ldr q31, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" LABEL(CLEAR_CCOLS) CLEAR8V(0,1,2,3,4,5,6,7) CLEAR8V(8,9,10,11,12,13,14,15) // No-microkernel early return, once again. BEQ(K_LEFT_LOOP) // // Microkernel is defined here as: #define DGEMM_4X8_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,B0,B1,B2,B3) \ DGEMM_4X8_MKER_LOOP_PLAIN_LOC(A0,A1,B0,B1,B2,B3,x1,0,16*1,16*2,load) \ "ldr q"#B3", [x1, #16*3] \n\t" \ "mov x14, x0 \n\t" \ "ld1 {v"#A0".d}[0], [x14], x9 \n\t" \ "ld1 {v"#A0".d}[1], [x14], x9 \n\t" \ "ld1 {v"#A1".d}[0], [x14], x9 \n\t" \ "ld1 {v"#A1".d}[1], [x14], x9 \n\t" \ "add x0, x0, x2 \n\t" \ "add x1, x1, x3 \n\t" // Start microkernel loop. LABEL(K_MKER_LOOP) " \n\t" // Decrease counter before final replica. " subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. BEQ(FIN_MKER_LOOP) DGEMM_4X8_MKER_LOOP_PLAIN_LOC_FWD(16,17,24,25,26,27) DGEMM_4X8_MKER_LOOP_PLAIN_LOC_FWD(18,19,28,29,30,31) DGEMM_4X8_MKER_LOOP_PLAIN_LOC_FWD(20,21,24,25,26,27) DGEMM_4X8_MKER_LOOP_PLAIN_LOC_FWD(22,23,28,29,30,31) BRANCH(K_MKER_LOOP) // // Final microkernel loop. LABEL(FIN_MKER_LOOP) DGEMM_4X8_MKER_LOOP_PLAIN_LOC(16,17,24,25,26,27,x1,0,16*1,16*2,load) " ldr q27, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" DGEMM_4X8_MKER_LOOP_PLAIN_LOC(18,19,28,29,30,31,x1,0,16*1,16*2,load) " ldr q31, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" DGEMM_4X8_MKER_LOOP_PLAIN_LOC(20,21,24,25,26,27,xzr,-1,-1,-1,noload) DGEMM_4X8_MKER_LOOP_PLAIN_LOC(22,23,28,29,30,31,xzr,-1,-1,-1,noload) // // Loops left behind microkernels. LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of exec. BEQ(WRITE_MEM_PREP) " mov x14, x0 \n\t" // Load A col. 
" ld1 {v16.d}[0], [x14], x9 \n\t" " ld1 {v16.d}[1], [x14], x9 \n\t" " ld1 {v17.d}[0], [x14], x9 \n\t" " ld1 {v17.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " ldr q24, [x1, #16*0] \n\t" // Load B row. " ldr q25, [x1, #16*1] \n\t" " ldr q26, [x1, #16*2] \n\t" " ldr q27, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" " sub x8, x8, #1 \n\t" DGEMM_4X8_MKER_LOOP_PLAIN_LOC(16,17,24,25,26,27,xzr,-1,-1,-1,noload) BRANCH(K_LEFT_LOOP) // // Scale and write to memory. LABEL(WRITE_MEM_PREP) " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" " \n\t" " mov x1, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. " cmp x7, #8 \n\t" // Check for column-storage. BNE(WRITE_MEM_C) // // C storage in rows. LABEL(WRITE_MEM_R) " ld1r {v16.2d}, [x4] \n\t" // Load alpha & beta. " ld1r {v17.2d}, [x8] \n\t" " fcmp d17, #0.0 \n\t" DSCALE8V(0,1,2,3,4,5,6,7,16,0) DSCALE8V(8,9,10,11,12,13,14,15,16,0) BEQ(ZERO_BETA_R) DLOADC_4V_R_FWD(20,21,22,23,x1,0,x6) DLOADC_4V_R_FWD(24,25,26,27,x1,0,x6) DSCALEA8V(0,1,2,3,4,5,6,7,20,21,22,23,24,25,26,27,17,0) // DLOADC_4V_R_FWD(20,21,22,23,x1,0,x6) DLOADC_4V_R_FWD(24,25,26,27,x1,0,x6) DSCALEA8V(8,9,10,11,12,13,14,15,20,21,22,23,24,25,26,27,17,0) LABEL(ZERO_BETA_R) #ifndef __clang__ " cmp x12, #1 \n\t" BRANCH(PRFM_END_R) " prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" " prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" " prfm PLDL1STRM, [%[b_next], #16*0] \n\t" " prfm PLDL1STRM, [%[b_next], #16*1] \n\t" LABEL(PRFM_END_R) #endif // DSTOREC_4V_R_FWD(0,1,2,3,x5,0,x6) DSTOREC_4V_R_FWD(4,5,6,7,x5,0,x6) DSTOREC_4V_R_FWD(8,9,10,11,x5,0,x6) DSTOREC_4V_R_FWD(12,13,14,15,x5,0,x6) BRANCH(END_WRITE_MEM) // // C storage in columns. LABEL(WRITE_MEM_C) // In-register transpose. " trn1 v16.2d, v0.2d, v4.2d \n\t" // Column 0. " trn1 v17.2d, v8.2d, v12.2d \n\t" " trn2 v18.2d, v0.2d, v4.2d \n\t" // Column 1. " trn2 v19.2d, v8.2d, v12.2d \n\t" " trn1 v20.2d, v1.2d, v5.2d \n\t" // Column 2. 
" trn1 v21.2d, v9.2d, v13.2d \n\t" " trn2 v22.2d, v1.2d, v5.2d \n\t" // Column 3. " trn2 v23.2d, v9.2d, v13.2d \n\t" " trn1 v24.2d, v2.2d, v6.2d \n\t" // Column 4. " trn1 v25.2d, v10.2d, v14.2d \n\t" " trn2 v26.2d, v2.2d, v6.2d \n\t" // Column 5. " trn2 v27.2d, v10.2d, v14.2d \n\t" " trn1 v28.2d, v3.2d, v7.2d \n\t" // Column 6. " trn1 v29.2d, v11.2d, v15.2d \n\t" " trn2 v30.2d, v3.2d, v7.2d \n\t" // Column 7. " trn2 v31.2d, v11.2d, v15.2d \n\t" " ld1r {v14.2d}, [x4] \n\t" // Load alpha & beta. " ld1r {v15.2d}, [x8] \n\t" DSCALE8V(16,17,18,19,20,21,22,23,14,0) DSCALE8V(24,25,26,27,28,29,30,31,14,0) DLOADC_4V_C_FWD(0,1,2,3,x1,0,x7) DLOADC_4V_C_FWD(4,5,6,7,x1,0,x7) DSCALEA8V(16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,15,0) // DLOADC_4V_C_FWD(0,1,2,3,x1,0,x7) DLOADC_4V_C_FWD(4,5,6,7,x1,0,x7) DSCALEA8V(24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,15,0) #ifndef __clang__ " cmp x12, #1 \n\t" BRANCH(PRFM_END_C) " prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" " prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" " prfm PLDL1STRM, [%[b_next], #16*0] \n\t" " prfm PLDL1STRM, [%[b_next], #16*1] \n\t" LABEL(PRFM_END_C) #endif // DSTOREC_4V_C_FWD(16,17,18,19,x5,0,x7) DSTOREC_4V_C_FWD(20,21,22,23,x5,0,x7) DSTOREC_4V_C_FWD(24,25,26,27,x5,0,x7) DSTOREC_4V_C_FWD(28,29,30,31,x5,0,x7) // // End of this microkernel. LABEL(END_WRITE_MEM) " \n\t" " subs x12, x12, #1 \n\t" BEQ(END_EXEC) " \n\t" " mov x8, #4 \n\t" " madd x13, x6, x8, x13 \n\t" // Forward C's base address to the next logic panel. " add x10, x10, x11 \n\t" // Forward A's base address to the next logic panel. BRANCH(MILLIKER_MLOOP) // // End of execution. LABEL(END_EXEC) : : [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [ps_a] "m" (ps_a), [rs_b] "m" (rs_b), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), // In Clang, even "m"-passed parameter takes 1 register. // Have to disable prefetching to pass compilation. 
#ifndef __clang__ [a_next] "r" (a_next), [b_next] "r" (b_next), #endif [m_iter] "m" (m_iter), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta) : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10","x11","x12","x13","x14", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15", "v16","v17","v18","v19","v20","v21","v22","v23", "v24","v25","v26","v27","v28","v29","v30","v31" ); consider_edge_cases: // TODO: Implement optimized kernel for this. // // Forward address. a = a + m_iter * ps_a; c = c + m_iter * 4 * rs_c; if ( m_left ) { bli_dgemmsup_r_armv8a_ref2 ( conja, conjb, m_left, 8, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); } } cython-blis-0.9.1/blis/_src/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c000066400000000000000000000444011427272030600277740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"
#include "assert.h"

GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )

// Label locality & misc.
#include "../armv8a_asm_utils.h"

// Nanokernel operations.
#include "../armv8a_asm_d2x2.h"

/*
 * Order in which DGEMM_4X8_MKER_LOOP_PLAIN issues its eight 2x2 blocks
 * (two block-rows by four block-columns):
 *
 * +---+ +---+ +---+ +---+
 * | 0 | | 2 | | 4 | | 6 |
 * +---+ +---+ +---+ +---+
 * +---+ +---+ +---+ +---+
 * | 1 | | 3 | | 5 | | 7 |
 * +---+ +---+ +---+ +---+
 */
// One k-step of the 4x8 update, written as eight DGEMM_2X2_NANOKERNEL
// invocations (that macro is defined in armv8a_asm_d2x2.h, not here).
//   Cij       : register numbers of the 16 accumulators.
//   A0, A1    : register numbers holding the A operands of this step.
//   B0..B3    : register numbers holding the B operands of this step.
//   BADDR, BSHIFT0..2 : base register and offsets used to reload B0..B2.
//   LOADNEXT  : `load` or `noload`; it is token-pasted onto DGEMM_LOAD1V_
//               to select whether B0, B1 and B2 are reloaded between the
//               nanokernels. B3 is never reloaded by this macro.
#define DGEMM_4X8_MKER_LOOP_PLAIN(C00,C01,C02,C03,C10,C11,C12,C13,C20,C21,C22,C23,C30,C31,C32,C33,A0,A1,B0,B1,B2,B3,BADDR,BSHIFT0,BSHIFT1,BSHIFT2,LOADNEXT) \
  DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \
  DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \
  DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT0) \
  DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \
  DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \
  DGEMM_LOAD1V_ ##LOADNEXT (B1,BADDR,BSHIFT1) \
  DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \
  DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \
  DGEMM_LOAD1V_ ##LOADNEXT (B2,BADDR,BSHIFT2) \
  DGEMM_2X2_NANOKERNEL(C03,C13,B3,A0) \
  DGEMM_2X2_NANOKERNEL(C23,C33,B3,A1)

// Interleaving load or not.
// The `noload` variant expands to nothing; the `load` variant emits a single
// 128-bit `ldr q<V1>, [<ADDR>, #<IMM>]`.
#define DGEMM_LOAD1V_noload(V1,ADDR,IMM)
#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t"

// Prefetch C in the long direction.
#define DPRFMC_FWD(CADDR,DLONGC) \ " prfm PLDL1KEEP, ["#CADDR"] \n\t" \ " add "#CADDR", "#CADDR", "#DLONGC" \n\t" #define DLOADC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \ DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#RSC" \n\t" #define DSTOREC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \ DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#RSC" \n\t" #define DLOADC_4V_C_FWD(C00,C10,C01,C11,CADDR,CSHIFT,CSC) \ DLOAD2V(C00,C10,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#CSC" \n\t" \ DLOAD2V(C01,C11,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#CSC" \n\t" #define DSTOREC_4V_C_FWD(C00,C10,C01,C11,CADDR,CSHIFT,CSC) \ DSTORE2V(C00,C10,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#CSC" \n\t" \ DSTORE2V(C01,C11,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#CSC" \n\t" /* * 4x8 dgemmsup kernel with extending 2nd dimension. * * Recommanded usage case: * o 16 < (L1 cache latency) * (Num. FPU) < 25. * o L1 cache has a bandwidth not too low (true in most cases). * o (FMLA latency) * (Num. FPU) < 32 cycles (true in almost all cases). */ void bli_dgemmsup_rv_armv8a_asm_4x8n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Fixme: This uker has no dispatching for unalighed sizes. // Currently it only serves as a dispatch target for other kernels // and cannot be registered in configurations. assert( m0 == 4 ); // LLVM has very bad routing ability for inline asm. // Limit number of registers in case of Clang compilation. #ifndef __clang__ void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); #endif uint64_t ps_b = bli_auxinfo_ps_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_mker = k0 / 4; uint64_t k_left = k0 % 4; int64_t n_iter = n0 / 8; int64_t n_left = n0 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // uint64_t cs_b = cs_b0; assert( cs_b0 == 1 ); if ( n_iter == 0 ) goto consider_edge_cases; __asm__ volatile ( " ldr x10, %[b] \n\t" " ldr x13, %[c] \n\t" " ldr x12, %[n_iter] \n\t" " ldr x11, %[ps_b] \n\t" // Panel-skip of B. " ldr x3, %[rs_b] \n\t" // Row-skip of B. " ldr x9, %[rs_a] \n\t" // Row-skip of A. " ldr x2, %[cs_a] \n\t" // Column-skip of A. " \n\t" " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. " \n\t" " \n\t" // Multiply some address skips by sizeof(double). " lsl x11, x11, #3 \n\t" // ps_b " lsl x9, x9, #3 \n\t" // rs_a " lsl x2, x2, #3 \n\t" // cs_a " lsl x3, x3, #3 \n\t" // rs_b " lsl x6, x6, #3 \n\t" // rs_c " lsl x7, x7, #3 \n\t" // cs_c " \n\t" " mov x1, x5 \n\t" " cmp x7, #8 \n\t" // Prefetch column-strided C. BEQ(C_PREFETCH_COLS) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) BRANCH(C_PREFETCH_END) LABEL(C_PREFETCH_COLS) // This prefetch will not cover further mker perts. Skip. // // DPRFMC_FWD(x1,x7) // DPRFMC_FWD(x1,x7) // DPRFMC_FWD(x1,x7) // DPRFMC_FWD(x1,x7) // DPRFMC_FWD(x1,x7) // DPRFMC_FWD(x1,x7) // DPRFMC_FWD(x1,x7) // DPRFMC_FWD(x1,x7) LABEL(C_PREFETCH_END) // // Millikernel. LABEL(MILLIKER_MLOOP) " \n\t" " mov x1, x10 \n\t" // Parameters to be reloaded " mov x5, x13 \n\t" // within each millikernel loop. " ldr x0, %[a] \n\t" " ldr x4, %[k_mker] \n\t" " ldr x8, %[k_left] \n\t" " \n\t" // Storage scheme: // V[ 0:15] <- C // V[16:23] <- A; Allowed latency: 48 cycles / # of FPUs. // V[24:31] <- B; Allowed latency: 28 cycles / # of FPUs. 
// Under this scheme, the following is defined: #define DGEMM_4X8_MKER_LOOP_PLAIN_LOC(A0,A1,B0,B1,B2,B3,BADDR,BSHIFT0,BSHIFT1,BSHIFT2,LOADNEXT) \ DGEMM_4X8_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,A0,A1,B0,B1,B2,B3,BADDR,BSHIFT0,BSHIFT1,BSHIFT2,LOADNEXT) LABEL(LOAD_ABC) " \n\t" // No-microkernel early return is a must " cmp x4, #0 \n\t" // to avoid out-of-boundary read. BEQ(CLEAR_CCOLS) " \n\t" " ldr q24, [x1, #16*0] \n\t" // Load B first. " ldr q25, [x1, #16*1] \n\t" " ldr q26, [x1, #16*2] \n\t" " ldr q27, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" " ldr q28, [x1, #16*0] \n\t" " ldr q29, [x1, #16*1] \n\t" " ldr q30, [x1, #16*2] \n\t" " ldr q31, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" " \n\t" " mov x14, x0 \n\t" // Load A. " ld1 {v16.d}[0], [x14], x9 \n\t" // We want A to be kept in L1. " ld1 {v16.d}[1], [x14], x9 \n\t" " ld1 {v17.d}[0], [x14], x9 \n\t" " ld1 {v17.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " mov x14, x0 \n\t" " ld1 {v18.d}[0], [x14], x9 \n\t" " ld1 {v18.d}[1], [x14], x9 \n\t" " ld1 {v19.d}[0], [x14], x9 \n\t" " ld1 {v19.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " mov x14, x0 \n\t" " ld1 {v20.d}[0], [x14], x9 \n\t" " ld1 {v20.d}[1], [x14], x9 \n\t" " ld1 {v21.d}[0], [x14], x9 \n\t" " ld1 {v21.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " mov x14, x0 \n\t" " ld1 {v22.d}[0], [x14], x9 \n\t" " ld1 {v22.d}[1], [x14], x9 \n\t" " ld1 {v23.d}[0], [x14], x9 \n\t" " ld1 {v23.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" LABEL(CLEAR_CCOLS) CLEAR8V(0,1,2,3,4,5,6,7) CLEAR8V(8,9,10,11,12,13,14,15) // No-microkernel early return, once again. 
BEQ(K_LEFT_LOOP) // // Microkernel is defined here as: #define DGEMM_4X8_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,B0,B1,B2,B3) \ DGEMM_4X8_MKER_LOOP_PLAIN_LOC(A0,A1,B0,B1,B2,B3,x1,0,16*1,16*2,load) \ "ldr q"#B3", [x1, #16*3] \n\t" \ "mov x14, x0 \n\t" \ "ld1 {v"#A0".d}[0], [x14], x9 \n\t" \ "ld1 {v"#A0".d}[1], [x14], x9 \n\t" \ "ld1 {v"#A1".d}[0], [x14], x9 \n\t" \ "ld1 {v"#A1".d}[1], [x14], x9 \n\t" \ "add x0, x0, x2 \n\t" \ "add x1, x1, x3 \n\t" // Start microkernel loop. LABEL(K_MKER_LOOP) " \n\t" // Decrease counter before final replica. " subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. BEQ(FIN_MKER_LOOP) DGEMM_4X8_MKER_LOOP_PLAIN_LOC_FWD(16,17,24,25,26,27) DGEMM_4X8_MKER_LOOP_PLAIN_LOC_FWD(18,19,28,29,30,31) DGEMM_4X8_MKER_LOOP_PLAIN_LOC_FWD(20,21,24,25,26,27) DGEMM_4X8_MKER_LOOP_PLAIN_LOC_FWD(22,23,28,29,30,31) BRANCH(K_MKER_LOOP) // // Final microkernel loop. LABEL(FIN_MKER_LOOP) DGEMM_4X8_MKER_LOOP_PLAIN_LOC(16,17,24,25,26,27,x1,0,16*1,16*2,load) " ldr q27, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" DGEMM_4X8_MKER_LOOP_PLAIN_LOC(18,19,28,29,30,31,x1,0,16*1,16*2,load) " ldr q31, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" DGEMM_4X8_MKER_LOOP_PLAIN_LOC(20,21,24,25,26,27,xzr,-1,-1,-1,noload) DGEMM_4X8_MKER_LOOP_PLAIN_LOC(22,23,28,29,30,31,xzr,-1,-1,-1,noload) // // Loops left behind microkernels. LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of exec. BEQ(WRITE_MEM_PREP) " ldr q24, [x1, #16*0] \n\t" // Load B row. " ldr q25, [x1, #16*1] \n\t" " ldr q26, [x1, #16*2] \n\t" " ldr q27, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" " mov x14, x0 \n\t" // Load A col. " ld1 {v16.d}[0], [x14], x9 \n\t" " ld1 {v16.d}[1], [x14], x9 \n\t" " ld1 {v17.d}[0], [x14], x9 \n\t" " ld1 {v17.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " sub x8, x8, #1 \n\t" DGEMM_4X8_MKER_LOOP_PLAIN_LOC(16,17,24,25,26,27,xzr,-1,-1,-1,noload) BRANCH(K_LEFT_LOOP) // // Scale and write to memory. LABEL(WRITE_MEM_PREP) " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). 
" ldr x8, %[beta] \n\t" " \n\t" " mov x1, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. " cmp x7, #8 \n\t" // Check for column-storage. BNE(WRITE_MEM_C) // // C storage in rows. LABEL(WRITE_MEM_R) " ld1r {v16.2d}, [x4] \n\t" // Load alpha & beta. " ld1r {v17.2d}, [x8] \n\t" " fcmp d17, #0.0 \n\t" DSCALE8V(0,1,2,3,4,5,6,7,16,0) DSCALE8V(8,9,10,11,12,13,14,15,16,0) BEQ(ZERO_BETA_R) DLOADC_4V_R_FWD(20,21,22,23,x1,0,x6) DLOADC_4V_R_FWD(24,25,26,27,x1,0,x6) DSCALEA8V(0,1,2,3,4,5,6,7,20,21,22,23,24,25,26,27,17,0) // DLOADC_4V_R_FWD(20,21,22,23,x1,0,x6) DLOADC_4V_R_FWD(24,25,26,27,x1,0,x6) DSCALEA8V(8,9,10,11,12,13,14,15,20,21,22,23,24,25,26,27,17,0) LABEL(ZERO_BETA_R) #ifndef __clang__ " cmp x12, #1 \n\t" BRANCH(PRFM_END_R) " prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" " prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" " prfm PLDL1STRM, [%[b_next], #16*0] \n\t" " prfm PLDL1STRM, [%[b_next], #16*1] \n\t" LABEL(PRFM_END_R) #endif // DSTOREC_4V_R_FWD(0,1,2,3,x5,0,x6) DSTOREC_4V_R_FWD(4,5,6,7,x5,0,x6) DSTOREC_4V_R_FWD(8,9,10,11,x5,0,x6) DSTOREC_4V_R_FWD(12,13,14,15,x5,0,x6) BRANCH(END_WRITE_MEM) // // C storage in columns. LABEL(WRITE_MEM_C) // In-register transpose. " trn1 v16.2d, v0.2d, v4.2d \n\t" // Column 0. " trn1 v17.2d, v8.2d, v12.2d \n\t" " trn2 v18.2d, v0.2d, v4.2d \n\t" // Column 1. " trn2 v19.2d, v8.2d, v12.2d \n\t" " trn1 v20.2d, v1.2d, v5.2d \n\t" // Column 2. " trn1 v21.2d, v9.2d, v13.2d \n\t" " trn2 v22.2d, v1.2d, v5.2d \n\t" // Column 3. " trn2 v23.2d, v9.2d, v13.2d \n\t" " trn1 v24.2d, v2.2d, v6.2d \n\t" // Column 4. " trn1 v25.2d, v10.2d, v14.2d \n\t" " trn2 v26.2d, v2.2d, v6.2d \n\t" // Column 5. " trn2 v27.2d, v10.2d, v14.2d \n\t" " trn1 v28.2d, v3.2d, v7.2d \n\t" // Column 6. " trn1 v29.2d, v11.2d, v15.2d \n\t" " trn2 v30.2d, v3.2d, v7.2d \n\t" // Column 7. " trn2 v31.2d, v11.2d, v15.2d \n\t" " ld1r {v14.2d}, [x4] \n\t" // Load alpha & beta. 
" ld1r {v15.2d}, [x8] \n\t" DSCALE8V(16,17,18,19,20,21,22,23,14,0) DSCALE8V(24,25,26,27,28,29,30,31,14,0) DLOADC_4V_C_FWD(0,1,2,3,x1,0,x7) DLOADC_4V_C_FWD(4,5,6,7,x1,0,x7) DSCALEA8V(16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,15,0) // DLOADC_4V_C_FWD(0,1,2,3,x1,0,x7) DLOADC_4V_C_FWD(4,5,6,7,x1,0,x7) DSCALEA8V(24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,15,0) #ifndef __clang__ " cmp x12, #1 \n\t" BRANCH(PRFM_END_C) " prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" " prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" " prfm PLDL1STRM, [%[b_next], #16*0] \n\t" " prfm PLDL1STRM, [%[b_next], #16*1] \n\t" LABEL(PRFM_END_C) #endif // DSTOREC_4V_C_FWD(16,17,18,19,x5,0,x7) DSTOREC_4V_C_FWD(20,21,22,23,x5,0,x7) DSTOREC_4V_C_FWD(24,25,26,27,x5,0,x7) DSTOREC_4V_C_FWD(28,29,30,31,x5,0,x7) // // End of this microkernel. LABEL(END_WRITE_MEM) " \n\t" " subs x12, x12, #1 \n\t" BEQ(END_EXEC) " \n\t" " mov x8, #8 \n\t" " madd x13, x7, x8, x13 \n\t" // Forward C's base address to the next logic panel. " add x10, x10, x11 \n\t" // Forward B's base address to the next logic panel. BRANCH(MILLIKER_MLOOP) // // End of execution. LABEL(END_EXEC) : : [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [ps_b] "m" (ps_b), [rs_b] "m" (rs_b), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), // In Clang, even "m"-passed parameter takes 1 register. // Have to disable prefetching to pass compilation. #ifndef __clang__ [a_next] "r" (a_next), [b_next] "r" (b_next), #endif [n_iter] "m" (n_iter), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta) : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10","x11","x12","x13","x14", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15", "v16","v17","v18","v19","v20","v21","v22","v23", "v24","v25","v26","v27","v28","v29","v30","v31" ); consider_edge_cases: // TODO: Implement optimized kernel for this. // // Forward address. 
b = b + n_iter * ps_b; c = c + n_iter * 8 * cs_c; if ( n_left ) { auxinfo_t data_d6x4mn = *data; bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn ); bli_dgemmsup_rv_armv8a_int_6x4mn ( conja, conjb, 4, n_left, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx ); } } cython-blis-0.9.1/blis/_src/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c000066400000000000000000000516411427272030600300010ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2021, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
   IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"
#include "assert.h"

GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 )

// Label locality & misc.
#include "../armv8a_asm_utils.h"

// Nanokernel operations.
#include "../armv8a_asm_d2x2.h"

/* Order of row-major DGEMM_6x8's execution in 2x2 blocks:
 *
 * +---+ +---+ +---+ +---+
 * | 0 | | 1 | | 6 | | 7 |
 * +---+ +---+ +---+ +---+
 * +---+ +---+ +---+ +---+
 * | 2 | | 3 | | 8 | | 9 |
 * +---+ +---+ +---+ +---+
 * +---+ +---+ +---+ +---+
 * | 4 | | 5 | | 10| | 11|
 * +---+ +---+ +---+ +---+
 *
 */
// One k-step of the 6x8 update, written as twelve DGEMM_2X2_NANOKERNEL
// invocations (that macro is defined in armv8a_asm_d2x2.h, not here).
//   Cij        : register numbers of the 24 accumulators.
//   A0..A2     : register numbers holding the A operands of this step.
//   B0..B3     : register numbers holding the B operands of this step.
//   AELEMADDR, AELEMST : address register and post-increment used to
//                reload A0 and A1 element by element.
//   BADDR, BSHIFT : base register and offset used to reload B0 and B1.
//   LOADNEXT   : `load` or `noload`; it is token-pasted onto
//                DGEMM_LOAD2V_ / DGEMM_LOAD1V_G_ to select whether those
//                reloads are emitted. A2, B2 and B3 are never reloaded by
//                this macro.
#define DGEMM_6X8_MKER_LOOP_PLAIN(C00,C01,C02,C03,C10,C11,C12,C13,C20,C21,C22,C23,C30,C31,C32,C33,C40,C41,C42,C43,C50,C51,C52,C53,A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
  DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \
  DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \
  DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \
  DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \
  DGEMM_2X2_NANOKERNEL(C40,C50,B0,A2) \
  DGEMM_2X2_NANOKERNEL(C41,C51,B1,A2) \
  DGEMM_LOAD2V_ ##LOADNEXT (B0,B1,BADDR,BSHIFT) \
  DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \
  DGEMM_2X2_NANOKERNEL(C03,C13,B3,A0) \
  DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \
  DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \
  DGEMM_2X2_NANOKERNEL(C23,C33,B3,A1) \
  DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) \
  DGEMM_2X2_NANOKERNEL(C42,C52,B2,A2) \
  DGEMM_2X2_NANOKERNEL(C43,C53,B3,A2)

// Interleaving load or not.
#define DGEMM_LOAD1V_noload(V1,ADDR,IMM) #define DGEMM_LOAD1V_load(V1,ADDR,IMM) \ " ldr q"#V1", ["#ADDR", #"#IMM"] \n\t" #define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM) #define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \ DGEMM_LOAD1V_load(V1,ADDR,IMM) \ DGEMM_LOAD1V_load(V2,ADDR,IMM+16) #define DGEMM_LOAD1V_G_noload(V1,ADDR,ST) #define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \ " ld1 {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \ " ld1 {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t" // Prefetch C in the long direction. #define DPRFMC_FWD(CADDR,DLONGC) \ " prfm PLDL1KEEP, ["#CADDR"] \n\t" \ " add "#CADDR", "#CADDR", "#DLONGC" \n\t" // For row-storage of C. #define DLOADC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \ DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#RSC" \n\t" #define DSTOREC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \ DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#RSC" \n\t" // For column-storage of C. #define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ DLOAD2V(C0,C1,CADDR,CSHIFT) \ DLOAD1V(C2,CADDR,CSHIFT+32) \ " add "#CADDR", "#CADDR", "#CSC" \n\t" #define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ DSTORE2V(C0,C1,CADDR,CSHIFT) \ DSTORE1V(C2,CADDR,CSHIFT+32) \ " add "#CADDR", "#CADDR", "#CSC" \n\t" #define DSCALE6V(V0,V1,V2,V3,V4,V5,A,IDX) \ DSCALE4V(V0,V1,V2,V3,A,IDX) \ DSCALE2V(V4,V5,A,IDX) #define DSCALEA6V(D0,D1,D2,D3,D4,D5,S0,S1,S2,S3,S4,S5,A,IDX) \ DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ DSCALEA2V(D4,D5,S4,S5,A,IDX) /* * 6x8 dgemmsup kernel with extending 1st dimension. * * Recommanded usage case: (L1 cache latency) * (Num. FPU) < 17 cycles. * * Calls 4x8 for edge cases. 
*/ void bli_dgemmsup_rv_armv8a_asm_6x8m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { if ( n0 != 8 ) { if ( n0 < 8 ) { for ( ; n0 >= 4; n0 -= 4 ) { dgemmsup_ker_ft ukr_fp; auxinfo_t data_d8xkm = *data; if ( bli_auxinfo_ps_a( data ) == 6 * rs_a0 ) { // Use 8x4 Asm kernel for the unpacked case. bli_auxinfo_set_ps_a( 8 * rs_a0, &data_d8xkm ); ukr_fp = bli_dgemmsup_rv_armv8a_asm_8x4m; } else { // Cannot change dimension for m when A is packed. ukr_fp = bli_dgemmsup_rv_armv8a_int_6x4mn; } ukr_fp ( conja, conjb, m0, 4, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, &data_d8xkm, cntx ); b += 4 * cs_b0; c += 4 * cs_c0; } if ( n0 > 0 ) { bli_dgemmsup_rv_armv8a_int_6x4mn ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); } } else { assert( FALSE ); } return; } // LLVM has very bad routing ability for inline asm. // Limit number of registers in case of Clang compilation. #ifndef __clang__ void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); #endif uint64_t ps_a = bli_auxinfo_ps_a( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_mker = k0 / 4; uint64_t k_left = k0 % 4; int64_t m_iter = m0 / 6; int64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // uint64_t cs_b = cs_b0; assert( cs_b0 == 1 ); if ( m_iter == 0 ) goto consider_edge_cases; __asm__ volatile ( " ldr x10, %[a] \n\t" " ldr x13, %[c] \n\t" " ldr x12, %[m_iter] \n\t" " ldr x11, %[ps_a] \n\t" // Panel-skip of A. " ldr x9, %[rs_a] \n\t" // Row-skip of A. 
" ldr x2, %[cs_a] \n\t" // Column-skip of A. " ldr x3, %[rs_b] \n\t" // Row-skip of B. " \n\t" " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. " \n\t" " \n\t" // Multiply some address skips by sizeof(double). " lsl x11, x11, #3 \n\t" // ps_a " lsl x9, x9, #3 \n\t" // rs_a " lsl x2, x2, #3 \n\t" // cs_a " lsl x3, x3, #3 \n\t" // rs_b " lsl x6, x6, #3 \n\t" // rs_c " lsl x7, x7, #3 \n\t" // cs_c " \n\t" " mov x1, x5 \n\t" " cmp x7, #8 \n\t" // Prefetch column-strided C. BEQ(C_PREFETCH_COLS) // This prefetch will not cover further mker perts. Skip. // // DPRFMC_FWD(x1,x6) // DPRFMC_FWD(x1,x6) // DPRFMC_FWD(x1,x6) // DPRFMC_FWD(x1,x6) // DPRFMC_FWD(x1,x6) // DPRFMC_FWD(x1,x6) BRANCH(C_PREFETCH_END) LABEL(C_PREFETCH_COLS) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) DPRFMC_FWD(x1,x7) LABEL(C_PREFETCH_END) // // Millikernel. LABEL(MILLIKER_MLOOP) " \n\t" " mov x0, x10 \n\t" // Parameters to be reloaded " mov x5, x13 \n\t" // within each millikernel loop. " ldr x1, %[b] \n\t" " ldr x4, %[k_mker] \n\t" " ldr x8, %[k_left] \n\t" " \n\t" // Storage scheme: // V[ 0:23] <- C // V[24:27] <- A // V[28:31] <- B // Under this scheme, the following is defined: #define DGEMM_6X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ DGEMM_6X8_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) // Load from memory. LABEL(LOAD_ABC) " \n\t" // No-microkernel early return is a must " cmp x4, #0 \n\t" // to avoid out-of-boundary read. BEQ(CLEAR_CCOLS) " \n\t" " mov x14, x0 \n\t" // Load A. 
" ld1 {v24.d}[0], [x14], x9 \n\t" " ld1 {v24.d}[1], [x14], x9 \n\t" " ld1 {v25.d}[0], [x14], x9 \n\t" " ld1 {v25.d}[1], [x14], x9 \n\t" " ld1 {v26.d}[0], [x14], x9 \n\t" " ld1 {v26.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " mov x14, x0 \n\t" " ld1 {v27.d}[0], [x14], x9 \n\t" " ld1 {v27.d}[1], [x14], x9 \n\t" " \n\t" " ldr q28, [x1, #16*0] \n\t" // Load B. " ldr q29, [x1, #16*1] \n\t" " ldr q30, [x1, #16*2] \n\t" " ldr q31, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" LABEL(CLEAR_CCOLS) CLEAR8V(0,1,2,3,4,5,6,7) CLEAR8V(8,9,10,11,12,13,14,15) CLEAR8V(16,17,18,19,20,21,22,23) // No-microkernel early return, once again. BEQ(K_LEFT_LOOP) // // Microkernel is defined here as: #define DGEMM_6X8_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,B3) \ DGEMM_6X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,x14,x9,x1,0,load) \ "add x0, x0, x2 \n\t" \ "mov x14, x0 \n\t" \ "ld1 {v"#A2".d}[0], [x14], x9 \n\t" \ "ld1 {v"#A2".d}[1], [x14], x9 \n\t" \ "ldr q"#B2", [x1, #16*2] \n\t" \ "ldr q"#B3", [x1, #16*3] \n\t" \ "add x1, x1, x3 \n\t" // Start microkernel loop. LABEL(K_MKER_LOOP) DGEMM_6X8_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,28,29,30,31) DGEMM_6X8_MKER_LOOP_PLAIN_LOC_FWD(27,24,25,28,29,30,31) " \n\t" // Decrease counter before final replica. " subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. BEQ(FIN_MKER_LOOP) DGEMM_6X8_MKER_LOOP_PLAIN_LOC_FWD(26,27,24,28,29,30,31) DGEMM_6X8_MKER_LOOP_PLAIN_LOC_FWD(25,26,27,28,29,30,31) BRANCH(K_MKER_LOOP) // // Final microkernel loop. LABEL(FIN_MKER_LOOP) DGEMM_6X8_MKER_LOOP_PLAIN_LOC(26,27,24,28,29,30,31,x14,x9,x1,0,load) " add x0, x0, x2 \n\t" " ldr q30, [x1, #16*2] \n\t" " ldr q31, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" DGEMM_6X8_MKER_LOOP_PLAIN_LOC(25,26,27,28,29,30,31,xzr,-1,xzr,-1,noload) // // Loops left behind microkernels. LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of exec. BEQ(WRITE_MEM_PREP) " mov x14, x0 \n\t" " ld1 {v24.d}[0], [x14], x9 \n\t" // Load A col. 
" ld1 {v24.d}[1], [x14], x9 \n\t" " ld1 {v25.d}[0], [x14], x9 \n\t" " ld1 {v25.d}[1], [x14], x9 \n\t" " ld1 {v26.d}[0], [x14], x9 \n\t" " ld1 {v26.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " ldr q28, [x1, #16*0] \n\t" // Load B row. " ldr q29, [x1, #16*1] \n\t" " ldr q30, [x1, #16*2] \n\t" " ldr q31, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" " sub x8, x8, #1 \n\t" DGEMM_6X8_MKER_LOOP_PLAIN_LOC(24,25,26,28,29,30,31,xzr,-1,xzr,-1,noload) BRANCH(K_LEFT_LOOP) // // Scale and write to memory. LABEL(WRITE_MEM_PREP) " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" " \n\t" " mov x1, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. " cmp x7, #8 \n\t" // Check for column-storage. BNE(WRITE_MEM_C) // // C storage in rows. LABEL(WRITE_MEM_R) " ld1r {v24.2d}, [x4] \n\t" // Load alpha & beta. " ld1r {v25.2d}, [x8] \n\t" " fmov d26, #1.0 \n\t" " fcmp d24, d26 \n\t" BEQ(UNIT_ALPHA_R) DSCALE8V(0,1,2,3,4,5,6,7,24,0) DSCALE8V(8,9,10,11,12,13,14,15,24,0) DSCALE8V(16,17,18,19,20,21,22,23,24,0) LABEL(UNIT_ALPHA_R) " fcmp d25, #0.0 \n\t" BEQ(ZERO_BETA_R_1) DLOADC_4V_R_FWD(26,27,28,29,x1,0,x6) DSCALEA4V(0,1,2,3,26,27,28,29,25,0) DLOADC_4V_R_FWD(26,27,28,29,x1,0,x6) DSCALEA4V(4,5,6,7,26,27,28,29,25,0) LABEL(ZERO_BETA_R_1) DSTOREC_4V_R_FWD(0,1,2,3,x5,0,x6) BEQ(ZERO_BETA_R_2) DLOADC_4V_R_FWD(26,27,28,29,x1,0,x6) DLOADC_4V_R_FWD(0,1,2,3,x1,0,x6) DSCALEA8V(8,9,10,11,12,13,14,15,26,27,28,29,0,1,2,3,25,0) DLOADC_4V_R_FWD(26,27,28,29,x1,0,x6) DLOADC_4V_R_FWD(0,1,2,3,x1,0,x6) DSCALEA8V(16,17,18,19,20,21,22,23,26,27,28,29,0,1,2,3,25,0) LABEL(ZERO_BETA_R_2) #ifndef __clang__ " cmp x12, #1 \n\t" BRANCH(PRFM_END_R) " prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" " prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" " prfm PLDL1STRM, [%[b_next], #16*0] \n\t" " prfm PLDL1STRM, [%[b_next], #16*1] \n\t" LABEL(PRFM_END_R) #endif DSTOREC_4V_R_FWD(4,5,6,7,x5,0,x6) DSTOREC_4V_R_FWD(8,9,10,11,x5,0,x6) DSTOREC_4V_R_FWD(12,13,14,15,x5,0,x6) 
DSTOREC_4V_R_FWD(16,17,18,19,x5,0,x6) DSTOREC_4V_R_FWD(20,21,22,23,x5,0,x6) BRANCH(END_WRITE_MEM) // // C storage in columns. LABEL(WRITE_MEM_C) // In-register transpose, // do transposition in row-order. " trn1 v24.2d, v0.2d, v4.2d \n\t" // Row 0-1. " trn2 v25.2d, v0.2d, v4.2d \n\t" " trn1 v26.2d, v1.2d, v5.2d \n\t" " trn2 v27.2d, v1.2d, v5.2d \n\t" " trn1 v28.2d, v2.2d, v6.2d \n\t" " trn2 v29.2d, v2.2d, v6.2d \n\t" " trn1 v30.2d, v3.2d, v7.2d \n\t" " trn2 v31.2d, v3.2d, v7.2d \n\t" " \n\t" " trn1 v0.2d, v8.2d, v12.2d \n\t" // Row 2-3. " trn2 v1.2d, v8.2d, v12.2d \n\t" " trn1 v2.2d, v9.2d, v13.2d \n\t" " trn2 v3.2d, v9.2d, v13.2d \n\t" " trn1 v4.2d, v10.2d, v14.2d \n\t" " trn2 v5.2d, v10.2d, v14.2d \n\t" " trn1 v6.2d, v11.2d, v15.2d \n\t" " trn2 v7.2d, v11.2d, v15.2d \n\t" " \n\t" " trn1 v8.2d, v16.2d, v20.2d \n\t" // Row 4-5. " trn2 v9.2d, v16.2d, v20.2d \n\t" " trn1 v10.2d, v17.2d, v21.2d \n\t" // AMARI " trn2 v11.2d, v17.2d, v21.2d \n\t" // AMARI " trn1 v12.2d, v18.2d, v22.2d \n\t" // AMARI " trn2 v13.2d, v18.2d, v22.2d \n\t" // AMARI " trn1 v14.2d, v19.2d, v23.2d \n\t" // AMARI " trn2 v15.2d, v19.2d, v23.2d \n\t" // AMARI " \n\t" " ld1r {v16.2d}, [x4] \n\t" // Load alpha & beta. 
" ld1r {v17.2d}, [x8] \n\t" " fmov d18, #1.0 \n\t" " fcmp d16, d18 \n\t" BEQ(UNIT_ALPHA_C) DSCALE8V(24,25,26,27,28,29,30,31,16,0) DSCALE8V(0,1,2,3,4,5,6,7,16,0) DSCALE8V(8,9,10,11,12,13,14,15,16,0) LABEL(UNIT_ALPHA_C) " fcmp d17, #0.0 \n\t" BEQ(ZERO_BETA_C_1) DLOADC_3V_C_FWD(18,19,20,x1,0,x7) DLOADC_3V_C_FWD(21,22,23,x1,0,x7) DSCALEA6V(24,0,8,25,1,9,18,19,20,21,22,23,17,0) LABEL(ZERO_BETA_C_1) DSTOREC_3V_C_FWD(24,0,8,x5,0,x7) DSTOREC_3V_C_FWD(25,1,9,x5,0,x7) BEQ(ZERO_BETA_C_2) DLOADC_3V_C_FWD(18,19,20,x1,0,x7) DLOADC_3V_C_FWD(21,22,23,x1,0,x7) DLOADC_3V_C_FWD(24,0,8,x1,0,x7) DLOADC_3V_C_FWD(25,1,9,x1,0,x7) DSCALEA6V(26,2,10,27,3,11,18,19,20,21,22,23,17,0) DSCALEA6V(28,4,12,29,5,13,24,0,8,25,1,9,17,0) LABEL(ZERO_BETA_C_2) #ifndef __clang__ " cmp x12, #1 \n\t" BRANCH(PRFM_END_C) " prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" " prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" " prfm PLDL1STRM, [%[b_next], #16*0] \n\t" " prfm PLDL1STRM, [%[b_next], #16*1] \n\t" LABEL(PRFM_END_C) " fcmp d17, #0.0 \n\t" // Not the end. Reset branching reg. #endif DSTOREC_3V_C_FWD(26,2,10,x5,0,x7) DSTOREC_3V_C_FWD(27,3,11,x5,0,x7) BEQ(ZERO_BETA_C_3) DLOADC_3V_C_FWD(18,19,20,x1,0,x7) DLOADC_3V_C_FWD(21,22,23,x1,0,x7) DSCALEA6V(30,6,14,31,7,15,18,19,20,21,22,23,17,0) LABEL(ZERO_BETA_C_3) DSTOREC_3V_C_FWD(28,4,12,x5,0,x7) DSTOREC_3V_C_FWD(29,5,13,x5,0,x7) DSTOREC_3V_C_FWD(30,6,14,x5,0,x7) DSTOREC_3V_C_FWD(31,7,15,x5,0,x7) // // End of this microkernel. LABEL(END_WRITE_MEM) " \n\t" " subs x12, x12, #1 \n\t" BEQ(END_EXEC) " \n\t" " mov x8, #6 \n\t" " madd x13, x6, x8, x13 \n\t" // Forward C's base address to the next logic panel. " add x10, x10, x11 \n\t" // Forward A's base address to the next logic panel. BRANCH(MILLIKER_MLOOP) // // End of execution. LABEL(END_EXEC) : : [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [ps_a] "m" (ps_a), [rs_b] "m" (rs_b), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), // In Clang, even "m"-passed parameter takes 1 register. 
// Have to disable prefetching to pass compilation. #ifndef __clang__ [a_next] "r" (a_next), [b_next] "r" (b_next), #endif [m_iter] "m" (m_iter), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta) : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10","x11","x12","x13","x14", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15", "v16","v17","v18","v19","v20","v21","v22","v23", "v24","v25","v26","v27","v28","v29","v30","v31" ); consider_edge_cases: // Forward address. a = a + m_iter * ps_a; c = c + m_iter * 6 * rs_c; #if 1 auxinfo_t data_d6x4mn = *data; bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn ); bli_dgemmsup_rv_armv8a_int_6x4mn ( conja, conjb, m_left, 8, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx ); #else if ( m_left >= 4 ) { // Calls 4x8m with only 1 outermost loop. // As only 1 outermost loop is called, // ps_a needs not being set here. // bli_dgemmsup_rv_armv8a_asm_4x8m ( conja, conjb, 4, 8, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); m_left -= 4; a = a + 4 * rs_a; c = c + 4 * rs_c; } if ( m_left ) { bli_dgemmsup_r_armv8a_ref2 ( conja, conjb, m_left, 8, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); } #endif } cython-blis-0.9.1/blis/_src/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c000066400000000000000000000502571427272030600300040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2021, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "assert.h" GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) // Label locality & misc. #include "../armv8a_asm_utils.h" // Nanokernel operations. 
#include "../armv8a_asm_d2x2.h"

/* Order of row-major DGEMM_6x8's execution in 2x2 blocks:
 *
 * +---+ +---+ +---+ +---+
 * | 0 | | 1 | | 6 | | 7 |
 * +---+ +---+ +---+ +---+
 * +---+ +---+ +---+ +---+
 * | 2 | | 3 | | 8 | | 9 |
 * +---+ +---+ +---+ +---+
 * +---+ +---+ +---+ +---+
 * | 4 | | 5 | | 10| | 11|
 * +---+ +---+ +---+ +---+
 *
 */
// One k-step of the 6x8 update, expanded as twelve DGEMM_2X2_NANOKERNEL
// invocations in the block order drawn above.
//
//   Cij       - register numbers of the accumulators: i is the row of C,
//               j the 2-column group (each register holds two doubles).
//   A0..A2    - registers holding the A values for row pairs 0-1, 2-3, 4-5.
//   B0..B3    - registers holding the four 2-element groups of the B row.
//   AELEMADDR - address register used to gather the next A elements.
//   AELEMST   - stride added to AELEMADDR after each gathered element.
//   BADDR     - base address register for the next B row.
//   BSHIFT    - byte offset applied to BADDR for the B0/B1 reload.
//   LOADNEXT  - `load` or `noload`; token-pasted onto DGEMM_LOAD2V_ and
//               DGEMM_LOAD1V_G_ to pick the variant that emits the reload or
//               the one that expands to nothing.
//
// Each reload is placed right after the last nanokernel of this step that
// reads the register being overwritten: B0/B1 after blocks 0-5, A0 after
// blocks 6-7 and A1 after blocks 8-9.  A2, B2 and B3 are not reloaded here;
// the caller has to do that itself.
#define DGEMM_6X8_MKER_LOOP_PLAIN(C00,C01,C02,C03,C10,C11,C12,C13,C20,C21,C22,C23,C30,C31,C32,C33,C40,C41,C42,C43,C50,C51,C52,C53,A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \
  DGEMM_2X2_NANOKERNEL(C00,C10,B0,A0) \
  DGEMM_2X2_NANOKERNEL(C01,C11,B1,A0) \
  DGEMM_2X2_NANOKERNEL(C20,C30,B0,A1) \
  DGEMM_2X2_NANOKERNEL(C21,C31,B1,A1) \
  DGEMM_2X2_NANOKERNEL(C40,C50,B0,A2) \
  DGEMM_2X2_NANOKERNEL(C41,C51,B1,A2) \
  DGEMM_LOAD2V_ ##LOADNEXT (B0,B1,BADDR,BSHIFT) \
  DGEMM_2X2_NANOKERNEL(C02,C12,B2,A0) \
  DGEMM_2X2_NANOKERNEL(C03,C13,B3,A0) \
  DGEMM_LOAD1V_G_ ##LOADNEXT (A0,AELEMADDR,AELEMST) \
  DGEMM_2X2_NANOKERNEL(C22,C32,B2,A1) \
  DGEMM_2X2_NANOKERNEL(C23,C33,B3,A1) \
  DGEMM_LOAD1V_G_ ##LOADNEXT (A1,AELEMADDR,AELEMST) \
  DGEMM_2X2_NANOKERNEL(C42,C52,B2,A2) \
  DGEMM_2X2_NANOKERNEL(C43,C53,B3,A2)

// Interleaving load or not.
// `noload` variants expand to nothing; `load` variants emit the reload.
#define DGEMM_LOAD1V_noload(V1,ADDR,IMM)
// Load one 128-bit vector q<V1> from [ADDR + IMM].
#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t"

#define DGEMM_LOAD2V_noload(V1,V2,ADDR,IMM)
// Load two consecutive 128-bit vectors from [ADDR + IMM] and [ADDR + IMM + 16].
#define DGEMM_LOAD2V_load(V1,V2,ADDR,IMM) \
  DGEMM_LOAD1V_load(V1,ADDR,IMM) \
  DGEMM_LOAD1V_load(V2,ADDR,IMM+16)

#define DGEMM_LOAD1V_G_noload(V1,ADDR,ST)
// Gather two doubles into lanes 0 and 1 of v<V1>; ADDR is post-incremented
// by ST after each element.
#define DGEMM_LOAD1V_G_load(V1,ADDR,ST) \
" ld1 {v"#V1".d}[0], ["#ADDR"], "#ST" \n\t" \
" ld1 {v"#V1".d}[1], ["#ADDR"], "#ST" \n\t"

// Prefetch C in the long direction.
// Issues one PLDL1KEEP prefetch at CADDR, then advances CADDR by DLONGC.
#define DPRFMC_FWD(CADDR,DLONGC) \
" prfm PLDL1KEEP, ["#CADDR"] \n\t" \
" add "#CADDR", "#CADDR", "#DLONGC" \n\t"

// For row-storage of C.
#define DLOADC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \ DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#RSC" \n\t" #define DSTOREC_4V_R_FWD(C0,C1,C2,C3,CADDR,CSHIFT,RSC) \ DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", "#RSC" \n\t" // For column-storage of C. #define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ DLOAD2V(C0,C1,CADDR,CSHIFT) \ DLOAD1V(C2,CADDR,CSHIFT+32) \ " add "#CADDR", "#CADDR", "#CSC" \n\t" #define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ DSTORE2V(C0,C1,CADDR,CSHIFT) \ DSTORE1V(C2,CADDR,CSHIFT+32) \ " add "#CADDR", "#CADDR", "#CSC" \n\t" #define DSCALE6V(V0,V1,V2,V3,V4,V5,A,IDX) \ DSCALE4V(V0,V1,V2,V3,A,IDX) \ DSCALE2V(V4,V5,A,IDX) #define DSCALEA6V(D0,D1,D2,D3,D4,D5,S0,S1,S2,S3,S4,S5,A,IDX) \ DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ DSCALEA2V(D4,D5,S4,S5,A,IDX) /* * 6x8 dgemmsup kernel with extending 2nd dimension. * * Recommanded usage case: (L1 cache latency) * (Num. FPU) < 17 cycles. * * Calls 4x8n for edge cases. */ void bli_dgemmsup_rv_armv8a_asm_6x8n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { if ( m0 != 6 ) { // 5 = 4 + 1; // 4; // while ( m0 >= 4 ) { bli_dgemmsup_rv_armv8a_asm_4x8n ( conja, conjb, 4, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); m0 -= 4; a += 4 * rs_a0; c += 4 * rs_c0; } // 3, 2, 1; // if ( m0 > 0 ) { bli_dgemmsup_rv_armv8a_int_3x8mn ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); } return; } // LLVM has very bad routing ability for inline asm. // Limit number of registers in case of Clang compilation. 
#ifndef __clang__ void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); #endif uint64_t ps_b = bli_auxinfo_ps_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_mker = k0 / 4; uint64_t k_left = k0 % 4; int64_t n_iter = n0 / 8; int64_t n_left = n0 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // uint64_t cs_b = cs_b0; assert( cs_b0 == 1 ); if ( n_iter == 0 ) goto consider_edge_cases; __asm__ volatile ( " ldr x10, %[b] \n\t" " ldr x13, %[c] \n\t" " ldr x12, %[n_iter] \n\t" " ldr x11, %[ps_b] \n\t" // Panel-skip of B. " ldr x3, %[rs_b] \n\t" // Row-skip of B. " ldr x9, %[rs_a] \n\t" // Row-skip of A. " ldr x2, %[cs_a] \n\t" // Column-skip of A. " \n\t" " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. " \n\t" " \n\t" // Multiply some address skips by sizeof(double). " lsl x11, x11, #3 \n\t" // ps_b " lsl x9, x9, #3 \n\t" // rs_a " lsl x2, x2, #3 \n\t" // cs_a " lsl x3, x3, #3 \n\t" // rs_b " lsl x6, x6, #3 \n\t" // rs_c " lsl x7, x7, #3 \n\t" // cs_c " \n\t" " mov x1, x5 \n\t" " cmp x7, #8 \n\t" // Prefetch column-strided C. BEQ(C_PREFETCH_COLS) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) DPRFMC_FWD(x1,x6) BRANCH(C_PREFETCH_END) LABEL(C_PREFETCH_COLS) // This prefetch will not cover further mker perts. Skip. // // DPRFMC_FWD(x1,x7) // DPRFMC_FWD(x1,x7) // DPRFMC_FWD(x1,x7) // DPRFMC_FWD(x1,x7) // DPRFMC_FWD(x1,x7) // DPRFMC_FWD(x1,x7) // DPRFMC_FWD(x1,x7) // DPRFMC_FWD(x1,x7) LABEL(C_PREFETCH_END) // // Millikernel. LABEL(MILLIKER_MLOOP) " \n\t" " mov x1, x10 \n\t" // Parameters to be reloaded " mov x5, x13 \n\t" // within each millikernel loop. 
" ldr x0, %[a] \n\t" " ldr x4, %[k_mker] \n\t" " ldr x8, %[k_left] \n\t" " \n\t" // Storage scheme: // V[ 0:23] <- C // V[24:27] <- A // V[28:31] <- B // Under this scheme, the following is defined: #define DGEMM_6X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) \ DGEMM_6X8_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,A0,A1,A2,B0,B1,B2,B3,AELEMADDR,AELEMST,BADDR,BSHIFT,LOADNEXT) // Load from memory. LABEL(LOAD_ABC) " \n\t" // No-microkernel early return is a must " cmp x4, #0 \n\t" // to avoid out-of-boundary read. BEQ(CLEAR_CCOLS) " \n\t" " ldr q28, [x1, #16*0] \n\t" // Load B first. " ldr q29, [x1, #16*1] \n\t" " ldr q30, [x1, #16*2] \n\t" " ldr q31, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" " \n\t" " mov x14, x0 \n\t" // Load A. " ld1 {v24.d}[0], [x14], x9 \n\t" // We want A to be kept in L1. " ld1 {v24.d}[1], [x14], x9 \n\t" " ld1 {v25.d}[0], [x14], x9 \n\t" " ld1 {v25.d}[1], [x14], x9 \n\t" " ld1 {v26.d}[0], [x14], x9 \n\t" " ld1 {v26.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " mov x14, x0 \n\t" " ld1 {v27.d}[0], [x14], x9 \n\t" " ld1 {v27.d}[1], [x14], x9 \n\t" LABEL(CLEAR_CCOLS) CLEAR8V(0,1,2,3,4,5,6,7) CLEAR8V(8,9,10,11,12,13,14,15) CLEAR8V(16,17,18,19,20,21,22,23) // No-microkernel early return, once again. BEQ(K_LEFT_LOOP) // // Microkernel is defined here as: #define DGEMM_6X8_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,B3) \ DGEMM_6X8_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3,x14,x9,x1,0,load) \ "add x0, x0, x2 \n\t" \ "mov x14, x0 \n\t" \ "ld1 {v"#A2".d}[0], [x14], x9 \n\t" \ "ld1 {v"#A2".d}[1], [x14], x9 \n\t" \ "ldr q"#B2", [x1, #16*2] \n\t" \ "ldr q"#B3", [x1, #16*3] \n\t" \ "add x1, x1, x3 \n\t" // Start microkernel loop. LABEL(K_MKER_LOOP) DGEMM_6X8_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,28,29,30,31) DGEMM_6X8_MKER_LOOP_PLAIN_LOC_FWD(27,24,25,28,29,30,31) " \n\t" // Decrease counter before final replica. " subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. 
BEQ(FIN_MKER_LOOP) DGEMM_6X8_MKER_LOOP_PLAIN_LOC_FWD(26,27,24,28,29,30,31) DGEMM_6X8_MKER_LOOP_PLAIN_LOC_FWD(25,26,27,28,29,30,31) BRANCH(K_MKER_LOOP) // // Final microkernel loop. LABEL(FIN_MKER_LOOP) DGEMM_6X8_MKER_LOOP_PLAIN_LOC(26,27,24,28,29,30,31,x14,x9,x1,0,load) " add x0, x0, x2 \n\t" " ldr q30, [x1, #16*2] \n\t" " ldr q31, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" DGEMM_6X8_MKER_LOOP_PLAIN_LOC(25,26,27,28,29,30,31,xzr,-1,xzr,-1,noload) // // Loops left behind microkernels. LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of exec. BEQ(WRITE_MEM_PREP) " ldr q28, [x1, #16*0] \n\t" // Load B row. " ldr q29, [x1, #16*1] \n\t" " ldr q30, [x1, #16*2] \n\t" " ldr q31, [x1, #16*3] \n\t" " add x1, x1, x3 \n\t" " mov x14, x0 \n\t" " ld1 {v24.d}[0], [x14], x9 \n\t" // Load A col. " ld1 {v24.d}[1], [x14], x9 \n\t" " ld1 {v25.d}[0], [x14], x9 \n\t" " ld1 {v25.d}[1], [x14], x9 \n\t" " ld1 {v26.d}[0], [x14], x9 \n\t" " ld1 {v26.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " sub x8, x8, #1 \n\t" DGEMM_6X8_MKER_LOOP_PLAIN_LOC(24,25,26,28,29,30,31,xzr,-1,xzr,-1,noload) BRANCH(K_LEFT_LOOP) // // Scale and write to memory. LABEL(WRITE_MEM_PREP) " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" " \n\t" " mov x1, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. " cmp x7, #8 \n\t" // Check for column-storage. BNE(WRITE_MEM_C) // // C storage in rows. LABEL(WRITE_MEM_R) " ld1r {v24.2d}, [x4] \n\t" // Load alpha & beta. 
" ld1r {v25.2d}, [x8] \n\t" " fmov d26, #1.0 \n\t" " fcmp d24, d26 \n\t" BEQ(UNIT_ALPHA_R) DSCALE8V(0,1,2,3,4,5,6,7,24,0) DSCALE8V(8,9,10,11,12,13,14,15,24,0) DSCALE8V(16,17,18,19,20,21,22,23,24,0) LABEL(UNIT_ALPHA_R) " fcmp d25, #0.0 \n\t" BEQ(ZERO_BETA_R_1) DLOADC_4V_R_FWD(26,27,28,29,x1,0,x6) DSCALEA4V(0,1,2,3,26,27,28,29,25,0) DLOADC_4V_R_FWD(26,27,28,29,x1,0,x6) DSCALEA4V(4,5,6,7,26,27,28,29,25,0) LABEL(ZERO_BETA_R_1) DSTOREC_4V_R_FWD(0,1,2,3,x5,0,x6) BEQ(ZERO_BETA_R_2) DLOADC_4V_R_FWD(26,27,28,29,x1,0,x6) DLOADC_4V_R_FWD(0,1,2,3,x1,0,x6) DSCALEA8V(8,9,10,11,12,13,14,15,26,27,28,29,0,1,2,3,25,0) DLOADC_4V_R_FWD(26,27,28,29,x1,0,x6) DLOADC_4V_R_FWD(0,1,2,3,x1,0,x6) DSCALEA8V(16,17,18,19,20,21,22,23,26,27,28,29,0,1,2,3,25,0) LABEL(ZERO_BETA_R_2) #ifndef __clang__ " cmp x12, #1 \n\t" BRANCH(PRFM_END_R) " prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" " prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" " prfm PLDL1STRM, [%[b_next], #16*0] \n\t" " prfm PLDL1STRM, [%[b_next], #16*1] \n\t" LABEL(PRFM_END_R) #endif DSTOREC_4V_R_FWD(4,5,6,7,x5,0,x6) DSTOREC_4V_R_FWD(8,9,10,11,x5,0,x6) DSTOREC_4V_R_FWD(12,13,14,15,x5,0,x6) DSTOREC_4V_R_FWD(16,17,18,19,x5,0,x6) DSTOREC_4V_R_FWD(20,21,22,23,x5,0,x6) BRANCH(END_WRITE_MEM) // // C storage in columns. LABEL(WRITE_MEM_C) // In-register transpose, // do transposition in row-order. " trn1 v24.2d, v0.2d, v4.2d \n\t" // Row 0-1. " trn2 v25.2d, v0.2d, v4.2d \n\t" " trn1 v26.2d, v1.2d, v5.2d \n\t" " trn2 v27.2d, v1.2d, v5.2d \n\t" " trn1 v28.2d, v2.2d, v6.2d \n\t" " trn2 v29.2d, v2.2d, v6.2d \n\t" " trn1 v30.2d, v3.2d, v7.2d \n\t" " trn2 v31.2d, v3.2d, v7.2d \n\t" " \n\t" " trn1 v0.2d, v8.2d, v12.2d \n\t" // Row 2-3. " trn2 v1.2d, v8.2d, v12.2d \n\t" " trn1 v2.2d, v9.2d, v13.2d \n\t" " trn2 v3.2d, v9.2d, v13.2d \n\t" " trn1 v4.2d, v10.2d, v14.2d \n\t" " trn2 v5.2d, v10.2d, v14.2d \n\t" " trn1 v6.2d, v11.2d, v15.2d \n\t" " trn2 v7.2d, v11.2d, v15.2d \n\t" " \n\t" " trn1 v8.2d, v16.2d, v20.2d \n\t" // Row 4-5. 
" trn2 v9.2d, v16.2d, v20.2d \n\t" " trn1 v10.2d, v17.2d, v21.2d \n\t" // AMARI " trn2 v11.2d, v17.2d, v21.2d \n\t" // AMARI " trn1 v12.2d, v18.2d, v22.2d \n\t" // AMARI " trn2 v13.2d, v18.2d, v22.2d \n\t" // AMARI " trn1 v14.2d, v19.2d, v23.2d \n\t" // AMARI " trn2 v15.2d, v19.2d, v23.2d \n\t" // AMARI " \n\t" " ld1r {v16.2d}, [x4] \n\t" // Load alpha & beta. " ld1r {v17.2d}, [x8] \n\t" " fmov d18, #1.0 \n\t" " fcmp d16, d18 \n\t" BEQ(UNIT_ALPHA_C) DSCALE8V(24,25,26,27,28,29,30,31,16,0) DSCALE8V(0,1,2,3,4,5,6,7,16,0) DSCALE8V(8,9,10,11,12,13,14,15,16,0) LABEL(UNIT_ALPHA_C) " fcmp d17, #0.0 \n\t" BEQ(ZERO_BETA_C_1) DLOADC_3V_C_FWD(18,19,20,x1,0,x7) DLOADC_3V_C_FWD(21,22,23,x1,0,x7) DSCALEA6V(24,0,8,25,1,9,18,19,20,21,22,23,17,0) LABEL(ZERO_BETA_C_1) DSTOREC_3V_C_FWD(24,0,8,x5,0,x7) DSTOREC_3V_C_FWD(25,1,9,x5,0,x7) BEQ(ZERO_BETA_C_2) DLOADC_3V_C_FWD(18,19,20,x1,0,x7) DLOADC_3V_C_FWD(21,22,23,x1,0,x7) DLOADC_3V_C_FWD(24,0,8,x1,0,x7) DLOADC_3V_C_FWD(25,1,9,x1,0,x7) DSCALEA6V(26,2,10,27,3,11,18,19,20,21,22,23,17,0) DSCALEA6V(28,4,12,29,5,13,24,0,8,25,1,9,17,0) LABEL(ZERO_BETA_C_2) #ifndef __clang__ " cmp x12, #1 \n\t" BRANCH(PRFM_END_C) " prfm PLDL1KEEP, [%[a_next], #16*0] \n\t" " prfm PLDL1KEEP, [%[a_next], #16*1] \n\t" " prfm PLDL1STRM, [%[b_next], #16*0] \n\t" " prfm PLDL1STRM, [%[b_next], #16*1] \n\t" LABEL(PRFM_END_C) " fcmp d17, #0.0 \n\t" // Not the end. Reset branching reg. #endif DSTOREC_3V_C_FWD(26,2,10,x5,0,x7) DSTOREC_3V_C_FWD(27,3,11,x5,0,x7) BEQ(ZERO_BETA_C_3) DLOADC_3V_C_FWD(18,19,20,x1,0,x7) DLOADC_3V_C_FWD(21,22,23,x1,0,x7) DSCALEA6V(30,6,14,31,7,15,18,19,20,21,22,23,17,0) LABEL(ZERO_BETA_C_3) DSTOREC_3V_C_FWD(28,4,12,x5,0,x7) DSTOREC_3V_C_FWD(29,5,13,x5,0,x7) DSTOREC_3V_C_FWD(30,6,14,x5,0,x7) DSTOREC_3V_C_FWD(31,7,15,x5,0,x7) // // End of this microkernel. LABEL(END_WRITE_MEM) " \n\t" " subs x12, x12, #1 \n\t" BEQ(END_EXEC) " \n\t" " mov x8, #8 \n\t" " madd x13, x7, x8, x13 \n\t" // Forward C's base address to the next logic panel. 
" add x10, x10, x11 \n\t" // Forward B's base address to the next logic panel. BRANCH(MILLIKER_MLOOP) // // End of execution. LABEL(END_EXEC) : : [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [ps_b] "m" (ps_b), [rs_b] "m" (rs_b), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), // In Clang, even "m"-passed parameter takes 1 register. // Have to disable prefetching to pass compilation. #ifndef __clang__ [a_next] "r" (a_next), [b_next] "r" (b_next), #endif [n_iter] "m" (n_iter), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta) : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10","x11","x12","x13","x14", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15", "v16","v17","v18","v19","v20","v21","v22","v23", "v24","v25","v26","v27","v28","v29","v30","v31" ); consider_edge_cases: // Forward address. b = b + n_iter * ps_b; c = c + n_iter * 8 * cs_c; if ( n_left ) { // Set panel stride to unpacked mode. // Only 1 millikernel w.r.t. 6x8 is executed. auxinfo_t data_d6x4mn = *data; bli_auxinfo_set_ps_b( 4 * cs_b0, &data_d6x4mn ); // bli_dgemmsup_rv_armv8a_int_6x4mn ( conja, conjb, 6, n_left, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, &data_d6x4mn, cntx ); } } cython-blis-0.9.1/blis/_src/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c000066400000000000000000000424301427272030600277730ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "assert.h" GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) // Label locality & misc. #include "../armv8a_asm_utils.h" // Nanokernel operations. 
#include "../armv8a_asm_d2x2.h"

/*
 * Numbering of the 2x2 sub-tiles referred to by the macros below:
 *
 *   +---+ +---+
 *   | 0 | | 4 |
 *   +---+ +---+
 *   +---+ +---+
 *   | 1 | | 5 |
 *   +---+ +---+
 *   +---+ +---+
 *   | 2 | | 6 |
 *   +---+ +---+
 *   +---+ +---+
 *   | 3 | | 7 |
 *   +---+ +---+
 *
 */

// One k-step of the 8x4 tile, expressed as eight DGEMM_2X2_NANOKERNEL
// invocations (the nanokernel itself lives in armv8a_asm_d2x2.h and is not
// visible here).
//  - The first four invocations pair A0..A3 with B0, the last four pair
//    A0..A3 with B1.
//  - Between the two halves DGEMM_LOAD1V_<LOADNEXT> is expanded. With
//    LOADNEXT=load it overwrites B0 from [BADDR, #BSHIFT] (B0 is not read by
//    the second half); with LOADNEXT=noload it expands to nothing.
#define DGEMM_8X4_MKER_LOOP_PLAIN(C00,C10,C20,C30,C01,C11,C21,C31,C02,C12,C22,C32,C03,C13,C23,C33,A0,A1,A2,A3,B0,B1,BADDR,BSHIFT,LOADNEXT) \
  DGEMM_2X2_NANOKERNEL(C00,C01,A0,B0) \
  DGEMM_2X2_NANOKERNEL(C10,C11,A1,B0) \
  DGEMM_2X2_NANOKERNEL(C20,C21,A2,B0) \
  DGEMM_2X2_NANOKERNEL(C30,C31,A3,B0) \
  DGEMM_LOAD1V_ ##LOADNEXT (B0,BADDR,BSHIFT) \
  DGEMM_2X2_NANOKERNEL(C02,C03,A0,B1) \
  DGEMM_2X2_NANOKERNEL(C12,C13,A1,B1) \
  DGEMM_2X2_NANOKERNEL(C22,C23,A2,B1) \
  DGEMM_2X2_NANOKERNEL(C32,C33,A3,B1)

// Interleaving load or not.
// The suffix (noload / load) is selected by token pasting in the macro above.
#define DGEMM_LOAD1V_noload(V1,ADDR,IMM)
#define DGEMM_LOAD1V_load(V1,ADDR,IMM) \
" ldr q"#V1", ["#ADDR", #"#IMM"] \n\t"

// Column-storage helpers: move four vectors via DLOAD4V / DSTORE4V (from
// armv8a_asm_utils.h, not shown; presumably four consecutive q-registers'
// worth of data at CADDR + CSHIFT), then advance CADDR by LDC.
#define DLOADC_4V_C_FWD(C0,C1,C2,C3,CADDR,CSHIFT,LDC) \
  DLOAD4V(C0,C1,C2,C3,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#LDC" \n\t"
#define DSTOREC_4V_C_FWD(C0,C1,C2,C3,CADDR,CSHIFT,LDC) \
  DSTORE4V(C0,C1,C2,C3,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#LDC" \n\t"

// Row-storage helpers: two rows per invocation. Each row moves two vectors
// via DLOAD2V / DSTORE2V and then advances CADDR by RSC.
#define DLOADC_4V_R_FWD(C00,C01,C10,C11,CADDR,CSHIFT,RSC) \
  DLOAD2V(C00,C01,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#RSC" \n\t" \
  DLOAD2V(C10,C11,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#RSC" \n\t"
#define DSTOREC_4V_R_FWD(C00,C01,C10,C11,CADDR,CSHIFT,RSC) \
  DSTORE2V(C00,C01,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#RSC" \n\t" \
  DSTORE2V(C10,C11,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#RSC" \n\t"

/*
 * 8x4 kernel for dgemmsup.
 *
 * R-dimension too short.
 * Not recommended for use.
*/ void bli_dgemmsup_rv_armv8a_asm_8x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Fixme: This uker has no dispatching for unalighed sizes. // Currently it only serves as a dispatch target for other kernels // and cannot be registered in configurations. assert( n0 == 4 ); void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); uint64_t ps_a = bli_auxinfo_ps_a( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_mker = k0 / 6; uint64_t k_left = k0 % 6; int64_t m_iter = m0 / 8; int64_t m_left = m0 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // uint64_t cs_b = cs_b0; assert( cs_b0 == 1 ); if ( m_iter == 0 ) goto consider_edge_cases; __asm__ volatile ( " ldr x10, %[a] \n\t" " ldr x13, %[c] \n\t" " ldr x12, %[m_iter] \n\t" " ldr x11, %[ps_a] \n\t" // Panel-skip of A. " ldr x2, %[cs_a] \n\t" // Column-skip of A. " ldr x9, %[rs_a] \n\t" // Row-skip of A. " ldr x3, %[rs_b] \n\t" // Row-skip of B. " \n\t" " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. " \n\t" " \n\t" // Multiply some address skips by sizeof(double). " lsl x11, x11, #3 \n\t" // ps_a " lsl x9, x9, #3 \n\t" // rs_a " lsl x2, x2, #3 \n\t" // cs_a " lsl x3, x3, #3 \n\t" // rs_b " lsl x6, x6, #3 \n\t" // rs_c " lsl x7, x7, #3 \n\t" // cs_c " \n\t" LABEL(MILLIKER_MLOOP) " \n\t" " mov x0, x10 \n\t" // Parameters to be reloaded " mov x5, x13 \n\t" // within each millikernel loop. 
" ldr x1, %[b] \n\t" " ldr x4, %[k_mker] \n\t" " ldr x8, %[k_left] \n\t" " \n\t" // Storage scheme: // V[ 0:15] <- C // V[16:19] <- B; Allowed latency: 24 cycles / # of FPUs. // V[20:31] <- A; Allowed latency: 32 cycles / # of FPUs. // Under this scheme, the following is defined: #define DGEMM_8X4_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,BADDR,BSHIFT,LOADNEXT) \ DGEMM_8X4_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,A0,A1,A2,A3,B0,B1,BADDR,BSHIFT,LOADNEXT) LABEL(LOAD_ABC) " \n\t" // No-microkernel early return is a must " cmp x4, #0 \n\t" // to avoid out-of-boundary read. BEQ(CLEAR_CCOLS) " \n\t" " mov x14, x0 \n\t" " ld1 {v20.d}[0], [x14], x9 \n\t" " ld1 {v20.d}[1], [x14], x9 \n\t" " ld1 {v21.d}[0], [x14], x9 \n\t" " ld1 {v21.d}[1], [x14], x9 \n\t" " ld1 {v22.d}[0], [x14], x9 \n\t" " ld1 {v22.d}[1], [x14], x9 \n\t" " ld1 {v23.d}[0], [x14], x9 \n\t" " ld1 {v23.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " mov x14, x0 \n\t" " ld1 {v24.d}[0], [x14], x9 \n\t" " ld1 {v24.d}[1], [x14], x9 \n\t" " ld1 {v25.d}[0], [x14], x9 \n\t" " ld1 {v25.d}[1], [x14], x9 \n\t" " ld1 {v26.d}[0], [x14], x9 \n\t" " ld1 {v26.d}[1], [x14], x9 \n\t" " ld1 {v27.d}[0], [x14], x9 \n\t" " ld1 {v27.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " mov x14, x0 \n\t" " ld1 {v28.d}[0], [x14], x9 \n\t" " ld1 {v28.d}[1], [x14], x9 \n\t" " ld1 {v29.d}[0], [x14], x9 \n\t" " ld1 {v29.d}[1], [x14], x9 \n\t" " ld1 {v30.d}[0], [x14], x9 \n\t" " ld1 {v30.d}[1], [x14], x9 \n\t" " ld1 {v31.d}[0], [x14], x9 \n\t" " ld1 {v31.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " \n\t" " ldr q16, [x1, #16*0] \n\t" " ldr q17, [x1, #16*1] \n\t" " add x1, x1, x3 \n\t" " ldr q18, [x1, #16*0] \n\t" " ldr q19, [x1, #16*1] \n\t" " add x1, x1, x3 \n\t" LABEL(CLEAR_CCOLS) CLEAR8V(0,1,2,3,4,5,6,7) CLEAR8V(8,9,10,11,12,13,14,15) // No-microkernel early return, once again. 
BEQ(K_LEFT_LOOP) // // Microkernel is defined here as: #define DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,A3,B0,B1) \ DGEMM_8X4_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,B0,B1,x1,0,load) \ "mov x14, x0 \n\t" \ "ld1 {v"#A0".d}[0], [x14], x9 \n\t" \ "ld1 {v"#A0".d}[1], [x14], x9 \n\t" \ "ld1 {v"#A1".d}[0], [x14], x9 \n\t" \ "ld1 {v"#A1".d}[1], [x14], x9 \n\t" \ "ld1 {v"#A2".d}[0], [x14], x9 \n\t" \ "ld1 {v"#A2".d}[1], [x14], x9 \n\t" \ "ld1 {v"#A3".d}[0], [x14], x9 \n\t" \ "ld1 {v"#A3".d}[1], [x14], x9 \n\t" \ "ldr q"#B1", [x1, #16*1] \n\t" \ "add x1, x1, x3 \n\t" \ "add x0, x0, x2 \n\t" // Start microkernel loop. LABEL(K_MKER_LOOP) DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,23,16,17) DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,18,19) DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(28,29,30,31,16,17) " \n\t" // Decrease counter before final replica. " subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. BEQ(FIN_MKER_LOOP) DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(20,21,22,23,18,19) DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,16,17) DGEMM_8X4_MKER_LOOP_PLAIN_LOC_FWD(28,29,30,31,18,19) BRANCH(K_MKER_LOOP) // // Final microkernel loop. LABEL(FIN_MKER_LOOP) DGEMM_8X4_MKER_LOOP_PLAIN_LOC(20,21,22,23,18,19,x1,0,load) " ldr q19, [x1, #16*1] \n\t" " add x1, x1, x3 \n\t" DGEMM_8X4_MKER_LOOP_PLAIN_LOC(24,25,26,27,16,17,xzr,-1,noload) DGEMM_8X4_MKER_LOOP_PLAIN_LOC(28,29,30,31,18,19,xzr,-1,noload) // // Loops left behind microkernels. LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of exec. BEQ(WRITE_MEM_PREP) " mov x14, x0 \n\t" " ld1 {v20.d}[0], [x14], x9 \n\t" // Load A col. " ld1 {v20.d}[1], [x14], x9 \n\t" " ld1 {v21.d}[0], [x14], x9 \n\t" " ld1 {v21.d}[1], [x14], x9 \n\t" " ld1 {v22.d}[0], [x14], x9 \n\t" " ld1 {v22.d}[1], [x14], x9 \n\t" " ld1 {v23.d}[0], [x14], x9 \n\t" " ld1 {v23.d}[1], [x14], x9 \n\t" " add x0, x0, x2 \n\t" " ldr q16, [x1, #16*0] \n\t" // Load B col. 
" ldr q17, [x1, #16*1] \n\t" " add x1, x1, x3 \n\t" " sub x8, x8, #1 \n\t" DGEMM_8X4_MKER_LOOP_PLAIN_LOC(20,21,22,23,16,17,xzr,-1,noload) BRANCH(K_LEFT_LOOP) // // Scale and write to memory. LABEL(WRITE_MEM_PREP) " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" " ld1r {v16.2d}, [x4] \n\t" // Load alpha & beta (value). " ld1r {v17.2d}, [x8] \n\t" " fmov d18, #1.0 \n\t" " fcmp d16, d18 \n\t" BEQ(UNIT_ALPHA) DSCALE8V(0,1,2,3,4,5,6,7,16,0) DSCALE8V(8,9,10,11,12,13,14,15,16,0) LABEL(UNIT_ALPHA) " \n\t" " mov x1, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. " cmp x6, #8 \n\t" // Check for row-storage. BNE(WRITE_MEM_R) // // C storage in columns. LABEL(WRITE_MEM_C) " fcmp d17, #0.0 \n\t" BEQ(ZERO_BETA_C) DLOADC_4V_C_FWD(20,21,22,23,x1,0,x7) DLOADC_4V_C_FWD(24,25,26,27,x1,0,x7) DSCALEA8V(0,1,2,3,4,5,6,7,20,21,22,23,24,25,26,27,17,0) // DLOADC_4V_C_FWD(20,21,22,23,x1,0,x7) DLOADC_4V_C_FWD(24,25,26,27,x1,0,x7) DSCALEA8V(8,9,10,11,12,13,14,15,20,21,22,23,24,25,26,27,17,0) LABEL(ZERO_BETA_C) // DSTOREC_4V_C_FWD(0,1,2,3,x5,0,x7) DSTOREC_4V_C_FWD(4,5,6,7,x5,0,x7) DSTOREC_4V_C_FWD(8,9,10,11,x5,0,x7) DSTOREC_4V_C_FWD(12,13,14,15,x5,0,x7) BRANCH(END_WRITE_MEM) // // C storage in rows. LABEL(WRITE_MEM_R) // In-register transpose. " trn1 v16.2d, v0.2d, v4.2d \n\t" // Row 0. " trn1 v17.2d, v8.2d, v12.2d \n\t" " trn2 v18.2d, v0.2d, v4.2d \n\t" // Row 1. " trn2 v19.2d, v8.2d, v12.2d \n\t" " trn1 v20.2d, v1.2d, v5.2d \n\t" // Row 2. " trn1 v21.2d, v9.2d, v13.2d \n\t" " trn2 v22.2d, v1.2d, v5.2d \n\t" // Row 3. " trn2 v23.2d, v9.2d, v13.2d \n\t" " trn1 v24.2d, v2.2d, v6.2d \n\t" // Row 4. " trn1 v25.2d, v10.2d, v14.2d \n\t" " trn2 v26.2d, v2.2d, v6.2d \n\t" // Row 5. " trn2 v27.2d, v10.2d, v14.2d \n\t" " trn1 v28.2d, v3.2d, v7.2d \n\t" // Row 6. " trn1 v29.2d, v11.2d, v15.2d \n\t" " trn2 v30.2d, v3.2d, v7.2d \n\t" // Row 7. " trn2 v31.2d, v11.2d, v15.2d \n\t" // " ld1r {v14.2d}, [x4] \n\t" // Reload alpha & beta (value). 
" ld1r {v15.2d}, [x8] \n\t" " fcmp d15, #0.0 \n\t" BEQ(ZERO_BETA_R) DLOADC_4V_R_FWD(0,1,2,3,x1,0,x6) DLOADC_4V_R_FWD(4,5,6,7,x1,0,x6) DSCALEA8V(16,17,18,19,20,21,22,23,0,1,2,3,4,5,6,7,15,0) // DLOADC_4V_R_FWD(0,1,2,3,x1,0,x6) DLOADC_4V_R_FWD(4,5,6,7,x1,0,x6) DSCALEA8V(24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,15,0) LABEL(ZERO_BETA_R) // DSTOREC_4V_R_FWD(16,17,18,19,x5,0,x6) DSTOREC_4V_R_FWD(20,21,22,23,x5,0,x6) DSTOREC_4V_R_FWD(24,25,26,27,x5,0,x6) DSTOREC_4V_R_FWD(28,29,30,31,x5,0,x6) // // End of this microkernel. LABEL(END_WRITE_MEM) " \n\t" " subs x12, x12, #1 \n\t" BEQ(END_EXEC) " \n\t" " mov x8, #8 \n\t" " madd x13, x6, x8, x13 \n\t" // Forward C's base address to the next logic panel. " add x10, x10, x11 \n\t" // Forward A's base address to the next logic panel. BRANCH(MILLIKER_MLOOP) // // End of execution. LABEL(END_EXEC) : : [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [ps_a] "m" (ps_a), [rs_b] "m" (rs_b), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), [m_iter] "m" (m_iter), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta) : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10","x11","x12","x13","x14", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15", "v16","v17","v18","v19","v20","v21","v22","v23", "v24","v25","v26","v27","v28","v29","v30","v31" ); consider_edge_cases: a = a + m_iter * ps_a; c = c + m_iter * 8 * rs_c; // Edge case is within 1 millikernel loop of THIS kernel. // Regarding the 6x?m kernel, the panel stride should be always local. auxinfo_t data_6xkm = *data; bli_auxinfo_set_ps_a( 6 * rs_a, &data_6xkm ); if ( m_left ) { bli_dgemmsup_rv_armv8a_int_6x4mn ( conja, conjb, m_left, 4, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, &data_6xkm, cntx ); } // Issue prefetch instructions only after // execution is done. 
__asm__ ( " mov x0, %[a_next] \n\t" " mov x1, %[b_next] \n\t" " prfm PLDL1STRM, [x0, #16*0] \n\t" " prfm PLDL1STRM, [x0, #16*1] \n\t" " prfm PLDL1STRM, [x0, #16*2] \n\t" " prfm PLDL1KEEP, [x1, #16*0] \n\t" " prfm PLDL1KEEP, [x1, #16*1] \n\t" " prfm PLDL1KEEP, [x1, #16*2] \n\t" : : [a_next] "r" (a_next), [b_next] "r" (b_next) : "x0", "x1" ); } cython-blis-0.9.1/blis/_src/kernels/armv8a/3/sup/d3x4/000077500000000000000000000000001427272030600222325ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c000066400000000000000000000313731427272030600303550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2021, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
   IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// Supplementary fixed-size gemmsup.

#include "blis.h"
#include "assert.h"

// Label locality & misc.
#include "../../armv8a_asm_utils.h"

// Three element-wise vector FMAs sharing one B vector:
//   C0 += A0 * B, C1 += A1 * B, C2 += A2 * B   (two doubles per vector).
#define DGEMM_3X1X2_NKER_SUBLOOP(C0,C1,C2,A0,A1,A2,B) \
" fmla v"#C0".2d, v"#A0".2d, v"#B".2d \n\t" \
" fmla v"#C1".2d, v"#A1".2d, v"#B".2d \n\t" \
" fmla v"#C2".2d, v"#A2".2d, v"#B".2d \n\t"

// Full 3x4 step: one SUBLOOP per B vector (B0..B3), i.e. twelve FMAs
// accumulating into C00..C23.
#define DGEMM_3X4X2_K_MKER_LOOP_PLAIN(C00,C01,C02,C03,C10,C11,C12,C13,C20,C21,C22,C23,A0,A1,A2,B0,B1,B2,B3) \
  DGEMM_3X1X2_NKER_SUBLOOP(C00,C10,C20,A0,A1,A2,B0) \
  DGEMM_3X1X2_NKER_SUBLOOP(C01,C11,C21,A0,A1,A2,B1) \
  DGEMM_3X1X2_NKER_SUBLOOP(C02,C12,C22,A0,A1,A2,B2) \
  DGEMM_3X1X2_NKER_SUBLOOP(C03,C13,C23,A0,A1,A2,B3)

// For row-storage of C.
// Move two vectors via DLOAD2V / DSTORE2V (from armv8a_asm_utils.h, not
// shown here), then advance CADDR by RSC.
#define DLOADC_2V_R_FWD(C0,C1,CADDR,CSHIFT,RSC) \
  DLOAD2V(C0,C1,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#RSC" \n\t"
#define DSTOREC_2V_R_FWD(C0,C1,CADDR,CSHIFT,RSC) \
  DSTORE2V(C0,C1,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", "#RSC" \n\t"

// For column-storage of C.
#define DLOADC_1V_1ELM_C_FWD(C0,CSCALAR,CIDX,CADDR,CSHIFT,CSC) \ DLOAD1V(C0,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", #"#CSHIFT"+16 \n\t" \ " ld1 {v"#CSCALAR".d}["#CIDX"], ["#CADDR"] \n\t" \ " sub "#CADDR", "#CADDR", #"#CSHIFT"+16 \n\t" \ " add "#CADDR", "#CADDR", "#CSC" \n\t" #define DSTOREC_1V_1ELM_C_FWD(C0,CSCALAR,CIDX,CADDR,CSHIFT,CSC) \ DSTORE1V(C0,CADDR,CSHIFT) \ " add "#CADDR", "#CADDR", #"#CSHIFT"+16 \n\t" \ " st1 {v"#CSCALAR".d}["#CIDX"], ["#CADDR"] \n\t" \ " sub "#CADDR", "#CADDR", #"#CSHIFT"+16 \n\t" \ " add "#CADDR", "#CADDR", "#CSC" \n\t" #define DSCALE6V(V0,V1,V2,V3,V4,V5,A,IDX) \ DSCALE4V(V0,V1,V2,V3,A,IDX) \ DSCALE2V(V4,V5,A,IDX) #define DSCALEA6V(D0,D1,D2,D3,D4,D5,S0,S1,S2,S3,S4,S5,A,IDX) \ DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ DSCALEA2V(D4,D5,S4,S5,A,IDX) void bli_dgemmsup_rd_armv8a_asm_3x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { assert( m0 == 3 ); assert( n0 == 4 ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_mker = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; assert( cs_a0 == 1 ); assert( rs_b0 == 1 ); __asm__ volatile ( " ldr x0, %[a] \n\t" " ldr x1, %[b] \n\t" " ldr x2, %[rs_a] \n\t" // Row-skip of A. " ldr x3, %[cs_b] \n\t" // Column-skip of B. " \n\t" " ldr x5, %[c] \n\t" " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. " \n\t" " \n\t" // Multiply some address skips by sizeof(double). 
" lsl x2, x2, #3 \n\t" // rs_a " lsl x3, x3, #3 \n\t" // cs_b " lsl x6, x6, #3 \n\t" // rs_c " lsl x7, x7, #3 \n\t" // cs_c " \n\t" " ldr x4, %[k_mker] \n\t" " ldr x8, %[k_left] \n\t" " \n\t" // Storage scheme: // V[ 0:11] <- C // V[12:14] <- A // V[16:19] <- B // Under this scheme, the following is defined: #define DGEMM_3X4X2_K_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3) \ DGEMM_3X4X2_K_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,A0,A1,A2,B0,B1,B2,B3) // Load from memory. LABEL(LOAD_ABC) " \n\t" // No-microkernel early return is a must " cmp x4, #0 \n\t" // to avoid out-of-boundary read. BEQ(CLEAR_CCOLS) " \n\t" " mov x11, x1 \n\t" // Load B. " ldr q16, [x11] \n\t" " add x11, x11, x3 \n\t" " ldr q17, [x11] \n\t" " add x11, x11, x3 \n\t" " ldr q18, [x11] \n\t" " add x11, x11, x3 \n\t" " ldr q19, [x11] \n\t" " add x1, x1, #16 \n\t" " \n\t" " mov x14, x0 \n\t" // Load A. " ldr q12, [x14] \n\t" " add x14, x14, x2 \n\t" " ldr q13, [x14] \n\t" " add x14, x14, x2 \n\t" " ldr q14, [x14] \n\t" " add x0, x0, #16 \n\t" LABEL(CLEAR_CCOLS) CLEAR8V(0,1,2,3,4,5,6,7) CLEAR4V(8,9,10,11) // No-microkernel early return, once again. BEQ(K_LEFT_LOOP) // // Microkernel is defined here as: #define DGEMM_3X4X2_K_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,B0,B1,B2,B3) \ DGEMM_3X4X2_K_MKER_LOOP_PLAIN_LOC(A0,A1,A2,B0,B1,B2,B3) \ "mov x11, x1 \n\t" \ "ldr q"#B0", [x11] \n\t" \ "add x11, x11, x3 \n\t" \ "ldr q"#B1", [x11] \n\t" \ "add x11, x11, x3 \n\t" \ "ldr q"#B2", [x11] \n\t" \ "add x11, x11, x3 \n\t" \ "ldr q"#B3", [x11] \n\t" \ "add x1, x1, #16 \n\t" \ "mov x14, x0 \n\t" \ "ldr q"#A0", [x14] \n\t" \ "add x14, x14, x2 \n\t" \ "ldr q"#A1", [x14] \n\t" \ "add x14, x14, x2 \n\t" \ "ldr q"#A2", [x14] \n\t" \ "add x0, x0, #16 \n\t" // Start microkernel loop. LABEL(K_MKER_LOOP) DGEMM_3X4X2_K_MKER_LOOP_PLAIN_LOC_FWD(12,13,14,16,17,18,19) " \n\t" // Decrease counter before final replica. " subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. 
BEQ(FIN_MKER_LOOP) DGEMM_3X4X2_K_MKER_LOOP_PLAIN_LOC_FWD(12,13,14,16,17,18,19) BRANCH(K_MKER_LOOP) // // Final microkernel loop. LABEL(FIN_MKER_LOOP) DGEMM_3X4X2_K_MKER_LOOP_PLAIN_LOC(12,13,14,16,17,18,19) // // If major kernel is executed, // an additional depth-summation is required. " faddp v0.2d, v0.2d, v1.2d \n\t" // Line 0. " faddp v1.2d, v2.2d, v3.2d \n\t" " faddp v2.2d, v4.2d, v5.2d \n\t" // Line 1. " faddp v3.2d, v6.2d, v7.2d \n\t" " faddp v4.2d, v8.2d, v9.2d \n\t" // Line 2. " faddp v5.2d, v10.2d, v11.2d \n\t" " \n\t" // Loops left behind microkernels. LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of exec. BEQ(WRITE_MEM_PREP) " mov x11, x1 \n\t" // Load B row. " ld1 {v28.d}[0], [x11], x3 \n\t" " ld1 {v28.d}[1], [x11], x3 \n\t" " ld1 {v29.d}[0], [x11], x3 \n\t" " ld1 {v29.d}[1], [x11], x3 \n\t" " add x1, x1, #8 \n\t" " mov x14, x0 \n\t" // Load A column. " ld1 {v24.d}[0], [x14], x2 \n\t" " ld1 {v24.d}[1], [x14], x2 \n\t" " ld1 {v25.d}[0], [x14], x2 \n\t" " add x0, x0, #8 \n\t" " fmla v0.2d, v28.2d, v24.d[0] \n\t" " fmla v1.2d, v29.2d, v24.d[0] \n\t" " fmla v2.2d, v28.2d, v24.d[1] \n\t" " fmla v3.2d, v29.2d, v24.d[1] \n\t" " fmla v4.2d, v28.2d, v25.d[0] \n\t" " fmla v5.2d, v29.2d, v25.d[0] \n\t" " sub x8, x8, #1 \n\t" BRANCH(K_LEFT_LOOP) // // Scale and write to memory. LABEL(WRITE_MEM_PREP) " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" " ld1r {v30.2d}, [x4] \n\t" // Load alpha & beta (value). " ld1r {v31.2d}, [x8] \n\t" DSCALE6V(0,1,2,3,4,5,30,0) " \n\t" " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. " cmp x7, #8 \n\t" // Check for column-storage. BNE(WRITE_MEM_C) // // C storage in rows. 
LABEL(WRITE_MEM_R) " fcmp d31, #0.0 \n\t" BEQ(ZERO_BETA_R) DLOADC_2V_R_FWD(12,13,x9,0,x6) DLOADC_2V_R_FWD(14,15,x9,0,x6) DLOADC_2V_R_FWD(16,17,x9,0,x6) DSCALEA6V(0,1,2,3,4,5,12,13,14,15,16,17,31,0) LABEL(ZERO_BETA_R) DSTOREC_2V_R_FWD(0,1,x5,0,x6) DSTOREC_2V_R_FWD(2,3,x5,0,x6) DSTOREC_2V_R_FWD(4,5,x5,0,x6) BRANCH(END_WRITE_MEM) // // C storage in columns. LABEL(WRITE_MEM_C) " trn1 v6.2d, v0.2d, v2.2d \n\t" " trn2 v7.2d, v0.2d, v2.2d \n\t" " trn1 v8.2d, v1.2d, v3.2d \n\t" " trn2 v9.2d, v1.2d, v3.2d \n\t" " fcmp d31, #0.0 \n\t" BEQ(ZERO_BETA_C) DLOADC_1V_1ELM_C_FWD(12,20,0,x9,0,x7) DLOADC_1V_1ELM_C_FWD(13,20,1,x9,0,x7) DLOADC_1V_1ELM_C_FWD(14,21,0,x9,0,x7) DLOADC_1V_1ELM_C_FWD(15,21,1,x9,0,x7) DSCALEA6V(6,7,8,9,4,5,12,13,14,15,20,21,31,0) LABEL(ZERO_BETA_C) DSTOREC_1V_1ELM_C_FWD(6,4,0,x5,0,x7) DSTOREC_1V_1ELM_C_FWD(7,4,1,x5,0,x7) DSTOREC_1V_1ELM_C_FWD(8,5,0,x5,0,x7) DSTOREC_1V_1ELM_C_FWD(9,5,1,x5,0,x7) // // End of this microkernel. LABEL(END_WRITE_MEM) // // End of execution. LABEL(END_EXEC) : : [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_a] "m" (rs_a), [cs_b] "m" (cs_b), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta) : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10","x11","x12","x13","x14", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15", "v16","v17","v18","v19","v20","v21","v22","v23", "v24","v25","v26","v27","v28","v29","v30","v31" ); } cython-blis-0.9.1/blis/_src/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c000066400000000000000000000367051427272030600303630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2021, The University of Tokyo

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// Supplementary fixed-size gemmsup.

#include "blis.h"
#include "assert.h"

// Label locality & misc.
#include "../../armv8a_asm_utils.h"

// Three element-wise vector FMAs sharing one A vector:
//   C0 += A * B0, C1 += A * B1, C2 += A * B2   (two doubles per vector).
#define DGEMM_1X3X2_NKER_SUBLOOP(C0,C1,C2,A,B0,B1,B2) \
" fmla v"#C0".2d, v"#A".2d, v"#B0".2d \n\t" \
" fmla v"#C1".2d, v"#A".2d, v"#B1".2d \n\t" \
" fmla v"#C2".2d, v"#A".2d, v"#B2".2d \n\t"

// Full 6x3 step: one SUBLOOP per A vector (A0..A5), with reloads of the A
// registers interleaved right after their last use.
//  - LOAD0 (load / noload) controls the reload of A0..A3 and the advance of
//    AADDR by 16 bytes (DGEMM_FWDA_K_*).
//  - LOAD1 (load / noload) controls the reload of A4 and A5.
//  - The "mov AELEMADDR, AADDR" in the middle is emitted unconditionally.
#define DGEMM_6X3X2_K_MKER_LOOP_PLAIN(C00,C01,C02,C10,C11,C12,C20,C21,C22,C30,C31,C32,C40,C41,C42,C50,C51,C52,A0,A1,A2,A3,A4,A5,B0,B1,B2,AADDR,AELEMADDR,AELEMST,LOAD0,LOAD1) \
  DGEMM_1X3X2_NKER_SUBLOOP(C00,C01,C02,A0,B0,B1,B2) \
  DGEMM_LOAD1V_K_ ##LOAD0 (A0,AELEMADDR,AELEMST) \
  DGEMM_1X3X2_NKER_SUBLOOP(C10,C11,C12,A1,B0,B1,B2) \
  DGEMM_LOAD1V_K_ ##LOAD0 (A1,AELEMADDR,AELEMST) \
  DGEMM_1X3X2_NKER_SUBLOOP(C20,C21,C22,A2,B0,B1,B2) \
  DGEMM_LOAD1V_K_ ##LOAD0 (A2,AELEMADDR,AELEMST) \
  DGEMM_1X3X2_NKER_SUBLOOP(C30,C31,C32,A3,B0,B1,B2) \
  DGEMM_LOAD1V_K_ ##LOAD0 (A3,AELEMADDR,AELEMST) \
  DGEMM_FWDA_K_ ##LOAD0 (AADDR) \
" mov "#AELEMADDR", "#AADDR" \n\t" \
  DGEMM_1X3X2_NKER_SUBLOOP(C40,C41,C42,A4,B0,B1,B2) \
  DGEMM_LOAD1V_K_ ##LOAD1 (A4,AELEMADDR,AELEMST) \
  DGEMM_1X3X2_NKER_SUBLOOP(C50,C51,C52,A5,B0,B1,B2) \
  DGEMM_LOAD1V_K_ ##LOAD1 (A5,AELEMADDR,AELEMST)

// Load one vector from [ELEMADDR] and step ELEMADDR by ELEMST, or nothing.
#define DGEMM_LOAD1V_K_noload(V,ELEMADDR,ELEMST)
#define DGEMM_LOAD1V_K_load(V,ELEMADDR,ELEMST) \
" ldr q"#V", [ "#ELEMADDR" ] \n\t" \
" add "#ELEMADDR", "#ELEMADDR", "#ELEMST" \n\t"

// Advance ADDR by one vector (16 bytes), or nothing.
#define DGEMM_FWDA_K_noload(ADDR)
#define DGEMM_FWDA_K_load(ADDR) \
" add "#ADDR", "#ADDR", #16 \n\t"

// For row-storage of C.
// Move one full vector plus one extra 64-bit lane (a 3-element row):
//  - the vector goes through DLOAD1V / DSTORE1V at CADDR + CSHIFT;
//  - CADDR is bumped by CSHIFT+16 for the single-lane ld1/st1 into/from
//    lane CIDX of v<CSCALAR>, then restored;
//  - finally CADDR is advanced by RSC.
#define DLOADC_1V_1ELM_R_FWD(C0,CSCALAR,CIDX,CADDR,CSHIFT,RSC) \
  DLOAD1V(C0,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", #"#CSHIFT"+16 \n\t" \
" ld1 {v"#CSCALAR".d}["#CIDX"], ["#CADDR"] \n\t" \
" sub "#CADDR", "#CADDR", #"#CSHIFT"+16 \n\t" \
" add "#CADDR", "#CADDR", "#RSC" \n\t"
#define DSTOREC_1V_1ELM_R_FWD(C0,CSCALAR,CIDX,CADDR,CSHIFT,RSC) \
  DSTORE1V(C0,CADDR,CSHIFT) \
" add "#CADDR", "#CADDR", #"#CSHIFT"+16 \n\t" \
" st1 {v"#CSCALAR".d}["#CIDX"], ["#CADDR"] \n\t" \
" sub "#CADDR", "#CADDR", #"#CSHIFT"+16 \n\t" \
" add "#CADDR", "#CADDR", "#RSC" \n\t"

// For column-storage of C.
#define DLOADC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ DLOAD2V(C0,C1,CADDR,CSHIFT) \ DLOAD1V(C2,CADDR,CSHIFT+32) \ " add "#CADDR", "#CADDR", "#CSC" \n\t" #define DSTOREC_3V_C_FWD(C0,C1,C2,CADDR,CSHIFT,CSC) \ DSTORE2V(C0,C1,CADDR,CSHIFT) \ DSTORE1V(C2,CADDR,CSHIFT+32) \ " add "#CADDR", "#CADDR", "#CSC" \n\t" #define DSCALE9V(V0,V1,V2,V3,V4,V5,V6,V7,V8,A,IDX) \ DSCALE4V(V0,V1,V2,V3,A,IDX) \ DSCALE4V(V4,V5,V6,V7,A,IDX) \ DSCALE1V(V8,A,IDX) #define DSCALEA9V(D0,D1,D2,D3,D4,D5,D6,D7,D8,S0,S1,S2,S3,S4,S5,S6,S7,S8,A,IDX) \ DSCALEA4V(D0,D1,D2,D3,S0,S1,S2,S3,A,IDX) \ DSCALEA4V(D4,D5,D6,D7,S4,S5,S6,S7,A,IDX) \ DSCALEA1V(D8,S8,A,IDX) void bli_dgemmsup_rd_armv8a_asm_6x3 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { assert( m0 == 6 ); assert( n0 == 3 ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_mker = k0 / 8; uint64_t k_left = k0 % 8; uint64_t rs_a = rs_a0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; assert( cs_a0 == 1 ); assert( rs_b0 == 1 ); __asm__ volatile ( " ldr x0, %[a] \n\t" " ldr x1, %[b] \n\t" " ldr x2, %[rs_a] \n\t" // Row-skip of A. " ldr x3, %[cs_b] \n\t" // Column-skip of B. " \n\t" " ldr x5, %[c] \n\t" " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. " \n\t" " \n\t" // Multiply some address skips by sizeof(double). 
" lsl x2, x2, #3 \n\t" // rs_a " lsl x3, x3, #3 \n\t" // cs_b " lsl x6, x6, #3 \n\t" // rs_c " lsl x7, x7, #3 \n\t" // cs_c " \n\t" " ldr x4, %[k_mker] \n\t" " ldr x8, %[k_left] \n\t" " \n\t" // Storage scheme: // V[ 0:17] <- C // V[18:23] <- B // V[24:31] <- A // Under this scheme, the following is defined: #define DGEMM_6X3X2_K_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,A4,A5,B0,B1,B2,AADDR,AELEMADDR,AELEMST,LOAD0,LOAD1) \ DGEMM_6X3X2_K_MKER_LOOP_PLAIN(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,A0,A1,A2,A3,A4,A5,B0,B1,B2,AADDR,AELEMADDR,AELEMST,LOAD0,LOAD1) // Load from memory. LABEL(LOAD_ABC) " \n\t" // No-microkernel early return is a must " cmp x4, #0 \n\t" // to avoid out-of-boundary read. BEQ(CLEAR_CCOLS) " \n\t" " mov x14, x0 \n\t" // Load A. " ldr q24, [x14] \n\t" " add x14, x14, x2 \n\t" " ldr q25, [x14] \n\t" " add x14, x14, x2 \n\t" " ldr q26, [x14] \n\t" " add x14, x14, x2 \n\t" " ldr q27, [x14] \n\t" " add x14, x14, x2 \n\t" " ldr q28, [x14] \n\t" " add x14, x14, x2 \n\t" " ldr q29, [x14] \n\t" " add x0, x0, #16 \n\t" " mov x14, x0 \n\t" " ldr q30, [x14] \n\t" " add x14, x14, x2 \n\t" " ldr q31, [x14] \n\t" " add x14, x14, x2 \n\t" " \n\t" " mov x11, x1 \n\t" // Load B. " ldr q18, [x11] \n\t" " add x11, x11, x3 \n\t" " ldr q19, [x11] \n\t" " add x11, x11, x3 \n\t" " ldr q20, [x11] \n\t" " add x1, x1, #16 \n\t" " mov x11, x1 \n\t" " ldr q21, [x11] \n\t" " add x11, x11, x3 \n\t" " ldr q22, [x11] \n\t" " add x11, x11, x3 \n\t" " ldr q23, [x11] \n\t" " add x1, x1, #16 \n\t" LABEL(CLEAR_CCOLS) CLEAR8V(0,1,2,3,4,5,6,7) CLEAR8V(8,9,10,11,12,13,14,15) CLEAR2V(16,17) // No-microkernel early return, once again. 
BEQ(K_LEFT_LOOP) // // Microkernel is defined here as: #define DGEMM_6X3X2_K_MKER_LOOP_PLAIN_LOC_FWD(A0,A1,A2,A3,A4,A5,B0,B1,B2) \ DGEMM_6X3X2_K_MKER_LOOP_PLAIN_LOC(A0,A1,A2,A3,A4,A5,B0,B1,B2,x0,x14,x2,load,load) \ "mov x11, x1 \n\t" \ "ldr q"#B0", [x11] \n\t" \ "add x11, x11, x3 \n\t" \ "ldr q"#B1", [x11] \n\t" \ "add x11, x11, x3 \n\t" \ "ldr q"#B2", [x11] \n\t" \ "add x1, x1, #16 \n\t" \ // Start microkernel loop. LABEL(K_MKER_LOOP) DGEMM_6X3X2_K_MKER_LOOP_PLAIN_LOC_FWD(24,25,26,27,28,29,18,19,20) DGEMM_6X3X2_K_MKER_LOOP_PLAIN_LOC_FWD(30,31,24,25,26,27,21,22,23) " \n\t" // Decrease counter before final replica. " subs x4, x4, #1 \n\t" // Branch early to avoid reading excess mem. BEQ(FIN_MKER_LOOP) DGEMM_6X3X2_K_MKER_LOOP_PLAIN_LOC_FWD(28,29,30,31,24,25,18,19,20) DGEMM_6X3X2_K_MKER_LOOP_PLAIN_LOC_FWD(26,27,28,29,30,31,21,22,23) BRANCH(K_MKER_LOOP) // // Final microkernel loop. LABEL(FIN_MKER_LOOP) DGEMM_6X3X2_K_MKER_LOOP_PLAIN_LOC(28,29,30,31,24,25,18,19,20,x0,x14,x2,load,noload) DGEMM_6X3X2_K_MKER_LOOP_PLAIN_LOC(26,27,28,29,30,31,21,22,23,xzr,xzr,xzr,noload,noload) // // If major kernel is executed, // an additional depth-summation is required. " faddp v0.2d, v0.2d, v3.2d \n\t" // Column 0 Prt 0. " faddp v1.2d, v1.2d, v4.2d \n\t" // Column 1 Prt 0. " faddp v2.2d, v2.2d, v5.2d \n\t" // Column 2 Prt 0. " faddp v3.2d, v6.2d, v9.2d \n\t" // Column 0 Prt 1. " faddp v4.2d, v7.2d, v10.2d \n\t" // Column 1 Prt 1. " faddp v5.2d, v8.2d, v11.2d \n\t" // Column 2 Prt 1. " faddp v6.2d, v12.2d, v15.2d \n\t" // Column 0 Prt 2. " faddp v7.2d, v13.2d, v16.2d \n\t" // Column 1 Prt 2. " faddp v8.2d, v14.2d, v17.2d \n\t" // Column 2 Prt 2. " \n\t" // Loops left behind microkernels. LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of exec. BEQ(WRITE_MEM_PREP) " mov x14, x0 \n\t" // Load A column. 
" ld1 {v24.d}[0], [x14], x2 \n\t" " ld1 {v24.d}[1], [x14], x2 \n\t" " ld1 {v25.d}[0], [x14], x2 \n\t" " ld1 {v25.d}[1], [x14], x2 \n\t" " ld1 {v26.d}[0], [x14], x2 \n\t" " ld1 {v26.d}[1], [x14], x2 \n\t" " add x0, x0, #8 \n\t" " mov x11, x1 \n\t" // Load B row. " ld1 {v28.d}[0], [x11], x3 \n\t" " ld1 {v28.d}[1], [x11], x3 \n\t" " ld1 {v29.d}[0], [x11], x3 \n\t" " add x1, x1, #8 \n\t" " fmla v0.2d, v24.2d, v28.d[0] \n\t" " fmla v3.2d, v25.2d, v28.d[0] \n\t" " fmla v6.2d, v26.2d, v28.d[0] \n\t" " fmla v1.2d, v24.2d, v28.d[1] \n\t" " fmla v4.2d, v25.2d, v28.d[1] \n\t" " fmla v7.2d, v26.2d, v28.d[1] \n\t" " fmla v2.2d, v24.2d, v29.d[0] \n\t" " fmla v5.2d, v25.2d, v29.d[0] \n\t" " fmla v8.2d, v26.2d, v29.d[0] \n\t" " sub x8, x8, #1 \n\t" BRANCH(K_LEFT_LOOP) // // Scale and write to memory. LABEL(WRITE_MEM_PREP) " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" " ld1r {v30.2d}, [x4] \n\t" // Load alpha & beta (value). " ld1r {v31.2d}, [x8] \n\t" DSCALE9V(0,1,2,3,4,5,6,7,8,30,0) " \n\t" " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. " cmp x7, #8 \n\t" // Check for column-storage. BNE(WRITE_MEM_C) // // C storage in rows. 
LABEL(WRITE_MEM_R) " trn1 v20.2d, v0.2d, v1.2d \n\t" " trn2 v21.2d, v0.2d, v1.2d \n\t" " trn1 v22.2d, v3.2d, v4.2d \n\t" " trn2 v23.2d, v3.2d, v4.2d \n\t" " trn1 v24.2d, v6.2d, v7.2d \n\t" " trn2 v25.2d, v6.2d, v7.2d \n\t" " fcmp d31, #0.0 \n\t" BEQ(ZERO_BETA_R) DLOADC_1V_1ELM_R_FWD(10,26,0,x9,0,x6) DLOADC_1V_1ELM_R_FWD(11,26,1,x9,0,x6) DLOADC_1V_1ELM_R_FWD(12,27,0,x9,0,x6) DLOADC_1V_1ELM_R_FWD(13,27,1,x9,0,x6) DLOADC_1V_1ELM_R_FWD(14,28,0,x9,0,x6) DLOADC_1V_1ELM_R_FWD(15,28,1,x9,0,x6) DSCALEA9V(20,21,22,23,24,25,2,5,8,10,11,12,13,14,15,26,27,28,31,0) LABEL(ZERO_BETA_R) DSTOREC_1V_1ELM_R_FWD(20,2,0,x5,0,x6) DSTOREC_1V_1ELM_R_FWD(21,2,1,x5,0,x6) DSTOREC_1V_1ELM_R_FWD(22,5,0,x5,0,x6) DSTOREC_1V_1ELM_R_FWD(23,5,1,x5,0,x6) DSTOREC_1V_1ELM_R_FWD(24,8,0,x5,0,x6) DSTOREC_1V_1ELM_R_FWD(25,8,1,x5,0,x6) BRANCH(END_WRITE_MEM) // // C storage in columns. LABEL(WRITE_MEM_C) " fcmp d31, #0.0 \n\t" BEQ(ZERO_BETA_C) DLOADC_3V_C_FWD(12,15,18,x9,0,x7) DLOADC_3V_C_FWD(13,16,19,x9,0,x7) DLOADC_3V_C_FWD(14,17,20,x9,0,x7) DSCALEA9V(0,1,2,3,4,5,6,7,8,12,13,14,15,16,17,18,19,20,31,0) LABEL(ZERO_BETA_C) DSTOREC_3V_C_FWD(0,3,6,x5,0,x7) DSTOREC_3V_C_FWD(1,4,7,x5,0,x7) DSTOREC_3V_C_FWD(2,5,8,x5,0,x7) // // End of this microkernel. LABEL(END_WRITE_MEM) // // End of execution. 
LABEL(END_EXEC) : : [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_a] "m" (rs_a), [cs_b] "m" (cs_b), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), [k_mker] "m" (k_mker), [k_left] "m" (k_left), [alpha] "m" (alpha), [beta] "m" (beta) : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10","x11","x12","x13","x14", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14","v15", "v16","v17","v18","v19","v20","v21","v22","v23", "v24","v25","v26","v27","v28","v29","v30","v31" ); } cython-blis-0.9.1/blis/_src/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c000066400000000000000000000351631427272030600303730ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2021, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
/*
 * bli_dgemmsup_rd_armv8a_int_2x8
 *
 * Double-precision "rd" (dot-product style) gemmsup edge kernel written with
 * NEON intrinsics.  It updates an m0 x n0 tile of C, with m0 <= 2 and
 * n0 <= 8 (both asserted):
 *
 *     C := beta * C + alpha * A * B
 *
 * Layout requirements visible in the code:
 *   - cs_a == 1 and rs_b == 1 (asserted): rows of A and columns of B are
 *     contiguous along k, so two consecutive k-elements are fetched with one
 *     128-bit load.
 *   - cs_c == 1 selects the row-storage write-back.  Any other cs_c takes the
 *     column-storage path, which loads/stores two consecutive doubles per
 *     column of C (NOTE(review): this looks like it relies on rs_c == 1 --
 *     confirm against the caller).
 *
 * Each accumulator vc_ij holds two partial sums (even/odd k) for C(i,j); the
 * pairs are folded with vpaddq_f64 just before write-back.
 *
 * As this kernel needs only about half of the 32 vector registers
 * (22 are named below), implementing it with intrinsics should be all right.
 * c.f. https://www.youtube.com/watch?v=R2hQOVjRwVE .
 *
 * Parameters:
 *   conja, conjb   unused in this real-domain kernel.
 *   m0, n0, k0     tile dimensions.
 *   alpha, beta    pointers to the scalars.  When *beta == 0.0 the values
 *                  loaded from C are not accumulated into the result.
 *   a, rs_a, cs_a  matrix A and its strides.
 *   b, rs_b, cs_b  matrix B and its strides.
 *   c, rs_c, cs_c  matrix C and its strides.
 *   data, cntx     unused here.
 */
void bli_dgemmsup_rd_armv8a_int_2x8
    (
      conj_t              conja,
      conj_t              conjb,
      dim_t               m0,
      dim_t               n0,
      dim_t               k0,
      double*    restrict alpha,
      double*    restrict a, inc_t rs_a, inc_t cs_a,
      double*    restrict b, inc_t rs_b, inc_t cs_b,
      double*    restrict beta,
      double*    restrict c, inc_t rs_c, inc_t cs_c,
      auxinfo_t* restrict data,
      cntx_t*    restrict cntx
    )
{
    assert( m0 <= 2 );
    assert( n0 <= 8 );

    // Nothing to update for an empty tile.  The code below loads/stores the
    // first row and column unconditionally, so it must not run in that case.
    if ( m0 < 1 || n0 < 1 ) return;

    double *a_loc = a;
    double *b_loc = b;
    double *c_loc = c;

    uint64_t k_mker = k0 / 2;
    uint64_t k_left = k0 % 2;
    uint64_t b_iszr = ( *beta == 0.0 );

    assert( cs_a == 1 );
    assert( rs_b == 1 );

    const float64x2_t vzero = vdupq_n_f64( 0 );

    // Registers used to store a 2x8x2 block of C (summing the last dimension).
    // Total: 22 specified.
    float64x2_t vc_00 = vzero, vc_01 = vzero, vc_02 = vzero, vc_03 = vzero;
    float64x2_t vc_04 = vzero, vc_05 = vzero, vc_06 = vzero, vc_07 = vzero;
    float64x2_t vc_10 = vzero, vc_11 = vzero, vc_12 = vzero, vc_13 = vzero;
    float64x2_t vc_14 = vzero, vc_15 = vzero, vc_16 = vzero, vc_17 = vzero;

    // Operand registers start out zeroed: for partial tiles some of them are
    // never loaded in the main loop but still feed (discarded) FMAs, and
    // reading them uninitialized would be undefined behaviour.
    float64x2_t va_0 = vzero, va_1 = vzero;
    float64x2_t vb_0 = vzero, vb_1 = vzero, vb_2 = vzero, vb_3 = vzero;

    // Main loop: two k-iterations per pass.
    PRAGMA_UNROLL
    for ( ; k_mker > 0; --k_mker )
    {
        // if ( m0 > 0 )
        va_0 = vld1q_f64( a_loc + rs_a * 0 );
        if ( m0 > 1 ) va_1 = vld1q_f64( a_loc + rs_a * 1 );

        // if ( n0 > 0 )
        vb_0 = vld1q_f64( b_loc + cs_b * 0 );
        if ( n0 > 1 ) vb_1 = vld1q_f64( b_loc + cs_b * 1 );
        if ( n0 > 2 ) vb_2 = vld1q_f64( b_loc + cs_b * 2 );
        if ( n0 > 3 ) vb_3 = vld1q_f64( b_loc + cs_b * 3 );

        vc_00 = vfmaq_f64( vc_00, va_0, vb_0 );
        vc_01 = vfmaq_f64( vc_01, va_0, vb_1 );
        vc_02 = vfmaq_f64( vc_02, va_0, vb_2 );
        vc_03 = vfmaq_f64( vc_03, va_0, vb_3 );
        if ( m0 > 1 )
        {
            vc_10 = vfmaq_f64( vc_10, va_1, vb_0 );
            vc_11 = vfmaq_f64( vc_11, va_1, vb_1 );
            vc_12 = vfmaq_f64( vc_12, va_1, vb_2 );
            vc_13 = vfmaq_f64( vc_13, va_1, vb_3 );
        }

        // Columns 4..7 reuse the same four B registers.
        if ( n0 > 4 )
        {
            vb_0 = vld1q_f64( b_loc + cs_b * 4 );
            if ( n0 > 5 ) vb_1 = vld1q_f64( b_loc + cs_b * 5 );
            if ( n0 > 6 ) vb_2 = vld1q_f64( b_loc + cs_b * 6 );
            if ( n0 > 7 ) vb_3 = vld1q_f64( b_loc + cs_b * 7 );

            vc_04 = vfmaq_f64( vc_04, va_0, vb_0 );
            vc_05 = vfmaq_f64( vc_05, va_0, vb_1 );
            if ( n0 > 6 )
            {
                vc_06 = vfmaq_f64( vc_06, va_0, vb_2 );
                vc_07 = vfmaq_f64( vc_07, va_0, vb_3 );
            }
            if ( m0 > 1 )
            {
                vc_14 = vfmaq_f64( vc_14, va_1, vb_0 );
                vc_15 = vfmaq_f64( vc_15, va_1, vb_1 );
                if ( n0 > 6 )
                {
                    vc_16 = vfmaq_f64( vc_16, va_1, vb_2 );
                    vc_17 = vfmaq_f64( vc_17, va_1, vb_3 );
                }
            }
        }

        a_loc += 2;
        b_loc += 2;
    }

    // Pay no care for O(1) details.
    // The remainder loop only fills lane 0, so lane 1 of every operand must
    // be zero again before it runs.
    va_0 = vzero;
    va_1 = vzero;
    vb_0 = vzero;
    vb_1 = vzero;
    vb_2 = vzero;
    vb_3 = vzero;

    // Remainder loop: at most one leftover k-iteration (k0 % 2).
    PRAGMA_NOUNROLL
    for ( ; k_left > 0; --k_left )
    {
        // if ( m0 > 0 )
        va_0 = vld1q_lane_f64( a_loc + rs_a * 0, va_0, 0 );
        if ( m0 > 1 ) va_1 = vld1q_lane_f64( a_loc + rs_a * 1, va_1, 0 );

        // if ( n0 > 0 )
        vb_0 = vld1q_lane_f64( b_loc + cs_b * 0, vb_0, 0 );
        if ( n0 > 1 ) vb_1 = vld1q_lane_f64( b_loc + cs_b * 1, vb_1, 0 );
        if ( n0 > 2 ) vb_2 = vld1q_lane_f64( b_loc + cs_b * 2, vb_2, 0 );
        if ( n0 > 3 ) vb_3 = vld1q_lane_f64( b_loc + cs_b * 3, vb_3, 0 );

        vc_00 = vfmaq_f64( vc_00, va_0, vb_0 );
        vc_01 = vfmaq_f64( vc_01, va_0, vb_1 );
        vc_02 = vfmaq_f64( vc_02, va_0, vb_2 );
        vc_03 = vfmaq_f64( vc_03, va_0, vb_3 );
        vc_10 = vfmaq_f64( vc_10, va_1, vb_0 );
        vc_11 = vfmaq_f64( vc_11, va_1, vb_1 );
        vc_12 = vfmaq_f64( vc_12, va_1, vb_2 );
        vc_13 = vfmaq_f64( vc_13, va_1, vb_3 );

        if ( n0 > 4 ) vb_0 = vld1q_lane_f64( b_loc + cs_b * 4, vb_0, 0 );
        if ( n0 > 5 ) vb_1 = vld1q_lane_f64( b_loc + cs_b * 5, vb_1, 0 );
        if ( n0 > 6 ) vb_2 = vld1q_lane_f64( b_loc + cs_b * 6, vb_2, 0 );
        if ( n0 > 7 ) vb_3 = vld1q_lane_f64( b_loc + cs_b * 7, vb_3, 0 );

        // For n0 <= 4 these accumulate values that are never stored.
        vc_04 = vfmaq_f64( vc_04, va_0, vb_0 );
        vc_05 = vfmaq_f64( vc_05, va_0, vb_1 );
        vc_06 = vfmaq_f64( vc_06, va_0, vb_2 );
        vc_07 = vfmaq_f64( vc_07, va_0, vb_3 );
        vc_14 = vfmaq_f64( vc_14, va_1, vb_0 );
        vc_15 = vfmaq_f64( vc_15, va_1, vb_1 );
        vc_16 = vfmaq_f64( vc_16, va_1, vb_2 );
        vc_17 = vfmaq_f64( vc_17, va_1, vb_3 );

        a_loc += 1;
        b_loc += 1;
    }

    // Load alpha and beta.
    // Note that here vb is used for alpha, in contrast to other kernels.
    vb_0 = vld1q_dup_f64( alpha );
    va_0 = vld1q_dup_f64( beta );

    // Scale.
    vc_00 = vmulq_f64( vc_00, vb_0 );
    vc_01 = vmulq_f64( vc_01, vb_0 );
    vc_02 = vmulq_f64( vc_02, vb_0 );
    vc_03 = vmulq_f64( vc_03, vb_0 );
    vc_04 = vmulq_f64( vc_04, vb_0 );
    vc_05 = vmulq_f64( vc_05, vb_0 );
    vc_06 = vmulq_f64( vc_06, vb_0 );
    vc_07 = vmulq_f64( vc_07, vb_0 );
    vc_10 = vmulq_f64( vc_10, vb_0 );
    vc_11 = vmulq_f64( vc_11, vb_0 );
    vc_12 = vmulq_f64( vc_12, vb_0 );
    vc_13 = vmulq_f64( vc_13, vb_0 );
    vc_14 = vmulq_f64( vc_14, vb_0 );
    vc_15 = vmulq_f64( vc_15, vb_0 );
    vc_16 = vmulq_f64( vc_16, vb_0 );
    vc_17 = vmulq_f64( vc_17, vb_0 );

    if ( cs_c == 1 )
    {
        // Row-storage.
        // Pairwise add folds the two partial sums of adjacent columns, so
        // each result vector holds two horizontally neighbouring C elements.
        vc_00 = vpaddq_f64( vc_00, vc_01 );
        vc_02 = vpaddq_f64( vc_02, vc_03 );
        vc_04 = vpaddq_f64( vc_04, vc_05 );
        vc_06 = vpaddq_f64( vc_06, vc_07 );
        vc_10 = vpaddq_f64( vc_10, vc_11 );
        vc_12 = vpaddq_f64( vc_12, vc_13 );
        vc_14 = vpaddq_f64( vc_14, vc_15 );
        vc_16 = vpaddq_f64( vc_16, vc_17 );

        // Row 0: load C (full vector, or lane 0 only at an odd edge).
        if      ( n0 > 1 ) vb_0 = vld1q_f64     ( c_loc + 0 * rs_c + 0 );
        else if ( n0 > 0 ) vb_0 = vld1q_lane_f64( c_loc + 0 * rs_c + 0, vb_0, 0 );
        if      ( n0 > 3 ) vb_1 = vld1q_f64     ( c_loc + 0 * rs_c + 2 );
        else if ( n0 > 2 ) vb_1 = vld1q_lane_f64( c_loc + 0 * rs_c + 2, vb_1, 0 );
        if      ( n0 > 5 ) vb_2 = vld1q_f64     ( c_loc + 0 * rs_c + 4 );
        else if ( n0 > 4 ) vb_2 = vld1q_lane_f64( c_loc + 0 * rs_c + 4, vb_2, 0 );
        if      ( n0 > 7 ) vb_3 = vld1q_f64     ( c_loc + 0 * rs_c + 6 );
        else if ( n0 > 6 ) vb_3 = vld1q_lane_f64( c_loc + 0 * rs_c + 6, vb_3, 0 );

        // Row 0: add beta * C unless beta is exactly zero.
        if ( !b_iszr )
        {
            vc_00 = vfmaq_f64( vc_00, va_0, vb_0 );
            vc_02 = vfmaq_f64( vc_02, va_0, vb_1 );
            vc_04 = vfmaq_f64( vc_04, va_0, vb_2 );
            vc_06 = vfmaq_f64( vc_06, va_0, vb_3 );
        }

        // Row 0: store.
        if      ( n0 > 1 ) vst1q_f64     ( c_loc + 0 * rs_c + 0, vc_00 );
        else if ( n0 > 0 ) vst1q_lane_f64( c_loc + 0 * rs_c + 0, vc_00, 0 );
        if      ( n0 > 3 ) vst1q_f64     ( c_loc + 0 * rs_c + 2, vc_02 );
        else if ( n0 > 2 ) vst1q_lane_f64( c_loc + 0 * rs_c + 2, vc_02, 0 );
        if      ( n0 > 5 ) vst1q_f64     ( c_loc + 0 * rs_c + 4, vc_04 );
        else if ( n0 > 4 ) vst1q_lane_f64( c_loc + 0 * rs_c + 4, vc_04, 0 );
        if      ( n0 > 7 ) vst1q_f64     ( c_loc + 0 * rs_c + 6, vc_06 );
        else if ( n0 > 6 ) vst1q_lane_f64( c_loc + 0 * rs_c + 6, vc_06, 0 );

        if ( m0 > 1 )
        {
            // Row 1: load.
            if      ( n0 > 1 ) vb_0 = vld1q_f64     ( c_loc + 1 * rs_c + 0 );
            else if ( n0 > 0 ) vb_0 = vld1q_lane_f64( c_loc + 1 * rs_c + 0, vb_0, 0 );
            if      ( n0 > 3 ) vb_1 = vld1q_f64     ( c_loc + 1 * rs_c + 2 );
            else if ( n0 > 2 ) vb_1 = vld1q_lane_f64( c_loc + 1 * rs_c + 2, vb_1, 0 );
            if      ( n0 > 5 ) vb_2 = vld1q_f64     ( c_loc + 1 * rs_c + 4 );
            else if ( n0 > 4 ) vb_2 = vld1q_lane_f64( c_loc + 1 * rs_c + 4, vb_2, 0 );
            if      ( n0 > 7 ) vb_3 = vld1q_f64     ( c_loc + 1 * rs_c + 6 );
            else if ( n0 > 6 ) vb_3 = vld1q_lane_f64( c_loc + 1 * rs_c + 6, vb_3, 0 );

            // Row 1: add beta * C.
            if ( !b_iszr )
            {
                vc_10 = vfmaq_f64( vc_10, va_0, vb_0 );
                vc_12 = vfmaq_f64( vc_12, va_0, vb_1 );
                vc_14 = vfmaq_f64( vc_14, va_0, vb_2 );
                vc_16 = vfmaq_f64( vc_16, va_0, vb_3 );
            }

            // Row 1: store.
            if      ( n0 > 1 ) vst1q_f64     ( c_loc + 1 * rs_c + 0, vc_10 );
            else if ( n0 > 0 ) vst1q_lane_f64( c_loc + 1 * rs_c + 0, vc_10, 0 );
            if      ( n0 > 3 ) vst1q_f64     ( c_loc + 1 * rs_c + 2, vc_12 );
            else if ( n0 > 2 ) vst1q_lane_f64( c_loc + 1 * rs_c + 2, vc_12, 0 );
            if      ( n0 > 5 ) vst1q_f64     ( c_loc + 1 * rs_c + 4, vc_14 );
            else if ( n0 > 4 ) vst1q_lane_f64( c_loc + 1 * rs_c + 4, vc_14, 0 );
            if      ( n0 > 7 ) vst1q_f64     ( c_loc + 1 * rs_c + 6, vc_16 );
            else if ( n0 > 6 ) vst1q_lane_f64( c_loc + 1 * rs_c + 6, vc_16, 0 );
        }
    }
    else
    {
        // Column-storage.
        // Pairwise add folds row 0 and row 1 of the same column, so each
        // result vector holds two vertically neighbouring C elements.
        vc_00 = vpaddq_f64( vc_00, vc_10 );
        vc_01 = vpaddq_f64( vc_01, vc_11 );
        vc_02 = vpaddq_f64( vc_02, vc_12 );
        vc_03 = vpaddq_f64( vc_03, vc_13 );
        vc_04 = vpaddq_f64( vc_04, vc_14 );
        vc_05 = vpaddq_f64( vc_05, vc_15 );
        vc_06 = vpaddq_f64( vc_06, vc_16 );
        vc_07 = vpaddq_f64( vc_07, vc_17 );

        if ( m0 > 1 )
        {
            // Two rows: whole-vector loads/stores per column.

            // Columns 0..3.
            // if ( n0 > 0 )
            vb_0 = vld1q_f64( c_loc + 0 + 0 * cs_c );
            if ( n0 > 1 ) vb_1 = vld1q_f64( c_loc + 0 + 1 * cs_c );
            if ( n0 > 2 ) vb_2 = vld1q_f64( c_loc + 0 + 2 * cs_c );
            if ( n0 > 3 ) vb_3 = vld1q_f64( c_loc + 0 + 3 * cs_c );

            if ( !b_iszr )
            {
                vc_00 = vfmaq_f64( vc_00, va_0, vb_0 );
                vc_01 = vfmaq_f64( vc_01, va_0, vb_1 );
                vc_02 = vfmaq_f64( vc_02, va_0, vb_2 );
                vc_03 = vfmaq_f64( vc_03, va_0, vb_3 );
            }

            vst1q_f64( c_loc + 0 + 0 * cs_c, vc_00 );
            if ( n0 > 1 ) vst1q_f64( c_loc + 0 + 1 * cs_c, vc_01 );
            if ( n0 > 2 ) vst1q_f64( c_loc + 0 + 2 * cs_c, vc_02 );
            if ( n0 > 3 ) vst1q_f64( c_loc + 0 + 3 * cs_c, vc_03 );

            // Columns 4..7.
            if ( n0 > 4 ) vb_0 = vld1q_f64( c_loc + 0 + 4 * cs_c );
            if ( n0 > 5 ) vb_1 = vld1q_f64( c_loc + 0 + 5 * cs_c );
            if ( n0 > 6 ) vb_2 = vld1q_f64( c_loc + 0 + 6 * cs_c );
            if ( n0 > 7 ) vb_3 = vld1q_f64( c_loc + 0 + 7 * cs_c );

            if ( !b_iszr )
            {
                vc_04 = vfmaq_f64( vc_04, va_0, vb_0 );
                vc_05 = vfmaq_f64( vc_05, va_0, vb_1 );
                vc_06 = vfmaq_f64( vc_06, va_0, vb_2 );
                vc_07 = vfmaq_f64( vc_07, va_0, vb_3 );
            }

            if ( n0 > 4 ) vst1q_f64( c_loc + 0 + 4 * cs_c, vc_04 );
            if ( n0 > 5 ) vst1q_f64( c_loc + 0 + 5 * cs_c, vc_05 );
            if ( n0 > 6 ) vst1q_f64( c_loc + 0 + 6 * cs_c, vc_06 );
            if ( n0 > 7 ) vst1q_f64( c_loc + 0 + 7 * cs_c, vc_07 );
        }
        else
        {
            // Single row: only lane 0 of each column vector is touched.

            // Columns 0..3.
            // if ( n0 > 0 )
            vb_0 = vld1q_lane_f64( c_loc + 0 + 0 * cs_c, vb_0, 0 );
            if ( n0 > 1 ) vb_1 = vld1q_lane_f64( c_loc + 0 + 1 * cs_c, vb_1, 0 );
            if ( n0 > 2 ) vb_2 = vld1q_lane_f64( c_loc + 0 + 2 * cs_c, vb_2, 0 );
            if ( n0 > 3 ) vb_3 = vld1q_lane_f64( c_loc + 0 + 3 * cs_c, vb_3, 0 );

            if ( !b_iszr )
            {
                vc_00 = vfmaq_f64( vc_00, va_0, vb_0 );
                vc_01 = vfmaq_f64( vc_01, va_0, vb_1 );
                vc_02 = vfmaq_f64( vc_02, va_0, vb_2 );
                vc_03 = vfmaq_f64( vc_03, va_0, vb_3 );
            }

            vst1q_lane_f64( c_loc + 0 + 0 * cs_c, vc_00, 0 );
            if ( n0 > 1 ) vst1q_lane_f64( c_loc + 0 + 1 * cs_c, vc_01, 0 );
            if ( n0 > 2 ) vst1q_lane_f64( c_loc + 0 + 2 * cs_c, vc_02, 0 );
            if ( n0 > 3 ) vst1q_lane_f64( c_loc + 0 + 3 * cs_c, vc_03, 0 );

            // Columns 4..7.
            if ( n0 > 4 ) vb_0 = vld1q_lane_f64( c_loc + 0 + 4 * cs_c, vb_0, 0 );
            if ( n0 > 5 ) vb_1 = vld1q_lane_f64( c_loc + 0 + 5 * cs_c, vb_1, 0 );
            if ( n0 > 6 ) vb_2 = vld1q_lane_f64( c_loc + 0 + 6 * cs_c, vb_2, 0 );
            if ( n0 > 7 ) vb_3 = vld1q_lane_f64( c_loc + 0 + 7 * cs_c, vb_3, 0 );

            if ( !b_iszr )
            {
                vc_04 = vfmaq_f64( vc_04, va_0, vb_0 );
                vc_05 = vfmaq_f64( vc_05, va_0, vb_1 );
                vc_06 = vfmaq_f64( vc_06, va_0, vb_2 );
                vc_07 = vfmaq_f64( vc_07, va_0, vb_3 );
            }

            if ( n0 > 4 ) vst1q_lane_f64( c_loc + 0 + 4 * cs_c, vc_04, 0 );
            if ( n0 > 5 ) vst1q_lane_f64( c_loc + 0 + 5 * cs_c, vc_05, 0 );
            if ( n0 > 6 ) vst1q_lane_f64( c_loc + 0 + 6 * cs_c, vc_06, 0 );
            if ( n0 > 7 ) vst1q_lane_f64( c_loc + 0 + 7 * cs_c, vc_07, 0 );
        }
    }
}
/*
 * bli_dgemmsup_rd_armv8a_int_3x4
 *
 * Double-precision "rd" (dot-product style) gemmsup edge kernel written with
 * NEON intrinsics.  It updates an m0 x n0 tile of C, with m0 <= 3 and
 * n0 <= 4 (both asserted):
 *
 *     C := beta * C + alpha * A * B
 *
 * Layout requirements visible in the code:
 *   - cs_a == 1 and rs_b == 1 (asserted): rows of A and columns of B are
 *     contiguous along k, so two consecutive k-elements are fetched with one
 *     128-bit load.
 *   - cs_c == 1 selects the row-storage write-back.  Any other cs_c takes the
 *     column-storage path, which addresses row i of a column as c + i
 *     (NOTE(review): this looks like it relies on rs_c == 1 -- confirm
 *     against the caller).
 *
 * Each accumulator vc_ij holds two partial sums (even/odd k) for C(i,j).
 * They are folded with vpaddq_f64 so that vc_i0 ends up holding
 * { C(i,0), C(i,1) } and vc_i2 holds { C(i,2), C(i,3) }.
 *
 * As this kernel needs only about half of the 32 vector registers,
 * implementing it with intrinsics should be all right.
 * c.f. https://www.youtube.com/watch?v=R2hQOVjRwVE .
 *
 * Parameters:
 *   conja, conjb   unused in this real-domain kernel.
 *   m0, n0, k0     tile dimensions.
 *   alpha, beta    pointers to the scalars.  When *beta == 0.0 the values
 *                  loaded from C are not accumulated into the result.
 *   a, rs_a, cs_a  matrix A and its strides.
 *   b, rs_b, cs_b  matrix B and its strides.
 *   c, rs_c, cs_c  matrix C and its strides.
 *   data, cntx     unused here.
 */
void bli_dgemmsup_rd_armv8a_int_3x4
    (
      conj_t              conja,
      conj_t              conjb,
      dim_t               m0,
      dim_t               n0,
      dim_t               k0,
      double*    restrict alpha,
      double*    restrict a, inc_t rs_a, inc_t cs_a,
      double*    restrict b, inc_t rs_b, inc_t cs_b,
      double*    restrict beta,
      double*    restrict c, inc_t rs_c, inc_t cs_c,
      auxinfo_t* restrict data,
      cntx_t*    restrict cntx
    )
{
    // if ( m0 == 3 && n0 == 4 )
    // {
    //   // Use fixed-size version if it is full 3x4.
    //   bli_dgemmsup_rd_armv8a_asm_3x4
    //   (
    //     conja, conjb, m0, n0, k0,
    //     alpha, a, rs_a, cs_a, b, rs_b, cs_b,
    //     beta, c, rs_c, cs_c, data, cntx
    //   );
    //   return;
    // }

    assert( m0 <= 3 );
    assert( n0 <= 4 );

    // Nothing to update for an empty tile.  The code below loads/stores the
    // first row and column unconditionally, so it must not run in that case.
    if ( m0 < 1 || n0 < 1 ) return;

    double *a_loc = a;
    double *b_loc = b;
    double *c_loc = c;

    uint64_t k_mker = k0 / 2;
    uint64_t k_left = k0 % 2;
    uint64_t b_iszr = ( *beta == 0.0 );

    assert( cs_a == 1 );
    assert( rs_b == 1 );

    const float64x2_t vzero = vdupq_n_f64( 0 );

    // Registers used to store a 3x4x2 block of C (summing the last dimension).
    float64x2_t vc_00 = vzero, vc_01 = vzero, vc_02 = vzero, vc_03 = vzero;
    float64x2_t vc_10 = vzero, vc_11 = vzero, vc_12 = vzero, vc_13 = vzero;
    float64x2_t vc_20 = vzero, vc_21 = vzero, vc_22 = vzero, vc_23 = vzero;

    // Operand registers start out zeroed: for partial tiles some of them are
    // never loaded in the main loop but still feed (discarded) FMAs, and
    // reading them uninitialized would be undefined behaviour.
    float64x2_t va_0 = vzero, va_1 = vzero, va_2 = vzero;
    float64x2_t vb_0 = vzero, vb_1 = vzero, vb_2 = vzero, vb_3 = vzero;

    // Main loop: two k-iterations per pass.
    PRAGMA_UNROLL
    for ( ; k_mker > 0; --k_mker )
    {
        // if ( m0 > 0 )
        va_0 = vld1q_f64( a_loc + rs_a * 0 );
        if ( m0 > 1 ) va_1 = vld1q_f64( a_loc + rs_a * 1 );
        if ( m0 > 2 ) va_2 = vld1q_f64( a_loc + rs_a * 2 );

        // if ( n0 > 0 )
        vb_0 = vld1q_f64( b_loc + cs_b * 0 );
        if ( n0 > 1 ) vb_1 = vld1q_f64( b_loc + cs_b * 1 );
        if ( n0 > 2 ) vb_2 = vld1q_f64( b_loc + cs_b * 2 );
        if ( n0 > 3 ) vb_3 = vld1q_f64( b_loc + cs_b * 3 );

        a_loc += 2;
        b_loc += 2;

        // 1-column case.
        if ( n0 == 1 )
        {
            vc_00 = vfmaq_f64( vc_00, va_0, vb_0 );
            vc_10 = vfmaq_f64( vc_10, va_1, vb_0 );
            vc_20 = vfmaq_f64( vc_20, va_2, vb_0 );
            continue;
        }

        vc_00 = vfmaq_f64( vc_00, va_0, vb_0 );
        vc_01 = vfmaq_f64( vc_01, va_0, vb_1 );
        vc_02 = vfmaq_f64( vc_02, va_0, vb_2 );
        vc_03 = vfmaq_f64( vc_03, va_0, vb_3 );
        if ( m0 > 1 )
        {
            vc_10 = vfmaq_f64( vc_10, va_1, vb_0 );
            vc_11 = vfmaq_f64( vc_11, va_1, vb_1 );
            vc_12 = vfmaq_f64( vc_12, va_1, vb_2 );
            vc_13 = vfmaq_f64( vc_13, va_1, vb_3 );
        }
        if ( m0 > 2 )
        {
            vc_20 = vfmaq_f64( vc_20, va_2, vb_0 );
            vc_21 = vfmaq_f64( vc_21, va_2, vb_1 );
            vc_22 = vfmaq_f64( vc_22, va_2, vb_2 );
            vc_23 = vfmaq_f64( vc_23, va_2, vb_3 );
        }
    }

    // Pay no care for O(1) details.
    // The remainder loop only fills lane 0, so lane 1 of every operand must
    // be zero again before it runs.
    va_0 = vzero;
    va_1 = vzero;
    va_2 = vzero;
    vb_0 = vzero;
    vb_1 = vzero;
    vb_2 = vzero;
    vb_3 = vzero;

    // Remainder loop: at most one leftover k-iteration (k0 % 2).
    PRAGMA_NOUNROLL
    for ( ; k_left > 0; --k_left )
    {
        // if ( m0 > 0 )
        va_0 = vld1q_lane_f64( a_loc + rs_a * 0, va_0, 0 );
        if ( m0 > 1 ) va_1 = vld1q_lane_f64( a_loc + rs_a * 1, va_1, 0 );
        if ( m0 > 2 ) va_2 = vld1q_lane_f64( a_loc + rs_a * 2, va_2, 0 );

        // if ( n0 > 0 )
        vb_0 = vld1q_lane_f64( b_loc + cs_b * 0, vb_0, 0 );
        if ( n0 > 1 ) vb_1 = vld1q_lane_f64( b_loc + cs_b * 1, vb_1, 0 );
        if ( n0 > 2 ) vb_2 = vld1q_lane_f64( b_loc + cs_b * 2, vb_2, 0 );
        if ( n0 > 3 ) vb_3 = vld1q_lane_f64( b_loc + cs_b * 3, vb_3, 0 );

        // Rows/columns outside the tile multiply by the zeroed operands.
        vc_00 = vfmaq_f64( vc_00, va_0, vb_0 );
        vc_01 = vfmaq_f64( vc_01, va_0, vb_1 );
        vc_02 = vfmaq_f64( vc_02, va_0, vb_2 );
        vc_03 = vfmaq_f64( vc_03, va_0, vb_3 );
        vc_10 = vfmaq_f64( vc_10, va_1, vb_0 );
        vc_11 = vfmaq_f64( vc_11, va_1, vb_1 );
        vc_12 = vfmaq_f64( vc_12, va_1, vb_2 );
        vc_13 = vfmaq_f64( vc_13, va_1, vb_3 );
        vc_20 = vfmaq_f64( vc_20, va_2, vb_0 );
        vc_21 = vfmaq_f64( vc_21, va_2, vb_1 );
        vc_22 = vfmaq_f64( vc_22, va_2, vb_2 );
        vc_23 = vfmaq_f64( vc_23, va_2, vb_3 );

        a_loc += 1;
        b_loc += 1;
    }

    // Reduce.
    // After this, vc_i0 = { C(i,0), C(i,1) } and vc_i2 = { C(i,2), C(i,3) }.
    vc_00 = vpaddq_f64( vc_00, vc_01 );
    vc_02 = vpaddq_f64( vc_02, vc_03 );
    vc_10 = vpaddq_f64( vc_10, vc_11 );
    vc_12 = vpaddq_f64( vc_12, vc_13 );
    vc_20 = vpaddq_f64( vc_20, vc_21 );
    vc_22 = vpaddq_f64( vc_22, vc_23 );

    // Load alpha and beta.
    va_0 = vld1q_dup_f64( alpha );
    vb_0 = vld1q_dup_f64( beta );

    // Scale.
    vc_00 = vmulq_f64( vc_00, va_0 );
    vc_02 = vmulq_f64( vc_02, va_0 );
    vc_10 = vmulq_f64( vc_10, va_0 );
    vc_12 = vmulq_f64( vc_12, va_0 );
    vc_20 = vmulq_f64( vc_20, va_0 );
    vc_22 = vmulq_f64( vc_22, va_0 );

    if ( cs_c == 1 )
    {
        // Row-storage.

        // if ( m0 > 0 )
        {
            // Row 0: load (full vector, or lane 0 only at an odd edge).
            if      ( n0 > 1 ) va_0 = vld1q_f64     ( c_loc + 0 * rs_c + 0 );
            else if ( n0 > 0 ) va_0 = vld1q_lane_f64( c_loc + 0 * rs_c + 0, va_0, 0 );
            if      ( n0 > 3 ) va_1 = vld1q_f64     ( c_loc + 0 * rs_c + 2 );
            else if ( n0 > 2 ) va_1 = vld1q_lane_f64( c_loc + 0 * rs_c + 2, va_1, 0 );

            // Row 0: add beta * C unless beta is exactly zero.
            if ( !b_iszr )
            {
                vc_00 = vfmaq_f64( vc_00, va_0, vb_0 );
                vc_02 = vfmaq_f64( vc_02, va_1, vb_0 );
            }

            // Row 0: store.
            if      ( n0 > 1 ) vst1q_f64     ( c_loc + 0 * rs_c + 0, vc_00 );
            else if ( n0 > 0 ) vst1q_lane_f64( c_loc + 0 * rs_c + 0, vc_00, 0 );
            if      ( n0 > 3 ) vst1q_f64     ( c_loc + 0 * rs_c + 2, vc_02 );
            else if ( n0 > 2 ) vst1q_lane_f64( c_loc + 0 * rs_c + 2, vc_02, 0 );
        }
        if ( m0 > 1 )
        {
            // Row 1.
            if      ( n0 > 1 ) va_0 = vld1q_f64     ( c_loc + 1 * rs_c + 0 );
            else if ( n0 > 0 ) va_0 = vld1q_lane_f64( c_loc + 1 * rs_c + 0, va_0, 0 );
            if      ( n0 > 3 ) va_1 = vld1q_f64     ( c_loc + 1 * rs_c + 2 );
            else if ( n0 > 2 ) va_1 = vld1q_lane_f64( c_loc + 1 * rs_c + 2, va_1, 0 );

            if ( !b_iszr )
            {
                vc_10 = vfmaq_f64( vc_10, va_0, vb_0 );
                vc_12 = vfmaq_f64( vc_12, va_1, vb_0 );
            }

            if      ( n0 > 1 ) vst1q_f64     ( c_loc + 1 * rs_c + 0, vc_10 );
            else if ( n0 > 0 ) vst1q_lane_f64( c_loc + 1 * rs_c + 0, vc_10, 0 );
            if      ( n0 > 3 ) vst1q_f64     ( c_loc + 1 * rs_c + 2, vc_12 );
            else if ( n0 > 2 ) vst1q_lane_f64( c_loc + 1 * rs_c + 2, vc_12, 0 );
        }
        if ( m0 > 2 )
        {
            // Row 2.
            if      ( n0 > 1 ) va_0 = vld1q_f64     ( c_loc + 2 * rs_c + 0 );
            else if ( n0 > 0 ) va_0 = vld1q_lane_f64( c_loc + 2 * rs_c + 0, va_0, 0 );
            if      ( n0 > 3 ) va_1 = vld1q_f64     ( c_loc + 2 * rs_c + 2 );
            else if ( n0 > 2 ) va_1 = vld1q_lane_f64( c_loc + 2 * rs_c + 2, va_1, 0 );

            if ( !b_iszr )
            {
                vc_20 = vfmaq_f64( vc_20, va_0, vb_0 );
                vc_22 = vfmaq_f64( vc_22, va_1, vb_0 );
            }

            if      ( n0 > 1 ) vst1q_f64     ( c_loc + 2 * rs_c + 0, vc_20 );
            else if ( n0 > 0 ) vst1q_lane_f64( c_loc + 2 * rs_c + 0, vc_20, 0 );
            if      ( n0 > 3 ) vst1q_f64     ( c_loc + 2 * rs_c + 2, vc_22 );
            else if ( n0 > 2 ) vst1q_lane_f64( c_loc + 2 * rs_c + 2, vc_22, 0 );
        }
    }
    else
    {
        // Column-storage.
        // Each accumulator holds two elements of one row that live in two
        // different columns, so C is gathered/scattered lane by lane.

        // Columns 0 and 1: gather into lanes 0 and 1 of va_i.
        if ( m0 > 0 ) va_0 = vld1q_lane_f64( c_loc + 0 + 0 * cs_c, va_0, 0 );
        if ( m0 > 1 ) va_1 = vld1q_lane_f64( c_loc + 1 + 0 * cs_c, va_1, 0 );
        if ( m0 > 2 ) va_2 = vld1q_lane_f64( c_loc + 2 + 0 * cs_c, va_2, 0 );
        if ( n0 > 1 )
        {
            if ( m0 > 0 ) va_0 = vld1q_lane_f64( c_loc + 0 + 1 * cs_c, va_0, 1 );
            if ( m0 > 1 ) va_1 = vld1q_lane_f64( c_loc + 1 + 1 * cs_c, va_1, 1 );
            if ( m0 > 2 ) va_2 = vld1q_lane_f64( c_loc + 2 + 1 * cs_c, va_2, 1 );
        }

        if ( !b_iszr )
        {
            vc_00 = vfmaq_f64( vc_00, va_0, vb_0 );
            vc_10 = vfmaq_f64( vc_10, va_1, vb_0 );
            vc_20 = vfmaq_f64( vc_20, va_2, vb_0 );
        }

        // Columns 0 and 1: scatter.
        if ( m0 > 0 ) vst1q_lane_f64( c_loc + 0 + 0 * cs_c, vc_00, 0 );
        if ( m0 > 1 ) vst1q_lane_f64( c_loc + 1 + 0 * cs_c, vc_10, 0 );
        if ( m0 > 2 ) vst1q_lane_f64( c_loc + 2 + 0 * cs_c, vc_20, 0 );
        if ( n0 > 1 )
        {
            if ( m0 > 0 ) vst1q_lane_f64( c_loc + 0 + 1 * cs_c, vc_00, 1 );
            if ( m0 > 1 ) vst1q_lane_f64( c_loc + 1 + 1 * cs_c, vc_10, 1 );
            if ( m0 > 2 ) vst1q_lane_f64( c_loc + 2 + 1 * cs_c, vc_20, 1 );
        }

        // Columns 2 and 3: gather.
        if ( n0 > 2 )
        {
            if ( m0 > 0 ) va_0 = vld1q_lane_f64( c_loc + 0 + 2 * cs_c, va_0, 0 );
            if ( m0 > 1 ) va_1 = vld1q_lane_f64( c_loc + 1 + 2 * cs_c, va_1, 0 );
            if ( m0 > 2 ) va_2 = vld1q_lane_f64( c_loc + 2 + 2 * cs_c, va_2, 0 );
        }
        if ( n0 > 3 )
        {
            if ( m0 > 0 ) va_0 = vld1q_lane_f64( c_loc + 0 + 3 * cs_c, va_0, 1 );
            if ( m0 > 1 ) va_1 = vld1q_lane_f64( c_loc + 1 + 3 * cs_c, va_1, 1 );
            if ( m0 > 2 ) va_2 = vld1q_lane_f64( c_loc + 2 + 3 * cs_c, va_2, 1 );
        }

        if ( !b_iszr )
        {
            vc_02 = vfmaq_f64( vc_02, va_0, vb_0 );
            vc_12 = vfmaq_f64( vc_12, va_1, vb_0 );
            vc_22 = vfmaq_f64( vc_22, va_2, vb_0 );
        }

        // Columns 2 and 3: scatter.
        if ( n0 > 2 )
        {
            if ( m0 > 0 ) vst1q_lane_f64( c_loc + 0 + 2 * cs_c, vc_02, 0 );
            if ( m0 > 1 ) vst1q_lane_f64( c_loc + 1 + 2 * cs_c, vc_12, 0 );
            if ( m0 > 2 ) vst1q_lane_f64( c_loc + 2 + 2 * cs_c, vc_22, 0 );
        }
        if ( n0 > 3 )
        {
            if ( m0 > 0 ) vst1q_lane_f64( c_loc + 0 + 3 * cs_c, vc_02, 1 );
            if ( m0 > 1 ) vst1q_lane_f64( c_loc + 1 + 3 * cs_c, vc_12, 1 );
            if ( m0 > 2 ) vst1q_lane_f64( c_loc + 2 + 3 * cs_c, vc_22, 1 );
        }
    }
}
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Supplimentary dynamic-size gemmsup. #include "blis.h" #include "assert.h" #include #if defined(__clang__) #define PRAGMA_NOUNROLL _Pragma("nounroll") #define PRAGMA_UNROLL _Pragma("unroll") #elif defined(__GNUC__) #define PRAGMA_NOUNROLL _Pragma("GCC unroll 1") #define PRAGMA_UNROLL _Pragma("GCC unroll 2") #else #define PRAGMA_NOUNROLL #define PRAGMA_UNROLL #endif /* * As these kernels requires num. of vregs about half of the total 32, * it should be all right to implement w/ intrinsics. * * c.f. https://www.youtube.com/watch?v=R2hQOVjRwVE . */ void bli_dgemmsup_rv_armv8a_int_3x8mn ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a0, inc_t rs_a, inc_t cs_a, double* restrict b0, inc_t rs_b, inc_t cs_b, double* restrict beta, double* restrict c0, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Unlike the rd case, this rv case does not impose restriction upon // maximal m & n. double *a_loc; double *b_loc, *b_in; double *c_loc, *c_in; dim_t n; dim_t k; uint64_t ps_a = bli_auxinfo_ps_a( data ); uint64_t ps_b = bli_auxinfo_ps_b( data ); uint64_t b_iszr = ( *beta == 0.0 ); assert( cs_b == 1 ); // Registers used to store a 3x8 block of C. 
float64x2_t vc_00, vc_01, vc_02, vc_03; float64x2_t vc_10, vc_11, vc_12, vc_13; float64x2_t vc_20, vc_21, vc_22, vc_23; float64x2_t va_0, va_1; float64x2_t vb_0, vb_1, vb_2, vb_3; PRAGMA_NOUNROLL for ( ; m0 > 0; m0 -= 3 ) { n = n0; b_in = b0; c_in = c0; PRAGMA_NOUNROLL for ( ; n > 0; n -= 8 ) { a_loc = a0; b_loc = b_in; c_loc = c_in; k = k0; vc_00 = (float64x2_t)vdupq_n_f64( 0 ); vc_01 = (float64x2_t)vdupq_n_f64( 0 ); vc_02 = (float64x2_t)vdupq_n_f64( 0 ); vc_03 = (float64x2_t)vdupq_n_f64( 0 ); vc_10 = (float64x2_t)vdupq_n_f64( 0 ); vc_11 = (float64x2_t)vdupq_n_f64( 0 ); vc_12 = (float64x2_t)vdupq_n_f64( 0 ); vc_13 = (float64x2_t)vdupq_n_f64( 0 ); vc_20 = (float64x2_t)vdupq_n_f64( 0 ); vc_21 = (float64x2_t)vdupq_n_f64( 0 ); vc_22 = (float64x2_t)vdupq_n_f64( 0 ); vc_23 = (float64x2_t)vdupq_n_f64( 0 ); PRAGMA_UNROLL for ( ; k > 0; --k ) { // A columns. // if ( m0 > 0 ) va_0 = vld1q_lane_f64( a_loc + rs_a * 0, va_0, 0 ); if ( m0 > 1 ) va_0 = vld1q_lane_f64( a_loc + rs_a * 1, va_0, 1 ); if ( m0 > 2 ) va_1 = vld1q_lane_f64( a_loc + rs_a * 2, va_1, 0 ); // B rows. 
if ( n > 1 ) vb_0 = vld1q_f64 ( b_loc + 0 ); else vb_0 = vld1q_lane_f64( b_loc + 0, vb_0, 0 ); if ( n > 3 ) vb_1 = vld1q_f64 ( b_loc + 2 ); else if ( n > 2 ) vb_1 = vld1q_lane_f64( b_loc + 2, vb_1, 0 ); if ( n > 5 ) vb_2 = vld1q_f64 ( b_loc + 4 ); else if ( n > 4 ) vb_2 = vld1q_lane_f64( b_loc + 4, vb_2, 0 ); if ( n > 7 ) vb_3 = vld1q_f64 ( b_loc + 6 ); else if ( n > 6 ) vb_3 = vld1q_lane_f64( b_loc + 6, vb_3, 0 ); a_loc += cs_a; b_loc += rs_b; // if ( m0 > 0 ) { vc_00 = vfmaq_laneq_f64( vc_00, vb_0, va_0, 0 ); vc_01 = vfmaq_laneq_f64( vc_01, vb_1, va_0, 0 ); vc_02 = vfmaq_laneq_f64( vc_02, vb_2, va_0, 0 ); vc_03 = vfmaq_laneq_f64( vc_03, vb_3, va_0, 0 ); } if ( m0 > 1 ) { vc_10 = vfmaq_laneq_f64( vc_10, vb_0, va_0, 1 ); vc_11 = vfmaq_laneq_f64( vc_11, vb_1, va_0, 1 ); vc_12 = vfmaq_laneq_f64( vc_12, vb_2, va_0, 1 ); vc_13 = vfmaq_laneq_f64( vc_13, vb_3, va_0, 1 ); } if ( m0 > 2 ) { vc_20 = vfmaq_laneq_f64( vc_20, vb_0, va_1, 0 ); vc_21 = vfmaq_laneq_f64( vc_21, vb_1, va_1, 0 ); vc_22 = vfmaq_laneq_f64( vc_22, vb_2, va_1, 0 ); vc_23 = vfmaq_laneq_f64( vc_23, vb_3, va_1, 0 ); } } // Load alpha and beta. // Note that here vb is used for alpha, in contrast to other kernels. vb_0 = vld1q_dup_f64( alpha ); va_0 = vld1q_dup_f64( beta ); // Scale. vc_00 = vmulq_f64( vc_00, vb_0 ); vc_01 = vmulq_f64( vc_01, vb_0 ); vc_02 = vmulq_f64( vc_02, vb_0 ); vc_03 = vmulq_f64( vc_03, vb_0 ); vc_10 = vmulq_f64( vc_10, vb_0 ); vc_11 = vmulq_f64( vc_11, vb_0 ); vc_12 = vmulq_f64( vc_12, vb_0 ); vc_13 = vmulq_f64( vc_13, vb_0 ); vc_20 = vmulq_f64( vc_20, vb_0 ); vc_21 = vmulq_f64( vc_21, vb_0 ); vc_22 = vmulq_f64( vc_22, vb_0 ); vc_23 = vmulq_f64( vc_23, vb_0 ); if ( cs_c == 1 ) { // Store in rows. // // if ( m0 > 0 ) { // Load. 
if ( n > 1 ) vb_0 = vld1q_f64 ( c_loc + 0 * rs_c + 0 ); else vb_0 = vld1q_lane_f64( c_loc + 0 * rs_c + 0, vb_0, 0 ); if ( n > 3 ) vb_1 = vld1q_f64 ( c_loc + 0 * rs_c + 2 ); else if ( n > 2 ) vb_1 = vld1q_lane_f64( c_loc + 0 * rs_c + 2, vb_1, 0 ); if ( n > 5 ) vb_2 = vld1q_f64 ( c_loc + 0 * rs_c + 4 ); else if ( n > 4 ) vb_2 = vld1q_lane_f64( c_loc + 0 * rs_c + 4, vb_2, 0 ); if ( n > 7 ) vb_3 = vld1q_f64 ( c_loc + 0 * rs_c + 6 ); else if ( n > 6 ) vb_3 = vld1q_lane_f64( c_loc + 0 * rs_c + 6, vb_3, 0 ); // Scale. if ( !b_iszr ) { vc_00 = vfmaq_f64( vc_00, vb_0, va_0 ); vc_01 = vfmaq_f64( vc_01, vb_1, va_0 ); vc_02 = vfmaq_f64( vc_02, vb_2, va_0 ); vc_03 = vfmaq_f64( vc_03, vb_3, va_0 ); } // Store. if ( n > 1 ) vst1q_f64 ( c_loc + 0 * rs_c + 0, vc_00 ); else vst1q_lane_f64( c_loc + 0 * rs_c + 0, vc_00, 0 ); if ( n > 3 ) vst1q_f64 ( c_loc + 0 * rs_c + 2, vc_01 ); else if ( n > 2 ) vst1q_lane_f64( c_loc + 0 * rs_c + 2, vc_01, 0 ); if ( n > 5 ) vst1q_f64 ( c_loc + 0 * rs_c + 4, vc_02 ); else if ( n > 4 ) vst1q_lane_f64( c_loc + 0 * rs_c + 4, vc_02, 0 ); if ( n > 7 ) vst1q_f64 ( c_loc + 0 * rs_c + 6, vc_03 ); else if ( n > 6 ) vst1q_lane_f64( c_loc + 0 * rs_c + 6, vc_03, 0 ); } if ( m0 > 1 ) { // Load. if ( n > 1 ) vb_0 = vld1q_f64 ( c_loc + 1 * rs_c + 0 ); else vb_0 = vld1q_lane_f64( c_loc + 1 * rs_c + 0, vb_0, 0 ); if ( n > 3 ) vb_1 = vld1q_f64 ( c_loc + 1 * rs_c + 2 ); else if ( n > 2 ) vb_1 = vld1q_lane_f64( c_loc + 1 * rs_c + 2, vb_1, 0 ); if ( n > 5 ) vb_2 = vld1q_f64 ( c_loc + 1 * rs_c + 4 ); else if ( n > 4 ) vb_2 = vld1q_lane_f64( c_loc + 1 * rs_c + 4, vb_2, 0 ); if ( n > 7 ) vb_3 = vld1q_f64 ( c_loc + 1 * rs_c + 6 ); else if ( n > 6 ) vb_3 = vld1q_lane_f64( c_loc + 1 * rs_c + 6, vb_3, 0 ); // Scale. if ( !b_iszr ) { vc_10 = vfmaq_f64( vc_10, vb_0, va_0 ); vc_11 = vfmaq_f64( vc_11, vb_1, va_0 ); vc_12 = vfmaq_f64( vc_12, vb_2, va_0 ); vc_13 = vfmaq_f64( vc_13, vb_3, va_0 ); } // Store. 
if ( n > 1 ) vst1q_f64 ( c_loc + 1 * rs_c + 0, vc_10 ); else vst1q_lane_f64( c_loc + 1 * rs_c + 0, vc_10, 0 ); if ( n > 3 ) vst1q_f64 ( c_loc + 1 * rs_c + 2, vc_11 ); else if ( n > 2 ) vst1q_lane_f64( c_loc + 1 * rs_c + 2, vc_11, 0 ); if ( n > 5 ) vst1q_f64 ( c_loc + 1 * rs_c + 4, vc_12 ); else if ( n > 4 ) vst1q_lane_f64( c_loc + 1 * rs_c + 4, vc_12, 0 ); if ( n > 7 ) vst1q_f64 ( c_loc + 1 * rs_c + 6, vc_13 ); else if ( n > 6 ) vst1q_lane_f64( c_loc + 1 * rs_c + 6, vc_13, 0 ); } if ( m0 > 2 ) { // Load. if ( n > 1 ) vb_0 = vld1q_f64 ( c_loc + 2 * rs_c + 0 ); else vb_0 = vld1q_lane_f64( c_loc + 2 * rs_c + 0, vb_0, 0 ); if ( n > 3 ) vb_1 = vld1q_f64 ( c_loc + 2 * rs_c + 2 ); else if ( n > 2 ) vb_1 = vld1q_lane_f64( c_loc + 2 * rs_c + 2, vb_1, 0 ); if ( n > 5 ) vb_2 = vld1q_f64 ( c_loc + 2 * rs_c + 4 ); else if ( n > 4 ) vb_2 = vld1q_lane_f64( c_loc + 2 * rs_c + 4, vb_2, 0 ); if ( n > 7 ) vb_3 = vld1q_f64 ( c_loc + 2 * rs_c + 6 ); else if ( n > 6 ) vb_3 = vld1q_lane_f64( c_loc + 2 * rs_c + 6, vb_3, 0 ); // Scale. if ( !b_iszr ) { vc_20 = vfmaq_f64( vc_20, vb_0, va_0 ); vc_21 = vfmaq_f64( vc_21, vb_1, va_0 ); vc_22 = vfmaq_f64( vc_22, vb_2, va_0 ); vc_23 = vfmaq_f64( vc_23, vb_3, va_0 ); } // Store. if ( n > 1 ) vst1q_f64 ( c_loc + 2 * rs_c + 0, vc_20 ); else vst1q_lane_f64( c_loc + 2 * rs_c + 0, vc_20, 0 ); if ( n > 3 ) vst1q_f64 ( c_loc + 2 * rs_c + 2, vc_21 ); else if ( n > 2 ) vst1q_lane_f64( c_loc + 2 * rs_c + 2, vc_21, 0 ); if ( n > 5 ) vst1q_f64 ( c_loc + 2 * rs_c + 4, vc_22 ); else if ( n > 4 ) vst1q_lane_f64( c_loc + 2 * rs_c + 4, vc_22, 0 ); if ( n > 7 ) vst1q_f64 ( c_loc + 2 * rs_c + 6, vc_23 ); else if ( n > 6 ) vst1q_lane_f64( c_loc + 2 * rs_c + 6, vc_23, 0 ); } } else { // Store in columns. // No in-reg transpose here. // // if ( m0 > 0 ) { // Load. 
if ( n > 0 ) vb_0 = vld1q_lane_f64( c_loc + 0 + 0 * cs_c, vb_0, 0 ); if ( n > 1 ) vb_0 = vld1q_lane_f64( c_loc + 0 + 1 * cs_c, vb_0, 1 ); if ( n > 2 ) vb_1 = vld1q_lane_f64( c_loc + 0 + 2 * cs_c, vb_1, 0 ); if ( n > 3 ) vb_1 = vld1q_lane_f64( c_loc + 0 + 3 * cs_c, vb_1, 1 ); if ( n > 4 ) vb_2 = vld1q_lane_f64( c_loc + 0 + 4 * cs_c, vb_2, 0 ); if ( n > 5 ) vb_2 = vld1q_lane_f64( c_loc + 0 + 5 * cs_c, vb_2, 1 ); if ( n > 6 ) vb_3 = vld1q_lane_f64( c_loc + 0 + 6 * cs_c, vb_3, 0 ); if ( n > 7 ) vb_3 = vld1q_lane_f64( c_loc + 0 + 7 * cs_c, vb_3, 1 ); // Scale. if ( !b_iszr ) { vc_00 = vfmaq_f64( vc_00, vb_0, va_0 ); vc_01 = vfmaq_f64( vc_01, vb_1, va_0 ); vc_02 = vfmaq_f64( vc_02, vb_2, va_0 ); vc_03 = vfmaq_f64( vc_03, vb_3, va_0 ); } // Store. if ( n > 0 ) vst1q_lane_f64( c_loc + 0 + 0 * cs_c, vc_00, 0 ); if ( n > 1 ) vst1q_lane_f64( c_loc + 0 + 1 * cs_c, vc_00, 1 ); if ( n > 2 ) vst1q_lane_f64( c_loc + 0 + 2 * cs_c, vc_01, 0 ); if ( n > 3 ) vst1q_lane_f64( c_loc + 0 + 3 * cs_c, vc_01, 1 ); if ( n > 4 ) vst1q_lane_f64( c_loc + 0 + 4 * cs_c, vc_02, 0 ); if ( n > 5 ) vst1q_lane_f64( c_loc + 0 + 5 * cs_c, vc_02, 1 ); if ( n > 6 ) vst1q_lane_f64( c_loc + 0 + 6 * cs_c, vc_03, 0 ); if ( n > 7 ) vst1q_lane_f64( c_loc + 0 + 7 * cs_c, vc_03, 1 ); } if ( m0 > 1 ) { // Load. if ( n > 0 ) vb_0 = vld1q_lane_f64( c_loc + 1 + 0 * cs_c, vb_0, 0 ); if ( n > 1 ) vb_0 = vld1q_lane_f64( c_loc + 1 + 1 * cs_c, vb_0, 1 ); if ( n > 2 ) vb_1 = vld1q_lane_f64( c_loc + 1 + 2 * cs_c, vb_1, 0 ); if ( n > 3 ) vb_1 = vld1q_lane_f64( c_loc + 1 + 3 * cs_c, vb_1, 1 ); if ( n > 4 ) vb_2 = vld1q_lane_f64( c_loc + 1 + 4 * cs_c, vb_2, 0 ); if ( n > 5 ) vb_2 = vld1q_lane_f64( c_loc + 1 + 5 * cs_c, vb_2, 1 ); if ( n > 6 ) vb_3 = vld1q_lane_f64( c_loc + 1 + 6 * cs_c, vb_3, 0 ); if ( n > 7 ) vb_3 = vld1q_lane_f64( c_loc + 1 + 7 * cs_c, vb_3, 1 ); // Scale. 
if ( !b_iszr ) { vc_10 = vfmaq_f64( vc_10, vb_0, va_0 ); vc_11 = vfmaq_f64( vc_11, vb_1, va_0 ); vc_12 = vfmaq_f64( vc_12, vb_2, va_0 ); vc_13 = vfmaq_f64( vc_13, vb_3, va_0 ); } // Store. if ( n > 0 ) vst1q_lane_f64( c_loc + 1 + 0 * cs_c, vc_10, 0 ); if ( n > 1 ) vst1q_lane_f64( c_loc + 1 + 1 * cs_c, vc_10, 1 ); if ( n > 2 ) vst1q_lane_f64( c_loc + 1 + 2 * cs_c, vc_11, 0 ); if ( n > 3 ) vst1q_lane_f64( c_loc + 1 + 3 * cs_c, vc_11, 1 ); if ( n > 4 ) vst1q_lane_f64( c_loc + 1 + 4 * cs_c, vc_12, 0 ); if ( n > 5 ) vst1q_lane_f64( c_loc + 1 + 5 * cs_c, vc_12, 1 ); if ( n > 6 ) vst1q_lane_f64( c_loc + 1 + 6 * cs_c, vc_13, 0 ); if ( n > 7 ) vst1q_lane_f64( c_loc + 1 + 7 * cs_c, vc_13, 1 ); } if ( m0 > 2 ) { // Load. if ( n > 0 ) vb_0 = vld1q_lane_f64( c_loc + 2 + 0 * cs_c, vb_0, 0 ); if ( n > 1 ) vb_0 = vld1q_lane_f64( c_loc + 2 + 1 * cs_c, vb_0, 1 ); if ( n > 2 ) vb_1 = vld1q_lane_f64( c_loc + 2 + 2 * cs_c, vb_1, 0 ); if ( n > 3 ) vb_1 = vld1q_lane_f64( c_loc + 2 + 3 * cs_c, vb_1, 1 ); if ( n > 4 ) vb_2 = vld1q_lane_f64( c_loc + 2 + 4 * cs_c, vb_2, 0 ); if ( n > 5 ) vb_2 = vld1q_lane_f64( c_loc + 2 + 5 * cs_c, vb_2, 1 ); if ( n > 6 ) vb_3 = vld1q_lane_f64( c_loc + 2 + 6 * cs_c, vb_3, 0 ); if ( n > 7 ) vb_3 = vld1q_lane_f64( c_loc + 2 + 7 * cs_c, vb_3, 1 ); // Scale. if ( !b_iszr ) { vc_20 = vfmaq_f64( vc_20, vb_0, va_0 ); vc_21 = vfmaq_f64( vc_21, vb_1, va_0 ); vc_22 = vfmaq_f64( vc_22, vb_2, va_0 ); vc_23 = vfmaq_f64( vc_23, vb_3, va_0 ); } // Store. 
if ( n > 0 ) vst1q_lane_f64( c_loc + 2 + 0 * cs_c, vc_20, 0 ); if ( n > 1 ) vst1q_lane_f64( c_loc + 2 + 1 * cs_c, vc_20, 1 ); if ( n > 2 ) vst1q_lane_f64( c_loc + 2 + 2 * cs_c, vc_21, 0 ); if ( n > 3 ) vst1q_lane_f64( c_loc + 2 + 3 * cs_c, vc_21, 1 ); if ( n > 4 ) vst1q_lane_f64( c_loc + 2 + 4 * cs_c, vc_22, 0 ); if ( n > 5 ) vst1q_lane_f64( c_loc + 2 + 5 * cs_c, vc_22, 1 ); if ( n > 6 ) vst1q_lane_f64( c_loc + 2 + 6 * cs_c, vc_23, 0 ); if ( n > 7 ) vst1q_lane_f64( c_loc + 2 + 7 * cs_c, vc_23, 1 ); } } b_in += ps_b; c_in += 8 * cs_c; } a0 += ps_a; c0 += 3 * rs_c; } } cython-blis-0.9.1/blis/_src/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c000066400000000000000000000447151427272030600307560ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2021, The University of Tokyo Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // Supplimentary dynamic-size gemmsup. #include "blis.h" #include "assert.h" #include #if defined(__clang__) #define PRAGMA_NOUNROLL _Pragma("nounroll") #define PRAGMA_UNROLL _Pragma("unroll") #elif defined(__GNUC__) #define PRAGMA_NOUNROLL _Pragma("GCC unroll 1") #define PRAGMA_UNROLL _Pragma("GCC unroll 2") #else #define PRAGMA_NOUNROLL #define PRAGMA_UNROLL #endif /* * As these kernels requires num. of vregs about half of the total 32, * it should be all right to implement w/ intrinsics. * * c.f. https://www.youtube.com/watch?v=R2hQOVjRwVE . */ void bli_dgemmsup_rv_armv8a_int_6x4mn ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a0, inc_t rs_a, inc_t cs_a, double* restrict b0, inc_t rs_b, inc_t cs_b, double* restrict beta, double* restrict c0, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Unlike the rd case, this rv case does not impose restriction upon // maximal m & n. double *a_loc; double *b_loc, *b_in; double *c_loc, *c_in; dim_t n; dim_t k; uint64_t ps_a = bli_auxinfo_ps_a( data ); uint64_t ps_b = bli_auxinfo_ps_b( data ); uint64_t b_iszr = ( *beta == 0.0 ); assert( cs_b == 1 ); // Registers used to store a 6x4 block of C. 
float64x2_t vc_00, vc_01; float64x2_t vc_10, vc_11; float64x2_t vc_20, vc_21; float64x2_t vc_30, vc_31; float64x2_t vc_40, vc_41; float64x2_t vc_50, vc_51; float64x2_t va_0, va_1, va_2; float64x2_t vb_0, vb_1; PRAGMA_NOUNROLL for ( ; m0 > 0; m0 -= 6 ) { n = n0; b_in = b0; c_in = c0; PRAGMA_NOUNROLL for ( ; n > 0; n -= 4 ) { a_loc = a0; b_loc = b_in; c_loc = c_in; k = k0; vc_00 = (float64x2_t)vdupq_n_f64( 0 ); vc_01 = (float64x2_t)vdupq_n_f64( 0 ); vc_10 = (float64x2_t)vdupq_n_f64( 0 ); vc_11 = (float64x2_t)vdupq_n_f64( 0 ); vc_20 = (float64x2_t)vdupq_n_f64( 0 ); vc_21 = (float64x2_t)vdupq_n_f64( 0 ); vc_30 = (float64x2_t)vdupq_n_f64( 0 ); vc_31 = (float64x2_t)vdupq_n_f64( 0 ); vc_40 = (float64x2_t)vdupq_n_f64( 0 ); vc_41 = (float64x2_t)vdupq_n_f64( 0 ); vc_50 = (float64x2_t)vdupq_n_f64( 0 ); vc_51 = (float64x2_t)vdupq_n_f64( 0 ); PRAGMA_UNROLL for ( ; k > 0; --k ) { // A columns. // if ( m0 > 0 ) va_0 = vld1q_lane_f64( a_loc + rs_a * 0, va_0, 0 ); if ( m0 > 1 ) va_0 = vld1q_lane_f64( a_loc + rs_a * 1, va_0, 1 ); if ( m0 > 2 ) va_1 = vld1q_lane_f64( a_loc + rs_a * 2, va_1, 0 ); if ( m0 > 3 ) va_1 = vld1q_lane_f64( a_loc + rs_a * 3, va_1, 1 ); if ( m0 > 4 ) va_2 = vld1q_lane_f64( a_loc + rs_a * 4, va_2, 0 ); if ( m0 > 5 ) va_2 = vld1q_lane_f64( a_loc + rs_a * 5, va_2, 1 ); // B rows. if ( n > 1 ) vb_0 = vld1q_f64 ( b_loc + 0 ); else vb_0 = vld1q_lane_f64( b_loc + 0, vb_0, 0 ); if ( n > 3 ) vb_1 = vld1q_f64 ( b_loc + 2 ); else if ( n > 2 ) vb_1 = vld1q_lane_f64( b_loc + 2, vb_1, 0 ); a_loc += cs_a; b_loc += rs_b; // One or two-column case. if ( n <= 2 ) { // if ( m0 > 0 ) { vc_00 = vfmaq_laneq_f64( vc_00, vb_0, va_0, 0 ); vc_10 = vfmaq_laneq_f64( vc_10, vb_0, va_0, 1 ); vc_20 = vfmaq_laneq_f64( vc_20, vb_0, va_1, 0 ); } if ( m0 > 3 ) { vc_30 = vfmaq_laneq_f64( vc_30, vb_0, va_1, 1 ); vc_40 = vfmaq_laneq_f64( vc_40, vb_0, va_2, 0 ); vc_50 = vfmaq_laneq_f64( vc_50, vb_0, va_2, 1 ); } continue; } // Three or four-column case. Moderately decrease num. 
of FMLA instructions // according to m and n. // if ( m0 > 0 ) { vc_00 = vfmaq_laneq_f64( vc_00, vb_0, va_0, 0 ); vc_01 = vfmaq_laneq_f64( vc_01, vb_1, va_0, 0 ); vc_10 = vfmaq_laneq_f64( vc_10, vb_0, va_0, 1 ); vc_11 = vfmaq_laneq_f64( vc_11, vb_1, va_0, 1 ); } if ( m0 > 2 ) { vc_20 = vfmaq_laneq_f64( vc_20, vb_0, va_1, 0 ); vc_21 = vfmaq_laneq_f64( vc_21, vb_1, va_1, 0 ); vc_30 = vfmaq_laneq_f64( vc_30, vb_0, va_1, 1 ); vc_31 = vfmaq_laneq_f64( vc_31, vb_1, va_1, 1 ); } if ( m0 > 4 ) { vc_40 = vfmaq_laneq_f64( vc_40, vb_0, va_2, 0 ); vc_41 = vfmaq_laneq_f64( vc_41, vb_1, va_2, 0 ); vc_50 = vfmaq_laneq_f64( vc_50, vb_0, va_2, 1 ); vc_51 = vfmaq_laneq_f64( vc_51, vb_1, va_2, 1 ); } } // Load alpha and beta. va_0 = vld1q_dup_f64( alpha ); vb_0 = vld1q_dup_f64( beta ); // Scale. vc_00 = vmulq_f64( vc_00, va_0 ); vc_01 = vmulq_f64( vc_01, va_0 ); vc_10 = vmulq_f64( vc_10, va_0 ); vc_11 = vmulq_f64( vc_11, va_0 ); vc_20 = vmulq_f64( vc_20, va_0 ); vc_21 = vmulq_f64( vc_21, va_0 ); vc_30 = vmulq_f64( vc_30, va_0 ); vc_31 = vmulq_f64( vc_31, va_0 ); vc_40 = vmulq_f64( vc_40, va_0 ); vc_41 = vmulq_f64( vc_41, va_0 ); vc_50 = vmulq_f64( vc_50, va_0 ); vc_51 = vmulq_f64( vc_51, va_0 ); if ( cs_c == 1 ) { // Store in rows. // if ( m0 > 0 ) { // Load. if ( n > 1 ) va_0 = vld1q_f64 ( c_loc + 0 * rs_c + 0 ); else va_0 = vld1q_lane_f64( c_loc + 0 * rs_c + 0, va_0, 0 ); if ( n > 3 ) va_1 = vld1q_f64 ( c_loc + 0 * rs_c + 2 ); else if ( n > 2 ) va_1 = vld1q_lane_f64( c_loc + 0 * rs_c + 2, va_1, 0 ); // Scale. if ( !b_iszr ) { vc_00 = vfmaq_f64( vc_00, va_0, vb_0 ); vc_01 = vfmaq_f64( vc_01, va_1, vb_0 ); } // Store. if ( n > 1 ) vst1q_f64 ( c_loc + 0 * rs_c + 0, vc_00 ); else vst1q_lane_f64( c_loc + 0 * rs_c + 0, vc_00, 0 ); if ( n > 3 ) vst1q_f64 ( c_loc + 0 * rs_c + 2, vc_01 ); else if ( n > 2 ) vst1q_lane_f64( c_loc + 0 * rs_c + 2, vc_01, 0 ); } if ( m0 > 1 ) { // Load. 
if ( n > 1 ) va_0 = vld1q_f64 ( c_loc + 1 * rs_c + 0 ); else va_0 = vld1q_lane_f64( c_loc + 1 * rs_c + 0, va_0, 0 ); if ( n > 3 ) va_1 = vld1q_f64 ( c_loc + 1 * rs_c + 2 ); else if ( n > 2 ) va_1 = vld1q_lane_f64( c_loc + 1 * rs_c + 2, va_1, 0 ); // Scale. if ( !b_iszr ) { vc_10 = vfmaq_f64( vc_10, va_0, vb_0 ); vc_11 = vfmaq_f64( vc_11, va_1, vb_0 ); } // Store. if ( n > 1 ) vst1q_f64 ( c_loc + 1 * rs_c + 0, vc_10 ); else vst1q_lane_f64( c_loc + 1 * rs_c + 0, vc_10, 0 ); if ( n > 3 ) vst1q_f64 ( c_loc + 1 * rs_c + 2, vc_11 ); else if ( n > 2 ) vst1q_lane_f64( c_loc + 1 * rs_c + 2, vc_11, 0 ); } if ( m0 > 2 ) { // Load. if ( n > 1 ) va_0 = vld1q_f64 ( c_loc + 2 * rs_c + 0 ); else va_0 = vld1q_lane_f64( c_loc + 2 * rs_c + 0, va_0, 0 ); if ( n > 3 ) va_1 = vld1q_f64 ( c_loc + 2 * rs_c + 2 ); else if ( n > 2 ) va_1 = vld1q_lane_f64( c_loc + 2 * rs_c + 2, va_1, 0 ); // Scale. if ( !b_iszr ) { vc_20 = vfmaq_f64( vc_20, va_0, vb_0 ); vc_21 = vfmaq_f64( vc_21, va_1, vb_0 ); } // Store. if ( n > 1 ) vst1q_f64 ( c_loc + 2 * rs_c + 0, vc_20 ); else vst1q_lane_f64( c_loc + 2 * rs_c + 0, vc_20, 0 ); if ( n > 3 ) vst1q_f64 ( c_loc + 2 * rs_c + 2, vc_21 ); else if ( n > 2 ) vst1q_lane_f64( c_loc + 2 * rs_c + 2, vc_21, 0 ); } if ( m0 > 3 ) { // Load. if ( n > 1 ) va_0 = vld1q_f64 ( c_loc + 3 * rs_c + 0 ); else va_0 = vld1q_lane_f64( c_loc + 3 * rs_c + 0, va_0, 0 ); if ( n > 3 ) va_1 = vld1q_f64 ( c_loc + 3 * rs_c + 2 ); else if ( n > 2 ) va_1 = vld1q_lane_f64( c_loc + 3 * rs_c + 2, va_1, 0 ); // Scale. if ( !b_iszr ) { vc_30 = vfmaq_f64( vc_30, va_0, vb_0 ); vc_31 = vfmaq_f64( vc_31, va_1, vb_0 ); } // Store. if ( n > 1 ) vst1q_f64 ( c_loc + 3 * rs_c + 0, vc_30 ); else vst1q_lane_f64( c_loc + 3 * rs_c + 0, vc_30, 0 ); if ( n > 3 ) vst1q_f64 ( c_loc + 3 * rs_c + 2, vc_31 ); else if ( n > 2 ) vst1q_lane_f64( c_loc + 3 * rs_c + 2, vc_31, 0 ); } if ( m0 > 4 ) { // Load. 
if ( n > 1 ) va_0 = vld1q_f64 ( c_loc + 4 * rs_c + 0 ); else va_0 = vld1q_lane_f64( c_loc + 4 * rs_c + 0, va_0, 0 ); if ( n > 3 ) va_1 = vld1q_f64 ( c_loc + 4 * rs_c + 2 ); else if ( n > 2 ) va_1 = vld1q_lane_f64( c_loc + 4 * rs_c + 2, va_1, 0 ); // Scale. if ( !b_iszr ) { vc_40 = vfmaq_f64( vc_40, va_0, vb_0 ); vc_41 = vfmaq_f64( vc_41, va_1, vb_0 ); } // Store. if ( n > 1 ) vst1q_f64 ( c_loc + 4 * rs_c + 0, vc_40 ); else vst1q_lane_f64( c_loc + 4 * rs_c + 0, vc_40, 0 ); if ( n > 3 ) vst1q_f64 ( c_loc + 4 * rs_c + 2, vc_41 ); else if ( n > 2 ) vst1q_lane_f64( c_loc + 4 * rs_c + 2, vc_41, 0 ); } if ( m0 > 5 ) { // Load. if ( n > 1 ) va_0 = vld1q_f64 ( c_loc + 5 * rs_c + 0 ); else va_0 = vld1q_lane_f64( c_loc + 5 * rs_c + 0, va_0, 0 ); if ( n > 3 ) va_1 = vld1q_f64 ( c_loc + 5 * rs_c + 2 ); else if ( n > 2 ) va_1 = vld1q_lane_f64( c_loc + 5 * rs_c + 2, va_1, 0 ); // Scale. if ( !b_iszr ) { vc_50 = vfmaq_f64( vc_50, va_0, vb_0 ); vc_51 = vfmaq_f64( vc_51, va_1, vb_0 ); } // Store. if ( n > 1 ) vst1q_f64 ( c_loc + 5 * rs_c + 0, vc_50 ); else vst1q_lane_f64( c_loc + 5 * rs_c + 0, vc_50, 0 ); if ( n > 3 ) vst1q_f64 ( c_loc + 5 * rs_c + 2, vc_51 ); else if ( n > 2 ) vst1q_lane_f64( c_loc + 5 * rs_c + 2, vc_51, 0 ); } } else { // Store in columns. // Rename some vectors. 
#define VCOL0 va_0 #define VCOL1 va_1 #define VCOL2 va_2 #define VCOL3 vb_1 #define VTMP0 vc_00 #define VTMP1 vc_01 #define VTMP2 vc_10 #define VTMP3 vc_11 // if ( m0 > 0 ) { VCOL0 = vtrn1q_f64(vc_00, vc_10); VCOL1 = vtrn2q_f64(vc_00, vc_10); VCOL2 = vtrn1q_f64(vc_01, vc_11); VCOL3 = vtrn2q_f64(vc_01, vc_11); if ( m0 > 1 ) { if ( n > 0 ) VTMP0 = vld1q_f64( c_loc + 0 * cs_c + 0 ); if ( n > 1 ) VTMP1 = vld1q_f64( c_loc + 1 * cs_c + 0 ); if ( n > 2 ) VTMP2 = vld1q_f64( c_loc + 2 * cs_c + 0 ); if ( n > 3 ) VTMP3 = vld1q_f64( c_loc + 3 * cs_c + 0 ); if ( !b_iszr ) { VCOL0 = vfmaq_f64( VCOL0, VTMP0, vb_0 ); VCOL1 = vfmaq_f64( VCOL1, VTMP1, vb_0 ); VCOL2 = vfmaq_f64( VCOL2, VTMP2, vb_0 ); VCOL3 = vfmaq_f64( VCOL3, VTMP3, vb_0 ); } if ( n > 0 ) vst1q_f64( c_loc + 0 * cs_c + 0, VCOL0 ); if ( n > 1 ) vst1q_f64( c_loc + 1 * cs_c + 0, VCOL1 ); if ( n > 2 ) vst1q_f64( c_loc + 2 * cs_c + 0, VCOL2 ); if ( n > 3 ) vst1q_f64( c_loc + 3 * cs_c + 0, VCOL3 ); } else { if ( n > 0 ) VTMP0 = vld1q_lane_f64( c_loc + 0 * cs_c + 0, VTMP0, 0 ); if ( n > 1 ) VTMP1 = vld1q_lane_f64( c_loc + 1 * cs_c + 0, VTMP1, 0 ); if ( n > 2 ) VTMP2 = vld1q_lane_f64( c_loc + 2 * cs_c + 0, VTMP2, 0 ); if ( n > 3 ) VTMP3 = vld1q_lane_f64( c_loc + 3 * cs_c + 0, VTMP3, 0 ); if ( !b_iszr ) { VCOL0 = vfmaq_f64( VCOL0, VTMP0, vb_0 ); VCOL1 = vfmaq_f64( VCOL1, VTMP1, vb_0 ); VCOL2 = vfmaq_f64( VCOL2, VTMP2, vb_0 ); VCOL3 = vfmaq_f64( VCOL3, VTMP3, vb_0 ); } if ( n > 0 ) vst1q_lane_f64( c_loc + 0 * cs_c + 0, VCOL0, 0 ); if ( n > 1 ) vst1q_lane_f64( c_loc + 1 * cs_c + 0, VCOL1, 0 ); if ( n > 2 ) vst1q_lane_f64( c_loc + 2 * cs_c + 0, VCOL2, 0 ); if ( n > 3 ) vst1q_lane_f64( c_loc + 3 * cs_c + 0, VCOL3, 0 ); } } if ( m0 > 2 ) { VCOL0 = vtrn1q_f64(vc_20, vc_30); VCOL1 = vtrn2q_f64(vc_20, vc_30); VCOL2 = vtrn1q_f64(vc_21, vc_31); VCOL3 = vtrn2q_f64(vc_21, vc_31); if ( m0 > 3 ) { if ( n > 0 ) VTMP0 = vld1q_f64( c_loc + 0 * cs_c + 2 ); if ( n > 1 ) VTMP1 = vld1q_f64( c_loc + 1 * cs_c + 2 ); if ( n > 2 ) VTMP2 = vld1q_f64( 
c_loc + 2 * cs_c + 2 ); if ( n > 3 ) VTMP3 = vld1q_f64( c_loc + 3 * cs_c + 2 ); if ( !b_iszr ) { VCOL0 = vfmaq_f64( VCOL0, VTMP0, vb_0 ); VCOL1 = vfmaq_f64( VCOL1, VTMP1, vb_0 ); VCOL2 = vfmaq_f64( VCOL2, VTMP2, vb_0 ); VCOL3 = vfmaq_f64( VCOL3, VTMP3, vb_0 ); } if ( n > 0 ) vst1q_f64( c_loc + 0 * cs_c + 2, VCOL0 ); if ( n > 1 ) vst1q_f64( c_loc + 1 * cs_c + 2, VCOL1 ); if ( n > 2 ) vst1q_f64( c_loc + 2 * cs_c + 2, VCOL2 ); if ( n > 3 ) vst1q_f64( c_loc + 3 * cs_c + 2, VCOL3 ); } else { if ( n > 0 ) VTMP0 = vld1q_lane_f64( c_loc + 0 * cs_c + 2, VTMP0, 0 ); if ( n > 1 ) VTMP1 = vld1q_lane_f64( c_loc + 1 * cs_c + 2, VTMP1, 0 ); if ( n > 2 ) VTMP2 = vld1q_lane_f64( c_loc + 2 * cs_c + 2, VTMP2, 0 ); if ( n > 3 ) VTMP3 = vld1q_lane_f64( c_loc + 3 * cs_c + 2, VTMP3, 0 ); if ( !b_iszr ) { VCOL0 = vfmaq_f64( VCOL0, VTMP0, vb_0 ); VCOL1 = vfmaq_f64( VCOL1, VTMP1, vb_0 ); VCOL2 = vfmaq_f64( VCOL2, VTMP2, vb_0 ); VCOL3 = vfmaq_f64( VCOL3, VTMP3, vb_0 ); } if ( n > 0 ) vst1q_lane_f64( c_loc + 0 * cs_c + 2, VCOL0, 0 ); if ( n > 1 ) vst1q_lane_f64( c_loc + 1 * cs_c + 2, VCOL1, 0 ); if ( n > 2 ) vst1q_lane_f64( c_loc + 2 * cs_c + 2, VCOL2, 0 ); if ( n > 3 ) vst1q_lane_f64( c_loc + 3 * cs_c + 2, VCOL3, 0 ); } } if ( m0 > 4 ) { VCOL0 = vtrn1q_f64(vc_40, vc_50); VCOL1 = vtrn2q_f64(vc_40, vc_50); VCOL2 = vtrn1q_f64(vc_41, vc_51); VCOL3 = vtrn2q_f64(vc_41, vc_51); if ( m0 > 5 ) { if ( n > 0 ) VTMP0 = vld1q_f64( c_loc + 0 * cs_c + 4 ); if ( n > 1 ) VTMP1 = vld1q_f64( c_loc + 1 * cs_c + 4 ); if ( n > 2 ) VTMP2 = vld1q_f64( c_loc + 2 * cs_c + 4 ); if ( n > 3 ) VTMP3 = vld1q_f64( c_loc + 3 * cs_c + 4 ); if ( !b_iszr ) { VCOL0 = vfmaq_f64( VCOL0, VTMP0, vb_0 ); VCOL1 = vfmaq_f64( VCOL1, VTMP1, vb_0 ); VCOL2 = vfmaq_f64( VCOL2, VTMP2, vb_0 ); VCOL3 = vfmaq_f64( VCOL3, VTMP3, vb_0 ); } if ( n > 0 ) vst1q_f64( c_loc + 0 * cs_c + 4, VCOL0 ); if ( n > 1 ) vst1q_f64( c_loc + 1 * cs_c + 4, VCOL1 ); if ( n > 2 ) vst1q_f64( c_loc + 2 * cs_c + 4, VCOL2 ); if ( n > 3 ) vst1q_f64( c_loc + 3 * cs_c + 
4, VCOL3 ); } else { if ( n > 0 ) VTMP0 = vld1q_lane_f64( c_loc + 0 * cs_c + 4, VTMP0, 0 ); if ( n > 1 ) VTMP1 = vld1q_lane_f64( c_loc + 1 * cs_c + 4, VTMP1, 0 ); if ( n > 2 ) VTMP2 = vld1q_lane_f64( c_loc + 2 * cs_c + 4, VTMP2, 0 ); if ( n > 3 ) VTMP3 = vld1q_lane_f64( c_loc + 3 * cs_c + 4, VTMP3, 0 ); if ( !b_iszr ) { VCOL0 = vfmaq_f64( VCOL0, VTMP0, vb_0 ); VCOL1 = vfmaq_f64( VCOL1, VTMP1, vb_0 ); VCOL2 = vfmaq_f64( VCOL2, VTMP2, vb_0 ); VCOL3 = vfmaq_f64( VCOL3, VTMP3, vb_0 ); } if ( n > 0 ) vst1q_lane_f64( c_loc + 0 * cs_c + 4, VCOL0, 0 ); if ( n > 1 ) vst1q_lane_f64( c_loc + 1 * cs_c + 4, VCOL1, 0 ); if ( n > 2 ) vst1q_lane_f64( c_loc + 2 * cs_c + 4, VCOL2, 0 ); if ( n > 3 ) vst1q_lane_f64( c_loc + 3 * cs_c + 4, VCOL3, 0 ); } } } b_in += ps_b; c_in += 4 * cs_c; } a0 += ps_a; c0 += 6 * rs_c; } } cython-blis-0.9.1/blis/_src/kernels/armv8a/bli_kernels_armv8a.h000066400000000000000000000055561427272030600244320ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// Kernel declarations for the armv8a kernel set. Each entry invokes a
// *_PROT macro with ( C type, type character, kernel name ); the macros are
// defined elsewhere, so the exact prototypes they produce are not visible
// in this header.

// Packing (packm) kernels.
PACKM_KER_PROT( float, s, packm_armv8a_int_8xk )
PACKM_KER_PROT( float, s, packm_armv8a_int_12xk )
PACKM_KER_PROT( double, d, packm_armv8a_int_6xk )
PACKM_KER_PROT( double, d, packm_armv8a_int_8xk )

// GEMM microkernels. The three commented-out entries are currently not
// declared.
GEMM_UKR_PROT( float, s, gemm_armv8a_asm_8x12 )
GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8 )
// GEMM_UKR_PROT( double, d, gemm_armv8a_asm_6x8r )
// GEMM_UKR_PROT( double, d, gemm_armv8a_asm_8x4 )
// GEMM_UKR_PROT( double, d, gemm_armv8a_asm_4x4 )

// gemmsup kernels (double precision only). Names carry an `rd` or `rv`
// variant tag and an `asm` or `int` implementation tag — presumably
// assembly vs. intrinsics; confirm against the kernel sources.
GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x8n )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x8m )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x8n )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_6x8m )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_4x8n )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_4x8m )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_asm_8x4m )

GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_int_2x8 )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_int_3x4 )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_3x4 )
GEMMSUP_KER_PROT( double, d, gemmsup_rd_armv8a_asm_6x3 )

GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_int_6x4mn )
GEMMSUP_KER_PROT( double, d, gemmsup_rv_armv8a_int_3x8mn )

cython-blis-0.9.1/blis/_src/kernels/bgq/000077500000000000000000000000001427272030600200525ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/bgq/1/000077500000000000000000000000001427272030600202125ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/bgq/1/bli_axpyv_bgq_int.c000066400000000000000000000056201427272030600240610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_daxpyv_bgq_int ( conj_t conjx, dim_t n, double* restrict alpha, double* restrict x, inc_t incx, double* restrict y, inc_t incy, cntx_t* restrict cntx ) { if ( bli_zero_dim1( n ) ) return; // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. bool use_ref = FALSE; if ( incx != 1 || incy != 1 || bli_is_unaligned_to( ( siz_t )x, 32 ) || bli_is_unaligned_to( ( siz_t )y, 32 ) ) { use_ref = TRUE; } // Call the reference implementation if needed. if ( use_ref == TRUE ) { BLIS_DAXPYV_KERNEL_REF( conjx, n, alpha, x, incx, y, incy, cntx ); return; } dim_t n_run = n / 4; dim_t n_left = n % 4; vector4double xv, yv, zv; vector4double alphav = vec_lds( 0 * sizeof(double), (double*)alpha ); #pragma omp parallel for for ( dim_t i = 0; i < n_run; i++ ) { xv = vec_lda( 0 * sizeof(double), &x[i*4] ); yv = vec_lda( 0 * sizeof(double), &y[i*4] ); zv = vec_madd( alphav, xv, yv ); vec_sta( zv, 0 * sizeof(double), &y[i*4] ); } for ( dim_t i = 0; i < n_left; i++ ) { y[4*n_run + i] += *alpha * x[4*n_run + i]; } } cython-blis-0.9.1/blis/_src/kernels/bgq/1/bli_dotv_bgq_int.c000066400000000000000000000064641427272030600236750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_ddotv_bgq_int ( conj_t conjx, conj_t conjy, dim_t n, double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict rho, cntx_t* restrict cntx ) { bool use_ref = FALSE; // If the vector lengths are zero, set rho to zero and return. if ( bli_zero_dim1( n ) ) { PASTEMAC(d,set0s)( *rho ); return; } // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( incx != 1 || incy != 1 || bli_is_unaligned_to( ( siz_t )x, 32 ) || bli_is_unaligned_to( ( siz_t )y, 32 ) ) use_ref = TRUE; // Call the reference implementation if needed. 
if ( use_ref ) { BLIS_DDOTV_KERNEL_REF( conjx, conjy, n, x, incx, y, incy, rho, cntx ); return; } dim_t n_run = n / 4; dim_t n_left = n % 4; double rhos = 0.0; #pragma omp parallel reduction(+:rhos) { dim_t n_threads; dim_t t_id = omp_get_thread_num(); n_threads = omp_get_num_threads(); vector4double rhov = vec_splats( 0.0 ); vector4double xv, yv; for ( dim_t i = t_id; i < n_run; i += n_threads ) { xv = vec_lda( 0 * sizeof(double), &x[i*4] ); yv = vec_lda( 0 * sizeof(double), &y[i*4] ); rhov = vec_madd( xv, yv, rhov ); } rhos += vec_extract( rhov, 0 ); rhos += vec_extract( rhov, 1 ); rhos += vec_extract( rhov, 2 ); rhos += vec_extract( rhov, 3 ); } for ( dim_t i = 0; i < n_left; i++ ) { rhos += x[4*n_run + i] * y[4*n_run + i]; } *rho = rhos; } cython-blis-0.9.1/blis/_src/kernels/bgq/1f/000077500000000000000000000000001427272030600203605ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/bgq/1f/bli_axpyf_bgq_int.c000066400000000000000000000126611427272030600242120ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_daxpyf_bgq_int ( conj_t conja, conj_t conjx, dim_t m, dim_t b_n, double* restrict alpha, double* restrict a, inc_t inca, inc_t lda, double* restrict x, inc_t incx, double* restrict y, inc_t incy, cntx_t* restrict cntx ) { const dim_t fusefac = 8; if ( bli_zero_dim2( m, b_n ) ) return; bool use_ref = FALSE; // printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\n", b_n, fusefac, inca, incx, incy, bli_is_unaligned_to( ( siz_t )a, 32 ), bli_is_unaligned_to( ( siz_t )y, 32)); // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( ( b_n < fusefac) || inca != 1 || incx != 1 || incy != 1 || bli_is_unaligned_to( ( siz_t )a, 32 ) || bli_is_unaligned_to( ( siz_t )y, 32 ) ) use_ref = TRUE; // Call the reference implementation if needed. 
if ( use_ref == TRUE ) { // printf("%d\t%d\t%d\t%d\t%d\t%d\n", fusefac, inca, incx, incy, bli_is_unaligned_to( ( siz_t )a, 32 ), bli_is_unaligned_to( ( siz_t )y, 32)); // printf("DEFAULTING TO REFERENCE IMPLEMENTATION\n"); BLIS_DAXPYF_KERNEL_REF( conja, conjx, m, b_n, alpha, a, inca, lda, x, incx, y, incy, cntx ); return; } dim_t m_run = m / 4; dim_t m_left = m % 4; double * a0 = a + 0*lda; double * a1 = a + 1*lda; double * a2 = a + 2*lda; double * a3 = a + 3*lda; double * a4 = a + 4*lda; double * a5 = a + 5*lda; double * a6 = a + 6*lda; double * a7 = a + 7*lda; double * y0 = y; double chi0 = *(x + 0*incx); double chi1 = *(x + 1*incx); double chi2 = *(x + 2*incx); double chi3 = *(x + 3*incx); double chi4 = *(x + 4*incx); double chi5 = *(x + 5*incx); double chi6 = *(x + 6*incx); double chi7 = *(x + 7*incx); PASTEMAC2(d,d,scals)( *alpha, chi0 ); PASTEMAC2(d,d,scals)( *alpha, chi1 ); PASTEMAC2(d,d,scals)( *alpha, chi2 ); PASTEMAC2(d,d,scals)( *alpha, chi3 ); PASTEMAC2(d,d,scals)( *alpha, chi4 ); PASTEMAC2(d,d,scals)( *alpha, chi5 ); PASTEMAC2(d,d,scals)( *alpha, chi6 ); PASTEMAC2(d,d,scals)( *alpha, chi7 ); vector4double a0v, a1v, a2v, a3v, a4v, a5v, a6v, a7v; vector4double yv; vector4double chi0v, chi1v, chi2v, chi3v, chi4v, chi5v, chi6v, chi7v; chi0v = vec_splats( chi0 ); chi1v = vec_splats( chi1 ); chi2v = vec_splats( chi2 ); chi3v = vec_splats( chi3 ); chi4v = vec_splats( chi4 ); chi5v = vec_splats( chi5 ); chi6v = vec_splats( chi6 ); chi7v = vec_splats( chi7 ); for ( dim_t i = 0; i < m_run; i += 1 ) { yv = vec_lda( 0 * sizeof(double), &y0[i*4]); a0v = vec_lda( 0 * sizeof(double), &a0[i*4]); a1v = vec_lda( 0 * sizeof(double), &a1[i*4]); a2v = vec_lda( 0 * sizeof(double), &a2[i*4]); a3v = vec_lda( 0 * sizeof(double), &a3[i*4]); a4v = vec_lda( 0 * sizeof(double), &a4[i*4]); a5v = vec_lda( 0 * sizeof(double), &a5[i*4]); a6v = vec_lda( 0 * sizeof(double), &a6[i*4]); a7v = vec_lda( 0 * sizeof(double), &a7[i*4]); yv = vec_madd( chi0v, a0v, yv ); yv = vec_madd( chi1v, 
a1v, yv ); yv = vec_madd( chi2v, a2v, yv ); yv = vec_madd( chi3v, a3v, yv ); yv = vec_madd( chi4v, a4v, yv ); yv = vec_madd( chi5v, a5v, yv ); yv = vec_madd( chi6v, a6v, yv ); yv = vec_madd( chi7v, a7v, yv ); vec_sta( yv, 0 * sizeof(double), &y0[i*4]); } for ( dim_t i = 0; i < m_left; ++i ) { y0[4*m_run + i] += chi0 * a0[4*m_run + i] + chi1 * a1[4*m_run + i] + chi2 * a2[4*m_run + i] + chi3 * a3[4*m_run + i] + chi4 * a4[4*m_run + i] + chi5 * a5[4*m_run + i] + chi6 * a6[4*m_run + i] + chi7 * a7[4*m_run + i]; } } cython-blis-0.9.1/blis/_src/kernels/bgq/3/000077500000000000000000000000001427272030600202145ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/bgq/3/bli_gemm_bgq_int_8x8.c000066400000000000000000000326721427272030600243570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef restrict #include #include /* * Here is dgemm kernel for QPX. * Instruction mix was divined by a statement in an email from John Gunnels when asked about the peak performance with a single thread: * "Achievable peak can either be: * 1) 12.8 GF 8 FMAs cycle * 1.6 GHz * 2) 8.53 GF Takes intoo account the instruction mix in DGEMM and the fact that you can only do an FMA or a load/store in a single cycle with just one thread * 3) 7.58 GF (2) + the fact that we can only issue 8 instructions in 9 cycles with one thread" * * Which I have taken to mean: 8.53 GFLOPS implies on average 5.33 flops/cycle. * I know the kernel John uses is 8x8, so 16 flops per loop iteration. * Thus there must be 24 total instructions per iteration because 16/24 = 5.33. * * Here, we have 6 loads per iteration. These are executed on a different pipeline from FMAs so * we could (maybe) theoretically hit 100% of peak with this instruction mix */ void bli_dgemm_bgq_int_8x8 ( dim_t m, dim_t n, dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { GEMM_UKR_SETUP_CT_ANY( d, 8, 8, false ); //Registers for storing C. 
//4 4x4 subblocks of C, c00, c01, c10, c11 //4 registers per subblock: a, b, c, d //There is an excel file that details which register ends up storing what vector4double c00a = vec_splats( 0.0 ); vector4double c00b = vec_splats( 0.0 ); vector4double c00c = vec_splats( 0.0 ); vector4double c00d = vec_splats( 0.0 ); vector4double c01a = vec_splats( 0.0 ); vector4double c01b = vec_splats( 0.0 ); vector4double c01c = vec_splats( 0.0 ); vector4double c01d = vec_splats( 0.0 ); vector4double c10a = vec_splats( 0.0 ); vector4double c10b = vec_splats( 0.0 ); vector4double c10c = vec_splats( 0.0 ); vector4double c10d = vec_splats( 0.0 ); vector4double c11a = vec_splats( 0.0 ); vector4double c11b = vec_splats( 0.0 ); vector4double c11c = vec_splats( 0.0 ); vector4double c11d = vec_splats( 0.0 ); vector4double b0a, b1a; vector4double b0b, b1b; vector4double a0, a1; for( dim_t i = 0; i < k; i++ ) { b0a = vec_ld2a( 0 * sizeof(double), &b[8*i] ); b0b = vec_ld2a( 2 * sizeof(double), &b[8*i] ); b1a = vec_ld2a( 4 * sizeof(double), &b[8*i] ); b1b = vec_ld2a( 6 * sizeof(double), &b[8*i] ); a0 = vec_lda ( 0 * sizeof(double), &a[8*i] ); a1 = vec_lda ( 4 * sizeof(double), &a[8*i] ); c00a = vec_xmadd ( b0a, a0, c00a ); c00b = vec_xxmadd( a0, b0a, c00b ); c00c = vec_xmadd ( b0b, a0, c00c ); c00d = vec_xxmadd( a0, b0b, c00d ); c01a = vec_xmadd ( b1a, a0, c01a ); c01b = vec_xxmadd( a0, b1a, c01b ); c01c = vec_xmadd ( b1b, a0, c01c ); c01d = vec_xxmadd( a0, b1b, c01d ); c10a = vec_xmadd ( b0a, a1, c10a ); c10b = vec_xxmadd( a1, b0a, c10b ); c10c = vec_xmadd ( b0b, a1, c10c ); c10d = vec_xxmadd( a1, b0b, c10d ); c11a = vec_xmadd ( b1a, a1, c11a ); c11b = vec_xxmadd( a1, b1a, c11b ); c11c = vec_xmadd ( b1b, a1, c11c ); c11d = vec_xxmadd( a1, b1b, c11d ); } // Create patterns for permuting Cb and Cd vector4double pattern = vec_gpci( 01032 ); vector4double AB; vector4double C = vec_splats( 0.0 ); vector4double betav = vec_lds( 0, ( double* )beta ); vector4double alphav = vec_lds( 0, ( double* 
)alpha ); double ct; //Macro to update 4 elements of C in a column. //REG is the register holding those 4 elements //ADDR is the address to write them to //OFFSET is the number of rows from ADDR to write to #define UPDATE( REG, ADDR, OFFSET ) \ { \ ct = *(ADDR + (OFFSET + 0) * rs_c); \ C = vec_insert( ct, C, 0 ); \ ct = *(ADDR + (OFFSET + 1) * rs_c); \ C = vec_insert( ct, C, 1 ); \ ct = *(ADDR + (OFFSET + 2) * rs_c); \ C = vec_insert( ct, C, 2 ); \ ct = *(ADDR + (OFFSET + 3) * rs_c); \ C = vec_insert( ct, C, 3 ); \ \ AB = vec_mul( REG, alphav ); \ AB = vec_madd( C, betav, AB); \ \ ct = vec_extract( AB, 0 ); \ *(ADDR + (OFFSET + 0) * rs_c) = ct; \ ct = vec_extract( AB, 1 ); \ *(ADDR + (OFFSET + 1) * rs_c) = ct; \ ct = vec_extract( AB, 2 ); \ *(ADDR + (OFFSET + 2) * rs_c) = ct; \ ct = vec_extract( AB, 3 ); \ *(ADDR + (OFFSET + 3) * rs_c) = ct; \ } //Update c00 and c10 sub-blocks UPDATE( c00a, c, 0 ); UPDATE( c10a, c, 4 ); c = c + cs_c; AB = vec_perm( c00b, c00b, pattern ); UPDATE( AB, c, 0 ); AB = vec_perm( c10b, c10b, pattern ); UPDATE( AB, c, 4 ); c = c + cs_c; UPDATE( c00c, c, 0 ); UPDATE( c10c, c, 4 ); c = c + cs_c; AB = vec_perm( c00d, c00d, pattern ); UPDATE( AB, c, 0 ); AB = vec_perm( c10d, c10d, pattern ); UPDATE( AB, c, 4 ); //Update c01 and c11 sub-blocks c = c + cs_c; UPDATE( c01a, c, 0 ); UPDATE( c11a, c, 4 ); c = c + cs_c; AB = vec_perm( c01b, c01b, pattern ); UPDATE( AB, c, 0 ); AB = vec_perm( c11b, c11b, pattern ); UPDATE( AB, c, 4 ); c = c + cs_c; UPDATE( c01c, c, 0 ); UPDATE( c11c, c, 4 ); c = c + cs_c; AB = vec_perm( c01d, c01d, pattern ); UPDATE( AB, c, 0 ); AB = vec_perm( c11d, c11d, pattern ); UPDATE( AB, c, 4 ); GEMM_UKR_FLUSH_CT( d ); } void printvec(vector4double v) { double a = vec_extract(v, 0); double b = vec_extract(v, 1); double c = vec_extract(v, 2); double d = vec_extract(v, 3); printf("%4.3f\t%4.3f\t%4.3f\t%4.3f\n", a, b, c, d); } void bli_zgemm_bgq_int_4x4 ( dim_t m, dim_t n, dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, 
dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { GEMM_UKR_SETUP_CT_ANY( z, 4, 4, false ); double* a_d = ( double* )a; double* b_d = ( double* )b; double* c_d = ( double* )c; //Registers for storing C. //2 2x4 subblocks of C, c0, and c1 //Each sub-block has 4 columns, 0, 1, 2, 3 //Each column has 2 partial sum, a and b, and contains 2 complex numbers. vector4double c00a = vec_splats( 0.0 ); vector4double c00b = vec_splats( 0.0 ); vector4double c01a = vec_splats( 0.0 ); vector4double c01b = vec_splats( 0.0 ); vector4double c02a = vec_splats( 0.0 ); vector4double c02b = vec_splats( 0.0 ); vector4double c03a = vec_splats( 0.0 ); vector4double c03b = vec_splats( 0.0 ); vector4double c10a = vec_splats( 0.0 ); vector4double c10b = vec_splats( 0.0 ); vector4double c11a = vec_splats( 0.0 ); vector4double c11b = vec_splats( 0.0 ); vector4double c12a = vec_splats( 0.0 ); vector4double c12b = vec_splats( 0.0 ); vector4double c13a = vec_splats( 0.0 ); vector4double c13b = vec_splats( 0.0 ); vector4double b0, b1, b2, b3; vector4double a0, a1; for( dim_t i = 0; i < k; i++ ) { b0 = vec_ld2a( 0 * sizeof(double), &b_d[8*i] ); b1 = vec_ld2a( 2 * sizeof(double), &b_d[8*i] ); b2 = vec_ld2a( 4 * sizeof(double), &b_d[8*i] ); b3 = vec_ld2a( 6 * sizeof(double), &b_d[8*i] ); a0 = vec_lda ( 0 * sizeof(double), &a_d[8*i] ); a1 = vec_lda ( 4 * sizeof(double), &a_d[8*i] ); c00a = vec_xmadd ( b0, a0, c00a ); c00b = vec_xxcpnmadd( a0, b0, c00b ); c01a = vec_xmadd ( b1, a0, c01a ); c01b = vec_xxcpnmadd( a0, b1, c01b ); c02a = vec_xmadd ( b2, a0, c02a ); c02b = vec_xxcpnmadd( a0, b2, c02b ); c03a = vec_xmadd ( b3, a0, c03a ); c03b = vec_xxcpnmadd( a0, b3, c03b ); c10a = vec_xmadd ( b0, a1, c10a ); c10b = vec_xxcpnmadd( a1, b0, c10b ); c11a = vec_xmadd ( b1, a1, c11a ); c11b = vec_xxcpnmadd( a1, b1, c11b ); c12a = vec_xmadd ( b2, a1, c12a ); c12b = vec_xxcpnmadd( a1, b2, c12b ); c13a = vec_xmadd ( b3, a1, 
c13a ); c13b = vec_xxcpnmadd( a1, b3, c13b ); } // Create patterns for permuting the "b" parts of each vector vector4double pattern = vec_gpci( 01032 ); vector4double zed = vec_splats( 0.0 ); vector4double AB; vector4double C = vec_splats( 0.0 ); vector4double C1 = vec_splats( 0.0 ); vector4double C2 = vec_splats( 0.0 ); double alphar = bli_zreal( *alpha ); double alphai = bli_zimag( *alpha ); double betar = bli_zreal( *beta ); double betai = bli_zimag( *beta ); vector4double alphav = vec_splats( 0.0 ); vector4double betav = vec_splats( 0.0 ); alphav = vec_insert( alphar, alphav, 0); alphav = vec_insert( alphai, alphav, 1); alphav = vec_insert( alphar, alphav, 2); alphav = vec_insert( alphai, alphav, 3); betav = vec_insert( betar, betav, 0); betav = vec_insert( betai, betav, 1); betav = vec_insert( betar, betav, 2); betav = vec_insert( betai, betav, 3); double ct; //Macro to update 2 elements of C in a column. //REG1 is the register holding the first partial sum of those 2 elements //REG2 is the register holding the second partial sum of those 2 elements //ADDR is the address to write them to //OFFSET is the number of rows from ADDR to write to #define ZUPDATE( REG1, REG2, ADDR, OFFSET ) \ { \ ct = *(ADDR + (OFFSET + 0) * rs_c); \ C = vec_insert( ct, C, 0 ); \ ct = *(ADDR + (OFFSET + 0) * rs_c + 1); \ C = vec_insert( ct, C, 1 ); \ ct = *(ADDR + (OFFSET + 2) * rs_c); \ C = vec_insert( ct, C, 2 ); \ ct = *(ADDR + (OFFSET + 2) * rs_c + 1); \ C = vec_insert( ct, C, 3 ); \ \ AB = vec_sub(REG1, REG2 ); \ \ /* Scale by alpha */ \ REG1 = vec_xmadd( alphav, AB, zed ); \ REG2 = vec_xxcpnmadd( AB, alphav, zed ); \ AB = vec_sub(REG1, REG2 ); \ \ \ /* Scale by beta */ \ REG1 = vec_xmadd( betav, C, zed ); \ REG2 = vec_xxcpnmadd( C, betav, zed ); \ C = vec_sub(REG1, REG2 ); \ \ /* Add AB to C */ \ C = vec_add( AB, C ); \ \ ct = vec_extract( C, 0 ); \ *(ADDR + (OFFSET + 0) * rs_c) = ct; \ ct = vec_extract( C, 1 ); \ *(ADDR + (OFFSET + 0) * rs_c + 1) = ct; \ ct = vec_extract( C, 2 
); \ *(ADDR + (OFFSET + 2) * rs_c) = ct; \ ct = vec_extract( C, 3 ); \ *(ADDR + (OFFSET + 2) * rs_c + 1) = ct; \ } ZUPDATE( c00a, c00b, c_d, 0 ); ZUPDATE( c10a, c10b, c_d, 4 ); c_d += 2*cs_c; ZUPDATE( c01a, c01b, c_d, 0 ); ZUPDATE( c11a, c11b, c_d, 4 ); c_d += 2*cs_c; ZUPDATE( c02a, c02b, c_d, 0 ); ZUPDATE( c12a, c12b, c_d, 4 ); c_d += 2*cs_c; ZUPDATE( c03a, c03b, c_d, 0 ); ZUPDATE( c13a, c13b, c_d, 4 ); GEMM_UKR_FLUSH_CT( z ); } cython-blis-0.9.1/blis/_src/kernels/bgq/bli_kernels_bgq.h000066400000000000000000000035561427272030600233560ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
// Declarations for the Blue Gene/Q (bgq) kernel set. Each line invokes a
// framework *_PROT macro with a C type, a one-letter datatype prefix and a
// kernel name; the names correspond to the bli_<prefix><name> functions
// defined in the bgq kernel sources (e.g. bli_dgemm_bgq_int_8x8,
// bli_zgemm_bgq_int_4x4, bli_daxpyf_bgq_int).
// NOTE(review): the macros presumably expand to full function prototypes;
// confirm against their definitions in the framework headers.

// Level-3 gemm micro-kernels.
GEMM_UKR_PROT( double,   d, gemm_bgq_int_8x8 )
GEMM_UKR_PROT( dcomplex, z, gemm_bgq_int_4x4 )

// Level-1f / level-1v kernels.
AXPYF_KER_PROT( double, d, axpyf_bgq_int )
AXPYV_KER_PROT( double, d, axpyv_bgq_int )
DOTV_KER_PROT( double, d, dotv_bgq_int )
#include "blis.h"

#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"

// GROUP_YMM_BY_4: un-permutes the eight sgemm accumulators (ymm8-ymm15)
// after the k loops, using ymm7 as scratch. The first four groups swap
// elements between register pairs with vshufps; the last four groups swap
// 128-bit halves between register pairs with vperm2f128. The register
// diagrams at the use site show the layout before and after.
// (No comments inside the macro body: a // comment before a line
// continuation would swallow the following line.)
#define GROUP_YMM_BY_4 \
    vmovaps(ymm15, ymm7)\
    vshufps(imm(0xe4), ymm13, ymm15, ymm15)\
    vshufps(imm(0xe4), ymm7, ymm13, ymm13)\
    \
    vmovaps(ymm11, ymm7)\
    vshufps(imm(0xe4), ymm9, ymm11, ymm11)\
    vshufps(imm(0xe4), ymm7, ymm9, ymm9)\
    \
    vmovaps(ymm14, ymm7)\
    vshufps(imm(0xe4), ymm12, ymm14, ymm14)\
    vshufps(imm(0xe4), ymm7, ymm12, ymm12)\
    \
    vmovaps(ymm10, ymm7)\
    vshufps(imm(0xe4), ymm8, ymm10, ymm10)\
    vshufps(imm(0xe4), ymm7, ymm8, ymm8)\
    \
    vmovaps(ymm15, ymm7)\
    vperm2f128(imm(0x12), ymm15, ymm11, ymm15)\
    vperm2f128(imm(0x30), ymm7, ymm11, ymm11)\
    \
    vmovaps(ymm13, ymm7)\
    vperm2f128(imm(0x12), ymm13, ymm9, ymm13)\
    vperm2f128(imm(0x30), ymm7, ymm9, ymm9)\
    \
    vmovaps(ymm14, ymm7)\
    vperm2f128(imm(0x12), ymm14, ymm10, ymm14)\
    vperm2f128(imm(0x30), ymm7, ymm10, ymm10)\
    \
    vmovaps(ymm12, ymm7)\
    vperm2f128(imm(0x12), ymm12, ymm8, ymm12)\
    vperm2f128(imm(0x30), ymm7, ymm8, ymm8)

// STORE_SS: scatters the eight floats of ymm0 one scalar at a time: the
// low four through rcx (+0, +rsi, +r12, +r13) and the high four through
// rdx with the same offsets, rotating lanes with vpermilps(0x39).
// It is not invoked anywhere in the kernel below.
// NOTE: the macro body ends with a line continuation, so the line that
// follows it must stay blank.
#define STORE_SS \
    vextractf128(imm(1), ymm0, xmm2)\
    vmovss(xmm0, mem(rcx))\
    vpermilps(imm(0x39), xmm0, xmm1)\
    vmovss(xmm1, mem(rcx, rsi, 1))\
    vpermilps(imm(0x39), xmm1, xmm0)\
    vmovss(xmm0, mem(rcx, r12, 1))\
    vpermilps(imm(0x39), xmm0, xmm1)\
    vmovss(xmm1, mem(rcx, r13, 1))\
    vmovss(xmm2, mem(rdx))\
    vpermilps(imm(0x39), xmm2, xmm3)\
    vmovss(xmm3, mem(rdx, rsi, 1))\
    vpermilps(imm(0x39), xmm3, xmm2)\
    vmovss(xmm2, mem(rdx, r12, 1))\
    vpermilps(imm(0x39), xmm2, xmm3)\
    vmovss(xmm3, mem(rdx, r13, 1))\

/*
 * bli_sgemm_bulldozer_asm_8x8_fma4
 *
 * Single-precision 8x8 gemm micro-kernel in inline assembly using FMA4
 * (vfmaddps):
 *
 *     C := beta * C + alpha * A * B
 *
 * Structure:
 *   - k is split into k_iter = k / 4 passes of a 4x-unrolled main loop and
 *     k_left = k % 4 passes of a single-step edge loop.
 *   - a and b are read with 32-byte accesses (8 floats per k step each).
 *   - Accumulators live in ymm8-ymm15 and are regrouped into whole columns
 *     of C by GROUP_YMM_BY_4 before being scaled by alpha.
 *   - C is read/written one 8-float column at a time with aligned vmovaps,
 *     stepping by cs_c * sizeof(float). rs_c is passed as an operand but
 *     is never referenced by the assembly.
 *   - If beta compares equal to zero, C is stored without being loaded.
 *
 * m, n, data and cntx are not referenced by the assembly; they are
 * presumably consumed by GEMM_UKR_SETUP_CT_ALIGNED / GEMM_UKR_FLUSH_CT --
 * confirm against the macro definitions.
 */
void bli_sgemm_bulldozer_asm_8x8_fma4
     (
       dim_t               m,
       dim_t               n,
       dim_t               k,
       float*     restrict alpha,
       float*     restrict a,
       float*     restrict b,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k / 4;
    uint64_t k_left = k % 4;
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;

    GEMM_UKR_SETUP_CT_ALIGNED( s, 8, 8, false, 32 );

    begin_asm()

    mov(var(a), rax) // load address of a.
    mov(var(b), rbx) // load address of b.

    vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
    vmovsldup(mem(rbx, 0*32), ymm2) // elements of a and b.
    vpermilps(imm(0x4e), ymm2, ymm3)

    mov(var(c), rcx) // load address of c
    mov(var(cs_c), rdi) // load cs_c
    lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
    lea(mem(rcx, rdi, 4), r10) // load address of c + 4*cs_c;

    lea(mem(rdi, rdi, 2), r14) // r14 = 3*cs_c;
    prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(rcx, r14, 1, 7*8)) // prefetch c + 3*cs_c
    prefetch(0, mem(r10, 7*8)) // prefetch c + 4*cs_c
    prefetch(0, mem(r10, rdi, 1, 7*8)) // prefetch c + 5*cs_c
    prefetch(0, mem(r10, rdi, 2, 7*8)) // prefetch c + 6*cs_c
    prefetch(0, mem(r10, r14, 1, 7*8)) // prefetch c + 7*cs_c

    // Zero the eight accumulators.
    vxorps(ymm8, ymm8, ymm8)
    vxorps(ymm9, ymm9, ymm9)
    vxorps(ymm10, ymm10, ymm10)
    vxorps(ymm11, ymm11, ymm11)
    vxorps(ymm12, ymm12, ymm12)
    vxorps(ymm13, ymm13, ymm13)
    vxorps(ymm14, ymm14, ymm14)
    vxorps(ymm15, ymm15, ymm15)

    mov(var(k_iter), rsi) // i = k_iter;
    test(rsi, rsi) // check i via logical AND.
    je(.SCONSIDKLEFT) // if i == 0, jump to code that
                      // contains the k_left loop.

    label(.SLOOPKITER) // MAIN LOOP

    // iteration 0 (consumes a in ymm0, pre-loads next a into ymm1)
    prefetch(0, mem(rax, 16*32))
    vfmaddps(ymm15, ymm0, ymm2, ymm15)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovshdup(mem(rbx, 0*32), ymm2)
    vfmaddps(ymm13, ymm0, ymm3, ymm13)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vmovaps(mem(rax, 1*32), ymm1)
    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm11, ymm0, ymm4, ymm11)
    vfmaddps(ymm9, ymm0, ymm5, ymm9)

    vfmaddps(ymm14, ymm0, ymm2, ymm14)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovsldup(mem(rbx, 1*32), ymm2)
    vfmaddps(ymm12, ymm0, ymm3, ymm12)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm10, ymm0, ymm4, ymm10)
    vfmaddps(ymm8, ymm0, ymm5, ymm8)

    // iteration 1 (consumes a in ymm1, pre-loads next a into ymm0)
    vfmaddps(ymm15, ymm1, ymm2, ymm15)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovshdup(mem(rbx, 1*32), ymm2)
    vfmaddps(ymm13, ymm1, ymm3, ymm13)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vmovaps(mem(rax, 2*32), ymm0)
    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm11, ymm1, ymm4, ymm11)
    vfmaddps(ymm9, ymm1, ymm5, ymm9)

    vfmaddps(ymm14, ymm1, ymm2, ymm14)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovsldup(mem(rbx, 2*32), ymm2)
    vfmaddps(ymm12, ymm1, ymm3, ymm12)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm10, ymm1, ymm4, ymm10)
    vfmaddps(ymm8, ymm1, ymm5, ymm8)

    // iteration 2 (also advances the a pointer for the next pass)
    prefetch(0, mem(rax, 18*32))
    vfmaddps(ymm15, ymm0, ymm2, ymm15)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovshdup(mem(rbx, 2*32), ymm2)
    vfmaddps(ymm13, ymm0, ymm3, ymm13)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vmovaps(mem(rax, 3*32), ymm1)
    add(imm(4*8*4), rax) // a += 4*8 (unroll x mr)
    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm11, ymm0, ymm4, ymm11)
    vfmaddps(ymm9, ymm0, ymm5, ymm9)

    vfmaddps(ymm14, ymm0, ymm2, ymm14)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovsldup(mem(rbx, 3*32), ymm2)
    vfmaddps(ymm12, ymm0, ymm3, ymm12)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm10, ymm0, ymm4, ymm10)
    vfmaddps(ymm8, ymm0, ymm5, ymm8)

    // iteration 3 (also advances the b pointer; the loads after the
    // pointer bumps fetch the first a/b vectors of the next pass)
    vfmaddps(ymm15, ymm1, ymm2, ymm15)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovshdup(mem(rbx, 3*32), ymm2)
    add(imm(4*8*4), rbx) // b += 4*8 (unroll x nr)
    vfmaddps(ymm13, ymm1, ymm3, ymm13)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vmovaps(mem(rax, 0*32), ymm0)
    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm11, ymm1, ymm4, ymm11)
    vfmaddps(ymm9, ymm1, ymm5, ymm9)

    vfmaddps(ymm14, ymm1, ymm2, ymm14)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovsldup(mem(rbx, 0*32), ymm2)
    vfmaddps(ymm12, ymm1, ymm3, ymm12)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm10, ymm1, ymm4, ymm10)
    vfmaddps(ymm8, ymm1, ymm5, ymm8)

    dec(rsi) // i -= 1;
    jne(.SLOOPKITER) // iterate again if i != 0.

    label(.SCONSIDKLEFT)

    mov(var(k_left), rsi) // i = k_left;
    test(rsi, rsi) // check i via logical AND.
    je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
                    // else, we prepare to enter k_left loop.

    label(.SLOOPKLEFT) // EDGE LOOP (one k step per pass)

    prefetch(0, mem(rax, 16*32))
    vfmaddps(ymm15, ymm0, ymm2, ymm15)
    vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
    vmovshdup(mem(rbx, 0*32), ymm2)
    vfmaddps(ymm13, ymm0, ymm3, ymm13)
    vperm2f128(imm(0x3), ymm3, ymm3, ymm5)

    vmovaps(mem(rax, 1*32), ymm1)
    add(imm(8*1*4), rax) // a += 8 (1 x mr)
    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm11, ymm0, ymm4, ymm11)
    vfmaddps(ymm9, ymm0, ymm5, ymm9)

    vfmaddps(ymm14, ymm0, ymm2, ymm14)
    vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
    vmovsldup(mem(rbx, 1*32), ymm2)
    add(imm(8*1*4), rbx) // b += 8 (1 x nr)
    vfmaddps(ymm12, ymm0, ymm3, ymm12)
    vperm2f128(imm(0x3), ymm3, ymm3, ymm5)

    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm10, ymm0, ymm4, ymm10)
    vfmaddps(ymm8, ymm0, ymm5, ymm8)
    vmovaps(ymm1, ymm0) // pre-loaded a becomes the current a

    dec(rsi) // i -= 1;
    jne(.SLOOPKLEFT) // iterate again if i != 0.

    label(.SPOSTACCUM)

    // Accumulator layout on leaving the k loops:
    //
    // ymm15:  ymm13:  ymm11:  ymm9:
    // ( ab00  ( ab02  ( ab04  ( ab06
    //   ab10    ab12    ab14    ab16
    //   ab22    ab20    ab26    ab24
    //   ab32    ab30    ab36    ab34
    //   ab44    ab46    ab40    ab42
    //   ab54    ab56    ab50    ab52
    //   ab66    ab64    ab62    ab60
    //   ab76 )  ab74 )  ab72 )  ab70 )

    // ymm14:  ymm12:  ymm10:  ymm8:
    // ( ab01  ( ab03  ( ab05  ( ab07
    //   ab11    ab13    ab15    ab17
    //   ab23    ab21    ab27    ab25
    //   ab33    ab31    ab37    ab35
    //   ab45    ab47    ab41    ab43
    //   ab55    ab57    ab51    ab53
    //   ab67    ab65    ab63    ab61
    //   ab77 )  ab75 )  ab73 )  ab71 )

    GROUP_YMM_BY_4

    // The next two diagrams presumably describe the state after the
    // vshufps half of GROUP_YMM_BY_4 and after its vperm2f128 half,
    // respectively (NOTE(review): confirm).
    //
    // ymm15:  ymm13:  ymm11:  ymm9:
    // ( ab00  ( ab02  ( ab04  ( ab06
    //   ab10    ab12    ab14    ab16
    //   ab20    ab22    ab24    ab26
    //   ab30    ab32    ab34    ab36
    //   ab44    ab46    ab40    ab42
    //   ab54    ab56    ab50    ab52
    //   ab64    ab66    ab60    ab62
    //   ab74 )  ab76 )  ab70 )  ab72 )

    // ymm14:  ymm12:  ymm10:  ymm8:
    // ( ab01  ( ab03  ( ab05  ( ab07
    //   ab11    ab13    ab15    ab17
    //   ab21    ab23    ab25    ab27
    //   ab31    ab33    ab35    ab37
    //   ab45    ab47    ab41    ab43
    //   ab55    ab57    ab51    ab53
    //   ab65    ab67    ab61    ab63
    //   ab75 )  ab77 )  ab71 )  ab73 )

    // ymm15:  ymm13:  ymm11:  ymm9:
    // ( ab00  ( ab02  ( ab04  ( ab06
    //   ab10    ab12    ab14    ab16
    //   ab20    ab22    ab24    ab26
    //   ab30    ab32    ab34    ab36
    //   ab40    ab42    ab44    ab46
    //   ab50    ab52    ab54    ab56
    //   ab60    ab62    ab64    ab66
    //   ab70 )  ab72 )  ab74 )  ab76 )

    // ymm14:  ymm12:  ymm10:  ymm8:
    // ( ab01  ( ab03  ( ab05  ( ab07
    //   ab11    ab13    ab15    ab17
    //   ab21    ab23    ab25    ab27
    //   ab31    ab33    ab35    ab37
    //   ab41    ab43    ab45    ab47
    //   ab51    ab53    ab55    ab57
    //   ab61    ab63    ab65    ab67
    //   ab71 )  ab73 )  ab75 )  ab77 )

    mov(var(alpha), rax) // load address of alpha
    mov(var(beta), rbx) // load address of beta
    vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
    vbroadcastss(mem(rbx), ymm4) // load beta and duplicate

    vmulps(ymm0, ymm8, ymm8) // scale by alpha
    vmulps(ymm0, ymm9, ymm9)
    vmulps(ymm0, ymm10, ymm10)
    vmulps(ymm0, ymm11, ymm11)
    vmulps(ymm0, ymm12, ymm12)
    vmulps(ymm0, ymm13, ymm13)
    vmulps(ymm0, ymm14, ymm14)
    vmulps(ymm0, ymm15, ymm15)

    // now avoid loading C if beta == 0
    // NOTE(review): per the x86 definition of (v)ucomiss, ZF is also set
    // for an unordered compare, so a NaN beta would take the beta == 0
    // path as well -- confirm whether that matters to callers.
    vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
    vucomiss(xmm0, xmm4) // set ZF if beta == 0.
    je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case

    // General case: for each of the eight columns, load the column of C,
    // compute beta*C + (alpha*A*B) in one vfmaddps, and store it back.
    vmovaps(mem(rcx), ymm0) // load c00:c70,
    vfmaddps(ymm15, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,
    vmovaps(ymm0, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(mem(rcx), ymm1) // load c01:c71,
    vfmaddps(ymm14, ymm1, ymm4, ymm1) // scale by beta and add the gemm result,
    vmovaps(ymm1, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(mem(rcx), ymm0) // load c02:c72,
    vfmaddps(ymm13, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,
    vmovaps(ymm0, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(mem(rcx), ymm1) // load c03:c73,
    vfmaddps(ymm12, ymm1, ymm4, ymm1) // scale by beta and add the gemm result,
    vmovaps(ymm1, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(mem(rcx), ymm0) // load c04:c74,
    vfmaddps(ymm11, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,
    vmovaps(ymm0, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(mem(rcx), ymm1) // load c05:c75,
    vfmaddps(ymm10, ymm1, ymm4, ymm1) // scale by beta and add the gemm result,
    vmovaps(ymm1, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(mem(rcx), ymm0) // load c06:c76,
    vfmaddps(ymm9, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,
    vmovaps(ymm0, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(mem(rcx), ymm1) // load c07:c77,
    vfmaddps(ymm8, ymm1, ymm4, ymm1) // scale by beta and add the gemm result,
    vmovaps(ymm1, mem(rcx)) // and store back to memory.

    jmp(.SDONE) // jump to end.

    label(.SBETAZERO)

    // beta == 0: store the alpha-scaled columns directly, never reading C.
    vmovaps(ymm15, mem(rcx)) // store column 0
    add(rdi, rcx) // c += cs_c;
    vmovaps(ymm14, mem(rcx)) // store column 1
    add(rdi, rcx) // c += cs_c;
    vmovaps(ymm13, mem(rcx)) // store column 2
    add(rdi, rcx) // c += cs_c;
    vmovaps(ymm12, mem(rcx)) // store column 3
    add(rdi, rcx) // c += cs_c;
    vmovaps(ymm11, mem(rcx)) // store column 4
    add(rdi, rcx) // c += cs_c;
    vmovaps(ymm10, mem(rcx)) // store column 5
    add(rdi, rcx) // c += cs_c;
    vmovaps(ymm9, mem(rcx)) // store column 6
    add(rdi, rcx) // c += cs_c;
    vmovaps(ymm8, mem(rcx)) // store column 7

    label(.SDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter), // 0
      [k_left] "m" (k_left), // 1
      [a]      "m" (a),      // 2
      [b]      "m" (b),      // 3
      [alpha]  "m" (alpha),  // 4
      [beta]   "m" (beta),   // 5
      [c]      "m" (c),      // 6
      [rs_c]   "m" (rs_c),   // 7 (listed but not referenced by the asm)
      [cs_c]   "m" (cs_c)    // 8
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )

    GEMM_UKR_FLUSH_CT( s );
}
xmm14)\ vmovaps(mem(rbx, 2*8), xmm2)\ vfmaddpd(xmm15, xmm3, xmm0, xmm15)\ vmovaps(mem(rbx, 4*8), xmm3)\ #define KERNEL4x6_3(xx) \ vmovddup(mem(rax, 0*8), xmm0)\ vfmaddpd(xmm4, xmm1, xmm0, xmm4)\ vfmaddpd(xmm5, xmm2, xmm0, xmm5)\ vfmaddpd(xmm6, xmm3, xmm0, xmm6)\ vmovddup(mem(rax, 1*8), xmm0)\ vfmaddpd(xmm7, xmm1, xmm0, xmm7)\ prefetch(0, mem(rax, 224))\ vfmaddpd(xmm8, xmm2, xmm0, xmm8)\ vfmaddpd(xmm9, xmm3, xmm0, xmm9)\ vmovddup(mem(rax, 2*8), xmm0)\ vfmaddpd(xmm10, xmm1, xmm0, xmm10)\ vfmaddpd(xmm11, xmm2, xmm0, xmm11)\ vfmaddpd(xmm12, xmm3, xmm0, xmm12)\ vmovddup(mem(rax, 3*8), xmm0)\ vfmaddpd(xmm13, xmm1, xmm0, xmm13)\ vmovaps(mem(rbx, 6*8), xmm1)\ vfmaddpd(xmm14, xmm2, xmm0, xmm14)\ vmovaps(mem(rbx, 8*8), xmm2)\ vfmaddpd(xmm15, xmm3, xmm0, xmm15)\ vmovaps(mem(rbx, 10*8), xmm3) #define KERNEL4x6_4(xx) \ vmovddup(mem(rax, 4*8), xmm0)\ vfmaddpd(xmm4, xmm1, xmm0, xmm4)\ prefetch(0, mem(rax, 224))\ vfmaddpd(xmm5, xmm2, xmm0, xmm5)\ vfmaddpd(xmm6, xmm3, xmm0, xmm6)\ vmovddup(mem(rax, 5*8), xmm0)\ vfmaddpd(xmm7, xmm1, xmm0, xmm7)\ vfmaddpd(xmm8, xmm2, xmm0, xmm8)\ vfmaddpd(xmm9, xmm3, xmm0, xmm9)\ vmovddup(mem(rax, 6*8), xmm0)\ vfmaddpd(xmm10, xmm1, xmm0, xmm10)\ vfmaddpd(xmm11, xmm2, xmm0, xmm11)\ vfmaddpd(xmm12, xmm3, xmm0, xmm12)\ vmovddup(mem(rax, 7*8), xmm0)\ vfmaddpd(xmm13, xmm1, xmm0, xmm13)\ vmovaps(mem(rbx, 12*8), xmm1)\ vfmaddpd(xmm14, xmm2, xmm0, xmm14)\ vmovaps(mem(rbx, 14*8), xmm2)\ vfmaddpd(xmm15, xmm3, xmm0, xmm15)\ add(imm(16*8), rax)\ vmovaps(mem(rbx, 16*8), xmm3)\ add(imm(24*8), rbx) void bli_dgemm_bulldozer_asm_4x6_fma4 ( dim_t m, dim_t n, dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k / 12; uint64_t k_left = k % 12; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT_ANY( d, 4, 6, false ); begin_asm() vzeroall() mov(var(b), rbx) // load address of b. mov(var(a), rax) // load address of a. prefetch(0, mem(rax, 64)) vmovaps(mem(rbx, 0*8), xmm1) vmovaps(mem(rbx, 2*8), xmm2) vmovaps(mem(rbx, 4*8), xmm3) add(imm(12*8), rbx) add(imm(8*8), rax) mov(var(k_iter), rsi) // i = k_iter; notice var(k_iter) not $0 test(rsi, rsi) je(.CONSIDERKLEFT) ALIGN32 label(.LOOPKITER) // MAIN LOOP KERNEL4x6_1(xx) KERNEL4x6_2(xx) KERNEL4x6_3(xx) KERNEL4x6_4(xx) KERNEL4x6_1(xx) KERNEL4x6_2(xx) KERNEL4x6_3(xx) KERNEL4x6_4(xx) KERNEL4x6_1(xx) KERNEL4x6_2(xx) KERNEL4x6_3(xx) KERNEL4x6_4(xx) dec(rsi) jne(.LOOPKITER) label(.CONSIDERKLEFT) mov(var(k_left), rsi) test(rsi, rsi) label(.LOOPKLEFT) je(.POSTACCUM) KERNEL4x6_1(xx) add(imm(6*8), rbx) add(imm(4*8), rax) dec(rsi) jmp(.LOOPKLEFT) // iterate again if i != 0. label(.POSTACCUM) mov(var(rs_c), rsi) // load cs_c mov(var(cs_c), rdi) // load rs_c vmovddup(mem(var(alpha)), xmm2) //load alpha vmovddup(mem(var(beta)), xmm3) //load beta mov(var(c), rcx) // load address of c sal(imm(3), rsi) // cs_c *= sizeof(double) sal(imm(3), rdi) // rs_c *= sizeof(double) lea(mem(rcx, rdi, 2), rdx) vmovlpd(mem(rcx), xmm0, xmm0) vmovlpd(mem(rdx), xmm1, xmm1) vmovhpd(mem(rcx, rdi, 1), xmm0, xmm0) vmovhpd(mem(rdx, rdi, 1), xmm1, xmm1) lea(mem(rdx, rdi, 2), r8) vmulpd(xmm2, xmm4, xmm4) // scale by alpha, vmulpd(xmm2, xmm5, xmm5) // scale by alpha, vfmaddpd(xmm4, xmm0, xmm3, xmm4) // scale by beta, and add the gemm result vmovlpd(mem(r8), xmm0, xmm0) vfmaddpd(xmm5, xmm1, xmm3, xmm5) // scale by beta, and add the gemm result vmovhpd(mem(r8, rdi, 1), xmm0, xmm0) vmovlpd(xmm4, mem(rcx)) // and store back to memory. vmovlpd(xmm5, mem(rdx)) // and store back to memory. 
vmovhpd(xmm4, mem(rcx, rdi, 1)) add(rsi, rcx) vmovhpd(xmm5, mem(rdx, rdi, 1)) add(rsi, rdx) vmulpd(xmm2, xmm6, xmm6) // scale by alpha, vfmaddpd(xmm6, xmm0, xmm3, xmm6) // scale by beta, and add the gemm result vmovlpd(xmm6, mem(r8)) // and store back to memory. vmovhpd(xmm6, mem(r8, rdi, 1)) add(rsi, r8) vmovlpd(mem(rcx), xmm0, xmm0) vmovlpd(mem(rdx), xmm1, xmm1) vmovlpd(mem(r8), xmm4, xmm4) vmovhpd(mem(rcx, rdi, 1), xmm0, xmm0) vmovhpd(mem(rdx, rdi, 1), xmm1, xmm1) vmovhpd(mem(r8, rdi, 1), xmm4, xmm4) vmulpd(xmm2, xmm7, xmm7) // scale by alpha, vmulpd(xmm2, xmm8, xmm8) // scale by alpha, vmulpd(xmm2, xmm9, xmm9) // scale by alpha, vfmaddpd(xmm7, xmm0, xmm3, xmm7) // scale by beta, and add the gemm result vfmaddpd(xmm8, xmm1, xmm3, xmm8) // scale by beta, and add the gemm result vfmaddpd(xmm9, xmm4, xmm3, xmm9) // scale by beta, and add the gemm result vmovlpd(xmm7, mem(rcx)) // and store back to memory. vmovlpd(xmm8, mem(rdx)) // and store back to memory. vmovlpd(xmm9, mem(r8)) // and store back to memory. vmovhpd(xmm7, mem(rcx, rdi, 1)) add(rsi, rcx) vmovhpd(xmm8, mem(rdx, rdi, 1)) add(rsi, rdx) vmovhpd(xmm9, mem(r8, rdi, 1)) add(rsi, r8) vmovlpd(mem(rcx), xmm0, xmm0) vmovlpd(mem(rdx), xmm1, xmm1) vmovlpd(mem(r8), xmm4, xmm4) vmovhpd(mem(rcx, rdi, 1), xmm0, xmm0) vmovhpd(mem(rdx, rdi, 1), xmm1, xmm1) vmovhpd(mem(r8, rdi, 1), xmm4, xmm4) vmulpd(xmm2, xmm10, xmm10) // scale by alpha, vmulpd(xmm2, xmm11, xmm11) // scale by alpha, vmulpd(xmm2, xmm12, xmm12) // scale by alpha, vfmaddpd(xmm10, xmm0, xmm3, xmm10) // scale by beta, and add the gemm result vfmaddpd(xmm11, xmm1, xmm3, xmm11) // scale by beta, and add the gemm result vfmaddpd(xmm12, xmm4, xmm3, xmm12) // scale by beta, and add the gemm result vmovlpd(xmm10, mem(rcx)) // and store back to memory. vmovlpd(xmm11, mem(rdx)) // and store back to memory. vmovlpd(xmm12, mem(r8)) // and store back to memory. 
vmovhpd(xmm10, mem(rcx, rdi, 1)) add(rsi, rcx) vmovhpd(xmm11, mem(rdx, rdi, 1)) add(rsi, rdx) vmovhpd(xmm12, mem(r8, rdi, 1)) add(rsi, r8) vmovlpd(mem(rcx), xmm0, xmm0) vmovlpd(mem(rdx), xmm1, xmm1) vmovlpd(mem(r8), xmm4, xmm4) vmovhpd(mem(rcx, rdi, 1), xmm0, xmm0) vmovhpd(mem(rdx, rdi, 1), xmm1, xmm1) vmovhpd(mem(r8, rdi, 1), xmm4, xmm4) vmulpd(xmm2, xmm13, xmm13) // scale by alpha, vmulpd(xmm2, xmm14, xmm14) // scale by alpha, vmulpd(xmm2, xmm15, xmm15) // scale by alpha, vfmaddpd(xmm13, xmm0, xmm3, xmm13) // scale by beta, and add the gemm result vfmaddpd(xmm14, xmm1, xmm3, xmm14) // scale by beta, and add the gemm result vfmaddpd(xmm15, xmm4, xmm3, xmm15) // scale by beta, and add the gemm result vmovlpd(xmm13, mem(rcx)) // and store back to memory. vmovlpd(xmm14, mem(rdx)) // and store back to memory. vmovlpd(xmm15, mem(r8)) // and store back to memory. vmovhpd(xmm13, mem(rcx, rdi, 1)) vmovhpd(xmm14, mem(rdx, rdi, 1)) vmovhpd(xmm15, mem(r8, rdi, 1)) end_asm( : // output operands (none) : // input operands [k_iter] "r" (k_iter), // 0 [k_left] "r" (k_left), // 1 [a] "r" (a), // 2 [b] "r" (b), // 3 [alpha] "r" (alpha), // 4 [beta] "r" (beta), // 5 [c] "r" (c), // 6 [rs_c] "m" (rs_c), // 7 [cs_c] "m" (cs_c)/*, // 8 [b_next] "m" (b_next), // 9 [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMM_UKR_FLUSH_CT( d ); } //The parameter "i" is the iteration number, i.e. 
the B values to read #define MADD_TO_YMM(i) \ vfmaddps(ymm15, ymm0, ymm2, ymm15)\ vperm2f128(imm(0x3), ymm2, ymm2, ymm4)\ vfmaddps(ymm13, ymm0, ymm3, ymm13)\ vperm2f128(imm(0x3), ymm3, ymm3, ymm5)\ vfmaddps(ymm14, ymm1, ymm2, ymm14)\ vmovshdup(mem(rbx, i*32), ymm2)\ vfmaddps(ymm12, ymm1, ymm3, ymm12)\ vpermilps(imm(0x4e), ymm2, ymm3)\ vfmaddps(ymm11, ymm0, ymm4, ymm11)\ vfmaddps(ymm9, ymm0, ymm5, ymm9)\ vpermilps(imm(0xb1), ymm0, ymm0)\ vfmaddps(ymm10, ymm1, ymm4, ymm10)\ vperm2f128(imm(0x3), ymm2, ymm2, ymm4)\ vfmaddps(ymm8, ymm1, ymm5, ymm8)\ vperm2f128(imm(0x3), ymm3, ymm3, ymm5)\ void bli_cgemm_bulldozer_asm_8x4_fma4 ( dim_t m, dim_t n, dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k / 4; uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT_ALIGNED( c, 8, 4, false, 32 ); begin_asm() mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. mov(var(b_next), r15) // load address of b_next. //mov(var(a_next), r14) // load address of a_next. 
sub(imm(4*64), r15) vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovsldup(mem(rbx, 0*32), ymm2) vpermilps(imm(0x4e), ymm2, ymm3) mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.CLOOPKITER) // MAIN LOOP add(imm(4*4*8), r15) // b_next += 4*4 (unroll x nr) // iteration 0 prefetch(0, mem(rax, 8*32)) vmovaps(mem(rax, 1*32), ymm1) MADD_TO_YMM(0) vpermilps(imm(0xb1), ymm1, ymm1) vmulps(ymm0, ymm2, ymm6) vaddsubps(ymm6, ymm15, ymm15) vmulps(ymm0, ymm3, ymm7) vaddsubps(ymm7, ymm13, ymm13) vmulps(ymm1, ymm2, ymm6) vmovsldup(mem(rbx, 1*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddsubps(ymm6, ymm14, ymm14) vaddsubps(ymm7, ymm12, ymm12) vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(mem(rax, 2*32), ymm0) vaddsubps(ymm6, ymm11, ymm11) vaddsubps(ymm7, ymm9, ymm9) vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddsubps(ymm6, ymm10, ymm10) vaddsubps(ymm7, ymm8, ymm8) // iteration 1 prefetch(0, mem(rax, 10*32)) vmovaps(mem(rax, 3*32), ymm1) MADD_TO_YMM(1) vpermilps(imm(0xb1), ymm1, ymm1) vmulps(ymm0, ymm2, ymm6) vmulps(ymm0, ymm3, ymm7) vaddsubps(ymm6, ymm15, ymm15) vaddsubps(ymm7, ymm13, ymm13) vmulps(ymm1, ymm2, ymm6) vmovsldup(mem(rbx, 2*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, 
ymm3) vaddsubps(ymm6, ymm14, ymm14) vaddsubps(ymm7, ymm12, ymm12) vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(mem(rax, 4*32), ymm0) vaddsubps(ymm6, ymm11, ymm11) vaddsubps(ymm7, ymm9, ymm9) vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddsubps(ymm6, ymm10, ymm10) vaddsubps(ymm7, ymm8, ymm8) // iteration 2 prefetch(0, mem(rax, 12*32)) vmovaps(mem(rax, 5*32), ymm1) MADD_TO_YMM(2) prefetch(0, mem(r15, 2*32)) // prefetch b_next[2*4] vpermilps(imm(0xb1), ymm1, ymm1) vmulps(ymm0, ymm2, ymm6) vmulps(ymm0, ymm3, ymm7) vaddsubps(ymm6, ymm15, ymm15) vaddsubps(ymm7, ymm13, ymm13) vmulps(ymm1, ymm2, ymm6) vmovsldup(mem(rbx, 3*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddsubps(ymm6, ymm14, ymm14) vaddsubps(ymm7, ymm12, ymm12) vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(mem(rax, 6*32), ymm0) vaddsubps(ymm6, ymm11, ymm11) vaddsubps(ymm7, ymm9, ymm9) vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddsubps(ymm6, ymm10, ymm10) vaddsubps(ymm7, ymm8, ymm8) // iteration 3 prefetch(0, mem(rax, 14*32)) vmovaps(mem(rax, 7*32), ymm1) MADD_TO_YMM(3) vpermilps(imm(0xb1), ymm1, ymm1) vmulps(ymm0, ymm2, ymm6) vmulps(ymm0, ymm3, ymm7) vaddsubps(ymm6, ymm15, ymm15) vaddsubps(ymm7, ymm13, ymm13) vmulps(ymm1, ymm2, ymm6) vmovsldup(mem(rbx, 4*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddsubps(ymm6, ymm14, ymm14) vaddsubps(ymm7, ymm12, ymm12) vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(mem(rax, 8*32), ymm0) vaddsubps(ymm6, ymm11, ymm11) vaddsubps(ymm7, ymm9, ymm9) vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddsubps(ymm6, ymm10, ymm10) vaddsubps(ymm7, ymm8, ymm8) add(imm(8*4*8), rax) // a += 8*4 (unroll x mr) add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr) dec(rsi) // i -= 1; jne(.CLOOPKITER) // iterate again if i != 0. label(.CCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CPOSTACCUM) // if i == 0, we're done; jump to end. 
// else, we prepare to enter k_left loop. label(.CLOOPKLEFT) // EDGE LOOP // iteration 0 prefetch(0, mem(rax, 8*32)) vmovaps(mem(rax, 1*32), ymm1) MADD_TO_YMM(0) vpermilps(imm(0xb1), ymm1, ymm1) vmulps(ymm0, ymm2, ymm6) vmulps(ymm0, ymm3, ymm7) vaddsubps(ymm6, ymm15, ymm15) vaddsubps(ymm7, ymm13, ymm13) vmulps(ymm1, ymm2, ymm6) vmovsldup(mem(rbx, 1*32), ymm2) vmulps(ymm1, ymm3, ymm7) vpermilps(imm(0x4e), ymm2, ymm3) vaddsubps(ymm6, ymm14, ymm14) vaddsubps(ymm7, ymm12, ymm12) vmulps(ymm0, ymm4, ymm6) vmulps(ymm0, ymm5, ymm7) vmovaps(mem(rax, 2*32), ymm0) vaddsubps(ymm6, ymm11, ymm11) vaddsubps(ymm7, ymm9, ymm9) vmulps(ymm1, ymm4, ymm6) vmulps(ymm1, ymm5, ymm7) vaddsubps(ymm6, ymm10, ymm10) vaddsubps(ymm7, ymm8, ymm8) add(imm(8*1*8), rax) // a += 8 (1 x mr) add(imm(4*1*8), rbx) // b += 4 (1 x nr) dec(rsi) // i -= 1; jne(.CLOOPKLEFT) // iterate again if i != 0. label(.CPOSTACCUM) // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab01 ( ab02 ( ab03 // ab10 ab11 ab12 ab13 // ab21 ab20 ab23 ab22 // ab31 ab30 ab33 ab32 // ab42 ab43 ab40 ab41 // ab52 ab53 ab50 ab51 // ab63 ab62 ab61 ab60 // ab73 ) ab72 ) ab71 ) ab70 ) // ymm14: ymm12: ymm10: ymm8: // ( ab80 ( ab81 ( ab82 ( ab83 // ab90 ab91 ab92 ab93 // aba1 aba0 aba3 aba2 // abb1 abb0 abb3 abb2 // abc2 abc3 abc0 abc1 // abd2 abd3 abd0 abd1 // abe3 abe2 abe1 abe0 // abf3 abf2 abf1 abf0 ) GROUP_YMM_BY_4 // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab01 ( ab02 ( ab03 // ab10 ab11 ab12 ab13 // ab20 ab21 ab22 ab23 // ab30 ab31 ab32 ab33 // ab42 ab43 ab40 ab41 // ab52 ab53 ab50 ab51 // ab62 ab63 ab60 ab61 // ab72 ) ab73 ) ab70 ) ab71 ) // ymm14: ymm12: ymm10: ymm8: // ( ab80 ( ab81 ( ab82 ( ab83 // ab90 ab91 ab92 ab93 // aba0 aba1 aba2 aba3 // abb0 abb1 abb2 abb3 // abc2 abc3 abc0 abc1 // abd2 abd3 abd0 abd1 // abe2 abe3 abe0 abe1 // abf2 ) abf3 ) abf0 ) abf1 ) // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab01 ( ab02 ( ab03 // ab10 ab11 ab12 ab13 // ab20 ab21 ab22 ab23 // ab30 ab31 ab32 ab33 // ab40 ab41 ab42 ab43 // ab50 ab51 ab52 ab53 // 
ab60 ab61 ab62 ab63 // ab70 ) ab71 ) ab72 ) ab73 ) // ymm14: ymm12: ymm10: ymm8: // ( ab80 ( ab81 ( ab82 ( ab83 // ab90 ab91 ab92 ab93 // aba0 aba1 aba2 aba3 // abb0 abb1 abb2 abb3 // abc0 abc1 abc2 abc3 // abd0 abd1 abd2 abd3 // abe0 abe1 abe2 abe3 // abf0 ) abf1 ) abf2 ) abf3 ) // scale by alpha mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), ymm7) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), ymm6) // load alpha_i and duplicate vpermilps(imm(0xb1), ymm15, ymm3) vmulps(ymm7, ymm15, ymm15) vmulps(ymm6, ymm3, ymm3) vaddsubps(ymm3, ymm15, ymm15) vpermilps(imm(0xb1), ymm14, ymm2) vmulps(ymm7, ymm14, ymm14) vmulps(ymm6, ymm2, ymm2) vaddsubps(ymm2, ymm14, ymm14) vpermilps(imm(0xb1), ymm13, ymm1) vmulps(ymm7, ymm13, ymm13) vmulps(ymm6, ymm1, ymm1) vaddsubps(ymm1, ymm13, ymm13) vpermilps(imm(0xb1), ymm12, ymm0) vmulps(ymm7, ymm12, ymm12) vmulps(ymm6, ymm0, ymm0) vaddsubps(ymm0, ymm12, ymm12) vpermilps(imm(0xb1), ymm11, ymm3) vmulps(ymm7, ymm11, ymm11) vmulps(ymm6, ymm3, ymm3) vaddsubps(ymm3, ymm11, ymm11) vpermilps(imm(0xb1), ymm10, ymm2) vmulps(ymm7, ymm10, ymm10) vmulps(ymm6, ymm2, ymm2) vaddsubps(ymm2, ymm10, ymm10) vpermilps(imm(0xb1), ymm9, ymm1) vmulps(ymm7, ymm9, ymm9) vmulps(ymm6, ymm1, ymm1) vaddsubps(ymm1, ymm9, ymm9) vpermilps(imm(0xb1), ymm8, ymm0) vmulps(ymm7, ymm8, ymm8) vmulps(ymm6, ymm0, ymm0) vaddsubps(ymm0, ymm8, ymm8) mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm7) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm6) // load beta_i and duplicate // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm7) // set ZF if beta_r == 0. sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); vucomiss(xmm0, xmm6) // set ZF if beta_i == 0. sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // set ZF if r8b & r9b == 1. 
jne(.CBETAZERO) // if ZF = 0, jump to beta == 0 case // update c00:c70 vmovaps(mem(rcx), ymm0) // load c00:c70 into ymm0 vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta vmulps(ymm7, ymm0, ymm0) vmulps(ymm6, ymm2, ymm2) vaddsubps(ymm2, ymm0, ymm0) vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0 vmovaps(ymm0, mem(rcx)) // store c00:c70 // update c80:cf0 vmovaps(mem(rcx,32), ymm0) // load c80:f0 into ymm0 vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta vmulps(ymm7, ymm0, ymm0) vmulps(ymm6, ymm2, ymm2) vaddsubps(ymm2, ymm0, ymm0) vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0 vmovaps(ymm0, mem(rcx,32)) // store c80:cf0 add(rdi, rcx) // c += cs_c; // update c00:c70 vmovaps(mem(rcx), ymm0) // load c01:c71 into ymm0 vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta vmulps(ymm7, ymm0, ymm0) vmulps(ymm6, ymm2, ymm2) vaddsubps(ymm2, ymm0, ymm0) vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0 vmovaps(ymm0, mem(rcx)) // store c01:c71 // update c81:cf1 vmovaps(mem(rcx,32), ymm0) // load c81:f1 into ymm0 vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta vmulps(ymm7, ymm0, ymm0) vmulps(ymm6, ymm2, ymm2) vaddsubps(ymm2, ymm0, ymm0) vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0 vmovaps(ymm0, mem(rcx,32)) // store c81:cf1 add(rdi, rcx) // c += cs_c; // update c02:c72 vmovaps(mem(rcx), ymm0) // load c02:c72 into ymm0 vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta vmulps(ymm7, ymm0, ymm0) vmulps(ymm6, ymm2, ymm2) vaddsubps(ymm2, ymm0, ymm0) vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0 vmovaps(ymm0, mem(rcx)) // store c02:c72 // update c82:cf2 vmovaps(mem(rcx,32), ymm0) // load c82:f2 into ymm0 vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta vmulps(ymm7, ymm0, ymm0) vmulps(ymm6, ymm2, ymm2) vaddsubps(ymm2, ymm0, ymm0) vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0 vmovaps(ymm0, mem(rcx,32)) // store c82:cf2 add(rdi, rcx) // c += cs_c; // update c03:c73 vmovaps(mem(rcx), ymm0) // load 
c03:c73 into ymm0 vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta vmulps(ymm7, ymm0, ymm0) vmulps(ymm6, ymm2, ymm2) vaddsubps(ymm2, ymm0, ymm0) vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0 vmovaps(ymm0, mem(rcx)) // store c03:c73 // update c83:cf3 vmovaps(mem(rcx,32), ymm0) // load c83:f3 into ymm0 vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta vmulps(ymm7, ymm0, ymm0) vmulps(ymm6, ymm2, ymm2) vaddsubps(ymm2, ymm0, ymm0) vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0 vmovaps(ymm0, mem(rcx,32)) // store c83:cf3 //add(rdi, rcx) // c += cs_c; jmp(.CDONE) // jump to end. label(.CBETAZERO) vmovaps(ymm15, mem(rcx)) // store c00:c70 vmovaps(ymm14, mem(rcx,32)) // store c80:cf0 add(rdi, rcx) // c += cs_c; vmovaps(ymm13, mem(rcx)) // store c01:c71 vmovaps(ymm12, mem(rcx,32)) // store c81:cf1 add(rdi, rcx) // c += cs_c; vmovaps(ymm11, mem(rcx)) // store c02:c72 vmovaps(ymm10, mem(rcx,32)) // store c82:cf2 add(rdi, rcx) // c += cs_c; vmovaps(ymm9, mem(rcx)) // store c03:c73 vmovaps(ymm8, mem(rcx,32)) // store c83:cf3 add(rdi, rcx) // c += cs_c; label(.CDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a] "m" (a), // 2 [b] "m" (b), // 3 [alpha] "m" (alpha), // 4 [beta] "m" (beta), // 5 [c] "m" (c), // 6 [rs_c] "m" (rs_c), // 7 [cs_c] "m" (cs_c), // 8 [b_next] "m" (b_next)/*, // 9 [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) GEMM_UKR_FLUSH_CT( c ); } #define MADDSUBPD_TO_YMM \ vfmaddpd(ymm13, ymm0, ymm4, ymm13)\ vfmaddpd(ymm9, ymm0, ymm5, ymm9)\ vpermilpd(imm(0x5), ymm0, ymm0)\ \ vfmaddpd(ymm12, ymm1, ymm4, ymm12)\ vperm2f128(imm(0x3), ymm2, ymm2, ymm4)\ vfmaddpd(ymm8, ymm1, ymm5, ymm8)\ vperm2f128(imm(0x3), ymm3, ymm3, ymm5)\ 
\ vpermilpd(imm(0x5), ymm1, ymm1)\ vmulpd(ymm0, ymm2, ymm6)\ vmulpd(ymm0, ymm3, ymm7)\ vaddsubpd(ymm6, ymm15, ymm15)\ vaddsubpd(ymm7, ymm11, ymm11)\ \ #define Z_ALPHA(i, j) \ vpermilpd(imm(0x5), ymm(i), ymm(j))\ vmulpd(ymm7, ymm(i), ymm(i))\ vmulpd(ymm6, ymm(j), ymm(j))\ vaddsubpd(ymm(j), ymm(i), ymm(i))\ void bli_zgemm_bulldozer_asm_4x4_fma4 ( dim_t m, dim_t n, dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k / 4; uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT_ALIGNED( z, 4, 4, false, 32 ); begin_asm() mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(var(b_next), r15) // load address of b_next. //mov(var(a_next), r14) // load address of a_next. vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading vmovddup(mem(rbx, 0+0*32), ymm2) vmovddup(mem(rbx, 0+1*32), ymm3) mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex) lea(mem(, rdi, 2), rdi) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.ZCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.ZLOOPKITER) // MAIN LOOP // iteration 0 vmovapd(mem(rax, 1*32), ymm1) vfmaddpd(ymm15, ymm0, ymm2, ymm15) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vfmaddpd(ymm11, ymm0, ymm3, ymm11) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) prefetch(0, mem(rax, 16*32)) vfmaddpd(ymm14, ymm1, ymm2, ymm14) vmovddup(mem(rbx, 8+0*32), ymm2) vfmaddpd(ymm10, ymm1, ymm3, ymm10) vmovddup(mem(rbx, 8+1*32), ymm3) MADDSUBPD_TO_YMM vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 0+2*32), ymm2) vmulpd(ymm1, ymm3, ymm7) vmovddup(mem(rbx, 0+3*32), ymm3) vaddsubpd(ymm6, ymm14, ymm14) vaddsubpd(ymm7, ymm10, ymm10) vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 2*32), ymm0) vaddsubpd(ymm6, ymm13, ymm13) vaddsubpd(ymm7, ymm9, ymm9) vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddsubpd(ymm6, ymm12, ymm12) vaddsubpd(ymm7, ymm8, ymm8) // iteration 1 vmovapd(mem(rax, 3*32), ymm1) vfmaddpd(ymm15, ymm0, ymm2, ymm15) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vfmaddpd(ymm11, ymm0, ymm3, ymm11) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) prefetch(0, mem(rax, 18*32)) vfmaddpd(ymm14, ymm1, ymm2, ymm14) vmovddup(mem(rbx, 8+2*32), ymm2) vfmaddpd(ymm10, ymm1, ymm3, ymm10) vmovddup(mem(rbx, 8+3*32), ymm3) MADDSUBPD_TO_YMM vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 0+4*32), ymm2) vmulpd(ymm1, ymm3, ymm7) vmovddup(mem(rbx, 0+5*32), ymm3) vaddsubpd(ymm6, ymm14, ymm14) vaddsubpd(ymm7, ymm10, ymm10) vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 4*32), ymm0) vaddsubpd(ymm6, ymm13, ymm13) vaddsubpd(ymm7, ymm9, ymm9) vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddsubpd(ymm6, ymm12, ymm12) vaddsubpd(ymm7, ymm8, ymm8) // iteration 2 vmovapd(mem(rax, 5*32), ymm1) vfmaddpd(ymm15, ymm0, ymm2, ymm15) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vfmaddpd(ymm11, ymm0, ymm3, ymm11) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) prefetch(0, mem(rax, 20*32)) vfmaddpd(ymm14, ymm1, ymm2, ymm14) vmovddup(mem(rbx, 8+4*32), ymm2) 
vfmaddpd(ymm10, ymm1, ymm3, ymm10) vmovddup(mem(rbx, 8+5*32), ymm3) MADDSUBPD_TO_YMM vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 0+6*32), ymm2) vmulpd(ymm1, ymm3, ymm7) vmovddup(mem(rbx, 0+7*32), ymm3) vaddsubpd(ymm6, ymm14, ymm14) vaddsubpd(ymm7, ymm10, ymm10) vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 6*32), ymm0) vaddsubpd(ymm6, ymm13, ymm13) vaddsubpd(ymm7, ymm9, ymm9) vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddsubpd(ymm6, ymm12, ymm12) vaddsubpd(ymm7, ymm8, ymm8) // iteration 3 vmovapd(mem(rax, 7*32), ymm1) vfmaddpd(ymm15, ymm0, ymm2, ymm15) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vfmaddpd(ymm11, ymm0, ymm3, ymm11) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) prefetch(0, mem(rax, 22*32)) vfmaddpd(ymm14, ymm1, ymm2, ymm14) vmovddup(mem(rbx, 8+6*32), ymm2) vfmaddpd(ymm10, ymm1, ymm3, ymm10) vmovddup(mem(rbx, 8+7*32), ymm3) MADDSUBPD_TO_YMM vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 0+8*32), ymm2) vmulpd(ymm1, ymm3, ymm7) vmovddup(mem(rbx, 0+9*32), ymm3) vaddsubpd(ymm6, ymm14, ymm14) vaddsubpd(ymm7, ymm10, ymm10) vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 8*32), ymm0) vaddsubpd(ymm6, ymm13, ymm13) vaddsubpd(ymm7, ymm9, ymm9) vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddsubpd(ymm6, ymm12, ymm12) vaddsubpd(ymm7, ymm8, ymm8) add(imm(4*4*16), rbx) // b += 4*4 (unroll x nr) add(imm(4*4*16), rax) // a += 4*4 (unroll x mr) dec(rsi) // i -= 1; jne(.ZLOOPKITER) // iterate again if i != 0. label(.ZCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.ZLOOPKLEFT) // EDGE LOOP // iteration 0 vmovapd(mem(rax, 1*32), ymm1) vfmaddpd(ymm15, ymm0, ymm2, ymm15) vperm2f128(imm(0x3), ymm2, ymm2, ymm4) vfmaddpd(ymm11, ymm0, ymm3, ymm11) vperm2f128(imm(0x3), ymm3, ymm3, ymm5) prefetch(0, mem(rax, 16*32)) vfmaddpd(ymm14, ymm1, ymm2, ymm14) vmovddup(mem(rbx, 8+0*32), ymm2) vfmaddpd(ymm10, ymm1, ymm3, ymm10) vmovddup(mem(rbx, 8+1*32), ymm3) MADDSUBPD_TO_YMM vmulpd(ymm1, ymm2, ymm6) vmovddup(mem(rbx, 0+2*32), ymm2) vmulpd(ymm1, ymm3, ymm7) vmovddup(mem(rbx, 0+3*32), ymm3) vaddsubpd(ymm6, ymm14, ymm14) vaddsubpd(ymm7, ymm10, ymm10) vmulpd(ymm0, ymm4, ymm6) vmulpd(ymm0, ymm5, ymm7) vmovapd(mem(rax, 2*32), ymm0) vaddsubpd(ymm6, ymm13, ymm13) vaddsubpd(ymm7, ymm9, ymm9) vmulpd(ymm1, ymm4, ymm6) vmulpd(ymm1, ymm5, ymm7) vaddsubpd(ymm6, ymm12, ymm12) vaddsubpd(ymm7, ymm8, ymm8) add(imm(4*1*16), rax) // a += 4 (1 x mr) add(imm(4*1*16), rbx) // b += 4 (1 x nr) dec(rsi) // i -= 1; jne(.ZLOOPKLEFT) // iterate again if i != 0. label(.ZPOSTACCUM) // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab01 ( ab02 ( ab03 // ab10 ab11 ab12 ab13 // ab21 ab20 ab23 ab22 // ab31 ) ab30 ) ab33 ) ab32 ) // ymm14: ymm12: ymm10: ymm8: // ( ab40 ( ab41 ( ab42 ( ab43 // ab50 ab51 ab52 ab53 // ab61 ab60 ab63 ab62 // ab71 ) ab70 ) ab73 ) ab72 ) vmovapd(ymm15, ymm7) vperm2f128(imm(0x12), ymm15, ymm13, ymm15) vperm2f128(imm(0x30), ymm7, ymm13, ymm13) vmovapd(ymm11, ymm7) vperm2f128(imm(0x12), ymm11, ymm9, ymm11) vperm2f128(imm(0x30), ymm7, ymm9, ymm9) vmovapd(ymm14, ymm7) vperm2f128(imm(0x12), ymm14, ymm12, ymm14) vperm2f128(imm(0x30), ymm7, ymm12, ymm12) vmovapd(ymm10, ymm7) vperm2f128(imm(0x12), ymm10, ymm8, ymm10) vperm2f128(imm(0x30), ymm7, ymm8, ymm8) // ymm15: ymm13: ymm11: ymm9: // ( ab00 ( ab01 ( ab02 ( ab03 // ab10 ab11 ab12 ab13 // ab20 ab21 ab22 ab23 // ab30 ) ab31 ) ab32 ) ab33 ) // ymm14: ymm12: ymm10: ymm8: // ( ab40 ( ab41 ( ab42 ( ab43 // ab50 ab51 ab52 ab53 // ab60 ab61 ab62 ab63 // ab70 ) ab71 ) ab72 ) ab73 ) // scale by alpha mov(var(alpha), 
rax) // load address of alpha vbroadcastsd(mem(rax), ymm7) // load alpha_r and duplicate vbroadcastsd(mem(rax, 8), ymm6) // load alpha_i and duplicate Z_ALPHA(15, 3) Z_ALPHA(14, 2) Z_ALPHA(13, 1) Z_ALPHA(12, 0) Z_ALPHA(11, 3) Z_ALPHA(10, 2) Z_ALPHA(9, 1) Z_ALPHA(8, 0) mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm7) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm6) // load beta_i and duplicate // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm7) // set ZF if beta_r == 0. sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); vucomisd(xmm0, xmm6) // set ZF if beta_i == 0. sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // set ZF if r8b & r9b == 1. jne(.ZBETAZERO) // if ZF = 0, jump to beta == 0 case // update c00:c30 vmovapd(mem(rcx), ymm0) // load c00:c30 into ymm0 Z_ALPHA(0, 2) // scale ymm0 by beta vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0 vmovapd(ymm0, mem(rcx)) // store c00:c30 // update c40:c70 vmovapd(mem(rcx,32), ymm0) // load c40:c70 into ymm0 Z_ALPHA(0, 2) // scale ymm0 by beta vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0 vmovapd(ymm0, mem(rcx,32)) // store c40:c70 add(rdi, rcx) // c += cs_c; // update c01:c31 vmovapd(mem(rcx), ymm0) // load c01:c31 into ymm0 Z_ALPHA(0, 2) // scale ymm0 by beta vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0 vmovapd(ymm0, mem(rcx)) // store c01:c31 // update c41:c71 vmovapd(mem(rcx,32), ymm0) // load c41:c71 into ymm0 Z_ALPHA(0, 2) // scale ymm0 by beta vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0 vmovapd(ymm0, mem(rcx,32)) // store c41:c71 add(rdi, rcx) // c += cs_c; // update c02:c32 vmovapd(mem(rcx), ymm0) // load c02:c32 into ymm0 Z_ALPHA(0, 2) // scale ymm0 by beta vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0 vmovapd(ymm0, mem(rcx)) // store c02:c32 // update c42:c72 vmovapd(mem(rcx,32), ymm0) // load c42:c72 into ymm0 Z_ALPHA(0, 2) // scale ymm0 by beta vaddpd(ymm10, ymm0, ymm0) // 
add the gemm result to ymm0 vmovapd(ymm0, mem(rcx,32)) // store c42:c72 add(rdi, rcx) // c += cs_c; // update c03:c33 vmovapd(mem(rcx), ymm0) // load c03:c33 into ymm0 Z_ALPHA(0, 2) // scale ymm0 by beta vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0 vmovapd(ymm0, mem(rcx)) // store c03:c33 // update c43:c73 vmovapd(mem(rcx,32), ymm0) // load c43:c73 into ymm0 Z_ALPHA(0, 2) // scale ymm0 by beta vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0 vmovapd(ymm0, mem(rcx,32)) // store c43:c73 add(rdi, rcx) // c += cs_c; jmp(.ZDONE) // jump to end. label(.ZBETAZERO) vmovapd(ymm15, mem(rcx)) // store c00:c30 vmovapd(ymm14, mem(rcx,32)) // store c40:c70 add(rdi, rcx) // c += cs_c; vmovapd(ymm13, mem(rcx)) // store c01:c31 vmovapd(ymm12, mem(rcx,32)) // store c41:c71 add(rdi, rcx) // c += cs_c; vmovapd(ymm11, mem(rcx)) // store c02:c32 vmovapd(ymm10, mem(rcx,32)) // store c42:c72 add(rdi, rcx) // c += cs_c; vmovapd(ymm9, mem(rcx)) // store c03:c33 vmovapd(ymm8, mem(rcx,32)) // store c43:c73 //add(rdi, rcx) // c += cs_c; label(.ZDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a] "m" (a), // 2 [b] "m" (b), // 3 [alpha] "m" (alpha), // 4 [beta] "m" (beta), // 5 [c] "m" (c), // 6 [rs_c] "m" (rs_c), // 7 [cs_c] "m" (cs_c)/*, // 8 [b_next] "m" (b_next), // 9 [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", "memory" ) GEMM_UKR_FLUSH_CT( z ); } cython-blis-0.9.1/blis/_src/kernels/bulldozer/bli_kernels_bulldozer.h000066400000000000000000000035621427272030600260350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ GEMM_UKR_PROT( float, s, gemm_bulldozer_asm_8x8_fma4 ) GEMM_UKR_PROT( double, d, gemm_bulldozer_asm_4x6_fma4 ) GEMM_UKR_PROT( scomplex, c, gemm_bulldozer_asm_8x4_fma4 ) GEMM_UKR_PROT( dcomplex, z, gemm_bulldozer_asm_4x4_fma4 ) cython-blis-0.9.1/blis/_src/kernels/generic/000077500000000000000000000000001427272030600207155ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/generic/generic.txt000066400000000000000000000012441427272030600230730ustar00rootroot00000000000000 generic.txt ----------- This file in 'kernels/generic' exists only to force 'git' to track what would otherwise be an empty directory. Having this empty directory is necessary because the 'generic' singleton family is defined in the configuration registry as: generic: generic which implies that the 'generic' sub-configuration depends on the 'generic' kernel set (because there were no complementary kernel sets specified via '/'). Thus, we need there to be a kernel set named 'generic', but we don't actually refer to any such kernels in BLIS. In other words, this file is simply a workaround to a quirk in the syntax and semantics of the config_registry file. -FGVZ cython-blis-0.9.1/blis/_src/kernels/haswell/000077500000000000000000000000001427272030600207405ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/haswell/1m/000077500000000000000000000000001427272030600212555ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c000066400000000000000000000274651427272030600266670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" // Prototype reference packm kernels. 
PACKM_KER_PROT( scomplex, c, packm_3xk_haswell_ref ) void bli_cpackm_haswell_asm_3xk ( conj_t conja, pack_t schema, dim_t cdim0, dim_t k0, dim_t k0_max, scomplex* restrict kappa, scomplex* restrict a, inc_t inca0, inc_t lda0, scomplex* restrict p, inc_t ldp0, cntx_t* restrict cntx ) { #if 0 bli_cpackm_3xk_haswell_ref ( conja, schema, cdim0, k0, k0_max, kappa, a, inca0, lda0, p, ldp0, cntx ); return; #endif // This is the panel dimension assumed by the packm kernel. const dim_t mnr = 3; // This is the "packing" dimension assumed by the packm kernel. // This should be equal to ldp. //const dim_t packmnr = 6; // Define a local copy of 1.0 so we can test for unit kappa. float one_l = 1.0; float* restrict one = &one_l; // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. const uint64_t k_iter = k0 / 4; #if 1 const uint64_t k_left = k0 % 4; #else const uint64_t k_left = k0; #endif // NOTE: For the purposes of the comments in this packm kernel, we // interpret inca and lda as rs_a and cs_a, respectively, and similarly // interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading // this packm kernel, you should think of the operation as packing an // m x n micropanel, where m and n are tiny and large, respectively, and // where elements of each column of the packed matrix P are contiguous. // (This packm kernel can still be used to pack micropanels of matrix B // in a gemm operation.) const uint64_t inca = inca0; const uint64_t lda = lda0; const uint64_t ldp = ldp0; const bool gs = ( inca0 != 1 && lda0 != 1 ); // NOTE: If/when this kernel ever supports scaling by kappa within the // assembly region, this constraint should be lifted. const bool unitk = bli_ceq1( *kappa ); // ------------------------------------------------------------------------- if ( cdim0 == mnr && !gs && !conja && unitk ) { begin_asm() mov(var(a), rax) // load address of a. 
mov(var(inca), r8) // load inca mov(var(lda), r10) // load lda lea(mem(, r8, 8), r8) // inca *= sizeof(scomplex) lea(mem(, r10, 8), r10) // lda *= sizeof(scomplex) mov(var(p), rbx) // load address of p. lea(mem( , r10, 4), r14) // r14 = 4*lda mov(var(one), rdx) // load address of 1.0 constant vbroadcastss(mem(rdx, 0), ymm1) // load 1.0 and duplicate vxorps(ymm0, ymm0, ymm0) // set ymm0 to 0.0. mov(var(kappa), rcx) // load address of kappa vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate // now branch on kappa == 1.0 vucomiss(xmm1, xmm10) // set ZF if kappa_r == 1.0. sete(r12b) // r12b = ( ZF == 1 ? 1 : 0 ); vucomiss(xmm0, xmm11) // set ZF if kappa_i == 0.0. sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 ); and(r12b, r13b) // set ZF if r12b & r13b == 1. jne(.CKAPPAUNIT) // if ZF = 1, jump to beta == 0 case label(.CKAPPANONU) cmp(imm(8), r8) // set ZF if (8*inca) == 8. jz(.CCOLNONU) // jump to column storage case // -- kappa non-unit, row storage on A ------------------------------------- label(.CROWNONU) jmp(.CDONE) // jump to end. // -- kappa non-unit, column storage on A ---------------------------------- label(.CCOLNONU) jmp(.CDONE) // jump to end. label(.CKAPPAUNIT) cmp(imm(8), r8) // set ZF if (8*inca) == 8. jz(.CCOLUNIT) // jump to column storage case // -- kappa unit, row storage on A ----------------------------------------- label(.CROWUNIT) //lea(mem(r8, r8, 2), r12) // r12 = 3*inca //lea(mem(r12, r8, 2), rcx) // rcx = 5*inca //lea(mem(r12, r8, 4), rdx) // rdx = 7*inca mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONKLEFTROWU) // if i == 0, jump to code that // contains the k_left loop. 
label(.CKITERROWU) // MAIN LOOP (k_iter) vmovupd(mem(rax, 0), ymm0) vmovupd(mem(rax, r8, 1, 0), ymm2) vmovupd(mem(rax, r8, 2, 0), ymm4) add(r14, rax) // a += 4*lda; vunpcklpd(ymm2, ymm0, ymm10) vunpckhpd(ymm2, ymm0, ymm11) vunpcklpd(ymm6, ymm4, ymm12) vunpckhpd(ymm6, ymm4, ymm13) vinsertf128(imm(0x1), xmm12, ymm10, ymm0) vinsertf128(imm(0x1), xmm13, ymm11, ymm2) vperm2f128(imm(0x31), ymm12, ymm10, ymm4) vperm2f128(imm(0x31), ymm13, ymm11, ymm6) vextractf128(imm(0x1), ymm0, xmm1) vextractf128(imm(0x1), ymm2, xmm3) vextractf128(imm(0x1), ymm4, xmm5) vextractf128(imm(0x1), ymm6, xmm7) vmovupd(xmm0, mem(rbx, 0*24)) vmovupd(xmm2, mem(rbx, 1*24)) vmovupd(xmm4, mem(rbx, 2*24)) vmovupd(xmm6, mem(rbx, 3*24)) vmovsd(xmm1, mem(rbx, 0*24+16)) vmovsd(xmm3, mem(rbx, 1*24+16)) vmovsd(xmm5, mem(rbx, 2*24+16)) vmovsd(xmm7, mem(rbx, 3*24+16)) add(imm(4*3*8), rbx) // p += 4*ldp = 4*3; dec(rsi) // i -= 1; jne(.CKITERROWU) // iterate again if i != 0. label(.CCONKLEFTROWU) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CDONE) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.CKLEFTROWU) // EDGE LOOP (k_left) vmovsd(mem(rax, 0), xmm0) vmovsd(mem(rax, r8, 1, 0), xmm2) vmovsd(mem(rax, r8, 2, 0), xmm4) add(r10, rax) // a += lda; vmovsd(xmm0, mem(rbx, 0*8)) vmovsd(xmm2, mem(rbx, 1*8)) vmovsd(xmm4, mem(rbx, 2*8)) add(imm(3*8), rbx) // p += ldp = 3; dec(rsi) // i -= 1; jne(.CKLEFTROWU) // iterate again if i != 0. jmp(.CDONE) // jump to end. // -- kappa unit, column storage on A -------------------------------------- label(.CCOLUNIT) lea(mem(r10, r10, 2), r13) // r13 = 3*lda mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONKLEFTCOLU) // if i == 0, jump to code that // contains the k_left loop. 
label(.CKITERCOLU) // MAIN LOOP (k_iter) vmovupd(mem(rax, 0), xmm0) vmovsd( mem(rax, 16), xmm1) vmovupd(xmm0, mem(rbx, 0*24+ 0)) vmovsd( xmm1, mem(rbx, 0*24+16)) vmovupd(mem(rax, r10, 1, 0), xmm2) vmovsd( mem(rax, r10, 1, 16), xmm3) vmovupd(xmm2, mem(rbx, 1*24+ 0)) vmovsd( xmm3, mem(rbx, 1*24+16)) vmovupd(mem(rax, r10, 2, 0), xmm4) vmovsd( mem(rax, r10, 2, 16), xmm5) vmovupd(xmm4, mem(rbx, 2*24+ 0)) vmovsd( xmm5, mem(rbx, 2*24+16)) vmovupd(mem(rax, r13, 1, 0), xmm6) vmovsd( mem(rax, r13, 1, 16), xmm7) add(r14, rax) // a += 4*lda; vmovupd(xmm6, mem(rbx, 3*24+ 0)) vmovsd( xmm7, mem(rbx, 3*24+16)) add(imm(4*3*8), rbx) // p += 4*ldp = 4*3; dec(rsi) // i -= 1; jne(.CKITERCOLU) // iterate again if i != 0. label(.CCONKLEFTCOLU) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CDONE) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.CKLEFTCOLU) // EDGE LOOP (k_left) vmovupd(mem(rax, 0), xmm0) vmovsd( mem(rax, 16), xmm1) add(r10, rax) // a += lda; vmovupd(xmm0, mem(rbx, 0*24+ 0)) vmovsd( xmm1, mem(rbx, 0*24+16)) add(imm(3*8), rbx) // p += ldp = 3; dec(rsi) // i -= 1; jne(.CKLEFTCOLU) // iterate again if i != 0. //jmp(.CDONE) // jump to end. 
label(.CDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [inca] "m" (inca), [lda] "m" (lda), [p] "m" (p), [ldp] "m" (ldp), [kappa] "m" (kappa), [one] "m" (one) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } else // if ( cdim0 < mnr || gs || bli_does_conj( conja ) || !unitk ) { PASTEMAC(cscal2m,BLIS_TAPI_EX_SUF) ( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, ( trans_t )conja, cdim0, k0, kappa, a, inca0, lda0, p, 1, ldp0, cntx, NULL ); if ( cdim0 < mnr ) { // Handle zero-filling along the "long" edge of the micropanel. const dim_t i = cdim0; const dim_t m_edge = mnr - cdim0; const dim_t n_edge = k0_max; scomplex* restrict p_edge = p + (i )*1; bli_cset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } //bli_dfprintm( stdout, "packm 6xk ker: a_packed", cdim0, k0_max, p, 1, ldp0, "%5.2f", "" ); if ( k0 < k0_max ) { // Handle zero-filling along the "short" (far) edge of the micropanel. const dim_t j = k0; const dim_t m_edge = mnr; const dim_t n_edge = k0_max - k0; scomplex* restrict p_edge = p + (j )*ldp; bli_cset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } cython-blis-0.9.1/blis/_src/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c000066400000000000000000000305611427272030600266630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" // Prototype reference packm kernels. PACKM_KER_PROT( scomplex, c, packm_8xk_haswell_ref ) void bli_cpackm_haswell_asm_8xk ( conj_t conja, pack_t schema, dim_t cdim0, dim_t k0, dim_t k0_max, scomplex* restrict kappa, scomplex* restrict a, inc_t inca0, inc_t lda0, scomplex* restrict p, inc_t ldp0, cntx_t* restrict cntx ) { #if 0 bli_cpackm_8xk_haswell_ref ( conja, schema, cdim0, k0, k0_max, kappa, a, inca0, lda0, p, ldp0, cntx ); return; #endif // This is the panel dimension assumed by the packm kernel. const dim_t mnr = 8; // This is the "packing" dimension assumed by the packm kernel. // This should be equal to ldp. //const dim_t packmnr = 8; // Define a local copy of 1.0 so we can test for unit kappa. 
float one_l = 1.0; float* restrict one = &one_l; // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. const uint64_t k_iter = k0 / 4; #if 1 const uint64_t k_left = k0 % 4; #else const uint64_t k_left = k0; #endif // NOTE: For the purposes of the comments in this packm kernel, we // interpret inca and lda as rs_a and cs_a, respectively, and similarly // interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading // this packm kernel, you should think of the operation as packing an // m x n micropanel, where m and n are tiny and large, respectively, and // where elements of each column of the packed matrix P are contiguous. // (This packm kernel can still be used to pack micropanels of matrix B // in a gemm operation.) const uint64_t inca = inca0; const uint64_t lda = lda0; const uint64_t ldp = ldp0; const bool gs = ( inca0 != 1 && lda0 != 1 ); // NOTE: If/when this kernel ever supports scaling by kappa within the // assembly region, this constraint should be lifted. const bool unitk = bli_ceq1( *kappa ); // ------------------------------------------------------------------------- if ( cdim0 == mnr && !gs && !conja && unitk ) { begin_asm() mov(var(a), rax) // load address of a. mov(var(inca), r8) // load inca mov(var(lda), r10) // load lda lea(mem(, r8, 8), r8) // inca *= sizeof(scomplex) lea(mem(, r10, 8), r10) // lda *= sizeof(scomplex) mov(var(p), rbx) // load address of p. lea(mem( , r10, 4), r14) // r14 = 4*lda mov(var(one), rdx) // load address of 1.0 constant vbroadcastss(mem(rdx, 0), ymm1) // load 1.0 and duplicate vxorps(ymm0, ymm0, ymm0) // set ymm0 to 0.0. mov(var(kappa), rcx) // load address of kappa vbroadcastss(mem(rcx, 0), ymm10) // load kappa_r and duplicate vbroadcastss(mem(rcx, 4), ymm11) // load kappa_i and duplicate // now branch on kappa == 1.0 vucomiss(xmm1, xmm10) // set ZF if kappa_r == 1.0. sete(r12b) // r12b = ( ZF == 1 ? 
1 : 0 ); vucomiss(xmm0, xmm11) // set ZF if kappa_i == 0.0. sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 ); and(r12b, r13b) // set ZF if r12b & r13b == 1. jne(.CKAPPAUNIT) // if ZF = 1, jump to beta == 0 case label(.CKAPPANONU) cmp(imm(8), r8) // set ZF if (8*inca) == 8. jz(.CCOLNONU) // jump to column storage case // -- kappa non-unit, row storage on A ------------------------------------- label(.CROWNONU) jmp(.CDONE) // jump to end. // -- kappa non-unit, column storage on A ---------------------------------- label(.CCOLNONU) jmp(.CDONE) // jump to end. label(.CKAPPAUNIT) cmp(imm(8), r8) // set ZF if (8*inca) == 8. jz(.CCOLUNIT) // jump to column storage case // -- kappa unit, row storage on A ----------------------------------------- label(.CROWUNIT) lea(mem(r8, r8, 2), r12) // r12 = 3*inca lea(mem(r12, r8, 2), rcx) // rcx = 5*inca lea(mem(r12, r8, 4), rdx) // rdx = 7*inca mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONKLEFTROWU) // if i == 0, jump to code that // contains the k_left loop. 
label(.CKITERROWU) // MAIN LOOP (k_iter) vmovupd(mem(rax, 0), ymm0) vmovupd(mem(rax, r8, 1, 0), ymm2) vmovupd(mem(rax, r8, 2, 0), ymm4) vmovupd(mem(rax, r12, 1, 0), ymm6) vunpcklpd(ymm2, ymm0, ymm10) vunpckhpd(ymm2, ymm0, ymm11) vunpcklpd(ymm6, ymm4, ymm12) vunpckhpd(ymm6, ymm4, ymm13) vinsertf128(imm(0x1), xmm12, ymm10, ymm0) vinsertf128(imm(0x1), xmm13, ymm11, ymm2) vperm2f128(imm(0x31), ymm12, ymm10, ymm4) vperm2f128(imm(0x31), ymm13, ymm11, ymm6) vmovupd(ymm0, mem(rbx, 0*64)) vmovupd(ymm2, mem(rbx, 1*64)) vmovupd(ymm4, mem(rbx, 2*64)) vmovupd(ymm6, mem(rbx, 3*64)) vmovupd(mem(rax, r8, 4, 0), ymm1) vmovupd(mem(rax, rcx, 1, 0), ymm3) vmovupd(mem(rax, r12, 2, 0), ymm5) vmovupd(mem(rax, rdx, 1, 0), ymm7) add(r14, rax) // a += 4*lda; vunpcklpd(ymm3, ymm1, ymm10) vunpckhpd(ymm3, ymm1, ymm11) vunpcklpd(ymm7, ymm5, ymm12) vunpckhpd(ymm7, ymm5, ymm13) vinsertf128(imm(0x1), xmm12, ymm10, ymm1) vinsertf128(imm(0x1), xmm13, ymm11, ymm3) vperm2f128(imm(0x31), ymm12, ymm10, ymm5) vperm2f128(imm(0x31), ymm13, ymm11, ymm7) vmovupd(ymm1, mem(rbx, 0*64+32)) vmovupd(ymm3, mem(rbx, 1*64+32)) vmovupd(ymm5, mem(rbx, 2*64+32)) vmovupd(ymm7, mem(rbx, 3*64+32)) add(imm(4*8*8), rbx) // p += 4*ldp = 4*8; dec(rsi) // i -= 1; jne(.CKITERROWU) // iterate again if i != 0. label(.CCONKLEFTROWU) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CDONE) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.CKLEFTROWU) // EDGE LOOP (k_left) vmovsd(mem(rax, 0), xmm0) vmovsd(mem(rax, r8, 1, 0), xmm2) vmovsd(mem(rax, r8, 2, 0), xmm4) vmovsd(mem(rax, r12, 1, 0), xmm6) vmovsd(mem(rax, r8, 4, 0), xmm1) vmovsd(mem(rax, rcx, 1, 0), xmm3) vmovsd(mem(rax, r12, 2, 0), xmm5) vmovsd(mem(rax, rdx, 1, 0), xmm7) add(r10, rax) // a += lda; vmovsd(xmm0, mem(rbx, 0*8)) vmovsd(xmm2, mem(rbx, 1*8)) vmovsd(xmm4, mem(rbx, 2*8)) vmovsd(xmm6, mem(rbx, 3*8)) vmovsd(xmm1, mem(rbx, 4*8)) vmovsd(xmm3, mem(rbx, 5*8)) vmovsd(xmm5, mem(rbx, 6*8)) vmovsd(xmm7, mem(rbx, 7*8)) add(imm(8*8), rbx) // p += ldp = 8; dec(rsi) // i -= 1; jne(.CKLEFTROWU) // iterate again if i != 0. jmp(.CDONE) // jump to end. // -- kappa unit, column storage on A -------------------------------------- label(.CCOLUNIT) lea(mem(r10, r10, 2), r13) // r13 = 3*lda mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONKLEFTCOLU) // if i == 0, jump to code that // contains the k_left loop. label(.CKITERCOLU) // MAIN LOOP (k_iter) vmovupd(mem(rax, 0), ymm0) vmovupd(mem(rax, 32), ymm1) vmovupd(ymm0, mem(rbx, 0*64+ 0)) vmovupd(ymm1, mem(rbx, 0*64+32)) vmovupd(mem(rax, r10, 1, 0), ymm2) vmovupd(mem(rax, r10, 1, 32), ymm3) vmovupd(ymm2, mem(rbx, 1*64+ 0)) vmovupd(ymm3, mem(rbx, 1*64+32)) vmovupd(mem(rax, r10, 2, 0), ymm4) vmovupd(mem(rax, r10, 2, 32), ymm5) vmovupd(ymm4, mem(rbx, 2*64+ 0)) vmovupd(ymm5, mem(rbx, 2*64+32)) vmovupd(mem(rax, r13, 1, 0), ymm6) vmovupd(mem(rax, r13, 1, 32), ymm7) add(r14, rax) // a += 4*lda; vmovupd(ymm6, mem(rbx, 3*64+ 0)) vmovupd(ymm7, mem(rbx, 3*64+32)) add(imm(4*8*8), rbx) // p += 4*ldp = 4*8; dec(rsi) // i -= 1; jne(.CKITERCOLU) // iterate again if i != 0. label(.CCONKLEFTCOLU) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CDONE) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.CKLEFTCOLU) // EDGE LOOP (k_left) vmovupd(mem(rax, 0), ymm0) vmovupd(mem(rax, 32), ymm1) add(r10, rax) // a += lda; vmovupd(ymm0, mem(rbx, 0*64+ 0)) vmovupd(ymm1, mem(rbx, 0*64+32)) add(imm(8*8), rbx) // p += ldp = 8; dec(rsi) // i -= 1; jne(.CKLEFTCOLU) // iterate again if i != 0. //jmp(.CDONE) // jump to end. label(.CDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [inca] "m" (inca), [lda] "m" (lda), [p] "m" (p), [ldp] "m" (ldp), [kappa] "m" (kappa), [one] "m" (one) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } else // if ( cdim0 < mnr || gs || bli_does_conj( conja ) || !unitk ) { PASTEMAC(cscal2m,BLIS_TAPI_EX_SUF) ( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, ( trans_t )conja, cdim0, k0, kappa, a, inca0, lda0, p, 1, ldp0, cntx, NULL ); if ( cdim0 < mnr ) { // Handle zero-filling along the "long" edge of the micropanel. const dim_t i = cdim0; const dim_t m_edge = mnr - cdim0; const dim_t n_edge = k0_max; scomplex* restrict p_edge = p + (i )*1; bli_cset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } if ( k0 < k0_max ) { // Handle zero-filling along the "short" (far) edge of the micropanel. const dim_t j = k0; const dim_t m_edge = mnr; const dim_t n_edge = k0_max - k0; scomplex* restrict p_edge = p + (j )*ldp; bli_cset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } cython-blis-0.9.1/blis/_src/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c000066400000000000000000000272321427272030600266630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" // Prototype reference packm kernels. 
PACKM_KER_PROT( double, d, packm_6xk_haswell_ref ) void bli_dpackm_haswell_asm_6xk ( conj_t conja, pack_t schema, dim_t cdim0, dim_t k0, dim_t k0_max, double* restrict kappa, double* restrict a, inc_t inca0, inc_t lda0, double* restrict p, inc_t ldp0, cntx_t* restrict cntx ) { #if 0 bli_dpackm_6xk_haswell_ref ( conja, schema, cdim0, k0, k0_max, kappa, a, inca0, lda0, p, ldp0, cntx ); return; #endif // This is the panel dimension assumed by the packm kernel. const dim_t mnr = 6; // This is the "packing" dimension assumed by the packm kernel. // This should be equal to ldp. //const dim_t packmnr = 6; // Define a local copy of 1.0 so we can test for unit kappa. double one_l = 1.0; double* restrict one = &one_l; // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. const uint64_t k_iter = k0 / 4; #if 1 const uint64_t k_left = k0 % 4; #else const uint64_t k_left = k0; #endif // NOTE: For the purposes of the comments in this packm kernel, we // interpret inca and lda as rs_a and cs_a, respectively, and similarly // interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading // this packm kernel, you should think of the operation as packing an // m x n micropanel, where m and n are tiny and large, respectively, and // where elements of each column of the packed matrix P are contiguous. // (This packm kernel can still be used to pack micropanels of matrix B // in a gemm operation.) const uint64_t inca = inca0; const uint64_t lda = lda0; const uint64_t ldp = ldp0; const bool gs = ( inca0 != 1 && lda0 != 1 ); // NOTE: If/when this kernel ever supports scaling by kappa within the // assembly region, this constraint should be lifted. const bool unitk = bli_deq1( *kappa ); // ------------------------------------------------------------------------- if ( cdim0 == mnr && !gs && unitk ) { begin_asm() mov(var(a), rax) // load address of a. 
mov(var(inca), r8) // load inca mov(var(lda), r10) // load lda lea(mem(, r8, 8), r8) // inca *= sizeof(double) lea(mem(, r10, 8), r10) // lda *= sizeof(double) mov(var(p), rbx) // load address of p. lea(mem( , r10, 4), r14) // r14 = 4*lda mov(var(one), rdx) // load address of 1.0 constant vmovsd(mem(rdx), xmm1) // load 1.0 mov(var(kappa), rcx) // load address of kappa vmovsd(mem(rcx), xmm0) // load kappa // now branch on kappa == 1.0 vucomisd(xmm0, xmm1) // set ZF if kappa == 1.0 je(.DKAPPAUNIT) // if ZF = 1, jump to beta == 0 case label(.DKAPPANONU) cmp(imm(8), r8) // set ZF if (8*inca) == 8. jz(.DCOLNONU) // jump to column storage case // -- kappa non-unit, row storage on A ------------------------------------- label(.DROWNONU) jmp(.DDONE) // jump to end. // -- kappa non-unit, column storage on A ---------------------------------- label(.DCOLNONU) jmp(.DDONE) // jump to end. label(.DKAPPAUNIT) cmp(imm(8), r8) // set ZF if (8*inca) == 8. jz(.DCOLUNIT) // jump to column storage case // -- kappa unit, row storage on A ----------------------------------------- label(.DROWUNIT) lea(mem(r8, r8, 2), r12) // r12 = 3*inca lea(mem(r12, r8, 2), rcx) // rcx = 5*inca //lea(mem(r12, r8, 4), rdx) // rdx = 7*inca mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONKLEFTROWU) // if i == 0, jump to code that // contains the k_left loop. 
label(.DKITERROWU) // MAIN LOOP (k_iter) vmovupd(mem(rax, 0), ymm0) vmovupd(mem(rax, r8, 1, 0), ymm2) vmovupd(mem(rax, r8, 2, 0), ymm4) vmovupd(mem(rax, r12, 1, 0), ymm6) vunpcklpd(ymm2, ymm0, ymm10) vunpckhpd(ymm2, ymm0, ymm11) vunpcklpd(ymm6, ymm4, ymm12) vunpckhpd(ymm6, ymm4, ymm13) vinsertf128(imm(0x1), xmm12, ymm10, ymm0) vinsertf128(imm(0x1), xmm13, ymm11, ymm2) vperm2f128(imm(0x31), ymm12, ymm10, ymm4) vperm2f128(imm(0x31), ymm13, ymm11, ymm6) vmovupd(ymm0, mem(rbx, 0*48)) vmovupd(ymm2, mem(rbx, 1*48)) vmovupd(ymm4, mem(rbx, 2*48)) vmovupd(ymm6, mem(rbx, 3*48)) vmovupd(mem(rax, r8, 4, 0), ymm1) vmovupd(mem(rax, rcx, 1, 0), ymm3) add(r14, rax) // a += 4*lda; vunpcklpd(ymm3, ymm1, ymm10) vunpckhpd(ymm3, ymm1, ymm11) vextractf128(imm(0x1), ymm10, xmm12) vextractf128(imm(0x1), ymm11, xmm13) vmovupd(xmm10, mem(rbx, 0*48+32)) vmovupd(xmm11, mem(rbx, 1*48+32)) vmovupd(xmm12, mem(rbx, 2*48+32)) vmovupd(xmm13, mem(rbx, 3*48+32)) add(imm(4*6*8), rbx) // p += 4*ldp = 4*6; dec(rsi) // i -= 1; jne(.DKITERROWU) // iterate again if i != 0. label(.DCONKLEFTROWU) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DDONE) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DKLEFTROWU) // EDGE LOOP (k_left) vmovsd(mem(rax, 0), xmm0) vmovsd(mem(rax, r8, 1, 0), xmm2) vmovsd(mem(rax, r8, 2, 0), xmm4) vmovsd(mem(rax, r12, 1, 0), xmm6) vmovsd(mem(rax, r8, 4, 0), xmm1) vmovsd(mem(rax, rcx, 1, 0), xmm3) add(r10, rax) // a += lda; vmovsd(xmm0, mem(rbx, 0*8)) vmovsd(xmm2, mem(rbx, 1*8)) vmovsd(xmm4, mem(rbx, 2*8)) vmovsd(xmm6, mem(rbx, 3*8)) vmovsd(xmm1, mem(rbx, 4*8)) vmovsd(xmm3, mem(rbx, 5*8)) add(imm(6*8), rbx) // p += ldp = 6; dec(rsi) // i -= 1; jne(.DKLEFTROWU) // iterate again if i != 0. jmp(.DDONE) // jump to end. 
// -- kappa unit, column storage on A -------------------------------------- label(.DCOLUNIT) lea(mem(r10, r10, 2), r13) // r13 = 3*lda mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONKLEFTCOLU) // if i == 0, jump to code that // contains the k_left loop. label(.DKITERCOLU) // MAIN LOOP (k_iter) vmovupd(mem(rax, 0), ymm0) vmovupd(mem(rax, 32), xmm1) vmovupd(ymm0, mem(rbx, 0*48+ 0)) vmovupd(xmm1, mem(rbx, 0*48+32)) vmovupd(mem(rax, r10, 1, 0), ymm2) vmovupd(mem(rax, r10, 1, 32), xmm3) vmovupd(ymm2, mem(rbx, 1*48+ 0)) vmovupd(xmm3, mem(rbx, 1*48+32)) vmovupd(mem(rax, r10, 2, 0), ymm4) vmovupd(mem(rax, r10, 2, 32), xmm5) vmovupd(ymm4, mem(rbx, 2*48+ 0)) vmovupd(xmm5, mem(rbx, 2*48+32)) vmovupd(mem(rax, r13, 1, 0), ymm6) vmovupd(mem(rax, r13, 1, 32), xmm7) add(r14, rax) // a += 4*lda; vmovupd(ymm6, mem(rbx, 3*48+ 0)) vmovupd(xmm7, mem(rbx, 3*48+32)) add(imm(4*6*8), rbx) // p += 4*ldp = 4*6; dec(rsi) // i -= 1; jne(.DKITERCOLU) // iterate again if i != 0. label(.DCONKLEFTCOLU) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DDONE) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DKLEFTCOLU) // EDGE LOOP (k_left) vmovupd(mem(rax, 0), ymm0) vmovupd(mem(rax, 32), xmm1) add(r10, rax) // a += lda; vmovupd(ymm0, mem(rbx, 0*48+ 0)) vmovupd(xmm1, mem(rbx, 0*48+32)) add(imm(6*8), rbx) // p += ldp = 6; dec(rsi) // i -= 1; jne(.DKLEFTCOLU) // iterate again if i != 0. //jmp(.DDONE) // jump to end. 
label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [inca] "m" (inca), [lda] "m" (lda), [p] "m" (p), [ldp] "m" (ldp), [kappa] "m" (kappa), [one] "m" (one) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } else // if ( cdim0 < mnr || gs || !unitk ) { PASTEMAC(dscal2m,BLIS_TAPI_EX_SUF) ( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, ( trans_t )conja, cdim0, k0, kappa, a, inca0, lda0, p, 1, ldp0, cntx, NULL ); if ( cdim0 < mnr ) { // Handle zero-filling along the "long" edge of the micropanel. const dim_t i = cdim0; const dim_t m_edge = mnr - cdim0; const dim_t n_edge = k0_max; double* restrict p_edge = p + (i )*1; bli_dset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } //bli_dfprintm( stdout, "packm 6xk ker: a_packed", cdim0, k0_max, p, 1, ldp0, "%5.2f", "" ); if ( k0 < k0_max ) { // Handle zero-filling along the "short" (far) edge of the micropanel. const dim_t j = k0; const dim_t m_edge = mnr; const dim_t n_edge = k0_max - k0; double* restrict p_edge = p + (j )*ldp; bli_dset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } cython-blis-0.9.1/blis/_src/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c000066400000000000000000000276441427272030600266740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"

// Prototype reference packm kernels.
PACKM_KER_PROT( double, d, packm_8xk_haswell_ref )

//
// bli_dpackm_haswell_asm_8xk
//
// Packs a (cdim0 x k0) double-precision micropanel of A into P, where P is
// written with unit row stride and column stride ldp0, and then zero-fills
// P out to (mnr x k0_max).
//
//   conja   - conjugation flag; only forwarded (cast to trans_t) to the
//             dscal2m fallback. The assembly path does not reference it.
//   schema  - pack schema; not referenced by the enabled code.
//   cdim0   - number of rows actually present in the micropanel (<= mnr).
//   k0      - number of columns to pack.
//   k0_max  - number of columns of P that must be defined on return;
//             columns k0..k0_max-1 are set to zero.
//   kappa   - pointer to the scalar applied while packing.
//   a       - source micropanel, with row stride inca0 and column stride
//             lda0 (both in units of elements).
//   p       - destination micropanel.
//   ldp0    - column stride of P, in elements.
//   cntx    - context; only forwarded to the dscal2m fallback.
//
// Two code paths exist:
//   * Assembly path: taken only when the panel is full (cdim0 == mnr), A is
//     not general-stride (inca0 == 1 or lda0 == 1), and *kappa == 1.0.
//     NOTE: the assembly advances p by a hard-coded 8 doubles per column
//     and never reads the ldp operand, so this path relies on ldp0 == 8.
//   * Fallback path: everything else is delegated to the dscal2m
//     expert-interface routine, followed by zero-filling of rows
//     cdim0..mnr-1 when the panel is short.
//
void bli_dpackm_haswell_asm_8xk
     (
       conj_t           conja,
       pack_t           schema,
       dim_t            cdim0,
       dim_t            k0,
       dim_t            k0_max,
       double* restrict kappa,
       double* restrict a, inc_t inca0, inc_t lda0,
       double* restrict p,              inc_t ldp0,
       cntx_t* restrict cntx
     )
{
#if 0
    // Debugging aid: when enabled, bypass the assembly entirely and use the
    // reference kernel.
    bli_dpackm_8xk_haswell_ref
    (
      conja, schema, cdim0, k0, k0_max,
      kappa, a, inca0, lda0, p, ldp0, cntx
    );
    return;
#endif

    // This is the panel dimension assumed by the packm kernel.
    const dim_t      mnr    = 8;

    // This is the "packing" dimension assumed by the packm kernel.
    // This should be equal to ldp.
    //const dim_t    packmnr = 8;

    // Define a local copy of 1.0 so we can test for unit kappa.
    double           one_l  = 1.0;
    double* restrict one    = &one_l;

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    // The main assembly loop handles four columns per iteration; k_left is
    // the number of remaining columns handled one at a time.
    const uint64_t k_iter = k0 / 4;
#if 1
    const uint64_t k_left = k0 % 4;
#else
    const uint64_t k_left = k0;
#endif

    // NOTE: For the purposes of the comments in this packm kernel, we
    // interpret inca and lda as rs_a and cs_a, respectively, and similarly
    // interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
    // this packm kernel, you should think of the operation as packing an
    // m x n micropanel, where m and n are tiny and large, respectively, and
    // where elements of each column of the packed matrix P are contiguous.
    // (This packm kernel can still be used to pack micropanels of matrix B
    // in a gemm operation.)
    const uint64_t inca   = inca0;
    const uint64_t lda    = lda0;
    const uint64_t ldp    = ldp0;

    // General stride: neither the row stride nor the column stride is unit.
    const bool     gs     = ( inca0 != 1 && lda0 != 1 );

    // NOTE: If/when this kernel ever supports scaling by kappa within the
    // assembly region, this constraint should be lifted.
    const bool     unitk  = bli_deq1( *kappa );


    // -------------------------------------------------------------------------

    if ( cdim0 == mnr && !gs && unitk )
    {
    begin_asm()

    mov(var(a), rax)                   // load address of a.

    mov(var(inca), r8)                 // load inca
    mov(var(lda), r10)                 // load lda
    lea(mem(, r8, 8), r8)              // inca *= sizeof(double)
    lea(mem(, r10, 8), r10)            // lda *= sizeof(double)

    mov(var(p), rbx)                   // load address of p.

    lea(mem( , r10, 4), r14)           // r14 = 4*lda

    mov(var(one), rdx)                 // load address of 1.0 constant
    vmovsd(mem(rdx), xmm1)             // load 1.0

    mov(var(kappa), rcx)               // load address of kappa
    vmovsd(mem(rcx), xmm0)             // load kappa


                                       // now branch on kappa == 1.0

    vucomisd(xmm0, xmm1)               // set ZF if kappa == 1.0
    je(.DKAPPAUNIT)                    // if ZF = 1, jump to the unit-kappa case



    // The non-unit kappa branches below are empty stubs that jump straight
    // to the end; the enclosing C condition (unitk) keeps non-unit kappa
    // from ever reaching the assembly region.
    label(.DKAPPANONU)

    cmp(imm(8), r8)                    // set ZF if (8*inca) == 8.
    jz(.DCOLNONU)                      // jump to column storage case

    // -- kappa non-unit, row storage on A -------------------------------------

    label(.DROWNONU)

    jmp(.DDONE)                        // jump to end.


    // -- kappa non-unit, column storage on A ----------------------------------

    label(.DCOLNONU)

    jmp(.DDONE)                        // jump to end.




    label(.DKAPPAUNIT)

    cmp(imm(8), r8)                    // set ZF if (8*inca) == 8.
    jz(.DCOLUNIT)                      // jump to column storage case


    // -- kappa unit, row storage on A -----------------------------------------

    label(.DROWUNIT)

    lea(mem(r8, r8, 2), r12)           // r12 = 3*inca
    lea(mem(r12, r8, 2), rcx)          // rcx = 5*inca
    lea(mem(r12, r8, 4), rdx)          // rdx = 7*inca

    mov(var(k_iter), rsi)              // i = k_iter;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONKLEFTROWU)                 // if i == 0, jump to code that
                                       // contains the k_left loop.


    label(.DKITERROWU)                 // MAIN LOOP (k_iter)

    // Load four consecutive elements from each of rows 0-3 of A.
    vmovupd(mem(rax, 0), ymm0)
    vmovupd(mem(rax, r8, 1, 0), ymm2)
    vmovupd(mem(rax, r8, 2, 0), ymm4)
    vmovupd(mem(rax, r12, 1, 0), ymm6)

    // Transpose the 4x4 block so that each register holds rows 0-3 of one
    // column of the packed panel.
    vunpcklpd(ymm2, ymm0, ymm10)
    vunpckhpd(ymm2, ymm0, ymm11)
    vunpcklpd(ymm6, ymm4, ymm12)
    vunpckhpd(ymm6, ymm4, ymm13)
    vinsertf128(imm(0x1), xmm12, ymm10, ymm0)
    vinsertf128(imm(0x1), xmm13, ymm11, ymm2)
    vperm2f128(imm(0x31), ymm12, ymm10, ymm4)
    vperm2f128(imm(0x31), ymm13, ymm11, ymm6)

    // Store into the first 32 bytes of four consecutive packed columns
    // (each packed column occupies 64 bytes).
    vmovupd(ymm0, mem(rbx, 0*64))
    vmovupd(ymm2, mem(rbx, 1*64))
    vmovupd(ymm4, mem(rbx, 2*64))
    vmovupd(ymm6, mem(rbx, 3*64))

    // Load four consecutive elements from each of rows 4-7 of A
    // (offsets 4*inca, 5*inca, 6*inca, 7*inca).
    vmovupd(mem(rax, r8, 4, 0), ymm1)
    vmovupd(mem(rax, rcx, 1, 0), ymm3)
    vmovupd(mem(rax, r12, 2, 0), ymm5)
    vmovupd(mem(rax, rdx, 1, 0), ymm7)

    add(r14, rax)                      // a += 4*lda;

    // Same 4x4 transpose for rows 4-7.
    vunpcklpd(ymm3, ymm1, ymm10)
    vunpckhpd(ymm3, ymm1, ymm11)
    vunpcklpd(ymm7, ymm5, ymm12)
    vunpckhpd(ymm7, ymm5, ymm13)
    vinsertf128(imm(0x1), xmm12, ymm10, ymm1)
    vinsertf128(imm(0x1), xmm13, ymm11, ymm3)
    vperm2f128(imm(0x31), ymm12, ymm10, ymm5)
    vperm2f128(imm(0x31), ymm13, ymm11, ymm7)

    // Store into the second 32 bytes of the same four packed columns.
    vmovupd(ymm1, mem(rbx, 0*64+32))
    vmovupd(ymm3, mem(rbx, 1*64+32))
    vmovupd(ymm5, mem(rbx, 2*64+32))
    vmovupd(ymm7, mem(rbx, 3*64+32))

    add(imm(4*8*8), rbx)               // p += 4*ldp = 4*8;

    dec(rsi)                           // i -= 1;
    jne(.DKITERROWU)                   // iterate again if i != 0.



    label(.DCONKLEFTROWU)

    mov(var(k_left), rsi)              // i = k_left;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DDONE)                         // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left loop.


    label(.DKLEFTROWU)                 // EDGE LOOP (k_left)

    // Gather one element from each of the eight rows (one column of A).
    vmovsd(mem(rax, 0), xmm0)
    vmovsd(mem(rax, r8, 1, 0), xmm2)
    vmovsd(mem(rax, r8, 2, 0), xmm4)
    vmovsd(mem(rax, r12, 1, 0), xmm6)
    vmovsd(mem(rax, r8, 4, 0), xmm1)
    vmovsd(mem(rax, rcx, 1, 0), xmm3)
    vmovsd(mem(rax, r12, 2, 0), xmm5)
    vmovsd(mem(rax, rdx, 1, 0), xmm7)

    add(r10, rax)                      // a += lda;

    // Write the eight elements contiguously as one packed column.
    vmovsd(xmm0, mem(rbx, 0*8))
    vmovsd(xmm2, mem(rbx, 1*8))
    vmovsd(xmm4, mem(rbx, 2*8))
    vmovsd(xmm6, mem(rbx, 3*8))
    vmovsd(xmm1, mem(rbx, 4*8))
    vmovsd(xmm3, mem(rbx, 5*8))
    vmovsd(xmm5, mem(rbx, 6*8))
    vmovsd(xmm7, mem(rbx, 7*8))

    add(imm(8*8), rbx)                 // p += ldp = 8;

    dec(rsi)                           // i -= 1;
    jne(.DKLEFTROWU)                   // iterate again if i != 0.


    jmp(.DDONE)                        // jump to end.


    // -- kappa unit, column storage on A --------------------------------------

    label(.DCOLUNIT)

    lea(mem(r10, r10, 2), r13)         // r13 = 3*lda

    mov(var(k_iter), rsi)              // i = k_iter;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONKLEFTCOLU)                 // if i == 0, jump to code that
                                       // contains the k_left loop.


    label(.DKITERCOLU)                 // MAIN LOOP (k_iter)

    // Each column of A is already contiguous, so copy 64 bytes (eight
    // doubles) per column for four columns at a time.
    vmovupd(mem(rax, 0), ymm0)
    vmovupd(mem(rax, 32), ymm1)
    vmovupd(ymm0, mem(rbx, 0*64+ 0))
    vmovupd(ymm1, mem(rbx, 0*64+32))

    vmovupd(mem(rax, r10, 1, 0), ymm2)
    vmovupd(mem(rax, r10, 1, 32), ymm3)
    vmovupd(ymm2, mem(rbx, 1*64+ 0))
    vmovupd(ymm3, mem(rbx, 1*64+32))

    vmovupd(mem(rax, r10, 2, 0), ymm4)
    vmovupd(mem(rax, r10, 2, 32), ymm5)
    vmovupd(ymm4, mem(rbx, 2*64+ 0))
    vmovupd(ymm5, mem(rbx, 2*64+32))

    vmovupd(mem(rax, r13, 1, 0), ymm6)
    vmovupd(mem(rax, r13, 1, 32), ymm7)
    add(r14, rax)                      // a += 4*lda;
    vmovupd(ymm6, mem(rbx, 3*64+ 0))
    vmovupd(ymm7, mem(rbx, 3*64+32))

    add(imm(4*8*8), rbx)               // p += 4*ldp = 4*8;

    dec(rsi)                           // i -= 1;
    jne(.DKITERCOLU)                   // iterate again if i != 0.



    label(.DCONKLEFTCOLU)

    mov(var(k_left), rsi)              // i = k_left;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DDONE)                         // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left loop.


    label(.DKLEFTCOLU)                 // EDGE LOOP (k_left)

    vmovupd(mem(rax, 0), ymm0)
    vmovupd(mem(rax, 32), ymm1)
    add(r10, rax)                      // a += lda;
    vmovupd(ymm0, mem(rbx, 0*64+ 0))
    vmovupd(ymm1, mem(rbx, 0*64+32))

    add(imm(8*8), rbx)                 // p += ldp = 8;

    dec(rsi)                           // i -= 1;
    jne(.DKLEFTCOLU)                   // iterate again if i != 0.


    //jmp(.DDONE)                        // jump to end.



    label(.DDONE)



    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a]      "m" (a),
      [inca]   "m" (inca),
      [lda]    "m" (lda),
      [p]      "m" (p),
      [ldp]    "m" (ldp),
      [kappa]  "m" (kappa),
      [one]    "m" (one)
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
    }
    else // if ( cdim0 < mnr || gs || !unitk )
    {
        // Fallback for short panels, general stride, or non-unit kappa:
        // hand the cdim0 x k0 copy (with kappa and conja) to dscal2m,
        // writing P with row stride 1 and column stride ldp0.
        PASTEMAC(dscal2m,BLIS_TAPI_EX_SUF)
        (
          0,
          BLIS_NONUNIT_DIAG,
          BLIS_DENSE,
          ( trans_t )conja,
          cdim0,
          k0,
          kappa,
          a, inca0, lda0,
          p,     1, ldp0,
          cntx,
          NULL
        );

        if ( cdim0 < mnr )
        {
            // Handle zero-filling along the "long" edge of the micropanel.

            const dim_t      i      = cdim0;
            const dim_t      m_edge = mnr - cdim0;
            const dim_t      n_edge = k0_max;
            double* restrict p_edge = p + (i  )*1;

            bli_dset0s_mxn
            (
              m_edge,
              n_edge,
              p_edge, 1, ldp
            );
        }
    }

    if ( k0 < k0_max )
    {
        // Handle zero-filling along the "short" (far) edge of the micropanel.

        const dim_t      j      = k0;
        const dim_t      m_edge = mnr;
        const dim_t      n_edge = k0_max - k0;
        double* restrict p_edge = p + (j  )*ldp;

        bli_dset0s_mxn
        (
          m_edge,
          n_edge,
          p_edge, 1, ldp
        );
    }
}

cython-blis-0.9.1/blis/_src/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c000066400000000000000000000426121427272030600267620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" // Prototype reference packm kernels. 
PACKM_KER_PROT( double, d, packm_16xk_haswell_ref ) void bli_spackm_haswell_asm_16xk ( conj_t conja, pack_t schema, dim_t cdim0, dim_t k0, dim_t k0_max, float* restrict kappa, float* restrict a, inc_t inca0, inc_t lda0, float* restrict p, inc_t ldp0, cntx_t* restrict cntx ) { #if 0 bli_spackm_16xk_haswell_ref ( conja, schema, cdim0, k0, k0_max, kappa, a, inca0, lda0, p, ldp0, cntx ); return; #endif // This is the panel dimension assumed by the packm kernel. const dim_t mnr = 16; // This is the "packing" dimension assumed by the packm kernel. // This should be equal to ldp. //const dim_t packmnr = 8; // Define a local copy of 1.0 so we can test for unit kappa. float one_l = 1.0; float* restrict one = &one_l; // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. const uint64_t k_iter = k0 / 8; #if 1 const uint64_t k_left = k0 % 8; #else const uint64_t k_left = k0; #endif // NOTE: For the purposes of the comments in this packm kernel, we // interpret inca and lda as rs_a and cs_a, respectively, and similarly // interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading // this packm kernel, you should think of the operation as packing an // m x n micropanel, where m and n are tiny and large, respectively, and // where elements of each column of the packed matrix P are contiguous. // (This packm kernel can still be used to pack micropanels of matrix B // in a gemm operation.) const uint64_t inca = inca0; const uint64_t lda = lda0; const uint64_t ldp = ldp0; const bool gs = ( inca0 != 1 && lda0 != 1 ); // NOTE: If/when this kernel ever supports scaling by kappa within the // assembly region, this constraint should be lifted. const bool unitk = bli_seq1( *kappa ); // ------------------------------------------------------------------------- if ( cdim0 == mnr && !gs && unitk ) { begin_asm() mov(var(a), rax) // load address of a. 
mov(var(inca), r8) // load inca mov(var(lda), r10) // load lda lea(mem(, r8, 4), r8) // inca *= sizeof(float) lea(mem(, r10, 4), r10) // lda *= sizeof(float) mov(var(p), rbx) // load address of p. lea(mem( , r10, 8), r14) // r14 = 8*lda mov(var(one), rdx) // load address of 1.0 constant vmovss(mem(rdx), xmm1) // load 1.0 mov(var(kappa), rcx) // load address of kappa vmovss(mem(rcx), xmm0) // load kappa // now branch on kappa == 1.0 vucomiss(xmm0, xmm1) // set ZF if kappa == 1.0 je(.SKAPPAUNIT) // if ZF = 1, jump to beta == 0 case label(.SKAPPANONU) cmp(imm(4), r8) // set ZF if (4*inca) == 4. jz(.SCOLNONU) // jump to column storage case // -- kappa non-unit, row storage on A ------------------------------------- label(.SROWNONU) jmp(.SDONE) // jump to end. // -- kappa non-unit, column storage on A ---------------------------------- label(.SCOLNONU) jmp(.SDONE) // jump to end. label(.SKAPPAUNIT) cmp(imm(4), r8) // set ZF if (4*inca) == 4. jz(.SCOLUNIT) // jump to column storage case // -- kappa unit, row storage on A ----------------------------------------- label(.SROWUNIT) lea(mem(r8, r8, 2), r13) // r13 = 3*inca lea(mem(r13, r8, 2), r15) // r15 = 5*inca lea(mem(r13, r8, 4), rdx) // rdx = 7*inca mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONKLEFTROWU) // if i == 0, jump to code that // contains the k_left loop. 
label(.SKITERROWU) // MAIN LOOP (k_iter) mov(rax, r12) // r12 = rax mov(rbx, rcx) // rcx = rbx // begin IO on rows 0-3 vmovups(mem(r12, 0), ymm4) vmovups(mem(r12, r8, 1, 0), ymm6) vmovups(mem(r12, r8, 2, 0), ymm8) vmovups(mem(r12, r13, 1, 0), ymm10) vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, 0*64)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, 4*64)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, 1*64)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, 5*64)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, 2*64)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, 6*64)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, 3*64)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, 7*64)) // store ( gamma07..gamma37 ) lea(mem(r12, r8, 4), r12) // r12 += 4*inca add(imm(4*4), rcx) // rcx += 4; // begin IO on rows 4-7 vmovups(mem(r12, 0), ymm4) vmovups(mem(r12, r8, 1, 0), ymm6) vmovups(mem(r12, r8, 2, 0), ymm8) vmovups(mem(r12, r13, 1, 0), ymm10) vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, 0*64)) // store ( gamma40..gamma70 ) vmovups(xmm2, mem(rcx, 4*64)) // store ( gamma44..gamma74 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, 1*64)) // store ( gamma41..gamma71 ) vmovups(xmm2, mem(rcx, 5*64)) // store ( gamma45..gamma75 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), 
ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, 2*64)) // store ( gamma42..gamma72 ) vmovups(xmm2, mem(rcx, 6*64)) // store ( gamma46..gamma76 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, 3*64)) // store ( gamma43..gamma73 ) vmovups(xmm2, mem(rcx, 7*64)) // store ( gamma47..gamma77 ) lea(mem(r12, r8, 4), r12) // r12 += 4*inca add(imm(4*4), rcx) // rcx += 4; // begin IO on rows 8-11 vmovups(mem(r12, 0), ymm4) vmovups(mem(r12, r8, 1, 0), ymm6) vmovups(mem(r12, r8, 2, 0), ymm8) vmovups(mem(r12, r13, 1, 0), ymm10) vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, 0*64)) // store ( gamma80..gammaB0 ) vmovups(xmm2, mem(rcx, 4*64)) // store ( gamma84..gammaB4 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, 1*64)) // store ( gamma81..gammaB1 ) vmovups(xmm2, mem(rcx, 5*64)) // store ( gamma85..gammaB5 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, 2*64)) // store ( gamma82..gammaB2 ) vmovups(xmm2, mem(rcx, 6*64)) // store ( gamma86..gammaB6 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, 3*64)) // store ( gamma83..gammaB3 ) vmovups(xmm2, mem(rcx, 7*64)) // store ( gamma87..gammaB7 ) lea(mem(r12, r8, 4), r12) // r12 += 4*inca add(imm(4*4), rcx) // rcx += 4; // begin IO on rows 12-15 vmovups(mem(r12, 0), ymm4) vmovups(mem(r12, r8, 1, 0), ymm6) vmovups(mem(r12, r8, 2, 0), ymm8) vmovups(mem(r12, r13, 1, 0), ymm10) vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) 
vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, 0*64)) // store ( gammaC0..gammaF0 ) vmovups(xmm2, mem(rcx, 4*64)) // store ( gammaC4..gammaF4 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, 1*64)) // store ( gammaC1..gammaF1 ) vmovups(xmm2, mem(rcx, 5*64)) // store ( gammaC5..gammaF5 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, 2*64)) // store ( gammaC2..gammaF2 ) vmovups(xmm2, mem(rcx, 6*64)) // store ( gammaC6..gammaF6 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, 3*64)) // store ( gammaC3..gammaF3 ) vmovups(xmm2, mem(rcx, 7*64)) // store ( gammaC7..gammaF7 ) add(r14, rax) // a += 8*lda; add(imm(8*16*4), rbx) // p += 8*ldp = 8*16; dec(rsi) // i -= 1; jne(.SKITERROWU) // iterate again if i != 0. label(.SCONKLEFTROWU) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SDONE) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SKLEFTROWU) // EDGE LOOP (k_left) vmovss(mem(rax, 0), xmm0) vmovss(mem(rax, r8, 1, 0), xmm2) vmovss(mem(rax, r8, 2, 0), xmm4) vmovss(mem(rax, r13, 1, 0), xmm6) vmovss(mem(rax, r8, 4, 0), xmm1) vmovss(mem(rax, r15, 1, 0), xmm3) vmovss(mem(rax, r13, 2, 0), xmm5) vmovss(mem(rax, rdx, 1, 0), xmm7) vmovss(xmm0, mem(rbx, 0*4)) vmovss(xmm2, mem(rbx, 1*4)) vmovss(xmm4, mem(rbx, 2*4)) vmovss(xmm6, mem(rbx, 3*4)) vmovss(xmm1, mem(rbx, 4*4)) vmovss(xmm3, mem(rbx, 5*4)) vmovss(xmm5, mem(rbx, 6*4)) vmovss(xmm7, mem(rbx, 7*4)) lea(mem(rax, r8, 8), r12) // r12 = a + 8*inca vmovss(mem(r12, 0), xmm0) vmovss(mem(r12, r8, 1, 0), xmm2) vmovss(mem(r12, r8, 2, 0), xmm4) vmovss(mem(r12, r13, 1, 0), xmm6) vmovss(mem(r12, r8, 4, 0), xmm1) vmovss(mem(r12, r15, 1, 0), xmm3) vmovss(mem(r12, r13, 2, 0), xmm5) vmovss(mem(r12, rdx, 1, 0), xmm7) add(r10, rax) // a += lda; vmovss(xmm0, mem(rbx, 8*4)) vmovss(xmm2, mem(rbx, 9*4)) vmovss(xmm4, mem(rbx, 10*4)) vmovss(xmm6, mem(rbx, 11*4)) vmovss(xmm1, mem(rbx, 12*4)) vmovss(xmm3, mem(rbx, 13*4)) vmovss(xmm5, mem(rbx, 14*4)) vmovss(xmm7, mem(rbx, 15*4)) add(imm(16*4), rbx) // p += ldp = 16; dec(rsi) // i -= 1; jne(.SKLEFTROWU) // iterate again if i != 0. jmp(.SDONE) // jump to end. // -- kappa unit, column storage on A -------------------------------------- label(.SCOLUNIT) lea(mem(r10, r10, 2), r13) // r13 = 3*lda lea(mem(r13, r10, 2), r15) // r15 = 5*lda lea(mem(r13, r10, 4), rdx) // rdx = 7*lda mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONKLEFTCOLU) // if i == 0, jump to code that // contains the k_left loop. 
label(.SKITERCOLU) // MAIN LOOP (k_iter) vmovups(mem(rax, 0), ymm0) vmovups(mem(rax, 32), ymm1) vmovups(ymm0, mem(rbx, 0*64+ 0)) vmovups(ymm1, mem(rbx, 0*64+32)) vmovups(mem(rax, r10, 1, 0), ymm2) vmovups(mem(rax, r10, 1, 32), ymm3) vmovups(ymm2, mem(rbx, 1*64+ 0)) vmovups(ymm3, mem(rbx, 1*64+32)) vmovups(mem(rax, r10, 2, 0), ymm4) vmovups(mem(rax, r10, 2, 32), ymm5) vmovups(ymm4, mem(rbx, 2*64+ 0)) vmovups(ymm5, mem(rbx, 2*64+32)) vmovups(mem(rax, r13, 1, 0), ymm6) vmovups(mem(rax, r13, 1, 32), ymm7) vmovups(ymm6, mem(rbx, 3*64+ 0)) vmovups(ymm7, mem(rbx, 3*64+32)) vmovups(mem(rax, r10, 4, 0), ymm8) vmovups(mem(rax, r10, 4, 32), ymm9) vmovups(ymm8, mem(rbx, 4*64+ 0)) vmovups(ymm9, mem(rbx, 4*64+32)) vmovups(mem(rax, r15, 1, 0), ymm10) vmovups(mem(rax, r15, 1, 32), ymm11) vmovups(ymm10, mem(rbx, 5*64+ 0)) vmovups(ymm11, mem(rbx, 5*64+32)) vmovups(mem(rax, r13, 2, 0), ymm12) vmovups(mem(rax, r13, 2, 32), ymm13) vmovups(ymm12, mem(rbx, 6*64+ 0)) vmovups(ymm13, mem(rbx, 6*64+32)) vmovups(mem(rax, rdx, 1, 0), ymm14) vmovups(mem(rax, rdx, 1, 32), ymm15) add(r14, rax) // a += 8*lda; vmovups(ymm14, mem(rbx, 7*64+ 0)) vmovups(ymm15, mem(rbx, 7*64+32)) add(imm(8*16*4), rbx) // p += 8*ldp = 8*16; dec(rsi) // i -= 1; jne(.SKITERCOLU) // iterate again if i != 0. label(.SCONKLEFTCOLU) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SDONE) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SKLEFTCOLU) // EDGE LOOP (k_left) vmovups(mem(rax, 0), ymm0) vmovups(mem(rax, 32), ymm1) add(r10, rax) // a += lda; vmovups(ymm0, mem(rbx, 0*64+ 0)) vmovups(ymm1, mem(rbx, 0*64+32)) add(imm(16*4), rbx) // p += ldp = 16; dec(rsi) // i -= 1; jne(.SKLEFTCOLU) // iterate again if i != 0. //jmp(.SDONE) // jump to end. 
label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [inca] "m" (inca), [lda] "m" (lda), [p] "m" (p), [ldp] "m" (ldp), [kappa] "m" (kappa), [one] "m" (one) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } else // if ( cdim0 < mnr || gs || !unitk ) { PASTEMAC(sscal2m,BLIS_TAPI_EX_SUF) ( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, ( trans_t )conja, cdim0, k0, kappa, a, inca0, lda0, p, 1, ldp0, cntx, NULL ); if ( cdim0 < mnr ) { // Handle zero-filling along the "long" edge of the micropanel. const dim_t i = cdim0; const dim_t m_edge = mnr - cdim0; const dim_t n_edge = k0_max; float* restrict p_edge = p + (i )*1; bli_sset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } if ( k0 < k0_max ) { // Handle zero-filling along the "short" (far) edge of the micropanel. const dim_t j = k0; const dim_t m_edge = mnr; const dim_t n_edge = k0_max - k0; float* restrict p_edge = p + (j )*ldp; bli_sset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } cython-blis-0.9.1/blis/_src/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c000066400000000000000000000325521427272030600267030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" // Prototype reference packm kernels. PACKM_KER_PROT( double, d, packm_6xk_haswell_ref ) void bli_spackm_haswell_asm_6xk ( conj_t conja, pack_t schema, dim_t cdim0, dim_t k0, dim_t k0_max, float* restrict kappa, float* restrict a, inc_t inca0, inc_t lda0, float* restrict p, inc_t ldp0, cntx_t* restrict cntx ) { #if 0 bli_spackm_6xk_haswell_ref ( conja, schema, cdim0, k0, k0_max, kappa, a, inca0, lda0, p, ldp0, cntx ); return; #endif // This is the panel dimension assumed by the packm kernel. const dim_t mnr = 6; // This is the "packing" dimension assumed by the packm kernel. // This should be equal to ldp. //const dim_t packmnr = 8; // Define a local copy of 1.0 so we can test for unit kappa. 
float one_l = 1.0; float* restrict one = &one_l; // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. const uint64_t k_iter = k0 / 8; #if 1 const uint64_t k_left = k0 % 8; #else const uint64_t k_left = k0; #endif // NOTE: For the purposes of the comments in this packm kernel, we // interpret inca and lda as rs_a and cs_a, respectively, and similarly // interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading // this packm kernel, you should think of the operation as packing an // m x n micropanel, where m and n are tiny and large, respectively, and // where elements of each column of the packed matrix P are contiguous. // (This packm kernel can still be used to pack micropanels of matrix B // in a gemm operation.) const uint64_t inca = inca0; const uint64_t lda = lda0; const uint64_t ldp = ldp0; const bool gs = ( inca0 != 1 && lda0 != 1 ); // NOTE: If/when this kernel ever supports scaling by kappa within the // assembly region, this constraint should be lifted. const bool unitk = bli_seq1( *kappa ); // ------------------------------------------------------------------------- if ( cdim0 == mnr && !gs && unitk ) { begin_asm() mov(var(a), rax) // load address of a. mov(var(inca), r8) // load inca mov(var(lda), r10) // load lda lea(mem(, r8, 4), r8) // inca *= sizeof(float) lea(mem(, r10, 4), r10) // lda *= sizeof(float) mov(var(p), rbx) // load address of p. lea(mem( , r10, 8), r14) // r14 = 8*lda mov(var(one), rdx) // load address of 1.0 constant vmovss(mem(rdx), xmm1) // load 1.0 mov(var(kappa), rcx) // load address of kappa vmovss(mem(rcx), xmm0) // load kappa // now branch on kappa == 1.0 vucomiss(xmm0, xmm1) // set ZF if kappa == 1.0 je(.SKAPPAUNIT) // if ZF = 1, jump to beta == 0 case label(.SKAPPANONU) cmp(imm(4), r8) // set ZF if (4*inca) == 4. 
jz(.SCOLNONU) // jump to column storage case // -- kappa non-unit, row storage on A ------------------------------------- label(.SROWNONU) jmp(.SDONE) // jump to end. // -- kappa non-unit, column storage on A ---------------------------------- label(.SCOLNONU) jmp(.SDONE) // jump to end. label(.SKAPPAUNIT) cmp(imm(4), r8) // set ZF if (4*inca) == 4. jz(.SCOLUNIT) // jump to column storage case // -- kappa unit, row storage on A ----------------------------------------- label(.SROWUNIT) lea(mem(r8, r8, 2), r13) // r13 = 3*inca lea(mem(r13, r8, 2), r15) // r15 = 5*inca //lea(mem(r13, r8, 4), rdx) // rdx = 7*inca mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONKLEFTROWU) // if i == 0, jump to code that // contains the k_left loop. label(.SKITERROWU) // MAIN LOOP (k_iter) // begin IO on rows 0-3 vmovups(mem(rax, 0), ymm4) vmovups(mem(rax, r8, 1, 0), ymm6) vmovups(mem(rax, r8, 2, 0), ymm8) vmovups(mem(rax, r13, 1, 0), ymm10) vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rbx, 0*24)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rbx, 4*24)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rbx, 1*24)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rbx, 5*24)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rbx, 2*24)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rbx, 6*24)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rbx, 3*24)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rbx, 7*24)) // store ( gamma07..gamma37 ) // begin IO on rows 4-5 vmovups(mem(rax, r8, 4, 0), ymm12) 
vmovups(mem(rax, r15, 1, 0), ymm14) vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rbx, 0*24+16)) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rbx, 1*24+16)) // store ( gamma41..gamma51 ) vmovlpd(xmm2, mem(rbx, 4*24+16)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rbx, 5*24+16)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rbx, 2*24+16)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rbx, 3*24+16)) // store ( gamma43..gamma53 ) vmovlpd(xmm2, mem(rbx, 6*24+16)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rbx, 7*24+16)) // store ( gamma47..gamma57 ) add(r14, rax) // a += 8*lda; add(imm(8*6*4), rbx) // p += 8*ldp = 8*6; dec(rsi) // i -= 1; jne(.SKITERROWU) // iterate again if i != 0. label(.SCONKLEFTROWU) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SDONE) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SKLEFTROWU) // EDGE LOOP (k_left) vmovss(mem(rax, 0), xmm0) vmovss(mem(rax, r8, 1, 0), xmm2) vmovss(mem(rax, r8, 2, 0), xmm4) vmovss(mem(rax, r13, 1, 0), xmm6) vmovss(mem(rax, r8, 4, 0), xmm1) vmovss(mem(rax, r15, 1, 0), xmm3) vmovss(xmm0, mem(rbx, 0*4)) vmovss(xmm2, mem(rbx, 1*4)) vmovss(xmm4, mem(rbx, 2*4)) vmovss(xmm6, mem(rbx, 3*4)) vmovss(xmm1, mem(rbx, 4*4)) vmovss(xmm3, mem(rbx, 5*4)) add(r10, rax) // a += lda; add(imm(6*4), rbx) // p += ldp = 6; dec(rsi) // i -= 1; jne(.SKLEFTROWU) // iterate again if i != 0. jmp(.SDONE) // jump to end. // -- kappa unit, column storage on A -------------------------------------- label(.SCOLUNIT) lea(mem(r10, r10, 2), r13) // r13 = 3*lda lea(mem(r13, r10, 2), r15) // r15 = 5*lda lea(mem(r13, r10, 4), rdx) // rdx = 7*lda mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONKLEFTCOLU) // if i == 0, jump to code that // contains the k_left loop. 
label(.SKITERCOLU) // MAIN LOOP (k_iter) vmovups(mem(rax, 0), xmm0) vmovsd( mem(rax, 16), xmm1) vmovups(xmm0, mem(rbx, 0*24+ 0)) vmovsd( xmm1, mem(rbx, 0*24+16)) vmovups(mem(rax, r10, 1, 0), xmm2) vmovsd( mem(rax, r10, 1, 16), xmm3) vmovups(xmm2, mem(rbx, 1*24+ 0)) vmovsd( xmm3, mem(rbx, 1*24+16)) vmovups(mem(rax, r10, 2, 0), xmm4) vmovsd( mem(rax, r10, 2, 16), xmm5) vmovups(xmm4, mem(rbx, 2*24+ 0)) vmovsd( xmm5, mem(rbx, 2*24+16)) vmovups(mem(rax, r13, 1, 0), xmm6) vmovsd( mem(rax, r13, 1, 16), xmm7) vmovups(xmm6, mem(rbx, 3*24+ 0)) vmovsd( xmm7, mem(rbx, 3*24+16)) vmovups(mem(rax, r10, 4, 0), xmm8) vmovsd( mem(rax, r10, 4, 16), xmm9) vmovups(xmm8, mem(rbx, 4*24+ 0)) vmovsd( xmm9, mem(rbx, 4*24+16)) vmovups(mem(rax, r15, 1, 0), xmm10) vmovsd( mem(rax, r15, 1, 16), xmm11) vmovups(xmm10, mem(rbx, 5*24+ 0)) vmovsd( xmm11, mem(rbx, 5*24+16)) vmovups(mem(rax, r13, 2, 0), xmm12) vmovsd( mem(rax, r13, 2, 16), xmm13) vmovups(xmm12, mem(rbx, 6*24+ 0)) vmovsd( xmm13, mem(rbx, 6*24+16)) vmovups(mem(rax, rdx, 1, 0), xmm14) vmovsd( mem(rax, rdx, 1, 16), xmm15) vmovups(xmm14, mem(rbx, 7*24+ 0)) vmovsd( xmm15, mem(rbx, 7*24+16)) add(r14, rax) // a += 8*lda; add(imm(8*6*4), rbx) // p += 8*ldp = 8*6; dec(rsi) // i -= 1; jne(.SKITERCOLU) // iterate again if i != 0. label(.SCONKLEFTCOLU) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SDONE) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SKLEFTCOLU) // EDGE LOOP (k_left) vmovups(mem(rax, 0), xmm0) vmovsd( mem(rax, 16), xmm1) add(r10, rax) // a += lda; vmovups(xmm0, mem(rbx, 0*24+ 0)) vmovsd( xmm1, mem(rbx, 0*24+16)) add(imm(6*4), rbx) // p += ldp = 6; dec(rsi) // i -= 1; jne(.SKLEFTCOLU) // iterate again if i != 0. //jmp(.SDONE) // jump to end. 
label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [inca] "m" (inca), [lda] "m" (lda), [p] "m" (p), [ldp] "m" (ldp), [kappa] "m" (kappa), [one] "m" (one) : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } else // if ( cdim0 < mnr || gs || !unitk ) { PASTEMAC(sscal2m,BLIS_TAPI_EX_SUF) ( 0, BLIS_NONUNIT_DIAG, BLIS_DENSE, ( trans_t )conja, cdim0, k0, kappa, a, inca0, lda0, p, 1, ldp0, cntx, NULL ); if ( cdim0 < mnr ) { // Handle zero-filling along the "long" edge of the micropanel. const dim_t i = cdim0; const dim_t m_edge = mnr - cdim0; const dim_t n_edge = k0_max; float* restrict p_edge = p + (i )*1; bli_sset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } if ( k0 < k0_max ) { // Handle zero-filling along the "short" (far) edge of the micropanel. const dim_t j = k0; const dim_t m_edge = mnr; const dim_t n_edge = k0_max - k0; float* restrict p_edge = p + (j )*ldp; bli_sset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } cython-blis-0.9.1/blis/_src/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c000066400000000000000000000276161427272030600267140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
  contributors may be used to endorse or promote products derived
  from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"

// Prototype reference packm kernels.
PACKM_KER_PROT( dcomplex, z, packm_3xk_haswell_ref )

// Pack a cdim0 x k0 panel of dcomplex matrix A into the micropanel P, whose
// rows have unit stride and whose columns are ldp0 elements apart.
//
// Parameters:
//   conja        - conjugation flag for A; any nonzero value selects the
//                  generic (zscal2m) path.
//   schema       - pack schema; only referenced by the disabled "#if 0"
//                  reference call below.
//   cdim0        - number of rows actually present in the panel (<= 3).
//   k0           - number of columns to pack.
//   k0_max       - number of columns of P that must be defined; columns
//                  k0..k0_max-1 are zero-filled.
//   kappa        - scalar applied to A; the assembly path requires it to
//                  equal one (see unitk below).
//   a,inca0,lda0 - source matrix with its row and column strides.
//   p,ldp0       - destination micropanel and its column stride.
//   cntx         - context forwarded to zscal2m on the generic path.
//
// Behavior:
//   - Fast path (cdim0 == 3, A has a unit row or column stride, no
//     conjugation, kappa == 1): a straight copy performed in inline
//     assembly, four columns per main-loop iteration (k_iter) followed by
//     one column per edge-loop iteration (k_left).
//   - Otherwise: zscal2m performs the scaled/conjugated copy and, when
//     cdim0 < 3, rows cdim0..2 of all k0_max columns are zeroed.
//   - In both cases columns k0..k0_max-1 are zeroed afterwards.
//
// NOTE(review): the assembly path never loads ldp; it writes P with a
// hard-coded column stride of 3 dcomplex (48 bytes). Presumably callers
// guarantee ldp0 == 3 whenever the fast path is taken -- TODO confirm.
void bli_zpackm_haswell_asm_3xk
     (
       conj_t              conja,
       pack_t              schema,
       dim_t               cdim0,
       dim_t               k0,
       dim_t               k0_max,
       dcomplex*  restrict kappa,
       dcomplex*  restrict a, inc_t inca0, inc_t lda0,
       dcomplex*  restrict p,              inc_t ldp0,
       cntx_t*    restrict cntx
     )
{
#if 0
	// Disabled debugging aid: route everything through the reference kernel.
	bli_zpackm_3xk_haswell_ref
	(
	  conja, schema, cdim0, k0, k0_max,
	  kappa, a, inca0, lda0, p, ldp0, cntx
	);
	return;
#endif

	// This is the panel dimension assumed by the packm kernel.
	const dim_t      mnr   = 3;

	// This is the "packing" dimension assumed by the packm kernel.
	// This should be equal to ldp.
	//const dim_t    packmnr = 8;

	// Define a local copy of 1.0 so we can test for unit kappa.
	double           one_l = 1.0;
	double* restrict one   = &one_l;

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	const uint64_t k_iter = k0 / 4;
#if 1
	const uint64_t k_left = k0 % 4;
#else
	const uint64_t k_left = k0;
#endif

	// NOTE: For the purposes of the comments in this packm kernel, we
	// interpret inca and lda as rs_a and cs_a, respectively, and similarly
	// interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
	// this packm kernel, you should think of the operation as packing an
	// m x n micropanel, where m and n are tiny and large, respectively, and
	// where elements of each column of the packed matrix P are contiguous.
	// (This packm kernel can still be used to pack micropanels of matrix B
	// in a gemm operation.)
	const uint64_t inca   = inca0;
	const uint64_t lda    = lda0;
	const uint64_t ldp    = ldp0;

	// General stride: neither the row stride nor the column stride is one.
	const bool     gs     = ( inca0 != 1 && lda0 != 1 );

	// NOTE: If/when this kernel ever supports scaling by kappa within the
	// assembly region, this constraint should be lifted.
	const bool     unitk  = bli_zeq1( *kappa );


	// -------------------------------------------------------------------------

	if ( cdim0 == mnr && !gs && !conja && unitk )
	{
	begin_asm()

	mov(var(a), rax)                   // load address of a.

	mov(var(inca), r8)                 // load inca
	mov(var(lda), r10)                 // load lda
	lea(mem( , r8, 2), r8)
	lea(mem( , r8, 8), r8)             // inca *= sizeof(dcomplex)
	lea(mem( , r10, 2), r10)
	lea(mem( , r10, 8), r10)           // lda *= sizeof(dcomplex)

	mov(var(p), rbx)                   // load address of p.

	lea(mem( , r10, 4), r14)           // r14 = 4*lda

	mov(var(one), rdx)                 // load address of 1.0 constant
	vbroadcastsd(mem(rdx, 0), ymm1)    // load 1.0 and duplicate
	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to 0.0.

	mov(var(kappa), rcx)               // load address of kappa
	vbroadcastsd(mem(rcx, 0), ymm10)   // load kappa_r and duplicate
	vbroadcastsd(mem(rcx, 8), ymm11)   // load kappa_i and duplicate

	                                   // now branch on kappa == 1.0
	                                   // (the C-level unitk guard above already
	                                   // requires this, so the non-unit branch
	                                   // is not expected to be taken.)

	vucomisd(xmm1, xmm10)              // set ZF if kappa_r == 1.0.
	sete(r12b)                         // r12b = ( ZF == 1 ? 1 : 0 );
	vucomisd(xmm0, xmm11)              // set ZF if kappa_i == 0.0.
	sete(r13b)                         // r13b = ( ZF == 1 ? 1 : 0 );
	and(r12b, r13b)                    // r13b &= r12b; ZF == 0 iff both are 1.
	jne(.ZKAPPAUNIT)                   // if ZF == 0, jump to kappa == 1.0 case

	label(.ZKAPPANONU)

	cmp(imm(16), r8)                   // set ZF if (16*inca) == 16.
	jz(.ZCOLNONU)                      // jump to column storage case

	// -- kappa non-unit, row storage on A -------------------------------------
	// (unimplemented stub: nothing is packed.)

	label(.ZROWNONU)

	jmp(.ZDONE)                        // jump to end.

	// -- kappa non-unit, column storage on A ----------------------------------
	// (unimplemented stub: nothing is packed.)

	label(.ZCOLNONU)

	jmp(.ZDONE)                        // jump to end.


	label(.ZKAPPAUNIT)

	cmp(imm(16), r8)                   // set ZF if (16*inca) == 16.
	jz(.ZCOLUNIT)                      // jump to column storage case


	// -- kappa unit, row storage on A -----------------------------------------
	// Reached when inca != 1; since !gs holds, lda == 1 here, which is why
	// consecutive columns of a row are read from adjacent memory below.

	label(.ZROWUNIT)

	//lea(mem(r8, r8, 2), r12)         // r12 = 3*inca
	//lea(mem(r12, r8, 2), rcx)        // rcx = 5*inca
	//lea(mem(r12, r8, 4), rdx)        // rdx = 7*inca

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.ZCONKLEFTROWU)                 // if i == 0, jump to code that
	                                   // contains the k_left loop.


	label(.ZKITERROWU)                 // MAIN LOOP (k_iter)

	// Load columns 0-1 of each of the three rows (two dcomplex per ymm).
	vmovupd(mem(rax, 0), ymm8)
	vmovupd(mem(rax, r8, 1, 0), ymm10)
	vmovupd(mem(rax, r8, 2, 0), ymm12)

	// Split off the upper 128 bits (the column-1 element of each row).
	vextractf128(imm(0x1), ymm8, xmm9)
	vextractf128(imm(0x1), ymm10, xmm11)
	vextractf128(imm(0x1), ymm12, xmm13)

	// Store packed column 0, then packed column 1 (48 bytes apart).
	vmovupd(xmm8, mem(rbx, 0*16+0*48))
	vmovupd(xmm10, mem(rbx, 1*16+0*48))
	vmovupd(xmm12, mem(rbx, 2*16+0*48))

	vmovupd(xmm9, mem(rbx, 0*16+1*48))
	vmovupd(xmm11, mem(rbx, 1*16+1*48))
	vmovupd(xmm13, mem(rbx, 2*16+1*48))

	// Same again for columns 2-3.
	vmovupd(mem(rax, 32), ymm8)
	vmovupd(mem(rax, r8, 1, 32), ymm10)
	vmovupd(mem(rax, r8, 2, 32), ymm12)

	add(r14, rax)                      // a += 4*lda;

	vextractf128(imm(0x1), ymm8, xmm9)
	vextractf128(imm(0x1), ymm10, xmm11)
	vextractf128(imm(0x1), ymm12, xmm13)

	vmovupd(xmm8, mem(rbx, 0*16+2*48))
	vmovupd(xmm10, mem(rbx, 1*16+2*48))
	vmovupd(xmm12, mem(rbx, 2*16+2*48))

	vmovupd(xmm9, mem(rbx, 0*16+3*48))
	vmovupd(xmm11, mem(rbx, 1*16+3*48))
	vmovupd(xmm13, mem(rbx, 2*16+3*48))

	add(imm(4*3*16), rbx)              // p += 4*ldp = 4*3;

	dec(rsi)                           // i -= 1;
	jne(.ZKITERROWU)                   // iterate again if i != 0.


	label(.ZCONKLEFTROWU)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.ZDONE)                         // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.


	label(.ZKLEFTROWU)                 // EDGE LOOP (k_left)

	// Copy one column: one dcomplex (16 bytes) from each of the three rows.
	vmovups(mem(rax, 0), xmm0)
	vmovups(mem(rax, r8, 1, 0), xmm2)
	vmovups(mem(rax, r8, 2, 0), xmm4)

	add(r10, rax)                      // a += lda;

	vmovups(xmm0, mem(rbx, 0*16+0*48))
	vmovups(xmm2, mem(rbx, 1*16+0*48))
	vmovups(xmm4, mem(rbx, 2*16+0*48))

	add(imm(3*16), rbx)                // p += ldp = 3;

	dec(rsi)                           // i -= 1;
	jne(.ZKLEFTROWU)                   // iterate again if i != 0.


	jmp(.ZDONE)                        // jump to end.


	// -- kappa unit, column storage on A --------------------------------------
	// Reached when inca == 1: each column of A is 3 contiguous dcomplex
	// (48 bytes), copied as one 32-byte ymm move plus one 16-byte xmm move.

	label(.ZCOLUNIT)

	lea(mem(r10, r10, 2), r13)         // r13 = 3*lda

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.ZCONKLEFTCOLU)                 // if i == 0, jump to code that
	                                   // contains the k_left loop.


	label(.ZKITERCOLU)                 // MAIN LOOP (k_iter)

	vmovupd(mem(rax, 0), ymm0)
	vmovupd(mem(rax, 32), xmm1)
	vmovupd(ymm0, mem(rbx, 0*48+ 0))
	vmovupd(xmm1, mem(rbx, 0*48+32))

	vmovupd(mem(rax, r10, 1, 0), ymm2)
	vmovupd(mem(rax, r10, 1, 32), xmm3)
	vmovupd(ymm2, mem(rbx, 1*48+ 0))
	vmovupd(xmm3, mem(rbx, 1*48+32))

	vmovupd(mem(rax, r10, 2, 0), ymm4)
	vmovupd(mem(rax, r10, 2, 32), xmm5)
	vmovupd(ymm4, mem(rbx, 2*48+ 0))
	vmovupd(xmm5, mem(rbx, 2*48+32))

	vmovupd(mem(rax, r13, 1, 0), ymm6)
	vmovupd(mem(rax, r13, 1, 32), xmm7)
	add(r14, rax)                      // a += 4*lda;
	vmovupd(ymm6, mem(rbx, 3*48+ 0))
	vmovupd(xmm7, mem(rbx, 3*48+32))

	add(imm(4*3*16), rbx)              // p += 4*ldp = 4*3;

	dec(rsi)                           // i -= 1;
	jne(.ZKITERCOLU)                   // iterate again if i != 0.


	label(.ZCONKLEFTCOLU)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.ZDONE)                         // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.


	label(.ZKLEFTCOLU)                 // EDGE LOOP (k_left)

	vmovupd(mem(rax, 0), ymm0)
	vmovupd(mem(rax, 32), xmm1)
	add(r10, rax)                      // a += lda;
	vmovupd(ymm0, mem(rbx, 0*48+ 0))
	vmovupd(xmm1, mem(rbx, 0*48+32))

	add(imm(3*16), rbx)                // p += ldp = 3;

	dec(rsi)                           // i -= 1;
	jne(.ZKLEFTCOLU)                   // iterate again if i != 0.


	//jmp(.ZDONE)                      // jump to end.


	label(.ZDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a]      "m" (a),
	  [inca]   "m" (inca),
	  [lda]    "m" (lda),
	  [p]      "m" (p),
	  [ldp]    "m" (ldp),
	  [kappa]  "m" (kappa),
	  [one]    "m" (one)
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
	}
	else // if ( cdim0 < mnr || gs || bli_does_conj( conja ) || !unitk )
	{
		// Generic path: P := kappa * conja( A ) over the cdim0 x k0 region,
		// writing P with unit row stride and column stride ldp0.
		PASTEMAC(zscal2m,BLIS_TAPI_EX_SUF)
		(
		  0,
		  BLIS_NONUNIT_DIAG,
		  BLIS_DENSE,
		  ( trans_t )conja,
		  cdim0,
		  k0,
		  kappa,
		  a, inca0, lda0,
		  p,     1, ldp0,
		  cntx,
		  NULL
		);

		if ( cdim0 < mnr )
		{
			// Handle zero-filling along the "long" edge of the micropanel.

			const dim_t         i      = cdim0;
			const dim_t         m_edge = mnr - cdim0;
			const dim_t         n_edge = k0_max;
			dcomplex*  restrict p_edge = p + (i  )*1;

			bli_zset0s_mxn
			(
			  m_edge,
			  n_edge,
			  p_edge, 1, ldp
			);
		}
	}

	if ( k0 < k0_max )
	{
		// Handle zero-filling along the "short" (far) edge of the micropanel.

		const dim_t         j      = k0;
		const dim_t         m_edge = mnr;
		const dim_t         n_edge = k0_max - k0;
		dcomplex*  restrict p_edge = p + (j  )*ldp;

		bli_zset0s_mxn
		(
		  m_edge,
		  n_edge,
		  p_edge, 1, ldp
		);
	}
}

cython-blis-0.9.1/blis/_src/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c000066400000000000000000000304141427272030600267030ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
  contributors may be used to endorse or promote products derived
  from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"

// Prototype reference packm kernels.
PACKM_KER_PROT( dcomplex, z, packm_4xk_haswell_ref )

// Pack a cdim0 x k0 panel of dcomplex matrix A into the micropanel P, whose
// rows have unit stride and whose columns are ldp0 elements apart.
//
// Parameters:
//   conja        - conjugation flag for A; any nonzero value selects the
//                  generic (zscal2m) path.
//   schema       - pack schema; only referenced by the disabled "#if 0"
//                  reference call below.
//   cdim0        - number of rows actually present in the panel (<= 4).
//   k0           - number of columns to pack.
//   k0_max       - number of columns of P that must be defined; columns
//                  k0..k0_max-1 are zero-filled.
//   kappa        - scalar applied to A; the assembly path requires it to
//                  equal one (see unitk below).
//   a,inca0,lda0 - source matrix with its row and column strides.
//   p,ldp0       - destination micropanel and its column stride.
//   cntx         - context forwarded to zscal2m on the generic path.
//
// Behavior:
//   - Fast path (cdim0 == 4, A has a unit row or column stride, no
//     conjugation, kappa == 1): a straight copy performed in inline
//     assembly, four columns per main-loop iteration (k_iter) followed by
//     one column per edge-loop iteration (k_left).
//   - Otherwise: zscal2m performs the scaled/conjugated copy and, when
//     cdim0 < 4, rows cdim0..3 of all k0_max columns are zeroed.
//   - In both cases columns k0..k0_max-1 are zeroed afterwards.
//
// NOTE(review): the assembly path never loads ldp; it writes P with a
// hard-coded column stride of 4 dcomplex (64 bytes). Presumably callers
// guarantee ldp0 == 4 whenever the fast path is taken -- TODO confirm.
void bli_zpackm_haswell_asm_4xk
     (
       conj_t              conja,
       pack_t              schema,
       dim_t               cdim0,
       dim_t               k0,
       dim_t               k0_max,
       dcomplex*  restrict kappa,
       dcomplex*  restrict a, inc_t inca0, inc_t lda0,
       dcomplex*  restrict p,              inc_t ldp0,
       cntx_t*    restrict cntx
     )
{
#if 0
	// Disabled debugging aid: route everything through the reference kernel.
	bli_zpackm_4xk_haswell_ref
	(
	  conja, schema, cdim0, k0, k0_max,
	  kappa, a, inca0, lda0, p, ldp0, cntx
	);
	return;
#endif

	// This is the panel dimension assumed by the packm kernel.
	const dim_t      mnr   = 4;

	// This is the "packing" dimension assumed by the packm kernel.
	// This should be equal to ldp.
	//const dim_t    packmnr = 8;

	// Define a local copy of 1.0 so we can test for unit kappa.
	double           one_l = 1.0;
	double* restrict one   = &one_l;

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	const uint64_t k_iter = k0 / 4;
#if 1
	const uint64_t k_left = k0 % 4;
#else
	const uint64_t k_left = k0;
#endif

	// NOTE: For the purposes of the comments in this packm kernel, we
	// interpret inca and lda as rs_a and cs_a, respectively, and similarly
	// interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
	// this packm kernel, you should think of the operation as packing an
	// m x n micropanel, where m and n are tiny and large, respectively, and
	// where elements of each column of the packed matrix P are contiguous.
	// (This packm kernel can still be used to pack micropanels of matrix B
	// in a gemm operation.)
	const uint64_t inca   = inca0;
	const uint64_t lda    = lda0;
	const uint64_t ldp    = ldp0;

	// General stride: neither the row stride nor the column stride is one.
	const bool     gs     = ( inca0 != 1 && lda0 != 1 );

	// NOTE: If/when this kernel ever supports scaling by kappa within the
	// assembly region, this constraint should be lifted.
	const bool     unitk  = bli_zeq1( *kappa );


	// -------------------------------------------------------------------------

	if ( cdim0 == mnr && !gs && !conja && unitk )
	{
	begin_asm()

	mov(var(a), rax)                   // load address of a.

	mov(var(inca), r8)                 // load inca
	mov(var(lda), r10)                 // load lda
	lea(mem( , r8, 2), r8)
	lea(mem( , r8, 8), r8)             // inca *= sizeof(dcomplex)
	lea(mem( , r10, 2), r10)
	lea(mem( , r10, 8), r10)           // lda *= sizeof(dcomplex)

	mov(var(p), rbx)                   // load address of p.

	lea(mem( , r10, 4), r14)           // r14 = 4*lda

	mov(var(one), rdx)                 // load address of 1.0 constant
	vbroadcastsd(mem(rdx, 0), ymm1)    // load 1.0 and duplicate
	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to 0.0.

	mov(var(kappa), rcx)               // load address of kappa
	vbroadcastsd(mem(rcx, 0), ymm10)   // load kappa_r and duplicate
	vbroadcastsd(mem(rcx, 8), ymm11)   // load kappa_i and duplicate

	                                   // now branch on kappa == 1.0
	                                   // (the C-level unitk guard above already
	                                   // requires this, so the non-unit branch
	                                   // is not expected to be taken.)

	vucomisd(xmm1, xmm10)              // set ZF if kappa_r == 1.0.
	sete(r12b)                         // r12b = ( ZF == 1 ? 1 : 0 );
	vucomisd(xmm0, xmm11)              // set ZF if kappa_i == 0.0.
	sete(r13b)                         // r13b = ( ZF == 1 ? 1 : 0 );
	and(r12b, r13b)                    // r13b &= r12b; ZF == 0 iff both are 1.
	jne(.ZKAPPAUNIT)                   // if ZF == 0, jump to kappa == 1.0 case

	label(.ZKAPPANONU)

	cmp(imm(16), r8)                   // set ZF if (16*inca) == 16.
	jz(.ZCOLNONU)                      // jump to column storage case

	// -- kappa non-unit, row storage on A -------------------------------------
	// (unimplemented stub: nothing is packed.)

	label(.ZROWNONU)

	jmp(.ZDONE)                        // jump to end.

	// -- kappa non-unit, column storage on A ----------------------------------
	// (unimplemented stub: nothing is packed.)

	label(.ZCOLNONU)

	jmp(.ZDONE)                        // jump to end.


	label(.ZKAPPAUNIT)

	cmp(imm(16), r8)                   // set ZF if (16*inca) == 16.
	jz(.ZCOLUNIT)                      // jump to column storage case


	// -- kappa unit, row storage on A -----------------------------------------
	// Reached when inca != 1; since !gs holds, lda == 1 here, which is why
	// consecutive columns of a row are read from adjacent memory below.

	label(.ZROWUNIT)

	lea(mem(r8, r8, 2), r12)           // r12 = 3*inca
	//lea(mem(r12, r8, 2), rcx)        // rcx = 5*inca
	//lea(mem(r12, r8, 4), rdx)        // rdx = 7*inca

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.ZCONKLEFTROWU)                 // if i == 0, jump to code that
	                                   // contains the k_left loop.


	label(.ZKITERROWU)                 // MAIN LOOP (k_iter)

	// Load columns 0-1 of each of the four rows (two dcomplex per ymm).
	vmovupd(mem(rax, 0), ymm8)
	vmovupd(mem(rax, r8, 1, 0), ymm10)
	vmovupd(mem(rax, r8, 2, 0), ymm12)
	vmovupd(mem(rax, r12, 1, 0), ymm14)

	// Split off the upper 128 bits (the column-1 element of each row).
	vextractf128(imm(0x1), ymm8, xmm9)
	vextractf128(imm(0x1), ymm10, xmm11)
	vextractf128(imm(0x1), ymm12, xmm13)
	vextractf128(imm(0x1), ymm14, xmm15)

	// Store packed column 0, then packed column 1 (64 bytes apart).
	vmovupd(xmm8, mem(rbx, 0*16+0*64))
	vmovupd(xmm10, mem(rbx, 1*16+0*64))
	vmovupd(xmm12, mem(rbx, 2*16+0*64))
	vmovupd(xmm14, mem(rbx, 3*16+0*64))

	vmovupd(xmm9, mem(rbx, 0*16+1*64))
	vmovupd(xmm11, mem(rbx, 1*16+1*64))
	vmovupd(xmm13, mem(rbx, 2*16+1*64))
	vmovupd(xmm15, mem(rbx, 3*16+1*64))

	// Same again for columns 2-3.
	vmovupd(mem(rax, 32), ymm8)
	vmovupd(mem(rax, r8, 1, 32), ymm10)
	vmovupd(mem(rax, r8, 2, 32), ymm12)
	vmovupd(mem(rax, r12, 1, 32), ymm14)

	add(r14, rax)                      // a += 4*lda;

	vextractf128(imm(0x1), ymm8, xmm9)
	vextractf128(imm(0x1), ymm10, xmm11)
	vextractf128(imm(0x1), ymm12, xmm13)
	vextractf128(imm(0x1), ymm14, xmm15)

	vmovupd(xmm8, mem(rbx, 0*16+2*64))
	vmovupd(xmm10, mem(rbx, 1*16+2*64))
	vmovupd(xmm12, mem(rbx, 2*16+2*64))
	vmovupd(xmm14, mem(rbx, 3*16+2*64))

	vmovupd(xmm9, mem(rbx, 0*16+3*64))
	vmovupd(xmm11, mem(rbx, 1*16+3*64))
	vmovupd(xmm13, mem(rbx, 2*16+3*64))
	vmovupd(xmm15, mem(rbx, 3*16+3*64))

	add(imm(4*4*16), rbx)              // p += 4*ldp = 4*4;

	dec(rsi)                           // i -= 1;
	jne(.ZKITERROWU)                   // iterate again if i != 0.


	label(.ZCONKLEFTROWU)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.ZDONE)                         // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.


	label(.ZKLEFTROWU)                 // EDGE LOOP (k_left)

	// Copy one column: one dcomplex (16 bytes) from each of the four rows.
	vmovups(mem(rax, 0), xmm0)
	vmovups(mem(rax, r8, 1, 0), xmm2)
	vmovups(mem(rax, r8, 2, 0), xmm4)
	vmovups(mem(rax, r12, 1, 0), xmm6)

	add(r10, rax)                      // a += lda;

	vmovups(xmm0, mem(rbx, 0*16+0*64))
	vmovups(xmm2, mem(rbx, 1*16+0*64))
	vmovups(xmm4, mem(rbx, 2*16+0*64))
	vmovups(xmm6, mem(rbx, 3*16+0*64))

	add(imm(4*16), rbx)                // p += ldp = 4;

	dec(rsi)                           // i -= 1;
	jne(.ZKLEFTROWU)                   // iterate again if i != 0.


	jmp(.ZDONE)                        // jump to end.


	// -- kappa unit, column storage on A --------------------------------------
	// Reached when inca == 1: each column of A is 4 contiguous dcomplex
	// (64 bytes), copied as two 32-byte ymm moves.

	label(.ZCOLUNIT)

	lea(mem(r10, r10, 2), r13)         // r13 = 3*lda

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.ZCONKLEFTCOLU)                 // if i == 0, jump to code that
	                                   // contains the k_left loop.


	label(.ZKITERCOLU)                 // MAIN LOOP (k_iter)

	vmovupd(mem(rax, 0), ymm0)
	vmovupd(mem(rax, 32), ymm1)
	vmovupd(ymm0, mem(rbx, 0*64+ 0))
	vmovupd(ymm1, mem(rbx, 0*64+32))

	vmovupd(mem(rax, r10, 1, 0), ymm2)
	vmovupd(mem(rax, r10, 1, 32), ymm3)
	vmovupd(ymm2, mem(rbx, 1*64+ 0))
	vmovupd(ymm3, mem(rbx, 1*64+32))

	vmovupd(mem(rax, r10, 2, 0), ymm4)
	vmovupd(mem(rax, r10, 2, 32), ymm5)
	vmovupd(ymm4, mem(rbx, 2*64+ 0))
	vmovupd(ymm5, mem(rbx, 2*64+32))

	vmovupd(mem(rax, r13, 1, 0), ymm6)
	vmovupd(mem(rax, r13, 1, 32), ymm7)
	add(r14, rax)                      // a += 4*lda;
	vmovupd(ymm6, mem(rbx, 3*64+ 0))
	vmovupd(ymm7, mem(rbx, 3*64+32))

	add(imm(4*4*16), rbx)              // p += 4*ldp = 4*4;

	dec(rsi)                           // i -= 1;
	jne(.ZKITERCOLU)                   // iterate again if i != 0.


	label(.ZCONKLEFTCOLU)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.ZDONE)                         // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.


	label(.ZKLEFTCOLU)                 // EDGE LOOP (k_left)

	vmovupd(mem(rax, 0), ymm0)
	vmovupd(mem(rax, 32), ymm1)
	add(r10, rax)                      // a += lda;
	vmovupd(ymm0, mem(rbx, 0*64+ 0))
	vmovupd(ymm1, mem(rbx, 0*64+32))

	add(imm(4*16), rbx)                // p += ldp = 4;

	dec(rsi)                           // i -= 1;
	jne(.ZKLEFTCOLU)                   // iterate again if i != 0.


	//jmp(.ZDONE)                      // jump to end.


	label(.ZDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a]      "m" (a),
	  [inca]   "m" (inca),
	  [lda]    "m" (lda),
	  [p]      "m" (p),
	  [ldp]    "m" (ldp),
	  [kappa]  "m" (kappa),
	  [one]    "m" (one)
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", /*"r9",*/ "r10", /*"r11",*/ "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
	}
	else // if ( cdim0 < mnr || gs || bli_does_conj( conja ) || !unitk )
	{
		// Generic path: P := kappa * conja( A ) over the cdim0 x k0 region,
		// writing P with unit row stride and column stride ldp0.
		PASTEMAC(zscal2m,BLIS_TAPI_EX_SUF)
		(
		  0,
		  BLIS_NONUNIT_DIAG,
		  BLIS_DENSE,
		  ( trans_t )conja,
		  cdim0,
		  k0,
		  kappa,
		  a, inca0, lda0,
		  p,     1, ldp0,
		  cntx,
		  NULL
		);

		if ( cdim0 < mnr )
		{
			// Handle zero-filling along the "long" edge of the micropanel.

			const dim_t         i      = cdim0;
			const dim_t         m_edge = mnr - cdim0;
			const dim_t         n_edge = k0_max;
			dcomplex*  restrict p_edge = p + (i  )*1;

			bli_zset0s_mxn
			(
			  m_edge,
			  n_edge,
			  p_edge, 1, ldp
			);
		}
	}

	if ( k0 < k0_max )
	{
		// Handle zero-filling along the "short" (far) edge of the micropanel.

		const dim_t         j      = k0;
		const dim_t         m_edge = mnr;
		const dim_t         n_edge = k0_max - k0;
		dcomplex*  restrict p_edge = p + (j  )*ldp;

		bli_zset0s_mxn
		(
		  m_edge,
		  n_edge,
		  p_edge, 1, ldp
		);
	}
}

cython-blis-0.9.1/blis/_src/kernels/haswell/3/000077500000000000000000000000001427272030600211025ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c000066400000000000000000001567551427272030600262740ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" #define SGEMM_INPUT_GS_BETA_NZ \ vmovlps(mem(rcx), xmm0, xmm0) \ vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) \ vmovlps(mem(rcx, rsi, 2), xmm1, xmm1) \ vmovhps(mem(rcx, r13, 1), xmm1, xmm1) \ vshufps(imm(0x88), xmm1, xmm0, xmm0) \ vmovlps(mem(rcx, rsi, 4), xmm2, xmm2) \ vmovhps(mem(rcx, r15, 1), xmm2, xmm2) \ /* We can't use vmovhps for loading the last element because that might result in reading beyond valid memory. (vmov[lh]ps load pairs of adjacent floats at a time.)
So we need to use vmovss instead. But since we're limited to using ymm0 through ymm2 (ymm3 contains beta and ymm4 through ymm15 contain the microtile) and due to the way vmovss zeros out all bits above 31, we have to load element 7 before element 6. */ \ vmovss(mem(rcx, r10, 1), xmm1) \ vpermilps(imm(0xcf), xmm1, xmm1) \ vmovlps(mem(rcx, r13, 2), xmm1, xmm1) \ /*vmovhps(mem(rcx, r10, 1), xmm1, xmm1)*/ \ vshufps(imm(0x88), xmm1, xmm2, xmm2) \ vperm2f128(imm(0x20), ymm2, ymm0, ymm0) #define SGEMM_OUTPUT_GS_BETA_NZ \ vextractf128(imm(1), ymm0, xmm2) \ vmovss(xmm0, mem(rcx)) \ vpermilps(imm(0x39), xmm0, xmm1) \ vmovss(xmm1, mem(rcx, rsi, 1)) \ vpermilps(imm(0x39), xmm1, xmm0) \ vmovss(xmm0, mem(rcx, rsi, 2)) \ vpermilps(imm(0x39), xmm0, xmm1) \ vmovss(xmm1, mem(rcx, r13, 1)) \ vmovss(xmm2, mem(rcx, rsi, 4)) \ vpermilps(imm(0x39), xmm2, xmm1) \ vmovss(xmm1, mem(rcx, r15, 1)) \ vpermilps(imm(0x39), xmm1, xmm2) \ vmovss(xmm2, mem(rcx, r13, 2)) \ vpermilps(imm(0x39), xmm2, xmm1) \ vmovss(xmm1, mem(rcx, r10, 1)) void bli_sgemm_haswell_asm_6x16 ( dim_t m, dim_t n, dim_t k, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k / 4; uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT_AMBI( s, 6, 16, true ); begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. 
add(imm(32*4), rbx) // initialize loop by pre-loading vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLPREFETCH) // jump to column prefetch case lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c jmp(.SPREFETCHDONE) label(.SCOLPREFETCH) lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 7*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 7*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 7*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 7*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, r13, 1, 7*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 4, 7*8)) // prefetch c + 7*cs_c lea(mem(rcx, rsi, 8), r14) // r14 = c + 8*cs_c; lea(mem(r14, r13, 1), rdx) // rdx = c + 11*cs_c; prefetch(0, mem(r14, 7*8)) // prefetch c + 8*cs_c prefetch(0, mem(r14, rsi, 1, 7*8)) // prefetch c + 9*cs_c prefetch(0, mem(r14, rsi, 2, 7*8)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 11*cs_c prefetch(0, mem(rdx, rsi, 1, 7*8)) // prefetch c + 12*cs_c prefetch(0, mem(rdx, rsi, 2, 7*8)) // prefetch c + 13*cs_c prefetch(0, mem(rdx, r13, 1, 7*8)) // prefetch c + 14*cs_c prefetch(0, mem(rdx, rsi, 4, 7*8)) // prefetch c + 15*cs_c label(.SPREFETCHDONE) 
mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // iteration 0 prefetch(0, mem(rax, 64*4)) vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) vmovaps(mem(rbx, -2*32), ymm0) vmovaps(mem(rbx, -1*32), ymm1) // iteration 1 vbroadcastss(mem(rax, 6*4), ymm2) vbroadcastss(mem(rax, 7*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 8*4), ymm2) vbroadcastss(mem(rax, 9*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 10*4), ymm2) vbroadcastss(mem(rax, 11*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) vmovaps(mem(rbx, 0*32), ymm0) vmovaps(mem(rbx, 1*32), ymm1) // iteration 2 prefetch(0, mem(rax, 76*4)) vbroadcastss(mem(rax, 12*4), ymm2) vbroadcastss(mem(rax, 13*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 14*4), ymm2) vbroadcastss(mem(rax, 15*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 16*4), ymm2) vbroadcastss(mem(rax, 
17*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) vmovaps(mem(rbx, 2*32), ymm0) vmovaps(mem(rbx, 3*32), ymm1) // iteration 3 vbroadcastss(mem(rax, 18*4), ymm2) vbroadcastss(mem(rax, 19*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 20*4), ymm2) vbroadcastss(mem(rax, 21*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 22*4), ymm2) vbroadcastss(mem(rax, 23*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) add(imm(4*6*4), rax) // a += 4*6 (unroll x mr) add(imm(4*16*4), rbx) // b += 4*16 (unroll x nr) vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP prefetch(0, mem(rax, 64*4)) vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) add(imm(1*6*4), rax) // a += 1*6 (unroll x mr) add(imm(1*16*4), rbx) // b += 1*16 (unroll x nr) vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm11, ymm11) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm13, ymm13) vmulps(ymm0, ymm14, ymm14) vmulps(ymm0, ymm15, ymm15) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rcx, rsi, 8), rdx) // load address of c + 8*cs_c; lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*cs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case vfmadd231ps(mem(rcx), ymm3, ymm4) vmovups(ymm4, mem(rcx)) vfmadd231ps(mem(rcx,32), ymm3, ymm5) vmovups(ymm5, mem(rcx,32)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm6) vmovups(ymm6, mem(rcx)) vfmadd231ps(mem(rcx,32), ymm3, ymm7) vmovups(ymm7, mem(rcx,32)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm8) vmovups(ymm8, mem(rcx)) vfmadd231ps(mem(rcx,32), ymm3, ymm9) vmovups(ymm9, mem(rcx,32)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm10) vmovups(ymm10, mem(rcx)) vfmadd231ps(mem(rcx,32), ymm3, ymm11) vmovups(ymm11, mem(rcx,32)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm12) vmovups(ymm12, mem(rcx)) vfmadd231ps(mem(rcx,32), ymm3, ymm13) vmovups(ymm13, mem(rcx,32)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm14) vmovups(ymm14, mem(rcx)) vfmadd231ps(mem(rcx,32), ymm3, ymm15) vmovups(ymm15, mem(rcx,32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORED) vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, r15, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, r13, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), 
ymm1, xmm2) vfmadd231ps(mem(rcx, r13, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, r10, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(r14), xmm1, xmm1) vmovhpd(mem(r14, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(mem(r14, rsi, 4), xmm1, xmm1) vmovhpd(mem(r14, r15, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(r14, rsi, 2), xmm1, xmm1) vmovhpd(mem(r14, r13, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) vmovlpd(mem(r14, r13, 2), xmm1, xmm1) vmovhpd(mem(r14, r10, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, r15, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm7, 
ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, r13, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, r13, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, r10, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(r14), xmm1, xmm1) vmovhpd(mem(r14, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(mem(r14, rsi, 4), xmm1, xmm1) vmovhpd(mem(r14, r15, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(r14, rsi, 2), xmm1, xmm1) vmovhpd(mem(r14, r13, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) vmovlpd(mem(r14, r13, 2), xmm1, xmm1) vmovhpd(mem(r14, r10, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*cs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case vmovups(ymm4, mem(rcx)) vmovups(ymm5, mem(rcx,32)) add(rdi, rcx) vmovups(ymm6, mem(rcx)) vmovups(ymm7, mem(rcx,32)) add(rdi, rcx) vmovups(ymm8, mem(rcx)) vmovups(ymm9, mem(rcx,32)) add(rdi, rcx) vmovups(ymm10, mem(rcx)) vmovups(ymm11, mem(rcx,32)) add(rdi, rcx) vmovups(ymm12, mem(rcx)) vmovups(ymm13, mem(rcx,32)) add(rdi, rcx) vmovups(ymm14, mem(rcx)) vmovups(ymm15, mem(rcx,32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) 
vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) vmovlpd(xmm2, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_ label(.SDONE) vzeroupper() end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a] "m" (a), 
// 2 [b] "m" (b), // 3 [alpha] "m" (alpha), // 4 [beta] "m" (beta), // 5 [c] "m" (c), // 6 [rs_c] "m" (rs_c), // 7 [cs_c] "m" (cs_c)/*, // 8 [b_next] "m" (b_next), // 9 [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMM_UKR_FLUSH_CT( s ); } /* DGEMM_INPUT_GS_BETA_NZ: gather four doubles from rcx, rcx+rsi, rcx+2*rsi and rcx+r13 into ymm0 (low half through xmm0, high half through xmm1). Presumably r13 = 3*rsi - verify at the use site; the macro is not invoked by the dgemm kernel that follows. */ #define DGEMM_INPUT_GS_BETA_NZ \ vmovlpd(mem(rcx), xmm0, xmm0) \ vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) \ vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) \ vmovhpd(mem(rcx, r13, 1), xmm1, xmm1) \ vperm2f128(imm(0x20), ymm1, ymm0, ymm0) /*\ vmovlpd(mem(rcx, rsi, 4), xmm2, xmm2) \ vmovhpd(mem(rcx, r15, 1), xmm2, xmm2) \ vmovlpd(mem(rcx, r13, 2), xmm1, xmm1) \ vmovhpd(mem(rcx, r10, 1), xmm1, xmm1) \ vperm2f128(imm(0x20), ymm1, ymm2, ymm2)*/ /* DGEMM_OUTPUT_GS_BETA_NZ: scatter the four doubles of ymm0 to rcx, rcx+rsi, rcx+2*rsi and rcx+r13; clobbers xmm1. Not invoked by the dgemm kernel that follows. */ #define DGEMM_OUTPUT_GS_BETA_NZ \ vextractf128(imm(1), ymm0, xmm1) \ vmovlpd(xmm0, mem(rcx)) \ vmovhpd(xmm0, mem(rcx, rsi, 1)) \ vmovlpd(xmm1, mem(rcx, rsi, 2)) \ vmovhpd(xmm1, mem(rcx, r13, 1)) /*\ vextractf128(imm(1), ymm2, xmm1) \ vmovlpd(xmm2, mem(rcx, rsi, 4)) \ vmovhpd(xmm2, mem(rcx, r15, 1)) \ vmovlpd(xmm1, mem(rcx, r13, 2)) \ vmovhpd(xmm1, mem(rcx, r10, 1))*/ /* bli_dgemm_haswell_asm_6x8: double-precision GEMM micro-kernel for a 6x8 tile, C := beta*C + alpha*A*B, written as inline assembly using FMA on ymm registers. Per k step a supplies 6 doubles (broadcast one at a time) and b supplies 8 doubles (two vmovapd loads, which require 32-byte alignment). The twelve accumulators ymm4-ymm15 hold the tile as 6 rows x 2 vectors. k is split into k/4 passes of a 4x-unrolled main loop plus k%4 single-step edge passes. After scaling by alpha the tile is written to c: when beta == 0, C is stored without being read; when rs_c == 1 the tile is transposed in registers and stored column by column using cs_c; otherwise each row is stored as 8 contiguous doubles at stride rs_c. m, n, data and cntx are not referenced by name in this body; presumably GEMM_UKR_SETUP_CT_AMBI and GEMM_UKR_FLUSH_CT (defined elsewhere) consume them and handle other layouts - verify there. */ void bli_dgemm_haswell_asm_6x8 ( dim_t m, dim_t n, dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k / 4; uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT_AMBI( d, 6, 8, true ); begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. 
mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. add(imm(32*4), rbx) // bias rbx by 4*32 bytes (b is then addressed at offsets -4*32..3*32) and pre-load the first two vectors of b vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPREFETCH) // jump to column prefetch case lea(mem(rdi, rdi, 2), r13) // r13 = 3*rs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c jmp(.DPREFETCHDONE) label(.DCOLPREFETCH) lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 7*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 7*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 7*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 7*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, r13, 1, 7*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 4, 7*8)) // prefetch c + 7*cs_c label(.DPREFETCHDONE) mov(var(k_iter), rsi) // i = k_iter; (rsi is reused as the loop counter; cs_c is reloaded into rsi after the loops) test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) /* per k step: ymm0/ymm1 hold 8 doubles of b and six broadcasts of a feed the accumulators; row i uses ymm(4+2i) for columns 0-3 and ymm(5+2i) for columns 4-7 */ // MAIN LOOP // iteration 0 prefetch(0, mem(rax, 64*8)) vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) vmovapd(mem(rbx, -2*32), ymm0) vmovapd(mem(rbx, -1*32), ymm1) // iteration 1 prefetch(0, mem(rax, 72*8)) vbroadcastsd(mem(rax, 6*8), ymm2) vbroadcastsd(mem(rax, 7*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 8*8), ymm2) vbroadcastsd(mem(rax, 9*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 10*8), ymm2) vbroadcastsd(mem(rax, 11*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) vmovapd(mem(rbx, 0*32), ymm0) vmovapd(mem(rbx, 1*32), ymm1) // iteration 2 prefetch(0, mem(rax, 80*8)) vbroadcastsd(mem(rax, 12*8), ymm2) vbroadcastsd(mem(rax, 13*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 14*8), ymm2) vbroadcastsd(mem(rax, 15*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 16*8), ymm2) vbroadcastsd(mem(rax, 17*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, 
ymm15) vmovapd(mem(rbx, 2*32), ymm0) vmovapd(mem(rbx, 3*32), ymm1) // iteration 3 vbroadcastsd(mem(rax, 18*8), ymm2) vbroadcastsd(mem(rax, 19*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 20*8), ymm2) vbroadcastsd(mem(rax, 21*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 22*8), ymm2) vbroadcastsd(mem(rax, 23*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) add(imm(4*6*8), rax) // a += 4*6 (unroll x mr) add(imm(4*8*8), rbx) // b += 4*8 (unroll x nr) vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP prefetch(0, mem(rax, 64*8)) vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) add(imm(1*6*8), rax) // a += 1*6 (unroll x mr) add(imm(1*8*8), rbx) // b += 1*8 (unroll x nr) vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm11, ymm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(ymm0, ymm15, ymm15) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; //lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; //lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case /* row-stored path: each row is accessed as 8 contiguous doubles; acc = beta*C + acc, then the row is stored and rcx advances by rs_c */ vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) vfmadd231pd(mem(rcx,32), ymm3, ymm5) vmovupd(ymm5, mem(rcx,32)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) vfmadd231pd(mem(rcx,32), ymm3, ymm7) vmovupd(ymm7, mem(rcx,32)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm8) vmovupd(ymm8, mem(rcx)) vfmadd231pd(mem(rcx,32), ymm3, ymm9) vmovupd(ymm9, mem(rcx,32)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm10) vmovupd(ymm10, mem(rcx)) vfmadd231pd(mem(rcx,32), ymm3, ymm11) vmovupd(ymm11, mem(rcx,32)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm12) vmovupd(ymm12, mem(rcx)) vfmadd231pd(mem(rcx,32), ymm3, ymm13) vmovupd(ymm13, mem(rcx,32)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm14) vmovupd(ymm14, mem(rcx)) vfmadd231pd(mem(rcx,32), ymm3, ymm15) vmovupd(ymm15, mem(rcx,32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORED) /* column-stored path: transpose rows 0-3 of columns 0-3 so that ymm4, ymm6, ymm8 and ymm10 each hold one column; ymm3 is used as scratch, so beta is re-broadcast from rbx before the update */ vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, r13, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, r13, 1)) lea(mem(rcx, rsi, 4), rcx) /* rows 4-5 of columns 0-3: one 2-double update and store per column, addressed from r14 = c + 4*rs_c */ vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(r14), xmm3, xmm0) vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2) vfmadd231pd(mem(r14, r13, 1), xmm3, xmm4) vmovupd(xmm0, mem(r14)) vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, mem(r14, rsi, 2)) vmovupd(xmm4, mem(r14, r13, 1)) lea(mem(r14, rsi, 4), r14) /* same sequence for columns 4-7 */ vunpcklpd(ymm7, ymm5, 
ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm5) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) vfmadd231pd(mem(rcx, r13, 1), ymm3, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, r13, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(r14), xmm3, xmm0) vfmadd231pd(mem(r14, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(r14, rsi, 2), xmm3, xmm2) vfmadd231pd(mem(r14, r13, 1), xmm3, xmm4) vmovupd(xmm0, mem(r14)) vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, mem(r14, rsi, 2)) vmovupd(xmm4, mem(r14, r13, 1)) //lea(mem(r14, rsi, 4), r14) jmp(.DDONE) // jump to end. label(.DBETAZERO) /* beta == 0: store the alpha-scaled accumulators without reading C */ cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case vmovupd(ymm4, mem(rcx)) vmovupd(ymm5, mem(rcx,32)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) vmovupd(ymm7, mem(rcx,32)) add(rdi, rcx) vmovupd(ymm8, mem(rcx)) vmovupd(ymm9, mem(rcx,32)) add(rdi, rcx) vmovupd(ymm10, mem(rcx)) vmovupd(ymm11, mem(rcx,32)) add(rdi, rcx) vmovupd(ymm12, mem(rcx)) vmovupd(ymm13, mem(rcx,32)) add(rdi, rcx) vmovupd(ymm14, mem(rcx)) vmovupd(ymm15, mem(rcx,32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, r13, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(r14)) vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, mem(r14, rsi, 2)) vmovupd(xmm4, mem(r14, r13, 1)) lea(mem(r14, rsi, 4), r14) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, r13, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(r14)) vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, mem(r14, rsi, 2)) vmovupd(xmm4, mem(r14, r13, 1)) //lea(mem(r14, rsi, 4), r14) label(.DDONE) vzeroupper() end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a] "m" (a), // 2 [b] "m" (b), // 3 [alpha] "m" (alpha), // 4 [beta] "m" (beta), // 5 [c] "m" (c), // 6 [rs_c] "m" (rs_c), // 7 [cs_c] "m" (cs_c)/*, // 8 [b_next] "m" (b_next), // 9 [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", 
"xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMM_UKR_FLUSH_CT( d ); } /* CGEMM_INPUT_SCALE_RS_BETA_NZ(where): load 4 scomplex from where into ymm0 and multiply them by beta, leaving the product in ymm0. Expects ymm1 = broadcast beta_r and ymm2 = broadcast beta_i; clobbers ymm3 (the real/imag-swapped copy of the loaded values). */ #define CGEMM_INPUT_SCALE_RS_BETA_NZ(where) \ vmovups(where, ymm0) \ vpermilps(imm(0xb1), ymm0, ymm3) \ vmulps(ymm1, ymm0, ymm0) \ vmulps(ymm2, ymm3, ymm3) \ vaddsubps(ymm3, ymm0, ymm0) /* bli_cgemm_haswell_asm_3x8: single-precision complex GEMM micro-kernel for a 3x8 tile, C := beta*C + alpha*A*B, written as inline assembly. a and b are read as interleaved (real, imag) floats: each k step broadcasts the 6 floats of three scomplex elements of a and loads 8 scomplex of b with two vmovaps loads (which require 32-byte alignment). ymm4/5, ymm8/9 and ymm12/13 accumulate a_real*b for rows 0, 1 and 2; ymm6/7, ymm10/11 and ymm14/15 accumulate a_imag*b. After the loops these are combined into complex products, scaled by alpha and written to the rows at c, c+rs_c and c+2*rs_c. The assembly loads only rs_c and accesses every row as 8 contiguous scomplex; cs_c is listed as an operand but never loaded. m, n, data and cntx are not referenced by name in this body; presumably GEMM_UKR_SETUP_CT and GEMM_UKR_FLUSH_CT (defined elsewhere) consume them and handle other layouts - verify there. */ void bli_cgemm_haswell_asm_3x8 ( dim_t m, dim_t n, dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k / 4; uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT( c, 3, 8, true ); begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. add(imm(32*4), rbx) // initialize loop by pre-loading vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(scomplex) lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*rs_c; lea(mem(rcx, rdi, 2), r12) // r12 = c + 2*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(r11, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(r12, 7*8)) // prefetch c + 2*rs_c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.CLOOPKITER) /* 4x-unrolled k loop of the cgemm 3x8 kernel: per k step ymm0/ymm1 hold 8 scomplex of b and the six broadcasts are the (real, imag) floats of three scomplex of a; even-numbered broadcasts (real parts) feed ymm4/5, ymm8/9, ymm12/13 and odd-numbered ones (imaginary parts) feed ymm6/7, ymm10/11, ymm14/15 */ // MAIN LOOP // iteration 0 prefetch(0, mem(rax, 32*8)) vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) vmovaps(mem(rbx, -2*32), ymm0) vmovaps(mem(rbx, -1*32), ymm1) // iteration 1 vbroadcastss(mem(rax, 6*4), ymm2) vbroadcastss(mem(rax, 7*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 8*4), ymm2) vbroadcastss(mem(rax, 9*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 10*4), ymm2) vbroadcastss(mem(rax, 11*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) vmovaps(mem(rbx, 0*32), ymm0) vmovaps(mem(rbx, 1*32), ymm1) // iteration 2 prefetch(0, mem(rax, 38*8)) vbroadcastss(mem(rax, 12*4), ymm2) vbroadcastss(mem(rax, 13*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 14*4), ymm2) vbroadcastss(mem(rax, 15*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 16*4), ymm2) vbroadcastss(mem(rax, 17*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) vmovaps(mem(rbx, 2*32), 
ymm0) vmovaps(mem(rbx, 3*32), ymm1) // iteration 3 vbroadcastss(mem(rax, 18*4), ymm2) vbroadcastss(mem(rax, 19*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 20*4), ymm2) vbroadcastss(mem(rax, 21*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 22*4), ymm2) vbroadcastss(mem(rax, 23*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) add(imm(4*3*8), rax) // a += 4*3 (unroll x mr) add(imm(4*8*8), rbx) // b += 4*8 (unroll x nr) vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.CLOOPKITER) // iterate again if i != 0. label(.CCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.CLOOPKLEFT) // EDGE LOOP prefetch(0, mem(rax, 32*8)) vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) add(imm(1*3*8), rax) // a += 1*3 (unroll x mr) add(imm(1*8*8), rbx) // b += 1*8 (unroll x nr) vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.CLOOPKLEFT) // iterate again if i != 0. 
label(.CPOSTACCUM) // permute even and odd elements // of ymm6/7, ymm10/11, ymm14/15 vpermilps(imm(0xb1), ymm6, ymm6) vpermilps(imm(0xb1), ymm7, ymm7) vpermilps(imm(0xb1), ymm10, ymm10) vpermilps(imm(0xb1), ymm11, ymm11) vpermilps(imm(0xb1), ymm14, ymm14) vpermilps(imm(0xb1), ymm15, ymm15) // subtract/add even/odd elements vaddsubps(ymm6, ymm4, ymm4) vaddsubps(ymm7, ymm5, ymm5) vaddsubps(ymm10, ymm8, ymm8) vaddsubps(ymm11, ymm9, ymm9) vaddsubps(ymm14, ymm12, ymm12) vaddsubps(ymm15, ymm13, ymm13) mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate vpermilps(imm(0xb1), ymm4, ymm3) vmulps(ymm0, ymm4, ymm4) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm4, ymm4) vpermilps(imm(0xb1), ymm5, ymm3) vmulps(ymm0, ymm5, ymm5) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm5, ymm5) vpermilps(imm(0xb1), ymm8, ymm3) vmulps(ymm0, ymm8, ymm8) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm8, ymm8) vpermilps(imm(0xb1), ymm9, ymm3) vmulps(ymm0, ymm9, ymm9) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm9, ymm9) vpermilps(imm(0xb1), ymm12, ymm3) vmulps(ymm0, ymm12, ymm12) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm12, ymm12) vpermilps(imm(0xb1), ymm13, ymm3) vmulps(ymm0, ymm13, ymm13) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm13, ymm13) mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm1) // set ZF if beta_r == 0. sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); vucomiss(xmm0, xmm2) // set ZF if beta_i == 0. sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // r9b = r8b & r9b; ZF is cleared (ZF = 0) only if both r8b and r9b were 1. 
jne(.CBETAZERO) // if ZF = 0 (r8b & r9b == 1), jump to beta == 0 case CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(rcx)) vaddps(ymm4, ymm0, ymm0) vmovups(ymm0, mem(rcx)) CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(rcx,32)) vaddps(ymm5, ymm0, ymm0) vmovups(ymm0, mem(rcx,32)) CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r11)) vaddps(ymm8, ymm0, ymm0) vmovups(ymm0, mem(r11)) CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r11,32)) vaddps(ymm9, ymm0, ymm0) vmovups(ymm0, mem(r11,32)) CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r12)) vaddps(ymm12, ymm0, ymm0) vmovups(ymm0, mem(r12)) CGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r12,32)) vaddps(ymm13, ymm0, ymm0) vmovups(ymm0, mem(r12,32)) jmp(.CDONE) // jump to end. label(.CBETAZERO) /* beta == 0: store the alpha-scaled products without reading C */ vmovups(ymm4, mem(rcx)) vmovups(ymm5, mem(rcx,32)) vmovups(ymm8, mem(r11)) vmovups(ymm9, mem(r11,32)) vmovups(ymm12, mem(r12)) vmovups(ymm13, mem(r12,32)) label(.CDONE) vzeroupper() end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a] "m" (a), // 2 [b] "m" (b), // 3 [alpha] "m" (alpha), // 4 [beta] "m" (beta), // 5 [c] "m" (c), // 6 [rs_c] "m" (rs_c), // 7 [cs_c] "m" (cs_c)/*, // 8 [b_next] "m" (b_next), // 9 [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMM_UKR_FLUSH_CT( c ); } /* ZGEMM_INPUT_SCALE_RS_BETA_NZ(where): load 2 dcomplex from where into ymm0, build a real/imag-swapped copy in ymm3, multiply ymm0 by ymm1 and ymm3 by ymm2, then combine with vaddsubpd into ymm0. Presumably ymm1/ymm2 hold broadcast beta_r/beta_i, mirroring the single-precision complex variant - verify at the use site. */ #define ZGEMM_INPUT_SCALE_RS_BETA_NZ(where) \ vmovupd(where, ymm0) \ vpermilpd(imm(0x5), ymm0, ymm3) \ vmulpd(ymm1, ymm0, ymm0) \ vmulpd(ymm2, ymm3, ymm3) \ vaddsubpd(ymm3, ymm0, ymm0) void bli_zgemm_haswell_asm_3x4 ( dim_t m, dim_t n, dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local 
copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k / 4; uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT( z, 3, 4, true ); begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. add(imm(32*4), rbx) // initialize loop by pre-loading vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dcomplex) lea(mem(, rdi, 2), rdi) lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*rs_c; lea(mem(rcx, rdi, 2), r12) // r12 = c + 2*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(r11, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(r12, 7*8)) // prefetch c + 2*rs_c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.ZCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.ZLOOPKITER) // MAIN LOOP // iteration 0 prefetch(0, mem(rax, 32*16)) vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) vmovapd(mem(rbx, -2*32), ymm0) vmovapd(mem(rbx, -1*32), ymm1) // iteration 1 prefetch(0, mem(rax, 36*16)) vbroadcastsd(mem(rax, 6*8), ymm2) vbroadcastsd(mem(rax, 7*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 8*8), ymm2) vbroadcastsd(mem(rax, 9*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 10*8), ymm2) vbroadcastsd(mem(rax, 11*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) vmovapd(mem(rbx, 0*32), ymm0) vmovapd(mem(rbx, 1*32), ymm1) // iteration 2 prefetch(0, mem(rax, 40*16)) vbroadcastsd(mem(rax, 12*8), ymm2) vbroadcastsd(mem(rax, 13*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 14*8), ymm2) vbroadcastsd(mem(rax, 15*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 16*8), ymm2) vbroadcastsd(mem(rax, 17*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, 
ymm15) vmovapd(mem(rbx, 2*32), ymm0) vmovapd(mem(rbx, 3*32), ymm1) // iteration 3 vbroadcastsd(mem(rax, 18*8), ymm2) vbroadcastsd(mem(rax, 19*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 20*8), ymm2) vbroadcastsd(mem(rax, 21*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 22*8), ymm2) vbroadcastsd(mem(rax, 23*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) add(imm(4*3*16), rax) // a += 4*3 (unroll x mr) add(imm(4*4*16), rbx) // b += 4*4 (unroll x nr) vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.ZLOOPKITER) // iterate again if i != 0. label(.ZCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.ZLOOPKLEFT) // EDGE LOOP prefetch(0, mem(rax, 32*16)) vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) add(imm(1*3*16), rax) // a += 1*3 (unroll x mr) add(imm(1*4*16), rbx) // b += 1*4 (unroll x nr) vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.ZLOOPKLEFT) // iterate again if i != 0. 
label(.ZPOSTACCUM) // permute even and odd elements // of ymm6/7, ymm10/11, ymm/14/15 vpermilpd(imm(0x5), ymm6, ymm6) vpermilpd(imm(0x5), ymm7, ymm7) vpermilpd(imm(0x5), ymm10, ymm10) vpermilpd(imm(0x5), ymm11, ymm11) vpermilpd(imm(0x5), ymm14, ymm14) vpermilpd(imm(0x5), ymm15, ymm15) // subtract/add even/odd elements vaddsubpd(ymm6, ymm4, ymm4) vaddsubpd(ymm7, ymm5, ymm5) vaddsubpd(ymm10, ymm8, ymm8) vaddsubpd(ymm11, ymm9, ymm9) vaddsubpd(ymm14, ymm12, ymm12) vaddsubpd(ymm15, ymm13, ymm13) mov(var(alpha), rax) // load address of alpha vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate vpermilpd(imm(0x5), ymm4, ymm3) vmulpd(ymm0, ymm4, ymm4) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm4, ymm4) vpermilpd(imm(0x5), ymm5, ymm3) vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm5, ymm5) vpermilpd(imm(0x5), ymm8, ymm3) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm8, ymm8) vpermilpd(imm(0x5), ymm9, ymm3) vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm9, ymm9) vpermilpd(imm(0x5), ymm12, ymm3) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm12, ymm12) vpermilpd(imm(0x5), ymm13, ymm3) vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm13, ymm13) mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm1) // set ZF if beta_r == 0. sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); vucomisd(xmm0, xmm2) // set ZF if beta_i == 0. sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // set ZF if r8b & r9b == 1. 
jne(.ZBETAZERO) // if ZF = 1, jump to beta == 0 case ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(rcx)) vaddpd(ymm4, ymm0, ymm0) vmovupd(ymm0, mem(rcx)) ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(rcx,32)) vaddpd(ymm5, ymm0, ymm0) vmovupd(ymm0, mem(rcx,32)) ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r11)) vaddpd(ymm8, ymm0, ymm0) vmovupd(ymm0, mem(r11)) ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r11,32)) vaddpd(ymm9, ymm0, ymm0) vmovupd(ymm0, mem(r11,32)) ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r12)) vaddpd(ymm12, ymm0, ymm0) vmovupd(ymm0, mem(r12)) ZGEMM_INPUT_SCALE_RS_BETA_NZ(mem(r12,32)) vaddpd(ymm13, ymm0, ymm0) vmovupd(ymm0, mem(r12,32)) jmp(.ZDONE) // jump to end. label(.ZBETAZERO) vmovupd(ymm4, mem(rcx)) vmovupd(ymm5, mem(rcx,32)) vmovupd(ymm8, mem(r11)) vmovupd(ymm9, mem(r11,32)) vmovupd(ymm12, mem(r12)) vmovupd(ymm13, mem(r12,32)) label(.ZDONE) vzeroupper() end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a] "m" (a), // 2 [b] "m" (b), // 3 [alpha] "m" (alpha), // 4 [beta] "m" (beta), // 5 [c] "m" (c), // 6 [rs_c] "m" (rs_c), // 7 [cs_c] "m" (cs_c)/*, // 8 [b_next] "m" (b_next), // 9 [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMM_UKR_FLUSH_CT( z ); } cython-blis-0.9.1/blis/_src/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c000066400000000000000000001167141427272030600262630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"

// Gathers eight floats from rcx at byte offsets 0, rsi, 2*rsi, r13,
// 4*rsi, r15, 2*r13 and r10 into ymm0, using xmm1/xmm2 as scratch.
// Presumably r13, r15 and r10 hold 3*rsi, 5*rsi and 7*rsi; that setup is
// not shown here. Not referenced by bli_sgemm_haswell_asm_16x6 below.
#define SGEMM_INPUT_GS_BETA_NZ \
	vmovlps(mem(rcx), xmm0, xmm0) \
	vmovhps(mem(rcx, rsi, 1), xmm0, xmm0) \
	vmovlps(mem(rcx, rsi, 2), xmm1, xmm1) \
	vmovhps(mem(rcx, r13, 1), xmm1, xmm1) \
	vshufps(imm(0x88), xmm1, xmm0, xmm0) \
	vmovlps(mem(rcx, rsi, 4), xmm2, xmm2) \
	vmovhps(mem(rcx, r15, 1), xmm2, xmm2) \
	/* We can't use vmovhps for loading the last element because that
	   might result in reading beyond valid memory. (vmov[lh]ps load
	   pairs of adjacent floats at a time.) So we need to use vmovss
	   instead. But since we're limited to using ymm0 through ymm2
	   (ymm3 contains beta and ymm4 through ymm15 contain the microtile)
	   and due to the way vmovss zeros out all bits above 31, we have to
	   load element 7 before element 6. */ \
	vmovss(mem(rcx, r10, 1), xmm1) \
	vpermilps(imm(0xcf), xmm1, xmm1) \
	vmovlps(mem(rcx, r13, 2), xmm1, xmm1) \
	/*vmovhps(mem(rcx, r10, 1), xmm1, xmm1)*/ \
	vshufps(imm(0x88), xmm1, xmm2, xmm2) \
	vperm2f128(imm(0x20), ymm2, ymm0, ymm0)

// Scatters the eight floats of ymm0 to rcx at the same eight byte offsets
// used by SGEMM_INPUT_GS_BETA_NZ, rotating elements into lane 0 with
// vpermilps and storing them one at a time with vmovss. Clobbers
// xmm0-xmm2. Not referenced by bli_sgemm_haswell_asm_16x6 below.
#define SGEMM_OUTPUT_GS_BETA_NZ \
	vextractf128(imm(1), ymm0, xmm2) \
	vmovss(xmm0, mem(rcx)) \
	vpermilps(imm(0x39), xmm0, xmm1) \
	vmovss(xmm1, mem(rcx, rsi, 1)) \
	vpermilps(imm(0x39), xmm1, xmm0) \
	vmovss(xmm0, mem(rcx, rsi, 2)) \
	vpermilps(imm(0x39), xmm0, xmm1) \
	vmovss(xmm1, mem(rcx, r13, 1)) \
	vmovss(xmm2, mem(rcx, rsi, 4)) \
	vpermilps(imm(0x39), xmm2, xmm1) \
	vmovss(xmm1, mem(rcx, r15, 1)) \
	vpermilps(imm(0x39), xmm1, xmm2) \
	vmovss(xmm2, mem(rcx, r13, 2)) \
	vpermilps(imm(0x39), xmm2, xmm1) \
	vmovss(xmm1, mem(rcx, r10, 1))

// Single-precision gemm microkernel (AVX2/FMA inline assembly) that
// updates a 16x6 micro-tile of C as C := beta * C + alpha * A * B.
//
// - k is split into k / 4 passes of a 4x-unrolled main loop plus k % 4
//   passes of an edge loop.
// - Each pass loads one 16-element column of A (two aligned 32-byte loads
//   through rax, so `a` must be 32-byte aligned) and broadcasts six
//   elements of B (through rbx).
// - Column j of the tile is accumulated in ymm(4+2j) / ymm(5+2j) and is
//   read/written as two contiguous 32-byte vectors at c + j*cs_c.
// - When beta compares equal to zero, C is overwritten without being read.
// - m, n, rs_c0, data and cntx are not referenced by the assembly itself;
//   presumably they are consumed by GEMM_UKR_SETUP_CT / GEMM_UKR_FLUSH_CT,
//   which are defined elsewhere -- TODO confirm.
//
// The kernel issues vzeroupper() before leaving the assembly block, as the
// complex haswell kernels do, so that no dirty upper-YMM state is carried
// into code that may use legacy SSE encodings.
void bli_sgemm_haswell_asm_16x6
     (
       dim_t               m,
       dim_t               n,
       dim_t               k,
       float*     restrict alpha,
       float*     restrict a,
       float*     restrict b,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k / 4;
	uint64_t k_left = k % 4;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	GEMM_UKR_SETUP_CT( s, 16, 6, false );

	begin_asm()

	vzeroall() // zero all xmm/ymm registers.

	mov(var(a), rax) // load address of a.
	mov(var(b), rbx) // load address of b.
	//mov(%9, r15) // load address of b_next.

	// bias rax by 128 bytes so the A loads below can use offsets
	// in the range -4*32 .. 3*32.
	add(imm(32*4), rax)
	// initialize loop by pre-loading
	vmovaps(mem(rax, -4*32), ymm0)
	vmovaps(mem(rax, -3*32), ymm1)

	mov(var(c), rcx) // load address of c
	mov(var(cs_c), rdi) // load cs_c
	lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)

	lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c;
	lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c;
	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*cs_c
	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*cs_c
	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*cs_c

	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.SCONSIDKLEFT) // if i == 0, jump to code that
	// contains the k_left loop.

	label(.SLOOPKITER) // MAIN LOOP

	// iteration 0
	prefetch(0, mem(rax, 128*4))

	vbroadcastss(mem(rbx, 0*4), ymm2)
	vbroadcastss(mem(rbx, 1*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rbx, 2*4), ymm2)
	vbroadcastss(mem(rbx, 3*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rbx, 4*4), ymm2)
	vbroadcastss(mem(rbx, 5*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	vmovaps(mem(rax, -2*32), ymm0)
	vmovaps(mem(rax, -1*32), ymm1)

	// iteration 1
	vbroadcastss(mem(rbx, 6*4), ymm2)
	vbroadcastss(mem(rbx, 7*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rbx, 8*4), ymm2)
	vbroadcastss(mem(rbx, 9*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rbx, 10*4), ymm2)
	vbroadcastss(mem(rbx, 11*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	vmovaps(mem(rax, 0*32), ymm0)
	vmovaps(mem(rax, 1*32), ymm1)

	// iteration 2
	prefetch(0, mem(rax, 152*4))

	vbroadcastss(mem(rbx, 12*4), ymm2)
	vbroadcastss(mem(rbx, 13*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rbx, 14*4), ymm2)
	vbroadcastss(mem(rbx, 15*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rbx, 16*4), ymm2)
	vbroadcastss(mem(rbx, 17*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	vmovaps(mem(rax, 2*32), ymm0)
	vmovaps(mem(rax, 3*32), ymm1)

	// iteration 3
	vbroadcastss(mem(rbx, 18*4), ymm2)
	vbroadcastss(mem(rbx, 19*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rbx, 20*4), ymm2)
	vbroadcastss(mem(rbx, 21*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rbx, 22*4), ymm2)
	vbroadcastss(mem(rbx, 23*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	add(imm(4*16*4), rax) // a += 4*16 (unroll x mr)
	add(imm(4*6*4), rbx) // b += 4*6 (unroll x nr)

	// pre-load the first column of A for the next pass.
	vmovaps(mem(rax, -4*32), ymm0)
	vmovaps(mem(rax, -3*32), ymm1)

	dec(rsi) // i -= 1;
	jne(.SLOOPKITER) // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi) // i = k_left;
	test(rsi, rsi) // check i via logical AND.
	je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
	// else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT) // EDGE LOOP

	prefetch(0, mem(rax, 128*4))

	vbroadcastss(mem(rbx, 0*4), ymm2)
	vbroadcastss(mem(rbx, 1*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rbx, 2*4), ymm2)
	vbroadcastss(mem(rbx, 3*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rbx, 4*4), ymm2)
	vbroadcastss(mem(rbx, 5*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	add(imm(1*16*4), rax) // a += 1*16 (unroll x mr)
	add(imm(1*6*4), rbx) // b += 1*6 (unroll x nr)

	vmovaps(mem(rax, -4*32), ymm0)
	vmovaps(mem(rax, -3*32), ymm1)

	dec(rsi) // i -= 1;
	jne(.SLOOPKLEFT) // iterate again if i != 0.

	label(.SPOSTACCUM)

	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastss(mem(rbx), ymm3) // load beta and duplicate

	vmulps(ymm0, ymm4, ymm4) // scale by alpha
	vmulps(ymm0, ymm5, ymm5)
	vmulps(ymm0, ymm6, ymm6)
	vmulps(ymm0, ymm7, ymm7)
	vmulps(ymm0, ymm8, ymm8)
	vmulps(ymm0, ymm9, ymm9)
	vmulps(ymm0, ymm10, ymm10)
	vmulps(ymm0, ymm11, ymm11)
	vmulps(ymm0, ymm12, ymm12)
	vmulps(ymm0, ymm13, ymm13)
	vmulps(ymm0, ymm14, ymm14)
	vmulps(ymm0, ymm15, ymm15)

	// now avoid loading C if beta == 0
	vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomiss(xmm0, xmm3) // set ZF if beta == 0.
	je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case

	// beta != 0: accumulate beta * C into each result vector, store it,
	// and advance rcx by one column (cs_c bytes) between columns.
	vfmadd231ps(mem(rcx), ymm3, ymm4)
	vmovups(ymm4, mem(rcx))
	vfmadd231ps(mem(rcx,32), ymm3, ymm5)
	vmovups(ymm5, mem(rcx,32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx), ymm3, ymm6)
	vmovups(ymm6, mem(rcx))
	vfmadd231ps(mem(rcx,32), ymm3, ymm7)
	vmovups(ymm7, mem(rcx,32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx), ymm3, ymm8)
	vmovups(ymm8, mem(rcx))
	vfmadd231ps(mem(rcx,32), ymm3, ymm9)
	vmovups(ymm9, mem(rcx,32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx), ymm3, ymm10)
	vmovups(ymm10, mem(rcx))
	vfmadd231ps(mem(rcx,32), ymm3, ymm11)
	vmovups(ymm11, mem(rcx,32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx), ymm3, ymm12)
	vmovups(ymm12, mem(rcx))
	vfmadd231ps(mem(rcx,32), ymm3, ymm13)
	vmovups(ymm13, mem(rcx,32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx), ymm3, ymm14)
	vmovups(ymm14, mem(rcx))
	vfmadd231ps(mem(rcx,32), ymm3, ymm15)
	vmovups(ymm15, mem(rcx,32))
	//add(rdi, rcx)

	jmp(.SDONE) // jump to end.

	label(.SBETAZERO)

	// beta == 0: store the results without reading C.
	vmovups(ymm4, mem(rcx))
	vmovups(ymm5, mem(rcx,32))
	add(rdi, rcx)

	vmovups(ymm6, mem(rcx))
	vmovups(ymm7, mem(rcx,32))
	add(rdi, rcx)

	vmovups(ymm8, mem(rcx))
	vmovups(ymm9, mem(rcx,32))
	add(rdi, rcx)

	vmovups(ymm10, mem(rcx))
	vmovups(ymm11, mem(rcx,32))
	add(rdi, rcx)

	vmovups(ymm12, mem(rcx))
	vmovups(ymm13, mem(rcx,32))
	add(rdi, rcx)

	vmovups(ymm14, mem(rcx))
	vmovups(ymm15, mem(rcx,32))
	//add(rdi, rcx)

	label(.SDONE)

	// Clear the upper halves of the ymm registers before returning to
	// compiler-generated code, matching the complex haswell kernels.
	vzeroupper()

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter), // 0
	  [k_left] "m" (k_left), // 1
	  [a] "m" (a), // 2
	  [b] "m" (b), // 3
	  [alpha] "m" (alpha), // 4
	  [beta] "m" (beta), // 5
	  [c] "m" (c), // 6
	  [rs_c] "m" (rs_c), // 7
	  [cs_c] "m" (cs_c)/*, // 8
	  [b_next] "m" (b_next), // 9
	  [a_next] "m" (a_next)*/ // 10
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	GEMM_UKR_FLUSH_CT( s );
}

#define DGEMM_INPUT_GS_BETA_NZ \
vmovlpd(mem(rcx), xmm0, xmm0) \ vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) \ vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) \ vmovhpd(mem(rcx, r13, 1), xmm1, xmm1) \ vperm2f128(imm(0x20), ymm1, ymm0, ymm0) /*\ vmovlpd(mem(rcx, rsi, 4), xmm2, xmm2) \ vmovhpd(mem(rcx, r15, 1), xmm2, xmm2) \ vmovlpd(mem(rcx, r13, 2), xmm1, xmm1) \ vmovhpd(mem(rcx, r10, 1), xmm1, xmm1) \ vperm2f128(imm(0x20), ymm1, ymm2, ymm2)*/ #define DGEMM_OUTPUT_GS_BETA_NZ \ vextractf128(imm(1), ymm0, xmm1) \ vmovlpd(xmm0, mem(rcx)) \ vmovhpd(xmm0, mem(rcx, rsi, 1)) \ vmovlpd(xmm1, mem(rcx, rsi, 2)) \ vmovhpd(xmm1, mem(rcx, r13, 1)) /*\ vextractf128(imm(1), ymm2, xmm1) \ vmovlpd(xmm2, mem(rcx, rsi, 4)) \ vmovhpd(xmm2, mem(rcx, r15, 1)) \ vmovlpd(xmm1, mem(rcx, r13, 2)) \ vmovhpd(xmm1, mem(rcx, r10, 1))*/ void bli_dgemm_haswell_asm_8x6 ( dim_t m, dim_t n, dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k / 4; uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT( d, 8, 6, false ); begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. //mov(%9, r15) // load address of b_next. 
add(imm(32*4), rax) // initialize loop by pre-loading vmovapd(mem(rax, -4*32), ymm0) vmovapd(mem(rax, -3*32), ymm1) mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) lea(mem(rdi, rdi, 2), r13) // r13 = 3*cs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*cs_c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.DLOOPKITER) // MAIN LOOP // iteration 0 prefetch(0, mem(rax, 64*8)) vbroadcastsd(mem(rbx, 0*8), ymm2) vbroadcastsd(mem(rbx, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rbx, 2*8), ymm2) vbroadcastsd(mem(rbx, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rbx, 4*8), ymm2) vbroadcastsd(mem(rbx, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) vmovapd(mem(rax, -2*32), ymm0) vmovapd(mem(rax, -1*32), ymm1) // iteration 1 vbroadcastsd(mem(rbx, 6*8), ymm2) vbroadcastsd(mem(rbx, 7*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rbx, 8*8), ymm2) vbroadcastsd(mem(rbx, 9*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rbx, 10*8), ymm2) vbroadcastsd(mem(rbx, 11*8), ymm3) 
vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) vmovapd(mem(rax, 0*32), ymm0) vmovapd(mem(rax, 1*32), ymm1) // iteration 2 prefetch(0, mem(rax, 76*8)) vbroadcastsd(mem(rbx, 12*8), ymm2) vbroadcastsd(mem(rbx, 13*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rbx, 14*8), ymm2) vbroadcastsd(mem(rbx, 15*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rbx, 16*8), ymm2) vbroadcastsd(mem(rbx, 17*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) vmovapd(mem(rax, 2*32), ymm0) vmovapd(mem(rax, 3*32), ymm1) // iteration 3 vbroadcastsd(mem(rbx, 18*8), ymm2) vbroadcastsd(mem(rbx, 19*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rbx, 20*8), ymm2) vbroadcastsd(mem(rbx, 21*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rbx, 22*8), ymm2) vbroadcastsd(mem(rbx, 23*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) add(imm(4*8*8), rax) // a += 4*8 (unroll x mr) add(imm(4*6*8), rbx) // b += 4*6 (unroll x nr) vmovapd(mem(rax, -4*32), ymm0) vmovapd(mem(rax, -3*32), ymm1) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.DLOOPKLEFT) // EDGE LOOP prefetch(0, mem(rax, 64*8)) vbroadcastsd(mem(rbx, 0*8), ymm2) vbroadcastsd(mem(rbx, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rbx, 2*8), ymm2) vbroadcastsd(mem(rbx, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rbx, 4*8), ymm2) vbroadcastsd(mem(rbx, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) add(imm(1*8*8), rax) // a += 1*8 (unroll x mr) add(imm(1*6*8), rbx) // b += 1*6 (unroll x nr) vmovapd(mem(rax, -4*32), ymm0) vmovapd(mem(rax, -3*32), ymm1) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm11, ymm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(ymm0, ymm15, ymm15) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. 
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) vfmadd231pd(mem(rcx,32), ymm3, ymm5) vmovupd(ymm5, mem(rcx,32)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) vfmadd231pd(mem(rcx,32), ymm3, ymm7) vmovupd(ymm7, mem(rcx,32)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm8) vmovupd(ymm8, mem(rcx)) vfmadd231pd(mem(rcx,32), ymm3, ymm9) vmovupd(ymm9, mem(rcx,32)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm10) vmovupd(ymm10, mem(rcx)) vfmadd231pd(mem(rcx,32), ymm3, ymm11) vmovupd(ymm11, mem(rcx,32)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm12) vmovupd(ymm12, mem(rcx)) vfmadd231pd(mem(rcx,32), ymm3, ymm13) vmovupd(ymm13, mem(rcx,32)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm14) vmovupd(ymm14, mem(rcx)) vfmadd231pd(mem(rcx,32), ymm3, ymm15) vmovupd(ymm15, mem(rcx,32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) vmovupd(ymm4, mem(rcx)) vmovupd(ymm5, mem(rcx,32)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) vmovupd(ymm7, mem(rcx,32)) add(rdi, rcx) vmovupd(ymm8, mem(rcx)) vmovupd(ymm9, mem(rcx,32)) add(rdi, rcx) vmovupd(ymm10, mem(rcx)) vmovupd(ymm11, mem(rcx,32)) add(rdi, rcx) vmovupd(ymm12, mem(rcx)) vmovupd(ymm13, mem(rcx,32)) add(rdi, rcx) vmovupd(ymm14, mem(rcx)) vmovupd(ymm15, mem(rcx,32)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a] "m" (a), // 2 [b] "m" (b), // 3 [alpha] "m" (alpha), // 4 [beta] "m" (beta), // 5 [c] "m" (c), // 6 [rs_c] "m" (rs_c), // 7 [cs_c] "m" (cs_c)/*, // 8 [b_next] "m" (b_next), // 9 [a_next] "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMM_UKR_FLUSH_CT( d ); } #define 
CGEMM_INPUT_SCALE_CS_BETA_NZ(where) \
	vmovups(where, ymm0) \
	vpermilps(imm(0xb1), ymm0, ymm3) \
	vmulps(ymm1, ymm0, ymm0) \
	vmulps(ymm2, ymm3, ymm3) \
	vaddsubps(ymm3, ymm0, ymm0)

// NOTE: the macro above loads 32 bytes from `where` into ymm0 and combines
// them with ymm1/ymm2 via a swap / multiply / addsub sequence. The kernel
// below broadcasts the two components of beta into ymm1/ymm2 before using
// it. The macro overwrites ymm0 and ymm3.

/*
   bli_cgemm_haswell_asm_8x3

   Single-precision complex gemm micro-kernel written with the BLIS x86
   assembly macros. What the visible code does:

   - accumulates over k into ymm4..ymm15, reading 64 bytes of a (two
     aligned 32-byte loads) and 24 bytes of b (six broadcast floats) per
     k step, in a 4x-unrolled main loop followed by a (k % 4) edge loop;
   - combines the accumulator pairs (vpermilps + vaddsubps) and scales the
     six resulting registers by alpha;
   - if both components of beta compare equal to zero, stores the results
     to c without reading c; otherwise adds beta-scaled c before storing.

   Parameters:
     m, n         not referenced directly in this function body.
                  NOTE(review): presumably consumed by GEMM_UKR_SETUP_CT /
                  GEMM_UKR_FLUSH_CT -- confirm against the macro definitions.
     k            inner dimension; split into k / 4 and k % 4.
     alpha, beta  pointers to the scalars; the component at byte offset 0
                  is treated as the real part and offset 4 as the imaginary.
     a, b         operand panels; a is read with aligned loads (vmovaps).
     c            output; three columns cs_c elements apart, each written
                  as two unaligned 32-byte stores.
     rs_c0, cs_c0 strides of c. Only cs_c is read by the assembly; rs_c is
                  listed as an input operand but never loaded.
     data, cntx   not referenced directly in this function body.
*/
void bli_cgemm_haswell_asm_8x3
     (
       dim_t               m,
       dim_t               n,
       dim_t               k,
       scomplex*  restrict alpha,
       scomplex*  restrict a,
       scomplex*  restrict b,
       scomplex*  restrict beta,
       scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k / 4;
	uint64_t k_left = k % 4;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// NOTE(review): opaque project macro; presumably arranges for c to be
	// addressable as a column-stored 8x3 tile -- confirm.
	GEMM_UKR_SETUP_CT( c, 8, 3, false );

	begin_asm()

	vzeroall() // zero all xmm/ymm registers.

	mov(var(a), rax) // load address of a.
	mov(var(b), rbx) // load address of b.
	//mov(%9, r15) // load address of b_next.

	add(imm(32*4), rax)
	// initialize loop by pre-loading
	// (rax is biased by 128 bytes, so the first two loads read a + 0 and a + 32)
	vmovaps(mem(rax, -4*32), ymm0)
	vmovaps(mem(rax, -3*32), ymm1)

	mov(var(c), rcx) // load address of c
	mov(var(cs_c), rdi) // load cs_c
	lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex)

	lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*cs_c;
	lea(mem(rcx, rdi, 2), r12) // r12 = c + 2*cs_c;

	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(r11, 7*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(r12, 7*8)) // prefetch c + 2*cs_c

	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.CCONSIDKLEFT) // if i == 0, jump to code that
	// contains the k_left loop.

	label(.CLOOPKITER) // MAIN LOOP

	// iteration 0
	prefetch(0, mem(rax, 32*8))

	vbroadcastss(mem(rbx, 0*4), ymm2)
	vbroadcastss(mem(rbx, 1*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rbx, 2*4), ymm2)
	vbroadcastss(mem(rbx, 3*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rbx, 4*4), ymm2)
	vbroadcastss(mem(rbx, 5*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	vmovaps(mem(rax, -2*32), ymm0)
	vmovaps(mem(rax, -1*32), ymm1)

	// iteration 1
	vbroadcastss(mem(rbx, 6*4), ymm2)
	vbroadcastss(mem(rbx, 7*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rbx, 8*4), ymm2)
	vbroadcastss(mem(rbx, 9*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rbx, 10*4), ymm2)
	vbroadcastss(mem(rbx, 11*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	vmovaps(mem(rax, 0*32), ymm0)
	vmovaps(mem(rax, 1*32), ymm1)

	// iteration 2
	prefetch(0, mem(rax, 38*8))

	vbroadcastss(mem(rbx, 12*4), ymm2)
	vbroadcastss(mem(rbx, 13*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rbx, 14*4), ymm2)
	vbroadcastss(mem(rbx, 15*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rbx, 16*4), ymm2)
	vbroadcastss(mem(rbx, 17*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	vmovaps(mem(rax, 2*32), ymm0)
	vmovaps(mem(rax, 3*32), ymm1)

	// iteration 3
	vbroadcastss(mem(rbx, 18*4), ymm2)
	vbroadcastss(mem(rbx, 19*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rbx, 20*4), ymm2)
	vbroadcastss(mem(rbx, 21*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rbx, 22*4), ymm2)
	vbroadcastss(mem(rbx, 23*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	add(imm(4*8*8), rax) // a += 4*8 (unroll x mr)
	add(imm(4*3*8), rbx) // b += 4*3 (unroll x nr)

	vmovaps(mem(rax, -4*32), ymm0)
	vmovaps(mem(rax, -3*32), ymm1)

	dec(rsi) // i -= 1;
	jne(.CLOOPKITER) // iterate again if i != 0.

	label(.CCONSIDKLEFT)

	mov(var(k_left), rsi) // i = k_left;
	test(rsi, rsi) // check i via logical AND.
	je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
	// else, we prepare to enter k_left loop.

	label(.CLOOPKLEFT) // EDGE LOOP

	prefetch(0, mem(rax, 32*8))

	vbroadcastss(mem(rbx, 0*4), ymm2)
	vbroadcastss(mem(rbx, 1*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rbx, 2*4), ymm2)
	vbroadcastss(mem(rbx, 3*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rbx, 4*4), ymm2)
	vbroadcastss(mem(rbx, 5*4), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	add(imm(1*8*8), rax) // a += 1*8 (unroll x mr)
	add(imm(1*3*8), rbx) // b += 1*3 (unroll x nr)

	vmovaps(mem(rax, -4*32), ymm0)
	vmovaps(mem(rax, -3*32), ymm1)

	dec(rsi) // i -= 1;
	jne(.CLOOPKLEFT) // iterate again if i != 0.

	label(.CPOSTACCUM)

	// permute even and odd elements
	// of ymm6/7, ymm10/11, ymm14/15
	vpermilps(imm(0xb1), ymm6, ymm6)
	vpermilps(imm(0xb1), ymm7, ymm7)
	vpermilps(imm(0xb1), ymm10, ymm10)
	vpermilps(imm(0xb1), ymm11, ymm11)
	vpermilps(imm(0xb1), ymm14, ymm14)
	vpermilps(imm(0xb1), ymm15, ymm15)

	// subtract/add even/odd elements
	vaddsubps(ymm6, ymm4, ymm4)
	vaddsubps(ymm7, ymm5, ymm5)

	vaddsubps(ymm10, ymm8, ymm8)
	vaddsubps(ymm11, ymm9, ymm9)

	vaddsubps(ymm14, ymm12, ymm12)
	vaddsubps(ymm15, ymm13, ymm13)

	mov(var(alpha), rax) // load address of alpha
	vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate
	vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate

	// scale each result register by alpha
	// (same swap / multiply / addsub pattern as above)
	vpermilps(imm(0xb1), ymm4, ymm3)
	vmulps(ymm0, ymm4, ymm4)
	vmulps(ymm1, ymm3, ymm3)
	vaddsubps(ymm3, ymm4, ymm4)

	vpermilps(imm(0xb1), ymm5, ymm3)
	vmulps(ymm0, ymm5, ymm5)
	vmulps(ymm1, ymm3, ymm3)
	vaddsubps(ymm3, ymm5, ymm5)

	vpermilps(imm(0xb1), ymm8, ymm3)
	vmulps(ymm0, ymm8, ymm8)
	vmulps(ymm1, ymm3, ymm3)
	vaddsubps(ymm3, ymm8, ymm8)

	vpermilps(imm(0xb1), ymm9, ymm3)
	vmulps(ymm0, ymm9, ymm9)
	vmulps(ymm1, ymm3, ymm3)
	vaddsubps(ymm3, ymm9, ymm9)

	vpermilps(imm(0xb1), ymm12, ymm3)
	vmulps(ymm0, ymm12, ymm12)
	vmulps(ymm1, ymm3, ymm3)
	vaddsubps(ymm3, ymm12, ymm12)

	vpermilps(imm(0xb1), ymm13, ymm3)
	vmulps(ymm0, ymm13, ymm13)
	vmulps(ymm1, ymm3, ymm3)
	vaddsubps(ymm3, ymm13, ymm13)

	mov(var(beta), rbx) // load address of beta
	vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate
	vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate

	// now avoid loading C if beta == 0
	vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomiss(xmm0, xmm1) // set ZF if beta_r == 0.
	sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 );
	vucomiss(xmm0, xmm2) // set ZF if beta_i == 0.
	sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 );
	and(r8b, r9b) // r9b &= r8b; ZF is cleared only if both flags are 1.
	jne(.CBETAZERO) // if ZF == 0 (beta_r == 0 and beta_i == 0), jump to beta == 0 case

	CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(rcx))
	vaddps(ymm4, ymm0, ymm0)
	vmovups(ymm0, mem(rcx))

	CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(rcx,32))
	vaddps(ymm5, ymm0, ymm0)
	vmovups(ymm0, mem(rcx,32))

	CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r11))
	vaddps(ymm8, ymm0, ymm0)
	vmovups(ymm0, mem(r11))

	CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r11,32))
	vaddps(ymm9, ymm0, ymm0)
	vmovups(ymm0, mem(r11,32))

	CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r12))
	vaddps(ymm12, ymm0, ymm0)
	vmovups(ymm0, mem(r12))

	CGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r12,32))
	vaddps(ymm13, ymm0, ymm0)
	vmovups(ymm0, mem(r12,32))

	jmp(.CDONE) // jump to end.

	label(.CBETAZERO)

	// beta == 0: store the alpha-scaled results without reading c.
	vmovups(ymm4, mem(rcx))
	vmovups(ymm5, mem(rcx,32))

	vmovups(ymm8, mem(r11))
	vmovups(ymm9, mem(r11,32))

	vmovups(ymm12, mem(r12))
	vmovups(ymm13, mem(r12,32))

	label(.CDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter), // 0
	  [k_left] "m" (k_left), // 1
	  [a]      "m" (a),      // 2
	  [b]      "m" (b),      // 3
	  [alpha]  "m" (alpha),  // 4
	  [beta]   "m" (beta),   // 5
	  [c]      "m" (c),      // 6
	  [rs_c]   "m" (rs_c),   // 7
	  [cs_c]   "m" (cs_c)/*,   // 8
	  [b_next] "m" (b_next), // 9
	  [a_next] "m" (a_next)*/  // 10
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	GEMM_UKR_FLUSH_CT( c );
}

// Double-precision counterpart of the complex input-scaling macro: loads
// 32 bytes from `where` into ymm0 and combines them with ymm1/ymm2 using
// vpermilpd / vmulpd / vaddsubpd. Overwrites ymm0 and ymm3.
#define ZGEMM_INPUT_SCALE_CS_BETA_NZ(where) \
	vmovups(where, ymm0) \
	vpermilpd(imm(0x5), ymm0, ymm3) \
	vmulpd(ymm1, ymm0, ymm0) \
	vmulpd(ymm2, ymm3, ymm3) \
	vaddsubpd(ymm3, ymm0, ymm0)

void bli_zgemm_haswell_asm_4x3
     (
       dim_t               m,
       dim_t               n,
       dim_t               k,
       dcomplex*  restrict alpha,
       dcomplex*  restrict a,
       dcomplex*  restrict b,
       dcomplex*  restrict beta,
       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of
// integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	//
	// Overview of this kernel body (bli_zgemm_haswell_asm_4x3):
	// - accumulates over k into ymm4..ymm15, reading 64 bytes of a (two
	//   aligned 32-byte loads) and 48 bytes of b (six broadcast doubles)
	//   per k step, in a 4x-unrolled main loop plus a (k % 4) edge loop;
	// - combines accumulator pairs (vpermilpd + vaddsubpd) and scales the
	//   six resulting registers by alpha;
	// - if both components of beta compare equal to zero, stores to c
	//   without reading it; otherwise adds beta-scaled c before storing.
	// Only cs_c is read by the assembly; rs_c is listed as an input operand
	// but never loaded.
	uint64_t k_iter = k / 4;
	uint64_t k_left = k % 4;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// NOTE(review): opaque project macro; presumably arranges for c to be
	// addressable as a column-stored 4x3 tile -- confirm.
	GEMM_UKR_SETUP_CT( z, 4, 3, false );

	begin_asm()

	vzeroall() // zero all xmm/ymm registers.

	mov(var(a), rax) // load address of a.
	mov(var(b), rbx) // load address of b.
	//mov(%9, r15) // load address of b_next.

	add(imm(32*4), rax)
	// initialize loop by pre-loading
	// (rax is biased by 128 bytes, so the first two loads read a + 0 and a + 32)
	vmovapd(mem(rax, -4*32), ymm0)
	vmovapd(mem(rax, -3*32), ymm1)

	mov(var(c), rcx) // load address of c
	mov(var(cs_c), rdi) // load cs_c
	lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex)
	lea(mem(, rdi, 2), rdi) // (second half of the x16 scaling: 8 * 2)

	lea(mem(rcx, rdi, 1), r11) // r11 = c + 1*cs_c;
	lea(mem(rcx, rdi, 2), r12) // r12 = c + 2*cs_c;

	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(r11, 7*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(r12, 7*8)) // prefetch c + 2*cs_c

	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.ZCONSIDKLEFT) // if i == 0, jump to code that
	// contains the k_left loop.

	label(.ZLOOPKITER) // MAIN LOOP

	// iteration 0
	prefetch(0, mem(rax, 32*16))

	vbroadcastsd(mem(rbx, 0*8), ymm2)
	vbroadcastsd(mem(rbx, 1*8), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rbx, 2*8), ymm2)
	vbroadcastsd(mem(rbx, 3*8), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vbroadcastsd(mem(rbx, 4*8), ymm2)
	vbroadcastsd(mem(rbx, 5*8), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm12)
	vfmadd231pd(ymm1, ymm2, ymm13)
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	vmovapd(mem(rax, -2*32), ymm0)
	vmovapd(mem(rax, -1*32), ymm1)

	// iteration 1
	vbroadcastsd(mem(rbx, 6*8), ymm2)
	vbroadcastsd(mem(rbx, 7*8), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rbx, 8*8), ymm2)
	vbroadcastsd(mem(rbx, 9*8), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vbroadcastsd(mem(rbx, 10*8), ymm2)
	vbroadcastsd(mem(rbx, 11*8), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm12)
	vfmadd231pd(ymm1, ymm2, ymm13)
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	vmovapd(mem(rax, 0*32), ymm0)
	vmovapd(mem(rax, 1*32), ymm1)

	// iteration 2
	prefetch(0, mem(rax, 38*16))

	vbroadcastsd(mem(rbx, 12*8), ymm2)
	vbroadcastsd(mem(rbx, 13*8), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rbx, 14*8), ymm2)
	vbroadcastsd(mem(rbx, 15*8), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vbroadcastsd(mem(rbx, 16*8), ymm2)
	vbroadcastsd(mem(rbx, 17*8), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm12)
	vfmadd231pd(ymm1, ymm2, ymm13)
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	vmovapd(mem(rax, 2*32), ymm0)
	vmovapd(mem(rax, 3*32), ymm1)

	// iteration 3
	vbroadcastsd(mem(rbx, 18*8), ymm2)
	vbroadcastsd(mem(rbx, 19*8), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rbx, 20*8), ymm2)
	vbroadcastsd(mem(rbx, 21*8), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vbroadcastsd(mem(rbx, 22*8), ymm2)
	vbroadcastsd(mem(rbx, 23*8), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm12)
	vfmadd231pd(ymm1, ymm2, ymm13)
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	add(imm(4*4*16), rax) // a += 4*4 (unroll x mr)
	add(imm(4*3*16), rbx) // b += 4*3 (unroll x nr)

	vmovapd(mem(rax, -4*32), ymm0)
	vmovapd(mem(rax, -3*32), ymm1)

	dec(rsi) // i -= 1;
	jne(.ZLOOPKITER) // iterate again if i != 0.

	label(.ZCONSIDKLEFT)

	mov(var(k_left), rsi) // i = k_left;
	test(rsi, rsi) // check i via logical AND.
	je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
	// else, we prepare to enter k_left loop.

	label(.ZLOOPKLEFT) // EDGE LOOP

	prefetch(0, mem(rax, 32*16))

	vbroadcastsd(mem(rbx, 0*8), ymm2)
	vbroadcastsd(mem(rbx, 1*8), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rbx, 2*8), ymm2)
	vbroadcastsd(mem(rbx, 3*8), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vbroadcastsd(mem(rbx, 4*8), ymm2)
	vbroadcastsd(mem(rbx, 5*8), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm12)
	vfmadd231pd(ymm1, ymm2, ymm13)
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	add(imm(1*4*16), rax) // a += 1*4 (unroll x mr)
	add(imm(1*3*16), rbx) // b += 1*3 (unroll x nr)

	vmovapd(mem(rax, -4*32), ymm0)
	vmovapd(mem(rax, -3*32), ymm1)

	dec(rsi) // i -= 1;
	jne(.ZLOOPKLEFT) // iterate again if i != 0.

	label(.ZPOSTACCUM)

	// permute even and odd elements
	// of ymm6/7, ymm10/11, ymm14/15
	vpermilpd(imm(0x5), ymm6, ymm6)
	vpermilpd(imm(0x5), ymm7, ymm7)
	vpermilpd(imm(0x5), ymm10, ymm10)
	vpermilpd(imm(0x5), ymm11, ymm11)
	vpermilpd(imm(0x5), ymm14, ymm14)
	vpermilpd(imm(0x5), ymm15, ymm15)

	// subtract/add even/odd elements
	vaddsubpd(ymm6, ymm4, ymm4)
	vaddsubpd(ymm7, ymm5, ymm5)

	vaddsubpd(ymm10, ymm8, ymm8)
	vaddsubpd(ymm11, ymm9, ymm9)

	vaddsubpd(ymm14, ymm12, ymm12)
	vaddsubpd(ymm15, ymm13, ymm13)

	mov(var(alpha), rax) // load address of alpha
	vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate
	vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate

	// scale each result register by alpha
	// (same swap / multiply / addsub pattern as above)
	vpermilpd(imm(0x5), ymm4, ymm3)
	vmulpd(ymm0, ymm4, ymm4)
	vmulpd(ymm1, ymm3, ymm3)
	vaddsubpd(ymm3, ymm4, ymm4)

	vpermilpd(imm(0x5), ymm5, ymm3)
	vmulpd(ymm0, ymm5, ymm5)
	vmulpd(ymm1, ymm3, ymm3)
	vaddsubpd(ymm3, ymm5, ymm5)

	vpermilpd(imm(0x5), ymm8, ymm3)
	vmulpd(ymm0, ymm8, ymm8)
	vmulpd(ymm1, ymm3, ymm3)
	vaddsubpd(ymm3, ymm8, ymm8)

	vpermilpd(imm(0x5), ymm9, ymm3)
	vmulpd(ymm0, ymm9, ymm9)
	vmulpd(ymm1, ymm3, ymm3)
	vaddsubpd(ymm3, ymm9, ymm9)

	vpermilpd(imm(0x5), ymm12, ymm3)
	vmulpd(ymm0, ymm12, ymm12)
	vmulpd(ymm1, ymm3, ymm3)
	vaddsubpd(ymm3, ymm12, ymm12)

	vpermilpd(imm(0x5), ymm13, ymm3)
	vmulpd(ymm0, ymm13, ymm13)
	vmulpd(ymm1, ymm3, ymm3)
	vaddsubpd(ymm3, ymm13, ymm13)

	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
	vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate

	// now avoid loading C if beta == 0
	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm1) // set ZF if beta_r == 0.
	sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 );
	vucomisd(xmm0, xmm2) // set ZF if beta_i == 0.
	sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 );
	and(r8b, r9b) // r9b &= r8b; ZF is cleared only if both flags are 1.
	jne(.ZBETAZERO) // if ZF == 0 (beta_r == 0 and beta_i == 0), jump to beta == 0 case

	ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(rcx))
	vaddpd(ymm4, ymm0, ymm0)
	vmovupd(ymm0, mem(rcx))

	ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(rcx,32))
	vaddpd(ymm5, ymm0, ymm0)
	vmovupd(ymm0, mem(rcx,32))

	ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r11))
	vaddpd(ymm8, ymm0, ymm0)
	vmovupd(ymm0, mem(r11))

	ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r11,32))
	vaddpd(ymm9, ymm0, ymm0)
	vmovupd(ymm0, mem(r11,32))

	ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r12))
	vaddpd(ymm12, ymm0, ymm0)
	vmovupd(ymm0, mem(r12))

	ZGEMM_INPUT_SCALE_CS_BETA_NZ(mem(r12,32))
	vaddpd(ymm13, ymm0, ymm0)
	vmovupd(ymm0, mem(r12,32))

	jmp(.ZDONE) // jump to end.

	label(.ZBETAZERO)

	// beta == 0: store the alpha-scaled results without reading c.
	vmovupd(ymm4, mem(rcx))
	vmovupd(ymm5, mem(rcx,32))

	vmovupd(ymm8, mem(r11))
	vmovupd(ymm9, mem(r11,32))

	vmovupd(ymm12, mem(r12))
	vmovupd(ymm13, mem(r12,32))

	label(.ZDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter), // 0
	  [k_left] "m" (k_left), // 1
	  [a]      "m" (a),      // 2
	  [b]      "m" (b),      // 3
	  [alpha]  "m" (alpha),  // 4
	  [beta]   "m" (beta),   // 5
	  [c]      "m" (c),      // 6
	  [rs_c]   "m" (rs_c),   // 7
	  [cs_c]   "m" (cs_c)/*,   // 8
	  [b_next] "m" (b_next), // 9
	  [a_next] "m" (a_next)*/  // 10
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	GEMM_UKR_FLUSH_CT( z );
}
cython-blis-0.9.1/blis/_src/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c000066400000000000000000001215331427272030600274770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" #define SGEMM_OUTPUT_GS_BETA_NZ \ vextractf128(imm(1), ymm0, xmm2) \ vmovss(xmm0, mem(rcx)) \ vpermilps(imm(0x39), xmm0, xmm1) \ vmovss(xmm1, mem(rcx, rsi, 1)) \ vpermilps(imm(0x39), xmm1, xmm0) \ vmovss(xmm0, mem(rcx, rsi, 2)) \ vpermilps(imm(0x39), xmm0, xmm1) \ vmovss(xmm1, mem(rcx, r13, 1)) \ vmovss(xmm2, mem(rcx, rsi, 4)) \ vpermilps(imm(0x39), xmm2, xmm1) \ vmovss(xmm1, mem(rcx, r15, 1)) \ vpermilps(imm(0x39), xmm1, xmm2) \ vmovss(xmm2, mem(rcx, r13, 2)) \ vpermilps(imm(0x39), xmm2, xmm1) \ vmovss(xmm1, mem(rcx, r10, 1)) void bli_sgemmtrsm_l_haswell_asm_6x16 ( dim_t m, dim_t n, dim_t k0, float* restrict alpha, float* restrict a10, float* restrict a11, float* restrict b01, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; float* beta = bli_sm1; GEMMTRSM_UKR_SETUP_CT_ANY( s, 6, 16, true ); begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a10), rax) // load address of a. mov(var(b01), rbx) // load address of b. add(imm(32*4), rbx) // initialize loop by pre-loading vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) mov(var(b11), rcx) // load address of b11 mov(imm(16), rdi) // set rs_b = PACKNR = 16 lea(mem(, rdi, 4), rdi) // rs_b *= sizeof(float) // NOTE: c11, rs_c, and cs_c aren't // needed for a while, but we load // them now to avoid stalling later. 
mov(var(c11), r8) // load address of c11 mov(var(rs_c), r9) // load rs_c lea(mem(, r9 , 4), r9) // rs_c *= sizeof(float) mov(var(k_left)0, r10) // load cs_c lea(mem(, r10, 4), r10) // cs_c *= sizeof(float) mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // iteration 0 prefetch(0, mem(rax, 64*4)) vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) vmovaps(mem(rbx, -2*32), ymm0) vmovaps(mem(rbx, -1*32), ymm1) // iteration 1 vbroadcastss(mem(rax, 6*4), ymm2) vbroadcastss(mem(rax, 7*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 8*4), ymm2) vbroadcastss(mem(rax, 9*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 10*4), ymm2) vbroadcastss(mem(rax, 11*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) vmovaps(mem(rbx, 0*32), ymm0) vmovaps(mem(rbx, 1*32), ymm1) // iteration 2 prefetch(0, mem(rax, 76*4)) vbroadcastss(mem(rax, 12*4), ymm2) vbroadcastss(mem(rax, 13*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 14*4), ymm2) 
vbroadcastss(mem(rax, 15*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 16*4), ymm2) vbroadcastss(mem(rax, 17*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) vmovaps(mem(rbx, 2*32), ymm0) vmovaps(mem(rbx, 3*32), ymm1) // iteration 3 vbroadcastss(mem(rax, 18*4), ymm2) vbroadcastss(mem(rax, 19*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 20*4), ymm2) vbroadcastss(mem(rax, 21*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 22*4), ymm2) vbroadcastss(mem(rax, 23*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) add(imm(4*6*4), rax) // a += 4*6 (unroll x mr) add(imm(4*16*4), rbx) // b += 4*16 (unroll x nr) vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP prefetch(0, mem(rax, 64*4)) vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) add(imm(1*6*4), rax) // a += 1*6 (unroll x mr) add(imm(1*16*4), rbx) // b += 1*16 (unroll x nr) vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4..ymm15 = -a10 * b01 mov(var(alpha), rbx) // load address of alpha vbroadcastss(mem(rbx), ymm3) // load alpha and duplicate mov(imm(1), rsi) // load cs_b = 1 lea(mem(, rsi, 4), rsi) // cs_b *= sizeof(float) lea(mem(rcx, rsi, 8), rdx) // load address of b11 + 8*cs_b mov(rcx, r11) // save rcx = b11 for later mov(rdx, r14) // save rdx = b11+8*cs_b for later // b11 := alpha * b11 - a10 * b01 vfmsub231ps(mem(rcx), ymm3, ymm4) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm5) add(rdi, rdx) vfmsub231ps(mem(rcx), ymm3, ymm6) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm7) add(rdi, rdx) vfmsub231ps(mem(rcx), ymm3, ymm8) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm9) add(rdi, rdx) vfmsub231ps(mem(rcx), ymm3, ymm10) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm11) add(rdi, rdx) vfmsub231ps(mem(rcx), ymm3, ymm12) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm13) add(rdi, rdx) vfmsub231ps(mem(rcx), ymm3, ymm14) //add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm15) //add(rdi, rdx) // prefetch c11 #if 0 mov(r8, rcx) // load address of c11 from r8 // Note: r9 = rs_c * sizeof(float) lea(mem(r9 , r9 , 2), r13) // r13 = 
3*rs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c11 + 3*rs_c; prefetch(0, mem(rcx, 0*8)) // prefetch c11 + 0*rs_c prefetch(0, mem(rcx, r9, 1, 0*8)) // prefetch c11 + 1*rs_c prefetch(0, mem(rcx, r9 , 2, 0*8)) // prefetch c11 + 2*rs_c prefetch(0, mem(rdx, 0*8)) // prefetch c11 + 3*rs_c prefetch(0, mem(rdx, r9, 1, 0*8)) // prefetch c11 + 4*rs_c prefetch(0, mem(rdx, r9 , 2, 0*8)) // prefetch c11 + 5*rs_c #endif // trsm computation begins here // Note: contents of b11 are stored as // ymm4 ymm5 = ( beta00..07 ) ( beta08..0F ) // ymm6 ymm7 = ( beta10..17 ) ( beta18..1F ) // ymm8 ymm9 = ( beta20..27 ) ( beta28..2F ) // ymm10 ymm11 = ( beta30..37 ) ( beta38..3F ) // ymm12 ymm13 = ( beta40..47 ) ( beta48..4F ) // ymm14 ymm15 = ( beta50..57 ) ( beta58..5F ) mov(var(a11), rax) // load address of a11 mov(r11, rcx) // recall address of b11 mov(r14, rdx) // recall address of b11+8*cs_b // Note: rdi = rs_b // iteration 0 ------------- vbroadcastss(mem(0+0*6)*4(rax), ymm0) // ymm0 = (1/alpha00) #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm0, ymm4, ymm4) // ymm4 *= (1/alpha00) vmulps(ymm0, ymm5, ymm5) // ymm5 *= (1/alpha00) #else vdivps(ymm0, ymm4, ymm4) // ymm4 /= alpha00 vdivps(ymm0, ymm5, ymm5) // ymm5 /= alpha00 #endif vmovups(ymm4, mem(rcx)) // store ( beta00..beta07 ) = ymm4 vmovups(ymm5, mem(rdx)) // store ( beta08..beta0F ) = ymm5 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b // iteration 1 ------------- vbroadcastss(mem(1+0*6)*4(rax), ymm0) // ymm0 = alpha10 vbroadcastss(mem(1+1*6)*4(rax), ymm1) // ymm1 = (1/alpha11) vmulps(ymm0, ymm4, ymm2) // ymm2 = alpha10 * ymm4 vmulps(ymm0, ymm5, ymm3) // ymm3 = alpha10 * ymm5 vsubps(ymm2, ymm6, ymm6) // ymm6 -= ymm2 vsubps(ymm3, ymm7, ymm7) // ymm7 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm1, ymm6, ymm6) // ymm6 *= (1/alpha11) vmulps(ymm1, ymm7, ymm7) // ymm7 *= (1/alpha11) #else vdivps(ymm1, ymm6, ymm6) // ymm6 /= alpha11 vdivps(ymm1, ymm7, ymm7) // ymm7 /= alpha11 #endif vmovups(ymm6, mem(rcx)) // store ( 
beta10..beta17 ) = ymm6 vmovups(ymm7, mem(rdx)) // store ( beta18..beta1F ) = ymm7 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b // iteration 2 ------------- vbroadcastss(mem(2+0*6)*4(rax), ymm0) // ymm0 = alpha20 vbroadcastss(mem(2+1*6)*4(rax), ymm1) // ymm1 = alpha21 vmulps(ymm0, ymm4, ymm2) // ymm2 = alpha20 * ymm4 vmulps(ymm0, ymm5, ymm3) // ymm3 = alpha20 * ymm5 vbroadcastss(mem(2+2*6)*4(rax), ymm0) // ymm0 = (1/alpha22) vfmadd231ps(ymm1, ymm6, ymm2) // ymm2 += alpha21 * ymm6 vfmadd231ps(ymm1, ymm7, ymm3) // ymm3 += alpha21 * ymm7 vsubps(ymm2, ymm8, ymm8) // ymm8 -= ymm2 vsubps(ymm3, ymm9, ymm9) // ymm9 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm0, ymm8, ymm8) // ymm8 *= (1/alpha22) vmulps(ymm0, ymm9, ymm9) // ymm9 *= (1/alpha22) #else vdivps(ymm0, ymm8, ymm8) // ymm8 /= alpha22 vdivps(ymm0, ymm9, ymm9) // ymm9 /= alpha22 #endif vmovups(ymm8, mem(rcx)) // store ( beta20..beta27 ) = ymm8 vmovups(ymm9, mem(rdx)) // store ( beta28..beta2F ) = ymm9 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b // iteration 3 ------------- vbroadcastss(mem(3+0*6)*4(rax), ymm0) // ymm0 = alpha30 vbroadcastss(mem(3+1*6)*4(rax), ymm1) // ymm1 = alpha31 vmulps(ymm0, ymm4, ymm2) // ymm2 = alpha30 * ymm4 vmulps(ymm0, ymm5, ymm3) // ymm3 = alpha30 * ymm5 vbroadcastss(mem(3+2*6)*4(rax), ymm0) // ymm0 = alpha32 vfmadd231ps(ymm1, ymm6, ymm2) // ymm2 += alpha31 * ymm6 vfmadd231ps(ymm1, ymm7, ymm3) // ymm3 += alpha31 * ymm7 vbroadcastss(mem(3+3*6)*4(rax), ymm1) // ymm0 = (1/alpha33) vfmadd231ps(ymm0, ymm8, ymm2) // ymm2 += alpha32 * ymm8 vfmadd231ps(ymm0, ymm9, ymm3) // ymm3 += alpha32 * ymm9 vsubps(ymm2, ymm10, ymm10) // ymm10 -= ymm2 vsubps(ymm3, ymm11, ymm11) // ymm11 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm1, ymm10, ymm10) // ymm10 *= (1/alpha33) vmulps(ymm1, ymm11, ymm11) // ymm11 *= (1/alpha33) #else vdivps(ymm1, ymm10, ymm10) // ymm10 /= alpha33 vdivps(ymm1, ymm11, ymm11) // ymm11 /= alpha33 #endif vmovups(ymm10, mem(rcx)) // store ( 
beta30..beta37 ) = ymm10 vmovups(ymm11, mem(rdx)) // store ( beta38..beta3F ) = ymm11 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b // iteration 4 ------------- vbroadcastss(mem(4+0*6)*4(rax), ymm0) // ymm0 = alpha40 vbroadcastss(mem(4+1*6)*4(rax), ymm1) // ymm1 = alpha41 vmulps(ymm0, ymm4, ymm2) // ymm2 = alpha40 * ymm4 vmulps(ymm0, ymm5, ymm3) // ymm3 = alpha40 * ymm5 vbroadcastss(mem(4+2*6)*4(rax), ymm0) // ymm0 = alpha42 vfmadd231ps(ymm1, ymm6, ymm2) // ymm2 += alpha41 * ymm6 vfmadd231ps(ymm1, ymm7, ymm3) // ymm3 += alpha41 * ymm7 vbroadcastss(mem(4+3*6)*4(rax), ymm1) // ymm1 = alpha43 vfmadd231ps(ymm0, ymm8, ymm2) // ymm2 += alpha42 * ymm8 vfmadd231ps(ymm0, ymm9, ymm3) // ymm3 += alpha42 * ymm9 vbroadcastss(mem(4+4*6)*4(rax), ymm0) // ymm0 = (1/alpha44) vfmadd231ps(ymm1, ymm10, ymm2) // ymm2 += alpha43 * ymm10 vfmadd231ps(ymm1, ymm11, ymm3) // ymm3 += alpha43 * ymm11 vsubps(ymm2, ymm12, ymm12) // ymm12 -= ymm2 vsubps(ymm3, ymm13, ymm13) // ymm13 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm0, ymm12, ymm12) // ymm12 *= (1/alpha44) vmulps(ymm0, ymm13, ymm13) // ymm13 *= (1/alpha44) #else vdivps(ymm0, ymm12, ymm12) // ymm12 /= alpha44 vdivps(ymm0, ymm13, ymm13) // ymm13 /= alpha44 #endif vmovups(ymm12, mem(rcx)) // store ( beta40..beta47 ) = ymm12 vmovups(ymm13, mem(rdx)) // store ( beta48..beta4F ) = ymm13 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b // iteration 5 ------------- vbroadcastss(mem(5+0*6)*4(rax), ymm0) // ymm0 = alpha50 vbroadcastss(mem(5+1*6)*4(rax), ymm1) // ymm1 = alpha51 vmulps(ymm0, ymm4, ymm2) // ymm2 = alpha50 * ymm4 vmulps(ymm0, ymm5, ymm3) // ymm3 = alpha50 * ymm5 vbroadcastss(mem(5+2*6)*4(rax), ymm0) // ymm0 = alpha52 vfmadd231ps(ymm1, ymm6, ymm2) // ymm2 += alpha51 * ymm6 vfmadd231ps(ymm1, ymm7, ymm3) // ymm3 += alpha51 * ymm7 vbroadcastss(mem(5+3*6)*4(rax), ymm1) // ymm1 = alpha53 vfmadd231ps(ymm0, ymm8, ymm2) // ymm2 += alpha52 * ymm8 vfmadd231ps(ymm0, ymm9, ymm3) // ymm3 += alpha52 * ymm9 
vbroadcastss(mem(5+4*6)*4(rax), ymm0) // ymm0 = alpha54 vfmadd231ps(ymm1, ymm10, ymm2) // ymm2 += alpha53 * ymm10 vfmadd231ps(ymm1, ymm11, ymm3) // ymm3 += alpha53 * ymm11 vbroadcastss(mem(5+5*6)*4(rax), ymm1) // ymm1 = (1/alpha55) vfmadd231ps(ymm0, ymm12, ymm2) // ymm2 += alpha54 * ymm12 vfmadd231ps(ymm0, ymm13, ymm3) // ymm3 += alpha54 * ymm13 vsubps(ymm2, ymm14, ymm14) // ymm14 -= ymm2 vsubps(ymm3, ymm15, ymm15) // ymm15 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm1, ymm14, ymm14) // ymm14 *= (1/alpha55) vmulps(ymm1, ymm15, ymm15) // ymm15 *= (1/alpha55) #else vdivps(ymm1, ymm14, ymm14) // ymm14 /= alpha55 vdivps(ymm1, ymm15, ymm15) // ymm15 /= alpha55 #endif vmovups(ymm14, mem(rcx)) // store ( beta50..beta57 ) = ymm14 vmovups(ymm15, mem(rdx)) // store ( beta58..beta5F ) = ymm15 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b mov(r8, rcx) // load address of c11 from r8 mov(r9, rdi) // load rs_c (in bytes) from r9 mov(r10, rsi) // load cs_c (in bytes) from r10 lea(mem(rcx, rsi, 8), rdx) // load address of c11 + 8*cs_c; lea(mem(rcx, rdi, 4), r14) // load address of c11 + 4*rs_c; // These are used in the macros below. lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4. jz(.SROWSTORED) // jump to row storage case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case // if neither row- or column- // stored, use general case. 
label(.SGENSTORED) vmovaps(ymm4, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm6, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm8, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm10, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm12, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm14, ymm0) SGEMM_OUTPUT_GS_BETA_NZ mov(rdx, rcx) // rcx = c11 + 8*cs_c vmovaps(ymm5, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm7, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm9, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm11, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm13, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm15, ymm0) SGEMM_OUTPUT_GS_BETA_NZ jmp(.SDONE) label(.SROWSTORED) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vmovups(ymm5, mem(rdx)) add(rdi, rdx) vmovups(ymm6, mem(rcx)) add(rdi, rcx) vmovups(ymm7, mem(rdx)) add(rdi, rdx) vmovups(ymm8, mem(rcx)) add(rdi, rcx) vmovups(ymm9, mem(rdx)) add(rdi, rdx) vmovups(ymm10, mem(rcx)) add(rdi, rcx) vmovups(ymm11, mem(rdx)) add(rdi, rdx) vmovups(ymm12, mem(rcx)) add(rdi, rcx) vmovups(ymm13, mem(rdx)) add(rdi, rdx) vmovups(ymm14, mem(rcx)) //add(rdi, rcx) vmovups(ymm15, mem(rdx)) //add(rdi, rdx) jmp(.SDONE) label(.SCOLSTORED) vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vmovups(xmm3, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) 
vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) vmovups(xmm3, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vunpckhps(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm1, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm1, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) vmovlpd(xmm3, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm3, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovups(xmm0, mem(rcx)) // store ( gamma08..gamma38 ) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma09..gamma39 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma0C..gamma3C ) vmovups(xmm3, mem(rcx, r15, 1)) // store ( gamma0D..gamma3D ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma0A..gamma3A ) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma0B..gamma3B ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( 
gamma0E..gamma3E ) vmovups(xmm3, mem(rcx, r10, 1)) // store ( gamma0F..gamma3F ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm15, ymm13, ymm0) vunpckhps(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovlpd(xmm0, mem(r14)) // store ( gamma48..gamma58 ) vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma49..gamma59 ) vmovlpd(xmm1, mem(r14, rsi, 2)) // store ( gamma4A..gamma5A ) vmovhpd(xmm1, mem(r14, r13, 1)) // store ( gamma4B..gamma5B ) vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma4C..gamma5C ) vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma4D..gamma5D ) vmovlpd(xmm3, mem(r14, r13, 2)) // store ( gamma4E..gamma5E ) vmovhpd(xmm3, mem(r14, r10, 1)) // store ( gamma4F..gamma5F ) //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c label(.SDONE) vzeroupper() end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a10] "m" (a10), // 2 [b01] "m" (b01), // 3 [beta] "m" (beta), // 4 [alpha] "m" (alpha), // 5 [a11] "m" (a11), // 6 [b11] "m" (b11), // 7 [c11] "m" (c11), // 8 [rs_c] "m" (rs_c), // 9 [cs_c] "m" (cs_c) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMMTRSM_UKR_FLUSH_CT( s ); } #define DGEMM_OUTPUT_GS_BETA_NZ \ vextractf128(imm(1), ymm0, xmm1) \ vmovlpd(xmm0, mem(rcx)) \ vmovhpd(xmm0, mem(rcx, rsi, 1)) \ vmovlpd(xmm1, mem(rcx, rsi, 2)) \ vmovhpd(xmm1, mem(rcx, r13, 1)) /*\ vextractf128(imm(1), ymm2, xmm1) \ vmovlpd(xmm2, mem(rcx, rsi, 4)) \ vmovhpd(xmm2, mem(rcx, r15, 1)) \ vmovlpd(xmm1, mem(rcx, r13, 2)) \ vmovhpd(xmm1, mem(rcx, r10, 1))*/ void bli_dgemmtrsm_l_haswell_asm_6x8 ( dim_t m, dim_t n, dim_t k0, double* restrict alpha, double* restrict a10, double* restrict a11, double* restrict b01, double* restrict b11, double* 
restrict c11, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; double* beta = bli_dm1; GEMMTRSM_UKR_SETUP_CT_ANY( d, 6, 8, true ); begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a10), rax) // load address of a. mov(var(b01), rbx) // load address of b. add(imm(32*4), rbx) // initialize loop by pre-loading vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) mov(var(b11), rcx) // load address of b11 mov(imm(8), rdi) // set rs_b = PACKNR = 8 lea(mem(, rdi, 8), rdi) // rs_b *= sizeof(double) // NOTE: c11, rs_c, and cs_c aren't // needed for a while, but we load // them now to avoid stalling later. mov(var(c11), r8) // load address of c11 mov(var(rs_c), r9) // load rs_c lea(mem(, r9 , 8), r9) // rs_c *= sizeof(double) mov(var(k_left)0, r10) // load cs_c lea(mem(, r10, 8), r10) // cs_c *= sizeof(double) mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // iteration 0 prefetch(0, mem(rax, 64*8)) vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) vmovapd(mem(rbx, -2*32), ymm0) vmovapd(mem(rbx, -1*32), ymm1) // iteration 1 prefetch(0, mem(rax, 72*8)) vbroadcastsd(mem(rax, 6*8), ymm2) vbroadcastsd(mem(rax, 7*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 8*8), ymm2) vbroadcastsd(mem(rax, 9*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 10*8), ymm2) vbroadcastsd(mem(rax, 11*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) vmovapd(mem(rbx, 0*32), ymm0) vmovapd(mem(rbx, 1*32), ymm1) // iteration 2 prefetch(0, mem(rax, 80*8)) vbroadcastsd(mem(rax, 12*8), ymm2) vbroadcastsd(mem(rax, 13*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 14*8), ymm2) vbroadcastsd(mem(rax, 15*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 16*8), ymm2) vbroadcastsd(mem(rax, 17*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, 
ymm15) vmovapd(mem(rbx, 2*32), ymm0) vmovapd(mem(rbx, 3*32), ymm1) // iteration 3 vbroadcastsd(mem(rax, 18*8), ymm2) vbroadcastsd(mem(rax, 19*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 20*8), ymm2) vbroadcastsd(mem(rax, 21*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 22*8), ymm2) vbroadcastsd(mem(rax, 23*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) add(imm(4*6*8), rax) // a += 4*6 (unroll x mr) add(imm(4*8*8), rbx) // b += 4*8 (unroll x nr) vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP prefetch(0, mem(rax, 64*8)) vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) add(imm(1*6*8), rax) // a += 1*6 (unroll x mr) add(imm(1*8*8), rbx) // b += 1*8 (unroll x nr) vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4..ymm15 = -a10 * b01 mov(var(alpha), rbx) // load address of alpha vbroadcastsd(mem(rbx), ymm3) // load alpha and duplicate mov(imm(1), rsi) // set cs_b = 1 lea(mem(, rsi, 8), rsi) // cs_b *= sizeof(double) lea(mem(rcx, rsi, 4), rdx) // load address of b11 + 4*cs_b mov(rcx, r11) // save rcx = b11 for later mov(rdx, r14) // save rdx = b11+4*cs_b for later // b11 := alpha * b11 - a10 * b01 vfmsub231pd(mem(rcx), ymm3, ymm4) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm5) add(rdi, rdx) vfmsub231pd(mem(rcx), ymm3, ymm6) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm7) add(rdi, rdx) vfmsub231pd(mem(rcx), ymm3, ymm8) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm9) add(rdi, rdx) vfmsub231pd(mem(rcx), ymm3, ymm10) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm11) add(rdi, rdx) vfmsub231pd(mem(rcx), ymm3, ymm12) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm13) add(rdi, rdx) vfmsub231pd(mem(rcx), ymm3, ymm14) //add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm15) //add(rdi, rdx) // prefetch c11 #if 0 mov(r8, rcx) // load address of c11 from r8 // Note: r9 = rs_c * sizeof(double) lea(mem(r9 , r9 , 2), r13) // r13 = 3*rs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c11 + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c11 + 0*rs_c prefetch(0, mem(rcx, r9, 1, 7*8)) // prefetch c11 + 1*rs_c prefetch(0, mem(rcx, r9 , 2, 7*8)) // prefetch c11 + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c11 + 3*rs_c prefetch(0, mem(rdx, r9, 1, 7*8)) // prefetch c11 + 4*rs_c prefetch(0, mem(rdx, r9 , 2, 7*8)) // prefetch c11 + 5*rs_c #endif // trsm computation begins here // Note: contents of b11 are stored as // ymm4 ymm5 = ( beta00..03 ) ( beta04..07 ) // ymm6 ymm7 = ( beta10..13 ) ( beta14..17 ) // ymm8 ymm9 = ( beta20..23 ) ( beta24..27 ) // ymm10 ymm11 = ( beta30..33 ) ( beta34..37 ) // ymm12 ymm13 = ( beta40..43 ) ( beta44..47 ) // ymm14 ymm15 = ( beta50..53 ) ( beta54..57 ) mov(var(a11), rax) // load address of a11 mov(r11, rcx) // recall address of b11 mov(r14, rdx) // recall 
address of b11+4*cs_b // Note: rdi = rs_b // iteration 0 ------------- vbroadcastsd(mem(0+0*6)*8(rax), ymm0) // ymm0 = (1/alpha00) #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm0, ymm4, ymm4) // ymm4 *= (1/alpha00) vmulpd(ymm0, ymm5, ymm5) // ymm5 *= (1/alpha00) #else vdivpd(ymm0, ymm4, ymm4) // ymm4 /= alpha00 vdivpd(ymm0, ymm5, ymm5) // ymm5 /= alpha00 #endif vmovupd(ymm4, mem(rcx)) // store ( beta00..beta03 ) = ymm4 vmovupd(ymm5, mem(rdx)) // store ( beta04..beta07 ) = ymm5 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b // iteration 1 ------------- vbroadcastsd(mem(1+0*6)*8(rax), ymm0) // ymm0 = alpha10 vbroadcastsd(mem(1+1*6)*8(rax), ymm1) // ymm1 = (1/alpha11) vmulpd(ymm0, ymm4, ymm2) // ymm2 = alpha10 * ymm4 vmulpd(ymm0, ymm5, ymm3) // ymm3 = alpha10 * ymm5 vsubpd(ymm2, ymm6, ymm6) // ymm6 -= ymm2 vsubpd(ymm3, ymm7, ymm7) // ymm7 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm1, ymm6, ymm6) // ymm6 *= (1/alpha11) vmulpd(ymm1, ymm7, ymm7) // ymm7 *= (1/alpha11) #else vdivpd(ymm1, ymm6, ymm6) // ymm6 /= alpha11 vdivpd(ymm1, ymm7, ymm7) // ymm7 /= alpha11 #endif vmovupd(ymm6, mem(rcx)) // store ( beta10..beta13 ) = ymm6 vmovupd(ymm7, mem(rdx)) // store ( beta14..beta17 ) = ymm7 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b // iteration 2 ------------- vbroadcastsd(mem(2+0*6)*8(rax), ymm0) // ymm0 = alpha20 vbroadcastsd(mem(2+1*6)*8(rax), ymm1) // ymm1 = alpha21 vmulpd(ymm0, ymm4, ymm2) // ymm2 = alpha20 * ymm4 vmulpd(ymm0, ymm5, ymm3) // ymm3 = alpha20 * ymm5 vbroadcastsd(mem(2+2*6)*8(rax), ymm0) // ymm0 = (1/alpha22) vfmadd231pd(ymm1, ymm6, ymm2) // ymm2 += alpha21 * ymm6 vfmadd231pd(ymm1, ymm7, ymm3) // ymm3 += alpha21 * ymm7 vsubpd(ymm2, ymm8, ymm8) // ymm8 -= ymm2 vsubpd(ymm3, ymm9, ymm9) // ymm9 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm0, ymm8, ymm8) // ymm8 *= (1/alpha22) vmulpd(ymm0, ymm9, ymm9) // ymm9 *= (1/alpha22) #else vdivpd(ymm0, ymm8, ymm8) // ymm8 /= alpha22 vdivpd(ymm0, ymm9, ymm9) // ymm9 /= alpha22 
#endif vmovupd(ymm8, mem(rcx)) // store ( beta20..beta23 ) = ymm8 vmovupd(ymm9, mem(rdx)) // store ( beta24..beta27 ) = ymm9 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b // iteration 3 ------------- vbroadcastsd(mem(3+0*6)*8(rax), ymm0) // ymm0 = alpha30 vbroadcastsd(mem(3+1*6)*8(rax), ymm1) // ymm1 = alpha31 vmulpd(ymm0, ymm4, ymm2) // ymm2 = alpha30 * ymm4 vmulpd(ymm0, ymm5, ymm3) // ymm3 = alpha30 * ymm5 vbroadcastsd(mem(3+2*6)*8(rax), ymm0) // ymm0 = alpha32 vfmadd231pd(ymm1, ymm6, ymm2) // ymm2 += alpha31 * ymm6 vfmadd231pd(ymm1, ymm7, ymm3) // ymm3 += alpha31 * ymm7 vbroadcastsd(mem(3+3*6)*8(rax), ymm1) // ymm1 = (1/alpha33) vfmadd231pd(ymm0, ymm8, ymm2) // ymm2 += alpha32 * ymm8 vfmadd231pd(ymm0, ymm9, ymm3) // ymm3 += alpha32 * ymm9 vsubpd(ymm2, ymm10, ymm10) // ymm10 -= ymm2 vsubpd(ymm3, ymm11, ymm11) // ymm11 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm1, ymm10, ymm10) // ymm10 *= (1/alpha33) vmulpd(ymm1, ymm11, ymm11) // ymm11 *= (1/alpha33) #else vdivpd(ymm1, ymm10, ymm10) // ymm10 /= alpha33 vdivpd(ymm1, ymm11, ymm11) // ymm11 /= alpha33 #endif vmovupd(ymm10, mem(rcx)) // store ( beta30..beta33 ) = ymm10 vmovupd(ymm11, mem(rdx)) // store ( beta34..beta37 ) = ymm11 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b // iteration 4 ------------- vbroadcastsd(mem(4+0*6)*8(rax), ymm0) // ymm0 = alpha40 vbroadcastsd(mem(4+1*6)*8(rax), ymm1) // ymm1 = alpha41 vmulpd(ymm0, ymm4, ymm2) // ymm2 = alpha40 * ymm4 vmulpd(ymm0, ymm5, ymm3) // ymm3 = alpha40 * ymm5 vbroadcastsd(mem(4+2*6)*8(rax), ymm0) // ymm0 = alpha42 vfmadd231pd(ymm1, ymm6, ymm2) // ymm2 += alpha41 * ymm6 vfmadd231pd(ymm1, ymm7, ymm3) // ymm3 += alpha41 * ymm7 vbroadcastsd(mem(4+3*6)*8(rax), ymm1) // ymm1 = alpha43 vfmadd231pd(ymm0, ymm8, ymm2) // ymm2 += alpha42 * ymm8 vfmadd231pd(ymm0, ymm9, ymm3) // ymm3 += alpha42 * ymm9 vbroadcastsd(mem(4+4*6)*8(rax), ymm0) // ymm4 = (1/alpha44) vfmadd231pd(ymm1, ymm10, ymm2) // ymm2 += alpha43 * ymm10 vfmadd231pd(ymm1, ymm11, 
ymm3) // ymm3 += alpha43 * ymm11 vsubpd(ymm2, ymm12, ymm12) // ymm12 -= ymm2 vsubpd(ymm3, ymm13, ymm13) // ymm13 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm0, ymm12, ymm12) // ymm12 *= (1/alpha44) vmulpd(ymm0, ymm13, ymm13) // ymm13 *= (1/alpha44) #else vdivpd(ymm0, ymm12, ymm12) // ymm12 /= alpha44 vdivpd(ymm0, ymm13, ymm13) // ymm13 /= alpha44 #endif vmovupd(ymm12, mem(rcx)) // store ( beta40..beta43 ) = ymm12 vmovupd(ymm13, mem(rdx)) // store ( beta44..beta47 ) = ymm13 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b // iteration 5 ------------- vbroadcastsd(mem(5+0*6)*8(rax), ymm0) // ymm0 = alpha50 vbroadcastsd(mem(5+1*6)*8(rax), ymm1) // ymm1 = alpha51 vmulpd(ymm0, ymm4, ymm2) // ymm2 = alpha50 * ymm4 vmulpd(ymm0, ymm5, ymm3) // ymm3 = alpha50 * ymm5 vbroadcastsd(mem(5+2*6)*8(rax), ymm0) // ymm0 = alpha52 vfmadd231pd(ymm1, ymm6, ymm2) // ymm2 += alpha51 * ymm6 vfmadd231pd(ymm1, ymm7, ymm3) // ymm3 += alpha51 * ymm7 vbroadcastsd(mem(5+3*6)*8(rax), ymm1) // ymm1 = alpha53 vfmadd231pd(ymm0, ymm8, ymm2) // ymm2 += alpha52 * ymm8 vfmadd231pd(ymm0, ymm9, ymm3) // ymm3 += alpha52 * ymm9 vbroadcastsd(mem(5+4*6)*8(rax), ymm0) // ymm0 = alpha54 vfmadd231pd(ymm1, ymm10, ymm2) // ymm2 += alpha53 * ymm10 vfmadd231pd(ymm1, ymm11, ymm3) // ymm3 += alpha53 * ymm11 vbroadcastsd(mem(5+5*6)*8(rax), ymm1) // ymm1 = (1/alpha55) vfmadd231pd(ymm0, ymm12, ymm2) // ymm2 += alpha54 * ymm12 vfmadd231pd(ymm0, ymm13, ymm3) // ymm3 += alpha54 * ymm13 vsubpd(ymm2, ymm14, ymm14) // ymm14 -= ymm2 vsubpd(ymm3, ymm15, ymm15) // ymm15 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm1, ymm14, ymm14) // ymm14 *= (1/alpha55) vmulpd(ymm1, ymm15, ymm15) // ymm15 *= (1/alpha55) #else vdivpd(ymm1, ymm14, ymm14) // ymm14 /= alpha55 vdivpd(ymm1, ymm15, ymm15) // ymm15 /= alpha55 #endif vmovupd(ymm14, mem(rcx)) // store ( beta50..beta53 ) = ymm14 vmovupd(ymm15, mem(rdx)) // store ( beta54..beta57 ) = ymm15 add(rdi, rcx) // rcx += rs_b add(rdi, rdx) // rdx += rs_b mov(r8, rcx) 
// load address of c11 from r8 mov(r9, rdi) // load rs_c (in bytes) from r9 mov(r10, rsi) // load cs_c (in bytes) from r10 lea(mem(rcx, rsi, 4), rdx) // load address of c11 + 4*cs_c; lea(mem(rcx, rdi, 4), r14) // load address of c11 + 4*rs_c; // These are used in the macros below. lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; //lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; //lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. jz(.DROWSTORED) // jump to row storage case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case // if neither row- or column- // stored, use general case. label(.DGENSTORED) vmovapd(ymm4, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm6, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm8, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm10, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm12, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm14, ymm0) DGEMM_OUTPUT_GS_BETA_NZ mov(rdx, rcx) // rcx = c11 + 4*cs_c vmovapd(ymm5, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm7, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm9, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm11, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm13, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm15, ymm0) DGEMM_OUTPUT_GS_BETA_NZ jmp(.DDONE) label(.DROWSTORED) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rdx)) add(rdi, rdx) vmovupd(ymm6, mem(rcx)) add(rdi, rcx) vmovupd(ymm7, mem(rdx)) add(rdi, rdx) vmovupd(ymm8, mem(rcx)) add(rdi, rcx) vmovupd(ymm9, mem(rdx)) add(rdi, rdx) vmovupd(ymm10, mem(rcx)) add(rdi, rcx) vmovupd(ymm11, mem(rdx)) add(rdi, rdx) vmovupd(ymm12, mem(rcx)) add(rdi, rcx) vmovupd(ymm13, mem(rdx)) add(rdi, rdx) vmovupd(ymm14, mem(rcx)) //add(rdi, 
rcx) vmovupd(ymm15, mem(rdx)) //add(rdi, rdx) jmp(.DDONE) label(.DCOLSTORED) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, r13, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovupd(xmm0, mem(r14)) vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, mem(r14, rsi, 2)) vmovupd(xmm3, mem(r14, r13, 1)) lea(mem(r14, rsi, 4), r14) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, r13, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovupd(xmm0, mem(r14)) vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, mem(r14, rsi, 2)) vmovupd(xmm3, mem(r14, r13, 1)) //lea(mem(r14, rsi, 4), r14) label(.DDONE) vzeroupper() end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a10] "m" (a10), // 2 [b01] "m" (b01), // 3 [beta] "m" (beta), // 4 [alpha] "m" (alpha), // 5 [a11] "m" (a11), // 6 [b11] "m" (b11), // 7 [c11] "m" (c11), // 8 [rs_c] "m" (rs_c), // 9 [cs_c] "m" (cs_c) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", 
"xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMMTRSM_UKR_FLUSH_CT( d ); } cython-blis-0.9.1/blis/_src/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c000066400000000000000000001223121427272030600275040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" #define SGEMM_OUTPUT_GS_BETA_NZ \ vextractf128(imm(1), ymm0, xmm2) \ vmovss(xmm0, mem(rcx)) \ vpermilps(imm(0x39), xmm0, xmm1) \ vmovss(xmm1, mem(rcx, rsi, 1)) \ vpermilps(imm(0x39), xmm1, xmm0) \ vmovss(xmm0, mem(rcx, rsi, 2)) \ vpermilps(imm(0x39), xmm0, xmm1) \ vmovss(xmm1, mem(rcx, r13, 1)) \ vmovss(xmm2, mem(rcx, rsi, 4)) \ vpermilps(imm(0x39), xmm2, xmm1) \ vmovss(xmm1, mem(rcx, r15, 1)) \ vpermilps(imm(0x39), xmm1, xmm2) \ vmovss(xmm2, mem(rcx, r13, 2)) \ vpermilps(imm(0x39), xmm2, xmm1) \ vmovss(xmm1, mem(rcx, r10, 1)) void bli_sgemmtrsm_u_haswell_asm_6x16 ( dim_t m, dim_t n, dim_t k0, float* restrict alpha, float* restrict a10, float* restrict a11, float* restrict b01, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; float* beta = bli_sm1; GEMMTRSM_UKR_SETUP_CT_ANY( s, 6, 16, true ); begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a10), rax) // load address of a. mov(var(b01), rbx) // load address of b. add(imm(32*4), rbx) // initialize loop by pre-loading vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) mov(var(b11), rcx) // load address of b11 mov(imm(16), rdi) // set rs_b = PACKNR = 16 lea(mem(, rdi, 4), rdi) // rs_b *= sizeof(float) // NOTE: c11, rs_c, and cs_c aren't // needed for a while, but we load // them now to avoid stalling later. 
mov(var(c11), r8) // load address of c11 mov(var(rs_c), r9) // load rs_c lea(mem(, r9 , 4), r9) // rs_c *= sizeof(float) mov(var(k_left)0, r10) // load cs_c lea(mem(, r10, 4), r10) // cs_c *= sizeof(float) mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // iteration 0 prefetch(0, mem(rax, 64*4)) vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) vmovaps(mem(rbx, -2*32), ymm0) vmovaps(mem(rbx, -1*32), ymm1) // iteration 1 vbroadcastss(mem(rax, 6*4), ymm2) vbroadcastss(mem(rax, 7*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 8*4), ymm2) vbroadcastss(mem(rax, 9*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 10*4), ymm2) vbroadcastss(mem(rax, 11*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) vmovaps(mem(rbx, 0*32), ymm0) vmovaps(mem(rbx, 1*32), ymm1) // iteration 2 prefetch(0, mem(rax, 76*4)) vbroadcastss(mem(rax, 12*4), ymm2) vbroadcastss(mem(rax, 13*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 14*4), ymm2) 
vbroadcastss(mem(rax, 15*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 16*4), ymm2) vbroadcastss(mem(rax, 17*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) vmovaps(mem(rbx, 2*32), ymm0) vmovaps(mem(rbx, 3*32), ymm1) // iteration 3 vbroadcastss(mem(rax, 18*4), ymm2) vbroadcastss(mem(rax, 19*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 20*4), ymm2) vbroadcastss(mem(rax, 21*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 22*4), ymm2) vbroadcastss(mem(rax, 23*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) add(imm(4*6*4), rax) // a += 4*6 (unroll x mr) add(imm(4*16*4), rbx) // b += 4*16 (unroll x nr) vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP prefetch(0, mem(rax, 64*4)) vbroadcastss(mem(rax, 0*4), ymm2) vbroadcastss(mem(rax, 1*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 2*4), ymm2) vbroadcastss(mem(rax, 3*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, 4*4), ymm2) vbroadcastss(mem(rax, 5*4), ymm3) vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) add(imm(1*6*4), rax) // a += 1*6 (unroll x mr) add(imm(1*16*4), rbx) // b += 1*16 (unroll x nr) vmovaps(mem(rbx, -4*32), ymm0) vmovaps(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4..ymm15 = -a10 * b01 mov(var(alpha), rbx) // load address of alpha vbroadcastss(mem(rbx), ymm3) // load alpha and duplicate mov(imm(1), rsi) // load cs_b = 1 lea(mem(, rsi, 4), rsi) // cs_b *= sizeof(float) lea(mem(rcx, rsi, 8), rdx) // load address of b11 + 8*cs_b mov(rcx, r11) // save rcx = b11 for later mov(rdx, r14) // save rdx = b11+8*cs_b for later // b11 := alpha * b11 - a10 * b01 vfmsub231ps(mem(rcx), ymm3, ymm4) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm5) add(rdi, rdx) vfmsub231ps(mem(rcx), ymm3, ymm6) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm7) add(rdi, rdx) vfmsub231ps(mem(rcx), ymm3, ymm8) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm9) add(rdi, rdx) vfmsub231ps(mem(rcx), ymm3, ymm10) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm11) add(rdi, rdx) vfmsub231ps(mem(rcx), ymm3, ymm12) add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm13) add(rdi, rdx) vfmsub231ps(mem(rcx), ymm3, ymm14) //add(rdi, rcx) vfmsub231ps(mem(rdx), ymm3, ymm15) //add(rdi, rdx) // prefetch c11 #if 0 mov(r8, rcx) // load address of c11 from r8 // Note: r9 = rs_c * sizeof(float) lea(mem(r9 , r9 , 2), r13) // r13 = 
3*rs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c11 + 3*rs_c; prefetch(0, mem(rcx, 0*8)) // prefetch c11 + 0*rs_c prefetch(0, mem(rcx, r9, 1, 0*8)) // prefetch c11 + 1*rs_c prefetch(0, mem(rcx, r9 , 2, 0*8)) // prefetch c11 + 2*rs_c prefetch(0, mem(rdx, 0*8)) // prefetch c11 + 3*rs_c prefetch(0, mem(rdx, r9, 1, 0*8)) // prefetch c11 + 4*rs_c prefetch(0, mem(rdx, r9 , 2, 0*8)) // prefetch c11 + 5*rs_c #endif // trsm computation begins here // Note: contents of b11 are stored as // ymm4 ymm5 = ( beta00..07 ) ( beta08..0F ) // ymm6 ymm7 = ( beta10..17 ) ( beta18..1F ) // ymm8 ymm9 = ( beta20..27 ) ( beta28..2F ) // ymm10 ymm11 = ( beta30..37 ) ( beta38..3F ) // ymm12 ymm13 = ( beta40..47 ) ( beta48..4F ) // ymm14 ymm15 = ( beta50..57 ) ( beta58..5F ) mov(var(a11), rax) // load address of a11 mov(r11, rcx) // recall address of b11 mov(r14, rdx) // recall address of b11+8*cs_b lea(mem(rcx, rdi, 4), rcx) // rcx = b11 + (6-1)*rs_b lea(mem(rcx, rdi, 1), rcx) lea(mem(rdx, rdi, 4), rdx) // rdx = b11 + (6-1)*rs_b + 8*cs_b lea(mem(rdx, rdi, 1), rdx) // iteration 0 ------------- vbroadcastss(mem(5+5*6)*4(rax), ymm0) // ymm0 = (1/alpha55) #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm0, ymm14, ymm14) // ymm14 *= (1/alpha55) vmulps(ymm0, ymm15, ymm15) // ymm15 *= (1/alpha55) #else vdivps(ymm0, ymm14, ymm14) // ymm14 /= alpha55 vdivps(ymm0, ymm15, ymm15) // ymm15 /= alpha55 #endif vmovups(ymm14, mem(rcx)) // store ( beta50..beta57 ) = ymm14 vmovups(ymm15, mem(rdx)) // store ( beta58..beta5F ) = ymm15 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b // iteration 1 ------------- vbroadcastss(mem(4+5*6)*4(rax), ymm0) // ymm0 = alpha45 vbroadcastss(mem(4+4*6)*4(rax), ymm1) // ymm1 = (1/alpha44) vmulps(ymm0, ymm14, ymm2) // ymm2 = alpha45 * ymm14 vmulps(ymm0, ymm15, ymm3) // ymm3 = alpha45 * ymm15 vsubps(ymm2, ymm12, ymm12) // ymm12 -= ymm2 vsubps(ymm3, ymm13, ymm13) // ymm13 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm1, ymm12, ymm12) // ymm12 *= (1/alpha44) 
vmulps(ymm1, ymm13, ymm13) // ymm13 *= (1/alpha44) #else vdivps(ymm1, ymm12, ymm12) // ymm12 /= alpha44 vdivps(ymm1, ymm13, ymm13) // ymm13 /= alpha44 #endif vmovups(ymm12, mem(rcx)) // store ( beta40..beta47 ) = ymm12 vmovups(ymm13, mem(rdx)) // store ( beta48..beta4F ) = ymm13 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b // iteration 2 ------------- vbroadcastss(mem(3+5*6)*4(rax), ymm0) // ymm0 = alpha35 vbroadcastss(mem(3+4*6)*4(rax), ymm1) // ymm1 = alpha34 vmulps(ymm0, ymm14, ymm2) // ymm2 = alpha35 * ymm14 vmulps(ymm0, ymm15, ymm3) // ymm3 = alpha35 * ymm15 vbroadcastss(mem(3+3*6)*4(rax), ymm0) // ymm0 = (1/alpha33) vfmadd231ps(ymm1, ymm12, ymm2) // ymm2 += alpha34 * ymm12 vfmadd231ps(ymm1, ymm13, ymm3) // ymm3 += alpha34 * ymm13 vsubps(ymm2, ymm10, ymm10) // ymm10 -= ymm2 vsubps(ymm3, ymm11, ymm11) // ymm11 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm0, ymm10, ymm10) // ymm10 *= (1/alpha33) vmulps(ymm0, ymm11, ymm11) // ymm11 *= (1/alpha33) #else vdivps(ymm0, ymm10, ymm10) // ymm10 /= alpha33 vdivps(ymm0, ymm11, ymm11) // ymm11 /= alpha33 #endif vmovups(ymm10, mem(rcx)) // store ( beta30..beta37 ) = ymm10 vmovups(ymm11, mem(rdx)) // store ( beta38..beta3F ) = ymm11 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b // iteration 3 ------------- vbroadcastss(mem(2+5*6)*4(rax), ymm0) // ymm0 = alpha25 vbroadcastss(mem(2+4*6)*4(rax), ymm1) // ymm1 = alpha24 vmulps(ymm0, ymm14, ymm2) // ymm2 = alpha25 * ymm14 vmulps(ymm0, ymm15, ymm3) // ymm3 = alpha25 * ymm15 vbroadcastss(mem(2+3*6)*4(rax), ymm0) // ymm0 = alpha23 vfmadd231ps(ymm1, ymm12, ymm2) // ymm2 += alpha24 * ymm12 vfmadd231ps(ymm1, ymm13, ymm3) // ymm3 += alpha24 * ymm13 vbroadcastss(mem(2+2*6)*4(rax), ymm1) // ymm1 = (1/alpha22) vfmadd231ps(ymm0, ymm10, ymm2) // ymm2 += alpha23 * ymm10 vfmadd231ps(ymm0, ymm11, ymm3) // ymm3 += alpha23 * ymm11 vsubps(ymm2, ymm8, ymm8) // ymm8 -= ymm2 vsubps(ymm3, ymm9, ymm9) // ymm9 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm1, 
ymm8, ymm8) // ymm8 *= (1/alpha22) vmulps(ymm1, ymm9, ymm9) // ymm9 *= (1/alpha22) #else vdivps(ymm1, ymm8, ymm8) // ymm8 /= alpha22 vdivps(ymm1, ymm9, ymm9) // ymm9 /= alpha22 #endif vmovups(ymm8, mem(rcx)) // store ( beta20..beta27 ) = ymm8 vmovups(ymm9, mem(rdx)) // store ( beta28..beta2F ) = ymm9 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b // iteration 4 ------------- vbroadcastss(mem(1+5*6)*4(rax), ymm0) // ymm0 = alpha15 vbroadcastss(mem(1+4*6)*4(rax), ymm1) // ymm1 = alpha14 vmulps(ymm0, ymm14, ymm2) // ymm2 = alpha15 * ymm14 vmulps(ymm0, ymm15, ymm3) // ymm3 = alpha15 * ymm15 vbroadcastss(mem(1+3*6)*4(rax), ymm0) // ymm0 = alpha13 vfmadd231ps(ymm1, ymm12, ymm2) // ymm2 += alpha14 * ymm12 vfmadd231ps(ymm1, ymm13, ymm3) // ymm3 += alpha14 * ymm13 vbroadcastss(mem(1+2*6)*4(rax), ymm1) // ymm1 = alpha12 vfmadd231ps(ymm0, ymm10, ymm2) // ymm2 += alpha13 * ymm10 vfmadd231ps(ymm0, ymm11, ymm3) // ymm3 += alpha13 * ymm11 vbroadcastss(mem(1+1*6)*4(rax), ymm0) // ymm4 = (1/alpha11) vfmadd231ps(ymm1, ymm8, ymm2) // ymm2 += alpha12 * ymm8 vfmadd231ps(ymm1, ymm9, ymm3) // ymm3 += alpha12 * ymm9 vsubps(ymm2, ymm6, ymm6) // ymm6 -= ymm2 vsubps(ymm3, ymm7, ymm7) // ymm7 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm0, ymm6, ymm6) // ymm6 *= (1/alpha11) vmulps(ymm0, ymm7, ymm7) // ymm7 *= (1/alpha11) #else vdivps(ymm0, ymm6, ymm6) // ymm6 /= alpha11 vdivps(ymm0, ymm7, ymm7) // ymm7 /= alpha11 #endif vmovups(ymm6, mem(rcx)) // store ( beta10..beta17 ) = ymm6 vmovups(ymm7, mem(rdx)) // store ( beta18..beta1F ) = ymm7 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b // iteration 5 ------------- vbroadcastss(mem(0+5*6)*4(rax), ymm0) // ymm0 = alpha05 vbroadcastss(mem(0+4*6)*4(rax), ymm1) // ymm1 = alpha04 vmulps(ymm0, ymm14, ymm2) // ymm2 = alpha05 * ymm14 vmulps(ymm0, ymm15, ymm3) // ymm3 = alpha05 * ymm15 vbroadcastss(mem(0+3*6)*4(rax), ymm0) // ymm0 = alpha03 vfmadd231ps(ymm1, ymm12, ymm2) // ymm2 += alpha04 * ymm12 vfmadd231ps(ymm1, ymm13, 
ymm3) // ymm3 += alpha04 * ymm13 vbroadcastss(mem(0+2*6)*4(rax), ymm1) // ymm1 = alpha02 vfmadd231ps(ymm0, ymm10, ymm2) // ymm2 += alpha03 * ymm10 vfmadd231ps(ymm0, ymm11, ymm3) // ymm3 += alpha03 * ymm11 vbroadcastss(mem(0+1*6)*4(rax), ymm0) // ymm0 = alpha01 vfmadd231ps(ymm1, ymm8, ymm2) // ymm2 += alpha02 * ymm8 vfmadd231ps(ymm1, ymm9, ymm3) // ymm3 += alpha02 * ymm9 vbroadcastss(mem(0+0*6)*4(rax), ymm1) // ymm1 = (1/alpha00) vfmadd231ps(ymm0, ymm6, ymm2) // ymm2 += alpha01 * ymm6 vfmadd231ps(ymm0, ymm7, ymm3) // ymm3 += alpha01 * ymm7 vsubps(ymm2, ymm4, ymm4) // ymm4 -= ymm2 vsubps(ymm3, ymm5, ymm5) // ymm5 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulps(ymm1, ymm4, ymm4) // ymm4 *= (1/alpha00) vmulps(ymm1, ymm5, ymm5) // ymm5 *= (1/alpha00) #else vdivps(ymm1, ymm4, ymm4) // ymm4 /= alpha00 vdivps(ymm1, ymm5, ymm5) // ymm5 /= alpha00 #endif vmovups(ymm4, mem(rcx)) // store ( beta00..beta07 ) = ymm4 vmovups(ymm5, mem(rdx)) // store ( beta08..beta0F ) = ymm5 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b mov(r8, rcx) // load address of c11 from r8 mov(r9, rdi) // load rs_c (in bytes) from r9 mov(r10, rsi) // load cs_c (in bytes) from r10 lea(mem(rcx, rsi, 8), rdx) // load address of c11 + 8*cs_c; lea(mem(rcx, rdi, 4), r14) // load address of c11 + 4*rs_c; // These are used in the macros below. lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; cmp(imm(4), rsi) // set ZF if (4*cs_c) == 4. jz(.SROWSTORED) // jump to row storage case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case // if neither row- or column- // stored, use general case. 
label(.SGENSTORED) vmovaps(ymm4, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm6, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm8, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm10, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm12, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm14, ymm0) SGEMM_OUTPUT_GS_BETA_NZ mov(rdx, rcx) // rcx = c11 + 8*cs_c vmovaps(ymm5, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm7, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm9, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm11, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm13, ymm0) SGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovaps(ymm15, ymm0) SGEMM_OUTPUT_GS_BETA_NZ jmp(.SDONE) label(.SROWSTORED) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vmovups(ymm5, mem(rdx)) add(rdi, rdx) vmovups(ymm6, mem(rcx)) add(rdi, rcx) vmovups(ymm7, mem(rdx)) add(rdi, rdx) vmovups(ymm8, mem(rcx)) add(rdi, rcx) vmovups(ymm9, mem(rdx)) add(rdi, rdx) vmovups(ymm10, mem(rcx)) add(rdi, rcx) vmovups(ymm11, mem(rdx)) add(rdi, rdx) vmovups(ymm12, mem(rcx)) add(rdi, rcx) vmovups(ymm13, mem(rdx)) add(rdi, rdx) vmovups(ymm14, mem(rcx)) //add(rdi, rcx) vmovups(ymm15, mem(rdx)) //add(rdi, rdx) jmp(.SDONE) label(.SCOLSTORED) vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vmovups(xmm3, mem(rcx, r15, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) 
vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( gamma06..gamma36 ) vmovups(xmm3, mem(rcx, r10, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vunpckhps(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovlpd(xmm0, mem(r14)) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm1, mem(r14, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm1, mem(r14, r13, 1)) // store ( gamma43..gamma53 ) vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma45..gamma55 ) vmovlpd(xmm3, mem(r14, r13, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm3, mem(r14, r10, 1)) // store ( gamma47..gamma57 ) lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovups(xmm0, mem(rcx)) // store ( gamma08..gamma38 ) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma09..gamma39 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma0C..gamma3C ) vmovups(xmm3, mem(rcx, r15, 1)) // store ( gamma0D..gamma3D ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma0A..gamma3A ) vmovups(xmm1, mem(rcx, r13, 1)) // store ( gamma0B..gamma3B ) vmovups(xmm2, mem(rcx, r13, 2)) // store ( 
gamma0E..gamma3E ) vmovups(xmm3, mem(rcx, r10, 1)) // store ( gamma0F..gamma3F ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm15, ymm13, ymm0) vunpckhps(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovlpd(xmm0, mem(r14)) // store ( gamma48..gamma58 ) vmovhpd(xmm0, mem(r14, rsi, 1)) // store ( gamma49..gamma59 ) vmovlpd(xmm1, mem(r14, rsi, 2)) // store ( gamma4A..gamma5A ) vmovhpd(xmm1, mem(r14, r13, 1)) // store ( gamma4B..gamma5B ) vmovlpd(xmm2, mem(r14, rsi, 4)) // store ( gamma4C..gamma5C ) vmovhpd(xmm2, mem(r14, r15, 1)) // store ( gamma4D..gamma5D ) vmovlpd(xmm3, mem(r14, r13, 2)) // store ( gamma4E..gamma5E ) vmovhpd(xmm3, mem(r14, r10, 1)) // store ( gamma4F..gamma5F ) //lea(mem(r14, rsi, 8), r14) // r14 += 8*cs_c label(.SDONE) vzeroupper() end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a10] "m" (a10), // 2 [b01] "m" (b01), // 3 [beta] "m" (beta), // 4 [alpha] "m" (alpha), // 5 [a11] "m" (a11), // 6 [b11] "m" (b11), // 7 [c11] "m" (c11), // 8 [rs_c] "m" (rs_c), // 9 [cs_c] "m" (cs_c) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMMTRSM_UKR_FLUSH_CT( s ); } /* DGEMM_OUTPUT_GS_BETA_NZ: general-stride store of the four doubles held in ymm0. Element j is written to rcx + j*rsi for j = 0..3; the j = 3 address is formed as rcx + r13, so r13 must hold 3*rsi when the macro is expanded. xmm1 is overwritten with the upper 128 bits of ymm0. The second half of the macro, which would store four more doubles taken from ymm2 (using r15 and r10), is commented out, so only ymm0 is written. */ #define DGEMM_OUTPUT_GS_BETA_NZ \ vextractf128(imm(1), ymm0, xmm1) \ vmovlpd(xmm0, mem(rcx)) \ vmovhpd(xmm0, mem(rcx, rsi, 1)) \ vmovlpd(xmm1, mem(rcx, rsi, 2)) \ vmovhpd(xmm1, mem(rcx, r13, 1)) /*\ vextractf128(imm(1), ymm2, xmm1) \ vmovlpd(xmm2, mem(rcx, rsi, 4)) \ vmovhpd(xmm2, mem(rcx, r15, 1)) \ vmovlpd(xmm1, mem(rcx, r13, 2)) \ vmovhpd(xmm1, mem(rcx, r10, 1))*/ /* bli_dgemmtrsm_u_haswell_asm_6x8: fused double-precision gemm + trsm microkernel for a 6x8 microtile. The asm body accumulates the a10 * b01 product over k0 steps in ymm4..ymm15, forms b11 := alpha * b11 - a10 * b01, then back-substitutes against a11 from row 5 up to row 0. Each solved row is written back to b11 and finally to c11, using a row-stored, column-stored or general-stride store path selected from rs_c and cs_c. The diagonal entries of a11 are multiplied in when BLIS_ENABLE_TRSM_PREINVERSION is defined (they are then expected to hold reciprocals) and divided by otherwise. */ void bli_dgemmtrsm_u_haswell_asm_6x8 ( dim_t m, dim_t n, dim_t k0, double* restrict alpha, double* restrict a10, double* restrict a11, double* restrict b01, double* restrict b11, double*
restrict c11,
       inc_t               rs_c0,
       inc_t               cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*   a_next = bli_auxinfo_next_a( data );
	//void*   b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4; // trip count of the 4x-unrolled main loop
	uint64_t k_left = k0 % 4; // trip count of the single-step edge loop
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// Bound to the [beta] operand of the asm statement.
	double* beta = bli_dm1;

	GEMMTRSM_UKR_SETUP_CT_ANY( d, 6, 8, true );

	begin_asm()

	vzeroall() // zero all xmm/ymm registers.

	mov(var(a10), rax) // load address of a.
	mov(var(b01), rbx) // load address of b.

	// Bias rbx by 4*32 bytes so that the loads of b in the unrolled loop
	// can use the displacements -4*32 .. 3*32.
	add(imm(32*4), rbx)
	 // initialize loop by pre-loading
	vmovapd(mem(rbx, -4*32), ymm0)
	vmovapd(mem(rbx, -3*32), ymm1)

	mov(var(b11), rcx) // load address of b11
	mov(imm(8), rdi) // set rs_b = PACKNR = 8
	lea(mem(, rdi, 8), rdi) // rs_b *= sizeof(double)

	 // NOTE: c11, rs_c, and cs_c aren't
	 // needed for a while, but we load
	 // them now to avoid stalling later.
	mov(var(c11), r8) // load address of c11
	mov(var(rs_c), r9) // load rs_c
	lea(mem(, r9 , 8), r9) // rs_c *= sizeof(double)
	// r10 must hold cs_c (not k_left): after the trsm step it is copied
	// into rsi and used as the column stride of c11 by every store path.
	mov(var(cs_c), r10) // load cs_c
	lea(mem(, r10, 8), r10) // cs_c *= sizeof(double)

	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKLEFT) // if i == 0, jump to code that
	 // contains the k_left loop.
label(.DLOOPKITER) // MAIN LOOP // iteration 0 prefetch(0, mem(rax, 64*8)) vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) vmovapd(mem(rbx, -2*32), ymm0) vmovapd(mem(rbx, -1*32), ymm1) // iteration 1 prefetch(0, mem(rax, 72*8)) vbroadcastsd(mem(rax, 6*8), ymm2) vbroadcastsd(mem(rax, 7*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 8*8), ymm2) vbroadcastsd(mem(rax, 9*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 10*8), ymm2) vbroadcastsd(mem(rax, 11*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) vmovapd(mem(rbx, 0*32), ymm0) vmovapd(mem(rbx, 1*32), ymm1) // iteration 2 prefetch(0, mem(rax, 80*8)) vbroadcastsd(mem(rax, 12*8), ymm2) vbroadcastsd(mem(rax, 13*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 14*8), ymm2) vbroadcastsd(mem(rax, 15*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 16*8), ymm2) vbroadcastsd(mem(rax, 17*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, 
ymm15) vmovapd(mem(rbx, 2*32), ymm0) vmovapd(mem(rbx, 3*32), ymm1) // iteration 3 vbroadcastsd(mem(rax, 18*8), ymm2) vbroadcastsd(mem(rax, 19*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 20*8), ymm2) vbroadcastsd(mem(rax, 21*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 22*8), ymm2) vbroadcastsd(mem(rax, 23*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) add(imm(4*6*8), rax) // a += 4*6 (unroll x mr) add(imm(4*8*8), rbx) // b += 4*8 (unroll x nr) vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP prefetch(0, mem(rax, 64*8)) vbroadcastsd(mem(rax, 0*8), ymm2) vbroadcastsd(mem(rax, 1*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, 2*8), ymm2) vbroadcastsd(mem(rax, 3*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, 4*8), ymm2) vbroadcastsd(mem(rax, 5*8), ymm3) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) add(imm(1*6*8), rax) // a += 1*6 (unroll x mr) add(imm(1*8*8), rbx) // b += 1*8 (unroll x nr) vmovapd(mem(rbx, -4*32), ymm0) vmovapd(mem(rbx, -3*32), ymm1) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4..ymm15 = -a10 * b01 mov(var(alpha), rbx) // load address of alpha vbroadcastsd(mem(rbx), ymm3) // load alpha and duplicate mov(imm(1), rsi) // set cs_b = 1 lea(mem(, rsi, 8), rsi) // cs_b *= sizeof(double) lea(mem(rcx, rsi, 4), rdx) // load address of b11 + 4*cs_b mov(rcx, r11) // save rcx = b11 for later mov(rdx, r14) // save rdx = b11+4*cs_b for later // b11 := alpha * b11 - a10 * b01 vfmsub231pd(mem(rcx), ymm3, ymm4) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm5) add(rdi, rdx) vfmsub231pd(mem(rcx), ymm3, ymm6) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm7) add(rdi, rdx) vfmsub231pd(mem(rcx), ymm3, ymm8) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm9) add(rdi, rdx) vfmsub231pd(mem(rcx), ymm3, ymm10) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm11) add(rdi, rdx) vfmsub231pd(mem(rcx), ymm3, ymm12) add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm13) add(rdi, rdx) vfmsub231pd(mem(rcx), ymm3, ymm14) //add(rdi, rcx) vfmsub231pd(mem(rdx), ymm3, ymm15) //add(rdi, rdx) // prefetch c11 #if 0 mov(r8, rcx) // load address of c11 from r8 // Note: r9 = rs_c * sizeof(double) lea(mem(r9 , r9 , 2), r13) // r13 = 3*rs_c; lea(mem(rcx, r13, 1), rdx) // rdx = c11 + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c11 + 0*rs_c prefetch(0, mem(rcx, r9, 1, 7*8)) // prefetch c11 + 1*rs_c prefetch(0, mem(rcx, r9 , 2, 7*8)) // prefetch c11 + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c11 + 3*rs_c prefetch(0, mem(rdx, r9, 1, 7*8)) // prefetch c11 + 4*rs_c prefetch(0, mem(rdx, r9 , 2, 7*8)) // prefetch c11 + 5*rs_c #endif // trsm computation begins here // Note: contents of b11 are stored as // ymm4 ymm5 = ( beta00..03 ) ( beta04..07 ) // ymm6 ymm7 = ( beta10..13 ) ( beta14..17 ) // ymm8 ymm9 = ( beta20..23 ) ( beta24..27 ) // ymm10 ymm11 = ( beta30..33 ) ( beta34..37 ) // ymm12 ymm13 = ( beta40..43 ) ( beta44..47 ) // ymm14 ymm15 = ( beta50..53 ) ( beta54..57 ) mov(var(a11), rax) // load address of a11 mov(r11, rcx) // recall address of b11 mov(r14, rdx) // recall 
address of b11+4*cs_b lea(mem(rcx, rdi, 4), rcx) // rcx = b11 + (6-1)*rs_b lea(mem(rcx, rdi, 1), rcx) lea(mem(rdx, rdi, 4), rdx) // rdx = b11 + (6-1)*rs_b + 4*cs_b lea(mem(rdx, rdi, 1), rdx) // iteration 0 ------------- vbroadcastsd(mem(5+5*6)*8(rax), ymm0) // ymm0 = (1/alpha55) #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm0, ymm14, ymm14) // ymm14 *= (1/alpha55) vmulpd(ymm0, ymm15, ymm15) // ymm15 *= (1/alpha55) #else vdivpd(ymm0, ymm14, ymm14) // ymm14 /= alpha55 vdivpd(ymm0, ymm15, ymm15) // ymm15 /= alpha55 #endif vmovupd(ymm14, mem(rcx)) // store ( beta50..beta53 ) = ymm14 vmovupd(ymm15, mem(rdx)) // store ( beta54..beta57 ) = ymm15 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b // iteration 1 ------------- vbroadcastsd(mem(4+5*6)*8(rax), ymm0) // ymm0 = alpha45 vbroadcastsd(mem(4+4*6)*8(rax), ymm1) // ymm1 = (1/alpha44) vmulpd(ymm0, ymm14, ymm2) // ymm2 = alpha45 * ymm14 vmulpd(ymm0, ymm15, ymm3) // ymm3 = alpha45 * ymm15 vsubpd(ymm2, ymm12, ymm12) // ymm12 -= ymm2 vsubpd(ymm3, ymm13, ymm13) // ymm13 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm1, ymm12, ymm12) // ymm12 *= (1/alpha44) vmulpd(ymm1, ymm13, ymm13) // ymm13 *= (1/alpha44) #else vdivpd(ymm1, ymm12, ymm12) // ymm12 /= alpha44 vdivpd(ymm1, ymm13, ymm13) // ymm13 /= alpha44 #endif vmovupd(ymm12, mem(rcx)) // store ( beta40..beta43 ) = ymm12 vmovupd(ymm13, mem(rdx)) // store ( beta44..beta47 ) = ymm13 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b // iteration 2 ------------- vbroadcastsd(mem(3+5*6)*8(rax), ymm0) // ymm0 = alpha35 vbroadcastsd(mem(3+4*6)*8(rax), ymm1) // ymm1 = alpha34 vmulpd(ymm0, ymm14, ymm2) // ymm2 = alpha35 * ymm14 vmulpd(ymm0, ymm15, ymm3) // ymm3 = alpha35 * ymm15 vbroadcastsd(mem(3+3*6)*8(rax), ymm0) // ymm0 = (1/alpha33) vfmadd231pd(ymm1, ymm12, ymm2) // ymm2 += alpha34 * ymm12 vfmadd231pd(ymm1, ymm13, ymm3) // ymm3 += alpha34 * ymm13 vsubpd(ymm2, ymm10, ymm10) // ymm10 -= ymm2 vsubpd(ymm3, ymm11, ymm11) // ymm11 -= ymm3 #ifdef 
BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm0, ymm10, ymm10) // ymm10 *= (1/alpha33) vmulpd(ymm0, ymm11, ymm11) // ymm11 *= (1/alpha33) #else vdivpd(ymm0, ymm10, ymm10) // ymm10 /= alpha33 vdivpd(ymm0, ymm11, ymm11) // ymm11 /= alpha33 #endif vmovupd(ymm10, mem(rcx)) // store ( beta30..beta33 ) = ymm10 vmovupd(ymm11, mem(rdx)) // store ( beta34..beta37 ) = ymm11 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b // iteration 3 ------------- vbroadcastsd(mem(2+5*6)*8(rax), ymm0) // ymm0 = alpha25 vbroadcastsd(mem(2+4*6)*8(rax), ymm1) // ymm1 = alpha24 vmulpd(ymm0, ymm14, ymm2) // ymm2 = alpha25 * ymm14 vmulpd(ymm0, ymm15, ymm3) // ymm3 = alpha25 * ymm15 vbroadcastsd(mem(2+3*6)*8(rax), ymm0) // ymm0 = alpha23 vfmadd231pd(ymm1, ymm12, ymm2) // ymm2 += alpha24 * ymm12 vfmadd231pd(ymm1, ymm13, ymm3) // ymm3 += alpha24 * ymm13 vbroadcastsd(mem(2+2*6)*8(rax), ymm1) // ymm1 = (1/alpha22) vfmadd231pd(ymm0, ymm10, ymm2) // ymm2 += alpha23 * ymm10 vfmadd231pd(ymm0, ymm11, ymm3) // ymm3 += alpha23 * ymm11 vsubpd(ymm2, ymm8, ymm8) // ymm8 -= ymm2 vsubpd(ymm3, ymm9, ymm9) // ymm9 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm1, ymm8, ymm8) // ymm8 *= (1/alpha22) vmulpd(ymm1, ymm9, ymm9) // ymm9 *= (1/alpha22) #else vdivpd(ymm1, ymm8, ymm8) // ymm8 /= alpha22 vdivpd(ymm1, ymm9, ymm9) // ymm9 /= alpha22 #endif vmovupd(ymm8, mem(rcx)) // store ( beta20..beta23 ) = ymm8 vmovupd(ymm9, mem(rdx)) // store ( beta24..beta27 ) = ymm9 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b // iteration 4 ------------- vbroadcastsd(mem(1+5*6)*8(rax), ymm0) // ymm0 = alpha15 vbroadcastsd(mem(1+4*6)*8(rax), ymm1) // ymm1 = alpha14 vmulpd(ymm0, ymm14, ymm2) // ymm2 = alpha15 * ymm14 vmulpd(ymm0, ymm15, ymm3) // ymm3 = alpha15 * ymm15 vbroadcastsd(mem(1+3*6)*8(rax), ymm0) // ymm0 = alpha13 vfmadd231pd(ymm1, ymm12, ymm2) // ymm2 += alpha14 * ymm12 vfmadd231pd(ymm1, ymm13, ymm3) // ymm3 += alpha14 * ymm13 vbroadcastsd(mem(1+2*6)*8(rax), ymm1) // ymm1 = alpha12 vfmadd231pd(ymm0, 
ymm10, ymm2) // ymm2 += alpha13 * ymm10 vfmadd231pd(ymm0, ymm11, ymm3) // ymm3 += alpha13 * ymm11 vbroadcastsd(mem(1+1*6)*8(rax), ymm0) // ymm0 = (1/alpha11) vfmadd231pd(ymm1, ymm8, ymm2) // ymm2 += alpha12 * ymm8 vfmadd231pd(ymm1, ymm9, ymm3) // ymm3 += alpha12 * ymm9 vsubpd(ymm2, ymm6, ymm6) // ymm6 -= ymm2 vsubpd(ymm3, ymm7, ymm7) // ymm7 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm0, ymm6, ymm6) // ymm6 *= (1/alpha11) vmulpd(ymm0, ymm7, ymm7) // ymm7 *= (1/alpha11) #else vdivpd(ymm0, ymm6, ymm6) // ymm6 /= alpha11 vdivpd(ymm0, ymm7, ymm7) // ymm7 /= alpha11 #endif vmovupd(ymm6, mem(rcx)) // store ( beta10..beta13 ) = ymm6 vmovupd(ymm7, mem(rdx)) // store ( beta14..beta17 ) = ymm7 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b // iteration 5 ------------- vbroadcastsd(mem(0+5*6)*8(rax), ymm0) // ymm0 = alpha05 vbroadcastsd(mem(0+4*6)*8(rax), ymm1) // ymm1 = alpha04 vmulpd(ymm0, ymm14, ymm2) // ymm2 = alpha05 * ymm14 vmulpd(ymm0, ymm15, ymm3) // ymm3 = alpha05 * ymm15 vbroadcastsd(mem(0+3*6)*8(rax), ymm0) // ymm0 = alpha03 vfmadd231pd(ymm1, ymm12, ymm2) // ymm2 += alpha04 * ymm12 vfmadd231pd(ymm1, ymm13, ymm3) // ymm3 += alpha04 * ymm13 vbroadcastsd(mem(0+2*6)*8(rax), ymm1) // ymm1 = alpha02 vfmadd231pd(ymm0, ymm10, ymm2) // ymm2 += alpha03 * ymm10 vfmadd231pd(ymm0, ymm11, ymm3) // ymm3 += alpha03 * ymm11 vbroadcastsd(mem(0+1*6)*8(rax), ymm0) // ymm0 = alpha01 vfmadd231pd(ymm1, ymm8, ymm2) // ymm2 += alpha02 * ymm8 vfmadd231pd(ymm1, ymm9, ymm3) // ymm3 += alpha02 * ymm9 vbroadcastsd(mem(0+0*6)*8(rax), ymm1) // ymm1 = (1/alpha00) vfmadd231pd(ymm0, ymm6, ymm2) // ymm2 += alpha01 * ymm6 vfmadd231pd(ymm0, ymm7, ymm3) // ymm3 += alpha01 * ymm7 vsubpd(ymm2, ymm4, ymm4) // ymm4 -= ymm2 vsubpd(ymm3, ymm5, ymm5) // ymm5 -= ymm3 #ifdef BLIS_ENABLE_TRSM_PREINVERSION vmulpd(ymm1, ymm4, ymm4) // ymm4 *= (1/alpha00) vmulpd(ymm1, ymm5, ymm5) // ymm5 *= (1/alpha00) #else vdivpd(ymm1, ymm4, ymm4) // ymm4 /= alpha00 vdivpd(ymm1, ymm5, ymm5) // ymm5 /= alpha00 
#endif vmovupd(ymm4, mem(rcx)) // store ( beta00..beta03 ) = ymm4 vmovupd(ymm5, mem(rdx)) // store ( beta04..beta07 ) = ymm5 sub(rdi, rcx) // rcx -= rs_b sub(rdi, rdx) // rdx -= rs_b mov(r8, rcx) // load address of c11 from r8 mov(r9, rdi) // load rs_c (in bytes) from r9 mov(r10, rsi) // load cs_c (in bytes) from r10 lea(mem(rcx, rsi, 4), rdx) // load address of c11 + 4*cs_c; lea(mem(rcx, rdi, 4), r14) // load address of c11 + 4*rs_c; // These are used in the macros below. lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c; //lea(mem(rsi, rsi, 4), r15) // r15 = 5*cs_c; //lea(mem(r13, rsi, 4), r10) // r10 = 7*cs_c; cmp(imm(8), rsi) // set ZF if (8*cs_c) == 8. jz(.DROWSTORED) // jump to row storage case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case // if neither row- or column- // stored, use general case. label(.DGENSTORED) vmovapd(ymm4, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm6, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm8, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm10, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm12, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm14, ymm0) DGEMM_OUTPUT_GS_BETA_NZ mov(rdx, rcx) // rcx = c11 + 4*cs_c vmovapd(ymm5, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm7, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm9, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm11, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm13, ymm0) DGEMM_OUTPUT_GS_BETA_NZ add(rdi, rcx) // c11 += rs_c; vmovapd(ymm15, ymm0) DGEMM_OUTPUT_GS_BETA_NZ jmp(.DDONE) label(.DROWSTORED) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rdx)) add(rdi, rdx) vmovupd(ymm6, mem(rcx)) add(rdi, rcx) vmovupd(ymm7, mem(rdx)) add(rdi, rdx) vmovupd(ymm8, mem(rcx)) add(rdi, rcx) vmovupd(ymm9, mem(rdx)) add(rdi, 
rdx) vmovupd(ymm10, mem(rcx)) add(rdi, rcx) vmovupd(ymm11, mem(rdx)) add(rdi, rdx) vmovupd(ymm12, mem(rcx)) add(rdi, rcx) vmovupd(ymm13, mem(rdx)) add(rdi, rdx) vmovupd(ymm14, mem(rcx)) //add(rdi, rcx) vmovupd(ymm15, mem(rdx)) //add(rdi, rdx) jmp(.DDONE) label(.DCOLSTORED) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, r13, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovupd(xmm0, mem(r14)) vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, mem(r14, rsi, 2)) vmovupd(xmm3, mem(r14, r13, 1)) lea(mem(r14, rsi, 4), r14) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, r13, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm3) vmovupd(xmm0, mem(r14)) vmovupd(xmm1, mem(r14, rsi, 1)) vmovupd(xmm2, mem(r14, rsi, 2)) vmovupd(xmm3, mem(r14, r13, 1)) //lea(mem(r14, rsi, 4), r14) label(.DDONE) vzeroupper() end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a10] "m" (a10), // 2 [b01] "m" (b01), // 3 [beta] "m" (beta), // 4 [alpha] "m" (alpha), // 5 [a11] "m" (a11), // 6 [b11] "m" (b11), // 7 [c11] "m" (c11), // 8 [rs_c] "m" (rs_c), // 
9 [cs_c] "m" (cs_c) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMMTRSM_UKR_FLUSH_CT( d ); } cython-blis-0.9.1/blis/_src/kernels/haswell/3/old/000077500000000000000000000000001427272030600216605ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/haswell/3/old/bli_gemm_haswell_asm_d12x4.c000066400000000000000000002063101427272030600271020ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" /* SGEMM_INPUT_GS_BETA_NZ: general-stride load of eight floats into ymm0. The elements are read from rcx, rcx+rsi, rcx+2*rsi, rcx+r13, rcx+4*rsi, rcx+r15, rcx+2*r13 and rcx+r10; xmm1 and xmm2 are used as scratch. NOTE(review): this presumably expects r13 = 3*rsi, r15 = 5*rsi and r10 = 7*rsi, set up by the kernel that expands the macro -- confirm there. */ #define SGEMM_INPUT_GS_BETA_NZ \ "vmovlps (%%rcx ), %%xmm0, %%xmm0 \n\t" \ "vmovhps (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ "vmovlps (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ "vmovhps (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" \ "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ /* We can't use vmovhps for loading the last element because that might result in reading beyond valid memory. (vmov[lh]psd load pairs of adjacent floats at a time.) So we need to use vmovss instead. But since we're limited to using ymm0 through ymm2 (ymm3 contains beta and ymm4 through ymm15 contain the microtile) and due to the way vmovss zeros out all bits above 31, we have to load element 7 before element 6. 
*/ \ "vmovss (%%rcx,%%r10 ), %%xmm1 \n\t" \ "vpermilps $0xcf, %%xmm1, %%xmm1 \n\t" \ "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ /*"vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t"*/ \ "vshufps $0x88, %%xmm1, %%xmm2, %%xmm2 \n\t" \ "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" /* SGEMM_OUTPUT_GS_BETA_NZ: general-stride store of the eight floats held in ymm0. Elements are written one at a time with vmovss to rcx, rcx+rsi, rcx+2*rsi, rcx+r13, rcx+4*rsi, rcx+r15, rcx+2*r13 and rcx+r10; vpermilps $0x39 rotates the next element into the low lane between stores. xmm0, xmm1 and xmm2 are overwritten. NOTE(review): presumably relies on the same r13/r15/r10 stride multiples as SGEMM_INPUT_GS_BETA_NZ -- confirm in the calling kernel. */ #define SGEMM_OUTPUT_GS_BETA_NZ \ "vextractf128 $1, %%ymm0, %%xmm2 \n\t" \ "vmovss %%xmm0, (%%rcx ) \n\t" \ "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%rsi,1) \n\t" \ "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" \ "vmovss %%xmm0, (%%rcx,%%rsi,2) \n\t" \ "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%r13 ) \n\t" \ "vmovss %%xmm2, (%%rcx,%%rsi,4) \n\t" \ "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%r15 ) \n\t" \ "vpermilps $0x39, %%xmm1, %%xmm2 \n\t" \ "vmovss %%xmm2, (%%rcx,%%r13,2) \n\t" \ "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%r10 ) \n\t" void bli_sgemm_haswell_asm_24x4 ( dim_t k0, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; __asm__ volatile ( " \n\t" "vzeroall \n\t" // zero all xmm/ymm registers. " \n\t" " \n\t" "movq %2, %%rax \n\t" // load address of a. "movq %3, %%rbx \n\t" // load address of b. //"movq %9, %%r15 \n\t" // load address of b_next. 
" \n\t" " \n\t" // initialize loop by pre-loading "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" "vmovaps 2 * 32(%%rax), %%ymm2 \n\t" " \n\t" "movq %6, %%rcx \n\t" // load address of c "movq %8, %%rdi \n\t" // load cs_c "leaq (,%%rdi,4), %%rdi \n\t" // cs_c *= sizeof(float) " \n\t" "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*cs_c; "prefetcht0 7 * 4(%%rcx) \n\t" // prefetch c + 0*cs_c "prefetcht0 7 * 4(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c "prefetcht0 7 * 4(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c "prefetcht0 7 * 4(%%rcx,%%r13) \n\t" // prefetch c + 3*cs_c " \n\t" " \n\t" " \n\t" " \n\t" "movq %0, %%rsi \n\t" // i = k_iter; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that " \n\t" // contains the k_left loop. " \n\t" " \n\t" ".SLOOPKITER: \n\t" // MAIN LOOP " \n\t" " \n\t" " \n\t" // iteration 0 "prefetcht0 16 * 32(%%rax) \n\t" " \n\t" "vbroadcastss 0 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastss 1 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastss 2 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastss 3 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovaps 3 * 32(%%rax), %%ymm0 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovaps 4 * 32(%%rax), %%ymm1 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovaps 5 * 32(%%rax), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" // iteration 1 "vbroadcastss 4 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastss 5 * 
4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastss 6 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastss 7 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovaps 6 * 32(%%rax), %%ymm0 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovaps 7 * 32(%%rax), %%ymm1 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovaps 8 * 32(%%rax), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" // iteration 2 "prefetcht0 22 * 32(%%rax) \n\t" " \n\t" "vbroadcastss 8 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastss 9 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastss 10 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastss 11 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovaps 9 * 32(%%rax), %%ymm0 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovaps 10 * 32(%%rax), %%ymm1 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovaps 11 * 32(%%rax), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" // iteration 3 "vbroadcastss 12 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastss 13 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastss 14 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 
\n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastss 15 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovaps 12 * 32(%%rax), %%ymm0 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovaps 13 * 32(%%rax), %%ymm1 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovaps 14 * 32(%%rax), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" "addq $4 * 24 * 4, %%rax \n\t" // a += 4*24 (unroll x mr) "addq $4 * 4 * 4, %%rbx \n\t" // b += 4*4 (unroll x nr) " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .SLOOPKITER \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".SCONSIDKLEFT: \n\t" " \n\t" "movq %1, %%rsi \n\t" // i = k_left; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end. " \n\t" // else, we prepare to enter k_left loop. " \n\t" " \n\t" ".SLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" "prefetcht0 16 * 32(%%rax) \n\t" " \n\t" "vbroadcastss 0 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastss 1 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastss 2 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastss 3 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovaps 3 * 32(%%rax), %%ymm0 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovaps 4 * 32(%%rax), %%ymm1 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovaps 5 * 32(%%rax), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" "addq $1 * 24 * 4, %%rax \n\t" // a += 1*24 (unroll x mr) "addq $1 * 4 * 4, %%rbx \n\t" // b += 1*4 (unroll x nr) " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne 
.SLOOPKLEFT \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" ".SPOSTACCUM: \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %4, %%rax \n\t" // load address of alpha "movq %5, %%rbx \n\t" // load address of beta "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha and duplicate "vbroadcastss (%%rbx), %%ymm3 \n\t" // load beta and duplicate " \n\t" "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" "vmulps %%ymm0, %%ymm6, %%ymm6 \n\t" "vmulps %%ymm0, %%ymm7, %%ymm7 \n\t" "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t" "vmulps %%ymm0, %%ymm11, %%ymm11 \n\t" "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" "vmulps %%ymm0, %%ymm14, %%ymm14 \n\t" "vmulps %%ymm0, %%ymm15, %%ymm15 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %7, %%rsi \n\t" // load rs_c "leaq (,%%rsi,4), %%rsi \n\t" // rsi = rs_c * sizeof(float) " \n\t" "leaq (%%rcx,%%rsi,8), %%rdx \n\t" // rdx = c + 8*rs_c; "leaq (%%rdx,%%rsi,8), %%r12 \n\t" // r12 = c + 16*rs_c; " \n\t" "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*rs_c; "leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*rs_c; "leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*rs_c; " \n\t" " \n\t" " \n\t" " \n\t" // determine if " \n\t" // c % 32 == 0, AND " \n\t" // 4*cs_c % 32 == 0, AND " \n\t" // rs_c == 1 " \n\t" // ie: aligned, ldim aligned, and " \n\t" // column-stored " \n\t" "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4. "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); "testq $31, %%rcx \n\t" // set ZF if (c & 31) is zero, i.e. c is 32-byte aligned. "setz %%bh \n\t" // bh = ( ZF == 1 ? 1 : 0 ); "testq $31, %%rdi \n\t" // set ZF if ((4*cs_c) & 31) is zero. "setz %%al \n\t" // al = ( ZF == 1 ? 1 : 0 ); " \n\t" // and(bl,bh) followed by " \n\t" // and(bh,al) will reveal result " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
"vucomiss %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" " \n\t" // check if aligned/column-stored "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. "jne .SCOLSTORED \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" ".SGENSTORED: \n\t" " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm4, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm7, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm10, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm13, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 8*rs_c " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm5, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm8, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm11, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm14, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "movq %%r12, %%rcx \n\t" // rcx = c + 16*rs_c " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm6, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm9, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps 
%%ymm12, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm15, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "jmp .SDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".SCOLSTORED: \n\t" " \n\t" " \n\t" "vmovaps (%%rcx), %%ymm0 \n\t" "vfmadd213ps %%ymm4, %%ymm3, %%ymm0 \n\t" "vmovaps %%ymm0, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovaps (%%rdx), %%ymm1 \n\t" "vfmadd213ps %%ymm5, %%ymm3, %%ymm1 \n\t" "vmovaps %%ymm1, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovaps (%%r12), %%ymm2 \n\t" "vfmadd213ps %%ymm6, %%ymm3, %%ymm2 \n\t" "vmovaps %%ymm2, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" "vmovaps (%%rcx), %%ymm0 \n\t" "vfmadd213ps %%ymm7, %%ymm3, %%ymm0 \n\t" "vmovaps %%ymm0, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovaps (%%rdx), %%ymm1 \n\t" "vfmadd213ps %%ymm8, %%ymm3, %%ymm1 \n\t" "vmovaps %%ymm1, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovaps (%%r12), %%ymm2 \n\t" "vfmadd213ps %%ymm9, %%ymm3, %%ymm2 \n\t" "vmovaps %%ymm2, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" "vmovaps (%%rcx), %%ymm0 \n\t" "vfmadd213ps %%ymm10, %%ymm3, %%ymm0 \n\t" "vmovaps %%ymm0, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovaps (%%rdx), %%ymm1 \n\t" "vfmadd213ps %%ymm11, %%ymm3, %%ymm1 \n\t" "vmovaps %%ymm1, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovaps (%%r12), %%ymm2 \n\t" "vfmadd213ps %%ymm12, %%ymm3, %%ymm2 \n\t" "vmovaps %%ymm2, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" "vmovaps (%%rcx), %%ymm0 \n\t" "vfmadd213ps %%ymm13, %%ymm3, %%ymm0 \n\t" "vmovaps %%ymm0, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovaps (%%rdx), %%ymm1 \n\t" "vfmadd213ps %%ymm14, %%ymm3, %%ymm1 \n\t" "vmovaps %%ymm1, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovaps (%%r12), %%ymm2 \n\t" "vfmadd213ps %%ymm15, %%ymm3, %%ymm2 \n\t" "vmovaps %%ymm2, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" " \n\t" 
" \n\t" " \n\t" "jmp .SDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".SBETAZERO: \n\t" " \n\t" // check if aligned/column-stored "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. "jne .SCOLSTORBZ \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" ".SGENSTORBZ: \n\t" " \n\t" " \n\t" "vmovaps %%ymm4, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm7, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm10, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm13, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 8*rs_c " \n\t" " \n\t" "vmovaps %%ymm5, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm8, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm11, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm14, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "movq %%r12, %%rcx \n\t" // rcx = c + 16*rs_c " \n\t" " \n\t" "vmovaps %%ymm6, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm9, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm12, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm15, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "jmp .SDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".SCOLSTORBZ: \n\t" " \n\t" " \n\t" "vmovaps %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovaps %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovaps %%ymm6, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" "vmovaps %%ymm7, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovaps %%ymm8, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovaps %%ymm9, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" "vmovaps %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovaps %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovaps %%ymm12, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" "vmovaps %%ymm13, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" "vmovaps %%ymm14, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" "vmovaps %%ymm15, (%%r12) \n\t" //"addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".SDONE: \n\t" " \n\t" "vzeroupper \n\t" " \n\t" : // output operands (none) : // input operands "m" (k_iter), // 0 "m" (k_left), // 1 "m" (a), // 2 "m" (b), // 3 "m" (alpha), // 4 "m" (beta), // 5 "m" (c), // 6 "m" (rs_c), // 7 "m" (cs_c)/*, // 8 "m" (b_next), // 9 "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); } #define DGEMM_INPUT_GS_BETA_NZ \ "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /*\ "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ "vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ "vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/ #define DGEMM_OUTPUT_GS_BETA_NZ \ "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \ "vmovlpd %%xmm0, (%%rcx 
) \n\t" \ "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /*\ "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ void bli_dgemm_haswell_asm_12x4 ( dim_t k0, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; __asm__ volatile ( " \n\t" "vzeroall \n\t" // zero all xmm/ymm registers. " \n\t" " \n\t" "movq %2, %%rax \n\t" // load address of a. "movq %3, %%rbx \n\t" // load address of b. //"movq %9, %%r15 \n\t" // load address of b_next. " \n\t" " \n\t" // initialize loop by pre-loading "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" "vmovapd 2 * 32(%%rax), %%ymm2 \n\t" " \n\t" "movq %6, %%rcx \n\t" // load address of c "movq %8, %%rdi \n\t" // load cs_c "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double) " \n\t" "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*cs_c; "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c "prefetcht0 7 * 8(%%rcx,%%r13) \n\t" // prefetch c + 3*cs_c " \n\t" " \n\t" " \n\t" " \n\t" "movq %0, %%rsi \n\t" // i = k_iter; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that " \n\t" // contains the k_left loop. 
" \n\t" " \n\t" ".DLOOPKITER: \n\t" // MAIN LOOP " \n\t" " \n\t" " \n\t" // iteration 0 "prefetcht0 16 * 32(%%rax) \n\t" " \n\t" "vbroadcastsd 0 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastsd 2 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovapd 3 * 32(%%rax), %%ymm0 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovapd 4 * 32(%%rax), %%ymm1 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovapd 5 * 32(%%rax), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" // iteration 1 "vbroadcastsd 4 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastsd 5 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastsd 6 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastsd 7 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovapd 6 * 32(%%rax), %%ymm0 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovapd 7 * 32(%%rax), %%ymm1 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovapd 8 * 32(%%rax), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" // iteration 2 "prefetcht0 22 * 32(%%rax) \n\t" " \n\t" "vbroadcastsd 8 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm3, 
%%ymm5 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastsd 9 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastsd 10 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastsd 11 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovapd 9 * 32(%%rax), %%ymm0 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovapd 10 * 32(%%rax), %%ymm1 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovapd 11 * 32(%%rax), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" // iteration 3 "vbroadcastsd 12 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastsd 13 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastsd 14 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastsd 15 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovapd 12 * 32(%%rax), %%ymm0 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovapd 13 * 32(%%rax), %%ymm1 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovapd 14 * 32(%%rax), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" "addq $4 * 12 * 8, %%rax \n\t" // a += 4*12 (unroll x mr) "addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr) " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .DLOOPKITER \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".DCONSIDKLEFT: \n\t" " \n\t" "movq %1, %%rsi \n\t" // i = k_left; "testq %%rsi, %%rsi \n\t" // check i via logical AND. 
"je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. " \n\t" // else, we prepare to enter k_left loop. " \n\t" " \n\t" ".DLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" "prefetcht0 16 * 32(%%rax) \n\t" " \n\t" "vbroadcastsd 0 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastsd 2 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovapd 3 * 32(%%rax), %%ymm0 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovapd 4 * 32(%%rax), %%ymm1 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovapd 5 * 32(%%rax), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" "addq $1 * 12 * 8, %%rax \n\t" // a += 1*12 (unroll x mr) "addq $1 * 4 * 8, %%rbx \n\t" // b += 1*4 (unroll x nr) " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. 
" \n\t" " \n\t" " \n\t" ".DPOSTACCUM: \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %4, %%rax \n\t" // load address of alpha "movq %5, %%rbx \n\t" // load address of beta "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate " \n\t" "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %7, %%rsi \n\t" // load rs_c "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(double) " \n\t" "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // rdx = c + 4*rs_c; "leaq (%%rcx,%%rsi,8), %%r12 \n\t" // r12 = c + 8*rs_c; " \n\t" "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*rs_c; //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*rs_c; //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*rs_c; " \n\t" " \n\t" " \n\t" " \n\t" // determine if " \n\t" // c % 32 == 0, AND " \n\t" // 8*cs_c % 32 == 0, AND " \n\t" // rs_c == 1 " \n\t" // ie: aligned, ldim aligned, and " \n\t" // column-stored " \n\t" "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 ); "testq $31, %%rcx \n\t" // set ZF if (c & 31) is zero, i.e. c is 32-byte aligned. "setz %%bh \n\t" // bh = ( ZF == 1 ? 1 : 0 ); "testq $31, %%rdi \n\t" // set ZF if ((8*cs_c) & 31) is zero. "setz %%al \n\t" // al = ( ZF == 1 ? 1 : 0 ); " \n\t" // and(bl,bh) followed by " \n\t" // and(bh,al) will reveal result " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0.
"je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" " \n\t" // check if aligned/column-stored "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. "jne .DCOLSTORED \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" ".DGENSTORED: \n\t" " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "movq %%r12, %%rcx \n\t" // rcx = c + 8*rs_c " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq 
%%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "jmp .DDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".DCOLSTORED: \n\t" " \n\t" " \n\t" "vmovapd (%%rcx), %%ymm0 \n\t" "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" "vmovapd %%ymm0, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovapd (%%rdx), %%ymm1 \n\t" "vfmadd213pd %%ymm5, %%ymm3, %%ymm1 \n\t" "vmovapd %%ymm1, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovapd (%%r12), %%ymm2 \n\t" "vfmadd213pd %%ymm6, %%ymm3, %%ymm2 \n\t" "vmovapd %%ymm2, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" "vmovapd (%%rcx), %%ymm0 \n\t" "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" "vmovapd %%ymm0, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovapd (%%rdx), %%ymm1 \n\t" "vfmadd213pd %%ymm8, %%ymm3, %%ymm1 \n\t" "vmovapd %%ymm1, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovapd (%%r12), %%ymm2 \n\t" "vfmadd213pd %%ymm9, %%ymm3, %%ymm2 \n\t" "vmovapd %%ymm2, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" "vmovapd (%%rcx), %%ymm0 \n\t" "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" "vmovapd %%ymm0, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovapd (%%rdx), %%ymm1 \n\t" "vfmadd213pd %%ymm11, %%ymm3, %%ymm1 \n\t" "vmovapd %%ymm1, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovapd (%%r12), %%ymm2 \n\t" "vfmadd213pd %%ymm12, %%ymm3, %%ymm2 \n\t" "vmovapd %%ymm2, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" "vmovapd (%%rcx), %%ymm0 \n\t" "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" "vmovapd %%ymm0, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovapd (%%rdx), %%ymm1 \n\t" "vfmadd213pd %%ymm14, %%ymm3, %%ymm1 \n\t" "vmovapd %%ymm1, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovapd (%%r12), %%ymm2 \n\t" "vfmadd213pd %%ymm15, %%ymm3, %%ymm2 \n\t" "vmovapd %%ymm2, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "jmp .DDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".DBETAZERO: \n\t" " \n\t" // check if aligned/column-stored "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1. "andb %%bh, %%al \n\t" // set ZF if bh & al == 1. "jne .DCOLSTORBZ \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" ".DGENSTORBZ: \n\t" " \n\t" " \n\t" "vmovapd %%ymm4, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm7, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm10, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm13, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c " \n\t" " \n\t" "vmovapd %%ymm5, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm8, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm11, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm14, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "movq %%r12, %%rcx \n\t" // rcx = c + 8*rs_c " \n\t" " \n\t" "vmovapd %%ymm6, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm9, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm12, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm15, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "jmp .DDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".DCOLSTORBZ: \n\t" " \n\t" " \n\t" "vmovapd %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovapd %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovapd %%ymm6, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" "vmovapd %%ymm7, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovapd %%ymm8, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovapd %%ymm9, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" "vmovapd %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovapd %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovapd %%ymm12, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" "vmovapd %%ymm13, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" "vmovapd %%ymm14, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" "vmovapd %%ymm15, (%%r12) \n\t" //"addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".DDONE: \n\t" " \n\t" "vzeroupper \n\t" " \n\t" : // output operands (none) : // input operands "m" (k_iter), // 0 "m" (k_left), // 1 "m" (a), // 2 "m" (b), // 3 "m" (alpha), // 4 "m" (beta), // 5 "m" (c), // 6 "m" (rs_c), // 7 "m" (cs_c)/*, // 8 "m" (b_next), // 9 "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); } #if 0 void bli_cgemm_haswell_asm_ ( dim_t k0, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; } void bli_zgemm_haswell_asm_ ( dim_t k0, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; } #endif cython-blis-0.9.1/blis/_src/kernels/haswell/3/old/bli_gemm_haswell_asm_d4x12.c000066400000000000000000001755501427272030600271150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define SGEMM_INPUT_GS_BETA_NZ \ "vmovlps (%%rcx ), %%xmm0, %%xmm0 \n\t" \ "vmovhps (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ "vmovlps (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ "vmovhps (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" \ "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ /* We can't use vmovhps for loading the last element because that might result in reading beyond valid memory. (vmov[lh]psd load pairs of adjacent floats at a time.) So we need to use vmovss instead. But since we're limited to using ymm0 through ymm2 (ymm3 contains beta and ymm4 through ymm15 contain the microtile) and due to the way vmovss zeros out all bits above 31, we have to load element 7 before element 6. 
*/ \ "vmovss (%%rcx,%%r10 ), %%xmm1 \n\t" \ "vpermilps $0xcf, %%xmm1, %%xmm1 \n\t" \ "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ /*"vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t"*/ \ "vshufps $0x88, %%xmm1, %%xmm2, %%xmm2 \n\t" \ "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" #define SGEMM_OUTPUT_GS_BETA_NZ \ "vextractf128 $1, %%ymm0, %%xmm2 \n\t" \ "vmovss %%xmm0, (%%rcx ) \n\t" \ "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%rsi,1) \n\t" \ "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" \ "vmovss %%xmm0, (%%rcx,%%rsi,2) \n\t" \ "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%r13 ) \n\t" \ "vmovss %%xmm2, (%%rcx,%%rsi,4) \n\t" \ "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%r15 ) \n\t" \ "vpermilps $0x39, %%xmm1, %%xmm2 \n\t" \ "vmovss %%xmm2, (%%rcx,%%r13,2) \n\t" \ "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%r10 ) \n\t" void bli_sgemm_haswell_asm_4x24 ( dim_t k0, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; __asm__ volatile ( " \n\t" "vzeroall \n\t" // zero all xmm/ymm registers. " \n\t" " \n\t" "movq %2, %%rax \n\t" // load address of a. "movq %3, %%rbx \n\t" // load address of b. //"movq %9, %%r15 \n\t" // load address of b_next. 
" \n\t" " \n\t" // initialize loop by pre-loading "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t" "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" "vmovaps 2 * 32(%%rbx), %%ymm2 \n\t" " \n\t" "movq %6, %%rcx \n\t" // load address of c "movq %7, %%rdi \n\t" // load rs_c "leaq (,%%rdi,4), %%rdi \n\t" // rs_c *= sizeof(float) " \n\t" "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*rs_c; "prefetcht0 7 * 4(%%rcx) \n\t" // prefetch c + 0*rs_c "prefetcht0 7 * 4(%%rcx,%%rdi) \n\t" // prefetch c + 1*rs_c "prefetcht0 7 * 4(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*rs_c "prefetcht0 7 * 4(%%rcx,%%r13) \n\t" // prefetch c + 3*rs_c " \n\t" " \n\t" " \n\t" " \n\t" "movq %0, %%rsi \n\t" // i = k_iter; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that " \n\t" // contains the k_left loop. " \n\t" " \n\t" ".SLOOPKITER: \n\t" // MAIN LOOP " \n\t" " \n\t" " \n\t" // iteration 0 "prefetcht0 16 * 32(%%rax) \n\t" " \n\t" "vbroadcastss 0 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastss 2 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovaps 3 * 32(%%rbx), %%ymm0 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovaps 4 * 32(%%rbx), %%ymm1 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovaps 5 * 32(%%rbx), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" // iteration 1 "vbroadcastss 4 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastss 5 * 
4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastss 6 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastss 7 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovaps 6 * 32(%%rbx), %%ymm0 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovaps 7 * 32(%%rbx), %%ymm1 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovaps 8 * 32(%%rbx), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" // iteration 2 "prefetcht0 22 * 32(%%rax) \n\t" " \n\t" "vbroadcastss 8 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastss 9 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastss 10 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastss 11 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovaps 9 * 32(%%rbx), %%ymm0 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovaps 10 * 32(%%rbx), %%ymm1 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovaps 11 * 32(%%rbx), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" // iteration 3 "vbroadcastss 12 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastss 13 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastss 14 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 
\n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastss 15 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovaps 12 * 32(%%rbx), %%ymm0 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovaps 13 * 32(%%rbx), %%ymm1 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovaps 14 * 32(%%rbx), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" "addq $4 * 4 * 4, %%rax \n\t" // a += 4*4 (unroll x mr) "addq $4 * 24 * 4, %%rbx \n\t" // b += 4*24 (unroll x nr) " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .SLOOPKITER \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".SCONSIDKLEFT: \n\t" " \n\t" "movq %1, %%rsi \n\t" // i = k_left; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end. " \n\t" // else, we prepare to enter k_left loop. " \n\t" " \n\t" ".SLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" "prefetcht0 16 * 32(%%rax) \n\t" " \n\t" "vbroadcastss 0 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastss 2 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovaps 3 * 32(%%rbx), %%ymm0 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovaps 4 * 32(%%rbx), %%ymm1 \n\t" "vfmadd231ps %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovaps 5 * 32(%%rbx), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" "addq $1 * 4 * 4, %%rax \n\t" // a += 1*4 (unroll x mr) "addq $1 * 24 * 4, %%rbx \n\t" // b += 1*24 (unroll x nr) " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne 
.SLOOPKLEFT \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" ".SPOSTACCUM: \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %4, %%rax \n\t" // load address of alpha "movq %5, %%rbx \n\t" // load address of beta "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha and duplicate "vbroadcastss (%%rbx), %%ymm3 \n\t" // load beta and duplicate " \n\t" "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" "vmulps %%ymm0, %%ymm6, %%ymm6 \n\t" "vmulps %%ymm0, %%ymm7, %%ymm7 \n\t" "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t" "vmulps %%ymm0, %%ymm11, %%ymm11 \n\t" "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" "vmulps %%ymm0, %%ymm14, %%ymm14 \n\t" "vmulps %%ymm0, %%ymm15, %%ymm15 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %8, %%rsi \n\t" // load cs_c "leaq (,%%rsi,4), %%rsi \n\t" // rsi = cs_c * sizeof(float) " \n\t" "leaq (%%rcx,%%rsi,8), %%rdx \n\t" // rdx = c + 8*cs_c; "leaq (%%rdx,%%rsi,8), %%r12 \n\t" // r12 = c + 16*cs_c; " \n\t" "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; "leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c; "leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; " \n\t" " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. "vucomiss %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. 
"jz .SROWSTORED \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" ".SGENSTORED: \n\t" " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm4, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm7, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm10, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm13, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 8*cs_c " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm5, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm8, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm11, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm14, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "movq %%r12, %%rcx \n\t" // rcx = c + 16*cs_c " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm6, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm9, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm12, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm15, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" 
"jmp .SDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".SROWSTORED: \n\t" " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm4 \n\t" "vmovups %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm5 \n\t" "vmovups %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vfmadd231ps (%%r12), %%ymm3, %%ymm6 \n\t" "vmovups %%ymm6, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm7 \n\t" "vmovups %%ymm7, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm8 \n\t" "vmovups %%ymm8, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vfmadd231ps (%%r12), %%ymm3, %%ymm9 \n\t" "vmovups %%ymm9, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm10 \n\t" "vmovups %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm11 \n\t" "vmovups %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vfmadd231ps (%%r12), %%ymm3, %%ymm12 \n\t" "vmovups %%ymm12, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm13 \n\t" "vmovups %%ymm13, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm14 \n\t" "vmovups %%ymm14, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" "vfmadd231ps (%%r12), %%ymm3, %%ymm15 \n\t" "vmovups %%ymm15, (%%r12) \n\t" //"addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" " \n\t" "jmp .SDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".SBETAZERO: \n\t" " \n\t" "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. 
"jz .SROWSTORBZ \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" ".SGENSTORBZ: \n\t" " \n\t" " \n\t" "vmovaps %%ymm4, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm7, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm10, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm13, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 8*cs_c " \n\t" " \n\t" "vmovaps %%ymm5, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm8, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm11, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm14, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "movq %%r12, %%rcx \n\t" // rcx = c + 16*cs_c " \n\t" " \n\t" "vmovaps %%ymm6, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm9, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm12, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm15, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "jmp .SDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".SROWSTORBZ: \n\t" " \n\t" " \n\t" "vmovups %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovups %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovups %%ymm6, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" "vmovups %%ymm7, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovups %%ymm8, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovups %%ymm9, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" "vmovups %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovups %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovups %%ymm12, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" "vmovups %%ymm13, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" "vmovups %%ymm14, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" "vmovups %%ymm15, (%%r12) \n\t" //"addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".SDONE: \n\t" " \n\t" "vzeroupper \n\t" " \n\t" : // output operands (none) : // input operands "m" (k_iter), // 0 "m" (k_left), // 1 "m" (a), // 2 "m" (b), // 3 "m" (alpha), // 4 "m" (beta), // 5 "m" (c), // 6 "m" (rs_c), // 7 "m" (cs_c)/*, // 8 "m" (b_next), // 9 "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); } #define DGEMM_INPUT_GS_BETA_NZ \ "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /*\ "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ "vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ "vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/ #define DGEMM_OUTPUT_GS_BETA_NZ \ "vextractf128 $1, %%ymm0, %%xmm1 \n\t" \ "vmovlpd %%xmm0, (%%rcx 
) \n\t" \ "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /*\ "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ void bli_dgemm_haswell_asm_4x12 ( dim_t k0, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; __asm__ volatile ( " \n\t" "vzeroall \n\t" // zero all xmm/ymm registers. " \n\t" " \n\t" "movq %2, %%rax \n\t" // load address of a. "movq %3, %%rbx \n\t" // load address of b. //"movq %9, %%r15 \n\t" // load address of b_next. " \n\t" " \n\t" // initialize loop by pre-loading "vmovapd 0 * 32(%%rbx), %%ymm0 \n\t" "vmovapd 1 * 32(%%rbx), %%ymm1 \n\t" "vmovapd 2 * 32(%%rbx), %%ymm2 \n\t" " \n\t" "movq %6, %%rcx \n\t" // load address of c "movq %7, %%rdi \n\t" // load rs_c "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(double) " \n\t" "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*rs_c; "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*rs_c "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*rs_c "prefetcht0 7 * 8(%%rcx,%%r13) \n\t" // prefetch c + 3*rs_c " \n\t" " \n\t" " \n\t" " \n\t" "movq %0, %%rsi \n\t" // i = k_iter; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that " \n\t" // contains the k_left loop. 
" \n\t" " \n\t" ".DLOOPKITER: \n\t" // MAIN LOOP " \n\t" " \n\t" " \n\t" // iteration 0 "prefetcht0 16 * 32(%%rax) \n\t" " \n\t" "vbroadcastsd 0 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastsd 2 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovapd 3 * 32(%%rbx), %%ymm0 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovapd 4 * 32(%%rbx), %%ymm1 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovapd 5 * 32(%%rbx), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" // iteration 1 "vbroadcastsd 4 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastsd 6 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovapd 6 * 32(%%rbx), %%ymm0 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovapd 7 * 32(%%rbx), %%ymm1 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovapd 8 * 32(%%rbx), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" // iteration 2 "prefetcht0 22 * 32(%%rax) \n\t" " \n\t" "vbroadcastsd 8 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm3, 
%%ymm5 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastsd 10 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovapd 9 * 32(%%rbx), %%ymm0 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovapd 10 * 32(%%rbx), %%ymm1 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovapd 11 * 32(%%rbx), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" // iteration 3 "vbroadcastsd 12 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastsd 14 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovapd 12 * 32(%%rbx), %%ymm0 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovapd 13 * 32(%%rbx), %%ymm1 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovapd 14 * 32(%%rbx), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" "addq $4 * 4 * 8, %%rax \n\t" // a += 4*4 (unroll x mr) "addq $4 * 12 * 8, %%rbx \n\t" // b += 4*12 (unroll x nr) " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .DLOOPKITER \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".DCONSIDKLEFT: \n\t" " \n\t" "movq %1, %%rsi \n\t" // i = k_left; "testq %%rsi, %%rsi \n\t" // check i via logical AND. 
"je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. " \n\t" // else, we prepare to enter k_left loop. " \n\t" " \n\t" ".DLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" "prefetcht0 16 * 32(%%rax) \n\t" " \n\t" "vbroadcastsd 0 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm5 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm6 \n\t" " \n\t" "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm7 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm8 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm9 \n\t" " \n\t" "vbroadcastsd 2 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm12 \n\t" " \n\t" "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" "vmovapd 3 * 32(%%rbx), %%ymm0 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm14 \n\t" "vmovapd 4 * 32(%%rbx), %%ymm1 \n\t" "vfmadd231pd %%ymm2, %%ymm3, %%ymm15 \n\t" "vmovapd 5 * 32(%%rbx), %%ymm2 \n\t" " \n\t" " \n\t" " \n\t" "addq $1 * 4 * 8, %%rax \n\t" // a += 1*4 (unroll x mr) "addq $1 * 12 * 8, %%rbx \n\t" // b += 1*12 (unroll x nr) " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. 
" \n\t" " \n\t" " \n\t" ".DPOSTACCUM: \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %4, %%rax \n\t" // load address of alpha "movq %5, %%rbx \n\t" // load address of beta "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate " \n\t" "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %8, %%rsi \n\t" // load cs_c "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(double) " \n\t" "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // rdx = c + 4*cs_c; "leaq (%%rcx,%%rsi,8), %%r12 \n\t" // r12 = c + 8*cs_c; " \n\t" "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c; //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; " \n\t" " \n\t" " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. 
"jz .DROWSTORED \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" ".DGENSTORED: \n\t" " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "movq %%r12, %%rcx \n\t" // rcx = c + 8*cs_c " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" 
"jmp .DDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".DROWSTORED: \n\t" " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm4 \n\t" "vmovupd %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm5 \n\t" "vmovupd %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vfmadd231pd (%%r12), %%ymm3, %%ymm6 \n\t" "vmovupd %%ymm6, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm7 \n\t" "vmovupd %%ymm7, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm8 \n\t" "vmovupd %%ymm8, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vfmadd231pd (%%r12), %%ymm3, %%ymm9 \n\t" "vmovupd %%ymm9, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm10 \n\t" "vmovupd %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm11 \n\t" "vmovupd %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vfmadd231pd (%%r12), %%ymm3, %%ymm12 \n\t" "vmovupd %%ymm12, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm13 \n\t" "vmovupd %%ymm13, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm14 \n\t" "vmovupd %%ymm14, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" "vfmadd231pd (%%r12), %%ymm3, %%ymm15 \n\t" "vmovupd %%ymm15, (%%r12) \n\t" //"addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" " \n\t" "jmp .DDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".DBETAZERO: \n\t" " \n\t" // check if aligned/column-stored "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. 
"jz .DROWSTORBZ \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" ".DGENSTORBZ: \n\t" " \n\t" " \n\t" "vmovapd %%ymm4, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm7, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm10, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm13, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c " \n\t" " \n\t" "vmovapd %%ymm5, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm8, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm11, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm14, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "movq %%r12, %%rcx \n\t" // rcx = c + 8*cs_c " \n\t" " \n\t" "vmovapd %%ymm6, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm9, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm12, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm15, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "jmp .DDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".DROWSTORBZ: \n\t" " \n\t" " \n\t" "vmovupd %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovupd %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovupd %%ymm6, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" "vmovupd %%ymm7, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovupd %%ymm8, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovupd %%ymm9, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" "vmovupd %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovupd %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" "vmovupd %%ymm12, (%%r12) \n\t" "addq %%rdi, %%r12 \n\t" " \n\t" "vmovupd %%ymm13, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" "vmovupd %%ymm14, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" "vmovupd %%ymm15, (%%r12) \n\t" //"addq %%rdi, %%r12 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".DDONE: \n\t" " \n\t" "vzeroupper \n\t" " \n\t" : // output operands (none) : // input operands "m" (k_iter), // 0 "m" (k_left), // 1 "m" (a), // 2 "m" (b), // 3 "m" (alpha), // 4 "m" (beta), // 5 "m" (c), // 6 "m" (rs_c), // 7 "m" (cs_c)/*, // 8 "m" (b_next), // 9 "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); } /* The two complex-precision kernels between #if 0 and #endif are compiled out. Their bodies only derive k_iter (k0 / 4), k_left (k0 % 4), rs_c and cs_c from the arguments; they contain no assembly and perform no computation. */ #if 0 void bli_cgemm_haswell_asm_ ( dim_t k0, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; } void bli_zgemm_haswell_asm_ ( dim_t k0, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; } #endif cython-blis-0.9.1/blis/_src/kernels/haswell/3/old/bli_gemm_haswell_asm_d6x8.c000066400000000000000000003420721427272030600270370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" /* SGEMM_INPUT_GS_BETA_NZ: asm fragment that gathers eight floats from (%rcx) at byte offsets 0, rsi, 2*rsi, r13, 4*rsi, r15, 2*r13 and r10 into ymm0, using xmm1 and xmm2 as scratch. The sgemm kernel in this file sets rsi = cs_c*sizeof(float), r13 = 3*rsi, r15 = 5*rsi and r10 = 7*rsi before expanding it, so the eight offsets are columns 0..7 of one row of C. */ #define SGEMM_INPUT_GS_BETA_NZ \ "vmovlps (%%rcx ), %%xmm0, %%xmm0 \n\t" \ "vmovhps (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ "vmovlps (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ "vmovhps (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" \ "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ /* We can't use vmovhps for loading the last element because that might result in reading beyond valid memory. (vmov[lh]psd load pairs of adjacent floats at a time.) So we need to use vmovss instead. But since we're limited to using ymm0 through ymm2 (ymm3 contains beta and ymm4 through ymm15 contain the microtile) and due to the way vmovss zeros out all bits above 31, we have to load element 7 before element 6. 
*/ \ "vmovss (%%rcx,%%r10 ), %%xmm1 \n\t" \ "vpermilps $0xcf, %%xmm1, %%xmm1 \n\t" \ "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ /*"vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t"*/ \ "vshufps $0x88, %%xmm1, %%xmm2, %%xmm2 \n\t" \ "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" /* SGEMM_OUTPUT_GS_BETA_NZ: asm fragment that scatters the eight floats held in ymm0 to (%rcx) at byte offsets 0, rsi, 2*rsi, r13, 4*rsi, r15, 2*r13 and r10, one vmovss per element. Each vpermilps $0x39 rotates the next element into lane 0. xmm0, xmm1 and xmm2 are overwritten. */ #define SGEMM_OUTPUT_GS_BETA_NZ \ "vextractf128 $1, %%ymm0, %%xmm2 \n\t" \ "vmovss %%xmm0, (%%rcx ) \n\t" \ "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%rsi,1) \n\t" \ "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" \ "vmovss %%xmm0, (%%rcx,%%rsi,2) \n\t" \ "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%r13 ) \n\t" \ "vmovss %%xmm2, (%%rcx,%%rsi,4) \n\t" \ "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%r15 ) \n\t" \ "vpermilps $0x39, %%xmm1, %%xmm2 \n\t" \ "vmovss %%xmm2, (%%rcx,%%r13,2) \n\t" \ "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%r10 ) \n\t" /* bli_sgemm_haswell_asm_6x16: single-precision gemm micro-kernel written as one inline-assembly block. Each k step broadcasts 6 floats from a and multiplies them with 16 floats (two ymm loads) from b, accumulating a 6x16 tile in ymm4-ymm15 (row i, columns 0-7 in ymm(4+2i) and columns 8-15 in ymm(5+2i)). The main loop is unrolled four times (k0 / 4 iterations) and an edge loop handles the remaining k0 % 4 steps. The tile is then scaled by *alpha and written to c as beta*C + tile. When *beta == 0, C is not read and the tile is stored directly. In both cases cs_c0 == 1 selects whole-vector accesses to C, and any other column stride goes through the SGEMM_INPUT_GS_BETA_NZ / SGEMM_OUTPUT_GS_BETA_NZ element-wise macros. rs_c0 and cs_c0 are strides in elements; the asm multiplies them by sizeof(float). b is read with vmovaps, which requires 32-byte alignment. data and cntx are not referenced by the active code. */ void bli_sgemm_haswell_asm_6x16 ( dim_t k0, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; __asm__ volatile ( " \n\t" "vzeroall \n\t" // zero all xmm/ymm registers. " \n\t" " \n\t" "movq %2, %%rax \n\t" // load address of a. "movq %3, %%rbx \n\t" // load address of b. //"movq %9, %%r15 \n\t" // load address of b_next. 
" \n\t" "addq $32 * 4, %%rbx \n\t" /* bias b by 128 bytes; the b loads below use offsets -4*32 .. 3*32 relative to it */ " \n\t" // initialize loop by pre-loading "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" " \n\t" "movq %6, %%rcx \n\t" // load address of c "movq %7, %%rdi \n\t" // load rs_c "leaq (,%%rdi,4), %%rdi \n\t" // rs_c *= sizeof(float) " \n\t" "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*rs_c; "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c + 3*rs_c; "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*rs_c "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*rs_c "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c + 3*rs_c "prefetcht0 7 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 4*rs_c "prefetcht0 7 * 8(%%rdx,%%rdi,2) \n\t" // prefetch c + 5*rs_c " \n\t" " \n\t" " \n\t" " \n\t" "movq %0, %%rsi \n\t" // i = k_iter; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that " \n\t" // contains the k_left loop. " \n\t" " \n\t" ".SLOOPKITER: \n\t" // MAIN LOOP " \n\t" " \n\t" " \n\t" // iteration 0 "prefetcht0 64 * 4(%%rax) \n\t" " \n\t" "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovaps -2 * 32(%%rbx), %%ymm0 \n\t" "vmovaps -1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 
"vbroadcastss 6 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 7 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastss 8 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 9 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastss 10 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 11 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovaps 0 * 32(%%rbx), %%ymm0 \n\t" "vmovaps 1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 2 "prefetcht0 76 * 4(%%rax) \n\t" " \n\t" "vbroadcastss 12 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 13 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastss 14 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 15 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastss 16 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 17 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovaps 2 * 32(%%rbx), %%ymm0 \n\t" "vmovaps 3 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 3 "vbroadcastss 18 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 19 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231ps 
%%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastss 20 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 21 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastss 22 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 23 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "addq $4 * 6 * 4, %%rax \n\t" // a += 4*6 (unroll x mr) "addq $4 * 16 * 4, %%rbx \n\t" // b += 4*16 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .SLOOPKITER \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".SCONSIDKLEFT: \n\t" " \n\t" "movq %1, %%rsi \n\t" // i = k_left; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end. " \n\t" // else, we prepare to enter k_left loop. 
" \n\t" " \n\t" ".SLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" "prefetcht0 64 * 4(%%rax) \n\t" " \n\t" "vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t" "vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "addq $1 * 6 * 4, %%rax \n\t" // a += 1*6 (unroll x mr) "addq $1 * 16 * 4, %%rbx \n\t" // b += 1*16 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rbx), %%ymm0 \n\t" "vmovaps -3 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .SLOOPKLEFT \n\t" // iterate again if i != 0. 
" \n\t" " \n\t" " \n\t" ".SPOSTACCUM: \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %4, %%rax \n\t" // load address of alpha "movq %5, %%rbx \n\t" // load address of beta "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha and duplicate "vbroadcastss (%%rbx), %%ymm3 \n\t" // load beta and duplicate " \n\t" "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" "vmulps %%ymm0, %%ymm6, %%ymm6 \n\t" "vmulps %%ymm0, %%ymm7, %%ymm7 \n\t" "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t" "vmulps %%ymm0, %%ymm11, %%ymm11 \n\t" "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" "vmulps %%ymm0, %%ymm14, %%ymm14 \n\t" "vmulps %%ymm0, %%ymm15, %%ymm15 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %8, %%rsi \n\t" // load cs_c "leaq (,%%rsi,4), %%rsi \n\t" // rsi = cs_c * sizeof(float) " \n\t" "leaq (%%rcx,%%rsi,8), %%rdx \n\t" // load address of c + 8*cs_c; " \n\t" "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; "leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c; "leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; " \n\t" " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. "vucomiss %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. 
"jz .SROWSTORED \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" ".SGENSTORED: \n\t" " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm4, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm6, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm8, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm10, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm12, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm14, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 8*cs_c " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm5, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm7, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm9, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm11, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm13, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm15, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" " \n\t" "jmp .SDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".SROWSTORED: \n\t" " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm4 \n\t" "vmovups %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm5 \n\t" "vmovups %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm6 \n\t" "vmovups %%ymm6, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm7 \n\t" "vmovups %%ymm7, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm8 \n\t" "vmovups %%ymm8, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm9 \n\t" "vmovups %%ymm9, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm10 \n\t" "vmovups %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm11 \n\t" "vmovups %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm12 \n\t" "vmovups %%ymm12, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm13 \n\t" "vmovups %%ymm13, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm14 \n\t" "vmovups %%ymm14, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm15 \n\t" "vmovups %%ymm15, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" " \n\t" "jmp .SDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".SBETAZERO: \n\t" " \n\t" "cmpq $4, %%rsi \n\t" // set ZF if (4*cs_c) == 4. 
"jz .SROWSTORBZ \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" ".SGENSTORBZ: \n\t" " \n\t" " \n\t" "vmovaps %%ymm4, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm6, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm8, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm10, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm12, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm14, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 8*cs_c " \n\t" " \n\t" "vmovaps %%ymm5, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm7, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm9, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm11, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm13, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovaps %%ymm15, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" " \n\t" "jmp .SDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".SROWSTORBZ: \n\t" " \n\t" " \n\t" "vmovups %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovups %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" "vmovups %%ymm6, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovups %%ymm7, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovups %%ymm8, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovups %%ymm9, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovups %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovups %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovups %%ymm12, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovups %%ymm13, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovups %%ymm14, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" "vmovups %%ymm15, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".SDONE: \n\t" " \n\t" "vzeroupper \n\t" " \n\t" : // output operands (none) : // input operands "m" (k_iter), // 0 "m" (k_left), // 1 "m" (a), // 2 "m" (b), // 3 "m" (alpha), // 4 "m" (beta), // 5 "m" (c), // 6 "m" (rs_c), // 7 "m" (cs_c)/*, // 8 "m" (b_next), // 9 "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); } /* DGEMM_INPUT_GS_BETA_NZ: asm fragment that gathers four doubles from (%rcx) at byte offsets 0, rsi, 2*rsi and r13 into ymm0, using xmm1 as scratch. The loads for a second group of four elements are commented out. */ #define DGEMM_INPUT_GS_BETA_NZ \ "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /*\ "vmovlpd (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ "vmovhpd (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ "vmovlpd (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ "vmovhpd (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ "vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/ /* DGEMM_OUTPUT_GS_BETA_NZ: asm fragment that scatters the four doubles held in ymm0 to (%rcx) at byte offsets 0, rsi, 2*rsi and r13, using xmm1 as scratch. The stores for a second vector are commented out. */ #define DGEMM_OUTPUT_GS_BETA_NZ \ "vextractf128 $1, 
%%ymm0, %%xmm1 \n\t" \ "vmovlpd %%xmm0, (%%rcx ) \n\t" \ "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /*\ "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ void bli_dgemm_haswell_asm_6x8 ( dim_t k0, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; __asm__ volatile ( " \n\t" "vzeroall \n\t" // zero all xmm/ymm registers. " \n\t" " \n\t" "movq %2, %%rax \n\t" // load address of a. "movq %3, %%rbx \n\t" // load address of b. //"movq %9, %%r15 \n\t" // load address of b_next. 
" \n\t" "addq $32 * 4, %%rbx \n\t" " \n\t" // initialize loop by pre-loading "vmovapd -4 * 32(%%rbx), %%ymm0 \n\t" "vmovapd -3 * 32(%%rbx), %%ymm1 \n\t" " \n\t" "movq %6, %%rcx \n\t" // load address of c "movq %7, %%rdi \n\t" // load rs_c "leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(double) " \n\t" "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*rs_c; "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c + 3*rs_c; "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*rs_c "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*rs_c "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c + 3*rs_c "prefetcht0 7 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 4*rs_c "prefetcht0 7 * 8(%%rdx,%%rdi,2) \n\t" // prefetch c + 5*rs_c " \n\t" " \n\t" " \n\t" " \n\t" "movq %0, %%rsi \n\t" // i = k_iter; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that " \n\t" // contains the k_left loop. " \n\t" " \n\t" ".DLOOPKITER: \n\t" // MAIN LOOP " \n\t" " \n\t" " \n\t" // iteration 0 "prefetcht0 64 * 8(%%rax) \n\t" " \n\t" "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovapd -2 * 32(%%rbx), %%ymm0 \n\t" "vmovapd -1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 
"vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovapd 0 * 32(%%rbx), %%ymm0 \n\t" "vmovapd 1 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 2 "prefetcht0 76 * 8(%%rax) \n\t" " \n\t" "vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovapd 2 * 32(%%rbx), %%ymm0 \n\t" "vmovapd 3 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 3 "vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd 
%%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "addq $4 * 6 * 8, %%rax \n\t" // a += 4*6 (unroll x mr) "addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr) " \n\t" "vmovapd -4 * 32(%%rbx), %%ymm0 \n\t" "vmovapd -3 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .DLOOPKITER \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".DCONSIDKLEFT: \n\t" " \n\t" "movq %1, %%rsi \n\t" // i = k_left; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. " \n\t" // else, we prepare to enter k_left loop. 
" \n\t" " \n\t" ".DLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" "prefetcht0 64 * 8(%%rax) \n\t" " \n\t" "vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t" "vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "addq $1 * 6 * 8, %%rax \n\t" // a += 1*6 (unroll x mr) "addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr) " \n\t" "vmovapd -4 * 32(%%rbx), %%ymm0 \n\t" "vmovapd -3 * 32(%%rbx), %%ymm1 \n\t" " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. 
" \n\t" " \n\t" " \n\t" ".DPOSTACCUM: \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %4, %%rax \n\t" // load address of alpha "movq %5, %%rbx \n\t" // load address of beta "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate " \n\t" "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %8, %%rsi \n\t" // load cs_c "leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(double) " \n\t" "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*cs_c; " \n\t" "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c; //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*cs_c; //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*cs_c; " \n\t" " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. 
"jz .DROWSTORED \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" ".DGENSTORED: \n\t" " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ " \n\t" " \n\t" " \n\t" "jmp .DDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".DROWSTORED: \n\t" " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm4 \n\t" "vmovupd %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm5 \n\t" "vmovupd %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm6 \n\t" "vmovupd %%ymm6, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm7 \n\t" "vmovupd %%ymm7, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm8 \n\t" "vmovupd %%ymm8, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm9 \n\t" "vmovupd %%ymm9, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm10 \n\t" "vmovupd %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm11 \n\t" "vmovupd %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm12 \n\t" "vmovupd %%ymm12, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm13 \n\t" "vmovupd %%ymm13, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm14 \n\t" "vmovupd %%ymm14, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm15 \n\t" "vmovupd %%ymm15, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" " \n\t" "jmp .DDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".DBETAZERO: \n\t" " \n\t" "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. 
"jz .DROWSTORBZ \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" ".DGENSTORBZ: \n\t" " \n\t" " \n\t" "vmovapd %%ymm4, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm6, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm8, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm10, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm12, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm14, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 4*cs_c " \n\t" " \n\t" "vmovapd %%ymm5, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm7, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm9, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm11, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm13, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += rs_c; " \n\t" " \n\t" "vmovapd %%ymm15, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ " \n\t" " \n\t" " \n\t" "jmp .DDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".DROWSTORBZ: \n\t" " \n\t" " \n\t" "vmovupd %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovupd %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" "vmovupd %%ymm6, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovupd %%ymm7, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovupd %%ymm8, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovupd %%ymm9, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovupd %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovupd %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovupd %%ymm12, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovupd %%ymm13, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovupd %%ymm14, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" "vmovupd %%ymm15, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".DDONE: \n\t" " \n\t" "vzeroupper \n\t" " \n\t" : // output operands (none) : // input operands "m" (k_iter), // 0 "m" (k_left), // 1 "m" (a), // 2 "m" (b), // 3 "m" (alpha), // 4 "m" (beta), // 5 "m" (c), // 6 "m" (rs_c), // 7 "m" (cs_c)/*, // 8 "m" (b_next), // 9 "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); } // assumes beta.r, beta.i have been broadcast into ymm1, ymm2. 
// outputs to ymm0
//
// General-stride load-and-scale: gathers four scomplex elements of C from
// rcx, rcx+rsi, rcx+2*rsi and rcx+r13 into ymm0 (the kernel below sets
// rsi = cs_c in bytes and r13 = 3*rsi), then multiplies them by the complex
// scalar beta: the vpermilps copy swaps real/imaginary parts and vaddsubps
// subtracts in even lanes and adds in odd lanes. Clobbers ymm3.
#define CGEMM_INPUT_SCALE_GS_BETA_NZ \
	"vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \
	"vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \
	"vmovlpd (%%rcx,%%rsi,2), %%xmm3, %%xmm3 \n\t" \
	"vmovhpd (%%rcx,%%r13 ), %%xmm3, %%xmm3 \n\t" \
	"vinsertf128 $1, %%xmm3, %%ymm0, %%ymm0 \n\t" \
	"vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \
	"vmulps %%ymm1, %%ymm0, %%ymm0 \n\t" \
	"vmulps %%ymm2, %%ymm3, %%ymm3 \n\t" \
	"vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t"

// assumes values to output are in ymm0
//
// General-stride store: scatters the four scomplex elements of ymm0 to
// rcx, rcx+rsi, rcx+2*rsi and rcx+r13. Clobbers xmm3.
#define CGEMM_OUTPUT_GS \
	"vextractf128 $1, %%ymm0, %%xmm3 \n\t" \
	"vmovlpd %%xmm0, (%%rcx ) \n\t" \
	"vmovhpd %%xmm0, (%%rcx,%%rsi,1) \n\t" \
	"vmovlpd %%xmm3, (%%rcx,%%rsi,2) \n\t" \
	"vmovhpd %%xmm3, (%%rcx,%%r13 ) \n\t"

// Contiguous (row-stored) load-and-scale: loads 32 bytes (four scomplex)
// from (rcx) into ymm0 with an unaligned load and multiplies them by beta
// (beta.r in ymm1, beta.i in ymm2). Clobbers ymm3.
#define CGEMM_INPUT_SCALE_RS_BETA_NZ \
	"vmovups (%%rcx), %%ymm0 \n\t" \
	"vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \
	"vmulps %%ymm1, %%ymm0, %%ymm0 \n\t" \
	"vmulps %%ymm2, %%ymm3, %%ymm3 \n\t" \
	"vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t"

// Contiguous (row-stored) store of ymm0 to (rcx), unaligned.
// NOTE: the definition ends with a line continuation, so the line that
// follows it must stay blank.
#define CGEMM_OUTPUT_RS \
	"vmovups %%ymm0, (%%rcx) \n\t" \

/*
   bli_cgemm_haswell_asm_3x8

   Single-precision complex GEMM micro-kernel implemented with AVX/FMA
   inline assembly. It accumulates a 3x8 tile of scomplex products over k0
   steps, scales the tile by alpha, and writes alpha*(A*B) + beta*C to C.

   Parameters (as used by the code below):
     k0     - number of accumulation steps. It is split into k0 / 4 passes
              of the 4x-unrolled main loop plus k0 % 4 passes of the edge
              loop.
     alpha  - pointer to the complex scalar applied to the accumulated
              product (real part at offset 0, imaginary part at offset 4).
     a      - A operand; three scomplex values (24 bytes) are consumed per
              step.
     b      - B operand; eight scomplex values (64 bytes) are consumed per
              step. It is read with vmovaps, which requires 32-byte
              alignment. The loop always pre-loads the next step's 64
              bytes, so 64 bytes past the last step are read as well.
     beta   - pointer to the complex scalar applied to C (same layout as
              alpha). When both components compare equal to zero, C is
              overwritten without being read. vucomiss also sets ZF for
              unordered operands, so a NaN component is treated as zero by
              that test.
     c      - address of the 3x8 output tile.
     rs_c0  - row stride of C in elements (scaled by 8 bytes in the asm).
     cs_c0  - column stride of C in elements (scaled by 8 bytes). When
              cs_c0 == 1 the contiguous load/store paths are taken,
              otherwise the gather/scatter paths.
     data   - not referenced by the active code (only by the commented-out
              a_next/b_next lookups).
     cntx   - not referenced.

   Accumulator layout: for tile row i (0..2), ymm(4+4i) and ymm(5+4i)
   accumulate a_i.real * b for B elements 0-3 and 4-7, while ymm(6+4i) and
   ymm(7+4i) accumulate a_i.imag * b for the same elements. After the loops
   the imaginary-part accumulators are pair-swapped and combined with
   vaddsubps to form the complex products in ymm4/5, ymm8/9 and ymm12/13.
*/
void bli_cgemm_haswell_asm_3x8
     (
       dim_t               k0,
       scomplex*  restrict alpha,
       scomplex*  restrict a,
       scomplex*  restrict b,
       scomplex*  restrict beta,
       scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*   a_next = bli_auxinfo_next_a( data );
	//void*   b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	__asm__ volatile
	(
	" \n\t"
	"vzeroall \n\t" // zero all xmm/ymm registers.
	" \n\t"
	" \n\t"
	"movq %2, %%rax \n\t" // load address of a.
	"movq %3, %%rbx \n\t" // load address of b.
	//"movq %9, %%r15 \n\t" // load address of b_next.
	" \n\t"
	// rbx is biased by +128 bytes so that the eight 32-byte loads of one
	// unrolled pass can use displacements -4*32 .. 3*32.
	"addq $32 * 4, %%rbx \n\t"
	" \n\t" // initialize loop by pre-loading
	"vmovaps -4 * 32(%%rbx), %%ymm0 \n\t"
	"vmovaps -3 * 32(%%rbx), %%ymm1 \n\t"
	" \n\t"
	"movq %6, %%rcx \n\t" // load address of c
	"movq %7, %%rdi \n\t" // load rs_c
	"leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(scomplex)
	" \n\t"
	"leaq (%%rcx,%%rdi,1), %%r11 \n\t" // r11 = c + 1*rs_c;
	"leaq (%%rcx,%%rdi,2), %%r12 \n\t" // r12 = c + 2*rs_c;
	" \n\t"
	"prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c
	"prefetcht0 7 * 8(%%r11) \n\t" // prefetch c + 1*rs_c
	"prefetcht0 7 * 8(%%r12) \n\t" // prefetch c + 2*rs_c
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	"movq %0, %%rsi \n\t" // i = k_iter;
	"testq %%rsi, %%rsi \n\t" // check i via logical AND.
	"je .CCONSIDKLEFT \n\t" // if i == 0, jump to code that
	" \n\t" // contains the k_left loop.
	" \n\t"
	" \n\t"
	".CLOOPKITER: \n\t" // MAIN LOOP
	" \n\t"
	" \n\t"
	" \n\t" // iteration 0
	"prefetcht0 32 * 8(%%rax) \n\t"
	" \n\t"
	// ymm2 = a0.real, ymm3 = a0.imag (broadcast); ymm0/ymm1 = current B.
	"vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t"
	"vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t"
	"vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t"
	"vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t"
	"vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t"
	"vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t"
	" \n\t"
	"vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t"
	"vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t"
	"vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t"
	"vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t"
	"vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t"
	"vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t"
	" \n\t"
	"vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t"
	"vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t"
	"vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t"
	"vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t"
	"vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t"
	"vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t"
	" \n\t"
	"vmovaps -2 * 32(%%rbx), %%ymm0 \n\t"
	"vmovaps -1 * 32(%%rbx), %%ymm1 \n\t"
	" \n\t"
	" \n\t" // iteration 1
	"vbroadcastss 6 * 4(%%rax), %%ymm2 \n\t"
	"vbroadcastss 7 * 4(%%rax), %%ymm3 \n\t"
	"vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t"
	"vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t"
	"vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t"
	"vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t"
	" \n\t"
	"vbroadcastss 8 * 4(%%rax), %%ymm2 \n\t"
	"vbroadcastss 9 * 4(%%rax), %%ymm3 \n\t"
	"vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t"
	"vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t"
	"vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t"
	"vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t"
	" \n\t"
	"vbroadcastss 10 * 4(%%rax), %%ymm2 \n\t"
	"vbroadcastss 11 * 4(%%rax), %%ymm3 \n\t"
	"vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t"
	"vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t"
	"vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t"
	"vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t"
	" \n\t"
	"vmovaps 0 * 32(%%rbx), %%ymm0 \n\t"
	"vmovaps 1 * 32(%%rbx), %%ymm1 \n\t"
	" \n\t"
	" \n\t" // iteration 2
	"prefetcht0 38 * 8(%%rax) \n\t"
	" \n\t"
	"vbroadcastss 12 * 4(%%rax), %%ymm2 \n\t"
	"vbroadcastss 13 * 4(%%rax), %%ymm3 \n\t"
	"vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t"
	"vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t"
	"vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t"
	"vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t"
	" \n\t"
	"vbroadcastss 14 * 4(%%rax), %%ymm2 \n\t"
	"vbroadcastss 15 * 4(%%rax), %%ymm3 \n\t"
	"vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t"
	"vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t"
	"vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t"
	"vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t"
	" \n\t"
	"vbroadcastss 16 * 4(%%rax), %%ymm2 \n\t"
	"vbroadcastss 17 * 4(%%rax), %%ymm3 \n\t"
	"vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t"
	"vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t"
	"vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t"
	"vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t"
	" \n\t"
	"vmovaps 2 * 32(%%rbx), %%ymm0 \n\t"
	"vmovaps 3 * 32(%%rbx), %%ymm1 \n\t"
	" \n\t"
	" \n\t" // iteration 3
	"vbroadcastss 18 * 4(%%rax), %%ymm2 \n\t"
	"vbroadcastss 19 * 4(%%rax), %%ymm3 \n\t"
	"vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t"
	"vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t"
	"vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t"
	"vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t"
	" \n\t"
	"vbroadcastss 20 * 4(%%rax), %%ymm2 \n\t"
	"vbroadcastss 21 * 4(%%rax), %%ymm3 \n\t"
	"vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t"
	"vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t"
	"vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t"
	"vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t"
	" \n\t"
	"vbroadcastss 22 * 4(%%rax), %%ymm2 \n\t"
	"vbroadcastss 23 * 4(%%rax), %%ymm3 \n\t"
	"vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t"
	"vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t"
	"vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t"
	"vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t"
	" \n\t"
	"addq $4 * 3 * 8, %%rax \n\t" // a += 4*3 (unroll x mr)
	"addq $4 * 8 * 8, %%rbx \n\t" // b += 4*8 (unroll x nr)
	" \n\t"
	// pre-load B for the next pass (or for the edge loop).
	"vmovaps -4 * 32(%%rbx), %%ymm0 \n\t"
	"vmovaps -3 * 32(%%rbx), %%ymm1 \n\t"
	" \n\t"
	" \n\t"
	"decq %%rsi \n\t" // i -= 1;
	"jne .CLOOPKITER \n\t" // iterate again if i != 0.
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	".CCONSIDKLEFT: \n\t"
	" \n\t"
	"movq %1, %%rsi \n\t" // i = k_left;
	"testq %%rsi, %%rsi \n\t" // check i via logical AND.
	"je .CPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
	" \n\t" // else, we prepare to enter k_left loop.
	" \n\t"
	" \n\t"
	".CLOOPKLEFT: \n\t" // EDGE LOOP
	" \n\t"
	"prefetcht0 32 * 8(%%rax) \n\t"
	" \n\t"
	"vbroadcastss 0 * 4(%%rax), %%ymm2 \n\t"
	"vbroadcastss 1 * 4(%%rax), %%ymm3 \n\t"
	"vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t"
	"vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t"
	"vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t"
	"vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t"
	" \n\t"
	"vbroadcastss 2 * 4(%%rax), %%ymm2 \n\t"
	"vbroadcastss 3 * 4(%%rax), %%ymm3 \n\t"
	"vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t"
	"vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t"
	"vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t"
	"vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t"
	" \n\t"
	"vbroadcastss 4 * 4(%%rax), %%ymm2 \n\t"
	"vbroadcastss 5 * 4(%%rax), %%ymm3 \n\t"
	"vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t"
	"vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t"
	"vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t"
	"vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t"
	" \n\t"
	"addq $1 * 3 * 8, %%rax \n\t" // a += 1*3 (unroll x mr)
	"addq $1 * 8 * 8, %%rbx \n\t" // b += 1*8 (unroll x nr)
	" \n\t"
	"vmovaps -4 * 32(%%rbx), %%ymm0 \n\t"
	"vmovaps -3 * 32(%%rbx), %%ymm1 \n\t"
	" \n\t"
	" \n\t"
	"decq %%rsi \n\t" // i -= 1;
	"jne .CLOOPKLEFT \n\t" // iterate again if i != 0.
	" \n\t"
	" \n\t"
	" \n\t"
	".CPOSTACCUM: \n\t"
	" \n\t"
	" \n\t"
	" \n\t" // permute even and odd elements
	" \n\t" // of ymm6/7, ymm10/11, ymm14/15
	"vpermilps $0xb1, %%ymm6, %%ymm6 \n\t"
	"vpermilps $0xb1, %%ymm7, %%ymm7 \n\t"
	"vpermilps $0xb1, %%ymm10, %%ymm10 \n\t"
	"vpermilps $0xb1, %%ymm11, %%ymm11 \n\t"
	"vpermilps $0xb1, %%ymm14, %%ymm14 \n\t"
	"vpermilps $0xb1, %%ymm15, %%ymm15 \n\t"
	" \n\t"
	" \n\t"
	" \n\t" // subtract/add even/odd elements
	"vaddsubps %%ymm6, %%ymm4, %%ymm4 \n\t"
	"vaddsubps %%ymm7, %%ymm5, %%ymm5 \n\t"
	" \n\t"
	"vaddsubps %%ymm10, %%ymm8, %%ymm8 \n\t"
	"vaddsubps %%ymm11, %%ymm9, %%ymm9 \n\t"
	" \n\t"
	"vaddsubps %%ymm14, %%ymm12, %%ymm12 \n\t"
	"vaddsubps %%ymm15, %%ymm13, %%ymm13 \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	"movq %4, %%rax \n\t" // load address of alpha
	"vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha_r and duplicate
	"vbroadcastss 4(%%rax), %%ymm1 \n\t" // load alpha_i and duplicate
	" \n\t"
	" \n\t"
	// scale each result register by alpha (complex multiply via
	// pair-swap, two real multiplies and vaddsubps).
	"vpermilps $0xb1, %%ymm4, %%ymm3 \n\t"
	"vmulps %%ymm0, %%ymm4, %%ymm4 \n\t"
	"vmulps %%ymm1, %%ymm3, %%ymm3 \n\t"
	"vaddsubps %%ymm3, %%ymm4, %%ymm4 \n\t"
	" \n\t"
	"vpermilps $0xb1, %%ymm5, %%ymm3 \n\t"
	"vmulps %%ymm0, %%ymm5, %%ymm5 \n\t"
	"vmulps %%ymm1, %%ymm3, %%ymm3 \n\t"
	"vaddsubps %%ymm3, %%ymm5, %%ymm5 \n\t"
	" \n\t"
	" \n\t"
	"vpermilps $0xb1, %%ymm8, %%ymm3 \n\t"
	"vmulps %%ymm0, %%ymm8, %%ymm8 \n\t"
	"vmulps %%ymm1, %%ymm3, %%ymm3 \n\t"
	"vaddsubps %%ymm3, %%ymm8, %%ymm8 \n\t"
	" \n\t"
	"vpermilps $0xb1, %%ymm9, %%ymm3 \n\t"
	"vmulps %%ymm0, %%ymm9, %%ymm9 \n\t"
	"vmulps %%ymm1, %%ymm3, %%ymm3 \n\t"
	"vaddsubps %%ymm3, %%ymm9, %%ymm9 \n\t"
	" \n\t"
	" \n\t"
	"vpermilps $0xb1, %%ymm12, %%ymm3 \n\t"
	"vmulps %%ymm0, %%ymm12, %%ymm12 \n\t"
	"vmulps %%ymm1, %%ymm3, %%ymm3 \n\t"
	"vaddsubps %%ymm3, %%ymm12, %%ymm12 \n\t"
	" \n\t"
	"vpermilps $0xb1, %%ymm13, %%ymm3 \n\t"
	"vmulps %%ymm0, %%ymm13, %%ymm13 \n\t"
	"vmulps %%ymm1, %%ymm3, %%ymm3 \n\t"
	"vaddsubps %%ymm3, %%ymm13, %%ymm13 \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	"movq %5, %%rbx \n\t" // load address of beta
	"vbroadcastss (%%rbx), %%ymm1 \n\t" // load beta_r and duplicate
	"vbroadcastss 4(%%rbx), %%ymm2 \n\t" // load beta_i and duplicate
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	"movq %8, %%rsi \n\t" // load cs_c
	"leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(scomplex)
	"leaq (,%%rsi,4), %%rdx \n\t" // rdx = 4*cs_c;
	"leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*cs_c;
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t" // now avoid loading C if beta == 0
	"vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
	"vucomiss %%xmm0, %%xmm1 \n\t" // set ZF if beta_r == 0.
	"sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 );
	"vucomiss %%xmm0, %%xmm2 \n\t" // set ZF if beta_i == 0.
	"sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 );
	"andb %%r8b, %%r9b \n\t" // r9b &= r8b; ZF is cleared only if both were 1.
	"jne .CBETAZERO \n\t" // if ZF == 0 (beta_r and beta_i both zero), jump to beta == 0 case
	" \n\t"
	" \n\t"
	"cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8.
	"jz .CROWSTORED \n\t" // jump to row storage case
	" \n\t"
	" \n\t"
	" \n\t"
	".CGENSTORED: \n\t"
	" \n\t"
	" \n\t"
	CGEMM_INPUT_SCALE_GS_BETA_NZ
	"vaddps %%ymm4, %%ymm0, %%ymm0 \n\t"
	CGEMM_OUTPUT_GS
	"addq %%rdx, %%rcx \n\t" // c += 4*cs_c;
	" \n\t"
	" \n\t"
	CGEMM_INPUT_SCALE_GS_BETA_NZ
	"vaddps %%ymm5, %%ymm0, %%ymm0 \n\t"
	CGEMM_OUTPUT_GS
	"movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c
	" \n\t"
	" \n\t"
	" \n\t"
	CGEMM_INPUT_SCALE_GS_BETA_NZ
	"vaddps %%ymm8, %%ymm0, %%ymm0 \n\t"
	CGEMM_OUTPUT_GS
	"addq %%rdx, %%rcx \n\t" // c += 4*cs_c;
	" \n\t"
	" \n\t"
	CGEMM_INPUT_SCALE_GS_BETA_NZ
	"vaddps %%ymm9, %%ymm0, %%ymm0 \n\t"
	CGEMM_OUTPUT_GS
	"movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c
	" \n\t"
	" \n\t"
	" \n\t"
	CGEMM_INPUT_SCALE_GS_BETA_NZ
	"vaddps %%ymm12, %%ymm0, %%ymm0 \n\t"
	CGEMM_OUTPUT_GS
	"addq %%rdx, %%rcx \n\t" // c += 4*cs_c;
	" \n\t"
	" \n\t"
	CGEMM_INPUT_SCALE_GS_BETA_NZ
	"vaddps %%ymm13, %%ymm0, %%ymm0 \n\t"
	CGEMM_OUTPUT_GS
	" \n\t"
	" \n\t"
	" \n\t"
	"jmp .CDONE \n\t" // jump to end.
	" \n\t"
	" \n\t"
	" \n\t"
	".CROWSTORED: \n\t"
	" \n\t"
	" \n\t"
	CGEMM_INPUT_SCALE_RS_BETA_NZ
	"vaddps %%ymm4, %%ymm0, %%ymm0 \n\t"
	CGEMM_OUTPUT_RS
	"addq %%rdx, %%rcx \n\t" // c += 4*cs_c;
	" \n\t"
	" \n\t"
	CGEMM_INPUT_SCALE_RS_BETA_NZ
	"vaddps %%ymm5, %%ymm0, %%ymm0 \n\t"
	CGEMM_OUTPUT_RS
	"movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c
	" \n\t"
	" \n\t"
	" \n\t"
	CGEMM_INPUT_SCALE_RS_BETA_NZ
	"vaddps %%ymm8, %%ymm0, %%ymm0 \n\t"
	CGEMM_OUTPUT_RS
	"addq %%rdx, %%rcx \n\t" // c += 4*cs_c;
	" \n\t"
	" \n\t"
	CGEMM_INPUT_SCALE_RS_BETA_NZ
	"vaddps %%ymm9, %%ymm0, %%ymm0 \n\t"
	CGEMM_OUTPUT_RS
	"movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c
	" \n\t"
	" \n\t"
	" \n\t"
	CGEMM_INPUT_SCALE_RS_BETA_NZ
	"vaddps %%ymm12, %%ymm0, %%ymm0 \n\t"
	CGEMM_OUTPUT_RS
	"addq %%rdx, %%rcx \n\t" // c += 4*cs_c;
	" \n\t"
	" \n\t"
	CGEMM_INPUT_SCALE_RS_BETA_NZ
	"vaddps %%ymm13, %%ymm0, %%ymm0 \n\t"
	CGEMM_OUTPUT_RS
	" \n\t"
	" \n\t"
	" \n\t"
	"jmp .CDONE \n\t" // jump to end.
	" \n\t"
	" \n\t"
	" \n\t"
	".CBETAZERO: \n\t"
	" \n\t"
	"cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8.
	"jz .CROWSTORBZ \n\t" // jump to row storage case
	" \n\t"
	" \n\t"
	" \n\t"
	".CGENSTORBZ: \n\t"
	" \n\t"
	" \n\t"
	"vmovaps %%ymm4, %%ymm0 \n\t"
	CGEMM_OUTPUT_GS
	"addq %%rdx, %%rcx \n\t" // c += 4*cs_c;
	" \n\t"
	" \n\t"
	"vmovaps %%ymm5, %%ymm0 \n\t"
	CGEMM_OUTPUT_GS
	"movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c
	" \n\t"
	" \n\t"
	" \n\t"
	"vmovaps %%ymm8, %%ymm0 \n\t"
	CGEMM_OUTPUT_GS
	"addq %%rdx, %%rcx \n\t" // c += 4*cs_c;
	" \n\t"
	" \n\t"
	"vmovaps %%ymm9, %%ymm0 \n\t"
	CGEMM_OUTPUT_GS
	"movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c
	" \n\t"
	" \n\t"
	" \n\t"
	"vmovaps %%ymm12, %%ymm0 \n\t"
	CGEMM_OUTPUT_GS
	"addq %%rdx, %%rcx \n\t" // c += 4*cs_c;
	" \n\t"
	" \n\t"
	"vmovaps %%ymm13, %%ymm0 \n\t"
	CGEMM_OUTPUT_GS
	" \n\t"
	" \n\t"
	" \n\t"
	"jmp .CDONE \n\t" // jump to end.
	" \n\t"
	" \n\t"
	" \n\t"
	".CROWSTORBZ: \n\t"
	" \n\t"
	" \n\t"
	// beta == 0 and cs_c == 1: store each row as two contiguous 32-byte
	// halves (rdx = 4*cs_c bytes = 32 here).
	"vmovups %%ymm4, (%%rcx) \n\t"
	"vmovups %%ymm5, (%%rcx,%%rdx,1) \n\t"
	" \n\t"
	"vmovups %%ymm8, (%%r11) \n\t"
	"vmovups %%ymm9, (%%r11,%%rdx,1) \n\t"
	" \n\t"
	"vmovups %%ymm12, (%%r12) \n\t"
	"vmovups %%ymm13, (%%r12,%%rdx,1) \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	".CDONE: \n\t"
	" \n\t"
	"vzeroupper \n\t"
	" \n\t"

	: // output operands (none)
	: // input operands
	  "m" (k_iter), // 0
	  "m" (k_left), // 1
	  "m" (a),      // 2
	  "m" (b),      // 3
	  "m" (alpha),  // 4
	  "m" (beta),   // 5
	  "m" (c),      // 6
	  "m" (rs_c),   // 7
	  "m" (cs_c)/*, // 8
	  "m" (b_next), // 9
	  "m" (a_next)*/ // 10
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	);
}

// assumes beta.r, beta.i have been broadcast into ymm1, ymm2.
// outputs to ymm0
//
// General-stride load-and-scale: loads two dcomplex elements of C from rcx
// and rcx+rsi into ymm0 (the kernel below sets rsi = cs_c in bytes), then
// multiplies them by the complex scalar beta: the vpermilpd copy swaps
// real/imaginary parts and vaddsubpd subtracts in even lanes and adds in
// odd lanes. Clobbers ymm3.
#define ZGEMM_INPUT_SCALE_GS_BETA_NZ \
	"vmovupd (%%rcx), %%xmm0 \n\t" \
	"vmovupd (%%rcx,%%rsi), %%xmm3 \n\t" \
	"vinsertf128 $1, %%xmm3, %%ymm0, %%ymm0 \n\t" \
	"vpermilpd $0x5, %%ymm0, %%ymm3 \n\t" \
	"vmulpd %%ymm1, %%ymm0, %%ymm0 \n\t" \
	"vmulpd %%ymm2, %%ymm3, %%ymm3 \n\t" \
	"vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t"

// assumes values to output are in ymm0
//
// General-stride store: writes the two dcomplex elements of ymm0 to rcx
// and rcx+rsi. Clobbers xmm3.
// NOTE: the definition ends with a line continuation, so the line that
// follows it must stay blank.
#define ZGEMM_OUTPUT_GS \
	"vextractf128 $1, %%ymm0, %%xmm3 \n\t" \
	"vmovupd %%xmm0, (%%rcx) \n\t" \
	"vmovupd %%xmm3, (%%rcx,%%rsi ) \n\t" \

// Contiguous (row-stored) load-and-scale: loads 32 bytes (two dcomplex)
// from (rcx) into ymm0 with an unaligned load and multiplies them by beta
// (beta.r in ymm1, beta.i in ymm2). Clobbers ymm3.
#define ZGEMM_INPUT_SCALE_RS_BETA_NZ \
	"vmovupd (%%rcx), %%ymm0 \n\t" \
	"vpermilpd $0x5, %%ymm0, %%ymm3 \n\t" \
	"vmulpd %%ymm1, %%ymm0, %%ymm0 \n\t" \
	"vmulpd %%ymm2, %%ymm3, %%ymm3 \n\t" \
	"vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t"

// Contiguous (row-stored) store of ymm0 to (rcx), unaligned.
// NOTE: the definition ends with a line continuation, so the line that
// follows it must stay blank.
#define ZGEMM_OUTPUT_RS \
	"vmovupd %%ymm0, (%%rcx) \n\t" \

/*
   bli_zgemm_haswell_asm_3x4

   Double-precision complex GEMM micro-kernel implemented with AVX/FMA
   inline assembly. It accumulates a 3x4 tile of dcomplex products over k0
   steps, scales the tile by alpha, and writes alpha*(A*B) + beta*C to C.

   Parameters (as used by the code below):
     k0     - number of accumulation steps. It is split into k0 / 4 passes
              of the 4x-unrolled main loop plus k0 % 4 passes of the edge
              loop.
     alpha  - pointer to the complex scalar applied to the accumulated
              product (real part at offset 0, imaginary part at offset 8).
     a      - A operand; three dcomplex values (48 bytes) are consumed per
              step.
     b      - B operand; four dcomplex values (64 bytes) are consumed per
              step. It is read with vmovapd, which requires 32-byte
              alignment. The loop always pre-loads the next step's 64
              bytes, so 64 bytes past the last step are read as well.
     beta   - pointer to the complex scalar applied to C (same layout as
              alpha). When both components compare equal to zero, C is
              overwritten without being read. vucomisd also sets ZF for
              unordered operands, so a NaN component is treated as zero by
              that test.
     c      - address of the 3x4 output tile.
     rs_c0  - row stride of C in elements (scaled by 16 bytes in the asm).
     cs_c0  - column stride of C in elements (scaled by 16 bytes). When
              cs_c0 == 1 the contiguous load/store paths are taken,
              otherwise the general-stride paths.
     data   - not referenced by the active code (only by the commented-out
              a_next/b_next lookups).
     cntx   - not referenced.

   Accumulator layout: for tile row i (0..2), ymm(4+4i) and ymm(5+4i)
   accumulate a_i.real * b for B elements 0-1 and 2-3, while ymm(6+4i) and
   ymm(7+4i) accumulate a_i.imag * b for the same elements. After the loops
   the imaginary-part accumulators are pair-swapped and combined with
   vaddsubpd to form the complex products in ymm4/5, ymm8/9 and ymm12/13.
*/
void bli_zgemm_haswell_asm_3x4
     (
       dim_t               k0,
       dcomplex*  restrict alpha,
       dcomplex*  restrict a,
       dcomplex*  restrict b,
       dcomplex*  restrict beta,
       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*   a_next = bli_auxinfo_next_a( data );
	//void*   b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	__asm__ volatile
	(
	" \n\t"
	"vzeroall \n\t" // zero all xmm/ymm registers.
	" \n\t"
	" \n\t"
	"movq %2, %%rax \n\t" // load address of a.
	"movq %3, %%rbx \n\t" // load address of b.
	//"movq %9, %%r15 \n\t" // load address of b_next.
	" \n\t"
	// rbx is biased by +128 bytes so that the eight 32-byte loads of one
	// unrolled pass can use displacements -4*32 .. 3*32.
	"addq $32 * 4, %%rbx \n\t"
	" \n\t" // initialize loop by pre-loading
	"vmovapd -4 * 32(%%rbx), %%ymm0 \n\t"
	"vmovapd -3 * 32(%%rbx), %%ymm1 \n\t"
	" \n\t"
	"movq %6, %%rcx \n\t" // load address of c
	"movq %7, %%rdi \n\t" // load rs_c
	"leaq (,%%rdi,8), %%rdi \n\t" // rs_c *= sizeof(dcomplex)
	"leaq (,%%rdi,2), %%rdi \n\t" // (8 * 2 = 16 bytes, done in two steps)
	" \n\t"
	"leaq (%%rcx,%%rdi,1), %%r11 \n\t" // r11 = c + 1*rs_c;
	"leaq (%%rcx,%%rdi,2), %%r12 \n\t" // r12 = c + 2*rs_c;
	" \n\t"
	"prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*rs_c
	"prefetcht0 7 * 8(%%r11) \n\t" // prefetch c + 1*rs_c
	"prefetcht0 7 * 8(%%r12) \n\t" // prefetch c + 2*rs_c
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	"movq %0, %%rsi \n\t" // i = k_iter;
	"testq %%rsi, %%rsi \n\t" // check i via logical AND.
	"je .ZCONSIDKLEFT \n\t" // if i == 0, jump to code that
	" \n\t" // contains the k_left loop.
	" \n\t"
	" \n\t"
	".ZLOOPKITER: \n\t" // MAIN LOOP
	" \n\t"
	" \n\t"
	" \n\t" // iteration 0
	"prefetcht0 32 * 16(%%rax) \n\t"
	" \n\t"
	// ymm2 = a0.real, ymm3 = a0.imag (broadcast); ymm0/ymm1 = current B.
	"vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t"
	"vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t"
	"vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t"
	"vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t"
	"vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t"
	"vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t"
	" \n\t"
	"vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t"
	"vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t"
	"vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t"
	"vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t"
	"vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t"
	"vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t"
	" \n\t"
	"vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t"
	"vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t"
	"vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t"
	"vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t"
	"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
	"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
	" \n\t"
	"vmovapd -2 * 32(%%rbx), %%ymm0 \n\t"
	"vmovapd -1 * 32(%%rbx), %%ymm1 \n\t"
	" \n\t"
	" \n\t" // iteration 1
	"vbroadcastsd 6 * 8(%%rax), %%ymm2 \n\t"
	"vbroadcastsd 7 * 8(%%rax), %%ymm3 \n\t"
	"vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t"
	"vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t"
	"vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t"
	"vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t"
	" \n\t"
	"vbroadcastsd 8 * 8(%%rax), %%ymm2 \n\t"
	"vbroadcastsd 9 * 8(%%rax), %%ymm3 \n\t"
	"vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t"
	"vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t"
	"vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t"
	"vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t"
	" \n\t"
	"vbroadcastsd 10 * 8(%%rax), %%ymm2 \n\t"
	"vbroadcastsd 11 * 8(%%rax), %%ymm3 \n\t"
	"vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t"
	"vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t"
	"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
	"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
	" \n\t"
	"vmovapd 0 * 32(%%rbx), %%ymm0 \n\t"
	"vmovapd 1 * 32(%%rbx), %%ymm1 \n\t"
	" \n\t"
	" \n\t" // iteration 2
	"prefetcht0 38 * 16(%%rax) \n\t"
	" \n\t"
	"vbroadcastsd 12 * 8(%%rax), %%ymm2 \n\t"
	"vbroadcastsd 13 * 8(%%rax), %%ymm3 \n\t"
	"vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t"
	"vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t"
	"vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t"
	"vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t"
	" \n\t"
	"vbroadcastsd 14 * 8(%%rax), %%ymm2 \n\t"
	"vbroadcastsd 15 * 8(%%rax), %%ymm3 \n\t"
	"vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t"
	"vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t"
	"vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t"
	"vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t"
	" \n\t"
	"vbroadcastsd 16 * 8(%%rax), %%ymm2 \n\t"
	"vbroadcastsd 17 * 8(%%rax), %%ymm3 \n\t"
	"vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t"
	"vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t"
	"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
	"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
	" \n\t"
	"vmovapd 2 * 32(%%rbx), %%ymm0 \n\t"
	"vmovapd 3 * 32(%%rbx), %%ymm1 \n\t"
	" \n\t"
	" \n\t" // iteration 3
	"vbroadcastsd 18 * 8(%%rax), %%ymm2 \n\t"
	"vbroadcastsd 19 * 8(%%rax), %%ymm3 \n\t"
	"vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t"
	"vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t"
	"vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t"
	"vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t"
	" \n\t"
	"vbroadcastsd 20 * 8(%%rax), %%ymm2 \n\t"
	"vbroadcastsd 21 * 8(%%rax), %%ymm3 \n\t"
	"vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t"
	"vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t"
	"vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t"
	"vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t"
	" \n\t"
	"vbroadcastsd 22 * 8(%%rax), %%ymm2 \n\t"
	"vbroadcastsd 23 * 8(%%rax), %%ymm3 \n\t"
	"vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t"
	"vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t"
	"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
	"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
	" \n\t"
	"addq $4 * 3 * 16, %%rax \n\t" // a += 4*3 (unroll x mr)
	"addq $4 * 4 * 16, %%rbx \n\t" // b += 4*4 (unroll x nr)
	" \n\t"
	// pre-load B for the next pass (or for the edge loop).
	"vmovapd -4 * 32(%%rbx), %%ymm0 \n\t"
	"vmovapd -3 * 32(%%rbx), %%ymm1 \n\t"
	" \n\t"
	" \n\t"
	"decq %%rsi \n\t" // i -= 1;
	"jne .ZLOOPKITER \n\t" // iterate again if i != 0.
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	".ZCONSIDKLEFT: \n\t"
	" \n\t"
	"movq %1, %%rsi \n\t" // i = k_left;
	"testq %%rsi, %%rsi \n\t" // check i via logical AND.
	"je .ZPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
	" \n\t" // else, we prepare to enter k_left loop.
	" \n\t"
	" \n\t"
	".ZLOOPKLEFT: \n\t" // EDGE LOOP
	" \n\t"
	"prefetcht0 32 * 16(%%rax) \n\t"
	" \n\t"
	"vbroadcastsd 0 * 8(%%rax), %%ymm2 \n\t"
	"vbroadcastsd 1 * 8(%%rax), %%ymm3 \n\t"
	"vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t"
	"vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t"
	"vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t"
	"vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t"
	" \n\t"
	"vbroadcastsd 2 * 8(%%rax), %%ymm2 \n\t"
	"vbroadcastsd 3 * 8(%%rax), %%ymm3 \n\t"
	"vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t"
	"vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t"
	"vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t"
	"vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t"
	" \n\t"
	"vbroadcastsd 4 * 8(%%rax), %%ymm2 \n\t"
	"vbroadcastsd 5 * 8(%%rax), %%ymm3 \n\t"
	"vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t"
	"vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t"
	"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
	"vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t"
	" \n\t"
	"addq $1 * 3 * 16, %%rax \n\t" // a += 1*3 (unroll x mr)
	"addq $1 * 4 * 16, %%rbx \n\t" // b += 1*4 (unroll x nr)
	" \n\t"
	"vmovapd -4 * 32(%%rbx), %%ymm0 \n\t"
	"vmovapd -3 * 32(%%rbx), %%ymm1 \n\t"
	" \n\t"
	" \n\t"
	"decq %%rsi \n\t" // i -= 1;
	"jne .ZLOOPKLEFT \n\t" // iterate again if i != 0.
	" \n\t"
	" \n\t"
	" \n\t"
	".ZPOSTACCUM: \n\t"
	" \n\t"
	" \n\t" // permute even and odd elements
	" \n\t" // of ymm6/7, ymm10/11, ymm14/15
	"vpermilpd $0x5, %%ymm6, %%ymm6 \n\t"
	"vpermilpd $0x5, %%ymm7, %%ymm7 \n\t"
	"vpermilpd $0x5, %%ymm10, %%ymm10 \n\t"
	"vpermilpd $0x5, %%ymm11, %%ymm11 \n\t"
	"vpermilpd $0x5, %%ymm14, %%ymm14 \n\t"
	"vpermilpd $0x5, %%ymm15, %%ymm15 \n\t"
	" \n\t"
	" \n\t"
	" \n\t" // subtract/add even/odd elements
	"vaddsubpd %%ymm6, %%ymm4, %%ymm4 \n\t"
	"vaddsubpd %%ymm7, %%ymm5, %%ymm5 \n\t"
	" \n\t"
	"vaddsubpd %%ymm10, %%ymm8, %%ymm8 \n\t"
	"vaddsubpd %%ymm11, %%ymm9, %%ymm9 \n\t"
	" \n\t"
	"vaddsubpd %%ymm14, %%ymm12, %%ymm12 \n\t"
	"vaddsubpd %%ymm15, %%ymm13, %%ymm13 \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	"movq %4, %%rax \n\t" // load address of alpha
	"vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha_r and duplicate
	"vbroadcastsd 8(%%rax), %%ymm1 \n\t" // load alpha_i and duplicate
	" \n\t"
	" \n\t"
	// scale each result register by alpha (complex multiply via
	// pair-swap, two real multiplies and vaddsubpd).
	"vpermilpd $0x5, %%ymm4, %%ymm3 \n\t"
	"vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t"
	"vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t"
	"vaddsubpd %%ymm3, %%ymm4, %%ymm4 \n\t"
	" \n\t"
	"vpermilpd $0x5, %%ymm5, %%ymm3 \n\t"
	"vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t"
	"vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t"
	"vaddsubpd %%ymm3, %%ymm5, %%ymm5 \n\t"
	" \n\t"
	" \n\t"
	"vpermilpd $0x5, %%ymm8, %%ymm3 \n\t"
	"vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t"
	"vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t"
	"vaddsubpd %%ymm3, %%ymm8, %%ymm8 \n\t"
	" \n\t"
	"vpermilpd $0x5, %%ymm9, %%ymm3 \n\t"
	"vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t"
	"vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t"
	"vaddsubpd %%ymm3, %%ymm9, %%ymm9 \n\t"
	" \n\t"
	" \n\t"
	"vpermilpd $0x5, %%ymm12, %%ymm3 \n\t"
	"vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t"
	"vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t"
	"vaddsubpd %%ymm3, %%ymm12, %%ymm12 \n\t"
	" \n\t"
	"vpermilpd $0x5, %%ymm13, %%ymm3 \n\t"
	"vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t"
	"vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t"
	"vaddsubpd %%ymm3, %%ymm13, %%ymm13 \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	"movq %5, %%rbx \n\t" // load address of beta
	"vbroadcastsd (%%rbx), %%ymm1 \n\t" // load beta_r and duplicate
	"vbroadcastsd 8(%%rbx), %%ymm2 \n\t" // load beta_i and duplicate
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	"movq %8, %%rsi \n\t" // load cs_c
	"leaq (,%%rsi,8), %%rsi \n\t" // rsi = cs_c * sizeof(dcomplex)
	"leaq (,%%rsi,2), %%rsi \n\t" // (8 * 2 = 16 bytes, done in two steps)
	"leaq (,%%rsi,2), %%rdx \n\t" // rdx = 2*cs_c;
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t" // now avoid loading C if beta == 0
	"vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
	"vucomisd %%xmm0, %%xmm1 \n\t" // set ZF if beta_r == 0.
	"sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 );
	"vucomisd %%xmm0, %%xmm2 \n\t" // set ZF if beta_i == 0.
	"sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 );
	"andb %%r8b, %%r9b \n\t" // r9b &= r8b; ZF is cleared only if both were 1.
	"jne .ZBETAZERO \n\t" // if ZF == 0 (beta_r and beta_i both zero), jump to beta == 0 case
	" \n\t"
	" \n\t"
	"cmpq $16, %%rsi \n\t" // set ZF if (16*cs_c) == 16.
	"jz .ZROWSTORED \n\t" // jump to row storage case
	" \n\t"
	" \n\t"
	" \n\t"
	".ZGENSTORED: \n\t"
	" \n\t"
	" \n\t"
	ZGEMM_INPUT_SCALE_GS_BETA_NZ
	"vaddpd %%ymm4, %%ymm0, %%ymm0 \n\t"
	ZGEMM_OUTPUT_GS
	"addq %%rdx, %%rcx \n\t" // c += 2*cs_c;
	" \n\t"
	" \n\t"
	ZGEMM_INPUT_SCALE_GS_BETA_NZ
	"vaddpd %%ymm5, %%ymm0, %%ymm0 \n\t"
	ZGEMM_OUTPUT_GS
	"movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c
	" \n\t"
	" \n\t"
	" \n\t"
	ZGEMM_INPUT_SCALE_GS_BETA_NZ
	"vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t"
	ZGEMM_OUTPUT_GS
	"addq %%rdx, %%rcx \n\t" // c += 2*cs_c;
	" \n\t"
	" \n\t"
	ZGEMM_INPUT_SCALE_GS_BETA_NZ
	"vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t"
	ZGEMM_OUTPUT_GS
	"movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c
	" \n\t"
	" \n\t"
	" \n\t"
	ZGEMM_INPUT_SCALE_GS_BETA_NZ
	"vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t"
	ZGEMM_OUTPUT_GS
	"addq %%rdx, %%rcx \n\t" // c += 2*cs_c;
	" \n\t"
	" \n\t"
	ZGEMM_INPUT_SCALE_GS_BETA_NZ
	"vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t"
	ZGEMM_OUTPUT_GS
	" \n\t"
	" \n\t"
	" \n\t"
	"jmp .ZDONE \n\t" // jump to end.
	" \n\t"
	" \n\t"
	" \n\t"
	".ZROWSTORED: \n\t"
	" \n\t"
	" \n\t"
	ZGEMM_INPUT_SCALE_RS_BETA_NZ
	"vaddpd %%ymm4, %%ymm0, %%ymm0 \n\t"
	ZGEMM_OUTPUT_RS
	"addq %%rdx, %%rcx \n\t" // c += 2*cs_c;
	" \n\t"
	" \n\t"
	ZGEMM_INPUT_SCALE_RS_BETA_NZ
	"vaddpd %%ymm5, %%ymm0, %%ymm0 \n\t"
	ZGEMM_OUTPUT_RS
	"movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c
	" \n\t"
	" \n\t"
	" \n\t"
	ZGEMM_INPUT_SCALE_RS_BETA_NZ
	"vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t"
	ZGEMM_OUTPUT_RS
	"addq %%rdx, %%rcx \n\t" // c += 2*cs_c;
	" \n\t"
	" \n\t"
	ZGEMM_INPUT_SCALE_RS_BETA_NZ
	"vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t"
	ZGEMM_OUTPUT_RS
	"movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c
	" \n\t"
	" \n\t"
	" \n\t"
	ZGEMM_INPUT_SCALE_RS_BETA_NZ
	"vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t"
	ZGEMM_OUTPUT_RS
	"addq %%rdx, %%rcx \n\t" // c += 2*cs_c;
	" \n\t"
	" \n\t"
	ZGEMM_INPUT_SCALE_RS_BETA_NZ
	"vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t"
	ZGEMM_OUTPUT_RS
	" \n\t"
	" \n\t"
	" \n\t"
	"jmp .ZDONE \n\t" // jump to end.
	" \n\t"
	" \n\t"
	" \n\t"
	".ZBETAZERO: \n\t"
	" \n\t"
	"cmpq $16, %%rsi \n\t" // set ZF if (16*cs_c) == 16.
	"jz .ZROWSTORBZ \n\t" // jump to row storage case
	" \n\t"
	" \n\t"
	" \n\t"
	".ZGENSTORBZ: \n\t"
	" \n\t"
	" \n\t"
	"vmovapd %%ymm4, %%ymm0 \n\t"
	ZGEMM_OUTPUT_GS
	"addq %%rdx, %%rcx \n\t" // c += 2*cs_c;
	" \n\t"
	" \n\t"
	"vmovapd %%ymm5, %%ymm0 \n\t"
	ZGEMM_OUTPUT_GS
	"movq %%r11, %%rcx \n\t" // rcx = c + 1*rs_c
	" \n\t"
	" \n\t"
	" \n\t"
	"vmovapd %%ymm8, %%ymm0 \n\t"
	ZGEMM_OUTPUT_GS
	"addq %%rdx, %%rcx \n\t" // c += 2*cs_c;
	" \n\t"
	" \n\t"
	"vmovapd %%ymm9, %%ymm0 \n\t"
	ZGEMM_OUTPUT_GS
	"movq %%r12, %%rcx \n\t" // rcx = c + 2*rs_c
	" \n\t"
	" \n\t"
	" \n\t"
	"vmovapd %%ymm12, %%ymm0 \n\t"
	ZGEMM_OUTPUT_GS
	"addq %%rdx, %%rcx \n\t" // c += 2*cs_c;
	" \n\t"
	" \n\t"
	"vmovapd %%ymm13, %%ymm0 \n\t"
	ZGEMM_OUTPUT_GS
	" \n\t"
	" \n\t"
	" \n\t"
	"jmp .ZDONE \n\t" // jump to end.
	" \n\t"
	" \n\t"
	" \n\t"
	".ZROWSTORBZ: \n\t"
	" \n\t"
	" \n\t"
	// beta == 0 and cs_c == 1: store each row as two contiguous 32-byte
	// halves (rdx = 2*cs_c bytes = 32 here).
	"vmovupd %%ymm4, (%%rcx) \n\t"
	"vmovupd %%ymm5, (%%rcx,%%rdx,1) \n\t"
	" \n\t"
	"vmovupd %%ymm8, (%%r11) \n\t"
	"vmovupd %%ymm9, (%%r11,%%rdx,1) \n\t"
	" \n\t"
	"vmovupd %%ymm12, (%%r12) \n\t"
	"vmovupd %%ymm13, (%%r12,%%rdx,1) \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	" \n\t"
	".ZDONE: \n\t"
	" \n\t"
	"vzeroupper \n\t"
	" \n\t"

	: // output operands (none)
	: // input operands
	  "m" (k_iter), // 0
	  "m" (k_left), // 1
	  "m" (a),      // 2
	  "m" (b),      // 3
	  "m" (alpha),  // 4
	  "m" (beta),   // 5
	  "m" (c),      // 6
	  "m" (rs_c),   // 7
	  "m" (cs_c)/*, // 8
	  "m" (b_next), // 9
	  "m" (a_next)*/ // 10
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	);
}
cython-blis-0.9.1/blis/_src/kernels/haswell/3/old/bli_gemm_haswell_asm_d8x6.c000066400000000000000000003426731427272030600270460ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define SGEMM_INPUT_GS_BETA_NZ \ "vmovlps (%%rcx ), %%xmm0, %%xmm0 \n\t" \ "vmovhps (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ "vmovlps (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ "vmovhps (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t" \ "vmovlps (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ "vmovhps (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ /* We can't use vmovhps for loading the last element because that might result in reading beyond valid memory. (vmov[lh]psd load pairs of adjacent floats at a time.) So we need to use vmovss instead. But since we're limited to using ymm0 through ymm2 (ymm3 contains beta and ymm4 through ymm15 contain the microtile) and due to the way vmovss zeros out all bits above 31, we have to load element 7 before element 6. 
*/ \ "vmovss (%%rcx,%%r10 ), %%xmm1 \n\t" \ "vpermilps $0xcf, %%xmm1, %%xmm1 \n\t" \ "vmovlps (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ /*"vmovhps (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t"*/ \ "vshufps $0x88, %%xmm1, %%xmm2, %%xmm2 \n\t" \ "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t" #define SGEMM_OUTPUT_GS_BETA_NZ \ "vextractf128 $1, %%ymm0, %%xmm2 \n\t" \ "vmovss %%xmm0, (%%rcx ) \n\t" \ "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%rsi,1) \n\t" \ "vpermilps $0x39, %%xmm1, %%xmm0 \n\t" \ "vmovss %%xmm0, (%%rcx,%%rsi,2) \n\t" \ "vpermilps $0x39, %%xmm0, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%r13 ) \n\t" \ "vmovss %%xmm2, (%%rcx,%%rsi,4) \n\t" \ "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%r15 ) \n\t" \ "vpermilps $0x39, %%xmm1, %%xmm2 \n\t" \ "vmovss %%xmm2, (%%rcx,%%r13,2) \n\t" \ "vpermilps $0x39, %%xmm2, %%xmm1 \n\t" \ "vmovss %%xmm1, (%%rcx,%%r10 ) \n\t" /* bli_sgemm_haswell_asm_16x6: single-precision GEMM micro-kernel implemented as one inline-assembly block.
   It accumulates a 16x6 tile of products of a and b in ymm4-ymm15, scales the tile by *alpha, and writes it to c as
   (tile + (*beta) * c), or as the tile alone, without reading c, when *beta compares equal to zero.
   Parameters:
     k0           - number of rank-1 updates; split into k_iter = k0 / 4 passes of the 4x-unrolled main loop
                    plus k_left = k0 % 4 passes of the edge loop.
     alpha, beta  - pointers to the scalars; each is loaded with vbroadcastss.
     a            - read 16 floats per update with vmovaps (aligned 32-byte loads); advanced 16 floats per update.
     b            - read 6 floats per update with vbroadcastss; advanced 6 floats per update.
     c            - output tile; rs_c0 / cs_c0 are its row / column strides in elements (the asm multiplies
                    them by 4 to obtain byte strides).
     data, cntx   - not referenced here (the a_next / b_next lookups are commented out).
   Accumulator layout: for column j (0..5), ymm(4+2j) holds rows 0-7 and ymm(5+2j) holds rows 8-15.
   Store dispatch: when 4*rs_c == 4 the columns are stored with whole-register vmovups; otherwise the
   SGEMM_*_GS_BETA_NZ macros gather/scatter eight elements at a time using rsi, r13, r15 and r10 as
   1x, 3x, 5x and 7x the row byte stride.
   NOTE(review): the asm uses named (non-numeric) labels such as .SLOOPKITER; presumably this block must be
   emitted only once per object file - confirm the function is never inlined or cloned. */ void bli_sgemm_haswell_asm_16x6 ( dim_t k0, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; __asm__ volatile ( " \n\t" "vzeroall \n\t" // zero all xmm/ymm registers. " \n\t" " \n\t" "movq %2, %%rax \n\t" // load address of a. "movq %3, %%rbx \n\t" // load address of b. //"movq %9, %%r15 \n\t" // load address of b_next. 
" \n\t" "addq $32 * 4, %%rax \n\t" " \n\t" // initialize loop by pre-loading "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" " \n\t" "movq %6, %%rcx \n\t" // load address of c "movq %8, %%rdi \n\t" // load cs_c "leaq (,%%rdi,4), %%rdi \n\t" // cs_c *= sizeof(float) " \n\t" "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*cs_c; "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c + 3*cs_c; "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c + 3*cs_c "prefetcht0 7 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 4*cs_c "prefetcht0 7 * 8(%%rdx,%%rdi,2) \n\t" // prefetch c + 5*cs_c " \n\t" " \n\t" " \n\t" " \n\t" "movq %0, %%rsi \n\t" // i = k_iter; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that " \n\t" // contains the k_left loop. " \n\t" " \n\t" ".SLOOPKITER: \n\t" // MAIN LOOP " \n\t" " \n\t" " \n\t" // iteration 0 "prefetcht0 128 * 4(%%rax) \n\t" " \n\t" "vbroadcastss 0 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 1 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastss 2 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 3 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastss 4 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 5 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovaps -2 * 32(%%rax), %%ymm0 \n\t" "vmovaps -1 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 
"vbroadcastss 6 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 7 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastss 8 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 9 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastss 10 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 11 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 2 "prefetcht0 152 * 4(%%rax) \n\t" " \n\t" "vbroadcastss 12 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 13 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastss 14 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 15 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastss 16 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 17 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovaps 2 * 32(%%rax), %%ymm0 \n\t" "vmovaps 3 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 3 "vbroadcastss 18 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 19 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231ps 
%%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastss 20 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 21 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastss 22 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 23 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "addq $4 * 16 * 4, %%rax \n\t" // a += 4*16 (unroll x mr) "addq $4 * 6 * 4, %%rbx \n\t" // b += 4*6 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .SLOOPKITER \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".SCONSIDKLEFT: \n\t" " \n\t" "movq %1, %%rsi \n\t" // i = k_left; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end. " \n\t" // else, we prepare to enter k_left loop. 
" \n\t" " \n\t" ".SLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" "prefetcht0 128 * 4(%%rax) \n\t" " \n\t" "vbroadcastss 0 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 1 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastss 2 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 3 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastss 4 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 5 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "addq $1 * 16 * 4, %%rax \n\t" // a += 1*16 (unroll x mr) "addq $1 * 6 * 4, %%rbx \n\t" // b += 1*6 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .SLOOPKLEFT \n\t" // iterate again if i != 0. 
" \n\t" " \n\t" " \n\t" ".SPOSTACCUM: \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %4, %%rax \n\t" // load address of alpha "movq %5, %%rbx \n\t" // load address of beta "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha and duplicate "vbroadcastss (%%rbx), %%ymm3 \n\t" // load beta and duplicate " \n\t" "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" "vmulps %%ymm0, %%ymm6, %%ymm6 \n\t" "vmulps %%ymm0, %%ymm7, %%ymm7 \n\t" "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t" "vmulps %%ymm0, %%ymm11, %%ymm11 \n\t" "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" "vmulps %%ymm0, %%ymm14, %%ymm14 \n\t" "vmulps %%ymm0, %%ymm15, %%ymm15 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %7, %%rsi \n\t" // load rs_c "leaq (,%%rsi,4), %%rsi \n\t" // rsi = rs_c * sizeof(float) " \n\t" "leaq (%%rcx,%%rsi,8), %%rdx \n\t" // load address of c + 8*rs_c; " \n\t" "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*rs_c; "leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*rs_c; "leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*rs_c; " \n\t" " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. "vucomiss %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4. 
"jz .SCOLSTORED \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" ".SGENSTORED: \n\t" /* general stride, beta != 0: gather 8 elements of c into ymm0, form beta*c + tile with vfmadd213ps, scatter back */ " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm4, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm6, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm8, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm10, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm12, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm14, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 8*rs_c " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm5, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm7, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm9, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm11, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm13, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" SGEMM_INPUT_GS_BETA_NZ "vfmadd213ps %%ymm15, %%ymm3, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" " \n\t" "jmp .SDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".SCOLSTORED: \n\t" /* unit row stride, beta != 0: tile += beta * c straight from memory, then store; rcx walks rows 0-7 and rdx rows 8-15 of each column */ " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm4 \n\t" "vmovups %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm5 \n\t" "vmovups %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm6 \n\t" "vmovups %%ymm6, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm7 \n\t" "vmovups %%ymm7, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm8 \n\t" "vmovups %%ymm8, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm9 \n\t" "vmovups %%ymm9, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm10 \n\t" "vmovups %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm11 \n\t" "vmovups %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm12 \n\t" "vmovups %%ymm12, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm13 \n\t" "vmovups %%ymm13, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231ps (%%rcx), %%ymm3, %%ymm14 \n\t" "vmovups %%ymm14, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" "vfmadd231ps (%%rdx), %%ymm3, %%ymm15 \n\t" "vmovups %%ymm15, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "jmp .SDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".SBETAZERO: \n\t" /* beta == 0: c is only written, never read, on both paths below */ " \n\t" "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4. 
"jz .SCOLSTORBZ \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" ".SGENSTORBZ: \n\t" " \n\t" " \n\t" "vmovaps %%ymm4, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm6, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm8, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm10, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm12, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm14, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 8*rs_c " \n\t" " \n\t" "vmovaps %%ymm5, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm7, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm9, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm11, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm13, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovaps %%ymm15, %%ymm0 \n\t" SGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" " \n\t" "jmp .SDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".SCOLSTORBZ: \n\t" " \n\t" " \n\t" "vmovups %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovups %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" "vmovups %%ymm6, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovups %%ymm7, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovups %%ymm8, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovups %%ymm9, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovups %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovups %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovups %%ymm12, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovups %%ymm13, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovups %%ymm14, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" "vmovups %%ymm15, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".SDONE: \n\t" " \n\t" "vzeroupper \n\t" " \n\t" : // output operands (none) : // input operands "m" (k_iter), // 0 "m" (k_left), // 1 "m" (a), // 2 "m" (b), // 3 "m" (alpha), // 4 "m" (beta), // 5 "m" (c), // 6 "m" (rs_c), // 7 "m" (cs_c)/*, // 8 "m" (b_next), // 9 "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); } #define DGEMM_INPUT_GS_BETA_NZ \ "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ "vmovlpd (%%rcx,%%rsi,2), %%xmm1, %%xmm1 \n\t" \ "vmovhpd (%%rcx,%%r13 ), %%xmm1, %%xmm1 \n\t" \ "vperm2f128 $0x20, %%ymm1, %%ymm0, %%ymm0 \n\t" /*\ "vmovlpd (%%rcx,%%rsi,4), %%xmm2, %%xmm2 \n\t" \ "vmovhpd (%%rcx,%%r15 ), %%xmm2, %%xmm2 \n\t" \ "vmovlpd (%%rcx,%%r13,2), %%xmm1, %%xmm1 \n\t" \ "vmovhpd (%%rcx,%%r10 ), %%xmm1, %%xmm1 \n\t" \ "vperm2f128 $0x20, %%ymm1, %%ymm2, %%ymm2 \n\t"*/ #define DGEMM_OUTPUT_GS_BETA_NZ \ "vextractf128 $1, 
%%ymm0, %%xmm1 \n\t" \ "vmovlpd %%xmm0, (%%rcx ) \n\t" \ "vmovhpd %%xmm0, (%%rcx,%%rsi ) \n\t" \ "vmovlpd %%xmm1, (%%rcx,%%rsi,2) \n\t" \ "vmovhpd %%xmm1, (%%rcx,%%r13 ) \n\t" /*\ "vextractf128 $1, %%ymm2, %%xmm1 \n\t" \ "vmovlpd %%xmm2, (%%rcx,%%rsi,4) \n\t" \ "vmovhpd %%xmm2, (%%rcx,%%r15 ) \n\t" \ "vmovlpd %%xmm1, (%%rcx,%%r13,2) \n\t" \ "vmovhpd %%xmm1, (%%rcx,%%r10 ) \n\t"*/ /* bli_dgemm_haswell_asm_8x6: double-precision GEMM micro-kernel implemented as one inline-assembly block.
   It accumulates an 8x6 tile of products of a and b in ymm4-ymm15, scales the tile by *alpha, and writes it to c as
   (tile + (*beta) * c), or as the tile alone, without reading c, when *beta compares equal to zero.
   Parameters:
     k0           - number of rank-1 updates; split into k_iter = k0 / 4 passes of the 4x-unrolled main loop
                    plus k_left = k0 % 4 passes of the edge loop.
     alpha, beta  - pointers to the scalars; each is loaded with vbroadcastsd.
     a            - read 8 doubles per update with vmovapd (aligned 32-byte loads); advanced 8 doubles per update.
     b            - read 6 doubles per update with vbroadcastsd; advanced 6 doubles per update.
     c            - output tile; rs_c0 / cs_c0 are its row / column strides in elements (the asm multiplies
                    them by 8 to obtain byte strides).
     data, cntx   - not referenced here (the a_next / b_next lookups are commented out).
   Accumulator layout: for column j (0..5), ymm(4+2j) holds rows 0-3 and ymm(5+2j) holds rows 4-7.
   Store dispatch: when 8*rs_c == 8 the columns are stored with whole-register vmovupd; otherwise the
   DGEMM_*_GS_BETA_NZ macros gather/scatter four elements at a time using rsi and r13 as 1x and 3x the
   row byte stride (the r15 / r10 setup is commented out because only four rows are handled per register).
   NOTE(review): the asm uses named (non-numeric) labels such as .DLOOPKITER; presumably this block must be
   emitted only once per object file - confirm the function is never inlined or cloned. */ void bli_dgemm_haswell_asm_8x6 ( dim_t k0, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; __asm__ volatile ( " \n\t" "vzeroall \n\t" // zero all xmm/ymm registers. " \n\t" " \n\t" "movq %2, %%rax \n\t" // load address of a. "movq %3, %%rbx \n\t" // load address of b. //"movq %9, %%r15 \n\t" // load address of b_next. 
" \n\t" "addq $32 * 4, %%rax \n\t" " \n\t" // initialize loop by pre-loading "vmovapd -4 * 32(%%rax), %%ymm0 \n\t" "vmovapd -3 * 32(%%rax), %%ymm1 \n\t" " \n\t" "movq %6, %%rcx \n\t" // load address of c "movq %8, %%rdi \n\t" // load cs_c "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double) " \n\t" "leaq (%%rdi,%%rdi,2), %%r13 \n\t" // r13 = 3*cs_c; "leaq (%%rcx,%%r13,1), %%rdx \n\t" // rdx = c + 3*cs_c; "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c "prefetcht0 7 * 8(%%rdx) \n\t" // prefetch c + 3*cs_c "prefetcht0 7 * 8(%%rdx,%%rdi) \n\t" // prefetch c + 4*cs_c "prefetcht0 7 * 8(%%rdx,%%rdi,2) \n\t" // prefetch c + 5*cs_c " \n\t" " \n\t" " \n\t" " \n\t" "movq %0, %%rsi \n\t" // i = k_iter; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that " \n\t" // contains the k_left loop. " \n\t" " \n\t" ".DLOOPKITER: \n\t" // MAIN LOOP " \n\t" " \n\t" " \n\t" // iteration 0 "prefetcht0 64 * 8(%%rax) \n\t" " \n\t" "vbroadcastsd 0 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastsd 2 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastsd 4 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 5 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovapd -2 * 32(%%rax), %%ymm0 \n\t" "vmovapd -1 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 
"vbroadcastsd 6 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 7 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastsd 8 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 9 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastsd 10 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 11 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 2 "prefetcht0 76 * 8(%%rax) \n\t" " \n\t" "vbroadcastsd 12 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 13 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastsd 14 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 15 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastsd 16 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 17 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovapd 2 * 32(%%rax), %%ymm0 \n\t" "vmovapd 3 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 3 "vbroadcastsd 18 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 19 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd 
%%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastsd 20 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 21 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastsd 22 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 23 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr) "addq $4 * 6 * 8, %%rbx \n\t" // b += 4*6 (unroll x nr) " \n\t" "vmovapd -4 * 32(%%rax), %%ymm0 \n\t" "vmovapd -3 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .DLOOPKITER \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".DCONSIDKLEFT: \n\t" " \n\t" "movq %1, %%rsi \n\t" // i = k_left; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end. " \n\t" // else, we prepare to enter k_left loop. 
" \n\t" " \n\t" ".DLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" "prefetcht0 64 * 8(%%rax) \n\t" " \n\t" "vbroadcastsd 0 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastsd 2 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastsd 4 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 5 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "addq $1 * 8 * 8, %%rax \n\t" // a += 1*8 (unroll x mr) "addq $1 * 6 * 8, %%rbx \n\t" // b += 1*6 (unroll x nr) " \n\t" "vmovapd -4 * 32(%%rax), %%ymm0 \n\t" "vmovapd -3 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .DLOOPKLEFT \n\t" // iterate again if i != 0. 
" \n\t" " \n\t" " \n\t" ".DPOSTACCUM: \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %4, %%rax \n\t" // load address of alpha "movq %5, %%rbx \n\t" // load address of beta "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate "vbroadcastsd (%%rbx), %%ymm3 \n\t" // load beta and duplicate " \n\t" "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" // scale by alpha "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %7, %%rsi \n\t" // load rs_c "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(double) " \n\t" "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c; " \n\t" "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*rs_c; //"leaq (%%rsi,%%rsi,4), %%r15 \n\t" // r15 = 5*rs_c; //"leaq (%%r13,%%rsi,4), %%r10 \n\t" // r10 = 7*rs_c; " \n\t" " \n\t" " \n\t" // now avoid loading C if beta == 0 " \n\t" "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. "vucomisd %%xmm0, %%xmm3 \n\t" // set ZF if beta == 0. "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. 
"jz .DCOLSTORED \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" ".DGENSTORED: \n\t" /* general stride, beta != 0: gather 4 elements of c into ymm0, form beta*c + tile with vfmadd213pd, scatter back */ " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm4, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm6, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm8, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm10, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm12, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm14, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm5, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm7, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm9, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm11, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm13, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" DGEMM_INPUT_GS_BETA_NZ "vfmadd213pd %%ymm15, %%ymm3, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" " \n\t" "jmp .DDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".DCOLSTORED: \n\t" /* unit row stride, beta != 0: tile += beta * c straight from memory, then store; rcx walks rows 0-3 and rdx rows 4-7 of each column */ " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm4 \n\t" "vmovupd %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm5 \n\t" "vmovupd %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm6 \n\t" "vmovupd %%ymm6, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm7 \n\t" "vmovupd %%ymm7, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm8 \n\t" "vmovupd %%ymm8, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm9 \n\t" "vmovupd %%ymm9, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm10 \n\t" "vmovupd %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm11 \n\t" "vmovupd %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm12 \n\t" "vmovupd %%ymm12, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm13 \n\t" "vmovupd %%ymm13, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vfmadd231pd (%%rcx), %%ymm3, %%ymm14 \n\t" "vmovupd %%ymm14, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" "vfmadd231pd (%%rdx), %%ymm3, %%ymm15 \n\t" "vmovupd %%ymm15, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" " \n\t" "jmp .DDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".DBETAZERO: \n\t" /* beta == 0: c is only written, never read, on both paths below */ " \n\t" "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. 
"jz .DCOLSTORBZ \n\t" // jump to column storage case " \n\t" " \n\t" " \n\t" ".DGENSTORBZ: \n\t" " \n\t" " \n\t" "vmovapd %%ymm4, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm6, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm8, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm10, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm12, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm14, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "movq %%rdx, %%rcx \n\t" // rcx = c + 4*rs_c " \n\t" " \n\t" "vmovapd %%ymm5, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm7, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm9, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm11, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm13, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ "addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" "vmovapd %%ymm15, %%ymm0 \n\t" DGEMM_OUTPUT_GS_BETA_NZ //"addq %%rdi, %%rcx \n\t" // c += cs_c; " \n\t" " \n\t" " \n\t" "jmp .DDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".DCOLSTORBZ: \n\t" " \n\t" " \n\t" "vmovupd %%ymm4, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovupd %%ymm5, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" "vmovupd %%ymm6, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovupd %%ymm7, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovupd %%ymm8, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovupd %%ymm9, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovupd %%ymm10, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovupd %%ymm11, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovupd %%ymm12, (%%rcx) \n\t" "addq %%rdi, %%rcx \n\t" "vmovupd %%ymm13, (%%rdx) \n\t" "addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" "vmovupd %%ymm14, (%%rcx) \n\t" //"addq %%rdi, %%rcx \n\t" "vmovupd %%ymm15, (%%rdx) \n\t" //"addq %%rdi, %%rdx \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".DDONE: \n\t" " \n\t" "vzeroupper \n\t" " \n\t" : // output operands (none) : // input operands "m" (k_iter), // 0 "m" (k_left), // 1 "m" (a), // 2 "m" (b), // 3 "m" (alpha), // 4 "m" (beta), // 5 "m" (c), // 6 "m" (rs_c), // 7 "m" (cs_c)/*, // 8 "m" (b_next), // 9 "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); } // assumes beta.r, beta.i have been broadcast into ymm1, ymm2. 
// outputs to ymm0 #define CGEMM_INPUT_SCALE_GS_BETA_NZ \ "vmovlpd (%%rcx ), %%xmm0, %%xmm0 \n\t" \ "vmovhpd (%%rcx,%%rsi,1), %%xmm0, %%xmm0 \n\t" \ "vmovlpd (%%rcx,%%rsi,2), %%xmm3, %%xmm3 \n\t" \ "vmovhpd (%%rcx,%%r13 ), %%xmm3, %%xmm3 \n\t" \ "vinsertf128 $1, %%xmm3, %%ymm0, %%ymm0 \n\t" \ "vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \ "vmulps %%ymm1, %%ymm0, %%ymm0 \n\t" \ "vmulps %%ymm2, %%ymm3, %%ymm3 \n\t" \ "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" // assumes values to output are in ymm0 #define CGEMM_OUTPUT_GS \ "vextractf128 $1, %%ymm0, %%xmm3 \n\t" \ "vmovlpd %%xmm0, (%%rcx ) \n\t" \ "vmovhpd %%xmm0, (%%rcx,%%rsi,1) \n\t" \ "vmovlpd %%xmm3, (%%rcx,%%rsi,2) \n\t" \ "vmovhpd %%xmm3, (%%rcx,%%r13 ) \n\t" #define CGEMM_INPUT_SCALE_CS_BETA_NZ \ "vmovups (%%rcx), %%ymm0 \n\t" \ "vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \ "vmulps %%ymm1, %%ymm0, %%ymm0 \n\t" \ "vmulps %%ymm2, %%ymm3, %%ymm3 \n\t" \ "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" #define CGEMM_OUTPUT_CS \ "vmovups %%ymm0, (%%rcx) \n\t" \ void bli_cgemm_haswell_asm_8x3 ( dim_t k0, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; __asm__ volatile ( " \n\t" "vzeroall \n\t" // zero all xmm/ymm registers. " \n\t" " \n\t" "movq %2, %%rax \n\t" // load address of a. "movq %3, %%rbx \n\t" // load address of b. //"movq %9, %%r15 \n\t" // load address of b_next. 
" \n\t" "addq $32 * 4, %%rax \n\t" " \n\t" // initialize loop by pre-loading "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" " \n\t" "movq %6, %%rcx \n\t" // load address of c "movq %8, %%rdi \n\t" // load cs_c "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(scomplex) " \n\t" "leaq (%%rcx,%%rdi,1), %%r11 \n\t" // r11 = c + 1*cs_c; "leaq (%%rcx,%%rdi,2), %%r12 \n\t" // r12 = c + 2*cs_c; " \n\t" "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c "prefetcht0 7 * 8(%%r11) \n\t" // prefetch c + 1*cs_c "prefetcht0 7 * 8(%%r12) \n\t" // prefetch c + 2*cs_c " \n\t" " \n\t" " \n\t" " \n\t" "movq %0, %%rsi \n\t" // i = k_iter; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .CCONSIDKLEFT \n\t" // if i == 0, jump to code that " \n\t" // contains the k_left loop. " \n\t" " \n\t" ".CLOOPKITER: \n\t" // MAIN LOOP " \n\t" " \n\t" " \n\t" // iteration 0 "prefetcht0 32 * 8(%%rax) \n\t" " \n\t" "vbroadcastss 0 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 1 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastss 2 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 3 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastss 4 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 5 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovaps -2 * 32(%%rax), %%ymm0 \n\t" "vmovaps -1 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 "vbroadcastss 6 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 7 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231ps 
%%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastss 8 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 9 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastss 10 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 11 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" "vmovaps 1 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 2 "prefetcht0 38 * 8(%%rax) \n\t" " \n\t" "vbroadcastss 12 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 13 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastss 14 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 15 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastss 16 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 17 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovaps 2 * 32(%%rax), %%ymm0 \n\t" "vmovaps 3 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 3 "vbroadcastss 18 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 19 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastss 20 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 21 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps 
%%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastss 22 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 23 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr) "addq $4 * 3 * 8, %%rbx \n\t" // b += 4*3 (unroll x nr) " \n\t" "vmovaps -4 * 32(%%rax), %%ymm0 \n\t" "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .CLOOPKITER \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".CCONSIDKLEFT: \n\t" " \n\t" "movq %1, %%rsi \n\t" // i = k_left; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .CPOSTACCUM \n\t" // if i == 0, we're done; jump to end. " \n\t" // else, we prepare to enter k_left loop. " \n\t" " \n\t" ".CLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" "prefetcht0 32 * 8(%%rax) \n\t" " \n\t" "vbroadcastss 0 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 1 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastss 2 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 3 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastss 4 * 4(%%rbx), %%ymm2 \n\t" "vbroadcastss 5 * 4(%%rbx), %%ymm3 \n\t" "vfmadd231ps %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231ps %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231ps %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231ps %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "addq $1 * 8 * 8, %%rax \n\t" // a += 1*8 (unroll x mr) "addq $1 * 3 * 8, %%rbx \n\t" // b += 1*3 (unroll x nr) " \n\t" 
"vmovaps -4 * 32(%%rax), %%ymm0 \n\t" "vmovaps -3 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .CLOOPKLEFT \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" ".CPOSTACCUM: \n\t" " \n\t" " \n\t" " \n\t" // permute even and odd elements " \n\t" // of ymm6/7, ymm10/11, ymm/14/15 "vpermilps $0xb1, %%ymm6, %%ymm6 \n\t" "vpermilps $0xb1, %%ymm7, %%ymm7 \n\t" "vpermilps $0xb1, %%ymm10, %%ymm10 \n\t" "vpermilps $0xb1, %%ymm11, %%ymm11 \n\t" "vpermilps $0xb1, %%ymm14, %%ymm14 \n\t" "vpermilps $0xb1, %%ymm15, %%ymm15 \n\t" " \n\t" " \n\t" " \n\t" // subtract/add even/odd elements "vaddsubps %%ymm6, %%ymm4, %%ymm4 \n\t" "vaddsubps %%ymm7, %%ymm5, %%ymm5 \n\t" " \n\t" "vaddsubps %%ymm10, %%ymm8, %%ymm8 \n\t" "vaddsubps %%ymm11, %%ymm9, %%ymm9 \n\t" " \n\t" "vaddsubps %%ymm14, %%ymm12, %%ymm12 \n\t" "vaddsubps %%ymm15, %%ymm13, %%ymm13 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %4, %%rax \n\t" // load address of alpha "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha_r and duplicate "vbroadcastss 4(%%rax), %%ymm1 \n\t" // load alpha_i and duplicate " \n\t" " \n\t" "vpermilps $0xb1, %%ymm4, %%ymm3 \n\t" "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" "vaddsubps %%ymm3, %%ymm4, %%ymm4 \n\t" " \n\t" "vpermilps $0xb1, %%ymm5, %%ymm3 \n\t" "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" "vaddsubps %%ymm3, %%ymm5, %%ymm5 \n\t" " \n\t" " \n\t" "vpermilps $0xb1, %%ymm8, %%ymm3 \n\t" "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" "vaddsubps %%ymm3, %%ymm8, %%ymm8 \n\t" " \n\t" "vpermilps $0xb1, %%ymm9, %%ymm3 \n\t" "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" "vaddsubps %%ymm3, %%ymm9, %%ymm9 \n\t" " \n\t" " \n\t" "vpermilps $0xb1, %%ymm12, %%ymm3 \n\t" "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" "vaddsubps %%ymm3, %%ymm12, %%ymm12 \n\t" " \n\t" "vpermilps $0xb1, %%ymm13, %%ymm3 \n\t" "vmulps %%ymm0, %%ymm13, 
%%ymm13 \n\t" "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" "vaddsubps %%ymm3, %%ymm13, %%ymm13 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %5, %%rbx \n\t" // load address of beta "vbroadcastss (%%rbx), %%ymm1 \n\t" // load beta_r and duplicate "vbroadcastss 4(%%rbx), %%ymm2 \n\t" // load beta_i and duplicate " \n\t" " \n\t" " \n\t" " \n\t" "movq %7, %%rsi \n\t" // load rs_c "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(scomplex) "leaq (,%%rsi,4), %%rdx \n\t" // rdx = 4*rs_c; "leaq (%%rsi,%%rsi,2), %%r13 \n\t" // r13 = 3*rs_c; " \n\t" " \n\t" " \n\t" " \n\t" // now avoid loading C if beta == 0 "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. "vucomiss %%xmm0, %%xmm1 \n\t" // set ZF if beta_r == 0. "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); "vucomiss %%xmm0, %%xmm2 \n\t" // set ZF if beta_i == 0. "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1. "jne .CBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" "cmpq $8, %%rsi \n\t" // set ZF if (8*cs_c) == 8. 
"jz .CCOLSTORED \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" ".CGENSTORED: \n\t" " \n\t" " \n\t" CGEMM_INPUT_SCALE_GS_BETA_NZ "vaddps %%ymm4, %%ymm0, %%ymm0 \n\t" CGEMM_OUTPUT_GS "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; " \n\t" " \n\t" CGEMM_INPUT_SCALE_GS_BETA_NZ "vaddps %%ymm5, %%ymm0, %%ymm0 \n\t" CGEMM_OUTPUT_GS "movq %%r11, %%rcx \n\t" // rcx = c + 1*cs_c " \n\t" " \n\t" " \n\t" CGEMM_INPUT_SCALE_GS_BETA_NZ "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" CGEMM_OUTPUT_GS "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; " \n\t" " \n\t" CGEMM_INPUT_SCALE_GS_BETA_NZ "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" CGEMM_OUTPUT_GS "movq %%r12, %%rcx \n\t" // rcx = c + 2*cs_c " \n\t" " \n\t" " \n\t" CGEMM_INPUT_SCALE_GS_BETA_NZ "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" CGEMM_OUTPUT_GS "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; " \n\t" " \n\t" CGEMM_INPUT_SCALE_GS_BETA_NZ "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" CGEMM_OUTPUT_GS " \n\t" " \n\t" " \n\t" "jmp .CDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".CCOLSTORED: \n\t" " \n\t" " \n\t" CGEMM_INPUT_SCALE_CS_BETA_NZ "vaddps %%ymm4, %%ymm0, %%ymm0 \n\t" CGEMM_OUTPUT_CS "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; " \n\t" " \n\t" CGEMM_INPUT_SCALE_CS_BETA_NZ "vaddps %%ymm5, %%ymm0, %%ymm0 \n\t" CGEMM_OUTPUT_CS "movq %%r11, %%rcx \n\t" // rcx = c + 1*cs_c " \n\t" " \n\t" " \n\t" CGEMM_INPUT_SCALE_CS_BETA_NZ "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" CGEMM_OUTPUT_CS "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; " \n\t" " \n\t" CGEMM_INPUT_SCALE_CS_BETA_NZ "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" CGEMM_OUTPUT_CS "movq %%r12, %%rcx \n\t" // rcx = c + 2*cs_c " \n\t" " \n\t" " \n\t" CGEMM_INPUT_SCALE_CS_BETA_NZ "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" CGEMM_OUTPUT_CS "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; " \n\t" " \n\t" CGEMM_INPUT_SCALE_CS_BETA_NZ "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" CGEMM_OUTPUT_CS " \n\t" " \n\t" " \n\t" "jmp .CDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".CBETAZERO: \n\t" " \n\t" "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8. 
"jz .CCOLSTORBZ \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" ".CGENSTORBZ: \n\t" " \n\t" " \n\t" "vmovaps %%ymm4, %%ymm0 \n\t" CGEMM_OUTPUT_GS "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; " \n\t" " \n\t" "vmovaps %%ymm5, %%ymm0 \n\t" CGEMM_OUTPUT_GS "movq %%r11, %%rcx \n\t" // rcx = c + 1*cs_c " \n\t" " \n\t" " \n\t" "vmovaps %%ymm8, %%ymm0 \n\t" CGEMM_OUTPUT_GS "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; " \n\t" " \n\t" "vmovaps %%ymm9, %%ymm0 \n\t" CGEMM_OUTPUT_GS "movq %%r12, %%rcx \n\t" // rcx = c + 2*cs_c " \n\t" " \n\t" " \n\t" "vmovaps %%ymm12, %%ymm0 \n\t" CGEMM_OUTPUT_GS "addq %%rdx, %%rcx \n\t" // c += 4*rs_c; " \n\t" " \n\t" "vmovaps %%ymm13, %%ymm0 \n\t" CGEMM_OUTPUT_GS " \n\t" " \n\t" " \n\t" "jmp .CDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".CCOLSTORBZ: \n\t" " \n\t" " \n\t" "vmovups %%ymm4, (%%rcx) \n\t" "vmovups %%ymm5, (%%rcx,%%rdx,1) \n\t" " \n\t" "vmovups %%ymm8, (%%r11) \n\t" "vmovups %%ymm9, (%%r11,%%rdx,1) \n\t" " \n\t" "vmovups %%ymm12, (%%r12) \n\t" "vmovups %%ymm13, (%%r12,%%rdx,1) \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".CDONE: \n\t" " \n\t" "vzeroupper \n\t" " \n\t" : // output operands (none) : // input operands "m" (k_iter), // 0 "m" (k_left), // 1 "m" (a), // 2 "m" (b), // 3 "m" (alpha), // 4 "m" (beta), // 5 "m" (c), // 6 "m" (rs_c), // 7 "m" (cs_c)/*, // 8 "m" (b_next), // 9 "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); } // assumes beta.r, beta.i have been broadcast into ymm1, ymm2. 
// outputs to ymm0 #define ZGEMM_INPUT_SCALE_GS_BETA_NZ \ "vmovupd (%%rcx), %%xmm0 \n\t" \ "vmovupd (%%rcx,%%rsi), %%xmm3 \n\t" \ "vinsertf128 $1, %%xmm3, %%ymm0, %%ymm0 \n\t" \ "vpermilpd $0x5, %%ymm0, %%ymm3 \n\t" \ "vmulpd %%ymm1, %%ymm0, %%ymm0 \n\t" \ "vmulpd %%ymm2, %%ymm3, %%ymm3 \n\t" \ "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" // assumes values to output are in ymm0 #define ZGEMM_OUTPUT_GS \ "vextractf128 $1, %%ymm0, %%xmm3 \n\t" \ "vmovupd %%xmm0, (%%rcx) \n\t" \ "vmovupd %%xmm3, (%%rcx,%%rsi ) \n\t" \ #define ZGEMM_INPUT_SCALE_CS_BETA_NZ \ "vmovups (%%rcx), %%ymm0 \n\t" \ "vpermilpd $0x5, %%ymm0, %%ymm3 \n\t" \ "vmulpd %%ymm1, %%ymm0, %%ymm0 \n\t" \ "vmulpd %%ymm2, %%ymm3, %%ymm3 \n\t" \ "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" #define ZGEMM_OUTPUT_CS \ "vmovupd %%ymm0, (%%rcx) \n\t" \ void bli_zgemm_haswell_asm_4x3 ( dim_t k0, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; __asm__ volatile ( " \n\t" "vzeroall \n\t" // zero all xmm/ymm registers. " \n\t" " \n\t" "movq %2, %%rax \n\t" // load address of a. "movq %3, %%rbx \n\t" // load address of b. //"movq %9, %%r15 \n\t" // load address of b_next. 
" \n\t" "addq $32 * 4, %%rax \n\t" " \n\t" // initialize loop by pre-loading "vmovapd -4 * 32(%%rax), %%ymm0 \n\t" "vmovapd -3 * 32(%%rax), %%ymm1 \n\t" " \n\t" "movq %6, %%rcx \n\t" // load address of c "movq %8, %%rdi \n\t" // load cs_c "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(dcomplex) "leaq (,%%rdi,2), %%rdi \n\t" " \n\t" "leaq (%%rcx,%%rdi,1), %%r11 \n\t" // r11 = c + 1*cs_c; "leaq (%%rcx,%%rdi,2), %%r12 \n\t" // r12 = c + 2*cs_c; " \n\t" "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c "prefetcht0 7 * 8(%%r11) \n\t" // prefetch c + 1*cs_c "prefetcht0 7 * 8(%%r12) \n\t" // prefetch c + 2*cs_c " \n\t" " \n\t" " \n\t" " \n\t" "movq %0, %%rsi \n\t" // i = k_iter; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .ZCONSIDKLEFT \n\t" // if i == 0, jump to code that " \n\t" // contains the k_left loop. " \n\t" " \n\t" ".ZLOOPKITER: \n\t" // MAIN LOOP " \n\t" " \n\t" " \n\t" // iteration 0 "prefetcht0 32 * 16(%%rax) \n\t" " \n\t" "vbroadcastsd 0 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastsd 2 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastsd 4 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 5 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovapd -2 * 32(%%rax), %%ymm0 \n\t" "vmovapd -1 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 1 "vbroadcastsd 6 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 7 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, 
%%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastsd 8 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 9 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastsd 10 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 11 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" "vmovapd 1 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 2 "prefetcht0 38 * 16(%%rax) \n\t" " \n\t" "vbroadcastsd 12 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 13 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastsd 14 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 15 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastsd 16 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 17 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "vmovapd 2 * 32(%%rax), %%ymm0 \n\t" "vmovapd 3 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" // iteration 3 "vbroadcastsd 18 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 19 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastsd 20 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 21 * 
8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastsd 22 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 23 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "addq $4 * 4 * 16, %%rax \n\t" // a += 4*4 (unroll x mr) "addq $4 * 3 * 16, %%rbx \n\t" // b += 4*3 (unroll x nr) " \n\t" "vmovapd -4 * 32(%%rax), %%ymm0 \n\t" "vmovapd -3 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .ZLOOPKITER \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".ZCONSIDKLEFT: \n\t" " \n\t" "movq %1, %%rsi \n\t" // i = k_left; "testq %%rsi, %%rsi \n\t" // check i via logical AND. "je .ZPOSTACCUM \n\t" // if i == 0, we're done; jump to end. " \n\t" // else, we prepare to enter k_left loop. 
" \n\t" " \n\t" ".ZLOOPKLEFT: \n\t" // EDGE LOOP " \n\t" "prefetcht0 32 * 16(%%rax) \n\t" " \n\t" "vbroadcastsd 0 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 1 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm4 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm6 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm7 \n\t" " \n\t" "vbroadcastsd 2 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 3 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm10 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm11 \n\t" " \n\t" "vbroadcastsd 4 * 8(%%rbx), %%ymm2 \n\t" "vbroadcastsd 5 * 8(%%rbx), %%ymm3 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm12 \n\t" "vfmadd231pd %%ymm1, %%ymm2, %%ymm13 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vfmadd231pd %%ymm1, %%ymm3, %%ymm15 \n\t" " \n\t" "addq $1 * 4 * 16, %%rax \n\t" // a += 1*4 (unroll x mr) "addq $1 * 3 * 16, %%rbx \n\t" // b += 1*3 (unroll x nr) " \n\t" "vmovapd -4 * 32(%%rax), %%ymm0 \n\t" "vmovapd -3 * 32(%%rax), %%ymm1 \n\t" " \n\t" " \n\t" "decq %%rsi \n\t" // i -= 1; "jne .ZLOOPKLEFT \n\t" // iterate again if i != 0. 
" \n\t" " \n\t" " \n\t" ".ZPOSTACCUM: \n\t" " \n\t" " \n\t" // permute even and odd elements " \n\t" // of ymm6/7, ymm10/11, ymm/14/15 "vpermilpd $0x5, %%ymm6, %%ymm6 \n\t" "vpermilpd $0x5, %%ymm7, %%ymm7 \n\t" "vpermilpd $0x5, %%ymm10, %%ymm10 \n\t" "vpermilpd $0x5, %%ymm11, %%ymm11 \n\t" "vpermilpd $0x5, %%ymm14, %%ymm14 \n\t" "vpermilpd $0x5, %%ymm15, %%ymm15 \n\t" " \n\t" " \n\t" " \n\t" // subtract/add even/odd elements "vaddsubpd %%ymm6, %%ymm4, %%ymm4 \n\t" "vaddsubpd %%ymm7, %%ymm5, %%ymm5 \n\t" " \n\t" "vaddsubpd %%ymm10, %%ymm8, %%ymm8 \n\t" "vaddsubpd %%ymm11, %%ymm9, %%ymm9 \n\t" " \n\t" "vaddsubpd %%ymm14, %%ymm12, %%ymm12 \n\t" "vaddsubpd %%ymm15, %%ymm13, %%ymm13 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %4, %%rax \n\t" // load address of alpha "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha_r and duplicate "vbroadcastsd 8(%%rax), %%ymm1 \n\t" // load alpha_i and duplicate " \n\t" " \n\t" "vpermilpd $0x5, %%ymm4, %%ymm3 \n\t" "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" "vaddsubpd %%ymm3, %%ymm4, %%ymm4 \n\t" " \n\t" "vpermilpd $0x5, %%ymm5, %%ymm3 \n\t" "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" "vaddsubpd %%ymm3, %%ymm5, %%ymm5 \n\t" " \n\t" " \n\t" "vpermilpd $0x5, %%ymm8, %%ymm3 \n\t" "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" "vaddsubpd %%ymm3, %%ymm8, %%ymm8 \n\t" " \n\t" "vpermilpd $0x5, %%ymm9, %%ymm3 \n\t" "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" "vaddsubpd %%ymm3, %%ymm9, %%ymm9 \n\t" " \n\t" " \n\t" "vpermilpd $0x5, %%ymm12, %%ymm3 \n\t" "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" "vaddsubpd %%ymm3, %%ymm12, %%ymm12 \n\t" " \n\t" "vpermilpd $0x5, %%ymm13, %%ymm3 \n\t" "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" "vaddsubpd %%ymm3, %%ymm13, %%ymm13 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "movq %5, %%rbx \n\t" // load address of beta "vbroadcastsd 
(%%rbx), %%ymm1 \n\t" // load beta_r and duplicate "vbroadcastsd 8(%%rbx), %%ymm2 \n\t" // load beta_i and duplicate " \n\t" " \n\t" " \n\t" " \n\t" "movq %7, %%rsi \n\t" // load rs_c "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(dcomplex) "leaq (,%%rsi,2), %%rsi \n\t" "leaq (,%%rsi,2), %%rdx \n\t" // rdx = 2*rs_c; " \n\t" " \n\t" " \n\t" " \n\t" // now avoid loading C if beta == 0 "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero. "vucomisd %%xmm0, %%xmm1 \n\t" // set ZF if beta_r == 0. "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 ); "vucomisd %%xmm0, %%xmm2 \n\t" // set ZF if beta_i == 0. "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 ); "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1. "jne .ZBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case " \n\t" " \n\t" "cmpq $16, %%rsi \n\t" // set ZF if (16*rs_c) == 16. "jz .ZCOLSTORED \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" ".ZGENSTORED: \n\t" " \n\t" " \n\t" ZGEMM_INPUT_SCALE_GS_BETA_NZ "vaddpd %%ymm4, %%ymm0, %%ymm0 \n\t" ZGEMM_OUTPUT_GS "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; " \n\t" " \n\t" ZGEMM_INPUT_SCALE_GS_BETA_NZ "vaddpd %%ymm5, %%ymm0, %%ymm0 \n\t" ZGEMM_OUTPUT_GS "movq %%r11, %%rcx \n\t" // rcx = c + 1*cs_c " \n\t" " \n\t" " \n\t" ZGEMM_INPUT_SCALE_GS_BETA_NZ "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" ZGEMM_OUTPUT_GS "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; " \n\t" " \n\t" ZGEMM_INPUT_SCALE_GS_BETA_NZ "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" ZGEMM_OUTPUT_GS "movq %%r12, %%rcx \n\t" // rcx = c + 2*cs_c " \n\t" " \n\t" " \n\t" ZGEMM_INPUT_SCALE_GS_BETA_NZ "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" ZGEMM_OUTPUT_GS "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; " \n\t" " \n\t" ZGEMM_INPUT_SCALE_GS_BETA_NZ "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" ZGEMM_OUTPUT_GS " \n\t" " \n\t" " \n\t" "jmp .ZDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".ZCOLSTORED: \n\t" " \n\t" " \n\t" ZGEMM_INPUT_SCALE_CS_BETA_NZ "vaddpd %%ymm4, %%ymm0, %%ymm0 \n\t" ZGEMM_OUTPUT_CS "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; " \n\t" " \n\t" ZGEMM_INPUT_SCALE_CS_BETA_NZ "vaddpd %%ymm5, %%ymm0, %%ymm0 \n\t" ZGEMM_OUTPUT_CS "movq %%r11, %%rcx \n\t" // rcx = c + 1*cs_c " \n\t" " \n\t" " \n\t" ZGEMM_INPUT_SCALE_CS_BETA_NZ "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" ZGEMM_OUTPUT_CS "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; " \n\t" " \n\t" ZGEMM_INPUT_SCALE_CS_BETA_NZ "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" ZGEMM_OUTPUT_CS "movq %%r12, %%rcx \n\t" // rcx = c + 2*cs_c " \n\t" " \n\t" " \n\t" ZGEMM_INPUT_SCALE_CS_BETA_NZ "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" ZGEMM_OUTPUT_CS "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; " \n\t" " \n\t" ZGEMM_INPUT_SCALE_CS_BETA_NZ "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" ZGEMM_OUTPUT_CS " \n\t" " \n\t" " \n\t" "jmp .ZDONE \n\t" // jump to end. " \n\t" " \n\t" " \n\t" ".ZBETAZERO: \n\t" " \n\t" "cmpq $16, %%rsi \n\t" // set ZF if (16*rs_c) == 16. "jz .ZCOLSTORBZ \n\t" // jump to row storage case " \n\t" " \n\t" " \n\t" ".ZGENSTORBZ: \n\t" " \n\t" " \n\t" "vmovapd %%ymm4, %%ymm0 \n\t" ZGEMM_OUTPUT_GS "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; " \n\t" " \n\t" "vmovapd %%ymm5, %%ymm0 \n\t" ZGEMM_OUTPUT_GS "movq %%r11, %%rcx \n\t" // rcx = c + 1*cs_c " \n\t" " \n\t" " \n\t" "vmovapd %%ymm8, %%ymm0 \n\t" ZGEMM_OUTPUT_GS "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; " \n\t" " \n\t" "vmovapd %%ymm9, %%ymm0 \n\t" ZGEMM_OUTPUT_GS "movq %%r12, %%rcx \n\t" // rcx = c + 2*cs_c " \n\t" " \n\t" " \n\t" "vmovapd %%ymm12, %%ymm0 \n\t" ZGEMM_OUTPUT_GS "addq %%rdx, %%rcx \n\t" // c += 2*rs_c; " \n\t" " \n\t" "vmovapd %%ymm13, %%ymm0 \n\t" ZGEMM_OUTPUT_GS " \n\t" " \n\t" " \n\t" "jmp .ZDONE \n\t" // jump to end. 
" \n\t" " \n\t" " \n\t" ".ZCOLSTORBZ: \n\t" " \n\t" " \n\t" "vmovupd %%ymm4, (%%rcx) \n\t" "vmovupd %%ymm5, (%%rcx,%%rdx,1) \n\t" " \n\t" "vmovupd %%ymm8, (%%r11) \n\t" "vmovupd %%ymm9, (%%r11,%%rdx,1) \n\t" " \n\t" "vmovupd %%ymm12, (%%r12) \n\t" "vmovupd %%ymm13, (%%r12,%%rdx,1) \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" " \n\t" ".ZDONE: \n\t" " \n\t" "vzeroupper \n\t" " \n\t" : // output operands (none) : // input operands "m" (k_iter), // 0 "m" (k_left), // 1 "m" (a), // 2 "m" (b), // 3 "m" (alpha), // 4 "m" (beta), // 5 "m" (c), // 6 "m" (rs_c), // 7 "m" (cs_c)/*, // 8 "m" (b_next), // 9 "m" (a_next)*/ // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/000077500000000000000000000000001427272030600217115ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c000066400000000000000000001462351427272030600304650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. NOTE: These kernels implicitly support column-oriented IO, implemented via a high-level transposition of the entire operation. A and B will effectively remain row- and column-stored, respectively, but C will then effectively appear column-stored. Thus, this kernel may be used for both rrc and crc cases. */ // Prototype reference microkernels. 
GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) void bli_dgemmsup_rd_haswell_asm_6x8m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t n_left = n0 % 8; // First check whether this is a edge case in the n dimension. If so, // dispatch other 6x?m kernels, as needed. if ( n_left ) { double* restrict cij = c; double* restrict bj = b; double* restrict ai = a; if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_dgemmsup_rd_haswell_asm_6x4m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_dgemmsup_rd_haswell_asm_6x2m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 0 const dim_t nr_cur = 1; bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_dgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.DLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] mov(var(a), r14) // load address of a mov(var(b), rdx) // load address of b mov(var(c), r12) // load address of c lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*8), rsi) // rsi *= cs_c*sizeof(double) = 1*8 lea(mem(r12, rsi, 1), r12) // r12 = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rdx) // rbx = b + 4*jj*cs_b; mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. 
#else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 
4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // xmm6[0] = sum(ymm6); xmm6[1] = sum(ymm9) // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), 
ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. add(imm(4), r15) // jj += 4; cmp(imm(8), r15) // compare jj to 8 jl(.DLOOP3X4J) // if jj < 8, jump to beginning // of jj loop; otherwise, loop ends. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 8; const dim_t i_edge = m0 - ( dim_t )m_left; double* restrict cij = c + i_edge*rs_c; double* restrict bj = b; double* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_dgemmsup_rd_haswell_asm_2x8 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_dgemmsup_rd_haswell_asm_1x8 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } void bli_dgemmsup_rd_haswell_asm_6x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter .. 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c + 3*ii*rs_c; lea(mem(r14), rax) // rax = a + 3*ii*rs_a; lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 
4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4[0] = sum(ymm4); ymm4[1] = sum(ymm7) // ymm4[2] = sum(ymm10); ymm4[3] = sum(ymm13) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // ymm5[0] = sum(ymm5); ymm5[1] = sum(ymm8) // ymm5[2] = sum(ymm11); ymm5[3] = sum(ymm14) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm6[0] = sum(ymm6); ymm6[1] = sum(ymm9) // ymm6[2] = sum(ymm12); ymm6[3] = sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), 
ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 4; const dim_t i_edge = m0 - ( dim_t )m_left; double* restrict cij = c + i_edge*rs_c; double* restrict bj = b; double* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_dgemmsup_rd_haswell_asm_2x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_dgemmsup_rd_haswell_asm_1x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } void bli_dgemmsup_rd_haswell_asm_6x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t m_iter = m0 / 6; uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_ii; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) lea(mem(rcx, rdi, 2), r10) // lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c prefetch(0, mem(r10, 1*8)) // prefetch c + 3*rs_c prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i 
via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 
1, 0*8)) // prefetch rax + 5*rs_a #endif vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovsd(mem(rax, r8, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovsd(mem(rax, r13, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rax, r8, 4), xmm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovsd(mem(rax, r15, 1), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) vhaddpd( ymm9, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm8 ) vhaddpd( ymm11, ymm10, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm10 ) vhaddpd( ymm13, ymm12, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm12 ) vhaddpd( ymm15, ymm14, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm14 ) // xmm4[0:1] = sum(ymm4) sum(ymm5) // xmm6[0:1] = sum(ymm6) sum(ymm7) // xmm8[0:1] = sum(ymm8) sum(ymm9) // xmm10[0:1] = sum(ymm10) sum(ymm11) // xmm12[0:1] = sum(ymm12) sum(ymm13) // xmm14[0:1] = sum(ymm14) sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, 
xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm10) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm12) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm14) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) label(.DDONE) lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c lea(mem(r14, r8, 4), r14) // lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. 
label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( m_left ) { const dim_t nr_cur = 2; const dim_t i_edge = m0 - ( dim_t )m_left; double* restrict cij = c + i_edge*rs_c; double* restrict bj = b; double* restrict ai = a + i_edge*rs_a; if ( 3 <= m_left ) { const dim_t mr_cur = 3; bli_dgemmsup_rd_haswell_asm_3x2 //bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 2 <= m_left ) { const dim_t mr_cur = 2; bli_dgemmsup_rd_haswell_asm_2x2 //bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_dgemmsup_rd_haswell_asm_1x2 //bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c000066400000000000000000001747261427272030600304740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like 
libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. 
Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. NOTE: These kernels implicitly support column-oriented IO, implemented via a high-level transposition of the entire operation. A and B will effectively remain row- and column-stored, respectively, but C will then effectively appear column-stored. Thus, this kernel may be used for both rrc and crc cases. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) /* bli_dgemmsup_rd_haswell_asm_6x8n: double-precision gemmsup kernel that updates C := beta*C + alpha*A*B with dot-product-based accumulation. If m0 is not a multiple of 6 the work is dispatched up front: m0 > 6 (expected to be 7, 8 or 9, see the comment below) is split into a 6x8n call followed by a 1x8n, 2x8n or 3x8n call, and m0 < 6 is covered by the 3x8n, 2x8n and 1x8n kernels. Otherwise the inline assembly walks the 6 rows as two 3-row passes (ii = 0, 3) and, for each pass, n0/4 blocks of four columns (jj); the n0 % 4 leftover columns are then handed to the 6x2 and 6x1 kernels. In the assembly path C is not loaded when beta == 0. conja, conjb, data and cntx are not used by the assembly; they are only forwarded to the edge-case kernels. */ void bli_dgemmsup_rd_haswell_asm_6x8n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t m_left = m0 % 6; // First check whether this is an edge case in the m dimension. If so, // dispatch other ?x8n kernels, as needed. if ( m_left ) { double* restrict cij = c; double* restrict bj = b; double* restrict ai = a; #if 1 // We add special handling for slightly inflated MR blocksizes // at edge cases, up to a maximum of 9. if ( 6 < m0 ) { dgemmsup_ker_ft ker_fp1 = NULL; dgemmsup_ker_ft ker_fp2 = NULL; dim_t mr1, mr2; // These kernels don't make any attempt to optimize the cases of // inflated MR blocksizes because they don't benefit from the // load balancing that the "rv" kernels do. That is, if m0 = 7, // there is no benefit to executing that case as 4x8n followed // by 3x8n because 4x8n isn't implemented, and more generally // because these kernels are implemented as loops over their // true blocksizes, which are MR=3 NR=4. 
if ( m0 == 7 ) { mr1 = 6; mr2 = 1; ker_fp1 = bli_dgemmsup_rd_haswell_asm_6x8n; ker_fp2 = bli_dgemmsup_rd_haswell_asm_1x8n; } else if ( m0 == 8 ) { mr1 = 6; mr2 = 2; ker_fp1 = bli_dgemmsup_rd_haswell_asm_6x8n; ker_fp2 = bli_dgemmsup_rd_haswell_asm_2x8n; } else // if ( m0 == 9 ) { mr1 = 6; mr2 = 3; ker_fp1 = bli_dgemmsup_rd_haswell_asm_6x8n; ker_fp2 = bli_dgemmsup_rd_haswell_asm_3x8n; } ker_fp1 ( conja, conjb, mr1, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr1*rs_c0; ai += mr1*rs_a0; ker_fp2 ( conja, conjb, mr2, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } #endif if ( 3 <= m_left ) { const dim_t mr_cur = 3; bli_dgemmsup_rd_haswell_asm_3x8n //bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 2 <= m_left ) { const dim_t mr_cur = 2; bli_dgemmsup_rd_haswell_asm_2x8n //bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { #if 1 const dim_t mr_cur = 1; bli_dgemmsup_rd_haswell_asm_1x8n //bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_dgemv_ex ( BLIS_TRANSPOSE, conja, k0, n0, alpha, bj, rs_b0, cs_b0, ai, cs_a0, beta, cij, cs_c0, cntx, NULL ); #endif } return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), r14) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // rdx = rax = a // r14 = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r9) // ii = 0; label(.DLOOP3X4I) // LOOP OVER ii = [ 0 1 ... ] mov(var(a), rdx) // load address of a mov(var(b), r14) // load address of b mov(var(c), r12) // load address of c lea(mem( , r9, 1), rsi) // rsi = r9 = 3*ii; imul(rdi, rsi) // rsi *= rs_c lea(mem(r12, rsi, 1), r12) // r12 = c + 3*ii*rs_c; lea(mem( , r9, 1), rsi) // rsi = r9 = 3*ii; imul(r8, rsi) // rsi *= rs_a; lea(mem(rdx, rsi, 1), rdx) // rax = a + 3*ii*rs_a; mov(var(n_iter), r15) // jj = n_iter; label(.DLOOP3X4J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(rdx), rax) // rax = a_ii; lea(mem(r14), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 #if 1 prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b #endif 
vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 #if 1 prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b add(imm(16*8), r10) // r10 += 16*rs_b = 16*8; #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) 
vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. 
vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4[0] = sum(ymm4); ymm4[1] = sum(ymm7) // ymm4[2] = sum(ymm10); ymm4[3] = sum(ymm13) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // ymm5[0] = sum(ymm5); ymm5[1] = sum(ymm8) // ymm5[2] = sum(ymm11); ymm5[3] = sum(ymm14) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm6[0] = sum(ymm6); ymm6[1] = sum(ymm9) // ymm6[2] = sum(ymm12); ymm6[3] = sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), 
ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(4*8), r12) // c_jj = r12 += 4*cs_c lea(mem(r14, r11, 4), r14) // b_jj = r14 += 4*cs_b dec(r15) // jj -= 1; jne(.DLOOP3X4J) // iterate again if jj != 0. add(imm(3), r9) // ii += 3; cmp(imm(6), r9) // compare ii to 6 jl(.DLOOP3X4I) // if ii < 6, jump to beginning // of ii loop; otherwise, loop ends. label(.DRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the n dimension, if they exist. 
if ( n_left ) { const dim_t mr_cur = 6; const dim_t j_edge = n0 - ( dim_t )n_left; double* restrict cij = c + j_edge*cs_c; double* restrict ai = a; double* restrict bj = b + j_edge*cs_b; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_dgemmsup_rd_haswell_asm_6x2 //bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 1 const dim_t nr_cur = 1; bli_dgemmsup_rd_haswell_asm_6x1 //bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_dgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } } } /* bli_dgemmsup_rd_haswell_asm_3x8n: double-precision gemmsup kernel for a fixed block of 3 rows, updating C := beta*C + alpha*A*B with dot-product-based accumulation. The inline assembly loops over n0/4 blocks of four columns (jj); for each block it accumulates a 3x4 set of dot products over k in steps of 16 (four unrolled 4-element iterations), then 4, then 1 element, reduces them horizontally, scales by alpha and stores them, adding beta*C unless beta == 0 (in which case C is not loaded). The n0 % 4 leftover columns are handed to the 3x2 and 3x1 kernels. m0 is not read here: the row count is hard-wired to 3. conja, conjb, data and cntx are only forwarded to the edge-case kernels. */ void bli_dgemmsup_rd_haswell_asm_3x8n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rdx) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // rdx = rax = a // r14 = rbx = b // r9 = unused // r15 = n dim index jj mov(var(n_iter), r15) // jj = n_iter; label(.DLOOP3X4J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(rdx), rax) // rax = a_ii; lea(mem(r14), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 #if 1 prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), 
rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 #if 1 prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b add(imm(16*8), r10) // r10 += 16*rs_b = 16*8; #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4[0] = sum(ymm4); ymm4[1] = sum(ymm7) // ymm4[2] = sum(ymm10); ymm4[3] = sum(ymm13) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // ymm5[0] = sum(ymm5); ymm5[1] = sum(ymm8) // ymm5[2] = sum(ymm11); ymm5[3] = sum(ymm14) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm6[0] = sum(ymm6); ymm6[1] = sum(ymm9) // ymm6[2] = sum(ymm12); ymm6[3] = sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(4*8), r12) // c_jj = r12 += 4*cs_c lea(mem(r14, r11, 4), r14) // b_jj = r14 += 4*cs_b dec(r15) // jj -= 1; jne(.DLOOP3X4J) // iterate again if jj != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the n dimension, if they exist. 
if ( n_left ) { const dim_t mr_cur = 3; const dim_t j_edge = n0 - ( dim_t )n_left; double* restrict cij = c + j_edge*cs_c; double* restrict ai = a; double* restrict bj = b + j_edge*cs_b; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_dgemmsup_rd_haswell_asm_3x2 //bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 1 const dim_t nr_cur = 1; bli_dgemmsup_rd_haswell_asm_3x1 //bli_dgemmsup_r_haswell_ref_3x1 //bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_dgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } } } /* bli_dgemmsup_rd_haswell_asm_2x8n: double-precision, dot-product-based gemmsup kernel for a fixed two rows of C and a runtime column count n0. The inline assembly walks C in panels of four columns (n_iter = n0 / 4), accumulating two rows of A against four columns of B in ymm4/ymm5, ymm7/ymm8, ymm10/ymm11 and ymm13/ymm14, with k consumed in blocks of 16, then 4, then 1 (k_iter16, k_iter4, k_left1). Each accumulator is then reduced horizontally, scaled by alpha, combined with beta*C unless beta tests equal to zero, and stored as four contiguous doubles per row. The assembly advances a, b and c by fixed byte counts, i.e. it is written for cs_a == rs_b == cs_c == 1; rs_a, cs_b and rs_c are read at run time. The remaining n0 % 4 columns are forwarded to the 2x2 and 2x1 kernels. m0 is not read here (mr_cur is fixed at 2); conja, conjb, data and cntx are only passed through to those edge-case kernels. */ void bli_dgemmsup_rd_haswell_asm_2x8n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rdx) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // rdx = rax = a // r14 = rbx = b // r9 = unused // r15 = n dim index jj mov(var(n_iter), r15) // jj = n_iter; label(.DLOOP3X4J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(rdx), rax) // rax = a_ii; lea(mem(r14), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c #endif //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 #if 1 prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) 
add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 3 #if 1 prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b add(imm(16*8), r10) // r10 += 16*rs_b = 16*8; #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4[0] = sum(ymm4); ymm4[1] = sum(ymm7) // ymm4[2] = sum(ymm10); ymm4[3] = sum(ymm13) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // ymm5[0] = sum(ymm5); ymm5[1] = sum(ymm8) // ymm5[2] = sum(ymm11); ymm5[3] = sum(ymm14) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to 
zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(4*8), r12) // c_jj = r12 += 4*cs_c lea(mem(r14, r11, 4), r14) // b_jj = r14 += 4*cs_b dec(r15) // jj -= 1; jne(.DLOOP3X4J) // iterate again if jj != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the n dimension, if they exist. 
if ( n_left ) { const dim_t mr_cur = 2; const dim_t j_edge = n0 - ( dim_t )n_left; double* restrict cij = c + j_edge*cs_c; double* restrict ai = a; double* restrict bj = b + j_edge*cs_b; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_dgemmsup_rd_haswell_asm_2x2 //bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 1 const dim_t nr_cur = 1; bli_dgemmsup_rd_haswell_asm_2x1 //bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_dgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } } } /* bli_dgemmsup_rd_haswell_asm_1x8n: double-precision, dot-product-based gemmsup kernel for a single row of C and a runtime column count n0. The inline assembly walks C in panels of four columns (n_iter = n0 / 4), accumulating one row of A against four columns of B in ymm4, ymm7, ymm10 and ymm13, with k consumed in blocks of 16, then 4, then 1 (k_iter16, k_iter4, k_left1). The four accumulators are then reduced horizontally into ymm4, scaled by alpha, combined with beta*C unless beta tests equal to zero, and stored as four contiguous doubles. The assembly advances a, b and c by fixed byte counts, i.e. it is written for cs_a == rs_b == cs_c == 1; cs_b is read at run time, and rs_c is loaded into rdi although no later instruction in this kernel uses rdi. The remaining n0 % 4 columns are forwarded to the 1x2 and 1x1 kernels. m0 is not read here (mr_cur is fixed at 1); conja, conjb, data and cntx are only passed through to those edge-case kernels. */ void bli_dgemmsup_rd_haswell_asm_1x8n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rdx) // load address of a. 
//mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a //lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // rdx = rax = a // r14 = rbx = b // r9 = unused // r15 = n dim index jj mov(var(n_iter), r15) // jj = n_iter; label(.DLOOP3X4J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm13, ymm13, ymm13) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(rdx), rax) // rax = a_ii; lea(mem(r14), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c #endif //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b #endif vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 1 #if 1 prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b #endif vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b #endif vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 3 #if 1 prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b add(imm(16*8), r10) // r10 += 16*rs_b = 16*8; #endif vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, 
ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rax ), xmm0) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4[0] = sum(ymm4); ymm4[1] = sum(ymm7) // ymm4[2] = sum(ymm10); ymm4[3] = sum(ymm13) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(4*8), r12) // c_jj = r12 += 4*cs_c lea(mem(r14, r11, 4), r14) // b_jj = r14 += 4*cs_b dec(r15) // jj -= 1; jne(.DLOOP3X4J) // iterate again if jj != 0. 
label(.DRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the n dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 1; const dim_t j_edge = n0 - ( dim_t )n_left; double* restrict cij = c + j_edge*cs_c; double* restrict ai = a; double* restrict bj = b + j_edge*cs_b; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_dgemmsup_rd_haswell_asm_1x2 //bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 1 const dim_t nr_cur = 1; bli_dgemmsup_rd_haswell_asm_1x1 //bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_ddotxv_ex ( conja, conjb, k0, alpha, ai, cs_a0, bj, rs_b0, beta, cij, cntx, NULL ); #endif } } } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c000066400000000000000000002527511427272030600305640ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. 
NOTE: These kernels implicitly support column-oriented IO, implemented via a high-level transposition of the entire operation. A and B will effectively remain row- and column-stored, respectively, but C will then effectively appear column-stored. Thus, this kernel may be used for both rrc and crc cases. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) void bli_sgemmsup_rd_haswell_asm_6x16m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t n_left = n0 % 16; // First check whether this is an edge case in the n dimension. If so, // dispatch other 6x?m kernels, as needed. if ( n_left ) { float* restrict cij = c; float* restrict bj = b; float* restrict ai = a; if ( 12 <= n_left ) { const dim_t nr_cur = 12; bli_sgemmsup_rd_haswell_asm_6x12m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 8 <= n_left ) { const dim_t nr_cur = 8; bli_sgemmsup_rd_haswell_asm_6x8m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_sgemmsup_rd_haswell_asm_6x4m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rd_haswell_asm_6x2m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 
0 const dim_t nr_cur = 1; bli_sgemmsup_r_haswell_ref ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... 
] mov(var(a), r14) // load address of a mov(var(b), rdx) // load address of b mov(var(c), r12) // load address of c lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c*sizeof(float) = 1*4 lea(mem(r12, rsi, 1), r12) // r12 = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rdx) // rbx = b + 4*jj*cs_b; mov(var(m_iter), r9) // ii = m_iter; label(.SLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 
8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. 
label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results.
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, 
xmm0, xmm6) // xmm6[0] = sum(ymm6); xmm6[1] = sum(ymm9) // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) label(.SDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.SLOOP3X4I) // iterate again if ii != 0. add(imm(4), r15) // jj += 4; cmp(imm(16), r15) // compare jj to 16 jl(.SLOOP3X4J) // if jj < 16, jump to beginning // of jj loop; otherwise, loop ends. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( m_left ) { const dim_t nr_cur = 16; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; float* restrict bj = b; float* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_sgemmsup_rd_haswell_asm_2x16 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_sgemmsup_rd_haswell_asm_1x16 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } void bli_sgemmsup_rd_haswell_asm_6x12m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] mov(var(a), r14) // load address of a mov(var(b), rdx) // load address of b mov(var(c), r12) // load address of c lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c*sizeof(float) = 1*4 lea(mem(r12, rsi, 1), r12) // r12 = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rdx) // rbx = b + 4*jj*cs_b; mov(var(m_iter), r9) // ii = m_iter; label(.SLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. 
#else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 
8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. 
label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results.
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, 
xmm0, xmm6) // xmm6[0] = sum(ymm6); xmm6[1] = sum(ymm9) // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) label(.SDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.SLOOP3X4I) // iterate again if ii != 0. add(imm(4), r15) // jj += 4; cmp(imm(12), r15) // compare jj to 12 jl(.SLOOP3X4J) // if jj < 12, jump to beginning // of jj loop; otherwise, loop ends. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( m_left ) { const dim_t nr_cur = 12; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; float* restrict bj = b; float* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_sgemmsup_rd_haswell_asm_2x12 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_sgemmsup_rd_haswell_asm_1x12 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } void bli_sgemmsup_rd_haswell_asm_6x8m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by 
load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] mov(var(a), r14) // load address of a mov(var(b), rdx) // load address of b mov(var(c), r12) // load address of c lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c*sizeof(float) = 1*4 lea(mem(r12, rsi, 1), r12) // r12 = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rdx) // rbx = b + 4*jj*cs_b; mov(var(m_iter), r9) // ii = m_iter; label(.SLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. 
#else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 
8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. 
label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results.
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, 
xmm0, xmm6) // xmm6[0] = sum(ymm6); xmm6[1] = sum(ymm9) // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) label(.SDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.SLOOP3X4I) // iterate again if ii != 0. add(imm(4), r15) // jj += 4; cmp(imm(8), r15) // compare jj to 8 jl(.SLOOP3X4J) // if jj < 8, jump to beginning // of jj loop; otherwise, loop ends. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( m_left ) { const dim_t nr_cur = 8; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; float* restrict bj = b; float* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_sgemmsup_rd_haswell_asm_2x8 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_sgemmsup_rd_haswell_asm_1x8 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } void bli_sgemmsup_rd_haswell_asm_6x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
/* Overview of bli_sgemmsup_rd_haswell_asm_6x4m: updates C := beta*C + alpha*A*B for an m0 x 4 block of C using dot products along k. The assembly loop below handles m0 / 3 panels of three rows each; any m0 % 3 leftover rows are forwarded to the 2x4 and 1x4 kernels after the asm block. The k dimension is consumed 32 elements per main-loop pass (four unrolled steps of 8), then 8 at a time, then one at a time. The assembly reads only rs_a, cs_b and rs_c: it walks A and B with unit stride along k and stores four consecutive floats per row of C, so cs_a, rs_b and cs_c are presumably expected to be 1 (verify against callers). n0 is not read; the column count is fixed at 4. */ uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = unused here (this kernel has no jj loop) mov(var(m_iter), r9) // ii = m_iter; label(.SLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_ii; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 
8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), 
ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. 
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, 
xmm0, xmm6) // xmm6[0] = sum(ymm6); xmm6[1] = sum(ymm9) // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) label(.SDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.SLOOP3X4I) // iterate again if ii != 0. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( m_left ) { const dim_t nr_cur = 4; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; float* restrict bj = b; float* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_sgemmsup_rd_haswell_asm_2x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_sgemmsup_rd_haswell_asm_1x4 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } void bli_sgemmsup_rd_haswell_asm_6x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
/* Overview of bli_sgemmsup_rd_haswell_asm_6x2m: updates C := beta*C + alpha*A*B for an m0 x 2 block of C using dot products along k. The assembly loop below handles m0 / 6 panels of six rows each; any m0 % 6 leftover rows are forwarded to the 3x2, 2x2 and 1x2 kernels after the asm block. The k dimension is consumed 32 elements per main-loop pass (four unrolled steps of 8), then 8 at a time, then one at a time. The assembly reads only rs_a, cs_b and rs_c: it walks A and B with unit stride along k and stores two consecutive floats per row of C, so cs_a, rs_b and cs_c are presumably expected to be 1 (verify against callers). n0 is not read; the column count is fixed at 2. */ uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t m_iter = m0 / 6; uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii mov(var(m_iter), r9) // ii = m_iter; label(.SLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] (six rows per pass, despite the label name) #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_ii; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) lea(mem(rcx, rdi, 2), r10) // lea(mem(r10, rdi, 1), r10) // r10 = c + 3*rs_c; prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c prefetch(0, mem(r10, 1*4)) // prefetch c + 3*rs_c prefetch(0, mem(r10, rdi, 1, 1*4)) // prefetch c + 4*rs_c prefetch(0, mem(r10, rdi, 2, 1*4)) // prefetch c + 5*rs_c #endif mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) vmovups(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rax, r8, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm12) vfmadd231ps(ymm1, ymm3, ymm13) vmovups(mem(rax, r15, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) vmovups(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rax, r8, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm12) vfmadd231ps(ymm1, ymm3, ymm13) vmovups(mem(rax, r15, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) 
add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) vmovups(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rax, r8, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm12) vfmadd231ps(ymm1, ymm3, ymm13) vmovups(mem(rax, r15, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) vmovups(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rax, r8, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm12) vfmadd231ps(ymm1, ymm3, ymm13) vmovups(mem(rax, r15, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. 
label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) vmovups(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rax, r8, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm12) vfmadd231ps(ymm1, ymm3, ymm13) vmovups(mem(rax, r15, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. 
vmovss(mem(rbx ), xmm0) vmovss(mem(rbx, r11, 1), xmm1) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vmovss(mem(rax ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovss(mem(rax, r8, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovss(mem(rax, r8, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) vmovss(mem(rax, r13, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovss(mem(rax, r8, 4), xmm3) vfmadd231ps(ymm0, ymm3, ymm12) vfmadd231ps(ymm1, ymm3, ymm13) vmovss(mem(rax, r15, 1), xmm3) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm4 ) vhaddps( ymm7, ymm6, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm6 ) vhaddps( ymm9, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm8 ) vhaddps( ymm11, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm10 ) vhaddps( ymm13, ymm12, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm12 ) vhaddps( ymm15, ymm14, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm14 ) // xmm4[0:1] = sum(ymm4) sum(ymm5) // xmm6[0:1] = sum(ymm6) sum(ymm7) // xmm8[0:1] = sum(ymm8) sum(ymm9) // xmm10[0:1] = sum(ymm10) sum(ymm11) // xmm12[0:1] = sum(ymm12) sum(ymm13) // xmm14[0:1] = sum(ymm14) sum(ymm15) //mov(var(rs_c), 
rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm10) vmovsd(xmm10, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm12) vmovsd(xmm12, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm14) vmovsd(xmm14, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) vmovsd(xmm10, mem(rcx)) add(rdi, rcx) vmovsd(xmm12, mem(rcx)) add(rdi, rcx) vmovsd(xmm14, mem(rcx)) //add(rdi, rcx) label(.SDONE) lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c lea(mem(r14, r8, 4), r14) // lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a dec(r9) // ii -= 1; jne(.SLOOP3X4I) // iterate again if ii != 0. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( m_left ) { const dim_t nr_cur = 2; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; float* restrict bj = b; float* restrict ai = a + i_edge*rs_a; if ( 3 <= m_left ) { const dim_t mr_cur = 3; bli_sgemmsup_rd_haswell_asm_3x2 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 2 <= m_left ) { const dim_t mr_cur = 2; bli_sgemmsup_rd_haswell_asm_2x2 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_sgemmsup_rd_haswell_asm_1x2 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c000066400000000000000000001766231427272030600305700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like 
libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. 
Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. NOTE: These kernels implicitly support column-oriented IO, implemented via a high-level transposition of the entire operation. A and B will effectively remain row- and column-stored, respectively, but C will then effectively appear column-stored. Thus, this kernel may be used for both rrc and crc cases. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) void bli_sgemmsup_rd_haswell_asm_6x16n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t m_left = m0 % 6; // First check whether this is a edge case in the n dimension. If so, // dispatch other ?x8m kernels, as needed. if ( m_left ) { float* restrict cij = c; float* restrict bj = b; float* restrict ai = a; #if 1 // We add special handling for slightly inflated MR blocksizes // at edge cases, up to a maximum of 9. if ( 6 < m0 ) { sgemmsup_ker_ft ker_fp1 = NULL; sgemmsup_ker_ft ker_fp2 = NULL; dim_t mr1, mr2; // These kernels don't make any attempt to optimize the cases of // inflated MR blocksizes because they don't benefit from the // load balancing that the "rv" kernels do. That is, if m0 = 7, // there is no benefit to executing that case as 4x16n followed // by 3x16n because 4x16n isn't implemented, and more generally // because these kernels are implemented as loops over their // true blocksizes, which are MR=3 NR=4. 
if ( m0 == 7 ) { mr1 = 6; mr2 = 1; ker_fp1 = bli_sgemmsup_rd_haswell_asm_6x16n; ker_fp2 = bli_sgemmsup_rd_haswell_asm_1x16n; } else if ( m0 == 8 ) { mr1 = 6; mr2 = 2; ker_fp1 = bli_sgemmsup_rd_haswell_asm_6x16n; ker_fp2 = bli_sgemmsup_rd_haswell_asm_2x16n; } else // if ( m0 == 9 ) { mr1 = 6; mr2 = 3; ker_fp1 = bli_sgemmsup_rd_haswell_asm_6x16n; ker_fp2 = bli_sgemmsup_rd_haswell_asm_3x16n; } ker_fp1 ( conja, conjb, mr1, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr1*rs_c0; ai += mr1*rs_a0; ker_fp2 ( conja, conjb, mr2, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } #endif if ( 3 <= m_left ) { const dim_t mr_cur = 3; bli_sgemmsup_rd_haswell_asm_3x16n //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 2 <= m_left ) { const dim_t mr_cur = 2; bli_sgemmsup_rd_haswell_asm_2x16n //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { #if 1 const dim_t mr_cur = 1; bli_sgemmsup_rd_haswell_asm_1x16n //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_sgemv_ex ( BLIS_TRANSPOSE, conja, k0, n0, alpha, bj, rs_b0, cs_b0, ai, cs_a0, beta, cij, cs_c0, cntx, NULL ); #endif } return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), r14) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // rdx = rax = a // r14 = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r9) // ii = 0; label(.SLOOP3X4I) // LOOP OVER ii = [ 0 1 ... ] mov(var(b), r14) // load address of b mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) lea(mem( , r9, 1), rsi) // rsi = r9 = 3*ii; imul(rdi, rsi) // rsi *= rs_c lea(mem(r12, rsi, 1), r12) // r12 = c + 3*ii*rs_c; lea(mem( , r9, 1), rsi) // rsi = r9 = 3*ii; imul(r8, rsi) // rsi *= rs_a; lea(mem(rdx, rsi, 1), rdx) // rax = a + 3*ii*rs_a; mov(var(n_iter), r15) // jj = n_iter; label(.SLOOP3X4J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(rdx), rax) // rax = a_ii; lea(mem(r14), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c #endif //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 #if 1 prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b #endif 
vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 #if 1 prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b add(imm(32*4), r10) // r10 += 32*rs_b = 32*4; #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) 
vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, 
xmm0, xmm6) // xmm6[0] = sum(ymm6); xmm6[1] = sum(ymm9) // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) label(.SDONE) add(imm(4*4), r12) // c_jj = r12 += 4*cs_c lea(mem(r14, r11, 4), r14) // b_jj = r14 += 4*cs_b dec(r15) // jj -= 1; jne(.SLOOP3X4J) // iterate again if jj != 0. add(imm(3), r9) // ii += 3; cmp(imm(6), r9) // compare jj to 6 jl(.SLOOP3X4I) // if ii < 6, jump to beginning // of ii loop; otherwise, loop ends. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the n dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 6; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; float* restrict bj = b + j_edge*cs_b; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rd_haswell_asm_6x2 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 1 const dim_t nr_cur = 1; bli_sgemmsup_rd_haswell_asm_6x1 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } } } void bli_sgemmsup_rd_haswell_asm_3x16n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = 
bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // rdx = rax = a // r14 = rbx = b // r9 = unused // r15 = n dim index jj mov(var(n_iter), r15) // jj = n_iter; label(.SLOOP3X4J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(rdx), rax) // rax = a_ii; lea(mem(r14), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c #endif //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 #if 1 prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b #endif 
vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 #if 1 prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b add(imm(32*4), r10) // r10 += 32*rs_b = 32*4; #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) 
vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, 
xmm0, xmm6) // xmm6[0] = sum(ymm6); xmm6[1] = sum(ymm9) // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) label(.SDONE) add(imm(4*4), r12) // c_jj = r12 += 4*cs_c lea(mem(r14, r11, 4), r14) // b_jj = r14 += 4*cs_b dec(r15) // jj -= 1; jne(.SLOOP3X4J) // iterate again if jj != 0. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the n dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 3; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; float* restrict bj = b + j_edge*cs_b; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rd_haswell_asm_3x2 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 1 const dim_t nr_cur = 1; bli_sgemmsup_rd_haswell_asm_3x1 //bli_sgemmsup_r_haswell_ref_3x1 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } } } void bli_sgemmsup_rd_haswell_asm_2x16n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = 
bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // rdx = rax = a // r14 = rbx = b // r9 = unused // r15 = n dim index jj mov(var(n_iter), r15) // jj = n_iter; label(.SLOOP3X4J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(rdx), rax) // rax = a_ii; lea(mem(r14), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 #if 1 prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) 
vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 3 #if 1 prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b add(imm(32*4), r10) // r10 += 32*rs_b = 32*4; #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. 
label(.SLOOPKITER8) // EDGE LOOP (ymm) vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. 
label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) label(.SDONE) add(imm(4*4), r12) // c_jj = r12 += 4*cs_c lea(mem(r14, r11, 4), r14) // b_jj = r14 += 4*cs_b dec(r15) // jj -= 1; jne(.SLOOP3X4J) // iterate again if jj != 0. label(.SRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the n dimension, if they exist. 
if ( n_left ) { const dim_t mr_cur = 2; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; float* restrict bj = b + j_edge*cs_b; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rd_haswell_asm_2x2 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 1 const dim_t nr_cur = 1; bli_sgemmsup_rd_haswell_asm_2x1 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } } } void bli_sgemmsup_rd_haswell_asm_1x16n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rdx) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // rdx = rax = a // r14 = rbx = b // r9 = unused // r15 = n dim index jj mov(var(n_iter), r15) // jj = n_iter; label(.SLOOP3X4J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm7, ymm7, ymm7) vxorps(ymm10, ymm10, ymm10) vxorps(ymm13, ymm13, ymm13) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(rdx), rax) // rax = a_ii; lea(mem(r14), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 1 #if 1 prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 3 #if 1 prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b add(imm(32*4), r10) // r10 += 32*rs_b = 32*4; #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, 
ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. 
label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) label(.SDONE) add(imm(4*4), r12) // c_jj = r12 += 4*cs_c lea(mem(r14, r11, 4), r14) // b_jj = r14 += 4*cs_b dec(r15) // jj -= 1; jne(.SLOOP3X4J) // iterate again if jj != 0. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the n dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 1; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; float* restrict bj = b + j_edge*cs_b; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rd_haswell_asm_1x2 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 1 const dim_t nr_cur = 1; bli_sgemmsup_rd_haswell_asm_1x1 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_sdotxv_ex ( conja, conjb, k0, alpha, ai, cs_a0, bj, rs_b0, beta, cij, cntx, NULL ); #endif } } } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c000066400000000000000000002407661427272030600305130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. 
Therefore, this (r)ow-preferential kernel is well-suited for contiguous
   (v)ector loads on B and single-element broadcasts from A.

   NOTE: These kernels explicitly support column-oriented IO, implemented
   via an in-register transpose. And thus they also support the crr and
   ccr cases, though only crr is ever utilized (because ccr is handled by
   transposing the operation and executing rcr, which does not incur the
   cost of the in-register transpose).

   crr:
     | | | | | | | |       ------        --------
     | | | | | | | |       ------        --------
     | | | | | | | |  +=   ------ ...    --------
     | | | | | | | |       ------        --------
     | | | | | | | |       ------            :
     | | | | | | | |       ------            :
*/

// Prototype reference microkernels.
GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )


/*
   bli_dgemmsup_rv_haswell_asm_6x8m

   Double-precision kernel that updates an m0 x 8 block of C,

     C := beta * C + alpha * A * B,

   by looping over the m dimension in panels of 6 rows.

   Control flow, as implemented below:

   - If n0 is not a multiple of 8, the n0 % 8 columns are handled by the
     narrower 6x6m / 6x4m / 6x2m kernels (and by gemv for a single column)
     and the function returns without entering the assembly.
   - Otherwise the inline assembly processes m0 / 6 full panels. Each panel
     accumulates a 6x8 tile in ymm4..ymm15 (two ymm registers per row),
     with k unrolled by 4 (k_iter) plus a k_left remainder loop.
   - The remaining m0 % 6 rows are dispatched to the matching
     bli_dgemmsup_rv_haswell_asm_{1..5}x8 kernel.

   Storage handling visible in the assembly:

   - Each row of B is read as two 32-byte vectors starting at rbx, so the
     8 elements of a B row are accessed contiguously; cs_b is not read.
   - C is treated as column-stored when rs_c * sizeof(double) == 8 and as
     row-stored otherwise; the column-stored path transposes the tile in
     registers before the update.
   - A is advanced from one 6-row panel to the next by ps_a elements,
     where ps_a is obtained from bli_auxinfo_ps_a( data ).
   - C is not read when beta == 0.

   conja is only forwarded to the kernels called for edge cases.
*/
void bli_dgemmsup_rv_haswell_asm_6x8m
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    uint64_t n_left = n0 % 8;

    // First check whether this is an edge case in the n dimension. If so,
    // dispatch other 6x?m kernels, as needed.
    if ( n_left )
    {
        double* restrict cij = c;
        double* restrict bj  = b;
        double* restrict ai  = a;

        // Peel off 6, then 4, then 2 columns; whatever single column is
        // left afterwards is handled by the gemv path below.
        if ( 6 <= n_left )
        {
            const dim_t nr_cur = 6;

            bli_dgemmsup_rv_haswell_asm_6x6m
            (
              conja, conjb, m0, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
        }
        if ( 4 <= n_left )
        {
            const dim_t nr_cur = 4;

            bli_dgemmsup_rv_haswell_asm_6x4m
            (
              conja, conjb, m0, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
        }
        if ( 2 <= n_left )
        {
            const dim_t nr_cur = 2;

            bli_dgemmsup_rv_haswell_asm_6x2m
            (
              conja, conjb, m0, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
        }
        if ( 1 == n_left )
        {
#if 0
            const dim_t nr_cur = 1;

            bli_dgemmsup_r_haswell_ref
            (
              conja, conjb, m0, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
#else
            dim_t ps_a0 = bli_auxinfo_ps_a( data );

            // A panel stride equal to 6 rows means consecutive panels are
            // contiguous in the row dimension, so all m0 rows can be
            // addressed with rs_a0 alone.
            if ( ps_a0 == 6 * rs_a0 )
            {
                // Since A is not packed, we can use one gemv.
                bli_dgemv_ex
                (
                  BLIS_NO_TRANSPOSE, conjb, m0, k0,
                  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
                  beta, cij, rs_c0, cntx, NULL
                );
            }
            else
            {
                const dim_t mr = 6;

                // Since A is packed into row panels, we must use a loop over
                // gemv.
                dim_t m_iter = ( m0 + mr - 1 ) / mr;
                dim_t m_left =   m0            % mr;

                double* restrict ai_ii  = ai;
                double* restrict cij_ii = cij;

                for ( dim_t ii = 0; ii < m_iter; ii += 1 )
                {
                    dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left )
                                     ? mr : m_left );

                    bli_dgemv_ex
                    (
                      BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
                      alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0,
                      beta, cij_ii, rs_c0, cntx, NULL
                    );
                    // Step c by mr rows and a by one panel stride.
                    cij_ii += mr*rs_c0; ai_ii += ps_a0;
                }
            }
#endif
        }
        return;
    }

    //void*    a_next = bli_auxinfo_next_a( data );
    //void*    b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;

    uint64_t m_iter = m0 / 6;
    uint64_t m_left = m0 % 6;

    uint64_t rs_a   = rs_a0;
    uint64_t cs_a   = cs_a0;
    uint64_t rs_b   = rs_b0;
    uint64_t cs_b   = cs_b0;
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;

    // Query the panel stride of A and convert it to units of bytes.
    uint64_t ps_a   = bli_auxinfo_ps_a( data );
    uint64_t ps_a8  = ps_a * sizeof( double );

    // Fewer than six rows: skip the asm block and go straight to the
    // m-dimension edge-case kernels.
    if ( m_iter == 0 ) goto consider_edge_cases;

    // -------------------------------------------------------------------------

    begin_asm()

    //vzeroall()                         // zero all xmm/ymm registers.

    mov(var(a), r14)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    //mov(var(b), rbx)                   // load address of b.
    mov(var(rs_b), r10)                // load rs_b
    //mov(var(cs_b), r11)                // load cs_b
    lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    //lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

                                       // NOTE: We cannot pre-load elements of a or b
                                       // because it could eventually, in the last
                                       // unrolled iter or the cleanup loop, result
                                       // in reading beyond the bounds allocated mem
                                       // (the likely result: a segmentation fault).

    mov(var(c), r12)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    // During preamble and loops:
    // r12 = rcx = c
    // r14 = rax = a
    // read rbx from var(b) near beginning of loop
    // r11 = m dim index ii

    mov(var(m_iter), r11)              // ii = m_iter;

    label(.DLOOP6X8I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]

#if 0
    vzeroall()                         // zero all xmm/ymm registers.
#else
                                       // skylake can execute 3 vxorpd ipc with
                                       // a latency of 1 cycle, while vzeroall
                                       // has a latency of 12 cycles.
    vxorpd(ymm4,  ymm4,  ymm4)
    vxorpd(ymm5,  ymm5,  ymm5)
    vxorpd(ymm6,  ymm6,  ymm6)
    vxorpd(ymm7,  ymm7,  ymm7)
    vxorpd(ymm8,  ymm8,  ymm8)
    vxorpd(ymm9,  ymm9,  ymm9)
    vxorpd(ymm10, ymm10, ymm10)
    vxorpd(ymm11, ymm11, ymm11)
    vxorpd(ymm12, ymm12, ymm12)
    vxorpd(ymm13, ymm13, ymm13)
    vxorpd(ymm14, ymm14, ymm14)
    vxorpd(ymm15, ymm15, ymm15)
#endif

    mov(var(b), rbx)                   // load address of b.
    //mov(r12, rcx)                      // reset rcx to current utile of c.
    mov(r14, rax)                      // reset rax to current upanel of a.

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLPFETCH)                    // jump to column storage case
    label(.DROWPFETCH)                 // row-stored prefetching on c

    lea(mem(r12, rdi, 2), rdx)         //
    lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
    prefetch(0, mem(r12,         7*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(r12, rdi, 1, 7*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(r12, rdi, 2, 7*8)) // prefetch c + 2*rs_c
    prefetch(0, mem(rdx,         7*8)) // prefetch c + 3*rs_c
    prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
    prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c

    jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
    label(.DCOLPFETCH)                 // column-stored prefetching c

    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
    lea(mem(r12, rsi, 2), rdx)         //
    lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
    prefetch(0, mem(r12,         5*8)) // prefetch c + 0*cs_c
    prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(rdx,         5*8)) // prefetch c + 3*cs_c
    prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c
    prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c
    lea(mem(rdx, rsi, 2), rdx)         // rdx = c + 5*cs_c;
    prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 6*cs_c
    prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 7*cs_c

    label(.DPOSTPFETCH)                // done prefetching c

#if 1
    mov(var(ps_a8), rdx)               // load ps_a8
    lea(mem(rax, rdx, 1), rdx)         // rdx = a + ps_a8
    lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
                                       // use rcx, rdx for prefetching lines
                                       // from next upanel of a.
#else
    lea(mem(rax, r8, 4), rdx)          // use rdx for prefetching lines
    lea(mem(rdx, r8, 2), rdx)          // from next upanel of a.
    lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
#endif

    mov(var(k_iter), rsi)              // i = k_iter;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
                                       // contains the k_left loop.

    label(.DLOOPKITER)                 // MAIN LOOP

    // Each unrolled iteration loads one row of b (ymm0, ymm1), broadcasts
    // the six elements of the current column of a two at a time (ymm2,
    // ymm3), and accumulates the rank-1 update into ymm4..ymm15.

    // ---------------------------------- iteration 0

#if 0
    prefetch(0, mem(rdx, 5*8))
#else
    prefetch(0, mem(rdx, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm12)
    vfmadd231pd(ymm1, ymm2, ymm13)
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    // ---------------------------------- iteration 1

#if 0
    prefetch(0, mem(rdx, 5*8))
#else
    prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm12)
    vfmadd231pd(ymm1, ymm2, ymm13)
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    // ---------------------------------- iteration 2

#if 0
    prefetch(0, mem(rdx, 5*8))
#else
    prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm12)
    vfmadd231pd(ymm1, ymm2, ymm13)
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    // ---------------------------------- iteration 3

#if 0
    prefetch(0, mem(rdx, 5*8))
#else
    prefetch(0, mem(rdx, rcx, 1, 5*8))
    lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm12)
    vfmadd231pd(ymm1, ymm2, ymm13)
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER)                   // iterate again if i != 0.

    label(.DCONSIDKLEFT)

    mov(var(k_left), rsi)              // i = k_left;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left loop.

    label(.DLOOPKLEFT)                 // EDGE LOOP

#if 1
    prefetch(0, mem(rdx, 5*8))
    add(r9, rdx)
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm12)
    vfmadd231pd(ymm1, ymm2, ymm13)
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT)                   // iterate again if i != 0.

    label(.DPOSTACCUM)

    mov(r12, rcx)                      // reset rcx to current utile of c.
    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
    vmulpd(ymm0, ymm5, ymm5)
    vmulpd(ymm0, ymm6, ymm6)
    vmulpd(ymm0, ymm7, ymm7)
    vmulpd(ymm0, ymm8, ymm8)
    vmulpd(ymm0, ymm9, ymm9)
    vmulpd(ymm0, ymm10, ymm10)
    vmulpd(ymm0, ymm11, ymm11)
    vmulpd(ymm0, ymm12, ymm12)
    vmulpd(ymm0, ymm13, ymm13)
    vmulpd(ymm0, ymm14, ymm14)
    vmulpd(ymm0, ymm15, ymm15)

    mov(var(cs_c), rsi)                // load cs_c
    lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

    //lea(mem(rcx, rsi, 4), rdx)         // load address of c + 4*cs_c;
    lea(mem(rcx, rdi, 4), rdx)         // load address of c + 4*rs_c;

    lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

                                       // now avoid loading C if beta == 0

    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
    je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORED)                    // jump to column storage case

    label(.DROWSTORED)

    // Row-stored c: each row of the tile is two ymm registers; update
    // c := beta*c + tile one row at a time, stepping rcx by rs_c.

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
    vmovupd(ymm4, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
    vmovupd(ymm5, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
    vmovupd(ymm6, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7)
    vmovupd(ymm7, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
    vmovupd(ymm8, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9)
    vmovupd(ymm9, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
    vmovupd(ymm10, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11)
    vmovupd(ymm11, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
    vmovupd(ymm12, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13)
    vmovupd(ymm13, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14)
    vmovupd(ymm14, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), ymm3, ymm15)
    vmovupd(ymm15, mem(rcx, 1*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DCOLSTORED)

    // Column-stored c (this path is only reached when 8*rs_c == 8, so
    // rdx = rcx + 4*rdi addresses rows 4-5 of the same columns as rcx).
    // Rows 0-3 are transposed into ymm registers and rows 4-5 into xmm
    // registers, four columns at a time.
    // ymm3 is used as scratch by the transpose, so beta is re-broadcast
    // from mem(rbx) afterwards.

                                       // begin I/O on columns 0-3
    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vunpcklpd(ymm10, ymm8, ymm2)
    vunpckhpd(ymm10, ymm8, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

    vbroadcastsd(mem(rbx), ymm3)

    vfmadd231pd(mem(rcx        ), ymm3, ymm4)
    vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
    vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
    vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
    vmovupd(ymm4, mem(rcx        ))
    vmovupd(ymm6, mem(rcx, rsi, 1))
    vmovupd(ymm8, mem(rcx, rsi, 2))
    vmovupd(ymm10, mem(rcx, rax, 1))

    lea(mem(rcx, rsi, 4), rcx)

    vunpcklpd(ymm14, ymm12, ymm0)
    vunpckhpd(ymm14, ymm12, ymm1)
    vextractf128(imm(0x1), ymm0, xmm2)
    vextractf128(imm(0x1), ymm1, xmm4)

    vfmadd231pd(mem(rdx        ), xmm3, xmm0)
    vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
    vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
    vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
    vmovupd(xmm0, mem(rdx        ))
    vmovupd(xmm1, mem(rdx, rsi, 1))
    vmovupd(xmm2, mem(rdx, rsi, 2))
    vmovupd(xmm4, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx)

                                       // begin I/O on columns 4-7
    vunpcklpd(ymm7, ymm5, ymm0)
    vunpckhpd(ymm7, ymm5, ymm1)
    vunpcklpd(ymm11, ymm9, ymm2)
    vunpckhpd(ymm11, ymm9, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm11)

    vbroadcastsd(mem(rbx), ymm3)

    vfmadd231pd(mem(rcx        ), ymm3, ymm5)
    vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7)
    vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9)
    vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11)
    vmovupd(ymm5, mem(rcx        ))
    vmovupd(ymm7, mem(rcx, rsi, 1))
    vmovupd(ymm9, mem(rcx, rsi, 2))
    vmovupd(ymm11, mem(rcx, rax, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    vunpcklpd(ymm15, ymm13, ymm0)
    vunpckhpd(ymm15, ymm13, ymm1)
    vextractf128(imm(0x1), ymm0, xmm2)
    vextractf128(imm(0x1), ymm1, xmm4)

    vfmadd231pd(mem(rdx        ), xmm3, xmm0)
    vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
    vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
    vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
    vmovupd(xmm0, mem(rdx        ))
    vmovupd(xmm1, mem(rdx, rsi, 1))
    vmovupd(xmm2, mem(rdx, rsi, 2))
    vmovupd(xmm4, mem(rdx, rax, 1))

    //lea(mem(rdx, rsi, 4), rdx)

    jmp(.DDONE)                        // jump to end.

    label(.DBETAZERO)

    // beta == 0: same two storage cases as above, but c is only written,
    // never read.

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORBZ)                    // jump to column storage case

    label(.DROWSTORBZ)

    vmovupd(ymm4, mem(rcx, 0*32))
    vmovupd(ymm5, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovupd(ymm6, mem(rcx, 0*32))
    vmovupd(ymm7, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovupd(ymm8, mem(rcx, 0*32))
    vmovupd(ymm9, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovupd(ymm10, mem(rcx, 0*32))
    vmovupd(ymm11, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovupd(ymm12, mem(rcx, 0*32))
    vmovupd(ymm13, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovupd(ymm14, mem(rcx, 0*32))
    vmovupd(ymm15, mem(rcx, 1*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DCOLSTORBZ)

                                       // begin I/O on columns 0-3
    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vunpcklpd(ymm10, ymm8, ymm2)
    vunpckhpd(ymm10, ymm8, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

    vmovupd(ymm4, mem(rcx        ))
    vmovupd(ymm6, mem(rcx, rsi, 1))
    vmovupd(ymm8, mem(rcx, rsi, 2))
    vmovupd(ymm10, mem(rcx, rax, 1))

    lea(mem(rcx, rsi, 4), rcx)

    vunpcklpd(ymm14, ymm12, ymm0)
    vunpckhpd(ymm14, ymm12, ymm1)
    vextractf128(imm(0x1), ymm0, xmm2)
    vextractf128(imm(0x1), ymm1, xmm4)

    vmovupd(xmm0, mem(rdx        ))
    vmovupd(xmm1, mem(rdx, rsi, 1))
    vmovupd(xmm2, mem(rdx, rsi, 2))
    vmovupd(xmm4, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx)

                                       // begin I/O on columns 4-7
    vunpcklpd(ymm7, ymm5, ymm0)
    vunpckhpd(ymm7, ymm5, ymm1)
    vunpcklpd(ymm11, ymm9, ymm2)
    vunpckhpd(ymm11, ymm9, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm11)

    vmovupd(ymm5, mem(rcx        ))
    vmovupd(ymm7, mem(rcx, rsi, 1))
    vmovupd(ymm9, mem(rcx, rsi, 2))
    vmovupd(ymm11, mem(rcx, rax, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    vunpcklpd(ymm15, ymm13, ymm0)
    vunpckhpd(ymm15, ymm13, ymm1)
    vextractf128(imm(0x1), ymm0, xmm2)
    vextractf128(imm(0x1), ymm1, xmm4)

    vmovupd(xmm0, mem(rdx        ))
    vmovupd(xmm1, mem(rdx, rsi, 1))
    vmovupd(xmm2, mem(rdx, rsi, 2))
    vmovupd(xmm4, mem(rdx, rax, 1))

    //lea(mem(rdx, rsi, 4), rdx)

    label(.DDONE)

    lea(mem(r12, rdi, 4), r12)         //
    lea(mem(r12, rdi, 2), r12)         // c_ii = r12 += 6*rs_c

    //lea(mem(r14, r8,  4), r14)         //
    //lea(mem(r14, r8,  2), r14)         // a_ii = r14 += 6*rs_a
    mov(var(ps_a8), rax)               // load ps_a8
    lea(mem(r14, rax, 1), r14)         // a_ii = r14 += ps_a8

    dec(r11)                           // ii -= 1;
    jne(.DLOOP6X8I)                    // iterate again if ii != 0.

    label(.DRETURN)

    end_asm(
    : // output operands (none)
    : // input operands
      [m_iter] "m" (m_iter),
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a]      "m" (a),
      [rs_a]   "m" (rs_a),
      [cs_a]   "m" (cs_a),
      [ps_a8]  "m" (ps_a8),
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
      [alpha]  "m" (alpha),
      [beta]   "m" (beta),
      [c]      "m" (c),
      [rs_c]   "m" (rs_c),
      [cs_c]   "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )

    consider_edge_cases:

    // Handle edge cases in the m dimension, if they exist.
    if ( m_left )
    {
        const dim_t      nr_cur = 8;
        const dim_t      i_edge = m0 - ( dim_t )m_left;

        // First row of c, and first panel of a, not covered by the asm
        // block above. a is addressed by whole panels (m_iter * ps_a).
        double* restrict cij = c + i_edge*rs_c;
        //double* restrict ai  = a + i_edge*rs_a;
        //double* restrict ai  = a + ( i_edge / 6 ) * ps_a;
        double* restrict ai  = a + m_iter * ps_a;
        double* restrict bj  = b;

#if 0
        // We add special handling for slightly inflated MR blocksizes
        // at edge cases, up to a maximum of 9.
        if ( 6 < m_left )
        {
            dgemmsup_ker_ft  ker_fp1 = NULL;
            dgemmsup_ker_ft  ker_fp2 = NULL;
            dim_t            mr1, mr2;

            if ( m_left == 7 )
            {
                mr1 = 4; mr2 = 3;
                ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x8;
                ker_fp2 = bli_dgemmsup_rv_haswell_asm_3x8;
            }
            else if ( m_left == 8 )
            {
                mr1 = 4; mr2 = 4;
                ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x8;
                ker_fp2 = bli_dgemmsup_rv_haswell_asm_4x8;
            }
            else // if ( m_left == 9 )
            {
                mr1 = 4; mr2 = 5;
                ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x8;
                ker_fp2 = bli_dgemmsup_rv_haswell_asm_5x8;
            }

            ker_fp1
            (
              conja, conjb, mr1, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
            cij += mr1*rs_c0; ai += mr1*rs_a0;

            ker_fp2
            (
              conja, conjb, mr2, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );

            return;
        }
#endif

        // Table indexed by m_left (1..5 here, since m_left = m0 % 6 and
        // the enclosing branch excludes 0).
        dgemmsup_ker_ft ker_fps[6] =
        {
          NULL,
          bli_dgemmsup_rv_haswell_asm_1x8,
          bli_dgemmsup_rv_haswell_asm_2x8,
          bli_dgemmsup_rv_haswell_asm_3x8,
          bli_dgemmsup_rv_haswell_asm_4x8,
          bli_dgemmsup_rv_haswell_asm_5x8
        };

        dgemmsup_ker_ft ker_fp = ker_fps[ m_left ];

        ker_fp
        (
          conja, conjb, m_left, nr_cur, k0,
          alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
          beta, cij, rs_c0, cs_c0, data, cntx
        );

        return;
    }
}

void bli_dgemmsup_rv_haswell_asm_6x6m
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void*    a_next = bli_auxinfo_next_a( data );
    //void*    b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_iter = m0 / 6; uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of A and convert it to units of bytes. uint64_t ps_a = bli_auxinfo_ps_a( data ); uint64_t ps_a8 = ps_a * sizeof( double ); if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.DLOOP6X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm1, ymm1, ymm1) // zero ymm1 since we only use the lower vxorpd(ymm4, ymm4, ymm4) // half (xmm1), and nans/infs may slow us vxorpd(ymm5, ymm5, ymm5) // down. 
vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif mov(var(b), rbx) // load address of b. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c label(.DPOSTPFETCH) // done prefetching c #if 1 mov(var(ps_a8), rdx) // load ps_a8 lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a8 lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; // use rcx, rdx for prefetching lines // from next upanel of a. #else lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. 
lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r9, 2, 
5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, rcx, 1, 5*8)) lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.DLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(xmm0, xmm7, xmm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(xmm0, xmm9, xmm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(xmm0, xmm11, xmm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(xmm0, xmm13, xmm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(xmm0, xmm15, xmm15) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm13) vmovupd(xmm13, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm15) vmovupd(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORED) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rdx ), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2) vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rdx )) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-5 vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), ymm3, ymm5) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) vmovupd(ymm5, mem(rcx )) vmovupd(ymm7, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vfmadd231pd(mem(rdx ), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) vmovupd(xmm0, mem(rdx )) vmovupd(xmm1, mem(rdx, rsi, 1)) //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(xmm13, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm14, mem(rcx, 0*32)) vmovupd(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rdx )) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-5 vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vmovupd(ymm5, mem(rcx )) vmovupd(ymm7, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vmovupd(xmm0, mem(rdx )) vmovupd(xmm1, mem(rdx, rsi, 1)) //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c //lea(mem(r14, r8, 4), r14) // //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a mov(var(ps_a8), rax) // load ps_a8 lea(mem(r14, rax, 1), r14) // a_ii = 
r14 += ps_a8 dec(r11) // ii -= 1; jne(.DLOOP6X8I) // iterate again if ii != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [ps_a8] "m" (ps_a8), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( m_left ) { const dim_t nr_cur = 6; const dim_t i_edge = m0 - ( dim_t )m_left; double* restrict cij = c + i_edge*rs_c; //double* restrict ai = a + i_edge*rs_a; //double* restrict ai = a + ( i_edge / 6 ) * ps_a; double* restrict ai = a + m_iter * ps_a; double* restrict bj = b; #if 0 // We add special handling for slightly inflated MR blocksizes // at edge cases, up to a maximum of 9. 
if ( 6 < m_left ) { dgemmsup_ker_ft ker_fp1 = NULL; dgemmsup_ker_ft ker_fp2 = NULL; dim_t mr1, mr2; if ( m_left == 7 ) { mr1 = 4; mr2 = 3; ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x6; ker_fp2 = bli_dgemmsup_rv_haswell_asm_3x6; } else if ( m_left == 8 ) { mr1 = 4; mr2 = 4; ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x6; ker_fp2 = bli_dgemmsup_rv_haswell_asm_4x6; } else // if ( m_left == 9 ) { mr1 = 4; mr2 = 5; ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x6; ker_fp2 = bli_dgemmsup_rv_haswell_asm_5x6; } ker_fp1 ( conja, conjb, mr1, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr1*rs_c0; ai += mr1*rs_a0; ker_fp2 ( conja, conjb, mr2, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } #endif dgemmsup_ker_ft ker_fps[6] = { NULL, bli_dgemmsup_rv_haswell_asm_1x6, bli_dgemmsup_rv_haswell_asm_2x6, bli_dgemmsup_rv_haswell_asm_3x6, bli_dgemmsup_rv_haswell_asm_4x6, bli_dgemmsup_rv_haswell_asm_5x6 }; dgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } } void bli_dgemmsup_rv_haswell_asm_6x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_iter = m0 / 6; uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of A and convert it to units of bytes. uint64_t ps_a = bli_auxinfo_ps_a( data ); uint64_t ps_a8 = ps_a * sizeof( double ); if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.DLOOP6X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm14, ymm14, ymm14) #endif mov(var(b), rbx) // load address of b. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rax) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 3*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 3*8)) // prefetch c + 5*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c #if 1 mov(var(ps_a8), rdx) // load ps_a8 lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a8 lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; // use rcx, rdx for prefetching lines // from next upanel of a. #else lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) // ---------------------------------- iteration 3 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, rcx, 1, 5*8)) lea(mem(rdx, r9, 4), 
rdx) // a_prefetch += 4*cs_a; #endif vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm14, ymm14) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORED) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rdx ), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2) vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rdx )) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) vmovupd(ymm12, mem(rcx, 0*32)) add(rdi, rcx) vmovupd(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rdx )) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c //lea(mem(r14, r8, 4), r14) // //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a mov(var(ps_a8), rax) // load ps_a8 lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a8 dec(r11) // ii -= 1; jne(.DLOOP6X4I) // iterate again if ii != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [ps_a8] "m" (ps_a8), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left )
	{
		const dim_t      nr_cur = 4;
		const dim_t      i_edge = m0 - ( dim_t )m_left;

		double* restrict cij = c + i_edge*rs_c;
		//double* restrict ai  = a + i_edge*rs_a;
		//double* restrict ai  = a + ( i_edge / 6 ) * ps_a;
		double* restrict ai  = a + m_iter * ps_a;
		double* restrict bj  = b;

#if 0
		// We add special handling for slightly inflated MR blocksizes
		// at edge cases, up to a maximum of 9.
		if ( 6 < m_left )
		{
			dgemmsup_ker_ft  ker_fp1 = NULL;
			dgemmsup_ker_ft  ker_fp2 = NULL;
			dim_t            mr1, mr2;

			if ( m_left == 7 )
			{
				mr1 = 4; mr2 = 3;
				ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x4;
				ker_fp2 = bli_dgemmsup_rv_haswell_asm_3x4;
			}
			else if ( m_left == 8 )
			{
				mr1 = 4; mr2 = 4;
				ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x4;
				ker_fp2 = bli_dgemmsup_rv_haswell_asm_4x4;
			}
			else // if ( m_left == 9 )
			{
				mr1 = 4; mr2 = 5;
				ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x4;
				ker_fp2 = bli_dgemmsup_rv_haswell_asm_5x4;
			}

			ker_fp1
			(
			  conja, conjb, mr1, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += mr1*rs_c0; ai += mr1*rs_a0;

			ker_fp2
			(
			  conja, conjb, mr2, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);

			return;
		}
#endif

		dgemmsup_ker_ft ker_fps[6] =
		{
		  NULL,
		  bli_dgemmsup_rv_haswell_asm_1x4,
		  bli_dgemmsup_rv_haswell_asm_2x4,
		  bli_dgemmsup_rv_haswell_asm_3x4,
		  bli_dgemmsup_rv_haswell_asm_4x4,
		  bli_dgemmsup_rv_haswell_asm_5x4
		};

		dgemmsup_ker_ft ker_fp = ker_fps[ m_left ];

		ker_fp
		(
		  conja, conjb, m_left, nr_cur, k0,
		  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
		  beta, cij, rs_c0, cs_c0, data, cntx
		);

		return;
	}
}

/*
   bli_dgemmsup_rv_haswell_asm_6x2m

   Double-precision kernel that updates an m0 x 2 block of C, six rows at a
   time, using hand-written x86-64 assembly.

   For every group of six rows the assembly below:
     - zeroes six accumulators (xmm4, xmm6, xmm8, xmm10, xmm12, xmm14), one
       per row of the 6x2 microtile;
     - for each of the k0 steps (k0/4 unrolled iterations plus k0%4 cleanup
       iterations), loads two contiguous doubles from the current row of B,
       broadcasts one element of A per row, and accumulates the products;
     - scales the accumulators by alpha;
     - if beta compares unequal to zero, adds beta times the existing
       values of C; otherwise C is written without being read;
     - stores the microtile through the row-stored path, or through the
       column-stored path (in-register transpose) when rs_c == 1;
     - advances C by 6*rs_c and A by the panel stride of A.

   The m0 % 6 leftover rows, if any, are forwarded to one of the
   bli_dgemmsup_rv_haswell_asm_{1,2,3,4,5}x2 kernels.

   Parameters:
     conja, conjb    - not read here; only forwarded to the edge-case kernel.
     m0              - number of rows to process.
     n0              - not read in this function; the edge-case kernel is
                       called with a fixed column count of 2 (nr_cur).
     k0              - length of the inner (k) dimension.
     alpha, beta     - pointers to the two scalars.
     a, rs_a0, cs_a0 - matrix A and its row/column strides, in elements
                       (the assembly scales them by sizeof(double)).
     b, rs_b0, cs_b0 - matrix B and its strides, in elements. The assembly
                       never loads cs_b: the two elements of each row of B
                       are read with a single contiguous vector load.
     c, rs_c0, cs_c0 - matrix C and its strides, in elements.
     data            - auxinfo object; supplies the panel stride of A via
                       bli_auxinfo_ps_a() and is forwarded to the edge-case
                       kernel.
     cntx            - not read here; only forwarded to the edge-case kernel.
*/
void bli_dgemmsup_rv_haswell_asm_6x2m
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;          // number of 4x-unrolled k iterations
	uint64_t k_left = k0 % 4;          // remaining k iterations
	uint64_t m_iter = m0 / 6;          // number of full six-row groups
	uint64_t m_left = m0 % 6;          // leftover rows (handled after the asm)

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// Query the panel stride of A and convert it to units of bytes.
	uint64_t ps_a   = bli_auxinfo_ps_a( data );
	uint64_t ps_a8  = ps_a * sizeof( double );

	// Fewer than six rows: skip the assembly entirely and let the
	// edge-case code below handle all of them.
	if ( m_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), r14)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	//mov(var(b), rbx)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)                // load cs_b
	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

	                                   // NOTE: We cannot pre-load elements of a or b
	                                   // because it could eventually, in the last
	                                   // unrolled iter or the cleanup loop, result
	                                   // in reading beyond the bounds of allocated
	                                   // memory (the likely result: a segmentation
	                                   // fault).

	mov(var(c), r12)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

	// During preamble and loops:
	// r12 = rcx = c
	// r14 = rax = a
	// read rbx from var(b) near beginning of loop
	// r11 = m dim index ii

	mov(var(m_iter), r11)              // ii = m_iter;

	label(.DLOOP6X2I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]

	// Clear the six accumulators. Each one holds one row (two doubles)
	// of the current 6x2 microtile:
	//   xmm4 = row 0, xmm6 = row 1, xmm8 = row 2,
	//   xmm10 = row 3, xmm12 = row 4, xmm14 = row 5.
#if 0
	vzeroall()                         // zero all xmm/ymm registers.
#else
	                                   // skylake can execute 3 vxorpd ipc with
	                                   // a latency of 1 cycle, while vzeroall
	                                   // has a latency of 12 cycles.
	vxorpd(xmm4,  xmm4,  xmm4)
	vxorpd(xmm6,  xmm6,  xmm6)
	vxorpd(xmm8,  xmm8,  xmm8)
	vxorpd(xmm10, xmm10, xmm10)
	vxorpd(xmm12, xmm12, xmm12)
	vxorpd(xmm14, xmm14, xmm14)
#endif

	mov(var(b), rbx)                   // load address of b.
	//mov(r12, rcx)                      // reset rcx to current utile of c.
	mov(r14, rax)                      // rax = a_ii (current six-row group of a).

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLPFETCH)                    // jump to column storage case
	label(.DROWPFETCH)                 // row-stored prefetching on c

	lea(mem(r12, rdi, 2), rdx)         //
	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(r12,         1*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(r12, rdi, 1, 1*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(r12, rdi, 2, 1*8)) // prefetch c + 2*rs_c
	prefetch(0, mem(rdx,         1*8)) // prefetch c + 3*rs_c
	prefetch(0, mem(rdx, rdi, 1, 1*8)) // prefetch c + 4*rs_c
	prefetch(0, mem(rdx, rdi, 2, 1*8)) // prefetch c + 5*rs_c

	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
	label(.DCOLPFETCH)                 // column-stored prefetching c

	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
	prefetch(0, mem(r12,         5*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c

	label(.DPOSTPFETCH)                // done prefetching c

#if 1
	mov(var(ps_a8), rdx)               // load ps_a8
	lea(mem(rax, rdx, 1), rdx)         // rdx = a + ps_a8
	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
	                                   // use rcx, rdx for prefetching lines
	                                   // from next upanel of a.
#else
	lea(mem(rax, r8,  4), rdx)         // use rdx for prefetching lines
	lea(mem(rdx, r8,  2), rdx)         // from next upanel of a.
	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
#endif

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.

	label(.DLOOPKITER)                 // MAIN LOOP

	// Each of the four unrolled iterations below performs one k step:
	// load one row of b (two doubles) into xmm0, then for each of the
	// six rows broadcast a single element of a and accumulate.

	// ---------------------------------- iteration 0

	// NOTE(review): both branches of this #if are identical.
#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, 5*8))
#endif

	vmovupd(mem(rbx, 0*32), xmm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm4)
	vfmadd231pd(xmm0, xmm3, xmm6)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm8)
	vfmadd231pd(xmm0, xmm3, xmm10)

	vbroadcastsd(mem(rax, r8,  4), ymm2)
	vbroadcastsd(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(xmm0, xmm2, xmm12)
	vfmadd231pd(xmm0, xmm3, xmm14)

	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

	vmovupd(mem(rbx, 0*32), xmm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm4)
	vfmadd231pd(xmm0, xmm3, xmm6)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm8)
	vfmadd231pd(xmm0, xmm3, xmm10)

	vbroadcastsd(mem(rax, r8,  4), ymm2)
	vbroadcastsd(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(xmm0, xmm2, xmm12)
	vfmadd231pd(xmm0, xmm3, xmm14)

	// ---------------------------------- iteration 2

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

	vmovupd(mem(rbx, 0*32), xmm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm4)
	vfmadd231pd(xmm0, xmm3, xmm6)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm8)
	vfmadd231pd(xmm0, xmm3, xmm10)

	vbroadcastsd(mem(rax, r8,  4), ymm2)
	vbroadcastsd(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(xmm0, xmm2, xmm12)
	vfmadd231pd(xmm0, xmm3, xmm14)

	// ---------------------------------- iteration 3

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, rcx, 1, 5*8))
	lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

	vmovupd(mem(rbx, 0*32), xmm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm4)
	vfmadd231pd(xmm0, xmm3, xmm6)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm8)
	vfmadd231pd(xmm0, xmm3, xmm10)

	vbroadcastsd(mem(rax, r8,  4), ymm2)
	vbroadcastsd(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(xmm0, xmm2, xmm12)
	vfmadd231pd(xmm0, xmm3, xmm14)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER)                   // iterate again if i != 0.

	label(.DCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.

	label(.DLOOPKLEFT)                 // EDGE LOOP

#if 1
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)                       // a_prefetch += cs_a;
#endif

	vmovupd(mem(rbx, 0*32), xmm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm4)
	vfmadd231pd(xmm0, xmm3, xmm6)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm8)
	vfmadd231pd(xmm0, xmm3, xmm10)

	vbroadcastsd(mem(rax, r8,  4), ymm2)
	vbroadcastsd(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(xmm0, xmm2, xmm12)
	vfmadd231pd(xmm0, xmm3, xmm14)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKLEFT)                   // iterate again if i != 0.

	label(.DPOSTACCUM)

	mov(r12, rcx)                      // reset rcx to current utile of c.
	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
	vmulpd(xmm0, xmm6, xmm6)
	vmulpd(xmm0, xmm8, xmm8)
	vmulpd(xmm0, xmm10, xmm10)
	vmulpd(xmm0, xmm12, xmm12)
	vmulpd(xmm0, xmm14, xmm14)

	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

	//lea(mem(rcx, rsi, 4), rdx)         // load address of c + 4*cs_c;
	lea(mem(rcx, rdi, 4), rdx)         // load address of c + 4*rs_c;
	                                   // (used by the column-stored paths
	                                   // for rows 4 and 5.)

	//lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

	                                   // now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORED)                    // jump to column storage case

	label(.DROWSTORED)

	// For each row: acc += beta * c(row, 0:1), then store the row and
	// step rcx to the next row of c.
	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4)
	vmovupd(xmm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6)
	vmovupd(xmm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8)
	vmovupd(xmm8, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10)
	vmovupd(xmm10, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm12)
	vmovupd(xmm12, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), xmm3, xmm14)
	vmovupd(xmm14, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DCOLSTORED)

	                                   // begin I/O on columns 0-1

	// Transpose rows 0-3 in registers so that each ymm register holds
	// four consecutive rows of a single column.
	vunpcklpd(xmm6, xmm4, xmm0)        // column-0 elements of rows 0 and 1
	vunpckhpd(xmm6, xmm4, xmm1)        // column-1 elements of rows 0 and 1
	vunpcklpd(xmm10, xmm8, xmm2)       // column-0 elements of rows 2 and 3
	vunpckhpd(xmm10, xmm8, xmm3)       // column-1 elements of rows 2 and 3
	                                   // (this overwrites beta in xmm3.)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4) // ymm4 = column 0, rows 0-3
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6) // ymm6 = column 1, rows 0-3

	vbroadcastsd(mem(rbx), ymm3)       // reload beta and duplicate

	vfmadd231pd(mem(rcx        ), ymm3, ymm4)
	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
	vmovupd(ymm4, mem(rcx        ))
	vmovupd(ymm6, mem(rcx, rsi, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	// Rows 4-5: rdx = c + 4*rs_c, one 128-bit vector per column.
	vunpcklpd(xmm14, xmm12, xmm0)      // column-0 elements of rows 4 and 5
	vunpckhpd(xmm14, xmm12, xmm1)      // column-1 elements of rows 4 and 5

	vfmadd231pd(mem(rdx        ), xmm3, xmm0)
	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
	vmovupd(xmm0, mem(rdx        ))
	vmovupd(xmm1, mem(rdx, rsi, 1))

	//lea(mem(rdx, rsi, 4), rdx)

	jmp(.DDONE)                        // jump to end.

	label(.DBETAZERO)

	// beta == 0: store the alpha-scaled accumulators without reading c.

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORBZ)                    // jump to column storage case

	label(.DROWSTORBZ)

	vmovupd(xmm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovupd(xmm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovupd(xmm8, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovupd(xmm10, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovupd(xmm12, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovupd(xmm14, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DCOLSTORBZ)

	                                   // begin I/O on columns 0-1

	// Same in-register transpose as in .DCOLSTORED, minus the beta update.
	vunpcklpd(xmm6, xmm4, xmm0)
	vunpckhpd(xmm6, xmm4, xmm1)
	vunpcklpd(xmm10, xmm8, xmm2)
	vunpckhpd(xmm10, xmm8, xmm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)

	vmovupd(ymm4, mem(rcx        ))
	vmovupd(ymm6, mem(rcx, rsi, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	vunpcklpd(xmm14, xmm12, xmm0)
	vunpckhpd(xmm14, xmm12, xmm1)

	vmovupd(xmm0, mem(rdx        ))
	vmovupd(xmm1, mem(rdx, rsi, 1))

	//lea(mem(rdx, rsi, 4), rdx)

	label(.DDONE)

	lea(mem(r12, rdi, 4), r12)         //
	lea(mem(r12, rdi, 2), r12)         // c_ii = r12 += 6*rs_c

	//lea(mem(r14, r8,  4), r14)         //
	//lea(mem(r14, r8,  2), r14)         // a_ii = r14 += 6*rs_a
	mov(var(ps_a8), rax)               // load ps_a8
	lea(mem(r14, rax, 1), r14)         // a_ii = r14 += ps_a8

	dec(r11)                           // ii -= 1;
	jne(.DLOOP6X2I)                    // iterate again if ii != 0.

	label(.DRETURN)                    // not targeted by any jump in this kernel.

	end_asm(
	: // output operands (none)
	: // input operands
	  // NOTE(review): cs_b is listed here, but every instruction that
	  // would load it is commented out above.
      [m_iter] "m" (m_iter),
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a]      "m" (a),
      [rs_a]   "m" (rs_a),
      [cs_a]   "m" (cs_a),
      [ps_a8]  "m" (ps_a8),
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
      [alpha]  "m" (alpha),
      [beta]   "m" (beta),
      [c]      "m" (c),
      [rs_c]   "m" (rs_c),
      [cs_c]   "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
	: // register clobber list
	  // NOTE(review): rbp and xmm5/7/9/11/13/15 are listed although the
	  // assembly above does not reference them.
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	consider_edge_cases:

	// Handle edge cases in the m dimension, if they exist.
	if ( m_left )
	{
		const dim_t      nr_cur = 2;    // this kernel always produces two columns.
		const dim_t      i_edge = m0 - ( dim_t )m_left; // first leftover row.

		// Advance c by the rows already written, and a by the number of
		// full six-row panels already consumed.
		double* restrict cij = c + i_edge*rs_c;
		//double* restrict ai  = a + i_edge*rs_a;
		//double* restrict ai  = a + ( i_edge / 6 ) * ps_a;
		double* restrict ai  = a + m_iter * ps_a;
		double* restrict bj  = b;

#if 0
		// We add special handling for slightly inflated MR blocksizes
		// at edge cases, up to a maximum of 9.
		if ( 6 < m_left )
		{
			dgemmsup_ker_ft  ker_fp1 = NULL;
			dgemmsup_ker_ft  ker_fp2 = NULL;
			dim_t            mr1, mr2;

			if ( m_left == 7 )
			{
				mr1 = 4; mr2 = 3;
				ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x2;
				ker_fp2 = bli_dgemmsup_rv_haswell_asm_3x2;
			}
			else if ( m_left == 8 )
			{
				mr1 = 4; mr2 = 4;
				ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x2;
				ker_fp2 = bli_dgemmsup_rv_haswell_asm_4x2;
			}
			else // if ( m_left == 9 )
			{
				mr1 = 4; mr2 = 5;
				ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x2;
				ker_fp2 = bli_dgemmsup_rv_haswell_asm_5x2;
			}

			ker_fp1
			(
			  conja, conjb, mr1, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += mr1*rs_c0; ai += mr1*rs_a0;

			ker_fp2
			(
			  conja, conjb, mr2, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);

			return;
		}
#endif

		// Dispatch table indexed by the number of leftover rows. Entry 0
		// is never used because this branch requires m_left != 0.
		dgemmsup_ker_ft ker_fps[6] =
		{
		  NULL,
		  bli_dgemmsup_rv_haswell_asm_1x2,
		  bli_dgemmsup_rv_haswell_asm_2x2,
		  bli_dgemmsup_rv_haswell_asm_3x2,
		  bli_dgemmsup_rv_haswell_asm_4x2,
		  bli_dgemmsup_rv_haswell_asm_5x2
		};

		dgemmsup_ker_ft ker_fp = ker_fps[ m_left ];

		ker_fp
		(
		  conja, conjb, m_left, nr_cur, k0,
		  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
		  beta, cij, rs_c0, cs_c0, data, cntx
		);

		return;
	}
}

cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c000066400000000000000000003363321427272030600305070ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. 
And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... -------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */ // Prototype reference microkernels. GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) void bli_dgemmsup_rv_haswell_asm_6x8n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t m_left = m0 % 6; // First check whether this is a edge case in the m dimension. If so, // dispatch other ?x8m kernels, as needed. if ( m_left ) { double* restrict cij = c; double* restrict bj = b; double* restrict ai = a; #if 1 // We add special handling for slightly inflated MR blocksizes // at edge cases, up to a maximum of 9. 
if ( 6 < m0 ) { dgemmsup_ker_ft ker_fp1 = NULL; dgemmsup_ker_ft ker_fp2 = NULL; dim_t mr1, mr2; if ( m0 == 7 ) { mr1 = 4; mr2 = 3; ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x8n; ker_fp2 = bli_dgemmsup_rv_haswell_asm_3x8n; } else if ( m0 == 8 ) { mr1 = 4; mr2 = 4; ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x8n; ker_fp2 = bli_dgemmsup_rv_haswell_asm_4x8n; } else // if ( m0 == 9 ) { mr1 = 4; mr2 = 5; ker_fp1 = bli_dgemmsup_rv_haswell_asm_4x8n; ker_fp2 = bli_dgemmsup_rv_haswell_asm_5x8n; } ker_fp1 ( conja, conjb, mr1, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr1*rs_c0; ai += mr1*rs_a0; ker_fp2 ( conja, conjb, mr2, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } #endif dgemmsup_ker_ft ker_fps[6] = { NULL, bli_dgemmsup_rv_haswell_asm_1x8n, bli_dgemmsup_rv_haswell_asm_2x8n, bli_dgemmsup_rv_haswell_asm_3x8n, bli_dgemmsup_rv_haswell_asm_4x8n, bli_dgemmsup_rv_haswell_asm_5x8n }; dgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t n_iter = n0 / 8; uint64_t n_left = n0 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of B and convert it to units of bytes. uint64_t ps_b = bli_auxinfo_ps_b( data ); uint64_t ps_b8 = ps_b * sizeof( double ); if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // During preamble and loops: // r12 = rcx = c // r14 = rbx = b // read rax from var(a) near beginning of loop // r11 = m dim index ii mov(var(n_iter), r11) // jj = n_iter; label(.DLOOP6X8J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif mov(var(a), rax) // load address of a. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rbx) // reset rbx to current upanel of b. cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c #if 1 mov(var(ps_b8), rdx) // load ps_b8 lea(mem(rbx, rdx, 1), rdx) // rdx = b + ps_b8 lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; // use rcx, rdx for prefetching lines // from next upanel of b. #else lea(mem(rbx, r8, 8), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 1, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) 
vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, rcx, 1, 5*8)) lea(mem(rdx, r10, 4), rdx) // b_prefetch += 4*rs_b; #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.DLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r10, rdx) // b_prefetch += rs_b; #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm11, ymm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(ymm0, ymm15, ymm15) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm15) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORED) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rdx ), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2) vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rdx )) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-7 vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), ymm3, ymm5) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11) vmovupd(ymm5, mem(rcx )) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rdx ), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2) 
vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rdx )) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm14, mem(rcx, 0*32)) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rdx )) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-7 vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vmovupd(ymm5, mem(rcx )) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, rax, 
1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rdx )) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c //add(imm(8*8), r14) // b_jj = r14 += 8*cs_b mov(var(ps_b8), rbx) // load ps_b8 lea(mem(r14, rbx, 1), r14) // b_jj = r14 += ps_b8 dec(r11) // jj -= 1; jne(.DLOOP6X8J) // iterate again if jj != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [ps_b8] "m" (ps_b8), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( n_left )
	{
		const dim_t mr_cur = 6;
		const dim_t j_edge = n0 - ( dim_t )n_left;

		double* restrict cij = c + j_edge*cs_c;
		double* restrict ai  = a;
		//double* restrict bj  = b + j_edge*cs_b;
		//double* restrict bj  = b + ( j_edge / 8 ) * ps_b;
		double* restrict bj  = b + n_iter * ps_b;

		if ( 6 <= n_left )
		{
			const dim_t nr_cur = 6;

			bli_dgemmsup_rv_haswell_asm_6x6
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 4 <= n_left )
		{
			const dim_t nr_cur = 4;

			bli_dgemmsup_rv_haswell_asm_6x4
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 2 <= n_left )
		{
			const dim_t nr_cur = 2;

			bli_dgemmsup_rv_haswell_asm_6x2
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 1 == n_left )
		{
#if 1
			const dim_t nr_cur = 1;

			bli_dgemmsup_r_haswell_ref_6x1
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
#else
			bli_dgemv_ex
			(
			  BLIS_NO_TRANSPOSE, conjb, m0, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
			  beta, cij, rs_c0, cntx, NULL
			);
#endif
		}
	}
}

/*
 * bli_dgemmsup_rv_haswell_asm_5x8n
 *
 * Double-precision kernel that updates a strip of C that is 5 rows tall and
 * n0 columns wide:
 *
 *     C := beta * C + alpha * A * B
 *
 * The inline assembly walks the n dimension in panels of 8 columns
 * (n0 / 8 iterations). For each panel it accumulates a 5x8 tile in
 * ymm4..ymm13 (two 4-double vectors per row) over k0 rank-1 updates, with
 * the k loop unrolled by 4 plus a k0 % 4 cleanup loop. The n0 % 8 leftover
 * columns are handled afterwards in C by dispatching to the 5x6, 5x4, 5x2
 * and reference 5x1 kernels.
 *
 * Parameters:
 *   conja, conjb   Not read by the assembly; forwarded to the edge-case
 *                  kernels.
 *   m0             Only referenced in the disabled (#else) dgemv branch;
 *                  this kernel always processes 5 rows.
 *   n0             Number of columns of C / B to process.
 *   k0             Inner (shared) dimension.
 *   alpha, beta    Pointers to the scalars. When *beta compares equal to
 *                  zero, C is written without being read.
 *   a, rs_a0,      A and its row/column strides in elements. Five rows are
 *   cs_a0          read per k step (a + i*rs_a, i = 0..4); a advances by
 *                  cs_a per k step.
 *   b, rs_b0,      B and its strides in elements. The assembly loads 8
 *   cs_b0          contiguous doubles per k step and advances by rs_b; it
 *                  does not read cs_b (cs_b0 is used only to step between
 *                  edge-case kernels).
 *   c, rs_c0,      C and its strides in elements. rs_c == 1 selects the
 *   cs_c0          column-stored I/O path; every other value takes the
 *                  row-stored path, which stores 8 contiguous doubles per
 *                  row (presumably callers guarantee cs_c == 1 there --
 *                  TODO confirm against the dispatch code).
 *   data           Auxiliary info; supplies the panel stride of B via
 *                  bli_auxinfo_ps_b() and is forwarded to edge kernels.
 *   cntx           Forwarded unchanged to the edge-case kernels.
 */
void bli_dgemmsup_rv_haswell_asm_5x8n
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;   // number of 4x-unrolled k iterations
	uint64_t k_left = k0 % 4;   // leftover k iterations
	uint64_t n_iter = n0 / 8;   // number of full 8-column panels
	uint64_t n_left = n0 % 8;   // leftover columns (handled in C below)

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// Query the panel stride of B and convert it to units of bytes.
	uint64_t ps_b   = bli_auxinfo_ps_b( data );
	uint64_t ps_b8  = ps_b * sizeof( double );

	// With fewer than 8 columns there is no full panel; skip the assembly
	// entirely and go straight to the edge-case kernels.
	if ( n_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()                         // zero all xmm/ymm registers.

	//mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), r14)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)                // load cs_b
	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), r12)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

	// During preamble and loops:
	// r12 = rcx = c
	// r14 = rbx = b
	// read rax from var(a) near beginning of loop
	// r11 = n dim index jj (counts remaining 8-column panels)

	mov(var(n_iter), r11)              // jj = n_iter;

	// NOTE: the label below is named 6X8 although this kernel computes
	// 5 rows; the label and the matching jne at the bottom agree.
	label(.DLOOP6X8J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]

#if 0
	vzeroall()                         // zero all xmm/ymm registers.
#else
	                                   // skylake can execute 3 vxorpd ipc with
	                                   // a latency of 1 cycle, while vzeroall
	                                   // has a latency of 12 cycles.
	// Clear the ten accumulators: row i of the 5x8 tile lives in
	// ymm(4+2i) (columns 0-3) and ymm(5+2i) (columns 4-7).
	vxorpd(ymm4,  ymm4,  ymm4)
	vxorpd(ymm5,  ymm5,  ymm5)
	vxorpd(ymm6,  ymm6,  ymm6)
	vxorpd(ymm7,  ymm7,  ymm7)
	vxorpd(ymm8,  ymm8,  ymm8)
	vxorpd(ymm9,  ymm9,  ymm9)
	vxorpd(ymm10, ymm10, ymm10)
	vxorpd(ymm11, ymm11, ymm11)
	vxorpd(ymm12, ymm12, ymm12)
	vxorpd(ymm13, ymm13, ymm13)
#endif

	mov(var(a), rax)                   // load address of a.

	//mov(r12, rcx)                      // reset rcx to current utile of c.
	mov(r14, rbx)                      // reset rbx to current upanel of b.

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLPFETCH)                    // jump to column storage case
	label(.DROWPFETCH)                 // row-stored prefetching on c

	lea(mem(r12, rdi, 2), rdx)         // rdx = c + 2*rs_c;
	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(r12,         7*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(r12, rdi, 1, 7*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(r12, rdi, 2, 7*8)) // prefetch c + 2*rs_c
	prefetch(0, mem(rdx,         7*8)) // prefetch c + 3*rs_c
	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c

	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
	label(.DCOLPFETCH)                 // column-stored prefetching c

	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
	lea(mem(r12, rsi, 2), rdx)         // rdx = c + 2*cs_c;
	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
	prefetch(0, mem(r12,         4*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(r12, rsi, 1, 4*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(r12, rsi, 2, 4*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rdx,         4*8)) // prefetch c + 3*cs_c
	prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 4*cs_c
	prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 5*cs_c
	lea(mem(rdx, rsi, 2), rdx)         // rdx = c + 5*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 6*cs_c
	prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 7*cs_c

	label(.DPOSTPFETCH)                // done prefetching c

#if 1
	mov(var(ps_b8), rdx)               // load ps_b8
	lea(mem(rbx, rdx, 1), rdx)         // rdx = b + ps_b8
	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
	                                   // use rcx, rdx for prefetching lines
	                                   // from next upanel of b.
#else
	lea(mem(rbx, r8,  8), rdx)         // use rdx for prefetching lines
	lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
#endif

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.

	label(.DLOOPKITER)                 // MAIN LOOP

	// Each unrolled iteration performs one rank-1 update of the 5x8 tile:
	// load one 8-double row of b into ymm0/ymm1, broadcast the five
	// elements a[0..4] of the current column of a (two at a time via
	// ymm2/ymm3), and FMA into the matching accumulator pair.

	// ---------------------------------- iteration 0

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, 5*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vbroadcastsd(mem(rax, r8,  4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm12)
	vfmadd231pd(ymm1, ymm2, ymm13)

	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, r10, 1, 5*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vbroadcastsd(mem(rax, r8,  4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm12)
	vfmadd231pd(ymm1, ymm2, ymm13)

	// ---------------------------------- iteration 2

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, r10, 2, 5*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vbroadcastsd(mem(rax, r8,  4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm12)
	vfmadd231pd(ymm1, ymm2, ymm13)

	// ---------------------------------- iteration 3

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, rcx, 1, 5*8))
	lea(mem(rdx, r10, 4), rdx)         // b_prefetch += 4*rs_b;
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vbroadcastsd(mem(rax, r8,  4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm12)
	vfmadd231pd(ymm1, ymm2, ymm13)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER)                   // iterate again if i != 0.

	label(.DCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.

	label(.DLOOPKLEFT)                 // EDGE LOOP

	// Same rank-1 update as the main loop, one k step per iteration.

#if 1
	prefetch(0, mem(rdx, 5*8))
	add(r10, rdx)                      // b_prefetch += rs_b;
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vbroadcastsd(mem(rax, r8,  4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm12)
	vfmadd231pd(ymm1, ymm2, ymm13)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKLEFT)                   // iterate again if i != 0.

	label(.DPOSTACCUM)

	mov(r12, rcx)                      // reset rcx to current utile of c.
	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
	vmulpd(ymm0, ymm5, ymm5)
	vmulpd(ymm0, ymm6, ymm6)
	vmulpd(ymm0, ymm7, ymm7)
	vmulpd(ymm0, ymm8, ymm8)
	vmulpd(ymm0, ymm9, ymm9)
	vmulpd(ymm0, ymm10, ymm10)
	vmulpd(ymm0, ymm11, ymm11)
	vmulpd(ymm0, ymm12, ymm12)
	vmulpd(ymm0, ymm13, ymm13)

	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

	//lea(mem(rcx, rsi, 4), rdx)         // load address of c + 4*cs_c;
	lea(mem(rcx, rdi, 4), rdx)         // load address of c + 4*rs_c;
	                                   // (row 4 of the tile; used only by
	                                   // the column-stored paths below)

	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

	                                   // now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORED)                    // jump to column storage case

	label(.DROWSTORED)

	// Row-stored C, beta != 0: for each of the 5 rows, compute
	// acc += beta * C[row, 0:8] and store the 8 doubles back.

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
	vmovupd(ymm5, mem(rcx, 1*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
	vmovupd(ymm6, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7)
	vmovupd(ymm7, mem(rcx, 1*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
	vmovupd(ymm8, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9)
	vmovupd(ymm9, mem(rcx, 1*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
	vmovupd(ymm10, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11)
	vmovupd(ymm11, mem(rcx, 1*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
	vmovupd(ymm12, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13)
	vmovupd(ymm13, mem(rcx, 1*32))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DCOLSTORED)

	// Column-stored C, beta != 0. Rows 0-3 of each 4-column half are
	// transposed in registers so that each ymm holds 4 consecutive rows
	// of one column; row 4 is gathered/scattered element by element
	// through rdx.

	                                   // begin I/O on columns 0-3
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vunpcklpd(ymm10, ymm8, ymm2)
	vunpckhpd(ymm10, ymm8, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

	vbroadcastsd(mem(rbx), ymm3)       // reload beta: ymm3 was used as a
	                                   // temporary by the transpose above

	vfmadd231pd(mem(rcx        ), ymm3, ymm4)
	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
	vmovupd(ymm4, mem(rcx        ))
	vmovupd(ymm6, mem(rcx, rsi, 1))
	vmovupd(ymm8, mem(rcx, rsi, 2))
	vmovupd(ymm10, mem(rcx, rax, 1))

	lea(mem(rcx, rsi, 4), rcx)         // rcx += 4*cs_c (columns 4-7)

	// Row 4, columns 0-3: gather the four C elements, combine them into
	// ymm0, compute beta*C + acc (ymm12), then scatter the result back.
	vmovlpd(mem(rdx        ), xmm0, xmm0)
	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
	vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)
	vmovhpd(mem(rdx, rax, 1), xmm1, xmm1)
	vperm2f128(imm(0x20), ymm1, ymm0, ymm0)

	vfmadd213pd(ymm12, ymm3, ymm0)
	vextractf128(imm(1), ymm0, xmm1)
	vmovlpd(xmm0, mem(rdx        ))
	vmovhpd(xmm0, mem(rdx, rsi, 1))
	vmovlpd(xmm1, mem(rdx, rsi, 2))
	vmovhpd(xmm1, mem(rdx, rax, 1))

	lea(mem(rdx, rsi, 4), rdx)         // rdx += 4*cs_c (columns 4-7)

	                                   // begin I/O on columns 4-7
	vunpcklpd(ymm7, ymm5, ymm0)
	vunpckhpd(ymm7, ymm5, ymm1)
	vunpcklpd(ymm11, ymm9, ymm2)
	vunpckhpd(ymm11, ymm9, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)

	vbroadcastsd(mem(rbx), ymm3)       // reload beta (ymm3 clobbered again)

	vfmadd231pd(mem(rcx        ), ymm3, ymm5)
	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7)
	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9)
	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11)
	vmovupd(ymm5, mem(rcx        ))
	vmovupd(ymm7, mem(rcx, rsi, 1))
	vmovupd(ymm9, mem(rcx, rsi, 2))
	vmovupd(ymm11, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	// Row 4, columns 4-7: same gather / FMA / scatter using ymm13.
	vmovlpd(mem(rdx        ), xmm0, xmm0)
	vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
	vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)
	vmovhpd(mem(rdx, rax, 1), xmm1, xmm1)
	vperm2f128(imm(0x20), ymm1, ymm0, ymm0)

	vfmadd213pd(ymm13, ymm3, ymm0)
	vextractf128(imm(1), ymm0, xmm1)
	vmovlpd(xmm0, mem(rdx        ))
	vmovhpd(xmm0, mem(rdx, rsi, 1))
	vmovlpd(xmm1, mem(rdx, rsi, 2))
	vmovhpd(xmm1, mem(rdx, rax, 1))

	//lea(mem(rdx, rsi, 4), rdx)

	jmp(.DDONE)                        // jump to end.

	label(.DBETAZERO)

	// beta == 0: store the alpha-scaled accumulators without reading C.

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORBZ)                    // jump to column storage case

	label(.DROWSTORBZ)

	vmovupd(ymm4, mem(rcx, 0*32))
	vmovupd(ymm5, mem(rcx, 1*32))
	add(rdi, rcx)

	vmovupd(ymm6, mem(rcx, 0*32))
	vmovupd(ymm7, mem(rcx, 1*32))
	add(rdi, rcx)

	vmovupd(ymm8, mem(rcx, 0*32))
	vmovupd(ymm9, mem(rcx, 1*32))
	add(rdi, rcx)

	vmovupd(ymm10, mem(rcx, 0*32))
	vmovupd(ymm11, mem(rcx, 1*32))
	add(rdi, rcx)

	vmovupd(ymm12, mem(rcx, 0*32))
	vmovupd(ymm13, mem(rcx, 1*32))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DCOLSTORBZ)

	// Column-stored C, beta == 0: same register transpose as above for
	// rows 0-3, and an element-wise scatter of row 4, with no loads of C.

	                                   // begin I/O on columns 0-3
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vunpcklpd(ymm10, ymm8, ymm2)
	vunpckhpd(ymm10, ymm8, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

	vmovupd(ymm4, mem(rcx        ))
	vmovupd(ymm6, mem(rcx, rsi, 1))
	vmovupd(ymm8, mem(rcx, rsi, 2))
	vmovupd(ymm10, mem(rcx, rax, 1))

	lea(mem(rcx, rsi, 4), rcx)         // rcx += 4*cs_c (columns 4-7)

	vmovupd(ymm12, ymm0)               // row 4, columns 0-3

	vextractf128(imm(1), ymm0, xmm1)
	vmovlpd(xmm0, mem(rdx        ))
	vmovhpd(xmm0, mem(rdx, rsi, 1))
	vmovlpd(xmm1, mem(rdx, rsi, 2))
	vmovhpd(xmm1, mem(rdx, rax, 1))

	lea(mem(rdx, rsi, 4), rdx)         // rdx += 4*cs_c (columns 4-7)

	                                   // begin I/O on columns 4-7
	vunpcklpd(ymm7, ymm5, ymm0)
	vunpckhpd(ymm7, ymm5, ymm1)
	vunpcklpd(ymm11, ymm9, ymm2)
	vunpckhpd(ymm11, ymm9, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)

	vmovupd(ymm5, mem(rcx        ))
	vmovupd(ymm7, mem(rcx, rsi, 1))
	vmovupd(ymm9, mem(rcx, rsi, 2))
	vmovupd(ymm11, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	vmovupd(ymm13, ymm0)               // row 4, columns 4-7

	vextractf128(imm(1), ymm0, xmm1)
	vmovlpd(xmm0, mem(rdx        ))
	vmovhpd(xmm0, mem(rdx, rsi, 1))
	vmovlpd(xmm1, mem(rdx, rsi, 2))
	vmovhpd(xmm1, mem(rdx, rax, 1))

	//lea(mem(rdx, rsi, 4), rdx)

	label(.DDONE)

	// Advance to the next 8-column panel. rsi still holds
	// cs_c * sizeof(double) on every path that reaches this label.

	lea(mem(r12, rsi, 8), r12)         // c_jj = r12 += 8*cs_c

	//add(imm(8*8), r14)                 // b_jj = r14 += 8*cs_b
	mov(var(ps_b8), rbx)               // load ps_b8
	lea(mem(r14, rbx, 1), r14)         // b_jj = r14 += ps_b8

	dec(r11)                           // jj -= 1;
	jne(.DLOOP6X8J)                    // iterate again if jj != 0.

	label(.DRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [n_iter] "m" (n_iter),
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a]      "m" (a),
	  [rs_a]   "m" (rs_a),
	  [cs_a]   "m" (cs_a),
	  [b]      "m" (b),
	  [rs_b]   "m" (rs_b),
	  [cs_b]   "m" (cs_b),
	  [ps_b8]  "m" (ps_b8),
	  [alpha]  "m" (alpha),
	  [beta]   "m" (beta),
	  [c]      "m" (c),
	  [rs_c]   "m" (rs_c),
	  [cs_c]   "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  // (a superset of what the code above touches: rbp, r15, xmm14 and
	  // xmm15 do not appear in this kernel's instructions)
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	consider_edge_cases:

	// Handle edge cases in the n dimension, if they exist: the n0 % 8
	// leftover columns are covered greedily by the 6-, 4-, 2- and 1-column
	// kernels, advancing cij and bj past the columns each one consumed.
	if ( n_left )
	{
		const dim_t mr_cur = 5;
		const dim_t j_edge = n0 - ( dim_t )n_left;  // first leftover column

		double* restrict cij = c + j_edge*cs_c;
		double* restrict ai  = a;
		//double* restrict bj  = b + j_edge*cs_b;
		//double* restrict bj  = b + ( j_edge / 8 ) * ps_b;
		// Skip the n_iter full panels already consumed by the assembly.
		double* restrict bj  = b + n_iter * ps_b;

		if ( 6 <= n_left )
		{
			const dim_t nr_cur = 6;

			bli_dgemmsup_rv_haswell_asm_5x6
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 4 <= n_left )
		{
			const dim_t nr_cur = 4;

			bli_dgemmsup_rv_haswell_asm_5x4
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 2 <= n_left )
		{
			const dim_t nr_cur = 2;

			bli_dgemmsup_rv_haswell_asm_5x2
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 1 == n_left )
		{
			// The single remaining column goes to the reference 5x1 kernel;
			// the dgemv alternative below is compiled out.
#if 1
			const dim_t nr_cur = 1;

			bli_dgemmsup_r_haswell_ref_5x1
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
#else
			bli_dgemv_ex
			(
			  BLIS_NO_TRANSPOSE, conjb, m0, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
			  beta, cij, rs_c0, cntx, NULL
			);
#endif
		}
	}
}

void bli_dgemmsup_rv_haswell_asm_4x8n
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t n_iter = n0 / 8;
	uint64_t n_left = n0 % 8;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// Query the panel stride of B and convert it to units of bytes.
	uint64_t ps_b   = bli_auxinfo_ps_b( data );
	uint64_t ps_b8  = ps_b * sizeof( double );

	if ( n_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()                         // zero all xmm/ymm registers.

	//mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), r14)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)                // load cs_b
	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).
mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // During preamble and loops: // r12 = rcx = c // r14 = rbx = b // read rax from var(a) near beginning of loop // r11 = m dim index ii mov(var(n_iter), r11) // jj = n_iter; label(.DLOOP4X8J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) #endif mov(var(a), rax) // load address of a. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rbx) // reset rbx to current upanel of b. cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(r12, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 
3*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c #if 1 mov(var(ps_b8), rdx) // load ps_b8 lea(mem(rbx, rdx, 1), rdx) // rdx = b + ps_b8 lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; // use rcx, rdx for prefetching lines // from next upanel of b. #else lea(mem(rbx, r8, 8), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 1, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) // ---------------------------------- iteration 2 #if 0 prefetch(0, 
mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) // ---------------------------------- iteration 3 #if 1 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, rcx, 1, 5*8)) lea(mem(rdx, r10, 4), rdx) // b_prefetch += 4*rs_b; #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.DLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r10, rdx) // b_prefetch += rs_b; #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm11, ymm11) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORED) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // begin I/O on columns 4-7 vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), ymm3, ymm5) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11) vmovupd(ymm5, mem(rcx )) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, rax, 1)) 
//lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // begin I/O on columns 4-7 vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vmovupd(ymm5, mem(rcx )) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) label(.DDONE) lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c //add(imm(8*8), r14) // b_jj = r14 += 8*cs_b mov(var(ps_b8), rbx) // load ps_b8 lea(mem(r14, rbx, 1), r14) // b_jj = r14 += ps_b8 dec(r11) // jj -= 1; jne(.DLOOP4X8J) // iterate again if jj != 0. 
label(.DRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [ps_b8] "m" (ps_b8), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 4; const dim_t j_edge = n0 - ( dim_t )n_left; double* restrict cij = c + j_edge*cs_c; double* restrict ai = a; //double* restrict bj = b + j_edge*cs_b; //double* restrict bj = b + ( j_edge / 8 ) * ps_b; double* restrict bj = b + n_iter * ps_b; if ( 6 <= n_left ) { const dim_t nr_cur = 6; bli_dgemmsup_rv_haswell_asm_4x6 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_dgemmsup_rv_haswell_asm_4x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_dgemmsup_rv_haswell_asm_4x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { const dim_t nr_cur = 1; bli_dgemmsup_r_haswell_ref_4x1 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); 
} } } void bli_dgemmsup_rv_haswell_asm_3x8n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t n_iter = n0 / 8; uint64_t n_left = n0 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of B and convert it to units of bytes. uint64_t ps_b = bli_auxinfo_ps_b( data ); uint64_t ps_b8 = ps_b * sizeof( double ); if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). 
mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // During preamble and loops: // r12 = rcx = c // r14 = rbx = b // read rax from var(a) near beginning of loop // r11 = m dim index ii mov(var(n_iter), r11) // jj = n_iter; label(.DLOOP4X8J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif mov(var(a), rax) // load address of a. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rbx) // reset rbx to current upanel of b. cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c //lea(mem(r12, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2, 7*8)) // prefetch c + 2*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(r12, 2*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 2*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 2*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c #if 1 mov(var(ps_b8), rdx) // load ps_b8 lea(mem(rbx, rdx, 1), rdx) // rdx = b + ps_b8 lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; // use rcx, rdx for prefetching lines // from next upanel of b. #else lea(mem(rbx, r8, 8), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 1, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) // ---------------------------------- iteration 3 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, rcx, 1, 5*8)) lea(mem(rdx, r10, 4), rdx) // b_prefetch += 4*rs_b; #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, 
ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r10, rdx) // b_prefetch += rs_b; #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm9, ymm9) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORED) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vextractf128(imm(0x1), ymm4, xmm12) vextractf128(imm(0x1), ymm6, xmm13) vextractf128(imm(0x1), ymm8, xmm14) vextractf128(imm(0x1), ymm10, xmm15) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), xmm3, xmm4) vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6) vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm8) vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm10) vmovupd(xmm4, mem(rcx )) vmovupd(xmm6, mem(rcx, rsi, 1)) vmovupd(xmm8, mem(rcx, rsi, 2)) vmovupd(xmm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vfmadd231sd(mem(rdx ), xmm3, xmm12) vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13) vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14) vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15) vmovsd(xmm12, mem(rdx )) vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-7 vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) 
vextractf128(imm(0x1), ymm5, xmm12) vextractf128(imm(0x1), ymm7, xmm13) vextractf128(imm(0x1), ymm9, xmm14) vextractf128(imm(0x1), ymm11, xmm15) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), xmm3, xmm5) vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm7) vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm9) vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm11) vmovupd(xmm5, mem(rcx )) vmovupd(xmm7, mem(rcx, rsi, 1)) vmovupd(xmm9, mem(rcx, rsi, 2)) vmovupd(xmm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vfmadd231sd(mem(rdx ), xmm3, xmm12) vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13) vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14) vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15) vmovsd(xmm12, mem(rdx )) vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vextractf128(imm(0x1), ymm4, xmm12) vextractf128(imm(0x1), ymm6, xmm13) vextractf128(imm(0x1), ymm8, xmm14) vextractf128(imm(0x1), ymm10, xmm15) vmovupd(xmm4, mem(rcx )) vmovupd(xmm6, mem(rcx, rsi, 1)) vmovupd(xmm8, mem(rcx, rsi, 2)) vmovupd(xmm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vmovsd(xmm12, mem(rdx )) vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-7 vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vextractf128(imm(0x1), ymm5, xmm12) vextractf128(imm(0x1), ymm7, xmm13) vextractf128(imm(0x1), ymm9, xmm14) vextractf128(imm(0x1), ymm11, xmm15) vmovupd(xmm5, mem(rcx )) vmovupd(xmm7, mem(rcx, rsi, 1)) vmovupd(xmm9, mem(rcx, rsi, 2)) vmovupd(xmm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vmovsd(xmm12, mem(rdx )) vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c //add(imm(8*8), r14) // b_jj = r14 += 8*cs_b mov(var(ps_b8), rbx) // load ps_b8 lea(mem(r14, rbx, 1), r14) // b_jj = r14 += ps_b8 dec(r11) // jj -= 1; jne(.DLOOP4X8J) // iterate again if jj != 0. 
label(.DRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [ps_b8] "m" (ps_b8), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 3; const dim_t j_edge = n0 - ( dim_t )n_left; double* restrict cij = c + j_edge*cs_c; double* restrict ai = a; //double* restrict bj = b + j_edge*cs_b; //double* restrict bj = b + ( j_edge / 8 ) * ps_b; double* restrict bj = b + n_iter * ps_b; if ( 6 <= n_left ) { const dim_t nr_cur = 6; bli_dgemmsup_rv_haswell_asm_3x6 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_dgemmsup_rv_haswell_asm_3x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_dgemmsup_rv_haswell_asm_3x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { const dim_t nr_cur = 1; bli_dgemmsup_r_haswell_ref_3x1 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); 
} } } // end of bli_dgemmsup_rv_haswell_asm_3x8n

/*
 * Double-precision gemmsup kernel for a 2-row slice of C:
 *
 *     C[0:2, 0:n0] := beta * C[0:2, 0:n0] + alpha * A[0:2, 0:k0] * B[0:k0, 0:n0]
 *
 * The inline-asm section handles the n0 / 8 full panels of 8 columns. Any
 * n0 % 8 leftover columns are forwarded to the 2x6, 2x4, 2x2 and 2x1
 * kernels at the bottom of the function.
 *
 * Parameters, as used by the code below:
 *   conja, conjb     not read here; only forwarded to the edge-case kernels.
 *   m0               not read; this kernel always processes exactly 2 rows.
 *   n0, k0           number of columns of C/B and the inner (k) dimension.
 *   alpha, beta      pointers to the scalars. When beta compares equal to
 *                    0.0, C is stored without being loaded first.
 *   a, rs_a0, cs_a0  A and its row/column strides, in elements.
 *   b, rs_b0, cs_b0  B and its strides, in elements. The asm loads each row
 *                    of a panel of B as 8 contiguous doubles and steps by
 *                    rs_b; cs_b0 is only used by the edge-case pointer math.
 *   c, rs_c0, cs_c0  C and its strides, in elements. rs_c0 == 1 selects the
 *                    column-stored paths; any other value selects the
 *                    row-stored paths, which write each row of the tile as
 *                    8 contiguous doubles.
 *   data             supplies the panel stride of B (bli_auxinfo_ps_b) and
 *                    is forwarded to the edge-case kernels.
 *   cntx             forwarded to the edge-case kernels.
 */
void bli_dgemmsup_rv_haswell_asm_2x8n
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.

	// The k loop is unrolled by 4; k_left counts the remaining iterations.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	// n is processed in panels of 8 columns; n_left columns go to the
	// edge-case kernels.
	uint64_t n_iter = n0 / 8;
	uint64_t n_left = n0 % 8;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// Query the panel stride of B and convert it to units of bytes.
	uint64_t ps_b   = bli_auxinfo_ps_b( data );
	uint64_t ps_b8  = ps_b * sizeof( double );

	// Fewer than 8 columns: nothing for the asm loop to do.
	if ( n_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()                         // zero all xmm/ymm registers.

	//mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), r14)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)                // load cs_b
	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

	                                   // NOTE: We cannot pre-load elements of a or b
	                                   // because it could eventually, in the last
	                                   // unrolled iter or the cleanup loop, result
	                                   // in reading beyond the bounds allocated mem
	                                   // (the likely result: a segmentation fault).

	mov(var(c), r12)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

	// During preamble and loops:
	// r12 = rcx = c
	// r14 = rbx = b
	// read rax from var(a) near beginning of loop
	// r11 = n dim (panel) counter jj

	mov(var(n_iter), r11)              // jj = n_iter;

	label(.DLOOP2X8J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]

	// ymm4..ymm7 accumulate the 2x8 tile (two ymm registers per row).
#if 0
	vzeroall()                         // zero all xmm/ymm registers.
#else
	                                   // skylake can execute 3 vxorpd ipc with
	                                   // a latency of 1 cycle, while vzeroall
	                                   // has a latency of 12 cycles.
	vxorpd(ymm4,  ymm4,  ymm4)
	vxorpd(ymm5,  ymm5,  ymm5)
	vxorpd(ymm6,  ymm6,  ymm6)
	vxorpd(ymm7,  ymm7,  ymm7)
#endif

	mov(var(a), rax)                   // load address of a.

	//mov(r12, rcx)                      // reset rcx to current utile of c.
	mov(r14, rbx)                      // reset rbx to current upanel of b.

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLPFETCH)                    // jump to column storage case
	label(.DROWPFETCH)                 // row-stored prefetching on c

	//lea(mem(r12, rdi, 2), rdx)         //
	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(r12,         7*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(r12, rdi, 1, 7*8)) // prefetch c + 1*rs_c

	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
	label(.DCOLPFETCH)                 // column-stored prefetching c

	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
	lea(mem(r12, rsi, 2), rdx)         //
	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
	prefetch(0, mem(r12,         1*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(r12, rsi, 1, 1*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(r12, rsi, 2, 1*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rdx,         1*8)) // prefetch c + 3*cs_c
	prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 4*cs_c
	prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 5*cs_c
	lea(mem(rdx, rsi, 2), rdx)         // rdx = c + 5*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 6*cs_c
	prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 7*cs_c

	label(.DPOSTPFETCH)                // done prefetching c

#if 1
	mov(var(ps_b8), rdx)               // load ps_b8
	lea(mem(rbx, rdx, 1), rdx)         // rdx = b + ps_b8
	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
	                                   // use rcx, rdx for prefetching lines
	                                   // from next upanel of b.
#else
	lea(mem(rbx, r8,  8), rdx)         // use rdx for prefetching lines
	lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
#endif

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.

	label(.DLOOPKITER)                 // MAIN LOOP

	// Each iteration loads one row of the current B panel (8 doubles in
	// ymm0/ymm1), broadcasts one element from each of the 2 rows of A, and
	// accumulates the rank-1 update into ymm4..ymm7.

	// ---------------------------------- iteration 0

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, 5*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, r10, 1, 5*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	// ---------------------------------- iteration 2

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, r10, 2, 5*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	// ---------------------------------- iteration 3

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, rcx, 1, 5*8))
	lea(mem(rdx, r10, 4), rdx)         // b_prefetch += 4*rs_b;
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER)                   // iterate again if i != 0.

	label(.DCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.

	label(.DLOOPKLEFT)                 // EDGE LOOP

#if 1
	prefetch(0, mem(rdx, 5*8))
	add(r10, rdx)                      // b_prefetch += rs_b;
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKLEFT)                   // iterate again if i != 0.

	label(.DPOSTACCUM)

	mov(r12, rcx)                      // reset rcx to current utile of c.
	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
	vmulpd(ymm0, ymm5, ymm5)
	vmulpd(ymm0, ymm6, ymm6)
	vmulpd(ymm0, ymm7, ymm7)

	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;

	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

	                                   // now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORED)                    // jump to column storage case

	label(.DROWSTORED)

	// Row-stored C: each row is updated as beta*C + (alpha*A*B) and written
	// back as two 4-double vectors.

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
	vmovupd(ymm5, mem(rcx, 1*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
	vmovupd(ymm6, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7)
	vmovupd(ymm7, mem(rcx, 1*32))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DCOLSTORED)

	// Column-stored C: interleave the two rows so that each 128-bit half
	// holds one 2-element column. After the unpack/extract sequence,
	// xmm0, xmm1, xmm2, xmm4 hold four consecutive columns, in that order.
	// ymm3 still holds beta here (this path does not overwrite it).

	                                   // begin I/O on columns 0-3
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vextractf128(imm(0x1), ymm0, xmm2)
	vextractf128(imm(0x1), ymm1, xmm4)

	vfmadd231pd(mem(rcx        ), xmm3, xmm0)
	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1)
	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm2)
	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm4)
	vmovupd(xmm0, mem(rcx        ))
	vmovupd(xmm1, mem(rcx, rsi, 1))
	vmovupd(xmm2, mem(rcx, rsi, 2))
	vmovupd(xmm4, mem(rcx, rax, 1))

	lea(mem(rcx, rsi, 4), rcx)

	                                   // begin I/O on columns 4-7
	vunpcklpd(ymm7, ymm5, ymm0)
	vunpckhpd(ymm7, ymm5, ymm1)
	vextractf128(imm(0x1), ymm0, xmm2)
	vextractf128(imm(0x1), ymm1, xmm4)

	vfmadd231pd(mem(rcx        ), xmm3, xmm0)
	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1)
	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm2)
	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm4)
	vmovupd(xmm0, mem(rcx        ))
	vmovupd(xmm1, mem(rcx, rsi, 1))
	vmovupd(xmm2, mem(rcx, rsi, 2))
	vmovupd(xmm4, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DBETAZERO)

	// beta == 0: same store layouts as above, but C is never read.

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORBZ)                    // jump to column storage case

	label(.DROWSTORBZ)

	vmovupd(ymm4, mem(rcx, 0*32))
	vmovupd(ymm5, mem(rcx, 1*32))
	add(rdi, rcx)

	vmovupd(ymm6, mem(rcx, 0*32))
	vmovupd(ymm7, mem(rcx, 1*32))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DCOLSTORBZ)

	                                   // begin I/O on columns 0-3
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vextractf128(imm(0x1), ymm0, xmm2)
	vextractf128(imm(0x1), ymm1, xmm4)

	vmovupd(xmm0, mem(rcx        ))
	vmovupd(xmm1, mem(rcx, rsi, 1))
	vmovupd(xmm2, mem(rcx, rsi, 2))
	vmovupd(xmm4, mem(rcx, rax, 1))

	lea(mem(rcx, rsi, 4), rcx)

	                                   // begin I/O on columns 4-7
	vunpcklpd(ymm7, ymm5, ymm0)
	vunpckhpd(ymm7, ymm5, ymm1)
	vextractf128(imm(0x1), ymm0, xmm2)
	vextractf128(imm(0x1), ymm1, xmm4)

	vmovupd(xmm0, mem(rcx        ))
	vmovupd(xmm1, mem(rcx, rsi, 1))
	vmovupd(xmm2, mem(rcx, rsi, 2))
	vmovupd(xmm4, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	label(.DDONE)

	lea(mem(r12, rsi, 8), r12)         // c_jj = r12 += 8*cs_c

	//add(imm(8*8), r14)                 // b_jj = r14 += 8*cs_b
	mov(var(ps_b8), rbx)               // load ps_b8
	lea(mem(r14, rbx, 1), r14)         // b_jj = r14 += ps_b8

	dec(r11)                           // jj -= 1;
	jne(.DLOOP2X8J)                    // iterate again if jj != 0.

	label(.DRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [n_iter] "m" (n_iter),
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a]      "m" (a),
	  [rs_a]   "m" (rs_a),
	  [cs_a]   "m" (cs_a),
	  [b]      "m" (b),
	  [rs_b]   "m" (rs_b),
	  [cs_b]   "m" (cs_b),
	  [ps_b8]  "m" (ps_b8),
	  [alpha]  "m" (alpha),
	  [beta]   "m" (beta),
	  [c]      "m" (c),
	  [rs_c]   "m" (rs_c),
	  [cs_c]   "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	consider_edge_cases:

	// Handle edge cases in the n dimension, if they exist.
	// n_left is in [1, 7] here; the 6/4/2/1 chain below covers every value
	// because each branch subtracts the columns it has handled.
	if ( n_left )
	{
		const dim_t mr_cur = 2;
		const dim_t j_edge = n0 - ( dim_t )n_left;

		// First leftover column of C, and the B panel that follows the
		// n_iter full panels consumed by the asm loop above.
		double* restrict cij = c + j_edge*cs_c;
		double* restrict ai  = a;
		//double* restrict bj  = b + j_edge*cs_b;
		//double* restrict bj  = b + ( j_edge / 8 ) * ps_b;
		double* restrict bj  = b + n_iter * ps_b;

		if ( 6 <= n_left )
		{
			const dim_t nr_cur = 6;

			bli_dgemmsup_rv_haswell_asm_2x6
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 4 <= n_left )
		{
			const dim_t nr_cur = 4;

			bli_dgemmsup_rv_haswell_asm_2x4
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 2 <= n_left )
		{
			const dim_t nr_cur = 2;

			bli_dgemmsup_rv_haswell_asm_2x2
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 1 == n_left )
		{
			const dim_t nr_cur = 1;

			bli_dgemmsup_r_haswell_ref_2x1
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
		}
	}
}

void bli_dgemmsup_rv_haswell_asm_1x8n
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t n_iter = n0 / 8; uint64_t n_left = n0 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of B and convert it to units of bytes. uint64_t ps_b = bli_auxinfo_ps_b( data ); uint64_t ps_b8 = ps_b * sizeof( double ); if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // During preamble and loops: // r12 = rcx = c // r14 = rbx = b // read rax from var(a) near beginning of loop // r11 = m dim index ii mov(var(n_iter), r11) // jj = n_iter; label(.DLOOP1X8J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) #endif mov(var(a), rax) // load address of a. //mov(r12, rcx) // reset rcx to current utile of c. 
mov(r14, rbx) // reset rbx to current upanel of b. cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c //lea(mem(r12, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 7*8)) // prefetch c + 0*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(r12, 0*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 0*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 0*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 0*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c #if 1 mov(var(ps_b8), rdx) // load ps_b8 lea(mem(rbx, rdx, 1), rdx) // rdx = b + ps_b8 lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; // use rcx, rdx for prefetching lines // from next upanel of b. #else lea(mem(rbx, r8, 8), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) // ---------------------------------- iteration 1 #if 1 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 1, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) // ---------------------------------- iteration 3 #if 1 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, rcx, 1, 5*8)) lea(mem(rdx, r10, 4), rdx) // b_prefetch += 4*rs_b; #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.DLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r10, rdx) // b_prefetch += rs_b; #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORED) // begin I/O on columns 0-3 vmovlpd(mem(rcx ), xmm0, xmm0) vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(ymm4, ymm3, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rcx )) vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // begin I/O on columns 4-7 vmovlpd(mem(rcx ), xmm0, xmm0) vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(ymm5, ymm3, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rcx )) vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) // begin I/O on columns 0-3 vmovupd(ymm4, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rcx )) vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // begin I/O on columns 4-7 vmovupd(ymm5, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rcx )) vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) label(.DDONE) lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c //add(imm(8*8), r14) // b_jj = r14 += 8*cs_b mov(var(ps_b8), rbx) // load ps_b8 lea(mem(r14, rbx, 1), r14) // b_jj = r14 += ps_b8 dec(r11) // jj -= 1; jne(.DLOOP1X8J) // iterate again if jj != 0. 
label(.DRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [ps_b8] "m" (ps_b8), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 1; const dim_t j_edge = n0 - ( dim_t )n_left; double* restrict cij = c + j_edge*cs_c; double* restrict ai = a; //double* restrict bj = b + j_edge*cs_b; //double* restrict bj = b + ( j_edge / 8 ) * ps_b; double* restrict bj = b + n_iter * ps_b; if ( 6 <= n_left ) { const dim_t nr_cur = 6; bli_dgemmsup_rv_haswell_asm_1x6 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_dgemmsup_rv_haswell_asm_1x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_dgemmsup_rv_haswell_asm_1x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 1 const dim_t nr_cur = 1; bli_dgemmsup_r_haswell_ref_1x1 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, 
cntx ); #else bli_ddotxv_ex ( conja, conjb, k0, alpha, ai, cs_a0, bj, rs_b0, beta, cij, cntx, NULL ); #endif } } } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c000066400000000000000000004101631427272030600305770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... -------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) void bli_sgemmsup_rv_haswell_asm_6x16m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t n_left = n0 % 16; // First check whether this is a edge case in the n dimension. If so, // dispatch other 6x?m kernels, as needed. 
if ( n_left ) { float* restrict cij = c; float* restrict bj = b; float* restrict ai = a; if ( 12 <= n_left ) { const dim_t nr_cur = 12; bli_sgemmsup_rv_haswell_asm_6x12m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 8 <= n_left ) { const dim_t nr_cur = 8; bli_sgemmsup_rv_haswell_asm_6x8m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 6 <= n_left ) { const dim_t nr_cur = 6; bli_sgemmsup_rv_haswell_asm_6x6m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_sgemmsup_rv_haswell_asm_6x4m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rv_haswell_asm_6x2m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 0 const dim_t nr_cur = 1; bli_sgemmsup_r_haswell_ref ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else dim_t ps_a0 = bli_auxinfo_ps_a( data ); if ( ps_a0 == 6 * rs_a0 ) { // Since A is not packed, we can use one gemv. bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); } else { const dim_t mr = 6; // Since A is packed into row panels, we must use a loop over // gemv. 
dim_t m_iter = ( m0 + mr - 1 ) / mr; dim_t m_left = m0 % mr; float* restrict ai_ii = ai; float* restrict cij_ii = cij; for ( dim_t ii = 0; ii < m_iter; ii += 1 ) { dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left ) ? mr : m_left ); bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0, beta, cij_ii, rs_c0, cntx, NULL ); cij_ii += mr*rs_c0; ai_ii += ps_a0; } } } return; #endif } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_iter = m0 / 6; uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of A and convert it to units of bytes. uint64_t ps_a = bli_auxinfo_ps_a( data ); uint64_t ps_a4 = ps_a * sizeof( float ); if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). 
mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.SLOOP6X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) #endif mov(var(b), rbx) // load address of b. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 15*4)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1,15*4)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2,15*4)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 8*cs_c lea(mem(r12, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 11*cs_c prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 12*cs_c lea(mem(r12, rcx, 4), rdx) // rdx = c + 12*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 13*cs_c prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 14*cs_c prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 15*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 mov(var(ps_a4), rdx) // load ps_a4 lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4 lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; 
// use rcx, rdx for prefetching lines // from next upanel of a. #else lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) 
vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, rcx, 1, 5*8)) lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. 
// else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm11, ymm11) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm13, ymm13) vmulps(ymm0, ymm14, ymm14) vmulps(ymm0, ymm15, ymm15) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm15) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx ), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(mem(rdx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rdx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx, rsi, 2), xmm1, 
xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(mem(rdx, rax, 2), xmm1, xmm1) vmovhpd(mem(rdx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx ), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, 
xmm3, xmm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(mem(rdx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rdx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(mem(rdx, rax, 2), xmm1, xmm1) vmovhpd(mem(rdx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm12, mem(rcx, 0*32)) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm14, mem(rcx, 0*32)) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( 
gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c label(.SDONE) lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c //lea(mem(r14, r8, 4), r14) // //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a mov(var(ps_a4), rax) // load ps_a4 lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 dec(r11) // ii -= 1; jne(.SLOOP6X8I) // iterate again if ii != 0. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [ps_a4] "m" (ps_a4), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( m_left ) { const dim_t nr_cur = 16; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; //float* restrict ai = a + i_edge*rs_a; //float* restrict ai = a + ( i_edge / 6 ) * ps_a; float* restrict ai = a + m_iter * ps_a; float* restrict bj = b; #if 0 // We add special handling for slightly inflated MR blocksizes // at edge cases, up to a maximum of 9. 
if ( 6 < m_left )
		{
			sgemmsup_ker_ft ker_fp1 = NULL;
			sgemmsup_ker_ft ker_fp2 = NULL;
			dim_t mr1, mr2;

			if ( m_left == 7 )
			{
				mr1 = 4; mr2 = 3;
				ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x16;
				ker_fp2 = bli_sgemmsup_rv_haswell_asm_3x16;
			}
			else if ( m_left == 8 )
			{
				mr1 = 4; mr2 = 4;
				ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x16;
				ker_fp2 = bli_sgemmsup_rv_haswell_asm_4x16;
			}
			else // if ( m_left == 9 )
			{
				mr1 = 4; mr2 = 5;
				ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x16;
				ker_fp2 = bli_sgemmsup_rv_haswell_asm_5x16;
			}

			ker_fp1
			(
			  conja, conjb, mr1, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += mr1*rs_c0; ai += mr1*rs_a0;

			ker_fp2
			(
			  conja, conjb, mr2, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);

			return;
		}
#endif

		sgemmsup_ker_ft ker_fps[6] =
		{
		  NULL,
		  bli_sgemmsup_rv_haswell_asm_1x16,
		  bli_sgemmsup_rv_haswell_asm_2x16,
		  bli_sgemmsup_rv_haswell_asm_3x16,
		  bli_sgemmsup_rv_haswell_asm_4x16,
		  bli_sgemmsup_rv_haswell_asm_5x16
		};

		sgemmsup_ker_ft ker_fp = ker_fps[ m_left ];

		ker_fp
		(
		  conja, conjb, m_left, nr_cur, k0,
		  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
		  beta, cij, rs_c0, cs_c0, data, cntx
		);

		return;
	}
}

// Single-precision gemmsup kernel: updates an m0 x 12 block of C as
//
//   C := beta * C + alpha * A * B
//
// The m dimension is processed in chunks of 6 rows by the inline assembly
// below (m_iter = m0 / 6 passes); any remaining m0 % 6 rows are handed to
// the fixed-size 1x12 .. 5x12 kernels at the end of the function.
//
// Parameters, as used by this body:
//   conja, conjb   - not read here; only forwarded to the edge-case kernel.
//   m0             - number of rows of C (and of A) to update.
//   n0             - not read in this function; the width is hard-coded to
//                    12 columns (nr_cur = 12 is what the edge kernel gets).
//   k0             - inner dimension; split into k0 / 4 unrolled iterations
//                    plus k0 % 4 cleanup iterations.
//   alpha, beta    - pointers to the scalars; each is loaded once per
//                    6-row pass and broadcast.
//   a, rs_a0, cs_a0 - A and its row/column strides in elements. Each 6-row
//                    pass starts ps_a elements (the panel stride queried
//                    from `data`) after the previous one.
//   b, rs_b0, cs_b0 - B and its strides in elements. Every k step loads 12
//                    consecutive floats and then advances by rs_b; cs_b is
//                    passed to the asm but never read, so B is presumably
//                    expected to have unit column stride (TODO confirm
//                    against the caller).
//   c, rs_c0, cs_c0 - C and its strides in elements. If rs_c == 1 the
//                    column-stored path is taken (the 6x12 tile is
//                    transposed in registers and written column by column
//                    using cs_c); otherwise the row-stored path is taken,
//                    which reads/writes 12 consecutive floats per row and
//                    never uses cs_c (presumably cs_c == 1 there; verify).
//   data           - auxinfo; only the panel stride of A is queried here.
//   cntx           - not read here; only forwarded to the edge-case kernel.
//
// If beta compares equal to zero, C is overwritten without being read.
void bli_sgemmsup_rv_haswell_asm_6x12m
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t m_iter = m0 / 6;
	uint64_t m_left = m0 % 6;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// Query the panel stride of A and convert it to units of bytes.
	uint64_t ps_a   = bli_auxinfo_ps_a( data );
	uint64_t ps_a4  = ps_a * sizeof( float );

	// Fewer than 6 rows: skip the assembly entirely and let the edge-case
	// kernels handle everything.
	if ( m_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), r14)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	//mov(var(b), rbx)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)                // load cs_b
	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), r12)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)

	// During preamble and loops:
	// r12 = rcx = c
	// r14 = rax = a
	// read rbx from var(b) near beginning of loop
	// r11 = m dim index ii

	mov(var(m_iter), r11)              // ii = m_iter;

	// NOTE(review): the label keeps a "6X8" name although this kernel is
	// 12 columns wide; it is only a jump target for the outer (m) loop.
	label(.SLOOP6X8I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]

#if 0
	vzeroall()                         // zero all xmm/ymm registers.
#else
	// skylake can execute 3 vxorps ipc with
	// a latency of 1 cycle, while vzeroall
	// has a latency of 12 cycles.
	// ymm4..ymm15 are the accumulators: each pair (ymm4/5, ymm6/7, ...)
	// holds one row of the 6x12 tile (8 columns + 4 columns).
	vxorps(ymm4, ymm4, ymm4)
	vxorps(ymm5, ymm5, ymm5)
	vxorps(ymm6, ymm6, ymm6)
	vxorps(ymm7, ymm7, ymm7)
	vxorps(ymm8, ymm8, ymm8)
	vxorps(ymm9, ymm9, ymm9)
	vxorps(ymm10, ymm10, ymm10)
	vxorps(ymm11, ymm11, ymm11)
	vxorps(ymm12, ymm12, ymm12)
	vxorps(ymm13, ymm13, ymm13)
	vxorps(ymm14, ymm14, ymm14)
	vxorps(ymm15, ymm15, ymm15)
#endif

	mov(var(b), rbx)                   // load address of b.
	//mov(r12, rcx)                      // reset rcx to current utile of c.
	mov(r14, rax)                      // reset rax to current upanel of a.

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLPFETCH)                    // jump to column storage case
	label(.SROWPFETCH)                 // row-stored prefetching on c

	lea(mem(r12, rdi, 2), rdx)         //
	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(r12, 11*4))        // prefetch c + 0*rs_c
	prefetch(0, mem(r12, rdi, 1,11*4)) // prefetch c + 1*rs_c
	prefetch(0, mem(r12, rdi, 2,11*4)) // prefetch c + 2*rs_c
	prefetch(0, mem(rdx, 11*4))        // prefetch c + 3*rs_c
	prefetch(0, mem(rdx, rdi, 1,11*4)) // prefetch c + 4*rs_c
	prefetch(0, mem(rdx, rdi, 2,11*4)) // prefetch c + 5*rs_c

	jmp(.SPOSTPFETCH)                  // jump to end of prefetching c
	label(.SCOLPFETCH)                 // column-stored prefetching c

	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 4), rsi)            // cs_c *= sizeof(float)
	lea(mem(rsi, rsi, 2), rcx)         // rcx = 3*cs_c;
	prefetch(0, mem(r12, 5*4))         // prefetch c + 0*cs_c
	prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c
	prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c
	prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c
	prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c
	lea(mem(r12, rsi, 4), rdx)         // rdx = c + 4*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c
	prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 6*cs_c
	prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 7*cs_c
	prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 8*cs_c
	lea(mem(r12, rsi, 8), rdx)         // rdx = c + 8*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 9*cs_c
	prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 10*cs_c
	prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 11*cs_c

	label(.SPOSTPFETCH)                // done prefetching c

#if 1
	mov(var(ps_a4), rdx)               // load ps_a4
	lea(mem(rax, rdx, 1), rdx)         // rdx = a + ps_a4
	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
	                                   // use rcx, rdx for prefetching lines
	                                   // from next upanel of a.
#else
	lea(mem(rax, r8, 4), rdx)          // use rdx for prefetching lines
	lea(mem(rdx, r8, 2), rdx)          // from next upanel of a.
	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
#endif

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.

	// Each k step below loads one 12-float row of B (8 floats into ymm0,
	// 4 floats into xmm1), broadcasts the six A elements of the current
	// column one at a time, and accumulates the rank-1 update.
	label(.SLOOPKITER)                 // MAIN LOOP

	// ---------------------------------- iteration 0

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, 5*8))
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), xmm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vbroadcastss(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8, 4), ymm2)
	vbroadcastss(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), xmm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vbroadcastss(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8, 4), ymm2)
	vbroadcastss(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	// ---------------------------------- iteration 2

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), xmm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vbroadcastss(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8, 4), ymm2)
	vbroadcastss(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	// ---------------------------------- iteration 3

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, rcx, 1, 5*8))
	lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), xmm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vbroadcastss(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8, 4), ymm2)
	vbroadcastss(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKITER)                   // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT)                 // EDGE LOOP

#if 1
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), xmm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vbroadcastss(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8, 4), ymm2)
	vbroadcastss(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKLEFT)                   // iterate again if i != 0.

	label(.SPOSTACCUM)

	mov(r12, rcx)                      // reset rcx to current utile of c.
	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate

	// The odd-numbered accumulators only carry columns 8-11, so they are
	// scaled (and later stored) as 128-bit xmm registers.
	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
	vmulps(xmm0, xmm5, xmm5)
	vmulps(ymm0, ymm6, ymm6)
	vmulps(xmm0, xmm7, xmm7)
	vmulps(ymm0, ymm8, ymm8)
	vmulps(xmm0, xmm9, xmm9)
	vmulps(ymm0, ymm10, ymm10)
	vmulps(xmm0, xmm11, xmm11)
	vmulps(ymm0, ymm12, ymm12)
	vmulps(xmm0, xmm13, xmm13)
	vmulps(ymm0, ymm14, ymm14)
	vmulps(xmm0, xmm15, xmm15)

	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)

	//lea(mem(rcx, rsi, 4), rdx)         // load address of c + 4*cs_c;
	lea(mem(rcx, rdi, 4), rdx)         // load address of c + 4*rs_c;

	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;

	                                   // now avoid loading C if beta == 0

	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORED)                    // jump to column storage case

	label(.SROWSTORED)

	// Row-stored C, beta != 0: for each of the six rows compute
	// acc += beta * C(row, :) and write the 8 + 4 floats back.
	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
	vmovups(ymm4, mem(rcx, 0*32))

	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5)
	vmovups(xmm5, mem(rcx, 1*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
	vmovups(ymm6, mem(rcx, 0*32))

	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7)
	vmovups(xmm7, mem(rcx, 1*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
	vmovups(ymm8, mem(rcx, 0*32))

	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9)
	vmovups(xmm9, mem(rcx, 1*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
	vmovups(ymm10, mem(rcx, 0*32))

	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11)
	vmovups(xmm11, mem(rcx, 1*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12)
	vmovups(ymm12, mem(rcx, 0*32))

	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm13)
	vmovups(xmm13, mem(rcx, 1*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14)
	vmovups(ymm14, mem(rcx, 0*32))

	vfmadd231ps(mem(rcx, 1*32), xmm3, xmm15)
	vmovups(xmm15, mem(rcx, 1*32))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORED)

	// Column-stored C, beta != 0. Rows 0-3 are transposed 4x4 at a time
	// (unpack / shuffle / blend) and written through rcx; rows 4-5 are
	// written two floats per column through rdx (= c + 4 rows).

	                                   // begin I/O on columns 0-7
	vunpcklps(ymm6, ymm4, ymm0)
	vunpcklps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx ), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm0, mem(rcx ))           // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2)
	vmovups(xmm1, mem(rcx, rsi, 1))    // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma35 )

	vunpckhps(ymm6, ymm4, ymm0)
	vunpckhps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2)
	vmovups(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma32 )
	vmovups(xmm2, mem(rcx, rax, 2))    // store ( gamma06..gamma36 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2)
	vmovups(xmm1, mem(rcx, rax, 1))    // store ( gamma03..gamma33 )
	vmovups(xmm2, mem(rcx, rbp, 1))    // store ( gamma07..gamma37 )

	lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

	vunpcklps(ymm14, ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(mem(rdx ), xmm1, xmm1)
	vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rdx ))           // store ( gamma40..gamma50 )
	vmovhpd(xmm0, mem(rdx, rsi, 1))    // store ( gamma41..gamma51 )
	vmovlpd(mem(rdx, rsi, 4), xmm1, xmm1)
	vmovhpd(mem(rdx, rbx, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vmovlpd(xmm2, mem(rdx, rsi, 4))    // store ( gamma44..gamma54 )
	vmovhpd(xmm2, mem(rdx, rbx, 1))    // store ( gamma45..gamma55 )

	vunpckhps(ymm14, ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)
	vmovhpd(mem(rdx, rax, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rdx, rsi, 2))    // store ( gamma42..gamma52 )
	vmovhpd(xmm0, mem(rdx, rax, 1))    // store ( gamma43..gamma53 )
	vmovlpd(mem(rdx, rax, 2), xmm1, xmm1)
	vmovhpd(mem(rdx, rbp, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vmovlpd(xmm2, mem(rdx, rax, 2))    // store ( gamma46..gamma56 )
	vmovhpd(xmm2, mem(rdx, rbp, 1))    // store ( gamma47..gamma57 )

	lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c

	                                   // begin I/O on columns 8-11
	                                   // (rcx and rdx now point at column 8,
	                                   // so only the low 128-bit halves of
	                                   // the transposed data are used.)
	vunpcklps(ymm7, ymm5, ymm0)
	vunpcklps(ymm11, ymm9, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vfmadd231ps(mem(rcx ), xmm3, xmm0)
	vmovups(xmm0, mem(rcx ))           // store rows 0-3 of column 8

	vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
	vmovups(xmm1, mem(rcx, rsi, 1))    // store rows 0-3 of column 9

	vunpckhps(ymm7, ymm5, ymm0)
	vunpckhps(ymm11, ymm9, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
	vmovups(xmm0, mem(rcx, rsi, 2))    // store rows 0-3 of column 10

	vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1)
	vmovups(xmm1, mem(rcx, rax, 1))    // store rows 0-3 of column 11

	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

	vunpcklps(ymm15, ymm13, ymm0)
	vmovlpd(mem(rdx ), xmm1, xmm1)
	vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rdx ))           // store rows 4-5 of column 8
	vmovhpd(xmm0, mem(rdx, rsi, 1))    // store rows 4-5 of column 9

	vunpckhps(ymm15, ymm13, ymm0)
	vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)
	vmovhpd(mem(rdx, rax, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rdx, rsi, 2))    // store rows 4-5 of column 10
	vmovhpd(xmm0, mem(rdx, rax, 1))    // store rows 4-5 of column 11

	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c

	jmp(.SDONE)                        // jump to end.

	label(.SBETAZERO)

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORBZ)                    // jump to column storage case

	label(.SROWSTORBZ)

	// Row-stored C, beta == 0: plain stores, C is never read.
	vmovups(ymm4, mem(rcx, 0*32))
	vmovups(xmm5, mem(rcx, 1*32))
	add(rdi, rcx)

	vmovups(ymm6, mem(rcx, 0*32))
	vmovups(xmm7, mem(rcx, 1*32))
	add(rdi, rcx)

	vmovups(ymm8, mem(rcx, 0*32))
	vmovups(xmm9, mem(rcx, 1*32))
	add(rdi, rcx)

	vmovups(ymm10, mem(rcx, 0*32))
	vmovups(xmm11, mem(rcx, 1*32))
	add(rdi, rcx)

	vmovups(ymm12, mem(rcx, 0*32))
	vmovups(xmm13, mem(rcx, 1*32))
	add(rdi, rcx)

	vmovups(ymm14, mem(rcx, 0*32))
	vmovups(xmm15, mem(rcx, 1*32))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORBZ)

	// Column-stored C, beta == 0: same transposes as the beta != 0 case,
	// but without loading C.

	                                   // begin I/O on columns 0-7
	vunpcklps(ymm6, ymm4, ymm0)
	vunpcklps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx ))           // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx, rsi, 1))    // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma35 )

	vunpckhps(ymm6, ymm4, ymm0)
	vunpckhps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma32 )
	vmovups(xmm2, mem(rcx, rax, 2))    // store ( gamma06..gamma36 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx, rax, 1))    // store ( gamma03..gamma33 )
	vmovups(xmm2, mem(rcx, rbp, 1))    // store ( gamma07..gamma37 )

	lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

	vunpcklps(ymm14, ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(xmm0, mem(rdx ))           // store ( gamma40..gamma50 )
	vmovhpd(xmm0, mem(rdx, rsi, 1))    // store ( gamma41..gamma51 )
	vmovlpd(xmm2, mem(rdx, rsi, 4))    // store ( gamma44..gamma54 )
	vmovhpd(xmm2, mem(rdx, rbx, 1))    // store ( gamma45..gamma55 )

	vunpckhps(ymm14, ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(xmm0, mem(rdx, rsi, 2))    // store ( gamma42..gamma52 )
	vmovhpd(xmm0, mem(rdx, rax, 1))    // store ( gamma43..gamma53 )
	vmovlpd(xmm2, mem(rdx, rax, 2))    // store ( gamma46..gamma56 )
	vmovhpd(xmm2, mem(rdx, rbp, 1))    // store ( gamma47..gamma57 )

	lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c

	                                   // begin I/O on columns 8-11
	vunpcklps(ymm7, ymm5, ymm0)
	vunpcklps(ymm11, ymm9, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vmovups(xmm0, mem(rcx ))           // store rows 0-3 of column 8
	vmovups(xmm1, mem(rcx, rsi, 1))    // store rows 0-3 of column 9

	vunpckhps(ymm7, ymm5, ymm0)
	vunpckhps(ymm11, ymm9, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vmovups(xmm0, mem(rcx, rsi, 2))    // store rows 0-3 of column 10
	vmovups(xmm1, mem(rcx, rax, 1))    // store rows 0-3 of column 11

	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

	vunpcklps(ymm15, ymm13, ymm0)
	vmovlpd(xmm0, mem(rdx ))           // store rows 4-5 of column 8
	vmovhpd(xmm0, mem(rdx, rsi, 1))    // store rows 4-5 of column 9

	vunpckhps(ymm15, ymm13, ymm0)
	vmovlpd(xmm0, mem(rdx, rsi, 2))    // store rows 4-5 of column 10
	vmovhpd(xmm0, mem(rdx, rax, 1))    // store rows 4-5 of column 11

	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c

	label(.SDONE)

	lea(mem(r12, rdi, 4), r12)         //
	lea(mem(r12, rdi, 2), r12)         // c_ii = r12 += 6*rs_c

	//lea(mem(r14, r8, 4), r14)          //
	//lea(mem(r14, r8, 2), r14)          // a_ii = r14 += 6*rs_a
	mov(var(ps_a4), rax)               // load ps_a4
	lea(mem(r14, rax, 1), r14)         // a_ii = r14 += ps_a4

	dec(r11)                           // ii -= 1;
	jne(.SLOOP6X8I)                    // iterate again if ii != 0.

	label(.SRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [m_iter] "m" (m_iter),
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a]      "m" (a),
	  [rs_a]   "m" (rs_a),
	  [cs_a]   "m" (cs_a),
	  [ps_a4]  "m" (ps_a4),
	  [b]      "m" (b),
	  [rs_b]   "m" (rs_b),
	  [cs_b]   "m" (cs_b),
	  [alpha]  "m" (alpha),
	  [beta]   "m" (beta),
	  [c]      "m" (c),
	  [rs_c]   "m" (rs_c),
	  [cs_c]   "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	consider_edge_cases:

	// Handle edge cases in the m dimension, if they exist.
	if ( m_left )
	{
		const dim_t      nr_cur = 12;
		// First row not covered by the 6-row passes above.
		const dim_t      i_edge = m0 - ( dim_t )m_left;

		float* restrict cij = c + i_edge*rs_c;
		//float* restrict ai  = a + i_edge*rs_a;
		//float* restrict ai  = a + ( i_edge / 6 ) * ps_a;
		// A is advanced by whole panels (ps_a elements per 6-row pass),
		// matching the r14 += ps_a4 update in the assembly.
		float* restrict ai  = a + m_iter * ps_a;
		float* restrict bj  = b;

#if 0
		// We add special handling for slightly inflated MR blocksizes
		// at edge cases, up to a maximum of 9.
		// NOTE(review): this disabled path references the 16-column
		// kernels even though nr_cur is 12 here; it would need the x12
		// variants before it could be enabled.
		if ( 6 < m_left )
		{
			sgemmsup_ker_ft ker_fp1 = NULL;
			sgemmsup_ker_ft ker_fp2 = NULL;
			dim_t mr1, mr2;

			if ( m_left == 7 )
			{
				mr1 = 4; mr2 = 3;
				ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x16;
				ker_fp2 = bli_sgemmsup_rv_haswell_asm_3x16;
			}
			else if ( m_left == 8 )
			{
				mr1 = 4; mr2 = 4;
				ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x16;
				ker_fp2 = bli_sgemmsup_rv_haswell_asm_4x16;
			}
			else // if ( m_left == 9 )
			{
				mr1 = 4; mr2 = 5;
				ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x16;
				ker_fp2 = bli_sgemmsup_rv_haswell_asm_5x16;
			}

			ker_fp1
			(
			  conja, conjb, mr1, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += mr1*rs_c0; ai += mr1*rs_a0;

			ker_fp2
			(
			  conja, conjb, mr2, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);

			return;
		}
#endif

		// Dispatch table indexed by m_left (1..5); slot 0 is never used
		// because this branch is only entered when m_left != 0.
		sgemmsup_ker_ft ker_fps[6] =
		{
		  NULL,
		  bli_sgemmsup_rv_haswell_asm_1x12,
		  bli_sgemmsup_rv_haswell_asm_2x12,
		  bli_sgemmsup_rv_haswell_asm_3x12,
		  bli_sgemmsup_rv_haswell_asm_4x12,
		  bli_sgemmsup_rv_haswell_asm_5x12
		};

		sgemmsup_ker_ft ker_fp = ker_fps[ m_left ];

		ker_fp
		(
		  conja, conjb, m_left, nr_cur, k0,
		  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
		  beta, cij, rs_c0, cs_c0, data, cntx
		);

		return;
	}
}

void bli_sgemmsup_rv_haswell_asm_6x8m
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_iter = m0 / 6; uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of A and convert it to units of bytes. uint64_t ps_a = bli_auxinfo_ps_a( data ); uint64_t ps_a4 = ps_a * sizeof( float ); if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.SLOOP6X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorps(ymm4, ymm4, ymm4) vxorps(ymm6, ymm6, ymm6) vxorps(ymm8, ymm8, ymm8) vxorps(ymm10, ymm10, ymm10) vxorps(ymm12, ymm12, ymm12) vxorps(ymm14, ymm14, ymm14) #endif mov(var(b), rbx) // load address of b. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 7*4)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1, 7*4)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2, 7*4)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*4)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*4)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 7*4)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 7*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 mov(var(ps_a4), rdx) // load ps_a4 lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4 lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; // use rcx, rdx for prefetching lines // from next upanel of a. #else lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. 
lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += 
cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) // ---------------------------------- iteration 3 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, rcx, 1, 5*8)) lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm14, ymm14) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx ), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(mem(rdx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rdx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx, rsi, 2), xmm1, 
xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(mem(rdx, rax, 2), xmm1, xmm1) vmovhpd(mem(rdx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) vmovups(ymm10, mem(rcx, 0*32)) add(rdi, rcx) vmovups(ymm12, mem(rcx, 0*32)) add(rdi, rcx) vmovups(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c 
vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c label(.SDONE) lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c //lea(mem(r14, r8, 4), r14) // //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a mov(var(ps_a4), rax) // load ps_a4 lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 dec(r11) // ii -= 1; jne(.SLOOP6X8I) // iterate again if ii != 0. label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [ps_a4] "m" (ps_a4), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 8; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; //float* restrict ai = a + i_edge*rs_a; //float* restrict ai = a + ( i_edge / 6 ) * ps_a; float* restrict ai = a + m_iter * ps_a; float* restrict bj = b; #if 0 // We add special handling for slightly inflated MR blocksizes // at edge cases, up to a maximum of 9. if ( 6 < m_left ) { sgemmsup_ker_ft ker_fp1 = NULL; sgemmsup_ker_ft ker_fp2 = NULL; dim_t mr1, mr2; if ( m_left == 7 ) { mr1 = 4; mr2 = 3; ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x8; ker_fp2 = bli_sgemmsup_rv_haswell_asm_3x8; } else if ( m_left == 8 ) { mr1 = 4; mr2 = 4; ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x8; ker_fp2 = bli_sgemmsup_rv_haswell_asm_4x8; } else // if ( m_left == 9 ) { mr1 = 4; mr2 = 5; ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x8; ker_fp2 = bli_sgemmsup_rv_haswell_asm_5x8; } ker_fp1 ( conja, conjb, mr1, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr1*rs_c0; ai += mr1*rs_a0; ker_fp2 ( conja, conjb, mr2, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } #endif sgemmsup_ker_ft ker_fps[6] = { NULL, bli_sgemmsup_rv_haswell_asm_1x8, bli_sgemmsup_rv_haswell_asm_2x8, bli_sgemmsup_rv_haswell_asm_3x8, bli_sgemmsup_rv_haswell_asm_4x8, bli_sgemmsup_rv_haswell_asm_5x8 }; sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } } /* bli_sgemmsup_rv_haswell_asm_6x6m: single-precision kernel that walks the m dimension in 6-row tiles with a fixed width of 6 columns. For each of the m0/6 full tiles the inline asm accumulates a 6x6 product of A and B over k0 (main loop unrolled by 4, then a k0%4 remainder loop), scales the result by *alpha and writes it to C: when beta compares equal to zero C is only stored, otherwise beta*C is added first. Separate store paths exist for row-stored C and for column-stored C (selected when rs_c == 1). A advances by the panel stride obtained from bli_auxinfo_ps_a( data ) and C by 6*rs_c per tile. The remaining m0%6 rows are handed to the matching bli_sgemmsup_rv_haswell_asm_{1..5}x6 kernel. In this body n0 is never read, and conja, conjb and cntx are only forwarded to that edge kernel. */ void bli_sgemmsup_rv_haswell_asm_6x6m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local
copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_iter = m0 / 6; uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of A and convert it to units of bytes. uint64_t ps_a = bli_auxinfo_ps_a( data ); uint64_t ps_a4 = ps_a * sizeof( float ); if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- /* Register roles established by the preamble below: r14 = current micro-panel of a, r12 = current tile of c, r8 and r9 = rs_a and cs_a in bytes, r13 = 3*rs_a, r15 = 5*rs_a, r10 = rs_b in bytes, rdi = rs_c in bytes, r11 = number of 6-row tiles still to process. cs_b is passed as an operand but every instruction that would read it is commented out. */ begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.SLOOP6X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles.
vxorps(ymm4, ymm4, ymm4) vxorps(ymm6, ymm6, ymm6) vxorps(ymm8, ymm8, ymm8) vxorps(ymm10, ymm10, ymm10) vxorps(ymm12, ymm12, ymm12) vxorps(ymm14, ymm14, ymm14) #endif mov(var(b), rbx) // load address of b. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 5*4)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1, 5*4)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2, 5*4)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 5*4)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 5*4)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 5*4)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 mov(var(ps_a4), rdx) // load ps_a4 lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4 lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; // use rcx, rdx for prefetching lines // from next upanel of a. #else lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND.
je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, 5*8)) #endif /* Each k step assembles one 6-float row of b in ymm0: floats 0-3 are loaded with vmovups into the low lane, floats 4-5 are loaded with vmovsd into xmm1, and vinsertf128 places xmm1 in the high lane. Only six floats of the row are read. The six broadcasts of a then feed one fused multiply-add per accumulator (ymm4, ymm6, ..., ymm14 hold tile rows 0-5). The same pattern repeats in iterations 1-3 and in the k_left loop. */ vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8,
4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) // ---------------------------------- iteration 3 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, rcx, 1, 5*8)) lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c.
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm14, ymm14) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; /* NOTE(review): with the lea above disabled, no instruction in this kernel writes rbp, yet "rbp" is still named in the clobber list at the end of the asm block - confirm whether it can be dropped. */ // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) /* Row-stored update, one stanza per row of the tile: the low lane (columns 0-3) is combined with beta*C and stored with a 16-byte vmovups, while the high lane is first extracted into an odd-numbered xmm register so that columns 4-5 can be loaded, updated and stored with 8-byte vmovsd accesses. Each row therefore reads and writes exactly six floats of C. */ vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm5) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm7) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm8, xmm9) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm9) vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm10, xmm11) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm11) vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm12, xmm13) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm12) vmovups(xmm12, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm13)
vmovsd(xmm13, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm14, xmm15) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm14) vmovups(xmm14, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm15) vmovsd(xmm15, mem(rcx, 4*4)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORED) // begin I/O on columns 0-5 /* Column-stored update: tile rows 0-3 (ymm4, ymm6, ymm8, ymm10) are rearranged with unpack, shuffle and blend so that each xmm register holds rows 0-3 of a single column; columns 4 and 5 are taken from the high lanes via vextractf128. Tile rows 4-5 (ymm12, ymm14) are then handled through rdx, which was set to c + 4*rs_c, using 8-byte vmovlpd and vmovhpd accesses that each cover two rows of one column. */ vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx ), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(mem(rdx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rdx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rdx, rax, 1),
xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm8, xmm9) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm10, xmm11) vmovups(xmm10, mem(rcx, 0*4)) vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm12, xmm13) vmovups(xmm12, mem(rcx, 0*4)) vmovsd(xmm13, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm14, xmm15) vmovups(xmm14, mem(rcx, 0*4)) vmovsd(xmm15, mem(rcx, 4*4)) //add(rdi, rcx) jmp(.SDONE) // jump to end.
label(.SCOLSTORBZ) // begin I/O on columns 0-5 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c label(.SDONE) lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c //lea(mem(r14, r8, 4), r14) // //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a mov(var(ps_a4), rax) // load ps_a4 lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 dec(r11) // ii -= 1; jne(.SLOOP6X8I) // iterate again if ii != 0.
label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [ps_a4] "m" (ps_a4), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( m_left ) { const dim_t nr_cur = 6; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; //float* restrict ai = a + i_edge*rs_a; //float* restrict ai = a + ( i_edge / 6 ) * ps_a; float* restrict ai = a + m_iter * ps_a; float* restrict bj = b; #if 0 // We add special handling for slightly inflated MR blocksizes // at edge cases, up to a maximum of 9.
if ( 6 < m_left ) { sgemmsup_ker_ft ker_fp1 = NULL; sgemmsup_ker_ft ker_fp2 = NULL; dim_t mr1, mr2; if ( m_left == 7 ) { mr1 = 4; mr2 = 3; ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x6; ker_fp2 = bli_sgemmsup_rv_haswell_asm_3x6; } else if ( m_left == 8 ) { mr1 = 4; mr2 = 4; ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x6; ker_fp2 = bli_sgemmsup_rv_haswell_asm_4x6; } else // if ( m_left == 9 ) { mr1 = 4; mr2 = 5; ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x6; ker_fp2 = bli_sgemmsup_rv_haswell_asm_5x6; } ker_fp1 ( conja, conjb, mr1, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr1*rs_c0; ai += mr1*rs_a0; ker_fp2 ( conja, conjb, mr2, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } #endif /* Dispatch table indexed by m_left: entries 1-5 are the fixed-height 6-column kernels for that many leftover rows. Entry 0 is NULL and is never selected because this code runs only inside the if ( m_left ) guard, and m_left = m0 % 6 cannot exceed 5. */ sgemmsup_ker_ft ker_fps[6] = { NULL, bli_sgemmsup_rv_haswell_asm_1x6, bli_sgemmsup_rv_haswell_asm_2x6, bli_sgemmsup_rv_haswell_asm_3x6, bli_sgemmsup_rv_haswell_asm_4x6, bli_sgemmsup_rv_haswell_asm_5x6 }; sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } } void bli_sgemmsup_rv_haswell_asm_6x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions.
uint64_t k_iter = k0 / 4; // number of passes through the 4x-unrolled k loop
uint64_t k_left = k0 % 4; // leftover k iterations for the cleanup loop
uint64_t m_iter = m0 / 6; // number of full 6-row tiles processed in asm
uint64_t m_left = m0 % 6; // leftover rows, dispatched after the asm region
uint64_t rs_a = rs_a0;
uint64_t cs_a = cs_a0;
uint64_t rs_b = rs_b0;
uint64_t cs_b = cs_b0;
uint64_t rs_c = rs_c0;
uint64_t cs_c = cs_c0;

// Query the panel stride of A and convert it to units of bytes.
uint64_t ps_a = bli_auxinfo_ps_a( data );
uint64_t ps_a4 = ps_a * sizeof( float );

// No full 6-row tile: skip the asm region and go straight to the
// m-dimension edge handling.
if ( m_iter == 0 ) goto consider_edge_cases;

// -------------------------------------------------------------------------

begin_asm()

//vzeroall() // zero all xmm/ymm registers.

mov(var(a), r14) // load address of a.
mov(var(rs_a), r8) // load rs_a
mov(var(cs_a), r9) // load cs_a
lea(mem(, r8, 4), r8) // rs_a *= sizeof(float)
lea(mem(, r9, 4), r9) // cs_a *= sizeof(float)

lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

//mov(var(b), rbx) // load address of b.
mov(var(rs_b), r10) // load rs_b
//mov(var(cs_b), r11) // load cs_b
lea(mem(, r10, 4), r10) // rs_b *= sizeof(float)
//lea(mem(, r11, 4), r11) // cs_b *= sizeof(float)

// NOTE: We cannot pre-load elements of a or b
// because it could eventually, in the last
// unrolled iter or the cleanup loop, result
// in reading beyond the bounds of allocated
// memory (the likely result: a segmentation
// fault).

mov(var(c), r12) // load address of c
mov(var(rs_c), rdi) // load rs_c
lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)

// During preamble and loops:
// r12 = rcx = c
// r14 = rax = a
// read rbx from var(b) near beginning of loop
// r11 = m dim index ii

mov(var(m_iter), r11) // ii = m_iter;

label(.SLOOP6X8I) // LOOP OVER ii = [ m_iter ... 1 0 ]

// Zero the six accumulators (one per row of the 6x4 tile of c).
#if 0
vzeroall() // zero all xmm/ymm registers.
#else
// skylake can execute 3 vxorps ipc with
// a latency of 1 cycle, while vzeroall
// has a latency of 12 cycles.
vxorps(ymm4, ymm4, ymm4)
vxorps(ymm6, ymm6, ymm6)
vxorps(ymm8, ymm8, ymm8)
vxorps(ymm10, ymm10, ymm10)
vxorps(ymm12, ymm12, ymm12)
vxorps(ymm14, ymm14, ymm14)
#endif

mov(var(b), rbx) // load address of b.
//mov(r12, rcx) // reset rcx to current utile of c.
mov(r14, rax) // reset rax to current upanel of a.

// rdi holds rs_c in bytes, so rdi == 4 means rs_c == 1, which this
// kernel treats as column-stored c.
cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
jz(.SCOLPFETCH) // jump to column storage case

label(.SROWPFETCH) // row-stored prefetching on c

lea(mem(r12, rdi, 2), rdx) // rdx = c + 2*rs_c;
lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
prefetch(0, mem(r12, 3*4)) // prefetch c + 0*rs_c
prefetch(0, mem(r12, rdi, 1, 3*4)) // prefetch c + 1*rs_c
prefetch(0, mem(r12, rdi, 2, 3*4)) // prefetch c + 2*rs_c
prefetch(0, mem(rdx, 3*4)) // prefetch c + 3*rs_c
prefetch(0, mem(rdx, rdi, 1, 3*4)) // prefetch c + 4*rs_c
prefetch(0, mem(rdx, rdi, 2, 3*4)) // prefetch c + 5*rs_c

jmp(.SPOSTPFETCH) // jump to end of prefetching c

label(.SCOLPFETCH) // column-stored prefetching c

mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float)
lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c;
prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c
prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c
prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c
prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c

label(.SPOSTPFETCH) // done prefetching c

#if 1
mov(var(ps_a4), rdx) // load ps_a4
lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4
lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a;
// use rcx, rdx for prefetching lines
// from next upanel of a.
#else
lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines
lea(mem(rdx, r8, 2), rdx) // from next upanel of a.
lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a;
#endif

mov(var(k_iter), rsi) // i = k_iter;
test(rsi, rsi) // check i via logical AND.
je(.SCONSIDKLEFT) // if i == 0, jump to code that
// contains the k_left loop.

label(.SLOOPKITER) // MAIN LOOP

// Each unrolled iteration loads one row of b into xmm0, broadcasts one
// element from each of the six rows of a, and accumulates the products
// into xmm4/6/8/10/12/14 (rows 0..5 of the tile).

// ---------------------------------- iteration 0

#if 0
prefetch(0, mem(rdx, 5*8))
#else
prefetch(0, mem(rdx, 5*8))
#endif

vmovups(mem(rbx, 0*32), xmm0)
add(r10, rbx) // b += rs_b;

vbroadcastss(mem(rax ), xmm2) // a row 0
vbroadcastss(mem(rax, r8, 1), xmm3) // a row 1
vfmadd231ps(xmm0, xmm2, xmm4)
vfmadd231ps(xmm0, xmm3, xmm6)

vbroadcastss(mem(rax, r8, 2), xmm2) // a row 2
vbroadcastss(mem(rax, r13, 1), xmm3) // a row 3
vfmadd231ps(xmm0, xmm2, xmm8)
vfmadd231ps(xmm0, xmm3, xmm10)

vbroadcastss(mem(rax, r8, 4), xmm2) // a row 4
vbroadcastss(mem(rax, r15, 1), xmm3) // a row 5
add(r9, rax) // a += cs_a;
vfmadd231ps(xmm0, xmm2, xmm12)
vfmadd231ps(xmm0, xmm3, xmm14)

// ---------------------------------- iteration 1

#if 0
prefetch(0, mem(rdx, 5*8))
#else
prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

vmovups(mem(rbx, 0*32), xmm0)
add(r10, rbx) // b += rs_b;

vbroadcastss(mem(rax ), xmm2)
vbroadcastss(mem(rax, r8, 1), xmm3)
vfmadd231ps(xmm0, xmm2, xmm4)
vfmadd231ps(xmm0, xmm3, xmm6)

vbroadcastss(mem(rax, r8, 2), xmm2)
vbroadcastss(mem(rax, r13, 1), xmm3)
vfmadd231ps(xmm0, xmm2, xmm8)
vfmadd231ps(xmm0, xmm3, xmm10)

vbroadcastss(mem(rax, r8, 4), xmm2)
vbroadcastss(mem(rax, r15, 1), xmm3)
add(r9, rax) // a += cs_a;
vfmadd231ps(xmm0, xmm2, xmm12)
vfmadd231ps(xmm0, xmm3, xmm14)

// ---------------------------------- iteration 2

#if 0
prefetch(0, mem(rdx, 5*8))
#else
prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

vmovups(mem(rbx, 0*32), xmm0)
add(r10, rbx) // b += rs_b;

vbroadcastss(mem(rax ), xmm2)
vbroadcastss(mem(rax, r8, 1), xmm3)
vfmadd231ps(xmm0, xmm2, xmm4)
vfmadd231ps(xmm0, xmm3, xmm6)

vbroadcastss(mem(rax, r8, 2), xmm2)
vbroadcastss(mem(rax, r13, 1), xmm3)
vfmadd231ps(xmm0, xmm2, xmm8)
vfmadd231ps(xmm0, xmm3, xmm10)

vbroadcastss(mem(rax, r8, 4), xmm2)
vbroadcastss(mem(rax, r15, 1), xmm3)
add(r9, rax) // a += cs_a;
vfmadd231ps(xmm0, xmm2, xmm12)
vfmadd231ps(xmm0, xmm3, xmm14)

// ---------------------------------- iteration 3

#if 0
prefetch(0, mem(rdx, 5*8))
#else
prefetch(0, mem(rdx, rcx, 1, 5*8))
lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a;
#endif

vmovups(mem(rbx, 0*32), xmm0)
add(r10, rbx) // b += rs_b;

vbroadcastss(mem(rax ), xmm2)
vbroadcastss(mem(rax, r8, 1), xmm3)
vfmadd231ps(xmm0, xmm2, xmm4)
vfmadd231ps(xmm0, xmm3, xmm6)

vbroadcastss(mem(rax, r8, 2), xmm2)
vbroadcastss(mem(rax, r13, 1), xmm3)
vfmadd231ps(xmm0, xmm2, xmm8)
vfmadd231ps(xmm0, xmm3, xmm10)

vbroadcastss(mem(rax, r8, 4), xmm2)
vbroadcastss(mem(rax, r15, 1), xmm3)
add(r9, rax) // a += cs_a;
vfmadd231ps(xmm0, xmm2, xmm12)
vfmadd231ps(xmm0, xmm3, xmm14)

dec(rsi) // i -= 1;
jne(.SLOOPKITER) // iterate again if i != 0.

label(.SCONSIDKLEFT)

mov(var(k_left), rsi) // i = k_left;
test(rsi, rsi) // check i via logical AND.
je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
// else, we prepare to enter k_left loop.

label(.SLOOPKLEFT) // EDGE LOOP

// Same update as one unrolled iteration, executed k_left times.

#if 1
prefetch(0, mem(rdx, 5*8))
add(r9, rdx) // a_prefetch += cs_a;
#endif

vmovups(mem(rbx, 0*32), xmm0)
add(r10, rbx) // b += rs_b;

vbroadcastss(mem(rax ), xmm2)
vbroadcastss(mem(rax, r8, 1), xmm3)
vfmadd231ps(xmm0, xmm2, xmm4)
vfmadd231ps(xmm0, xmm3, xmm6)

vbroadcastss(mem(rax, r8, 2), xmm2)
vbroadcastss(mem(rax, r13, 1), xmm3)
vfmadd231ps(xmm0, xmm2, xmm8)
vfmadd231ps(xmm0, xmm3, xmm10)

vbroadcastss(mem(rax, r8, 4), xmm2)
vbroadcastss(mem(rax, r15, 1), xmm3)
add(r9, rax) // a += cs_a;
vfmadd231ps(xmm0, xmm2, xmm12)
vfmadd231ps(xmm0, xmm3, xmm14)

dec(rsi) // i -= 1;
jne(.SLOOPKLEFT) // iterate again if i != 0.

label(.SPOSTACCUM)

mov(r12, rcx) // reset rcx to current utile of c.
mov(var(alpha), rax) // load address of alpha
mov(var(beta), rbx) // load address of beta
vbroadcastss(mem(rax), xmm0) // load alpha and duplicate
vbroadcastss(mem(rbx), xmm3) // load beta and duplicate

vmulps(xmm0, xmm4, xmm4) // scale by alpha
vmulps(xmm0, xmm6, xmm6)
vmulps(xmm0, xmm8, xmm8)
vmulps(xmm0, xmm10, xmm10)
vmulps(xmm0, xmm12, xmm12)
vmulps(xmm0, xmm14, xmm14)

mov(var(cs_c), rsi) // load cs_c
lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float)

//lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c;
lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c;

lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;
//lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c;
//lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c;

// now avoid loading C if beta == 0

vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero.
vucomiss(xmm0, xmm3) // set ZF if beta == 0.
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case

cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
jz(.SCOLSTORED) // jump to column storage case

label(.SROWSTORED)

// Row-stored c, beta != 0: each accumulator is one 4-float row of the
// tile; add beta*c to it and store it back, stepping rcx by rs_c.

vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
vmovups(xmm4, mem(rcx, 0*32))
add(rdi, rcx)

vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
vmovups(xmm6, mem(rcx, 0*32))
add(rdi, rcx)

vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
vmovups(xmm8, mem(rcx, 0*32))
add(rdi, rcx)

vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
vmovups(xmm10, mem(rcx, 0*32))
add(rdi, rcx)

vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12)
vmovups(xmm12, mem(rcx, 0*32))
add(rdi, rcx)

vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14)
vmovups(xmm14, mem(rcx, 0*32))
//add(rdi, rcx)

jmp(.SDONE) // jump to end.

label(.SCOLSTORED)

// Column-stored c, beta != 0: rows 0-3 are rearranged in registers so
// that each store writes four consecutive rows of one column; rows 4-5
// are then written two elements at a time through rdx = c + 4*rs_c.

// begin I/O on columns 0-3
vunpcklps(xmm6, xmm4, xmm0)
vunpcklps(xmm10, xmm8, xmm1)
vshufps(imm(0x4e), xmm1, xmm0, xmm2)
vblendps(imm(0xcc), xmm2, xmm0, xmm0)
vblendps(imm(0x33), xmm2, xmm1, xmm1)

vfmadd231ps(mem(rcx ), xmm3, xmm0)
vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 )
vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 )

vunpckhps(xmm6, xmm4, xmm0)
vunpckhps(xmm10, xmm8, xmm1)
vshufps(imm(0x4e), xmm1, xmm0, xmm2)
vblendps(imm(0xcc), xmm2, xmm0, xmm0)
vblendps(imm(0x33), xmm2, xmm1, xmm1)

vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 )
vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1)
vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 )

//lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c

vunpcklps(xmm14, xmm12, xmm0)
vmovlpd(mem(rdx ), xmm1, xmm1)
vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1)
vfmadd231ps(xmm1, xmm3, xmm0)
vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 )
vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 )

vunpckhps(xmm14, xmm12, xmm0)
vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)
vmovhpd(mem(rdx, rax, 1), xmm1, xmm1)
vfmadd231ps(xmm1, xmm3, xmm0)
vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 )
vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 )

//lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c

jmp(.SDONE) // jump to end.

label(.SBETAZERO)

// beta == 0: c is only written, never read.

cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
jz(.SCOLSTORBZ) // jump to column storage case

label(.SROWSTORBZ)

vmovups(xmm4, mem(rcx, 0*32))
add(rdi, rcx)

vmovups(xmm6, mem(rcx, 0*32))
add(rdi, rcx)

vmovups(xmm8, mem(rcx, 0*32))
add(rdi, rcx)

vmovups(xmm10, mem(rcx, 0*32))
add(rdi, rcx)

vmovups(xmm12, mem(rcx, 0*32))
add(rdi, rcx)

vmovups(xmm14, mem(rcx, 0*32))
//add(rdi, rcx)

jmp(.SDONE) // jump to end.

label(.SCOLSTORBZ)

// begin I/O on columns 0-3
vunpcklps(ymm6, ymm4, ymm0)
vunpcklps(ymm10, ymm8, ymm1)
vshufps(imm(0x4e), ymm1, ymm0, ymm2)
vblendps(imm(0xcc), ymm2, ymm0, ymm0)
vblendps(imm(0x33), ymm2, ymm1, ymm1)

vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 )
vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 )

vunpckhps(ymm6, ymm4, ymm0)
vunpckhps(ymm10, ymm8, ymm1)
vshufps(imm(0x4e), ymm1, ymm0, ymm2)
vblendps(imm(0xcc), ymm2, ymm0, ymm0)
vblendps(imm(0x33), ymm2, ymm1, ymm1)

vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 )
vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 )

//lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c

vunpcklps(ymm14, ymm12, ymm0)
vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 )
vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 )

vunpckhps(ymm14, ymm12, ymm0)
vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 )
vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 )

//lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c

label(.SDONE)

// Advance to the next 6-row tile: c by six rows, a by one panel stride.

lea(mem(r12, rdi, 4), r12) // r12 += 4*rs_c
lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c

//lea(mem(r14, r8, 4), r14) //
//lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a
mov(var(ps_a4), rax) // load ps_a4
lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4

dec(r11) // ii -= 1;
jne(.SLOOP6X8I) // iterate again if ii != 0.

label(.SRETURN)

end_asm(
: // output operands (none)
: // input operands
  [m_iter] "m" (m_iter),
  [k_iter] "m" (k_iter),
  [k_left] "m" (k_left),
  [a] "m" (a),
  [rs_a] "m" (rs_a),
  [cs_a] "m" (cs_a),
  [ps_a4] "m" (ps_a4),
  [b] "m" (b),
  [rs_b] "m" (rs_b),
  [cs_b] "m" (cs_b),
  [alpha] "m" (alpha),
  [beta] "m" (beta),
  [c] "m" (c),
  [rs_c] "m" (rs_c),
  [cs_c] "m" (cs_c)/*,
  [a_next] "m" (a_next),
  [b_next] "m" (b_next)*/
: // register clobber list
  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
  "xmm0", "xmm1", "xmm2", "xmm3",
  "xmm4", "xmm5", "xmm6", "xmm7",
  "xmm8", "xmm9", "xmm10", "xmm11",
  "xmm12", "xmm13", "xmm14", "xmm15",
  "memory"
)

consider_edge_cases:

// Handle edge cases in the m dimension, if they exist.
if ( m_left )
{
	const dim_t nr_cur = 4;
	// Index of the first leftover row (rows before it were done in asm).
	const dim_t i_edge = m0 - ( dim_t )m_left;

	float* restrict cij = c + i_edge*rs_c;
	//float* restrict ai = a + i_edge*rs_a;
	//float* restrict ai = a + ( i_edge / 6 ) * ps_a;
	// Skip the m_iter panels of a already consumed by the asm region.
	float* restrict ai = a + m_iter * ps_a;
	float* restrict bj = b;

#if 0
	// We add special handling for slightly inflated MR blocksizes
	// at edge cases, up to a maximum of 9.
if ( 6 < m_left ) { sgemmsup_ker_ft ker_fp1 = NULL; sgemmsup_ker_ft ker_fp2 = NULL; dim_t mr1, mr2; if ( m_left == 7 ) { mr1 = 4; mr2 = 3; ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x4; ker_fp2 = bli_sgemmsup_rv_haswell_asm_3x4; } else if ( m_left == 8 ) { mr1 = 4; mr2 = 4; ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x4; ker_fp2 = bli_sgemmsup_rv_haswell_asm_4x4; } else // if ( m_left == 9 ) { mr1 = 4; mr2 = 5; ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x4; ker_fp2 = bli_sgemmsup_rv_haswell_asm_5x4; } ker_fp1 ( conja, conjb, mr1, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr1*rs_c0; ai += mr1*rs_a0; ker_fp2 ( conja, conjb, mr2, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } #endif sgemmsup_ker_ft ker_fps[6] = { NULL, bli_sgemmsup_rv_haswell_asm_1x4, bli_sgemmsup_rv_haswell_asm_2x4, bli_sgemmsup_rv_haswell_asm_3x4, bli_sgemmsup_rv_haswell_asm_4x4, bli_sgemmsup_rv_haswell_asm_5x4 }; sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } } void bli_sgemmsup_rv_haswell_asm_6x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_iter = m0 / 6; uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of A and convert it to units of bytes. uint64_t ps_a = bli_auxinfo_ps_a( data ); uint64_t ps_a4 = ps_a * sizeof( float ); if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.SLOOP6X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorps(ymm4, ymm4, ymm4) vxorps(ymm6, ymm6, ymm6) vxorps(ymm8, ymm8, ymm8) vxorps(ymm10, ymm10, ymm10) vxorps(ymm12, ymm12, ymm12) vxorps(ymm14, ymm14, ymm14) #endif mov(var(b), rbx) // load address of b. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1, 1*4)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2, 1*4)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 1*4)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 1*4)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 1*4)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) //lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 mov(var(ps_a4), rdx) // load ps_a4 lea(mem(rax, rdx, 1), rdx) // rdx = a + ps_a4 lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; // use rcx, rdx for prefetching lines // from next upanel of a. #else lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) // ---------------------------------- iteration 3 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, rcx, 1, 5*8)) lea(mem(rdx, r9, 4), 
rdx) // a_prefetch += 4*cs_a; #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) vmovsd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14) vmovsd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-1 vunpcklps(xmm6, xmm4, xmm0) vunpcklps(xmm10, xmm8, xmm1) vshufps(imm(0x4e), xmm1, xmm0, xmm2) vblendps(imm(0xcc), xmm2, xmm0, xmm0) vblendps(imm(0x33), xmm2, xmm1, xmm1) vfmadd231ps(mem(rcx ), xmm3, xmm0) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(xmm14, xmm12, xmm0) vmovlpd(mem(rdx ), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-3 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c label(.SDONE) lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c //lea(mem(r14, r8, 4), r14) // //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a mov(var(ps_a4), rax) // load ps_a4 lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 dec(r11) // ii -= 1; jne(.SLOOP6X8I) // iterate again if ii != 0. label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [ps_a4] "m" (ps_a4), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 2; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; //float* restrict ai = a + i_edge*rs_a; //float* restrict ai = a + ( i_edge / 6 ) * ps_a; float* restrict ai = a + m_iter * ps_a; float* restrict bj = b; #if 0 // We add special handling for slightly inflated MR blocksizes // at edge cases, up to a maximum of 9. if ( 6 < m_left ) { sgemmsup_ker_ft ker_fp1 = NULL; sgemmsup_ker_ft ker_fp2 = NULL; dim_t mr1, mr2; if ( m_left == 7 ) { mr1 = 4; mr2 = 3; ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x16; ker_fp2 = bli_sgemmsup_rv_haswell_asm_3x16; } else if ( m_left == 8 ) { mr1 = 4; mr2 = 4; ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x16; ker_fp2 = bli_sgemmsup_rv_haswell_asm_4x16; } else // if ( m_left == 9 ) { mr1 = 4; mr2 = 5; ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x16; ker_fp2 = bli_sgemmsup_rv_haswell_asm_5x16; } ker_fp1 ( conja, conjb, mr1, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr1*rs_c0; ai += mr1*rs_a0; ker_fp2 ( conja, conjb, mr2, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } #endif sgemmsup_ker_ft ker_fps[6] = { NULL, bli_sgemmsup_rv_haswell_asm_1x2, bli_sgemmsup_rv_haswell_asm_2x2, bli_sgemmsup_rv_haswell_asm_3x2, bli_sgemmsup_rv_haswell_asm_4x2, bli_sgemmsup_rv_haswell_asm_5x2 }; sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c000066400000000000000000004334401427272030600306030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. 
Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... -------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) void bli_sgemmsup_rv_haswell_asm_6x16n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t m_left = m0 % 6; // First check whether this is a edge case in the m dimension. If so, // dispatch other ?x8m kernels, as needed. if ( m_left ) { float* restrict cij = c; float* restrict bj = b; float* restrict ai = a; #if 1 // We add special handling for slightly inflated MR blocksizes // at edge cases, up to a maximum of 9. 
if ( 6 < m0 ) { sgemmsup_ker_ft ker_fp1 = NULL; sgemmsup_ker_ft ker_fp2 = NULL; dim_t mr1, mr2; if ( m0 == 7 ) { mr1 = 4; mr2 = 3; ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x16n; ker_fp2 = bli_sgemmsup_rv_haswell_asm_3x16n; } else if ( m0 == 8 ) { mr1 = 4; mr2 = 4; ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x16n; ker_fp2 = bli_sgemmsup_rv_haswell_asm_4x16n; } else // if ( m0 == 9 ) { mr1 = 4; mr2 = 5; ker_fp1 = bli_sgemmsup_rv_haswell_asm_4x16n; ker_fp2 = bli_sgemmsup_rv_haswell_asm_5x16n; } ker_fp1 ( conja, conjb, mr1, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr1*rs_c0; ai += mr1*rs_a0; ker_fp2 ( conja, conjb, mr2, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } #endif sgemmsup_ker_ft ker_fps[6] = { NULL, bli_sgemmsup_rv_haswell_asm_1x16n, bli_sgemmsup_rv_haswell_asm_2x16n, bli_sgemmsup_rv_haswell_asm_3x16n, bli_sgemmsup_rv_haswell_asm_4x16n, bli_sgemmsup_rv_haswell_asm_5x16n }; sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t n_iter = n0 / 16; uint64_t n_left = n0 % 16; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of B and convert it to units of bytes. uint64_t ps_b = bli_auxinfo_ps_b( data ); uint64_t ps_b4 = ps_b * sizeof( float ); if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rbx = b // read rax from var(a) near beginning of loop // r11 = m dim index ii mov(var(n_iter), r11) // jj = n_iter; label(.SLOOP6X8J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) #endif mov(var(a), rax) // load address of a. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rbx) // reset rbx to current upanel of b. cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 15*4)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1,15*4)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2,15*4)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; prefetch(0, mem(r12, 5*4)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 5*4)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 5*4)) // prefetch c + 2*cs_c prefetch(0, mem(r12, rcx, 1, 5*4)) // prefetch c + 3*cs_c prefetch(0, mem(r12, rsi, 4, 5*4)) // prefetch c + 4*cs_c lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 8*cs_c lea(mem(r12, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 11*cs_c prefetch(0, mem(rdx, rsi, 4, 5*4)) // prefetch c + 12*cs_c lea(mem(r12, rcx, 4), rdx) // rdx = c + 12*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 13*cs_c prefetch(0, mem(rdx, rsi, 2, 5*4)) // prefetch c + 14*cs_c prefetch(0, mem(rdx, rcx, 1, 5*4)) // prefetch c + 15*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 mov(var(ps_b4), rdx) // load ps_b4 lea(mem(rbx, rdx, 1), rdx) // rdx = a + ps_b4 lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; 
// use rcx, rdx for prefetching lines // from next upanel of b. #else lea(mem(rbx, r8, 8), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 1, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, 
ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, rcx, 1, 5*8)) lea(mem(rdx, r10, 4), rdx) // b_prefetch += 4*rs_b; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. 
// else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r10, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm11, ymm11) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm13, ymm13) vmulps(ymm0, ymm14, ymm14) vmulps(ymm0, ymm15, ymm15) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm15) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx ), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(mem(rdx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rdx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx, rsi, 2), xmm1, 
xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(mem(rdx, rax, 2), xmm1, xmm1) vmovhpd(mem(rdx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx ), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, 
xmm3, xmm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(mem(rdx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rdx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(mem(rdx, rax, 2), xmm1, xmm1) vmovhpd(mem(rdx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm12, mem(rcx, 0*32)) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm14, mem(rcx, 0*32)) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( 
gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c label(.SDONE) lea(mem(r12, rsi, 8), r12) // lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 16*cs_c //add(imm(8*8), r14) // b_jj = r14 += 8*cs_b mov(var(ps_b4), rbx) // load ps_b4 lea(mem(r14, rbx, 1), r14) // b_jj = r14 += ps_b4 dec(r11) // jj -= 1; jne(.SLOOP6X8J) // iterate again if jj != 0. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [ps_b4] "m" (ps_b4), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 6; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; //float* restrict bj = b + j_edge*cs_b; //float* restrict bj = b + ( j_edge / 8 ) * ps_b; float* restrict bj = b + n_iter * ps_b; if ( 12 <= n_left ) { const dim_t nr_cur = 12; bli_sgemmsup_rv_haswell_asm_6x12 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 8 <= n_left ) { const dim_t nr_cur = 8; bli_sgemmsup_rv_haswell_asm_6x8 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 6 <= n_left ) { const dim_t nr_cur = 6; bli_sgemmsup_rv_haswell_asm_6x6 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_sgemmsup_rv_haswell_asm_6x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); 
cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rv_haswell_asm_6x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 1 const dim_t nr_cur = 1; bli_sgemmsup_r_haswell_ref_6x1 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } } } void bli_sgemmsup_rv_haswell_asm_5x16n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t n_iter = n0 / 16; uint64_t n_left = n0 % 16; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of B and convert it to units of bytes. uint64_t ps_b = bli_auxinfo_ps_b( data ); uint64_t ps_b4 = ps_b * sizeof( float ); if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rbx = b // read rax from var(a) near beginning of loop // r11 = m dim index ii mov(var(n_iter), r11) // jj = n_iter; label(.SLOOP6X8J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) #endif mov(var(a), rax) // load address of a. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rbx) // reset rbx to current upanel of b. cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 15*4)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1,15*4)) // prefetch c + 4*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; prefetch(0, mem(r12, 4*4)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 4*4)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 4*4)) // prefetch c + 2*cs_c prefetch(0, mem(r12, rcx, 1, 4*4)) // prefetch c + 3*cs_c prefetch(0, mem(r12, rsi, 4, 4*4)) // prefetch c + 4*cs_c lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 4*4)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 4*4)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rcx, 1, 4*4)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 4*4)) // prefetch c + 8*cs_c lea(mem(r12, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 4*4)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 4*4)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rcx, 1, 4*4)) // prefetch c + 11*cs_c prefetch(0, mem(rdx, rsi, 4, 4*4)) // prefetch c + 12*cs_c lea(mem(r12, rcx, 4), rdx) // rdx = c + 12*cs_c; prefetch(0, mem(rdx, rsi, 1, 4*4)) // prefetch c + 13*cs_c prefetch(0, mem(rdx, rsi, 2, 4*4)) // prefetch c + 14*cs_c prefetch(0, mem(rdx, rcx, 1, 4*4)) // prefetch c + 15*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 mov(var(ps_b4), rdx) // load ps_b4 lea(mem(rbx, rdx, 1), rdx) // rdx = a + ps_b4 lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; // use rcx, rdx for prefetching lines // from next upanel 
of b. #else lea(mem(rbx, r8, 8), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 1, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) 
add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) // ---------------------------------- iteration 3 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, rcx, 1, 5*8)) lea(mem(rdx, r10, 4), rdx) // b_prefetch += 4*rs_b; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r10, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm11, ymm11) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm13, ymm13) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13) vmovups(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), 
ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rdx, rsi, 2), xmm1) vmovss(mem(rdx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rdx, rsi, 4), xmm1) vmovss(mem(rdx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rdx, rax, 2), xmm1) vmovss(mem(rdx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) 
vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rdx, rsi, 2), xmm1) vmovss(mem(rdx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rdx, rsi, 4), xmm1) vmovss(mem(rdx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rdx, rax, 2), xmm1) vmovss(mem(rdx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) 
vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm12, mem(rcx, 0*32)) vmovups(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) 
// store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( 
gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c label(.SDONE) lea(mem(r12, rsi, 8), r12) // lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 16*cs_c //add(imm(8*8), r14) // b_jj = r14 += 8*cs_b mov(var(ps_b4), rbx) // load ps_b4 lea(mem(r14, rbx, 1), r14) // b_jj = r14 += ps_b4 dec(r11) // jj -= 1; jne(.SLOOP6X8J) // iterate again if jj != 0. label(.SRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [ps_b4] "m" (ps_b4), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( n_left ) { const dim_t mr_cur = 5; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; //float* restrict bj = b + j_edge*cs_b; //float* restrict bj = b + ( j_edge / 8 ) * ps_b; float* restrict bj = b + n_iter * ps_b; if ( 12 <= n_left ) { const dim_t nr_cur = 12; bli_sgemmsup_rv_haswell_asm_5x12 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 8 <= n_left ) { const dim_t nr_cur = 8; bli_sgemmsup_rv_haswell_asm_5x8 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 6 <= n_left ) { const dim_t nr_cur = 6; bli_sgemmsup_rv_haswell_asm_5x6 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_sgemmsup_rv_haswell_asm_5x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rv_haswell_asm_5x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 1 const dim_t nr_cur = 1; bli_sgemmsup_r_haswell_ref_5x1 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } } }

/*
   bli_sgemmsup_rv_haswell_asm_4x16n

   Single-precision gemmsup kernel that updates a block of C with four rows
   and n0 columns:

       C := beta * C + alpha * A * B

   The n dimension is walked in panels of 16 columns by the inline assembly
   below (n_iter = n0 / 16 panels); the remaining n0 % 16 columns are handed
   to narrower 4xN kernels after the assembly region.

   Parameters:
     conja, conjb   Only forwarded to the edge-case kernels; the assembly
                    region does not read them.
     m0             Not read by the active code path: the row count is fixed
                    at 4 (see mr_cur below). It is referenced only by the
                    disabled gemv fallback.
     n0, k0         Column count of C/B and the inner (k) dimension.
     alpha, beta    Pointers to the scalars. When *beta compares equal to
                    zero, C is overwritten without being read.
     a, rs_a0, cs_a0
     b, rs_b0, cs_b0
     c, rs_c0, cs_c0   Operand base pointers with row/column strides in
                       units of elements. cs_b is passed to the assembly as
                       an operand but is not loaded by any active
                       instruction.
     data           Supplies the panel stride of B via bli_auxinfo_ps_b();
                    also forwarded to the edge-case kernels.
     cntx           Forwarded to the edge-case kernels.

   Register roles inside the assembly region:
     r12 / rcx   current micro-tile of C (r12 persists across panels)
     r14 / rbx   current micro-panel of B (r14 persists across panels)
     rax         current column of A (reloaded from var(a) for each panel)
     r8, r9      rs_a, cs_a in bytes;   r13 = 3*rs_a
     r10         rs_b in bytes
     rdi         rs_c in bytes
     r11         panel counter jj (counts down from n_iter)
     ymm4..ymm11 accumulators: rows 0..3 of C, two ymm registers per row
                 (even register = columns 0-7, odd register = columns 8-15)

   NOTE: the loop label .SLOOP6X8J keeps its historical name even though
   this kernel computes a 4x16 micro-tile per iteration.
*/
void bli_sgemmsup_rv_haswell_asm_4x16n
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void*    a_next = bli_auxinfo_next_a( data );
    //void*    b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4;   // number of 4x-unrolled k iterations
    uint64_t k_left = k0 % 4;   // leftover k iterations

    uint64_t n_iter = n0 / 16;  // number of full 16-column panels
    uint64_t n_left = n0 % 16;  // leftover columns handled after the asm

    uint64_t rs_a   = rs_a0;
    uint64_t cs_a   = cs_a0;
    uint64_t rs_b   = rs_b0;
    uint64_t cs_b   = cs_b0;
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;

    // Query the panel stride of B and convert it to units of bytes.
    uint64_t ps_b   = bli_auxinfo_ps_b( data );
    uint64_t ps_b4  = ps_b * sizeof( float );

    // Fewer than 16 columns: nothing for the assembly region to do.
    if ( n_iter == 0 ) goto consider_edge_cases;

    // -------------------------------------------------------------------------

    begin_asm()

    //vzeroall()                         // zero all xmm/ymm registers.

    //mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
    lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)

    lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), r14)                   // load address of b.
    mov(var(rs_b), r10)                // load rs_b
    //mov(var(cs_b), r11)                // load cs_b
    lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
    //lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds allocated mem
    // (the likely result: a segmentation fault).

    mov(var(c), r12)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)

    // During preamble and loops:
    // r12 = rcx = c
    // r14 = rbx = b
    // read rax from var(a) near beginning of loop
    // r11 = n dim panel counter jj

    mov(var(n_iter), r11)              // jj = n_iter;

    label(.SLOOP6X8J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]

#if 0
    vzeroall()                         // zero all xmm/ymm registers.
#else
    // skylake can execute 3 vxorps ipc with
    // a latency of 1 cycle, while vzeroall
    // has a latency of 12 cycles.
    vxorps(ymm4,  ymm4,  ymm4)
    vxorps(ymm5,  ymm5,  ymm5)
    vxorps(ymm6,  ymm6,  ymm6)
    vxorps(ymm7,  ymm7,  ymm7)
    vxorps(ymm8,  ymm8,  ymm8)
    vxorps(ymm9,  ymm9,  ymm9)
    vxorps(ymm10, ymm10, ymm10)
    vxorps(ymm11, ymm11, ymm11)
#endif

    mov(var(a), rax)                   // load address of a.

    //mov(r12, rcx)                      // reset rcx to current utile of c.
    mov(r14, rbx)                      // reset rbx to current upanel of b.

    cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
    jz(.SCOLPFETCH)                    // jump to column storage case
    label(.SROWPFETCH)                 // row-stored prefetching on c

    lea(mem(r12, rdi, 2), rdx)         //
    lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
    prefetch(0, mem(r12, 15*4))         // prefetch c + 0*rs_c
    prefetch(0, mem(r12, rdi, 1, 15*4)) // prefetch c + 1*rs_c
    prefetch(0, mem(r12, rdi, 2, 15*4)) // prefetch c + 2*rs_c
    prefetch(0, mem(rdx, 15*4))         // prefetch c + 3*rs_c

    jmp(.SPOSTPFETCH)                  // jump to end of prefetching c
    label(.SCOLPFETCH)                 // column-stored prefetching c

    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 4), rsi)            // cs_c *= sizeof(float)
    lea(mem(rsi, rsi, 2), rcx)         // rcx = 3*cs_c;
    prefetch(0, mem(r12, 3*4))         // prefetch c + 0*cs_c
    prefetch(0, mem(r12, rsi, 1, 3*4)) // prefetch c + 1*cs_c
    prefetch(0, mem(r12, rsi, 2, 3*4)) // prefetch c + 2*cs_c
    prefetch(0, mem(r12, rcx, 1, 3*4)) // prefetch c + 3*cs_c
    prefetch(0, mem(r12, rsi, 4, 3*4)) // prefetch c + 4*cs_c
    lea(mem(r12, rsi, 4), rdx)         // rdx = c + 4*cs_c;
    prefetch(0, mem(rdx, rsi, 1, 3*4)) // prefetch c + 5*cs_c
    prefetch(0, mem(rdx, rsi, 2, 3*4)) // prefetch c + 6*cs_c
    prefetch(0, mem(rdx, rcx, 1, 3*4)) // prefetch c + 7*cs_c
    prefetch(0, mem(rdx, rsi, 4, 3*4)) // prefetch c + 8*cs_c
    lea(mem(r12, rsi, 8), rdx)         // rdx = c + 8*cs_c;
    prefetch(0, mem(rdx, rsi, 1, 3*4)) // prefetch c + 9*cs_c
    prefetch(0, mem(rdx, rsi, 2, 3*4)) // prefetch c + 10*cs_c
    prefetch(0, mem(rdx, rcx, 1, 3*4)) // prefetch c + 11*cs_c
    prefetch(0, mem(rdx, rsi, 4, 3*4)) // prefetch c + 12*cs_c
    lea(mem(r12, rcx, 4), rdx)         // rdx = c + 12*cs_c;
    prefetch(0, mem(rdx, rsi, 1, 3*4)) // prefetch c + 13*cs_c
    prefetch(0, mem(rdx, rsi, 2, 3*4)) // prefetch c + 14*cs_c
    prefetch(0, mem(rdx, rcx, 1, 3*4)) // prefetch c + 15*cs_c

    label(.SPOSTPFETCH)                // done prefetching c

#if 1
    mov(var(ps_b4), rdx)               // load ps_b4
    lea(mem(rbx, rdx, 1), rdx)         // rdx = b + ps_b4
    lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
                                       // use rcx, rdx for prefetching lines
                                       // from next upanel of b.
#else
    lea(mem(rbx, r8,  8), rdx)         // use rdx for prefetching lines
    lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
    lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
#endif

    mov(var(k_iter), rsi)              // i = k_iter;
    test(rsi, rsi)                     // check i via logical AND.
    je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
                                       // contains the k_left loop.

    label(.SLOOPKITER)                 // MAIN LOOP

    // Each iteration below performs one rank-1 update: it loads one row of
    // the B micro-panel (16 floats in ymm0/ymm1), broadcasts the four
    // elements of the current column of A, and accumulates into ymm4..ymm11.

    // ---------------------------------- iteration 0

#if 0
    prefetch(0, mem(rdx, 5*8))
#else
    prefetch(0, mem(rdx, 5*8))
#endif

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastss(mem(rax), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vbroadcastss(mem(rax, r8, 2), ymm2)
    vbroadcastss(mem(rax, r13, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm8)
    vfmadd231ps(ymm1, ymm2, ymm9)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    // ---------------------------------- iteration 1

#if 0
    prefetch(0, mem(rdx, 5*8))
#else
    prefetch(0, mem(rdx, r10, 1, 5*8))
#endif

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastss(mem(rax), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vbroadcastss(mem(rax, r8, 2), ymm2)
    vbroadcastss(mem(rax, r13, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm8)
    vfmadd231ps(ymm1, ymm2, ymm9)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    // ---------------------------------- iteration 2

#if 0
    prefetch(0, mem(rdx, 5*8))
#else
    prefetch(0, mem(rdx, r10, 2, 5*8))
#endif

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastss(mem(rax), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vbroadcastss(mem(rax, r8, 2), ymm2)
    vbroadcastss(mem(rax, r13, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm8)
    vfmadd231ps(ymm1, ymm2, ymm9)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    // ---------------------------------- iteration 3

#if 0
    prefetch(0, mem(rdx, 5*8))
#else
    prefetch(0, mem(rdx, rcx, 1, 5*8))
    lea(mem(rdx, r10, 4), rdx)         // b_prefetch += 4*rs_b;
#endif

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastss(mem(rax), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vbroadcastss(mem(rax, r8, 2), ymm2)
    vbroadcastss(mem(rax, r13, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm8)
    vfmadd231ps(ymm1, ymm2, ymm9)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    dec(rsi)                           // i -= 1;
    jne(.SLOOPKITER)                   // iterate again if i != 0.

    label(.SCONSIDKLEFT)

    mov(var(k_left), rsi)              // i = k_left;
    test(rsi, rsi)                     // check i via logical AND.
    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left loop.

    label(.SLOOPKLEFT)                 // EDGE LOOP

#if 1
    prefetch(0, mem(rdx, 5*8))
    add(r10, rdx)
#endif

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastss(mem(rax), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vbroadcastss(mem(rax, r8, 2), ymm2)
    vbroadcastss(mem(rax, r13, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm8)
    vfmadd231ps(ymm1, ymm2, ymm9)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    dec(rsi)                           // i -= 1;
    jne(.SLOOPKLEFT)                   // iterate again if i != 0.

    label(.SPOSTACCUM)

    mov(r12, rcx)                      // reset rcx to current utile of c.
    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate

    vmulps(ymm0, ymm4, ymm4)           // scale by alpha
    vmulps(ymm0, ymm5, ymm5)
    vmulps(ymm0, ymm6, ymm6)
    vmulps(ymm0, ymm7, ymm7)
    vmulps(ymm0, ymm8, ymm8)
    vmulps(ymm0, ymm9, ymm9)
    vmulps(ymm0, ymm10, ymm10)
    vmulps(ymm0, ymm11, ymm11)

    mov(var(cs_c), rsi)                // load cs_c
    lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)

    //lea(mem(rcx, rsi, 4), rdx)         // load address of c + 4*cs_c;
    //lea(mem(rcx, rdi, 4), rdx)         // load address of c + 4*rs_c;

    lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
    lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
    lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;

    // now avoid loading C if beta == 0

    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
    jz(.SCOLSTORED)                    // jump to column storage case

    label(.SROWSTORED)

    // Row storage: each row of the micro-tile is two contiguous 8-float
    // vectors; fold in beta*C and write back, one row at a time.

    vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
    vmovups(ymm4, mem(rcx, 0*32))

    vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5)
    vmovups(ymm5, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
    vmovups(ymm6, mem(rcx, 0*32))

    vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7)
    vmovups(ymm7, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
    vmovups(ymm8, mem(rcx, 0*32))

    vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9)
    vmovups(ymm9, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
    vmovups(ymm10, mem(rcx, 0*32))

    vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11)
    vmovups(ymm11, mem(rcx, 1*32))
    //add(rdi, rcx)

    jmp(.SDONE)                        // jump to end.

    label(.SCOLSTORED)

    // Column storage: rearrange the four row accumulators so that each xmm
    // register holds four rows of a single column, then update one column
    // (four floats) at a time.

    // begin I/O on columns 0-7
    vunpcklps(ymm6, ymm4, ymm0)
    vunpcklps(ymm10, ymm8, ymm1)
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vfmadd231ps(mem(rcx), xmm3, xmm0)
    vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
    vmovups(xmm0, mem(rcx))            // store ( gamma00..gamma30 )
    vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )

    vextractf128(imm(0x1), ymm1, xmm2)
    vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
    vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2)
    vmovups(xmm1, mem(rcx, rsi, 1))    // store ( gamma01..gamma31 )
    vmovups(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma35 )

    vunpckhps(ymm6, ymm4, ymm0)
    vunpckhps(ymm10, ymm8, ymm1)
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
    vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2)
    vmovups(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma32 )
    vmovups(xmm2, mem(rcx, rax, 2))    // store ( gamma06..gamma36 )

    vextractf128(imm(0x1), ymm1, xmm2)
    vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1)
    vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2)
    vmovups(xmm1, mem(rcx, rax, 1))    // store ( gamma03..gamma33 )
    vmovups(xmm2, mem(rcx, rbp, 1))    // store ( gamma07..gamma37 )

    lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

    // begin I/O on columns 8-15
    // NOTE: the gamma column indices in the comments below are relative to
    // the advanced rcx, i.e. "gamma0j" here refers to column 8+j of the tile.
    vunpcklps(ymm7, ymm5, ymm0)
    vunpcklps(ymm11, ymm9, ymm1)
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vfmadd231ps(mem(rcx), xmm3, xmm0)
    vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
    vmovups(xmm0, mem(rcx))            // store ( gamma00..gamma30 )
    vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )

    vextractf128(imm(0x1), ymm1, xmm2)
    vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
    vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2)
    vmovups(xmm1, mem(rcx, rsi, 1))    // store ( gamma01..gamma31 )
    vmovups(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma35 )

    vunpckhps(ymm7, ymm5, ymm0)
    vunpckhps(ymm11, ymm9, ymm1)
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
    vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2)
    vmovups(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma32 )
    vmovups(xmm2, mem(rcx, rax, 2))    // store ( gamma06..gamma36 )

    vextractf128(imm(0x1), ymm1, xmm2)
    vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1)
    vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2)
    vmovups(xmm1, mem(rcx, rax, 1))    // store ( gamma03..gamma33 )
    vmovups(xmm2, mem(rcx, rbp, 1))    // store ( gamma07..gamma37 )

    //lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

    jmp(.SDONE)                        // jump to end.

    label(.SBETAZERO)

    // beta == 0: same two storage cases as above, but C is only written.

    cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
    jz(.SCOLSTORBZ)                    // jump to column storage case

    label(.SROWSTORBZ)

    vmovups(ymm4, mem(rcx, 0*32))
    vmovups(ymm5, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovups(ymm6, mem(rcx, 0*32))
    vmovups(ymm7, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovups(ymm8, mem(rcx, 0*32))
    vmovups(ymm9, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovups(ymm10, mem(rcx, 0*32))
    vmovups(ymm11, mem(rcx, 1*32))
    //add(rdi, rcx)

    jmp(.SDONE)                        // jump to end.

    label(.SCOLSTORBZ)

    // begin I/O on columns 0-7
    vunpcklps(ymm6, ymm4, ymm0)
    vunpcklps(ymm10, ymm8, ymm1)
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vmovups(xmm0, mem(rcx))            // store ( gamma00..gamma30 )
    vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )

    vextractf128(imm(0x1), ymm1, xmm2)
    vmovups(xmm1, mem(rcx, rsi, 1))    // store ( gamma01..gamma31 )
    vmovups(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma35 )

    vunpckhps(ymm6, ymm4, ymm0)
    vunpckhps(ymm10, ymm8, ymm1)
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vmovups(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma32 )
    vmovups(xmm2, mem(rcx, rax, 2))    // store ( gamma06..gamma36 )

    vextractf128(imm(0x1), ymm1, xmm2)
    vmovups(xmm1, mem(rcx, rax, 1))    // store ( gamma03..gamma33 )
    vmovups(xmm2, mem(rcx, rbp, 1))    // store ( gamma07..gamma37 )

    lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

    // begin I/O on columns 8-15
    // NOTE: as in the beta != 0 case, the gamma column indices below are
    // relative to the advanced rcx (column 8 of the tile).
    vunpcklps(ymm7, ymm5, ymm0)
    vunpcklps(ymm11, ymm9, ymm1)
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vmovups(xmm0, mem(rcx))            // store ( gamma00..gamma30 )
    vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )

    vextractf128(imm(0x1), ymm1, xmm2)
    vmovups(xmm1, mem(rcx, rsi, 1))    // store ( gamma01..gamma31 )
    vmovups(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma35 )

    vunpckhps(ymm7, ymm5, ymm0)
    vunpckhps(ymm11, ymm9, ymm1)
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vmovups(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma32 )
    vmovups(xmm2, mem(rcx, rax, 2))    // store ( gamma06..gamma36 )

    vextractf128(imm(0x1), ymm1, xmm2)
    vmovups(xmm1, mem(rcx, rax, 1))    // store ( gamma03..gamma33 )
    vmovups(xmm2, mem(rcx, rbp, 1))    // store ( gamma07..gamma37 )

    //lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

    label(.SDONE)

    lea(mem(r12, rsi, 8), r12)         //
    lea(mem(r12, rsi, 8), r12)         // c_jj = r12 += 16*cs_c

    //add(imm(8*8), r14)                 // b_jj = r14 += 8*cs_b
    mov(var(ps_b4), rbx)               // load ps_b4
    lea(mem(r14, rbx, 1), r14)         // b_jj = r14 += ps_b4

    dec(r11)                           // jj -= 1;
    jne(.SLOOP6X8J)                    // iterate again if jj != 0.

    label(.SRETURN)

    end_asm(
    : // output operands (none)
    : // input operands
      [n_iter] "m" (n_iter),
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a]      "m" (a),
      [rs_a]   "m" (rs_a),
      [cs_a]   "m" (cs_a),
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
      [ps_b4]  "m" (ps_b4),
      [alpha]  "m" (alpha),
      [beta]   "m" (beta),
      [c]      "m" (c),
      [rs_c]   "m" (rs_c),
      [cs_c]   "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )

    consider_edge_cases:

    // Handle edge cases in the n dimension, if they exist. The leftover
    // n_left (< 16) columns are peeled off greedily with progressively
    // narrower kernels: 12, 8, 6, 4, 2 and finally 1 column.
    if ( n_left )
    {
        const dim_t mr_cur = 4;
        const dim_t j_edge = n0 - ( dim_t )n_left;

        // Use the signed stride cs_c0 here (not the uint64_t copy made for
        // the assembly) so the offset is computed in signed arithmetic,
        // consistent with the cij advances below.
        float* restrict cij = c + j_edge*cs_c0;
        float* restrict ai  = a;
        //float* restrict bj  = b + j_edge*cs_b;
        //float* restrict bj  = b + ( j_edge / 8 ) * ps_b;
        float* restrict bj  = b + n_iter * ps_b;

        if ( 12 <= n_left )
        {
            const dim_t nr_cur = 12;

            bli_sgemmsup_rv_haswell_asm_4x12
            (
              conja, conjb, mr_cur, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
        }

        if ( 8 <= n_left )
        {
            const dim_t nr_cur = 8;

            bli_sgemmsup_rv_haswell_asm_4x8
            (
              conja, conjb, mr_cur, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
        }

        if ( 6 <= n_left )
        {
            const dim_t nr_cur = 6;

            bli_sgemmsup_rv_haswell_asm_4x6
            (
              conja, conjb, mr_cur, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
        }

        if ( 4 <= n_left )
        {
            const dim_t nr_cur = 4;

            bli_sgemmsup_rv_haswell_asm_4x4
            (
              conja, conjb, mr_cur, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
        }

        if ( 2 <= n_left )
        {
            const dim_t nr_cur = 2;

            bli_sgemmsup_rv_haswell_asm_4x2
            (
              conja, conjb, mr_cur, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
        }

        if ( 1 == n_left )
        {
#if 1
            const dim_t nr_cur = 1;

            bli_sgemmsup_r_haswell_ref_4x1
            (
              conja, conjb, mr_cur, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
#else
            // Disabled alternative: treat the last column as a
            // matrix-vector product. This must be the single-precision
            // routine, since every operand here is float.
            bli_sgemv_ex
            (
              BLIS_NO_TRANSPOSE, conjb, m0, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
              beta, cij, rs_c0, cntx, NULL
            );
#endif
        }
    }
}

void bli_sgemmsup_rv_haswell_asm_3x16n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, 
float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t n_iter = n0 / 16; uint64_t n_left = n0 % 16; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of B and convert it to units of bytes. uint64_t ps_b = bli_auxinfo_ps_b( data ); uint64_t ps_b4 = ps_b * sizeof( float ); if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). 
mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rbx = b // read rax from var(a) near beginning of loop // r11 = m dim index ii mov(var(n_iter), r11) // jj = n_iter; label(.SLOOP6X8J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) #endif mov(var(a), rax) // load address of a. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rbx) // reset rbx to current upanel of b. cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c //lea(mem(r12, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2,15*4)) // prefetch c + 2*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; prefetch(0, mem(r12, 2*4)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 2*4)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 2*4)) // prefetch c + 2*cs_c prefetch(0, mem(r12, rcx, 1, 2*4)) // prefetch c + 3*cs_c prefetch(0, mem(r12, rsi, 4, 2*4)) // prefetch c + 4*cs_c lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 2*4)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 2*4)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rcx, 1, 2*4)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 2*4)) // prefetch c + 8*cs_c lea(mem(r12, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 2*4)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 2*4)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rcx, 1, 2*4)) // prefetch c + 11*cs_c prefetch(0, mem(rdx, rsi, 4, 2*4)) // prefetch c + 12*cs_c lea(mem(r12, rcx, 4), rdx) // rdx = c + 12*cs_c; prefetch(0, mem(rdx, rsi, 1, 2*4)) // prefetch c + 13*cs_c prefetch(0, mem(rdx, rsi, 2, 2*4)) // prefetch c + 14*cs_c prefetch(0, mem(rdx, rcx, 1, 2*4)) // prefetch c + 15*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 mov(var(ps_b4), rdx) // load ps_b4 lea(mem(rbx, rdx, 1), rdx) // rdx = a + ps_b4 lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; // use rcx, rdx for prefetching lines // from next upanel of b. 
#else lea(mem(rbx, r8, 8), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 1, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) // ---------------------------------- iteration 3 #if 0 
prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, rcx, 1, 5*8)) lea(mem(rdx, r10, 4), rdx) // b_prefetch += 4*rs_b; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r10, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(mem(rcx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rcx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(mem(rcx, rax, 2), xmm1, xmm1) vmovhpd(mem(rcx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm8, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rdx, rsi, 2), xmm1) vmovss(mem(rdx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rdx, rsi, 4), xmm1) vmovss(mem(rdx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) 
vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rdx, rax, 2), xmm1) vmovss(mem(rdx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(mem(rcx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rcx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm7, ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(mem(rcx, rax, 2), xmm1, xmm1) vmovhpd(mem(rcx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm9, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rdx, rsi, 2), xmm1) vmovss(mem(rdx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) 
vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rdx, rsi, 4), xmm1) vmovss(mem(rdx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rdx, rax, 2), xmm1) vmovss(mem(rdx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm8, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm7, ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) 
vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm9, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) //lea(mem(rcx, rsi, 4), rcx) label(.SDONE) lea(mem(r12, rsi, 8), r12) // lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 16*cs_c //add(imm(8*8), r14) // b_jj = r14 += 8*cs_b mov(var(ps_b4), rbx) // load ps_b4 lea(mem(r14, rbx, 1), r14) // b_jj = r14 += ps_b4 dec(r11) // jj -= 1; jne(.SLOOP6X8J) // iterate again if jj != 0. label(.SRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [ps_b4] "m" (ps_b4), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
/*
   bli_sgemmsup_rv_haswell_asm_2x16n

   Single-precision "sup" kernel that updates a 2 x n0 block of C as

       C := beta * C + alpha * A * B

   The n dimension is walked in panels of 16 columns by the inline
   assembly below (n0 / 16 iterations); the remaining n0 % 16 columns are
   delegated to the narrower 2x12 / 2x8 / 2x6 / 2x4 / 2x2 / 2x1 kernels.
   The k dimension is unrolled by four (k0 / 4 iterations) followed by a
   k0 % 4 cleanup loop.

   Parameters:
     conja, conjb  Not read by the assembly; only forwarded to the
                   edge-case kernels.
     m0            Not read on any enabled path. The kernel always works
                   on exactly two rows (presumably callers guarantee
                   m0 == 2 -- confirm against the dispatcher).
     n0            Number of columns of B and C to process.
     k0            Inner (shared) dimension.
     alpha, beta   Pointers to the float scalars.
     a             Address of A; rs_a0 / cs_a0 are its strides in elements
                   (scaled to bytes inside the assembly).
     b             Address of B; rs_b0 is its row stride in elements. Each
                   k step loads 16 consecutive floats from the current
                   row, and cs_b is handed to the asm block but never read
                   there. Successive 16-column panels are ps_b elements
                   apart, where ps_b is queried from `data`.
     c             Address of C with strides rs_c0 / cs_c0 in elements.
                   rs_c0 == 1 selects the column-stored code paths; every
                   other value takes the row-stored paths, which read and
                   write 16 consecutive floats per row (i.e. they presume
                   cs_c0 == 1 -- there is no general-stride path here).
     data          auxinfo_t supplying the panel stride of B; also
                   forwarded to the edge-case kernels.
     cntx          Forwarded to the edge-case kernels.

   Accumulator layout inside the assembly:
     ymm4 / ymm5   row 0 of the tile, columns 0-7 / 8-15
     ymm6 / ymm7   row 1 of the tile, columns 0-7 / 8-15
*/
void bli_sgemmsup_rv_haswell_asm_2x16n
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t n_iter = n0 / 16;
	uint64_t n_left = n0 % 16;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// Query the panel stride of B and convert it to units of bytes.
	uint64_t ps_b   = bli_auxinfo_ps_b( data );
	uint64_t ps_b4  = ps_b * sizeof( float );

	// With fewer than 16 columns there is no full panel for the assembly
	// to process; go straight to the narrow edge-case kernels.
	if ( n_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()                         // zero all xmm/ymm registers.

	//mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)

	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), r14)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)                // load cs_b
	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)

	                                   // NOTE: We cannot pre-load elements of a or b
	                                   // because it could eventually, in the last
	                                   // unrolled iter or the cleanup loop, result
	                                   // in reading beyond the bounds allocated mem
	                                   // (the likely result: a segmentation fault).

	mov(var(c), r12)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)

	// During preamble and loops:
	// r12 = rcx = c (start of the current 16-column panel of C)
	// r14 = rbx = b (start of the current 16-column panel of B)
	// read rax from var(a) near beginning of loop
	// r11 = jj, the number of 16-column panels still to process

	mov(var(n_iter), r11)              // jj = n_iter;

	// NOTE: the label name below is historical; the tile computed per
	// iteration of this loop is 2 rows by 16 columns.
	label(.SLOOP6X8J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]

#if 0
	vzeroall()                         // zero all xmm/ymm registers.
#else
	                                   // skylake can execute 3 vxorps ipc with
	                                   // a latency of 1 cycle, while vzeroall
	                                   // has a latency of 12 cycles.
	vxorps(ymm4,  ymm4,  ymm4)         // clear the four accumulators
	vxorps(ymm5,  ymm5,  ymm5)
	vxorps(ymm6,  ymm6,  ymm6)
	vxorps(ymm7,  ymm7,  ymm7)
#endif

	mov(var(a), rax)                   // load address of a.

	//mov(r12, rcx)                      // reset rcx to current utile of c.
	mov(r14, rbx)                      // reset rbx to current upanel of b.

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLPFETCH)                    // jump to column storage case
	label(.SROWPFETCH)                 // row-stored prefetching on c

	//lea(mem(r12, rdi, 2), rdx)         //
	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(r12,         15*4)) // prefetch c + 0*rs_c
	prefetch(0, mem(r12, rdi, 1,15*4)) // prefetch c + 1*rs_c

	jmp(.SPOSTPFETCH)                  // jump to end of prefetching c
	label(.SCOLPFETCH)                 // column-stored prefetching c

	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 4), rsi)            // cs_c *= sizeof(float)
	lea(mem(rsi, rsi, 2), rcx)         // rcx = 3*cs_c;
	prefetch(0, mem(r12,         1*4)) // prefetch c + 0*cs_c
	prefetch(0, mem(r12, rsi, 1, 1*4)) // prefetch c + 1*cs_c
	prefetch(0, mem(r12, rsi, 2, 1*4)) // prefetch c + 2*cs_c
	prefetch(0, mem(r12, rcx, 1, 1*4)) // prefetch c + 3*cs_c
	prefetch(0, mem(r12, rsi, 4, 1*4)) // prefetch c + 4*cs_c
	lea(mem(r12, rsi, 4), rdx)         // rdx = c + 4*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 1*4)) // prefetch c + 5*cs_c
	prefetch(0, mem(rdx, rsi, 2, 1*4)) // prefetch c + 6*cs_c
	prefetch(0, mem(rdx, rcx, 1, 1*4)) // prefetch c + 7*cs_c
	prefetch(0, mem(rdx, rsi, 4, 1*4)) // prefetch c + 8*cs_c
	lea(mem(r12, rsi, 8), rdx)         // rdx = c + 8*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 1*4)) // prefetch c + 9*cs_c
	prefetch(0, mem(rdx, rsi, 2, 1*4)) // prefetch c + 10*cs_c
	prefetch(0, mem(rdx, rcx, 1, 1*4)) // prefetch c + 11*cs_c
	prefetch(0, mem(rdx, rsi, 4, 1*4)) // prefetch c + 12*cs_c
	lea(mem(r12, rcx, 4), rdx)         // rdx = c + 12*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 1*4)) // prefetch c + 13*cs_c
	prefetch(0, mem(rdx, rsi, 2, 1*4)) // prefetch c + 14*cs_c
	prefetch(0, mem(rdx, rcx, 1, 1*4)) // prefetch c + 15*cs_c

	label(.SPOSTPFETCH)                // done prefetching c

#if 1
	mov(var(ps_b4), rdx)               // load ps_b4
	lea(mem(rbx, rdx, 1), rdx)         // rdx = b + ps_b4 (rbx holds the
	                                   // current upanel of b, not a)
	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
	                                   // use rcx, rdx for prefetching lines
	                                   // from next upanel of b.
#else
	lea(mem(rbx, r8,  8), rdx)         // use rdx for prefetching lines
	lea(mem(rdx, r8,  8), rdx)         // from next upanel of b.
	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
#endif

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.

	label(.SLOOPKITER)                 // MAIN LOOP

	// Each unrolled iteration loads one 16-float row of the B panel into
	// ymm0/ymm1, broadcasts the two A elements of the current column into
	// ymm2/ymm3, and accumulates the rank-1 update into ymm4-ymm7.

	// ---------------------------------- iteration 0

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, 5*8))
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, r10, 1, 5*8))
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	// ---------------------------------- iteration 2

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, r10, 2, 5*8))
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	// ---------------------------------- iteration 3

#if 0
	prefetch(0, mem(rdx, 5*8))
#else
	prefetch(0, mem(rdx, rcx, 1, 5*8))
	lea(mem(rdx, r10, 4), rdx)         // b_prefetch += 4*rs_b;
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKITER)                   // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT)                 // EDGE LOOP

#if 1
	prefetch(0, mem(rdx, 5*8))
	add(r10, rdx)                      // b_prefetch += rs_b;
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKLEFT)                   // iterate again if i != 0.

	label(.SPOSTACCUM)

	mov(r12, rcx)                      // reset rcx to current utile of c.
	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate

	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
	vmulps(ymm0, ymm5, ymm5)
	vmulps(ymm0, ymm6, ymm6)
	vmulps(ymm0, ymm7, ymm7)

	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)

	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
	//lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;

	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;

	                                   // now avoid loading C if beta == 0

	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORED)                    // jump to column storage case

	label(.SROWSTORED)

	// Row-stored C, beta != 0: acc += beta * C, then write back. Each
	// row is 16 consecutive floats (two 32-byte vectors).

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
	vmovups(ymm4, mem(rcx, 0*32))

	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5)
	vmovups(ymm5, mem(rcx, 1*32))
	add(rdi, rcx)                      // c += rs_c;

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
	vmovups(ymm6, mem(rcx, 0*32))

	vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7)
	vmovups(ymm7, mem(rcx, 1*32))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORED)

	// Column-stored C, beta != 0. The two row accumulators are
	// interleaved so that each 64-bit half of an xmm register holds the
	// two rows of one column; that pair is then loaded, scaled by beta,
	// accumulated, and stored with vmovlpd/vmovhpd.

	                                   // begin I/O on columns 0-7
	vunpcklps(ymm6, ymm4, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(mem(rcx        ), xmm1, xmm1)
	vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rcx        ))    // store ( gamma00..gamma10 )
	vmovhpd(xmm0, mem(rcx, rsi, 1))    // store ( gamma01..gamma11 )
	vmovlpd(mem(rcx, rsi, 4), xmm1, xmm1)
	vmovhpd(mem(rcx, rbx, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vmovlpd(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma14 )
	vmovhpd(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma15 )

	vunpckhps(ymm6, ymm4, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1)
	vmovhpd(mem(rcx, rax, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma12 )
	vmovhpd(xmm0, mem(rcx, rax, 1))    // store ( gamma03..gamma13 )
	vmovlpd(mem(rcx, rax, 2), xmm1, xmm1)
	vmovhpd(mem(rcx, rbp, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vmovlpd(xmm2, mem(rcx, rax, 2))    // store ( gamma06..gamma16 )
	vmovhpd(xmm2, mem(rcx, rbp, 1))    // store ( gamma07..gamma17 )

	lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

	                                   // begin I/O on columns 8-15
	vunpcklps(ymm7, ymm5, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(mem(rcx        ), xmm1, xmm1)
	vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rcx        ))    // store rows 0-1 of column 8
	vmovhpd(xmm0, mem(rcx, rsi, 1))    // store rows 0-1 of column 9
	vmovlpd(mem(rcx, rsi, 4), xmm1, xmm1)
	vmovhpd(mem(rcx, rbx, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vmovlpd(xmm2, mem(rcx, rsi, 4))    // store rows 0-1 of column 12
	vmovhpd(xmm2, mem(rcx, rbx, 1))    // store rows 0-1 of column 13

	vunpckhps(ymm7, ymm5, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1)
	vmovhpd(mem(rcx, rax, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rcx, rsi, 2))    // store rows 0-1 of column 10
	vmovhpd(xmm0, mem(rcx, rax, 1))    // store rows 0-1 of column 11
	vmovlpd(mem(rcx, rax, 2), xmm1, xmm1)
	vmovhpd(mem(rcx, rbp, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vmovlpd(xmm2, mem(rcx, rax, 2))    // store rows 0-1 of column 14
	vmovhpd(xmm2, mem(rcx, rbp, 1))    // store rows 0-1 of column 15

	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

	jmp(.SDONE)                        // jump to end.

	label(.SBETAZERO)

	// beta == 0: C is overwritten without being read.

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORBZ)                    // jump to column storage case

	label(.SROWSTORBZ)

	vmovups(ymm4, mem(rcx, 0*32))
	vmovups(ymm5, mem(rcx, 1*32))
	add(rdi, rcx)                      // c += rs_c;

	vmovups(ymm6, mem(rcx, 0*32))
	vmovups(ymm7, mem(rcx, 1*32))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORBZ)

	                                   // begin I/O on columns 0-7
	vunpcklps(ymm6, ymm4, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(xmm0, mem(rcx        ))    // store ( gamma00..gamma10 )
	vmovhpd(xmm0, mem(rcx, rsi, 1))    // store ( gamma01..gamma11 )
	vmovlpd(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma14 )
	vmovhpd(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma15 )

	vunpckhps(ymm6, ymm4, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma12 )
	vmovhpd(xmm0, mem(rcx, rax, 1))    // store ( gamma03..gamma13 )
	vmovlpd(xmm2, mem(rcx, rax, 2))    // store ( gamma06..gamma16 )
	vmovhpd(xmm2, mem(rcx, rbp, 1))    // store ( gamma07..gamma17 )

	lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

	                                   // begin I/O on columns 8-15
	vunpcklps(ymm7, ymm5, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(xmm0, mem(rcx        ))    // store rows 0-1 of column 8
	vmovhpd(xmm0, mem(rcx, rsi, 1))    // store rows 0-1 of column 9
	vmovlpd(xmm2, mem(rcx, rsi, 4))    // store rows 0-1 of column 12
	vmovhpd(xmm2, mem(rcx, rbx, 1))    // store rows 0-1 of column 13

	vunpckhps(ymm7, ymm5, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(xmm0, mem(rcx, rsi, 2))    // store rows 0-1 of column 10
	vmovhpd(xmm0, mem(rcx, rax, 1))    // store rows 0-1 of column 11
	vmovlpd(xmm2, mem(rcx, rax, 2))    // store rows 0-1 of column 14
	vmovhpd(xmm2, mem(rcx, rbp, 1))    // store rows 0-1 of column 15

	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

	label(.SDONE)

	                                   // The largest scale factor an x86
	                                   // address can encode is 8, so the
	                                   // 16-column advance takes two leas.
	lea(mem(r12, rsi, 8), r12)         // r12 += 8*cs_c
	lea(mem(r12, rsi, 8), r12)         // c_jj = r12 += 16*cs_c (in total)

	//add(imm(8*8), r14)                 // b_jj = r14 += 8*cs_b

	mov(var(ps_b4), rbx)               // load ps_b4
	lea(mem(r14, rbx, 1), r14)         // b_jj = r14 += ps_b4

	dec(r11)                           // jj -= 1;
	jne(.SLOOP6X8J)                    // iterate again if jj != 0.

	label(.SRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [n_iter] "m" (n_iter),
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a]      "m" (a),
	  [rs_a]   "m" (rs_a),
	  [cs_a]   "m" (cs_a),
	  [b]      "m" (b),
	  [rs_b]   "m" (rs_b),
	  [cs_b]   "m" (cs_b),
	  [ps_b4]  "m" (ps_b4),
	  [alpha]  "m" (alpha),
	  [beta]   "m" (beta),
	  [c]      "m" (c),
	  [rs_c]   "m" (rs_c),
	  [cs_c]   "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	consider_edge_cases:

	// Handle edge cases in the n dimension (the n0 % 16 columns that do
	// not fill a 16-column panel), if they exist. The remainder is peeled
	// greedily: 12, then 8, 6, 4, 2 and finally a single column.
	if ( n_left )
	{
		const dim_t      mr_cur = 2;
		const dim_t      j_edge = n0 - ( dim_t )n_left;

		float*  restrict cij = c + j_edge*cs_c;
		float*  restrict ai  = a;
		//float*  restrict bj  = b + j_edge*cs_b;
		//float*  restrict bj  = b + ( j_edge / 8 ) * ps_b;
		// Skip the n_iter full panels of B already consumed above; panels
		// are ps_b elements apart.
		float*  restrict bj  = b + n_iter * ps_b;

		if ( 12 <= n_left )
		{
			const dim_t nr_cur = 12;

			bli_sgemmsup_rv_haswell_asm_2x12
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}

		if ( 8 <= n_left )
		{
			const dim_t nr_cur = 8;

			bli_sgemmsup_rv_haswell_asm_2x8
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}

		if ( 6 <= n_left )
		{
			const dim_t nr_cur = 6;

			bli_sgemmsup_rv_haswell_asm_2x6
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}

		if ( 4 <= n_left )
		{
			const dim_t nr_cur = 4;

			bli_sgemmsup_rv_haswell_asm_2x4
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}

		if ( 2 <= n_left )
		{
			const dim_t nr_cur = 2;

			bli_sgemmsup_rv_haswell_asm_2x2
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}

		if ( 1 == n_left )
		{
#if 1
			const dim_t nr_cur = 1;

			bli_sgemmsup_r_haswell_ref_2x1
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
#else
			// NOTE(review): this disabled branch passes float operands
			// to a `d`-prefixed routine and is the only place m0 is
			// referenced -- it looks like a leftover from the
			// double-precision kernel. Confirm before enabling.
			bli_dgemv_ex
			(
			  BLIS_NO_TRANSPOSE, conjb, m0, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
			  beta, cij, rs_c0, cntx, NULL
			);
#endif
		}
	}
}
float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t n_iter = n0 / 16; uint64_t n_left = n0 % 16; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of B and convert it to units of bytes. uint64_t ps_b = bli_auxinfo_ps_b( data ); uint64_t ps_b4 = ps_b * sizeof( float ); if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). 
mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rbx = b // read rax from var(a) near beginning of loop // r11 = m dim index ii mov(var(n_iter), r11) // jj = n_iter; label(.SLOOP6X8J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) #endif mov(var(a), rax) // load address of a. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rbx) // reset rbx to current upanel of b. cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c //lea(mem(r12, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 15*4)) // prefetch c + 0*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rcx) // rcx = 3*cs_c; prefetch(0, mem(r12, 0*4)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 0*4)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 0*4)) // prefetch c + 2*cs_c prefetch(0, mem(r12, rcx, 1, 0*4)) // prefetch c + 3*cs_c prefetch(0, mem(r12, rsi, 4, 0*4)) // prefetch c + 4*cs_c lea(mem(r12, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 0*4)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 0*4)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rcx, 1, 0*4)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 0*4)) // prefetch c + 8*cs_c lea(mem(r12, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 0*4)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 0*4)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rcx, 1, 0*4)) // 
prefetch c + 11*cs_c prefetch(0, mem(rdx, rsi, 4, 0*4)) // prefetch c + 12*cs_c lea(mem(r12, rcx, 4), rdx) // rdx = c + 12*cs_c; prefetch(0, mem(rdx, rsi, 1, 0*4)) // prefetch c + 13*cs_c prefetch(0, mem(rdx, rsi, 2, 0*4)) // prefetch c + 14*cs_c prefetch(0, mem(rdx, rcx, 1, 0*4)) // prefetch c + 15*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 mov(var(ps_b4), rdx) // load ps_b4 lea(mem(rbx, rdx, 1), rdx) // rdx = a + ps_b4 lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; // use rcx, rdx for prefetching lines // from next upanel of b. #else lea(mem(rbx, r8, 8), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 1, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rdx, 5*8)) #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) // ---------------------------------- iteration 3 #if 0 prefetch(0, mem(rdx, 5*8)) #else 
prefetch(0, mem(rdx, rcx, 1, 5*8)) lea(mem(rdx, r10, 4), rdx) // b_prefetch += 4*rs_b; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r10, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORED) // begin I/O on columns 0-7 vmovups(ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rcx ), xmm1) vmovss(mem(rcx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rcx, rsi, 2), xmm1) vmovss(mem(rcx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rcx, rsi, 4), xmm1) vmovss(mem(rcx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rcx, rax, 2), xmm1) vmovss(mem(rcx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rcx, rbp, 1)) // store ( gamma47 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // begin I/O on columns 8-15 vmovups(ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rcx ), xmm1) vmovss(mem(rcx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rcx, rsi, 
2), xmm1) vmovss(mem(rcx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rcx, rsi, 4), xmm1) vmovss(mem(rcx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rcx, rax, 2), xmm1) vmovss(mem(rcx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rcx, rbp, 1)) // store ( gamma47 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-7 vmovups(ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rcx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rcx, rbp, 1)) // store ( gamma47 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // begin I/O on columns 8-15 vmovups(ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rcx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rcx, rbp, 1)) // store ( gamma47 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c label(.SDONE) lea(mem(r12, rsi, 8), r12) // lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 16*cs_c //add(imm(8*8), r14) // b_jj = r14 += 8*cs_b mov(var(ps_b4), rbx) // load ps_b4 lea(mem(r14, rbx, 1), r14) // b_jj = r14 += ps_b4 dec(r11) // jj -= 1; jne(.SLOOP6X8J) // iterate again if jj != 0. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [ps_b4] "m" (ps_b4), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 1; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; //float* restrict bj = b + j_edge*cs_b; //float* restrict bj = b + ( j_edge / 8 ) * ps_b; float* restrict bj = b + n_iter * ps_b; if ( 12 <= n_left ) { const dim_t nr_cur = 12; bli_sgemmsup_rv_haswell_asm_1x12 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 8 <= n_left ) { const dim_t nr_cur = 8; bli_sgemmsup_rv_haswell_asm_1x8 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 6 <= n_left ) { const dim_t nr_cur = 6; bli_sgemmsup_rv_haswell_asm_1x6 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_sgemmsup_rv_haswell_asm_1x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); 
cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rv_haswell_asm_1x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 1 const dim_t nr_cur = 1; bli_sgemmsup_r_haswell_ref_1x1 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_dgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } } } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/d6x8/000077500000000000000000000000001427272030600225025ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c000066400000000000000000000133431427272030600307420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... -------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */ // Prototype reference microkernels. 
GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) // NOTE: Normally, for any "?x1" kernel, we would call the reference kernel. // However, at least one other subconfiguration (zen) uses this kernel set, so // we need to be able to call a set of "?x1" kernels that we know will actually // exist regardless of which subconfiguration these kernels were used by. Thus, // the compromise employed here is to inline the reference kernel so it gets // compiled as part of the haswell kernel set, and hence can unconditionally be // called by other kernels within that kernel set. #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mdim ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ for ( dim_t i = 0; i < mdim; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ /* for ( dim_t j = 0; j < 1; ++j ) */ \ { \ ctype* restrict cij = ci /*[ j*cs_c ]*/ ; \ ctype* restrict bj = b /*[ j*cs_b ]*/ ; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. 
*/ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(d,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } GENTFUNC( double, d, gemmsup_r_haswell_ref_6x1, 6 ) GENTFUNC( double, d, gemmsup_r_haswell_ref_5x1, 5 ) GENTFUNC( double, d, gemmsup_r_haswell_ref_4x1, 4 ) GENTFUNC( double, d, gemmsup_r_haswell_ref_3x1, 3 ) GENTFUNC( double, d, gemmsup_r_haswell_ref_2x1, 2 ) GENTFUNC( double, d, gemmsup_r_haswell_ref_1x1, 1 ) cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c000066400000000000000000001304661427272030600311200ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"

/* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */

// Prototype reference microkernels.
GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )

/*
   bli_dgemmsup_rd_haswell_asm_6x1

   Double-precision gemmsup kernel that updates a 6 x 1 block of C using
   six dot products, one per row of A against the single column of B:

     c(i,0) := beta * c(i,0) + alpha * sum_l a(i,l) * b(l,0),  i = 0..5

   Structure:
   - One ymm accumulator per row (ymm4, ymm6, ymm8, ymm10, ymm12, ymm14),
     each holding four partial sums.
   - The k dimension is consumed in three stages: k0 / 16 iterations of a
     4x-unrolled loop (four doubles per step), then (k0 % 16) / 4 vector
     steps, then (k0 % 16) % 4 scalar steps.
   - After accumulation each ymm register is reduced horizontally to one
     scalar, scaled by alpha, and stored to consecutive rows of C (stride
     rs_c). When beta == 0, C is never loaded.

   Memory access as written:
   - Elements of a row of A, and of the column of B, are read from
     consecutive addresses along k (both pointers advance by a fixed
     4*8 or 1*8 bytes); cs_a and rs_b are never loaded by the asm.
   - Rows of A are rs_a elements apart.

   Parameters not read by this function's code: conja, conjb, m0, n0, data
   and cntx. cs_b0 and cs_c0 are copied to locals and passed as asm
   operands, but no active instruction reads them.
*/
void bli_dgemmsup_rd_haswell_asm_6x1
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4  = k_left16 / 4;
	uint64_t k_left1  = k_left16 % 4;

	//uint64_t m_iter = m0 / 3;
	//uint64_t m_left = m0 % 3;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()                       // zero all xmm/ymm registers.

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	//mov(var(cs_a), r9)               // load cs_a
	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	//lea(mem(, r9, 8), r9)            // cs_a *= sizeof(double)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	//mov(var(rs_b), r10)              // load rs_b
	//mov(var(cs_b), r11)              // load cs_b
	//lea(mem(, r10, 8), r10)          // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11)          // cs_b *= sizeof(double)

	//lea(mem(r11, r11, 2), r13)       // r13 = 3*cs_b
	//lea(mem(r8, r8, 2), r10)         // r10 = 3*rs_a

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

	// r12 = rcx = c
	// r14 = rax = a
	// rdx = rbx = b

#if 0
	vzeroall()                         // zero all xmm/ymm registers.
#else
	                                   // skylake can execute 3 vxorpd ipc with
	                                   // a latency of 1 cycle, while vzeroall
	                                   // has a latency of 12 cycles.
	vxorpd(ymm4, ymm4, ymm4)           // one accumulator per row of the 6x1 block
	vxorpd(ymm6, ymm6, ymm6)
	vxorpd(ymm8, ymm8, ymm8)
	vxorpd(ymm10, ymm10, ymm10)
	vxorpd(ymm12, ymm12, ymm12)
	vxorpd(ymm14, ymm14, ymm14)
#endif

	//lea(mem(r12), rcx)               // rcx = c_ii;
	//lea(mem(r14), rax)               // rax = a_ii;
	//lea(mem(rdx), rbx)               // rbx = b;

#if 1
	//mov(var(rs_c), rdi)              // load rs_c
	//lea(mem(, rdi, 8), rdi)          // rs_c *= sizeof(double)
	lea(mem(rcx, rdi, 2), r10)         //
	lea(mem(r10, rdi, 1), r10)         // r10 = c + 3*rs_c;
	prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
	prefetch(0, mem(r10, 1*8))         // prefetch c + 3*rs_c
	prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c
	prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c
#endif

	mov(var(k_iter16), rsi)            // i = k_iter16;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
	                                   // contains the k_iter4 loop.

	label(.DLOOPKITER16)               // MAIN LOOP

	// Each unrolled iteration loads four doubles of b once and multiplies
	// them against four doubles from each of the six rows of a.

	// ---------------------------------- iteration 0

#if 0
	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
	prefetch(0, mem(rax, r8, 4, 0*8))  // prefetch rax + 4*cs_a
	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

	vmovupd(mem(rbx ), ymm0)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

	vmovupd(mem(rax ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vmovupd(mem(rax, r8, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm8)

	vmovupd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rax, r8, 4), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm12)

	vmovupd(mem(rax, r15, 1), ymm3)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm14)

	// ---------------------------------- iteration 1

	vmovupd(mem(rbx ), ymm0)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

	vmovupd(mem(rax ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vmovupd(mem(rax, r8, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm8)

	vmovupd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rax, r8, 4), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm12)

	vmovupd(mem(rax, r15, 1), ymm3)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm14)

	// ---------------------------------- iteration 2

#if 0
	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
	prefetch(0, mem(rax, r8, 4, 0*8))  // prefetch rax + 4*cs_a
	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

	vmovupd(mem(rbx ), ymm0)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

	vmovupd(mem(rax ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vmovupd(mem(rax, r8, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm8)

	vmovupd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rax, r8, 4), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm12)

	vmovupd(mem(rax, r15, 1), ymm3)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm14)

	// ---------------------------------- iteration 3

	vmovupd(mem(rbx ), ymm0)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

	vmovupd(mem(rax ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vmovupd(mem(rax, r8, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm8)

	vmovupd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rax, r8, 4), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm12)

	vmovupd(mem(rax, r15, 1), ymm3)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm14)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER16)                 // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi)             // i = k_iter4;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
	                                   // considers k_left1 loop.
	                                   // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4)                // EDGE LOOP (ymm)

#if 0
	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
	prefetch(0, mem(rax, r8, 4, 0*8))  // prefetch rax + 4*cs_a
	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

	vmovupd(mem(rbx ), ymm0)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

	vmovupd(mem(rax ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vmovupd(mem(rax, r8, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm8)

	vmovupd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rax, r8, 4), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm12)

	vmovupd(mem(rax, r15, 1), ymm3)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm14)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER4)                  // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi)             // i = k_left1;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
	                                   // NOTE: We must use ymm registers here bc
	                                   // using the xmm registers would zero out the
	                                   // high bits of the destination registers,
	                                   // which would destroy intermediate results.

	vmovsd(mem(rbx ), xmm0)
	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;

	vmovsd(mem(rax ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovsd(mem(rax, r8, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vmovsd(mem(rax, r8, 2), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm8)

	vmovsd(mem(rax, r13, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovsd(mem(rax, r8, 4), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm12)

	vmovsd(mem(rax, r15, 1), xmm3)
	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
	vfmadd231pd(ymm0, ymm3, ymm14)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.

	label(.DPOSTACCUM)

	                                   // ymm4
	                                   // ymm6
	                                   // ymm8
	                                   // ymm10
	                                   // ymm12
	                                   // ymm14

	// Horizontal reduction: vhaddpd sums adjacent pairs within each 128-bit
	// half, then the upper half is added to the lower half, leaving the full
	// four-element sum in element 0 of the destination xmm register.

	vhaddpd( ymm4, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm4 )

	vhaddpd( ymm6, ymm6, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm6 )

	vhaddpd( ymm8, ymm8, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm8 )

	vhaddpd( ymm10, ymm10, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm10 )

	vhaddpd( ymm12, ymm12, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm12 )

	vhaddpd( ymm14, ymm14, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm14 )

	                                   // xmm4[0] = sum(ymm4)
	                                   // xmm6[0] = sum(ymm6)
	                                   // xmm8[0] = sum(ymm8)
	                                   // xmm10[0] = sum(ymm10)
	                                   // xmm12[0] = sum(ymm12)
	                                   // xmm14[0] = sum(ymm14)

	//mov(var(rs_c), rdi)              // load rs_c
	//lea(mem(, rdi, 4), rdi)          // rs_c *= sizeof(double)

	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
	vmulpd(xmm0, xmm6, xmm6)
	vmulpd(xmm0, xmm8, xmm8)
	vmulpd(xmm0, xmm10, xmm10)
	vmulpd(xmm0, xmm12, xmm12)
	vmulpd(xmm0, xmm14, xmm14)

	//mov(var(cs_c), rsi)              // load cs_c
	//lea(mem(, rsi, 8), rsi)          // rsi = cs_c * sizeof(double)

	                                   // now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)

	// For each of the six rows: load c, add beta*c to the alpha-scaled dot
	// product, store the scalar back, then step to the next row of c.

	vmovsd(mem(rcx), xmm0)
	vfmadd231pd(xmm0, xmm3, xmm4)
	vmovsd(xmm4, mem(rcx))
	add(rdi, rcx)

	vmovsd(mem(rcx), xmm0)
	vfmadd231pd(xmm0, xmm3, xmm6)
	vmovsd(xmm6, mem(rcx))
	add(rdi, rcx)

	vmovsd(mem(rcx), xmm0)
	vfmadd231pd(xmm0, xmm3, xmm8)
	vmovsd(xmm8, mem(rcx))
	add(rdi, rcx)

	vmovsd(mem(rcx), xmm0)
	vfmadd231pd(xmm0, xmm3, xmm10)
	vmovsd(xmm10, mem(rcx))
	add(rdi, rcx)

	vmovsd(mem(rcx), xmm0)
	vfmadd231pd(xmm0, xmm3, xmm12)
	vmovsd(xmm12, mem(rcx))
	add(rdi, rcx)

	vmovsd(mem(rcx), xmm0)
	vfmadd231pd(xmm0, xmm3, xmm14)
	vmovsd(xmm14, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DBETAZERO)

	label(.DROWSTORBZ)

	// beta == 0: store the alpha-scaled results without reading c.

	vmovsd(xmm4, mem(rcx))
	add(rdi, rcx)

	vmovsd(xmm6, mem(rcx))
	add(rdi, rcx)

	vmovsd(xmm8, mem(rcx))
	add(rdi, rcx)

	vmovsd(xmm10, mem(rcx))
	add(rdi, rcx)

	vmovsd(xmm12, mem(rcx))
	add(rdi, rcx)

	vmovsd(xmm14, mem(rcx))
	//add(rdi, rcx)

	label(.DDONE)

	label(.DRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter16] "m" (k_iter16),
	  [k_iter4]  "m" (k_iter4),
	  [k_left1]  "m" (k_left1),
	  [a]        "m" (a),
	  [rs_a]     "m" (rs_a),
	  [cs_a]     "m" (cs_a),
	  [b]        "m" (b),
	  [rs_b]     "m" (rs_b),
	  [cs_b]     "m" (cs_b),
	  [alpha]    "m" (alpha),
	  [beta]     "m" (beta),
	  [c]        "m" (c),
	  [rs_c]     "m" (rs_c),
	  [cs_c]     "m" (cs_c)/*,
	  [a_next]   "m" (a_next),
	  [b_next]   "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

void bli_dgemmsup_rd_haswell_asm_3x1 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next =
bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; //uint64_t m_iter = m0 / 3; //uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm8, ymm8, ymm8) #endif //lea(mem(r12), rcx) // rcx = c_ii; //lea(mem(r14), rax) // rax = a_ii; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), r10) // //lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rax, r8, 1), ymm3) 
vfmadd231pd(ymm0, ymm3, ymm6) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovsd(mem(rbx ), xmm0) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovsd(mem(rax, r8, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm6) vmovsd(mem(rax, r8, 2), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm8) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 // ymm6 // ymm8 vhaddpd( ymm4, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) vhaddpd( ymm6, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) vhaddpd( ymm8, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm8 ) // xmm4[0] = sum(ymm4) // xmm6[0] = sum(ymm6) // xmm8[0] = sum(ymm8) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(xmm8, mem(rcx)) //add(rdi, rcx) label(.DDONE) label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_2x1: dot-product-based double-precision kernel that updates a 2x1 piece of C. For each of the two rows of A (rows are rs_a elements apart) it accumulates the dot product of k0 consecutive doubles of that row with k0 consecutive doubles of b, processing k0 in passes of 16, then 4, then 1 element(s). Each sum is scaled by the value alpha points to and one double is written per row of C (rows are rs_c elements apart): c = beta*c + alpha*sum, or c = alpha*sum without reading C when the vucomisd test of beta against zero reports equality. Only a, b, c, rs_a, rs_c, alpha, beta and the three k counters are loaded by the asm; cs_a, rs_b, cs_b and cs_c are listed as operands but never read, so unit stride along k is presumably guaranteed by the caller (TODO confirm). conja, conjb, m0, n0, data and cntx are not referenced. NOTE(review): vucomisd also sets ZF for unordered operands, so a NaN beta would appear to take the beta == 0 path -- confirm this is intended. */ void bli_dgemmsup_rd_haswell_asm_2x1 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; //uint64_t m_iter = m0 / 3; //uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm6, ymm6, ymm6) #endif //lea(mem(r12), rcx) // rcx = c_ii; //lea(mem(r14), rax) // rax = a_ii; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), r10) // //lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovsd(mem(rax, r8, 1), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm6) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 // ymm6 vhaddpd( ymm4, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) vhaddpd( ymm6, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) // xmm4[0] = sum(ymm4) // xmm6[0] = sum(ymm6) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(xmm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_1x1: dot-product-based double-precision kernel that updates a single element of C. It accumulates the dot product of k0 consecutive doubles starting at a with k0 consecutive doubles starting at b, processing k0 in passes of 16, then 4, then 1 element(s). The sum is scaled by the value alpha points to and one double is written at c: c = beta*c + alpha*sum, or c = alpha*sum without reading C when the vucomisd test of beta against zero reports equality. rs_a and rs_c are loaded into r8 and rdi but no instruction in this kernel uses those registers afterwards; cs_a, rs_b, cs_b and cs_c are listed as operands but never read, so unit stride along k is presumably guaranteed by the caller (TODO confirm). conja, conjb, m0, n0, data and cntx are not referenced. NOTE(review): vucomisd also sets ZF for unordered operands, so a NaN beta would appear to take the beta == 0 path -- confirm this is intended. */ void bli_dgemmsup_rd_haswell_asm_1x1 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; //uint64_t m_iter = m0 / 3; //uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) #endif //lea(mem(r12), rcx) // rcx = c_ii; //lea(mem(r14), rax) // rax = a_ii; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), r10) // //lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm4) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 vhaddpd( ymm4, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0] = sum(ymm4) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vmovsd(mem(rcx), xmm0) vfmadd231pd(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovsd(xmm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c000066400000000000000000001375531427272030600311250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) void bli_dgemmsup_rd_haswell_asm_6x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; //uint64_t m_iter = m0 / 6; //uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif //lea(mem(r12), rcx) // rcx = c_ii; //lea(mem(r14), rax) // rax = a_ii; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) lea(mem(rcx, rdi, 2), r10) // lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c prefetch(0, mem(r10, 1*8)) // prefetch c + 3*rs_c prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) 
add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovsd(mem(rax, r8, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovsd(mem(rax, r13, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rax, r8, 4), xmm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovsd(mem(rax, r15, 1), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) vhaddpd( ymm9, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm8 ) vhaddpd( ymm11, ymm10, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm10 ) vhaddpd( ymm13, ymm12, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm12 ) vhaddpd( ymm15, ymm14, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm14 ) // xmm4[0:1] = sum(ymm4) sum(ymm5) // xmm6[0:1] = sum(ymm6) sum(ymm7) // xmm8[0:1] = sum(ymm8) sum(ymm9) // xmm10[0:1] = sum(ymm10) sum(ymm11) // xmm12[0:1] = sum(ymm12) sum(ymm13) // xmm14[0:1] = sum(ymm14) sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) 
vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm10) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm12) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm14) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) label(.DDONE) label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_3x2: double-precision gemmsup edge kernel that updates a 3x2 block of C using dot products. The 3 rows of A are addressed rs_a apart and the 2 columns of B cs_b apart, and both are read along k as contiguous doubles: k0/16 passes of a four-times-unrolled 4-wide loop, then (k0%16)/4 4-wide passes, then k0%4 scalar passes. The six partial sums live in ymm4..ymm9, are reduced horizontally, scaled by the value alpha points to, and stored as 2-element rows of C spaced rs_c apart; C is loaded and scaled by the value beta points to only when the beta == 0 test fails. conja, conjb, m0, n0, data and cntx are not referenced, and cs_a, rs_b and cs_c are listed as asm inputs but never loaded. */ void bli_dgemmsup_rd_haswell_asm_3x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0,
double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; //uint64_t m_iter = m0 / 6; //uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) #endif //lea(mem(r12), rcx) // rcx = c_ii; //lea(mem(r14), rax) // rax = a_ii; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), r10) // //lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 
0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovsd(mem(rax, r8, 2), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0.
label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) vhaddpd( ymm9, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm8 ) // xmm4[0:1] = sum(ymm4) sum(ymm5) // xmm6[0:1] = sum(ymm6) sum(ymm7) // xmm8[0:1] = sum(ymm8) sum(ymm9) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) label(.DDONE) label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_2x2: double-precision gemmsup edge kernel that updates a 2x2 block of C using dot products. The 2 rows of A are addressed rs_a apart and the 2 columns of B cs_b apart, and both are read along k as contiguous doubles: k0/16 passes of a four-times-unrolled 4-wide loop, then (k0%16)/4 4-wide passes, then k0%4 scalar passes. The four partial sums live in ymm4..ymm7, are reduced horizontally, scaled by the value alpha points to, and stored as 2-element rows of C spaced rs_c apart; C is loaded and scaled by the value beta points to only when the beta == 0 test fails. conja, conjb, m0, n0, data and cntx are not referenced, and cs_a, rs_b and cs_c are listed as asm inputs but never loaded. */ void bli_dgemmsup_rd_haswell_asm_2x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; //uint64_t m_iter = m0 / 6; //uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a.
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) #endif //lea(mem(r12), rcx) // rcx = c_ii; //lea(mem(r14), rax) // rax = a_ii; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), r10) // //lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. 
label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0.
label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) // xmm4[0:1] = sum(ymm4) sum(ymm5) // xmm6[0:1] = sum(ymm6) sum(ymm7) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_1x2: double-precision gemmsup edge kernel that updates a 1x2 block of C using dot products. The single row of A and the 2 columns of B (addressed cs_b apart) are read along k as contiguous doubles: k0/16 passes of a four-times-unrolled 4-wide loop, then (k0%16)/4 4-wide passes, then k0%4 scalar passes. The two partial sums live in ymm4 and ymm5, are reduced horizontally, scaled by the value alpha points to, and stored as one 2-element row of C; C is loaded and scaled by the value beta points to only when the beta == 0 test fails. conja, conjb, m0, n0, data and cntx are not referenced; cs_a, rs_b and cs_c are listed as asm inputs but never loaded; rs_a and rs_c are loaded into r8 and rdi but no enabled instruction uses them afterwards. */ void bli_dgemmsup_rd_haswell_asm_1x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; //uint64_t m_iter = m0 / 6; //uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a.
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) #endif //lea(mem(r12), rcx) // rcx = c_ii; //lea(mem(r14), rax) // rax = a_ii; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), r10) // //lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm5 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0:1] = sum(ymm4) sum(ymm5) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0.
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c000066400000000000000000001131611427272030600311140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) void bli_dgemmsup_rd_haswell_asm_6x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t m_iter = m0 / 3; //uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter .. 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_ii; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b 
= 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), 
ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // xmm4[0:3] = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // xmm5[0:3] = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // xmm6[0:3] = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, 
ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_2x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next 
= bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) #endif //lea(mem(r12), rcx) // rcx = c; //lea(mem(r14), rax) // rax = a; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // 
---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // xmm4[0:3] = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // xmm5[0:3] = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) label(.DDONE) label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_1x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm13, ymm13, ymm13) #endif //lea(mem(r12), rcx) // rcx = c; //lea(mem(r14), rax) // rax = a; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c //prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. 
je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a #endif vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rax ), xmm0) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // xmm4[0:3] = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c000066400000000000000000001261351427272030600311250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) void bli_dgemmsup_rd_haswell_asm_6x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t n_left = n0 % 8; // First check whether this is a edge case in the n dimension. If so, // dispatch other 6x?m kernels, as needed. 
if ( n_left ) { double* restrict cij = c; double* restrict bj = b; double* restrict ai = a; if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_dgemmsup_rd_haswell_asm_6x4 //bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_dgemmsup_rd_haswell_asm_6x2 //bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 1 const dim_t nr_cur = 1; bli_dgemmsup_rd_haswell_asm_6x1 //bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_dgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), r14) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.DLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] mov(var(a), r14) // load address of a mov(var(b), rdx) // load address of b mov(var(c), r12) // load address of c lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*8), rsi) // rsi *= cs_c*sizeof(double) = 1*8 lea(mem(r12, rsi, 1), r12) // r12 = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rdx) // rbx = b + 4*jj*cs_b; mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 
4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, 
r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // xmm6[0] = sum(ymm6); xmm6[1] = sum(ymm9) // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), 
ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. add(imm(4), r15) // jj += 4; cmp(imm(8), r15) // compare jj to 8 jl(.DLOOP3X4J) // if jj < 8, jump to beginning // of jj loop; otherwise, loop ends. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left )
{
    const dim_t      nr_cur = 8;
    const dim_t      i_edge = m0 - ( dim_t )m_left;

    double* restrict cij = c + i_edge*rs_c;
    double* restrict bj  = b;
    double* restrict ai  = a + i_edge*rs_a;

    if ( 2 == m_left )
    {
        const dim_t mr_cur = 2;

        bli_dgemmsup_rd_haswell_asm_2x8
        //bli_dgemmsup_r_haswell_ref
        (
          conja, conjb, mr_cur, nr_cur, k0,
          alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
          beta, cij, rs_c0, cs_c0, data, cntx
        );
        //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
    }
    if ( 1 == m_left )
    {
        const dim_t mr_cur = 1;

        bli_dgemmsup_rd_haswell_asm_1x8
        //bli_dgemmsup_r_haswell_ref
        (
          conja, conjb, mr_cur, nr_cur, k0,
          alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
          beta, cij, rs_c0, cs_c0, data, cntx
        );
    }
}
}

// -----------------------------------------------------------------------------
// bli_dgemmsup_rd_haswell_asm_2x8
//
// Double-precision "rd" (dot-product style) gemmsup kernel that updates a
// fixed 2-row by 8-column block of C:
//
//     C := beta * C + alpha * A * B        (C is not read when beta == 0)
//
// Organization of the assembly below:
// - The 8 columns are processed as two panels of 4 columns (jj = 0, 4).
// - For each panel, eight ymm accumulators hold 4-wide partial sums, one
//   register per (row of A, column of B) pair:
//       row 0: ymm4  ymm7  ymm10 ymm13
//       row 1: ymm5  ymm8  ymm11 ymm14
// - The k dimension is consumed in three phases: k0/16 passes of a 4x
//   unrolled loop (4 elements per step), then (k0%16)/4 single vector
//   steps, then k0%4 scalar steps.
// - The partial sums are reduced horizontally, scaled by alpha, combined
//   with beta*C (unless beta == 0) and stored as two rows of 4 doubles.
//
// Parameters:
//   conja, conjb   Not referenced by this function.
//   m0, n0         Not referenced; the kernel always computes 2 rows and
//                  8 columns.
//   k0             Length of the inner (k) dimension.
//   alpha, beta    Pointers to the scalars; each is loaded with vbroadcastsd.
//   a, rs_a0       Base address and row stride (in elements) of A.
//   cs_a0          Copied to a local and listed as an asm input, but never
//                  read by the asm: A is walked along k in steps of
//                  sizeof(double), i.e. as if cs_a == 1.
//   b, cs_b0       Base address and column stride (in elements) of B.
//   rs_b0          Copied/listed but never read by the asm: B is walked
//                  along k in steps of sizeof(double), i.e. as if rs_b == 1.
//   c, rs_c0       Base address and row stride (in elements) of C.
//   cs_c0          Copied/listed but never read by the asm: each row of C is
//                  accessed as 4 contiguous doubles, i.e. as if cs_c == 1.
//   data, cntx     Not referenced by this function.
//
// NOTE(review): the unit-stride behavior described above is what the asm
// does; whether callers guarantee those strides is not visible here.
// -----------------------------------------------------------------------------
void bli_dgemmsup_rd_haswell_asm_2x8
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void*    a_next = bli_auxinfo_next_a( data );
    //void*    b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.

    // Split k0 into: 16-element passes, then 4-element vector steps, then
    // single-element scalar steps.
    uint64_t k_iter16 = k0 / 16;
    uint64_t k_left16 = k0 % 16;
    uint64_t k_iter4  = k_left16 / 4;
    uint64_t k_left1  = k_left16 % 4;

    uint64_t rs_a   = rs_a0;
    uint64_t cs_a   = cs_a0;
    uint64_t rs_b   = rs_b0;
    uint64_t cs_b   = cs_b0;
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    //vzeroall()                         // zero all xmm/ymm registers.

    mov(var(a), r14)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    //mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    //lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    //lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rdx)                   // load address of b.
    //mov(var(rs_b), r10)                // load rs_b
    mov(var(cs_b), r11)                // load cs_b
    //lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
    //lea(mem(r8, r8, 2), r10)           // r10 = 3*rs_a

    mov(var(c), r12)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    // Register roles for the rest of the kernel:
    // r12 = base of c;  rcx = c for the current column panel
    // r14 = base of a;  rax = a, advanced along k
    // rdx = base of b;  rbx = b for the current column panel, advanced along k
    // r8  = rs_a in bytes; r11 = cs_b in bytes; r13 = 3*cs_b in bytes
    // rdi = rs_c in bytes
    // r9  = unused
    // r15 = n dim index jj

    mov(imm(0), r15)                   // jj = 0;

    // NOTE: the label name says 3X4, but this kernel accumulates a 2x4 panel
    // per pass of this loop.
    label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 4 ]

#if 0
    vzeroall()                         // zero all xmm/ymm registers.
#else
                                       // skylake can execute 3 vxorpd ipc with
                                       // a latency of 1 cycle, while vzeroall
                                       // has a latency of 12 cycles.
    // Clear only the eight accumulators this kernel uses.
    vxorpd(ymm4,  ymm4,  ymm4)
    vxorpd(ymm5,  ymm5,  ymm5)
    vxorpd(ymm7,  ymm7,  ymm7)
    vxorpd(ymm8,  ymm8,  ymm8)
    vxorpd(ymm10, ymm10, ymm10)
    vxorpd(ymm11, ymm11, ymm11)
    vxorpd(ymm13, ymm13, ymm13)
    vxorpd(ymm14, ymm14, ymm14)
#endif

    // jj is already a column offset (it advances by 4 per pass), so the
    // panel addresses are c + jj*8 bytes and b + jj*cs_b bytes.
    lea(mem(   , r15, 1), rsi)         // rsi = jj;
    imul(imm(1*8), rsi)                // rsi *= cs_c*sizeof(double) = 1*8
    lea(mem(r12, rsi, 1), rcx)         // rcx = c + jj*cs_c;

    lea(mem(   , r15, 1), rsi)         // rsi = jj;
    imul(r11, rsi)                     // rsi *= cs_b (already in bytes);
    lea(mem(rdx, rsi, 1), rbx)         // rbx = b + jj*cs_b;

    lea(mem(r14), rax)                 // rax = a;

#if 1
    //mov(var(rs_c), rdi)                // load rs_c
    //lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
    prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
#endif

    mov(var(k_iter16), rsi)            // i = k_iter16;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKITER4)                 // if i == 0, jump to code that
                                       // contains the k_iter4 loop.

    label(.DLOOPKITER16)               // MAIN LOOP

    // ---------------------------------- iteration 0

    // Each step loads 4 consecutive k-elements of both rows of A (ymm0, ymm1)
    // and of each of the 4 columns of B in turn (ymm3), and accumulates the
    // elementwise products into the register for that (row, column) pair.

#if 0
    // Disabled. r10 and rbp are not initialized in this kernel, so these
    // prefetches must not be enabled as-is.
    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
    prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

    vmovupd(mem(rax       ), ymm0)     // row 0 of a, 4 elements along k
    vmovupd(mem(rax, r8, 1), ymm1)     // row 1 of a, 4 elements along k
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)    // column 0 of the panel of b
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rbx, r11, 1), ymm3)    // column 1
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)

    vmovupd(mem(rbx, r11, 2), ymm3)    // column 2
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovupd(mem(rbx, r13, 1), ymm3)    // column 3
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)

    // ---------------------------------- iteration 1

    vmovupd(mem(rax       ), ymm0)
    vmovupd(mem(rax, r8, 1), ymm1)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)

    // ---------------------------------- iteration 2

#if 0
    // Disabled; see the note on the iteration 0 prefetches.
    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
    prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

    vmovupd(mem(rax       ), ymm0)
    vmovupd(mem(rax, r8, 1), ymm1)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)

    // ---------------------------------- iteration 3

    vmovupd(mem(rax       ), ymm0)
    vmovupd(mem(rax, r8, 1), ymm1)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER16)                 // iterate again if i != 0.

    label(.DCONSIDKITER4)

    mov(var(k_iter4), rsi)             // i = k_iter4;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
                                       // considers k_left1 loop.
                                       // else, we prepare to enter k_iter4 loop.

    label(.DLOOPKITER4)                // EDGE LOOP (ymm)

#if 0
    // Disabled; see the note on the iteration 0 prefetches.
    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
    prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

    vmovupd(mem(rax       ), ymm0)
    vmovupd(mem(rax, r8, 1), ymm1)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER4)                  // iterate again if i != 0.

    label(.DCONSIDKLEFT1)

    mov(var(k_left1), rsi)             // i = k_left1;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left1 loop.

    label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)

    // NOTE: We must use ymm registers here bc
    // using the xmm registers would zero out the
    // high bits of the destination registers,
    // which would destroy intermediate results.
    // (The scalar vmovsd loads below only provide element 0 of ymm0..ymm3;
    // this presumably relies on a VEX-encoded vmovsd load clearing the upper
    // elements, so the ymm-wide FMAs add zero there -- see the ISA reference.)

    vmovsd(mem(rax       ), xmm0)
    vmovsd(mem(rax, r8, 1), xmm1)
    add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;

    vmovsd(mem(rbx        ), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovsd(mem(rbx, r11, 1), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)

    vmovsd(mem(rbx, r11, 2), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovsd(mem(rbx, r13, 1), xmm3)
    add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT1)                  // iterate again if i != 0.

    label(.DPOSTACCUM)

    // Accumulator layout at this point (row of c x column of the panel):
    // ymm4  ymm7  ymm10 ymm13
    // ymm5  ymm8  ymm11 ymm14
    //
    // Each accumulator holds 4 partial sums. The sequences below add the 4
    // elements of each register and pack the four totals of one row of c
    // into a single ymm register.

    vhaddpd( ymm7, ymm4, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm0 )

    vhaddpd( ymm13, ymm10, ymm2 )
    vextractf128(imm(1), ymm2, xmm1 )
    vaddpd( xmm2, xmm1, xmm2 )

    vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
                                       // ymm4[0] = sum(ymm4);  ymm4[1] = sum(ymm7)
                                       // ymm4[2] = sum(ymm10); ymm4[3] = sum(ymm13)

    vhaddpd( ymm8, ymm5, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm0 )

    vhaddpd( ymm14, ymm11, ymm2 )
    vextractf128(imm(1), ymm2, xmm1 )
    vaddpd( xmm2, xmm1, xmm2 )

    vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )
                                       // ymm5[0] = sum(ymm5);  ymm5[1] = sum(ymm8)
                                       // ymm5[2] = sum(ymm11); ymm5[3] = sum(ymm14)

    //mov(var(rs_c), rdi)                // load rs_c
    //lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
    vmulpd(ymm0, ymm5, ymm5)

    //mov(var(cs_c), rsi)                // load cs_c
    //lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

                                       // now avoid loading C if beta == 0

    // NOTE(review): vucomisd reportedly also sets ZF for an unordered
    // compare, in which case a NaN beta would take the beta == 0 path --
    // confirm that this is acceptable.
    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
    je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    label(.DROWSTORED)

    // beta != 0: result row += beta * (row of c), then store the row.
    vfmadd231pd(mem(rcx), ymm3, ymm4)
    vmovupd(ymm4, mem(rcx))
    add(rdi, rcx)                      // advance to the next row of c

    vfmadd231pd(mem(rcx), ymm3, ymm5)
    vmovupd(ymm5, mem(rcx))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DBETAZERO)

    label(.DROWSTORBZ)

    // beta == 0: store the results without reading c.
    vmovupd(ymm4, mem(rcx))
    add(rdi, rcx)                      // advance to the next row of c

    vmovupd(ymm5, mem(rcx))
    //add(rdi, rcx)

    label(.DDONE)

    add(imm(4), r15)                   // jj += 4;
    cmp(imm(8), r15)                   // compare jj to 8
    jl(.DLOOP3X4J)                     // if jj < 8, jump to beginning
                                       // of jj loop; otherwise, loop ends.

    label(.DRETURN)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter16] "m" (k_iter16),
      [k_iter4]  "m" (k_iter4),
      [k_left1]  "m" (k_left1),
      [a]        "m" (a),
      [rs_a]     "m" (rs_a),
      [cs_a]     "m" (cs_a),
      [b]        "m" (b),
      [rs_b]     "m" (rs_b),
      [cs_b]     "m" (cs_b),
      [alpha]    "m" (alpha),
      [beta]     "m" (beta),
      [c]        "m" (c),
      [rs_c]     "m" (rs_c),
      [cs_c]     "m" (cs_c)/*,
      [a_next]   "m" (a_next),
      [b_next]   "m" (b_next)*/
    : // register clobber list
      // (rbp, r9 and r10 are listed although the enabled asm above does
      // not touch them.)
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

// -----------------------------------------------------------------------------
// bli_dgemmsup_rd_haswell_asm_1x8
//
// Double-precision "rd" (dot-product style) gemmsup kernel that updates a
// fixed 1-row by 8-column block of C:
//
//     C := beta * C + alpha * A * B        (C is not read when beta == 0)
//
// Organization of the assembly below:
// - The 8 columns are processed as two panels of 4 columns (jj = 0, 4).
// - For each panel, four ymm accumulators (ymm4, ymm7, ymm10, ymm13) hold
//   4-wide partial sums, one register per column of B.
// - The k dimension is consumed in three phases: k0/16 passes of a 4x
//   unrolled loop (4 elements per step), then (k0%16)/4 single vector
//   steps, then k0%4 scalar steps.
// - The partial sums are reduced horizontally, scaled by alpha, combined
//   with beta*C (unless beta == 0) and stored as one row of 4 doubles.
//
// Parameters:
//   conja, conjb   Not referenced by this function.
//   m0, n0         Not referenced; the kernel always computes 1 row and
//                  8 columns.
//   k0             Length of the inner (k) dimension.
//   alpha, beta    Pointers to the scalars; each is loaded with vbroadcastsd.
//   a              Base address of the single row of A.
//   rs_a0, cs_a0   Copied to locals and listed as asm inputs, but never read
//                  by the asm: A is walked along k in steps of
//                  sizeof(double), i.e. as if cs_a == 1.
//   b, cs_b0       Base address and column stride (in elements) of B.
//   rs_b0          Copied/listed but never read by the asm: B is walked
//                  along k in steps of sizeof(double), i.e. as if rs_b == 1.
//   c              Base address of the single row of C.
//   rs_c0, cs_c0   Copied/listed but never read by the asm: the row of C is
//                  accessed as contiguous doubles, i.e. as if cs_c == 1.
//   data, cntx     Not referenced by this function.
//
// NOTE(review): the unit-stride behavior described above is what the asm
// does; whether callers guarantee those strides is not visible here.
// -----------------------------------------------------------------------------
void bli_dgemmsup_rd_haswell_asm_1x8
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void*    a_next = bli_auxinfo_next_a( data );
    //void*    b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.

    // Split k0 into: 16-element passes, then 4-element vector steps, then
    // single-element scalar steps.
    uint64_t k_iter16 = k0 / 16;
    uint64_t k_left16 = k0 % 16;
    uint64_t k_iter4  = k_left16 / 4;
    uint64_t k_left1  = k_left16 % 4;

    uint64_t rs_a   = rs_a0;
    uint64_t cs_a   = cs_a0;
    uint64_t rs_b   = rs_b0;
    uint64_t cs_b   = cs_b0;
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    //vzeroall()                         // zero all xmm/ymm registers.

    mov(var(a), r14)                   // load address of a.
    //mov(var(rs_a), r8)                 // load rs_a
    //mov(var(cs_a), r9)                 // load cs_a
    //lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    //lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    //lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rdx)                   // load address of b.
    //mov(var(rs_b), r10)                // load rs_b
    mov(var(cs_b), r11)                // load cs_b
    //lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
    //lea(mem(r8, r8, 2), r10)           // r10 = 3*rs_a

    mov(var(c), r12)                   // load address of c
    //mov(var(rs_c), rdi)                // load rs_c
    //lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    // Register roles for the rest of the kernel:
    // r12 = base of c;  rcx = c for the current column panel
    // r14 = base of a;  rax = a, advanced along k
    // rdx = base of b;  rbx = b for the current column panel, advanced along k
    // r11 = cs_b in bytes; r13 = 3*cs_b in bytes
    // r9  = unused (there is no m dim loop in this kernel)
    // r15 = n dim index jj

    mov(imm(0), r15)                   // jj = 0;

    // NOTE: the label name says 3X4, but this kernel accumulates a 1x4 panel
    // per pass of this loop.
    label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 4 ]

#if 0
    vzeroall()                         // zero all xmm/ymm registers.
#else
                                       // skylake can execute 3 vxorpd ipc with
                                       // a latency of 1 cycle, while vzeroall
                                       // has a latency of 12 cycles.
    // Clear only the four accumulators this kernel uses.
    vxorpd(ymm4,  ymm4,  ymm4)
    vxorpd(ymm7,  ymm7,  ymm7)
    vxorpd(ymm10, ymm10, ymm10)
    vxorpd(ymm13, ymm13, ymm13)
#endif

    // jj is already a column offset (it advances by 4 per pass), so the
    // panel addresses are c + jj*8 bytes and b + jj*cs_b bytes.
    lea(mem(   , r15, 1), rsi)         // rsi = jj;
    imul(imm(1*8), rsi)                // rsi *= cs_c*sizeof(double) = 1*8
    lea(mem(r12, rsi, 1), rcx)         // rcx = c + jj*cs_c;

    lea(mem(   , r15, 1), rsi)         // rsi = jj;
    imul(r11, rsi)                     // rsi *= cs_b (already in bytes);
    lea(mem(rdx, rsi, 1), rbx)         // rbx = b + jj*cs_b;

    lea(mem(r14), rax)                 // rax = a;

#if 1
    //mov(var(rs_c), rdi)                // load rs_c
    //lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
    prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
#endif

    mov(var(k_iter16), rsi)            // i = k_iter16;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKITER4)                 // if i == 0, jump to code that
                                       // contains the k_iter4 loop.

    label(.DLOOPKITER16)               // MAIN LOOP

    // ---------------------------------- iteration 0

    // Each step loads 4 consecutive k-elements of the row of A (ymm0) and of
    // each of the 4 columns of B in turn (ymm3), and accumulates the
    // elementwise products into the register for that column.

#if 0
    // Disabled. r8, r10 and rbp are not initialized in this kernel, so these
    // prefetches must not be enabled as-is.
    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
    prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

    vmovupd(mem(rax       ), ymm0)     // the row of a, 4 elements along k
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)    // column 0 of the panel of b
    vfmadd231pd(ymm0, ymm3, ymm4)

    vmovupd(mem(rbx, r11, 1), ymm3)    // column 1
    vfmadd231pd(ymm0, ymm3, ymm7)

    vmovupd(mem(rbx, r11, 2), ymm3)    // column 2
    vfmadd231pd(ymm0, ymm3, ymm10)

    vmovupd(mem(rbx, r13, 1), ymm3)    // column 3
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)

    // ---------------------------------- iteration 1

    vmovupd(mem(rax       ), ymm0)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)

    // ---------------------------------- iteration 2

#if 0
    // Disabled; see the note on the iteration 0 prefetches.
    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
    prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

    vmovupd(mem(rax       ), ymm0)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)

    // ---------------------------------- iteration 3

    vmovupd(mem(rax       ), ymm0)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER16)                 // iterate again if i != 0.

    label(.DCONSIDKITER4)

    mov(var(k_iter4), rsi)             // i = k_iter4;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
                                       // considers k_left1 loop.
                                       // else, we prepare to enter k_iter4 loop.

    label(.DLOOPKITER4)                // EDGE LOOP (ymm)

#if 0
    // Disabled; see the note on the iteration 0 prefetches.
    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
    prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

    vmovupd(mem(rax       ), ymm0)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER4)                  // iterate again if i != 0.

    label(.DCONSIDKLEFT1)

    mov(var(k_left1), rsi)             // i = k_left1;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left1 loop.

    label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)

    // NOTE: We must use ymm registers here bc
    // using the xmm registers would zero out the
    // high bits of the destination registers,
    // which would destroy intermediate results.
    // (The scalar vmovsd loads below only provide element 0 of ymm0/ymm3;
    // this presumably relies on a VEX-encoded vmovsd load clearing the upper
    // elements, so the ymm-wide FMAs add zero there -- see the ISA reference.)

    vmovsd(mem(rax       ), xmm0)
    add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;

    vmovsd(mem(rbx        ), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm4)

    vmovsd(mem(rbx, r11, 1), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm7)

    vmovsd(mem(rbx, r11, 2), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vmovsd(mem(rbx, r13, 1), xmm3)
    add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
    vfmadd231pd(ymm0, ymm3, ymm13)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT1)                  // iterate again if i != 0.

    label(.DPOSTACCUM)

    // Accumulator layout at this point (one register per panel column):
    // ymm4  ymm7  ymm10 ymm13
    //
    // Each accumulator holds 4 partial sums. The sequence below adds the 4
    // elements of each register and packs the four totals into ymm4.

    vhaddpd( ymm7, ymm4, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm0 )

    vhaddpd( ymm13, ymm10, ymm2 )
    vextractf128(imm(1), ymm2, xmm1 )
    vaddpd( xmm2, xmm1, xmm2 )

    vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )
                                       // ymm4[0] = sum(ymm4);  ymm4[1] = sum(ymm7)
                                       // ymm4[2] = sum(ymm10); ymm4[3] = sum(ymm13)

    //mov(var(rs_c), rdi)                // load rs_c
    //lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(ymm0, ymm4, ymm4)           // scale by alpha

    //mov(var(cs_c), rsi)                // load cs_c
    //lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

                                       // now avoid loading C if beta == 0

    // NOTE(review): vucomisd reportedly also sets ZF for an unordered
    // compare, in which case a NaN beta would take the beta == 0 path --
    // confirm that this is acceptable.
    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
    je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    label(.DROWSTORED)

    // beta != 0: result row += beta * (row of c), then store the row.
    vfmadd231pd(mem(rcx), ymm3, ymm4)
    vmovupd(ymm4, mem(rcx))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DBETAZERO)

    label(.DROWSTORBZ)

    // beta == 0: store the results without reading c.
    vmovupd(ymm4, mem(rcx))
    //add(rdi, rcx)

    label(.DDONE)

    add(imm(4), r15)                   // jj += 4;
    cmp(imm(8), r15)                   // compare jj to 8
    jl(.DLOOP3X4J)                     // if jj < 8, jump to beginning
                                       // of jj loop; otherwise, loop ends.

    label(.DRETURN)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter16] "m" (k_iter16),
      [k_iter4]  "m" (k_iter4),
      [k_left1]  "m" (k_left1),
      [a]        "m" (a),
      [rs_a]     "m" (rs_a),
      [cs_a]     "m" (cs_a),
      [b]        "m" (b),
      [rs_b]     "m" (rs_b),
      [cs_b]     "m" (cs_b),
      [alpha]    "m" (alpha),
      [beta]     "m" (beta),
      [c]        "m" (c),
      [rs_c]     "m" (rs_c),
      [cs_c]     "m" (cs_c)/*,
      [a_next]   "m" (a_next),
      [b_next]   "m" (b_next)*/
    : // register clobber list
      // (rdi, r8, r9 and r10 are listed although the enabled asm above does
      // not touch them.)
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}
cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c000066400000000000000000002016621427272030600311400ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... -------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */ // Prototype reference microkernels. 
GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )

/*
   bli_dgemmsup_rv_haswell_asm_6x2

   Computes the 6x2 double-precision block

     c := beta * c + alpha * a * b

   by accumulating k0 rank-1 updates in xmm4, xmm6, xmm8, xmm10, xmm12 and
   xmm14 (one register per row of c).

   - a is addressed through both of its strides (rs_a0, cs_a0).
   - b is read as two contiguous doubles per k iteration, stepping by rs_b0;
     cs_b0 is not read by the assembly.
   - c goes through the row-stored path unless rs_c0 == 1, in which case the
     accumulators are transposed in registers and stored by column (cs_c0).
   - When beta == 0, c is overwritten without being read.
   - conja, conjb, m0, n0, data and cntx are not referenced by this kernel.
*/
void bli_dgemmsup_rv_haswell_asm_6x2
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void*    a_next = bli_auxinfo_next_a( data );
    //void*    b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    // The k dimension is split into a 4x-unrolled main loop (k_iter) and a
    // one-at-a-time cleanup loop (k_left).
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;

    uint64_t rs_a   = rs_a0;
    uint64_t cs_a   = cs_a0;
    uint64_t rs_b   = rs_b0;
    uint64_t cs_b   = cs_b0;
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    vzeroall()                         // zero all xmm/ymm registers.

    mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
    mov(var(rs_b), r10)                // load rs_b
    //mov(var(cs_b), r11)                // load cs_b
    lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    //lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds of allocated
    // memory (the likely result: a segmentation
    // fault).

    mov(var(c), rcx)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8, i.e. rs_c == 1.
    jz(.DCOLPFETCH)                    // jump to column storage case
    label(.DROWPFETCH)                 // row-stored prefetching on c

    lea(mem(rcx, rdi, 2), rdx)         //
    lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
    prefetch(0, mem(rdx, 1*8))         // prefetch c + 3*rs_c
    prefetch(0, mem(rdx, rdi, 1, 1*8)) // prefetch c + 4*rs_c
    prefetch(0, mem(rdx, rdi, 2, 1*8)) // prefetch c + 5*rs_c

    jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
    label(.DCOLPFETCH)                 // column-stored prefetching c

    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
    //lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
    prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c

    label(.DPOSTPFETCH)                // done prefetching c

    // rdx is reused from here on as the prefetch pointer into a.
#if 1
    lea(mem(rax, r9, 8), rdx)          //
    lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

    mov(var(k_iter), rsi)              // i = k_iter;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
                                       // contains the k_left loop.

    label(.DLOOPKITER)                 // MAIN LOOP

    // Each iteration loads two doubles of the current row of b into xmm0,
    // broadcasts the six elements of the current column of a one at a time,
    // and accumulates row i of the product into xmm(4 + 2*i).

    // ---------------------------------- iteration 0

#if 1
    prefetch(0, mem(rdx, 5*8))         // prefetch a_prefetch + 0*cs_a
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm8)
    vfmadd231pd(xmm0, xmm3, xmm10)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm12)
    vfmadd231pd(xmm0, xmm3, xmm14)

    // ---------------------------------- iteration 1

#if 0
    prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm8)
    vfmadd231pd(xmm0, xmm3, xmm10)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm12)
    vfmadd231pd(xmm0, xmm3, xmm14)

    // ---------------------------------- iteration 2

    // Prefetch two columns of a beyond the iteration-0 prefetch, as the
    // 5x2/4x2/3x2 kernels in this file do, rather than re-requesting the
    // same address that iteration 0 already prefetched.
#if 1
    prefetch(0, mem(rdx, r9, 2, 5*8))  // prefetch a_prefetch + 2*cs_a
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm8)
    vfmadd231pd(xmm0, xmm3, xmm10)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm12)
    vfmadd231pd(xmm0, xmm3, xmm14)

    // ---------------------------------- iteration 3

#if 1
    lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm8)
    vfmadd231pd(xmm0, xmm3, xmm10)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm12)
    vfmadd231pd(xmm0, xmm3, xmm14)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER)                   // iterate again if i != 0.

    label(.DCONSIDKLEFT)

    mov(var(k_left), rsi)              // i = k_left;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left loop.

    label(.DLOOPKLEFT)                 // EDGE LOOP

#if 0
    prefetch(0, mem(rdx, 5*8))
    add(r9, rdx)
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm8)
    vfmadd231pd(xmm0, xmm3, xmm10)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm12)
    vfmadd231pd(xmm0, xmm3, xmm14)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT)                   // iterate again if i != 0.

    label(.DPOSTACCUM)

    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
    vmulpd(xmm0, xmm6, xmm6)
    vmulpd(xmm0, xmm8, xmm8)
    vmulpd(xmm0, xmm10, xmm10)
    vmulpd(xmm0, xmm12, xmm12)
    vmulpd(xmm0, xmm14, xmm14)

    mov(var(cs_c), rsi)                // load cs_c
    lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

    //lea(mem(rcx, rsi, 4), rdx)         // load address of c + 4*cs_c;
    lea(mem(rcx, rdi, 4), rdx)         // load address of c + 4*rs_c;

    //lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

    // now avoid loading C if beta == 0

    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
    je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORED)                    // jump to column storage case

    label(.DROWSTORED)

    // One accumulator per row: c_row = beta * c_row + accumulator.
    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4)
    vmovupd(xmm4, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6)
    vmovupd(xmm6, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8)
    vmovupd(xmm8, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10)
    vmovupd(xmm10, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm12)
    vmovupd(xmm12, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm14)
    vmovupd(xmm14, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DCOLSTORED)

    // begin I/O on columns 0-1
    // Interleave the row accumulators so that ymm4 / ymm6 hold rows 0-3 of
    // columns 0 / 1.
    vunpcklpd(xmm6, xmm4, xmm0)
    vunpckhpd(xmm6, xmm4, xmm1)
    vunpcklpd(xmm10, xmm8, xmm2)
    vunpckhpd(xmm10, xmm8, xmm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)

    // ymm3 was used as scratch by the unpack above, so reload beta.
    vbroadcastsd(mem(rbx), ymm3)

    vfmadd231pd(mem(rcx        ), ymm3, ymm4)
    vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
    vmovupd(ymm4, mem(rcx        ))
    vmovupd(ymm6, mem(rcx, rsi, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    // Rows 4-5 of columns 0 / 1, addressed through rdx = c + 4*rs_c.
    vunpcklpd(xmm14, xmm12, xmm0)
    vunpckhpd(xmm14, xmm12, xmm1)

    vfmadd231pd(mem(rdx        ), xmm3, xmm0)
    vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
    vmovupd(xmm0, mem(rdx        ))
    vmovupd(xmm1, mem(rdx, rsi, 1))

    //lea(mem(rdx, rsi, 4), rdx)

    jmp(.DDONE)                        // jump to end.

    label(.DBETAZERO)

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORBZ)                    // jump to column storage case

    label(.DROWSTORBZ)

    // beta == 0: store the alpha-scaled accumulators without reading c.
    vmovupd(xmm4, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(xmm6, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(xmm8, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(xmm10, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(xmm12, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(xmm14, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.
label(.DCOLSTORBZ) // begin I/O on columns 0-1 vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) vunpcklpd(xmm10, xmm8, xmm2) vunpckhpd(xmm10, xmm8, xmm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(xmm14, xmm12, xmm0) vunpckhpd(xmm14, xmm12, xmm1) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rv_haswell_asm_5x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. 
// Register roles set up below: rax = current column of a, r8 / r9 = rs_a /
// cs_a in bytes, r13 = 3*rs_a, rbx = current row of b, r10 = rs_b in bytes,
// rcx = c, rdi = rs_c in bytes.
    mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
    mov(var(rs_b), r10)                // load rs_b
    //mov(var(cs_b), r11)                // load cs_b
    lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    //lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds of allocated
    // memory (the likely result: a segmentation
    // fault).

    mov(var(c), rcx)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8, i.e. rs_c == 1.
    jz(.DCOLPFETCH)                    // jump to column storage case
    label(.DROWPFETCH)                 // row-stored prefetching on c

    lea(mem(rcx, rdi, 2), rdx)         //
    lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
    prefetch(0, mem(rdx, 1*8))         // prefetch c + 3*rs_c
    prefetch(0, mem(rdx, rdi, 1, 1*8)) // prefetch c + 4*rs_c

    jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
    label(.DCOLPFETCH)                 // column-stored prefetching c

    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
    //lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
    prefetch(0, mem(rcx, 4*8))         // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c

    label(.DPOSTPFETCH)                // done prefetching c

    // rdx is reused from here on as the prefetch pointer into a.
#if 1
    lea(mem(rax, r9, 8), rdx)          //
    lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

    mov(var(k_iter), rsi)              // i = k_iter;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
                                       // contains the k_left loop.

    label(.DLOOPKITER)                 // MAIN LOOP

    // Each iteration loads two doubles of the current row of b into xmm0,
    // broadcasts the five elements of the current column of a one at a
    // time, and accumulates rows 0..4 of the product into xmm4, xmm6, xmm8,
    // xmm10 and xmm12 respectively.

    // ---------------------------------- iteration 0

#if 1
    prefetch(0, mem(rdx, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm8)
    vfmadd231pd(xmm0, xmm3, xmm10)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm12)

    // ---------------------------------- iteration 1

#if 0
    prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm8)
    vfmadd231pd(xmm0, xmm3, xmm10)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm12)

    // ---------------------------------- iteration 2

#if 1
    prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm8)
    vfmadd231pd(xmm0, xmm3, xmm10)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm12)

    // ---------------------------------- iteration 3

#if 1
    lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm8)
    vfmadd231pd(xmm0, xmm3, xmm10)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm12)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER)                   // iterate again if i != 0.

    label(.DCONSIDKLEFT)

    mov(var(k_left), rsi)              // i = k_left;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left loop.

    label(.DLOOPKLEFT)                 // EDGE LOOP

#if 0
    prefetch(0, mem(rdx, 5*8))
    add(r9, rdx)
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm8)
    vfmadd231pd(xmm0, xmm3, xmm10)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm12)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT)                   // iterate again if i != 0.

    label(.DPOSTACCUM)

    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
    vmulpd(xmm0, xmm6, xmm6)
    vmulpd(xmm0, xmm8, xmm8)
    vmulpd(xmm0, xmm10, xmm10)
    vmulpd(xmm0, xmm12, xmm12)

    mov(var(cs_c), rsi)                // load cs_c
    lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

    //lea(mem(rcx, rsi, 4), rdx)         // load address of c + 4*cs_c;
    lea(mem(rcx, rdi, 4), rdx)         // load address of c + 4*rs_c;

    //lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

    // now avoid loading C if beta == 0

    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
    je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORED)                    // jump to column storage case

    label(.DROWSTORED)

    // One accumulator per row: c_row = beta * c_row + accumulator.
    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4)
    vmovupd(xmm4, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6)
    vmovupd(xmm6, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8)
    vmovupd(xmm8, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10)
    vmovupd(xmm10, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm12)
    vmovupd(xmm12, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DCOLSTORED)

    // begin I/O on columns 0-1
    // Interleave the row accumulators so that ymm4 / ymm6 hold rows 0-3 of
    // columns 0 / 1.
    vunpcklpd(xmm6, xmm4, xmm0)
    vunpckhpd(xmm6, xmm4, xmm1)
    vunpcklpd(xmm10, xmm8, xmm2)
    vunpckhpd(xmm10, xmm8, xmm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)

    // ymm3 was used as scratch by the unpack above, so reload beta.
    vbroadcastsd(mem(rbx), ymm3)

    vfmadd231pd(mem(rcx        ), ymm3, ymm4)
    vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
    vmovupd(ymm4, mem(rcx        ))
    vmovupd(ymm6, mem(rcx, rsi, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    // Row 4 (xmm12) spans both columns: gather its two c elements, which
    // are cs_c apart, into xmm0, compute xmm0 = beta*xmm0 + xmm12, then
    // scatter the two halves back.
    vmovlpd(mem(rdx        ), xmm0, xmm0)
    vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0)
    vfmadd213pd(xmm12, xmm3, xmm0)
    vmovlpd(xmm0, mem(rdx        ))
    vmovhpd(xmm0, mem(rdx, rsi, 1))

    //lea(mem(rdx, rsi, 4), rdx)

    jmp(.DDONE)                        // jump to end.

    label(.DBETAZERO)

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORBZ)                    // jump to column storage case

    label(.DROWSTORBZ)

    // beta == 0: store the alpha-scaled accumulators without reading c.
    vmovupd(xmm4, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(xmm6, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(xmm8, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(xmm10, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(xmm12, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.
label(.DCOLSTORBZ) // begin I/O on columns 0-1 vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) vunpcklpd(xmm10, xmm8, xmm2) vunpckhpd(xmm10, xmm8, xmm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) vmovupd(xmm12, xmm0) vmovlpd(xmm0, mem(rdx )) vmovhpd(xmm0, mem(rdx, rsi, 1)) //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rv_haswell_asm_4x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. 
// Register roles set up below: r8 / r9 = rs_a / cs_a in bytes, r13 = 3*rs_a,
// rbx = current row of b, r10 = rs_b in bytes, rcx = c, rdi = rs_c in bytes.
    mov(var(rs_a), r8)                 // load rs_a
    mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
    mov(var(rs_b), r10)                // load rs_b
    //mov(var(cs_b), r11)                // load cs_b
    lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    //lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds of allocated
    // memory (the likely result: a segmentation
    // fault).

    mov(var(c), rcx)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8, i.e. rs_c == 1.
    jz(.DCOLPFETCH)                    // jump to column storage case
    label(.DROWPFETCH)                 // row-stored prefetching on c

    lea(mem(rcx, rdi, 2), rdx)         //
    lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
    prefetch(0, mem(rdx, 1*8))         // prefetch c + 3*rs_c

    jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
    label(.DCOLPFETCH)                 // column-stored prefetching c

    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
    //lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c

    label(.DPOSTPFETCH)                // done prefetching c

    // rdx is reused from here on as the prefetch pointer into a.
#if 1
    lea(mem(rax, r9, 8), rdx)          //
    lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

    mov(var(k_iter), rsi)              // i = k_iter;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
                                       // contains the k_left loop.

    label(.DLOOPKITER)                 // MAIN LOOP

    // Each iteration loads two doubles of the current row of b into xmm0,
    // broadcasts the four elements of the current column of a one at a
    // time, and accumulates rows 0..3 of the product into xmm4, xmm6, xmm8
    // and xmm10 respectively.

    // ---------------------------------- iteration 0

#if 1
    prefetch(0, mem(rdx, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm8)
    vfmadd231pd(xmm0, xmm3, xmm10)

    // ---------------------------------- iteration 1

#if 0
    prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm8)
    vfmadd231pd(xmm0, xmm3, xmm10)

    // ---------------------------------- iteration 2

#if 1
    prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm8)
    vfmadd231pd(xmm0, xmm3, xmm10)

    // ---------------------------------- iteration 3

#if 1
    lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm8)
    vfmadd231pd(xmm0, xmm3, xmm10)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER)                   // iterate again if i != 0.

    label(.DCONSIDKLEFT)

    mov(var(k_left), rsi)              // i = k_left;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left loop.

    label(.DLOOPKLEFT)                 // EDGE LOOP

#if 0
    prefetch(0, mem(rdx, 5*8))
    add(r9, rdx)
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm8)
    vfmadd231pd(xmm0, xmm3, xmm10)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT)                   // iterate again if i != 0.

    label(.DPOSTACCUM)

    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
    vmulpd(xmm0, xmm6, xmm6)
    vmulpd(xmm0, xmm8, xmm8)
    vmulpd(xmm0, xmm10, xmm10)

    mov(var(cs_c), rsi)                // load cs_c
    lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

    //lea(mem(rcx, rsi, 4), rdx)         // load address of c + 4*cs_c;
    //lea(mem(rcx, rdi, 4), r14)         // load address of c + 4*rs_c;

    //lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

    // now avoid loading C if beta == 0

    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
    je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORED)                    // jump to column storage case

    label(.DROWSTORED)

    // One accumulator per row: c_row = beta * c_row + accumulator.
    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4)
    vmovupd(xmm4, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6)
    vmovupd(xmm6, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8)
    vmovupd(xmm8, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10)
    vmovupd(xmm10, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.
label(.DCOLSTORED) // begin I/O on columns 0-1 vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) vunpcklpd(xmm10, xmm8, xmm2) vunpckhpd(xmm10, xmm8, xmm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) vmovupd(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) // begin I/O on columns 0-1 vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) vunpcklpd(xmm10, xmm8, xmm2) vunpckhpd(xmm10, xmm8, xmm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rv_haswell_asm_3x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, 
double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    // 3x2 variant: accumulates k0 rank-1 updates into xmm4, xmm6 and xmm8
    // (one register per row of c), scales by alpha, and stores
    // beta*c + result through the row-stored path (general rs_c) or the
    // column-stored path (rs_c == 1).

    //void*    a_next = bli_auxinfo_next_a( data );
    //void*    b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    // The k dimension is split into a 4x-unrolled main loop (k_iter) and a
    // one-at-a-time cleanup loop (k_left).
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;

    uint64_t rs_a   = rs_a0;
    uint64_t cs_a   = cs_a0;
    uint64_t rs_b   = rs_b0;
    uint64_t cs_b   = cs_b0;
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    vzeroall()                         // zero all xmm/ymm registers.

    mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    //lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
    mov(var(rs_b), r10)                // load rs_b
    //mov(var(cs_b), r11)                // load cs_b
    lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    //lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds of allocated
    // memory (the likely result: a segmentation
    // fault).

    mov(var(c), rcx)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8, i.e. rs_c == 1.
    jz(.DCOLPFETCH)                    // jump to column storage case
    label(.DROWPFETCH)                 // row-stored prefetching on c

    //lea(mem(rcx, rdi, 2), rdx)         //
    //lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c

    jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
    label(.DCOLPFETCH)                 // column-stored prefetching c

    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
    //lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
    prefetch(0, mem(rcx, 2*8))         // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c

    label(.DPOSTPFETCH)                // done prefetching c

    // rdx is used from here until .DPOSTACCUM as the prefetch pointer into a.
#if 1
    lea(mem(rax, r9, 8), rdx)          //
    lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

    mov(var(k_iter), rsi)              // i = k_iter;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
                                       // contains the k_left loop.

    label(.DLOOPKITER)                 // MAIN LOOP

    // ---------------------------------- iteration 0

#if 1
    prefetch(0, mem(rdx, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm8)

    // ---------------------------------- iteration 1

#if 0
    prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm8)

    // ---------------------------------- iteration 2

#if 1
    prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm8)

    // ---------------------------------- iteration 3

#if 1
    lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm8)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER)                   // iterate again if i != 0.

    label(.DCONSIDKLEFT)

    mov(var(k_left), rsi)              // i = k_left;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left loop.

    label(.DLOOPKLEFT)                 // EDGE LOOP

#if 0
    prefetch(0, mem(rdx, 5*8))
    add(r9, rdx)
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm8)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT)                   // iterate again if i != 0.

    label(.DPOSTACCUM)

    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
    vmulpd(xmm0, xmm6, xmm6)
    vmulpd(xmm0, xmm8, xmm8)

    mov(var(cs_c), rsi)                // load cs_c
    lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

    //lea(mem(rcx, rsi, 4), rdx)         // load address of c + 4*cs_c;
    lea(mem(rcx, rdi, 2), rdx)         // load address of c + 2*rs_c;

    // 3*cs_c is never read by any store path of this kernel, so it is left
    // disabled here, as in the other dMx2 kernels in this file.
    //lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

    // now avoid loading C if beta == 0

    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
    je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORED)                    // jump to column storage case

    label(.DROWSTORED)

    // One accumulator per row: c_row = beta * c_row + accumulator.
    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4)
    vmovupd(xmm4, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6)
    vmovupd(xmm6, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8)
    vmovupd(xmm8, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DCOLSTORED)

    // begin I/O on columns 0-1
    // Interleave the row accumulators into per-column vectors. ymm10 has
    // not been written since vzeroall, so it supplies zeros for the
    // missing fourth row.
    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vunpcklpd(ymm10, ymm8, ymm2)
    vunpckhpd(ymm10, ymm8, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)

    // Pull row 2 of each column out of the upper lane for scalar I/O.
    vextractf128(imm(0x1), ymm4, xmm12)
    vextractf128(imm(0x1), ymm6, xmm13)

    // ymm3 was used as scratch by the unpack above, so reload beta.
    vbroadcastsd(mem(rbx), ymm3)

    // Rows 0-1 of columns 0 / 1.
    vfmadd231pd(mem(rcx        ), xmm3, xmm4)
    vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6)
    vmovupd(xmm4, mem(rcx        ))
    vmovupd(xmm6, mem(rcx, rsi, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    // Row 2 of columns 0 / 1, addressed through rdx = c + 2*rs_c.
    vfmadd231sd(mem(rdx        ), xmm3, xmm12)
    vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
    vmovsd(xmm12, mem(rdx        ))
    vmovsd(xmm13, mem(rdx, rsi, 1))

    //lea(mem(rdx, rsi, 4), rdx)

    jmp(.DDONE)                        // jump to end.

    label(.DBETAZERO)

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORBZ)                    // jump to column storage case

    label(.DROWSTORBZ)

    // beta == 0: store the alpha-scaled accumulators without reading c.
    vmovupd(xmm4, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(xmm6, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(xmm8, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.
label(.DCOLSTORBZ) // begin I/O on columns 0-1 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vextractf128(imm(0x1), ymm4, xmm12) vextractf128(imm(0x1), ymm6, xmm13) vmovupd(xmm4, mem(rcx )) vmovupd(xmm6, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) vmovsd(xmm12, mem(rdx )) vmovsd(xmm13, mem(rdx, rsi, 1)) //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) }

/*
   bli_dgemmsup_rv_haswell_asm_2x2

   Inline-assembly kernel that updates a 2x2 block of C as

     C := beta * C + alpha * A * B

   where A * B is accumulated over k0 rank-1 updates.

   Parameters read by the body:
   - k0:               number of rank-1 updates. Processed as k0 / 4
                       passes of a loop unrolled four times, followed by
                       k0 % 4 single steps.
   - alpha, beta:      pointers to the scalars.
   - a, rs_a0, cs_a0:  base address and row/column strides (in elements)
                       of A. Each step broadcasts one element from each
                       of the two rows and then advances by cs_a0.
   - b, rs_b0:         base address and row stride (in elements) of B.
                       Each step loads two contiguous doubles and then
                       advances by rs_b0.
   - c, rs_c0, cs_c0:  base address and strides (in elements) of C. When
                       rs_c0 == 1 the column-storage path is taken;
                       otherwise the row-storage path, which accesses two
                       contiguous doubles per row.

   conja, conjb, m0, n0, data and cntx are not referenced by the body (the
   a_next and b_next lookups through data are commented out). cs_b0 is
   copied to a local and passed as an asm operand, but the instruction
   that would load it is commented out.

   Register usage inside the asm block:
     rax = a (later: address of alpha)   r8  = rs_a in bytes
     rbx = b (later: address of beta)    r9  = cs_a in bytes
     rcx = c                             r10 = rs_b in bytes
     rdx = prefetch pointer into A       rdi = rs_c in bytes
     rsi = loop counter, later cs_c in bytes
     xmm4, xmm6 = accumulators for rows 0 and 1 of the result.
*/
void bli_dgemmsup_rv_haswell_asm_2x2
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void*    a_next = bli_auxinfo_next_a( data );
    //void*    b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4;   // trip count of the 4x-unrolled main loop
    uint64_t k_left = k0 % 4;   // trip count of the single-step edge loop
    uint64_t rs_a   = rs_a0;
    uint64_t cs_a   = cs_a0;
    uint64_t rs_b   = rs_b0;
    uint64_t cs_b   = cs_b0;    // only passed as an operand; never loaded below
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    vzeroall()                         // zero all xmm/ymm registers, so the
                                       // accumulators xmm4 and xmm6 start at 0.

    mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    //lea(mem(r8, r8, 2), r13)         // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15)         // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
    mov(var(rs_b), r10)                // load rs_b
    //mov(var(cs_b), r11)              // load cs_b
    lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    //lea(mem(, r11, 8), r11)          // cs_b *= sizeof(double)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iteration or the cleanup loop, result
    // in reading beyond the bounds of allocated memory
    // (the likely result: a segmentation fault).

    mov(var(c), rcx)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    // Choose the prefetch pattern for C according to its storage:
    // rs_c == 1 selects the column-storage case.
    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLPFETCH)                    // jump to column storage case
    label(.DROWPFETCH)                 // row-stored prefetching on c

    //lea(mem(rcx, rdi, 2), rdx)       //
    //lea(mem(rdx, rdi, 1), rdx)       // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c

    jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
    label(.DCOLPFETCH)                 // column-stored prefetching c

    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
    //lea(mem(rsi, rsi, 2), rbp)       // rbp = 3*cs_c;
    prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c

    label(.DPOSTPFETCH)                // done prefetching c

    // Compile-time toggle: set up rdx as a prefetch pointer that runs
    // ahead of the current position in A.
#if 1
    lea(mem(rax, r9, 8), rdx)          // rdx = a +  8*cs_a;
    lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

    mov(var(k_iter), rsi)              // i = k_iter;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
                                       // contains the k_left loop.

    label(.DLOOPKITER)                 // MAIN LOOP

    // Each iteration below performs one rank-1 update:
    //   xmm0 = two contiguous doubles of the current row of B,
    //   ymm2 / ymm3 = broadcast of the row-0 / row-1 element of A,
    //   xmm4 += xmm2 * xmm0;  xmm6 += xmm3 * xmm0.

    // ---------------------------------- iteration 0

#if 1
    prefetch(0, mem(rdx, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    // ---------------------------------- iteration 1

#if 0
    prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    // ---------------------------------- iteration 2

#if 1
    prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    // ---------------------------------- iteration 3

#if 1
    lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER)                   // iterate again if i != 0.

    label(.DCONSIDKLEFT)

    mov(var(k_left), rsi)              // i = k_left;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left loop.

    label(.DLOOPKLEFT)                 // EDGE LOOP

    // Prefetching of A is compiled out in the edge loop.
#if 0
    prefetch(0, mem(rdx, 5*8))
    add(r9, rdx)
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm4)
    vfmadd231pd(xmm0, xmm3, xmm6)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT)                   // iterate again if i != 0.

    label(.DPOSTACCUM)

    // rax and rbx are no longer needed as a/b pointers; reuse them.
    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
    vmulpd(xmm0, xmm6, xmm6)

    mov(var(cs_c), rsi)                // load cs_c
    lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

    //lea(mem(rcx, rsi, 4), rdx)       // load address of c + 4*cs_c;
    //lea(mem(rcx, rdi, 4), r14)       // load address of c + 4*rs_c;

    //lea(mem(rsi, rsi, 2), rax)       // rax = 3*cs_c;

    // now avoid loading C if beta == 0
    // NOTE(review): vucomisd also sets ZF when an operand is NaN
    // (unordered), so a NaN beta presumably takes the beta == 0 path
    // too -- confirm this is intended.

    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
    je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORED)                    // jump to column storage case

    label(.DROWSTORED)

    // Row storage: each accumulator maps to two contiguous doubles of
    // one row of C; compute beta*C + accumulator and store it back.
    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4)
    vmovupd(xmm4, mem(rcx, 0*32))
    add(rdi, rcx)                      // c += rs_c;

    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6)
    vmovupd(xmm6, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DCOLSTORED)

                                       // begin I/O on columns 0-1
    // Transpose the 2x2 result in registers:
    //   xmm0 = low  lanes of (xmm4, xmm6) = column 0,
    //   xmm1 = high lanes of (xmm4, xmm6) = column 1.
    vunpcklpd(xmm6, xmm4, xmm0)
    vunpckhpd(xmm6, xmm4, xmm1)

    // Add beta times the two columns of C (at c and c + cs_c) and
    // store the results back.
    vfmadd231pd(mem(rcx        ), xmm3, xmm0)
    vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1)
    vmovupd(xmm0, mem(rcx        ))
    vmovupd(xmm1, mem(rcx, rsi, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DBETAZERO)

    // beta == 0: store the scaled product without reading C.
    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORBZ)                    // jump to column storage case

    label(.DROWSTORBZ)

    vmovupd(xmm4, mem(rcx, 0*32))
    add(rdi, rcx)                      // c += rs_c;

    vmovupd(xmm6, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DCOLSTORBZ)

    // Same in-register transpose as in the beta != 0 column case.
    vunpcklpd(xmm6, xmm4, xmm0)
    vunpckhpd(xmm6, xmm4, xmm1)

    vmovupd(xmm0, mem(rcx        ))
    vmovupd(xmm1, mem(rcx, rsi, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    label(.DDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a]      "m" (a),
      [rs_a]   "m" (rs_a),
      [cs_a]   "m" (cs_a),
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
      [alpha]  "m" (alpha),
      [beta]   "m" (beta),
      [c]      "m" (c),
      [rs_c]   "m" (rs_c),
      [cs_c]   "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

/*
   bli_dgemmsup_rv_haswell_asm_1x2

   Inline-assembly kernel that updates a 1x2 block of C as

     C := beta * C + alpha * A * B

   where A * B is accumulated over k0 rank-1 updates.

   Parameters read by the body:
   - k0:               number of rank-1 updates. Processed as k0 / 4
                       passes of a loop unrolled four times, followed by
                       k0 % 4 single steps.
   - alpha, beta:      pointers to the scalars.
   - a, cs_a0:         base address and column stride (in elements) of
                       A. Each step broadcasts one element and then
                       advances by cs_a0. rs_a0 is loaded and scaled but
                       not used for addressing, since there is one row.
   - b, rs_b0:         base address and row stride (in elements) of B.
                       Each step loads two contiguous doubles and then
                       advances by rs_b0.
   - c, rs_c0, cs_c0:  base address and strides (in elements) of C. When
                       rs_c0 == 1 the column-storage path is taken, which
                       accesses the two results individually at c and
                       c + cs_c0; otherwise they are accessed as two
                       contiguous doubles at c.

   conja, conjb, m0, n0, data and cntx are not referenced by the body (the
   a_next and b_next lookups through data are commented out). cs_b0 is
   copied to a local and passed as an asm operand, but the instruction
   that would load it is commented out.

   Register usage inside the asm block:
     rax = a (later: address of alpha)   r8  = rs_a in bytes (unused)
     rbx = b (later: address of beta)    r9  = cs_a in bytes
     rcx = c                             r10 = rs_b in bytes
     rdx = prefetch pointer into A       rdi = rs_c in bytes
     rsi = loop counter, later cs_c in bytes
     xmm4 = accumulator for the single row of the result.
*/
void bli_dgemmsup_rv_haswell_asm_1x2
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void*    a_next = bli_auxinfo_next_a( data );
    //void*    b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4;   // trip count of the 4x-unrolled main loop
    uint64_t k_left = k0 % 4;   // trip count of the single-step edge loop
    uint64_t rs_a   = rs_a0;
    uint64_t cs_a   = cs_a0;
    uint64_t rs_b   = rs_b0;
    uint64_t cs_b   = cs_b0;    // only passed as an operand; never loaded below
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    vzeroall()                         // zero all xmm/ymm registers, so the
                                       // accumulator xmm4 starts at 0.

    mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    //lea(mem(r8, r8, 2), r13)         // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15)         // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
    mov(var(rs_b), r10)                // load rs_b
    //mov(var(cs_b), r11)              // load cs_b
    lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    //lea(mem(, r11, 8), r11)          // cs_b *= sizeof(double)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iteration or the cleanup loop, result
    // in reading beyond the bounds of allocated memory
    // (the likely result: a segmentation fault).

    mov(var(c), rcx)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    // Choose the prefetch pattern for C according to its storage:
    // rs_c == 1 selects the column-storage case.
    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLPFETCH)                    // jump to column storage case
    label(.DROWPFETCH)                 // row-stored prefetching on c

    //lea(mem(rcx, rdi, 2), rdx)       //
    //lea(mem(rdx, rdi, 1), rdx)       // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*rs_c

    jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
    label(.DCOLPFETCH)                 // column-stored prefetching c

    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
    //lea(mem(rsi, rsi, 2), rbp)       // rbp = 3*cs_c;
    prefetch(0, mem(rcx, 0*8))         // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c

    label(.DPOSTPFETCH)                // done prefetching c

    // Compile-time toggle: set up rdx as a prefetch pointer that runs
    // ahead of the current position in A.
#if 1
    lea(mem(rax, r9, 8), rdx)          // rdx = a +  8*cs_a;
    lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

    mov(var(k_iter), rsi)              // i = k_iter;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
                                       // contains the k_left loop.

    label(.DLOOPKITER)                 // MAIN LOOP

    // Each iteration below performs one rank-1 update:
    //   xmm0 = two contiguous doubles of the current row of B,
    //   ymm2 = broadcast of the current element of A,
    //   xmm4 += xmm2 * xmm0.

    // ---------------------------------- iteration 0

#if 1
    prefetch(0, mem(rdx, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm4)

    // ---------------------------------- iteration 1

#if 0
    prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm4)

    // ---------------------------------- iteration 2

#if 1
    prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm4)

    // ---------------------------------- iteration 3

#if 1
    lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm4)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER)                   // iterate again if i != 0.

    label(.DCONSIDKLEFT)

    mov(var(k_left), rsi)              // i = k_left;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left loop.

    label(.DLOOPKLEFT)                 // EDGE LOOP

    // Prefetching of A is compiled out in the edge loop.
#if 0
    prefetch(0, mem(rdx, 5*8))
    add(r9, rdx)
#endif

    vmovupd(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(xmm0, xmm2, xmm4)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT)                   // iterate again if i != 0.

    label(.DPOSTACCUM)

    // rax and rbx are no longer needed as a/b pointers; reuse them.
    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(xmm0, xmm4, xmm4)           // scale by alpha

    mov(var(cs_c), rsi)                // load cs_c
    lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

    //lea(mem(rcx, rsi, 4), rdx)       // load address of c + 4*cs_c;
    //lea(mem(rcx, rdi, 4), r14)       // load address of c + 4*rs_c;

    //lea(mem(rsi, rsi, 2), rax)       // rax = 3*cs_c;

    // now avoid loading C if beta == 0
    // NOTE(review): vucomisd also sets ZF when an operand is NaN
    // (unordered), so a NaN beta presumably takes the beta == 0 path
    // too -- confirm this is intended.

    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
    je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORED)                    // jump to column storage case

    label(.DROWSTORED)

    // Row storage: xmm4 maps to two contiguous doubles of C; compute
    // beta*C + xmm4 and store it back.
    vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4)
    vmovupd(xmm4, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DCOLSTORED)

                                       // begin I/O on columns 0-1
    // Gather the two elements of C (at c and c + cs_c) into the low
    // and high lanes of xmm0.
    vmovlpd(mem(rcx        ), xmm0, xmm0)
    vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)

    vfmadd213pd(xmm4, xmm3, xmm0)      // xmm0 = beta*xmm0 + xmm4

    // Scatter the two results back to their strided locations.
    vmovlpd(xmm0, mem(rcx        ))
    vmovhpd(xmm0, mem(rcx, rsi, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DBETAZERO)

    // beta == 0: store the scaled product without reading C.
    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORBZ)                    // jump to column storage case

    label(.DROWSTORBZ)

    vmovupd(xmm4, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DCOLSTORBZ)

                                       // begin I/O on columns 0-1
    // Scatter the low/high lanes of xmm4 to c and c + cs_c.
    vmovlpd(xmm4, mem(rcx        ))
    vmovhpd(xmm4, mem(rcx, rsi, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    label(.DDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a]      "m" (a),
      [rs_a]   "m" (rs_a),
      [cs_a]   "m" (cs_a),
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
      [alpha]  "m" (alpha),
      [beta]   "m" (beta),
      [c]      "m" (c),
      [rs_c]   "m" (rs_c),
      [cs_c]   "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}
cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c000066400000000000000000002116531427272030600311430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... -------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */ // Prototype reference microkernels. 
GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )

/*
   bli_dgemmsup_rv_haswell_asm_6x4

   Inline-assembly kernel that updates a 6x4 block of C as

     C := beta * C + alpha * A * B

   where A * B is accumulated over k0 rank-1 updates.

   Parameters read by the body:
   - k0:               number of rank-1 updates. Processed as k0 / 4
                       passes of a loop unrolled four times, followed by
                       k0 % 4 single steps.
   - alpha, beta:      pointers to the scalars.
   - a, rs_a0, cs_a0:  base address and row/column strides (in elements)
                       of A. Each step broadcasts one element from each
                       of the six rows and then advances by cs_a0.
   - b, rs_b0:         base address and row stride (in elements) of B.
                       Each step loads four contiguous doubles and then
                       advances by rs_b0.
   - c, rs_c0, cs_c0:  base address and strides (in elements) of C. When
                       rs_c0 == 1 the column-storage path is taken, which
                       transposes the result in registers before the
                       update; otherwise the row-storage path, which
                       accesses four contiguous doubles per row.

   conja, conjb, m0, n0, data and cntx are not referenced by the body (the
   a_next and b_next lookups through data are commented out). cs_b0 is
   copied to a local and passed as an asm operand, but the instruction
   that would load it is commented out.

   Register usage inside the asm block:
     rax = a (later: address of alpha, then 3*cs_c in bytes)
     rbx = b (later: address of beta)
     rcx = c
     rdx = prefetch pointer (into C, then into A); later c + 4*rs_c
     rsi = loop counter, and cs_c in bytes before and after the loops
     rdi = rs_c in bytes          rbp = 3*cs_c in bytes (column prefetch)
     r8  = rs_a in bytes          r13 = 3*rs_a in bytes
     r9  = cs_a in bytes          r15 = 5*rs_a in bytes
     r10 = rs_b in bytes
     ymm4, ymm6, ymm8, ymm10, ymm12, ymm14 = accumulators for rows 0-5
     of the result.
*/
void bli_dgemmsup_rv_haswell_asm_6x4
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void*    a_next = bli_auxinfo_next_a( data );
    //void*    b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4;   // trip count of the 4x-unrolled main loop
    uint64_t k_left = k0 % 4;   // trip count of the single-step edge loop
    uint64_t rs_a   = rs_a0;
    uint64_t cs_a   = cs_a0;
    uint64_t rs_b   = rs_b0;
    uint64_t cs_b   = cs_b0;    // only passed as an operand; never loaded below
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    vzeroall()                         // zero all xmm/ymm registers, so the
                                       // six accumulators start at 0.

    mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    // Rows 3 and 5 of A cannot be addressed with a scale factor of
    // 1, 2, 4 or 8 on r8, so their offsets are precomputed.
    lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
    mov(var(rs_b), r10)                // load rs_b
    //mov(var(cs_b), r11)              // load cs_b
    lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    //lea(mem(, r11, 8), r11)          // cs_b *= sizeof(double)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iteration or the cleanup loop, result
    // in reading beyond the bounds of allocated memory
    // (the likely result: a segmentation fault).

    mov(var(c), rcx)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    // Choose the prefetch pattern for C according to its storage:
    // rs_c == 1 selects the column-storage case.
    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLPFETCH)                    // jump to column storage case
    label(.DROWPFETCH)                 // row-stored prefetching on c

    lea(mem(rcx, rdi, 2), rdx)         // rdx = c + 2*rs_c;
    lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
    prefetch(0, mem(rdx, 3*8))         // prefetch c + 3*rs_c
    prefetch(0, mem(rdx, rdi, 1, 3*8)) // prefetch c + 4*rs_c
    prefetch(0, mem(rdx, rdi, 2, 3*8)) // prefetch c + 5*rs_c

    jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
    label(.DCOLPFETCH)                 // column-stored prefetching c

    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
    lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
    prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(rcx, rsi, 2, 5*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(rcx, rbp, 1, 5*8)) // prefetch c + 3*cs_c

    label(.DPOSTPFETCH)                // done prefetching c

    // Compile-time toggle: set up rdx as a prefetch pointer that runs
    // ahead of the current position in A.
#if 1
    lea(mem(rax, r9, 8), rdx)          // rdx = a +  8*cs_a;
    lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

    mov(var(k_iter), rsi)              // i = k_iter;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
                                       // contains the k_left loop.

    label(.DLOOPKITER)                 // MAIN LOOP

    // Each iteration below performs one rank-1 update:
    //   ymm0 = four contiguous doubles of the current row of B,
    //   ymm2 / ymm3 = broadcasts of two elements of the current column
    //   of A (rows 0/1, then 2/3, then 4/5), each multiplied by ymm0
    //   and added into the accumulator of its row.

    // ---------------------------------- iteration 0

#if 1
    prefetch(0, mem(rdx, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm0, ymm3, ymm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm12)
    vfmadd231pd(ymm0, ymm3, ymm14)

    // ---------------------------------- iteration 1

#if 0
    prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm0, ymm3, ymm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm12)
    vfmadd231pd(ymm0, ymm3, ymm14)

    // ---------------------------------- iteration 2

#if 1
    prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm0, ymm3, ymm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm12)
    vfmadd231pd(ymm0, ymm3, ymm14)

    // ---------------------------------- iteration 3

#if 1
    lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm0, ymm3, ymm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm12)
    vfmadd231pd(ymm0, ymm3, ymm14)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER)                   // iterate again if i != 0.

    label(.DCONSIDKLEFT)

    mov(var(k_left), rsi)              // i = k_left;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left loop.

    label(.DLOOPKLEFT)                 // EDGE LOOP

    // Prefetching of A is compiled out in the edge loop.
#if 0
    prefetch(0, mem(rdx, 5*8))
    add(r9, rdx)
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax        ), ymm2)
    vbroadcastsd(mem(rax, r8,  1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm0, ymm3, ymm6)

    vbroadcastsd(mem(rax, r8,  2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vbroadcastsd(mem(rax, r8,  4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm12)
    vfmadd231pd(ymm0, ymm3, ymm14)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT)                   // iterate again if i != 0.

    label(.DPOSTACCUM)

    // rax and rbx are no longer needed as a/b pointers; reuse them.
    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
    vmulpd(ymm0, ymm6, ymm6)
    vmulpd(ymm0, ymm8, ymm8)
    vmulpd(ymm0, ymm10, ymm10)
    vmulpd(ymm0, ymm12, ymm12)
    vmulpd(ymm0, ymm14, ymm14)

    mov(var(cs_c), rsi)                // load cs_c
    lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

    //lea(mem(rcx, rsi, 4), rdx)       // load address of c + 4*cs_c;
    lea(mem(rcx, rdi, 4), rdx)         // load address of c + 4*rs_c;

    lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

    // now avoid loading C if beta == 0
    // NOTE(review): vucomisd also sets ZF when an operand is NaN
    // (unordered), so a NaN beta presumably takes the beta == 0 path
    // too -- confirm this is intended.

    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
    je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORED)                    // jump to column storage case

    label(.DROWSTORED)

    // Row storage: each accumulator maps to four contiguous doubles of
    // one row of C; compute beta*C + accumulator and store it back,
    // stepping rcx by rs_c between rows.
    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
    vmovupd(ymm4, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
    vmovupd(ymm6, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
    vmovupd(ymm8, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
    vmovupd(ymm10, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
    vmovupd(ymm12, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14)
    vmovupd(ymm14, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DCOLSTORED)

                                       // begin I/O on columns 0-3
    // Transpose rows 0-3 (ymm4, ymm6, ymm8, ymm10) in registers so that
    // afterwards ymm4, ymm6, ymm8, ymm10 hold columns 0, 1, 2, 3 of
    // those four rows. ymm0-ymm3 are used as temporaries.
    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vunpcklpd(ymm10, ymm8, ymm2)
    vunpckhpd(ymm10, ymm8, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

    // ymm3 was overwritten by the transpose above; reload beta.
    vbroadcastsd(mem(rbx), ymm3)

    // Update rows 0-3 of columns 0-3 (at c + j*cs_c).
    vfmadd231pd(mem(rcx        ), ymm3, ymm4)
    vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
    vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
    vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
    vmovupd(ymm4, mem(rcx        ))
    vmovupd(ymm6, mem(rcx, rsi, 1))
    vmovupd(ymm8, mem(rcx, rsi, 2))
    vmovupd(ymm10, mem(rcx, rax, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    // Transpose rows 4-5 (ymm12, ymm14) into four two-element column
    // pieces: xmm0, xmm1, xmm2, xmm4 = columns 0, 1, 2, 3.
    vunpcklpd(ymm14, ymm12, ymm0)
    vunpckhpd(ymm14, ymm12, ymm1)
    vextractf128(imm(0x1), ymm0, xmm2)
    vextractf128(imm(0x1), ymm1, xmm4)

    // Update rows 4-5 of columns 0-3 (at rdx + j*cs_c, where
    // rdx = c + 4*rs_c).
    vfmadd231pd(mem(rdx        ), xmm3, xmm0)
    vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
    vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
    vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
    vmovupd(xmm0, mem(rdx        ))
    vmovupd(xmm1, mem(rdx, rsi, 1))
    vmovupd(xmm2, mem(rdx, rsi, 2))
    vmovupd(xmm4, mem(rdx, rax, 1))

    //lea(mem(rdx, rsi, 4), rdx)

    jmp(.DDONE)                        // jump to end.

    label(.DBETAZERO)

    // beta == 0: store the scaled product without reading C.
    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORBZ)                    // jump to column storage case

    label(.DROWSTORBZ)

    vmovupd(ymm4, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(ymm6, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(ymm8, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(ymm10, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(ymm12, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovupd(ymm14, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DCOLSTORBZ)

                                       // begin I/O on columns 0-3
    // Same in-register transpose of rows 0-3 as in the beta != 0
    // column case, followed by plain stores.
    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vunpcklpd(ymm10, ymm8, ymm2)
    vunpckhpd(ymm10, ymm8, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

    vmovupd(ymm4, mem(rcx        ))
    vmovupd(ymm6, mem(rcx, rsi, 1))
    vmovupd(ymm8, mem(rcx, rsi, 2))
    vmovupd(ymm10, mem(rcx, rax, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    // Rows 4-5: transpose into two-element column pieces and store
    // them at rdx + j*cs_c.
    vunpcklpd(ymm14, ymm12, ymm0)
    vunpckhpd(ymm14, ymm12, ymm1)
    vextractf128(imm(0x1), ymm0, xmm2)
    vextractf128(imm(0x1), ymm1, xmm4)

    vmovupd(xmm0, mem(rdx        ))
    vmovupd(xmm1, mem(rdx, rsi, 1))
    vmovupd(xmm2, mem(rdx, rsi, 2))
    vmovupd(xmm4, mem(rdx, rax, 1))

    //lea(mem(rdx, rsi, 4), rdx)

    label(.DDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a]      "m" (a),
      [rs_a]   "m" (rs_a),
      [cs_a]   "m" (cs_a),
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
      [alpha]  "m" (alpha),
      [beta]   "m" (beta),
      [c]      "m" (c),
      [rs_c]   "m" (rs_c),
      [cs_c]   "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}
void bli_dgemmsup_rv_haswell_asm_5x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double*
restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 3*8)) // prefetch c + 4*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 4*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) 
vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm12, ymm12) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORED) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vmovlpd(mem(rdx ), xmm0, xmm0) vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(ymm12, ymm3, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx )) vmovhpd(xmm0, mem(rdx, rsi, 1)) vmovlpd(xmm1, mem(rdx, rsi, 2)) vmovhpd(xmm1, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) vmovupd(ymm12, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ)

	// begin I/O on columns 0-3
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vunpcklpd(ymm10, ymm8, ymm2)
	vunpckhpd(ymm10, ymm8, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

	vmovupd(ymm4, mem(rcx ))
	vmovupd(ymm6, mem(rcx, rsi, 1))
	vmovupd(ymm8, mem(rcx, rsi, 2))
	vmovupd(ymm10, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	vmovupd(ymm12, ymm0)
	vextractf128(imm(1), ymm0, xmm1)
	vmovlpd(xmm0, mem(rdx ))
	vmovhpd(xmm0, mem(rdx, rsi, 1))
	vmovlpd(xmm1, mem(rdx, rsi, 2))
	vmovhpd(xmm1, mem(rdx, rax, 1))

	//lea(mem(rdx, rsi, 4), rdx)

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

// bli_dgemmsup_rv_haswell_asm_4x4
//
// Inline-assembly kernel that updates a tile of four rows by four columns
// of doubles:
//
//   C := beta * C + alpha * A * B
//
// The product is accumulated over k0 steps. Each step loads four
// contiguous doubles from b into ymm0, broadcasts one element of a per
// row (byte offsets 0, rs_a, 2*rs_a and 3*rs_a from the current a
// pointer) and FMAs into the row accumulators ymm4, ymm6, ymm8 and ymm10.
// After each step a advances by cs_a and b advances by rs_b.
//
// The k loop is unrolled by four (k_iter = k0 / 4 passes), followed by a
// one-step-at-a-time remainder loop (k_left = k0 % 4 passes).
//
// Output handling:
//  - beta == 0: C is never read; the alpha-scaled result is stored as-is.
//  - rs_c == 1: C is treated as column-stored; the tile is transposed in
//    registers and each column is accessed at stride cs_c.
//  - otherwise: each row is accessed as four contiguous doubles at
//    stride rs_c.
//    NOTE(review): this path never uses cs_c, so it presumably relies on
//    cs_c == 1 -- confirm against the callers.
//
// conja, conjb, m0, n0, data and cntx are not referenced in this
// function. cs_b0 is copied and passed as an asm operand, but no active
// instruction reads it; b is always loaded as four contiguous doubles.
void bli_dgemmsup_rv_haswell_asm_4x4
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	// The accumulators (ymm4, ymm6, ymm8, ymm10) start at zero.
	vzeroall() // zero all xmm/ymm registers.

	mov(var(a), rax) // load address of a.
	mov(var(rs_a), r8) // load rs_a
	mov(var(cs_a), r9) // load cs_a
	lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
	lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

	lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

	mov(var(b), rbx) // load address of b.
	mov(var(rs_b), r10) // load rs_b
	//mov(var(cs_b), r11) // load cs_b
	lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), rcx) // load address of c
	mov(var(rs_c), rdi) // load rs_c
	lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

	// Prefetch the C tile; which addresses are touched depends on whether
	// C is column-stored (rs_c == 1) or not.
	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLPFETCH) // jump to column storage case
	label(.DROWPFETCH) // row-stored prefetching on c

	lea(mem(rcx, rdi, 2), rdx) //
	lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
	prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*rs_c

	jmp(.DPOSTPFETCH) // jump to end of prefetching c
	label(.DCOLPFETCH) // column-stored prefetching c

	mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
	lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c;
	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rcx, rbp, 1, 3*8)) // prefetch c + 3*cs_c

	label(.DPOSTPFETCH) // done prefetching c

	// rdx becomes the prefetch pointer into a, 16 columns ahead of rax.
#if 1
	lea(mem(rax, r9, 8), rdx) //
	lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a;
#endif

	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKLEFT) // if i == 0, jump to code that
	// contains the k_left loop.

	label(.DLOOPKITER) // MAIN LOOP

	// ---------------------------------- iteration 0

#if 1
	prefetch(0, mem(rdx, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm0, ymm3, ymm10)

	// ---------------------------------- iteration 1

	// (prefetch disabled for this iteration)
#if 0
	prefetch(0, mem(rdx, r9, 1, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm0, ymm3, ymm10)

	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(rdx, r9, 2, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm0, ymm3, ymm10)

	// ---------------------------------- iteration 3

#if 1
	lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a;
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm0, ymm3, ymm10)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER) // iterate again if i != 0.

	label(.DCONSIDKLEFT)

	mov(var(k_left), rsi) // i = k_left;
	test(rsi, rsi) // check i via logical AND.
	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
	// else, we prepare to enter k_left loop.

	label(.DLOOPKLEFT) // EDGE LOOP

	// (prefetch disabled in the remainder loop)
#if 0
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm0, ymm3, ymm10)

	dec(rsi) // i -= 1;
	jne(.DLOOPKLEFT) // iterate again if i != 0.

	label(.DPOSTACCUM)

	// rax and rbx are reused here; a and b are no longer needed.
	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4) // scale by alpha
	vmulpd(ymm0, ymm6, ymm6)
	vmulpd(ymm0, ymm8, ymm8)
	vmulpd(ymm0, ymm10, ymm10)

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)

	//lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c;
	//lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c;

	lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm3) // set ZF if beta == 0.
	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORED) // jump to column storage case

	label(.DROWSTORED)

	// One row per step: accumulator += beta * C row, then store it back.
	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
	vmovupd(ymm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
	vmovupd(ymm8, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
	vmovupd(ymm10, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DCOLSTORED)

	// begin I/O on columns 0-3
	// Transpose the tile in registers so that ymm4, ymm6, ymm8 and ymm10
	// each end up holding one column.
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vunpcklpd(ymm10, ymm8, ymm2)
	vunpckhpd(ymm10, ymm8, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

	// ymm3 was used as scratch by the transpose, so beta is reloaded.
	vbroadcastsd(mem(rbx), ymm3)

	vfmadd231pd(mem(rcx ), ymm3, ymm4)
	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
	vmovupd(ymm4, mem(rcx ))
	vmovupd(ymm6, mem(rcx, rsi, 1))
	vmovupd(ymm8, mem(rcx, rsi, 2))
	vmovupd(ymm10, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	jmp(.DDONE) // jump to end.

	label(.DBETAZERO)

	// beta == 0: C is only written, never read.
	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORBZ) // jump to column storage case

	label(.DROWSTORBZ)

	vmovupd(ymm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovupd(ymm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovupd(ymm8, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovupd(ymm10, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DCOLSTORBZ)

	// begin I/O on columns 0-3
	// Same in-register transpose as the beta != 0 column-stored path.
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vunpcklpd(ymm10, ymm8, ymm2)
	vunpckhpd(ymm10, ymm8, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

	vmovupd(ymm4, mem(rcx ))
	vmovupd(ymm6, mem(rcx, rsi, 1))
	vmovupd(ymm8, mem(rcx, rsi, 2))
	vmovupd(ymm10, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

// bli_dgemmsup_rv_haswell_asm_3x4
//
// Inline-assembly kernel that updates a tile of three rows by four
// columns of doubles:
//
//   C := beta * C + alpha * A * B
//
// Each of the k0 steps loads four contiguous doubles from b into ymm0,
// broadcasts one element of a per row (byte offsets 0, rs_a and 2*rs_a
// from the current a pointer) and FMAs into the row accumulators ymm4,
// ymm6 and ymm8. After each step a advances by cs_a and b by rs_b.
//
// The k loop is unrolled by four (k_iter = k0 / 4 passes), followed by a
// one-step-at-a-time remainder loop (k_left = k0 % 4 passes).
//
// Output handling:
//  - beta == 0: C is never read; the alpha-scaled result is stored as-is.
//  - rs_c == 1: C is treated as column-stored. The tile is transposed in
//    registers; rows 0-1 of each column are accessed as a 128-bit pair
//    and row 2 as a single double through rdx = c + 2*rs_c.
//  - otherwise: each row is accessed as four contiguous doubles at
//    stride rs_c.
//    NOTE(review): this path never uses cs_c, so it presumably relies on
//    cs_c == 1 -- confirm against the callers.
//
// conja, conjb, m0, n0, data and cntx are not referenced in this
// function. cs_b0 is copied and passed as an asm operand, but no active
// instruction reads it; b is always loaded as four contiguous doubles.
void bli_dgemmsup_rv_haswell_asm_3x4
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	// The accumulators (ymm4, ymm6, ymm8) start at zero. ymm10 is never
	// accumulated into, so it is still zero when the column-stored paths
	// use it as the fourth input row of the transpose.
	vzeroall() // zero all xmm/ymm registers.

	mov(var(a), rax) // load address of a.
	mov(var(rs_a), r8) // load rs_a
	mov(var(cs_a), r9) // load cs_a
	lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
	lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

	mov(var(b), rbx) // load address of b.
	mov(var(rs_b), r10) // load rs_b
	//mov(var(cs_b), r11) // load cs_b
	lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), rcx) // load address of c
	mov(var(rs_c), rdi) // load rs_c
	lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

	// Prefetch the C tile; which addresses are touched depends on whether
	// C is column-stored (rs_c == 1) or not.
	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLPFETCH) // jump to column storage case
	label(.DROWPFETCH) // row-stored prefetching on c

	//lea(mem(rcx, rdi, 2), rdx) //
	//lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c

	jmp(.DPOSTPFETCH) // jump to end of prefetching c
	label(.DCOLPFETCH) // column-stored prefetching c

	mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
	lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c;
	prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rcx, rbp, 1, 2*8)) // prefetch c + 3*cs_c

	label(.DPOSTPFETCH) // done prefetching c

	// rdx becomes the prefetch pointer into a, 16 columns ahead of rax.
#if 1
	lea(mem(rax, r9, 8), rdx) //
	lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a;
#endif

	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKLEFT) // if i == 0, jump to code that
	// contains the k_left loop.

	label(.DLOOPKITER) // MAIN LOOP

	// ---------------------------------- iteration 0

#if 1
	prefetch(0, mem(rdx, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)

	// ---------------------------------- iteration 1

	// (prefetch disabled for this iteration)
#if 0
	prefetch(0, mem(rdx, r9, 1, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)

	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(rdx, r9, 2, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)

	// ---------------------------------- iteration 3

#if 1
	lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a;
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER) // iterate again if i != 0.

	label(.DCONSIDKLEFT)

	mov(var(k_left), rsi) // i = k_left;
	test(rsi, rsi) // check i via logical AND.
	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
	// else, we prepare to enter k_left loop.

	label(.DLOOPKLEFT) // EDGE LOOP

	// (prefetch disabled in the remainder loop)
#if 0
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)

	dec(rsi) // i -= 1;
	jne(.DLOOPKLEFT) // iterate again if i != 0.

	label(.DPOSTACCUM)

	// rax and rbx are reused here; a and b are no longer needed.
	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4) // scale by alpha
	vmulpd(ymm0, ymm6, ymm6)
	vmulpd(ymm0, ymm8, ymm8)

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)

	// rdx addresses row 2 of C; it is only used by the column-stored
	// paths below.
	//lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c;
	lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c;

	lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm3) // set ZF if beta == 0.
	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORED) // jump to column storage case

	label(.DROWSTORED)

	// One row per step: accumulator += beta * C row, then store it back.
	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
	vmovupd(ymm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
	vmovupd(ymm8, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DCOLSTORED)

	// begin I/O on columns 0-3
	// Transpose (ymm10 supplies a zero fourth row) so that ymm4, ymm6,
	// ymm8 and ymm10 each hold one column.
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vunpcklpd(ymm10, ymm8, ymm2)
	vunpckhpd(ymm10, ymm8, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

	// The upper 128 bits of each column hold row 2 in their low element;
	// move them to xmm12-xmm15 for the scalar update below.
	vextractf128(imm(0x1), ymm4, xmm12)
	vextractf128(imm(0x1), ymm6, xmm13)
	vextractf128(imm(0x1), ymm8, xmm14)
	vextractf128(imm(0x1), ymm10, xmm15)

	// ymm3 was used as scratch by the transpose, so beta is reloaded.
	vbroadcastsd(mem(rbx), ymm3)

	// Rows 0-1 of each column (128-bit accesses).
	vfmadd231pd(mem(rcx ), xmm3, xmm4)
	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6)
	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm8)
	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm10)
	vmovupd(xmm4, mem(rcx ))
	vmovupd(xmm6, mem(rcx, rsi, 1))
	vmovupd(xmm8, mem(rcx, rsi, 2))
	vmovupd(xmm10, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	// Row 2 of each column (scalar accesses).
	vfmadd231sd(mem(rdx ), xmm3, xmm12)
	vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
	vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14)
	vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15)
	vmovsd(xmm12, mem(rdx ))
	vmovsd(xmm13, mem(rdx, rsi, 1))
	vmovsd(xmm14, mem(rdx, rsi, 2))
	vmovsd(xmm15, mem(rdx, rax, 1))

	//lea(mem(rdx, rsi, 4), rdx)

	jmp(.DDONE) // jump to end.

	label(.DBETAZERO)

	// beta == 0: C is only written, never read.
	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORBZ) // jump to column storage case

	label(.DROWSTORBZ)

	vmovupd(ymm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovupd(ymm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovupd(ymm8, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DCOLSTORBZ)

	// begin I/O on columns 0-3
	// Same in-register transpose as the beta != 0 column-stored path.
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vunpcklpd(ymm10, ymm8, ymm2)
	vunpckhpd(ymm10, ymm8, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

	vextractf128(imm(0x1), ymm4, xmm12)
	vextractf128(imm(0x1), ymm6, xmm13)
	vextractf128(imm(0x1), ymm8, xmm14)
	vextractf128(imm(0x1), ymm10, xmm15)

	vmovupd(xmm4, mem(rcx ))
	vmovupd(xmm6, mem(rcx, rsi, 1))
	vmovupd(xmm8, mem(rcx, rsi, 2))
	vmovupd(xmm10, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	vmovsd(xmm12, mem(rdx ))
	vmovsd(xmm13, mem(rdx, rsi, 1))
	vmovsd(xmm14, mem(rdx, rsi, 2))
	vmovsd(xmm15, mem(rdx, rax, 1))

	//lea(mem(rdx, rsi, 4), rdx)

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

// bli_dgemmsup_rv_haswell_asm_2x4
//
// Inline-assembly kernel that updates a tile of two rows by four columns
// of doubles:
//
//   C := beta * C + alpha * A * B
//
// Each of the k0 steps loads four contiguous doubles from b into ymm0,
// broadcasts one element of a per row (byte offsets 0 and rs_a from the
// current a pointer) and FMAs into the row accumulators ymm4 and ymm6.
// After each step a advances by cs_a and b by rs_b.
//
// The k loop is unrolled by four (k_iter = k0 / 4 passes), followed by a
// one-step-at-a-time remainder loop (k_left = k0 % 4 passes).
//
// Output handling:
//  - beta == 0: C is never read; the alpha-scaled result is stored as-is.
//  - rs_c == 1: C is treated as column-stored. The two rows are
//    interleaved into four 128-bit column pairs, each accessed at stride
//    cs_c.
//  - otherwise: each row is accessed as four contiguous doubles at
//    stride rs_c.
//    NOTE(review): this path never uses cs_c, so it presumably relies on
//    cs_c == 1 -- confirm against the callers.
//
// conja, conjb, m0, n0, data and cntx are not referenced in this
// function. cs_b0 is copied and passed as an asm operand, but no active
// instruction reads it; b is always loaded as four contiguous doubles.
void bli_dgemmsup_rv_haswell_asm_2x4
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	// The accumulators (ymm4, ymm6) start at zero.
	vzeroall() // zero all xmm/ymm registers.

	mov(var(a), rax) // load address of a.
	mov(var(rs_a), r8) // load rs_a
	mov(var(cs_a), r9) // load cs_a
	lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
	lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

	mov(var(b), rbx) // load address of b.
	mov(var(rs_b), r10) // load rs_b
	//mov(var(cs_b), r11) // load cs_b
	lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), rcx) // load address of c
	mov(var(rs_c), rdi) // load rs_c
	lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

	// Prefetch the C tile; which addresses are touched depends on whether
	// C is column-stored (rs_c == 1) or not.
	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLPFETCH) // jump to column storage case
	label(.DROWPFETCH) // row-stored prefetching on c

	//lea(mem(rcx, rdi, 2), rdx) //
	//lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c

	jmp(.DPOSTPFETCH) // jump to end of prefetching c
	label(.DCOLPFETCH) // column-stored prefetching c

	mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
	lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c;
	prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rcx, rbp, 1, 1*8)) // prefetch c + 3*cs_c

	label(.DPOSTPFETCH) // done prefetching c

	// rdx becomes the prefetch pointer into a, 16 columns ahead of rax.
#if 1
	lea(mem(rax, r9, 8), rdx) //
	lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a;
#endif

	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKLEFT) // if i == 0, jump to code that
	// contains the k_left loop.

	label(.DLOOPKITER) // MAIN LOOP

	// ---------------------------------- iteration 0

#if 1
	prefetch(0, mem(rdx, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	// ---------------------------------- iteration 1

	// (prefetch disabled for this iteration)
#if 0
	prefetch(0, mem(rdx, r9, 1, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(rdx, r9, 2, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	// ---------------------------------- iteration 3

#if 1
	lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a;
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER) // iterate again if i != 0.

	label(.DCONSIDKLEFT)

	mov(var(k_left), rsi) // i = k_left;
	test(rsi, rsi) // check i via logical AND.
	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
	// else, we prepare to enter k_left loop.

	label(.DLOOPKLEFT) // EDGE LOOP

	// (prefetch disabled in the remainder loop)
#if 0
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	dec(rsi) // i -= 1;
	jne(.DLOOPKLEFT) // iterate again if i != 0.

	label(.DPOSTACCUM)

	// rax and rbx are reused here; a and b are no longer needed.
	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4) // scale by alpha
	vmulpd(ymm0, ymm6, ymm6)

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)

	//lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c;
	//lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c;

	lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm3) // set ZF if beta == 0.
	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORED) // jump to column storage case

	label(.DROWSTORED)

	// One row per step: accumulator += beta * C row, then store it back.
	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
	vmovupd(ymm6, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DCOLSTORED)

	// begin I/O on columns 0-3
	// Interleave the two rows: xmm0, xmm1, xmm2 and xmm4 end up holding
	// the row 0/1 pair of columns 0, 1, 2 and 3 respectively.
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vextractf128(imm(0x1), ymm0, xmm2)
	vextractf128(imm(0x1), ymm1, xmm4)

	// xmm3 still holds beta here; nothing above overwrote ymm3.
	vfmadd231pd(mem(rcx ), xmm3, xmm0)
	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1)
	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm2)
	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm4)
	vmovupd(xmm0, mem(rcx ))
	vmovupd(xmm1, mem(rcx, rsi, 1))
	vmovupd(xmm2, mem(rcx, rsi, 2))
	vmovupd(xmm4, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	jmp(.DDONE) // jump to end.

	label(.DBETAZERO)

	// beta == 0: C is only written, never read.
	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORBZ) // jump to column storage case

	label(.DROWSTORBZ)

	vmovupd(ymm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovupd(ymm6, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DCOLSTORBZ)

	// begin I/O on columns 0-3
	// Same row interleave as the beta != 0 column-stored path.
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vextractf128(imm(0x1), ymm0, xmm2)
	vextractf128(imm(0x1), ymm1, xmm4)

	vmovupd(xmm0, mem(rcx ))
	vmovupd(xmm1, mem(rcx, rsi, 1))
	vmovupd(xmm2, mem(rcx, rsi, 2))
	vmovupd(xmm4, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

void bli_dgemmsup_rv_haswell_asm_1x4
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall() // zero all xmm/ymm registers.

	mov(var(a), rax) // load address of a.
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 0*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORED) // begin I/O on columns 0-3 vmovlpd(mem(rcx ), xmm0, xmm0) vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(ymm4, ymm3, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rcx )) vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ) // begin I/O on columns 0-3 vmovupd(ymm4, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rcx )) vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c000066400000000000000000002455111427272030600311450ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... -------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */ // Prototype reference microkernels. 
GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )

/*
   bli_dgemmsup_rv_haswell_asm_6x6

   Double-precision gemmsup microkernel implemented as a single inline
   assembly block. It updates a 6x6 block of C:

     C := beta * C + alpha * A * B

   Per k step, six contiguous doubles of B are loaded (four into ymm0, two
   into xmm1; b then advances by rs_b) and six scalars of A are broadcast,
   one per row (read at a + i*rs_a for i = 0..5; a then advances by cs_a).

   Accumulator layout: row i of the product is held in ymm(4+2i) for
   columns 0-3 and in the low 128 bits of ymm(5+2i) for columns 4-5, i.e.
   ymm4/ymm5 for row 0 through ymm14/ymm15 for row 5.

   Parameters, as used by the code below:
   - k0:           inner dimension; handled as k0 / 4 passes of a four-way
                   unrolled loop followed by k0 % 4 single-step passes.
   - alpha, beta:  pointers to the scalars; each is dereferenced after the
                   accumulation loops. When the comparison of beta against
                   zero sets ZF, C is stored without being loaded.
   - a, rs_a0, cs_a0:
                   base, row stride and k-direction stride (in elements)
                   of A.
   - b, rs_b0:     base and k-direction stride (in elements) of B. cs_b0 is
                   passed as an operand but its loads are commented out.
   - c, rs_c0, cs_c0:
                   rs_c0 == 1 selects the column-stored path, which
                   transposes the accumulators in registers and stores along
                   columns separated by cs_c. Any other rs_c0 selects the
                   row-stored path, which accesses six contiguous doubles
                   per row and does not consult cs_c.
   - conja, conjb, m0, n0, data, cntx:
                   not referenced in this function body.
*/
void bli_dgemmsup_rv_haswell_asm_6x6
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    // k is split into a 4x-unrolled part (k_iter) and a remainder (k_left).
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;

    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0;
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    // Clearing every vector register also zeroes the twelve accumulators
    // (ymm4-ymm15).
    vzeroall() // zero all xmm/ymm registers.

    mov(var(a), rax) // load address of a.
    mov(var(rs_a), r8) // load rs_a
    mov(var(cs_a), r9) // load cs_a
    lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

    // 3*rs_a and 5*rs_a are precomputed because they cannot be expressed
    // as an index scale; rows 1, 2 and 4 use r8 scaled by 1, 2 and 4.
    lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
    lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

    mov(var(b), rbx) // load address of b.
    mov(var(rs_b), r10) // load rs_b
    //mov(var(cs_b), r11) // load cs_b
    lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
    //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds allocated mem
    // (the likely result: a segmentation fault).

    mov(var(c), rcx) // load address of c
    mov(var(rs_c), rdi) // load rs_c
    lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

    // rdi keeps rs_c in bytes for the rest of the kernel; the same
    // (8*rs_c == 8) test is repeated later to pick the store path.
    cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
    jz(.DCOLPFETCH) // jump to column storage case
    label(.DROWPFETCH) // row-stored prefetching on c

    lea(mem(rcx, rdi, 2), rdx) //
    lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c
    prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c
    prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c
    prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c

    jmp(.DPOSTPFETCH) // jump to end of prefetching c
    label(.DCOLPFETCH) // column-stored prefetching c

    // rsi is only scratch here; it is reloaded below as the loop counter
    // and again after the loops as cs_c.
    mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
    lea(mem(rcx, rsi, 2), rdx) //
    lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c;
    prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(rcx, rsi, 2, 5*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c
    prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c
    prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c

    label(.DPOSTPFETCH) // done prefetching c

    // rdx is reused from here on as a look-ahead pointer into A that is
    // consumed only by prefetches inside the main loop.
#if 1
    lea(mem(rax, r9, 8), rdx) //
    lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a;
#endif

    mov(var(k_iter), rsi) // i = k_iter;
    test(rsi, rsi) // check i via logical AND.
    je(.DCONSIDKLEFT) // if i == 0, jump to code that
    // contains the k_left loop.

    label(.DLOOPKITER) // MAIN LOOP

    // Each of the four unrolled steps loads one row of B (ymm0: columns
    // 0-3, xmm1: columns 4-5), then broadcasts the six A elements two at a
    // time into ymm2/ymm3 and issues a rank-1 update into ymm4-ymm15.

    // ---------------------------------- iteration 0

#if 1
    prefetch(0, mem(rdx, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), xmm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vbroadcastsd(mem(rax, r8, 4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm12)
    vfmadd231pd(ymm1, ymm2, ymm13)
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    // ---------------------------------- iteration 1

#if 0
    prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), xmm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vbroadcastsd(mem(rax, r8, 4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm12)
    vfmadd231pd(ymm1, ymm2, ymm13)
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    // ---------------------------------- iteration 2

#if 1
    prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), xmm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vbroadcastsd(mem(rax, r8, 4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm12)
    vfmadd231pd(ymm1, ymm2, ymm13)
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    // ---------------------------------- iteration 3

#if 1
    lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a;
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), xmm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vbroadcastsd(mem(rax, r8, 4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm12)
    vfmadd231pd(ymm1, ymm2, ymm13)
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    dec(rsi) // i -= 1;
    jne(.DLOOPKITER) // iterate again if i != 0.

    label(.DCONSIDKLEFT)

    mov(var(k_left), rsi) // i = k_left;
    test(rsi, rsi) // check i via logical AND.
    je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
    // else, we prepare to enter k_left loop.

    label(.DLOOPKLEFT) // EDGE LOOP

    // Same single-step rank-1 update as in the main loop, run k_left times.

#if 0
    prefetch(0, mem(rdx, 5*8))
    add(r9, rdx)
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), xmm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vbroadcastsd(mem(rax, r8, 4), ymm2)
    vbroadcastsd(mem(rax, r15, 1), ymm3)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm12)
    vfmadd231pd(ymm1, ymm2, ymm13)
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    dec(rsi) // i -= 1;
    jne(.DLOOPKLEFT) // iterate again if i != 0.

    label(.DPOSTACCUM)

    // rax and rbx are free again: the a/b pointers are no longer needed.
    // rbx keeps the address of beta so it can be re-broadcast later.
    mov(var(alpha), rax) // load address of alpha
    mov(var(beta), rbx) // load address of beta
    vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

    // The odd-numbered accumulators hold only two columns, so they are
    // scaled with 128-bit operations.
    vmulpd(ymm0, ymm4, ymm4) // scale by alpha
    vmulpd(xmm0, xmm5, xmm5)
    vmulpd(ymm0, ymm6, ymm6)
    vmulpd(xmm0, xmm7, xmm7)
    vmulpd(ymm0, ymm8, ymm8)
    vmulpd(xmm0, xmm9, xmm9)
    vmulpd(ymm0, ymm10, ymm10)
    vmulpd(xmm0, xmm11, xmm11)
    vmulpd(ymm0, ymm12, ymm12)
    vmulpd(xmm0, xmm13, xmm13)
    vmulpd(ymm0, ymm14, ymm14)
    vmulpd(xmm0, xmm15, xmm15)

    mov(var(cs_c), rsi) // load cs_c
    lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)

    // rdx is only dereferenced on the column-stored paths, where
    // rs_c == 1 and it therefore addresses rows 4-5 of column 0.
    //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c;
    lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c;

    lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;

    // now avoid loading C if beta == 0

    // NOTE(review): vucomisd also sets ZF for an unordered compare, so a
    // NaN beta would presumably take the beta == 0 path too -- confirm
    // that this is intended.
    vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
    vucomisd(xmm0, xmm3) // set ZF if beta == 0.
    je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

    cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORED) // jump to column storage case

    label(.DROWSTORED)

    // For each of the six rows: acc += beta * C_row, then store; columns
    // 0-3 use a 256-bit access and columns 4-5 a 128-bit access. rcx
    // advances by rs_c between rows.
    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
    vmovupd(ymm4, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5)
    vmovupd(xmm5, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
    vmovupd(ymm6, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7)
    vmovupd(xmm7, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
    vmovupd(ymm8, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9)
    vmovupd(xmm9, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
    vmovupd(ymm10, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11)
    vmovupd(xmm11, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
    vmovupd(ymm12, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), xmm3, xmm13)
    vmovupd(xmm13, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14)
    vmovupd(ymm14, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), xmm3, xmm15)
    vmovupd(xmm15, mem(rcx, 1*32))
    //add(rdi, rcx)

    jmp(.DDONE) // jump to end.

    label(.DCOLSTORED)

    // begin I/O on columns 0-3

    // Transpose rows 0-3 of columns 0-3 in registers so that ymm4, ymm6,
    // ymm8 and ymm10 each hold four rows of one column.
    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vunpcklpd(ymm10, ymm8, ymm2)
    vunpckhpd(ymm10, ymm8, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

    // ymm3 was used as a temporary by the transpose above, so beta is
    // broadcast again from the address still held in rbx.
    vbroadcastsd(mem(rbx), ymm3)

    vfmadd231pd(mem(rcx ), ymm3, ymm4)
    vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
    vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
    vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
    vmovupd(ymm4, mem(rcx ))
    vmovupd(ymm6, mem(rcx, rsi, 1))
    vmovupd(ymm8, mem(rcx, rsi, 2))
    vmovupd(ymm10, mem(rcx, rax, 1))

    lea(mem(rcx, rsi, 4), rcx)

    // Rows 4-5 of columns 0-3: interleave the two rows into per-column
    // pairs and update them through rdx.
    vunpcklpd(ymm14, ymm12, ymm0)
    vunpckhpd(ymm14, ymm12, ymm1)
    vextractf128(imm(0x1), ymm0, xmm2)
    vextractf128(imm(0x1), ymm1, xmm4)

    vfmadd231pd(mem(rdx ), xmm3, xmm0)
    vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
    vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
    vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
    vmovupd(xmm0, mem(rdx ))
    vmovupd(xmm1, mem(rdx, rsi, 1))
    vmovupd(xmm2, mem(rdx, rsi, 2))
    vmovupd(xmm4, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx)

    // begin I/O on columns 4-5

    // Same scheme for the two remaining columns; rcx and rdx were both
    // advanced by 4*cs_c above.
    vunpcklpd(ymm7, ymm5, ymm0)
    vunpckhpd(ymm7, ymm5, ymm1)
    vunpcklpd(ymm11, ymm9, ymm2)
    vunpckhpd(ymm11, ymm9, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm7)

    // ymm3 was overwritten again by the transpose; reload beta.
    vbroadcastsd(mem(rbx), ymm3)

    vfmadd231pd(mem(rcx ), ymm3, ymm5)
    vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7)
    vmovupd(ymm5, mem(rcx ))
    vmovupd(ymm7, mem(rcx, rsi, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    vunpcklpd(ymm15, ymm13, ymm0)
    vunpckhpd(ymm15, ymm13, ymm1)

    vfmadd231pd(mem(rdx ), xmm3, xmm0)
    vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
    vmovupd(xmm0, mem(rdx ))
    vmovupd(xmm1, mem(rdx, rsi, 1))

    //lea(mem(rdx, rsi, 4), rdx)

    jmp(.DDONE) // jump to end.

    label(.DBETAZERO)

    // beta == 0 path: C is only written, never read.
    cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORBZ) // jump to column storage case

    label(.DROWSTORBZ)

    vmovupd(ymm4, mem(rcx, 0*32))
    vmovupd(xmm5, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovupd(ymm6, mem(rcx, 0*32))
    vmovupd(xmm7, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovupd(ymm8, mem(rcx, 0*32))
    vmovupd(xmm9, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovupd(ymm10, mem(rcx, 0*32))
    vmovupd(xmm11, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovupd(ymm12, mem(rcx, 0*32))
    vmovupd(xmm13, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovupd(ymm14, mem(rcx, 0*32))
    vmovupd(xmm15, mem(rcx, 1*32))
    //add(rdi, rcx)

    jmp(.DDONE) // jump to end.

    label(.DCOLSTORBZ)

    // begin I/O on columns 0-3

    // Same in-register transposes as the beta != 0 column-stored path,
    // followed by plain stores.
    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vunpcklpd(ymm10, ymm8, ymm2)
    vunpckhpd(ymm10, ymm8, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

    vmovupd(ymm4, mem(rcx ))
    vmovupd(ymm6, mem(rcx, rsi, 1))
    vmovupd(ymm8, mem(rcx, rsi, 2))
    vmovupd(ymm10, mem(rcx, rax, 1))

    lea(mem(rcx, rsi, 4), rcx)

    vunpcklpd(ymm14, ymm12, ymm0)
    vunpckhpd(ymm14, ymm12, ymm1)
    vextractf128(imm(0x1), ymm0, xmm2)
    vextractf128(imm(0x1), ymm1, xmm4)

    vmovupd(xmm0, mem(rdx ))
    vmovupd(xmm1, mem(rdx, rsi, 1))
    vmovupd(xmm2, mem(rdx, rsi, 2))
    vmovupd(xmm4, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx)

    // begin I/O on columns 4-5

    vunpcklpd(ymm7, ymm5, ymm0)
    vunpckhpd(ymm7, ymm5, ymm1)
    vunpcklpd(ymm11, ymm9, ymm2)
    vunpckhpd(ymm11, ymm9, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm7)

    vmovupd(ymm5, mem(rcx))
    vmovupd(ymm7, mem(rcx, rsi, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    vunpcklpd(ymm15, ymm13, ymm0)
    vunpckhpd(ymm15, ymm13, ymm1)

    vmovupd(xmm0, mem(rdx))
    vmovupd(xmm1, mem(rdx, rsi, 1))

    //lea(mem(rdx, rsi, 4), rdx)

    label(.DDONE)

    // All operands are passed as memory operands and loaded explicitly
    // with mov(var(...)); every general-purpose and vector register the
    // kernel may touch is listed as clobbered.
    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

void bli_dgemmsup_rv_haswell_asm_5x6
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;

    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0;
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    vzeroall() // zero all xmm/ymm registers.

    mov(var(a), rax) // load address of a.
    mov(var(rs_a), r8) // load rs_a
    mov(var(cs_a), r9) // load cs_a
    lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

    lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

    mov(var(b), rbx) // load address of b.
    mov(var(rs_b), r10) // load rs_b
    //mov(var(cs_b), r11) // load cs_b
    lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
    //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds allocated mem
    // (the likely result: a segmentation fault).
mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 4*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 5*cs_c label(.DPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 
4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(xmm0, xmm7, xmm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(xmm0, xmm9, xmm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(xmm0, xmm11, xmm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(xmm0, xmm13, xmm13) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm13) vmovupd(xmm13, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORED) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vmovlpd(mem(rdx ), xmm0, xmm0) vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(ymm12, ymm3, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx )) vmovhpd(xmm0, mem(rdx, rsi, 1)) vmovlpd(xmm1, mem(rdx, rsi, 2)) vmovhpd(xmm1, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-5 vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), ymm3, ymm5) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) vmovupd(ymm5, mem(rcx )) vmovupd(ymm7, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) vmovlpd(mem(rdx ), xmm0, xmm0) vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) vfmadd213pd(xmm13, xmm3, xmm0) vmovlpd(xmm0, mem(rdx )) vmovhpd(xmm0, mem(rdx, rsi, 1)) //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(xmm13, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vmovupd(ymm12, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx )) vmovhpd(xmm0, mem(rdx, rsi, 1)) vmovlpd(xmm1, mem(rdx, rsi, 2)) vmovhpd(xmm1, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-5 vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vmovupd(ymm5, mem(rcx )) vmovupd(ymm7, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) vmovupd(ymm13, ymm0) vmovlpd(xmm0, mem(rdx )) vmovhpd(xmm0, mem(rdx, rsi, 1)) //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", 
"r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

// bli_dgemmsup_rv_haswell_asm_4x6
//
// Double-precision micro-kernel that updates a 4x6 tile of C in place:
//
//     C := beta * C + alpha * A * B
//
// - k0 is the inner (reduction) dimension. It is split into k_iter = k0 / 4
//   passes of a 4x-unrolled main loop plus k_left = k0 % 4 passes of an
//   edge loop.
// - a is addressed with both strides (rs_a0, cs_a0): one element from each
//   of 4 rows is broadcast per k step, then a advances by cs_a.
// - b is read as 6 contiguous doubles per k step (4 into ymm0, 2 into
//   xmm1), then advances by rs_b. cs_b is passed to the asm as an operand
//   but never read (its loads are commented out), so unit column stride of
//   B is presumably guaranteed by the caller -- TODO confirm.
// - c is treated as column-stored when rs_c0 == 1 and as row-stored
//   otherwise. The row-stored path accesses 6 contiguous doubles per row,
//   so it presumably expects cs_c0 == 1 -- TODO confirm.
// - When *beta == 0.0, C is only written, never read.
// - conja, conjb, m0, n0, data and cntx are not referenced by this body.
//
// Accumulator layout (row r of the tile; columns 0-3 / columns 4-5):
//     row 0: ymm4 / ymm5     row 1: ymm6  / ymm7
//     row 2: ymm8 / ymm9     row 3: ymm10 / ymm11
void bli_dgemmsup_rv_haswell_asm_4x6
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;
    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0;
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    vzeroall()                         // zero all xmm/ymm registers.

    mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
    mov(var(rs_b), r10)                // load rs_b
    //mov(var(cs_b), r11)                // load cs_b
    lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    //lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds allocated mem
    // (the likely result: a segmentation fault).

    mov(var(c), rcx)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    // Prefetch the C tile; which addresses to touch depends on its storage.
    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLPFETCH)                    // jump to column storage case
    label(.DROWPFETCH)                 // row-stored prefetching on c

    lea(mem(rcx, rdi, 2), rdx)         //
    lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c
    prefetch(0, mem(rdx, 5*8))         // prefetch c + 3*rs_c

    jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
    label(.DCOLPFETCH)                 // column-stored prefetching c

    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
    lea(mem(rcx, rsi, 2), rdx)         //
    lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(rdx, 3*8))         // prefetch c + 3*cs_c
    prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 4*cs_c
    prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 5*cs_c

    label(.DPOSTPFETCH)                // done prefetching c

    // From here on rdx is reused as a prefetch pointer that runs ahead of
    // the a pointer in rax.
#if 1
    lea(mem(rax, r9, 8), rdx)          //
    lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

    mov(var(k_iter), rsi)              // i = k_iter;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
                                       // contains the k_left loop.

    label(.DLOOPKITER)                 // MAIN LOOP

    // Each iteration below performs one rank-1 update of the 4x6 tile:
    // ymm0/xmm1 hold the 6 elements of the current row of B, and one
    // element of A per tile row is broadcast into ymm2/ymm3.

    // ---------------------------------- iteration 0

#if 1
    prefetch(0, mem(rdx, 4*8))
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), xmm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    // ---------------------------------- iteration 1

#if 0
    prefetch(0, mem(rdx, r9, 1, 4*8))
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), xmm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    // ---------------------------------- iteration 2

#if 1
    prefetch(0, mem(rdx, r9, 2, 4*8))
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), xmm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    // ---------------------------------- iteration 3

#if 1
    lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), xmm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER)                   // iterate again if i != 0.

    label(.DCONSIDKLEFT)

    mov(var(k_left), rsi)              // i = k_left;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left loop.

    label(.DLOOPKLEFT)                 // EDGE LOOP

#if 0
    prefetch(0, mem(rdx, 5*8))
    add(r9, rdx)
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), xmm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    vbroadcastsd(mem(rax, r13, 1), ymm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT)                   // iterate again if i != 0.

    label(.DPOSTACCUM)

    // rax and rbx are done as a/b pointers; reuse them for alpha and beta.
    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
    vmulpd(xmm0, xmm5, xmm5)
    vmulpd(ymm0, ymm6, ymm6)
    vmulpd(xmm0, xmm7, xmm7)
    vmulpd(ymm0, ymm8, ymm8)
    vmulpd(xmm0, xmm9, xmm9)
    vmulpd(ymm0, ymm10, ymm10)
    vmulpd(xmm0, xmm11, xmm11)

    mov(var(cs_c), rsi)                // load cs_c
    lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

    //lea(mem(rcx, rsi, 4), rdx)         // load address of c + 4*cs_c;
    //lea(mem(rcx, rdi, 4), rdx)         // load address of c + 4*rs_c;

    lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

    // now avoid loading C if beta == 0

    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
    je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORED)                    // jump to column storage case

    label(.DROWSTORED)

    // Row-stored C, beta != 0: for each of the 4 rows, accumulate beta * C
    // into the alpha-scaled result, store 4 + 2 doubles, then step rcx by
    // rs_c to the next row.
    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
    vmovupd(ymm4, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5)
    vmovupd(xmm5, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
    vmovupd(ymm6, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7)
    vmovupd(xmm7, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
    vmovupd(ymm8, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9)
    vmovupd(xmm9, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
    vmovupd(ymm10, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11)
    vmovupd(xmm11, mem(rcx, 1*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DCOLSTORED)

    // Column-stored C, beta != 0: the accumulators hold rows of the tile,
    // so they are shuffled (unpck + insertf128/perm2f128) until each
    // register holds one 4-element column, which is then updated at
    // stride cs_c (rsi; rax = 3*cs_c).

    // begin I/O on columns 0-3
    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vunpcklpd(ymm10, ymm8, ymm2)
    vunpckhpd(ymm10, ymm8, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

    // ymm3 was used as shuffle scratch above, so beta is reloaded.
    vbroadcastsd(mem(rbx), ymm3)

    vfmadd231pd(mem(rcx ), ymm3, ymm4)
    vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
    vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
    vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
    vmovupd(ymm4, mem(rcx ))
    vmovupd(ymm6, mem(rcx, rsi, 1))
    vmovupd(ymm8, mem(rcx, rsi, 2))
    vmovupd(ymm10, mem(rcx, rax, 1))

    lea(mem(rcx, rsi, 4), rcx)

    // begin I/O on columns 4-5
    vunpcklpd(ymm7, ymm5, ymm0)
    vunpckhpd(ymm7, ymm5, ymm1)
    vunpcklpd(ymm11, ymm9, ymm2)
    vunpckhpd(ymm11, ymm9, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm7)

    // ymm3 was clobbered again by the shuffle; reload beta.
    vbroadcastsd(mem(rbx), ymm3)

    vfmadd231pd(mem(rcx ), ymm3, ymm5)
    vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7)
    vmovupd(ymm5, mem(rcx ))
    vmovupd(ymm7, mem(rcx, rsi, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DBETAZERO)

    // beta == 0: same two storage cases as above, but C is only written.
    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORBZ)                    // jump to column storage case

    label(.DROWSTORBZ)

    vmovupd(ymm4, mem(rcx, 0*32))
    vmovupd(xmm5, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovupd(ymm6, mem(rcx, 0*32))
    vmovupd(xmm7, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovupd(ymm8, mem(rcx, 0*32))
    vmovupd(xmm9, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovupd(ymm10, mem(rcx, 0*32))
    vmovupd(xmm11, mem(rcx, 1*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DCOLSTORBZ)

    // begin I/O on columns 0-3
    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vunpcklpd(ymm10, ymm8, ymm2)
    vunpckhpd(ymm10, ymm8, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

    vmovupd(ymm4, mem(rcx ))
    vmovupd(ymm6, mem(rcx, rsi, 1))
    vmovupd(ymm8, mem(rcx, rsi, 2))
    vmovupd(ymm10, mem(rcx, rax, 1))

    lea(mem(rcx, rsi, 4), rcx)

    // begin I/O on columns 4-5
    vunpcklpd(ymm7, ymm5, ymm0)
    vunpckhpd(ymm7, ymm5, ymm1)
    vunpcklpd(ymm11, ymm9, ymm2)
    vunpckhpd(ymm11, ymm9, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm7)

    vmovupd(ymm5, mem(rcx ))
    vmovupd(ymm7, mem(rcx, rsi, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    label(.DDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

// bli_dgemmsup_rv_haswell_asm_3x6
//
// Double-precision micro-kernel for a 3x6 tile of C; takes the same
// parameter list as the 4x6 kernel above. conja, conjb, m0, n0, data and
// cntx are not referenced by its body.
void bli_dgemmsup_rv_haswell_asm_3x6
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;
    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0;
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    // 3x6 tile update, C := beta * C + alpha * A * B.
    //
    // Accumulator layout (row r of the tile; columns 0-3 / columns 4-5):
    //     row 0: ymm4 / ymm5    row 1: ymm6 / ymm7    row 2: ymm8 / ymm9
    // ymm10 and ymm11 are never accumulated into; they stay zero from
    // vzeroall() and serve as the empty fourth row in the column-stored
    // shuffles below.
    //
    // The main loop is unrolled 4x (k_iter passes) and followed by an edge
    // loop (k_left passes). cs_b is passed as an operand but never read by
    // the asm, so B is presumably expected to have unit column stride --
    // TODO confirm with caller.

    begin_asm()

    vzeroall()                         // zero all xmm/ymm registers.

    mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    //lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
    mov(var(rs_b), r10)                // load rs_b
    //mov(var(cs_b), r11)                // load cs_b
    lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    //lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds allocated mem
    // (the likely result: a segmentation fault).

    mov(var(c), rcx)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    // Prefetch the C tile; which addresses to touch depends on its storage.
    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLPFETCH)                    // jump to column storage case
    label(.DROWPFETCH)                 // row-stored prefetching on c

    //lea(mem(rcx, rdi, 2), rdx)         //
    //lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c

    jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
    label(.DCOLPFETCH)                 // column-stored prefetching c

    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
    lea(mem(rcx, rsi, 2), rdx)         //
    lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
    prefetch(0, mem(rcx, 2*8))         // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(rdx, 2*8))         // prefetch c + 3*cs_c
    prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 4*cs_c
    prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 5*cs_c

    label(.DPOSTPFETCH)                // done prefetching c

    // From here on rdx is reused as a prefetch pointer that runs ahead of
    // the a pointer in rax.
#if 1
    lea(mem(rax, r9, 8), rdx)          //
    lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

    mov(var(k_iter), rsi)              // i = k_iter;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
                                       // contains the k_left loop.

    label(.DLOOPKITER)                 // MAIN LOOP

    // Each iteration below performs one rank-1 update of the 3x6 tile:
    // ymm0/xmm1 hold the 6 elements of the current row of B, and one
    // element of A per tile row is broadcast into ymm2/ymm3.

    // ---------------------------------- iteration 0

#if 1
    prefetch(0, mem(rdx, 4*8))
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), xmm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)

    // ---------------------------------- iteration 1

#if 0
    prefetch(0, mem(rdx, r9, 1, 4*8))
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), xmm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)

    // ---------------------------------- iteration 2

#if 1
    prefetch(0, mem(rdx, r9, 2, 4*8))
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), xmm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)

    // ---------------------------------- iteration 3

#if 1
    lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), xmm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER)                   // iterate again if i != 0.

    label(.DCONSIDKLEFT)

    mov(var(k_left), rsi)              // i = k_left;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left loop.

    label(.DLOOPKLEFT)                 // EDGE LOOP

#if 0
    prefetch(0, mem(rdx, 5*8))
    add(r9, rdx)
#endif

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), xmm1)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT)                   // iterate again if i != 0.

    label(.DPOSTACCUM)

    // rax and rbx are done as a/b pointers; reuse them for alpha and beta.
    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
    vmulpd(xmm0, xmm5, xmm5)
    vmulpd(ymm0, ymm6, ymm6)
    vmulpd(xmm0, xmm7, xmm7)
    vmulpd(ymm0, ymm8, ymm8)
    vmulpd(xmm0, xmm9, xmm9)

    mov(var(cs_c), rsi)                // load cs_c
    lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

    //lea(mem(rcx, rsi, 4), rdx)         // load address of c + 4*cs_c;
    // rdx addresses the third row of the tile; only the column-stored paths
    // use it, to read/write the row-2 element of each column.
    lea(mem(rcx, rdi, 2), rdx)         // load address of c + 2*rs_c;

    lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

    // now avoid loading C if beta == 0

    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
    je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORED)                    // jump to column storage case

    label(.DROWSTORED)

    // Row-stored C, beta != 0: for each of the 3 rows, accumulate beta * C
    // into the alpha-scaled result and store 4 + 2 doubles.
    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
    vmovupd(ymm4, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5)
    vmovupd(xmm5, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
    vmovupd(ymm6, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7)
    vmovupd(xmm7, mem(rcx, 1*32))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
    vmovupd(ymm8, mem(rcx, 0*32))

    vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9)
    vmovupd(xmm9, mem(rcx, 1*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DCOLSTORED)

    // Column-stored C, beta != 0: shuffle the row accumulators so that each
    // register holds one column. Each column has 3 elements, handled as a
    // 2-element xmm access at rcx (rows 0-1) plus a scalar access at rdx
    // (row 2), whose value is first extracted from the upper 128-bit lane.

    // begin I/O on columns 0-3
    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vunpcklpd(ymm10, ymm8, ymm2)
    vunpckhpd(ymm10, ymm8, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

    vextractf128(imm(0x1), ymm4, xmm12)
    vextractf128(imm(0x1), ymm6, xmm13)
    vextractf128(imm(0x1), ymm8, xmm14)
    vextractf128(imm(0x1), ymm10, xmm15)

    // ymm3 was used as shuffle scratch above, so beta is reloaded.
    vbroadcastsd(mem(rbx), ymm3)

    vfmadd231pd(mem(rcx ), xmm3, xmm4)
    vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6)
    vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm8)
    vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm10)
    vmovupd(xmm4, mem(rcx ))
    vmovupd(xmm6, mem(rcx, rsi, 1))
    vmovupd(xmm8, mem(rcx, rsi, 2))
    vmovupd(xmm10, mem(rcx, rax, 1))

    lea(mem(rcx, rsi, 4), rcx)

    vfmadd231sd(mem(rdx ), xmm3, xmm12)
    vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
    vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14)
    vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15)
    vmovsd(xmm12, mem(rdx ))
    vmovsd(xmm13, mem(rdx, rsi, 1))
    vmovsd(xmm14, mem(rdx, rsi, 2))
    vmovsd(xmm15, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx)

    // begin I/O on columns 4-5
    vunpcklpd(ymm7, ymm5, ymm0)
    vunpckhpd(ymm7, ymm5, ymm1)
    vunpcklpd(ymm11, ymm9, ymm2)
    vunpckhpd(ymm11, ymm9, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
    //vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
    //vperm2f128(imm(0x31), ymm3, ymm1, ymm11)

    vextractf128(imm(0x1), ymm5, xmm12)
    vextractf128(imm(0x1), ymm7, xmm13)

    // ymm3 was clobbered again by the shuffle; reload beta.
    vbroadcastsd(mem(rbx), ymm3)

    vfmadd231pd(mem(rcx ), xmm3, xmm5)
    vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm7)
    //vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm9)
    //vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm11)
    vmovupd(xmm5, mem(rcx ))
    vmovupd(xmm7, mem(rcx, rsi, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    vfmadd231sd(mem(rdx ), xmm3, xmm12)
    vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
    vmovsd(xmm12, mem(rdx ))
    vmovsd(xmm13, mem(rdx, rsi, 1))

    //lea(mem(rdx, rsi, 4), rdx)

    jmp(.DDONE)                        // jump to end.

    label(.DBETAZERO)

    // beta == 0: same two storage cases as above, but C is only written.
    cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORBZ)                    // jump to column storage case

    label(.DROWSTORBZ)

    vmovupd(ymm4, mem(rcx, 0*32))
    vmovupd(xmm5, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovupd(ymm6, mem(rcx, 0*32))
    vmovupd(xmm7, mem(rcx, 1*32))
    add(rdi, rcx)

    vmovupd(ymm8, mem(rcx, 0*32))
    vmovupd(xmm9, mem(rcx, 1*32))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DCOLSTORBZ)

    // begin I/O on columns 0-3
    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vunpcklpd(ymm10, ymm8, ymm2)
    vunpckhpd(ymm10, ymm8, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

    vextractf128(imm(0x1), ymm4, xmm12)
    vextractf128(imm(0x1), ymm6, xmm13)
    vextractf128(imm(0x1), ymm8, xmm14)
    vextractf128(imm(0x1), ymm10, xmm15)

    vmovupd(xmm4, mem(rcx ))
    vmovupd(xmm6, mem(rcx, rsi, 1))
    vmovupd(xmm8, mem(rcx, rsi, 2))
    vmovupd(xmm10, mem(rcx, rax, 1))

    lea(mem(rcx, rsi, 4), rcx)

    vmovsd(xmm12, mem(rdx ))
    vmovsd(xmm13, mem(rdx, rsi, 1))
    vmovsd(xmm14, mem(rdx, rsi, 2))
    vmovsd(xmm15, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx)

    // begin I/O on columns 4-5
    vunpcklpd(ymm7, ymm5, ymm0)
    vunpckhpd(ymm7, ymm5, ymm1)
    vunpcklpd(ymm11, ymm9, ymm2)
    vunpckhpd(ymm11, ymm9, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm7)

    vextractf128(imm(0x1), ymm5, xmm12)
    vextractf128(imm(0x1), ymm7, xmm13)

    vmovupd(xmm5, mem(rcx ))
    vmovupd(xmm7, mem(rcx, rsi, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    vmovsd(xmm12, mem(rdx ))
    vmovsd(xmm13, mem(rdx, rsi, 1))

    //lea(mem(rdx, rsi, 4), rdx)

    label(.DDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

// bli_dgemmsup_rv_haswell_asm_2x6
//
// Double-precision micro-kernel for a 2x6 tile of C,
// C := beta * C + alpha * A * B. conja, conjb, m0, n0, data and cntx are
// not referenced by its body.
void bli_dgemmsup_rv_haswell_asm_2x6
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;
    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0;
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    vzeroall()                         // zero all xmm/ymm registers.

    mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    //lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 5*cs_c label(.DPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(xmm0, xmm7, xmm7) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) vmovupd(xmm7, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORED) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), xmm3, xmm4) vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6) vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm8) vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm10) vmovupd(xmm4, mem(rcx )) vmovupd(xmm6, mem(rcx, rsi, 1)) vmovupd(xmm8, mem(rcx, rsi, 2)) vmovupd(xmm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // begin I/O on columns 4-5 vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), xmm3, xmm5) vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm7) vmovupd(xmm5, mem(rcx )) vmovupd(xmm7, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(xmm4, mem(rcx )) vmovupd(xmm6, mem(rcx, rsi, 1)) vmovupd(xmm8, mem(rcx, rsi, 2)) vmovupd(xmm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // begin I/O on columns 4-5 vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vmovupd(xmm5, mem(rcx )) vmovupd(xmm7, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) }

// Double-precision 1x6 gemmsup microkernel written with the BLIS x86 inline
// assembly macros.
//
// Accumulates a single 1x6 row of alpha*(A*B) in ymm4 (columns 0-3) and the
// low half of ymm5 (columns 4-5), then writes beta*C + alpha*(A*B) to c. When
// beta compares equal to zero, C is only written, never read.
//
// Parameters:
//   conja, conjb - not read by this kernel body.
//   m0, n0       - not read by this kernel body; the tile size is fixed at 1x6.
//   k0           - inner dimension; processed as k0/4 four-way unrolled
//                  iterations followed by k0%4 single iterations.
//   alpha, beta  - addresses of the two scalars.
//   a            - address of A; advanced by cs_a0 elements per k step.
//                  rs_a0 is loaded but never used for addressing here.
//   b            - address of B; six consecutive doubles are read per k step
//                  and the pointer is advanced by rs_b0 elements. cs_b0 is
//                  only passed as an operand and is not used for addressing.
//   c            - address of C. If rs_c0 == 1 the column-oriented paths are
//                  taken (six elements spaced cs_c0 apart); otherwise six
//                  consecutive doubles at c are updated.
//   data, cntx   - not read by this kernel body.
void bli_dgemmsup_rv_haswell_asm_1x6
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13)         // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)         // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)              // load cs_b
	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11)          // cs_b *= sizeof(double)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds of allocated
	// memory (the likely result: a segmentation
	// fault).

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

	// Choose the prefetch pattern for C: rs_c == 1 means the six
	// elements of the row are spaced cs_c apart.
	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLPFETCH)                    // jump to column storage case
	label(.DROWPFETCH)                 // row-stored prefetching on c

	//lea(mem(rcx, rdi, 2), rdx)       //
	//lea(mem(rdx, rdi, 1), rdx)       // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*rs_c

	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
	label(.DCOLPFETCH)                 // column-stored prefetching c

	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
	lea(mem(rcx, rsi, 2), rdx)         // rdx = c + 2*cs_c;
	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
	prefetch(0, mem(rcx, 0*8))         // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rdx, 0*8))         // prefetch c + 3*cs_c
	prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 4*cs_c
	prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 5*cs_c

	label(.DPOSTPFETCH)                // done prefetching c

	// rdx is reused from here on as a prefetch pointer running ahead of a.
#if 1
	lea(mem(rax, r9, 8), rdx)          // rdx = a + 8*cs_a;
	lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.

	label(.DLOOPKITER)                 // MAIN LOOP

	// Each iteration below loads b[0:4] into ymm0 and b[4:6] into xmm1,
	// broadcasts one element of a into ymm2, and accumulates into
	// ymm4/ymm5.

	// ---------------------------------- iteration 0

#if 1
	prefetch(0, mem(rdx, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), xmm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)

	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, r9, 1, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), xmm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)

	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(rdx, r9, 2, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), xmm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)

	// ---------------------------------- iteration 3

#if 1
	lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), xmm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER)                   // iterate again if i != 0.

	label(.DCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.

	label(.DLOOPKLEFT)                 // EDGE LOOP

#if 0
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), xmm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKLEFT)                   // iterate again if i != 0.

	label(.DPOSTACCUM)

	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
	vmulpd(xmm0, xmm5, xmm5)

	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

	//lea(mem(rcx, rsi, 4), rdx)       // load address of c + 4*cs_c;
	//lea(mem(rcx, rdi, 4), rdx)       // load address of c + 4*rs_c;

	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORED)                    // jump to column storage case

	label(.DROWSTORED)

	// c[0:4] = beta*c[0:4] + ymm4; c[4:6] = beta*c[4:6] + xmm5.
	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5)
	vmovupd(xmm5, mem(rcx, 1*32))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DCOLSTORED)

	// begin I/O on columns 0-3
	// Gather the four strided elements of C into ymm0, compute
	// beta*C + ymm4, and scatter the result back element by element.
	vmovlpd(mem(rcx), xmm0, xmm0)
	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
	vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1)
	vmovhpd(mem(rcx, rax, 1), xmm1, xmm1)
	vperm2f128(imm(0x20), ymm1, ymm0, ymm0)

	vfmadd213pd(ymm4, ymm3, ymm0)

	vextractf128(imm(1), ymm0, xmm1)
	vmovlpd(xmm0, mem(rcx))
	vmovhpd(xmm0, mem(rcx, rsi, 1))
	vmovlpd(xmm1, mem(rcx, rsi, 2))
	vmovhpd(xmm1, mem(rcx, rax, 1))

	lea(mem(rcx, rsi, 4), rcx)         // c += 4*cs_c;

	// begin I/O on columns 4-5
	vmovlpd(mem(rcx), xmm0, xmm0)
	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)

	vfmadd213pd(xmm5, xmm3, xmm0)

	vmovlpd(xmm0, mem(rcx))
	vmovhpd(xmm0, mem(rcx, rsi, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DBETAZERO)

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORBZ)                    // jump to column storage case

	label(.DROWSTORBZ)

	// beta == 0: store the scaled accumulators without reading C.
	vmovupd(ymm4, mem(rcx, 0*32))
	vmovupd(xmm5, mem(rcx, 1*32))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DCOLSTORBZ)

	// begin I/O on columns 0-3
	vmovupd(ymm4, ymm0)

	vextractf128(imm(1), ymm0, xmm1)
	vmovlpd(xmm0, mem(rcx))
	vmovhpd(xmm0, mem(rcx, rsi, 1))
	vmovlpd(xmm1, mem(rcx, rsi, 2))
	vmovhpd(xmm1, mem(rcx, rax, 1))

	lea(mem(rcx, rsi, 4), rcx)         // c += 4*cs_c;

	// begin I/O on columns 4-5
	vmovupd(xmm5, xmm0)

	vmovlpd(xmm0, mem(rcx))
	vmovhpd(xmm0, mem(rcx, rsi, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}
cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c000066400000000000000000002621671427272030600311550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... 
-------- | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */

// Prototype reference microkernels.
GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref )

// Define parameters and variables for edge case kernel map.
#define NUM_MR 4
#define NUM_NR 4

#define FUNCPTR_T dgemmsup_ker_ft

// Row (mrs) and column (nrs) tile sizes, largest first. kmap[i][j] is the
// kernel that handles an mrs[i] x nrs[j] tile; the single-column entries use
// the reference kernels.
static dim_t mrs[NUM_MR] = { 6, 4, 2, 1 };
static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 };
static FUNCPTR_T kmap[NUM_MR][NUM_NR] =
{        /* 8 4 2 1 */
/* 6 */ { bli_dgemmsup_rv_haswell_asm_6x8, bli_dgemmsup_rv_haswell_asm_6x4, bli_dgemmsup_rv_haswell_asm_6x2, bli_dgemmsup_r_haswell_ref_6x1 },
/* 4 */ { bli_dgemmsup_rv_haswell_asm_4x8, bli_dgemmsup_rv_haswell_asm_4x4, bli_dgemmsup_rv_haswell_asm_4x2, bli_dgemmsup_r_haswell_ref_4x1 },
/* 2 */ { bli_dgemmsup_rv_haswell_asm_2x8, bli_dgemmsup_rv_haswell_asm_2x4, bli_dgemmsup_rv_haswell_asm_2x2, bli_dgemmsup_r_haswell_ref_2x1 },
/* 1 */ { bli_dgemmsup_rv_haswell_asm_1x8, bli_dgemmsup_rv_haswell_asm_1x4, bli_dgemmsup_rv_haswell_asm_1x2, bli_dgemmsup_r_haswell_ref_1x1 },
};

// Double-precision 6x8 gemmsup microkernel written with the BLIS x86 inline
// assembly macros.
//
// If m0 < 6 or n0 < 8, the m0 x n0 problem is instead covered by calling
// smaller kernels looked up in kmap, and the function returns. Otherwise a
// full 6x8 tile of alpha*(A*B) is accumulated in ymm4-ymm15 (row r uses
// ymm(4+2r) for columns 0-3 and ymm(5+2r) for columns 4-7) and
// beta*C + alpha*(A*B) is written to c. When beta compares equal to zero,
// C is only written, never read.
//
// Parameters:
//   conja, conjb - only forwarded to the edge-case kernels.
//   m0, n0       - tile dimensions; only compared against 6 and 8 and used
//                  to drive the edge-case dispatch.
//   k0           - inner dimension; processed as k0/4 four-way unrolled
//                  iterations followed by k0%4 single iterations.
//   alpha, beta  - addresses of the two scalars.
//   a            - address of A; rows are rs_a0 elements apart and the
//                  pointer is advanced by cs_a0 elements per k step.
//   b            - address of B; eight consecutive doubles are read per k
//                  step and the pointer is advanced by rs_b0 elements. In
//                  the asm path cs_b0 is not used for addressing.
//   c            - address of C. If rs_c0 == 1 the column-oriented paths
//                  (in-register transpose, columns cs_c0 apart) are taken;
//                  otherwise each row is written as eight consecutive
//                  doubles, rows rs_c0 apart.
//   data, cntx   - only forwarded to the edge-case kernels.
void bli_dgemmsup_rv_haswell_asm_6x8
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
#if 0
	bli_dgemmsup_r_haswell_ref
	(
	  conja, conjb, m0, n0, k0,
	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
	  beta, c, rs_c0, cs_c0, data, cntx
	);
	return;
#endif

	// If this is an edge case in the m or n dimensions, cover the m0 x n0
	// problem with smaller kernels selected from kmap instead of running
	// the 6x8 assembly below.
	if ( m0 < 6 || n0 < 8 )
	{
#if 0
		bli_dgemmsup_r_haswell_ref
		(
		  conja, conjb, m0, n0, k0,
		  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
		  beta, c, rs_c0, cs_c0, data, cntx
		);
		return;
#endif
		dim_t n_left = n0;
		double* restrict cj = c;
		double* restrict bj = b;

		// Iterate across columns (corresponding to elements of nrs) until
		// n_left is zero.
		// NOTE(review): j is not checked against NUM_NR (nor i against
		// NUM_MR below); the walk stays inside nrs/mrs/kmap only when the
		// remaining size can be consumed in a single pass over the table.
		// This relies on m0 and n0 being at most MR and NR -- confirm
		// against callers.
		for ( dim_t j = 0; n_left != 0; ++j )
		{
			const dim_t nr_cur = nrs[ j ];

			// Once we find the value of nrs that is less than (or equal to)
			// n_left, we use the kernels in that column.
			if ( nr_cur <= n_left )
			{
				dim_t m_left = m0;
				double* restrict cij = cj;
				double* restrict ai = a;

				// Iterate down the current column (corresponding to elements
				// of mrs) until m_left is zero.
				for ( dim_t i = 0; m_left != 0; ++i )
				{
					const dim_t mr_cur = mrs[ i ];

					// Once we find the value of mrs that is less than (or equal
					// to) m_left, we select that kernel.
					if ( mr_cur <= m_left )
					{
						FUNCPTR_T ker_fp = kmap[i][j];

						//printf( "executing %d x %d sup kernel.\n", (int)mr_cur, (int)nr_cur );

						// Call the kernel using current mrs and nrs values.
						ker_fp
						(
						  conja, conjb, mr_cur, nr_cur, k0,
						  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
						  beta, cij, rs_c0, cs_c0, data, cntx
						);

						// Advance C and A pointers by the mrs and nrs we just
						// used, and decrement m_left.
						cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
					}
				}

				// Advance C and B pointers by the mrs and nrs we just used, and
				// decrement n_left.
				cj += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
			}
		}

		return;
	}

	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)              // load cs_b
	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11)          // cs_b *= sizeof(double)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds of allocated
	// memory (the likely result: a segmentation
	// fault).

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

	// Choose the prefetch pattern for C: rs_c == 1 selects the
	// column-stored pattern (eight columns), otherwise six rows.
	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLPFETCH)                    // jump to column storage case
	label(.DROWPFETCH)                 // row-stored prefetching on c

	lea(mem(rcx, rdi, 2), rdx)         // rdx = c + 2*rs_c;
	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c

	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
	label(.DCOLPFETCH)                 // column-stored prefetching c

	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
	lea(mem(rcx, rsi, 2), rdx)         // rdx = c + 2*cs_c;
	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
	prefetch(0, mem(rcx, 4*8))         // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rdx, 4*8))         // prefetch c + 3*cs_c
	prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 4*cs_c
	prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 5*cs_c
	lea(mem(rdx, rsi, 2), rdx)         // rdx = c + 5*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 6*cs_c
	prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 7*cs_c

	label(.DPOSTPFETCH)                // done prefetching c

	// rdx is reused from here on as a prefetch pointer running ahead of a.
#if 1
	lea(mem(rax, r9, 8), rdx)          // rdx = a + 8*cs_a;
	lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.

	label(.DLOOPKITER)                 // MAIN LOOP

	// Each iteration below loads b[0:4] into ymm0 and b[4:8] into ymm1,
	// then for each of the six rows of a broadcasts one element (into
	// ymm2 or ymm3) and accumulates into that row's pair of registers.

	// ---------------------------------- iteration 0

#if 1
	prefetch(0, mem(rdx, 5*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vbroadcastsd(mem(rax, r8, 4), ymm2)
	vbroadcastsd(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm12)
	vfmadd231pd(ymm1, ymm2, ymm13)
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vbroadcastsd(mem(rax, r8, 4), ymm2)
	vbroadcastsd(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm12)
	vfmadd231pd(ymm1, ymm2, ymm13)
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vbroadcastsd(mem(rax, r8, 4), ymm2)
	vbroadcastsd(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm12)
	vfmadd231pd(ymm1, ymm2, ymm13)
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	// ---------------------------------- iteration 3

#if 1
	lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vbroadcastsd(mem(rax, r8, 4), ymm2)
	vbroadcastsd(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm12)
	vfmadd231pd(ymm1, ymm2, ymm13)
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER)                   // iterate again if i != 0.

	label(.DCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.

	label(.DLOOPKLEFT)                 // EDGE LOOP

#if 0
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vbroadcastsd(mem(rax, r8, 4), ymm2)
	vbroadcastsd(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm12)
	vfmadd231pd(ymm1, ymm2, ymm13)
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKLEFT)                   // iterate again if i != 0.

	label(.DPOSTACCUM)

	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
	vmulpd(ymm0, ymm5, ymm5)
	vmulpd(ymm0, ymm6, ymm6)
	vmulpd(ymm0, ymm7, ymm7)
	vmulpd(ymm0, ymm8, ymm8)
	vmulpd(ymm0, ymm9, ymm9)
	vmulpd(ymm0, ymm10, ymm10)
	vmulpd(ymm0, ymm11, ymm11)
	vmulpd(ymm0, ymm12, ymm12)
	vmulpd(ymm0, ymm13, ymm13)
	vmulpd(ymm0, ymm14, ymm14)
	vmulpd(ymm0, ymm15, ymm15)

	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

	//lea(mem(rcx, rsi, 4), rdx)       // load address of c + 4*cs_c;
	lea(mem(rcx, rdi, 4), rdx)         // load address of c + 4*rs_c;

	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORED)                    // jump to column storage case

	label(.DROWSTORED)

	// For each of the six rows: c[0:8] = beta*c[0:8] + accumulators,
	// then step rcx to the next row.
	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
	vmovupd(ymm5, mem(rcx, 1*32))
	add(rdi, rcx)                      // c += rs_c;

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
	vmovupd(ymm6, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7)
	vmovupd(ymm7, mem(rcx, 1*32))
	add(rdi, rcx)                      // c += rs_c;

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
	vmovupd(ymm8, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9)
	vmovupd(ymm9, mem(rcx, 1*32))
	add(rdi, rcx)                      // c += rs_c;

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
	vmovupd(ymm10, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11)
	vmovupd(ymm11, mem(rcx, 1*32))
	add(rdi, rcx)                      // c += rs_c;

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
	vmovupd(ymm12, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13)
	vmovupd(ymm13, mem(rcx, 1*32))
	add(rdi, rcx)                      // c += rs_c;

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14)
	vmovupd(ymm14, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm15)
	vmovupd(ymm15, mem(rcx, 1*32))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DCOLSTORED)

	// begin I/O on columns 0-3
	// Transpose rows 0-3 of columns 0-3 in registers so that ymm4, ymm6,
	// ymm8 and ymm10 each hold one column, then update C column by column.
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vunpcklpd(ymm10, ymm8, ymm2)
	vunpckhpd(ymm10, ymm8, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

	vbroadcastsd(mem(rbx), ymm3)       // reload beta (ymm3 was overwritten above)

	vfmadd231pd(mem(rcx), ymm3, ymm4)
	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
	vmovupd(ymm4, mem(rcx))
	vmovupd(ymm6, mem(rcx, rsi, 1))
	vmovupd(ymm8, mem(rcx, rsi, 2))
	vmovupd(ymm10, mem(rcx, rax, 1))

	lea(mem(rcx, rsi, 4), rcx)         // rcx = c + 4*cs_c;

	// Rows 4-5 of columns 0-3, addressed through rdx (c + 4*rs_c).
	vunpcklpd(ymm14, ymm12, ymm0)
	vunpckhpd(ymm14, ymm12, ymm1)
	vextractf128(imm(0x1), ymm0, xmm2)
	vextractf128(imm(0x1), ymm1, xmm4)

	vfmadd231pd(mem(rdx), xmm3, xmm0)
	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
	vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
	vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
	vmovupd(xmm0, mem(rdx))
	vmovupd(xmm1, mem(rdx, rsi, 1))
	vmovupd(xmm2, mem(rdx, rsi, 2))
	vmovupd(xmm4, mem(rdx, rax, 1))

	lea(mem(rdx, rsi, 4), rdx)         // rdx += 4*cs_c;

	// begin I/O on columns 4-7
	vunpcklpd(ymm7, ymm5, ymm0)
	vunpckhpd(ymm7, ymm5, ymm1)
	vunpcklpd(ymm11, ymm9, ymm2)
	vunpckhpd(ymm11, ymm9, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)

	vbroadcastsd(mem(rbx), ymm3)       // reload beta (ymm3 was overwritten above)

	vfmadd231pd(mem(rcx), ymm3, ymm5)
	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7)
	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9)
	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11)
	vmovupd(ymm5, mem(rcx))
	vmovupd(ymm7, mem(rcx, rsi, 1))
	vmovupd(ymm9, mem(rcx, rsi, 2))
	vmovupd(ymm11, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	vunpcklpd(ymm15, ymm13, ymm0)
	vunpckhpd(ymm15, ymm13, ymm1)
	vextractf128(imm(0x1), ymm0, xmm2)
	vextractf128(imm(0x1), ymm1, xmm4)

	vfmadd231pd(mem(rdx), xmm3, xmm0)
	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
	vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2)
	vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4)
	vmovupd(xmm0, mem(rdx))
	vmovupd(xmm1, mem(rdx, rsi, 1))
	vmovupd(xmm2, mem(rdx, rsi, 2))
	vmovupd(xmm4, mem(rdx, rax, 1))

	//lea(mem(rdx, rsi, 4), rdx)

	jmp(.DDONE)                        // jump to end.

	label(.DBETAZERO)

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORBZ)                    // jump to column storage case

	label(.DROWSTORBZ)

	// beta == 0: store the scaled accumulators row by row without
	// reading C.
	vmovupd(ymm4, mem(rcx, 0*32))
	vmovupd(ymm5, mem(rcx, 1*32))
	add(rdi, rcx)                      // c += rs_c;

	vmovupd(ymm6, mem(rcx, 0*32))
	vmovupd(ymm7, mem(rcx, 1*32))
	add(rdi, rcx)                      // c += rs_c;

	vmovupd(ymm8, mem(rcx, 0*32))
	vmovupd(ymm9, mem(rcx, 1*32))
	add(rdi, rcx)                      // c += rs_c;

	vmovupd(ymm10, mem(rcx, 0*32))
	vmovupd(ymm11, mem(rcx, 1*32))
	add(rdi, rcx)                      // c += rs_c;

	vmovupd(ymm12, mem(rcx, 0*32))
	vmovupd(ymm13, mem(rcx, 1*32))
	add(rdi, rcx)                      // c += rs_c;

	vmovupd(ymm14, mem(rcx, 0*32))
	vmovupd(ymm15, mem(rcx, 1*32))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DCOLSTORBZ)

	// begin I/O on columns 0-3
	// Same in-register transpose as the beta != 0 column path, but the
	// results are stored without reading C.
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vunpcklpd(ymm10, ymm8, ymm2)
	vunpckhpd(ymm10, ymm8, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

	vmovupd(ymm4, mem(rcx))
	vmovupd(ymm6, mem(rcx, rsi, 1))
	vmovupd(ymm8, mem(rcx, rsi, 2))
	vmovupd(ymm10, mem(rcx, rax, 1))

	lea(mem(rcx, rsi, 4), rcx)         // rcx = c + 4*cs_c;

	vunpcklpd(ymm14, ymm12, ymm0)
	vunpckhpd(ymm14, ymm12, ymm1)
	vextractf128(imm(0x1), ymm0, xmm2)
	vextractf128(imm(0x1), ymm1, xmm4)

	vmovupd(xmm0, mem(rdx))
	vmovupd(xmm1, mem(rdx, rsi, 1))
	vmovupd(xmm2, mem(rdx, rsi, 2))
	vmovupd(xmm4, mem(rdx, rax, 1))

	lea(mem(rdx, rsi, 4), rdx)         // rdx += 4*cs_c;

	// begin I/O on columns 4-7
	vunpcklpd(ymm7, ymm5, ymm0)
	vunpckhpd(ymm7, ymm5, ymm1)
	vunpcklpd(ymm11, ymm9, ymm2)
	vunpckhpd(ymm11, ymm9, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)

	vmovupd(ymm5, mem(rcx))
	vmovupd(ymm7, mem(rcx, rsi, 1))
	vmovupd(ymm9, mem(rcx, rsi, 2))
	vmovupd(ymm11, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	vunpcklpd(ymm15, ymm13, ymm0)
	vunpckhpd(ymm15, ymm13, ymm1)
	vextractf128(imm(0x1), ymm0, xmm2)
	vextractf128(imm(0x1), ymm1, xmm4)

	vmovupd(xmm0, mem(rdx))
	vmovupd(xmm1, mem(rdx, rsi, 1))
	vmovupd(xmm2, mem(rdx, rsi, 2))
	vmovupd(xmm4, mem(rdx, rax, 1))

	//lea(mem(rdx, rsi, 4), rdx)

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}
void bli_dgemmsup_rv_haswell_asm_5x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a.
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 4*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 4*8)) // 
prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) 
vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm11, ymm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm13, ymm13) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13) vmovupd(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORED) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vmovlpd(mem(rdx ), xmm0, xmm0) vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(ymm12, ymm3, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx )) vmovhpd(xmm0, mem(rdx, rsi, 1)) vmovlpd(xmm1, mem(rdx, rsi, 2)) vmovhpd(xmm1, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-7 vunpcklpd(ymm7, ymm5, ymm0) 
vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx ), ymm3, ymm5) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11) vmovupd(ymm5, mem(rcx )) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vmovlpd(mem(rdx ), xmm0, xmm0) vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(ymm13, ymm3, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx )) vmovhpd(xmm0, mem(rdx, rsi, 1)) vmovlpd(xmm1, mem(rdx, rsi, 2)) vmovhpd(xmm1, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx )) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vmovupd(ymm12, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx )) vmovhpd(xmm0, mem(rdx, rsi, 1)) vmovlpd(xmm1, mem(rdx, rsi, 2)) vmovhpd(xmm1, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-7 vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vmovupd(ymm5, mem(rcx )) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vmovupd(ymm13, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx )) vmovhpd(xmm0, mem(rdx, rsi, 1)) vmovlpd(xmm1, mem(rdx, rsi, 2)) vmovhpd(xmm1, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void 
bli_dgemmsup_rv_haswell_asm_4x8
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	// Inline-assembly double-precision kernel that updates a fixed 4x8 tile
	// of C:
	//
	//     C := beta * C + alpha * (A * B)
	//
	// A supplies 4 rows, B supplies 8 columns, and the product is accumulated
	// over k0 steps in ymm4-ymm11 (two 4-wide registers per row of the tile:
	// ymm4/ymm5 = row 0, ..., ymm10/ymm11 = row 3).
	//
	//   conja, conjb     Not referenced in this function body.
	//   m0, n0           Not referenced; the tile size is fixed by the
	//                    instruction sequence.
	//   k0               Number of accumulation steps; split into k0 / 4
	//                    passes of the 4x-unrolled main loop plus k0 % 4
	//                    passes of the edge loop.
	//   alpha, beta      Pointers to the scalars, read with vbroadcastsd
	//                    after the accumulation loops.
	//   a, rs_a0, cs_a0  Base address and strides of A in elements (scaled to
	//                    bytes inside the asm). Each step reads a[0],
	//                    a[rs_a], a[2*rs_a], a[3*rs_a], then advances by cs_a.
	//   b, rs_b0, cs_b0  Base address and strides of B. Each step loads 8
	//                    consecutive doubles at b, then advances by rs_b.
	//                    cs_b is passed as an operand but never read.
	//   c, rs_c0, cs_c0  Base address and strides of C. rs_c == 1 selects the
	//                    column-stored paths (tile transposed in registers,
	//                    written using cs_c); any other rs_c selects the
	//                    row-stored paths, which access each row as 8
	//                    consecutive doubles and do not use cs_c.
	//   data, cntx       Not referenced (the a_next/b_next lookups are
	//                    commented out).
	//
	// When beta compares equal to zero, C is only written and never read.

	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall()                         // zero all xmm/ymm registers.
	                                   // (accumulators ymm4-ymm11 start at 0.)

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)                // load cs_b
	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

	                                   // NOTE: We cannot pre-load elements of a or b
	                                   // because it could eventually, in the last
	                                   // unrolled iter or the cleanup loop, result
	                                   // in reading beyond the bounds allocated mem
	                                   // (the likely result: a segmentation fault).

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)



	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLPFETCH)                    // jump to column storage case
	label(.DROWPFETCH)                 // row-stored prefetching on c

	                                   // Offset 7*8 bytes addresses the last of
	                                   // the 8 doubles in each of the 4 rows.
	lea(mem(rcx, rdi, 2), rdx)         //
	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx,         7*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
	prefetch(0, mem(rdx,         7*8)) // prefetch c + 3*rs_c

	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
	label(.DCOLPFETCH)                 // column-stored prefetching c

	                                   // Offset 3*8 bytes addresses the last of
	                                   // the 4 doubles in each of the 8 columns.
	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
	lea(mem(rcx, rsi, 2), rdx)         //
	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rdx,         3*8)) // prefetch c + 3*cs_c
	prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 4*cs_c
	prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 5*cs_c
	lea(mem(rdx, rsi, 2), rdx)         // rdx = c + 5*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 6*cs_c
	prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 7*cs_c

	label(.DPOSTPFETCH)                // done prefetching c


	                                   // From here until .DPOSTACCUM, rdx is the
	                                   // prefetch pointer into A.
#if 1
	lea(mem(rax, r9,  8), rdx)         //
	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
#endif




	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.


	label(.DLOOPKITER)                 // MAIN LOOP

	                                   // Each of the four unrolled steps loads
	                                   // one 8-wide row of B into ymm0/ymm1,
	                                   // broadcasts the four A elements of the
	                                   // current column, and FMAs each into the
	                                   // accumulator pair of its row.

	// ---------------------------------- iteration 0

#if 1
	prefetch(0, mem(rdx, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)


	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, r9, 1, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)


	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(rdx, r9, 2, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)


	// ---------------------------------- iteration 3

#if 1
	lea(mem(rdx, r9,  4), rdx)         // a_prefetch += 4*cs_a;
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)



	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER)                   // iterate again if i != 0.






	label(.DCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.


	label(.DLOOPKLEFT)                 // EDGE LOOP

	                                   // One accumulation step per pass; same
	                                   // instruction pattern as one unrolled
	                                   // step of the main loop.

#if 0
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)


	dec(rsi)                           // i -= 1;
	jne(.DLOOPKLEFT)                   // iterate again if i != 0.



	label(.DPOSTACCUM)

	                                   // rax and rbx are reused from here on;
	                                   // the a and b pointers they held are no
	                                   // longer read.

	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
	vmulpd(ymm0, ymm5, ymm5)
	vmulpd(ymm0, ymm6, ymm6)
	vmulpd(ymm0, ymm7, ymm7)
	vmulpd(ymm0, ymm8, ymm8)
	vmulpd(ymm0, ymm9, ymm9)
	vmulpd(ymm0, ymm10, ymm10)
	vmulpd(ymm0, ymm11, ymm11)






	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
	//lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;

	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;



	                                   // now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case



	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORED)                    // jump to column storage case



	label(.DROWSTORED)

	                                   // For each row: acc += beta * c, then
	                                   // store acc back to c (two ymm per row).

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
	vmovupd(ymm5, mem(rcx, 1*32))
	add(rdi, rcx)


	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
	vmovupd(ymm6, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7)
	vmovupd(ymm7, mem(rcx, 1*32))
	add(rdi, rcx)


	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
	vmovupd(ymm8, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9)
	vmovupd(ymm9, mem(rcx, 1*32))
	add(rdi, rcx)


	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
	vmovupd(ymm10, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11)
	vmovupd(ymm11, mem(rcx, 1*32))
	//add(rdi, rcx)


	jmp(.DDONE)                        // jump to end.



	label(.DCOLSTORED)

	                                   // begin I/O on columns 0-3

	                                   // Transpose the 4x4 sub-tile so that
	                                   // ymm4/6/8/10 each hold rows 0-3 of one
	                                   // column. ymm3 is used as scratch, so
	                                   // beta is re-broadcast afterwards.
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vunpcklpd(ymm10, ymm8, ymm2)
	vunpckhpd(ymm10, ymm8, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

	vbroadcastsd(mem(rbx), ymm3)

	vfmadd231pd(mem(rcx        ), ymm3, ymm4)
	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8)
	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10)
	vmovupd(ymm4, mem(rcx        ))
	vmovupd(ymm6, mem(rcx, rsi, 1))
	vmovupd(ymm8, mem(rcx, rsi, 2))
	vmovupd(ymm10, mem(rcx, rax, 1))

	lea(mem(rcx, rsi, 4), rcx)

	                                   // begin I/O on columns 4-7
	vunpcklpd(ymm7, ymm5, ymm0)
	vunpckhpd(ymm7, ymm5, ymm1)
	vunpcklpd(ymm11, ymm9, ymm2)
	vunpckhpd(ymm11, ymm9, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)

	vbroadcastsd(mem(rbx), ymm3)

	vfmadd231pd(mem(rcx        ), ymm3, ymm5)
	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7)
	vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9)
	vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11)
	vmovupd(ymm5, mem(rcx        ))
	vmovupd(ymm7, mem(rcx, rsi, 1))
	vmovupd(ymm9, mem(rcx, rsi, 2))
	vmovupd(ymm11, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)



	jmp(.DDONE)                        // jump to end.



	label(.DBETAZERO)

	                                   // beta == 0: the paths below only store
	                                   // to c; nothing is loaded from it.

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORBZ)                    // jump to column storage case



	label(.DROWSTORBZ)


	vmovupd(ymm4, mem(rcx, 0*32))
	vmovupd(ymm5, mem(rcx, 1*32))
	add(rdi, rcx)

	vmovupd(ymm6, mem(rcx, 0*32))
	vmovupd(ymm7, mem(rcx, 1*32))
	add(rdi, rcx)


	vmovupd(ymm8, mem(rcx, 0*32))
	vmovupd(ymm9, mem(rcx, 1*32))
	add(rdi, rcx)


	vmovupd(ymm10, mem(rcx, 0*32))
	vmovupd(ymm11, mem(rcx, 1*32))
	//add(rdi, rcx)


	jmp(.DDONE)                        // jump to end.
	label(.DCOLSTORBZ)

	                                   // begin I/O on columns 0-3
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vunpcklpd(ymm10, ymm8, ymm2)
	vunpckhpd(ymm10, ymm8, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

	vmovupd(ymm4, mem(rcx        ))
	vmovupd(ymm6, mem(rcx, rsi, 1))
	vmovupd(ymm8, mem(rcx, rsi, 2))
	vmovupd(ymm10, mem(rcx, rax, 1))

	lea(mem(rcx, rsi, 4), rcx)

	                                   // begin I/O on columns 4-7
	vunpcklpd(ymm7, ymm5, ymm0)
	vunpckhpd(ymm7, ymm5, ymm1)
	vunpcklpd(ymm11, ymm9, ymm2)
	vunpckhpd(ymm11, ymm9, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)

	vmovupd(ymm5, mem(rcx        ))
	vmovupd(ymm7, mem(rcx, rsi, 1))
	vmovupd(ymm9, mem(rcx, rsi, 2))
	vmovupd(ymm11, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)



	label(.DDONE)



	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a]      "m" (a),
	  [rs_a]   "m" (rs_a),
	  [cs_a]   "m" (cs_a),
	  [b]      "m" (b),
	  [rs_b]   "m" (rs_b),
	  [cs_b]   "m" (cs_b),
	  [alpha]  "m" (alpha),
	  [beta]   "m" (beta),
	  [c]      "m" (c),
	  [rs_c]   "m" (rs_c),
	  [cs_c]   "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

// -----------------------------------------------------------------------------
// bli_dgemmsup_rv_haswell_asm_3x8
//
// Inline-assembly double-precision kernel that updates a fixed 3x8 tile of C:
//
//     C := beta * C + alpha * (A * B)
//
// A supplies 3 rows, B supplies 8 columns, and the product is accumulated
// over k0 steps in ymm4-ymm9 (two 4-wide registers per row of the tile:
// ymm4/ymm5 = row 0, ymm6/ymm7 = row 1, ymm8/ymm9 = row 2).
//
// Parameters:
//   conja, conjb     Not referenced in this function body.
//   m0, n0           Not referenced; the tile size is fixed by the
//                    instruction sequence.
//   k0               Number of accumulation steps; split into k0 / 4 passes
//                    of the 4x-unrolled main loop plus k0 % 4 passes of the
//                    edge loop.
//   alpha, beta      Pointers to the scalars, read with vbroadcastsd after
//                    the accumulation loops.
//   a, rs_a0, cs_a0  Base address and row/column strides of A in elements
//                    (scaled to bytes inside the asm). Each step reads
//                    a[0], a[rs_a], a[2*rs_a], then advances by cs_a.
//   b, rs_b0, cs_b0  Base address and strides of B. Each step loads 8
//                    consecutive doubles at b, then advances by rs_b. cs_b
//                    is passed as an operand but never read by the asm.
//   c, rs_c0, cs_c0  Base address and strides of C. rs_c == 1 selects the
//                    column-stored paths (the tile is transposed in
//                    registers and written using cs_c); any other rs_c
//                    selects the row-stored paths, which access each row as
//                    8 consecutive doubles and do not use cs_c.
//   data, cntx       Not referenced (the a_next/b_next lookups are
//                    commented out).
//
// When beta compares equal to zero, C is only written and never read.
// -----------------------------------------------------------------------------
void bli_dgemmsup_rv_haswell_asm_3x8
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall()                         // zero all xmm/ymm registers.
	                                   // (accumulators ymm4-ymm9 start at 0;
	                                   // ymm10/ymm11 stay 0 through the
	                                   // accumulation loops.)

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)                // load cs_b
	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

	                                   // NOTE: We cannot pre-load elements of a or b
	                                   // because it could eventually, in the last
	                                   // unrolled iter or the cleanup loop, result
	                                   // in reading beyond the bounds allocated mem
	                                   // (the likely result: a segmentation fault).

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)



	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLPFETCH)                    // jump to column storage case
	label(.DROWPFETCH)                 // row-stored prefetching on c

	                                   // Offset 7*8 bytes addresses the last of
	                                   // the 8 doubles in each of the 3 rows.
	//lea(mem(rcx, rdi, 2), rdx)         //
	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx,         7*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c

	jmp(.DPOSTPFETCH)                  // jump to end of prefetching c
	label(.DCOLPFETCH)                 // column-stored prefetching c

	                                   // Offset 2*8 bytes addresses the last of
	                                   // the 3 doubles in each of the 8 columns.
	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
	lea(mem(rcx, rsi, 2), rdx)         //
	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
	prefetch(0, mem(rcx,         2*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rdx,         2*8)) // prefetch c + 3*cs_c
	prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 4*cs_c
	prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 5*cs_c
	lea(mem(rdx, rsi, 2), rdx)         // rdx = c + 5*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 6*cs_c
	prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 7*cs_c

	label(.DPOSTPFETCH)                // done prefetching c


	                                   // From here until .DPOSTACCUM, rdx is the
	                                   // prefetch pointer into A.
#if 1
	lea(mem(rax, r9,  8), rdx)         //
	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
#endif




	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.


	label(.DLOOPKITER)                 // MAIN LOOP

	                                   // Each of the four unrolled steps loads
	                                   // one 8-wide row of B into ymm0/ymm1,
	                                   // broadcasts the three A elements of the
	                                   // current column, and FMAs each into the
	                                   // accumulator pair of its row.

	// ---------------------------------- iteration 0

#if 1
	prefetch(0, mem(rdx, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)


	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, r9, 1, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)


	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(rdx, r9, 2, 4*8))
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)


	// ---------------------------------- iteration 3

#if 1
	lea(mem(rdx, r9,  4), rdx)         // a_prefetch += 4*cs_a;
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)



	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER)                   // iterate again if i != 0.






	label(.DCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.


	label(.DLOOPKLEFT)                 // EDGE LOOP

	                                   // One accumulation step per pass; same
	                                   // instruction pattern as one unrolled
	                                   // step of the main loop.

#if 0
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)
#endif

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastsd(mem(rax        ), ymm2)
	vbroadcastsd(mem(rax, r8,  1), ymm3)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vbroadcastsd(mem(rax, r8,  2), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm8)
	vfmadd231pd(ymm1, ymm2, ymm9)


	dec(rsi)                           // i -= 1;
	jne(.DLOOPKLEFT)                   // iterate again if i != 0.



	label(.DPOSTACCUM)

	                                   // rax and rbx are reused from here on;
	                                   // the a and b pointers they held are no
	                                   // longer read.

	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
	vmulpd(ymm0, ymm5, ymm5)
	vmulpd(ymm0, ymm6, ymm6)
	vmulpd(ymm0, ymm7, ymm7)
	vmulpd(ymm0, ymm8, ymm8)
	vmulpd(ymm0, ymm9, ymm9)






	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
	lea(mem(rcx, rdi, 2), rdx)         // load address of c +  2*rs_c;
	                                   // (row 2; used by the column-stored
	                                   // paths below.)

	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;



	                                   // now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case



	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORED)                    // jump to column storage case



	label(.DROWSTORED)

	                                   // For each row: acc += beta * c, then
	                                   // store acc back to c (two ymm per row).

	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5)
	vmovupd(ymm5, mem(rcx, 1*32))
	add(rdi, rcx)


	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
	vmovupd(ymm6, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7)
	vmovupd(ymm7, mem(rcx, 1*32))
	add(rdi, rcx)


	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
	vmovupd(ymm8, mem(rcx, 0*32))

	vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9)
	vmovupd(ymm9, mem(rcx, 1*32))
	//add(rdi, rcx)


	jmp(.DDONE)                        // jump to end.



	label(.DCOLSTORED)

	                                   // begin I/O on columns 0-3

	                                   // Transpose with the still-zero ymm10 as
	                                   // a fourth row, so ymm4/6/8/10 each hold
	                                   // one column: rows 0-1 in the low 128
	                                   // bits and row 2 in the low element of
	                                   // the high 128 bits (extracted to
	                                   // xmm12-xmm15 below). ymm3 is used as
	                                   // scratch, so beta is re-broadcast.
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vunpcklpd(ymm10, ymm8, ymm2)
	vunpckhpd(ymm10, ymm8, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

	vextractf128(imm(0x1), ymm4, xmm12)
	vextractf128(imm(0x1), ymm6, xmm13)
	vextractf128(imm(0x1), ymm8, xmm14)
	vextractf128(imm(0x1), ymm10, xmm15)

	vbroadcastsd(mem(rbx), ymm3)

	                                   // Rows 0-1 of each column (128-bit ops
	                                   // at rcx).
	vfmadd231pd(mem(rcx        ), xmm3, xmm4)
	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6)
	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm8)
	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm10)
	vmovupd(xmm4, mem(rcx        ))
	vmovupd(xmm6, mem(rcx, rsi, 1))
	vmovupd(xmm8, mem(rcx, rsi, 2))
	vmovupd(xmm10, mem(rcx, rax, 1))

	lea(mem(rcx, rsi, 4), rcx)

	                                   // Row 2 of each column (scalar ops at
	                                   // rdx = c + 2*rs_c).
	vfmadd231sd(mem(rdx        ), xmm3, xmm12)
	vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
	vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14)
	vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15)
	vmovsd(xmm12, mem(rdx        ))
	vmovsd(xmm13, mem(rdx, rsi, 1))
	vmovsd(xmm14, mem(rdx, rsi, 2))
	vmovsd(xmm15, mem(rdx, rax, 1))

	lea(mem(rdx, rsi, 4), rdx)

	                                   // begin I/O on columns 4-7

	                                   // Same sequence for ymm5/7/9 with the
	                                   // still-zero ymm11 as the fourth row.
	vunpcklpd(ymm7, ymm5, ymm0)
	vunpckhpd(ymm7, ymm5, ymm1)
	vunpcklpd(ymm11, ymm9, ymm2)
	vunpckhpd(ymm11, ymm9, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)

	vextractf128(imm(0x1), ymm5, xmm12)
	vextractf128(imm(0x1), ymm7, xmm13)
	vextractf128(imm(0x1), ymm9, xmm14)
	vextractf128(imm(0x1), ymm11, xmm15)

	vbroadcastsd(mem(rbx), ymm3)

	vfmadd231pd(mem(rcx        ), xmm3, xmm5)
	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm7)
	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm9)
	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm11)
	vmovupd(xmm5, mem(rcx        ))
	vmovupd(xmm7, mem(rcx, rsi, 1))
	vmovupd(xmm9, mem(rcx, rsi, 2))
	vmovupd(xmm11, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	vfmadd231sd(mem(rdx        ), xmm3, xmm12)
	vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
	vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14)
	vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15)
	vmovsd(xmm12, mem(rdx        ))
	vmovsd(xmm13, mem(rdx, rsi, 1))
	vmovsd(xmm14, mem(rdx, rsi, 2))
	vmovsd(xmm15, mem(rdx, rax, 1))

	//lea(mem(rdx, rsi, 4), rdx)



	jmp(.DDONE)                        // jump to end.



	label(.DBETAZERO)

	                                   // beta == 0: the paths below only store
	                                   // to c; nothing is loaded from it.

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORBZ)                    // jump to column storage case



	label(.DROWSTORBZ)


	vmovupd(ymm4, mem(rcx, 0*32))
	vmovupd(ymm5, mem(rcx, 1*32))
	add(rdi, rcx)

	vmovupd(ymm6, mem(rcx, 0*32))
	vmovupd(ymm7, mem(rcx, 1*32))
	add(rdi, rcx)


	vmovupd(ymm8, mem(rcx, 0*32))
	vmovupd(ymm9, mem(rcx, 1*32))
	//add(rdi, rcx)


	jmp(.DDONE)                        // jump to end.



	label(.DCOLSTORBZ)

	                                   // begin I/O on columns 0-3
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vunpcklpd(ymm10, ymm8, ymm2)
	vunpckhpd(ymm10, ymm8, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

	vextractf128(imm(0x1), ymm4, xmm12)
	vextractf128(imm(0x1), ymm6, xmm13)
	vextractf128(imm(0x1), ymm8, xmm14)
	vextractf128(imm(0x1), ymm10, xmm15)

	vmovupd(xmm4, mem(rcx        ))
	vmovupd(xmm6, mem(rcx, rsi, 1))
	vmovupd(xmm8, mem(rcx, rsi, 2))
	vmovupd(xmm10, mem(rcx, rax, 1))

	lea(mem(rcx, rsi, 4), rcx)

	vmovsd(xmm12, mem(rdx        ))
	vmovsd(xmm13, mem(rdx, rsi, 1))
	vmovsd(xmm14, mem(rdx, rsi, 2))
	vmovsd(xmm15, mem(rdx, rax, 1))

	lea(mem(rdx, rsi, 4), rdx)

	                                   // begin I/O on columns 4-7
	vunpcklpd(ymm7, ymm5, ymm0)
	vunpckhpd(ymm7, ymm5, ymm1)
	vunpcklpd(ymm11, ymm9, ymm2)
	vunpckhpd(ymm11, ymm9, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm11)

	vextractf128(imm(0x1), ymm5, xmm12)
	vextractf128(imm(0x1), ymm7, xmm13)
	vextractf128(imm(0x1), ymm9, xmm14)
	vextractf128(imm(0x1), ymm11, xmm15)

	vmovupd(xmm5, mem(rcx        ))
	vmovupd(xmm7, mem(rcx, rsi, 1))
	vmovupd(xmm9, mem(rcx, rsi, 2))
	vmovupd(xmm11, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	vmovsd(xmm12, mem(rdx        ))
	vmovsd(xmm13, mem(rdx, rsi, 1))
	vmovsd(xmm14, mem(rdx, rsi, 2))
	vmovsd(xmm15, mem(rdx, rax, 1))

	//lea(mem(rdx, rsi, 4), rdx)



	label(.DDONE)



	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a]      "m" (a),
	  [rs_a]   "m" (rs_a),
	  [cs_a]   "m" (cs_a),
	  [b]      "m" (b),
	  [rs_b]   "m" (rs_b),
	  [cs_b]   "m" (cs_b),
	  [alpha]  "m" (alpha),
	  [beta]   "m" (beta),
	  [c]      "m" (c),
	  [rs_c]   "m" (rs_c),
	  [cs_c]   "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi","rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

void bli_dgemmsup_rv_haswell_asm_2x8
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)                // load cs_b
	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

	                                   // NOTE: We cannot pre-load elements of a or b
	                                   // because it could eventually, in the last
	                                   // unrolled iter or the cleanup loop, result
	                                   // in reading beyond the bounds allocated mem
	                                   // (the likely result: a segmentation fault).

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)



	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORED) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rcx ), xmm3, xmm0) vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm2) vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rcx )) vmovupd(xmm1, mem(rcx, rsi, 1)) vmovupd(xmm2, mem(rcx, rsi, 2)) vmovupd(xmm4, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // begin I/O on columns 4-7 vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rcx ), xmm3, xmm0) vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm2) vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rcx )) vmovupd(xmm1, mem(rcx, rsi, 1)) vmovupd(xmm2, mem(rcx, rsi, 2)) vmovupd(xmm4, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ) // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rcx )) vmovupd(xmm1, mem(rcx, rsi, 1)) vmovupd(xmm2, mem(rcx, rsi, 2)) vmovupd(xmm4, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // begin I/O on columns 4-7 vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rcx )) vmovupd(xmm1, mem(rcx, rsi, 1)) vmovupd(xmm2, mem(rcx, rsi, 2)) vmovupd(xmm4, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi","rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rv_haswell_asm_1x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 0*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORED) // begin I/O on columns 0-3 vmovlpd(mem(rcx ), xmm0, xmm0) vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(ymm4, ymm3, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rcx )) vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // begin I/O on columns 4-7 vmovlpd(mem(rcx ), xmm0, xmm0) vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(ymm5, ymm3, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rcx )) vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) // begin I/O on columns 0-3 vmovupd(ymm4, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rcx )) vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // begin I/O on columns 4-7 vmovupd(ymm5, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rcx )) vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi","rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/d6x8/old/000077500000000000000000000000001427272030600232605ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rd_haswell_asm_d6x8.c000066400000000000000000003606271427272030600316620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */ // Prototype reference microkernels. 
GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) #if 0 // Define parameters and variables for edge case kernel map. #define NUM_MR 4 #define NUM_NR 4 #define FUNCPTR_T dgemmsup_ker_ft static dim_t mrs[NUM_MR] = { 6, 3, 2, 1 }; static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 }; static FUNCPTR_T kmap[NUM_MR][NUM_NR] = { /* 8 4 2 1 */ /* 6 */ { bli_dgemmsup_rd_haswell_asm_6x8m, bli_dgemmsup_rd_haswell_asm_6x4m, bli_dgemmsup_rd_haswell_asm_6x2m, bli_dgemmsup_r_haswell_ref_6x1 }, /* 3 */ { bli_dgemmsup_rd_haswell_asm_3x8m, bli_dgemmsup_rd_haswell_asm_3x4m, bli_dgemmsup_rd_haswell_asm_3x2m, bli_dgemmsup_r_haswell_ref_3x1 }, /* 2 */ { bli_dgemmsup_rd_haswell_asm_2x8m, bli_dgemmsup_rd_haswell_asm_2x4m, bli_dgemmsup_rd_haswell_asm_2x2m, bli_dgemmsup_r_haswell_ref_2x1 }, /* 1 */ { bli_dgemmsup_rd_haswell_asm_1x8m, bli_dgemmsup_rd_haswell_asm_1x4m, bli_dgemmsup_rd_haswell_asm_1x2m, bli_dgemmsup_r_haswell_ref_1x1 } }; #endif void bli_dgemmsup_rd_haswell_asm_6x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t n_left = n0 % 8; // First check whether this is a edge case in the n dimension. If so, // dispatch other 6x?m kernels, as needed. 
if ( n_left ) { double* restrict cij = c; double* restrict bj = b; double* restrict ai = a; if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_dgemmsup_rd_haswell_asm_6x4 ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_dgemmsup_rd_haswell_asm_6x2 ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 0 const dim_t nr_cur = 1; bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_dgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. 
//mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a //mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.DLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] mov(var(a), r14) // load address of a mov(var(c), r12) // load address of c lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*8), rsi) // rsi *= cs_c*sizeof(double) = 1*8 lea(mem(r12, rsi, 1), r12) // r12 = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rdx) // rbx = b + 4*jj*cs_b; mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; #if 1 mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rdi) // rdi = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. 
je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) 
vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and 
duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. add(imm(4), r15) // jj += 4; cmp(imm(4), r15) // compare jj to 4 jle(.DLOOP3X4J) // if jj <= 4, jump to beginning // of jj loop; otherwise, loop ends. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left )
	{
		const dim_t nr_cur = 8;
		const dim_t i_edge = m0 - ( dim_t )m_left;

		double* restrict cij = c + i_edge*rs_c;
		double* restrict bj = b;
		double* restrict ai = a + i_edge*rs_a;

		if ( 2 == m_left )
		{
			const dim_t mr_cur = 2;

			bli_dgemmsup_rd_haswell_asm_2x8
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			//cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
		}
		if ( 1 == m_left )
		{
			const dim_t mr_cur = 1;

			bli_dgemmsup_rd_haswell_asm_1x8
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
		}
	}
}

// Edge-case kernel: updates a 2x8 block of C as
//
//   C := beta * C + alpha * (A * B)
//
// where every entry of A*B is formed as a dot product over k. The eight
// columns are processed as two passes of four columns (jj = 0 and jj = 4).
//
// Properties that follow directly from the code below:
// - k0 is split as 16*k_iter16 + 4*k_iter4 + k_left1.
// - A and B are both read as contiguous runs along k (the a/b pointers
//   advance by 4*8 or 1*8 bytes), and C is written four contiguous doubles
//   per row. cs_a, rs_b and cs_c are passed as asm operands but are never
//   read by the assembly, i.e. unit stride is hard-coded for them.
// - When beta == 0 the old contents of C are never loaded.
// - m0, n0, conja, conjb, data and cntx are not referenced in this function.
void bli_dgemmsup_rd_haswell_asm_2x8
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4 = k_left16 / 4;
	uint64_t k_left1 = k_left16 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall() // zero all xmm/ymm registers.

	mov(var(a), r14) // load address of a.
	mov(var(rs_a), r8) // load rs_a
	//mov(var(cs_a), r9) // load cs_a
	lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
	//lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

	mov(var(b), rdx) // load address of b.
	//mov(var(rs_b), r10) // load rs_b
	mov(var(cs_b), r11) // load cs_b
	//lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
	lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

	lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
	//lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a

	mov(var(c), r12) // load address of c
	mov(var(rs_c), rdi) // load rs_c
	lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

	// r12 = rcx = c
	// r14 = rax = a
	// rdx = rbx = b
	// r9 = unused
	// r15 = n dim index jj
	// r10 = unused

	mov(imm(0), r15) // jj = 0;

	label(.DLOOP3X4J) // LOOP OVER jj = [ 0 4 ]

	// Clear the eight accumulators used by this kernel
	// (2 rows of A x 4 columns of B).
#if 0
	vzeroall() // zero all xmm/ymm registers.
#else
	// skylake can execute 3 vxorpd ipc with
	// a latency of 1 cycle, while vzeroall
	// has a latency of 12 cycles.
	vxorpd(ymm4, ymm4, ymm4)
	vxorpd(ymm5, ymm5, ymm5)
	vxorpd(ymm7, ymm7, ymm7)
	vxorpd(ymm8, ymm8, ymm8)
	vxorpd(ymm10, ymm10, ymm10)
	vxorpd(ymm11, ymm11, ymm11)
	vxorpd(ymm13, ymm13, ymm13)
	vxorpd(ymm14, ymm14, ymm14)
#endif

	// NOTE: r15 holds jj itself (it advances in steps of 4), so the
	// offsets below are jj columns, not 4*jj.
	lea(mem( , r15, 1), rsi) // rsi = r15 = jj;
	imul(imm(1*8), rsi) // rsi *= cs_c*sizeof(double) = 1*8
	lea(mem(r12, rsi, 1), rcx) // rcx = c + jj*cs_c;

	lea(mem( , r15, 1), rsi) // rsi = r15 = jj;
	imul(r11, rsi) // rsi *= cs_b;
	lea(mem(rdx, rsi, 1), rbx) // rbx = b + jj*cs_b;

	lea(mem(r14), rax) // rax = a;

#if 0
	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
#else
	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
#endif

	mov(var(k_iter16), rsi) // i = k_iter16;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKITER4) // if i == 0, jump to code that
	                   // contains the k_iter4 loop.

	label(.DLOOPKITER16) // MAIN LOOP (16 values of k per pass)

	// Each unrolled iteration loads four k-values from two rows of A
	// (ymm0, ymm1) and from four columns of B (ymm3, reused), and
	// accumulates the lane-wise products.

	// ---------------------------------- iteration 0

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	// ---------------------------------- iteration 1

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	// ---------------------------------- iteration 2

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	// ---------------------------------- iteration 3

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER16) // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi) // i = k_iter4;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKLEFT1) // if i == 0, jump to code that
	                   // considers k_left1 loop.
	                   // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4) // EDGE LOOP (ymm), 4 values of k per pass

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER4) // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi) // i = k_left1;
	test(rsi, rsi) // check i via logical AND.
	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
	                // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1) // EDGE LOOP (scalar), 1 value of k per pass

	// NOTE: We must use ymm registers here bc
	// using the xmm registers would zero out the
	// high bits of the destination registers,
	// which would destroy intermediate results.

	vmovsd(mem(rax ), xmm0)
	vmovsd(mem(rax, r8, 1), xmm1)
	add(imm(1*8), rax) // a += 1*cs_a = 1*8;

	vmovsd(mem(rbx ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovsd(mem(rbx, r11, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovsd(mem(rbx, r11, 2), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovsd(mem(rbx, r13, 1), xmm3)
	add(imm(1*8), rbx) // b += 1*rs_b = 1*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	dec(rsi) // i -= 1;
	jne(.DLOOPKLEFT1) // iterate again if i != 0.

	label(.DPOSTACCUM)

	// Reduce each accumulator to the sum of its four lanes and pack the
	// four column results of a row into a single register.

	// ymm4 ymm7 ymm10 ymm13
	// ymm5 ymm8 ymm11 ymm14

	vhaddpd( ymm7, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)

	vhaddpd( ymm13, ymm10, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)

	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )

	vhaddpd( ymm8, ymm5, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )

	vhaddpd( ymm14, ymm11, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )

	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )

	// ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
	// ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)

	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4) // scale by alpha
	vmulpd(ymm0, ymm5, ymm5)

	//mov(var(cs_c), rsi) // load cs_c
	//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm3) // set ZF if beta == 0.
	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)

	// beta != 0: c_row = beta * c_row + alpha * ab_row, for both rows.

	vfmadd231pd(mem(rcx), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), ymm3, ymm5)
	vmovupd(ymm5, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DBETAZERO)

	label(.DROWSTORBZ)

	// beta == 0: store alpha * ab_row without reading C.

	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vmovupd(ymm5, mem(rcx))
	//add(rdi, rcx)

	label(.DDONE)

	add(imm(4), r15) // jj += 4;
	cmp(imm(4), r15) // compare jj to 4
	jle(.DLOOP3X4J) // if jj <= 4, jump to beginning
	                // of jj loop; otherwise, loop ends.

	label(.DRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter16] "m" (k_iter16),
	  [k_iter4] "m" (k_iter4),
	  [k_left1] "m" (k_left1),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

// Edge-case kernel: updates a 1x8 block (a single row) of C as
// C := beta * C + alpha * (A * B), with every entry formed as a dot product
// over k and the eight columns handled as two passes of four (jj = 0, 4).
// Only a, b, c, cs_b, alpha, beta and the k counters are read by the
// assembly; the other stride operands are passed in but never loaded.
void bli_dgemmsup_rd_haswell_asm_1x8
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4 = k_left16 / 4;
	uint64_t k_left1 = k_left16 % 4;

	// k0 == 16*k_iter16 + 4*k_iter4 + k_left1

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall() // zero all xmm/ymm registers.

	mov(var(a), r14) // load address of a.
	//mov(var(rs_a), r8) // load rs_a
	//mov(var(cs_a), r9) // load cs_a
	//lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
	//lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

	mov(var(b), rdx) // load address of b.
	//mov(var(rs_b), r10) // load rs_b
	mov(var(cs_b), r11) // load cs_b
	//lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
	lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

	lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b

	mov(var(c), r12) // load address of c
	//mov(var(rs_c), rdi) // load rs_c
	//lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

	// r12 = rcx = c
	// r14 = rax = a
	// rdx = rbx = b
	// r9 = unused (this kernel has no ii loop)
	// r15 = n dim index jj

	mov(imm(0), r15) // jj = 0;

	label(.DLOOP3X4J) // LOOP OVER jj = [ 0 4 ]

	// Clear the four accumulators used by this kernel
	// (1 row of A x 4 columns of B).
#if 0
	vzeroall() // zero all xmm/ymm registers.
#else
	// skylake can execute 3 vxorpd ipc with
	// a latency of 1 cycle, while vzeroall
	// has a latency of 12 cycles.
	vxorpd(ymm4, ymm4, ymm4)
	vxorpd(ymm7, ymm7, ymm7)
	vxorpd(ymm10, ymm10, ymm10)
	vxorpd(ymm13, ymm13, ymm13)
#endif

	// NOTE: r15 holds jj itself (it advances in steps of 4), so the
	// offsets below are jj columns, not 4*jj.
	lea(mem( , r15, 1), rsi) // rsi = r15 = jj;
	imul(imm(1*8), rsi) // rsi *= cs_c*sizeof(double) = 1*8
	lea(mem(r12, rsi, 1), rcx) // rcx = c + jj*cs_c;

	lea(mem( , r15, 1), rsi) // rsi = r15 = jj;
	imul(r11, rsi) // rsi *= cs_b;
	lea(mem(rdx, rsi, 1), rbx) // rbx = b + jj*cs_b;

	lea(mem(r14), rax) // rax = a;

#if 0
	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
#else
	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c
#endif

	mov(var(k_iter16), rsi) // i = k_iter16;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKITER4) // if i == 0, jump to code that
	                   // contains the k_iter4 loop.

	label(.DLOOPKITER16) // MAIN LOOP (16 values of k per pass)

	// Each unrolled iteration loads four k-values from the single row of
	// A (ymm0) and from four columns of B (ymm3, reused), and accumulates
	// the lane-wise products.

	// ---------------------------------- iteration 0

	vmovupd(mem(rax ), ymm0)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	// ---------------------------------- iteration 1

	vmovupd(mem(rax ), ymm0)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	// ---------------------------------- iteration 2

	vmovupd(mem(rax ), ymm0)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	// ---------------------------------- iteration 3

	vmovupd(mem(rax ), ymm0)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER16) // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi) // i = k_iter4;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKLEFT1) // if i == 0, jump to code that
	                   // considers k_left1 loop.
	                   // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4) // EDGE LOOP (ymm), 4 values of k per pass

	vmovupd(mem(rax ), ymm0)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER4) // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi) // i = k_left1;
	test(rsi, rsi) // check i via logical AND.
	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
	                // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1) // EDGE LOOP (scalar), 1 value of k per pass

	// NOTE: We must use ymm registers here bc
	// using the xmm registers would zero out the
	// high bits of the destination registers,
	// which would destroy intermediate results.

	vmovsd(mem(rax ), xmm0)
	add(imm(1*8), rax) // a += 1*cs_a = 1*8;

	vmovsd(mem(rbx ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovsd(mem(rbx, r11, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovsd(mem(rbx, r11, 2), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovsd(mem(rbx, r13, 1), xmm3)
	add(imm(1*8), rbx) // b += 1*rs_b = 1*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	dec(rsi) // i -= 1;
	jne(.DLOOPKLEFT1) // iterate again if i != 0.

	label(.DPOSTACCUM)

	// Reduce each accumulator to the sum of its four lanes and pack the
	// four column results into ymm4.

	// ymm4 ymm7 ymm10 ymm13

	vhaddpd( ymm7, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)

	vhaddpd( ymm13, ymm10, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)

	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )

	// ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)

	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4) // scale by alpha

	//mov(var(cs_c), rsi) // load cs_c
	//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm3) // set ZF if beta == 0.
	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)

	// beta != 0: c_row = beta * c_row + alpha * ab_row.

	vfmadd231pd(mem(rcx), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DBETAZERO)

	label(.DROWSTORBZ)

	// beta == 0: store alpha * ab_row without reading C.

	vmovupd(ymm4, mem(rcx))
	//add(rdi, rcx)

	label(.DDONE)

	add(imm(4), r15) // jj += 4;
	cmp(imm(4), r15) // compare jj to 4
	jle(.DLOOP3X4J) // if jj <= 4, jump to beginning
	                // of jj loop; otherwise, loop ends.
label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_6x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter .. 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c + 3*ii*rs_c; lea(mem(r14), rax) // rax = a + 3*ii*rs_a; lea(mem(rdx), rbx) // rbx = b; #if 1 mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rdi) // rdi = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 
4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 4; const dim_t i_edge = m0 - ( dim_t )m_left; double* restrict cij = c + i_edge*rs_c; double* restrict bj = b; double* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_dgemmsup_rd_haswell_asm_2x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_dgemmsup_rd_haswell_asm_1x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } /* bli_dgemmsup_rd_haswell_asm_2x4: computes a 2x4 block C := beta*C + alpha*A*B in double precision. Each of the eight accumulators (ymm4/5, ymm7/8, ymm10/11, ymm13/14) gathers four-wide partial products of one row of A with one column of B; they are summed horizontally after the k loops, scaled by alpha and merged with C (C is not read when beta compares equal to zero). k0 is consumed as k0/16 passes of the 4x-unrolled loop, then (k0%16)/4 vector passes, then (k0%16)%4 scalar passes. Of the strides, the asm reads only rs_a, cs_b and rs_c: A and B are stepped by 8 bytes per k index and each row of C is stored as four adjacent doubles, so unit cs_a, rs_b and cs_c are assumed. conja, conjb, m0, n0, data and cntx are not referenced in the body. */ void bli_dgemmsup_rd_haswell_asm_2x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) #endif mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, 
ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_1x4: computes a 1x4 block C := beta*C + alpha*A*B in double precision. The four accumulators ymm4, ymm7, ymm10 and ymm13 gather four-wide partial products of the single row of A with columns 0..3 of B; they are summed horizontally after the k loops, scaled by alpha and merged with C (C is not read when beta compares equal to zero). k0 is consumed as k0/16 passes of the 4x-unrolled loop, then (k0%16)/4 vector passes, then (k0%16)%4 scalar passes. Of the strides, the asm reads only cs_b: A and B are stepped by 8 bytes per k index and the result is stored as four adjacent doubles at c, so unit cs_a, rs_b and cs_c are assumed. conja, conjb, m0, n0, data and cntx are not referenced in the body. */ void bli_dgemmsup_rd_haswell_asm_1x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; /* rrc: -------- -- -- -- | | | | -------- -- -- -- ... | | | | -------- += -- -- -- | | | | -------- | | | | -------- : -------- : */ // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm13, ymm13, ymm13) #endif mov(var(a), rax) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a //lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), rcx) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, 
ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. 
vmovsd(mem(rax ), xmm0) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_6x2: computes an m0 x 2 panel C := beta*C + alpha*A*B in double precision. The asm row loop runs m0/6 times; each pass produces six rows of two columns using accumulators ymm4..ymm15 (ymm4/ymm5 hold row 0, ymm6/ymm7 row 1, ..., ymm14/ymm15 row 5), which are summed horizontally, scaled by alpha and merged with C (C is not read when beta compares equal to zero). The remaining m0%6 rows are forwarded to the 3x2, 2x2 and 1x2 kernels; when m0 is less than 6 the asm is skipped entirely. Within a pass k0 is consumed as k0/16 passes of the 4x-unrolled loop, then (k0%16)/4 vector passes, then (k0%16)%4 scalar passes. Of the strides, the asm reads only rs_a, cs_b and rs_c: A and B are stepped by 8 bytes per k index and each row of C is stored as two adjacent doubles, so unit cs_a, rs_b and cs_c are assumed. n0 is not referenced; conja, conjb, data and cntx are only passed through to the edge-case kernels. */ void bli_dgemmsup_rd_haswell_asm_6x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t m_iter = m0 / 6; uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ]; each pass handles 6 rows of c #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c + 6*ii*rs_c; lea(mem(r14), rax) // rax = a + 6*ii*rs_a; lea(mem(rdx), rbx) // rbx = b; lea(mem(rcx, rdi, 2), r10) // lea(mem(r10, rdi, 1), r10) // r10 = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c prefetch(0, mem(r10, 1*8)) // prefetch c + 3*rs_c prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) 
vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovsd(mem(rax, r8, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovsd(mem(rax, r13, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rax, r8, 4), xmm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovsd(mem(rax, r15, 1), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) vhaddpd( ymm9, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm8 ) vhaddpd( ymm11, ymm10, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm10 ) vhaddpd( ymm13, ymm12, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm12 ) vhaddpd( ymm15, ymm14, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm14 ) // xmm4 = sum(ymm4) sum(ymm5) // xmm6 = sum(ymm6) sum(ymm7) // xmm8 = sum(ymm8) sum(ymm9) // xmm10 = sum(ymm10) sum(ymm11) // xmm12 = sum(ymm12) sum(ymm13) // xmm14 = sum(ymm14) sum(ymm15) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. 
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm10) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm12) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm14) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) label(.DDONE) lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c lea(mem(r14, r8, 4), r14) // lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 2; const dim_t i_edge = m0 - ( dim_t )m_left; double* restrict cij = c + i_edge*rs_c; double* restrict bj = b; double* restrict ai = a + i_edge*rs_a; /* m_left is m0 % 6 and nonzero here, i.e. 1..5: peel 3 rows, then 2 rows, then 1 row, advancing cij and ai past the rows already handled. */ if ( 3 <= m_left ) { const dim_t mr_cur = 3; bli_dgemmsup_rd_haswell_asm_3x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 2 <= m_left ) { const dim_t mr_cur = 2; bli_dgemmsup_rd_haswell_asm_2x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_dgemmsup_rd_haswell_asm_1x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } void bli_dgemmsup_rd_haswell_asm_3x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; /* rrc: -------- -- -- -- | | -------- -- -- -- ... | | -------- += -- -- -- | | -------- -- -- -- | | -------- -- -- -- : -------- -- -- -- : */ // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. 
mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. 
label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovsd(mem(rax, r8, 2), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) vhaddpd( ymm9, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm8 ) // xmm4 = sum(ymm4) sum(ymm5) // xmm6 = sum(ymm6) sum(ymm7) // xmm8 = sum(ymm8) sum(ymm9) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_2x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; /* rrc: -------- -- -- -- | | -------- -- -- -- ... | | -------- += -- -- -- | | -------- -- -- -- | | -------- -- -- -- : -------- -- -- -- : */ // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. 
#else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) #endif mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) // xmm4 = sum(ymm4) sum(ymm5) // xmm6 = sum(ymm6) sum(ymm7) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. 
vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_1x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; /* rrc: -------- -- -- -- | | -------- -- -- -- ... 
| | -------- += -- -- -- | | -------- -- -- -- | | -------- -- -- -- : -------- -- -- -- : */ // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) #endif mov(var(a), rax) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a //lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm5 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5) // xmm4 = sum(ymm4) sum(ymm5) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/d6x8/old/bli_gemmsup_rv_haswell_asm_d6x8.c000066400000000000000000011227511427272030600316770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... -------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */ // Prototype reference microkernels. 
GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) // Define parameters and variables for edge case kernel map. #define NUM_MR 4 #define NUM_NR 4 #define FUNCPTR_T dgemmsup_ker_ft static dim_t mrs[NUM_MR] = { 6, 4, 2, 1 }; static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 }; static FUNCPTR_T kmap[NUM_MR][NUM_NR] = { /* 8 4 2 1 */ /* 6 */ { bli_dgemmsup_rv_haswell_asm_6x8, bli_dgemmsup_rv_haswell_asm_6x4, bli_dgemmsup_rv_haswell_asm_6x2, bli_dgemmsup_r_haswell_ref_6x1 }, /* 4 */ { bli_dgemmsup_rv_haswell_asm_4x8, bli_dgemmsup_rv_haswell_asm_4x4, bli_dgemmsup_rv_haswell_asm_4x2, bli_dgemmsup_r_haswell_ref_4x1 }, /* 2 */ { bli_dgemmsup_rv_haswell_asm_2x8, bli_dgemmsup_rv_haswell_asm_2x4, bli_dgemmsup_rv_haswell_asm_2x2, bli_dgemmsup_r_haswell_ref_2x1 }, /* 1 */ { bli_dgemmsup_rv_haswell_asm_1x8, bli_dgemmsup_rv_haswell_asm_1x4, bli_dgemmsup_rv_haswell_asm_1x2, bli_dgemmsup_r_haswell_ref_1x1 }, }; void bli_dgemmsup_rv_haswell_asm_6x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif // Use a reference kernel if this is an edge case in the m or n // dimensions. if ( m0 < 6 || n0 < 8 ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif dim_t n_left = n0; double* restrict cj = c; double* restrict bj = b; // Iterate across columns (corresponding to elements of nrs) until // n_left is zero. for ( dim_t j = 0; n_left != 0; ++j ) { const dim_t nr_cur = nrs[ j ]; // Once we find the value of nrs that is less than (or equal to) // n_left, we use the kernels in that column. 
if ( nr_cur <= n_left ) { dim_t m_left = m0; double* restrict cij = cj; double* restrict ai = a; // Iterate down the current column (corresponding to elements // of mrs) until m_left is zero. for ( dim_t i = 0; m_left != 0; ++i ) { const dim_t mr_cur = mrs[ i ]; // Once we find the value of mrs that is less than (or equal // to) m_left, we select that kernel. if ( mr_cur <= m_left ) { FUNCPTR_T ker_fp = kmap[i][j]; //printf( "executing %d x %d sup kernel.\n", (int)mr_cur, (int)nr_cur ); // Call the kernel using current mrs and nrs values. ker_fp ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); // Advance C and A pointers by the mrs and nrs we just // used, and decrement m_left. cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } } // Advance C and B pointers by the mrs and nrs we just used, and // decrement n_left. cj += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } } return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. 
mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) #if 0 lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c #else cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 4*8)) // prefetch c + 3*cs_c 
prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c #endif #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) 
vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. 
je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm11, ymm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(ymm0, ymm15, ymm15) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm5) vmovupd(ymm5, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm7) vmovupd(ymm7, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm8) vmovupd(ymm8, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm9) vmovupd(ymm9, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm10) vmovupd(ymm10, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm11) vmovupd(ymm11, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm12) vmovupd(ymm12, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm13) vmovupd(ymm13, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm14) vmovupd(ymm14, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm15) vmovupd(ymm15, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORED) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rdx), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2) vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) 
vmovupd(xmm4, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm5) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rdx), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2) vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) vmovupd(ymm5, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm8, mem(rcx)) vmovupd(ymm9, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm10, mem(rcx)) vmovupd(ymm11, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm12, mem(rcx)) vmovupd(ymm13, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm14, mem(rcx)) vmovupd(ymm15, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", 
"xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rv_haswell_asm_5x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 4*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) // 
---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm11, ymm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm13, ymm13) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm5) vmovupd(ymm5, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm7) vmovupd(ymm7, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm8) vmovupd(ymm8, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm9) vmovupd(ymm9, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm10) vmovupd(ymm10, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm11) vmovupd(ymm11, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm12) vmovupd(ymm12, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm13) vmovupd(ymm13, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORED) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) #if 0 vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rdx), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2) vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) #else vmovlpd(mem(rdx), xmm0, xmm0) vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(ymm12, ymm3, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx)) vmovhpd(xmm0, mem(rdx, rsi, 1)) vmovlpd(xmm1, mem(rdx, rsi, 2)) vmovhpd(xmm1, mem(rdx, rax, 1)) #endif lea(mem(rdx, rsi, 4), rdx) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm5) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, 
rsi, 2)) vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) #if 0 vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rdx), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2) vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) #else vmovlpd(mem(rdx), xmm0, xmm0) vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(ymm13, ymm3, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx)) vmovhpd(xmm0, mem(rdx, rsi, 1)) vmovlpd(xmm1, mem(rdx, rsi, 2)) vmovhpd(xmm1, mem(rdx, rax, 1)) #endif //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) vmovupd(ymm5, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm8, mem(rcx)) vmovupd(ymm9, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm10, mem(rcx)) vmovupd(ymm11, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm12, mem(rcx)) vmovupd(ymm13, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) #if 0 vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) #else vmovupd(ymm12, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rcx)) vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) #endif lea(mem(rdx, rsi, 4), rdx) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) #if 0 vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) #else vmovupd(ymm13, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx)) vmovhpd(xmm0, mem(rdx, rsi, 1)) vmovlpd(xmm1, mem(rdx, rsi, 2)) vmovhpd(xmm1, mem(rdx, rax, 1)) #endif //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] 
"m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rv_haswell_asm_4x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. 
mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) #if 0 lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c #else cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 6*cs_c prefetch(0, 
mem(rdx, rsi, 2, 3*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) // ---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), 
ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm11, ymm11) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm5) vmovupd(ymm5, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm7) vmovupd(ymm7, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm8) vmovupd(ymm8, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm9) vmovupd(ymm9, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm10) vmovupd(ymm10, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm11) vmovupd(ymm11, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORED) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm5) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) vmovupd(ymm9, mem(rcx, rsi, 2)) vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) vmovupd(ymm5, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm8, mem(rcx)) vmovupd(ymm9, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm10, mem(rcx)) vmovupd(ymm11, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ)
    // NOTE(review): this is the end of a kernel whose beginning lies above
    // this span; it is left exactly as found.

    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vunpcklpd(ymm10, ymm8, ymm2)
    vunpckhpd(ymm10, ymm8, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

    vmovupd(ymm4, mem(rcx))
    vmovupd(ymm6, mem(rcx, rsi, 1))
    vmovupd(ymm8, mem(rcx, rsi, 2))
    vmovupd(ymm10, mem(rcx, rax, 1))

    lea(mem(rcx, rsi, 4), rcx)

    vunpcklpd(ymm7, ymm5, ymm0)
    vunpckhpd(ymm7, ymm5, ymm1)
    vunpcklpd(ymm11, ymm9, ymm2)
    vunpckhpd(ymm11, ymm9, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm11)

    vmovupd(ymm5, mem(rcx))
    vmovupd(ymm7, mem(rcx, rsi, 1))
    vmovupd(ymm9, mem(rcx, rsi, 2))
    vmovupd(ymm11, mem(rcx, rax, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    label(.DDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

// bli_dgemmsup_rv_haswell_asm_3x8
//
// Double-precision micro-kernel that updates a 3x8 tile of C in place:
//   C := beta * C + alpha * A * B      (C is overwritten, not read, if beta == 0)
//
// How the arguments are used by the code below:
//   k0            - inner (k) dimension; the main loop runs k0/4 times with
//                   four unrolled steps, then k0%4 single steps.
//   alpha, beta   - pointers to the scalars; each is broadcast from memory.
//   a, rs_a0,
//   cs_a0         - three elements a[0], a[rs_a], a[2*rs_a] are broadcast per
//                   k step; a advances by cs_a per step.
//   b, rs_b0      - eight contiguous doubles are loaded from b per k step;
//                   b advances by rs_b per step.
//   cs_b0         - copied to a local and passed as an asm operand, but never
//                   read by the assembly (the loads of it are commented out).
//   c, rs_c0,
//   cs_c0         - output tile. rs_c == 1 selects the column-stored path;
//                   any other value takes the row-stored path.
//   conja, conjb, m0, n0, data, cntx - not referenced in this function body.
//
// NOTE(review): the row-stored path writes each row with two 4-wide vector
// stores at c and c + 4*cs_c, so it presumably expects cs_c == 1 -- confirm
// against the caller.
void bli_dgemmsup_rv_haswell_asm_3x8
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;

    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0;
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    // Accumulators: ymm4/ymm5 = row 0, ymm6/ymm7 = row 1, ymm8/ymm9 = row 2
    // (even register = columns 0-3, odd register = columns 4-7).
    vzeroall() // zero all xmm/ymm registers.

    mov(var(a), rax) // load address of a.
    mov(var(rs_a), r8) // load rs_a
    mov(var(cs_a), r9) // load cs_a
    lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

    //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

    mov(var(b), rbx) // load address of b.
    mov(var(rs_b), r10) // load rs_b
    //mov(var(cs_b), r11) // load cs_b
    lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
    //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds allocated mem
    // (the likely result: a segmentation fault).

    mov(var(c), rcx) // load address of c
    mov(var(rs_c), rdi) // load rs_c
    lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

#if 0
    lea(mem(rcx, rdi, 2), rdx) //
    lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
    prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c
#else
    cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
    jz(.DCOLPFETCH) // jump to column storage case
    label(.DROWPFETCH) // row-stored prefetching on c

    //lea(mem(rcx, rdi, 2), rdx) //
    //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c

    jmp(.DPOSTPFETCH) // jump to end of prefetching c
    label(.DCOLPFETCH) // column-stored prefetching c

    // rsi is only a scratch register here; it is reloaded with k_iter below.
    mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
    lea(mem(rcx, rsi, 2), rdx) //
    lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c;
    prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*cs_c
    prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 4*cs_c
    prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 5*cs_c
    lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c;
    prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 6*cs_c
    prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 7*cs_c

    label(.DPOSTPFETCH) // done prefetching c
#endif

    mov(var(k_iter), rsi) // i = k_iter;
    test(rsi, rsi) // check i via logical AND.
    je(.DCONSIDKLEFT) // if i == 0, jump to code that
                      // contains the k_left loop.

    label(.DLOOPKITER) // MAIN LOOP

    // ---------------------------------- iteration 0

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)

    // ---------------------------------- iteration 1

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)

    // ---------------------------------- iteration 2

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)

    // ---------------------------------- iteration 3

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)

    dec(rsi) // i -= 1;
    jne(.DLOOPKITER) // iterate again if i != 0.

    label(.DCONSIDKLEFT)

    mov(var(k_left), rsi) // i = k_left;
    test(rsi, rsi) // check i via logical AND.
    je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
                    // else, we prepare to enter k_left loop.

    label(.DLOOPKLEFT) // EDGE LOOP

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vbroadcastsd(mem(rax, r8, 2), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm8)
    vfmadd231pd(ymm1, ymm2, ymm9)

    dec(rsi) // i -= 1;
    jne(.DLOOPKLEFT) // iterate again if i != 0.

    label(.DPOSTACCUM)

    // rax and rbx are reused from here on: they no longer point into a and b.
    mov(var(alpha), rax) // load address of alpha
    mov(var(beta), rbx) // load address of beta
    vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

    vmulpd(ymm0, ymm4, ymm4) // scale by alpha
    vmulpd(ymm0, ymm5, ymm5)
    vmulpd(ymm0, ymm6, ymm6)
    vmulpd(ymm0, ymm7, ymm7)
    vmulpd(ymm0, ymm8, ymm8)
    vmulpd(ymm0, ymm9, ymm9)

    mov(var(cs_c), rsi) // load cs_c
    lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)

    //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c;
    lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c;

    lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;

    // now avoid loading C if beta == 0

    vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
    vucomisd(xmm0, xmm3) // set ZF if beta == 0.
    je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

    cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORED) // jump to column storage case

    label(.DROWSTORED)

    // Each row: fold beta*C into the accumulator, then store it back;
    // columns 4-7 live at c + 4*cs_c.
    vfmadd231pd(mem(rcx), ymm3, ymm4)
    vmovupd(ymm4, mem(rcx))

    vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm5)
    vmovupd(ymm5, mem(rcx, rsi, 4))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx), ymm3, ymm6)
    vmovupd(ymm6, mem(rcx))

    vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm7)
    vmovupd(ymm7, mem(rcx, rsi, 4))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx), ymm3, ymm8)
    vmovupd(ymm8, mem(rcx))

    vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm9)
    vmovupd(ymm9, mem(rcx, rsi, 4))
    //add(rdi, rcx)

    jmp(.DDONE) // jump to end.

    label(.DCOLSTORED)

    // Transpose rows 0-2 of columns 0-3 so that ymm4, ymm6, ymm8, ymm10 each
    // hold one column. ymm10 enters as zero (left by vzeroall and never
    // accumulated into), so it only supplies padding for the fourth row slot.
    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vunpcklpd(ymm10, ymm8, ymm2)
    vunpckhpd(ymm10, ymm8, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

    // Upper halves carry row 2 of each column; they are stored via rdx.
    vextractf128(imm(0x1), ymm4, xmm12)
    vextractf128(imm(0x1), ymm6, xmm13)
    vextractf128(imm(0x1), ymm8, xmm14)
    vextractf128(imm(0x1), ymm10, xmm15)

    // ymm3 was used as a temporary by the transpose above, so reload beta.
    vbroadcastsd(mem(rbx), ymm3)

    // Rows 0-1 of columns 0-3 (two doubles per column).
    vfmadd231pd(mem(rcx), xmm3, xmm4)
    vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6)
    vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm8)
    vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm10)
    vmovupd(xmm4, mem(rcx))
    vmovupd(xmm6, mem(rcx, rsi, 1))
    vmovupd(xmm8, mem(rcx, rsi, 2))
    vmovupd(xmm10, mem(rcx, rax, 1))

    lea(mem(rcx, rsi, 4), rcx)

    // Row 2 of columns 0-3 (one double per column).
    vfmadd231sd(mem(rdx), xmm3, xmm12)
    vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
    vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14)
    vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15)
    vmovsd(xmm12, mem(rdx))
    vmovsd(xmm13, mem(rdx, rsi, 1))
    vmovsd(xmm14, mem(rdx, rsi, 2))
    vmovsd(xmm15, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx)

    // Same sequence for columns 4-7 (ymm5, ymm7, ymm9, with ymm11 as padding).
    vunpcklpd(ymm7, ymm5, ymm0)
    vunpckhpd(ymm7, ymm5, ymm1)
    vunpcklpd(ymm11, ymm9, ymm2)
    vunpckhpd(ymm11, ymm9, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm11)

    vextractf128(imm(0x1), ymm5, xmm12)
    vextractf128(imm(0x1), ymm7, xmm13)
    vextractf128(imm(0x1), ymm9, xmm14)
    vextractf128(imm(0x1), ymm11, xmm15)

    vbroadcastsd(mem(rbx), ymm3)

    vfmadd231pd(mem(rcx), xmm3, xmm5)
    vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm7)
    vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm9)
    vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm11)
    vmovupd(xmm5, mem(rcx))
    vmovupd(xmm7, mem(rcx, rsi, 1))
    vmovupd(xmm9, mem(rcx, rsi, 2))
    vmovupd(xmm11, mem(rcx, rax, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    vfmadd231sd(mem(rdx), xmm3, xmm12)
    vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13)
    vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14)
    vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15)
    vmovsd(xmm12, mem(rdx))
    vmovsd(xmm13, mem(rdx, rsi, 1))
    vmovsd(xmm14, mem(rdx, rsi, 2))
    vmovsd(xmm15, mem(rdx, rax, 1))

    //lea(mem(rdx, rsi, 4), rdx)

    jmp(.DDONE) // jump to end.

    label(.DBETAZERO)

    // beta == 0: same store patterns as above, but C is never read.
    cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORBZ) // jump to column storage case

    label(.DROWSTORBZ)

    vmovupd(ymm4, mem(rcx))
    vmovupd(ymm5, mem(rcx, rsi, 4))
    add(rdi, rcx)

    vmovupd(ymm6, mem(rcx))
    vmovupd(ymm7, mem(rcx, rsi, 4))
    add(rdi, rcx)

    vmovupd(ymm8, mem(rcx))
    vmovupd(ymm9, mem(rcx, rsi, 4))
    //add(rdi, rcx)

    jmp(.DDONE) // jump to end.

    label(.DCOLSTORBZ)

    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vunpcklpd(ymm10, ymm8, ymm2)
    vunpckhpd(ymm10, ymm8, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

    vextractf128(imm(0x1), ymm4, xmm12)
    vextractf128(imm(0x1), ymm6, xmm13)
    vextractf128(imm(0x1), ymm8, xmm14)
    vextractf128(imm(0x1), ymm10, xmm15)

    vmovupd(xmm4, mem(rcx))
    vmovupd(xmm6, mem(rcx, rsi, 1))
    vmovupd(xmm8, mem(rcx, rsi, 2))
    vmovupd(xmm10, mem(rcx, rax, 1))

    lea(mem(rcx, rsi, 4), rcx)

    vmovsd(xmm12, mem(rdx))
    vmovsd(xmm13, mem(rdx, rsi, 1))
    vmovsd(xmm14, mem(rdx, rsi, 2))
    vmovsd(xmm15, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx)

    vunpcklpd(ymm7, ymm5, ymm0)
    vunpckhpd(ymm7, ymm5, ymm1)
    vunpcklpd(ymm11, ymm9, ymm2)
    vunpckhpd(ymm11, ymm9, ymm3)
    vinsertf128(imm(0x1), xmm2, ymm0, ymm5)
    vinsertf128(imm(0x1), xmm3, ymm1, ymm7)
    vperm2f128(imm(0x31), ymm2, ymm0, ymm9)
    vperm2f128(imm(0x31), ymm3, ymm1, ymm11)

    vextractf128(imm(0x1), ymm5, xmm12)
    vextractf128(imm(0x1), ymm7, xmm13)
    vextractf128(imm(0x1), ymm9, xmm14)
    vextractf128(imm(0x1), ymm11, xmm15)

    vmovupd(xmm5, mem(rcx))
    vmovupd(xmm7, mem(rcx, rsi, 1))
    vmovupd(xmm9, mem(rcx, rsi, 2))
    vmovupd(xmm11, mem(rcx, rax, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    vmovsd(xmm12, mem(rdx))
    vmovsd(xmm13, mem(rdx, rsi, 1))
    vmovsd(xmm14, mem(rdx, rsi, 2))
    vmovsd(xmm15, mem(rdx, rax, 1))

    //lea(mem(rdx, rsi, 4), rdx)

    label(.DDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

// bli_dgemmsup_rv_haswell_asm_2x8
//
// Two-row counterpart of the 3x8 kernel: the same signature and the same
// local setup, with the tile update C := beta * C + alpha * A * B carried
// out on a 2x8 tile by the assembly that follows.
void bli_dgemmsup_rv_haswell_asm_2x8
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;

    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0;
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    vzeroall() // zero all xmm/ymm registers.

    mov(var(a), rax) // load address of a.
    mov(var(rs_a), r8) // load rs_a
    mov(var(cs_a), r9) // load cs_a
    lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

    //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

    mov(var(b), rbx) // load address of b.
// Interior of the bli_dgemmsup_rv_haswell_asm_2x8 inline assembly.
// Accumulators: ymm4/ymm5 = row 0, ymm6/ymm7 = row 1 (even register =
// columns 0-3, odd register = columns 4-7). rax walks a, rbx walks b, and
// rcx/rdi/rsi hold c, rs_c and cs_c scaled to bytes.
    mov(var(rs_b), r10) // load rs_b
    //mov(var(cs_b), r11) // load cs_b
    lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
    //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds allocated mem
    // (the likely result: a segmentation fault).

    mov(var(c), rcx) // load address of c
    mov(var(rs_c), rdi) // load rs_c
    lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

#if 0
    //lea(mem(rcx, rdi, 2), rdx) //
    //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
#else
    cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
    jz(.DCOLPFETCH) // jump to column storage case
    label(.DROWPFETCH) // row-stored prefetching on c

    //lea(mem(rcx, rdi, 2), rdx) //
    //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c

    jmp(.DPOSTPFETCH) // jump to end of prefetching c
    label(.DCOLPFETCH) // column-stored prefetching c

    // rsi is only a scratch register here; it is reloaded with k_iter below.
    mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
    lea(mem(rcx, rsi, 2), rdx) //
    lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c;
    prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*cs_c
    prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 4*cs_c
    prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 5*cs_c
    lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c;
    prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 6*cs_c
    prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 7*cs_c

    label(.DPOSTPFETCH) // done prefetching c
#endif

    mov(var(k_iter), rsi) // i = k_iter;
    test(rsi, rsi) // check i via logical AND.
    je(.DCONSIDKLEFT) // if i == 0, jump to code that
                      // contains the k_left loop.

    label(.DLOOPKITER) // MAIN LOOP

    // ---------------------------------- iteration 0

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    // ---------------------------------- iteration 1

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    // ---------------------------------- iteration 2

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    // ---------------------------------- iteration 3

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    dec(rsi) // i -= 1;
    jne(.DLOOPKITER) // iterate again if i != 0.

    label(.DCONSIDKLEFT)

    mov(var(k_left), rsi) // i = k_left;
    test(rsi, rsi) // check i via logical AND.
    je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
                    // else, we prepare to enter k_left loop.

    label(.DLOOPKLEFT) // EDGE LOOP

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    vbroadcastsd(mem(rax, r8, 1), ymm3)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    dec(rsi) // i -= 1;
    jne(.DLOOPKLEFT) // iterate again if i != 0.

    label(.DPOSTACCUM)

    // rax and rbx are reused from here on: they no longer point into a and b.
    mov(var(alpha), rax) // load address of alpha
    mov(var(beta), rbx) // load address of beta
    vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

    vmulpd(ymm0, ymm4, ymm4) // scale by alpha
    vmulpd(ymm0, ymm5, ymm5)
    vmulpd(ymm0, ymm6, ymm6)
    vmulpd(ymm0, ymm7, ymm7)

    mov(var(cs_c), rsi) // load cs_c
    lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)

    //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c;
    //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c;

    lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;

    // now avoid loading C if beta == 0

    vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
    vucomisd(xmm0, xmm3) // set ZF if beta == 0.
    je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

    cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORED) // jump to column storage case

    label(.DROWSTORED)

    // Each row: fold beta*C into the accumulator, then store it back;
    // columns 4-7 live at c + 4*cs_c.
    vfmadd231pd(mem(rcx), ymm3, ymm4)
    vmovupd(ymm4, mem(rcx))

    vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm5)
    vmovupd(ymm5, mem(rcx, rsi, 4))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx), ymm3, ymm6)
    vmovupd(ymm6, mem(rcx))

    vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm7)
    vmovupd(ymm7, mem(rcx, rsi, 4))
    //add(rdi, rcx)

    jmp(.DDONE) // jump to end.

    label(.DCOLSTORED)

    // Interleave rows 0 and 1 so each 128-bit half holds one column:
    // xmm0 = column 0, xmm1 = column 1, and the extracted upper halves
    // xmm2, xmm4 = columns 2 and 3. ymm3 (beta) is left untouched here.
    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vextractf128(imm(0x1), ymm0, xmm2)
    vextractf128(imm(0x1), ymm1, xmm4)

    vfmadd231pd(mem(rcx), xmm3, xmm0)
    vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1)
    vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm2)
    vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm4)
    vmovupd(xmm0, mem(rcx))
    vmovupd(xmm1, mem(rcx, rsi, 1))
    vmovupd(xmm2, mem(rcx, rsi, 2))
    vmovupd(xmm4, mem(rcx, rax, 1))

    lea(mem(rcx, rsi, 4), rcx)

    // Same sequence for columns 4-7 (ymm5, ymm7).
    vunpcklpd(ymm7, ymm5, ymm0)
    vunpckhpd(ymm7, ymm5, ymm1)
    vextractf128(imm(0x1), ymm0, xmm2)
    vextractf128(imm(0x1), ymm1, xmm4)

    vfmadd231pd(mem(rcx), xmm3, xmm0)
    vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1)
    vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm2)
    vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm4)
    vmovupd(xmm0, mem(rcx))
    vmovupd(xmm1, mem(rcx, rsi, 1))
    vmovupd(xmm2, mem(rcx, rsi, 2))
    vmovupd(xmm4, mem(rcx, rax, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    jmp(.DDONE) // jump to end.

    label(.DBETAZERO)

    // beta == 0: same store patterns as above, but C is never read.
    cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORBZ) // jump to column storage case

    label(.DROWSTORBZ)

    vmovupd(ymm4, mem(rcx))
    vmovupd(ymm5, mem(rcx, rsi, 4))
    add(rdi, rcx)

    vmovupd(ymm6, mem(rcx))
    vmovupd(ymm7, mem(rcx, rsi, 4))
    //add(rdi, rcx)

    jmp(.DDONE) // jump to end.
label(.DCOLSTORBZ)
    // End of bli_dgemmsup_rv_haswell_asm_2x8: column-stored output when
    // beta == 0. Rows 0 and 1 are interleaved so that each 128-bit half
    // holds one column, then stored without reading C.

    vunpcklpd(ymm6, ymm4, ymm0)
    vunpckhpd(ymm6, ymm4, ymm1)
    vextractf128(imm(0x1), ymm0, xmm2)
    vextractf128(imm(0x1), ymm1, xmm4)

    vmovupd(xmm0, mem(rcx))
    vmovupd(xmm1, mem(rcx, rsi, 1))
    vmovupd(xmm2, mem(rcx, rsi, 2))
    vmovupd(xmm4, mem(rcx, rax, 1))

    lea(mem(rcx, rsi, 4), rcx)

    vunpcklpd(ymm7, ymm5, ymm0)
    vunpckhpd(ymm7, ymm5, ymm1)
    vextractf128(imm(0x1), ymm0, xmm2)
    vextractf128(imm(0x1), ymm1, xmm4)

    vmovupd(xmm0, mem(rcx))
    vmovupd(xmm1, mem(rcx, rsi, 1))
    vmovupd(xmm2, mem(rcx, rsi, 2))
    vmovupd(xmm4, mem(rcx, rax, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    label(.DDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

// bli_dgemmsup_rv_haswell_asm_1x8
//
// Double-precision micro-kernel that updates a 1x8 tile of C in place:
//   C := beta * C + alpha * A * B      (C is overwritten, not read, if beta == 0)
//
// How the arguments are used by the code below:
//   k0            - inner (k) dimension; the main loop runs k0/4 times with
//                   four unrolled steps, then k0%4 single steps.
//   alpha, beta   - pointers to the scalars; each is broadcast from memory.
//   a, cs_a0      - one element of a is broadcast per k step; a advances by
//                   cs_a per step.
//   rs_a0         - loaded and scaled into r8, which is not used afterwards.
//   b, rs_b0      - eight contiguous doubles are loaded from b per k step;
//                   b advances by rs_b per step.
//   cs_b0         - copied to a local and passed as an asm operand, but never
//                   read by the assembly (the loads of it are commented out).
//   c, rs_c0,
//   cs_c0         - output tile. rs_c == 1 selects the column-stored path,
//                   which touches one element per column at stride cs_c; any
//                   other value takes the row-stored path.
//   conja, conjb, m0, n0, data, cntx - not referenced in this function body.
//
// NOTE(review): the row-stored path writes the row with two 4-wide vector
// stores at c and c + 4*cs_c, so it presumably expects cs_c == 1 -- confirm
// against the caller.
void bli_dgemmsup_rv_haswell_asm_1x8
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;

    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0;
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    // Accumulators: ymm4 = columns 0-3, ymm5 = columns 4-7.
    vzeroall() // zero all xmm/ymm registers.

    mov(var(a), rax) // load address of a.
    mov(var(rs_a), r8) // load rs_a
    mov(var(cs_a), r9) // load cs_a
    lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
    lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

    //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

    mov(var(b), rbx) // load address of b.
    mov(var(rs_b), r10) // load rs_b
    //mov(var(cs_b), r11) // load cs_b
    lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
    //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds allocated mem
    // (the likely result: a segmentation fault).

    mov(var(c), rcx) // load address of c
    mov(var(rs_c), rdi) // load rs_c
    lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

#if 0
    //lea(mem(rcx, rdi, 2), rdx) //
    //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
#else
    cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
    jz(.DCOLPFETCH) // jump to column storage case
    label(.DROWPFETCH) // row-stored prefetching on c

    //lea(mem(rcx, rdi, 2), rdx) //
    //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c

    jmp(.DPOSTPFETCH) // jump to end of prefetching c
    label(.DCOLPFETCH) // column-stored prefetching c

    // rsi is only a scratch register here; it is reloaded with k_iter below.
    mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
    lea(mem(rcx, rsi, 2), rdx) //
    lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c;
    prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(rdx, 0*8)) // prefetch c + 3*cs_c
    prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 4*cs_c
    prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 5*cs_c
    lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c;
    prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 6*cs_c
    prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 7*cs_c

    label(.DPOSTPFETCH) // done prefetching c
#endif

    mov(var(k_iter), rsi) // i = k_iter;
    test(rsi, rsi) // check i via logical AND.
    je(.DCONSIDKLEFT) // if i == 0, jump to code that
                      // contains the k_left loop.

    label(.DLOOPKITER) // MAIN LOOP

    // ---------------------------------- iteration 0

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)

    // ---------------------------------- iteration 1

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)

    // ---------------------------------- iteration 2

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)

    // ---------------------------------- iteration 3

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)

    dec(rsi) // i -= 1;
    jne(.DLOOPKITER) // iterate again if i != 0.

    label(.DCONSIDKLEFT)

    mov(var(k_left), rsi) // i = k_left;
    test(rsi, rsi) // check i via logical AND.
    je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
                    // else, we prepare to enter k_left loop.

    label(.DLOOPKLEFT) // EDGE LOOP

    vmovupd(mem(rbx, 0*32), ymm0)
    vmovupd(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastsd(mem(rax ), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231pd(ymm0, ymm2, ymm4)
    vfmadd231pd(ymm1, ymm2, ymm5)

    dec(rsi) // i -= 1;
    jne(.DLOOPKLEFT) // iterate again if i != 0.

    label(.DPOSTACCUM)

    // rax and rbx are reused from here on: they no longer point into a and b.
    mov(var(alpha), rax) // load address of alpha
    mov(var(beta), rbx) // load address of beta
    vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

    vmulpd(ymm0, ymm4, ymm4) // scale by alpha
    vmulpd(ymm0, ymm5, ymm5)

    mov(var(cs_c), rsi) // load cs_c
    lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)

    //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c;
    //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c;

    lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;

    // now avoid loading C if beta == 0

    vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
    vucomisd(xmm0, xmm3) // set ZF if beta == 0.
    je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

    cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORED) // jump to column storage case

    label(.DROWSTORED)

    // Fold beta*C into the accumulator, then store it back;
    // columns 4-7 live at c + 4*cs_c.
    vfmadd231pd(mem(rcx), ymm3, ymm4)
    vmovupd(ymm4, mem(rcx))

    vfmadd231pd(mem(rcx, rsi, 4), ymm3, ymm5)
    vmovupd(ymm5, mem(rcx, rsi, 4))
    //add(rdi, rcx)

    jmp(.DDONE) // jump to end.

    label(.DCOLSTORED)

    // Gather the four strided elements of C for columns 0-3 into ymm0,
    // compute ymm0 = beta*ymm0 + ymm4, then scatter the result back.
    vmovlpd(mem(rcx), xmm0, xmm0)
    vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
    vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1)
    vmovhpd(mem(rcx, rax, 1), xmm1, xmm1)
    vperm2f128(imm(0x20), ymm1, ymm0, ymm0)

    vfmadd213pd(ymm4, ymm3, ymm0)

    vextractf128(imm(1), ymm0, xmm1)
    vmovlpd(xmm0, mem(rcx))
    vmovhpd(xmm0, mem(rcx, rsi, 1))
    vmovlpd(xmm1, mem(rcx, rsi, 2))
    vmovhpd(xmm1, mem(rcx, rax, 1))

    lea(mem(rcx, rsi, 4), rcx)

    // Same gather / update / scatter for columns 4-7 (accumulator ymm5).
    vmovlpd(mem(rcx), xmm0, xmm0)
    vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
    vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1)
    vmovhpd(mem(rcx, rax, 1), xmm1, xmm1)
    vperm2f128(imm(0x20), ymm1, ymm0, ymm0)

    vfmadd213pd(ymm5, ymm3, ymm0)

    vextractf128(imm(1), ymm0, xmm1)
    vmovlpd(xmm0, mem(rcx))
    vmovhpd(xmm0, mem(rcx, rsi, 1))
    vmovlpd(xmm1, mem(rcx, rsi, 2))
    vmovhpd(xmm1, mem(rcx, rax, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    jmp(.DDONE) // jump to end.

    label(.DBETAZERO)

    // beta == 0: same store patterns as above, but C is never read.
    cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
    jz(.DCOLSTORBZ) // jump to column storage case

    label(.DROWSTORBZ)

    vmovupd(ymm4, mem(rcx))
    vmovupd(ymm5, mem(rcx, rsi, 4))
    //add(rdi, rcx)

    jmp(.DDONE) // jump to end.

    label(.DCOLSTORBZ)

    vmovupd(ymm4, ymm0)

    vextractf128(imm(1), ymm0, xmm1)
    vmovlpd(xmm0, mem(rcx))
    vmovhpd(xmm0, mem(rcx, rsi, 1))
    vmovlpd(xmm1, mem(rcx, rsi, 2))
    vmovhpd(xmm1, mem(rcx, rax, 1))

    lea(mem(rcx, rsi, 4), rcx)

    vmovupd(ymm5, ymm0)

    vextractf128(imm(1), ymm0, xmm1)
    vmovlpd(xmm0, mem(rcx))
    vmovhpd(xmm0, mem(rcx, rsi, 1))
    vmovlpd(xmm1, mem(rcx, rsi, 2))
    vmovhpd(xmm1, mem(rcx, rax, 1))

    //lea(mem(rcx, rsi, 4), rcx)

    label(.DDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

void bli_dgemmsup_rv_haswell_asm_6x6
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 5*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c label(.DPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, 
ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(xmm0, xmm7, xmm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(xmm0, xmm9, xmm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(xmm0, xmm11, xmm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(xmm0, xmm13, xmm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(xmm0, xmm15, xmm15) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm5) vmovupd(xmm5, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm7) vmovupd(xmm7, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm8) vmovupd(ymm8, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm9) vmovupd(xmm9, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm10) vmovupd(ymm10, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm11) vmovupd(xmm11, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm12) vmovupd(ymm12, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm13) vmovupd(xmm13, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm14) vmovupd(ymm14, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm15) vmovupd(xmm15, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORED) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rdx), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2) vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) 
vmovupd(xmm4, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) //vperm2f128(imm(0x31), ymm2, ymm0, ymm9) //vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm5) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) //vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) //vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) //vmovupd(ymm9, mem(rcx, rsi, 2)) //vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) //vextractf128(imm(0x1), ymm0, xmm2) //vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rdx), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) //vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2) //vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) //vmovupd(xmm2, mem(rdx, rsi, 2)) //vmovupd(xmm4, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) vmovupd(xmm5, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) vmovupd(xmm7, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm8, mem(rcx)) vmovupd(xmm9, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm10, mem(rcx)) vmovupd(xmm11, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm12, mem(rcx)) vmovupd(xmm13, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm14, mem(rcx)) vmovupd(xmm15, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) //vperm2f128(imm(0x31), ymm2, ymm0, ymm9) //vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) //vmovupd(ymm9, mem(rcx, rsi, 2)) //vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) //vextractf128(imm(0x1), ymm0, xmm2) //vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) //vmovupd(xmm2, mem(rdx, rsi, 2)) //vmovupd(xmm4, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", 
"xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rv_haswell_asm_5x6 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 4*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 5*cs_c label(.DPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) // 
---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(xmm0, xmm7, xmm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(xmm0, xmm9, xmm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(xmm0, xmm11, xmm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(xmm0, xmm13, xmm13) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm5) vmovupd(xmm5, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm7) vmovupd(xmm7, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm8) vmovupd(ymm8, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm9) vmovupd(xmm9, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm10) vmovupd(ymm10, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm11) vmovupd(xmm11, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm12) vmovupd(ymm12, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm13) vmovupd(xmm13, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORED) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) #if 0 vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rdx), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2) vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) #else vmovlpd(mem(rdx), xmm0, xmm0) vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(ymm12, ymm3, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx)) vmovhpd(xmm0, mem(rdx, rsi, 1)) vmovlpd(xmm1, mem(rdx, rsi, 2)) vmovhpd(xmm1, mem(rdx, rax, 1)) #endif lea(mem(rdx, rsi, 4), rdx) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) //vperm2f128(imm(0x31), ymm2, ymm0, ymm9) //vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm5) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) //vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) //vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) 
//vmovupd(ymm9, mem(rcx, rsi, 2)) //vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) #if 0 vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) //vextractf128(imm(0x1), ymm0, xmm2) //vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rdx), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) //vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2) //vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) //vmovupd(xmm2, mem(rdx, rsi, 2)) //vmovupd(xmm4, mem(rdx, rax, 1)) #else vmovlpd(mem(rdx), xmm0, xmm0) vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) //vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1) //vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) //vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(xmm13, xmm3, xmm0) //vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx)) vmovhpd(xmm0, mem(rdx, rsi, 1)) //vmovlpd(xmm1, mem(rdx, rsi, 2)) //vmovhpd(xmm1, mem(rdx, rax, 1)) #endif //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) vmovupd(xmm5, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) vmovupd(xmm7, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm8, mem(rcx)) vmovupd(xmm9, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm10, mem(rcx)) vmovupd(xmm11, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm12, mem(rcx)) vmovupd(xmm13, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) #if 0 vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) #else vmovupd(ymm12, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx)) vmovhpd(xmm0, mem(rdx, rsi, 1)) vmovlpd(xmm1, mem(rdx, rsi, 2)) vmovhpd(xmm1, mem(rdx, rax, 1)) #endif lea(mem(rdx, rsi, 4), rdx) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) //vperm2f128(imm(0x31), ymm2, ymm0, ymm9) //vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) //vmovupd(ymm9, mem(rcx, rsi, 2)) //vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) #if 0 vunpcklpd(ymm15, ymm13, ymm0) vunpckhpd(ymm15, ymm13, ymm1) //vextractf128(imm(0x1), ymm0, xmm2) //vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) //vmovupd(xmm2, mem(rdx, rsi, 2)) //vmovupd(xmm4, mem(rdx, rax, 1)) #else vmovupd(ymm13, ymm0) //vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx)) vmovhpd(xmm0, mem(rdx, rsi, 1)) //vmovlpd(xmm1, mem(rdx, rsi, 2)) //vmovhpd(xmm1, mem(rdx, rax, 1)) #endif //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), 
[b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rv_haswell_asm_4x6 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. 
mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 5*cs_c label(.DPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) // ---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) 
vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(xmm0, xmm7, xmm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(xmm0, xmm9, xmm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(xmm0, xmm11, xmm11) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm5) vmovupd(xmm5, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm7) vmovupd(xmm7, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm8) vmovupd(ymm8, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm9) vmovupd(xmm9, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm10) vmovupd(ymm10, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm11) vmovupd(xmm11, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORED) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) //vperm2f128(imm(0x31), ymm2, ymm0, ymm9) //vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm5) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm7) //vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) //vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) //vmovupd(ymm9, mem(rcx, rsi, 2)) //vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) vmovupd(xmm5, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) vmovupd(xmm7, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm8, mem(rcx)) vmovupd(xmm9, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm10, mem(rcx)) vmovupd(xmm11, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) //vperm2f128(imm(0x31), ymm2, ymm0, ymm9) //vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vmovupd(ymm5, mem(rcx)) vmovupd(ymm7, mem(rcx, rsi, 1)) //vmovupd(ymm9, mem(rcx, rsi, 2)) //vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rv_haswell_asm_3x6 ( conj_t conja, 
conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). 
mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) #if 0 lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c #else cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 5*cs_c label(.DPOSTPFETCH) // done prefetching c #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) // ---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. 
je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(xmm0, xmm7, xmm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(xmm0, xmm9, xmm9) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm5) vmovupd(xmm5, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm7) vmovupd(xmm7, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm8) vmovupd(ymm8, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm9) vmovupd(xmm9, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORED) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vextractf128(imm(0x1), ymm4, xmm12) vextractf128(imm(0x1), ymm6, xmm13) vextractf128(imm(0x1), ymm8, xmm14) vextractf128(imm(0x1), ymm10, xmm15) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), xmm3, xmm4) vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6) vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm8) vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm10) vmovupd(xmm4, mem(rcx)) vmovupd(xmm6, mem(rcx, rsi, 1)) vmovupd(xmm8, mem(rcx, rsi, 2)) vmovupd(xmm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vfmadd231sd(mem(rdx), xmm3, xmm12) vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13) vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14) vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15) vmovsd(xmm12, mem(rdx)) vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vextractf128(imm(0x1), ymm5, xmm12) vextractf128(imm(0x1), ymm7, xmm13) //vextractf128(imm(0x1), ymm9, xmm14) //vextractf128(imm(0x1), ymm11, xmm15) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), xmm3, xmm5) vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm7) vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm9) vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm11) vmovupd(xmm5, mem(rcx)) vmovupd(xmm7, mem(rcx, rsi, 1)) //vmovupd(xmm9, mem(rcx, rsi, 2)) //vmovupd(xmm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vfmadd231sd(mem(rdx), xmm3, xmm12) vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13) //vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14) 
//vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15) vmovsd(xmm12, mem(rdx)) vmovsd(xmm13, mem(rdx, rsi, 1)) //vmovsd(xmm14, mem(rdx, rsi, 2)) //vmovsd(xmm15, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) vmovupd(xmm5, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) vmovupd(xmm7, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm8, mem(rcx)) vmovupd(xmm9, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vextractf128(imm(0x1), ymm4, xmm12) vextractf128(imm(0x1), ymm6, xmm13) vextractf128(imm(0x1), ymm8, xmm14) vextractf128(imm(0x1), ymm10, xmm15) vmovupd(xmm4, mem(rcx)) vmovupd(xmm6, mem(rcx, rsi, 1)) vmovupd(xmm8, mem(rcx, rsi, 2)) vmovupd(xmm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vmovsd(xmm12, mem(rdx)) vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) vperm2f128(imm(0x31), ymm2, ymm0, ymm9) vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vextractf128(imm(0x1), ymm5, xmm12) vextractf128(imm(0x1), ymm7, xmm13) //vextractf128(imm(0x1), ymm9, xmm14) //vextractf128(imm(0x1), ymm11, xmm15) vmovupd(xmm5, mem(rcx)) vmovupd(xmm7, mem(rcx, rsi, 1)) //vmovupd(xmm9, mem(rcx, rsi, 2)) //vmovupd(xmm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vmovsd(xmm12, mem(rdx)) vmovsd(xmm13, mem(rdx, rsi, 1)) //vmovsd(xmm14, mem(rdx, rsi, 2)) 
//vmovsd(xmm15, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rv_haswell_asm_2x6 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. 
mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 5*cs_c label(.DPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(xmm0, xmm7, xmm7) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm5) vmovupd(xmm5, mem(rcx, rsi, 4)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm7) vmovupd(xmm7, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORED) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), xmm3, xmm4) vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6) vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm8) vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm10) vmovupd(xmm4, mem(rcx)) vmovupd(xmm6, mem(rcx, rsi, 1)) vmovupd(xmm8, mem(rcx, rsi, 2)) vmovupd(xmm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) //vperm2f128(imm(0x31), ymm2, ymm0, ymm9) //vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), xmm3, xmm5) vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm7) //vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm9) //vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm11) vmovupd(xmm5, mem(rcx)) vmovupd(xmm7, mem(rcx, rsi, 1)) //vmovupd(ymm9, mem(rcx, rsi, 2)) //vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) vmovupd(xmm5, mem(rcx, rsi, 4)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) vmovupd(xmm7, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(xmm4, mem(rcx)) vmovupd(xmm6, mem(rcx, rsi, 1)) vmovupd(xmm8, mem(rcx, rsi, 2)) vmovupd(xmm10, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm7, ymm5, ymm0) vunpckhpd(ymm7, ymm5, ymm1) vunpcklpd(ymm11, ymm9, ymm2) vunpckhpd(ymm11, ymm9, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm5) vinsertf128(imm(0x1), xmm3, ymm1, ymm7) //vperm2f128(imm(0x31), ymm2, ymm0, ymm9) //vperm2f128(imm(0x31), ymm3, ymm1, ymm11) vmovupd(xmm5, mem(rcx)) vmovupd(xmm7, mem(rcx, rsi, 1)) //vmovupd(ymm9, mem(rcx, rsi, 2)) //vmovupd(ymm11, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rv_haswell_asm_1x6 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // 
different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) #if 1 prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c #else cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 0*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 5*cs_c label(.DPOSTPFETCH) // done prefetching c #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) // ---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) vfmadd231pd(mem(rcx, rsi, 4), xmm3, xmm5) vmovupd(xmm5, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORED) vmovlpd(mem(rcx), xmm0, xmm0) vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(ymm4, ymm3, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rcx)) vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vmovlpd(mem(rcx), xmm0, xmm0) vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) //vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) //vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) //vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(xmm5, xmm3, xmm0) //vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rcx)) vmovhpd(xmm0, mem(rcx, rsi, 1)) //vmovlpd(xmm1, mem(rcx, rsi, 2)) //vmovhpd(xmm1, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) vmovupd(xmm5, mem(rcx, rsi, 4)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) vmovupd(ymm4, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rcx)) vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) vmovupd(xmm5, xmm0) //vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rcx)) vmovhpd(xmm0, mem(rcx, rsi, 1)) //vmovlpd(xmm1, mem(rcx, rsi, 2)) //vmovhpd(xmm1, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rv_haswell_asm_6x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
// Overview (6x4 kernel): accumulates a 6x4 tile of A*B in ymm4/6/8/10/12/14
// (one register of 4 doubles per tile row), scales it by alpha, and then writes
// C = beta*C + tile. The k dimension is consumed as k_iter groups of four
// unrolled steps followed by k_left single steps.
// cs_b is copied below and passed as an asm operand but is never read by the
// assembly; each step loads 4 contiguous doubles from b, so unit column stride
// in b is presumably guaranteed by the caller -- TODO confirm.
// conja, conjb, m0, n0, data and cntx are not referenced in this body.
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) #if 0 lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c #else cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
// The cmp above compared rdi (= 8*rs_c) with 8, so the jz below is taken when
// rs_c == 1 (C handled as column-stored). The column path prefetches four
// columns using cs_c; the row path prefetches the six tile rows using rs_c.
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 3*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 3*8)) // prefetch c + 5*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 5*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
// Main loop: rsi counts k_iter; each pass performs four identical rank-1 updates.
// Per update: ymm0 = 4 doubles at b (then b += rs_b); ymm2/ymm3 receive broadcasts
// of the a elements at row offsets 0..5*rs_a (r13 = 3*rs_a, r15 = 5*rs_a), and the
// accumulators ymm4/6/8/10/12/14 each gain (a element) * ymm0.
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) // ---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) 
// (continuation of unrolled iteration 3: tile rows 4 and 5, then a += cs_a)
vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm14, ymm14) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
// beta != 0: the rs_c == 1 test selects the column-stored update, which
// transposes tile rows 0-3 in registers and reloads beta into ymm3 (the transpose
// overwrites ymm3) before forming beta*C + tile; rdx = c + 4*rs_c addresses the
// pairs belonging to tile rows 4-5, written with 128-bit operations.
jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm8) vmovupd(ymm8, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm10) vmovupd(ymm10, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm12) vmovupd(ymm12, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm14) vmovupd(ymm14, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORED) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rdx), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2) vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
// beta == 0: C is never read; the tile is stored directly, using the same
// row-stored / column-stored split as the beta != 0 case.
jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) add(rdi, rcx) vmovupd(ymm8, mem(rcx)) add(rdi, rcx) vmovupd(ymm10, mem(rcx)) add(rdi, rcx) vmovupd(ymm12, mem(rcx)) add(rdi, rcx) vmovupd(ymm14, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rv_haswell_asm_5x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* 
restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) #if 0 lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c #else cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
// 5x4 kernel: five accumulators ymm4/6/8/10/12 hold the rows of the 5x4 tile.
// The cmp above set ZF when 8*rs_c == 8 (rs_c == 1); the jz below then takes the
// column-stored prefetch (four columns via cs_c), otherwise the five tile rows
// are prefetched via rs_c.
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 3*8)) // prefetch c + 4*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 4*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c #endif #if 0 lea(mem(rax, r9, 8), rdx) // use rdx for prefetching b. lea(mem(rdx, r9, 8), rdx) // rdx = b + 16*rs_b; #else #if 1 mov(r9, rsi) // rsi = 8*cs_a (r9 holds cs_a in bytes, not rs_b) sal(imm(5), rsi) // rsi = 32 * (8*cs_a) lea(mem(rax, rsi, 1), rdx) // rdx = a + 32*cs_a elements; not read before rdx is reassigned after the k loops #endif #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
// Main loop, unrolled 4x: each step loads 4 contiguous doubles of b into ymm0,
// broadcasts the a elements at offsets 0, rs_a, 2*rs_a, 3*rs_a (r13) and 4*rs_a,
// and accumulates into ymm4/6/8/10/12; b advances by rs_b and a by cs_a.
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) // ---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
// Remainder loop over k_left (= k0 % 4); afterwards the tile is scaled by alpha
// and beta is compared with zero so that C is only read when beta != 0.
label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm12, ymm12) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
// beta != 0. Column-stored path (rs_c == 1): tile rows 0-3 are transposed in
// registers and beta is reloaded into ymm3 because the transpose overwrote it;
// tile row 4 is gathered from C with vmovlpd/vmovhpd at rdx + j*cs_c
// (rdx = c + 4*rs_c), combined as beta*C + ymm12, and scattered back.
jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm8) vmovupd(ymm8, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm10) vmovupd(ymm10, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm12) vmovupd(ymm12, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORED) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) #if 0 vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vfmadd231pd(mem(rdx), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) vfmadd231pd(mem(rdx, rsi, 2), xmm3, xmm2) vfmadd231pd(mem(rdx, rax, 1), xmm3, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) #else vmovlpd(mem(rdx), xmm0, xmm0) vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vperm2f128(imm(0x20), ymm1, ymm0, ymm0) vfmadd213pd(ymm12, ymm3, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx)) vmovhpd(xmm0, mem(rdx, rsi, 1)) vmovlpd(xmm1, mem(rdx, rsi, 2)) vmovhpd(xmm1, mem(rdx, rax, 1)) #endif //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. 
// beta == 0: the tile is stored without reading C; the row-stored and
// column-stored variants mirror the beta != 0 paths minus the loads of C.
label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) add(rdi, rcx) vmovupd(ymm8, mem(rcx)) add(rdi, rcx) vmovupd(ymm10, mem(rcx)) add(rdi, rcx) vmovupd(ymm12, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) #if 0 vunpcklpd(ymm14, ymm12, ymm0) vunpckhpd(ymm14, ymm12, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vextractf128(imm(0x1), ymm1, xmm4) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) vmovupd(xmm2, mem(rdx, rsi, 2)) vmovupd(xmm4, mem(rdx, rax, 1)) #else vmovupd(ymm12, ymm0) vextractf128(imm(1), ymm0, xmm1) vmovlpd(xmm0, mem(rdx)) vmovhpd(xmm0, mem(rdx, rsi, 1)) vmovlpd(xmm1, mem(rdx, rsi, 2)) vmovhpd(xmm1, mem(rdx, rax, 1)) #endif //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rv_haswell_asm_4x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, 
double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) #if 0 lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*rs_c #else cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
// 4x4 kernel: accumulators ymm4/6/8/10 hold the four rows of the tile. The cmp
// above set ZF when 8*rs_c == 8 (rs_c == 1), which selects the column-stored
// prefetch (four columns via cs_c) instead of the row-stored one (four rows).
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
// Main loop, unrolled 4x: ymm0 = 4 contiguous doubles of b (then b += rs_b); the
// a elements at offsets 0, rs_a, 2*rs_a and r13 = 3*rs_a are broadcast and
// accumulated into ymm4/6/8/10; a advances by cs_a once per step.
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) // ---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
// Remainder loop over k_left (= k0 % 4), alpha scaling, then the beta == 0 test.
// The row-stored path that follows updates one 4-double row of C per accumulator
// and advances rcx by rs_c between rows.
label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm10, ymm10) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm8) vmovupd(ymm8, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm10) vmovupd(ymm10, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
// Column-stored path: the 4x4 tile is transposed in registers (beta is reloaded
// into ymm3 afterwards because the transpose overwrites it) and written as four
// columns at c + j*cs_c; the beta == 0 variants below store without reading C.
label(.DCOLSTORED) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vfmadd231pd(mem(rcx, rsi, 2), ymm3, ymm8) vfmadd231pd(mem(rcx, rax, 1), ymm3, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) add(rdi, rcx) vmovupd(ymm8, mem(rcx)) add(rdi, rcx) vmovupd(ymm10, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) vmovupd(ymm8, mem(rcx, rsi, 2)) vmovupd(ymm10, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", 
"xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rv_haswell_asm_3x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). 
mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) #if 0 lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c #else cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) // ---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm8) vmovupd(ymm8, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORED) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vperm2f128(imm(0x31), ymm2, ymm0, ymm8) vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vextractf128(imm(0x1), ymm4, xmm12) vextractf128(imm(0x1), ymm6, xmm13) vextractf128(imm(0x1), ymm8, xmm14) vextractf128(imm(0x1), ymm10, xmm15) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), xmm3, xmm4) vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6) vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm8) vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm10) vmovupd(xmm4, mem(rcx)) vmovupd(xmm6, mem(rcx, rsi, 1)) vmovupd(xmm8, mem(rcx, rsi, 2)) vmovupd(xmm10, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vfmadd231sd(mem(rdx), xmm3, xmm12) vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13) vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14) vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15) vmovsd(xmm12, mem(rdx)) vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) add(rdi, rcx) vmovupd(ymm8, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ)

	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vunpcklpd(ymm10, ymm8, ymm2)
	vunpckhpd(ymm10, ymm8, ymm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)
	vperm2f128(imm(0x31), ymm2, ymm0, ymm8)
	vperm2f128(imm(0x31), ymm3, ymm1, ymm10)

	vextractf128(imm(0x1), ymm4, xmm12)
	vextractf128(imm(0x1), ymm6, xmm13)
	vextractf128(imm(0x1), ymm8, xmm14)
	vextractf128(imm(0x1), ymm10, xmm15)

	vmovupd(xmm4, mem(rcx))
	vmovupd(xmm6, mem(rcx, rsi, 1))
	vmovupd(xmm8, mem(rcx, rsi, 2))
	vmovupd(xmm10, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	vmovsd(xmm12, mem(rdx))
	vmovsd(xmm13, mem(rdx, rsi, 1))
	vmovsd(xmm14, mem(rdx, rsi, 2))
	vmovsd(xmm15, mem(rdx, rax, 1))

	//lea(mem(rdx, rsi, 4), rdx)

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

// bli_dgemmsup_rv_haswell_asm_2x4
//
// Inline-assembly double-precision kernel for a 2x4 block of C. It
// accumulates the product of two rows of A with four columns of B over k0
// steps, scales the sum by *alpha, and stores it to c, adding (*beta)*C
// unless *beta compares equal to zero (in that case C is never read).
//
//   k0               number of accumulation steps; executed as k0/4 passes
//                    of a 4x-unrolled loop followed by k0%4 single steps.
//   alpha, beta      pointers to the two scalars.
//   a, rs_a0, cs_a0  base address, row stride and column stride of A. The
//                    strides are multiplied by 8 (sizeof(double)) in the asm.
//   b, rs_b0         base address and row stride of B. Every step loads four
//                    consecutive doubles from b, so cs_b0 is passed as an asm
//                    operand but is never loaded.
//   c, rs_c0, cs_c0  base address and strides of C. rs_c0 == 1 selects the
//                    column-stored paths; any other value takes the
//                    row-stored paths, which access four consecutive doubles
//                    per row.
//   conja, conjb, m0, n0, data, cntx
//                    not referenced anywhere in this function body.
//
// Accumulators: ymm4 (row 0) and ymm6 (row 1), zeroed by vzeroall().
// The instruction macros are used destination-last, as the existing
// comments (e.g. "b += rs_b" on add(r10, rbx)) indicate.
void bli_dgemmsup_rv_haswell_asm_2x4
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall() // zero all xmm/ymm registers.

	mov(var(a), rax) // load address of a.
	mov(var(rs_a), r8) // load rs_a
	mov(var(cs_a), r9) // load cs_a
	lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
	lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

	mov(var(b), rbx) // load address of b.
	mov(var(rs_b), r10) // load rs_b
	//mov(var(cs_b), r11) // load cs_b
	lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), rcx) // load address of c
	mov(var(rs_c), rdi) // load rs_c
	lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

	// The first preprocessor branch is disabled; only the #else branch,
	// which picks a prefetch pattern based on rs_c, is compiled.
#if 0
	lea(mem(rcx, rdi, 2), rdx) //
	//lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
#else
	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLPFETCH) // jump to column storage case
	label(.DROWPFETCH) // row-stored prefetching on c

	//lea(mem(rcx, rdi, 2), rdx) //
	//lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
	// byte offset 3*8 addresses the fourth (last) double of each row.
	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c

	jmp(.DPOSTPFETCH) // jump to end of prefetching c
	label(.DCOLPFETCH) // column-stored prefetching c

	mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
	lea(mem(rcx, rsi, 2), rdx) //
	lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c;
	// byte offset 1*8 addresses the second (last) double of each column.
	prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*cs_c

	label(.DPOSTPFETCH) // done prefetching c
#endif

	// rsi is reused here as the loop counter.
	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKLEFT) // if i == 0, jump to code that
	// contains the k_left loop.

	label(.DLOOPKITER) // MAIN LOOP

	// Each iteration: load four doubles of the current row of b into ymm0,
	// broadcast the two a elements (rows 0 and 1 of the current column),
	// and accumulate ymm4 += ymm0*ymm2, ymm6 += ymm0*ymm3.

	// ---------------------------------- iteration 0

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	// ---------------------------------- iteration 1

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	// ---------------------------------- iteration 2

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	// ---------------------------------- iteration 3

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER) // iterate again if i != 0.

	label(.DCONSIDKLEFT)

	mov(var(k_left), rsi) // i = k_left;
	test(rsi, rsi) // check i via logical AND.
	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
	// else, we prepare to enter k_left loop.

	label(.DLOOPKLEFT) // EDGE LOOP

	// Same update as one main-loop iteration, executed k_left times.
	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm0, ymm3, ymm6)

	dec(rsi) // i -= 1;
	jne(.DLOOPKLEFT) // iterate again if i != 0.

	label(.DPOSTACCUM)

	// rax and rbx are no longer needed as a/b pointers and are reused.
	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4) // scale by alpha
	vmulpd(ymm0, ymm6, ymm6)

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)

	//lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c;
	//lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c;

	lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm3) // set ZF if beta == 0.
	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORED) // jump to column storage case

	label(.DROWSTORED)

	// Per row: accumulator += beta * C row, then store the four doubles.
	vfmadd231pd(mem(rcx), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), ymm3, ymm6)
	vmovupd(ymm6, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DCOLSTORED)

	// Transpose the two 4-wide row accumulators into four 2-element
	// columns: xmm0 = column 0, xmm1 = column 1, and the upper halves of
	// ymm0/ymm1 (extracted to xmm2/xmm4) are columns 2 and 3.
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vextractf128(imm(0x1), ymm0, xmm2)
	vextractf128(imm(0x1), ymm1, xmm4)

	// column += beta * C column, for columns at c + {0,1,2,3}*cs_c.
	vfmadd231pd(mem(rcx), xmm3, xmm0)
	vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1)
	vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm2)
	vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm4)
	vmovupd(xmm0, mem(rcx))
	vmovupd(xmm1, mem(rcx, rsi, 1))
	vmovupd(xmm2, mem(rcx, rsi, 2))
	vmovupd(xmm4, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	jmp(.DDONE) // jump to end.

	label(.DBETAZERO)

	// beta == 0: same two storage cases, but C is only written, not read.
	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORBZ) // jump to column storage case

	label(.DROWSTORBZ)

	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vmovupd(ymm6, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DCOLSTORBZ)

	// Same transpose as .DCOLSTORED, followed by plain stores.
	vunpcklpd(ymm6, ymm4, ymm0)
	vunpckhpd(ymm6, ymm4, ymm1)
	vextractf128(imm(0x1), ymm0, xmm2)
	vextractf128(imm(0x1), ymm1, xmm4)

	vmovupd(xmm0, mem(rcx))
	vmovupd(xmm1, mem(rcx, rsi, 1))
	vmovupd(xmm2, mem(rcx, rsi, 2))
	vmovupd(xmm4, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

// bli_dgemmsup_rv_haswell_asm_1x4
//
// Inline-assembly double-precision kernel for a 1x4 block of C. It
// accumulates the product of one row of A with four columns of B over k0
// steps, scales the sum by *alpha, and stores it to c, adding (*beta)*C
// unless *beta compares equal to zero (in that case C is never read).
//
//   k0               number of accumulation steps; executed as k0/4 passes
//                    of a 4x-unrolled loop followed by k0%4 single steps.
//   alpha, beta      pointers to the two scalars.
//   a, rs_a0, cs_a0  base address and strides of A. Both strides are loaded
//                    and scaled by 8, but only cs_a (r9) is used to advance
//                    a; r8 (rs_a) is not used after being set.
//   b, rs_b0         base address and row stride of B. Every step loads four
//                    consecutive doubles from b, so cs_b0 is passed as an asm
//                    operand but is never loaded.
//   c, rs_c0, cs_c0  base address and strides of C. rs_c0 == 1 selects the
//                    column-stored paths, which touch one double at each of
//                    c + {0,1,2,3}*cs_c; any other value takes the row-stored
//                    paths, which access four consecutive doubles at c.
//   conja, conjb, m0, n0, data, cntx
//                    not referenced anywhere in this function body.
//
// Accumulator: ymm4, zeroed by vzeroall().
void bli_dgemmsup_rv_haswell_asm_1x4
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall() // zero all xmm/ymm registers.

	mov(var(a), rax) // load address of a.
	mov(var(rs_a), r8) // load rs_a
	mov(var(cs_a), r9) // load cs_a
	lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
	lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

	mov(var(b), rbx) // load address of b.
	mov(var(rs_b), r10) // load rs_b
	//mov(var(cs_b), r11) // load cs_b
	lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), rcx) // load address of c
	mov(var(rs_c), rdi) // load rs_c
	lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

	// The first preprocessor branch is disabled; only the #else branch,
	// which picks a prefetch pattern based on rs_c, is compiled.
#if 0
	lea(mem(rcx, rdi, 2), rdx) //
	//lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
#else
	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLPFETCH) // jump to column storage case
	label(.DROWPFETCH) // row-stored prefetching on c

	//lea(mem(rcx, rdi, 2), rdx) //
	//lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
	// byte offset 3*8 addresses the fourth (last) double of the row.
	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c

	jmp(.DPOSTPFETCH) // jump to end of prefetching c
	label(.DCOLPFETCH) // column-stored prefetching c

	mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
	lea(mem(rcx, rsi, 2), rdx) //
	lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c;
	prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rdx, 0*8)) // prefetch c + 3*cs_c

	label(.DPOSTPFETCH) // done prefetching c
#endif

	// rsi is reused here as the loop counter.
	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKLEFT) // if i == 0, jump to code that
	// contains the k_left loop.

	label(.DLOOPKITER) // MAIN LOOP

	// Each iteration: load four doubles of the current row of b into ymm0,
	// broadcast one a element into ymm2, and accumulate ymm4 += ymm0*ymm2.

	// ---------------------------------- iteration 0

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)

	// ---------------------------------- iteration 1

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)

	// ---------------------------------- iteration 2

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)

	// ---------------------------------- iteration 3

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER) // iterate again if i != 0.

	label(.DCONSIDKLEFT)

	mov(var(k_left), rsi) // i = k_left;
	test(rsi, rsi) // check i via logical AND.
	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
	// else, we prepare to enter k_left loop.

	label(.DLOOPKLEFT) // EDGE LOOP

	// Same update as one main-loop iteration, executed k_left times.
	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(ymm0, ymm2, ymm4)

	dec(rsi) // i -= 1;
	jne(.DLOOPKLEFT) // iterate again if i != 0.

	label(.DPOSTACCUM)

	// rax and rbx are no longer needed as a/b pointers and are reused.
	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4) // scale by alpha

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)

	//lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c;
	//lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c;

	lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm3) // set ZF if beta == 0.
	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORED) // jump to column storage case

	label(.DROWSTORED)

	// accumulator += beta * C row, then store the four doubles.
	vfmadd231pd(mem(rcx), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DCOLSTORED)

	// Gather the four C elements (one per column, cs_c apart) into ymm0:
	// xmm0 = {col 0, col 1}, xmm1 = {col 2, col 3}, then merge the halves.
	vmovlpd(mem(rcx), xmm0, xmm0)
	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
	vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1)
	vmovhpd(mem(rcx, rax, 1), xmm1, xmm1)
	vperm2f128(imm(0x20), ymm1, ymm0, ymm0)

	// 213 form: ymm0 = beta * ymm0 + ymm4, i.e. the result lands in the
	// register holding the gathered C values rather than the accumulator.
	vfmadd213pd(ymm4, ymm3, ymm0)

	// Scatter the four results back to their columns.
	vextractf128(imm(1), ymm0, xmm1)
	vmovlpd(xmm0, mem(rcx))
	vmovhpd(xmm0, mem(rcx, rsi, 1))
	vmovlpd(xmm1, mem(rcx, rsi, 2))
	vmovhpd(xmm1, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	jmp(.DDONE) // jump to end.

	label(.DBETAZERO)

	// beta == 0: same two storage cases, but C is only written, not read.
	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORBZ) // jump to column storage case

	label(.DROWSTORBZ)

	vmovupd(ymm4, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DCOLSTORBZ)

	// Copy the accumulator to ymm0 and scatter its four elements.
	vmovupd(ymm4, ymm0)

	vextractf128(imm(1), ymm0, xmm1)
	vmovlpd(xmm0, mem(rcx))
	vmovhpd(xmm0, mem(rcx, rsi, 1))
	vmovlpd(xmm1, mem(rcx, rsi, 2))
	vmovhpd(xmm1, mem(rcx, rax, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

// bli_dgemmsup_rv_haswell_asm_6x2
//
// Inline-assembly double-precision kernel for a 6x2 block of C. It
// accumulates the product of six rows of A with two columns of B over k0
// steps, scales the sums by *alpha, and stores them to c, adding (*beta)*C
// unless *beta compares equal to zero (in that case C is never read).
//
//   k0               number of accumulation steps; executed as k0/4 passes
//                    of a 4x-unrolled loop followed by k0%4 single steps.
//   alpha, beta      pointers to the two scalars.
//   a, rs_a0, cs_a0  base address, row stride and column stride of A. The
//                    strides are multiplied by 8 (sizeof(double)) in the
//                    asm; r13 and r15 hold 3*rs_a and 5*rs_a so all six rows
//                    can be addressed from rax.
//   b, rs_b0         base address and row stride of B. Every step loads two
//                    consecutive doubles from b, so cs_b0 is passed as an asm
//                    operand but is never loaded.
//   c, rs_c0, cs_c0  base address and strides of C. rs_c0 == 1 selects the
//                    column-stored paths; any other value takes the
//                    row-stored paths, which access two consecutive doubles
//                    per row.
//   conja, conjb, m0, n0, data, cntx
//                    not referenced anywhere in this function body.
//
// Accumulators: xmm4, xmm6, xmm8, xmm10, xmm12, xmm14 (rows 0..5), zeroed
// by vzeroall(). All arithmetic uses the 128-bit (xmm) register forms.
void bli_dgemmsup_rv_haswell_asm_6x2
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall() // zero all xmm/ymm registers.

	mov(var(a), rax) // load address of a.
	mov(var(rs_a), r8) // load rs_a
	mov(var(cs_a), r9) // load cs_a
	lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
	lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

	lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
	lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

	mov(var(b), rbx) // load address of b.
	mov(var(rs_b), r10) // load rs_b
	//mov(var(cs_b), r11) // load cs_b
	lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
	//lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), rcx) // load address of c
	mov(var(rs_c), rdi) // load rs_c
	lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

	// The first preprocessor branch is disabled; only the #else branch,
	// which picks a prefetch pattern based on rs_c, is compiled.
#if 0
	lea(mem(rcx, rdi, 2), rdx) //
	lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
	prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c
	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c
#else
	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLPFETCH) // jump to column storage case
	label(.DROWPFETCH) // row-stored prefetching on c

	lea(mem(rcx, rdi, 2), rdx) //
	lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
	// byte offset 1*8 addresses the second (last) double of each row.
	prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
	prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*rs_c
	prefetch(0, mem(rdx, rdi, 1, 1*8)) // prefetch c + 4*rs_c
	prefetch(0, mem(rdx, rdi, 2, 1*8)) // prefetch c + 5*rs_c

	jmp(.DPOSTPFETCH) // jump to end of prefetching c
	label(.DCOLPFETCH) // column-stored prefetching c

	mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double)
	//lea(mem(rcx, rsi, 2), rdx) //
	//lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c;
	// byte offset 5*8 addresses the sixth (last) double of each column.
	prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c

	label(.DPOSTPFETCH) // done prefetching c
#endif

	// rsi is reused here as the loop counter.
	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKLEFT) // if i == 0, jump to code that
	// contains the k_left loop.

	label(.DLOOPKITER) // MAIN LOOP

	// Each iteration: load two doubles of the current row of b into xmm0,
	// then broadcast the six a elements of the current column two at a
	// time (alternating ymm2/ymm3) and accumulate into xmm4..xmm14.
	// Rows are addressed as rax + {0, 1, 2, 3, 4, 5}*rs_a using r8, r13
	// (3*rs_a) and r15 (5*rs_a).

	// ---------------------------------- iteration 0

	vmovupd(mem(rbx, 0*32), xmm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm4)
	vfmadd231pd(xmm0, xmm3, xmm6)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm8)
	vfmadd231pd(xmm0, xmm3, xmm10)

	vbroadcastsd(mem(rax, r8, 4), ymm2)
	vbroadcastsd(mem(rax, r15, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(xmm0, xmm2, xmm12)
	vfmadd231pd(xmm0, xmm3, xmm14)

	// ---------------------------------- iteration 1

	vmovupd(mem(rbx, 0*32), xmm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm4)
	vfmadd231pd(xmm0, xmm3, xmm6)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm8)
	vfmadd231pd(xmm0, xmm3, xmm10)

	vbroadcastsd(mem(rax, r8, 4), ymm2)
	vbroadcastsd(mem(rax, r15, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(xmm0, xmm2, xmm12)
	vfmadd231pd(xmm0, xmm3, xmm14)

	// ---------------------------------- iteration 2

	vmovupd(mem(rbx, 0*32), xmm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm4)
	vfmadd231pd(xmm0, xmm3, xmm6)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm8)
	vfmadd231pd(xmm0, xmm3, xmm10)

	vbroadcastsd(mem(rax, r8, 4), ymm2)
	vbroadcastsd(mem(rax, r15, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(xmm0, xmm2, xmm12)
	vfmadd231pd(xmm0, xmm3, xmm14)

	// ---------------------------------- iteration 3

	vmovupd(mem(rbx, 0*32), xmm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm4)
	vfmadd231pd(xmm0, xmm3, xmm6)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm8)
	vfmadd231pd(xmm0, xmm3, xmm10)

	vbroadcastsd(mem(rax, r8, 4), ymm2)
	vbroadcastsd(mem(rax, r15, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(xmm0, xmm2, xmm12)
	vfmadd231pd(xmm0, xmm3, xmm14)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER) // iterate again if i != 0.

	label(.DCONSIDKLEFT)

	mov(var(k_left), rsi) // i = k_left;
	test(rsi, rsi) // check i via logical AND.
	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
	// else, we prepare to enter k_left loop.

	label(.DLOOPKLEFT) // EDGE LOOP

	// Same update as one main-loop iteration, executed k_left times.
	vmovupd(mem(rbx, 0*32), xmm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vbroadcastsd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm4)
	vfmadd231pd(xmm0, xmm3, xmm6)

	vbroadcastsd(mem(rax, r8, 2), ymm2)
	vbroadcastsd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(xmm0, xmm2, xmm8)
	vfmadd231pd(xmm0, xmm3, xmm10)

	vbroadcastsd(mem(rax, r8, 4), ymm2)
	vbroadcastsd(mem(rax, r15, 1), ymm3)
	add(r9, rax) // a += cs_a;
	vfmadd231pd(xmm0, xmm2, xmm12)
	vfmadd231pd(xmm0, xmm3, xmm14)

	dec(rsi) // i -= 1;
	jne(.DLOOPKLEFT) // iterate again if i != 0.

	label(.DPOSTACCUM)

	// rax and rbx are no longer needed as a/b pointers and are reused.
	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

	vmulpd(xmm0, xmm4, xmm4) // scale by alpha
	vmulpd(xmm0, xmm6, xmm6)
	vmulpd(xmm0, xmm8, xmm8)
	vmulpd(xmm0, xmm10, xmm10)
	vmulpd(xmm0, xmm12, xmm12)
	vmulpd(xmm0, xmm14, xmm14)

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)

	//lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c;
	// rdx is only used by the column-stored paths below, where rs_c == 1,
	// so there it points at row 4 of the first column.
	lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c;

	//lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm3) // set ZF if beta == 0.
	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORED) // jump to column storage case

	label(.DROWSTORED)

	// Per row: accumulator += beta * C row, store two doubles, then step
	// rcx to the next row.
	vfmadd231pd(mem(rcx), xmm3, xmm4)
	vmovupd(xmm4, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), xmm3, xmm6)
	vmovupd(xmm6, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), xmm3, xmm8)
	vmovupd(xmm8, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), xmm3, xmm10)
	vmovupd(xmm10, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), xmm3, xmm12)
	vmovupd(xmm12, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), xmm3, xmm14)
	vmovupd(xmm14, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DCOLSTORED)

	// Transpose rows 0..3 (xmm4, xmm6, xmm8, xmm10) into two 4-element
	// columns: ymm4 = column 0, ymm6 = column 1.
	vunpcklpd(xmm6, xmm4, xmm0)
	vunpckhpd(xmm6, xmm4, xmm1)
	vunpcklpd(xmm10, xmm8, xmm2)
	vunpckhpd(xmm10, xmm8, xmm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)

	// xmm3 was overwritten by the vunpckhpd above, so beta is reloaded
	// from the address still held in rbx.
	vbroadcastsd(mem(rbx), ymm3)

	vfmadd231pd(mem(rcx), ymm3, ymm4)
	vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6)
	vmovupd(ymm4, mem(rcx))
	vmovupd(ymm6, mem(rcx, rsi, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	// Transpose rows 4..5 (xmm12, xmm14) into two 2-element column pieces
	// and update them at rdx and rdx + cs_c.
	vunpcklpd(xmm14, xmm12, xmm0)
	vunpckhpd(xmm14, xmm12, xmm1)

	vfmadd231pd(mem(rdx), xmm3, xmm0)
	vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1)
	vmovupd(xmm0, mem(rdx))
	vmovupd(xmm1, mem(rdx, rsi, 1))

	//lea(mem(rdx, rsi, 4), rdx)

	jmp(.DDONE) // jump to end.

	label(.DBETAZERO)

	// beta == 0: same two storage cases, but C is only written, not read.
	cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
	jz(.DCOLSTORBZ) // jump to column storage case

	label(.DROWSTORBZ)

	vmovupd(xmm4, mem(rcx))
	add(rdi, rcx)

	vmovupd(xmm6, mem(rcx))
	add(rdi, rcx)

	vmovupd(xmm8, mem(rcx))
	add(rdi, rcx)

	vmovupd(xmm10, mem(rcx))
	add(rdi, rcx)

	vmovupd(xmm12, mem(rcx))
	add(rdi, rcx)

	vmovupd(xmm14, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DCOLSTORBZ)

	// Same transposes as .DCOLSTORED, followed by plain stores.
	vunpcklpd(xmm6, xmm4, xmm0)
	vunpckhpd(xmm6, xmm4, xmm1)
	vunpcklpd(xmm10, xmm8, xmm2)
	vunpckhpd(xmm10, xmm8, xmm3)
	vinsertf128(imm(0x1), xmm2, ymm0, ymm4)
	vinsertf128(imm(0x1), xmm3, ymm1, ymm6)

	vmovupd(ymm4, mem(rcx))
	vmovupd(ymm6, mem(rcx, rsi, 1))

	//lea(mem(rcx, rsi, 4), rcx)

	vunpcklpd(xmm14, xmm12, xmm0)
	vunpckhpd(xmm14, xmm12, xmm1)

	vmovupd(xmm0, mem(rdx))
	vmovupd(xmm1, mem(rdx, rsi, 1))

	//lea(mem(rdx, rsi, 4), rdx)

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

void bli_dgemmsup_rv_haswell_asm_5x2
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall() // zero all xmm/ymm registers.

	mov(var(a), rax) // load address of a.
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) #if 0 lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c #else cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 1*8)) // prefetch c + 4*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), 
xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) // ---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // r13 = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm10) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm12) vmovupd(xmm12, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORED) vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) vunpcklpd(xmm10, xmm8, xmm2) vunpckhpd(xmm10, xmm8, xmm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) #if 0 vunpcklpd(xmm14, xmm12, xmm0) vunpckhpd(xmm14, xmm12, xmm1) vfmadd231pd(mem(rdx), xmm3, xmm0) vfmadd231pd(mem(rdx, rsi, 1), xmm3, xmm1) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) #else vmovlpd(mem(rdx), xmm0, xmm0) vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) vfmadd213pd(xmm12, xmm3, xmm0) vmovlpd(xmm0, mem(rdx)) vmovhpd(xmm0, mem(rdx, rsi, 1)) #endif //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) vmovupd(xmm12, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ) vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) vunpcklpd(xmm10, xmm8, xmm2) vunpckhpd(xmm10, xmm8, xmm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) #if 0 vunpcklpd(xmm14, xmm12, xmm0) vunpckhpd(xmm14, xmm12, xmm1) vmovupd(xmm0, mem(rdx)) vmovupd(xmm1, mem(rdx, rsi, 1)) #else vmovupd(xmm12, xmm0) vmovlpd(xmm0, mem(rdx)) vmovhpd(xmm0, mem(rdx, rsi, 1)) #endif //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rv_haswell_asm_4x2: inline-assembly kernel that updates a 4x2 block of c as c := beta*c + alpha*(a*b), accumulating over k0 steps. The k loop is unrolled four times (k_iter = k0/4) and followed by a cleanup loop of k_left = k0%4 steps. Each step loads two adjacent doubles from b (b then advances by rs_b) and broadcasts four elements of a spaced rs_a apart (a then advances by cs_a), accumulating into xmm4, xmm6, xmm8 and xmm10. After scaling by alpha the result is stored row by row, unless 8*rs_c == 8, in which case the column-storage path is taken. c is not read when beta compares equal to zero. The parameters conja, conjb, m0, n0, data and cntx are not referenced in the body, and cs_b is passed as an asm operand but never loaded. */ void bli_dgemmsup_rv_haswell_asm_4x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions.
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) #if 0 lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c #else cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3)
vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) // ---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm10) vmovupd(xmm10, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. /* Column-stored c: the four 2-wide row accumulators are interleaved (unpack plus vinsertf128) into two 4-wide registers, ymm4 and ymm6, so that each one can be combined with and stored to one column of c (rcx and rcx + rsi, where rsi holds cs_c in bytes). The unpack sequence overwrites xmm3, which is why beta is re-broadcast from mem(rbx) before the fused multiply-adds. */ label(.DCOLSTORED) vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) vunpcklpd(xmm10, xmm8, xmm2) vunpckhpd(xmm10, xmm8, xmm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), ymm3, ymm4) vfmadd231pd(mem(rcx, rsi, 1), ymm3, ymm6) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vmovupd(xmm10, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end.
label(.DCOLSTORBZ) vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) vunpcklpd(xmm10, xmm8, xmm2) vunpckhpd(xmm10, xmm8, xmm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) vmovupd(ymm4, mem(rcx)) vmovupd(ymm6, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rv_haswell_asm_3x2: inline-assembly kernel that updates a 3x2 block of c as c := beta*c + alpha*(a*b), accumulating over k0 steps. The k loop is unrolled four times (k_iter = k0/4) and followed by a cleanup loop of k_left = k0%4 steps. Each step loads two adjacent doubles from b (b then advances by rs_b) and broadcasts three elements of a spaced rs_a apart (a then advances by cs_a), accumulating into xmm4, xmm6 and xmm8. After scaling by alpha the result is stored row by row, unless 8*rs_c == 8, in which case the column-storage path is taken. c is not read when beta compares equal to zero. The parameters conja, conjb, m0, n0, data and cntx are not referenced in the body, and cs_b is passed as an asm operand but never loaded. */ void bli_dgemmsup_rv_haswell_asm_3x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a.
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) #if 0 lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c #else cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) //lea(mem(rcx, rsi, 2), rdx) // //lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) // ----------------------------------
iteration 3 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. /* Column-stored c: the three row accumulators are interleaved into column order in ymm4 and ymm6. ymm10 is never accumulated in this kernel, so it still holds the zero written by vzeroall and only pads the unpack of the third row. The low 128 bits of ymm4 and ymm6 carry rows 0-1 of each column and are updated with packed operations at rcx; the upper halves are extracted into xmm12 and xmm13 and row 2 is updated with scalar operations at rdx = c + 2*rs_c. The unpack overwrites xmm3, which is why beta is re-broadcast from mem(rbx). */ label(.DCOLSTORED) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) //vperm2f128(imm(0x31), ymm2, ymm0, ymm8) //vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vextractf128(imm(0x1), ymm4, xmm12) vextractf128(imm(0x1), ymm6, xmm13) //vextractf128(imm(0x1), ymm8, xmm14) //vextractf128(imm(0x1), ymm10, xmm15) vbroadcastsd(mem(rbx), ymm3) vfmadd231pd(mem(rcx), xmm3, xmm4) vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm6) //vfmadd231pd(mem(rcx, rsi, 2), xmm3, xmm8) //vfmadd231pd(mem(rcx, rax, 1), xmm3, xmm10) vmovupd(xmm4, mem(rcx)) vmovupd(xmm6, mem(rcx, rsi, 1)) //vmovupd(xmm8, mem(rcx, rsi, 2)) //vmovupd(xmm10, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vfmadd231sd(mem(rdx), xmm3, xmm12) vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13) //vfmadd231sd(mem(rdx, rsi, 2), xmm3, xmm14) //vfmadd231sd(mem(rdx, rax, 1), xmm3, xmm15) vmovsd(xmm12, mem(rdx)) vmovsd(xmm13, mem(rdx, rsi, 1)) //vmovsd(xmm14, mem(rdx, rsi, 2)) //vmovsd(xmm15, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end.
label(.DCOLSTORBZ) vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) vunpcklpd(ymm10, ymm8, ymm2) vunpckhpd(ymm10, ymm8, ymm3) vinsertf128(imm(0x1), xmm2, ymm0, ymm4) vinsertf128(imm(0x1), xmm3, ymm1, ymm6) //vperm2f128(imm(0x31), ymm2, ymm0, ymm8) //vperm2f128(imm(0x31), ymm3, ymm1, ymm10) vextractf128(imm(0x1), ymm4, xmm12) vextractf128(imm(0x1), ymm6, xmm13) //vextractf128(imm(0x1), ymm8, xmm14) //vextractf128(imm(0x1), ymm10, xmm15) vmovupd(xmm4, mem(rcx)) vmovupd(xmm6, mem(rcx, rsi, 1)) //vmovupd(xmm8, mem(rcx, rsi, 2)) //vmovupd(xmm10, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) vmovsd(xmm12, mem(rdx)) vmovsd(xmm13, mem(rdx, rsi, 1)) //vmovsd(xmm14, mem(rdx, rsi, 2)) //vmovsd(xmm15, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rv_haswell_asm_2x2: inline-assembly kernel that updates a 2x2 block of c as c := beta*c + alpha*(a*b), accumulating over k0 steps. The k loop is unrolled four times (k_iter = k0/4) and followed by a cleanup loop of k_left = k0%4 steps. Each step loads two adjacent doubles from b (b then advances by rs_b) and broadcasts two elements of a spaced rs_a apart (a then advances by cs_a), accumulating into xmm4 and xmm6. After scaling by alpha the result is stored row by row, unless 8*rs_c == 8, in which case the column-storage path is taken. c is not read when beta compares equal to zero. The parameters conja, conjb, m0, n0, data and cntx are not referenced in the body, and cs_b is passed as an asm operand but never loaded. */ void bli_dgemmsup_rv_haswell_asm_2x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions.
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) #if 0 lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c #else cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) //lea(mem(rcx, rsi, 2), rdx) // //lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) // ---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) dec(rsi) // i -= 1;
jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. /* Column-stored c: the two row accumulators are interleaved so that xmm0 and xmm1 each hold the two elements belonging to one column of c; each is then combined with beta times that column (rcx and rcx + rsi, where rsi holds cs_c in bytes) and stored back. Only xmm0 and xmm1 are written by the unpack here, so the beta value in xmm3 is still intact. */ label(.DCOLSTORED) vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) vfmadd231pd(mem(rcx), xmm3, xmm0) vfmadd231pd(mem(rcx, rsi, 1), xmm3, xmm1) vmovupd(xmm0, mem(rcx)) vmovupd(xmm1, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8.
jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) vmovupd(xmm0, mem(rcx)) vmovupd(xmm1, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rv_haswell_asm_1x2: inline-assembly kernel that updates a 1x2 block of c as c := beta*c + alpha*(a*b), accumulating over k0 steps. The k loop is unrolled four times (k_iter = k0/4) and followed by a cleanup loop of k_left = k0%4 steps. Each step loads two adjacent doubles from b (b then advances by rs_b) and broadcasts one element of a (a then advances by cs_a), accumulating into xmm4. After scaling by alpha the two results are stored contiguously, unless 8*rs_c == 8, in which case the column-storage path is taken. c is not read when beta compares equal to zero. The parameters conja, conjb, m0, n0, data and cntx are not referenced in the body, and cs_b is passed as an asm operand but never loaded. */ void bli_dgemmsup_rv_haswell_asm_1x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a.
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) #if 0 lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c #else cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLPFETCH) // jump to column storage case label(.DROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c jmp(.DPOSTPFETCH) // jump to end of prefetching c label(.DCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(double) //lea(mem(rcx, rsi, 2), rdx) // //lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop.
label(.DLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) // ---------------------------------- iteration 3 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. label(.DPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0.
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. /* Column-stored c: the two elements of the single row sit in different columns (rcx and rcx + rsi, where rsi holds cs_c in bytes), so they are gathered into the low and high halves of xmm0 with vmovlpd and vmovhpd, updated as beta*c plus the alpha-scaled accumulator in xmm4, and then scattered back with the matching stores. */ label(.DCOLSTORED) vmovlpd(mem(rcx), xmm0, xmm0) vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) vfmadd213pd(xmm4, xmm3, xmm0) vmovlpd(xmm0, mem(rcx)) vmovhpd(xmm0, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DCOLSTORBZ) vmovlpd(xmm4, mem(rcx)) vmovhpd(xmm4, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } // ----------------------------------------------------------------------------- // NOTE: Normally, for any "?x1" kernel, we would call the reference kernel. // However, at least one other subconfiguration (zen) uses this kernel set, so // we need to be able to call a set of "?x1" kernels that we know will actually // exist regardless of which subconfiguration these kernels were used by.
Thus, // the compromise employed here is to inline the reference kernel so it gets // compiled as part of the haswell kernel set, and hence can unconditionally be // called by other kernels within that kernel set. #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mdim ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ for ( dim_t i = 0; i < mdim; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ /* for ( dim_t j = 0; j < 1; ++j ) */ \ { \ ctype* restrict cij = ci /*[ j*cs_c ]*/ ; \ ctype* restrict bj = b /*[ j*cs_b ]*/ ; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. 
*/ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(d,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } GENTFUNC( double, d, gemmsup_r_haswell_ref_6x1, 6 ) GENTFUNC( double, d, gemmsup_r_haswell_ref_5x1, 5 ) GENTFUNC( double, d, gemmsup_r_haswell_ref_4x1, 4 ) GENTFUNC( double, d, gemmsup_r_haswell_ref_3x1, 3 ) GENTFUNC( double, d, gemmsup_r_haswell_ref_2x1, 2 ) GENTFUNC( double, d, gemmsup_r_haswell_ref_1x1, 1 ) cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/old/000077500000000000000000000000001427272030600224675ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8.c000066400000000000000000004243301427272030600310610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_r_haswell_ref ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_r_haswell_ref ) // Define parameters and variables for edge case kernel map. 
#define NUM_MR 4
#define NUM_NR 4

#define FUNCPTR_T dgemmsup_ker_ft

// Edge-case kernel map: kmap[i][j] is the kernel that handles a block of
// mrs[i] rows by nrs[j] columns. Both size tables are in decreasing order so
// the edge-case driver below can greedily take the largest size that fits.
static dim_t mrs[NUM_MR] = { 6, 3, 2, 1 };
static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 };
static FUNCPTR_T kmap[NUM_MR][NUM_NR] =
{ /* 8 4 2 1 */
/* 6 */ { bli_dgemmsup_rd_haswell_asm_6x8, bli_dgemmsup_rd_haswell_asm_6x4, bli_dgemmsup_rd_haswell_asm_6x2, bli_dgemmsup_r_haswell_ref },
/* 3 */ { bli_dgemmsup_rd_haswell_asm_3x8, bli_dgemmsup_rd_haswell_asm_3x4, bli_dgemmsup_rd_haswell_asm_3x2, bli_dgemmsup_r_haswell_ref },
/* 2 */ { bli_dgemmsup_rd_haswell_asm_2x8, bli_dgemmsup_rd_haswell_asm_2x4, bli_dgemmsup_rd_haswell_asm_2x2, bli_dgemmsup_r_haswell_ref },
/* 1 */ { bli_dgemmsup_rd_haswell_asm_1x8, bli_dgemmsup_rd_haswell_asm_1x4, bli_dgemmsup_rd_haswell_asm_1x2, bli_dgemmsup_r_haswell_ref }
};

// Double-precision dot-product-based gemmsup kernel for a block of C of up
// to 6 rows by 8 columns.
//
// Edge case (m0 < 6 or n0 < 8): the block is decomposed greedily into
// sub-blocks whose sizes come from mrs[] and nrs[], and each sub-block is
// forwarded to the matching kmap[][] kernel. The loops only terminate inside
// the tables when m0 and n0 can be consumed by those sizes (the file header
// states m0 <= MR and n0 <= NR).
//
// Full 6x8 case: the assembly visits row offsets ii = 0, 3 and column
// offsets jj = 0, 4. For each (ii, jj) it accumulates a 3x4 tile of dot
// products between three rows of A and four columns of B, consuming k0 in
// chunks of 16 (4x-unrolled ymm loop), then 4 (ymm loop), then 1 (scalar
// loads), reduces each accumulator horizontally, scales by alpha, and
// writes C = beta*C + alpha*AB (C is not read when beta == 0).
//
// The assembly only reads rs_a, cs_b and rs_c as strides: A and B are
// advanced by 8 bytes per k step and C by 8 bytes per column, so unit
// cs_a, rs_b and cs_c are assumed. conja, conjb, data and cntx are only
// forwarded to the edge-case kernels.
void bli_dgemmsup_rd_haswell_asm_6x8
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	// Dispatch to smaller kernels if this is an edge case in the m or n
	// dimensions.
	if ( m0 < 6 || n0 < 8 )
	{
		dim_t n_left = n0;
		double* restrict cj = c;
		double* restrict bj = b;

		// Iterate across columns (corresponding to elements of nrs) until
		// n_left is zero.
		for ( dim_t j = 0; n_left != 0; ++j )
		{
			const dim_t nr_cur = nrs[ j ];

			// Once we find the value of nrs that is less than (or equal to)
			// n_left, we use the kernels in that column.
			if ( nr_cur <= n_left )
			{
				dim_t m_left = m0;
				double* restrict cij = cj;
				double* restrict ai = a;

				// Iterate down the current column (corresponding to elements
				// of mrs) until m_left is zero.
				for ( dim_t i = 0; m_left != 0; ++i )
				{
					const dim_t mr_cur = mrs[ i ];

					// Once we find the value of mrs that is less than (or equal
					// to) m_left, we select that kernel.
					if ( mr_cur <= m_left )
					{
						FUNCPTR_T ker_fp = kmap[i][j];

						//printf( "executing %d x %d sup kernel.\n", (int)mr_cur, (int)nr_cur );

						// Call the kernel using current mrs and nrs values.
						ker_fp
						(
						  conja, conjb, mr_cur, nr_cur, k0,
						  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
						  beta, cij, rs_c0, cs_c0, data, cntx
						);

						// Advance the C and A pointers past the mr_cur rows we
						// just computed, and decrement m_left.
						cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
					}
				}

				// Advance the C and B pointers past the nr_cur columns we
				// just computed, and decrement n_left.
				cj += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
			}
		}

		return;
	}

	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	// k0 is split as 16*k_iter16 + 4*k_iter4 + k_left1.
	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4  = k_left16 / 4;
	uint64_t k_left1  = k_left16 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall() // zero all xmm/ymm registers.

	mov(var(a), r12) // load address of a.
	mov(var(rs_a), r8) // load rs_a
	//mov(var(cs_a), r9) // load cs_a
	lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
	//lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

	mov(var(b), r14) // load address of b.
	//mov(var(rs_b), r10) // load rs_b
	mov(var(cs_b), r11) // load cs_b
	//lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
	lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

	lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b

	mov(var(c), r10) // load address of c
	mov(var(rs_c), rdi) // load rs_c
	lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

	// r10 = base of c;  rdx = c at row ii;  rcx = c at (ii, jj)
	// r12 = a at row ii;  rax = running a pointer for the k loops
	// r14 = base of b;  rbx = running b pointer for the k loops
	// r9  = m dim index ii (row offset: 0, 3)
	// r15 = n dim index jj (column offset: 0, 4)

#if 1
	mov(imm(0), r9) // ii = 0;

	label(.DLOOP3X4I) // LOOP OVER ii = [ 0 3 ]

	lea(mem( , r9, 1), rsi) // rsi = ii;
	imul(rdi, rsi) // rsi *= rs_c;
	lea(mem(r10, rsi, 1), rdx) // rdx = c + ii*rs_c;

	lea(mem( , r9, 1), rsi) // rsi = ii;
	imul(r8, rsi) // rsi *= rs_a;
	// NOTE: r12 is updated in place (r12 += ii*rs_a) rather than
	// recomputed from the base of a. This yields a + ii*rs_a only
	// because ii takes just the values 0 and 3.
	lea(mem(r12, rsi, 1), r12) // r12 = a_ii;

	mov(imm(0), r15) // jj = 0;

	label(.DLOOP3X4J) // LOOP OVER jj = [ 0 4 ]

	vzeroall() // zero all xmm/ymm registers (clears accumulators ymm4-ymm15).

	lea(mem( , r15, 1), rsi) // rsi = jj;
	imul(imm(1*8), rsi) // rsi *= cs_c = 1*8 (unit column stride hard-coded)
	lea(mem(rdx, rsi, 1), rcx) // rcx = c_ii + jj*cs_c;

	lea(mem( , r15, 1), rsi) // rsi = jj;
	imul(r11, rsi) // rsi *= cs_b;
	lea(mem(r14, rsi, 1), rbx) // rbx = b + jj*cs_b;

	lea(mem( , r12, 1), rax) // rax = a_ii;
#endif

#if 0
	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
#else
	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
#endif

	mov(var(k_iter16), rsi) // i = k_iter16;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKITER4) // if i == 0, jump to code that
	                   // contains the k_iter4 loop.

	label(.DLOOPKITER16) // MAIN LOOP (4 k elements per unrolled iteration)

	// Accumulator layout (row of A x column of B):
	//   ymm4  ymm7  ymm10 ymm13
	//   ymm5  ymm8  ymm11 ymm14
	//   ymm6  ymm9  ymm12 ymm15

	// ---------------------------------- iteration 0

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax) // a += 4 doubles (4*8 bytes);

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4 doubles (4*8 bytes);
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	// ---------------------------------- iteration 1

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax) // a += 4 doubles (4*8 bytes);

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4 doubles (4*8 bytes);
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	// ---------------------------------- iteration 2

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax) // a += 4 doubles (4*8 bytes);

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4 doubles (4*8 bytes);
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	// ---------------------------------- iteration 3

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax) // a += 4 doubles (4*8 bytes);

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4 doubles (4*8 bytes);
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER16) // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi) // i = k_iter4;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKLEFT1) // if i == 0, jump to code that
	                   // considers k_left1 loop.
	                   // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4) // EDGE LOOP (ymm)

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax) // a += 4 doubles (4*8 bytes);

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4 doubles (4*8 bytes);
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER4) // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi) // i = k_left1;
	test(rsi, rsi) // check i via logical AND.
	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
	                // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1) // EDGE LOOP (scalar)

	// NOTE: We must use ymm registers for the FMAs here because
	// using the xmm registers would zero out the high bits of the
	// destination registers, which would destroy intermediate
	// results. The operands are loaded with vmovsd into xmm
	// registers, one double at a time.

	vmovsd(mem(rax ), xmm0)
	vmovsd(mem(rax, r8, 1), xmm1)
	vmovsd(mem(rax, r8, 2), xmm2)
	add(imm(1*8), rax) // a += 1 double (1*8 bytes);

	vmovsd(mem(rbx ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovsd(mem(rbx, r11, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovsd(mem(rbx, r11, 2), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovsd(mem(rbx, r13, 1), xmm3)
	add(imm(1*8), rbx) // b += 1 double (1*8 bytes);
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	dec(rsi) // i -= 1;
	jne(.DLOOPKLEFT1) // iterate again if i != 0.

	label(.DPOSTACCUM)

	// Reduce each accumulator to a scalar and pack one row of the
	// tile per register.
	//   ymm4  ymm7  ymm10 ymm13
	//   ymm5  ymm8  ymm11 ymm14
	//   ymm6  ymm9  ymm12 ymm15

	vhaddpd( ymm7, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )
	// xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)

	vhaddpd( ymm13, ymm10, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )
	// xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)

	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )

	vhaddpd( ymm8, ymm5, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )

	vhaddpd( ymm14, ymm11, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )

	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )

	vhaddpd( ymm9, ymm6, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )

	vhaddpd( ymm15, ymm12, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )

	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )

	// ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
	// ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
	// ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)

	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4) // scale by alpha
	vmulpd(ymm0, ymm5, ymm5)
	vmulpd(ymm0, ymm6, ymm6)

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
	                        // (not used by the row-stored paths below)

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm3) // set ZF if beta == 0.
	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED) // C = beta*C + alpha*AB, one 4-wide row at a time

	vfmadd231pd(mem(rcx), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), ymm3, ymm5)
	vmovupd(ymm5, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), ymm3, ymm6)
	vmovupd(ymm6, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DBETAZERO)

	label(.DROWSTORBZ) // C = alpha*AB, without reading C

	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vmovupd(ymm5, mem(rcx))
	add(rdi, rcx)

	vmovupd(ymm6, mem(rcx))
	//add(rdi, rcx)

	label(.DDONE)

#if 1
	add(imm(4), r15) // jj += 4;
	cmp(imm(4), r15) // compare jj to 4
	jle(.DLOOP3X4J) // if jj <= 4, jump to beginning
	                // of jj loop; otherwise, loop ends.

	add(imm(3), r9) // ii += 3;
	cmp(imm(3), r9) // compare ii to 3
	jle(.DLOOP3X4I) // if ii <= 3, jump to beginning
	                // of ii loop; otherwise, loop ends.
#endif

	label(.DRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter16] "m" (k_iter16),
	  [k_iter4] "m" (k_iter4),
	  [k_left1] "m" (k_left1),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

// Double-precision dot-product-based gemmsup kernel for a 3x8 block of C.
//
// The assembly visits column offsets jj = 0, 4. For each jj it accumulates
// a 3x4 tile of dot products between three rows of A and four columns of B,
// consuming k0 in chunks of 16, then 4, then 1, reduces each accumulator
// horizontally, scales by alpha, and writes C = beta*C + alpha*AB (C is not
// read when beta == 0).
//
// There is no edge-case handling here: m0 and n0 are not read, and three
// rows by eight columns are always computed. Only rs_a, cs_b and rs_c are
// used as strides; A and B advance by 8 bytes per k step and C by 8 bytes
// per column. conja, conjb, data and cntx are not read.
void bli_dgemmsup_rd_haswell_asm_3x8
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	// k0 is split as 16*k_iter16 + 4*k_iter4 + k_left1.
	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4  = k_left16 / 4;
	uint64_t k_left1  = k_left16 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall() // zero all xmm/ymm registers.

	mov(var(a), r12) // load address of a.
	mov(var(rs_a), r8) // load rs_a
	//mov(var(cs_a), r9) // load cs_a
	lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
	//lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

	mov(var(b), r14) // load address of b.
	//mov(var(rs_b), r10) // load rs_b
	mov(var(cs_b), r11) // load cs_b
	//lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
	lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

	lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b

	mov(var(c), r10) // load address of c
	mov(var(rs_c), rdi) // load rs_c
	lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

	// r10 = base of c;  rcx = c at column jj
	// r12 = base of a;  rax = running a pointer for the k loops
	// r14 = base of b;  rbx = running b pointer for the k loops
	// r15 = n dim index jj (column offset: 0, 4)

	mov(imm(0), r15) // jj = 0;

	label(.DLOOP3X4J) // LOOP OVER jj = [ 0 4 ]

	vzeroall() // zero all xmm/ymm registers (clears accumulators ymm4-ymm15).

	lea(mem( , r15, 1), rsi) // rsi = jj;
	imul(imm(1*8), rsi) // rsi *= cs_c = 1*8 (unit column stride hard-coded)
	lea(mem(r10, rsi, 1), rcx) // rcx = c + jj*cs_c;

	lea(mem( , r15, 1), rsi) // rsi = jj;
	imul(r11, rsi) // rsi *= cs_b;
	lea(mem(r14, rsi, 1), rbx) // rbx = b + jj*cs_b;

	lea(mem( , r12, 1), rax) // rax = a;

	prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c

	mov(var(k_iter16), rsi) // i = k_iter16;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKITER4) // if i == 0, jump to code that
	                   // contains the k_iter4 loop.

	label(.DLOOPKITER16) // MAIN LOOP (4 k elements per unrolled iteration)

	// Accumulator layout (row of A x column of B):
	//   ymm4  ymm7  ymm10 ymm13
	//   ymm5  ymm8  ymm11 ymm14
	//   ymm6  ymm9  ymm12 ymm15

	// ---------------------------------- iteration 0

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax) // a += 4 doubles (4*8 bytes);

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4 doubles (4*8 bytes);
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	// ---------------------------------- iteration 1

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax) // a += 4 doubles (4*8 bytes);

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4 doubles (4*8 bytes);
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	// ---------------------------------- iteration 2

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax) // a += 4 doubles (4*8 bytes);

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4 doubles (4*8 bytes);
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	// ---------------------------------- iteration 3

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax) // a += 4 doubles (4*8 bytes);

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4 doubles (4*8 bytes);
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER16) // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi) // i = k_iter4;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKLEFT1) // if i == 0, jump to code that
	                   // considers k_left1 loop.
	                   // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4) // EDGE LOOP (ymm)

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax) // a += 4 doubles (4*8 bytes);

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4 doubles (4*8 bytes);
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER4) // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi) // i = k_left1;
	test(rsi, rsi) // check i via logical AND.
	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
	                // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1) // EDGE LOOP (scalar)

	// NOTE: We must use ymm registers for the FMAs here because
	// using the xmm registers would zero out the high bits of the
	// destination registers, which would destroy intermediate
	// results. The operands are loaded with vmovsd into xmm
	// registers, one double at a time.

	vmovsd(mem(rax ), xmm0)
	vmovsd(mem(rax, r8, 1), xmm1)
	vmovsd(mem(rax, r8, 2), xmm2)
	add(imm(1*8), rax) // a += 1 double (1*8 bytes);

	vmovsd(mem(rbx ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovsd(mem(rbx, r11, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovsd(mem(rbx, r11, 2), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovsd(mem(rbx, r13, 1), xmm3)
	add(imm(1*8), rbx) // b += 1 double (1*8 bytes);
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	dec(rsi) // i -= 1;
	jne(.DLOOPKLEFT1) // iterate again if i != 0.

	label(.DPOSTACCUM)

	// Reduce each accumulator to a scalar and pack one row of the
	// tile per register.
	//   ymm4  ymm7  ymm10 ymm13
	//   ymm5  ymm8  ymm11 ymm14
	//   ymm6  ymm9  ymm12 ymm15

	vhaddpd( ymm7, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )
	// xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)

	vhaddpd( ymm13, ymm10, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )
	// xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)

	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )

	vhaddpd( ymm8, ymm5, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )

	vhaddpd( ymm14, ymm11, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )

	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )

	vhaddpd( ymm9, ymm6, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )

	vhaddpd( ymm15, ymm12, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )

	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )

	// ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
	// ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
	// ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)

	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4) // scale by alpha
	vmulpd(ymm0, ymm5, ymm5)
	vmulpd(ymm0, ymm6, ymm6)

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)
	                        // (not used by the row-stored paths below)

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm3) // set ZF if beta == 0.
	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED) // C = beta*C + alpha*AB, one 4-wide row at a time

	vfmadd231pd(mem(rcx), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), ymm3, ymm5)
	vmovupd(ymm5, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), ymm3, ymm6)
	vmovupd(ymm6, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE) // jump to end.

	label(.DBETAZERO)

	label(.DROWSTORBZ) // C = alpha*AB, without reading C

	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vmovupd(ymm5, mem(rcx))
	add(rdi, rcx)

	vmovupd(ymm6, mem(rcx))
	//add(rdi, rcx)

	label(.DDONE)

	add(imm(4), r15) // jj += 4;
	cmp(imm(4), r15) // compare jj to 4
	jle(.DLOOP3X4J) // if jj <= 4, jump to beginning
	                // of jj loop; otherwise, loop ends.

	label(.DRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter16] "m" (k_iter16),
	  [k_iter4] "m" (k_iter4),
	  [k_left1] "m" (k_left1),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

void bli_dgemmsup_rd_haswell_asm_2x8
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
#if 0
	bli_dgemmsup_r_haswell_ref
	(
	  conja, conjb, m0, n0, k0,
	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
	  beta, c, rs_c0, cs_c0, data, cntx
	);
	return;
#endif
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r12) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r10) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r10 = rcx = c // r12 = rax = a // r14 = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.DLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] vzeroall() // zero all xmm/ymm registers. lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*8), rsi) // rsi *= cs_c = 1*8 lea(mem(r10, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(r14, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem( , r12, 1), rax) // rax = a; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) 
vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_b = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(4), r15) // jj += 4; cmp(imm(4), r15) // compare jj to 4 jle(.DLOOP3X4J) // if jj <= 4, jump to beginning // of jj loop; otherwise, loop ends. label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_1x8: inline-asm kernel that updates a 1x8 tile of c as two 1x4 panels (jj = 0, 4). For each panel it accumulates elementwise products of one row of a with 4 columns of b (column stride cs_b) in ymm4, ymm7, ymm10 and ymm13, consuming k0 in steps of 16 (4x unrolled), then 4, then 1; the accumulators are then summed horizontally into ymm4, scaled by *alpha, combined with *beta times c unless beta == 0, and stored as one row of c. The code steps a and b by 8 bytes per k element and c by 8 bytes per column, i.e. it assumes unit cs_a, rs_b and cs_c. conja, conjb, m0, n0, data and cntx are not read by the active code. */ void bli_dgemmsup_rd_haswell_asm_1x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r12) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) (NOTE(review): the r8 load above is commented out and r8 is not read again in this kernel) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r10) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r10 = rcx = c // r12 = rax = a // r14 = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.DLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] vzeroall() // zero all xmm/ymm registers. lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*8), rsi) // rsi *= cs_c = 1*8 lea(mem(r10, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(r14, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem( , r12, 1), rax) // rax = a; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rax ), xmm0) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) (not read by the row-stored paths below) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(4), r15) // jj += 4; cmp(imm(4), r15) // compare jj to 4 jle(.DLOOP3X4J) // if jj <= 4, jump to beginning // of jj loop; otherwise, loop ends. 
label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_6x4: inline-asm kernel that updates a 6x4 tile of c as two 3x4 panels (ii = 0, 3). For each panel it accumulates elementwise products of 3 rows of a (row stride rs_a) with 4 columns of b (column stride cs_b) in ymm4 through ymm15, consuming k0 in steps of 16 (4x unrolled), then 4, then 1; the accumulators are then summed horizontally into ymm4, ymm5 and ymm6, scaled by *alpha, combined with *beta times c unless beta == 0, and stored as three rows of c (row stride rs_c). The code steps a and b by 8 bytes per k element and writes each row of c as 4 contiguous doubles, i.e. it assumes unit cs_a, rs_b and cs_c. conja, conjb, m0, n0, data and cntx are not read by the active code. */ void bli_dgemmsup_rd_haswell_asm_6x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r12) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r10) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r10 = rcx = c // r12 = rax = a // r14 = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r9) // ii = 0; label(.DLOOP3X4I) // LOOP OVER ii = [ 0 1 ... ] vzeroall() // zero all xmm/ymm registers. lea(mem( , r9, 1), rsi) // rsi = r9 = 3*ii; imul(rdi, rsi) // rsi *= rs_c; lea(mem(r10, rsi, 1), rcx) // rcx = c + 3*ii*rs_c; lea(mem( , r9, 1), rsi) // rsi = r9 = 3*ii; imul(r8, rsi) // rsi *= rs_a; lea(mem(r12, rsi, 1), rax) // rax = a + 3*ii*rs_a; lea(mem( , r14, 1), rbx) // rbx = b; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) 
vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. 
label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(3), r9) // ii += 3; cmp(imm(3), r9) // compare ii to 3 jle(.DLOOP3X4I) // if ii <= 3, jump to beginning // of ii loop; otherwise, loop ends. label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_3x4: inline-asm kernel that updates a single 3x4 tile of c (no outer panel loop; the registers are zeroed once on entry). It accumulates elementwise products of 3 rows of a (row stride rs_a) with 4 columns of b (column stride cs_b) in ymm4 through ymm15, consuming k0 in steps of 16 (4x unrolled), then 4, then 1; the accumulators are then summed horizontally into ymm4, ymm5 and ymm6, scaled by *alpha, combined with *beta times c unless beta == 0, and stored as three rows of c (row stride rs_c). The code steps a and b by 8 bytes per k element and writes each row of c as 4 contiguous doubles, i.e. it assumes unit cs_a, rs_b and cs_c. conja, conjb, m0, n0, data and cntx are not read by the active code. */ void bli_dgemmsup_rd_haswell_asm_3x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) 
vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. 
label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO)

    label(.DROWSTORBZ)

    vmovupd(ymm4, mem(rcx))
    add(rdi, rcx)

    vmovupd(ymm5, mem(rcx))
    add(rdi, rcx)

    vmovupd(ymm6, mem(rcx))
    //add(rdi, rcx)

    label(.DDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter16] "m" (k_iter16),
      [k_iter4]  "m" (k_iter4),
      [k_left1]  "m" (k_left1),
      [a]        "m" (a),
      [rs_a]     "m" (rs_a),
      [cs_a]     "m" (cs_a),
      [b]        "m" (b),
      [rs_b]     "m" (rs_b),
      [cs_b]     "m" (cs_b),
      [alpha]    "m" (alpha),
      [beta]     "m" (beta),
      [c]        "m" (c),
      [rs_c]     "m" (rs_c),
      [cs_c]     "m" (cs_c)/*,
      [a_next]   "m" (a_next),
      [b_next]   "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

/*
 * bli_dgemmsup_rd_haswell_asm_2x4
 *
 * Updates a 2x4 block of doubles, C := beta * C + alpha * A * B, where each
 * element of A * B is formed as a dot product along k using AVX/FMA inline
 * assembly.
 *
 * What the assembly actually reads:
 *   - k0           : split into k0/16 passes of a 4x-unrolled 4-wide loop,
 *                    (k0 % 16)/4 passes of a 4-wide loop, and k0 % 4 scalar
 *                    steps.
 *   - a, rs_a0     : two rows, at a and a + rs_a; each row is read
 *                    contiguously along k.
 *   - b, cs_b0     : four columns, at b + {0,1,2,3}*cs_b; each column is
 *                    read contiguously along k.
 *   - c, rs_c0     : two rows, at c and c + rs_c; each row is read/written
 *                    as four contiguous doubles.
 *   - alpha, beta  : pointers to the scalars. C is only read when beta does
 *                    not compare equal to zero (vucomisd also sets ZF for
 *                    unordered operands, so a NaN beta takes the
 *                    "beta == 0" path as well).
 *
 * cs_a, rs_b and cs_c are passed as asm operands but no instruction
 * references them (only commented-out lines do). conja, conjb, m0, n0, data
 * and cntx are referenced only inside the disabled "#if 0" block.
 */
void bli_dgemmsup_rd_haswell_asm_2x4
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
#if 0
    bli_dgemmsup_r_haswell_ref
    (
      conja, conjb, m0, n0, k0,
      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
      beta, c, rs_c0, cs_c0, data, cntx
    );
    return;
#endif
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter16 = k0 / 16;       // passes of the 4x-unrolled 4-wide loop
    uint64_t k_left16 = k0 % 16;
    uint64_t k_iter4  = k_left16 / 4;  // passes of the single 4-wide loop
    uint64_t k_left1  = k_left16 % 4;  // remaining scalar steps
    uint64_t rs_a     = rs_a0;
    uint64_t cs_a     = cs_a0;
    uint64_t rs_b     = rs_b0;
    uint64_t cs_b     = cs_b0;
    uint64_t rs_c     = rs_c0;
    uint64_t cs_c     = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    vzeroall()                         // zero all xmm/ymm registers.

    mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    //mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    //lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    //lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
    //mov(var(rs_b), r10)                // load rs_b
    mov(var(cs_b), r11)                // load cs_b
    //lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b

    // NOTE(review): the comment that used to sit here read "initialize loop
    // by pre-loading a column of a", but nothing is pre-loaded at this
    // point; the instructions below only set up the pointer and row stride
    // of c and prefetch its two rows.

    mov(var(c), rcx)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    prefetch(0, mem(rcx,         7*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c

    mov(var(k_iter16), rsi)            // i = k_iter16;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKITER4)                 // if i == 0, jump to code that
                                       // contains the k_iter4 loop.

    // Accumulators (each holds four partial sums of one dot product):
    //   ymm4  ymm7  ymm10 ymm13  = row 0 of a times columns 0..3 of b
    //   ymm5  ymm8  ymm11 ymm14  = row 1 of a times columns 0..3 of b

    label(.DLOOPKITER16)               // MAIN LOOP

    // ---------------------------------- iteration 0

    vmovupd(mem(rax       ), ymm0)
    vmovupd(mem(rax, r8, 1), ymm1)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)

    // ---------------------------------- iteration 1

    vmovupd(mem(rax       ), ymm0)
    vmovupd(mem(rax, r8, 1), ymm1)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)

    // ---------------------------------- iteration 2

    vmovupd(mem(rax       ), ymm0)
    vmovupd(mem(rax, r8, 1), ymm1)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)

    // ---------------------------------- iteration 3

    vmovupd(mem(rax       ), ymm0)
    vmovupd(mem(rax, r8, 1), ymm1)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER16)                 // iterate again if i != 0.

    label(.DCONSIDKITER4)

    mov(var(k_iter4), rsi)             // i = k_iter4;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
                                       // considers k_left1 loop.
                                       // else, we prepare to enter k_iter4 loop.

    label(.DLOOPKITER4)                // EDGE LOOP (ymm)

    vmovupd(mem(rax       ), ymm0)
    vmovupd(mem(rax, r8, 1), ymm1)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER4)                  // iterate again if i != 0.

    label(.DCONSIDKLEFT1)

    mov(var(k_left1), rsi)             // i = k_left1;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left1 loop.

    label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
    // NOTE: The FMAs below must use ymm registers because using the xmm
    // forms would zero out the high bits of the destination (accumulator)
    // registers, which would destroy intermediate results. The vmovsd
    // loads zero the upper lanes of their destinations, so only lane 0 of
    // each accumulator receives a contribution here.

    vmovsd(mem(rax       ), xmm0)
    vmovsd(mem(rax, r8, 1), xmm1)
    add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;

    vmovsd(mem(rbx        ), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovsd(mem(rbx, r11, 1), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)

    vmovsd(mem(rbx, r11, 2), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovsd(mem(rbx, r13, 1), xmm3)
    add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT1)                  // iterate again if i != 0.

    label(.DPOSTACCUM)

                                       // ymm4  ymm7  ymm10 ymm13
                                       // ymm5  ymm8  ymm11 ymm14

    // Reduce each accumulator to a single sum and pack the four sums of a
    // row into one ymm register.

    vhaddpd( ymm7, ymm4, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4);  xmm0[1] = sum(ymm7)

    vhaddpd( ymm13, ymm10, ymm2 )
    vextractf128(imm(1), ymm2, xmm1 )
    vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)

    vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )

    vhaddpd( ymm8, ymm5, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm5);  xmm0[1] = sum(ymm8)

    vhaddpd( ymm14, ymm11, ymm2 )
    vextractf128(imm(1), ymm2, xmm1 )
    vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm11); xmm2[1] = sum(ymm14)

    vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )

                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)

    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
    vmulpd(ymm0, ymm5, ymm5)

    //mov(var(cs_c), rsi)                // load cs_c
    //lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

                                       // now avoid loading C if beta == 0

    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
    je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    label(.DROWSTORED)

    vfmadd231pd(mem(rcx), ymm3, ymm4)  // ymm4 += beta * (row 0 of c)
    vmovupd(ymm4, mem(rcx))
    add(rdi, rcx)                      // c += rs_c

    vfmadd231pd(mem(rcx), ymm3, ymm5)  // ymm5 += beta * (row 1 of c)
    vmovupd(ymm5, mem(rcx))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.
label(.DBETAZERO)

    label(.DROWSTORBZ)

    vmovupd(ymm4, mem(rcx))
    add(rdi, rcx)

    vmovupd(ymm5, mem(rcx))
    //add(rdi, rcx)

    label(.DDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter16] "m" (k_iter16),
      [k_iter4]  "m" (k_iter4),
      [k_left1]  "m" (k_left1),
      [a]        "m" (a),
      [rs_a]     "m" (rs_a),
      [cs_a]     "m" (cs_a),
      [b]        "m" (b),
      [rs_b]     "m" (rs_b),
      [cs_b]     "m" (cs_b),
      [alpha]    "m" (alpha),
      [beta]     "m" (beta),
      [c]        "m" (c),
      [rs_c]     "m" (rs_c),
      [cs_c]     "m" (cs_c)/*,
      [a_next]   "m" (a_next),
      [b_next]   "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

/*
 * bli_dgemmsup_rd_haswell_asm_1x4
 *
 * Updates a 1x4 block of doubles, C := beta * C + alpha * A * B, where each
 * element of A * B is formed as a dot product along k using AVX/FMA inline
 * assembly.
 *
 * What the assembly actually reads:
 *   - k0           : split into k0/16 passes of a 4x-unrolled 4-wide loop,
 *                    (k0 % 16)/4 passes of a 4-wide loop, and k0 % 4 scalar
 *                    steps.
 *   - a            : one row, read contiguously along k. rs_a is loaded
 *                    into r8 and scaled, but no load uses r8.
 *   - b, cs_b0     : four columns, at b + {0,1,2,3}*cs_b; each column is
 *                    read contiguously along k.
 *   - c            : one row of four contiguous doubles. rs_c is not
 *                    loaded (its load is commented out).
 *   - alpha, beta  : pointers to the scalars. C is only read when beta does
 *                    not compare equal to zero (vucomisd also sets ZF for
 *                    unordered operands, so a NaN beta takes the
 *                    "beta == 0" path as well).
 *
 * cs_a, rs_b, rs_c and cs_c are passed as asm operands but no instruction
 * references them. conja, conjb, m0, n0, data and cntx are referenced only
 * inside the disabled "#if 0" block.
 */
void bli_dgemmsup_rd_haswell_asm_1x4
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
#if 0
    bli_dgemmsup_r_haswell_ref
    (
      conja, conjb, m0, n0, k0,
      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
      beta, c, rs_c0, cs_c0, data, cntx
    );
    return;
#endif
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter16 = k0 / 16;       // passes of the 4x-unrolled 4-wide loop
    uint64_t k_left16 = k0 % 16;
    uint64_t k_iter4  = k_left16 / 4;  // passes of the single 4-wide loop
    uint64_t k_left1  = k_left16 % 4;  // remaining scalar steps
    uint64_t rs_a     = rs_a0;
    uint64_t cs_a     = cs_a0;
    uint64_t rs_b     = rs_b0;
    uint64_t cs_b     = cs_b0;
    uint64_t rs_c     = rs_c0;
    uint64_t cs_c     = cs_c0;

    /*
      rrc:
        --------        -- -- --        | | | |
        --------        -- -- --  ...   | | | |
        --------   +=   -- -- --        | | | |
        --------                        | | | |
        --------                           :
        --------                           :
    */
    // -------------------------------------------------------------------------

    begin_asm()

    vzeroall()                         // zero all xmm/ymm registers.

    mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    //mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    //lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    //lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
    //mov(var(rs_b), r10)                // load rs_b
    mov(var(cs_b), r11)                // load cs_b
    //lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b

    // NOTE(review): the comment that used to sit here read "initialize loop
    // by pre-loading a column of a", but nothing is pre-loaded at this
    // point; the instructions below only load the address of c and
    // prefetch its single row.

    mov(var(c), rcx)                   // load address of c
    //mov(var(rs_c), rdi)                // load rs_c
    //lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    prefetch(0, mem(rcx,         7*8)) // prefetch c + 0*rs_c

    mov(var(k_iter16), rsi)            // i = k_iter16;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKITER4)                 // if i == 0, jump to code that
                                       // contains the k_iter4 loop.

    // Accumulators (each holds four partial sums of one dot product):
    //   ymm4  ymm7  ymm10 ymm13  = the row of a times columns 0..3 of b

    label(.DLOOPKITER16)               // MAIN LOOP

    // ---------------------------------- iteration 0

    vmovupd(mem(rax       ), ymm0)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)

    // ---------------------------------- iteration 1

    vmovupd(mem(rax       ), ymm0)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)

    // ---------------------------------- iteration 2

    vmovupd(mem(rax       ), ymm0)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)

    // ---------------------------------- iteration 3

    vmovupd(mem(rax       ), ymm0)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER16)                 // iterate again if i != 0.

    label(.DCONSIDKITER4)

    mov(var(k_iter4), rsi)             // i = k_iter4;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
                                       // considers k_left1 loop.
                                       // else, we prepare to enter k_iter4 loop.

    label(.DLOOPKITER4)                // EDGE LOOP (ymm)

    vmovupd(mem(rax       ), ymm0)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm13)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER4)                  // iterate again if i != 0.

    label(.DCONSIDKLEFT1)

    mov(var(k_left1), rsi)             // i = k_left1;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left1 loop.

    label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
    // NOTE: The FMAs below must use ymm registers because using the xmm
    // forms would zero out the high bits of the destination (accumulator)
    // registers, which would destroy intermediate results. The vmovsd
    // loads zero the upper lanes of their destinations, so only lane 0 of
    // each accumulator receives a contribution here.

    vmovsd(mem(rax       ), xmm0)
    add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;

    vmovsd(mem(rbx        ), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm4)

    vmovsd(mem(rbx, r11, 1), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm7)

    vmovsd(mem(rbx, r11, 2), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm10)

    vmovsd(mem(rbx, r13, 1), xmm3)
    add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
    vfmadd231pd(ymm0, ymm3, ymm13)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT1)                  // iterate again if i != 0.

    label(.DPOSTACCUM)

                                       // ymm4  ymm7  ymm10 ymm13

    // Reduce each accumulator to a single sum and pack the four sums into
    // ymm4.

    vhaddpd( ymm7, ymm4, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4);  xmm0[1] = sum(ymm7)

    vhaddpd( ymm13, ymm10, ymm2 )
    vextractf128(imm(1), ymm2, xmm1 )
    vaddpd( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)

    vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )

                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)

    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(ymm0, ymm4, ymm4)           // scale by alpha

    //mov(var(cs_c), rsi)                // load cs_c
    //lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

                                       // now avoid loading C if beta == 0

    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
    je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    label(.DROWSTORED)

    vfmadd231pd(mem(rcx), ymm3, ymm4)  // ymm4 += beta * (row of c)
    vmovupd(ymm4, mem(rcx))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.
label(.DBETAZERO)

    label(.DROWSTORBZ)

    vmovupd(ymm4, mem(rcx))
    //add(rdi, rcx)

    label(.DDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter16] "m" (k_iter16),
      [k_iter4]  "m" (k_iter4),
      [k_left1]  "m" (k_left1),
      [a]        "m" (a),
      [rs_a]     "m" (rs_a),
      [cs_a]     "m" (cs_a),
      [b]        "m" (b),
      [rs_b]     "m" (rs_b),
      [cs_b]     "m" (cs_b),
      [alpha]    "m" (alpha),
      [beta]     "m" (beta),
      [c]        "m" (c),
      [rs_c]     "m" (rs_c),
      [cs_c]     "m" (cs_c)/*,
      [a_next]   "m" (a_next),
      [b_next]   "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

/*
 * bli_dgemmsup_rd_haswell_asm_6x2
 *
 * Updates a 6x2 block of doubles, C := beta * C + alpha * A * B, where each
 * element of A * B is formed as a dot product along k using AVX/FMA inline
 * assembly.
 *
 * What the assembly actually reads:
 *   - k0           : split into k0/16 passes of a 4x-unrolled 4-wide loop,
 *                    (k0 % 16)/4 passes of a 4-wide loop, and k0 % 4 scalar
 *                    steps.
 *   - a, rs_a0     : six rows, at a + {0..5}*rs_a; each row is read
 *                    contiguously along k.
 *   - b, cs_b0     : two columns, at b and b + cs_b; each column is read
 *                    contiguously along k.
 *   - c, rs_c0     : six rows, at c + {0..5}*rs_c; each row is read/written
 *                    as two contiguous doubles.
 *   - alpha, beta  : pointers to the scalars. C is only read when beta does
 *                    not compare equal to zero (vucomisd also sets ZF for
 *                    unordered operands, so a NaN beta takes the
 *                    "beta == 0" path as well).
 *
 * cs_a, rs_b and cs_c are passed as asm operands but no instruction
 * references them (only commented-out lines do). conja, conjb, m0, n0, data
 * and cntx are referenced only inside the disabled "#if 0" block.
 */
void bli_dgemmsup_rd_haswell_asm_6x2
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
#if 0
    bli_dgemmsup_r_haswell_ref
    (
      conja, conjb, m0, n0, k0,
      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
      beta, c, rs_c0, cs_c0, data, cntx
    );
    return;
#endif
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter16 = k0 / 16;       // passes of the 4x-unrolled 4-wide loop
    uint64_t k_left16 = k0 % 16;
    uint64_t k_iter4  = k_left16 / 4;  // passes of the single 4-wide loop
    uint64_t k_left1  = k_left16 % 4;  // remaining scalar steps
    uint64_t rs_a     = rs_a0;
    uint64_t cs_a     = cs_a0;
    uint64_t rs_b     = rs_b0;
    uint64_t cs_b     = cs_b0;
    uint64_t rs_c     = rs_c0;
    uint64_t cs_c     = cs_c0;

    /*
      rrc:
        --------        -- -- --        | |
        --------        -- -- --  ...   | |
        --------   +=   -- -- --        | |
        --------        -- -- --        | |
        --------        -- -- --         :
        --------        -- -- --         :
    */
    // -------------------------------------------------------------------------

    begin_asm()

    vzeroall()                         // zero all xmm/ymm registers.

    mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    //mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    //lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
    //mov(var(rs_b), r10)                // load rs_b
    mov(var(cs_b), r11)                // load cs_b
    //lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

    //lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b

    // NOTE(review): the comment that used to sit here read "initialize loop
    // by pre-loading a column of a", but nothing is pre-loaded at this
    // point; the instructions below only set up the pointer and row stride
    // of c and prefetch its six rows.

    mov(var(c), rcx)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    lea(mem(rcx, rdi, 2), rdx)         // rdx = c + 2*rs_c;
    lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx,         7*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
    prefetch(0, mem(rdx,         7*8)) // prefetch c + 3*rs_c
    prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
    prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c

    mov(var(k_iter16), rsi)            // i = k_iter16;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKITER4)                 // if i == 0, jump to code that
                                       // contains the k_iter4 loop.

    // Accumulators (each holds four partial sums of one dot product); the
    // pair on each line is row r of a times columns 0 and 1 of b:
    //   ymm4  ymm5    (row 0)      ymm10 ymm11   (row 3)
    //   ymm6  ymm7    (row 1)      ymm12 ymm13   (row 4)
    //   ymm8  ymm9    (row 2)      ymm14 ymm15   (row 5)

    label(.DLOOPKITER16)               // MAIN LOOP

    // ---------------------------------- iteration 0

    vmovupd(mem(rbx        ), ymm0)
    vmovupd(mem(rbx, r11, 1), ymm1)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

    vmovupd(mem(rax        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vmovupd(mem(rax, r8, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm8)
    vfmadd231pd(ymm1, ymm3, ymm9)

    vmovupd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovupd(mem(rax, r8, 4), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm12)
    vfmadd231pd(ymm1, ymm3, ymm13)

    vmovupd(mem(rax, r15, 1), ymm3)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    // ---------------------------------- iteration 1

    vmovupd(mem(rbx        ), ymm0)
    vmovupd(mem(rbx, r11, 1), ymm1)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

    vmovupd(mem(rax        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vmovupd(mem(rax, r8, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm8)
    vfmadd231pd(ymm1, ymm3, ymm9)

    vmovupd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovupd(mem(rax, r8, 4), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm12)
    vfmadd231pd(ymm1, ymm3, ymm13)

    vmovupd(mem(rax, r15, 1), ymm3)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    // ---------------------------------- iteration 2

    vmovupd(mem(rbx        ), ymm0)
    vmovupd(mem(rbx, r11, 1), ymm1)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

    vmovupd(mem(rax        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vmovupd(mem(rax, r8, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm8)
    vfmadd231pd(ymm1, ymm3, ymm9)

    vmovupd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovupd(mem(rax, r8, 4), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm12)
    vfmadd231pd(ymm1, ymm3, ymm13)

    vmovupd(mem(rax, r15, 1), ymm3)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    // ---------------------------------- iteration 3

    vmovupd(mem(rbx        ), ymm0)
    vmovupd(mem(rbx, r11, 1), ymm1)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

    vmovupd(mem(rax        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vmovupd(mem(rax, r8, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm8)
    vfmadd231pd(ymm1, ymm3, ymm9)

    vmovupd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovupd(mem(rax, r8, 4), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm12)
    vfmadd231pd(ymm1, ymm3, ymm13)

    vmovupd(mem(rax, r15, 1), ymm3)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER16)                 // iterate again if i != 0.

    label(.DCONSIDKITER4)

    mov(var(k_iter4), rsi)             // i = k_iter4;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
                                       // considers k_left1 loop.
                                       // else, we prepare to enter k_iter4 loop.

    label(.DLOOPKITER4)                // EDGE LOOP (ymm)

    vmovupd(mem(rbx        ), ymm0)
    vmovupd(mem(rbx, r11, 1), ymm1)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

    vmovupd(mem(rax        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vmovupd(mem(rax, r8, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm8)
    vfmadd231pd(ymm1, ymm3, ymm9)

    vmovupd(mem(rax, r13, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovupd(mem(rax, r8, 4), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm12)
    vfmadd231pd(ymm1, ymm3, ymm13)

    vmovupd(mem(rax, r15, 1), ymm3)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER4)                  // iterate again if i != 0.

    label(.DCONSIDKLEFT1)

    mov(var(k_left1), rsi)             // i = k_left1;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left1 loop.

    label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
    // NOTE: The FMAs below must use ymm registers because using the xmm
    // forms would zero out the high bits of the destination (accumulator)
    // registers, which would destroy intermediate results. The vmovsd
    // loads zero the upper lanes of their destinations, so only lane 0 of
    // each accumulator receives a contribution here.

    vmovsd(mem(rbx        ), xmm0)
    vmovsd(mem(rbx, r11, 1), xmm1)
    add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;

    vmovsd(mem(rax        ), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovsd(mem(rax, r8, 1), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vmovsd(mem(rax, r8, 2), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm8)
    vfmadd231pd(ymm1, ymm3, ymm9)

    vmovsd(mem(rax, r13, 1), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)

    vmovsd(mem(rax, r8, 4), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm12)
    vfmadd231pd(ymm1, ymm3, ymm13)

    vmovsd(mem(rax, r15, 1), xmm3)
    add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
    vfmadd231pd(ymm0, ymm3, ymm14)
    vfmadd231pd(ymm1, ymm3, ymm15)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT1)                  // iterate again if i != 0.

    label(.DPOSTACCUM)

                                       // ymm4  ymm5
                                       // ymm6  ymm7
                                       // ymm8  ymm9
                                       // ymm10 ymm11
                                       // ymm12 ymm13
                                       // ymm14 ymm15

    // Reduce each accumulator to a single sum; the two sums of a row end
    // up in the low 128 bits of that row's first accumulator register.

    vhaddpd( ymm5, ymm4, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm4 )         // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5)

    vhaddpd( ymm7, ymm6, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm6 )

    vhaddpd( ymm9, ymm8, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm8 )

    vhaddpd( ymm11, ymm10, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm10 )

    vhaddpd( ymm13, ymm12, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm12 )

    vhaddpd( ymm15, ymm14, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm14 )

                                       // xmm4  = sum(ymm4)  sum(ymm5)
                                       // xmm6  = sum(ymm6)  sum(ymm7)
                                       // xmm8  = sum(ymm8)  sum(ymm9)
                                       // xmm10 = sum(ymm10) sum(ymm11)
                                       // xmm12 = sum(ymm12) sum(ymm13)
                                       // xmm14 = sum(ymm14) sum(ymm15)

    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
    vmulpd(xmm0, xmm6, xmm6)
    vmulpd(xmm0, xmm8, xmm8)
    vmulpd(xmm0, xmm10, xmm10)
    vmulpd(xmm0, xmm12, xmm12)
    vmulpd(xmm0, xmm14, xmm14)

    //mov(var(cs_c), rsi)                // load cs_c
    //lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

                                       // now avoid loading C if beta == 0

    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
je(.DBETAZERO)                         // if ZF = 1, jump to beta == 0 case

    label(.DROWSTORED)

    vfmadd231pd(mem(rcx), xmm3, xmm4)
    vmovupd(xmm4, mem(rcx))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx), xmm3, xmm6)
    vmovupd(xmm6, mem(rcx))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx), xmm3, xmm8)
    vmovupd(xmm8, mem(rcx))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx), xmm3, xmm10)
    vmovupd(xmm10, mem(rcx))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx), xmm3, xmm12)
    vmovupd(xmm12, mem(rcx))
    add(rdi, rcx)

    vfmadd231pd(mem(rcx), xmm3, xmm14)
    vmovupd(xmm14, mem(rcx))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.

    label(.DBETAZERO)

    label(.DROWSTORBZ)

    vmovupd(xmm4, mem(rcx))
    add(rdi, rcx)

    vmovupd(xmm6, mem(rcx))
    add(rdi, rcx)

    vmovupd(xmm8, mem(rcx))
    add(rdi, rcx)

    vmovupd(xmm10, mem(rcx))
    add(rdi, rcx)

    vmovupd(xmm12, mem(rcx))
    add(rdi, rcx)

    vmovupd(xmm14, mem(rcx))
    //add(rdi, rcx)

    label(.DDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter16] "m" (k_iter16),
      [k_iter4]  "m" (k_iter4),
      [k_left1]  "m" (k_left1),
      [a]        "m" (a),
      [rs_a]     "m" (rs_a),
      [cs_a]     "m" (cs_a),
      [b]        "m" (b),
      [rs_b]     "m" (rs_b),
      [cs_b]     "m" (cs_b),
      [alpha]    "m" (alpha),
      [beta]     "m" (beta),
      [c]        "m" (c),
      [rs_c]     "m" (rs_c),
      [cs_c]     "m" (cs_c)/*,
      [a_next]   "m" (a_next),
      [b_next]   "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

/*
 * bli_dgemmsup_rd_haswell_asm_3x2
 *
 * Updates a 3x2 block of doubles, C := beta * C + alpha * A * B, where each
 * element of A * B is formed as a dot product along k using AVX/FMA inline
 * assembly.
 *
 * What the assembly actually reads:
 *   - k0           : split into k0/16 passes of a 4x-unrolled 4-wide loop,
 *                    (k0 % 16)/4 passes of a 4-wide loop, and k0 % 4 scalar
 *                    steps.
 *   - a, rs_a0     : three rows, at a + {0,1,2}*rs_a; each row is read
 *                    contiguously along k.
 *   - b, cs_b0     : two columns, at b and b + cs_b; each column is read
 *                    contiguously along k.
 *   - c, rs_c0     : three rows, at c + {0,1,2}*rs_c; each row is
 *                    read/written as two contiguous doubles.
 *   - alpha, beta  : pointers to the scalars. C is only read when beta does
 *                    not compare equal to zero (vucomisd also sets ZF for
 *                    unordered operands, so a NaN beta takes the
 *                    "beta == 0" path as well).
 *
 * cs_a, rs_b and cs_c are passed as asm operands but no instruction
 * references them (only commented-out lines do). conja, conjb, m0, n0, data
 * and cntx are referenced only inside the disabled "#if 0" block.
 */
void bli_dgemmsup_rd_haswell_asm_3x2
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
#if 0
    bli_dgemmsup_r_haswell_ref
    (
      conja, conjb, m0, n0, k0,
      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
      beta, c, rs_c0, cs_c0, data, cntx
    );
    return;
#endif
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter16 = k0 / 16;       // passes of the 4x-unrolled 4-wide loop
    uint64_t k_left16 = k0 % 16;
    uint64_t k_iter4  = k_left16 / 4;  // passes of the single 4-wide loop
    uint64_t k_left1  = k_left16 % 4;  // remaining scalar steps
    uint64_t rs_a     = rs_a0;
    uint64_t cs_a     = cs_a0;
    uint64_t rs_b     = rs_b0;
    uint64_t cs_b     = cs_b0;
    uint64_t rs_c     = rs_c0;
    uint64_t cs_c     = cs_c0;

    /*
      rrc:
        --------        -- -- --        | |
        --------        -- -- --  ...   | |
        --------   +=   -- -- --        | |
        --------        -- -- --        | |
        --------        -- -- --         :
        --------        -- -- --         :
    */
    // -------------------------------------------------------------------------

    begin_asm()

    vzeroall()                         // zero all xmm/ymm registers.

    mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    //mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
    //lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

    //lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
    //mov(var(rs_b), r10)                // load rs_b
    mov(var(cs_b), r11)                // load cs_b
    //lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
    lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

    //lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b

    // NOTE(review): the comment that used to sit here read "initialize loop
    // by pre-loading a column of a", but nothing is pre-loaded at this
    // point; the instructions below only set up the pointer and row stride
    // of c and prefetch its three rows.

    mov(var(c), rcx)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

    //lea(mem(rcx, rdi, 2), rdx)         //
    //lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx,         7*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c

    mov(var(k_iter16), rsi)            // i = k_iter16;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKITER4)                 // if i == 0, jump to code that
                                       // contains the k_iter4 loop.

    // Accumulators (each holds four partial sums of one dot product); the
    // pair on each line is row r of a times columns 0 and 1 of b:
    //   ymm4  ymm5    (row 0)
    //   ymm6  ymm7    (row 1)
    //   ymm8  ymm9    (row 2)

    label(.DLOOPKITER16)               // MAIN LOOP

    // ---------------------------------- iteration 0

    vmovupd(mem(rbx        ), ymm0)
    vmovupd(mem(rbx, r11, 1), ymm1)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

    vmovupd(mem(rax        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vmovupd(mem(rax, r8, 2), ymm3)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm8)
    vfmadd231pd(ymm1, ymm3, ymm9)

    // ---------------------------------- iteration 1

    vmovupd(mem(rbx        ), ymm0)
    vmovupd(mem(rbx, r11, 1), ymm1)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

    vmovupd(mem(rax        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vmovupd(mem(rax, r8, 2), ymm3)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm8)
    vfmadd231pd(ymm1, ymm3, ymm9)

    // ---------------------------------- iteration 2

    vmovupd(mem(rbx        ), ymm0)
    vmovupd(mem(rbx, r11, 1), ymm1)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

    vmovupd(mem(rax        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vmovupd(mem(rax, r8, 2), ymm3)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm8)
    vfmadd231pd(ymm1, ymm3, ymm9)

    // ---------------------------------- iteration 3

    vmovupd(mem(rbx        ), ymm0)
    vmovupd(mem(rbx, r11, 1), ymm1)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

    vmovupd(mem(rax        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vmovupd(mem(rax, r8, 2), ymm3)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm8)
    vfmadd231pd(ymm1, ymm3, ymm9)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER16)                 // iterate again if i != 0.

    label(.DCONSIDKITER4)

    mov(var(k_iter4), rsi)             // i = k_iter4;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
                                       // considers k_left1 loop.
                                       // else, we prepare to enter k_iter4 loop.

    label(.DLOOPKITER4)                // EDGE LOOP (ymm)

    vmovupd(mem(rbx        ), ymm0)
    vmovupd(mem(rbx, r11, 1), ymm1)
    add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

    vmovupd(mem(rax        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovupd(mem(rax, r8, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vmovupd(mem(rax, r8, 2), ymm3)
    add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
    vfmadd231pd(ymm0, ymm3, ymm8)
    vfmadd231pd(ymm1, ymm3, ymm9)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKITER4)                  // iterate again if i != 0.

    label(.DCONSIDKLEFT1)

    mov(var(k_left1), rsi)             // i = k_left1;
    test(rsi, rsi)                     // check i via logical AND.
    je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left1 loop.

    label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
    // NOTE: The FMAs below must use ymm registers because using the xmm
    // forms would zero out the high bits of the destination (accumulator)
    // registers, which would destroy intermediate results. The vmovsd
    // loads zero the upper lanes of their destinations, so only lane 0 of
    // each accumulator receives a contribution here.

    vmovsd(mem(rbx        ), xmm0)
    vmovsd(mem(rbx, r11, 1), xmm1)
    add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;

    vmovsd(mem(rax        ), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)

    vmovsd(mem(rax, r8, 1), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm6)
    vfmadd231pd(ymm1, ymm3, ymm7)

    vmovsd(mem(rax, r8, 2), xmm3)
    add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
    vfmadd231pd(ymm0, ymm3, ymm8)
    vfmadd231pd(ymm1, ymm3, ymm9)

    dec(rsi)                           // i -= 1;
    jne(.DLOOPKLEFT1)                  // iterate again if i != 0.

    label(.DPOSTACCUM)

                                       // ymm4  ymm5
                                       // ymm6  ymm7
                                       // ymm8  ymm9

    // Reduce each accumulator to a single sum; the two sums of a row end
    // up in the low 128 bits of that row's first accumulator register.

    vhaddpd( ymm5, ymm4, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm4 )         // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5)

    vhaddpd( ymm7, ymm6, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm6 )

    vhaddpd( ymm9, ymm8, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm8 )

                                       // xmm4 = sum(ymm4) sum(ymm5)
                                       // xmm6 = sum(ymm6) sum(ymm7)
                                       // xmm8 = sum(ymm8) sum(ymm9)

    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

    vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
    vmulpd(xmm0, xmm6, xmm6)
    vmulpd(xmm0, xmm8, xmm8)

    //mov(var(cs_c), rsi)                // load cs_c
    //lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

                                       // now avoid loading C if beta == 0

    vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
    je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    label(.DROWSTORED)

    vfmadd231pd(mem(rcx), xmm3, xmm4)  // xmm4 += beta * (row 0 of c)
    vmovupd(xmm4, mem(rcx))
    add(rdi, rcx)                      // c += rs_c

    vfmadd231pd(mem(rcx), xmm3, xmm6)  // xmm6 += beta * (row 1 of c)
    vmovupd(xmm6, mem(rcx))
    add(rdi, rcx)                      // c += rs_c

    vfmadd231pd(mem(rcx), xmm3, xmm8)  // xmm8 += beta * (row 2 of c)
    vmovupd(xmm8, mem(rcx))
    //add(rdi, rcx)

    jmp(.DDONE)                        // jump to end.
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) }

/*
   bli_dgemmsup_rd_haswell_asm_2x2

   Double-precision, dot-product-based gemmsup kernel that updates a 2x2
   block of C:

     C := beta * C + alpha * A * B

   What the assembly below does:
   - Row i of A is read starting at a + i*rs_a and column j of B starting
     at b + j*cs_b. Both are walked with contiguous loads and both pointers
     advance by a fixed 4*8 (vector steps) or 1*8 (scalar steps) bytes, so
     the code relies on unit stride along k (cs_a == 1 and rs_b == 1).
   - k0 is split into k_iter16 passes of a 4x-unrolled loop (16 elements of
     k per pass), k_iter4 passes of a single 4-element step, and k_left1
     scalar steps.
   - ymm4..ymm7 each accumulate one (row of A) x (column of B) dot product
     as four partial sums, which are reduced horizontally after the loops.
   - Each row of C is read/written as two contiguous doubles (cs_c == 1 is
     relied upon); consecutive rows are rs_c elements apart. When beta == 0
     C is overwritten without being read.

   conja, conjb, m0, n0, data and cntx are only referenced by the disabled
   (#if 0) reference-kernel call. cs_a, rs_b and cs_c are listed as asm
   operands but are never loaded by the active instructions.
*/
void bli_dgemmsup_rd_haswell_asm_2x2
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
#if 0
	bli_dgemmsup_r_haswell_ref
	(
	  conja, conjb, m0, n0, k0,
	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
	  beta, c, rs_c0, cs_c0, data, cntx
	);
	return;
#endif
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4  = k_left16 / 4;
	uint64_t k_left1  = k_left16 % 4;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	/*
	  rrc:
	    --------        -- -- --      | |
	    --------        -- -- --  ... | |
	    --------   +=   -- -- --      | |
	    --------        -- -- --      | |
	    --------        -- -- --      :
	    --------        -- -- --      :
	*/

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	//mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	//mov(var(rs_b), r10)                // load rs_b
	mov(var(cs_b), r11)                // load cs_b
	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b

	                                   // set up the pointer to c and its row
	                                   // stride; nothing is pre-loaded here.

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

	//lea(mem(rcx, rdi, 2), rdx)         //
	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c

	mov(var(k_iter16), rsi)            // i = k_iter16;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
	                                   // contains the k_iter4 loop.

	label(.DLOOPKITER16)               // MAIN LOOP (4 vector steps per pass)

	// ---------------------------------- iteration 0

	vmovupd(mem(rbx        ), ymm0)    // ymm0 = 4 elements of column 0 of b
	vmovupd(mem(rbx, r11, 1), ymm1)    // ymm1 = 4 elements of column 1 of b
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

	vmovupd(mem(rax        ), ymm3)    // ymm3 = 4 elements of row 0 of a
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rax, r8, 1), ymm3)     // ymm3 = 4 elements of row 1 of a
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	// ---------------------------------- iteration 1

	vmovupd(mem(rbx        ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

	vmovupd(mem(rax        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rax, r8, 1), ymm3)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	// ---------------------------------- iteration 2

	vmovupd(mem(rbx        ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

	vmovupd(mem(rax        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rax, r8, 1), ymm3)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	// ---------------------------------- iteration 3

	vmovupd(mem(rbx        ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

	vmovupd(mem(rax        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rax, r8, 1), ymm3)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER16)                 // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi)             // i = k_iter4;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
	                                   // considers k_left1 loop.
	                                   // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4)                // EDGE LOOP (ymm)

	vmovupd(mem(rbx        ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

	vmovupd(mem(rax        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rax, r8, 1), ymm3)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER4)                  // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi)             // i = k_left1;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
	                                   // NOTE: The loads below are scalar (vmovsd
	                                   // into xmm registers), but the FMAs must
	                                   // still name the ymm registers: using the
	                                   // xmm registers would zero out the high
	                                   // bits of the destination registers, which
	                                   // would destroy intermediate results.

	vmovsd(mem(rbx        ), xmm0)
	vmovsd(mem(rbx, r11, 1), xmm1)
	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;

	vmovsd(mem(rax        ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovsd(mem(rax, r8, 1), xmm3)
	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.

	label(.DPOSTACCUM)

	                                   // accumulator layout (row of c x column of c):
	                                   // ymm4  ymm5
	                                   // ymm6  ymm7

	vhaddpd( ymm5, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm4 )
	                                   // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5)

	vhaddpd( ymm7, ymm6, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm6 )

	                                   // xmm4 = sum(ymm4) sum(ymm5)
	                                   // xmm6 = sum(ymm6) sum(ymm7)

	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha
	vmulpd(xmm0, xmm6, xmm6)

	//mov(var(cs_c), rsi)                // load cs_c
	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

	                                   // now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)                 // beta != 0: c = beta*c + alpha*a*b

	vfmadd231pd(mem(rcx), xmm3, xmm4)
	vmovupd(xmm4, mem(rcx))
	add(rdi, rcx)                      // advance to the next row of c

	vfmadd231pd(mem(rcx), xmm3, xmm6)
	vmovupd(xmm6, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DBETAZERO)

	label(.DROWSTORBZ)                 // beta == 0: c = alpha*a*b (c not read)

	vmovupd(xmm4, mem(rcx))
	add(rdi, rcx)                      // advance to the next row of c

	vmovupd(xmm6, mem(rcx))
	//add(rdi, rcx)

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter16] "m" (k_iter16),
	  [k_iter4]  "m" (k_iter4),
	  [k_left1]  "m" (k_left1),
	  [a]        "m" (a),
	  [rs_a]     "m" (rs_a),
	  [cs_a]     "m" (cs_a),
	  [b]        "m" (b),
	  [rs_b]     "m" (rs_b),
	  [cs_b]     "m" (cs_b),
	  [alpha]    "m" (alpha),
	  [beta]     "m" (beta),
	  [c]        "m" (c),
	  [rs_c]     "m" (rs_c),
	  [cs_c]     "m" (cs_c)/*,
	  [a_next]   "m" (a_next),
	  [b_next]   "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

void bli_dgemmsup_rd_haswell_asm_1x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions.
// (body of bli_dgemmsup_rd_haswell_asm_1x2)
	//
	// Overview: updates a 1x2 block of C, C := beta * C + alpha * A * B,
	// using dot products along k.
	// - The single row of A is read from a, and column j of B from
	//   b + j*cs_b. Both pointers advance by a fixed 4*8 (vector steps) or
	//   1*8 (scalar steps) bytes, so the code relies on unit stride along k
	//   (cs_a == 1 and rs_b == 1).
	// - k0 is split into k_iter16 passes of a 4x-unrolled loop (16 elements
	//   of k per pass), k_iter4 single 4-element steps and k_left1 scalar
	//   steps.
	// - ymm4 and ymm5 accumulate the two dot products as four partial sums
	//   each; they are reduced horizontally after the loops.
	// - The result is stored as two contiguous doubles at c (cs_c == 1 is
	//   relied upon). When beta == 0, C is overwritten without being read.
	// rs_a, cs_a, rs_b, rs_c and cs_c are listed as asm operands but are
	// never loaded by the active instructions.
	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4  = k_left16 / 4;
	uint64_t k_left1  = k_left16 % 4;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	/*
	  rrc:
	    --------        -- -- --      | |
	    --------        -- -- --  ... | |
	    --------   +=   -- -- --      | |
	    --------        -- -- --      | |
	    --------        -- -- --      :
	    --------        -- -- --      :
	*/

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), rax)                   // load address of a.
	//mov(var(rs_a), r8)                 // load rs_a
	//mov(var(cs_a), r9)                 // load cs_a
	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	//mov(var(rs_b), r10)                // load rs_b
	mov(var(cs_b), r11)                // load cs_b
	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b

	                                   // set up the pointer to c; nothing is
	                                   // pre-loaded here.

	mov(var(c), rcx)                   // load address of c
	//mov(var(rs_c), rdi)                // load rs_c
	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

	//lea(mem(rcx, rdi, 2), rdx)         //
	//lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c

	mov(var(k_iter16), rsi)            // i = k_iter16;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
	                                   // contains the k_iter4 loop.

	label(.DLOOPKITER16)               // MAIN LOOP (4 vector steps per pass)

	// ---------------------------------- iteration 0

	vmovupd(mem(rbx        ), ymm0)    // ymm0 = 4 elements of column 0 of b
	vmovupd(mem(rbx, r11, 1), ymm1)    // ymm1 = 4 elements of column 1 of b
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

	vmovupd(mem(rax        ), ymm3)    // ymm3 = 4 elements of the row of a
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	// ---------------------------------- iteration 1

	vmovupd(mem(rbx        ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

	vmovupd(mem(rax        ), ymm3)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	// ---------------------------------- iteration 2

	vmovupd(mem(rbx        ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

	vmovupd(mem(rax        ), ymm3)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	// ---------------------------------- iteration 3

	vmovupd(mem(rbx        ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

	vmovupd(mem(rax        ), ymm3)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER16)                 // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi)             // i = k_iter4;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
	                                   // considers k_left1 loop.
	                                   // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4)                // EDGE LOOP (ymm)

	vmovupd(mem(rbx        ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;

	vmovupd(mem(rax        ), ymm3)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER4)                  // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi)             // i = k_left1;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
	                                   // NOTE: The loads below are scalar (vmovsd
	                                   // into xmm registers), but the FMAs must
	                                   // still name the ymm registers: using the
	                                   // xmm registers would zero out the high
	                                   // bits of the destination registers, which
	                                   // would destroy intermediate results.

	vmovsd(mem(rbx        ), xmm0)
	vmovsd(mem(rbx, r11, 1), xmm1)
	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;

	vmovsd(mem(rax        ), xmm3)
	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.

	label(.DPOSTACCUM)

	                                   // accumulator layout (one row of c):
	                                   // ymm4  ymm5

	vhaddpd( ymm5, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm4 )
	                                   // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5)

	                                   // xmm4 = sum(ymm4) sum(ymm5)

	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

	vmulpd(xmm0, xmm4, xmm4)           // scale by alpha

	//mov(var(cs_c), rsi)                // load cs_c
	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

	                                   // now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)                 // beta != 0: c = beta*c + alpha*a*b

	vfmadd231pd(mem(rcx), xmm3, xmm4)
	vmovupd(xmm4, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DBETAZERO)

	label(.DROWSTORBZ)                 // beta == 0: c = alpha*a*b (c not read)

	vmovupd(xmm4, mem(rcx))
	//add(rdi, rcx)

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter16] "m" (k_iter16),
	  [k_iter4]  "m" (k_iter4),
	  [k_left1]  "m" (k_left1),
	  [a]        "m" (a),
	  [rs_a]     "m" (rs_a),
	  [cs_a]     "m" (cs_a),
	  [b]        "m" (b),
	  [rs_b]     "m" (rs_b),
	  [cs_b]     "m" (cs_b),
	  [alpha]    "m" (alpha),
	  [beta]     "m" (beta),
	  [c]        "m" (c),
	  [rs_c]     "m" (rs_c),
	  [cs_c]     "m" (cs_c)/*,
	  [a_next]   "m" (a_next),
	  [b_next]   "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}
cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c000066400000000000000000004443601427272030600312430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_r_haswell_ref ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_r_haswell_ref ) // Define parameters and variables for edge case kernel map. 
#define NUM_MR 4
#define NUM_NR 4
#define FUNCPTR_T dgemmsup_ker_ft

// NOTE: the edge-case kernel map below is compiled out (#if 0). It records
// which kernel is intended for each (mr, nr) combination.
#if 0
static dim_t mrs[NUM_MR] = { 6, 3, 2, 1 };
static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 };
static FUNCPTR_T kmap[NUM_MR][NUM_NR] =
{   /*  8                                 4                                 2                                 1 */
/* 6 */ { bli_dgemmsup_rd_haswell_asm_6x8m, bli_dgemmsup_rd_haswell_asm_6x4m, bli_dgemmsup_rd_haswell_asm_6x2m, bli_dgemmsup_r_haswell_ref },
/* 3 */ { bli_dgemmsup_rd_haswell_asm_3x8m, bli_dgemmsup_rd_haswell_asm_3x4m, bli_dgemmsup_rd_haswell_asm_3x2m, bli_dgemmsup_r_haswell_ref },
/* 2 */ { bli_dgemmsup_rd_haswell_asm_2x8m, bli_dgemmsup_rd_haswell_asm_2x4m, bli_dgemmsup_rd_haswell_asm_2x2m, bli_dgemmsup_r_haswell_ref },
/* 1 */ { bli_dgemmsup_rd_haswell_asm_1x8m, bli_dgemmsup_rd_haswell_asm_1x4m, bli_dgemmsup_rd_haswell_asm_1x2m, bli_dgemmsup_r_haswell_ref }
};
#endif

/*
   bli_dgemmsup_rd_haswell_asm_6x8m

   Double-precision, dot-product-based gemmsup kernel for an m0 x 8 panel
   of C:

     C := beta * C + alpha * A * B

   Control flow:
   - If n0 is not a multiple of 8, the work is handed to narrower kernels
     (6x4m, then 6x2m, then bli_dgemv_ex for a single remaining column) and
     the function returns without running the assembly below.
   - Otherwise m0 is split into m_iter = m0 / 3 groups of three rows, which
     the assembly processes, and m_left = m0 % 3 leftover rows, which are
     handed to the 2x8m or 1x8m kernel afterwards.

   Assembly structure:
   - The outer jj loop runs twice (r15 = 0, then 4), each pass covering four
     columns of B/C; the inner ii loop runs m_iter times, each pass covering
     three rows of A/C. Every (ii, jj) pass computes a 3x4 block of C.
   - ymm4..ymm15 hold the twelve dot products of a 3x4 block as four
     partial sums each; they are reduced horizontally after the k loops.
   - k0 is split into k_iter16 passes of a 4x-unrolled loop (16 elements of
     k per pass), k_iter4 single 4-element steps and k_left1 scalar steps.
   - Pointers into A and B advance by a fixed 4*8 or 1*8 bytes per step and
     the column offset into C is scaled by a fixed 1*8 bytes, so the code
     relies on cs_a == 1, rs_b == 1 and cs_c == 1.
   - When beta == 0, C is overwritten without being read.
*/
void bli_dgemmsup_rd_haswell_asm_6x8m
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	uint64_t n_left = n0 % 8;

	// First check whether this is an edge case in the n dimension. If so,
	// dispatch other 6x?m kernels, as needed.
	if ( n_left )
	{
		double* restrict cij = c;
		double* restrict bj  = b;
		double* restrict ai  = a;

		if ( 4 <= n_left )
		{
			const dim_t nr_cur = 4;

			bli_dgemmsup_rd_haswell_asm_6x4m
			(
			  conja, conjb, m0, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			// Step past the columns just computed.
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 2 <= n_left )
		{
			const dim_t nr_cur = 2;

			bli_dgemmsup_rd_haswell_asm_6x2m
			(
			  conja, conjb, m0, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			// Step past the columns just computed.
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 1 == n_left )
		{
#if 0
			const dim_t nr_cur = 1;

			bli_dgemmsup_r_haswell_ref
			(
			  conja, conjb, m0, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
#else
			// A single remaining column of C is a matrix-vector product.
			bli_dgemv_ex
			(
			  BLIS_NO_TRANSPOSE, conjb, m0, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
			  beta, cij, rs_c0, cntx, NULL
			);
#endif
		}
		return;
	}

	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4  = k_left16 / 4;
	uint64_t k_left1  = k_left16 % 4;

	uint64_t m_iter = m0 / 3;
	uint64_t m_left = m0 % 3;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// With fewer than three rows there is nothing for the assembly to do
	// (its ii loop decrements before testing, so it must not be entered
	// with m_iter == 0).
	if ( m_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()                         // zero all xmm/ymm registers.

	//mov(var(a), r14)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	//mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rdx)                   // load address of b.
	//mov(var(rs_b), r10)                // load rs_b
	mov(var(cs_b), r11)                // load cs_b
	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
	lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a

	//mov(var(c), r12)                   // load address of c
	//mov(var(rs_c), rdi)                // load rs_c
	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

	// r12 = rcx = c
	// r14 = rax = a
	// rdx = rbx = b
	// r9  = m dim index ii
	// r15 = n dim index jj

	mov(imm(0), r15)                   // jj = 0;

	label(.DLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]

	mov(var(a), r14)                   // load address of a
	mov(var(c), r12)                   // load address of c

	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
	imul(imm(1*8), rsi)                // rsi *= cs_c = 1*8
	lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;

	lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
	imul(r11, rsi)                     // rsi *= cs_b;
	lea(mem(rdx, rsi, 1), rdx)         // rdx = b + 4*jj*cs_b; (rdx is updated in
	                                   // place and is not reloaded from b, so
	                                   // the offset is added to its previous
	                                   // value.)

	mov(var(m_iter), r9)               // ii = m_iter;

	label(.DLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]

#if 0
	vzeroall()                         // zero all xmm/ymm registers.
#else
	                                   // skylake can execute 3 vxorpd ipc with
	                                   // a latency of 1 cycle, while vzeroall
	                                   // has a latency of 12 cycles.
	vxorpd(ymm4,  ymm4,  ymm4)
	vxorpd(ymm5,  ymm5,  ymm5)
	vxorpd(ymm6,  ymm6,  ymm6)
	vxorpd(ymm7,  ymm7,  ymm7)
	vxorpd(ymm8,  ymm8,  ymm8)
	vxorpd(ymm9,  ymm9,  ymm9)
	vxorpd(ymm10, ymm10, ymm10)
	vxorpd(ymm11, ymm11, ymm11)
	vxorpd(ymm12, ymm12, ymm12)
	vxorpd(ymm13, ymm13, ymm13)
	vxorpd(ymm14, ymm14, ymm14)
	vxorpd(ymm15, ymm15, ymm15)
#endif

	lea(mem(r12), rcx)                 // rcx = c_iijj;
	lea(mem(r14), rax)                 // rax = a_ii;
	lea(mem(rdx), rbx)                 // rbx = b_jj;

#if 0
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
#endif
	lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a (rdi is reloaded with
	                                   // rs_c after the k loops)

	mov(var(k_iter16), rsi)            // i = k_iter16;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
	                                   // contains the k_iter4 loop.

	label(.DLOOPKITER16)               // MAIN LOOP (4 vector steps per pass)

	// ---------------------------------- iteration 0

#if 1
	                                   // prefetch the three rows of a that
	                                   // follow the three being computed.
	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
#endif

	vmovupd(mem(rax       ), ymm0)     // ymm0..ymm2 = 4 elements of each of
	vmovupd(mem(rax, r8, 1), ymm1)     // the three current rows of a
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)    // column 0 of the current b panel
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)    // column 1
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)    // column 2
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)    // column 3
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	// ---------------------------------- iteration 1

	vmovupd(mem(rax       ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
#endif

	vmovupd(mem(rax       ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	// ---------------------------------- iteration 3

	vmovupd(mem(rax       ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER16)                 // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi)             // i = k_iter4;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
	                                   // considers k_left1 loop.
	                                   // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4)                // EDGE LOOP (ymm)

#if 1
	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
	prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
#endif

	vmovupd(mem(rax       ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER4)                  // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi)             // i = k_left1;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
	                                   // NOTE: The loads below are scalar (vmovsd
	                                   // into xmm registers), but the FMAs must
	                                   // still name the ymm registers: using the
	                                   // xmm registers would zero out the high
	                                   // bits of the destination registers, which
	                                   // would destroy intermediate results.

	vmovsd(mem(rax       ), xmm0)
	vmovsd(mem(rax, r8, 1), xmm1)
	vmovsd(mem(rax, r8, 2), xmm2)
	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;

	vmovsd(mem(rbx        ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovsd(mem(rbx, r11, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovsd(mem(rbx, r11, 2), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovsd(mem(rbx, r13, 1), xmm3)
	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.

	label(.DPOSTACCUM)

	                                   // accumulator layout (row of c x column of c):
	                                   // ymm4  ymm7  ymm10 ymm13
	                                   // ymm5  ymm8  ymm11 ymm14
	                                   // ymm6  ymm9  ymm12 ymm15

	vhaddpd( ymm7, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )
	                                   // xmm0[0] = sum(ymm4);  xmm0[1] = sum(ymm7)

	vhaddpd( ymm13, ymm10, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )
	                                   // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)

	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4 = [ xmm0 | xmm2 ]

	vhaddpd( ymm8, ymm5, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )

	vhaddpd( ymm14, ymm11, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )

	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // ymm5 = [ xmm0 | xmm2 ]

	vhaddpd( ymm9, ymm6, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )

	vhaddpd( ymm15, ymm12, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )

	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm6 = [ xmm0 | xmm2 ]

	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)

	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
	vmulpd(ymm0, ymm5, ymm5)
	vmulpd(ymm0, ymm6, ymm6)

	//mov(var(cs_c), rsi)                // load cs_c
	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

	                                   // now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)                 // beta != 0: c = beta*c + alpha*a*b

	vfmadd231pd(mem(rcx), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)                      // advance to the next row of c

	vfmadd231pd(mem(rcx), ymm3, ymm5)
	vmovupd(ymm5, mem(rcx))
	add(rdi, rcx)                      // advance to the next row of c

	vfmadd231pd(mem(rcx), ymm3, ymm6)
	vmovupd(ymm6, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DBETAZERO)

	label(.DROWSTORBZ)                 // beta == 0: c = alpha*a*b (c not read)

	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)                      // advance to the next row of c

	vmovupd(ymm5, mem(rcx))
	add(rdi, rcx)                      // advance to the next row of c

	vmovupd(ymm6, mem(rcx))
	//add(rdi, rcx)

	label(.DDONE)

	lea(mem(r12, rdi, 2), r12)         //
	lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c

	lea(mem(r14, r8,  2), r14)         //
	lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a

	dec(r9)                            // ii -= 1;
	jne(.DLOOP3X4I)                    // iterate again if ii != 0.

	add(imm(4), r15)                   // jj += 4;
	cmp(imm(4), r15)                   // compare jj to 4
	jle(.DLOOP3X4J)                    // if jj <= 4, jump to beginning
	                                   // of jj loop; otherwise, loop ends.

	label(.DRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [m_iter]   "m" (m_iter),
	  [k_iter16] "m" (k_iter16),
	  [k_iter4]  "m" (k_iter4),
	  [k_left1]  "m" (k_left1),
	  [a]        "m" (a),
	  [rs_a]     "m" (rs_a),
	  [cs_a]     "m" (cs_a),
	  [b]        "m" (b),
	  [rs_b]     "m" (rs_b),
	  [cs_b]     "m" (cs_b),
	  [alpha]    "m" (alpha),
	  [beta]     "m" (beta),
	  [c]        "m" (c),
	  [rs_c]     "m" (rs_c),
	  [cs_c]     "m" (cs_c)/*,
	  [a_next]   "m" (a_next),
	  [b_next]   "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	consider_edge_cases:

	// Handle edge cases in the m dimension, if they exist.
	if ( m_left )
	{
		const dim_t      nr_cur = 8;
		// Index of the first row not covered by the 3-row groups above.
		const dim_t      i_edge = m0 - ( dim_t )m_left;

		double* restrict cij = c + i_edge*rs_c;
		double* restrict bj  = b;
		double* restrict ai  = a + i_edge*rs_a;

		// m_left is m0 % 3, so at most one of the two cases below runs.
		if ( 2 == m_left )
		{
			const dim_t mr_cur = 2;

			bli_dgemmsup_rd_haswell_asm_2x8m
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			//cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
		}
		if ( 1 == m_left )
		{
			const dim_t mr_cur = 1;

			bli_dgemmsup_rd_haswell_asm_1x8m
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
		}
	}
}

void bli_dgemmsup_rd_haswell_asm_3x8m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b.
//mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = unused // r15 = n dim index jj // r10 = unused mov(imm(0), r15) // jj = 0; label(.DLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*8), rsi) // rsi *= cs_c = 1*8 lea(mem(r12, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem(r14), rax) // rax = a; #if 0 prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c #else prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) 
vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. 
label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_b = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(4), r15) // jj += 4; cmp(imm(4), r15) // compare jj to 4 jle(.DLOOP3X4J) // if jj <= 4, jump to beginning // of jj loop; otherwise, loop ends. label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_2x8m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = unused // r15 = n dim index jj // r10 = unused mov(imm(0), r15) // jj = 0; label(.DLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) #endif lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*8), rsi) // rsi *= cs_c = 1*8 lea(mem(r12, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem(r14), rax) // rax = a; #if 0 prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c #else prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop.
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7)
vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results.
vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0.
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(4), r15) // jj += 4; cmp(imm(4), r15) // compare jj to 4 jle(.DLOOP3X4J) // if jj <= 4, jump to beginning // of jj loop; otherwise, loop ends. label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_1x8m: updates a 1x8 row of c as c := beta*c + alpha*(a*b) in two 1x4 panels (jj = 0, 4), using ymm4, ymm7, ymm10 and ymm13 as accumulators. Of the strides, the asm reads only cs_b; a and b are advanced along k, and c across columns, in fixed 8-byte steps. m0, n0, conja, conjb, data and cntx are not referenced here. When beta == 0, c is stored without being loaded. */ void bli_dgemmsup_rd_haswell_asm_1x8m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions.
uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a //lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.DLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles.
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm13, ymm13, ymm13) #endif lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*8), rsi) // rsi *= cs_c = 1*8 lea(mem(r12, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem(r14), rax) // rax = a; #if 0 prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c #else prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 3 vmovupd(mem(rax ),
ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rax ), xmm0) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0.
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(4), r15) // jj += 4; cmp(imm(4), r15) // compare jj to 4 jle(.DLOOP3X4J) // if jj <= 4, jump to beginning // of jj loop; otherwise, loop ends.
label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_6x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a.
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj // r10 = unused mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter .. 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c + 3*ii*rs_c; lea(mem(r14), rax) // rax = a + 3*ii*rs_a; lea(mem(rdx), rbx) // rbx = b; #if 0 mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rdi) // rdi = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 
4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_b = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 4; const dim_t i_edge = m0 - ( dim_t )m_left; double* restrict cij = c + i_edge*rs_c; double* restrict bj = b; double* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_dgemmsup_rd_haswell_asm_2x4m ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_dgemmsup_rd_haswell_asm_1x4m ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } /* bli_dgemmsup_rd_haswell_asm_3x4m: computes one 3x4 tile of c with ymm FMA inline assembly. Each of the 12 accumulators ymm4-ymm15 holds four partial sums of the dot product between one of 3 rows of a (row stride rs_a0) and one of 4 columns of b (column stride cs_b0); both operands are read with an 8-byte step along k. The k loop runs k0/16 passes of a 4x-unrolled 4-element step, then (k0%16)/4 single 4-element steps, then k0%4 scalar steps. The partial sums are then reduced horizontally, scaled by *alpha and stored as three rows of 4 contiguous doubles (row stride rs_c0); when *beta compares equal to 0 c is only written, otherwise (*beta)*c is added first. conja, conjb, m0, n0, data and cntx are not referenced; cs_a, rs_b and cs_c are passed as asm operands but never loaded. */ void bli_dgemmsup_rd_haswell_asm_3x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) 
vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. 
label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers for the FMAs here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_2x4m: computes one 2x4 tile of c with ymm FMA inline assembly. Each of the 8 accumulators (ymm4, ymm5, ymm7, ymm8, ymm10, ymm11, ymm13, ymm14) holds four partial sums of the dot product between one of 2 rows of a (row stride rs_a0) and one of 4 columns of b (column stride cs_b0); both operands are read with an 8-byte step along k. The k loop runs k0/16 passes of a 4x-unrolled 4-element step, then (k0%16)/4 single 4-element steps, then k0%4 scalar steps. The partial sums are then reduced horizontally, scaled by *alpha and stored as two rows of 4 contiguous doubles (row stride rs_c0); when *beta compares equal to 0 c is only written, otherwise (*beta)*c is added first. conja, conjb, m0, n0, data and cntx are not referenced; cs_a, rs_b and cs_c are passed as asm operands but never loaded. */ void bli_dgemmsup_rd_haswell_asm_2x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) #endif mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) 
vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers for the FMAs here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. 
vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. 
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_1x4m: computes one 1x4 tile of c with ymm FMA inline assembly. Each of the 4 accumulators (ymm4, ymm7, ymm10, ymm13) holds four partial sums of the dot product between the single row of a and one of 4 columns of b (column stride cs_b0); both operands are read with an 8-byte step along k. The k loop runs k0/16 passes of a 4x-unrolled 4-element step, then (k0%16)/4 single 4-element steps, then k0%4 scalar steps. The partial sums are then reduced horizontally, scaled by *alpha and stored as one row of 4 contiguous doubles at c; when *beta compares equal to 0 c is only written, otherwise (*beta)*c is added first. conja, conjb, m0, n0, data and cntx are not referenced; rs_a, cs_a, rs_b, rs_c and cs_c are passed as asm operands but never loaded. */ void bli_dgemmsup_rd_haswell_asm_1x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; /* rrc: -------- -- -- -- | | | | -------- -- -- -- ... 
| | | | -------- += -- -- -- | | | | -------- | | | | -------- : -------- : */ // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm13, ymm13, ymm13) #endif mov(var(a), rax) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a //lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), rcx) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers for the FMAs here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rax ), xmm0) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_6x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t m_iter = m0 / 6; uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c + 6*ii*rs_c; lea(mem(r14), rax) // rax = a + 6*ii*rs_a; lea(mem(rdx), rbx) // rbx = b; lea(mem(rcx, rdi, 2), r10) // lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c prefetch(0, mem(r10, 1*8)) // prefetch c + 3*rs_c prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) 
vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovsd(mem(rax, r8, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovsd(mem(rax, r13, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rax, r8, 4), xmm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovsd(mem(rax, r15, 1), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0.
label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) vhaddpd( ymm9, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm8 ) vhaddpd( ymm11, ymm10, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm10 ) vhaddpd( ymm13, ymm12, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm12 ) vhaddpd( ymm15, ymm14, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm14 ) // xmm4 = sum(ymm4) sum(ymm5) // xmm6 = sum(ymm6) sum(ymm7) // xmm8 = sum(ymm8) sum(ymm9) // xmm10 = sum(ymm10) sum(ymm11) // xmm12 = sum(ymm12) sum(ymm13) // xmm14 = sum(ymm14) sum(ymm15) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0.
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm10) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm12) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm14) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) label(.DDONE) lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c lea(mem(r14, r8, 4), r14) // lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 2; const dim_t i_edge = m0 - ( dim_t )m_left; double* restrict cij = c + i_edge*rs_c; double* restrict bj = b; double* restrict ai = a + i_edge*rs_a; if ( 3 <= m_left ) { const dim_t mr_cur = 3; bli_dgemmsup_rd_haswell_asm_3x2m ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 2 <= m_left ) { const dim_t mr_cur = 2; bli_dgemmsup_rd_haswell_asm_2x2m ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_dgemmsup_rd_haswell_asm_1x2m ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } void bli_dgemmsup_rd_haswell_asm_3x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) #endif mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. 
label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovsd(mem(rax, r8, 2), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0.
label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) vhaddpd( ymm9, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm8 ) // xmm4 = sum(ymm4) sum(ymm5) // xmm6 = sum(ymm6) sum(ymm7) // xmm8 = sum(ymm8) sum(ymm9) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end.
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_2x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; /* rrc: -------- -- -- -- | | -------- -- -- -- ... | | -------- += -- -- -- | | -------- -- -- -- | | -------- -- -- -- : -------- -- -- -- : */ // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. 
#else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) #endif mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) // xmm4 = sum(ymm4) sum(ymm5) // xmm6 = sum(ymm6) sum(ymm7) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_1x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; /* rrc: -------- -- -- -- | | -------- -- -- -- ... 
| | -------- += -- -- -- | | -------- -- -- -- | | -------- -- -- -- : -------- -- -- -- : */ // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) #endif mov(var(a), rax) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a //lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm5 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5) // xmm4 = sum(ymm4) sum(ymm5) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end.
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c.newji000066400000000000000000004465211427272030600323570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_r_haswell_ref ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_r_haswell_ref ) // Define parameters and variables for edge case kernel map. 
#define NUM_MR 4 #define NUM_NR 4 #define FUNCPTR_T dgemmsup_ker_ft #if 0 static dim_t mrs[NUM_MR] = { 6, 3, 2, 1 }; static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 }; static FUNCPTR_T kmap[NUM_MR][NUM_NR] = { /* 8 4 2 1 */ /* 6 */ { bli_dgemmsup_rd_haswell_asm_6x8m, bli_dgemmsup_rd_haswell_asm_6x4m, bli_dgemmsup_rd_haswell_asm_6x2m, bli_dgemmsup_r_haswell_ref }, /* 3 */ { bli_dgemmsup_rd_haswell_asm_3x8m, bli_dgemmsup_rd_haswell_asm_3x4m, bli_dgemmsup_rd_haswell_asm_3x2m, bli_dgemmsup_r_haswell_ref }, /* 2 */ { bli_dgemmsup_rd_haswell_asm_2x8m, bli_dgemmsup_rd_haswell_asm_2x4m, bli_dgemmsup_rd_haswell_asm_2x2m, bli_dgemmsup_r_haswell_ref }, /* 1 */ { bli_dgemmsup_rd_haswell_asm_1x8m, bli_dgemmsup_rd_haswell_asm_1x4m, bli_dgemmsup_rd_haswell_asm_1x2m, bli_dgemmsup_r_haswell_ref } }; #endif /* bli_dgemmsup_rd_haswell_asm_6x8m: double-precision dot-product gemmsup kernel computing C := beta*C + alpha*A*B for m0 rows and 8 columns. If n0 is not a multiple of 8, the columns are handed to the 6x4m and 6x2m kernels (and to bli_dgemv_ex for a final single column) and the function returns without running the assembly. Otherwise the assembly walks m_iter = m0 / 3 groups of three rows and forms each group as two 3x4 blocks (jj = 0 and jj = 4), accumulating over k in steps of 16, then 4, then 1; the m0 % 3 leftover rows are passed to the 2x8m or 1x8m kernel. The assembly advances the A and B pointers by fixed 8-byte steps along k and loads/stores four contiguous doubles per row of C: the cs_a, rs_b and cs_c operands are declared but never loaded. conja, conjb, data and cntx are only forwarded to the callees. */ void bli_dgemmsup_rd_haswell_asm_6x8m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t n_left = n0 % 8; // First check whether this is an edge case in the n dimension. If so, // dispatch other 6x?m kernels, as needed.
if ( n_left ) { double* restrict cij = c; double* restrict bj = b; double* restrict ai = a; if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_dgemmsup_rd_haswell_asm_6x4m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_dgemmsup_rd_haswell_asm_6x2m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 0 const dim_t nr_cur = 1; bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else /* A single trailing column is handed to bli_dgemv_ex instead of the disabled reference kernel above. */ bli_dgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b.
//mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a //mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj // r10 = 3*rs_a (prefetch offset) mov(imm(0), r15) // jj = 0; label(.DLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] mov(var(a), r14) // load address of a mov(var(c), r12) // load address of c lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*8), rsi) // rsi *= cs_c = 1*8 lea(mem(r12, rsi, 1), r12) // r12 = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rdx) // rdx = b + 4*jj*cs_b; /* NOTE(review): rdx is advanced in place and, unlike r14 and r12, is not reloaded from var(b) at the top of the jj loop. The result equals b + 4*jj*cs_b only because the loop body runs exactly twice (jj = 0 adds nothing, jj = 4 adds 4*cs_b once). */ mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; #if 0 mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rdi) // rdi = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND.
je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a #endif vmovupd(mem(rax ), ymm0)
vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop.
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0.
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end.
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. add(imm(4), r15) // jj += 4; cmp(imm(4), r15) // compare jj to 4 jle(.DLOOP3X4J) // if jj <= 4, jump to beginning // of jj loop; otherwise, loop ends. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist.
if ( m_left ) { /* m_left = m0 % 3 is 1 or 2 here, so exactly one of the two kernel calls below runs. */ const dim_t nr_cur = 8; const dim_t i_edge = m0 - ( dim_t )m_left; double* restrict cij = c + i_edge*rs_c; double* restrict bj = b; double* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_dgemmsup_rd_haswell_asm_2x8m ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_dgemmsup_rd_haswell_asm_1x8m ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } /* bli_dgemmsup_rd_haswell_asm_3x8m: computes a 3x8 block of C := beta*C + alpha*A*B as two 3x4 sub-blocks (jj = 0 and jj = 4). Each sub-block uses twelve ymm dot-product accumulators (ymm4-ymm15) filled by k loops unrolled by 16, then 4, then 1, and reduced horizontally before the alpha/beta update; C is not read when beta == 0. m0 and n0 are referenced only by the disabled (#if 0) reference-kernel call, so the row and column counts are fixed at 3 and 8 regardless of the arguments. The assembly steps A and B by 8 bytes per k index and accesses four contiguous doubles per row of C; the cs_a, rs_b and cs_c operands are declared but never loaded. */ void bli_dgemmsup_rd_haswell_asm_3x8m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a.
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = unused // r15 = n dim index jj // r10 = unused mov(imm(0), r15) // jj = 0; label(.DLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*8), rsi) // rsi *= cs_c = 1*8 lea(mem(r12, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem(r14), rax) // rax = a; #if 0 prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c #else prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi)
// check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2),
ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop.
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0.
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end.
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(4), r15) // jj += 4; cmp(imm(4), r15) // compare jj to 4 jle(.DLOOP3X4J) // if jj <= 4, jump to beginning // of jj loop; otherwise, loop ends. label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_2x8m: two-row counterpart of the 3x8m kernel. It computes a 2x8 block of C := beta*C + alpha*A*B as two 2x4 sub-blocks (jj = 0 and jj = 4), using eight ymm dot-product accumulators (ymm4, ymm5, ymm7, ymm8, ymm10, ymm11, ymm13, ymm14) filled by k loops unrolled by 16, then 4, then 1 and reduced horizontally before the alpha/beta update; C is not read when beta == 0. m0 and n0 are referenced only by the disabled (#if 0) reference-kernel call. The assembly steps A and B by 8 bytes per k index and accesses four contiguous doubles per row of C; the cs_a, rs_b and cs_c operands are declared but never loaded. */ void bli_dgemmsup_rd_haswell_asm_2x8m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions.
uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = unused // r15 = n dim index jj // r10 = unused mov(imm(0), r15) // jj = 0; label(.DLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) #endif lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*8), rsi) // rsi *= cs_c = 1*8 lea(mem(r12, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem(r14), rax) // rax = a; #if 0 prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c #else prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; 
vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_b = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(4), r15) // jj += 4; cmp(imm(4), r15) // compare jj to 4 jle(.DLOOP3X4J) // if jj <= 4, jump to beginning // of jj loop; otherwise, loop ends. 
label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_1x8m: dot-product-based kernel that updates one row of eight contiguous doubles of c as c := beta*c + alpha*(a*b). The jj loop runs twice (jj = 0 and 4); each pass accumulates four k-length dot products of the row of a with four columns of b (spaced cs_b apart), reduces them horizontally, scales by alpha and stores four results. The asm advances a and b through k in 8-byte steps and stores c contiguously, so it assumes cs_a, rs_b and cs_c are 1. m0, n0, conja, conjb, data and cntx are not used by the active code. If beta == 0, c is overwritten without being read. */ void bli_dgemmsup_rd_haswell_asm_1x8m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a.
//mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a //lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.DLOOP3X4J) // LOOP OVER jj = [ 0 4 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm13, ymm13, ymm13) #endif lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*8), rsi) // rsi *= cs_c = 1*8 lea(mem(r12, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem(r14), rax) // rax = a; #if 0 prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c #else prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop.
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop.
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rax ), xmm0) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0.
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(4), r15) // jj += 4; cmp(imm(4), r15) // compare jj to 4 jle(.DLOOP3X4J) // if jj <= 4, jump to beginning // of jj loop; otherwise, loop ends.
label(.DRETURN) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_6x4m: dot-product-based kernel that updates four contiguous columns of c, three rows at a time, as c := beta*c + alpha*(a*b). The asm ii loop runs m0 / 3 times, each pass computing a 3x4 block from three rows of a (spaced rs_a apart) and four columns of b (spaced cs_b apart), then advancing a and c by three rows. The asm is skipped entirely when m0 / 3 is zero. The remaining m0 % 3 rows are handed to the 2x4m or 1x4m kernel. As written, the asm steps a and b through k in 8-byte units and stores c rows contiguously, so it assumes cs_a, rs_b and cs_c are 1. If beta == 0, c is overwritten without being read. */ void bli_dgemmsup_rd_haswell_asm_6x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a.
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj // r10 = unused mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c + 3*ii*rs_c; lea(mem(r14), rax) // rax = a + 3*ii*rs_a; lea(mem(rdx), rbx) // rbx = b; #if 0 prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c #else prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop.
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12)
vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0.
label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0.
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end.
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist.
if ( m_left ) { const dim_t nr_cur = 4; const dim_t i_edge = m0 - ( dim_t )m_left; double* restrict cij = c + i_edge*rs_c; double* restrict bj = b; double* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_dgemmsup_rd_haswell_asm_2x4m ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_dgemmsup_rd_haswell_asm_1x4m ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } /* bli_dgemmsup_rd_haswell_asm_3x4m: dot-product-based kernel that updates a single 3x4 block of c as c := beta*c + alpha*(a*b). Twelve accumulators (ymm4-ymm15) hold partial dot products of three rows of a (spaced rs_a apart) with four columns of b (spaced cs_b apart); they are reduced horizontally after the k loops. There is no loop over m or n. As written, the asm steps a and b through k in 8-byte units and stores c rows contiguously, so it assumes cs_a, rs_b and cs_c are 1. If beta == 0, c is overwritten without being read. */ void bli_dgemmsup_rd_haswell_asm_3x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles.
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop.
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12)
vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0.
label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0.
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end.
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_2x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers.
#else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) #endif mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) 
vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_b = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. 
// Store phase that closes the 2x4 kernel begun above this span: rows ymm4 and
// ymm5 are written to c and c + rs_c, adding beta*C only when beta != 0.
	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)

	vfmadd231pd(mem(rcx), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), ymm3, ymm5)
	vmovupd(ymm5, mem(rcx))

	jmp(.DDONE) // jump to end.

	label(.DBETAZERO)
	label(.DROWSTORBZ)

	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vmovupd(ymm5, mem(rcx))

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter16] "m" (k_iter16),
	  [k_iter4] "m" (k_iter4),
	  [k_left1] "m" (k_left1),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

/*
 * bli_dgemmsup_rd_haswell_asm_1x4m
 *
 * Updates a 1x4 block of C as C := beta*C + alpha*(A*B): four dot products
 * of length k0 between a single row of A and four columns of B.
 *
 * Layout as used by the code below:
 *   - A is walked along k in fixed 8-byte steps; neither rs_a nor cs_a is
 *     loaded because only one row is touched.
 *   - B is walked along k in fixed 8-byte steps; columns are cs_b elements
 *     apart. rs_b is never loaded.
 *   - The result row is written as 4 contiguous doubles at c; rs_c and cs_c
 *     are never loaded.
 *
 * k0 is consumed as k0/16 trips of a 4x-unrolled loop, then (k0%16)/4
 * 4-element trips, then k0%4 scalar trips.
 *
 * conja, conjb, m0, n0, data and cntx are only referenced inside the
 * disabled "#if 0" reference call.
 */
void bli_dgemmsup_rd_haswell_asm_1x4m
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
#if 0
	bli_dgemmsup_r_haswell_ref
	(
	  conja, conjb, m0, n0, k0,
	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
	  beta, c, rs_c0, cs_c0, data, cntx
	);
	return;
#endif

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4 = k_left16 / 4;
	uint64_t k_left1 = k_left16 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// "rrc" case: the loops below read rows of A and columns of B with unit
	// stride along k, and write C one row at a time.

	// -------------------------------------------------------------------------

	begin_asm()

	// Clear the four accumulators, one per column of B:
	//   ymm4  ymm7  ymm10 ymm13
#if 0
	vzeroall() // zero all xmm/ymm registers.
#else
	// skylake can execute 3 vxorpd ipc with
	// a latency of 1 cycle, while vzeroall
	// has a latency of 12 cycles.
	vxorpd(ymm4, ymm4, ymm4)
	vxorpd(ymm7, ymm7, ymm7)
	vxorpd(ymm10, ymm10, ymm10)
	vxorpd(ymm13, ymm13, ymm13)
#endif

	mov(var(a), rax) // rax = address of a

	mov(var(b), rbx) // rbx = address of b
	mov(var(cs_b), r11) // r11 = cs_b
	lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)
	lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b

	mov(var(c), rcx) // rcx = address of c

	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c

	mov(var(k_iter16), rsi) // i = k_iter16;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKITER4) // if i == 0, jump to code that
	                   // contains the k_iter4 loop.

	label(.DLOOPKITER16) // MAIN LOOP

	// Each sub-iteration loads 4 consecutive k-elements of the row of A
	// (ymm0) and of each column of B (ymm3, reused) and accumulates the
	// element-wise products.

	// ---------------------------------- iteration 0

	vmovupd(mem(rax ), ymm0)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	// ---------------------------------- iteration 1

	vmovupd(mem(rax ), ymm0)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	// ---------------------------------- iteration 2

	vmovupd(mem(rax ), ymm0)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	// ---------------------------------- iteration 3

	vmovupd(mem(rax ), ymm0)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER16) // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi) // i = k_iter4;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKLEFT1) // if i == 0, jump to code that
	                   // considers k_left1 loop.
	                   // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4) // EDGE LOOP (ymm)

	vmovupd(mem(rax ), ymm0)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER4) // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi) // i = k_left1;
	test(rsi, rsi) // check i via logical AND.
	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
	                // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1) // EDGE LOOP (scalar)

	// NOTE: We must use ymm registers here bc
	// using the xmm registers would zero out the
	// high bits of the destination registers,
	// which would destroy intermediate results.

	vmovsd(mem(rax ), xmm0)
	add(imm(1*8), rax) // a += 1*cs_a = 1*8;

	vmovsd(mem(rbx ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovsd(mem(rbx, r11, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovsd(mem(rbx, r11, 2), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovsd(mem(rbx, r13, 1), xmm3)
	add(imm(1*8), rbx) // b += 1*rs_b = 1*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	dec(rsi) // i -= 1;
	jne(.DLOOPKLEFT1) // iterate again if i != 0.

	label(.DPOSTACCUM)

	// Reduce each accumulator to a single sum and pack the four results
	// into ymm4.
	// ymm4 ymm7 ymm10 ymm13

	vhaddpd( ymm7, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )
	// xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)

	vhaddpd( ymm13, ymm10, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )
	// xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)

	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )

	// ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)

	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4) // scale by alpha

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm3) // set ZF if beta == 0.
	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)

	vfmadd231pd(mem(rcx), ymm3, ymm4) // ymm4 += beta * C row
	vmovupd(ymm4, mem(rcx))

	jmp(.DDONE) // jump to end.
// beta == 0 store path that closes the 1x4 kernel begun above this span: the
// result row in ymm4 is written to C without reading C first.
	label(.DBETAZERO)
	label(.DROWSTORBZ)

	vmovupd(ymm4, mem(rcx))

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter16] "m" (k_iter16),
	  [k_iter4] "m" (k_iter4),
	  [k_left1] "m" (k_left1),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

/*
 * bli_dgemmsup_rd_haswell_asm_6x2m
 *
 * Updates an m0 x 2 panel of C as C := beta*C + alpha*(A*B), six rows at a
 * time. Every result is a dot product of length k0 between one row of A and
 * one of two columns of B.
 *
 * The assembly loop runs m0/6 times; each trip handles rows
 * [6*ii, 6*ii+6) and then advances the A and C base pointers by six rows.
 * The remaining m0%6 rows are forwarded to the 3x2m, 2x2m and 1x2m kernels
 * after the assembly block.
 *
 * Layout as used by the code below:
 *   - A is walked along k in fixed 8-byte steps; rows are rs_a elements
 *     apart. cs_a is never loaded.
 *   - B is walked along k in fixed 8-byte steps; the second column sits
 *     cs_b elements after the first. rs_b is never loaded.
 *   - Each row of C is stored as 2 contiguous doubles, rows rs_c elements
 *     apart. cs_c is never loaded.
 *
 * conja, conjb, data and cntx are only passed through to other kernels.
 */
void bli_dgemmsup_rd_haswell_asm_6x2m
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
#if 0
	bli_dgemmsup_r_haswell_ref
	(
	  conja, conjb, m0, n0, k0,
	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
	  beta, c, rs_c0, cs_c0, data, cntx
	);
	return;
#endif

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4 = k_left16 / 4;
	uint64_t k_left1 = k_left16 % 4;

	uint64_t m_iter = m0 / 6;
	uint64_t m_left = m0 % 6;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// Fewer than six rows: nothing for the assembly loop to do.
	if ( m_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	mov(var(a), r14) // r14 = address of a
	mov(var(rs_a), r8) // r8 = rs_a
	lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
	lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
	lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

	mov(var(b), rdx) // rdx = address of b
	mov(var(cs_b), r11) // r11 = cs_b
	lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

	mov(var(c), r12) // r12 = address of c
	mov(var(rs_c), rdi) // rdi = rs_c
	lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

	// r12 / r14 / rdx hold the per-panel base addresses of c / a / b; they
	// are copied into the working pointers rcx / rax / rbx at the top of
	// every ii iteration.
	// r9 = m dim index ii

	mov(var(m_iter), r9) // ii = m_iter;

	label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ]

	// Clear the twelve accumulators. Accumulator map (row of A x column of B):
	//   ymm4  ymm5
	//   ymm6  ymm7
	//   ymm8  ymm9
	//   ymm10 ymm11
	//   ymm12 ymm13
	//   ymm14 ymm15
#if 0
	vzeroall() // zero all xmm/ymm registers.
#else
	// skylake can execute 3 vxorpd ipc with
	// a latency of 1 cycle, while vzeroall
	// has a latency of 12 cycles.
	vxorpd(ymm4, ymm4, ymm4)
	vxorpd(ymm5, ymm5, ymm5)
	vxorpd(ymm6, ymm6, ymm6)
	vxorpd(ymm7, ymm7, ymm7)
	vxorpd(ymm8, ymm8, ymm8)
	vxorpd(ymm9, ymm9, ymm9)
	vxorpd(ymm10, ymm10, ymm10)
	vxorpd(ymm11, ymm11, ymm11)
	vxorpd(ymm12, ymm12, ymm12)
	vxorpd(ymm13, ymm13, ymm13)
	vxorpd(ymm14, ymm14, ymm14)
	vxorpd(ymm15, ymm15, ymm15)
#endif

	lea(mem(r12), rcx) // rcx = c + 6*ii*rs_c;
	lea(mem(r14), rax) // rax = a + 6*ii*rs_a;
	lea(mem(rdx), rbx) // rbx = b;

	lea(mem(rcx, rdi, 2), r10) //
	lea(mem(r10, rdi, 1), r10) // r10 = c + 3*rs_c;
	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
	prefetch(0, mem(r10, 3*8)) // prefetch c + 3*rs_c
	prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 4*rs_c
	prefetch(0, mem(r10, rdi, 2, 3*8)) // prefetch c + 5*rs_c

	mov(var(k_iter16), rsi) // i = k_iter16;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKITER4) // if i == 0, jump to code that
	                   // contains the k_iter4 loop.

	label(.DLOOPKITER16) // MAIN LOOP

	// Each sub-iteration loads 4 consecutive k-elements of both columns of B
	// (ymm0, ymm1) and of each of the six rows of A (ymm3, reused), and
	// accumulates the element-wise products.

	// ---------------------------------- iteration 0

	vmovupd(mem(rbx ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;

	vmovupd(mem(rax ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vmovupd(mem(rax, r8, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm8)
	vfmadd231pd(ymm1, ymm3, ymm9)

	vmovupd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rax, r8, 4), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm12)
	vfmadd231pd(ymm1, ymm3, ymm13)

	vmovupd(mem(rax, r15, 1), ymm3)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	// ---------------------------------- iteration 1

	vmovupd(mem(rbx ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;

	vmovupd(mem(rax ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vmovupd(mem(rax, r8, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm8)
	vfmadd231pd(ymm1, ymm3, ymm9)

	vmovupd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rax, r8, 4), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm12)
	vfmadd231pd(ymm1, ymm3, ymm13)

	vmovupd(mem(rax, r15, 1), ymm3)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	// ---------------------------------- iteration 2

	vmovupd(mem(rbx ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;

	vmovupd(mem(rax ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vmovupd(mem(rax, r8, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm8)
	vfmadd231pd(ymm1, ymm3, ymm9)

	vmovupd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rax, r8, 4), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm12)
	vfmadd231pd(ymm1, ymm3, ymm13)

	vmovupd(mem(rax, r15, 1), ymm3)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	// ---------------------------------- iteration 3

	vmovupd(mem(rbx ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;

	vmovupd(mem(rax ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vmovupd(mem(rax, r8, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm8)
	vfmadd231pd(ymm1, ymm3, ymm9)

	vmovupd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rax, r8, 4), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm12)
	vfmadd231pd(ymm1, ymm3, ymm13)

	vmovupd(mem(rax, r15, 1), ymm3)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER16) // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi) // i = k_iter4;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKLEFT1) // if i == 0, jump to code that
	                   // considers k_left1 loop.
	                   // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4) // EDGE LOOP (ymm)

	vmovupd(mem(rbx ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;

	vmovupd(mem(rax ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vmovupd(mem(rax, r8, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm8)
	vfmadd231pd(ymm1, ymm3, ymm9)

	vmovupd(mem(rax, r13, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rax, r8, 4), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm12)
	vfmadd231pd(ymm1, ymm3, ymm13)

	vmovupd(mem(rax, r15, 1), ymm3)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER4) // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi) // i = k_left1;
	test(rsi, rsi) // check i via logical AND.
	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
	                // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1) // EDGE LOOP (scalar)

	// NOTE: We must use ymm registers here bc
	// using the xmm registers would zero out the
	// high bits of the destination registers,
	// which would destroy intermediate results.

	vmovsd(mem(rbx ), xmm0)
	vmovsd(mem(rbx, r11, 1), xmm1)
	add(imm(1*8), rbx) // b += 1*rs_b = 1*8;

	vmovsd(mem(rax ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovsd(mem(rax, r8, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vmovsd(mem(rax, r8, 2), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm8)
	vfmadd231pd(ymm1, ymm3, ymm9)

	vmovsd(mem(rax, r13, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovsd(mem(rax, r8, 4), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm12)
	vfmadd231pd(ymm1, ymm3, ymm13)

	vmovsd(mem(rax, r15, 1), xmm3)
	add(imm(1*8), rax) // a += 1*cs_a = 1*8;
	vfmadd231pd(ymm0, ymm3, ymm14)
	vfmadd231pd(ymm1, ymm3, ymm15)

	dec(rsi) // i -= 1;
	jne(.DLOOPKLEFT1) // iterate again if i != 0.

	label(.DPOSTACCUM)

	// Reduce each pair of accumulators to one xmm register holding the two
	// results of a row.
	// ymm4  ymm5
	// ymm6  ymm7
	// ymm8  ymm9
	// ymm10 ymm11
	// ymm12 ymm13
	// ymm14 ymm15

	vhaddpd( ymm5, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm4 )
	// xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5)

	vhaddpd( ymm7, ymm6, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm6 )

	vhaddpd( ymm9, ymm8, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm8 )

	vhaddpd( ymm11, ymm10, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm10 )

	vhaddpd( ymm13, ymm12, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm12 )

	vhaddpd( ymm15, ymm14, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm14 )

	// xmm4  = sum(ymm4)  sum(ymm5)
	// xmm6  = sum(ymm6)  sum(ymm7)
	// xmm8  = sum(ymm8)  sum(ymm9)
	// xmm10 = sum(ymm10) sum(ymm11)
	// xmm12 = sum(ymm12) sum(ymm13)
	// xmm14 = sum(ymm14) sum(ymm15)

	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

	vmulpd(xmm0, xmm4, xmm4) // scale by alpha
	vmulpd(xmm0, xmm6, xmm6)
	vmulpd(xmm0, xmm8, xmm8)
	vmulpd(xmm0, xmm10, xmm10)
	vmulpd(xmm0, xmm12, xmm12)
	vmulpd(xmm0, xmm14, xmm14)

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm3) // set ZF if beta == 0.
	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)

	// beta != 0: each row becomes beta*C + (alpha-scaled sums).
	vfmadd231pd(mem(rcx), xmm3, xmm4)
	vmovupd(xmm4, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), xmm3, xmm6)
	vmovupd(xmm6, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), xmm3, xmm8)
	vmovupd(xmm8, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), xmm3, xmm10)
	vmovupd(xmm10, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), xmm3, xmm12)
	vmovupd(xmm12, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), xmm3, xmm14)
	vmovupd(xmm14, mem(rcx))

	jmp(.DDONE) // jump to end.

	label(.DBETAZERO)
	label(.DROWSTORBZ)

	// beta == 0: store the results without reading C.
	vmovupd(xmm4, mem(rcx))
	add(rdi, rcx)

	vmovupd(xmm6, mem(rcx))
	add(rdi, rcx)

	vmovupd(xmm8, mem(rcx))
	add(rdi, rcx)

	vmovupd(xmm10, mem(rcx))
	add(rdi, rcx)

	vmovupd(xmm12, mem(rcx))
	add(rdi, rcx)

	vmovupd(xmm14, mem(rcx))

	label(.DDONE)

	lea(mem(r12, rdi, 4), r12) //
	lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c

	lea(mem(r14, r8, 4), r14) //
	lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a

	dec(r9) // ii -= 1;
	jne(.DLOOP3X4I) // iterate again if ii != 0.

	label(.DRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [m_iter] "m" (m_iter),
	  [k_iter16] "m" (k_iter16),
	  [k_iter4] "m" (k_iter4),
	  [k_left1] "m" (k_left1),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	consider_edge_cases:

	// Handle edge cases in the m dimension, if they exist.
	// m-dimension edge handling that closes the 6x2 kernel begun above this
	// span: the m0 % 6 leftover rows are peeled off in chunks of 3, 2 and 1,
	// advancing the A and C pointers after each chunk.
	if ( m_left )
	{
		const dim_t nr_cur = 2;
		// Index of the first row not covered by the 6-row assembly loop.
		const dim_t i_edge = m0 - ( dim_t )m_left;

		double* restrict cij = c + i_edge*rs_c;
		double* restrict bj = b;
		double* restrict ai = a + i_edge*rs_a;

		if ( 3 <= m_left )
		{
			const dim_t mr_cur = 3;

			bli_dgemmsup_rd_haswell_asm_3x2m
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
		}
		if ( 2 <= m_left )
		{
			const dim_t mr_cur = 2;

			bli_dgemmsup_rd_haswell_asm_2x2m
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
		}
		if ( 1 == m_left )
		{
			const dim_t mr_cur = 1;

			bli_dgemmsup_rd_haswell_asm_1x2m
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
		}
	}
}

/*
 * bli_dgemmsup_rd_haswell_asm_3x2m
 *
 * Updates a 3x2 block of C as C := beta*C + alpha*(A*B), where every result
 * is a dot product of length k0 between one row of A and one column of B.
 *
 * Layout as used by the code below:
 *   - A is walked along k in fixed 8-byte steps; rows are rs_a elements
 *     apart. cs_a is never loaded.
 *   - B is walked along k in fixed 8-byte steps; the second column sits
 *     cs_b elements after the first. rs_b is never loaded.
 *   - Each row of C is stored as 2 contiguous doubles, rows rs_c elements
 *     apart. cs_c is never loaded.
 *
 * k0 is consumed as k0/16 trips of a 4x-unrolled loop, then (k0%16)/4
 * 4-element trips, then k0%4 scalar trips.
 *
 * conja, conjb, m0, n0, data and cntx are only referenced inside the
 * disabled "#if 0" reference call.
 */
void bli_dgemmsup_rd_haswell_asm_3x2m
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
#if 0
	bli_dgemmsup_r_haswell_ref
	(
	  conja, conjb, m0, n0, k0,
	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
	  beta, c, rs_c0, cs_c0, data, cntx
	);
	return;
#endif

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4 = k_left16 / 4;
	uint64_t k_left1 = k_left16 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	// Clear the six accumulators. Accumulator map (row of A x column of B):
	//   ymm4 ymm5
	//   ymm6 ymm7
	//   ymm8 ymm9
#if 0
	vzeroall() // zero all xmm/ymm registers.
#else
	// skylake can execute 3 vxorpd ipc with
	// a latency of 1 cycle, while vzeroall
	// has a latency of 12 cycles.
	vxorpd(ymm4, ymm4, ymm4)
	vxorpd(ymm5, ymm5, ymm5)
	vxorpd(ymm6, ymm6, ymm6)
	vxorpd(ymm7, ymm7, ymm7)
	vxorpd(ymm8, ymm8, ymm8)
	vxorpd(ymm9, ymm9, ymm9)
#endif

	mov(var(a), rax) // rax = address of a
	mov(var(rs_a), r8) // r8 = rs_a
	lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)

	mov(var(b), rbx) // rbx = address of b
	mov(var(cs_b), r11) // r11 = cs_b
	lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

	mov(var(c), rcx) // rcx = address of c
	mov(var(rs_c), rdi) // rdi = rs_c
	lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

	prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c

	mov(var(k_iter16), rsi) // i = k_iter16;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKITER4) // if i == 0, jump to code that
	                   // contains the k_iter4 loop.

	label(.DLOOPKITER16) // MAIN LOOP

	// Each sub-iteration loads 4 consecutive k-elements of both columns of B
	// (ymm0, ymm1) and of each of the three rows of A (ymm3, reused), and
	// accumulates the element-wise products.

	// ---------------------------------- iteration 0

	vmovupd(mem(rbx ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;

	vmovupd(mem(rax ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vmovupd(mem(rax, r8, 2), ymm3)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm8)
	vfmadd231pd(ymm1, ymm3, ymm9)

	// ---------------------------------- iteration 1

	vmovupd(mem(rbx ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;

	vmovupd(mem(rax ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vmovupd(mem(rax, r8, 2), ymm3)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm8)
	vfmadd231pd(ymm1, ymm3, ymm9)

	// ---------------------------------- iteration 2

	vmovupd(mem(rbx ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;

	vmovupd(mem(rax ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vmovupd(mem(rax, r8, 2), ymm3)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm8)
	vfmadd231pd(ymm1, ymm3, ymm9)

	// ---------------------------------- iteration 3

	vmovupd(mem(rbx ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;

	vmovupd(mem(rax ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vmovupd(mem(rax, r8, 2), ymm3)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm8)
	vfmadd231pd(ymm1, ymm3, ymm9)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER16) // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi) // i = k_iter4;
	test(rsi, rsi) // check i via logical AND.
	je(.DCONSIDKLEFT1) // if i == 0, jump to code that
	                   // considers k_left1 loop.
	                   // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4) // EDGE LOOP (ymm)

	vmovupd(mem(rbx ), ymm0)
	vmovupd(mem(rbx, r11, 1), ymm1)
	add(imm(4*8), rbx) // b += 4*rs_b = 4*8;

	vmovupd(mem(rax ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rax, r8, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vmovupd(mem(rax, r8, 2), ymm3)
	add(imm(4*8), rax) // a += 4*cs_a = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm8)
	vfmadd231pd(ymm1, ymm3, ymm9)

	dec(rsi) // i -= 1;
	jne(.DLOOPKITER4) // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi) // i = k_left1;
	test(rsi, rsi) // check i via logical AND.
	je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
	                // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1) // EDGE LOOP (scalar)

	// NOTE: We must use ymm registers here bc
	// using the xmm registers would zero out the
	// high bits of the destination registers,
	// which would destroy intermediate results.

	vmovsd(mem(rbx ), xmm0)
	vmovsd(mem(rbx, r11, 1), xmm1)
	add(imm(1*8), rbx) // b += 1*rs_b = 1*8;

	vmovsd(mem(rax ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovsd(mem(rax, r8, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	vmovsd(mem(rax, r8, 2), xmm3)
	add(imm(1*8), rax) // a += 1*cs_a = 1*8;
	vfmadd231pd(ymm0, ymm3, ymm8)
	vfmadd231pd(ymm1, ymm3, ymm9)

	dec(rsi) // i -= 1;
	jne(.DLOOPKLEFT1) // iterate again if i != 0.

	label(.DPOSTACCUM)

	// Reduce each pair of accumulators to one xmm register holding the two
	// results of a row.
	// ymm4 ymm5
	// ymm6 ymm7
	// ymm8 ymm9

	vhaddpd( ymm5, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm4 )
	// xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5)

	vhaddpd( ymm7, ymm6, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm6 )

	vhaddpd( ymm9, ymm8, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm8 )

	// xmm4 = sum(ymm4) sum(ymm5)
	// xmm6 = sum(ymm6) sum(ymm7)
	// xmm8 = sum(ymm8) sum(ymm9)

	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

	vmulpd(xmm0, xmm4, xmm4) // scale by alpha
	vmulpd(xmm0, xmm6, xmm6)
	vmulpd(xmm0, xmm8, xmm8)

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm3) // set ZF if beta == 0.
	je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)

	// beta != 0: each row becomes beta*C + (alpha-scaled sums).
	vfmadd231pd(mem(rcx), xmm3, xmm4)
	vmovupd(xmm4, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), xmm3, xmm6)
	vmovupd(xmm6, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), xmm3, xmm8)
	vmovupd(xmm8, mem(rcx))

	jmp(.DDONE) // jump to end.
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_2x2m: dot-product double-precision kernel that updates a 2x2 block, C := beta*C + alpha*A*B. The assembly reads two rows of A (rs_a apart) and two columns of B (cs_b apart) as contiguous 4-double vectors along k, accumulates the four products in ymm4-ymm7, reduces each accumulator to a scalar, scales by alpha, and writes two 2-double rows of C (rs_c apart); C is not loaded when beta == 0. k0 is split into k0/16 passes of the 4x-unrolled loop, (k0%16)/4 single vector steps, and (k0%16)%4 scalar steps. conja, conjb, m0, n0, data and cntx are not read here, and cs_a, rs_b and cs_c are passed to the asm block but never referenced by it. NOTE(review): the fixed 8-byte steps along k and the contiguous 2-double stores presumably rely on cs_a == rs_b == cs_c == 1 -- confirm against the caller. */ void bli_dgemmsup_rd_haswell_asm_2x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; /* rrc: -------- -- -- -- | | -------- -- -- -- ... 
| | -------- += -- -- -- | | -------- -- -- -- | | -------- -- -- -- : -------- -- -- -- : */ // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) #endif mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) // xmm4 = sum(ymm4) sum(ymm5) // xmm6 = sum(ymm6) sum(ymm7) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. 
vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_1x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
/* Kernel summary (bli_dgemmsup_rd_haswell_asm_1x2m): the assembly below forms a 1x2 block, C := beta*C + alpha*A*B, as two dot products over k between one row of A and two columns of B (cs_b apart), accumulated in ymm4 and ymm5, reduced into xmm4, scaled by alpha and stored as two contiguous doubles at c; C is not loaded when beta == 0. k0 is split into k0/16 passes of the 4x-unrolled loop, (k0%16)/4 single vector steps and (k0%16)%4 scalar steps. rs_a, cs_a, rs_b, rs_c and cs_c are passed to the asm block but never referenced by it. */ uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; /* rrc: -------- -- -- -- | | -------- -- -- -- ... | | -------- += -- -- -- | | -------- -- -- -- | | -------- -- -- -- : -------- -- -- -- : */ // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) #endif mov(var(a), rax) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a //lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm5 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5) // xmm4 = sum(ymm4) sum(ymm5) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8m.c.worksij000066400000000000000000004471741427272030600327400ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_r_haswell_ref ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_r_haswell_ref ) // Define parameters and variables for edge case kernel map. 
#define NUM_MR 4 #define NUM_NR 4 #define FUNCPTR_T dgemmsup_ker_ft #if 0 static dim_t mrs[NUM_MR] = { 6, 3, 2, 1 }; static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 }; static FUNCPTR_T kmap[NUM_MR][NUM_NR] = { /* 8 4 2 1 */ /* 6 */ { bli_dgemmsup_rd_haswell_asm_6x8m, bli_dgemmsup_rd_haswell_asm_6x4m, bli_dgemmsup_rd_haswell_asm_6x2m, bli_dgemmsup_r_haswell_ref }, /* 3 */ { bli_dgemmsup_rd_haswell_asm_3x8m, bli_dgemmsup_rd_haswell_asm_3x4m, bli_dgemmsup_rd_haswell_asm_3x2m, bli_dgemmsup_r_haswell_ref }, /* 2 */ { bli_dgemmsup_rd_haswell_asm_2x8m, bli_dgemmsup_rd_haswell_asm_2x4m, bli_dgemmsup_rd_haswell_asm_2x2m, bli_dgemmsup_r_haswell_ref }, /* 1 */ { bli_dgemmsup_rd_haswell_asm_1x8m, bli_dgemmsup_rd_haswell_asm_1x4m, bli_dgemmsup_rd_haswell_asm_1x2m, bli_dgemmsup_r_haswell_ref } }; #endif void bli_dgemmsup_rd_haswell_asm_6x8m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t n_left = n0 % 8; // First check whether this is a edge case in the n dimension. If so, // dispatch other 6x?m kernels, as needed. 
if ( n_left ) { double* restrict cij = c; double* restrict bj = b; double* restrict ai = a; if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_dgemmsup_rd_haswell_asm_6x4m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_dgemmsup_rd_haswell_asm_6x2m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 0 const dim_t nr_cur = 1; bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_dgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. 
//mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] mov(imm(0), r15) // jj = 0; label(.DLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*8), rsi) // rsi *= cs_c = 1*8 lea(mem(r12, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem(r14), rax) // rax = a_ii; #if 0 mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rdi) // rdi = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 
4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_b = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(4), r15) // jj += 4; cmp(imm(4), r15) // compare jj to 4 jle(.DLOOP3X4J) // if jj <= 4, jump to beginning // of jj loop; otherwise, loop ends. lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left )
	{
		const dim_t nr_cur = 8;
		const dim_t i_edge = m0 - ( dim_t )m_left;

		double* restrict cij = c + i_edge*rs_c;
		double* restrict bj = b;
		double* restrict ai = a + i_edge*rs_a;

		if ( 2 == m_left )
		{
			const dim_t mr_cur = 2;

			bli_dgemmsup_rd_haswell_asm_2x8m
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			//cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
		}
		if ( 1 == m_left )
		{
			const dim_t mr_cur = 1;

			bli_dgemmsup_rd_haswell_asm_1x8m
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
		}
	}
}

// bli_dgemmsup_rd_haswell_asm_3x8m
//
// Double-precision dot-product ("rd") kernel that updates a 3x8 block of C.
// For each of the 3 rows and 8 columns it accumulates the dot product of a
// row of A with a column of B over k0 elements, scales the result by *alpha
// and then either adds (*beta) * C (beta != 0) or overwrites C (beta == 0).
//
// What the assembly below actually reads:
// - a, rs_a0: the three rows start at a, a + rs_a and a + 2*rs_a. Elements
//   along k are loaded as contiguous doubles; cs_a0 is handed to the asm
//   block but never loaded.
// - b, cs_b0: the columns start at b + j*cs_b. Elements along k are loaded
//   as contiguous doubles; rs_b0 is handed to the asm block but never loaded.
// - c, rs_c0: rows of C are rs_c apart and four contiguous doubles are
//   stored per row per jj iteration; cs_c0 is never loaded (the column
//   offset is computed with a fixed 8-byte stride).
// - k0 is split into k0/16 passes of the 4x-unrolled 4-wide loop,
//   (k0%16)/4 passes of the 4-wide loop and k0%4 scalar steps.
// - conja, conjb, m0, n0, data and cntx are only referenced by the disabled
//   (#if 0) reference call and by commented-out code.
void bli_dgemmsup_rd_haswell_asm_3x8m
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
#if 0
	bli_dgemmsup_r_haswell_ref
	(
	  conja, conjb, m0, n0, k0,
	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
	  beta, c, rs_c0, cs_c0, data, cntx
	);
	return;
#endif
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4 = k_left16 / 4;
	uint64_t k_left1 = k_left16 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()  // zero all xmm/ymm registers.

	mov(var(a), r14)  // load address of a.
	mov(var(rs_a), r8)  // load rs_a
	//mov(var(cs_a), r9)  // load cs_a
	lea(mem(, r8, 8), r8)  // rs_a *= sizeof(double)
	//lea(mem(, r9, 8), r9)  // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13)  // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)  // r15 = 5*rs_a

	mov(var(b), rdx)  // load address of b.
	//mov(var(rs_b), r10)  // load rs_b
	mov(var(cs_b), r11)  // load cs_b
	//lea(mem(, r10, 8), r10)  // rs_b *= sizeof(double)
	lea(mem(, r11, 8), r11)  // cs_b *= sizeof(double)

	lea(mem(r11, r11, 2), r13)  // r13 = 3*cs_b

	mov(var(c), r12)  // load address of c
	mov(var(rs_c), rdi)  // load rs_c
	lea(mem(, rdi, 8), rdi)  // rs_c *= sizeof(double)

	// r12 = rcx = c
	// r14 = rax = a
	// rdx = rbx = b
	// r9  = unused
	// r15 = n dim index jj
	// r10 = unused

	mov(imm(0), r15)  // jj = 0;

	label(.DLOOP3X4J)  // LOOP OVER jj = [ 0 4 ] (two 4-column halves)

	// Accumulator layout for this 3x4 half: ymm(4 + r + 3*cc) collects the
	// elementwise products of row r of A with column jj+cc of B.

#if 0
	vzeroall()  // zero all xmm/ymm registers.
#else
	// skylake can execute 3 vxorpd ipc with
	// a latency of 1 cycle, while vzeroall
	// has a latency of 12 cycles.
	vxorpd(ymm4, ymm4, ymm4)
	vxorpd(ymm5, ymm5, ymm5)
	vxorpd(ymm6, ymm6, ymm6)
	vxorpd(ymm7, ymm7, ymm7)
	vxorpd(ymm8, ymm8, ymm8)
	vxorpd(ymm9, ymm9, ymm9)
	vxorpd(ymm10, ymm10, ymm10)
	vxorpd(ymm11, ymm11, ymm11)
	vxorpd(ymm12, ymm12, ymm12)
	vxorpd(ymm13, ymm13, ymm13)
	vxorpd(ymm14, ymm14, ymm14)
	vxorpd(ymm15, ymm15, ymm15)
#endif

	lea(mem( , r15, 1), rsi)  // rsi = r15 = jj;
	imul(imm(1*8), rsi)  // rsi *= cs_c = 1*8 (column stride of C taken as 1)
	lea(mem(r12, rsi, 1), rcx)  // rcx = c + jj*cs_c;

	lea(mem( , r15, 1), rsi)  // rsi = r15 = jj;
	imul(r11, rsi)  // rsi *= cs_b;
	lea(mem(rdx, rsi, 1), rbx)  // rbx = b + jj*cs_b;

	lea(mem(r14), rax)  // rax = a;

#if 0
	prefetch(0, mem(rcx, 7*8))  // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 7*8))  // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 7*8))  // prefetch c + 2*rs_c
#else
	prefetch(0, mem(rcx, 3*8))  // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8))  // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 3*8))  // prefetch c + 2*rs_c
#endif

	mov(var(k_iter16), rsi)  // i = k_iter16;
	test(rsi, rsi)  // check i via logical AND.
	je(.DCONSIDKITER4)  // if i == 0, jump to code that
	                    // contains the k_iter4 loop.

	label(.DLOOPKITER16)  // MAIN LOOP (4 x 4 = 16 k-elements per pass)

	// ---------------------------------- iteration 0

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax)  // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)  // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	// ---------------------------------- iteration 1

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax)  // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)  // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	// ---------------------------------- iteration 2

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax)  // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)  // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	// ---------------------------------- iteration 3

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax)  // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)  // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	dec(rsi)  // i -= 1;
	jne(.DLOOPKITER16)  // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi)  // i = k_iter4;
	test(rsi, rsi)  // check i via logical AND.
	je(.DCONSIDKLEFT1)  // if i == 0, jump to code that
	                    // considers k_left1 loop.
	                    // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4)  // EDGE LOOP (ymm)

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax)  // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)  // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	dec(rsi)  // i -= 1;
	jne(.DLOOPKITER4)  // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi)  // i = k_left1;
	test(rsi, rsi)  // check i via logical AND.
	je(.DPOSTACCUM)  // if i == 0, we're done; jump to end.
	                 // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1)  // EDGE LOOP (scalar)
	// NOTE: We must use ymm registers here bc
	// using the xmm registers would zero out the
	// high bits of the destination registers,
	// which would destroy intermediate results.
	// (The operands are loaded one double at a time with vmovsd; the
	// upper lanes of those source registers are expected to be zero.)

	vmovsd(mem(rax ), xmm0)
	vmovsd(mem(rax, r8, 1), xmm1)
	vmovsd(mem(rax, r8, 2), xmm2)
	add(imm(1*8), rax)  // a += 1*cs_a = 1*8;

	vmovsd(mem(rbx ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovsd(mem(rbx, r11, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovsd(mem(rbx, r11, 2), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovsd(mem(rbx, r13, 1), xmm3)
	add(imm(1*8), rbx)  // b += 1*rs_b = 1*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	dec(rsi)  // i -= 1;
	jne(.DLOOPKLEFT1)  // iterate again if i != 0.

	label(.DPOSTACCUM)

	// Reduce each accumulator to a single sum and pack four sums per row.
	// ymm4 ymm7 ymm10 ymm13
	// ymm5 ymm8 ymm11 ymm14
	// ymm6 ymm9 ymm12 ymm15

	vhaddpd( ymm7, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )
	// xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)

	vhaddpd( ymm13, ymm10, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )
	// xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)

	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )

	vhaddpd( ymm8, ymm5, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )

	vhaddpd( ymm14, ymm11, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )

	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )

	vhaddpd( ymm9, ymm6, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )

	vhaddpd( ymm15, ymm12, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )

	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )

	// ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
	// ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
	// ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)

	mov(var(alpha), rax)  // load address of alpha
	mov(var(beta), rbx)  // load address of beta
	vbroadcastsd(mem(rax), ymm0)  // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)  // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4)  // scale by alpha
	vmulpd(ymm0, ymm5, ymm5)
	vmulpd(ymm0, ymm6, ymm6)

	//mov(var(cs_c), rsi)  // load cs_c
	//lea(mem(, rsi, 8), rsi)  // rsi = cs_c * sizeof(double)

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0)  // set ymm0 to zero.
	vucomisd(xmm0, xmm3)  // set ZF if beta == 0.
	je(.DBETAZERO)  // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)  // beta != 0: C := beta*C + alpha*(A*B), one row at a time

	vfmadd231pd(mem(rcx), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)  // advance to the next row of C

	vfmadd231pd(mem(rcx), ymm3, ymm5)
	vmovupd(ymm5, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), ymm3, ymm6)
	vmovupd(ymm6, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE)  // jump to end.

	label(.DBETAZERO)

	label(.DROWSTORBZ)  // beta == 0: store alpha*(A*B) without reading C

	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vmovupd(ymm5, mem(rcx))
	add(rdi, rcx)

	vmovupd(ymm6, mem(rcx))
	//add(rdi, rcx)

	label(.DDONE)

	add(imm(4), r15)  // jj += 4;
	cmp(imm(4), r15)  // compare jj to 4
	jle(.DLOOP3X4J)  // if jj <= 4, jump to beginning
	                 // of jj loop; otherwise, loop ends.

	label(.DRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter16] "m" (k_iter16),
	  [k_iter4] "m" (k_iter4),
	  [k_left1] "m" (k_left1),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

// bli_dgemmsup_rd_haswell_asm_2x8m
//
// Two-row variant of the 3x8 dot-product kernel: updates a 2x8 block of C
// with alpha * (rows of A) . (columns of B), adding (*beta) * C when beta != 0
// and overwriting C when beta == 0.
//
// As visible in the assembly below: rows of A are rs_a apart and columns of B
// are cs_b apart, with elements along k loaded as contiguous doubles; rows of
// C are rs_c apart with four contiguous doubles stored per row per jj
// iteration. cs_a0, rs_b0 and cs_c0 are handed to the asm block but never
// loaded. conja, conjb, m0, n0, data and cntx are only referenced by the
// disabled (#if 0) reference call and by commented-out code.
void bli_dgemmsup_rd_haswell_asm_2x8m
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
#if 0
	bli_dgemmsup_r_haswell_ref
	(
	  conja, conjb, m0, n0, k0,
	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
	  beta, c, rs_c0, cs_c0, data, cntx
	);
	return;
#endif
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4 = k_left16 / 4;
	uint64_t k_left1 = k_left16 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()  // zero all xmm/ymm registers.

	mov(var(a), r14)  // load address of a.
	mov(var(rs_a), r8)  // load rs_a
	//mov(var(cs_a), r9)  // load cs_a
	lea(mem(, r8, 8), r8)  // rs_a *= sizeof(double)
	//lea(mem(, r9, 8), r9)  // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13)  // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)  // r15 = 5*rs_a

	mov(var(b), rdx)  // load address of b.
	//mov(var(rs_b), r10)  // load rs_b
	mov(var(cs_b), r11)  // load cs_b
	//lea(mem(, r10, 8), r10)  // rs_b *= sizeof(double)
	lea(mem(, r11, 8), r11)  // cs_b *= sizeof(double)

	lea(mem(r11, r11, 2), r13)  // r13 = 3*cs_b

	mov(var(c), r12)  // load address of c
	mov(var(rs_c), rdi)  // load rs_c
	lea(mem(, rdi, 8), rdi)  // rs_c *= sizeof(double)

	// r12 = rcx = c
	// r14 = rax = a
	// rdx = rbx = b
	// r9  = unused
	// r15 = n dim index jj
	// r10 = unused

	mov(imm(0), r15)  // jj = 0;

	// NOTE: the label says 3X4, but the loop body below handles 2 rows by
	// 4 columns per pass.
	label(.DLOOP3X4J)  // LOOP OVER jj = [ 0 4 ] (two 4-column halves)

	// Accumulator layout for this 2x4 half: ymm(4 + r + 3*cc) collects the
	// elementwise products of row r of A with column jj+cc of B
	// (ymm6, ymm9, ymm12 and ymm15 are not used here).

#if 0
	vzeroall()  // zero all xmm/ymm registers.
#else
	// skylake can execute 3 vxorpd ipc with
	// a latency of 1 cycle, while vzeroall
	// has a latency of 12 cycles.
	vxorpd(ymm4, ymm4, ymm4)
	vxorpd(ymm5, ymm5, ymm5)
	vxorpd(ymm7, ymm7, ymm7)
	vxorpd(ymm8, ymm8, ymm8)
	vxorpd(ymm10, ymm10, ymm10)
	vxorpd(ymm11, ymm11, ymm11)
	vxorpd(ymm13, ymm13, ymm13)
	vxorpd(ymm14, ymm14, ymm14)
#endif

	lea(mem( , r15, 1), rsi)  // rsi = r15 = jj;
	imul(imm(1*8), rsi)  // rsi *= cs_c = 1*8 (column stride of C taken as 1)
	lea(mem(r12, rsi, 1), rcx)  // rcx = c + jj*cs_c;

	lea(mem( , r15, 1), rsi)  // rsi = r15 = jj;
	imul(r11, rsi)  // rsi *= cs_b;
	lea(mem(rdx, rsi, 1), rbx)  // rbx = b + jj*cs_b;

	lea(mem(r14), rax)  // rax = a;

#if 0
	prefetch(0, mem(rcx, 7*8))  // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 7*8))  // prefetch c + 1*rs_c
#else
	prefetch(0, mem(rcx, 3*8))  // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8))  // prefetch c + 1*rs_c
#endif

	mov(var(k_iter16), rsi)  // i = k_iter16;
	test(rsi, rsi)  // check i via logical AND.
	je(.DCONSIDKITER4)  // if i == 0, jump to code that
	                    // contains the k_iter4 loop.

	label(.DLOOPKITER16)  // MAIN LOOP (4 x 4 = 16 k-elements per pass)

	// ---------------------------------- iteration 0

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	add(imm(4*8), rax)  // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)  // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	// ---------------------------------- iteration 1

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	add(imm(4*8), rax)  // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)  // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	// ---------------------------------- iteration 2

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	add(imm(4*8), rax)  // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)  // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	// ---------------------------------- iteration 3

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	add(imm(4*8), rax)  // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)  // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	dec(rsi)  // i -= 1;
	jne(.DLOOPKITER16)  // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi)  // i = k_iter4;
	test(rsi, rsi)  // check i via logical AND.
	je(.DCONSIDKLEFT1)  // if i == 0, jump to code that
	                    // considers k_left1 loop.
	                    // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4)  // EDGE LOOP (ymm)

	vmovupd(mem(rax ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	add(imm(4*8), rax)  // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)  // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	dec(rsi)  // i -= 1;
	jne(.DLOOPKITER4)  // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi)  // i = k_left1;
	test(rsi, rsi)  // check i via logical AND.
	je(.DPOSTACCUM)  // if i == 0, we're done; jump to end.
	                 // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1)  // EDGE LOOP (scalar)
	// NOTE: We must use ymm registers here bc
	// using the xmm registers would zero out the
	// high bits of the destination registers,
	// which would destroy intermediate results.
	// (The operands are loaded one double at a time with vmovsd; the
	// upper lanes of those source registers are expected to be zero.)

	vmovsd(mem(rax ), xmm0)
	vmovsd(mem(rax, r8, 1), xmm1)
	add(imm(1*8), rax)  // a += 1*cs_a = 1*8;

	vmovsd(mem(rbx ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovsd(mem(rbx, r11, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovsd(mem(rbx, r11, 2), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovsd(mem(rbx, r13, 1), xmm3)
	add(imm(1*8), rbx)  // b += 1*rs_b = 1*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	dec(rsi)  // i -= 1;
	jne(.DLOOPKLEFT1)  // iterate again if i != 0.

	label(.DPOSTACCUM)

	// Reduce each accumulator to a single sum and pack four sums per row.
	// ymm4 ymm7 ymm10 ymm13
	// ymm5 ymm8 ymm11 ymm14

	vhaddpd( ymm7, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )
	// xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)

	vhaddpd( ymm13, ymm10, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )
	// xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)

	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )

	vhaddpd( ymm8, ymm5, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )

	vhaddpd( ymm14, ymm11, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )

	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )

	// ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
	// ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)

	mov(var(alpha), rax)  // load address of alpha
	mov(var(beta), rbx)  // load address of beta
	vbroadcastsd(mem(rax), ymm0)  // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)  // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4)  // scale by alpha
	vmulpd(ymm0, ymm5, ymm5)

	//mov(var(cs_c), rsi)  // load cs_c
	//lea(mem(, rsi, 8), rsi)  // rsi = cs_c * sizeof(double)

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0)  // set ymm0 to zero.
	vucomisd(xmm0, xmm3)  // set ZF if beta == 0.
	je(.DBETAZERO)  // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)  // beta != 0: C := beta*C + alpha*(A*B), one row at a time

	vfmadd231pd(mem(rcx), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)  // advance to the next row of C

	vfmadd231pd(mem(rcx), ymm3, ymm5)
	vmovupd(ymm5, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE)  // jump to end.

	label(.DBETAZERO)

	label(.DROWSTORBZ)  // beta == 0: store alpha*(A*B) without reading C

	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vmovupd(ymm5, mem(rcx))
	//add(rdi, rcx)

	label(.DDONE)

	add(imm(4), r15)  // jj += 4;
	cmp(imm(4), r15)  // compare jj to 4
	jle(.DLOOP3X4J)  // if jj <= 4, jump to beginning
	                 // of jj loop; otherwise, loop ends.
label(.DRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter16] "m" (k_iter16),
	  [k_iter4] "m" (k_iter4),
	  [k_left1] "m" (k_left1),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

// bli_dgemmsup_rd_haswell_asm_1x8m
//
// Single-row dot-product ("rd") kernel: updates a 1x8 block of C with
// alpha * (one row of A) . (eight columns of B), adding (*beta) * C when
// beta != 0 and overwriting C when beta == 0.
//
// As visible in the assembly below: only a, b, cs_b, c, alpha, beta and the
// k loop counts are loaded. Elements of A and B along k are read as
// contiguous doubles, columns of B are cs_b apart, and four contiguous
// doubles of C are stored per jj iteration. rs_a, cs_a, rs_b, rs_c and cs_c
// are handed to the asm block but never loaded (the loads of rs_a and rs_c
// are commented out since there is only one row). conja, conjb, m0, n0,
// data and cntx are only referenced by the disabled (#if 0) reference call
// and by commented-out code.
void bli_dgemmsup_rd_haswell_asm_1x8m
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
#if 0
	bli_dgemmsup_r_haswell_ref
	(
	  conja, conjb, m0, n0, k0,
	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
	  beta, c, rs_c0, cs_c0, data, cntx
	);
	return;
#endif
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4 = k_left16 / 4;
	uint64_t k_left1 = k_left16 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()  // zero all xmm/ymm registers.

	mov(var(a), r14)  // load address of a.
	//mov(var(rs_a), r8)  // load rs_a
	//mov(var(cs_a), r9)  // load cs_a
	//lea(mem(, r8, 8), r8)  // rs_a *= sizeof(double)
	//lea(mem(, r9, 8), r9)  // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13)  // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)  // r15 = 5*rs_a

	mov(var(b), rdx)  // load address of b.
	//mov(var(rs_b), r10)  // load rs_b
	mov(var(cs_b), r11)  // load cs_b
	//lea(mem(, r10, 8), r10)  // rs_b *= sizeof(double)
	lea(mem(, r11, 8), r11)  // cs_b *= sizeof(double)

	lea(mem(r11, r11, 2), r13)  // r13 = 3*cs_b

	mov(var(c), r12)  // load address of c
	//mov(var(rs_c), rdi)  // load rs_c
	//lea(mem(, rdi, 8), rdi)  // rs_c *= sizeof(double)

	// r12 = rcx = c
	// r14 = rax = a
	// rdx = rbx = b
	// r9  = unused (this kernel has no m-dimension loop)
	// r15 = n dim index jj

	mov(imm(0), r15)  // jj = 0;

	// NOTE: the label says 3X4, but the loop body below handles 1 row by
	// 4 columns per pass.
	label(.DLOOP3X4J)  // LOOP OVER jj = [ 0 4 ] (two 4-column halves)

	// Accumulators for this 1x4 half: ymm4, ymm7, ymm10 and ymm13 collect
	// the elementwise products of the row of A with columns jj+0..jj+3 of B.

#if 0
	vzeroall()  // zero all xmm/ymm registers.
#else
	// skylake can execute 3 vxorpd ipc with
	// a latency of 1 cycle, while vzeroall
	// has a latency of 12 cycles.
	vxorpd(ymm4, ymm4, ymm4)
	vxorpd(ymm7, ymm7, ymm7)
	vxorpd(ymm10, ymm10, ymm10)
	vxorpd(ymm13, ymm13, ymm13)
#endif

	lea(mem( , r15, 1), rsi)  // rsi = r15 = jj;
	imul(imm(1*8), rsi)  // rsi *= cs_c = 1*8 (column stride of C taken as 1)
	lea(mem(r12, rsi, 1), rcx)  // rcx = c + jj*cs_c;

	lea(mem( , r15, 1), rsi)  // rsi = r15 = jj;
	imul(r11, rsi)  // rsi *= cs_b;
	lea(mem(rdx, rsi, 1), rbx)  // rbx = b + jj*cs_b;

	lea(mem(r14), rax)  // rax = a;

#if 0
	prefetch(0, mem(rcx, 7*8))  // prefetch c + 0*rs_c
#else
	prefetch(0, mem(rcx, 3*8))  // prefetch c + 0*rs_c
#endif

	mov(var(k_iter16), rsi)  // i = k_iter16;
	test(rsi, rsi)  // check i via logical AND.
	je(.DCONSIDKITER4)  // if i == 0, jump to code that
	                    // contains the k_iter4 loop.

	label(.DLOOPKITER16)  // MAIN LOOP (4 x 4 = 16 k-elements per pass)

	// ---------------------------------- iteration 0

	vmovupd(mem(rax ), ymm0)
	add(imm(4*8), rax)  // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)  // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	// ---------------------------------- iteration 1

	vmovupd(mem(rax ), ymm0)
	add(imm(4*8), rax)  // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)  // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	// ---------------------------------- iteration 2

	vmovupd(mem(rax ), ymm0)
	add(imm(4*8), rax)  // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)  // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	// ---------------------------------- iteration 3

	vmovupd(mem(rax ), ymm0)
	add(imm(4*8), rax)  // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)  // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	dec(rsi)  // i -= 1;
	jne(.DLOOPKITER16)  // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi)  // i = k_iter4;
	test(rsi, rsi)  // check i via logical AND.
	je(.DCONSIDKLEFT1)  // if i == 0, jump to code that
	                    // considers k_left1 loop.
	                    // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4)  // EDGE LOOP (ymm)

	vmovupd(mem(rax ), ymm0)
	add(imm(4*8), rax)  // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)  // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	dec(rsi)  // i -= 1;
	jne(.DLOOPKITER4)  // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi)  // i = k_left1;
	test(rsi, rsi)  // check i via logical AND.
	je(.DPOSTACCUM)  // if i == 0, we're done; jump to end.
	                 // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1)  // EDGE LOOP (scalar)
	// NOTE: We must use ymm registers here bc
	// using the xmm registers would zero out the
	// high bits of the destination registers,
	// which would destroy intermediate results.
	// (The operands are loaded one double at a time with vmovsd; the
	// upper lanes of those source registers are expected to be zero.)

	vmovsd(mem(rax ), xmm0)
	add(imm(1*8), rax)  // a += 1*cs_a = 1*8;

	vmovsd(mem(rbx ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovsd(mem(rbx, r11, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovsd(mem(rbx, r11, 2), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovsd(mem(rbx, r13, 1), xmm3)
	add(imm(1*8), rbx)  // b += 1*rs_b = 1*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	dec(rsi)  // i -= 1;
	jne(.DLOOPKLEFT1)  // iterate again if i != 0.

	label(.DPOSTACCUM)

	// Reduce each accumulator to a single sum and pack the four sums.
	// ymm4 ymm7 ymm10 ymm13

	vhaddpd( ymm7, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )
	// xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)

	vhaddpd( ymm13, ymm10, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )
	// xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)

	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )

	// ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)

	mov(var(alpha), rax)  // load address of alpha
	mov(var(beta), rbx)  // load address of beta
	vbroadcastsd(mem(rax), ymm0)  // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)  // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4)  // scale by alpha

	//mov(var(cs_c), rsi)  // load cs_c
	//lea(mem(, rsi, 8), rsi)  // rsi = cs_c * sizeof(double)

	// now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0)  // set ymm0 to zero.
	vucomisd(xmm0, xmm3)  // set ZF if beta == 0.
	je(.DBETAZERO)  // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)  // beta != 0: C := beta*C + alpha*(A*B)

	vfmadd231pd(mem(rcx), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE)  // jump to end.

	label(.DBETAZERO)

	label(.DROWSTORBZ)  // beta == 0: store alpha*(A*B) without reading C

	vmovupd(ymm4, mem(rcx))
	//add(rdi, rcx)

	label(.DDONE)

	add(imm(4), r15)  // jj += 4;
	cmp(imm(4), r15)  // compare jj to 4
	jle(.DLOOP3X4J)  // if jj <= 4, jump to beginning
	                 // of jj loop; otherwise, loop ends.

	label(.DRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter16] "m" (k_iter16),
	  [k_iter4] "m" (k_iter4),
	  [k_left1] "m" (k_left1),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

void bli_dgemmsup_rd_haswell_asm_6x4m
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       double* restrict alpha,
       double* restrict a, inc_t rs_a0, inc_t cs_a0,
       double* restrict b, inc_t rs_b0, inc_t cs_b0,
       double* restrict beta,
       double* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
#if 0
	bli_dgemmsup_r_haswell_ref
	(
	  conja, conjb, m0, n0, k0,
	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
	  beta, c, rs_c0, cs_c0, data, cntx
	);
	return;
#endif
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4 = k_left16 / 4;
	uint64_t k_left1 = k_left16 % 4;

	uint64_t m_iter = m0 / 3;
	uint64_t m_left = m0 % 3;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	if ( m_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()  // zero all xmm/ymm registers.

	mov(var(a), r14)  // load address of a.
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj // r10 = unused mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter .. 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c + 3*ii*rs_c; lea(mem(r14), rax) // rax = a + 3*ii*rs_a; lea(mem(rdx), rbx) // rbx = b; #if 0 mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rdi) // rdi = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 
4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_b = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 4; const dim_t i_edge = m0 - ( dim_t )m_left; double* restrict cij = c + i_edge*rs_c; double* restrict bj = b; double* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_dgemmsup_rd_haswell_asm_2x4m ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_dgemmsup_rd_haswell_asm_1x4m ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } void bli_dgemmsup_rd_haswell_asm_3x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4,  ymm4,  ymm4)
	// ymm4..ymm15 are the twelve accumulators of the 3x4 tile, one per
	// (row, column) pair; all start at zero.
	vxorpd(ymm5,  ymm5,  ymm5)
	vxorpd(ymm6,  ymm6,  ymm6)
	vxorpd(ymm7,  ymm7,  ymm7)
	vxorpd(ymm8,  ymm8,  ymm8)
	vxorpd(ymm9,  ymm9,  ymm9)
	vxorpd(ymm10, ymm10, ymm10)
	vxorpd(ymm11, ymm11, ymm11)
	vxorpd(ymm12, ymm12, ymm12)
	vxorpd(ymm13, ymm13, ymm13)
	vxorpd(ymm14, ymm14, ymm14)
	vxorpd(ymm15, ymm15, ymm15)
#endif

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	//mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	//mov(var(rs_b), r10)                // load rs_b
	mov(var(cs_b), r11)                // load cs_b
	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

	// rdi keeps rs_c (in bytes) until the final stores to c.
	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c

	mov(var(k_iter16), rsi)            // i = k_iter16;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
	                                   // contains the k_iter4 loop.

	label(.DLOOPKITER16)               // MAIN LOOP

	// ---------------------------------- iteration 0

	// ymm0..ymm2 = 4 k-elements from rows 0..2 of a.
	vmovupd(mem(rax       ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	// ymm3 = 4 k-elements from column j of b; accumulate the elementwise
	// products into the (row, j) accumulator.
	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	// ---------------------------------- iteration 1

	vmovupd(mem(rax       ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	// ---------------------------------- iteration 2

	vmovupd(mem(rax       ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	// ---------------------------------- iteration 3

	vmovupd(mem(rax       ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER16)                 // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi)             // i = k_iter4;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
	                                   // considers k_left1 loop.
	                                   // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4)                // EDGE LOOP (ymm)

	vmovupd(mem(rax       ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	vmovupd(mem(rax, r8, 2), ymm2)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER4)                  // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi)             // i = k_left1;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
	                                   // NOTE: The loads below are scalar (xmm),
	                                   // but the FMAs must name the ymm registers:
	                                   // an FMA with an xmm destination would zero
	                                   // out the high bits of the accumulators,
	                                   // which would destroy intermediate results.

	vmovsd(mem(rax       ), xmm0)
	vmovsd(mem(rax, r8, 1), xmm1)
	vmovsd(mem(rax, r8, 2), xmm2)
	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;

	vmovsd(mem(rbx        ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)
	vfmadd231pd(ymm2, ymm3, ymm6)

	vmovsd(mem(rbx, r11, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)
	vfmadd231pd(ymm2, ymm3, ymm9)

	vmovsd(mem(rbx, r11, 2), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)
	vfmadd231pd(ymm2, ymm3, ymm12)

	vmovsd(mem(rbx, r13, 1), xmm3)
	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)
	vfmadd231pd(ymm2, ymm3, ymm15)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.

	label(.DPOSTACCUM)

	// Accumulator layout (rows of the tile x columns of the tile):
	                                   // ymm4  ymm7  ymm10 ymm13
	                                   // ymm5  ymm8  ymm11 ymm14
	                                   // ymm6  ymm9  ymm12 ymm15

	// Reduce each accumulator to a single sum and gather the four sums of
	// a tile row into one register.
	vhaddpd( ymm7, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )
	                                   // xmm0[0] = sum(ymm4);  xmm0[1] = sum(ymm7)

	vhaddpd( ymm13, ymm10, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )
	                                   // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)

	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )

	vhaddpd( ymm8, ymm5, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )

	vhaddpd( ymm14, ymm11, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )

	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )

	vhaddpd( ymm9, ymm6, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )

	vhaddpd( ymm15, ymm12, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )

	vperm2f128(imm(0x20), ymm2, ymm0, ymm6 )

	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
	                                   // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)

	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
	vmulpd(ymm0, ymm5, ymm5)
	vmulpd(ymm0, ymm6, ymm6)

	//mov(var(cs_c), rsi)                // load cs_c
	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

	                                   // now avoid loading C if beta == 0
	                                   // NOTE(review): vucomisd also sets ZF for an
	                                   // unordered compare, so a NaN beta appears
	                                   // to take the beta == 0 path too -- confirm.

	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)

	// c row = beta * c row + alpha * (a*b) row, one tile row at a time.
	vfmadd231pd(mem(rcx), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), ymm3, ymm5)
	vmovupd(ymm5, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), ymm3, ymm6)
	vmovupd(ymm6, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DBETAZERO)

	label(.DROWSTORBZ)

	// beta == 0: overwrite c without reading it.
	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vmovupd(ymm5, mem(rcx))
	add(rdi, rcx)

	vmovupd(ymm6, mem(rcx))
	//add(rdi, rcx)

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter16] "m" (k_iter16),
	  [k_iter4]  "m" (k_iter4),
	  [k_left1]  "m" (k_left1),
	  [a]        "m" (a),
	  [rs_a]     "m" (rs_a),
	  [cs_a]     "m" (cs_a),
	  [b]        "m" (b),
	  [rs_b]     "m" (rs_b),
	  [cs_b]     "m" (cs_b),
	  [alpha]    "m" (alpha),
	  [beta]     "m" (beta),
	  [c]        "m" (c),
	  [rs_c]     "m" (rs_c),
	  [cs_c]     "m" (cs_c)/*,
	  [a_next]   "m" (a_next),
	  [b_next]   "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

/*
   bli_dgemmsup_rd_haswell_asm_2x4m

   Computes a single 2x4 tile: c := beta * c + alpha * (a * b), where the
   two rows of a (rs_a apart) and the four columns of b (cs_b apart) are
   each read as k0 consecutive doubles. The k loop is split into k0/16
   passes of a 4x-unrolled body, (k0%16)/4 single vector passes and
   (k0%16)%4 scalar steps. The two result rows are written to c as 4
   contiguous doubles each, rs_c apart; c is not read when the beta
   comparison takes the "zero" branch.

   conja, conjb, m0, n0, data and cntx are not read by this function, and
   cs_a, rs_b and cs_c are copied to locals but never loaded by the asm.
*/
void bli_dgemmsup_rd_haswell_asm_2x4m
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
#if 0
	bli_dgemmsup_r_haswell_ref
	(
	  conja, conjb, m0, n0, k0,
	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
	  beta, c, rs_c0, cs_c0, data, cntx
	);
	return;
#endif
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	// The k dimension is split as 16*k_iter16 + 4*k_iter4 + k_left1.
	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4  = k_left16 / 4;
	uint64_t k_left1  = k_left16 % 4;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

#if 0
	vzeroall()                         // zero all xmm/ymm registers.
#else
	                                   // skylake can execute 3 vxorpd ipc with
	                                   // a latency of 1 cycle, while vzeroall
	                                   // has a latency of 12 cycles.
	// Eight accumulators for the 2x4 tile, one per (row, column) pair.
	vxorpd(ymm4,  ymm4,  ymm4)
	vxorpd(ymm5,  ymm5,  ymm5)
	vxorpd(ymm7,  ymm7,  ymm7)
	vxorpd(ymm8,  ymm8,  ymm8)
	vxorpd(ymm10, ymm10, ymm10)
	vxorpd(ymm11, ymm11, ymm11)
	vxorpd(ymm13, ymm13, ymm13)
	vxorpd(ymm14, ymm14, ymm14)
#endif

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	//mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	//mov(var(rs_b), r10)                // load rs_b
	mov(var(cs_b), r11)                // load cs_b
	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

	// rdi keeps rs_c (in bytes) until the final stores to c.
	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c

	mov(var(k_iter16), rsi)            // i = k_iter16;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
	                                   // contains the k_iter4 loop.

	label(.DLOOPKITER16)               // MAIN LOOP

	// ---------------------------------- iteration 0

	// ymm0, ymm1 = 4 k-elements from rows 0 and 1 of a.
	vmovupd(mem(rax       ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	// ymm3 = 4 k-elements from column j of b; accumulate the elementwise
	// products into the (row, j) accumulator.
	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	// ---------------------------------- iteration 1

	vmovupd(mem(rax       ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	// ---------------------------------- iteration 2

	vmovupd(mem(rax       ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	// ---------------------------------- iteration 3

	vmovupd(mem(rax       ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER16)                 // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi)             // i = k_iter4;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
	                                   // considers k_left1 loop.
	                                   // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4)                // EDGE LOOP (ymm)

	vmovupd(mem(rax       ), ymm0)
	vmovupd(mem(rax, r8, 1), ymm1)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER4)                  // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi)             // i = k_left1;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
	                                   // NOTE: The loads below are scalar (xmm),
	                                   // but the FMAs must name the ymm registers:
	                                   // an FMA with an xmm destination would zero
	                                   // out the high bits of the accumulators,
	                                   // which would destroy intermediate results.

	vmovsd(mem(rax       ), xmm0)
	vmovsd(mem(rax, r8, 1), xmm1)
	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;

	vmovsd(mem(rbx        ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)
	vfmadd231pd(ymm1, ymm3, ymm5)

	vmovsd(mem(rbx, r11, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm7)
	vfmadd231pd(ymm1, ymm3, ymm8)

	vmovsd(mem(rbx, r11, 2), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm10)
	vfmadd231pd(ymm1, ymm3, ymm11)

	vmovsd(mem(rbx, r13, 1), xmm3)
	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
	vfmadd231pd(ymm0, ymm3, ymm13)
	vfmadd231pd(ymm1, ymm3, ymm14)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.

	label(.DPOSTACCUM)

	// Accumulator layout (rows of the tile x columns of the tile):
	                                   // ymm4  ymm7  ymm10 ymm13
	                                   // ymm5  ymm8  ymm11 ymm14

	// Reduce each accumulator to a single sum and gather the four sums of
	// a tile row into one register.
	vhaddpd( ymm7, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )
	                                   // xmm0[0] = sum(ymm4);  xmm0[1] = sum(ymm7)

	vhaddpd( ymm13, ymm10, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )
	                                   // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)

	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )

	vhaddpd( ymm8, ymm5, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )

	vhaddpd( ymm14, ymm11, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )

	vperm2f128(imm(0x20), ymm2, ymm0, ymm5 )

	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
	                                   // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)

	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
	vmulpd(ymm0, ymm5, ymm5)

	//mov(var(cs_c), rsi)                // load cs_c
	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

	                                   // now avoid loading C if beta == 0
	                                   // NOTE(review): vucomisd also sets ZF for an
	                                   // unordered compare, so a NaN beta appears
	                                   // to take the beta == 0 path too -- confirm.

	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)

	// c row = beta * c row + alpha * (a*b) row, one tile row at a time.
	vfmadd231pd(mem(rcx), ymm3, ymm4)
	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vfmadd231pd(mem(rcx), ymm3, ymm5)
	vmovupd(ymm5, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DBETAZERO)

	label(.DROWSTORBZ)

	// beta == 0: overwrite c without reading it.
	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx)

	vmovupd(ymm5, mem(rcx))
	//add(rdi, rcx)

	label(.DDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter16] "m" (k_iter16),
	  [k_iter4]  "m" (k_iter4),
	  [k_left1]  "m" (k_left1),
	  [a]        "m" (a),
	  [rs_a]     "m" (rs_a),
	  [cs_a]     "m" (cs_a),
	  [b]        "m" (b),
	  [rs_b]     "m" (rs_b),
	  [cs_b]     "m" (cs_b),
	  [alpha]    "m" (alpha),
	  [beta]     "m" (beta),
	  [c]        "m" (c),
	  [rs_c]     "m" (rs_c),
	  [cs_c]     "m" (cs_c)/*,
	  [a_next]   "m" (a_next),
	  [b_next]   "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

void bli_dgemmsup_rd_haswell_asm_1x4m
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
#if 0
	bli_dgemmsup_r_haswell_ref
	(
	  conja, conjb, m0, n0, k0,
	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
	  beta, c, rs_c0, cs_c0, data, cntx
	);
	return;
#endif
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; /* rrc: -------- -- -- -- | | | | -------- -- -- -- ... | | | | -------- += -- -- -- | | | | -------- | | | | -------- : -------- : */ // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm13, ymm13, ymm13) #endif mov(var(a), rax) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a //lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), rcx) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rax ), xmm0) add(imm(1*8), rax) // a += 1*cs_b = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_6x2m: updates an m0 x 2 panel of C as C := beta*C + alpha*(A*B), six rows per pass of the assembly loop. Every C element is formed as a dot product: four doubles from a row of A and four from a column of B are loaded per step and accumulated by FMA into one ymm register per element (ymm4..ymm15). k0 is consumed as k0/16 unrolled iterations (four steps each), then (k0%16)/4 single steps, then k0%4 scalar steps; the accumulators are then summed horizontally, scaled by alpha and written to C (C is not read when beta == 0). A and B are advanced by 8 bytes per k index and each row of C is written as two adjacent doubles, so the code relies on cs_a == 1, rs_b == 1 and cs_c == 1; only rs_a, cs_b and rs_c are loaded as strides. The m0 % 6 leftover rows are forwarded to the 3x2m, 2x2m and 1x2m kernels. conja, conjb, n0, data and cntx are only passed through to those kernels. */ void bli_dgemmsup_rd_haswell_asm_6x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t m_iter = m0 / 6; uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c + 6*ii*rs_c; lea(mem(r14), rax) // rax = a + 6*ii*rs_a; lea(mem(rdx), rbx) // rbx = b; lea(mem(rcx, rdi, 2), r10) // lea(mem(r10, rdi, 1), r10) // r10 = c + 3*rs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c prefetch(0, mem(r10, 3*8)) // prefetch c + 3*rs_c prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 4*rs_c prefetch(0, mem(r10, rdi, 2, 3*8)) // prefetch c + 5*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) 
vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovsd(mem(rax, r8, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovsd(mem(rax, r13, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rax, r8, 4), xmm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovsd(mem(rax, r15, 1), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) vhaddpd( ymm9, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm8 ) vhaddpd( ymm11, ymm10, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm10 ) vhaddpd( ymm13, ymm12, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm12 ) vhaddpd( ymm15, ymm14, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm14 ) // xmm4 = sum(ymm4) sum(ymm5) // xmm6 = sum(ymm6) sum(ymm7) // xmm8 = sum(ymm8) sum(ymm9) // xmm10 = sum(ymm10) sum(ymm11) // xmm12 = sum(ymm12) sum(ymm13) // xmm14 = sum(ymm14) sum(ymm15) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. 
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm10) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm12) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm14) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) label(.DDONE) lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c lea(mem(r14, r8, 4), r14) // lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 2; const dim_t i_edge = m0 - ( dim_t )m_left; double* restrict cij = c + i_edge*rs_c; double* restrict bj = b; double* restrict ai = a + i_edge*rs_a; if ( 3 <= m_left ) { const dim_t mr_cur = 3; bli_dgemmsup_rd_haswell_asm_3x2m ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 2 <= m_left ) { const dim_t mr_cur = 2; bli_dgemmsup_rd_haswell_asm_2x2m ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_dgemmsup_rd_haswell_asm_1x2m ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } void bli_dgemmsup_rd_haswell_asm_3x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
/* Kernel summary (3x2 block): computes three rows by two columns of C as C := beta*C + alpha*(A*B) using dot products. ymm4..ymm9 each accumulate one C element (a row of A times a column of B), four doubles per step. k0 is consumed as k0/16 unrolled iterations (four steps each), then (k0%16)/4 single steps, then k0%4 scalar steps; the accumulators are then summed horizontally, scaled by alpha and stored (C is not read when beta == 0). A and B are advanced by 8 bytes per k index and each row of C is written as two adjacent doubles, so cs_a, rs_b and cs_c are taken to be 1; only rs_a, cs_b and rs_c are loaded as strides. m0 is not consulted: exactly three rows are processed. */ uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) #endif mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. 
label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovsd(mem(rax, r8, 2), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) vhaddpd( ymm9, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm8 ) // xmm4 = sum(ymm4) sum(ymm5) // xmm6 = sum(ymm6) sum(ymm7) // xmm8 = sum(ymm8) sum(ymm9) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_2x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; /* rrc: -------- -- -- -- | | -------- -- -- -- ... 
| | -------- += -- -- -- | | -------- -- -- -- | | -------- -- -- -- : -------- -- -- -- : */ /* Kernel summary (2x2 block): computes two rows by two columns of C as C := beta*C + alpha*(A*B) using dot products. ymm4..ymm7 each accumulate one C element (a row of A times a column of B), four doubles per step. k0 is consumed as k0/16 unrolled iterations (four steps each), then (k0%16)/4 single steps, then k0%4 scalar steps; the accumulators are then summed horizontally, scaled by alpha and stored (C is not read when beta == 0). A and B are advanced by 8 bytes per k index and each row of C is written as two adjacent doubles, so cs_a, rs_b and cs_c are taken to be 1; only rs_a, cs_b and rs_c are loaded as strides. m0 is not consulted: exactly two rows are processed. */ // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) #endif mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) // xmm4 = sum(ymm4) sum(ymm5) // xmm6 = sum(ymm6) sum(ymm7) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. 
vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_1x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; /* rrc: -------- -- -- -- | | -------- -- -- -- ... | | -------- += -- -- -- | | -------- -- -- -- | | -------- -- -- -- : -------- -- -- -- : */ // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) #endif mov(var(a), rax) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a //lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm5 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5) // xmm4 = sum(ymm4) sum(ymm5) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/old/bli_gemmsup_rd_haswell_asm_d6x8n.c000066400000000000000000004622051427272030600312420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_r_haswell_ref ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_r_haswell_ref ) // Define parameters and variables for edge case kernel map. 
#define NUM_MR 4
#define NUM_NR 4

#define FUNCPTR_T dgemmsup_ker_ft

#if 0
static dim_t mrs[NUM_MR] = { 6, 3, 2, 1 };
static dim_t nrs[NUM_NR] = { 8, 4, 2, 1 };
static FUNCPTR_T kmap[NUM_MR][NUM_NR] =
{ /* 8 4 2 1 */
/* 6 */ { bli_dgemmsup_rd_haswell_asm_6x8n, bli_dgemmsup_rd_haswell_asm_6x4n, bli_dgemmsup_rd_haswell_asm_6x2n, bli_dgemmsup_r_haswell_ref },
/* 3 */ { bli_dgemmsup_rd_haswell_asm_3x8n, bli_dgemmsup_rd_haswell_asm_3x4n, bli_dgemmsup_rd_haswell_asm_3x2n, bli_dgemmsup_r_haswell_ref },
/* 2 */ { bli_dgemmsup_rd_haswell_asm_2x8n, bli_dgemmsup_rd_haswell_asm_2x4n, bli_dgemmsup_rd_haswell_asm_2x2n, bli_dgemmsup_r_haswell_ref },
/* 1 */ { bli_dgemmsup_rd_haswell_asm_1x8n, bli_dgemmsup_rd_haswell_asm_1x4n, bli_dgemmsup_rd_haswell_asm_1x2n, bli_dgemmsup_r_haswell_ref }
};
#endif

// Dot-product ("rd") double-precision kernel for a block of C that is up to
// six rows tall and n0 columns wide.
//
// Control flow:
//   1. If m0 % 6 != 0, the rows are handed to the 3x8n and 2x8n kernels and,
//      for a single remaining row, to bli_dgemv_ex(); the function then
//      returns without running the assembly below.
//   2. Otherwise the assembly always processes exactly six rows, as two
//      passes of three rows (row offsets 0 and 3). Each pass walks the
//      columns in groups of four, n_iter = n0 / 4 groups in total. Every
//      3x4 group is accumulated in twelve ymm registers, reduced, scaled by
//      *alpha and then stored (when *beta compares equal to zero) or
//      combined with (*beta) * C.
//   3. The n0 % 4 leftover columns are handed to the 6x2n kernel and, for a
//      single remaining column, to bli_dgemv_ex().
//
// conja and conjb are only forwarded to the helpers; data and cntx likewise.
// cs_a and rs_b are listed as asm operands but no active instruction reads
// them.
//
// Strides that are hard-coded inside the assembly rather than read:
//   - a and b are advanced by 8 bytes per k element;
//   - each row of a 3x4 group is written as one contiguous 32-byte store,
//     and the C pointer moves by 4*8 bytes between column groups.
// NOTE(review): this presumably relies on the caller only selecting this
// kernel for the matching storage case -- confirm against the dispatcher.
void bli_dgemmsup_rd_haswell_asm_6x8n
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
#if 0
    bli_dgemmsup_r_haswell_ref
    (
      conja, conjb, m0, n0, k0,
      alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
      beta, c, rs_c0, cs_c0, data, cntx
    );
    return;
#endif
    uint64_t m_left = m0 % 6;

    // First check whether this is an edge case in the m dimension. If so,
    // dispatch other ?x8n kernels, as needed.
    if ( m_left )
    {
        double* restrict cij = c;
        double* restrict bj  = b;
        double* restrict ai  = a;

        if ( 3 <= m_left )
        {
            const dim_t mr_cur = 3;

            bli_dgemmsup_rd_haswell_asm_3x8n
            (
              conja, conjb, mr_cur, n0, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
            // step past the three rows just handled
            cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
        }
        if ( 2 <= m_left )
        {
            const dim_t mr_cur = 2;

            bli_dgemmsup_rd_haswell_asm_2x8n
            (
              conja, conjb, mr_cur, n0, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
            // step past the two rows just handled
            cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
        }
        if ( 1 == m_left )
        {
#if 0
            const dim_t mr_cur = 1;

            //bli_dgemmsup_r_haswell_ref
            bli_dgemmsup_rd_haswell_asm_1x8n
            (
              conja, conjb, mr_cur, n0, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
#else
            // A single remaining row is a matrix-vector product: the
            // k0 x n0 matrix at bj is applied transposed to the row at ai
            // (stride cs_a0), updating the row of C at cij (stride cs_c0).
            bli_dgemv_ex
            (
              BLIS_TRANSPOSE, conja, k0, n0,
              alpha, bj, rs_b0, cs_b0, ai, cs_a0,
              beta, cij, cs_c0, cntx, NULL
            );
#endif
        }
        return;
    }

    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    // k0 is split three ways: k_iter16 passes of the 4x-unrolled main loop
    // (16 elements each), k_iter4 passes of the 4-element loop, and k_left1
    // single-element passes.
    uint64_t k_iter16 = k0 / 16;
    uint64_t k_left16 = k0 % 16;
    uint64_t k_iter4  = k_left16 / 4;
    uint64_t k_left1  = k_left16 % 4;

    // n0 is split into n_iter groups of four columns (handled in assembly)
    // plus n_left leftover columns (handled after the assembly).
    uint64_t n_iter = n0 / 4;
    uint64_t n_left = n0 % 4;

    uint64_t rs_a   = rs_a0;
    uint64_t cs_a   = cs_a0;
    uint64_t rs_b   = rs_b0;
    uint64_t cs_b   = cs_b0;
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;

    // With fewer than four columns there is no full column group to run.
    if ( n_iter == 0 ) goto consider_edge_cases;

    // -------------------------------------------------------------------------

    begin_asm()

    //vzeroall() // zero all xmm/ymm registers.

    mov(var(a), rdx) // load address of a.
    mov(var(rs_a), r8) // load rs_a
    //mov(var(cs_a), r9) // load cs_a
    lea(mem(, r8, 8), r8) // rs_a *= sizeof(double)
    //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double)

    //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
    //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

    //mov(var(b), r14) // load address of b.
    //mov(var(rs_b), r10) // load rs_b
    mov(var(cs_b), r11) // load cs_b
    //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double)
    lea(mem(, r11, 8), r11) // cs_b *= sizeof(double)

    lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b

    //mov(var(c), r12) // load address of c
    //mov(var(rs_c), rdi) // load rs_c
    //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

    // r12 = rcx = c
    // rdx = rax = a
    // r14 = rbx = b
    // r9  = m dim row offset ii (0, then 3)
    // r15 = n dim countdown jj

    mov(imm(0), r9) // ii = 0;

    label(.DLOOP3X4I) // LOOP OVER ii = [ 0 3 ]

    mov(var(b), r14) // load address of b
    mov(var(c), r12) // load address of c

    mov(var(rs_c), rdi) // load rs_c
    lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

    lea(mem(   , r9, 1), rsi) // rsi = ii;
    imul(rdi, rsi) // rsi *= rs_c
    lea(mem(r12, rsi, 1), r12) // r12 = c + ii*rs_c;

    lea(mem(   , r9, 1), rsi) // rsi = ii;
    imul(r8, rsi) // rsi *= rs_a;
    lea(mem(rdx, rsi, 1), rdx) // rdx += ii*rs_a; rdx is updated in place,
                               // so it is a on the first pass (ii == 0)
                               // and a + 3*rs_a on the second (ii == 3).

    mov(var(n_iter), r15) // jj = n_iter;

    label(.DLOOP3X4J) // LOOP OVER jj = [ n_iter ... 2 1 ]

#if 0
    vzeroall() // zero all xmm/ymm registers.
#else
    // skylake can execute 3 vxorpd ipc with
    // a latency of 1 cycle, while vzeroall
    // has a latency of 12 cycles.
    vxorpd(ymm4, ymm4, ymm4)
    vxorpd(ymm5, ymm5, ymm5)
    vxorpd(ymm6, ymm6, ymm6)
    vxorpd(ymm7, ymm7, ymm7)
    vxorpd(ymm8, ymm8, ymm8)
    vxorpd(ymm9, ymm9, ymm9)
    vxorpd(ymm10, ymm10, ymm10)
    vxorpd(ymm11, ymm11, ymm11)
    vxorpd(ymm12, ymm12, ymm12)
    vxorpd(ymm13, ymm13, ymm13)
    vxorpd(ymm14, ymm14, ymm14)
    vxorpd(ymm15, ymm15, ymm15)
#endif

    lea(mem(r12), rcx) // rcx = c_iijj;
    lea(mem(rdx), rax) // rax = a_ii;
    lea(mem(r14), rbx) // rbx = b_jj;

#if 1
    mov(var(rs_c), rdi) // load rs_c
    lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)
    prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
#endif
    lea(mem(r11, r11, 2), rdi) // rdi = 3*cs_b (not read before rdi is
                               // reloaded with rs_c after the k loops)
    lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b

    mov(var(k_iter16), rsi) // i = k_iter16;
    test(rsi, rsi) // check i via logical AND.
    je(.DCONSIDKITER4) // if i == 0, jump to code that
                       // contains the k_iter4 loop.

    label(.DLOOPKITER16) // MAIN LOOP (four unrolled steps of 4 doubles each)

    // ---------------------------------- iteration 0

#if 0
    prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b
    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
    add(imm(8*8), r10) // r10 += 8*rs_b = 8*8;
#else
    prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b
    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
#endif

    vmovupd(mem(rax       ), ymm0) // 4 doubles from row 0 of a
    vmovupd(mem(rax, r8, 1), ymm1) // 4 doubles from row 1 of a
    vmovupd(mem(rax, r8, 2), ymm2) // 4 doubles from row 2 of a
    add(imm(4*8), rax) // a += 4 doubles (hard-coded unit stride along k)

    vmovupd(mem(rbx        ), ymm3) // 4 doubles from column 0 of b
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)
    vfmadd231pd(ymm2, ymm3, ymm6)

    vmovupd(mem(rbx, r11, 1), ymm3) // column 1 of b
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)
    vfmadd231pd(ymm2, ymm3, ymm9)

    vmovupd(mem(rbx, r11, 2), ymm3) // column 2 of b
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)
    vfmadd231pd(ymm2, ymm3, ymm12)

    vmovupd(mem(rbx, r13, 1), ymm3) // column 3 of b
    add(imm(4*8), rbx) // b += 4 doubles (hard-coded unit stride along k)
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)
    vfmadd231pd(ymm2, ymm3, ymm15)

    // ---------------------------------- iteration 1

#if 1
    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
#endif

    vmovupd(mem(rax       ), ymm0)
    vmovupd(mem(rax, r8, 1), ymm1)
    vmovupd(mem(rax, r8, 2), ymm2)
    add(imm(4*8), rax) // a += 4 doubles

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)
    vfmadd231pd(ymm2, ymm3, ymm6)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)
    vfmadd231pd(ymm2, ymm3, ymm9)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)
    vfmadd231pd(ymm2, ymm3, ymm12)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx) // b += 4 doubles
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)
    vfmadd231pd(ymm2, ymm3, ymm15)

    // ---------------------------------- iteration 2

#if 1
    prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
#endif

    vmovupd(mem(rax       ), ymm0)
    vmovupd(mem(rax, r8, 1), ymm1)
    vmovupd(mem(rax, r8, 2), ymm2)
    add(imm(4*8), rax) // a += 4 doubles

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)
    vfmadd231pd(ymm2, ymm3, ymm6)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)
    vfmadd231pd(ymm2, ymm3, ymm9)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)
    vfmadd231pd(ymm2, ymm3, ymm12)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx) // b += 4 doubles
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)
    vfmadd231pd(ymm2, ymm3, ymm15)

    // ---------------------------------- iteration 3

#if 1
    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
    add(imm(16*8), r10) // r10 += 16*8 bytes, i.e. the 16 doubles that one
                        // pass of this unrolled loop consumes per column
#endif

    vmovupd(mem(rax       ), ymm0)
    vmovupd(mem(rax, r8, 1), ymm1)
    vmovupd(mem(rax, r8, 2), ymm2)
    add(imm(4*8), rax) // a += 4 doubles

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)
    vfmadd231pd(ymm2, ymm3, ymm6)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)
    vfmadd231pd(ymm2, ymm3, ymm9)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)
    vfmadd231pd(ymm2, ymm3, ymm12)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx) // b += 4 doubles
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)
    vfmadd231pd(ymm2, ymm3, ymm15)

    dec(rsi) // i -= 1;
    jne(.DLOOPKITER16) // iterate again if i != 0.

    label(.DCONSIDKITER4)

    mov(var(k_iter4), rsi) // i = k_iter4;
    test(rsi, rsi) // check i via logical AND.
    je(.DCONSIDKLEFT1) // if i == 0, jump to code that
                       // considers k_left1 loop.
                       // else, we prepare to enter k_iter4 loop.

    label(.DLOOPKITER4) // EDGE LOOP (ymm)

    vmovupd(mem(rax       ), ymm0)
    vmovupd(mem(rax, r8, 1), ymm1)
    vmovupd(mem(rax, r8, 2), ymm2)
    add(imm(4*8), rax) // a += 4 doubles

    vmovupd(mem(rbx        ), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)
    vfmadd231pd(ymm2, ymm3, ymm6)

    vmovupd(mem(rbx, r11, 1), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)
    vfmadd231pd(ymm2, ymm3, ymm9)

    vmovupd(mem(rbx, r11, 2), ymm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)
    vfmadd231pd(ymm2, ymm3, ymm12)

    vmovupd(mem(rbx, r13, 1), ymm3)
    add(imm(4*8), rbx) // b += 4 doubles
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)
    vfmadd231pd(ymm2, ymm3, ymm15)

    dec(rsi) // i -= 1;
    jne(.DLOOPKITER4) // iterate again if i != 0.

    label(.DCONSIDKLEFT1)

    mov(var(k_left1), rsi) // i = k_left1;
    test(rsi, rsi) // check i via logical AND.
    je(.DPOSTACCUM) // if i == 0, we're done; jump to end.
                    // else, we prepare to enter k_left1 loop.

    label(.DLOOPKLEFT1) // EDGE LOOP (scalar)
    // NOTE: The loads below are scalar (vmovsd into xmm registers), but
    // the FMAs must still name the ymm registers: a 128-bit FMA into an
    // xmm accumulator would zero the upper half of the matching ymm
    // register and destroy the partial sums accumulated by the vector
    // loops above.

    vmovsd(mem(rax       ), xmm0)
    vmovsd(mem(rax, r8, 1), xmm1)
    vmovsd(mem(rax, r8, 2), xmm2)
    add(imm(1*8), rax) // a += 1 double

    vmovsd(mem(rbx        ), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm4)
    vfmadd231pd(ymm1, ymm3, ymm5)
    vfmadd231pd(ymm2, ymm3, ymm6)

    vmovsd(mem(rbx, r11, 1), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm7)
    vfmadd231pd(ymm1, ymm3, ymm8)
    vfmadd231pd(ymm2, ymm3, ymm9)

    vmovsd(mem(rbx, r11, 2), xmm3)
    vfmadd231pd(ymm0, ymm3, ymm10)
    vfmadd231pd(ymm1, ymm3, ymm11)
    vfmadd231pd(ymm2, ymm3, ymm12)

    vmovsd(mem(rbx, r13, 1), xmm3)
    add(imm(1*8), rbx) // b += 1 double
    vfmadd231pd(ymm0, ymm3, ymm13)
    vfmadd231pd(ymm1, ymm3, ymm14)
    vfmadd231pd(ymm2, ymm3, ymm15)

    dec(rsi) // i -= 1;
    jne(.DLOOPKLEFT1) // iterate again if i != 0.

    label(.DPOSTACCUM)

    // Accumulator layout (one register per element of the 3x4 group):
    // ymm4  ymm7  ymm10 ymm13
    // ymm5  ymm8  ymm11 ymm14
    // ymm6  ymm9  ymm12 ymm15

    vhaddpd( ymm7, ymm4, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm0 )
    // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)

    vhaddpd( ymm13, ymm10, ymm2 )
    vextractf128(imm(1), ymm2, xmm1 )
    vaddpd( xmm2, xmm1, xmm2 )
    // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)

    vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // pack the four sums of row 0

    vhaddpd( ymm8, ymm5, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm0 )

    vhaddpd( ymm14, ymm11, ymm2 )
    vextractf128(imm(1), ymm2, xmm1 )
    vaddpd( xmm2, xmm1, xmm2 )

    vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // pack the four sums of row 1

    vhaddpd( ymm9, ymm6, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddpd( xmm0, xmm1, xmm0 )

    vhaddpd( ymm15, ymm12, ymm2 )
    vextractf128(imm(1), ymm2, xmm1 )
    vaddpd( xmm2, xmm1, xmm2 )

    vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // pack the four sums of row 2

    // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
    // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
    // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)

    mov(var(rs_c), rdi) // load rs_c
    lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double)

    mov(var(alpha), rax) // load address of alpha
    mov(var(beta), rbx) // load address of beta
    vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate
    vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate

    vmulpd(ymm0, ymm4, ymm4) // scale by alpha
    vmulpd(ymm0, ymm5, ymm5)
    vmulpd(ymm0, ymm6, ymm6)

    //mov(var(cs_c), rsi) // load cs_c
    //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double)

    // now avoid loading C if beta == 0

    vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
    vucomisd(xmm0, xmm3) // set ZF if beta == 0.
    je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case

    label(.DROWSTORED)

    vfmadd231pd(mem(rcx), ymm3, ymm4) // ymm4 += beta * (row 0 of c)
    vmovupd(ymm4, mem(rcx))
    add(rdi, rcx) // next row of c

    vfmadd231pd(mem(rcx), ymm3, ymm5) // ymm5 += beta * (row 1 of c)
    vmovupd(ymm5, mem(rcx))
    add(rdi, rcx) // next row of c

    vfmadd231pd(mem(rcx), ymm3, ymm6) // ymm6 += beta * (row 2 of c)
    vmovupd(ymm6, mem(rcx))
    //add(rdi, rcx)

    jmp(.DDONE) // jump to end.

    label(.DBETAZERO)

    label(.DROWSTORBZ)

    // c is overwritten without being read.
    vmovupd(ymm4, mem(rcx))
    add(rdi, rcx)

    vmovupd(ymm5, mem(rcx))
    add(rdi, rcx)

    vmovupd(ymm6, mem(rcx))
    //add(rdi, rcx)

    label(.DDONE)

    add(imm(4*8), r12) // c_jj = r12 += 4 doubles (hard-coded unit
                       // column stride rather than 4*cs_c)
    lea(mem(r14, r11, 4), r14) // b_jj = r14 += 4*cs_b

    dec(r15) // jj -= 1;
    jne(.DLOOP3X4J) // iterate again if jj != 0.

    add(imm(3), r9) // ii += 3;
    cmp(imm(3), r9) // compare ii to 3
    jle(.DLOOP3X4I) // if ii <= 3, jump to beginning
                    // of ii loop; otherwise, loop ends.

    label(.DRETURN)

    end_asm(
    : // output operands (none)
    : // input operands
      [n_iter] "m" (n_iter),
      [k_iter16] "m" (k_iter16),
      [k_iter4] "m" (k_iter4),
      [k_left1] "m" (k_left1),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )

    consider_edge_cases:

    // Handle edge cases in the n dimension, if they exist.
    if ( n_left )
    {
        const dim_t mr_cur = 6;
        // index of the first column not covered by the full groups of four
        const dim_t j_edge = n0 - ( dim_t )n_left;

        double* restrict cij = c + j_edge*cs_c;
        double* restrict ai  = a;
        double* restrict bj  = b + j_edge*cs_b;

        if ( 2 <= n_left )
        {
            const dim_t nr_cur = 2;

            bli_dgemmsup_rd_haswell_asm_6x2n
            (
              conja, conjb, mr_cur, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
            // step past the two columns just handled
            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
        }
        if ( 1 == n_left )
        {
#if 0
            const dim_t nr_cur = 1;

            //bli_dgemmsup_rd_haswell_asm_6x1n
            bli_dgemmsup_r_haswell_ref
            (
              conja, conjb, mr_cur, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
#else
            // A single remaining column is a matrix-vector product: the
            // mr_cur x k0 matrix at ai is applied to the column at bj
            // (stride rs_b0), updating the column of C at cij (stride
            // rs_c0).
            bli_dgemv_ex
            (
              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
              beta, cij, rs_c0, cntx, NULL
            );
#endif
        }
    }
}

void bli_dgemmsup_rd_haswell_asm_3x8n
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
#if 0
    bli_dgemmsup_r_haswell_ref
    (
      conja, conjb, m0, n0, k0,
      alpha, a, rs_a0, cs_a0, b,
rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // rdx = rax = a // r14 = rbx = b // r9 = unused // r15 = n dim index jj mov(var(n_iter), r15) // jj = n_iter; label(.DLOOP3X4J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(rdx), rax) // rax = a_ii; lea(mem(r14), rbx) // rbx = b_jj; #if 1 mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif lea(mem(r11, r11, 2), rdi) // rdi = 3*cs_b lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b add(imm(8*8), r10) // r10 += 8*rs_b = 8*8; #else prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 
4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 #if 1 prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 #if 1 prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b add(imm(16*8), r10) 
// r10 += 8*rs_b = 8*8; #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_b = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // 
rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(4*8), r12) // c_jj = r12 += 4*cs_c lea(mem(r14, r11, 4), r14) // b_jj = r14 += 4*cs_b dec(r15) // jj -= 1; jne(.DLOOP3X4J) // iterate again if jj != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( n_left ) { const dim_t mr_cur = 3; const dim_t j_edge = n0 - ( dim_t )n_left; double* restrict cij = c + j_edge*cs_c; double* restrict ai = a; double* restrict bj = b + j_edge*cs_b; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_dgemmsup_rd_haswell_asm_3x2n ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 0 const dim_t nr_cur = 1; //bli_dgemmsup_rd_haswell_asm_3x1n bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_dgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } } } void bli_dgemmsup_rd_haswell_asm_2x8n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // rdx = rax = a // r14 = rbx = b // r9 = unused // r15 = n dim index jj mov(var(n_iter), r15) // jj = n_iter; label(.DLOOP3X4J) // LOOP OVER jj = [ n_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(rdx), rax) // rax = a_ii; lea(mem(r14), rbx) // rbx = b_jj; #if 1 mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c #endif lea(mem(r11, r11, 2), rdi) // rdi = 3*cs_b lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b add(imm(8*8), r10) // r10 += 8*rs_b = 8*8; #else prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 #if 1 prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b #endif vmovupd(mem(rax ), ymm0) 
vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 3 #if 1 prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b add(imm(16*8), r10) // r10 += 8*rs_b = 8*8; #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. 
label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_b = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) label(.DDONE) add(imm(4*8), r12) // c_jj = r12 += 4*cs_c lea(mem(r14, r11, 4), r14) // b_jj = r14 += 4*cs_b dec(r15) // jj -= 1; jne(.DLOOP3X4J) // iterate again if jj != 0. 
label(.DRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 2; const dim_t j_edge = n0 - ( dim_t )n_left; double* restrict cij = c + j_edge*cs_c; double* restrict ai = a; double* restrict bj = b + j_edge*cs_b; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_dgemmsup_rd_haswell_asm_2x2n ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 0 const dim_t nr_cur = 1; //bli_dgemmsup_rd_haswell_asm_2x1n bli_dgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_dgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } } } void bli_dgemmsup_rd_haswell_asm_1x8n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, 
rs_b0, cs_b0,
	  beta, c, rs_c0, cs_c0, data, cntx
	);
	return;
#endif
	// Purpose of this kernel (bli_dgemmsup_rd_haswell_asm_1x8n): update a
	// 1 x n0 row of C as C := beta * C + alpha * A * B, forming each element
	// as a dot product over k0 terms between the single row of A and one
	// column of B. The assembly region handles four columns of B per jj
	// iteration; the n0 % 4 trailing columns are handled after it by
	// bli_dgemmsup_rd_haswell_asm_1x2n and/or bli_ddotxv_ex.
	//
	// m0 is never read (the row count is fixed at 1), and conja/conjb,
	// data and cntx are only forwarded to the edge-case calls. The assembly
	// steps A and B by 8 bytes per k element and stores 4 contiguous doubles
	// of C, i.e. it hard-codes unit stride along k and along the row of C.

	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.

	// k is consumed as k_iter16 chunks of 16, then k_iter4 chunks of 4,
	// then k_left1 single elements.
	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4  = k_left16 / 4;
	uint64_t k_left1  = k_left16 % 4;

	// n is consumed four columns at a time; n_left columns go to the
	// edge-case code after the assembly region.
	uint64_t n_iter = n0 / 4;
	uint64_t n_left = n0 % 4;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// Fewer than four columns: nothing for the assembly region to do.
	if ( n_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), rdx)                   // load address of a.
	//mov(var(rs_a), r8)                 // load rs_a
	//mov(var(cs_a), r9)                 // load cs_a
	//lea(mem(, r8, 8), r8)              // rs_a *= sizeof(double)
	//lea(mem(, r9, 8), r9)              // cs_a *= sizeof(double)

	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), r14)                   // load address of b.
	//mov(var(rs_b), r10)                // load rs_b
	mov(var(cs_b), r11)                // load cs_b
	//lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
	lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)

	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b

	mov(var(c), r12)                   // load address of c
	//mov(var(rs_c), rdi)                // load rs_c
	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

	// r12 = rcx = c
	// rdx = rax = a
	// r14 = rbx = b
	// r9  = unused
	// r15 = n dim index jj

	mov(var(n_iter), r15)              // jj = n_iter;

	label(.DLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
	                                   // (label name is shared with the
	                                   // 3-row variant; this kernel
	                                   // processes 1 row x 4 columns.)

#if 0
	vzeroall()                         // zero all xmm/ymm registers.
#else
	                                   // skylake can execute 3 vxorpd ipc with
	                                   // a latency of 1 cycle, while vzeroall
	                                   // has a latency of 12 cycles.
	                                   // Only the four accumulators used by
	                                   // this 1-row kernel are cleared.
	vxorpd(ymm4,  ymm4,  ymm4)
	vxorpd(ymm7,  ymm7,  ymm7)
	vxorpd(ymm10, ymm10, ymm10)
	vxorpd(ymm13, ymm13, ymm13)
#endif

	lea(mem(r12), rcx)                 // rcx = c_iijj;
	lea(mem(rdx), rax)                 // rax = a_ii;
	lea(mem(r14), rbx)                 // rbx = b_jj;

#if 1
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
	                                   // NOTE(review): this kernel only stores
	                                   // one row of c; the prefetch below
	                                   // touches the following row and looks
	                                   // like a leftover from the multi-row
	                                   // variants -- confirm before removing.
	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
#endif
	lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b

	mov(var(k_iter16), rsi)            // i = k_iter16;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
	                                   // contains the k_iter4 loop.

	label(.DLOOPKITER16)               // MAIN LOOP

	// ---------------------------------- iteration 0

#if 0
	prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
	prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
	prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
	prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
	add(imm(8*8), r10)                 // r10 += 8*rs_b = 8*8;
#else
	prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
	prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
#endif

	vmovupd(mem(rax       ), ymm0)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	// ---------------------------------- iteration 1

#if 1
	prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
	prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
#endif

	vmovupd(mem(rax       ), ymm0)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
#endif

	vmovupd(mem(rax       ), ymm0)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	// ---------------------------------- iteration 3

#if 1
	prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
	prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
	add(imm(16*8), r10)                // r10 += 16*rs_b = 16*8; (one full
	                                   // pass of this loop covers 16 k steps)
#endif

	vmovupd(mem(rax       ), ymm0)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER16)                 // iterate again if i != 0.

	label(.DCONSIDKITER4)

	mov(var(k_iter4), rsi)             // i = k_iter4;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
	                                   // considers k_left1 loop.
	                                   // else, we prepare to enter k_iter4 loop.

	label(.DLOOPKITER4)                // EDGE LOOP (ymm)

	vmovupd(mem(rax       ), ymm0)
	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;

	vmovupd(mem(rbx        ), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovupd(mem(rbx, r11, 1), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovupd(mem(rbx, r11, 2), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovupd(mem(rbx, r13, 1), ymm3)
	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKITER4)                  // iterate again if i != 0.

	label(.DCONSIDKLEFT1)

	mov(var(k_left1), rsi)             // i = k_left1;
	test(rsi, rsi)                     // check i via logical AND.
	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left1 loop.

	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
	                                   // NOTE: The FMAs below must use the ymm
	                                   // registers: using the xmm forms would
	                                   // zero out the high bits of the
	                                   // destination accumulators, which would
	                                   // destroy intermediate results. The
	                                   // vmovsd loads leave the upper lanes of
	                                   // the source registers zero, so those
	                                   // lanes add nothing.

	vmovsd(mem(rax       ), xmm0)
	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;

	vmovsd(mem(rbx        ), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm4)

	vmovsd(mem(rbx, r11, 1), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm7)

	vmovsd(mem(rbx, r11, 2), xmm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	vmovsd(mem(rbx, r13, 1), xmm3)
	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
	vfmadd231pd(ymm0, ymm3, ymm13)

	dec(rsi)                           // i -= 1;
	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.

	label(.DPOSTACCUM)

	                                   // Accumulators in use (one per column
	                                   // of B):
	                                   // ymm4  ymm7  ymm10 ymm13

	vhaddpd( ymm7, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vaddpd( xmm0, xmm1, xmm0 )
	                                   // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)

	vhaddpd( ymm13, ymm10, ymm2 )
	vextractf128(imm(1), ymm2, xmm1 )
	vaddpd( xmm2, xmm1, xmm2 )
	                                   // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)

	vperm2f128(imm(0x20), ymm2, ymm0, ymm4 )

	                                   // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)

	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)

	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate

	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha

	//mov(var(cs_c), rsi)                // load cs_c
	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)

	                                   // now avoid loading C if beta == 0

	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	label(.DROWSTORED)

	vfmadd231pd(mem(rcx), ymm3, ymm4)  // ymm4 += beta * c
	vmovupd(ymm4, mem(rcx))
	//add(rdi, rcx)

	jmp(.DDONE)                        // jump to end.

	label(.DBETAZERO)

	label(.DROWSTORBZ)

	vmovupd(ymm4, mem(rcx))            // beta == 0: overwrite c without
	                                   // reading it.
	//add(rdi, rcx)

	label(.DDONE)

	add(imm(4*8), r12)                 // c_jj = r12 += 4*cs_c

	lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b

	dec(r15)                           // jj -= 1;
	jne(.DLOOP3X4J)                    // iterate again if jj != 0.

	label(.DRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [n_iter]   "m" (n_iter),
	  [k_iter16] "m" (k_iter16),
	  [k_iter4]  "m" (k_iter4),
	  [k_left1]  "m" (k_left1),
	  [a]        "m" (a),
	  [rs_a]     "m" (rs_a),
	  [cs_a]     "m" (cs_a),
	  [b]        "m" (b),
	  [rs_b]     "m" (rs_b),
	  [cs_b]     "m" (cs_b),
	  [alpha]    "m" (alpha),
	  [beta]     "m" (beta),
	  [c]        "m" (c),
	  [rs_c]     "m" (rs_c),
	  [cs_c]     "m" (cs_c)/*,
	  [a_next]   "m" (a_next),
	  [b_next]   "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	consider_edge_cases:

	// Handle edge cases in the n dimension (the n0 % 4 trailing columns),
	// if they exist.
	if ( n_left )
	{
		const dim_t mr_cur = 1;
		const dim_t j_edge = n0 - ( dim_t )n_left;

		double* restrict cij = c + j_edge*cs_c;
		double* restrict ai  = a;
		double* restrict bj  = b + j_edge*cs_b;

		// Two (or three) columns left: peel off a 1x2 block first.
		if ( 2 <= n_left )
		{
			const dim_t nr_cur = 2;

			bli_dgemmsup_rd_haswell_asm_1x2n
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		// A single remaining column of a one-row tile is one dot product.
		if ( 1 == n_left )
		{
#if 0
			const dim_t nr_cur = 1;

			//bli_dgemmsup_rd_haswell_asm_1x1n
			bli_dgemmsup_r_haswell_ref
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
#else
			bli_ddotxv_ex
			(
			  conja, conjb, k0,
			  alpha, ai, cs_a0, bj, rs_b0,
			  beta, cij, cntx, NULL
			);
#endif
		}
	}
}

void bli_dgemmsup_rd_haswell_asm_6x4n
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
#if 0
	bli_dgemmsup_r_haswell_ref
	(
	  conja, conjb, m0, n0, k0,
	  alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0,
	  beta, c, rs_c0, cs_c0, data, cntx
	);
	return;
#endif
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.

	uint64_t k_iter16 = k0 / 16;
	uint64_t k_left16 = k0 % 16;
	uint64_t k_iter4  = k_left16 / 4;
	uint64_t k_left1  = k_left16 % 4;

	uint64_t m_iter = m0 / 3;
	uint64_t m_left = m0 % 3;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	if ( m_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()                         // zero all xmm/ymm registers.
mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj // r10 = unused mov(var(m_iter), r9) // ii = m_iter; label(.DLOOP3X4I) // LOOP OVER ii = [ m_iter .. 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c + 3*ii*rs_c; lea(mem(r14), rax) // rax = a + 3*ii*rs_a; lea(mem(rdx), rbx) // rbx = b; #if 0 mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rdi) // rdi = 5*rs_a mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 
4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_b = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.DLOOP3X4I) // iterate again if ii != 0. label(.DRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 4; const dim_t i_edge = m0 - ( dim_t )m_left; double* restrict cij = c + i_edge*rs_c; double* restrict bj = b; double* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_dgemmsup_rd_haswell_asm_2x4n ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_dgemmsup_rd_haswell_asm_1x4n ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } void bli_dgemmsup_rd_haswell_asm_3x4n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) 
vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. 
label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_b = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vfmadd231pd(ymm2, ymm3, ymm6) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vfmadd231pd(ymm2, ymm3, ymm9) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vfmadd231pd(ymm2, ymm3, ymm12) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) vhaddpd( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm6 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_2x4n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. 
#else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) #endif mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) 
vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_b = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vfmadd231pd(ymm1, ymm3, ymm8) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) vhaddpd( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) vhaddpd( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) vperm2f128(imm(0x20), ymm2, ymm0, ymm5 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. 
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_dgemmsup_rd_haswell_asm_1x4n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; /* rrc: -------- -- -- -- | | | | -------- -- -- -- ... | | | | -------- += -- -- -- | | | | -------- | | | | -------- : -------- : */ // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm13, ymm13, ymm13) #endif mov(var(a), rax) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a //lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), rcx) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 2 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) // ---------------------------------- iteration 3 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_b = 4*8; vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovupd(mem(rbx, r11, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovupd(mem(rbx, r11, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovupd(mem(rbx, r13, 1), ymm3) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovsd(mem(rax ), xmm0) add(imm(1*8), rax) // a += 1*cs_b = 1*8; vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vmovsd(mem(rbx, r11, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm7) vmovsd(mem(rbx, r11, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vmovsd(mem(rbx, r13, 1), xmm3) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddpd( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddpd( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vperm2f128(imm(0x20), ymm2, ymm0, ymm4 ) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(ymm0, ymm4, ymm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_6x2n: inline-assembly double-precision kernel that updates a 6x2 tile of C as C := beta*C + alpha*A*B; when beta == 0 the tile is stored without reading C. The twelve accumulators ymm4..ymm15 each hold the partial dot product of one of six rows of A (row stride rs_a) with one of two columns of B (column stride cs_b). k0 is consumed by a 4x-unrolled loop handling 16 elements per pass, then a loop handling 4 elements per pass, then a one-element remainder loop. Both operands are advanced along k by fixed byte counts, and each row of C is written as two adjacent doubles with row stride rs_c, so cs_a, rs_b and cs_c are passed as asm operands but never read. conja, conjb, m0, n0, data and cntx are referenced only inside the disabled #if 0 reference path. */ void bli_dgemmsup_rd_haswell_asm_6x2n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) vxorpd(ymm10, ymm10, ymm10) vxorpd(ymm11, ymm11, ymm11) vxorpd(ymm12, ymm12, ymm12) vxorpd(ymm13, ymm13, ymm13) vxorpd(ymm14, ymm14, ymm14) vxorpd(ymm15, ymm15, ymm15) #endif lea(mem(rcx, rdi, 2), r10) // lea(mem(r10, rdi, 1), r10) // r10 = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c prefetch(0, mem(r10, 1*8)) // prefetch c + 3*rs_c prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) 
vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovupd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovupd(mem(rax, r8, 4), ymm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovupd(mem(rax, r15, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovsd(mem(rax, r8, 2), xmm3) vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) vmovsd(mem(rax, r13, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vmovsd(mem(rax, r8, 4), xmm3) vfmadd231pd(ymm0, ymm3, ymm12) vfmadd231pd(ymm1, ymm3, ymm13) vmovsd(mem(rax, r15, 1), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) vhaddpd( ymm9, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm8 ) vhaddpd( ymm11, ymm10, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm10 ) vhaddpd( ymm13, ymm12, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm12 ) vhaddpd( ymm15, ymm14, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm14 ) // xmm4 = sum(ymm4) sum(ymm5) // xmm6 = sum(ymm6) sum(ymm7) // xmm8 = sum(ymm8) sum(ymm9) // xmm10 = sum(ymm10) sum(ymm11) // xmm12 = sum(ymm12) sum(ymm13) // xmm14 = sum(ymm14) sum(ymm15) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. 
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm10) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm12) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm14) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_3x2n: inline-assembly double-precision kernel that updates a 3x2 tile of C as C := beta*C + alpha*A*B; when beta == 0 the tile is stored without reading C. The six accumulators ymm4..ymm9 each hold the partial dot product of one of three rows of A (row stride rs_a) with one of two columns of B (column stride cs_b). k0 is consumed by a 4x-unrolled loop handling 16 elements per pass, then a loop handling 4 elements per pass, then a one-element remainder loop. Both operands are advanced along k by fixed byte counts, and each row of C is written as two adjacent doubles with row stride rs_c, so cs_a, rs_b and cs_c are passed as asm operands but never read. conja, conjb, m0, n0, data and cntx are referenced only inside the disabled #if 0 reference path. */ void bli_dgemmsup_rd_haswell_asm_3x2n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data 
); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) vxorpd(ymm8, ymm8, ymm8) vxorpd(ymm9, ymm9, ymm9) #endif mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. 
label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovupd(mem(rax, r8, 2), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vmovsd(mem(rax, r8, 2), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) vhaddpd( ymm9, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm8 ) // xmm4 = sum(ymm4) sum(ymm5) // xmm6 = sum(ymm6) sum(ymm7) // xmm8 = sum(ymm8) sum(ymm9) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_2x2n: inline-assembly double-precision kernel that updates a 2x2 tile of C as C := beta*C + alpha*A*B; when beta == 0 the tile is stored without reading C. The four accumulators ymm4..ymm7 each hold the partial dot product of one of two rows of A (row stride rs_a) with one of two columns of B (column stride cs_b). k0 is consumed by a 4x-unrolled loop handling 16 elements per pass, then a loop handling 4 elements per pass, then a one-element remainder loop. Both operands are advanced along k by fixed byte counts, and each row of C is written as two adjacent doubles with row stride rs_c, so cs_a, rs_b and cs_c are passed as asm operands but never read. conja, conjb, m0, n0, data and cntx are referenced only inside the disabled #if 0 reference path. */ void bli_dgemmsup_rd_haswell_asm_2x2n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; /* rrc: -------- -- -- -- | | -------- -- -- -- ... 
| | -------- += -- -- -- | | -------- -- -- -- | | -------- -- -- -- : -------- -- -- -- : */ // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) vxorpd(ymm6, ymm6, ymm6) vxorpd(ymm7, ymm7, ymm7) #endif mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovupd(mem(rax, r8, 1), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) vmovsd(mem(rax, r8, 1), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5) vhaddpd( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm6 ) // xmm4 = sum(ymm4) sum(ymm5) // xmm6 = sum(ymm6) sum(ymm7) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. 
vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_dgemmsup_rd_haswell_asm_1x2n: inline-assembly double-precision kernel that updates a 1x2 tile of C as C := beta*C + alpha*A*B; when beta == 0 the tile is stored without reading C. The two accumulators ymm4 and ymm5 hold the partial dot products of a single row of A with two columns of B (column stride cs_b). k0 is consumed by a 4x-unrolled loop handling 16 elements per pass, then a loop handling 4 elements per pass, then a one-element remainder loop. Both operands are advanced along k by fixed byte counts and only one row of C (two adjacent doubles) is written, so rs_a, cs_a, rs_b, rs_c and cs_c are passed as asm operands but never read. conja, conjb, m0, n0, data and cntx are referenced only inside the disabled #if 0 reference path. */ void bli_dgemmsup_rd_haswell_asm_1x2n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, double* restrict alpha, double* restrict a, inc_t rs_a0, inc_t cs_a0, double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_dgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter16 = k0 / 16; uint64_t k_left16 = k0 % 16; uint64_t k_iter4 = k_left16 / 4; uint64_t k_left1 = k_left16 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; /* rrc: -------- -- -- -- | | -------- -- -- -- ... | | -------- += -- -- -- | | -------- -- -- -- | | -------- -- -- -- : -------- -- -- -- : */ // ------------------------------------------------------------------------- begin_asm() #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorpd(ymm4, ymm4, ymm4) vxorpd(ymm5, ymm5, ymm5) #endif mov(var(a), rax) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a //lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) //lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
label(.DLOOPKITER16) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 2 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) // ---------------------------------- iteration 3 vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. label(.DCONSIDKITER4) mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. label(.DLOOPKITER4) // EDGE LOOP (ymm) vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vmovupd(mem(rax ), ymm3) add(imm(4*8), rax) // a += 4*cs_a = 4*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. label(.DCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
label(.DLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vmovsd(mem(rax ), xmm3) add(imm(1*8), rax) // a += 1*cs_a = 1*8; vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. label(.DPOSTACCUM) // ymm4 ymm5 vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm5) // xmm4 = sum(ymm4) sum(ymm5) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate vmulpd(xmm0, xmm4, xmm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case label(.DROWSTORED) vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
label(.DBETAZERO) label(.DROWSTORBZ) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter16] "m" (k_iter16), [k_iter4] "m" (k_iter4), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/s6x16/000077500000000000000000000000001427272030600226005ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c000066400000000000000000000167331427272030600310650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... -------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */ // Prototype reference microkernels. 
GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) // NOTE: Normally, for any "?x1" kernel, we would call the reference kernel. // However, at least one other subconfiguration (zen) uses this kernel set, so // we need to be able to call a set of "?x1" kernels that we know will actually // exist regardless of which subconfiguration these kernels were used by. Thus, // the compromise employed here is to inline the reference kernel so it gets // compiled as part of the haswell kernel set, and hence can unconditionally be // called by other kernels within that kernel set. #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mdim ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ for ( dim_t i = 0; i < mdim; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ /* for ( dim_t j = 0; j < 1; ++j ) */ \ { \ ctype* restrict cij = ci /*[ j*cs_c ]*/ ; \ ctype* restrict bj = b /*[ j*cs_b ]*/ ; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. 
*/ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(d,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } GENTFUNC( float, s, gemmsup_r_haswell_ref_6x1, 6 ) GENTFUNC( float, s, gemmsup_r_haswell_ref_5x1, 5 ) GENTFUNC( float, s, gemmsup_r_haswell_ref_4x1, 4 ) GENTFUNC( float, s, gemmsup_r_haswell_ref_3x1, 3 ) GENTFUNC( float, s, gemmsup_r_haswell_ref_2x1, 2 ) GENTFUNC( float, s, gemmsup_r_haswell_ref_1x1, 1 ) // ----------------------------------------------------------------------------- #if 0 // Temporary definition of general-purpose sup kernel. #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cij = &ci[ j*cs_c ]; \ ctype* restrict bj = &b [ j*cs_b ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. 
*/ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(d,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } GENTFUNC( float, s, gemmsup_r_haswell_ref ) #endif cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c000066400000000000000000001326601427272030600312330ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
   IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
   DAMAGE.

*/

#include "blis.h"

#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"

/*
   rrc:
	 --------        ------        | | | | | | | |
	 --------        ------        | | | | | | | |
	 --------   +=   ------ ...    | | | | | | | |
	 --------        ------        | | | | | | | |
	 --------        ------              :
	 --------        ------              :

   Assumptions:
   - C is row-stored and B is column-stored;
   - A is row-stored;
   - m0 and n0 are at most MR and NR, respectively.
   Therefore, this (r)ow-preferential microkernel is well-suited for
   a dot-product-based accumulation that performs vector loads from
   both A and B.
*/

// Prototype reference microkernels.
GEMMSUP_KER_PROT( float,    s, gemmsup_r_haswell_ref )

/*
   bli_sgemmsup_rd_haswell_asm_6x1

   Single-precision dot-product kernel for a 6x1 tile of C. For each of six
   rows i it accumulates the dot product of row i of A with B over k0
   elements and then stores one float per row:

     c[ i*rs_c0 ] = alpha * dot_i                          if *beta == 0
     c[ i*rs_c0 ] = alpha * dot_i + beta * c[ i*rs_c0 ]    otherwise

   What the code relies on:
   - Consecutive rows of A are rs_a0 elements apart. Elements within a row
     of A, and the elements of B, are read with contiguous 8-float vector
     loads and fixed pointer increments, so cs_a0 and rs_b0 are never read.
   - cs_b0 and cs_c0 are passed to the asm block as operands but are not
     referenced by any active instruction.
   - conja, conjb, m0, n0, data and cntx are not used.
   - k0 is split into k0/32 trips of a four-times unrolled loop (8 floats
     per unrolled step), (k0%32)/8 trips of an 8-float loop, and k0%8
     one-float trips.
*/
void bli_sgemmsup_rd_haswell_asm_6x1
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter32 = k0 / 32;
	uint64_t k_left32 = k0 % 32;
	uint64_t k_iter8  = k_left32 / 8;
	uint64_t k_left1  = k_left32 % 8;

	//uint64_t m_iter = m0 / 3;
	//uint64_t m_left = m0 % 3;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	//mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
	//lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	//mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)                // load cs_b
	//lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)

	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)

	// r12 = rcx = c
	// r14 = rax = a
	// rdx = rbx = b

	// Zero the six accumulators, one per row of the tile.
#if 0
	vzeroall()                         // zero all xmm/ymm registers.
#else
	                                   // skylake can execute 3 vxorps ipc with
	                                   // a latency of 1 cycle, while vzeroall
	                                   // has a latency of 12 cycles.
	vxorps(ymm4,  ymm4,  ymm4)
	vxorps(ymm6,  ymm6,  ymm6)
	vxorps(ymm8,  ymm8,  ymm8)
	vxorps(ymm10, ymm10, ymm10)
	vxorps(ymm12, ymm12, ymm12)
	vxorps(ymm14, ymm14, ymm14)
#endif

	//lea(mem(r12), rcx)                 // rcx = c_ii;
	//lea(mem(r14), rax)                 // rax = a_ii;
	//lea(mem(rdx), rbx)                 // rbx = b;

#if 1
	//mov(var(rs_c), rdi)                // load rs_c
	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
	lea(mem(rcx, rdi, 2), r10)         // r10 = c + 2*rs_c;
	lea(mem(r10, rdi, 1), r10)         // r10 = c + 3*rs_c;
	prefetch(0, mem(rcx,         0*4)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 0*4)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 0*4)) // prefetch c + 2*rs_c
	prefetch(0, mem(r10,         0*4)) // prefetch c + 3*rs_c
	prefetch(0, mem(r10, rdi, 1, 0*4)) // prefetch c + 4*rs_c
	prefetch(0, mem(r10, rdi, 2, 0*4)) // prefetch c + 5*rs_c
#endif

	mov(var(k_iter32), rsi)            // i = k_iter32;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
	                                   // contains the k_iter8 loop.

	label(.SLOOPKITER32)               // MAIN LOOP

	// Each unrolled step loads 8 floats of b into ymm0 and, for each of the
	// six rows of a, loads 8 floats into ymm3 and accumulates ymm0*ymm3 into
	// that row's accumulator.

	// ---------------------------------- iteration 0

#if 0
	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

	vmovups(mem(rbx ), ymm0)
	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;

	vmovups(mem(rax ), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm4)

	vmovups(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vmovups(mem(rax, r8, 2), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm8)

	vmovups(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vmovups(mem(rax, r8, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm12)

	vmovups(mem(rax, r15, 1), ymm3)
	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
	vfmadd231ps(ymm0, ymm3, ymm14)

	// ---------------------------------- iteration 1

	vmovups(mem(rbx ), ymm0)
	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;

	vmovups(mem(rax ), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm4)

	vmovups(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vmovups(mem(rax, r8, 2), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm8)

	vmovups(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vmovups(mem(rax, r8, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm12)

	vmovups(mem(rax, r15, 1), ymm3)
	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
	vfmadd231ps(ymm0, ymm3, ymm14)

	// ---------------------------------- iteration 2

#if 0
	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

	vmovups(mem(rbx ), ymm0)
	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;

	vmovups(mem(rax ), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm4)

	vmovups(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vmovups(mem(rax, r8, 2), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm8)

	vmovups(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vmovups(mem(rax, r8, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm12)

	vmovups(mem(rax, r15, 1), ymm3)
	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
	vfmadd231ps(ymm0, ymm3, ymm14)

	// ---------------------------------- iteration 3

	vmovups(mem(rbx ), ymm0)
	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;

	vmovups(mem(rax ), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm4)

	vmovups(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vmovups(mem(rax, r8, 2), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm8)

	vmovups(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vmovups(mem(rax, r8, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm12)

	vmovups(mem(rax, r15, 1), ymm3)
	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
	vfmadd231ps(ymm0, ymm3, ymm14)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKITER32)                 // iterate again if i != 0.

	label(.SCONSIDKITER8)

	mov(var(k_iter8), rsi)             // i = k_iter8;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
	                                   // considers k_left1 loop.
	                                   // else, we prepare to enter k_iter8 loop.

	label(.SLOOPKITER8)                // EDGE LOOP (ymm)

#if 0
	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

	vmovups(mem(rbx ), ymm0)
	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;

	vmovups(mem(rax ), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm4)

	vmovups(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vmovups(mem(rax, r8, 2), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm8)

	vmovups(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vmovups(mem(rax, r8, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm12)

	vmovups(mem(rax, r15, 1), ymm3)
	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
	vfmadd231ps(ymm0, ymm3, ymm14)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKITER8)                  // iterate again if i != 0.

	label(.SCONSIDKLEFT1)

	mov(var(k_left1), rsi)             // i = k_left1;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left1 loop.

	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
	                                   // NOTE: We must use ymm registers here bc
	                                   // using the xmm registers would zero out the
	                                   // high bits of the destination registers,
	                                   // which would destroy intermediate results.

	vmovss(mem(rbx ), xmm0)
	add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;

	vmovss(mem(rax ), xmm3)
	vfmadd231ps(ymm0, ymm3, ymm4)

	vmovss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vmovss(mem(rax, r8, 2), xmm3)
	vfmadd231ps(ymm0, ymm3, ymm8)

	vmovss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vmovss(mem(rax, r8, 4), xmm3)
	vfmadd231ps(ymm0, ymm3, ymm12)

	vmovss(mem(rax, r15, 1), xmm3)
	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
	vfmadd231ps(ymm0, ymm3, ymm14)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.

	label(.SPOSTACCUM)

	                                   // ymm4
	                                   // ymm6
	                                   // ymm8
	                                   // ymm10
	                                   // ymm12
	                                   // ymm14

	// Horizontally reduce each 8-float accumulator to a single sum.
	// NOTE(review): the odd-numbered registers (ymm5, ymm7, ...) read by the
	// first vhaddps of each group are never initialized in this function.
	// Their values appear to reach only result lanes other than element 0,
	// and only element 0 is stored below (vmovss) -- confirm before relying
	// on any other lane.

	vhaddps( ymm5, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vhaddps( xmm1, xmm0, xmm0 )
	vpermilps(imm(0xd8), xmm0, xmm0)
	vhaddps( xmm0, xmm0, xmm4 )

	vhaddps( ymm7, ymm6, ymm2 )
	vextractf128(imm(1), ymm2, xmm3 )
	vhaddps( xmm3, xmm2, xmm2 )
	vpermilps(imm(0xd8), xmm2, xmm2)
	vhaddps( xmm2, xmm2, xmm6 )

	vhaddps( ymm9, ymm8, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vhaddps( xmm1, xmm0, xmm0 )
	vpermilps(imm(0xd8), xmm0, xmm0)
	vhaddps( xmm0, xmm0, xmm8 )

	vhaddps( ymm11, ymm10, ymm2 )
	vextractf128(imm(1), ymm2, xmm3 )
	vhaddps( xmm3, xmm2, xmm2 )
	vpermilps(imm(0xd8), xmm2, xmm2)
	vhaddps( xmm2, xmm2, xmm10 )

	vhaddps( ymm13, ymm12, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vhaddps( xmm1, xmm0, xmm0 )
	vpermilps(imm(0xd8), xmm0, xmm0)
	vhaddps( xmm0, xmm0, xmm12 )

	vhaddps( ymm15, ymm14, ymm2 )
	vextractf128(imm(1), ymm2, xmm3 )
	vhaddps( xmm3, xmm2, xmm2 )
	vpermilps(imm(0xd8), xmm2, xmm2)
	vhaddps( xmm2, xmm2, xmm14 )

	                                   // xmm4[0]  = sum(ymm4)
	                                   // xmm6[0]  = sum(ymm6)
	                                   // xmm8[0]  = sum(ymm8)
	                                   // xmm10[0] = sum(ymm10)
	                                   // xmm12[0] = sum(ymm12)
	                                   // xmm14[0] = sum(ymm14)

	//mov(var(rs_c), rdi)                // load rs_c
	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)

	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate

	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
	vmulps(xmm0, xmm6, xmm6)
	vmulps(xmm0, xmm8, xmm8)
	vmulps(xmm0, xmm10, xmm10)
	vmulps(xmm0, xmm12, xmm12)
	vmulps(xmm0, xmm14, xmm14)

	//mov(var(cs_c), rsi)                // load cs_c
	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)

	                                   // now avoid loading C if beta == 0

	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	label(.SROWSTORED)

	// beta != 0: for each row, load c, accumulate beta*c into the scaled sum,
	// store element 0 back, and advance rcx by rs_c bytes.

	vmovss(mem(rcx), xmm0)
	vfmadd231ps(xmm0, xmm3, xmm4)
	vmovss(xmm4, mem(rcx))
	add(rdi, rcx)

	vmovss(mem(rcx), xmm0)
	vfmadd231ps(xmm0, xmm3, xmm6)
	vmovss(xmm6, mem(rcx))
	add(rdi, rcx)

	vmovss(mem(rcx), xmm0)
	vfmadd231ps(xmm0, xmm3, xmm8)
	vmovss(xmm8, mem(rcx))
	add(rdi, rcx)

	vmovss(mem(rcx), xmm0)
	vfmadd231ps(xmm0, xmm3, xmm10)
	vmovss(xmm10, mem(rcx))
	add(rdi, rcx)

	vmovss(mem(rcx), xmm0)
	vfmadd231ps(xmm0, xmm3, xmm12)
	vmovss(xmm12, mem(rcx))
	add(rdi, rcx)

	vmovss(mem(rcx), xmm0)
	vfmadd231ps(xmm0, xmm3, xmm14)
	vmovss(xmm14, mem(rcx))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SBETAZERO)

	label(.SROWSTORBZ)

	// beta == 0: store the scaled sums without reading c.

	vmovss(xmm4, mem(rcx))
	add(rdi, rcx)

	vmovss(xmm6, mem(rcx))
	add(rdi, rcx)

	vmovss(xmm8, mem(rcx))
	add(rdi, rcx)

	vmovss(xmm10, mem(rcx))
	add(rdi, rcx)

	vmovss(xmm12, mem(rcx))
	add(rdi, rcx)

	vmovss(xmm14, mem(rcx))
	//add(rdi, rcx)

	label(.SDONE)

	label(.SRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter32] "m" (k_iter32),
	  [k_iter8]  "m" (k_iter8),
	  [k_left1]  "m" (k_left1),
	  [a]        "m" (a),
	  [rs_a]     "m" (rs_a),
	  [cs_a]     "m" (cs_a),
	  [b]        "m" (b),
	  [rs_b]     "m" (rs_b),
	  [cs_b]     "m" (cs_b),
	  [alpha]    "m" (alpha),
	  [beta]     "m" (beta),
	  [c]        "m" (c),
	  [rs_c]     "m" (rs_c),
	  [cs_c]     "m" (cs_c)/*,
	  [a_next]   "m" (a_next),
	  [b_next]   "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

/*
   bli_sgemmsup_rd_haswell_asm_3x1

   Single-precision dot-product kernel for a 3x1 tile of C. For each of
   three rows i it accumulates the dot product of row i of A with B over k0
   elements and then stores one float per row:

     c[ i*rs_c0 ] = alpha * dot_i                          if *beta == 0
     c[ i*rs_c0 ] = alpha * dot_i + beta * c[ i*rs_c0 ]    otherwise

   What the code relies on:
   - Consecutive rows of A are rs_a0 elements apart. Elements within a row
     of A, and the elements of B, are read with contiguous 8-float vector
     loads and fixed pointer increments, so cs_a0 and rs_b0 are never read.
   - cs_b0 and cs_c0 are passed to the asm block as operands but are not
     referenced by any active instruction.
   - conja, conjb, m0, n0, data and cntx are not used.
   - k0 is split into k0/32 trips of a four-times unrolled loop (8 floats
     per unrolled step), (k0%32)/8 trips of an 8-float loop, and k0%8
     one-float trips.
*/
void bli_sgemmsup_rd_haswell_asm_3x1
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter32 = k0 / 32;
	uint64_t k_left32 = k0 % 32;
	uint64_t k_iter8  = k_left32 / 8;
	uint64_t k_left1  = k_left32 % 8;

	//uint64_t m_iter = m0 / 3;
	//uint64_t m_left = m0 % 3;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	//mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
	//lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)

	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	//mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)                // load cs_b
	//lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)

	//lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)

	// r12 = rcx = c
	// r14 = rax = a
	// rdx = rbx = b

	// Zero the three accumulators, one per row of the tile.
#if 0
	vzeroall()                         // zero all xmm/ymm registers.
#else
	                                   // skylake can execute 3 vxorps ipc with
	                                   // a latency of 1 cycle, while vzeroall
	                                   // has a latency of 12 cycles.
	vxorps(ymm4,  ymm4,  ymm4)
	vxorps(ymm6,  ymm6,  ymm6)
	vxorps(ymm8,  ymm8,  ymm8)
#endif

	//lea(mem(r12), rcx)                 // rcx = c_ii;
	//lea(mem(r14), rax)                 // rax = a_ii;
	//lea(mem(rdx), rbx)                 // rbx = b;

#if 1
	//mov(var(rs_c), rdi)                // load rs_c
	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
	//lea(mem(rcx, rdi, 2), r10)         //
	//lea(mem(r10, rdi, 1), r10)         // r10 = c + 3*rs_c;
	prefetch(0, mem(rcx,         0*4)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 0*4)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 0*4)) // prefetch c + 2*rs_c
#endif

	mov(var(k_iter32), rsi)            // i = k_iter32;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SCONSIDKITER8)                 // if i == 0, jump to code that
	                                   // contains the k_iter8 loop.

	label(.SLOOPKITER32)               // MAIN LOOP

	// Each unrolled step loads 8 floats of b into ymm0 and, for each of the
	// three rows of a, loads 8 floats into ymm3 and accumulates ymm0*ymm3
	// into that row's accumulator.

	// ---------------------------------- iteration 0

#if 0
	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

	vmovups(mem(rbx ), ymm0)
	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;

	vmovups(mem(rax ), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm4)

	vmovups(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vmovups(mem(rax, r8, 2), ymm3)
	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
	vfmadd231ps(ymm0, ymm3, ymm8)

	// ---------------------------------- iteration 1

	vmovups(mem(rbx ), ymm0)
	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;

	vmovups(mem(rax ), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm4)

	vmovups(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vmovups(mem(rax, r8, 2), ymm3)
	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
	vfmadd231ps(ymm0, ymm3, ymm8)

	// ---------------------------------- iteration 2

#if 0
	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

	vmovups(mem(rbx ), ymm0)
	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;

	vmovups(mem(rax ), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm4)

	vmovups(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vmovups(mem(rax, r8, 2), ymm3)
	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
	vfmadd231ps(ymm0, ymm3, ymm8)

	// ---------------------------------- iteration 3

	vmovups(mem(rbx ), ymm0)
	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;

	vmovups(mem(rax ), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm4)

	vmovups(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vmovups(mem(rax, r8, 2), ymm3)
	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
	vfmadd231ps(ymm0, ymm3, ymm8)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKITER32)                 // iterate again if i != 0.

	label(.SCONSIDKITER8)

	mov(var(k_iter8), rsi)             // i = k_iter8;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
	                                   // considers k_left1 loop.
	                                   // else, we prepare to enter k_iter8 loop.

	label(.SLOOPKITER8)                // EDGE LOOP (ymm)

#if 0
	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

	vmovups(mem(rbx ), ymm0)
	add(imm(8*4), rbx)                 // b += 8*rs_b = 8*4;

	vmovups(mem(rax ), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm4)

	vmovups(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vmovups(mem(rax, r8, 2), ymm3)
	add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
	vfmadd231ps(ymm0, ymm3, ymm8)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKITER8)                  // iterate again if i != 0.

	label(.SCONSIDKLEFT1)

	mov(var(k_left1), rsi)             // i = k_left1;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left1 loop.

	label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
	                                   // NOTE: We must use ymm registers here bc
	                                   // using the xmm registers would zero out the
	                                   // high bits of the destination registers,
	                                   // which would destroy intermediate results.

	vmovss(mem(rbx ), xmm0)
	add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;

	vmovss(mem(rax ), xmm3)
	vfmadd231ps(ymm0, ymm3, ymm4)

	vmovss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vmovss(mem(rax, r8, 2), xmm3)
	add(imm(1*4), rax)                 // a += 1*cs_a = 1*4;
	vfmadd231ps(ymm0, ymm3, ymm8)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKLEFT1)                  // iterate again if i != 0.

	label(.SPOSTACCUM)

	                                   // ymm4
	                                   // ymm6
	                                   // ymm8

	// Horizontally reduce each 8-float accumulator to a single sum.
	// NOTE(review): the odd-numbered registers (ymm5, ymm7, ymm9) read by the
	// first vhaddps of each group are never initialized in this function.
	// Their values appear to reach only result lanes other than element 0,
	// and only element 0 is stored below (vmovss) -- confirm before relying
	// on any other lane.

	vhaddps( ymm5, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vhaddps( xmm1, xmm0, xmm0 )
	vpermilps(imm(0xd8), xmm0, xmm0)
	vhaddps( xmm0, xmm0, xmm4 )

	vhaddps( ymm7, ymm6, ymm2 )
	vextractf128(imm(1), ymm2, xmm3 )
	vhaddps( xmm3, xmm2, xmm2 )
	vpermilps(imm(0xd8), xmm2, xmm2)
	vhaddps( xmm2, xmm2, xmm6 )

	vhaddps( ymm9, ymm8, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vhaddps( xmm1, xmm0, xmm0 )
	vpermilps(imm(0xd8), xmm0, xmm0)
	vhaddps( xmm0, xmm0, xmm8 )

	                                   // xmm4[0]  = sum(ymm4)
	                                   // xmm6[0]  = sum(ymm6)
	                                   // xmm8[0]  = sum(ymm8)

	//mov(var(rs_c), rdi)                // load rs_c
	//lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)

	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
	vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate

	vmulps(xmm0, xmm4, xmm4)           // scale by alpha
	vmulps(xmm0, xmm6, xmm6)
	vmulps(xmm0, xmm8, xmm8)

	//mov(var(cs_c), rsi)                // load cs_c
	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(float)

	                                   // now avoid loading C if beta == 0

	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	label(.SROWSTORED)

	// beta != 0: for each row, load c, accumulate beta*c into the scaled sum,
	// store element 0 back, and advance rcx by rs_c bytes.

	vmovss(mem(rcx), xmm0)
	vfmadd231ps(xmm0, xmm3, xmm4)
	vmovss(xmm4, mem(rcx))
	add(rdi, rcx)

	vmovss(mem(rcx), xmm0)
	vfmadd231ps(xmm0, xmm3, xmm6)
	vmovss(xmm6, mem(rcx))
	add(rdi, rcx)

	vmovss(mem(rcx), xmm0)
	vfmadd231ps(xmm0, xmm3, xmm8)
	vmovss(xmm8, mem(rcx))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SBETAZERO)

	label(.SROWSTORBZ)

	// beta == 0: store the scaled sums without reading c.

	vmovss(xmm4, mem(rcx))
	add(rdi, rcx)

	vmovss(xmm6, mem(rcx))
	add(rdi, rcx)

	vmovss(xmm8, mem(rcx))
	//add(rdi, rcx)

	label(.SDONE)

	label(.SRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter32] "m" (k_iter32),
	  [k_iter8]  "m" (k_iter8),
	  [k_left1]  "m" (k_left1),
	  [a]        "m" (a),
	  [rs_a]     "m" (rs_a),
	  [cs_a]     "m" (cs_a),
	  [b]        "m" (b),
	  [rs_b]     "m" (rs_b),
	  [cs_b]     "m" (cs_b),
	  [alpha]    "m" (alpha),
	  [beta]     "m" (beta),
	  [c]        "m" (c),
	  [rs_c]     "m" (rs_c),
	  [cs_c]     "m" (cs_c)/*,
	  [a_next]   "m" (a_next),
	  [b_next]   "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

void bli_sgemmsup_rd_haswell_asm_2x1
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter32 = k0 / 32;
	uint64_t k_left32 = k0 % 32;
	uint64_t k_iter8  = k_left32 / 8;
	uint64_t k_left1  = k_left32 % 8;

	//uint64_t m_iter = m0 / 3;
	//uint64_t m_left = m0 % 3;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), rax)                   // load address of a.
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm6, ymm6, ymm6) #endif //lea(mem(r12), rcx) // rcx = c_ii; //lea(mem(r14), rax) // rax = a_ii; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) //lea(mem(rcx, rdi, 2), r10) // //lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 0*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 0*4)) // prefetch c + 1*rs_c #endif mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rbx ), ymm0) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rax, r8, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rax, r8, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rbx ), ymm0) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rax, r8, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) // ---------------------------------- iteration 3 vmovups(mem(rbx ), ymm0) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rax, r8, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. 
label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rbx ), ymm0) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rax, r8, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovss(mem(rbx ), xmm0) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; vmovss(mem(rax ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovss(mem(rax, r8, 1), xmm3) add(imm(1*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. 
label(.SPOSTACCUM) // ymm4 // ymm6 vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm4 ) vhaddps( ymm7, ymm6, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm6 ) // xmm4[0] = sum(ymm4) // xmm6[0] = sum(ymm6) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovss(xmm4, mem(rcx)) add(rdi, rcx) vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovss(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SBETAZERO) label(.SROWSTORBZ) vmovss(xmm4, mem(rcx)) add(rdi, rcx) vmovss(xmm6, mem(rcx)) //add(rdi, rcx) label(.SDONE) label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) }

// bli_sgemmsup_rd_haswell_asm_1x1
//
// Single-precision dot-product kernel for one element of C:
//
//   c[0] := beta * c[0] + alpha * sum_{p < k0} a[p] * b[p]
//
// As coded below:
// - k0 is split into k_iter32 passes of 32 elements (4 x 8-wide FMA),
//   then k_iter8 passes of 8 elements, then k_left1 scalar steps.
// - a and b are advanced by hard-coded byte counts (8*4 or 1*4), i.e.
//   consecutive k elements are read from consecutive floats; cs_a and
//   rs_b are passed as operands but never referenced by the assembly.
// - rs_a and rs_c are loaded and scaled, but no enabled instruction
//   that affects the result uses them in this 1-row kernel.
// - conja, conjb, m0, n0, data and cntx are not read.
// - C is not loaded when beta compares equal to zero.
void bli_sgemmsup_rd_haswell_asm_1x1
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter32 = k0 / 32;        // passes of the 4x-unrolled 8-wide loop
	uint64_t k_left32 = k0 % 32;
	uint64_t k_iter8 = k_left32 / 8;    // remaining full 8-wide steps
	uint64_t k_left1 = k_left32 % 8;    // remaining scalar steps

	//uint64_t m_iter = m0 / 3;
	//uint64_t m_left = m0 % 3;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	//vzeroall() // zero all xmm/ymm registers.

	mov(var(a), rax) // load address of a.
	mov(var(rs_a), r8) // load rs_a (r8 is only referenced by disabled code below)
	//mov(var(cs_a), r9) // load cs_a
	lea(mem(, r8, 4), r8) // rs_a *= sizeof(float)
	//lea(mem(, r9, 4), r9) // cs_a *= sizeof(float)

	//lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

	mov(var(b), rbx) // load address of b.
	//mov(var(rs_b), r10) // load rs_b
	//mov(var(cs_b), r11) // load cs_b
	//lea(mem(, r10, 4), r10) // rs_b *= sizeof(float)
	//lea(mem(, r11, 4), r11) // cs_b *= sizeof(float)

	//lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b
	//lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a

	mov(var(c), rcx) // load address of c
	mov(var(rs_c), rdi) // load rs_c
	lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)

	// r12 = rcx = c
	// r14 = rax = a
	// rdx = rbx = b

#if 0
	vzeroall() // zero all xmm/ymm registers.
#else
	// skylake can execute 3 vxorps ipc with
	// a latency of 1 cycle, while vzeroall
	// has a latency of 12 cycles.
	vxorps(ymm4, ymm4, ymm4) // ymm4 is the only accumulator of this kernel
#endif

	//lea(mem(r12), rcx) // rcx = c_ii;
	//lea(mem(r14), rax) // rax = a_ii;
	//lea(mem(rdx), rbx) // rbx = b;

#if 1
	//mov(var(rs_c), rdi) // load rs_c
	//lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)
	//lea(mem(rcx, rdi, 2), r10) //
	//lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 0*4)) // prefetch c + 0*rs_c
#endif

	mov(var(k_iter32), rsi) // i = k_iter32;
	test(rsi, rsi) // check i via logical AND.
	je(.SCONSIDKITER8) // if i == 0, jump to code that
	                   // contains the k_iter8 loop.

	label(.SLOOPKITER32) // MAIN LOOP (4 x 8 floats per pass)

	// ---------------------------------- iteration 0

#if 0
	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
	prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

	vmovups(mem(rbx ), ymm0)
	add(imm(8*4), rbx) // b += 8*rs_b = 8*4;

	vmovups(mem(rax ), ymm3)
	add(imm(8*4), rax) // a += 8*cs_a = 8*4;
	vfmadd231ps(ymm0, ymm3, ymm4) // ymm4 += a[0:8] * b[0:8]

	// ---------------------------------- iteration 1

	vmovups(mem(rbx ), ymm0)
	add(imm(8*4), rbx) // b += 8*rs_b = 8*4;

	vmovups(mem(rax ), ymm3)
	add(imm(8*4), rax) // a += 8*cs_a = 8*4;
	vfmadd231ps(ymm0, ymm3, ymm4)

	// ---------------------------------- iteration 2

#if 0
	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
	prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

	vmovups(mem(rbx ), ymm0)
	add(imm(8*4), rbx) // b += 8*rs_b = 8*4;

	vmovups(mem(rax ), ymm3)
	add(imm(8*4), rax) // a += 8*cs_a = 8*4;
	vfmadd231ps(ymm0, ymm3, ymm4)

	// ---------------------------------- iteration 3

	vmovups(mem(rbx ), ymm0)
	add(imm(8*4), rbx) // b += 8*rs_b = 8*4;

	vmovups(mem(rax ), ymm3)
	add(imm(8*4), rax) // a += 8*cs_a = 8*4;
	vfmadd231ps(ymm0, ymm3, ymm4)

	dec(rsi) // i -= 1;
	jne(.SLOOPKITER32) // iterate again if i != 0.

	label(.SCONSIDKITER8)

	mov(var(k_iter8), rsi) // i = k_iter8;
	test(rsi, rsi) // check i via logical AND.
	je(.SCONSIDKLEFT1) // if i == 0, jump to code that
	                   // considers k_left1 loop.
	                   // else, we prepare to enter k_iter8 loop.

	label(.SLOOPKITER8) // EDGE LOOP (ymm)

#if 0
	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
	prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a
	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a
#endif

	vmovups(mem(rbx ), ymm0)
	add(imm(8*4), rbx) // b += 8*rs_b = 8*4;

	vmovups(mem(rax ), ymm3)
	add(imm(8*4), rax) // a += 8*cs_a = 8*4;
	vfmadd231ps(ymm0, ymm3, ymm4)

	dec(rsi) // i -= 1;
	jne(.SLOOPKITER8) // iterate again if i != 0.

	label(.SCONSIDKLEFT1)

	mov(var(k_left1), rsi) // i = k_left1;
	test(rsi, rsi) // check i via logical AND.
	je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
	                // else, we prepare to enter k_left1 loop.

	label(.SLOOPKLEFT1) // EDGE LOOP (scalar)
	// NOTE: The FMA below must use the ymm form: an xmm-form FMA
	// would zero the high bits of the destination register and
	// destroy the partial sums already held in the upper lanes of
	// ymm4. The operands are loaded with vmovss, so only element 0
	// carries a (possibly) nonzero product.

	vmovss(mem(rbx ), xmm0)
	add(imm(1*4), rbx) // b += 1*rs_b = 1*4;

	vmovss(mem(rax ), xmm3)
	add(imm(1*4), rax) // a += 1*cs_a = 1*4;
	vfmadd231ps(ymm0, ymm3, ymm4)

	dec(rsi) // i -= 1;
	jne(.SLOOPKLEFT1) // iterate again if i != 0.

	label(.SPOSTACCUM)

	// Horizontal reduction of the single accumulator ymm4.
	// NOTE(review): ymm5 is never written in this asm block before it
	// is read by the first vhaddps. Its contents only end up in
	// elements 1 and 3 of xmm4, and only element 0 is stored below, so
	// the value written to C does not depend on it.

	vhaddps( ymm5, ymm4, ymm0 )
	vextractf128(imm(1), ymm0, xmm1 )
	vhaddps( xmm1, xmm0, xmm0 )
	vpermilps(imm(0xd8), xmm0, xmm0)
	vhaddps( xmm0, xmm0, xmm4 )

	// xmm4[0] = sum(ymm4)

	//mov(var(rs_c), rdi) // load rs_c
	//lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)

	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastss(mem(rax), xmm0) // load alpha and duplicate
	vbroadcastss(mem(rbx), xmm3) // load beta and duplicate

	vmulps(xmm0, xmm4, xmm4) // scale by alpha

	//mov(var(cs_c), rsi) // load cs_c
	//lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float)

	// now avoid loading C if beta == 0

	vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomiss(xmm0, xmm3) // set ZF if beta == 0.
	je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case

	label(.SROWSTORED)

	vmovss(mem(rcx), xmm0) // xmm0[0] = c[0]
	vfmadd231ps(xmm0, xmm3, xmm4) // xmm4 += beta * c
	vmovss(xmm4, mem(rcx)) // store element 0 only
	add(rdi, rcx) // rcx is not used again after this point
	//add(rdi, rcx)

	jmp(.SDONE) // jump to end.

	label(.SBETAZERO)

	label(.SROWSTORBZ)

	vmovss(xmm4, mem(rcx)) // c[0] = alpha * dot; C is never read
	//add(rdi, rcx)

	label(.SDONE)

	label(.SRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter32] "m" (k_iter32),
	  [k_iter8] "m" (k_iter8),
	  [k_left1] "m" (k_left1),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}
cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c000066400000000000000000001221721427272030600313120ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) void bli_sgemmsup_rd_haswell_asm_6x12 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t m_iter = m0 / 3; //uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] mov(var(a), r14) // load address of a mov(var(b), rdx) // load address of b mov(var(c), r12) // load address of c lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c*sizeof(float) = 1*4 lea(mem(r12, rsi, 1), r12) // r12 = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rdx) // rbx = b + 4*jj*cs_b; mov(var(m_iter), r9) // ii = m_iter; label(.SLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 
8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, 
r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, 
xmm0, xmm6) // xmm6[0] = sum(ymm6); xmm6[1] = sum(ymm9) // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) label(.SDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.SLOOP3X4I) // iterate again if ii != 0. add(imm(4), r15) // jj += 4; cmp(imm(12), r15) // compare jj to 12 jl(.SLOOP3X4J) // if jj < 12, jump to beginning // of jj loop; otherwise, loop ends. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rd_haswell_asm_2x12 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. 
//mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) #endif lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c*sizeof(float) = 1*4 lea(mem(r12, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem(r14), rax) // rax = a; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- 
iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, 
ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) label(.SDONE) add(imm(4), r15) // jj += 4; cmp(imm(12), r15) // compare jj to 12 jl(.SLOOP3X4J) // if jj < 12, jump to beginning // of jj loop; otherwise, loop ends. label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_sgemmsup_rd_haswell_asm_1x12: single-precision dot-product microkernel for one row of A against twelve columns of B. The inline assembly walks jj = 0, 4, 8 and, for each group of four columns, reduces over k in chunks of 32 (four unrolled steps of 8 floats), then chunks of 8, then one element at a time, and finally stores alpha*(a . b_j) + beta*c_j for the four columns at c + jj floats. A is read contiguously along k; each column of B is read contiguously along k and consecutive columns are cs_b elements apart; the four results are stored contiguously. On the beta == 0 path C is written without being loaded. The arguments conja, conjb, m0, n0, data and cntx are not referenced by this function body, and cs_a, rs_b and cs_c are only passed through as asm operands. */ void bli_sgemmsup_rd_haswell_asm_1x12 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions.
uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; /* k0 = 32*k_iter32 + 8*k_iter8 + k_left1 */ uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii (not used in this 1-row kernel) // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles.
/* clear the four per-column accumulators */ vxorps(ymm4, ymm4, ymm4) vxorps(ymm7, ymm7, ymm7) vxorps(ymm10, ymm10, ymm10) vxorps(ymm13, ymm13, ymm13) #endif lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c*sizeof(float) = 1*4 lea(mem(r12, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem(r14), rax) // rax = a; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. label(.SLOOPKITER32) // MAIN LOOP (4x unrolled, 8 floats of k per step) // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3)
vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results.
vmovss(mem(rax ), xmm0) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // (one accumulator per column; this 1-row kernel has no second accumulator row) vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) label(.SDONE) add(imm(4), r15) // jj += 4; cmp(imm(12), r15) // compare jj to 12 jl(.SLOOP3X4J) // if jj < 12, jump to beginning // of jj loop; otherwise, loop ends.
label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c000066400000000000000000001301751427272030600313200ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) /* bli_sgemmsup_rd_haswell_asm_6x16: single-precision dot-product microkernel for a block of up to 6 rows by 16 columns. If n0 % 16 is nonzero, only those leftover columns are handled: they are handed to the 6x8, 6x4, 6x2 and 6x1 kernels in turn and the function returns. Otherwise the inline assembly loops over jj = 0, 4, 8, 12 and, inside that, over m0 / 3 groups of three rows; each pass accumulates a 3x4 block of dot products over k (chunks of 32, then 8, then 1), scales it by alpha, and either adds beta times the existing C values or, on the beta == 0 path, stores without loading C. The m0 % 3 leftover rows are then handed to the 2x16 or 1x16 kernel. */ void bli_sgemmsup_rd_haswell_asm_6x16 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t n_left = n0 % 16; // First check whether this is an edge case in the n dimension. If so, // dispatch the narrower 6x8/6x4/6x2/6x1 kernels, as needed.
if ( n_left ) { float* restrict cij = c; float* restrict bj = b; float* restrict ai = a; /* peel off strips of 8, 4, 2, then 1 columns, advancing cij and bj past each strip that was handled */ if ( 8 <= n_left ) { const dim_t nr_cur = 8; bli_sgemmsup_rd_haswell_asm_6x8 //bli_sgemmsup_r_haswell_ref ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_sgemmsup_rd_haswell_asm_6x4 //bli_sgemmsup_r_haswell_ref ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rd_haswell_asm_6x2 //bli_sgemmsup_r_haswell_ref ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 1 const dim_t nr_cur = 1; bli_sgemmsup_rd_haswell_asm_6x1 //bli_sgemmsup_r_haswell_ref ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); #endif } return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions.
uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; /* k0 = 32*k_iter32 + 8*k_iter8 + k_left1 */ uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; /* rows are processed three at a time; m_left rows remain for the edge kernels */ uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] mov(var(a), r14) // load address of a mov(var(b), rdx) // load address of b mov(var(c), r12) // load address of c lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c*sizeof(float) = 1*4 lea(mem(r12, rsi, 1), r12) // r12 = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rdx) // rdx = b + 4*jj*cs_b; mov(var(m_iter), r9) // ii = m_iter; label(.SLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers.
#else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop.
label(.SLOOPKITER32) // MAIN LOOP (4x unrolled, 8 floats of k per step) // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a +=
8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop.
label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results.
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // accumulator layout (row of A down, column of B across): // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2,
xmm0, xmm6) // xmm6[0] = sum(ymm6); xmm6[1] = sum(ymm9) // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) label(.SDONE) lea(mem(r12, rdi, 2), r12) // r12 += 2*rs_c lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // r14 += 2*rs_a lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.SLOOP3X4I) // iterate again if ii != 0. add(imm(4), r15) // jj += 4; cmp(imm(16), r15) // compare jj to 16 jl(.SLOOP3X4J) // if jj < 16, jump to beginning // of jj loop; otherwise, loop ends.
label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist (m_left is m0 % 3, so at most one of the two branches below runs). if ( m_left ) { const dim_t nr_cur = 16; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; float* restrict bj = b; float* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_sgemmsup_rd_haswell_asm_2x16 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_sgemmsup_rd_haswell_asm_1x16 //bli_sgemmsup_r_haswell_ref ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } void bli_sgemmsup_rd_haswell_asm_2x16 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by
load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) #endif lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c*sizeof(float) = 1*4 lea(mem(r12, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem(r14), rax) // rax = a; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, 
ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. 
label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. 
label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) label(.SDONE) add(imm(4), r15) // jj += 4; cmp(imm(16), r15) // compare jj to 16 jl(.SLOOP3X4J) // if jj < 16, jump to beginning // of jj loop; otherwise, loop ends. label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_sgemmsup_rd_haswell_asm_1x16: single-precision dot-product ("rd") gemmsup kernel that updates one row of 16 elements of C. The asm loops over jj = 0, 4, 8, 12 (the bound 16 is a literal; m0 and n0 are not read here). For each jj, ymm4/ymm7/ymm10/ymm13 accumulate elementwise products of the row of A with four columns of B (columns cs_b apart), walking k in three stages: k0/32 iterations unrolled as 4 steps of 8 floats, then (k0%32)/8 iterations of 8 floats, then k0%8 scalar steps. The four accumulators are then summed horizontally into xmm4, scaled by alpha, combined with beta*C unless the vucomiss/je test takes the beta == 0 branch, and stored as 4 contiguous floats at c + jj. Only cs_b is loaded as a stride (the rs_a, cs_a, rs_b, rs_c and cs_c loads are commented out); A and B are advanced by fixed 4-byte steps along k and C by a fixed 4 bytes per column, so unit stride in those directions is presumably guaranteed by the caller (TODO confirm). conja, conjb, data and cntx are not referenced in the body. */ void bli_sgemmsup_rd_haswell_asm_1x16 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. 
//mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a //lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm7, ymm7, ymm7) vxorps(ymm10, ymm10, ymm10) vxorps(ymm13, ymm13, ymm13) #endif lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c*sizeof(float) = 1*4 lea(mem(r12, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem(r14), rax) // rax = a; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; 
vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovss(mem(rax ), xmm0) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. 
label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) label(.SDONE) add(imm(4), r15) // jj += 4; cmp(imm(16), r15) // compare jj to 16 jl(.SLOOP3X4J) // if jj < 16, jump to beginning // of jj loop; otherwise, loop ends. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c000066400000000000000000001415121427272030600312300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) /* bli_sgemmsup_rd_haswell_asm_6x2: single-precision dot-product ("rd") gemmsup kernel that updates a 6x2 block of C. Twelve accumulators ymm4..ymm15 hold elementwise products of six rows of A (rows rs_a apart) with two columns of B (columns cs_b apart); k is walked in three stages: k0/32 iterations unrolled as 4 steps of 8 floats, then (k0%32)/8 iterations of 8 floats, then k0%8 scalar steps. Each row's pair of accumulators is summed horizontally into the low two floats of xmm4/xmm6/xmm8/xmm10/xmm12/xmm14, scaled by alpha, combined with beta*C unless the vucomiss/je test takes the beta == 0 branch, and written as two adjacent floats per row of C (rows rs_c apart). A and B are advanced by fixed 4-byte steps along k, and the two outputs of a row are stored adjacently, so unit stride in those directions is presumably guaranteed by the caller (TODO confirm). conja, conjb, m0, n0, data and cntx are not referenced in the body. */ void bli_sgemmsup_rd_haswell_asm_6x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; //uint64_t m_iter = m0 / 3; //uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) #endif //lea(mem(r12), rcx) // rcx = c_ii; //lea(mem(r14), rax) // rax = a_ii; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) lea(mem(rcx, rdi, 2), r10) // lea(mem(r10, rdi, 1), r10) // r10 = c + 3*rs_c; prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c prefetch(0, mem(r10, 1*4)) // prefetch c + 3*rs_c prefetch(0, mem(r10, rdi, 1, 1*4)) // prefetch c + 4*rs_c prefetch(0, mem(r10, rdi, 2, 1*4)) // prefetch c + 5*rs_c #endif mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) vmovups(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rax, r8, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm12) vfmadd231ps(ymm1, ymm3, ymm13) vmovups(mem(rax, r15, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) vmovups(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rax, r8, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm12) vfmadd231ps(ymm1, ymm3, ymm13) vmovups(mem(rax, r15, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) 
add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) vmovups(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rax, r8, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm12) vfmadd231ps(ymm1, ymm3, ymm13) vmovups(mem(rax, r15, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) vmovups(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rax, r8, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm12) vfmadd231ps(ymm1, ymm3, ymm13) vmovups(mem(rax, r15, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. 
label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) vmovups(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rax, r8, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm12) vfmadd231ps(ymm1, ymm3, ymm13) vmovups(mem(rax, r15, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. 
vmovss(mem(rbx ), xmm0) vmovss(mem(rbx, r11, 1), xmm1) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vmovss(mem(rax ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovss(mem(rax, r8, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovss(mem(rax, r8, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) vmovss(mem(rax, r13, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovss(mem(rax, r8, 4), xmm3) vfmadd231ps(ymm0, ymm3, ymm12) vfmadd231ps(ymm1, ymm3, ymm13) vmovss(mem(rax, r15, 1), xmm3) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm4 ) vhaddps( ymm7, ymm6, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm6 ) vhaddps( ymm9, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm8 ) vhaddps( ymm11, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm10 ) vhaddps( ymm13, ymm12, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm12 ) vhaddps( ymm15, ymm14, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm14 ) // xmm4[0:1] = sum(ymm4) sum(ymm5) // xmm6[0:1] = sum(ymm6) sum(ymm7) // xmm8[0:1] = sum(ymm8) sum(ymm9) // xmm10[0:1] = sum(ymm10) sum(ymm11) // xmm12[0:1] = sum(ymm12) sum(ymm13) // xmm14[0:1] = sum(ymm14) sum(ymm15) //mov(var(rs_c), 
rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm10) vmovsd(xmm10, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm12) vmovsd(xmm12, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm14) vmovsd(xmm14, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SBETAZERO) label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) vmovsd(xmm10, mem(rcx)) add(rdi, rcx) vmovsd(xmm12, mem(rcx)) add(rdi, rcx) vmovsd(xmm14, mem(rcx)) //add(rdi, rcx) label(.SDONE) label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rd_haswell_asm_3x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; //uint64_t m_iter = m0 / 3; //uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) #endif //lea(mem(r12), rcx) // rcx = c_ii; //lea(mem(r14), rax) // rax = a_ii; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) //lea(mem(rcx, rdi, 2), r10) // //lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c #endif mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) // ---------------------------------- iteration 3 vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) 
vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovss(mem(rbx ), xmm0) vmovss(mem(rbx, r11, 1), xmm1) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; vmovss(mem(rax ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovss(mem(rax, r8, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovss(mem(rax, r8, 2), xmm3) add(imm(1*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm4 ) vhaddps( ymm7, ymm6, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm6 ) vhaddps( ymm9, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm8 ) // xmm4[0:1] = sum(ymm4) sum(ymm5) // xmm6[0:1] = sum(ymm6) sum(ymm7) // xmm8[0:1] = sum(ymm8) sum(ymm9) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(xmm8, mem(rcx)) //add(rdi, rcx) label(.SDONE) label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_sgemmsup_rd_haswell_asm_2x2: single-precision dot-product-style gemmsup kernel for a 2x2 block of C. Rows a and a+rs_a are multiplied against columns b and b+cs_b; both operands are read contiguously along k (the pointers advance by a fixed 8 floats or 1 float per step). k0 is consumed as k_iter32 main-loop trips of four 8-float steps, then k_iter8 single 8-float steps, then k_left1 scalar steps. The four accumulators ymm4..ymm7 are reduced horizontally, scaled by alpha, and stored as two floats to each of two rows of C spaced rs_c apart; C is loaded and scaled by beta only when beta != 0. conja, conjb, m0, n0, data and cntx are not read here, and cs_a, rs_b and cs_c are passed to the asm block as operands but never loaded by it. */ void bli_sgemmsup_rd_haswell_asm_2x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; //uint64_t m_iter = m0 / 3; //uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) #endif //lea(mem(r12), rcx) // rcx = c_ii; //lea(mem(r14), rax) // rax = a_ii; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) //lea(mem(rcx, rdi, 2), r10) // //lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. 
je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) // ---------------------------------- iteration 3 vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.SLOOPKITER32) 
// iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovss(mem(rbx ), xmm0) vmovss(mem(rbx, r11, 1), xmm1) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vmovss(mem(rax ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovss(mem(rax, r8, 1), xmm3) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. 
label(.SPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm4 ) vhaddps( ymm7, ymm6, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm6 ) // xmm4[0:1] = sum(ymm4) sum(ymm5) // xmm6[0:1] = sum(ymm6) sum(ymm7) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SBETAZERO) label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(xmm6, mem(rcx)) //add(rdi, rcx) label(.SDONE) label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_sgemmsup_rd_haswell_asm_1x2: single-precision dot-product-style gemmsup kernel for a 1x2 block of C. One row of A (at a) is multiplied against columns b and b+cs_b; both operands are read contiguously along k (the pointers advance by a fixed 8 floats or 1 float per step). k0 is consumed as k_iter32 main-loop trips of four 8-float steps, then k_iter8 single 8-float steps, then k_left1 scalar steps. The two accumulators ymm4 and ymm5 are reduced horizontally, scaled by alpha, and stored as two floats at c; C is loaded and scaled by beta only when beta != 0. rs_a and rs_c are loaded into r8 and rdi but are referenced only by disabled (#if 0 or commented-out) code. conja, conjb, m0, n0, data and cntx are not read here, and cs_a, rs_b and cs_c are passed to the asm block as operands but never loaded by it. */ void bli_sgemmsup_rd_haswell_asm_1x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; //uint64_t m_iter = m0 / 3; //uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) #endif //lea(mem(r12), rcx) // rcx = c_ii; //lea(mem(r14), rax) // rax = a_ii; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) //lea(mem(rcx, rdi, 2), r10) // //lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) // ---------------------------------- iteration 3 vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. 
label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vmovups(mem(rax ), ymm3) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. vmovss(mem(rbx ), xmm0) vmovss(mem(rbx, r11, 1), xmm1) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vmovss(mem(rax ), xmm3) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm5 vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm4 ) // xmm4[0:1] = sum(ymm4) sum(ymm5) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx)) //add(rdi, rcx) label(.SDONE) label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c000066400000000000000000001155211427272030600312330ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) /* bli_sgemmsup_rd_haswell_asm_6x4: single-precision dot-product-based gemmsup kernel that updates C three rows at a time, four columns wide. The outer .SLOOP3X4I loop runs m_iter = m0 / 3 times; it is entered without testing m_iter for zero and only decrements and tests r9 at the bottom, so m0 is expected to be at least 3 (NOTE(review): presumably always 6 for this kernel; confirm against the caller). Each trip accumulates twelve dot products in ymm4..ymm15 (rows a_ii + {0,1,2} rs_a against columns b + {0,1,2,3} cs_b, both read contiguously along k), reduces them to one xmm register per row, scales by alpha, adds beta times C when beta != 0, stores four floats to each row of C, and then advances c_ii and a_ii by three rows. Leftover rows (m0 % 3) are not handled here. conja, conjb, n0, data and cntx are not read, and cs_a, rs_b and cs_c are passed to the asm block as operands but never loaded by it. */ void bli_sgemmsup_rd_haswell_asm_6x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t m_iter = m0 / 3; //uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii mov(var(m_iter), r9) // ii = m_iter; label(.SLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_ii; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 
8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), 
ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here because // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. 
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, 
xmm0, xmm6) // xmm6[0] = sum(ymm6); xmm6[1] = sum(ymm9) // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) label(.SDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.SLOOP3X4I) // iterate again if ii != 0. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rd_haswell_asm_2x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. 
//mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) #endif //lea(mem(r12), rcx) // rcx = c; //lea(mem(r14), rax) // rax = a; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- 
iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, 
ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) label(.SDONE) label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rd_haswell_asm_1x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm7, ymm7, ymm7) vxorps(ymm10, ymm10, ymm10) vxorps(ymm13, ymm13, ymm13) #endif //lea(mem(r12), rcx) // rcx = c; //lea(mem(r14), rax) // rax = a; //lea(mem(rdx), rbx) // rbx = b; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. 
je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovss(mem(rax ), xmm0) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. 
label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) label(.SDONE) label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c000066400000000000000000001221611427272030600312350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) void bli_sgemmsup_rd_haswell_asm_6x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t m_iter = m0 / 3; //uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a //mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] mov(var(a), r14) // load address of a mov(var(b), rdx) // load address of b mov(var(c), r12) // load address of c lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c*sizeof(float) = 1*4 lea(mem(r12, rsi, 1), r12) // r12 = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rdx) // rbx = b + 4*jj*cs_b; mov(var(m_iter), r9) // ii = m_iter; label(.SLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) #endif lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 
8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, 
r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, 
xmm0, xmm6) // xmm6[0] = sum(ymm6); xmm6[1] = sum(ymm9) // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) label(.SDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.SLOOP3X4I) // iterate again if ii != 0. add(imm(4), r15) // jj += 4; cmp(imm(8), r15) // compare jj to 8 jl(.SLOOP3X4J) // if jj < 8, jump to beginning // of jj loop; otherwise, loop ends. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rd_haswell_asm_2x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. 
//mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) #endif lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c*sizeof(float) = 1*4 lea(mem(r12, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem(r14), rax) // rax = a; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- 
iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a #endif vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, 
ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) label(.SDONE) add(imm(4), r15) // jj += 4; cmp(imm(8), r15) // compare jj to 8 jl(.SLOOP3X4J) // if jj < 8, jump to beginning // of jj loop; otherwise, loop ends. label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rd_haswell_asm_1x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) //lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rdx) // load address of b. //mov(var(rs_b), r10) // load rs_b mov(var(cs_b), r11) // load cs_b //lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] #if 0 vzeroall() // zero all xmm/ymm registers. #else // skylake can execute 3 vxorps ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. 
vxorps(ymm4, ymm4, ymm4) vxorps(ymm7, ymm7, ymm7) vxorps(ymm10, ymm10, ymm10) vxorps(ymm13, ymm13, ymm13) #endif lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c*sizeof(float) = 1*4 lea(mem(r12, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem(r14), rax) // rax = a; #if 1 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 2 #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) 
vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovss(mem(rax ), xmm0) add(imm(1*4), rax) // a += 1*cs_a = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) vpermilps(imm(0xd8), xmm0, xmm0) vhaddps( xmm0, xmm0, xmm0 ) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm3 ) vhaddps( xmm3, xmm2, xmm2 ) vpermilps(imm(0xd8), xmm2, xmm2) vhaddps( xmm2, xmm2, xmm2 ) vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) label(.SDONE) add(imm(4), r15) // jj += 4; cmp(imm(8), r15) // compare jj to 8 jl(.SLOOP3X4J) // if jj < 8, jump to beginning // of jj loop; otherwise, loop ends. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c000066400000000000000000003150531427272030600313360ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... -------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */ // Prototype reference microkernels. 
GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) void bli_sgemmsup_rv_haswell_asm_6x12 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 5*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 5*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 5*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 5*8)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 5*8)) // prefetch c + 8*cs_c lea(mem(rcx, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rbp, 1, 5*8)) // prefetch c + 11*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, 
r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) vmulps(xmm0, xmm7, xmm7) vmulps(ymm0, ymm8, ymm8) vmulps(xmm0, xmm9, xmm9) vmulps(ymm0, ymm10, ymm10) vmulps(xmm0, xmm11, xmm11) vmulps(ymm0, ymm12, ymm12) vmulps(xmm0, xmm13, xmm13) vmulps(ymm0, ymm14, ymm14) vmulps(xmm0, xmm15, xmm15) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm13) vmovups(xmm13, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm15) vmovups(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx ), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(mem(rdx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rdx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx, rsi, 2), xmm1, 
xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(mem(rdx, rax, 2), xmm1, xmm1) vmovhpd(mem(rdx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-11 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vfmadd231ps(mem(rcx ), xmm3, xmm0) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm15, ymm13, ymm0) vmovlpd(mem(rdx ), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vunpckhps(ymm15, ymm13, ymm0) vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm8, mem(rcx, 0*32)) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm10, mem(rcx, 0*32)) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm12, mem(rcx, 0*32)) vmovups(xmm13, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm14, mem(rcx, 0*32)) vmovups(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx, 
rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-11 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm15, ymm13, ymm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vunpckhps(ymm15, ymm13, ymm0) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_5x12 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, 
dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 4*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 4*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 4*8)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 4*8)) // prefetch c + 8*cs_c lea(mem(rcx, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rbp, 1, 4*8)) // prefetch c + 11*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 
4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) vmulps(xmm0, xmm7, xmm7) vmulps(ymm0, ymm8, ymm8) vmulps(xmm0, xmm9, xmm9) vmulps(ymm0, ymm10, ymm10) vmulps(xmm0, xmm11, xmm11) vmulps(ymm0, ymm12, ymm12) vmulps(xmm0, xmm13, xmm13) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm13) vmovups(xmm13, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rdx, rsi, 2), xmm1) vmovss(mem(rdx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // 
store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rdx, rsi, 4), xmm1) vmovss(mem(rdx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rdx, rax, 2), xmm1) vmovss(mem(rdx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-11 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vfmadd231ps(mem(rcx ), xmm3, xmm0) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm13, ymm0) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rdx, rsi, 2), xmm1) vmovss(mem(rdx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 
) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm8, mem(rcx, 0*32)) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm10, mem(rcx, 0*32)) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm12, mem(rcx, 0*32)) vmovups(xmm13, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, 
xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-11 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm13, ymm0) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", 
"r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_4x12 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 3*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 3*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 3*8)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 3*8)) // prefetch c + 8*cs_c lea(mem(rcx, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rbp, 1, 3*8)) // prefetch c + 11*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += 
rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) vmulps(xmm0, xmm7, xmm7) vmulps(ymm0, ymm8, ymm8) vmulps(xmm0, xmm9, xmm9) vmulps(ymm0, ymm10, ymm10) vmulps(xmm0, xmm11, xmm11) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11) vmovups(xmm11, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // begin I/O on columns 8-11 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vfmadd231ps(mem(rcx ), xmm3, xmm0) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) 
vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm8, mem(rcx, 0*32)) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm10, mem(rcx, 0*32)) vmovups(xmm11, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // begin I/O on columns 8-11 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vmovups(xmm0, mem(rcx )) // store ( 
gamma00..gamma30 ) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) //lea(mem(rcx, rsi, 4), rcx) label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_3x12 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 2*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 2*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 2*8)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 2*8)) // prefetch c + 8*cs_c 
lea(mem(rcx, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rbp, 1, 2*8)) // prefetch c + 11*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) 
vfmadd231ps(ymm1, ymm2, ymm9) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) vmulps(xmm0, xmm7, xmm7) vmulps(ymm0, ymm8, ymm8) vmulps(xmm0, xmm9, xmm9) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9) vmovups(xmm9, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(mem(rcx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rcx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(mem(rcx, rax, 2), xmm1, xmm1) vmovhpd(mem(rcx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm8, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rdx, rsi, 2), xmm1) vmovss(mem(rdx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rdx, rsi, 4), xmm1) vmovss(mem(rdx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) 
vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rdx, rax, 2), xmm1) vmovss(mem(rdx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-11 vunpcklps(ymm7, ymm5, ymm0) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vunpckhps(ymm7, ymm5, ymm0) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm9, ymm0) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rdx, rsi, 2), xmm1) vmovss(mem(rdx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm8, mem(rcx, 0*32)) vmovups(xmm9, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm8, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-11 vunpcklps(ymm7, ymm5, ymm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vunpckhps(ymm7, ymm5, ymm0) 
vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm9, ymm0) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) //lea(mem(rcx, rsi, 4), rcx) label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_2x12 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 1*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 1*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 1*8)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 1*8)) // prefetch c + 8*cs_c lea(mem(rcx, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rbp, 1, 1*8)) // prefetch c + 11*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) vmulps(xmm0, xmm7, xmm7) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7) vmovups(xmm7, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(mem(rcx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rcx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(mem(rcx, rax, 2), xmm1, xmm1) vmovhpd(mem(rcx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // begin I/O on columns 8-11 vunpcklps(ymm7, ymm5, ymm0) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vunpckhps(ymm7, ymm5, ymm0) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) vmovups(xmm7, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // begin I/O on columns 8-11 vunpcklps(ymm7, ymm5, ymm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vunpckhps(ymm7, ymm5, ymm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_1x12 ( 
conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 0*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 0*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 0*8)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 0*8)) // prefetch c + 8*cs_c lea(mem(rcx, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rbp, 1, 0*8)) // prefetch c + 11*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5) vmovups(xmm5, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vmovups(ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rcx ), xmm1) vmovss(mem(rcx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rcx, rsi, 2), xmm1) vmovss(mem(rcx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rcx, rsi, 4), xmm1) vmovss(mem(rcx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rcx, rax, 2), xmm1) vmovss(mem(rcx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rcx, rbp, 1)) // store ( gamma47 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // begin I/O on columns 8-11 vmovups(ymm5, ymm0) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rcx ), xmm1) vmovss(mem(rcx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rcx, rsi, 2), xmm1) vmovss(mem(rcx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c jmp(.SDONE) // jump to end. 
label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) // begin I/O on columns 0-7 vmovups(ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rcx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rcx, rbp, 1)) // store ( gamma47 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // begin I/O on columns 8-11 vmovups(ymm5, ymm0) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", 
"xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c000066400000000000000000003501551427272030600313440ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... -------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) // Define parameters and variables for edge case kernel map. 
#define NUM_MR 4 #define NUM_NR 6 #define FUNCPTR_T sgemmsup_ker_ft static dim_t mrs[NUM_MR] = { 6, 4, 2, 1 }; static dim_t nrs[NUM_NR] = { 16, 12, 8, 4, 2, 1 }; static FUNCPTR_T kmap[NUM_MR][NUM_NR] = { /* 16 12 8 4 2 1 */ /* 6 */ { bli_sgemmsup_rv_haswell_asm_6x16, bli_sgemmsup_rv_haswell_asm_6x12, bli_sgemmsup_rv_haswell_asm_6x8, bli_sgemmsup_rv_haswell_asm_6x4, bli_sgemmsup_rv_haswell_asm_6x2, bli_sgemmsup_r_haswell_ref_6x1 }, /* 4 */ { bli_sgemmsup_rv_haswell_asm_4x16, bli_sgemmsup_rv_haswell_asm_4x12, bli_sgemmsup_rv_haswell_asm_4x8, bli_sgemmsup_rv_haswell_asm_4x4, bli_sgemmsup_rv_haswell_asm_4x2, bli_sgemmsup_r_haswell_ref_4x1 }, /* 2 */ { bli_sgemmsup_rv_haswell_asm_2x16, bli_sgemmsup_rv_haswell_asm_2x12, bli_sgemmsup_rv_haswell_asm_2x8, bli_sgemmsup_rv_haswell_asm_2x4, bli_sgemmsup_rv_haswell_asm_2x2, bli_sgemmsup_r_haswell_ref_2x1 }, /* 1 */ { bli_sgemmsup_rv_haswell_asm_1x16, bli_sgemmsup_rv_haswell_asm_1x12, bli_sgemmsup_rv_haswell_asm_1x8, bli_sgemmsup_rv_haswell_asm_1x4, bli_sgemmsup_rv_haswell_asm_1x2, bli_sgemmsup_r_haswell_ref_1x1 } }; void bli_sgemmsup_rv_haswell_asm_6x16 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 0 bli_sgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif // Use a reference kernel if this is an edge case in the m or n // dimensions. if ( m0 < 6 || n0 < 16 ) { #if 0 bli_sgemmsup_r_haswell_ref ( conja, conjb, m0, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; #endif dim_t n_left = n0; float* restrict cj = c; float* restrict bj = b; // Iterate across columns (corresponding to elements of nrs) until // n_left is zero. 
for ( dim_t j = 0; n_left != 0; ++j ) { const dim_t nr_cur = nrs[ j ]; // Once we find the value of nrs that is less than (or equal to) // n_left, we use the kernels in that column. if ( nr_cur <= n_left ) { dim_t m_left = m0; float* restrict cij = cj; float* restrict ai = a; // Iterate down the current column (corresponding to elements // of mrs) until m_left is zero. for ( dim_t i = 0; m_left != 0; ++i ) { const dim_t mr_cur = mrs[ i ]; // Once we find the value of mrs that is less than (or equal // to) m_left, we select that kernel. if ( mr_cur <= m_left ) { FUNCPTR_T ker_fp = kmap[i][j]; //printf( "executing %d x %d sup kernel.\n", (int)mr_cur, (int)nr_cur ); // Call the kernel using current mrs and nrs values. ker_fp ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); // Advance C and A pointers by the mrs and nrs we just // used, and decrement m_left. cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } } // Advance C and B pointers by the mrs and nrs we just used, and // decrement n_left. cj += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } } return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 5*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 5*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 5*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 5*cs_c prefetch(0, 
mem(rdx, rsi, 2, 5*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 5*8)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 5*8)) // prefetch c + 8*cs_c lea(mem(rcx, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rbp, 1, 5*8)) // prefetch c + 11*cs_c prefetch(0, mem(rdx, rsi, 4, 5*8)) // prefetch c + 12*cs_c lea(mem(rcx, rcx, 4), rdx) // rdx = c + 12*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 13*cs_c prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 14*cs_c prefetch(0, mem(rdx, rbp, 1, 5*8)) // prefetch c + 15*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) 
vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) 
vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm11, ymm11) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm13, ymm13) vmulps(ymm0, ymm14, ymm14) vmulps(ymm0, ymm15, ymm15) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm15) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx ), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(mem(rdx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rdx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx, rsi, 2), xmm1, 
xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(mem(rdx, rax, 2), xmm1, xmm1) vmovhpd(mem(rdx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx ), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, 
xmm3, xmm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(mem(rdx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rdx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(mem(rdx, rax, 2), xmm1, xmm1) vmovhpd(mem(rdx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm12, mem(rcx, 0*32)) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm14, mem(rcx, 0*32)) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( 
gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) vmovlpd(xmm2, mem(rdx, rax, 2)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rbp, 1)) // store ( gamma47..gamma57 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", 
"xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_5x16 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 4*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 4*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 4*8)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 4*8)) // prefetch c + 8*cs_c lea(mem(rcx, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rbp, 1, 4*8)) // prefetch c + 11*cs_c prefetch(0, mem(rdx, rsi, 4, 4*8)) // prefetch c + 12*cs_c lea(mem(rcx, rcx, 4), rdx) // rdx = c + 12*cs_c; prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 13*cs_c prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 14*cs_c prefetch(0, mem(rdx, rbp, 1, 4*8)) // prefetch c + 15*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) 
vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm11, ymm11) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm13, ymm13) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13) vmovups(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rdx, rsi, 2), xmm1) vmovss(mem(rdx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // 
store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rdx, rsi, 4), xmm1) vmovss(mem(rdx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rdx, rax, 2), xmm1) vmovss(mem(rdx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), 
rcx) // rcx += 8*cs_c vmovups(ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rdx, rsi, 2), xmm1) vmovss(mem(rdx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rdx, rsi, 4), xmm1) vmovss(mem(rdx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rdx, rax, 2), xmm1) vmovss(mem(rdx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm12, mem(rcx, 0*32)) vmovups(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) 
vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), 
[b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_4x16 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). 
mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 3*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 3*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 3*8)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 3*8)) // prefetch c + 8*cs_c lea(mem(rcx, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rbp, 1, 3*8)) // prefetch c + 11*cs_c prefetch(0, mem(rdx, rsi, 4, 3*8)) // prefetch c + 12*cs_c lea(mem(rcx, rcx, 4), rdx) // rdx = c + 12*cs_c; prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 13*cs_c prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 14*cs_c prefetch(0, mem(rdx, rbp, 1, 3*8)) // prefetch c + 15*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; 
#endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) // 
---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm11, ymm11) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) 
vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) 
vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 4), rcx) label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_3x16 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. 
mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 2*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 2*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 2*8)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 2*8)) // prefetch c + 8*cs_c lea(mem(rcx, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rbp, 1, 2*8)) // prefetch c + 11*cs_c prefetch(0, mem(rdx, rsi, 4, 2*8)) // prefetch c + 12*cs_c 
lea(mem(rcx, rcx, 4), rdx) // rdx = c + 12*cs_c; prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 13*cs_c prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 14*cs_c prefetch(0, mem(rdx, rbp, 1, 2*8)) // prefetch c + 15*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) 
vfmadd231ps(ymm1, ymm2, ymm9) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(mem(rcx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rcx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(mem(rcx, rax, 2), xmm1, xmm1) vmovhpd(mem(rcx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm8, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rdx, rsi, 2), xmm1) vmovss(mem(rdx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rdx, rsi, 4), xmm1) vmovss(mem(rdx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) 
vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rdx, rax, 2), xmm1) vmovss(mem(rdx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(mem(rcx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rcx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm7, ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(mem(rcx, rax, 2), xmm1, xmm1) vmovhpd(mem(rcx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm9, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rdx, rsi, 2), xmm1) vmovss(mem(rdx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) 
vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rdx, rsi, 4), xmm1) vmovss(mem(rdx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rdx, rax, 2), xmm1) vmovss(mem(rdx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm8, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm7, ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) 
vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm9, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) //lea(mem(rcx, rsi, 4), rcx) label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_2x16 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast 
local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 1*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 1*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 1*8)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 1*8)) // prefetch c + 8*cs_c lea(mem(rcx, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rbp, 1, 1*8)) // prefetch c + 11*cs_c prefetch(0, mem(rdx, rsi, 4, 1*8)) // prefetch c + 12*cs_c lea(mem(rcx, rcx, 4), rdx) // rdx = c + 12*cs_c; prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 13*cs_c prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 14*cs_c prefetch(0, mem(rdx, rbp, 1, 1*8)) // prefetch c + 15*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(mem(rcx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rcx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(mem(rcx, rax, 2), xmm1, xmm1) vmovhpd(mem(rcx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(mem(rcx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rcx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm7, ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(mem(rcx, rax, 2), xmm1, xmm1) 
vmovhpd(mem(rcx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // begin I/O on columns 8-15 vunpcklps(ymm7, ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm7, ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c label(.SDONE) end_asm( : // 
output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_1x16 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. 
mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 0*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 0*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 0*8)) // prefetch c + 7*cs_c prefetch(0, mem(rdx, rsi, 4, 0*8)) // prefetch c + 8*cs_c lea(mem(rcx, rsi, 8), rdx) // rdx = c + 8*cs_c; prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 9*cs_c prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 10*cs_c prefetch(0, mem(rdx, rbp, 1, 0*8)) // prefetch c + 11*cs_c prefetch(0, mem(rdx, rsi, 4, 0*8)) // prefetch c + 12*cs_c lea(mem(rcx, rcx, 4), rdx) // rdx = c + 12*cs_c; prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 13*cs_c 
prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 14*cs_c prefetch(0, mem(rdx, rbp, 1, 0*8)) // prefetch c + 15*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vmovups(ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rcx ), xmm1) vmovss(mem(rcx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rcx, rsi, 2), xmm1) vmovss(mem(rcx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rcx, rsi, 4), xmm1) vmovss(mem(rcx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rcx, rax, 2), xmm1) vmovss(mem(rcx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rcx, rbp, 1)) // store ( gamma47 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // begin I/O on columns 8-15 vmovups(ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rcx ), xmm1) vmovss(mem(rcx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rcx, rsi, 2), xmm1) vmovss(mem(rcx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) 
vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rcx, rsi, 4), xmm1) vmovss(mem(rcx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rcx, rax, 2), xmm1) vmovss(mem(rcx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rcx, rbp, 1)) // store ( gamma47 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) // begin I/O on columns 0-7 vmovups(ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rcx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rcx, rbp, 1)) // store ( gamma47 ) lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // begin I/O on columns 8-15 vmovups(ymm5, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) 
vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rcx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rcx, rbp, 1)) // store ( gamma47 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c000066400000000000000000002055351427272030600312600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... 
-------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) void bli_sgemmsup_rv_haswell_asm_6x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 1*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 1*8)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) //lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) 
vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM)

// Post-accumulation stage of the 6x2 kernel.
//
// On entry, xmm4/6/8/10/12/14 each hold one row of the A*B product in
// their two low float lanes, rcx holds the address of c, and rdi holds
// rs_c * sizeof(float) (both set up before the k loops).
// This section scales the product by alpha, loads beta, and then
// dispatches to one of four write-back paths depending on whether
// beta == 0 and whether C is row- or column-stored.

mov(var(alpha), rax)               // load address of alpha
mov(var(beta), rbx)                // load address of beta
vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate

vmulps(xmm0, xmm4, xmm4)           // scale by alpha
vmulps(xmm0, xmm6, xmm6)
vmulps(xmm0, xmm8, xmm8)
vmulps(xmm0, xmm10, xmm10)
vmulps(xmm0, xmm12, xmm12)
vmulps(xmm0, xmm14, xmm14)

mov(var(cs_c), rsi)                // load cs_c
lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)

//lea(mem(rcx, rsi, 4), rdx)       // load address of c + 4*cs_c;
lea(mem(rcx, rdi, 4), rdx)         // load address of c + 4*rs_c;

//lea(mem(rsi, rsi, 2), rax)       // rax = 3*cs_c;
//lea(mem(rsi, rsi, 4), rbx)       // rbx = 5*cs_c;
//lea(mem(rax, rsi, 4), rbp)       // rbp = 7*cs_c;

// now avoid loading C if beta == 0

vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
jz(.SCOLSTORED)                    // jump to column storage case

label(.SROWSTORED)

// Each row of C is only two floats (8 bytes) wide in this kernel.
// Using C directly as the memory operand of vfmadd231ps would read a
// full 16 bytes, i.e. 8 bytes past the end of every row and, for the
// last row, potentially past the end of the buffer holding C. Load
// exactly 8 bytes with vmovsd (upper lanes are zeroed) and do the
// FMA register-to-register instead. xmm0 is free as scratch here: it
// was only needed for the beta == 0 comparison above.

vmovsd(mem(rcx, 0*32), xmm0)       // load 2 floats of row 0 of c
vfmadd231ps(xmm0, xmm3, xmm4)      // xmm4 += beta * c(0,0:1)
vmovsd(xmm4, mem(rcx, 0*32))
add(rdi, rcx)                      // c += rs_c;

vmovsd(mem(rcx, 0*32), xmm0)       // row 1
vfmadd231ps(xmm0, xmm3, xmm6)
vmovsd(xmm6, mem(rcx, 0*32))
add(rdi, rcx)

vmovsd(mem(rcx, 0*32), xmm0)       // row 2
vfmadd231ps(xmm0, xmm3, xmm8)
vmovsd(xmm8, mem(rcx, 0*32))
add(rdi, rcx)

vmovsd(mem(rcx, 0*32), xmm0)       // row 3
vfmadd231ps(xmm0, xmm3, xmm10)
vmovsd(xmm10, mem(rcx, 0*32))
add(rdi, rcx)

vmovsd(mem(rcx, 0*32), xmm0)       // row 4
vfmadd231ps(xmm0, xmm3, xmm12)
vmovsd(xmm12, mem(rcx, 0*32))
add(rdi, rcx)

vmovsd(mem(rcx, 0*32), xmm0)       // row 5
vfmadd231ps(xmm0, xmm3, xmm14)
vmovsd(xmm14, mem(rcx, 0*32))
//add(rdi, rcx)

jmp(.SDONE)                        // jump to end.
label(.SCOLSTORED) // begin I/O on columns 0-1 vunpcklps(xmm6, xmm4, xmm0) vunpcklps(xmm10, xmm8, xmm1) vshufps(imm(0x4e), xmm1, xmm0, xmm2) vblendps(imm(0xcc), xmm2, xmm0, xmm0) vblendps(imm(0x33), xmm2, xmm1, xmm1) vfmadd231ps(mem(rcx ), xmm3, xmm0) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(xmm14, xmm12, xmm0) vmovlpd(mem(rdx ), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-1 vunpcklps(xmm6, xmm4, xmm0) vunpcklps(xmm10, xmm8, xmm1) vshufps(imm(0x4e), xmm1, xmm0, xmm2) vblendps(imm(0xcc), xmm2, xmm0, xmm0) vblendps(imm(0x33), xmm2, xmm1, xmm1) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(xmm14, xmm12, xmm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_5x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 1*8)) // prefetch c + 4*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) //lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) 
vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED)                    // jump to column storage case

// Write-back paths of the 5x2 kernel.
//
// xmm4/6/8/10/12 each hold one alpha-scaled row of the A*B product in
// their two low float lanes; xmm3 holds beta broadcast; rcx = c,
// rdx = c + 4*rs_c, rdi = rs_c*sizeof(float), rsi = cs_c*sizeof(float).

label(.SROWSTORED)

// Each row of C is only two floats (8 bytes) wide in this kernel.
// Using C directly as the memory operand of vfmadd231ps would read a
// full 16 bytes, i.e. 8 bytes past the end of every row and, for the
// last row, potentially past the end of the buffer holding C. Load
// exactly 8 bytes with vmovsd (upper lanes are zeroed) and do the
// FMA register-to-register instead. xmm0 is free as scratch on this
// path: it is not read again before .SDONE.

vmovsd(mem(rcx, 0*32), xmm0)       // load 2 floats of row 0 of c
vfmadd231ps(xmm0, xmm3, xmm4)      // xmm4 += beta * c(0,0:1)
vmovsd(xmm4, mem(rcx, 0*32))
add(rdi, rcx)                      // c += rs_c;

vmovsd(mem(rcx, 0*32), xmm0)       // row 1
vfmadd231ps(xmm0, xmm3, xmm6)
vmovsd(xmm6, mem(rcx, 0*32))
add(rdi, rcx)

vmovsd(mem(rcx, 0*32), xmm0)       // row 2
vfmadd231ps(xmm0, xmm3, xmm8)
vmovsd(xmm8, mem(rcx, 0*32))
add(rdi, rcx)

vmovsd(mem(rcx, 0*32), xmm0)       // row 3
vfmadd231ps(xmm0, xmm3, xmm10)
vmovsd(xmm10, mem(rcx, 0*32))
add(rdi, rcx)

vmovsd(mem(rcx, 0*32), xmm0)       // row 4
vfmadd231ps(xmm0, xmm3, xmm12)
vmovsd(xmm12, mem(rcx, 0*32))
//add(rdi, rcx)

jmp(.SDONE)                        // jump to end.

label(.SCOLSTORED)

// begin I/O on columns 0-1

// In-register transpose of rows 0-3 into two 4-element columns.
vunpcklps(xmm6, xmm4, xmm0)
vunpcklps(xmm10, xmm8, xmm1)
vshufps(imm(0x4e), xmm1, xmm0, xmm2)
vblendps(imm(0xcc), xmm2, xmm0, xmm0)
vblendps(imm(0x33), xmm2, xmm1, xmm1)

vfmadd231ps(mem(rcx), xmm3, xmm0)
vmovups(xmm0, mem(rcx))            // store ( gamma00..gamma30 )
vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
vmovups(xmm1, mem(rcx, rsi, 1))    // store ( gamma01..gamma31 )

//lea(mem(rcx, rsi, 8), rcx)       // rcx += 8*cs_c

// Row 4 is handled one scalar at a time.
vmovups(xmm12, xmm0)

vpermilps(imm(0xe4), xmm0, xmm2)
vpermilps(imm(0x39), xmm0, xmm4)
vmovss(mem(rdx), xmm1)
vmovss(mem(rdx, rsi, 1), xmm6)
vfmadd231ps(xmm1, xmm3, xmm2)
vfmadd231ps(xmm6, xmm3, xmm4)
vmovss(xmm2, mem(rdx))             // store ( gamma40 )
vmovss(xmm4, mem(rdx, rsi, 1))     // store ( gamma41 )

//lea(mem(rdx, rsi, 8), rdx)       // rdx += 8*cs_c

jmp(.SDONE)                        // jump to end.

label(.SBETAZERO)

cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
jz(.SCOLSTORBZ)                    // jump to column storage case

label(.SROWSTORBZ)

// beta == 0: C is never read, only the two low lanes are stored.
vmovsd(xmm4, mem(rcx, 0*32))
add(rdi, rcx)

vmovsd(xmm6, mem(rcx, 0*32))
add(rdi, rcx)

vmovsd(xmm8, mem(rcx, 0*32))
add(rdi, rcx)

vmovsd(xmm10, mem(rcx, 0*32))
add(rdi, rcx)

vmovsd(xmm12, mem(rcx, 0*32))
//add(rdi, rcx)

jmp(.SDONE)                        // jump to end.
label(.SCOLSTORBZ) // begin I/O on columns 0-1 vunpcklps(xmm6, xmm4, xmm0) vunpcklps(xmm10, xmm8, xmm1) vshufps(imm(0x4e), xmm1, xmm0, xmm2) vblendps(imm(0xcc), xmm2, xmm0, xmm0) vblendps(imm(0x33), xmm2, xmm1, xmm1) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(xmm12, xmm0) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_4x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) //lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. 
je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovsd(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-1 vunpcklps(xmm6, xmm4, xmm0) vunpcklps(xmm10, xmm8, xmm1) vshufps(imm(0x4e), xmm1, xmm0, xmm2) vblendps(imm(0xcc), xmm2, xmm0, xmm0) vblendps(imm(0x33), xmm2, xmm1, xmm1) vfmadd231ps(mem(rcx ), xmm3, xmm0) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) // begin I/O on columns 0-1 vunpcklps(xmm6, xmm4, xmm0) vunpcklps(xmm10, xmm8, xmm1) vshufps(imm(0x4e), xmm1, xmm0, xmm2) vblendps(imm(0xcc), xmm2, xmm0, xmm0) vblendps(imm(0x33), xmm2, xmm1, xmm1) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_3x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, 
float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) //lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) 
vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. 
vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORED) // begin I/O on columns 0-1 vunpcklps(xmm6, xmm4, xmm0) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(xmm8, xmm0) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-1 vunpcklps(xmm6, xmm4, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(xmm8, xmm0) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_2x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. 
mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) //lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORED) // begin I/O on columns 0-1 vunpcklps(xmm6, xmm4, xmm0) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) vmovsd(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-1 vunpcklps(xmm6, xmm4, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_1x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) //lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORED) // begin I/O on columns 0-1 vmovups(xmm4, xmm0) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rcx ), xmm1) vmovss(mem(rcx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma00 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma01 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-1 vmovups(xmm4, xmm0) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma00 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma01 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c000066400000000000000000002205211427272030600312520ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... -------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */ // Prototype reference microkernels. 
GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref )

/*
   bli_sgemmsup_rv_haswell_asm_6x4()

   Computes, for one 6x4 tile of single-precision C,

       C := beta * C + alpha * A * B

   where A is 6 x k0 and B is k0 x 4.

   How the code below works:
   - xmm4, xmm6, xmm8, xmm10, xmm12 and xmm14 accumulate rows 0-5 of A*B
     (vzeroall clears them first).
   - Each k step loads four contiguous floats of B (one 16-byte vmovups),
     broadcasts one element of A per row, and issues one FMA per row. B is
     advanced by rs_b and A by cs_a. cs_b is never loaded, so the four
     elements of a row of B are taken to be contiguous.
   - The k loop is unrolled four times (k0 / 4 trips); a second loop
     handles the k0 % 4 leftover steps.
   - Afterwards the accumulators are scaled by alpha. If beta compares
     equal to zero, C is overwritten without being read; otherwise
     beta * C is added first.
   - If rs_c == 1 the tile is written column by column using an
     in-register transpose. Otherwise each row is written as four
     contiguous floats (cs_c is not applied on that path).

   Parameters read by the code: k0, alpha, a, rs_a0, cs_a0, b, rs_b0, beta,
   c, rs_c0, cs_c0. conja, conjb, m0, n0, data and cntx are not referenced;
   cs_b0 is copied into a local and passed as an asm operand, but no
   instruction loads it. Strides are given in elements (they are scaled by
   sizeof(float) inside the asm).
*/
void bli_sgemmsup_rv_haswell_asm_6x4
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall()  // zero all xmm/ymm registers.

	mov(var(a), rax)  // load address of a.
	mov(var(rs_a), r8)  // load rs_a
	mov(var(cs_a), r9)  // load cs_a
	lea(mem(, r8, 4), r8)  // rs_a *= sizeof(float)
	lea(mem(, r9, 4), r9)  // cs_a *= sizeof(float)

	lea(mem(r8, r8, 2), r13)  // r13 = 3*rs_a
	lea(mem(r8, r8, 4), r15)  // r15 = 5*rs_a

	mov(var(b), rbx)  // load address of b.
	mov(var(rs_b), r10)  // load rs_b
	//mov(var(cs_b), r11)  // load cs_b
	lea(mem(, r10, 4), r10)  // rs_b *= sizeof(float)
	//lea(mem(, r11, 4), r11)  // cs_b *= sizeof(float)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), rcx)  // load address of c
	mov(var(rs_c), rdi)  // load rs_c
	lea(mem(, rdi, 4), rdi)  // rs_c *= sizeof(float)

	// rdi now holds rs_c in bytes, so rdi == 4 means rs_c == 1, which
	// selects the column-stored handling of c.
	cmp(imm(4), rdi)  // set ZF if (4*rs_c) == 4.
	jz(.SCOLPFETCH)  // jump to column storage case
	label(.SROWPFETCH)  // row-stored prefetching on c

	lea(mem(rcx, rdi, 2), rdx)  // rdx = c + 2*rs_c;
	lea(mem(rdx, rdi, 1), rdx)  // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 3*8))  // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8))  // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 3*8))  // prefetch c + 2*rs_c
	prefetch(0, mem(rdx, 3*8))  // prefetch c + 3*rs_c
	prefetch(0, mem(rdx, rdi, 1, 3*8))  // prefetch c + 4*rs_c
	prefetch(0, mem(rdx, rdi, 2, 3*8))  // prefetch c + 5*rs_c

	jmp(.SPOSTPFETCH)  // jump to end of prefetching c
	label(.SCOLPFETCH)  // column-stored prefetching c

	mov(var(cs_c), rsi)  // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 4), rsi)  // cs_c *= sizeof(float)
	lea(mem(rsi, rsi, 2), rbp)  // rbp = 3*cs_c;
	// NOTE(review): the 5*8 byte displacement looks sized for 8-byte
	// elements (a column of 6 floats spans 24 bytes). A prefetch is only
	// a hint, so results are unaffected -- confirm the intended offset.
	prefetch(0, mem(rcx, 5*8))  // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 5*8))  // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 5*8))  // prefetch c + 2*cs_c
	prefetch(0, mem(rcx, rbp, 1, 5*8))  // prefetch c + 3*cs_c

	label(.SPOSTPFETCH)  // done prefetching c

	// rdx is reused from here on as a prefetch pointer into a.
#if 1
	lea(mem(rax, r9, 8), rdx)  // rdx = a + 8*cs_a;
	lea(mem(rdx, r9, 8), rdx)  // rdx = a + 16*cs_a;
#endif

	mov(var(k_iter), rsi)  // i = k_iter;
	test(rsi, rsi)  // check i via logical AND.
	je(.SCONSIDKLEFT)  // if i == 0, jump to code that
	                   // contains the k_left loop.

	label(.SLOOPKITER)  // MAIN LOOP

	// Every iteration below performs one k step: load one row of b
	// (4 floats) into xmm0, broadcast a(i,k) for i = 0..5, and FMA into
	// the accumulator of row i.

	// ---------------------------------- iteration 0

#if 1
	prefetch(0, mem(rdx, 5*8))
#endif

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 4), xmm2)
	vbroadcastss(mem(rax, r15, 1), xmm3)
	add(r9, rax)  // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm12)
	vfmadd231ps(xmm0, xmm3, xmm14)

	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 4), xmm2)
	vbroadcastss(mem(rax, r15, 1), xmm3)
	add(r9, rax)  // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm12)
	vfmadd231ps(xmm0, xmm3, xmm14)

	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 4), xmm2)
	vbroadcastss(mem(rax, r15, 1), xmm3)
	add(r9, rax)  // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm12)
	vfmadd231ps(xmm0, xmm3, xmm14)

	// ---------------------------------- iteration 3

#if 1
	lea(mem(rdx, r9, 4), rdx)  // a_prefetch += 4*cs_a;
#endif

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 4), xmm2)
	vbroadcastss(mem(rax, r15, 1), xmm3)
	add(r9, rax)  // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm12)
	vfmadd231ps(xmm0, xmm3, xmm14)

	dec(rsi)  // i -= 1;
	jne(.SLOOPKITER)  // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi)  // i = k_left;
	test(rsi, rsi)  // check i via logical AND.
	je(.SPOSTACCUM)  // if i == 0, we're done; jump to end.
	                 // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT)  // EDGE LOOP

#if 0
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)
#endif

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 4), xmm2)
	vbroadcastss(mem(rax, r15, 1), xmm3)
	add(r9, rax)  // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm12)
	vfmadd231ps(xmm0, xmm3, xmm14)

	dec(rsi)  // i -= 1;
	jne(.SLOOPKLEFT)  // iterate again if i != 0.

	label(.SPOSTACCUM)

	// rax and rbx are free again here; they now hold the addresses of
	// alpha and beta.
	mov(var(alpha), rax)  // load address of alpha
	mov(var(beta), rbx)  // load address of beta
	vbroadcastss(mem(rax), xmm0)  // load alpha and duplicate
	vbroadcastss(mem(rbx), xmm3)  // load beta and duplicate

	vmulps(xmm0, xmm4, xmm4)  // scale by alpha
	vmulps(xmm0, xmm6, xmm6)
	vmulps(xmm0, xmm8, xmm8)
	vmulps(xmm0, xmm10, xmm10)
	vmulps(xmm0, xmm12, xmm12)
	vmulps(xmm0, xmm14, xmm14)

	mov(var(cs_c), rsi)  // load cs_c
	lea(mem(, rsi, 4), rsi)  // rsi = cs_c * sizeof(float)

	//lea(mem(rcx, rsi, 4), rdx)  // load address of c + 4*cs_c;
	lea(mem(rcx, rdi, 4), rdx)  // load address of c + 4*rs_c;

	lea(mem(rsi, rsi, 2), rax)  // rax = 3*cs_c;
	// rbx and rbp are computed here but not used by the stores below.
	lea(mem(rsi, rsi, 4), rbx)  // rbx = 5*cs_c;
	lea(mem(rax, rsi, 4), rbp)  // rbp = 7*cs_c;

	// now avoid loading C if beta == 0

	vxorps(xmm0, xmm0, xmm0)  // set xmm0 to zero.
	vucomiss(xmm0, xmm3)  // set ZF if beta == 0.
	je(.SBETAZERO)  // if ZF = 1, jump to beta == 0 case

	cmp(imm(4), rdi)  // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORED)  // jump to column storage case

	label(.SROWSTORED)

	// Row-stored c: for each row, row += beta * c_row, then store the
	// four contiguous floats back and step rcx to the next row.

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
	vmovups(xmm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
	vmovups(xmm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
	vmovups(xmm8, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
	vmovups(xmm10, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12)
	vmovups(xmm12, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14)
	vmovups(xmm14, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.SDONE)  // jump to end.

	label(.SCOLSTORED)

	// Column-stored c: rows 0-3 (xmm4/6/8/10) are transposed in
	// registers with unpack/shuffle/blend so that every column of the
	// tile can be updated with a single 16-byte access.

	// begin I/O on columns 0-3
	vunpcklps(xmm6, xmm4, xmm0)
	vunpcklps(xmm10, xmm8, xmm1)
	vshufps(imm(0x4e), xmm1, xmm0, xmm2)
	vblendps(imm(0xcc), xmm2, xmm0, xmm0)
	vblendps(imm(0x33), xmm2, xmm1, xmm1)

	vfmadd231ps(mem(rcx ), xmm3, xmm0)
	vmovups(xmm0, mem(rcx ))  // store ( gamma00..gamma30 )
	vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
	vmovups(xmm1, mem(rcx, rsi, 1))  // store ( gamma01..gamma31 )

	vunpckhps(xmm6, xmm4, xmm0)
	vunpckhps(xmm10, xmm8, xmm1)
	vshufps(imm(0x4e), xmm1, xmm0, xmm2)
	vblendps(imm(0xcc), xmm2, xmm0, xmm0)
	vblendps(imm(0x33), xmm2, xmm1, xmm1)

	vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
	vmovups(xmm0, mem(rcx, rsi, 2))  // store ( gamma02..gamma32 )
	vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1)
	vmovups(xmm1, mem(rcx, rax, 1))  // store ( gamma03..gamma33 )

	//lea(mem(rcx, rsi, 8), rcx)  // rcx += 8*cs_c

	// Rows 4-5 (xmm12/xmm14): interleave the two rows so that each
	// 8-byte half holds the two elements of one column, then update the
	// columns two at a time through rdx (c + 4*rs_c).
	vunpcklps(xmm14, xmm12, xmm0)
	vmovlpd(mem(rdx ), xmm1, xmm1)
	vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rdx ))  // store ( gamma40..gamma50 )
	vmovhpd(xmm0, mem(rdx, rsi, 1))  // store ( gamma41..gamma51 )

	vunpckhps(xmm14, xmm12, xmm0)
	vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)
	vmovhpd(mem(rdx, rax, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rdx, rsi, 2))  // store ( gamma42..gamma52 )
	vmovhpd(xmm0, mem(rdx, rax, 1))  // store ( gamma43..gamma53 )

	//lea(mem(rdx, rsi, 8), rdx)  // rdx += 8*cs_c

	jmp(.SDONE)  // jump to end.

	label(.SBETAZERO)

	// beta == 0: same store patterns as above, but c is never read.

	cmp(imm(4), rdi)  // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORBZ)  // jump to column storage case

	label(.SROWSTORBZ)

	vmovups(xmm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(xmm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(xmm8, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(xmm10, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(xmm12, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(xmm14, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.SDONE)  // jump to end.

	label(.SCOLSTORBZ)

	// begin I/O on columns 0-3
	vunpcklps(xmm6, xmm4, xmm0)
	vunpcklps(xmm10, xmm8, xmm1)
	vshufps(imm(0x4e), xmm1, xmm0, xmm2)
	vblendps(imm(0xcc), xmm2, xmm0, xmm0)
	vblendps(imm(0x33), xmm2, xmm1, xmm1)

	vmovups(xmm0, mem(rcx ))  // store ( gamma00..gamma30 )
	vmovups(xmm1, mem(rcx, rsi, 1))  // store ( gamma01..gamma31 )

	vunpckhps(xmm6, xmm4, xmm0)
	vunpckhps(xmm10, xmm8, xmm1)
	vshufps(imm(0x4e), xmm1, xmm0, xmm2)
	vblendps(imm(0xcc), xmm2, xmm0, xmm0)
	vblendps(imm(0x33), xmm2, xmm1, xmm1)

	vmovups(xmm0, mem(rcx, rsi, 2))  // store ( gamma02..gamma32 )
	vmovups(xmm1, mem(rcx, rax, 1))  // store ( gamma03..gamma33 )

	//lea(mem(rcx, rsi, 8), rcx)  // rcx += 8*cs_c

	vunpcklps(xmm14, xmm12, xmm0)
	vmovlpd(xmm0, mem(rdx ))  // store ( gamma40..gamma50 )
	vmovhpd(xmm0, mem(rdx, rsi, 1))  // store ( gamma41..gamma51 )

	vunpckhps(xmm14, xmm12, xmm0)
	vmovlpd(xmm0, mem(rdx, rsi, 2))  // store ( gamma42..gamma52 )
	vmovhpd(xmm0, mem(rdx, rax, 1))  // store ( gamma43..gamma53 )

	//lea(mem(rdx, rsi, 8), rdx)  // rdx += 8*cs_c

	label(.SDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

/*
   bli_sgemmsup_rv_haswell_asm_5x4()

   Computes, for one 5x4 tile of single-precision C,

       C := beta * C + alpha * A * B

   where A is 5 x k0 and B is k0 x 4.

   The structure matches the 6x4 kernel with one row fewer:
   - xmm4, xmm6, xmm8, xmm10 and xmm12 accumulate rows 0-4 (r15 = 5*rs_a
     is still computed but no instruction uses it).
   - The k loop is unrolled four times (k0 / 4 trips) and followed by a
     k0 % 4 remainder loop.
   - The accumulators are scaled by alpha; c is only read when beta does
     not compare equal to zero.
   - If rs_c == 1, rows 0-3 are transposed in registers and written one
     column at a time, and row 4 is written one float at a time.
     Otherwise each row is written as four contiguous floats.

   Parameters read by the code: k0, alpha, a, rs_a0, cs_a0, b, rs_b0, beta,
   c, rs_c0, cs_c0. conja, conjb, m0, n0, data and cntx are not referenced;
   cs_b0 is copied into a local and passed as an asm operand, but no
   instruction loads it.
*/
void bli_sgemmsup_rv_haswell_asm_5x4
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall()  // zero all xmm/ymm registers.

	mov(var(a), rax)  // load address of a.
	mov(var(rs_a), r8)  // load rs_a
	mov(var(cs_a), r9)  // load cs_a
	lea(mem(, r8, 4), r8)  // rs_a *= sizeof(float)
	lea(mem(, r9, 4), r9)  // cs_a *= sizeof(float)

	lea(mem(r8, r8, 2), r13)  // r13 = 3*rs_a
	lea(mem(r8, r8, 4), r15)  // r15 = 5*rs_a

	mov(var(b), rbx)  // load address of b.
	mov(var(rs_b), r10)  // load rs_b
	//mov(var(cs_b), r11)  // load cs_b
	lea(mem(, r10, 4), r10)  // rs_b *= sizeof(float)
	//lea(mem(, r11, 4), r11)  // cs_b *= sizeof(float)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), rcx)  // load address of c
	mov(var(rs_c), rdi)  // load rs_c
	lea(mem(, rdi, 4), rdi)  // rs_c *= sizeof(float)

	// rdi == 4 bytes means rs_c == 1, i.e. column-stored c.
	cmp(imm(4), rdi)  // set ZF if (4*rs_c) == 4.
	jz(.SCOLPFETCH)  // jump to column storage case
	label(.SROWPFETCH)  // row-stored prefetching on c

	lea(mem(rcx, rdi, 2), rdx)  // rdx = c + 2*rs_c;
	lea(mem(rdx, rdi, 1), rdx)  // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 3*8))  // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8))  // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 3*8))  // prefetch c + 2*rs_c
	prefetch(0, mem(rdx, 3*8))  // prefetch c + 3*rs_c
	prefetch(0, mem(rdx, rdi, 1, 3*8))  // prefetch c + 4*rs_c

	jmp(.SPOSTPFETCH)  // jump to end of prefetching c
	label(.SCOLPFETCH)  // column-stored prefetching c

	mov(var(cs_c), rsi)  // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 4), rsi)  // cs_c *= sizeof(float)
	lea(mem(rsi, rsi, 2), rbp)  // rbp = 3*cs_c;
	// NOTE(review): the 4*8 byte displacement looks sized for 8-byte
	// elements (a column of 5 floats spans 20 bytes). A prefetch is only
	// a hint, so results are unaffected -- confirm the intended offset.
	prefetch(0, mem(rcx, 4*8))  // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 4*8))  // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 4*8))  // prefetch c + 2*cs_c
	prefetch(0, mem(rcx, rbp, 1, 4*8))  // prefetch c + 3*cs_c

	label(.SPOSTPFETCH)  // done prefetching c

	// rdx is reused from here on as a prefetch pointer into a.
#if 1
	lea(mem(rax, r9, 8), rdx)  // rdx = a + 8*cs_a;
	lea(mem(rdx, r9, 8), rdx)  // rdx = a + 16*cs_a;
#endif

	mov(var(k_iter), rsi)  // i = k_iter;
	test(rsi, rsi)  // check i via logical AND.
	je(.SCONSIDKLEFT)  // if i == 0, jump to code that
	                   // contains the k_left loop.

	label(.SLOOPKITER)  // MAIN LOOP

	// Every iteration below performs one k step: load one row of b
	// (4 floats) into xmm0, broadcast a(i,k) for i = 0..4, and FMA into
	// the accumulator of row i.

	// ---------------------------------- iteration 0

#if 1
	prefetch(0, mem(rdx, 5*8))
#endif

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 4), xmm2)
	add(r9, rax)  // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm12)

	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 4), xmm2)
	add(r9, rax)  // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm12)

	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 4), xmm2)
	add(r9, rax)  // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm12)

	// ---------------------------------- iteration 3

#if 1
	lea(mem(rdx, r9, 4), rdx)  // a_prefetch += 4*cs_a;
#endif

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 4), xmm2)
	add(r9, rax)  // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm12)

	dec(rsi)  // i -= 1;
	jne(.SLOOPKITER)  // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi)  // i = k_left;
	test(rsi, rsi)  // check i via logical AND.
	je(.SPOSTACCUM)  // if i == 0, we're done; jump to end.
	                 // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT)  // EDGE LOOP

#if 0
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)
#endif

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 4), xmm2)
	add(r9, rax)  // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm12)

	dec(rsi)  // i -= 1;
	jne(.SLOOPKLEFT)  // iterate again if i != 0.

	label(.SPOSTACCUM)

	// rax and rbx are free again here; they now hold the addresses of
	// alpha and beta.
	mov(var(alpha), rax)  // load address of alpha
	mov(var(beta), rbx)  // load address of beta
	vbroadcastss(mem(rax), xmm0)  // load alpha and duplicate
	vbroadcastss(mem(rbx), xmm3)  // load beta and duplicate

	vmulps(xmm0, xmm4, xmm4)  // scale by alpha
	vmulps(xmm0, xmm6, xmm6)
	vmulps(xmm0, xmm8, xmm8)
	vmulps(xmm0, xmm10, xmm10)
	vmulps(xmm0, xmm12, xmm12)

	mov(var(cs_c), rsi)  // load cs_c
	lea(mem(, rsi, 4), rsi)  // rsi = cs_c * sizeof(float)

	//lea(mem(rcx, rsi, 4), rdx)  // load address of c + 4*cs_c;
	lea(mem(rcx, rdi, 4), rdx)  // load address of c + 4*rs_c;

	lea(mem(rsi, rsi, 2), rax)  // rax = 3*cs_c;
	// rbx and rbp are computed here but not used by the stores below.
	lea(mem(rsi, rsi, 4), rbx)  // rbx = 5*cs_c;
	lea(mem(rax, rsi, 4), rbp)  // rbp = 7*cs_c;

	// now avoid loading C if beta == 0

	vxorps(xmm0, xmm0, xmm0)  // set xmm0 to zero.
	vucomiss(xmm0, xmm3)  // set ZF if beta == 0.
	je(.SBETAZERO)  // if ZF = 1, jump to beta == 0 case

	cmp(imm(4), rdi)  // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORED)  // jump to column storage case

	label(.SROWSTORED)

	// Row-stored c: for each row, row += beta * c_row, then store the
	// four contiguous floats back and step rcx to the next row.

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
	vmovups(xmm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
	vmovups(xmm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
	vmovups(xmm8, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
	vmovups(xmm10, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12)
	vmovups(xmm12, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.SDONE)  // jump to end.

	label(.SCOLSTORED)

	// Column-stored c: rows 0-3 (xmm4/6/8/10) are transposed in
	// registers with unpack/shuffle/blend so that every column of the
	// tile can be updated with a single 16-byte access.

	// begin I/O on columns 0-3
	vunpcklps(xmm6, xmm4, xmm0)
	vunpcklps(xmm10, xmm8, xmm1)
	vshufps(imm(0x4e), xmm1, xmm0, xmm2)
	vblendps(imm(0xcc), xmm2, xmm0, xmm0)
	vblendps(imm(0x33), xmm2, xmm1, xmm1)

	vfmadd231ps(mem(rcx ), xmm3, xmm0)
	vmovups(xmm0, mem(rcx ))  // store ( gamma00..gamma30 )
	vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
	vmovups(xmm1, mem(rcx, rsi, 1))  // store ( gamma01..gamma31 )

	vunpckhps(xmm6, xmm4, xmm0)
	vunpckhps(xmm10, xmm8, xmm1)
	vshufps(imm(0x4e), xmm1, xmm0, xmm2)
	vblendps(imm(0xcc), xmm2, xmm0, xmm0)
	vblendps(imm(0x33), xmm2, xmm1, xmm1)

	vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
	vmovups(xmm0, mem(rcx, rsi, 2))  // store ( gamma02..gamma32 )
	vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1)
	vmovups(xmm1, mem(rcx, rax, 1))  // store ( gamma03..gamma33 )

	//lea(mem(rcx, rsi, 8), rcx)  // rcx += 8*cs_c

	// Row 4 (xmm12) is handled one element at a time: each vpermilps
	// moves one element of the row into lane 0, and vmovss loads/stores
	// only that lane. xmm4 and xmm6 are reused as scratch here; rows 0-3
	// have already been written above.
	vmovups(xmm12, xmm0)

	vpermilps(imm(0xe4), xmm0, xmm2)  // lane 0 <- element 0
	vpermilps(imm(0x39), xmm0, xmm4)  // lane 0 <- element 1
	vmovss(mem(rdx ), xmm1)
	vmovss(mem(rdx, rsi, 1), xmm6)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vfmadd231ps(xmm6, xmm3, xmm4)
	vmovss(xmm2, mem(rdx ))  // store ( gamma40 )
	vmovss(xmm4, mem(rdx, rsi, 1))  // store ( gamma41 )

	vpermilps(imm(0x4e), xmm0, xmm2)  // lane 0 <- element 2
	vpermilps(imm(0x93), xmm0, xmm4)  // lane 0 <- element 3
	vmovss(mem(rdx, rsi, 2), xmm1)
	vmovss(mem(rdx, rax, 1), xmm6)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vfmadd231ps(xmm6, xmm3, xmm4)
	vmovss(xmm2, mem(rdx, rsi, 2))  // store ( gamma42 )
	vmovss(xmm4, mem(rdx, rax, 1))  // store ( gamma43 )

	//lea(mem(rdx, rsi, 8), rdx)  // rdx += 8*cs_c

	jmp(.SDONE)  // jump to end.

	label(.SBETAZERO)

	// beta == 0: same store patterns as above, but c is never read.

	cmp(imm(4), rdi)  // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORBZ)  // jump to column storage case

	label(.SROWSTORBZ)

	vmovups(xmm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(xmm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(xmm8, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(xmm10, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(xmm12, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.SDONE)  // jump to end.

	label(.SCOLSTORBZ)

	// begin I/O on columns 0-3
	vunpcklps(xmm6, xmm4, xmm0)
	vunpcklps(xmm10, xmm8, xmm1)
	vshufps(imm(0x4e), xmm1, xmm0, xmm2)
	vblendps(imm(0xcc), xmm2, xmm0, xmm0)
	vblendps(imm(0x33), xmm2, xmm1, xmm1)

	vmovups(xmm0, mem(rcx ))  // store ( gamma00..gamma30 )
	vmovups(xmm1, mem(rcx, rsi, 1))  // store ( gamma01..gamma31 )

	vunpckhps(xmm6, xmm4, xmm0)
	vunpckhps(xmm10, xmm8, xmm1)
	vshufps(imm(0x4e), xmm1, xmm0, xmm2)
	vblendps(imm(0xcc), xmm2, xmm0, xmm0)
	vblendps(imm(0x33), xmm2, xmm1, xmm1)

	vmovups(xmm0, mem(rcx, rsi, 2))  // store ( gamma02..gamma32 )
	vmovups(xmm1, mem(rcx, rax, 1))  // store ( gamma03..gamma33 )

	//lea(mem(rcx, rsi, 8), rcx)  // rcx += 8*cs_c

	vmovups(xmm12, xmm0)

	vpermilps(imm(0xe4), xmm0, xmm2)  // lane 0 <- element 0
	vpermilps(imm(0x39), xmm0, xmm4)  // lane 0 <- element 1
	vmovss(xmm2, mem(rdx ))  // store ( gamma40 )
	vmovss(xmm4, mem(rdx, rsi, 1))  // store ( gamma41 )

	vpermilps(imm(0x4e), xmm0, xmm2)  // lane 0 <- element 2
	vpermilps(imm(0x93), xmm0, xmm4)  // lane 0 <- element 3
	vmovss(xmm2, mem(rdx, rsi, 2))  // store ( gamma42 )
	vmovss(xmm4, mem(rdx, rax, 1))  // store ( gamma43 )

	//lea(mem(rdx, rsi, 8), rdx)  // rdx += 8*cs_c

	label(.SDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

/*
   bli_sgemmsup_rv_haswell_asm_4x4()

   Computes, for one 4x4 tile of single-precision C,

       C := beta * C + alpha * A * B

   where A is 4 x k0 and B is k0 x 4.

   The structure matches the 6x4 kernel with two rows fewer:
   - xmm4, xmm6, xmm8 and xmm10 accumulate rows 0-3. r15 (5*rs_a) and,
     after the k loops, rdx (c + 4*rs_c) are still computed but no
     instruction uses them.
   - The k loop is unrolled four times (k0 / 4 trips) and followed by a
     k0 % 4 remainder loop.
   - The accumulators are scaled by alpha; c is only read when beta does
     not compare equal to zero.
   - If rs_c == 1, the tile is transposed in registers and written one
     column at a time. Otherwise each row is written as four contiguous
     floats.

   Parameters read by the code: k0, alpha, a, rs_a0, cs_a0, b, rs_b0, beta,
   c, rs_c0, cs_c0. conja, conjb, m0, n0, data and cntx are not referenced;
   cs_b0 is copied into a local and passed as an asm operand, but no
   instruction loads it.
*/
void bli_sgemmsup_rv_haswell_asm_4x4
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall()  // zero all xmm/ymm registers.

	mov(var(a), rax)  // load address of a.
	mov(var(rs_a), r8)  // load rs_a
	mov(var(cs_a), r9)  // load cs_a
	lea(mem(, r8, 4), r8)  // rs_a *= sizeof(float)
	lea(mem(, r9, 4), r9)  // cs_a *= sizeof(float)

	lea(mem(r8, r8, 2), r13)  // r13 = 3*rs_a
	lea(mem(r8, r8, 4), r15)  // r15 = 5*rs_a

	mov(var(b), rbx)  // load address of b.
	mov(var(rs_b), r10)  // load rs_b
	//mov(var(cs_b), r11)  // load cs_b
	lea(mem(, r10, 4), r10)  // rs_b *= sizeof(float)
	//lea(mem(, r11, 4), r11)  // cs_b *= sizeof(float)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), rcx)  // load address of c
	mov(var(rs_c), rdi)  // load rs_c
	lea(mem(, rdi, 4), rdi)  // rs_c *= sizeof(float)

	// rdi == 4 bytes means rs_c == 1, i.e. column-stored c.
	cmp(imm(4), rdi)  // set ZF if (4*rs_c) == 4.
	jz(.SCOLPFETCH)  // jump to column storage case
	label(.SROWPFETCH)  // row-stored prefetching on c

	lea(mem(rcx, rdi, 2), rdx)  // rdx = c + 2*rs_c;
	lea(mem(rdx, rdi, 1), rdx)  // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 3*8))  // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8))  // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 3*8))  // prefetch c + 2*rs_c
	prefetch(0, mem(rdx, 3*8))  // prefetch c + 3*rs_c

	jmp(.SPOSTPFETCH)  // jump to end of prefetching c
	label(.SCOLPFETCH)  // column-stored prefetching c

	mov(var(cs_c), rsi)  // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 4), rsi)  // cs_c *= sizeof(float)
	lea(mem(rsi, rsi, 2), rbp)  // rbp = 3*cs_c;
	// NOTE(review): the 3*8 byte displacement looks sized for 8-byte
	// elements (a column of 4 floats spans 16 bytes). A prefetch is only
	// a hint, so results are unaffected -- confirm the intended offset.
	prefetch(0, mem(rcx, 3*8))  // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 3*8))  // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 3*8))  // prefetch c + 2*cs_c
	prefetch(0, mem(rcx, rbp, 1, 3*8))  // prefetch c + 3*cs_c

	label(.SPOSTPFETCH)  // done prefetching c

	// rdx is reused from here on as a prefetch pointer into a.
#if 1
	lea(mem(rax, r9, 8), rdx)  // rdx = a + 8*cs_a;
	lea(mem(rdx, r9, 8), rdx)  // rdx = a + 16*cs_a;
#endif

	mov(var(k_iter), rsi)  // i = k_iter;
	test(rsi, rsi)  // check i via logical AND.
	je(.SCONSIDKLEFT)  // if i == 0, jump to code that
	                   // contains the k_left loop.

	label(.SLOOPKITER)  // MAIN LOOP

	// Every iteration below performs one k step: load one row of b
	// (4 floats) into xmm0, broadcast a(i,k) for i = 0..3, and FMA into
	// the accumulator of row i.

	// ---------------------------------- iteration 0

#if 1
	prefetch(0, mem(rdx, 5*8))
#endif

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	add(r9, rax)  // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	add(r9, rax)  // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	add(r9, rax)  // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	// ---------------------------------- iteration 3

#if 1
	lea(mem(rdx, r9, 4), rdx)  // a_prefetch += 4*cs_a;
#endif

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	add(r9, rax)  // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	dec(rsi)  // i -= 1;
	jne(.SLOOPKITER)  // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi)  // i = k_left;
	test(rsi, rsi)  // check i via logical AND.
	je(.SPOSTACCUM)  // if i == 0, we're done; jump to end.
	                 // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT)  // EDGE LOOP

#if 0
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)
#endif

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	add(r9, rax)  // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	dec(rsi)  // i -= 1;
	jne(.SLOOPKLEFT)  // iterate again if i != 0.

	label(.SPOSTACCUM)

	// rax and rbx are free again here; they now hold the addresses of
	// alpha and beta.
	mov(var(alpha), rax)  // load address of alpha
	mov(var(beta), rbx)  // load address of beta
	vbroadcastss(mem(rax), xmm0)  // load alpha and duplicate
	vbroadcastss(mem(rbx), xmm3)  // load beta and duplicate

	vmulps(xmm0, xmm4, xmm4)  // scale by alpha
	vmulps(xmm0, xmm6, xmm6)
	vmulps(xmm0, xmm8, xmm8)
	vmulps(xmm0, xmm10, xmm10)

	mov(var(cs_c), rsi)  // load cs_c
	lea(mem(, rsi, 4), rsi)  // rsi = cs_c * sizeof(float)

	//lea(mem(rcx, rsi, 4), rdx)  // load address of c + 4*cs_c;
	lea(mem(rcx, rdi, 4), rdx)  // load address of c + 4*rs_c;

	lea(mem(rsi, rsi, 2), rax)  // rax = 3*cs_c;
	// rbx and rbp are computed here but not used by the stores below.
	lea(mem(rsi, rsi, 4), rbx)  // rbx = 5*cs_c;
	lea(mem(rax, rsi, 4), rbp)  // rbp = 7*cs_c;

	// now avoid loading C if beta == 0

	vxorps(xmm0, xmm0, xmm0)  // set xmm0 to zero.
	vucomiss(xmm0, xmm3)  // set ZF if beta == 0.
	je(.SBETAZERO)  // if ZF = 1, jump to beta == 0 case

	cmp(imm(4), rdi)  // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORED)  // jump to column storage case

	label(.SROWSTORED)

	// Row-stored c: for each row, row += beta * c_row, then store the
	// four contiguous floats back and step rcx to the next row.

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
	vmovups(xmm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
	vmovups(xmm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8)
	vmovups(xmm8, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10)
	vmovups(xmm10, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.SDONE)  // jump to end.

	label(.SCOLSTORED)

	// Column-stored c: rows 0-3 (xmm4/6/8/10) are transposed in
	// registers with unpack/shuffle/blend so that every column of the
	// tile can be updated with a single 16-byte access.

	// begin I/O on columns 0-3
	vunpcklps(xmm6, xmm4, xmm0)
	vunpcklps(xmm10, xmm8, xmm1)
	vshufps(imm(0x4e), xmm1, xmm0, xmm2)
	vblendps(imm(0xcc), xmm2, xmm0, xmm0)
	vblendps(imm(0x33), xmm2, xmm1, xmm1)

	vfmadd231ps(mem(rcx ), xmm3, xmm0)
	vmovups(xmm0, mem(rcx ))  // store ( gamma00..gamma30 )
	vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
	vmovups(xmm1, mem(rcx, rsi, 1))  // store ( gamma01..gamma31 )

	vunpckhps(xmm6, xmm4, xmm0)
	vunpckhps(xmm10, xmm8, xmm1)
	vshufps(imm(0x4e), xmm1, xmm0, xmm2)
	vblendps(imm(0xcc), xmm2, xmm0, xmm0)
	vblendps(imm(0x33), xmm2, xmm1, xmm1)

	vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
	vmovups(xmm0, mem(rcx, rsi, 2))  // store ( gamma02..gamma32 )
	vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1)
	vmovups(xmm1, mem(rcx, rax, 1))  // store ( gamma03..gamma33 )

	//lea(mem(rcx, rsi, 8), rcx)  // rcx += 8*cs_c

	jmp(.SDONE)  // jump to end.

	label(.SBETAZERO)

	// beta == 0: same store patterns as above, but c is never read.

	cmp(imm(4), rdi)  // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORBZ)  // jump to column storage case

	label(.SROWSTORBZ)

	vmovups(xmm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(xmm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(xmm8, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(xmm10, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.SDONE)  // jump to end.

	label(.SCOLSTORBZ)

	// begin I/O on columns 0-3
	vunpcklps(xmm6, xmm4, xmm0)
	vunpcklps(xmm10, xmm8, xmm1)
	vshufps(imm(0x4e), xmm1, xmm0, xmm2)
	vblendps(imm(0xcc), xmm2, xmm0, xmm0)
	vblendps(imm(0x33), xmm2, xmm1, xmm1)

	vmovups(xmm0, mem(rcx ))  // store ( gamma00..gamma30 )
	vmovups(xmm1, mem(rcx, rsi, 1))  // store ( gamma01..gamma31 )

	vunpckhps(xmm6, xmm4, xmm0)
	vunpckhps(xmm10, xmm8, xmm1)
	vshufps(imm(0x4e), xmm1, xmm0, xmm2)
	vblendps(imm(0xcc), xmm2, xmm0, xmm0)
	vblendps(imm(0x33), xmm2, xmm1, xmm1)

	vmovups(xmm0, mem(rcx, rsi, 2))  // store ( gamma02..gamma32 )
	vmovups(xmm1, mem(rcx, rax, 1))  // store ( gamma03..gamma33 )

	//lea(mem(rcx, rsi, 8), rcx)  // rcx += 8*cs_c

	label(.SDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

void bli_sgemmsup_rv_haswell_asm_3x4
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 2*8)) // prefetch c + 3*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-3 vunpcklps(xmm6, xmm4, xmm0) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vunpckhps(xmm6, xmm4, xmm0) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(xmm8, xmm0) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma20 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma21 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rdx, rsi, 2), xmm1) vmovss(mem(rdx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma22 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma23 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) vmovups(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ)

    // begin I/O on columns 0-3

    vunpcklps(xmm6, xmm4, xmm0)
    vmovlpd(xmm0, mem(rcx ))           // store ( gamma00..gamma10 )
    vmovhpd(xmm0, mem(rcx, rsi, 1))    // store ( gamma01..gamma11 )

    vunpckhps(xmm6, xmm4, xmm0)
    vmovlpd(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma12 )
    vmovhpd(xmm0, mem(rcx, rax, 1))    // store ( gamma03..gamma13 )

    //lea(mem(rcx, rsi, 8), rcx)       // rcx += 8*cs_c

    vmovups(xmm8, xmm0)

    vpermilps(imm(0xe4), xmm0, xmm2)
    vpermilps(imm(0x39), xmm0, xmm4)
    vmovss(xmm2, mem(rdx ))            // store ( gamma20 )
    vmovss(xmm4, mem(rdx, rsi, 1))     // store ( gamma21 )

    vpermilps(imm(0x4e), xmm0, xmm2)
    vpermilps(imm(0x93), xmm0, xmm4)
    vmovss(xmm2, mem(rdx, rsi, 2))     // store ( gamma22 )
    vmovss(xmm4, mem(rdx, rax, 1))     // store ( gamma23 )

    //lea(mem(rcx, rsi, 8), rcx)       // rcx += 8*cs_c

    label(.SDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a]      "m" (a),
      [rs_a]   "m" (rs_a),
      [cs_a]   "m" (cs_a),
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
      [alpha]  "m" (alpha),
      [beta]   "m" (beta),
      [c]      "m" (c),
      [rs_c]   "m" (rs_c),
      [cs_c]   "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

/*
   bli_sgemmsup_rv_haswell_asm_2x4

   Single-precision kernel that updates a 2x4 tile of C as

       C := beta * C + alpha * A * B

   The k0-long inner product is accumulated in two 128-bit registers:
   xmm4 holds row 0 of the tile and xmm6 holds row 1. Each k step loads
   four consecutive floats of B and multiplies them by one broadcast
   element of A per row.

   Parameters, as used by the code below:
   - conja, conjb, m0, n0, data, cntx: not referenced by this body.
   - k0: split into k_iter = k0 / 4 passes of the 4x-unrolled main loop
     plus k_left = k0 % 4 passes of the edge loop.
   - alpha, beta: pointers to the scalars; each is broadcast from memory.
   - a, rs_a0, cs_a0: A and its strides in elements (scaled by 4 bytes in
     the asm). Rows are read at a and a + rs_a; a advances by cs_a per k.
   - b, rs_b0: each k step reads 4 consecutive floats at b, then b
     advances by rs_b. cs_b0 is copied to a local and listed as an asm
     input, but the instructions that would load it are commented out.
     NOTE(review): this presumably relies on B having unit column
     stride -- confirm against the caller.
   - c, rs_c0, cs_c0: C and its strides in elements. rs_c == 1 selects
     the column-stored output path; any other value selects the
     row-stored path, which writes 4 consecutive floats per row.
     NOTE(review): the row-stored path never reads cs_c, so it
     presumably expects cs_c == 1 -- confirm against the caller.

   Operand order in the asm macros follows the existing inline comments
   (e.g. add(r10, rbx) is documented as "b += rs_b"): sources first,
   destination last.
*/
void bli_sgemmsup_rv_haswell_asm_2x4
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void*    a_next = bli_auxinfo_next_a( data );
    //void*    b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    // k_iter counts passes through the 4x-unrolled main loop; k_left
    // counts the remaining single k steps handled by the edge loop.
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;

    uint64_t rs_a   = rs_a0;
    uint64_t cs_a   = cs_a0;
    uint64_t rs_b   = rs_b0;
    uint64_t cs_b   = cs_b0;
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    // Clearing every vector register zeroes the accumulators xmm4/xmm6.
    vzeroall()                         // zero all xmm/ymm registers.

    mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
    lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)

    // NOTE(review): r13 and r15 are computed here but are not referenced
    // again anywhere in this kernel.
    lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
    mov(var(rs_b), r10)                // load rs_b
    //mov(var(cs_b), r11)              // load cs_b
    lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
    //lea(mem(, r11, 4), r11)          // cs_b *= sizeof(float)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds of allocated memory
    // (the likely result: a segmentation fault).

    mov(var(c), rcx)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)

    // rdi holds rs_c in bytes, so comparing against 4 tests rs_c == 1,
    // which this kernel treats as column-stored C.
    cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
    jz(.SCOLPFETCH)                    // jump to column storage case
    label(.SROWPFETCH)                 // row-stored prefetching on c

    // NOTE(review): rdx is overwritten after .SPOSTPFETCH before it is
    // read, so the value computed by these two lea's is not used.
    lea(mem(rcx, rdi, 2), rdx)         //
    lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c

    jmp(.SPOSTPFETCH)                  // jump to end of prefetching c
    label(.SCOLPFETCH)                 // column-stored prefetching c

    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 4), rsi)            // cs_c *= sizeof(float)
    lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
    prefetch(0, mem(rcx, 1*8))         // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(rcx, rbp, 1, 1*8)) // prefetch c + 3*cs_c

    label(.SPOSTPFETCH)                // done prefetching c

    // rdx becomes the prefetch pointer into A, 16 columns ahead of rax.
#if 1
    lea(mem(rax, r9, 8), rdx)          //
    lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

    mov(var(k_iter), rsi)              // i = k_iter;
    test(rsi, rsi)                     // check i via logical AND.
    je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
                                       // contains the k_left loop.

    label(.SLOOPKITER)                 // MAIN LOOP

    // Each of the four unrolled iterations performs one k step:
    //   xmm0 = four consecutive floats of the current row of B
    //   xmm2 = broadcast of the row-0 element of A (at rax)
    //   xmm3 = broadcast of the row-1 element of A (at rax + rs_a)
    //   xmm4 += xmm2 * xmm0;  xmm6 += xmm3 * xmm0
    // Only the prefetch handling differs between iterations.

    // ---------------------------------- iteration 0

#if 1
    prefetch(0, mem(rdx, 5*8))
#endif

    vmovups(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastss(mem(rax ), xmm2)
    vbroadcastss(mem(rax, r8, 1), xmm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231ps(xmm0, xmm2, xmm4)
    vfmadd231ps(xmm0, xmm3, xmm6)

    // ---------------------------------- iteration 1

    // The prefetch for this iteration is compiled out.
#if 0
    prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

    vmovups(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastss(mem(rax ), xmm2)
    vbroadcastss(mem(rax, r8, 1), xmm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231ps(xmm0, xmm2, xmm4)
    vfmadd231ps(xmm0, xmm3, xmm6)

    // ---------------------------------- iteration 2

#if 1
    prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

    vmovups(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastss(mem(rax ), xmm2)
    vbroadcastss(mem(rax, r8, 1), xmm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231ps(xmm0, xmm2, xmm4)
    vfmadd231ps(xmm0, xmm3, xmm6)

    // ---------------------------------- iteration 3

    // Advance the A prefetch pointer once per pass of the unrolled loop.
#if 1
    lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

    vmovups(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastss(mem(rax ), xmm2)
    vbroadcastss(mem(rax, r8, 1), xmm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231ps(xmm0, xmm2, xmm4)
    vfmadd231ps(xmm0, xmm3, xmm6)

    dec(rsi)                           // i -= 1;
    jne(.SLOOPKITER)                   // iterate again if i != 0.

    label(.SCONSIDKLEFT)

    mov(var(k_left), rsi)              // i = k_left;
    test(rsi, rsi)                     // check i via logical AND.
    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left loop.

    label(.SLOOPKLEFT)                 // EDGE LOOP

    // Same k step as the main loop, executed k_left times; the prefetch
    // below is compiled out.
#if 0
    prefetch(0, mem(rdx, 5*8))
    add(r9, rdx)
#endif

    vmovups(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastss(mem(rax ), xmm2)
    vbroadcastss(mem(rax, r8, 1), xmm3)
    add(r9, rax)                       // a += cs_a;
    vfmadd231ps(xmm0, xmm2, xmm4)
    vfmadd231ps(xmm0, xmm3, xmm6)

    dec(rsi)                           // i -= 1;
    jne(.SLOOPKLEFT)                   // iterate again if i != 0.

    label(.SPOSTACCUM)

    // rax and rbx are reused from here on: the A and B pointers are no
    // longer needed once accumulation is finished.
    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate

    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
    vmulps(xmm0, xmm6, xmm6)

    mov(var(cs_c), rsi)                // load cs_c
    lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)

    //lea(mem(rcx, rsi, 4), rdx)       // load address of c + 4*cs_c;
    //lea(mem(rcx, rdi, 2), rdx)       // load address of c + 4*rs_c;

    // rax = 3*cs_c addresses column 3 in the column-stored paths.
    // NOTE(review): rbx and rbp are computed here but are not referenced
    // again in this kernel.
    lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
    lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
    lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;

                                       // now avoid loading C if beta == 0

    vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
    vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
    jz(.SCOLSTORED)                    // jump to column storage case

    label(.SROWSTORED)

    // Row-stored, beta != 0: for each row, add beta * (4 floats of C) to
    // the accumulator, store the result back, then step rcx by rs_c.
    vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
    vmovups(xmm4, mem(rcx, 0*32))
    add(rdi, rcx)

    vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6)
    vmovups(xmm6, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.SDONE)                        // jump to end.

    label(.SCOLSTORED)

    // begin I/O on columns 0-3

    // Column-stored, beta != 0: interleave the two accumulator rows so
    // that each 64-bit half of xmm0 holds one column's two values, load
    // the matching two-float pieces of C into xmm1, add beta * xmm1, and
    // write each half back to its column.
    vunpcklps(xmm6, xmm4, xmm0)
    vmovlpd(mem(rcx ), xmm1, xmm1)
    vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1)
    vfmadd231ps(xmm1, xmm3, xmm0)
    vmovlpd(xmm0, mem(rcx ))           // store ( gamma00..gamma10 )
    vmovhpd(xmm0, mem(rcx, rsi, 1))    // store ( gamma01..gamma11 )

    vunpckhps(xmm6, xmm4, xmm0)
    vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1)
    vmovhpd(mem(rcx, rax, 1), xmm1, xmm1)
    vfmadd231ps(xmm1, xmm3, xmm0)
    vmovlpd(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma12 )
    vmovhpd(xmm0, mem(rcx, rax, 1))    // store ( gamma03..gamma13 )

    //lea(mem(rcx, rsi, 8), rcx)       // rcx += 8*cs_c

    jmp(.SDONE)                        // jump to end.

    label(.SBETAZERO)

    // beta == 0: C is never read; the alpha-scaled accumulators are
    // written out directly.
    cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
    jz(.SCOLSTORBZ)                    // jump to column storage case

    label(.SROWSTORBZ)

    vmovups(xmm4, mem(rcx, 0*32))
    add(rdi, rcx)

    vmovups(xmm6, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.SDONE)                        // jump to end.
label(.SCOLSTORBZ)

    // begin I/O on columns 0-3

    vunpcklps(xmm6, xmm4, xmm0)
    vmovlpd(xmm0, mem(rcx ))           // store ( gamma00..gamma10 )
    vmovhpd(xmm0, mem(rcx, rsi, 1))    // store ( gamma01..gamma11 )

    vunpckhps(xmm6, xmm4, xmm0)
    vmovlpd(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma12 )
    vmovhpd(xmm0, mem(rcx, rax, 1))    // store ( gamma03..gamma13 )

    //lea(mem(rcx, rsi, 8), rcx)       // rcx += 8*cs_c

    label(.SDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a]      "m" (a),
      [rs_a]   "m" (rs_a),
      [cs_a]   "m" (cs_a),
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
      [alpha]  "m" (alpha),
      [beta]   "m" (beta),
      [c]      "m" (c),
      [rs_c]   "m" (rs_c),
      [cs_c]   "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

/*
   bli_sgemmsup_rv_haswell_asm_1x4

   Single-precision kernel that updates a 1x4 tile of C as

       C := beta * C + alpha * A * B

   The k0-long inner product is accumulated in the 128-bit register
   xmm4. Each k step loads four consecutive floats of B and multiplies
   them by one broadcast element of A.

   Parameters, as used by the code below:
   - conja, conjb, m0, n0, data, cntx: not referenced by this body.
   - k0: split into k_iter = k0 / 4 passes of the 4x-unrolled main loop
     plus k_left = k0 % 4 passes of the edge loop.
   - alpha, beta: pointers to the scalars; each is broadcast from memory.
   - a, rs_a0, cs_a0: A and its strides in elements (scaled by 4 bytes in
     the asm). One element is read at a per k step, then a advances by
     cs_a. rs_a is only used to derive r13/r15, which are never read.
   - b, rs_b0: each k step reads 4 consecutive floats at b, then b
     advances by rs_b. cs_b0 is copied to a local and listed as an asm
     input, but the instructions that would load it are commented out.
     NOTE(review): this presumably relies on B having unit column
     stride -- confirm against the caller.
   - c, rs_c0, cs_c0: C and its strides in elements. rs_c == 1 selects
     the column-stored output path (one scalar store per column, cs_c
     apart); any other value selects the row-stored path, which writes
     4 consecutive floats.
     NOTE(review): the row-stored path never reads cs_c, so it
     presumably expects cs_c == 1 -- confirm against the caller.

   Operand order in the asm macros follows the existing inline comments
   (e.g. add(r10, rbx) is documented as "b += rs_b"): sources first,
   destination last.
*/
void bli_sgemmsup_rv_haswell_asm_1x4
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void*    a_next = bli_auxinfo_next_a( data );
    //void*    b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    // k_iter counts passes through the 4x-unrolled main loop; k_left
    // counts the remaining single k steps handled by the edge loop.
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;

    uint64_t rs_a   = rs_a0;
    uint64_t cs_a   = cs_a0;
    uint64_t rs_b   = rs_b0;
    uint64_t cs_b   = cs_b0;
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    // Clearing every vector register zeroes the accumulator xmm4.
    vzeroall()                         // zero all xmm/ymm registers.

    mov(var(a), rax)                   // load address of a.
    mov(var(rs_a), r8)                 // load rs_a
    mov(var(cs_a), r9)                 // load cs_a
    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
    lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)

    // NOTE(review): r13 and r15 are computed here but are not referenced
    // again anywhere in this kernel.
    lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
    lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

    mov(var(b), rbx)                   // load address of b.
    mov(var(rs_b), r10)                // load rs_b
    //mov(var(cs_b), r11)              // load cs_b
    lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
    //lea(mem(, r11, 4), r11)          // cs_b *= sizeof(float)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds of allocated memory
    // (the likely result: a segmentation fault).

    mov(var(c), rcx)                   // load address of c
    mov(var(rs_c), rdi)                // load rs_c
    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)

    // rdi holds rs_c in bytes, so comparing against 4 tests rs_c == 1,
    // which this kernel treats as column-stored C.
    cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
    jz(.SCOLPFETCH)                    // jump to column storage case
    label(.SROWPFETCH)                 // row-stored prefetching on c

    // NOTE(review): rdx is overwritten after .SPOSTPFETCH before it is
    // read, so the value computed by these two lea's is not used.
    lea(mem(rcx, rdi, 2), rdx)         //
    lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c

    jmp(.SPOSTPFETCH)                  // jump to end of prefetching c
    label(.SCOLPFETCH)                 // column-stored prefetching c

    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 4), rsi)            // cs_c *= sizeof(float)
    lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
    prefetch(0, mem(rcx, 0*8))         // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(rcx, rbp, 1, 0*8)) // prefetch c + 3*cs_c

    label(.SPOSTPFETCH)                // done prefetching c

    // rdx becomes the prefetch pointer into A, 16 columns ahead of rax.
#if 1
    lea(mem(rax, r9, 8), rdx)          //
    lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

    mov(var(k_iter), rsi)              // i = k_iter;
    test(rsi, rsi)                     // check i via logical AND.
    je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
                                       // contains the k_left loop.

    label(.SLOOPKITER)                 // MAIN LOOP

    // Each of the four unrolled iterations performs one k step:
    //   xmm0 = four consecutive floats of the current row of B
    //   xmm2 = broadcast of the current element of A (at rax)
    //   xmm4 += xmm2 * xmm0
    // Only the prefetch handling differs between iterations.

    // ---------------------------------- iteration 0

#if 1
    prefetch(0, mem(rdx, 5*8))
#endif

    vmovups(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastss(mem(rax ), xmm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231ps(xmm0, xmm2, xmm4)

    // ---------------------------------- iteration 1

    // The prefetch for this iteration is compiled out.
#if 0
    prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

    vmovups(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastss(mem(rax ), xmm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231ps(xmm0, xmm2, xmm4)

    // ---------------------------------- iteration 2

#if 1
    prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

    vmovups(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastss(mem(rax ), xmm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231ps(xmm0, xmm2, xmm4)

    // ---------------------------------- iteration 3

    // Advance the A prefetch pointer once per pass of the unrolled loop.
#if 1
    lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

    vmovups(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastss(mem(rax ), xmm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231ps(xmm0, xmm2, xmm4)

    dec(rsi)                           // i -= 1;
    jne(.SLOOPKITER)                   // iterate again if i != 0.

    label(.SCONSIDKLEFT)

    mov(var(k_left), rsi)              // i = k_left;
    test(rsi, rsi)                     // check i via logical AND.
    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                       // else, we prepare to enter k_left loop.

    label(.SLOOPKLEFT)                 // EDGE LOOP

    // Same k step as the main loop, executed k_left times; the prefetch
    // below is compiled out.
#if 0
    prefetch(0, mem(rdx, 5*8))
    add(r9, rdx)
#endif

    vmovups(mem(rbx, 0*32), xmm0)
    add(r10, rbx)                      // b += rs_b;

    vbroadcastss(mem(rax ), xmm2)
    add(r9, rax)                       // a += cs_a;
    vfmadd231ps(xmm0, xmm2, xmm4)

    dec(rsi)                           // i -= 1;
    jne(.SLOOPKLEFT)                   // iterate again if i != 0.

    label(.SPOSTACCUM)

    // rax and rbx are reused from here on: the A and B pointers are no
    // longer needed once accumulation is finished.
    mov(var(alpha), rax)               // load address of alpha
    mov(var(beta), rbx)                // load address of beta
    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate

    vmulps(xmm0, xmm4, xmm4)           // scale by alpha

    mov(var(cs_c), rsi)                // load cs_c
    lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)

    //lea(mem(rcx, rsi, 4), rdx)       // load address of c + 4*cs_c;
    //lea(mem(rcx, rdi, 2), rdx)       // load address of c + 4*rs_c;

    // rax = 3*cs_c addresses column 3 in the column-stored paths.
    // NOTE(review): rbx and rbp are computed here but are not referenced
    // again in this kernel.
    lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
    lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
    lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;

                                       // now avoid loading C if beta == 0

    vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
    vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

    cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
    jz(.SCOLSTORED)                    // jump to column storage case

    label(.SROWSTORED)

    // Row-stored, beta != 0: add beta * (4 floats of C) to the
    // accumulator and store the result back.
    vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4)
    vmovups(xmm4, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.SDONE)                        // jump to end.

    label(.SCOLSTORED)

    // begin I/O on columns 0-3

    // Column-stored, beta != 0: the four results sit in one register but
    // belong to four different columns of C. The result is copied to
    // xmm0 and each vpermilps moves one element into lane 0 (0xe4 keeps
    // element 0, 0x39 selects element 1, 0x4e element 2, 0x93 element 3).
    // The matching scalar of C is then loaded with vmovss, scaled by
    // beta and added, and lane 0 is stored back.
    vmovups(xmm4, xmm0)

    vpermilps(imm(0xe4), xmm0, xmm2)
    vpermilps(imm(0x39), xmm0, xmm4)
    vmovss(mem(rcx ), xmm1)
    vmovss(mem(rcx, rsi, 1), xmm6)
    vfmadd231ps(xmm1, xmm3, xmm2)
    vfmadd231ps(xmm6, xmm3, xmm4)
    vmovss(xmm2, mem(rcx ))            // store ( gamma00 )
    vmovss(xmm4, mem(rcx, rsi, 1))     // store ( gamma01 )

    vpermilps(imm(0x4e), xmm0, xmm2)
    vpermilps(imm(0x93), xmm0, xmm4)
    vmovss(mem(rcx, rsi, 2), xmm1)
    vmovss(mem(rcx, rax, 1), xmm6)
    vfmadd231ps(xmm1, xmm3, xmm2)
    vfmadd231ps(xmm6, xmm3, xmm4)
    vmovss(xmm2, mem(rcx, rsi, 2))     // store ( gamma02 )
    vmovss(xmm4, mem(rcx, rax, 1))     // store ( gamma03 )

    //lea(mem(rdx, rsi, 8), rdx)       // rdx += 8*cs_c

    jmp(.SDONE)                        // jump to end.

    label(.SBETAZERO)

    // beta == 0: C is never read; the alpha-scaled accumulator is
    // written out directly.
    cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
    jz(.SCOLSTORBZ)                    // jump to column storage case

    label(.SROWSTORBZ)

    vmovups(xmm4, mem(rcx, 0*32))
    //add(rdi, rcx)

    jmp(.SDONE)                        // jump to end.

    label(.SCOLSTORBZ)

    // begin I/O on columns 0-3

    // Same lane rotation as the beta != 0 column path, without the loads
    // from C.
    vmovups(xmm4, xmm0)

    vpermilps(imm(0xe4), xmm0, xmm2)
    vpermilps(imm(0x39), xmm0, xmm4)
    vmovss(xmm2, mem(rcx ))            // store ( gamma00 )
    vmovss(xmm4, mem(rcx, rsi, 1))     // store ( gamma01 )

    vpermilps(imm(0x4e), xmm0, xmm2)
    vpermilps(imm(0x93), xmm0, xmm4)
    vmovss(xmm2, mem(rcx, rsi, 2))     // store ( gamma02 )
    vmovss(xmm4, mem(rcx, rax, 1))     // store ( gamma03 )

    //lea(mem(rcx, rsi, 8), rcx)       // rcx += 8*cs_c

    label(.SDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a]      "m" (a),
      [rs_a]   "m" (rs_a),
      [cs_a]   "m" (cs_a),
      [b]      "m" (b),
      [rs_b]   "m" (rs_b),
      [cs_b]   "m" (cs_b),
      [alpha]  "m" (alpha),
      [beta]   "m" (beta),
      [c]      "m" (c),
      [rs_c]   "m" (rs_c),
      [cs_c]   "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}
cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c000066400000000000000000002747221427272030600312700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... 
-------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */ // Prototype reference microkernels. GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref ) void bli_sgemmsup_rv_haswell_asm_6x6 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 5*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 5*4)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 5*4)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 5*4)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 5*4)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 5*4)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 5*4)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 5*4)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 5*4)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 5*4)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 5*4)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*4)) // prefetch c + 5*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) // ---------------------------------- 
iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm14, ymm14) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm5) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm7) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm8, xmm9) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm9) vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm10, xmm11) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm11) vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm12, xmm13) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm12) vmovups(xmm12, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, 
xmm3, xmm13) vmovsd(xmm13, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm14, xmm15) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm14) vmovups(xmm14, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm15) vmovsd(xmm15, mem(rcx, 4*4)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORED) // begin I/O on columns 0-5 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx ), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(mem(rdx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rdx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1) 
vmovhpd(mem(rdx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm8, xmm9) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm10, xmm11) vmovups(xmm10, mem(rcx, 0*4)) vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm12, xmm13) vmovups(xmm12, mem(rcx, 0*4)) vmovsd(xmm13, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm14, xmm15) vmovups(xmm14, mem(rcx, 0*4)) vmovsd(xmm15, mem(rcx, 4*4)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-5 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx )) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rdx, rbx, 1)) // store ( gamma45..gamma55 ) vunpckhps(ymm14, ymm12, ymm0) vmovlpd(xmm0, mem(rdx, rsi, 2)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rax, 1)) // store ( gamma43..gamma53 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", 
"memory"
	)
}

// bli_sgemmsup_rv_haswell_asm_5x6
//
// Single-precision kernel that accumulates a 5x6 block and writes it to c.
//
// For each of the k0 steps the code loads six contiguous floats from b
// (then advances b by rs_b), broadcasts five elements of a that lie rs_a
// apart (then advances a by cs_a), and uses FMA to update one accumulator
// per row: ymm4, ymm6, ymm8, ymm10, ymm12. After the loops the
// accumulators are multiplied by *alpha and written to c. If *beta
// compares equal to zero c is only written; otherwise (*beta)*c is added
// before the store.
//
// k0 is split into k0/4 passes of a 4x-unrolled main loop and k0%4 passes
// of an edge loop. The "column storage" paths are taken when rs_c == 1;
// all other cases use the row paths, which store six contiguous floats
// per row of c.
//
// conja, conjb, m0, n0, data and cntx are not referenced in the body.
// cs_b is passed as an asm operand but is never loaded: the six floats of
// b are always read from consecutive addresses.
void bli_sgemmsup_rv_haswell_asm_5x6
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	// k_iter counts passes of the 4x-unrolled loop, k_left the remainder.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)                // load cs_b
	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds of allocated
	// memory (the likely result: a segmentation fault).

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLPFETCH)                    // jump to column storage case
	label(.SROWPFETCH)                 // row-stored prefetching on c

	// Prefetch the address of the sixth (last) float of each of the 5 rows.
	lea(mem(rcx, rdi, 2), rdx)         //
	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 5*4))         // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 5*4)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 5*4)) // prefetch c + 2*rs_c
	prefetch(0, mem(rdx, 5*4))         // prefetch c + 3*rs_c
	prefetch(0, mem(rdx, rdi, 1, 5*4)) // prefetch c + 4*rs_c

	jmp(.SPOSTPFETCH)                  // jump to end of prefetching c
	label(.SCOLPFETCH)                 // column-stored prefetching c

	// Prefetch the address of the fifth (last) float of each of the 6 columns.
	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 4), rsi)            // cs_c *= sizeof(float)
	lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
	prefetch(0, mem(rcx, 4*4))         // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 4*4)) // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 4*4)) // prefetch c + 2*cs_c
	prefetch(0, mem(rcx, rbp, 1, 4*4)) // prefetch c + 3*cs_c
	prefetch(0, mem(rcx, rsi, 4, 4*4)) // prefetch c + 4*cs_c
	lea(mem(rcx, rsi, 4), rdx)         // rdx = c + 4*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 4*4)) // prefetch c + 5*cs_c

	label(.SPOSTPFETCH)                // done prefetching c

	// rdx is reused from here on as a prefetch pointer that runs ahead of a.
#if 1
	lea(mem(rax, r9, 8), rdx)          // rdx = a + 8*cs_a;
	lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.

	label(.SLOOPKITER)                 // MAIN LOOP

	// ---------------------------------- iteration 0

	// NOTE(review): the prefetch displacement is 5*8 bytes although the
	// elements are 4-byte floats; confirm that this offset is intended.
#if 1
	prefetch(0, mem(rdx, 5*8))
#endif

	vmovups(mem(rbx, 0*4), xmm0)       // load b[0..3]
	vmovsd(mem(rbx, 4*4), xmm1)        // load b[4..5]
	vinsertf128(imm(0x1), xmm1, ymm0, ymm0) // upper lane of ymm0 = xmm1
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2) // a at row 0
	vbroadcastss(mem(rax, r8,  1), ymm3) // a at row 1
	vfmadd231ps(ymm0, ymm2, ymm4)      // row 0 accumulator
	vfmadd231ps(ymm0, ymm3, ymm6)      // row 1 accumulator

	vbroadcastss(mem(rax, r8,  2), ymm2) // a at row 2
	vbroadcastss(mem(rax, r13, 1), ymm3) // a at row 3
	vfmadd231ps(ymm0, ymm2, ymm8)      // row 2 accumulator
	vfmadd231ps(ymm0, ymm3, ymm10)     // row 3 accumulator

	vbroadcastss(mem(rax, r8,  4), ymm2) // a at row 4
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)     // row 4 accumulator

	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

	vmovups(mem(rbx, 0*4), xmm0)
	vmovsd(mem(rbx, 4*4), xmm1)
	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vbroadcastss(mem(rax, r8,  4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)

	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

	vmovups(mem(rbx, 0*4), xmm0)
	vmovsd(mem(rbx, 4*4), xmm1)
	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vbroadcastss(mem(rax, r8,  4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)

	// ---------------------------------- iteration 3

#if 1
	lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

	vmovups(mem(rbx, 0*4), xmm0)
	vmovsd(mem(rbx, 4*4), xmm1)
	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vbroadcastss(mem(rax, r8,  4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKITER)                   // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT)                 // EDGE LOOP

	// Prefetching of a is compiled out in the edge loop.
#if 0
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)
#endif

	vmovups(mem(rbx, 0*4), xmm0)
	vmovsd(mem(rbx, 4*4), xmm1)
	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vbroadcastss(mem(rax, r8,  4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKLEFT)                   // iterate again if i != 0.

	label(.SPOSTACCUM)

	// rax and rbx are free now that the loops are done; reuse them.
	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate

	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
	vmulps(ymm0, ymm6, ymm6)
	vmulps(ymm0, ymm8, ymm8)
	vmulps(ymm0, ymm10, ymm10)
	vmulps(ymm0, ymm12, ymm12)

	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)

	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
	                                   // (row 4; used by the column paths)

	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;

	                                   // now avoid loading C if beta == 0

	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORED)                    // jump to column storage case

	label(.SROWSTORED)

	// Per row: the low lane holds columns 0-3, the extracted upper lane
	// holds columns 4-5. Each part gets beta*c added and is stored back.

	vextractf128(imm(0x1), ymm4, xmm5)
	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4)
	vmovups(xmm4, mem(rcx, 0*4))
	vmovsd(mem(rcx, 4*4), xmm1)
	vfmadd231ps(xmm1, xmm3, xmm5)
	vmovsd(xmm5, mem(rcx, 4*4))
	add(rdi, rcx)                      // c += rs_c;

	vextractf128(imm(0x1), ymm6, xmm7)
	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6)
	vmovups(xmm6, mem(rcx, 0*4))
	vmovsd(mem(rcx, 4*4), xmm1)
	vfmadd231ps(xmm1, xmm3, xmm7)
	vmovsd(xmm7, mem(rcx, 4*4))
	add(rdi, rcx)                      // c += rs_c;

	vextractf128(imm(0x1), ymm8, xmm9)
	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8)
	vmovups(xmm8, mem(rcx, 0*4))
	vmovsd(mem(rcx, 4*4), xmm1)
	vfmadd231ps(xmm1, xmm3, xmm9)
	vmovsd(xmm9, mem(rcx, 4*4))
	add(rdi, rcx)                      // c += rs_c;

	vextractf128(imm(0x1), ymm10, xmm11)
	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10)
	vmovups(xmm10, mem(rcx, 0*4))
	vmovsd(mem(rcx, 4*4), xmm1)
	vfmadd231ps(xmm1, xmm3, xmm11)
	vmovsd(xmm11, mem(rcx, 4*4))
	add(rdi, rcx)                      // c += rs_c;

	vextractf128(imm(0x1), ymm12, xmm13)
	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm12)
	vmovups(xmm12, mem(rcx, 0*4))
	vmovsd(mem(rcx, 4*4), xmm1)
	vfmadd231ps(xmm1, xmm3, xmm13)
	vmovsd(xmm13, mem(rcx, 4*4))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORED)

	                                   // begin I/O on columns 0-5
	// Rows 0-3 are rearranged so that each xmm register holds four
	// vertically adjacent elements of one column (see store comments).
	vunpcklps(ymm6, ymm4, ymm0)
	vunpcklps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx        ), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm0, mem(rcx        ))    // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2)
	vmovups(xmm1, mem(rcx, rsi, 1))    // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma35 )

	vunpckhps(ymm6, ymm4, ymm0)
	vunpckhps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
	vmovups(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma32 )

	vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1)
	vmovups(xmm1, mem(rcx, rax, 1))    // store ( gamma03..gamma33 )

	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

	// Row 4 (ymm12) is written one scalar at a time through rdx. Each
	// vpermilps moves the wanted element into slot 0 for vmovss; xmm4, xmm6
	// and xmm8 are free to clobber because rows 0-3 are already stored.
	vmovups(ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm8) // xmm8 = columns 4-5 of row 4
	vpermilps(imm(0xe4), xmm0, xmm2)   // slot 0 = element 0
	vpermilps(imm(0x39), xmm0, xmm4)   // slot 0 = element 1
	vmovss(mem(rdx        ), xmm1)
	vmovss(mem(rdx, rsi, 1), xmm6)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vfmadd231ps(xmm6, xmm3, xmm4)
	vmovss(xmm2, mem(rdx        ))     // store ( gamma40 )
	vmovss(xmm4, mem(rdx, rsi, 1))     // store ( gamma41 )

	vpermilps(imm(0x4e), xmm0, xmm2)   // slot 0 = element 2
	vpermilps(imm(0x93), xmm0, xmm4)   // slot 0 = element 3
	vmovss(mem(rdx, rsi, 2), xmm1)
	vmovss(mem(rdx, rax, 1), xmm6)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vfmadd231ps(xmm6, xmm3, xmm4)
	vmovss(xmm2, mem(rdx, rsi, 2))     // store ( gamma42 )
	vmovss(xmm4, mem(rdx, rax, 1))     // store ( gamma43 )

	vpermilps(imm(0xe4), xmm8, xmm2)   // slot 0 = element 0 of upper lane
	vpermilps(imm(0x39), xmm8, xmm4)   // slot 0 = element 1 of upper lane
	vmovss(mem(rdx, rsi, 4), xmm1)
	vmovss(mem(rdx, rbx, 1), xmm6)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vfmadd231ps(xmm6, xmm3, xmm4)
	vmovss(xmm2, mem(rdx, rsi, 4))     // store ( gamma44 )
	vmovss(xmm4, mem(rdx, rbx, 1))     // store ( gamma45 )

	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c

	jmp(.SDONE)                        // jump to end.

	label(.SBETAZERO)

	// beta == 0: same store layout as above, but c is never read.

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORBZ)                    // jump to column storage case

	label(.SROWSTORBZ)

	vextractf128(imm(0x1), ymm4, xmm5)
	vmovups(xmm4, mem(rcx, 0*4))
	vmovsd(xmm5, mem(rcx, 4*4))
	add(rdi, rcx)                      // c += rs_c;

	vextractf128(imm(0x1), ymm6, xmm7)
	vmovups(xmm6, mem(rcx, 0*4))
	vmovsd(xmm7, mem(rcx, 4*4))
	add(rdi, rcx)                      // c += rs_c;

	vextractf128(imm(0x1), ymm8, xmm9)
	vmovups(xmm8, mem(rcx, 0*4))
	vmovsd(xmm9, mem(rcx, 4*4))
	add(rdi, rcx)                      // c += rs_c;

	vextractf128(imm(0x1), ymm10, xmm11)
	vmovups(xmm10, mem(rcx, 0*4))
	vmovsd(xmm11, mem(rcx, 4*4))
	add(rdi, rcx)                      // c += rs_c;

	vextractf128(imm(0x1), ymm12, xmm13)
	vmovups(xmm12, mem(rcx, 0*4))
	vmovsd(xmm13, mem(rcx, 4*4))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORBZ)

	                                   // begin I/O on columns 0-5
	vunpcklps(ymm6, ymm4, ymm0)
	vunpcklps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx        ))    // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx, rsi, 1))    // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma35 )

	vunpckhps(ymm6, ymm4, ymm0)
	vunpckhps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vmovups(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma32 )
	vmovups(xmm1, mem(rcx, rax, 1))    // store ( gamma03..gamma33 )

	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

	// Row 4: scalar stores through rdx, as in the beta != 0 column path.
	vmovups(ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm8)
	vpermilps(imm(0xe4), xmm0, xmm2)
	vpermilps(imm(0x39), xmm0, xmm4)
	vmovss(xmm2, mem(rdx        ))     // store ( gamma40 )
	vmovss(xmm4, mem(rdx, rsi, 1))     // store ( gamma41 )

	vpermilps(imm(0x4e), xmm0, xmm2)
	vpermilps(imm(0x93), xmm0, xmm4)
	vmovss(xmm2, mem(rdx, rsi, 2))     // store ( gamma42 )
	vmovss(xmm4, mem(rdx, rax, 1))     // store ( gamma43 )

	vpermilps(imm(0xe4), xmm8, xmm2)
	vpermilps(imm(0x39), xmm8, xmm4)
	vmovss(xmm2, mem(rdx, rsi, 4))     // store ( gamma44 )
	vmovss(xmm4, mem(rdx, rbx, 1))     // store ( gamma45 )

	//lea(mem(rdx, rsi, 8), rdx)         // rdx += 8*cs_c

	label(.SDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a]      "m" (a),
	  [rs_a]   "m" (rs_a),
	  [cs_a]   "m" (cs_a),
	  [b]      "m" (b),
	  [rs_b]   "m" (rs_b),
	  [cs_b]   "m" (cs_b),
	  [alpha]  "m" (alpha),
	  [beta]   "m" (beta),
	  [c]      "m" (c),
	  [rs_c]   "m" (rs_c),
	  [cs_c]   "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

// bli_sgemmsup_rv_haswell_asm_4x6
//
// Single-precision kernel that accumulates a 4x6 block and writes it to c.
//
// For each of the k0 steps the code loads six contiguous floats from b
// (then advances b by rs_b), broadcasts four elements of a that lie rs_a
// apart (then advances a by cs_a), and uses FMA to update one accumulator
// per row: ymm4, ymm6, ymm8, ymm10. After the loops the accumulators are
// multiplied by *alpha and written to c. If *beta compares equal to zero
// c is only written; otherwise (*beta)*c is added before the store.
//
// k0 is split into k0/4 passes of a 4x-unrolled main loop and k0%4 passes
// of an edge loop. The "column storage" paths are taken when rs_c == 1.
//
// conja, conjb, m0, n0, data and cntx are not referenced in the body.
// cs_b is passed as an asm operand but is never loaded.
void bli_sgemmsup_rv_haswell_asm_4x6
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	// k_iter counts passes of the 4x-unrolled loop, k_left the remainder.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), rax)                   // load address of a.
// ---- bli_sgemmsup_rv_haswell_asm_4x6: stride setup, prefetch of c,
// ---- k loops (four accumulators: ymm4, ymm6, ymm8, ymm10) and update of c.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)                // load cs_b
	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds of allocated
	// memory (the likely result: a segmentation fault).

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLPFETCH)                    // jump to column storage case
	label(.SROWPFETCH)                 // row-stored prefetching on c

	// Prefetch the address of the sixth (last) float of each of the 4 rows.
	lea(mem(rcx, rdi, 2), rdx)         //
	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 5*4))         // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 5*4)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 5*4)) // prefetch c + 2*rs_c
	prefetch(0, mem(rdx, 5*4))         // prefetch c + 3*rs_c

	jmp(.SPOSTPFETCH)                  // jump to end of prefetching c
	label(.SCOLPFETCH)                 // column-stored prefetching c

	// Prefetch the address of the fourth (last) float of each of the 6 columns.
	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 4), rsi)            // cs_c *= sizeof(float)
	lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
	prefetch(0, mem(rcx, 3*4))         // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 3*4)) // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 3*4)) // prefetch c + 2*cs_c
	prefetch(0, mem(rcx, rbp, 1, 3*4)) // prefetch c + 3*cs_c
	prefetch(0, mem(rcx, rsi, 4, 3*4)) // prefetch c + 4*cs_c
	lea(mem(rcx, rsi, 4), rdx)         // rdx = c + 4*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 3*4)) // prefetch c + 5*cs_c

	label(.SPOSTPFETCH)                // done prefetching c

	// rdx is reused from here on as a prefetch pointer that runs ahead of a.
#if 1
	lea(mem(rax, r9, 8), rdx)          // rdx = a + 8*cs_a;
	lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.

	label(.SLOOPKITER)                 // MAIN LOOP

	// ---------------------------------- iteration 0

	// NOTE(review): the prefetch displacement is 5*8 bytes although the
	// elements are 4-byte floats; confirm that this offset is intended.
#if 1
	prefetch(0, mem(rdx, 5*8))
#endif

	vmovups(mem(rbx, 0*4), xmm0)       // load b[0..3]
	vmovsd(mem(rbx, 4*4), xmm1)        // load b[4..5]
	vinsertf128(imm(0x1), xmm1, ymm0, ymm0) // upper lane of ymm0 = xmm1
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2) // a at row 0
	vbroadcastss(mem(rax, r8,  1), ymm3) // a at row 1
	vfmadd231ps(ymm0, ymm2, ymm4)      // row 0 accumulator
	vfmadd231ps(ymm0, ymm3, ymm6)      // row 1 accumulator

	vbroadcastss(mem(rax, r8,  2), ymm2) // a at row 2
	vbroadcastss(mem(rax, r13, 1), ymm3) // a at row 3
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm8)      // row 2 accumulator
	vfmadd231ps(ymm0, ymm3, ymm10)     // row 3 accumulator

	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

	vmovups(mem(rbx, 0*4), xmm0)
	vmovsd(mem(rbx, 4*4), xmm1)
	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

	vmovups(mem(rbx, 0*4), xmm0)
	vmovsd(mem(rbx, 4*4), xmm1)
	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	// ---------------------------------- iteration 3

#if 1
	lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

	vmovups(mem(rbx, 0*4), xmm0)
	vmovsd(mem(rbx, 4*4), xmm1)
	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKITER)                   // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT)                 // EDGE LOOP

	// Prefetching of a is compiled out in the edge loop.
#if 0
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)
#endif

	vmovups(mem(rbx, 0*4), xmm0)
	vmovsd(mem(rbx, 4*4), xmm1)
	vinsertf128(imm(0x1), xmm1, ymm0, ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKLEFT)                   // iterate again if i != 0.

	label(.SPOSTACCUM)

	// rax and rbx are free now that the loops are done; reuse them.
	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate

	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
	vmulps(ymm0, ymm6, ymm6)
	vmulps(ymm0, ymm8, ymm8)
	vmulps(ymm0, ymm10, ymm10)

	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)

	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
	                                   // (rdx is not referenced by any of
	                                   // the store paths of this kernel)

	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
	//lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;

	                                   // now avoid loading C if beta == 0

	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORED)                    // jump to column storage case

	label(.SROWSTORED)

	// Per row: the low lane holds columns 0-3, the extracted upper lane
	// holds columns 4-5. Each part gets beta*c added and is stored back.

	vextractf128(imm(0x1), ymm4, xmm5)
	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4)
	vmovups(xmm4, mem(rcx, 0*4))
	vmovsd(mem(rcx, 4*4), xmm1)
	vfmadd231ps(xmm1, xmm3, xmm5)
	vmovsd(xmm5, mem(rcx, 4*4))
	add(rdi, rcx)                      // c += rs_c;

	vextractf128(imm(0x1), ymm6, xmm7)
	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6)
	vmovups(xmm6, mem(rcx, 0*4))
	vmovsd(mem(rcx, 4*4), xmm1)
	vfmadd231ps(xmm1, xmm3, xmm7)
	vmovsd(xmm7, mem(rcx, 4*4))
	add(rdi, rcx)                      // c += rs_c;

	vextractf128(imm(0x1), ymm8, xmm9)
	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8)
	vmovups(xmm8, mem(rcx, 0*4))
	vmovsd(mem(rcx, 4*4), xmm1)
	vfmadd231ps(xmm1, xmm3, xmm9)
	vmovsd(xmm9, mem(rcx, 4*4))
	add(rdi, rcx)                      // c += rs_c;

	vextractf128(imm(0x1), ymm10, xmm11)
	vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10)
	vmovups(xmm10, mem(rcx, 0*4))
	vmovsd(mem(rcx, 4*4), xmm1)
	vfmadd231ps(xmm1, xmm3, xmm11)
	vmovsd(xmm11, mem(rcx, 4*4))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORED)

	                                   // begin I/O on columns 0-5
	// The four rows are rearranged so that each xmm register holds the
	// four elements of one column (see store comments).
	vunpcklps(ymm6, ymm4, ymm0)
	vunpcklps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx        ), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm0, mem(rcx        ))    // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2)
	vmovups(xmm1, mem(rcx, rsi, 1))    // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma35 )

	vunpckhps(ymm6, ymm4, ymm0)
	vunpckhps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
	vmovups(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma32 )

	vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1)
	vmovups(xmm1, mem(rcx, rax, 1))    // store ( gamma03..gamma33 )

	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

	jmp(.SDONE)                        // jump to end.

	label(.SBETAZERO)

	// beta == 0: same store layout as above, but c is never read.

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORBZ)                    // jump to column storage case

	label(.SROWSTORBZ)

	vextractf128(imm(0x1), ymm4, xmm5)
	vmovups(xmm4, mem(rcx, 0*4))
	vmovsd(xmm5, mem(rcx, 4*4))
	add(rdi, rcx)                      // c += rs_c;

	vextractf128(imm(0x1), ymm6, xmm7)
	vmovups(xmm6, mem(rcx, 0*4))
	vmovsd(xmm7, mem(rcx, 4*4))
	add(rdi, rcx)                      // c += rs_c;

	vextractf128(imm(0x1), ymm8, xmm9)
	vmovups(xmm8, mem(rcx, 0*4))
	vmovsd(xmm9, mem(rcx, 4*4))
	add(rdi, rcx)                      // c += rs_c;

	vextractf128(imm(0x1), ymm10, xmm11)
	vmovups(xmm10, mem(rcx, 0*4))
	vmovsd(xmm11, mem(rcx, 4*4))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORBZ)

	                                   // begin I/O on columns 0-5
	vunpcklps(ymm6, ymm4, ymm0)
	vunpcklps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx        ))    // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx, rsi, 1))    // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma35 )

	vunpckhps(ymm6, ymm4, ymm0)
	vunpckhps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vmovups(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma32 )
	vmovups(xmm1, mem(rcx, rax, 1))    // store ( gamma03..gamma33 )

	//lea(mem(rcx, rsi, 8), rcx)         // rcx += 8*cs_c

	label(.SDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a]      "m" (a),
	  [rs_a]   "m" (rs_a),
	  [cs_a]   "m" (cs_a),
	  [b]      "m" (b),
	  [rs_b]   "m" (rs_b),
	  [cs_b]   "m" (cs_b),
	  [alpha]  "m" (alpha),
	  [beta]   "m" (beta),
	  [c]      "m" (c),
	  [rs_c]   "m" (rs_c),
	  [cs_c]   "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

void bli_sgemmsup_rv_haswell_asm_3x6
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)

	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)                // load cs_b
	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
	//lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds of allocated
	// memory (the likely result: a segmentation fault).

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 5*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 5*4)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 5*4)) // prefetch c + 2*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 2*4)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 2*4)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 2*4)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 2*4)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 2*4)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 2*4)) // prefetch c + 5*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. 
je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case (taken when rs_c == 1) label(.SROWSTORED) vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm5) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm7) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm8, xmm9) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm9) vmovsd(xmm9, mem(rcx, 4*4)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORED) // begin I/O on columns 0-5; rows 0 and 1 (ymm4, ymm6) are interleaved and stored as two-float pairs vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(mem(rcx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rcx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // third row: ymm8 is updated and stored one float at a time through rdx (c + 2*rs_c) vmovups(ymm8, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma20 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma21 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rdx, rsi, 2), xmm1)
vmovss(mem(rdx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma22 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma23 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rdx, rsi, 4), xmm1) vmovss(mem(rdx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma24 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma25 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) // beta == 0: C is overwritten without being read cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm8, xmm9) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(xmm9, mem(rcx, 4*4)) //add(rdi, rcx) jmp(.SDONE) // jump to end.
label(.SCOLSTORBZ) // begin I/O on columns 0-5 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c // third row: ymm8 is stored one float at a time through rdx (c + 2*rs_c) vmovups(ymm8, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma20 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma21 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma22 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma23 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma24 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma25 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_sgemmsup_rv_haswell_asm_2x6: float gemmsup kernel for a 2-row by 6-column tile of C. The asm sums k0 products of two elements of A (broadcast, addressed with rs_a and stepped by cs_a) with a 6-float row of B (stepped by rs_b) into ymm4 and ymm6, running k0/4 passes of a 4x unrolled loop and then k0%4 single steps. The sums are scaled by *alpha and combined with (*beta)*C; C is not read when *beta == 0, and a column-oriented store is used when rs_c == 1, a row-oriented store otherwise. conja, conjb, m0, n0, data and cntx are not referenced in the body, and cs_b is passed to the asm but never loaded. */ void bli_sgemmsup_rv_haswell_asm_2x6 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0,
float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 5*4)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 5*4)) // prefetch c + 1*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 1*4)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 1*4)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 1*4)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 1*4)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 1*4)) // prefetch c + 5*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm5) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm7) vmovsd(xmm7, mem(rcx, 4*4)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-5 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(mem(rcx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rcx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-5 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_sgemmsup_rv_haswell_asm_1x6: float gemmsup kernel for a 1-row by 6-column tile of C. The asm sums k0 products of one element of A (broadcast, stepped by cs_a) with a 6-float row of B (stepped by rs_b) into ymm4, running k0/4 passes of a 4x unrolled loop and then k0%4 single steps. The sum is scaled by *alpha and combined with (*beta)*C; C is not read when *beta == 0, and when rs_c == 1 the six results are written one float at a time at multiples of cs_c, otherwise as one row. conja, conjb, m0, n0, data and cntx are not referenced in the body, and cs_b is passed to the asm but never loaded. */ void bli_sgemmsup_rv_haswell_asm_1x6 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions.
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 5*4)) // prefetch c + 0*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 0*4)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 0*4)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 0*4)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 0*4)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 0*4)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 0*4)) // prefetch c + 5*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 5*8)) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(mem(rcx, 4*4), xmm1) vfmadd231ps(xmm1, xmm3, xmm5) vmovsd(xmm5, mem(rcx, 4*4)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-5; the single row in ymm4 is updated and stored one float at a time through rcx vmovups(ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rcx ), xmm1) vmovss(mem(rcx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma00 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma01 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rcx, rsi, 2), xmm1) vmovss(mem(rcx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma02 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma03 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rcx, rsi, 4), xmm1) vmovss(mem(rcx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma04 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma05 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) // beta == 0: C is overwritten without being read cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) //add(rdi, rcx) jmp(.SDONE) // jump to end.
label(.SCOLSTORBZ) // begin I/O on columns 0-5 vmovups(ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma00 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma01 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma02 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma03 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma04 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma05 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* The function between the following #if 0 and its #endif is compiled out. It reuses the 1x6 name but loads a full 8-float ymm row of B per step and, in its column-stored paths, writes eight elements of C. */ #if 0 void bli_sgemmsup_rv_haswell_asm_1x6 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions.
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds of allocated memory // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(8), rdi) // set ZF if (4*rs_c) == 8. NOTE(review): rdi was scaled by 4 just above, so this is true for rs_c == 2, not rs_c == 1; the later storage tests in this disabled function compare against imm(4) - confirm before re-enabling.
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 0*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 0*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 0*8)) // prefetch c + 7*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vmovups(ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rcx ), xmm1) vmovss(mem(rcx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rcx, rsi, 2), xmm1) vmovss(mem(rcx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rcx, rsi, 4), xmm1) vmovss(mem(rcx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rcx, rax, 2), xmm1) vmovss(mem(rcx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rcx, rbp, 1)) // store ( gamma47 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-7 vmovups(ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rcx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rcx, rbp, 1)) // store ( gamma47 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } #endif cython-blis-0.9.1/blis/_src/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c000066400000000000000000002452031427272030600312620ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. 
Therefore, this (r)ow-preferential kernel is well-suited for contiguous
   (v)ector loads on B and single-element broadcasts from A.

   NOTE: These kernels explicitly support column-oriented IO, implemented
   via an in-register transpose. And thus they also support the crr and
   ccr cases, though only crr is ever utilized (because ccr is handled by
   transposing the operation and executing rcr, which does not incur the
   cost of the in-register transpose).

   crr:
     | | | | | | | |       ------        --------
     | | | | | | | |       ------        --------
     | | | | | | | |  +=   ------ ...    --------
     | | | | | | | |       ------        --------
     | | | | | | | |       ------            :
     | | | | | | | |       ------            :
*/

// Prototype reference microkernels.
GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref )


/*
   bli_sgemmsup_rv_haswell_asm_6x8

   Single-precision microkernel that updates a 6x8 tile of C as

     C := beta * C + alpha * A * B

   using one inline-assembly block. Six ymm accumulators (ymm4, ymm6,
   ymm8, ymm10, ymm12, ymm14) hold one 8-float row of the tile each.

   How the arguments are used in this function:
   - k0:  split into k0/4 passes of a 4x-unrolled main loop plus k0%4
          passes of an edge loop.
   - alpha, beta: pointers to the scalars; each is broadcast once after
          the accumulation loops.
   - a, rs_a0, cs_a0: per k step, six elements spaced rs_a0 apart are
          broadcast; a then advances by cs_a0.
   - b, rs_b0: per k step, eight contiguous floats are loaded; b then
          advances by rs_b0. cs_b0 is copied and passed to the asm block
          but never read by it.
   - c, rs_c0, cs_c0: when rs_c0 == 1 the tile is written column by
          column via an in-register transpose (columns cs_c0 apart);
          otherwise it is written row by row (rows rs_c0 apart) as eight
          contiguous floats per row.
   - conja, conjb, m0, n0, data, cntx: not referenced in this function.

   When beta compares equal to zero, C is only written, never read.
*/
void bli_sgemmsup_rv_haswell_asm_6x8
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;
	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)              // load cs_b
	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
	//lea(mem(, r11, 4), r11)          // cs_b *= sizeof(float)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds of allocated
	// memory (the likely result: a segmentation
	// fault).

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)

	// rdi now holds rs_c in bytes, so rdi == 4 means rs_c == 1,
	// i.e. C is column-stored.
	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLPFETCH)                    // jump to column storage case
	label(.SROWPFETCH)                 // row-stored prefetching on c

	// NOTE(review): the 7*8 byte displacement lies past the 32 bytes
	// covered by a row of 8 floats; it looks sized for 8-byte
	// elements -- confirm whether 7*4 was intended.
	lea(mem(rcx, rdi, 2), rdx)         //
	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c

	jmp(.SPOSTPFETCH)                  // jump to end of prefetching c
	label(.SCOLPFETCH)                 // column-stored prefetching c

	// NOTE(review): the 5*8 byte displacement lies past the 24 bytes
	// covered by a column of 6 floats -- confirm whether 5*4 was
	// intended.
	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 4), rsi)            // cs_c *= sizeof(float)
	lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
	prefetch(0, mem(rcx, 5*8))         // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 5*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rcx, rbp, 1, 5*8)) // prefetch c + 3*cs_c
	prefetch(0, mem(rcx, rsi, 4, 5*8)) // prefetch c + 4*cs_c
	lea(mem(rcx, rsi, 4), rdx)         // rdx = c + 4*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 5*cs_c
	prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 6*cs_c
	prefetch(0, mem(rdx, rbp, 1, 5*8)) // prefetch c + 7*cs_c

	label(.SPOSTPFETCH)                // done prefetching c

	// rdx is reused from here on as the prefetch pointer into A.
#if 1
	lea(mem(rax, r9, 8), rdx)          //
	lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.

	label(.SLOOPKITER)                 // MAIN LOOP

	// Each iteration below performs one rank-1 update: load one row
	// of B (8 floats) into ymm0, broadcast the six elements of the
	// current column of A two at a time into ymm2/ymm3, and
	// accumulate into ymm4..ymm14.

	// ---------------------------------- iteration 0

#if 1
	prefetch(0, mem(rdx, 5*8))
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vbroadcastss(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vbroadcastss(mem(rax, r8, 4), ymm2)
	vbroadcastss(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm0, ymm3, ymm14)

	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, r9, 1, 5*8))
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vbroadcastss(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vbroadcastss(mem(rax, r8, 4), ymm2)
	vbroadcastss(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm0, ymm3, ymm14)

	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(rdx, r9, 2, 5*8))
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vbroadcastss(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vbroadcastss(mem(rax, r8, 4), ymm2)
	vbroadcastss(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm0, ymm3, ymm14)

	// ---------------------------------- iteration 3

#if 1
	lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vbroadcastss(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vbroadcastss(mem(rax, r8, 4), ymm2)
	vbroadcastss(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm0, ymm3, ymm14)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKITER)                   // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT)                 // EDGE LOOP

#if 0
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vbroadcastss(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vbroadcastss(mem(rax, r8, 4), ymm2)
	vbroadcastss(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm0, ymm3, ymm14)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKLEFT)                   // iterate again if i != 0.

	label(.SPOSTACCUM)

	// rax and rbx are free again here: a and b are no longer needed.
	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate

	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
	vmulps(ymm0, ymm6, ymm6)
	vmulps(ymm0, ymm8, ymm8)
	vmulps(ymm0, ymm10, ymm10)
	vmulps(ymm0, ymm12, ymm12)
	vmulps(ymm0, ymm14, ymm14)

	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)

	//lea(mem(rcx, rsi, 4), rdx)       // load address of c + 4*cs_c;
	lea(mem(rcx, rdi, 4), rdx)         // load address of c + 4*rs_c;

	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;

	// now avoid loading C if beta == 0
	// NOTE(review): vucomiss presumably also sets ZF for an unordered
	// compare, in which case a NaN beta would take the beta == 0 path
	// as well -- confirm against the ISA reference.

	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORED)                    // jump to column storage case

	label(.SROWSTORED)

	// For each of the six rows: acc += beta * C_row, then store the
	// 8-float row back and step rcx to the next row.
	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
	vmovups(ymm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
	vmovups(ymm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
	vmovups(ymm8, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
	vmovups(ymm10, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12)
	vmovups(ymm12, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14)
	vmovups(ymm14, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORED)

	// Rows 0-3 are transposed in registers so that each xmm holds
	// four consecutive rows of one column; rows 4-5 are handled
	// afterwards as two-float pairs addressed from rdx.

	                                   // begin I/O on columns 0-7
	vunpcklps(ymm6, ymm4, ymm0)
	vunpcklps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx ), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm0, mem(rcx ))           // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2)
	vmovups(xmm1, mem(rcx, rsi, 1))    // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma35 )

	vunpckhps(ymm6, ymm4, ymm0)
	vunpckhps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2)
	vmovups(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma32 )
	vmovups(xmm2, mem(rcx, rax, 2))    // store ( gamma06..gamma36 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2)
	vmovups(xmm1, mem(rcx, rax, 1))    // store ( gamma03..gamma33 )
	vmovups(xmm2, mem(rcx, rbp, 1))    // store ( gamma07..gamma37 )

	//lea(mem(rcx, rsi, 8), rcx)       // rcx += 8*cs_c

	// Rows 4-5: interleave the two rows, then move each (row4,row5)
	// pair as one 64-bit half via vmovlpd/vmovhpd.
	vunpcklps(ymm14, ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(mem(rdx ), xmm1, xmm1)
	vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rdx ))           // store ( gamma40..gamma50 )
	vmovhpd(xmm0, mem(rdx, rsi, 1))    // store ( gamma41..gamma51 )
	vmovlpd(mem(rdx, rsi, 4), xmm1, xmm1)
	vmovhpd(mem(rdx, rbx, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vmovlpd(xmm2, mem(rdx, rsi, 4))    // store ( gamma44..gamma54 )
	vmovhpd(xmm2, mem(rdx, rbx, 1))    // store ( gamma45..gamma55 )

	vunpckhps(ymm14, ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(mem(rdx, rsi, 2), xmm1, xmm1)
	vmovhpd(mem(rdx, rax, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rdx, rsi, 2))    // store ( gamma42..gamma52 )
	vmovhpd(xmm0, mem(rdx, rax, 1))    // store ( gamma43..gamma53 )
	vmovlpd(mem(rdx, rax, 2), xmm1, xmm1)
	vmovhpd(mem(rdx, rbp, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vmovlpd(xmm2, mem(rdx, rax, 2))    // store ( gamma46..gamma56 )
	vmovhpd(xmm2, mem(rdx, rbp, 1))    // store ( gamma47..gamma57 )

	//lea(mem(rdx, rsi, 8), rdx)       // rdx += 8*cs_c

	jmp(.SDONE)                        // jump to end.

	label(.SBETAZERO)

	// beta == 0: same two storage cases as above, but C is only
	// stored to, never loaded.

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORBZ)                    // jump to column storage case

	label(.SROWSTORBZ)

	vmovups(ymm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(ymm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(ymm8, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(ymm10, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(ymm12, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(ymm14, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORBZ)

	                                   // begin I/O on columns 0-7
	vunpcklps(ymm6, ymm4, ymm0)
	vunpcklps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx ))           // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx, rsi, 1))    // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma35 )

	vunpckhps(ymm6, ymm4, ymm0)
	vunpckhps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma32 )
	vmovups(xmm2, mem(rcx, rax, 2))    // store ( gamma06..gamma36 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx, rax, 1))    // store ( gamma03..gamma33 )
	vmovups(xmm2, mem(rcx, rbp, 1))    // store ( gamma07..gamma37 )

	//lea(mem(rcx, rsi, 8), rcx)       // rcx += 8*cs_c

	vunpcklps(ymm14, ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(xmm0, mem(rdx ))           // store ( gamma40..gamma50 )
	vmovhpd(xmm0, mem(rdx, rsi, 1))    // store ( gamma41..gamma51 )
	vmovlpd(xmm2, mem(rdx, rsi, 4))    // store ( gamma44..gamma54 )
	vmovhpd(xmm2, mem(rdx, rbx, 1))    // store ( gamma45..gamma55 )

	vunpckhps(ymm14, ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(xmm0, mem(rdx, rsi, 2))    // store ( gamma42..gamma52 )
	vmovhpd(xmm0, mem(rdx, rax, 1))    // store ( gamma43..gamma53 )
	vmovlpd(xmm2, mem(rdx, rax, 2))    // store ( gamma46..gamma56 )
	vmovhpd(xmm2, mem(rdx, rbp, 1))    // store ( gamma47..gamma57 )

	//lea(mem(rdx, rsi, 8), rdx)       // rdx += 8*cs_c

	label(.SDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

/*
   bli_sgemmsup_rv_haswell_asm_5x8

   Single-precision microkernel that updates a 5x8 tile of C as
   C := beta * C + alpha * A * B. The argument list is the same as that
   of bli_sgemmsup_rv_haswell_asm_6x8; five ymm accumulators (ymm4,
   ymm6, ymm8, ymm10, ymm12) hold one 8-float row of the tile each.
   conja, conjb, m0, n0, data and cntx are not referenced, and cs_b0 is
   passed to the asm block but never read by it.
*/
void bli_sgemmsup_rv_haswell_asm_5x8
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;
	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	// The asm block below computes a 5x8 tile:
	//   C := beta * C + alpha * A * B
	// with k_iter passes of a 4x-unrolled loop followed by k_left
	// passes of an edge loop. Accumulators are ymm4, ymm6, ymm8,
	// ymm10 and ymm12 (one 8-float row each).

	begin_asm()

	vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)         // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	//mov(var(cs_b), r11)              // load cs_b
	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)
	//lea(mem(, r11, 4), r11)          // cs_b *= sizeof(float)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds of allocated
	// memory (the likely result: a segmentation
	// fault).

	mov(var(c), rcx)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)

	// rdi now holds rs_c in bytes, so rdi == 4 means rs_c == 1,
	// i.e. C is column-stored.
	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLPFETCH)                    // jump to column storage case
	label(.SROWPFETCH)                 // row-stored prefetching on c

	// NOTE(review): the 7*8 byte displacement lies past the 32 bytes
	// covered by a row of 8 floats; it looks sized for 8-byte
	// elements -- confirm whether 7*4 was intended.
	lea(mem(rcx, rdi, 2), rdx)         //
	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 7*8))         // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c

	jmp(.SPOSTPFETCH)                  // jump to end of prefetching c
	label(.SCOLPFETCH)                 // column-stored prefetching c

	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 4), rsi)            // cs_c *= sizeof(float)
	lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
	prefetch(0, mem(rcx, 4*8))         // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rcx, rbp, 1, 4*8)) // prefetch c + 3*cs_c
	prefetch(0, mem(rcx, rsi, 4, 4*8)) // prefetch c + 4*cs_c
	lea(mem(rcx, rsi, 4), rdx)         // rdx = c + 4*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 5*cs_c
	prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 6*cs_c
	prefetch(0, mem(rdx, rbp, 1, 4*8)) // prefetch c + 7*cs_c

	label(.SPOSTPFETCH)                // done prefetching c

	// rdx is reused from here on as the prefetch pointer into A.
#if 1
	lea(mem(rax, r9, 8), rdx)          //
	lea(mem(rdx, r9, 8), rdx)          // rdx = a + 16*cs_a;
#endif

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.

	label(.SLOOPKITER)                 // MAIN LOOP

	// Each iteration below performs one rank-1 update: load one row
	// of B (8 floats) into ymm0, broadcast the five elements of the
	// current column of A into ymm2/ymm3, and accumulate into
	// ymm4..ymm12.

	// ---------------------------------- iteration 0

#if 1
	prefetch(0, mem(rdx, 4*8))
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vbroadcastss(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vbroadcastss(mem(rax, r8, 4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)

	// ---------------------------------- iteration 1

#if 0
	prefetch(0, mem(rdx, r9, 1, 4*8))
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vbroadcastss(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vbroadcastss(mem(rax, r8, 4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)

	// ---------------------------------- iteration 2

#if 1
	prefetch(0, mem(rdx, r9, 2, 4*8))
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vbroadcastss(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vbroadcastss(mem(rax, r8, 4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)

	// ---------------------------------- iteration 3

#if 1
	lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vbroadcastss(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vbroadcastss(mem(rax, r8, 4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKITER)                   // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT)                 // EDGE LOOP

#if 0
	prefetch(0, mem(rdx, 5*8))
	add(r9, rdx)
#endif

	vmovups(mem(rbx, 0*32), ymm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vbroadcastss(mem(rax, r8, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm0, ymm3, ymm6)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm0, ymm3, ymm10)

	vbroadcastss(mem(rax, r8, 4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKLEFT)                   // iterate again if i != 0.

	label(.SPOSTACCUM)

	// rax and rbx are free again here: a and b are no longer needed.
	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate

	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
	vmulps(ymm0, ymm6, ymm6)
	vmulps(ymm0, ymm8, ymm8)
	vmulps(ymm0, ymm10, ymm10)
	vmulps(ymm0, ymm12, ymm12)

	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)

	//lea(mem(rcx, rsi, 4), rdx)       // load address of c + 4*cs_c;
	lea(mem(rcx, rdi, 4), rdx)         // load address of c + 4*rs_c;

	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
	lea(mem(rsi, rsi, 4), rbx)         // rbx = 5*cs_c;
	lea(mem(rax, rsi, 4), rbp)         // rbp = 7*cs_c;

	// now avoid loading C if beta == 0
	// NOTE(review): vucomiss presumably also sets ZF for an unordered
	// compare, in which case a NaN beta would take the beta == 0 path
	// as well -- confirm against the ISA reference.

	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORED)                    // jump to column storage case

	label(.SROWSTORED)

	// For each of the five rows: acc += beta * C_row, then store the
	// 8-float row back and step rcx to the next row.
	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4)
	vmovups(ymm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6)
	vmovups(ymm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8)
	vmovups(ymm8, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10)
	vmovups(ymm10, mem(rcx, 0*32))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12)
	vmovups(ymm12, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORED)

	// Rows 0-3 are transposed in registers so that each xmm holds
	// four consecutive rows of one column; row 4 is handled
	// afterwards one scalar at a time, addressed from rdx.

	                                   // begin I/O on columns 0-7
	vunpcklps(ymm6, ymm4, ymm0)
	vunpcklps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx ), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm0, mem(rcx ))           // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2)
	vmovups(xmm1, mem(rcx, rsi, 1))    // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma35 )

	vunpckhps(ymm6, ymm4, ymm0)
	vunpckhps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2)
	vmovups(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma32 )
	vmovups(xmm2, mem(rcx, rax, 2))    // store ( gamma06..gamma36 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2)
	vmovups(xmm1, mem(rcx, rax, 1))    // store ( gamma03..gamma33 )
	vmovups(xmm2, mem(rcx, rbp, 1))    // store ( gamma07..gamma37 )

	//lea(mem(rcx, rsi, 8), rcx)       // rcx += 8*cs_c

	// Row 4: xmm0 holds columns 0-3 and xmm8 columns 4-7. The
	// vpermilps immediates move the wanted element into lane 0
	// (0xe4 keeps lane 0, 0x39 selects lane 1, 0x4e lane 2,
	// 0x93 lane 3); only lane 0 is then stored via vmovss.
	// ymm4, ymm6 and ymm8 are reused as scratch here because rows
	// 0-3 have already been written.
	vmovups(ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm8)
	vpermilps(imm(0xe4), xmm0, xmm2)
	vpermilps(imm(0x39), xmm0, xmm4)
	vmovss(mem(rdx ), xmm1)
	vmovss(mem(rdx, rsi, 1), xmm6)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vfmadd231ps(xmm6, xmm3, xmm4)
	vmovss(xmm2, mem(rdx ))            // store ( gamma40 )
	vmovss(xmm4, mem(rdx, rsi, 1))     // store ( gamma41 )
	vpermilps(imm(0x4e), xmm0, xmm2)
	vpermilps(imm(0x93), xmm0, xmm4)
	vmovss(mem(rdx, rsi, 2), xmm1)
	vmovss(mem(rdx, rax, 1), xmm6)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vfmadd231ps(xmm6, xmm3, xmm4)
	vmovss(xmm2, mem(rdx, rsi, 2))     // store ( gamma42 )
	vmovss(xmm4, mem(rdx, rax, 1))     // store ( gamma43 )

	vpermilps(imm(0xe4), xmm8, xmm2)
	vpermilps(imm(0x39), xmm8, xmm4)
	vmovss(mem(rdx, rsi, 4), xmm1)
	vmovss(mem(rdx, rbx, 1), xmm6)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vfmadd231ps(xmm6, xmm3, xmm4)
	vmovss(xmm2, mem(rdx, rsi, 4))     // store ( gamma44 )
	vmovss(xmm4, mem(rdx, rbx, 1))     // store ( gamma45 )
	vpermilps(imm(0x4e), xmm8, xmm2)
	vpermilps(imm(0x93), xmm8, xmm4)
	vmovss(mem(rdx, rax, 2), xmm1)
	vmovss(mem(rdx, rbp, 1), xmm6)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vfmadd231ps(xmm6, xmm3, xmm4)
	vmovss(xmm2, mem(rdx, rax, 2))     // store ( gamma46 )
	vmovss(xmm4, mem(rdx, rbp, 1))     // store ( gamma47 )

	//lea(mem(rdx, rsi, 8), rdx)       // rdx += 8*cs_c

	jmp(.SDONE)                        // jump to end.

	label(.SBETAZERO)

	// beta == 0: same two storage cases as above, but C is only
	// stored to, never loaded.

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORBZ)                    // jump to column storage case

	label(.SROWSTORBZ)

	vmovups(ymm4, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(ymm6, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(ymm8, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(ymm10, mem(rcx, 0*32))
	add(rdi, rcx)

	vmovups(ymm12, mem(rcx, 0*32))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORBZ)

	                                   // begin I/O on columns 0-7
	vunpcklps(ymm6, ymm4, ymm0)
	vunpcklps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx ))           // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx, rsi, 1))    // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rbx, 1))    // store ( gamma05..gamma35 )

	vunpckhps(ymm6, ymm4, ymm0)
	vunpckhps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx, rsi, 2))    // store ( gamma02..gamma32 )
	vmovups(xmm2, mem(rcx, rax, 2))    // store ( gamma06..gamma36 )

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx, rax, 1))    // store ( gamma03..gamma33 )
	vmovups(xmm2, mem(rcx, rbp, 1))    // store ( gamma07..gamma37 )

	//lea(mem(rcx, rsi, 8), rcx)       // rcx += 8*cs_c

	vmovups(ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm8)
	vpermilps(imm(0xe4), xmm0, xmm2)
	vpermilps(imm(0x39), xmm0, xmm4)
	vmovss(xmm2, mem(rdx ))            // store ( gamma40 )
	vmovss(xmm4, mem(rdx, rsi, 1))     // store ( gamma41 )
	vpermilps(imm(0x4e), xmm0, xmm2)
	vpermilps(imm(0x93), xmm0, xmm4)
	vmovss(xmm2, mem(rdx, rsi, 2))     // store ( gamma42 )
	vmovss(xmm4, mem(rdx, rax, 1))     // store ( gamma43 )

	vpermilps(imm(0xe4), xmm8, xmm2)
	vpermilps(imm(0x39), xmm8, xmm4)
	vmovss(xmm2, mem(rdx, rsi, 4))     // store ( gamma44 )
	vmovss(xmm4, mem(rdx, rbx, 1))     // store ( gamma45 )
	vpermilps(imm(0x4e), xmm8, xmm2)
	vpermilps(imm(0x93), xmm8, xmm4)
	vmovss(xmm2, mem(rdx, rax, 2))     // store ( gamma46 )
	vmovss(xmm4, mem(rdx, rbp, 1))     // store ( gamma47 )

	//lea(mem(rdx, rsi, 8), rdx)       // rdx += 8*cs_c

	label(.SDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

void bli_sgemmsup_rv_haswell_asm_4x8
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;
	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	vzeroall()                         // zero all xmm/ymm registers.

	mov(var(a), rax)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	//lea(mem(r8, r8, 4), r15)         // r15 = 5*rs_a

	mov(var(b), rbx)                   // load address of b.
mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 3*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 3*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 3*8)) // prefetch c + 7*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. 
je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx ), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbx, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx, rsi, 2), xmm3, xmm0) vfmadd231ps(mem(rcx, rax, 2), xmm3, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx, rax, 1), xmm3, xmm1) vfmadd231ps(mem(rcx, rbp, 1), xmm3, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) vmovups(ymm10, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx )) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma35 ) vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma36 ) vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx, rax, 1)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_3x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict 
data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 2*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 2*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 2*8)) // prefetch c + 7*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(mem(rcx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rcx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(mem(rcx, rax, 2), xmm1, xmm1) vmovhpd(mem(rcx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm8, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rdx ), xmm1) vmovss(mem(rdx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rdx, rsi, 2), xmm1) vmovss(mem(rdx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rdx, rsi, 4), xmm1) vmovss(mem(rdx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) 
vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rdx, rax, 2), xmm1) vmovss(mem(rdx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) vmovups(ymm8, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c vmovups(ymm8, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rdx )) // store ( gamma40 ) vmovss(xmm4, mem(rdx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rdx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rdx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rdx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rdx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), 
xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rdx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rdx, rbp, 1)) // store ( gamma47 ) //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_2x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 1*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 1*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 1*8)) // prefetch c + 7*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 
16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx ), xmm1, xmm1) vmovhpd(mem(rcx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(mem(rcx, rsi, 4), xmm1, xmm1) vmovhpd(mem(rcx, rbx, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rcx, rsi, 2), xmm1, xmm1) vmovhpd(mem(rcx, rax, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(mem(rcx, rax, 2), xmm1, xmm1) vmovhpd(mem(rcx, rbp, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-7 vunpcklps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx )) // store ( gamma00..gamma10 ) vmovhpd(xmm0, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovlpd(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma14 ) vmovhpd(xmm2, mem(rcx, rbx, 1)) // store ( gamma05..gamma15 ) vunpckhps(ymm6, ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovhpd(xmm0, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) vmovlpd(xmm2, mem(rcx, rax, 2)) // store ( gamma06..gamma16 ) vmovhpd(xmm2, mem(rcx, rbp, 1)) // store ( gamma07..gamma17 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_haswell_asm_1x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c //lea(mem(rcx, rdi, 2), rdx) // //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rsi, rsi, 2), rbp) // rbp = 3*cs_c; prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c prefetch(0, mem(rcx, rbp, 1, 0*8)) // prefetch c + 3*cs_c prefetch(0, mem(rcx, rsi, 4, 0*8)) // prefetch c + 4*cs_c lea(mem(rcx, rsi, 4), rdx) // rdx = c + 4*cs_c; prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 5*cs_c prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rbp, 1, 0*8)) // prefetch c + 7*cs_c label(.SPOSTPFETCH) // done prefetching c #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) // ---------------------------------- iteration 1 #if 0 prefetch(0, mem(rdx, r9, 1, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) // ---------------------------------- iteration 3 #if 1 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) // begin I/O on columns 0-7 vmovups(ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(mem(rcx ), xmm1) vmovss(mem(rcx, rsi, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(mem(rcx, rsi, 2), xmm1) vmovss(mem(rcx, rax, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(mem(rcx, rsi, 4), xmm1) vmovss(mem(rcx, rbx, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(mem(rcx, rax, 2), xmm1) vmovss(mem(rcx, rbp, 1), xmm6) vfmadd231ps(xmm1, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm2, mem(rcx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rcx, rbp, 1)) // store ( gamma47 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) // begin I/O on columns 0-7 vmovups(ymm4, ymm0) vextractf128(imm(0x1), ymm0, xmm8) vpermilps(imm(0xe4), xmm0, xmm2) vpermilps(imm(0x39), xmm0, xmm4) vmovss(xmm2, mem(rcx )) // store ( gamma40 ) vmovss(xmm4, mem(rcx, rsi, 1)) // store ( gamma41 ) vpermilps(imm(0x4e), xmm0, xmm2) vpermilps(imm(0x93), xmm0, xmm4) vmovss(xmm2, mem(rcx, rsi, 2)) // store ( gamma42 ) vmovss(xmm4, mem(rcx, rax, 1)) // store ( gamma43 ) vpermilps(imm(0xe4), xmm8, xmm2) vpermilps(imm(0x39), xmm8, xmm4) vmovss(xmm2, mem(rcx, rsi, 4)) // store ( gamma44 ) vmovss(xmm4, mem(rcx, rbx, 1)) // store ( gamma45 ) vpermilps(imm(0x4e), xmm8, xmm2) vpermilps(imm(0x93), xmm8, xmm4) vmovss(xmm2, mem(rcx, rax, 2)) // store ( gamma46 ) vmovss(xmm4, mem(rcx, rbp, 1)) // store ( gamma47 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/haswell/bli_kernels_haswell.h000066400000000000000000000303651427272030600251300ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2019, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ // -- level-1m ----------------------------------------------------------------- // packm (asm) PACKM_KER_PROT( float, s, packm_haswell_asm_6xk ) PACKM_KER_PROT( float, s, packm_haswell_asm_16xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_6xk ) PACKM_KER_PROT( double, d, packm_haswell_asm_8xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_3xk ) PACKM_KER_PROT( scomplex, c, packm_haswell_asm_8xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_3xk ) PACKM_KER_PROT( dcomplex, z, packm_haswell_asm_4xk ) // -- level-3 ------------------------------------------------------------------ // gemm (asm d6x8) GEMM_UKR_PROT( float, s, gemm_haswell_asm_6x16 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_6x8 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_3x8 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_3x4 ) // gemm (asm d8x6) GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // gemmtrsm_l (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_l_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_haswell_asm_6x8 ) // gemmtrsm_u (asm d6x8) GEMMTRSM_UKR_PROT( float, s, gemmtrsm_u_haswell_asm_6x16 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_haswell_asm_6x8 ) // gemm (asm d8x6) //GEMM_UKR_PROT( float, s, gemm_haswell_asm_16x6 ) //GEMM_UKR_PROT( double, d, gemm_haswell_asm_8x6 ) //GEMM_UKR_PROT( scomplex, c, gemm_haswell_asm_8x3 ) //GEMM_UKR_PROT( dcomplex, z, gemm_haswell_asm_4x3 ) // -- level-3 sup -------------------------------------------------------------- // -- single real -- // gemmsup_r GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_haswell_ref_1x1 ) // 
gemmsup_rv GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_4x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_haswell_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x12 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( float, s, 
gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x12m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rd_haswell_asm_1x16n ) // -- double real -- // gemmsup_r GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_5x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_4x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_2x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref_1x1 ) // gemmsup_rv GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x6 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x4 ) 
GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x2 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x6m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_5x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_4x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rv_haswell_asm_1x8n ) // gemmsup_rd GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x4 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x2 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x1 ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x1 ) // gemmsup_rd (mkernel in m dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8m ) GEMMSUP_KER_PROT( 
double, d, gemmsup_rd_haswell_asm_6x4m ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x2m ) // gemmsup_rd (mkernel in n dim) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_6x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_3x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_2x8n ) GEMMSUP_KER_PROT( double, d, gemmsup_rd_haswell_asm_1x8n ) cython-blis-0.9.1/blis/_src/kernels/knc/000077500000000000000000000000001427272030600200545ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/knc/3/000077500000000000000000000000001427272030600202165ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/knc/3/bli_dgemm_knc_asm_30x8.c000066400000000000000000000510771427272030600245700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include #define A_L1_PREFETCH_DIST 4 #define B_L1_PREFETCH_DIST 2 #define L2_PREFETCH_DIST 16 // Must be greater than 10, because of the way the loop is constructed. //Alternate code path used if C is not row-major. Gathers 8 doubles of C from [BASE_DEST + zmm30*8] into zmm31 under mask k3 (reloaded from ebx), scales REG1 by the scalar at [r12] and the gathered values by the scalar at [r13], scatters the sum back, then advances BASE_DEST by r11. NUM only makes the GATHER/SCATTER loop labels unique. NOTE(review): the 30x8 kernel in this file loads beta into r14 and never expands this macro - confirm whether it is still needed. #define UPDATE_C_ROW_SCATTERED(REG1, NUM, BASE_DEST) \ { \ __asm kmov k3, ebx \ __asm GATHER##NUM: \ __asm vgatherdpd zmm31{k3}, [BASE_DEST + zmm30 * 8] \ __asm jknzd k3, GATHER##NUM \ \ __asm vmulpd REG1, REG1, 0[r12]{1to8} /*scale by alpha*/ \ __asm vfmadd132pd zmm31, REG1, 0[r13]{1to8} /*scale by beta, add in result*/\ __asm kmov k3, ebx \ \ __asm SCATTER##NUM: \ __asm vscatterdpd [BASE_DEST + zmm30 * 8]{k3}, zmm31 \ __asm jknzd k3, SCATTER##NUM \ __asm add BASE_DEST, r11 \ } //One iteration of the k_r loop. 
//Each iteration, we prefetch A into L1 and into L2. COUNTER is decremented and compared against 0 near the end of the body, so the flags are ready for a conditional jump placed right after the macro; C_ADDR is accepted but never referenced in the body. #define ONE_ITER_MAIN_LOOP(C_ADDR, COUNTER) \ {\ __asm vbroadcastf64x4 zmm30, 0[r15] \ __asm vmovapd zmm31, 0[rbx] \ \ __asm vfmadd231pd zmm0, zmm31, zmm30{aaaa} \ __asm vfmadd231pd zmm4, zmm31, 4*8[r15]{1to8} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256[r15] \ __asm vfmadd231pd zmm5, zmm31, 5*8[r15]{1to8} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+64[r15] \ __asm vfmadd231pd zmm6, zmm31, 6*8[r15]{1to8} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+128[r15]\ __asm vfmadd231pd zmm7, zmm31, 7*8[r15]{1to8} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+192[r15]\ __asm vfmadd231pd zmm8, zmm31, 8*8[r15]{1to8} \ \ __asm vprefetch1 0[r15 + r14] \ __asm vfmadd231pd zmm9, zmm31, 9*8[r15]{1to8} \ __asm vfmadd231pd zmm1, zmm31, zmm30{bbbb} \ __asm vfmadd231pd zmm2, zmm31, zmm30{cccc} \ __asm vfmadd231pd zmm3, zmm31, zmm30{dddd} \ __asm vfmadd231pd zmm10, zmm31, 10*8[r15]{1to8} \ \ __asm vprefetch1 64[r15 + r14] \ __asm vfmadd231pd zmm11, zmm31, 11*8[r15]{1to8} \ __asm vfmadd231pd zmm12, zmm31, 12*8[r15]{1to8} \ __asm vfmadd231pd zmm13, zmm31, 13*8[r15]{1to8} \ __asm vfmadd231pd zmm14, zmm31, 14*8[r15]{1to8} \ __asm vfmadd231pd zmm15, zmm31, 15*8[r15]{1to8} \ \ __asm vprefetch1 2*64[r15 + r14] \ __asm vfmadd231pd zmm16, zmm31, 16*8[r15]{1to8} \ __asm vfmadd231pd zmm17, zmm31, 17*8[r15]{1to8} \ __asm vfmadd231pd zmm18, zmm31, 18*8[r15]{1to8} \ __asm vfmadd231pd zmm19, zmm31, 19*8[r15]{1to8} \ __asm vfmadd231pd zmm20, zmm31, 20*8[r15]{1to8} \ \ __asm vprefetch1 3*64[r15 + r14] \ __asm vfmadd231pd zmm21, zmm31, 21*8[r15]{1to8} \ __asm add r15, r12 \ __asm vfmadd231pd zmm22, zmm31, -10*8[r15]{1to8}\ __asm vfmadd231pd zmm23, zmm31, -9*8[r15]{1to8} \ __asm vfmadd231pd zmm24, zmm31, -8*8[r15]{1to8} \ __asm dec COUNTER \ __asm vfmadd231pd zmm25, zmm31, -7*8[r15]{1to8} \ \ \ __asm vprefetch1 0[rbx + r13] \ __asm vfmadd231pd zmm26, zmm31, -6*8[r15]{1to8} \ __asm vprefetch0 B_L1_PREFETCH_DIST*8*8[rbx] \ __asm vfmadd231pd zmm27, zmm31, 
-5*8[r15]{1to8} \ __asm add rbx, r9 \ __asm vfmadd231pd zmm28, zmm31, -4*8[r15]{1to8} \ __asm cmp COUNTER, 0 \ __asm vfmadd231pd zmm29, zmm31, -3*8[r15]{1to8} \ } //One iteration of the k_r loop. //Same as ONE_ITER_MAIN_LOOP, but additionally, we prefetch one line of C into the L2 cache //Current placement of this prefetch instruction is somewhat arbitrary. The prefetched address C_ADDR is advanced by r11 each iteration, and the loop counter is hard-wired to r8. #define ONE_ITER_PC_L2(C_ADDR) \ {\ __asm vbroadcastf64x4 zmm30, 0[r15] \ __asm vmovapd zmm31, 0[rbx] \ \ __asm vfmadd231pd zmm0, zmm31, zmm30{aaaa} \ __asm vfmadd231pd zmm4, zmm31, 4*8[r15]{1to8} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256[r15] \ __asm vfmadd231pd zmm5, zmm31, 5*8[r15]{1to8} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+64[r15] \ __asm vfmadd231pd zmm6, zmm31, 6*8[r15]{1to8} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+128[r15]\ __asm vfmadd231pd zmm7, zmm31, 7*8[r15]{1to8} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+192[r15]\ __asm vfmadd231pd zmm8, zmm31, 8*8[r15]{1to8} \ \ __asm vprefetch1 0[r15 + r14] \ __asm vfmadd231pd zmm9, zmm31, 9*8[r15]{1to8} \ __asm vfmadd231pd zmm1, zmm31, zmm30{bbbb} \ __asm vfmadd231pd zmm2, zmm31, zmm30{cccc} \ __asm vfmadd231pd zmm3, zmm31, zmm30{dddd} \ __asm vfmadd231pd zmm10, zmm31, 10*8[r15]{1to8} \ \ __asm vprefetch1 64[r15 + r14] \ __asm vfmadd231pd zmm11, zmm31, 11*8[r15]{1to8} \ __asm vprefetch1 0[C_ADDR] \ __asm vfmadd231pd zmm12, zmm31, 12*8[r15]{1to8} \ __asm vfmadd231pd zmm13, zmm31, 13*8[r15]{1to8} \ __asm vfmadd231pd zmm14, zmm31, 14*8[r15]{1to8} \ __asm vfmadd231pd zmm15, zmm31, 15*8[r15]{1to8} \ \ __asm vprefetch1 2*64[r15 + r14] \ __asm vfmadd231pd zmm16, zmm31, 16*8[r15]{1to8} \ __asm vfmadd231pd zmm17, zmm31, 17*8[r15]{1to8} \ __asm vfmadd231pd zmm18, zmm31, 18*8[r15]{1to8} \ __asm vfmadd231pd zmm19, zmm31, 19*8[r15]{1to8} \ __asm vfmadd231pd zmm20, zmm31, 20*8[r15]{1to8} \ \ __asm vprefetch1 3*64[r15 + r14] \ __asm vfmadd231pd zmm21, zmm31, 21*8[r15]{1to8} \ __asm add r15, r12 \ __asm vfmadd231pd zmm22, zmm31, -10*8[r15]{1to8}\ __asm vfmadd231pd zmm23, 
zmm31, -9*8[r15]{1to8} \ __asm add C_ADDR, r11 \ __asm vfmadd231pd zmm24, zmm31, -8*8[r15]{1to8} \ __asm dec r8 \ __asm vfmadd231pd zmm25, zmm31, -7*8[r15]{1to8} \ \ \ __asm vprefetch1 0[rbx + r13] \ __asm vfmadd231pd zmm26, zmm31, -6*8[r15]{1to8} \ __asm vprefetch0 B_L1_PREFETCH_DIST*8*8[rbx] \ __asm vfmadd231pd zmm27, zmm31, -5*8[r15]{1to8} \ __asm add rbx, r9 \ __asm vfmadd231pd zmm28, zmm31, -4*8[r15]{1to8} \ __asm cmp r8, 0 \ __asm vfmadd231pd zmm29, zmm31, -3*8[r15]{1to8} \ \ } //One iteration of the k_r loop. //Same as ONE_ITER_MAIN_LOOP, but additionally, we prefetch 3 cache lines of C into the L1 cache //Current placement of these prefetch instructions is somewhat arbitrary. C_ADDR is advanced by r11 after each of the three prefetches, and the loop counter is hard-wired to r8. #define ONE_ITER_PC_L1(C_ADDR) \ {\ __asm vbroadcastf64x4 zmm30, 0[r15] \ __asm vmovapd zmm31, 0[rbx] \ \ __asm vfmadd231pd zmm0, zmm31, zmm30{aaaa} \ __asm vfmadd231pd zmm4, zmm31, 4*8[r15]{1to8} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256[r15] \ __asm vfmadd231pd zmm5, zmm31, 5*8[r15]{1to8} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+64[r15] \ __asm vfmadd231pd zmm6, zmm31, 6*8[r15]{1to8} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+128[r15]\ __asm vfmadd231pd zmm7, zmm31, 7*8[r15]{1to8} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+192[r15]\ __asm vfmadd231pd zmm8, zmm31, 8*8[r15]{1to8} \ \ __asm vprefetch1 0[r15 + r14] \ __asm vfmadd231pd zmm9, zmm31, 9*8[r15]{1to8} \ __asm vprefetch0 0[C_ADDR] \ __asm vfmadd231pd zmm1, zmm31, zmm30{bbbb} \ __asm add C_ADDR, r11 \ __asm vfmadd231pd zmm2, zmm31, zmm30{cccc} \ __asm vfmadd231pd zmm3, zmm31, zmm30{dddd} \ __asm vfmadd231pd zmm10, zmm31, 10*8[r15]{1to8} \ \ __asm vprefetch1 64[r15 + r14] \ __asm vfmadd231pd zmm11, zmm31, 11*8[r15]{1to8} \ __asm vprefetch0 0[C_ADDR] \ __asm vfmadd231pd zmm12, zmm31, 12*8[r15]{1to8} \ __asm add C_ADDR, r11 \ __asm vfmadd231pd zmm13, zmm31, 13*8[r15]{1to8} \ __asm vfmadd231pd zmm14, zmm31, 14*8[r15]{1to8} \ __asm vfmadd231pd zmm15, zmm31, 15*8[r15]{1to8} \ \ __asm vprefetch1 2*64[r15 + r14] \ __asm vfmadd231pd 
zmm16, zmm31, 16*8[r15]{1to8} \ __asm vprefetch0 0[C_ADDR] \ __asm vfmadd231pd zmm17, zmm31, 17*8[r15]{1to8} \ __asm add C_ADDR, r11 \ __asm vfmadd231pd zmm18, zmm31, 18*8[r15]{1to8} \ __asm vfmadd231pd zmm19, zmm31, 19*8[r15]{1to8} \ __asm vfmadd231pd zmm20, zmm31, 20*8[r15]{1to8} \ \ __asm vprefetch1 3*64[r15 + r14] \ __asm vfmadd231pd zmm21, zmm31, 21*8[r15]{1to8} \ __asm add r15, r12 \ __asm vfmadd231pd zmm22, zmm31, -10*8[r15]{1to8}\ __asm vfmadd231pd zmm23, zmm31, -9*8[r15]{1to8} \ __asm vfmadd231pd zmm24, zmm31, -8*8[r15]{1to8} \ __asm dec r8 \ __asm vfmadd231pd zmm25, zmm31, -7*8[r15]{1to8} \ \ \ __asm vprefetch1 0[rbx + r13] \ __asm vfmadd231pd zmm26, zmm31, -6*8[r15]{1to8} \ __asm vprefetch0 B_L1_PREFETCH_DIST*8*8[rbx] \ __asm vfmadd231pd zmm27, zmm31, -5*8[r15]{1to8} \ __asm add rbx, r9 \ __asm vfmadd231pd zmm28, zmm31, -4*8[r15]{1to8} \ __asm cmp r8, 0 \ __asm vfmadd231pd zmm29, zmm31, -3*8[r15]{1to8} \ \ } //This is an array used for the scatter/gather instructions. extern int offsets[16]; //#define MONITORS //#define LOOPMON /* bli_dgemm_knc_asm_30x8: double-precision GEMM micro-kernel written as Intel-syntax inline assembly. The k loop accumulates into zmm0..zmm29 (one 8-double register per row of the 30x8 tile, as the row-by-row stores after POSTACCUM show); each accumulator is then scaled by *alpha, combined with *beta times the matching row of c (rows are rs_c doubles apart) and written back with aligned stores. On the long-k path a_next / b_next (taken from the auxinfo) become the L2 prefetch targets for the final L2_PREFETCH_DIST iterations. m, n, cs_c and cntx are not referenced directly in the visible body; NOTE(review): presumably GEMM_UKR_SETUP_CT / GEMM_UKR_FLUSH_CT consume them to handle edge tiles and non-row-major C - confirm in their definitions. offsetPtr is initialised but not referenced by the asm below. */ void bli_dgemm_knc_asm_30x8 ( dim_t m, dim_t n, dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { double * a_next = bli_auxinfo_next_a( data ); double * b_next = bli_auxinfo_next_b( data ); int * offsetPtr = &offsets[0]; uint64_t k64 = k; GEMM_UKR_SETUP_CT( d, 30, 8, true ); #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; #endif #ifdef LOOPMON int tlooph, tloopl, blooph, bloopl; #endif __asm { #ifdef MONITORS rdtsc mov topl, eax mov toph, edx #endif vpxord zmm0, zmm0, zmm0 vmovaps zmm1, zmm0 //clear out registers vmovaps zmm2, zmm0 mov rsi, k64 //loop index vmovaps zmm3, zmm0 mov r11, rs_c //load row stride vmovaps zmm4, zmm0 sal r11, 3 //scale row stride vmovaps zmm5, zmm0 mov r15, a //load address of a vmovaps zmm6, zmm0 mov rbx, b //load address of b vmovaps zmm7, 
zmm0 vmovaps zmm8, zmm0 lea r10, [r11 + 2*r11 + 0] //r10 has 3 * r11 vmovaps zmm9, zmm0 vmovaps zmm10, zmm0 mov rdi, r11 vmovaps zmm11, zmm0 sal rdi, 2 //rdi has 4*r11 vmovaps zmm12, zmm0 mov rcx, c //load address of c for prefetching vmovaps zmm13, zmm0 vmovaps zmm14, zmm0 mov r8, k64 vmovaps zmm15, zmm0 vmovaps zmm16, zmm0 vmovaps zmm17, zmm0 mov r13, L2_PREFETCH_DIST*8*8 vmovaps zmm18, zmm0 mov r14, L2_PREFETCH_DIST*8*32 vmovaps zmm19, zmm0 vmovaps zmm20, zmm0 vmovaps zmm21, zmm0 vmovaps zmm22, zmm0 vmovaps zmm23, zmm0 sub r8, 30 + L2_PREFETCH_DIST //Check if we have more than 30 + L2_PREFETCH_DIST (46) iterations to do; the jle below tests these flags. vmovaps zmm24, zmm0 mov r8, 30 vmovaps zmm25, zmm0 mov r9, 8*8 //amount to increment b* by each iteration vmovaps zmm26, zmm0 mov r12, 32*8 //amount to increment a* by each iteration vmovaps zmm27, zmm0 vmovaps zmm28, zmm0 vmovaps zmm29, zmm0 #ifdef MONITORS rdtsc mov midl, eax mov midh, edx #endif jle CONSIDER_UNDER_40 sub rsi, 30 + L2_PREFETCH_DIST //First 30 iterations (each also prefetches one line of C into L2) LOOPREFECHCL2: ONE_ITER_PC_L2(rcx) jne LOOPREFECHCL2 mov rcx, c //Main Loop: k - (30 + L2_PREFETCH_DIST) iterations counted in rsi. LOOPMAIN: ONE_ITER_MAIN_LOOP(rcx, rsi) jne LOOPMAIN //Penultimate L2_PREFETCH_DIST-10 iterations (6 with the current setting). //Break these off from the main loop to avoid prefetching extra data; r14/r13 are rebased so the L2 prefetches target a_next/b_next. 
mov r14, a_next mov r13, b_next sub r14, r15 sub r13, rbx mov rsi, L2_PREFETCH_DIST-10 LOOPMAIN2: ONE_ITER_MAIN_LOOP(rcx, rsi) jne LOOPMAIN2 //Last 10 iterations (these also prefetch C into L1) mov r8, 10 LOOPREFETCHCL1: ONE_ITER_PC_L1(rcx) jne LOOPREFETCHCL1 jmp POSTACCUM //Alternate main loop, with no prefetching of C //Used when k <= 30 + L2_PREFETCH_DIST (46 with the current setting) CONSIDER_UNDER_40: mov rsi, k64 test rsi, rsi je POSTACCUM LOOP_UNDER_40: ONE_ITER_MAIN_LOOP(rcx, rsi) jne LOOP_UNDER_40 POSTACCUM: #ifdef MONITORS rdtsc mov mid2l, eax mov mid2h, edx #endif mov r9, c //load address of c for update mov r12, alpha //load address of alpha mov r14, beta /* zmm31 = beta broadcast; each group below updates four rows at r9, r9+r11, r9+2*r11 and r9+r10, then steps r9 by rdi */ vbroadcastsd zmm31, 0[r14] vmulpd zmm0, zmm0, 0[r12]{1to8} vmulpd zmm1, zmm1, 0[r12]{1to8} vmulpd zmm2, zmm2, 0[r12]{1to8} vmulpd zmm3, zmm3, 0[r12]{1to8} vfmadd231pd zmm0, zmm31, [r9+0] vfmadd231pd zmm1, zmm31, [r9+r11+0] vfmadd231pd zmm2, zmm31, [r9+2*r11+0] vfmadd231pd zmm3, zmm31, [r9+r10+0] vmovapd [r9+0], zmm0 vmovapd [r9+r11+0], zmm1 vmovapd [r9+2*r11+0], zmm2 vmovapd [r9+r10+0], zmm3 add r9, rdi vmulpd zmm4, zmm4, 0[r12]{1to8} vmulpd zmm5, zmm5, 0[r12]{1to8} vmulpd zmm6, zmm6, 0[r12]{1to8} vmulpd zmm7, zmm7, 0[r12]{1to8} vfmadd231pd zmm4, zmm31, [r9+0] vfmadd231pd zmm5, zmm31, [r9+r11+0] vfmadd231pd zmm6, zmm31, [r9+2*r11+0] vfmadd231pd zmm7, zmm31, [r9+r10+0] vmovapd [r9+0], zmm4 vmovapd [r9+r11+0], zmm5 vmovapd [r9+2*r11+0], zmm6 vmovapd [r9+r10+0], zmm7 add r9, rdi vmulpd zmm8, zmm8, 0[r12]{1to8} vmulpd zmm9, zmm9, 0[r12]{1to8} vmulpd zmm10, zmm10, 0[r12]{1to8} vmulpd zmm11, zmm11, 0[r12]{1to8} vfmadd231pd zmm8, zmm31, [r9+0] vfmadd231pd zmm9, zmm31, [r9+r11+0] vfmadd231pd zmm10, zmm31, [r9+2*r11+0] vfmadd231pd zmm11, zmm31, [r9+r10+0] vmovapd [r9+0], zmm8 vmovapd [r9+r11+0], zmm9 vmovapd [r9+2*r11+0], zmm10 vmovapd [r9+r10+0], zmm11 add r9, rdi vmulpd zmm12, zmm12, 0[r12]{1to8} vmulpd zmm13, zmm13, 0[r12]{1to8} vmulpd zmm14, zmm14, 0[r12]{1to8} vmulpd zmm15, zmm15, 0[r12]{1to8} vfmadd231pd zmm12, zmm31, [r9+0] vfmadd231pd zmm13, zmm31, [r9+r11+0] vfmadd231pd 
zmm14, zmm31, [r9+2*r11+0] vfmadd231pd zmm15, zmm31, [r9+r10+0] vmovapd [r9+0], zmm12 vmovapd [r9+r11+0], zmm13 vmovapd [r9+2*r11+0], zmm14 vmovapd [r9+r10+0], zmm15 add r9, rdi vmulpd zmm16, zmm16, 0[r12]{1to8} vmulpd zmm17, zmm17, 0[r12]{1to8} vmulpd zmm18, zmm18, 0[r12]{1to8} vmulpd zmm19, zmm19, 0[r12]{1to8} vfmadd231pd zmm16, zmm31, [r9+0] vfmadd231pd zmm17, zmm31, [r9+r11+0] vfmadd231pd zmm18, zmm31, [r9+2*r11+0] vfmadd231pd zmm19, zmm31, [r9+r10+0] vmovapd [r9+0], zmm16 vmovapd [r9+r11+0], zmm17 vmovapd [r9+2*r11+0], zmm18 vmovapd [r9+r10+0], zmm19 add r9, rdi vmulpd zmm20, zmm20, 0[r12]{1to8} vmulpd zmm21, zmm21, 0[r12]{1to8} vmulpd zmm22, zmm22, 0[r12]{1to8} vmulpd zmm23, zmm23, 0[r12]{1to8} vfmadd231pd zmm20, zmm31, [r9+0] vfmadd231pd zmm21, zmm31, [r9+r11+0] vfmadd231pd zmm22, zmm31, [r9+2*r11+0] vfmadd231pd zmm23, zmm31, [r9+r10+0] vmovapd [r9+0], zmm20 vmovapd [r9+r11+0], zmm21 vmovapd [r9+2*r11+0], zmm22 vmovapd [r9+r10+0], zmm23 add r9, rdi vmulpd zmm24, zmm24, 0[r12]{1to8} vmulpd zmm25, zmm25, 0[r12]{1to8} vmulpd zmm26, zmm26, 0[r12]{1to8} vmulpd zmm27, zmm27, 0[r12]{1to8} vfmadd231pd zmm24, zmm31, [r9+0] vfmadd231pd zmm25, zmm31, [r9+r11+0] vfmadd231pd zmm26, zmm31, [r9+2*r11+0] vfmadd231pd zmm27, zmm31, [r9+r10+0] vmovapd [r9+0], zmm24 vmovapd [r9+r11+0], zmm25 vmovapd [r9+2*r11+0], zmm26 vmovapd [r9+r10+0], zmm27 add r9, rdi /* last group: only rows 28 and 29 remain */ vmulpd zmm28, zmm28, 0[r12]{1to8} vmulpd zmm29, zmm29, 0[r12]{1to8} vfmadd231pd zmm28, zmm31, [r9+0] vfmadd231pd zmm29, zmm31, [r9+r11+0] vmovapd [r9+0], zmm28 vmovapd [r9+r11+0], zmm29 END: #ifdef MONITORS rdtsc mov botl, eax mov both, edx #endif } GEMM_UKR_FLUSH_CT( d ); #ifdef LOOPMON printf("looptime = \t%d\n", bloopl - tloopl); #endif #ifdef MONITORS dim_t top = ((dim_t)toph << 32) | topl; dim_t mid = ((dim_t)midh << 32) | midl; dim_t mid2 = ((dim_t)mid2h << 32) | mid2l; dim_t bot = ((dim_t)both << 32) | botl; printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot - 
top); #endif } cython-blis-0.9.1/blis/_src/kernels/knc/3/bli_sgemm_knc_asm_30x16.c000066400000000000000000000514041427272030600246600ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include #define A_L1_PREFETCH_DIST 4 #define B_L1_PREFETCH_DIST 2 #define L2_PREFETCH_DIST 16 // Must be greater than 10, because of the way the loop is constructed. 
//Alternate code path uused if C is not row-major #define UPDATE_C_ROW_SCATTERED(REG1, NUM, BASE_DEST) \ { \ __asm kmov k3, ebx \ __asm GATHER##NUM: \ __asm vgatherdps zmm31{k3}, [BASE_DEST + zmm30 * 4] \ __asm jknzd k3, GATHER##NUM \ \ __asm vmulps REG1, REG1, 0[r12]{1to16} /*scale by alpha*/ \ __asm vfmadd132ps zmm31, REG1, 0[r13]{1to16} /*scale by beta, add in result*/\ __asm kmov k3, ebx \ \ __asm SCATTER##NUM: \ __asm vscatterdps [BASE_DEST + zmm30 * 4]{k3}, zmm31 \ __asm jknzd k3, SCATTER##NUM \ __asm add BASE_DEST, r11 \ } //One iteration of the k_r loop. //Each iteration, we prefetch A into L1 and into L2 #define ONE_ITER_MAIN_LOOP(C_ADDR, COUNTER) \ {\ __asm vbroadcastf32x4 zmm30, 0[r15] \ __asm vmovaps zmm31, 0[rbx] \ \ __asm vfmadd231ps zmm0, zmm31, zmm30{aaaa} \ __asm vfmadd231ps zmm4, zmm31, 4*4[r15]{1to16} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256[r15] \ __asm vfmadd231ps zmm5, zmm31, 5*4[r15]{1to16} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+64[r15] \ __asm vfmadd231ps zmm6, zmm31, 6*4[r15]{1to16} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+128[r15]\ __asm vfmadd231ps zmm7, zmm31, 7*4[r15]{1to16} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+192[r15]\ __asm vfmadd231ps zmm8, zmm31, 8*4[r15]{1to16} \ \ __asm vprefetch1 0[r15 + r14] \ __asm vfmadd231ps zmm9, zmm31, 9*4[r15]{1to16} \ __asm vfmadd231ps zmm1, zmm31, zmm30{bbbb} \ __asm vfmadd231ps zmm2, zmm31, zmm30{cccc} \ __asm vfmadd231ps zmm3, zmm31, zmm30{dddd} \ __asm vfmadd231ps zmm10, zmm31, 10*4[r15]{1to16} \ \ __asm vprefetch1 64[r15 + r14] \ __asm vfmadd231ps zmm11, zmm31, 11*4[r15]{1to16} \ __asm vfmadd231ps zmm12, zmm31, 12*4[r15]{1to16} \ __asm vfmadd231ps zmm13, zmm31, 13*4[r15]{1to16} \ __asm vfmadd231ps zmm14, zmm31, 14*4[r15]{1to16} \ __asm vfmadd231ps zmm15, zmm31, 15*4[r15]{1to16} \ \ __asm vprefetch1 2*64[r15 + r14] \ __asm vfmadd231ps zmm16, zmm31, 16*4[r15]{1to16} \ __asm vfmadd231ps zmm17, zmm31, 17*4[r15]{1to16} \ __asm vfmadd231ps zmm18, zmm31, 18*4[r15]{1to16} \ __asm vfmadd231ps 
zmm19, zmm31, 19*4[r15]{1to16} \ __asm vfmadd231ps zmm20, zmm31, 20*4[r15]{1to16} \ \ __asm vprefetch1 3*64[r15 + r14] \ __asm vfmadd231ps zmm21, zmm31, 21*4[r15]{1to16} \ __asm add r15, r12 \ __asm vfmadd231ps zmm22, zmm31, -10*4[r15]{1to16}\ __asm vfmadd231ps zmm23, zmm31, -9*4[r15]{1to16} \ __asm vfmadd231ps zmm24, zmm31, -8*4[r15]{1to16} \ __asm dec COUNTER \ __asm vfmadd231ps zmm25, zmm31, -7*4[r15]{1to16} \ \ \ __asm vprefetch1 0[rbx + r13] \ __asm vfmadd231ps zmm26, zmm31, -6*4[r15]{1to16} \ __asm vprefetch0 B_L1_PREFETCH_DIST*16*4[rbx] \ __asm vfmadd231ps zmm27, zmm31, -5*4[r15]{1to16} \ __asm add rbx, r9 \ __asm vfmadd231ps zmm28, zmm31, -4*4[r15]{1to16} \ __asm cmp COUNTER, 0 \ __asm vfmadd231ps zmm29, zmm31, -3*4[r15]{1to16} \ } //One iteration of the k_r loop. //Same as ONE_ITER_MAIN_LOOP, but additionally, we prefetch one line of C into the L2 cache //Current placement of this prefetch instruction is somewhat arbitrary. #define ONE_ITER_PC_L2(C_ADDR) \ {\ __asm vbroadcastf32x4 zmm30, 0[r15] \ __asm vmovaps zmm31, 0[rbx] \ \ __asm vfmadd231ps zmm0, zmm31, zmm30{aaaa} \ __asm vfmadd231ps zmm4, zmm31, 4*4[r15]{1to16} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256[r15] \ __asm vfmadd231ps zmm5, zmm31, 5*4[r15]{1to16} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+64[r15] \ __asm vfmadd231ps zmm6, zmm31, 6*4[r15]{1to16} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+128[r15]\ __asm vfmadd231ps zmm7, zmm31, 7*4[r15]{1to16} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+192[r15]\ __asm vfmadd231ps zmm8, zmm31, 8*4[r15]{1to16} \ \ __asm vprefetch1 0[r15 + r14] \ __asm vfmadd231ps zmm9, zmm31, 9*4[r15]{1to16} \ __asm vfmadd231ps zmm1, zmm31, zmm30{bbbb} \ __asm vfmadd231ps zmm2, zmm31, zmm30{cccc} \ __asm vfmadd231ps zmm3, zmm31, zmm30{dddd} \ __asm vfmadd231ps zmm10, zmm31, 10*4[r15]{1to16} \ \ __asm vprefetch1 64[r15 + r14] \ __asm vfmadd231ps zmm11, zmm31, 11*4[r15]{1to16} \ __asm vprefetch1 0[C_ADDR] \ __asm vfmadd231ps zmm12, zmm31, 12*4[r15]{1to16} \ __asm vfmadd231ps zmm13, 
zmm31, 13*4[r15]{1to16} \ __asm vfmadd231ps zmm14, zmm31, 14*4[r15]{1to16} \ __asm vfmadd231ps zmm15, zmm31, 15*4[r15]{1to16} \ \ __asm vprefetch1 2*64[r15 + r14] \ __asm vfmadd231ps zmm16, zmm31, 16*4[r15]{1to16} \ __asm vfmadd231ps zmm17, zmm31, 17*4[r15]{1to16} \ __asm vfmadd231ps zmm18, zmm31, 18*4[r15]{1to16} \ __asm vfmadd231ps zmm19, zmm31, 19*4[r15]{1to16} \ __asm vfmadd231ps zmm20, zmm31, 20*4[r15]{1to16} \ \ __asm vprefetch1 3*64[r15 + r14] \ __asm vfmadd231ps zmm21, zmm31, 21*4[r15]{1to16} \ __asm add r15, r12 \ __asm vfmadd231ps zmm22, zmm31, -10*4[r15]{1to16}\ __asm vfmadd231ps zmm23, zmm31, -9*4[r15]{1to16} \ __asm add C_ADDR, r11 \ __asm vfmadd231ps zmm24, zmm31, -8*4[r15]{1to16} \ __asm dec r8 \ __asm vfmadd231ps zmm25, zmm31, -7*4[r15]{1to16} \ \ \ __asm vprefetch1 0[rbx + r13] \ __asm vfmadd231ps zmm26, zmm31, -6*4[r15]{1to16} \ __asm vprefetch0 B_L1_PREFETCH_DIST*16*4[rbx] \ __asm vfmadd231ps zmm27, zmm31, -5*4[r15]{1to16} \ __asm add rbx, r9 \ __asm vfmadd231ps zmm28, zmm31, -4*4[r15]{1to16} \ __asm cmp r8, 0 \ __asm vfmadd231ps zmm29, zmm31, -3*4[r15]{1to16} \ \ } //One iteration of the k_r loop. //Same as ONE_ITER_MAIN_LOOP, but additionally, we prefetch 3 cache lines of C into the L1 cache //Current placement of these prefetch instructions is somewhat arbitrary. 
#define ONE_ITER_PC_L1(C_ADDR) \ {\ __asm vbroadcastf32x4 zmm30, 0[r15] \ __asm vmovaps zmm31, 0[rbx] \ \ __asm vfmadd231ps zmm0, zmm31, zmm30{aaaa} \ __asm vfmadd231ps zmm4, zmm31, 4*4[r15]{1to16} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256[r15] \ __asm vfmadd231ps zmm5, zmm31, 5*4[r15]{1to16} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+64[r15] \ __asm vfmadd231ps zmm6, zmm31, 6*4[r15]{1to16} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+128[r15]\ __asm vfmadd231ps zmm7, zmm31, 7*4[r15]{1to16} \ __asm vprefetch0 A_L1_PREFETCH_DIST*256+192[r15]\ __asm vfmadd231ps zmm8, zmm31, 8*4[r15]{1to16} \ \ __asm vprefetch1 0[r15 + r14] \ __asm vfmadd231ps zmm9, zmm31, 9*4[r15]{1to16} \ __asm vprefetch0 0[C_ADDR] \ __asm vfmadd231ps zmm1, zmm31, zmm30{bbbb} \ __asm add C_ADDR, r11 \ __asm vfmadd231ps zmm2, zmm31, zmm30{cccc} \ __asm vfmadd231ps zmm3, zmm31, zmm30{dddd} \ __asm vfmadd231ps zmm10, zmm31, 10*4[r15]{1to16} \ \ __asm vprefetch1 64[r15 + r14] \ __asm vfmadd231ps zmm11, zmm31, 11*4[r15]{1to16} \ __asm vprefetch0 0[C_ADDR] \ __asm vfmadd231ps zmm12, zmm31, 12*4[r15]{1to16} \ __asm add C_ADDR, r11 \ __asm vfmadd231ps zmm13, zmm31, 13*4[r15]{1to16} \ __asm vfmadd231ps zmm14, zmm31, 14*4[r15]{1to16} \ __asm vfmadd231ps zmm15, zmm31, 15*4[r15]{1to16} \ \ __asm vprefetch1 2*64[r15 + r14] \ __asm vfmadd231ps zmm16, zmm31, 16*4[r15]{1to16} \ __asm vprefetch0 0[C_ADDR] \ __asm vfmadd231ps zmm17, zmm31, 17*4[r15]{1to16} \ __asm add C_ADDR, r11 \ __asm vfmadd231ps zmm18, zmm31, 18*4[r15]{1to16} \ __asm vfmadd231ps zmm19, zmm31, 19*4[r15]{1to16} \ __asm vfmadd231ps zmm20, zmm31, 20*4[r15]{1to16} \ \ __asm vprefetch1 3*64[r15 + r14] \ __asm vfmadd231ps zmm21, zmm31, 21*4[r15]{1to16} \ __asm add r15, r12 \ __asm vfmadd231ps zmm22, zmm31, -10*4[r15]{1to16}\ __asm vfmadd231ps zmm23, zmm31, -9*4[r15]{1to16} \ __asm vfmadd231ps zmm24, zmm31, -8*4[r15]{1to16} \ __asm dec r8 \ __asm vfmadd231ps zmm25, zmm31, -7*4[r15]{1to16} \ \ \ __asm vprefetch1 0[rbx + r13] \ __asm vfmadd231ps zmm26, zmm31, 
-6*4[r15]{1to16} \ __asm vprefetch0 B_L1_PREFETCH_DIST*16*4[rbx] \ __asm vfmadd231ps zmm27, zmm31, -5*4[r15]{1to16} \ __asm add rbx, r9 \ __asm vfmadd231ps zmm28, zmm31, -4*4[r15]{1to16} \ __asm cmp r8, 0 \ __asm vfmadd231ps zmm29, zmm31, -3*4[r15]{1to16} \ \ } //This is an array used for the scattter/gather instructions. int offsets[16] __attribute__((aligned(0x1000))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; //#define MONITORS //#define LOOPMON void bli_sgemm_knc_asm_30x16 ( dim_t m, dim_t n, dim_t k, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { float * a_next = bli_auxinfo_next_a( data ); float * b_next = bli_auxinfo_next_b( data ); int * offsetPtr = &offsets[0]; uint64_t k64 = k; GEMM_UKR_SETUP_CT( s, 30, 16, true ); #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; #endif #ifdef LOOPMON int tlooph, tloopl, blooph, bloopl; #endif __asm { #ifdef MONITORS rdtsc mov topl, eax mov toph, edx #endif vpxord zmm0, zmm0, zmm0 vmovaps zmm1, zmm0 //clear out registers vmovaps zmm2, zmm0 mov rsi, k64 //loop index vmovaps zmm3, zmm0 mov r11, rs_c //load row stride vmovaps zmm4, zmm0 sal r11, 2 //scale row stride vmovaps zmm5, zmm0 mov r15, a //load address of a vmovaps zmm6, zmm0 mov rbx, b //load address of b vmovaps zmm7, zmm0 vmovaps zmm8, zmm0 lea r10, [r11 + 2*r11 + 0] //r10 has 3 * r11 vmovaps zmm9, zmm0 vmovaps zmm10, zmm0 mov rdi, r11 vmovaps zmm11, zmm0 sal rdi, 2 //rdi has 4*r11 vmovaps zmm12, zmm0 mov rcx, c //load address of c for prefetching vmovaps zmm13, zmm0 vmovaps zmm14, zmm0 mov r8, k64 vmovaps zmm15, zmm0 vmovaps zmm16, zmm0 vmovaps zmm17, zmm0 mov r13, L2_PREFETCH_DIST*4*16 vmovaps zmm18, zmm0 mov r14, L2_PREFETCH_DIST*4*32 vmovaps zmm19, zmm0 vmovaps zmm20, zmm0 vmovaps zmm21, zmm0 vmovaps zmm22, zmm0 vmovaps zmm23, zmm0 sub r8, 30 + L2_PREFETCH_DIST //Check if we have over 40 
operations to do. vmovaps zmm24, zmm0 mov r8, 30 vmovaps zmm25, zmm0 mov r9, 16*4 //amount to increment b* by each iteration vmovaps zmm26, zmm0 mov r12, 32*4 //amount to increment a* by each iteration vmovaps zmm27, zmm0 vmovaps zmm28, zmm0 vmovaps zmm29, zmm0 #ifdef MONITORS rdtsc mov midl, eax mov midh, edx #endif jle CONSIDER_UNDER_40 sub rsi, 30 + L2_PREFETCH_DIST //First 30 iterations LOOPREFECHCL2: ONE_ITER_PC_L2(rcx) jne LOOPREFECHCL2 mov rcx, c //Main Loop. LOOPMAIN: ONE_ITER_MAIN_LOOP(rcx, rsi) jne LOOPMAIN //Penultimate 22 iterations. //Break these off from the main loop to avoid prefetching extra shit. mov r14, a_next mov r13, b_next sub r14, r15 sub r13, rbx mov rsi, L2_PREFETCH_DIST-10 LOOPMAIN2: ONE_ITER_MAIN_LOOP(rcx, rsi) jne LOOPMAIN2 //Last 10 iterations mov r8, 10 LOOPREFETCHCL1: ONE_ITER_PC_L1(rcx) jne LOOPREFETCHCL1 jmp POSTACCUM //Alternate main loop, with no prefetching of C //Used when <= 40 iterations CONSIDER_UNDER_40: mov rsi, k64 test rsi, rsi je POSTACCUM LOOP_UNDER_40: ONE_ITER_MAIN_LOOP(rcx, rsi) jne LOOP_UNDER_40 POSTACCUM: #ifdef MONITORS rdtsc mov mid2l, eax mov mid2h, edx #endif mov r9, c //load address of c for update mov r12, alpha //load address of alpha mov r14, beta vbroadcastss zmm31, 0[r14] vmulps zmm0, zmm0, 0[r12]{1to16} vmulps zmm1, zmm1, 0[r12]{1to16} vmulps zmm2, zmm2, 0[r12]{1to16} vmulps zmm3, zmm3, 0[r12]{1to16} vfmadd231ps zmm0, zmm31, [r9+0] vfmadd231ps zmm1, zmm31, [r9+r11+0] vfmadd231ps zmm2, zmm31, [r9+2*r11+0] vfmadd231ps zmm3, zmm31, [r9+r10+0] vmovaps [r9+0], zmm0 vmovaps [r9+r11+0], zmm1 vmovaps [r9+2*r11+0], zmm2 vmovaps [r9+r10+0], zmm3 add r9, rdi vmulps zmm4, zmm4, 0[r12]{1to16} vmulps zmm5, zmm5, 0[r12]{1to16} vmulps zmm6, zmm6, 0[r12]{1to16} vmulps zmm7, zmm7, 0[r12]{1to16} vfmadd231ps zmm4, zmm31, [r9+0] vfmadd231ps zmm5, zmm31, [r9+r11+0] vfmadd231ps zmm6, zmm31, [r9+2*r11+0] vfmadd231ps zmm7, zmm31, [r9+r10+0] vmovaps [r9+0], zmm4 vmovaps [r9+r11+0], zmm5 vmovaps [r9+2*r11+0], zmm6 vmovaps 
[r9+r10+0], zmm7 add r9, rdi vmulps zmm8, zmm8, 0[r12]{1to16} vmulps zmm9, zmm9, 0[r12]{1to16} vmulps zmm10, zmm10, 0[r12]{1to16} vmulps zmm11, zmm11, 0[r12]{1to16} vfmadd231ps zmm8, zmm31, [r9+0] vfmadd231ps zmm9, zmm31, [r9+r11+0] vfmadd231ps zmm10, zmm31, [r9+2*r11+0] vfmadd231ps zmm11, zmm31, [r9+r10+0] vmovaps [r9+0], zmm8 vmovaps [r9+r11+0], zmm9 vmovaps [r9+2*r11+0], zmm10 vmovaps [r9+r10+0], zmm11 add r9, rdi vmulps zmm12, zmm12, 0[r12]{1to16} vmulps zmm13, zmm13, 0[r12]{1to16} vmulps zmm14, zmm14, 0[r12]{1to16} vmulps zmm15, zmm15, 0[r12]{1to16} vfmadd231ps zmm12, zmm31, [r9+0] vfmadd231ps zmm13, zmm31, [r9+r11+0] vfmadd231ps zmm14, zmm31, [r9+2*r11+0] vfmadd231ps zmm15, zmm31, [r9+r10+0] vmovaps [r9+0], zmm12 vmovaps [r9+r11+0], zmm13 vmovaps [r9+2*r11+0], zmm14 vmovaps [r9+r10+0], zmm15 add r9, rdi vmulps zmm16, zmm16, 0[r12]{1to16} vmulps zmm17, zmm17, 0[r12]{1to16} vmulps zmm18, zmm18, 0[r12]{1to16} vmulps zmm19, zmm19, 0[r12]{1to16} vfmadd231ps zmm16, zmm31, [r9+0] vfmadd231ps zmm17, zmm31, [r9+r11+0] vfmadd231ps zmm18, zmm31, [r9+2*r11+0] vfmadd231ps zmm19, zmm31, [r9+r10+0] vmovaps [r9+0], zmm16 vmovaps [r9+r11+0], zmm17 vmovaps [r9+2*r11+0], zmm18 vmovaps [r9+r10+0], zmm19 add r9, rdi vmulps zmm20, zmm20, 0[r12]{1to16} vmulps zmm21, zmm21, 0[r12]{1to16} vmulps zmm22, zmm22, 0[r12]{1to16} vmulps zmm23, zmm23, 0[r12]{1to16} vfmadd231ps zmm20, zmm31, [r9+0] vfmadd231ps zmm21, zmm31, [r9+r11+0] vfmadd231ps zmm22, zmm31, [r9+2*r11+0] vfmadd231ps zmm23, zmm31, [r9+r10+0] vmovaps [r9+0], zmm20 vmovaps [r9+r11+0], zmm21 vmovaps [r9+2*r11+0], zmm22 vmovaps [r9+r10+0], zmm23 add r9, rdi vmulps zmm24, zmm24, 0[r12]{1to16} vmulps zmm25, zmm25, 0[r12]{1to16} vmulps zmm26, zmm26, 0[r12]{1to16} vmulps zmm27, zmm27, 0[r12]{1to16} vfmadd231ps zmm24, zmm31, [r9+0] vfmadd231ps zmm25, zmm31, [r9+r11+0] vfmadd231ps zmm26, zmm31, [r9+2*r11+0] vfmadd231ps zmm27, zmm31, [r9+r10+0] vmovaps [r9+0], zmm24 vmovaps [r9+r11+0], zmm25 vmovaps [r9+2*r11+0], zmm26 vmovaps 
[r9+r10+0], zmm27 add r9, rdi vmulps zmm28, zmm28, 0[r12]{1to16} vmulps zmm29, zmm29, 0[r12]{1to16} vfmadd231ps zmm28, zmm31, [r9+0] vfmadd231ps zmm29, zmm31, [r9+r11+0] vmovaps [r9+0], zmm28 vmovaps [r9+r11+0], zmm29 END: #ifdef MONITORS rdtsc mov botl, eax mov both, edx #endif } GEMM_UKR_FLUSH_CT( s ); #ifdef LOOPMON printf("looptime = \t%d\n", bloopl - tloopl); #endif #ifdef MONITORS dim_t top = ((dim_t)toph << 32) | topl; dim_t mid = ((dim_t)midh << 32) | midl; dim_t mid2 = ((dim_t)mid2h << 32) | mid2l; dim_t bot = ((dim_t)both << 32) | botl; printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot - top); #endif } cython-blis-0.9.1/blis/_src/kernels/knc/bli_kernels_knc.h000066400000000000000000000033531427272030600233550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ GEMM_UKR_PROT( float, s, gemm_knc_asm_30x16 ) GEMM_UKR_PROT( double, d, gemm_knc_asm_30x8 ) cython-blis-0.9.1/blis/_src/kernels/knl/000077500000000000000000000000001427272030600200655ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/knl/1m/000077500000000000000000000000001427272030600204025ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c000066400000000000000000000463451427272030600251400ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_INTEL #include "bli_x86_asm_macros.h" #define LOADMUL8x8(a,o,s1,s3,s5,s7, \ z0,z1,z2,z3,z4,z5,z6,z7) \ \ VMULPD(ZMM(z0), ZMM(31), MEM(a, o)) \ VMULPD(ZMM(z1), ZMM(31), MEM(a,s1,1,o)) \ VMULPD(ZMM(z2), ZMM(31), MEM(a,s1,2,o)) \ VMULPD(ZMM(z3), ZMM(31), MEM(a,s3,1,o)) \ VMULPD(ZMM(z4), ZMM(31), MEM(a,s1,4,o)) \ VMULPD(ZMM(z5), ZMM(31), MEM(a,s5,1,o)) \ VMULPD(ZMM(z6), ZMM(31), MEM(a,s3,2,o)) \ VMULPD(ZMM(z7), ZMM(31), MEM(a,s7,1,o)) #define LOADMUL8x8_MASK(a,o,s1,s3,s5,s7, \ z0,z1,z2,z3,z4,z5,z6,z7,k) \ \ VMULPD(ZMM(z0) MASK_KZ(k), ZMM(31), MEM(a, o)) \ VMULPD(ZMM(z1) MASK_KZ(k), ZMM(31), MEM(a,s1,1,o)) \ VMULPD(ZMM(z2) MASK_KZ(k), ZMM(31), MEM(a,s1,2,o)) \ VMULPD(ZMM(z3) MASK_KZ(k), ZMM(31), MEM(a,s3,1,o)) \ VMULPD(ZMM(z4) MASK_KZ(k), ZMM(31), MEM(a,s1,4,o)) \ VMULPD(ZMM(z5) MASK_KZ(k), ZMM(31), MEM(a,s5,1,o)) \ VMULPD(ZMM(z6) MASK_KZ(k), ZMM(31), MEM(a,s3,2,o)) \ VMULPD(ZMM(z7) MASK_KZ(k), ZMM(31), MEM(a,s7,1,o)) #define STORE8x8(a,o,s1,s3,s5,s7, \ z0,z1,z2,z3,z4,z5,z6,z7) \ \ VMOVUPD(MEM(a, o), ZMM(z0)) \ VMOVUPD(MEM(a,s1,1,o), ZMM(z1)) \ VMOVUPD(MEM(a,s1,2,o), ZMM(z2)) \ VMOVUPD(MEM(a,s3,1,o), ZMM(z3)) \ VMOVUPD(MEM(a,s1,4,o), ZMM(z4)) \ VMOVUPD(MEM(a,s5,1,o), ZMM(z5)) \ 
VMOVUPD(MEM(a,s3,2,o), ZMM(z6)) \ VMOVUPD(MEM(a,s7,1,o), ZMM(z7)) #define TRANSPOSE8x8(a0,a1,a2,a3,a4,a5,a6,a7, \ b0,b1,b2,b3,b4,b5,b6,b7) \ \ VUNPCKLPD(ZMM(b0), ZMM(a0), ZMM(a1)) \ VUNPCKHPD(ZMM(b1), ZMM(a0), ZMM(a1)) \ VUNPCKLPD(ZMM(b2), ZMM(a2), ZMM(a3)) \ VUNPCKHPD(ZMM(b3), ZMM(a2), ZMM(a3)) \ VUNPCKLPD(ZMM(b4), ZMM(a4), ZMM(a5)) \ VUNPCKHPD(ZMM(b5), ZMM(a4), ZMM(a5)) \ VUNPCKLPD(ZMM(b6), ZMM(a6), ZMM(a7)) \ VUNPCKHPD(ZMM(b7), ZMM(a6), ZMM(a7)) \ VSHUFF64X2(ZMM(a0), ZMM(b0), ZMM(b2), IMM(0x44)) \ VSHUFF64X2(ZMM(a1), ZMM(b1), ZMM(b3), IMM(0x44)) \ VSHUFF64X2(ZMM(a2), ZMM(b0), ZMM(b2), IMM(0xEE)) \ VSHUFF64X2(ZMM(a3), ZMM(b1), ZMM(b3), IMM(0xEE)) \ VSHUFF64X2(ZMM(a4), ZMM(b4), ZMM(b6), IMM(0x44)) \ VSHUFF64X2(ZMM(a5), ZMM(b5), ZMM(b7), IMM(0x44)) \ VSHUFF64X2(ZMM(a6), ZMM(b4), ZMM(b6), IMM(0xEE)) \ VSHUFF64X2(ZMM(a7), ZMM(b5), ZMM(b7), IMM(0xEE)) \ VSHUFF64X2(ZMM(b0), ZMM(a0), ZMM(a4), IMM(0x88)) \ VSHUFF64X2(ZMM(b1), ZMM(a1), ZMM(a5), IMM(0x88)) \ VSHUFF64X2(ZMM(b2), ZMM(a0), ZMM(a4), IMM(0xDD)) \ VSHUFF64X2(ZMM(b3), ZMM(a1), ZMM(a5), IMM(0xDD)) \ VSHUFF64X2(ZMM(b4), ZMM(a2), ZMM(a6), IMM(0x88)) \ VSHUFF64X2(ZMM(b5), ZMM(a3), ZMM(a7), IMM(0x88)) \ VSHUFF64X2(ZMM(b6), ZMM(a2), ZMM(a6), IMM(0xDD)) \ VSHUFF64X2(ZMM(b7), ZMM(a3), ZMM(a7), IMM(0xDD)) //This is an array used for the scatter/gather instructions. 
static int32_t offsets[32] __attribute__((aligned(64))) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; void bli_dpackm_knl_asm_8xk ( conj_t conja, pack_t schema, dim_t cdim_, dim_t n_, dim_t n_max_, double* restrict kappa_, double* restrict a_, inc_t inca_, inc_t lda_, double* restrict p_, inc_t ldp_, cntx_t* restrict cntx ) { const int32_t* offsetPtr = &offsets[0]; double* a = ( double* )a_; double* p = ( double* )p_; double* kappa = ( double* )kappa_; const int64_t cdim = cdim_; const int64_t mnr = 8; const int64_t n = n_; const int64_t n_max = n_max_; const int64_t inca = inca_; const int64_t lda = lda_; const int64_t ldp = ldp_; if ( cdim == mnr ) { BEGIN_ASM() MOV(RSI, VAR(n)) MOV(RAX, VAR(a)) MOV(RBX, VAR(inca)) MOV(RCX, VAR(lda)) MOV(R14, VAR(p)) MOV(RDI, VAR(ldp)) TEST(RSI, RSI) JZ(PACK8_DONE) LEA(RBX, MEM(,RBX,8)) //inca in bytes LEA(RCX, MEM(,RCX,8)) //lda in bytes LEA(RDI, MEM(,RDI,8)) //ldp in bytes LEA(R11, MEM(RDI,RDI,2)) //ldp*3 LEA(R12, MEM(RDI,RDI,4)) //ldp*5 LEA(R13, MEM(R11,RDI,4)) //ldp*7 VBROADCASTSD(ZMM(31), VAR(kappa)) CMP(RBX, IMM(8)) JNE(PACK8_T) LABEL(PACK8_N) MOV(RDX, RSI) AND(RDX, IMM(7)) SAR(RSI, IMM(3)) JZ(PACK8_N_TAIL) LEA(R8, MEM(RCX,RCX,2)) //lda*3 LEA(R9, MEM(RCX,RCX,4)) //lda*5 LEA(R10, MEM(R8 ,RCX,4)) //lda*7 LABEL(PACK8_N_LOOP) LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7) STORE8x8(R14,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7) LEA(RAX, MEM(RAX,RCX,8)) LEA(R14, MEM(R14,RDI,8)) SUB(RSI, IMM(1)) JNZ(PACK8_N_LOOP) TEST(RDX, RDX) JZ(PACK8_DONE) LABEL(PACK8_N_TAIL) VMULPD(ZMM(0), ZMM(31), MEM(RAX)) VMOVUPD(MEM(R14), ZMM(0)) LEA(RAX, MEM(RAX,RCX,1)) LEA(R14, MEM(R14,RDI,1)) SUB(RDX, IMM(1)) JNZ(PACK8_N_TAIL) JMP(PACK8_DONE) LABEL(PACK8_T) CMP(RCX, IMM(8)) JNE(PACK8_G) LEA(R8, MEM(RBX,RBX,2)) //inca*3 LEA(R9, MEM(RBX,RBX,4)) //inca*5 LEA(R10, MEM(R8 ,RBX,4)) //inca*7 MOV(RDX, RSI) AND(RDX, IMM(7)) SAR(RSI, IMM(3)) JZ(PACK8_T_TAIL) LABEL(PACK8_T_LOOP) 
LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 16,17,18,19,20,21,22,23) STORE8x8(R14,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) LEA(RAX, MEM(RAX,RCX,8)) LEA(R14, MEM(R14,RDI,8)) SUB(RSI, IMM(1)) JNZ(PACK8_T_LOOP) TEST(RDX, RDX) JZ(PACK8_DONE) LABEL(PACK8_T_TAIL) MOV(RSI, IMM(1)) SHLX(RSI, RSI, RDX) SUB(RSI, IMM(1)) KMOVW(K(1), ESI) //mask for n%8 elements LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7,1) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15) VMOVUPD(MEM(R14 ), ZMM( 8)) SUB(RDX, IMM(1)) JZ(PACK8_DONE) VMOVUPD(MEM(R14,RDI,1), ZMM( 9)) SUB(RDX, IMM(1)) JZ(PACK8_DONE) VMOVUPD(MEM(R14,RDI,2), ZMM(10)) SUB(RDX, IMM(1)) JZ(PACK8_DONE) VMOVUPD(MEM(R14,R11,1), ZMM(11)) SUB(RDX, IMM(1)) JZ(PACK8_DONE) VMOVUPD(MEM(R14,RDI,4), ZMM(12)) SUB(RDX, IMM(1)) JZ(PACK8_DONE) VMOVUPD(MEM(R14,R12,1), ZMM(13)) SUB(RDX, IMM(1)) JZ(PACK8_DONE) VMOVUPD(MEM(R14,R11,2), ZMM(14)) JMP(PACK8_DONE) LABEL(PACK8_G) VPBROADCASTD(ZMM(3), VAR(inca)) MOV(RBX, VAR(offsetPtr)) VPMULLD(YMM(0), YMM(3), MEM(RBX)) LABEL(PACK8_G_LOOP) KXNORW(K(1), K(0), K(0)) VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8)) VMULPD(ZMM(3), ZMM(3), ZMM(31)) VMOVUPD(MEM(R14), ZMM(3)) LEA(RAX, MEM(RAX,RCX,1)) LEA(R14, MEM(R14,RDI,1)) SUB(RSI, IMM(1)) JNZ(PACK8_G_LOOP) LABEL(PACK8_DONE) END_ASM( : //output operands : //input operands [n] "m" (n), [kappa] "m" (*kappa), [a] "m" (a), [inca] "m" (inca), [lda] "m" (lda), [p] "m" (p), [ldp] "m" (ldp), [offsetPtr] "m" (offsetPtr) : //clobbers "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory" ) } else // if ( cdim < mnr ) { bli_dscal2m_ex \ ( \ 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ ( trans_t )conja, \ 
cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, \ cntx, \ NULL \ ); \ // if ( cdim < mnr ) { const dim_t i = cdim; const dim_t m_edge = mnr - i; const dim_t n_edge = n_max; double* restrict p_edge = p + (i )*1; bli_dset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } if ( n < n_max ) { const dim_t j = n; const dim_t m_edge = mnr; const dim_t n_edge = n_max - j; double* restrict p_edge = p + (j )*ldp; bli_dset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } void bli_dpackm_knl_asm_24xk ( conj_t conja, pack_t schema, dim_t cdim_, dim_t n_, dim_t n_max_, double* restrict kappa_, double* restrict a_, inc_t inca_, inc_t lda_, double* restrict p_, inc_t ldp_, cntx_t* restrict cntx ) { const int32_t* offsetPtr = &offsets[0]; double* a = ( double* )a_; double* p = ( double* )p_; double* kappa = ( double* )kappa_; const int64_t cdim = cdim_; const int64_t mnr = 24; const int64_t n = n_; const int64_t n_max = n_max_; const int64_t inca = inca_; const int64_t lda = lda_; const int64_t ldp = ldp_; if ( cdim == mnr ) { BEGIN_ASM() MOV(RSI, VAR(n)) MOV(RAX, VAR(a)) MOV(RBX, VAR(inca)) MOV(RCX, VAR(lda)) MOV(R15, VAR(p)) MOV(RDI, VAR(ldp)) LEA(RBX, MEM(,RBX,8)) //inca in bytes LEA(RCX, MEM(,RCX,8)) //lda in bytes LEA(RDI, MEM(,RDI,8)) //ldp in bytes LEA(R11, MEM(RDI,RDI,2)) //ldp*3 LEA(R12, MEM(RDI,RDI,4)) //ldp*5 LEA(R13, MEM(R11,RDI,4)) //ldp*7 VBROADCASTSD(ZMM(31), VAR(kappa)) TEST(RSI, RSI) JZ(PACK24_DONE) CMP(RBX, IMM(8)) JNE(PACK24_T) LABEL(PACK24_N) SAR(RSI, IMM(3)) JZ(PACK24_N_TAIL) LEA(R8, MEM(RCX,RCX,2)) //lda*3 LEA(R9, MEM(RCX,RCX,4)) //lda*5 LEA(R10, MEM(R8 ,RCX,4)) //lda*7 LABEL(PACK24_N_LOOP) LOADMUL8x8(RAX, 0,RCX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7) LOADMUL8x8(RAX, 64,RCX,R8,R9,R10, 8, 9,10,11,12,13,14,15) LOADMUL8x8(RAX,128,RCX,R8,R9,R10,16,17,18,19,20,21,22,23) STORE8x8(R15, 0,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7) STORE8x8(R15, 64,RDI,R11,R12,R13, 8, 9,10,11,12,13,14,15) STORE8x8(R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) LEA(RAX, MEM(RAX,RCX,8)) LEA(R15, 
MEM(R15,RDI,8)) SUB(RSI, IMM(1)) JNZ(PACK24_N_LOOP) LABEL(PACK24_N_TAIL) MOV(RSI, VAR(n)) AND(RSI, IMM(7)) TEST(RSI, RSI) JZ(PACK24_DONE) LABEL(PACK24_N_TAIL_LOOP) VMULPD(ZMM(0), ZMM(31), MEM(RAX, 0)) VMULPD(ZMM(1), ZMM(31), MEM(RAX, 64)) VMULPD(ZMM(2), ZMM(31), MEM(RAX,128)) VMOVUPD(MEM(R15, 0), ZMM(0)) VMOVUPD(MEM(R15, 64), ZMM(1)) VMOVUPD(MEM(R15,128), ZMM(2)) LEA(RAX, MEM(RAX,RCX,1)) LEA(R15, MEM(R15,RDI,1)) SUB(RSI, IMM(1)) JNZ(PACK24_N_TAIL_LOOP) JMP(PACK24_DONE) LABEL(PACK24_T) CMP(RCX, IMM(8)) JNE(PACK24_G) LEA(R8, MEM(RBX,RBX,2)) //inca*3 LEA(R9, MEM(RBX,RBX,4)) //inca*5 LEA(R10, MEM(R8 ,RBX,4)) //inca*7 LEA(R14, MEM(RAX,RBX,8)) LEA(RCX, MEM(R14,RBX,8)) SAR(RSI, IMM(3)) JZ(PACK24_T_TAIL) LABEL(PACK24_T_LOOP) LOADMUL8x8(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7) LOADMUL8x8(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 16,17,18,19,20,21,22,23) STORE8x8(R15, 0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) LOADMUL8x8(RCX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7) TRANSPOSE8x8( 8, 9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23) STORE8x8(R15, 64,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 16,17,18,19,20,21,22,23) STORE8x8(R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) LEA(RAX, MEM(RAX,64)) LEA(R14, MEM(R14,64)) LEA(RCX, MEM(RCX,64)) LEA(R15, MEM(R15,RDI,8)) SUB(RSI, IMM(1)) JNZ(PACK24_T_LOOP) LABEL(PACK24_T_TAIL) MOV(RSI, VAR(n)) AND(RSI, IMM(7)) TEST(RSI, RSI) JZ(PACK24_DONE) MOV(R13, IMM(1)) SHLX(R13, R13, RSI) SUB(R13, IMM(1)) KMOVW(K(1), R13D) //mask for n%8 elements LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7,1) LOADMUL8x8_MASK(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15,1) LOADMUL8x8_MASK(RCX,0,RBX,R8,R9,R10,16,17,18,19,20,21,22,23,1) TRANSPOSE8x8(16,17,18,19,20,21,22,23, 24,25,26,27,28,29,30,31) TRANSPOSE8x8( 8, 9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15) VMOVUPD(MEM(R15, 0), ZMM( 8)) VMOVUPD(MEM(R15, 64), 
ZMM(16)) VMOVUPD(MEM(R15, 128), ZMM(24)) SUB(RSI, IMM(1)) JZ(PACK24_DONE) VMOVUPD(MEM(R15,RDI,1, 0), ZMM( 9)) VMOVUPD(MEM(R15,RDI,1, 64), ZMM(17)) VMOVUPD(MEM(R15,RDI,1,128), ZMM(25)) SUB(RSI, IMM(1)) JZ(PACK24_DONE) VMOVUPD(MEM(R15,RDI,2, 0), ZMM(10)) VMOVUPD(MEM(R15,RDI,2, 64), ZMM(18)) VMOVUPD(MEM(R15,RDI,2,128), ZMM(26)) SUB(RSI, IMM(1)) JZ(PACK24_DONE) VMOVUPD(MEM(R15,R11,1, 0), ZMM(11)) VMOVUPD(MEM(R15,R11,1, 64), ZMM(19)) VMOVUPD(MEM(R15,R11,1,128), ZMM(27)) SUB(RSI, IMM(1)) JZ(PACK24_DONE) VMOVUPD(MEM(R15,RDI,4, 0), ZMM(12)) VMOVUPD(MEM(R15,RDI,4, 64), ZMM(20)) VMOVUPD(MEM(R15,RDI,4,128), ZMM(28)) SUB(RSI, IMM(1)) JZ(PACK24_DONE) VMOVUPD(MEM(R15,R12,1, 0), ZMM(13)) VMOVUPD(MEM(R15,R12,1, 64), ZMM(21)) VMOVUPD(MEM(R15,R12,1,128), ZMM(29)) SUB(RSI, IMM(1)) JZ(PACK24_DONE) VMOVUPD(MEM(R15,R11,2, 0), ZMM(14)) VMOVUPD(MEM(R15,R11,2, 64), ZMM(22)) VMOVUPD(MEM(R15,R11,2,128), ZMM(30)) JMP(PACK24_DONE) LABEL(PACK24_G) VPBROADCASTD(ZMM(3), VAR(inca)) MOV(RBX, VAR(offsetPtr)) VPMULLD(YMM(0), YMM(3), MEM(RBX, 0)) VPMULLD(YMM(1), YMM(3), MEM(RBX,32)) VPMULLD(YMM(2), YMM(3), MEM(RBX,64)) LABEL(PACK24_G_LOOP) KXNORW(K(1), K(0), K(0)) KXNORW(K(2), K(0), K(0)) KXNORW(K(3), K(0), K(0)) VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8)) VGATHERDPD(ZMM(4) MASK_K(2), MEM(RAX,YMM(1),8)) VGATHERDPD(ZMM(5) MASK_K(3), MEM(RAX,YMM(2),8)) VMULPD(ZMM(3), ZMM(3), ZMM(31)) VMULPD(ZMM(4), ZMM(4), ZMM(31)) VMULPD(ZMM(5), ZMM(5), ZMM(31)) VMOVUPD(MEM(R15, 0), ZMM(3)) VMOVUPD(MEM(R15, 64), ZMM(4)) VMOVUPD(MEM(R15,128), ZMM(5)) LEA(RAX, MEM(RAX,RCX,1)) LEA(R15, MEM(R15,RDI,1)) SUB(RSI, IMM(1)) JNZ(PACK24_G_LOOP) LABEL(PACK24_DONE) END_ASM( : //output operands : //input operands [n] "m" (n), [kappa] "m" (*kappa), [a] "m" (a), [inca] "m" (inca), [lda] "m" (lda), [p] "m" (p), [ldp] "m" (ldp), [offsetPtr] "m" (offsetPtr) : //clobbers "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", 
"zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "rax", "rbx", "rcx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory" ) } else // if ( cdim < mnr ) { bli_dscal2m_ex \ ( \ 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ ( trans_t )conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, \ cntx, \ NULL \ ); \ // if ( cdim < mnr ) { const dim_t i = cdim; const dim_t m_edge = mnr - i; const dim_t n_edge = n_max; double* restrict p_edge = p + (i )*1; bli_dset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } if ( n < n_max ) { const dim_t j = n; const dim_t m_edge = mnr; const dim_t n_edge = n_max - j; double* restrict p_edge = p + (j )*ldp; bli_dset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } cython-blis-0.9.1/blis/_src/kernels/knl/1m/bli_spackm_knl_asm_24x16.c000066400000000000000000000502301427272030600252220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_INTEL #include "bli_x86_asm_macros.h" #define LOADMUL8x8(a,o,s1,s3,s5,s7, \ z0,z1,z2,z3,z4,z5,z6,z7) \ \ VMULPS(YMM(z0), YMM(15), MEM(a, o)) \ VMULPS(YMM(z1), YMM(15), MEM(a,s1,1,o)) \ VMULPS(YMM(z2), YMM(15), MEM(a,s1,2,o)) \ VMULPS(YMM(z3), YMM(15), MEM(a,s3,1,o)) \ VMULPS(YMM(z4), YMM(15), MEM(a,s1,4,o)) \ VMULPS(YMM(z5), YMM(15), MEM(a,s5,1,o)) \ VMULPS(YMM(z6), YMM(15), MEM(a,s3,2,o)) \ VMULPS(YMM(z7), YMM(15), MEM(a,s7,1,o)) #define STORE8x8(a,o,s, \ z0,z1,z2,z3,z4,z5,z6,z7) \ \ VMOVUPS(MEM(a,(o)+0*(s)), YMM(z0)) \ VMOVUPS(MEM(a,(o)+1*(s)), YMM(z1)) \ VMOVUPS(MEM(a,(o)+2*(s)), YMM(z2)) \ VMOVUPS(MEM(a,(o)+3*(s)), YMM(z3)) \ VMOVUPS(MEM(a,(o)+4*(s)), YMM(z4)) \ VMOVUPS(MEM(a,(o)+5*(s)), YMM(z5)) \ VMOVUPS(MEM(a,(o)+6*(s)), YMM(z6)) \ VMOVUPS(MEM(a,(o)+7*(s)), YMM(z7)) #define STORETRANS8x8(a,o,s, \ a0,a1,a2,a3,a4,a5,a6,a7, \ t0,t1,t2,t3,t4,t5) \ \ VUNPCKLPS(YMM(t0), YMM(a0), YMM(a1)) \ VUNPCKLPS(YMM(t2), YMM(a2), YMM(a3)) \ VUNPCKLPS(YMM(t1), YMM(a4), YMM(a5)) \ VUNPCKLPS(YMM(t3), YMM(a6), YMM(a7)) \ \ VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0x44)) \ VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0x44)) \ VMOVUPS(MEM(a,(o )+0*(s)), XMM(t4)) \ VMOVUPS(MEM(a,(o+16)+0*(s)), XMM(t5)) \ VEXTRACTF128(MEM(a,(o )+4*(s)), YMM(t4), IMM(1)) \ VEXTRACTF128(MEM(a,(o+16)+4*(s)), YMM(t5), IMM(1)) \ \ VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0xEE)) \ VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0xEE)) \ VMOVUPS(MEM(a,(o )+1*(s)), XMM(t4)) \ 
VMOVUPS(MEM(a,(o+16)+1*(s)), XMM(t5)) \ VEXTRACTF128(MEM(a,(o )+5*(s)), YMM(t4), IMM(1)) \ VEXTRACTF128(MEM(a,(o+16)+5*(s)), YMM(t5), IMM(1)) \ \ VUNPCKHPS(YMM(t0), YMM(a0), YMM(a1)) \ VUNPCKHPS(YMM(t2), YMM(a2), YMM(a3)) \ VUNPCKHPS(YMM(t1), YMM(a4), YMM(a5)) \ VUNPCKHPS(YMM(t3), YMM(a6), YMM(a7)) \ \ VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0x44)) \ VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0x44)) \ VMOVUPS(MEM(a,(o )+2*(s)), XMM(t4)) \ VMOVUPS(MEM(a,(o+16)+2*(s)), XMM(t5)) \ VEXTRACTF128(MEM(a,(o )+6*(s)), YMM(t4), IMM(1)) \ VEXTRACTF128(MEM(a,(o+16)+6*(s)), YMM(t5), IMM(1)) \ \ VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0xEE)) \ VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0xEE)) \ VMOVUPS(MEM(a,(o )+3*(s)), XMM(t4)) \ VMOVUPS(MEM(a,(o+16)+3*(s)), XMM(t5)) \ VEXTRACTF128(MEM(a,(o )+7*(s)), YMM(t4), IMM(1)) \ VEXTRACTF128(MEM(a,(o+16)+7*(s)), YMM(t5), IMM(1)) //This is an array used for the scatter/gather instructions. static int32_t offsets[32] __attribute__((aligned(64))) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; void bli_spackm_knl_asm_16xk ( conj_t conja, pack_t schema, dim_t cdim_, dim_t n_, dim_t n_max_, float* restrict kappa_, float* restrict a_, inc_t inca_, inc_t lda_, float* restrict p_, inc_t ldp_, cntx_t* restrict cntx ) { const int32_t* offsetPtr = &offsets[0]; float* a = ( float* )a_; float* p = ( float* )p_; float* kappa = ( float* )kappa_; const int64_t cdim = cdim_; const int64_t mnr = 16; const int64_t n = n_; const int64_t n_max = n_max_; const int64_t inca = inca_; const int64_t lda = lda_; const int64_t ldp = ldp_; if ( cdim == mnr ) { BEGIN_ASM() MOV(RSI, VAR(n)) MOV(RAX, VAR(a)) MOV(RBX, VAR(inca)) MOV(RCX, VAR(lda)) MOV(R14, VAR(p)) TEST(RSI, RSI) JZ(PACK16_DONE) LEA(RBX, MEM(,RBX,4)) //inca in bytes LEA(RCX, MEM(,RCX,4)) //lda in bytes VBROADCASTSS(YMM(15), VAR(kappa)) CMP(RBX, IMM(4)) JNE(PACK16_T) LABEL(PACK16_N) MOV(RDX, RSI) AND(RDX, IMM(7)) SAR(RSI, IMM(3)) JZ(PACK16_N_TAIL) LEA(R8, 
MEM(RCX,RCX,2)) //lda*3 LEA(R9, MEM(RCX,RCX,4)) //lda*5 LEA(R10, MEM(R8 ,RCX,4)) //lda*7 LABEL(PACK16_N_LOOP) LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7) STORE8x8(R14,0,16*4,0,1,2,3,4,5,6,7) LOADMUL8x8(RAX,32,RCX,R8,R9,R10,0,1,2,3,4,5,6,7) STORE8x8(R14,32,16*4,0,1,2,3,4,5,6,7) LEA(RAX, MEM(RAX,RCX,8)) LEA(R14, MEM(R14,16*8*4)) SUB(RSI, IMM(1)) JNZ(PACK16_N_LOOP) TEST(RDX, RDX) JZ(PACK16_DONE) LABEL(PACK16_N_TAIL) VMULPS(YMM(0), YMM(15), MEM(RAX )) VMULPS(YMM(1), YMM(15), MEM(RAX,32)) VMOVUPS(MEM(R14 ), YMM(0)) VMOVUPS(MEM(R14,32), YMM(1)) LEA(RAX, MEM(RAX,RCX,1)) LEA(R14, MEM(R14, 16*4)) SUB(RDX, IMM(1)) JNZ(PACK16_N_TAIL) JMP(PACK16_DONE) LABEL(PACK16_T) CMP(RCX, IMM(4)) JNE(PACK16_G) LEA(R8, MEM(RBX,RBX,2)) //inca*3 LEA(R9, MEM(RBX,RBX,4)) //inca*5 LEA(R10, MEM(R8 ,RBX,4)) //inca*7 LEA(R11, MEM(RAX,RBX,8)) MOV(RDX, RSI) AND(RDX, IMM(7)) SAR(RSI, IMM(3)) JZ(PACK16_T_TAIL) LABEL(PACK16_T_LOOP) LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7) STORETRANS8x8(R14,0,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13) LOADMUL8x8(R11,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7) STORETRANS8x8(R14,32,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13) LEA(RAX, MEM(RAX, 8*4)) LEA(R11, MEM(R11, 8*4)) LEA(R14, MEM(R14,16*8*4)) SUB(RSI, IMM(1)) JNZ(PACK16_T_LOOP) TEST(RDX, RDX) JZ(PACK16_DONE) LABEL(PACK16_T_TAIL) VMULSS(XMM(0), XMM(15), MEM(RAX )) VMULSS(XMM(1), XMM(15), MEM(RAX,RBX,1)) VMULSS(XMM(2), XMM(15), MEM(RAX,RBX,2)) VMULSS(XMM(3), XMM(15), MEM(RAX,R8 ,1)) VMULSS(XMM(4), XMM(15), MEM(RAX,RBX,4)) VMULSS(XMM(5), XMM(15), MEM(RAX,R9 ,1)) VMULSS(XMM(6), XMM(15), MEM(RAX,R8 ,2)) VMULSS(XMM(7), XMM(15), MEM(RAX,R10,1)) VMOVSS(MEM(R14,0*4), XMM(0)) VMOVSS(MEM(R14,1*4), XMM(1)) VMOVSS(MEM(R14,2*4), XMM(2)) VMOVSS(MEM(R14,3*4), XMM(3)) VMOVSS(MEM(R14,4*4), XMM(4)) VMOVSS(MEM(R14,5*4), XMM(5)) VMOVSS(MEM(R14,6*4), XMM(6)) VMOVSS(MEM(R14,7*4), XMM(7)) VMULSS(XMM(0), XMM(15), MEM(R11 )) VMULSS(XMM(1), XMM(15), MEM(R11,RBX,1)) VMULSS(XMM(2), XMM(15), MEM(R11,RBX,2)) VMULSS(XMM(3), XMM(15), MEM(R11,R8 ,1)) 
VMULSS(XMM(4), XMM(15), MEM(R11,RBX,4)) VMULSS(XMM(5), XMM(15), MEM(R11,R9 ,1)) VMULSS(XMM(6), XMM(15), MEM(R11,R8 ,2)) VMULSS(XMM(7), XMM(15), MEM(R11,R10,1)) VMOVSS(MEM(R14, 8*4), XMM(0)) VMOVSS(MEM(R14, 9*4), XMM(1)) VMOVSS(MEM(R14,10*4), XMM(2)) VMOVSS(MEM(R14,11*4), XMM(3)) VMOVSS(MEM(R14,12*4), XMM(4)) VMOVSS(MEM(R14,13*4), XMM(5)) VMOVSS(MEM(R14,14*4), XMM(6)) VMOVSS(MEM(R14,15*4), XMM(7)) LEA(RAX, MEM(RAX, 4)) LEA(R11, MEM(R11, 4)) LEA(R14, MEM(R14,16*4)) SUB(RDX, IMM(1)) JNZ(PACK16_T_TAIL) JMP(PACK16_DONE) LABEL(PACK16_G) VPBROADCASTD(ZMM(3), VAR(inca)) MOV(RBX, VAR(offsetPtr)) VPMULLD(ZMM(0), ZMM(3), MEM(RBX)) LABEL(PACK16_G_LOOP) KXNORW(K(1), K(0), K(0)) VGATHERDPS(ZMM(3) MASK_K(1), MEM(RAX,ZMM(0),8)) VMULPS(ZMM(3), ZMM(3), ZMM(15)) VMOVUPS(MEM(R14), ZMM(3)) LEA(RAX, MEM(RAX,RCX,1)) LEA(R14, MEM(R14, 16*4)) SUB(RSI, IMM(1)) JNZ(PACK16_G_LOOP) LABEL(PACK16_DONE) END_ASM( : //output operands : //input operands [n] "m" (n), [kappa] "m" (*kappa), [a] "m" (a), [inca] "m" (inca), [lda] "m" (lda), [p] "m" (p), [ldp] "m" (ldp), [offsetPtr] "m" (offsetPtr) : //clobbers "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory" ) } else // if ( cdim < mnr ) { bli_sscal2m_ex \ ( \ 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ ( trans_t )conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, \ cntx, \ NULL \ ); \ // if ( cdim < mnr ) { const dim_t i = cdim; const dim_t m_edge = mnr - i; const dim_t n_edge = n_max; float* restrict p_edge = p + (i )*1; bli_sset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } if ( n < n_max ) { const dim_t j = n; const dim_t m_edge = mnr; const dim_t n_edge = n_max - j; float* restrict p_edge = p + (j )*ldp; bli_sset0s_mxn ( 
m_edge, n_edge, p_edge, 1, ldp ); } } void bli_spackm_knl_asm_24xk ( conj_t conja, pack_t schema, dim_t cdim_, dim_t n_, dim_t n_max_, float* restrict kappa_, float* restrict a_, inc_t inca_, inc_t lda_, float* restrict p_, inc_t ldp_, cntx_t* restrict cntx ) { const int32_t* offsetPtr = &offsets[0]; float* a = ( float* )a_; float* p = ( float* )p_; float* kappa = ( float* )kappa_; const int64_t cdim = cdim_; const int64_t mnr = 24; const int64_t n = n_; const int64_t n_max = n_max_; const int64_t inca = inca_; const int64_t lda = lda_; const int64_t ldp = ldp_; if ( cdim == mnr ) { BEGIN_ASM() MOV(RSI, VAR(n)) MOV(RAX, VAR(a)) MOV(RBX, VAR(inca)) MOV(RCX, VAR(lda)) MOV(R14, VAR(p)) MOV(RDI, VAR(ldp)) TEST(RSI, RSI) JZ(PACK24_DONE) LEA(RBX, MEM(,RBX,4)) //inca in bytes LEA(RCX, MEM(,RCX,4)) //lda in bytes LEA(RDI, MEM(,RDI,4)) //ldp in bytes VBROADCASTSS(ZMM(15), VAR(kappa)) CMP(RBX, IMM(4)) JNE(PACK24_T) LABEL(PACK24_N) MOV(RDX, RSI) AND(RDX, IMM(7)) SAR(RSI, IMM(3)) JZ(PACK24_N_TAIL) LEA(R8, MEM(RCX,RCX,2)) //lda*3 LEA(R9, MEM(RCX,RCX,4)) //lda*5 LEA(R10, MEM(R8 ,RCX,4)) //lda*7 LABEL(PACK24_N_LOOP) LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7) STORE8x8(R14,0,24*4,0,1,2,3,4,5,6,7) LOADMUL8x8(RAX,32,RCX,R8,R9,R10,0,1,2,3,4,5,6,7) STORE8x8(R14,32,24*4,0,1,2,3,4,5,6,7) LOADMUL8x8(RAX,64,RCX,R8,R9,R10,0,1,2,3,4,5,6,7) STORE8x8(R14,64,24*4,0,1,2,3,4,5,6,7) LEA(RAX, MEM(RAX,RCX,8)) LEA(R14, MEM(R14,RDI,8)) SUB(RSI, IMM(1)) JNZ(PACK24_N_LOOP) TEST(RDX, RDX) JZ(PACK24_DONE) LABEL(PACK24_N_TAIL) VMULPS(ZMM(0), ZMM(15), MEM(RAX)) VMOVUPS(MEM(R14), ZMM(0)) VMULPS(YMM(1), YMM(15), MEM(RAX,64)) VMOVUPS(MEM(R14,64), YMM(1)) LEA(RAX, MEM(RAX,RCX,1)) LEA(R14, MEM(R14,RDI,1)) SUB(RDX, IMM(1)) JNZ(PACK24_N_TAIL) JMP(PACK24_DONE) LABEL(PACK24_T) CMP(RCX, IMM(4)) JNE(PACK24_G) LEA(R8, MEM(RBX,RBX,2)) //inca*3 LEA(R9, MEM(RBX,RBX,4)) //inca*5 LEA(R10, MEM(R8 ,RBX,4)) //inca*7 LEA(R11, MEM(RAX,RBX,8)) LEA(R12, MEM(R11,RBX,8)) MOV(RDX, RSI) AND(RDX, IMM(7)) SAR(RSI, IMM(3)) 
JZ(PACK24_T_TAIL) LABEL(PACK24_T_LOOP) LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7) STORETRANS8x8(R14,0,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13) LOADMUL8x8(R11,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7) STORETRANS8x8(R14,32,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13) LOADMUL8x8(R12,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7) STORETRANS8x8(R14,64,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13) LEA(RAX, MEM(RAX,RCX,8)) LEA(R11, MEM(R11,RCX,8)) LEA(R12, MEM(R12,RCX,8)) LEA(R14, MEM(R14,RDI,8)) SUB(RSI, IMM(1)) JNZ(PACK24_T_LOOP) TEST(RDX, RDX) JZ(PACK24_DONE) LABEL(PACK24_T_TAIL) VMULSS(XMM(0), XMM(15), MEM(RAX)) VMULSS(XMM(1), XMM(15), MEM(RAX,RBX,1)) VMULSS(XMM(2), XMM(15), MEM(RAX,RBX,2)) VMULSS(XMM(3), XMM(15), MEM(RAX,R8,1)) VMULSS(XMM(4), XMM(15), MEM(RAX,RBX,4)) VMULSS(XMM(5), XMM(15), MEM(RAX,R9,1)) VMULSS(XMM(6), XMM(15), MEM(RAX,R8,2)) VMULSS(XMM(7), XMM(15), MEM(RAX,R10,1)) VMOVSS(MEM(R14,0*4), XMM(0)) VMOVSS(MEM(R14,1*4), XMM(1)) VMOVSS(MEM(R14,2*4), XMM(2)) VMOVSS(MEM(R14,3*4), XMM(3)) VMOVSS(MEM(R14,4*4), XMM(4)) VMOVSS(MEM(R14,5*4), XMM(5)) VMOVSS(MEM(R14,6*4), XMM(6)) VMOVSS(MEM(R14,7*4), XMM(7)) VMULSS(XMM(0), XMM(15), MEM(R11)) VMULSS(XMM(1), XMM(15), MEM(R11,RBX,1)) VMULSS(XMM(2), XMM(15), MEM(R11,RBX,2)) VMULSS(XMM(3), XMM(15), MEM(R11,R8,1)) VMULSS(XMM(4), XMM(15), MEM(R11,RBX,4)) VMULSS(XMM(5), XMM(15), MEM(R11,R9,1)) VMULSS(XMM(6), XMM(15), MEM(R11,R8,2)) VMULSS(XMM(7), XMM(15), MEM(R11,R10,1)) VMOVSS(MEM(R14, 8*4), XMM(0)) VMOVSS(MEM(R14, 9*4), XMM(1)) VMOVSS(MEM(R14,10*4), XMM(2)) VMOVSS(MEM(R14,11*4), XMM(3)) VMOVSS(MEM(R14,12*4), XMM(4)) VMOVSS(MEM(R14,13*4), XMM(5)) VMOVSS(MEM(R14,14*4), XMM(6)) VMOVSS(MEM(R14,15*4), XMM(7)) VMULSS(XMM(0), XMM(15), MEM(R12)) VMULSS(XMM(1), XMM(15), MEM(R12,RBX,1)) VMULSS(XMM(2), XMM(15), MEM(R12,RBX,2)) VMULSS(XMM(3), XMM(15), MEM(R12,R8,1)) VMULSS(XMM(4), XMM(15), MEM(R12,RBX,4)) VMULSS(XMM(5), XMM(15), MEM(R12,R9,1)) VMULSS(XMM(6), XMM(15), MEM(R12,R8,2)) VMULSS(XMM(7), XMM(15), MEM(R12,R10,1)) VMOVSS(MEM(R14,16*4), XMM(0)) 
VMOVSS(MEM(R14,17*4), XMM(1)) VMOVSS(MEM(R14,18*4), XMM(2)) VMOVSS(MEM(R14,19*4), XMM(3)) VMOVSS(MEM(R14,20*4), XMM(4)) VMOVSS(MEM(R14,21*4), XMM(5)) VMOVSS(MEM(R14,22*4), XMM(6)) VMOVSS(MEM(R14,23*4), XMM(7)) LEA(RAX, MEM(RAX,RCX,1)) LEA(R11, MEM(R11,RCX,1)) LEA(R12, MEM(R12,RCX,1)) LEA(R14, MEM(R14,RDI,1)) SUB(RDX, IMM(1)) JNZ(PACK24_T_TAIL) JMP(PACK24_DONE) LABEL(PACK24_G) VPBROADCASTD(ZMM(3), VAR(inca)) MOV(RBX, VAR(offsetPtr)) VPMULLD(ZMM(0), ZMM(3), MEM(RBX)) LEA(R11, MEM(RAX,RBX,8)) LEA(R11, MEM(R11,RBX,8)) LABEL(PACK24_G_LOOP) KXNORW(K(1), K(0), K(0)) KSHIFTRW(K(2), K(1), IMM(8)) VGATHERDPS(ZMM(3) MASK_K(1), MEM(RAX,ZMM(0),8)) VGATHERDPS(ZMM(4) MASK_K(2), MEM(R11,ZMM(0),8)) VMULPS(ZMM(3), ZMM(3), ZMM(15)) VMULPS(YMM(4), YMM(4), YMM(15)) VMOVUPS(MEM(R14), ZMM(3)) VMOVUPS(MEM(R14,64), YMM(4)) LEA(RAX, MEM(RAX,RCX,1)) LEA(R14, MEM(R14,RDI,1)) SUB(RSI, IMM(1)) JNZ(PACK24_G_LOOP) LABEL(PACK24_DONE) END_ASM( : //output operands : //input operands [n] "m" (n), [kappa] "m" (*kappa), [a] "m" (a), [inca] "m" (inca), [lda] "m" (lda), [p] "m" (p), [ldp] "m" (ldp), [offsetPtr] "m" (offsetPtr) : //clobbers "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory" ) } else // if ( cdim < mnr ) { bli_sscal2m_ex \ ( \ 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ ( trans_t )conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, \ cntx, \ NULL \ ); \ // if ( cdim < mnr ) { const dim_t i = cdim; const dim_t m_edge = mnr - i; const dim_t n_edge = n_max; float* restrict p_edge = p + (i )*1; bli_sset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } if ( n < n_max ) { const dim_t j = n; const dim_t m_edge = mnr; const dim_t n_edge = n_max - j; float* restrict p_edge = p + 
(j )*ldp; bli_sset0s_mxn ( m_edge, n_edge, p_edge, 1, ldp ); } } cython-blis-0.9.1/blis/_src/kernels/knl/1m/old/000077500000000000000000000000001427272030600211605ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/knl/1m/old/bli_packm_knl_asm_30x8.c000066400000000000000000000365131427272030600255430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "bli_avx512_macros.h"
#include "blis.h"

/*
   LOADMUL8x8: multiply eight 8-double vectors by ZMM(31) and leave the
   products in ZMM(z0)..ZMM(z7). The vectors are read from a+o, a+s1+o,
   a+2*s1+o, a+s3+o, a+4*s1+o, a+s5+o, a+2*s3+o and a+s7+o; the stride
   registers s3/s5/s7 are supplied by the caller.
   NOTE(review): presumably s3 = 3*s1, s5 = 5*s1, s7 = 7*s1 and ZMM(31)
   holds the broadcast kappa -- confirm against the call sites.
*/
#define LOADMUL8x8(a,o,s1,s3,s5,s7, \
                   z0,z1,z2,z3,z4,z5,z6,z7) \
    \
    VMULPD(ZMM(z0), ZMM(31), MEM(a,     o)) \
    VMULPD(ZMM(z1), ZMM(31), MEM(a,s1,1,o)) \
    VMULPD(ZMM(z2), ZMM(31), MEM(a,s1,2,o)) \
    VMULPD(ZMM(z3), ZMM(31), MEM(a,s3,1,o)) \
    VMULPD(ZMM(z4), ZMM(31), MEM(a,s1,4,o)) \
    VMULPD(ZMM(z5), ZMM(31), MEM(a,s5,1,o)) \
    VMULPD(ZMM(z6), ZMM(31), MEM(a,s3,2,o)) \
    VMULPD(ZMM(z7), ZMM(31), MEM(a,s7,1,o))

/*
   LOADMUL6x8: same as LOADMUL8x8 but only for the first six vectors
   (ZMM(z0)..ZMM(z5)); no seventh/eighth register is written.
*/
#define LOADMUL6x8(a,o,s1,s3,s5, \
                   z0,z1,z2,z3,z4,z5) \
    \
    VMULPD(ZMM(z0), ZMM(31), MEM(a,     o)) \
    VMULPD(ZMM(z1), ZMM(31), MEM(a,s1,1,o)) \
    VMULPD(ZMM(z2), ZMM(31), MEM(a,s1,2,o)) \
    VMULPD(ZMM(z3), ZMM(31), MEM(a,s3,1,o)) \
    VMULPD(ZMM(z4), ZMM(31), MEM(a,s1,4,o)) \
    VMULPD(ZMM(z5), ZMM(31), MEM(a,s5,1,o))

/*
   LOADMUL8x6: LOADMUL8x8 restricted to the low six lanes of each vector.
   K(7) is set to all ones (16 bits) and shifted right by 10, leaving 0x3F,
   then passed as the mask to LOADMUL8x8_MASK. Overwrites K(7).
*/
#define LOADMUL8x6(a,o,s1,s3,s5,s7, \
                   z0,z1,z2,z3,z4,z5,z6,z7) \
    \
    KXNORW(K(7), K(0), K(0)) \
    KSHIFTRW(K(7), K(7), IMM(10)) \
    LOADMUL8x8_MASK(a,o,s1,s3,s5,s7,z0,z1,z2,z3,z4,z5,z6,z7,7)

/*
   LOADMUL8x8_MASK: LOADMUL8x8 with every multiply predicated on mask
   register K(k) through MASK_KZ.
   NOTE(review): MASK_KZ is defined in bli_avx512_macros.h; by its name it
   is presumably zeroing-masking (inactive lanes become 0) -- confirm.
*/
#define LOADMUL8x8_MASK(a,o,s1,s3,s5,s7, \
                        z0,z1,z2,z3,z4,z5,z6,z7,k) \
    \
    VMULPD(ZMM(z0) MASK_KZ(k), ZMM(31), MEM(a,     o)) \
    VMULPD(ZMM(z1) MASK_KZ(k), ZMM(31), MEM(a,s1,1,o)) \
    VMULPD(ZMM(z2) MASK_KZ(k), ZMM(31), MEM(a,s1,2,o)) \
    VMULPD(ZMM(z3) MASK_KZ(k), ZMM(31), MEM(a,s3,1,o)) \
    VMULPD(ZMM(z4) MASK_KZ(k), ZMM(31), MEM(a,s1,4,o)) \
    VMULPD(ZMM(z5) MASK_KZ(k), ZMM(31), MEM(a,s5,1,o)) \
    VMULPD(ZMM(z6) MASK_KZ(k), ZMM(31), MEM(a,s3,2,o)) \
    VMULPD(ZMM(z7) MASK_KZ(k), ZMM(31), MEM(a,s7,1,o))

/*
   LOADMUL6x8_MASK: masked variant of LOADMUL6x8 (six vectors, mask K(k)).
*/
#define LOADMUL6x8_MASK(a,o,s1,s3,s5, \
                        z0,z1,z2,z3,z4,z5,k) \
    \
    VMULPD(ZMM(z0) MASK_KZ(k), ZMM(31), MEM(a,     o)) \
    VMULPD(ZMM(z1) MASK_KZ(k), ZMM(31), MEM(a,s1,1,o)) \
    VMULPD(ZMM(z2) MASK_KZ(k), ZMM(31), MEM(a,s1,2,o)) \
    VMULPD(ZMM(z3) MASK_KZ(k), ZMM(31), MEM(a,s3,1,o)) \
    VMULPD(ZMM(z4) MASK_KZ(k), ZMM(31), MEM(a,s1,4,o)) \
    VMULPD(ZMM(z5) MASK_KZ(k), ZMM(31), MEM(a,s5,1,o))

/*
   STORE8x8: store ZMM(z0)..ZMM(z7) using the same eight-address pattern as
   LOADMUL8x8 (base a, byte offset o, stride registers s1/s3/s5/s7).
*/
#define STORE8x8(a,o,s1,s3,s5,s7, \
                 z0,z1,z2,z3,z4,z5,z6,z7) \
    \
    VMOVUPD(MEM(a,     o), ZMM(z0)) \
    VMOVUPD(MEM(a,s1,1,o), ZMM(z1)) \
    VMOVUPD(MEM(a,s1,2,o), ZMM(z2)) \
    VMOVUPD(MEM(a,s3,1,o), ZMM(z3)) \
    VMOVUPD(MEM(a,s1,4,o), ZMM(z4)) \
    VMOVUPD(MEM(a,s5,1,o), ZMM(z5)) \
    VMOVUPD(MEM(a,s3,2,o), ZMM(z6)) \
    VMOVUPD(MEM(a,s7,1,o), ZMM(z7))

/*
   TRANSPOSE8x8: three-stage shuffle network over ZMM(a0)..ZMM(a7) whose
   result is left in ZMM(b0)..ZMM(b7). The a-registers are used as
   intermediate storage in the second stage, so BOTH register sets are
   overwritten; the statement order must not be changed.
   NOTE(review): by its name and use this is an 8x8 double transpose; the
   exact lane mapping was not re-derived here.
*/
#define TRANSPOSE8x8(a0,a1,a2,a3,a4,a5,a6,a7, \
                     b0,b1,b2,b3,b4,b5,b6,b7) \
    \
    VUNPCKLPD(ZMM(b0), ZMM(a0), ZMM(a1)) \
    VUNPCKHPD(ZMM(b1), ZMM(a0), ZMM(a1)) \
    VUNPCKLPD(ZMM(b2), ZMM(a2), ZMM(a3)) \
    VUNPCKHPD(ZMM(b3), ZMM(a2), ZMM(a3)) \
    VUNPCKLPD(ZMM(b4), ZMM(a4), ZMM(a5)) \
    VUNPCKHPD(ZMM(b5), ZMM(a4), ZMM(a5)) \
    VUNPCKLPD(ZMM(b6), ZMM(a6), ZMM(a7)) \
    VUNPCKHPD(ZMM(b7), ZMM(a6), ZMM(a7)) \
    VSHUFF64X2(ZMM(a0), ZMM(b0), ZMM(b2), IMM(0x44)) \
    VSHUFF64X2(ZMM(a1), ZMM(b1), ZMM(b3), IMM(0x44)) \
    VSHUFF64X2(ZMM(a2), ZMM(b0), ZMM(b2), IMM(0xEE)) \
    VSHUFF64X2(ZMM(a3), ZMM(b1), ZMM(b3), IMM(0xEE)) \
    VSHUFF64X2(ZMM(a4), ZMM(b4), ZMM(b6), IMM(0x44)) \
    VSHUFF64X2(ZMM(a5), ZMM(b5), ZMM(b7), IMM(0x44)) \
    VSHUFF64X2(ZMM(a6), ZMM(b4), ZMM(b6), IMM(0xEE)) \
    VSHUFF64X2(ZMM(a7), ZMM(b5), ZMM(b7), IMM(0xEE)) \
    VSHUFF64X2(ZMM(b0), ZMM(a0), ZMM(a4), IMM(0x88)) \
    VSHUFF64X2(ZMM(b1), ZMM(a1), ZMM(a5), IMM(0x88)) \
    VSHUFF64X2(ZMM(b2), ZMM(a0), ZMM(a4), IMM(0xDD)) \
    VSHUFF64X2(ZMM(b3), ZMM(a1), ZMM(a5), IMM(0xDD)) \
    VSHUFF64X2(ZMM(b4), ZMM(a2), ZMM(a6), IMM(0x88)) \
    VSHUFF64X2(ZMM(b5), ZMM(a3), ZMM(a7), IMM(0x88)) \
    VSHUFF64X2(ZMM(b6), ZMM(a2), ZMM(a6), IMM(0xDD)) \
    VSHUFF64X2(ZMM(b7), ZMM(a3), ZMM(a7), IMM(0xDD))

//This is an array used for the scatter/gather instructions.
extern int32_t offsets[32]; // NOTE: assumes packdim_mr == 32 void bli_dpackm_knl_asm_30xk ( conj_t conja, dim_t n_, void* restrict kappa_, void* restrict a_, inc_t inca_, inc_t lda_, void* restrict p_, inc_t ldp_, cntx_t* restrict cntx ) { (void)conja; const int32_t * offsetPtr = &offsets[0]; double* a = (double*)a_; double* p = (double*)p_; double* kappa = (double*)kappa_; const int64_t n = n_; const int64_t inca = inca_; const int64_t lda = lda_; const int64_t ldp = ldp_; __asm__ volatile ( MOV(RSI, VAR(n)) MOV(RAX, VAR(a)) MOV(RBX, VAR(inca)) MOV(RCX, VAR(lda)) MOV(R15, VAR(p)) MOV(RDI, VAR(ldp)) LEA(RBX, MEM(,RBX,8)) //inca in bytes LEA(RCX, MEM(,RCX,8)) //lda in bytes LEA(RDI, MEM(,RDI,8)) //ldp in bytes LEA(R11, MEM(RDI,RDI,2)) //ldp*3 LEA(R12, MEM(RDI,RDI,4)) //ldp*5 LEA(R13, MEM(R11,RDI,4)) //ldp*7 VBROADCASTSD(ZMM(31), VAR(kappa)) TEST(RSI, RSI) JZ(PACK30_DONE) CMP(RBX, IMM(8)) JNE(PACK30_T) LABEL(PACK30_N) MOV(RDX, RSI) AND(RDX, IMM(7)) SAR(RSI, IMM(3)) JZ(PACK30_N_TAIL) LEA(R8, MEM(RCX,RCX,2)) //lda*3 LEA(R9, MEM(RCX,RCX,4)) //lda*5 LEA(R10, MEM(R8 ,RCX,4)) //lda*7 LABEL(PACK30_N_LOOP) LOADMUL8x8(RAX, 0,RCX,R8, R9, R10, 0, 1, 2, 3, 4, 5, 6, 7) LOADMUL8x8(RAX, 64,RCX,R8, R9, R10, 8, 9,10,11,12,13,14,15) LOADMUL8x8(RAX,128,RCX,R8, R9, R10,16,17,18,19,20,21,22,23) STORE8x8 (R15, 0,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7) STORE8x8 (R15, 64,RDI,R11,R12,R13, 8, 9,10,11,12,13,14,15) STORE8x8 (R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) LOADMUL8x6(RAX,192,RCX,R8, R9, R10, 0, 1, 2, 3, 4, 5, 6, 7) STORE8x8 (R15,192,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7) LEA(RAX, MEM(RAX,RCX,8)) LEA(R15, MEM(R15,RDI,8)) SUB(RSI, IMM(1)) JNZ(PACK30_N_LOOP) TEST(RDX, RDX) JZ(PACK30_DONE) LABEL(PACK30_N_TAIL) KXNORW(K(7), K(0), K(0)) KSHIFTRW(K(7), K(7), IMM(10)) VMULPD(ZMM(0), ZMM(31), MEM(RAX, 0)) VMULPD(ZMM(1), ZMM(31), MEM(RAX, 64)) VMULPD(ZMM(2), ZMM(31), MEM(RAX,128)) VMULPD(ZMM(3) MASK_KZ(7), ZMM(31), MEM(RAX,192)) VMOVUPD(MEM(R15, 0), ZMM(0)) VMOVUPD(MEM(R15, 64), 
ZMM(1)) VMOVUPD(MEM(R15,128), ZMM(2)) VMOVUPD(MEM(R15,192), ZMM(3)) LEA(RAX, MEM(RAX,RCX,1)) LEA(R15, MEM(R15,RDI,1)) SUB(RDX, IMM(1)) JNZ(PACK30_N_TAIL) JMP(PACK30_DONE) LABEL(PACK30_T) CMP(RCX, IMM(8)) JNE(PACK30_G) LEA(R8, MEM(RBX,RBX,2)) //inca*3 LEA(R9, MEM(RBX,RBX,4)) //inca*5 LEA(R10, MEM(R8 ,RBX,4)) //inca*7 LEA(R14, MEM(RAX,RBX,8)) LEA(RCX, MEM(R14,RBX,8)) SAR(RSI, IMM(3)) JZ(PACK30_T_TAIL) LABEL(PACK30_T_LOOP) LOADMUL8x8(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7) LOADMUL8x8(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 16,17,18,19,20,21,22,23) STORE8x8(R15, 0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) LOADMUL8x8(RCX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7) TRANSPOSE8x8( 8, 9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23) STORE8x8(R15, 64,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) LEA(RCX, MEM(RCX,RBX,8)) LOADMUL6x8(RCX,0,RBX,R8,R9, 8, 9,10,11,12,13) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 16,17,18,19,20,21,22,23) STORE8x8(R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23) TRANSPOSE8x8( 8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7) STORE8x8(R15,192,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7) LEA(RAX, MEM(RAX,64)) LEA(R14, MEM(R14,64)) LEA(RCX, MEM(R14,RBX,8)) LEA(R15, MEM(R15,RDI,8)) SUB(RSI, IMM(1)) JNZ(PACK30_T_LOOP) LABEL(PACK30_T_TAIL) MOV(RSI, VAR(n)) AND(RSI, IMM(7)) TEST(RSI, RSI) JZ(PACK30_DONE) MOV(R13, IMM(1)) SHLX(R13, R13, RSI) SUB(R13, IMM(1)) KMOV(K(1), R13D) //mask for n%8 elements LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7,1) LOADMUL8x8_MASK(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15,1) LOADMUL8x8_MASK(RCX,0,RBX,R8,R9,R10,16,17,18,19,20,21,22,23,1) TRANSPOSE8x8(16,17,18,19,20,21,22,23, 24,25,26,27,28,29,30,31) TRANSPOSE8x8( 8, 9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15) VMOVUPD(MEM(R15, 0), ZMM( 8)) VMOVUPD(MEM(R15, 64), ZMM(16)) VMOVUPD(MEM(R15, 128), ZMM(24)) SUB(RSI, IMM(1)) JZ(PACK30_T_ALMOST_DONE) VMOVUPD(MEM(R15,RDI,1, 0), ZMM( 
9)) VMOVUPD(MEM(R15,RDI,1, 64), ZMM(17)) VMOVUPD(MEM(R15,RDI,1,128), ZMM(25)) SUB(RSI, IMM(1)) JZ(PACK30_T_ALMOST_DONE) VMOVUPD(MEM(R15,RDI,2, 0), ZMM(10)) VMOVUPD(MEM(R15,RDI,2, 64), ZMM(18)) VMOVUPD(MEM(R15,RDI,2,128), ZMM(26)) SUB(RSI, IMM(1)) JZ(PACK30_T_ALMOST_DONE) VMOVUPD(MEM(R15,R11,1, 0), ZMM(11)) VMOVUPD(MEM(R15,R11,1, 64), ZMM(19)) VMOVUPD(MEM(R15,R11,1,128), ZMM(27)) SUB(RSI, IMM(1)) JZ(PACK30_T_ALMOST_DONE) VMOVUPD(MEM(R15,RDI,4, 0), ZMM(12)) VMOVUPD(MEM(R15,RDI,4, 64), ZMM(20)) VMOVUPD(MEM(R15,RDI,4,128), ZMM(28)) SUB(RSI, IMM(1)) JZ(PACK30_T_ALMOST_DONE) VMOVUPD(MEM(R15,R12,1, 0), ZMM(13)) VMOVUPD(MEM(R15,R12,1, 64), ZMM(21)) VMOVUPD(MEM(R15,R12,1,128), ZMM(29)) SUB(RSI, IMM(1)) JZ(PACK30_T_ALMOST_DONE) VMOVUPD(MEM(R15,R11,2, 0), ZMM(14)) VMOVUPD(MEM(R15,R11,2, 64), ZMM(22)) VMOVUPD(MEM(R15,R11,2,128), ZMM(30)) LABEL(PACK30_T_ALMOST_DONE) MOV(RSI, VAR(n)) AND(RSI, IMM(7)) VBROADCASTSD(ZMM(31), VAR(kappa)) LEA(RAX, MEM(RCX,RBX,8)) LOADMUL6x8_MASK(RAX,0,RBX,R8,R9, 0, 1, 2, 3, 4, 5,1) TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15) VMOVUPD(MEM(R15, 192), ZMM( 8)) SUB(RSI, IMM(1)) JZ(PACK30_DONE) VMOVUPD(MEM(R15,RDI,1,192), ZMM( 9)) SUB(RSI, IMM(1)) JZ(PACK30_DONE) VMOVUPD(MEM(R15,RDI,2,192), ZMM(10)) SUB(RSI, IMM(1)) JZ(PACK30_DONE) VMOVUPD(MEM(R15,R11,1,192), ZMM(11)) SUB(RSI, IMM(1)) JZ(PACK30_DONE) VMOVUPD(MEM(R15,RDI,4,192), ZMM(12)) SUB(RSI, IMM(1)) JZ(PACK30_DONE) VMOVUPD(MEM(R15,R12,1,192), ZMM(13)) SUB(RSI, IMM(1)) JZ(PACK30_DONE) VMOVUPD(MEM(R15,R11,2,192), ZMM(14)) JMP(PACK30_DONE) LABEL(PACK30_G) VPBROADCASTD(ZMM(4), VAR(inca)) MOV(RBX, VAR(offsetPtr)) VPMULLD(YMM(0), YMM(4), MEM(RBX, 0)) VPMULLD(YMM(1), YMM(4), MEM(RBX,32)) VPMULLD(YMM(2), YMM(4), MEM(RBX,64)) VPMULLD(YMM(3), YMM(4), MEM(RBX,96)) LABEL(PACK30_G_LOOP) KXNORW(K(1), K(0), K(0)) KXNORW(K(2), K(0), K(0)) KXNORW(K(3), K(0), K(0)) KSHIFTRW(K(4), K(3), IMM(10)) VGATHERDPD(ZMM(4) MASK_K(1), MEM(RAX,YMM(0),8)) VGATHERDPD(ZMM(5) MASK_K(2), MEM(RAX,YMM(1),8)) 
VGATHERDPD(ZMM(6) MASK_K(3), MEM(RAX,YMM(2),8)) VGATHERDPD(ZMM(7) MASK_K(4), MEM(RAX,YMM(3),8)) VMULPD(ZMM(4), ZMM(4), ZMM(31)) VMULPD(ZMM(5), ZMM(5), ZMM(31)) VMULPD(ZMM(6), ZMM(6), ZMM(31)) VMULPD(ZMM(7), ZMM(7), ZMM(31)) VMOVUPD(MEM(R15, 0), ZMM(4)) VMOVUPD(MEM(R15, 64), ZMM(5)) VMOVUPD(MEM(R15,128), ZMM(6)) VMOVUPD(MEM(R15,192), ZMM(7)) LEA(RAX, MEM(RAX,RCX,1)) LEA(R15, MEM(R15,RDI,1)) SUB(RSI, IMM(1)) JNZ(PACK30_G_LOOP) LABEL(PACK30_DONE) : //output operands : //input operands [n] "m" (n), [kappa] "m" (*kappa), [a] "m" (a), [inca] "m" (inca), [lda] "m" (lda), [p] "m" (p), [ldp] "m" (ldp), [offsetPtr] "m" (offsetPtr) : //clobbers "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "rax", "rbx", "rcx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory" ); } cython-blis-0.9.1/blis/_src/kernels/knl/3/000077500000000000000000000000001427272030600202275ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/knl/3/bli_dgemm_knl_asm_24x8.c000066400000000000000000000461621427272030600246140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include #define BLIS_ASM_SYNTAX_INTEL #include "bli_x86_asm_macros.h" #define UNROLL_K 32 #define SCATTER_PREFETCH_C 1 #define PREFETCH_A_L2 0 #define PREFETCH_B_L2 0 #define L2_PREFETCH_DIST 64 #define A_L1_PREFETCH_DIST 18 #define B_L1_PREFETCH_DIST 18 #define LOOP_ALIGN ALIGN16 #define UPDATE_C_FOUR_ROWS(R1,R2,R3,R4) \ \ VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ VFMADD231PD(ZMM(R1), ZMM(1), MEM(RCX )) \ VFMADD231PD(ZMM(R2), ZMM(1), MEM(RCX,RAX,1)) \ VFMADD231PD(ZMM(R3), ZMM(1), MEM(RCX,RAX,2)) \ VFMADD231PD(ZMM(R4), ZMM(1), MEM(RCX,RDI,1)) \ VMOVUPD(MEM(RCX ), ZMM(R1)) \ VMOVUPD(MEM(RCX,RAX,1), ZMM(R2)) \ VMOVUPD(MEM(RCX,RAX,2), ZMM(R3)) \ VMOVUPD(MEM(RCX,RDI,1), ZMM(R4)) \ LEA(RCX, MEM(RCX,RAX,4)) #define UPDATE_C_BZ_FOUR_ROWS(R1,R2,R3,R4) \ \ VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ VMOVUPD(MEM(RCX ), ZMM(R1)) \ 
VMOVUPD(MEM(RCX,RAX,1), ZMM(R2)) \ VMOVUPD(MEM(RCX,RAX,2), ZMM(R3)) \ VMOVUPD(MEM(RCX,RDI,1), ZMM(R4)) \ LEA(RCX, MEM(RCX,RAX,4)) #define UPDATE_C_ROW_SCATTERED(NUM) \ \ KXNORW(K(1), K(0), K(0)) \ KXNORW(K(2), K(0), K(0)) \ VMULPD(ZMM(NUM), ZMM(NUM), ZMM(0)) \ VGATHERDPD(ZMM(3) MASK_K(1), MEM(RCX,YMM(2),8)) \ VFMADD231PD(ZMM(NUM), ZMM(3), ZMM(1)) \ VSCATTERDPD(MEM(RCX,YMM(2),8) MASK_K(2), ZMM(NUM)) \ ADD(RCX, RAX) #define UPDATE_C_BZ_ROW_SCATTERED(NUM) \ \ KXNORW(K(1), K(0), K(0)) \ VMULPD(ZMM(NUM), ZMM(NUM), ZMM(0)) \ VSCATTERDPD(MEM(RCX,YMM(2),8) MASK_K(1), ZMM(NUM)) \ ADD(RCX, RAX) #define PREFETCH_A_L1_1(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*24*8)) #define PREFETCH_A_L1_2(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*24*8+64)) #define PREFETCH_A_L1_3(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*24*8+128)) #if PREFETCH_A_L2 #undef PREFETCH_A_L2 #define PREFETCH_A_L2(n) \ \ PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*24*8)) \ PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*24*8+64)) \ PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*24*8+128)) #else #undef PREFETCH_A_L2 #define PREFETCH_A_L2(...) #endif #define PREFETCH_B_L1(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*8*8)) #if PREFETCH_B_L2 #undef PREFETCH_B_L2 #define PREFETCH_B_L2(n) PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*8*8)) #else #undef PREFETCH_B_L2 #define PREFETCH_B_L2(...) #endif #define PREFETCH_C_L1_1 #define PREFETCH_C_L1_2 #define PREFETCH_C_L1_3 // // n: index in unrolled loop // // a: ZMM register to load into // b: ZMM register to read from // // ...: addressing for A, except for offset // #define SUBITER(n,a,b,...) 
\ \ PREFETCH_A_L2(n) \ \ VMOVAPD(ZMM(a), MEM(RBX,(n+1)*64)) \ VFMADD231PD(ZMM( 8), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 0)*8)) \ VFMADD231PD(ZMM( 9), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 1)*8)) \ VFMADD231PD(ZMM(10), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 2)*8)) \ PREFETCH_A_L1_1(n) \ VFMADD231PD(ZMM(11), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 3)*8)) \ VFMADD231PD(ZMM(12), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 4)*8)) \ VFMADD231PD(ZMM(13), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 5)*8)) \ PREFETCH_C_L1_1 \ VFMADD231PD(ZMM(14), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 6)*8)) \ VFMADD231PD(ZMM(15), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 7)*8)) \ VFMADD231PD(ZMM(16), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 8)*8)) \ PREFETCH_A_L1_2(n) \ VFMADD231PD(ZMM(17), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 9)*8)) \ VFMADD231PD(ZMM(18), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+10)*8)) \ VFMADD231PD(ZMM(19), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+11)*8)) \ PREFETCH_C_L1_2 \ VFMADD231PD(ZMM(20), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+12)*8)) \ VFMADD231PD(ZMM(21), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+13)*8)) \ VFMADD231PD(ZMM(22), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+14)*8)) \ PREFETCH_A_L1_3(n) \ VFMADD231PD(ZMM(23), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+15)*8)) \ VFMADD231PD(ZMM(24), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+16)*8)) \ VFMADD231PD(ZMM(25), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+17)*8)) \ PREFETCH_C_L1_3 \ VFMADD231PD(ZMM(26), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+18)*8)) \ VFMADD231PD(ZMM(27), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+19)*8)) \ VFMADD231PD(ZMM(28), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+20)*8)) \ PREFETCH_B_L1(n) \ VFMADD231PD(ZMM(29), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+21)*8)) \ VFMADD231PD(ZMM(30), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+22)*8)) \ VFMADD231PD(ZMM(31), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+23)*8)) \ PREFETCH_B_L2(n) //This is an array used for the scatter/gather instructions. 
static int32_t offsets[32] __attribute__((aligned(64))) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; //#define MONITORS //#define LOOPMON void bli_dgemm_knl_asm_24x8 ( dim_t m, dim_t n, dim_t k_, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c_, inc_t cs_c_, auxinfo_t* restrict data, cntx_t* restrict cntx ) { (void)data; (void)cntx; const double * a_next = bli_auxinfo_next_a( data ); const double * b_next = bli_auxinfo_next_b( data ); int32_t * offsetPtr = &offsets[0]; int64_t k = k_; int64_t rs_c = rs_c_; int64_t cs_c = cs_c_; GEMM_UKR_SETUP_CT( d, 24, 8, true ); #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; #endif #ifdef LOOPMON int tlooph, tloopl, blooph, bloopl; #endif BEGIN_ASM() #ifdef MONITORS RDTSC MOV(VAR(topl), EAX) MOV(VAR(toph), EDX) #endif VPXORD(ZMM(8), ZMM(8), ZMM(8)) //clear out registers VMOVAPS(ZMM( 9), ZMM(8)) MOV(R12, VAR(rs_c)) VMOVAPS(ZMM(10), ZMM(8)) MOV(RSI, VAR(k)) //loop index VMOVAPS(ZMM(11), ZMM(8)) MOV(RAX, VAR(a)) //load address of a VMOVAPS(ZMM(12), ZMM(8)) MOV(RBX, VAR(b)) //load address of b VMOVAPS(ZMM(13), ZMM(8)) MOV(RCX, VAR(c)) //load address of c VMOVAPS(ZMM(14), ZMM(8)) VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b VMOVAPS(ZMM(15), ZMM(8)) MOV(RDI, VAR(offsetPtr)) VMOVAPS(ZMM(16), ZMM(8)) VMOVAPS(ZMM(4), MEM(RDI)) #if SCATTER_PREFETCH_C VMOVAPS(ZMM(17), ZMM(8)) VMOVAPS(ZMM(18), ZMM(8)) VMOVAPS(ZMM(19), ZMM(8)) VBROADCASTSS(ZMM(5), VAR(rs_c)) VMOVAPS(ZMM(20), ZMM(8)) VMOVAPS(ZMM(21), ZMM(8)) VPMULLD(ZMM(2), ZMM(4), ZMM(5)) VMOVAPS(ZMM(22), ZMM(8)) VMOVAPS(YMM(3), MEM(RDI,64)) VMOVAPS(ZMM(23), ZMM(8)) VPMULLD(YMM(3), YMM(3), YMM(5)) #else VMOVAPS(ZMM(17), ZMM(8)) VMOVAPS(ZMM(18), ZMM(8)) LEA(R13, MEM(R12,R12,2)) VMOVAPS(ZMM(19), ZMM(8)) LEA(R14, MEM(R12,R12,4)) VMOVAPS(ZMM(20), ZMM(8)) LEA(R15, MEM(R13,R12,4)) VMOVAPS(ZMM(21), ZMM(8)) VMOVAPS(ZMM(22), ZMM(8)) VMOVAPS(ZMM(23), ZMM(8)) 
#endif VMOVAPS(ZMM(24), ZMM(8)) VPSLLD(ZMM(4), ZMM(4), IMM(3)) VMOVAPS(ZMM(25), ZMM(8)) MOV(R8, IMM(4*24*8)) //offset for 4 iterations VMOVAPS(ZMM(26), ZMM(8)) LEA(R9, MEM(R8,R8,2)) //*3 VMOVAPS(ZMM(27), ZMM(8)) LEA(R10, MEM(R8,R8,4)) //*5 VMOVAPS(ZMM(28), ZMM(8)) LEA(R11, MEM(R9,R8,4)) //*7 VMOVAPS(ZMM(29), ZMM(8)) VMOVAPS(ZMM(30), ZMM(8)) VMOVAPS(ZMM(31), ZMM(8)) #ifdef MONITORS RDTSC MOV(VAR(midl), EAX) MOV(VAR(midh), EDX) #endif SUB(RSI, IMM(32)) JLE(TAIL) //prefetch C into L2 #if SCATTER_PREFETCH_C ADD(RSI, IMM(24)) KXNORW(K(1), K(0), K(0)) KXNORW(K(2), K(0), K(0)) VSCATTERPFDPS(1, MEM(RCX,ZMM(2),8) MASK_K(1)) VSCATTERPFDPD(1, MEM(RCX,YMM(3),8) MASK_K(2)) #else PREFETCHW1(MEM(RCX )) SUBITER( 0,1,0,RAX ) PREFETCHW1(MEM(RCX,R12,1)) SUBITER( 1,0,1,RAX ) PREFETCHW1(MEM(RCX,R12,2)) SUBITER( 2,1,0,RAX ) PREFETCHW1(MEM(RCX,R13,1)) SUBITER( 3,0,1,RAX ) PREFETCHW1(MEM(RCX,R12,4)) SUBITER( 4,1,0,RAX,R8, 1) PREFETCHW1(MEM(RCX,R14,1)) SUBITER( 5,0,1,RAX,R8, 1) PREFETCHW1(MEM(RCX,R13,2)) SUBITER( 6,1,0,RAX,R8, 1) PREFETCHW1(MEM(RCX,R15,1)) SUBITER( 7,0,1,RAX,R8, 1) LEA(RDX, MEM(RCX,R12,8)) PREFETCHW1(MEM(RDX )) SUBITER( 8,1,0,RAX,R8, 2) PREFETCHW1(MEM(RDX,R12,1)) SUBITER( 9,0,1,RAX,R8, 2) PREFETCHW1(MEM(RDX,R12,2)) SUBITER(10,1,0,RAX,R8, 2) PREFETCHW1(MEM(RDX,R13,1)) SUBITER(11,0,1,RAX,R8, 2) PREFETCHW1(MEM(RDX,R12,4)) SUBITER(12,1,0,RAX,R9, 1) PREFETCHW1(MEM(RDX,R14,1)) SUBITER(13,0,1,RAX,R9, 1) PREFETCHW1(MEM(RDX,R13,2)) SUBITER(14,1,0,RAX,R9, 1) PREFETCHW1(MEM(RDX,R15,1)) SUBITER(15,0,1,RAX,R9, 1) LEA(RDI, MEM(RDX,R12,8)) PREFETCHW1(MEM(RDI )) SUBITER(16,1,0,RAX,R8, 4) PREFETCHW1(MEM(RDI,R12,1)) SUBITER(17,0,1,RAX,R8, 4) PREFETCHW1(MEM(RDI,R12,2)) SUBITER(18,1,0,RAX,R8, 4) PREFETCHW1(MEM(RDI,R13,1)) SUBITER(19,0,1,RAX,R8, 4) PREFETCHW1(MEM(RDI,R12,4)) SUBITER(20,1,0,RAX,R10,1) PREFETCHW1(MEM(RDI,R14,1)) SUBITER(21,0,1,RAX,R10,1) PREFETCHW1(MEM(RDI,R13,2)) SUBITER(22,1,0,RAX,R10,1) PREFETCHW1(MEM(RDI,R15,1)) SUBITER(23,0,1,RAX,R10,1) ADD(RAX, IMM(24*24*8)) ADD(RBX, 
IMM(24* 8*8)) #endif MOV(RDI, RSI) AND(RDI, IMM(31)) SAR(RSI, IMM(5)) JZ(REM_1) LOOP_ALIGN LABEL(MAIN_LOOP) SUBITER( 0,1,0,RAX ) SUBITER( 1,0,1,RAX ) SUBITER( 2,1,0,RAX ) SUBITER( 3,0,1,RAX ) SUBITER( 4,1,0,RAX,R8, 1) SUBITER( 5,0,1,RAX,R8, 1) SUBITER( 6,1,0,RAX,R8, 1) SUBITER( 7,0,1,RAX,R8, 1) SUBITER( 8,1,0,RAX,R8, 2) SUBITER( 9,0,1,RAX,R8, 2) SUBITER(10,1,0,RAX,R8, 2) SUBITER(11,0,1,RAX,R8, 2) SUBITER(12,1,0,RAX,R9, 1) SUBITER(13,0,1,RAX,R9, 1) SUBITER(14,1,0,RAX,R9, 1) SUBITER(15,0,1,RAX,R9, 1) SUBITER(16,1,0,RAX,R8, 4) SUBITER(17,0,1,RAX,R8, 4) SUBITER(18,1,0,RAX,R8, 4) SUBITER(19,0,1,RAX,R8, 4) SUBITER(20,1,0,RAX,R10,1) SUBITER(21,0,1,RAX,R10,1) SUBITER(22,1,0,RAX,R10,1) SUBITER(23,0,1,RAX,R10,1) SUBITER(24,1,0,RAX,R9, 2) SUBITER(25,0,1,RAX,R9, 2) SUBITER(26,1,0,RAX,R9, 2) SUBITER(27,0,1,RAX,R9, 2) SUBITER(28,1,0,RAX,R11,1) SUBITER(29,0,1,RAX,R11,1) SUBITER(30,1,0,RAX,R11,1) SUBITER(31,0,1,RAX,R11,1) ADD(RAX, IMM(32*24*8)) ADD(RBX, IMM(32* 8*8)) SUB(RSI, IMM(1)) JNZ(MAIN_LOOP) LABEL(REM_1) SAR(RDI) JNC(REM_2) SUBITER(0,1,0,RAX) VMOVAPD(ZMM(0), ZMM(1)) ADD(RAX, IMM(24*8)) ADD(RBX, IMM( 8*8)) LABEL(REM_2) SAR(RDI) JNC(REM_4) SUBITER(0,1,0,RAX) SUBITER(1,0,1,RAX) ADD(RAX, IMM(2*24*8)) ADD(RBX, IMM(2* 8*8)) LABEL(REM_4) SAR(RDI) JNC(REM_8) SUBITER(0,1,0,RAX) SUBITER(1,0,1,RAX) SUBITER(2,1,0,RAX) SUBITER(3,0,1,RAX) ADD(RAX, IMM(4*24*8)) ADD(RBX, IMM(4* 8*8)) LABEL(REM_8) SAR(RDI) JNC(REM_16) SUBITER(0,1,0,RAX ) SUBITER(1,0,1,RAX ) SUBITER(2,1,0,RAX ) SUBITER(3,0,1,RAX ) SUBITER(4,1,0,RAX,R8,1) SUBITER(5,0,1,RAX,R8,1) SUBITER(6,1,0,RAX,R8,1) SUBITER(7,0,1,RAX,R8,1) ADD(RAX, IMM(8*24*8)) ADD(RBX, IMM(8* 8*8)) LABEL(REM_16) SAR(RDI) JNC(AFTER_LOOP) SUBITER( 0,1,0,RAX ) SUBITER( 1,0,1,RAX ) SUBITER( 2,1,0,RAX ) SUBITER( 3,0,1,RAX ) SUBITER( 4,1,0,RAX,R8, 1) SUBITER( 5,0,1,RAX,R8, 1) SUBITER( 6,1,0,RAX,R8, 1) SUBITER( 7,0,1,RAX,R8, 1) SUBITER( 8,1,0,RAX,R8, 2) SUBITER( 9,0,1,RAX,R8, 2) SUBITER(10,1,0,RAX,R8, 2) SUBITER(11,0,1,RAX,R8, 2) SUBITER(12,1,0,RAX,R9, 1) 
SUBITER(13,0,1,RAX,R9, 1) SUBITER(14,1,0,RAX,R9, 1) SUBITER(15,0,1,RAX,R9, 1) ADD(RAX, IMM(16*24*8)) ADD(RBX, IMM(16* 8*8)) LABEL(AFTER_LOOP) //prefetch C into L1 #if SCATTER_PREFETCH_C KXNORW(K(1), K(0), K(0)) KXNORW(K(2), K(0), K(0)) VSCATTERPFDPS(0, MEM(RCX,ZMM(2),8) MASK_K(1)) VSCATTERPFDPD(0, MEM(RCX,YMM(3),8) MASK_K(2)) SUBITER(0,1,0,RAX ) SUBITER(1,0,1,RAX ) SUBITER(2,1,0,RAX ) SUBITER(3,0,1,RAX ) SUBITER(4,1,0,RAX,R8,1) SUBITER(5,0,1,RAX,R8,1) SUBITER(6,1,0,RAX,R8,1) SUBITER(7,0,1,RAX,R8,1) #else LEA(RDX, MEM(RCX,R12,8)) LEA(RDI, MEM(RDX,R12,8)) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX )) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,1)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R12,2)) SUBITER(0,1,0,RAX ) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,1)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,4)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R14,1)) SUBITER(1,0,1,RAX ) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,2)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R15,1)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX )) SUBITER(2,1,0,RAX ) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,1)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R12,2)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,1)) SUBITER(3,0,1,RAX ) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,4)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R14,1)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,2)) SUBITER(4,1,0,RAX,R8,1) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R15,1)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI )) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,1)) 
SUBITER(5,0,1,RAX,R8,1) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R12,2)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,1)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,4)) SUBITER(6,1,0,RAX,R8,1) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R14,1)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,2)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R15,1)) SUBITER(7,0,1,RAX,R8,1) #endif JMP(POSTACCUM) LABEL(TAIL) MOV(RDX, RCX) ADD(RSI, IMM(32)) JZ(POSTACCUM) LABEL(TAIL_LOOP) PREFETCHW0(MEM(RDX)) ADD(RDX, R12) SUBITER(0,1,0,RAX) VMOVAPD(ZMM(0), ZMM(1)) ADD(RAX, IMM(24*8)) ADD(RBX, IMM( 8*8)) SUB(RSI, IMM(1)) JNZ(TAIL_LOOP) LABEL(POSTACCUM) #ifdef MONITORS RDTSC MOV(VAR(mid2l), EAX) MOV(VAR(mid2h), EDX) #endif MOV(RAX, VAR(alpha)) MOV(RBX, VAR(beta)) VBROADCASTSD(ZMM(0), MEM(RAX)) VBROADCASTSD(ZMM(1), MEM(RBX)) // Check if C is row stride. If not, jump to the slow scattered update MOV(RAX, VAR(rs_c)) LEA(RAX, MEM(,RAX,8)) LEA(RDI, MEM(RAX,RAX,2)) VMOVQ(RDX, XMM(1)) SAL(RDX) //shift out sign bit JZ(COLSTORBZ) UPDATE_C_FOUR_ROWS( 8, 9,10,11) UPDATE_C_FOUR_ROWS(12,13,14,15) UPDATE_C_FOUR_ROWS(16,17,18,19) UPDATE_C_FOUR_ROWS(20,21,22,23) UPDATE_C_FOUR_ROWS(24,25,26,27) UPDATE_C_FOUR_ROWS(28,29,30,31) JMP(END) LABEL(COLSTORBZ) UPDATE_C_BZ_FOUR_ROWS( 8, 9,10,11) UPDATE_C_BZ_FOUR_ROWS(12,13,14,15) UPDATE_C_BZ_FOUR_ROWS(16,17,18,19) UPDATE_C_BZ_FOUR_ROWS(20,21,22,23) UPDATE_C_BZ_FOUR_ROWS(24,25,26,27) UPDATE_C_BZ_FOUR_ROWS(28,29,30,31) LABEL(END) #ifdef MONITORS RDTSC MOV(VAR(botl), EAX) MOV(VAR(both), EDX) #endif END_ASM( : // output operands #ifdef MONITORS [topl] "=m" (topl), [toph] "=m" (toph), [midl] "=m" (midl), [midh] "=m" (midh), [mid2l] "=m" (mid2l), [mid2h] "=m" (mid2h), [botl] "=m" (botl), [both] "=m" (both) #endif : // input operands [k] "m" (k), [a] "m" (a), [b] "m" (b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] 
"m" (rs_c), [cs_c] "m" (cs_c), [a_next] "m" (a_next), [b_next] "m" (b_next), [offsetPtr] "m" (offsetPtr) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" ) GEMM_UKR_FLUSH_CT( d ); #ifdef LOOPMON printf("looptime = \t%d\n", bloopl - tloopl); #endif #ifdef MONITORS dim_t top = ((dim_t)toph << 32) | topl; dim_t mid = ((dim_t)midh << 32) | midl; dim_t mid2 = ((dim_t)mid2h << 32) | mid2l; dim_t bot = ((dim_t)both << 32) | botl; printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot - top); #endif } cython-blis-0.9.1/blis/_src/kernels/knl/3/bli_sgemm_knl_asm_24x16.c000066400000000000000000000457461427272030600247210ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"
/* NOTE(review): the header name of the following include is missing here
   (it looks stripped during extraction; presumably a standard <...> header).
   Restore it from the upstream file before building. */
#include

#define BLIS_ASM_SYNTAX_INTEL
#include "bli_x86_asm_macros.h"

// Tuning knobs for the kernel in this file.
// UNROLL_K is not referenced by the code visible in this file section.
#define UNROLL_K 32

// Non-zero selects the scatter-prefetch variant of the C prefetch code.
#define SCATTER_PREFETCH_C 1

// L2 prefetch switches; with 0 the PREFETCH_x_L2(n) macros below expand to
// nothing.
#define PREFETCH_A_L2 0
#define PREFETCH_B_L2 0
// Prefetch distances, counted in k iterations (they are multiplied by the
// per-iteration byte size of A or B in the macros below).
#define L2_PREFETCH_DIST 64
#define A_L1_PREFETCH_DIST 36
#define B_L1_PREFETCH_DIST 18

#define LOOP_ALIGN ALIGN16

//
// UPDATE_C_FOUR_ROWS(R1,R2,R3,R4): C := ZMM1*C + ZMM0*acc for four rows.
//
// Each accumulator is multiplied by ZMM0, the matching row of C times ZMM1 is
// added, and the result is stored (unaligned) back to the same row. Rows are
// addressed as RCX, RCX+RAX, RCX+2*RAX and RCX+RDI; RCX then advances by
// 4*RAX. The kernel sets ZMM0/ZMM1 to alpha/beta, RAX to the row stride in
// bytes and RDI to three times that before using this macro.
//
#define UPDATE_C_FOUR_ROWS(R1,R2,R3,R4) \
\
    VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
    VFMADD231PS(ZMM(R1), ZMM(1), MEM(RCX      )) \
    VFMADD231PS(ZMM(R2), ZMM(1), MEM(RCX,RAX,1)) \
    VFMADD231PS(ZMM(R3), ZMM(1), MEM(RCX,RAX,2)) \
    VFMADD231PS(ZMM(R4), ZMM(1), MEM(RCX,RDI,1)) \
    VMOVUPS(MEM(RCX      ), ZMM(R1)) \
    VMOVUPS(MEM(RCX,RAX,1), ZMM(R2)) \
    VMOVUPS(MEM(RCX,RAX,2), ZMM(R3)) \
    VMOVUPS(MEM(RCX,RDI,1), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,4))

//
// UPDATE_C_BZ_FOUR_ROWS: same as above for the case where C is not read
// (used by the kernel when beta is zero): C := ZMM0*acc.
//
#define UPDATE_C_BZ_FOUR_ROWS(R1,R2,R3,R4) \
\
    VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
    VMOVUPS(MEM(RCX      ), ZMM(R1)) \
    VMOVUPS(MEM(RCX,RAX,1), ZMM(R2)) \
    VMOVUPS(MEM(RCX,RAX,2), ZMM(R3)) \
    VMOVUPS(MEM(RCX,RDI,1), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,4))

//
// UPDATE_C_ROW_SCATTERED(NUM): one-row update through gather/scatter with the
// 32-bit indices in ZMM2 (scaled by 4 bytes). K1 and K2 are set to all ones
// first; the gather uses K1 and the scatter uses K2. RCX advances by RAX.
// Not referenced by the kernel body visible in this file section.
//
#define UPDATE_C_ROW_SCATTERED(NUM) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPS(ZMM(NUM), ZMM(NUM), ZMM(0)) \
    VGATHERDPS(ZMM(3) MASK_K(1), MEM(RCX,ZMM(2),4)) \
    VFMADD231PS(ZMM(NUM), ZMM(3), ZMM(1)) \
    VSCATTERDPS(MEM(RCX,ZMM(2),4) MASK_K(2), ZMM(NUM)) \
    ADD(RCX, RAX)

// Scatter-only variant of the macro above: C is overwritten, never read.
#define UPDATE_C_BZ_ROW_SCATTERED(NUM) \
\
    KXNORW(K(1), K(0), K(0)) \
    VMULPS(ZMM(NUM), ZMM(NUM), ZMM(0)) \
    VSCATTERDPS(MEM(RCX,ZMM(2),4) MASK_K(1), ZMM(NUM)) \
    ADD(RCX, RAX)

//
// L1 prefetches of A (hint level 0). One k iteration consumes 24 floats of A
// (24*4 = 96 bytes), so two prefetches at byte offsets +0 and +64 cover it.
//
#define PREFETCH_A_L1_1(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*24*4))
#define PREFETCH_A_L1_2(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*24*4+64))

//
// PREFETCH_A_L2 starts out as the 0/1 setting defined above and is redefined
// here as a function-like macro: either two level-1-hint prefetches
// L2_PREFETCH_DIST+n iterations ahead, or nothing.
//
#if PREFETCH_A_L2
#undef PREFETCH_A_L2

#define PREFETCH_A_L2(n) \
\
    PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*24*4)) \
    PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*24*4+64))

#else
#undef PREFETCH_A_L2

#define PREFETCH_A_L2(...)

#endif

// L1 prefetch of B: one k iteration consumes 16 floats of B (16*4 = 64 bytes).
#define PREFETCH_B_L1(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*16*4))

// Same setting-to-macro conversion as PREFETCH_A_L2, for B.
#if PREFETCH_B_L2
#undef PREFETCH_B_L2

#define PREFETCH_B_L2(n) PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*16*4))

#else
#undef PREFETCH_B_L2

#define PREFETCH_B_L2(...)

#endif

// Hooks expanded inside SUBITER. They are empty by default; the kernel
// redefines them temporarily in its final iterations to prefetch rows of C.
#define PREFETCH_C_L1_1
#define PREFETCH_C_L1_2
#define PREFETCH_C_L1_3

//
// One k iteration of the 24x16 update.
//
// n: index in unrolled loop
//
// a: ZMM register to load into
// b: ZMM register to read from
//
// ...: addressing for A, except for offset
//
// The next vector of B is loaded into ZMM(a) from RBX+(n+1)*64 while the
// current one in ZMM(b) is multiplied with 24 consecutive elements of A and
// accumulated into ZMM8..ZMM31. The A displacement only uses n modulo 4; the
// caller supplies the coarser offset through the variadic base/index/scale
// arguments. MEM_1TO16 is defined elsewhere (presumably an embedded-broadcast
// memory operand -- confirm in bli_x86_asm_macros.h), and the doubled percent
// sign is presumably an escaped '%' for the inline-asm template -- confirm.
//
#define SUBITER(n,a,b,...) \
\
        PREFETCH_A_L2(n) \
\
        VMOVAPS(ZMM(a), MEM(RBX,(n+1)*64)) \
        VFMADD231PS(ZMM( 8), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 0)*4)) \
        VFMADD231PS(ZMM( 9), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 1)*4)) \
        VFMADD231PS(ZMM(10), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 2)*4)) \
        PREFETCH_A_L1_1(n) \
        VFMADD231PS(ZMM(11), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 3)*4)) \
        VFMADD231PS(ZMM(12), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 4)*4)) \
        VFMADD231PS(ZMM(13), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 5)*4)) \
        PREFETCH_C_L1_1 \
        VFMADD231PS(ZMM(14), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 6)*4)) \
        VFMADD231PS(ZMM(15), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 7)*4)) \
        VFMADD231PS(ZMM(16), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 8)*4)) \
        PREFETCH_A_L1_2(n) \
        VFMADD231PS(ZMM(17), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 9)*4)) \
        VFMADD231PS(ZMM(18), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+10)*4)) \
        VFMADD231PS(ZMM(19), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+11)*4)) \
        PREFETCH_C_L1_2 \
        VFMADD231PS(ZMM(20), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+12)*4)) \
        VFMADD231PS(ZMM(21), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+13)*4)) \
        VFMADD231PS(ZMM(22), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+14)*4)) \
        PREFETCH_C_L1_3 \
        VFMADD231PS(ZMM(23), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+15)*4)) \
        VFMADD231PS(ZMM(24), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+16)*4)) \
        VFMADD231PS(ZMM(25), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+17)*4)) \
        PREFETCH_B_L1(n) \
        VFMADD231PS(ZMM(26), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+18)*4)) \
        VFMADD231PS(ZMM(27), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+19)*4)) \
        VFMADD231PS(ZMM(28), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+20)*4)) \
        PREFETCH_B_L2(n) \
        VFMADD231PS(ZMM(29), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+21)*4)) \
        VFMADD231PS(ZMM(30), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+22)*4)) \
        VFMADD231PS(ZMM(31), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+23)*4))

//This is an array used for the scatter/gather instructions.
static int32_t offsets[32] __attribute__((aligned(64))) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; //#define MONITORS //#define LOOPMON void bli_sgemm_knl_asm_24x16 ( dim_t m, dim_t n, dim_t k_, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c_, inc_t cs_c_, auxinfo_t* restrict data, cntx_t* restrict cntx ) { (void)data; (void)cntx; const double * a_next = bli_auxinfo_next_a( data ); const double * b_next = bli_auxinfo_next_b( data ); int32_t * offsetPtr = &offsets[0]; int64_t k = k_; int64_t rs_c = rs_c_; int64_t cs_c = cs_c_; GEMM_UKR_SETUP_CT( s, 24, 16, true ); #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; #endif #ifdef LOOPMON int tlooph, tloopl, blooph, bloopl; #endif BEGIN_ASM() #ifdef MONITORS RDTSC MOV(VAR(topl), EAX) MOV(VAR(toph), EDX) #endif VPXORD(ZMM(8), ZMM(8), ZMM(8)) //clear out registers VMOVAPS(ZMM( 9), ZMM(8)) MOV(R12, VAR(rs_c)) VMOVAPS(ZMM(10), ZMM(8)) MOV(RSI, VAR(k)) //loop index VMOVAPS(ZMM(11), ZMM(8)) MOV(RAX, VAR(a)) //load address of a VMOVAPS(ZMM(12), ZMM(8)) MOV(RBX, VAR(b)) //load address of b VMOVAPS(ZMM(13), ZMM(8)) MOV(RCX, VAR(c)) //load address of c VMOVAPS(ZMM(14), ZMM(8)) VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b VMOVAPS(ZMM(15), ZMM(8)) MOV(RDI, VAR(offsetPtr)) VMOVAPS(ZMM(16), ZMM(8)) VMOVAPS(ZMM(4), MEM(RDI)) #if SCATTER_PREFETCH_C VMOVAPS(ZMM(17), ZMM(8)) VMOVAPS(ZMM(18), ZMM(8)) VMOVAPS(ZMM(19), ZMM(8)) VBROADCASTSS(ZMM(5), VAR(rs_c)) VMOVAPS(ZMM(20), ZMM(8)) VMOVAPS(ZMM(21), ZMM(8)) VPMULLD(ZMM(2), ZMM(4), ZMM(5)) VMOVAPS(ZMM(22), ZMM(8)) VMOVAPS(YMM(3), MEM(RDI,64)) VMOVAPS(ZMM(23), ZMM(8)) VPMULLD(YMM(3), YMM(3), YMM(5)) #else VMOVAPS(ZMM(17), ZMM(8)) VMOVAPS(ZMM(18), ZMM(8)) LEA(R13, MEM(R12,R12,2)) VMOVAPS(ZMM(19), ZMM(8)) LEA(R14, MEM(R12,R12,4)) VMOVAPS(ZMM(20), ZMM(8)) LEA(R15, MEM(R13,R12,4)) VMOVAPS(ZMM(21), ZMM(8)) VMOVAPS(ZMM(22), ZMM(8)) VMOVAPS(ZMM(23), ZMM(8)) 
#endif VMOVAPS(ZMM(24), ZMM(8)) VPSLLD(ZMM(4), ZMM(4), IMM(2)) VMOVAPS(ZMM(25), ZMM(8)) MOV(R8, IMM(4*24*4)) //offset for 4 iterations VMOVAPS(ZMM(26), ZMM(8)) LEA(R9, MEM(R8,R8,2)) //*3 VMOVAPS(ZMM(27), ZMM(8)) LEA(R10, MEM(R8,R8,4)) //*5 VMOVAPS(ZMM(28), ZMM(8)) LEA(R11, MEM(R9,R8,4)) //*7 VMOVAPS(ZMM(29), ZMM(8)) VMOVAPS(ZMM(30), ZMM(8)) VMOVAPS(ZMM(31), ZMM(8)) #ifdef MONITORS RDTSC MOV(VAR(midl), EAX) MOV(VAR(midh), EDX) #endif SUB(RSI, IMM(32)) JLE(TAIL) //prefetch C into L2 #if SCATTER_PREFETCH_C ADD(RSI, IMM(24)) KXNORW(K(1), K(0), K(0)) KXNORW(K(2), K(0), K(0)) VSCATTERPFDPS(1, MEM(RCX,ZMM(2),8) MASK_K(1)) VSCATTERPFDPD(1, MEM(RCX,YMM(3),8) MASK_K(2)) #else PREFETCHW1(MEM(RCX )) SUBITER( 0,1,0,RAX ) PREFETCHW1(MEM(RCX,R12,1)) SUBITER( 1,0,1,RAX ) PREFETCHW1(MEM(RCX,R12,2)) SUBITER( 2,1,0,RAX ) PREFETCHW1(MEM(RCX,R13,1)) SUBITER( 3,0,1,RAX ) PREFETCHW1(MEM(RCX,R12,4)) SUBITER( 4,1,0,RAX,R8, 1) PREFETCHW1(MEM(RCX,R14,1)) SUBITER( 5,0,1,RAX,R8, 1) PREFETCHW1(MEM(RCX,R13,2)) SUBITER( 6,1,0,RAX,R8, 1) PREFETCHW1(MEM(RCX,R15,1)) SUBITER( 7,0,1,RAX,R8, 1) LEA(RDX, MEM(RCX,R12,8)) PREFETCHW1(MEM(RDX )) SUBITER( 8,1,0,RAX,R8, 2) PREFETCHW1(MEM(RDX,R12,1)) SUBITER( 9,0,1,RAX,R8, 2) PREFETCHW1(MEM(RDX,R12,2)) SUBITER(10,1,0,RAX,R8, 2) PREFETCHW1(MEM(RDX,R13,1)) SUBITER(11,0,1,RAX,R8, 2) PREFETCHW1(MEM(RDX,R12,4)) SUBITER(12,1,0,RAX,R9, 1) PREFETCHW1(MEM(RDX,R14,1)) SUBITER(13,0,1,RAX,R9, 1) PREFETCHW1(MEM(RDX,R13,2)) SUBITER(14,1,0,RAX,R9, 1) PREFETCHW1(MEM(RDX,R15,1)) SUBITER(15,0,1,RAX,R9, 1) LEA(RDI, MEM(RDX,R12,8)) PREFETCHW1(MEM(RDI )) SUBITER(16,1,0,RAX,R8, 4) PREFETCHW1(MEM(RDI,R12,1)) SUBITER(17,0,1,RAX,R8, 4) PREFETCHW1(MEM(RDI,R12,2)) SUBITER(18,1,0,RAX,R8, 4) PREFETCHW1(MEM(RDI,R13,1)) SUBITER(19,0,1,RAX,R8, 4) PREFETCHW1(MEM(RDI,R12,4)) SUBITER(20,1,0,RAX,R10,1) PREFETCHW1(MEM(RDI,R14,1)) SUBITER(21,0,1,RAX,R10,1) PREFETCHW1(MEM(RDI,R13,2)) SUBITER(22,1,0,RAX,R10,1) PREFETCHW1(MEM(RDI,R15,1)) SUBITER(23,0,1,RAX,R10,1) ADD(RAX, IMM(24*24*4)) ADD(RBX, 
IMM(24*16*4)) #endif MOV(RDI, RSI) AND(RDI, IMM(31)) SAR(RSI, IMM(5)) JZ(REM_1) LOOP_ALIGN LABEL(MAIN_LOOP) SUBITER( 0,1,0,RAX ) SUBITER( 1,0,1,RAX ) SUBITER( 2,1,0,RAX ) SUBITER( 3,0,1,RAX ) SUBITER( 4,1,0,RAX,R8, 1) SUBITER( 5,0,1,RAX,R8, 1) SUBITER( 6,1,0,RAX,R8, 1) SUBITER( 7,0,1,RAX,R8, 1) SUBITER( 8,1,0,RAX,R8, 2) SUBITER( 9,0,1,RAX,R8, 2) SUBITER(10,1,0,RAX,R8, 2) SUBITER(11,0,1,RAX,R8, 2) SUBITER(12,1,0,RAX,R9, 1) SUBITER(13,0,1,RAX,R9, 1) SUBITER(14,1,0,RAX,R9, 1) SUBITER(15,0,1,RAX,R9, 1) SUBITER(16,1,0,RAX,R8, 4) SUBITER(17,0,1,RAX,R8, 4) SUBITER(18,1,0,RAX,R8, 4) SUBITER(19,0,1,RAX,R8, 4) SUBITER(20,1,0,RAX,R10,1) SUBITER(21,0,1,RAX,R10,1) SUBITER(22,1,0,RAX,R10,1) SUBITER(23,0,1,RAX,R10,1) SUBITER(24,1,0,RAX,R9, 2) SUBITER(25,0,1,RAX,R9, 2) SUBITER(26,1,0,RAX,R9, 2) SUBITER(27,0,1,RAX,R9, 2) SUBITER(28,1,0,RAX,R11,1) SUBITER(29,0,1,RAX,R11,1) SUBITER(30,1,0,RAX,R11,1) SUBITER(31,0,1,RAX,R11,1) ADD(RAX, IMM(32*24*4)) ADD(RBX, IMM(32*16*4)) SUB(RSI, IMM(1)) JNZ(MAIN_LOOP) LABEL(REM_1) SAR(RDI) JNC(REM_2) SUBITER(0,1,0,RAX) VMOVAPD(ZMM(0), ZMM(1)) ADD(RAX, IMM(24*4)) ADD(RBX, IMM(16*4)) LABEL(REM_2) SAR(RDI) JNC(REM_4) SUBITER(0,1,0,RAX) SUBITER(1,0,1,RAX) ADD(RAX, IMM(2*24*4)) ADD(RBX, IMM(2*16*4)) LABEL(REM_4) SAR(RDI) JNC(REM_8) SUBITER(0,1,0,RAX) SUBITER(1,0,1,RAX) SUBITER(2,1,0,RAX) SUBITER(3,0,1,RAX) ADD(RAX, IMM(4*24*4)) ADD(RBX, IMM(4*16*4)) LABEL(REM_8) SAR(RDI) JNC(REM_16) SUBITER(0,1,0,RAX ) SUBITER(1,0,1,RAX ) SUBITER(2,1,0,RAX ) SUBITER(3,0,1,RAX ) SUBITER(4,1,0,RAX,R8,1) SUBITER(5,0,1,RAX,R8,1) SUBITER(6,1,0,RAX,R8,1) SUBITER(7,0,1,RAX,R8,1) ADD(RAX, IMM(8*24*4)) ADD(RBX, IMM(8*16*4)) LABEL(REM_16) SAR(RDI) JNC(AFTER_LOOP) SUBITER( 0,1,0,RAX ) SUBITER( 1,0,1,RAX ) SUBITER( 2,1,0,RAX ) SUBITER( 3,0,1,RAX ) SUBITER( 4,1,0,RAX,R8, 1) SUBITER( 5,0,1,RAX,R8, 1) SUBITER( 6,1,0,RAX,R8, 1) SUBITER( 7,0,1,RAX,R8, 1) SUBITER( 8,1,0,RAX,R8, 2) SUBITER( 9,0,1,RAX,R8, 2) SUBITER(10,1,0,RAX,R8, 2) SUBITER(11,0,1,RAX,R8, 2) SUBITER(12,1,0,RAX,R9, 1) 
SUBITER(13,0,1,RAX,R9, 1) SUBITER(14,1,0,RAX,R9, 1) SUBITER(15,0,1,RAX,R9, 1) ADD(RAX, IMM(16*24*4)) ADD(RBX, IMM(16*16*4)) LABEL(AFTER_LOOP) //prefetch C into L1 #if SCATTER_PREFETCH_C KXNORW(K(1), K(0), K(0)) KXNORW(K(2), K(0), K(0)) VSCATTERPFDPS(0, MEM(RCX,ZMM(2),8) MASK_K(1)) VSCATTERPFDPD(0, MEM(RCX,YMM(3),8) MASK_K(2)) SUBITER(0,1,0,RAX ) SUBITER(1,0,1,RAX ) SUBITER(2,1,0,RAX ) SUBITER(3,0,1,RAX ) SUBITER(4,1,0,RAX,R8,1) SUBITER(5,0,1,RAX,R8,1) SUBITER(6,1,0,RAX,R8,1) SUBITER(7,0,1,RAX,R8,1) #else LEA(RDX, MEM(RCX,R12,8)) LEA(RDI, MEM(RDX,R12,8)) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX )) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,1)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R12,2)) SUBITER(0,1,0,RAX ) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,1)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,4)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R14,1)) SUBITER(1,0,1,RAX ) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,2)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R15,1)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX )) SUBITER(2,1,0,RAX ) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,1)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R12,2)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,1)) SUBITER(3,0,1,RAX ) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,4)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R14,1)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,2)) SUBITER(4,1,0,RAX,R8,1) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R15,1)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI )) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,1)) 
SUBITER(5,0,1,RAX,R8,1) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R12,2)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,1)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,4)) SUBITER(6,1,0,RAX,R8,1) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R14,1)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,2)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R15,1)) SUBITER(7,0,1,RAX,R8,1) #endif JMP(POSTACCUM) LABEL(TAIL) MOV(RDX, RCX) ADD(RSI, IMM(32)) JZ(POSTACCUM) LABEL(TAIL_LOOP) PREFETCHW0(MEM(RDX)) ADD(RDX, R12) SUBITER(0,1,0,RAX) VMOVAPD(ZMM(0), ZMM(1)) ADD(RAX, IMM(24*4)) ADD(RBX, IMM(16*4)) SUB(RSI, IMM(1)) JNZ(TAIL_LOOP) LABEL(POSTACCUM) #ifdef MONITORS RDTSC MOV(VAR(mid2l), EAX) MOV(VAR(mid2h), EDX) #endif MOV(RAX, VAR(alpha)) MOV(RBX, VAR(beta)) VBROADCASTSS(ZMM(0), MEM(RAX)) VBROADCASTSS(ZMM(1), MEM(RBX)) // Check if C is row stride. If not, jump to the slow scattered update MOV(RAX, VAR(rs_c)) LEA(RAX, MEM(,RAX,4)) LEA(RDI, MEM(RAX,RAX,2)) VMOVD(EDX, XMM(1)) SAL(EDX) //shift out sign bit JZ(COLSTORBZ) UPDATE_C_FOUR_ROWS( 8, 9,10,11) UPDATE_C_FOUR_ROWS(12,13,14,15) UPDATE_C_FOUR_ROWS(16,17,18,19) UPDATE_C_FOUR_ROWS(20,21,22,23) UPDATE_C_FOUR_ROWS(24,25,26,27) UPDATE_C_FOUR_ROWS(28,29,30,31) JMP(END) LABEL(COLSTORBZ) UPDATE_C_BZ_FOUR_ROWS( 8, 9,10,11) UPDATE_C_BZ_FOUR_ROWS(12,13,14,15) UPDATE_C_BZ_FOUR_ROWS(16,17,18,19) UPDATE_C_BZ_FOUR_ROWS(20,21,22,23) UPDATE_C_BZ_FOUR_ROWS(24,25,26,27) UPDATE_C_BZ_FOUR_ROWS(28,29,30,31) LABEL(END) #ifdef MONITORS RDTSC MOV(VAR(botl), EAX) MOV(VAR(both), EDX) #endif END_ASM( : // output operands #ifdef MONITORS [topl] "=m" (topl), [toph] "=m" (toph), [midl] "=m" (midl), [midh] "=m" (midh), [mid2l] "=m" (mid2l), [mid2h] "=m" (mid2h), [botl] "=m" (botl), [both] "=m" (both) #endif : // input operands [k] "m" (k), [a] "m" (a), [b] "m" (b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] 
"m" (rs_c), [cs_c] "m" (cs_c), [a_next] "m" (a_next), [b_next] "m" (b_next), [offsetPtr] "m" (offsetPtr) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" ) GEMM_UKR_FLUSH_CT( s ); #ifdef LOOPMON printf("looptime = \t%d\n", bloopl - tloopl); #endif #ifdef MONITORS dim_t top = ((dim_t)toph << 32) | topl; dim_t mid = ((dim_t)midh << 32) | midl; dim_t mid2 = ((dim_t)mid2h << 32) | mid2l; dim_t bot = ((dim_t)both << 32) | botl; printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot - top); #endif } cython-blis-0.9.1/blis/_src/kernels/knl/3/other/000077500000000000000000000000001427272030600213505ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/knl/3/other/bli_dgemm_knl_asm_12x16.c000066400000000000000000000604751427272030600260140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "bli_avx512_macros.h" extern int32_t offsets[16]; void bli_dgemm_knl_asm_12x16 ( dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //const void* a_next = bli_auxinfo_next_a( data ); //const void* b_next = bli_auxinfo_next_b( data ); const int32_t * offsetPtr = &offsets[0]; __asm__ volatile ( VPXORD(ZMM(8), ZMM(8), ZMM(8)) MOV(RAX, VAR(a)) VMOVAPD(ZMM( 9), ZMM(8)) MOV(RBX, VAR(b)) VMOVAPD(ZMM(10), ZMM(8)) //no ADD(RBX, IMM(4*64)) VMOVAPD(ZMM(11), ZMM(8)) //maybe? PREFETCH(0, MEM(RAX, 0)) VMOVAPD(ZMM(12), ZMM(8)) //maybe? 
PREFETCH(0, MEM(RAX,64)) VMOVAPD(ZMM(13), ZMM(8)) VMOVAPD(ZMM(0), MEM(RBX,0*64)) VMOVAPD(ZMM(14), ZMM(8)) VMOVAPD(ZMM(1), MEM(RBX,1*64)) VMOVAPD(ZMM(15), ZMM(8)) MOV(RCX, VAR(c)) VMOVAPD(ZMM(16), ZMM(8)) MOV(RDI, RCX) VMOVAPD(ZMM(17), ZMM(8)) VBROADCASTSS(ZMM(4), VAR(cs_c)) VMOVAPD(ZMM(18), ZMM(8)) VMOVAPS(ZMM(5), VAR(offsetPtr)) VMOVAPD(ZMM(19), ZMM(8)) VPMULLD(ZMM(4), ZMM(5), ZMM(4)) VMOVAPD(ZMM(20), ZMM(8)) MOV(RDX, IMM(0xFFF)) VMOVAPD(ZMM(21), ZMM(8)) KMOV(K(1), EDX) VMOVAPD(ZMM(22), ZMM(8)) KMOV(K(2), EDX) VMOVAPD(ZMM(23), ZMM(8)) KMOV(K(3), EDX) VMOVAPD(ZMM(24), ZMM(8)) VSCATTERPFDPS(0, MEM(RCX,ZMM(4),8, 0) MASK_K(1)) VMOVAPD(ZMM(25), ZMM(8)) VSCATTERPFDPS(0, MEM(RCX,ZMM(4),8, 8*8) MASK_K(2)) VMOVAPD(ZMM(26), ZMM(8)) VSCATTERPFDPS(0, MEM(RCX,ZMM(4),8,15*8) MASK_K(3)) VMOVAPD(ZMM(27), ZMM(8)) MOV(RSI, VAR(k)) VMOVAPD(ZMM(28), ZMM(8)) SAR(RSI, IMM(2)) // rsi = k/4 VMOVAPD(ZMM(29), ZMM(8)) VMOVAPD(ZMM(30), ZMM(8)) VMOVAPD(ZMM(31), ZMM(8)) JZ(.DCONSIDKLEFT) ALIGN16 LABEL(.DLOOPKITER) VBROADCASTSD(ZMM(2), MEM(RAX, 0*8)) // Iteration 0 VBROADCASTSD(ZMM(3), MEM(RAX, 1*8)) VBROADCASTSD(ZMM(4), MEM(RAX, 2*8)) VBROADCASTSD(ZMM(5), MEM(RAX, 3*8)) VBROADCASTSD(ZMM(6), MEM(RAX, 4*8)) VBROADCASTSD(ZMM(7), MEM(RAX, 5*8)) VFMADD231PD(ZMM( 8), ZMM(0), ZMM(2)) VFMADD231PD(ZMM( 9), ZMM(1), ZMM(2)) VFMADD231PD(ZMM(10), ZMM(0), ZMM(3)) VFMADD231PD(ZMM(11), ZMM(1), ZMM(3)) VFMADD231PD(ZMM(12), ZMM(0), ZMM(4)) VFMADD231PD(ZMM(13), ZMM(1), ZMM(4)) VFMADD231PD(ZMM(14), ZMM(0), ZMM(5)) VFMADD231PD(ZMM(15), ZMM(1), ZMM(5)) VFMADD231PD(ZMM(16), ZMM(0), ZMM(6)) VFMADD231PD(ZMM(17), ZMM(1), ZMM(6)) VFMADD231PD(ZMM(18), ZMM(0), ZMM(7)) VFMADD231PD(ZMM(19), ZMM(1), ZMM(7)) VBROADCASTSD(ZMM(2), MEM(RAX, 6*8)) VBROADCASTSD(ZMM(3), MEM(RAX, 7*8)) VBROADCASTSD(ZMM(4), MEM(RAX, 8*8)) VBROADCASTSD(ZMM(5), MEM(RAX, 9*8)) VBROADCASTSD(ZMM(6), MEM(RAX, 10*8)) VBROADCASTSD(ZMM(7), MEM(RAX, 11*8)) VFMADD231PD(ZMM(20), ZMM(0), ZMM(2)) VFMADD231PD(ZMM(21), ZMM(1), ZMM(2)) VFMADD231PD(ZMM(22), ZMM(0), 
ZMM(3)) VFMADD231PD(ZMM(23), ZMM(1), ZMM(3)) VFMADD231PD(ZMM(24), ZMM(0), ZMM(4)) VFMADD231PD(ZMM(25), ZMM(1), ZMM(4)) VFMADD231PD(ZMM(26), ZMM(0), ZMM(5)) VFMADD231PD(ZMM(27), ZMM(1), ZMM(5)) VFMADD231PD(ZMM(28), ZMM(0), ZMM(6)) VFMADD231PD(ZMM(29), ZMM(1), ZMM(6)) VFMADD231PD(ZMM(30), ZMM(0), ZMM(7)) VFMADD231PD(ZMM(31), ZMM(1), ZMM(7)) VMOVAPD(ZMM(0), MEM(RBX,2*64)) VMOVAPD(ZMM(1), MEM(RBX,3*64)) PREFETCH(0, MEM(RAX, 64*8)) PREFETCH(0, MEM(RAX, 72*8)) VBROADCASTSD(ZMM(2), MEM(RAX, 12*8)) // Iteration 1 VBROADCASTSD(ZMM(3), MEM(RAX, 13*8)) VBROADCASTSD(ZMM(4), MEM(RAX, 14*8)) VBROADCASTSD(ZMM(5), MEM(RAX, 15*8)) VBROADCASTSD(ZMM(6), MEM(RAX, 16*8)) VBROADCASTSD(ZMM(7), MEM(RAX, 17*8)) VFMADD231PD(ZMM( 8), ZMM(0), ZMM(2)) VFMADD231PD(ZMM( 9), ZMM(1), ZMM(2)) VFMADD231PD(ZMM(10), ZMM(0), ZMM(3)) VFMADD231PD(ZMM(11), ZMM(1), ZMM(3)) VFMADD231PD(ZMM(12), ZMM(0), ZMM(4)) VFMADD231PD(ZMM(13), ZMM(1), ZMM(4)) VFMADD231PD(ZMM(14), ZMM(0), ZMM(5)) VFMADD231PD(ZMM(15), ZMM(1), ZMM(5)) VFMADD231PD(ZMM(16), ZMM(0), ZMM(6)) VFMADD231PD(ZMM(17), ZMM(1), ZMM(6)) VFMADD231PD(ZMM(18), ZMM(0), ZMM(7)) VFMADD231PD(ZMM(19), ZMM(1), ZMM(7)) VBROADCASTSD(ZMM(2), MEM(RAX, 18*8)) VBROADCASTSD(ZMM(3), MEM(RAX, 19*8)) VBROADCASTSD(ZMM(4), MEM(RAX, 20*8)) VBROADCASTSD(ZMM(5), MEM(RAX, 21*8)) VBROADCASTSD(ZMM(6), MEM(RAX, 22*8)) VBROADCASTSD(ZMM(7), MEM(RAX, 23*8)) VFMADD231PD(ZMM(20), ZMM(0), ZMM(2)) VFMADD231PD(ZMM(21), ZMM(1), ZMM(2)) VFMADD231PD(ZMM(22), ZMM(0), ZMM(3)) VFMADD231PD(ZMM(23), ZMM(1), ZMM(3)) VFMADD231PD(ZMM(24), ZMM(0), ZMM(4)) VFMADD231PD(ZMM(25), ZMM(1), ZMM(4)) VFMADD231PD(ZMM(26), ZMM(0), ZMM(5)) VFMADD231PD(ZMM(27), ZMM(1), ZMM(5)) VFMADD231PD(ZMM(28), ZMM(0), ZMM(6)) VFMADD231PD(ZMM(29), ZMM(1), ZMM(6)) VFMADD231PD(ZMM(30), ZMM(0), ZMM(7)) VFMADD231PD(ZMM(31), ZMM(1), ZMM(7)) VMOVAPD(ZMM(0), MEM(RBX,4*64)) VMOVAPD(ZMM(1), MEM(RBX,5*64)) PREFETCH(0, MEM(RAX, 80*8)) PREFETCH(0, MEM(RAX, 88*8)) VBROADCASTSD(ZMM(2), MEM(RAX, 24*8)) // Iteration 2 VBROADCASTSD(ZMM(3), 
MEM(RAX, 25*8)) VBROADCASTSD(ZMM(4), MEM(RAX, 26*8)) VBROADCASTSD(ZMM(5), MEM(RAX, 27*8)) VBROADCASTSD(ZMM(6), MEM(RAX, 28*8)) VBROADCASTSD(ZMM(7), MEM(RAX, 29*8)) VFMADD231PD(ZMM( 8), ZMM(0), ZMM(2)) VFMADD231PD(ZMM( 9), ZMM(1), ZMM(2)) VFMADD231PD(ZMM(10), ZMM(0), ZMM(3)) VFMADD231PD(ZMM(11), ZMM(1), ZMM(3)) VFMADD231PD(ZMM(12), ZMM(0), ZMM(4)) VFMADD231PD(ZMM(13), ZMM(1), ZMM(4)) VFMADD231PD(ZMM(14), ZMM(0), ZMM(5)) VFMADD231PD(ZMM(15), ZMM(1), ZMM(5)) VFMADD231PD(ZMM(16), ZMM(0), ZMM(6)) VFMADD231PD(ZMM(17), ZMM(1), ZMM(6)) VFMADD231PD(ZMM(18), ZMM(0), ZMM(7)) VFMADD231PD(ZMM(19), ZMM(1), ZMM(7)) VBROADCASTSD(ZMM(2), MEM(RAX, 30*8)) VBROADCASTSD(ZMM(3), MEM(RAX, 31*8)) VBROADCASTSD(ZMM(4), MEM(RAX, 32*8)) VBROADCASTSD(ZMM(5), MEM(RAX, 33*8)) VBROADCASTSD(ZMM(6), MEM(RAX, 34*8)) VBROADCASTSD(ZMM(7), MEM(RAX, 35*8)) VFMADD231PD(ZMM(20), ZMM(0), ZMM(2)) VFMADD231PD(ZMM(21), ZMM(1), ZMM(2)) VFMADD231PD(ZMM(22), ZMM(0), ZMM(3)) VFMADD231PD(ZMM(23), ZMM(1), ZMM(3)) VFMADD231PD(ZMM(24), ZMM(0), ZMM(4)) VFMADD231PD(ZMM(25), ZMM(1), ZMM(4)) VFMADD231PD(ZMM(26), ZMM(0), ZMM(5)) VFMADD231PD(ZMM(27), ZMM(1), ZMM(5)) VFMADD231PD(ZMM(28), ZMM(0), ZMM(6)) VFMADD231PD(ZMM(29), ZMM(1), ZMM(6)) VFMADD231PD(ZMM(30), ZMM(0), ZMM(7)) VFMADD231PD(ZMM(31), ZMM(1), ZMM(7)) VMOVAPD(ZMM(0), MEM(RBX,6*64)) VMOVAPD(ZMM(1), MEM(RBX,7*64)) ADD(RBX, IMM(4*8*16)) PREFETCH(0, MEM(RAX, 96*8)) PREFETCH(0, MEM(RAX, 104*8)) VBROADCASTSD(ZMM(2), MEM(RAX, 36*8)) // Iteration 3 VBROADCASTSD(ZMM(3), MEM(RAX, 37*8)) VBROADCASTSD(ZMM(4), MEM(RAX, 38*8)) VBROADCASTSD(ZMM(5), MEM(RAX, 39*8)) VBROADCASTSD(ZMM(6), MEM(RAX, 40*8)) VBROADCASTSD(ZMM(7), MEM(RAX, 41*8)) VFMADD231PD(ZMM( 8), ZMM(0), ZMM(2)) VFMADD231PD(ZMM( 9), ZMM(1), ZMM(2)) VFMADD231PD(ZMM(10), ZMM(0), ZMM(3)) VFMADD231PD(ZMM(11), ZMM(1), ZMM(3)) VFMADD231PD(ZMM(12), ZMM(0), ZMM(4)) VFMADD231PD(ZMM(13), ZMM(1), ZMM(4)) VFMADD231PD(ZMM(14), ZMM(0), ZMM(5)) VFMADD231PD(ZMM(15), ZMM(1), ZMM(5)) VFMADD231PD(ZMM(16), ZMM(0), ZMM(6)) 
VFMADD231PD(ZMM(17), ZMM(1), ZMM(6)) VFMADD231PD(ZMM(18), ZMM(0), ZMM(7)) VFMADD231PD(ZMM(19), ZMM(1), ZMM(7)) VBROADCASTSD(ZMM(2), MEM(RAX, 42*8)) VBROADCASTSD(ZMM(3), MEM(RAX, 43*8)) VBROADCASTSD(ZMM(4), MEM(RAX, 44*8)) VBROADCASTSD(ZMM(5), MEM(RAX, 45*8)) VBROADCASTSD(ZMM(6), MEM(RAX, 46*8)) VBROADCASTSD(ZMM(7), MEM(RAX, 47*8)) VFMADD231PD(ZMM(20), ZMM(0), ZMM(2)) VFMADD231PD(ZMM(21), ZMM(1), ZMM(2)) VFMADD231PD(ZMM(22), ZMM(0), ZMM(3)) VFMADD231PD(ZMM(23), ZMM(1), ZMM(3)) VFMADD231PD(ZMM(24), ZMM(0), ZMM(4)) VFMADD231PD(ZMM(25), ZMM(1), ZMM(4)) VFMADD231PD(ZMM(26), ZMM(0), ZMM(5)) VFMADD231PD(ZMM(27), ZMM(1), ZMM(5)) VFMADD231PD(ZMM(28), ZMM(0), ZMM(6)) VFMADD231PD(ZMM(29), ZMM(1), ZMM(6)) VFMADD231PD(ZMM(30), ZMM(0), ZMM(7)) VFMADD231PD(ZMM(31), ZMM(1), ZMM(7)) ADD(RAX, IMM(4*8*12)) SUB(RSI, IMM(1)) VMOVAPD(ZMM(0), MEM(RBX,0*64)) VMOVAPD(ZMM(1), MEM(RBX,1*64)) JNZ(.DLOOPKITER) LABEL(.DCONSIDKLEFT) MOV(RSI, VAR(k)) AND(RSI, IMM(3)) // rsi = k%4 JZ(.DPOSTACCUM) ALIGN16 LABEL(.DLOOPKLEFT) VBROADCASTSD(ZMM(2), MEM(RAX, 0*8)) VBROADCASTSD(ZMM(3), MEM(RAX, 1*8)) VBROADCASTSD(ZMM(4), MEM(RAX, 2*8)) VBROADCASTSD(ZMM(5), MEM(RAX, 3*8)) VBROADCASTSD(ZMM(6), MEM(RAX, 4*8)) VBROADCASTSD(ZMM(7), MEM(RAX, 5*8)) VFMADD231PD(ZMM( 8), ZMM(0), ZMM(2)) VFMADD231PD(ZMM( 9), ZMM(1), ZMM(2)) VFMADD231PD(ZMM(10), ZMM(0), ZMM(3)) VFMADD231PD(ZMM(11), ZMM(1), ZMM(3)) VFMADD231PD(ZMM(12), ZMM(0), ZMM(4)) VFMADD231PD(ZMM(13), ZMM(1), ZMM(4)) VFMADD231PD(ZMM(14), ZMM(0), ZMM(5)) VFMADD231PD(ZMM(15), ZMM(1), ZMM(5)) VFMADD231PD(ZMM(16), ZMM(0), ZMM(6)) VFMADD231PD(ZMM(17), ZMM(1), ZMM(6)) VFMADD231PD(ZMM(18), ZMM(0), ZMM(7)) VFMADD231PD(ZMM(19), ZMM(1), ZMM(7)) VBROADCASTSD(ZMM(2), MEM(RAX, 6*8)) VBROADCASTSD(ZMM(3), MEM(RAX, 7*8)) VBROADCASTSD(ZMM(4), MEM(RAX, 8*8)) VBROADCASTSD(ZMM(5), MEM(RAX, 9*8)) VBROADCASTSD(ZMM(6), MEM(RAX, 10*8)) VBROADCASTSD(ZMM(7), MEM(RAX, 11*8)) VFMADD231PD(ZMM(20), ZMM(0), ZMM(2)) VFMADD231PD(ZMM(21), ZMM(1), ZMM(2)) VFMADD231PD(ZMM(22), ZMM(0), ZMM(3)) 
VFMADD231PD(ZMM(23), ZMM(1), ZMM(3)) VFMADD231PD(ZMM(24), ZMM(0), ZMM(4)) VFMADD231PD(ZMM(25), ZMM(1), ZMM(4)) VFMADD231PD(ZMM(26), ZMM(0), ZMM(5)) VFMADD231PD(ZMM(27), ZMM(1), ZMM(5)) VFMADD231PD(ZMM(28), ZMM(0), ZMM(6)) VFMADD231PD(ZMM(29), ZMM(1), ZMM(6)) VFMADD231PD(ZMM(30), ZMM(0), ZMM(7)) VFMADD231PD(ZMM(31), ZMM(1), ZMM(7)) ADD(RAX, IMM(12*8)) ADD(RBX, IMM(16*8)) SUB(RSI, IMM(1)) VMOVAPD(ZMM(0), MEM(RBX,0*64)) VMOVAPD(ZMM(1), MEM(RBX,1*64)) JNZ(.DLOOPKLEFT) LABEL(.DPOSTACCUM) MOV(RAX, VAR(alpha)) MOV(RBX, VAR(beta)) VBROADCASTSD(ZMM(0), MEM(RAX)) VBROADCASTSD(ZMM(1), MEM(RBX)) VMULPD(ZMM( 8), ZMM( 8), ZMM(0)) VMULPD(ZMM( 9), ZMM( 9), ZMM(0)) VMULPD(ZMM(10), ZMM(10), ZMM(0)) VMULPD(ZMM(11), ZMM(11), ZMM(0)) VMULPD(ZMM(12), ZMM(12), ZMM(0)) VMULPD(ZMM(13), ZMM(13), ZMM(0)) VMULPD(ZMM(14), ZMM(14), ZMM(0)) VMULPD(ZMM(15), ZMM(15), ZMM(0)) VMULPD(ZMM(16), ZMM(16), ZMM(0)) VMULPD(ZMM(17), ZMM(17), ZMM(0)) VMULPD(ZMM(18), ZMM(18), ZMM(0)) VMULPD(ZMM(19), ZMM(19), ZMM(0)) VMULPD(ZMM(20), ZMM(20), ZMM(0)) VMULPD(ZMM(21), ZMM(21), ZMM(0)) VMULPD(ZMM(22), ZMM(22), ZMM(0)) VMULPD(ZMM(23), ZMM(23), ZMM(0)) VMULPD(ZMM(24), ZMM(24), ZMM(0)) VMULPD(ZMM(25), ZMM(25), ZMM(0)) VMULPD(ZMM(26), ZMM(26), ZMM(0)) VMULPD(ZMM(27), ZMM(27), ZMM(0)) VMULPD(ZMM(28), ZMM(28), ZMM(0)) VMULPD(ZMM(29), ZMM(29), ZMM(0)) VMULPD(ZMM(30), ZMM(30), ZMM(0)) VMULPD(ZMM(31), ZMM(31), ZMM(0)) MOV(RDI, VAR(rs_c)) SUB(RDI, IMM(1)) JNZ(.DGENSTORED) LABEL(.ROWSTORED) MOV(RSI, VAR(cs_c)) MOV(R(8), MEM(RBX)) LEA(RSI, MEM(,RSI,8)) LEA(RDX, MEM(RCX,RSI,4)) LEA(RDI, MEM(RCX,RSI,8)) LEA(R(13), MEM(RSI,RSI,2)) SAL1(R(8)) // shift out the sign bit to check for +/- zero JZ(.DROWSTORBZ) VFMADD231PD(ZMM( 8), ZMM(1), MEM(RCX)) VFMADD231PD(ZMM( 9), ZMM(1), MEM(RCX,64)) VMOVUPD(MEM(RCX), ZMM( 8)) VMOVUPD(MEM(RCX,64), ZMM( 9)) VFMADD231PD(ZMM(10), ZMM(1), MEM(RCX,RSI,1)) VFMADD231PD(ZMM(11), ZMM(1), MEM(RCX,RSI,1,64)) VMOVUPD(MEM(RCX,RSI,1), ZMM(10)) VMOVUPD(MEM(RCX,RSI,1,64), ZMM(11)) VFMADD231PD(ZMM(12), ZMM(1), 
MEM(RCX,RSI,2)) VFMADD231PD(ZMM(13), ZMM(1), MEM(RCX,RSI,2,64)) VMOVUPD(MEM(RCX,RSI,2), ZMM(12)) VMOVUPD(MEM(RCX,RSI,2,64), ZMM(13)) VFMADD231PD(ZMM(14), ZMM(1), MEM(RCX,R(13),1)) VFMADD231PD(ZMM(15), ZMM(1), MEM(RCX,R(13),1,64)) VMOVUPD(MEM(RCX,R(13),1), ZMM(14)) VMOVUPD(MEM(RCX,R(13),1,64), ZMM(15)) VFMADD231PD(ZMM(16), ZMM(1), MEM(RDX)) VFMADD231PD(ZMM(17), ZMM(1), MEM(RDX,64)) VMOVUPD(MEM(RDX), ZMM(16)) VMOVUPD(MEM(RDX,64), ZMM(17)) VFMADD231PD(ZMM(18), ZMM(1), MEM(RDX,RSI,1)) VFMADD231PD(ZMM(19), ZMM(1), MEM(RDX,RSI,1,64)) VMOVUPD(MEM(RDX,RSI,1), ZMM(18)) VMOVUPD(MEM(RDX,RSI,1,64), ZMM(19)) VFMADD231PD(ZMM(20), ZMM(1), MEM(RDX,RSI,2)) VFMADD231PD(ZMM(21), ZMM(1), MEM(RDX,RSI,2,64)) VMOVUPD(MEM(RDX,RSI,2), ZMM(20)) VMOVUPD(MEM(RDX,RSI,2,64), ZMM(21)) VFMADD231PD(ZMM(22), ZMM(1), MEM(RDX,R(13),1)) VFMADD231PD(ZMM(23), ZMM(1), MEM(RDX,R(13),1,64)) VMOVUPD(MEM(RDX,R(13),1), ZMM(22)) VMOVUPD(MEM(RDX,R(13),1,64), ZMM(23)) VFMADD231PD(ZMM(24), ZMM(1), MEM(RDI)) VFMADD231PD(ZMM(25), ZMM(1), MEM(RDI,64)) VMOVUPD(MEM(RDI), ZMM(24)) VMOVUPD(MEM(RDI,64), ZMM(25)) VFMADD231PD(ZMM(26), ZMM(1), MEM(RDI,RSI,1)) VFMADD231PD(ZMM(27), ZMM(1), MEM(RDI,RSI,1,64)) VMOVUPD(MEM(RDI,RSI,1), ZMM(26)) VMOVUPD(MEM(RDI,RSI,1,64), ZMM(27)) VFMADD231PD(ZMM(28), ZMM(1), MEM(RDI,RSI,2)) VFMADD231PD(ZMM(29), ZMM(1), MEM(RDI,RSI,2,64)) VMOVUPD(MEM(RDI,RSI,2), ZMM(28)) VMOVUPD(MEM(RDI,RSI,2,64), ZMM(29)) VFMADD231PD(ZMM(30), ZMM(1), MEM(RDI,R(13),1)) VFMADD231PD(ZMM(31), ZMM(1), MEM(RDI,R(13),1,64)) VMOVUPD(MEM(RDI,R(13),1), ZMM(30)) VMOVUPD(MEM(RDI,R(13),1,64), ZMM(31)) JMP(.DDONE) LABEL(.DROWSTORBZ) VMOVUPD(MEM(RCX), ZMM( 8)) VMOVUPD(MEM(RCX,64), ZMM( 9)) VMOVUPD(MEM(RCX,RSI,1), ZMM(10)) VMOVUPD(MEM(RCX,RSI,1,64), ZMM(11)) VMOVUPD(MEM(RCX,RSI,2), ZMM(12)) VMOVUPD(MEM(RCX,RSI,2,64), ZMM(13)) VMOVUPD(MEM(RCX,R(13),1), ZMM(14)) VMOVUPD(MEM(RCX,R(13),1,64), ZMM(15)) VMOVUPD(MEM(RDX), ZMM(16)) VMOVUPD(MEM(RDX,64), ZMM(17)) VMOVUPD(MEM(RDX,RSI,1), ZMM(18)) VMOVUPD(MEM(RDX,RSI,1,64), ZMM(19)) 
VMOVUPD(MEM(RDX,RSI,2), ZMM(20)) VMOVUPD(MEM(RDX,RSI,2,64), ZMM(21)) VMOVUPD(MEM(RDX,R(13),1), ZMM(22)) VMOVUPD(MEM(RDX,R(13),1,64), ZMM(23)) VMOVUPD(MEM(RDI), ZMM(24)) VMOVUPD(MEM(RDI,64), ZMM(25)) VMOVUPD(MEM(RDI,RSI,1), ZMM(26)) VMOVUPD(MEM(RDI,RSI,1,64), ZMM(27)) VMOVUPD(MEM(RDI,RSI,2), ZMM(28)) VMOVUPD(MEM(RDI,RSI,2,64), ZMM(29)) VMOVUPD(MEM(RDI,R(13),1), ZMM(30)) VMOVUPD(MEM(RDI,R(13),1,64), ZMM(31)) JMP(.DDONE) LABEL(.DGENSTORED) MOV(RDI, VAR(cs_c)) MOV(RAX, VAR(rs_c)) LEA(RDI, MEM(,RDI,8)) MOV(R(8), MEM(RBX)) VBROADCASTSS(YMM(4), VAR(rs_c)) VMOVAPS(YMM(5), VAR(offsetPtr)) VPMULLD(YMM(4), YMM(5), YMM(4)) LEA(RDX, MEM(RCX,RAX,8)) MOV(RSI, 0x3F) SAL1(R(8)) // shift out the sign bit to check for +/- zero JZ(.DGENSTORBZ) KMOV(K(1), ESI) KMOV(K(2), ESI) VGATHERDPD(ZMM(2) MASK_K(1), MEM(RCX,YMM(4),8)) VGATHERDPD(ZMM(3) MASK_K(2), MEM(RDX,YMM(4),8)) VFMADD231PD(ZMM(2), ZMM(1), ZMM( 8)) VFMADD231PD(ZMM(3), ZMM(1), ZMM( 9)) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM( 8)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM( 9)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(3), ESI) KMOV(K(4), ESI) VGATHERDPD(ZMM(2) MASK_K(3), MEM(RCX,YMM(4),8)) VGATHERDPD(ZMM(3) MASK_K(4), MEM(RDX,YMM(4),8)) VFMADD231PD(ZMM(2), ZMM(1), ZMM(10)) VFMADD231PD(ZMM(3), ZMM(1), ZMM(11)) KMOV(K(3), ESI) KMOV(K(4), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(3), ZMM(10)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(4), ZMM(11)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VGATHERDPD(ZMM(2) MASK_K(1), MEM(RCX,YMM(4),8)) VGATHERDPD(ZMM(3) MASK_K(2), MEM(RDX,YMM(4),8)) VFMADD231PD(ZMM(2), ZMM(1), ZMM(12)) VFMADD231PD(ZMM(3), ZMM(1), ZMM(13)) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM(12)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(13)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(3), ESI) KMOV(K(4), ESI) VGATHERDPD(ZMM(2) MASK_K(3), MEM(RCX,YMM(4),8)) VGATHERDPD(ZMM(3) MASK_K(4), MEM(RDX,YMM(4),8)) VFMADD231PD(ZMM(2), ZMM(1), ZMM(14)) 
VFMADD231PD(ZMM(3), ZMM(1), ZMM(15)) KMOV(K(3), ESI) KMOV(K(4), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(3), ZMM(14)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(4), ZMM(15)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VGATHERDPD(ZMM(2) MASK_K(1), MEM(RCX,YMM(4),8)) VGATHERDPD(ZMM(3) MASK_K(2), MEM(RDX,YMM(4),8)) VFMADD231PD(ZMM(2), ZMM(1), ZMM(16)) VFMADD231PD(ZMM(3), ZMM(1), ZMM(17)) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM(16)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(17)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(3), ESI) KMOV(K(4), ESI) VGATHERDPD(ZMM(2) MASK_K(3), MEM(RCX,YMM(4),8)) VGATHERDPD(ZMM(3) MASK_K(4), MEM(RDX,YMM(4),8)) VFMADD231PD(ZMM(2), ZMM(1), ZMM(18)) VFMADD231PD(ZMM(3), ZMM(1), ZMM(19)) KMOV(K(3), ESI) KMOV(K(4), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(3), ZMM(18)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(4), ZMM(19)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VGATHERDPD(ZMM(2) MASK_K(1), MEM(RCX,YMM(4),8)) VGATHERDPD(ZMM(3) MASK_K(2), MEM(RDX,YMM(4),8)) VFMADD231PD(ZMM(2), ZMM(1), ZMM(20)) VFMADD231PD(ZMM(3), ZMM(1), ZMM(21)) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM(20)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(21)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(3), ESI) KMOV(K(4), ESI) VGATHERDPD(ZMM(2) MASK_K(3), MEM(RCX,YMM(4),8)) VGATHERDPD(ZMM(3) MASK_K(4), MEM(RDX,YMM(4),8)) VFMADD231PD(ZMM(2), ZMM(1), ZMM(22)) VFMADD231PD(ZMM(3), ZMM(1), ZMM(23)) KMOV(K(3), ESI) KMOV(K(4), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(3), ZMM(22)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(4), ZMM(23)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VGATHERDPD(ZMM(2) MASK_K(1), MEM(RCX,YMM(4),8)) VGATHERDPD(ZMM(3) MASK_K(2), MEM(RDX,YMM(4),8)) VFMADD231PD(ZMM(2), ZMM(1), ZMM(24)) VFMADD231PD(ZMM(3), ZMM(1), ZMM(25)) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM(24)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(25)) ADD(RCX, RDI) 
ADD(RDX, RDI) KMOV(K(3), ESI) KMOV(K(4), ESI) VGATHERDPD(ZMM(2) MASK_K(3), MEM(RCX,YMM(4),8)) VGATHERDPD(ZMM(3) MASK_K(4), MEM(RDX,YMM(4),8)) VFMADD231PD(ZMM(2), ZMM(1), ZMM(26)) VFMADD231PD(ZMM(3), ZMM(1), ZMM(27)) KMOV(K(3), ESI) KMOV(K(4), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(3), ZMM(26)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(4), ZMM(27)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VGATHERDPD(ZMM(2) MASK_K(1), MEM(RCX,YMM(4),8)) VGATHERDPD(ZMM(3) MASK_K(2), MEM(RDX,YMM(4),8)) VFMADD231PD(ZMM(2), ZMM(1), ZMM(28)) VFMADD231PD(ZMM(3), ZMM(1), ZMM(29)) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM(28)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(29)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(3), ESI) KMOV(K(4), ESI) VGATHERDPD(ZMM(2) MASK_K(3), MEM(RCX,YMM(4),8)) VGATHERDPD(ZMM(3) MASK_K(4), MEM(RDX,YMM(4),8)) VFMADD231PD(ZMM(2), ZMM(1), ZMM(30)) VFMADD231PD(ZMM(3), ZMM(1), ZMM(31)) KMOV(K(3), ESI) KMOV(K(4), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(3), ZMM(30)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(4), ZMM(31)) ADD(RCX, RDI) ADD(RDX, RDI) JMP(.DDONE) LABEL(.DGENSTORBZ) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM( 8)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM( 9)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM(10)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(11)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM(12)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(13)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM(14)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(15)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM(16)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(17)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) 
MASK_K(1), ZMM(18)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(19)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM(20)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(21)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM(22)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(23)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM(24)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(25)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM(26)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(27)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM(28)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(29)) ADD(RCX, RDI) ADD(RDX, RDI) KMOV(K(1), ESI) KMOV(K(2), ESI) VSCATTERDPD(MEM(RCX,YMM(4),8) MASK_K(1), ZMM(30)) VSCATTERDPD(MEM(RDX,YMM(4),8) MASK_K(2), ZMM(31)) ADD(RCX, RDI) ADD(RDX, RDI) LABEL(.DDONE) : // output operands (none) : // input operands [k] "m" (k), [a] "m" (a), [b] "m" (b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), //[a_next] "m" (a_next), //[b_next] "m" (b_next), [offsetPtr] "m" (offsetPtr) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" ); } cython-blis-0.9.1/blis/_src/kernels/knl/3/other/bli_dgemm_knl_asm_30x8.c000066400000000000000000000563231427272030600257320ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #include #include "bli_avx512_macros.h" #define UNROLL_K 32 #define SCATTER_PREFETCH_C 1 #define PREFETCH_A_L2 0 #define PREFETCH_B_L2 0 #define L2_PREFETCH_DIST 64 #define A_L1_PREFETCH_DIST 18 #define B_L1_PREFETCH_DIST 18 #define LOOP_ALIGN ALIGN16 #define UPDATE_C_FOUR_ROWS(R1,R2,R3,R4) \ \ VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ VFMADD231PD(ZMM(R1), ZMM(1), MEM(RCX )) \ VFMADD231PD(ZMM(R2), ZMM(1), MEM(RCX,RAX,1)) \ VFMADD231PD(ZMM(R3), ZMM(1), MEM(RCX,RAX,2)) \ VFMADD231PD(ZMM(R4), ZMM(1), MEM(RCX,RDI,1)) \ VMOVUPD(MEM(RCX ), ZMM(R1)) \ VMOVUPD(MEM(RCX,RAX,1), ZMM(R2)) \ VMOVUPD(MEM(RCX,RAX,2), ZMM(R3)) \ VMOVUPD(MEM(RCX,RDI,1), ZMM(R4)) \ LEA(RCX, MEM(RCX,RAX,4)) #define UPDATE_C_TWO_ROWS(R1,R2) \ \ VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ VFMADD231PD(ZMM(R1), ZMM(1), MEM(RCX )) \ VFMADD231PD(ZMM(R2), ZMM(1), MEM(RCX,RAX,1)) \ VMOVUPD(MEM(RCX ), ZMM(R1)) \ VMOVUPD(MEM(RCX,RAX,1), ZMM(R2)) \ LEA(RCX, MEM(RCX,RAX,2)) #define UPDATE_C_BZ_FOUR_ROWS(R1,R2,R3,R4) \ \ VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ VMOVUPD(MEM(RCX ), ZMM(R1)) \ VMOVUPD(MEM(RCX,RAX,1), ZMM(R2)) \ VMOVUPD(MEM(RCX,RAX,2), ZMM(R3)) \ VMOVUPD(MEM(RCX,RDI,1), ZMM(R4)) \ LEA(RCX, MEM(RCX,RAX,4)) #define UPDATE_C_BZ_TWO_ROWS(R1,R2) \ \ VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ VMOVUPD(MEM(RCX ), ZMM(R1)) \ VMOVUPD(MEM(RCX,RAX,1), ZMM(R2)) \ LEA(RCX, MEM(RCX,RAX,2)) #define UPDATE_C_ROW_SCATTERED(NUM) \ \ KXNORW(K(1), K(0), K(0)) \ KXNORW(K(2), K(0), K(0)) \ VGATHERDPD(ZMM(1) MASK_K(1), MEM(RCX,YMM(0),8)) \ VFMADD231PD(ZMM(NUM), ZMM(1), MEM_1TO8(RBX)) \ VSCATTERDPD(MEM(RCX,YMM(0),8) MASK_K(2), ZMM(NUM)) \ ADD(RCX, RAX) #define UPDATE_C_BZ_ROW_SCATTERED(NUM) \ \ KXNORW(K(1), K(0), K(0)) \ VSCATTERDPD(MEM(RCX,YMM(0),8) 
MASK_K(1), ZMM(NUM)) \ ADD(RCX, RAX) #define PREFETCH_A_L1_1(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*32*8)) #define PREFETCH_A_L1_2(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*32*8+64)) #define PREFETCH_A_L1_3(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*32*8+128)) #define PREFETCH_A_L1_4(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*32*8+192)) #if PREFETCH_A_L2 #undef PREFETCH_A_L2 #define PREFETCH_A_L2(n) \ \ PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*32*8)) \ PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*32*8+64)) \ PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*32*8+128)) \ PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*32*8+192)) #else #undef PREFETCH_A_L2 #define PREFETCH_A_L2(...) #endif #define PREFETCH_B_L1(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*8*8)) #if PREFETCH_B_L2 #undef PREFETCH_B_L2 #define PREFETCH_B_L2(n) PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*8*8)) #else #undef PREFETCH_B_L2 #define PREFETCH_B_L2(...) #endif #define PREFETCH_C_L1_1 #define PREFETCH_C_L1_2 #define PREFETCH_C_L1_3 #define PREFETCH_C_L1_4 // // n: index in unrolled loop // // a: ZMM register to load into // b: ZMM register to read from // // ...: addressing for A, except for offset // #define SUBITER(n,a,b,...) 
\ \ PREFETCH_A_L2(n) \ \ VMOVAPD(ZMM(a), MEM(RBX,(n+1)*64)) \ VFMADD231PD(ZMM( 2), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+ 0)*8)) \ VFMADD231PD(ZMM( 3), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+ 1)*8)) \ VFMADD231PD(ZMM( 4), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+ 2)*8)) \ PREFETCH_A_L1_1(n) \ VFMADD231PD(ZMM( 5), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+ 3)*8)) \ VFMADD231PD(ZMM( 6), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+ 4)*8)) \ VFMADD231PD(ZMM( 7), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+ 5)*8)) \ PREFETCH_C_L1_1 \ VFMADD231PD(ZMM( 8), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+ 6)*8)) \ VFMADD231PD(ZMM( 9), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+ 7)*8)) \ VFMADD231PD(ZMM(10), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+ 8)*8)) \ PREFETCH_A_L1_2(n) \ VFMADD231PD(ZMM(11), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+ 9)*8)) \ VFMADD231PD(ZMM(12), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+10)*8)) \ VFMADD231PD(ZMM(13), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+11)*8)) \ PREFETCH_C_L1_2 \ VFMADD231PD(ZMM(14), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+12)*8)) \ VFMADD231PD(ZMM(15), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+13)*8)) \ VFMADD231PD(ZMM(16), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+14)*8)) \ PREFETCH_A_L1_3(n) \ VFMADD231PD(ZMM(17), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+15)*8)) \ VFMADD231PD(ZMM(18), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+16)*8)) \ VFMADD231PD(ZMM(19), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+17)*8)) \ PREFETCH_C_L1_3 \ VFMADD231PD(ZMM(20), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+18)*8)) \ VFMADD231PD(ZMM(21), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+19)*8)) \ VFMADD231PD(ZMM(22), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+20)*8)) \ PREFETCH_A_L1_4(n) \ VFMADD231PD(ZMM(23), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+21)*8)) \ VFMADD231PD(ZMM(24), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+22)*8)) \ VFMADD231PD(ZMM(25), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+23)*8)) \ PREFETCH_C_L1_4 \ VFMADD231PD(ZMM(26), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+24)*8)) \ 
VFMADD231PD(ZMM(27), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+25)*8)) \ VFMADD231PD(ZMM(28), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+26)*8)) \ PREFETCH_B_L1(n) \ VFMADD231PD(ZMM(29), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+27)*8)) \ VFMADD231PD(ZMM(30), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+28)*8)) \ VFMADD231PD(ZMM(31), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*32+29)*8)) \ PREFETCH_B_L2(n) //This is an array used for the scatter/gather instructions. extern int32_t offsets[32]; //#define MONITORS //#define LOOPMON void bli_dgemm_knl_asm_30x8 ( dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { const int32_t * offsetPtr = &offsets[0]; uint64_t k64 = k; __asm__ volatile ( VPXORD(ZMM(2), ZMM(2), ZMM(2)) //clear out registers VMOVAPS(ZMM( 3), ZMM(2)) VMOVAPS(ZMM( 4), ZMM(2)) VMOVAPS(ZMM( 5), ZMM(2)) VMOVAPS(ZMM( 6), ZMM(2)) VMOVAPS(ZMM( 7), ZMM(2)) VMOVAPS(ZMM( 8), ZMM(2)) VMOVAPS(ZMM( 9), ZMM(2)) MOV(R12, VAR(rs_c)) VMOVAPS(ZMM(10), ZMM(2)) MOV(RSI, VAR(k)) //loop index VMOVAPS(ZMM(11), ZMM(2)) MOV(RAX, VAR(a)) //load address of a VMOVAPS(ZMM(12), ZMM(2)) MOV(RBX, VAR(b)) //load address of b VMOVAPS(ZMM(13), ZMM(2)) MOV(RCX, VAR(c)) //load address of c VMOVAPS(ZMM(14), ZMM(2)) VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b VMOVAPS(ZMM(15), ZMM(2)) MOV(RDI, VAR(offsetPtr)) VMOVAPS(ZMM(16), ZMM(2)) VMOVAPS(ZMM(17), ZMM(2)) VMOVAPS(ZMM(18), ZMM(2)) VMOVAPS(ZMM(19), ZMM(2)) LEA(R13, MEM(R12,R12,2)) VMOVAPS(ZMM(20), ZMM(2)) LEA(R14, MEM(R12,R12,4)) VMOVAPS(ZMM(21), ZMM(2)) LEA(R15, MEM(R13,R12,4)) VMOVAPS(ZMM(22), ZMM(2)) VMOVAPS(ZMM(23), ZMM(2)) VMOVAPS(ZMM(24), ZMM(2)) VMOVAPS(ZMM(25), ZMM(2)) MOV(R8, IMM(4*32*8)) //offset for 4 iterations VMOVAPS(ZMM(26), ZMM(2)) LEA(R9, MEM(R8,R8,2)) //*3 VMOVAPS(ZMM(27), ZMM(2)) LEA(R10, MEM(R8,R8,4)) //*5 VMOVAPS(ZMM(28), ZMM(2)) LEA(R11, MEM(R9,R8,4)) //*7 VMOVAPS(ZMM(29), ZMM(2)) VMOVAPS(ZMM(30), ZMM(2)) 
VMOVAPS(ZMM(31), ZMM(2)) SUB(RSI, IMM(38)) JLE(TAIL) //prefetch C into L2 #if SCATTER_PREFETCH_C VPBROADCASTD(ZMM(0), R12D) VPBROADCASTD(ZMM(1), R12D) VPMULLD(ZMM(0), ZMM(0), MEM(RDI)) VPMULLD(ZMM(1), ZMM(1), MEM(RDI,64)) ADD(RSI, IMM(30)) KXNORW(K(1), K(0), K(0)) KSHIFTRW(K(2), K(1), IMM(2)) VSCATTERPFDPS(1, MEM(RCX,ZMM(0),8) MASK_K(1)) VSCATTERPFDPS(1, MEM(RCX,ZMM(1),8) MASK_K(2)) VMOVAPD(ZMM(0), MEM(RBX)) #else PREFETCHW1(MEM(RCX )) SUBITER( 0,1,0,RAX ) PREFETCHW1(MEM(RCX,R12,1)) SUBITER( 1,0,1,RAX ) PREFETCHW1(MEM(RCX,R12,2)) SUBITER( 2,1,0,RAX ) PREFETCHW1(MEM(RCX,R13,1)) SUBITER( 3,0,1,RAX ) PREFETCHW1(MEM(RCX,R12,4)) SUBITER( 4,1,0,RAX,R8, 1) PREFETCHW1(MEM(RCX,R14,1)) SUBITER( 5,0,1,RAX,R8, 1) PREFETCHW1(MEM(RCX,R13,2)) SUBITER( 6,1,0,RAX,R8, 1) PREFETCHW1(MEM(RCX,R15,1)) LEA(RDX, MEM(RCX,R12,8)) SUBITER( 7,0,1,RAX,R8, 1) PREFETCHW1(MEM(RDX )) SUBITER( 8,1,0,RAX,R8, 2) PREFETCHW1(MEM(RDX,R12,1)) SUBITER( 9,0,1,RAX,R8, 2) PREFETCHW1(MEM(RDX,R12,2)) SUBITER(10,1,0,RAX,R8, 2) PREFETCHW1(MEM(RDX,R13,1)) SUBITER(11,0,1,RAX,R8, 2) PREFETCHW1(MEM(RDX,R12,4)) SUBITER(12,1,0,RAX,R9, 1) PREFETCHW1(MEM(RDX,R14,1)) SUBITER(13,0,1,RAX,R9, 1) PREFETCHW1(MEM(RDX,R13,2)) SUBITER(14,1,0,RAX,R9, 1) PREFETCHW1(MEM(RDX,R15,1)) LEA(RDX, MEM(RDX,R12,8)) SUBITER(15,0,1,RAX,R9, 1) PREFETCHW1(MEM(RDX )) SUBITER(16,1,0,RAX,R8, 4) PREFETCHW1(MEM(RDX,R12,1)) SUBITER(17,0,1,RAX,R8, 4) PREFETCHW1(MEM(RDX,R12,2)) SUBITER(18,1,0,RAX,R8, 4) PREFETCHW1(MEM(RDX,R13,1)) SUBITER(19,0,1,RAX,R8, 4) PREFETCHW1(MEM(RDX,R12,4)) SUBITER(20,1,0,RAX,R10,1) PREFETCHW1(MEM(RDX,R14,1)) SUBITER(21,0,1,RAX,R10,1) PREFETCHW1(MEM(RDX,R13,2)) SUBITER(22,1,0,RAX,R10,1) PREFETCHW1(MEM(RDX,R15,1)) LEA(RDX, MEM(RDX,R12,8)) SUBITER(23,0,1,RAX,R10,1) PREFETCHW1(MEM(RDX )) SUBITER(24,1,0,RAX,R9, 2) PREFETCHW1(MEM(RDX,R12,1)) SUBITER(25,0,1,RAX,R9, 2) PREFETCHW1(MEM(RDX,R12,2)) SUBITER(26,1,0,RAX,R9, 2) PREFETCHW1(MEM(RDX,R13,1)) SUBITER(27,0,1,RAX,R9, 2) PREFETCHW1(MEM(RDX,R12,4)) SUBITER(28,1,0,RAX,R11,1) 
PREFETCHW1(MEM(RDX,R14,1)) SUBITER(29,0,1,RAX,R11,1) ADD(RAX, IMM(30*32*8)) ADD(RBX, IMM(30* 8*8)) #endif MOV(RDI, RSI) AND(RDI, IMM(31)) SAR(RSI, IMM(5)) JZ(REM_1) LOOP_ALIGN LABEL(MAIN_LOOP) SUBITER( 0,1,0,RAX ) SUBITER( 1,0,1,RAX ) SUBITER( 2,1,0,RAX ) SUBITER( 3,0,1,RAX ) SUBITER( 4,1,0,RAX,R8, 1) SUBITER( 5,0,1,RAX,R8, 1) SUBITER( 6,1,0,RAX,R8, 1) SUBITER( 7,0,1,RAX,R8, 1) SUBITER( 8,1,0,RAX,R8, 2) SUBITER( 9,0,1,RAX,R8, 2) SUBITER(10,1,0,RAX,R8, 2) SUBITER(11,0,1,RAX,R8, 2) SUBITER(12,1,0,RAX,R9, 1) SUBITER(13,0,1,RAX,R9, 1) SUBITER(14,1,0,RAX,R9, 1) SUBITER(15,0,1,RAX,R9, 1) SUBITER(16,1,0,RAX,R8, 4) SUBITER(17,0,1,RAX,R8, 4) SUBITER(18,1,0,RAX,R8, 4) SUBITER(19,0,1,RAX,R8, 4) SUBITER(20,1,0,RAX,R10,1) SUBITER(21,0,1,RAX,R10,1) SUBITER(22,1,0,RAX,R10,1) SUBITER(23,0,1,RAX,R10,1) SUBITER(24,1,0,RAX,R9, 2) SUBITER(25,0,1,RAX,R9, 2) SUBITER(26,1,0,RAX,R9, 2) SUBITER(27,0,1,RAX,R9, 2) SUBITER(28,1,0,RAX,R11,1) SUBITER(29,0,1,RAX,R11,1) SUBITER(30,1,0,RAX,R11,1) SUBITER(31,0,1,RAX,R11,1) ADD(RAX, IMM(32*32*8)) ADD(RBX, IMM(32* 8*8)) SUB(RSI, IMM(1)) JNZ(MAIN_LOOP) LABEL(REM_1) SAR1(RDI) JNC(REM_2) SUBITER(0,1,0,RAX) VMOVAPD(ZMM(0), ZMM(1)) ADD(RAX, IMM(32*8)) ADD(RBX, IMM( 8*8)) LABEL(REM_2) SAR1(RDI) JNC(REM_4) SUBITER(0,1,0,RAX) SUBITER(1,0,1,RAX) ADD(RAX, IMM(2*32*8)) ADD(RBX, IMM(2* 8*8)) LABEL(REM_4) SAR1(RDI) JNC(REM_8) SUBITER(0,1,0,RAX) SUBITER(1,0,1,RAX) SUBITER(2,1,0,RAX) SUBITER(3,0,1,RAX) ADD(RAX, IMM(4*32*8)) ADD(RBX, IMM(4* 8*8)) LABEL(REM_8) SAR1(RDI) JNC(REM_16) SUBITER(0,1,0,RAX ) SUBITER(1,0,1,RAX ) SUBITER(2,1,0,RAX ) SUBITER(3,0,1,RAX ) SUBITER(4,1,0,RAX,R8,1) SUBITER(5,0,1,RAX,R8,1) SUBITER(6,1,0,RAX,R8,1) SUBITER(7,0,1,RAX,R8,1) ADD(RAX, IMM(8*32*8)) ADD(RBX, IMM(8* 8*8)) LABEL(REM_16) SAR1(RDI) JNC(AFTER_LOOP) SUBITER( 0,1,0,RAX ) SUBITER( 1,0,1,RAX ) SUBITER( 2,1,0,RAX ) SUBITER( 3,0,1,RAX ) SUBITER( 4,1,0,RAX,R8, 1) SUBITER( 5,0,1,RAX,R8, 1) SUBITER( 6,1,0,RAX,R8, 1) SUBITER( 7,0,1,RAX,R8, 1) SUBITER( 8,1,0,RAX,R8, 2) SUBITER( 
9,0,1,RAX,R8, 2) SUBITER(10,1,0,RAX,R8, 2) SUBITER(11,0,1,RAX,R8, 2) SUBITER(12,1,0,RAX,R9, 1) SUBITER(13,0,1,RAX,R9, 1) SUBITER(14,1,0,RAX,R9, 1) SUBITER(15,0,1,RAX,R9, 1) ADD(RAX, IMM(16*32*8)) ADD(RBX, IMM(16* 8*8)) LABEL(AFTER_LOOP) //prefetch C into L1 #if SCATTER_PREFETCH_C MOV(RDI, VAR(offsetPtr)) VPBROADCASTD(ZMM(0), R12D) VPBROADCASTD(ZMM(1), R12D) VPMULLD(ZMM(0), ZMM(0), MEM(RDI)) VPMULLD(ZMM(1), ZMM(1), MEM(RDI,64)) KXNORW(K(1), K(0), K(0)) KSHIFTRW(K(2), K(1), IMM(2)) VSCATTERPFDPS(0, MEM(RCX,ZMM(0),8) MASK_K(1)) VSCATTERPFDPS(0, MEM(RCX,ZMM(1),8) MASK_K(2)) VMOVAPD(ZMM(0), MEM(RBX)) SUBITER(0,1,0,RAX ) SUBITER(1,0,1,RAX ) SUBITER(2,1,0,RAX ) SUBITER(3,0,1,RAX ) SUBITER(4,1,0,RAX,R8,1) SUBITER(5,0,1,RAX,R8,1) SUBITER(6,1,0,RAX,R8,1) SUBITER(7,0,1,RAX,R8,1) #else #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #undef PREFETCH_C_L1_4 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX )) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,1)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R12,2)) #define PREFETCH_C_L1_4 PREFETCHW0(MEM(RCX,R13,1)) SUBITER(0,1,0,RAX ) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #undef PREFETCH_C_L1_4 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R12,4)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R14,1)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R13,2)) #define PREFETCH_C_L1_4 PREFETCHW0(MEM(RCX,R15,1)) SUBITER(1,0,1,RAX ) LEA(RDX, MEM(RCX,R12,8)) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #undef PREFETCH_C_L1_4 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX )) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R12,1)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R12,2)) #define PREFETCH_C_L1_4 PREFETCHW0(MEM(RDX,R13,1)) SUBITER(2,1,0,RAX ) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #undef PREFETCH_C_L1_4 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,4)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R14,1)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,2)) 
#define PREFETCH_C_L1_4 PREFETCHW0(MEM(RDX,R15,1)) SUBITER(3,0,1,RAX ) LEA(RDX, MEM(RDX,R12,8)) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #undef PREFETCH_C_L1_4 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX )) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R12,1)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R12,2)) #define PREFETCH_C_L1_4 PREFETCHW0(MEM(RDX,R13,1)) SUBITER(4,1,0,RAX,R8,1) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #undef PREFETCH_C_L1_4 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,4)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R14,1)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,2)) #define PREFETCH_C_L1_4 PREFETCHW0(MEM(RDX,R15,1)) SUBITER(5,0,1,RAX,R8,1) LEA(RDX, MEM(RDX,R12,8)) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #undef PREFETCH_C_L1_4 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX )) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R12,1)) #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R12,2)) #define PREFETCH_C_L1_4 PREFETCHW0(MEM(RDX,R13,1)) SUBITER(6,1,0,RAX,R8,1) #undef PREFETCH_C_L1_1 #undef PREFETCH_C_L1_2 #undef PREFETCH_C_L1_3 #undef PREFETCH_C_L1_4 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,4)) #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R14,1)) #define PREFETCH_C_L1_3 #define PREFETCH_C_L1_4 SUBITER(7,0,1,RAX,R8,1) #endif JMP(POSTACCUM) LABEL(TAIL) MOV(RDX, RCX) ADD(RSI, IMM(38)) LABEL(TAIL_LOOP) PREFETCHW0(MEM(RDX)) ADD(RDX, R12) SUBITER(0,1,0,RAX) VMOVAPD(ZMM(0), ZMM(1)) ADD(RAX, IMM(32*8)) ADD(RBX, IMM( 8*8)) SUB(RSI, IMM(1)) JNZ(TAIL_LOOP) LABEL(POSTACCUM) MOV(RAX, VAR(alpha)) MOV(RBX, VAR(beta)) VBROADCASTSD(ZMM(0), MEM(RAX)) VBROADCASTSD(ZMM(1), MEM(RBX)) // Check if C is row stride. 
If not, jump to the slow scattered update MOV(RAX, VAR(rs_c)) LEA(RAX, MEM(,RAX,8)) MOV(RBX, VAR(cs_c)) LEA(RDI, MEM(RAX,RAX,2)) CMP(RBX, IMM(1)) JNE(SCATTEREDUPDATE) VMOVQ(RDX, XMM(1)) SAL1(RDX) //shift out sign bit JZ(COLSTORBZ) UPDATE_C_FOUR_ROWS( 2, 3, 4, 5) UPDATE_C_FOUR_ROWS( 6, 7, 8, 9) UPDATE_C_FOUR_ROWS(10,11,12,13) UPDATE_C_FOUR_ROWS(14,15,16,17) UPDATE_C_FOUR_ROWS(18,19,20,21) UPDATE_C_FOUR_ROWS(22,23,24,25) UPDATE_C_FOUR_ROWS(26,27,28,29) UPDATE_C_TWO_ROWS (30,31) JMP(END) LABEL(COLSTORBZ) UPDATE_C_BZ_FOUR_ROWS( 2, 3, 4, 5) UPDATE_C_BZ_FOUR_ROWS( 6, 7, 8, 9) UPDATE_C_BZ_FOUR_ROWS(10,11,12,13) UPDATE_C_BZ_FOUR_ROWS(14,15,16,17) UPDATE_C_BZ_FOUR_ROWS(18,19,20,21) UPDATE_C_BZ_FOUR_ROWS(22,23,24,25) UPDATE_C_BZ_FOUR_ROWS(26,27,28,29) UPDATE_C_BZ_TWO_ROWS (30,31) JMP(END) LABEL(SCATTEREDUPDATE) VMULPD(ZMM( 2), ZMM( 2), ZMM(0)) VMULPD(ZMM( 3), ZMM( 3), ZMM(0)) VMULPD(ZMM( 4), ZMM( 4), ZMM(0)) VMULPD(ZMM( 5), ZMM( 5), ZMM(0)) VMULPD(ZMM( 6), ZMM( 6), ZMM(0)) VMULPD(ZMM( 7), ZMM( 7), ZMM(0)) VMULPD(ZMM( 8), ZMM( 8), ZMM(0)) VMULPD(ZMM( 9), ZMM( 9), ZMM(0)) VMULPD(ZMM(10), ZMM(10), ZMM(0)) VMULPD(ZMM(11), ZMM(11), ZMM(0)) VMULPD(ZMM(12), ZMM(12), ZMM(0)) VMULPD(ZMM(13), ZMM(13), ZMM(0)) VMULPD(ZMM(14), ZMM(14), ZMM(0)) VMULPD(ZMM(15), ZMM(15), ZMM(0)) VMULPD(ZMM(16), ZMM(16), ZMM(0)) VMULPD(ZMM(17), ZMM(17), ZMM(0)) VMULPD(ZMM(18), ZMM(18), ZMM(0)) VMULPD(ZMM(19), ZMM(19), ZMM(0)) VMULPD(ZMM(20), ZMM(20), ZMM(0)) VMULPD(ZMM(21), ZMM(21), ZMM(0)) VMULPD(ZMM(22), ZMM(22), ZMM(0)) VMULPD(ZMM(23), ZMM(23), ZMM(0)) VMULPD(ZMM(24), ZMM(24), ZMM(0)) VMULPD(ZMM(25), ZMM(25), ZMM(0)) VMULPD(ZMM(26), ZMM(26), ZMM(0)) VMULPD(ZMM(27), ZMM(27), ZMM(0)) VMULPD(ZMM(28), ZMM(28), ZMM(0)) VMULPD(ZMM(29), ZMM(29), ZMM(0)) VMULPD(ZMM(30), ZMM(30), ZMM(0)) VMULPD(ZMM(31), ZMM(31), ZMM(0)) VMOVQ(RDX, XMM(1)) /* Note that this ignores the upper 32 bits in cs_c */ MOV(RDI, VAR(offsetPtr)) VPBROADCASTD(ZMM(0), EBX) VPMULLD(ZMM(0), ZMM(0), MEM(RDI)) MOV(RBX, VAR(beta)) SAL1(RDX) //shift 
out sign bit JZ(SCATTERBZ) UPDATE_C_ROW_SCATTERED( 2) UPDATE_C_ROW_SCATTERED( 3) UPDATE_C_ROW_SCATTERED( 4) UPDATE_C_ROW_SCATTERED( 5) UPDATE_C_ROW_SCATTERED( 6) UPDATE_C_ROW_SCATTERED( 7) UPDATE_C_ROW_SCATTERED( 8) UPDATE_C_ROW_SCATTERED( 9) UPDATE_C_ROW_SCATTERED(10) UPDATE_C_ROW_SCATTERED(11) UPDATE_C_ROW_SCATTERED(12) UPDATE_C_ROW_SCATTERED(13) UPDATE_C_ROW_SCATTERED(14) UPDATE_C_ROW_SCATTERED(15) UPDATE_C_ROW_SCATTERED(16) UPDATE_C_ROW_SCATTERED(17) UPDATE_C_ROW_SCATTERED(18) UPDATE_C_ROW_SCATTERED(19) UPDATE_C_ROW_SCATTERED(20) UPDATE_C_ROW_SCATTERED(21) UPDATE_C_ROW_SCATTERED(22) UPDATE_C_ROW_SCATTERED(23) UPDATE_C_ROW_SCATTERED(24) UPDATE_C_ROW_SCATTERED(25) UPDATE_C_ROW_SCATTERED(26) UPDATE_C_ROW_SCATTERED(27) UPDATE_C_ROW_SCATTERED(28) UPDATE_C_ROW_SCATTERED(29) UPDATE_C_ROW_SCATTERED(30) UPDATE_C_ROW_SCATTERED(31) JMP(END) LABEL(SCATTERBZ) UPDATE_C_BZ_ROW_SCATTERED( 2) UPDATE_C_BZ_ROW_SCATTERED( 3) UPDATE_C_BZ_ROW_SCATTERED( 4) UPDATE_C_BZ_ROW_SCATTERED( 5) UPDATE_C_BZ_ROW_SCATTERED( 6) UPDATE_C_BZ_ROW_SCATTERED( 7) UPDATE_C_BZ_ROW_SCATTERED( 8) UPDATE_C_BZ_ROW_SCATTERED( 9) UPDATE_C_BZ_ROW_SCATTERED(10) UPDATE_C_BZ_ROW_SCATTERED(11) UPDATE_C_BZ_ROW_SCATTERED(12) UPDATE_C_BZ_ROW_SCATTERED(13) UPDATE_C_BZ_ROW_SCATTERED(14) UPDATE_C_BZ_ROW_SCATTERED(15) UPDATE_C_BZ_ROW_SCATTERED(16) UPDATE_C_BZ_ROW_SCATTERED(17) UPDATE_C_BZ_ROW_SCATTERED(18) UPDATE_C_BZ_ROW_SCATTERED(19) UPDATE_C_BZ_ROW_SCATTERED(20) UPDATE_C_BZ_ROW_SCATTERED(21) UPDATE_C_BZ_ROW_SCATTERED(22) UPDATE_C_BZ_ROW_SCATTERED(23) UPDATE_C_BZ_ROW_SCATTERED(24) UPDATE_C_BZ_ROW_SCATTERED(25) UPDATE_C_BZ_ROW_SCATTERED(26) UPDATE_C_BZ_ROW_SCATTERED(27) UPDATE_C_BZ_ROW_SCATTERED(28) UPDATE_C_BZ_ROW_SCATTERED(29) UPDATE_C_BZ_ROW_SCATTERED(30) UPDATE_C_BZ_ROW_SCATTERED(31) LABEL(END) : // output operands : // input operands [k] "m" (k64), [a] "m" (a), [b] "m" (b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), [offsetPtr] "m" (offsetPtr) : // register clobber 
list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" ); } cython-blis-0.9.1/blis/_src/kernels/knl/3/other/bli_dgemm_knl_asm_30x8_knc.c000066400000000000000000000371001427272030600265550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"
// NOTE(review): the next directive carries no header name in this copy of the
// file (an angle-bracket name appears to have been stripped) -- restore it
// from the upstream source before compiling.
#include

#include "bli_avx512_macros.h"

// Prefetch distances, counted in iterations of the k loop. They are turned
// into byte offsets by PREFETCH_A_L1 / PREFETCH_B_L1 below and by the
// R13/R14 setup in the kernel.
#define A_L1_PREFETCH_DIST 4
#define B_L1_PREFETCH_DIST 2
#define L2_PREFETCH_DIST 16 // Must be greater than 10, because of the way the loop is constructed.

//Alternate code path used if C is not row-major
// r9 = c
// ymm0 = cs_c * 1...8
// r11 = rs_c
// r12 = &alpha
// r13 = &beta
//
// Updates one row of C held in accumulator ZMM(NUM): the row is gathered from
// R9 + YMM0*8 into ZMM31, the accumulator is scaled by alpha, beta times the
// gathered row is added, the result is scattered back, and R9 is advanced by
// R11. Everything placed between BNZ1 and BNZ2 is the beta-dependent part;
// the _BZ_ variant passes COMMENT_BEGIN/COMMENT_END there (defined outside
// this file; presumably assembler comment delimiters, so those instructions
// are not assembled -- confirm in bli_avx512_macros.h).
// Clobbers mask registers K2 and K3 and, when beta is used, ZMM31.
#define UPDATE_C_ROW_SCATTERED_(NUM,BNZ1,BNZ2) \
\
    BNZ1 KXNORW(K(2), K(0), K(0)) BNZ2 \
    KXNORW(K(3), K(0), K(0)) \
    BNZ1 VGATHERDPD(ZMM(31) MASK_K(2), MEM(R(9),YMM(0),8)) BNZ2 \
    VMULPD(ZMM(NUM), ZMM(NUM), MEM_1TO8(R(12))) /*scale by alpha*/ \
    BNZ1 VFMADD231PD(ZMM(NUM), ZMM(31), MEM_1TO8(R(13))) BNZ2 /*scale by beta, add in result*/ \
    VSCATTERDPD(MEM(R(9),YMM(0),8) MASK_K(3), ZMM(NUM)) \
    ADD(R(9), R(11))

// beta != 0 and beta == 0 flavours of the scattered row update.
#define UPDATE_C_ROW_SCATTERED(NUM) UPDATE_C_ROW_SCATTERED_(NUM,,)
#define UPDATE_C_BZ_ROW_SCATTERED(NUM) UPDATE_C_ROW_SCATTERED_(NUM,COMMENT_BEGIN,COMMENT_END)

// r12 = &alpha
// zmm31 = beta
// r9 = c
// r11 = rs_c
// r10 = 3*rs_c
// rdi = 4*rs_c
//
// Contiguous-row update of four rows of C at R9, R9+R11, R9+2*R11 and R9+R10:
// each accumulator is scaled by alpha, beta (ZMM31) times the existing row is
// added unless the BNZ1/BNZ2 pair disables it, the row is stored with an
// unaligned store, and R9 is advanced by RDI.
#define UPDATE_C_4_ROWS_(R1,R2,R3,R4,BNZ1,BNZ2) \
\
    VMULPD(ZMM(R1), ZMM(R1), MEM_1TO8(R(12))) \
    VMULPD(ZMM(R2), ZMM(R2), MEM_1TO8(R(12))) \
    VMULPD(ZMM(R3), ZMM(R3), MEM_1TO8(R(12))) \
    VMULPD(ZMM(R4), ZMM(R4), MEM_1TO8(R(12))) \
    BNZ1 VFMADD231PD(ZMM(R1), ZMM(31), MEM(R(9) )) BNZ2 \
    BNZ1 VFMADD231PD(ZMM(R2), ZMM(31), MEM(R(9),R(11),1)) BNZ2 \
    BNZ1 VFMADD231PD(ZMM(R3), ZMM(31), MEM(R(9),R(11),2)) BNZ2 \
    BNZ1 VFMADD231PD(ZMM(R4), ZMM(31), MEM(R(9),R(10),1)) BNZ2 \
    VMOVUPD(MEM(R(9) ), ZMM(R1)) \
    VMOVUPD(MEM(R(9),R(11),1), ZMM(R2)) \
    VMOVUPD(MEM(R(9),R(11),2), ZMM(R3)) \
    VMOVUPD(MEM(R(9),R(10),1), ZMM(R4)) \
    ADD(R(9), RDI)

// r12 = &alpha
// zmm31 = beta
// r9 = c
// r11 = rs_c
//
// Two-row version of the update above. It does not advance R9.
#define UPDATE_C_2_ROWS_(R1,R2,BNZ1,BNZ2) \
\
    VMULPD(ZMM(R1), ZMM(R1), MEM_1TO8(R(12))) \
    VMULPD(ZMM(R2), ZMM(R2), MEM_1TO8(R(12))) \
    BNZ1 VFMADD231PD(ZMM(R1), ZMM(31), MEM(R(9) )) BNZ2 \
    BNZ1 VFMADD231PD(ZMM(R2), ZMM(31), MEM(R(9),R(11),1)) BNZ2 \
    VMOVUPD(MEM(R(9) ), ZMM(R1)) \
    VMOVUPD(MEM(R(9),R(11),1), ZMM(R2)) \

// The empty line above ends UPDATE_C_2_ROWS_, whose last line carries a
// trailing backslash; it must stay in place.
#define UPDATE_C_4_ROWS(R1,R2,R3,R4) UPDATE_C_4_ROWS_(R1,R2,R3,R4,,)
#define UPDATE_C_2_ROWS(R1,R2) UPDATE_C_2_ROWS_(R1,R2,,)
#define UPDATE_C_BZ_4_ROWS(R1,R2,R3,R4) UPDATE_C_4_ROWS_(R1,R2,R3,R4,COMMENT_BEGIN,COMMENT_END)
#define UPDATE_C_BZ_2_ROWS(R1,R2) UPDATE_C_2_ROWS_(R1,R2,COMMENT_BEGIN,COMMENT_END)

// Accumulator ZMM(n) += ZMM31 * (the double at R15 + (n-1)*8, broadcast to
// all eight lanes). The _PREV form addresses the same element after R15 has
// already been advanced to the next sliver of A: MAIN_LOOP_ advances R15 by
// R12 after row 22, and the -32 element offset undoes that advance.
#define A_TIMES_B_ROW(n) VFMADD231PD(ZMM(n), ZMM(31), MEM_1TO8(R(15),(n-1)*8))
#define A_TIMES_B_ROW_PREV(n) VFMADD231PD(ZMM(n), ZMM(31), MEM_1TO8(R(15),((n-1)-32)*8))

// Prefetch helpers. The L1 forms use fixed byte offsets derived from the
// *_L1_PREFETCH_DIST constants; the L2 forms add a register offset (R14 for
// A, R13 for B) that the kernel sets up.
#define PREFETCH_A_L1(n) PREFETCH(0, MEM(R(15),A_L1_PREFETCH_DIST*8*32+n*64))
#define PREFETCH_A_L2(n) PREFETCH(1, MEM(R(15),R(14),1,n*64))
#define PREFETCH_B_L1 PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*8*8))
#define PREFETCH_B_L2 PREFETCH(1, MEM(RBX,R(13),1))

//One iteration of the k_r loop.
//Each iteration, we prefetch A into L1 and into L2
// r15 = a
// rbx = b
// rcx = c
// r11 = rs_c
// r13 = L2_PREFETCH_DIST*8*8
// r14 = L2_PREFETCH_DIST*8*32
// r12 = 32*8 = dist. to next sliver of a
// r9 = 8*8 = dist. to next sliver of b
//
// COUNTER is the register holding the remaining iteration count. It is
// decremented here and then compared against zero again near the end,
// because the ADD(RBX, R(9)) in between overwrites the flags left by DEC;
// the invoking code branches on the result with JNZ.
// PC_L1_1/PC_L1_2 bracket the instructions that prefetch C into L1 (three
// rows per iteration, stepping RCX by R11); PC_L2_1/PC_L2_2 bracket the ones
// that prefetch C into L2 (one row per iteration).
#define MAIN_LOOP_(COUNTER, PC_L1_1, PC_L1_2, PC_L2_1, PC_L2_2) \
\
    /* Can this be pre-loaded for next it. in zmm0? */ \
    VMOVAPD(ZMM(31), MEM(RBX)) \
\
    A_TIMES_B_ROW ( 1) \
    A_TIMES_B_ROW ( 2) PREFETCH_A_L1(0) \
    A_TIMES_B_ROW ( 3) PREFETCH_A_L1(1) \
    A_TIMES_B_ROW ( 4) PREFETCH_A_L1(2) \
    A_TIMES_B_ROW ( 5) PREFETCH_A_L1(3) \
    A_TIMES_B_ROW ( 6) PREFETCH_A_L2(0) \
    A_TIMES_B_ROW ( 7) PC_L1_1 PREFETCH(0, MEM(RCX)) PC_L1_2 \
    A_TIMES_B_ROW ( 8) PC_L1_1 ADD(RCX, R(11)) PC_L1_2 \
    A_TIMES_B_ROW ( 9) \
    A_TIMES_B_ROW (10) PC_L2_1 PREFETCH(1, MEM(RCX)) PC_L2_2 \
    A_TIMES_B_ROW (11) PREFETCH_A_L2(1) \
    A_TIMES_B_ROW (12) PC_L1_1 PREFETCH(0, MEM(RCX)) PC_L1_2 \
    A_TIMES_B_ROW (13) PC_L1_1 ADD(RCX, R(11)) PC_L1_2 \
    A_TIMES_B_ROW (14) \
    A_TIMES_B_ROW (15) \
    A_TIMES_B_ROW (16) PREFETCH_A_L2(2) \
    A_TIMES_B_ROW (17) PC_L1_1 PREFETCH(0, MEM(RCX)) PC_L1_2 \
    A_TIMES_B_ROW (18) PC_L1_1 ADD(RCX, R(11)) PC_L1_2 \
    A_TIMES_B_ROW (19) \
    A_TIMES_B_ROW (20) \
    A_TIMES_B_ROW (21) PREFETCH_A_L2(3) \
    A_TIMES_B_ROW (22) ADD(R(15), R(12)) \
    A_TIMES_B_ROW_PREV(23) \
    A_TIMES_B_ROW_PREV(24) PC_L2_1 ADD(RCX, R(11)) PC_L2_2 \
    A_TIMES_B_ROW_PREV(25) DEC(COUNTER) \
    A_TIMES_B_ROW_PREV(26) PREFETCH_B_L2 \
    A_TIMES_B_ROW_PREV(27) PREFETCH_B_L1 \
    A_TIMES_B_ROW_PREV(28) ADD(RBX, R(9)) \
    A_TIMES_B_ROW_PREV(29) CMP(COUNTER, IMM(0)) \
    A_TIMES_B_ROW_PREV(30)

// Loop-body flavours: no prefetching of C, C prefetched into L1 only, and
// C prefetched into L2 only.
#define MAIN_LOOP(COUNTER) MAIN_LOOP_(COUNTER,COMMENT_BEGIN,COMMENT_END,COMMENT_BEGIN,COMMENT_END)
#define MAIN_LOOP_PC_L1(COUNTER) MAIN_LOOP_(COUNTER,,,COMMENT_BEGIN,COMMENT_END)
#define MAIN_LOOP_PC_L2(COUNTER) MAIN_LOOP_(COUNTER,COMMENT_BEGIN,COMMENT_END,,)

//This is an array used for the scatter/gather instructions.
extern int32_t offsets[16]; //#define MONITORS //#define LOOPMON void bli_dgemm_knl_asm_30x8_knc ( dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { const double * a_next = bli_auxinfo_next_a( data ); const double * b_next = bli_auxinfo_next_b( data ); const int32_t * offsetPtr = &offsets[0]; #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; #endif #ifdef LOOPMON int tlooph, tloopl, blooph, bloopl; #endif __asm__ volatile ( #ifdef MONITORS RDTSC MOV(VAR(topl), EAX) MOV(VAR(toph), EDX) #endif VPXORD(ZMM(1), ZMM(1), ZMM(1)) //clear out registers VMOVAPS(ZMM( 2), ZMM(1)) VMOVAPS(ZMM( 3), ZMM(1)) MOV(RSI, VAR(k)) //loop index VMOVAPS(ZMM( 4), ZMM(1)) MOV(R(11), VAR(rs_c)) //load row stride VMOVAPS(ZMM( 5), ZMM(1)) SAL(R(11), IMM(3)) //scale row stride VMOVAPS(ZMM( 6), ZMM(1)) MOV(R(15), VAR(a)) //load address of a VMOVAPS(ZMM( 7), ZMM(1)) MOV(RBX, VAR(b)) //load address of b VMOVAPS(ZMM( 8), ZMM(1)) VMOVAPS(ZMM( 9), ZMM(1)) LEA(R(10), MEM(R(11),R(11),2)) //r10 has 3 * r11 VMOVAPS(ZMM(10), ZMM(1)) VMOVAPS(ZMM(11), ZMM(1)) MOV(RDI, R(11)) VMOVAPS(ZMM(12), ZMM(1)) SAL(RDI, IMM(2)) //rdi has 4*r11 VMOVAPS(ZMM(13), ZMM(1)) MOV(RCX, VAR(c)) //load address of c for prefetching VMOVAPS(ZMM(14), ZMM(1)) VMOVAPS(ZMM(15), ZMM(1)) MOV(R(8), VAR(k)) VMOVAPS(ZMM(16), ZMM(1)) VMOVAPS(ZMM(17), ZMM(1)) VMOVAPS(ZMM(18), ZMM(1)) MOV(R(13), IMM(8*8*L2_PREFETCH_DIST)) VMOVAPS(ZMM(19), ZMM(1)) MOV(R(14), IMM(8*32*L2_PREFETCH_DIST)) VMOVAPS(ZMM(20), ZMM(1)) VMOVAPS(ZMM(21), ZMM(1)) VMOVAPS(ZMM(22), ZMM(1)) VMOVAPS(ZMM(23), ZMM(1)) VMOVAPS(ZMM(24), ZMM(1)) SUB(R(8), IMM(30+L2_PREFETCH_DIST)) //Check if we have over 40 operations to do. 
VMOVAPS(ZMM(25), ZMM(1)) MOV(R(8), IMM(30)) VMOVAPS(ZMM(26), ZMM(1)) MOV(R(9), IMM(8*8)) //amount to increment b* by each iteration VMOVAPS(ZMM(27), ZMM(1)) MOV(R(12), IMM(8*32)) //amount to increment a* by each iteration VMOVAPS(ZMM(28), ZMM(1)) VMOVAPS(ZMM(29), ZMM(1)) VMOVAPS(ZMM(30), ZMM(1)) #ifdef MONITORS RDTSC MOV(VAR(midl), EAX) MOV(VAR(midh), EDX) #endif JLE(CONSIDER_UNDER_40) SUB(RSI, IMM(30+L2_PREFETCH_DIST)) //First 30 iterations LABEL(LOOPREFECHCL2) MAIN_LOOP_PC_L2(R(8)) JNZ(LOOPREFECHCL2) MOV(RCX, VAR(c)) //Main Loop. LABEL(LOOPMAIN) MAIN_LOOP(RSI) JNZ(LOOPMAIN) //Penultimate 22 iterations. //Break these off from the main loop to avoid prefetching extra shit. MOV(R(14), VAR(a_next)) MOV(R(13), VAR(b_next)) SUB(R(14), R(15)) SUB(R(13), RBX) //Yes, I know 10-20 = -10 MOV(RSI, IMM(10+L2_PREFETCH_DIST-20)) LABEL(LOOPMAIN2) MAIN_LOOP(RSI) JNZ(LOOPMAIN2) //Last 10 iterations MOV(R(8), IMM(10)) LABEL(LOOPREFETCHCL1) MAIN_LOOP_PC_L1(R(8)) JNZ(LOOPREFETCHCL1) JMP(POSTACCUM) //Alternate main loop, with no prefetching of C //Used when <= 40 iterations LABEL(CONSIDER_UNDER_40) MOV(RSI, VAR(k)) TEST(RSI, RSI) JZ(POSTACCUM) LABEL(LOOP_UNDER_40) MAIN_LOOP(RSI) JNZ(LOOP_UNDER_40) LABEL(POSTACCUM) #ifdef MONITORS RDTSC MOV(VAR(mid2l), EAX) MOV(VAR(mid2h), EDX) #endif MOV(R(9), VAR(c)) //load address of c for update MOV(R(12), VAR(alpha)) //load address of alpha // Check if C is row stride. 
If not, jump to the slow scattered update MOV(R(14), VAR(cs_c)) DEC(R(14)) JNZ(SCATTEREDUPDATE) MOV(R(14), VAR(beta)) VBROADCASTSD(ZMM(31), MEM(R(14))) MOV(RBX, MEM(R(14))) TEST(RBX, RBX) JZ(COLSTORBZ) UPDATE_C_4_ROWS( 1, 2, 3, 4) UPDATE_C_4_ROWS( 5, 6, 7, 8) UPDATE_C_4_ROWS( 9,10,11,12) UPDATE_C_4_ROWS(13,14,15,16) UPDATE_C_4_ROWS(17,18,19,20) UPDATE_C_4_ROWS(21,22,23,24) UPDATE_C_4_ROWS(25,26,27,28) UPDATE_C_2_ROWS(29,30) JMP(END) LABEL(COLSTORBZ) UPDATE_C_BZ_4_ROWS( 1, 2, 3, 4) UPDATE_C_BZ_4_ROWS( 5, 6, 7, 8) UPDATE_C_BZ_4_ROWS( 9,10,11,12) UPDATE_C_BZ_4_ROWS(13,14,15,16) UPDATE_C_BZ_4_ROWS(17,18,19,20) UPDATE_C_BZ_4_ROWS(21,22,23,24) UPDATE_C_BZ_4_ROWS(25,26,27,28) UPDATE_C_BZ_2_ROWS(29,30) JMP(END) LABEL(SCATTEREDUPDATE) MOV(R(13), VAR(beta)) MOV(R(10), VAR(offsetPtr)) VMOVAPS(ZMM(0), MEM(R(10))) MOV(RBX, MEM(R(13))) /* Note that this ignores the upper 32 bits in cs_c */ VPBROADCASTD(ZMM(31), VAR(cs_c)) VPMULLD(ZMM(0), ZMM(31), ZMM(0)) TEST(RBX, RBX) JZ(SCATTERBZ) UPDATE_C_ROW_SCATTERED( 1) UPDATE_C_ROW_SCATTERED( 2) UPDATE_C_ROW_SCATTERED( 3) UPDATE_C_ROW_SCATTERED( 4) UPDATE_C_ROW_SCATTERED( 5) UPDATE_C_ROW_SCATTERED( 6) UPDATE_C_ROW_SCATTERED( 7) UPDATE_C_ROW_SCATTERED( 8) UPDATE_C_ROW_SCATTERED( 9) UPDATE_C_ROW_SCATTERED(10) UPDATE_C_ROW_SCATTERED(11) UPDATE_C_ROW_SCATTERED(12) UPDATE_C_ROW_SCATTERED(13) UPDATE_C_ROW_SCATTERED(14) UPDATE_C_ROW_SCATTERED(15) UPDATE_C_ROW_SCATTERED(16) UPDATE_C_ROW_SCATTERED(17) UPDATE_C_ROW_SCATTERED(18) UPDATE_C_ROW_SCATTERED(19) UPDATE_C_ROW_SCATTERED(20) UPDATE_C_ROW_SCATTERED(21) UPDATE_C_ROW_SCATTERED(22) UPDATE_C_ROW_SCATTERED(23) UPDATE_C_ROW_SCATTERED(24) UPDATE_C_ROW_SCATTERED(25) UPDATE_C_ROW_SCATTERED(26) UPDATE_C_ROW_SCATTERED(27) UPDATE_C_ROW_SCATTERED(28) UPDATE_C_ROW_SCATTERED(29) UPDATE_C_ROW_SCATTERED(30) JMP(END) LABEL(SCATTERBZ) UPDATE_C_BZ_ROW_SCATTERED( 1) UPDATE_C_BZ_ROW_SCATTERED( 2) UPDATE_C_BZ_ROW_SCATTERED( 3) UPDATE_C_BZ_ROW_SCATTERED( 4) UPDATE_C_BZ_ROW_SCATTERED( 5) UPDATE_C_BZ_ROW_SCATTERED( 6) 
UPDATE_C_BZ_ROW_SCATTERED( 7) UPDATE_C_BZ_ROW_SCATTERED( 8) UPDATE_C_BZ_ROW_SCATTERED( 9) UPDATE_C_BZ_ROW_SCATTERED(10) UPDATE_C_BZ_ROW_SCATTERED(11) UPDATE_C_BZ_ROW_SCATTERED(12) UPDATE_C_BZ_ROW_SCATTERED(13) UPDATE_C_BZ_ROW_SCATTERED(14) UPDATE_C_BZ_ROW_SCATTERED(15) UPDATE_C_BZ_ROW_SCATTERED(16) UPDATE_C_BZ_ROW_SCATTERED(17) UPDATE_C_BZ_ROW_SCATTERED(18) UPDATE_C_BZ_ROW_SCATTERED(19) UPDATE_C_BZ_ROW_SCATTERED(20) UPDATE_C_BZ_ROW_SCATTERED(21) UPDATE_C_BZ_ROW_SCATTERED(22) UPDATE_C_BZ_ROW_SCATTERED(23) UPDATE_C_BZ_ROW_SCATTERED(24) UPDATE_C_BZ_ROW_SCATTERED(25) UPDATE_C_BZ_ROW_SCATTERED(26) UPDATE_C_BZ_ROW_SCATTERED(27) UPDATE_C_BZ_ROW_SCATTERED(28) UPDATE_C_BZ_ROW_SCATTERED(29) UPDATE_C_BZ_ROW_SCATTERED(30) LABEL(END) #ifdef MONITORS RDTSC MOV(VAR(botl), EAX) MOV(VAR(both), EDX) #endif : // output operands #ifdef MONITORS [topl] "=m" (topl), [toph] "=m" (toph), [midl] "=m" (midl), [midh] "=m" (midh), [mid2l] "=m" (mid2l), [mid2h] "=m" (mid2h), [botl] "=m" (botl), [both] "=m" (both) #endif : // input operands [k] "m" (k), [a] "m" (a), [b] "m" (b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), [a_next] "m" (a_next), [b_next] "m" (b_next), [offsetPtr] "m" (offsetPtr) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" ); #ifdef LOOPMON printf("looptime = \t%d\n", bloopl - tloopl); #endif #ifdef MONITORS dim_t top = ((dim_t)toph << 32) | topl; dim_t mid = ((dim_t)midh << 32) | midl; dim_t mid2 = ((dim_t)mid2h << 32) | mid2l; dim_t bot = ((dim_t)both << 32) | botl; printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot - 
top); #endif } cython-blis-0.9.1/blis/_src/kernels/knl/3/other/bli_dgemm_knl_asm_8x24.c000066400000000000000000000530411427272030600257270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"
// NOTE(review): the next directive carries no header name in this copy of the
// file (an angle-bracket name appears to have been stripped) -- restore it
// from the upstream source before compiling.
#include

#include "bli_avx512_macros.h"

// Tuning switches. UNROLL_K and C_L1_ITERS each select one of the
// MAIN_LOOP_<n> macros further down (through LOOP_K). The four 0/1 switches
// SCATTER_PREFETCH_AB, SCATTER_PREFETCH_C, PREFETCH_A_L2 and PREFETCH_B_L2
// are tested with #if below; all of them except SCATTER_PREFETCH_C are then
// redefined there as function-like macros.
#define UNROLL_K 8

#define SCATTER_PREFETCH_AB 0
#define SCATTER_PREFETCH_C 1

#define PREFETCH_A_L2 0
#define PREFETCH_B_L2 0
#define L2_PREFETCH_DIST 64

#define A_L1_PREFETCH_DIST 32
#define B_L1_PREFETCH_DIST 12

#define C_MIN_L2_ITERS 64 //C is not prefetched into L2 for k <= this
#define C_L1_ITERS 8 //number of iterations before the end to prefetch C into L1
//make sure there is an unrolled MAIN_LOOP_X for this number

#define LOOP_ALIGN ALIGN16

// Contiguous update of four vectors of C at RCX, RCX+RAX, RCX+2*RAX and
// RCX+RDI: each accumulator is scaled by ZMM0, ZMM1 times the existing
// contents of C is added, the result is stored with an unaligned store, and
// RCX is advanced by 4*RAX. The kernel that uses this macro loads alpha into
// ZMM0 and beta into ZMM1 and sets RAX = 8*cs_c and RDI = 3*RAX beforehand.
#define UPDATE_C_FOUR_ROWS(R1,R2,R3,R4) \
\
    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
    VFMADD231PD(ZMM(R1), ZMM(1), MEM(RCX )) \
    VFMADD231PD(ZMM(R2), ZMM(1), MEM(RCX,RAX,1)) \
    VFMADD231PD(ZMM(R3), ZMM(1), MEM(RCX,RAX,2)) \
    VFMADD231PD(ZMM(R4), ZMM(1), MEM(RCX,RDI,1)) \
    VMOVUPD(MEM(RCX ), ZMM(R1)) \
    VMOVUPD(MEM(RCX,RAX,1), ZMM(R2)) \
    VMOVUPD(MEM(RCX,RAX,2), ZMM(R3)) \
    VMOVUPD(MEM(RCX,RDI,1), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,4))

// Same as UPDATE_C_FOUR_ROWS for beta == 0: C is written without being read.
#define UPDATE_C_BZ_FOUR_ROWS(R1,R2,R3,R4) \
\
    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
    VMOVUPD(MEM(RCX ), ZMM(R1)) \
    VMOVUPD(MEM(RCX,RAX,1), ZMM(R2)) \
    VMOVUPD(MEM(RCX,RAX,2), ZMM(R3)) \
    VMOVUPD(MEM(RCX,RDI,1), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,4))

// Gather/scatter update of one accumulator: the eight elements of C at
// RCX + YMM2*8 are gathered into ZMM3, the accumulator is scaled by ZMM0,
// ZMM1 times the gathered values is added, the result is scattered back and
// RCX is advanced by RAX. Clobbers ZMM3 and mask registers K1 and K2.
#define UPDATE_C_ROW_SCATTERED(NUM) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPD(ZMM(NUM), ZMM(NUM), ZMM(0)) \
    VGATHERDPD(ZMM(3) MASK_K(1), MEM(RCX,YMM(2),8)) \
    VFMADD231PD(ZMM(NUM), ZMM(3), ZMM(1)) \
    VSCATTERDPD(MEM(RCX,YMM(2),8) MASK_K(2), ZMM(NUM)) \
    ADD(RCX, RAX)

// beta == 0 version of the scattered update: scale and scatter only.
#define UPDATE_C_BZ_ROW_SCATTERED(NUM) \
\
    KXNORW(K(1), K(0), K(0)) \
    VMULPD(ZMM(NUM), ZMM(NUM), ZMM(0)) \
    VSCATTERDPD(MEM(RCX,YMM(2),8) MASK_K(1), ZMM(NUM)) \
    ADD(RCX, RAX)

// L1 prefetches of B for sub-iteration n: three consecutive 64-byte lines,
// i.e. the 24*8 bytes of B consumed by one iteration, B_L1_PREFETCH_DIST
// iterations ahead of RBX.
#define PREFETCH_B_L1_1(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*24*8))
#define PREFETCH_B_L1_2(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*24*8+64))
#define PREFETCH_B_L1_3(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*24*8+128))

// PREFETCH_B_L2 starts out as a 0/1 switch and is turned here into either the
// real L2 prefetch sequence or an empty macro.
#if PREFETCH_B_L2
#undef PREFETCH_B_L2

#define PREFETCH_B_L2(n) \
\
    PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*24*8)) \
    PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*24*8+64)) \
    PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*24*8+128))

#else
#undef PREFETCH_B_L2
#define PREFETCH_B_L2(...)
#endif

// L1 prefetch of A for sub-iteration n: one 64-byte line, A_L1_PREFETCH_DIST
// iterations ahead of RAX.
#define PREFETCH_A_L1(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*8*8))

// Same switch-to-macro conversion as for PREFETCH_B_L2.
#if PREFETCH_A_L2
#undef PREFETCH_A_L2

#define PREFETCH_A_L2(n) PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*8*8))

#else
#undef PREFETCH_A_L2
#define PREFETCH_A_L2(...)
#endif

// When SCATTER_PREFETCH_AB is enabled, the per-sub-iteration L1 prefetches of
// A and B are replaced by empty macros and gather-prefetch instructions
// indexed by ZMM4 are issued once per 16 sub-iterations instead. Clobbers
// mask registers K1..K4.
#if SCATTER_PREFETCH_AB
#undef SCATTER_PREFETCH_AB
#undef PREFETCH_B_L1_1
#undef PREFETCH_B_L1_2
#undef PREFETCH_B_L1_3
#undef PREFETCH_A_L1

#define SCATTER_PREFETCH_AB(n) \
\
    KXNORW(K(1), K(0), K(0)) \
    VGATHERPFDPS(0, MEM(RBX,ZMM(4),8,((3*n )*16+3*B_L1_PREFETCH_DIST)*64) MASK_K(1)) \
    KXNORW(K(2), K(0), K(0)) \
    VGATHERPFDPS(0, MEM(RBX,ZMM(4),8,((3*n+1)*16+3*B_L1_PREFETCH_DIST)*64) MASK_K(2)) \
    KXNORW(K(3), K(0), K(0)) \
    VGATHERPFDPS(0, MEM(RBX,ZMM(4),8,((3*n+2)*16+3*B_L1_PREFETCH_DIST)*64) MASK_K(3)) \
    KXNORW(K(4), K(0), K(0)) \
    VGATHERPFDPS(0, MEM(RAX,ZMM(4),8,( n *16+ A_L1_PREFETCH_DIST)*64) MASK_K(4))

#define PREFETCH_B_L1_1(...)
#define PREFETCH_B_L1_2(...)
#define PREFETCH_B_L1_3(...)
#define PREFETCH_A_L1(...)

#else
#undef SCATTER_PREFETCH_AB

#define SCATTER_PREFETCH_AB(...)

#endif

//
// n: index in unrolled loop (for prefetching offsets)
//
// a: ZMM register to load into
// b: ZMM register to read from
//
// ...: addressing for B, except for offset
//
// One step of the k loop: the vector of A for the following step is loaded
// into ZMM(a) from RAX + (n+1)*64 while ZMM(b), loaded one step earlier, is
// multiplied by each of 24 broadcast elements of B and accumulated into
// ZMM8..ZMM31. The B element offsets only use n modulo 4; the variadic
// arguments supply the base (and optional index register) that covers the
// rest of the distance. The modulo operator is spelled %% (presumably
// because a literal percent sign has to be doubled inside an extended-asm
// template -- confirm against the macro header).
#define SUBITER(n,a,b,...) \
\
    PREFETCH_B_L2(n) \
\
    VMOVAPD(ZMM(a), MEM(RAX,(n+1)*64)) \
    VFMADD231PD(ZMM( 8), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 0)*8)) \
    VFMADD231PD(ZMM( 9), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 1)*8)) \
    VFMADD231PD(ZMM(10), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 2)*8)) \
    VFMADD231PD(ZMM(11), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 3)*8)) \
    PREFETCH_B_L1_1(n) \
    VFMADD231PD(ZMM(12), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 4)*8)) \
    VFMADD231PD(ZMM(13), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 5)*8)) \
    VFMADD231PD(ZMM(14), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 6)*8)) \
    VFMADD231PD(ZMM(15), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 7)*8)) \
    PREFETCH_B_L1_2(n) \
    VFMADD231PD(ZMM(16), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 8)*8)) \
    VFMADD231PD(ZMM(17), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 9)*8)) \
    VFMADD231PD(ZMM(18), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+10)*8)) \
    VFMADD231PD(ZMM(19), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+11)*8)) \
    PREFETCH_B_L1_3(n) \
    VFMADD231PD(ZMM(20), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+12)*8)) \
    VFMADD231PD(ZMM(21), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+13)*8)) \
    VFMADD231PD(ZMM(22), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+14)*8)) \
    VFMADD231PD(ZMM(23), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+15)*8)) \
    PREFETCH_A_L1(n) \
    VFMADD231PD(ZMM(24), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+16)*8)) \
    VFMADD231PD(ZMM(25), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+17)*8)) \
    VFMADD231PD(ZMM(26), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+18)*8)) \
    VFMADD231PD(ZMM(27), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+19)*8)) \
    PREFETCH_A_L2(n) \
    VFMADD231PD(ZMM(28), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+20)*8)) \
    VFMADD231PD(ZMM(29), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+21)*8)) \
    VFMADD231PD(ZMM(30), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+22)*8)) \
    VFMADD231PD(ZMM(31), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+23)*8))

// Remainder loop: runs single steps until RDI reaches zero. SUBITER leaves
// the next vector of A in ZMM1, so it is copied back to ZMM0 after each step.
// The counter is decremented before it is tested, so RDI has to be non-zero
// on entry.
#define TAIL_LOOP(NAME) \
\
    LOOP_ALIGN \
    LABEL(NAME) \
\
    SUBITER(0,1,0,RBX) \
\
    VMOVAPD(ZMM(0), ZMM(1)) \
\
    LEA(RBX, MEM(RBX,24*8)) \
    LEA(RAX, MEM(RAX, 8*8)) \
\
    SUB(RDI, IMM(1)) \
\
    JNZ(NAME)

// MAIN_LOOP_<n>(NAME): runs RSI steps of the k loop, unrolled n times.
// For n > 1, RSI is split into RSI / n passes over the unrolled body and a
// remainder of RSI % n steps kept in RDI. Consecutive SUBITERs alternate
// ZMM0 and ZMM1 as the current/next vector of A; because n is even, ZMM0 is
// current again at the end of every unrolled pass. The labels NAME##_LOOP,
// NAME##_TAIL and NAME##_DONE are generated from NAME.
// NOTE(review): when RSI is 0 on entry, the unrolled variants jump to the
// tail with RDI == 0; MAIN_LOOP_2 then still executes one step and the
// others enter TAIL_LOOP with a zero counter. The callers presumably
// guarantee RSI > 0 -- confirm.
#define MAIN_LOOP_1(NAME) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
    SUBITER(0,1,0,RBX) \
\
    VMOVAPD(ZMM(0), ZMM(1)) \
\
    LEA(RBX, MEM(RBX,24*8)) \
    LEA(RAX, MEM(RAX, 8*8)) \
\
    SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP)

#define MAIN_LOOP_2(NAME) \
\
    MOV(RDI, RSI) \
    AND(RDI, IMM(1)) \
    SAR1(RSI) \
    JZ(NAME##_TAIL) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
    SUBITER(0,1,0,RBX) \
    SUBITER(1,0,1,RBX) \
\
    LEA(RBX, MEM(RBX,2*24*8)) \
    LEA(RAX, MEM(RAX,2* 8*8)) \
\
    SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP) \
\
    TEST(RDI, RDI) \
    JZ(NAME##_DONE) \
\
    LABEL(NAME##_TAIL) \
\
    SUBITER(0,1,0,RBX) \
\
    VMOVAPD(ZMM(0), ZMM(1)) \
\
    LEA(RBX, MEM(RBX,24*8)) \
    LEA(RAX, MEM(RAX, 8*8)) \
\
    LABEL(NAME##_DONE)

#define MAIN_LOOP_4(NAME) \
\
    MOV(RDI, RSI) \
    AND(RDI, IMM(3)) \
    SAR(RSI, IMM(2)) \
    JZ(NAME##_TAIL) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
    SUBITER(0,1,0,RBX) \
    SUBITER(1,0,1,RBX) \
    SUBITER(2,1,0,RBX) \
    SUBITER(3,0,1,RBX) \
\
    LEA(RBX, MEM(RBX,4*24*8)) \
    LEA(RAX, MEM(RAX,4* 8*8)) \
\
    SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP) \
\
    TEST(RDI, RDI) \
    JZ(NAME##_DONE) \
\
    TAIL_LOOP(NAME##_TAIL) \
\
    LABEL(NAME##_DONE)

// From here on the unrolled bodies span more than four steps; steps 4 and up
// add an index register (R8, R9, R10 or R11 with a scale) to the B address.
#define MAIN_LOOP_8(NAME) \
\
    MOV(RDI, RSI) \
    AND(RDI, IMM(7)) \
    SAR(RSI, IMM(3)) \
    JZ(NAME##_TAIL) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
    SUBITER(0,1,0,RBX) \
    SUBITER(1,0,1,RBX) \
    SUBITER(2,1,0,RBX) \
    SUBITER(3,0,1,RBX) \
    SUBITER(4,1,0,RBX,R8,1) \
    SUBITER(5,0,1,RBX,R8,1) \
    SUBITER(6,1,0,RBX,R8,1) \
    SUBITER(7,0,1,RBX,R8,1) \
\
    LEA(RBX, MEM(RBX,8*24*8)) \
    LEA(RAX, MEM(RAX,8* 8*8)) \
\
    SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP) \
\
    TEST(RDI, RDI) \
    JZ(NAME##_DONE) \
\
    TAIL_LOOP(NAME##_TAIL) \
\
    LABEL(NAME##_DONE)

#define MAIN_LOOP_16(NAME) \
\
    MOV(RDI, RSI) \
    AND(RDI, IMM(15)) \
    SAR(RSI, IMM(4)) \
    JZ(NAME##_TAIL) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
    SCATTER_PREFETCH_AB(0) \
\
    SUBITER( 0,1,0,RBX) \
    SUBITER( 1,0,1,RBX) \
    SUBITER( 2,1,0,RBX) \
    SUBITER( 3,0,1,RBX) \
    SUBITER( 4,1,0,RBX,R8,1) \
    SUBITER( 5,0,1,RBX,R8,1) \
    SUBITER( 6,1,0,RBX,R8,1) \
    SUBITER( 7,0,1,RBX,R8,1) \
    SUBITER( 8,1,0,RBX,R8,2) \
    SUBITER( 9,0,1,RBX,R8,2) \
    SUBITER(10,1,0,RBX,R8,2) \
    SUBITER(11,0,1,RBX,R8,2) \
    SUBITER(12,1,0,RBX,R9,1) \
    SUBITER(13,0,1,RBX,R9,1) \
    SUBITER(14,1,0,RBX,R9,1) \
    SUBITER(15,0,1,RBX,R9,1) \
\
    LEA(RBX, MEM(RBX,16*24*8)) \
    LEA(RAX, MEM(RAX,16* 8*8)) \
\
    SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP) \
\
    TEST(RDI, RDI) \
    JZ(NAME##_DONE) \
\
    SCATTER_PREFETCH_AB(0) \
\
    TAIL_LOOP(NAME##_TAIL) \
\
    LABEL(NAME##_DONE)

#define MAIN_LOOP_32(NAME) \
\
    MOV(RDI, RSI) \
    AND(RDI, IMM(31)) \
    SAR(RSI, IMM(5)) \
    JZ(NAME##_TAIL) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
    SCATTER_PREFETCH_AB(0) \
\
    SUBITER( 0,1,0,RBX) \
    SUBITER( 1,0,1,RBX) \
    SUBITER( 2,1,0,RBX) \
    SUBITER( 3,0,1,RBX) \
    SUBITER( 4,1,0,RBX,R8,1) \
    SUBITER( 5,0,1,RBX,R8,1) \
    SUBITER( 6,1,0,RBX,R8,1) \
    SUBITER( 7,0,1,RBX,R8,1) \
    SUBITER( 8,1,0,RBX,R8,2) \
    SUBITER( 9,0,1,RBX,R8,2) \
    SUBITER(10,1,0,RBX,R8,2) \
    SUBITER(11,0,1,RBX,R8,2) \
    SUBITER(12,1,0,RBX,R9,1) \
    SUBITER(13,0,1,RBX,R9,1) \
    SUBITER(14,1,0,RBX,R9,1) \
    SUBITER(15,0,1,RBX,R9,1) \
\
    SCATTER_PREFETCH_AB(1) \
\
    SUBITER(16,1,0,RBX,R8,4) \
    SUBITER(17,0,1,RBX,R8,4) \
    SUBITER(18,1,0,RBX,R8,4) \
    SUBITER(19,0,1,RBX,R8,4) \
    SUBITER(20,1,0,RBX,R10,1) \
    SUBITER(21,0,1,RBX,R10,1) \
    SUBITER(22,1,0,RBX,R10,1) \
    SUBITER(23,0,1,RBX,R10,1) \
    SUBITER(24,1,0,RBX,R9,2) \
    SUBITER(25,0,1,RBX,R9,2) \
    SUBITER(26,1,0,RBX,R9,2) \
    SUBITER(27,0,1,RBX,R9,2) \
    SUBITER(28,1,0,RBX,R11,1) \
    SUBITER(29,0,1,RBX,R11,1) \
    SUBITER(30,1,0,RBX,R11,1) \
    SUBITER(31,0,1,RBX,R11,1) \
\
    LEA(RBX, MEM(RBX,32*24*8)) \
    LEA(RAX, MEM(RAX,32* 8*8)) \
\
    SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP) \
\
    TEST(RDI, RDI) \
    JZ(NAME##_DONE) \
\
    SCATTER_PREFETCH_AB(0) \
    SCATTER_PREFETCH_AB(1) \
\
    TAIL_LOOP(NAME##_TAIL) \
\
    LABEL(NAME##_DONE)

// Two-level token pasting, so that K is macro-expanded (for example
// UNROLL_K -> 8) before it is glued onto M to form the MAIN_LOOP_<n> name.
#define LOOP_K_(M,K) M##K
#define LOOP_K(M,K,NAME) LOOP_K_(M,K)(NAME)

// The two loop instances used by the kernel: one unrolled UNROLL_K times and
// one unrolled C_L1_ITERS times, each with its own label prefix.
#define MAIN_LOOP_L2 LOOP_K(MAIN_LOOP_,UNROLL_K,MAIN_LOOP_L2)
#define MAIN_LOOP_L1 LOOP_K(MAIN_LOOP_,C_L1_ITERS,MAIN_LOOP_L1)

//This is an array used for the scatter/gather instructions.
extern int32_t offsets[24]; //#define MONITORS //#define LOOPMON void bli_dgemm_knl_asm_8x24 ( dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { const double * a_next = bli_auxinfo_next_a( data ); const double * b_next = bli_auxinfo_next_b( data ); const int32_t * offsetPtr = &offsets[0]; uint64_t k64 = k; #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; #endif #ifdef LOOPMON int tlooph, tloopl, blooph, bloopl; #endif __asm__ volatile ( #ifdef MONITORS RDTSC MOV(VAR(topl), EAX) MOV(VAR(toph), EDX) #endif VPXORD(ZMM(8), ZMM(8), ZMM(8)) //clear out registers VMOVAPS(ZMM( 9), ZMM(8)) VMOVAPS(ZMM(10), ZMM(8)) MOV(RSI, VAR(k)) //loop index VMOVAPS(ZMM(11), ZMM(8)) MOV(RAX, VAR(a)) //load address of a VMOVAPS(ZMM(12), ZMM(8)) MOV(RBX, VAR(b)) //load address of b VMOVAPS(ZMM(13), ZMM(8)) MOV(RCX, VAR(c)) //load address of c VMOVAPS(ZMM(14), ZMM(8)) VMOVAPD(ZMM(0), MEM(RAX)) //pre-load a VMOVAPS(ZMM(15), ZMM(8)) MOV(RDI, VAR(offsetPtr)) VMOVAPS(ZMM(16), ZMM(8)) VMOVAPS(ZMM(4), MEM(RDI)) #if SCATTER_PREFETCH_C VMOVAPS(ZMM(17), ZMM(8)) VMOVAPS(ZMM(18), ZMM(8)) VMOVAPS(ZMM(19), ZMM(8)) VBROADCASTSS(ZMM(5), VAR(cs_c)) VMOVAPS(ZMM(20), ZMM(8)) VMOVAPS(ZMM(21), ZMM(8)) VPMULLD(ZMM(2), ZMM(4), ZMM(5)) VMOVAPS(ZMM(22), ZMM(8)) VMOVAPS(YMM(3), MEM(RDI,64)) VMOVAPS(ZMM(23), ZMM(8)) VPMULLD(YMM(3), YMM(3), YMM(5)) #else VMOVAPS(ZMM(17), ZMM(8)) MOV(R12, VAR(cs_c)) VMOVAPS(ZMM(18), ZMM(8)) LEA(R13, MEM(R12,R12,2)) VMOVAPS(ZMM(19), ZMM(8)) LEA(R14, MEM(R12,R12,4)) VMOVAPS(ZMM(20), ZMM(8)) LEA(R15, MEM(R13,R12,4)) VMOVAPS(ZMM(21), ZMM(8)) LEA(RDX, MEM(RCX,R12,8)) VMOVAPS(ZMM(22), ZMM(8)) LEA(RDI, MEM(RDX,R12,8)) VMOVAPS(ZMM(23), ZMM(8)) #endif VMOVAPS(ZMM(24), ZMM(8)) VPSLLD(ZMM(4), ZMM(4), IMM(3)) VMOVAPS(ZMM(25), ZMM(8)) MOV(R8, IMM(4*24*8)) //offset for 4 iterations VMOVAPS(ZMM(26), ZMM(8)) LEA(R9, MEM(R8,R8,2)) //*3 
VMOVAPS(ZMM(27), ZMM(8)) LEA(R10, MEM(R8,R8,4)) //*5 VMOVAPS(ZMM(28), ZMM(8)) LEA(R11, MEM(R9,R8,4)) //*7 VMOVAPS(ZMM(29), ZMM(8)) VMOVAPS(ZMM(30), ZMM(8)) VMOVAPS(ZMM(31), ZMM(8)) #ifdef MONITORS RDTSC MOV(VAR(midl), EAX) MOV(VAR(midh), EDX) #endif //need 0+... to satisfy preprocessor CMP(RSI, IMM(0+C_MIN_L2_ITERS)) JLE(PREFETCH_C_L1) SUB(RSI, IMM(0+C_L1_ITERS)) //prefetch C into L2 #if SCATTER_PREFETCH_C KXNORW(K(1), K(0), K(0)) KXNORW(K(2), K(0), K(0)) VSCATTERPFDPS(1, MEM(RCX,ZMM(2),8) MASK_K(1)) VSCATTERPFDPD(1, MEM(RCX,YMM(3),8) MASK_K(2)) #else PREFETCH(1, MEM(RCX )) PREFETCH(1, MEM(RCX,R12,1)) PREFETCH(1, MEM(RCX,R12,2)) PREFETCH(1, MEM(RCX,R13,1)) PREFETCH(1, MEM(RCX,R12,4)) PREFETCH(1, MEM(RCX,R14,1)) PREFETCH(1, MEM(RCX,R13,2)) PREFETCH(1, MEM(RCX,R15,1)) PREFETCH(1, MEM(RDX )) PREFETCH(1, MEM(RDX,R12,1)) PREFETCH(1, MEM(RDX,R12,2)) PREFETCH(1, MEM(RDX,R13,1)) PREFETCH(1, MEM(RDX,R12,4)) PREFETCH(1, MEM(RDX,R14,1)) PREFETCH(1, MEM(RDX,R13,2)) PREFETCH(1, MEM(RDX,R15,1)) PREFETCH(1, MEM(RDI )) PREFETCH(1, MEM(RDI,R12,1)) PREFETCH(1, MEM(RDI,R12,2)) PREFETCH(1, MEM(RDI,R13,1)) PREFETCH(1, MEM(RDI,R12,4)) PREFETCH(1, MEM(RDI,R14,1)) PREFETCH(1, MEM(RDI,R13,2)) PREFETCH(1, MEM(RDI,R15,1)) #endif MAIN_LOOP_L2 MOV(RSI, IMM(0+C_L1_ITERS)) LABEL(PREFETCH_C_L1) //prefetch C into L1 #if SCATTER_PREFETCH_C KXNORW(K(1), K(0), K(0)) KXNORW(K(2), K(0), K(0)) VSCATTERPFDPS(0, MEM(RCX,ZMM(2),8) MASK_K(1)) VSCATTERPFDPD(0, MEM(RCX,YMM(3),8) MASK_K(2)) #else PREFETCH(0, MEM(RCX )) PREFETCH(0, MEM(RCX,R12,1)) PREFETCH(0, MEM(RCX,R12,2)) PREFETCH(0, MEM(RCX,R13,1)) PREFETCH(0, MEM(RCX,R12,4)) PREFETCH(0, MEM(RCX,R14,1)) PREFETCH(0, MEM(RCX,R13,2)) PREFETCH(0, MEM(RCX,R15,1)) PREFETCH(0, MEM(RDX )) PREFETCH(0, MEM(RDX,R12,1)) PREFETCH(0, MEM(RDX,R12,2)) PREFETCH(0, MEM(RDX,R13,1)) PREFETCH(0, MEM(RDX,R12,4)) PREFETCH(0, MEM(RDX,R14,1)) PREFETCH(0, MEM(RDX,R13,2)) PREFETCH(0, MEM(RDX,R15,1)) PREFETCH(0, MEM(RDI )) PREFETCH(0, MEM(RDI,R12,1)) PREFETCH(0, MEM(RDI,R12,2)) 
PREFETCH(0, MEM(RDI,R13,1)) PREFETCH(0, MEM(RDI,R12,4)) PREFETCH(0, MEM(RDI,R14,1)) PREFETCH(0, MEM(RDI,R13,2)) PREFETCH(0, MEM(RDI,R15,1)) #endif MAIN_LOOP_L1 LABEL(POSTACCUM) #ifdef MONITORS RDTSC MOV(VAR(mid2l), EAX) MOV(VAR(mid2h), EDX) #endif MOV(RAX, VAR(alpha)) MOV(RBX, VAR(beta)) VBROADCASTSD(ZMM(0), MEM(RAX)) VBROADCASTSD(ZMM(1), MEM(RBX)) // Check if C is column stride. If not, jump to the slow scattered update MOV(RAX, VAR(cs_c)) LEA(RAX, MEM(,RAX,8)) MOV(RBX, VAR(rs_c)) LEA(RDI, MEM(RAX,RAX,2)) CMP(RBX, IMM(1)) JNE(SCATTEREDUPDATE) VMOVQ(RDX, XMM(1)) SAL1(RDX) //shift out sign bit JZ(COLSTORBZ) UPDATE_C_FOUR_ROWS( 8, 9,10,11) UPDATE_C_FOUR_ROWS(12,13,14,15) UPDATE_C_FOUR_ROWS(16,17,18,19) UPDATE_C_FOUR_ROWS(20,21,22,23) UPDATE_C_FOUR_ROWS(24,25,26,27) UPDATE_C_FOUR_ROWS(28,29,30,31) JMP(END) LABEL(COLSTORBZ) UPDATE_C_BZ_FOUR_ROWS( 8, 9,10,11) UPDATE_C_BZ_FOUR_ROWS(12,13,14,15) UPDATE_C_BZ_FOUR_ROWS(16,17,18,19) UPDATE_C_BZ_FOUR_ROWS(20,21,22,23) UPDATE_C_BZ_FOUR_ROWS(24,25,26,27) UPDATE_C_BZ_FOUR_ROWS(28,29,30,31) JMP(END) LABEL(SCATTEREDUPDATE) MOV(RDI, VAR(offsetPtr)) VMOVAPS(ZMM(2), MEM(RDI)) /* Note that this ignores the upper 32 bits in rs_c */ VPBROADCASTD(ZMM(3), EBX) VPMULLD(ZMM(2), ZMM(3), ZMM(2)) VMOVQ(RDX, XMM(1)) SAL1(RDX) //shift out sign bit JZ(SCATTERBZ) UPDATE_C_ROW_SCATTERED( 8) UPDATE_C_ROW_SCATTERED( 9) UPDATE_C_ROW_SCATTERED(10) UPDATE_C_ROW_SCATTERED(11) UPDATE_C_ROW_SCATTERED(12) UPDATE_C_ROW_SCATTERED(13) UPDATE_C_ROW_SCATTERED(14) UPDATE_C_ROW_SCATTERED(15) UPDATE_C_ROW_SCATTERED(16) UPDATE_C_ROW_SCATTERED(17) UPDATE_C_ROW_SCATTERED(18) UPDATE_C_ROW_SCATTERED(19) UPDATE_C_ROW_SCATTERED(20) UPDATE_C_ROW_SCATTERED(21) UPDATE_C_ROW_SCATTERED(22) UPDATE_C_ROW_SCATTERED(23) UPDATE_C_ROW_SCATTERED(24) UPDATE_C_ROW_SCATTERED(25) UPDATE_C_ROW_SCATTERED(26) UPDATE_C_ROW_SCATTERED(27) UPDATE_C_ROW_SCATTERED(28) UPDATE_C_ROW_SCATTERED(29) UPDATE_C_ROW_SCATTERED(30) UPDATE_C_ROW_SCATTERED(31) JMP(END) LABEL(SCATTERBZ) 
UPDATE_C_BZ_ROW_SCATTERED( 8) UPDATE_C_BZ_ROW_SCATTERED( 9) UPDATE_C_BZ_ROW_SCATTERED(10) UPDATE_C_BZ_ROW_SCATTERED(11) UPDATE_C_BZ_ROW_SCATTERED(12) UPDATE_C_BZ_ROW_SCATTERED(13) UPDATE_C_BZ_ROW_SCATTERED(14) UPDATE_C_BZ_ROW_SCATTERED(15) UPDATE_C_BZ_ROW_SCATTERED(16) UPDATE_C_BZ_ROW_SCATTERED(17) UPDATE_C_BZ_ROW_SCATTERED(18) UPDATE_C_BZ_ROW_SCATTERED(19) UPDATE_C_BZ_ROW_SCATTERED(20) UPDATE_C_BZ_ROW_SCATTERED(21) UPDATE_C_BZ_ROW_SCATTERED(22) UPDATE_C_BZ_ROW_SCATTERED(23) UPDATE_C_BZ_ROW_SCATTERED(24) UPDATE_C_BZ_ROW_SCATTERED(25) UPDATE_C_BZ_ROW_SCATTERED(26) UPDATE_C_BZ_ROW_SCATTERED(27) UPDATE_C_BZ_ROW_SCATTERED(28) UPDATE_C_BZ_ROW_SCATTERED(29) UPDATE_C_BZ_ROW_SCATTERED(30) UPDATE_C_BZ_ROW_SCATTERED(31) LABEL(END) #ifdef MONITORS RDTSC MOV(VAR(botl), EAX) MOV(VAR(both), EDX) #endif : // output operands #ifdef MONITORS [topl] "=m" (topl), [toph] "=m" (toph), [midl] "=m" (midl), [midh] "=m" (midh), [mid2l] "=m" (mid2l), [mid2h] "=m" (mid2h), [botl] "=m" (botl), [both] "=m" (both) #endif : // input operands [k] "m" (k64), [a] "m" (a), [b] "m" (b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), [a_next] "m" (a_next), [b_next] "m" (b_next), [offsetPtr] "m" (offsetPtr) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" ); #ifdef LOOPMON printf("looptime = \t%d\n", bloopl - tloopl); #endif #ifdef MONITORS dim_t top = ((dim_t)toph << 32) | topl; dim_t mid = ((dim_t)midh << 32) | midl; dim_t mid2 = ((dim_t)mid2h << 32) | mid2l; dim_t bot = ((dim_t)both << 32) | botl; printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot 
- top); #endif } cython-blis-0.9.1/blis/_src/kernels/knl/3/other/bli_sgemm_knl_asm_30x16_knc.c000066400000000000000000000400051427272030600266510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include #include "bli_avx512_macros.h" #define A_L1_PREFETCH_DIST 4 #define B_L1_PREFETCH_DIST 2 #define L2_PREFETCH_DIST 16 // Must be greater than 10, because of the way the loop is constructed. 
//Alternate code path uused if C is not row-major // r9 = c // zmm30 = cs_c * 1...16 // r11 = rs_c // r12 = &alpha // r13 = &beta #define UPDATE_C_ROW_SCATTERED_(NUM,BNZ1,BNZ2) \ \ BNZ1 KXNORW(K(2), K(0), K(0)) BNZ2 \ KXNORW(K(3), K(0), K(0)) \ BNZ1 VGATHERDPS(ZMM(31) MASK_K(2), MEM(R(9),ZMM(30),4)) BNZ2 \ VMULPS(ZMM(NUM), ZMM(NUM), MEM_1TO16(R(12))) /*scale by alpha*/ \ BNZ1 VFMADD231PS(ZMM(NUM), ZMM(31), MEM_1TO16(R(13))) BNZ2 /*scale by beta, add in result*/ \ VSCATTERDPS(MEM(R(9),ZMM(30),4) MASK_K(3), ZMM(NUM)) \ ADD(R(9), R(11)) #define UPDATE_C_ROW_SCATTERED(NUM) UPDATE_C_ROW_SCATTERED_(NUM,,) #define UPDATE_C_BZ_ROW_SCATTERED(NUM) UPDATE_C_ROW_SCATTERED_(NUM,COMMENT_BEGIN,COMMENT_END) // r12 = &alpha // zmm31 = beta // r9 = c // r11 = rs_c // r10 = 3*rs_c // rdi = 4*rs_c #define UPDATE_C_4_ROWS_(R1,R2,R3,R4,BNZ1,BNZ2) \ \ VMULPS(ZMM(R1), ZMM(R1), MEM_1TO16(R(12))) \ VMULPS(ZMM(R2), ZMM(R2), MEM_1TO16(R(12))) \ VMULPS(ZMM(R3), ZMM(R3), MEM_1TO16(R(12))) \ VMULPS(ZMM(R4), ZMM(R4), MEM_1TO16(R(12))) \ BNZ1 VFMADD231PS(ZMM(R1), ZMM(31), MEM(R(9) )) BNZ2 \ BNZ1 VFMADD231PS(ZMM(R2), ZMM(31), MEM(R(9),R(11),1)) BNZ2 \ BNZ1 VFMADD231PS(ZMM(R3), ZMM(31), MEM(R(9),R(11),2)) BNZ2 \ BNZ1 VFMADD231PS(ZMM(R4), ZMM(31), MEM(R(9),R(10),1)) BNZ2 \ VMOVUPS(MEM(R(9) ), ZMM(R1)) \ VMOVUPS(MEM(R(9),R(11),1), ZMM(R2)) \ VMOVUPS(MEM(R(9),R(11),2), ZMM(R3)) \ VMOVUPS(MEM(R(9),R(10),1), ZMM(R4)) \ ADD(R(9), RDI) // r12 = &alpha // zmm31 = beta // r9 = c // r11 = rs_c #define UPDATE_C_2_ROWS_(R1,R2,BNZ1,BNZ2) \ \ VMULPS(ZMM(R1), ZMM(R1), MEM_1TO16(R(12))) \ VMULPS(ZMM(R2), ZMM(R2), MEM_1TO16(R(12))) \ BNZ1 VFMADD231PS(ZMM(R1), ZMM(31), MEM(R(9) )) BNZ2 \ BNZ1 VFMADD231PS(ZMM(R2), ZMM(31), MEM(R(9),R(11),1)) BNZ2 \ VMOVUPS(MEM(R(9) ), ZMM(R1)) \ VMOVUPS(MEM(R(9),R(11),1), ZMM(R2)) #define UPDATE_C_4_ROWS(R1,R2,R3,R4) UPDATE_C_4_ROWS_(R1,R2,R3,R4,,) #define UPDATE_C_2_ROWS(R1,R2) UPDATE_C_2_ROWS_(R1,R2,,) #define UPDATE_C_BZ_4_ROWS(R1,R2,R3,R4) 
UPDATE_C_4_ROWS_(R1,R2,R3,R4,COMMENT_BEGIN,COMMENT_END) #define UPDATE_C_BZ_2_ROWS(R1,R2) UPDATE_C_2_ROWS_(R1,R2,COMMENT_BEGIN,COMMENT_END) #define A_TIMES_B_ROW(n) VFMADD231PS(ZMM(n), ZMM(31), MEM_1TO16(R(15),n*4)) #define A_TIMES_B_ROW_PREV(n) VFMADD231PS(ZMM(n), ZMM(31), MEM_1TO16(R(15),(n-32)*4)) #define PREFETCH_A_L1(n) PREFETCH(0, MEM(R(15),A_L1_PREFETCH_DIST*4*32+n*64)) #define PREFETCH_A_L2(n) PREFETCH(1, MEM(R(15),R(14),1,n*64)) #define PREFETCH_B_L1 PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*4*16)) #define PREFETCH_B_L2 PREFETCH(1, MEM(RBX,R(13),1)) //One iteration of the k_r loop. //Each iteration, we prefetch A into L1 and into L2 // r15 = a // rbx = b // rcx = c // r11 = rs_c // r13 = L2_PREFETCH_DIST*4*16 // r14 = L2_PREFETCH_DIST*4*32 // r12 = 32*4 = dist. to next sliver of a // r9 = 16*4 = dist. to next sliver of b #define MAIN_LOOP_(COUNTER, PC_L1_1, PC_L1_2, PC_L2_1, PC_L2_2) \ \ /* Can this be pre-loaded for next it. in zmm30? */ \ VMOVAPS(ZMM(31), MEM(RBX)) \ \ A_TIMES_B_ROW ( 0) \ A_TIMES_B_ROW ( 1) PREFETCH_A_L1(0) \ A_TIMES_B_ROW ( 2) PREFETCH_A_L1(1) \ A_TIMES_B_ROW ( 3) PREFETCH_A_L1(2) \ A_TIMES_B_ROW ( 4) PREFETCH_A_L1(3) \ A_TIMES_B_ROW ( 5) PREFETCH_A_L2(0) \ A_TIMES_B_ROW ( 6) PC_L1_1 PREFETCH(0, MEM(RCX)) PC_L1_2 \ A_TIMES_B_ROW ( 7) PC_L1_1 ADD(RCX, R(11)) PC_L1_2 \ A_TIMES_B_ROW ( 8) \ A_TIMES_B_ROW ( 9) PC_L2_1 PREFETCH(1, MEM(RCX)) PC_L2_2 \ A_TIMES_B_ROW (10) PREFETCH_A_L2(1) \ A_TIMES_B_ROW (11) PC_L1_1 PREFETCH(0, MEM(RCX)) PC_L1_2 \ A_TIMES_B_ROW (12) PC_L1_1 ADD(RCX, R(11)) PC_L1_2 \ A_TIMES_B_ROW (13) \ A_TIMES_B_ROW (14) \ A_TIMES_B_ROW (15) PREFETCH_A_L2(2) \ A_TIMES_B_ROW (16) PC_L1_1 PREFETCH(0, MEM(RCX)) PC_L1_2 \ A_TIMES_B_ROW (17) PC_L1_1 ADD(RCX, R(11)) PC_L1_2 \ A_TIMES_B_ROW (18) \ A_TIMES_B_ROW (19) \ A_TIMES_B_ROW (20) PREFETCH_A_L2(3) \ A_TIMES_B_ROW (21) ADD(R(15), R(12)) \ A_TIMES_B_ROW_PREV(22) \ A_TIMES_B_ROW_PREV(23) PC_L2_1 ADD(RCX, R(11)) PC_L2_2 \ A_TIMES_B_ROW_PREV(24) DEC(COUNTER) \ A_TIMES_B_ROW_PREV(25) 
PREFETCH_B_L2 \ A_TIMES_B_ROW_PREV(26) PREFETCH_B_L1 \ A_TIMES_B_ROW_PREV(27) ADD(RBX, R(9)) \ A_TIMES_B_ROW_PREV(28) CMP(COUNTER, IMM(0)) \ A_TIMES_B_ROW_PREV(29) #define MAIN_LOOP(COUNTER) MAIN_LOOP_(COUNTER,COMMENT_BEGIN,COMMENT_END,COMMENT_BEGIN,COMMENT_END) #define MAIN_LOOP_PC_L1(COUNTER) MAIN_LOOP_(COUNTER,,,COMMENT_BEGIN,COMMENT_END) #define MAIN_LOOP_PC_L2(COUNTER) MAIN_LOOP_(COUNTER,COMMENT_BEGIN,COMMENT_END,,) //This is an array used for the scatter/gather instructions. int32_t offsets[32] __attribute__((aligned(0x1000))) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; //#define MONITORS //#define LOOPMON void bli_sgemm_knl_asm_30x16_knc ( dim_t k_, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c_, inc_t cs_c_, auxinfo_t* restrict data, cntx_t* restrict cntx ) { (void)data; (void)cntx; const float * a_next = bli_auxinfo_next_a( data ); const float * b_next = bli_auxinfo_next_b( data ); const int32_t * offsetPtr = &offsets[0]; const int64_t k = k_; const int64_t rs_c = rs_c_; const int64_t cs_c = cs_c_; #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; #endif #ifdef LOOPMON int tlooph, tloopl, blooph, bloopl; #endif __asm__ volatile ( #ifdef MONITORS RDTSC MOV(VAR(topl), EAX) MOV(VAR(toph), EDX) #endif VPXORD(ZMM(0), ZMM(0), ZMM(0)) //clear out registers VMOVAPS(ZMM( 1), ZMM(0)) VMOVAPS(ZMM( 2), ZMM(0)) MOV(RSI, VAR(k)) //loop index VMOVAPS(ZMM( 3), ZMM(0)) MOV(R(11), VAR(rs_c)) //load row stride VMOVAPS(ZMM( 4), ZMM(0)) SAL(R(11), IMM(2)) //scale row stride VMOVAPS(ZMM( 5), ZMM(0)) MOV(R(15), VAR(a)) //load address of a VMOVAPS(ZMM( 6), ZMM(0)) MOV(RBX, VAR(b)) //load address of b VMOVAPS(ZMM( 7), ZMM(0)) VMOVAPS(ZMM( 8), ZMM(0)) LEA(R(10), MEM(R(11),R(11),2)) //r10 has 3 * r11 VMOVAPS(ZMM( 9), ZMM(0)) VMOVAPS(ZMM(10), ZMM(0)) MOV(RDI, R(11)) VMOVAPS(ZMM(11), ZMM(0)) SAL(RDI, IMM(2)) //rdi has 
4*r11 VMOVAPS(ZMM(12), ZMM(0)) MOV(RCX, VAR(c)) //load address of c for prefetching VMOVAPS(ZMM(13), ZMM(0)) VMOVAPS(ZMM(14), ZMM(0)) MOV(R(8), VAR(k)) VMOVAPS(ZMM(15), ZMM(0)) VMOVAPS(ZMM(16), ZMM(0)) VMOVAPS(ZMM(17), ZMM(0)) MOV(R(13), IMM(4*16*L2_PREFETCH_DIST)) VMOVAPS(ZMM(18), ZMM(0)) MOV(R(14), IMM(4*32*L2_PREFETCH_DIST)) VMOVAPS(ZMM(19), ZMM(0)) VMOVAPS(ZMM(20), ZMM(0)) VMOVAPS(ZMM(21), ZMM(0)) VMOVAPS(ZMM(22), ZMM(0)) VMOVAPS(ZMM(23), ZMM(0)) SUB(R(8), IMM(30+L2_PREFETCH_DIST)) //Check if we have over 40 operations to do. VMOVAPS(ZMM(24), ZMM(0)) MOV(R(8), IMM(30)) VMOVAPS(ZMM(25), ZMM(0)) MOV(R(9), IMM(4*16)) //amount to increment b* by each iteration VMOVAPS(ZMM(26), ZMM(0)) MOV(R(12), IMM(4*32)) //amount to increment a* by each iteration VMOVAPS(ZMM(27), ZMM(0)) VMOVAPS(ZMM(28), ZMM(0)) VMOVAPS(ZMM(29), ZMM(0)) #ifdef MONITORS RDTSC MOV(VAR(midl), EAX) MOV(VAR(midh), EDX) #endif JLE(CONSIDER_UNDER_40) SUB(RSI, IMM(30+L2_PREFETCH_DIST)) //First 30 iterations LABEL(LOOPREFECHCL2) MAIN_LOOP_PC_L2(R(8)) JNZ(LOOPREFECHCL2) MOV(RCX, VAR(c)) //Main Loop. LABEL(LOOPMAIN) MAIN_LOOP(RSI) JNZ(LOOPMAIN) //Penultimate 22 iterations. //Break these off from the main loop to avoid prefetching extra shit. MOV(R(14), VAR(a_next)) MOV(R(13), VAR(b_next)) SUB(R(14), R(15)) SUB(R(13), RBX) //Yes, I know 10-20 = -10 MOV(RSI, IMM(10+L2_PREFETCH_DIST-20)) LABEL(LOOPMAIN2) MAIN_LOOP(RSI) JNZ(LOOPMAIN2) //Last 10 iterations MOV(R(8), IMM(10)) LABEL(LOOPREFETCHCL1) MAIN_LOOP_PC_L1(R(8)) JNZ(LOOPREFETCHCL1) JMP(POSTACCUM) //Alternate main loop, with no prefetching of C //Used when <= 40 iterations LABEL(CONSIDER_UNDER_40) MOV(RSI, VAR(k)) TEST(RSI, RSI) JZ(POSTACCUM) LABEL(LOOP_UNDER_40) MAIN_LOOP(RSI) JNZ(LOOP_UNDER_40) LABEL(POSTACCUM) #ifdef MONITORS RDTSC MOV(VAR(mid2l), EAX) MOV(VAR(mid2h), EDX) #endif MOV(R(9), VAR(c)) //load address of c for update MOV(R(12), VAR(alpha)) //load address of alpha // Check if C is row stride. 
If not, jump to the slow scattered update MOV(R(14), VAR(cs_c)) DEC(R(14)) JNZ(SCATTEREDUPDATE) MOV(R(14), VAR(beta)) VBROADCASTSS(ZMM(31), MEM(R(14))) MOV(EBX, MEM(R(14))) TEST(EBX, EBX) JZ(COLSTORBZ) UPDATE_C_4_ROWS( 0, 1, 2, 3) UPDATE_C_4_ROWS( 4, 5, 6, 7) UPDATE_C_4_ROWS( 8, 9,10,11) UPDATE_C_4_ROWS(12,13,14,15) UPDATE_C_4_ROWS(16,17,18,19) UPDATE_C_4_ROWS(20,21,22,23) UPDATE_C_4_ROWS(24,25,26,27) UPDATE_C_2_ROWS(28,29) JMP(END) LABEL(COLSTORBZ) UPDATE_C_BZ_4_ROWS( 0, 1, 2, 3) UPDATE_C_BZ_4_ROWS( 4, 5, 6, 7) UPDATE_C_BZ_4_ROWS( 8, 9,10,11) UPDATE_C_BZ_4_ROWS(12,13,14,15) UPDATE_C_BZ_4_ROWS(16,17,18,19) UPDATE_C_BZ_4_ROWS(20,21,22,23) UPDATE_C_BZ_4_ROWS(24,25,26,27) UPDATE_C_BZ_2_ROWS(28,29) JMP(END) LABEL(SCATTEREDUPDATE) MOV(R(13), VAR(beta)) MOV(R(10), VAR(offsetPtr)) VMOVAPS(ZMM(30), MEM(R(10))) MOV(EBX, MEM(R(13))) /* Note that this ignores the upper 32 bits in cs_c */ VPBROADCASTD(ZMM(31), VAR(cs_c)) VPMULLD(ZMM(30), ZMM(31), ZMM(30)) TEST(EBX, EBX) JZ(SCATTERBZ) UPDATE_C_ROW_SCATTERED( 0) UPDATE_C_ROW_SCATTERED( 1) UPDATE_C_ROW_SCATTERED( 2) UPDATE_C_ROW_SCATTERED( 3) UPDATE_C_ROW_SCATTERED( 4) UPDATE_C_ROW_SCATTERED( 5) UPDATE_C_ROW_SCATTERED( 6) UPDATE_C_ROW_SCATTERED( 7) UPDATE_C_ROW_SCATTERED( 8) UPDATE_C_ROW_SCATTERED( 9) UPDATE_C_ROW_SCATTERED(10) UPDATE_C_ROW_SCATTERED(11) UPDATE_C_ROW_SCATTERED(12) UPDATE_C_ROW_SCATTERED(13) UPDATE_C_ROW_SCATTERED(14) UPDATE_C_ROW_SCATTERED(15) UPDATE_C_ROW_SCATTERED(16) UPDATE_C_ROW_SCATTERED(17) UPDATE_C_ROW_SCATTERED(18) UPDATE_C_ROW_SCATTERED(19) UPDATE_C_ROW_SCATTERED(20) UPDATE_C_ROW_SCATTERED(21) UPDATE_C_ROW_SCATTERED(22) UPDATE_C_ROW_SCATTERED(23) UPDATE_C_ROW_SCATTERED(24) UPDATE_C_ROW_SCATTERED(25) UPDATE_C_ROW_SCATTERED(26) UPDATE_C_ROW_SCATTERED(27) UPDATE_C_ROW_SCATTERED(28) UPDATE_C_ROW_SCATTERED(29) JMP(END) LABEL(SCATTERBZ) UPDATE_C_BZ_ROW_SCATTERED( 0) UPDATE_C_BZ_ROW_SCATTERED( 1) UPDATE_C_BZ_ROW_SCATTERED( 2) UPDATE_C_BZ_ROW_SCATTERED( 3) UPDATE_C_BZ_ROW_SCATTERED( 4) UPDATE_C_BZ_ROW_SCATTERED( 
5) UPDATE_C_BZ_ROW_SCATTERED( 6) UPDATE_C_BZ_ROW_SCATTERED( 7) UPDATE_C_BZ_ROW_SCATTERED( 8) UPDATE_C_BZ_ROW_SCATTERED( 9) UPDATE_C_BZ_ROW_SCATTERED(10) UPDATE_C_BZ_ROW_SCATTERED(11) UPDATE_C_BZ_ROW_SCATTERED(12) UPDATE_C_BZ_ROW_SCATTERED(13) UPDATE_C_BZ_ROW_SCATTERED(14) UPDATE_C_BZ_ROW_SCATTERED(15) UPDATE_C_BZ_ROW_SCATTERED(16) UPDATE_C_BZ_ROW_SCATTERED(17) UPDATE_C_BZ_ROW_SCATTERED(18) UPDATE_C_BZ_ROW_SCATTERED(19) UPDATE_C_BZ_ROW_SCATTERED(20) UPDATE_C_BZ_ROW_SCATTERED(21) UPDATE_C_BZ_ROW_SCATTERED(22) UPDATE_C_BZ_ROW_SCATTERED(23) UPDATE_C_BZ_ROW_SCATTERED(24) UPDATE_C_BZ_ROW_SCATTERED(25) UPDATE_C_BZ_ROW_SCATTERED(26) UPDATE_C_BZ_ROW_SCATTERED(27) UPDATE_C_BZ_ROW_SCATTERED(28) UPDATE_C_BZ_ROW_SCATTERED(29) LABEL(END) #ifdef MONITORS RDTSC MOV(VAR(botl), EAX) MOV(VAR(both), EDX) #endif : // output operands #ifdef MONITORS [topl] "=m" (topl), [toph] "=m" (toph), [midl] "=m" (midl), [midh] "=m" (midh), [mid2l] "=m" (mid2l), [mid2h] "=m" (mid2h), [botl] "=m" (botl), [both] "=m" (both) #endif : // input operands [k] "m" (k), [a] "m" (a), [b] "m" (b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c), [a_next] "m" (a_next), [b_next] "m" (b_next), [offsetPtr] "m" (offsetPtr) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" ); #ifdef LOOPMON printf("looptime = \t%d\n", bloopl - tloopl); #endif #ifdef MONITORS dim_t top = ((dim_t)toph << 32) | topl; dim_t mid = ((dim_t)midh << 32) | midl; dim_t mid2 = ((dim_t)mid2h << 32) | mid2l; dim_t bot = ((dim_t)both << 32) | botl; printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot 
- top); #endif } cython-blis-0.9.1/blis/_src/kernels/knl/bli_kernels_knl.h000066400000000000000000000042051427272030600233740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ GEMM_UKR_PROT( float, s, gemm_knl_asm_24x16 ) GEMM_UKR_PROT( double, d, gemm_knl_asm_24x8 ) PACKM_KER_PROT( float, s, packm_knl_asm_24xk ) PACKM_KER_PROT( float, s, packm_knl_asm_16xk ) PACKM_KER_PROT( double, d, packm_knl_asm_24xk ) PACKM_KER_PROT( double, d, packm_knl_asm_8xk ) // unused: GEMM_UKR_PROT( double, d, gemm_knl_asm_12x16 ) GEMM_UKR_PROT( double, d, gemm_knl_asm_30x8 ) GEMM_UKR_PROT( double, d, gemm_knl_asm_8x24 ) PACKM_KER_PROT( double, d, packm_knl_asm_30xk ) cython-blis-0.9.1/blis/_src/kernels/old/000077500000000000000000000000001427272030600200575ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/old/c99/000077500000000000000000000000001427272030600204635ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/old/c99/3/000077500000000000000000000000001427272030600206255ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/old/c99/3/bli_gemm_c99_4x4.c000066400000000000000000000142671427272030600237410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, kername ) \ \ void PASTEMAC(ch,kername) \ ( \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ ctype a0; \ ctype a1; \ ctype a2; \ ctype a3; \ \ ctype b0, b1, b2, b3; \ \ ctype ab00, ab01, ab02, ab03; \ ctype ab10, ab11, ab12, ab13; \ ctype ab20, ab21, ab22, ab23; \ ctype ab30, ab31, ab32, ab33; \ \ ctype* c00, * c01, * c02, * c03; \ ctype* c10, * c11, * c12, * c13; \ ctype* c20, * c21, * c22, * c23; \ ctype* c30, * c31, * c32, * c33; \ \ dim_t i; \ \ \ c00 = (c + 0*rs_c + 0*cs_c); \ c10 = (c + 1*rs_c + 0*cs_c); \ c20 = (c + 2*rs_c + 0*cs_c); \ c30 = (c + 3*rs_c + 0*cs_c); \ \ c01 = (c + 0*rs_c + 1*cs_c); \ c11 = (c + 1*rs_c + 1*cs_c); \ c21 = (c + 2*rs_c + 1*cs_c); \ c31 = (c + 3*rs_c + 1*cs_c); \ \ c02 = (c + 0*rs_c + 2*cs_c); \ c12 = (c + 1*rs_c + 2*cs_c); \ c22 = (c + 2*rs_c + 2*cs_c); \ c32 = (c + 3*rs_c + 2*cs_c); \ \ c03 = (c + 0*rs_c + 3*cs_c); \ c13 = (c + 1*rs_c + 3*cs_c); \ c23 = (c + 2*rs_c + 3*cs_c); \ c33 = (c + 3*rs_c + 3*cs_c); \ \ PASTEMAC(ch,set0s)( ab00 ); \ PASTEMAC(ch,set0s)( ab10 ); 
\ PASTEMAC(ch,set0s)( ab20 ); \ PASTEMAC(ch,set0s)( ab30 ); \ \ PASTEMAC(ch,set0s)( ab01 ); \ PASTEMAC(ch,set0s)( ab11 ); \ PASTEMAC(ch,set0s)( ab21 ); \ PASTEMAC(ch,set0s)( ab31 ); \ \ PASTEMAC(ch,set0s)( ab02 ); \ PASTEMAC(ch,set0s)( ab12 ); \ PASTEMAC(ch,set0s)( ab22 ); \ PASTEMAC(ch,set0s)( ab32 ); \ \ PASTEMAC(ch,set0s)( ab03 ); \ PASTEMAC(ch,set0s)( ab13 ); \ PASTEMAC(ch,set0s)( ab23 ); \ PASTEMAC(ch,set0s)( ab33 ); \ \ for ( i = 0; i < k; ++i ) \ { \ a0 = *(a + 0); \ a1 = *(a + 1); \ a2 = *(a + 2); \ a3 = *(a + 3); \ \ b0 = *(b + 0); \ b1 = *(b + 1); \ b2 = *(b + 2); \ b3 = *(b + 3); \ \ PASTEMAC(ch,dots)( a0, b0, ab00 ); \ PASTEMAC(ch,dots)( a1, b0, ab10 ); \ PASTEMAC(ch,dots)( a2, b0, ab20 ); \ PASTEMAC(ch,dots)( a3, b0, ab30 ); \ \ PASTEMAC(ch,dots)( a0, b1, ab01 ); \ PASTEMAC(ch,dots)( a1, b1, ab11 ); \ PASTEMAC(ch,dots)( a2, b1, ab21 ); \ PASTEMAC(ch,dots)( a3, b1, ab31 ); \ \ PASTEMAC(ch,dots)( a0, b2, ab02 ); \ PASTEMAC(ch,dots)( a1, b2, ab12 ); \ PASTEMAC(ch,dots)( a2, b2, ab22 ); \ PASTEMAC(ch,dots)( a3, b2, ab32 ); \ \ PASTEMAC(ch,dots)( a0, b3, ab03 ); \ PASTEMAC(ch,dots)( a1, b3, ab13 ); \ PASTEMAC(ch,dots)( a2, b3, ab23 ); \ PASTEMAC(ch,dots)( a3, b3, ab33 ); \ \ a += 4; \ b += 4; \ } \ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,set0s)( *c00 ); \ PASTEMAC(ch,set0s)( *c10 ); \ PASTEMAC(ch,set0s)( *c20 ); \ PASTEMAC(ch,set0s)( *c30 ); \ \ PASTEMAC(ch,set0s)( *c01 ); \ PASTEMAC(ch,set0s)( *c11 ); \ PASTEMAC(ch,set0s)( *c21 ); \ PASTEMAC(ch,set0s)( *c31 ); \ \ PASTEMAC(ch,set0s)( *c02 ); \ PASTEMAC(ch,set0s)( *c12 ); \ PASTEMAC(ch,set0s)( *c22 ); \ PASTEMAC(ch,set0s)( *c32 ); \ \ PASTEMAC(ch,set0s)( *c03 ); \ PASTEMAC(ch,set0s)( *c13 ); \ PASTEMAC(ch,set0s)( *c23 ); \ PASTEMAC(ch,set0s)( *c33 ); \ } \ else \ { \ PASTEMAC(ch,scals)( *beta, *c00 ); \ PASTEMAC(ch,scals)( *beta, *c10 ); \ PASTEMAC(ch,scals)( *beta, *c20 ); \ PASTEMAC(ch,scals)( *beta, *c30 ); \ \ PASTEMAC(ch,scals)( *beta, *c01 ); \ PASTEMAC(ch,scals)( *beta, *c11 ); \ 
PASTEMAC(ch,scals)( *beta, *c21 ); \ PASTEMAC(ch,scals)( *beta, *c31 ); \ \ PASTEMAC(ch,scals)( *beta, *c02 ); \ PASTEMAC(ch,scals)( *beta, *c12 ); \ PASTEMAC(ch,scals)( *beta, *c22 ); \ PASTEMAC(ch,scals)( *beta, *c32 ); \ \ PASTEMAC(ch,scals)( *beta, *c03 ); \ PASTEMAC(ch,scals)( *beta, *c13 ); \ PASTEMAC(ch,scals)( *beta, *c23 ); \ PASTEMAC(ch,scals)( *beta, *c33 ); \ } \ \ PASTEMAC(ch,dots)( *alpha, ab00, *c00 ); \ PASTEMAC(ch,dots)( *alpha, ab10, *c10 ); \ PASTEMAC(ch,dots)( *alpha, ab20, *c20 ); \ PASTEMAC(ch,dots)( *alpha, ab30, *c30 ); \ \ PASTEMAC(ch,dots)( *alpha, ab01, *c01 ); \ PASTEMAC(ch,dots)( *alpha, ab11, *c11 ); \ PASTEMAC(ch,dots)( *alpha, ab21, *c21 ); \ PASTEMAC(ch,dots)( *alpha, ab31, *c31 ); \ \ PASTEMAC(ch,dots)( *alpha, ab02, *c02 ); \ PASTEMAC(ch,dots)( *alpha, ab12, *c12 ); \ PASTEMAC(ch,dots)( *alpha, ab22, *c22 ); \ PASTEMAC(ch,dots)( *alpha, ab32, *c32 ); \ \ PASTEMAC(ch,dots)( *alpha, ab03, *c03 ); \ PASTEMAC(ch,dots)( *alpha, ab13, *c13 ); \ PASTEMAC(ch,dots)( *alpha, ab23, *c23 ); \ PASTEMAC(ch,dots)( *alpha, ab33, *c33 ); \ } INSERT_GENTFUNC_BASIC0( gemm_c99_4x4 ) cython-blis-0.9.1/blis/_src/kernels/old/c99/3/bli_gemmtrsm_l_c99_4x4.c000066400000000000000000000053701427272030600251550ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname, gemmkerid, trsmkerid ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a10, \ ctype* restrict a11, \ ctype* restrict b01, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ const inc_t rs_b = 4; \ const inc_t cs_b = 1; \ \ ctype* minus_one = PASTEMAC(ch,m1); \ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, gemmkerid, cntx ); \ PASTECH(ch,trsm_ukr_ft) \ trsm_ukr = bli_cntx_get_l3_ukr_dt( dt, trsmkerid, cntx ); \ \ gemm_ukr \ ( \ k, \ minus_one, \ a10, \ b01, \ alpha, \ b11, rs_b, cs_b, \ data, \ cntx \ ); \ \ trsm_ukr \ ( \ a11, \ b11, \ c11, rs_c, cs_c, \ data, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC2( gemmtrsm_l_c99_4x4, BLIS_GEMM_UKR, BLIS_TRSM_L_UKR ) 
cython-blis-0.9.1/blis/_src/kernels/old/c99/3/bli_gemmtrsm_u_c99_4x4.c000066400000000000000000000053701427272030600251660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname, gemmkerid, trsmkerid ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a12, \ ctype* restrict a11, \ ctype* restrict b21, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ const inc_t rs_b = 4; \ const inc_t cs_b = 1; \ \ ctype* minus_one = PASTEMAC(ch,m1); \ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_ukr_dt( dt, gemmkerid, cntx ); \ PASTECH(ch,trsm_ukr_ft) \ trsm_ukr = bli_cntx_get_l3_ukr_dt( dt, trsmkerid, cntx ); \ \ gemm_ukr \ ( \ k, \ minus_one, \ a12, \ b21, \ alpha, \ b11, rs_b, cs_b, \ data, \ cntx \ ); \ \ trsm_ukr \ ( \ a11, \ b11, \ c11, rs_c, cs_c, \ data, \ cntx \ ); \ } INSERT_GENTFUNC_BASIC2( gemmtrsm_u_c99_4x4, BLIS_GEMM_UKR, BLIS_TRSM_U_UKR ) cython-blis-0.9.1/blis/_src/kernels/old/c99/3/bli_trsm_l_c99_4x4.c000066400000000000000000000135231427272030600243060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const dim_t rs_a = 1; \ const dim_t cs_a = 4; \ \ const dim_t rs_b = 4; \ const dim_t cs_b = 1; \ \ ctype a00; \ ctype a10, a11; \ ctype a20, a21, a22; \ ctype a30, a31, a32, a33; \ \ ctype b00, b01, b02, b03; \ ctype b10, b11, b12, b13; \ ctype b20, b21, b22, b23; \ ctype b30, b31, b32, b33; \ \ \ /* Load contents of B. 
*/ \ \ b00 = *(b + 0*rs_b + 0*cs_b); \ b01 = *(b + 0*rs_b + 1*cs_b); \ b02 = *(b + 0*rs_b + 2*cs_b); \ b03 = *(b + 0*rs_b + 3*cs_b); \ \ b10 = *(b + 1*rs_b + 0*cs_b); \ b11 = *(b + 1*rs_b + 1*cs_b); \ b12 = *(b + 1*rs_b + 2*cs_b); \ b13 = *(b + 1*rs_b + 3*cs_b); \ \ b20 = *(b + 2*rs_b + 0*cs_b); \ b21 = *(b + 2*rs_b + 1*cs_b); \ b22 = *(b + 2*rs_b + 2*cs_b); \ b23 = *(b + 2*rs_b + 3*cs_b); \ \ b30 = *(b + 3*rs_b + 0*cs_b); \ b31 = *(b + 3*rs_b + 1*cs_b); \ b32 = *(b + 3*rs_b + 2*cs_b); \ b33 = *(b + 3*rs_b + 3*cs_b); \ \ \ /* iteration 0 */ \ \ a00 = *(a + 0*rs_a + 0*cs_a); \ \ PASTEMAC(ch,scals)( a00, b00 ); \ PASTEMAC(ch,scals)( a00, b01 ); \ PASTEMAC(ch,scals)( a00, b02 ); \ PASTEMAC(ch,scals)( a00, b03 ); \ \ *(b + 0*rs_b + 0*cs_b) = b00; \ *(b + 0*rs_b + 1*cs_b) = b01; \ *(b + 0*rs_b + 2*cs_b) = b02; \ *(b + 0*rs_b + 3*cs_b) = b03; \ \ *(c + 0*rs_c + 0*cs_c) = b00; \ *(c + 0*rs_c + 1*cs_c) = b01; \ *(c + 0*rs_c + 2*cs_c) = b02; \ *(c + 0*rs_c + 3*cs_c) = b03; \ \ \ /* iteration 1 */ \ \ a10 = *(a + 1*rs_a + 0*cs_a); \ a11 = *(a + 1*rs_a + 1*cs_a); \ \ PASTEMAC(ch,axmys)( a10, b00, b10 ); \ PASTEMAC(ch,axmys)( a10, b01, b11 ); \ PASTEMAC(ch,axmys)( a10, b02, b12 ); \ PASTEMAC(ch,axmys)( a10, b03, b13 ); \ \ PASTEMAC(ch,scals)( a11, b10 ); \ PASTEMAC(ch,scals)( a11, b11 ); \ PASTEMAC(ch,scals)( a11, b12 ); \ PASTEMAC(ch,scals)( a11, b13 ); \ \ *(b + 1*rs_b + 0*cs_b) = b10; \ *(b + 1*rs_b + 1*cs_b) = b11; \ *(b + 1*rs_b + 2*cs_b) = b12; \ *(b + 1*rs_b + 3*cs_b) = b13; \ \ *(c + 1*rs_c + 0*cs_c) = b10; \ *(c + 1*rs_c + 1*cs_c) = b11; \ *(c + 1*rs_c + 2*cs_c) = b12; \ *(c + 1*rs_c + 3*cs_c) = b13; \ \ \ /* iteration 2 */ \ \ a20 = *(a + 2*rs_a + 0*cs_a); \ a21 = *(a + 2*rs_a + 1*cs_a); \ a22 = *(a + 2*rs_a + 2*cs_a); \ \ PASTEMAC(ch,axmys)( a20, b00, b20 ); \ PASTEMAC(ch,axmys)( a20, b01, b21 ); \ PASTEMAC(ch,axmys)( a20, b02, b22 ); \ PASTEMAC(ch,axmys)( a20, b03, b23 ); \ \ PASTEMAC(ch,axmys)( a21, b10, b20 ); \ PASTEMAC(ch,axmys)( a21, b11, b21 ); \ 
PASTEMAC(ch,axmys)( a21, b12, b22 ); \ PASTEMAC(ch,axmys)( a21, b13, b23 ); \ \ PASTEMAC(ch,scals)( a22, b20 ); \ PASTEMAC(ch,scals)( a22, b21 ); \ PASTEMAC(ch,scals)( a22, b22 ); \ PASTEMAC(ch,scals)( a22, b23 ); \ \ *(b + 2*rs_b + 0*cs_b) = b20; \ *(b + 2*rs_b + 1*cs_b) = b21; \ *(b + 2*rs_b + 2*cs_b) = b22; \ *(b + 2*rs_b + 3*cs_b) = b23; \ \ *(c + 2*rs_c + 0*cs_c) = b20; \ *(c + 2*rs_c + 1*cs_c) = b21; \ *(c + 2*rs_c + 2*cs_c) = b22; \ *(c + 2*rs_c + 3*cs_c) = b23; \ \ \ /* iteration 3 */ \ \ a30 = *(a + 3*rs_a + 0*cs_a); \ a31 = *(a + 3*rs_a + 1*cs_a); \ a32 = *(a + 3*rs_a + 2*cs_a); \ a33 = *(a + 3*rs_a + 3*cs_a); \ \ PASTEMAC(ch,axmys)( a30, b00, b30 ); \ PASTEMAC(ch,axmys)( a30, b01, b31 ); \ PASTEMAC(ch,axmys)( a30, b02, b32 ); \ PASTEMAC(ch,axmys)( a30, b03, b33 ); \ \ PASTEMAC(ch,axmys)( a31, b10, b30 ); \ PASTEMAC(ch,axmys)( a31, b11, b31 ); \ PASTEMAC(ch,axmys)( a31, b12, b32 ); \ PASTEMAC(ch,axmys)( a31, b13, b33 ); \ \ PASTEMAC(ch,axmys)( a32, b20, b30 ); \ PASTEMAC(ch,axmys)( a32, b21, b31 ); \ PASTEMAC(ch,axmys)( a32, b22, b32 ); \ PASTEMAC(ch,axmys)( a32, b23, b33 ); \ \ PASTEMAC(ch,scals)( a33, b30 ); \ PASTEMAC(ch,scals)( a33, b31 ); \ PASTEMAC(ch,scals)( a33, b32 ); \ PASTEMAC(ch,scals)( a33, b33 ); \ \ *(b + 3*rs_b + 0*cs_b) = b30; \ *(b + 3*rs_b + 1*cs_b) = b31; \ *(b + 3*rs_b + 2*cs_b) = b32; \ *(b + 3*rs_b + 3*cs_b) = b33; \ \ *(c + 3*rs_c + 0*cs_c) = b30; \ *(c + 3*rs_c + 1*cs_c) = b31; \ *(c + 3*rs_c + 2*cs_c) = b32; \ *(c + 3*rs_c + 3*cs_c) = b33; \ } INSERT_GENTFUNC_BASIC0( trsm_l_c99_4x4 ) cython-blis-0.9.1/blis/_src/kernels/old/c99/3/bli_trsm_u_c99_4x4.c000066400000000000000000000135611427272030600243210ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ void PASTEMAC(ch,opname) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const dim_t rs_a = 1; \ const dim_t cs_a = 4; \ \ const dim_t rs_b = 4; \ const dim_t cs_b = 1; \ \ ctype a00, a01, a02, a03; \ ctype a11, a12, a13; \ ctype a22, a23; \ ctype a33; \ \ ctype b00, b01, b02, b03; \ ctype b10, b11, b12, b13; \ ctype b20, b21, b22, b23; \ ctype b30, b31, b32, b33; \ \ \ /* Load contents of B. */ \ \ b00 = *(b + 0*rs_b + 0*cs_b); \ b01 = *(b + 0*rs_b + 1*cs_b); \ b02 = *(b + 0*rs_b + 2*cs_b); \ b03 = *(b + 0*rs_b + 3*cs_b); \ \ b10 = *(b + 1*rs_b + 0*cs_b); \ b11 = *(b + 1*rs_b + 1*cs_b); \ b12 = *(b + 1*rs_b + 2*cs_b); \ b13 = *(b + 1*rs_b + 3*cs_b); \ \ b20 = *(b + 2*rs_b + 0*cs_b); \ b21 = *(b + 2*rs_b + 1*cs_b); \ b22 = *(b + 2*rs_b + 2*cs_b); \ b23 = *(b + 2*rs_b + 3*cs_b); \ \ b30 = *(b + 3*rs_b + 0*cs_b); \ b31 = *(b + 3*rs_b + 1*cs_b); \ b32 = *(b + 3*rs_b + 2*cs_b); \ b33 = *(b + 3*rs_b + 3*cs_b); \ \ \ /* iteration 0 */ \ \ a33 = *(a + 3*rs_a + 3*cs_a); \ \ PASTEMAC(ch,scals)( a33, b30 ); \ PASTEMAC(ch,scals)( a33, b31 ); \ PASTEMAC(ch,scals)( a33, b32 ); \ PASTEMAC(ch,scals)( a33, b33 ); \ \ *(b + 3*rs_b + 0*cs_b) = b30; \ *(b + 3*rs_b + 1*cs_b) = b31; \ *(b + 3*rs_b + 2*cs_b) = b32; \ *(b + 3*rs_b + 3*cs_b) = b33; \ \ *(c + 3*rs_c + 0*cs_c) = b30; \ *(c + 3*rs_c + 1*cs_c) = b31; \ *(c + 3*rs_c + 2*cs_c) = b32; \ *(c + 3*rs_c + 3*cs_c) = b33; \ \ \ /* iteration 1 */ \ \ a22 = *(a + 2*rs_a + 2*cs_a); \ a23 = *(a + 2*rs_a + 3*cs_a); \ \ PASTEMAC(ch,axmys)( a23, b30, b20 ); \ PASTEMAC(ch,axmys)( a23, b31, b21 ); \ PASTEMAC(ch,axmys)( a23, b32, b22 ); \ PASTEMAC(ch,axmys)( a23, b33, b23 ); \ \ PASTEMAC(ch,scals)( a22, b20 ); \ PASTEMAC(ch,scals)( a22, b21 ); \ PASTEMAC(ch,scals)( a22, b22 ); \ PASTEMAC(ch,scals)( a22, b23 ); \ \ *(b + 2*rs_b + 0*cs_b) = b20; \ *(b + 2*rs_b + 
1*cs_b) = b21; \ *(b + 2*rs_b + 2*cs_b) = b22; \ *(b + 2*rs_b + 3*cs_b) = b23; \ \ *(c + 2*rs_c + 0*cs_c) = b20; \ *(c + 2*rs_c + 1*cs_c) = b21; \ *(c + 2*rs_c + 2*cs_c) = b22; \ *(c + 2*rs_c + 3*cs_c) = b23; \ \ \ /* iteration 2 */ \ \ a11 = *(a + 1*rs_a + 1*cs_a); \ a12 = *(a + 1*rs_a + 2*cs_a); \ a13 = *(a + 1*rs_a + 3*cs_a); \ \ PASTEMAC(ch,axmys)( a12, b20, b10 ); \ PASTEMAC(ch,axmys)( a12, b21, b11 ); \ PASTEMAC(ch,axmys)( a12, b22, b12 ); \ PASTEMAC(ch,axmys)( a12, b23, b13 ); \ \ PASTEMAC(ch,axmys)( a13, b30, b10 ); \ PASTEMAC(ch,axmys)( a13, b31, b11 ); \ PASTEMAC(ch,axmys)( a13, b32, b12 ); \ PASTEMAC(ch,axmys)( a13, b33, b13 ); \ \ PASTEMAC(ch,scals)( a11, b10 ); \ PASTEMAC(ch,scals)( a11, b11 ); \ PASTEMAC(ch,scals)( a11, b12 ); \ PASTEMAC(ch,scals)( a11, b13 ); \ \ *(b + 1*rs_b + 0*cs_b) = b10; \ *(b + 1*rs_b + 1*cs_b) = b11; \ *(b + 1*rs_b + 2*cs_b) = b12; \ *(b + 1*rs_b + 3*cs_b) = b13; \ \ *(c + 1*rs_c + 0*cs_c) = b10; \ *(c + 1*rs_c + 1*cs_c) = b11; \ *(c + 1*rs_c + 2*cs_c) = b12; \ *(c + 1*rs_c + 3*cs_c) = b13; \ \ \ /* iteration 3 */ \ \ a00 = *(a + 0*rs_a + 0*cs_a); \ a01 = *(a + 0*rs_a + 1*cs_a); \ a02 = *(a + 0*rs_a + 2*cs_a); \ a03 = *(a + 0*rs_a + 3*cs_a); \ \ PASTEMAC(ch,axmys)( a01, b10, b00 ); \ PASTEMAC(ch,axmys)( a01, b11, b01 ); \ PASTEMAC(ch,axmys)( a01, b12, b02 ); \ PASTEMAC(ch,axmys)( a01, b13, b03 ); \ \ PASTEMAC(ch,axmys)( a02, b20, b00 ); \ PASTEMAC(ch,axmys)( a02, b21, b01 ); \ PASTEMAC(ch,axmys)( a02, b22, b02 ); \ PASTEMAC(ch,axmys)( a02, b23, b03 ); \ \ PASTEMAC(ch,axmys)( a03, b30, b00 ); \ PASTEMAC(ch,axmys)( a03, b31, b01 ); \ PASTEMAC(ch,axmys)( a03, b32, b02 ); \ PASTEMAC(ch,axmys)( a03, b33, b03 ); \ \ PASTEMAC(ch,scals)( a00, b00 ); \ PASTEMAC(ch,scals)( a00, b01 ); \ PASTEMAC(ch,scals)( a00, b02 ); \ PASTEMAC(ch,scals)( a00, b03 ); \ \ *(b + 0*rs_b + 0*cs_b) = b00; \ *(b + 0*rs_b + 1*cs_b) = b01; \ *(b + 0*rs_b + 2*cs_b) = b02; \ *(b + 0*rs_b + 3*cs_b) = b03; \ \ *(c + 0*rs_c + 0*cs_c) = b00; \ *(c + 0*rs_c + 1*cs_c) 
= b01; \ *(c + 0*rs_c + 2*cs_c) = b02; \ *(c + 0*rs_c + 3*cs_c) = b03; \ } INSERT_GENTFUNC_BASIC0( trsm_u_c99_4x4 ) cython-blis-0.9.1/blis/_src/kernels/old/c99/bli_kernels_c99.h000066400000000000000000000045061427272030600236160ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

/*
   Prototypes for the 4x4 C99 micro-kernels: gemm, gemmtrsm (lower and
   upper), and trsm (lower and upper).

   NOTE(review): every prototype macro below is invoked four times with
   the same single argument. Presumably there is meant to be one
   invocation per datatype -- confirm against the definitions of
   GEMM_UKR_PROT / GEMMTRSM_UKR_PROT / TRSM_UKR_PROT, which are not
   visible from this header.
*/

/* gemm micro-kernel. */
GEMM_UKR_PROT( gemm_c99_4x4 )
GEMM_UKR_PROT( gemm_c99_4x4 )
GEMM_UKR_PROT( gemm_c99_4x4 )
GEMM_UKR_PROT( gemm_c99_4x4 )

/* Fused gemm + trsm micro-kernels (lower, then upper). */
GEMMTRSM_UKR_PROT( gemmtrsm_l_c99_4x4 )
GEMMTRSM_UKR_PROT( gemmtrsm_l_c99_4x4 )
GEMMTRSM_UKR_PROT( gemmtrsm_l_c99_4x4 )
GEMMTRSM_UKR_PROT( gemmtrsm_l_c99_4x4 )

GEMMTRSM_UKR_PROT( gemmtrsm_u_c99_4x4 )
GEMMTRSM_UKR_PROT( gemmtrsm_u_c99_4x4 )
GEMMTRSM_UKR_PROT( gemmtrsm_u_c99_4x4 )
GEMMTRSM_UKR_PROT( gemmtrsm_u_c99_4x4 )

/* trsm micro-kernels (lower, then upper). */
TRSM_UKR_PROT( trsm_l_c99_4x4 )
TRSM_UKR_PROT( trsm_l_c99_4x4 )
TRSM_UKR_PROT( trsm_l_c99_4x4 )
TRSM_UKR_PROT( trsm_l_c99_4x4 )

TRSM_UKR_PROT( trsm_u_c99_4x4 )
TRSM_UKR_PROT( trsm_u_c99_4x4 )
TRSM_UKR_PROT( trsm_u_c99_4x4 )
TRSM_UKR_PROT( trsm_u_c99_4x4 )
cython-blis-0.9.1/blis/_src/kernels/old/loongson3a/000077500000000000000000000000001427272030600221415ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/old/loongson3a/3/000077500000000000000000000000001427272030600223035ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/old/loongson3a/3/bli_gemm_loongson3a_opt_d4x4.c000066400000000000000000000550131427272030600301150ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

/*
   Double-precision 4x4 gemm micro-kernel written in Loongson-3A inline
   assembly. It accumulates the 4x4 product of a and b over k steps in
   $f16-$f31 and then stores c := beta * c + alpha * (a * b), addressing
   c through rs_c and cs_c.

   Structure of the assembly:
     - setup:      load operands, zero the 16 accumulators, compute the
                   four column base addresses of c and the prefetch
                   addresses for a and b, preload the first k step;
     - .MainLoop:  k / 4 iterations, each unrolled over four k steps and
                   software-pipelined (the next step's values are loaded
                   while the current step is multiplied);
     - .Remain / .Remaink1: the k % 4 leftover steps, handled as an
                   optional pair of steps followed by an optional single
                   step;
     - .StoreC:    scale by alpha / beta and write the 16 results.

   Each k step consumes 4 doubles (32 bytes) from a and from b, fetched
   as two 16-byte gsLQC1 loads each.
   NOTE(review): gsLQC1 presumably requires 16-byte aligned addresses, so
   a and b are assumed to be suitably aligned packed panels -- confirm.

   k is additionally passed to the assembly as "kc" and is only used to
   derive the prefetch distances. data and cntx are unused.
*/
void bli_dgemm_loongson3a_opt_4x4
     (
       dim_t               k,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	// Number of 4x-unrolled main-loop iterations and leftover k steps.
	uint64_t k_iter = k / 4;
	uint64_t k_left = k % 4;

	__asm__ volatile
	(
	//General purpose registers
	//
	//$8=k_iter, $9=k_left
	//$10=a address, $11=b address
	//$12=prefetch a, $13=prefetch b
	//$14=rs_c, $15=cs_c,
	//
	//$16=c00 address, $17=c01 address,
	//$18=c02 address, $19=c03 address,
	//
	//Floating-point registers
	//
	//$f0=a0, $f1=a1, $f2=a2, $f3=a3
	//$f4=next_a0, $f5=next_a1, $f6=next_a2, $f7=next_a3
	//
	//$f8=b0, $f9=b1, $f10=b2, $f11=b3
	//$f12=next_b0, $f13=next_b1, $f14=next_b2, $f15=next_b3
	//
	//$f16=a0b0, $f17=a0b1, $f18=a0b2, $f19=a0b3
	//$f20=a1b0, $f21=a1b1, $f22=a1b2, $f23=a1b3
	//$f24=a2b0, $f25=a2b1, $f26=a2b2, $f27=a2b3
	//$f28=a3b0, $f29=a3b1, $f30=a3b2, $f31=a3b3
	//
	//Setup: operand loads are interleaved with accumulator zeroing.
	"ld $8, %0 \n\t" //load k_iter
	"dmtc1 $0, $f16 \n\t" //Init
	"ld $9, %1 \n\t" //load k_left
	"dmtc1 $0, $f17 \n\t" //Init
	"ld $14, %7 \n\t" //load rs_c
	"dmtc1 $0, $f18 \n\t" //Init
	"ld $15, %8 \n\t" //load cs_c
	"dmtc1 $0, $f19 \n\t" //Init
	"ld $16, %6 \n\t" //load c
	"dmtc1 $0, $f20 \n\t" //Init
	"ld $10, %2 \n\t" //load a
	"dmtc1 $0, $f21 \n\t" //Init
	"ld $11, %3 \n\t" //load b
	"dmtc1 $0, $f22 \n\t" //Init
	"dsll $14, $14, 3 \n\t" //rs_c * sizeof(double)
	"dmtc1 $0, $f23 \n\t" //Init
	"dsll $15, $15, 3 \n\t" //cs_c * sizeof(double)
	"dmtc1 $0, $f24 \n\t" //Init
	"dadd $17, $16, $15 \n\t" //c01 address
	"ld $12, %9 \n\t" //load kc
	"dmtc1 $0, $f25 \n\t" //Init
	"dmtc1 $0, $f26 \n\t" //Init
	"dadd $18, $17, $15 \n\t" //c02 address
	"dsll $13, $12, 5 \n\t" //B prefetch distance= next panel B(nr*kc = kc*4*8bytes = kc<<5)
	"dmtc1 $0, $f27 \n\t" //Init
	"dmtc1 $0, $f28 \n\t" //Init
	"dadd $19, $18, $15 \n\t" //c03 address
	"dsll $12, $12, 4 \n\t" //A prefetch distance= panel A/2(mr*kc/2 = kc*4*8bytes/2 = kc<<4)
	"dmtc1 $0, $f29 \n\t" //Init
	"dmtc1 $0, $f30 \n\t" //Init
	"dadd $13, $11, $13 \n\t" //B prefetch address
	"ld $0, 0($16) \n\t" //prefetch c00
	"dmtc1 $0, $f31 \n\t" //Init
	"dadd $12, $10, $12 \n\t" //A prefetch address
	"ld $0, 0($17) \n\t" //prefetch c01
	"gsLQC1 $f1, $f0, 0($10) \n\t" //load 2 values from a
	"gsLQC1 $f9, $f8, 0($11) \n\t" //load 2 values from b
	"gsLQC1 $f3, $f2, 1*16($10) \n\t" //load 2 values from a
	"gsLQC1 $f11, $f10, 1*16($11) \n\t" //load 2 values from b
	"ld $0, 0($18) \n\t" //prefetch c02
	"ld $0, 0($19) \n\t" //prefetch c03
	"beqz $8, .Remain \n\t"
	".align 4 \n\t"
	".MainLoop: \n\t"
	" \n\t" //iteration 0
	"daddiu $8, $8, -1 \n\t" //k_iter--
	"gsLQC1 $f5, $f4, 2*16($10) \n\t" //load next 2 values from a
	"madd.d $f16, $f16, $f0, $f8 \n\t" //a0b0
	"madd.d $f20, $f20, $f1, $f8 \n\t" //a1b0
	" \n\t"
	"gsLQC1 $f13, $f12, 2*16($11) \n\t" //load next 2 values from b
	"madd.d $f17, $f17, $f0, $f9 \n\t" //a0b1
	"madd.d $f21, $f21, $f1, $f9 \n\t" //a1b1
	" \n\t"
	"gsLQC1 $f7, $f6, 3*16($10) \n\t" //load next 2 values from a
	"madd.d $f24, $f24, $f2, $f8 \n\t" //a2b0
	"madd.d $f28, $f28, $f3, $f8 \n\t" //a3b0
	" \n\t"
	"gsLQC1 $f15, $f14, 3*16($11) \n\t" //load next 2 values from b
	"madd.d $f25, $f25, $f2, $f9 \n\t" //a2b1
	"madd.d $f29, $f29, $f3, $f9 \n\t" //a3b1
	" \n\t"
	"ld $0, 0($13) \n\t" //prefetch B
	"madd.d $f18, $f18, $f0, $f10 \n\t" //a0b2
	"madd.d $f22, $f22, $f1, $f10 \n\t" //a1b2
	" \n\t"
	"madd.d $f19, $f19, $f0, $f11 \n\t" //a0b3
	"madd.d $f23, $f23, $f1, $f11 \n\t" //a1b3
	" \n\t"
	"ld $0, 0($12) \n\t" //prefetch A
	"madd.d $f26, $f26, $f2, $f10 \n\t" //a2b2
	"madd.d $f30, $f30, $f3, $f10 \n\t" //a3b2
	" \n\t"
	"madd.d $f27, $f27, $f2, $f11 \n\t" //a2b3
	"madd.d $f31, $f31, $f3, $f11 \n\t" //a3b3
	" \n\t" //iteration 1
	"gsLQC1 $f1, $f0, 4*16($10) \n\t" //load next 2 values from a
	"madd.d $f16, $f16, $f4, $f12 \n\t" //a0b0
	"madd.d $f20, $f20, $f5, $f12 \n\t" //a1b0
	" \n\t"
	"gsLQC1 $f9, $f8, 4*16($11) \n\t" //load next 2 values from b
	"madd.d $f17, $f17, $f4, $f13 \n\t" //a0b1
	"madd.d $f21, $f21, $f5, $f13 \n\t" //a1b1
	" \n\t"
	"gsLQC1 $f3, $f2, 5*16($10) \n\t" //load next 2 values from a
	"madd.d $f24, $f24, $f6, $f12 \n\t" //a2b0
	"madd.d $f28, $f28, $f7, $f12 \n\t" //a3b0
	" \n\t"
	"gsLQC1 $f11, $f10, 5*16($11) \n\t" //load next 2 values from b
	"madd.d $f25, $f25, $f6, $f13 \n\t" //a2b1
	"madd.d $f29, $f29, $f7, $f13 \n\t" //a3b1
	" \n\t"
	"ld $0, 4*8($13) \n\t" //prefetch B
	"madd.d $f18, $f18, $f4, $f14 \n\t" //a0b2
	"madd.d $f22, $f22, $f5, $f14 \n\t" //a1b2
	" \n\t"
	"madd.d $f19, $f19, $f4, $f15 \n\t" //a0b3
	"madd.d $f23, $f23, $f5, $f15 \n\t" //a1b3
	" \n\t"
	"ld $0, 4*8($12) \n\t" //prefetch A
	"madd.d $f26, $f26, $f6, $f14 \n\t" //a2b2
	"madd.d $f30, $f30, $f7, $f14 \n\t" //a3b2
	" \n\t"
	"madd.d $f27, $f27, $f6, $f15 \n\t" //a2b3
	"madd.d $f31, $f31, $f7, $f15 \n\t" //a3b3
	" \n\t" //iteration 2
	"gsLQC1 $f5, $f4, 6*16($10) \n\t" //load next 2 values from a
	"madd.d $f16, $f16, $f0, $f8 \n\t" //a0b0
	"madd.d $f20, $f20, $f1, $f8 \n\t" //a1b0
	" \n\t"
	"gsLQC1 $f13, $f12, 6*16($11) \n\t" //load next 2 values from b
	"madd.d $f17, $f17, $f0, $f9 \n\t" //a0b1
	"madd.d $f21, $f21, $f1, $f9 \n\t" //a1b1
	" \n\t"
	"gsLQC1 $f7, $f6, 7*16($10) \n\t" //load next 2 values from a
	"madd.d $f24, $f24, $f2, $f8 \n\t" //a2b0
	"madd.d $f28, $f28, $f3, $f8 \n\t" //a3b0
	"daddu $10, $10, 16*8 \n\t" //move A address
	" \n\t"
	"gsLQC1 $f15, $f14, 7*16($11) \n\t" //load next 2 values from b
	"madd.d $f25, $f25, $f2, $f9 \n\t" //a2b1
	"madd.d $f29, $f29, $f3, $f9 \n\t" //a3b1
	"daddu $11, $11, 16*8 \n\t" //move B address
	" \n\t"
	"ld $0, 8*8($13) \n\t" //prefetch B
	"madd.d $f18, $f18, $f0, $f10 \n\t" //a0b2
	"madd.d $f22, $f22, $f1, $f10 \n\t" //a1b2
	" \n\t"
	"madd.d $f19, $f19, $f0, $f11 \n\t" //a0b3
	"madd.d $f23, $f23, $f1, $f11 \n\t" //a1b3
	" \n\t"
	"ld $0, 8*8($12) \n\t" //prefetch A
	"madd.d $f26, $f26, $f2, $f10 \n\t" //a2b2
	"madd.d $f30, $f30, $f3, $f10 \n\t" //a3b2
	" \n\t"
	"madd.d $f27, $f27, $f2, $f11 \n\t" //a2b3
	"madd.d $f31, $f31, $f3, $f11 \n\t" //a3b3
	" \n\t" //iteration 3
	//The a/b pointers were already advanced in iteration 2, so offsets
	//0 and 1*16 below preload the first k step of the next pass.
	"gsLQC1 $f1, $f0, 0($10) \n\t" //load next 2 values from a
	"madd.d $f16, $f16, $f4, $f12 \n\t" //a0b0
	"madd.d $f20, $f20, $f5, $f12 \n\t" //a1b0
	" \n\t"
	"gsLQC1 $f9, $f8, 0($11) \n\t" //load next 2 values from b
	"madd.d $f17, $f17, $f4, $f13 \n\t" //a0b1
	"madd.d $f21, $f21, $f5, $f13 \n\t" //a1b1
	" \n\t"
	"gsLQC1 $f3, $f2, 1*16($10) \n\t" //load next 2 values from a
	"madd.d $f24, $f24, $f6, $f12 \n\t" //a2b0
	"madd.d $f28, $f28, $f7, $f12 \n\t" //a3b0
	" \n\t"
	"gsLQC1 $f11, $f10, 1*16($11) \n\t" //load next 2 values from b
	"madd.d $f25, $f25, $f6, $f13 \n\t" //a2b1
	"madd.d $f29, $f29, $f7, $f13 \n\t" //a3b1
	" \n\t"
	"ld $0, 12*8($13) \n\t" //prefetch B
	"madd.d $f18, $f18, $f4, $f14 \n\t" //a0b2
	"madd.d $f22, $f22, $f5, $f14 \n\t" //a1b2
	"daddu $13, $13, 16*8 \n\t" //move prefetch B address
	" \n\t"
	"madd.d $f19, $f19, $f4, $f15 \n\t" //a0b3
	"madd.d $f23, $f23, $f5, $f15 \n\t" //a1b3
	" \n\t"
	"ld $0, 12*8($12) \n\t" //prefetch A
	"madd.d $f26, $f26, $f6, $f14 \n\t" //a2b2
	"madd.d $f30, $f30, $f7, $f14 \n\t" //a3b2
	"daddu $12, $12, 16*8 \n\t" //move prefetch A address
	" \n\t"
	"madd.d $f27, $f27, $f6, $f15 \n\t" //a2b3
	"madd.d $f31, $f31, $f7, $f15 \n\t" //a3b3
	"bnez $8, .MainLoop \n\t"
	".align 4 \n\t"
	".Remain: \n\t" //deal with the tail. k%4
	"beqz $9, .StoreC \n\t"
	"andi $8, $9, 2 \n\t" //test bit 1 of k_left: two leftover steps?
	"nop \n\t"
	"nop \n\t"
	"beqz $8, .Remaink1 \n\t"
	"nop \n\t"
	" \n\t" // k%4=2
	"gsLQC1 $f5, $f4, 2*16($10) \n\t" //load next 2 values from a
	"madd.d $f16, $f16, $f0, $f8 \n\t" //a0b0
	"madd.d $f20, $f20, $f1, $f8 \n\t" //a1b0
	" \n\t"
	"gsLQC1 $f13, $f12, 2*16($11) \n\t" //load next 2 values from b
	"madd.d $f17, $f17, $f0, $f9 \n\t" //a0b1
	"madd.d $f21, $f21, $f1, $f9 \n\t" //a1b1
	" \n\t"
	"gsLQC1 $f7, $f6, 3*16($10) \n\t" //load next 2 values from a
	"madd.d $f24, $f24, $f2, $f8 \n\t" //a2b0
	"madd.d $f28, $f28, $f3, $f8 \n\t" //a3b0
	"daddu $10, $10, 8*8 \n\t" //move A address
	" \n\t"
	"gsLQC1 $f15, $f14, 3*16($11) \n\t" //load next 2 values from b
	"madd.d $f25, $f25, $f2, $f9 \n\t" //a2b1
	"madd.d $f29, $f29, $f3, $f9 \n\t" //a3b1
	"daddu $11, $11, 8*8 \n\t" //move B address
	" \n\t"
	"ld $0, 0($13) \n\t" //prefetch B
	"madd.d $f18, $f18, $f0, $f10 \n\t" //a0b2
	"madd.d $f22, $f22, $f1, $f10 \n\t" //a1b2
	" \n\t"
	"madd.d $f19, $f19, $f0, $f11 \n\t" //a0b3
	"madd.d $f23, $f23, $f1, $f11 \n\t" //a1b3
	" \n\t"
	"ld $0, 0($12) \n\t" //prefetch A
	"madd.d $f26, $f26, $f2, $f10 \n\t" //a2b2
	"madd.d $f30, $f30, $f3, $f10 \n\t" //a3b2
	" \n\t"
	"madd.d $f27, $f27, $f2, $f11 \n\t" //a2b3
	"madd.d $f31, $f31, $f3, $f11 \n\t" //a3b3
	" \n\t"
	"gsLQC1 $f1, $f0, 0*16($10) \n\t" //load next 2 values from a
	"madd.d $f16, $f16, $f4, $f12 \n\t" //a0b0
	"madd.d $f20, $f20, $f5, $f12 \n\t" //a1b0
	" \n\t"
	"gsLQC1 $f9, $f8, 0*16($11) \n\t" //load next 2 values from b
	"madd.d $f17, $f17, $f4, $f13 \n\t" //a0b1
	"madd.d $f21, $f21, $f5, $f13 \n\t" //a1b1
	" \n\t"
	"gsLQC1 $f3, $f2, 1*16($10) \n\t" //load next 2 values from a
	"madd.d $f24, $f24, $f6, $f12 \n\t" //a2b0
	"madd.d $f28, $f28, $f7, $f12 \n\t" //a3b0
	" \n\t"
	"gsLQC1 $f11, $f10, 1*16($11) \n\t" //load next 2 values from b
	"madd.d $f25, $f25, $f6, $f13 \n\t" //a2b1
	"madd.d $f29, $f29, $f7, $f13 \n\t" //a3b1
	" \n\t"
	"ld $0, 4*8($13) \n\t" //prefetch B
	"madd.d $f18, $f18, $f4, $f14 \n\t" //a0b2
	"madd.d $f22, $f22, $f5, $f14 \n\t" //a1b2
	" \n\t"
	"daddu $13, $13, 8*8 \n\t" //move prefetch B address
	"madd.d $f19, $f19, $f4, $f15 \n\t" //a0b3
	"madd.d $f23, $f23, $f5, $f15 \n\t" //a1b3
	" \n\t"
	"ld $0, 4*8($12) \n\t" //prefetch A
	"madd.d $f26, $f26, $f6, $f14 \n\t" //a2b2
	"madd.d $f30, $f30, $f7, $f14 \n\t" //a3b2
	" \n\t"
	"daddu $12, $12, 8*8 \n\t" //move prefetch A address
	"madd.d $f27, $f27, $f6, $f15 \n\t" //a2b3
	"madd.d $f31, $f31, $f7, $f15 \n\t" //a3b3
	".align 4 \n\t"
	".Remaink1: \n\t" // k%4=1
	"andi $8, $9, 1 \n\t" //test bit 0 of k_left: one leftover step?
	"beqz $8, .StoreC \n\t"
	"nop \n\t"
	" \n\t"
	"ld $0, 0($13) \n\t" //prefetch B
	"madd.d $f16, $f16, $f0, $f8 \n\t" //a0b0
	"madd.d $f20, $f20, $f1, $f8 \n\t" //a1b0
	" \n\t"
	"madd.d $f17, $f17, $f0, $f9 \n\t" //a0b1
	"madd.d $f21, $f21, $f1, $f9 \n\t" //a1b1
	" \n\t"
	"ld $0, 0($12) \n\t" //prefetch A
	"madd.d $f24, $f24, $f2, $f8 \n\t" //a2b0
	"madd.d $f28, $f28, $f3, $f8 \n\t" //a3b0
	" \n\t"
	"madd.d $f25, $f25, $f2, $f9 \n\t" //a2b1
	"madd.d $f29, $f29, $f3, $f9 \n\t" //a3b1
	" \n\t"
	"madd.d $f18, $f18, $f0, $f10 \n\t" //a0b2
	"madd.d $f22, $f22, $f1, $f10 \n\t" //a1b2
	" \n\t"
	"madd.d $f19, $f19, $f0, $f11 \n\t" //a0b3
	"madd.d $f23, $f23, $f1, $f11 \n\t" //a1b3
	" \n\t"
	"madd.d $f26, $f26, $f2, $f10 \n\t" //a2b2
	"madd.d $f30, $f30, $f3, $f10 \n\t" //a3b2
	" \n\t"
	"madd.d $f27, $f27, $f2, $f11 \n\t" //a2b3
	"madd.d $f31, $f31, $f3, $f11 \n\t" //a3b3
	".align 4 \n\t"
	".StoreC: \n\t" //Write C
	" \n\t" //$f14=alpha, $f15=beta
	" \n\t"
	//From here on $8-$13 and $f0-$f13 are reused as c element addresses
	//and c element values; the a/b pointers are no longer needed.
	"ld $8, %4 \n\t" //load alpha address
	"ld $9, %5 \n\t" //load beta address
	"ldc1 $f14, 0($8) \n\t" //load alpha
	"ldc1 $f15, 0($9) \n\t" //load beta
	" \n\t"
	"ldc1 $f0, 0($16) \n\t" //load c00
	"dadd $20, $16, $14 \n\t" //c10 address
	"ldc1 $f1, 0($17) \n\t" //load c01
	"dadd $21, $17, $14 \n\t" //c11 address
	"ldc1 $f2, 0($18) \n\t" //load c02
	"dadd $22, $18, $14 \n\t" //c12 address
	"ldc1 $f3, 0($19) \n\t" //load c03
	"dadd $23, $19, $14 \n\t" //c13 address
	" \n\t"
	"ldc1 $f4, 0($20) \n\t" //load c10
	"dadd $8, $20, $14 \n\t" //c20 address
	"mul.d $f0, $f0, $f15 \n\t" //c00 * beta
	"ldc1 $f5, 0($21) \n\t" //load c11
	"dadd $9, $21, $14 \n\t" //c21 address
	"mul.d $f1, $f1, $f15 \n\t" //c01 * beta
	"ldc1 $f6, 0($22) \n\t" //load c12
	"dadd $10, $22, $14 \n\t" //c22 address
	"mul.d $f2, $f2, $f15 \n\t" //c02 * beta
	"ldc1 $f7, 0($23) \n\t" //load c13
	"dadd $11, $23, $14 \n\t" //c23 address
	"mul.d $f3, $f3, $f15 \n\t" //c03 * beta
	" \n\t"
	"ldc1 $f8, 0($8) \n\t" //load c20
	"dadd $12, $8, $14 \n\t" //c30 address
	"mul.d $f4, $f4, $f15 \n\t" //c10 * beta
	"madd.d $f16, $f0, $f16, $f14\n\t" //c00+=alpha*a0b0
	"ldc1 $f9, 0($9) \n\t" //load c21
	"dadd $13, $9, $14 \n\t" //c31 address
	"mul.d $f5, $f5, $f15 \n\t" //c11 * beta
	"madd.d $f17, $f1, $f17, $f14\n\t" //c01+=alpha*a0b1
	"ldc1 $f10, 0($10) \n\t" //load c22
	"dadd $24, $10, $14 \n\t" //c32 address
	"mul.d $f6, $f6, $f15 \n\t" //c12 * beta
	"madd.d $f18, $f2, $f18, $f14\n\t" //c02+=alpha*a0b2
	"ldc1 $f11, 0($11) \n\t" //load c23
	"dadd $25, $11, $14 \n\t" //c33 address
	"mul.d $f7, $f7, $f15 \n\t" //c13 * beta
	"madd.d $f19, $f3, $f19, $f14\n\t" //c03+=alpha*a0b3
	" \n\t"
	"ldc1 $f12, 0($12) \n\t" //load c30
	"mul.d $f8, $f8, $f15 \n\t" //c20 * beta
	"madd.d $f20, $f4, $f20, $f14 \n\t" //c10+=alpha*a1b0
	"ldc1 $f13, 0($13) \n\t" //load c31
	"mul.d $f9, $f9, $f15 \n\t" //c21 * beta
	"madd.d $f21, $f5, $f21, $f14 \n\t" //c11+=alpha*a1b1
	"ldc1 $f0, 0($24) \n\t" //load c32
	"mul.d $f10, $f10, $f15 \n\t" //c22 * beta
	"madd.d $f22, $f6, $f22, $f14 \n\t" //c12+=alpha*a1b2
	"ldc1 $f1, 0($25) \n\t" //load c33
	"mul.d $f11, $f11, $f15 \n\t" //c23 * beta
	"madd.d $f23, $f7, $f23, $f14 \n\t" //c13+=alpha*a1b3
	" \n\t"
	"sdc1 $f16, 0($16) \n\t" //store c00
	"mul.d $f12, $f12, $f15 \n\t" //c30 * beta
	"madd.d $f24, $f8, $f24, $f14 \n\t" //c20+=alpha*a2b0
	"sdc1 $f17, 0($17) \n\t" //store c01
	"mul.d $f13, $f13, $f15 \n\t" //c31 * beta
	"madd.d $f25, $f9, $f25, $f14 \n\t" //c21+=alpha*a2b1
	"sdc1 $f18, 0($18) \n\t" //store c02
	"mul.d $f0, $f0, $f15 \n\t" //c32 * beta
	"madd.d $f26, $f10, $f26, $f14 \n\t" //c22+=alpha*a2b2
	"sdc1 $f19, 0($19) \n\t" //store c03
	"mul.d $f1, $f1, $f15 \n\t" //c33 * beta
	"madd.d $f27, $f11, $f27, $f14 \n\t" //c23+=alpha*a2b3
	" \n\t"
	"sdc1 $f20, 0($20) \n\t" //store c10
	"madd.d $f28, $f12, $f28, $f14 \n\t" //c30+=alpha*a3b0
	"sdc1 $f21, 0($21) \n\t" //store c11
	"madd.d $f29, $f13, $f29, $f14 \n\t" //c31+=alpha*a3b1
	"sdc1 $f22, 0($22) \n\t" //store c12
	"madd.d $f30, $f0, $f30, $f14 \n\t" //c32+=alpha*a3b2
	"sdc1 $f23, 0($23) \n\t" //store c13
	"madd.d $f31, $f1, $f31, $f14 \n\t" //c33+=alpha*a3b3
	" \n\t"
	"sdc1 $f24, 0($8) \n\t" //store c20
	"sdc1 $f25, 0($9) \n\t" //store c21
	"sdc1 $f26, 0($10) \n\t" //store c22
	"sdc1 $f27, 0($11) \n\t" //store c23
	" \n\t"
	"sdc1 $f28, 0($12) \n\t" //store c30
	"sdc1 $f29, 0($13) \n\t" //store c31
	"sdc1 $f30, 0($24) \n\t" //store c32
	"sdc1 $f31, 0($25) \n\t" //store c33
	" \n\t"

	://output operands (none)
	://input operands
	  //%0..%9 in this order; all are memory operands read with "ld"
	  "m" (k_iter),
	  "m" (k_left),
	  "m" (a),
	  "m" (b),
	  "m" (alpha),
	  "m" (beta),
	  "m" (c),
	  "m" (rs_c),
	  "m" (cs_c),
	  "m" (k)
	://register clobber list
	  //general purpose registers
	  "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15",
	  "$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23",
	  "$24", "$25",
	  //floating-point registers
	  "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7",
	  "$f8", "$f9", "$f10", "$f11", "$f12", "$f13", "$f14", "$f15",
	  "$f16", "$f17", "$f18", "$f19", "$f20", "$f21", "$f22", "$f23",
	  "$f24", "$f25", "$f26", "$f27", "$f28", "$f29", "$f30", "$f31",
	  "memory"
	);
}
cython-blis-0.9.1/blis/_src/kernels/old/nacl/000077500000000000000000000000001427272030600207745ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/old/nacl/pnacl/000077500000000000000000000000001427272030600220715ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/old/nacl/pnacl/1/000077500000000000000000000000001427272030600222315ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/old/nacl/pnacl/1/bli_axpyv_opt.c000066400000000000000000000115241427272030600252570ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #if PPAPI_RELEASE >= 36 typedef float v4sf __attribute__ ((vector_size(16))); inline v4sf v4sf_splat(float x) { return (v4sf) { x, x, x, x }; } inline v4sf v4sf_load(const float* a) { return *((const v4sf*)a); } inline v4sf v4sf_cload(const scomplex* a) { return *((const v4sf*)a); } inline void v4sf_store(float* a, v4sf x) { *((v4sf*)a) = x; } inline void v4sf_cstore(scomplex* a, v4sf x) { *((v4sf*)a) = x; } inline v4sf v4sf_zero() { return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f }; } #endif void bli_saxpyv_opt( conj_t conjx, dim_t n, float alpha[restrict static 1], float x[restrict static n], inc_t incx, float y[restrict static n], inc_t incy) { if (bli_zero_dim1(n)) { return; } if (bli_seq0(*alpha)) { return; } #if PPAPI_RELEASE >= 36 if (!bli_has_nonunit_inc2(incx, incy)) { const v4sf alphav = v4sf_splat(*alpha); while (n >= 4) { const v4sf xv = v4sf_load(x); v4sf yv = v4sf_load(y); yv += xv * alphav; v4sf_store(y, yv); x += 4; y += 4; n -= 4; } const float alphac = *alpha; while (n--) { (*y++) += (*x++) * alphac; } } #endif /* Just call the reference implementation. 
*/ BLIS_SAXPYV_KERNEL_REF( conjx, n, alpha, x, incx, y, incy); } void bli_caxpyv_opt( conj_t conjx, dim_t n, scomplex alpha[restrict static 1], scomplex x[restrict static n], inc_t incx, scomplex y[restrict static n], inc_t incy) { if (bli_zero_dim1(n)) { return; } if (bli_ceq0(*alpha)) { return; } #if PPAPI_RELEASE >= 36 if (!bli_has_nonunit_inc2(incx, incy)) { if (bli_is_noconj(conjx)) { const v4sf alphav0 = v4sf_splat(alpha->real); const v4sf alphav1 = (v4sf) { -alpha->imag, alpha->imag, -alpha->imag, alpha->imag }; while (n >= 2) { const v4sf xv0 = v4sf_cload(x); v4sf yv = v4sf_cload(y); const v4sf xv1 = __builtin_shufflevector(xv0, xv0, 1, 0, 3, 2); yv += xv0 * alphav0 + xv1 * alphav1; v4sf_cstore(y, yv); x += 2; y += 2; n -= 2; } const float alphar = alpha->real; const float alphai = alpha->imag; while (n--) { const float xr = x->real; const float xi = x->imag; const float yr = y->real; const float yi = y->imag; y->real = yr + xr * alphar - xi * alphai; y->imag = yi + xr * alphai + xi * alphar; x += 1; y += 1; } } else { const v4sf alphav0 = (v4sf) { alpha->real, -alpha->real, alpha->real, -alpha->real }; const v4sf alphav1 = v4sf_splat(alpha->imag); while (n >= 2) { const v4sf xv0 = v4sf_cload(x); v4sf yv = v4sf_cload(y); const v4sf xv1 = __builtin_shufflevector(xv0, xv0, 1, 0, 3, 2); yv += xv0 * alphav0 + xv1 * alphav1; v4sf_cstore(y, yv); x += 2; y += 2; n -= 2; } const float alphar = alpha->real; const float alphai = alpha->imag; while (n--) { const float xr = x->real; const float xi = x->imag; const float yr = y->real; const float yi = y->imag; y->real = yr + xr * alphar + xi * alphai; y->imag = yi + xr * alphai - xi * alphar; x += 1; y += 1; } } } #endif /* Just call the reference implementation. 
*/ BLIS_CAXPYV_KERNEL_REF( conjx, n, alpha, x, incx, y, incy); } cython-blis-0.9.1/blis/_src/kernels/old/nacl/pnacl/1/bli_dotv_opt.c000066400000000000000000000336141427272030600250700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #if PPAPI_RELEASE >= 36 typedef float v4sf __attribute__ ((vector_size(16))); inline v4sf v4sf_splat(float x) { return (v4sf) { x, x, x, x }; } inline v4sf v4sf_load(const float* a) { return *((const v4sf*)a); } inline v4sf v4sf_cload(const scomplex* a) { return *((const v4sf*)a); } inline void v4sf_store(float* a, v4sf x) { *((v4sf*)a) = x; } inline void v4sf_cstore(scomplex* a, v4sf x) { *((v4sf*)a) = x; } inline v4sf v4sf_zero() { return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f }; } #endif void bli_sdotv_opt( conj_t conjx, conj_t conjy, dim_t n, float x[restrict static n], inc_t incx, float y[restrict static n], inc_t incy, float rho[restrict static 1]) { #if PPAPI_RELEASE >= 36 // If the vector lengths are zero, set rho to zero and return. if (bli_zero_dim1(n)) { *rho = 0.0f; return; } // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if (bli_has_nonunit_inc2(incx, incy)) { float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f, sum5 = 0.0f; while (n >= 6) { sum0 += (*x) * (*y); x += incx; y += incy; sum1 += (*x) * (*y); x += incx; y += incy; sum2 += (*x) * (*y); x += incx; y += incy; sum3 += (*x) * (*y); x += incx; y += incy; sum4 += (*x) * (*y); x += incx; y += incy; sum5 += (*x) * (*y); x += incx; y += incy; n -= 6; } float sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5); while (n--) { sum += (*x) * (*y); x += incx; y += incy; } *rho = sum; } else { v4sf vsum0 = v4sf_zero(), vsum1 = v4sf_zero(), vsum2 = v4sf_zero(); v4sf vsum3 = v4sf_zero(), vsum4 = v4sf_zero(), vsum5 = v4sf_zero(); while (n >= 24) { vsum0 += v4sf_load(x) * v4sf_load(y); vsum1 += v4sf_load(x+4) * v4sf_load(y+4); vsum2 += v4sf_load(x+8) * v4sf_load(y+8); vsum3 += v4sf_load(x+12) * v4sf_load(y+12); vsum4 += v4sf_load(x+16) * v4sf_load(y+16); vsum5 += v4sf_load(x+20) * v4sf_load(y+20); x += 24; y += 24; n -= 24; } v4sf vsum = (vsum0 + vsum1 + vsum2) + (vsum3 + vsum4 + vsum5); while (n >= 
4) { vsum += v4sf_load(x) * v4sf_load(y); x += 4; y += 4; n -= 4; } float sum = (vsum[0] + vsum[1]) + (vsum[2] + vsum[3]); while (n--) { sum += (*x++) * (*y++); } *rho = sum; } #else float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f, sum5 = 0.0f; while (n >= 6) { sum0 += (*x) * (*y); x += incx; y += incy; sum1 += (*x) * (*y); x += incx; y += incy; sum2 += (*x) * (*y); x += incx; y += incy; sum3 += (*x) * (*y); x += incx; y += incy; sum4 += (*x) * (*y); x += incx; y += incy; sum5 += (*x) * (*y); x += incx; y += incy; n -= 6; } float sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5); while (n--) { sum += (*x) * (*y); x += incx; y += incy; } *rho = sum; #endif } void bli_ddotv_opt( conj_t conjx, conj_t conjy, dim_t n, double x[restrict static n], inc_t incx, double y[restrict static n], inc_t incy, double rho[restrict static 1]) { double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0; while (n >= 6) { sum0 += (*x) * (*y); x += incx; y += incy; sum1 += (*x) * (*y); x += incx; y += incy; sum2 += (*x) * (*y); x += incx; y += incy; sum3 += (*x) * (*y); x += incx; y += incy; sum4 += (*x) * (*y); x += incx; y += incy; sum5 += (*x) * (*y); x += incx; y += incy; n -= 6; } double sum = (sum0 + sum1 + sum2) + (sum3 + sum4 + sum5); while (n--) { sum += (*x) * (*y); x += incx; y += incy; } *rho = sum; } void bli_cdotv_opt( conj_t conjx, conj_t conjy, dim_t n, scomplex x[restrict static n], inc_t incx, scomplex y[restrict static n], inc_t incy, scomplex rho[restrict static 1]) { if (bli_is_conj(conjy)) { bli_toggle_conj(&conjx); } if (bli_zero_dim1(n)) { rho->real = 0.0f; rho->imag = 0.0f; return; } float sumr; float sumi; #if PPAPI_RELEASE >= 36 if (bli_is_noconj(conjx)) { if (bli_has_nonunit_inc2(incx, incy)) { float sum0r = 0.0f, sum1r = 0.0f; float sum0i = 0.0f, sum1i = 0.0f; while (n >= 2) { const float x0r = x->real; const float x0i = x->imag; const float y0r = y->real; const float y0i = y->imag; sum0r += x0r * y0r - x0i * y0i; 
sum0i += x0r * y0i + x0i * y0r; x += incx; y += incy; const float x1r = x->real; const float x1i = x->imag; const float y1r = y->real; const float y1i = y->imag; sum1r += x1r * y1r - x1i * y1i; sum1i += x1r * y1i + x1i * y1r; x += incx; y += incy; n -= 2; } sumr = sum0r + sum1r; sumi = sum0i + sum1i; } else { v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero(); v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero(); while (n >= 8) { const v4sf xv0t = v4sf_cload(x); const v4sf xv0b = v4sf_cload(x+2); const v4sf yv0t = v4sf_cload(y); const v4sf yv0b = v4sf_cload(y+2); const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6); const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7); const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6); const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7); sumv0r += xv0r * yv0r - xv0i * yv0i; sumv0i += xv0r * yv0i + xv0i * yv0r; const v4sf xv1t = v4sf_cload(x+4); const v4sf xv1b = v4sf_cload(x+6); const v4sf yv1t = v4sf_cload(y+4); const v4sf yv1b = v4sf_cload(y+6); const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6); const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7); const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6); const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7); sumv1r += xv1r * yv1r - xv1i * yv1i; sumv1i += xv1r * yv1i + xv1i * yv1r; x += 8; y += 8; n -= 8; } const v4sf sumvr = sumv0r + sumv1r; const v4sf sumvi = sumv0i + sumv1i; sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]); sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]); } while (n--) { const float xr = x->real; const float xi = x->imag; const float yr = y->real; const float yi = y->imag; sumr += xr * yr - xi * yi; sumi += xr * yi + xi * yr; x += incx; y += incy; } } else { if (bli_has_nonunit_inc2(incx, incy)) { float sum0r = 0.0f, sum1r = 0.0f; float sum0i = 0.0f, sum1i = 0.0f; while (n >= 2) { const float x0r = x->real; const float x0i = x->imag; const float y0r = 
y->real; const float y0i = y->imag; sum0r += x0r * y0r + x0i * y0i; sum0i += x0r * y0i - x0i * y0r; x += incx; y += incy; const float x1r = x->real; const float x1i = x->imag; const float y1r = y->real; const float y1i = y->imag; sum1r += x1r * y1r + x1i * y1i; sum1i += x1r * y1i - x1i * y1r; x += incx; y += incy; n -= 2; } sumr = sum0r + sum1r; sumi = sum0i + sum1i; } else { v4sf sumv0r = v4sf_zero(), sumv1r = v4sf_zero(); v4sf sumv0i = v4sf_zero(), sumv1i = v4sf_zero(); while (n >= 8) { const v4sf xv0t = v4sf_cload(x); const v4sf xv0b = v4sf_cload(x+2); const v4sf yv0t = v4sf_cload(y); const v4sf yv0b = v4sf_cload(y+2); const v4sf xv0r = __builtin_shufflevector(xv0t, xv0b, 0, 2, 4, 6); const v4sf xv0i = __builtin_shufflevector(xv0t, xv0b, 1, 3, 5, 7); const v4sf yv0r = __builtin_shufflevector(yv0t, yv0b, 0, 2, 4, 6); const v4sf yv0i = __builtin_shufflevector(yv0t, yv0b, 1, 3, 5, 7); sumv0r += xv0r * yv0r + xv0i * yv0i; sumv0i += xv0r * yv0i - xv0i * yv0r; const v4sf xv1t = v4sf_cload(x+4); const v4sf xv1b = v4sf_cload(x+6); const v4sf yv1t = v4sf_cload(y+4); const v4sf yv1b = v4sf_cload(y+6); const v4sf xv1r = __builtin_shufflevector(xv1t, xv1b, 0, 2, 4, 6); const v4sf xv1i = __builtin_shufflevector(xv1t, xv1b, 1, 3, 5, 7); const v4sf yv1r = __builtin_shufflevector(yv1t, yv1b, 0, 2, 4, 6); const v4sf yv1i = __builtin_shufflevector(yv1t, yv1b, 1, 3, 5, 7); sumv1r += xv1r * yv1r + xv1i * yv1i; sumv1i += xv1r * yv1i - xv1i * yv1r; x += 8; y += 8; n -= 8; } const v4sf sumvr = sumv0r + sumv1r; const v4sf sumvi = sumv0i + sumv1i; sumr = (sumvr[0] + sumvr[1]) + (sumvr[2] + sumvr[3]); sumi = (sumvi[0] + sumvi[1]) + (sumvi[2] + sumvi[3]); } while (n--) { const float xr = x->real; const float xi = x->imag; const float yr = y->real; const float yi = y->imag; sumr += xr * yr + xi * yi; sumi += xr * yi - xi * yr; x += incx; y += incy; } } #else if (bli_is_noconj(conjx)) { float sum0r = 0.0f, sum1r = 0.0f; float sum0i = 0.0f, sum1i = 0.0f; while (n >= 2) { const float x0r = 
x->real; const float x0i = x->imag; const float y0r = y->real; const float y0i = y->imag; sum0r += x0r * y0r - x0i * y0i; sum0i += x0r * y0i + x0i * y0r; x += incx; y += incy; const float x1r = x->real; const float x1i = x->imag; const float y1r = y->real; const float y1i = y->imag; sum1r += x1r * y1r - x1i * y1i; sum1i += x1r * y1i + x1i * y1r; x += incx; y += incy; n -= 2; } sumr = sum0r + sum1r; sumi = sum0i + sum1i; if (n != 0) { const float xr = x->real; const float xi = x->imag; const float yr = y->real; const float yi = y->imag; sumr += xr * yr - xi * yi; sumi += xr * yi + xi * yr; } } else { float sum0r = 0.0f, sum1r = 0.0f; float sum0i = 0.0f, sum1i = 0.0f; while (n >= 2) { const float x0r = x->real; const float x0i = x->imag; const float y0r = y->real; const float y0i = y->imag; sum0r += x0r * y0r + x0i * y0i; sum0i += x0r * y0i - x0i * y0r; x += incx; y += incy; const float x1r = x->real; const float x1i = x->imag; const float y1r = y->real; const float y1i = y->imag; sum1r += x1r * y1r + x1i * y1i; sum1i += x1r * y1i - x1i * y1r; x += incx; y += incy; n -= 2; } sumr = sum0r + sum1r; sumi = sum0i + sum1i; if (n != 0) { const float xr = x->real; const float xi = x->imag; const float yr = y->real; const float yi = y->imag; sumr += xr * yr + xi * yi; sumi += xr * yi - xi * yr; } } #endif rho->real = sumr; rho->imag = bli_is_conj(conjy) ? 
-sumi : sumi; } void bli_zdotv_opt( conj_t conjx, conj_t conjy, dim_t n, dcomplex x[restrict static n], inc_t incx, dcomplex y[restrict static n], inc_t incy, dcomplex rho[restrict static 1]) { if (bli_is_conj(conjy)) { bli_toggle_conj(&conjx); } if (bli_zero_dim1(n)) { rho->real = 0.0; rho->imag = 0.0; return; } double sumr; double sumi; if (bli_is_noconj(conjx)) { double sum0r = 0.0, sum1r = 0.0; double sum0i = 0.0, sum1i = 0.0; while (n >= 2) { const double x0r = x->real; const double x0i = x->imag; const double y0r = y->real; const double y0i = y->imag; sum0r += x0r * y0r - x0i * y0i; sum0i += x0r * y0i + x0i * y0r; x += incx; y += incy; const double x1r = x->real; const double x1i = x->imag; const double y1r = y->real; const double y1i = y->imag; sum1r += x1r * y1r - x1i * y1i; sum1i += x1r * y1i + x1i * y1r; x += incx; y += incy; n -= 2; } sumr = sum0r + sum1r; sumi = sum0i + sum1i; if (n != 0) { const double xr = x->real; const double xi = x->imag; const double yr = y->real; const double yi = y->imag; sumr += xr * yr - xi * yi; sumi += xr * yi + xi * yr; } } else { double sum0r = 0.0, sum1r = 0.0; double sum0i = 0.0, sum1i = 0.0; while (n >= 2) { const double x0r = x->real; const double x0i = x->imag; const double y0r = y->real; const double y0i = y->imag; sum0r += x0r * y0r + x0i * y0i; sum0i += x0r * y0i - x0i * y0r; x += incx; y += incy; const double x1r = x->real; const double x1i = x->imag; const double y1r = y->real; const double y1i = y->imag; sum1r += x1r * y1r + x1i * y1i; sum1i += x1r * y1i - x1i * y1r; x += incx; y += incy; n -= 2; } sumr = sum0r + sum1r; sumi = sum0i + sum1i; if (n != 0) { const double xr = x->real; const double xi = x->imag; const double yr = y->real; const double yi = y->imag; sumr += xr * yr + xi * yi; sumi += xr * yi - xi * yr; } } rho->real = sumr; rho->imag = bli_is_conj(conjy) ? 
-sumi : sumi; } cython-blis-0.9.1/blis/_src/kernels/old/nacl/pnacl/3/000077500000000000000000000000001427272030600222335ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/old/nacl/pnacl/3/bli_gemm_opt.c000066400000000000000000000335631427272030600250460ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #if PPAPI_RELEASE >= 36 typedef float v4sf __attribute__ ((vector_size(16))); inline v4sf v4sf_splat(float x) { return (v4sf) { x, x, x, x }; } inline v4sf v4sf_load(const float* a) { return *((const v4sf*)a); } inline v4sf v4sf_cload(const scomplex* a) { return *((const v4sf*)a); } inline void v4sf_store(float* a, v4sf x) { *((v4sf*)a) = x; } inline void v4sf_cstore(scomplex* a, v4sf x) { *((v4sf*)a) = x; } inline v4sf v4sf_zero() { return (v4sf) { 0.0f, 0.0f, 0.0f, 0.0f }; } void bli_sgemm_opt ( dim_t k, float alpha[restrict static 1], float a[restrict static 8*k], float b[restrict static k*4], float beta[restrict static 1], float c[restrict static 8*4], inc_t rs_c, inc_t cs_c, auxinfo_t* data, cntx_t* cntx ) { // Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0) v4sf abv0t = v4sf_zero(), abv1t = v4sf_zero(), abv2t = v4sf_zero(), abv3t = v4sf_zero(); v4sf abv0b = v4sf_zero(), abv1b = v4sf_zero(), abv2b = v4sf_zero(), abv3b = v4sf_zero(); for (dim_t i = 0; i < k; i += 1) { const v4sf avt = v4sf_load(a); const v4sf avb = v4sf_load(a+4); const v4sf bv_xxxx = v4sf_splat(b[0]); abv0t += avt * bv_xxxx; abv0b += avb * bv_xxxx; const v4sf bv_yyyy = v4sf_splat(b[1]); abv1t += avt * bv_yyyy; abv1b += avb * bv_yyyy; const v4sf bv_zzzz = v4sf_splat(b[2]); abv2t += avt * bv_zzzz; abv2b += avb * bv_zzzz; const v4sf bv_wwww = v4sf_splat(b[3]); abv3t += avt * bv_wwww; abv3b += avb * bv_wwww; a += 8; b += 4; } const v4sf alphav = v4sf_splat(*alpha); abv0t *= alphav; abv0b *= alphav; abv1t *= alphav; abv1b *= alphav; abv2t *= alphav; abv2b *= alphav; abv3t *= alphav; abv3b *= alphav; if (rs_c == 1) { v4sf cv0t = v4sf_load(&c[0*rs_c + 0*cs_c]); v4sf cv1t = v4sf_load(&c[0*rs_c + 1*cs_c]); v4sf cv2t = v4sf_load(&c[0*rs_c + 2*cs_c]); v4sf cv3t = v4sf_load(&c[0*rs_c + 3*cs_c]); v4sf cv0b = v4sf_load(&c[4*rs_c + 0*cs_c]); v4sf cv1b = v4sf_load(&c[4*rs_c + 1*cs_c]); v4sf cv2b = v4sf_load(&c[4*rs_c + 2*cs_c]); v4sf cv3b = v4sf_load(&c[4*rs_c + 3*cs_c]); 
const v4sf betav = v4sf_splat(*beta); cv0t = cv0t * betav + abv0t; cv1t = cv1t * betav + abv1t; cv2t = cv2t * betav + abv2t; cv3t = cv3t * betav + abv3t; cv0b = cv0b * betav + abv0b; cv1b = cv1b * betav + abv1b; cv2b = cv2b * betav + abv2b; cv3b = cv3b * betav + abv3b; v4sf_store(&c[0*rs_c + 0*cs_c], cv0t); v4sf_store(&c[0*rs_c + 1*cs_c], cv1t); v4sf_store(&c[0*rs_c + 2*cs_c], cv2t); v4sf_store(&c[0*rs_c + 3*cs_c], cv3t); v4sf_store(&c[4*rs_c + 0*cs_c], cv0b); v4sf_store(&c[4*rs_c + 1*cs_c], cv1b); v4sf_store(&c[4*rs_c + 2*cs_c], cv2b); v4sf_store(&c[4*rs_c + 3*cs_c], cv3b); } else { // Load columns 0, 1, 2, 3 (top part) v4sf cv0t = (v4sf){ c[0*rs_c + 0*cs_c], c[1*rs_c + 0*cs_c], c[2*rs_c + 0*cs_c], c[3*rs_c + 0*cs_c] }; v4sf cv1t = (v4sf){ c[0*rs_c + 1*cs_c], c[1*rs_c + 1*cs_c], c[2*rs_c + 1*cs_c], c[3*rs_c + 1*cs_c] }; v4sf cv2t = (v4sf){ c[0*rs_c + 2*cs_c], c[1*rs_c + 2*cs_c], c[2*rs_c + 2*cs_c], c[3*rs_c + 2*cs_c] }; v4sf cv3t = (v4sf){ c[0*rs_c + 3*cs_c], c[1*rs_c + 3*cs_c], c[2*rs_c + 3*cs_c], c[3*rs_c + 3*cs_c] }; // Load columns 0, 1, 2, 3 (bottom part) v4sf cv0b = (v4sf){ c[4*rs_c + 0*cs_c], c[5*rs_c + 0*cs_c], c[6*rs_c + 0*cs_c], c[7*rs_c + 0*cs_c] }; v4sf cv1b = (v4sf){ c[4*rs_c + 1*cs_c], c[5*rs_c + 1*cs_c], c[6*rs_c + 1*cs_c], c[7*rs_c + 1*cs_c] }; v4sf cv2b = (v4sf){ c[4*rs_c + 2*cs_c], c[5*rs_c + 2*cs_c], c[6*rs_c + 2*cs_c], c[7*rs_c + 2*cs_c] }; v4sf cv3b = (v4sf){ c[4*rs_c + 3*cs_c], c[5*rs_c + 3*cs_c], c[6*rs_c + 3*cs_c], c[7*rs_c + 3*cs_c] }; const v4sf betav = v4sf_splat(*beta); cv0t = cv0t * betav + abv0t; cv1t = cv1t * betav + abv1t; cv2t = cv2t * betav + abv2t; cv3t = cv3t * betav + abv3t; cv0b = cv0b * betav + abv0b; cv1b = cv1b * betav + abv1b; cv2b = cv2b * betav + abv2b; cv3b = cv3b * betav + abv3b; // Store column 0 c[0*rs_c + 0*cs_c] = cv0t[0]; c[1*rs_c + 0*cs_c] = cv0t[1]; c[2*rs_c + 0*cs_c] = cv0t[2]; c[3*rs_c + 0*cs_c] = cv0t[3]; c[4*rs_c + 0*cs_c] = cv0b[0]; c[5*rs_c + 0*cs_c] = cv0b[1]; c[6*rs_c + 0*cs_c] = cv0b[2]; c[7*rs_c + 
0*cs_c] = cv0b[3]; // Store column 1 c[0*rs_c + 1*cs_c] = cv1t[0]; c[1*rs_c + 1*cs_c] = cv1t[1]; c[2*rs_c + 1*cs_c] = cv1t[2]; c[3*rs_c + 1*cs_c] = cv1t[3]; c[4*rs_c + 1*cs_c] = cv1b[0]; c[5*rs_c + 1*cs_c] = cv1b[1]; c[6*rs_c + 1*cs_c] = cv1b[2]; c[7*rs_c + 1*cs_c] = cv1b[3]; // Store column 2 c[0*rs_c + 2*cs_c] = cv2t[0]; c[1*rs_c + 2*cs_c] = cv2t[1]; c[2*rs_c + 2*cs_c] = cv2t[2]; c[3*rs_c + 2*cs_c] = cv2t[3]; c[4*rs_c + 2*cs_c] = cv2b[0]; c[5*rs_c + 2*cs_c] = cv2b[1]; c[6*rs_c + 2*cs_c] = cv2b[2]; c[7*rs_c + 2*cs_c] = cv2b[3]; // Store column 3 c[0*rs_c + 3*cs_c] = cv3t[0]; c[1*rs_c + 3*cs_c] = cv3t[1]; c[2*rs_c + 3*cs_c] = cv3t[2]; c[3*rs_c + 3*cs_c] = cv3t[3]; c[4*rs_c + 3*cs_c] = cv3b[0]; c[5*rs_c + 3*cs_c] = cv3b[1]; c[6*rs_c + 3*cs_c] = cv3b[2]; c[7*rs_c + 3*cs_c] = cv3b[3]; } } void bli_cgemm_opt ( dim_t k, scomplex alpha[restrict static 1], scomplex a[restrict static 4*k], scomplex b[restrict static k*4], scomplex beta[restrict static 1], scomplex c[restrict static 4*4], inc_t rs_c, inc_t cs_c, auxinfo_t* data, cntx_t* cntx ) { // Vectors for accummulating column 0, 1, 2, 3 (initialize to 0.0) v4sf abv0r = v4sf_zero(), abv1r = v4sf_zero(), abv2r = v4sf_zero(), abv3r = v4sf_zero(); v4sf abv0i = v4sf_zero(), abv1i = v4sf_zero(), abv2i = v4sf_zero(), abv3i = v4sf_zero(); for (dim_t i = 0; i < k; i += 1) { const v4sf avt = v4sf_cload(a); const v4sf avb = v4sf_cload(a+2); const v4sf avr = __builtin_shufflevector(avt, avb, 0, 2, 4, 6); const v4sf avi = __builtin_shufflevector(avt, avb, 1, 3, 5, 7); const v4sf bv0r = v4sf_splat(b[0].real); const v4sf bv0i = v4sf_splat(b[0].imag); abv0r += avr * bv0r - avi * bv0i; abv0i += avr * bv0i + avi * bv0r; const v4sf bv1r = v4sf_splat(b[1].real); const v4sf bv1i = v4sf_splat(b[1].imag); abv1r += avr * bv1r - avi * bv1i; abv1i += avr * bv1i + avi * bv1r; const v4sf bv2r = v4sf_splat(b[2].real); const v4sf bv2i = v4sf_splat(b[2].imag); abv2r += avr * bv2r - avi * bv2i; abv2i += avr * bv2i + avi * bv2r; const v4sf bv3r = 
v4sf_splat(b[3].real); const v4sf bv3i = v4sf_splat(b[3].imag); abv3r += avr * bv3r - avi * bv3i; abv3i += avr * bv3i + avi * bv3r; a += 4; b += 4; } const v4sf alphavr = v4sf_splat(alpha->real); const v4sf alphavi = v4sf_splat(alpha->imag); v4sf temp; temp = abv0r * alphavr - abv0i * alphavi; abv0i = abv0r * alphavi + abv0i * alphavr; abv0r = temp; temp = abv1r * alphavr - abv1i * alphavi; abv1i = abv1r * alphavi + abv1i * alphavr; abv1r = temp; temp = abv2r * alphavr - abv2i * alphavi; abv2i = abv2r * alphavi + abv2i * alphavr; abv2r = temp; temp = abv3r * alphavr - abv3i * alphavi; abv3i = abv3r * alphavi + abv3i * alphavr; abv3r = temp; if (rs_c == 1) { const v4sf cv0t = v4sf_cload(&c[0*rs_c + 0*cs_c]); const v4sf cv1t = v4sf_cload(&c[0*rs_c + 1*cs_c]); const v4sf cv2t = v4sf_cload(&c[0*rs_c + 2*cs_c]); const v4sf cv3t = v4sf_cload(&c[0*rs_c + 3*cs_c]); const v4sf cv0b = v4sf_cload(&c[2*rs_c + 0*cs_c]); const v4sf cv1b = v4sf_cload(&c[2*rs_c + 1*cs_c]); const v4sf cv2b = v4sf_cload(&c[2*rs_c + 2*cs_c]); const v4sf cv3b = v4sf_cload(&c[2*rs_c + 3*cs_c]); v4sf cv0r = __builtin_shufflevector(cv0t, cv0b, 0, 2, 4, 6); v4sf cv0i = __builtin_shufflevector(cv0t, cv0b, 1, 3, 5, 7); v4sf cv1r = __builtin_shufflevector(cv1t, cv1b, 0, 2, 4, 6); v4sf cv1i = __builtin_shufflevector(cv1t, cv1b, 1, 3, 5, 7); v4sf cv2r = __builtin_shufflevector(cv2t, cv2b, 0, 2, 4, 6); v4sf cv2i = __builtin_shufflevector(cv2t, cv2b, 1, 3, 5, 7); v4sf cv3r = __builtin_shufflevector(cv3t, cv3b, 0, 2, 4, 6); v4sf cv3i = __builtin_shufflevector(cv3t, cv3b, 1, 3, 5, 7); const v4sf betavr = v4sf_splat(beta->real); const v4sf betavi = v4sf_splat(beta->imag); temp = abv0r + cv0r * betavr - cv0i * betavi; cv0i = abv0i + cv0r * betavi + cv0i * betavr; cv0r = temp; temp = abv1r + cv1r * betavr - cv1i * betavi; cv1i = abv1i + cv1r * betavi + cv1i * betavr; cv1r = temp; temp = abv2r + cv2r * betavr - cv2i * betavi; cv2i = abv2i + cv2r * betavi + cv2i * betavr; cv2r = temp; temp = abv3r + cv3r * betavr - 
cv3i * betavi; cv3i = abv3i + cv3r * betavi + cv3i * betavr; cv3r = temp; v4sf_cstore(&c[0*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 0, 4, 1, 5)); v4sf_cstore(&c[2*rs_c + 0*cs_c], __builtin_shufflevector(cv0r, cv0i, 2, 6, 3, 7)); v4sf_cstore(&c[0*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 0, 4, 1, 5)); v4sf_cstore(&c[2*rs_c + 1*cs_c], __builtin_shufflevector(cv1r, cv1i, 2, 6, 3, 7)); v4sf_cstore(&c[0*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 0, 4, 1, 5)); v4sf_cstore(&c[2*rs_c + 2*cs_c], __builtin_shufflevector(cv2r, cv2i, 2, 6, 3, 7)); v4sf_cstore(&c[0*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 0, 4, 1, 5)); v4sf_cstore(&c[2*rs_c + 3*cs_c], __builtin_shufflevector(cv3r, cv3i, 2, 6, 3, 7)); } else { // Load columns 0, 1, 2, 3 (real part) v4sf cv0r = (v4sf){ c[0*rs_c + 0*cs_c].real, c[1*rs_c + 0*cs_c].real, c[2*rs_c + 0*cs_c].real, c[3*rs_c + 0*cs_c].real }; v4sf cv1r = (v4sf){ c[0*rs_c + 1*cs_c].real, c[1*rs_c + 1*cs_c].real, c[2*rs_c + 1*cs_c].real, c[3*rs_c + 1*cs_c].real }; v4sf cv2r = (v4sf){ c[0*rs_c + 2*cs_c].real, c[1*rs_c + 2*cs_c].real, c[2*rs_c + 2*cs_c].real, c[3*rs_c + 2*cs_c].real }; v4sf cv3r = (v4sf){ c[0*rs_c + 3*cs_c].real, c[1*rs_c + 3*cs_c].real, c[2*rs_c + 3*cs_c].real, c[3*rs_c + 3*cs_c].real }; // Load columns 0, 1, 2, 3 (imaginary part) v4sf cv0i = (v4sf){ c[0*rs_c + 0*cs_c].imag, c[1*rs_c + 0*cs_c].imag, c[2*rs_c + 0*cs_c].imag, c[3*rs_c + 0*cs_c].imag }; v4sf cv1i = (v4sf){ c[0*rs_c + 1*cs_c].imag, c[1*rs_c + 1*cs_c].imag, c[2*rs_c + 1*cs_c].imag, c[3*rs_c + 1*cs_c].imag }; v4sf cv2i = (v4sf){ c[0*rs_c + 2*cs_c].imag, c[1*rs_c + 2*cs_c].imag, c[2*rs_c + 2*cs_c].imag, c[3*rs_c + 2*cs_c].imag }; v4sf cv3i = (v4sf){ c[0*rs_c + 3*cs_c].imag, c[1*rs_c + 3*cs_c].imag, c[2*rs_c + 3*cs_c].imag, c[3*rs_c + 3*cs_c].imag }; const v4sf betavr = v4sf_splat(beta->real); const v4sf betavi = v4sf_splat(beta->imag); temp = abv0r + cv0r * betavr - cv0i * betavi; cv0i = abv0i + cv0r * betavi + cv0i * betavr; cv0r = temp; 
temp = abv1r + cv1r * betavr - cv1i * betavi; cv1i = abv1i + cv1r * betavi + cv1i * betavr; cv1r = temp; temp = abv2r + cv2r * betavr - cv2i * betavi; cv2i = abv2i + cv2r * betavi + cv2i * betavr; cv2r = temp; temp = abv3r + cv3r * betavr - cv3i * betavi; cv3i = abv3i + cv3r * betavi + cv3i * betavr; cv3r = temp; // Store column 0 c[0*rs_c + 0*cs_c].real = cv0r[0]; c[0*rs_c + 0*cs_c].imag = cv0i[0]; c[1*rs_c + 0*cs_c].real = cv0r[1]; c[1*rs_c + 0*cs_c].imag = cv0i[1]; c[2*rs_c + 0*cs_c].real = cv0r[2]; c[2*rs_c + 0*cs_c].imag = cv0i[2]; c[3*rs_c + 0*cs_c].real = cv0r[3]; c[3*rs_c + 0*cs_c].imag = cv0i[3]; // Store column 1 c[0*rs_c + 1*cs_c].real = cv1r[0]; c[0*rs_c + 1*cs_c].imag = cv1i[0]; c[1*rs_c + 1*cs_c].real = cv1r[1]; c[1*rs_c + 1*cs_c].imag = cv1i[1]; c[2*rs_c + 1*cs_c].real = cv1r[2]; c[2*rs_c + 1*cs_c].imag = cv1i[2]; c[3*rs_c + 1*cs_c].real = cv1r[3]; c[3*rs_c + 1*cs_c].imag = cv1i[3]; // Store column 2 c[0*rs_c + 2*cs_c].real = cv2r[0]; c[0*rs_c + 2*cs_c].imag = cv2i[0]; c[1*rs_c + 2*cs_c].real = cv2r[1]; c[1*rs_c + 2*cs_c].imag = cv2i[1]; c[2*rs_c + 2*cs_c].real = cv2r[2]; c[2*rs_c + 2*cs_c].imag = cv2i[2]; c[3*rs_c + 2*cs_c].real = cv2r[3]; c[3*rs_c + 2*cs_c].imag = cv2i[3]; // Store column 3 c[0*rs_c + 3*cs_c].real = cv3r[0]; c[0*rs_c + 3*cs_c].imag = cv3i[0]; c[1*rs_c + 3*cs_c].real = cv3r[1]; c[1*rs_c + 3*cs_c].imag = cv3i[1]; c[2*rs_c + 3*cs_c].real = cv3r[2]; c[2*rs_c + 3*cs_c].imag = cv3i[2]; c[3*rs_c + 3*cs_c].real = cv3r[3]; c[3*rs_c + 3*cs_c].imag = cv3i[3]; } } #endif cython-blis-0.9.1/blis/_src/kernels/old/x86/000077500000000000000000000000001427272030600205045ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/old/x86/1m/000077500000000000000000000000001427272030600210215ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/old/x86/1m/bli_packm_2xk.c000066400000000000000000000242731427272030600237020ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_spackm_2xk( conj_t conja, dim_t n, void* beta, void* a, inc_t inca, inc_t lda, void* p ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_dpackm_2xk( conj_t conja, dim_t n, void* beta, void* a, inc_t inca, inc_t lda, void* p ) { double* restrict beta_cast = beta; double* restrict alpha1 = a; double* restrict pi1 = p; inc_t off1 = 1 * inca * sizeof(double); inc_t ldas = lda * sizeof(double); if ( bli_deq1( *beta_cast ) ) { dim_t n_iter = n / 4; dim_t n_left = n % 4; __asm__ volatile ( " \n\t" "movl %2, %%edi \n\t" // load a "movl %3, %%ebp \n\t" // load p " \n\t" "movl %4, %%eax \n\t" // load ldas "leal (%%edi,%%eax), %%edx \n\t" // load a + ldas "sall $1, %%eax \n\t" // ldas *= 2; " \n\t" "movl %5, %%ebx \n\t" // load off1 " \n\t" " \n\t" "movl %0, %%esi \n\t" "testl %%esi, %%esi \n\t" "je .DCONSIDERKLEFT \n\t" " \n\t" " \n\t" ".DLOOPKITER: \n\t" " \n\t" "addl $64, %%ebp \n\t" " \n\t" "movlpd (%%edi ), %%xmm0 \n\t" // iteration 0 "movhpd (%%edi,%%ebx, ), %%xmm0 \n\t" "addl %%eax, %%edi \n\t" "movapd %%xmm0, -8 * 8(%%ebp) \n\t" " \n\t" "movlpd (%%edx ), %%xmm1 \n\t" // iteration 1 "movhpd (%%edx,%%ebx, ), %%xmm1 \n\t" "addl %%eax, %%edx \n\t" "movapd %%xmm1, -6 * 8(%%ebp) \n\t" " \n\t" "movlpd (%%edi ), %%xmm2 \n\t" // iteration 2 "movhpd (%%edi,%%ebx, ), %%xmm2 \n\t" "addl %%eax, %%edi \n\t" "movapd %%xmm2, -4 * 8(%%ebp) \n\t" " \n\t" "movlpd (%%edx ), %%xmm3 \n\t" // iteration 3 "movhpd (%%edx,%%ebx, ), %%xmm3 \n\t" "addl %%eax, %%edx \n\t" "movapd %%xmm3, -2 * 8(%%ebp) \n\t" " \n\t" "decl %%esi \n\t" "jne .DLOOPKITER \n\t" " \n\t" " \n\t" " \n\t" ".DCONSIDERKLEFT: \n\t" " \n\t" "movl %1, %%esi \n\t" "testl %%esi, %%esi \n\t" "je .DDONE \n\t" " \n\t" " \n\t" " \n\t" ".DLOOPKLEFT: \n\t" " \n\t" "addl $16, %%ebp \n\t" " \n\t" "movlpd (%%edi ), %%xmm0 \n\t" "movhpd (%%edi,%%ebx, ), %%xmm0 \n\t" "addl %%eax, %%edi \n\t" "movapd %%xmm0, -2 * 8(%%ebp) \n\t" " \n\t" "decl %%esi \n\t" "jne .DLOOPKLEFT \n\t" " \n\t" " \n\t" " 
\n\t" ".DDONE: \n\t" " \n\t" : // output operands : // input operands "m" (n_iter), "m" (n_left), "m" (alpha1), "m" (pi1), "m" (ldas), "m" (off1) : // register clobber list "eax", "ebx", "ecx", "edx", "edi", "ebp", "esi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory" ); } else { dim_t n_iter = n / 4; dim_t n_left = n % 4; __asm__ volatile ( " \n\t" "movl %2, %%edi \n\t" // load a "movl %3, %%ebp \n\t" // load p " \n\t" "movl %4, %%eax \n\t" // load ldas "leal (%%edi,%%eax), %%edx \n\t" // load a + ldas "sall $1, %%eax \n\t" // ldas *= 2; " \n\t" "movl %5, %%ebx \n\t" // load off1 " \n\t" "movl %6, %%esi \n\t" // load beta "movddup (%%esi), %%xmm7 \n\t" // load and duplicate *beta " \n\t" "movl %0, %%esi \n\t" "testl %%esi, %%esi \n\t" "je .DCONSIDERKLEFT2 \n\t" " \n\t" " \n\t" ".DLOOPKITER2: \n\t" " \n\t" "addl $64, %%ebp \n\t" " \n\t" "movlpd (%%edi ), %%xmm0 \n\t" // iteration 0 "movhpd (%%edi,%%ebx, ), %%xmm0 \n\t" "mulpd %%xmm7, %%xmm0 \n\t" "addl %%eax, %%edi \n\t" "movapd %%xmm0, -8 * 8(%%ebp) \n\t" " \n\t" "movlpd (%%edx ), %%xmm1 \n\t" // iteration 1 "movhpd (%%edx,%%ebx, ), %%xmm1 \n\t" "mulpd %%xmm7, %%xmm1 \n\t" "addl %%eax, %%edx \n\t" "movapd %%xmm1, -6 * 8(%%ebp) \n\t" " \n\t" "movlpd (%%edi ), %%xmm2 \n\t" // iteration 2 "movhpd (%%edi,%%ebx, ), %%xmm2 \n\t" "mulpd %%xmm7, %%xmm2 \n\t" "addl %%eax, %%edi \n\t" "movapd %%xmm2, -4 * 8(%%ebp) \n\t" " \n\t" "movlpd (%%edx ), %%xmm3 \n\t" // iteration 3 "movhpd (%%edx,%%ebx, ), %%xmm3 \n\t" "mulpd %%xmm7, %%xmm3 \n\t" "addl %%eax, %%edx \n\t" "movapd %%xmm3, -2 * 8(%%ebp) \n\t" " \n\t" "decl %%esi \n\t" "jne .DLOOPKITER2 \n\t" " \n\t" " \n\t" " \n\t" ".DCONSIDERKLEFT2: \n\t" " \n\t" "movl %1, %%esi \n\t" "testl %%esi, %%esi \n\t" "je .DDONE2 \n\t" " \n\t" " \n\t" " \n\t" ".DLOOPKLEFT2: \n\t" " \n\t" "addl $16, %%ebp \n\t" " \n\t" "movlpd (%%edi ), %%xmm0 \n\t" "movhpd (%%edi,%%ebx, ), %%xmm0 \n\t" "mulpd %%xmm7, %%xmm0 \n\t" "addl %%eax, %%edi \n\t" "movapd %%xmm0, -2 * 8(%%ebp) 
\n\t" " \n\t" "decl %%esi \n\t" "jne .DLOOPKLEFT2 \n\t" " \n\t" " \n\t" " \n\t" ".DDONE2: \n\t" " \n\t" : // output operands : // input operands "m" (n_iter), "m" (n_left), "m" (alpha1), "m" (pi1), "m" (ldas), "m" (off1), "m" (beta) : // register clobber list "eax", "ebx", "ecx", "edx", "edi", "ebp", "esi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory" ); } } void bli_cpackm_2xk( conj_t conja, dim_t n, void* beta, void* a, inc_t inca, inc_t lda, void* p ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_zpackm_2xk( conj_t conja, dim_t n, void* beta, void* a, inc_t inca, inc_t lda, void* p ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } cython-blis-0.9.1/blis/_src/kernels/old/x86/1m/bli_packm_2xk.h000066400000000000000000000040101427272030600236720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ conj_t conja, \ dim_t n, \ void* beta, \ void* a, inc_t inca, inc_t lda, \ void* p \ ); INSERT_GENTPROT_BASIC( packm_2xk ) cython-blis-0.9.1/blis/_src/kernels/old/x86/1m/bli_packm_4xk.c000066400000000000000000000261621427272030600237030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_spackm_4xk( conj_t conja, dim_t n, void* beta, void* a, inc_t inca, inc_t lda, void* p ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_dpackm_4xk( conj_t conja, dim_t n, void* beta, void* a, inc_t inca, inc_t lda, void* p ) { double* restrict beta_cast = beta; double* restrict alpha1 = a; double* restrict pi1 = p; inc_t off1 = 1 * inca * sizeof(double); inc_t off3 = 3 * inca * sizeof(double); inc_t ldas = lda * sizeof(double); if ( bli_deq1( *beta_cast ) ) { dim_t n_iter = n / 4; dim_t n_left = n % 4; __asm__ volatile ( " \n\t" //"movapd 4096(%%ebp), %%xmm7 \n\t" //"movapd %%xmm7, 4096(%%ebp) \n\t" " \n\t" "movl %2, %%edi \n\t" // load a "movl %3, %%ebp \n\t" // load p " \n\t" "movl %4, %%eax \n\t" // load ldas "leal (%%edi,%%eax), %%edx \n\t" // load a + ldas "sall $1, %%eax \n\t" // ldas *= 2; " \n\t" "movl %5, %%ebx \n\t" // load off1 "movl %6, %%ecx \n\t" // load off3 " \n\t" "movl %0, %%esi \n\t" "testl %%esi, %%esi \n\t" "je .DCONSIDERKLEFT \n\t" " \n\t" " \n\t" " \n\t" ".DLOOPKITER: \n\t" " \n\t" "addl $128, %%ebp \n\t" " \n\t" "movlpd (%%edi ), %%xmm0 \n\t" // iteration 0 "movhpd (%%edi,%%ebx, ), %%xmm0 \n\t" "movlpd (%%edi,%%ebx,2), %%xmm1 \n\t" "movhpd (%%edi,%%ecx, ), %%xmm1 \n\t" "addl %%eax, %%edi \n\t" "movapd %%xmm0, -16 * 8(%%ebp) \n\t" "movapd %%xmm1, -14 * 8(%%ebp) \n\t" " \n\t" "movlpd (%%edx ), %%xmm2 \n\t" // iteration 1 "movhpd (%%edx,%%ebx, ), %%xmm2 \n\t" "movlpd (%%edx,%%ebx,2), %%xmm3 
\n\t" "movhpd (%%edx,%%ecx, ), %%xmm3 \n\t" "addl %%eax, %%edx \n\t" "movapd %%xmm2, -12 * 8(%%ebp) \n\t" "movapd %%xmm3, -10 * 8(%%ebp) \n\t" " \n\t" "movlpd (%%edi ), %%xmm4 \n\t" // iteration 2 "movhpd (%%edi,%%ebx, ), %%xmm4 \n\t" "movlpd (%%edi,%%ebx,2), %%xmm5 \n\t" "movhpd (%%edi,%%ecx, ), %%xmm5 \n\t" "addl %%eax, %%edi \n\t" "movapd %%xmm4, -8 * 8(%%ebp) \n\t" "movapd %%xmm5, -6 * 8(%%ebp) \n\t" " \n\t" "movlpd (%%edx ), %%xmm6 \n\t" // iteration 3 "movhpd (%%edx,%%ebx, ), %%xmm6 \n\t" "movlpd (%%edx,%%ebx,2), %%xmm7 \n\t" "movhpd (%%edx,%%ecx, ), %%xmm7 \n\t" "addl %%eax, %%edx \n\t" "movapd %%xmm6, -4 * 8(%%ebp) \n\t" "movapd %%xmm7, -2 * 8(%%ebp) \n\t" " \n\t" "decl %%esi \n\t" "jne .DLOOPKITER \n\t" " \n\t" " \n\t" " \n\t" ".DCONSIDERKLEFT: \n\t" " \n\t" "movl %1, %%esi \n\t" "testl %%esi, %%esi \n\t" "je .DDONE \n\t" " \n\t" " \n\t" " \n\t" ".DLOOPKLEFT: \n\t" " \n\t" "addl $32, %%ebp \n\t" " \n\t" "movlpd (%%edi ), %%xmm0 \n\t" "movhpd (%%edi,%%ebx, ), %%xmm0 \n\t" "movlpd (%%edi,%%ebx,2), %%xmm1 \n\t" "movhpd (%%edi,%%ecx, ), %%xmm1 \n\t" "addl %%eax, %%edi \n\t" "movapd %%xmm0, -4 * 8(%%ebp) \n\t" "movapd %%xmm1, -2 * 8(%%ebp) \n\t" " \n\t" "decl %%esi \n\t" "jne .DLOOPKLEFT \n\t" " \n\t" " \n\t" " \n\t" ".DDONE: \n\t" " \n\t" : // output operands : // input operands "m" (n_iter), "m" (n_left), "m" (alpha1), "m" (pi1), "m" (ldas), "m" (off1), "m" (off3) : // register clobber list "eax", "ebx", "ecx", "edx", "edi", "ebp", "esi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory" ); } else { dim_t n_iter = n / 2; dim_t n_left = n % 2; __asm__ volatile ( " \n\t" "movl %2, %%edi \n\t" // load a "movl %3, %%ebp \n\t" // load p " \n\t" "movl %4, %%eax \n\t" // load ldas "leal (%%edi,%%eax), %%edx \n\t" // load a + ldas "sall $1, %%eax \n\t" // ldas *= 2; " \n\t" "movl %5, %%ebx \n\t" // load off1 "movl %6, %%ecx \n\t" // load off3 " \n\t" "movl %7, %%esi \n\t" // load beta "movddup (%%esi), %%xmm7 \n\t" // load and duplicate *beta " 
\n\t" "movl %0, %%esi \n\t" "testl %%esi, %%esi \n\t" "je .DCONSIDERKLEFT2 \n\t" " \n\t" " \n\t" ".DLOOPKITER2: \n\t" " \n\t" "addl $64, %%ebp \n\t" " \n\t" "movlpd (%%edi ), %%xmm0 \n\t" // iteration 0 "movhpd (%%edi,%%ebx, ), %%xmm0 \n\t" "movlpd (%%edi,%%ebx,2), %%xmm1 \n\t" "movhpd (%%edi,%%ecx, ), %%xmm1 \n\t" "mulpd %%xmm7, %%xmm0 \n\t" "mulpd %%xmm7, %%xmm1 \n\t" "addl %%eax, %%edi \n\t" "movapd %%xmm0, -8 * 8(%%ebp) \n\t" "movapd %%xmm1, -6 * 8(%%ebp) \n\t" " \n\t" "movlpd (%%edx ), %%xmm2 \n\t" // iteration 1 "movhpd (%%edx,%%ebx, ), %%xmm2 \n\t" "movlpd (%%edx,%%ebx,2), %%xmm3 \n\t" "movhpd (%%edx,%%ecx, ), %%xmm3 \n\t" "mulpd %%xmm7, %%xmm2 \n\t" "mulpd %%xmm7, %%xmm3 \n\t" "addl %%eax, %%edx \n\t" "movapd %%xmm2, -4 * 8(%%ebp) \n\t" "movapd %%xmm3, -2 * 8(%%ebp) \n\t" " \n\t" "decl %%esi \n\t" "jne .DLOOPKITER2 \n\t" " \n\t" " \n\t" " \n\t" ".DCONSIDERKLEFT2: \n\t" " \n\t" "movl %1, %%esi \n\t" "testl %%esi, %%esi \n\t" "je .DDONE2 \n\t" " \n\t" " \n\t" " \n\t" ".DLOOPKLEFT2: \n\t" " \n\t" "addl $32, %%ebp \n\t" " \n\t" "movlpd (%%edi ), %%xmm0 \n\t" "movhpd (%%edi,%%ebx, ), %%xmm0 \n\t" "movlpd (%%edi,%%ebx,2), %%xmm1 \n\t" "movhpd (%%edi,%%ecx, ), %%xmm1 \n\t" "mulpd %%xmm7, %%xmm0 \n\t" "mulpd %%xmm7, %%xmm1 \n\t" "addl %%eax, %%edi \n\t" "movapd %%xmm0, -4 * 8(%%ebp) \n\t" "movapd %%xmm1, -2 * 8(%%ebp) \n\t" " \n\t" "decl %%esi \n\t" "jne .DLOOPKLEFT2 \n\t" " \n\t" " \n\t" " \n\t" ".DDONE2: \n\t" " \n\t" : // output operands : // input operands "m" (n_iter), "m" (n_left), "m" (alpha1), "m" (pi1), "m" (ldas), "m" (off1), "m" (off3), "m" (beta) : // register clobber list "eax", "ebx", "ecx", "edx", "edi", "ebp", "esi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory" ); } } void bli_cpackm_4xk( conj_t conja, dim_t n, void* beta, void* a, inc_t inca, inc_t lda, void* p ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_zpackm_4xk( conj_t conja, dim_t n, void* beta, void* a, inc_t inca, inc_t lda, void* p ) { 
bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } cython-blis-0.9.1/blis/_src/kernels/old/x86/1m/bli_packm_4xk.h000066400000000000000000000040101427272030600236740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #undef GENTPROT #define GENTPROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname)( \ conj_t conja, \ dim_t n, \ void* beta, \ void* a, inc_t inca, inc_t lda, \ void* p \ ); INSERT_GENTPROT_BASIC( packm_4xk ) cython-blis-0.9.1/blis/_src/kernels/old/x86/3/000077500000000000000000000000001427272030600206465ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/old/x86/3/bli_gemm_opt_d2x4.c000066400000000000000000000407361427272030600243220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_sgemm_opt_d2x4( dim_t k, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c, inc_t cs_c ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_dgemm_opt_d2x4( dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c ) { dim_t k_iter; dim_t k_left; k_iter = k / 8; k_left = k % 8; __asm__ volatile ( " \n\t" "movl %6, %%ecx \n\t" // load address of c " \n\t" "movl %8, %%edi \n\t" // load cs_c "sall $3, %%edi \n\t" // cs_c *= sizeof(double) " \n\t" "leal (%%ecx,%%edi,2), %%edx \n\t" // load address of c + 2*cs_c " \n\t" "prefetcht0 (%%ecx) \n\t" // give a T0 prefetch hint for c00. "prefetcht0 (%%ecx,%%edi) \n\t" // give a T0 prefetch hint for c01. "prefetcht0 (%%edx) \n\t" // give a T0 prefetch hint for c02. "prefetcht0 (%%edx,%%edi) \n\t" // give a T0 prefetch hint for c03. " \n\t" "movl %2, %%eax \n\t" // load address of a. "movl %3, %%ebx \n\t" // load address of b. " \n\t" "addl $8 * 16, %%eax \n\t" // increment pointers to allow byte "addl $8 * 16, %%ebx \n\t" // offsets in the unrolled iterations. " \n\t" "movapd -8 * 16(%%eax), %%xmm0 \n\t" // initialize loop by pre-loading elements "movapd -8 * 16(%%ebx), %%xmm1 \n\t" // of a and b. 
" \n\t" "pxor %%xmm2, %%xmm2 \n\t" "pxor %%xmm3, %%xmm3 \n\t" " \n\t" "pxor %%xmm4, %%xmm4 \n\t" "pxor %%xmm5, %%xmm5 \n\t" "pxor %%xmm6, %%xmm6 \n\t" "pxor %%xmm7, %%xmm7 \n\t" " \n\t" " \n\t" " \n\t" "movl %0, %%esi \n\t" // i = k_iter; "testl %%esi, %%esi \n\t" // check i via logical AND. "je .CONSIDERKLEFT \n\t" // if i == 0, jump to code that " \n\t" // contains the k_left loop. " \n\t" " \n\t" ".LOOPKITER: \n\t" // MAIN LOOP " \n\t" "prefetcht0 (8*21+4)*8(%%eax) \n\t" " \n\t" " \n\t" "addpd %%xmm3, %%xmm7 \n\t" // iteration 0 "movapd -7 * 16(%%ebx), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "pshufd $0x4e, %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" " \n\t" "addpd %%xmm1, %%xmm5 \n\t" "movapd -6 * 16(%%ebx), %%xmm1 \n\t" "addpd %%xmm2, %%xmm4 \n\t" "pshufd $0x4e, %%xmm3, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm3 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "movapd -7 * 16(%%eax), %%xmm0 \n\t" " \n\t" " \n\t" "addpd %%xmm3, %%xmm7 \n\t" // iteration 1 "movapd -5 * 16(%%ebx), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "pshufd $0x4e, %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" " \n\t" "addpd %%xmm1, %%xmm5 \n\t" "movapd -4 * 16(%%ebx), %%xmm1 \n\t" "addpd %%xmm2, %%xmm4 \n\t" "pshufd $0x4e, %%xmm3, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm3 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "movapd -6 * 16(%%eax), %%xmm0 \n\t" " \n\t" " \n\t" "addpd %%xmm3, %%xmm7 \n\t" // iteration 2 "movapd -3 * 16(%%ebx), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "pshufd $0x4e, %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" " \n\t" "addpd %%xmm1, %%xmm5 \n\t" "movapd -2 * 16(%%ebx), %%xmm1 \n\t" "addpd %%xmm2, %%xmm4 \n\t" "pshufd $0x4e, %%xmm3, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm3 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "movapd -5 * 16(%%eax), %%xmm0 \n\t" " \n\t" " \n\t" "addpd %%xmm3, %%xmm7 \n\t" // iteration 3 "movapd -1 * 16(%%ebx), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "pshufd $0x4e, %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 
\n\t" "mulpd %%xmm0, %%xmm2 \n\t" " \n\t" "addpd %%xmm1, %%xmm5 \n\t" "movapd 0 * 16(%%ebx), %%xmm1 \n\t" "addpd %%xmm2, %%xmm4 \n\t" "pshufd $0x4e, %%xmm3, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm3 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "movapd -4 * 16(%%eax), %%xmm0 \n\t" " \n\t" " \n\t" "prefetcht0 (8*21+12)*8(%%eax) \n\t" " \n\t" " \n\t" "addpd %%xmm3, %%xmm7 \n\t" // iteration 4 "movapd 1 * 16(%%ebx), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "pshufd $0x4e, %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" " \n\t" "addpd %%xmm1, %%xmm5 \n\t" "movapd 2 * 16(%%ebx), %%xmm1 \n\t" "addpd %%xmm2, %%xmm4 \n\t" "pshufd $0x4e, %%xmm3, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm3 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "movapd -3 * 16(%%eax), %%xmm0 \n\t" " \n\t" " \n\t" "addpd %%xmm3, %%xmm7 \n\t" // iteration 5 "movapd 3 * 16(%%ebx), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "pshufd $0x4e, %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" " \n\t" "addpd %%xmm1, %%xmm5 \n\t" "movapd 4 * 16(%%ebx), %%xmm1 \n\t" "addpd %%xmm2, %%xmm4 \n\t" "pshufd $0x4e, %%xmm3, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm3 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "movapd -2 * 16(%%eax), %%xmm0 \n\t" " \n\t" " \n\t" "addpd %%xmm3, %%xmm7 \n\t" // iteration 6 "movapd 5 * 16(%%ebx), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "pshufd $0x4e, %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" " \n\t" "addpd %%xmm1, %%xmm5 \n\t" "movapd 6 * 16(%%ebx), %%xmm1 \n\t" "addpd %%xmm2, %%xmm4 \n\t" "pshufd $0x4e, %%xmm3, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm3 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "movapd -1 * 16(%%eax), %%xmm0 \n\t" " \n\t" " \n\t" "addpd %%xmm3, %%xmm7 \n\t" // iteration 7 "movapd 7 * 16(%%ebx), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "pshufd $0x4e, %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" " \n\t" "addpd %%xmm1, %%xmm5 \n\t" "movapd 8 * 16(%%ebx), %%xmm1 \n\t" "addpd %%xmm2, %%xmm4 \n\t" "pshufd $0x4e, %%xmm3, %%xmm2 \n\t" 
"mulpd %%xmm0, %%xmm3 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "movapd 0 * 16(%%eax), %%xmm0 \n\t" " \n\t" "addl $256, %%ebx \n\t" // b += 8*2*2 (unroll x nr x ndup) "addl $128, %%eax \n\t" // a += 8*4 (unroll x mr) " \n\t" " \n\t" "decl %%esi \n\t" // i -= 1; "jne .LOOPKITER \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" ".CONSIDERKLEFT: \n\t" " \n\t" "movl %1, %%esi \n\t" // i = k_left; "testl %%esi, %%esi \n\t" // check i via logical AND. "je .POSTACCUM \n\t" // if i == 0, we're done; jump to end. " \n\t" // else, we prepare to enter k_left loop. " \n\t" " \n\t" ".LOOPKLEFT: \n\t" // EDGE LOOP " \n\t" "addpd %%xmm3, %%xmm7 \n\t" // iteration i "movapd -7 * 16(%%ebx), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "pshufd $0x4e, %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" " \n\t" "addpd %%xmm1, %%xmm5 \n\t" "movapd -6 * 16(%%ebx), %%xmm1 \n\t" "addpd %%xmm2, %%xmm4 \n\t" "pshufd $0x4e, %%xmm3, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm3 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "movapd -7 * 16(%%eax), %%xmm0 \n\t" " \n\t" " \n\t" " \n\t" " \n\t" "addl $32, %%ebx \n\t" // b += 4 (1 x mr) "addl $16, %%eax \n\t" // a += 2*2 (1 x nr x ndup) " \n\t" "decl %%esi \n\t" // i -= 1; "jne .LOOPKLEFT \n\t" // iterate again if i != 0. 
" \n\t" " \n\t" " \n\t" ".POSTACCUM: \n\t" " \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm3, %%xmm7 \n\t" " \n\t" " \n\t" "movl %4, %%eax \n\t" // load address of alpha "movl %5, %%ebx \n\t" // load address of beta "movddup (%%eax), %%xmm2 \n\t" // load alpha and duplicate "movddup (%%ebx), %%xmm3 \n\t" // load beta and duplicate " \n\t" " \n\t" " \n\t" "movl %7, %%esi \n\t" // load rs_c "sall $3, %%esi \n\t" // rs_c *= sizeof(double) " \n\t" " \n\t" "movapd %%xmm4, %%xmm0 \n\t" "movsd %%xmm5, %%xmm4 \n\t" "movsd %%xmm0, %%xmm5 \n\t" " \n\t" "movapd %%xmm6, %%xmm0 \n\t" "movsd %%xmm7, %%xmm6 \n\t" "movsd %%xmm0, %%xmm7 \n\t" " \n\t" " \n\t" " \n\t" "movlpd (%%ecx), %%xmm0 \n\t" // load c00 and c10, "movhpd (%%ecx,%%esi), %%xmm0 \n\t" "mulpd %%xmm2, %%xmm4 \n\t" // scale by alpha, "mulpd %%xmm3, %%xmm0 \n\t" // scale by beta, "addpd %%xmm4, %%xmm0 \n\t" // add the gemm result, "movlpd %%xmm0, (%%ecx) \n\t" // and store back to memory. "movhpd %%xmm0, (%%ecx,%%esi) \n\t" "addl %%edi, %%ecx \n\t" " \n\t" "movlpd (%%edx), %%xmm1 \n\t" // load c02 and c12, "movhpd (%%edx,%%esi), %%xmm1 \n\t" "mulpd %%xmm2, %%xmm6 \n\t" // scale by alpha, "mulpd %%xmm3, %%xmm1 \n\t" // scale by beta, "addpd %%xmm6, %%xmm1 \n\t" // add the gemm result, "movlpd %%xmm1, (%%edx) \n\t" // and store back to memory. "movhpd %%xmm1, (%%edx,%%esi) \n\t" "addl %%edi, %%edx \n\t" " \n\t" "movlpd (%%ecx), %%xmm0 \n\t" // load c01 and c11, "movhpd (%%ecx,%%esi), %%xmm0 \n\t" "mulpd %%xmm2, %%xmm5 \n\t" // scale by alpha, "mulpd %%xmm3, %%xmm0 \n\t" // scale by beta, "addpd %%xmm5, %%xmm0 \n\t" // add the gemm result, "movlpd %%xmm0, (%%ecx) \n\t" // and store back to memory. 
"movhpd %%xmm0, (%%ecx,%%esi) \n\t" " \n\t" "movlpd (%%edx), %%xmm1 \n\t" // load c03 and c13, "movhpd (%%edx,%%esi), %%xmm1 \n\t" "mulpd %%xmm2, %%xmm7 \n\t" // scale by alpha, "mulpd %%xmm3, %%xmm1 \n\t" // scale by beta, "addpd %%xmm7, %%xmm1 \n\t" // add the gemm result, "movlpd %%xmm1, (%%edx) \n\t" // and store back to memory. "movhpd %%xmm1, (%%edx,%%esi) \n\t" " \n\t" " \n\t" " \n\t" : // output operands (none) : // input operands "m" (k_iter), "m" (k_left), "m" (a), "m" (b), "m" (alpha), "m" (beta), "m" (c), "m" (rs_c), "m" (cs_c) : // register clobber list "eax", "ebx", "ecx", "edx", "esi", "edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory" ); } void bli_cgemm_opt_d2x4( dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_zgemm_opt_d2x4( dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c, inc_t cs_c ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } cython-blis-0.9.1/blis/_src/kernels/old/x86/3/bli_gemm_opt_d4x2.c000066400000000000000000000356511427272030600243220ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" /* Single-precision real placeholder: no optimized kernel is provided; the body only reports BLIS_NOT_YET_IMPLEMENTED through bli_check_error_code(). */ void bli_sgemm_opt_d4x2( dim_t k, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c, inc_t cs_c, float* restrict a_next, float* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } /* Double-precision 4x2 gemm micro-kernel written as 32-bit x86 SSE inline assembly. It accumulates the 4x2 product of a and b over k (k / 8 passes of an 8x-unrolled loop followed by k % 8 single steps) in xmm4-xmm7, then updates c in place as c := beta * c + alpha * (a * b), addressing c through rs_c and cs_c (both scaled by sizeof(double) inside the asm). b is consumed as two 16-byte vectors per k step (see the "nr x ndup" pointer increments). a and b are read with movapd, which requires 16-byte aligned addresses. a_next and b_next are not referenced by this kernel. */ void bli_dgemm_opt_d4x2( dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, double* restrict a_next, double* restrict b_next ) { dim_t k_iter; dim_t k_left; k_iter = k / 8; k_left = k % 8; __asm__ volatile ( " \n\t" "movl %6, %%ecx \n\t" // load address of c " \n\t" "movl %8, %%edi \n\t" // load cs_c "sall $3, %%edi \n\t" // cs_c *= sizeof(double) " \n\t" "prefetcht0 (%%ecx) \n\t" // give a T0 prefetch hint for c00. "prefetcht0 (%%ecx,%%edi) \n\t" // give a T0 prefetch hint for c01. " \n\t" "movl %2, %%eax \n\t" // load address of a. "movl %3, %%ebx \n\t" // load address of b.
" \n\t" "addl $8 * 16, %%eax \n\t" // increment pointers to allow byte "addl $8 * 16, %%ebx \n\t" // offsets in the unrolled iterations. " \n\t" "movapd -8 * 16(%%eax), %%xmm0 \n\t" // initialize loop by pre-loading elements "movapd -4 * 16(%%eax), %%xmm3 \n\t" // of a. " \n\t" "pxor %%xmm4, %%xmm4 \n\t" "pxor %%xmm5, %%xmm5 \n\t" "pxor %%xmm6, %%xmm6 \n\t" "pxor %%xmm7, %%xmm7 \n\t" " \n\t" " \n\t" " \n\t" "movl %0, %%esi \n\t" // i = k_iter; "testl %%esi, %%esi \n\t" // check i via logical AND. "je .CONSIDERKLEFT \n\t" // if i == 0, jump to code that " \n\t" // contains the k_left loop. " \n\t" " \n\t" ".LOOPKITER: \n\t" // MAIN LOOP " \n\t" "movapd -8 * 16(%%ebx), %%xmm1 \n\t" // iteration 0 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd -7 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm0 \n\t" "addpd %%xmm0, %%xmm5 \n\t" "movapd -7 * 16(%%eax), %%xmm0 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "movapd -6 * 16(%%eax), %%xmm0 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd -6 * 16(%%ebx), %%xmm1 \n\t" // iteration 1 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd -5 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm0 \n\t" "addpd %%xmm0, %%xmm5 \n\t" "movapd -5 * 16(%%eax), %%xmm0 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "movapd 0 * 16(%%eax), %%xmm0 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd -4 * 16(%%ebx), %%xmm1 \n\t" // iteration 2 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd -3 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm3 \n\t" "addpd %%xmm3, %%xmm5 \n\t" "movapd -3 * 16(%%eax), %%xmm3 \n\t" "mulpd %%xmm3, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "movapd -2 * 16(%%eax), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd -2 * 16(%%ebx), %%xmm1 \n\t" // iteration 3 "movapd %%xmm1, %%xmm2
\n\t" "mulpd %%xmm3, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd -1 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm3 \n\t" "addpd %%xmm3, %%xmm5 \n\t" "movapd -1 * 16(%%eax), %%xmm3 \n\t" "mulpd %%xmm3, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "movapd 4 * 16(%%eax), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd 0 * 16(%%ebx), %%xmm1 \n\t" // iteration 4 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd 1 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm0 \n\t" "addpd %%xmm0, %%xmm5 \n\t" "movapd 1 * 16(%%eax), %%xmm0 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "movapd 2 * 16(%%eax), %%xmm0 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd 2 * 16(%%ebx), %%xmm1 \n\t" // iteration 5 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd 3 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm0 \n\t" "addpd %%xmm0, %%xmm5 \n\t" "movapd 3 * 16(%%eax), %%xmm0 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "movapd 8 * 16(%%eax), %%xmm0 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd 4 * 16(%%ebx), %%xmm1 \n\t" // iteration 6 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd 5 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm3 \n\t" "addpd %%xmm3, %%xmm5 \n\t" "movapd 5 * 16(%%eax), %%xmm3 \n\t" "mulpd %%xmm3, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "movapd 6 * 16(%%eax), %%xmm3 \n\t" "addl $8 * 4 * 8, %%eax \n\t" // a += 8*4 (unroll x mr) "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd 6 * 16(%%ebx), %%xmm1 \n\t" // iteration 7 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd 7 * 16(%%ebx), %%xmm1 \n\t" "addl $8 * 2 * 2 * 8, %%ebx \n\t" // b += 8*2*2 (unroll x nr x ndup) "mulpd %%xmm1, %%xmm3 \n\t" "addpd %%xmm3, %%xmm5 \n\t" "movapd -9 * 16(%%eax), %%xmm3 \n\t"
"mulpd %%xmm3, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "decl %%esi \n\t" // i -= 1; "movapd -4 * 16(%%eax), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "jne .LOOPKITER \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" ".CONSIDERKLEFT: \n\t" " \n\t" "movl %1, %%esi \n\t" // i = k_left; "testl %%esi, %%esi \n\t" // check i via logical AND. "je .POSTACCUM \n\t" // if i == 0, we're done; jump to end. " \n\t" // else, we prepare to enter k_left loop. " \n\t" " \n\t" ".LOOPKLEFT: \n\t" // EDGE LOOP " \n\t" "movapd -8 * 16(%%ebx), %%xmm1 \n\t" // iteration i "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd -7 * 16(%%ebx), %%xmm1 \n\t" "addl $1 * 2 * 2 * 8, %%ebx \n\t" // b += 2*2 (1 x nr x ndup) "mulpd %%xmm1, %%xmm0 \n\t" "addpd %%xmm0, %%xmm5 \n\t" "movapd -7 * 16(%%eax), %%xmm0 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "movapd -6 * 16(%%eax), %%xmm0 \n\t" "addl $1 * 4 * 8, %%eax \n\t" // a += 4 (1 x mr) "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "decl %%esi \n\t" // i -= 1; "jne .LOOPKLEFT \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" ".POSTACCUM: \n\t" " \n\t" "movl %4, %%eax \n\t" // load address of alpha "movl %5, %%ebx \n\t" // load address of beta "movddup (%%eax), %%xmm2 \n\t" // load alpha and duplicate "movddup (%%ebx), %%xmm3 \n\t" // load beta and duplicate " \n\t" " \n\t" " \n\t" "movl %7, %%esi \n\t" // load rs_c "sall $3, %%esi \n\t" // rs_c *= sizeof(double) " \n\t" "leal (%%ecx,%%esi,2), %%edx \n\t" // load address of c + 2*rs_c; " \n\t" " \n\t" " \n\t" "movlpd (%%ecx), %%xmm0 \n\t" // load c00 and c10, "movhpd (%%ecx,%%esi), %%xmm0 \n\t" "mulpd %%xmm2, %%xmm4 \n\t" // scale by alpha, "mulpd %%xmm3, %%xmm0 \n\t" // scale by beta, "addpd %%xmm4, %%xmm0 \n\t" // add the gemm result, "movlpd %%xmm0, (%%ecx) \n\t" // and store back to memory.
"movhpd %%xmm0, (%%ecx,%%esi) \n\t" "addl %%edi, %%ecx \n\t" " \n\t" "movlpd (%%edx), %%xmm1 \n\t" // load c20 and c30 (edx == c + 2*rs_c; xmm6 holds ab20, ab30), "movhpd (%%edx,%%esi), %%xmm1 \n\t" "mulpd %%xmm2, %%xmm6 \n\t" // scale by alpha, "mulpd %%xmm3, %%xmm1 \n\t" // scale by beta, "addpd %%xmm6, %%xmm1 \n\t" // add the gemm result, "movlpd %%xmm1, (%%edx) \n\t" // and store back to memory. "movhpd %%xmm1, (%%edx,%%esi) \n\t" "addl %%edi, %%edx \n\t" " \n\t" "movlpd (%%ecx), %%xmm0 \n\t" // load c01 and c11 (ecx == c + cs_c; xmm5 holds ab01, ab11), "movhpd (%%ecx,%%esi), %%xmm0 \n\t" "mulpd %%xmm2, %%xmm5 \n\t" // scale by alpha, "mulpd %%xmm3, %%xmm0 \n\t" // scale by beta, "addpd %%xmm5, %%xmm0 \n\t" // add the gemm result, "movlpd %%xmm0, (%%ecx) \n\t" // and store back to memory. "movhpd %%xmm0, (%%ecx,%%esi) \n\t" " \n\t" "movlpd (%%edx), %%xmm1 \n\t" // load c21 and c31, "movhpd (%%edx,%%esi), %%xmm1 \n\t" "mulpd %%xmm2, %%xmm7 \n\t" // scale by alpha, "mulpd %%xmm3, %%xmm1 \n\t" // scale by beta, "addpd %%xmm7, %%xmm1 \n\t" // add the gemm result, "movlpd %%xmm1, (%%edx) \n\t" // and store back to memory.
"movhpd %%xmm1, (%%edx,%%esi) \n\t" " \n\t" " \n\t" " \n\t" : // output operands (none) : // input operands "m" (k_iter), "m" (k_left), "m" (a), "m" (b), "m" (alpha), "m" (beta), "m" (c), "m" (rs_c), "m" (cs_c) : // register clobber list "eax", "ebx", "ecx", "edx", "esi", "edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory" ); } /* Single-precision complex placeholder: no optimized kernel is provided; the body only reports BLIS_NOT_YET_IMPLEMENTED through bli_check_error_code(). */ void bli_cgemm_opt_d4x2( dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, scomplex* restrict a_next, scomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } /* Double-precision complex placeholder: no optimized kernel is provided; the body only reports BLIS_NOT_YET_IMPLEMENTED through bli_check_error_code(). */ void bli_zgemm_opt_d4x2( dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c, inc_t cs_c, dcomplex* restrict a_next, dcomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } cython-blis-0.9.1/blis/_src/kernels/old/x86/3/bli_gemmtrsm_l_opt_d4x2.c000066400000000000000000000516121427272030600255360ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" /* Single-precision real placeholder: no optimized kernel is provided; the body only reports BLIS_NOT_YET_IMPLEMENTED through bli_check_error_code(). */ void bli_sgemmtrsm_l_opt_d4x2( dim_t k, float* restrict alpha, float* restrict a10, float* restrict a11, float* restrict bd01, float* restrict bd11, float* restrict b11, float* restrict c11, inc_t rs_c, inc_t cs_c, float* restrict a_next, float* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } /* Double-precision fused gemm + lower-triangular solve micro-kernel (4x2 block, 32-bit x86 SSE inline assembly). Phase 1 accumulates the 4x2 product of a10 and bd01 over k (k / 8 passes of an 8x-unrolled loop plus k % 8 single steps). Phase 2 (.POSTACCUM) transposes the accumulators into rows and forms alpha * b11 minus that product. Phase 3 (.TRSM) solves rows 0, 1, 2, 3 in that order using the 4x4 block a11 (indexed as element (i + j*4)); the diagonal entries are multiplied in, and the inline comments describe them as already inverted (1/alphaXX). Each solved row is stored to b11 and to c11 (addressed through rs_c and cs_c). Phase 4 (.UPDATEBD11) stores every solved element, duplicated into both halves of a 16-byte vector, to bd11. a_next and b_next are not referenced by this kernel. */ void bli_dgemmtrsm_l_opt_d4x2( dim_t k, double* restrict alpha, double* restrict a10, double* restrict a11, double* restrict bd01, double* restrict bd11, double* restrict b11, double* restrict c11, inc_t rs_c, inc_t cs_c, double* restrict a_next, double* restrict b_next ) { dim_t k_iter; dim_t k_left; k_iter = k / 8; k_left = k % 8; __asm__ volatile ( " \n\t" "movl %2, %%eax \n\t" // load address of a10. "movl %4, %%ebx \n\t" // load address of bd01. " \n\t" "addl $8 * 16, %%eax \n\t" // increment pointers to allow byte "addl $8 * 16, %%ebx \n\t" // offsets in the unrolled iterations. " \n\t" "movapd -8 * 16(%%eax), %%xmm0 \n\t" // initialize loop by pre-loading elements "movapd -4 * 16(%%eax), %%xmm3 \n\t" // of a.
" \n\t" "pxor %%xmm4, %%xmm4 \n\t" "pxor %%xmm5, %%xmm5 \n\t" "pxor %%xmm6, %%xmm6 \n\t" "pxor %%xmm7, %%xmm7 \n\t" " \n\t" " \n\t" " \n\t" "movl %0, %%esi \n\t" // i = k_iter; "testl %%esi, %%esi \n\t" // check i via logical AND. "je .CONSIDERKLEFT \n\t" // if i == 0, jump to code that " \n\t" // contains the k_left loop. " \n\t" " \n\t" ".LOOPKITER: \n\t" // MAIN LOOP " \n\t" "movapd -8 * 16(%%ebx), %%xmm1 \n\t" // iteration 0 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd -7 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm0 \n\t" "addpd %%xmm0, %%xmm5 \n\t" "movapd -7 * 16(%%eax), %%xmm0 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "movapd -6 * 16(%%eax), %%xmm0 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd -6 * 16(%%ebx), %%xmm1 \n\t" // iteration 1 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd -5 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm0 \n\t" "addpd %%xmm0, %%xmm5 \n\t" "movapd -5 * 16(%%eax), %%xmm0 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "movapd 0 * 16(%%eax), %%xmm0 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd -4 * 16(%%ebx), %%xmm1 \n\t" // iteration 2 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd -3 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm3 \n\t" "addpd %%xmm3, %%xmm5 \n\t" "movapd -3 * 16(%%eax), %%xmm3 \n\t" "mulpd %%xmm3, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "movapd -2 * 16(%%eax), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd -2 * 16(%%ebx), %%xmm1 \n\t" // iteration 3 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd -1 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm3 \n\t" "addpd %%xmm3, %%xmm5 \n\t" "movapd -1 * 16(%%eax), %%xmm3 \n\t" "mulpd %%xmm3, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "movapd 4 * 16(%%eax),
%%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd 0 * 16(%%ebx), %%xmm1 \n\t" // iteration 4 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd 1 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm0 \n\t" "addpd %%xmm0, %%xmm5 \n\t" "movapd 1 * 16(%%eax), %%xmm0 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "movapd 2 * 16(%%eax), %%xmm0 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd 2 * 16(%%ebx), %%xmm1 \n\t" // iteration 5 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd 3 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm0 \n\t" "addpd %%xmm0, %%xmm5 \n\t" "movapd 3 * 16(%%eax), %%xmm0 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "movapd 8 * 16(%%eax), %%xmm0 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd 4 * 16(%%ebx), %%xmm1 \n\t" // iteration 6 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd 5 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm3 \n\t" "addpd %%xmm3, %%xmm5 \n\t" "movapd 5 * 16(%%eax), %%xmm3 \n\t" "mulpd %%xmm3, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "movapd 6 * 16(%%eax), %%xmm3 \n\t" "addl $8 * 4 * 8, %%eax \n\t" // a += 8*4 (unroll x mr) "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd 6 * 16(%%ebx), %%xmm1 \n\t" // iteration 7 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd 7 * 16(%%ebx), %%xmm1 \n\t" "addl $8 * 2 * 2 * 8, %%ebx \n\t" // b += 8*2*2 (unroll x nr x ndup) "mulpd %%xmm1, %%xmm3 \n\t" "addpd %%xmm3, %%xmm5 \n\t" "movapd -9 * 16(%%eax), %%xmm3 \n\t" "mulpd %%xmm3, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "decl %%esi \n\t" // i -= 1; "movapd -4 * 16(%%eax), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "jne .LOOPKITER \n\t" // iterate again if i != 0.
" \n\t" " \n\t" " \n\t" ".CONSIDERKLEFT: \n\t" " \n\t" "movl %1, %%esi \n\t" // i = k_left; "testl %%esi, %%esi \n\t" // check i via logical AND. "je .POSTACCUM \n\t" // if i == 0, we're done; jump to end. " \n\t" // else, we prepare to enter k_left loop. " \n\t" " \n\t" ".LOOPKLEFT: \n\t" // EDGE LOOP " \n\t" "movapd -8 * 16(%%ebx), %%xmm1 \n\t" // iteration i "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd -7 * 16(%%ebx), %%xmm1 \n\t" "addl $1 * 2 * 2 * 8, %%ebx \n\t" // b += 2*2 (1 x nr x ndup) "mulpd %%xmm1, %%xmm0 \n\t" "addpd %%xmm0, %%xmm5 \n\t" "movapd -7 * 16(%%eax), %%xmm0 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "movapd -6 * 16(%%eax), %%xmm0 \n\t" "addl $1 * 4 * 8, %%eax \n\t" // a += 4 (1 x mr) "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "decl %%esi \n\t" // i -= 1; "jne .LOOPKLEFT \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" ".POSTACCUM: \n\t" " \n\t" " \n\t" "movl %6, %%ebx \n\t" // load address of b11.
" \n\t" " \n\t" // xmm4 == ( ab00 xmm5 == ( ab01 " \n\t" // ab10 ) ab11 ) " \n\t" // xmm6 == ( ab20 xmm7 == ( ab21 " \n\t" // ab30 ) ab31 ) "movapd %%xmm4, %%xmm0 \n\t" "unpcklpd %%xmm5, %%xmm0 \n\t" "unpckhpd %%xmm5, %%xmm4 \n\t" "movapd %%xmm4, %%xmm1 \n\t" " \n\t" "movapd %%xmm6, %%xmm2 \n\t" "unpcklpd %%xmm7, %%xmm2 \n\t" "unpckhpd %%xmm7, %%xmm6 \n\t" "movapd %%xmm6, %%xmm3 \n\t" " \n\t" // xmm0 == ( ab00 ab01 ) " \n\t" // xmm1 == ( ab10 ab11 ) " \n\t" // xmm2 == ( ab20 ab21 ) " \n\t" // xmm3 == ( ab30 ab31 ) " \n\t" "movl %10, %%eax \n\t" // load address of alpha "movddup (%%eax), %%xmm7 \n\t" // load alpha and duplicate " \n\t" "movapd 0 * 16(%%ebx), %%xmm4 \n\t" "movapd 1 * 16(%%ebx), %%xmm5 \n\t" "mulpd %%xmm7, %%xmm4 \n\t" // xmm4 = alpha * ( beta00 beta01 ) "mulpd %%xmm7, %%xmm5 \n\t" // xmm5 = alpha * ( beta10 beta11 ) "movapd 2 * 16(%%ebx), %%xmm6 \n\t" "mulpd %%xmm7, %%xmm6 \n\t" // xmm6 = alpha * ( beta20 beta21 ) "mulpd 3 * 16(%%ebx), %%xmm7 \n\t" // xmm7 = alpha * ( beta30 beta31 ) " \n\t" "subpd %%xmm0, %%xmm4 \n\t" // xmm4 -= xmm0 "subpd %%xmm1, %%xmm5 \n\t" // xmm5 -= xmm1 "subpd %%xmm2, %%xmm6 \n\t" // xmm6 -= xmm2 "subpd %%xmm3, %%xmm7 \n\t" // xmm7 -= xmm3 " \n\t" " \n\t" " \n\t" ".TRSM: \n\t" " \n\t" " \n\t" "movl %3, %%eax \n\t" // load address of a11 "movl %7, %%ecx \n\t" // load address of c11 " \n\t" "movl %8, %%edi \n\t" // load rs_c "movl %9, %%esi \n\t" // load cs_c "sall $3, %%edi \n\t" // rs_c *= sizeof( double ) "sall $3, %%esi \n\t" // cs_c *= sizeof( double ) " \n\t" " \n\t" " \n\t" " \n\t" // iteration 0 " \n\t" "movddup (0+0*4)*8(%%eax), %%xmm0 \n\t" // load xmm0 = (1/alpha00) " \n\t" "mulpd %%xmm0, %%xmm4 \n\t" // xmm4 *= (1/alpha00); " \n\t" "movapd %%xmm4, 0 * 16(%%ebx) \n\t" // store ( beta00 beta01 ) = xmm4 "movlpd %%xmm4, (%%ecx) \n\t" // store ( gamma00 ) = xmm4[0] "movhpd %%xmm4, (%%ecx,%%esi) \n\t" // store ( gamma01 ) = xmm4[1] "addl %%edi, %%ecx \n\t" // c11 += rs_c " \n\t" " \n\t" " \n\t" " \n\t" // iteration 1 "
\n\t" "movddup (1+0*4)*8(%%eax), %%xmm0 \n\t" // load xmm0 = alpha10 "movddup (1+1*4)*8(%%eax), %%xmm1 \n\t" // load xmm1 = (1/alpha11) " \n\t" "mulpd %%xmm4, %%xmm0 \n\t" // xmm0 = alpha10 * ( beta00 beta01 ) "subpd %%xmm0, %%xmm5 \n\t" // xmm5 -= xmm0 "mulpd %%xmm1, %%xmm5 \n\t" // xmm5 *= (1/alpha11); " \n\t" "movapd %%xmm5, 1 * 16(%%ebx) \n\t" // store ( beta10 beta11 ) = xmm5 "movlpd %%xmm5, (%%ecx) \n\t" // store ( gamma10 ) = xmm5[0] "movhpd %%xmm5, (%%ecx,%%esi) \n\t" // store ( gamma11 ) = xmm5[1] "addl %%edi, %%ecx \n\t" // c11 += rs_c " \n\t" " \n\t" " \n\t" " \n\t" // iteration 2 " \n\t" "movddup (2+0*4)*8(%%eax), %%xmm0 \n\t" // load xmm0 = alpha20 "movddup (2+1*4)*8(%%eax), %%xmm1 \n\t" // load xmm1 = alpha21 "movddup (2+2*4)*8(%%eax), %%xmm2 \n\t" // load xmm2 = (1/alpha22) " \n\t" "mulpd %%xmm4, %%xmm0 \n\t" // xmm0 = alpha20 * ( beta00 beta01 ) "mulpd %%xmm5, %%xmm1 \n\t" // xmm1 = alpha21 * ( beta10 beta11 ) "addpd %%xmm1, %%xmm0 \n\t" // xmm0 += xmm1; "subpd %%xmm0, %%xmm6 \n\t" // xmm6 -= xmm0 "mulpd %%xmm2, %%xmm6 \n\t" // xmm6 *= (1/alpha22); " \n\t" "movapd %%xmm6, 2 * 16(%%ebx) \n\t" // store ( beta20 beta21 ) = xmm6 "movlpd %%xmm6, (%%ecx) \n\t" // store ( gamma20 ) = xmm6[0] "movhpd %%xmm6, (%%ecx,%%esi) \n\t" // store ( gamma21 ) = xmm6[1] "addl %%edi, %%ecx \n\t" // c11 += rs_c " \n\t" " \n\t" " \n\t" " \n\t" // iteration 3 " \n\t" "movddup (3+0*4)*8(%%eax), %%xmm0 \n\t" // load xmm0 = alpha30 "movddup (3+1*4)*8(%%eax), %%xmm1 \n\t" // load xmm1 = alpha31 "movddup (3+2*4)*8(%%eax), %%xmm2 \n\t" // load xmm2 = alpha32 "movddup (3+3*4)*8(%%eax), %%xmm3 \n\t" // load xmm3 = (1/alpha33) " \n\t" "mulpd %%xmm4, %%xmm0 \n\t" // xmm0 = alpha30 * ( beta00 beta01 ) "mulpd %%xmm5, %%xmm1 \n\t" // xmm1 = alpha31 * ( beta10 beta11 ) "mulpd %%xmm6, %%xmm2 \n\t" // xmm2 = alpha32 * ( beta20 beta21 ) "addpd %%xmm1, %%xmm0 \n\t" // xmm0 += xmm1 "addpd %%xmm2, %%xmm0 \n\t" // xmm0 += xmm2 "subpd %%xmm0, %%xmm7 \n\t" // xmm7 -= xmm0 "mulpd %%xmm3, %%xmm7
\n\t" // xmm7 *= (1/alpha33); " \n\t" "movapd %%xmm7, 3 * 16(%%ebx) \n\t" // store ( beta30 beta31 ) = xmm7 "movlpd %%xmm7, (%%ecx) \n\t" // store ( gamma30 ) = xmm7[0] "movhpd %%xmm7, (%%ecx,%%esi) \n\t" // store ( gamma31 ) = xmm7[1] " \n\t" " \n\t" " \n\t" ".UPDATEBD11: \n\t" " \n\t" " \n\t" "movl %5, %%edx \n\t" /* load address of bd11 */ " \n\t" /* xmm0..xmm3 = first element of each solved row, duplicated into both halves */ "movddup %%xmm4, %%xmm0 \n\t" "movddup %%xmm5, %%xmm1 \n\t" "movddup %%xmm6, %%xmm2 \n\t" "movddup %%xmm7, %%xmm3 \n\t" " \n\t" /* xmm4..xmm7 = second element of each solved row, duplicated into both halves */ "unpckhpd %%xmm4, %%xmm4 \n\t" "unpckhpd %%xmm5, %%xmm5 \n\t" "unpckhpd %%xmm6, %%xmm6 \n\t" "unpckhpd %%xmm7, %%xmm7 \n\t" " \n\t" /* store the eight duplicated vectors to bd11, row by row */ "movapd %%xmm0, 0 * 16(%%edx) \n\t" "movapd %%xmm4, 1 * 16(%%edx) \n\t" "movapd %%xmm1, 2 * 16(%%edx) \n\t" "movapd %%xmm5, 3 * 16(%%edx) \n\t" "movapd %%xmm2, 4 * 16(%%edx) \n\t" "movapd %%xmm6, 5 * 16(%%edx) \n\t" "movapd %%xmm3, 6 * 16(%%edx) \n\t" "movapd %%xmm7, 7 * 16(%%edx) \n\t" " \n\t" " \n\t" : // output operands (none) : // input operands "m" (k_iter), "m" (k_left), "m" (a10), "m" (a11), "m" (bd01), "m" (bd11), "m" (b11), "m" (c11), "m" (rs_c), "m" (cs_c), "m" (alpha) : // register clobber list "eax", "ebx", "ecx", "edx", "esi", "edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory" ); } /* Single-precision complex placeholder: no optimized kernel is provided; the body only reports BLIS_NOT_YET_IMPLEMENTED through bli_check_error_code(). */ void bli_cgemmtrsm_l_opt_d4x2( dim_t k, scomplex* restrict alpha, scomplex* restrict a10, scomplex* restrict a11, scomplex* restrict bd01, scomplex* restrict bd11, scomplex* restrict b11, scomplex* restrict c11, inc_t rs_c, inc_t cs_c, scomplex* restrict a_next, scomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } /* Double-precision complex placeholder: no optimized kernel is provided; the body only reports BLIS_NOT_YET_IMPLEMENTED through bli_check_error_code(). */ void bli_zgemmtrsm_l_opt_d4x2( dim_t k, dcomplex* restrict alpha, dcomplex* restrict a10, dcomplex* restrict a11, dcomplex* restrict bd01, dcomplex* restrict bd11, dcomplex* restrict b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, dcomplex* restrict a_next, dcomplex* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); }
cython-blis-0.9.1/blis/_src/kernels/old/x86/3/bli_gemmtrsm_u_opt_d4x2.c000066400000000000000000000522251427272030600255500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" void bli_sgemmtrsm_u_opt_d4x2( dim_t k, float* restrict alpha, float* restrict a12, float* restrict a11, float* restrict bd21, float* restrict bd11, float* restrict b11, float* restrict c11, inc_t rs_c, inc_t cs_c, float* restrict a_next, float* restrict b_next ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_dgemmtrsm_u_opt_d4x2( dim_t k, double* restrict alpha, double* restrict a12, double* restrict a11, double* restrict bd21, double* restrict bd11, double* restrict b11, double* restrict c11, inc_t rs_c, inc_t cs_c, double* restrict a_next, double* restrict b_next ) { dim_t k_iter; dim_t k_left; k_iter = k / 8; k_left = k % 8; __asm__ volatile ( " \n\t" "movl %2, %%eax \n\t" // load address of a12. "movl %4, %%ebx \n\t" // load address of bd21. " \n\t" "addl $8 * 16, %%eax \n\t" // increment pointers to allow byte "addl $8 * 16, %%ebx \n\t" // offsets in the unrolled iterations. " \n\t" "movapd -8 * 16(%%eax), %%xmm0 \n\t" // initialize loop by pre-loading elements "movapd -4 * 16(%%eax), %%xmm3 \n\t" // of a. " \n\t" "pxor %%xmm4, %%xmm4 \n\t" "pxor %%xmm5, %%xmm5 \n\t" "pxor %%xmm6, %%xmm6 \n\t" "pxor %%xmm7, %%xmm7 \n\t" " \n\t" " \n\t" " \n\t" "movl %0, %%esi \n\t" // i = k_iter; "testl %%esi, %%esi \n\t" // check i via logical AND. "je .CONSIDERKLEFT \n\t" // if i == 0, jump to code that " \n\t" // contains the k_left loop. 
" \n\t" " \n\t" ".LOOPKITER: \n\t" // MAIN LOOP " \n\t" "movapd -8 * 16(%%ebx), %%xmm1 \n\t" // iteration 0 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd -7 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm0 \n\t" "addpd %%xmm0, %%xmm5 \n\t" "movapd -7 * 16(%%eax), %%xmm0 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "movapd -6 * 16(%%eax), %%xmm0 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd -6 * 16(%%ebx), %%xmm1 \n\t" // iteration 1 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd -5 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm0 \n\t" "addpd %%xmm0, %%xmm5 \n\t" "movapd -5 * 16(%%eax), %%xmm0 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "movapd 0 * 16(%%eax), %%xmm0 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd -4 * 16(%%ebx), %%xmm1 \n\t" // iteration 2 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd -3 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm3 \n\t" "addpd %%xmm3, %%xmm5 \n\t" "movapd -3 * 16(%%eax), %%xmm3 \n\t" "mulpd %%xmm3, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "movapd -2 * 16(%%eax), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd -2 * 16(%%ebx), %%xmm1 \n\t" // iteration 3 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd -1 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm3 \n\t" "addpd %%xmm3, %%xmm5 \n\t" "movapd -1 * 16(%%eax), %%xmm3 \n\t" "mulpd %%xmm3, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "movapd 4 * 16(%%eax), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd 0 * 16(%%ebx), %%xmm1 \n\t" // iteration 4 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd 1 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm0 \n\t" "addpd %%xmm0, %%xmm5 \n\t" "movapd 1 * 
16(%%eax), %%xmm0 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "movapd 2 * 16(%%eax), %%xmm0 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd 2 * 16(%%ebx), %%xmm1 \n\t" // iteration 5 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd 3 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm0 \n\t" "addpd %%xmm0, %%xmm5 \n\t" "movapd 3 * 16(%%eax), %%xmm0 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "movapd 8 * 16(%%eax), %%xmm0 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd 4 * 16(%%ebx), %%xmm1 \n\t" // iteration 6 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd 5 * 16(%%ebx), %%xmm1 \n\t" "mulpd %%xmm1, %%xmm3 \n\t" "addpd %%xmm3, %%xmm5 \n\t" "movapd 5 * 16(%%eax), %%xmm3 \n\t" "mulpd %%xmm3, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "movapd 6 * 16(%%eax), %%xmm3 \n\t" "addl $8 * 4 * 8, %%eax \n\t" // a += 8*4 (unroll x mr) "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "movapd 6 * 16(%%ebx), %%xmm1 \n\t" // iteration 7 "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd 7 * 16(%%ebx), %%xmm1 \n\t" "addl $8 * 2 * 2 * 8, %%ebx \n\t" // b += 8*2*2 (unroll x nr x ndup) "mulpd %%xmm1, %%xmm3 \n\t" "addpd %%xmm3, %%xmm5 \n\t" "movapd -9 * 16(%%eax), %%xmm3 \n\t" "mulpd %%xmm3, %%xmm2 \n\t" "mulpd %%xmm3, %%xmm1 \n\t" "decl %%esi \n\t" // i -= 1; "movapd -4 * 16(%%eax), %%xmm3 \n\t" "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "jne .LOOPKITER \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" ".CONSIDERKLEFT: \n\t" " \n\t" "movl %1, %%esi \n\t" // i = k_left; "testl %%esi, %%esi \n\t" // check i via logical AND. "je .POSTACCUM \n\t" // if i == 0, we're done; jump to end. " \n\t" // else, we prepare to enter k_left loop. 
" \n\t" " \n\t" ".LOOPKLEFT: \n\t" // EDGE LOOP " \n\t" "movapd -8 * 16(%%ebx), %%xmm1 \n\t" // iteration i "movapd %%xmm1, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "addpd %%xmm1, %%xmm4 \n\t" "movapd -7 * 16(%%ebx), %%xmm1 \n\t" "addl $1 * 2 * 2 * 8, %%ebx \n\t" // b += 2*2 (1 x nr x ndup) "mulpd %%xmm1, %%xmm0 \n\t" "addpd %%xmm0, %%xmm5 \n\t" "movapd -7 * 16(%%eax), %%xmm0 \n\t" "mulpd %%xmm0, %%xmm2 \n\t" "mulpd %%xmm0, %%xmm1 \n\t" "movapd -6 * 16(%%eax), %%xmm0 \n\t" "addl $1 * 4 * 8, %%eax \n\t" // a += 4 (1 x mr) "addpd %%xmm2, %%xmm6 \n\t" "addpd %%xmm1, %%xmm7 \n\t" " \n\t" "decl %%esi \n\t" // i -= 1; "jne .LOOPKLEFT \n\t" // iterate again if i != 0. " \n\t" " \n\t" " \n\t" ".POSTACCUM: \n\t" " \n\t" " \n\t" "movl %6, %%ebx \n\t" // load address of b11. " \n\t" " \n\t" // xmm4 == ( ab00 xmm5 == ( ab01 " \n\t" // ab10 ) ab11 ) " \n\t" // xmm6 == ( ab20 xmm7 == ( ab21 " \n\t" // ab30 ) ab31 ) "movapd %%xmm4, %%xmm0 \n\t" "unpcklpd %%xmm5, %%xmm0 \n\t" "unpckhpd %%xmm5, %%xmm4 \n\t" "movapd %%xmm4, %%xmm1 \n\t" " \n\t" "movapd %%xmm6, %%xmm2 \n\t" "unpcklpd %%xmm7, %%xmm2 \n\t" "unpckhpd %%xmm7, %%xmm6 \n\t" "movapd %%xmm6, %%xmm3 \n\t" " \n\t" // xmm0 == ( ab00 ab01 ) " \n\t" // xmm1 == ( ab10 ab11 ) " \n\t" // xmm2 == ( ab20 ab21 ) " \n\t" // xmm3 == ( ab30 ab31 ) " \n\t" "movl %10, %%eax \n\t" // load address of alpha "movddup (%%eax), %%xmm7 \n\t" // load alpha and duplicate " \n\t" "movapd 0 * 16(%%ebx), %%xmm4 \n\t" // load xmm4 = ( beta00 beta01 ) "movapd 1 * 16(%%ebx), %%xmm5 \n\t" // load xmm5 = ( beta10 beta11 ) "movapd 2 * 16(%%ebx), %%xmm6 \n\t" // load xmm6 = ( beta20 beta21 ) "mulpd %%xmm7, %%xmm4 \n\t" // xmm4 *= alpha "mulpd %%xmm7, %%xmm5 \n\t" // xmm5 *= alpha "mulpd %%xmm7, %%xmm6 \n\t" // xmm6 *= alpha //"movapd 3 * 16(%%ebx), %%xmm7 \n\t" // load xmm7 = ( beta30 beta31 ) "mulpd 3 * 16(%%ebx), %%xmm7 \n\t" // xmm7 = alpha * ( beta30 beta31 ) " \n\t" "subpd %%xmm0, %%xmm4 \n\t" // xmm4 -= xmm0 "subpd %%xmm1, %%xmm5 \n\t" // xmm5 -= xmm1 
"subpd %%xmm2, %%xmm6 \n\t" // xmm6 -= xmm2 "subpd %%xmm3, %%xmm7 \n\t" // xmm7 -= xmm3 " \n\t" " \n\t" " \n\t" ".TRSM: \n\t" " \n\t" " \n\t" "movl %3, %%eax \n\t" // load address of a11 "movl %7, %%ecx \n\t" // load address of c11 " \n\t" "movl %8, %%edi \n\t" // load rs_c "movl %9, %%esi \n\t" // load cs_c "sall $3, %%edi \n\t" // rs_c *= sizeof( double ) "sall $3, %%esi \n\t" // cs_c *= sizeof( double ) " \n\t" "addl %%edi, %%ecx \n\t" // c11 += (4-1)*rs_c "addl %%edi, %%ecx \n\t" "addl %%edi, %%ecx \n\t" " \n\t" " \n\t" " \n\t" // iteration 0 " \n\t" "movddup (3+3*4)*8(%%eax), %%xmm3 \n\t" // load xmm3 = (1/alpha33) " \n\t" "mulpd %%xmm3, %%xmm7 \n\t" // xmm7 *= (1/alpha33); " \n\t" "movapd %%xmm7, 3 * 16(%%ebx) \n\t" // store ( beta30 beta31 ) = xmm7 "movlpd %%xmm7, (%%ecx) \n\t" // store ( gamma30 ) = xmm7[0] "movhpd %%xmm7, (%%ecx,%%esi) \n\t" // store ( gamma31 ) = xmm7[1] "subl %%edi, %%ecx \n\t" // c11 -= rs_c " \n\t" " \n\t" " \n\t" " \n\t" // iteration 1 " \n\t" "movddup (2+2*4)*8(%%eax), %%xmm2 \n\t" // load xmm2 = (1/alpha22) "movddup (2+3*4)*8(%%eax), %%xmm3 \n\t" // load xmm3 = alpha23 " \n\t" "mulpd %%xmm7, %%xmm3 \n\t" // xmm3 = alpha23 * ( beta30 beta31 ) "subpd %%xmm3, %%xmm6 \n\t" // xmm6 -= xmm3 "mulpd %%xmm2, %%xmm6 \n\t" // xmm6 *= (1/alpha22); " \n\t" "movapd %%xmm6, 2 * 16(%%ebx) \n\t" // store ( beta20 beta21 ) = xmm6 "movlpd %%xmm6, (%%ecx) \n\t" // store ( gamma20 ) = xmm6[0] "movhpd %%xmm6, (%%ecx,%%esi) \n\t" // store ( gamma21 ) = xmm6[1] "subl %%edi, %%ecx \n\t" // c11 -= rs_c " \n\t" " \n\t" " \n\t" " \n\t" // iteration 2 " \n\t" "movddup (1+1*4)*8(%%eax), %%xmm1 \n\t" // load xmm1 = (1/alpha11) "movddup (1+2*4)*8(%%eax), %%xmm2 \n\t" // load xmm2 = alpha12 "movddup (1+3*4)*8(%%eax), %%xmm3 \n\t" // load xmm3 = alpha13 " \n\t" "mulpd %%xmm6, %%xmm2 \n\t" // xmm2 = alpha12 * ( beta20 beta21 ) "mulpd %%xmm7, %%xmm3 \n\t" // xmm3 = alpha13 * ( beta30 beta31 ) "addpd %%xmm3, %%xmm2 \n\t" // xmm2 += xmm3; "subpd %%xmm2, %%xmm5 \n\t" // 
xmm5 -= xmm2 "mulpd %%xmm1, %%xmm5 \n\t" // xmm5 *= (1/alpha11); " \n\t" "movapd %%xmm5, 1 * 16(%%ebx) \n\t" // store ( beta10 beta11 ) = xmm5 "movlpd %%xmm5, (%%ecx) \n\t" // store ( gamma10 ) = xmm5[0] "movhpd %%xmm5, (%%ecx,%%esi) \n\t" // store ( gamma11 ) = xmm5[1] "subl %%edi, %%ecx \n\t" // c11 -= rs_c " \n\t" " \n\t" " \n\t" " \n\t" // iteration 3 " \n\t" "movddup (0+0*4)*8(%%eax), %%xmm0 \n\t" // load xmm0 = (1/alpha00) "movddup (0+1*4)*8(%%eax), %%xmm1 \n\t" // load xmm1 = alpha01 "movddup (0+2*4)*8(%%eax), %%xmm2 \n\t" // load xmm2 = alpha02 "movddup (0+3*4)*8(%%eax), %%xmm3 \n\t" // load xmm3 = alpha03 " \n\t" "mulpd %%xmm5, %%xmm1 \n\t" // xmm1 = alpha01 * ( beta10 beta11 ) "mulpd %%xmm6, %%xmm2 \n\t" // xmm2 = alpha02 * ( beta20 beta21 ) "mulpd %%xmm7, %%xmm3 \n\t" // xmm3 = alpha03 * ( beta30 beta31 ) "addpd %%xmm2, %%xmm1 \n\t" // xmm1 += xmm2; "addpd %%xmm3, %%xmm1 \n\t" // xmm1 += xmm3; "subpd %%xmm1, %%xmm4 \n\t" // xmm4 -= xmm1 "mulpd %%xmm0, %%xmm4 \n\t" // xmm4 *= (1/alpha00); " \n\t" "movapd %%xmm4, 0 * 16(%%ebx) \n\t" // store ( beta00 beta01 ) = xmm4 "movlpd %%xmm4, (%%ecx) \n\t" // store ( gamma00 ) = xmm4[0] "movhpd %%xmm4, (%%ecx,%%esi) \n\t" // store ( gamma01 ) = xmm4[1] " \n\t" " \n\t" " \n\t" ".UPDATEBD11: \n\t" " \n\t" " \n\t" "movl %5, %%edx \n\t" " \n\t" "movddup %%xmm4, %%xmm0 \n\t" "movddup %%xmm5, %%xmm1 \n\t" "movddup %%xmm6, %%xmm2 \n\t" "movddup %%xmm7, %%xmm3 \n\t" " \n\t" "unpckhpd %%xmm4, %%xmm4 \n\t" "unpckhpd %%xmm5, %%xmm5 \n\t" "unpckhpd %%xmm6, %%xmm6 \n\t" "unpckhpd %%xmm7, %%xmm7 \n\t" " \n\t" "movapd %%xmm0, 0 * 16(%%edx) \n\t" "movapd %%xmm4, 1 * 16(%%edx) \n\t" "movapd %%xmm1, 2 * 16(%%edx) \n\t" "movapd %%xmm5, 3 * 16(%%edx) \n\t" "movapd %%xmm2, 4 * 16(%%edx) \n\t" "movapd %%xmm6, 5 * 16(%%edx) \n\t" "movapd %%xmm3, 6 * 16(%%edx) \n\t" "movapd %%xmm7, 7 * 16(%%edx) \n\t" " \n\t" " \n\t" : // output operands (none) : // input operands "m" (k_iter), "m" (k_left), "m" (a12), "m" (a11), "m" (bd21), "m" (bd11), "m" 
(b11),
	  "m" (c11),
	  "m" (rs_c),
	  "m" (cs_c),
	  "m" (alpha)
	: // register clobber list
	  "eax", "ebx", "ecx", "edx", "esi", "edi",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "memory"
	);
}

/*
   Single-precision complex gemmtrsm (upper) micro-kernel placeholder.

   No computation is performed and every argument is ignored: the body only
   reports BLIS_NOT_YET_IMPLEMENTED through bli_check_error_code().
   NOTE(review): what bli_check_error_code() does with that code (abort vs.
   return) is not visible here -- confirm before relying on it.
*/
void bli_cgemmtrsm_u_opt_d4x2(
                               dim_t              k,
                               scomplex* restrict alpha,
                               scomplex* restrict a12,
                               scomplex* restrict a11,
                               scomplex* restrict bd21,
                               scomplex* restrict bd11,
                               scomplex* restrict b11,
                               scomplex* restrict c11, inc_t rs_c, inc_t cs_c,
                               scomplex* restrict a_next,
                               scomplex* restrict b_next
                             )
{
	// Unimplemented datatype: signal the error instead of computing.
	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}

/*
   Double-precision complex gemmtrsm (upper) micro-kernel placeholder.

   Same contract as the single-precision complex stub above: arguments are
   ignored and BLIS_NOT_YET_IMPLEMENTED is passed to bli_check_error_code().
*/
void bli_zgemmtrsm_u_opt_d4x2(
                               dim_t              k,
                               dcomplex* restrict alpha,
                               dcomplex* restrict a12,
                               dcomplex* restrict a11,
                               dcomplex* restrict bd21,
                               dcomplex* restrict bd11,
                               dcomplex* restrict b11,
                               dcomplex* restrict c11, inc_t rs_c, inc_t cs_c,
                               dcomplex* restrict a_next,
                               dcomplex* restrict b_next
                             )
{
	// Unimplemented datatype: signal the error instead of computing.
	bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
}
cython-blis-0.9.1/blis/_src/kernels/old/x86/3/bli_trsm_l_opt_d4x2.c000066400000000000000000000226171427272030600246730ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" void bli_strsm_l_opt_d4x2( float* restrict a11, float* restrict b11, float* restrict bd11, float* restrict c11, inc_t rs_c, inc_t cs_c ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_dtrsm_l_opt_d4x2( double* restrict a11, double* restrict b11, double* restrict bd11, double* restrict c11, inc_t rs_c, inc_t cs_c ) { __asm__ volatile ( " \n\t" "movl %1, %%ebx \n\t" // load address of b11. 
" \n\t" "movapd 0 * 16(%%ebx), %%xmm4 \n\t" // load xmm4 = ( beta00 beta01 ) "movapd 1 * 16(%%ebx), %%xmm5 \n\t" // load xmm5 = ( beta10 beta11 ) "movapd 2 * 16(%%ebx), %%xmm6 \n\t" // load xmm6 = ( beta20 beta21 ) "movapd 3 * 16(%%ebx), %%xmm7 \n\t" // load xmm7 = ( beta30 beta31 ) " \n\t" " \n\t" "movl %0, %%eax \n\t" // load address of a11 "movl %3, %%ecx \n\t" // load address of c11 " \n\t" "movl %4, %%edi \n\t" // load rs_c "movl %5, %%esi \n\t" // load cs_c "sall $3, %%edi \n\t" // rs_c *= sizeof( double ) "sall $3, %%esi \n\t" // cs_c *= sizeof( double ) " \n\t" " \n\t" " \n\t" " \n\t" // iteration 0 " \n\t" "movddup (0+0*4)*8(%%eax), %%xmm0 \n\t" // load xmm0 = (1/alpha00) " \n\t" "mulpd %%xmm0, %%xmm4 \n\t" // xmm4 *= (1/alpha00); " \n\t" "movapd %%xmm4, 0 * 16(%%ebx) \n\t" // store ( beta00 beta01 ) = xmm4 "movlpd %%xmm4, (%%ecx) \n\t" // store ( gamma00 ) = xmm4[0] "movhpd %%xmm4, (%%ecx,%%esi) \n\t" // store ( gamma01 ) = xmm4[1] "addl %%edi, %%ecx \n\t" // c11 += rs_c " \n\t" " \n\t" " \n\t" " \n\t" // iteration 1 " \n\t" "movddup (1+0*4)*8(%%eax), %%xmm0 \n\t" // load xmm0 = alpha10 "movddup (1+1*4)*8(%%eax), %%xmm1 \n\t" // load xmm1 = (1/alpha11) " \n\t" "mulpd %%xmm4, %%xmm0 \n\t" // xmm0 = alpha10 * ( beta00 beta01 ) "subpd %%xmm0, %%xmm5 \n\t" // xmm5 -= xmm0 "mulpd %%xmm1, %%xmm5 \n\t" // xmm5 *= (1/alpha11); " \n\t" "movapd %%xmm5, 1 * 16(%%ebx) \n\t" // store ( beta10 beta11 ) = xmm5 "movlpd %%xmm5, (%%ecx) \n\t" // store ( gamma10 ) = xmm5[0] "movhpd %%xmm5, (%%ecx,%%esi) \n\t" // store ( gamma11 ) = xmm5[1] "addl %%edi, %%ecx \n\t" // c11 += rs_c " \n\t" " \n\t" " \n\t" " \n\t" // iteration 2 " \n\t" "movddup (2+0*4)*8(%%eax), %%xmm0 \n\t" // load xmm0 = alpha20 "movddup (2+1*4)*8(%%eax), %%xmm1 \n\t" // load xmm1 = alpha21 "movddup (2+2*4)*8(%%eax), %%xmm2 \n\t" // load xmm2 = (1/alpha22) " \n\t" "mulpd %%xmm4, %%xmm0 \n\t" // xmm0 = alpha20 * ( beta00 beta01 ) "mulpd %%xmm5, %%xmm1 \n\t" // xmm1 = alpha21 * ( beta10 beta11 ) "addpd %%xmm1, 
%%xmm0 \n\t" // xmm0 += xmm1; "subpd %%xmm0, %%xmm6 \n\t" // xmm6 -= xmm0 "mulpd %%xmm2, %%xmm6 \n\t" // xmm6 *= (1/alpha22); " \n\t" "movapd %%xmm6, 2 * 16(%%ebx) \n\t" // store ( beta20 beta21 ) = xmm6 "movlpd %%xmm6, (%%ecx) \n\t" // store ( gamma20 ) = xmm6[0] "movhpd %%xmm6, (%%ecx,%%esi) \n\t" // store ( gamma21 ) = xmm6[1] "addl %%edi, %%ecx \n\t" // c11 += rs_c " \n\t" " \n\t" " \n\t" " \n\t" // iteration 3 " \n\t" "movddup (3+0*4)*8(%%eax), %%xmm0 \n\t" // load xmm0 = alpha30 "movddup (3+1*4)*8(%%eax), %%xmm1 \n\t" // load xmm1 = alpha31 "movddup (3+2*4)*8(%%eax), %%xmm2 \n\t" // load xmm2 = alpha32 "movddup (3+3*4)*8(%%eax), %%xmm3 \n\t" // load xmm3 = (1/alpha33) " \n\t" "mulpd %%xmm4, %%xmm0 \n\t" // xmm0 = alpha30 * ( beta00 beta01 ) "mulpd %%xmm5, %%xmm1 \n\t" // xmm1 = alpha31 * ( beta10 beta11 ) "mulpd %%xmm6, %%xmm2 \n\t" // xmm2 = alpha32 * ( beta20 beta21 ) "addpd %%xmm1, %%xmm0 \n\t" // xmm0 += xmm1 "addpd %%xmm2, %%xmm0 \n\t" // xmm0 += xmm2 "subpd %%xmm0, %%xmm7 \n\t" // xmm7 -= xmm0 "mulpd %%xmm3, %%xmm7 \n\t" // xmm7 *= (1/alpha33); " \n\t" "movapd %%xmm7, 3 * 16(%%ebx) \n\t" // store ( beta30 beta31 ) = xmm7 "movlpd %%xmm7, (%%ecx) \n\t" // store ( gamma30 ) = xmm7[0] "movhpd %%xmm7, (%%ecx,%%esi) \n\t" // store ( gamma31 ) = xmm7[1] " \n\t" " \n\t" " \n\t" ".UPDATEBD11: \n\t" " \n\t" " \n\t" "movl %2, %%edx \n\t" " \n\t" "movddup %%xmm4, %%xmm0 \n\t" "movddup %%xmm5, %%xmm1 \n\t" "movddup %%xmm6, %%xmm2 \n\t" "movddup %%xmm7, %%xmm3 \n\t" " \n\t" "unpckhpd %%xmm4, %%xmm4 \n\t" "unpckhpd %%xmm5, %%xmm5 \n\t" "unpckhpd %%xmm6, %%xmm6 \n\t" "unpckhpd %%xmm7, %%xmm7 \n\t" " \n\t" "movapd %%xmm0, 0 * 16(%%edx) \n\t" "movapd %%xmm4, 1 * 16(%%edx) \n\t" "movapd %%xmm1, 2 * 16(%%edx) \n\t" "movapd %%xmm5, 3 * 16(%%edx) \n\t" "movapd %%xmm2, 4 * 16(%%edx) \n\t" "movapd %%xmm6, 5 * 16(%%edx) \n\t" "movapd %%xmm3, 6 * 16(%%edx) \n\t" "movapd %%xmm7, 7 * 16(%%edx) \n\t" " \n\t" " \n\t" : // output operands (none) : // input operands "m" (a11), "m" 
(b11), "m" (bd11), "m" (c11), "m" (rs_c), "m" (cs_c) : // register clobber list "eax", "ebx", "ecx", "edx", "esi", "edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory" ); } void bli_ctrsm_l_opt_d4x2( scomplex* restrict a11, scomplex* restrict b11, scomplex* restrict bd11, scomplex* restrict c11, inc_t rs_c, inc_t cs_c ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } void bli_ztrsm_l_opt_d4x2( dcomplex* restrict a11, dcomplex* restrict b11, dcomplex* restrict bd11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } cython-blis-0.9.1/blis/_src/kernels/penryn/000077500000000000000000000000001427272030600206145ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/penryn/1/000077500000000000000000000000001427272030600207545ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/penryn/1/bli_axpyv_penryn_int.c000066400000000000000000000106701427272030600253660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "pmmintrin.h" #include "blis.h" typedef union { __m128d v; double d[2]; } v2df_t; void bli_daxpyv_penryn_int ( conj_t conjx, dim_t n, double* restrict alpha, double* restrict x, inc_t incx, double* restrict y, inc_t incy, cntx_t* restrict cntx ) { double* restrict alpha_cast = alpha; double* restrict x_cast = x; double* restrict y_cast = y; dim_t i; const dim_t n_elem_per_reg = 2; const dim_t n_iter_unroll = 4; dim_t n_pre; dim_t n_run; dim_t n_left; double* restrict x1; double* restrict y1; double alpha1c, x1c; v2df_t alpha1v; v2df_t x1v, x2v, x3v, x4v; v2df_t y1v, y2v, y3v, y4v; bool use_ref = FALSE; if ( bli_zero_dim1( n ) ) return; n_pre = 0; // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( incx != 1 || incy != 1 ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( ( siz_t )x, 16 ) || bli_is_unaligned_to( ( siz_t )y, 16 ) ) { use_ref = TRUE; if ( bli_is_unaligned_to( ( siz_t )x, 16 ) && bli_is_unaligned_to( ( siz_t )y, 16 ) ) { use_ref = FALSE; n_pre = 1; } } // Call the reference implementation if needed. 
if ( use_ref == TRUE ) { daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); f ( conjx, n, alpha, x, incx, y, incy, cntx ); return; } n_run = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll ); n_left = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll ); alpha1c = *alpha_cast; x1 = x_cast; y1 = y_cast; if ( n_pre == 1 ) { x1c = *x1; *y1 += alpha1c * x1c; x1 += incx; y1 += incy; } alpha1v.v = _mm_loaddup_pd( ( double* )&alpha1c ); for ( i = 0; i < n_run; ++i ) { y1v.v = _mm_load_pd( ( double* )y1 ); x1v.v = _mm_load_pd( ( double* )x1 ); y1v.v += alpha1v.v * x1v.v; _mm_store_pd( ( double* )(y1 ), y1v.v ); y2v.v = _mm_load_pd( ( double* )(y1 + 2) ); x2v.v = _mm_load_pd( ( double* )(x1 + 2) ); y2v.v += alpha1v.v * x2v.v; _mm_store_pd( ( double* )(y1 + 2), y2v.v ); y3v.v = _mm_load_pd( ( double* )(y1 + 4) ); x3v.v = _mm_load_pd( ( double* )(x1 + 4) ); y3v.v += alpha1v.v * x3v.v; _mm_store_pd( ( double* )(y1 + 4), y3v.v ); y4v.v = _mm_load_pd( ( double* )(y1 + 6) ); x4v.v = _mm_load_pd( ( double* )(x1 + 6) ); y4v.v += alpha1v.v * x4v.v; _mm_store_pd( ( double* )(y1 + 6), y4v.v ); x1 += n_elem_per_reg * n_iter_unroll; y1 += n_elem_per_reg * n_iter_unroll; } if ( n_left > 0 ) { for ( i = 0; i < n_left; ++i ) { x1c = *x1; *y1 += alpha1c * x1c; x1 += incx; y1 += incy; } } } cython-blis-0.9.1/blis/_src/kernels/penryn/1/bli_dotv_penryn_int.c000066400000000000000000000076341427272030600252010ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "pmmintrin.h" #include "blis.h" typedef union { __m128d v; double d[2]; } v2df_t; void bli_ddotv_penryn_int ( conj_t conjx, conj_t conjy, dim_t n, double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict rho, cntx_t* restrict cntx ) { double* restrict x_cast = x; double* restrict y_cast = y; double* restrict rho_cast = rho; dim_t i; dim_t n_pre; dim_t n_run; dim_t n_left; double* restrict x1; double* restrict y1; double rho1; double x1c, y1c; v2df_t rho1v; v2df_t x1v, y1v; bool use_ref = FALSE; // If the vector lengths are zero, set rho to zero and return. if ( bli_zero_dim1( n ) ) { PASTEMAC(d,set0s)( *rho_cast ); return; } n_pre = 0; // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. 
if ( incx != 1 || incy != 1 ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( ( siz_t )x, 16 ) || bli_is_unaligned_to( ( siz_t )y, 16 ) ) { use_ref = TRUE; if ( bli_is_unaligned_to( ( siz_t )x, 16 ) && bli_is_unaligned_to( ( siz_t )y, 16 ) ) { use_ref = FALSE; n_pre = 1; } } // Call the reference implementation if needed. if ( use_ref == TRUE ) { ddotv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_DOTV_KER, cntx ); f ( conjx, conjy, n, x, incx, y, incy, rho, cntx ); return; } n_run = ( n - n_pre ) / 2; n_left = ( n - n_pre ) % 2; x1 = x_cast; y1 = y_cast; PASTEMAC(d,set0s)( rho1 ); if ( n_pre == 1 ) { x1c = *x1; y1c = *y1; rho1 += x1c * y1c; x1 += incx; y1 += incy; } rho1v.v = _mm_setzero_pd(); for ( i = 0; i < n_run; ++i ) { x1v.v = _mm_load_pd( ( double* )x1 ); y1v.v = _mm_load_pd( ( double* )y1 ); rho1v.v += x1v.v * y1v.v; //x1 += 2*incx; //y1 += 2*incy; x1 += 2; y1 += 2; } rho1 += rho1v.d[0] + rho1v.d[1]; if ( n_left > 0 ) { for ( i = 0; i < n_left; ++i ) { x1c = *x1; y1c = *y1; rho1 += x1c * y1c; x1 += incx; y1 += incy; } } PASTEMAC(d,copys)( rho1, *rho_cast ); } cython-blis-0.9.1/blis/_src/kernels/penryn/1f/000077500000000000000000000000001427272030600211225ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/penryn/1f/bli_axpy2v_penryn_int.c000066400000000000000000000153301427272030600256140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "pmmintrin.h" #include "blis.h" typedef union { __m128d v; double d[2]; } v2df_t; void bli_daxpy2v_penryn_int ( conj_t conjx, conj_t conjy, dim_t n, double* restrict alpha, double* restrict beta, double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict z, inc_t incz, cntx_t* restrict cntx ) { double* restrict alpha_cast = alpha; double* restrict beta_cast = beta; double* restrict x_cast = x; double* restrict y_cast = y; double* restrict z_cast = z; dim_t i; const dim_t n_elem_per_reg = 2; const dim_t n_iter_unroll = 4; dim_t n_pre; dim_t n_run; dim_t n_left; double* restrict x1; double* restrict y1; double* restrict z1; double alphac, betac, x1c, y1c; v2df_t alphav, betav; v2df_t x1v, y1v, z1v; v2df_t x2v, y2v, z2v; bool use_ref = FALSE; if ( bli_zero_dim1( n ) ) return; n_pre = 0; // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. 
if ( incx != 1 || incy != 1 || incz != 1 ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( ( siz_t )x, 16 ) || bli_is_unaligned_to( ( siz_t )y, 16 ) || bli_is_unaligned_to( ( siz_t )z, 16 ) ) { use_ref = TRUE; if ( bli_is_unaligned_to( ( siz_t )x, 16 ) && bli_is_unaligned_to( ( siz_t )y, 16 ) && bli_is_unaligned_to( ( siz_t )z, 16 ) ) { use_ref = FALSE; n_pre = 1; } } // Call the reference implementation if needed. if ( use_ref == TRUE ) { daxpy2v_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPY2V_KER, cntx ); f ( conjx, conjy, n, alpha, beta, x, incx, y, incy, z, incz, cntx ); return; } n_run = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll ); n_left = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll ); alphac = *alpha_cast; betac = *beta_cast; x1 = x_cast; y1 = y_cast; z1 = z_cast; if ( n_pre == 1 ) { x1c = *x1; y1c = *y1; *z1 += alphac * x1c + betac * y1c; x1 += incx; y1 += incy; z1 += incz; } alphav.v = _mm_loaddup_pd( ( double* )alpha_cast ); betav.v = _mm_loaddup_pd( ( double* )beta_cast ); for ( i = 0; i < n_run; ++i ) { /* z1v.v = _mm_load_pd( ( double* )z1 + 0*n_elem_per_reg ); x1v.v = _mm_load_pd( ( double* )x1 + 0*n_elem_per_reg ); y1v.v = _mm_load_pd( ( double* )y1 + 0*n_elem_per_reg ); z1v.v += alphav.v * x1v.v; z1v.v += betav.v * y1v.v; _mm_store_pd( ( double* )(z1 + 0*n_elem_per_reg ), z1v.v ); z1v.v = _mm_load_pd( ( double* )z1 + 1*n_elem_per_reg ); x1v.v = _mm_load_pd( ( double* )x1 + 1*n_elem_per_reg ); y1v.v = _mm_load_pd( ( double* )y1 + 1*n_elem_per_reg ); z1v.v += alphav.v * x1v.v; z1v.v += betav.v * y1v.v; _mm_store_pd( ( double* )(z1 + 1*n_elem_per_reg ), z1v.v ); */ /* z1v.v = _mm_load_pd( ( double* )z1 + 0*n_elem_per_reg ); x1v.v = _mm_load_pd( ( double* )x1 + 0*n_elem_per_reg ); y1v.v = _mm_load_pd( ( double* )y1 + 0*n_elem_per_reg ); z2v.v = _mm_load_pd( ( double* )z1 + 1*n_elem_per_reg ); x2v.v = _mm_load_pd( ( double* )x1 + 1*n_elem_per_reg ); y2v.v = _mm_load_pd( ( double* )y1 + 1*n_elem_per_reg ); z1v.v += alphav.v * 
x1v.v; z1v.v += betav.v * y1v.v; _mm_store_pd( ( double* )(z1 + 0*n_elem_per_reg ), z1v.v ); z2v.v += alphav.v * x2v.v; z2v.v += betav.v * y2v.v; _mm_store_pd( ( double* )(z1 + 1*n_elem_per_reg ), z2v.v ); */ z1v.v = _mm_load_pd( ( double* )z1 + 0*n_elem_per_reg ); x1v.v = _mm_load_pd( ( double* )x1 + 0*n_elem_per_reg ); y1v.v = _mm_load_pd( ( double* )y1 + 0*n_elem_per_reg ); z2v.v = _mm_load_pd( ( double* )z1 + 1*n_elem_per_reg ); x2v.v = _mm_load_pd( ( double* )x1 + 1*n_elem_per_reg ); y2v.v = _mm_load_pd( ( double* )y1 + 1*n_elem_per_reg ); z1v.v += alphav.v * x1v.v; z1v.v += betav.v * y1v.v; _mm_store_pd( ( double* )(z1 + 0*n_elem_per_reg ), z1v.v ); z1v.v = _mm_load_pd( ( double* )z1 + 2*n_elem_per_reg ); x1v.v = _mm_load_pd( ( double* )x1 + 2*n_elem_per_reg ); y1v.v = _mm_load_pd( ( double* )y1 + 2*n_elem_per_reg ); z2v.v += alphav.v * x2v.v; z2v.v += betav.v * y2v.v; _mm_store_pd( ( double* )(z1 + 1*n_elem_per_reg ), z2v.v ); z2v.v = _mm_load_pd( ( double* )z1 + 3*n_elem_per_reg ); x2v.v = _mm_load_pd( ( double* )x1 + 3*n_elem_per_reg ); y2v.v = _mm_load_pd( ( double* )y1 + 3*n_elem_per_reg ); z1v.v += alphav.v * x1v.v; z1v.v += betav.v * y1v.v; _mm_store_pd( ( double* )(z1 + 2*n_elem_per_reg ), z1v.v ); z2v.v += alphav.v * x2v.v; z2v.v += betav.v * y2v.v; _mm_store_pd( ( double* )(z1 + 3*n_elem_per_reg ), z2v.v ); x1 += n_elem_per_reg * n_iter_unroll; y1 += n_elem_per_reg * n_iter_unroll; z1 += n_elem_per_reg * n_iter_unroll; } if ( n_left > 0 ) { for ( i = 0; i < n_left; ++i ) { x1c = *x1; y1c = *y1; *z1 += alphac * x1c + betac * y1c; x1 += incx; y1 += incy; z1 += incz; } } } cython-blis-0.9.1/blis/_src/kernels/penryn/1f/bli_axpyf_penryn_int.c000066400000000000000000000142611427272030600255140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "pmmintrin.h" #include "blis.h" typedef union { __m128d v; double d[2]; } v2df_t; void bli_daxpyf_penryn_int ( conj_t conja, conj_t conjx, dim_t m, dim_t b_n, double* restrict alpha, double* restrict a, inc_t inca, inc_t lda, double* restrict x, inc_t incx, double* restrict y, inc_t incy, cntx_t* restrict cntx ) { double* restrict alpha_cast = alpha; double* restrict a_cast = a; double* restrict x_cast = x; double* restrict y_cast = y; dim_t i; const dim_t n_elem_per_reg = 2; const dim_t n_iter_unroll = 2; dim_t m_pre; dim_t m_run; dim_t m_left; double* restrict a0; double* restrict a1; double* restrict a2; double* restrict a3; double* restrict y0; double a0c, a1c, a2c, a3c; double chi0, chi1, chi2, chi3; v2df_t a00v, a01v, a02v, a03v, y0v; v2df_t a10v, a11v, a12v, a13v, y1v; v2df_t chi0v, chi1v, chi2v, chi3v; bool use_ref = FALSE; if ( bli_zero_dim2( m, b_n ) ) return; m_pre = 0; // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( b_n < bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_AF, cntx ) ) { use_ref = TRUE; } else if ( inca != 1 || incx != 1 || incy != 1 || bli_is_unaligned_to( ( siz_t )(lda*sizeof(double)), 16 ) ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( ( siz_t )a, 16 ) || bli_is_unaligned_to( ( siz_t )y, 16 ) ) { use_ref = TRUE; if ( bli_is_unaligned_to( ( siz_t )a, 16 ) && bli_is_unaligned_to( ( siz_t )y, 16 ) ) { use_ref = FALSE; m_pre = 1; } } // Call the reference implementation if needed. 
if ( use_ref == TRUE ) { daxpyf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx ); f ( conja, conjx, m, b_n, alpha_cast, a_cast, inca, lda, x_cast, incx, y_cast, incy, cntx ); return; } m_run = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll ); m_left = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll ); a0 = a_cast + 0*lda; a1 = a_cast + 1*lda; a2 = a_cast + 2*lda; a3 = a_cast + 3*lda; y0 = y_cast; chi0 = *(x_cast + 0*incx); chi1 = *(x_cast + 1*incx); chi2 = *(x_cast + 2*incx); chi3 = *(x_cast + 3*incx); PASTEMAC2(d,d,scals)( *alpha_cast, chi0 ); PASTEMAC2(d,d,scals)( *alpha_cast, chi1 ); PASTEMAC2(d,d,scals)( *alpha_cast, chi2 ); PASTEMAC2(d,d,scals)( *alpha_cast, chi3 ); if ( m_pre == 1 ) { a0c = *a0; a1c = *a1; a2c = *a2; a3c = *a3; *y0 += chi0 * a0c + chi1 * a1c + chi2 * a2c + chi3 * a3c; a0 += inca; a1 += inca; a2 += inca; a3 += inca; y0 += incy; } chi0v.v = _mm_loaddup_pd( ( double* )&chi0 ); chi1v.v = _mm_loaddup_pd( ( double* )&chi1 ); chi2v.v = _mm_loaddup_pd( ( double* )&chi2 ); chi3v.v = _mm_loaddup_pd( ( double* )&chi3 ); for ( i = 0; i < m_run; ++i ) { y0v.v = _mm_load_pd( ( double* )(y0 + 0*n_elem_per_reg) ); a00v.v = _mm_load_pd( ( double* )(a0 + 0*n_elem_per_reg) ); a01v.v = _mm_load_pd( ( double* )(a1 + 0*n_elem_per_reg) ); y0v.v += chi0v.v * a00v.v; y0v.v += chi1v.v * a01v.v; a02v.v = _mm_load_pd( ( double* )(a2 + 0*n_elem_per_reg) ); a03v.v = _mm_load_pd( ( double* )(a3 + 0*n_elem_per_reg) ); y0v.v += chi2v.v * a02v.v; y0v.v += chi3v.v * a03v.v; _mm_store_pd( ( double* )(y0 + 0*n_elem_per_reg), y0v.v ); y1v.v = _mm_load_pd( ( double* )(y0 + 1*n_elem_per_reg) ); a10v.v = _mm_load_pd( ( double* )(a0 + 1*n_elem_per_reg) ); a11v.v = _mm_load_pd( ( double* )(a1 + 1*n_elem_per_reg) ); y1v.v += chi0v.v * a10v.v; y1v.v += chi1v.v * a11v.v; a12v.v = _mm_load_pd( ( double* )(a2 + 1*n_elem_per_reg) ); a13v.v = _mm_load_pd( ( double* )(a3 + 1*n_elem_per_reg) ); y1v.v += chi2v.v * a12v.v; y1v.v += chi3v.v * a13v.v; _mm_store_pd( ( 
double* )(y0 + 1*n_elem_per_reg), y1v.v ); a0 += n_elem_per_reg * n_iter_unroll; a1 += n_elem_per_reg * n_iter_unroll; a2 += n_elem_per_reg * n_iter_unroll; a3 += n_elem_per_reg * n_iter_unroll; y0 += n_elem_per_reg * n_iter_unroll; } if ( m_left > 0 ) { for ( i = 0; i < m_left; ++i ) { a0c = *a0; a1c = *a1; a2c = *a2; a3c = *a3; *y0 += chi0 * a0c + chi1 * a1c + chi2 * a2c + chi3 * a3c; a0 += inca; a1 += inca; a2 += inca; a3 += inca; y0 += incy; } } } cython-blis-0.9.1/blis/_src/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c000066400000000000000000000116111427272030600262370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "pmmintrin.h" #include "blis.h" typedef union { __m128d v; double d[2]; } v2df_t; void bli_ddotaxpyv_penryn_int ( conj_t conjxt, conj_t conjx, conj_t conjy, dim_t n, double* restrict alpha, double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict rho, double* restrict z, inc_t incz, cntx_t* restrict cntx ) { double* restrict alpha_cast = alpha; double* restrict x_cast = x; double* restrict y_cast = y; double* restrict rho_cast = rho; double* restrict z_cast = z; dim_t n_pre; dim_t n_run; dim_t n_left; double* restrict chi1; double* restrict psi1; double* restrict zeta1; double alpha1c, chi1c, psi1c, rho1c; dim_t i; //inc_t stepx, stepy, stepz; v2df_t alphav, rhov; v2df_t x1v, y1v, z1v; bool use_ref = FALSE; // If the vector lengths are zero, set rho to zero and return. if ( bli_zero_dim1( n ) ) { PASTEMAC(d,set0s)( *rho_cast ); return; } n_pre = 0; // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( incx != 1 || incy != 1 || incz != 1 ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( ( siz_t )x, 16 ) || bli_is_unaligned_to( ( siz_t )y, 16 ) || bli_is_unaligned_to( ( siz_t )z, 16 ) ) { use_ref = TRUE; if ( bli_is_unaligned_to( ( siz_t )x, 16 ) && bli_is_unaligned_to( ( siz_t )y, 16 ) && bli_is_unaligned_to( ( siz_t )z, 16 ) ) { use_ref = FALSE; n_pre = 1; } } // Call the reference implementation if needed. 
if ( use_ref == TRUE ) { ddotaxpyv_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_DOTAXPYV_KER, cntx ); f ( conjxt, conjx, conjy, n, alpha, x, incx, y, incy, rho, z, incz, cntx ); return; } n_run = ( n - n_pre ) / ( 2 * 1 ); n_left = ( n - n_pre ) % ( 2 * 1 ); //stepx = 2 * incx; //stepy = 2 * incy; //stepz = 2 * incz; PASTEMAC(d,set0s)( rho1c ); alpha1c = *alpha_cast; chi1 = x_cast; psi1 = y_cast; zeta1 = z_cast; if ( n_pre == 1 ) { chi1c = *chi1; psi1c = *psi1; rho1c += chi1c * psi1c; *zeta1 += alpha1c * chi1c; chi1 += incx; psi1 += incy; zeta1 += incz; } rhov.v = _mm_setzero_pd(); alphav.v = _mm_loaddup_pd( ( double* )alpha_cast ); for ( i = 0; i < n_run; ++i ) { x1v.v = _mm_load_pd( ( double* )chi1 ); y1v.v = _mm_load_pd( ( double* )psi1 ); z1v.v = _mm_load_pd( ( double* )zeta1 ); //y1v.v = _mm_setr_pd( *psi1, *(psi1 + incy) ); //z1v.v = _mm_setr_pd( *zeta1, *(zeta1 + incz) ); rhov.v += x1v.v * y1v.v; z1v.v += alphav.v * x1v.v; _mm_store_pd( ( double* )zeta1, z1v.v ); //chi1 += stepx; //psi1 += stepy; //zeta1 += stepz; chi1 += 2; psi1 += 2; zeta1 += 2; } if ( n_left > 0 ) { for ( i = 0; i < n_left; ++i ) { chi1c = *chi1; psi1c = *psi1; rho1c += chi1c * psi1c; *zeta1 += alpha1c * chi1c; chi1 += incx; psi1 += incy; zeta1 += incz; } } rho1c += rhov.d[0] + rhov.d[1]; *rho_cast = rho1c; } cython-blis-0.9.1/blis/_src/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c000066400000000000000000000225101427272030600264070ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "pmmintrin.h" #include "blis.h" typedef union { __m128d v; double d[2]; } v2df_t; void bli_ddotxaxpyf_penryn_int ( conj_t conjat, conj_t conja, conj_t conjw, conj_t conjx, dim_t m, dim_t b_n, double* restrict alpha, double* restrict a, inc_t inca, inc_t lda, double* restrict w, inc_t incw, double* restrict x, inc_t incx, double* restrict beta, double* restrict y, inc_t incy, double* restrict z, inc_t incz, cntx_t* restrict cntx ) { double* restrict alpha_cast = alpha; double* restrict beta_cast = beta; double* restrict a_cast = a; double* restrict w_cast = w; double* restrict x_cast = x; double* restrict y_cast = y; double* restrict z_cast = z; dim_t i; const dim_t n_elem_per_reg = 2; const dim_t n_iter_unroll = 2; dim_t m_pre; dim_t m_run; dim_t m_left; double* restrict a0; double* restrict a1; double* restrict a2; double* restrict a3; double* restrict w1; double* restrict z1; double rho0, rho1, rho2, rho3; double chi0, chi1, chi2, chi3; double a0c, a1c, a2c, a3c, w1c, z1c; v2df_t rho0v, rho1v, rho2v, rho3v; v2df_t chi0v, chi1v, chi2v, chi3v; //v2df_t a0v, a1v, a2v, a3v, w1v, z1v; v2df_t a00v, a01v, a02v, a03v; v2df_t a10v, a11v, a12v, a13v; v2df_t w1v, z1v; v2df_t w2v, z2v; v2df_t psi0v, psi1v, betav, alphav; bool use_ref = FALSE; if ( bli_zero_dim1( b_n ) ) return; // If the vector lengths are zero, scale y by beta and return. if ( bli_zero_dim1( m ) ) { dscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); f ( BLIS_NO_CONJUGATE, b_n, beta, y, incy, cntx ); return; } m_pre = 0; // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. 
if ( b_n < bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_XF, cntx ) ) { use_ref = TRUE; } else if ( inca != 1 || incw != 1 || incx != 1 || incy != 1 || incz != 1 || bli_is_unaligned_to( ( siz_t )(lda*sizeof(double)), 16 ) ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( ( siz_t )a, 16 ) || bli_is_unaligned_to( ( siz_t )w, 16 ) || bli_is_unaligned_to( ( siz_t )z, 16 ) || bli_is_unaligned_to( ( siz_t )y, 16 ) ) { use_ref = TRUE; if ( bli_is_unaligned_to( ( siz_t )a, 16 ) && bli_is_unaligned_to( ( siz_t )w, 16 ) && bli_is_unaligned_to( ( siz_t )z, 16 ) && bli_is_aligned_to( ( siz_t )y, 16 ) ) // Note: y is not affected by a, w, and z being unaligned. { use_ref = FALSE; m_pre = 1; } } if ( use_ref == TRUE ) { ddotxaxpyf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_DOTXAXPYF_KER, cntx ); f ( conjat, conja, conjw, conjx, m, b_n, alpha_cast, a_cast, inca, lda, w_cast, incw, x_cast, incx, beta_cast, y_cast, incy, z_cast, incz, cntx ); return; } m_run = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll ); m_left = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll ); a0 = a_cast + 0*lda; a1 = a_cast + 1*lda; a2 = a_cast + 2*lda; a3 = a_cast + 3*lda; w1 = w_cast; z1 = z_cast; chi0 = *(x_cast + 0*incx); chi1 = *(x_cast + 1*incx); chi2 = *(x_cast + 2*incx); chi3 = *(x_cast + 3*incx); PASTEMAC2(d,d,scals)( *alpha_cast, chi0 ); PASTEMAC2(d,d,scals)( *alpha_cast, chi1 ); PASTEMAC2(d,d,scals)( *alpha_cast, chi2 ); PASTEMAC2(d,d,scals)( *alpha_cast, chi3 ); PASTEMAC(d,set0s)( rho0 ); PASTEMAC(d,set0s)( rho1 ); PASTEMAC(d,set0s)( rho2 ); PASTEMAC(d,set0s)( rho3 ); if ( m_pre == 1 ) { a0c = *a0; a1c = *a1; a2c = *a2; a3c = *a3; w1c = *w1; z1c = *z1; rho0 += a0c * w1c; rho1 += a1c * w1c; rho2 += a2c * w1c; rho3 += a3c * w1c; z1c += chi0 * a0c + chi1 * a1c + chi2 * a2c + chi3 * a3c; *z1 = z1c; a0 += inca; a1 += inca; a2 += inca; a3 += inca; w1 += incw; z1 += incz; } rho0v.v = _mm_setzero_pd(); rho1v.v = _mm_setzero_pd(); rho2v.v = _mm_setzero_pd(); rho3v.v = _mm_setzero_pd(); 
chi0v.v = _mm_loaddup_pd( ( double* )&chi0 ); chi1v.v = _mm_loaddup_pd( ( double* )&chi1 ); chi2v.v = _mm_loaddup_pd( ( double* )&chi2 ); chi3v.v = _mm_loaddup_pd( ( double* )&chi3 ); /* y = beta * y + alpha * A^T w; */ \ /* z = z + alpha * A x; */ \ //for ( i = 0; i < m_run; ++i ) for ( i = m_run; i != 0; --i ) { z1v.v = _mm_load_pd( ( double* )(z1 + 0*n_elem_per_reg) ); w1v.v = _mm_load_pd( ( double* )(w1 + 0*n_elem_per_reg) ); a00v.v = _mm_load_pd( ( double* )(a0 + 0*n_elem_per_reg) ); //a01v.v = _mm_load_pd( ( double* )(a1 + 0*n_elem_per_reg) ); a01v.v = _mm_load_pd( ( double* )(a0 + 1*lda + 0*n_elem_per_reg) ); rho0v.v += a00v.v * w1v.v; rho1v.v += a01v.v * w1v.v; z1v.v += chi0v.v * a00v.v; z1v.v += chi1v.v * a01v.v; a02v.v = _mm_load_pd( ( double* )(a2 + 0*n_elem_per_reg) ); //a03v.v = _mm_load_pd( ( double* )(a3 + 0*n_elem_per_reg) ); a03v.v = _mm_load_pd( ( double* )(a2 + 1*lda + 0*n_elem_per_reg) ); rho2v.v += a02v.v * w1v.v; rho3v.v += a03v.v * w1v.v; z1v.v += chi2v.v * a02v.v; z1v.v += chi3v.v * a03v.v; _mm_store_pd( ( double* )(z1 + 0*n_elem_per_reg), z1v.v ); z2v.v = _mm_load_pd( ( double* )(z1 + 1*n_elem_per_reg) ); w2v.v = _mm_load_pd( ( double* )(w1 + 1*n_elem_per_reg) ); a10v.v = _mm_load_pd( ( double* )(a0 + 1*n_elem_per_reg) ); //a11v.v = _mm_load_pd( ( double* )(a1 + 1*n_elem_per_reg) ); a11v.v = _mm_load_pd( ( double* )(a0 + 1*lda + 1*n_elem_per_reg) ); rho0v.v += a10v.v * w2v.v; rho1v.v += a11v.v * w2v.v; z2v.v += chi0v.v * a10v.v; z2v.v += chi1v.v * a11v.v; a12v.v = _mm_load_pd( ( double* )(a2 + 1*n_elem_per_reg) ); //a13v.v = _mm_load_pd( ( double* )(a3 + 1*n_elem_per_reg) ); a13v.v = _mm_load_pd( ( double* )(a2 + 1*lda + 1*n_elem_per_reg) ); rho2v.v += a12v.v * w2v.v; rho3v.v += a13v.v * w2v.v; z2v.v += chi2v.v * a12v.v; z2v.v += chi3v.v * a13v.v; _mm_store_pd( ( double* )(z1 + 1*n_elem_per_reg), z2v.v ); a0 += n_elem_per_reg * n_iter_unroll; //a1 += n_elem_per_reg * n_iter_unroll; a2 += n_elem_per_reg * n_iter_unroll; //a3 += 
n_elem_per_reg * n_iter_unroll; w1 += n_elem_per_reg * n_iter_unroll; z1 += n_elem_per_reg * n_iter_unroll; } rho0 += rho0v.d[0] + rho0v.d[1]; rho1 += rho1v.d[0] + rho1v.d[1]; rho2 += rho2v.d[0] + rho2v.d[1]; rho3 += rho3v.d[0] + rho3v.d[1]; if ( m_left > 0 ) { for ( i = 0; i < m_left; ++i ) { a0c = *a0; //a1c = *a1; a1c = *(a0 + lda); a2c = *a2; //a3c = *a3; a3c = *(a2 + lda); w1c = *w1; z1c = *z1; rho0 += a0c * w1c; rho1 += a1c * w1c; rho2 += a2c * w1c; rho3 += a3c * w1c; z1c += chi0 * a0c + chi1 * a1c + chi2 * a2c + chi3 * a3c; *z1 = z1c; a0 += inca; //a1 += inca; a2 += inca; //a3 += inca; w1 += incw; z1 += incz; } } rho0v.d[0] = rho0; rho0v.d[1] = rho1; rho1v.d[0] = rho2; rho1v.d[1] = rho3; betav.v = _mm_loaddup_pd( ( double* ) beta_cast ); alphav.v = _mm_loaddup_pd( ( double* ) alpha_cast ); psi0v.v = _mm_load_pd( ( double* )(y_cast + 0*n_elem_per_reg ) ); psi1v.v = _mm_load_pd( ( double* )(y_cast + 1*n_elem_per_reg ) ); psi0v.v = betav.v * psi0v.v + alphav.v * rho0v.v; psi1v.v = betav.v * psi1v.v + alphav.v * rho1v.v; _mm_store_pd( ( double* )(y_cast + 0*n_elem_per_reg ), psi0v.v ); _mm_store_pd( ( double* )(y_cast + 1*n_elem_per_reg ), psi1v.v ); } cython-blis-0.9.1/blis/_src/kernels/penryn/1f/bli_dotxf_penryn_int.c000066400000000000000000000203251427272030600255070ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "pmmintrin.h" #include "blis.h" typedef union { __m128d v; double d[2]; } v2df_t; void bli_ddotxf_penryn_int ( conj_t conjat, conj_t conjx, dim_t m, dim_t b_n, double* restrict alpha, double* restrict a, inc_t inca, inc_t lda, double* restrict x, inc_t incx, double* restrict beta, double* restrict y, inc_t incy, cntx_t* restrict cntx ) { double* restrict alpha_cast = alpha; double* restrict beta_cast = beta; double* restrict a_cast = a; double* restrict x_cast = x; double* restrict y_cast = y; dim_t i; const dim_t n_elem_per_reg = 2; const dim_t n_iter_unroll = 4; dim_t m_pre; dim_t m_run; dim_t m_left; double* restrict x0; double* restrict x1; double* restrict x2; double* restrict x3; double* restrict y0; double rho0, rho1, rho2, rho3; double x0c, x1c, x2c, x3c, y0c; v2df_t rho0v, rho1v, rho2v, rho3v; v2df_t x0v, x1v, x2v, x3v, y0v, betav, alphav; bool use_ref = FALSE; if ( bli_zero_dim1( b_n ) ) return; // If the vector lengths are zero, scale r by beta and return. 
if ( bli_zero_dim1( m ) ) { dscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); f ( BLIS_NO_CONJUGATE, b_n, beta_cast, y_cast, incy, cntx ); return; } m_pre = 0; // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( b_n < bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_DF, cntx ) ) { use_ref = TRUE; } else if ( inca != 1 || incx != 1 || incy != 1 || bli_is_unaligned_to( ( siz_t )(lda*sizeof(double)), 16 ) ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( ( siz_t )a, 16 ) || bli_is_unaligned_to( ( siz_t )x, 16 ) || bli_is_unaligned_to( ( siz_t )y, 16 ) ) { use_ref = TRUE; if ( bli_is_unaligned_to( ( siz_t )a, 16 ) && bli_is_unaligned_to( ( siz_t )x, 16 ) && bli_is_aligned_to( ( siz_t )y, 16 ) ) // Note: r is not affected by x and y being unaligned. { use_ref = FALSE; m_pre = 1; } } // Call the reference implementation if needed. if ( use_ref == TRUE ) { ddotxf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_DOTXF_KER, cntx ); f ( conjat, conjx, m, b_n, alpha_cast, a_cast, inca, lda, x_cast, incx, beta_cast, y_cast, incy, cntx ); return; } m_run = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll ); m_left = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll ); x0 = a_cast; x1 = a_cast + lda; x2 = a_cast + 2*lda; x3 = a_cast + 3*lda; y0 = x_cast; PASTEMAC(d,set0s)( rho0 ); PASTEMAC(d,set0s)( rho1 ); PASTEMAC(d,set0s)( rho2 ); PASTEMAC(d,set0s)( rho3 ); if ( m_pre == 1 ) { x0c = *x0; x1c = *x1; x2c = *x2; x3c = *x3; y0c = *y0; rho0 += x0c * y0c; rho1 += x1c * y0c; rho2 += x2c * y0c; rho3 += x3c * y0c; x0 += inca; x1 += inca; x2 += inca; x3 += inca; y0 += incx; } rho0v.v = _mm_setzero_pd(); rho1v.v = _mm_setzero_pd(); rho2v.v = _mm_setzero_pd(); rho3v.v = _mm_setzero_pd(); for ( i = 0; i < m_run; ++i ) { x0v.v = _mm_load_pd( ( double* )(x0 + 0*n_elem_per_reg) ); x1v.v = _mm_load_pd( ( double* )(x1 + 0*n_elem_per_reg) ); x2v.v = _mm_load_pd( ( double* )(x2 + 
0*n_elem_per_reg) ); x3v.v = _mm_load_pd( ( double* )(x3 + 0*n_elem_per_reg) ); y0v.v = _mm_load_pd( ( double* )(y0 + 0*n_elem_per_reg) ); rho0v.v += x0v.v * y0v.v; rho1v.v += x1v.v * y0v.v; rho2v.v += x2v.v * y0v.v; rho3v.v += x3v.v * y0v.v; x0v.v = _mm_load_pd( ( double* )(x0 + 1*n_elem_per_reg) ); x1v.v = _mm_load_pd( ( double* )(x1 + 1*n_elem_per_reg) ); x2v.v = _mm_load_pd( ( double* )(x2 + 1*n_elem_per_reg) ); x3v.v = _mm_load_pd( ( double* )(x3 + 1*n_elem_per_reg) ); y0v.v = _mm_load_pd( ( double* )(y0 + 1*n_elem_per_reg) ); rho0v.v += x0v.v * y0v.v; rho1v.v += x1v.v * y0v.v; rho2v.v += x2v.v * y0v.v; rho3v.v += x3v.v * y0v.v; x0v.v = _mm_load_pd( ( double* )(x0 + 2*n_elem_per_reg) ); x1v.v = _mm_load_pd( ( double* )(x1 + 2*n_elem_per_reg) ); x2v.v = _mm_load_pd( ( double* )(x2 + 2*n_elem_per_reg) ); x3v.v = _mm_load_pd( ( double* )(x3 + 2*n_elem_per_reg) ); y0v.v = _mm_load_pd( ( double* )(y0 + 2*n_elem_per_reg) ); rho0v.v += x0v.v * y0v.v; rho1v.v += x1v.v * y0v.v; rho2v.v += x2v.v * y0v.v; rho3v.v += x3v.v * y0v.v; x0v.v = _mm_load_pd( ( double* )(x0 + 3*n_elem_per_reg) ); x1v.v = _mm_load_pd( ( double* )(x1 + 3*n_elem_per_reg) ); x2v.v = _mm_load_pd( ( double* )(x2 + 3*n_elem_per_reg) ); x3v.v = _mm_load_pd( ( double* )(x3 + 3*n_elem_per_reg) ); y0v.v = _mm_load_pd( ( double* )(y0 + 3*n_elem_per_reg) ); rho0v.v += x0v.v * y0v.v; rho1v.v += x1v.v * y0v.v; rho2v.v += x2v.v * y0v.v; rho3v.v += x3v.v * y0v.v; x0 += n_elem_per_reg * n_iter_unroll; x1 += n_elem_per_reg * n_iter_unroll; x2 += n_elem_per_reg * n_iter_unroll; x3 += n_elem_per_reg * n_iter_unroll; y0 += n_elem_per_reg * n_iter_unroll; } rho0 += rho0v.d[0] + rho0v.d[1]; rho1 += rho1v.d[0] + rho1v.d[1]; rho2 += rho2v.d[0] + rho2v.d[1]; rho3 += rho3v.d[0] + rho3v.d[1]; if ( m_left > 0 ) { for ( i = 0; i < m_left; ++i ) { x0c = *x0; x1c = *x1; x2c = *x2; x3c = *x3; y0c = *y0; rho0 += x0c * y0c; rho1 += x1c * y0c; rho2 += x2c * y0c; rho3 += x3c * y0c; x0 += inca; x1 += inca; x2 += inca; x3 += inca; y0 
+= incx; } } /* PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast ) ); \ PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast+1) ); \ PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast+2) ); \ PASTEMAC2(d,d,scals)( *beta_cast, *(y_cast+3) ); \ PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho1, *(y_cast ) ); \ PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho2, *(y_cast+1) ); \ PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho3, *(y_cast+2) ); \ PASTEMAC3(d,d,d,axpys)( *alpha_cast, rho4, *(y_cast+3) ); \ */ rho1v.d[0] = rho0; rho1v.d[1] = rho1; rho3v.d[0] = rho2; rho3v.d[1] = rho3; betav.v = _mm_loaddup_pd( ( double* ) beta_cast ); alphav.v = _mm_loaddup_pd( ( double* ) alpha_cast ); rho0v.v = _mm_load_pd( ( double* )(y_cast + 0*n_elem_per_reg) ); rho2v.v = _mm_load_pd( ( double* )(y_cast + 1*n_elem_per_reg) ); rho0v.v *= betav.v; rho2v.v *= betav.v; rho0v.v += alphav.v * rho1v.v; rho2v.v += alphav.v * rho3v.v; _mm_store_pd( ( double* )(y_cast + 0*n_elem_per_reg), rho0v.v ); _mm_store_pd( ( double* )(y_cast + 1*n_elem_per_reg), rho2v.v ); } cython-blis-0.9.1/blis/_src/kernels/penryn/3/000077500000000000000000000000001427272030600207565ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c000066400000000000000000000621351427272030600260020ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
/*
   bli_sgemm_penryn_asm_8x4

   Single-precision gemm micro-kernel written in SSE inline assembly.
   It accumulates an 8x4 block of products a*b over k and then writes

     C := beta * C + alpha * (A B)

   with a shortcut that skips reading C when the beta test reports zero.

   k is split into k_iter = k / 4 passes of a four-times unrolled loop
   plus k_left = k % 4 single-step passes. Each k step consumes 8 floats
   of a and 4 floats of b, both read with aligned loads (movaps).

   C is read and written with aligned 4-float loads/stores at c and at
   c + 4*rs_c, stepping by cs_c between columns.
   NOTE(review): that addressing only makes sense for column-contiguous,
   16-byte-aligned C; presumably GEMM_UKR_SETUP_CT_ALIGNED arranges this
   (e.g. via a temporary) and GEMM_UKR_FLUSH_CT copies back -- confirm
   against the macro definitions.
   NOTE(review): m, n and cntx are not referenced in the visible body;
   presumably they are consumed by the GEMM_UKR_* macros -- confirm.

   data supplies the address of the next b panel (b_next), which is used
   only as a prefetch target.
*/
void bli_sgemm_penryn_asm_8x4
     (
       dim_t               m,
       dim_t               n,
       dim_t               k,
       float*     restrict alpha,
       float*     restrict a,
       float*     restrict b,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*   a_next = bli_auxinfo_next_a( data );
	void*   b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k / 4;
	uint64_t k_left = k % 4;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	GEMM_UKR_SETUP_CT_ALIGNED( s, 8, 4, false, 16 );

	begin_asm()

	mov(var(a), rax) // load address of a.
	mov(var(b), rbx) // load address of b.
	mov(var(b_next), r9) // load address of b_next.

	// Bias both panel pointers forward by 8*16 bytes so that every load
	// in the unrolled loop can use an offset in the range -8*16 .. -1*16.
	sub(imm(0-8*16), rax) // increment pointers to allow byte
	sub(imm(0-8*16), rbx) // offsets in the unrolled iterations.

	movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements
	movaps(mem(rax, -7*16), xmm1) // of a and b.
	movaps(mem(rbx, -8*16), xmm2)

	mov(var(c), rcx) // load address of c
	mov(var(cs_c), rdi) // load cs_c
	lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
	// r12 is not referenced again in this kernel.
	mov(rdi, r12) // make a copy of cs_c (in bytes)
	lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;

	prefetch(2, mem(r9, 0*4)) // prefetch b_next

	// Zero the pending-product registers (xmm3-xmm6) and the eight
	// accumulators (xmm8-xmm15), interleaved with prefetches of C.
	xorps(xmm3, xmm3)
	xorps(xmm4, xmm4)
	xorps(xmm5, xmm5)
	xorps(xmm6, xmm6)

	prefetch(2, mem(rcx, 6*4)) // prefetch c + 0*cs_c
	xorps(xmm8, xmm8)
	xorps(xmm9, xmm9)
	prefetch(2, mem(rcx, rdi, 1, 6*4)) // prefetch c + 1*cs_c
	xorps(xmm10, xmm10)
	xorps(xmm11, xmm11)
	prefetch(2, mem(r10, 6*4)) // prefetch c + 2*cs_c
	xorps(xmm12, xmm12)
	xorps(xmm13, xmm13)
	prefetch(2, mem(r10, rdi, 1, 6*4)) // prefetch c + 3*cs_c
	xorps(xmm14, xmm14)
	xorps(xmm15, xmm15)

	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.SCONSIDKLEFT) // if i == 0, jump to code that
	// contains the k_left loop.

	// Each k step below is software-pipelined: it begins by adding the
	// products left in xmm3-xmm6 by the previous step into the
	// accumulators, and ends by loading the a/b elements for the next
	// step. pshufd 0x39 rotates the four b values by one lane, which is
	// what produces the diagonal accumulator layout shown after the loop.

	label(.SLOOPKITER) // MAIN LOOP

	prefetch(0, mem(rax, (4*35+1)*8))

	addps(xmm6, xmm10) // iteration 0
	addps(xmm3, xmm14)
	movaps(xmm2, xmm3)
	pshufd(imm(0x39), xmm2, xmm7)
	mulps(xmm0, xmm2)
	mulps(xmm1, xmm3)

	addps(xmm4, xmm11)
	addps(xmm5, xmm15)
	movaps(xmm7, xmm5)
	pshufd(imm(0x39), xmm7, xmm6)
	mulps(xmm0, xmm7)
	mulps(xmm1, xmm5)

	addps(xmm2, xmm8)
	movaps(mem(rbx, -7*16), xmm2)
	addps(xmm3, xmm12)
	movaps(xmm6, xmm3)
	pshufd(imm(0x39), xmm6, xmm4)
	mulps(xmm0, xmm6)
	mulps(xmm1, xmm3)

	addps(xmm7, xmm9)
	addps(xmm5, xmm13)
	movaps(xmm4, xmm5)
	mulps(xmm0, xmm4)
	movaps(mem(rax, -6*16), xmm0)
	mulps(xmm1, xmm5)
	movaps(mem(rax, -5*16), xmm1)

	addps(xmm6, xmm10) // iteration 1
	addps(xmm3, xmm14)
	movaps(xmm2, xmm3)
	pshufd(imm(0x39), xmm2, xmm7)
	mulps(xmm0, xmm2)
	mulps(xmm1, xmm3)

	addps(xmm4, xmm11)
	addps(xmm5, xmm15)
	movaps(xmm7, xmm5)
	pshufd(imm(0x39), xmm7, xmm6)
	mulps(xmm0, xmm7)
	mulps(xmm1, xmm5)

	addps(xmm2, xmm8)
	movaps(mem(rbx, -6*16), xmm2)
	addps(xmm3, xmm12)
	movaps(xmm6, xmm3)
	pshufd(imm(0x39), xmm6, xmm4)
	mulps(xmm0, xmm6)
	mulps(xmm1, xmm3)

	addps(xmm7, xmm9)
	addps(xmm5, xmm13)
	movaps(xmm4, xmm5)
	mulps(xmm0, xmm4)
	movaps(mem(rax, -4*16), xmm0)
	mulps(xmm1, xmm5)
	movaps(mem(rax, -3*16), xmm1)

	addps(xmm6, xmm10) // iteration 2
	addps(xmm3, xmm14)
	movaps(xmm2, xmm3)
	pshufd(imm(0x39), xmm2, xmm7)
	mulps(xmm0, xmm2)
	mulps(xmm1, xmm3)

	addps(xmm4, xmm11)
	addps(xmm5, xmm15)
	movaps(xmm7, xmm5)
	pshufd(imm(0x39), xmm7, xmm6)
	mulps(xmm0, xmm7)
	mulps(xmm1, xmm5)

	addps(xmm2, xmm8)
	movaps(mem(rbx, -5*16), xmm2)
	addps(xmm3, xmm12)
	movaps(xmm6, xmm3)
	pshufd(imm(0x39), xmm6, xmm4)
	mulps(xmm0, xmm6)
	mulps(xmm1, xmm3)

	addps(xmm7, xmm9)
	addps(xmm5, xmm13)
	movaps(xmm4, xmm5)
	mulps(xmm0, xmm4)
	movaps(mem(rax, -2*16), xmm0)
	mulps(xmm1, xmm5)
	movaps(mem(rax, -1*16), xmm1)

	addps(xmm6, xmm10) // iteration 3
	addps(xmm3, xmm14)
	movaps(xmm2, xmm3)
	pshufd(imm(0x39), xmm2, xmm7)
	mulps(xmm0, xmm2)
	mulps(xmm1, xmm3)

	sub(imm(0-4*8*4), rax) // a += 4*8 (unroll x mr)

	addps(xmm4, xmm11)
	addps(xmm5, xmm15)
	movaps(xmm7, xmm5)
	pshufd(imm(0x39), xmm7, xmm6)
	mulps(xmm0, xmm7)
	mulps(xmm1, xmm5)

	sub(imm(0-4*4*4), r9) // b_next += 4*4 (unroll x nr)

	addps(xmm2, xmm8)
	movaps(mem(rbx, -4*16), xmm2)
	addps(xmm3, xmm12)
	movaps(xmm6, xmm3)
	pshufd(imm(0x39), xmm6, xmm4)
	mulps(xmm0, xmm6)
	mulps(xmm1, xmm3)

	sub(imm(0-4*4*4), rbx) // b += 4*4 (unroll x nr)

	addps(xmm7, xmm9)
	addps(xmm5, xmm13)
	movaps(xmm4, xmm5)
	mulps(xmm0, xmm4)
	movaps(mem(rax, -8*16), xmm0)
	mulps(xmm1, xmm5)
	movaps(mem(rax, -7*16), xmm1)

	prefetch(2, mem(r9, 0*4)) // prefetch b_next[0]
	prefetch(2, mem(r9, 16*4)) // prefetch b_next[16]

	dec(rsi) // i -= 1;
	jne(.SLOOPKITER) // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi) // i = k_left;
	test(rsi, rsi) // check i via logical AND.
	je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
	// else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT) // EDGE LOOP

	addps(xmm6, xmm10) // iteration 0
	addps(xmm3, xmm14)
	movaps(xmm2, xmm3)
	pshufd(imm(0x39), xmm2, xmm7)
	mulps(xmm0, xmm2)
	mulps(xmm1, xmm3)

	addps(xmm4, xmm11)
	addps(xmm5, xmm15)
	movaps(xmm7, xmm5)
	pshufd(imm(0x39), xmm7, xmm6)
	mulps(xmm0, xmm7)
	mulps(xmm1, xmm5)

	addps(xmm2, xmm8)
	movaps(mem(rbx, -7*16), xmm2)
	addps(xmm3, xmm12)
	movaps(xmm6, xmm3)
	pshufd(imm(0x39), xmm6, xmm4)
	mulps(xmm0, xmm6)
	mulps(xmm1, xmm3)

	addps(xmm7, xmm9)
	addps(xmm5, xmm13)
	movaps(xmm4, xmm5)
	mulps(xmm0, xmm4)
	movaps(mem(rax, -6*16), xmm0)
	mulps(xmm1, xmm5)
	movaps(mem(rax, -5*16), xmm1)

	sub(imm(0-1*8*4), rax) // a += 8 (1 x mr)
	sub(imm(0-1*4*4), rbx) // b += 4 (1 x nr)

	dec(rsi) // i -= 1;
	jne(.SLOOPKLEFT) // iterate again if i != 0.

	label(.SPOSTACCUM)

	// Fold in the products still pending from the final k step.
	addps(xmm6, xmm10)
	addps(xmm3, xmm14)
	addps(xmm4, xmm11)
	addps(xmm5, xmm15)

	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	movss(mem(rax), xmm6) // load alpha to bottom 4 bytes of xmm6
	movss(mem(rbx), xmm7) // load beta to bottom 4 bytes of xmm7
	pshufd(imm(0x00), xmm6, xmm6) // populate xmm6 with four alphas
	pshufd(imm(0x00), xmm7, xmm7) // populate xmm7 with four betas

	mov(var(rs_c), rsi) // load rs_c
	// r8 and r11 are not referenced again in this kernel.
	mov(rsi, r8) // make a copy of rs_c

	lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)
	lea(mem(rsi, rsi, 2), r11) // r11 = 3*(rs_c * sizeof(float))

	lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;

	// Accumulator layout before the shuffles (diagonal form):
	//
	// xmm8:   xmm9:   xmm10:  xmm11:
	// ( ab00  ( ab01  ( ab02  ( ab03
	//   ab11    ab12    ab13    ab10
	//   ab22    ab23    ab20    ab21
	//   ab33 )  ab30 )  ab31 )  ab32 )
	//
	// xmm12:  xmm13:  xmm14:  xmm15:
	// ( ab40  ( ab41  ( ab42  ( ab43
	//   ab51    ab52    ab53    ab50
	//   ab62    ab63    ab60    ab61
	//   ab73 )  ab70 )  ab71 )  ab72 )

	movaps(xmm9, xmm4)
	shufps(imm(0xd8), xmm8, xmm9)
	shufps(imm(0xd8), xmm11, xmm8)
	shufps(imm(0xd8), xmm10, xmm11)
	shufps(imm(0xd8), xmm4, xmm10)

	movaps(xmm8, xmm4)
	shufps(imm(0xd8), xmm10, xmm8)
	shufps(imm(0xd8), xmm4, xmm10)
	movaps(xmm9, xmm5)
	shufps(imm(0xd8), xmm11, xmm9)
	shufps(imm(0xd8), xmm5, xmm11)

	movaps(xmm13, xmm4)
	shufps(imm(0xd8), xmm12, xmm13)
	shufps(imm(0xd8), xmm15, xmm12)
	shufps(imm(0xd8), xmm14, xmm15)
	shufps(imm(0xd8), xmm4, xmm14)

	movaps(xmm12, xmm4)
	shufps(imm(0xd8), xmm14, xmm12)
	shufps(imm(0xd8), xmm4, xmm14)
	movaps(xmm13, xmm5)
	shufps(imm(0xd8), xmm15, xmm13)
	shufps(imm(0xd8), xmm5, xmm15)

	// Accumulator layout after the shuffles (one column of the block
	// per register pair):
	//
	// xmm8:   xmm9:   xmm10:  xmm11:
	// ( ab00  ( ab01  ( ab02  ( ab03
	//   ab10    ab11    ab12    ab13
	//   ab20    ab21    ab22    ab23
	//   ab30 )  ab31 )  ab32 )  ab33 )
	//
	// xmm12:  xmm13:  xmm14:  xmm15:
	// ( ab40  ( ab41  ( ab42  ( ab43
	//   ab50    ab51    ab52    ab53
	//   ab60    ab61    ab62    ab63
	//   ab70 )  ab71 )  ab72 )  ab73 )

	// now avoid loading C if beta == 0

	// NOTE(review): xorpd/ucomisd are double-precision instructions, but
	// xmm7 holds four single-precision betas, so this compares the low
	// 64 bits (two packed floats) as one double. A single-precision
	// compare (xorps/ucomiss) looks intended -- confirm before changing,
	// since that would also alter how a NaN or -0.0f beta is routed.
	xorpd(xmm0, xmm0) // set xmm0 to zero.
	ucomisd(xmm0, xmm7) // check if beta == 0.
	je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case

	// General case: rcx walks the top four rows and rdx the bottom four
	// rows of each column; both advance by cs_c (in bytes) per column.

	movaps(mem(rcx), xmm0) // load c00 ~ c30,
	mulps(xmm6, xmm8) // scale by alpha,
	mulps(xmm7, xmm0) // scale by beta,
	addps(xmm8, xmm0) // add the gemm result,
	movaps(xmm0, mem(rcx)) // and store back to memory.
	add(rdi, rcx)

	movaps(mem(rdx), xmm1) // load c40 ~ c70,
	mulps(xmm6, xmm12) // scale by alpha,
	mulps(xmm7, xmm1) // scale by beta,
	addps(xmm12, xmm1) // add the gemm result,
	movaps(xmm1, mem(rdx)) // and store back to memory.
	add(rdi, rdx)

	movaps(mem(rcx), xmm0) // load c01 ~ c31,
	mulps(xmm6, xmm9) // scale by alpha,
	mulps(xmm7, xmm0) // scale by beta,
	addps(xmm9, xmm0) // add the gemm result,
	movaps(xmm0, mem(rcx)) // and store back to memory.
	add(rdi, rcx)

	movaps(mem(rdx), xmm1) // load c41 ~ c71,
	mulps(xmm6, xmm13) // scale by alpha,
	mulps(xmm7, xmm1) // scale by beta,
	addps(xmm13, xmm1) // add the gemm result,
	movaps(xmm1, mem(rdx)) // and store back to memory.
	add(rdi, rdx)

	movaps(mem(rcx), xmm0) // load c02 ~ c32,
	mulps(xmm6, xmm10) // scale by alpha,
	mulps(xmm7, xmm0) // scale by beta,
	addps(xmm10, xmm0) // add the gemm result,
	movaps(xmm0, mem(rcx)) // and store back to memory.
	add(rdi, rcx)

	movaps(mem(rdx), xmm1) // load c42 ~ c72,
	mulps(xmm6, xmm14) // scale by alpha,
	mulps(xmm7, xmm1) // scale by beta,
	addps(xmm14, xmm1) // add the gemm result,
	movaps(xmm1, mem(rdx)) // and store back to memory.
	add(rdi, rdx)

	movaps(mem(rcx), xmm0) // load c03 ~ c33,
	mulps(xmm6, xmm11) // scale by alpha,
	mulps(xmm7, xmm0) // scale by beta,
	addps(xmm11, xmm0) // add the gemm result,
	movaps(xmm0, mem(rcx)) // and store back to memory.

	movaps(mem(rdx), xmm1) // load c43 ~ c73,
	mulps(xmm6, xmm15) // scale by alpha,
	mulps(xmm7, xmm1) // scale by beta,
	addps(xmm15, xmm1) // add the gemm result,
	movaps(xmm1, mem(rdx)) // and store back to memory.

	jmp(.SDONE) // jump to end.

	label(.SBETAZERO)

	// beta == 0 case: C is overwritten with alpha * (A B) without being
	// read first.

	// skip loading c00 ~ c30,
	mulps(xmm6, xmm8) // scale by alpha,
	movaps(xmm8, mem(rcx)) // and store back to memory.
	add(rdi, rcx)
	// skip loading c40 ~ c70,
	mulps(xmm6, xmm12) // scale by alpha,
	movaps(xmm12, mem(rdx)) // and store back to memory.
	add(rdi, rdx)

	// skip loading c01 ~ c31,
	mulps(xmm6, xmm9) // scale by alpha,
	movaps(xmm9, mem(rcx)) // and store back to memory.
	add(rdi, rcx)
	// skip loading c41 ~ c71,
	mulps(xmm6, xmm13) // scale by alpha,
	movaps(xmm13, mem(rdx)) // and store back to memory.
	add(rdi, rdx)

	// skip loading c02 ~ c32,
	mulps(xmm6, xmm10) // scale by alpha,
	movaps(xmm10, mem(rcx)) // and store back to memory.
	add(rdi, rcx)
	// skip loading c42 ~ c72,
	mulps(xmm6, xmm14) // scale by alpha,
	movaps(xmm14, mem(rdx)) // and store back to memory.
	add(rdi, rdx)

	// skip loading c03 ~ c33,
	mulps(xmm6, xmm11) // scale by alpha,
	movaps(xmm11, mem(rcx)) // and store back to memory.

	// skip loading c43 ~ c73,
	mulps(xmm6, xmm15) // scale by alpha,
	movaps(xmm15, mem(rdx)) // and store back to memory.

	label(.SDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter), // 0
	  [k_left] "m" (k_left), // 1
	  [a] "m" (a), // 2
	  [b] "m" (b), // 3
	  [alpha] "m" (alpha), // 4
	  [beta] "m" (beta), // 5
	  [c] "m" (c), // 6
	  [rs_c] "m" (rs_c), // 7
	  [cs_c] "m" (cs_c), // 8
	  [b_next] "m" (b_next)/*, // 9
	  [a_next] "m" (a_next)*/ // 10
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	GEMM_UKR_FLUSH_CT( s );
}
uint64_t k_iter = k / 4; uint64_t k_left = k % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT_ALIGNED( d, 4, 4, false, 16 ); begin_asm() mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. mov(var(b_next), r9) // load address of b_next. mov(var(a_next), r11) // load address of a_next. sub(imm(0-8*16), rax) // increment pointers to allow byte sub(imm(0-8*16), rbx) // offsets in the unrolled iterations. movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements movaps(mem(rax, -7*16), xmm1) // of a and b. movaps(mem(rbx, -8*16), xmm2) mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) mov(rdi, r12) // make a copy of cs_c (in bytes) lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c; prefetch(2, mem(r9, 0*8)) // prefetch b_next xorpd(xmm3, xmm3) xorpd(xmm4, xmm4) xorpd(xmm5, xmm5) xorpd(xmm6, xmm6) prefetch(2, mem(rcx, 3*8)) // prefetch c + 0*cs_c xorpd(xmm8, xmm8) xorpd(xmm9, xmm9) prefetch(2, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c xorpd(xmm10, xmm10) xorpd(xmm11, xmm11) prefetch(2, mem(r10, 3*8)) // prefetch c + 2*cs_c xorpd(xmm12, xmm12) xorpd(xmm13, xmm13) prefetch(2, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c xorpd(xmm14, xmm14) xorpd(xmm15, xmm15) mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.DLOOPKITER) // MAIN LOOP prefetch(0, mem(rax, (4*35+1)*8)) //prefetch(0, mem(rax, (8*97+4)*8)) //prefetch(0, mem(r11, 67*4*8)) // prefetch a_next[0] addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) movaps(xmm2, xmm4) pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) movaps(xmm3, xmm4) pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) mulpd(xmm0, xmm5) movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) addpd(xmm3, xmm11) // iteration 1 movaps(mem(rbx, -5*16), xmm3) addpd(xmm4, xmm15) movaps(xmm2, xmm4) pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) addpd(xmm2, xmm9) movaps(mem(rbx, -4*16), xmm2) addpd(xmm4, xmm13) movaps(xmm3, xmm4) pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) mulpd(xmm0, xmm5) movaps(mem(rax, -4*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -3*16), xmm1) prefetch(0, mem(rax, (4*37+1)*8)) //prefetch(0, mem(rax, (8*97+12)*8)) //prefetch(0, mem(r11, 69*4*8)) // prefetch a_next[8] //sub(imm(-4*4*8), r11) // a_next += 4*4 (unroll x mr) addpd(xmm3, xmm11) // iteration 2 movaps(mem(rbx, -3*16), xmm3) addpd(xmm4, xmm15) movaps(xmm2, xmm4) pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) addpd(xmm2, xmm9) movaps(mem(rbx, -2*16), xmm2) addpd(xmm4, xmm13) movaps(xmm3, xmm4) pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) mulpd(xmm0, xmm5) movaps(mem(rax, -2*16), xmm0) mulpd(xmm1, 
xmm6) movaps(mem(rax, -1*16), xmm1) addpd(xmm3, xmm11) // iteration 3 movaps(mem(rbx, -1*16), xmm3) addpd(xmm4, xmm15) movaps(xmm2, xmm4) pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) sub(imm(0-4*4*8), rax) // a += 4*4 (unroll x mr) addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) sub(imm(0-4*4*8), r9) // b_next += 4*4 (unroll x nr) addpd(xmm2, xmm9) movaps(mem(rbx, 0*16), xmm2) addpd(xmm4, xmm13) movaps(xmm3, xmm4) pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) sub(imm(0-4*4*8), rbx) // b += 4*4 (unroll x nr) addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) mulpd(xmm0, xmm5) movaps(mem(rax, -8*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -7*16), xmm1) prefetch(2, mem(r9, 0*8)) // prefetch b_next[0] prefetch(2, mem(r9, 8*8)) // prefetch b_next[8] dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. //prefetch(2, mem(r9, -8*8)) // prefetch b_next[-8] label(.DCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.DLOOPKLEFT) // EDGE LOOP addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) movaps(xmm2, xmm4) pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) movaps(xmm3, xmm4) pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) mulpd(xmm0, xmm5) movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) sub(imm(0-4*1*8), rax) // a += 4 (1 x mr) sub(imm(0-4*1*8), rbx) // b += 4 (1 x nr) dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
label(.DPOSTACCUM) addpd(xmm3, xmm11) addpd(xmm4, xmm15) addpd(xmm5, xmm10) addpd(xmm6, xmm14) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta movddup(mem(rax), xmm6) // load alpha and duplicate movddup(mem(rbx), xmm7) // load beta and duplicate mov(var(rs_c), rsi) // load rs_c mov(rsi, r8) // make a copy of rs_c lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(double) lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c; // xmm8: xmm9: xmm10: xmm11: // ( ab01 ( ab00 ( ab03 ( ab02 // ab10 ) ab11 ) ab12 ) ab13 ) // // xmm12: xmm13: xmm14: xmm15: // ( ab21 ( ab20 ( ab23 ( ab22 // ab30 ) ab31 ) ab32 ) ab33 ) movaps(xmm8, xmm0) movsd(xmm9, xmm8) movsd(xmm0, xmm9) movaps(xmm10, xmm0) movsd(xmm11, xmm10) movsd(xmm0, xmm11) movaps(xmm12, xmm0) movsd(xmm13, xmm12) movsd(xmm0, xmm13) movaps(xmm14, xmm0) movsd(xmm15, xmm14) movsd(xmm0, xmm15) // xmm8: xmm9: xmm10: xmm11: // ( ab00 ( ab01 ( ab02 ( ab03 // ab10 ) ab11 ) ab12 ) ab13 ) // // xmm12: xmm13: xmm14: xmm15: // ( ab20 ( ab21 ( ab22 ( ab23 // ab30 ) ab31 ) ab32 ) ab33 ) // now avoid loading C if beta == 0 xorpd(xmm0, xmm0) // set xmm0 to zero. ucomisd(xmm0, xmm7) // check if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case movaps(mem(rcx), xmm0) // load c00 and c10, mulpd(xmm6, xmm8) // scale by alpha, mulpd(xmm7, xmm0) // scale by beta, addpd(xmm8, xmm0) // add the gemm result, movaps(xmm0, mem(rcx)) // and store back to memory. add(rdi, rcx) movaps(mem(rdx), xmm1) // load c20 and c30, mulpd(xmm6, xmm12) // scale by alpha, mulpd(xmm7, xmm1) // scale by beta, addpd(xmm12, xmm1) // add the gemm result, movaps(xmm1, mem(rdx)) // and store back to memory. add(rdi, rdx) movaps(mem(rcx), xmm0) // load c01 and c11, mulpd(xmm6, xmm9) // scale by alpha, mulpd(xmm7, xmm0) // scale by beta, addpd(xmm9, xmm0) // add the gemm result, movaps(xmm0, mem(rcx)) // and store back to memory. 
add(rdi, rcx) movaps(mem(rdx), xmm1) // load c21 and c31, mulpd(xmm6, xmm13) // scale by alpha, mulpd(xmm7, xmm1) // scale by beta, addpd(xmm13, xmm1) // add the gemm result, movaps(xmm1, mem(rdx)) // and store back to memory. add(rdi, rdx) movaps(mem(rcx), xmm0) // load c02 and c12, mulpd(xmm6, xmm10) // scale by alpha, mulpd(xmm7, xmm0) // scale by beta, addpd(xmm10, xmm0) // add the gemm result, movaps(xmm0, mem(rcx)) // and store back to memory. add(rdi, rcx) movaps(mem(rdx), xmm1) // load c22 and c32, mulpd(xmm6, xmm14) // scale by alpha, mulpd(xmm7, xmm1) // scale by beta, addpd(xmm14, xmm1) // add the gemm result, movaps(xmm1, mem(rdx)) // and store back to memory. add(rdi, rdx) movaps(mem(rcx), xmm0) // load c03 and c13, mulpd(xmm6, xmm11) // scale by alpha, mulpd(xmm7, xmm0) // scale by beta, addpd(xmm11, xmm0) // add the gemm result, movaps(xmm0, mem(rcx)) // and store back to memory. movaps(mem(rdx), xmm1) // load c23 and c33, mulpd(xmm6, xmm15) // scale by alpha, mulpd(xmm7, xmm1) // scale by beta, addpd(xmm15, xmm1) // add the gemm result, movaps(xmm1, mem(rdx)) // and store back to memory. jmp(.DDONE) // jump to end. label(.DBETAZERO) // skip loading c00 and c10, mulpd(xmm6, xmm8) // scale by alpha, movaps(xmm8, mem(rcx)) // and store back to memory. add(rdi, rcx) // skip loading c20 and c30, mulpd(xmm6, xmm12) // scale by alpha, movaps(xmm12, mem(rdx)) // and store back to memory. add(rdi, rdx) // skip loading c01 and c11, mulpd(xmm6, xmm9) // scale by alpha, movaps(xmm9, mem(rcx)) // and store back to memory. add(rdi, rcx) // skip loading c21 and c31, mulpd(xmm6, xmm13) // scale by alpha, movaps(xmm13, mem(rdx)) // and store back to memory. add(rdi, rdx) // skip loading c02 and c12, mulpd(xmm6, xmm10) // scale by alpha, movaps(xmm10, mem(rcx)) // and store back to memory. add(rdi, rcx) // skip loading c22 and c32, mulpd(xmm6, xmm14) // scale by alpha, movaps(xmm14, mem(rdx)) // and store back to memory. 
add(rdi, rdx) // skip loading c03 and c13, mulpd(xmm6, xmm11) // scale by alpha, movaps(xmm11, mem(rcx)) // and store back to memory. // skip loading c23 and c33, mulpd(xmm6, xmm15) // scale by alpha, movaps(xmm15, mem(rdx)) // and store back to memory. label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a] "m" (a), // 2 [b] "m" (b), // 3 [alpha] "m" (alpha), // 4 [beta] "m" (beta), // 5 [c] "m" (c), // 6 [rs_c] "m" (rs_c), // 7 [cs_c] "m" (cs_c), // 8 [b_next] "m" (b_next), // 9 [a_next] "m" (a_next) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMM_UKR_FLUSH_CT( d ); } cython-blis-0.9.1/blis/_src/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c000066400000000000000000000402521427272030600272170ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" #if 0 void bli_sgemmtrsm_l_penryn_asm_8x4 ( dim_t k0, float* restrict alpha, float* restrict a10, float* restrict a11, float* restrict b01, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { } #endif /* bli_dgemmtrsm_l_penryn_asm_4x4: double-precision 4x4 fused gemm+trsm micro-kernel using 128-bit xmm-register instructions. What the body does: (1) accumulates the 4x4 product of a10 and b01 over k0 steps (k0/4 passes of the 4x-unrolled main loop, then k0%4 passes of the edge loop); (2) loads b11, scales it by alpha and subtracts the accumulated product; (3) solves the 4x4 triangular system against a11 one row at a time, from row 0 to row 3, storing each finished row back into b11 and also out to c11 using the rs_c/cs_c strides. When BLIS_ENABLE_TRSM_PREINVERSION is defined the diagonal values read from a11 are multiplied in (the existing comments label them 1/alpha_ii); otherwise they are divided out. m, n and cntx are not referenced by name in this body; presumably m and n are consumed by the GEMMTRSM_UKR_SETUP_CT / GEMMTRSM_UKR_FLUSH_CT macros -- TODO confirm against their definitions. */ void bli_dgemmtrsm_l_penryn_asm_4x4 ( dim_t m, dim_t n, dim_t k0, double* restrict alpha, double* restrict a10, double* restrict a11, double* restrict b01, double* restrict b11, double* restrict c11, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { void* b_next = bli_auxinfo_next_b( data ); /* NOTE: b_next is still passed to the asm block as an input operand, but every instruction that would use it (the r9 load, prefetches and increment) is commented out below. */ // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMMTRSM_UKR_SETUP_CT( d, 4, 4, false ); begin_asm() mov(var(a10), rax) // load address of a10. mov(var(b01), rbx) // load address of b01. //mov(var(b_next), r9) // load address of b_next. sub(imm(0-8*16), rax) // increment pointers to allow byte sub(imm(0-8*16), rbx) // offsets in the unrolled iterations.
movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements movaps(mem(rax, -7*16), xmm1) // of a and b. movaps(mem(rbx, -8*16), xmm2) //mov(var(c11), rcx) // load address of c11 //mov(var(rs_c), rdi) // load cs_c //lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*cs_c; //prefetch(2, mem(r9, 0*8)) // prefetch b_next xorpd(xmm3, xmm3) xorpd(xmm4, xmm4) xorpd(xmm5, xmm5) xorpd(xmm6, xmm6) //prefetch(2, mem(rcx, 3*8)) // prefetch c + 0*cs_c xorpd(xmm8, xmm8) movaps(xmm8, xmm9) //prefetch(2, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c movaps(xmm8, xmm10) movaps(xmm8, xmm11) //prefetch(2, mem(rdx, 3*8)) // prefetch c + 2*cs_c movaps(xmm8, xmm12) movaps(xmm8, xmm13) //prefetch(2, mem(rdx, rdi, 1, 3*8)) // prefetch c + 3*cs_c movaps(xmm8, xmm14) movaps(xmm8, xmm15) mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CONSIDERKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.LOOPKITER) // MAIN LOOP //prefetch(0, mem(rax, 1264)) prefetch(0, mem(rax, (4*35+1)*8)) addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) movaps(xmm2, xmm4) pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) movaps(xmm3, xmm4) pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) mulpd(xmm0, xmm5) movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) addpd(xmm3, xmm11) // iteration 1 movaps(mem(rbx, -5*16), xmm3) addpd(xmm4, xmm15) movaps(xmm2, xmm4) pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) addpd(xmm2, xmm9) movaps(mem(rbx, -4*16), xmm2) addpd(xmm4, xmm13) movaps(xmm3,
xmm4) pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) mulpd(xmm0, xmm5) movaps(mem(rax, -4*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -3*16), xmm1) //prefetch(0, mem(rax, 1328)) prefetch(0, mem(rax, (4*37+1)*8)) addpd(xmm3, xmm11) // iteration 2 movaps(mem(rbx, -3*16), xmm3) addpd(xmm4, xmm15) movaps(xmm2, xmm4) pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) addpd(xmm2, xmm9) movaps(mem(rbx, -2*16), xmm2) addpd(xmm4, xmm13) movaps(xmm3, xmm4) pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) mulpd(xmm0, xmm5) movaps(mem(rax, -2*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -1*16), xmm1) addpd(xmm3, xmm11) // iteration 3 movaps(mem(rbx, -1*16), xmm3) addpd(xmm4, xmm15) movaps(xmm2, xmm4) pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) sub(imm(0-4*4*8), rax) // a += 4*4 (unroll x mr) addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) //sub(imm(-4*4*8), r9) // b_next += 4*4 (unroll x nr) addpd(xmm2, xmm9) movaps(mem(rbx, 0*16), xmm2) addpd(xmm4, xmm13) movaps(xmm3, xmm4) pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) sub(imm(0-4*4*8), rbx) // b += 4*4 (unroll x nr) addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) mulpd(xmm0, xmm5) movaps(mem(rax, -8*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -7*16), xmm1) //prefetch(2, mem(r9, 0*8)) // prefetch b_next[0] //prefetch(2, mem(r9, 8*8)) // prefetch b_next[8] dec(rsi) // i -= 1; jne(.LOOPKITER) // iterate again if i != 0. label(.CONSIDERKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.POSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop.
label(.LOOPKLEFT) // EDGE LOOP addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) movaps(xmm2, xmm4) pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) movaps(xmm3, xmm4) pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) mulpd(xmm0, xmm5) movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) sub(imm(0-4*1*8), rax) // a += 4 (1 x mr) sub(imm(0-4*1*8), rbx) // b += 4 (1 x nr) dec(rsi) // i -= 1; jne(.LOOPKLEFT) // iterate again if i != 0. label(.POSTACCUM) addpd(xmm3, xmm11) addpd(xmm4, xmm15) addpd(xmm5, xmm10) addpd(xmm6, xmm14) mov(var(b11), rbx) // load address of b11. // xmm8: xmm9: xmm10: xmm11: // ( ab01 ( ab00 ( ab03 ( ab02 // ab10 ) ab11 ) ab12 ) ab13 ) // // xmm12: xmm13: xmm14: xmm15: // ( ab21 ( ab20 ( ab23 ( ab22 // ab30 ) ab31 ) ab32 ) ab33 ) movaps(xmm9, xmm0) movaps(xmm8, xmm1) unpcklpd(xmm8, xmm0) unpckhpd(xmm9, xmm1) movaps(xmm11, xmm4) movaps(xmm10, xmm5) unpcklpd(xmm10, xmm4) unpckhpd(xmm11, xmm5) movaps(xmm13, xmm2) movaps(xmm12, xmm3) unpcklpd(xmm12, xmm2) unpckhpd(xmm13, xmm3) movaps(xmm15, xmm6) movaps(xmm14, xmm7) unpcklpd(xmm14, xmm6) unpckhpd(xmm15, xmm7) // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 ) // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 ) // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) mov(var(alpha), rax) // load address of alpha movddup(mem(rax), xmm15) // load alpha and duplicate movaps(mem(rbx, 0*16), xmm8) movaps(mem(rbx, 1*16), xmm12) mulpd(xmm15, xmm8) // xmm8 = alpha * ( beta00 beta01 ) mulpd(xmm15, xmm12) // xmm12 = alpha * ( beta02 beta03 ) movaps(mem(rbx, 2*16), xmm9) movaps(mem(rbx, 3*16), xmm13) mulpd(xmm15, xmm9) // xmm9 = alpha * ( beta10 beta11 ) mulpd(xmm15, xmm13) // xmm13 = alpha * (
beta12 beta13 ) movaps(mem(rbx, 4*16), xmm10) movaps(mem(rbx, 5*16), xmm14) mulpd(xmm15, xmm10) // xmm10 = alpha * ( beta20 beta21 ) mulpd(xmm15, xmm14) // xmm14 = alpha * ( beta22 beta23 ) movaps(mem(rbx, 6*16), xmm11) mulpd(xmm15, xmm11) // xmm11 = alpha * ( beta30 beta31 ) mulpd(mem(rbx, 7*16), xmm15) // xmm15 = alpha * ( beta32 beta33 ) // (Now scaled by alpha:) // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 ) // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 ) // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 ) // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 ) subpd(xmm0, xmm8) // xmm8 -= xmm0 subpd(xmm1, xmm9) // xmm9 -= xmm1 subpd(xmm2, xmm10) // xmm10 -= xmm2 subpd(xmm3, xmm11) // xmm11 -= xmm3 subpd(xmm4, xmm12) // xmm12 -= xmm4 subpd(xmm5, xmm13) // xmm13 -= xmm5 subpd(xmm6, xmm14) // xmm14 -= xmm6 subpd(xmm7, xmm15) // xmm15 -= xmm7 label(.TRSM) mov(var(a11), rax) // load address of a11 mov(var(c11), rcx) // load address of c11 mov(var(rs_c), rsi) // load rs_c mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c /* The a11 operands below use a byte displacement of the form (i + j*4)*8 from rax; i and j match the alpha<i><j> labels in the per-instruction comments. Each row iteration stores its two result vectors to b11 (rbx) and scatters the four values to c11 via rcx/rdx with column stride rdi, then advances rcx/rdx by the row stride rsi. */ // iteration 0 movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00) #ifdef BLIS_ENABLE_TRSM_PREINVERSION mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00); mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00); #else divpd(xmm0, xmm8) // xmm8 /= alpha00; divpd(xmm0, xmm12) // xmm12 /= alpha00; #endif movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12 movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0] movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1] movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0] movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c // iteration 1 movddup(mem(1+0*4)*8(rax), xmm0) // load xmm0 = alpha10 movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 =
(1/alpha11) movaps(xmm0, xmm4) // xmm4 = xmm0 mulpd(xmm8, xmm0) // xmm0 = alpha10 * ( beta00 beta01 ) mulpd(xmm12, xmm4) // xmm4 = alpha10 * ( beta02 beta03 ) subpd(xmm0, xmm9) // xmm9 -= xmm0 subpd(xmm4, xmm13) // xmm13 -= xmm4 #ifdef BLIS_ENABLE_TRSM_PREINVERSION mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11); mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11); #else divpd(xmm1, xmm9) // xmm9 /= alpha11; divpd(xmm1, xmm13) // xmm13 /= alpha11; #endif movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13 movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0] movhpd(xmm9, mem(rcx, rdi, 1)) // store ( gamma11 ) = xmm9[1] movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0] movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c // iteration 2 movddup(mem(2+0*4)*8(rax), xmm0) // load xmm0 = alpha20 movddup(mem(2+1*4)*8(rax), xmm1) // load xmm1 = alpha21 movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22) movaps(xmm0, xmm4) // xmm4 = xmm0 movaps(xmm1, xmm5) // xmm5 = xmm1 mulpd(xmm8, xmm0) // xmm0 = alpha20 * ( beta00 beta01 ) mulpd(xmm12, xmm4) // xmm4 = alpha20 * ( beta02 beta03 ) mulpd(xmm9, xmm1) // xmm1 = alpha21 * ( beta10 beta11 ) mulpd(xmm13, xmm5) // xmm5 = alpha21 * ( beta12 beta13 ) addpd(xmm1, xmm0) // xmm0 += xmm1; addpd(xmm5, xmm4) // xmm4 += xmm5; subpd(xmm0, xmm10) // xmm10 -= xmm0 subpd(xmm4, xmm14) // xmm14 -= xmm4 #ifdef BLIS_ENABLE_TRSM_PREINVERSION mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22); mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22); #else divpd(xmm2, xmm10) // xmm10 /= alpha22; divpd(xmm2, xmm14) // xmm14 /= alpha22; #endif movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14 movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0] movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1] movlpd(xmm14, mem(rdx)) // store (
gamma22 ) = xmm14[0] movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c // iteration 3 movddup(mem(3+0*4)*8(rax), xmm0) // load xmm0 = alpha30 movddup(mem(3+1*4)*8(rax), xmm1) // load xmm1 = alpha31 movddup(mem(3+2*4)*8(rax), xmm2) // load xmm2 = alpha32 movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33) movaps(xmm0, xmm4) // xmm4 = xmm0 movaps(xmm1, xmm5) // xmm5 = xmm1 movaps(xmm2, xmm6) // xmm6 = xmm2 mulpd(xmm8, xmm0) // xmm0 = alpha30 * ( beta00 beta01 ) mulpd(xmm12, xmm4) // xmm4 = alpha30 * ( beta02 beta03 ) mulpd(xmm9, xmm1) // xmm1 = alpha31 * ( beta10 beta11 ) mulpd(xmm13, xmm5) // xmm5 = alpha31 * ( beta12 beta13 ) mulpd(xmm10, xmm2) // xmm2 = alpha32 * ( beta20 beta21 ) mulpd(xmm14, xmm6) // xmm6 = alpha32 * ( beta22 beta23 ) addpd(xmm1, xmm0) // xmm0 += xmm1; addpd(xmm5, xmm4) // xmm4 += xmm5; addpd(xmm2, xmm0) // xmm0 += xmm2; addpd(xmm6, xmm4) // xmm4 += xmm6; subpd(xmm0, xmm11) // xmm11 -= xmm0 subpd(xmm4, xmm15) // xmm15 -= xmm4 #ifdef BLIS_ENABLE_TRSM_PREINVERSION mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33); mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33); #else divpd(xmm3, xmm11) // xmm11 /= alpha33; divpd(xmm3, xmm15) // xmm15 /= alpha33; #endif movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15 movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0] movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1] movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0] movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1] end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a10] "m" (a10), // 2 [a11] "m" (a11), // 3 [b01] "m" (b01), // 4 [b11] "m" (b11), // 5 [c11] "m" (c11), // 6 [rs_c] "m" (rs_c), // 7 [cs_c] "m" (cs_c), // 8 [alpha] "m" (alpha), // 9 [b_next] "m" (b_next) // 10 : // register clobber list "rax", "rbx",
"rcx", "rdx", "rsi", "rdi", //"r8", "r9", "r10", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMMTRSM_UKR_FLUSH_CT( d ); } cython-blis-0.9.1/blis/_src/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c000066400000000000000000000366711427272030600272420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" #if 0 void bli_sgemmtrsm_u_penryn_asm_8x4 ( dim_t k0, float* restrict alpha, float* restrict a12, float* restrict a11, float* restrict b21, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { } #endif void bli_dgemmtrsm_u_penryn_asm_4x4 ( dim_t m, dim_t n, dim_t k0, double* restrict alpha, double* restrict a12, double* restrict a11, double* restrict b21, double* restrict b11, double* restrict c11, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMMTRSM_UKR_SETUP_CT( d, 4, 4, false ); begin_asm() mov(var(a12), rax) // load address of a12. mov(var(b21), rbx) // load address of b21. //mov(var(b_next), r9) // load address of b_next. add(imm(8*16), rax) // increment pointers to allow byte add(imm(8*16), rbx) // offsets in the unrolled iterations. movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements movaps(mem(rax, -7*16), xmm1) // of a and b. movaps(mem(rbx, -8*16), xmm2) xorpd(xmm3, xmm3) xorpd(xmm4, xmm4) xorpd(xmm5, xmm5) xorpd(xmm6, xmm6) xorpd(xmm8, xmm8) movaps(xmm8, xmm9) movaps(xmm8, xmm10) movaps(xmm8, xmm11) movaps(xmm8, xmm12) movaps(xmm8, xmm13) movaps(xmm8, xmm14) movaps(xmm8, xmm15) mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CONSIDERKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.LOOPKITER) // MAIN LOOP prefetch(0, mem(rax, 1264)) addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) movaps(xmm2, xmm4) pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) movaps(xmm3, xmm4) pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) mulpd(xmm0, xmm5) movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) addpd(xmm3, xmm11) // iteration 1 movaps(mem(rbx, -5*16), xmm3) addpd(xmm4, xmm15) movaps(xmm2, xmm4) pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) addpd(xmm2, xmm9) movaps(mem(rbx, -4*16), xmm2) addpd(xmm4, xmm13) movaps(xmm3, xmm4) pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) mulpd(xmm0, xmm5) movaps(mem(rax, -4*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -3*16), xmm1) prefetch(0, mem(rax, 1328)) addpd(xmm3, xmm11) // iteration 2 movaps(mem(rbx, -3*16), xmm3) addpd(xmm4, xmm15) movaps(xmm2, xmm4) pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) addpd(xmm2, xmm9) movaps(mem(rbx, -2*16), xmm2) addpd(xmm4, xmm13) movaps(xmm3, xmm4) pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) mulpd(xmm0, xmm5) movaps(mem(rax, -2*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -1*16), xmm1) addpd(xmm3, xmm11) // iteration 3 movaps(mem(rbx, -1*16), xmm3) addpd(xmm4, xmm15) movaps(xmm2, xmm4) pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) addpd(xmm5, xmm10) addpd(xmm6, xmm14) 
movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) add(imm(4*4*8), rax) // a += 4*4 (unroll x mr) addpd(xmm2, xmm9) movaps(mem(rbx, 0*16), xmm2) addpd(xmm4, xmm13) movaps(xmm3, xmm4) pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr) addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) mulpd(xmm0, xmm5) movaps(mem(rax, -8*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -7*16), xmm1) dec(rsi) // i -= 1; jne(.LOOPKITER) // iterate again if i != 0. label(.CONSIDERKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.POSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.LOOPKLEFT) // EDGE LOOP addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) movaps(xmm2, xmm4) pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) movaps(xmm3, xmm4) pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) mulpd(xmm0, xmm5) movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) add(imm(4*1*8), rax) // a += 4 (1 x mr) add(imm(4*1*8), rbx) // b += 4 (1 x nr) dec(rsi) // i -= 1; jne(.LOOPKLEFT) // iterate again if i != 0. label(.POSTACCUM) addpd(xmm3, xmm11) addpd(xmm4, xmm15) addpd(xmm5, xmm10) addpd(xmm6, xmm14) mov(var(b11), rbx) // load address of b11. 
// xmm8: xmm9: xmm10: xmm11: // ( ab01 ( ab00 ( ab03 ( ab02 // ab10 ) ab11 ) ab12 ) ab13 ) // // xmm12: xmm13: xmm14: xmm15: // ( ab21 ( ab20 ( ab23 ( ab22 // ab30 ) ab31 ) ab32 ) ab33 ) movaps(xmm9, xmm0) movaps(xmm8, xmm1) unpcklpd(xmm8, xmm0) unpckhpd(xmm9, xmm1) movaps(xmm11, xmm4) movaps(xmm10, xmm5) unpcklpd(xmm10, xmm4) unpckhpd(xmm11, xmm5) movaps(xmm13, xmm2) movaps(xmm12, xmm3) unpcklpd(xmm12, xmm2) unpckhpd(xmm13, xmm3) movaps(xmm15, xmm6) movaps(xmm14, xmm7) unpcklpd(xmm14, xmm6) unpckhpd(xmm15, xmm7) // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 ) // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 ) // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) mov(var(alpha), rax) // load address of alpha movddup(mem(rax), xmm15) // load alpha and duplicate movaps(mem(rbx, 0*16), xmm8) movaps(mem(rbx, 1*16), xmm12) mulpd(xmm15, xmm8) // xmm8 = alpha * ( beta00 beta01 ) mulpd(xmm15, xmm12) // xmm12 = alpha * ( beta02 beta03 ) movaps(mem(rbx, 2*16), xmm9) movaps(mem(rbx, 3*16), xmm13) mulpd(xmm15, xmm9) // xmm9 = alpha * ( beta10 beta11 ) mulpd(xmm15, xmm13) // xmm13 = alpha * ( beta12 beta13 ) movaps(mem(rbx, 4*16), xmm10) movaps(mem(rbx, 5*16), xmm14) mulpd(xmm15, xmm10) // xmm10 = alpha * ( beta20 beta21 ) mulpd(xmm15, xmm14) // xmm14 = alpha * ( beta22 beta23 ) movaps(mem(rbx, 6*16), xmm11) mulpd(xmm15, xmm11) // xmm11 = alpha * ( beta30 beta31 ) mulpd(mem(rbx, 7*16), xmm15) // xmm15 = alpha * ( beta32 beta33 ) // (Now scaled by alpha:) // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 ) // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 ) // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 ) // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 ) subpd(xmm0, xmm8) // xmm8 -= xmm0 subpd(xmm1, xmm9) // xmm9 -= xmm1 subpd(xmm2, xmm10) // xmm10 -= xmm2 subpd(xmm3, xmm11) // xmm11 -= xmm3 subpd(xmm4, xmm12) // xmm12 -= xmm4 subpd(xmm5, xmm13) // xmm13 -= xmm5 subpd(xmm6, xmm14) // xmm14 -= xmm6 subpd(xmm7, xmm15) // xmm15 -= xmm7 label(.TRSM) 
mov(var(a11), rax) // load address of a11 mov(var(c11), rcx) // load address of c11 mov(var(rs_c), rsi) // load rs_c mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) add(rsi, rcx) // c11 += (4-1)*rs_c add(rsi, rcx) add(rsi, rcx) lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c; // iteration 0 movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33) #ifdef BLIS_ENABLE_TRSM_PREINVERSION mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33); mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33); #else divpd(xmm3, xmm11) // xmm11 /= alpha33; divpd(xmm3, xmm15) // xmm15 /= alpha33; #endif movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15 movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0] movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1] movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0] movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c // iteration 1 movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22) movddup(mem(2+3*4)*8(rax), xmm3) // load xmm3 = alpha23 movaps(xmm3, xmm7) // xmm7 = xmm3 mulpd(xmm11, xmm3) // xmm3 = alpha23 * ( beta30 beta31 ) mulpd(xmm15, xmm7) // xmm7 = alpha23 * ( beta32 beta33 ) subpd(xmm3, xmm10) // xmm10 -= xmm3 subpd(xmm7, xmm14) // xmm14 -= xmm7 #ifdef BLIS_ENABLE_TRSM_PREINVERSION mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22); mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22); #else divpd(xmm2, xmm10) // xmm10 /= alpha22; divpd(xmm2, xmm14) // xmm14 /= alpha22; #endif movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14 movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0] movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1] movlpd(xmm14, mem(rdx)) // store ( gamma22 ) = xmm14[0] movhpd(xmm14, mem(rdx, rdi, 1)) // store ( 
gamma23 ) = xmm14[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c // iteration 2 movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11) movddup(mem(1+2*4)*8(rax), xmm2) // load xmm2 = alpha12 movddup(mem(1+3*4)*8(rax), xmm3) // load xmm3 = alpha13 movaps(xmm2, xmm6) // xmm6 = xmm2 movaps(xmm3, xmm7) // xmm7 = xmm3 mulpd(xmm10, xmm2) // xmm2 = alpha12 * ( beta20 beta21 ) mulpd(xmm14, xmm6) // xmm6 = alpha12 * ( beta22 beta23 ) mulpd(xmm11, xmm3) // xmm3 = alpha13 * ( beta30 beta31 ) mulpd(xmm15, xmm7) // xmm7 = alpha13 * ( beta32 beta33 ) addpd(xmm3, xmm2) // xmm2 += xmm3; addpd(xmm7, xmm6) // xmm6 += xmm7; subpd(xmm2, xmm9) // xmm9 -= xmm2 subpd(xmm6, xmm13) // xmm13 -= xmm6 #ifdef BLIS_ENABLE_TRSM_PREINVERSION mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11); mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11); #else divpd(xmm1, xmm9) // xmm9 /= alpha11; divpd(xmm1, xmm13) // xmm13 /= alpha11; #endif movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13 movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0] movhpd(xmm9, mem(rcx, rdi, 1)) // store ( gamma11 ) = xmm9[1] movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0] movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c // iteration 3 movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00) movddup(mem(0+1*4)*8(rax), xmm1) // load xmm1 = alpha01 movddup(mem(0+2*4)*8(rax), xmm2) // load xmm2 = alpha02 movddup(mem(0+3*4)*8(rax), xmm3) // load xmm3 = alpha03 movaps(xmm1, xmm5) // xmm5 = xmm1 movaps(xmm2, xmm6) // xmm6 = xmm2 movaps(xmm3, xmm7) // xmm7 = xmm3 mulpd(xmm9, xmm1) // xmm1 = alpha01 * ( beta10 beta11 ) mulpd(xmm13, xmm5) // xmm5 = alpha01 * ( beta12 beta13 ) mulpd(xmm10, xmm2) // xmm2 = alpha02 * ( beta20 beta21 ) mulpd(xmm14, xmm6) // xmm6 = alpha02 * ( beta22 beta23 ) mulpd(xmm11, xmm3) // xmm3 = alpha03 * ( beta30 beta31 ) mulpd(xmm15, xmm7) // xmm7 = 
alpha03 * ( beta32 beta33 ) addpd(xmm2, xmm1) // xmm1 += xmm2; addpd(xmm6, xmm5) // xmm5 += xmm6; addpd(xmm3, xmm1) // xmm1 += xmm3; addpd(xmm7, xmm5) // xmm5 += xmm7; subpd(xmm1, xmm8) // xmm8 -= xmm1 subpd(xmm5, xmm12) // xmm12 -= xmm5 #ifdef BLIS_ENABLE_TRSM_PREINVERSION mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00); mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00); #else divpd(xmm0, xmm8) // xmm8 /= alpha00; divpd(xmm0, xmm12) // xmm12 /= alpha00; #endif movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12 movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0] movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1] movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0] movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1] end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a12] "m" (a12), // 2 [a11] "m" (a11), // 3 [b21] "m" (b21), // 4 [b11] "m" (b11), // 5 [c11] "m" (c11), // 6 [rs_c] "m" (rs_c), // 7 [cs_c] "m" (cs_c), // 8 [alpha] "m" (alpha), // 9 [b_next] "m" (b_next) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMMTRSM_UKR_FLUSH_CT( d ); } cython-blis-0.9.1/blis/_src/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c000066400000000000000000000174761427272030600263650ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" #if 0 void bli_strsm_l_penryn_asm_8x4 ( float* restrict a11, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { } #endif void bli_dtrsm_l_penryn_asm_4x4 ( double* restrict a11, double* restrict b11, double* restrict c11, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; begin_asm() mov(var(b11), rbx) // load address of b11. 
movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 ) movaps(mem(rbx, 1*16), xmm12) // xmm9 = ( beta02 beta03 ) movaps(mem(rbx, 2*16), xmm9) // xmm10 = ( beta10 beta11 ) movaps(mem(rbx, 3*16), xmm13) // xmm11 = ( beta12 beta13 ) movaps(mem(rbx, 4*16), xmm10) // xmm12 = ( beta20 beta21 ) movaps(mem(rbx, 5*16), xmm14) // xmm13 = ( beta22 beta23 ) movaps(mem(rbx, 6*16), xmm11) // xmm14 = ( beta30 beta31 ) movaps(mem(rbx, 7*16), xmm15) // xmm15 = ( beta32 beta33 ) mov(var(a11), rax) // load address of a11 mov(var(c11), rcx) // load address of c11 mov(var(rs_c), rsi) // load rs_c mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c // iteration 0 movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00) mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00); mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00); movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12 movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0] movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1] movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0] movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c // iteration 1 movddup(mem(1+0*4)*8(rax), xmm0) // load xmm0 = alpha10 movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11) movaps(xmm0, xmm4) // xmm4 = xmm0 mulpd(xmm8, xmm0) // xmm0 = alpha10 * ( beta00 beta01 ) mulpd(xmm12, xmm4) // xmm4 = alpha10 * ( beta02 beta03 ) subpd(xmm0, xmm9) // xmm9 -= xmm0 subpd(xmm4, xmm13) // xmm13 -= xmm4 mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11); mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11); movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13 movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0] movhpd(xmm9, mem(rcx, rdi, 1)) // store 
( gamma11 ) = xmm9[1] movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0] movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c // iteration 2 movddup(mem(2+0*4)*8(rax), xmm0) // load xmm0 = alpha20 movddup(mem(2+1*4)*8(rax), xmm1) // load xmm1 = alpha21 movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22) movaps(xmm0, xmm4) // xmm4 = xmm0 movaps(xmm1, xmm5) // xmm5 = xmm1 mulpd(xmm8, xmm0) // xmm0 = alpha20 * ( beta00 beta01 ) mulpd(xmm12, xmm4) // xmm4 = alpha20 * ( beta02 beta03 ) mulpd(xmm9, xmm1) // xmm1 = alpha21 * ( beta10 beta11 ) mulpd(xmm13, xmm5) // xmm5 = alpha21 * ( beta12 beta13 ) addpd(xmm1, xmm0) // xmm0 += xmm1; addpd(xmm5, xmm4) // xmm4 += xmm5; subpd(xmm0, xmm10) // xmm10 -= xmm0 subpd(xmm4, xmm14) // xmm14 -= xmm4 mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22); mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22); movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14 movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0] movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1] movlpd(xmm14, mem(rdx)) // store ( gamma22 ) = xmm14[0] movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c // iteration 3 movddup(mem(3+0*4)*8(rax), xmm0) // load xmm0 = alpha30 movddup(mem(3+1*4)*8(rax), xmm1) // load xmm1 = alpha31 movddup(mem(3+2*4)*8(rax), xmm2) // load xmm2 = alpha32 movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33) movaps(xmm0, xmm4) // xmm4 = xmm0 movaps(xmm1, xmm5) // xmm5 = xmm1 movaps(xmm2, xmm6) // xmm6 = xmm2 mulpd(xmm8, xmm0) // xmm0 = alpha30 * ( beta00 beta01 ) mulpd(xmm12, xmm4) // xmm4 = alpha30 * ( beta02 beta03 ) mulpd(xmm9, xmm1) // xmm1 = alpha31 * ( beta10 beta11 ) mulpd(xmm13, xmm5) // xmm5 = alpha31 * ( beta12 beta13 ) mulpd(xmm10, xmm2) // xmm2 = alpha32 * ( beta20 beta21 ) mulpd(xmm14, xmm6) // xmm6 = 
alpha32 * ( beta22 beta23 ) addpd(xmm1, xmm0) // xmm0 += xmm1; addpd(xmm5, xmm4) // xmm4 += xmm5; addpd(xmm2, xmm0) // xmm0 += xmm2; addpd(xmm6, xmm4) // xmm4 += xmm6; subpd(xmm0, xmm11) // xmm11 -= xmm0 subpd(xmm4, xmm15) // xmm15 -= xmm4 mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33); mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33); movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15 movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0] movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1] movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0] movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1] end_asm( : // output operands (none) : // input operands [a11] "m" (a11), // 0 [b11] "m" (b11), // 1 [c11] "m" (c11), // 2 [rs_c] "m" (rs_c), // 3 [cs_c] "m" (cs_c) // 4 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"r8", "r9", "r10", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c000066400000000000000000000175241427272030600263700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" #if 0 void bli_strsm_u_penryn_asm_8x4 ( float* restrict a11, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { } #endif void bli_dtrsm_u_penryn_asm_4x4 ( double* restrict a11, double* restrict b11, double* restrict c11, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; begin_asm() mov(var(b11), rbx) // load address of b11. 
movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 ) movaps(mem(rbx, 1*16), xmm12) // xmm9 = ( beta02 beta03 ) movaps(mem(rbx, 2*16), xmm9) // xmm10 = ( beta10 beta11 ) movaps(mem(rbx, 3*16), xmm13) // xmm11 = ( beta12 beta13 ) movaps(mem(rbx, 4*16), xmm10) // xmm12 = ( beta20 beta21 ) movaps(mem(rbx, 5*16), xmm14) // xmm13 = ( beta22 beta23 ) movaps(mem(rbx, 6*16), xmm11) // xmm14 = ( beta30 beta31 ) movaps(mem(rbx, 7*16), xmm15) // xmm15 = ( beta32 beta33 ) mov(var(a11), rax) // load address of a11 mov(var(c11), rcx) // load address of c11 mov(var(rs_c), rsi) // load rs_c mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) add(rsi, rcx) // c11 += (4-1)*rs_c add(rsi, rcx) add(rsi, rcx) lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c; // iteration 0 movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33) mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33); mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33); movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15 movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0] movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1] movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0] movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c // iteration 1 movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22) movddup(mem(2+3*4)*8(rax), xmm3) // load xmm3 = alpha23 movaps(xmm3, xmm7) // xmm7 = xmm3 mulpd(xmm11, xmm3) // xmm3 = alpha23 * ( beta30 beta31 ) mulpd(xmm15, xmm7) // xmm7 = alpha23 * ( beta32 beta33 ) subpd(xmm3, xmm10) // xmm10 -= xmm3 subpd(xmm7, xmm14) // xmm14 -= xmm7 mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22); mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22); movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14 movlpd(xmm10, 
mem(rcx)) // store ( gamma20 ) = xmm10[0] movhpd(xmm10, mem(rcx, rdi, 1)) // store ( gamma21 ) = xmm10[1] movlpd(xmm14, mem(rdx)) // store ( gamma22 ) = xmm14[0] movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c // iteration 2 movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11) movddup(mem(1+2*4)*8(rax), xmm2) // load xmm2 = alpha12 movddup(mem(1+3*4)*8(rax), xmm3) // load xmm3 = alpha13 movaps(xmm2, xmm6) // xmm6 = xmm2 movaps(xmm3, xmm7) // xmm7 = xmm3 mulpd(xmm10, xmm2) // xmm2 = alpha12 * ( beta20 beta21 ) mulpd(xmm14, xmm6) // xmm6 = alpha12 * ( beta22 beta23 ) mulpd(xmm11, xmm3) // xmm3 = alpha13 * ( beta30 beta31 ) mulpd(xmm15, xmm7) // xmm7 = alpha13 * ( beta32 beta33 ) addpd(xmm3, xmm2) // xmm2 += xmm3; addpd(xmm7, xmm6) // xmm6 += xmm7; subpd(xmm2, xmm9) // xmm9 -= xmm2 subpd(xmm6, xmm13) // xmm13 -= xmm6 mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11); mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11); movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13 movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0] movhpd(xmm9, mem(rcx, rdi, 1)) // store ( gamma11 ) = xmm9[1] movlpd(xmm13, mem(rdx)) // store ( gamma12 ) = xmm13[0] movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c // iteration 3 movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00) movddup(mem(0+1*4)*8(rax), xmm1) // load xmm1 = alpha01 movddup(mem(0+2*4)*8(rax), xmm2) // load xmm2 = alpha02 movddup(mem(0+3*4)*8(rax), xmm3) // load xmm3 = alpha03 movaps(xmm1, xmm5) // xmm5 = xmm1 movaps(xmm2, xmm6) // xmm6 = xmm2 movaps(xmm3, xmm7) // xmm7 = xmm3 mulpd(xmm9, xmm1) // xmm1 = alpha01 * ( beta10 beta11 ) mulpd(xmm13, xmm5) // xmm5 = alpha01 * ( beta12 beta13 ) mulpd(xmm10, xmm2) // xmm2 = alpha02 * ( beta20 beta21 ) mulpd(xmm14, xmm6) // xmm6 = alpha02 * ( beta22 beta23 ) 
mulpd(xmm11, xmm3) // xmm3 = alpha03 * ( beta30 beta31 ) mulpd(xmm15, xmm7) // xmm7 = alpha03 * ( beta32 beta33 ) addpd(xmm2, xmm1) // xmm1 += xmm2; addpd(xmm6, xmm5) // xmm5 += xmm6; addpd(xmm3, xmm1) // xmm1 += xmm3; addpd(xmm7, xmm5) // xmm5 += xmm7; subpd(xmm1, xmm8) // xmm8 -= xmm1 subpd(xmm5, xmm12) // xmm12 -= xmm5 mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00); mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00); movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12 movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0] movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1] movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0] movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1] end_asm( : // output operands (none) : // input operands [a11] "m" (a11), // 0 [b11] "m" (b11), // 1 [c11] "m" (c11), // 2 [rs_c] "m" (rs_c), // 3 [cs_c] "m" (cs_c) // 4 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/penryn/bli_kernels_penryn.h000066400000000000000000000037201427272030600246530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ GEMM_UKR_PROT( float, s, gemm_penryn_asm_8x4 ) GEMM_UKR_PROT( double, d, gemm_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_l_penryn_asm_4x4 ) GEMMTRSM_UKR_PROT( double, d, gemmtrsm_u_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_l_penryn_asm_4x4 ) TRSM_UKR_PROT( double, d, trsm_u_penryn_asm_4x4 ) cython-blis-0.9.1/blis/_src/kernels/piledriver/000077500000000000000000000000001427272030600214465ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/piledriver/3/000077500000000000000000000000001427272030600216105ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c000066400000000000000000001360051427272030600274670ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* NOTE: The micro-kernels in this file were partially inspired by portions of code found in OpenBLAS 0.2.12 (http://www.openblas.net/). 
-FGVZ */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" void bli_sgemm_piledriver_asm_16x3 ( dim_t m, dim_t n, dim_t k, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k / 8; uint64_t k_left = k % 8; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT( s, 16, 3, false ); begin_asm() mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. mov(var(b_next), r15) // load address of b_next. mov(var(a_next), r14) // load address of a_next. prefetch(0, mem(rbx, 128)) // prefetch b prefetch(0, mem(rbx, 64+128)) // prefetch b prefetch(0, mem(rbx, 128+128)) // prefetch b add(imm(32*4), rax) add(imm(12*4), rbx) mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float) lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c; lea(mem(rcx, rdi, 2), r11) // load address of c + 2*cs_c; vbroadcastss(mem(rbx, -12*4), xmm1) vbroadcastss(mem(rbx, -11*4), xmm2) vbroadcastss(mem(rbx, -10*4), xmm3) vxorps(xmm4, xmm4, xmm4) vxorps(xmm5, xmm5, xmm5) vxorps(xmm6, xmm6, xmm6) vxorps(xmm7, xmm7, xmm7) vxorps(xmm8, xmm8, xmm8) vxorps(xmm9, xmm9, xmm9) vxorps(xmm10, xmm10, xmm10) vxorps(xmm11, xmm11, xmm11) vxorps(xmm12, xmm12, xmm12) vxorps(xmm13, xmm13, xmm13) vxorps(xmm14, xmm14, xmm14) vxorps(xmm15, xmm15, xmm15) mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP je(.SCONSIDKLEFT) // if i == 0, jump to k_left code. 
prefetch(0, mem(rbx, 16+192)) // prefetch b // iteration 0 vmovaps(mem(rax, -32*4), xmm0) prefetch(0, mem(rax, 384)) vfmadd231ps(xmm1, xmm0, xmm4) vfmadd231ps(xmm2, xmm0, xmm5) vfmadd231ps(xmm3, xmm0, xmm6) vmovaps(mem(rax, -28*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm7) vfmadd231ps(xmm2, xmm0, xmm8) vfmadd231ps(xmm3, xmm0, xmm9) vmovaps(mem(rax, -24*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm10) vfmadd231ps(xmm2, xmm0, xmm11) vfmadd231ps(xmm3, xmm0, xmm12) vmovaps(mem(rax, -20*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm13) vbroadcastss(mem(rbx, -9*4), xmm1) vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, -8*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) // iteration 1 vmovaps(mem(rax, -16*4), xmm0) vbroadcastss(mem(rbx, -7*4), xmm3) prefetch(0, mem(rax, 64+384)) vfmadd231ps(xmm1, xmm0, xmm4) vfmadd231ps(xmm2, xmm0, xmm5) vfmadd231ps(xmm3, xmm0, xmm6) vmovaps(mem(rax, -12*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm7) vfmadd231ps(xmm2, xmm0, xmm8) vfmadd231ps(xmm3, xmm0, xmm9) vmovaps(mem(rax, -8*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm10) vfmadd231ps(xmm2, xmm0, xmm11) vfmadd231ps(xmm3, xmm0, xmm12) vmovaps(mem(rax, -4*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm13) vbroadcastss(mem(rbx, -6*4), xmm1) vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, -5*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) // iteration 2 vmovaps(mem(rax, 0*4), xmm0) vbroadcastss(mem(rbx, -4*4), xmm3) prefetch(0, mem(rax, 128+384)) vfmadd231ps(xmm1, xmm0, xmm4) vfmadd231ps(xmm2, xmm0, xmm5) vfmadd231ps(xmm3, xmm0, xmm6) vmovaps(mem(rax, 4*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm7) vfmadd231ps(xmm2, xmm0, xmm8) vfmadd231ps(xmm3, xmm0, xmm9) vmovaps(mem(rax, 8*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm10) vfmadd231ps(xmm2, xmm0, xmm11) vfmadd231ps(xmm3, xmm0, xmm12) vmovaps(mem(rax, 12*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm13) vbroadcastss(mem(rbx, -3*4), xmm1) vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, -2*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) // iteration 3 vmovaps(mem(rax, 16*4), xmm0) vbroadcastss(mem(rbx, -1*4), xmm3) 
prefetch(0, mem(rax, 192+384)) vfmadd231ps(xmm1, xmm0, xmm4) vfmadd231ps(xmm2, xmm0, xmm5) vfmadd231ps(xmm3, xmm0, xmm6) vmovaps(mem(rax, 20*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm7) vfmadd231ps(xmm2, xmm0, xmm8) vfmadd231ps(xmm3, xmm0, xmm9) vmovaps(mem(rax, 24*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm10) vfmadd231ps(xmm2, xmm0, xmm11) vfmadd231ps(xmm3, xmm0, xmm12) vmovaps(mem(rax, 28*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm13) vbroadcastss(mem(rbx, 0*4), xmm1) vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, 1*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) add(imm(4*16*4), rax) // a += 4*16 (unroll x mr) // iteration 4 vmovaps(mem(rax, -32*4), xmm0) vbroadcastss(mem(rbx, 2*4), xmm3) prefetch(0, mem(rax, 384)) vfmadd231ps(xmm1, xmm0, xmm4) vfmadd231ps(xmm2, xmm0, xmm5) vfmadd231ps(xmm3, xmm0, xmm6) vmovaps(mem(rax, -28*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm7) vfmadd231ps(xmm2, xmm0, xmm8) vfmadd231ps(xmm3, xmm0, xmm9) vmovaps(mem(rax, -24*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm10) vfmadd231ps(xmm2, xmm0, xmm11) vfmadd231ps(xmm3, xmm0, xmm12) vmovaps(mem(rax, -20*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm13) vbroadcastss(mem(rbx, 3*4), xmm1) vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, 4*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) prefetch(0, mem(rbx, 80+192)) // prefetch b // iteration 5 vmovaps(mem(rax, -16*4), xmm0) vbroadcastss(mem(rbx, 5*4), xmm3) prefetch(0, mem(rax, 64+384)) vfmadd231ps(xmm1, xmm0, xmm4) vfmadd231ps(xmm2, xmm0, xmm5) vfmadd231ps(xmm3, xmm0, xmm6) vmovaps(mem(rax, -12*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm7) vfmadd231ps(xmm2, xmm0, xmm8) vfmadd231ps(xmm3, xmm0, xmm9) vmovaps(mem(rax, -8*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm10) vfmadd231ps(xmm2, xmm0, xmm11) vfmadd231ps(xmm3, xmm0, xmm12) vmovaps(mem(rax, -4*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm13) vbroadcastss(mem(rbx, 6*4), xmm1) vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, 7*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) // iteration 6 vmovaps(mem(rax, 0*4), xmm0) vbroadcastss(mem(rbx, 8*4), xmm3) 
prefetch(0, mem(rax, 128+384)) vfmadd231ps(xmm1, xmm0, xmm4) vfmadd231ps(xmm2, xmm0, xmm5) vfmadd231ps(xmm3, xmm0, xmm6) vmovaps(mem(rax, 4*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm7) vfmadd231ps(xmm2, xmm0, xmm8) vfmadd231ps(xmm3, xmm0, xmm9) vmovaps(mem(rax, 8*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm10) vfmadd231ps(xmm2, xmm0, xmm11) vfmadd231ps(xmm3, xmm0, xmm12) vmovaps(mem(rax, 12*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm13) vbroadcastss(mem(rbx, 9*4), xmm1) vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, 10*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) // iteration 7 vmovaps(mem(rax, 16*4), xmm0) vbroadcastss(mem(rbx, 11*4), xmm3) add(imm(8*3*4), rbx) // a += 4*3 (unroll x nr) prefetch(0, mem(rax, 192+384)) vfmadd231ps(xmm1, xmm0, xmm4) vfmadd231ps(xmm2, xmm0, xmm5) vfmadd231ps(xmm3, xmm0, xmm6) vmovaps(mem(rax, 20*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm7) vfmadd231ps(xmm2, xmm0, xmm8) vfmadd231ps(xmm3, xmm0, xmm9) vmovaps(mem(rax, 24*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm10) vfmadd231ps(xmm2, xmm0, xmm11) vfmadd231ps(xmm3, xmm0, xmm12) vmovaps(mem(rax, 28*4), xmm0) add(imm(4*16*4), rax) // a += 4*16 (unroll x mr) vfmadd231ps(xmm1, xmm0, xmm13) vbroadcastss(mem(rbx, -12*4), xmm1) vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, -11*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) vbroadcastss(mem(rbx, -10*4), xmm3) dec(rsi) // i -= 1; jmp(.SLOOPKITER) // jump to beginning of loop. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP je(.SPOSTACCUM) // if i == 0, we're done. 
prefetch(0, mem(rbx, 16+192)) // prefetch b // iteration 0 vmovaps(mem(rax, -32*4), xmm0) prefetch(0, mem(rax, 384)) vfmadd231ps(xmm1, xmm0, xmm4) vfmadd231ps(xmm2, xmm0, xmm5) vfmadd231ps(xmm3, xmm0, xmm6) vmovaps(mem(rax, -28*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm7) vfmadd231ps(xmm2, xmm0, xmm8) vfmadd231ps(xmm3, xmm0, xmm9) vmovaps(mem(rax, -24*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm10) vfmadd231ps(xmm2, xmm0, xmm11) vfmadd231ps(xmm3, xmm0, xmm12) vmovaps(mem(rax, -20*4), xmm0) vfmadd231ps(xmm1, xmm0, xmm13) vbroadcastss(mem(rbx, -9*4), xmm1) vfmadd231ps(xmm2, xmm0, xmm14) vbroadcastss(mem(rbx, -8*4), xmm2) vfmadd231ps(xmm3, xmm0, xmm15) vbroadcastss(mem(rbx, -7*4), xmm3) add(imm(1*16*4), rax) // a += 4*16 (unroll x mr) add(imm(1*3*4), rbx) // a += 4*3 (unroll x nr) dec(rsi) // i -= 1; jmp(.SLOOPKLEFT) // jump to beginning of loop. label(.SPOSTACCUM) prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c prefetchw0(mem(r11, 0*8)) // prefetch c + 2*cs_c // xmm4: xmm5: xmm6: // ( ab00 ( ab01 ( ab02 // ab10 ab11 ab12 // ab20 ab21 ab22 // ab30 ) ab31 ) ab32 ) // xmm7: xmm8: xmm9: // ( ab40 ( ab41 ( ab42 // ab50 ab51 ab52 // ab60 ab61 ab62 // ab70 ) ab71 ) ab72 ) // xmm10: xmm11: xmm12: // ( ab80 ( ab01 ( ab02 // ab90 ab11 ab12 // abA0 abA1 abA2 // abB0 ) abB1 ) abB2 ) // xmm13: xmm14: xmm15: // ( abC0 ( abC1 ( abC2 // abD0 abD1 abD2 // abE0 abE1 abE2 // abF0 ) abF1 ) abF2 ) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm2) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm7, xmm7) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm9, xmm9) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm11, xmm11) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm13, xmm13) vmulps(xmm0, xmm14, xmm14) vmulps(xmm0, xmm15, xmm15) prefetch(0, mem(r14)) // 
prefetch a_next prefetch(0, mem(r14, 64)) // prefetch a_next mov(var(rs_c), rsi) // load rs_c lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float) //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c; lea(mem(, rsi, 2), r12) // r12 = 2*rs_c; lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_c; // determine if // c % 32 == 0, AND // 4*cs_c % 32 == 0, AND // rs_c == 1 // ie: aligned, ldim aligned, and // column-stored cmp(imm(4), rsi) // set ZF if (4*rs_c) == 4. sete(bl) // bl = ( ZF == 1 ? 1 : 0 ); test(imm(31), rcx) // set ZF if c & 32 is zero. setz(bh) // bh = ( ZF == 0 ? 1 : 0 ); test(imm(31), rdi) // set ZF if (4*cs_c) & 32 is zero. setz(al) // al = ( ZF == 0 ? 1 : 0 ); // and(bl,bh) followed by // and(bh,al) will reveal result prefetch(0, mem(r15)) // prefetch b_next prefetch(0, mem(r15, 64)) // prefetch b_next // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm2) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case vfmadd231ps(mem(rcx, 0*16), xmm2, xmm4) vfmadd231ps(mem(rcx, 1*16), xmm2, xmm7) vfmadd231ps(mem(rcx, 2*16), xmm2, xmm10) vfmadd231ps(mem(rcx, 3*16), xmm2, xmm13) vfmadd231ps(mem(r10, 0*16), xmm2, xmm5) vfmadd231ps(mem(r10, 1*16), xmm2, xmm8) vfmadd231ps(mem(r10, 2*16), xmm2, xmm11) vfmadd231ps(mem(r10, 3*16), xmm2, xmm14) vfmadd231ps(mem(r11, 0*16), xmm2, xmm6) vfmadd231ps(mem(r11, 1*16), xmm2, xmm9) vfmadd231ps(mem(r11, 2*16), xmm2, xmm12) vfmadd231ps(mem(r11, 3*16), xmm2, xmm15) // fall through label(.SBETAZERO) vmovups(xmm4, mem(rcx, 0*16)) vmovups(xmm7, mem(rcx, 1*16)) vmovups(xmm10, mem(rcx, 2*16)) vmovups(xmm13, mem(rcx, 3*16)) vmovups(xmm5, mem(r10, 0*16)) vmovups(xmm8, mem(r10, 1*16)) vmovups(xmm11, mem(r10, 2*16)) vmovups(xmm14, mem(r10, 3*16)) vmovups(xmm6, mem(r11, 0*16)) vmovups(xmm9, mem(r11, 1*16)) vmovups(xmm12, mem(r11, 2*16)) vmovups(xmm15, mem(r11, 3*16)) label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] 
"m" (k_left), // 1
      [a]      "m" (a),       // 2
      [b]      "m" (b),       // 3
      [alpha]  "m" (alpha),   // 4
      [beta]   "m" (beta),    // 5
      [c]      "m" (c),       // 6
      [rs_c]   "m" (rs_c),    // 7
      [cs_c]   "m" (cs_c),    // 8
      [b_next] "m" (b_next),  // 9
      [a_next] "m" (a_next)   // 10
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )

    GEMM_UKR_FLUSH_CT( s );
}

/*
 * Double-precision real GEMM micro-kernel for an 8x3 tile of C:
 *
 *     C := beta * C + alpha * A * B
 *
 * What the assembly below does:
 *   - a is read 8 doubles (four 16-byte vectors) per k step and b is read
 *     3 doubles per k step; the pointers advance by exactly those amounts.
 *   - The 8x3 product is accumulated in xmm4..xmm15 (see the register map
 *     after .DPOSTACCUM), scaled by *alpha, and then combined with
 *     beta * C unless beta compares equal to zero, in which case C is
 *     never loaded.
 *   - C is addressed as three columns (rcx, r10, r11) spaced cs_c doubles
 *     apart, each column read/written as four contiguous 16-byte vectors.
 *     The assembly never reads rs_c, so it assumes unit row stride.
 *   - k is split into k_iter = k / 8 passes of the 8x-unrolled main loop
 *     plus k_left = k % 8 passes of the single-step edge loop.
 *   - a_next / b_next (taken from the auxinfo object) are only prefetched.
 *
 * NOTE(review): m, n and cntx are not referenced by the visible body;
 * presumably GEMM_UKR_SETUP_CT / GEMM_UKR_FLUSH_CT (defined elsewhere)
 * use some of them -- confirm against the macro definitions.
 */
void bli_dgemm_piledriver_asm_8x3
     (
       dim_t               m,
       dim_t               n,
       dim_t               k,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    void* a_next = bli_auxinfo_next_a( data );
    void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k / 8;
    uint64_t k_left = k % 8;
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;

    GEMM_UKR_SETUP_CT( d, 8, 3, false );

    begin_asm()

    mov(var(a), rax) // load address of a.
    mov(var(b), rbx) // load address of b.
    mov(var(b_next), r15) // load address of b_next.
    mov(var(a_next), r14) // load address of a_next.

    prefetch(0, mem(rbx, 128)) // prefetch b
    prefetch(0, mem(rbx, 64+128)) // prefetch b
    prefetch(0, mem(rbx, 128+128)) // prefetch b

    // Bias both panel pointers. All loads below are expressed relative to
    // the biased pointers, using displacements of -128..+112 bytes for a
    // and -96..+88 bytes for b.
    add(imm(16*8), rax)
    add(imm(12*8), rbx)

    mov(var(c), rcx) // load address of c
    mov(var(cs_c), rdi) // load cs_c
    lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double)
    lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c;
    lea(mem(rcx, rdi, 2), r11) // load address of c + 2*cs_c;

    // Preload the three b values of the first k step, each duplicated
    // into both lanes.
    vmovddup(mem(rbx, -12*8), xmm1)
    vmovddup(mem(rbx, -11*8), xmm2)
    vmovddup(mem(rbx, -10*8), xmm3)

    // Zero the twelve accumulators.
    vxorpd(xmm4, xmm4, xmm4)
    vxorpd(xmm5, xmm5, xmm5)
    vxorpd(xmm6, xmm6, xmm6)
    vxorpd(xmm7, xmm7, xmm7)
    vxorpd(xmm8, xmm8, xmm8)
    vxorpd(xmm9, xmm9, xmm9)
    vxorpd(xmm10, xmm10, xmm10)
    vxorpd(xmm11, xmm11, xmm11)
    vxorpd(xmm12, xmm12, xmm12)
    vxorpd(xmm13, xmm13, xmm13)
    vxorpd(xmm14, xmm14, xmm14)
    vxorpd(xmm15, xmm15, xmm15)

    mov(var(k_iter), rsi) // i = k_iter;
    test(rsi, rsi) // check i via logical AND.
    je(.DCONSIDKLEFT) // if i == 0, jump to code that
                      // contains the k_left loop.

    label(.DLOOPKITER) // MAIN LOOP

    // On re-entry the flags come from the dec(rsi) that precedes the
    // jmp back here, so this je is the loop-termination test.
    je(.DCONSIDKLEFT) // if i == 0, jump to k_left code.

    prefetch(0, mem(rbx, -32+256)) // prefetch b
    prefetch(0, mem(rbx, 32+256)) // prefetch b

    // iteration 0
    vmovaps(mem(rax, -8*16), xmm0)
    prefetch(0, mem(rax, 384)) // prefetch a
    vfmadd231pd(xmm1, xmm0, xmm4)
    vfmadd231pd(xmm2, xmm0, xmm5)
    vfmadd231pd(xmm3, xmm0, xmm6)
    vmovaps(mem(rax, -7*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm7)
    vfmadd231pd(xmm2, xmm0, xmm8)
    vfmadd231pd(xmm3, xmm0, xmm9)
    vmovaps(mem(rax, -6*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm10)
    vfmadd231pd(xmm2, xmm0, xmm11)
    vfmadd231pd(xmm3, xmm0, xmm12)
    vmovaps(mem(rax, -5*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm13)
    vmovddup(mem(rbx, -9*8), xmm1) // b values for the next k step are
    vfmadd231pd(xmm2, xmm0, xmm14) // reloaded as soon as the current
    vmovddup(mem(rbx, -8*8), xmm2) // ones have been consumed.
    vfmadd231pd(xmm3, xmm0, xmm15)

    // iteration 1
    vmovaps(mem(rax, -4*16), xmm0)
    prefetch(0, mem(rax, 64+384)) // prefetch a
    vmovddup(mem(rbx, -7*8), xmm3)
    vfmadd231pd(xmm1, xmm0, xmm4)
    vfmadd231pd(xmm2, xmm0, xmm5)
    vfmadd231pd(xmm3, xmm0, xmm6)
    vmovaps(mem(rax, -3*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm7)
    vfmadd231pd(xmm2, xmm0, xmm8)
    vfmadd231pd(xmm3, xmm0, xmm9)
    vmovaps(mem(rax, -2*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm10)
    vfmadd231pd(xmm2, xmm0, xmm11)
    vfmadd231pd(xmm3, xmm0, xmm12)
    vmovaps(mem(rax, -1*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm13)
    vmovddup(mem(rbx, -6*8), xmm1)
    vfmadd231pd(xmm2, xmm0, xmm14)
    vmovddup(mem(rbx, -5*8), xmm2)
    vfmadd231pd(xmm3, xmm0, xmm15)

    // iteration 2
    vmovaps(mem(rax, 0*16), xmm0)
    prefetch(0, mem(rax, 128+384)) // prefetch a
    vmovddup(mem(rbx, -4*8), xmm3)
    vfmadd231pd(xmm1, xmm0, xmm4)
    vfmadd231pd(xmm2, xmm0, xmm5)
    vfmadd231pd(xmm3, xmm0, xmm6)
    vmovaps(mem(rax, 1*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm7)
    vfmadd231pd(xmm2, xmm0, xmm8)
    vfmadd231pd(xmm3, xmm0, xmm9)
    vmovaps(mem(rax, 2*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm10)
    vfmadd231pd(xmm2, xmm0, xmm11)
    vfmadd231pd(xmm3, xmm0, xmm12)
    vmovaps(mem(rax, 3*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm13)
    vmovddup(mem(rbx, -3*8), xmm1)
    vfmadd231pd(xmm2, xmm0, xmm14)
    vmovddup(mem(rbx, -2*8), xmm2)
    vfmadd231pd(xmm3, xmm0, xmm15)

    // iteration 3
    vmovaps(mem(rax, 4*16), xmm0)
    prefetch(0, mem(rax, 192+384)) // prefetch a
    vmovddup(mem(rbx, -1*8), xmm3)
    vfmadd231pd(xmm1, xmm0, xmm4)
    vfmadd231pd(xmm2, xmm0, xmm5)
    vfmadd231pd(xmm3, xmm0, xmm6)
    vmovaps(mem(rax, 5*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm7)
    vfmadd231pd(xmm2, xmm0, xmm8)
    vfmadd231pd(xmm3, xmm0, xmm9)
    vmovaps(mem(rax, 6*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm10)
    vfmadd231pd(xmm2, xmm0, xmm11)
    vfmadd231pd(xmm3, xmm0, xmm12)
    vmovaps(mem(rax, 7*16), xmm0)
    add(imm(4*8*8), rax) // a += 4*8 (unroll x mr)
    vfmadd231pd(xmm1, xmm0, xmm13)
    vmovddup(mem(rbx, 0*8), xmm1)
    vfmadd231pd(xmm2, xmm0, xmm14)
    vmovddup(mem(rbx, 1*8), xmm2)
    vfmadd231pd(xmm3, xmm0, xmm15)

    // iteration 4
    // (a was advanced half-way through the unrolled body, so the a
    // displacements restart at -8*16 here.)
    vmovaps(mem(rax, -8*16), xmm0)
    prefetch(0, mem(rax, 384)) // prefetch a
    vmovddup(mem(rbx, 2*8), xmm3)
    vfmadd231pd(xmm1, xmm0, xmm4)
    vfmadd231pd(xmm2, xmm0, xmm5)
    vfmadd231pd(xmm3, xmm0, xmm6)
    vmovaps(mem(rax, -7*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm7)
    vfmadd231pd(xmm2, xmm0, xmm8)
    vfmadd231pd(xmm3, xmm0, xmm9)
    vmovaps(mem(rax, -6*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm10)
    vfmadd231pd(xmm2, xmm0, xmm11)
    vfmadd231pd(xmm3, xmm0, xmm12)
    vmovaps(mem(rax, -5*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm13)
    vmovddup(mem(rbx, 3*8), xmm1)
    vfmadd231pd(xmm2, xmm0, xmm14)
    vmovddup(mem(rbx, 4*8), xmm2)
    vfmadd231pd(xmm3, xmm0, xmm15)

    prefetch(0, mem(rbx, 96+256)) // prefetch b

    // iteration 5
    vmovaps(mem(rax, -4*16), xmm0)
    prefetch(0, mem(rax, 64+384)) // prefetch a
    vmovddup(mem(rbx, 5*8), xmm3)
    vfmadd231pd(xmm1, xmm0, xmm4)
    vfmadd231pd(xmm2, xmm0, xmm5)
    vfmadd231pd(xmm3, xmm0, xmm6)
    vmovaps(mem(rax, -3*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm7)
    vfmadd231pd(xmm2, xmm0, xmm8)
    vfmadd231pd(xmm3, xmm0, xmm9)
    vmovaps(mem(rax, -2*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm10)
    vfmadd231pd(xmm2, xmm0, xmm11)
    vfmadd231pd(xmm3, xmm0, xmm12)
    vmovaps(mem(rax, -1*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm13)
    vmovddup(mem(rbx, 6*8), xmm1)
    vfmadd231pd(xmm2, xmm0, xmm14)
    vmovddup(mem(rbx, 7*8), xmm2)
    vfmadd231pd(xmm3, xmm0, xmm15)

    // iteration 6
    vmovaps(mem(rax, 0*16), xmm0)
    prefetch(0, mem(rax, 128+384)) // prefetch a
    vmovddup(mem(rbx, 8*8), xmm3)
    vfmadd231pd(xmm1, xmm0, xmm4)
    vfmadd231pd(xmm2, xmm0, xmm5)
    vfmadd231pd(xmm3, xmm0, xmm6)
    vmovaps(mem(rax, 1*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm7)
    vfmadd231pd(xmm2, xmm0, xmm8)
    vfmadd231pd(xmm3, xmm0, xmm9)
    vmovaps(mem(rax, 2*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm10)
    vfmadd231pd(xmm2, xmm0, xmm11)
    vfmadd231pd(xmm3, xmm0, xmm12)
    vmovaps(mem(rax, 3*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm13)
    vmovddup(mem(rbx, 9*8), xmm1)
    vfmadd231pd(xmm2, xmm0, xmm14)
    vmovddup(mem(rbx, 10*8), xmm2)
    vfmadd231pd(xmm3, xmm0, xmm15)

    // iteration 7
    vmovaps(mem(rax, 4*16), xmm0)
    prefetch(0, mem(rax, 192+384)) // prefetch a
    vmovddup(mem(rbx, 11*8), xmm3)
    add(imm(8*3*8), rbx) // b += 8*3 (unroll x nr)
    vfmadd231pd(xmm1, xmm0, xmm4)
    vfmadd231pd(xmm2, xmm0, xmm5)
    vfmadd231pd(xmm3, xmm0, xmm6)
    vmovaps(mem(rax, 5*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm7)
    vfmadd231pd(xmm2, xmm0, xmm8)
    vfmadd231pd(xmm3, xmm0, xmm9)
    vmovaps(mem(rax, 6*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm10)
    vfmadd231pd(xmm2, xmm0, xmm11)
    vfmadd231pd(xmm3, xmm0, xmm12)
    vmovaps(mem(rax, 7*16), xmm0)
    add(imm(4*8*8), rax) // a += 4*8 (unroll x mr)
    vfmadd231pd(xmm1, xmm0, xmm13)
    vmovddup(mem(rbx, -12*8), xmm1) // b was already advanced, so these
    vfmadd231pd(xmm2, xmm0, xmm14)  // preload the first k step of the
    vmovddup(mem(rbx, -11*8), xmm2) // next pass (or of the edge loop).
    vfmadd231pd(xmm3, xmm0, xmm15)
    vmovddup(mem(rbx, -10*8), xmm3)

    // dec must stay the last flag-writing instruction before the jmp:
    // the je at the top of the loop consumes its ZF.
    dec(rsi) // i -= 1;
    jmp(.DLOOPKITER) // jump to beginning of loop.

    label(.DCONSIDKLEFT)

    mov(var(k_left), rsi) // i = k_left;
    test(rsi, rsi) // check i via logical AND.
    je(.DPOSTACCUM) // if i == 0, we're done.
                    // else, we prepare to
                    // enter k_left loop.

    label(.DLOOPKLEFT) // EDGE LOOP

    // Same convention as the main loop: ZF comes from the test above on
    // first entry and from dec(rsi) on every later pass.
    je(.DPOSTACCUM) // if i == 0, we're done.

    // iteration 0
    vmovaps(mem(rax, -8*16), xmm0)
    prefetch(0, mem(rax, 512)) // prefetch a
    vfmadd231pd(xmm1, xmm0, xmm4)
    vfmadd231pd(xmm2, xmm0, xmm5)
    vfmadd231pd(xmm3, xmm0, xmm6)
    vmovaps(mem(rax, -7*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm7)
    vfmadd231pd(xmm2, xmm0, xmm8)
    vfmadd231pd(xmm3, xmm0, xmm9)
    vmovaps(mem(rax, -6*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm10)
    vfmadd231pd(xmm2, xmm0, xmm11)
    vfmadd231pd(xmm3, xmm0, xmm12)
    vmovaps(mem(rax, -5*16), xmm0)
    vfmadd231pd(xmm1, xmm0, xmm13)
    vmovddup(mem(rbx, -9*8), xmm1)
    vfmadd231pd(xmm2, xmm0, xmm14)
    vmovddup(mem(rbx, -8*8), xmm2)
    vfmadd231pd(xmm3, xmm0, xmm15)
    vmovddup(mem(rbx, -7*8), xmm3)

    add(imm(1*8*8), rax) // a += 1*8 (1 x mr)
    add(imm(1*3*8), rbx) // b += 1*3 (1 x nr)

    dec(rsi) // i -= 1;
    jmp(.DLOOPKLEFT) // jump to beginning of loop.

    label(.DPOSTACCUM)

    prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c
    prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c
    prefetchw0(mem(r11, 0*8)) // prefetch c + 2*cs_c

    // Accumulator layout (one column of the tile per register column):
    //
    // xmm4:   xmm5:   xmm6:
    // ( ab00  ( ab01  ( ab02
    //   ab10 )  ab11 )  ab12 )
    //
    // xmm7:   xmm8:   xmm9:
    // ( ab20  ( ab21  ( ab22
    //   ab30 )  ab31 )  ab32 )
    //
    // xmm10:  xmm11:  xmm12:
    // ( ab40  ( ab41  ( ab42
    //   ab50 )  ab51 )  ab52 )
    //
    // xmm13:  xmm14:  xmm15:
    // ( ab60  ( ab61  ( ab62
    //   ab70 )  ab71 )  ab72 )

    mov(var(alpha), rax) // load address of alpha
    mov(var(beta), rbx) // load address of beta
    vmovddup(mem(rax), xmm0) // load alpha and duplicate
    vmovddup(mem(rbx), xmm2) // load beta and duplicate

    vmulpd(xmm0, xmm4, xmm4) // scale by alpha
    vmulpd(xmm0, xmm5, xmm5)
    vmulpd(xmm0, xmm6, xmm6)
    vmulpd(xmm0, xmm7, xmm7)
    vmulpd(xmm0, xmm8, xmm8)
    vmulpd(xmm0, xmm9, xmm9)
    vmulpd(xmm0, xmm10, xmm10)
    vmulpd(xmm0, xmm11, xmm11)
    vmulpd(xmm0, xmm12, xmm12)
    vmulpd(xmm0, xmm13, xmm13)
    vmulpd(xmm0, xmm14, xmm14)
    vmulpd(xmm0, xmm15, xmm15)

    prefetch(0, mem(r14)) // prefetch a_next
    prefetch(0, mem(r14, 64)) // prefetch a_next

    prefetch(0, mem(r15)) // prefetch b_next
    prefetch(0, mem(r15, 64)) // prefetch b_next

    // now avoid loading C if beta == 0
    // (alpha in xmm0 is no longer needed, so xmm0 is reused as the zero
    // operand of the comparison.)
    vxorpd(xmm0, xmm0,
xmm0) // set xmm0 to zero. vucomisd(xmm0, xmm2) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case // xmm4: xmm5: xmm6: // ( ab00 ( ab01 ( ab02 // ab10 ) ab11 ) ab12 ) // // xmm7: xmm8: xmm9: // ( ab20 ( ab21 ( ab22 // ab30 ) ab31 ) ab32 ) // // xmm10: xmm11: xmm12: // ( ab40 ( ab41 ( ab42 // ab50 ) ab51 ) ab52 ) // // xmm13: xmm14: xmm15: // ( ab60 ( ab61 ( ab62 // ab70 ) ab71 ) ab72 ) vfmadd231pd(mem(rcx, 0*16), xmm2, xmm4) vfmadd231pd(mem(rcx, 1*16), xmm2, xmm7) vfmadd231pd(mem(rcx, 2*16), xmm2, xmm10) vfmadd231pd(mem(rcx, 3*16), xmm2, xmm13) vfmadd231pd(mem(r10, 0*16), xmm2, xmm5) vfmadd231pd(mem(r10, 1*16), xmm2, xmm8) vfmadd231pd(mem(r10, 2*16), xmm2, xmm11) vfmadd231pd(mem(r10, 3*16), xmm2, xmm14) vfmadd231pd(mem(r11, 0*16), xmm2, xmm6) vfmadd231pd(mem(r11, 1*16), xmm2, xmm9) vfmadd231pd(mem(r11, 2*16), xmm2, xmm12) vfmadd231pd(mem(r11, 3*16), xmm2, xmm15) // fall through label(.DBETAZERO) vmovups(xmm4, mem(rcx, 0*16)) vmovups(xmm7, mem(rcx, 1*16)) vmovups(xmm10, mem(rcx, 2*16)) vmovups(xmm13, mem(rcx, 3*16)) vmovups(xmm5, mem(r10, 0*16)) vmovups(xmm8, mem(r10, 1*16)) vmovups(xmm11, mem(r10, 2*16)) vmovups(xmm14, mem(r10, 3*16)) vmovups(xmm6, mem(r11, 0*16)) vmovups(xmm9, mem(r11, 1*16)) vmovups(xmm12, mem(r11, 2*16)) vmovups(xmm15, mem(r11, 3*16)) label(.DDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a] "m" (a), // 2 [b] "m" (b), // 3 [alpha] "m" (alpha), // 4 [beta] "m" (beta), // 5 [c] "m" (c), // 6 [rs_c] "m" (rs_c), // 7 [cs_c] "m" (cs_c), // 8 [b_next] "m" (b_next), // 9 [a_next] "m" (a_next) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMM_UKR_FLUSH_CT( d ); } void bli_cgemm_piledriver_asm_4x2 ( dim_t m, dim_t n, dim_t k, 
scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k / 8; uint64_t k_left = k % 8; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT( c, 4, 2, false ); begin_asm() mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. mov(var(b_next), r15) // load address of b_next. mov(var(a_next), r14) // load address of a_next. mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex) lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c; add(imm(32*4), rax) add(imm(16*4), rbx) vxorps(xmm8, xmm8, xmm8) vxorps(xmm9, xmm9, xmm9) vxorps(xmm10, xmm10, xmm10) vxorps(xmm11, xmm11, xmm11) vxorps(xmm12, xmm12, xmm12) vxorps(xmm13, xmm13, xmm13) vxorps(xmm14, xmm14, xmm14) vxorps(xmm15, xmm15, xmm15) //vzeroall() mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.CLOOPKITER) // MAIN LOOP je(.CCONSIDKLEFT) // if i == 0, jump to k_left code. 
prefetch(0, mem(rbx, 256)) prefetch(0, mem(rax, 512)) // iteration 0 vmovaps(mem(rax, -32*4), xmm0) vbroadcastss(mem(rbx, -16*4), xmm4) vfmadd231ps(xmm0, xmm4, xmm8) vmovaps(mem(rax, -28*4), xmm1) vfmadd231ps(xmm1, xmm4, xmm12) vbroadcastss(mem(rbx, -15*4), xmm5) vfmadd231ps(xmm0, xmm5, xmm9) vfmadd231ps(xmm1, xmm5, xmm13) vbroadcastss(mem(rbx, -14*4), xmm6) vfmadd231ps(xmm0, xmm6, xmm10) vfmadd231ps(xmm1, xmm6, xmm14) vbroadcastss(mem(rbx, -13*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) // iteration 1 vmovaps(mem(rax, -24*4), xmm0) vbroadcastss(mem(rbx, -12*4), xmm4) vfmadd231ps(xmm0, xmm4, xmm8) vmovaps(mem(rax, -20*4), xmm1) vfmadd231ps(xmm1, xmm4, xmm12) vbroadcastss(mem(rbx, -11*4), xmm5) vfmadd231ps(xmm0, xmm5, xmm9) vfmadd231ps(xmm1, xmm5, xmm13) vbroadcastss(mem(rbx, -10*4), xmm6) vfmadd231ps(xmm0, xmm6, xmm10) vfmadd231ps(xmm1, xmm6, xmm14) vbroadcastss(mem(rbx, -9*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) prefetch(0, mem(rbx, 64+256)) prefetch(0, mem(rax, 64+512)) // iteration 2 vmovaps(mem(rax, -16*4), xmm0) vbroadcastss(mem(rbx, -8*4), xmm4) vfmadd231ps(xmm0, xmm4, xmm8) vmovaps(mem(rax, -12*4), xmm1) vfmadd231ps(xmm1, xmm4, xmm12) vbroadcastss(mem(rbx, -7*4), xmm5) vfmadd231ps(xmm0, xmm5, xmm9) vfmadd231ps(xmm1, xmm5, xmm13) vbroadcastss(mem(rbx, -6*4), xmm6) vfmadd231ps(xmm0, xmm6, xmm10) vfmadd231ps(xmm1, xmm6, xmm14) vbroadcastss(mem(rbx, -5*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) // iteration 3 vmovaps(mem(rax, -8*4), xmm0) vbroadcastss(mem(rbx, -4*4), xmm4) vfmadd231ps(xmm0, xmm4, xmm8) vmovaps(mem(rax, -4*4), xmm1) vfmadd231ps(xmm1, xmm4, xmm12) vbroadcastss(mem(rbx, -3*4), xmm5) vfmadd231ps(xmm0, xmm5, xmm9) vfmadd231ps(xmm1, xmm5, xmm13) vbroadcastss(mem(rbx, -2*4), xmm6) vfmadd231ps(xmm0, xmm6, xmm10) vfmadd231ps(xmm1, xmm6, xmm14) vbroadcastss(mem(rbx, -1*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) prefetch(0, mem(rbx, 
128+256)) prefetch(0, mem(rax, 128+512)) // iteration 4 vmovaps(mem(rax, 0*4), xmm0) vbroadcastss(mem(rbx, 0*4), xmm4) vfmadd231ps(xmm0, xmm4, xmm8) vmovaps(mem(rax, 4*4), xmm1) vfmadd231ps(xmm1, xmm4, xmm12) vbroadcastss(mem(rbx, 1*4), xmm5) vfmadd231ps(xmm0, xmm5, xmm9) vfmadd231ps(xmm1, xmm5, xmm13) vbroadcastss(mem(rbx, 2*4), xmm6) vfmadd231ps(xmm0, xmm6, xmm10) vfmadd231ps(xmm1, xmm6, xmm14) vbroadcastss(mem(rbx, 3*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) // iteration 5 vmovaps(mem(rax, 8*4), xmm0) vbroadcastss(mem(rbx, 4*4), xmm4) vfmadd231ps(xmm0, xmm4, xmm8) vmovaps(mem(rax, 12*4), xmm1) vfmadd231ps(xmm1, xmm4, xmm12) vbroadcastss(mem(rbx, 5*4), xmm5) vfmadd231ps(xmm0, xmm5, xmm9) vfmadd231ps(xmm1, xmm5, xmm13) vbroadcastss(mem(rbx, 6*4), xmm6) vfmadd231ps(xmm0, xmm6, xmm10) vfmadd231ps(xmm1, xmm6, xmm14) vbroadcastss(mem(rbx, 7*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) prefetch(0, mem(rbx, 128+256)) prefetch(0, mem(rax, 128+512)) // iteration 6 vmovaps(mem(rax, 16*4), xmm0) vbroadcastss(mem(rbx, 8*4), xmm4) vfmadd231ps(xmm0, xmm4, xmm8) vmovaps(mem(rax, 20*4), xmm1) vfmadd231ps(xmm1, xmm4, xmm12) vbroadcastss(mem(rbx, 9*4), xmm5) vfmadd231ps(xmm0, xmm5, xmm9) vfmadd231ps(xmm1, xmm5, xmm13) vbroadcastss(mem(rbx, 10*4), xmm6) vfmadd231ps(xmm0, xmm6, xmm10) vfmadd231ps(xmm1, xmm6, xmm14) vbroadcastss(mem(rbx, 11*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) // iteration 7 vmovaps(mem(rax, 24*4), xmm0) vbroadcastss(mem(rbx, 12*4), xmm4) vfmadd231ps(xmm0, xmm4, xmm8) vmovaps(mem(rax, 28*4), xmm1) add(imm(8*4*8), rax) // a += 8*2 (unroll x mr) vfmadd231ps(xmm1, xmm4, xmm12) vbroadcastss(mem(rbx, 13*4), xmm5) vfmadd231ps(xmm0, xmm5, xmm9) vfmadd231ps(xmm1, xmm5, xmm13) vbroadcastss(mem(rbx, 14*4), xmm6) vfmadd231ps(xmm0, xmm6, xmm10) vfmadd231ps(xmm1, xmm6, xmm14) vbroadcastss(mem(rbx, 15*4), xmm7) add(imm(8*2*8), rbx) // b += 8*2 (unroll x nr) vfmadd231ps(xmm0, xmm7, xmm11) 
vfmadd231ps(xmm1, xmm7, xmm15) dec(rsi) // i -= 1; jmp(.CLOOPKITER) // jump to beginning of loop. label(.CCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.CPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.CLOOPKLEFT) // EDGE LOOP je(.CPOSTACCUM) // if i == 0, we're done. prefetch(0, mem(rbx, 256)) prefetch(0, mem(rax, 512)) // iteration 0 vmovaps(mem(rax, -32*4), xmm0) vbroadcastss(mem(rbx, -16*4), xmm4) vfmadd231ps(xmm0, xmm4, xmm8) vmovaps(mem(rax, -28*4), xmm1) vfmadd231ps(xmm1, xmm4, xmm12) vbroadcastss(mem(rbx, -15*4), xmm5) vfmadd231ps(xmm0, xmm5, xmm9) vfmadd231ps(xmm1, xmm5, xmm13) vbroadcastss(mem(rbx, -14*4), xmm6) vfmadd231ps(xmm0, xmm6, xmm10) vfmadd231ps(xmm1, xmm6, xmm14) vbroadcastss(mem(rbx, -13*4), xmm7) vfmadd231ps(xmm0, xmm7, xmm11) vfmadd231ps(xmm1, xmm7, xmm15) add(imm(1*4*8), rax) // a += 1*2 (1 x mr) add(imm(1*2*8), rbx) // b += 1*2 (1 x nr) dec(rsi) // i -= 1; jmp(.CLOOPKLEFT) // jump to beginning of loop. 
label(.CPOSTACCUM) prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c vpermilps(imm(0xb1), xmm9, xmm9) vpermilps(imm(0xb1), xmm11, xmm11) vpermilps(imm(0xb1), xmm13, xmm13) vpermilps(imm(0xb1), xmm15, xmm15) vaddsubps(xmm9, xmm8, xmm8) vaddsubps(xmm11, xmm10, xmm10) vaddsubps(xmm13, xmm12, xmm12) vaddsubps(xmm15, xmm14, xmm14) // xmm8: xmm10: // ( ab00 ( ab01 // ab10 ab11 // ab20 ab21 // ab30 ) ab31 ) // xmm12: xmm14: // ( ab40 ( ab41 // ab50 ab51 // ab60 ab61 // ab70 ) ab71 ) prefetch(0, mem(r14)) // prefetch a_next prefetch(0, mem(r14, 64)) // prefetch a_next // scale by alpha mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), xmm0) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), xmm1) // load alpha_i and duplicate vpermilps(imm(0xb1), xmm8, xmm9) vpermilps(imm(0xb1), xmm10, xmm11) vpermilps(imm(0xb1), xmm12, xmm13) vpermilps(imm(0xb1), xmm14, xmm15) vmulps(xmm8, xmm0, xmm8) vmulps(xmm10, xmm0, xmm10) vmulps(xmm12, xmm0, xmm12) vmulps(xmm14, xmm0, xmm14) vmulps(xmm9, xmm1, xmm9) vmulps(xmm11, xmm1, xmm11) vmulps(xmm13, xmm1, xmm13) vmulps(xmm15, xmm1, xmm15) vaddsubps(xmm9, xmm8, xmm8) vaddsubps(xmm11, xmm10, xmm10) vaddsubps(xmm13, xmm12, xmm12) vaddsubps(xmm15, xmm14, xmm14) mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), xmm6) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), xmm7) // load beta_i and duplicate prefetch(0, mem(r15)) // prefetch b_next prefetch(0, mem(r15, 64)) // prefetch b_next // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm6) // set ZF if beta_r == 0. sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); vucomiss(xmm0, xmm7) // set ZF if beta_i == 0. sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // set ZF if r8b & r9b == 1. 
jne(.CBETAZERO) // if ZF = 0, jump to beta == 0 case vmovups(mem(rcx), xmm0) // load c00:c10 vmovups(mem(rcx, 16), xmm2) // load c20:c30 vpermilps(imm(0xb1), xmm0, xmm1) vpermilps(imm(0xb1), xmm2, xmm3) vmulps(xmm6, xmm0, xmm0) vmulps(xmm7, xmm1, xmm1) vaddsubps(xmm1, xmm0, xmm0) vaddps(xmm8, xmm0, xmm0) vmulps(xmm6, xmm2, xmm2) vmulps(xmm7, xmm3, xmm3) vaddsubps(xmm3, xmm2, xmm2) vaddps(xmm12, xmm2, xmm2) vmovups(mem(r10), xmm0) // load c01:c11 vmovups(mem(r10, 16), xmm2) // load c21:c31 vpermilps(imm(0xb1), xmm0, xmm1) vpermilps(imm(0xb1), xmm2, xmm3) vmulps(xmm6, xmm0, xmm0) vmulps(xmm7, xmm1, xmm1) vaddsubps(xmm1, xmm0, xmm0) vaddps(xmm10, xmm0, xmm0) vmulps(xmm6, xmm2, xmm2) vmulps(xmm7, xmm3, xmm3) vaddsubps(xmm3, xmm2, xmm2) vaddps(xmm14, xmm2, xmm2) // fall through label(.CBETAZERO) vmovups(xmm8, mem(rcx)) // store c00:c10 vmovups(xmm12, mem(rcx, 16)) // store c20:c30 vmovups(xmm10, mem(r10)) // store c01:c11 vmovups(xmm14, mem(r10, 16)) // store c21:c31 label(.CDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a] "m" (a), // 2 [b] "m" (b), // 3 [alpha] "m" (alpha), // 4 [beta] "m" (beta), // 5 [c] "m" (c), // 6 [rs_c] "m" (rs_c), // 7 [cs_c] "m" (cs_c), // 8 [b_next] "m" (b_next), // 9 [a_next] "m" (a_next) // 10 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMM_UKR_FLUSH_CT( c ); } void bli_zgemm_piledriver_asm_2x2 ( dim_t m, dim_t n, dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case 
dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k / 8; uint64_t k_left = k % 8; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; GEMM_UKR_SETUP_CT( z, 2, 2, false ); begin_asm() mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. mov(var(b_next), r15) // load address of b_next. mov(var(a_next), r14) // load address of a_next. mov(var(c), rcx) // load address of c mov(var(cs_c), rdi) // load cs_c lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(dcomplex) lea(mem(, rdi, 2), rdi) lea(mem(rcx, rdi, 1), r10) // load address of c + 1*cs_c; add(imm(16*8), rax) add(imm(16*8), rbx) vxorpd(xmm8, xmm8, xmm8) vxorpd(xmm9, xmm9, xmm9) vxorpd(xmm10, xmm10, xmm10) vxorpd(xmm11, xmm11, xmm11) vxorpd(xmm12, xmm12, xmm12) vxorpd(xmm13, xmm13, xmm13) vxorpd(xmm14, xmm14, xmm14) vxorpd(xmm15, xmm15, xmm15) mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.ZCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.ZLOOPKITER) // MAIN LOOP je(.ZCONSIDKLEFT) // if i == 0, jump to k_left code. 
prefetch(0, mem(rbx, 256)) prefetch(0, mem(rax, 512)) // iteration 0 vmovaps(mem(rax, -16*8), xmm0) vmovddup(mem(rbx, -16*8), xmm4) vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, -14*8), xmm1) vfmadd231pd(xmm1, xmm4, xmm12) vmovddup(mem(rbx, -15*8), xmm5) vfmadd231pd(xmm0, xmm5, xmm9) vfmadd231pd(xmm1, xmm5, xmm13) vmovddup(mem(rbx, -14*8), xmm6) vfmadd231pd(xmm0, xmm6, xmm10) vfmadd231pd(xmm1, xmm6, xmm14) vmovddup(mem(rbx, -13*8), xmm7) vfmadd231pd(xmm0, xmm7, xmm11) vmovaps(mem(rax, -12*8), xmm0) vmovddup(mem(rbx, -12*8), xmm4) vfmadd231pd(xmm1, xmm7, xmm15) // iteration 1 vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, -10*8), xmm1) vfmadd231pd(xmm1, xmm4, xmm12) vmovddup(mem(rbx, -11*8), xmm5) vfmadd231pd(xmm0, xmm5, xmm9) vfmadd231pd(xmm1, xmm5, xmm13) vmovddup(mem(rbx, -10*8), xmm6) vfmadd231pd(xmm0, xmm6, xmm10) vfmadd231pd(xmm1, xmm6, xmm14) vmovddup(mem(rbx, -9*8), xmm7) vfmadd231pd(xmm0, xmm7, xmm11) vmovaps(mem(rax, -8*8), xmm0) vmovddup(mem(rbx, -8*8), xmm4) vfmadd231pd(xmm1, xmm7, xmm15) prefetch(0, mem(rbx, 64+256)) prefetch(0, mem(rax, 64+512)) // iteration 2 vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, -6*8), xmm1) vfmadd231pd(xmm1, xmm4, xmm12) vmovddup(mem(rbx, -7*8), xmm5) vfmadd231pd(xmm0, xmm5, xmm9) vfmadd231pd(xmm1, xmm5, xmm13) vmovddup(mem(rbx, -6*8), xmm6) vfmadd231pd(xmm0, xmm6, xmm10) vfmadd231pd(xmm1, xmm6, xmm14) vmovddup(mem(rbx, -5*8), xmm7) vfmadd231pd(xmm0, xmm7, xmm11) vmovaps(mem(rax, -4*8), xmm0) vmovddup(mem(rbx, -4*8), xmm4) vfmadd231pd(xmm1, xmm7, xmm15) // iteration 3 vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, -2*8), xmm1) vfmadd231pd(xmm1, xmm4, xmm12) vmovddup(mem(rbx, -3*8), xmm5) vfmadd231pd(xmm0, xmm5, xmm9) vfmadd231pd(xmm1, xmm5, xmm13) vmovddup(mem(rbx, -2*8), xmm6) vfmadd231pd(xmm0, xmm6, xmm10) vfmadd231pd(xmm1, xmm6, xmm14) vmovddup(mem(rbx, -1*8), xmm7) vfmadd231pd(xmm0, xmm7, xmm11) vmovaps(mem(rax, 0*8), xmm0) vmovddup(mem(rbx, 0*8), xmm4) vfmadd231pd(xmm1, xmm7, xmm15) prefetch(0, mem(rbx, 128+256)) 
prefetch(0, mem(rax, 128+512)) // iteration 4 vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, 2*8), xmm1) vfmadd231pd(xmm1, xmm4, xmm12) vmovddup(mem(rbx, 1*8), xmm5) vfmadd231pd(xmm0, xmm5, xmm9) vfmadd231pd(xmm1, xmm5, xmm13) vmovddup(mem(rbx, 2*8), xmm6) vfmadd231pd(xmm0, xmm6, xmm10) vfmadd231pd(xmm1, xmm6, xmm14) vmovddup(mem(rbx, 3*8), xmm7) vfmadd231pd(xmm0, xmm7, xmm11) vmovaps(mem(rax, 4*8), xmm0) vmovddup(mem(rbx, 4*8), xmm4) vfmadd231pd(xmm1, xmm7, xmm15) // iteration 5 vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, 6*8), xmm1) vfmadd231pd(xmm1, xmm4, xmm12) vmovddup(mem(rbx, 5*8), xmm5) vfmadd231pd(xmm0, xmm5, xmm9) vfmadd231pd(xmm1, xmm5, xmm13) vmovddup(mem(rbx, 6*8), xmm6) vfmadd231pd(xmm0, xmm6, xmm10) vfmadd231pd(xmm1, xmm6, xmm14) vmovddup(mem(rbx, 7*8), xmm7) vfmadd231pd(xmm0, xmm7, xmm11) vmovaps(mem(rax, 8*8), xmm0) vmovddup(mem(rbx, 8*8), xmm4) vfmadd231pd(xmm1, xmm7, xmm15) prefetch(0, mem(rbx, 128+256)) prefetch(0, mem(rax, 128+512)) // iteration 6 vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, 10*8), xmm1) vfmadd231pd(xmm1, xmm4, xmm12) vmovddup(mem(rbx, 9*8), xmm5) vfmadd231pd(xmm0, xmm5, xmm9) vfmadd231pd(xmm1, xmm5, xmm13) vmovddup(mem(rbx, 10*8), xmm6) vfmadd231pd(xmm0, xmm6, xmm10) vfmadd231pd(xmm1, xmm6, xmm14) vmovddup(mem(rbx, 11*8), xmm7) vfmadd231pd(xmm0, xmm7, xmm11) vmovaps(mem(rax, 12*8), xmm0) vmovddup(mem(rbx, 12*8), xmm4) vfmadd231pd(xmm1, xmm7, xmm15) // iteration 7 vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, 14*8), xmm1) add(imm(8*2*16), rax) // a += 8*2 (unroll x mr) vfmadd231pd(xmm1, xmm4, xmm12) vmovddup(mem(rbx, 13*8), xmm5) vfmadd231pd(xmm0, xmm5, xmm9) vfmadd231pd(xmm1, xmm5, xmm13) vmovddup(mem(rbx, 14*8), xmm6) vfmadd231pd(xmm0, xmm6, xmm10) vfmadd231pd(xmm1, xmm6, xmm14) vmovddup(mem(rbx, 15*8), xmm7) add(imm(8*2*16), rbx) // b += 8*2 (unroll x nr) vfmadd231pd(xmm0, xmm7, xmm11) vfmadd231pd(xmm1, xmm7, xmm15) dec(rsi) // i -= 1; jmp(.ZLOOPKITER) // jump to beginning of loop. 
label(.ZCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.ZPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.ZLOOPKLEFT) // EDGE LOOP je(.ZPOSTACCUM) // if i == 0, we're done. prefetch(0, mem(rbx, 256)) prefetch(0, mem(rax, 512)) // iteration 0 vmovaps(mem(rax, -16*8), xmm0) vmovddup(mem(rbx, -16*8), xmm4) vfmadd231pd(xmm0, xmm4, xmm8) vmovaps(mem(rax, -14*8), xmm1) vfmadd231pd(xmm1, xmm4, xmm12) vmovddup(mem(rbx, -15*8), xmm5) vfmadd231pd(xmm0, xmm5, xmm9) vfmadd231pd(xmm1, xmm5, xmm13) vmovddup(mem(rbx, -14*8), xmm6) vfmadd231pd(xmm0, xmm6, xmm10) vfmadd231pd(xmm1, xmm6, xmm14) vmovddup(mem(rbx, -13*8), xmm7) vfmadd231pd(xmm0, xmm7, xmm11) vfmadd231pd(xmm1, xmm7, xmm15) add(imm(1*2*16), rax) // a += 1*2 (1 x mr) add(imm(1*2*16), rbx) // b += 1*2 (1 x nr) dec(rsi) // i -= 1; jmp(.ZLOOPKLEFT) // jump to beginning of loop. label(.ZPOSTACCUM) prefetchw0(mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetchw0(mem(r10, 0*8)) // prefetch c + 1*cs_c vpermilpd(imm(0x1), xmm9, xmm9) vpermilpd(imm(0x1), xmm11, xmm11) vpermilpd(imm(0x1), xmm13, xmm13) vpermilpd(imm(0x1), xmm15, xmm15) vaddsubpd(xmm9, xmm8, xmm8) vaddsubpd(xmm11, xmm10, xmm10) vaddsubpd(xmm13, xmm12, xmm12) vaddsubpd(xmm15, xmm14, xmm14) // xmm8: xmm10: // ( ab00 ( ab01 // ab10 ) ab11 ) // xmm12: xmm14: // ( ab20 ( ab21 // ab30 ) ab31 ) prefetch(0, mem(r14)) // prefetch a_next prefetch(0, mem(r14, 64)) // prefetch a_next // scale by alpha mov(var(alpha), rax) // load address of alpha vmovddup(mem(rax), xmm0) // load alpha_r and duplicate vmovddup(mem(rax, 8), xmm1) // load alpha_i and duplicate vpermilpd(imm(0x1), xmm8, xmm9) vpermilpd(imm(0x1), xmm10, xmm11) vpermilpd(imm(0x1), xmm12, xmm13) vpermilpd(imm(0x1), xmm14, xmm15) vmulpd(xmm8, xmm0, xmm8) vmulpd(xmm10, xmm0, xmm10) vmulpd(xmm12, xmm0, xmm12) vmulpd(xmm14, xmm0, xmm14) vmulpd(xmm9, xmm1, xmm9) vmulpd(xmm11, xmm1, xmm11) vmulpd(xmm13, xmm1, xmm13) vmulpd(xmm15, 
xmm1, xmm15) vaddsubpd(xmm9, xmm8, xmm8) vaddsubpd(xmm11, xmm10, xmm10) vaddsubpd(xmm13, xmm12, xmm12) vaddsubpd(xmm15, xmm14, xmm14) mov(var(beta), rbx) // load address of beta vmovddup(mem(rbx), xmm6) // load beta_r and duplicate vmovddup(mem(rbx, 8), xmm7) // load beta_i and duplicate prefetch(0, mem(r15)) // prefetch b_next prefetch(0, mem(r15, 64)) // prefetch b_next // now avoid loading C if beta == 0 vxorpd(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomisd(xmm0, xmm6) // set ZF if beta_r == 0. sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 ); vucomisd(xmm0, xmm7) // set ZF if beta_i == 0. sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 ); and(r8b, r9b) // set ZF if r8b & r9b == 1. jne(.ZBETAZERO) // if ZF = 0, jump to beta == 0 case vmovups(mem(rcx), xmm0) // load c00 vmovups(mem(rcx, 16), xmm2) // load c10 vpermilpd(imm(0x1), xmm0, xmm1) vpermilpd(imm(0x1), xmm2, xmm3) vmulpd(xmm6, xmm0, xmm0) vmulpd(xmm7, xmm1, xmm1) vaddsubpd(xmm1, xmm0, xmm0) vaddpd(xmm8, xmm0, xmm0) vmulpd(xmm6, xmm2, xmm2) vmulpd(xmm7, xmm3, xmm3) vaddsubpd(xmm3, xmm2, xmm2) vaddpd(xmm12, xmm2, xmm2) vmovups(mem(r10), xmm0) // load c01 vmovups(mem(r10, 16), xmm2) // load c11 vpermilpd(imm(0x1), xmm0, xmm1) vpermilpd(imm(0x1), xmm2, xmm3) vmulpd(xmm6, xmm0, xmm0) vmulpd(xmm7, xmm1, xmm1) vaddsubpd(xmm1, xmm0, xmm0) vaddpd(xmm10, xmm0, xmm0) vmulpd(xmm6, xmm2, xmm2) vmulpd(xmm7, xmm3, xmm3) vaddsubpd(xmm3, xmm2, xmm2) vaddpd(xmm14, xmm2, xmm2) // fall through label(.ZBETAZERO) vmovups(xmm8, mem(rcx)) // store c00 vmovups(xmm12, mem(rcx, 16)) // store c10 vmovups(xmm10, mem(r10)) // store c01 vmovups(xmm14, mem(r10, 16)) // store c11 label(.ZDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), // 0 [k_left] "m" (k_left), // 1 [a] "m" (a), // 2 [b] "m" (b), // 3 [alpha] "m" (alpha), // 4 [beta] "m" (beta), // 5 [c] "m" (c), // 6 [rs_c] "m" (rs_c), // 7 [cs_c] "m" (cs_c), // 8 [b_next] "m" (b_next), // 9 [a_next] "m" (a_next) // 10 : // register clobber list "rax", "rbx", "rcx", 
"rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) GEMM_UKR_FLUSH_CT( z ); } cython-blis-0.9.1/blis/_src/kernels/piledriver/bli_kernels_piledriver.h000066400000000000000000000035661427272030600263470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_piledriver_asm_16x3 ) GEMM_UKR_PROT( double, d, gemm_piledriver_asm_8x3 ) GEMM_UKR_PROT( scomplex, c, gemm_piledriver_asm_4x2 ) GEMM_UKR_PROT( dcomplex, z, gemm_piledriver_asm_2x2 ) cython-blis-0.9.1/blis/_src/kernels/power10/000077500000000000000000000000001427272030600205765ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/power10/3/000077500000000000000000000000001427272030600207405ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/power10/3/bli_dgemm_power10_mma.c000066400000000000000000000151731427272030600252410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vector_int_macros.h" #define D_ASSEMBLE_VEC_PAIR \ __builtin_mma_assemble_pair (&colA_1, ca[1], ca[0]); \ __builtin_mma_assemble_pair (&colA_2, ca[3], ca[2]); #define D_ACCUMULATE \ __builtin_mma_xvf64gerpp (&acc0, colA_1, rb[0]); \ __builtin_mma_xvf64gerpp (&acc1, colA_1, rb[1]); \ __builtin_mma_xvf64gerpp (&acc2, colA_1, rb[2]); \ __builtin_mma_xvf64gerpp (&acc3, colA_1, rb[3]); \ __builtin_mma_xvf64gerpp (&acc4, colA_2, rb[0]); \ __builtin_mma_xvf64gerpp (&acc5, colA_2, rb[1]); \ __builtin_mma_xvf64gerpp (&acc6, colA_2, rb[2]); \ __builtin_mma_xvf64gerpp (&acc7, colA_2, rb[3]); #define D_INCREMENT \ A0+=8; \ B0+=8; #define D_AB_PRODUCT \ LOAD_VECTORS \ D_ASSEMBLE_VEC_PAIR \ D_INCREMENT \ D_ACCUMULATE void bli_dgemm_power10_mma_8x8 ( dim_t m, dim_t n, dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
// (1 is subtracted from k0 because 1 iteration of the k loop is pulled out) uint64_t k_iter = (k-1) / 4; uint64_t k_left = (k-1) % 4; uint64_t rs_c = rs_c0; GEMM_UKR_SETUP_CT( d, 8, 8, true ); double* restrict A0 = a; double* restrict B0 = b; double* restrict C0 = c; double alpha_ = *alpha, beta_ = *beta; dv4sf_t result[4]; dv4sf_t *rowC; /* 8 accumulator registers that will be used to store the result. Each accumulator register is mapped to 4 vector registers. Illustration: acc0 = [ vs0 vs1 vs3 vs4 ] These registers are used to store the result of an outer product instruction (general outer product instruction syntax: xv???ger??). */ __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* 2 vector pairs are necessary for a double precision outer product instruction. */ __vector_pair colA_1, colA_2; /* Prefetch C so that it stays in cache */ PREFETCH1 (C0, 0); PREFETCH1 (C0 + rs_c, 0); PREFETCH1 (C0 + rs_c + rs_c, 0); PREFETCH1 (C0 + rs_c + rs_c + rs_c, 0); PREFETCH1 (C0, 128); PREFETCH1 (C0 + rs_c, 128); PREFETCH1 (C0 + rs_c + rs_c, 128); PREFETCH1 (C0 + rs_c + rs_c + rs_c, 128); /* Load elements into vector registers */ vec_t *ca = (vec_t *) A0; vec_t *rb = (vec_t *) B0; /* Each accumulator represents a matrix of size 4 x ( 16 / (datatype size in bytes) ) (vector register size = 16B) Thus in the case of double, the accumulate registers represent a 4x2 matrix. However, a vector register can hold at most 2 doubles. Thus, if we performed an outer product using 2 vector register, we can only get a 2x2 matrix. Therefore, we must create a vector register pair in order to get the desired 4x2 matrix. 
*/ D_ASSEMBLE_VEC_PAIR /* Compute accumulate outer products and override accumulators with result */ __builtin_mma_xvf64ger (&acc0, colA_1, rb[0]); __builtin_mma_xvf64ger (&acc1, colA_1, rb[1]); __builtin_mma_xvf64ger (&acc2, colA_1, rb[2]); __builtin_mma_xvf64ger (&acc3, colA_1, rb[3]); __builtin_mma_xvf64ger (&acc4, colA_2, rb[0]); __builtin_mma_xvf64ger (&acc5, colA_2, rb[1]); __builtin_mma_xvf64ger (&acc6, colA_2, rb[2]); __builtin_mma_xvf64ger (&acc7, colA_2, rb[3]); /* Move A and B pointers */ D_INCREMENT // k loop (unrolled by 4) for (int k = 0; k #define COLMAJ_INDEX(row,col,ld) ((col*ld)+row) #define ROWMAJ_INDEX(row,col,ld) ((row*ld)+col) #define BLIS_INDEX(row,col,rs,cs) ((row*rs)+(col*cs)) /* * Perform * c = beta * c + alpha * a * b * where * alpha & beta are scalars * c is mr x nr in blis-format, (col-stride & row-stride) * a is mr x k in packed col-maj format (leading dim is mr) * b is k x nr in packed row-maj format (leading dim is nr) */ void bli_sgemm_power7_int_8x4 ( dim_t m, dim_t n, dim_t k, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 1 || defined(UTEST) const long MR = BLIS_DEFAULT_MR_S, NR = BLIS_DEFAULT_NR_S; const long LDA = MR, LDB = NR; long i, j, kk; float c00; for (i=0; i < m; i++) { for (j=0; j < n; j++) { c00 = c[BLIS_INDEX(i,j,rs_c,cs_c)] * *beta; for (kk=0; kk < k; kk++) c00 += *alpha * (a[COLMAJ_INDEX(i,kk,LDA)] * b[ROWMAJ_INDEX(kk,j,LDB)]); c[BLIS_INDEX(i,j,rs_c,cs_c)] = c00; } } #else //BLIS_SGEMM_UKERNEL_REF(k, alpha, a, b, beta, c, rs_c, cs_c, data); #endif } /* * Perform * c = beta * c + alpha * a * b * where * alpha & beta are scalars * c is mr x nr in blis-format, (col-stride & row-stride) * a is mr x k in packed col-maj format (leading dim is mr) * b is k x nr in packed row-maj format (leading dim is nr) */ void bli_dgemm_power7_int_8x4 ( dim_t m, dim_t n, dim_t k, double* restrict alpha, 
double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { if ( cs_c == 1 ) { // Optimized code for case where C rows are contiguous (i.e. C is row-major) vector double vzero = vec_splats( 0.0 ); vector double vc00_01 = vzero; vector double vc02_03 = vzero; vector double vc10_11 = vzero; vector double vc12_13 = vzero; vector double vc20_21 = vzero; vector double vc22_23 = vzero; vector double vc30_31 = vzero; vector double vc32_33 = vzero; vector double vc40_41 = vzero; vector double vc42_43 = vzero; vector double vc50_51 = vzero; vector double vc52_53 = vzero; vector double vc60_61 = vzero; vector double vc62_63 = vzero; vector double vc70_71 = vzero; vector double vc72_73 = vzero; unsigned long long pa = (unsigned long long)a; unsigned long long pb = (unsigned long long)b; #if 0 unsigned long long d1 = 1*sizeof(double); unsigned long long d2 = 2*sizeof(double); unsigned long long d3 = 3*sizeof(double); unsigned long long d4 = 4*sizeof(double); unsigned long long d6 = 6*sizeof(double); #else // ppc64 linux abi: r14-r31 Nonvolatile registers used for local variables register unsigned long long d1 __asm ("r21") = 1*sizeof(double); register unsigned long long d2 __asm ("r22") = 2*sizeof(double); register unsigned long long d3 __asm ("r23") = 3*sizeof(double); register unsigned long long d4 __asm ("r24") = 4*sizeof(double); register unsigned long long d5 __asm ("r25") = 5*sizeof(double); register unsigned long long d6 __asm ("r26") = 6*sizeof(double); register unsigned long long d7 __asm ("r27") = 7*sizeof(double); __asm__ volatile (";" : "=r" (d1) : "r" (d1) ); __asm__ volatile (";" : "=r" (d2) : "r" (d2) ); __asm__ volatile (";" : "=r" (d3) : "r" (d3) ); __asm__ volatile (";" : "=r" (d4) : "r" (d4) ); __asm__ volatile (";" : "=r" (d5) : "r" (d5) ); __asm__ volatile (";" : "=r" (d6) : "r" (d6) ); __asm__ volatile (";" : "=r" (d7) : "r" (d7) ); #endif int kk; for (kk=k; 
kk > 0; kk--) { vector double va00 = vec_splats( *(double *)( pa+0 ) ); vector double va10 = vec_splats( *(double *)( pa+d1 ) ); vector double va20 = vec_splats( *(double *)( pa+d2 ) ); vector double va30 = vec_splats( *(double *)( pa+d3 ) ); vector double va40 = vec_splats( *(double *)( pa+d4 ) ); vector double va50 = vec_splats( *(double *)( pa+d5 ) ); vector double va60 = vec_splats( *(double *)( pa+d6 ) ); vector double va70 = vec_splats( *(double *)( pa+d7 ) ); pa += 8*sizeof(double); vector double vb00_01 = *(vector double *)( pb+0 ); vector double vb02_03 = *(vector double *)( pb+d2 ); pb += 4*sizeof(double); vc00_01 = vec_madd(va00, vb00_01, vc00_01); vc02_03 = vec_madd(va00, vb02_03, vc02_03); vc10_11 = vec_madd(va10, vb00_01, vc10_11); vc12_13 = vec_madd(va10, vb02_03, vc12_13); vc20_21 = vec_madd(va20, vb00_01, vc20_21); vc22_23 = vec_madd(va20, vb02_03, vc22_23); vc30_31 = vec_madd(va30, vb00_01, vc30_31); vc32_33 = vec_madd(va30, vb02_03, vc32_33); vc40_41 = vec_madd(va40, vb00_01, vc40_41); vc42_43 = vec_madd(va40, vb02_03, vc42_43); vc50_51 = vec_madd(va50, vb00_01, vc50_51); vc52_53 = vec_madd(va50, vb02_03, vc52_53); vc60_61 = vec_madd(va60, vb00_01, vc60_61); vc62_63 = vec_madd(va60, vb02_03, vc62_63); vc70_71 = vec_madd(va70, vb00_01, vc70_71); vc72_73 = vec_madd(va70, vb02_03, vc72_73); } vector double valpha = vec_splats( *alpha ); vector double vbeta = (vector double) { *beta, *beta }; vector double *pc = (vector double *)c; vc00_01 = vec_mul(valpha, vc00_01); vc02_03 = vec_mul(valpha, vc02_03); pc[0] = vec_madd( pc[0], vbeta, vc00_01); pc[1] = vec_madd( pc[1], vbeta, vc02_03); pc += rs_c/2; vc10_11 = vec_mul(valpha, vc10_11); vc12_13 = vec_mul(valpha, vc12_13); pc[0] = vec_madd( pc[0], vbeta, vc10_11); pc[1] = vec_madd( pc[1], vbeta, vc12_13); pc += rs_c/2; vc20_21 = vec_mul(valpha, vc20_21); vc22_23 = vec_mul(valpha, vc22_23); pc[0] = vec_madd( pc[0], vbeta, vc20_21); pc[1] = vec_madd( pc[1], vbeta, vc22_23); pc += rs_c/2; vc30_31 = 
vec_mul(valpha, vc30_31); vc32_33 = vec_mul(valpha, vc32_33); pc[0] = vec_madd( pc[0], vbeta, vc30_31); pc[1] = vec_madd( pc[1], vbeta, vc32_33); pc += rs_c/2; vc40_41 = vec_mul(valpha, vc40_41); vc42_43 = vec_mul(valpha, vc42_43); pc[0] = vec_madd( pc[0], vbeta, vc40_41); pc[1] = vec_madd( pc[1], vbeta, vc42_43); pc += rs_c/2; vc50_51 = vec_mul(valpha, vc50_51); vc52_53 = vec_mul(valpha, vc52_53); pc[0] = vec_madd( pc[0], vbeta, vc50_51); pc[1] = vec_madd( pc[1], vbeta, vc52_53); pc += rs_c/2; vc60_61 = vec_mul(valpha, vc60_61); vc62_63 = vec_mul(valpha, vc62_63); pc[0] = vec_madd( pc[0], vbeta, vc60_61); pc[1] = vec_madd( pc[1], vbeta, vc62_63); pc += rs_c/2; vc70_71 = vec_mul(valpha, vc70_71); vc72_73 = vec_mul(valpha, vc72_73); pc[0] = vec_madd( pc[0], vbeta, vc70_71); pc[1] = vec_madd( pc[1], vbeta, vc72_73); pc += rs_c/2; } else { GEMM_UKR_SETUP_CT( d, 8, 4, false ); // Optimized code for case where C columns are contiguous (column-major C) vector double vzero = vec_splats( 0.0 ); vector double vc00_10 = vzero; vector double vc20_30 = vzero; vector double vc40_50 = vzero; vector double vc60_70 = vzero; vector double vc01_11 = vzero; vector double vc21_31 = vzero; vector double vc41_51 = vzero; vector double vc61_71 = vzero; vector double vc02_12 = vzero; vector double vc22_32 = vzero; vector double vc42_52 = vzero; vector double vc62_72 = vzero; vector double vc03_13 = vzero; vector double vc23_33 = vzero; vector double vc43_53 = vzero; vector double vc63_73 = vzero; unsigned long long pa = (unsigned long long)a; unsigned long long pb = (unsigned long long)b; #if 0 unsigned long long d1 = 1*sizeof(double); unsigned long long d2 = 2*sizeof(double); unsigned long long d3 = 3*sizeof(double); unsigned long long d4 = 4*sizeof(double); unsigned long long d6 = 6*sizeof(double); #else // ppc64 linux abi: r14-r31 Nonvolatile registers used for local variables register unsigned long long d1 __asm ("r21") = 1*sizeof(double); register unsigned long long d2 __asm ("r22") 
= 2*sizeof(double); register unsigned long long d3 __asm ("r23") = 3*sizeof(double); register unsigned long long d4 __asm ("r24") = 4*sizeof(double); register unsigned long long d6 __asm ("r26") = 6*sizeof(double); __asm__ volatile (";" : "=r" (d1) : "r" (d1) ); __asm__ volatile (";" : "=r" (d2) : "r" (d2) ); __asm__ volatile (";" : "=r" (d3) : "r" (d3) ); __asm__ volatile (";" : "=r" (d4) : "r" (d4) ); __asm__ volatile (";" : "=r" (d6) : "r" (d6) ); #endif int kk; for (kk=k; kk > 1; kk-=2) { vector double va00_10 = *(vector double *)( pa+0 ); vector double va20_30 = *(vector double *)( pa+d2 ); vector double va40_50 = *(vector double *)( pa+d4 ); vector double va60_70 = *(vector double *)( pa+d6 ); pa += 8*sizeof(double); vector double vb00 = vec_splats( *(double *)( pb+0 ) ); vector double vb01 = vec_splats( *(double *)( pb+d1 ) ); vector double vb02 = vec_splats( *(double *)( pb+d2 ) ); vector double vb03 = vec_splats( *(double *)( pb+d3 ) ); pb += 4*sizeof(double); vc00_10 = vec_madd(va00_10, vb00, vc00_10); vc20_30 = vec_madd(va20_30, vb00, vc20_30); vc40_50 = vec_madd(va40_50, vb00, vc40_50); vc60_70 = vec_madd(va60_70, vb00, vc60_70); vc01_11 = vec_madd(va00_10, vb01, vc01_11); vc21_31 = vec_madd(va20_30, vb01, vc21_31); vc41_51 = vec_madd(va40_50, vb01, vc41_51); vc61_71 = vec_madd(va60_70, vb01, vc61_71); vc02_12 = vec_madd(va00_10, vb02, vc02_12); vc22_32 = vec_madd(va20_30, vb02, vc22_32); vc42_52 = vec_madd(va40_50, vb02, vc42_52); vc62_72 = vec_madd(va60_70, vb02, vc62_72); vc03_13 = vec_madd(va00_10, vb03, vc03_13); vc23_33 = vec_madd(va20_30, vb03, vc23_33); vc43_53 = vec_madd(va40_50, vb03, vc43_53); vc63_73 = vec_madd(va60_70, vb03, vc63_73); va00_10 = *(vector double *)( pa+0 ); va20_30 = *(vector double *)( pa+d2 ); va40_50 = *(vector double *)( pa+d4 ); va60_70 = *(vector double *)( pa+d6 ); pa += 8*sizeof(double); vb00 = vec_splats( *(double *)( pb+0 ) ); vb01 = vec_splats( *(double *)( pb+d1 ) ); vb02 = vec_splats( *(double *)( pb+d2 ) ); vb03 
= vec_splats( *(double *)( pb+d3 ) ); pb += 4*sizeof(double); vc00_10 = vec_madd(va00_10, vb00, vc00_10); vc20_30 = vec_madd(va20_30, vb00, vc20_30); vc40_50 = vec_madd(va40_50, vb00, vc40_50); vc60_70 = vec_madd(va60_70, vb00, vc60_70); vc01_11 = vec_madd(va00_10, vb01, vc01_11); vc21_31 = vec_madd(va20_30, vb01, vc21_31); vc41_51 = vec_madd(va40_50, vb01, vc41_51); vc61_71 = vec_madd(va60_70, vb01, vc61_71); vc02_12 = vec_madd(va00_10, vb02, vc02_12); vc22_32 = vec_madd(va20_30, vb02, vc22_32); vc42_52 = vec_madd(va40_50, vb02, vc42_52); vc62_72 = vec_madd(va60_70, vb02, vc62_72); vc03_13 = vec_madd(va00_10, vb03, vc03_13); vc23_33 = vec_madd(va20_30, vb03, vc23_33); vc43_53 = vec_madd(va40_50, vb03, vc43_53); vc63_73 = vec_madd(va60_70, vb03, vc63_73); } for (kk=kk; kk > 0; kk--) { vector double va00_10 = *(vector double *)( pa+0 ); vector double va20_30 = *(vector double *)( pa+d2 ); vector double va40_50 = *(vector double *)( pa+d4 ); vector double va60_70 = *(vector double *)( pa+d6 ); pa += 8*sizeof(double); vector double vb00 = vec_splats( *(double *)( pb+0 ) ); vector double vb01 = vec_splats( *(double *)( pb+d1 ) ); vector double vb02 = vec_splats( *(double *)( pb+d2 ) ); vector double vb03 = vec_splats( *(double *)( pb+d3 ) ); pb += 4*sizeof(double); vc00_10 = vec_madd(va00_10, vb00, vc00_10); vc20_30 = vec_madd(va20_30, vb00, vc20_30); vc40_50 = vec_madd(va40_50, vb00, vc40_50); vc60_70 = vec_madd(va60_70, vb00, vc60_70); vc01_11 = vec_madd(va00_10, vb01, vc01_11); vc21_31 = vec_madd(va20_30, vb01, vc21_31); vc41_51 = vec_madd(va40_50, vb01, vc41_51); vc61_71 = vec_madd(va60_70, vb01, vc61_71); vc02_12 = vec_madd(va00_10, vb02, vc02_12); vc22_32 = vec_madd(va20_30, vb02, vc22_32); vc42_52 = vec_madd(va40_50, vb02, vc42_52); vc62_72 = vec_madd(va60_70, vb02, vc62_72); vc03_13 = vec_madd(va00_10, vb03, vc03_13); vc23_33 = vec_madd(va20_30, vb03, vc23_33); vc43_53 = vec_madd(va40_50, vb03, vc43_53); vc63_73 = vec_madd(va60_70, vb03, vc63_73); } // The 
following code is dependent on rs_c == 1 vector double valpha = vec_splats( *alpha ); vector double vbeta = (vector double) { *beta, *beta }; vector double *pc = (vector double *)c; vc00_10 = vec_mul(valpha, vc00_10); vc20_30 = vec_mul(valpha, vc20_30); vc40_50 = vec_mul(valpha, vc40_50); vc60_70 = vec_mul(valpha, vc60_70); pc[0] = vec_madd( pc[0], vbeta, vc00_10); pc[1] = vec_madd( pc[1], vbeta, vc20_30); pc[2] = vec_madd( pc[2], vbeta, vc40_50); pc[3] = vec_madd( pc[3], vbeta, vc60_70); pc += cs_c/2; vc01_11 = vec_mul(valpha, vc01_11); vc21_31 = vec_mul(valpha, vc21_31); vc41_51 = vec_mul(valpha, vc41_51); vc61_71 = vec_mul(valpha, vc61_71); pc[0] = vec_madd( pc[0], vbeta, vc01_11); pc[1] = vec_madd( pc[1], vbeta, vc21_31); pc[2] = vec_madd( pc[2], vbeta, vc41_51); pc[3] = vec_madd( pc[3], vbeta, vc61_71); pc += cs_c/2; vc02_12 = vec_mul(valpha, vc02_12); vc22_32 = vec_mul(valpha, vc22_32); vc42_52 = vec_mul(valpha, vc42_52); vc62_72 = vec_mul(valpha, vc62_72); pc[0] = vec_madd( pc[0], vbeta, vc02_12); pc[1] = vec_madd( pc[1], vbeta, vc22_32); pc[2] = vec_madd( pc[2], vbeta, vc42_52); pc[3] = vec_madd( pc[3], vbeta, vc62_72); pc += cs_c/2; vc03_13 = vec_mul(valpha, vc03_13); vc23_33 = vec_mul(valpha, vc23_33); vc43_53 = vec_mul(valpha, vc43_53); vc63_73 = vec_mul(valpha, vc63_73); pc[0] = vec_madd( pc[0], vbeta, vc03_13); pc[1] = vec_madd( pc[1], vbeta, vc23_33); pc[2] = vec_madd( pc[2], vbeta, vc43_53); pc[3] = vec_madd( pc[3], vbeta, vc63_73); GEMM_UKR_FLUSH_CT( d ); } } /* * Perform * c = beta * c + alpha * a * b * where * alpha & beta are scalars * c is mr x nr in blis-format, (col-stride & row-stride) * a is mr x k in packed col-maj format (leading dim is mr) * b is k x nr in packed row-maj format (leading dim is nr) */ void bli_cgemm_power7_int_8x4 ( dim_t m, dim_t n, dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict 
cntx ) { #if 1 || defined(UTEST) const long MR = BLIS_DEFAULT_MR_C, NR = BLIS_DEFAULT_NR_C; const long LDA = MR, LDB = NR; int i, j, kk; scomplex c00; for (i=0; i < m; i++) { for (j=0; j < n; j++) { scomplex tmpc, tmpa, tmpb, tmp; //c00 = c[BLIS_INDEX(i,j,rs_c,cs_c)] * *beta; tmpc = c[BLIS_INDEX(i,j,rs_c,cs_c)]; c00.real = tmpc.real * (*beta).real - tmpc.imag * (*beta).imag; c00.imag = tmpc.real * (*beta).imag + tmpc.imag * (*beta).real; for (kk=0; kk < k; kk++) { //c00 += *alpha * (a[COLMAJ_INDEX(i,kk,LDA)] * b[ROWMAJ_INDEX(kk,j,LDB)]); tmpa = a[COLMAJ_INDEX(i,kk,LDA)]; tmpb = b[ROWMAJ_INDEX(kk,j,LDB)]; tmp.real = tmpa.real * tmpb.real - tmpa.imag * tmpb.imag; tmp.imag = tmpa.real * tmpb.imag + tmpa.imag * tmpb.real; c00.real += (*alpha).real * tmp.real - (*alpha).imag * tmp.imag; c00.imag += (*alpha).real * tmp.imag + (*alpha).imag * tmp.real; } c[BLIS_INDEX(i,j,rs_c,cs_c)] = c00; } } #else //BLIS_CGEMM_UKERNEL_REF(k, alpha, a, b, beta, c, rs_c, cs_c, data); #endif } /* * Perform * c = beta * c + alpha * a * b * where * alpha & beta are scalars * c is mr x nr in blis-format, (col-stride & row-stride) * a is mr x k in packed col-maj format (leading dim is mr) * b is k x nr in packed row-maj format (leading dim is nr) */ void bli_zgemm_power7_int_8x4 ( dim_t m, dim_t n, dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { #if 1 || defined(UTEST) const long MR = BLIS_DEFAULT_MR_Z, NR = BLIS_DEFAULT_NR_Z; const long LDA = MR, LDB = NR; int i, j, kk; dcomplex c00; for (i=0; i < m; i++) { for (j=0; j < n; j++) { dcomplex tmpc, tmpa, tmpb, tmp; //c00 = c[BLIS_INDEX(i,j,rs_c,cs_c)] * *beta; tmpc = c[BLIS_INDEX(i,j,rs_c,cs_c)]; c00.real = tmpc.real * (*beta).real - tmpc.imag * (*beta).imag; c00.imag = tmpc.real * (*beta).imag + tmpc.imag * (*beta).real; for (kk=0; kk < k; kk++) { //c00 += *alpha * (a[COLMAJ_INDEX(i,kk,LDA)] * 
b[ROWMAJ_INDEX(kk,j,LDB)]); tmpa = a[COLMAJ_INDEX(i,kk,LDA)]; tmpb = b[ROWMAJ_INDEX(kk,j,LDB)]; tmp.real = tmpa.real * tmpb.real - tmpa.imag * tmpb.imag; tmp.imag = tmpa.real * tmpb.imag + tmpa.imag * tmpb.real; c00.real += (*alpha).real * tmp.real - (*alpha).imag * tmp.imag; c00.imag += (*alpha).real * tmp.imag + (*alpha).imag * tmp.real; } c[BLIS_INDEX(i,j,rs_c,cs_c)] = c00; } } #else //BLIS_ZGEMM_UKERNEL_REF(k, alpha, a, b, beta, c, rs_c, cs_c, data); #endif } cython-blis-0.9.1/blis/_src/kernels/power7/3/test/000077500000000000000000000000001427272030600216455ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/power7/3/test/Makefile000066400000000000000000000003061427272030600233040ustar00rootroot00000000000000 CC = gcc TARGET_ARCH = -m64 -mvsx TGTS = exp KERNEL = bli_gemm_opt_8x4.o CFLAGS = -DUTEST -std=gnu99 -ggdb3 -Wall CFLAGS += -O3 all: $(TGTS) exp: exp.o $(KERNEL) clean: rm -f $(TGTS) *.o cython-blis-0.9.1/blis/_src/kernels/power7/3/test/bli_gemm_power7_int_8x4.c000077700000000000000000000000001427272030600334512../bli_gemm_power7_int_8x4.custar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/power7/3/test/bli_gemm_power7_int_8x4.h000066400000000000000000000063541427272030600264610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _BLI_GEMM_OPT_8X4_H_ #define _BLI_GEMM_OPT_8X4_H_ #ifdef UTEST #include "blis_utest.h" #else #include "blis.h" #endif void bli_sgemm_opt_8x4 ( dim_t m, dim_t n, dim_t k, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ); void bli_dgemm_opt_8x4 ( dim_t m, dim_t n, dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ); void bli_cgemm_opt_8x4 ( dim_t m, dim_t n, dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ); void bli_zgemm_opt_8x4 ( dim_t m, dim_t n, dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* 
restrict data, cntx_t* restrict cntx ); #endif cython-blis-0.9.1/blis/_src/kernels/power7/3/test/blis_utest.h000066400000000000000000000010751427272030600241760ustar00rootroot00000000000000 #ifndef _BLIS_UTEST_H_ #define _BLIS_UTEST_H_ #define BLIS_DEFAULT_MR_S 8 #define BLIS_DEFAULT_NR_S 4 #define BLIS_DEFAULT_MR_D 8 #define BLIS_DEFAULT_NR_D 4 #define BLIS_DEFAULT_MR_C 8 #define BLIS_DEFAULT_NR_C 4 #define BLIS_DEFAULT_MR_Z 8 #define BLIS_DEFAULT_NR_Z 4 typedef unsigned long dim_t; typedef long inc_t; // Complex types typedef struct scomplex_s { float real; float imag; } scomplex; typedef struct dcomplex_s { double real; double imag; } dcomplex; #define bli_check_error_code(x) #endif cython-blis-0.9.1/blis/_src/kernels/power7/3/test/exp.c000066400000000000000000000071471427272030600226160ustar00rootroot00000000000000#ifdef UTEST #include #include #include #include /* fabs */ #include "blis_utest.h" #include "bli_gemm_power7_opt_8x4.h" #define COLMAJ_INDEX(row,col,ld) ((col*ld)+row) #define ROWMAJ_INDEX(row,col,ld) ((row*ld)+col) #define BLIS_INDEX(row,col,rs,cs) ((row*rs)+(col*cs)) #define MR BLIS_DEFAULT_MR_D #define NR BLIS_DEFAULT_NR_D #define LDA MR #define LDB NR #define EPSILON 0.0000001 /* * Perform * c = beta * c + alpha * a * b * where * alpha & beta are scalars * c is mr x nr in blis-format, (col-stride & row-stride) * a is mr x k in packed col-maj format (leading dim is mr) * b is k x nr in packed row-maj format (leading dim is nr) */ void bli_dgemm_check( dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* data ) { int i, j, kk; double c00; for (i=0; i < MR; i++) { for (j=0; j < NR; j++) { c00 = c[BLIS_INDEX(i,j,rs_c,cs_c)] * *beta; for (kk=0; kk < k; kk++) c00 += *alpha * (a[COLMAJ_INDEX(i,kk,LDA)] * b[ROWMAJ_INDEX(kk,j,LDB)]); c[BLIS_INDEX(i,j,rs_c,cs_c)] = c00; } } } int main(int argc, char *argv[]) { double *A, *B, *C, *C2; double alpha = 1.0, beta = 
1.0; long i, j; long k = 128; int iters = 10; int errors; struct timeval tv_start, tv_end; switch (argc) { case 2: k = atoi(argv[1]); case 1: break; default: printf("Usage: %s [k]\n", argv[0]); return 1; break; } //long rs_c = 1, cs_c = MR; // Column major long rs_c = NR, cs_c = 1; // Row major A = (double*)malloc(LDA * k * sizeof(double)); B = (double*)malloc(LDB * k * sizeof(double)); C = (double*)malloc(MR * NR * sizeof(double)); C2 = (double*)malloc(MR * NR * sizeof(double)); /* Initialize C matrix in blis format */ for (j=0; j EPSILON) { if (errors<20) printf(" %ld expected=%f got=%f\n", i, C2[i], C[i]); errors++; } } printf("Errors = %d\n", errors); if (errors) { return -1; } /* Now get the performance */ gettimeofday(&tv_start, NULL); for (i=0; i #include #include "blis.h" #if 0 void bli_sgemm_sandybridge_int_8x8 ( dim_t m, dim_t n, dim_t k, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { } #endif void bli_dgemm_sandybridge_int_8x4 ( dim_t m, dim_t n, dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k / 2; uint64_t k_left = k % 2; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; uint64_t i; GEMM_UKR_SETUP_CT( d, 8, 4, false ); double *c00, *c01, *c02, *c03; double *c40, *c41, *c42, *c43; // Quad registers. 
__m256d va0_3, va4_7; __m256d vA0_3, vA4_7; __m256d vb0, vb1, vb2, vb3; __m256d vb; __m256d vB0; __m256d va0_3b_0, va4_7b_0; __m256d va0_3b_1, va4_7b_1; __m256d va0_3b_2, va4_7b_2; __m256d va0_3b_3, va4_7b_3; __m256d va0_3b0, va4_7b0; __m256d va0_3b1, va4_7b1; __m256d va0_3b2, va4_7b2; __m256d va0_3b3, va4_7b3; __m256d valpha, vbeta, vtmp; __m256d vc0_3_0, vc0_3_1, vc0_3_2, vc0_3_3; __m256d vc4_7_0, vc4_7_1, vc4_7_2, vc4_7_3; __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"(a) ); __asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"(b_next) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"(c) ); va0_3b0 = _mm256_setzero_pd(); va0_3b1 = _mm256_setzero_pd(); va0_3b2 = _mm256_setzero_pd(); va0_3b3 = _mm256_setzero_pd(); va4_7b0 = _mm256_setzero_pd(); va4_7b1 = _mm256_setzero_pd(); va4_7b2 = _mm256_setzero_pd(); va4_7b3 = _mm256_setzero_pd(); va0_3b_0 = _mm256_setzero_pd(); va0_3b_1 = _mm256_setzero_pd(); va0_3b_2 = _mm256_setzero_pd(); va0_3b_3 = _mm256_setzero_pd(); va4_7b_0 = _mm256_setzero_pd(); va4_7b_1 = _mm256_setzero_pd(); va4_7b_2 = _mm256_setzero_pd(); va4_7b_3 = _mm256_setzero_pd(); // Load va0_3 va0_3 = _mm256_load_pd( a ); // Load va4_7 va4_7 = _mm256_load_pd( a + 4 ); // Load vb (b0,b1,b2,b3) vb0 = _mm256_load_pd( b ); for( i = 0; i < k_iter; ++i ) { __asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"(a) ); // Load va0_3 (Prefetch) vA0_3 = _mm256_load_pd( a + 8 ); // Iteration 0. 
vtmp = _mm256_mul_pd( va0_3, vb0 ); va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp ); vtmp = _mm256_mul_pd( va4_7, vb0 ); va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp ); // Load va4_7 (Prefetch) vA4_7 = _mm256_load_pd( a + 12 ); // Shuffle vb (b1,b0,b3,b2) vb1 = _mm256_shuffle_pd( vb0, vb0, 0x5 ); vtmp = _mm256_mul_pd( va0_3, vb1 ); va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp ); vtmp = _mm256_mul_pd( va4_7, vb1 ); va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp ); // Permute vb (b3,b2,b1,b0) vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 ); // Load vb (b0,b1,b2,b3) (Prefetch) vB0 = _mm256_load_pd( b + 4 ); vtmp = _mm256_mul_pd( va0_3, vb2 ); va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp ); vtmp = _mm256_mul_pd( va4_7, vb2 ); va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp ); // Shuffle vb (b3,b2,b1,b0) vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 ); vtmp = _mm256_mul_pd( va0_3, vb3 ); va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp ); vtmp = _mm256_mul_pd( va4_7, vb3 ); va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp ); // Iteration 1. __asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"(a) ); // Load va0_3 (Next iteration) va0_3 = _mm256_load_pd( a + 16 ); vtmp = _mm256_mul_pd( vA0_3, vB0 ); va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp ); vb1 = _mm256_shuffle_pd( vB0, vB0, 0x5 ); vtmp = _mm256_mul_pd( vA4_7, vB0 ); va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp ); vtmp = _mm256_mul_pd( vA0_3, vb1 ); va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp ); // Load va4_7 (Next iteration) va4_7 = _mm256_load_pd( a + 20 ); vb2 = _mm256_permute2f128_pd( vb1, vb1, 0x1 ); vtmp = _mm256_mul_pd( vA4_7, vb1 ); va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp ); vtmp = _mm256_mul_pd( vA0_3, vb2 ); va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp ); vb3 = _mm256_shuffle_pd( vb2, vb2, 0x5 ); vtmp = _mm256_mul_pd( vA4_7, vb2 ); va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp ); // Load vb0(Next iteration) vb0 = _mm256_load_pd( b + 8 ); vtmp = _mm256_mul_pd( vA0_3, vb3 ); va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp ); vtmp = _mm256_mul_pd( vA4_7, vb3 ); va4_7b_3 = _mm256_add_pd( 
va4_7b_3, vtmp ); a += 16; b += 8; } for( i = 0; i < k_left; ++i ) { // Iteration 0. // Load va0_3 va0_3 = _mm256_load_pd( a ); // Load va4_7 va4_7 = _mm256_load_pd( a + 4 ); // Load vb (b0,b1,b2,b3) vb = _mm256_load_pd( b ); vtmp = _mm256_mul_pd( va0_3, vb ); va0_3b_0 = _mm256_add_pd( va0_3b_0, vtmp ); vtmp = _mm256_mul_pd( va4_7, vb ); va4_7b_0 = _mm256_add_pd( va4_7b_0, vtmp ); // Shuffle vb (b1,b0,b3,b2) vb = _mm256_shuffle_pd( vb, vb, 0x5 ); vtmp = _mm256_mul_pd( va0_3, vb ); va0_3b_1 = _mm256_add_pd( va0_3b_1, vtmp ); vtmp = _mm256_mul_pd( va4_7, vb ); va4_7b_1 = _mm256_add_pd( va4_7b_1, vtmp ); // Permute vb (b3,b2,b1,b0) vb = _mm256_permute2f128_pd( vb, vb, 0x1 ); vtmp = _mm256_mul_pd( va0_3, vb ); va0_3b_2 = _mm256_add_pd( va0_3b_2, vtmp ); vtmp = _mm256_mul_pd( va4_7, vb ); va4_7b_2 = _mm256_add_pd( va4_7b_2, vtmp ); // Shuffle vb (b3,b2,b1,b0) vb = _mm256_shuffle_pd( vb, vb, 0x5 ); vtmp = _mm256_mul_pd( va0_3, vb ); va0_3b_3 = _mm256_add_pd( va0_3b_3, vtmp ); vtmp = _mm256_mul_pd( va4_7, vb ); va4_7b_3 = _mm256_add_pd( va4_7b_3, vtmp ); a += 8; b += 4; } vbeta = _mm256_broadcast_sd( beta ); __m256d vtmpa_0_3b_0 = _mm256_blend_pd( va0_3b_0, va0_3b_1, 0x6 ); __m256d vtmpa_0_3b_1 = _mm256_blend_pd( va0_3b_1, va0_3b_0, 0x6 ); __m256d vtmpa_0_3b_2 = _mm256_blend_pd( va0_3b_2, va0_3b_3, 0x6 ); __m256d vtmpa_0_3b_3 = _mm256_blend_pd( va0_3b_3, va0_3b_2, 0x6 ); __m256d vtmpa_4_7b_0 = _mm256_blend_pd( va4_7b_0, va4_7b_1, 0x6 ); __m256d vtmpa_4_7b_1 = _mm256_blend_pd( va4_7b_1, va4_7b_0, 0x6 ); __m256d vtmpa_4_7b_2 = _mm256_blend_pd( va4_7b_2, va4_7b_3, 0x6 ); __m256d vtmpa_4_7b_3 = _mm256_blend_pd( va4_7b_3, va4_7b_2, 0x6 ); valpha = _mm256_broadcast_sd( alpha ); va0_3b0 = _mm256_permute2f128_pd( vtmpa_0_3b_0, vtmpa_0_3b_2, 0x30 ); va0_3b3 = _mm256_permute2f128_pd( vtmpa_0_3b_2, vtmpa_0_3b_0, 0x30 ); va0_3b1 = _mm256_permute2f128_pd( vtmpa_0_3b_1, vtmpa_0_3b_3, 0x30 ); va0_3b2 = _mm256_permute2f128_pd( vtmpa_0_3b_3, vtmpa_0_3b_1, 0x30 ); va4_7b0 = 
_mm256_permute2f128_pd( vtmpa_4_7b_0, vtmpa_4_7b_2, 0x30 ); va4_7b3 = _mm256_permute2f128_pd( vtmpa_4_7b_2, vtmpa_4_7b_0, 0x30 ); va4_7b1 = _mm256_permute2f128_pd( vtmpa_4_7b_1, vtmpa_4_7b_3, 0x30 ); va4_7b2 = _mm256_permute2f128_pd( vtmpa_4_7b_3, vtmpa_4_7b_1, 0x30 ); __m128d vzero = _mm_setzero_pd( ); if( _mm_comieq_sd( _mm256_castpd256_pd128(vbeta), vzero ) ) { // Calculate address c00 = ( c + 0 + 0*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b0); // Store back to memory _mm256_store_pd( c00, vtmp ); // Calculate address c40 = ( c + 4 + 0*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b0); // Store back to memory _mm256_store_pd( c40, vtmp ); // Calculate address c01 = ( c + 0 + 1*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b1); // Store back to memory _mm256_store_pd( c01, vtmp ); // Calculate address c41 = ( c + 4 + 1*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b1); // Store back to memory _mm256_store_pd( c41, vtmp ); // Calculate address c02 = ( c + 0 + 2*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b2); // Store back to memory _mm256_store_pd( c02, vtmp ); // Calculate address c42 = ( c + 4 + 2*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b2); // Store back to memory _mm256_store_pd( c42, vtmp ); // Calculate address c03 = ( c + 0 + 3*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b3); // Store back to memory _mm256_store_pd( c03, vtmp ); // Calculate address c43 = ( c + 4 + 3*cs_c ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b3); // Store back to memory _mm256_store_pd( c43, vtmp ); } else { // Calculate address c00 = ( c + 0 + 0*cs_c ); // Load //vc0_3_0 = _mm256_load_pd( c + 0 + 0*cs_c ); vc0_3_0 = _mm256_load_pd( c00 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b0); // Scale by beta vc0_3_0 = _mm256_mul_pd( vbeta, vc0_3_0 ); // Add gemm result vc0_3_0 = _mm256_add_pd( vc0_3_0, vtmp ); // Store back to memory 
_mm256_store_pd( c00, vc0_3_0 ); // Calculate address c40 = ( c + 4 + 0*cs_c ); // Load //vc4_7_0 = _mm256_load_pd( c + 4 + 0*cs_c ); vc4_7_0 = _mm256_load_pd( c40 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b0); // Scale by beta vc4_7_0 = _mm256_mul_pd( vbeta, vc4_7_0 ); // Add gemm result vc4_7_0 = _mm256_add_pd( vc4_7_0, vtmp ); // Store back to memory _mm256_store_pd( c40, vc4_7_0 ); // Calculate address c01 = ( c + 0 + 1*cs_c ); // Load //vc0_3_1 = _mm256_load_pd( c + 0 + 1*cs_c ); vc0_3_1 = _mm256_load_pd( c01 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b1); // Scale by beta vc0_3_1 = _mm256_mul_pd( vbeta, vc0_3_1 ); // Add gemm result vc0_3_1 = _mm256_add_pd( vc0_3_1, vtmp ); // Store back to memory _mm256_store_pd( c01, vc0_3_1 ); // Calculate address c41 = ( c + 4 + 1*cs_c ); // Load //vc4_7_1 = _mm256_load_pd( c + 4 + 1*cs_c ); vc4_7_1 = _mm256_load_pd( c41 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b1); // Scale by beta vc4_7_1 = _mm256_mul_pd( vbeta, vc4_7_1 ); // Add gemm result vc4_7_1 = _mm256_add_pd( vc4_7_1, vtmp ); // Store back to memory _mm256_store_pd( c41, vc4_7_1 ); // Calculate address c02 = ( c + 0 + 2*cs_c ); // Load //vc0_3_2 = _mm256_load_pd( c + 0 + 2*cs_c ); vc0_3_2 = _mm256_load_pd( c02 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b2); // Scale by beta vc0_3_2 = _mm256_mul_pd( vbeta, vc0_3_2 ); // Add gemm result vc0_3_2 = _mm256_add_pd( vc0_3_2, vtmp ); // Store back to memory _mm256_store_pd( c02, vc0_3_2 ); // Calculate address c42 = ( c + 4 + 2*cs_c ); // Load //vc4_7_2 = _mm256_load_pd( c + 4 + 2*cs_c ); vc4_7_2 = _mm256_load_pd( c42 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b2); // Scale by beta vc4_7_2 = _mm256_mul_pd( vbeta, vc4_7_2 ); // Add gemm result vc4_7_2 = _mm256_add_pd( vc4_7_2, vtmp ); // Store back to memory _mm256_store_pd( c42, vc4_7_2 ); // Calculate address c03 = ( c + 0 + 3*cs_c ); // Load //vc0_3_3 = _mm256_load_pd( c + 0 + 3*cs_c ); vc0_3_3 = 
_mm256_load_pd( c03 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va0_3b3); // Scale by beta vc0_3_3 = _mm256_mul_pd( vbeta, vc0_3_3 ); // Add gemm result vc0_3_3 = _mm256_add_pd( vc0_3_3, vtmp ); // Store back to memory _mm256_store_pd( c03, vc0_3_3 ); // Calculate address c43 = ( c + 4 + 3*cs_c ); // Load //vc4_7_3 = _mm256_load_pd( c + 4 + 3*cs_c ); vc4_7_3 = _mm256_load_pd( c43 ); // Scale by alpha vtmp = _mm256_mul_pd( valpha, va4_7b3); // Scale by beta vc4_7_3 = _mm256_mul_pd( vbeta, vc4_7_3 ); // Add gemm result vc4_7_3 = _mm256_add_pd( vc4_7_3, vtmp ); // Store back to memory _mm256_store_pd( c43, vc4_7_3 ); } GEMM_UKR_FLUSH_CT( d ); } #if 0 void bli_cgemm_sandybridge_int_8x4 ( dim_t m, dim_t n, dim_t k, scomplex* restrict alpha, scomplex* restrict a, scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { } #endif #if 0 void bli_zgemm_sandybridge_int_4x4 ( dim_t m, dim_t n, dim_t k, dcomplex* restrict alpha, dcomplex* restrict a, dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { } #endif cython-blis-0.9.1/blis/_src/kernels/sandybridge/bli_kernels_sandybridge.h000066400000000000000000000041531427272030600266140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // d8x4 (assembly) GEMM_UKR_PROT( float, s, gemm_sandybridge_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_asm_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_asm_4x4 ) // d8x4 (intrinsics) GEMM_UKR_PROT( float, s, gemm_sandybridge_int_8x8 ) GEMM_UKR_PROT( double, d, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( scomplex, c, gemm_sandybridge_int_8x4 ) GEMM_UKR_PROT( dcomplex, z, gemm_sandybridge_int_4x4 ) cython-blis-0.9.1/blis/_src/kernels/skx/000077500000000000000000000000001427272030600201065ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/skx/3/000077500000000000000000000000001427272030600202505ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c000066400000000000000000000364441427272030600253310ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_INTEL #include "bli_x86_asm_macros.h" #define A_L1_PREFETCH_DIST 4 //should be multiple of 2 /*The pointer of B is moved ahead by one iteration of k before the loop starts.Therefore, prefetching 3 k iterations ahead*/ #define B_L1_PREFETCH_DIST 4 #define TAIL_NITER 8 #define CACHELINE_SIZE 64 //size of cache line in bytes /* During each subiteration, prefetching 2 cache lines of B * UNROLL factor ahead. 2cache lines = 16 doubles (NR). 
* */ #define PREFETCH_A_L1(n, k) \ PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*16*8 + (2*n+k) * CACHELINE_SIZE)) /* Preloading B for the first iteration of the main loop. * for subiter(1), subiter(2), and subiter(3) */ #define PREFETCH_B_L1_1ITER \ PREFETCH(0, MEM(RBX )) \ PREFETCH(0, MEM(RBX, CACHELINE_SIZE)) \ PREFETCH(0, MEM(RBX, 2*CACHELINE_SIZE)) \ PREFETCH(0, MEM(RBX, 3*CACHELINE_SIZE)) \ PREFETCH(0, MEM(RBX, 4*CACHELINE_SIZE)) \ PREFETCH(0, MEM(RBX, 5*CACHELINE_SIZE)) #define LOOP_ALIGN ALIGN16 #define UPDATE_C(R1,R2,R3,R4) \ \ VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ VFMADD231PD(ZMM(R1), ZMM(1), MEM(RCX,0*64)) \ VFMADD231PD(ZMM(R2), ZMM(1), MEM(RCX,1*64)) \ VFMADD231PD(ZMM(R3), ZMM(1), MEM(RCX,RAX,1,0*64)) \ VFMADD231PD(ZMM(R4), ZMM(1), MEM(RCX,RAX,1,1*64)) \ VMOVUPD(MEM(RCX,0*64), ZMM(R1)) \ VMOVUPD(MEM(RCX,1*64), ZMM(R2)) \ VMOVUPD(MEM(RCX,RAX,1,0*64), ZMM(R3)) \ VMOVUPD(MEM(RCX,RAX,1,1*64), ZMM(R4)) \ LEA(RCX, MEM(RCX,RAX,2)) #define UPDATE_C_BZ(R1,R2,R3,R4) \ \ VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ VMOVUPD(MEM(RCX,0*64), ZMM(R1)) \ VMOVUPD(MEM(RCX,1*64), ZMM(R2)) \ VMOVUPD(MEM(RCX,RAX,1,0*64), ZMM(R3)) \ VMOVUPD(MEM(RCX,RAX,1,1*64), ZMM(R4)) \ LEA(RCX, MEM(RCX,RAX,2)) #define UPDATE_C_ROW_SCATTERED(R1,R2,R3,R4) \ \ KXNORW(K(1), K(0), K(0)) \ KXNORW(K(2), K(0), K(0)) \ VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(2),1)) \ VFMADD231PD(ZMM(R1), ZMM(6), ZMM(1)) \ VSCATTERQPD(MEM(RCX,ZMM(2),1) MASK_K(2), ZMM(R1)) \ \ KXNORW(K(1), K(0), K(0)) \ KXNORW(K(2), K(0), K(0)) \ VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(3),1)) \ VFMADD231PD(ZMM(R2), ZMM(6), ZMM(1)) \ VSCATTERQPD(MEM(RCX,ZMM(3),1) MASK_K(2), ZMM(R2)) \ \ LEA(RCX, MEM(RCX,RAX,1)) \ \ KXNORW(K(1), K(0), K(0)) \ KXNORW(K(2), K(0), K(0)) \ 
VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(2),1)) \ VFMADD231PD(ZMM(R3), ZMM(6), ZMM(1)) \ VSCATTERQPD(MEM(RCX,ZMM(2),1) MASK_K(2), ZMM(R3)) \ \ KXNORW(K(1), K(0), K(0)) \ KXNORW(K(2), K(0), K(0)) \ VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ VGATHERQPD(ZMM(6) MASK_K(1), MEM(RCX,ZMM(3),1)) \ VFMADD231PD(ZMM(R4), ZMM(6), ZMM(1)) \ VSCATTERQPD(MEM(RCX,ZMM(3),1) MASK_K(2), ZMM(R4)) \ \ LEA(RCX, MEM(RCX,RAX,1)) #define UPDATE_C_BZ_ROW_SCATTERED(R1,R2,R3,R4) \ \ KXNORW(K(1), K(0), K(0)) \ VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ VSCATTERQPD(MEM(RCX,ZMM(2),1) MASK_K(1), ZMM(R1)) \ \ KXNORW(K(1), K(0), K(0)) \ VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ VSCATTERQPD(MEM(RCX,ZMM(3),1) MASK_K(1), ZMM(R2)) \ \ LEA(RCX, MEM(RCX,RAX,1)) \ \ KXNORW(K(1), K(0), K(0)) \ VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \ VSCATTERQPD(MEM(RCX,ZMM(2),1) MASK_K(1), ZMM(R3)) \ \ KXNORW(K(1), K(0), K(0)) \ VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \ VSCATTERQPD(MEM(RCX,ZMM(3),1) MASK_K(1), ZMM(R4)) \ \ LEA(RCX, MEM(RCX,RAX,1)) #ifdef PREFETCH_C_L2 #undef PREFETCH_C_L2 #define PREFETCH_C_L2 \ \ PREFETCH(1, MEM(RCX, 0*64)) \ PREFETCH(1, MEM(RCX, 1*64)) \ \ PREFETCH(1, MEM(RCX,R12,1,0*64)) \ PREFETCH(1, MEM(RCX,R12,1,1*64)) \ \ PREFETCH(1, MEM(RCX,R12,2,0*64)) \ PREFETCH(1, MEM(RCX,R12,2,1*64)) \ \ PREFETCH(1, MEM(RCX,R13,1,0*64)) \ PREFETCH(1, MEM(RCX,R13,1,1*64)) \ \ PREFETCH(1, MEM(RCX,R12,4,0*64)) \ PREFETCH(1, MEM(RCX,R12,4,1*64)) \ \ PREFETCH(1, MEM(RCX,R14,1,0*64)) \ PREFETCH(1, MEM(RCX,R14,1,1*64)) \ \ PREFETCH(1, MEM(RCX,R13,2,0*64)) \ PREFETCH(1, MEM(RCX,R13,2,1*64)) \ \ PREFETCH(1, MEM(RCX,R15,1,0*64)) \ PREFETCH(1, MEM(RCX,R15,1,1*64)) \ \ PREFETCH(1, MEM(RDX, 0*64)) \ PREFETCH(1, MEM(RDX, 1*64)) \ \ PREFETCH(1, MEM(RDX,R12,1,0*64)) \ PREFETCH(1, MEM(RDX,R12,1,1*64)) \ \ PREFETCH(1, MEM(RDX,R12,2,0*64)) \ PREFETCH(1, MEM(RDX,R12,2,1*64)) \ \ PREFETCH(1, MEM(RDX,R13,1,0*64)) \ PREFETCH(1, MEM(RDX,R13,1,1*64)) #else #undef PREFETCH_C_L2 #define PREFETCH_C_L2 #endif #define PREFETCH_C_L1 \ \ 
PREFETCHW0(MEM(RCX, 0*64)) \ PREFETCHW0(MEM(RCX, 1*64)) \ PREFETCHW0(MEM(RCX,R12,1,0*64)) \ PREFETCHW0(MEM(RCX,R12,1,1*64)) \ PREFETCHW0(MEM(RCX,R12,2,0*64)) \ PREFETCHW0(MEM(RCX,R12,2,1*64)) \ PREFETCHW0(MEM(RCX,R13,1,0*64)) \ PREFETCHW0(MEM(RCX,R13,1,1*64)) \ PREFETCHW0(MEM(RCX,R12,4,0*64)) \ PREFETCHW0(MEM(RCX,R12,4,1*64)) \ PREFETCHW0(MEM(RCX,R14,1,0*64)) \ PREFETCHW0(MEM(RCX,R14,1,1*64)) \ PREFETCHW0(MEM(RCX,R13,2,0*64)) \ PREFETCHW0(MEM(RCX,R13,2,1*64)) \ PREFETCHW0(MEM(RCX,R15,1,0*64)) \ PREFETCHW0(MEM(RCX,R15,1,1*64)) \ PREFETCHW0(MEM(RDX, 0*64)) \ PREFETCHW0(MEM(RDX, 1*64)) \ PREFETCHW0(MEM(RDX,R12,1,0*64)) \ PREFETCHW0(MEM(RDX,R12,1,1*64)) \ PREFETCHW0(MEM(RDX,R12,2,0*64)) \ PREFETCHW0(MEM(RDX,R12,2,1*64)) \ PREFETCHW0(MEM(RDX,R13,1,0*64)) \ PREFETCHW0(MEM(RDX,R13,1,1*64)) // // n: index in unrolled loop // // a: ZMM register to load into // b: ZMM register to read from // // ...: addressing for A, except for offset // #define SUBITER(n) \ \ PREFETCH_A_L1(n, 0) \ \ VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 0)*8)) \ VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 1)*8)) \ VFMADD231PD(ZMM( 8), ZMM(0), ZMM(3)) \ VFMADD231PD(ZMM( 9), ZMM(1), ZMM(3)) \ VFMADD231PD(ZMM(10), ZMM(0), ZMM(4)) \ VFMADD231PD(ZMM(11), ZMM(1), ZMM(4)) \ \ VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 2)*8)) \ VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 3)*8)) \ VFMADD231PD(ZMM(12), ZMM(0), ZMM(3)) \ VFMADD231PD(ZMM(13), ZMM(1), ZMM(3)) \ VFMADD231PD(ZMM(14), ZMM(0), ZMM(4)) \ VFMADD231PD(ZMM(15), ZMM(1), ZMM(4)) \ \ VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 4)*8)) \ VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 5)*8)) \ VFMADD231PD(ZMM(16), ZMM(0), ZMM(3)) \ VFMADD231PD(ZMM(17), ZMM(1), ZMM(3)) \ VFMADD231PD(ZMM(18), ZMM(0), ZMM(4)) \ VFMADD231PD(ZMM(19), ZMM(1), ZMM(4)) \ \ PREFETCH_A_L1(n, 1) \ \ VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 6)*8)) \ VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 7)*8)) \ VFMADD231PD(ZMM(20), ZMM(0), ZMM(3)) \ VFMADD231PD(ZMM(21), ZMM(1), ZMM(3)) \ VFMADD231PD(ZMM(22), ZMM(0), ZMM(4)) \ VFMADD231PD(ZMM(23), ZMM(1), 
ZMM(4)) \ \ VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+ 8)*8)) \ VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+ 9)*8)) \ VFMADD231PD(ZMM(24), ZMM(0), ZMM(3)) \ VFMADD231PD(ZMM(25), ZMM(1), ZMM(3)) \ VFMADD231PD(ZMM(26), ZMM(0), ZMM(4)) \ VFMADD231PD(ZMM(27), ZMM(1), ZMM(4)) \ \ VBROADCASTSD(ZMM(3), MEM(RBX,(12*n+10)*8)) \ VBROADCASTSD(ZMM(4), MEM(RBX,(12*n+11)*8)) \ VFMADD231PD(ZMM(28), ZMM(0), ZMM(3)) \ VFMADD231PD(ZMM(29), ZMM(1), ZMM(3)) \ VFMADD231PD(ZMM(30), ZMM(0), ZMM(4)) \ VFMADD231PD(ZMM(31), ZMM(1), ZMM(4)) \ \ VMOVAPD(ZMM(0), MEM(RAX,(16*n+0)*8)) \ VMOVAPD(ZMM(1), MEM(RAX,(16*n+8)*8)) //This is an array used for the scatter/gather instructions. static int64_t offsets[16] __attribute__((aligned(64))) = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}; void bli_dgemm_skx_asm_16x12_l2 ( dim_t m, dim_t n, dim_t k_, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c_, inc_t cs_c_, auxinfo_t* data, cntx_t* restrict cntx ) { (void)data; (void)cntx; int64_t k = k_; int64_t rs_c = rs_c_; int64_t cs_c = cs_c_; GEMM_UKR_SETUP_CT( d, 16, 12, false ); BEGIN_ASM() VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers VMOVAPD(YMM( 7), YMM(8)) VMOVAPD(YMM( 9), YMM(8)) VMOVAPD(YMM(10), YMM(8)) MOV(RSI, VAR(k)) //loop index VMOVAPD(YMM(11), YMM(8)) MOV(RAX, VAR(a)) //load address of a VMOVAPD(YMM(12), YMM(8)) MOV(RBX, VAR(b)) //load address of b VMOVAPD(YMM(13), YMM(8)) MOV(RCX, VAR(c)) //load address of c VMOVAPD(YMM(14), YMM(8)) VMOVAPD(YMM(15), YMM(8)) VMOVAPD(ZMM(0), MEM(RAX, 0*8)) //pre-load a VMOVAPD(YMM(16), YMM(8)) VMOVAPD(ZMM(1), MEM(RAX, 8*8)) //pre-load a VMOVAPD(YMM(17), YMM(8)) VMOVAPD(YMM(18), YMM(8)) VMOVAPD(YMM(19), YMM(8)) MOV(R12, VAR(cs_c)) //cs_c VMOVAPD(YMM(20), YMM(8)) LEA(R13, MEM(R12,R12,2)) //*3 VMOVAPD(YMM(21), YMM(8)) LEA(R14, MEM(R12,R12,4)) //*5 VMOVAPD(YMM(22), YMM(8)) LEA(R15, MEM(R14,R12,2)) //*7 VMOVAPD(YMM(23), YMM(8)) LEA(RDX, MEM(RCX,R12,8)) //c + 8*cs_c VMOVAPD(YMM(24), YMM(8)) 
VMOVAPD(YMM(25), YMM(8)) MOV(R8, IMM(16*8)) //mr*sizeof(double) VMOVAPD(YMM(26), YMM(8)) MOV(R9, IMM(12*8)) //nr*sizeof(double) VMOVAPD(YMM(27), YMM(8)) VMOVAPD(YMM(28), YMM(8)) LEA(RAX, MEM(RAX,R8,1)) //adjust a for pre-load VMOVAPD(YMM(29), YMM(8)) VMOVAPD(YMM(30), YMM(8)) VMOVAPD(YMM(31), YMM(8)) TEST(RSI, RSI) JZ(POSTACCUM) #ifdef PREFETCH_A_BEFORE PREFETCH(0, MEM(RAX,0*64)) PREFETCH(0, MEM(RAX,1*64)) PREFETCH(0, MEM(RAX,2*64)) PREFETCH(0, MEM(RAX,3*64)) PREFETCH(0, MEM(RAX,4*64)) PREFETCH(0, MEM(RAX,5*64)) PREFETCH(0, MEM(RAX,6*64)) PREFETCH(0, MEM(RAX,7*64)) #endif #ifdef PREFETCH_B_BEFORE PREFETCH(0, MEM(RBX,0*64)) PREFETCH(0, MEM(RBX,1*64)) PREFETCH(0, MEM(RBX,2*64)) PREFETCH(0, MEM(RBX,3*64)) PREFETCH(0, MEM(RBX,4*64)) PREFETCH(0, MEM(RBX,5*64)) #endif PREFETCH_C_L2 MOV(RDI, RSI) AND(RSI, IMM(3)) SAR(RDI, IMM(2)) SUB(RDI, IMM(0+TAIL_NITER)) JLE(K_SMALL) LOOP_ALIGN LABEL(MAIN_LOOP) PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8)) PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+64)) SUBITER(0) PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+128)) SUBITER(1) PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+192)) PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+256)) SUBITER(2) PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+320)) SUBITER(3) LEA(RAX, MEM(RAX,R8,4)) LEA(RBX, MEM(RBX,R9,4)) DEC(RDI) JNZ(MAIN_LOOP) LABEL(K_SMALL) PREFETCH_C_L1 ADD(RDI, IMM(0+TAIL_NITER)) JZ(TAIL_LOOP) LOOP_ALIGN LABEL(SMALL_LOOP) PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8)) PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+64)) SUBITER(0) PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+128)) SUBITER(1) PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+192)) PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+256)) SUBITER(2) PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+320)) SUBITER(3) LEA(RAX, MEM(RAX,R8,4)) LEA(RBX, MEM(RBX,R9,4)) DEC(RDI) JNZ(SMALL_LOOP) TEST(RSI, RSI) JZ(POSTACCUM) LOOP_ALIGN LABEL(TAIL_LOOP) PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8)) PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*8+64)) SUBITER(0) ADD(RAX, 
R8) ADD(RBX, R9) DEC(RSI) JNZ(TAIL_LOOP) LABEL(POSTACCUM) #ifdef PREFETCH_A_AFTER MOV(R8, VAR(a)) PREFETCH(0, MEM(R8,0*64)) PREFETCH(0, MEM(R8,1*64)) PREFETCH(0, MEM(R8,2*64)) PREFETCH(0, MEM(R8,3*64)) PREFETCH(0, MEM(R8,4*64)) PREFETCH(0, MEM(R8,5*64)) PREFETCH(0, MEM(R8,6*64)) PREFETCH(0, MEM(R8,7*64)) #endif #ifdef PREFETCH_B_AFTER MOV(R9, VAR(b)) PREFETCH(0, MEM(R9,0*64)) PREFETCH(0, MEM(R9,1*64)) PREFETCH(0, MEM(R9,2*64)) PREFETCH(0, MEM(R9,3*64)) PREFETCH(0, MEM(R9,4*64)) PREFETCH(0, MEM(R9,5*64)) #endif MOV(RAX, VAR(alpha)) MOV(RBX, VAR(beta)) VBROADCASTSD(ZMM(0), MEM(RAX)) VBROADCASTSD(ZMM(1), MEM(RBX)) MOV(RAX, VAR(cs_c)) LEA(RAX, MEM(,RAX,8)) VCOMISD(XMM(1), XMM(7)) JE(COLSTORBZ) UPDATE_C( 8, 9,10,11) UPDATE_C(12,13,14,15) UPDATE_C(16,17,18,19) UPDATE_C(20,21,22,23) UPDATE_C(24,25,26,27) UPDATE_C(28,29,30,31) JMP(END) LABEL(COLSTORBZ) UPDATE_C_BZ( 8, 9,10,11) UPDATE_C_BZ(12,13,14,15) UPDATE_C_BZ(16,17,18,19) UPDATE_C_BZ(20,21,22,23) UPDATE_C_BZ(24,25,26,27) UPDATE_C_BZ(28,29,30,31) LABEL(END) VZEROUPPER() END_ASM( : // output operands : // input operands [k] "m" (k), [a] "m" (a), [b] "m" (b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c) : // register clobber list "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory" ) GEMM_UKR_FLUSH_CT( d ); } cython-blis-0.9.1/blis/_src/kernels/skx/3/bli_dgemm_skx_asm_16x14.c000066400000000000000000000264551427272030600247370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "bli_x86_asm_macros.h" #define A_L1_PREFETCH_DIST 4 // in units of k iterations #define B_L1_PREFETCH_DIST 4 // e.g. 4 k iterations ~= 56 cycles #define TAIL_NITER 5 // in units of 4x unrolled k iterations // e.g. 
5 -> 4*5 k iterations ~= 280 cycles #define PREFETCH_A_L1(n, k) \ PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*16*8 + (2*n+k)*64)) #define PREFETCH_B_L1(n, k) \ PREFETCH(0, MEM(RBX, B_L1_PREFETCH_DIST*14*8 + (2*n+k)*56)) #define LOOP_ALIGN ALIGN32 #define UPDATE_C(R1,R2) \ \ VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ VFMADD231PD(ZMM(R1), ZMM(1), MEM(RCX)) \ VFMADD231PD(ZMM(R2), ZMM(1), MEM(RCX,64)) \ VMOVUPD(MEM(RCX), ZMM(R1)) \ VMOVUPD(MEM(RCX,64), ZMM(R2)) \ LEA(RCX, MEM(RCX,RBX,1)) #define UPDATE_C_BZ(R1,R2) \ \ VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \ VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \ VMOVUPD(MEM(RCX), ZMM(R1)) \ VMOVUPD(MEM(RCX,64), ZMM(R2)) \ LEA(RCX, MEM(RCX,RBX,1)) #define UPDATE_C_COL_SCATTERED(R1,R2) \ \ KXNORW(K(1), K(0), K(0)) \ KXNORW(K(2), K(0), K(0)) \ KXNORW(K(3), K(0), K(0)) \ KXNORW(K(4), K(0), K(0)) \ VGATHERQPD(ZMM(0) MASK_K(1), MEM(RCX,ZMM(2),1)) \ VFMADD231PD(ZMM(R1), ZMM(0), ZMM(1)) \ VGATHERQPD(ZMM(0) MASK_K(2), MEM(RCX,ZMM(3),1)) \ VFMADD231PD(ZMM(R2), ZMM(0), ZMM(1)) \ VSCATTERQPD(MEM(RCX,ZMM(2),1) MASK_K(3), ZMM(R1)) \ VSCATTERQPD(MEM(RCX,ZMM(3),1) MASK_K(4), ZMM(R2)) \ LEA(RCX, MEM(RCX,RBX,1)) #define UPDATE_C_BZ_COL_SCATTERED(R1,R2) \ \ KXNORW(K(1), K(0), K(0)) \ KXNORW(K(2), K(0), K(0)) \ VSCATTERQPD(MEM(RCX,ZMM(2),1) MASK_K(1), ZMM(R1)) \ VSCATTERQPD(MEM(RCX,ZMM(3),1) MASK_K(2), ZMM(R2)) \ LEA(RCX, MEM(RCX,RBX,1)) #define SUBITER(n) \ \ PREFETCH_A_L1(n, 0) \ \ VBROADCASTSD(ZMM(2), MEM(RBX,(14*n+ 0)*8)) \ VBROADCASTSD(ZMM(3), MEM(RBX,(14*n+ 1)*8)) \ VFMADD231PD(ZMM( 4), ZMM(0), ZMM(2)) \ VFMADD231PD(ZMM( 5), ZMM(1), ZMM(2)) \ VFMADD231PD(ZMM( 6), ZMM(0), ZMM(3)) \ VFMADD231PD(ZMM( 7), ZMM(1), ZMM(3)) \ \ VBROADCASTSD(ZMM(2), MEM(RBX,(14*n+ 2)*8)) \ VBROADCASTSD(ZMM(3), MEM(RBX,(14*n+ 3)*8)) \ VFMADD231PD(ZMM( 8), ZMM(0), ZMM(2)) \ VFMADD231PD(ZMM( 9), ZMM(1), ZMM(2)) \ VFMADD231PD(ZMM(10), ZMM(0), ZMM(3)) \ VFMADD231PD(ZMM(11), ZMM(1), ZMM(3)) \ \ PREFETCH_B_L1(n, 0) \ \ VBROADCASTSD(ZMM(2), MEM(RBX,(14*n+ 4)*8)) \ 
VBROADCASTSD(ZMM(3), MEM(RBX,(14*n+ 5)*8)) \ VFMADD231PD(ZMM(12), ZMM(0), ZMM(2)) \ VFMADD231PD(ZMM(13), ZMM(1), ZMM(2)) \ VFMADD231PD(ZMM(14), ZMM(0), ZMM(3)) \ VFMADD231PD(ZMM(15), ZMM(1), ZMM(3)) \ \ VBROADCASTSD(ZMM(2), MEM(RBX,(14*n+ 6)*8)) \ VBROADCASTSD(ZMM(3), MEM(RBX,(14*n+ 7)*8)) \ VFMADD231PD(ZMM(16), ZMM(0), ZMM(2)) \ VFMADD231PD(ZMM(17), ZMM(1), ZMM(2)) \ VFMADD231PD(ZMM(18), ZMM(0), ZMM(3)) \ VFMADD231PD(ZMM(19), ZMM(1), ZMM(3)) \ \ PREFETCH_A_L1(n, 1) \ \ VBROADCASTSD(ZMM(2), MEM(RBX,(14*n+ 8)*8)) \ VBROADCASTSD(ZMM(3), MEM(RBX,(14*n+ 9)*8)) \ VFMADD231PD(ZMM(20), ZMM(0), ZMM(2)) \ VFMADD231PD(ZMM(21), ZMM(1), ZMM(2)) \ VFMADD231PD(ZMM(22), ZMM(0), ZMM(3)) \ VFMADD231PD(ZMM(23), ZMM(1), ZMM(3)) \ \ VBROADCASTSD(ZMM(2), MEM(RBX,(14*n+10)*8)) \ VBROADCASTSD(ZMM(3), MEM(RBX,(14*n+11)*8)) \ VFMADD231PD(ZMM(24), ZMM(0), ZMM(2)) \ VFMADD231PD(ZMM(25), ZMM(1), ZMM(2)) \ VFMADD231PD(ZMM(26), ZMM(0), ZMM(3)) \ VFMADD231PD(ZMM(27), ZMM(1), ZMM(3)) \ \ PREFETCH_B_L1(n, 1) \ \ VBROADCASTSD(ZMM(2), MEM(RBX,(14*n+12)*8)) \ VBROADCASTSD(ZMM(3), MEM(RBX,(14*n+13)*8)) \ VFMADD231PD(ZMM(28), ZMM(0), ZMM(2)) \ VFMADD231PD(ZMM(29), ZMM(1), ZMM(2)) \ VFMADD231PD(ZMM(30), ZMM(0), ZMM(3)) \ VFMADD231PD(ZMM(31), ZMM(1), ZMM(3)) \ \ VMOVAPD(ZMM(0), MEM(RAX,(16*n+0)*8)) \ VMOVAPD(ZMM(1), MEM(RAX,(16*n+8)*8)) //This is an array used for the scatter/gather instructions. 
// NOTE(review): nothing in the kernel below reads this table; the
// *_COL_SCATTERED macros take their index vectors from ZMM(2)/ZMM(3).
static int64_t offsets[16] __attribute__((aligned(64))) =
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};

/*
 * Double-precision 16x14 GEMM micro-kernel written in AVX-512 inline
 * assembly. It accumulates the product of the packed panels a and b over
 * k_ steps in ZMM4..ZMM31 and then writes
 *     C := beta * C + alpha * (A * B)
 * one 16-element column at a time, stepping through C by cs_c elements.
 *
 * m, n, rs_c_   are not referenced by the assembly itself; they are visible
 *               to GEMM_UKR_SETUP_CT / GEMM_UKR_FLUSH_CT (defined elsewhere;
 *               presumably they redirect edge or non-column-stored tiles
 *               through a temporary -- TODO confirm).
 * k_            number of rank-1 updates to perform.
 * alpha, beta   pointers to the scalars; beta == 0 selects the store-only
 *               path, which never reads C.
 * a, b          packed operand panels; a is read with aligned 64-byte loads
 *               (VMOVAPD).
 * c, cs_c_      output tile and its column stride in elements.
 * data, cntx    unused.
 */
void bli_dgemm_skx_asm_16x14
     (
       dim_t               m,
       dim_t               n,
       dim_t               k_,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c_, inc_t cs_c_,
       auxinfo_t*          data,
       cntx_t*    restrict cntx
     )
{
    (void)data;
    (void)cntx;

    // 64-bit copies so the "m" operands below can be loaded with 64-bit MOVs.
    int64_t k = k_;
    int64_t rs_c = rs_c_;
    int64_t cs_c = cs_c_;

    GEMM_UKR_SETUP_CT( d, 16, 14, false );

    BEGIN_ASM()

    VXORPD(YMM( 4), YMM( 4), YMM( 4)) //clear out registers
    VXORPD(YMM( 5), YMM( 5), YMM( 5))
    VXORPD(YMM( 6), YMM( 6), YMM( 6))
    VXORPD(YMM( 7), YMM( 7), YMM( 7))
    VXORPD(YMM( 8), YMM( 8), YMM( 8))
    VXORPD(YMM( 9), YMM( 9), YMM( 9))
    VXORPD(YMM(10), YMM(10), YMM(10))
    VXORPD(YMM(11), YMM(11), YMM(11))
    VXORPD(YMM(12), YMM(12), YMM(12))
    VXORPD(YMM(13), YMM(13), YMM(13))
    VXORPD(YMM(14), YMM(14), YMM(14))
    VXORPD(YMM(15), YMM(15), YMM(15))
    VXORPD(YMM(16), YMM(16), YMM(16))
    VXORPD(YMM(17), YMM(17), YMM(17))
    VXORPD(YMM(18), YMM(18), YMM(18))
    VXORPD(YMM(19), YMM(19), YMM(19))
    VXORPD(YMM(20), YMM(20), YMM(20))
    VXORPD(YMM(21), YMM(21), YMM(21))
    VXORPD(YMM(22), YMM(22), YMM(22))
    VXORPD(YMM(23), YMM(23), YMM(23))
    VXORPD(YMM(24), YMM(24), YMM(24))
    VXORPD(YMM(25), YMM(25), YMM(25))
    VXORPD(YMM(26), YMM(26), YMM(26))
    VXORPD(YMM(27), YMM(27), YMM(27))
    VXORPD(YMM(28), YMM(28), YMM(28))
    VXORPD(YMM(29), YMM(29), YMM(29))
    VXORPD(YMM(30), YMM(30), YMM(30))
    VXORPD(YMM(31), YMM(31), YMM(31))

    MOV(RSI, VAR(k)) //loop index
    MOV(RAX, VAR(a)) //load address of a
    MOV(RBX, VAR(b)) //load address of b
    MOV(RCX, VAR(c)) //load address of c

    // RDX = 3k -> 12k -> 14k, then b + 14*k*8 - 128
    LEA(RDX, MEM(RSI,RSI,2))
    LEA(RDX, MEM(,RDX,4))
    LEA(RDX, MEM(RDX,RSI,2)) // 14*k
    LEA(RDX, MEM(RBX,RDX,8,-128)) // b_next
    LEA(R9, MEM(RCX,63)) // c for prefetching

    VMOVAPD(ZMM(0), MEM(RAX, 0*8)) //pre-load a
    VMOVAPD(ZMM(1), MEM(RAX, 8*8)) //pre-load a
    LEA(RAX, MEM(RAX,16*8)) //adjust a for pre-load

    // Convert both strides of C from elements to bytes.
    MOV(R12, VAR(rs_c))
    MOV(R10, VAR(cs_c))
    LEA(R12, MEM(,R12,8))
    LEA(R10, MEM(,R10,8))

    // RDI = k / 4 (iterations of the 4x unrolled loops), RSI = k % 4.
    MOV(RDI, RSI)
    AND(RSI, IMM(3))
    SAR(RDI, IMM(2))

    // The unrolled iterations are split over three loops: LOOP1 takes all
    // but the last 14+TAIL_NITER, LOOP2 the next 14 (it additionally
    // prefetches C through R9, advancing it by cs_c bytes per iteration),
    // and LOOP3 the final TAIL_NITER.
    SUB(RDI, IMM(14+TAIL_NITER))
    JLE(K_LE_80)

        LOOP_ALIGN
        LABEL(LOOP1)

            SUBITER(0)
            PREFETCH(1, MEM(RDX))
            SUBITER(1)
            // The flags set here are consumed by the JNZ at the bottom; the
            // instructions in between (FMA, loads, prefetch, LEA) do not
            // modify them.
            SUB(RDI, IMM(1))
            SUBITER(2)
            PREFETCH(1, MEM(RDX,64))
            SUBITER(3)

            LEA(RAX, MEM(RAX,4*16*8))
            LEA(RBX, MEM(RBX,4*14*8))
            LEA(RDX, MEM(RDX,16*8))

        JNZ(LOOP1)

    LABEL(K_LE_80)

    ADD(RDI, IMM(14))
    JLE(K_LE_24)

        LOOP_ALIGN
        LABEL(LOOP2)

            PREFETCH(0, MEM(R9))
            SUBITER(0)
            PREFETCH(1, MEM(RDX))
            SUBITER(1)
            PREFETCH(0, MEM(R9,64))
            SUB(RDI, IMM(1))
            SUBITER(2)
            PREFETCH(1, MEM(RDX,64))
            SUBITER(3)

            LEA(RAX, MEM(RAX,4*16*8))
            LEA(RBX, MEM(RBX,4*14*8))
            LEA(RDX, MEM(RDX,16*8))
            LEA(R9, MEM(R9,R10,1))

        JNZ(LOOP2)

    LABEL(K_LE_24)

    ADD(RDI, IMM(0+TAIL_NITER))
    JLE(TAIL)

        LOOP_ALIGN
        LABEL(LOOP3)

            SUBITER(0)
            PREFETCH(1, MEM(RDX))
            SUBITER(1)
            SUB(RDI, IMM(1))
            SUBITER(2)
            PREFETCH(1, MEM(RDX,64))
            SUBITER(3)

            LEA(RAX, MEM(RAX,4*16*8))
            LEA(RBX, MEM(RBX,4*14*8))
            LEA(RDX, MEM(RDX,16*8))

        JNZ(LOOP3)

    LABEL(TAIL)

    // Remaining k % 4 single steps.
    TEST(RSI, RSI)
    JZ(POSTACCUM)

        LOOP_ALIGN
        LABEL(TAIL_LOOP)

            SUB(RSI, IMM(1))
            SUBITER(0)

            LEA(RAX, MEM(RAX,16*8))
            LEA(RBX, MEM(RBX,14*8))

        JNZ(TAIL_LOOP)

    LABEL(POSTACCUM)

    // ZMM(0) = alpha, ZMM(1) = beta, ZMM(2) = 0 for the beta == 0 test.
    MOV(RAX, VAR(alpha))
    MOV(RBX, VAR(beta))
    VBROADCASTSD(ZMM(0), MEM(RAX))
    VBROADCASTSD(ZMM(1), MEM(RBX))

    VXORPD(YMM(2), YMM(2), YMM(2))

    // RAX = rs_c in bytes, RBX = cs_c in bytes (the step used by UPDATE_C*).
    MOV(RAX, R12)
    MOV(RBX, R10)

    VCOMISD(XMM(1), XMM(2))
    JE(COLSTORBZ)

        UPDATE_C( 4, 5)
        UPDATE_C( 6, 7)
        UPDATE_C( 8, 9)
        UPDATE_C(10,11)
        UPDATE_C(12,13)
        UPDATE_C(14,15)
        UPDATE_C(16,17)
        UPDATE_C(18,19)
        UPDATE_C(20,21)
        UPDATE_C(22,23)
        UPDATE_C(24,25)
        UPDATE_C(26,27)
        UPDATE_C(28,29)
        UPDATE_C(30,31)

    JMP(END)
    LABEL(COLSTORBZ)

        UPDATE_C_BZ( 4, 5)
        UPDATE_C_BZ( 6, 7)
        UPDATE_C_BZ( 8, 9)
        UPDATE_C_BZ(10,11)
        UPDATE_C_BZ(12,13)
        UPDATE_C_BZ(14,15)
        UPDATE_C_BZ(16,17)
        UPDATE_C_BZ(18,19)
        UPDATE_C_BZ(20,21)
        UPDATE_C_BZ(22,23)
        UPDATE_C_BZ(24,25)
        UPDATE_C_BZ(26,27)
        UPDATE_C_BZ(28,29)
        UPDATE_C_BZ(30,31)

    LABEL(END)

    VZEROUPPER()

    END_ASM
    (
        : // output operands
        : // input operands
          [k]     "m" (k),
          [a]     "m" (a),
          [b]     "m" (b),
          [alpha] "m" (alpha),
          [beta]  "m" (beta),
          [c]     "m" (c),
          [rs_c]  "m" (rs_c),
          [cs_c]  "m" (cs_c)
        : // register clobber list
          "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
          "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
          "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
          "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
          "zmm30", "zmm31", "memory"
    )

    GEMM_UKR_FLUSH_CT( d );
}
cython-blis-0.9.1/blis/_src/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c000066400000000000000000000403171427272030600253400ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
   IN NO EVENT SHALL THE UNIVERSITY
   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

#define BLIS_ASM_SYNTAX_INTEL
#include "bli_x86_asm_macros.h"

#define CACHELINE_SIZE 64 //size of cache line in bytes

#define A_L1_PREFETCH_DIST 4 //should be multiple of 2

/* NOTE(review): the original remark here said that the pointer of B is
   moved ahead by one k iteration before the loop starts, so B is prefetched
   3 k iterations ahead. In the kernel below it is the A pointer (RAX) that
   is advanced by one k-step after the pre-load; RBX is not adjusted.
   Confirm which operand the remark was meant for. */
#define B_L1_PREFETCH_DIST 4

#define TAIL_NITER 8

/* Prefetch one cache line of A per call: (n, k) select line 2*n+k past a
 * base of A_L1_PREFETCH_DIST k-steps of 32 floats. SUBITER issues it twice
 * (k = 0, 1), i.e. two cache lines = 32 floats per subiteration.
 * NOTE(review): the original comment described this as prefetching B and
 * called 32 "NR"; the kernel below sets up 32 as MR and addresses A
 * through RAX. */
#define PREFETCH_A_L1(n, k) \
    PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST*32*4 + (2*n+k) * CACHELINE_SIZE))

/* Alignment directive placed in front of every loop label. */
#define LOOP_ALIGN ALIGN16

/*
 * Update two 32-float columns of C: ZMM(R1):ZMM(R2) go to [RCX] and
 * ZMM(R3):ZMM(R4) go to [RCX + RAX]. Accumulators are scaled by ZMM(0)
 * (alpha in the kernel), ZMM(1) * C is added (beta in the kernel), the
 * result is stored, and RCX advances by 2*RAX.
 */
#define UPDATE_C(R1,R2,R3,R4) \
\
    VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
    VFMADD231PS(ZMM(R1), ZMM(1), MEM(RCX,0*64)) \
    VFMADD231PS(ZMM(R2), ZMM(1), MEM(RCX,1*64)) \
    VFMADD231PS(ZMM(R3), ZMM(1), MEM(RCX,RAX,1,0*64)) \
    VFMADD231PS(ZMM(R4), ZMM(1), MEM(RCX,RAX,1,1*64)) \
    VMOVUPS(MEM(RCX,0*64), ZMM(R1)) \
    VMOVUPS(MEM(RCX,1*64), ZMM(R2)) \
    VMOVUPS(MEM(RCX,RAX,1,0*64), ZMM(R3)) \
    VMOVUPS(MEM(RCX,RAX,1,1*64), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,2))

/* Store-only variant of UPDATE_C: C is never read (beta == 0 path). */
#define UPDATE_C_BZ(R1,R2,R3,R4) \
\
    VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
    VMOVUPS(MEM(RCX,0*64), ZMM(R1)) \
    VMOVUPS(MEM(RCX,1*64), ZMM(R2)) \
    VMOVUPS(MEM(RCX,RAX,1,0*64), ZMM(R3)) \
    VMOVUPS(MEM(RCX,RAX,1,1*64), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,2))

/*
 * Gather/scatter variant of the C update. Each ZMM accumulator is scaled
 * by ZMM(0), split into two YMM halves (the upper half goes to YMM(5)),
 * combined with C gathered through the index vectors ZMM(2)/ZMM(3) from
 * base RCX (R1, R3) or RDX (R2, R4), and scattered back. YMM(5..7) are
 * scratch and K(1)..K(4) are re-armed before every group because gathers
 * and scatters consume their masks.
 * Not invoked by the kernel in this file.
 */
#define UPDATE_C_ROW_SCATTERED(R1,R2,R3,R4) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    KXNORW(K(3), K(0), K(0)) \
    KXNORW(K(4), K(0), K(0)) \
    VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R1), IMM(1)) \
    VGATHERQPS(YMM(6) MASK_K(1), MEM(RCX,ZMM(2),1)) \
    VGATHERQPS(YMM(7) MASK_K(2), MEM(RCX,ZMM(3),1)) \
    VFMADD231PS(YMM(R1), YMM(6), YMM(1)) \
    VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \
    VSCATTERQPS(MEM(RCX,ZMM(2),1) MASK_K(3), YMM(R1)) \
    VSCATTERQPS(MEM(RCX,ZMM(3),1) MASK_K(4), YMM( 5)) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    KXNORW(K(3), K(0), K(0)) \
    KXNORW(K(4), K(0), K(0)) \
    VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R2), IMM(1)) \
    VGATHERQPS(YMM(6) MASK_K(1), MEM(RDX,ZMM(2),1)) \
    VGATHERQPS(YMM(7) MASK_K(2), MEM(RDX,ZMM(3),1)) \
    VFMADD231PS(YMM(R2), YMM(6), YMM(1)) \
    VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \
    VSCATTERQPS(MEM(RDX,ZMM(2),1) MASK_K(3), YMM(R2)) \
    VSCATTERQPS(MEM(RDX,ZMM(3),1) MASK_K(4), YMM( 5)) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1)) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    KXNORW(K(3), K(0), K(0)) \
    KXNORW(K(4), K(0), K(0)) \
    VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R3), IMM(1)) \
    VGATHERQPS(YMM(6) MASK_K(1), MEM(RCX,ZMM(2),1)) \
    VGATHERQPS(YMM(7) MASK_K(2), MEM(RCX,ZMM(3),1)) \
    VFMADD231PS(YMM(R3), YMM(6), YMM(1)) \
    VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \
    VSCATTERQPS(MEM(RCX,ZMM(2),1) MASK_K(3), YMM(R3)) \
    VSCATTERQPS(MEM(RCX,ZMM(3),1) MASK_K(4), YMM( 5)) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    KXNORW(K(3), K(0), K(0)) \
    KXNORW(K(4), K(0), K(0)) \
    VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R4), IMM(1)) \
    VGATHERQPS(YMM(6) MASK_K(1), MEM(RDX,ZMM(2),1)) \
    VGATHERQPS(YMM(7) MASK_K(2), MEM(RDX,ZMM(3),1)) \
    VFMADD231PS(YMM(R4), YMM(6), YMM(1)) \
    VFMADD231PS(YMM( 5), YMM(7), YMM(1)) \
    VSCATTERQPS(MEM(RDX,ZMM(2),1) MASK_K(3), YMM(R4)) \
    VSCATTERQPS(MEM(RDX,ZMM(3),1) MASK_K(4), YMM( 5)) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1))

/* Scatter-only variant (no read of C) of UPDATE_C_ROW_SCATTERED.
   Not invoked by the kernel in this file. */
#define UPDATE_C_BZ_ROW_SCATTERED(R1,R2,R3,R4) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R1), IMM(1)) \
    VSCATTERQPS(MEM(RCX,ZMM(2),1) MASK_K(1), YMM(R1)) \
    VSCATTERQPS(MEM(RCX,ZMM(3),1) MASK_K(2), YMM( 5)) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R2), IMM(1)) \
    VSCATTERQPS(MEM(RDX,ZMM(2),1) MASK_K(1), YMM(R2)) \
    VSCATTERQPS(MEM(RDX,ZMM(3),1) MASK_K(2), YMM( 5)) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1)) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R3), IMM(1)) \
    VSCATTERQPS(MEM(RCX,ZMM(2),1) MASK_K(1), YMM(R3)) \
    VSCATTERQPS(MEM(RCX,ZMM(3),1) MASK_K(2), YMM( 5)) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
    VEXTRACTF64X4(YMM(5), ZMM(R4), IMM(1)) \
    VSCATTERQPS(MEM(RDX,ZMM(2),1) MASK_K(1), YMM(R4)) \
    VSCATTERQPS(MEM(RDX,ZMM(3),1) MASK_K(2), YMM( 5)) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1))

/* Optional L2 prefetch of the C tile: if PREFETCH_C_L2 is defined by the
   build it is redefined as the prefetch sequence below, otherwise it
   expands to nothing. The kernel loads RDX with RCX + 8*R12 and R12..R15
   with 1x, 3x, 5x and 7x cs_c.
   NOTE(review): R12 is cs_c in elements, never scaled by sizeof(float),
   yet it is used here as a byte offset; the C update path scales cs_c by 4.
   As written these prefetches touch addresses cs_c bytes apart rather than
   cs_c floats apart -- confirm whether that is intended. */
#ifdef PREFETCH_C_L2
#undef PREFETCH_C_L2
#define PREFETCH_C_L2 \
\
    PREFETCH(1, MEM(RCX, 0*64)) \
    PREFETCH(1, MEM(RCX, 1*64)) \
    \
    PREFETCH(1, MEM(RCX,R12,1,0*64)) \
    PREFETCH(1, MEM(RCX,R12,1,1*64)) \
    \
    PREFETCH(1, MEM(RCX,R12,2,0*64)) \
    PREFETCH(1, MEM(RCX,R12,2,1*64)) \
    \
    PREFETCH(1, MEM(RCX,R13,1,0*64)) \
    PREFETCH(1, MEM(RCX,R13,1,1*64)) \
    \
    PREFETCH(1, MEM(RCX,R12,4,0*64)) \
    PREFETCH(1, MEM(RCX,R12,4,1*64)) \
    \
    PREFETCH(1, MEM(RCX,R14,1,0*64)) \
    PREFETCH(1, MEM(RCX,R14,1,1*64)) \
    \
    PREFETCH(1, MEM(RCX,R13,2,0*64)) \
    PREFETCH(1, MEM(RCX,R13,2,1*64)) \
    \
    PREFETCH(1, MEM(RCX,R15,1,0*64)) \
    PREFETCH(1, MEM(RCX,R15,1,1*64)) \
    \
    PREFETCH(1, MEM(RDX, 0*64)) \
    PREFETCH(1, MEM(RDX, 1*64)) \
    \
    PREFETCH(1, MEM(RDX,R12,1,0*64)) \
    PREFETCH(1, MEM(RDX,R12,1,1*64)) \
    \
    PREFETCH(1, MEM(RDX,R12,2,0*64)) \
    PREFETCH(1, MEM(RDX,R12,2,1*64)) \
    \
    PREFETCH(1, MEM(RDX,R13,1,0*64)) \
    PREFETCH(1, MEM(RDX,R13,1,1*64))

#else
#undef PREFETCH_C_L2
#define PREFETCH_C_L2
#endif

/* Write-intent L1 prefetch of the same 12 x 2 cache lines of C, issued by
   the kernel between the main loop and the final TAIL_NITER iterations.
   The addressing caveat noted for PREFETCH_C_L2 applies here as well. */
#define PREFETCH_C_L1 \
\
    PREFETCHW0(MEM(RCX, 0*64)) \
    PREFETCHW0(MEM(RCX, 1*64)) \
    PREFETCHW0(MEM(RCX,R12,1,0*64)) \
    PREFETCHW0(MEM(RCX,R12,1,1*64)) \
    PREFETCHW0(MEM(RCX,R12,2,0*64)) \
    PREFETCHW0(MEM(RCX,R12,2,1*64)) \
    PREFETCHW0(MEM(RCX,R13,1,0*64)) \
    PREFETCHW0(MEM(RCX,R13,1,1*64)) \
    PREFETCHW0(MEM(RCX,R12,4,0*64)) \
    PREFETCHW0(MEM(RCX,R12,4,1*64)) \
    PREFETCHW0(MEM(RCX,R14,1,0*64)) \
    PREFETCHW0(MEM(RCX,R14,1,1*64)) \
    PREFETCHW0(MEM(RCX,R13,2,0*64)) \
    PREFETCHW0(MEM(RCX,R13,2,1*64)) \
    PREFETCHW0(MEM(RCX,R15,1,0*64)) \
    PREFETCHW0(MEM(RCX,R15,1,1*64)) \
    PREFETCHW0(MEM(RDX, 0*64)) \
    PREFETCHW0(MEM(RDX, 1*64)) \
    PREFETCHW0(MEM(RDX,R12,1,0*64)) \
    PREFETCHW0(MEM(RDX,R12,1,1*64)) \
    PREFETCHW0(MEM(RDX,R12,2,0*64)) \
    PREFETCHW0(MEM(RDX,R12,2,1*64)) \
    PREFETCHW0(MEM(RDX,R13,1,0*64)) \
    PREFETCHW0(MEM(RDX,R13,1,1*64))

//
// n: index in unrolled loop
//
// One k-step of the 32x12 rank-1 update. ZMM(0)/ZMM(1) hold the current
// 32 floats of A; each of the 12 floats at RBX + (12*n + j)*4 is broadcast
// into ZMM(3) or ZMM(4) and multiplied into the accumulator pair
// ZMM(8+2j)/ZMM(9+2j); finally ZMM(0)/ZMM(1) are reloaded from
// RAX + (32*n)*4 for the following step.
// (The original header also listed parameters "a", "b" and "..." which
// this macro does not take.)
//
#define SUBITER(n) \
\
    PREFETCH_A_L1(n, 0) \
    \
    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 0)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 1)*4)) \
    VFMADD231PS(ZMM( 8), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM( 9), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(10), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(11), ZMM(1), ZMM(4)) \
    \
    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 2)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 3)*4)) \
    VFMADD231PS(ZMM(12), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(13), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(14), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(15), ZMM(1), ZMM(4)) \
    \
    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 4)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 5)*4)) \
    VFMADD231PS(ZMM(16), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(17), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(18), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(19), ZMM(1), ZMM(4)) \
    \
    PREFETCH_A_L1(n, 1) \
    \
    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 6)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 7)*4)) \
    VFMADD231PS(ZMM(20), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(21), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(22), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(23), ZMM(1), ZMM(4)) \
    \
    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+ 8)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+ 9)*4)) \
    VFMADD231PS(ZMM(24), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(25), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(26), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(27), ZMM(1), ZMM(4)) \
    \
    VBROADCASTSS(ZMM(3), MEM(RBX,(12*n+10)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RBX,(12*n+11)*4)) \
    VFMADD231PS(ZMM(28), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(29), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(30), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(31), ZMM(1), ZMM(4)) \
    \
    VMOVAPD(ZMM(0), MEM(RAX,(32*n+0)*4)) \
    VMOVAPD(ZMM(1), MEM(RAX,(32*n+16)*4))

//This is an array used for the scatter/gather instructions.
// NOTE(review): nothing in the kernel below reads this table.
static int64_t offsets[16] __attribute__((aligned(64))) =
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};

/*
 * Single-precision 32x12 GEMM micro-kernel written in AVX-512 inline
 * assembly. It accumulates the product of the packed panels a and b over
 * k_ steps in ZMM8..ZMM31 and then writes
 *     C := beta * C + alpha * (A * B)
 * two 32-float columns at a time, stepping through C by cs_c elements.
 *
 * m, n, rs_c_   are not referenced by the assembly itself; they are visible
 *               to GEMM_UKR_SETUP_CT / GEMM_UKR_FLUSH_CT (defined elsewhere;
 *               presumably they redirect edge or non-column-stored tiles
 *               through a temporary -- TODO confirm).
 * k_            number of rank-1 updates; k_ == 0 skips straight to the
 *               C update.
 * alpha, beta   pointers to the scalars; beta == 0 selects the store-only
 *               path, which never reads C.
 * a, b          packed operand panels; a is read with aligned 64-byte loads.
 * c, cs_c_      output tile and its column stride in elements.
 * data, cntx    unused.
 */
void bli_sgemm_skx_asm_32x12_l2
     (
       dim_t               m,
       dim_t               n,
       dim_t               k_,
       float*     restrict alpha,
       float*     restrict a,
       float*     restrict b,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c_, inc_t cs_c_,
       auxinfo_t*          data,
       cntx_t*    restrict cntx
     )
{
    (void)data;
    (void)cntx;

    // 64-bit copies so the "m" operands below can be loaded with 64-bit MOVs.
    int64_t k = k_;
    int64_t rs_c = rs_c_;
    int64_t cs_c = cs_c_;

    GEMM_UKR_SETUP_CT( s, 32, 12, false );

    BEGIN_ASM()

    // Register clearing is interleaved with the scalar setup. YMM(7) is
    // cleared as well: it serves as the zero operand of the beta test.
    VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers
    VMOVAPD(YMM( 7), YMM(8))
    VMOVAPD(YMM( 9), YMM(8))
    VMOVAPD(YMM(10), YMM(8))
    MOV(RSI, VAR(k)) //loop index
    VMOVAPD(YMM(11), YMM(8))
    MOV(RAX, VAR(a)) //load address of a
    VMOVAPD(YMM(12), YMM(8))
    MOV(RBX, VAR(b)) //load address of b
    VMOVAPD(YMM(13), YMM(8))
    MOV(RCX, VAR(c)) //load address of c
    VMOVAPD(YMM(14), YMM(8))
    VMOVAPD(YMM(15), YMM(8))
    VMOVAPD(ZMM(0), MEM(RAX, 0*4)) //pre-load a
    VMOVAPD(YMM(16), YMM(8))
    VMOVAPD(ZMM(1), MEM(RAX, 16*4)) //pre-load a
    VMOVAPD(YMM(17), YMM(8))
    VMOVAPD(YMM(18), YMM(8))
    VMOVAPD(YMM(19), YMM(8))
    MOV(R12, VAR(cs_c)) //cs_c
    VMOVAPD(YMM(20), YMM(8))
    LEA(R13, MEM(R12,R12,2)) //*3
    VMOVAPD(YMM(21), YMM(8))
    LEA(R14, MEM(R12,R12,4)) //*5
    VMOVAPD(YMM(22), YMM(8))
    LEA(R15, MEM(R14,R12,2)) //*7
    VMOVAPD(YMM(23), YMM(8))
    LEA(RDX, MEM(RCX,R12,8)) //c + 8*cs_c
    VMOVAPD(YMM(24), YMM(8))
    VMOVAPD(YMM(25), YMM(8))
    MOV(R8, IMM(32*4)) //mr*sizeof(float)
    VMOVAPD(YMM(26), YMM(8))
    MOV(R9, IMM(12*4)) //nr*sizeof(float)
    VMOVAPD(YMM(27), YMM(8))
    VMOVAPD(YMM(28), YMM(8))
    LEA(RAX, MEM(RAX,R8,1)) //adjust a for pre-load
    VMOVAPD(YMM(29), YMM(8))
    VMOVAPD(YMM(30), YMM(8))
    VMOVAPD(YMM(31), YMM(8))

    TEST(RSI, RSI)
    JZ(POSTACCUM)

#ifdef PREFETCH_A_BEFORE
    /* Prefetching 8 cache lines of A (4 iterations worth of data
       (32 (MR) x4 (sizeof(float)) x4 iter /64 = 8 cache lines) */
    PREFETCH(0, MEM(RAX,0*64))
    PREFETCH(0, MEM(RAX,1*64))
    PREFETCH(0, MEM(RAX,2*64))
    PREFETCH(0, MEM(RAX,3*64))
    PREFETCH(0, MEM(RAX,4*64))
    PREFETCH(0, MEM(RAX,5*64))
    PREFETCH(0, MEM(RAX,6*64))
    PREFETCH(0, MEM(RAX,7*64))
#endif

#ifdef PREFETCH_B_BEFORE
    /* Prefetching 3 cache lines of B (4 iterations worth of data
       (12 (NR) x 4 (sizeof(float)) x 4 iter /64 = 3 cache lines) */
    PREFETCH(0, MEM(RBX,0*64))
    PREFETCH(0, MEM(RBX,1*64))
    PREFETCH(0, MEM(RBX,2*64))
#endif

    PREFETCH_C_L2

    // RDI = k / 4 (iterations of the 4x unrolled loops), RSI = k % 4.
    MOV(RDI, RSI)
    AND(RSI, IMM(3))
    SAR(RDI, IMM(2))

    // MAIN_LOOP runs all but the last TAIL_NITER unrolled iterations.
    SUB(RDI, IMM(0+TAIL_NITER))
    JLE(K_SMALL)

    LOOP_ALIGN
    LABEL(MAIN_LOOP)

        PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4))
        SUBITER(0)
        PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+64))
        SUBITER(1)
        PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+128))
        SUBITER(2)
        SUBITER(3)

        LEA(RAX, MEM(RAX,R8,4))
        LEA(RBX, MEM(RBX,R9,4))

        DEC(RDI)

    JNZ(MAIN_LOOP)

    LABEL(K_SMALL)

    PREFETCH_C_L1

    // Restore the remaining unrolled-iteration count; when it is zero
    // (k < 4, and k != 0 was checked above) only single steps remain.
    ADD(RDI, IMM(0+TAIL_NITER))
    JZ(TAIL_LOOP)

    LOOP_ALIGN
    LABEL(SMALL_LOOP)

        PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4))
        SUBITER(0)
        PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+64))
        SUBITER(1)
        PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4+128))
        SUBITER(2)
        SUBITER(3)

        LEA(RAX, MEM(RAX,R8,4))
        LEA(RBX, MEM(RBX,R9,4))

        DEC(RDI)

    JNZ(SMALL_LOOP)

    TEST(RSI, RSI)
    JZ(POSTACCUM)

    // Remaining k % 4 single steps.
    LOOP_ALIGN
    LABEL(TAIL_LOOP)

        PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*12*4))
        SUBITER(0)

        ADD(RAX, R8)
        ADD(RBX, R9)

        DEC(RSI)

    JNZ(TAIL_LOOP)

    LABEL(POSTACCUM)

#ifdef PREFETCH_A_AFTER
    MOV(R8, VAR(a))
    PREFETCH(0, MEM(R8,0*64))
    PREFETCH(0, MEM(R8,1*64))
    PREFETCH(0, MEM(R8,2*64))
    PREFETCH(0, MEM(R8,3*64))
    PREFETCH(0, MEM(R8,4*64))
    PREFETCH(0, MEM(R8,5*64))
    PREFETCH(0, MEM(R8,6*64))
    PREFETCH(0, MEM(R8,7*64))
#endif

#ifdef PREFETCH_B_AFTER
    MOV(R9, VAR(b))
    PREFETCH(0, MEM(R9,0*64))
    PREFETCH(0, MEM(R9,1*64))
    PREFETCH(0, MEM(R9,2*64))
#endif

    // ZMM(0) = alpha, ZMM(1) = beta, RAX = cs_c in bytes.
    MOV(RAX, VAR(alpha))
    MOV(RBX, VAR(beta))
    VBROADCASTSS(ZMM(0), MEM(RAX))
    VBROADCASTSS(ZMM(1), MEM(RBX))

    MOV(RAX, VAR(cs_c))
    LEA(RAX, MEM(,RAX,4))

    // XMM(7) still holds the zero written during setup.
    VCOMISS(XMM(1), XMM(7))
    JE(COLSTORBZ)

        UPDATE_C( 8, 9,10,11)
        UPDATE_C(12,13,14,15)
        UPDATE_C(16,17,18,19)
        UPDATE_C(20,21,22,23)
        UPDATE_C(24,25,26,27)
        UPDATE_C(28,29,30,31)

    JMP(END)
    LABEL(COLSTORBZ)

        UPDATE_C_BZ( 8, 9,10,11)
        UPDATE_C_BZ(12,13,14,15)
        UPDATE_C_BZ(16,17,18,19)
        UPDATE_C_BZ(20,21,22,23)
        UPDATE_C_BZ(24,25,26,27)
        UPDATE_C_BZ(28,29,30,31)

    LABEL(END)

    VZEROUPPER()

    END_ASM(
        : // output operands
        : // input operands
          [k]     "m" (k),
          [a]     "m" (a),
          [b]     "m" (b),
          [alpha] "m" (alpha),
          [beta]  "m" (beta),
          [c]     "m" (c),
          [rs_c]  "m" (rs_c),
          [cs_c]  "m" (cs_c)
        : // register clobber list
          "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
          "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
          "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
          "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
          "zmm30", "zmm31", "memory"
    )

    GEMM_UKR_FLUSH_CT( s );
}
cython-blis-0.9.1/blis/_src/kernels/skx/bli_kernels_skx.h000066400000000000000000000035311427272030600234370ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

/* Declarations of the skx GEMM micro-kernels, one per (C type, type
   character, kernel name) triple. GEMM_UKR_PROT is defined elsewhere and
   presumably expands to the prototype of bli_<ch><name> -- TODO confirm. */
GEMM_UKR_PROT( float , s, gemm_skx_asm_32x12_l2 )
GEMM_UKR_PROT( float , s, gemm_skx_asm_12x32_l2 )

GEMM_UKR_PROT( double, d, gemm_skx_asm_16x12_l2 )
GEMM_UKR_PROT( double, d, gemm_skx_asm_16x14 )
cython-blis-0.9.1/blis/_src/kernels/zen/000077500000000000000000000000001427272030600200755ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/zen/1/000077500000000000000000000000001427272030600202355ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/zen/1/bli_amaxv_zen_int.c000066400000000000000000000414671427272030600241050ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2016 - 2018 - 2019, Advanced Micro Devices, Inc.
   Copyright (C) 2018, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "immintrin.h" #include "blis.h" /* Union data structure to access AVX registers One 256-bit AVX register holds 8 SP elements. */ typedef union { __m256 v; float f[8] __attribute__((aligned(64))); } v8sf_t; typedef union { __m128 v; float f[4]; } v4sf_t; /* Union data structure to access AVX registers One 256-bit AVX register holds 4 DP elements. */ typedef union { __m256d v; double d[4] __attribute__((aligned(64))); }v4df_t; typedef union { __m128d v; double d[2]; }v2dd_t; // return a mask which indicates either: // - v1 > v2 // - v1 is NaN and v2 is not // assumes that idx(v1) > idx(v2) // all "OQ" comparisons false if either operand NaN #define CMP256( dt, v1, v2 ) \ _mm256_or_p##dt( _mm256_cmp_p##dt( v1, v2, _CMP_GT_OQ ), /* v1 > v2 || */ \ _mm256_andnot_p##dt( _mm256_cmp_p##dt( v2, v2, _CMP_UNORD_Q ), /* ( !isnan(v2) && */ \ _mm256_cmp_p##dt( v1, v1, _CMP_UNORD_Q ) /* isnan(v1) ) */ \ ) \ ); // return a mask which indicates either: // - v1 > v2 // - v1 is NaN and v2 is not // - v1 == v2 (maybe == NaN) and i1 < i2 // all "OQ" comparisons false if either operand NaN #define CMP128( dt, v1, v2, i1, i2 ) \ _mm_or_p##dt( _mm_or_p##dt( _mm_cmp_p##dt( v1, v2, _CMP_GT_OQ ), /* ( v1 > v2 || */ \ _mm_andnot_p##dt( _mm_cmp_p##dt( v2, v2, _CMP_UNORD_Q ), /* ( !isnan(v2) && */ \ _mm_cmp_p##dt( v1, v1, _CMP_UNORD_Q ) /* isnan(v1) ) ) || */ \ ) \ ), \ _mm_and_p##dt( _mm_or_p##dt( _mm_cmp_p##dt( v1, v2, _CMP_EQ_OQ ), /* ( ( v1 == v2 || */ \ _mm_and_p##dt( 
_mm_cmp_p##dt( v1, v1, _CMP_UNORD_Q ), /* ( isnan(v1) && */ \ _mm_cmp_p##dt( v2, v2, _CMP_UNORD_Q ) /* isnan(v2) ) ) && */ \ ) \ ), \ _mm_cmp_p##dt( i1, i2, _CMP_LT_OQ ) /* i1 < i2 ) */ \ ) \ ); // ----------------------------------------------------------------------------- void bli_samaxv_zen_int ( dim_t n, float* restrict x, inc_t incx, dim_t* restrict i_max, cntx_t* restrict cntx ) { float* minus_one = PASTEMAC(s,m1); dim_t* zero_i = PASTEMAC(i,0); float chi1_r; //float chi1_i; float abs_chi1; float abs_chi1_max; dim_t i_max_l; dim_t i; /* If the vector length is zero, return early. This directly emulates the behavior of netlib BLAS's i?amax() routines. */ if ( bli_zero_dim1( n ) ) { PASTEMAC(i,copys)( *zero_i, *i_max ); return; } /* Initialize the index of the maximum absolute value to zero. */ PASTEMAC(i,copys)( *zero_i, i_max_l ); /* Initialize the maximum absolute value search candidate with -1, which is guaranteed to be less than all values we will compute. */ PASTEMAC(s,copys)( *minus_one, abs_chi1_max ); // For non-unit strides, or very small vector lengths, compute with // scalar code. if ( incx != 1 || n < 8 ) { for ( i = 0; i < n; ++i ) { float* chi1 = x + (i )*incx; /* Get the real and imaginary components of chi1. */ chi1_r = *chi1; /* Replace chi1_r and chi1_i with their absolute values. */ chi1_r = fabsf( chi1_r ); /* Add the real and imaginary absolute values together. */ abs_chi1 = chi1_r; /* If the absolute value of the current element exceeds that of the previous largest, save it and its index. If NaN is encountered, then treat it the same as if it were a valid value that was smaller than any previously seen. This behavior mimics that of LAPACK's i?amax(). 
*/ if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) ) { abs_chi1_max = abs_chi1; i_max_l = i; } } } else { dim_t n_iter, n_left; dim_t num_vec_elements = 8; v8sf_t x_vec, max_vec, maxInx_vec, mask_vec; v8sf_t idx_vec, inc_vec; v8sf_t sign_mask; v4sf_t max_vec_lo, max_vec_hi, mask_vec_lo; v4sf_t maxInx_vec_lo, maxInx_vec_hi; n_iter = n / num_vec_elements; n_left = n % num_vec_elements; idx_vec.v = _mm256_set_ps( 7, 6, 5, 4, 3, 2, 1, 0 ); inc_vec.v = _mm256_set1_ps( 8 ); max_vec.v = _mm256_set1_ps( -1 ); maxInx_vec.v = _mm256_setzero_ps(); sign_mask.v = _mm256_set1_ps( -0.f ); for ( i = 0; i < n_iter; ++i ) { x_vec.v = _mm256_loadu_ps( x ); // Get the absolute value of the vector element. x_vec.v = _mm256_andnot_ps( sign_mask.v, x_vec.v ); mask_vec.v = CMP256( s, x_vec.v, max_vec.v ); max_vec.v = _mm256_blendv_ps( max_vec.v, x_vec.v, mask_vec.v ); maxInx_vec.v = _mm256_blendv_ps( maxInx_vec.v, idx_vec.v, mask_vec.v ); idx_vec.v += inc_vec.v; x += num_vec_elements; } max_vec_lo.v = _mm256_extractf128_ps( max_vec.v, 0 ); max_vec_hi.v = _mm256_extractf128_ps( max_vec.v, 1 ); maxInx_vec_lo.v = _mm256_extractf128_ps( maxInx_vec.v, 0 ); maxInx_vec_hi.v = _mm256_extractf128_ps( maxInx_vec.v, 1 ); mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v ); max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 14 ); maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 14 ); mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v ); max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 1 ); maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 1 ); mask_vec_lo.v = CMP128( s, max_vec_hi.v, 
max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); maxInx_vec_lo.v = _mm_blendv_ps( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v ); abs_chi1_max = max_vec_lo.f[0]; i_max_l = maxInx_vec_lo.f[0]; for ( i = n - n_left; i < n; i++ ) { float* chi1 = x; /* Get the real and imaginary components of chi1. */ chi1_r = *chi1; /* Replace chi1_r and chi1_i with their absolute values. */ abs_chi1 = fabsf( chi1_r ); /* If the absolute value of the current element exceeds that of the previous largest, save it and its index. If NaN is encountered, then treat it the same as if it were a valid value that was smaller than any previously seen. This behavior mimics that of LAPACK's i?amax(). */ if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) ) { abs_chi1_max = abs_chi1; i_max_l = i; } x += 1; } } // Issue vzeroupper instruction to clear upper lanes of ymm registers. // This avoids a performance penalty caused by false dependencies when // transitioning from from AVX to SSE instructions (which may occur // later, especially if BLIS is compiled with -mfpmath=sse). _mm256_zeroupper(); /* Store final index to output variable. */ *i_max = i_max_l; } // ----------------------------------------------------------------------------- void bli_damaxv_zen_int ( dim_t n, double* restrict x, inc_t incx, dim_t* restrict i_max, cntx_t* restrict cntx ) { double* minus_one = PASTEMAC(d,m1); dim_t* zero_i = PASTEMAC(i,0); double chi1_r; //double chi1_i; double abs_chi1; double abs_chi1_max; dim_t i_max_l; dim_t i; /* If the vector length is zero, return early. This directly emulates the behavior of netlib BLAS's i?amax() routines. */ if ( bli_zero_dim1( n ) ) { PASTEMAC(i,copys)( *zero_i, *i_max ); return; } /* Initialize the index of the maximum absolute value to zero. 
*/ \ PASTEMAC(i,copys)( *zero_i, i_max_l ); /* Initialize the maximum absolute value search candidate with -1, which is guaranteed to be less than all values we will compute. */ PASTEMAC(d,copys)( *minus_one, abs_chi1_max ); // For non-unit strides, or very small vector lengths, compute with // scalar code. if ( incx != 1 || n < 4 ) { for ( i = 0; i < n; ++i ) { double* chi1 = x + (i )*incx; /* Get the real and imaginary components of chi1. */ chi1_r = *chi1; /* Replace chi1_r and chi1_i with their absolute values. */ chi1_r = fabs( chi1_r ); /* Add the real and imaginary absolute values together. */ abs_chi1 = chi1_r; /* If the absolute value of the current element exceeds that of the previous largest, save it and its index. If NaN is encountered, then treat it the same as if it were a valid value that was smaller than any previously seen. This behavior mimics that of LAPACK's i?amax(). */ if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) ) { abs_chi1_max = abs_chi1; i_max_l = i; } } } else { dim_t n_iter, n_left; dim_t num_vec_elements = 4; v4df_t x_vec, max_vec, maxInx_vec, mask_vec; v4df_t idx_vec, inc_vec; v4df_t sign_mask; v2dd_t max_vec_lo, max_vec_hi, mask_vec_lo; v2dd_t maxInx_vec_lo, maxInx_vec_hi; n_iter = n / num_vec_elements; n_left = n % num_vec_elements; idx_vec.v = _mm256_set_pd( 3, 2, 1, 0 ); inc_vec.v = _mm256_set1_pd( 4 ); max_vec.v = _mm256_set1_pd( -1 ); maxInx_vec.v = _mm256_setzero_pd(); sign_mask.v = _mm256_set1_pd( -0.f ); for ( i = 0; i < n_iter; ++i ) { x_vec.v = _mm256_loadu_pd( x ); // Get the absolute value of the vector element. 
x_vec.v = _mm256_andnot_pd( sign_mask.v, x_vec.v ); mask_vec.v = CMP256( d, x_vec.v, max_vec.v ); max_vec.v = _mm256_blendv_pd( max_vec.v, x_vec.v, mask_vec.v ); maxInx_vec.v = _mm256_blendv_pd( maxInx_vec.v, idx_vec.v, mask_vec.v ); idx_vec.v += inc_vec.v; x += num_vec_elements; } max_vec_lo.v = _mm256_extractf128_pd( max_vec.v, 0 ); max_vec_hi.v = _mm256_extractf128_pd( max_vec.v, 1 ); maxInx_vec_lo.v = _mm256_extractf128_pd( maxInx_vec.v, 0 ); maxInx_vec_hi.v = _mm256_extractf128_pd( maxInx_vec.v, 1 ); mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v ); max_vec_hi.v = _mm_permute_pd( max_vec_lo.v, 1 ); maxInx_vec_hi.v = _mm_permute_pd( maxInx_vec_lo.v, 1 ); mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v ); abs_chi1_max = max_vec_lo.d[0]; i_max_l = maxInx_vec_lo.d[0]; for ( i = n - n_left; i < n; i++ ) { double* chi1 = x; /* Get the real and imaginary components of chi1. */ chi1_r = *chi1; /* Replace chi1_r and chi1_i with their absolute values. */ abs_chi1 = fabs( chi1_r ); /* If the absolute value of the current element exceeds that of the previous largest, save it and its index. If NaN is encountered, return the index of the first NaN. This behavior mimics that of LAPACK's i?amax(). */ if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) ) { abs_chi1_max = abs_chi1; i_max_l = i; } x += 1; } } // Issue vzeroupper instruction to clear upper lanes of ymm registers. 
// This avoids a performance penalty caused by false dependencies when // transitioning from from AVX to SSE instructions (which may occur // later, especially if BLIS is compiled with -mfpmath=sse). _mm256_zeroupper(); /* Store final index to output variable. */ *i_max = i_max_l; } // ----------------------------------------------------------------------------- #if 0 #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ dim_t n, \ ctype* x, inc_t incx, \ dim_t* i_max, \ cntx_t* cntx \ ) \ { \ ctype_r* minus_one = PASTEMAC(chr,m1); \ dim_t* zero_i = PASTEMAC(i,0); \ \ ctype_r chi1_r; \ ctype_r chi1_i; \ ctype_r abs_chi1; \ ctype_r abs_chi1_max; \ dim_t i; \ \ /* Initialize the index of the maximum absolute value to zero. */ \ PASTEMAC(i,copys)( zero_i, *i_max ); \ \ /* If the vector length is zero, return early. This directly emulates the behavior of netlib BLAS's i?amax() routines. */ \ if ( bli_zero_dim1( n ) ) return; \ \ /* Initialize the maximum absolute value search candidate with -1, which is guaranteed to be less than all values we will compute. */ \ PASTEMAC(chr,copys)( *minus_one, abs_chi1_max ); \ \ if ( incx == 1 ) \ { \ for ( i = 0; i < n; ++i ) \ { \ /* Get the real and imaginary components of chi1. */ \ PASTEMAC2(ch,chr,gets)( x[i], chi1_r, chi1_i ); \ \ /* Replace chi1_r and chi1_i with their absolute values. */ \ PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ \ /* Add the real and imaginary absolute values together. */ \ PASTEMAC(chr,set0s)( abs_chi1 ); \ PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ \ /* If the absolute value of the current element exceeds that of the previous largest, save it and its index. If NaN is encountered, then treat it the same as if it were a valid value that was smaller than any previously seen. This behavior mimics that of LAPACK's ?lange(). 
*/ \ if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \ { \ abs_chi1_max = abs_chi1; \ *i_max = i; \ } \ } \ } \ else \ { \ for ( i = 0; i < n; ++i ) \ { \ ctype* chi1 = x + (i )*incx; \ \ /* Get the real and imaginary components of chi1. */ \ PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \ \ /* Replace chi1_r and chi1_i with their absolute values. */ \ PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ \ /* Add the real and imaginary absolute values together. */ \ PASTEMAC(chr,set0s)( abs_chi1 ); \ PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ \ /* If the absolute value of the current element exceeds that of the previous largest, save it and its index. If NaN is encountered, then treat it the same as if it were a valid value that was smaller than any previously seen. This behavior mimics that of LAPACK's ?lange(). */ \ if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \ { \ abs_chi1_max = abs_chi1; \ *i_max = i; \ } \ } \ } \ } GENTFUNCR( scomplex, float, c, s, amaxv_zen_int ) GENTFUNCR( dcomplex, double, z, d, amaxv_zen_int ) #endif cython-blis-0.9.1/blis/_src/kernels/zen/1/bli_axpyv_zen_int.c000066400000000000000000000201631427272030600241260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc. Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "immintrin.h" #include "blis.h" /* Union data structure to access AVX registers One 256-bit AVX register holds 8 SP elements. */ typedef union { __m256 v; float f[8] __attribute__((aligned(64))); } v8sf_t; /* Union data structure to access AVX registers * One 256-bit AVX register holds 4 DP elements. */ typedef union { __m256d v; double d[4] __attribute__((aligned(64))); } v4df_t; // ----------------------------------------------------------------------------- void bli_saxpyv_zen_int ( conj_t conjx, dim_t n, float* restrict alpha, float* restrict x, inc_t incx, float* restrict y, inc_t incy, cntx_t* restrict cntx ) { const dim_t n_elem_per_reg = 8; const dim_t n_iter_unroll = 4; dim_t i; dim_t n_viter; dim_t n_left; float* restrict x0; float* restrict y0; v8sf_t alphav; v8sf_t x0v, x1v, x2v, x3v; v8sf_t y0v, y1v, y2v, y3v; // If the vector dimension is zero, or if alpha is zero, return early. 
if ( bli_zero_dim1( n ) || PASTEMAC(s,eq0)( *alpha ) ) return; // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll ); n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll ); // If there is anything that would interfere with our use of contiguous // vector loads/stores, override n_viter and n_left to use scalar code // for all iterations. if ( incx != 1 || incy != 1 ) { n_viter = 0; n_left = n; } // Initialize local pointers. x0 = x; y0 = y; // Broadcast the alpha scalar to all elements of a vector register. alphav.v = _mm256_broadcast_ss( alpha ); // If there are vectorized iterations, perform them with vector // instructions. for ( i = 0; i < n_viter; ++i ) { // Load the input values. y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); y2v.v = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); y3v.v = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_ps( alphav.v, x0v.v, y0v.v ); y1v.v = _mm256_fmadd_ps( alphav.v, x1v.v, y1v.v ); y2v.v = _mm256_fmadd_ps( alphav.v, x2v.v, y2v.v ); y3v.v = _mm256_fmadd_ps( alphav.v, x3v.v, y3v.v ); // Store the output. _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v ); _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), y1v.v ); _mm256_storeu_ps( (y0 + 2*n_elem_per_reg), y2v.v ); _mm256_storeu_ps( (y0 + 3*n_elem_per_reg), y3v.v ); x0 += n_elem_per_reg * n_iter_unroll; y0 += n_elem_per_reg * n_iter_unroll; } // Issue vzeroupper instruction to clear upper lanes of ymm registers. 
// This avoids a performance penalty caused by false dependencies when // transitioning from from AVX to SSE instructions (which may occur // as soon as the n_left cleanup loop below if BLIS is compiled with // -mfpmath=sse). _mm256_zeroupper(); const float alphac = *alpha; // If there are leftover iterations, perform them with scalar code. for ( i = 0; i < n_left; ++i ) { const float x0c = *x0; *y0 += alphac * x0c; x0 += incx; y0 += incy; } } // ----------------------------------------------------------------------------- void bli_daxpyv_zen_int ( conj_t conjx, dim_t n, double* restrict alpha, double* restrict x, inc_t incx, double* restrict y, inc_t incy, cntx_t* restrict cntx ) { const dim_t n_elem_per_reg = 4; const dim_t n_iter_unroll = 4; dim_t i; dim_t n_viter; dim_t n_left; double* restrict x0; double* restrict y0; v4df_t alphav; v4df_t x0v, x1v, x2v, x3v; v4df_t y0v, y1v, y2v, y3v; // If the vector dimension is zero, or if alpha is zero, return early. if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) ) return; // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll ); n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll ); // If there is anything that would interfere with our use of contiguous // vector loads/stores, override n_viter and n_left to use scalar code // for all iterations. if ( incx != 1 || incy != 1 ) { n_viter = 0; n_left = n; } // Initialize local pointers. x0 = x; y0 = y; // Broadcast the alpha scalar to all elements of a vector register. alphav.v = _mm256_broadcast_sd( alpha ); // If there are vectorized iterations, perform them with vector // instructions. for ( i = 0; i < n_viter; ++i ) { // Load the input values. 
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_pd( alphav.v, x0v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( alphav.v, x1v.v, y1v.v ); y2v.v = _mm256_fmadd_pd( alphav.v, x2v.v, y2v.v ); y3v.v = _mm256_fmadd_pd( alphav.v, x3v.v, y3v.v ); // Store the output. _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v ); _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v ); _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), y2v.v ); _mm256_storeu_pd( (y0 + 3*n_elem_per_reg), y3v.v ); x0 += n_elem_per_reg * n_iter_unroll; y0 += n_elem_per_reg * n_iter_unroll; } // Issue vzeroupper instruction to clear upper lanes of ymm registers. // This avoids a performance penalty caused by false dependencies when // transitioning from from AVX to SSE instructions (which may occur // as soon as the n_left cleanup loop below if BLIS is compiled with // -mfpmath=sse). _mm256_zeroupper(); const double alphac = *alpha; // If there are leftover iterations, perform them with scalar code. for ( i = 0; i < n_left; ++i ) { const double x0c = *x0; *y0 += alphac * x0c; x0 += incx; y0 += incy; } } cython-blis-0.9.1/blis/_src/kernels/zen/1/bli_axpyv_zen_int10.c000066400000000000000000000363431427272030600242760ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc. 
Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "immintrin.h"
#include "blis.h"

/* Overlay giving element-wise access to one 256-bit AVX register that
   holds eight single-precision values. */
typedef union
{
	__m256 v;
	float  f[8] __attribute__((aligned(64)));
} v8sf_t;

/* Overlay giving element-wise access to one 256-bit AVX register that
   holds four double-precision values. */
typedef union
{
	__m256d v;
	double  d[4] __attribute__((aligned(64)));
} v4df_t;

// -----------------------------------------------------------------------------

/*
   AXPYV_ZEN_INT10_TIER( sfx, nreg )

   Emits one tier of the unit-stride axpyv loop. While at least nreg full
   registers' worth of elements remain, it loads nreg registers of x and
   nreg registers of y, computes x * alpha + y with fused multiply-add,
   stores the results to y, and advances both pointers.

   sfx is the intrinsic suffix (ps or pd). The macro expands inside the
   two kernels below and relies on their locals: i, r, n, lanes, xp, yp,
   alpha_v, xr, yr and zr. The four phases run in the order load x,
   load y, FMA, store, so every load of a chunk precedes its stores. The
   inner loops have compile-time trip counts, which lets an optimizing
   compiler unroll them completely.
   NOTE(review): confirm the kernel build flags fully unroll these loops.
*/
#define AXPYV_ZEN_INT10_TIER( sfx, nreg ) \
	for ( ; ( i + ( (nreg)*lanes - 1 ) ) < n; i += (nreg)*lanes ) \
	{ \
		for ( r = 0; r < (nreg); ++r ) \
			xr[r] = _mm256_loadu_##sfx( xp + r*lanes ); \
		for ( r = 0; r < (nreg); ++r ) \
			yr[r] = _mm256_loadu_##sfx( yp + r*lanes ); \
		for ( r = 0; r < (nreg); ++r ) \
			zr[r] = _mm256_fmadd_##sfx( xr[r], alpha_v, yr[r] ); \
		for ( r = 0; r < (nreg); ++r ) \
			_mm256_storeu_##sfx( ( yp + r*lanes ), zr[r] ); \
		xp += (nreg)*lanes; \
		yp += (nreg)*lanes; \
	}

/*
   bli_saxpyv_zen_int10

   Single-precision axpyv: y := y + alpha * x over n elements, using up
   to ten AVX registers per loop iteration.

     conjx       accepted for the kernel signature; never referenced here.
     n           number of elements to process.
     alpha       pointer to the scalar multiplier.
     x, incx     input vector and its element stride.
     y, incy     vector updated in place and its element stride.
     cntx        accepted for the kernel signature; never referenced here.

   Returns without touching y when n is zero or alpha is zero. With unit
   strides the vector is consumed in tiers of 80, 40, 32, 16 and 8
   elements, followed by a scalar loop for what is left. If either stride
   is not 1, the whole vector is processed by a scalar strided loop.
*/
void bli_saxpyv_zen_int10
     (
       conj_t           conjx,
       dim_t            n,
       float*  restrict alpha,
       float*  restrict x, inc_t incx,
       float*  restrict y, inc_t incy,
       cntx_t* restrict cntx
     )
{
	// Number of floats held by one 256-bit register.
	const dim_t lanes = 8;

	float* restrict xp = x;
	float* restrict yp = y;

	__m256 alpha_v;
	__m256 xr[10];
	__m256 yr[10];
	__m256 zr[10];

	dim_t  i;
	dim_t  r;

	// An empty vector or a zero alpha leaves y unchanged.
	if ( bli_zero_dim1( n ) || PASTEMAC(s,eq0)( *alpha ) ) return;

	// Strided operands: plain scalar loop, no vector instructions.
	if ( incx != 1 || incy != 1 )
	{
		const float alpha_s = *alpha;

		for ( i = 0; i < n; ++i )
		{
			*yp += alpha_s * ( *xp );

			xp += incx;
			yp += incy;
		}

		return;
	}

	// Replicate alpha into every lane of a vector register.
	alpha_v = _mm256_broadcast_ss( alpha );

	// Widest tier first: 80, 40, 32, 16, then 8 elements per iteration.
	i = 0;
	AXPYV_ZEN_INT10_TIER( ps, 10 )
	AXPYV_ZEN_INT10_TIER( ps, 5 )
	AXPYV_ZEN_INT10_TIER( ps, 4 )
	AXPYV_ZEN_INT10_TIER( ps, 2 )
	AXPYV_ZEN_INT10_TIER( ps, 1 )

	// Issue vzeroupper to clear the upper lanes of the ymm registers. This
	// avoids a performance penalty caused by false dependencies when
	// transitioning from AVX to SSE instructions (which may occur as soon
	// as the cleanup loop below if BLIS is compiled with -mfpmath=sse).
	_mm256_zeroupper();

	// Fewer than eight elements remain; finish them one at a time.
	for ( ; i < n; ++i )
	{
		*yp += ( *alpha ) * ( *xp );

		++xp;
		++yp;
	}
}

// -----------------------------------------------------------------------------

/*
   bli_daxpyv_zen_int10

   Double-precision axpyv: y := y + alpha * x over n elements, using up
   to ten AVX registers per loop iteration.

     conjx       accepted for the kernel signature; never referenced here.
     n           number of elements to process.
     alpha       pointer to the scalar multiplier.
     x, incx     input vector and its element stride.
     y, incy     vector updated in place and its element stride.
     cntx        accepted for the kernel signature; never referenced here.

   Returns without touching y when n is zero or alpha is zero. With unit
   strides the vector is consumed in tiers of 40, 20, 16, 8 and 4
   elements, followed by a scalar loop for what is left. If either stride
   is not 1, the whole vector is processed by a scalar strided loop.
*/
void bli_daxpyv_zen_int10
     (
       conj_t           conjx,
       dim_t            n,
       double* restrict alpha,
       double* restrict x, inc_t incx,
       double* restrict y, inc_t incy,
       cntx_t* restrict cntx
     )
{
	// Number of doubles held by one 256-bit register.
	const dim_t lanes = 4;

	double* restrict xp = x;
	double* restrict yp = y;

	__m256d alpha_v;
	__m256d xr[10];
	__m256d yr[10];
	__m256d zr[10];

	dim_t   i;
	dim_t   r;

	// An empty vector or a zero alpha leaves y unchanged.
	if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) ) return;

	// Strided operands: plain scalar loop, no vector instructions.
	if ( incx != 1 || incy != 1 )
	{
		const double alpha_s = *alpha;

		for ( i = 0; i < n; ++i )
		{
			*yp += alpha_s * ( *xp );

			xp += incx;
			yp += incy;
		}

		return;
	}

	// Replicate alpha into every lane of a vector register.
	alpha_v = _mm256_broadcast_sd( alpha );

	// Widest tier first: 40, 20, 16, 8, then 4 elements per iteration.
	i = 0;
	AXPYV_ZEN_INT10_TIER( pd, 10 )
	AXPYV_ZEN_INT10_TIER( pd, 5 )
	AXPYV_ZEN_INT10_TIER( pd, 4 )
	AXPYV_ZEN_INT10_TIER( pd, 2 )
	AXPYV_ZEN_INT10_TIER( pd, 1 )

	// Issue vzeroupper to clear the upper lanes of the ymm registers. This
	// avoids a performance penalty caused by false dependencies when
	// transitioning from AVX to SSE instructions (which may occur as soon
	// as the cleanup loop below if BLIS is compiled with -mfpmath=sse).
	_mm256_zeroupper();

	// Fewer than four elements remain; finish them one at a time.
	for ( ; i < n; ++i )
	{
		*yp += ( *alpha ) * ( *xp );

		++yp;
		++xp;
	}
}

#undef AXPYV_ZEN_INT10_TIER

cython-blis-0.9.1/blis/_src/kernels/zen/1/bli_copyv_zen_int.c000066400000000000000000000266231427272030600241260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "immintrin.h" #include "blis.h" // ----------------------------------------------------------------------------- void bli_scopyv_zen_int ( conj_t conjx, dim_t n, float* restrict x, inc_t incx, float* restrict y, inc_t incy, cntx_t* restrict cntx ) { const dim_t num_elem_per_reg = 8; dim_t i = 0; __m256 xv[16]; // If the vector dimension is zero return early. 
if ( bli_zero_dim1( n ) ) return; if ( incx == 1 && incy == 1 ) { #if 0 PRAGMA_SIMD for (i = 0; i < n; i++) { y[i] = x[i]; } #endif #if 0 memcpy(y, x, n << 2); #endif #if 1 // For loop with n & ~0x7F => n & 0xFFFFFF80 masks the lower bits and results in multiples of 128 // for example if n = 255 // n & ~0x7F results in 128: copy from 0 to 128 happens in first loop // n & ~0x3F results in 192: copy from 128 to 192 happens in second loop // n & ~0x1F results in 224: copy from 128 to 192 happens in third loop and so on. for ( i = 0; i < (n & (~0x7F)); i += 128 ) { xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2); xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3); xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4); xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5); xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6); xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7); xv[8] = _mm256_loadu_ps(x + num_elem_per_reg * 8); xv[9] = _mm256_loadu_ps(x + num_elem_per_reg * 9); xv[10] = _mm256_loadu_ps(x + num_elem_per_reg * 10); xv[11] = _mm256_loadu_ps(x + num_elem_per_reg * 11); xv[12] = _mm256_loadu_ps(x + num_elem_per_reg * 12); xv[13] = _mm256_loadu_ps(x + num_elem_per_reg * 13); xv[14] = _mm256_loadu_ps(x + num_elem_per_reg * 14); xv[15] = _mm256_loadu_ps(x + num_elem_per_reg * 15); _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); _mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]); _mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]); _mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]); _mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]); _mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]); _mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]); _mm256_storeu_ps(y + num_elem_per_reg * 8, xv[8]); _mm256_storeu_ps(y + num_elem_per_reg * 9, xv[9]); _mm256_storeu_ps(y + num_elem_per_reg * 10, xv[10]); _mm256_storeu_ps(y + num_elem_per_reg * 
11, xv[11]); _mm256_storeu_ps(y + num_elem_per_reg * 12, xv[12]); _mm256_storeu_ps(y + num_elem_per_reg * 13, xv[13]); _mm256_storeu_ps(y + num_elem_per_reg * 14, xv[14]); _mm256_storeu_ps(y + num_elem_per_reg * 15, xv[15]); y += 128; x += 128; } for ( ; i < (n & (~0x3F)); i += 64 ) { xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2); xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3); xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4); xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5); xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6); xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7); _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); _mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]); _mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]); _mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]); _mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]); _mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]); _mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]); y += 64; x += 64; } for ( ; i < (n & (~0x1F)); i += 32 ) { xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2); xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3); _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); _mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]); _mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]); y += 32; x += 32; } for ( ; i < (n & (~0x0F)); i += 16 ) { xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1); _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); _mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]); y += 16; x += 16; } for ( ; i < (n & (~0x07)); i += 8 ) { xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0); _mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]); y += 8; 
x += 8; } for ( ; i < n; ++i ) { *y++ = *x++; } #endif } else { for ( dim_t i = 0; i < n; ++i ) { *y = *x; x += incx; y += incy; } } } // ----------------------------------------------------------------------------- void bli_dcopyv_zen_int ( conj_t conjx, dim_t n, double* restrict x, inc_t incx, double* restrict y, inc_t incy, cntx_t* restrict cntx ) { const dim_t num_elem_per_reg = 4; dim_t i = 0; __m256d xv[16]; // If the vector dimension is zero return early. if ( bli_zero_dim1( n ) ) return; if ( incx == 1 && incy == 1 ) { #if 0 PRAGMA_SIMD for (i = 0; i < n; ++i) { y[i] = x[i]; } #endif #if 0 memcpy(y, x, n << 3); #endif #if 1 // n & (~0x3F) = n & 0xFFFFFFC0 -> this masks the numbers less than 64, // the copy operation will be done for the multiples of 64 for ( i = 0; i < (n & (~0x3F)); i += 64 ) { xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1); xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2); xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3); xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4); xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5); xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6); xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7); xv[8] = _mm256_loadu_pd(x + num_elem_per_reg * 8); xv[9] = _mm256_loadu_pd(x + num_elem_per_reg * 9); xv[10] = _mm256_loadu_pd(x + num_elem_per_reg * 10); xv[11] = _mm256_loadu_pd(x + num_elem_per_reg * 11); xv[12] = _mm256_loadu_pd(x + num_elem_per_reg * 12); xv[13] = _mm256_loadu_pd(x + num_elem_per_reg * 13); xv[14] = _mm256_loadu_pd(x + num_elem_per_reg * 14); xv[15] = _mm256_loadu_pd(x + num_elem_per_reg * 15); _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); _mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]); _mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]); _mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]); _mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]); _mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]); 
_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]); _mm256_storeu_pd(y + num_elem_per_reg * 8, xv[8]); _mm256_storeu_pd(y + num_elem_per_reg * 9, xv[9]); _mm256_storeu_pd(y + num_elem_per_reg * 10, xv[10]); _mm256_storeu_pd(y + num_elem_per_reg * 11, xv[11]); _mm256_storeu_pd(y + num_elem_per_reg * 12, xv[12]); _mm256_storeu_pd(y + num_elem_per_reg * 13, xv[13]); _mm256_storeu_pd(y + num_elem_per_reg * 14, xv[14]); _mm256_storeu_pd(y + num_elem_per_reg * 15, xv[15]); y += num_elem_per_reg * 16; x += num_elem_per_reg * 16; } for ( ; i < (n & (~0x1F)); i += 32 ) { xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1); xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2); xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3); xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4); xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5); xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6); xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7); _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); _mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]); _mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]); _mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]); _mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]); _mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]); _mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]); y += num_elem_per_reg * 8; x += num_elem_per_reg * 8; } for ( ; i < (n & (~0xF)); i += 16 ) { xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1); xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2); xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3); _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); _mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]); _mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]); y += num_elem_per_reg * 4; x += num_elem_per_reg * 4; } for ( ; i < (n & (~0x07)); i += 8 ) { xv[0] = 
_mm256_loadu_pd(x + num_elem_per_reg * 0); xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1); _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); _mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]); y += num_elem_per_reg * 2; x += num_elem_per_reg * 2; } for ( ; i < (n & (~0x03)); i += 4 ) { xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0); _mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]); y += num_elem_per_reg; x += num_elem_per_reg; } for ( ; i < n; ++i ) { *y++ = *x++; } #endif } else { for ( i = 0; i < n; ++i ) { *y = *x; x += incx; y += incy; } } } cython-blis-0.9.1/blis/_src/kernels/zen/1/bli_dotv_zen_int.c000066400000000000000000000216431427272030600237370ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc. Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "immintrin.h" #include "blis.h" /* Union data structure to access AVX registers One 256-bit AVX register holds 8 SP elements. */ typedef union { __m256 v; float f[8] __attribute__((aligned(64))); } v8sf_t; /* Union data structure to access AVX registers * One 256-bit AVX register holds 4 DP elements. */ typedef union { __m256d v; double d[4] __attribute__((aligned(64))); } v4df_t; // ----------------------------------------------------------------------------- void bli_sdotv_zen_int ( conj_t conjx, conj_t conjy, dim_t n, float* restrict x, inc_t incx, float* restrict y, inc_t incy, float* restrict rho, cntx_t* restrict cntx ) { const dim_t n_elem_per_reg = 8; const dim_t n_iter_unroll = 4; dim_t i; dim_t n_viter; dim_t n_left; float* restrict x0; float* restrict y0; float rho0; v8sf_t rho0v, rho1v, rho2v, rho3v; v8sf_t x0v, y0v; v8sf_t x1v, y1v; v8sf_t x2v, y2v; v8sf_t x3v, y3v; // If the vector dimension is zero, set rho to zero and return early. if ( bli_zero_dim1( n ) ) { PASTEMAC(s,set0s)( *rho ); return; } // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll ); n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll ); // If there is anything that would interfere with our use of contiguous // vector loads/stores, override n_viter and n_left to use scalar code // for all iterations. 
if ( incx != 1 || incy != 1 ) { n_viter = 0; n_left = n; } // Initialize local pointers. x0 = x; y0 = y; // Initialize the local scalar rho1 to zero. PASTEMAC(s,set0s)( rho0 ); // Initialize the unrolled iterations' rho vectors to zero. rho0v.v = _mm256_setzero_ps(); rho1v.v = _mm256_setzero_ps(); rho2v.v = _mm256_setzero_ps(); rho3v.v = _mm256_setzero_ps(); for ( i = 0; i < n_viter; ++i ) { // Load the x and y input vector elements. x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); y2v.v = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); y3v.v = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); // Compute the element-wise product of the x and y vectors, // storing in the corresponding rho vectors. rho0v.v = _mm256_fmadd_ps( x0v.v, y0v.v, rho0v.v ); rho1v.v = _mm256_fmadd_ps( x1v.v, y1v.v, rho1v.v ); rho2v.v = _mm256_fmadd_ps( x2v.v, y2v.v, rho2v.v ); rho3v.v = _mm256_fmadd_ps( x3v.v, y3v.v, rho3v.v ); x0 += ( n_elem_per_reg * n_iter_unroll ); y0 += ( n_elem_per_reg * n_iter_unroll ); } // Accumulate the unrolled rho vectors into a single vector. rho0v.v += rho1v.v; rho0v.v += rho2v.v; rho0v.v += rho3v.v; // Accumulate the final rho vector into a single scalar result. rho0 += rho0v.f[0] + rho0v.f[1] + rho0v.f[2] + rho0v.f[3] + rho0v.f[4] + rho0v.f[5] + rho0v.f[6] + rho0v.f[7]; // Issue vzeroupper instruction to clear upper lanes of ymm registers. // This avoids a performance penalty caused by false dependencies when // transitioning from from AVX to SSE instructions (which may occur // as soon as the n_left cleanup loop below if BLIS is compiled with // -mfpmath=sse). _mm256_zeroupper(); // If there are leftover iterations, perform them with scalar code. 
for ( i = 0; i < n_left; ++i ) { const float x0c = *x0; const float y0c = *y0; rho0 += x0c * y0c; x0 += incx; y0 += incy; } // Copy the final result into the output variable. PASTEMAC(s,copys)( rho0, *rho ); } // ----------------------------------------------------------------------------- void bli_ddotv_zen_int ( conj_t conjx, conj_t conjy, dim_t n, double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict rho, cntx_t* restrict cntx ) { const dim_t n_elem_per_reg = 4; const dim_t n_iter_unroll = 4; dim_t i; dim_t n_viter; dim_t n_left; double* restrict x0; double* restrict y0; double rho0; v4df_t rho0v, rho1v, rho2v, rho3v; v4df_t x0v, y0v; v4df_t x1v, y1v; v4df_t x2v, y2v; v4df_t x3v, y3v; // If the vector dimension is zero, set rho to zero and return early. if ( bli_zero_dim1( n ) ) { PASTEMAC(d,set0s)( *rho ); return; } // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll ); n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll ); // If there is anything that would interfere with our use of contiguous // vector loads/stores, override n_viter and n_left to use scalar code // for all iterations. if ( incx != 1 || incy != 1 ) { n_viter = 0; n_left = n; } // Initialize local pointers. x0 = x; y0 = y; // Initialize the local scalar rho1 to zero. PASTEMAC(d,set0s)( rho0 ); // Initialize the unrolled iterations' rho vectors to zero. rho0v.v = _mm256_setzero_pd(); rho1v.v = _mm256_setzero_pd(); rho2v.v = _mm256_setzero_pd(); rho3v.v = _mm256_setzero_pd(); for ( i = 0; i < n_viter; ++i ) { // Load the x and y input vector elements. 
x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); // Compute the element-wise product of the x and y vectors, // storing in the corresponding rho vectors. rho0v.v = _mm256_fmadd_pd( x0v.v, y0v.v, rho0v.v ); rho1v.v = _mm256_fmadd_pd( x1v.v, y1v.v, rho1v.v ); rho2v.v = _mm256_fmadd_pd( x2v.v, y2v.v, rho2v.v ); rho3v.v = _mm256_fmadd_pd( x3v.v, y3v.v, rho3v.v ); x0 += ( n_elem_per_reg * n_iter_unroll ); y0 += ( n_elem_per_reg * n_iter_unroll ); } // Accumulate the unrolled rho vectors into a single vector. rho0v.v += rho1v.v; rho0v.v += rho2v.v; rho0v.v += rho3v.v; // Accumulate the final rho vector into a single scalar result. rho0 += rho0v.d[0] + rho0v.d[1] + rho0v.d[2] + rho0v.d[3]; // Issue vzeroupper instruction to clear upper lanes of ymm registers. // This avoids a performance penalty caused by false dependencies when // transitioning from from AVX to SSE instructions (which may occur // as soon as the n_left cleanup loop below if BLIS is compiled with // -mfpmath=sse). _mm256_zeroupper(); // If there are leftover iterations, perform them with scalar code. for ( i = 0; i < n_left; ++i ) { const double x0c = *x0; const double y0c = *y0; rho0 += x0c * y0c; x0 += incx; y0 += incy; } // Copy the final result into the output variable. PASTEMAC(d,copys)( rho0, *rho ); } cython-blis-0.9.1/blis/_src/kernels/zen/1/bli_dotv_zen_int10.c000066400000000000000000000335601427272030600241010ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2016 - 2020, Advanced Micro Devices, Inc. 
Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "immintrin.h" #include "blis.h" /* Union data structure to access AVX registers One 256-bit AVX register holds 8 SP elements. */ typedef union { __m256 v; float f[8] __attribute__((aligned(64))); } v8sf_t; /* Union data structure to access AVX registers * One 256-bit AVX register holds 4 DP elements. 
*/ typedef union { __m256d v; double d[4] __attribute__((aligned(64))); } v4df_t; // ----------------------------------------------------------------------------- void bli_sdotv_zen_int10 ( conj_t conjx, conj_t conjy, dim_t n, float* restrict x, inc_t incx, float* restrict y, inc_t incy, float* restrict rho, cntx_t* restrict cntx ) { const dim_t n_elem_per_reg = 8; dim_t i; float* restrict x0; float* restrict y0; float rho0 = 0.0; __m256 xv[10]; __m256 yv[10]; v8sf_t rhov[10]; // If the vector dimension is zero, or if alpha is zero, return early. if ( bli_zero_dim1( n ) ) { PASTEMAC(s,set0s)( *rho ); return; } // Initialize local pointers. x0 = x; y0 = y; PASTEMAC(s,set0s)( rho0 ); if ( incx == 1 && incy == 1 ) { rhov[0].v = _mm256_setzero_ps(); rhov[1].v = _mm256_setzero_ps(); rhov[2].v = _mm256_setzero_ps(); rhov[3].v = _mm256_setzero_ps(); rhov[4].v = _mm256_setzero_ps(); rhov[5].v = _mm256_setzero_ps(); rhov[6].v = _mm256_setzero_ps(); rhov[7].v = _mm256_setzero_ps(); rhov[8].v = _mm256_setzero_ps(); rhov[9].v = _mm256_setzero_ps(); for ( i = 0 ; (i + 79) < n; i += 80 ) { // 80 elements will be processed per loop; 10 FMAs will run per loop. 
xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg ); xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg ); xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg ); xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg ); xv[8] = _mm256_loadu_ps( x0 + 8*n_elem_per_reg ); xv[9] = _mm256_loadu_ps( x0 + 9*n_elem_per_reg ); yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg ); yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg ); yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg ); yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg ); yv[8] = _mm256_loadu_ps( y0 + 8*n_elem_per_reg ); yv[9] = _mm256_loadu_ps( y0 + 9*n_elem_per_reg ); rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v ); rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v ); rhov[2].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[2].v ); rhov[3].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[3].v ); rhov[4].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[4].v ); rhov[5].v = _mm256_fmadd_ps( xv[5], yv[5], rhov[5].v ); rhov[6].v = _mm256_fmadd_ps( xv[6], yv[6], rhov[6].v ); rhov[7].v = _mm256_fmadd_ps( xv[7], yv[7], rhov[7].v ); rhov[8].v = _mm256_fmadd_ps( xv[8], yv[8], rhov[8].v ); rhov[9].v = _mm256_fmadd_ps( xv[9], yv[9], rhov[9].v ); x0 += 10*n_elem_per_reg; y0 += 10*n_elem_per_reg; } rhov[0].v += rhov[5].v; rhov[1].v += rhov[6].v; rhov[2].v += rhov[7].v; rhov[3].v += rhov[8].v; rhov[4].v += rhov[9].v; for ( ; (i + 39) < n; i += 40 ) { xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); xv[4] = _mm256_loadu_ps( x0 + 
4*n_elem_per_reg ); yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg ); rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v ); rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v ); rhov[2].v = _mm256_fmadd_ps( xv[2], yv[2], rhov[2].v ); rhov[3].v = _mm256_fmadd_ps( xv[3], yv[3], rhov[3].v ); rhov[4].v = _mm256_fmadd_ps( xv[4], yv[4], rhov[4].v ); x0 += 5*n_elem_per_reg; y0 += 5*n_elem_per_reg; } rhov[0].v += rhov[2].v; rhov[1].v += rhov[3].v; rhov[0].v += rhov[4].v; for ( ; (i + 15) < n; i += 16 ) { xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v ); rhov[1].v = _mm256_fmadd_ps( xv[1], yv[1], rhov[1].v ); x0 += 2*n_elem_per_reg; y0 += 2*n_elem_per_reg; } rhov[0].v += rhov[1].v; for ( ; (i + 7) < n; i += 8 ) { xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); rhov[0].v = _mm256_fmadd_ps( xv[0], yv[0], rhov[0].v ); x0 += 1*n_elem_per_reg; y0 += 1*n_elem_per_reg; } for ( ; (i + 0) < n; i += 1 ) { rho0 += (*x0) * (*y0); x0 += 1; y0 += 1; } rho0 += rhov[0].f[0] + rhov[0].f[1] + rhov[0].f[2] + rhov[0].f[3] + rhov[0].f[4] + rhov[0].f[5] + rhov[0].f[6] + rhov[0].f[7]; // Issue vzeroupper instruction to clear upper lanes of ymm registers. // This avoids a performance penalty caused by false dependencies when // transitioning from from AVX to SSE instructions (which may occur // later, especially if BLIS is compiled with -mfpmath=sse). 
_mm256_zeroupper(); } else { for ( i = 0; i < n; ++i ) { const float x0c = *x0; const float y0c = *y0; rho0 += x0c * y0c; x0 += incx; y0 += incy; } } // Copy the final result into the output variable. PASTEMAC(s,copys)( rho0, *rho ); } // ----------------------------------------------------------------------------- void bli_ddotv_zen_int10 ( conj_t conjx, conj_t conjy, dim_t n, double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict rho, cntx_t* restrict cntx ) { const dim_t n_elem_per_reg = 4; dim_t i; double* restrict x0; double* restrict y0; double rho0 = 0.0; __m256d xv[10]; __m256d yv[10]; v4df_t rhov[10]; // If the vector dimension is zero, or if alpha is zero, return early. if ( bli_zero_dim1( n ) ) { PASTEMAC(d,set0s)( *rho ); return; } // Initialize local pointers. x0 = x; y0 = y; PASTEMAC(d,set0s)( rho0 ); if ( incx == 1 && incy == 1 ) { rhov[0].v = _mm256_setzero_pd(); rhov[1].v = _mm256_setzero_pd(); rhov[2].v = _mm256_setzero_pd(); rhov[3].v = _mm256_setzero_pd(); rhov[4].v = _mm256_setzero_pd(); rhov[5].v = _mm256_setzero_pd(); rhov[6].v = _mm256_setzero_pd(); rhov[7].v = _mm256_setzero_pd(); rhov[8].v = _mm256_setzero_pd(); rhov[9].v = _mm256_setzero_pd(); for ( i = 0; (i + 39) < n; i += 40 ) { // 80 elements will be processed per loop; 10 FMAs will run per loop. 
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg ); xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg ); xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg ); xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg ); xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg ); xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg ); yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg ); yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg ); yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg ); yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg ); yv[8] = _mm256_loadu_pd( y0 + 8*n_elem_per_reg ); yv[9] = _mm256_loadu_pd( y0 + 9*n_elem_per_reg ); rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v ); rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v ); rhov[2].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[2].v ); rhov[3].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[3].v ); rhov[4].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[4].v ); rhov[5].v = _mm256_fmadd_pd( xv[5], yv[5], rhov[5].v ); rhov[6].v = _mm256_fmadd_pd( xv[6], yv[6], rhov[6].v ); rhov[7].v = _mm256_fmadd_pd( xv[7], yv[7], rhov[7].v ); rhov[8].v = _mm256_fmadd_pd( xv[8], yv[8], rhov[8].v ); rhov[9].v = _mm256_fmadd_pd( xv[9], yv[9], rhov[9].v ); x0 += 10*n_elem_per_reg; y0 += 10*n_elem_per_reg; } rhov[0].v += rhov[5].v; rhov[1].v += rhov[6].v; rhov[2].v += rhov[7].v; rhov[3].v += rhov[8].v; rhov[4].v += rhov[9].v; for ( ; (i + 19) < n; i += 20 ) { xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); xv[4] = _mm256_loadu_pd( x0 + 
4*n_elem_per_reg ); yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg ); rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v ); rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v ); rhov[2].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[2].v ); rhov[3].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[3].v ); rhov[4].v = _mm256_fmadd_pd( xv[4], yv[4], rhov[4].v ); x0 += 5*n_elem_per_reg; y0 += 5*n_elem_per_reg; } rhov[0].v += rhov[4].v; for ( ; (i + 15) < n; i += 16 ) { xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v ); rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v ); rhov[2].v = _mm256_fmadd_pd( xv[2], yv[2], rhov[2].v ); rhov[3].v = _mm256_fmadd_pd( xv[3], yv[3], rhov[3].v ); x0 += 4*n_elem_per_reg; y0 += 4*n_elem_per_reg; } rhov[0].v += rhov[2].v; rhov[1].v += rhov[3].v; for ( ; (i + 7) < n; i += 8 ) { xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v ); rhov[1].v = _mm256_fmadd_pd( xv[1], yv[1], rhov[1].v ); x0 += 2*n_elem_per_reg; y0 += 2*n_elem_per_reg; } rhov[0].v += rhov[1].v; for ( ; (i + 3) < n; i += 4 ) { xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); rhov[0].v = _mm256_fmadd_pd( xv[0], yv[0], rhov[0].v ); x0 += 
1*n_elem_per_reg; y0 += 1*n_elem_per_reg; } for ( ; (i + 0) < n; i += 1 ) { rho0 += (*x0) * (*y0); x0 += 1; y0 += 1; } // Manually add the results from above to finish the sum. rho0 += rhov[0].d[0] + rhov[0].d[1] + rhov[0].d[2] + rhov[0].d[3]; // Issue vzeroupper instruction to clear upper lanes of ymm registers. // This avoids a performance penalty caused by false dependencies when // transitioning from from AVX to SSE instructions (which may occur // later, especially if BLIS is compiled with -mfpmath=sse). _mm256_zeroupper(); } else { for ( i = 0; i < n; ++i ) { const double x0c = *x0; const double y0c = *y0; rho0 += x0c * y0c; x0 += incx; y0 += incy; } } // Copy the final result into the output variable. PASTEMAC(d,copys)( rho0, *rho ); } cython-blis-0.9.1/blis/_src/kernels/zen/1/bli_dotxv_zen_int.c000066400000000000000000000225321427272030600241250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc. Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "immintrin.h" #include "blis.h" /* Union data structure to access AVX registers One 256-bit AVX register holds 8 SP elements. */ typedef union { __m256 v; float f[8] __attribute__((aligned(64))); } v8sf_t; /* Union data structure to access AVX registers * One 256-bit AVX register holds 4 DP elements. */ typedef union { __m256d v; double d[4] __attribute__((aligned(64))); } v4df_t; // ----------------------------------------------------------------------------- void bli_sdotxv_zen_int ( conj_t conjx, conj_t conjy, dim_t n, float* restrict alpha, float* restrict x, inc_t incx, float* restrict y, inc_t incy, float* restrict beta, float* restrict rho, cntx_t* restrict cntx ) { const dim_t n_elem_per_reg = 8; const dim_t n_iter_unroll = 4; dim_t i; dim_t n_viter; dim_t n_left; float* restrict x0; float* restrict y0; float rho0; v8sf_t rho0v, rho1v, rho2v, rho3v; v8sf_t x0v, y0v; v8sf_t x1v, y1v; v8sf_t x2v, y2v; v8sf_t x3v, y3v; // If beta is zero, initialize rho1 to zero instead of scaling // rho by beta (in case rho contains NaN or Inf). if ( PASTEMAC(s,eq0)( *beta ) ) { PASTEMAC(s,set0s)( *rho ); } else { PASTEMAC(s,scals)( *beta, *rho ); } // If the vector dimension is zero, output rho and return early. 
if ( bli_zero_dim1( n ) || PASTEMAC(s,eq0)( *alpha ) ) return; // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll ); n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll ); // If there is anything that would interfere with our use of contiguous // vector loads/stores, override n_viter and n_left to use scalar code // for all iterations. if ( incx != 1 || incy != 1 ) { n_viter = 0; n_left = n; } // Initialize local pointers. x0 = x; y0 = y; // Initialize the unrolled iterations' rho vectors to zero. rho0v.v = _mm256_setzero_ps(); rho1v.v = _mm256_setzero_ps(); rho2v.v = _mm256_setzero_ps(); rho3v.v = _mm256_setzero_ps(); for ( i = 0; i < n_viter; ++i ) { // Load the x and y input vector elements. x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); y2v.v = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); y3v.v = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); // Compute the element-wise product of the x and y vectors, // storing in the corresponding rho vectors. rho0v.v = _mm256_fmadd_ps( x0v.v, y0v.v, rho0v.v ); rho1v.v = _mm256_fmadd_ps( x1v.v, y1v.v, rho1v.v ); rho2v.v = _mm256_fmadd_ps( x2v.v, y2v.v, rho2v.v ); rho3v.v = _mm256_fmadd_ps( x3v.v, y3v.v, rho3v.v ); x0 += ( n_elem_per_reg * n_iter_unroll ); y0 += ( n_elem_per_reg * n_iter_unroll ); } // Accumulate the unrolled rho vectors into a single vector. rho0v.v += rho1v.v; rho0v.v += rho2v.v; rho0v.v += rho3v.v; // Accumulate the final rho vector into a single scalar result. rho0 = rho0v.f[0] + rho0v.f[1] + rho0v.f[2] + rho0v.f[3] + rho0v.f[4] + rho0v.f[5] + rho0v.f[6] + rho0v.f[7]; // Issue vzeroupper instruction to clear upper lanes of ymm registers. 
// This avoids a performance penalty caused by false dependencies when // transitioning from from AVX to SSE instructions (which may occur // as soon as the n_left cleanup loop below if BLIS is compiled with // -mfpmath=sse). _mm256_zeroupper(); // If there are leftover iterations, perform them with scalar code. for ( i = 0; i < n_left; ++i ) { const float x0c = *x0; const float y0c = *y0; rho0 += x0c * y0c; x0 += incx; y0 += incy; } // Accumulate the final result into the output variable. PASTEMAC(s,axpys)( *alpha, rho0, *rho ); } // ----------------------------------------------------------------------------- void bli_ddotxv_zen_int ( conj_t conjx, conj_t conjy, dim_t n, double* restrict alpha, double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict beta, double* restrict rho, cntx_t* restrict cntx ) { const dim_t n_elem_per_reg = 4; const dim_t n_iter_unroll = 4; dim_t i; dim_t n_viter; dim_t n_left; double* restrict x0; double* restrict y0; double rho0; v4df_t rho0v, rho1v, rho2v, rho3v; v4df_t x0v, y0v; v4df_t x1v, y1v; v4df_t x2v, y2v; v4df_t x3v, y3v; // If beta is zero, initialize rho1 to zero instead of scaling // rho by beta (in case rho contains NaN or Inf). if ( PASTEMAC(d,eq0)( *beta ) ) { PASTEMAC(d,set0s)( *rho ); } else { PASTEMAC(d,scals)( *beta, *rho ); } // If the vector dimension is zero, output rho and return early. if ( bli_zero_dim1( n ) || PASTEMAC(d,eq0)( *alpha ) ) return; // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll ); n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll ); // If there is anything that would interfere with our use of contiguous // vector loads/stores, override n_viter and n_left to use scalar code // for all iterations. if ( incx != 1 || incy != 1 ) { n_viter = 0; n_left = n; } // Initialize local pointers. 
x0 = x; y0 = y; // Initialize the unrolled iterations' rho vectors to zero. rho0v.v = _mm256_setzero_pd(); rho1v.v = _mm256_setzero_pd(); rho2v.v = _mm256_setzero_pd(); rho3v.v = _mm256_setzero_pd(); for ( i = 0; i < n_viter; ++i ) { // Load the x and y input vector elements. x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); // Compute the element-wise product of the x and y vectors, // storing in the corresponding rho vectors. rho0v.v = _mm256_fmadd_pd( x0v.v, y0v.v, rho0v.v ); rho1v.v = _mm256_fmadd_pd( x1v.v, y1v.v, rho1v.v ); rho2v.v = _mm256_fmadd_pd( x2v.v, y2v.v, rho2v.v ); rho3v.v = _mm256_fmadd_pd( x3v.v, y3v.v, rho3v.v ); x0 += ( n_elem_per_reg * n_iter_unroll ); y0 += ( n_elem_per_reg * n_iter_unroll ); } // Accumulate the unrolled rho vectors into a single vector. rho0v.v += rho1v.v; rho0v.v += rho2v.v; rho0v.v += rho3v.v; // Accumulate the final rho vector into a single scalar result. rho0 = rho0v.d[0] + rho0v.d[1] + rho0v.d[2] + rho0v.d[3]; // Issue vzeroupper instruction to clear upper lanes of ymm registers. // This avoids a performance penalty caused by false dependencies when // transitioning from from AVX to SSE instructions (which may occur // as soon as the n_left cleanup loop below if BLIS is compiled with // -mfpmath=sse). _mm256_zeroupper(); // If there are leftover iterations, perform them with scalar code. for ( i = 0; i < n_left; ++i ) { const double x0c = *x0; const double y0c = *y0; rho0 += x0c * y0c; x0 += incx; y0 += incy; } // Accumulate the final result into the output variable. 
PASTEMAC(d,axpys)( *alpha, rho0, *rho ); } cython-blis-0.9.1/blis/_src/kernels/zen/1/bli_scalv_zen_int.c000066400000000000000000000163031427272030600240700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc. Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "immintrin.h" #include "blis.h" /* Union data structure to access AVX registers One 256-bit AVX register holds 8 SP elements. 
*/ typedef union { __m256 v; float f[8] __attribute__((aligned(64))); } v8sf_t; /* Union data structure to access AVX registers * One 256-bit AVX register holds 4 DP elements. */ typedef union { __m256d v; double d[4] __attribute__((aligned(64))); } v4df_t; // ----------------------------------------------------------------------------- void bli_sscalv_zen_int ( conj_t conjalpha, dim_t n, float* restrict alpha, float* restrict x, inc_t incx, cntx_t* restrict cntx ) { const dim_t n_elem_per_reg = 8; const dim_t n_iter_unroll = 4; dim_t i; dim_t n_viter; dim_t n_left; float* restrict x0; v8sf_t alphav; v8sf_t x0v, x1v, x2v, x3v; // If the vector dimension is zero, or if alpha is unit, return early. if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return; // If alpha is zero, use setv (in case y contains NaN or Inf). if ( PASTEMAC(s,eq0)( *alpha ) ) { float* zero = bli_s0; ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); f ( BLIS_NO_CONJUGATE, n, zero, x, incx, cntx ); return; } // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll ); n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll ); // If there is anything that would interfere with our use of contiguous // vector loads/stores, override n_viter and n_left to use scalar code // for all iterations. if ( incx != 1 ) { n_viter = 0; n_left = n; } // Initialize local pointers. x0 = x; // Broadcast the alpha scalar to all elements of a vector register. alphav.v = _mm256_broadcast_ss( alpha ); // If there are vectorized iterations, perform them with vector // instructions. for ( i = 0; i < n_viter; ++i ) { // Load the input values. 
x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); x1v.v = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); x2v.v = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); x3v.v = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); // perform : x := alpha * x; x0v.v = _mm256_mul_ps( alphav.v, x0v.v ); x1v.v = _mm256_mul_ps( alphav.v, x1v.v ); x2v.v = _mm256_mul_ps( alphav.v, x2v.v ); x3v.v = _mm256_mul_ps( alphav.v, x3v.v ); // Store the output. _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), x0v.v ); _mm256_storeu_ps( (x0 + 1*n_elem_per_reg), x1v.v ); _mm256_storeu_ps( (x0 + 2*n_elem_per_reg), x2v.v ); _mm256_storeu_ps( (x0 + 3*n_elem_per_reg), x3v.v ); x0 += n_elem_per_reg * n_iter_unroll; } const float alphac = *alpha; // If there are leftover iterations, perform them with scalar code. for ( i = 0; i < n_left; ++i ) { *x0 *= alphac; x0 += incx; } } // ----------------------------------------------------------------------------- void bli_dscalv_zen_int ( conj_t conjalpha, dim_t n, double* restrict alpha, double* restrict x, inc_t incx, cntx_t* restrict cntx ) { const dim_t n_elem_per_reg = 4; const dim_t n_iter_unroll = 4; dim_t i; dim_t n_viter; dim_t n_left; double* restrict x0; v4df_t alphav; v4df_t x0v, x1v, x2v, x3v; // If the vector dimension is zero, or if alpha is unit, return early. if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return; // If alpha is zero, use setv (in case y contains NaN or Inf). if ( PASTEMAC(d,eq0)( *alpha ) ) { double* zero = bli_d0; dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); f ( BLIS_NO_CONJUGATE, n, zero, x, incx, cntx ); return; } // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. 
n_viter = ( n ) / ( n_elem_per_reg * n_iter_unroll ); n_left = ( n ) % ( n_elem_per_reg * n_iter_unroll ); // If there is anything that would interfere with our use of contiguous // vector loads/stores, override n_viter and n_left to use scalar code // for all iterations. if ( incx != 1 ) { n_viter = 0; n_left = n; } // Initialize local pointers. x0 = x; // Broadcast the alpha scalar to all elements of a vector register. alphav.v = _mm256_broadcast_sd( alpha ); // If there are vectorized iterations, perform them with vector // instructions. for ( i = 0; i < n_viter; ++i ) { // Load the input values. x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); x1v.v = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); x2v.v = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); // perform : y += alpha * x; x0v.v = _mm256_mul_pd( alphav.v, x0v.v ); x1v.v = _mm256_mul_pd( alphav.v, x1v.v ); x2v.v = _mm256_mul_pd( alphav.v, x2v.v ); x3v.v = _mm256_mul_pd( alphav.v, x3v.v ); // Store the output. _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), x0v.v ); _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), x1v.v ); _mm256_storeu_pd( (x0 + 2*n_elem_per_reg), x2v.v ); _mm256_storeu_pd( (x0 + 3*n_elem_per_reg), x3v.v ); x0 += n_elem_per_reg * n_iter_unroll; } const double alphac = *alpha; // If there are leftover iterations, perform them with scalar code. for ( i = 0; i < n_left; ++i ) { *x0 *= alphac; x0 += incx; } } cython-blis-0.9.1/blis/_src/kernels/zen/1/bli_scalv_zen_int10.c000066400000000000000000000323451427272030600242350ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2017 - 2022, Advanced Micro Devices, Inc. 
Copyright (C) 2018, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "immintrin.h" #include "blis.h" /* Union data structure to access AVX registers One 256-bit AVX register holds 8 SP elements. */ typedef union { __m256 v; float f[8] __attribute__((aligned(64))); } v8sf_t; /* Union data structure to access AVX registers * One 256-bit AVX register holds 4 DP elements. 
*/ typedef union { __m256d v; double d[4] __attribute__((aligned(64))); } v4df_t; // ----------------------------------------------------------------------------- void bli_sscalv_zen_int10 ( conj_t conjalpha, dim_t n, float* restrict alpha, float* restrict x, inc_t incx, cntx_t* restrict cntx ) { const dim_t n_elem_per_reg = 8; dim_t i; float* restrict x0; __m256 alphav; __m256 xv[10]; __m256 zv[10]; // If the vector dimension is zero, or if alpha is unit, return early. if ( bli_zero_dim1( n ) || PASTEMAC(s,eq1)( *alpha ) ) return; // If alpha is zero, use setv. if ( PASTEMAC(s,eq0)( *alpha ) ) { float* zero = bli_s0; if ( cntx == NULL ) cntx = bli_gks_query_cntx(); ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); f ( BLIS_NO_CONJUGATE, n, zero, x, incx, cntx ); return; } // Initialize local pointers. x0 = x; if ( incx == 1 ) { // Broadcast the alpha scalar to all elements of a vector register. alphav = _mm256_broadcast_ss( alpha ); for ( i = 0; (i + 79) < n; i += 80 ) { // Load the input values. xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg ); xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg ); xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg ); xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg ); xv[8] = _mm256_loadu_ps( x0 + 8*n_elem_per_reg ); xv[9] = _mm256_loadu_ps( x0 + 9*n_elem_per_reg ); // perform : x := alpha * x; zv[0] = _mm256_mul_ps( alphav, xv[0] ); zv[1] = _mm256_mul_ps( alphav, xv[1] ); zv[2] = _mm256_mul_ps( alphav, xv[2] ); zv[3] = _mm256_mul_ps( alphav, xv[3] ); zv[4] = _mm256_mul_ps( alphav, xv[4] ); zv[5] = _mm256_mul_ps( alphav, xv[5] ); zv[6] = _mm256_mul_ps( alphav, xv[6] ); zv[7] = _mm256_mul_ps( alphav, xv[7] ); zv[8] = _mm256_mul_ps( alphav, xv[8] ); zv[9] = _mm256_mul_ps( alphav, xv[9] ); // Store the output. 
_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] ); _mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] ); _mm256_storeu_ps( (x0 + 2*n_elem_per_reg), zv[2] ); _mm256_storeu_ps( (x0 + 3*n_elem_per_reg), zv[3] ); _mm256_storeu_ps( (x0 + 4*n_elem_per_reg), zv[4] ); _mm256_storeu_ps( (x0 + 5*n_elem_per_reg), zv[5] ); _mm256_storeu_ps( (x0 + 6*n_elem_per_reg), zv[6] ); _mm256_storeu_ps( (x0 + 7*n_elem_per_reg), zv[7] ); _mm256_storeu_ps( (x0 + 8*n_elem_per_reg), zv[8] ); _mm256_storeu_ps( (x0 + 9*n_elem_per_reg), zv[9] ); x0 += 10*n_elem_per_reg; } for ( ; (i + 39) < n; i += 40 ) { // Load the input values. xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg ); // perform : x := alpha * x; zv[0] = _mm256_mul_ps( alphav, xv[0] ); zv[1] = _mm256_mul_ps( alphav, xv[1] ); zv[2] = _mm256_mul_ps( alphav, xv[2] ); zv[3] = _mm256_mul_ps( alphav, xv[3] ); zv[4] = _mm256_mul_ps( alphav, xv[4] ); // Store the output. _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] ); _mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] ); _mm256_storeu_ps( (x0 + 2*n_elem_per_reg), zv[2] ); _mm256_storeu_ps( (x0 + 3*n_elem_per_reg), zv[3] ); _mm256_storeu_ps( (x0 + 4*n_elem_per_reg), zv[4] ); x0 += 5*n_elem_per_reg; } for ( ; (i + 31) < n; i += 32 ) { // Load the input values. xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); // perform : x := alpha * x; zv[0] = _mm256_mul_ps( alphav, xv[0] ); zv[1] = _mm256_mul_ps( alphav, xv[1] ); zv[2] = _mm256_mul_ps( alphav, xv[2] ); zv[3] = _mm256_mul_ps( alphav, xv[3] ); // Store the output. 
_mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] ); _mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] ); _mm256_storeu_ps( (x0 + 2*n_elem_per_reg), zv[2] ); _mm256_storeu_ps( (x0 + 3*n_elem_per_reg), zv[3] ); x0 += 4*n_elem_per_reg; } for ( ; (i + 15) < n; i += 16 ) { // Load the input values. xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); // perform : x := alpha * x; zv[0] = _mm256_mul_ps( alphav, xv[0] ); zv[1] = _mm256_mul_ps( alphav, xv[1] ); // Store the output. _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] ); _mm256_storeu_ps( (x0 + 1*n_elem_per_reg), zv[1] ); x0 += 2*n_elem_per_reg; } for ( ; (i + 7) < n; i += 8 ) { // Load the input values. xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); // perform : x := alpha * x; zv[0] = _mm256_mul_ps( alphav, xv[0] ); // Store the output. _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), zv[0] ); x0 += 1*n_elem_per_reg; } for ( ; (i + 0) < n; i += 1 ) { *x0 *= *alpha; x0 += 1; } } else { const float alphac = *alpha; for ( i = 0; i < n; ++i ) { *x0 *= alphac; x0 += incx; } } } // ----------------------------------------------------------------------------- void bli_dscalv_zen_int10 ( conj_t conjalpha, dim_t n, double* restrict alpha, double* restrict x, inc_t incx, cntx_t* restrict cntx ) { const dim_t n_elem_per_reg = 4; dim_t i; double* restrict x0; __m256d alphav; __m256d xv[10]; __m256d zv[10]; // If the vector dimension is zero, or if alpha is unit, return early. if ( bli_zero_dim1( n ) || PASTEMAC(d,eq1)( *alpha ) ) return; // If alpha is zero, use setv. if ( PASTEMAC(d,eq0)( *alpha ) ) { double* zero = bli_d0; if( cntx == NULL ) cntx = bli_gks_query_cntx(); dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); f ( BLIS_NO_CONJUGATE, n, zero, x, incx, cntx ); return; } // Initialize local pointers. x0 = x; if ( incx == 1 ) { // Broadcast the alpha scalar to all elements of a vector register. 
alphav = _mm256_broadcast_sd( alpha ); for ( i = 0; (i + 39) < n; i += 40 ) { // Load the input values. xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg ); xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg ); xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg ); xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg ); xv[8] = _mm256_loadu_pd( x0 + 8*n_elem_per_reg ); xv[9] = _mm256_loadu_pd( x0 + 9*n_elem_per_reg ); // perform : x := alpha * x; zv[0] = _mm256_mul_pd( alphav, xv[0] ); zv[1] = _mm256_mul_pd( alphav, xv[1] ); zv[2] = _mm256_mul_pd( alphav, xv[2] ); zv[3] = _mm256_mul_pd( alphav, xv[3] ); zv[4] = _mm256_mul_pd( alphav, xv[4] ); zv[5] = _mm256_mul_pd( alphav, xv[5] ); zv[6] = _mm256_mul_pd( alphav, xv[6] ); zv[7] = _mm256_mul_pd( alphav, xv[7] ); zv[8] = _mm256_mul_pd( alphav, xv[8] ); zv[9] = _mm256_mul_pd( alphav, xv[9] ); // Store the output. _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] ); _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] ); _mm256_storeu_pd( (x0 + 2*n_elem_per_reg), zv[2] ); _mm256_storeu_pd( (x0 + 3*n_elem_per_reg), zv[3] ); _mm256_storeu_pd( (x0 + 4*n_elem_per_reg), zv[4] ); _mm256_storeu_pd( (x0 + 5*n_elem_per_reg), zv[5] ); _mm256_storeu_pd( (x0 + 6*n_elem_per_reg), zv[6] ); _mm256_storeu_pd( (x0 + 7*n_elem_per_reg), zv[7] ); _mm256_storeu_pd( (x0 + 8*n_elem_per_reg), zv[8] ); _mm256_storeu_pd( (x0 + 9*n_elem_per_reg), zv[9] ); x0 += 10*n_elem_per_reg; } for ( ; (i + 19) < n; i += 20 ) { // Load the input values. 
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg ); // perform : x := alpha * x; zv[0] = _mm256_mul_pd( alphav, xv[0] ); zv[1] = _mm256_mul_pd( alphav, xv[1] ); zv[2] = _mm256_mul_pd( alphav, xv[2] ); zv[3] = _mm256_mul_pd( alphav, xv[3] ); zv[4] = _mm256_mul_pd( alphav, xv[4] ); // Store the output. _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] ); _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] ); _mm256_storeu_pd( (x0 + 2*n_elem_per_reg), zv[2] ); _mm256_storeu_pd( (x0 + 3*n_elem_per_reg), zv[3] ); _mm256_storeu_pd( (x0 + 4*n_elem_per_reg), zv[4] ); x0 += 5*n_elem_per_reg; } for ( ; (i + 15) < n; i += 16 ) { // Load the input values. xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); // perform : x := alpha * x; zv[0] = _mm256_mul_pd( alphav, xv[0] ); zv[1] = _mm256_mul_pd( alphav, xv[1] ); zv[2] = _mm256_mul_pd( alphav, xv[2] ); zv[3] = _mm256_mul_pd( alphav, xv[3] ); // Store the output. _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] ); _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] ); _mm256_storeu_pd( (x0 + 2*n_elem_per_reg), zv[2] ); _mm256_storeu_pd( (x0 + 3*n_elem_per_reg), zv[3] ); x0 += 4*n_elem_per_reg; } for ( ; (i + 7) < n; i += 8 ) { // Load the input values. xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); // perform : x := alpha * x; zv[0] = _mm256_mul_pd( alphav, xv[0] ); zv[1] = _mm256_mul_pd( alphav, xv[1] ); // Store the output. _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] ); _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), zv[1] ); x0 += 2*n_elem_per_reg; } for ( ; (i + 3) < n; i += 4 ) { // Load the input values. 
xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); // perform : x := alpha * x; zv[0] = _mm256_mul_pd( alphav, xv[0] ); // Store the output. _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), zv[0] ); x0 += 1*n_elem_per_reg; } for ( ; (i + 0) < n; i += 1 ) { *x0 *= *alpha; x0 += 1; } } else { const double alphac = *alpha; for ( i = 0; i < n; ++i ) { *x0 *= alphac; x0 += incx; } } } // ----------------------------------------------------------------------------- // // NOTE: This function definition is provided as a placeholder in order to allow // function names of scalv kernels to be hard-coded in bli_gemv_unf_var2_amd.c. // void bli_cscalv_zen_int10 ( conj_t conjalpha, dim_t n, scomplex* restrict alpha, scomplex* restrict x, inc_t incx, cntx_t* restrict cntx ) { const num_t dt = BLIS_SCOMPLEX; cscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCALV_KER, cntx ); f ( conjalpha, n, alpha, x, incx, cntx ); } cython-blis-0.9.1/blis/_src/kernels/zen/1/bli_setv_zen_int.c000066400000000000000000000166251427272030600237500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "immintrin.h" #include "blis.h" // ----------------------------------------------------------------------------- void bli_ssetv_zen_int ( conj_t conjalpha, dim_t n, float* restrict alpha, float* restrict x, inc_t incx, cntx_t* restrict cntx ) { const dim_t num_elem_per_reg = 8; dim_t i = 0; __m256 alphav; // If the vector dimension is zero return early. if ( bli_zero_dim1( n ) ) return; if ( incx == 1 ) { alphav = _mm256_broadcast_ss( alpha ); // For loop with n & ~0x7F => n & 0xFFFFFF80 masks the lower bits and results in multiples of 128 // for example if n = 255 // n & ~0x7F results in 128: copy from 0 to 128 happens in first loop // n & ~0x3F results in 192: copy from 128 to 192 happens in second loop // n & ~0x1F results in 224: copy from 128 to 192 happens in third loop and so on. 
for ( i = 0; i < (n & (~0x7F)); i += 128 ) { _mm256_storeu_ps(x + num_elem_per_reg * 0, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 1, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 2, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 3, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 4, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 5, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 6, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 7, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 8, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 9, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 10, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 11, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 12, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 13, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 14, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 15, alphav); x += 128; } for ( ; i < (n & (~0x3F)); i += 64 ) { _mm256_storeu_ps(x + num_elem_per_reg * 0, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 1, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 2, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 3, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 4, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 5, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 6, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 7, alphav); x += 64; } for ( ; i < (n & (~0x1F)); i += 32 ) { _mm256_storeu_ps(x + num_elem_per_reg * 0, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 1, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 2, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 3, alphav); x += 32; } for ( ; i < (n & (~0x0F)); i += 16 ) { _mm256_storeu_ps(x + num_elem_per_reg * 0, alphav); _mm256_storeu_ps(x + num_elem_per_reg * 1, alphav); x += 16; } for ( ; i < (n & (~0x07)); i += 8 ) { _mm256_storeu_ps(x + num_elem_per_reg * 0, alphav); x += 8; } for ( ; i < n; ++i ) { *x++ = *alpha; } } else { for ( dim_t i = 0; i < n; ++i ) { *x = *alpha; x += incx; } } } void 
bli_dsetv_zen_int ( conj_t conjalpha, dim_t n, double* restrict alpha, double* restrict x, inc_t incx, cntx_t* restrict cntx ) { const dim_t num_elem_per_reg = 4; dim_t i = 0; __m256d alphav; // If the vector dimension is zero return early. if ( bli_zero_dim1( n ) ) return; if ( incx == 1 ) { // Broadcast the alpha scalar to all elements of a vector register. alphav = _mm256_broadcast_sd( alpha ); // n & (~0x3F) = n & 0xFFFFFFC0 -> this masks the numbers less than 64, // the copy operation will be done for the multiples of 64 for ( i = 0; i < (n & (~0x3F)); i += 64 ) { _mm256_storeu_pd(x + num_elem_per_reg * 0, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 1, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 2, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 3, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 4, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 5, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 6, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 7, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 8, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 9, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 10, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 11, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 12, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 13, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 14, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 15, alphav); x += num_elem_per_reg * 16; } for ( ; i < (n & (~0x1F)); i += 32 ) { _mm256_storeu_pd(x + num_elem_per_reg * 0, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 1, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 2, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 3, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 4, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 5, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 6, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 7, alphav); x += num_elem_per_reg * 8; } for ( ; i < (n & (~0xF)); i += 16 ) { _mm256_storeu_pd(x + 
num_elem_per_reg * 0, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 1, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 2, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 3, alphav); x += num_elem_per_reg * 4; } for ( ; i < (n & (~0x07)); i += 8 ) { _mm256_storeu_pd(x + num_elem_per_reg * 0, alphav); _mm256_storeu_pd(x + num_elem_per_reg * 1, alphav); x += num_elem_per_reg * 2; } for ( ; i < (n & (~0x03)); i += 4 ) { _mm256_storeu_pd(x + num_elem_per_reg * 0, alphav); x += num_elem_per_reg; } for ( ; i < n; ++i ) { *x++ = *alpha; } } else { for ( i = 0; i < n; ++i ) { *x = *alpha; x += incx; } } } cython-blis-0.9.1/blis/_src/kernels/zen/1/bli_swapv_zen_int8.c000066400000000000000000000251161427272030600242120ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "immintrin.h" #include "blis.h" /* Union data structure to access AVX registers One 256-bit AVX register holds 8 SP elements. */ typedef union { __m256 v; float f[8] __attribute__((aligned(64))); } v8sf_t; /* Union data structure to access AVX registers * One 256-bit AVX register holds 4 DP elements. */ typedef union { __m256d v; double d[4] __attribute__((aligned(64))); } v4df_t; // ----------------------------------------------------------------------------- void bli_sswapv_zen_int8 ( dim_t n, float* restrict x, inc_t incx, float* restrict y, inc_t incy, cntx_t* restrict cntx ) { const dim_t n_elem_per_reg = 8; dim_t i = 0; float* restrict x0; float* restrict y0; __m256 xv[8]; __m256 yv[8]; // If the vector dimension is zero, return early. 
if ( bli_zero_dim1( n ) ) return; x0 = x; y0 = y; if ( incx == 1 && incy == 1 ) { for ( i = 0; ( i + 63 ) < n; i += 64 ) { xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); xv[3] = _mm256_loadu_ps( x0 + 3*n_elem_per_reg ); xv[4] = _mm256_loadu_ps( x0 + 4*n_elem_per_reg ); xv[5] = _mm256_loadu_ps( x0 + 5*n_elem_per_reg ); xv[6] = _mm256_loadu_ps( x0 + 6*n_elem_per_reg ); xv[7] = _mm256_loadu_ps( x0 + 7*n_elem_per_reg ); yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); yv[4] = _mm256_loadu_ps( y0 + 4*n_elem_per_reg ); yv[5] = _mm256_loadu_ps( y0 + 5*n_elem_per_reg ); yv[6] = _mm256_loadu_ps( y0 + 6*n_elem_per_reg ); yv[7] = _mm256_loadu_ps( y0 + 7*n_elem_per_reg ); _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]); _mm256_storeu_ps( (x0 + 1*n_elem_per_reg), yv[1]); _mm256_storeu_ps( (x0 + 2*n_elem_per_reg), yv[2]); _mm256_storeu_ps( (x0 + 3*n_elem_per_reg), yv[3]); _mm256_storeu_ps( (x0 + 4*n_elem_per_reg), yv[4]); _mm256_storeu_ps( (x0 + 5*n_elem_per_reg), yv[5]); _mm256_storeu_ps( (x0 + 6*n_elem_per_reg), yv[6]); _mm256_storeu_ps( (x0 + 7*n_elem_per_reg), yv[7]); _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]); _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), xv[1]); _mm256_storeu_ps( (y0 + 2*n_elem_per_reg), xv[2]); _mm256_storeu_ps( (y0 + 3*n_elem_per_reg), xv[3]); _mm256_storeu_ps( (y0 + 4*n_elem_per_reg), xv[4]); _mm256_storeu_ps( (y0 + 5*n_elem_per_reg), xv[5]); _mm256_storeu_ps( (y0 + 6*n_elem_per_reg), xv[6]); _mm256_storeu_ps( (y0 + 7*n_elem_per_reg), xv[7]); x0 += 8*n_elem_per_reg; y0 += 8*n_elem_per_reg; } for ( ; ( i + 31 ) < n; i += 32 ) { xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); xv[2] = _mm256_loadu_ps( x0 + 2*n_elem_per_reg ); xv[3] = _mm256_loadu_ps( 
x0 + 3*n_elem_per_reg ); yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); yv[2] = _mm256_loadu_ps( y0 + 2*n_elem_per_reg ); yv[3] = _mm256_loadu_ps( y0 + 3*n_elem_per_reg ); _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]); _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), xv[1]); _mm256_storeu_ps( (y0 + 2*n_elem_per_reg), xv[2]); _mm256_storeu_ps( (y0 + 3*n_elem_per_reg), xv[3]); _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]); _mm256_storeu_ps( (x0 + 1*n_elem_per_reg), yv[1]); _mm256_storeu_ps( (x0 + 2*n_elem_per_reg), yv[2]); _mm256_storeu_ps( (x0 + 3*n_elem_per_reg), yv[3]); x0 += 4*n_elem_per_reg; y0 += 4*n_elem_per_reg; } for ( ; ( i + 15 ) < n; i += 16 ) { xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_ps( x0 + 1*n_elem_per_reg ); yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); yv[1] = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]); _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), xv[1]); _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]); _mm256_storeu_ps( (x0 + 1*n_elem_per_reg), yv[1]); x0 += 2*n_elem_per_reg; y0 += 2*n_elem_per_reg; } for ( ; ( i + 7 ) < n; i += 8 ) { xv[0] = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); yv[0] = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); _mm256_storeu_ps( (x0 + 0*n_elem_per_reg), yv[0]); _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), xv[0]); x0 += 1*n_elem_per_reg; y0 += 1*n_elem_per_reg; } for ( ; (i + 0) < n; i += 1 ) { PASTEMAC(s,swaps)( x[i], y[i] ); } } else { for ( i = 0; i < n; ++i ) { PASTEMAC(s,swaps)( (*x0), (*y0) ); x0 += incx; y0 += incy; } } } //-------------------------------------------------------------------------------- void bli_dswapv_zen_int8 ( dim_t n, double* restrict x, inc_t incx, double* restrict y, inc_t incy, cntx_t* restrict cntx ) { const dim_t n_elem_per_reg = 4; dim_t i = 0; double* restrict x0; double* restrict y0; __m256d xv[8]; __m256d yv[8]; // If the vector dimension is zero, return 
early. if ( bli_zero_dim1( n ) ) return; x0 = x; y0 = y; if ( incx == 1 && incy == 1 ) { for ( ; ( i + 31 ) < n; i += 32 ) { xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); xv[3] = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); xv[4] = _mm256_loadu_pd( x0 + 4*n_elem_per_reg ); xv[5] = _mm256_loadu_pd( x0 + 5*n_elem_per_reg ); xv[6] = _mm256_loadu_pd( x0 + 6*n_elem_per_reg ); xv[7] = _mm256_loadu_pd( x0 + 7*n_elem_per_reg ); yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); yv[4] = _mm256_loadu_pd( y0 + 4*n_elem_per_reg ); yv[5] = _mm256_loadu_pd( y0 + 5*n_elem_per_reg ); yv[6] = _mm256_loadu_pd( y0 + 6*n_elem_per_reg ); yv[7] = _mm256_loadu_pd( y0 + 7*n_elem_per_reg ); _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]); _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), yv[1]); _mm256_storeu_pd( (x0 + 2*n_elem_per_reg), yv[2]); _mm256_storeu_pd( (x0 + 3*n_elem_per_reg), yv[3]); _mm256_storeu_pd( (x0 + 4*n_elem_per_reg), yv[4]); _mm256_storeu_pd( (x0 + 5*n_elem_per_reg), yv[5]); _mm256_storeu_pd( (x0 + 6*n_elem_per_reg), yv[6]); _mm256_storeu_pd( (x0 + 7*n_elem_per_reg), yv[7]); _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]); _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), xv[1]); _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), xv[2]); _mm256_storeu_pd( (y0 + 3*n_elem_per_reg), xv[3]); _mm256_storeu_pd( (y0 + 4*n_elem_per_reg), xv[4]); _mm256_storeu_pd( (y0 + 5*n_elem_per_reg), xv[5]); _mm256_storeu_pd( (y0 + 6*n_elem_per_reg), xv[6]); _mm256_storeu_pd( (y0 + 7*n_elem_per_reg), xv[7]); x0 += 8*n_elem_per_reg; y0 += 8*n_elem_per_reg; } for ( ; ( i + 15 ) < n; i += 16 ) { xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); xv[2] = _mm256_loadu_pd( x0 + 2*n_elem_per_reg ); xv[3] = 
_mm256_loadu_pd( x0 + 3*n_elem_per_reg ); yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); yv[2] = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); yv[3] = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]); _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), xv[1]); _mm256_storeu_pd( (y0 + 2*n_elem_per_reg), xv[2]); _mm256_storeu_pd( (y0 + 3*n_elem_per_reg), xv[3]); _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]); _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), yv[1]); _mm256_storeu_pd( (x0 + 2*n_elem_per_reg), yv[2]); _mm256_storeu_pd( (x0 + 3*n_elem_per_reg), yv[3]); x0 += 4*n_elem_per_reg; y0 += 4*n_elem_per_reg; } for ( ; ( i + 7 ) < n; i += 8 ) { xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); xv[1] = _mm256_loadu_pd( x0 + 1*n_elem_per_reg ); yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); yv[1] = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]); _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), xv[1]); _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]); _mm256_storeu_pd( (x0 + 1*n_elem_per_reg), yv[1]); x0 += 2*n_elem_per_reg; y0 += 2*n_elem_per_reg; } for ( ; ( i + 3 ) < n; i += 4 ) { xv[0] = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); yv[0] = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), xv[0]); _mm256_storeu_pd( (x0 + 0*n_elem_per_reg), yv[0]); x0 += 1*n_elem_per_reg; y0 += 1*n_elem_per_reg; } for ( ; (i + 0) < n; i += 1 ) { PASTEMAC(d,swaps)( x[i], y[i] ); } } else { for ( i = 0; i < n; ++i ) { PASTEMAC(d,swaps)( (*x0), (*y0) ); x0 += incx; y0 += incy; } } } cython-blis-0.9.1/blis/_src/kernels/zen/1f/000077500000000000000000000000001427272030600204035ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/zen/1f/bli_axpyf_zen_int_4.c000066400000000000000000000216621427272030600245040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "immintrin.h" #include "blis.h" void bli_caxpyf_zen_int_4 ( conj_t conja, conj_t conjx, dim_t m, dim_t b_n, scomplex* restrict alpha, scomplex* restrict a, inc_t inca, inc_t lda, scomplex* restrict x, inc_t incx, scomplex* restrict y, inc_t incy, cntx_t* restrict cntx ) { inc_t fuse_fac = 4; inc_t i; __m256 ymm0, ymm1, ymm2, ymm3; __m256 ymm4, ymm5, ymm6, ymm7; __m256 ymm8, ymm10; __m256 ymm12, ymm13; float* ap[4]; float* y0 = (float*)y; scomplex chi0; scomplex chi1; scomplex chi2; scomplex chi3; dim_t setPlusOne = 1; if ( bli_is_conj(conja) ) { setPlusOne = -1; } // If either dimension is zero, or if alpha is zero, return early. if ( bli_zero_dim2( m, b_n ) || bli_ceq0( *alpha ) ) return; // If b_n is not equal to the fusing factor, then perform the entire // operation as a loop over axpyv. if ( b_n != fuse_fac ) { if ( cntx == NULL ) cntx = bli_gks_query_cntx(); caxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { scomplex* a1 = a + (0 )*inca + (i )*lda; scomplex* chi1 = x + (i )*incx; scomplex* y1 = y + (0 )*incy; scomplex alpha_chi1; bli_ccopycjs( conjx, *chi1, alpha_chi1 ); bli_cscals( *alpha, alpha_chi1 ); f ( conja, m, &alpha_chi1, a1, inca, y1, incy, cntx ); } return; } // At this point, we know that b_n is exactly equal to the fusing factor. if(bli_is_noconj(conjx)) { chi0 = *( x + 0*incx ); chi1 = *( x + 1*incx ); chi2 = *( x + 2*incx ); chi3 = *( x + 3*incx ); } else { scomplex *pchi0 = x + 0*incx ; scomplex *pchi1 = x + 1*incx ; scomplex *pchi2 = x + 2*incx ; scomplex *pchi3 = x + 3*incx ; bli_ccopycjs( conjx, *pchi0, chi0 ); bli_ccopycjs( conjx, *pchi1, chi1 ); bli_ccopycjs( conjx, *pchi2, chi2 ); bli_ccopycjs( conjx, *pchi3, chi3 ); } // Scale each chi scalar by alpha. 
bli_cscals( *alpha, chi0 ); bli_cscals( *alpha, chi1 ); bli_cscals( *alpha, chi2 ); bli_cscals( *alpha, chi3 ); lda *= 2; incx *= 2; incy *= 2; inca *= 2; ap[0] = (float*)a; ap[1] = (float*)a + lda; ap[2] = ap[1] + lda; ap[3] = ap[2] + lda; if( inca == 2 && incy == 2 ) { inc_t n1 = m/4; inc_t n2 = m%4; ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); // broadcast real & imag parts of 4 elements of x ymm0 = _mm256_broadcast_ss(&chi0.real); // real part of x0 ymm1 = _mm256_broadcast_ss(&chi0.imag); // imag part of x0 ymm2 = _mm256_broadcast_ss(&chi1.real); // real part of x1 ymm3 = _mm256_broadcast_ss(&chi1.imag); // imag part of x1 ymm4 = _mm256_broadcast_ss(&chi2.real); // real part of x2 ymm5 = _mm256_broadcast_ss(&chi2.imag); // imag part of x2 ymm6 = _mm256_broadcast_ss(&chi3.real); // real part of x3 ymm7 = _mm256_broadcast_ss(&chi3.imag); // imag part of x3 for(i = 0; i < n1; i++) { //load first two columns of A ymm8 = _mm256_loadu_ps(ap[0] + 0); ymm10 = _mm256_loadu_ps(ap[1] + 0); ymm12 = _mm256_mul_ps(ymm8, ymm0); ymm13 = _mm256_mul_ps(ymm8, ymm1); ymm12 = _mm256_fmadd_ps(ymm10, ymm2, ymm12); ymm13 = _mm256_fmadd_ps(ymm10, ymm3, ymm13); //load 3rd and 4th columns of A ymm8 = _mm256_loadu_ps(ap[2] + 0); ymm10 = _mm256_loadu_ps(ap[3] + 0); ymm12 = _mm256_fmadd_ps(ymm8, ymm4, ymm12); ymm13 = _mm256_fmadd_ps(ymm8, ymm5, ymm13); ymm12 = _mm256_fmadd_ps(ymm10, ymm6, ymm12); ymm13 = _mm256_fmadd_ps(ymm10, ymm7, ymm13); //load Y vector ymm10 = _mm256_loadu_ps(y0 + 0); if(bli_is_noconj(conja)) { //printf("Inside no conj if\n"); ymm13 = _mm256_permute_ps(ymm13, 0xB1); ymm8 = _mm256_addsub_ps(ymm12, ymm13); } else { ymm12 = _mm256_permute_ps(ymm12, 0xB1); ymm8 = _mm256_addsub_ps(ymm13, ymm12); ymm8 = _mm256_permute_ps(ymm8, 0xB1); } ymm12 = _mm256_add_ps(ymm8, ymm10); _mm256_storeu_ps((float*)(y0), ymm12); y0 += 8; ap[0] += 8; ap[1] += 8; ap[2] += 8; ap[3] += 8; } // If there are leftover iterations, perform them with scalar code. 
for ( i = 0; (i + 0) < n2 ; ++i ) { scomplex y0c = *(scomplex*)y0; const scomplex a0c = *(scomplex*)ap[0]; const scomplex a1c = *(scomplex*)ap[1]; const scomplex a2c = *(scomplex*)ap[2]; const scomplex a3c = *(scomplex*)ap[3]; y0c.real += chi0.real * a0c.real - chi0.imag * a0c.imag * setPlusOne; y0c.real += chi1.real * a1c.real - chi1.imag * a1c.imag * setPlusOne; y0c.real += chi2.real * a2c.real - chi2.imag * a2c.imag * setPlusOne; y0c.real += chi3.real * a3c.real - chi3.imag * a3c.imag * setPlusOne; y0c.imag += chi0.imag * a0c.real + chi0.real * a0c.imag * setPlusOne; y0c.imag += chi1.imag * a1c.real + chi1.real * a1c.imag * setPlusOne; y0c.imag += chi2.imag * a2c.real + chi2.real * a2c.imag * setPlusOne; y0c.imag += chi3.imag * a3c.real + chi3.real * a3c.imag * setPlusOne; *(scomplex*)y0 = y0c; ap[0] += 2; ap[1] += 2; ap[2] += 2; ap[3] += 2; y0 += 2; } //PASTEMAC(c,fprintm)(stdout, "Y after A*x in axpyf",m, 1, (scomplex*)y, 1, 1, "%4.1f", ""); } else { for (i = 0 ; (i + 0) < m ; ++i ) { scomplex y0c = *(scomplex*)y0; const scomplex a0c = *(scomplex*)ap[0]; const scomplex a1c = *(scomplex*)ap[1]; const scomplex a2c = *(scomplex*)ap[2]; const scomplex a3c = *(scomplex*)ap[3]; y0c.real += chi0.real * a0c.real - chi0.imag * a0c.imag * setPlusOne; y0c.real += chi1.real * a1c.real - chi1.imag * a1c.imag * setPlusOne; y0c.real += chi2.real * a2c.real - chi2.imag * a2c.imag * setPlusOne; y0c.real += chi3.real * a3c.real - chi3.imag * a3c.imag * setPlusOne; y0c.imag += chi0.imag * a0c.real + chi0.real * a0c.imag * setPlusOne; y0c.imag += chi1.imag * a1c.real + chi1.real * a1c.imag * setPlusOne; y0c.imag += chi2.imag * a2c.real + chi2.real * a2c.imag * setPlusOne; y0c.imag += chi3.imag * a3c.real + chi3.real * a3c.imag * setPlusOne; *(scomplex*)y0 = y0c; ap[0] += inca; ap[1] += inca; ap[2] += inca; ap[3] += inca; y0 += incy; } } } 
cython-blis-0.9.1/blis/_src/kernels/zen/1f/bli_axpyf_zen_int_5.c000066400000000000000000001135051427272030600245030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "immintrin.h" #include "blis.h" /* Union data structure to access AVX registers One 256-bit AVX register holds 8 SP elements. 
*/ typedef union { __m256 v; float f[8] __attribute__((aligned(64))); } v8sf_t; /* Union data structure to access AVX registers * One 256-bit AVX register holds 4 DP elements. */ typedef union { __m256d v; __m128d xmm[2]; double d[4] __attribute__((aligned(64))); } v4df_t; typedef union { __m128d v; double d[2] __attribute__((aligned(64))); } v2df_t; void bli_saxpyf_zen_int_5 ( conj_t conja, conj_t conjx, dim_t m, dim_t b_n, float* restrict alpha, float* restrict a, inc_t inca, inc_t lda, float* restrict x, inc_t incx, float* restrict y, inc_t incy, cntx_t* restrict cntx ) { const dim_t fuse_fac = 5; const dim_t n_elem_per_reg = 8; const dim_t n_iter_unroll = 2; dim_t i; float* restrict a0; float* restrict a1; float* restrict a2; float* restrict a3; float* restrict a4; float* restrict y0; v8sf_t chi0v, chi1v, chi2v, chi3v; v8sf_t chi4v; v8sf_t a00v, a01v, a02v, a03v; v8sf_t a04v; v8sf_t a10v, a11v, a12v, a13v; v8sf_t a14v; v8sf_t y0v, y1v; float chi0, chi1, chi2, chi3; float chi4; // If either dimension is zero, or if alpha is zero, return early. if ( bli_zero_dim2( m, b_n ) || bli_seq0( *alpha ) ) return; // If b_n is not equal to the fusing factor, then perform the entire // operation as a loop over axpyv. if ( b_n != fuse_fac ) { if(cntx == NULL) cntx = bli_gks_query_cntx(); saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { float* a1 = a + (0 )*inca + (i )*lda; float* chi1 = x + (i )*incx; float* y1 = y + (0 )*incy; float alpha_chi1; bli_scopycjs( conjx, *chi1, alpha_chi1 ); bli_sscals( *alpha, alpha_chi1 ); f ( conja, m, &alpha_chi1, a1, inca, y1, incy, cntx ); } return; } // At this point, we know that b_n is exactly equal to the fusing factor. a0 = a + 0*lda; a1 = a + 1*lda; a2 = a + 2*lda; a3 = a + 3*lda; a4 = a + 4*lda; y0 = y; chi0 = *( x + 0*incx ); chi1 = *( x + 1*incx ); chi2 = *( x + 2*incx ); chi3 = *( x + 3*incx ); chi4 = *( x + 4*incx ); // Scale each chi scalar by alpha. 
bli_sscals( *alpha, chi0 ); bli_sscals( *alpha, chi1 ); bli_sscals( *alpha, chi2 ); bli_sscals( *alpha, chi3 ); bli_sscals( *alpha, chi4 ); // Broadcast the (alpha*chi?) scalars to all elements of vector registers. chi0v.v = _mm256_broadcast_ss( &chi0 ); chi1v.v = _mm256_broadcast_ss( &chi1 ); chi2v.v = _mm256_broadcast_ss( &chi2 ); chi3v.v = _mm256_broadcast_ss( &chi3 ); chi4v.v = _mm256_broadcast_ss( &chi4 ); // If there are vectorized iterations, perform them with vector // instructions. if ( inca == 1 && incy == 1 ) { for ( i = 0; (i + 15) < m; i += 16 ) { // Load the input values. y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg ); a10v.v = _mm256_loadu_ps( a0 + 1*n_elem_per_reg ); a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg ); a11v.v = _mm256_loadu_ps( a1 + 1*n_elem_per_reg ); a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg ); a12v.v = _mm256_loadu_ps( a2 + 1*n_elem_per_reg ); a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg ); a13v.v = _mm256_loadu_ps( a3 + 1*n_elem_per_reg ); a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg ); a14v.v = _mm256_loadu_ps( a4 + 1*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v ); y1v.v = _mm256_fmadd_ps( a10v.v, chi0v.v, y1v.v ); y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v ); y1v.v = _mm256_fmadd_ps( a11v.v, chi1v.v, y1v.v ); y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v ); y1v.v = _mm256_fmadd_ps( a12v.v, chi2v.v, y1v.v ); y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v ); y1v.v = _mm256_fmadd_ps( a13v.v, chi3v.v, y1v.v ); y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v ); y1v.v = _mm256_fmadd_ps( a14v.v, chi4v.v, y1v.v ); // Store the output. 
_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v ); _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), y1v.v ); y0 += n_iter_unroll * n_elem_per_reg; a0 += n_iter_unroll * n_elem_per_reg; a1 += n_iter_unroll * n_elem_per_reg; a2 += n_iter_unroll * n_elem_per_reg; a3 += n_iter_unroll * n_elem_per_reg; a4 += n_iter_unroll * n_elem_per_reg; } for( ; (i + 7) < m; i += 8 ) { // Load the input values. y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg ); a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg ); a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg ); a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg ); a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v ); y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v ); y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v ); y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v ); y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v ); // Store the output. _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v ); y0 += n_elem_per_reg; a0 += n_elem_per_reg; a1 += n_elem_per_reg; a2 += n_elem_per_reg; a3 += n_elem_per_reg; a4 += n_elem_per_reg; } // If there are leftover iterations, perform them with scalar code. 
for ( ; (i + 0) < m ; ++i ) { double y0c = *y0; const float a0c = *a0; const float a1c = *a1; const float a2c = *a2; const float a3c = *a3; const float a4c = *a4; y0c += chi0 * a0c; y0c += chi1 * a1c; y0c += chi2 * a2c; y0c += chi3 * a3c; y0c += chi4 * a4c; *y0 = y0c; a0 += 1; a1 += 1; a2 += 1; a3 += 1; a4 += 1; y0 += 1; } } else { for ( i = 0; (i + 0) < m ; ++i ) { double y0c = *y0; const float a0c = *a0; const float a1c = *a1; const float a2c = *a2; const float a3c = *a3; const float a4c = *a4; y0c += chi0 * a0c; y0c += chi1 * a1c; y0c += chi2 * a2c; y0c += chi3 * a3c; y0c += chi4 * a4c; *y0 = y0c; a0 += inca; a1 += inca; a2 += inca; a3 += inca; a4 += inca; y0 += incy; } } } // ----------------------------------------------------------------------------- void bli_daxpyf_zen_int_5 ( conj_t conja, conj_t conjx, dim_t m, dim_t b_n, double* restrict alpha, double* restrict a, inc_t inca, inc_t lda, double* restrict x, inc_t incx, double* restrict y, inc_t incy, cntx_t* restrict cntx ) { const dim_t fuse_fac = 5; const dim_t n_elem_per_reg = 4; const dim_t n_iter_unroll = 2; dim_t i; double* restrict a0; double* restrict a1; double* restrict a2; double* restrict a3; double* restrict a4; double* restrict y0; v4df_t chi0v, chi1v, chi2v, chi3v; v4df_t chi4v; v4df_t a00v, a01v, a02v, a03v; v4df_t a04v; v4df_t a10v, a11v, a12v, a13v; v4df_t a14v; v4df_t y0v, y1v; double chi0, chi1, chi2, chi3; double chi4; // If either dimension is zero, or if alpha is zero, return early. if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return; // If b_n is not equal to the fusing factor, then perform the entire // operation as a loop over axpyv. 
if ( b_n != fuse_fac ) { daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { double* a1 = a + (0 )*inca + (i )*lda; double* chi1 = x + (i )*incx; double* y1 = y + (0 )*incy; double alpha_chi1; bli_dcopycjs( conjx, *chi1, alpha_chi1 ); bli_dscals( *alpha, alpha_chi1 ); f ( conja, m, &alpha_chi1, a1, inca, y1, incy, cntx ); } return; } // At this point, we know that b_n is exactly equal to the fusing factor. a0 = a + 0*lda; a1 = a + 1*lda; a2 = a + 2*lda; a3 = a + 3*lda; a4 = a + 4*lda; y0 = y; chi0 = *( x + 0*incx ); chi1 = *( x + 1*incx ); chi2 = *( x + 2*incx ); chi3 = *( x + 3*incx ); chi4 = *( x + 4*incx ); // Scale each chi scalar by alpha. bli_dscals( *alpha, chi0 ); bli_dscals( *alpha, chi1 ); bli_dscals( *alpha, chi2 ); bli_dscals( *alpha, chi3 ); bli_dscals( *alpha, chi4 ); // Broadcast the (alpha*chi?) scalars to all elements of vector registers. chi0v.v = _mm256_broadcast_sd( &chi0 ); chi1v.v = _mm256_broadcast_sd( &chi1 ); chi2v.v = _mm256_broadcast_sd( &chi2 ); chi3v.v = _mm256_broadcast_sd( &chi3 ); chi4v.v = _mm256_broadcast_sd( &chi4 ); // If there are vectorized iterations, perform them with vector // instructions. if ( inca == 1 && incy == 1 ) { for ( i = 0; (i + 7) < m; i += 8 ) { // Load the input values. 
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v ); // Store the output. _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); y0 += n_iter_unroll * n_elem_per_reg; a0 += n_iter_unroll * n_elem_per_reg; a1 += n_iter_unroll * n_elem_per_reg; a2 += n_iter_unroll * n_elem_per_reg; a3 += n_iter_unroll * n_elem_per_reg; a4 += n_iter_unroll * n_elem_per_reg; } for( ; (i + 3) < m; i += 4 ) { // Load the input values. 
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v ); // Store the output. _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v ); y0 += n_elem_per_reg; a0 += n_elem_per_reg; a1 += n_elem_per_reg; a2 += n_elem_per_reg; a3 += n_elem_per_reg; a4 += n_elem_per_reg; } // If there are leftover iterations, perform them with scalar code. for ( ; (i + 0) < m ; ++i ) { double y0c = *y0; const double a0c = *a0; const double a1c = *a1; const double a2c = *a2; const double a3c = *a3; const double a4c = *a4; y0c += chi0 * a0c; y0c += chi1 * a1c; y0c += chi2 * a2c; y0c += chi3 * a3c; y0c += chi4 * a4c; *y0 = y0c; a0 += 1; a1 += 1; a2 += 1; a3 += 1; a4 += 1; y0 += 1; } } else { for ( i = 0; (i + 0) < m ; ++i ) { double y0c = *y0; const double a0c = *a0; const double a1c = *a1; const double a2c = *a2; const double a3c = *a3; const double a4c = *a4; y0c += chi0 * a0c; y0c += chi1 * a1c; y0c += chi2 * a2c; y0c += chi3 * a3c; y0c += chi4 * a4c; *y0 = y0c; a0 += inca; a1 += inca; a2 += inca; a3 += inca; a4 += inca; y0 += incy; } } } // ----------------------------------------------------------------------------- static void bli_daxpyf_zen_int_16x2 ( conj_t conja, conj_t conjx, dim_t m, dim_t b_n, double* restrict alpha, double* restrict a, inc_t inca, inc_t lda, double* restrict x, inc_t incx, double* restrict y, inc_t incy, cntx_t* restrict cntx ) { const dim_t fuse_fac = 2; const dim_t n_elem_per_reg = 4; const dim_t n_iter_unroll = 4; dim_t i; double* restrict 
a0; double* restrict a1; double* restrict y0; v4df_t chi0v, chi1v; v4df_t a00v, a01v; v4df_t a10v, a11v; v4df_t a20v, a21v; v4df_t a30v, a31v; v4df_t y0v, y1v, y2v, y3v; double chi0, chi1; v2df_t a40v, a41v; v2df_t y4v; // If either dimension is zero, or if alpha is zero, return early. if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return; // If b_n is not equal to the fusing factor, then perform the entire // operation as a loop over axpyv. if ( b_n != fuse_fac ) { daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { double* a1 = a + (0 )*inca + (i )*lda; double* chi1 = x + (i )*incx; double* y1 = y + (0 )*incy; double alpha_chi1; bli_dcopycjs( conjx, *chi1, alpha_chi1 ); bli_dscals( *alpha, alpha_chi1 ); f ( conja, m, &alpha_chi1, a1, inca, y1, incy, cntx ); } return; } // At this point, we know that b_n is exactly equal to the fusing factor. a0 = a + 0*lda; a1 = a + 1*lda; y0 = y; chi0 = *( x + 0*incx ); chi1 = *( x + 1*incx ); // Scale each chi scalar by alpha. bli_dscals( *alpha, chi0 ); bli_dscals( *alpha, chi1 ); // Broadcast the (alpha*chi?) scalars to all elements of vector registers. chi0v.v = _mm256_broadcast_sd( &chi0 ); chi1v.v = _mm256_broadcast_sd( &chi1 ); // If there are vectorized iterations, perform them with vector // instructions. if ( inca == 1 && incy == 1 ) { for ( i = 0; (i + 15) < m; i += 16 ) { // Load the input values. 
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg ); a30v.v = _mm256_loadu_pd( a0 + 3*n_elem_per_reg ); a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg ); a31v.v = _mm256_loadu_pd( a1 + 3*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v ); y3v.v = _mm256_fmadd_pd( a30v.v, chi0v.v, y3v.v ); y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v ); y3v.v = _mm256_fmadd_pd( a31v.v, chi1v.v, y3v.v ); // Store the output. _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v ); _mm256_storeu_pd( (double *)(y0 + 3*n_elem_per_reg), y3v.v ); y0 += n_iter_unroll * n_elem_per_reg; a0 += n_iter_unroll * n_elem_per_reg; a1 += n_iter_unroll * n_elem_per_reg; } for ( ; (i + 11) < m; i += 12 ) { // Load the input values. 
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg ); a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v ); y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v ); // Store the output. _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v ); y0 += 3 * n_elem_per_reg; a0 += 3 * n_elem_per_reg; a1 += 3 * n_elem_per_reg; } for ( ; (i + 7) < m; i += 8 ) { // Load the input values. y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); // Store the output. _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); y0 += 2 * n_elem_per_reg; a0 += 2 * n_elem_per_reg; a1 += 2 * n_elem_per_reg; } for ( ; (i + 3) < m; i += 4 ) { // Load the input values. 
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); // Store the output. _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); y0 += n_elem_per_reg; a0 += n_elem_per_reg; a1 += n_elem_per_reg; } for ( ; (i + 1) < m; i += 2 ) { // Load the input values. y4v.v = _mm_loadu_pd( y0 + 0*n_elem_per_reg ); a40v.v = _mm_loadu_pd( a0 + 0*n_elem_per_reg ); a41v.v = _mm_loadu_pd( a1 + 0*n_elem_per_reg ); // perform : y += alpha * x; y4v.v = _mm_fmadd_pd( a40v.v, chi0v.xmm[0], y4v.v ); y4v.v = _mm_fmadd_pd( a41v.v, chi1v.xmm[0], y4v.v ); // Store the output. _mm_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y4v.v ); y0 += 2; a0 += 2; a1 += 2; } // If there are leftover iterations, perform them with scalar code. for ( ; (i + 0) < m ; ++i ) { double y0c = *y0; const double a0c = *a0; const double a1c = *a1; y0c += chi0 * a0c; y0c += chi1 * a1c; *y0 = y0c; a0 += 1; a1 += 1; y0 += 1; } } else { for ( i = 0; (i + 0) < m ; ++i ) { double y0c = *y0; const double a0c = *a0; const double a1c = *a1; y0c += chi0 * a0c; y0c += chi1 * a1c; *y0 = y0c; a0 += inca; a1 += inca; y0 += incy; } } } // ----------------------------------------------------------------------------- void bli_daxpyf_zen_int_16x4 ( conj_t conja, conj_t conjx, dim_t m, dim_t b_n, double* restrict alpha, double* restrict a, inc_t inca, inc_t lda, double* restrict x, inc_t incx, double* restrict y, inc_t incy, cntx_t* restrict cntx ) { const dim_t fuse_fac = 4; const dim_t n_elem_per_reg = 4; const dim_t n_iter_unroll = 4; dim_t i; double* restrict a0; double* restrict a1; double* restrict a2; double* restrict a3; double* restrict y0; v4df_t chi0v, chi1v, chi2v, chi3v; v4df_t a00v, a01v, a02v, a03v; v4df_t a10v, a11v, a12v, a13v; v4df_t a20v, a21v, a22v, a23v; v4df_t a30v, a31v, a32v, 
a33v; v4df_t y0v, y1v, y2v, y3v; double chi0, chi1, chi2, chi3; v2df_t y4v; v2df_t a40v, a41v, a42v, a43v; // If either dimension is zero, or if alpha is zero, return early. if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return; // If b_n is not equal to the fusing factor, then perform the entire // operation as a loop over axpyv. if ( b_n != fuse_fac ) { if(cntx == NULL) cntx = bli_gks_query_cntx(); daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { double* a1 = a + (0 )*inca + (i )*lda; double* chi1 = x + (i )*incx; double* y1 = y + (0 )*incy; double alpha_chi1; bli_dcopycjs( conjx, *chi1, alpha_chi1 ); bli_dscals( *alpha, alpha_chi1 ); f ( conja, m, &alpha_chi1, a1, inca, y1, incy, cntx ); } return; } // At this point, we know that b_n is exactly equal to the fusing factor. a0 = a + 0*lda; a1 = a + 1*lda; a2 = a + 2*lda; a3 = a + 3*lda; y0 = y; chi0 = *( x + 0*incx ); chi1 = *( x + 1*incx ); chi2 = *( x + 2*incx ); chi3 = *( x + 3*incx ); // Scale each chi scalar by alpha. bli_dscals( *alpha, chi0 ); bli_dscals( *alpha, chi1 ); bli_dscals( *alpha, chi2 ); bli_dscals( *alpha, chi3 ); // Broadcast the (alpha*chi?) scalars to all elements of vector registers. chi0v.v = _mm256_broadcast_sd( &chi0 ); chi1v.v = _mm256_broadcast_sd( &chi1 ); chi2v.v = _mm256_broadcast_sd( &chi2 ); chi3v.v = _mm256_broadcast_sd( &chi3 ); // If there are vectorized iterations, perform them with vector // instructions. if ( inca == 1 && incy == 1 ) { for ( i = 0; (i + 15) < m; i += 16 ) { // Load the input values. 
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg ); a30v.v = _mm256_loadu_pd( a0 + 3*n_elem_per_reg ); a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg ); a31v.v = _mm256_loadu_pd( a1 + 3*n_elem_per_reg ); a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); a22v.v = _mm256_loadu_pd( a2 + 2*n_elem_per_reg ); a32v.v = _mm256_loadu_pd( a2 + 3*n_elem_per_reg ); a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); a23v.v = _mm256_loadu_pd( a3 + 2*n_elem_per_reg ); a33v.v = _mm256_loadu_pd( a3 + 3*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v ); y3v.v = _mm256_fmadd_pd( a30v.v, chi0v.v, y3v.v ); y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v ); y3v.v = _mm256_fmadd_pd( a31v.v, chi1v.v, y3v.v ); y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); y2v.v = _mm256_fmadd_pd( a22v.v, chi2v.v, y2v.v ); y3v.v = _mm256_fmadd_pd( a32v.v, chi2v.v, y3v.v ); y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); y2v.v = _mm256_fmadd_pd( a23v.v, chi3v.v, y2v.v ); y3v.v = _mm256_fmadd_pd( a33v.v, chi3v.v, y3v.v ); // Store the output. 
_mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v ); _mm256_storeu_pd( (double *)(y0 + 3*n_elem_per_reg), y3v.v ); y0 += n_iter_unroll * n_elem_per_reg; a0 += n_iter_unroll * n_elem_per_reg; a1 += n_iter_unroll * n_elem_per_reg; a2 += n_iter_unroll * n_elem_per_reg; a3 += n_iter_unroll * n_elem_per_reg; } for ( ; (i + 11) < m; i += 12 ) { // Load the input values. y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg ); a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg ); a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); a22v.v = _mm256_loadu_pd( a2 + 2*n_elem_per_reg ); a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); a23v.v = _mm256_loadu_pd( a3 + 2*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v ); y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v ); y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); y2v.v = _mm256_fmadd_pd( a22v.v, chi2v.v, y2v.v ); y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); y2v.v = _mm256_fmadd_pd( a23v.v, chi3v.v, y2v.v ); // Store the output. 
_mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v ); y0 += 3 * n_elem_per_reg; a0 += 3 * n_elem_per_reg; a1 += 3 * n_elem_per_reg; a2 += 3 * n_elem_per_reg; a3 += 3 * n_elem_per_reg; } for ( ; (i + 7) < m; i += 8 ) { // Load the input values. y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); // Store the output. _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); y0 += 2 * n_elem_per_reg; a0 += 2 * n_elem_per_reg; a1 += 2 * n_elem_per_reg; a2 += 2 * n_elem_per_reg; a3 += 2 * n_elem_per_reg; } for ( ; (i + 3) < m; i += 4) { // Load the input values. 
y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); // Store the output. _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); y0 += n_elem_per_reg; a0 += n_elem_per_reg; a1 += n_elem_per_reg; a2 += n_elem_per_reg; a3 += n_elem_per_reg; } #if 1 for ( ; (i + 1) < m; i += 2) { // Load the input values. y4v.v = _mm_loadu_pd( y0 + 0*n_elem_per_reg ); a40v.v = _mm_loadu_pd( a0 + 0*n_elem_per_reg ); a41v.v = _mm_loadu_pd( a1 + 0*n_elem_per_reg ); a42v.v = _mm_loadu_pd( a2 + 0*n_elem_per_reg ); a43v.v = _mm_loadu_pd( a3 + 0*n_elem_per_reg ); // perform : y += alpha * x; y4v.v = _mm_fmadd_pd( a40v.v, chi0v.xmm[0], y4v.v ); y4v.v = _mm_fmadd_pd( a41v.v, chi1v.xmm[0], y4v.v ); y4v.v = _mm_fmadd_pd( a42v.v, chi2v.xmm[0], y4v.v ); y4v.v = _mm_fmadd_pd( a43v.v, chi3v.xmm[0], y4v.v ); // Store the output. _mm_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y4v.v ); y0 += 2; a0 += 2; a1 += 2; a2 += 2; a3 += 2; } #endif // If there are leftover iterations, perform them with scalar code. 
for ( ; (i + 0) < m ; ++i ) { double y0c = *y0; const double a0c = *a0; const double a1c = *a1; const double a2c = *a2; const double a3c = *a3; y0c += chi0 * a0c; y0c += chi1 * a1c; y0c += chi2 * a2c; y0c += chi3 * a3c; *y0 = y0c; a0 += 1; a1 += 1; a2 += 1; a3 += 1; y0 += 1; } } else { for ( i = 0; (i + 0) < m ; ++i ) { double y0c = *y0; const double a0c = *a0; const double a1c = *a1; const double a2c = *a2; const double a3c = *a3; y0c += chi0 * a0c; y0c += chi1 * a1c; y0c += chi2 * a2c; y0c += chi3 * a3c; *y0 = y0c; a0 += inca; a1 += inca; a2 += inca; a3 += inca; y0 += incy; } } } cython-blis-0.9.1/blis/_src/kernels/zen/1f/bli_axpyf_zen_int_8.c000066400000000000000000000315221427272030600245040ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2018, The University of Texas at Austin Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "immintrin.h" #include "blis.h" /* Union data structure to access AVX registers One 256-bit AVX register holds 8 SP elements. */ typedef union { __m256 v; float f[8] __attribute__((aligned(64))); } v8sf_t; /* Union data structure to access AVX registers * One 256-bit AVX register holds 4 DP elements. */ typedef union { __m256d v; double d[4] __attribute__((aligned(64))); } v4df_t; // ----------------------------------------------------------------------------- void bli_saxpyf_zen_int_8 ( conj_t conja, conj_t conjx, dim_t m, dim_t b_n, float* restrict alpha, float* restrict a, inc_t inca, inc_t lda, float* restrict x, inc_t incx, float* restrict y, inc_t incy, cntx_t* restrict cntx ) { const dim_t fuse_fac = 8; const dim_t n_elem_per_reg = 8; const dim_t n_iter_unroll = 1; dim_t i; dim_t m_viter; dim_t m_left; float* restrict a0; float* restrict a1; float* restrict a2; float* restrict a3; float* restrict a4; float* restrict a5; float* restrict a6; float* restrict a7; float* restrict y0; v8sf_t chi0v, chi1v, chi2v, chi3v; v8sf_t chi4v, chi5v, chi6v, chi7v; v8sf_t a0v, a1v, a2v, a3v; v8sf_t a4v, a5v, a6v, a7v; v8sf_t y0v; float chi0, chi1, chi2, chi3; float chi4, chi5, chi6, chi7; // If either dimension is zero, or if alpha is zero, return early. if ( bli_zero_dim2( m, b_n ) || PASTEMAC(s,eq0)( *alpha ) ) return; // If b_n is not equal to the fusing factor, then perform the entire // operation as a loop over axpyv. 
if ( b_n != fuse_fac ) { saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { float* a1 = a + (0 )*inca + (i )*lda; float* chi1 = x + (i )*incx; float* y1 = y + (0 )*incy; float alpha_chi1; PASTEMAC(s,copycjs)( conjx, *chi1, alpha_chi1 ); PASTEMAC(s,scals)( *alpha, alpha_chi1 ); f ( conja, m, &alpha_chi1, a1, inca, y1, incy, cntx ); } return; } // At this point, we know that b_n is exactly equal to the fusing factor. // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll ); m_left = ( m ) % ( n_elem_per_reg * n_iter_unroll ); // If there is anything that would interfere with our use of contiguous // vector loads/stores, override m_viter and m_left to use scalar code // for all iterations. if ( inca != 1 || incy != 1 ) { m_viter = 0; m_left = m; } a0 = a + 0*lda; a1 = a + 1*lda; a2 = a + 2*lda; a3 = a + 3*lda; a4 = a + 4*lda; a5 = a + 5*lda; a6 = a + 6*lda; a7 = a + 7*lda; y0 = y; chi0 = *( x + 0*incx ); chi1 = *( x + 1*incx ); chi2 = *( x + 2*incx ); chi3 = *( x + 3*incx ); chi4 = *( x + 4*incx ); chi5 = *( x + 5*incx ); chi6 = *( x + 6*incx ); chi7 = *( x + 7*incx ); // Scale each chi scalar by alpha. PASTEMAC(s,scals)( *alpha, chi0 ); PASTEMAC(s,scals)( *alpha, chi1 ); PASTEMAC(s,scals)( *alpha, chi2 ); PASTEMAC(s,scals)( *alpha, chi3 ); PASTEMAC(s,scals)( *alpha, chi4 ); PASTEMAC(s,scals)( *alpha, chi5 ); PASTEMAC(s,scals)( *alpha, chi6 ); PASTEMAC(s,scals)( *alpha, chi7 ); // Broadcast the (alpha*chi?) scalars to all elements of vector registers. 
chi0v.v = _mm256_broadcast_ss( &chi0 ); chi1v.v = _mm256_broadcast_ss( &chi1 ); chi2v.v = _mm256_broadcast_ss( &chi2 ); chi3v.v = _mm256_broadcast_ss( &chi3 ); chi4v.v = _mm256_broadcast_ss( &chi4 ); chi5v.v = _mm256_broadcast_ss( &chi5 ); chi6v.v = _mm256_broadcast_ss( &chi6 ); chi7v.v = _mm256_broadcast_ss( &chi7 ); // If there are vectorized iterations, perform them with vector // instructions. for ( i = 0; i < m_viter; ++i ) { // Load the input values. y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); a0v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg ); a1v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg ); a2v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg ); a3v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg ); a4v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg ); a5v.v = _mm256_loadu_ps( a5 + 0*n_elem_per_reg ); a6v.v = _mm256_loadu_ps( a6 + 0*n_elem_per_reg ); a7v.v = _mm256_loadu_ps( a7 + 0*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_ps( a0v.v, chi0v.v, y0v.v ); y0v.v = _mm256_fmadd_ps( a1v.v, chi1v.v, y0v.v ); y0v.v = _mm256_fmadd_ps( a2v.v, chi2v.v, y0v.v ); y0v.v = _mm256_fmadd_ps( a3v.v, chi3v.v, y0v.v ); y0v.v = _mm256_fmadd_ps( a4v.v, chi4v.v, y0v.v ); y0v.v = _mm256_fmadd_ps( a5v.v, chi5v.v, y0v.v ); y0v.v = _mm256_fmadd_ps( a6v.v, chi6v.v, y0v.v ); y0v.v = _mm256_fmadd_ps( a7v.v, chi7v.v, y0v.v ); // Store the output. _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v ); y0 += n_elem_per_reg; a0 += n_elem_per_reg; a1 += n_elem_per_reg; a2 += n_elem_per_reg; a3 += n_elem_per_reg; a4 += n_elem_per_reg; a5 += n_elem_per_reg; a6 += n_elem_per_reg; a7 += n_elem_per_reg; } // If there are leftover iterations, perform them with scalar code. 
for ( i = 0; i < m_left ; ++i ) { float y0c = *y0; const float a0c = *a0; const float a1c = *a1; const float a2c = *a2; const float a3c = *a3; const float a4c = *a4; const float a5c = *a5; const float a6c = *a6; const float a7c = *a7; y0c += chi0 * a0c; y0c += chi1 * a1c; y0c += chi2 * a2c; y0c += chi3 * a3c; y0c += chi4 * a4c; y0c += chi5 * a5c; y0c += chi6 * a6c; y0c += chi7 * a7c; *y0 = y0c; a0 += inca; a1 += inca; a2 += inca; a3 += inca; a4 += inca; a5 += inca; a6 += inca; a7 += inca; y0 += incy; } } // ----------------------------------------------------------------------------- void bli_daxpyf_zen_int_8 ( conj_t conja, conj_t conjx, dim_t m, dim_t b_n, double* restrict alpha, double* restrict a, inc_t inca, inc_t lda, double* restrict x, inc_t incx, double* restrict y, inc_t incy, cntx_t* restrict cntx ) { const dim_t fuse_fac = 8; const dim_t n_elem_per_reg = 4; const dim_t n_iter_unroll = 1; dim_t i; dim_t m_viter; dim_t m_left; double* restrict a0; double* restrict a1; double* restrict a2; double* restrict a3; double* restrict a4; double* restrict a5; double* restrict a6; double* restrict a7; double* restrict y0; v4df_t chi0v, chi1v, chi2v, chi3v; v4df_t chi4v, chi5v, chi6v, chi7v; v4df_t a0v, a1v, a2v, a3v; v4df_t a4v, a5v, a6v, a7v; v4df_t y0v; double chi0, chi1, chi2, chi3; double chi4, chi5, chi6, chi7; // If either dimension is zero, or if alpha is zero, return early. if ( bli_zero_dim2( m, b_n ) || PASTEMAC(d,eq0)( *alpha ) ) return; // If b_n is not equal to the fusing factor, then perform the entire // operation as a loop over axpyv. 
if ( b_n != fuse_fac ) { daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { double* a1 = a + (0 )*inca + (i )*lda; double* chi1 = x + (i )*incx; double* y1 = y + (0 )*incy; double alpha_chi1; PASTEMAC(d,copycjs)( conjx, *chi1, alpha_chi1 ); PASTEMAC(d,scals)( *alpha, alpha_chi1 ); f ( conja, m, &alpha_chi1, a1, inca, y1, incy, cntx ); } return; } // At this point, we know that b_n is exactly equal to the fusing factor. // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll ); m_left = ( m ) % ( n_elem_per_reg * n_iter_unroll ); // If there is anything that would interfere with our use of contiguous // vector loads/stores, override m_viter and m_left to use scalar code // for all iterations. if ( inca != 1 || incy != 1 ) { m_viter = 0; m_left = m; } a0 = a + 0*lda; a1 = a + 1*lda; a2 = a + 2*lda; a3 = a + 3*lda; a4 = a + 4*lda; a5 = a + 5*lda; a6 = a + 6*lda; a7 = a + 7*lda; y0 = y; chi0 = *( x + 0*incx ); chi1 = *( x + 1*incx ); chi2 = *( x + 2*incx ); chi3 = *( x + 3*incx ); chi4 = *( x + 4*incx ); chi5 = *( x + 5*incx ); chi6 = *( x + 6*incx ); chi7 = *( x + 7*incx ); // Scale each chi scalar by alpha. PASTEMAC(d,scals)( *alpha, chi0 ); PASTEMAC(d,scals)( *alpha, chi1 ); PASTEMAC(d,scals)( *alpha, chi2 ); PASTEMAC(d,scals)( *alpha, chi3 ); PASTEMAC(d,scals)( *alpha, chi4 ); PASTEMAC(d,scals)( *alpha, chi5 ); PASTEMAC(d,scals)( *alpha, chi6 ); PASTEMAC(d,scals)( *alpha, chi7 ); // Broadcast the (alpha*chi?) scalars to all elements of vector registers. 
chi0v.v = _mm256_broadcast_sd( &chi0 ); chi1v.v = _mm256_broadcast_sd( &chi1 ); chi2v.v = _mm256_broadcast_sd( &chi2 ); chi3v.v = _mm256_broadcast_sd( &chi3 ); chi4v.v = _mm256_broadcast_sd( &chi4 ); chi5v.v = _mm256_broadcast_sd( &chi5 ); chi6v.v = _mm256_broadcast_sd( &chi6 ); chi7v.v = _mm256_broadcast_sd( &chi7 ); // If there are vectorized iterations, perform them with vector // instructions. for ( i = 0; i < m_viter; ++i ) { // Load the input values. y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); a0v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); a1v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); a2v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); a3v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); a4v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); a5v.v = _mm256_loadu_pd( a5 + 0*n_elem_per_reg ); a6v.v = _mm256_loadu_pd( a6 + 0*n_elem_per_reg ); a7v.v = _mm256_loadu_pd( a7 + 0*n_elem_per_reg ); // perform : y += alpha * x; y0v.v = _mm256_fmadd_pd( a0v.v, chi0v.v, y0v.v ); y0v.v = _mm256_fmadd_pd( a1v.v, chi1v.v, y0v.v ); y0v.v = _mm256_fmadd_pd( a2v.v, chi2v.v, y0v.v ); y0v.v = _mm256_fmadd_pd( a3v.v, chi3v.v, y0v.v ); y0v.v = _mm256_fmadd_pd( a4v.v, chi4v.v, y0v.v ); y0v.v = _mm256_fmadd_pd( a5v.v, chi5v.v, y0v.v ); y0v.v = _mm256_fmadd_pd( a6v.v, chi6v.v, y0v.v ); y0v.v = _mm256_fmadd_pd( a7v.v, chi7v.v, y0v.v ); // Store the output. _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v ); y0 += n_elem_per_reg; a0 += n_elem_per_reg; a1 += n_elem_per_reg; a2 += n_elem_per_reg; a3 += n_elem_per_reg; a4 += n_elem_per_reg; a5 += n_elem_per_reg; a6 += n_elem_per_reg; a7 += n_elem_per_reg; } // If there are leftover iterations, perform them with scalar code. 
for ( i = 0; i < m_left ; ++i ) { double y0c = *y0; const double a0c = *a0; const double a1c = *a1; const double a2c = *a2; const double a3c = *a3; const double a4c = *a4; const double a5c = *a5; const double a6c = *a6; const double a7c = *a7; y0c += chi0 * a0c; y0c += chi1 * a1c; y0c += chi2 * a2c; y0c += chi3 * a3c; y0c += chi4 * a4c; y0c += chi5 * a5c; y0c += chi6 * a6c; y0c += chi7 * a7c; *y0 = y0c; a0 += inca; a1 += inca; a2 += inca; a3 += inca; a4 += inca; a5 += inca; a6 += inca; a7 += inca; y0 += incy; } } cython-blis-0.9.1/blis/_src/kernels/zen/1f/bli_dotxf_zen_int_8.c000066400000000000000000000644211427272030600245050ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2018, The University of Texas at Austin Copyright (C) 2016 - 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "immintrin.h" #include "blis.h" /* Union data structure to access AVX registers One 256-bit AVX register holds 8 SP elements. */ typedef union { __m256 v; float f[8] __attribute__((aligned(64))); } v8sf_t; /* Union data structure to access AVX registers * One 256-bit AVX register holds 4 DP elements. */ typedef union { __m256d v; double d[4] __attribute__((aligned(64))); } v4df_t; // ----------------------------------------------------------------------------- void bli_sdotxf_zen_int_8 ( conj_t conjat, conj_t conjx, dim_t m, dim_t b_n, float* restrict alpha, float* restrict a, inc_t inca, inc_t lda, float* restrict x, inc_t incx, float* restrict beta, float* restrict y, inc_t incy, cntx_t* restrict cntx ) { const dim_t fuse_fac = 8; const dim_t n_elem_per_reg = 8; // If the b_n dimension is zero, y is empty and there is no computation. if ( bli_zero_dim1( b_n ) ) return; // If the m dimension is zero, or if alpha is zero, the computation // simplifies to updating y. if ( bli_zero_dim1( m ) || PASTEMAC(s,eq0)( *alpha ) ) { sscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SCALV_KER, cntx ); f ( BLIS_NO_CONJUGATE, b_n, beta, y, incy, cntx ); return; } // If b_n is not equal to the fusing factor, then perform the entire // operation as a loop over dotxv. 
if ( b_n != fuse_fac ) { sdotxv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_DOTXV_KER, cntx ); for ( dim_t i = 0; i < b_n; ++i ) { float* a1 = a + (0 )*inca + (i )*lda; float* x1 = x + (0 )*incx; float* psi1 = y + (i )*incy; f ( conjat, conjx, m, alpha, a1, inca, x1, incx, beta, psi1, cntx ); } return; } // At this point, we know that b_n is exactly equal to the fusing factor. // However, m may not be a multiple of the number of elements per vector. // Going forward, we handle two possible storage formats of A explicitly: // (1) A is stored by columns, or (2) A is stored by rows. Either case is // further split into two subproblems along the m dimension: // (a) a vectorized part, starting at m = 0 and ending at any 0 <= m' <= m. // (b) a scalar part, starting at m' and ending at m. If no vectorization // is possible then m' == 0 and thus the scalar part is the entire // problem. If 0 < m', then the a and x pointers and m variable will // be adjusted accordingly for the second subproblem. // Note: since parts (b) for both (1) and (2) are so similar, they are // factored out into one code block after the following conditional, which // distinguishes between (1) and (2). // Intermediate variables to hold the completed dot products float rho0 = 0, rho1 = 0, rho2 = 0, rho3 = 0, rho4 = 0, rho5 = 0, rho6 = 0, rho7 = 0; if ( inca == 1 && incx == 1 ) { const dim_t n_iter_unroll = 1; // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. dim_t m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll ); // Set up pointers for x and the b_n columns of A (rows of A^T). float* restrict x0 = x; float* restrict a0 = a + 0*lda; float* restrict a1 = a + 1*lda; float* restrict a2 = a + 2*lda; float* restrict a3 = a + 3*lda; float* restrict a4 = a + 4*lda; float* restrict a5 = a + 5*lda; float* restrict a6 = a + 6*lda; float* restrict a7 = a + 7*lda; // Initialize b_n rho vector accumulators to zero. 
v8sf_t rho0v; rho0v.v = _mm256_setzero_ps(); v8sf_t rho1v; rho1v.v = _mm256_setzero_ps(); v8sf_t rho2v; rho2v.v = _mm256_setzero_ps(); v8sf_t rho3v; rho3v.v = _mm256_setzero_ps(); v8sf_t rho4v; rho4v.v = _mm256_setzero_ps(); v8sf_t rho5v; rho5v.v = _mm256_setzero_ps(); v8sf_t rho6v; rho6v.v = _mm256_setzero_ps(); v8sf_t rho7v; rho7v.v = _mm256_setzero_ps(); v8sf_t x0v; v8sf_t a0v, a1v, a2v, a3v, a4v, a5v, a6v, a7v; // If there are vectorized iterations, perform them with vector // instructions. for ( dim_t i = 0; i < m_viter; ++i ) { // Load the input values. x0v.v = _mm256_loadu_ps( x0 + 0*n_elem_per_reg ); a0v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg ); a1v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg ); a2v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg ); a3v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg ); a4v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg ); a5v.v = _mm256_loadu_ps( a5 + 0*n_elem_per_reg ); a6v.v = _mm256_loadu_ps( a6 + 0*n_elem_per_reg ); a7v.v = _mm256_loadu_ps( a7 + 0*n_elem_per_reg ); // perform: rho?v += a?v * x0v; rho0v.v = _mm256_fmadd_ps( a0v.v, x0v.v, rho0v.v ); rho1v.v = _mm256_fmadd_ps( a1v.v, x0v.v, rho1v.v ); rho2v.v = _mm256_fmadd_ps( a2v.v, x0v.v, rho2v.v ); rho3v.v = _mm256_fmadd_ps( a3v.v, x0v.v, rho3v.v ); rho4v.v = _mm256_fmadd_ps( a4v.v, x0v.v, rho4v.v ); rho5v.v = _mm256_fmadd_ps( a5v.v, x0v.v, rho5v.v ); rho6v.v = _mm256_fmadd_ps( a6v.v, x0v.v, rho6v.v ); rho7v.v = _mm256_fmadd_ps( a7v.v, x0v.v, rho7v.v ); x0 += n_elem_per_reg * n_iter_unroll; a0 += n_elem_per_reg * n_iter_unroll; a1 += n_elem_per_reg * n_iter_unroll; a2 += n_elem_per_reg * n_iter_unroll; a3 += n_elem_per_reg * n_iter_unroll; a4 += n_elem_per_reg * n_iter_unroll; a5 += n_elem_per_reg * n_iter_unroll; a6 += n_elem_per_reg * n_iter_unroll; a7 += n_elem_per_reg * n_iter_unroll; } #if 0 rho0 += rho0v.f[0] + rho0v.f[1] + rho0v.f[2] + rho0v.f[3] + rho0v.f[4] + rho0v.f[5] + rho0v.f[6] + rho0v.f[7]; rho1 += rho1v.f[0] + rho1v.f[1] + rho1v.f[2] + rho1v.f[3] + rho1v.f[4] + 
rho1v.f[5] + rho1v.f[6] + rho1v.f[7]; rho2 += rho2v.f[0] + rho2v.f[1] + rho2v.f[2] + rho2v.f[3] + rho2v.f[4] + rho2v.f[5] + rho2v.f[6] + rho2v.f[7]; rho3 += rho3v.f[0] + rho3v.f[1] + rho3v.f[2] + rho3v.f[3] + rho3v.f[4] + rho3v.f[5] + rho3v.f[6] + rho3v.f[7]; rho4 += rho4v.f[0] + rho4v.f[1] + rho4v.f[2] + rho4v.f[3] + rho4v.f[4] + rho4v.f[5] + rho4v.f[6] + rho4v.f[7]; rho5 += rho5v.f[0] + rho5v.f[1] + rho5v.f[2] + rho5v.f[3] + rho5v.f[4] + rho5v.f[5] + rho5v.f[6] + rho5v.f[7]; rho6 += rho6v.f[0] + rho6v.f[1] + rho6v.f[2] + rho6v.f[3] + rho6v.f[4] + rho6v.f[5] + rho6v.f[6] + rho6v.f[7]; rho7 += rho7v.f[0] + rho7v.f[1] + rho7v.f[2] + rho7v.f[3] + rho7v.f[4] + rho7v.f[5] + rho7v.f[6] + rho7v.f[7]; #else // Now we need to sum the elements within each vector. v8sf_t onev; onev.v = _mm256_set1_ps( 1.0f ); // Sum the elements of a given rho?v by dotting it with 1. The '1' in // '0xf1' stores the sum of the upper four and lower four values to // the low elements of each lane: elements 4 and 0, respectively. (The // 'f' in '0xf1' means include all four elements of each lane in the // summation.) rho0v.v = _mm256_dp_ps( rho0v.v, onev.v, 0xf1 ); rho1v.v = _mm256_dp_ps( rho1v.v, onev.v, 0xf1 ); rho2v.v = _mm256_dp_ps( rho2v.v, onev.v, 0xf1 ); rho3v.v = _mm256_dp_ps( rho3v.v, onev.v, 0xf1 ); rho4v.v = _mm256_dp_ps( rho4v.v, onev.v, 0xf1 ); rho5v.v = _mm256_dp_ps( rho5v.v, onev.v, 0xf1 ); rho6v.v = _mm256_dp_ps( rho6v.v, onev.v, 0xf1 ); rho7v.v = _mm256_dp_ps( rho7v.v, onev.v, 0xf1 ); // Manually add the results from above to finish the sum. rho0 = rho0v.f[0] + rho0v.f[4]; rho1 = rho1v.f[0] + rho1v.f[4]; rho2 = rho2v.f[0] + rho2v.f[4]; rho3 = rho3v.f[0] + rho3v.f[4]; rho4 = rho4v.f[0] + rho4v.f[4]; rho5 = rho5v.f[0] + rho5v.f[4]; rho6 = rho6v.f[0] + rho6v.f[4]; rho7 = rho7v.f[0] + rho7v.f[4]; #endif // Adjust for scalar subproblem. 
m -= n_elem_per_reg * n_iter_unroll * m_viter; a += n_elem_per_reg * n_iter_unroll * m_viter /* * inca */; x += n_elem_per_reg * n_iter_unroll * m_viter /* * incx */; } else if ( lda == 1 ) { const dim_t n_iter_unroll = 4; // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. dim_t m_viter = ( m ) / ( n_iter_unroll ); // Initialize pointers for x and A. float* restrict x0 = x; float* restrict a0 = a; // Initialize rho vector accumulators to zero. v8sf_t rho0v; rho0v.v = _mm256_setzero_ps(); v8sf_t rho1v; rho1v.v = _mm256_setzero_ps(); v8sf_t rho2v; rho2v.v = _mm256_setzero_ps(); v8sf_t rho3v; rho3v.v = _mm256_setzero_ps(); v8sf_t x0v, x1v, x2v, x3v; v8sf_t a0v, a1v, a2v, a3v; for ( dim_t i = 0; i < m_viter; ++i ) { // Load the input values. a0v.v = _mm256_loadu_ps( a0 + 0*inca ); a1v.v = _mm256_loadu_ps( a0 + 1*inca ); a2v.v = _mm256_loadu_ps( a0 + 2*inca ); a3v.v = _mm256_loadu_ps( a0 + 3*inca ); x0v.v = _mm256_broadcast_ss( x0 + 0*incx ); x1v.v = _mm256_broadcast_ss( x0 + 1*incx ); x2v.v = _mm256_broadcast_ss( x0 + 2*incx ); x3v.v = _mm256_broadcast_ss( x0 + 3*incx ); // perform : rho?v += a?v * x?v; rho0v.v = _mm256_fmadd_ps( a0v.v, x0v.v, rho0v.v ); rho1v.v = _mm256_fmadd_ps( a1v.v, x1v.v, rho1v.v ); rho2v.v = _mm256_fmadd_ps( a2v.v, x2v.v, rho2v.v ); rho3v.v = _mm256_fmadd_ps( a3v.v, x3v.v, rho3v.v ); x0 += incx * n_iter_unroll; a0 += inca * n_iter_unroll; } // Combine the 8 accumulators into one vector register. rho0v.v = _mm256_add_ps( rho0v.v, rho1v.v ); rho2v.v = _mm256_add_ps( rho2v.v, rho3v.v ); rho0v.v = _mm256_add_ps( rho0v.v, rho2v.v ); // Write vector components to scalar values. rho0 = rho0v.f[0]; rho1 = rho0v.f[1]; rho2 = rho0v.f[2]; rho3 = rho0v.f[3]; rho4 = rho0v.f[4]; rho5 = rho0v.f[5]; rho6 = rho0v.f[6]; rho7 = rho0v.f[7]; // Adjust for scalar subproblem. 
m -= n_iter_unroll * m_viter; a += n_iter_unroll * m_viter * inca; x += n_iter_unroll * m_viter * incx; } else { // No vectorization possible; use scalar iterations for the entire // problem. } // Scalar edge case. { // Initialize pointers for x and the b_n columns of A (rows of A^T). float* restrict x0 = x; float* restrict a0 = a + 0*lda; float* restrict a1 = a + 1*lda; float* restrict a2 = a + 2*lda; float* restrict a3 = a + 3*lda; float* restrict a4 = a + 4*lda; float* restrict a5 = a + 5*lda; float* restrict a6 = a + 6*lda; float* restrict a7 = a + 7*lda; // If there are leftover iterations, perform them with scalar code. for ( dim_t i = 0; i < m ; ++i ) { const float x0c = *x0; const float a0c = *a0; const float a1c = *a1; const float a2c = *a2; const float a3c = *a3; const float a4c = *a4; const float a5c = *a5; const float a6c = *a6; const float a7c = *a7; rho0 += a0c * x0c; rho1 += a1c * x0c; rho2 += a2c * x0c; rho3 += a3c * x0c; rho4 += a4c * x0c; rho5 += a5c * x0c; rho6 += a6c * x0c; rho7 += a7c * x0c; x0 += incx; a0 += inca; a1 += inca; a2 += inca; a3 += inca; a4 += inca; a5 += inca; a6 += inca; a7 += inca; } } // Now prepare the final rho values to output/accumulate back into // the y vector. v8sf_t rho0v, y0v; // Insert the scalar rho values into a single vector. rho0v.f[0] = rho0; rho0v.f[1] = rho1; rho0v.f[2] = rho2; rho0v.f[3] = rho3; rho0v.f[4] = rho4; rho0v.f[5] = rho5; rho0v.f[6] = rho6; rho0v.f[7] = rho7; // Broadcast the alpha scalar. v8sf_t alphav; alphav.v = _mm256_broadcast_ss( alpha ); // We know at this point that alpha is nonzero; however, beta may still // be zero. If beta is indeed zero, we must overwrite y rather than scale // by beta (in case y contains NaN or Inf). if ( PASTEMAC(s,eq0)( *beta ) ) { // Apply alpha to the accumulated dot product in rho: // y := alpha * rho y0v.v = _mm256_mul_ps( alphav.v, rho0v.v ); } else { // Broadcast the beta scalar. v8sf_t betav; betav.v = _mm256_broadcast_ss( beta ); // Load y. 
if ( incy == 1 ) { y0v.v = _mm256_loadu_ps( y + 0*n_elem_per_reg ); } else { y0v.f[0] = *(y + 0*incy); y0v.f[1] = *(y + 1*incy); y0v.f[2] = *(y + 2*incy); y0v.f[3] = *(y + 3*incy); y0v.f[4] = *(y + 4*incy); y0v.f[5] = *(y + 5*incy); y0v.f[6] = *(y + 6*incy); y0v.f[7] = *(y + 7*incy); } // Apply beta to y and alpha to the accumulated dot product in rho: // y := beta * y + alpha * rho y0v.v = _mm256_mul_ps( betav.v, y0v.v ); y0v.v = _mm256_fmadd_ps( alphav.v, rho0v.v, y0v.v ); } // Store the output. if ( incy == 1 ) { _mm256_storeu_ps( (y + 0*n_elem_per_reg), y0v.v ); } else { *(y + 0*incy) = y0v.f[0]; *(y + 1*incy) = y0v.f[1]; *(y + 2*incy) = y0v.f[2]; *(y + 3*incy) = y0v.f[3]; *(y + 4*incy) = y0v.f[4]; *(y + 5*incy) = y0v.f[5]; *(y + 6*incy) = y0v.f[6]; *(y + 7*incy) = y0v.f[7]; } } // ----------------------------------------------------------------------------- void bli_ddotxf_zen_int_8 ( conj_t conjat, conj_t conjx, dim_t m, dim_t b_n, double* restrict alpha, double* restrict a, inc_t inca, inc_t lda, double* restrict x, inc_t incx, double* restrict beta, double* restrict y, inc_t incy, cntx_t* restrict cntx ) { const dim_t fuse_fac = 8; const dim_t n_elem_per_reg = 4; // If the b_n dimension is zero, y is empty and there is no computation. if ( bli_zero_dim1( b_n ) ) return; // If the m dimension is zero, or if alpha is zero, the computation // simplifies to updating y. if ( bli_zero_dim1( m ) || PASTEMAC(d,eq0)( *alpha ) ) { dscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); f ( BLIS_NO_CONJUGATE, b_n, beta, y, incy, cntx ); return; } // If b_n is not equal to the fusing factor, then perform the entire // operation as a loop over dotxv. 
if ( b_n != fuse_fac ) { ddotxv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_DOTXV_KER, cntx ); for ( dim_t i = 0; i < b_n; ++i ) { double* a1 = a + (0 )*inca + (i )*lda; double* x1 = x + (0 )*incx; double* psi1 = y + (i )*incy; f ( conjat, conjx, m, alpha, a1, inca, x1, incx, beta, psi1, cntx ); } return; } // At this point, we know that b_n is exactly equal to the fusing factor. // However, m may not be a multiple of the number of elements per vector. // Going forward, we handle two possible storage formats of A explicitly: // (1) A is stored by columns, or (2) A is stored by rows. Either case is // further split into two subproblems along the m dimension: // (a) a vectorized part, starting at m = 0 and ending at any 0 <= m' <= m. // (b) a scalar part, starting at m' and ending at m. If no vectorization // is possible then m' == 0 and thus the scalar part is the entire // problem. If 0 < m', then the a and x pointers and m variable will // be adjusted accordingly for the second subproblem. // Note: since parts (b) for both (1) and (2) are so similar, they are // factored out into one code block after the following conditional, which // distinguishes between (1) and (2). // Intermediate variables to hold the completed dot products double rho0 = 0, rho1 = 0, rho2 = 0, rho3 = 0, rho4 = 0, rho5 = 0, rho6 = 0, rho7 = 0; if ( inca == 1 && incx == 1 ) { const dim_t n_iter_unroll = 1; // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. dim_t m_viter = ( m ) / ( n_elem_per_reg * n_iter_unroll ); // Set up pointers for x and the b_n columns of A (rows of A^T). 
double* restrict x0 = x; double* restrict a0 = a + 0*lda; double* restrict a1 = a + 1*lda; double* restrict a2 = a + 2*lda; double* restrict a3 = a + 3*lda; double* restrict a4 = a + 4*lda; double* restrict a5 = a + 5*lda; double* restrict a6 = a + 6*lda; double* restrict a7 = a + 7*lda; // Initialize b_n rho vector accumulators to zero. v4df_t rho0v; rho0v.v = _mm256_setzero_pd(); v4df_t rho1v; rho1v.v = _mm256_setzero_pd(); v4df_t rho2v; rho2v.v = _mm256_setzero_pd(); v4df_t rho3v; rho3v.v = _mm256_setzero_pd(); v4df_t rho4v; rho4v.v = _mm256_setzero_pd(); v4df_t rho5v; rho5v.v = _mm256_setzero_pd(); v4df_t rho6v; rho6v.v = _mm256_setzero_pd(); v4df_t rho7v; rho7v.v = _mm256_setzero_pd(); v4df_t x0v; v4df_t a0v, a1v, a2v, a3v, a4v, a5v, a6v, a7v; // If there are vectorized iterations, perform them with vector // instructions. for ( dim_t i = 0; i < m_viter; ++i ) { // Load the input values. x0v.v = _mm256_loadu_pd( x0 + 0*n_elem_per_reg ); a0v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); a1v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); a2v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); a3v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); a4v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); a5v.v = _mm256_loadu_pd( a5 + 0*n_elem_per_reg ); a6v.v = _mm256_loadu_pd( a6 + 0*n_elem_per_reg ); a7v.v = _mm256_loadu_pd( a7 + 0*n_elem_per_reg ); // perform: rho?v += a?v * x0v; rho0v.v = _mm256_fmadd_pd( a0v.v, x0v.v, rho0v.v ); rho1v.v = _mm256_fmadd_pd( a1v.v, x0v.v, rho1v.v ); rho2v.v = _mm256_fmadd_pd( a2v.v, x0v.v, rho2v.v ); rho3v.v = _mm256_fmadd_pd( a3v.v, x0v.v, rho3v.v ); rho4v.v = _mm256_fmadd_pd( a4v.v, x0v.v, rho4v.v ); rho5v.v = _mm256_fmadd_pd( a5v.v, x0v.v, rho5v.v ); rho6v.v = _mm256_fmadd_pd( a6v.v, x0v.v, rho6v.v ); rho7v.v = _mm256_fmadd_pd( a7v.v, x0v.v, rho7v.v ); x0 += n_elem_per_reg * n_iter_unroll; a0 += n_elem_per_reg * n_iter_unroll; a1 += n_elem_per_reg * n_iter_unroll; a2 += n_elem_per_reg * n_iter_unroll; a3 += n_elem_per_reg * n_iter_unroll; a4 += 
n_elem_per_reg * n_iter_unroll; a5 += n_elem_per_reg * n_iter_unroll; a6 += n_elem_per_reg * n_iter_unroll; a7 += n_elem_per_reg * n_iter_unroll; } #if 0 rho0 += rho0v.d[0] + rho0v.d[1] + rho0v.d[2] + rho0v.d[3]; rho1 += rho1v.d[0] + rho1v.d[1] + rho1v.d[2] + rho1v.d[3]; rho2 += rho2v.d[0] + rho2v.d[1] + rho2v.d[2] + rho2v.d[3]; rho3 += rho3v.d[0] + rho3v.d[1] + rho3v.d[2] + rho3v.d[3]; rho4 += rho4v.d[0] + rho4v.d[1] + rho4v.d[2] + rho4v.d[3]; rho5 += rho5v.d[0] + rho5v.d[1] + rho5v.d[2] + rho5v.d[3]; rho6 += rho6v.d[0] + rho6v.d[1] + rho6v.d[2] + rho6v.d[3]; rho7 += rho7v.d[0] + rho7v.d[1] + rho7v.d[2] + rho7v.d[3]; #else // Sum the elements of a given rho?v. This computes the sum of // elements within lanes and stores the sum to both elements. rho0v.v = _mm256_hadd_pd( rho0v.v, rho0v.v ); rho1v.v = _mm256_hadd_pd( rho1v.v, rho1v.v ); rho2v.v = _mm256_hadd_pd( rho2v.v, rho2v.v ); rho3v.v = _mm256_hadd_pd( rho3v.v, rho3v.v ); rho4v.v = _mm256_hadd_pd( rho4v.v, rho4v.v ); rho5v.v = _mm256_hadd_pd( rho5v.v, rho5v.v ); rho6v.v = _mm256_hadd_pd( rho6v.v, rho6v.v ); rho7v.v = _mm256_hadd_pd( rho7v.v, rho7v.v ); // Manually add the results from above to finish the sum. rho0 = rho0v.d[0] + rho0v.d[2]; rho1 = rho1v.d[0] + rho1v.d[2]; rho2 = rho2v.d[0] + rho2v.d[2]; rho3 = rho3v.d[0] + rho3v.d[2]; rho4 = rho4v.d[0] + rho4v.d[2]; rho5 = rho5v.d[0] + rho5v.d[2]; rho6 = rho6v.d[0] + rho6v.d[2]; rho7 = rho7v.d[0] + rho7v.d[2]; #endif // Adjust for scalar subproblem. m -= n_elem_per_reg * n_iter_unroll * m_viter; a += n_elem_per_reg * n_iter_unroll * m_viter /* * inca */; x += n_elem_per_reg * n_iter_unroll * m_viter /* * incx */; } else if ( lda == 1 ) { const dim_t n_iter_unroll = 3; const dim_t n_reg_per_row = 2; // fuse_fac / n_elem_per_reg; // Use the unrolling factor and the number of elements per register // to compute the number of vectorized and leftover iterations. dim_t m_viter = ( m ) / ( n_reg_per_row * n_iter_unroll ); // Initialize pointers for x and A. 
double* restrict x0 = x; double* restrict a0 = a; // Initialize rho vector accumulators to zero. v4df_t rho0v; rho0v.v = _mm256_setzero_pd(); v4df_t rho1v; rho1v.v = _mm256_setzero_pd(); v4df_t rho2v; rho2v.v = _mm256_setzero_pd(); v4df_t rho3v; rho3v.v = _mm256_setzero_pd(); v4df_t rho4v; rho4v.v = _mm256_setzero_pd(); v4df_t rho5v; rho5v.v = _mm256_setzero_pd(); v4df_t x0v, x1v, x2v; v4df_t a0v, a1v, a2v, a3v, a4v, a5v; for ( dim_t i = 0; i < m_viter; ++i ) { // Load the input values. a0v.v = _mm256_loadu_pd( a0 + 0*inca + 0*n_elem_per_reg ); a1v.v = _mm256_loadu_pd( a0 + 0*inca + 1*n_elem_per_reg ); a2v.v = _mm256_loadu_pd( a0 + 1*inca + 0*n_elem_per_reg ); a3v.v = _mm256_loadu_pd( a0 + 1*inca + 1*n_elem_per_reg ); a4v.v = _mm256_loadu_pd( a0 + 2*inca + 0*n_elem_per_reg ); a5v.v = _mm256_loadu_pd( a0 + 2*inca + 1*n_elem_per_reg ); x0v.v = _mm256_broadcast_sd( x0 + 0*incx ); x1v.v = _mm256_broadcast_sd( x0 + 1*incx ); x2v.v = _mm256_broadcast_sd( x0 + 2*incx ); // perform : rho?v += a?v * x?v; rho0v.v = _mm256_fmadd_pd( a0v.v, x0v.v, rho0v.v ); rho1v.v = _mm256_fmadd_pd( a1v.v, x0v.v, rho1v.v ); rho2v.v = _mm256_fmadd_pd( a2v.v, x1v.v, rho2v.v ); rho3v.v = _mm256_fmadd_pd( a3v.v, x1v.v, rho3v.v ); rho4v.v = _mm256_fmadd_pd( a4v.v, x2v.v, rho4v.v ); rho5v.v = _mm256_fmadd_pd( a5v.v, x2v.v, rho5v.v ); x0 += incx * n_iter_unroll; a0 += inca * n_iter_unroll; } // Combine the 8 accumulators into one vector register. rho0v.v = _mm256_add_pd( rho0v.v, rho2v.v ); rho0v.v = _mm256_add_pd( rho0v.v, rho4v.v ); rho1v.v = _mm256_add_pd( rho1v.v, rho3v.v ); rho1v.v = _mm256_add_pd( rho1v.v, rho5v.v ); // Write vector components to scalar values. rho0 = rho0v.d[0]; rho1 = rho0v.d[1]; rho2 = rho0v.d[2]; rho3 = rho0v.d[3]; rho4 = rho1v.d[0]; rho5 = rho1v.d[1]; rho6 = rho1v.d[2]; rho7 = rho1v.d[3]; // Adjust for scalar subproblem. 
m -= n_iter_unroll * m_viter; a += n_iter_unroll * m_viter * inca; x += n_iter_unroll * m_viter * incx; } else { // No vectorization possible; use scalar iterations for the entire // problem. } // Scalar edge case. { // Initialize pointers for x and the b_n columns of A (rows of A^T). double* restrict x0 = x; double* restrict a0 = a + 0*lda; double* restrict a1 = a + 1*lda; double* restrict a2 = a + 2*lda; double* restrict a3 = a + 3*lda; double* restrict a4 = a + 4*lda; double* restrict a5 = a + 5*lda; double* restrict a6 = a + 6*lda; double* restrict a7 = a + 7*lda; // If there are leftover iterations, perform them with scalar code. for ( dim_t i = 0; i < m ; ++i ) { const double x0c = *x0; const double a0c = *a0; const double a1c = *a1; const double a2c = *a2; const double a3c = *a3; const double a4c = *a4; const double a5c = *a5; const double a6c = *a6; const double a7c = *a7; rho0 += a0c * x0c; rho1 += a1c * x0c; rho2 += a2c * x0c; rho3 += a3c * x0c; rho4 += a4c * x0c; rho5 += a5c * x0c; rho6 += a6c * x0c; rho7 += a7c * x0c; x0 += incx; a0 += inca; a1 += inca; a2 += inca; a3 += inca; a4 += inca; a5 += inca; a6 += inca; a7 += inca; } } // Now prepare the final rho values to output/accumulate back into // the y vector. v4df_t rho0v, rho1v, y0v, y1v; // Insert the scalar rho values into a single vector. rho0v.d[0] = rho0; rho0v.d[1] = rho1; rho0v.d[2] = rho2; rho0v.d[3] = rho3; rho1v.d[0] = rho4; rho1v.d[1] = rho5; rho1v.d[2] = rho6; rho1v.d[3] = rho7; // Broadcast the alpha scalar. v4df_t alphav; alphav.v = _mm256_broadcast_sd( alpha ); // We know at this point that alpha is nonzero; however, beta may still // be zero. If beta is indeed zero, we must overwrite y rather than scale // by beta (in case y contains NaN or Inf). if ( PASTEMAC(d,eq0)( *beta ) ) { // Apply alpha to the accumulated dot product in rho: // y := alpha * rho y0v.v = _mm256_mul_pd( alphav.v, rho0v.v ); y1v.v = _mm256_mul_pd( alphav.v, rho1v.v ); } else { // Broadcast the beta scalar. 
v4df_t betav; betav.v = _mm256_broadcast_sd( beta ); // Load y. if ( incy == 1 ) { y0v.v = _mm256_loadu_pd( y + 0*n_elem_per_reg ); y1v.v = _mm256_loadu_pd( y + 1*n_elem_per_reg ); } else { y0v.d[0] = *(y + 0*incy); y0v.d[1] = *(y + 1*incy); y0v.d[2] = *(y + 2*incy); y0v.d[3] = *(y + 3*incy); y1v.d[0] = *(y + 4*incy); y1v.d[1] = *(y + 5*incy); y1v.d[2] = *(y + 6*incy); y1v.d[3] = *(y + 7*incy); } // Apply beta to y and alpha to the accumulated dot product in rho: // y := beta * y + alpha * rho y0v.v = _mm256_mul_pd( betav.v, y0v.v ); y1v.v = _mm256_mul_pd( betav.v, y1v.v ); y0v.v = _mm256_fmadd_pd( alphav.v, rho0v.v, y0v.v ); y1v.v = _mm256_fmadd_pd( alphav.v, rho1v.v, y1v.v ); } if ( incy == 1 ) { // Store the output. _mm256_storeu_pd( (y + 0*n_elem_per_reg), y0v.v ); _mm256_storeu_pd( (y + 1*n_elem_per_reg), y1v.v ); } else { *(y + 0*incy) = y0v.d[0]; *(y + 1*incy) = y0v.d[1]; *(y + 2*incy) = y0v.d[2]; *(y + 3*incy) = y0v.d[3]; *(y + 4*incy) = y1v.d[0]; *(y + 5*incy) = y1v.d[1]; *(y + 6*incy) = y1v.d[2]; *(y + 7*incy) = y1v.d[3]; } } cython-blis-0.9.1/blis/_src/kernels/zen/3/000077500000000000000000000000001427272030600202375ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/zen/3/bli_gemm_small.c000066400000000000000000005122111427272030600233500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2017 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "immintrin.h" #include "xmmintrin.h" #include "blis.h" #define AOCL_DTL_TRACE_ENTRY(x) ; #define AOCL_DTL_TRACE_EXIT(x) ; #define AOCL_DTL_TRACE_EXIT_ERR(x,y) ; #ifdef BLIS_ENABLE_SMALL_MATRIX #define MR 32 #define D_MR (MR >> 1) #define NR 3 #define D_BLIS_SMALL_MATRIX_K_THRES_ROME 256 #define BLIS_ENABLE_PREFETCH #define D_BLIS_SMALL_MATRIX_THRES (BLIS_SMALL_MATRIX_THRES / 2 ) #define D_BLIS_SMALL_M_RECT_MATRIX_THRES (BLIS_SMALL_M_RECT_MATRIX_THRES / 2) #define D_BLIS_SMALL_K_RECT_MATRIX_THRES (BLIS_SMALL_K_RECT_MATRIX_THRES / 2) #define BLIS_ATBN_M_THRES 40 // Threshold value of M for/below which small matrix code is called. #define AT_MR 4 // The kernel dimension of the A transpose GEMM kernel.(AT_MR * NR). 
static err_t bli_sgemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); static err_t bli_dgemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); static err_t bli_sgemm_small_atbn ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); static err_t bli_dgemm_small_atbn ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); /* * The bli_gemm_small function will use the * custom MRxNR kernels, to perform the computation. * The custom kernels are used if the [M * N] < 240 * 240 */ err_t bli_gemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); #ifdef BLIS_ENABLE_MULTITHREADING AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); return BLIS_NOT_YET_IMPLEMENTED; #endif // If alpha is zero, scale by beta and return. if (bli_obj_equals(alpha, &BLIS_ZERO)) { return BLIS_NOT_YET_IMPLEMENTED; } // if row major format return. 
if ((bli_obj_row_stride( a ) != 1) || (bli_obj_row_stride( b ) != 1) || (bli_obj_row_stride( c ) != 1)) { return BLIS_INVALID_ROW_STRIDE; } num_t dt = bli_obj_dt(c); if (bli_obj_has_trans( a )) { if (bli_obj_has_notrans( b )) { if (dt == BLIS_FLOAT) { return bli_sgemm_small_atbn(alpha, a, b, beta, c, cntx, cntl); } else if (dt == BLIS_DOUBLE) { return bli_dgemm_small_atbn(alpha, a, b, beta, c, cntx, cntl); } } return BLIS_NOT_YET_IMPLEMENTED; } if (dt == BLIS_DOUBLE) { return bli_dgemm_small(alpha, a, b, beta, c, cntx, cntl); } if (dt == BLIS_FLOAT) { return bli_sgemm_small(alpha, a, b, beta, c, cntx, cntl); } AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); return BLIS_NOT_YET_IMPLEMENTED; }; static err_t bli_sgemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7); gint_t M = bli_obj_length( c ); // number of rows of Matrix C gint_t N = bli_obj_width( c ); // number of columns of Matrix C gint_t K = bli_obj_width( a ); // number of columns of OP(A), will be updated if OP(A) is Transpose(A) . gint_t L = M * N; // when N is equal to 1 call GEMV instead of GEMM if (N == 1) { bli_gemv ( alpha, a, b, beta, c ); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); return BLIS_SUCCESS; } if ((((L) < (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES)) || ((M < BLIS_SMALL_M_RECT_MATRIX_THRES) && (K < BLIS_SMALL_K_RECT_MATRIX_THRES))) && ((L!=0) && (K!=0))) { guint_t lda = bli_obj_col_stride( a ); // column stride of matrix OP(A), where OP(A) is Transpose(A) if transA enabled. guint_t ldb = bli_obj_col_stride( b ); // column stride of matrix OP(B), where OP(B) is Transpose(B) if transB enabled. 
guint_t ldc = bli_obj_col_stride( c ); // column stride of matrix C guint_t row_idx, col_idx, k; float *A = bli_obj_buffer_at_off(a); // pointer to elements of Matrix A float *B = bli_obj_buffer_at_off(b); // pointer to elements of Matrix B float *C = bli_obj_buffer_at_off(c); // pointer to elements of Matrix C float *tA = A, *tB = B, *tC = C;//, *tA_pack; float *tA_packed; // temporary pointer to hold packed A memory pointer guint_t row_idx_packed; //packed A memory row index guint_t lda_packed; //lda of packed A guint_t col_idx_start; //starting index after A matrix is packed. dim_t tb_inc_row = 1; // row stride of matrix B dim_t tb_inc_col = ldb; // column stride of matrix B __m256 ymm4, ymm5, ymm6, ymm7; __m256 ymm8, ymm9, ymm10, ymm11; __m256 ymm12, ymm13, ymm14, ymm15; __m256 ymm0, ymm1, ymm2, ymm3; gint_t n_remainder; // If the N is non multiple of 3.(N%3) gint_t m_remainder; // If the M is non multiple of 32.(M%32) gint_t required_packing_A = 1; mem_t local_mem_buf_A_s; float *A_pack = NULL; rntm_t rntm; const num_t dt_exec = bli_obj_dt( c ); float* restrict alpha_cast = bli_obj_buffer_for_1x1( dt_exec, alpha ); float* restrict beta_cast = bli_obj_buffer_for_1x1( dt_exec, beta ); /*Beta Zero Check*/ bool is_beta_non_zero=0; if ( !bli_obj_equals( beta, &BLIS_ZERO ) ){ is_beta_non_zero = 1; } //update the pointer math if matrix B needs to be transposed. if (bli_obj_has_trans( b )) { tb_inc_col = 1; //switch row and column strides tb_inc_row = ldb; } /* * This function was using global array to pack part of A input when needed. * However, using this global array make the function non-reentrant. * Instead of using a global array we should allocate buffer for each invocation. * Since the buffer size is too big or stack and doing malloc every time will be too expensive, * better approach is to get the buffer from the pre-allocated pool and return * it the pool once we are doing. 
* * In order to get the buffer from pool, we need access to memory broker, * currently this function is not invoked in such a way that it can receive * the memory broker (via rntm). Following hack will get the global memory * broker that can be use it to access the pool. * * Note there will be memory allocation at least on first innovation * as there will not be any pool created for this size. * Subsequent invocations will just reuse the buffer from the pool. */ bli_rntm_init_from_global( &rntm ); bli_rntm_set_num_threads_only( 1, &rntm ); bli_pba_rntm_set_pba( &rntm ); // Get the current size of the buffer pool for A block packing. // We will use the same size to avoid pool re-initialization siz_t buffer_size = bli_pool_block_size(bli_pba_pool(bli_packbuf_index(BLIS_BITVAL_BUFFER_FOR_A_BLOCK), bli_rntm_pba(&rntm))); // Based on the available memory in the buffer we will decide if // we want to do packing or not. // // This kernel assumes that "A" will be un-packged if N <= 3. // Usually this range (N <= 3) is handled by SUP, however, // if SUP is disabled or for any other condition if we do // enter this kernel with N <= 3, we want to make sure that // "A" remains unpacked. // // If this check is removed it will result in the crash as // reported in CPUPL-587. // if ((N <= 3) || (((MR * K) << 2) > buffer_size)) { required_packing_A = 0; } else { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_sgemm_small: Requesting mem pool block of size %lu\n", buffer_size); #endif // Get the buffer from the pool, if there is no pool with // required size, it will be created. bli_pba_acquire_m(&rntm, buffer_size, BLIS_BITVAL_BUFFER_FOR_A_BLOCK, &local_mem_buf_A_s); A_pack = bli_mem_buffer(&local_mem_buf_A_s); } /* * The computation loop runs for MRxN columns of C matrix, thus * accessing the MRxK A matrix data and KxNR B matrix data. * The computation is organized as inner loops of dimension MRxNR. */ // Process MR rows of C matrix at a time. 
for (row_idx = 0; (row_idx + (MR - 1)) < M; row_idx += MR) { col_idx_start = 0; tA_packed = A; row_idx_packed = row_idx; lda_packed = lda; // This is the part of the pack and compute optimization. // During the first column iteration, we store the accessed A matrix into // contiguous static memory. This helps to keep te A matrix in Cache and // aviods the TLB misses. if (required_packing_A) { col_idx = 0; //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; tA_packed = A_pack; #ifdef BLIS_ENABLE_PREFETCH _mm_prefetch((char*)(tC + 0), _MM_HINT_T0); _mm_prefetch((char*)(tC + 16), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc + 16), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc + 16), _MM_HINT_T0); #endif // clear scratch registers. ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm11 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); ymm15 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. // This loop is processing MR x K ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_ps(tA); _mm256_storeu_ps(tA_packed, ymm3); // the packing of matrix A // ymm4 += ymm0 * ymm3; ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); // ymm8 += ymm1 * ymm3; ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); // ymm12 += ymm2 * ymm3; ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 8); _mm256_storeu_ps(tA_packed + 8, ymm3); // the packing of matrix A // ymm5 += ymm0 * ymm3; ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); // ymm9 += ymm1 * ymm3; ymm9 = _mm256_fmadd_ps(ymm1, ymm3, ymm9); // ymm13 += ymm2 * ymm3; ymm13 = _mm256_fmadd_ps(ymm2, ymm3, ymm13); ymm3 = _mm256_loadu_ps(tA + 16); _mm256_storeu_ps(tA_packed + 16, ymm3); // the packing of matrix A // ymm6 += ymm0 * ymm3; ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6); // ymm10 += ymm1 * ymm3; ymm10 = _mm256_fmadd_ps(ymm1, ymm3, ymm10); // ymm14 += ymm2 * ymm3; ymm14 = _mm256_fmadd_ps(ymm2, ymm3, ymm14); ymm3 = _mm256_loadu_ps(tA + 24); _mm256_storeu_ps(tA_packed + 24, ymm3); // the packing of matrix A // ymm7 += ymm0 * ymm3; ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); // ymm11 += ymm1 * ymm3; ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11); // ymm15 += ymm2 * ymm3; ymm15 = _mm256_fmadd_ps(ymm2, ymm3, ymm15); tA += lda; tA_packed += MR; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm6 = _mm256_mul_ps(ymm6, ymm0); ymm7 = _mm256_mul_ps(ymm7, ymm0); ymm8 = _mm256_mul_ps(ymm8, ymm0); ymm9 = _mm256_mul_ps(ymm9, ymm0); ymm10 = _mm256_mul_ps(ymm10, ymm0); ymm11 = _mm256_mul_ps(ymm11, ymm0); ymm12 = _mm256_mul_ps(ymm12, ymm0); ymm13 = _mm256_mul_ps(ymm13, ymm0); ymm14 = _mm256_mul_ps(ymm14, ymm0); ymm15 = _mm256_mul_ps(ymm15, ymm0); if(is_beta_non_zero) { ymm1 = _mm256_broadcast_ss(beta_cast); // multiply C by beta and accumulate col 1. 
ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_ps(tC + 8); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); ymm2 = _mm256_loadu_ps(tC + 16); ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_ps(tC + 24); ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7); float* ttC = tC +ldc; ymm2 = _mm256_loadu_ps(ttC); ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_ps(ttC + 8); ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_ps(ttC + 16); ymm10 = _mm256_fmadd_ps(ymm2, ymm1, ymm10); ymm2 = _mm256_loadu_ps(ttC + 24); ymm11 = _mm256_fmadd_ps(ymm2, ymm1, ymm11); ttC += ldc; ymm2 = _mm256_loadu_ps(ttC); ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_ps(ttC + 8); ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_ps(ttC + 16); ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_ps(ttC + 24); ymm15 = _mm256_fmadd_ps(ymm2, ymm1, ymm15); } _mm256_storeu_ps(tC, ymm4); _mm256_storeu_ps(tC + 8, ymm5); _mm256_storeu_ps(tC + 16, ymm6); _mm256_storeu_ps(tC + 24, ymm7); // multiply C by beta and accumulate, col 2. tC += ldc; _mm256_storeu_ps(tC, ymm8); _mm256_storeu_ps(tC + 8, ymm9); _mm256_storeu_ps(tC + 16, ymm10); _mm256_storeu_ps(tC + 24, ymm11); // multiply C by beta and accumulate, col 3. tC += ldc; _mm256_storeu_ps(tC, ymm12); _mm256_storeu_ps(tC + 8, ymm13); _mm256_storeu_ps(tC + 16, ymm14); _mm256_storeu_ps(tC + 24, ymm15); // modify the pointer arithematic to use packed A matrix. col_idx_start = NR; tA_packed = A_pack; row_idx_packed = 0; lda_packed = MR; } // Process NR columns of C matrix at a time. 
for (col_idx = col_idx_start; (col_idx + (NR - 1)) < N; col_idx += NR) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = tA_packed + row_idx_packed; #ifdef BLIS_ENABLE_PREFETCH _mm_prefetch((char*)(tC + 0), _MM_HINT_T0); _mm_prefetch((char*)(tC + 16), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc + 16), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc + 16), _MM_HINT_T0); #endif // clear scratch registers. ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm11 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); ymm15 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. // This loop is processing MR x K ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_ps(tA); // ymm4 += ymm0 * ymm3; ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); // ymm8 += ymm1 * ymm3; ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); // ymm12 += ymm2 * ymm3; ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 8); // ymm5 += ymm0 * ymm3; ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); // ymm9 += ymm1 * ymm3; ymm9 = _mm256_fmadd_ps(ymm1, ymm3, ymm9); // ymm13 += ymm2 * ymm3; ymm13 = _mm256_fmadd_ps(ymm2, ymm3, ymm13); ymm3 = _mm256_loadu_ps(tA + 16); // ymm6 += ymm0 * ymm3; ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6); // ymm10 += ymm1 * ymm3; ymm10 = _mm256_fmadd_ps(ymm1, ymm3, ymm10); // ymm14 += ymm2 * ymm3; ymm14 = _mm256_fmadd_ps(ymm2, ymm3, ymm14); ymm3 = _mm256_loadu_ps(tA + 24); // ymm7 += ymm0 * ymm3; ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); // ymm11 += ymm1 * ymm3; ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11); // ymm15 += ymm2 * ymm3; ymm15 = _mm256_fmadd_ps(ymm2, ymm3, ymm15); tA += lda_packed; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm6 = _mm256_mul_ps(ymm6, ymm0); ymm7 = _mm256_mul_ps(ymm7, ymm0); ymm8 = _mm256_mul_ps(ymm8, ymm0); ymm9 = _mm256_mul_ps(ymm9, ymm0); ymm10 = _mm256_mul_ps(ymm10, ymm0); ymm11 = _mm256_mul_ps(ymm11, ymm0); ymm12 = _mm256_mul_ps(ymm12, ymm0); ymm13 = _mm256_mul_ps(ymm13, ymm0); ymm14 = _mm256_mul_ps(ymm14, ymm0); ymm15 = _mm256_mul_ps(ymm15, ymm0); if(is_beta_non_zero) { ymm1 = _mm256_broadcast_ss(beta_cast); // multiply C by beta and accumulate col 1. 
ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_ps(tC + 8); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); ymm2 = _mm256_loadu_ps(tC + 16); ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_ps(tC + 24); ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7); float* ttC = tC +ldc; ymm2 = _mm256_loadu_ps(ttC); ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_ps(ttC + 8); ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_ps(ttC + 16); ymm10 = _mm256_fmadd_ps(ymm2, ymm1, ymm10); ymm2 = _mm256_loadu_ps(ttC + 24); ymm11 = _mm256_fmadd_ps(ymm2, ymm1, ymm11); ttC = ttC +ldc; ymm2 = _mm256_loadu_ps(ttC); ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_ps(ttC + 8); ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_ps(ttC + 16); ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_ps(ttC + 24); ymm15 = _mm256_fmadd_ps(ymm2, ymm1, ymm15); } _mm256_storeu_ps(tC, ymm4); _mm256_storeu_ps(tC + 8, ymm5); _mm256_storeu_ps(tC + 16, ymm6); _mm256_storeu_ps(tC + 24, ymm7); // multiply C by beta and accumulate, col 2. tC += ldc; _mm256_storeu_ps(tC, ymm8); _mm256_storeu_ps(tC + 8, ymm9); _mm256_storeu_ps(tC + 16, ymm10); _mm256_storeu_ps(tC + 24, ymm11); // multiply C by beta and accumulate, col 3. tC += ldc; _mm256_storeu_ps(tC, ymm12); _mm256_storeu_ps(tC + 8, ymm13); _mm256_storeu_ps(tC + 16, ymm14); _mm256_storeu_ps(tC + 24, ymm15); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. 
ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm11 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); ymm15 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm8 = _mm256_fmadd_ps(ymm0, ymm3, ymm8); ymm12 = _mm256_fmadd_ps(ymm1, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 8); ymm9 = _mm256_fmadd_ps(ymm0, ymm3, ymm9); ymm13 = _mm256_fmadd_ps(ymm1, ymm3, ymm13); ymm3 = _mm256_loadu_ps(tA + 16); ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); ymm14 = _mm256_fmadd_ps(ymm1, ymm3, ymm14); ymm3 = _mm256_loadu_ps(tA + 24); ymm11 = _mm256_fmadd_ps(ymm0, ymm3, ymm11); ymm15 = _mm256_fmadd_ps(ymm1, ymm3, ymm15); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //multiply A*B by alpha. ymm8 = _mm256_mul_ps(ymm8, ymm0); ymm9 = _mm256_mul_ps(ymm9, ymm0); ymm10 = _mm256_mul_ps(ymm10, ymm0); ymm11 = _mm256_mul_ps(ymm11, ymm0); ymm12 = _mm256_mul_ps(ymm12, ymm0); ymm13 = _mm256_mul_ps(ymm13, ymm0); ymm14 = _mm256_mul_ps(ymm14, ymm0); ymm15 = _mm256_mul_ps(ymm15, ymm0); // multiply C by beta and accumulate, col 1. if(is_beta_non_zero) { ymm1 = _mm256_broadcast_ss(beta_cast); ymm2 = _mm256_loadu_ps(tC); ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_ps(tC + 8); ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_ps(tC + 16); ymm10 = _mm256_fmadd_ps(ymm2, ymm1, ymm10); ymm2 = _mm256_loadu_ps(tC + 24); ymm11 = _mm256_fmadd_ps(ymm2, ymm1, ymm11); float* ttC = tC +ldc; // multiply C by beta and accumulate, col 2. 
ymm2 = _mm256_loadu_ps(ttC); ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_ps(ttC + 8); ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_ps(ttC + 16); ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_ps(ttC + 24); ymm15 = _mm256_fmadd_ps(ymm2, ymm1, ymm15); } _mm256_storeu_ps(tC, ymm8); _mm256_storeu_ps(tC + 8, ymm9); _mm256_storeu_ps(tC + 16, ymm10); _mm256_storeu_ps(tC + 24, ymm11); tC += ldc; _mm256_storeu_ps(tC, ymm12); _mm256_storeu_ps(tC + 8, ymm13); _mm256_storeu_ps(tC + 16, ymm14); _mm256_storeu_ps(tC + 24, ymm15); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); ymm15 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm12 = _mm256_fmadd_ps(ymm0, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 8); ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); ymm3 = _mm256_loadu_ps(tA + 16); ymm14 = _mm256_fmadd_ps(ymm0, ymm3, ymm14); ymm3 = _mm256_loadu_ps(tA + 24); ymm15 = _mm256_fmadd_ps(ymm0, ymm3, ymm15); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //multiply A*B by alpha. ymm12 = _mm256_mul_ps(ymm12, ymm0); ymm13 = _mm256_mul_ps(ymm13, ymm0); ymm14 = _mm256_mul_ps(ymm14, ymm0); ymm15 = _mm256_mul_ps(ymm15, ymm0); if(is_beta_non_zero) { ymm1 = _mm256_broadcast_ss(beta_cast); // multiply C by beta and accumulate. 
ymm2 = _mm256_loadu_ps(tC + 0); ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_ps(tC + 8); ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_ps(tC + 16); ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_ps(tC + 24); ymm15 = _mm256_fmadd_ps(ymm2, ymm1, ymm15); } _mm256_storeu_ps(tC + 0, ymm12); _mm256_storeu_ps(tC + 8, ymm13); _mm256_storeu_ps(tC + 16, ymm14); _mm256_storeu_ps(tC + 24, ymm15); } } m_remainder = M - row_idx; if (m_remainder >= 24) { m_remainder -= 24; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_ps(tA); // ymm4 += ymm0 * ymm3; ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); // ymm8 += ymm1 * ymm3; ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); // ymm12 += ymm2 * ymm3; ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 8); // ymm5 += ymm0 * ymm3; ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); // ymm9 += ymm1 * ymm3; ymm9 = _mm256_fmadd_ps(ymm1, ymm3, ymm9); // ymm13 += ymm2 * ymm3; ymm13 = _mm256_fmadd_ps(ymm2, ymm3, ymm13); ymm3 = _mm256_loadu_ps(tA + 16); // ymm6 += ymm0 * ymm3; ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6); // ymm10 += ymm1 * ymm3; ymm10 = _mm256_fmadd_ps(ymm1, ymm3, ymm10); // ymm14 += ymm2 * ymm3; ymm14 = _mm256_fmadd_ps(ymm2, ymm3, ymm14); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm6 = _mm256_mul_ps(ymm6, ymm0); ymm8 = _mm256_mul_ps(ymm8, ymm0); ymm9 = _mm256_mul_ps(ymm9, ymm0); ymm10 = _mm256_mul_ps(ymm10, ymm0); ymm12 = _mm256_mul_ps(ymm12, ymm0); ymm13 = _mm256_mul_ps(ymm13, ymm0); ymm14 = _mm256_mul_ps(ymm14, ymm0); if(is_beta_non_zero) { ymm1 = _mm256_broadcast_ss(beta_cast); // multiply C by beta and accumulate. 
ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_ps(tC + 8); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); ymm2 = _mm256_loadu_ps(tC + 16); ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6); float* ttC = tC +ldc; ymm2 = _mm256_loadu_ps(ttC); ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_ps(ttC + 8); ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_ps(ttC + 16); ymm10 = _mm256_fmadd_ps(ymm2, ymm1, ymm10); ttC += ldc; ymm2 = _mm256_loadu_ps(ttC); ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_ps(ttC + 8); ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_ps(ttC + 16); ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14); } _mm256_storeu_ps(tC, ymm4); _mm256_storeu_ps(tC + 8, ymm5); _mm256_storeu_ps(tC + 16, ymm6); // multiply C by beta and accumulate. tC += ldc; _mm256_storeu_ps(tC, ymm8); _mm256_storeu_ps(tC + 8, ymm9); _mm256_storeu_ps(tC + 16, ymm10); // multiply C by beta and accumulate. tC += ldc; _mm256_storeu_ps(tC, ymm12); _mm256_storeu_ps(tC + 8, ymm13); _mm256_storeu_ps(tC + 16, ymm14); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_ps(tA); ymm8 = _mm256_fmadd_ps(ymm0, ymm3, ymm8); ymm12 = _mm256_fmadd_ps(ymm1, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 8); ymm9 = _mm256_fmadd_ps(ymm0, ymm3, ymm9); ymm13 = _mm256_fmadd_ps(ymm1, ymm3, ymm13); ymm3 = _mm256_loadu_ps(tA + 16); ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); ymm14 = _mm256_fmadd_ps(ymm1, ymm3, ymm14); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //multiply A*B by alpha. ymm8 = _mm256_mul_ps(ymm8, ymm0); ymm9 = _mm256_mul_ps(ymm9, ymm0); ymm10 = _mm256_mul_ps(ymm10, ymm0); ymm12 = _mm256_mul_ps(ymm12, ymm0); ymm13 = _mm256_mul_ps(ymm13, ymm0); ymm14 = _mm256_mul_ps(ymm14, ymm0); if(is_beta_non_zero) { ymm1 = _mm256_broadcast_ss(beta_cast); // multiply C by beta and accumulate. ymm2 = _mm256_loadu_ps(tC); ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_ps(tC + 8); ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_ps(tC + 16); ymm10 = _mm256_fmadd_ps(ymm2, ymm1, ymm10); float* ttC = tC +ldc; // multiply C by beta and accumulate. ymm2 = _mm256_loadu_ps(ttC); ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_ps(ttC + 8); ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_ps(ttC + 16); ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14); } _mm256_storeu_ps(tC, ymm8); _mm256_storeu_ps(tC + 8, ymm9); _mm256_storeu_ps(tC + 16, ymm10); tC += ldc; _mm256_storeu_ps(tC, ymm12); _mm256_storeu_ps(tC + 8, ymm13); _mm256_storeu_ps(tC + 16, ymm14); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. 
ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm12 = _mm256_fmadd_ps(ymm0, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 8); ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); ymm3 = _mm256_loadu_ps(tA + 16); ymm14 = _mm256_fmadd_ps(ymm0, ymm3, ymm14); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //multiply A*B by alpha. ymm12 = _mm256_mul_ps(ymm12, ymm0); ymm13 = _mm256_mul_ps(ymm13, ymm0); ymm14 = _mm256_mul_ps(ymm14, ymm0); if(is_beta_non_zero) { ymm1 = _mm256_broadcast_ss(beta_cast); // multiply C by beta and accumulate. ymm2 = _mm256_loadu_ps(tC + 0); ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_ps(tC + 8); ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_ps(tC + 16); ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14); } _mm256_storeu_ps(tC + 0, ymm12); _mm256_storeu_ps(tC + 8, ymm13); _mm256_storeu_ps(tC + 16, ymm14); } row_idx += 24; } if (m_remainder >= 16) { m_remainder -= 16; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm6 = _mm256_fmadd_ps(ymm1, ymm3, ymm6); ymm8 = _mm256_fmadd_ps(ymm2, ymm3, ymm8); ymm3 = _mm256_loadu_ps(tA + 8); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm6 = _mm256_mul_ps(ymm6, ymm0); ymm7 = _mm256_mul_ps(ymm7, ymm0); ymm8 = _mm256_mul_ps(ymm8, ymm0); ymm9 = _mm256_mul_ps(ymm9, ymm0); if(is_beta_non_zero) { ymm1 = _mm256_broadcast_ss(beta_cast); // multiply C by beta and accumulate. ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_ps(tC + 8); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); float* ttC = tC + ldc; ymm2 = _mm256_loadu_ps(ttC); ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_ps(ttC + 8); ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7); ttC += ldc; ymm2 = _mm256_loadu_ps(ttC); ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_ps(ttC + 8); ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); } _mm256_storeu_ps(tC, ymm4); _mm256_storeu_ps(tC + 8, ymm5); // multiply C by beta and accumulate. tC += ldc; _mm256_storeu_ps(tC, ymm6); _mm256_storeu_ps(tC + 8, ymm7); // multiply C by beta and accumulate. tC += ldc; _mm256_storeu_ps(tC, ymm8); _mm256_storeu_ps(tC + 8, ymm9); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. 
ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm6 = _mm256_fmadd_ps(ymm1, ymm3, ymm6); ymm3 = _mm256_loadu_ps(tA + 8); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm6 = _mm256_mul_ps(ymm6, ymm0); ymm7 = _mm256_mul_ps(ymm7, ymm0); if(is_beta_non_zero) { ymm1 = _mm256_broadcast_ss(beta_cast); // multiply C by beta and accumulate. ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_ps(tC + 8); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); float* ttC = tC + ldc; ymm2 = _mm256_loadu_ps(ttC); ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_ps(ttC + 8); ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7); } _mm256_storeu_ps(tC, ymm4); _mm256_storeu_ps(tC + 8, ymm5); // multiply C by beta and accumulate. tC += ldc; _mm256_storeu_ps(tC, ymm6); _mm256_storeu_ps(tC + 8, ymm7); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm3 = _mm256_loadu_ps(tA + 8); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); tA += lda; } // alpha, beta multiplication. 
ymm0 = _mm256_broadcast_ss(alpha_cast); ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); // multiply C by beta and accumulate. if(is_beta_non_zero) { ymm1 = _mm256_broadcast_ss(beta_cast); ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_ps(tC + 8); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); } _mm256_storeu_ps(tC, ymm4); _mm256_storeu_ps(tC + 8, ymm5); } row_idx += 16; } if (m_remainder >= 8) { m_remainder -= 8; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_ps(ymm1, ymm3, ymm5); ymm6 = _mm256_fmadd_ps(ymm2, ymm3, ymm6); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm6 = _mm256_mul_ps(ymm6, ymm0); if(is_beta_non_zero) { ymm1 = _mm256_broadcast_ss(beta_cast); ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_ps(tC + ldc); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); ymm2 = _mm256_loadu_ps(tC + 2*ldc); ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6); } _mm256_storeu_ps(tC, ymm4); // multiply C by beta and accumulate. tC += ldc; _mm256_storeu_ps(tC, ymm5); // multiply C by beta and accumulate. 
tC += ldc; _mm256_storeu_ps(tC, ymm6); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_ps(ymm1, ymm3, ymm5); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); if(is_beta_non_zero) { ymm1 = _mm256_broadcast_ss(beta_cast); // multiply C by beta and accumulate. ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_ps(tC + ldc); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); } _mm256_storeu_ps(tC, ymm4); // multiply C by beta and accumulate. tC += ldc; _mm256_storeu_ps(tC, ymm5); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm4 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); tA += lda; } // alpha, beta multiplication. 
ymm0 = _mm256_broadcast_ss(alpha_cast); ymm4 = _mm256_mul_ps(ymm4, ymm0); if(is_beta_non_zero) { ymm1 = _mm256_broadcast_ss(beta_cast); // multiply C by beta and accumulate. ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); } _mm256_storeu_ps(tC, ymm4); } row_idx += 8; } // M is not a multiple of 32. // The handling of edge case where the remainder // dimension is less than 8. The padding takes place // to handle this case. if ((m_remainder) && (lda > 7)) { float f_temp[8] = {0.0}; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm5 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); for (k = 0; k < (K - 1); ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); tB += tb_inc_row; for (int i = 0; i < m_remainder; i++) { f_temp[i] = tA[i]; } ymm3 = _mm256_loadu_ps(f_temp); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); ymm0 = _mm256_broadcast_ss(alpha_cast); ymm1 = _mm256_broadcast_ss(beta_cast); //multiply A*B by alpha. 
ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm7 = _mm256_mul_ps(ymm7, ymm0); ymm9 = _mm256_mul_ps(ymm9, ymm0); for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_ps(f_temp); if(is_beta_non_zero){ ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); } _mm256_storeu_ps(f_temp, ymm5); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } tC += ldc; for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_ps(f_temp); if(is_beta_non_zero){ ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7); } _mm256_storeu_ps(f_temp, ymm7); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } tC += ldc; for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_ps(f_temp); if(is_beta_non_zero){ ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); } _mm256_storeu_ps(f_temp, ymm9); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm5 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); for (k = 0; k < (K - 1); ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. 
ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); tB += tb_inc_row; ymm3 = _mm256_loadu_ps(tA); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); tA += lda; } ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); tB += tb_inc_row; for (int i = 0; i < m_remainder; i++) { f_temp[i] = tA[i]; } ymm3 = _mm256_loadu_ps(f_temp); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); ymm0 = _mm256_broadcast_ss(alpha_cast); ymm1 = _mm256_broadcast_ss(beta_cast); ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm7 = _mm256_mul_ps(ymm7, ymm0); for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_ps(f_temp); if(is_beta_non_zero){ ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); } _mm256_storeu_ps(f_temp, ymm5); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } tC += ldc; for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_ps(f_temp); if(is_beta_non_zero){ ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7); } _mm256_storeu_ps(f_temp, ymm7); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm5 = _mm256_setzero_ps(); for (k = 0; k < (K - 1); ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); tB += tb_inc_row; ymm3 = _mm256_loadu_ps(tA); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); tA += lda; } ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); tB += tb_inc_row; for (int i = 0; i < m_remainder; i++) { f_temp[i] = tA[i]; } ymm3 = _mm256_loadu_ps(f_temp); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); ymm0 = _mm256_broadcast_ss(alpha_cast); // multiply C by beta and accumulate. 
ymm5 = _mm256_mul_ps(ymm5, ymm0); for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_ps(f_temp); if(is_beta_non_zero){ ymm1 = _mm256_broadcast_ss(beta_cast); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); } _mm256_storeu_ps(f_temp, ymm5); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } } m_remainder = 0; } if (m_remainder) { float result; for (; row_idx < M; row_idx += 1) { for (col_idx = 0; col_idx < N; col_idx += 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; result = 0; for (k = 0; k < K; ++k) { result += (*tA) * (*tB); tA += lda; tB += tb_inc_row; } result *= (*alpha_cast); if(is_beta_non_zero){ (*tC) = (*tC) * (*beta_cast) + result; }else{ (*tC) = result; } } } } // Return the buffer to pool if ((required_packing_A == 1) && bli_mem_is_alloc( &local_mem_buf_A_s) ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_sgemm_small(): releasing mem pool block\n" ); #endif bli_pba_release(&rntm, &local_mem_buf_A_s); } AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7); return BLIS_SUCCESS; } else { AOCL_DTL_TRACE_EXIT_ERR( AOCL_DTL_LEVEL_INFO, "Invalid dimesions for small gemm." ); return BLIS_NONCONFORMAL_DIMENSIONS; } }; static err_t bli_dgemm_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO); gint_t M = bli_obj_length( c ); // number of rows of Matrix C gint_t N = bli_obj_width( c ); // number of columns of Matrix C gint_t K = bli_obj_width( a ); // number of columns of OP(A), will be updated if OP(A) is Transpose(A) . gint_t L = M * N; // when N is equal to 1 call GEMV instead of GEMM if (N == 1) { bli_gemv ( alpha, a, b, beta, c ); AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return BLIS_SUCCESS; } if (N<3) //Implemenation assumes that N is atleast 3. 
{ AOCL_DTL_TRACE_EXIT_ERR( AOCL_DTL_LEVEL_INFO, "N < 3, cannot be processed by small gemm" ); return BLIS_NOT_YET_IMPLEMENTED; } #ifdef BLIS_ENABLE_SMALL_MATRIX_ROME if( (L && K) && ((K < D_BLIS_SMALL_MATRIX_K_THRES_ROME) || ((N < BLIS_SMALL_MATRIX_THRES_ROME) && (K < BLIS_SMALL_MATRIX_THRES_ROME)))) #else if ((((L) < (D_BLIS_SMALL_MATRIX_THRES * D_BLIS_SMALL_MATRIX_THRES)) || ((M < D_BLIS_SMALL_M_RECT_MATRIX_THRES) && (K < D_BLIS_SMALL_K_RECT_MATRIX_THRES))) && ((L!=0) && (K!=0))) #endif { guint_t lda = bli_obj_col_stride( a ); // column stride of matrix OP(A), where OP(A) is Transpose(A) if transA enabled. guint_t ldb = bli_obj_col_stride( b ); // column stride of matrix OP(B), where OP(B) is Transpose(B) if transB enabled. guint_t ldc = bli_obj_col_stride( c ); // column stride of matrix C guint_t row_idx, col_idx, k; double *A = bli_obj_buffer_at_off(a); // pointer to elements of Matrix A double *B = bli_obj_buffer_at_off(b); // pointer to elements of Matrix B double *C = bli_obj_buffer_at_off(c); // pointer to elements of Matrix C double *tA = A, *tB = B, *tC = C;//, *tA_pack; double *tA_packed; // temprorary pointer to hold packed A memory pointer guint_t row_idx_packed; //packed A memory row index guint_t lda_packed; //lda of packed A guint_t col_idx_start; //starting index after A matrix is packed. 
dim_t tb_inc_row = 1; // row stride of matrix B dim_t tb_inc_col = ldb; // column stride of matrix B __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m256d ymm12, ymm13, ymm14, ymm15; __m256d ymm0, ymm1, ymm2, ymm3; gint_t n_remainder; // If the N is non multiple of 3.(N%3) gint_t m_remainder; // If the M is non multiple of 16.(M%16) double *alpha_cast, *beta_cast; // alpha, beta multiples alpha_cast = bli_obj_buffer_for_1x1(BLIS_DOUBLE, alpha); beta_cast = bli_obj_buffer_for_1x1(BLIS_DOUBLE, beta); gint_t required_packing_A = 1; mem_t local_mem_buf_A_s; double *D_A_pack = NULL; rntm_t rntm; //update the pointer math if matrix B needs to be transposed. if (bli_obj_has_trans( b )) { tb_inc_col = 1; //switch row and column strides tb_inc_row = ldb; } //checking whether beta value is zero. //if true, we should perform C=alpha * A*B operation //instead of C = beta * C + alpha * (A * B) bool is_beta_non_zero = 0; if(!bli_obj_equals(beta, &BLIS_ZERO)) is_beta_non_zero = 1; /* * This function was using global array to pack part of A input when needed. * However, using this global array make the function non-reentrant. * Instead of using a global array we should allocate buffer for each invocation. * Since the buffer size is too big or stack and doing malloc every time will be too expensive, * better approach is to get the buffer from the pre-allocated pool and return * it the pool once we are doing. * * In order to get the buffer from pool, we need access to memory broker, * currently this function is not invoked in such a way that it can receive * the memory broker (via rntm). Following hack will get the global memory * broker that can be use it to access the pool. * * Note there will be memory allocation at least on first innovation * as there will not be any pool created for this size. * Subsequent invocations will just reuse the buffer from the pool. 
*/ bli_rntm_init_from_global( &rntm ); bli_rntm_set_num_threads_only( 1, &rntm ); bli_pba_rntm_set_pba( &rntm ); // Get the current size of the buffer pool for A block packing. // We will use the same size to avoid pool re-initliazaton siz_t buffer_size = bli_pool_block_size( bli_pba_pool(bli_packbuf_index(BLIS_BITVAL_BUFFER_FOR_A_BLOCK), bli_rntm_pba(&rntm))); // // This kernel assumes that "A" will be unpackged if N <= 3. // Usually this range (N <= 3) is handled by SUP, however, // if SUP is disabled or for any other condition if we do // enter this kernel with N <= 3, we want to make sure that // "A" remains unpacked. // // If this check is removed it will result in the crash as // reported in CPUPL-587. // if ((N <= 3) || ((D_MR * K) << 3) > buffer_size) { required_packing_A = 0; } if (required_packing_A == 1) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_dgemm_small: Requesting mem pool block of size %lu\n", buffer_size); #endif // Get the buffer from the pool. bli_pba_acquire_m(&rntm, buffer_size, BLIS_BITVAL_BUFFER_FOR_A_BLOCK, &local_mem_buf_A_s); D_A_pack = bli_mem_buffer(&local_mem_buf_A_s); } /* * The computation loop runs for D_MRxN columns of C matrix, thus * accessing the D_MRxK A matrix data and KxNR B matrix data. * The computation is organized as inner loops of dimension D_MRxNR. */ // Process D_MR rows of C matrix at a time. for (row_idx = 0; (row_idx + (D_MR - 1)) < M; row_idx += D_MR) { col_idx_start = 0; tA_packed = A; row_idx_packed = row_idx; lda_packed = lda; // This is the part of the pack and compute optimization. // During the first column iteration, we store the accessed A matrix into // contiguous static memory. This helps to keep te A matrix in Cache and // aviods the TLB misses. 
if (required_packing_A) { col_idx = 0; //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; tA_packed = D_A_pack; #ifdef BLIS_ENABLE_PREFETCH _mm_prefetch((char*)(tC + 0), _MM_HINT_T0); _mm_prefetch((char*)(tC + 8), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc + 8), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc + 8), _MM_HINT_T0); #endif // clear scratch registers. ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. // This loop is processing D_MR x K ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_pd(tA); _mm256_storeu_pd(tA_packed, ymm3); // the packing of matrix A // ymm4 += ymm0 * ymm3; ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); // ymm8 += ymm1 * ymm3; ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); // ymm12 += ymm2 * ymm3; ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 4); _mm256_storeu_pd(tA_packed + 4, ymm3); // the packing of matrix A // ymm5 += ymm0 * ymm3; ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); // ymm9 += ymm1 * ymm3; ymm9 = _mm256_fmadd_pd(ymm1, ymm3, ymm9); // ymm13 += ymm2 * ymm3; ymm13 = _mm256_fmadd_pd(ymm2, ymm3, ymm13); ymm3 = _mm256_loadu_pd(tA + 8); _mm256_storeu_pd(tA_packed + 8, ymm3); // the packing of matrix A // ymm6 += ymm0 * ymm3; ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); // ymm10 += ymm1 * ymm3; ymm10 = _mm256_fmadd_pd(ymm1, ymm3, ymm10); // ymm14 += ymm2 * ymm3; ymm14 = _mm256_fmadd_pd(ymm2, ymm3, ymm14); ymm3 = _mm256_loadu_pd(tA + 12); _mm256_storeu_pd(tA_packed + 12, ymm3); // the packing of matrix A // ymm7 += ymm0 * ymm3; ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); // ymm11 += ymm1 * ymm3; ymm11 = _mm256_fmadd_pd(ymm1, ymm3, ymm11); // ymm15 += ymm2 * ymm3; ymm15 = _mm256_fmadd_pd(ymm2, ymm3, ymm15); tA += lda; tA_packed += D_MR; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm6 = _mm256_mul_pd(ymm6, ymm0); ymm7 = _mm256_mul_pd(ymm7, ymm0); ymm8 = _mm256_mul_pd(ymm8, ymm0); ymm9 = _mm256_mul_pd(ymm9, ymm0); ymm10 = _mm256_mul_pd(ymm10, ymm0); ymm11 = _mm256_mul_pd(ymm11, ymm0); ymm12 = _mm256_mul_pd(ymm12, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); ymm15 = _mm256_mul_pd(ymm15, ymm0); if(is_beta_non_zero) { // multiply C by beta and accumulate col 1. 
ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_pd(tC + 4); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); ymm2 = _mm256_loadu_pd(tC + 8); ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_pd(tC + 12); ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7); double* ttC = tC + ldc; // multiply C by beta and accumulate, col 2. ymm2 = _mm256_loadu_pd(ttC); ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_pd(ttC + 4); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_pd(ttC + 8); ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); ymm2 = _mm256_loadu_pd(ttC + 12); ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11); ttC += ldc; // multiply C by beta and accumulate, col 3. ymm2 = _mm256_loadu_pd(ttC); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_pd(ttC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(ttC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_pd(ttC + 12); ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15); } _mm256_storeu_pd(tC, ymm4); _mm256_storeu_pd(tC + 4, ymm5); _mm256_storeu_pd(tC + 8, ymm6); _mm256_storeu_pd(tC + 12, ymm7); tC += ldc; _mm256_storeu_pd(tC, ymm8); _mm256_storeu_pd(tC + 4, ymm9); _mm256_storeu_pd(tC + 8, ymm10); _mm256_storeu_pd(tC + 12, ymm11); tC += ldc; _mm256_storeu_pd(tC, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); _mm256_storeu_pd(tC + 12, ymm15); // modify the pointer arithematic to use packed A matrix. col_idx_start = NR; tA_packed = D_A_pack; row_idx_packed = 0; lda_packed = D_MR; } // Process NR columns of C matrix at a time. 
for (col_idx = col_idx_start; (col_idx + (NR - 1)) < N; col_idx += NR) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = tA_packed + row_idx_packed; #ifdef BLIS_ENABLE_PREFETCH _mm_prefetch((char*)(tC + 0), _MM_HINT_T0); _mm_prefetch((char*)(tC + 8), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc + 8), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc + 8), _MM_HINT_T0); #endif // clear scratch registers. ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. // This loop is processing D_MR x K ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_pd(tA); // ymm4 += ymm0 * ymm3; ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); // ymm8 += ymm1 * ymm3; ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); // ymm12 += ymm2 * ymm3; ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 4); // ymm5 += ymm0 * ymm3; ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); // ymm9 += ymm1 * ymm3; ymm9 = _mm256_fmadd_pd(ymm1, ymm3, ymm9); // ymm13 += ymm2 * ymm3; ymm13 = _mm256_fmadd_pd(ymm2, ymm3, ymm13); ymm3 = _mm256_loadu_pd(tA + 8); // ymm6 += ymm0 * ymm3; ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); // ymm10 += ymm1 * ymm3; ymm10 = _mm256_fmadd_pd(ymm1, ymm3, ymm10); // ymm14 += ymm2 * ymm3; ymm14 = _mm256_fmadd_pd(ymm2, ymm3, ymm14); ymm3 = _mm256_loadu_pd(tA + 12); // ymm7 += ymm0 * ymm3; ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); // ymm11 += ymm1 * ymm3; ymm11 = _mm256_fmadd_pd(ymm1, ymm3, ymm11); // ymm15 += ymm2 * ymm3; ymm15 = _mm256_fmadd_pd(ymm2, ymm3, ymm15); tA += lda_packed; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm6 = _mm256_mul_pd(ymm6, ymm0); ymm7 = _mm256_mul_pd(ymm7, ymm0); ymm8 = _mm256_mul_pd(ymm8, ymm0); ymm9 = _mm256_mul_pd(ymm9, ymm0); ymm10 = _mm256_mul_pd(ymm10, ymm0); ymm11 = _mm256_mul_pd(ymm11, ymm0); ymm12 = _mm256_mul_pd(ymm12, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); ymm15 = _mm256_mul_pd(ymm15, ymm0); if(is_beta_non_zero) { // multiply C by beta and accumulate col 1. ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_pd(tC + 4); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); ymm2 = _mm256_loadu_pd(tC + 8); ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_pd(tC + 12); ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7); // multiply C by beta and accumulate, col 2. 
double* ttC = tC + ldc; ymm2 = _mm256_loadu_pd(ttC); ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_pd(ttC + 4); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_pd(ttC + 8); ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); ymm2 = _mm256_loadu_pd(ttC + 12); ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11); // multiply C by beta and accumulate, col 3. ttC += ldc; ymm2 = _mm256_loadu_pd(ttC); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_pd(ttC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(ttC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_pd(ttC + 12); ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15); } _mm256_storeu_pd(tC, ymm4); _mm256_storeu_pd(tC + 4, ymm5); _mm256_storeu_pd(tC + 8, ymm6); _mm256_storeu_pd(tC + 12, ymm7); tC += ldc; _mm256_storeu_pd(tC, ymm8); _mm256_storeu_pd(tC + 4, ymm9); _mm256_storeu_pd(tC + 8, ymm10); _mm256_storeu_pd(tC + 12, ymm11); tC += ldc; _mm256_storeu_pd(tC, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); _mm256_storeu_pd(tC + 12, ymm15); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_pd(tA); ymm8 = _mm256_fmadd_pd(ymm0, ymm3, ymm8); ymm12 = _mm256_fmadd_pd(ymm1, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 4); ymm9 = _mm256_fmadd_pd(ymm0, ymm3, ymm9); ymm13 = _mm256_fmadd_pd(ymm1, ymm3, ymm13); ymm3 = _mm256_loadu_pd(tA + 8); ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14); ymm3 = _mm256_loadu_pd(tA + 12); ymm11 = _mm256_fmadd_pd(ymm0, ymm3, ymm11); ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm8 = _mm256_mul_pd(ymm8, ymm0); ymm9 = _mm256_mul_pd(ymm9, ymm0); ymm10 = _mm256_mul_pd(ymm10, ymm0); ymm11 = _mm256_mul_pd(ymm11, ymm0); ymm12 = _mm256_mul_pd(ymm12, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); ymm15 = _mm256_mul_pd(ymm15, ymm0); if(is_beta_non_zero) { // multiply C by beta and accumulate, col 1. ymm2 = _mm256_loadu_pd(tC + 0); ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_pd(tC + 4); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_pd(tC + 8); ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); ymm2 = _mm256_loadu_pd(tC + 12); ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11); // multiply C by beta and accumulate, col 2. double *ttC = tC + ldc; ymm2 = _mm256_loadu_pd(ttC); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_pd(ttC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(ttC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_pd(ttC + 12); ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15); } _mm256_storeu_pd(tC + 0, ymm8); _mm256_storeu_pd(tC + 4, ymm9); _mm256_storeu_pd(tC + 8, ymm10); _mm256_storeu_pd(tC + 12, ymm11); tC += ldc; _mm256_storeu_pd(tC, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); _mm256_storeu_pd(tC + 12, ymm15); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. 
if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm12 = _mm256_fmadd_pd(ymm0, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 4); ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); ymm3 = _mm256_loadu_pd(tA + 8); ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); ymm3 = _mm256_loadu_pd(tA + 12); ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm12 = _mm256_mul_pd(ymm12, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); ymm15 = _mm256_mul_pd(ymm15, ymm0); if(is_beta_non_zero) { // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(tC + 0); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_pd(tC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(tC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_pd(tC + 12); ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15); } _mm256_storeu_pd(tC + 0, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); _mm256_storeu_pd(tC + 12, ymm15); } } m_remainder = M - row_idx; if (m_remainder >= 12) { m_remainder -= 12; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. 
ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); // ymm4 += ymm0 * ymm3; ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); // ymm8 += ymm1 * ymm3; ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); // ymm12 += ymm2 * ymm3; ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 4); // ymm5 += ymm0 * ymm3; ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); // ymm9 += ymm1 * ymm3; ymm9 = _mm256_fmadd_pd(ymm1, ymm3, ymm9); // ymm13 += ymm2 * ymm3; ymm13 = _mm256_fmadd_pd(ymm2, ymm3, ymm13); ymm3 = _mm256_loadu_pd(tA + 8); // ymm6 += ymm0 * ymm3; ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); // ymm10 += ymm1 * ymm3; ymm10 = _mm256_fmadd_pd(ymm1, ymm3, ymm10); // ymm14 += ymm2 * ymm3; ymm14 = _mm256_fmadd_pd(ymm2, ymm3, ymm14); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm6 = _mm256_mul_pd(ymm6, ymm0); ymm8 = _mm256_mul_pd(ymm8, ymm0); ymm9 = _mm256_mul_pd(ymm9, ymm0); ymm10 = _mm256_mul_pd(ymm10, ymm0); ymm12 = _mm256_mul_pd(ymm12, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); if(is_beta_non_zero) { // multiply C by beta and accumulate. 
ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_pd(tC + 4); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); ymm2 = _mm256_loadu_pd(tC + 8); ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); // multiply C by beta and accumulate. double *ttC = tC +ldc; ymm2 = _mm256_loadu_pd(ttC); ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_pd(ttC + 4); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_pd(ttC + 8); ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); // multiply C by beta and accumulate. ttC += ldc; ymm2 = _mm256_loadu_pd(ttC); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_pd(ttC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(ttC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); } _mm256_storeu_pd(tC, ymm4); _mm256_storeu_pd(tC + 4, ymm5); _mm256_storeu_pd(tC + 8, ymm6); tC += ldc; _mm256_storeu_pd(tC, ymm8); _mm256_storeu_pd(tC + 4, ymm9); _mm256_storeu_pd(tC + 8, ymm10); tC += ldc; _mm256_storeu_pd(tC, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_pd(tA); ymm8 = _mm256_fmadd_pd(ymm0, ymm3, ymm8); ymm12 = _mm256_fmadd_pd(ymm1, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 4); ymm9 = _mm256_fmadd_pd(ymm0, ymm3, ymm9); ymm13 = _mm256_fmadd_pd(ymm1, ymm3, ymm13); ymm3 = _mm256_loadu_pd(tA + 8); ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm8 = _mm256_mul_pd(ymm8, ymm0); ymm9 = _mm256_mul_pd(ymm9, ymm0); ymm10 = _mm256_mul_pd(ymm10, ymm0); ymm12 = _mm256_mul_pd(ymm12, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); if(is_beta_non_zero) { // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(tC + 0); ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_pd(tC + 4); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_pd(tC + 8); ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); double *ttC = tC + ldc; // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(ttC); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_pd(ttC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(ttC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); } _mm256_storeu_pd(tC + 0, ymm8); _mm256_storeu_pd(tC + 4, ymm9); _mm256_storeu_pd(tC + 8, ymm10); tC += ldc; _mm256_storeu_pd(tC, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. 
ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm12 = _mm256_fmadd_pd(ymm0, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 4); ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); ymm3 = _mm256_loadu_pd(tA + 8); ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm12 = _mm256_mul_pd(ymm12, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); if(is_beta_non_zero) { // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(tC + 0); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_pd(tC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(tC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); } _mm256_storeu_pd(tC + 0, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); } row_idx += 12; } if (m_remainder >= 8) { m_remainder -= 8; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm6 = _mm256_fmadd_pd(ymm1, ymm3, ymm6); ymm8 = _mm256_fmadd_pd(ymm2, ymm3, ymm8); ymm3 = _mm256_loadu_pd(tA + 4); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm6 = _mm256_mul_pd(ymm6, ymm0); ymm7 = _mm256_mul_pd(ymm7, ymm0); ymm8 = _mm256_mul_pd(ymm8, ymm0); ymm9 = _mm256_mul_pd(ymm9, ymm0); if(is_beta_non_zero) { // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_pd(tC + 4); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); double* ttC = tC + ldc; // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(ttC); ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_pd(ttC + 4); ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7); ttC += ldc; // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(ttC); ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_pd(ttC + 4); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); } _mm256_storeu_pd(tC, ymm4); _mm256_storeu_pd(tC + 4, ymm5); tC += ldc; _mm256_storeu_pd(tC, ymm6); _mm256_storeu_pd(tC + 4, ymm7); tC += ldc; _mm256_storeu_pd(tC, ymm8); _mm256_storeu_pd(tC + 4, ymm9); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. 
ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm6 = _mm256_fmadd_pd(ymm1, ymm3, ymm6); ymm3 = _mm256_loadu_pd(tA + 4); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm6 = _mm256_mul_pd(ymm6, ymm0); ymm7 = _mm256_mul_pd(ymm7, ymm0); if(is_beta_non_zero) { // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_pd(tC + 4); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); double* ttC = tC + ldc; // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(ttC); ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_pd(ttC + 4); ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7); } _mm256_storeu_pd(tC, ymm4); _mm256_storeu_pd(tC + 4, ymm5); tC += ldc; _mm256_storeu_pd(tC, ymm6); _mm256_storeu_pd(tC + 4, ymm7); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm3 = _mm256_loadu_pd(tA + 4); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); tA += lda; } // alpha, beta multiplication. 
ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); if(is_beta_non_zero) { // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_pd(tC + 4); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); } _mm256_storeu_pd(tC, ymm4); _mm256_storeu_pd(tC + 4, ymm5); } row_idx += 8; } if (m_remainder >= 4) { //printf("HERE\n"); m_remainder -= 4; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); ymm6 = _mm256_fmadd_pd(ymm2, ymm3, ymm6); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm6 = _mm256_mul_pd(ymm6, ymm0); if(is_beta_non_zero) { // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); double* ttC = tC + ldc; // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(ttC); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); ttC += ldc; // multiply C by beta and accumulate. 
ymm2 = _mm256_loadu_pd(ttC); ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); } _mm256_storeu_pd(tC, ymm4); tC += ldc; _mm256_storeu_pd(tC, ymm5); tC += ldc; _mm256_storeu_pd(tC, ymm6); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); if(is_beta_non_zero) { // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); double* ttC = tC + ldc; // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(ttC); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); } _mm256_storeu_pd(tC, ymm4); tC += ldc; _mm256_storeu_pd(tC, ymm5); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm4 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); ymm4 = _mm256_mul_pd(ymm4, ymm0); if(is_beta_non_zero) { // multiply C by beta and accumulate. ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); } _mm256_storeu_pd(tC, ymm4); } row_idx += 4; } // M is not a multiple of 32. // The handling of edge case where the remainder // dimension is less than 8. The padding takes place // to handle this case. if ((m_remainder) && (lda > 3)) { double f_temp[8] = {0.0}; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm5 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); for (k = 0; k < (K - 1); ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); tB += tb_inc_row; for (int i = 0; i < m_remainder; i++) { f_temp[i] = tA[i]; } ymm3 = _mm256_loadu_pd(f_temp); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. 
ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm7 = _mm256_mul_pd(ymm7, ymm0); ymm9 = _mm256_mul_pd(ymm9, ymm0); if(is_beta_non_zero) { for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_pd(f_temp); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); double* ttC = tC + ldc; for (int i = 0; i < m_remainder; i++) { f_temp[i] = ttC[i]; } ymm2 = _mm256_loadu_pd(f_temp); ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7); ttC += ldc; for (int i = 0; i < m_remainder; i++) { f_temp[i] = ttC[i]; } ymm2 = _mm256_loadu_pd(f_temp); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); } _mm256_storeu_pd(f_temp, ymm5); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } tC += ldc; _mm256_storeu_pd(f_temp, ymm7); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } tC += ldc; _mm256_storeu_pd(f_temp, ymm9); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm5 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); for (k = 0; k < (K - 1); ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. 
ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); tB += tb_inc_row; ymm3 = _mm256_loadu_pd(tA); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); tA += lda; } ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); tB += tb_inc_row; for (int i = 0; i < m_remainder; i++) { f_temp[i] = tA[i]; } ymm3 = _mm256_loadu_pd(f_temp); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm7 = _mm256_mul_pd(ymm7, ymm0); if(is_beta_non_zero) { for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_pd(f_temp); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); double* ttC = tC + ldc; for (int i = 0; i < m_remainder; i++) { f_temp[i] = ttC[i]; } ymm2 = _mm256_loadu_pd(f_temp); ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7); } _mm256_storeu_pd(f_temp, ymm5); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } tC += ldc; _mm256_storeu_pd(f_temp, ymm7); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm5 = _mm256_setzero_pd(); for (k = 0; k < (K - 1); ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. 
ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); tB += tb_inc_row; ymm3 = _mm256_loadu_pd(tA); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); tA += lda; } ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); tB += tb_inc_row; for (int i = 0; i < m_remainder; i++) { f_temp[i] = tA[i]; } ymm3 = _mm256_loadu_pd(f_temp); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); ymm0 = _mm256_broadcast_sd(alpha_cast); ymm1 = _mm256_broadcast_sd(beta_cast); // multiply C by beta and accumulate. ymm5 = _mm256_mul_pd(ymm5, ymm0); if(is_beta_non_zero) { for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_pd(f_temp); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); } _mm256_storeu_pd(f_temp, ymm5); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } } m_remainder = 0; } if (m_remainder) { double result; for (; row_idx < M; row_idx += 1) { for (col_idx = 0; col_idx < N; col_idx += 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; result = 0; for (k = 0; k < K; ++k) { result += (*tA) * (*tB); tA += lda; tB += tb_inc_row; } result *= (*alpha_cast); if(is_beta_non_zero) (*tC) = (*tC) * (*beta_cast) + result; else (*tC) = result; } } } // Return the buffer to pool if ((required_packing_A == 1) && bli_mem_is_alloc( &local_mem_buf_A_s )) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_dgemm_small(): releasing mem pool block\n" ); #endif bli_pba_release(&rntm, &local_mem_buf_A_s); } AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return BLIS_SUCCESS; } else { AOCL_DTL_TRACE_EXIT_ERR( AOCL_DTL_LEVEL_INFO, "Invalid dimesions for small gemm." 
); return BLIS_NONCONFORMAL_DIMENSIONS; } }; static err_t bli_sgemm_small_atbn ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO); gint_t M = bli_obj_length( c ); // number of rows of Matrix C gint_t N = bli_obj_width( c ); // number of columns of Matrix C gint_t K = bli_obj_length( b ); // number of rows of Matrix B guint_t lda = bli_obj_col_stride( a ); // column stride of matrix OP(A), where OP(A) is Transpose(A) if transA enabled. guint_t ldb = bli_obj_col_stride( b ); // column stride of matrix OP(B), where OP(B) is Transpose(B) if transB enabled. guint_t ldc = bli_obj_col_stride( c ); // column stride of matrix C int row_idx = 0, col_idx = 0, k; float *A = bli_obj_buffer_at_off(a); // pointer to matrix A elements, stored in row major format float *B = bli_obj_buffer_at_off(b); // pointer to matrix B elements, stored in column major format float *C = bli_obj_buffer_at_off(c); // pointer to matrix C elements, stored in column major format float *tA = A, *tB = B, *tC = C; __m256 ymm4, ymm5, ymm6, ymm7; __m256 ymm8, ymm9, ymm10, ymm11; __m256 ymm12, ymm13, ymm14, ymm15; __m256 ymm0, ymm1, ymm2, ymm3; float result; float scratch[8] = {0.0}; const num_t dt_exec = bli_obj_dt( c ); float* restrict alpha_cast = bli_obj_buffer_for_1x1( dt_exec, alpha ); float* restrict beta_cast = bli_obj_buffer_for_1x1( dt_exec, beta ); /*Beta Zero Check*/ bool is_beta_non_zero=0; if ( !bli_obj_equals( beta, &BLIS_ZERO ) ){ is_beta_non_zero = 1; } // The non-copy version of the A^T GEMM gives better performance for the small M cases. // The threshold is controlled by BLIS_ATBN_M_THRES if (M <= BLIS_ATBN_M_THRES) { for (col_idx = 0; (col_idx + (NR - 1)) < N; col_idx += NR) { for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR) { tA = A + row_idx * lda; tB = B + col_idx * ldb; tC = C + col_idx * ldc + row_idx; // clear scratch registers. 
ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm11 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); ymm15 = _mm256_setzero_ps(); //The inner loop computes the 4x3 values of the matrix. //The computation pattern is: // ymm4 ymm5 ymm6 // ymm7 ymm8 ymm9 // ymm10 ymm11 ymm12 // ymm13 ymm14 ymm15 //The Dot operation is performed in the inner loop, 8 float elements fit //in the YMM register hence loop count incremented by 8 for (k = 0; (k + 7) < K; k += 8) { ymm0 = _mm256_loadu_ps(tB + 0); ymm1 = _mm256_loadu_ps(tB + ldb); ymm2 = _mm256_loadu_ps(tB + 2 * ldb); ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_ps(ymm1, ymm3, ymm5); ymm6 = _mm256_fmadd_ps(ymm2, ymm3, ymm6); ymm3 = _mm256_loadu_ps(tA + lda); ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); ymm3 = _mm256_loadu_ps(tA + 2 * lda); ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11); ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 3 * lda); ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); ymm14 = _mm256_fmadd_ps(ymm1, ymm3, ymm14); ymm15 = _mm256_fmadd_ps(ymm2, ymm3, ymm15); tA += 8; tB += 8; } // if K is not a multiple of 8, padding is done before load using temproary array. 
if (k < K) { int iter; float data_feeder[8] = { 0.0 }; for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; ymm0 = _mm256_loadu_ps(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + ldb]; ymm1 = _mm256_loadu_ps(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + 2 * ldb]; ymm2 = _mm256_loadu_ps(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_ps(ymm1, ymm3, ymm5); ymm6 = _mm256_fmadd_ps(ymm2, ymm3, ymm6); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11); ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); ymm14 = _mm256_fmadd_ps(ymm1, ymm3, ymm14); ymm15 = _mm256_fmadd_ps(ymm2, ymm3, ymm15); } //horizontal addition and storage of the data. 
//Results for 4x3 blocks of C is stored here ymm4 = _mm256_hadd_ps(ymm4, ymm4); ymm4 = _mm256_hadd_ps(ymm4, ymm4); _mm256_storeu_ps(scratch, ymm4); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[0] = result + tC[0] * (*beta_cast); }else{ tC[0] = result; } ymm7 = _mm256_hadd_ps(ymm7, ymm7); ymm7 = _mm256_hadd_ps(ymm7, ymm7); _mm256_storeu_ps(scratch, ymm7); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[1] = result + tC[1] * (*beta_cast); }else{ tC[1] = result; } ymm10 = _mm256_hadd_ps(ymm10, ymm10); ymm10 = _mm256_hadd_ps(ymm10, ymm10); _mm256_storeu_ps(scratch, ymm10); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[2] = result + tC[2] * (*beta_cast); }else{ tC[2] = result; } ymm13 = _mm256_hadd_ps(ymm13, ymm13); ymm13 = _mm256_hadd_ps(ymm13, ymm13); _mm256_storeu_ps(scratch, ymm13); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[3] = result + tC[3] * (*beta_cast); }else{ tC[3] = result; } tC += ldc; ymm5 = _mm256_hadd_ps(ymm5, ymm5); ymm5 = _mm256_hadd_ps(ymm5, ymm5); _mm256_storeu_ps(scratch, ymm5); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[0] = result + tC[0] * (*beta_cast); }else{ tC[0] = result; } ymm8 = _mm256_hadd_ps(ymm8, ymm8); ymm8 = _mm256_hadd_ps(ymm8, ymm8); _mm256_storeu_ps(scratch, ymm8); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[1] = result + tC[1] * (*beta_cast); }else{ tC[1] = result; } ymm11 = _mm256_hadd_ps(ymm11, ymm11); ymm11 = _mm256_hadd_ps(ymm11, ymm11); _mm256_storeu_ps(scratch, ymm11); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[2] = result + tC[2] * (*beta_cast); }else{ tC[2] = result; } ymm14 = _mm256_hadd_ps(ymm14, ymm14); ymm14 = _mm256_hadd_ps(ymm14, ymm14); _mm256_storeu_ps(scratch, ymm14); result = scratch[0] + scratch[4]; result *= (*alpha_cast); 
if(is_beta_non_zero){ tC[3] = result + tC[3] * (*beta_cast); }else{ tC[3] = result; } tC += ldc; ymm6 = _mm256_hadd_ps(ymm6, ymm6); ymm6 = _mm256_hadd_ps(ymm6, ymm6); _mm256_storeu_ps(scratch, ymm6); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[0] = result + tC[0] * (*beta_cast); }else{ tC[0] = result; } ymm9 = _mm256_hadd_ps(ymm9, ymm9); ymm9 = _mm256_hadd_ps(ymm9, ymm9); _mm256_storeu_ps(scratch, ymm9); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[1] = result + tC[1] * (*beta_cast); }else{ tC[1] = result; } ymm12 = _mm256_hadd_ps(ymm12, ymm12); ymm12 = _mm256_hadd_ps(ymm12, ymm12); _mm256_storeu_ps(scratch, ymm12); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[2] = result + tC[2] * (*beta_cast); }else{ tC[2] = result; } ymm15 = _mm256_hadd_ps(ymm15, ymm15); ymm15 = _mm256_hadd_ps(ymm15, ymm15); _mm256_storeu_ps(scratch, ymm15); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[3] = result + tC[3] * (*beta_cast); }else{ tC[3] = result; } } } int processed_col = col_idx; int processed_row = row_idx; //The edge case handling where N is not a multiple of 3 if (processed_col < N) { for (col_idx = processed_col; col_idx < N; col_idx += 1) { for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR) { tA = A + row_idx * lda; tB = B + col_idx * ldb; tC = C + col_idx * ldc + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); //The inner loop computes the 4x1 values of the matrix. 
//The computation pattern is: // ymm4 // ymm7 // ymm10 // ymm13 for (k = 0; (k + 7) < K; k += 8) { ymm0 = _mm256_loadu_ps(tB + 0); ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm3 = _mm256_loadu_ps(tA + lda); ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); ymm3 = _mm256_loadu_ps(tA + 2 * lda); ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); ymm3 = _mm256_loadu_ps(tA + 3 * lda); ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); tA += 8; tB += 8; } // if K is not a multiple of 8, padding is done before load using temproary array. if (k < K) { int iter; float data_feeder[8] = { 0.0 }; for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; ymm0 = _mm256_loadu_ps(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); } //horizontal addition and storage of the data. 
//Results for 4x1 blocks of C is stored here ymm4 = _mm256_hadd_ps(ymm4, ymm4); ymm4 = _mm256_hadd_ps(ymm4, ymm4); _mm256_storeu_ps(scratch, ymm4); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[0] = result + tC[0] * (*beta_cast); }else{ tC[0] = result; } ymm7 = _mm256_hadd_ps(ymm7, ymm7); ymm7 = _mm256_hadd_ps(ymm7, ymm7); _mm256_storeu_ps(scratch, ymm7); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[1] = result + tC[1] * (*beta_cast); }else{ tC[1] = result; } ymm10 = _mm256_hadd_ps(ymm10, ymm10); ymm10 = _mm256_hadd_ps(ymm10, ymm10); _mm256_storeu_ps(scratch, ymm10); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[2] = result + tC[2] * (*beta_cast); }else{ tC[2] = result; } ymm13 = _mm256_hadd_ps(ymm13, ymm13); ymm13 = _mm256_hadd_ps(ymm13, ymm13); _mm256_storeu_ps(scratch, ymm13); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[3] = result + tC[3] * (*beta_cast); }else{ tC[3] = result; } } } processed_row = row_idx; } //The edge case handling where M is not a multiple of 4 if (processed_row < M) { for (row_idx = processed_row; row_idx < M; row_idx += 1) { for (col_idx = 0; col_idx < N; col_idx += 1) { tA = A + row_idx * lda; tB = B + col_idx * ldb; tC = C + col_idx * ldc + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_ps(); for (k = 0; (k + 7) < K; k += 8) { ymm0 = _mm256_loadu_ps(tB + 0); ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); tA += 8; tB += 8; } // if K is not a multiple of 8, padding is done before load using temproary array. 
if (k < K) { int iter; float data_feeder[8] = { 0.0 }; for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; ymm0 = _mm256_loadu_ps(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); } //horizontal addition and storage of the data. ymm4 = _mm256_hadd_ps(ymm4, ymm4); ymm4 = _mm256_hadd_ps(ymm4, ymm4); _mm256_storeu_ps(scratch, ymm4); result = scratch[0] + scratch[4]; result *= (*alpha_cast); if(is_beta_non_zero){ tC[0] = result + tC[0] * (*beta_cast); }else{ tC[0] = result; } } } } AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return BLIS_SUCCESS; } else { AOCL_DTL_TRACE_EXIT_ERR( AOCL_DTL_LEVEL_INFO, "Invalid dimesions for small gemm." ); return BLIS_NONCONFORMAL_DIMENSIONS; } } static err_t bli_dgemm_small_atbn ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO); gint_t M = bli_obj_length( c ); // number of rows of Matrix C gint_t N = bli_obj_width( c ); // number of columns of Matrix C gint_t K = bli_obj_length( b ); // number of rows of Matrix B // The non-copy version of the A^T GEMM gives better performance for the small M cases. // The threshold is controlled by BLIS_ATBN_M_THRES if (M <= BLIS_ATBN_M_THRES) { guint_t lda = bli_obj_col_stride( a ); // column stride of matrix OP(A), where OP(A) is Transpose(A) if transA enabled. guint_t ldb = bli_obj_col_stride( b ); // column stride of matrix OP(B), where OP(B) is Transpose(B) if transB enabled. 
guint_t ldc = bli_obj_col_stride( c ); // column stride of matrix C guint_t row_idx = 0, col_idx = 0, k; double *A = bli_obj_buffer_at_off(a); // pointer to matrix A elements, stored in row major format double *B = bli_obj_buffer_at_off(b); // pointer to matrix B elements, stored in column major format double *C = bli_obj_buffer_at_off(c); // pointer to matrix C elements, stored in column major format double *tA = A, *tB = B, *tC = C; __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m256d ymm12, ymm13, ymm14, ymm15; __m256d ymm0, ymm1, ymm2, ymm3; double result; double scratch[8] = {0.0}; double *alpha_cast, *beta_cast; // alpha, beta multiples alpha_cast = bli_obj_buffer_for_1x1(BLIS_DOUBLE, alpha); beta_cast = bli_obj_buffer_for_1x1(BLIS_DOUBLE, beta); //check if beta is zero //if true, we need to perform C = alpha * (A * B) //instead of C = beta * C + alpha * (A * B) bool is_beta_non_zero = 0; if(!bli_obj_equals(beta,&BLIS_ZERO)) is_beta_non_zero = 1; for (col_idx = 0; (col_idx + (NR - 1)) < N; col_idx += NR) { for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR) { tA = A + row_idx * lda; tB = B + col_idx * ldb; tC = C + col_idx * ldc + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); //The inner loop computes the 4x3 values of the matrix. 
//The computation pattern is: // ymm4 ymm5 ymm6 // ymm7 ymm8 ymm9 // ymm10 ymm11 ymm12 // ymm13 ymm14 ymm15 //The Dot operation is performed in the inner loop, 4 double elements fit //in the YMM register hence loop count incremented by 4 for (k = 0; (k + 3) < K; k += 4) { ymm0 = _mm256_loadu_pd(tB + 0); ymm1 = _mm256_loadu_pd(tB + ldb); ymm2 = _mm256_loadu_pd(tB + 2 * ldb); ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); ymm6 = _mm256_fmadd_pd(ymm2, ymm3, ymm6); ymm3 = _mm256_loadu_pd(tA + lda); ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); ymm3 = _mm256_loadu_pd(tA + 2 * lda); ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); ymm11 = _mm256_fmadd_pd(ymm1, ymm3, ymm11); ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 3 * lda); ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14); ymm15 = _mm256_fmadd_pd(ymm2, ymm3, ymm15); tA += 4; tB += 4; } // if K is not a multiple of 4, padding is done before load using temproary array. 
if (k < K) { int iter; double data_feeder[4] = { 0.0 }; for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; ymm0 = _mm256_loadu_pd(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + ldb]; ymm1 = _mm256_loadu_pd(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + 2 * ldb]; ymm2 = _mm256_loadu_pd(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); ymm6 = _mm256_fmadd_pd(ymm2, ymm3, ymm6); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); ymm11 = _mm256_fmadd_pd(ymm1, ymm3, ymm11); ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14); ymm15 = _mm256_fmadd_pd(ymm2, ymm3, ymm15); } //horizontal addition and storage of the data. 
//Results for 4x3 blocks of C is stored here ymm4 = _mm256_hadd_pd(ymm4, ymm4); _mm256_storeu_pd(scratch, ymm4); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[0] = result + tC[0] * (*beta_cast); else tC[0] = result; ymm7 = _mm256_hadd_pd(ymm7, ymm7); _mm256_storeu_pd(scratch, ymm7); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[1] = result + tC[1] * (*beta_cast); else tC[1] = result; ymm10 = _mm256_hadd_pd(ymm10, ymm10); _mm256_storeu_pd(scratch, ymm10); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[2] = result + tC[2] * (*beta_cast); else tC[2] = result; ymm13 = _mm256_hadd_pd(ymm13, ymm13); _mm256_storeu_pd(scratch, ymm13); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[3] = result + tC[3] * (*beta_cast); else tC[3] = result; tC += ldc; ymm5 = _mm256_hadd_pd(ymm5, ymm5); _mm256_storeu_pd(scratch, ymm5); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[0] = result + tC[0] * (*beta_cast); else tC[0] = result; ymm8 = _mm256_hadd_pd(ymm8, ymm8); _mm256_storeu_pd(scratch, ymm8); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[1] = result + tC[1] * (*beta_cast); else tC[1] = result; ymm11 = _mm256_hadd_pd(ymm11, ymm11); _mm256_storeu_pd(scratch, ymm11); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[2] = result + tC[2] * (*beta_cast); else tC[2] = result; ymm14 = _mm256_hadd_pd(ymm14, ymm14); _mm256_storeu_pd(scratch, ymm14); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[3] = result + tC[3] * (*beta_cast); else tC[3] = result; tC += ldc; ymm6 = _mm256_hadd_pd(ymm6, ymm6); _mm256_storeu_pd(scratch, ymm6); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[0] = result + tC[0] * (*beta_cast); else tC[0] = result; ymm9 = _mm256_hadd_pd(ymm9, ymm9); 
_mm256_storeu_pd(scratch, ymm9); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[1] = result + tC[1] * (*beta_cast); else tC[1] = result; ymm12 = _mm256_hadd_pd(ymm12, ymm12); _mm256_storeu_pd(scratch, ymm12); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[2] = result + tC[2] * (*beta_cast); else tC[2] = result; ymm15 = _mm256_hadd_pd(ymm15, ymm15); _mm256_storeu_pd(scratch, ymm15); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[3] = result + tC[3] * (*beta_cast); else tC[3] = result; } } int processed_col = col_idx; int processed_row = row_idx; //The edge case handling where N is not a multiple of 3 if (processed_col < N) { for (col_idx = processed_col; col_idx < N; col_idx += 1) { for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR) { tA = A + row_idx * lda; tB = B + col_idx * ldb; tC = C + col_idx * ldc + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); //The inner loop computes the 4x1 values of the matrix. //The computation pattern is: // ymm4 // ymm7 // ymm10 // ymm13 for (k = 0; (k + 3) < K; k += 4) { ymm0 = _mm256_loadu_pd(tB + 0); ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm3 = _mm256_loadu_pd(tA + lda); ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); ymm3 = _mm256_loadu_pd(tA + 2 * lda); ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); ymm3 = _mm256_loadu_pd(tA + 3 * lda); ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); tA += 4; tB += 4; } // if K is not a multiple of 4, padding is done before load using temproary array. 
if (k < K) { int iter; double data_feeder[4] = { 0.0 }; for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; ymm0 = _mm256_loadu_pd(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); } //horizontal addition and storage of the data. //Results for 4x1 blocks of C is stored here ymm4 = _mm256_hadd_pd(ymm4, ymm4); _mm256_storeu_pd(scratch, ymm4); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[0] = result + tC[0] * (*beta_cast); else tC[0] = result; ymm7 = _mm256_hadd_pd(ymm7, ymm7); _mm256_storeu_pd(scratch, ymm7); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[1] = result + tC[1] * (*beta_cast); else tC[1] = result; ymm10 = _mm256_hadd_pd(ymm10, ymm10); _mm256_storeu_pd(scratch, ymm10); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[2] = result + tC[2] * (*beta_cast); else tC[2] = result; ymm13 = _mm256_hadd_pd(ymm13, ymm13); _mm256_storeu_pd(scratch, ymm13); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[3] = result + tC[3] * (*beta_cast); else tC[3] = result; } } processed_row = row_idx; } // The edge case handling where M is not a multiple of 4 if (processed_row < M) { for (row_idx = processed_row; row_idx < M; row_idx += 1) { for (col_idx = 0; col_idx < N; col_idx += 1) { tA = A + row_idx * lda; tB = B + col_idx * ldb; tC = C + 
col_idx * ldc + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_pd(); for (k = 0; (k + 3) < K; k += 4) { ymm0 = _mm256_loadu_pd(tB + 0); ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); tA += 4; tB += 4; } // if K is not a multiple of 4, padding is done before load using temproary array. if (k < K) { int iter; double data_feeder[4] = { 0.0 }; for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; ymm0 = _mm256_loadu_pd(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); } //horizontal addition and storage of the data. ymm4 = _mm256_hadd_pd(ymm4, ymm4); _mm256_storeu_pd(scratch, ymm4); result = scratch[0] + scratch[2]; result *= (*alpha_cast); if(is_beta_non_zero) tC[0] = result + tC[0] * (*beta_cast); else tC[0] = result; } } } AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO); return BLIS_SUCCESS; } else { AOCL_DTL_TRACE_EXIT_ERR( AOCL_DTL_LEVEL_INFO, "Invalid dimesions for small gemm." ); return BLIS_NONCONFORMAL_DIMENSIONS; } } #endif cython-blis-0.9.1/blis/_src/kernels/zen/3/bli_gemmt_small.c000066400000000000000000005140621427272030600235420ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name of The University of Texas at Austin nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "immintrin.h" #include "xmmintrin.h" #include "blis.h" #ifdef BLIS_ENABLE_SMALL_MATRIX #define MR 32 #define D_MR (MR >> 1) #define NR 3 #define BLIS_ENABLE_PREFETCH #define F_SCRATCH_DIM (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES) static float A_pack[F_SCRATCH_DIM] __attribute__((aligned(64))); static float C_pack[F_SCRATCH_DIM] __attribute__((aligned(64))); #define D_BLIS_SMALL_MATRIX_THRES (BLIS_SMALL_MATRIX_THRES / 2 ) #define D_BLIS_SMALL_M_RECT_MATRIX_THRES (BLIS_SMALL_M_RECT_MATRIX_THRES / 2) #define D_BLIS_SMALL_K_RECT_MATRIX_THRES (BLIS_SMALL_K_RECT_MATRIX_THRES / 2) #define D_SCRATCH_DIM (D_BLIS_SMALL_MATRIX_THRES * D_BLIS_SMALL_MATRIX_THRES) static double D_A_pack[D_SCRATCH_DIM] __attribute__((aligned(64))); static double D_C_pack[D_SCRATCH_DIM] __attribute__((aligned(64))); #define BLIS_ATBN_M_THRES 40 // Threshold value of M for/below which small matrix code is called. #define AT_MR 4 // The kernel dimension of the A transpose GEMMT kernel.(AT_MR * NR). 
static err_t bli_sgemmt_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); static err_t bli_dgemmt_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); static err_t bli_sgemmt_small_atbn ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); static err_t bli_dgemmt_small_atbn ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ); /* * The bli_gemmt_small function will use the * custom MRxNR kernels, to perform the computation. * The custom kernels are used if the [M * N] < 240 * 240 */ err_t bli_gemmt_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { // FGVZ: This code was originally in bli_gemmt_front(). However, it really // fits more naturally here within the bli_gemmt_small() function. This // becomes a bit more obvious now that the code is here, as it contains // cpp macros such as BLIS_SMALL_MATRIX_A_THRES_M_GEMMT, which are specific // to this implementation. if ( bli_obj_has_trans( a ) ) { // Continue with small implementation. ; } else if ( ( bli_obj_length( a ) <= BLIS_SMALL_MATRIX_A_THRES_M_GEMMT && bli_obj_width( a ) < BLIS_SMALL_MATRIX_A_THRES_N_GEMMT ) || ( bli_obj_length( a ) < BLIS_SMALL_MATRIX_A_THRES_M_GEMMT && bli_obj_width( a ) <= BLIS_SMALL_MATRIX_A_THRES_N_GEMMT ) ) { // Continue with small implementation. ; } else { // Reject the problem and return to large code path. return BLIS_FAILURE; } #ifdef BLIS_ENABLE_MULTITHREADING return BLIS_NOT_YET_IMPLEMENTED; #endif // If alpha is zero, scale by beta and return. if (bli_obj_equals(alpha, &BLIS_ZERO)) { return BLIS_NOT_YET_IMPLEMENTED; } // if row major format return. 
if ((bli_obj_row_stride( a ) != 1) || (bli_obj_row_stride( b ) != 1) || (bli_obj_row_stride( c ) != 1)) { return BLIS_INVALID_ROW_STRIDE; } num_t dt = ((*c).info & (0x7 << 0)); if (bli_obj_has_trans( a )) { if (bli_obj_has_notrans( b )) { if (dt == BLIS_FLOAT) { return bli_sgemmt_small_atbn(alpha, a, b, beta, c, cntx, cntl); } else if (dt == BLIS_DOUBLE) { return bli_dgemmt_small_atbn(alpha, a, b, beta, c, cntx, cntl); } } return BLIS_NOT_YET_IMPLEMENTED; } if (dt == BLIS_DOUBLE) { return bli_dgemmt_small(alpha, a, b, beta, c, cntx, cntl); } if (dt == BLIS_FLOAT) { return bli_sgemmt_small(alpha, a, b, beta, c, cntx, cntl); } return BLIS_NOT_YET_IMPLEMENTED; }; static err_t bli_sgemmt_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { int M = bli_obj_length( c ); // number of rows of Matrix C int N = bli_obj_width( c ); // number of columns of Matrix C int K = bli_obj_width( a ); // number of columns of OP(A), will be updated if OP(A) is Transpose(A) . int L = M * N; if ((((L) < (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES)) || ((M < BLIS_SMALL_M_RECT_MATRIX_THRES) && (K < BLIS_SMALL_K_RECT_MATRIX_THRES))) && ((L!=0) && (K!=0))) { int lda = bli_obj_col_stride(a); // column stride of matrix OP(A), where OP(A) is Transpose(A) if transA enabled. int ldb = bli_obj_col_stride(b); // column stride of matrix OP(B), where OP(B) is Transpose(B) if transB enabled. 
int ldc_matC = bli_obj_col_stride( c ); // column stride of matrix C int ldc = M;//bli_obj_col_stride( c ); // column stride of static buffer for matrix C int row_idx, col_idx, k; int rs_matC = bli_obj_row_stride( c ); int rsc = 1; float *A = a->buffer; // pointer to elements of Matrix A float *B = b->buffer; // pointer to elements of Matrix B float *C = C_pack; // pointer to elements of Matrix C float *matCbuf = c->buffer; float *tA = A, *tB = B, *tC = C;//, *tA_pack; float *tA_packed; // temprorary pointer to hold packed A memory pointer int row_idx_packed; //packed A memory row index int lda_packed; //lda of packed A int col_idx_start; //starting index after A matrix is packed. dim_t tb_inc_row = 1; // row stride of matrix B dim_t tb_inc_col = ldb; // column stride of matrix B __m256 ymm4, ymm5, ymm6, ymm7; __m256 ymm8, ymm9, ymm10, ymm11; __m256 ymm12, ymm13, ymm14, ymm15; __m256 ymm0, ymm1, ymm2, ymm3; int n_remainder; // If the N is non multiple of 3.(N%3) int m_remainder; // If the M is non multiple of 32.(M%32) float *alpha_cast, *beta_cast; // alpha, beta multiples alpha_cast = (alpha->buffer); beta_cast = (beta->buffer); int required_packing_A = 1; // when N is equal to 1 call GEMV instead of GEMMT if (N == 1) { bli_gemv ( alpha, a, b, beta, c ); return BLIS_SUCCESS; } //update the pointer math if matrix B needs to be transposed. if (bli_obj_has_trans( b )) { tb_inc_col = 1; //switch row and column strides tb_inc_row = ldb; } if ((N <= 3) || ((MR * K) > F_SCRATCH_DIM)) { required_packing_A = 0; } /* * The computation loop runs for MRxN columns of C matrix, thus * accessing the MRxK A matrix data and KxNR B matrix data. * The computation is organized as inner loops of dimension MRxNR. */ // Process MR rows of C matrix at a time. for (row_idx = 0; (row_idx + (MR - 1)) < M; row_idx += MR) { col_idx_start = 0; tA_packed = A; row_idx_packed = row_idx; lda_packed = lda; // This is the part of the pack and compute optimization. 
// During the first column iteration, we store the accessed A matrix into // contiguous static memory. This helps to keep te A matrix in Cache and // aviods the TLB misses. if (required_packing_A) { col_idx = 0; //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; tA_packed = A_pack; #if 0//def BLIS_ENABLE_PREFETCH _mm_prefetch((char*)(tC + 0), _MM_HINT_T0); _mm_prefetch((char*)(tC + 16), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc + 16), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc + 16), _MM_HINT_T0); #endif // clear scratch registers. ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm11 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); ymm15 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. // This loop is processing MR x K ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_ps(tA); _mm256_storeu_ps(tA_packed, ymm3); // the packing of matrix A // ymm4 += ymm0 * ymm3; ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); // ymm8 += ymm1 * ymm3; ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); // ymm12 += ymm2 * ymm3; ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 8); _mm256_storeu_ps(tA_packed + 8, ymm3); // the packing of matrix A // ymm5 += ymm0 * ymm3; ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); // ymm9 += ymm1 * ymm3; ymm9 = _mm256_fmadd_ps(ymm1, ymm3, ymm9); // ymm13 += ymm2 * ymm3; ymm13 = _mm256_fmadd_ps(ymm2, ymm3, ymm13); ymm3 = _mm256_loadu_ps(tA + 16); _mm256_storeu_ps(tA_packed + 16, ymm3); // the packing of matrix A // ymm6 += ymm0 * ymm3; ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6); // ymm10 += ymm1 * ymm3; ymm10 = _mm256_fmadd_ps(ymm1, ymm3, ymm10); // ymm14 += ymm2 * ymm3; ymm14 = _mm256_fmadd_ps(ymm2, ymm3, ymm14); ymm3 = _mm256_loadu_ps(tA + 24); _mm256_storeu_ps(tA_packed + 24, ymm3); // the packing of matrix A // ymm7 += ymm0 * ymm3; ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); // ymm11 += ymm1 * ymm3; ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11); // ymm15 += ymm2 * ymm3; ymm15 = _mm256_fmadd_ps(ymm2, ymm3, ymm15); tA += lda; tA_packed += MR; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm6 = _mm256_mul_ps(ymm6, ymm0); ymm7 = _mm256_mul_ps(ymm7, ymm0); ymm8 = _mm256_mul_ps(ymm8, ymm0); ymm9 = _mm256_mul_ps(ymm9, ymm0); ymm10 = _mm256_mul_ps(ymm10, ymm0); ymm11 = _mm256_mul_ps(ymm11, ymm0); ymm12 = _mm256_mul_ps(ymm12, ymm0); ymm13 = _mm256_mul_ps(ymm13, ymm0); ymm14 = _mm256_mul_ps(ymm14, ymm0); ymm15 = _mm256_mul_ps(ymm15, ymm0); // multiply C by beta and accumulate col 1. 
/*ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_ps(tC + 8); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); ymm2 = _mm256_loadu_ps(tC + 16); ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_ps(tC + 24); ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7);*/ _mm256_storeu_ps(tC, ymm4); _mm256_storeu_ps(tC + 8, ymm5); _mm256_storeu_ps(tC + 16, ymm6); _mm256_storeu_ps(tC + 24, ymm7); // multiply C by beta and accumulate, col 2. tC += ldc; /*ymm2 = _mm256_loadu_ps(tC); ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_ps(tC + 8); ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_ps(tC + 16); ymm10 = _mm256_fmadd_ps(ymm2, ymm1, ymm10); ymm2 = _mm256_loadu_ps(tC + 24); ymm11 = _mm256_fmadd_ps(ymm2, ymm1, ymm11);*/ _mm256_storeu_ps(tC, ymm8); _mm256_storeu_ps(tC + 8, ymm9); _mm256_storeu_ps(tC + 16, ymm10); _mm256_storeu_ps(tC + 24, ymm11); // multiply C by beta and accumulate, col 3. tC += ldc; /*ymm2 = _mm256_loadu_ps(tC); ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_ps(tC + 8); ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_ps(tC + 16); ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_ps(tC + 24); ymm15 = _mm256_fmadd_ps(ymm2, ymm1, ymm15);*/ _mm256_storeu_ps(tC, ymm12); _mm256_storeu_ps(tC + 8, ymm13); _mm256_storeu_ps(tC + 16, ymm14); _mm256_storeu_ps(tC + 24, ymm15); // modify the pointer arithematic to use packed A matrix. col_idx_start = NR; tA_packed = A_pack; row_idx_packed = 0; lda_packed = MR; } // Process NR columns of C matrix at a time. 
for (col_idx = col_idx_start; (col_idx + (NR - 1)) < N; col_idx += NR) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = tA_packed + row_idx_packed; #if 0//def BLIS_ENABLE_PREFETCH _mm_prefetch((char*)(tC + 0), _MM_HINT_T0); _mm_prefetch((char*)(tC + 16), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc + 16), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc + 16), _MM_HINT_T0); #endif // clear scratch registers. ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm11 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); ymm15 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. // This loop is processing MR x K ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_ps(tA); // ymm4 += ymm0 * ymm3; ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); // ymm8 += ymm1 * ymm3; ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); // ymm12 += ymm2 * ymm3; ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 8); // ymm5 += ymm0 * ymm3; ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); // ymm9 += ymm1 * ymm3; ymm9 = _mm256_fmadd_ps(ymm1, ymm3, ymm9); // ymm13 += ymm2 * ymm3; ymm13 = _mm256_fmadd_ps(ymm2, ymm3, ymm13); ymm3 = _mm256_loadu_ps(tA + 16); // ymm6 += ymm0 * ymm3; ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6); // ymm10 += ymm1 * ymm3; ymm10 = _mm256_fmadd_ps(ymm1, ymm3, ymm10); // ymm14 += ymm2 * ymm3; ymm14 = _mm256_fmadd_ps(ymm2, ymm3, ymm14); ymm3 = _mm256_loadu_ps(tA + 24); // ymm7 += ymm0 * ymm3; ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); // ymm11 += ymm1 * ymm3; ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11); // ymm15 += ymm2 * ymm3; ymm15 = _mm256_fmadd_ps(ymm2, ymm3, ymm15); tA += lda_packed; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm6 = _mm256_mul_ps(ymm6, ymm0); ymm7 = _mm256_mul_ps(ymm7, ymm0); ymm8 = _mm256_mul_ps(ymm8, ymm0); ymm9 = _mm256_mul_ps(ymm9, ymm0); ymm10 = _mm256_mul_ps(ymm10, ymm0); ymm11 = _mm256_mul_ps(ymm11, ymm0); ymm12 = _mm256_mul_ps(ymm12, ymm0); ymm13 = _mm256_mul_ps(ymm13, ymm0); ymm14 = _mm256_mul_ps(ymm14, ymm0); ymm15 = _mm256_mul_ps(ymm15, ymm0); // multiply C by beta and accumulate col 1. 
/*ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_ps(tC + 8); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); ymm2 = _mm256_loadu_ps(tC + 16); ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_ps(tC + 24); ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7);*/ _mm256_storeu_ps(tC, ymm4); _mm256_storeu_ps(tC + 8, ymm5); _mm256_storeu_ps(tC + 16, ymm6); _mm256_storeu_ps(tC + 24, ymm7); // multiply C by beta and accumulate, col 2. tC += ldc; /*ymm2 = _mm256_loadu_ps(tC); ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_ps(tC + 8); ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_ps(tC + 16); ymm10 = _mm256_fmadd_ps(ymm2, ymm1, ymm10); ymm2 = _mm256_loadu_ps(tC + 24); ymm11 = _mm256_fmadd_ps(ymm2, ymm1, ymm11);*/ _mm256_storeu_ps(tC, ymm8); _mm256_storeu_ps(tC + 8, ymm9); _mm256_storeu_ps(tC + 16, ymm10); _mm256_storeu_ps(tC + 24, ymm11); // multiply C by beta and accumulate, col 3. tC += ldc; /*ymm2 = _mm256_loadu_ps(tC); ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_ps(tC + 8); ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_ps(tC + 16); ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_ps(tC + 24); ymm15 = _mm256_fmadd_ps(ymm2, ymm1, ymm15);*/ _mm256_storeu_ps(tC, ymm12); _mm256_storeu_ps(tC + 8, ymm13); _mm256_storeu_ps(tC + 16, ymm14); _mm256_storeu_ps(tC + 24, ymm15); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. 
ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm11 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); ymm15 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm8 = _mm256_fmadd_ps(ymm0, ymm3, ymm8); ymm12 = _mm256_fmadd_ps(ymm1, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 8); ymm9 = _mm256_fmadd_ps(ymm0, ymm3, ymm9); ymm13 = _mm256_fmadd_ps(ymm1, ymm3, ymm13); ymm3 = _mm256_loadu_ps(tA + 16); ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); ymm14 = _mm256_fmadd_ps(ymm1, ymm3, ymm14); ymm3 = _mm256_loadu_ps(tA + 24); ymm11 = _mm256_fmadd_ps(ymm0, ymm3, ymm11); ymm15 = _mm256_fmadd_ps(ymm1, ymm3, ymm15); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); //multiply A*B by alpha. ymm8 = _mm256_mul_ps(ymm8, ymm0); ymm9 = _mm256_mul_ps(ymm9, ymm0); ymm10 = _mm256_mul_ps(ymm10, ymm0); ymm11 = _mm256_mul_ps(ymm11, ymm0); ymm12 = _mm256_mul_ps(ymm12, ymm0); ymm13 = _mm256_mul_ps(ymm13, ymm0); ymm14 = _mm256_mul_ps(ymm14, ymm0); ymm15 = _mm256_mul_ps(ymm15, ymm0); // multiply C by beta and accumulate, col 1. /*ymm2 = _mm256_loadu_ps(tC + 0); ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_ps(tC + 8); ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_ps(tC + 16); ymm10 = _mm256_fmadd_ps(ymm2, ymm1, ymm10); ymm2 = _mm256_loadu_ps(tC + 24); ymm11 = _mm256_fmadd_ps(ymm2, ymm1, ymm11);*/ _mm256_storeu_ps(tC + 0, ymm8); _mm256_storeu_ps(tC + 8, ymm9); _mm256_storeu_ps(tC + 16, ymm10); _mm256_storeu_ps(tC + 24, ymm11); // multiply C by beta and accumulate, col 2. 
tC += ldc; /*ymm2 = _mm256_loadu_ps(tC); ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_ps(tC + 8); ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_ps(tC + 16); ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_ps(tC + 24); ymm15 = _mm256_fmadd_ps(ymm2, ymm1, ymm15);*/ _mm256_storeu_ps(tC, ymm12); _mm256_storeu_ps(tC + 8, ymm13); _mm256_storeu_ps(tC + 16, ymm14); _mm256_storeu_ps(tC + 24, ymm15); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); ymm15 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm12 = _mm256_fmadd_ps(ymm0, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 8); ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); ymm3 = _mm256_loadu_ps(tA + 16); ymm14 = _mm256_fmadd_ps(ymm0, ymm3, ymm14); ymm3 = _mm256_loadu_ps(tA + 24); ymm15 = _mm256_fmadd_ps(ymm0, ymm3, ymm15); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); //multiply A*B by alpha. ymm12 = _mm256_mul_ps(ymm12, ymm0); ymm13 = _mm256_mul_ps(ymm13, ymm0); ymm14 = _mm256_mul_ps(ymm14, ymm0); ymm15 = _mm256_mul_ps(ymm15, ymm0); // multiply C by beta and accumulate. 
/*ymm2 = _mm256_loadu_ps(tC + 0); ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_ps(tC + 8); ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_ps(tC + 16); ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_ps(tC + 24); ymm15 = _mm256_fmadd_ps(ymm2, ymm1, ymm15);*/ _mm256_storeu_ps(tC + 0, ymm12); _mm256_storeu_ps(tC + 8, ymm13); _mm256_storeu_ps(tC + 16, ymm14); _mm256_storeu_ps(tC + 24, ymm15); } } m_remainder = M - row_idx; if (m_remainder >= 24) { m_remainder -= 24; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_ps(tA); // ymm4 += ymm0 * ymm3; ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); // ymm8 += ymm1 * ymm3; ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); // ymm12 += ymm2 * ymm3; ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 8); // ymm5 += ymm0 * ymm3; ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); // ymm9 += ymm1 * ymm3; ymm9 = _mm256_fmadd_ps(ymm1, ymm3, ymm9); // ymm13 += ymm2 * ymm3; ymm13 = _mm256_fmadd_ps(ymm2, ymm3, ymm13); ymm3 = _mm256_loadu_ps(tA + 16); // ymm6 += ymm0 * ymm3; ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6); // ymm10 += ymm1 * ymm3; ymm10 = _mm256_fmadd_ps(ymm1, ymm3, ymm10); // ymm14 += ymm2 * ymm3; ymm14 = _mm256_fmadd_ps(ymm2, ymm3, ymm14); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm6 = _mm256_mul_ps(ymm6, ymm0); ymm8 = _mm256_mul_ps(ymm8, ymm0); ymm9 = _mm256_mul_ps(ymm9, ymm0); ymm10 = _mm256_mul_ps(ymm10, ymm0); ymm12 = _mm256_mul_ps(ymm12, ymm0); ymm13 = _mm256_mul_ps(ymm13, ymm0); ymm14 = _mm256_mul_ps(ymm14, ymm0); // multiply C by beta and accumulate. /*ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_ps(tC + 8); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5); ymm2 = _mm256_loadu_ps(tC + 16); ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6);*/ _mm256_storeu_ps(tC, ymm4); _mm256_storeu_ps(tC + 8, ymm5); _mm256_storeu_ps(tC + 16, ymm6); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_ps(tC); ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_ps(tC + 8); ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_ps(tC + 16); ymm10 = _mm256_fmadd_ps(ymm2, ymm1, ymm10);*/ _mm256_storeu_ps(tC, ymm8); _mm256_storeu_ps(tC + 8, ymm9); _mm256_storeu_ps(tC + 16, ymm10); // multiply C by beta and accumulate. 
tC += ldc; /*ymm2 = _mm256_loadu_ps(tC); ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_ps(tC + 8); ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_ps(tC + 16); ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14);*/ _mm256_storeu_ps(tC, ymm12); _mm256_storeu_ps(tC + 8, ymm13); _mm256_storeu_ps(tC + 16, ymm14); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm8 = _mm256_fmadd_ps(ymm0, ymm3, ymm8); ymm12 = _mm256_fmadd_ps(ymm1, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 8); ymm9 = _mm256_fmadd_ps(ymm0, ymm3, ymm9); ymm13 = _mm256_fmadd_ps(ymm1, ymm3, ymm13); ymm3 = _mm256_loadu_ps(tA + 16); ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); ymm14 = _mm256_fmadd_ps(ymm1, ymm3, ymm14); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); //multiply A*B by alpha. ymm8 = _mm256_mul_ps(ymm8, ymm0); ymm9 = _mm256_mul_ps(ymm9, ymm0); ymm10 = _mm256_mul_ps(ymm10, ymm0); ymm12 = _mm256_mul_ps(ymm12, ymm0); ymm13 = _mm256_mul_ps(ymm13, ymm0); ymm14 = _mm256_mul_ps(ymm14, ymm0); // multiply C by beta and accumulate. 
/*ymm2 = _mm256_loadu_ps(tC + 0); ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_ps(tC + 8); ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_ps(tC + 16); ymm10 = _mm256_fmadd_ps(ymm2, ymm1, ymm10);*/ _mm256_storeu_ps(tC + 0, ymm8); _mm256_storeu_ps(tC + 8, ymm9); _mm256_storeu_ps(tC + 16, ymm10); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_ps(tC); ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_ps(tC + 8); ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_ps(tC + 16); ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14);*/ _mm256_storeu_ps(tC, ymm12); _mm256_storeu_ps(tC + 8, ymm13); _mm256_storeu_ps(tC + 16, ymm14); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm12 = _mm256_fmadd_ps(ymm0, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 8); ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); ymm3 = _mm256_loadu_ps(tA + 16); ymm14 = _mm256_fmadd_ps(ymm0, ymm3, ymm14); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); //multiply A*B by alpha. ymm12 = _mm256_mul_ps(ymm12, ymm0); ymm13 = _mm256_mul_ps(ymm13, ymm0); ymm14 = _mm256_mul_ps(ymm14, ymm0); // multiply C by beta and accumulate. 
/*ymm2 = _mm256_loadu_ps(tC + 0); ymm12 = _mm256_fmadd_ps(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_ps(tC + 8); ymm13 = _mm256_fmadd_ps(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_ps(tC + 16); ymm14 = _mm256_fmadd_ps(ymm2, ymm1, ymm14);*/ _mm256_storeu_ps(tC + 0, ymm12); _mm256_storeu_ps(tC + 8, ymm13); _mm256_storeu_ps(tC + 16, ymm14); } row_idx += 24; } if (m_remainder >= 16) { m_remainder -= 16; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm6 = _mm256_fmadd_ps(ymm1, ymm3, ymm6); ymm8 = _mm256_fmadd_ps(ymm2, ymm3, ymm8); ymm3 = _mm256_loadu_ps(tA + 8); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm6 = _mm256_mul_ps(ymm6, ymm0); ymm7 = _mm256_mul_ps(ymm7, ymm0); ymm8 = _mm256_mul_ps(ymm8, ymm0); ymm9 = _mm256_mul_ps(ymm9, ymm0); // multiply C by beta and accumulate. 
/*ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_ps(tC + 8); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ _mm256_storeu_ps(tC, ymm4); _mm256_storeu_ps(tC + 8, ymm5); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_ps(tC); ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_ps(tC + 8); ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7);*/ _mm256_storeu_ps(tC, ymm6); _mm256_storeu_ps(tC + 8, ymm7); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_ps(tC); ymm8 = _mm256_fmadd_ps(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_ps(tC + 8); ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9);*/ _mm256_storeu_ps(tC, ymm8); _mm256_storeu_ps(tC + 8, ymm9); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm6 = _mm256_fmadd_ps(ymm1, ymm3, ymm6); ymm3 = _mm256_loadu_ps(tA + 8); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm6 = _mm256_mul_ps(ymm6, ymm0); ymm7 = _mm256_mul_ps(ymm7, ymm0); // multiply C by beta and accumulate. 
/*ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_ps(tC + 8); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ _mm256_storeu_ps(tC, ymm4); _mm256_storeu_ps(tC + 8, ymm5); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_ps(tC); ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_ps(tC + 8); ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7);*/ _mm256_storeu_ps(tC, ymm6); _mm256_storeu_ps(tC + 8, ymm7); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm3 = _mm256_loadu_ps(tA + 8); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); // multiply C by beta and accumulate. /*ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_ps(tC + 8); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ _mm256_storeu_ps(tC, ymm4); _mm256_storeu_ps(tC + 8, ymm5); } row_idx += 16; } if (m_remainder >= 8) { m_remainder -= 8; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. 
ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_ps(ymm1, ymm3, ymm5); ymm6 = _mm256_fmadd_ps(ymm2, ymm3, ymm6); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm6 = _mm256_mul_ps(ymm6, ymm0); // multiply C by beta and accumulate. /*ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4);*/ _mm256_storeu_ps(tC, ymm4); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_ps(tC); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ _mm256_storeu_ps(tC, ymm5); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_ps(tC); ymm6 = _mm256_fmadd_ps(ymm2, ymm1, ymm6);*/ _mm256_storeu_ps(tC, ymm6); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_ps(ymm1, ymm3, ymm5); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_ps(ymm4, ymm0); ymm5 = _mm256_mul_ps(ymm5, ymm0); // multiply C by beta and accumulate. /*ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4);*/ _mm256_storeu_ps(tC, ymm4); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_ps(tC); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ _mm256_storeu_ps(tC, ymm5); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm4 = _mm256_setzero_ps(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); ymm4 = _mm256_mul_ps(ymm4, ymm0); // multiply C by beta and accumulate. /*ymm2 = _mm256_loadu_ps(tC); ymm4 = _mm256_fmadd_ps(ymm2, ymm1, ymm4);*/ _mm256_storeu_ps(tC, ymm4); } row_idx += 8; } // M is not a multiple of 32. // The handling of edge case where the remainder // dimension is less than 8. The padding takes place // to handle this case. if ((m_remainder) && (lda > 7)) { float f_temp[8]; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. 
ymm5 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); for (k = 0; k < (K - 1); ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_ps(tA); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_ss(tB + tb_inc_col * 2); tB += tb_inc_row; for (int i = 0; i < m_remainder; i++) { f_temp[i] = tA[i]; } ymm3 = _mm256_loadu_ps(f_temp); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); //multiply A*B by alpha. 
ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm7 = _mm256_mul_ps(ymm7, ymm0); ymm9 = _mm256_mul_ps(ymm9, ymm0); /*for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_ps(f_temp); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ _mm256_storeu_ps(f_temp, ymm5); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } tC += ldc; /*for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_ps(f_temp); ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7);*/ _mm256_storeu_ps(f_temp, ymm7); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } tC += ldc; /*for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_ps(f_temp); ymm9 = _mm256_fmadd_ps(ymm2, ymm1, ymm9);*/ _mm256_storeu_ps(f_temp, ymm9); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm5 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); for (k = 0; k < (K - 1); ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. 
ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); tB += tb_inc_row; ymm3 = _mm256_loadu_ps(tA); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); tA += lda; } ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_ss(tB + tb_inc_col * 1); tB += tb_inc_row; for (int i = 0; i < m_remainder; i++) { f_temp[i] = tA[i]; } ymm3 = _mm256_loadu_ps(f_temp); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7); ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); ymm5 = _mm256_mul_ps(ymm5, ymm0); ymm7 = _mm256_mul_ps(ymm7, ymm0); /*for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_ps(f_temp); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ _mm256_storeu_ps(f_temp, ymm5); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } tC += ldc; /*for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_ps(f_temp); ymm7 = _mm256_fmadd_ps(ymm2, ymm1, ymm7);*/ _mm256_storeu_ps(f_temp, ymm7); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm5 = _mm256_setzero_ps(); for (k = 0; k < (K - 1); ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); tB += tb_inc_row; ymm3 = _mm256_loadu_ps(tA); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); tA += lda; } ymm0 = _mm256_broadcast_ss(tB + tb_inc_col * 0); tB += tb_inc_row; for (int i = 0; i < m_remainder; i++) { f_temp[i] = tA[i]; } ymm3 = _mm256_loadu_ps(f_temp); ymm5 = _mm256_fmadd_ps(ymm0, ymm3, ymm5); ymm0 = _mm256_broadcast_ss(alpha_cast); //ymm1 = _mm256_broadcast_ss(beta_cast); // multiply C by beta and accumulate. 
ymm5 = _mm256_mul_ps(ymm5, ymm0); /*for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_ps(f_temp); ymm5 = _mm256_fmadd_ps(ymm2, ymm1, ymm5);*/ _mm256_storeu_ps(f_temp, ymm5); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } } m_remainder = 0; } if (m_remainder) { float result; for (; row_idx < M; row_idx += 1) { for (col_idx = 0; col_idx < N; col_idx += 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; result = 0; for (k = 0; k < K; ++k) { result += (*tA) * (*tB); tA += lda; tB += tb_inc_row; } result *= (*alpha_cast); (*tC) = /*(*tC) * (*beta_cast) + */result; } } } //copy/compute sryk values back to C using SIMD if ( bli_seq0( *beta_cast ) ) {//just copy in case of beta = 0 dim_t _i, _j, k, _l; if(bli_obj_is_lower(c)) // c is lower { //first column _j = 0; k = M >> 3; _i = 0; for ( _l = 0; _l < k; _l++ ) { ymm0 = _mm256_loadu_ps((C + _i*rsc)); _mm256_storeu_ps((matCbuf + _i*rs_matC), ymm0); _i += 8; } while (_i < M ) { bli_sscopys( *(C + _i*rsc + _j*ldc), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); _i++; } _j++; while ( _j < N ) //next column { //k = (_j + (8 - (_j & 7))); _l = _j & 7; k = (_l != 0) ? (_j + (8 - _l)) : _j; k = (k <= M) ? 
k : M; for ( _i = _j; _i < k; ++_i ) { bli_sscopys( *(C + _i*rsc + _j*ldc), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); } k = (M - _i) >> 3; _l = 0; while ( _l < k ) { ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc)); _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); _i += 8; _l++; } while (_i < M ) { bli_sscopys( *(C + _i*rsc + _j*ldc), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); _i++; } _j++; } } else //c is upper { for ( _j = 0; _j < N; ++_j ) { k = (_j + 1) >> 3; _i = 0; _l = 0; while ( _l < k ) { ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc)); _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); _i += 8; _l++; } while (_i <= _j ) { bli_sscopys( *(C + _i*rsc + _j*ldc), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); ++_i; } } } } else {//when beta is non-zero, fmadd and store the results dim_t _i, _j, k, _l; ymm1 = _mm256_broadcast_ss(beta_cast); if(bli_obj_is_lower(c)) //c is lower { //first column _j = 0; k = M >> 3; _i = 0; for ( _l = 0; _l < k; _l++ ) { ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC)); ymm0 = _mm256_loadu_ps((C + _i*rsc)); ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0); _mm256_storeu_ps((matCbuf + _i*rs_matC), ymm0); _i += 8; } while (_i < M ) { bli_sssxpbys( *(C + _i*rsc + _j*ldc), *(beta_cast), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); _i++; } _j++; while ( _j < N ) //next column { //k = (_j + (8 - (_j & 7))); _l = _j & 7; k = (_l != 0) ? (_j + (8 - _l)) : _j; k = (k <= M) ? 
k : M; for ( _i = _j; _i < k; ++_i ) { bli_sssxpbys( *(C + _i*rsc + _j*ldc), *(beta_cast), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); } k = (M - _i) >> 3; _l = 0; while ( _l < k ) { ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0); _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); _i += 8; _l++; } while (_i < M ) { bli_sssxpbys( *(C + _i*rsc + _j*ldc), *(beta_cast), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); _i++; } _j++; } } else //c is upper { for ( _j = 0; _j < N; ++_j ) { k = (_j + 1) >> 3; _i = 0; _l = 0; while ( _l < k ) { ymm2 = _mm256_loadu_ps((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_ps((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_ps(ymm2, ymm1, ymm0); _mm256_storeu_ps((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); _i += 8; _l++; } while (_i <= _j ) { bli_sssxpbys( *(C + _i*rsc + _j*ldc), *(beta_cast), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); ++_i; } } } } return BLIS_SUCCESS; } else return BLIS_NONCONFORMAL_DIMENSIONS; }; static err_t bli_dgemmt_small ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { int M = bli_obj_length( c ); // number of rows of Matrix C int N = bli_obj_width( c ); // number of columns of Matrix C int K = bli_obj_width( a ); // number of columns of OP(A), will be updated if OP(A) is Transpose(A) . int L = M * N; // If alpha is zero, scale by beta and return. if ((((L) < (D_BLIS_SMALL_MATRIX_THRES * D_BLIS_SMALL_MATRIX_THRES)) || ((M < D_BLIS_SMALL_M_RECT_MATRIX_THRES) && (K < D_BLIS_SMALL_K_RECT_MATRIX_THRES))) && ((L!=0) && (K!=0))) { int lda = bli_obj_col_stride( a ); // column stride of matrix OP(A), where OP(A) is Transpose(A) if transA enabled. int ldb = bli_obj_col_stride( b ); // column stride of matrix OP(B), where OP(B) is Transpose(B) if transB enabled. 
int ldc_matC = bli_obj_col_stride( c ); // column stride of matrix C int ldc = M;//bli_obj_col_stride( c ); // column stride of static buffer for matrix C int row_idx, col_idx, k; int rs_matC = bli_obj_row_stride( c ); int rsc = 1; double *A = a->buffer; // pointer to elements of Matrix A double *B = b->buffer; // pointer to elements of Matrix B double *C = D_C_pack; // pointer to elements of Matrix C double *matCbuf = c->buffer; double *tA = A, *tB = B, *tC = C;//, *tA_pack; double *tA_packed; // temprorary pointer to hold packed A memory pointer int row_idx_packed; //packed A memory row index int lda_packed; //lda of packed A int col_idx_start; //starting index after A matrix is packed. dim_t tb_inc_row = 1; // row stride of matrix B dim_t tb_inc_col = ldb; // column stride of matrix B __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m256d ymm12, ymm13, ymm14, ymm15; __m256d ymm0, ymm1, ymm2, ymm3; int n_remainder; // If the N is non multiple of 3.(N%3) int m_remainder; // If the M is non multiple of 16.(M%16) double *alpha_cast, *beta_cast; // alpha, beta multiples alpha_cast = (alpha->buffer); beta_cast = (beta->buffer); int required_packing_A = 1; // when N is equal to 1 call GEMV instead of GEMMT if (N == 1) { bli_gemv ( alpha, a, b, beta, c ); return BLIS_SUCCESS; } //update the pointer math if matrix B needs to be transposed. if (bli_obj_has_trans( b )) { tb_inc_col = 1; //switch row and column strides tb_inc_row = ldb; } if ((N <= 3) || ((D_MR * K) > D_SCRATCH_DIM)) { required_packing_A = 0; } /* * The computation loop runs for D_MRxN columns of C matrix, thus * accessing the D_MRxK A matrix data and KxNR B matrix data. * The computation is organized as inner loops of dimension D_MRxNR. */ // Process D_MR rows of C matrix at a time. for (row_idx = 0; (row_idx + (D_MR - 1)) < M; row_idx += D_MR) { col_idx_start = 0; tA_packed = A; row_idx_packed = row_idx; lda_packed = lda; // This is the part of the pack and compute optimization. 
// During the first column iteration, we store the accessed A matrix into // contiguous static memory. This helps to keep te A matrix in Cache and // aviods the TLB misses. if (required_packing_A) { col_idx = 0; //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; tA_packed = D_A_pack; #if 0//def BLIS_ENABLE_PREFETCH _mm_prefetch((char*)(tC + 0), _MM_HINT_T0); _mm_prefetch((char*)(tC + 8), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc + 8), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc + 8), _MM_HINT_T0); #endif // clear scratch registers. ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. // This loop is processing D_MR x K ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_pd(tA); _mm256_storeu_pd(tA_packed, ymm3); // the packing of matrix A // ymm4 += ymm0 * ymm3; ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); // ymm8 += ymm1 * ymm3; ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); // ymm12 += ymm2 * ymm3; ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 4); _mm256_storeu_pd(tA_packed + 4, ymm3); // the packing of matrix A // ymm5 += ymm0 * ymm3; ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); // ymm9 += ymm1 * ymm3; ymm9 = _mm256_fmadd_pd(ymm1, ymm3, ymm9); // ymm13 += ymm2 * ymm3; ymm13 = _mm256_fmadd_pd(ymm2, ymm3, ymm13); ymm3 = _mm256_loadu_pd(tA + 8); _mm256_storeu_pd(tA_packed + 8, ymm3); // the packing of matrix A // ymm6 += ymm0 * ymm3; ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); // ymm10 += ymm1 * ymm3; ymm10 = _mm256_fmadd_pd(ymm1, ymm3, ymm10); // ymm14 += ymm2 * ymm3; ymm14 = _mm256_fmadd_pd(ymm2, ymm3, ymm14); ymm3 = _mm256_loadu_pd(tA + 12); _mm256_storeu_pd(tA_packed + 12, ymm3); // the packing of matrix A // ymm7 += ymm0 * ymm3; ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); // ymm11 += ymm1 * ymm3; ymm11 = _mm256_fmadd_pd(ymm1, ymm3, ymm11); // ymm15 += ymm2 * ymm3; ymm15 = _mm256_fmadd_pd(ymm2, ymm3, ymm15); tA += lda; tA_packed += D_MR; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm6 = _mm256_mul_pd(ymm6, ymm0); ymm7 = _mm256_mul_pd(ymm7, ymm0); ymm8 = _mm256_mul_pd(ymm8, ymm0); ymm9 = _mm256_mul_pd(ymm9, ymm0); ymm10 = _mm256_mul_pd(ymm10, ymm0); ymm11 = _mm256_mul_pd(ymm11, ymm0); ymm12 = _mm256_mul_pd(ymm12, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); ymm15 = _mm256_mul_pd(ymm15, ymm0); // multiply C by beta and accumulate col 1. 
/*ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_pd(tC + 4); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); ymm2 = _mm256_loadu_pd(tC + 8); ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_pd(tC + 12); ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7);*/ _mm256_storeu_pd(tC, ymm4); _mm256_storeu_pd(tC + 4, ymm5); _mm256_storeu_pd(tC + 8, ymm6); _mm256_storeu_pd(tC + 12, ymm7); // multiply C by beta and accumulate, col 2. tC += ldc; /*ymm2 = _mm256_loadu_pd(tC); ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_pd(tC + 4); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_pd(tC + 8); ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); ymm2 = _mm256_loadu_pd(tC + 12); ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11);*/ _mm256_storeu_pd(tC, ymm8); _mm256_storeu_pd(tC + 4, ymm9); _mm256_storeu_pd(tC + 8, ymm10); _mm256_storeu_pd(tC + 12, ymm11); // multiply C by beta and accumulate, col 3. tC += ldc; /*ymm2 = _mm256_loadu_pd(tC); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_pd(tC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(tC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_pd(tC + 12); ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15);*/ _mm256_storeu_pd(tC, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); _mm256_storeu_pd(tC + 12, ymm15); // modify the pointer arithematic to use packed A matrix. col_idx_start = NR; tA_packed = D_A_pack; row_idx_packed = 0; lda_packed = D_MR; } // Process NR columns of C matrix at a time. 
for (col_idx = col_idx_start; (col_idx + (NR - 1)) < N; col_idx += NR) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = tA_packed + row_idx_packed; #if 0//def BLIS_ENABLE_PREFETCH _mm_prefetch((char*)(tC + 0), _MM_HINT_T0); _mm_prefetch((char*)(tC + 8), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + ldc + 8), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc), _MM_HINT_T0); _mm_prefetch((char*)(tC + 2 * ldc + 8), _MM_HINT_T0); #endif // clear scratch registers. ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. // This loop is processing D_MR x K ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_pd(tA); // ymm4 += ymm0 * ymm3; ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); // ymm8 += ymm1 * ymm3; ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); // ymm12 += ymm2 * ymm3; ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 4); // ymm5 += ymm0 * ymm3; ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); // ymm9 += ymm1 * ymm3; ymm9 = _mm256_fmadd_pd(ymm1, ymm3, ymm9); // ymm13 += ymm2 * ymm3; ymm13 = _mm256_fmadd_pd(ymm2, ymm3, ymm13); ymm3 = _mm256_loadu_pd(tA + 8); // ymm6 += ymm0 * ymm3; ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); // ymm10 += ymm1 * ymm3; ymm10 = _mm256_fmadd_pd(ymm1, ymm3, ymm10); // ymm14 += ymm2 * ymm3; ymm14 = _mm256_fmadd_pd(ymm2, ymm3, ymm14); ymm3 = _mm256_loadu_pd(tA + 12); // ymm7 += ymm0 * ymm3; ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); // ymm11 += ymm1 * ymm3; ymm11 = _mm256_fmadd_pd(ymm1, ymm3, ymm11); // ymm15 += ymm2 * ymm3; ymm15 = _mm256_fmadd_pd(ymm2, ymm3, ymm15); tA += lda_packed; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm6 = _mm256_mul_pd(ymm6, ymm0); ymm7 = _mm256_mul_pd(ymm7, ymm0); ymm8 = _mm256_mul_pd(ymm8, ymm0); ymm9 = _mm256_mul_pd(ymm9, ymm0); ymm10 = _mm256_mul_pd(ymm10, ymm0); ymm11 = _mm256_mul_pd(ymm11, ymm0); ymm12 = _mm256_mul_pd(ymm12, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); ymm15 = _mm256_mul_pd(ymm15, ymm0); // multiply C by beta and accumulate col 1. 
/*ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_pd(tC + 4); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); ymm2 = _mm256_loadu_pd(tC + 8); ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_pd(tC + 12); ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7);*/ _mm256_storeu_pd(tC, ymm4); _mm256_storeu_pd(tC + 4, ymm5); _mm256_storeu_pd(tC + 8, ymm6); _mm256_storeu_pd(tC + 12, ymm7); // multiply C by beta and accumulate, col 2. tC += ldc; /*ymm2 = _mm256_loadu_pd(tC); ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_pd(tC + 4); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_pd(tC + 8); ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); ymm2 = _mm256_loadu_pd(tC + 12); ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11);*/ _mm256_storeu_pd(tC, ymm8); _mm256_storeu_pd(tC + 4, ymm9); _mm256_storeu_pd(tC + 8, ymm10); _mm256_storeu_pd(tC + 12, ymm11); // multiply C by beta and accumulate, col 3. tC += ldc; /*ymm2 = _mm256_loadu_pd(tC); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_pd(tC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(tC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_pd(tC + 12); ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15);*/ _mm256_storeu_pd(tC, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); _mm256_storeu_pd(tC + 12, ymm15); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. 
ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm8 = _mm256_fmadd_pd(ymm0, ymm3, ymm8); ymm12 = _mm256_fmadd_pd(ymm1, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 4); ymm9 = _mm256_fmadd_pd(ymm0, ymm3, ymm9); ymm13 = _mm256_fmadd_pd(ymm1, ymm3, ymm13); ymm3 = _mm256_loadu_pd(tA + 8); ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14); ymm3 = _mm256_loadu_pd(tA + 12); ymm11 = _mm256_fmadd_pd(ymm0, ymm3, ymm11); ymm15 = _mm256_fmadd_pd(ymm1, ymm3, ymm15); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm8 = _mm256_mul_pd(ymm8, ymm0); ymm9 = _mm256_mul_pd(ymm9, ymm0); ymm10 = _mm256_mul_pd(ymm10, ymm0); ymm11 = _mm256_mul_pd(ymm11, ymm0); ymm12 = _mm256_mul_pd(ymm12, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); ymm15 = _mm256_mul_pd(ymm15, ymm0); // multiply C by beta and accumulate, col 1. /*ymm2 = _mm256_loadu_pd(tC + 0); ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_pd(tC + 4); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_pd(tC + 8); ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); ymm2 = _mm256_loadu_pd(tC + 12); ymm11 = _mm256_fmadd_pd(ymm2, ymm1, ymm11);*/ _mm256_storeu_pd(tC + 0, ymm8); _mm256_storeu_pd(tC + 4, ymm9); _mm256_storeu_pd(tC + 8, ymm10); _mm256_storeu_pd(tC + 12, ymm11); // multiply C by beta and accumulate, col 2. 
tC += ldc; /*ymm2 = _mm256_loadu_pd(tC); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_pd(tC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(tC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_pd(tC + 12); ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15);*/ _mm256_storeu_pd(tC, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); _mm256_storeu_pd(tC + 12, ymm15); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm12 = _mm256_fmadd_pd(ymm0, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 4); ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); ymm3 = _mm256_loadu_pd(tA + 8); ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); ymm3 = _mm256_loadu_pd(tA + 12); ymm15 = _mm256_fmadd_pd(ymm0, ymm3, ymm15); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm12 = _mm256_mul_pd(ymm12, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); ymm15 = _mm256_mul_pd(ymm15, ymm0); // multiply C by beta and accumulate. 
/*ymm2 = _mm256_loadu_pd(tC + 0); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_pd(tC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(tC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); ymm2 = _mm256_loadu_pd(tC + 12); ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15);*/ _mm256_storeu_pd(tC + 0, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); _mm256_storeu_pd(tC + 12, ymm15); } } m_remainder = M - row_idx; if (m_remainder >= 12) { m_remainder -= 12; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_pd(tA); // ymm4 += ymm0 * ymm3; ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); // ymm8 += ymm1 * ymm3; ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); // ymm12 += ymm2 * ymm3; ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 4); // ymm5 += ymm0 * ymm3; ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); // ymm9 += ymm1 * ymm3; ymm9 = _mm256_fmadd_pd(ymm1, ymm3, ymm9); // ymm13 += ymm2 * ymm3; ymm13 = _mm256_fmadd_pd(ymm2, ymm3, ymm13); ymm3 = _mm256_loadu_pd(tA + 8); // ymm6 += ymm0 * ymm3; ymm6 = _mm256_fmadd_pd(ymm0, ymm3, ymm6); // ymm10 += ymm1 * ymm3; ymm10 = _mm256_fmadd_pd(ymm1, ymm3, ymm10); // ymm14 += ymm2 * ymm3; ymm14 = _mm256_fmadd_pd(ymm2, ymm3, ymm14); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm6 = _mm256_mul_pd(ymm6, ymm0); ymm8 = _mm256_mul_pd(ymm8, ymm0); ymm9 = _mm256_mul_pd(ymm9, ymm0); ymm10 = _mm256_mul_pd(ymm10, ymm0); ymm12 = _mm256_mul_pd(ymm12, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); // multiply C by beta and accumulate. /*ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_pd(tC + 4); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5); ymm2 = _mm256_loadu_pd(tC + 8); ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6);*/ _mm256_storeu_pd(tC, ymm4); _mm256_storeu_pd(tC + 4, ymm5); _mm256_storeu_pd(tC + 8, ymm6); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_pd(tC); ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_pd(tC + 4); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_pd(tC + 8); ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10);*/ _mm256_storeu_pd(tC, ymm8); _mm256_storeu_pd(tC + 4, ymm9); _mm256_storeu_pd(tC + 8, ymm10); // multiply C by beta and accumulate. 
tC += ldc; /*ymm2 = _mm256_loadu_pd(tC); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_pd(tC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(tC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);*/ _mm256_storeu_pd(tC, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm8 = _mm256_fmadd_pd(ymm0, ymm3, ymm8); ymm12 = _mm256_fmadd_pd(ymm1, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 4); ymm9 = _mm256_fmadd_pd(ymm0, ymm3, ymm9); ymm13 = _mm256_fmadd_pd(ymm1, ymm3, ymm13); ymm3 = _mm256_loadu_pd(tA + 8); ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm8 = _mm256_mul_pd(ymm8, ymm0); ymm9 = _mm256_mul_pd(ymm9, ymm0); ymm10 = _mm256_mul_pd(ymm10, ymm0); ymm12 = _mm256_mul_pd(ymm12, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); // multiply C by beta and accumulate. 
/*ymm2 = _mm256_loadu_pd(tC + 0); ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_pd(tC + 4); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9); ymm2 = _mm256_loadu_pd(tC + 8); ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10);*/ _mm256_storeu_pd(tC + 0, ymm8); _mm256_storeu_pd(tC + 4, ymm9); _mm256_storeu_pd(tC + 8, ymm10); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_pd(tC); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_pd(tC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(tC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);*/ _mm256_storeu_pd(tC, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm12 = _mm256_fmadd_pd(ymm0, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 4); ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); ymm3 = _mm256_loadu_pd(tA + 8); ymm14 = _mm256_fmadd_pd(ymm0, ymm3, ymm14); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm12 = _mm256_mul_pd(ymm12, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); // multiply C by beta and accumulate. 
/*ymm2 = _mm256_loadu_pd(tC + 0); ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); ymm2 = _mm256_loadu_pd(tC + 4); ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); ymm2 = _mm256_loadu_pd(tC + 8); ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14);*/ _mm256_storeu_pd(tC + 0, ymm12); _mm256_storeu_pd(tC + 4, ymm13); _mm256_storeu_pd(tC + 8, ymm14); } row_idx += 12; } if (m_remainder >= 8) { m_remainder -= 8; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm6 = _mm256_fmadd_pd(ymm1, ymm3, ymm6); ymm8 = _mm256_fmadd_pd(ymm2, ymm3, ymm8); ymm3 = _mm256_loadu_pd(tA + 4); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm6 = _mm256_mul_pd(ymm6, ymm0); ymm7 = _mm256_mul_pd(ymm7, ymm0); ymm8 = _mm256_mul_pd(ymm8, ymm0); ymm9 = _mm256_mul_pd(ymm9, ymm0); // multiply C by beta and accumulate. 
/*ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_pd(tC + 4); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/ _mm256_storeu_pd(tC, ymm4); _mm256_storeu_pd(tC + 4, ymm5); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_pd(tC); ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_pd(tC + 4); ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7);*/ _mm256_storeu_pd(tC, ymm6); _mm256_storeu_pd(tC + 4, ymm7); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_pd(tC); ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); ymm2 = _mm256_loadu_pd(tC + 4); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);*/ _mm256_storeu_pd(tC, ymm8); _mm256_storeu_pd(tC + 4, ymm9); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm6 = _mm256_fmadd_pd(ymm1, ymm3, ymm6); ymm3 = _mm256_loadu_pd(tA + 4); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm6 = _mm256_mul_pd(ymm6, ymm0); ymm7 = _mm256_mul_pd(ymm7, ymm0); // multiply C by beta and accumulate. 
/*ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_pd(tC + 4); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/ _mm256_storeu_pd(tC, ymm4); _mm256_storeu_pd(tC + 4, ymm5); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_pd(tC); ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); ymm2 = _mm256_loadu_pd(tC + 4); ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7);*/ _mm256_storeu_pd(tC, ymm6); _mm256_storeu_pd(tC + 4, ymm7); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm3 = _mm256_loadu_pd(tA + 4); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); // multiply C by beta and accumulate. /*ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); ymm2 = _mm256_loadu_pd(tC + 4); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/ _mm256_storeu_pd(tC, ymm4); _mm256_storeu_pd(tC + 4, ymm5); } row_idx += 8; } if (m_remainder >= 4) { m_remainder -= 4; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. 
ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); ymm6 = _mm256_fmadd_pd(ymm2, ymm3, ymm6); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm6 = _mm256_mul_pd(ymm6, ymm0); // multiply C by beta and accumulate. /*ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4);*/ _mm256_storeu_pd(tC, ymm4); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_pd(tC); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/ _mm256_storeu_pd(tC, ymm5); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_pd(tC); ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6);*/ _mm256_storeu_pd(tC, ymm6); } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. ymm4 = _mm256_mul_pd(ymm4, ymm0); ymm5 = _mm256_mul_pd(ymm5, ymm0); // multiply C by beta and accumulate. /*ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4);*/ _mm256_storeu_pd(tC, ymm4); // multiply C by beta and accumulate. tC += ldc; /*ymm2 = _mm256_loadu_pd(tC); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/ _mm256_storeu_pd(tC, ymm5); col_idx += 2; } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm4 = _mm256_setzero_pd(); for (k = 0; k < K; ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); ymm4 = _mm256_mul_pd(ymm4, ymm0); // multiply C by beta and accumulate. /*ymm2 = _mm256_loadu_pd(tC); ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4);*/ _mm256_storeu_pd(tC, ymm4); } row_idx += 4; } // M is not a multiple of 32. // The handling of edge case where the remainder // dimension is less than 8. The padding takes place // to handle this case. if ((m_remainder) && (lda > 3)) { double f_temp[8]; for (col_idx = 0; (col_idx + 2) < N; col_idx += 3) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; // clear scratch registers. 
ymm5 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); for (k = 0; k < (K - 1); ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); tB += tb_inc_row; //broadcasted matrix B elements are multiplied //with matrix A columns. ymm3 = _mm256_loadu_pd(tA); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); tA += lda; } // alpha, beta multiplication. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); ymm2 = _mm256_broadcast_sd(tB + tb_inc_col * 2); tB += tb_inc_row; for (int i = 0; i < m_remainder; i++) { f_temp[i] = tA[i]; } ymm3 = _mm256_loadu_pd(f_temp); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); //multiply A*B by alpha. 
ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm7 = _mm256_mul_pd(ymm7, ymm0); ymm9 = _mm256_mul_pd(ymm9, ymm0); /*for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_pd(f_temp); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/ _mm256_storeu_pd(f_temp, ymm5); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } tC += ldc; /*for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_pd(f_temp); ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7);*/ _mm256_storeu_pd(f_temp, ymm7); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } tC += ldc; /*for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_pd(f_temp); ymm9 = _mm256_fmadd_pd(ymm2, ymm1, ymm9);*/ _mm256_storeu_pd(f_temp, ymm9); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } } n_remainder = N - col_idx; // if the N is not multiple of 3. // handling edge case. if (n_remainder == 2) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm5 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); for (k = 0; k < (K - 1); ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. 
ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); tB += tb_inc_row; ymm3 = _mm256_loadu_pd(tA); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); tA += lda; } ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); ymm1 = _mm256_broadcast_sd(tB + tb_inc_col * 1); tB += tb_inc_row; for (int i = 0; i < m_remainder; i++) { f_temp[i] = tA[i]; } ymm3 = _mm256_loadu_pd(f_temp); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); ymm7 = _mm256_fmadd_pd(ymm1, ymm3, ymm7); ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); ymm5 = _mm256_mul_pd(ymm5, ymm0); ymm7 = _mm256_mul_pd(ymm7, ymm0); /*for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_pd(f_temp); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/ _mm256_storeu_pd(f_temp, ymm5); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } tC += ldc; /*for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_pd(f_temp); ymm7 = _mm256_fmadd_pd(ymm2, ymm1, ymm7);*/ _mm256_storeu_pd(f_temp, ymm7); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } } // if the N is not multiple of 3. // handling edge case. if (n_remainder == 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; ymm5 = _mm256_setzero_pd(); for (k = 0; k < (K - 1); ++k) { // The inner loop broadcasts the B matrix data and // multiplies it with the A matrix. ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); tB += tb_inc_row; ymm3 = _mm256_loadu_pd(tA); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); tA += lda; } ymm0 = _mm256_broadcast_sd(tB + tb_inc_col * 0); tB += tb_inc_row; for (int i = 0; i < m_remainder; i++) { f_temp[i] = tA[i]; } ymm3 = _mm256_loadu_pd(f_temp); ymm5 = _mm256_fmadd_pd(ymm0, ymm3, ymm5); ymm0 = _mm256_broadcast_sd(alpha_cast); //ymm1 = _mm256_broadcast_sd(beta_cast); // multiply C by beta and accumulate. 
ymm5 = _mm256_mul_pd(ymm5, ymm0); /*for (int i = 0; i < m_remainder; i++) { f_temp[i] = tC[i]; } ymm2 = _mm256_loadu_pd(f_temp); ymm5 = _mm256_fmadd_pd(ymm2, ymm1, ymm5);*/ _mm256_storeu_pd(f_temp, ymm5); for (int i = 0; i < m_remainder; i++) { tC[i] = f_temp[i]; } } m_remainder = 0; } if (m_remainder) { double result; for (; row_idx < M; row_idx += 1) { for (col_idx = 0; col_idx < N; col_idx += 1) { //pointer math to point to proper memory tC = C + ldc * col_idx + row_idx; tB = B + tb_inc_col * col_idx; tA = A + row_idx; result = 0; for (k = 0; k < K; ++k) { result += (*tA) * (*tB); tA += lda; tB += tb_inc_row; } result *= (*alpha_cast); (*tC) = /*(*tC) * (*beta_cast) + */result; } } } //copy/compute sryk values back to C using SIMD if ( bli_seq0( *beta_cast ) ) {//just copy for beta = 0 dim_t _i, _j, k, _l; if(bli_obj_is_lower(c)) //c is lower { //first column _j = 0; k = M >> 2; _i = 0; for ( _l = 0; _l < k; _l++ ) { ymm0 = _mm256_loadu_pd((C + _i*rsc)); _mm256_storeu_pd((matCbuf + _i*rs_matC), ymm0); _i += 4; } while (_i < M ) { bli_ddcopys( *(C + _i*rsc + _j*ldc), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); _i++; } _j++; while ( _j < N ) //next column { //k = (_j + (4 - (_j & 3))); _l = _j & 3; k = (_l != 0) ? (_j + (4 - _l)) : _j; k = (k <= M) ? 
k : M; for ( _i = _j; _i < k; ++_i ) { bli_ddcopys( *(C + _i*rsc + _j*ldc), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); } k = (M - _i) >> 2; _l = 0; while ( _l < k ) { ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); _i += 4; _l++; } while (_i < M ) { bli_ddcopys( *(C + _i*rsc + _j*ldc), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); _i++; } _j++; } } else //c is upper { for ( _j = 0; _j < N; ++_j ) { k = (_j + 1) >> 2; _i = 0; _l = 0; while ( _l < k ) { ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); _i += 4; _l++; } while (_i <= _j ) { bli_ddcopys( *(C + _i*rsc + _j*ldc), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); ++_i; } } } } else {//when beta is non-zero, fmadd and store the results dim_t _i, _j, k, _l; ymm1 = _mm256_broadcast_sd(beta_cast); if(bli_obj_is_lower(c)) //c is lower { //first column _j = 0; k = M >> 2; _i = 0; for ( _l = 0; _l < k; _l++ ) { ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC)); ymm0 = _mm256_loadu_pd((C + _i*rsc)); ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0); _mm256_storeu_pd((matCbuf + _i*rs_matC), ymm0); _i += 4; } while (_i < M ) { bli_dddxpbys( *(C + _i*rsc + _j*ldc), *(beta_cast), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); _i++; } _j++; while ( _j < N ) //next column { //k = (_j + (4 - (_j & 3))); _l = _j & 3; k = (_l != 0) ? (_j + (4 - _l)) : _j; k = (k <= M) ? 
k : M; for ( _i = _j; _i < k; ++_i ) { bli_dddxpbys( *(C + _i*rsc + _j*ldc), *(beta_cast), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); } k = (M - _i) >> 2; _l = 0; while ( _l < k ) { ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0); _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); _i += 4; _l++; } while (_i < M ) { bli_dddxpbys( *(C + _i*rsc + _j*ldc), *(beta_cast), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); _i++; } _j++; } } else //c is upper { for ( _j = 0; _j < N; ++_j ) { k = (_j + 1) >> 2; _i = 0; _l = 0; while ( _l < k ) { ymm2 = _mm256_loadu_pd((matCbuf + _i*rs_matC + _j*ldc_matC)); ymm0 = _mm256_loadu_pd((C + _i*rsc + _j*ldc)); ymm0 = _mm256_fmadd_pd(ymm2, ymm1, ymm0); _mm256_storeu_pd((matCbuf + _i*rs_matC + _j*ldc_matC), ymm0); _i += 4; _l++; } while (_i <= _j ) { bli_dddxpbys( *(C + _i*rsc + _j*ldc), *(beta_cast), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); ++_i; } } } } return BLIS_SUCCESS; } else return BLIS_NONCONFORMAL_DIMENSIONS; }; static err_t bli_sgemmt_small_atbn ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { int M = bli_obj_length(c); // number of rows of Matrix C int N = bli_obj_width(c); // number of columns of Matrix C int K = bli_obj_length(b); // number of rows of Matrix B int lda = bli_obj_col_stride(a); // column stride of matrix OP(A), where OP(A) is Transpose(A) if transA enabled. int ldb = bli_obj_col_stride(b); // column stride of matrix OP(B), where OP(B) is Transpose(B) if transB enabled. 
int ldc_matC = bli_obj_col_stride( c ); // column stride of matrix C int ldc = M;//bli_obj_col_stride( c ); // column stride of static buffer for matrix C int row_idx = 0, col_idx = 0, k; int rs_matC = bli_obj_row_stride( c ); int rsc = 1; float *A = a->buffer; // pointer to matrix A elements, stored in row major format float *B = b->buffer; // pointer to matrix B elements, stored in column major format float *C = C_pack; // pointer to matrix C elements, stored in column major format float *matCbuf = c->buffer; float *tA = A, *tB = B, *tC = C; __m256 ymm4, ymm5, ymm6, ymm7; __m256 ymm8, ymm9, ymm10, ymm11; __m256 ymm12, ymm13, ymm14, ymm15; __m256 ymm0, ymm1, ymm2, ymm3; float result, scratch[8]; float *alpha_cast, *beta_cast; // alpha, beta multiples alpha_cast = (alpha->buffer); beta_cast = (beta->buffer); // The non-copy version of the A^T GEMMT gives better performance for the small M cases. // The threshold is controlled by BLIS_ATBN_M_THRES if (M <= BLIS_ATBN_M_THRES) { for (col_idx = 0; (col_idx + (NR - 1)) < N; col_idx += NR) { for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR) { tA = A + row_idx * lda; tB = B + col_idx * ldb; tC = C + col_idx * ldc + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm11 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); ymm15 = _mm256_setzero_ps(); //The inner loop computes the 4x3 values of the matrix. 
//The computation pattern is: // ymm4 ymm5 ymm6 // ymm7 ymm8 ymm9 // ymm10 ymm11 ymm12 // ymm13 ymm14 ymm15 //The Dot operation is performed in the inner loop, 8 float elements fit //in the YMM register hence loop count incremented by 8 for (k = 0; (k + 7) < K; k += 8) { ymm0 = _mm256_loadu_ps(tB + 0); ymm1 = _mm256_loadu_ps(tB + ldb); ymm2 = _mm256_loadu_ps(tB + 2 * ldb); ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_ps(ymm1, ymm3, ymm5); ymm6 = _mm256_fmadd_ps(ymm2, ymm3, ymm6); ymm3 = _mm256_loadu_ps(tA + lda); ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); ymm3 = _mm256_loadu_ps(tA + 2 * lda); ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11); ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_ps(tA + 3 * lda); ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); ymm14 = _mm256_fmadd_ps(ymm1, ymm3, ymm14); ymm15 = _mm256_fmadd_ps(ymm2, ymm3, ymm15); tA += 8; tB += 8; } // if K is not a multiple of 8, padding is done before load using temproary array. 
if (k < K) { int iter; float data_feeder[8] = { 0.0 }; for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; ymm0 = _mm256_loadu_ps(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + ldb]; ymm1 = _mm256_loadu_ps(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + 2 * ldb]; ymm2 = _mm256_loadu_ps(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_ps(ymm1, ymm3, ymm5); ymm6 = _mm256_fmadd_ps(ymm2, ymm3, ymm6); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); ymm8 = _mm256_fmadd_ps(ymm1, ymm3, ymm8); ymm9 = _mm256_fmadd_ps(ymm2, ymm3, ymm9); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11); ymm12 = _mm256_fmadd_ps(ymm2, ymm3, ymm12); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); ymm14 = _mm256_fmadd_ps(ymm1, ymm3, ymm14); ymm15 = _mm256_fmadd_ps(ymm2, ymm3, ymm15); } //horizontal addition and storage of the data. 
//Results for 4x3 blocks of C is stored here ymm4 = _mm256_hadd_ps(ymm4, ymm4); ymm4 = _mm256_hadd_ps(ymm4, ymm4); _mm256_storeu_ps(scratch, ymm4); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[0] = result/* + tC[0] * (*beta_cast)*/; ymm7 = _mm256_hadd_ps(ymm7, ymm7); ymm7 = _mm256_hadd_ps(ymm7, ymm7); _mm256_storeu_ps(scratch, ymm7); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[1] = result/* + tC[1] * (*beta_cast)*/; ymm10 = _mm256_hadd_ps(ymm10, ymm10); ymm10 = _mm256_hadd_ps(ymm10, ymm10); _mm256_storeu_ps(scratch, ymm10); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[2] = result/* + tC[2] * (*beta_cast)*/; ymm13 = _mm256_hadd_ps(ymm13, ymm13); ymm13 = _mm256_hadd_ps(ymm13, ymm13); _mm256_storeu_ps(scratch, ymm13); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[3] = result/* + tC[3] * (*beta_cast)*/; tC += ldc; ymm5 = _mm256_hadd_ps(ymm5, ymm5); ymm5 = _mm256_hadd_ps(ymm5, ymm5); _mm256_storeu_ps(scratch, ymm5); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[0] = result/* + tC[0] * (*beta_cast)*/; ymm8 = _mm256_hadd_ps(ymm8, ymm8); ymm8 = _mm256_hadd_ps(ymm8, ymm8); _mm256_storeu_ps(scratch, ymm8); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[1] = result/* + tC[1] * (*beta_cast)*/; ymm11 = _mm256_hadd_ps(ymm11, ymm11); ymm11 = _mm256_hadd_ps(ymm11, ymm11); _mm256_storeu_ps(scratch, ymm11); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[2] = result/* + tC[2] * (*beta_cast)*/; ymm14 = _mm256_hadd_ps(ymm14, ymm14); ymm14 = _mm256_hadd_ps(ymm14, ymm14); _mm256_storeu_ps(scratch, ymm14); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[3] = result/* + tC[3] * (*beta_cast)*/; tC += ldc; ymm6 = _mm256_hadd_ps(ymm6, ymm6); ymm6 = _mm256_hadd_ps(ymm6, ymm6); _mm256_storeu_ps(scratch, ymm6); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[0] = result/* + tC[0] * (*beta_cast)*/; ymm9 = _mm256_hadd_ps(ymm9, ymm9); ymm9 = 
_mm256_hadd_ps(ymm9, ymm9); _mm256_storeu_ps(scratch, ymm9); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[1] = result/* + tC[1] * (*beta_cast)*/; ymm12 = _mm256_hadd_ps(ymm12, ymm12); ymm12 = _mm256_hadd_ps(ymm12, ymm12); _mm256_storeu_ps(scratch, ymm12); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[2] = result/* + tC[2] * (*beta_cast)*/; ymm15 = _mm256_hadd_ps(ymm15, ymm15); ymm15 = _mm256_hadd_ps(ymm15, ymm15); _mm256_storeu_ps(scratch, ymm15); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[3] = result/* + tC[3] * (*beta_cast)*/; } } int processed_col = col_idx; int processed_row = row_idx; //The edge case handling where N is not a multiple of 3 if (processed_col < N) { for (col_idx = processed_col; col_idx < N; col_idx += 1) { for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR) { tA = A + row_idx * lda; tB = B + col_idx * ldb; tC = C + col_idx * ldc + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); //The inner loop computes the 4x1 values of the matrix. //The computation pattern is: // ymm4 // ymm7 // ymm10 // ymm13 for (k = 0; (k + 7) < K; k += 8) { ymm0 = _mm256_loadu_ps(tB + 0); ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); ymm3 = _mm256_loadu_ps(tA + lda); ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); ymm3 = _mm256_loadu_ps(tA + 2 * lda); ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); ymm3 = _mm256_loadu_ps(tA + 3 * lda); ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); tA += 8; tB += 8; } // if K is not a multiple of 8, padding is done before load using temproary array. 
if (k < K) { int iter; float data_feeder[8] = { 0.0 }; for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; ymm0 = _mm256_loadu_ps(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm7 = _mm256_fmadd_ps(ymm0, ymm3, ymm7); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm13 = _mm256_fmadd_ps(ymm0, ymm3, ymm13); } //horizontal addition and storage of the data. //Results for 4x1 blocks of C is stored here ymm4 = _mm256_hadd_ps(ymm4, ymm4); ymm4 = _mm256_hadd_ps(ymm4, ymm4); _mm256_storeu_ps(scratch, ymm4); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[0] = result/* + tC[0] * (*beta_cast)*/; ymm7 = _mm256_hadd_ps(ymm7, ymm7); ymm7 = _mm256_hadd_ps(ymm7, ymm7); _mm256_storeu_ps(scratch, ymm7); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[1] = result/* + tC[1] * (*beta_cast)*/; ymm10 = _mm256_hadd_ps(ymm10, ymm10); ymm10 = _mm256_hadd_ps(ymm10, ymm10); _mm256_storeu_ps(scratch, ymm10); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[2] = result/* + tC[2] * (*beta_cast)*/; ymm13 = _mm256_hadd_ps(ymm13, ymm13); ymm13 = _mm256_hadd_ps(ymm13, ymm13); _mm256_storeu_ps(scratch, ymm13); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[3] = result/* + tC[3] * (*beta_cast)*/; } } processed_row = row_idx; } //The edge case handling where M is not a multiple of 4 if (processed_row < M) { for (row_idx = processed_row; row_idx < M; row_idx += 1) { for (col_idx = 0; col_idx < N; col_idx += 1) { tA = A + row_idx * lda; tB = B + col_idx * ldb; tC = C + col_idx * ldc 
+ row_idx; // clear scratch registers. ymm4 = _mm256_setzero_ps(); for (k = 0; (k + 7) < K; k += 8) { ymm0 = _mm256_loadu_ps(tB + 0); ymm3 = _mm256_loadu_ps(tA); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); tA += 8; tB += 8; } // if K is not a multiple of 8, padding is done before load using temproary array. if (k < K) { int iter; float data_feeder[8] = { 0.0 }; for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; ymm0 = _mm256_loadu_ps(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; ymm3 = _mm256_loadu_ps(data_feeder); ymm4 = _mm256_fmadd_ps(ymm0, ymm3, ymm4); } //horizontal addition and storage of the data. ymm4 = _mm256_hadd_ps(ymm4, ymm4); ymm4 = _mm256_hadd_ps(ymm4, ymm4); _mm256_storeu_ps(scratch, ymm4); result = scratch[0] + scratch[4]; result *= (*alpha_cast); tC[0] = result/* + tC[0] * (*beta_cast)*/; } } } //copy/compute sryk values back to C if ( bli_seq0( *beta_cast ) ) //when beta is 0, just copy result to C { dim_t _i, _j; if(bli_obj_is_lower(c)) //c is lower { for ( _j = 0; _j < N; ++_j ) for ( _i = 0; _i < M; ++_i ) if ( (doff_t)_j - (doff_t)_i <= 0 ) { bli_sscopys( *(C + _i*rsc + _j*ldc), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); } } else //c is upper { for ( _j = 0; _j < N; ++_j ) for ( _i = 0; _i < M; ++_i ) if ( (doff_t)_j - (doff_t)_i >= 0 ) { bli_sscopys( *(C + _i*rsc + _j*ldc), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); } } } else //when beta is non-zero, multiply and store result to C { dim_t _i, _j; if(bli_obj_is_lower(c)) //c is lower { for ( _j = 0; _j < N; ++_j ) for ( _i = 0; _i < M; ++_i ) if ( (doff_t)_j - (doff_t)_i <= 0 ) { bli_sssxpbys( *(C + _i*rsc + _j*ldc), *(beta_cast), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); } } else //c is upper { for ( _j = 0; _j < N; ++_j ) for ( _i = 0; _i < M; ++_i ) if ( (doff_t)_j - (doff_t)_i >= 0 ) { bli_sssxpbys( *(C + _i*rsc + _j*ldc), *(beta_cast), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); } } } return BLIS_SUCCESS; } else return 
BLIS_NONCONFORMAL_DIMENSIONS; } static err_t bli_dgemmt_small_atbn ( obj_t* alpha, obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx, cntl_t* cntl ) { int M = bli_obj_length( c ); // number of rows of Matrix C int N = bli_obj_width( c ); // number of columns of Matrix C int K = bli_obj_length( b ); // number of rows of Matrix B int lda = bli_obj_col_stride( a ); // column stride of matrix OP(A), where OP(A) is Transpose(A) if transA enabled. int ldb = bli_obj_col_stride( b ); // column stride of matrix OP(B), where OP(B) is Transpose(B) if transB enabled. int ldc_matC = bli_obj_col_stride( c ); // column stride of matrix C int ldc = M;//bli_obj_col_stride( c ); // column stride of static buffer for matrix C int row_idx = 0, col_idx = 0, k; int rs_matC = bli_obj_row_stride( c ); int rsc = 1; double *A = a->buffer; // pointer to matrix A elements, stored in row major format double *B = b->buffer; // pointer to matrix B elements, stored in column major format double *C = D_C_pack; // pointer to matrix C elements, stored in column major format double *matCbuf = c->buffer; double *tA = A, *tB = B, *tC = C; __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m256d ymm12, ymm13, ymm14, ymm15; __m256d ymm0, ymm1, ymm2, ymm3; double result, scratch[8]; double *alpha_cast, *beta_cast; // alpha, beta multiples alpha_cast = (alpha->buffer); beta_cast = (beta->buffer); // The non-copy version of the A^T GEMMT gives better performance for the small M cases. // The threshold is controlled by BLIS_ATBN_M_THRES if (M <= BLIS_ATBN_M_THRES) { for (col_idx = 0; (col_idx + (NR - 1)) < N; col_idx += NR) { for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR) { tA = A + row_idx * lda; tB = B + col_idx * ldb; tC = C + col_idx * ldc + row_idx; // clear scratch registers. 
ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); //The inner loop computes the 4x3 values of the matrix. //The computation pattern is: // ymm4 ymm5 ymm6 // ymm7 ymm8 ymm9 // ymm10 ymm11 ymm12 // ymm13 ymm14 ymm15 //The Dot operation is performed in the inner loop, 4 double elements fit //in the YMM register hence loop count incremented by 4 for (k = 0; (k + 3) < K; k += 4) { ymm0 = _mm256_loadu_pd(tB + 0); ymm1 = _mm256_loadu_pd(tB + ldb); ymm2 = _mm256_loadu_pd(tB + 2 * ldb); ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); ymm6 = _mm256_fmadd_pd(ymm2, ymm3, ymm6); ymm3 = _mm256_loadu_pd(tA + lda); ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); ymm3 = _mm256_loadu_pd(tA + 2 * lda); ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); ymm11 = _mm256_fmadd_pd(ymm1, ymm3, ymm11); ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); ymm3 = _mm256_loadu_pd(tA + 3 * lda); ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14); ymm15 = _mm256_fmadd_pd(ymm2, ymm3, ymm15); tA += 4; tB += 4; } // if K is not a multiple of 4, padding is done before load using temproary array. 
if (k < K) { int iter; double data_feeder[4] = { 0.0 }; for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; ymm0 = _mm256_loadu_pd(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + ldb]; ymm1 = _mm256_loadu_pd(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter + 2 * ldb]; ymm2 = _mm256_loadu_pd(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm5 = _mm256_fmadd_pd(ymm1, ymm3, ymm5); ymm6 = _mm256_fmadd_pd(ymm2, ymm3, ymm6); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); ymm8 = _mm256_fmadd_pd(ymm1, ymm3, ymm8); ymm9 = _mm256_fmadd_pd(ymm2, ymm3, ymm9); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); ymm11 = _mm256_fmadd_pd(ymm1, ymm3, ymm11); ymm12 = _mm256_fmadd_pd(ymm2, ymm3, ymm12); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); ymm14 = _mm256_fmadd_pd(ymm1, ymm3, ymm14); ymm15 = _mm256_fmadd_pd(ymm2, ymm3, ymm15); } //horizontal addition and storage of the data. 
//Results for 4x3 blocks of C is stored here ymm4 = _mm256_hadd_pd(ymm4, ymm4); _mm256_storeu_pd(scratch, ymm4); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[0] = result/* + tC[0] * (*beta_cast)*/; ymm7 = _mm256_hadd_pd(ymm7, ymm7); _mm256_storeu_pd(scratch, ymm7); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[1] = result/* + tC[1] * (*beta_cast)*/; ymm10 = _mm256_hadd_pd(ymm10, ymm10); _mm256_storeu_pd(scratch, ymm10); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[2] = result/* + tC[2] * (*beta_cast)*/; ymm13 = _mm256_hadd_pd(ymm13, ymm13); _mm256_storeu_pd(scratch, ymm13); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[3] = result/* + tC[3] * (*beta_cast)*/; tC += ldc; ymm5 = _mm256_hadd_pd(ymm5, ymm5); _mm256_storeu_pd(scratch, ymm5); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[0] = result/* + tC[0] * (*beta_cast)*/; ymm8 = _mm256_hadd_pd(ymm8, ymm8); _mm256_storeu_pd(scratch, ymm8); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[1] = result/* + tC[1] * (*beta_cast)*/; ymm11 = _mm256_hadd_pd(ymm11, ymm11); _mm256_storeu_pd(scratch, ymm11); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[2] = result/* + tC[2] * (*beta_cast)*/; ymm14 = _mm256_hadd_pd(ymm14, ymm14); _mm256_storeu_pd(scratch, ymm14); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[3] = result/* + tC[3] * (*beta_cast)*/; tC += ldc; ymm6 = _mm256_hadd_pd(ymm6, ymm6); _mm256_storeu_pd(scratch, ymm6); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[0] = result/* + tC[0] * (*beta_cast)*/; ymm9 = _mm256_hadd_pd(ymm9, ymm9); _mm256_storeu_pd(scratch, ymm9); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[1] = result/* + tC[1] * (*beta_cast)*/; ymm12 = _mm256_hadd_pd(ymm12, ymm12); _mm256_storeu_pd(scratch, ymm12); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[2] = result/* + tC[2] * (*beta_cast)*/; ymm15 = _mm256_hadd_pd(ymm15, ymm15); 
_mm256_storeu_pd(scratch, ymm15); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[3] = result/* + tC[3] * (*beta_cast)*/; } } int processed_col = col_idx; int processed_row = row_idx; //The edge case handling where N is not a multiple of 3 if (processed_col < N) { for (col_idx = processed_col; col_idx < N; col_idx += 1) { for (row_idx = 0; (row_idx + (AT_MR - 1)) < M; row_idx += AT_MR) { tA = A + row_idx * lda; tB = B + col_idx * ldb; tC = C + col_idx * ldc + row_idx; // clear scratch registers. ymm4 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); //The inner loop computes the 4x1 values of the matrix. //The computation pattern is: // ymm4 // ymm7 // ymm10 // ymm13 for (k = 0; (k + 3) < K; k += 4) { ymm0 = _mm256_loadu_pd(tB + 0); ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); ymm3 = _mm256_loadu_pd(tA + lda); ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); ymm3 = _mm256_loadu_pd(tA + 2 * lda); ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); ymm3 = _mm256_loadu_pd(tA + 3 * lda); ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); tA += 4; tB += 4; } // if K is not a multiple of 4, padding is done before load using temproary array. 
if (k < K) { int iter; double data_feeder[4] = { 0.0 }; for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; ymm0 = _mm256_loadu_pd(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[lda + iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm7 = _mm256_fmadd_pd(ymm0, ymm3, ymm7); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[2 * lda + iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm10 = _mm256_fmadd_pd(ymm0, ymm3, ymm10); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[3 * lda + iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm13 = _mm256_fmadd_pd(ymm0, ymm3, ymm13); } //horizontal addition and storage of the data. //Results for 4x1 blocks of C is stored here ymm4 = _mm256_hadd_pd(ymm4, ymm4); _mm256_storeu_pd(scratch, ymm4); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[0] = result/* + tC[0] * (*beta_cast)*/; ymm7 = _mm256_hadd_pd(ymm7, ymm7); _mm256_storeu_pd(scratch, ymm7); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[1] = result/* + tC[1] * (*beta_cast)*/; ymm10 = _mm256_hadd_pd(ymm10, ymm10); _mm256_storeu_pd(scratch, ymm10); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[2] = result/* + tC[2] * (*beta_cast)*/; ymm13 = _mm256_hadd_pd(ymm13, ymm13); _mm256_storeu_pd(scratch, ymm13); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[3] = result/* + tC[3] * (*beta_cast)*/; } } processed_row = row_idx; } // The edge case handling where M is not a multiple of 4 if (processed_row < M) { for (row_idx = processed_row; row_idx < M; row_idx += 1) { for (col_idx = 0; col_idx < N; col_idx += 1) { tA = A + row_idx * lda; tB = B + col_idx * ldb; tC = C + col_idx * ldc + row_idx; // clear scratch registers. 
ymm4 = _mm256_setzero_pd(); for (k = 0; (k + 3) < K; k += 4) { ymm0 = _mm256_loadu_pd(tB + 0); ymm3 = _mm256_loadu_pd(tA); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); tA += 4; tB += 4; } // if K is not a multiple of 4, padding is done before load using temproary array. if (k < K) { int iter; double data_feeder[4] = { 0.0 }; for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tB[iter]; ymm0 = _mm256_loadu_pd(data_feeder); for (iter = 0; iter < (K - k); iter++) data_feeder[iter] = tA[iter]; ymm3 = _mm256_loadu_pd(data_feeder); ymm4 = _mm256_fmadd_pd(ymm0, ymm3, ymm4); } //horizontal addition and storage of the data. ymm4 = _mm256_hadd_pd(ymm4, ymm4); _mm256_storeu_pd(scratch, ymm4); result = scratch[0] + scratch[2]; result *= (*alpha_cast); tC[0] = result/* + tC[0] * (*beta_cast)*/; } } } //copy/compute sryk values back to C if ( bli_seq0( *beta_cast ) ) //when beta is 0, just copy result to C { dim_t _i, _j; if(bli_obj_is_lower(c)) //c is lower { for ( _j = 0; _j < N; ++_j ) for ( _i = 0; _i < M; ++_i ) if ( (doff_t)_j - (doff_t)_i <= 0 ) { bli_ddcopys( *(C + _i*rsc + _j*ldc), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); } } else //c is upper { for ( _j = 0; _j < N; ++_j ) for ( _i = 0; _i < M; ++_i ) if ( (doff_t)_j - (doff_t)_i >= 0 ) { bli_ddcopys( *(C + _i*rsc + _j*ldc), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); } } } else //when beta is non-zero, multiply and store result to C { dim_t _i, _j; if(bli_obj_is_lower(c)) //c is lower { for ( _j = 0; _j < N; ++_j ) for ( _i = 0; _i < M; ++_i ) if ( (doff_t)_j - (doff_t)_i <= 0 ) { bli_dddxpbys( *(C + _i*rsc + _j*ldc), *(beta_cast), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); } } else //c is upper { for ( _j = 0; _j < N; ++_j ) for ( _i = 0; _i < M; ++_i ) if ( (doff_t)_j - (doff_t)_i >= 0 ) { bli_dddxpbys( *(C + _i*rsc + _j*ldc), *(beta_cast), *(matCbuf + _i*rs_matC + _j*ldc_matC) ); } } } return BLIS_SUCCESS; } else return BLIS_NONCONFORMAL_DIMENSIONS; } #endif 
cython-blis-0.9.1/blis/_src/kernels/zen/3/bli_trsm_small.c000066400000000000000000065012561427272030600234250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2018-2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name of The University of Texas at Austin nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "blis.h"
#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
#include "immintrin.h"

/* Build-time switches for the small-matrix TRSM kernels in this file. */
#define GEMM_BLK_V1 8           //Block size to perform gemm and apply trsm
#define GEMM_ACCUM_A 1          //Perform B1=B1-(B0*A0) operation instead of B1'=(B0*A0) and then B1=B1-B1'
#define OPT_CACHE_BLOCKING_L1 1 //Perform trsm block-wise in blocks of GEMM_BLK_V1 instead of all columns of B together.
#define REARRANGE_SHFL 0        //Rearrange operations using blend or shuffle

// NOTE(review): the use of the four *_SP constants is not visible in this
// section; presumably size thresholds for the single-precision paths -- confirm.
#define BLI_AlXB_M_SP 16
#define BLI_XAltB_N_SP 128
#define BLI_AutXB_M_SP 64
#define BLI_AutXB_N_SP 128

/*
 * Forward declarations of the static, object-based double-precision kernels.
 * bli_trsm_small() selects one of them from (side, uplo, trans, unit-diag).
 */

// XA = B; A is lower-triangular; No transpose; double precision; non-unit diagonal
static err_t bli_dtrsm_small_XAlB(
    side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl );

//XA = B; A is lower triangular; No transpose; double precision; unit-diagonal
static err_t bli_dtrsm_small_XAlB_unitDiag(
    side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl );

//XA = B; A is lower-triangular; A is transposed; double precision; non-unit-diagonal
static err_t bli_dtrsm_small_XAltB(
    side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl );

//XA = B; A is lower-triangular; A is transposed; double precision; unit-diagonal
static err_t bli_dtrsm_small_XAltB_unitDiag(
    side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl );

// XA = B; A is upper triangular; No transpose; double precision; non-unit diagonal
static err_t bli_dtrsm_small_XAuB (
    side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl );

//XA = B; A is upper triangular; No transpose; double precision; unit-diagonal
static err_t bli_dtrsm_small_XAuB_unitDiag(
    side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl );

//XA = B; A is upper-triangular; A is transposed; double precision; non-unit diagonal
static err_t bli_dtrsm_small_XAutB(
    side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl );

//XA = B; A is upper-triangular; A is transposed; double precision; unit diagonal
static err_t bli_dtrsm_small_XAutB_unitDiag(
    side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl );

//AX = B; A is lower triangular; No transpose; double precision; non-unit diagonal
static err_t bli_dtrsm_small_AlXB(
    side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl );

//AX = B; A is lower triangular; No transpose; double precision; unit diagonal
static err_t bli_dtrsm_small_AlXB_unitDiag(
    side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl );

/*
 * Single-precision micro-kernel declarations. All take raw pointers to the
 * triangular block (ptr_l) and the right-hand-side block (ptr_b) plus their
 * row/column strides. The "_alpha" variants take an extra scalar and the
 * "_unitDiag" suffix presumably marks the unit-diagonal versions (inferred
 * from the names; the definitions are not visible in this section).
 */

// Function pointer with the same signature as blis_strsm_microkernel().
// NOTE(review): where it is assigned is not visible here -- confirm.
static void (*fp_blis_strsm_microkernel)(
    float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b,
    int rs_l, int rs_b, int cs_l, int cs_b );

static void blis_strsm_microkernel(
    float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b,
    int rs_l, int rs_b, int cs_l, int cs_b );

static void blis_strsm_microkernel_alpha(
    float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b,
    int rs_l, int rs_b, int cs_l, int cs_b, float alphaVal );

static void blis_strsm_microkernel_unitDiag(
    float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b,
    int rs_l, int rs_b, int cs_l, int cs_b );

static void blis_strsm_microkernel_alpha_unitDiag(
    float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b,
    int rs_l, int rs_b, int cs_l, int cs_b, float alphaVal );

static void trsm_XAtB_block_allSmallSizedMatrices(float *ptr_l, float *ptr_b,
    int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b);

static void trsm_XAtB_block_allSmallSizedMatrices_alpha(float *ptr_l, float *ptr_b,
    int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alphaVal);

static void trsm_XAtB_block_allSmallSizedMatrices_unitDiag(float *ptr_l, float *ptr_b,
    int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b);

static void trsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, float *ptr_b,
    int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alphaVal);

/* Double-precision counterparts of the micro-kernel declarations above. */

static void blis_dtrsm_microkernel(
    double *ptr_l, double *ptr_b, int numRows_lb, int numCols_b,
    int rs_l, int rs_b, int cs_l, int cs_b );

static void blis_dtrsm_microkernel_alpha(
    double *ptr_l, double *ptr_b, int numRows_lb, int numCols_b,
    int rs_l, int rs_b, int cs_l, int cs_b, double alphaVal );

static void blis_dtrsm_microkernel_unitDiag(
    double *ptr_l, double *ptr_b, int numRows_lb, int numCols_b,
    int rs_l, int rs_b, int cs_l, int cs_b );

static void blis_dtrsm_microkernel_alpha_unitDiag(
    double *ptr_l, double *ptr_b, int numRows_lb, int numCols_b,
    int rs_l, int rs_b, int cs_l, int cs_b, double alphaVal );

static void dtrsm_XAtB_block_allSmallSizedMatrices(double *ptr_l, double *ptr_b,
    int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b);

static void dtrsm_XAtB_block_allSmallSizedMatrices_alpha(double *ptr_l, double *ptr_b,
    int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, double alphaVal);

static void dtrsm_XAtB_block_allSmallSizedMatrices_unitDiag(double *ptr_l, double *ptr_b,
    int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b);

static void dtrsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag(double *ptr_l, double *ptr_b,
    int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, double alphaVal);

/* Single-precision block kernels for the A'X = B (upper, transposed) case. */

static void trsm_AutXB_block_allSmallSizedMatrices(float *ptr_l, float *ptr_b,
    int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b);

static void trsm_AutXB_block_allSmallSizedMatrices_alpha(float *ptr_l, float *ptr_b,
    int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha);

static void trsm_AutXB_block_allSmallSizedMatrices_unitDiag(float *ptr_l, float *ptr_b,
    int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b);

static void trsm_AutXB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, float *ptr_b,
    int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha);

//AX = B; A is lower triangular; No transpose; single
precision static err_t bli_strsm_small_AlXB ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); //A.'X = B; A is upper triangular; A has to be transposed; single precision static err_t bli_strsm_small_AutXB ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); //XA.' = B; A is lower triangular; A has to be transposed; single precision static err_t bli_strsm_small_XAltB ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); //A.'X = B; A is upper triangular; A has to be transposed; double precision static err_t bli_dtrsm_small_AutXB ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ); /* * The bli_trsm_small implements unpacked version of TRSM * Currently only column-major is supported, A & B are column-major * Input: A: MxM (triangular matrix) * B: MxN matrix * Output: X: MxN matrix such that AX = alpha*B or XA = alpha*B or A'X = alpha*B or XA' = alpha*B * Here the output X is stored in B * The custom-kernel will be called only when M*(M+N)* sizeof(Matrix Elements) < L3 cache */ err_t bli_trsm_small ( side_t side, obj_t* alpha, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ) { #ifdef BLIS_ENABLE_MULTITHREADING return BLIS_NOT_YET_IMPLEMENTED; #endif dim_t m = bli_obj_length(b); dim_t n = bli_obj_width(b); if(!(m && n)) return BLIS_SUCCESS; // If alpha is zero, B matrix will become zero after scaling & hence solution is also zero matrix if (bli_obj_equals(alpha, &BLIS_ZERO)) { return BLIS_NOT_YET_IMPLEMENTED; // scale B by alpha } // We have to call matrix scaling if alpha != 1.0 // if row major format return. Check this again. if ((bli_obj_row_stride(a) != 1) || (bli_obj_row_stride(b) != 1)) { return BLIS_INVALID_ROW_STRIDE; } num_t dt = ((*b).info & (0x7 << 0)); // only float and double datatypes are supported as of now. 
if (dt != BLIS_DOUBLE && dt != BLIS_FLOAT) { return BLIS_EXPECTED_REAL_DATATYPE; } // A is expected to be triangular in trsm if (!bli_obj_is_upper_or_lower (a)) { return BLIS_EXPECTED_TRIANGULAR_OBJECT; } // can use other control structs - even can use array of function pointers, // indexed by a number with bits formed by f('side', 'uplo', 'transa', dt). // In the below implementation, based on the number of finally implemented // cases, can move the checks with more cases higher up. if(side == BLIS_LEFT) { if(bli_obj_has_trans(a)) { if(dt == BLIS_DOUBLE) { if(bli_obj_is_upper(a)) { //return bli_dtrsm_small_AutXB(side, alpha, a, b, cntx, cntl); return BLIS_NOT_YET_IMPLEMENTED; } else { //return bli_dtrsm_small_AltXB(side, alpha, a, b, cntx, cntl); return BLIS_NOT_YET_IMPLEMENTED; } } else { if(bli_obj_is_upper(a)) { return bli_strsm_small_AutXB(side, alpha, a, b, cntx, cntl); } else { //return bli_strsm_small_AltXB(side, alpha, a, b, cntx, cntl); return BLIS_NOT_YET_IMPLEMENTED; } } } else { if(dt == BLIS_DOUBLE) { if(bli_obj_is_upper(a)) { //return bli_dtrsm_small_AuXB(side, alpha, a, b, cntx, cntl); return BLIS_NOT_YET_IMPLEMENTED; } else { if(bli_obj_has_unit_diag(a)) return bli_dtrsm_small_AlXB_unitDiag(side, alpha, a, b, cntx, cntl); else return bli_dtrsm_small_AlXB(side, alpha, a, b, cntx, cntl); } } else { if(bli_obj_is_upper(a)) { //return bli_strsm_small_AuXB(side, alpha, a, b, cntx, cntl); return BLIS_NOT_YET_IMPLEMENTED; } else { return bli_strsm_small_AlXB(side, alpha, a, b, cntx, cntl); } } } } else { if(bli_obj_has_trans(a)) { if(dt == BLIS_DOUBLE) { if(bli_obj_is_upper(a)) { if(bli_obj_has_unit_diag(a)) return bli_dtrsm_small_XAutB_unitDiag(side, alpha, a, b, cntx, cntl); else return bli_dtrsm_small_XAutB(side, alpha, a, b, cntx, cntl); } else { if(bli_obj_has_unit_diag(a)) return bli_dtrsm_small_XAltB_unitDiag(side, alpha, a, b, cntx, cntl); else return bli_dtrsm_small_XAltB(side, alpha, a, b, cntx, cntl); } } else { if(bli_obj_is_upper(a)) { 
//return bli_strsm_small_XAutB(side, alpha, a, b, cntx, cntl); return BLIS_NOT_YET_IMPLEMENTED; } else { return bli_strsm_small_XAltB(side, alpha, a, b, cntx, cntl); } } } else { if(dt == BLIS_DOUBLE) { if(bli_obj_is_upper(a)) { if(bli_obj_has_unit_diag(a)) return bli_dtrsm_small_XAuB_unitDiag(side, alpha, a, b, cntx, cntl); else return bli_dtrsm_small_XAuB(side, alpha, a, b, cntx, cntl); } else { if(bli_obj_has_unit_diag(a)) return bli_dtrsm_small_XAlB_unitDiag(side, alpha, a, b, cntx, cntl); else return bli_dtrsm_small_XAlB(side, alpha, a, b, cntx, cntl); } } else { if(bli_obj_is_upper(a)) { //return bli_strsm_small_XAuB(side, alpha, a, b, cntx, cntl); return BLIS_NOT_YET_IMPLEMENTED; } else { //return bli_strsm_small_XAlB(side, alpha, a, b, cntx, cntl); return BLIS_NOT_YET_IMPLEMENTED; } } } } return BLIS_NOT_YET_IMPLEMENTED; }; /* TRSM scalar code for the case AX = alpha * B * A is lower-triangular, non-unit-diagonal, no transpose * Dimensions: A: mxm X: mxn B:mxn */ static err_t dtrsm_small_AlXB ( double *A, double *B, dim_t M, dim_t N, dim_t lda, dim_t ldb ) { dim_t i, j, k; for (k = 0; k < M; k++) { double lkk_inv = 1.0/A[k+k*lda]; for (j = 0; j < N; j++) { B[k + j*ldb] *= lkk_inv; for (i = k+1; i < M; i++) { B[i + j*ldb] -= A[i + k*lda] * B[k + j*ldb]; } } }// k -loop return BLIS_SUCCESS; }// end of function /* TRSM scalar code for the case AX = alpha * B * A is lower-triangular, unit-diagonal, no transpose * Dimensions: A: mxm X: mxn B:mxn */ static err_t dtrsm_small_AlXB_unitDiag ( double *A, double *B, dim_t M, dim_t N, dim_t lda, dim_t ldb ) { dim_t i, j, k; for (k = 0; k < M; k++) { for (j = 0; j < N; j++) { for (i = k+1; i < M; i++) { B[i + j*ldb] -= A[i + k*lda] * B[k + j*ldb]; } } } return BLIS_SUCCESS; }// end of function /* TRSM scalar code for the case XA = alpha * B * A is upper-triangular, non-unit-diagonal no transpose * Dimensions: X:mxn A:nxn B:mxn */ static err_t dtrsm_small_XAuB ( double *A, double *B, dim_t M, dim_t N, dim_t lda, dim_t 
ldb ) { dim_t i, j, k; for(k = 0; k < N; k++) { double lkk_inv = 1.0/A[k+k*lda]; for(i = 0; i < M; i++) { B[i+k*ldb] *= lkk_inv; for(j = k+1; j < N; j++) { B[i+j*ldb] -= B[i+k*ldb] * A[k+j*lda]; } } } return BLIS_SUCCESS; } /* TRSM scalar code for the case XA = alpha * B * A is lower-triangular, non-unit triangular, no transpose * Dimensions: X:mxn A:nxn B:mxn */ static err_t dtrsm_small_XAlB ( double *A, double *B, double alpha, dim_t M, dim_t N, dim_t lda, dim_t ldb ) { dim_t i, j, k; for(j = 0; j < N; j++) for(i = 0; i < M; i++) B[i+j*ldb] *= alpha; for(k = N;k--;) { double lkk_inv = 1.0/A[(k)+(k)*lda]; for(i = M;i--;) { B[(i)+(k)*ldb] *= lkk_inv; for(j = k;j--;) { B[(i)+(j)*ldb] -= B[(i)+(k)*ldb] * A[(k)+(j)*lda]; } } } return BLIS_SUCCESS; } /* TRSM scalar code for the case XA = alpha * B * A is lower-triangular, unit-diagonal, no transpose *Dimensions: X:mxn A:nxn B:mxn */ static err_t dtrsm_small_XAlB_unitDiag( double *A, double *B, double alpha, dim_t M, dim_t N, dim_t lda, dim_t ldb ) { dim_t i, j, k; for(j = 0 ; j < N; j++) for(i = 0; i < M; i++) B[i+j*ldb] *= alpha; double A_k_j; for(k = N; k--;) { for(j = k; j--;) { A_k_j = A[(k)+(j)*lda]; for(i = M; i--;) { B[(i)+(j)*ldb] -= B[(i)+(k)*ldb] * A_k_j; } } } return BLIS_SUCCESS; } /* TRSM scalar code for the case XA = alpha * B *A is upper-triangular, non-unit-diagonal, A is transposed * Dimensions: X:mxn A:nxn B:mxn */ static err_t dtrsm_small_XAutB ( double *A, double *B, double alpha, dim_t M, dim_t N, dim_t lda, dim_t ldb ) { dim_t i, j, k; for(j = 0; j < N; j++) for(i = 0; i < M; i++) B[i+j*ldb] *=alpha; for(k = N; k--;) { double lkk_inv = 1.0/A[(k)+(k)*lda]; for(i = M; i--;) { B[(i)+(k)*ldb] *= lkk_inv; for(j = k; j--;) { B[(i)+(j)*ldb] -= B[(i)+(k)*ldb] * A[(j)+(k)*lda]; } } } return BLIS_SUCCESS; } /* TRSM scalar code for the case XA = alpha * B * A is upper-triangular, unit-diagonal, A has to be transposed * Dimensions: X:mxn A:nxn B:mxn */ static err_t dtrsm_small_XAutB_unitDiag( double *A, 
double *B, double alpha, dim_t M, dim_t N, dim_t lda, dim_t ldb ) { dim_t i, j, k; double A_k_j; for(j = 0; j< N; j++) for(i = 0; i< M; i++) B[i+j*ldb] *= alpha; for(k = N; k--;) { for(j = k; j--;) { A_k_j = A[(j)+(k)*lda]; for(i = M; i--;) { B[(i)+(j)*ldb] -= B[(i)+(k)*ldb] * A_k_j; } } } return BLIS_SUCCESS; } /* TRSM scalar code for the case XA = alpha * B * A is lower-triangular, non-unit-diagonal, A has to be transposed * Dimensions: X:mxn A:nxn B:mxn */ static err_t dtrsm_small_XAltB ( double *A, double *B, dim_t M, dim_t N, dim_t lda, dim_t ldb ) { dim_t i, j, k; for(k = 0; k < N; k++) { double lkk_inv = 1.0/A[k+k*lda]; for(i = 0; i < M; i++) { B[i+k*ldb] *= lkk_inv; for(j = k+1; j < N; j++) { B[i+j*ldb] -= B[i+k*ldb] * A[j+k*lda]; } } } return BLIS_SUCCESS; } /* TRSM scalar code for XA = alpha * B * A is lower-triangular, unit-diagonal, A has to be transposed * Dimensions: X:mxn A:nxn B:mxn */ static err_t dtrsm_small_XAltB_unitDiag( double *A, double *B, dim_t M, dim_t N, dim_t lda, dim_t ldb ) { dim_t i, j, k; for(k = 0; k < N; k++) { for(i = 0; i < M; i++) { for(j = k+1; j < N; j++) { B[i+j*ldb] -= B[i+k*ldb] * A[j+k*lda]; } } } return BLIS_SUCCESS; } /* TRSM scalar code for the case XA = alpha * B * A is upper-triangular, unit-diagonal, no transpose * Dimensions: X:mxn A:nxn B:mxn */ static err_t dtrsm_small_XAuB_unitDiag ( double *A, double *B, dim_t M, dim_t N, dim_t lda, dim_t ldb ) { dim_t i, j, k; for(k = 0; k < N; k++) { for(i = 0; i < M; i++) { for(j = k+1; j < N; j++) { B[i+j*ldb] -= B[i+k*ldb] * A[k+j*lda]; } } } return BLIS_SUCCESS; } /* TRSM for the case AX = alpha * B, Double precision * A is lower-triangular, no-transpose, non-unit diagonal * dimensions A: mxm X: mxn B: mxn b01---> * ***************** ** * * * * * * * * * * * * * * *b01* * * * * * * * * * * a10 ****** b11 ***************** | * * * | * * * * * | * * * | * * * * * | *a10*a11* | *b11* * * * v * * * v * * * * * *********** ***************** * * * * * * * * * * * * * * * * * * * 
* * * * * * * * * * * * * * * * * **************** ***************** a11---> */ static err_t bli_dtrsm_small_AlXB( side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ) { dim_t D_MR = 4; //size of block along 'M' dimpension dim_t D_NR = 8; //size of block along 'N' dimension dim_t m = bli_obj_length(b); // number of rows of matrix B dim_t n = bli_obj_width(b); // number of columns of matrix B #ifdef BLIS_ENABLE_SMALL_MATRIX_ROME if((m>D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M && n>D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME) || (m> D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME && n>D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N) || (m>D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M && n D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_NAPLES) { return BLIS_NOT_YET_IMPLEMENTED; } #endif dim_t m_remainder = m & 3; //number of remainder rows dim_t n_remainder = n & 7; //number of remainder columns dim_t cs_a = bli_obj_col_stride(a); // column stride of A dim_t cs_b = bli_obj_col_stride(b); // column stride of B dim_t i, j, k; //loop variables dim_t k_iter; //number of times GEMM to be performed double AlphaVal = *(double *)AlphaObj->buffer; //value of alpha double *L = a->buffer; //pointer to matrix A double *B = b->buffer; //pointer to matrix B double *a10, *a11, *b01, *b11; //pointers that point to blocks for GEMM and TRSM double *ptr_b01_dup; double f_t[4] __attribute__((aligned(64)));//buffer to store corner column when m_remainder !=0 double* f_temp; double ones = 1.0; //scratch registers __m256d ymm0, ymm1, ymm2, ymm3; __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m256d ymm12, ymm13, ymm14, ymm15; __m256d ymm16; for(j = 0; j+D_NR-1 < n; j += D_NR) //loop along 'N' dimension { for(i = 0;i+D_MR-1 < m; i += D_MR) //loop along 'M' dimension { a10 = L +i; //pointer to block of A to be used for GEMM a11 = L + i + (i*cs_a); //pointer to block of A to be used for TRSM b01 = B + j*cs_b; //pointer to block of B to be 
used for GEMM b11 = B + i + j* cs_b; //pointer to block of B to be used for TRSM k_iter = i / D_MR; //number of times GEMM to be performed(in blocks of 4x4) ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); ///GEMM code begins/// for(k = 0; k< k_iter; k++) //loop for number of GEMM operations { ptr_b01_dup = b01; ymm16 = _mm256_loadu_pd((double const *)(a10));//A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[0][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[0][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[0][2] ymm7 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[0][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[0][4] ymm1 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[0][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[0][6] ymm3 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 7)); //B01[0][7] b01 += 1; //mobe to next row of B ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm9 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm10 += (B01[0][2]*A10[0][0] B01[0][2]*A10[1][0] B01[0][2]*A10[2][0] B01[0][2]*A10[3][0]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm11 += (B01[0][3]*A10[0][0] B01[0][3]*A10[1][0] B01[0][3]*A10[2][0] B01[0][3]*A10[3][0]) ymm12 = _mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm12 += (B01[0][4]*A10[0][0] B01[0][4]*A10[1][0] B01[0][4]*A10[2][0] B01[0][4]*A10[3][0]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, ymm13); //ymm13 += (B01[0][5]*A10[0][0] B01[0][5]*A10[1][0] 
B01[0][5]*A10[2][0] B01[0][5]*A10[3][0]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm14 += (B01[0][6]*A10[0][0] B01[0][6]*A10[1][0] B01[0][6]*A10[2][0] B01[0][6]*A10[3][0]) ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm15 += (B01[0][7]*A10[0][0] B01[0][7]*A10[1][0] B01[0][7]*A10[2][0] B01[0][7]*A10[3][0]) ymm16 = _mm256_loadu_pd((double const *)(a10 + cs_a));//A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[1][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[1][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[1][2] ymm7 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[1][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[1][4] ymm1 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[1][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[1][6] ymm3 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 7)); //B01[1][7] b01 += 1; //mobe to next row of B ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm9 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm10 += (B01[1][2]*A10[0][1] B01[1][2]*A10[1][1] B01[1][2]*A10[2][1] B01[1][2]*A10[3][1]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm11 += (B01[1][3]*A10[0][1] B01[1][3]*A10[1][1] B01[1][3]*A10[2][1] B01[1][3]*A10[3][1]) ymm12 = _mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm12 += (B01[1][4]*A10[0][1] B01[1][4]*A10[1][1] B01[1][4]*A10[2][1] B01[1][4]*A10[3][1]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, ymm13); //ymm13 += (B01[1][5]*A10[0][1] B01[1][5]*A10[1][1] B01[1][5]*A10[2][1] B01[1][5]*A10[3][1]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm14 += (B01[1][6]*A10[0][1] B01[1][6]*A10[1][1] B01[1][6]*A10[2][1] B01[1][6]*A10[3][1]) 
ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm15 += (B01[1][7]*A10[0][1] B01[1][7]*A10[1][1] B01[1][7]*A10[2][1] B01[1][7]*A10[3][1]) ymm16 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2));//A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[2][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[2][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[2][2] ymm7 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[2][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[2][4] ymm1 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[2][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[2][6] ymm3 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 7)); //B01[2][7] b01 += 1; //mobe to next row of B ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm9 += (B01[2][1]*A10[0][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm10 += (B01[2][2]*A10[0][2] B01[2][2]*A10[1][2] B01[2][2]*A10[2][2] B01[2][2]*A10[3][2]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm11 += (B01[2][3]*A10[0][2] B01[2][3]*A10[1][2] B01[2][3]*A10[2][2] B01[2][3]*A10[3][2]) ymm12 = _mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm12 += (B01[2][4]*A10[0][2] B01[2][4]*A10[1][2] B01[2][4]*A10[2][2] B01[2][4]*A10[3][2]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, ymm13); //ymm13 += (B01[2][5]*A10[0][2] B01[2][5]*A10[1][2] B01[2][5]*A10[2][2] B01[2][5]*A10[3][2]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm14 += (B01[2][6]*A10[0][2] B01[2][6]*A10[1][2] B01[2][6]*A10[2][2] B01[2][6]*A10[3][2]) ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm15 += (B01[2][7]*A10[0][2] B01[2][7]*A10[1][2] B01[2][7]*A10[2][2] B01[2][7]*A10[3][2]) ymm16 = _mm256_loadu_pd((double const 
*)(a10 + cs_a * 3));//A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[3][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[3][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[3][2] ymm7 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[3][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[3][4] ymm1 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[3][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[3][6] ymm3 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 7)); //B01[3][7] b01 += 1; //mobe to next row of B ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[3][0]*A10[0][3] B01[3][0]*A10[3][0] B01[3][0]*A10[2][3] B01[3][0]*A10[3][0]) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm9 += (B01[3][1]*A10[0][3] B01[3][1]*A10[3][0] B01[3][1]*A10[2][3] B01[3][1]*A10[3][0]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm10 += (B01[3][2]*A10[0][3] B01[3][2]*A10[3][0] B01[3][2]*A10[2][3] B01[3][2]*A10[3][0]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm11 += (B01[3][3]*A10[0][3] B01[3][3]*A10[3][0] B01[3][3]*A10[2][3] B01[3][3]*A10[3][0]) ymm12 = _mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm12 += (B01[3][4]*A10[0][3] B01[3][4]*A10[3][0] B01[3][4]*A10[2][3] B01[3][4]*A10[3][3]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, ymm13); //ymm13 += (B01[3][5]*A10[0][3] B01[3][5]*A10[3][0] B01[3][5]*A10[2][3] B01[3][5]*A10[3][3]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm14 += (B01[3][6]*A10[0][3] B01[3][6]*A10[3][0] B01[3][6]*A10[2][3] B01[3][6]*A10[3][3]) ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm15 += (B01[3][7]*A10[0][3] B01[3][7]*A10[3][0] B01[3][7]*A10[2][3] B01[3][7]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to calculate next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to calculate next block of B for GEMM } ymm16 = _mm256_broadcast_sd((double const *)&AlphaVal); 
//register to hold alpha ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b *0)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b *1)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b *2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b *3)); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm4 = _mm256_loadu_pd((double const *)(b11 + cs_b *4)); //B11[0][4] B11[1][4] B11[2][4] B11[3][4] ymm5 = _mm256_loadu_pd((double const *)(b11 + cs_b *5)); //B11[0][5] B11[1][5] B11[2][5] B11[3][5] ymm6 = _mm256_loadu_pd((double const *)(b11 + cs_b *6)); //B11[0][6] B11[1][6] B11[2][6] B11[3][6] ymm7 = _mm256_loadu_pd((double const *)(b11 + cs_b *7)); //B11[0][7] B11[1][7] B11[2][7] B11[3][7] ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); //B11[0-3][0] * alpha -= B01[0-3][0] ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9); //B11[0-3][1] * alpha -= B01[0-3][1] ymm2 = _mm256_fmsub_pd(ymm2, ymm16, ymm10); //B11[0-3][2] * alpha -= B01[0-3][2] ymm3 = _mm256_fmsub_pd(ymm3, ymm16, ymm11); //B11[0-3][3] * alpha -= B01[0-3][3] ymm4 = _mm256_fmsub_pd(ymm4, ymm16, ymm12); //B11[0-3][4] * alpha -= B01[0-3][4] ymm5 = _mm256_fmsub_pd(ymm5, ymm16, ymm13); //B11[0-3][5] * alpha -= B01[0-3][5] ymm6 = _mm256_fmsub_pd(ymm6, ymm16, ymm14); //B11[0-3][6] * alpha -= B01[0-3][6] ymm7 = _mm256_fmsub_pd(ymm7, ymm16, ymm15); //B11[0-3][7] * alpha -= B01[0-3][7] ///implement TRSM/// ///transpose of B11// ///unpacklow/// ymm9 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm11 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] ymm13 = _mm256_unpacklo_pd(ymm4, ymm5); //B11[0][4] B11[0][5] B11[2][4] B11[2][5] ymm15 = _mm256_unpacklo_pd(ymm6, ymm7); //B11[0][6] B11[0][7] B11[2][6] B11[2][7] //rearrange low elements ymm8 = _mm256_permute2f128_pd(ymm9,ymm11,0x20); //B11[0][0] B11[0][1] B11[0][2] B11[0][3] ymm10 = 
_mm256_permute2f128_pd(ymm9,ymm11,0x31); //B11[2][0] B11[2][1] B11[2][2] B11[2][3] ymm12 = _mm256_permute2f128_pd(ymm13,ymm15,0x20); //B11[4][0] B11[4][1] B11[4][2] B11[4][3] ymm14 = _mm256_permute2f128_pd(ymm13,ymm15,0x31); //B11[6][0] B11[6][1] B11[6][2] B11[6][3] ////unpackhigh//// ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] ymm4 = _mm256_unpackhi_pd(ymm4, ymm5); //B11[1][4] B11[1][5] B11[3][4] B11[3][5] ymm5 = _mm256_unpackhi_pd(ymm6, ymm7); //B11[1][6] B11[1][7] B11[3][6] B11[3][7] //rearrange high elements ymm9 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] B11[1][2] B11[1][3] ymm11 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] B11[3][1] B11[3][2] B11[3][3] ymm13 = _mm256_permute2f128_pd(ymm4,ymm5,0x20); //B11[5][0] B11[5][1] B11[5][2] B11[5][3] ymm15 = _mm256_permute2f128_pd(ymm4,ymm5,0x31); //B11[7][0] B11[7][1] B11[7][2] B11[7][3] ymm0 = _mm256_broadcast_sd((double const *)&ones); //broadcast diagonal elements of A11 ymm1 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] ymm2 = _mm256_broadcast_sd((double const *)(a11+ cs_a +1)); //A11[1][1] ymm3 = _mm256_broadcast_sd((double const *)(a11+cs_a*2 + 2)); //A11[2][2] ymm4 = _mm256_broadcast_sd((double const *)(a11+cs_a*3 + 3)); //A11[3][3] ymm5 = _mm256_unpacklo_pd(ymm1, ymm2); //A11[0][0] A11[0][0] A11[1][1] A11[1][1] ymm6 = _mm256_unpacklo_pd(ymm3, ymm4); //A11[2][2] A11[2][2] A11[3][3] A11[3][3] ymm5 = _mm256_blend_pd(ymm5, ymm6, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm0 = _mm256_div_pd(ymm0, ymm5); //1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2] //extract a00 ymm1 = _mm256_permute_pd(ymm0, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2] ymm1 = _mm256_permute2f128_pd(ymm1, ymm1, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0] //(Row 0): perform mul operation of reciprocal of L(0,0) element with 1st row elements of B ymm8 = 
_mm256_mul_pd(ymm8, ymm1); //B11[0-3][0] /= A11[0][0] ymm12 = _mm256_mul_pd(ymm12, ymm1); //B11[0-3][4] /= A11[0][0] //extract a11 ymm1 = _mm256_permute_pd(ymm0, 0x03); //1/A11[1][1] 1/A11[1][1] 1/A11[3][3] 1/A11[3][3] ymm1 = _mm256_permute2f128_pd(ymm1, ymm1, 0x00); //1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1] ymm2 = _mm256_broadcast_sd((double const *)(a11 +1)); //A11[1][0] ymm3 = _mm256_broadcast_sd((double const *)(a11 +2)); //A11[2][0] ymm4 = _mm256_broadcast_sd((double const *)(a11 +3)); //A11[3][0] a11 += cs_a; //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm2, ymm8, ymm9); //B11[1][0-3] -= A11[1][0] * B11[0-3][0] ymm10 = _mm256_fnmadd_pd(ymm3, ymm8, ymm10); //B11[2][0-3] -= A11[2][0] * B11[0-3][0] ymm11 = _mm256_fnmadd_pd(ymm4, ymm8, ymm11); //B11[3][0-3] -= A11[3][0] * B11[0-3][0] ymm13 = _mm256_fnmadd_pd(ymm2, ymm12, ymm13); //B11[5][0-3] -= A11[1][0] * B11[0-3][4] ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14); //B11[6][0-3] -= A11[2][0] * B11[0-3][4] ymm15 = _mm256_fnmadd_pd(ymm4, ymm12, ymm15); //B11[7][0-3] -= A11[3][0] * B11[0-3][4] ymm9 = _mm256_mul_pd(ymm9, ymm1); //B11[0-3][1] /= A11[1][1] ymm13 = _mm256_mul_pd(ymm13, ymm1); //B11[0-3][5] /= A11[1][1] ymm3 = _mm256_broadcast_sd((double const *)(a11 +2)); //A11[2][1] ymm4 = _mm256_broadcast_sd((double const *)(a11 +3)); //A11[3][1] a11 += cs_a; //extract a22 ymm1 = _mm256_permute_pd(ymm0, 0x00); //1/A11[0][0] 1/A110[][0] 1/A11[2][2] 1/A11[2][2] ymm1 = _mm256_permute2f128_pd(ymm1, ymm1, 0x11); //1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2] //(ROw2): FMA operations ymm10 = _mm256_fnmadd_pd(ymm3, ymm9, ymm10); //B11[2][0-3] -= A11[2][1] * B11[0-3][1] ymm11 = _mm256_fnmadd_pd(ymm4, ymm9, ymm11); //B11[3][0-3] -= A11[3][1] * B11[0-3][1] ymm14 = _mm256_fnmadd_pd(ymm3, ymm13, ymm14); //B11[6][0-3] -= A11[2][1] * B11[0-3][5] ymm15 = _mm256_fnmadd_pd(ymm4, ymm13, ymm15); //B11[7][0-3] -= A11[3][1] * B11[0-3][5] //perform mul operation ymm10 = _mm256_mul_pd(ymm10, ymm1); //B11[0-3][2] /= A11[2][2] 
ymm14 = _mm256_mul_pd(ymm14, ymm1); //B11[0-3][6] /= A11[2][2] ymm4 = _mm256_broadcast_sd((double const *)(a11 +3)); //A11[3][2] a11 += cs_a; //extract a33 ymm1 = _mm256_permute_pd(ymm0, 0x0C); //1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3] ymm1 = _mm256_permute2f128_pd(ymm1, ymm1, 0x11);//1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3] //(ROw2): FMA operations ymm11 = _mm256_fnmadd_pd(ymm4, ymm10, ymm11); //B11[3][0-3] -= A11[3][2] * B11[0-3][2] ymm15 = _mm256_fnmadd_pd(ymm4, ymm14, ymm15); //B11[7][0-3] -= A11[3][2] * B11[0-3][6] //perform mul operation ymm11 = _mm256_mul_pd(ymm11, ymm1); //B11[0-3][3] /= A11[3][3] ymm15 = _mm256_mul_pd(ymm15, ymm1); //B11[0-3][7] /= A11[3][3] //unpacklow// ymm1 = _mm256_unpacklo_pd(ymm8, ymm9); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm3 = _mm256_unpacklo_pd(ymm10, ymm11); //B11[2][0] B11[3][0] B11[2][2] B11[3][2] ymm5 = _mm256_unpacklo_pd(ymm12, ymm13); //B11[4][0] B11[5][0] B11[4][2] B11[5][2] ymm7 = _mm256_unpacklo_pd(ymm14, ymm15); //B11[6][0] B11[7][0] B11[6][2] B11[7][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1, ymm3, 0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1, ymm3, 0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm4 = _mm256_permute2f128_pd(ymm5, ymm7, 0x20); //B11[4][0] B11[5][0] B11[6][0] B11[7][0] ymm6 = _mm256_permute2f128_pd(ymm5, ymm7, 0x31); //B11[4][2] B11[5][2] B11[6][2] B11[7][2] ///unpack high/// ymm8 = _mm256_unpackhi_pd(ymm8, ymm9); //B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm9 = _mm256_unpackhi_pd(ymm10, ymm11); //B11[2][1] B11[3][1] B11[2][3] B11[3][3] ymm12 = _mm256_unpackhi_pd(ymm12, ymm13); //B11[4][1] B11[5][1] B11[4][3] B11[5][3] ymm13 = _mm256_unpackhi_pd(ymm14, ymm15); //B11[6][1] B11[7][1] B11[6][3] B11[7][3] //rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm8, ymm9, 0x20); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm8, ymm9, 0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm5 = 
_mm256_permute2f128_pd(ymm12, ymm13, 0x20); //B11[4][1] B11[5][1] B11[6][1] B11[7][1] ymm7 = _mm256_permute2f128_pd(ymm12, ymm13, 0x31); //B11[4][3] B11[5][3] B11[6][3] B11[7][3] _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0); //store B11[0][0-3] _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1); //store B11[1][0-3] _mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2); //store B11[2][0-3] _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3); //store B11[3][0-3] _mm256_storeu_pd((double *)(b11 + cs_b * 4), ymm4); //store B11[4][0-3] _mm256_storeu_pd((double *)(b11 + cs_b * 5), ymm5); //store B11[5][0-3] _mm256_storeu_pd((double *)(b11 + cs_b * 6), ymm6); //store B11[6][0-3] _mm256_storeu_pd((double *)(b11 + cs_b * 7), ymm7); //store B11[7][0-3] } if(m_remainder) //implementation for reamainder rows(when 'M' is not a multiple of D_MR) { a10 = L +i; //pointer to block of A to be used for GEMM a11 = L + i + (i*cs_a); //pointer to block of A to be used for TRSM b01 = B + j*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j* cs_b; //pointer to block of B to be used for TRSM k_iter = i / D_MR; //number of times GEMM operation to be done(in blocks of 4x4) int iter; if((j+D_NR) == n) { for(iter = 0; iter < m_remainder; iter++) f_t[iter] = (b11 + cs_b * 7)[iter]; f_temp = f_t; } else f_temp = (b11 + cs_b * 7); ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); ///GEMM code Begins/// for(k = 0; k< k_iter; k++) //loop for number of GEMM operations { ptr_b01_dup = b01; ymm16 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[0][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[0][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[0][2] ymm7 = 
_mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[0][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[0][4] ymm1 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[0][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[0][6] ymm3 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 7)); //B01[0][7] b01 += 1; //move to next row of B ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0] ) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm9 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm10 += (B01[0][2]*A10[0][0] B01[0][2]*A10[1][0] B01[0][2]*A10[2][0] B01[0][2]*A10[3][0]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm11 += (B01[0][3]*A10[0][0] B01[0][3]*A10[1][0] B01[0][3]*A10[2][0] B01[0][3]*A10[3][0]) ymm12 = _mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm12 += (B01[0][4]*A10[0][0] B01[0][4]*A10[1][0] B01[0][4]*A10[2][0] B01[0][4]*A10[3][0]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, ymm13); //ymm13 += (B01[0][5]*A10[0][0] B01[0][5]*A10[1][0] B01[0][5]*A10[2][0] B01[0][5]*A10[3][0]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm14 += (B01[0][6]*A10[0][0] B01[0][6]*A10[1][0] B01[0][6]*A10[2][0] B01[0][6]*A10[3][0]) ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm16 += (B01[0][7]*A10[0][0] B01[0][7]*A10[1][0] B01[0][7]*A10[2][0] B01[0][7]*A10[3][0]) ymm16 = _mm256_loadu_pd((double const *)(a10 + cs_a * 1)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[1][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[1][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[1][2] ymm7 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[1][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[1][4] ymm1 = 
_mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[1][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[1][6] ymm3 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 7)); //B01[1][7] b01 += 1; //move to next row of B01 ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm9 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm10 += (B01[1][2]*A10[0][1] B01[1][2]*A10[1][1] B01[1][2]*A10[2][1] B01[1][2]*A10[3][1]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm11 += (B01[1][3]*A10[0][1] B01[1][3]*A10[1][1] B01[1][3]*A10[2][1] B01[1][3]*A10[3][1]) ymm12 = _mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm12 += (B01[1][4]*A10[0][1] B01[1][4]*A10[1][1] B01[1][4]*A10[2][1] B01[1][4]*A10[3][1]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, ymm13); //ymm13 += (B01[1][5]*A10[0][1] B01[1][5]*A10[1][1] B01[1][5]*A10[2][1] B01[1][5]*A10[3][1]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm14 += (B01[1][6]*A10[0][1] B01[1][6]*A10[1][1] B01[1][6]*A10[2][1] B01[1][6]*A10[3][1]) ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm15 += (B01[1][7]*A10[0][1] B01[1][7]*A10[1][1] B01[1][7]*A10[2][1] B01[1][7]*A10[3][1]) ymm16 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] //A10[1][2] A10[2][2] A10[3][2] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[2][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[2][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[2][2] ymm7 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[2][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[2][4] ymm1 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[2][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[2][6] ymm3 = 
_mm256_broadcast_sd((double const *)(b01 + cs_b * 7)); //B01[2][7] b01 += 1; //move to next row of B ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm9 += (B01[2][1]*A10[0][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm10 += (B01[2][2]*A10[0][2] B01[2][2]*A10[1][2] B01[2][2]*A10[2][2] B01[2][2]*A10[3][2]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm11 += (B01[2][3]*A10[0][2] B01[2][3]*A10[1][2] B01[2][3]*A10[2][2] B01[2][3]*A10[3][2]) ymm12 = _mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm12 += (B01[2][4]*A10[0][2] B01[2][4]*A10[1][2] B01[2][4]*A10[2][2] B01[2][0]*A10[3][2]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, ymm13); //ymm13 += (B01[2][5]*A10[0][2] B01[2][5]*A10[1][2] B01[2][5]*A10[2][2] B01[2][1]*A10[3][2]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm14 += (B01[2][6]*A10[0][2] B01[2][6]*A10[1][2] B01[2][6]*A10[2][2] B01[2][2]*A10[3][2]) ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm15 += (B01[2][7]*A10[0][2] B01[2][7]*A10[1][2] B01[2][7]*A10[2][2] B01[2][3]*A10[3][2]) ymm16 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[3][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[3][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[3][2] ymm7 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[3][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[3][4] ymm1 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[3][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[3][6] ymm3 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 7)); //B01[3][7] b01 += 1; //move to next row of B ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[3][0]*A10[0][3] 
B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm8 += (B01[3][1]*A10[0][3] B01[3][1]*A10[1][3] B01[3][1]*A10[2][3] B01[3][1]*A10[3][3]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm8 += (B01[3][2]*A10[0][3] B01[3][2]*A10[1][3] B01[3][2]*A10[2][3] B01[3][2]*A10[3][3]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm8 += (B01[3][3]*A10[0][3] B01[3][3]*A10[1][3] B01[3][3]*A10[2][3] B01[3][3]*A10[3][3]) ymm12 = _mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm8 += (B01[3][0]*A10[0][3] B01[3][4]*A10[1][3] B01[3][4]*A10[2][3] B01[3][4]*A10[3][3]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, ymm13); //ymm8 += (B01[3][1]*A10[0][3] B01[3][5]*A10[1][3] B01[3][5]*A10[2][3] B01[3][5]*A10[3][3]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm8 += (B01[3][2]*A10[0][3] B01[3][6]*A10[1][3] B01[3][6]*A10[2][3] B01[3][6]*A10[3][3]) ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm8 += (B01[3][3]*A10[0][3] B01[3][7]*A10[1][3] B01[3][7]*A10[2][3] B01[3][7]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ///GEMM code ends/// ymm16 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha value ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b *0)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b *1)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b *2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b *3)); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm4 = _mm256_loadu_pd((double const *)(b11 + cs_b *4)); //B11[0][4] B11[1][4] B11[2][4] B11[3][4] ymm5 = _mm256_loadu_pd((double const *)(b11 + cs_b *5)); //B11[0][5] B11[1][5] B11[2][5] B11[3][5] ymm6 = _mm256_loadu_pd((double const *)(b11 + cs_b *6)); //B11[0][6] B11[1][6] B11[2][6] B11[3][6] ymm7 = _mm256_loadu_pd((double 
const *)(f_temp)); //B11[0][7] B11[1][7] B11[2][7] B11[3][7] ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); //B11[0-3][0] *alpha -= B01[0-3][0] ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9); //B11[0-3][1] *alpha -= B01[0-3][1] ymm2 = _mm256_fmsub_pd(ymm2, ymm16, ymm10); //B11[0-3][2] *alpha -= B01[0-3][2] ymm3 = _mm256_fmsub_pd(ymm3, ymm16, ymm11); //B11[0-3][3] *alpha -= B01[0-3][3] ymm4 = _mm256_fmsub_pd(ymm4, ymm16, ymm12); //B11[0-3][4] *alpha -= B01[0-3][4] ymm5 = _mm256_fmsub_pd(ymm5, ymm16, ymm13); //B11[0-3][5] *alpha -= B01[0-3][5] ymm6 = _mm256_fmsub_pd(ymm6, ymm16, ymm14); //B11[0-3][6] *alpha -= B01[0-3][6] ymm7 = _mm256_fmsub_pd(ymm7, ymm16, ymm15); //B11[0-3][7] *alpha -= B01[0-3][7] if(3 == m_remainder) { ///implement TRSM/// ///unpacklow/// ymm9 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm11 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] ymm13 = _mm256_unpacklo_pd(ymm4, ymm5); //B11[0][4] B11[0][5] B11[1][4] B11[1][5] ymm15 = _mm256_unpacklo_pd(ymm6, ymm7); //B11[0][6] B11[0][7] B11[1][6] B11[1][7] //rearrange low elements ymm8 = _mm256_permute2f128_pd(ymm9,ymm11,0x20); //B11[0][0] B11[0][1] B11[0][2] B11[0][3] ymm10 = _mm256_permute2f128_pd(ymm9,ymm11,0x31); //B11[2][0] B11[2][1] B11[2][2] B11[2][3] ymm12 = _mm256_permute2f128_pd(ymm13,ymm15,0x20); //B11[4][0] B11[4][1] B11[4][2] B11[4][3] ymm14 = _mm256_permute2f128_pd(ymm13,ymm15,0x31); //B11[6][0] B11[6][1] B11[6][2] B11[6][3] ////unpackhigh//// ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] ymm4 = _mm256_unpackhi_pd(ymm4, ymm5); //B11[5][0] B11[5][1] B11[7][0] B11[7][1] ymm5 = _mm256_unpackhi_pd(ymm6, ymm7); //B11[5][2] B11[5][3] B11[7][2] B11[7][3] //rearrange high elements ymm9 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] B11[1][2] B11[1][3] ymm11 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] 
B11[3][1] B11[3][2] B11[3][3] ymm13 = _mm256_permute2f128_pd(ymm4,ymm5,0x20); //B11[5][0] B11[5][1] B11[5][2] B11[5][3] ymm15 = _mm256_permute2f128_pd(ymm4,ymm5,0x31); //B11[7][0] B11[7][1] B11[7][2] B11[7][3] ymm0 = _mm256_broadcast_sd((double const *)&ones); //broadcast diagonal elements of A11 ymm1 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] ymm2 = _mm256_broadcast_sd((double const *)(a11+ cs_a +1)); //A11[1][1] ymm3 = _mm256_broadcast_sd((double const *)(a11+cs_a*2 + 2)); //A11[2][2] ymm5 = _mm256_unpacklo_pd(ymm1, ymm2); //A11[0][0] A11[0][0] A11[1][1] A11[1][1] ymm6 = _mm256_unpacklo_pd(ymm3, ymm0); //A11[2][2] A11[2][2] A11[3][3] A11[3][3] ymm5 = _mm256_blend_pd(ymm5, ymm6, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm0 = _mm256_div_pd(ymm0, ymm5); //1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] //extract a00 ymm1 = _mm256_permute_pd(ymm0, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2] ymm1 = _mm256_permute2f128_pd(ymm1, ymm1, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0] //(Row 0): perform mul operation of reciprocal of L(0,0) element with 1st row elements of B ymm8 = _mm256_mul_pd(ymm8, ymm1); //B11[0-3][0] /= A11[0][0] ymm12 = _mm256_mul_pd(ymm12, ymm1); //B11[0-3][4] /= A11[0][0] //extract a11 ymm1 = _mm256_permute_pd(ymm0, 0x03); //1/A11[1][1] 1/A11[1][1] 1/A11[3][3] 1/A11[3][3] ymm1 = _mm256_permute2f128_pd(ymm1, ymm1, 0x00); //1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1] ymm2 = _mm256_broadcast_sd((double const *)(a11 +1)); //A11[1][0] ymm3 = _mm256_broadcast_sd((double const *)(a11 +2)); //A11[2][0] a11 += cs_a; //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm2, ymm8, ymm9); //B11[1][0-3] -= B11[0-3][0]*A11[1][0] ymm10 = _mm256_fnmadd_pd(ymm3, ymm8, ymm10); //B11[2][0-3] -= B11[0-3][0]*A11[2][0] ymm13 = _mm256_fnmadd_pd(ymm2, ymm12, ymm13); //B11[5][0-3] -= B11[0-3][4]*A11[1][4] ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14); //B11[6][0-3] -= B11[0-3][4]*A11[2][4] ymm9 = _mm256_mul_pd(ymm9, ymm1); 
//B11[0-3][1] /= A11[1][1] ymm13 = _mm256_mul_pd(ymm13, ymm1); //B11[0-3][5] /= A11[1][1] ymm3 = _mm256_broadcast_sd((double const *)(a11 +2)); //A11[2][1] a11 += cs_a; //extract a22 ymm1 = _mm256_permute_pd(ymm0, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2] ymm1 = _mm256_permute2f128_pd(ymm1, ymm1, 0x11); //1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2] //(ROw2): FMA operations ymm10 = _mm256_fnmadd_pd(ymm3, ymm9, ymm10); //B11[2][0-3] -= A11[2][1] * B11[0-3][1] ymm14 = _mm256_fnmadd_pd(ymm3, ymm13, ymm14); //B11[6][0-3] -= A11[2][1] * B11[0-3][5] //perform mul operation ymm10 = _mm256_mul_pd(ymm10, ymm1); //B11[0-3][2] /=A11[2][2] ymm14 = _mm256_mul_pd(ymm14, ymm1); //B11[0-3][6] /= A11[2][2] ymm11 = _mm256_broadcast_sd((double const *)(&ones)); ymm15 = _mm256_broadcast_sd((double const *)(&ones)); //unpacklow// ymm1 = _mm256_unpacklo_pd(ymm8, ymm9); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm3 = _mm256_unpacklo_pd(ymm10, ymm11); //B11[2][0] B11[3][0] B11[2][2] B11[3][2] ymm5 = _mm256_unpacklo_pd(ymm12, ymm13); //B11[4][0] B11[5][0] B11[4][2] B11[5][2] ymm7 = _mm256_unpacklo_pd(ymm14, ymm15); //B11[6][0] B11[7][0] B11[6][2] B11[7][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1, ymm3, 0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1, ymm3, 0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm4 = _mm256_permute2f128_pd(ymm5, ymm7, 0x20); //B11[0][4] B11[1][4] B11[2][4] B11[3][4] ymm6 = _mm256_permute2f128_pd(ymm5, ymm7, 0x31); //B11[0][6] B11[1][6] B11[2][6] B11[3][6] ///unpack high/// ymm8 = _mm256_unpackhi_pd(ymm8, ymm9); //B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm9 = _mm256_unpackhi_pd(ymm10, ymm11); //B11[2][1] B11[3][1] B11[2][3] B11[3][3] ymm12 = _mm256_unpackhi_pd(ymm12, ymm13); //B11[0][5] B11[1][5] B11[0][7] B11[1][7] ymm13 = _mm256_unpackhi_pd(ymm14, ymm15); //B11[2][5] B11[3][5] B11[2][7] B11[3][7] //rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm8, ymm9, 0x20); //B11[0][1] 
B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm8, ymm9, 0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm5 = _mm256_permute2f128_pd(ymm12, ymm13, 0x20); //B11[0][5] B11[1][5] B11[2][5] B11[3][5] ymm7 = _mm256_permute2f128_pd(ymm12, ymm13, 0x31); //B11[0][7] B11[1][7] B11[2][7] B11[3][7] ymm8 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0)); //load B11[0-3][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1)); //load B11[0-3][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //load B11[0-3][2] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //load B11[0-3][3] ymm12 = _mm256_loadu_pd((double const *)(b11 + cs_b * 4)); //load B11[0-3][4] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b * 5)); //load B11[0-3][5] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b * 6)); //load B11[0-3][6] ymm15 = _mm256_loadu_pd((double const *)(f_temp)); //load B11[0-3][7] //determine correct values to store ymm0 = _mm256_blend_pd(ymm0, ymm8, 0x08); ymm1 = _mm256_blend_pd(ymm1, ymm9, 0x08); ymm2 = _mm256_blend_pd(ymm2, ymm10, 0x08); ymm3 = _mm256_blend_pd(ymm3, ymm11, 0x08); ymm4 = _mm256_blend_pd(ymm4, ymm12, 0x08); ymm5 = _mm256_blend_pd(ymm5, ymm13, 0x08); ymm6 = _mm256_blend_pd(ymm6, ymm14, 0x08); ymm7 = _mm256_blend_pd(ymm7, ymm15, 0x08); } else if(2 == m_remainder) { ///implement TRSM/// ///unpacklow/// ymm9 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm11 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] ymm13 = _mm256_unpacklo_pd(ymm4, ymm5); //B11[0][4] B11[0][5] B11[1][4] B11[1][5] ymm15 = _mm256_unpacklo_pd(ymm6, ymm7); //B11[0][6] B11[0][7] B11[1][6] B11[1][7] //rearrange low elements ymm8 = _mm256_permute2f128_pd(ymm9,ymm11,0x20); //B11[0][0] B11[0][1] B11[0][2] B11[0][3] ymm10 = _mm256_permute2f128_pd(ymm9,ymm11,0x31); //B11[2][0] B11[2][1] B11[2][2] B11[2][3] ymm12 = _mm256_permute2f128_pd(ymm13,ymm15,0x20); //B11[4][0] B11[4][1] B11[4][2] B11[4][3] ymm14 = 
_mm256_permute2f128_pd(ymm13,ymm15,0x31); //B11[6][0] B11[6][1] B11[6][2] B11[6][3] ////unpackhigh//// ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] ymm4 = _mm256_unpackhi_pd(ymm4, ymm5); //B11[5][0] B11[5][1] B11[7][0] B11[7][1] ymm5 = _mm256_unpackhi_pd(ymm6, ymm7); //B11[5][2] B11[5][3] B11[7][2] B11[7][3] //rearrange high elements ymm9 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] B11[1][2] B11[1][3] ymm11 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] B11[3][1] B11[3][2] B11[3][3] ymm13 = _mm256_permute2f128_pd(ymm4,ymm5,0x20); //B11[5][0] B11[5][1] B11[5][2] B11[5][3] ymm15 = _mm256_permute2f128_pd(ymm4,ymm5,0x31); //B11[7][0] B11[7][1] B11[7][2] B11[7][3] ymm0 = _mm256_broadcast_sd((double const *)&ones); //broadcast diagonal elements of A11 ymm1 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] ymm2 = _mm256_broadcast_sd((double const *)(a11+ cs_a +1)); //A11[1][1] ymm5 = _mm256_unpacklo_pd(ymm1, ymm2); //A11[0][0] A11[0][0] A11[1][1] A11[1][1] ymm5 = _mm256_blend_pd(ymm5, ymm0, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm0 = _mm256_div_pd(ymm0, ymm5); //1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] //extract a00 ymm1 = _mm256_permute_pd(ymm0, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2] ymm1 = _mm256_permute2f128_pd(ymm1, ymm1, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0] //(Row 0): perform mul operation of reciprocal of L(0,0) element with 1st row elements of B ymm8 = _mm256_mul_pd(ymm8, ymm1); //B11[0-3][0] /= A11[0][0] ymm12 = _mm256_mul_pd(ymm12, ymm1); //B11[0-3][4] /= A11[0][0] //extract a11 ymm1 = _mm256_permute_pd(ymm0, 0x03); //1/A11[1][1] 1/A11[1][1] 1/A11[3][3] 1/A11[3][3] ymm1 = _mm256_permute2f128_pd(ymm1, ymm1, 0x00); //1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1] ymm2 = _mm256_broadcast_sd((double const *)(a11 +1)); //A11[1][0] a11 += cs_a; //(Row1): FMA 
operations ymm9 = _mm256_fnmadd_pd(ymm2, ymm8, ymm9); //B11[1][0-3] -= B11[0-3][0]*A11[1][0] ymm13 = _mm256_fnmadd_pd(ymm2, ymm12, ymm13); //B11[5][0-3] -= B11[0-3][4]*A11[1][4] ymm9 = _mm256_mul_pd(ymm9, ymm1); //B11[0-3][1] /= A11[1][1] ymm13 = _mm256_mul_pd(ymm13, ymm1); //B11[0-3][5] /= A11[1][1] ymm10 = _mm256_broadcast_sd((double const *)&ones); //unpacklow// ymm1 = _mm256_unpacklo_pd(ymm8, ymm9); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm5 = _mm256_unpacklo_pd(ymm12, ymm13); //B11[4][0] B11[5][0] B11[4][2] B11[5][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1, ymm10, 0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1, ymm10, 0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm4 = _mm256_permute2f128_pd(ymm5, ymm10, 0x20); //B11[0][4] B11[1][4] B11[2][4] B11[3][4] ymm6 = _mm256_permute2f128_pd(ymm5, ymm10, 0x31); //B11[0][6] B11[1][6] B11[2][6] B11[3][6] ///unpack high/// ymm8 = _mm256_unpackhi_pd(ymm8, ymm9); //B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm12 = _mm256_unpackhi_pd(ymm12, ymm13); //B11[0][5] B11[1][5] B11[0][7] B11[1][7] //rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm8, ymm10, 0x20); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm8, ymm10, 0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm5 = _mm256_permute2f128_pd(ymm12, ymm10, 0x20); //B11[0][5] B11[1][5] B11[2][5] B11[3][5] ymm7 = _mm256_permute2f128_pd(ymm12, ymm10, 0x31); //B11[0][7] B11[1][7] B11[2][7] B11[3][7] ymm8 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0)); //load B11[0-3][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1)); //load B11[0-3][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //load B11[0-3][2] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //load B11[0-3][3] ymm12 = _mm256_loadu_pd((double const *)(b11 + cs_b * 4)); //load B11[0-3][4] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b * 5)); //load B11[0-3][5] ymm14 = 
_mm256_loadu_pd((double const *)(b11 + cs_b * 6)); //load B11[0-3][6] ymm15 = _mm256_loadu_pd((double const *)(f_temp)); //load B11[0-3][7] //determine correct values to store ymm0 = _mm256_permute2f128_pd(ymm0, ymm8, 0x30); ymm1 = _mm256_permute2f128_pd(ymm1, ymm9, 0x30); ymm2 = _mm256_permute2f128_pd(ymm2, ymm10, 0x30); ymm3 = _mm256_permute2f128_pd(ymm3, ymm11, 0x30); ymm4 = _mm256_permute2f128_pd(ymm4, ymm12, 0x30); ymm5 = _mm256_permute2f128_pd(ymm5, ymm13, 0x30); ymm6 = _mm256_permute2f128_pd(ymm6, ymm14, 0x30); ymm7 = _mm256_permute2f128_pd(ymm7, ymm15, 0x30); } else if(1 == m_remainder) { ///implement TRSM/// ///unpacklow/// ymm9 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm11 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] ymm13 = _mm256_unpacklo_pd(ymm4, ymm5); //B11[0][4] B11[0][5] B11[1][4] B11[1][5] ymm15 = _mm256_unpacklo_pd(ymm6, ymm7); //B11[0][6] B11[0][7] B11[1][6] B11[1][7] //rearrange low elements ymm8 = _mm256_permute2f128_pd(ymm9,ymm11,0x20); //B11[0][0] B11[0][1] B11[0][2] B11[0][3] ymm10 = _mm256_permute2f128_pd(ymm9,ymm11,0x31); //B11[2][0] B11[2][1] B11[2][2] B11[2][3] ymm12 = _mm256_permute2f128_pd(ymm13,ymm15,0x20); //B11[4][0] B11[4][1] B11[4][2] B11[4][3] ymm14 = _mm256_permute2f128_pd(ymm13,ymm15,0x31); //B11[6][0] B11[6][1] B11[6][2] B11[6][3] ////unpackhigh//// ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] ymm4 = _mm256_unpackhi_pd(ymm4, ymm5); //B11[5][0] B11[5][1] B11[7][0] B11[7][1] ymm5 = _mm256_unpackhi_pd(ymm6, ymm7); //B11[5][2] B11[5][3] B11[7][2] B11[7][3] //rearrange high elements ymm9 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] B11[1][2] B11[1][3] ymm11 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] B11[3][1] B11[3][2] B11[3][3] ymm13 = _mm256_permute2f128_pd(ymm4,ymm5,0x20); //B11[5][0] B11[5][1] B11[5][2] B11[5][3] 
ymm15 = _mm256_permute2f128_pd(ymm4,ymm5,0x31); //B11[7][0] B11[7][1] B11[7][2] B11[7][3] ymm0 = _mm256_broadcast_sd((double const *)&ones); //broadcast diagonal elements of A11 ymm1 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] ymm0 = _mm256_div_pd(ymm0, ymm1); //1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] //extract a00 ymm1 = _mm256_permute_pd(ymm0, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2] ymm1 = _mm256_permute2f128_pd(ymm1, ymm1, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0] //(Row 0): perform mul operation of reciprocal of L(0,0) element with 1st row elements of B ymm8 = _mm256_mul_pd(ymm8, ymm1); //B11[0-3][0] /= A11[0][0] ymm12 = _mm256_mul_pd(ymm12, ymm1); //B11[0-3][4] /= A11[0][0] ymm9 = _mm256_broadcast_sd((double const *)(&ones)); //unpacklow// ymm1 = _mm256_unpacklo_pd(ymm8, ymm9); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm5 = _mm256_unpacklo_pd(ymm12, ymm9); //B11[4][0] B11[5][0] B11[4][2] B11[5][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1, ymm9, 0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1, ymm9, 0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm4 = _mm256_permute2f128_pd(ymm5, ymm9, 0x20); //B11[0][4] B11[1][4] B11[2][4] B11[3][4] ymm6 = _mm256_permute2f128_pd(ymm5, ymm9, 0x31); //B11[0][6] B11[1][6] B11[2][6] B11[3][6] ///unpack high/// ymm8 = _mm256_unpackhi_pd(ymm8, ymm9); //B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm12 = _mm256_unpackhi_pd(ymm12, ymm9); //B11[0][5] B11[1][5] B11[0][7] B11[1][7] //rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm8, ymm9, 0x20); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm8, ymm9, 0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm5 = _mm256_permute2f128_pd(ymm12, ymm9, 0x20); //B11[0][5] B11[1][5] B11[2][5] B11[3][5] ymm7 = _mm256_permute2f128_pd(ymm12, ymm9, 0x31); //B11[0][7] B11[1][7] B11[2][7] B11[3][7] ymm8 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0)); 
//load B11[0-3][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1)); //load B11[0-3][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //load B11[0-3][2] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //load B11[0-3][3] ymm12 = _mm256_loadu_pd((double const *)(b11 + cs_b * 4)); //load B11[0-3][4] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b * 5)); //load B11[0-3][5] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b * 6)); //load B11[0-3][6] ymm15 = _mm256_loadu_pd((double const *)(f_temp)); //load B11[0-3][7] //determine correct values to store ymm0 = _mm256_blend_pd(ymm0, ymm8, 0x0E); ymm1 = _mm256_blend_pd(ymm1, ymm9, 0x0E); ymm2 = _mm256_blend_pd(ymm2, ymm10, 0x0E); ymm3 = _mm256_blend_pd(ymm3, ymm11, 0x0E); ymm4 = _mm256_blend_pd(ymm4, ymm12, 0x0E); ymm5 = _mm256_blend_pd(ymm5, ymm13, 0x0E); ymm6 = _mm256_blend_pd(ymm6, ymm14, 0x0E); ymm7 = _mm256_blend_pd(ymm7, ymm15, 0x0E); } _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3); //store(B11[0-3][3]) _mm256_storeu_pd((double *)(b11 + cs_b * 4), ymm4); //store(B11[0-3][4]) _mm256_storeu_pd((double *)(b11 + cs_b * 5), ymm5); //store(B11[0-3][5]) _mm256_storeu_pd((double *)(b11 + cs_b * 6), ymm6); //store(B11[0-3][6]) _mm256_storeu_pd((double *)(f_temp), ymm7); //store(B11[0-3][7]) if((j+D_NR) == n) { for(iter = 0; iter < m_remainder; iter++) (b11 + cs_b * 7)[iter] = f_t[iter]; } } } if((n & 4)) //implementation for remainder columns(when 'n_remainder' is greater than 4) { for(i = 0;i+D_MR-1 < m; i += D_MR) //loop along 'M' direction { a10 = L +i; //pointer to block of A to be used for GEMM a11 = L + i + (i*cs_a); //pointer to block of A to be used for TRSM b01 = B + j*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j* cs_b; //pointer to block 
of B to be used for TRSM k_iter = i / D_MR; //number of times GEMM to be performed(in block of 4) ///GEMM for previously calculated values /// //load 4x4 block from b11 ymm0 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b*2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b*3)); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a*2)); //A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[0][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[0][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[0][3] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B01[0][2]*A10[0][0] B01[0][2]*A10[1][0] B01[0][2]*A10[2][0] B01[0][2]*A10[3][0]) ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B01[0][3]*A10[0][0] B01[0][3]*A10[1][0] B01[0][3]*A10[2][0] 
B01[0][3]*A10[3][0]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[1][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[1][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[1][3] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B01[1][2]*A10[0][1] B01[1][2]*A10[1][1] B01[1][2]*A10[2][1] B01[1][2]*A10[3][1]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B01[1][3]*A10[0][1] B01[1][3]*A10[1][1] B01[1][3]*A10[2][1] B01[1][3]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[2][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[2][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[2][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[2][3] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B01[2][1]*A10[1][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B01[2][2]*A10[2][2] B01[2][2]*A10[1][2] B01[2][2]*A10[2][2] B01[2][2]*A10[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B01[2][3]*A10[3][2] B01[2][3]*A10[1][2] B01[2][3]*A10[2][2] B01[2][3]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[3][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[3][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 
+ cs_b * 3)); //B01[3][3] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B01[3][1]*A10[0][3] B01[3][1]*A10[1][3] B01[3][1]*A10[2][3] B01[3][1]*A10[3][3]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B01[3][2]*A10[0][3] B01[3][2]*A10[1][3] B01[3][2]*A10[2][3] B01[3][2]*A10[3][3]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B01[3][3]*A10[0][3] B01[3][3]*A10[1][3] B01[3][3]*A10[2][3] B01[3][3]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm0 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] *alpha -= ymm4 ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B01[0-3][1] *alpha -= ymm5 ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B01[0-3][2] *alpha -= ymm6 ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B01[0-3][3] *alpha -= ymm7 ///implement TRSM/// //1st col ymm4 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] ymm5 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][0] ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[2][0] ymm7 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[3][0] //2nd col a11 += cs_a; ymm8 = _mm256_broadcast_sd((double const *)(a11 + 1)); //A11[1][1] ymm9 = _mm256_broadcast_sd((double const *)(a11 + 2)); //A11[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a11 + 3)); //A11[3][1] //3rd col a11 += cs_a; ymm11 = _mm256_broadcast_sd((double const *)(a11 + 2)); //A11[2][2] ymm12 = _mm256_broadcast_sd((double const *)(a11 + 3)); //A11[3][2] //4th col a11 += cs_a; ymm13 = _mm256_broadcast_sd((double const *)(a11 + 3)); //A11[3][3] //compute reciprocals of L(i,i) and broadcast in registers ymm4 = _mm256_unpacklo_pd(ymm4, ymm8); //A11[0][0] A11[0][0] A11[2][2] A11[2][2] 
ymm8 = _mm256_unpacklo_pd(ymm11, ymm13); //A11[1][1] A11[1][1] A11[3][3] A11[3][3] ymm14 = _mm256_broadcast_sd((double const *)&ones); ymm4 = _mm256_blend_pd(ymm4, ymm8, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm14 = _mm256_div_pd(ymm14, ymm4); //1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] ////unpacklow//// ymm8 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm13 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] //rearrange low elements ymm4 = _mm256_permute2f128_pd(ymm8,ymm13,0x20); //B11[0][0] B11[0][1] B11[0][2] B11[0][3] ymm11 = _mm256_permute2f128_pd(ymm8,ymm13,0x31);//B11[2][0] B11[2][1] B11[2][2] B11[2][3] ////unpackhigh//// ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] //rearrange high elements ymm8 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] B11[1][2] B11[1][3] ymm13 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] B11[3][1] B11[3][2] B11[3][3] //extract a00 ymm15 = _mm256_permute_pd(ymm14, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2] ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0] //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B ymm4 = _mm256_mul_pd(ymm4, ymm15); //B11[0][0-3] /= A11[0][0] //extract diag a11 from a ymm15 = _mm256_permute_pd(ymm14, 0x03); //1/A11[1][1] 1/A11[1][1] 1/A11[3][3] 1/A11[3][3] ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x00); //1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1] //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (3, 0) ymm8 = _mm256_fnmadd_pd(ymm5, ymm4, ymm8);//d = c - (a*b) //B11[1][0-3] -= A11[1][0]*B11[0][0-3] ymm11 = _mm256_fnmadd_pd(ymm6, ymm4, ymm11);//d = c - (a*b) //B11[2][0-3] -= A11[2][0]*B11[0][0-3] ymm13 = _mm256_fnmadd_pd(ymm7, ymm4, ymm13);//d = c - (a*b) 
//B11[3][0-3] -= A11[3][0]*B11[0][0-3] //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B ymm8 = _mm256_mul_pd(ymm8, ymm15); //B11[1][0-3] /= A11[1][1] //extract diag a22 from a ymm15 = _mm256_permute_pd(ymm14, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2] ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2] //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) ymm11 = _mm256_fnmadd_pd(ymm9, ymm8, ymm11);//d = c - (a*b) //B11[2][0-3] -= A11[2][1]*B11[1][0-3] ymm13 = _mm256_fnmadd_pd(ymm10, ymm8, ymm13);//d = c - (a*b) //B11[3][0-3] -= A11[3][1]*B11[1][0-3] //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B ymm11 = _mm256_mul_pd(ymm11, ymm15); //B11[2][0-3] /= A11[2][2] //extract diag a33 from a ymm15 = _mm256_permute_pd(ymm14, 0x0C); //1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3] ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3] //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) ymm13 = _mm256_fnmadd_pd(ymm12, ymm11, ymm13);//d = c - (a*b) //B11[3][0-3] -= A11[3][2]*B11[2][0-3] //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B ymm13 = _mm256_mul_pd(ymm13, ymm15); //B11[3][0-3] /= A11[3][3] //--> Transpose and store results of columns of B block <--// ////unpacklow//// ymm1 = _mm256_unpacklo_pd(ymm4, ymm8); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm3 = _mm256_unpacklo_pd(ymm11, ymm13); //B11[2][0] B11[3][0] B11[2][2] B11[3][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1,ymm3,0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1,ymm3,0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ////unpackhigh//// ymm14 = _mm256_unpackhi_pd(ymm4, ymm8); //B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm15 = _mm256_unpackhi_pd(ymm11, ymm13); //B11[2][1] B11[3][1] 
B11[2][3] B11[3][3] //rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm14,ymm15,0x20); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm14,ymm15,0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] _mm256_storeu_pd((double *)b11, ymm0); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + (cs_b)), ymm1); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b*2), ymm2); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b*3), ymm3); //store(B11[0-3][3]) } if(m_remainder) //implementation for remainder rows(when 'M' is not a multiple of D_MR) { a10 = L +i; //pointer to block of A to be used for GEMM a11 = L + i + (i*cs_a); //pointer to block of A to be used for TRSM b01 = B + j*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j* cs_b; //pointer to block of B to be used for TRSM k_iter = i / D_MR; //number of GEMM operations to be performed(in blocks of 4x4) dim_t iter; if((j+4) == n) { f_temp = f_t; for(iter = 0; iter < m_remainder; iter++) f_temp[iter] = (b11 + cs_b * 3)[iter]; } else f_temp = (b11 + cs_b * 3); ///GEMM for previously calculated values /// //load 4x4 block from b11 ymm0 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm3 = _mm256_loadu_pd((double const *)(f_temp)); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); for(k = 0; k < k_iter; k++) //looop for number of GEMM operations { ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] A10[1][2] 
A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[0][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[0][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[0][3] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B01[0][2]*A10[0][0] B01[0][2]*A10[1][0] B01[0][2]*A10[2][0] B01[0][2]*A10[3][0]) ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B01[0][3]*A10[0][0] B01[0][3]*A10[1][0] B01[0][3]*A10[2][0] B01[0][3]*A10[3][0]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[1][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[1][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[1][3] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B01[1][2]*A10[0][1] B01[1][2]*A10[1][1] B01[1][2]*A10[2][1] B01[1][2]*A10[3][1]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B01[1][3]*A10[0][1] B01[1][3]*A10[1][1] B01[1][3]*A10[2][1] B01[1][3]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[2][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[2][1] ymm14 = 
_mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[2][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[2][3] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B01[2][1]*A10[0][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B01[2][2]*A10[0][2] B01[2][2]*A10[1][2] B01[2][2]*A10[2][2] B01[2][2]*A10[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B01[2][3]*A10[0][2] B01[2][3]*A10[1][2] B01[2][3]*A10[2][2] B01[2][3]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[3][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[3][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[3][3] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B01[3][1]*A10[0][3] B01[3][1]*A10[1][3] B01[3][1]*A10[2][3] B01[3][1]*A10[3][3]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B01[3][2]*A10[0][3] B01[3][2]*A10[1][3] B01[3][2]*A10[2][3] B01[3][2]*A10[3][3]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B01[3][3]*A10[0][3] B01[3][3]*A10[1][3] B01[3][3]*A10[2][3] B01[3][3]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm0 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] *alpha -= ymm4 ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[0-3][1] *alpha -= ymm5 ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[0-3][2] *alpha -= ymm6 ymm3 = 
_mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[0-3][3] *alpha -= ymm7 if(3 == m_remainder) { ///implement TRSM/// //1st col ymm4 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] ymm5 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][0] ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[2][0] //2nd col a11 += cs_a; ymm8 = _mm256_broadcast_sd((double const *)(a11 + 1)); //A11[1][1] ymm9 = _mm256_broadcast_sd((double const *)(a11 + 2)); //A11[2][1] //3rd col a11 += cs_a; ymm11 = _mm256_broadcast_sd((double const *)(a11 + 2)); //A11[2][2] //4th col a11 += cs_a; ymm13 = _mm256_broadcast_sd((double const *)(&ones)); //A11[3][3] //compute reciprocals of L(i,i) and broadcast in registers ymm4 = _mm256_unpacklo_pd(ymm4, ymm8); //A11[0][0] A11[0][0] A11[1][1] A11[1][1] ymm8 = _mm256_unpacklo_pd(ymm11, ymm13); //A11[2][2] A11[2][2] A11[3][3] A11[3][3] ymm14 = _mm256_broadcast_sd((double const *)&ones); ymm4 = _mm256_blend_pd(ymm4, ymm8, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm14 = _mm256_div_pd(ymm14, ymm4); //1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] ////unpacklow//// ymm8 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm13 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] //rearrange low elements ymm4 = _mm256_permute2f128_pd(ymm8,ymm13,0x20); //B11[0][0] B11[0][1] B11[0][2] B11[0][3] ymm11 = _mm256_permute2f128_pd(ymm8,ymm13,0x31);//B11[2][0] B11[2][1] B11[2][2] B11[2][3] ////unpackhigh//// ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] //rearrange high elements ymm8 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] B11[1][2] B11[1][3] ymm13 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] B11[3][1] B11[3][2] B11[3][3] //extract a00 ymm15 = _mm256_permute_pd(ymm14, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2] ymm15 = 
_mm256_permute2f128_pd(ymm15, ymm15, 0x00);//1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0] //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B ymm4 = _mm256_mul_pd(ymm4, ymm15); //B11[0][0-3] /= A11[0][0] //extract diag a11 from a ymm15 = _mm256_permute_pd(ymm14, 0x03); //1/A11[1][1] 1/A11[1][1] 1/A11[3][3] 1/A11[3][3] ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x00); //1/A11[][] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1] //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (3, 0) ymm8 = _mm256_fnmadd_pd(ymm5, ymm4, ymm8);//d = c - (a*b) //B11[1][0-3] -= A11[1][0]* B11[0][0-3] ymm11 = _mm256_fnmadd_pd(ymm6, ymm4, ymm11);//d = c - (a*b) //B11[2][0-3] -= A11[2][0]* B11[0][0-3] //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B ymm8 = _mm256_mul_pd(ymm8, ymm15); //B11[1][0-3] /= A11[1][1] //extract diag a22 from a ymm15 = _mm256_permute_pd(ymm14, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2] ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2] //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) ymm11 = _mm256_fnmadd_pd(ymm9, ymm8, ymm11);//d = c - (a*b) //B11[2][0-3] -= A11[2][1]* B11[1][0-3] //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B ymm11 = _mm256_mul_pd(ymm11, ymm15); //B11[2][0-3] /= A11[2][2] ymm13 = _mm256_broadcast_sd((double const *)(&ones)); //--> Transpose and store results of columns of B block <--// ////unpacklow//// ymm1 = _mm256_unpacklo_pd(ymm4, ymm8); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm3 = _mm256_unpacklo_pd(ymm11, ymm13); //B11[2][0] B11[3][0] B11[2][2] B11[3][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1,ymm3,0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1,ymm3,0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ////unpackhigh//// ymm14 = _mm256_unpackhi_pd(ymm4, ymm8); 
//B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm15 = _mm256_unpackhi_pd(ymm11, ymm13); //B11[2][1] B11[3][1] B11[2][3] B11[3][3] //rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm14,ymm15,0x20); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm14,ymm15,0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] //load 4x4 block from b11 ymm4 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm5 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm6 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm7 = _mm256_loadu_pd((double const *)(f_temp)); //B11[0][3] B11[1][3] B11[2][2] B11[3][3] //determine correct values to store ymm0 = _mm256_blend_pd(ymm0, ymm4, 0x08); ymm1 = _mm256_blend_pd(ymm1, ymm5, 0x08); ymm2 = _mm256_blend_pd(ymm2, ymm6, 0x08); ymm3 = _mm256_blend_pd(ymm3, ymm7, 0x08); } else if( 2 == m_remainder ) { ///implement TRSM/// //1st col ymm4 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] ymm5 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][0] //2nd col a11 += cs_a; ymm8 = _mm256_broadcast_sd((double const *)(a11 + 1)); //A11[1][1] //compute reciprocals of L(i,i) and broadcast in registers ymm4 = _mm256_unpacklo_pd(ymm4, ymm8); //A11[0][0] A11[0][0] A11[1][1] A11[1][1] ymm14 = _mm256_broadcast_sd((double const *)&ones); ymm4 = _mm256_blend_pd(ymm4, ymm14, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm14 = _mm256_div_pd(ymm14, ymm4); //1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] ////unpacklow//// ymm8 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm13 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] //rearrange low elements ymm4 = _mm256_permute2f128_pd(ymm8,ymm13,0x20); //B11[0][0] B11[0][1] B11[0][2] B11[0][3] ymm11 = _mm256_permute2f128_pd(ymm8,ymm13,0x31);//B11[2][0] B11[2][1] B11[2][2] B11[2][3] ////unpackhigh//// ymm0 = 
_mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] //rearrange high elements ymm8 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] B11[1][2] B11[1][3] ymm13 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] B11[3][1] B11[3][2] B11[3][3] //extract a00 ymm15 = _mm256_permute_pd(ymm14, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2] ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x00);//1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0] //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B ymm4 = _mm256_mul_pd(ymm4, ymm15); //B11[0][0-3] /= A11[0][0] //extract diag a11 from a ymm15 = _mm256_permute_pd(ymm14, 0x03); //1/A11[1][1] 1/A11[1][1] 1/A11[3][3] 1/A11[3][3] ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x00); //1/A11[][] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1] //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (3, 0) ymm8 = _mm256_fnmadd_pd(ymm5, ymm4, ymm8);//d = c - (a*b) //B11[1][0-3] -= A11[1][0]* B11[0][0-3] //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B ymm8 = _mm256_mul_pd(ymm8, ymm15); //B11[1][0-3] /= A11[1][1] ymm11 = _mm256_broadcast_sd((double const *)(&ones)); ymm13 = _mm256_broadcast_sd((double const *)(&ones)); //--> Transpose and store results of columns of B block <--// ////unpacklow//// ymm1 = _mm256_unpacklo_pd(ymm4, ymm8); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm3 = _mm256_unpacklo_pd(ymm11, ymm13); //B11[2][0] B11[3][0] B11[2][2] B11[3][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1,ymm3,0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1,ymm3,0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ////unpackhigh//// ymm14 = _mm256_unpackhi_pd(ymm4, ymm8); //B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm15 = _mm256_unpackhi_pd(ymm11, ymm13); //B11[2][1] B11[3][1] B11[2][3] B11[3][3] 
//rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm14,ymm15,0x20); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm14,ymm15,0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] //load 4x4 block from b11 ymm4 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm5 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm6 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm7 = _mm256_loadu_pd((double const *)(f_temp)); //B11[0][3] B11[1][3] B11[2][2] B11[3][3] //determine correct values to store ymm0 = _mm256_permute2f128_pd(ymm0, ymm4,0x30); ymm1 = _mm256_permute2f128_pd(ymm1, ymm5,0x30); ymm2 = _mm256_permute2f128_pd(ymm2, ymm6,0x30); ymm3 = _mm256_permute2f128_pd(ymm3, ymm7,0x30); } else if(1 == m_remainder) { ///implement TRSM/// //1st col ymm4 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] ymm14 = _mm256_broadcast_sd((double const *)&ones); ymm14 = _mm256_div_pd(ymm14, ymm4); //1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] ////unpacklow//// ymm8 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm13 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] //rearrange low elements ymm4 = _mm256_permute2f128_pd(ymm8,ymm13,0x20); //B11[0][0] B11[0][1] B11[0][2] B11[0][3] ymm11 = _mm256_permute2f128_pd(ymm8,ymm13,0x31);//B11[2][0] B11[2][1] B11[2][2] B11[2][3] ////unpackhigh//// ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] //rearrange high elements ymm8 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] B11[1][2] B11[1][3] ymm13 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] B11[3][1] B11[3][2] B11[3][3] //extract a00 ymm15 = _mm256_permute_pd(ymm14, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2] ymm15 = 
_mm256_permute2f128_pd(ymm15, ymm15, 0x00);//1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0] //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B ymm4 = _mm256_mul_pd(ymm4, ymm15); //B11[0][0-3] /= A11[0][0] ymm8 = _mm256_broadcast_sd((double const *)(&ones)); ymm11 = _mm256_broadcast_sd((double const *)(&ones)); ymm13 = _mm256_broadcast_sd((double const *)(&ones)); //--> Transpose and store results of columns of B block <--// ////unpacklow//// ymm1 = _mm256_unpacklo_pd(ymm4, ymm8); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm3 = _mm256_unpacklo_pd(ymm11, ymm13); //B11[2][0] B11[3][0] B11[2][2] B11[3][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1,ymm3,0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1,ymm3,0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ////unpackhigh//// ymm14 = _mm256_unpackhi_pd(ymm4, ymm8); //B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm15 = _mm256_unpackhi_pd(ymm11, ymm13); //B11[2][1] B11[3][1] B11[2][3] B11[3][3] //rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm14,ymm15,0x20); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm14,ymm15,0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] //load 4x4 block from b11 ymm4 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm5 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm6 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm7 = _mm256_loadu_pd((double const *)(f_temp)); //B11[0][3] B11[1][3] B11[2][2] B11[3][3] //determine correct values to store ymm0 = _mm256_blend_pd(ymm0, ymm4, 0x0E); ymm1 = _mm256_blend_pd(ymm1, ymm5, 0x0E); ymm2 = _mm256_blend_pd(ymm2, ymm6, 0x0E); ymm3 = _mm256_blend_pd(ymm3, ymm7, 0x0E); } _mm256_storeu_pd((double *)b11, ymm0); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + (cs_b)), ymm1); //store(B11[0-3][1]) 
_mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(f_temp), ymm3); //store(B11[0-3][3]) if((j+4) == n) { for(iter = 0; iter < m_remainder; iter++) (b11 + cs_b * 3)[iter] = f_temp[iter]; } } n_remainder -= 4; j += 4; } if(n_remainder) //implementation fo remaining columns(when 'N' is not a multiple of D_NR) { for(i = 0;i+D_MR-1 < m; i += D_MR) //loop along 'M' direction { a10 = L +i; //pointer to block of A to be used for GEMM a11 = L + i + (i*cs_a); //pointer to block of A to be used for TRSM b01 = B + j*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j* cs_b; //pointer to block of B to be used for TRSM k_iter = i / D_MR; //number of GEMM operations to be performed(in blocks of 4x4) ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM for previously calculated values /// //load 4x4 block from b11 if(3 == n_remainder) { ymm0 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] for(k = 0; k < k_iter; k++) { ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[0][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[0][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[0][2] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] 
B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B01[0][2]*A10[0][0] B01[0][2]*A10[1][0] B01[0][2]*A10[2][0] B01[0][2]*A10[3][0]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[1][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[1][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[1][2] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B01[1][2]*A10[0][1] B01[1][2]*A10[1][1] B01[1][2]*A10[2][1] B01[1][2]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[2][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[2][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[2][2] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B01[2][1]*A10[0][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B01[2][2]*A10[0][2] B01[2][2]*A10[1][2] B01[2][2]*A10[2][2] B01[2][2]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[3][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[3][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[3][2] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) ymm5 = 
_mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B01[3][1]*A10[0][3] B01[3][1]*A10[1][3] B01[3][1]*A10[2][3] B01[3][1]*A10[3][3]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B01[3][2]*A10[0][3] B01[3][2]*A10[1][3] B01[3][2]*A10[2][3] B01[3][2]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha Value ymm0 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] *alpha -= ymm4 ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[0-3][1] *alpha -= ymm5 ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[0-3][2] *alpha -= ymm6 ymm3 = _mm256_broadcast_sd((double const *)(&ones)); //B11[0-3][3] *alpha -= ymm7 } else if(2 == n_remainder) { ymm0 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] for(k = 0; k < k_iter; k++) { ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[0][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[0][1] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[1][0] ymm13 = 
_mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[1][1] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[2][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[2][1] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B01[2][1]*A10[0][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[3][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[3][1] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B01[3][1]*A10[0][3] B01[3][1]*A10[1][3] B01[3][1]*A10[2][3] B01[3][1]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha Value ymm0 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] *alpha -= ymm4 ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[0-3][1] *alpha -= ymm5 ymm2 = _mm256_broadcast_sd((double const *)(&ones)); //B11[0-3][2] *alpha -= ymm6 ymm3 = _mm256_broadcast_sd((double const *)(&ones)); //B11[0-3][3] *alpha -= ymm7 } else if(1 == n_remainder) { ymm0 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] for(k = 0; k < k_iter; k++) { ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] 
A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[0][0] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[1][0] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[2][0] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[3][0] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha Value ymm0 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] *alpha -= ymm4 ymm1 = _mm256_broadcast_sd((double const *)(&ones)); //B11[0-3][1] *alpha -= ymm5 ymm2 = _mm256_broadcast_sd((double const *)(&ones)); //B11[0-3][2] *alpha -= ymm6 ymm3 = _mm256_broadcast_sd((double const *)(&ones)); //B11[0-3][3] *alpha -= ymm7 } ///implement TRSM/// //1st col ymm4 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] ymm5 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][0] ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); 
//A11[2][0] ymm7 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[3][0] //2nd col a11 += cs_a; ymm8 = _mm256_broadcast_sd((double const *)(a11 + 1)); //A11[1][1] ymm9 = _mm256_broadcast_sd((double const *)(a11 + 2)); //A11[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a11 + 3)); //A11[3][1] //3rd col a11 += cs_a; ymm11 = _mm256_broadcast_sd((double const *)(a11 + 2)); //A11[2][2] ymm12 = _mm256_broadcast_sd((double const *)(a11 + 3)); //A11[3][2] //4th col a11 += cs_a; ymm13 = _mm256_broadcast_sd((double const *)(a11 + 3)); //A11[3][3] //compute reciprocals of L(i,i) and broadcast in registers ymm4 = _mm256_unpacklo_pd(ymm4, ymm8); //A11[0][0] A11[0][0] A11[1][1] A11[1][1] ymm8 = _mm256_unpacklo_pd(ymm11, ymm13); //A11[2][2] A11[2][2] A11[3][3] A11[3][3] ymm14 = _mm256_broadcast_sd((double const *)&ones); ymm4 = _mm256_blend_pd(ymm4, ymm8, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm14 = _mm256_div_pd(ymm14, ymm4); //1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] ////unpacklow//// ymm8 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm13 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] //rearrange low elements ymm4 = _mm256_permute2f128_pd(ymm8,ymm13,0x20); //B11[0][0] B11[0][1] B11[0][2] B11[0][3] ymm11 = _mm256_permute2f128_pd(ymm8,ymm13,0x31);//B11[2][0] B11[2][1] B11[2][2] B11[2][3] ////unpackhigh//// ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] //rearrange high elements ymm8 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] B11[1][2] B11[1][3] ymm13 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] B11[3][1] B11[3][2] B11[3][3] //extract a00 ymm15 = _mm256_permute_pd(ymm14, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2] ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0] //(Row0): Perform mul 
operation of reciprocal of L(0,0) element with 1st row elements of B ymm4 = _mm256_mul_pd(ymm4, ymm15); //B11[0][0-3] /= A11[0][0] //extract diag a11 from a ymm15 = _mm256_permute_pd(ymm14, 0x03); //1/A11[1][1] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2] ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x00); //1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1] //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (3, 0) ymm8 = _mm256_fnmadd_pd(ymm5, ymm4, ymm8);//d = c - (a*b) //B11[1][0-3] -= A11[1][0] * B11[0][0-3] ymm11 = _mm256_fnmadd_pd(ymm6, ymm4, ymm11);//d = c - (a*b) //B11[2][0-3] -= A11[2][0] * B11[0][0-3] ymm13 = _mm256_fnmadd_pd(ymm7, ymm4, ymm13);//d = c - (a*b) //B11[3][0-3] -= A11[3][0] * B11[0][0-3] //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B ymm8 = _mm256_mul_pd(ymm8, ymm15); //B11[1][0-3] /= A11[1][1] //extract diag a22 from a ymm15 = _mm256_permute_pd(ymm14, 0x00); //1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2] ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2] //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) ymm11 = _mm256_fnmadd_pd(ymm9, ymm8, ymm11);//d = c - (a*b) //B11[2][0-3] -= A11[2][1] * B11[1][0-3] ymm13 = _mm256_fnmadd_pd(ymm10, ymm8, ymm13);//d = c - (a*b) //B11[3][0-3] -= A11[3][1] * B11[1][0-3] //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B ymm11 = _mm256_mul_pd(ymm11, ymm15); //B11[2][0-3] /= A11[2][2] //extract diag a33 from a ymm15 = _mm256_permute_pd(ymm14, 0x0C); //1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3] ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3] //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) ymm13 = _mm256_fnmadd_pd(ymm12, ymm11, ymm13);//d = c - (a*b) //B11[3][0-3] -= A11[3][2] * B11[2][0-3] //Perform mul operation of reciprocal of L(3, 3) element with 4rth 
row elements of B ymm13 = _mm256_mul_pd(ymm13, ymm15); //B11[3][0-3] /= A11[3][3] //--> Transpose and store results of columns of B block <--// ////unpacklow//// ymm1 = _mm256_unpacklo_pd(ymm4, ymm8); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm3 = _mm256_unpacklo_pd(ymm11, ymm13); //B11[2][0] B11[3][0] B11[2][2] B11[3][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1,ymm3,0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1,ymm3,0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ////unpackhigh//// ymm14 = _mm256_unpackhi_pd(ymm4, ymm8); //B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm15 = _mm256_unpackhi_pd(ymm11, ymm13); //B11[2][1] B11[3][1] B11[2][3] B11[3][3] //rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm14,ymm15,0x20); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm14,ymm15,0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] if(3 == n_remainder) { _mm256_storeu_pd((double *)b11, ymm0); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + (cs_b)), ymm1); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2); //store(B11[0-3][2]) } else if(2 == n_remainder) { _mm256_storeu_pd((double *)b11, ymm0); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + (cs_b)), ymm1); //store(B11[0-3][1]) } else if(1 == n_remainder) { _mm256_storeu_pd((double *)b11, ymm0); //store(B11[0-3][0]) } } if(m_remainder) //implementation for remainder rows(when 'M' is not a multiple of D_MR) { a10 = L +i; //pointer to block of A to be used for GEMM a11 = L + i + (i*cs_a); //pointer to block of A to be used for TRSM b01 = B + j*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j* cs_b; //pointer to block of B to be used for TRSM k_iter = i / D_MR; //number of times GEMM operations to be performed dim_t iter; if((j+n_remainder) == n) { f_temp = f_t; for(iter = 0; iter < m_remainder; iter++) f_temp[iter] = (b11 + cs_b * (n_remainder -1))[iter]; } else f_temp = (b11 + cs_b * 
(n_remainder -1)); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM for previously calculated values /// //load 4x4 block from b11 if(3 == n_remainder) { ymm0 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(f_temp)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[0][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B10[0][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B10[0][2] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B01[0][2]*A10[0][0] B01[0][2]*A10[1][0] B01[0][2]*A10[2][0] B01[0][2]*A10[3][0]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[1][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B10[1][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B10[1][2] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] 
B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B01[1][2]*A10[0][1] B01[1][2]*A10[1][1] B01[1][2]*A10[2][1] B01[1][2]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[2][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B10[2][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B10[2][2] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B01[2][1]*A10[0][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B01[2][2]*A10[0][2] B01[2][2]*A10[1][2] B01[2][2]*A10[2][2] B01[2][2]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[3][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B10[3][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B10[3][2] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B01[3][1]*A10[0][3] B01[3][1]*A10[1][3] B01[3][1]*A10[2][3] B01[3][1]*A10[3][3]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B01[3][2]*A10[0][3] B01[3][2]*A10[1][3] B01[3][2]*A10[2][3] B01[3][2]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to hold alpha value ymm8 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] * alpha -= ymm4 ymm9 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); 
//B11[0-3][1] * alpha -= ymm5 ymm10 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[0-3][2] * alpha -= ymm6 ///implement TRSM/// //determine correct values to store if(3 == m_remainder) { ymm0 = _mm256_blend_pd(ymm8, ymm0, 0x08); ymm1 = _mm256_blend_pd(ymm9, ymm1, 0x08); ymm2 = _mm256_blend_pd(ymm10, ymm2, 0x08); } else if(2 == m_remainder) { ymm0 = _mm256_permute2f128_pd(ymm8, ymm0, 0x30); ymm1 = _mm256_permute2f128_pd(ymm9, ymm1, 0x30); ymm2 = _mm256_permute2f128_pd(ymm10, ymm2, 0x30); } else if(1 == m_remainder) { ymm0 = _mm256_blend_pd(ymm8, ymm0, 0x0E); ymm1 = _mm256_blend_pd(ymm9, ymm1, 0x0E); ymm2 = _mm256_blend_pd(ymm10, ymm2, 0x0E); } _mm256_storeu_pd((double *)b11, ymm0); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + (cs_b)), ymm1); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(f_temp), ymm2); //store(B11[0-3][2]) } if(2 == n_remainder) { ymm0 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(f_temp)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[0][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B10[0][1] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm12 = 
_mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[1][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B10[1][1] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[2][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B10[2][1] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B01[2][1]*A10[0][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[3][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B10[3][1] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B01[3][1]*A10[0][3] B01[3][1]*A10[1][3] B01[3][1]*A10[2][3] B01[3][1]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to hold alpha value ymm8 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] * alpha -= ymm4 ymm9 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[0-3][1] * alpha -= ymm5 ///implement TRSM/// //determine correct values to store if(3 == m_remainder) { ymm0 = _mm256_blend_pd(ymm8, ymm0, 0x08); ymm1 = _mm256_blend_pd(ymm9, ymm1, 0x08); } else if(2 == m_remainder) { ymm0 = _mm256_permute2f128_pd(ymm8, ymm0, 0x30); ymm1 = 
_mm256_permute2f128_pd(ymm9, ymm1, 0x30); } else if(1 == m_remainder) { ymm0 = _mm256_blend_pd(ymm8, ymm0, 0x0E); ymm1 = _mm256_blend_pd(ymm9, ymm1, 0x0E); } _mm256_storeu_pd((double *)b11, ymm0); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(f_temp), ymm1); //store(B11[0-3][1]) } if(n_remainder == 1) { ymm0 = _mm256_loadu_pd((double const *)(f_temp)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[0][0] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[1][0] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[2][0] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[3][0] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } 
ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to hold alpha value ymm8 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] * alpha -= ymm4 ///implement TRSM/// //determine correct values to store if(3 == m_remainder) { ymm0 = _mm256_blend_pd(ymm8, ymm0, 0x08); } else if(2 == m_remainder) { ymm0 = _mm256_permute2f128_pd(ymm8, ymm0, 0x30); } else if(1 == m_remainder) { ymm0 = _mm256_blend_pd(ymm8, ymm0, 0x0E); } _mm256_storeu_pd((double *)(f_temp), ymm0); //store(B11[0-3][0]) } if((j+n_remainder) == n) { for(iter = 0; iter < m_remainder; iter++) (b11 + cs_b * (n_remainder-1))[iter] = f_temp[iter]; } ///scalar code for trsm without alpha/// dtrsm_small_AlXB(a11, b11, m_remainder, n_remainder, cs_a, cs_b); } } return BLIS_SUCCESS; } /* TRSM for the case AX = alpha * B, Double precision * A is lower-triangular, no-transpose, unit diagonal * dimensions A: mxm X: mxn B: mxn b01---> * ***************** ** * * * * * * * * * * * * * * *b01* * * * * * * * * * * a10 ****** b11 ***************** | * * * | * * * * * | * * * | * * * * * | *a10*a11* | *b11* * * * v * * * v * * * * * *********** ***************** * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * **************** ***************** a11---> */ static err_t bli_dtrsm_small_AlXB_unitDiag( side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ) { dim_t D_MR = 4; //size of block along 'M' dimpension dim_t D_NR = 8; //size of block along 'N' dimension dim_t m = bli_obj_length(b); // number of rows of matrix B dim_t n = bli_obj_width(b); // number of columns of matrix B #ifdef BLIS_ENABLE_SMALL_MATRIX_ROME if((m>D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_ROW_PANEL_M && n>D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME) || (m> D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME && n>D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_N) || (m>D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_ROME_COLUMN_PANEL_M && n D_BLIS_SMALL_MATRIX_THRES_TRSM_ALXB_NAPLES) { return 
BLIS_NOT_YET_IMPLEMENTED; } #endif dim_t m_remainder = m & (3); //number of remainder rows dim_t n_remainder = n & (7); //number of remainder columns dim_t cs_a = bli_obj_col_stride(a); // column stride of A dim_t cs_b = bli_obj_col_stride(b); // column stride of B dim_t i, j, k; //loop variables dim_t k_iter; //number of times GEMM to be performed double AlphaVal = *(double *)AlphaObj->buffer; //value of alpha double *L = a->buffer; //pointer to matrix A double *B = b->buffer; //pointer to matrix B double *a10, *a11, *b01, *b11; //pointers that point to blocks for GEMM and TRSM double *ptr_b01_dup; double f_t[4] __attribute__((aligned(64)));//buffer to store corner column when m_remainder !=0 double* f_temp; double ones = 1.0; //scratch registers __m256d ymm0, ymm1, ymm2, ymm3; __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m256d ymm12, ymm13, ymm14, ymm15; __m256d ymm16; for(j = 0; j+D_NR-1 < n; j += D_NR) //loop along 'N' dimension { for(i = 0;i+D_MR-1 < m; i += D_MR) //loop along 'M' dimension { a10 = L +i; //pointer to block of A to be used for GEMM a11 = L + i + (i*cs_a); //pointer to block of A to be used for TRSM b01 = B + j*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j* cs_b; //pointer to block of B to be used for TRSM k_iter = i / D_MR; //number of times GEMM to be performed(in blocks of 4x4) ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); ///GEMM code begins/// for(k = 0; k< k_iter; k++) //loop for number of GEMM operations { ptr_b01_dup = b01; ymm16 = _mm256_loadu_pd((double const *)(a10));//A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[0][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[0][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 
2)); //B01[0][2] ymm7 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[0][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[0][4] ymm1 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[0][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[0][6] ymm3 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 7)); //B01[0][7] b01 += 1; //mobe to next row of B ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm9 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm10 += (B01[0][2]*A10[0][0] B01[0][2]*A10[1][0] B01[0][2]*A10[2][0] B01[0][2]*A10[3][0]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm11 += (B01[0][3]*A10[0][0] B01[0][3]*A10[1][0] B01[0][3]*A10[2][0] B01[0][3]*A10[3][0]) ymm12 = _mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm12 += (B01[0][4]*A10[0][0] B01[0][4]*A10[1][0] B01[0][4]*A10[2][0] B01[0][4]*A10[3][0]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, ymm13); //ymm13 += (B01[0][5]*A10[0][0] B01[0][5]*A10[1][0] B01[0][5]*A10[2][0] B01[0][5]*A10[3][0]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm14 += (B01[0][6]*A10[0][0] B01[0][6]*A10[1][0] B01[0][6]*A10[2][0] B01[0][6]*A10[3][0]) ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm15 += (B01[0][7]*A10[0][0] B01[0][7]*A10[1][0] B01[0][7]*A10[2][0] B01[0][7]*A10[3][0]) ymm16 = _mm256_loadu_pd((double const *)(a10 + cs_a));//A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[1][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[1][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[1][2] ymm7 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[1][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[1][4] ymm1 = 
_mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[1][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[1][6] ymm3 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 7)); //B01[1][7] b01 += 1; //mobe to next row of B ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm9 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm10 += (B01[1][2]*A10[0][1] B01[1][2]*A10[1][1] B01[1][2]*A10[2][1] B01[1][2]*A10[3][1]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm11 += (B01[1][3]*A10[0][1] B01[1][3]*A10[1][1] B01[1][3]*A10[2][1] B01[1][3]*A10[3][1]) ymm12 = _mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm12 += (B01[1][4]*A10[0][1] B01[1][4]*A10[1][1] B01[1][4]*A10[2][1] B01[1][4]*A10[3][1]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, ymm13); //ymm13 += (B01[1][5]*A10[0][1] B01[1][5]*A10[1][1] B01[1][5]*A10[2][1] B01[1][5]*A10[3][1]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm14 += (B01[1][6]*A10[0][1] B01[1][6]*A10[1][1] B01[1][6]*A10[2][1] B01[1][6]*A10[3][1]) ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm15 += (B01[1][7]*A10[0][1] B01[1][7]*A10[1][1] B01[1][7]*A10[2][1] B01[1][7]*A10[3][1]) ymm16 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2));//A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[2][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[2][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[2][2] ymm7 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[2][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[2][4] ymm1 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[2][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[2][6] ymm3 = _mm256_broadcast_sd((double 
const *)(b01 + cs_b * 7)); //B01[2][7] b01 += 1; //mobe to next row of B ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm9 += (B01[2][1]*A10[0][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm10 += (B01[2][2]*A10[0][2] B01[2][2]*A10[1][2] B01[2][2]*A10[2][2] B01[2][2]*A10[3][2]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm11 += (B01[2][3]*A10[0][2] B01[2][3]*A10[1][2] B01[2][3]*A10[2][2] B01[2][3]*A10[3][2]) ymm12 = _mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm12 += (B01[2][4]*A10[0][2] B01[2][4]*A10[1][2] B01[2][4]*A10[2][2] B01[2][4]*A10[3][2]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, ymm13); //ymm13 += (B01[2][5]*A10[0][2] B01[2][5]*A10[1][2] B01[2][5]*A10[2][2] B01[2][5]*A10[3][2]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm14 += (B01[2][6]*A10[0][2] B01[2][6]*A10[1][2] B01[2][6]*A10[2][2] B01[2][6]*A10[3][2]) ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm15 += (B01[2][7]*A10[0][2] B01[2][7]*A10[1][2] B01[2][7]*A10[2][2] B01[2][7]*A10[3][2]) ymm16 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3));//A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[3][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[3][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[3][2] ymm7 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[3][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[3][4] ymm1 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[3][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[3][6] ymm3 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 7)); //B01[3][7] b01 += 1; //mobe to next row of B ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[3][0]*A10[0][3] B01[3][0]*A10[3][0] 
B01[3][0]*A10[2][3] B01[3][0]*A10[3][0]) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm9 += (B01[3][1]*A10[0][3] B01[3][1]*A10[3][0] B01[3][1]*A10[2][3] B01[3][1]*A10[3][0]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm10 += (B01[3][2]*A10[0][3] B01[3][2]*A10[3][0] B01[3][2]*A10[2][3] B01[3][2]*A10[3][0]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm11 += (B01[3][3]*A10[0][3] B01[3][3]*A10[3][0] B01[3][3]*A10[2][3] B01[3][3]*A10[3][0]) ymm12 = _mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm12 += (B01[3][4]*A10[0][3] B01[3][4]*A10[3][0] B01[3][4]*A10[2][3] B01[3][4]*A10[3][3]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, ymm13); //ymm13 += (B01[3][5]*A10[0][3] B01[3][5]*A10[3][0] B01[3][5]*A10[2][3] B01[3][5]*A10[3][3]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm14 += (B01[3][6]*A10[0][3] B01[3][6]*A10[3][0] B01[3][6]*A10[2][3] B01[3][6]*A10[3][3]) ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm15 += (B01[3][7]*A10[0][3] B01[3][7]*A10[3][0] B01[3][7]*A10[2][3] B01[3][7]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to calculate next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to calculate next block of B for GEMM } ymm16 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to hold alpha ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b *0)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b *1)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b *2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b *3)); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm4 = _mm256_loadu_pd((double const *)(b11 + cs_b *4)); //B11[0][4] B11[1][4] B11[2][4] B11[3][4] ymm5 = _mm256_loadu_pd((double const *)(b11 + cs_b *5)); //B11[0][5] B11[1][5] B11[2][5] B11[3][5] ymm6 = _mm256_loadu_pd((double const *)(b11 + cs_b *6)); //B11[0][6] B11[1][6] B11[2][6] B11[3][6] ymm7 = _mm256_loadu_pd((double const *)(b11 + cs_b *7)); 
//B11[0][7] B11[1][7] B11[2][7] B11[3][7] ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); //B11[0-3][0] * alpha -= B01[0-3][0] ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9); //B11[0-3][1] * alpha -= B01[0-3][1] ymm2 = _mm256_fmsub_pd(ymm2, ymm16, ymm10); //B11[0-3][2] * alpha -= B01[0-3][2] ymm3 = _mm256_fmsub_pd(ymm3, ymm16, ymm11); //B11[0-3][3] * alpha -= B01[0-3][3] ymm4 = _mm256_fmsub_pd(ymm4, ymm16, ymm12); //B11[0-3][4] * alpha -= B01[0-3][4] ymm5 = _mm256_fmsub_pd(ymm5, ymm16, ymm13); //B11[0-3][5] * alpha -= B01[0-3][5] ymm6 = _mm256_fmsub_pd(ymm6, ymm16, ymm14); //B11[0-3][6] * alpha -= B01[0-3][6] ymm7 = _mm256_fmsub_pd(ymm7, ymm16, ymm15); //B11[0-3][7] * alpha -= B01[0-3][7] ///implement TRSM/// ///transpose of B11// ///unpacklow/// ymm9 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm11 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] ymm13 = _mm256_unpacklo_pd(ymm4, ymm5); //B11[0][4] B11[0][5] B11[2][4] B11[2][5] ymm15 = _mm256_unpacklo_pd(ymm6, ymm7); //B11[0][6] B11[0][7] B11[2][6] B11[2][7] //rearrange low elements ymm8 = _mm256_permute2f128_pd(ymm9,ymm11,0x20); //B11[0][0] B11[0][1] B11[0][2] B11[0][3] ymm10 = _mm256_permute2f128_pd(ymm9,ymm11,0x31); //B11[2][0] B11[2][1] B11[2][2] B11[2][3] ymm12 = _mm256_permute2f128_pd(ymm13,ymm15,0x20); //B11[4][0] B11[4][1] B11[4][2] B11[4][3] ymm14 = _mm256_permute2f128_pd(ymm13,ymm15,0x31); //B11[6][0] B11[6][1] B11[6][2] B11[6][3] ////unpackhigh//// ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] ymm4 = _mm256_unpackhi_pd(ymm4, ymm5); //B11[1][4] B11[1][5] B11[3][4] B11[3][5] ymm5 = _mm256_unpackhi_pd(ymm6, ymm7); //B11[1][6] B11[1][7] B11[3][6] B11[3][7] //rearrange high elements ymm9 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] B11[1][2] B11[1][3] ymm11 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] B11[3][1] 
B11[3][2] B11[3][3] ymm13 = _mm256_permute2f128_pd(ymm4,ymm5,0x20); //B11[5][0] B11[5][1] B11[5][2] B11[5][3] ymm15 = _mm256_permute2f128_pd(ymm4,ymm5,0x31); //B11[7][0] B11[7][1] B11[7][2] B11[7][3] ymm2 = _mm256_broadcast_sd((double const *)(a11 +1)); //A11[1][0] ymm3 = _mm256_broadcast_sd((double const *)(a11 +2)); //A11[2][0] ymm4 = _mm256_broadcast_sd((double const *)(a11 +3)); //A11[3][0] a11 += cs_a; //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm2, ymm8, ymm9); //B11[1][0-3] -= A11[1][0] * B11[0-3][0] ymm10 = _mm256_fnmadd_pd(ymm3, ymm8, ymm10); //B11[2][0-3] -= A11[2][0] * B11[0-3][0] ymm11 = _mm256_fnmadd_pd(ymm4, ymm8, ymm11); //B11[3][0-3] -= A11[3][0] * B11[0-3][0] ymm13 = _mm256_fnmadd_pd(ymm2, ymm12, ymm13); //B11[5][0-3] -= A11[1][0] * B11[0-3][4] ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14); //B11[6][0-3] -= A11[2][0] * B11[0-3][4] ymm15 = _mm256_fnmadd_pd(ymm4, ymm12, ymm15); //B11[7][0-3] -= A11[3][0] * B11[0-3][4] ymm3 = _mm256_broadcast_sd((double const *)(a11 +2)); //A11[2][1] ymm4 = _mm256_broadcast_sd((double const *)(a11 +3)); //A11[3][1] a11 += cs_a; //(ROw2): FMA operations ymm10 = _mm256_fnmadd_pd(ymm3, ymm9, ymm10); //B11[2][0-3] -= A11[2][1] * B11[0-3][1] ymm11 = _mm256_fnmadd_pd(ymm4, ymm9, ymm11); //B11[3][0-3] -= A11[3][1] * B11[0-3][1] ymm14 = _mm256_fnmadd_pd(ymm3, ymm13, ymm14); //B11[6][0-3] -= A11[2][1] * B11[0-3][5] ymm15 = _mm256_fnmadd_pd(ymm4, ymm13, ymm15); //B11[7][0-3] -= A11[3][1] * B11[0-3][5] ymm4 = _mm256_broadcast_sd((double const *)(a11 +3)); //A11[3][2] a11 += cs_a; //(ROw1): FMA operations ymm11 = _mm256_fnmadd_pd(ymm4, ymm10, ymm11); //B11[3][0-3] -= A11[3][2] * B11[0-3][2] ymm15 = _mm256_fnmadd_pd(ymm4, ymm14, ymm15); //B11[7][0-3] -= A11[3][2] * B11[0-3][6] //unpacklow// ymm1 = _mm256_unpacklo_pd(ymm8, ymm9); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm3 = _mm256_unpacklo_pd(ymm10, ymm11); //B11[2][0] B11[3][0] B11[2][2] B11[3][2] ymm5 = _mm256_unpacklo_pd(ymm12, ymm13); //B11[4][0] B11[5][0] B11[4][2] 
B11[5][2] ymm7 = _mm256_unpacklo_pd(ymm14, ymm15); //B11[6][0] B11[7][0] B11[6][2] B11[7][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1, ymm3, 0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1, ymm3, 0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm4 = _mm256_permute2f128_pd(ymm5, ymm7, 0x20); //B11[4][0] B11[5][0] B11[6][0] B11[7][0] ymm6 = _mm256_permute2f128_pd(ymm5, ymm7, 0x31); //B11[4][2] B11[5][2] B11[6][2] B11[7][2] ///unpack high/// ymm8 = _mm256_unpackhi_pd(ymm8, ymm9); //B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm9 = _mm256_unpackhi_pd(ymm10, ymm11); //B11[2][1] B11[3][1] B11[2][3] B11[3][3] ymm12 = _mm256_unpackhi_pd(ymm12, ymm13); //B11[4][1] B11[5][1] B11[4][3] B11[5][3] ymm13 = _mm256_unpackhi_pd(ymm14, ymm15); //B11[6][1] B11[7][1] B11[6][3] B11[7][3] //rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm8, ymm9, 0x20); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm8, ymm9, 0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm5 = _mm256_permute2f128_pd(ymm12, ymm13, 0x20); //B11[4][1] B11[5][1] B11[6][1] B11[7][1] ymm7 = _mm256_permute2f128_pd(ymm12, ymm13, 0x31); //B11[4][3] B11[5][3] B11[6][3] B11[7][3] _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0); //store B11[0][0-3] _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1); //store B11[1][0-3] _mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2); //store B11[2][0-3] _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3); //store B11[3][0-3] _mm256_storeu_pd((double *)(b11 + cs_b * 4), ymm4); //store B11[4][0-3] _mm256_storeu_pd((double *)(b11 + cs_b * 5), ymm5); //store B11[5][0-3] _mm256_storeu_pd((double *)(b11 + cs_b * 6), ymm6); //store B11[6][0-3] _mm256_storeu_pd((double *)(b11 + cs_b * 7), ymm7); //store B11[7][0-3] } if(m_remainder) //implementation for reamainder rows(when 'M' is not a multiple of D_MR) { a10 = L +i; //pointer to block of A to be used for GEMM a11 = L + i + (i*cs_a); //pointer to block of A 
to be used for TRSM b01 = B + j*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j* cs_b; //pointer to block of B to be used for TRSM k_iter = i / D_MR; //number of times GEMM operation to be done(in blocks of 4x4) dim_t iter; if((j+D_NR) == n) { f_temp = f_t; for(iter = 0; iter < m_remainder; iter++) f_temp[iter] = (b11 + cs_b * 7)[iter]; } else f_temp = (b11 + cs_b * 7); ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); ///GEMM code Begins/// for(k = 0; k< k_iter; k++) //loop for number of GEMM operations { ptr_b01_dup = b01; ymm16 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[0][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[0][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[0][2] ymm7 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[0][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[0][4] ymm1 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[0][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[0][6] ymm3 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 7)); //B01[0][7] b01 += 1; //move to next row of B ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0] ) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm9 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm10 += (B01[0][2]*A10[0][0] B01[0][2]*A10[1][0] B01[0][2]*A10[2][0] B01[0][2]*A10[3][0]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm11 += (B01[0][3]*A10[0][0] B01[0][3]*A10[1][0] B01[0][3]*A10[2][0] B01[0][3]*A10[3][0]) ymm12 = 
_mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm12 += (B01[0][4]*A10[0][0] B01[0][4]*A10[1][0] B01[0][4]*A10[2][0] B01[0][4]*A10[3][0]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, ymm13); //ymm13 += (B01[0][5]*A10[0][0] B01[0][5]*A10[1][0] B01[0][5]*A10[2][0] B01[0][5]*A10[3][0]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm14 += (B01[0][6]*A10[0][0] B01[0][6]*A10[1][0] B01[0][6]*A10[2][0] B01[0][6]*A10[3][0]) ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm16 += (B01[0][7]*A10[0][0] B01[0][7]*A10[1][0] B01[0][7]*A10[2][0] B01[0][7]*A10[3][0]) ymm16 = _mm256_loadu_pd((double const *)(a10 + cs_a * 1)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[1][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[1][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[1][2] ymm7 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[1][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[1][4] ymm1 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[1][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[1][6] ymm3 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 7)); //B01[1][7] b01 += 1; //move to next row of B01 ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm9 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm10 += (B01[1][2]*A10[0][1] B01[1][2]*A10[1][1] B01[1][2]*A10[2][1] B01[1][2]*A10[3][1]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm11 += (B01[1][3]*A10[0][1] B01[1][3]*A10[1][1] B01[1][3]*A10[2][1] B01[1][3]*A10[3][1]) ymm12 = _mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm12 += (B01[1][4]*A10[0][1] B01[1][4]*A10[1][1] B01[1][4]*A10[2][1] B01[1][4]*A10[3][1]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, 
ymm13); //ymm13 += (B01[1][5]*A10[0][1] B01[1][5]*A10[1][1] B01[1][5]*A10[2][1] B01[1][5]*A10[3][1]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm14 += (B01[1][6]*A10[0][1] B01[1][6]*A10[1][1] B01[1][6]*A10[2][1] B01[1][6]*A10[3][1]) ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm15 += (B01[1][7]*A10[0][1] B01[1][7]*A10[1][1] B01[1][7]*A10[2][1] B01[1][7]*A10[3][1]) ymm16 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] //A10[1][2] A10[2][2] A10[3][2] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[2][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[2][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[2][2] ymm7 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[2][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[2][4] ymm1 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[2][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[2][6] ymm3 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 7)); //B01[2][7] b01 += 1; //move to next row of B ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm9 += (B01[2][1]*A10[0][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm10 += (B01[2][2]*A10[0][2] B01[2][2]*A10[1][2] B01[2][2]*A10[2][2] B01[2][2]*A10[3][2]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm11 += (B01[2][3]*A10[0][2] B01[2][3]*A10[1][2] B01[2][3]*A10[2][2] B01[2][3]*A10[3][2]) ymm12 = _mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm12 += (B01[2][4]*A10[0][2] B01[2][4]*A10[1][2] B01[2][4]*A10[2][2] B01[2][0]*A10[3][2]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, ymm13); //ymm13 += (B01[2][5]*A10[0][2] B01[2][5]*A10[1][2] B01[2][5]*A10[2][2] B01[2][1]*A10[3][2]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm14 += 
(B01[2][6]*A10[0][2] B01[2][6]*A10[1][2] B01[2][6]*A10[2][2] B01[2][2]*A10[3][2]) ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm15 += (B01[2][7]*A10[0][2] B01[2][7]*A10[1][2] B01[2][7]*A10[2][2] B01[2][3]*A10[3][2]) ymm16 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm4 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[3][0] ymm5 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[3][1] ymm6 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[3][2] ymm7 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[3][3] ymm0 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 4)); //B01[3][4] ymm1 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 5)); //B01[3][5] ymm2 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 6)); //B01[3][6] ymm3 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 7)); //B01[3][7] b01 += 1; //move to next row of B ymm8 = _mm256_fmadd_pd(ymm4, ymm16, ymm8); //ymm8 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) ymm9 = _mm256_fmadd_pd(ymm5, ymm16, ymm9); //ymm8 += (B01[3][1]*A10[0][3] B01[3][1]*A10[1][3] B01[3][1]*A10[2][3] B01[3][1]*A10[3][3]) ymm10 = _mm256_fmadd_pd(ymm6, ymm16, ymm10); //ymm8 += (B01[3][2]*A10[0][3] B01[3][2]*A10[1][3] B01[3][2]*A10[2][3] B01[3][2]*A10[3][3]) ymm11 = _mm256_fmadd_pd(ymm7, ymm16, ymm11); //ymm8 += (B01[3][3]*A10[0][3] B01[3][3]*A10[1][3] B01[3][3]*A10[2][3] B01[3][3]*A10[3][3]) ymm12 = _mm256_fmadd_pd(ymm0, ymm16, ymm12); //ymm8 += (B01[3][0]*A10[0][3] B01[3][4]*A10[1][3] B01[3][4]*A10[2][3] B01[3][4]*A10[3][3]) ymm13 = _mm256_fmadd_pd(ymm1, ymm16, ymm13); //ymm8 += (B01[3][1]*A10[0][3] B01[3][5]*A10[1][3] B01[3][5]*A10[2][3] B01[3][5]*A10[3][3]) ymm14 = _mm256_fmadd_pd(ymm2, ymm16, ymm14); //ymm8 += (B01[3][2]*A10[0][3] B01[3][6]*A10[1][3] B01[3][6]*A10[2][3] B01[3][6]*A10[3][3]) ymm15 = _mm256_fmadd_pd(ymm3, ymm16, ymm15); //ymm8 += (B01[3][3]*A10[0][3] B01[3][7]*A10[1][3] 
B01[3][7]*A10[2][3] B01[3][7]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ///GEMM code ends/// ymm16 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha value ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b *0)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b *1)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b *2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b *3)); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm4 = _mm256_loadu_pd((double const *)(b11 + cs_b *4)); //B11[0][4] B11[1][4] B11[2][4] B11[3][4] ymm5 = _mm256_loadu_pd((double const *)(b11 + cs_b *5)); //B11[0][5] B11[1][5] B11[2][5] B11[3][5] ymm6 = _mm256_loadu_pd((double const *)(b11 + cs_b *6)); //B11[0][6] B11[1][6] B11[2][6] B11[3][6] ymm7 = _mm256_loadu_pd((double const *)(f_temp)); //B11[0][7] B11[1][7] B11[2][7] B11[3][7] ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); //B11[0-3][0] *alpha -= B01[0-3][0] ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9); //B11[0-3][1] *alpha -= B01[0-3][1] ymm2 = _mm256_fmsub_pd(ymm2, ymm16, ymm10); //B11[0-3][2] *alpha -= B01[0-3][2] ymm3 = _mm256_fmsub_pd(ymm3, ymm16, ymm11); //B11[0-3][3] *alpha -= B01[0-3][3] ymm4 = _mm256_fmsub_pd(ymm4, ymm16, ymm12); //B11[0-3][4] *alpha -= B01[0-3][4] ymm5 = _mm256_fmsub_pd(ymm5, ymm16, ymm13); //B11[0-3][5] *alpha -= B01[0-3][5] ymm6 = _mm256_fmsub_pd(ymm6, ymm16, ymm14); //B11[0-3][6] *alpha -= B01[0-3][6] ymm7 = _mm256_fmsub_pd(ymm7, ymm16, ymm15); //B11[0-3][7] *alpha -= B01[0-3][7] if(3 == m_remainder) { ///implement TRSM/// ///unpacklow/// ymm9 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm11 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] ymm13 = _mm256_unpacklo_pd(ymm4, ymm5); //B11[0][4] 
B11[0][5] B11[1][4] B11[1][5] ymm15 = _mm256_unpacklo_pd(ymm6, ymm7); //B11[0][6] B11[0][7] B11[1][6] B11[1][7] //rearrange low elements ymm8 = _mm256_permute2f128_pd(ymm9,ymm11,0x20); //B11[0][0] B11[0][1] B11[0][2] B11[0][3] ymm10 = _mm256_permute2f128_pd(ymm9,ymm11,0x31); //B11[2][0] B11[2][1] B11[2][2] B11[2][3] ymm12 = _mm256_permute2f128_pd(ymm13,ymm15,0x20); //B11[4][0] B11[4][1] B11[4][2] B11[4][3] ymm14 = _mm256_permute2f128_pd(ymm13,ymm15,0x31); //B11[6][0] B11[6][1] B11[6][2] B11[6][3] ////unpackhigh//// ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] ymm4 = _mm256_unpackhi_pd(ymm4, ymm5); //B11[5][0] B11[5][1] B11[7][0] B11[7][1] ymm5 = _mm256_unpackhi_pd(ymm6, ymm7); //B11[5][2] B11[5][3] B11[7][2] B11[7][3] //rearrange high elements ymm9 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] B11[1][2] B11[1][3] ymm11 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] B11[3][1] B11[3][2] B11[3][3] ymm13 = _mm256_permute2f128_pd(ymm4,ymm5,0x20); //B11[5][0] B11[5][1] B11[5][2] B11[5][3] ymm15 = _mm256_permute2f128_pd(ymm4,ymm5,0x31); //B11[7][0] B11[7][1] B11[7][2] B11[7][3] ymm2 = _mm256_broadcast_sd((double const *)(a11 +1)); //A11[1][0] ymm3 = _mm256_broadcast_sd((double const *)(a11 +2)); //A11[2][0] a11 += cs_a; //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm2, ymm8, ymm9); //B11[1][0-3] -= B11[0-3][0]*A11[1][0] ymm10 = _mm256_fnmadd_pd(ymm3, ymm8, ymm10); //B11[2][0-3] -= B11[0-3][0]*A11[2][0] ymm13 = _mm256_fnmadd_pd(ymm2, ymm12, ymm13); //B11[5][0-3] -= B11[0-3][4]*A11[1][4] ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14); //B11[6][0-3] -= B11[0-3][4]*A11[2][4] ymm3 = _mm256_broadcast_sd((double const *)(a11 +2)); //A11[2][1] a11 += cs_a; //(ROw2): FMA operations ymm10 = _mm256_fnmadd_pd(ymm3, ymm9, ymm10); //B11[2][0-3] -= A11[2][1] * B11[0-3][1] ymm14 = _mm256_fnmadd_pd(ymm3, ymm13, ymm14); //B11[6][0-3] -= A11[2][1] 
* B11[0-3][5] ymm11 = _mm256_broadcast_sd((double const *)(&ones)); ymm15 = _mm256_broadcast_sd((double const *)(&ones)); //unpacklow// ymm1 = _mm256_unpacklo_pd(ymm8, ymm9); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm3 = _mm256_unpacklo_pd(ymm10, ymm11); //B11[2][0] B11[3][0] B11[2][2] B11[3][2] ymm5 = _mm256_unpacklo_pd(ymm12, ymm13); //B11[4][0] B11[5][0] B11[4][2] B11[5][2] ymm7 = _mm256_unpacklo_pd(ymm14, ymm15); //B11[6][0] B11[7][0] B11[6][2] B11[7][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1, ymm3, 0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1, ymm3, 0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm4 = _mm256_permute2f128_pd(ymm5, ymm7, 0x20); //B11[0][4] B11[1][4] B11[2][4] B11[3][4] ymm6 = _mm256_permute2f128_pd(ymm5, ymm7, 0x31); //B11[0][6] B11[1][6] B11[2][6] B11[3][6] ///unpack high/// ymm8 = _mm256_unpackhi_pd(ymm8, ymm9); //B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm9 = _mm256_unpackhi_pd(ymm10, ymm11); //B11[2][1] B11[3][1] B11[2][3] B11[3][3] ymm12 = _mm256_unpackhi_pd(ymm12, ymm13); //B11[0][5] B11[1][5] B11[0][7] B11[1][7] ymm13 = _mm256_unpackhi_pd(ymm14, ymm15); //B11[2][5] B11[3][5] B11[2][7] B11[3][7] //rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm8, ymm9, 0x20); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm8, ymm9, 0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm5 = _mm256_permute2f128_pd(ymm12, ymm13, 0x20); //B11[0][5] B11[1][5] B11[2][5] B11[3][5] ymm7 = _mm256_permute2f128_pd(ymm12, ymm13, 0x31); //B11[0][7] B11[1][7] B11[2][7] B11[3][7] ymm8 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0)); //load B11[0-3][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1)); //load B11[0-3][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //load B11[0-3][2] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //load B11[0-3][3] ymm12 = _mm256_loadu_pd((double const *)(b11 + cs_b * 4)); //load B11[0-3][4] ymm13 = 
_mm256_loadu_pd((double const *)(b11 + cs_b * 5)); //load B11[0-3][5] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b * 6)); //load B11[0-3][6] ymm15 = _mm256_loadu_pd((double const *)(f_temp)); //load B11[0-3][7] //determine correct values to store ymm0 = _mm256_blend_pd(ymm0, ymm8, 0x08); ymm1 = _mm256_blend_pd(ymm1, ymm9, 0x08); ymm2 = _mm256_blend_pd(ymm2, ymm10, 0x08); ymm3 = _mm256_blend_pd(ymm3, ymm11, 0x08); ymm4 = _mm256_blend_pd(ymm4, ymm12, 0x08); ymm5 = _mm256_blend_pd(ymm5, ymm13, 0x08); ymm6 = _mm256_blend_pd(ymm6, ymm14, 0x08); ymm7 = _mm256_blend_pd(ymm7, ymm15, 0x08); } else if(2 == m_remainder) { ///implement TRSM/// ///unpacklow/// ymm9 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm11 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] ymm13 = _mm256_unpacklo_pd(ymm4, ymm5); //B11[0][4] B11[0][5] B11[1][4] B11[1][5] ymm15 = _mm256_unpacklo_pd(ymm6, ymm7); //B11[0][6] B11[0][7] B11[1][6] B11[1][7] //rearrange low elements ymm8 = _mm256_permute2f128_pd(ymm9,ymm11,0x20); //B11[0][0] B11[0][1] B11[0][2] B11[0][3] ymm10 = _mm256_permute2f128_pd(ymm9,ymm11,0x31); //B11[2][0] B11[2][1] B11[2][2] B11[2][3] ymm12 = _mm256_permute2f128_pd(ymm13,ymm15,0x20); //B11[4][0] B11[4][1] B11[4][2] B11[4][3] ymm14 = _mm256_permute2f128_pd(ymm13,ymm15,0x31); //B11[6][0] B11[6][1] B11[6][2] B11[6][3] ////unpackhigh//// ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] ymm4 = _mm256_unpackhi_pd(ymm4, ymm5); //B11[5][0] B11[5][1] B11[7][0] B11[7][1] ymm5 = _mm256_unpackhi_pd(ymm6, ymm7); //B11[5][2] B11[5][3] B11[7][2] B11[7][3] //rearrange high elements ymm9 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] B11[1][2] B11[1][3] ymm11 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] B11[3][1] B11[3][2] B11[3][3] ymm13 = _mm256_permute2f128_pd(ymm4,ymm5,0x20); //B11[5][0] B11[5][1] 
B11[5][2] B11[5][3] ymm15 = _mm256_permute2f128_pd(ymm4,ymm5,0x31); //B11[7][0] B11[7][1] B11[7][2] B11[7][3] ymm2 = _mm256_broadcast_sd((double const *)(a11 +1)); //A11[1][0] a11 += cs_a; //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm2, ymm8, ymm9); //B11[1][0-3] -= B11[0-3][0]*A11[1][0] ymm13 = _mm256_fnmadd_pd(ymm2, ymm12, ymm13); //B11[5][0-3] -= B11[0-3][4]*A11[1][4] ymm10 = _mm256_broadcast_sd((double const *)&ones); //unpacklow// ymm1 = _mm256_unpacklo_pd(ymm8, ymm9); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm5 = _mm256_unpacklo_pd(ymm12, ymm13); //B11[4][0] B11[5][0] B11[4][2] B11[5][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1, ymm10, 0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1, ymm10, 0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm4 = _mm256_permute2f128_pd(ymm5, ymm10, 0x20); //B11[0][4] B11[1][4] B11[2][4] B11[3][4] ymm6 = _mm256_permute2f128_pd(ymm5, ymm10, 0x31); //B11[0][6] B11[1][6] B11[2][6] B11[3][6] ///unpack high/// ymm8 = _mm256_unpackhi_pd(ymm8, ymm9); //B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm12 = _mm256_unpackhi_pd(ymm12, ymm13); //B11[0][5] B11[1][5] B11[0][7] B11[1][7] //rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm8, ymm10, 0x20); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm8, ymm10, 0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm5 = _mm256_permute2f128_pd(ymm12, ymm10, 0x20); //B11[0][5] B11[1][5] B11[2][5] B11[3][5] ymm7 = _mm256_permute2f128_pd(ymm12, ymm10, 0x31); //B11[0][7] B11[1][7] B11[2][7] B11[3][7] ymm8 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0)); //load B11[0-3][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1)); //load B11[0-3][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //load B11[0-3][2] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //load B11[0-3][3] ymm12 = _mm256_loadu_pd((double const *)(b11 + cs_b * 4)); //load B11[0-3][4] ymm13 = 
_mm256_loadu_pd((double const *)(b11 + cs_b * 5)); //load B11[0-3][5] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b * 6)); //load B11[0-3][6] ymm15 = _mm256_loadu_pd((double const *)(f_temp)); //load B11[0-3][7] //determine correct values to store ymm0 = _mm256_permute2f128_pd(ymm0, ymm8, 0x30); ymm1 = _mm256_permute2f128_pd(ymm1, ymm9, 0x30); ymm2 = _mm256_permute2f128_pd(ymm2, ymm10, 0x30); ymm3 = _mm256_permute2f128_pd(ymm3, ymm11, 0x30); ymm4 = _mm256_permute2f128_pd(ymm4, ymm12, 0x30); ymm5 = _mm256_permute2f128_pd(ymm5, ymm13, 0x30); ymm6 = _mm256_permute2f128_pd(ymm6, ymm14, 0x30); ymm7 = _mm256_permute2f128_pd(ymm7, ymm15, 0x30); } else if(1 == m_remainder) { ymm8 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0)); //load B11[0-3][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1)); //load B11[0-3][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //load B11[0-3][2] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //load B11[0-3][3] ymm12 = _mm256_loadu_pd((double const *)(b11 + cs_b * 4)); //load B11[0-3][4] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b * 5)); //load B11[0-3][5] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b * 6)); //load B11[0-3][6] ymm15 = _mm256_loadu_pd((double const *)(f_temp)); //load B11[0-3][7] //determine correct values to store ymm0 = _mm256_blend_pd(ymm0, ymm8, 0x0E); ymm1 = _mm256_blend_pd(ymm1, ymm9, 0x0E); ymm2 = _mm256_blend_pd(ymm2, ymm10, 0x0E); ymm3 = _mm256_blend_pd(ymm3, ymm11, 0x0E); ymm4 = _mm256_blend_pd(ymm4, ymm12, 0x0E); ymm5 = _mm256_blend_pd(ymm5, ymm13, 0x0E); ymm6 = _mm256_blend_pd(ymm6, ymm14, 0x0E); ymm7 = _mm256_blend_pd(ymm7, ymm15, 0x0E); } _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3); //store(B11[0-3][3]) _mm256_storeu_pd((double *)(b11 
+ cs_b * 4), ymm4); //store(B11[0-3][4]) _mm256_storeu_pd((double *)(b11 + cs_b * 5), ymm5); //store(B11[0-3][5]) _mm256_storeu_pd((double *)(b11 + cs_b * 6), ymm6); //store(B11[0-3][6]) _mm256_storeu_pd((double *)(f_temp), ymm7); //store(B11[0-3][7]) if((j+D_NR) == n) { for(iter = 0; iter < m_remainder; iter++) (b11 + cs_b * 7)[iter] = f_temp[iter]; } } } if((n & 4)) //implementation for remainder columns(when 'n_remainder' is greater than 4) { for(i = 0;i+D_MR-1 < m; i += D_MR) //loop along 'M' direction { a10 = L +i; //pointer to block of A to be used for GEMM a11 = L + i + (i*cs_a); //pointer to block of A to be used for TRSM b01 = B + j*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j* cs_b; //pointer to block of B to be used for TRSM k_iter = i / D_MR; //number of times GEMM to be performed(in block of 4) ///GEMM for previously calculated values /// //load 4x4 block from b11 ymm0 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b*2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b*3)); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a*2)); //A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[0][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 
+ cs_b * 1)); //B01[0][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[0][3] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B01[0][2]*A10[0][0] B01[0][2]*A10[1][0] B01[0][2]*A10[2][0] B01[0][2]*A10[3][0]) ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B01[0][3]*A10[0][0] B01[0][3]*A10[1][0] B01[0][3]*A10[2][0] B01[0][3]*A10[3][0]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[1][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[1][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[1][3] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B01[1][2]*A10[0][1] B01[1][2]*A10[1][1] B01[1][2]*A10[2][1] B01[1][2]*A10[3][1]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B01[1][3]*A10[0][1] B01[1][3]*A10[1][1] B01[1][3]*A10[2][1] B01[1][3]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[2][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[2][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[2][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[2][3] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] 
B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B01[2][1]*A10[1][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B01[2][2]*A10[2][2] B01[2][2]*A10[1][2] B01[2][2]*A10[2][2] B01[2][2]*A10[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B01[2][3]*A10[3][2] B01[2][3]*A10[1][2] B01[2][3]*A10[2][2] B01[2][3]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[3][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[3][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[3][3] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B01[3][1]*A10[0][3] B01[3][1]*A10[1][3] B01[3][1]*A10[2][3] B01[3][1]*A10[3][3]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B01[3][2]*A10[0][3] B01[3][2]*A10[1][3] B01[3][2]*A10[2][3] B01[3][2]*A10[3][3]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B01[3][3]*A10[0][3] B01[3][3]*A10[1][3] B01[3][3]*A10[2][3] B01[3][3]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm0 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] *alpha -= ymm4 ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B01[0-3][1] *alpha -= ymm5 ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B01[0-3][2] *alpha -= ymm6 ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B01[0-3][3] *alpha -= ymm7 ///implement TRSM/// //1st col ymm5 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][0] ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[2][0] 
ymm7 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[3][0] //2nd col a11 += cs_a; ymm9 = _mm256_broadcast_sd((double const *)(a11 + 2)); //A11[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a11 + 3)); //A11[3][1] //3rd col a11 += cs_a; ymm12 = _mm256_broadcast_sd((double const *)(a11 + 3)); //A11[3][2] ////unpacklow//// ymm8 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm13 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] //rearrange low elements ymm4 = _mm256_permute2f128_pd(ymm8,ymm13,0x20); //B11[0][0] B11[0][1] B11[0][2] B11[0][3] ymm11 = _mm256_permute2f128_pd(ymm8,ymm13,0x31);//B11[2][0] B11[2][1] B11[2][2] B11[2][3] ////unpackhigh//// ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] //rearrange high elements ymm8 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] B11[1][2] B11[1][3] ymm13 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] B11[3][1] B11[3][2] B11[3][3] //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (3, 0) ymm8 = _mm256_fnmadd_pd(ymm5, ymm4, ymm8);//d = c - (a*b) //B11[1][0-3] -= A11[1][0]*B11[0][0-3] ymm11 = _mm256_fnmadd_pd(ymm6, ymm4, ymm11);//d = c - (a*b) //B11[2][0-3] -= A11[2][0]*B11[0][0-3] ymm13 = _mm256_fnmadd_pd(ymm7, ymm4, ymm13);//d = c - (a*b) //B11[3][0-3] -= A11[3][0]*B11[0][0-3] //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) ymm11 = _mm256_fnmadd_pd(ymm9, ymm8, ymm11);//d = c - (a*b) //B11[2][0-3] -= A11[2][1]*B11[1][0-3] ymm13 = _mm256_fnmadd_pd(ymm10, ymm8, ymm13);//d = c - (a*b) //B11[3][0-3] -= A11[3][1]*B11[1][0-3] //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) ymm13 = _mm256_fnmadd_pd(ymm12, ymm11, ymm13);//d = c - (a*b) //B11[3][0-3] -= A11[3][2]*B11[2][0-3] //--> Transpose and store results of columns of B block <--// 
////unpacklow//// ymm1 = _mm256_unpacklo_pd(ymm4, ymm8); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm3 = _mm256_unpacklo_pd(ymm11, ymm13); //B11[2][0] B11[3][0] B11[2][2] B11[3][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1,ymm3,0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1,ymm3,0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ////unpackhigh//// ymm14 = _mm256_unpackhi_pd(ymm4, ymm8); //B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm15 = _mm256_unpackhi_pd(ymm11, ymm13); //B11[2][1] B11[3][1] B11[2][3] B11[3][3] //rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm14,ymm15,0x20); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm14,ymm15,0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] _mm256_storeu_pd((double *)b11, ymm0); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + (cs_b)), ymm1); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b*2), ymm2); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b*3), ymm3); //store(B11[0-3][3]) } if(m_remainder) //implementation for remainder rows(when 'M' is not a multiple of D_MR) { a10 = L +i; //pointer to block of A to be used for GEMM a11 = L + i + (i*cs_a); //pointer to block of A to be used for TRSM b01 = B + j*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j* cs_b; //pointer to block of B to be used for TRSM k_iter = i / D_MR; //number of GEMM operations to be performed(in blocks of 4x4) dim_t iter; if((j+4) == n) { f_temp = f_t; for(iter = 0; iter < m_remainder; iter++) f_temp[iter] = (b11 + cs_b * 3)[iter]; } else f_temp = (b11 + cs_b * 3); ///GEMM for previously calculated values /// //load 4x4 block from b11 ymm0 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm3 = 
_mm256_loadu_pd((double const *)(f_temp)); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); for(k = 0; k < k_iter; k++) //looop for number of GEMM operations { ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[0][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[0][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[0][3] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B01[0][2]*A10[0][0] B01[0][2]*A10[1][0] B01[0][2]*A10[2][0] B01[0][2]*A10[3][0]) ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B01[0][3]*A10[0][0] B01[0][3]*A10[1][0] B01[0][3]*A10[2][0] B01[0][3]*A10[3][0]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[1][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[1][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[1][3] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm5 = _mm256_fmadd_pd(ymm13, 
ymm9, ymm5); //ymm5 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B01[1][2]*A10[0][1] B01[1][2]*A10[1][1] B01[1][2]*A10[2][1] B01[1][2]*A10[3][1]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B01[1][3]*A10[0][1] B01[1][3]*A10[1][1] B01[1][3]*A10[2][1] B01[1][3]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[2][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[2][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[2][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[2][3] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B01[2][1]*A10[0][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B01[2][2]*A10[0][2] B01[2][2]*A10[1][2] B01[2][2]*A10[2][2] B01[2][2]*A10[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B01[2][3]*A10[0][2] B01[2][3]*A10[1][2] B01[2][3]*A10[2][2] B01[2][3]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[3][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[3][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 3)); //B01[3][3] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B01[3][1]*A10[0][3] B01[3][1]*A10[1][3] B01[3][1]*A10[2][3] B01[3][1]*A10[3][3]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B01[3][2]*A10[0][3] B01[3][2]*A10[1][3] B01[3][2]*A10[2][3] B01[3][2]*A10[3][3]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 
+= (B01[3][3]*A10[0][3] B01[3][3]*A10[1][3] B01[3][3]*A10[2][3] B01[3][3]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm0 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] *alpha -= ymm4 ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[0-3][1] *alpha -= ymm5 ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[0-3][2] *alpha -= ymm6 ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[0-3][3] *alpha -= ymm7 if(3 == m_remainder) { ///implement TRSM/// //1st col ymm5 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][0] ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[2][0] //2nd col a11 += cs_a; ymm9 = _mm256_broadcast_sd((double const *)(a11 + 2)); //A11[2][1] ////unpacklow//// ymm8 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm13 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] //rearrange low elements ymm4 = _mm256_permute2f128_pd(ymm8,ymm13,0x20); //B11[0][0] B11[0][1] B11[0][2] B11[0][3] ymm11 = _mm256_permute2f128_pd(ymm8,ymm13,0x31);//B11[2][0] B11[2][1] B11[2][2] B11[2][3] ////unpackhigh//// ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] //rearrange high elements ymm8 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] B11[1][2] B11[1][3] ymm13 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] B11[3][1] B11[3][2] B11[3][3] //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (3, 0) ymm8 = _mm256_fnmadd_pd(ymm5, ymm4, ymm8);//d = c - (a*b) //B11[1][0-3] -= A11[1][0]* B11[0][0-3] ymm11 = _mm256_fnmadd_pd(ymm6, ymm4, ymm11);//d = c - (a*b) //B11[2][0-3] -= A11[2][0]* B11[0][0-3] //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill 
(7, 0) ymm11 = _mm256_fnmadd_pd(ymm9, ymm8, ymm11);//d = c - (a*b) //B11[2][0-3] -= A11[2][1]* B11[1][0-3] ymm13 = _mm256_broadcast_sd((double const *)(&ones)); //--> Transpose and store results of columns of B block <--// ////unpacklow//// ymm1 = _mm256_unpacklo_pd(ymm4, ymm8); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm3 = _mm256_unpacklo_pd(ymm11, ymm13); //B11[2][0] B11[3][0] B11[2][2] B11[3][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1,ymm3,0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1,ymm3,0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ////unpackhigh//// ymm14 = _mm256_unpackhi_pd(ymm4, ymm8); //B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm15 = _mm256_unpackhi_pd(ymm11, ymm13); //B11[2][1] B11[3][1] B11[2][3] B11[3][3] //rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm14,ymm15,0x20); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm14,ymm15,0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] //load 4x4 block from b11 ymm4 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm5 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm6 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm7 = _mm256_loadu_pd((double const *)(f_temp)); //B11[0][3] B11[1][3] B11[2][2] B11[3][3] //determine correct values to store ymm0 = _mm256_blend_pd(ymm0, ymm4, 0x08); ymm1 = _mm256_blend_pd(ymm1, ymm5, 0x08); ymm2 = _mm256_blend_pd(ymm2, ymm6, 0x08); ymm3 = _mm256_blend_pd(ymm3, ymm7, 0x08); } else if(2 == m_remainder) { ///implement TRSM/// //1st col ymm5 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][0] ////unpacklow//// ymm8 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm13 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] //rearrange low elements ymm4 = _mm256_permute2f128_pd(ymm8,ymm13,0x20); //B11[0][0] 
B11[0][1] B11[0][2] B11[0][3] ymm11 = _mm256_permute2f128_pd(ymm8,ymm13,0x31);//B11[2][0] B11[2][1] B11[2][2] B11[2][3] ////unpackhigh//// ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] //rearrange high elements ymm8 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] B11[1][2] B11[1][3] ymm13 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] B11[3][1] B11[3][2] B11[3][3] //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (3, 0) ymm8 = _mm256_fnmadd_pd(ymm5, ymm4, ymm8);//d = c - (a*b) //B11[1][0-3] -= A11[1][0]* B11[0][0-3] ymm11 = _mm256_broadcast_sd((double const *)(&ones)); ymm13 = _mm256_broadcast_sd((double const *)(&ones)); //--> Transpose and store results of columns of B block <--// ////unpacklow//// ymm1 = _mm256_unpacklo_pd(ymm4, ymm8); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm3 = _mm256_unpacklo_pd(ymm11, ymm13); //B11[2][0] B11[3][0] B11[2][2] B11[3][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1,ymm3,0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1,ymm3,0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ////unpackhigh//// ymm14 = _mm256_unpackhi_pd(ymm4, ymm8); //B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm15 = _mm256_unpackhi_pd(ymm11, ymm13); //B11[2][1] B11[3][1] B11[2][3] B11[3][3] //rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm14,ymm15,0x20); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm14,ymm15,0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] //load 4x4 block from b11 ymm4 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm5 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm6 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm7 = _mm256_loadu_pd((double const *)(f_temp)); 
//B11[0][3] B11[1][3] B11[2][2] B11[3][3] //determine correct values to store ymm0 = _mm256_permute2f128_pd(ymm0, ymm4,0x30); ymm1 = _mm256_permute2f128_pd(ymm1, ymm5,0x30); ymm2 = _mm256_permute2f128_pd(ymm2, ymm6,0x30); ymm3 = _mm256_permute2f128_pd(ymm3, ymm7,0x30); } else if(1 == m_remainder) { //load 4x4 block from b11 ymm4 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm5 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm6 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm7 = _mm256_loadu_pd((double const *)(f_temp)); //B11[0][3] B11[1][3] B11[2][2] B11[3][3] //determine correct values to store ymm0 = _mm256_blend_pd(ymm0, ymm4, 0x0E); ymm1 = _mm256_blend_pd(ymm1, ymm5, 0x0E); ymm2 = _mm256_blend_pd(ymm2, ymm6, 0x0E); ymm3 = _mm256_blend_pd(ymm3, ymm7, 0x0E); } _mm256_storeu_pd((double *)b11, ymm0); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + (cs_b)), ymm1); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(f_temp), ymm3); //store(B11[0-3][3]) if((j+4) == n) { for(iter = 0; iter < m_remainder; iter++) (b11 + cs_b * 3)[iter] = f_temp[iter]; } } n_remainder -= 4; j += 4; } if(n_remainder) //implementation fo remaining columns(when 'N' is not a multiple of D_NR) { for(i = 0;i+D_MR-1 < m; i += D_MR) //loop along 'M' direction { a10 = L +i; //pointer to block of A to be used for GEMM a11 = L + i + (i*cs_a); //pointer to block of A to be used for TRSM b01 = B + j*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j* cs_b; //pointer to block of B to be used for TRSM k_iter = i / D_MR; //number of GEMM operations to be performed(in blocks of 4x4) ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM for previously calculated values /// //load 4x4 block from b11 if(3 == 
n_remainder) { ymm0 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] for(k = 0; k < k_iter; k++) { ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[0][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[0][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[0][2] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B01[0][2]*A10[0][0] B01[0][2]*A10[1][0] B01[0][2]*A10[2][0] B01[0][2]*A10[3][0]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[1][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[1][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[1][2] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B01[1][2]*A10[0][1] B01[1][2]*A10[1][1] B01[1][2]*A10[2][1] 
B01[1][2]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[2][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[2][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[2][2] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B01[2][1]*A10[0][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B01[2][2]*A10[0][2] B01[2][2]*A10[1][2] B01[2][2]*A10[2][2] B01[2][2]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[3][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[3][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B01[3][2] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B01[3][1]*A10[0][3] B01[3][1]*A10[1][3] B01[3][1]*A10[2][3] B01[3][1]*A10[3][3]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B01[3][2]*A10[0][3] B01[3][2]*A10[1][3] B01[3][2]*A10[2][3] B01[3][2]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha Value ymm0 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] *alpha -= ymm4 ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[0-3][1] *alpha -= ymm5 ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[0-3][2] *alpha -= ymm6 ymm3 = _mm256_broadcast_sd((double const *)(&ones)); //B11[0-3][3] *alpha -= ymm7 } else if(2 == n_remainder) { ymm0 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = 
_mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] for(k = 0; k < k_iter; k++) { ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[0][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[0][1] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[1][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[1][1] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[2][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[2][1] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B01[2][1]*A10[0][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[3][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B01[3][1] b01 += 1; ymm4 = 
_mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B01[3][1]*A10[0][3] B01[3][1]*A10[1][3] B01[3][1]*A10[2][3] B01[3][1]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha Value ymm0 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] *alpha -= ymm4 ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[0-3][1] *alpha -= ymm5 ymm2 = _mm256_broadcast_sd((double const *)(&ones)); //B11[0-3][2] *alpha -= ymm6 ymm3 = _mm256_broadcast_sd((double const *)(&ones)); //B11[0-3][3] *alpha -= ymm7 } else if(1 == n_remainder) { ymm0 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] for(k = 0; k < k_iter; k++) { ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[0][0] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[1][0] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[2][0] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] 
B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B01[3][0] b01 += 1; ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha Value ymm0 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] *alpha -= ymm4 ymm1 = _mm256_broadcast_sd((double const *)(&ones)); //B11[0-3][1] *alpha -= ymm5 ymm2 = _mm256_broadcast_sd((double const *)(&ones)); //B11[0-3][2] *alpha -= ymm6 ymm3 = _mm256_broadcast_sd((double const *)(&ones)); //B11[0-3][3] *alpha -= ymm7 } ///implement TRSM/// //1st col ymm5 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][0] ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[2][0] ymm7 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[3][0] //2nd col a11 += cs_a; ymm9 = _mm256_broadcast_sd((double const *)(a11 + 2)); //A11[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a11 + 3)); //A11[3][1] //3rd col a11 += cs_a; ymm12 = _mm256_broadcast_sd((double const *)(a11 + 3)); //A11[3][2] ////unpacklow//// ymm8 = _mm256_unpacklo_pd(ymm0, ymm1); //B11[0][0] B11[0][1] B11[2][0] B11[2][1] ymm13 = _mm256_unpacklo_pd(ymm2, ymm3); //B11[0][2] B11[0][3] B11[2][2] B11[2][3] //rearrange low elements ymm4 = _mm256_permute2f128_pd(ymm8,ymm13,0x20); //B11[0][0] B11[0][1] B11[0][2] B11[0][3] ymm11 = _mm256_permute2f128_pd(ymm8,ymm13,0x31);//B11[2][0] B11[2][1] B11[2][2] B11[2][3] ////unpackhigh//// ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); //B11[1][0] B11[1][1] B11[3][0] B11[3][1] ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); //B11[1][2] B11[1][3] B11[3][2] B11[3][3] //rearrange high elements ymm8 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); //B11[1][0] B11[1][1] 
B11[1][2] B11[1][3] ymm13 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); //B11[3][0] B11[3][1] B11[3][2] B11[3][3] //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (3, 0) ymm8 = _mm256_fnmadd_pd(ymm5, ymm4, ymm8);//d = c - (a*b) //B11[1][0-3] -= A11[1][0] * B11[0][0-3] ymm11 = _mm256_fnmadd_pd(ymm6, ymm4, ymm11);//d = c - (a*b) //B11[2][0-3] -= A11[2][0] * B11[0][0-3] ymm13 = _mm256_fnmadd_pd(ymm7, ymm4, ymm13);//d = c - (a*b) //B11[3][0-3] -= A11[3][0] * B11[0][0-3] //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) ymm11 = _mm256_fnmadd_pd(ymm9, ymm8, ymm11);//d = c - (a*b) //B11[2][0-3] -= A11[2][1] * B11[1][0-3] ymm13 = _mm256_fnmadd_pd(ymm10, ymm8, ymm13);//d = c - (a*b) //B11[3][0-3] -= A11[3][1] * B11[1][0-3] //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) ymm13 = _mm256_fnmadd_pd(ymm12, ymm11, ymm13);//d = c - (a*b) //B11[3][0-3] -= A11[3][2] * B11[2][0-3] //--> Transpose and store results of columns of B block <--// ////unpacklow//// ymm1 = _mm256_unpacklo_pd(ymm4, ymm8); //B11[0][0] B11[1][0] B11[0][2] B11[1][2] ymm3 = _mm256_unpacklo_pd(ymm11, ymm13); //B11[2][0] B11[3][0] B11[2][2] B11[3][2] //rearrange low elements ymm0 = _mm256_permute2f128_pd(ymm1,ymm3,0x20); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_permute2f128_pd(ymm1,ymm3,0x31); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ////unpackhigh//// ymm14 = _mm256_unpackhi_pd(ymm4, ymm8); //B11[0][1] B11[1][1] B11[0][3] B11[1][3] ymm15 = _mm256_unpackhi_pd(ymm11, ymm13); //B11[2][1] B11[3][1] B11[2][3] B11[3][3] //rearrange high elements ymm1 = _mm256_permute2f128_pd(ymm14,ymm15,0x20); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_permute2f128_pd(ymm14,ymm15,0x31); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] if(3 == n_remainder) { _mm256_storeu_pd((double *)b11, ymm0); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + (cs_b)), ymm1); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b * 
2), ymm2); //store(B11[0-3][2]) } else if(2 == n_remainder) { _mm256_storeu_pd((double *)b11, ymm0); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + (cs_b)), ymm1); //store(B11[0-3][1]) } else if(1 == n_remainder) { _mm256_storeu_pd((double *)b11, ymm0); //store(B11[0-3][0]) } } if(m_remainder) //implementation for remainder rows(when 'M' is not a multiple of D_MR) { a10 = L +i; //pointer to block of A to be used for GEMM a11 = L + i + (i*cs_a); //pointer to block of A to be used for TRSM b01 = B + j*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j* cs_b; //pointer to block of B to be used for TRSM k_iter = i / D_MR; //number of times GEMM operations to be performed dim_t iter; if((j+n_remainder) == n) { f_temp = f_t; for(iter = 0; iter < m_remainder; iter++) f_temp[iter] = (b11 + cs_b * (n_remainder -1))[iter]; } else f_temp = (b11 + cs_b * (n_remainder -1)); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM for previously calculated values /// //load 4x4 block from b11 if(3 == n_remainder) { ymm0 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(f_temp)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[0][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); 
//B10[0][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B10[0][2] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B01[0][2]*A10[0][0] B01[0][2]*A10[1][0] B01[0][2]*A10[2][0] B01[0][2]*A10[3][0]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[1][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B10[1][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B10[1][2] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B01[1][2]*A10[0][1] B01[1][2]*A10[1][1] B01[1][2]*A10[2][1] B01[1][2]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[2][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B10[2][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B10[2][2] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B01[2][1]*A10[0][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B01[2][2]*A10[0][2] B01[2][2]*A10[1][2] B01[2][2]*A10[2][2] B01[2][2]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[3][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); 
//B10[3][1] ymm14 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 2)); //B10[3][2] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B01[3][1]*A10[0][3] B01[3][1]*A10[1][3] B01[3][1]*A10[2][3] B01[3][1]*A10[3][3]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B01[3][2]*A10[0][3] B01[3][2]*A10[1][3] B01[3][2]*A10[2][3] B01[3][2]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to hold alpha value ymm8 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] * alpha -= ymm4 ymm9 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[0-3][1] * alpha -= ymm5 ymm10 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[0-3][2] * alpha -= ymm6 ///implement TRSM/// //determine correct values to store if(3 == m_remainder) { ymm0 = _mm256_blend_pd(ymm8, ymm0, 0x08); ymm1 = _mm256_blend_pd(ymm9, ymm1, 0x08); ymm2 = _mm256_blend_pd(ymm10, ymm2, 0x08); } else if(2 == m_remainder) { ymm0 = _mm256_permute2f128_pd(ymm8, ymm0, 0x30); ymm1 = _mm256_permute2f128_pd(ymm9, ymm1, 0x30); ymm2 = _mm256_permute2f128_pd(ymm10, ymm2, 0x30); } else if(1 == m_remainder) { ymm0 = _mm256_blend_pd(ymm8, ymm0, 0x0E); ymm1 = _mm256_blend_pd(ymm9, ymm1, 0x0E); ymm2 = _mm256_blend_pd(ymm10, ymm2, 0x0E); } _mm256_storeu_pd((double *)b11, ymm0); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + (cs_b)), ymm1); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(f_temp), ymm2); //store(B11[0-3][2]) } else if(2 == n_remainder) { ymm0 = _mm256_loadu_pd((double const *)(b11)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(f_temp)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { 
ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[0][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B10[0][1] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B01[0][1]*A10[0][0] B01[0][1]*A10[1][0] B01[0][1]*A10[2][0] B01[0][1]*A10[3][0]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[1][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B10[1][1] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B01[1][1]*A10[0][1] B01[1][1]*A10[1][1] B01[1][1]*A10[2][1] B01[1][1]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[2][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B10[2][1] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B01[2][1]*A10[0][2] B01[2][1]*A10[1][2] B01[2][1]*A10[2][2] B01[2][1]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[3][0] ymm13 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 1)); //B10[3][1] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); 
//ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B01[3][1]*A10[0][3] B01[3][1]*A10[1][3] B01[3][1]*A10[2][3] B01[3][1]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to hold alpha value ymm8 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] * alpha -= ymm4 ymm9 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[0-3][1] * alpha -= ymm5 ///implement TRSM/// //determine correct values to store if(3 == m_remainder) { ymm0 = _mm256_blend_pd(ymm8, ymm0, 0x08); ymm1 = _mm256_blend_pd(ymm9, ymm1, 0x08); } else if(2 == m_remainder) { ymm0 = _mm256_permute2f128_pd(ymm8, ymm0, 0x30); ymm1 = _mm256_permute2f128_pd(ymm9, ymm1, 0x30); } else if(1 == m_remainder) { ymm0 = _mm256_blend_pd(ymm8, ymm0, 0x0E); ymm1 = _mm256_blend_pd(ymm9, ymm1, 0x0E); } _mm256_storeu_pd((double *)b11, ymm0); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(f_temp), ymm1); //store(B11[0-3][1]) } else if(1 == n_remainder) { ymm0 = _mm256_loadu_pd((double const *)(f_temp)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_b01_dup = b01; ymm8 = _mm256_loadu_pd((double const *)(a10)); //A10[0][0] A10[1][0] A10[2][0] A10[3][0] ymm9 = _mm256_loadu_pd((double const *)(a10 + cs_a)); //A10[0][1] A10[1][1] A10[2][1] A10[3][1] ymm10 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2)); //A10[0][2] A10[1][2] A10[2][2] A10[3][2] ymm11 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3)); //A10[0][3] A10[1][3] A10[2][3] A10[3][3] ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[0][0] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B01[0][0]*A10[0][0] B01[0][0]*A10[1][0] B01[0][0]*A10[2][0] B01[0][0]*A10[3][0]) ymm12 = 
_mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[1][0] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B01[1][0]*A10[0][1] B01[1][0]*A10[1][1] B01[1][0]*A10[2][1] B01[1][0]*A10[3][1]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[2][0] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B01[2][0]*A10[0][2] B01[2][0]*A10[1][2] B01[2][0]*A10[2][2] B01[2][0]*A10[3][2]) ymm12 = _mm256_broadcast_sd((double const *)(b01 + cs_b * 0)); //B10[3][0] b01 += 1; //move to next row of B ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B01[3][0]*A10[0][3] B01[3][0]*A10[1][3] B01[3][0]*A10[2][3] B01[3][0]*A10[3][3]) a10 += D_MR * cs_a; //pointer math to find next block of A for GEMM b01 = ptr_b01_dup + D_MR; //pointer math to find next block of B for GEMM } ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to hold alpha value ymm8 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[0-3][0] * alpha -= ymm4 ///implement TRSM/// //determine correct values to store if(3 == m_remainder) { ymm0 = _mm256_blend_pd(ymm8, ymm0, 0x08); } else if(2 == m_remainder) { ymm0 = _mm256_permute2f128_pd(ymm8, ymm0, 0x30); } else if(1 == m_remainder) { ymm0 = _mm256_blend_pd(ymm8, ymm0, 0x0E); } _mm256_storeu_pd((double *)(f_temp), ymm0); //store(B11[0-3][0]) } if((j+n_remainder) == n) { for(iter = 0; iter < m_remainder; iter++) (b11 + cs_b * (n_remainder-1))[iter] = f_temp[iter]; } ///scalar code for trsm without alpha/// dtrsm_small_AlXB_unitDiag(a11, b11, m_remainder, n_remainder, cs_a, cs_b); } } return BLIS_SUCCESS; } /*implements TRSM for the case XA = alpha * B *A is upper triangular, non-unit diagonal, no transpose *dimensions: X:mxn A:nxn B: mxn */ /* b11---> a01 ----> ***************** *********** *b01*b11* * * * * * * b11 * * * * * **a01 * * a11 | ***************** ********* | | * * * * * *a11* * | | * * * * * * * * | v ***************** ****** v * * * * * * * * * * 
* * * * ***************** * * * */ static err_t bli_dtrsm_small_XAuB( side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ) { dim_t D_MR = 8; //block dimension along the rows dim_t D_NR = 4; //block dimension along the columns dim_t m = bli_obj_length(b); //number of rows dim_t n = bli_obj_width(b); //number of columns dim_t m_remainder = m & 7; //number of corner rows dim_t n_remainder = n & 3; //number of corner columns dim_t cs_a = bli_obj_col_stride(a); //column stride of matrix A dim_t cs_b = bli_obj_col_stride(b); //column stride of matrix B #ifdef BLIS_ENABLE_SMALL_MATRIX_ROME if((m>D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M && n>D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME) || (m>D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME && n>D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N) ) return BLIS_NOT_YET_IMPLEMENTED; #else if(bli_max(m,n)>D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES && (m/n) < D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO) { return BLIS_NOT_YET_IMPLEMENTED; } #endif dim_t i, j, k; //loop variablse dim_t k_iter; //determines the number of GEMM operations to be done dim_t cs_b_offset[2]; //pre-calculated strides double ones = 1.0; double AlphaVal = *(double *)AlphaObj->buffer; //value of Alpha double *L = a->buffer; //pointer to matrix A double *B = b->buffer; //pointer to matrix B double *a01, *a11, *b10, *b11; //pointers for GEMM and TRSM blocks double *ptr_a01_dup; double f_t[4] __attribute__((aligned(64)));//buffer to store corner column when m_remainder !=0 double* f_temp; cs_b_offset[0] = cs_b << 1; //cs_b_offset[0] = cs_b * 2; cs_b_offset[1] = cs_b_offset[0] + cs_b;//cs_b_offset[1] = cs_b * 3; //ymm scratch reginsters __m256d ymm0, ymm1, ymm2, ymm3; __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m256d ymm12, ymm13, ymm14, ymm15; __m256d ymm16; for(i = 0; (i+D_MR-1) < m; i += D_MR) //loop along 'M' direction { for(j = 0; (j+D_NR-1) < n; j += D_NR) //loop along 'N' direction { a01 = L + j*cs_a; 
//pointer to block of A to be used in GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i; //pointer to block of B to be used in GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = j / D_NR; //number of GEMM operations to be done(in blocks of 4x4) ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = _mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[0][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR)); //B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); 
//ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[1][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm8 = 
_mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A01 //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); 
//A01[3][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A01 ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm16 = _mm256_broadcast_sd((double const *)&AlphaVal); //load 8x4 block of B11 ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4][0] B11[5][0] B11[6][0] B11[7][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4][1] B11[5][1] B11[6][1] B11[7][1] ymm10 = 
_mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4][2] B11[5][2] B11[6][2] B11[7][2] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4][3] B11[5][3] B11[6][3] B11[7][3] ymm8 = _mm256_fmsub_pd(ymm8, ymm16, ymm0); //B11[0-3][0] * alpha -= ymm0 ymm9 = _mm256_fmsub_pd(ymm9, ymm16, ymm1); //B11[4-7][0] * alpha-= ymm1 ymm10 = _mm256_fmsub_pd(ymm10, ymm16, ymm2); //B11[0-3][1] * alpha-= ymm2 ymm11 = _mm256_fmsub_pd(ymm11, ymm16, ymm3); //B11[4-7][1] * alpha -= ymm3 ymm12 = _mm256_fmsub_pd(ymm12, ymm16, ymm4); //B11[0-3][2] * alpha -= ymm4 ymm13 = _mm256_fmsub_pd(ymm13, ymm16, ymm5); //B11[4-7][2] * alpha -= ymm5 ymm14 = _mm256_fmsub_pd(ymm14, ymm16, ymm6); //B11[0-3][3] * alpha -= ymm6 ymm15 = _mm256_fmsub_pd(ymm15, ymm16, ymm7); //B11[4-7][3] * alpha -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// ymm7 = _mm256_broadcast_sd((double const *)(&ones)); //1st col ymm0 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] //2nd col a11 += cs_a; ymm1 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] ymm2 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][1] //3rd col a11 += cs_a; ymm3 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][2] ymm4 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][2] ymm5 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[2][2] //4th col a11 += cs_a; ymm6 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[3][3] //compute reciprocals of L(i,i) and broadcast in registers ymm0 = _mm256_unpacklo_pd(ymm0, ymm2); //A11[0][0] A11[1][1] A11[0][0] A11[1][1] ymm2 = _mm256_unpacklo_pd(ymm5, ymm6); //A11[2][2] A11[3][3] A11[1][1] A11[3][3] ymm0 = _mm256_blend_pd(ymm0, ymm2, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm7 = _mm256_div_pd(ymm7, 
ymm0); //(1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3]) ymm2 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][3] ymm5 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][3] ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[2][3] //extract a00 ymm0 = _mm256_permute_pd(ymm7, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0]) ymm8 = _mm256_mul_pd(ymm8, ymm0); //B11[0-3][0] /= A11[0][0] ymm12 = _mm256_mul_pd(ymm12, ymm0); //B11[4-7][0] /= A11[0][0] //extract a11 ymm0 = _mm256_permute_pd(ymm7, 0x03); //(1/A11[1][1] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00);//(1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1]) //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm1, ymm8, ymm9); //B11[0-3][1] -= B11[0-3][0] * A11[0][1] ymm10 = _mm256_fnmadd_pd(ymm3, ymm8, ymm10); //B11[0-3][2] -= B11[0-3][0] * A11[0][2] ymm11 = _mm256_fnmadd_pd(ymm2, ymm8, ymm11); //B11[0-3][3] -= B11[0-3][0] * A11[0][3] ymm13 = _mm256_fnmadd_pd(ymm1, ymm12, ymm13); //B11[4-7][1] -= B11[4-7][0] * A11[0][1] ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14); //B11[4-7][2] -= B11[4-7][0] * A11[0][2] ymm15 = _mm256_fnmadd_pd(ymm2, ymm12, ymm15); //B11[4-7][3] -= B11[4-7][0] * A11[0][3] ymm9 = _mm256_mul_pd(ymm9, ymm0); //B11[0-3][1] /= A11[1][1] ymm13 = _mm256_mul_pd(ymm13, ymm0); //B11[4-7][1] /= A11[1][1] //extract a22 ymm0 = _mm256_permute_pd(ymm7, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x11);//(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2]) //(Row2)FMA operations ymm10 = _mm256_fnmadd_pd(ymm4, ymm9, ymm10); //B11[0-3][2] -= B11[0-3][1] * A11[1][2] ymm11 = _mm256_fnmadd_pd(ymm5, ymm9, ymm11); //B11[0-3][3] -= B11[0-3][1] * A11[1][3] ymm14 = _mm256_fnmadd_pd(ymm4, ymm13, ymm14); //B11[4-7][2] -= B11[4-7][1] * A11[1][2] ymm15 = _mm256_fnmadd_pd(ymm5, ymm13, ymm15); //B11[4-7][3] 
-= B11[4-7][1] * A11[1][3] ymm10 = _mm256_mul_pd(ymm10, ymm0); //B11[0-3][2] /= A11[2][2] ymm14 = _mm256_mul_pd(ymm14, ymm0); //B11[4-7][2] /= A11[2][2] //extract a33 ymm0 = _mm256_permute_pd(ymm7, 0x0C); //(1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x11);//(1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) //(Row3)FMA operations ymm11 = _mm256_fnmadd_pd(ymm6, ymm10, ymm11); //B11[0-3][3] -= B11[0-3][2] * A11[2][3] ymm15 = _mm256_fnmadd_pd(ymm6, ymm14, ymm15); //B11[4-7][3] -= B11[4-7][2] * A11[2][3] ymm11 = _mm256_mul_pd(ymm11, ymm0); //B11[0-3][3] /= A11[3][3] ymm15 = _mm256_mul_pd(ymm15, ymm0); //B11[4-7][3] /= A11[3][3] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14); //store(B11[4-7][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + cs_b), ymm11); //store(B11[0-3][3]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + cs_b + D_NR), ymm15);//store(B11[4-7][3]) } if(n_remainder) //implementation for remainder columns(when n is not multiple of D_NR) { a01 = L + j*cs_a; //pointer to block of A to be used for GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i; //pointer to block of B to be used for GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = j / D_NR; //number of GEMM operations to be performed(in blocks of 4x4) ///load 4x4 block of b11 ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = _mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); //subtract 
the calculated GEMM block from current TRSM block //load 8x4 block of B11 if(3 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[0][2] a01 += 1; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[1][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] a01 += 1; //move to next 
row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] a01 += 1; //move to next row of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] 
B10[3][2]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[3][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[3][2] a01 += 1; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); ///GEMM code ends/// ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0-3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4-7][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0-3][1] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + 
D_NR)); //B11[4-7][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0-3][2] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4-7][2] ymm8 = _mm256_fmsub_pd(ymm8, ymm15, ymm0); //B11[0-3][0] * alpha -= B10[0-3][0] ymm9 = _mm256_fmsub_pd(ymm9, ymm15, ymm1); //B11[4-7][0] * alpha -= B10[4-7][0] ymm10 = _mm256_fmsub_pd(ymm10, ymm15, ymm2); //B11[0-3][1] * alpha -= B10[0-3][1] ymm12 = _mm256_fmsub_pd(ymm12, ymm15, ymm4); //B11[0-3][2] * alpha -= B10[0-3][2] ymm13 = _mm256_fmsub_pd(ymm13, ymm15, ymm5); //B11[4-7][2] * alpha -= B10[4-7][2] ymm14 = _mm256_fmsub_pd(ymm14, ymm15, ymm6); //B11[0-3][3] * alpha -= B10[0-3][3] ///implement TRSM/// ///read 4x4 block of A11/// ymm7 = _mm256_broadcast_sd((double const *)(&ones)); //1st col ymm0 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] //2nd col a11 += cs_a; ymm1 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] ymm2 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][1] //3rd col a11 += cs_a; ymm3 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][2] ymm4 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][2] ymm5 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[2][2] //4th col a11 += cs_a; ymm6 = _mm256_broadcast_sd((double const *)(&ones)); //A11[3][3] //compute reciprocals of L(i,i) and broadcast in registers ymm0 = _mm256_unpacklo_pd(ymm0, ymm2); //A11[0][0] A11[1][1] A11[0][0] A11[1][1] ymm2 = _mm256_unpacklo_pd(ymm5, ymm6); //A11[2][2] A11[3][3] A11[1][1] A11[3][3] ymm0 = _mm256_blend_pd(ymm0, ymm2, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm7 = _mm256_div_pd(ymm7, ymm0); //(1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3]) //extract a00 ymm0 = _mm256_permute_pd(ymm7, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00);//(1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0]) ymm8 = _mm256_mul_pd(ymm8, ymm0); //B11[0-3][0] /= A11[0][0] ymm12 = _mm256_mul_pd(ymm12, 
ymm0); //B11[4-7][0] /= A11[0][0] //extract a11 ymm0 = _mm256_permute_pd(ymm7, 0x03); //(1/A11[1][1] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00);//(1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1]) //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm1, ymm8, ymm9); //B11[0-3][1] -= B11[0-3][0] * A11[0][1] ymm10 = _mm256_fnmadd_pd(ymm3, ymm8, ymm10); //B11[0-3][2] -= B11[0-3][0] * A11[0][2] ymm13 = _mm256_fnmadd_pd(ymm1, ymm12, ymm13); //B11[4-7][1] -= B11[4-7][0] * A11[0][1] ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14); //B11[4-7][2] -= B11[4-7][0] * A11[0][2] ymm9 = _mm256_mul_pd(ymm9, ymm0); //B11[0-3][1] /= A11[1][1] ymm13 = _mm256_mul_pd(ymm13, ymm0); //B11[4-7][1] /= A11[1][1] //extract a22 ymm0 = _mm256_permute_pd(ymm7, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x11);//(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2]) //(Row2)FMA operations ymm10 = _mm256_fnmadd_pd(ymm4, ymm9, ymm10); //B11[0-3][2] -= B11[0-3][1] * A11[1][2] ymm14 = _mm256_fnmadd_pd(ymm4, ymm13, ymm14); //B11[4-7][2] -= B11[4-7][1] * A11[1][2] ymm10 = _mm256_mul_pd(ymm10, ymm0); //B11[0-3][2] /= A11[2][2] ymm14 = _mm256_mul_pd(ymm14, ymm0); //B11[4-7][2] /= A11[2][2] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14);//store(B11[4-7][2]) } else if(2 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 
cs_a * 1)); //A01[0][1] a01 += 1; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[1][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[1][1] a01 += 1; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[2][1] a01 += 1; //move to next row of A //load next 8x2 block of B10 ymm12 
= _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[3][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[3][1] a01 += 1; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); ///GEMM code ends/// ymm8 = _mm256_loadu_pd((double 
const *)b11); //B11[0-3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4-7][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0-3][1] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4-7][1] ymm8 = _mm256_fmsub_pd(ymm8, ymm15, ymm0); //B11[0-3][0] * alpha -= B10[0-3][0] ymm9 = _mm256_fmsub_pd(ymm9, ymm15, ymm1); //B11[4-7][0] * alpha -= B10[4-7][0] ymm12 = _mm256_fmsub_pd(ymm12, ymm15, ymm4); //B11[0-3][2] * alpha -= B10[0-3][2] ymm13 = _mm256_fmsub_pd(ymm13, ymm15, ymm5); //B11[4-7][2] * alpha -= B10[4-7][2] ///implement TRSM/// ///read 4x4 block of A11/// ymm7 = _mm256_broadcast_sd((double const *)(&ones)); //1st col ymm0 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] //2nd col a11 += cs_a; ymm1 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] ymm2 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][1] //compute reciprocals of L(i,i) and broadcast in registers ymm0 = _mm256_unpacklo_pd(ymm0, ymm2); //A11[0][0] A11[1][1] A11[0][0] A11[1][1] ymm0 = _mm256_blend_pd(ymm0, ymm7, 0x0C); //A11[0][0] A11[1][1] 1 1 ymm7 = _mm256_div_pd(ymm7, ymm0); //(1/A11[0][0] 1/A11[1][1] 1/1 1/1) //extract a00 ymm0 = _mm256_permute_pd(ymm7, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00);//(1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0]) ymm8 = _mm256_mul_pd(ymm8, ymm0); //B11[0-3][0] /= A11[0][0] ymm12 = _mm256_mul_pd(ymm12, ymm0); //B11[4-7][0] /= A11[0][0] //extract a11 ymm0 = _mm256_permute_pd(ymm7, 0x03); //(1/A11[1][1] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00);//(1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1]) //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm1, ymm8, ymm9); //B11[0-3][1] -= B11[0-3][0] * A11[0][1] ymm13 = _mm256_fnmadd_pd(ymm1, ymm12, ymm13); //B11[4-7][1] -= B11[4-7][0] * A11[0][1] ymm9 = _mm256_mul_pd(ymm9, ymm0); //B11[0-3][1] /= A11[1][1] ymm13 = _mm256_mul_pd(ymm13, ymm0); 
//B11[4-7][1] /= A11[1][1] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) } else if(1 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[0][0] a01 += 1; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[1][0] a01 += 1; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[2][0] a01 += 1; //move to next row of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + 
cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[3][0] a01 += 1; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); ///GEMM code ends/// ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0-3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4-7][0] ymm8 = _mm256_fmsub_pd(ymm8, ymm15, ymm0); //B11[0-3][0] * alpha -= B10[0-3][0] ymm12 = _mm256_fmsub_pd(ymm12, ymm15, ymm4); //B11[0-3][2] * alpha -= B10[0-3][2] ///implement TRSM/// ///read 4x4 block of A11/// ymm7 = _mm256_broadcast_sd((double const *)(&ones)); //1st col ymm0 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] ymm7 = _mm256_div_pd(ymm7, ymm0); //(1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3]) //extract a00 ymm0 = _mm256_permute_pd(ymm7, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00);//(1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0]) ymm8 = _mm256_mul_pd(ymm8, ymm0); 
//B11[0-3][0] /= A11[0][0] ymm12 = _mm256_mul_pd(ymm12, ymm0); //B11[4-7][0] /= A11[0][0] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) } } } if((m & 4)) ///implementation for remainder rows(when m_remainder is greater than 4) { for(j = 0; (j+D_NR-1) a01 ----> ***************** *********** *b01*b11* * * * * * * b11 * * * * * **a01 * * a11 | ***************** ********* | | * * * * * *a11* * | | * * * * * * * * | v ***************** ****** v * * * * * * * * * * * * * * ***************** * * * */ static err_t bli_dtrsm_small_XAuB_unitDiag( side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ) { dim_t D_MR = 8; //block dimension along the rows dim_t D_NR = 4; //block dimension along the columns dim_t m = bli_obj_length(b); //number of rows dim_t n = bli_obj_width(b); //number of columns dim_t m_remainder = m & 7; //number of corner rows dim_t n_remainder = n & 3; //number of corner columns dim_t cs_a = bli_obj_col_stride(a); //column stride of matrix A dim_t cs_b = bli_obj_col_stride(b); //column stride of matrix B #ifdef BLIS_ENABLE_SMALL_MATRIX_ROME if((m>D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_ROW_PANEL_M && n>D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME) || (m>D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME && n>D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUB_ROME_COLUMN_PANEL_N) ) return BLIS_NOT_YET_IMPLEMENTED; #else if(bli_max(m,n)>D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES && (m/n) < D_BLIS_SMALL_MATRIX_THRES_TRSM_DIM_RATIO) { return BLIS_NOT_YET_IMPLEMENTED; } #endif dim_t i, j, k; //loop variablse dim_t k_iter; //determines the number of GEMM operations to be done dim_t cs_b_offset[2]; //pre-calculated strides double ones = 1.0; double AlphaVal = *(double *)AlphaObj->buffer; //value of Alpha double *L = a->buffer; //pointer to matrix A double *B = b->buffer; //pointer to matrix B double *a01, *a11, *b10, *b11; //pointers for GEMM and TRSM blocks double *ptr_a01_dup; double 
f_t[4] __attribute__((aligned(64)));//buffer to store corner column when m_remainder !=0 double* f_temp; cs_b_offset[0] = cs_b << 1; //cs_b_offset[0] = cs_b * 2; cs_b_offset[1] = cs_b_offset[0] + cs_b;//cs_b_offset[1] = cs_b * 3; //ymm scratch reginsters __m256d ymm0, ymm1, ymm2, ymm3; __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m256d ymm12, ymm13, ymm14, ymm15; __m256d ymm16; for(i = 0; (i+D_MR-1) < m; i += D_MR) //loop along 'M' direction { for(j = 0; (j+D_NR-1) < n; j += D_NR) //loop along 'N' direction { a01 = L + j*cs_a; //pointer to block of A to be used in GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i; //pointer to block of B to be used in GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = j / D_NR; //number of GEMM operations to be done(in blocks of 4x4) ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = _mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[0][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR)); //B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, 
ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[1][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); 
//ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A01 //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] 
B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[3][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A01 ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer 
math to find next block of A for GEMM } ///GEMM code ends/// ymm16 = _mm256_broadcast_sd((double const *)&AlphaVal); //load 8x4 block of B11 ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4][0] B11[5][0] B11[6][0] B11[7][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4][1] B11[5][1] B11[6][1] B11[7][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4][2] B11[5][2] B11[6][2] B11[7][2] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4][3] B11[5][3] B11[6][3] B11[7][3] ymm8 = _mm256_fmsub_pd(ymm8, ymm16, ymm0); //B11[0-3][0] * alpha -= ymm0 ymm9 = _mm256_fmsub_pd(ymm9, ymm16, ymm1); //B11[4-7][0] * alpha-= ymm1 ymm10 = _mm256_fmsub_pd(ymm10, ymm16, ymm2); //B11[0-3][1] * alpha-= ymm2 ymm11 = _mm256_fmsub_pd(ymm11, ymm16, ymm3); //B11[4-7][1] * alpha -= ymm3 ymm12 = _mm256_fmsub_pd(ymm12, ymm16, ymm4); //B11[0-3][2] * alpha -= ymm4 ymm13 = _mm256_fmsub_pd(ymm13, ymm16, ymm5); //B11[4-7][2] * alpha -= ymm5 ymm14 = _mm256_fmsub_pd(ymm14, ymm16, ymm6); //B11[0-3][3] * alpha -= ymm6 ymm15 = _mm256_fmsub_pd(ymm15, ymm16, ymm7); //B11[4-7][3] * alpha -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// ymm7 = _mm256_broadcast_sd((double const *)(&ones)); //2nd col a11 += cs_a; ymm1 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] //3rd col a11 += cs_a; ymm3 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][2] ymm4 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][2] //4th col a11 += cs_a; ymm2 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][3] ymm5 = 
_mm256_broadcast_sd((double const *)(a11+1)); //A11[1][3] ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[2][3] //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm1, ymm8, ymm9); //B11[0-3][1] -= B11[0-3][0] * A11[0][1] ymm10 = _mm256_fnmadd_pd(ymm3, ymm8, ymm10); //B11[0-3][2] -= B11[0-3][0] * A11[0][2] ymm11 = _mm256_fnmadd_pd(ymm2, ymm8, ymm11); //B11[0-3][3] -= B11[0-3][0] * A11[0][3] ymm13 = _mm256_fnmadd_pd(ymm1, ymm12, ymm13); //B11[4-7][1] -= B11[4-7][0] * A11[0][1] ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14); //B11[4-7][2] -= B11[4-7][0] * A11[0][2] ymm15 = _mm256_fnmadd_pd(ymm2, ymm12, ymm15); //B11[4-7][3] -= B11[4-7][0] * A11[0][3] //(Row2)FMA operations ymm10 = _mm256_fnmadd_pd(ymm4, ymm9, ymm10); //B11[0-3][2] -= B11[0-3][1] * A11[1][2] ymm11 = _mm256_fnmadd_pd(ymm5, ymm9, ymm11); //B11[0-3][3] -= B11[0-3][1] * A11[1][3] ymm14 = _mm256_fnmadd_pd(ymm4, ymm13, ymm14); //B11[4-7][2] -= B11[4-7][1] * A11[1][2] ymm15 = _mm256_fnmadd_pd(ymm5, ymm13, ymm15); //B11[4-7][3] -= B11[4-7][1] * A11[1][3] //(Row3)FMA operations ymm11 = _mm256_fnmadd_pd(ymm6, ymm10, ymm11); //B11[0-3][3] -= B11[0-3][2] * A11[2][3] ymm15 = _mm256_fnmadd_pd(ymm6, ymm14, ymm15); //B11[4-7][3] -= B11[4-7][2] * A11[2][3] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14); //store(B11[4-7][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + cs_b), ymm11); //store(B11[0-3][3]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + cs_b + D_NR), ymm15);//store(B11[4-7][3]) } if(n_remainder) //implementation for remainder columns(when n is not multiple of D_NR) { a01 = L + j*cs_a; //pointer to block of A to be used 
for GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i; //pointer to block of B to be used for GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = j / D_NR; //number of GEMM operations to be performed(in blocks of 4x4) ///load 4x4 block of b11 ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = _mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); //subtract the calculated GEMM block from current TRSM block //load 8x4 block of B11 if(3 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[0][2] a01 += 1; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = 
_mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[1][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] a01 += 1; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] a01 += 1; //move to next row of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] 
B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[3][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[3][2] a01 += 1; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] 
B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); ///GEMM code ends/// ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0-3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4-7][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0-3][1] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4-7][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0-3][2] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4-7][2] ymm8 = _mm256_fmsub_pd(ymm8, ymm15, ymm0); //B11[0-3][0] * alpha -= B10[0-3][0] ymm9 = _mm256_fmsub_pd(ymm9, ymm15, ymm1); //B11[4-7][0] * alpha -= B10[4-7][0] ymm10 = _mm256_fmsub_pd(ymm10, ymm15, ymm2); //B11[0-3][1] * alpha -= B10[0-3][1] ymm12 = _mm256_fmsub_pd(ymm12, ymm15, ymm4); //B11[0-3][2] * alpha -= B10[0-3][2] ymm13 = _mm256_fmsub_pd(ymm13, ymm15, ymm5); //B11[4-7][2] * alpha -= B10[4-7][2] ymm14 = _mm256_fmsub_pd(ymm14, ymm15, ymm6); //B11[0-3][3] * alpha -= B10[0-3][3] ///implement TRSM/// ///read 4x4 block of A11/// ymm7 = _mm256_broadcast_sd((double const *)(&ones)); //2nd col a11 += cs_a; ymm1 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] //3rd col a11 += cs_a; ymm3 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][2] ymm4 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[1][2] //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm1, ymm8, ymm9); //B11[0-3][1] -= B11[0-3][0] * A11[0][1] ymm10 = _mm256_fnmadd_pd(ymm3, ymm8, ymm10); //B11[0-3][2] -= B11[0-3][0] * A11[0][2] ymm13 = _mm256_fnmadd_pd(ymm1, ymm12, ymm13); //B11[4-7][1] -= B11[4-7][0] * A11[0][1] ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14); //B11[4-7][2] -= B11[4-7][0] * A11[0][2] //(Row2)FMA operations ymm10 = _mm256_fnmadd_pd(ymm4, ymm9, ymm10); 
//B11[0-3][2] -= B11[0-3][1] * A11[1][2] ymm14 = _mm256_fnmadd_pd(ymm4, ymm13, ymm14); //B11[4-7][2] -= B11[4-7][1] * A11[1][2] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14);//store(B11[4-7][2]) } else if(2 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[0][1] a01 += 1; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[1][0] ymm9 = 
_mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[1][1] a01 += 1; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[2][1] a01 += 1; //move to next row of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[3][0] ymm9 = 
_mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[3][1] a01 += 1; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0-3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4-7][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0-3][1] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4-7][1] ymm8 = _mm256_fmsub_pd(ymm8, ymm15, ymm0); //B11[0-3][0] * alpha -= B10[0-3][0] ymm9 = _mm256_fmsub_pd(ymm9, ymm15, ymm1); //B11[4-7][0] * alpha -= B10[4-7][0] ymm12 = _mm256_fmsub_pd(ymm12, ymm15, ymm4); //B11[0-3][2] * alpha -= B10[0-3][2] ymm13 = _mm256_fmsub_pd(ymm13, ymm15, ymm5); //B11[4-7][2] * alpha -= B10[4-7][2] ///implement TRSM/// ///read 4x4 block of A11/// ymm7 = _mm256_broadcast_sd((double const *)(&ones)); //2nd col a11 += cs_a; ymm1 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm1, ymm8, ymm9); //B11[0-3][1] -= B11[0-3][0] * A11[0][1] ymm13 = _mm256_fnmadd_pd(ymm1, ymm12, ymm13); //B11[4-7][1] -= B11[4-7][0] * A11[0][1] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) _mm256_storeu_pd((double 
*)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) } else if(1 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[0][0] a01 += 1; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[1][0] a01 += 1; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[2][0] a01 += 1; //move to next row of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) 
ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[3][0] a01 += 1; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0-3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4-7][0] ymm8 = _mm256_fmsub_pd(ymm8, ymm15, ymm0); //B11[0-3][0] * alpha -= B10[0-3][0] ymm12 = _mm256_fmsub_pd(ymm12, ymm15, ymm4); //B11[0-3][2] * alpha -= B10[0-3][2] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) } } } if((m & 4)) ///implementation for remainder rows(when m_remainder is greater than 4) { for(j = 0; (j+D_NR-1) a01 ----> ***************** *********** *b01*b11* * * * * * * b11 * * * * * **a01 * * a11 | ***************** ********* | | * * * * * *a11* * | | * * * * * * * * | v ***************** ****** v * * * * * * * * * * * * * * ***************** * * * */ static err_t bli_dtrsm_small_XAltB( side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ) { dim_t D_MR = 8; //block dimension along the rows dim_t D_NR = 4; //block 
dimension along the columns dim_t m = bli_obj_length(b); //number of rows dim_t n = bli_obj_width(b); //number of columns dim_t m_remainder = m & 7; //number of corner rows dim_t n_remainder = n & 3; //number of corner columns dim_t cs_a = bli_obj_col_stride(a); //column stride of matrix A dim_t cs_b = bli_obj_col_stride(b); //column stride of matrix B #ifdef BLIS_ENABLE_SMALL_MATRIX_ROME if((m < D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M && n>D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N) || (m < D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M && n > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N) || (m < D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N && n > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME) || (m < D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME && n > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME) || (m > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME && n > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N) ) return BLIS_NOT_YET_IMPLEMENTED; #else if(bli_max(m,n) > D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES) { return BLIS_NOT_YET_IMPLEMENTED; } #endif dim_t i, j, k; //loop variablse dim_t k_iter; //determines the number of GEMM operations to be done dim_t cs_b_offset[2]; //pre-calculated strides double ones = 1.0; double AlphaVal = *(double *)AlphaObj->buffer; //value of Alpha double *L = a->buffer; //pointer to matrix A double *B = b->buffer; //pointer to matrix B double *a01, *a11, *b10, *b11; //pointers for GEMM and TRSM blocks double *ptr_a01_dup; double f_t[4] __attribute__((aligned(64)));//buffer to store corner column when m_remainder !=0 double* f_temp; cs_b_offset[0] = cs_b << 1; //cs_b_offset[0] = cs_b * 2; cs_b_offset[1] = cs_b_offset[0] + cs_b;//cs_b_offset[1] = cs_b * 3; //ymm scratch reginsters __m256d ymm0, ymm1, ymm2, ymm3; __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m256d ymm12, ymm13, ymm14, ymm15; __m256d ymm16; for(i = 0; (i+D_MR-1) < m; i += D_MR) //loop along 'M' direction { for(j = 
0; (j+D_NR-1) < n; j += D_NR) //loop along 'N' direction { a01 = L + j; //pointer to block of A to be used in GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i; //pointer to block of B to be used in GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = j / D_NR; //number of GEMM operations to be done(in blocks of 4x4) ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = _mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR)); //B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] 
B10[3][0]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[1][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd 
row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A01 //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[3][0] ymm9 = 
_mm256_broadcast_sd((double const *)(a01 + 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A01 ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm16 = _mm256_broadcast_sd((double const *)&AlphaVal); //load 8x4 block of B11 ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4][0] B11[5][0] B11[6][0] B11[7][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4][1] B11[5][1] B11[6][1] B11[7][1] ymm10 = _mm256_loadu_pd((double const 
*)(b11 + cs_b_offset[0])); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4][2] B11[5][2] B11[6][2] B11[7][2] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4][3] B11[5][3] B11[6][3] B11[7][3] ymm8 = _mm256_fmsub_pd(ymm8, ymm16, ymm0); //B11[0-3][0] * alpha -= ymm0 ymm9 = _mm256_fmsub_pd(ymm9, ymm16, ymm1); //B11[4-7][0] * alpha-= ymm1 ymm10 = _mm256_fmsub_pd(ymm10, ymm16, ymm2); //B11[0-3][1] * alpha-= ymm2 ymm11 = _mm256_fmsub_pd(ymm11, ymm16, ymm3); //B11[4-7][1] * alpha -= ymm3 ymm12 = _mm256_fmsub_pd(ymm12, ymm16, ymm4); //B11[0-3][2] * alpha -= ymm4 ymm13 = _mm256_fmsub_pd(ymm13, ymm16, ymm5); //B11[4-7][2] * alpha -= ymm5 ymm14 = _mm256_fmsub_pd(ymm14, ymm16, ymm6); //B11[0-3][3] * alpha -= ymm6 ymm15 = _mm256_fmsub_pd(ymm15, ymm16, ymm7); //B11[4-7][3] * alpha -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// ymm7 = _mm256_broadcast_sd((double const *)(&ones)); //1st col ymm0 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] //2nd col a11 += 1; ymm1 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][1] ymm2 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][1] //3rd col a11 += 1; ymm3 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][2] ymm4 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][2] ymm5 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 2)); //A11[2][2] //4th col a11 += 1; ymm6 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 3)); //A11[3][3] //compute reciprocals of L(i,i) and broadcast in registers ymm0 = _mm256_unpacklo_pd(ymm0, ymm2); //A11[0][0] A11[1][1] A11[0][0] A11[1][1] ymm2 = _mm256_unpacklo_pd(ymm5, ymm6); //A11[2][2] A11[3][3] A11[1][1] A11[3][3] ymm0 = _mm256_blend_pd(ymm0, ymm2, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm7 = 
_mm256_div_pd(ymm7, ymm0); //(1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3]) ymm2 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][3] ymm5 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][3] ymm6 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 2)); //A11[2][3] //extract a00 ymm0 = _mm256_permute_pd(ymm7, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0]) ymm8 = _mm256_mul_pd(ymm8, ymm0); //B11[0-3][0] /= A11[0][0] ymm12 = _mm256_mul_pd(ymm12, ymm0); //B11[4-7][0] /= A11[0][0] //extract a11 ymm0 = _mm256_permute_pd(ymm7, 0x03); //(1/A11[1][1] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00);//(1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1]) //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm1, ymm8, ymm9); //B11[0-3][1] -= B11[0-3][0] * A11[0][1] ymm10 = _mm256_fnmadd_pd(ymm3, ymm8, ymm10); //B11[0-3][2] -= B11[0-3][0] * A11[0][2] ymm11 = _mm256_fnmadd_pd(ymm2, ymm8, ymm11); //B11[0-3][3] -= B11[0-3][0] * A11[0][3] ymm13 = _mm256_fnmadd_pd(ymm1, ymm12, ymm13); //B11[4-7][1] -= B11[4-7][0] * A11[0][1] ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14); //B11[4-7][2] -= B11[4-7][0] * A11[0][2] ymm15 = _mm256_fnmadd_pd(ymm2, ymm12, ymm15); //B11[4-7][3] -= B11[4-7][0] * A11[0][3] ymm9 = _mm256_mul_pd(ymm9, ymm0); //B11[0-3][1] /= A11[1][1] ymm13 = _mm256_mul_pd(ymm13, ymm0); //B11[4-7][1] /= A11[1][1] //extract a22 ymm0 = _mm256_permute_pd(ymm7, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x11);//(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2]) //(Row2)FMA operations ymm10 = _mm256_fnmadd_pd(ymm4, ymm9, ymm10); //B11[0-3][2] -= B11[0-3][1] * A11[1][2] ymm11 = _mm256_fnmadd_pd(ymm5, ymm9, ymm11); //B11[0-3][3] -= B11[0-3][1] * A11[1][3] ymm14 = _mm256_fnmadd_pd(ymm4, ymm13, ymm14); //B11[4-7][2] -= B11[4-7][1] * A11[1][2] ymm15 = 
_mm256_fnmadd_pd(ymm5, ymm13, ymm15); //B11[4-7][3] -= B11[4-7][1] * A11[1][3] ymm10 = _mm256_mul_pd(ymm10, ymm0); //B11[0-3][2] /= A11[2][2] ymm14 = _mm256_mul_pd(ymm14, ymm0); //B11[4-7][2] /= A11[2][2] //extract a33 ymm0 = _mm256_permute_pd(ymm7, 0x0C); //(1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x11);//(1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) //(Row3)FMA operations ymm11 = _mm256_fnmadd_pd(ymm6, ymm10, ymm11); //B11[0-3][3] -= B11[0-3][2] * A11[2][3] ymm15 = _mm256_fnmadd_pd(ymm6, ymm14, ymm15); //B11[4-7][3] -= B11[4-7][2] * A11[2][3] ymm11 = _mm256_mul_pd(ymm11, ymm0); //B11[0-3][3] /= A11[3][3] ymm15 = _mm256_mul_pd(ymm15, ymm0); //B11[4-7][3] /= A11[3][3] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14); //store(B11[4-7][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + cs_b), ymm11); //store(B11[0-3][3]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + cs_b + D_NR), ymm15);//store(B11[4-7][3]) } if(n_remainder) //implementation for remainder columns(when n is not multiple of D_NR) { a01 = L + j; //pointer to block of A to be used for GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i; //pointer to block of B to be used for GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = j / D_NR; //number of GEMM operations to be performed(in blocks of 4x4) ///load 4x4 block of b11 ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = _mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = 
_mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); //subtract the calculated GEMM block from current TRSM block //load 8x4 block of B11 if(3 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] a01 += cs_a; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[1][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] a01 += 
cs_a; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[2][2] a01 += cs_a; //move to next row of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] 
B10[3][2]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[3][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] a01 += cs_a; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0-3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4-7][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0-3][1] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); 
//B11[4-7][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0-3][2] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4-7][2] ymm8 = _mm256_fmsub_pd(ymm8, ymm15, ymm0); //B11[0-3][0] * alpha -= B10[0-3][0] ymm9 = _mm256_fmsub_pd(ymm9, ymm15, ymm1); //B11[4-7][0] * alpha -= B10[4-7][0] ymm10 = _mm256_fmsub_pd(ymm10, ymm15, ymm2); //B11[0-3][1] * alpha -= B10[0-3][1] ymm12 = _mm256_fmsub_pd(ymm12, ymm15, ymm4); //B11[0-3][2] * alpha -= B10[0-3][2] ymm13 = _mm256_fmsub_pd(ymm13, ymm15, ymm5); //B11[4-7][2] * alpha -= B10[4-7][2] ymm14 = _mm256_fmsub_pd(ymm14, ymm15, ymm6); //B11[0-3][3] * alpha -= B10[0-3][3] ///implement TRSM/// ///read 4x4 block of A11/// ymm7 = _mm256_broadcast_sd((double const *)(&ones)); //1st col ymm0 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] //2nd col a11 += 1; ymm1 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][1] ymm2 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][1] //3rd col a11 += 1; ymm3 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][2] ymm4 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][2] ymm5 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 2)); //A11[2][2] //4th col a11 += 1; ymm6 = _mm256_broadcast_sd((double const *)(&ones)); //A11[3][3] //compute reciprocals of L(i,i) and broadcast in registers ymm0 = _mm256_unpacklo_pd(ymm0, ymm2); //A11[0][0] A11[1][1] A11[0][0] A11[1][1] ymm2 = _mm256_unpacklo_pd(ymm5, ymm6); //A11[2][2] A11[3][3] A11[1][1] A11[3][3] ymm0 = _mm256_blend_pd(ymm0, ymm2, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm7 = _mm256_div_pd(ymm7, ymm0); //(1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3]) ymm2 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][3] ymm5 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][3] ymm6 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 2)); //A11[2][3] //extract a00 ymm0 = _mm256_permute_pd(ymm7, 0x00); 
//(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00);//(1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0]) ymm8 = _mm256_mul_pd(ymm8, ymm0); //B11[0-3][0] /= A11[0][0] ymm12 = _mm256_mul_pd(ymm12, ymm0); //B11[4-7][0] /= A11[0][0] //extract a11 ymm0 = _mm256_permute_pd(ymm7, 0x03); //(1/A11[1][1] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00);//(1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1]) //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm1, ymm8, ymm9); //B11[0-3][1] -= B11[0-3][0] * A11[0][1] ymm10 = _mm256_fnmadd_pd(ymm3, ymm8, ymm10); //B11[0-3][2] -= B11[0-3][0] * A11[0][2] ymm13 = _mm256_fnmadd_pd(ymm1, ymm12, ymm13); //B11[4-7][1] -= B11[4-7][0] * A11[0][1] ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14); //B11[4-7][2] -= B11[4-7][0] * A11[0][2] ymm9 = _mm256_mul_pd(ymm9, ymm0); //B11[0-3][1] /= A11[1][1] ymm13 = _mm256_mul_pd(ymm13, ymm0); //B11[4-7][1] /= A11[1][1] //extract a22 ymm0 = _mm256_permute_pd(ymm7, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x11);//(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2]) //(Row2)FMA operations ymm10 = _mm256_fnmadd_pd(ymm4, ymm9, ymm10); //B11[0-3][2] -= B11[0-3][1] * A11[1][2] ymm14 = _mm256_fnmadd_pd(ymm4, ymm13, ymm14); //B11[4-7][2] -= B11[4-7][1] * A11[1][2] ymm10 = _mm256_mul_pd(ymm10, ymm0); //B11[0-3][2] /= A11[2][2] ymm14 = _mm256_mul_pd(ymm14, ymm0); //B11[4-7][2] /= A11[2][2] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14);//store(B11[4-7][2]) } else if(2 == n_remainder) { ///GEMM implementation 
begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[0][1] a01 += cs_a; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[1][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[1][1] a01 += cs_a; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) //broadcast 3rd 
row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[2][1] a01 += cs_a; //move to next row of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[3][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[3][1] a01 += cs_a; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) b10 += D_NR * cs_b; //pointer math to find 
next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm8 = _mm256_loadu_pd((double const *)b11); ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); ymm8 = _mm256_fmsub_pd(ymm8, ymm15, ymm0); //B11[0-3][0] * alpha -= B10[0-3][0] ymm9 = _mm256_fmsub_pd(ymm9, ymm15, ymm1); //B11[4-7][0] * alpha -= B10[4-7][0] ymm12 = _mm256_fmsub_pd(ymm12, ymm15, ymm4); //B11[0-3][2] * alpha -= B10[0-3][2] ymm13 = _mm256_fmsub_pd(ymm13, ymm15, ymm5); //B11[4-7][2] * alpha -= B10[4-7][2] ///implement TRSM/// ///read 4x4 block of A11/// ymm7 = _mm256_broadcast_sd((double const *)(&ones)); //1st col ymm0 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] //2nd col a11 += 1; ymm1 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][1] ymm2 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][1] //compute reciprocals of L(i,i) and broadcast in registers ymm0 = _mm256_unpacklo_pd(ymm0, ymm2); //A11[0][0] A11[1][1] A11[0][0] A11[1][1] ymm0 = _mm256_blend_pd(ymm0, ymm7, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm7 = _mm256_div_pd(ymm7, ymm0); //(1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3]) //extract a00 ymm0 = _mm256_permute_pd(ymm7, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00);//(1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0]) ymm8 = _mm256_mul_pd(ymm8, ymm0); //B11[0-3][0] /= A11[0][0] ymm12 = _mm256_mul_pd(ymm12, ymm0); //B11[4-7][0] /= A11[0][0] //extract a11 ymm0 = _mm256_permute_pd(ymm7, 0x03); //(1/A11[1][1] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00);//(1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1]) //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm1, ymm8, ymm9); //B11[0-3][1] -= 
B11[0-3][0] * A11[0][1] ymm13 = _mm256_fnmadd_pd(ymm1, ymm12, ymm13); //B11[4-7][1] -= B11[4-7][0] * A11[0][1] ymm9 = _mm256_mul_pd(ymm9, ymm0); //B11[0-3][1] /= A11[1][1] ymm13 = _mm256_mul_pd(ymm13, ymm0); //B11[4-7][1] /= A11[1][1] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) } else if(1 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[0][0] a01 += cs_a; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[1][0] a01 += cs_a; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[2][0] a01 += cs_a; //move to next row 
of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[3][0] a01 += cs_a; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0-3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4-7][0] ymm8 = _mm256_fmsub_pd(ymm8, ymm15, ymm0); //B11[0-3][0] * alpha -= B10[0-3][0] ymm12 = _mm256_fmsub_pd(ymm12, ymm15, ymm4); //B11[0-3][2] * alpha -= B10[0-3][2] ///implement TRSM/// ymm7 = _mm256_broadcast_sd((double const *)(&ones)); //1st col ymm0 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] ymm7 = _mm256_div_pd(ymm7, ymm0); //(1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3]) ymm8 = _mm256_mul_pd(ymm8, ymm7); //B11[0-3][0] /= A11[0][0] ymm12 = 
_mm256_mul_pd(ymm12, ymm7); //B11[4-7][0] /= A11[0][0] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) } } } if((m & 4)) ///implementation for remainder rows(when m_remainder is greater than 4) { for(j = 0; (j+D_NR-1) a01 ----> ***************** *********** *b01*b11* * * * * * * b11 * * * * * **a01 * * a11 | ***************** ********* | | * * * * * *a11* * | | * * * * * * * * | v ***************** ****** v * * * * * * * * * * * * * * ***************** * * * */ static err_t bli_dtrsm_small_XAltB_unitDiag( side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ) { dim_t D_MR = 8; //block dimension along the rows dim_t D_NR = 4; //block dimension along the columns dim_t m = bli_obj_length(b); //number of rows dim_t n = bli_obj_width(b); //number of columns dim_t m_remainder = m & 7; //number of corner rows dim_t n_remainder = n & 3; //number of corner columns dim_t cs_a = bli_obj_col_stride(a); //column stride of matrix A dim_t cs_b = bli_obj_col_stride(b); //column stride of matrix B #ifdef BLIS_ENABLE_SMALL_MATRIX_ROME if((m < D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_M && n>D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_ROW_PANEL_N) || (m < D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_M && n > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_SQUARE_N) || (m < D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N && n > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME) || (m < D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME && n > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME) || (m > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME && n > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALTB_ROME_COLUMN_PANEL_N) ) return BLIS_NOT_YET_IMPLEMENTED; #else if(bli_max(m,n) > D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES) { return BLIS_NOT_YET_IMPLEMENTED; } #endif dim_t i, j, k; //loop variablse dim_t k_iter; //determines the number of GEMM operations to be done dim_t cs_b_offset[2]; //pre-calculated strides 
double AlphaVal = *(double *)AlphaObj->buffer; //value of Alpha double *L = a->buffer; //pointer to matrix A double *B = b->buffer; //pointer to matrix B double *a01, *a11, *b10, *b11; //pointers for GEMM and TRSM blocks double *ptr_a01_dup; double f_t[4] __attribute__((aligned(64)));//buffer to store corner column when m_remainder !=0 double* f_temp; cs_b_offset[0] = cs_b << 1; //cs_b_offset[0] = cs_b * 2; cs_b_offset[1] = cs_b_offset[0] + cs_b;//cs_b_offset[1] = cs_b * 3; //ymm scratch reginsters __m256d ymm0, ymm1, ymm2, ymm3; __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m256d ymm12, ymm13, ymm14, ymm15; __m256d ymm16; for(i = 0; (i+D_MR-1) < m; i += D_MR) //loop along 'M' direction { for(j = 0; (j+D_NR-1) < n; j += D_NR) //loop along 'N' direction { a01 = L + j; //pointer to block of A to be used in GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i; //pointer to block of B to be used in GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = j / D_NR; //number of GEMM operations to be done(in blocks of 4x4) ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = _mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = 
_mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR)); //B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[1][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] 
B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A01 //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += 
(B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[3][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A01 ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] 
B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm16 = _mm256_broadcast_sd((double const *)&AlphaVal); //load 8x4 block of B11 ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4][0] B11[5][0] B11[6][0] B11[7][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4][1] B11[5][1] B11[6][1] B11[7][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4][2] B11[5][2] B11[6][2] B11[7][2] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4][3] B11[5][3] B11[6][3] B11[7][3] ymm8 = _mm256_fmsub_pd(ymm8, ymm16, ymm0); //B11[0-3][0] * alpha -= ymm0 ymm9 = _mm256_fmsub_pd(ymm9, ymm16, ymm1); //B11[4-7][0] * alpha-= ymm1 ymm10 = _mm256_fmsub_pd(ymm10, ymm16, ymm2); //B11[0-3][1] * alpha-= ymm2 ymm11 = _mm256_fmsub_pd(ymm11, ymm16, ymm3); //B11[4-7][1] * alpha -= ymm3 ymm12 = _mm256_fmsub_pd(ymm12, ymm16, ymm4); //B11[0-3][2] * alpha -= ymm4 ymm13 = _mm256_fmsub_pd(ymm13, ymm16, ymm5); //B11[4-7][2] * alpha -= ymm5 ymm14 = _mm256_fmsub_pd(ymm14, ymm16, ymm6); //B11[0-3][3] * alpha -= ymm6 ymm15 = _mm256_fmsub_pd(ymm15, ymm16, ymm7); //B11[4-7][3] * alpha -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //2nd col a11 += 1; ymm1 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][1] //3rd col a11 += 1; ymm3 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][2] ymm4 = 
_mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][2] //4th col a11 += 1; ymm2 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][3] ymm5 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][3] ymm6 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 2)); //A11[2][3] //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm1, ymm8, ymm9); //B11[0-3][1] -= B11[0-3][0] * A11[0][1] ymm10 = _mm256_fnmadd_pd(ymm3, ymm8, ymm10); //B11[0-3][2] -= B11[0-3][0] * A11[0][2] ymm11 = _mm256_fnmadd_pd(ymm2, ymm8, ymm11); //B11[0-3][3] -= B11[0-3][0] * A11[0][3] ymm13 = _mm256_fnmadd_pd(ymm1, ymm12, ymm13); //B11[4-7][1] -= B11[4-7][0] * A11[0][1] ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14); //B11[4-7][2] -= B11[4-7][0] * A11[0][2] ymm15 = _mm256_fnmadd_pd(ymm2, ymm12, ymm15); //B11[4-7][3] -= B11[4-7][0] * A11[0][3] //(Row2)FMA operations ymm10 = _mm256_fnmadd_pd(ymm4, ymm9, ymm10); //B11[0-3][2] -= B11[0-3][1] * A11[1][2] ymm11 = _mm256_fnmadd_pd(ymm5, ymm9, ymm11); //B11[0-3][3] -= B11[0-3][1] * A11[1][3] ymm14 = _mm256_fnmadd_pd(ymm4, ymm13, ymm14); //B11[4-7][2] -= B11[4-7][1] * A11[1][2] ymm15 = _mm256_fnmadd_pd(ymm5, ymm13, ymm15); //B11[4-7][3] -= B11[4-7][1] * A11[1][3] //(Row3)FMA operations ymm11 = _mm256_fnmadd_pd(ymm6, ymm10, ymm11); //B11[0-3][3] -= B11[0-3][2] * A11[2][3] ymm15 = _mm256_fnmadd_pd(ymm6, ymm14, ymm15); //B11[4-7][3] -= B11[4-7][2] * A11[2][3] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14); //store(B11[4-7][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + cs_b), ymm11); //store(B11[0-3][3]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + cs_b 
+ D_NR), ymm15);//store(B11[4-7][3]) } if(n_remainder) //implementation for remainder columns(when n is not multiple of D_NR) { a01 = L + j; //pointer to block of A to be used for GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i; //pointer to block of B to be used for GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = j / D_NR; //number of GEMM operations to be performed(in blocks of 4x4) ///load 4x4 block of b11 ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = _mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); //subtract the calculated GEMM block from current TRSM block //load 8x4 block of B11 if(3 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] a01 += cs_a; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] 
B10[3][0]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[1][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] a01 += cs_a; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[2][2] a01 += cs_a; //move to next row of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] 
B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[3][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] a01 += cs_a; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] 
B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0-3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4-7][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0-3][1] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4-7][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0-3][2] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4-7][2] ymm8 = _mm256_fmsub_pd(ymm8, ymm15, ymm0); //B11[0-3][0] * alpha -= B10[0-3][0] ymm9 = _mm256_fmsub_pd(ymm9, ymm15, ymm1); //B11[4-7][0] * alpha -= B10[4-7][0] ymm10 = _mm256_fmsub_pd(ymm10, ymm15, ymm2); //B11[0-3][1] * alpha -= B10[0-3][1] ymm12 = _mm256_fmsub_pd(ymm12, ymm15, ymm4); //B11[0-3][2] * alpha -= B10[0-3][2] ymm13 = _mm256_fmsub_pd(ymm13, ymm15, ymm5); //B11[4-7][2] * alpha -= B10[4-7][2] ymm14 = _mm256_fmsub_pd(ymm14, ymm15, ymm6); //B11[0-3][3] * alpha -= B10[0-3][3] ///implement TRSM/// ///read 4x4 block of A11/// //2nd col a11 += 1; ymm1 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][1] //3rd col a11 += 1; ymm3 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][2] ymm4 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][2] //4th col a11 += 1; ymm2 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][3] ymm5 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][3] ymm6 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 2)); //A11[2][3] //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm1, ymm8, ymm9); //B11[0-3][1] -= B11[0-3][0] * A11[0][1] ymm10 = 
_mm256_fnmadd_pd(ymm3, ymm8, ymm10); //B11[0-3][2] -= B11[0-3][0] * A11[0][2] ymm13 = _mm256_fnmadd_pd(ymm1, ymm12, ymm13); //B11[4-7][1] -= B11[4-7][0] * A11[0][1] ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14); //B11[4-7][2] -= B11[4-7][0] * A11[0][2] //(Row2)FMA operations ymm10 = _mm256_fnmadd_pd(ymm4, ymm9, ymm10); //B11[0-3][2] -= B11[0-3][1] * A11[1][2] ymm14 = _mm256_fnmadd_pd(ymm4, ymm13, ymm14); //B11[4-7][2] -= B11[4-7][1] * A11[1][2] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14);//store(B11[4-7][2]) } else if(2 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[0][1] a01 += cs_a; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] 
B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[1][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[1][1] a01 += cs_a; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[2][1] a01 += cs_a; //move to next row of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] 
B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[3][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[3][1] a01 += cs_a; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm8 = _mm256_loadu_pd((double const *)b11); ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); ymm8 = _mm256_fmsub_pd(ymm8, ymm15, ymm0); //B11[0-3][0] * alpha -= B10[0-3][0] ymm9 = _mm256_fmsub_pd(ymm9, ymm15, ymm1); //B11[4-7][0] * alpha -= B10[4-7][0] ymm12 = _mm256_fmsub_pd(ymm12, ymm15, ymm4); //B11[0-3][2] * alpha -= B10[0-3][2] ymm13 = _mm256_fmsub_pd(ymm13, ymm15, ymm5); //B11[4-7][2] * alpha -= B10[4-7][2] ///implement TRSM/// ///read 4x4 block of A11/// //2nd col a11 += 1; ymm1 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][1] //(Row1): FMA operations ymm9 = _mm256_fnmadd_pd(ymm1, ymm8, ymm9); //B11[0-3][1] -= B11[0-3][0] * A11[0][1] ymm13 = _mm256_fnmadd_pd(ymm1, ymm12, ymm13); //B11[4-7][1] -= 
B11[4-7][0] * A11[0][1] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) } else if(1 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[0][0] a01 += cs_a; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[1][0] a01 += cs_a; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[2][0] a01 += cs_a; //move to next row of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + 
D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[3][0] a01 += cs_a; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0-3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4-7][0] ymm8 = _mm256_fmsub_pd(ymm8, ymm15, ymm0); //B11[0-3][0] * alpha -= B10[0-3][0] ymm12 = _mm256_fmsub_pd(ymm12, ymm15, ymm4); //B11[0-3][2] * alpha -= B10[0-3][2] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) } } } if((m & 4)) ///implementation for remainder rows(when m_remainder is greater than 4) { for(j = 0; (j+D_NR-1) D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME) ||(m > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME && n > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N) ) return BLIS_NOT_YET_IMPLEMENTED; #else if(bli_max(m,n) > D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES) { return 
BLIS_NOT_YET_IMPLEMENTED; } #endif dim_t i, j, k; //loop variablse dim_t k_iter; //determines the number of GEMM operations to be done dim_t cs_b_offset[2]; //pre-calculated strides double ones = 1.0; double AlphaVal = *(double *)AlphaObj->buffer; //value of Alpha double* restrict L = a->buffer; //pointer to matrix A double* restrict B = b->buffer; //pointer to matrix B double *a01, *a11, *b10, *b11; //pointers for GEMM and TRSM blocks double *ptr_a01_dup; cs_b_offset[0] = cs_b << 1; //cs_b_offset[0] = cs_b * 2; cs_b_offset[1] = cs_b_offset[0] + cs_b;//cs_b_offset[1] = cs_b * 3; //ymm scratch reginsters __m256d ymm0, ymm1, ymm2, ymm3; __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m256d ymm12, ymm13, ymm14, ymm15; __m256d ymm16; for(i = (m-D_MR); (i+1) > 0; i -= D_MR) //loop along 'M' direction { for(j = (n-D_NR); (j+1) > 0; j -= D_NR) //loop along 'N' direction { a01 = L + j*cs_a +(j+D_NR); //pointer to block of A to be used in GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i + (j+D_NR)*cs_b; //pointer to block of B to be used in GEMM b11 = B + (i) + (j)*cs_b; //pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of GEMM operations to be done(in blocks of 4x4) ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = _mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[0][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row //load 8x2 block of B10 ymm12 = 
_mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR)); //B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[1][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); 
//ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A01 //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] 
B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[3][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A01 ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] 
B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm16 = _mm256_broadcast_sd((double const *)&AlphaVal); //load 8x4 block of B11 ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4][0] B11[5][0] B11[6][0] B11[7][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4][1] B11[5][1] B11[6][1] B11[7][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4][2] B11[5][2] B11[6][2] B11[7][2] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4][3] B11[5][3] B11[6][3] B11[7][3] ymm8 = _mm256_fmsub_pd(ymm8, ymm16, ymm0); //B11[0-3][0] * alpha -= ymm0 ymm9 = _mm256_fmsub_pd(ymm9, ymm16, ymm1); //B11[4-7][0] * alpha-= ymm1 ymm10 = _mm256_fmsub_pd(ymm10, ymm16, ymm2); //B11[0-3][1] * alpha-= ymm2 ymm11 = _mm256_fmsub_pd(ymm11, ymm16, ymm3); //B11[4-7][1] * alpha -= ymm3 ymm12 = _mm256_fmsub_pd(ymm12, ymm16, ymm4); //B11[0-3][2] * alpha -= ymm4 ymm13 = _mm256_fmsub_pd(ymm13, ymm16, ymm5); //B11[4-7][2] * alpha -= ymm5 ymm14 = _mm256_fmsub_pd(ymm14, ymm16, ymm6); //B11[0-3][3] * alpha -= ymm6 ymm15 = _mm256_fmsub_pd(ymm15, ymm16, ymm7); //B11[4-7][3] * alpha -= ymm7 ///implement TRSM/// 
///read 4x4 block of A11/// ymm7 = _mm256_broadcast_sd((double const *)(&ones)); //1st col ymm0 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] //2nd col a11 += 1; ymm1 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][1] ymm2 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][1] //3rd col a11 += 1; ymm3 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][2] ymm4 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][2] ymm5 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 2)); //A11[2][2] //4th col a11 += 1; ymm6 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 3)); //A11[3][3] //compute reciprocals of L(i,i) and broadcast in registers ymm0 = _mm256_unpacklo_pd(ymm0, ymm2); //A11[0][0] A11[1][1] A11[0][0] A11[1][1] ymm2 = _mm256_unpacklo_pd(ymm5, ymm6); //A11[2][2] A11[3][3] A11[1][1] A11[3][3] ymm0 = _mm256_blend_pd(ymm0, ymm2, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm7 = _mm256_div_pd(ymm7, ymm0); //(1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3]) ymm2 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][3] ymm5 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][3] ymm6 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 2)); //A11[2][3] //extract a33 ymm0 = _mm256_permute_pd(ymm7, 0x0C); //(1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x11);//(1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm11 = _mm256_mul_pd(ymm11, ymm0); ymm15 = _mm256_mul_pd(ymm15, ymm0); //extract a22 ymm0 = _mm256_permute_pd(ymm7, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x11);//(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2]) //(row 3):FMA operations ymm10 = _mm256_fnmadd_pd(ymm11, ymm6, ymm10); ymm9 = _mm256_fnmadd_pd(ymm11, ymm5, ymm9); ymm8 = _mm256_fnmadd_pd(ymm11, ymm2, ymm8); ymm14 = _mm256_fnmadd_pd(ymm15, ymm6, ymm14); ymm13 = _mm256_fnmadd_pd(ymm15, ymm5, 
ymm13); ymm12 = _mm256_fnmadd_pd(ymm15, ymm2, ymm12); ymm10 = _mm256_mul_pd(ymm10, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); //extract a11 ymm0 = _mm256_permute_pd(ymm7, 0x03); //(1/A11[1][1] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00);//(1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1]) //(Row 2): FMA operations ymm9 = _mm256_fnmadd_pd(ymm10, ymm4, ymm9); ymm8 = _mm256_fnmadd_pd(ymm10, ymm3, ymm8); ymm13 = _mm256_fnmadd_pd(ymm14, ymm4, ymm13); ymm12 = _mm256_fnmadd_pd(ymm14, ymm3, ymm12); ymm9 = _mm256_mul_pd(ymm9, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); //extract a00 ymm0 = _mm256_permute_pd(ymm7, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0]) //(Row 1): FMA operations ymm8 = _mm256_fnmadd_pd(ymm9, ymm1, ymm8); ymm12 = _mm256_fnmadd_pd(ymm13, ymm1, ymm12); ymm8 = _mm256_mul_pd(ymm8, ymm0); //B11[0-3][0] /= A11[0][0] ymm12 = _mm256_mul_pd(ymm12, ymm0); //B11[4-7][0] /= A11[0][0] _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14); //store(B11[4-7][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + cs_b), ymm11); //store(B11[0-3][3]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + cs_b + D_NR), ymm15);//store(B11[4-7][3]) } if(n_remainder) //implementation for remainder columns(when n is not multiple of D_NR) { a01 = L + j*cs_a + (j+D_NR); //pointer to block of A to be used for GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i + (j + D_NR)*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j*cs_b; 
//pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of GEMM operations to be performed(in blocks of 4x4) ///load 4x4 block of b11 ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = _mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); //subtract the calculated GEMM block from current TRSM block //load 8x4 block of B11 if(3 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[0][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = 
_mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1] )); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1] + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm1 = 
_mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; 
//pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm8 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm9 = _mm256_loadu_pd((double const *)(b11+cs_b)); //B11[0-3][0] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4-7][0] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b*2)); //B11[0-3][1] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b*2 + D_NR)); //B11[4-7][1] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0-3][2] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4-7][2] ymm9 = _mm256_fmsub_pd(ymm9, ymm8, ymm1); //B11[4-7][0] * alpha -= B10[4-7][0] ymm10 = _mm256_fmsub_pd(ymm10, ymm8, ymm2); //B11[0-3][1] * alpha -= B10[0-3][1] ymm11 = _mm256_fmsub_pd(ymm11, ymm8, ymm3); //B11[4-7][1] * alpha -= B10[4-7][1] ymm13 = _mm256_fmsub_pd(ymm13, ymm8, ymm5); //B11[4-7][2] * alpha -= B10[4-7][2] ymm14 = _mm256_fmsub_pd(ymm14, ymm8, ymm6); //B11[0-3][3] * alpha -= B10[0-3][3] ymm15 = _mm256_fmsub_pd(ymm15, ymm8, ymm7); //B11[4-7][3] * alpha -= B10[4-7][3] ///implement TRSM/// ///read 4x4 block of A11/// ymm7 = _mm256_broadcast_sd((double const *)(&ones)); //1st col ymm0 = _mm256_broadcast_sd((double const *)(&ones)); //A11[0][0] //2nd col a11 += 1; ymm2 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][1] //3rd col a11 += 1; ymm4 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][2] ymm5 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 2)); //A11[2][2] //4th col a11 += 1; ymm6 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 3)); //A11[3][3] //compute reciprocals of L(i,i) and broadcast in registers ymm0 = _mm256_unpacklo_pd(ymm0, ymm2); //A11[0][0] A11[1][1] A11[0][0] A11[1][1] ymm2 = _mm256_unpacklo_pd(ymm5, ymm6); //A11[2][2] A11[3][3] A11[1][1] A11[3][3] ymm0 = _mm256_blend_pd(ymm0, ymm2, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm7 = _mm256_div_pd(ymm7, ymm0); //(1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3]) ymm5 = 
_mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][3] ymm6 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 2)); //A11[2][3] //extract a33 ymm0 = _mm256_permute_pd(ymm7, 0x0C); //(1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x11);//(1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm11 = _mm256_mul_pd(ymm11, ymm0); ymm15 = _mm256_mul_pd(ymm15, ymm0); //extract a22 ymm0 = _mm256_permute_pd(ymm7, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x11);//(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2]) //(row 3):FMA operations ymm10 = _mm256_fnmadd_pd(ymm11, ymm6, ymm10); ymm9 = _mm256_fnmadd_pd(ymm11, ymm5, ymm9); ymm14 = _mm256_fnmadd_pd(ymm15, ymm6, ymm14); ymm13 = _mm256_fnmadd_pd(ymm15, ymm5, ymm13); ymm10 = _mm256_mul_pd(ymm10, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); //extract a11 ymm0 = _mm256_permute_pd(ymm7, 0x03); //(1/A11[1][1] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x00);//(1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1]) //(Row 2): FMA operations ymm9 = _mm256_fnmadd_pd(ymm10, ymm4, ymm9); ymm13 = _mm256_fnmadd_pd(ymm14, ymm4, ymm13); ymm9 = _mm256_mul_pd(ymm9, ymm0); ymm13 = _mm256_mul_pd(ymm13, ymm0); _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14);//store(B11[4-7][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1]), ymm11); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1] + D_NR), ymm15); //store(B11[4-7][0]) } else if(2 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm10 = _mm256_broadcast_sd((double const 
*)(a01 + cs_a * 2)); //A01[0][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const 
*)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for 
GEMM } ///GEMM code ends/// ymm8 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0-3][0] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4-7][0] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0-3][1] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4-7][1] ymm10 = _mm256_fmsub_pd(ymm10, ymm8, ymm2); //B11[0-3][1] * alpha -= B10[0-3][1] ymm11 = _mm256_fmsub_pd(ymm11, ymm8, ymm3); //B11[4-7][1] * alpha -= B10[4-7][1] ymm14 = _mm256_fmsub_pd(ymm14, ymm8, ymm6); //B11[0-3][3] * alpha -= B10[0-3][3] ymm15 = _mm256_fmsub_pd(ymm15, ymm8, ymm7); //B11[4-7][3] * alpha -= B10[4-7][3] ///implement TRSM/// ///read 4x4 block of A11/// ymm7 = _mm256_broadcast_sd((double const *)(&ones)); //3rd col a11 += 2; ymm5 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 2)); //A11[2][2] //4th col a11 += 1; ymm6 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 3)); //A11[3][3] //compute reciprocals of L(i,i) and broadcast in registers ymm2 = _mm256_unpacklo_pd(ymm5, ymm6); //A11[2][2] A11[3][3] A11[1][1] A11[3][3] ymm0 = _mm256_blend_pd(ymm7, ymm2, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm7 = _mm256_div_pd(ymm7, ymm0); //(1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3]) ymm6 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 2)); //A11[2][3] //extract a33 ymm0 = _mm256_permute_pd(ymm7, 0x0C); //(1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x11);//(1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm11 = _mm256_mul_pd(ymm11, ymm0); ymm15 = _mm256_mul_pd(ymm15, ymm0); //extract a22 ymm0 = _mm256_permute_pd(ymm7, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm0 = _mm256_permute2f128_pd(ymm0, ymm0, 0x11);//(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2]) //(row 3):FMA operations ymm10 = _mm256_fnmadd_pd(ymm11, ymm6, ymm10); ymm14 = 
_mm256_fnmadd_pd(ymm15, ymm6, ymm14); ymm10 = _mm256_mul_pd(ymm10, ymm0); ymm14 = _mm256_mul_pd(ymm14, ymm0); _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1]), ymm11); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1] + D_NR), ymm15); //store(B11[4-7][0]) } else if(1 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A //load next 8x2 block of B10 ymm12 = 
_mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm8 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm11 = _mm256_loadu_pd((double const *)(b11+cs_b_offset[1])); //B11[0-3][0] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] +D_NR)); //B11[4-7][0] ymm11 = _mm256_fmsub_pd(ymm11, ymm8, ymm3); //B11[4-7][1] * alpha -= B10[4-7][1] ymm15 = _mm256_fmsub_pd(ymm15, ymm8, ymm7); //B11[4-7][3] * alpha -= B10[4-7][3] ///implement TRSM/// ///read 4x4 block of A11/// ymm7 = _mm256_broadcast_sd((double const *)(&ones)); //4th col a11 += 3; ymm6 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 3)); //A11[3][3] //compute reciprocals of L(i,i) and broadcast in registers ymm7 = _mm256_div_pd(ymm7, ymm6); //(1/A11[0][0] 1/A11[1][1] 
1/A11[2][2] 1/A11[3][3]) ymm11 = _mm256_mul_pd(ymm11, ymm7); ymm15 = _mm256_mul_pd(ymm15, ymm7); _mm256_storeu_pd((double *)(b11+ cs_b_offset[1]), ymm11); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1] + D_NR), ymm15); //store(B11[4-7][0]) } } } if(i<0) i += D_NR; if((m & 4)) ///implementation for remainder rows(when m_remainder is greater than 4) { for(j = (n-D_NR); (j+1) > 0; j -=D_NR) //loop along n direction { a01 = L + j*cs_a + (j+D_NR); //pointer to block of A to be used for GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i + (j+D_NR)*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of times GEMM operations to be performed(in blocks of 4x4) ///GEMM for previous blocks /// ///load 4x4 block of b11 ymm0 = _mm256_loadu_pd((double const *)b11); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm12 = _mm256_broadcast_sd((double const *)(a01 + 
cs_a * 0)); //A01[0][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[0][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm12 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[1][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[1][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B10[0][1]*A01[1][0] B10[1][1]*A01[1][0] B10[2][1]*A01[1][0] B10[3][1]*A01[1][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B10[0][1]*A01[1][1] B10[1][1]*A01[1][1] B10[2][1]*A01[1][1] B10[3][1]*A01[1][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B10[0][1]*A01[1][2] B10[1][1]*A01[1][2] B10[2][1]*A01[1][2] B10[3][1]*A01[1][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm12 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[2][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[2][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] ymm15 = 
_mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B10[0][2]*A01[2][0] B10[1][2]*A01[2][0] B10[2][2]*A01[2][0] B10[3][2]*A01[2][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B10[0][2]*A01[2][1] B10[1][2]*A01[2][1] B10[2][2]*A01[2][1] B10[3][2]*A01[2][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B10[0][2]*A01[2][2] B10[1][2]*A01[2][2] B10[2][2]*A01[2][2] B10[3][2]*A01[2][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm12 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[3][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[3][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B10[0][3]*A01[3][0] B10[1][3]*A01[3][0] B10[2][3]*A01[3][0] B10[3][3]*A01[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B10[0][3]*A01[3][1] B10[1][3]*A01[3][1] B10[2][3]*A01[3][1] B10[3][3]*A01[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B10[0][3]*A01[3][2] B10[1][3]*A01[3][2] B10[2][3]*A01[3][2] B10[3][3]*A01[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code end/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm0 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[x][0] -=ymm4 ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[x][1] -= ymm5 ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[x][2] -= ymm6 ymm3 = 
_mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //1st col ymm4 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] ymm5 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][0] ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][0] ymm7 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[0][0] //2nd col a11 += cs_a; ymm8 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] ymm9 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[1][1] //3rd col a11 += cs_a; ymm11 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][2] ymm12 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[1][2] //4th col a11 += cs_a; ymm13 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[0][3] ymm14 = _mm256_broadcast_sd((double const *)&ones); //compute reciprocals of A(i,i) and broadcast in registers ymm4 = _mm256_unpacklo_pd(ymm4, ymm8); //A11[0][0] A11[1][1] A11[0][0] A11[1][1] ymm8 = _mm256_unpacklo_pd(ymm11, ymm13); //A11[2][2] A11[3][3] A11[2][2] A11[3][3] ymm15 = _mm256_blend_pd(ymm4, ymm8, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm14 = _mm256_div_pd(ymm14, ymm15); // 1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] //extract a33 ymm15 = _mm256_permute_pd(ymm14, 0x0C); //(1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //(1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm3 = _mm256_mul_pd(ymm3, ymm15); //extract a22 ymm15 = _mm256_permute_pd(ymm14, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2]) //(Row 3): FMA operations ymm2 = _mm256_fnmadd_pd(ymm3, ymm12, ymm2); ymm1 = _mm256_fnmadd_pd(ymm3, ymm10, ymm1); ymm0 = _mm256_fnmadd_pd(ymm3, ymm7, ymm0); ymm2 = _mm256_mul_pd(ymm2, ymm15); //extract a11 ymm15 = _mm256_permute_pd(ymm14, 0x03); 
//(1/A11[1][1] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x00); //(1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1]) //(ROW 2): FMA operations ymm1 = _mm256_fnmadd_pd(ymm2, ymm9, ymm1); ymm0 = _mm256_fnmadd_pd(ymm2, ymm6, ymm0); ymm1 = _mm256_mul_pd(ymm1, ymm15); //extract A00 ymm15 = _mm256_permute_pd(ymm14, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0]) //(Row 1):FMA operations ymm0 = _mm256_fnmadd_pd(ymm1, ymm5, ymm0); ymm0 = _mm256_mul_pd(ymm0, ymm15); _mm256_storeu_pd((double *)b11, ymm0); //store(B11[x][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm1); //store(B11[x][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm2); //(store(B11[x][2])) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1]), ymm3); //store(B11[x][3]) } if(n_remainder) //implementation for remainder columns(when n is not a multiple of D_NR) { a01 = L + j*cs_a + (j+D_NR); //pointer to block of A to be used for GEMM a11 = L + j*cs_a + j; //pointwr to block of A to be used for TRSM b10 = B + i + (j+D_NR)*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of times GEMM operations to be performed(in blocks of 4x4) ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM for previous blocks /// if(3 == n_remainder) { ///load 4x4 block of b11 ymm0 = _mm256_broadcast_sd((double const *)&ones); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm1 = _mm256_loadu_pd((double const *)b11+ cs_b); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ///GEMM processing stars/// for(k = 0; k < k_iter; k++) { 
ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[0][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[1][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B10[0][1]*A01[1][1] B10[1][1]*A01[1][1] B10[2][1]*A01[1][1] B10[3][1]*A01[1][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B10[0][1]*A01[1][2] B10[1][1]*A01[1][2] B10[2][1]*A01[1][2] B10[3][1]*A01[1][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[2][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] 
ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B10[0][2]*A01[2][1] B10[1][2]*A01[2][1] B10[2][2]*A01[2][1] B10[3][2]*A01[2][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B10[0][2]*A01[2][2] B10[1][2]*A01[2][2] B10[2][2]*A01[2][2] B10[3][2]*A01[2][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[3][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B10[0][3]*A01[3][1] B10[1][3]*A01[3][1] B10[2][3]*A01[3][1] B10[3][3]*A01[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B10[0][3]*A01[3][2] B10[1][3]*A01[3][2] B10[2][3]*A01[3][2] B10[3][3]*A01[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[x][1] -= ymm5 ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[x][2] -= ymm6 ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //2nd col a11 += cs_a; ymm8 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] ymm9 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[1][1] //3rd col a11 += cs_a; ymm11 = _mm256_broadcast_sd((double const *)(a11+2)); 
//A11[0][2] ymm12 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[1][2] //4th col a11 += cs_a; ymm13 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[0][3] ymm14 = _mm256_broadcast_sd((double const *)&ones); //compute reciprocals of A(i,i) and broadcast in registers ymm4 = _mm256_unpacklo_pd(ymm14, ymm8); //A11[0][0] A11[1][1] A11[0][0] A11[1][1] ymm8 = _mm256_unpacklo_pd(ymm11, ymm13); //A11[2][2] A11[3][3] A11[2][2] A11[3][3] ymm15 = _mm256_blend_pd(ymm4, ymm8, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm14 = _mm256_div_pd(ymm14, ymm15); // 1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] //extract a33 ymm15 = _mm256_permute_pd(ymm14, 0x0C); //(1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //(1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm3 = _mm256_mul_pd(ymm3, ymm15); //extract a22 ymm15 = _mm256_permute_pd(ymm14, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2]) //(Row 3): FMA operations ymm2 = _mm256_fnmadd_pd(ymm3, ymm12, ymm2); ymm1 = _mm256_fnmadd_pd(ymm3, ymm10, ymm1); ymm2 = _mm256_mul_pd(ymm2, ymm15); //extract a11 ymm15 = _mm256_permute_pd(ymm14, 0x03); //(1/A11[1][1] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x00); //(1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1]) //(ROW 2): FMA operations ymm1 = _mm256_fnmadd_pd(ymm2, ymm9, ymm1); ymm1 = _mm256_mul_pd(ymm1, ymm15); _mm256_storeu_pd((double *)(b11 + cs_b), ymm1); //store(B11[x][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm2); //(store(B11[x][2])) _mm256_storeu_pd((double *)(b11 + cs_b*3), ymm3); //store(B11[x][0]) } else if(2 == n_remainder) { ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ///GEMM processing 
stars/// for(k = 0; k < k_iter; k++) { ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B10[0][1]*A01[1][2] B10[1][1]*A01[1][2] B10[2][1]*A01[1][2] B10[3][1]*A01[1][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B10[0][2]*A01[2][2] B10[1][2]*A01[2][2] B10[2][2]*A01[2][2] B10[3][2]*A01[2][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm14 = _mm256_broadcast_sd((double const 
*)(a01 + cs_a * 2)); //A01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B10[0][3]*A01[3][2] B10[1][3]*A01[3][2] B10[2][3]*A01[3][2] B10[3][3]*A01[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[x][2] -= ymm6 ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //3rd col a11 += 2 * cs_a; ymm11 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][2] ymm12 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[1][2] //4th col a11 += cs_a; ymm13 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[0][3] ymm14 = _mm256_broadcast_sd((double const *)&ones); //compute reciprocals of A(i,i) and broadcast in registers ymm8 = _mm256_unpacklo_pd(ymm11, ymm13); //A11[2][2] A11[3][3] A11[2][2] A11[3][3] ymm15 = _mm256_blend_pd(ymm14, ymm8, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm14 = _mm256_div_pd(ymm14, ymm15); // 1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] //extract a33 ymm15 = _mm256_permute_pd(ymm14, 0x0C); //(1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //(1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm3 = _mm256_mul_pd(ymm3, ymm15); //extract a22 ymm15 = _mm256_permute_pd(ymm14, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2]) //(Row 3): FMA operations ymm2 = _mm256_fnmadd_pd(ymm3, ymm12, ymm2); ymm2 = 
_mm256_mul_pd(ymm2, ymm15); _mm256_storeu_pd((double *)(b11+ cs_b * 2), ymm2); //store(B11[x][0]) _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3); //store(B11[x][1]) } else if(1 == n_remainder) { ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ///GEMM processing stars/// for(k = 0; k < k_iter; k++) { ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to 
find next block of A for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //4th col a11 += 3 * cs_a; ymm13 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[0][3] ymm14 = _mm256_broadcast_sd((double const *)&ones); //compute reciprocals of A(i,i) and broadcast in registers ymm14 = _mm256_div_pd(ymm14, ymm13); // 1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] //extract a33 ymm3 = _mm256_mul_pd(ymm3, ymm14); _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3); //store(B11[x][0]) } } m_remainder -= 4; i -= 4; } // if(i < 0) i = 0; if(m_remainder) ///implementation for remainder rows { dtrsm_small_XAlB(L, B, AlphaVal, m_remainder, n, cs_a, cs_b); } return BLIS_SUCCESS; } /*implements TRSM for the case XA = alpha * B *A is lower triangular, unit-diagonal, no transpose *dimensions: X:mxn A:nxn B: mxn */ /* <---b11 <---a11 ***************** * *b01*b11* * * * * ^ * * * * * ^ * * | ***************** | ******* | * * * * * | * * * | * * * * * a01* * * b10 ***************** ************* * * * * * * * * * * * * * * * * * * ***************** ******************* */ static err_t bli_dtrsm_small_XAlB_unitDiag( side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ) { dim_t D_MR = 8; //block dimension along the rows dim_t D_NR = 4; //block dimension along the columns dim_t m = bli_obj_length(b); //number of rows dim_t n = bli_obj_width(b); //number of columns dim_t m_remainder = m & 7; //number of corner rows dim_t n_remainder = n & 3; //number of corner columns dim_t cs_a = bli_obj_col_stride(a); //column stride of matrix A dim_t cs_b = bli_obj_col_stride(b); //column stride of matrix B #ifdef BLIS_ENABLE_SMALL_MATRIX_ROME if((m < D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME && n > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME) ||(m > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME && n > 
D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N) ) return BLIS_NOT_YET_IMPLEMENTED; #else if(bli_max(m,n) > D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES) { return BLIS_NOT_YET_IMPLEMENTED; } #endif dim_t i, j, k; //loop variablse dim_t k_iter; //determines the number of GEMM operations to be done dim_t cs_b_offset[2]; //pre-calculated strides double AlphaVal = *(double *)AlphaObj->buffer; //value of Alpha double* restrict L = a->buffer; //pointer to matrix A double* restrict B = b->buffer; //pointer to matrix B double *a01, *a11, *b10, *b11; //pointers for GEMM and TRSM blocks double *ptr_a01_dup; cs_b_offset[0] = cs_b << 1; //cs_b_offset[0] = cs_b * 2; cs_b_offset[1] = cs_b_offset[0] + cs_b;//cs_b_offset[1] = cs_b * 3; //ymm scratch reginsters __m256d ymm0, ymm1, ymm2, ymm3; __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m256d ymm12, ymm13, ymm14, ymm15; __m256d ymm16; for(i = (m-D_MR); (i+1) > 0; i -= D_MR) //loop along 'M' direction { for(j = (n-D_NR); (j+1) > 0; j -= D_NR) //loop along 'N' direction { a01 = L + j*cs_a +(j+D_NR); //pointer to block of A to be used in GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i + (j+D_NR)*cs_b; //pointer to block of B to be used in GEMM b11 = B + (i) + (j)*cs_b; //pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of GEMM operations to be done(in blocks of 4x4) ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = _mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[0][2] 
ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR)); //B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[1][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); 
//ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A01 //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] 
B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[3][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A01 ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] 
B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm16 = _mm256_broadcast_sd((double const *)&AlphaVal); //load 8x4 block of B11 ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4][0] B11[5][0] B11[6][0] B11[7][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4][1] B11[5][1] B11[6][1] B11[7][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4][2] B11[5][2] B11[6][2] B11[7][2] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4][3] B11[5][3] B11[6][3] B11[7][3] ymm8 = _mm256_fmsub_pd(ymm8, ymm16, ymm0); //B11[0-3][0] * alpha -= ymm0 ymm9 = _mm256_fmsub_pd(ymm9, ymm16, ymm1); //B11[4-7][0] * alpha-= ymm1 ymm10 = _mm256_fmsub_pd(ymm10, ymm16, ymm2); //B11[0-3][1] * alpha-= ymm2 ymm11 = _mm256_fmsub_pd(ymm11, ymm16, ymm3); //B11[4-7][1] * alpha -= ymm3 ymm12 = _mm256_fmsub_pd(ymm12, ymm16, ymm4); //B11[0-3][2] * alpha -= ymm4 ymm13 = _mm256_fmsub_pd(ymm13, ymm16, ymm5); //B11[4-7][2] * alpha -= ymm5 ymm14 = _mm256_fmsub_pd(ymm14, 
ymm16, ymm6); //B11[0-3][3] * alpha -= ymm6 ymm15 = _mm256_fmsub_pd(ymm15, ymm16, ymm7); //B11[4-7][3] * alpha -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //2nd col a11 += 1; ymm1 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][1] //3rd col a11 += 1; ymm3 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][2] ymm4 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][2] //4th col a11 += 1; ymm2 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 0)); //A11[0][3] ymm5 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][3] ymm6 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 2)); //A11[2][3] //(row 3):FMA operations ymm10 = _mm256_fnmadd_pd(ymm11, ymm6, ymm10); ymm9 = _mm256_fnmadd_pd(ymm11, ymm5, ymm9); ymm8 = _mm256_fnmadd_pd(ymm11, ymm2, ymm8); ymm14 = _mm256_fnmadd_pd(ymm15, ymm6, ymm14); ymm13 = _mm256_fnmadd_pd(ymm15, ymm5, ymm13); ymm12 = _mm256_fnmadd_pd(ymm15, ymm2, ymm12); //(Row 2): FMA operations ymm9 = _mm256_fnmadd_pd(ymm10, ymm4, ymm9); ymm8 = _mm256_fnmadd_pd(ymm10, ymm3, ymm8); ymm13 = _mm256_fnmadd_pd(ymm14, ymm4, ymm13); ymm12 = _mm256_fnmadd_pd(ymm14, ymm3, ymm12); //(Row 1): FMA operations ymm8 = _mm256_fnmadd_pd(ymm9, ymm1, ymm8); ymm12 = _mm256_fnmadd_pd(ymm13, ymm1, ymm12); _mm256_storeu_pd((double *)b11, ymm8); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[4-7][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14); //store(B11[4-7][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + cs_b), ymm11); //store(B11[0-3][3]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + cs_b + D_NR), ymm15);//store(B11[4-7][3]) } if(n_remainder) //implementation for remainder columns(when n is not multiple of D_NR) { 
a01 = L + j*cs_a + (j+D_NR); //pointer to block of A to be used for GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i + (j + D_NR)*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of GEMM operations to be performed(in blocks of 4x4) ///load 4x4 block of b11 ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = _mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); //subtract the calculated GEMM block from current TRSM block //load 8x4 block of B11 if(3 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[0][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 
+= (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) 
ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1] )); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1] + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] 
B10[7][3]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm8 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm9 = _mm256_loadu_pd((double const *)(b11+cs_b)); //B11[0-3][0] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4-7][0] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b*2)); //B11[0-3][1] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b*2 + D_NR)); //B11[4-7][1] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0-3][2] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4-7][2] ymm9 = _mm256_fmsub_pd(ymm9, ymm8, ymm1); //B11[4-7][0] * alpha -= B10[4-7][0] ymm10 = _mm256_fmsub_pd(ymm10, ymm8, ymm2); //B11[0-3][1] * alpha -= B10[0-3][1] ymm11 = _mm256_fmsub_pd(ymm11, ymm8, ymm3); //B11[4-7][1] * alpha -= B10[4-7][1] ymm13 = _mm256_fmsub_pd(ymm13, ymm8, ymm5); //B11[4-7][2] * alpha -= B10[4-7][2] ymm14 = _mm256_fmsub_pd(ymm14, ymm8, ymm6); //B11[0-3][3] * alpha -= B10[0-3][3] ymm15 = _mm256_fmsub_pd(ymm15, ymm8, ymm7); //B11[4-7][3] * alpha -= B10[4-7][3] ///implement TRSM/// ///read 4x4 block of A11/// //3rd col a11 += 2; ymm4 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][2] //4th col a11 += 1; ymm5 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 1)); //A11[1][3] ymm6 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 2)); //A11[2][3] //(row 3):FMA operations ymm10 = _mm256_fnmadd_pd(ymm11, ymm6, ymm10); ymm9 = _mm256_fnmadd_pd(ymm11, ymm5, ymm9); ymm14 = _mm256_fnmadd_pd(ymm15, ymm6, ymm14); ymm13 = _mm256_fnmadd_pd(ymm15, ymm5, ymm13); //(Row 2): FMA operations ymm9 = _mm256_fnmadd_pd(ymm10, ymm4, ymm9); ymm13 = _mm256_fnmadd_pd(ymm14, ymm4, ymm13); _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); 
//store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14);//store(B11[4-7][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1]), ymm11); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1] + D_NR), ymm15); //store(B11[4-7][0]) } else if(2 == n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[0][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); 
//ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm10 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A ymm2 = _mm256_fmadd_pd(ymm10, ymm14, 
ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm8 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0-3][0] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4-7][0] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0-3][1] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4-7][1] ymm10 = _mm256_fmsub_pd(ymm10, ymm8, ymm2); //B11[0-3][1] * alpha -= B10[0-3][1] ymm11 = _mm256_fmsub_pd(ymm11, ymm8, ymm3); //B11[4-7][1] * alpha -= B10[4-7][1] ymm14 = _mm256_fmsub_pd(ymm14, ymm8, ymm6); //B11[0-3][3] * alpha -= B10[0-3][3] ymm15 = _mm256_fmsub_pd(ymm15, ymm8, ymm7); //B11[4-7][3] * alpha -= B10[4-7][3] ///implement TRSM/// ///read 4x4 block of A11/// //4th col a11 += 3; ymm6 = _mm256_broadcast_sd((double const *)(a11+ cs_a * 2)); //A11[2][3] //(row 3):FMA operations ymm10 = _mm256_fnmadd_pd(ymm11, ymm6, ymm10); ymm14 = _mm256_fnmadd_pd(ymm15, ymm6, ymm14); _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1]), ymm11); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1] + D_NR), ymm15); //store(B11[4-7][0]) } else if(1 == 
n_remainder) { ///GEMM implementation begins/// for(k = 0; k < k_iter; k++) ///loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row of A //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR));//B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm3 = 
_mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm8 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm11 = _mm256_loadu_pd((double const *)(b11+cs_b_offset[1])); //B11[0-3][0] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] +D_NR)); //B11[4-7][0] ymm11 = _mm256_fmsub_pd(ymm11, ymm8, ymm3); //B11[4-7][1] * alpha -= B10[4-7][1] ymm15 = _mm256_fmsub_pd(ymm15, ymm8, ymm7); //B11[4-7][3] * alpha -= B10[4-7][3] _mm256_storeu_pd((double *)(b11+ cs_b_offset[1]), ymm11); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1] + D_NR), ymm15); //store(B11[4-7][0]) } } } if(i<0) i += D_NR; if((m & 4)) ///implementation for remainder rows(when m_remainder is greater than 4) { for(j = (n-D_NR); (j+1) > 0; j -=D_NR) //loop along n direction { a01 = L + j*cs_a + (j+D_NR); //pointer to block of A to be used for GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i + (j+D_NR)*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of times GEMM operations to be performed(in blocks of 4x4) ///GEMM for previous blocks /// ///load 4x4 block of b11 ymm0 = 
_mm256_loadu_pd((double const *)b11); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm12 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[0][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[0][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm12 = 
_mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[1][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[1][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B10[0][1]*A01[1][0] B10[1][1]*A01[1][0] B10[2][1]*A01[1][0] B10[3][1]*A01[1][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B10[0][1]*A01[1][1] B10[1][1]*A01[1][1] B10[2][1]*A01[1][1] B10[3][1]*A01[1][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B10[0][1]*A01[1][2] B10[1][1]*A01[1][2] B10[2][1]*A01[1][2] B10[3][1]*A01[1][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm12 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[2][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[2][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B10[0][2]*A01[2][0] B10[1][2]*A01[2][0] B10[2][2]*A01[2][0] B10[3][2]*A01[2][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B10[0][2]*A01[2][1] B10[1][2]*A01[2][1] B10[2][2]*A01[2][1] B10[3][2]*A01[2][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B10[0][2]*A01[2][2] B10[1][2]*A01[2][2] B10[2][2]*A01[2][2] B10[3][2]*A01[2][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm12 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 0)); //A01[3][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[3][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 
cs_a * 2)); //A01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B10[0][3]*A01[3][0] B10[1][3]*A01[3][0] B10[2][3]*A01[3][0] B10[3][3]*A01[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B10[0][3]*A01[3][1] B10[1][3]*A01[3][1] B10[2][3]*A01[3][1] B10[3][3]*A01[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B10[0][3]*A01[3][2] B10[1][3]*A01[3][2] B10[2][3]*A01[3][2] B10[3][3]*A01[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code end/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm0 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[x][0] -=ymm4 ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[x][1] -= ymm5 ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[x][2] -= ymm6 ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //1st col ymm5 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][0] ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][0] ymm7 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[0][0] //2nd col a11 += cs_a; ymm9 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[1][1] //3rd col a11 += cs_a; ymm12 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[1][2] //(Row 3): FMA operations ymm2 = _mm256_fnmadd_pd(ymm3, ymm12, ymm2); ymm1 = _mm256_fnmadd_pd(ymm3, ymm10, ymm1); ymm0 = _mm256_fnmadd_pd(ymm3, ymm7, ymm0); //(ROW 2): FMA operations ymm1 = _mm256_fnmadd_pd(ymm2, ymm9, ymm1); ymm0 = _mm256_fnmadd_pd(ymm2, ymm6, ymm0); //(Row 1):FMA operations ymm0 = _mm256_fnmadd_pd(ymm1, 
ymm5, ymm0); _mm256_storeu_pd((double *)b11, ymm0); //store(B11[x][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm1); //store(B11[x][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm2); //(store(B11[x][2])) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1]), ymm3); //store(B11[x][3]) } if(n_remainder) //implementation for remainder columns(when n is not a multiple of D_NR) { a01 = L + j*cs_a + (j+D_NR); //pointer to block of A to be used for GEMM a11 = L + j*cs_a + j; //pointwr to block of A to be used for TRSM b10 = B + i + (j+D_NR)*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of times GEMM operations to be performed(in blocks of 4x4) ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM for previous blocks /// if(3 == n_remainder) { ///load 4x4 block of b11 ymm1 = _mm256_loadu_pd((double const *)b11+ cs_b); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ///GEMM processing stars/// for(k = 0; k < k_iter; k++) { ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[0][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] 
a01 += 1; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[1][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B10[0][1]*A01[1][1] B10[1][1]*A01[1][1] B10[2][1]*A01[1][1] B10[3][1]*A01[1][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B10[0][1]*A01[1][2] B10[1][1]*A01[1][2] B10[2][1]*A01[1][2] B10[3][1]*A01[1][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[2][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B10[0][2]*A01[2][1] B10[1][2]*A01[2][1] B10[2][2]*A01[2][1] B10[3][2]*A01[2][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B10[0][2]*A01[2][2] B10[1][2]*A01[2][2] B10[2][2]*A01[2][2] B10[3][2]*A01[2][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 1)); //A01[3][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); 
//A01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B10[0][3]*A01[3][1] B10[1][3]*A01[3][1] B10[2][3]*A01[3][1] B10[3][3]*A01[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B10[0][3]*A01[3][2] B10[1][3]*A01[3][2] B10[2][3]*A01[3][2] B10[3][3]*A01[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha value ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[x][1] -= ymm5 ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[x][2] -= ymm6 ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //2nd col a11 += cs_a; ymm9 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[1][1] //3rd col a11 += cs_a; ymm12 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[1][2] //(Row 3): FMA operations ymm2 = _mm256_fnmadd_pd(ymm3, ymm12, ymm2); ymm1 = _mm256_fnmadd_pd(ymm3, ymm10, ymm1); //(ROW 2): FMA operations ymm1 = _mm256_fnmadd_pd(ymm2, ymm9, ymm1); _mm256_storeu_pd((double *)(b11 + cs_b), ymm1); //store(B11[x][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm2); //(store(B11[x][2])) _mm256_storeu_pd((double *)(b11 + cs_b*3), ymm3); //store(B11[x][0]) } else if(2 == n_remainder) { ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ///GEMM processing stars/// for(k = 0; k < k_iter; k++) { ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = 
_mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B10[0][1]*A01[1][2] B10[1][1]*A01[1][2] B10[2][1]*A01[1][2] B10[3][1]*A01[1][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[2][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B10[0][2]*A01[2][2] B10[1][2]*A01[2][2] B10[2][2]*A01[2][2] B10[3][2]*A01[2][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm14 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 2)); //A01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); 
//A01[3][3] a01 += 1; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B10[0][3]*A01[3][2] B10[1][3]*A01[3][2] B10[2][3]*A01[3][2] B10[3][3]*A01[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha value ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[x][2] -= ymm6 ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //3rd col a11 += 2 * cs_a; ymm12 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[1][2] //(Row 3): FMA operations ymm2 = _mm256_fnmadd_pd(ymm3, ymm12, ymm2); _mm256_storeu_pd((double *)(b11+ cs_b * 2), ymm2); //store(B11[x][0]) _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3); //store(B11[x][1]) } else if(1 == n_remainder) { ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ///GEMM processing stars/// for(k = 0; k < k_iter; k++) { ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[0][3] a01 += 1; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm15 = 
_mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[1][3] a01 += 1; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[2][3] a01 += 1; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm15 = _mm256_broadcast_sd((double const *)(a01 + cs_a * 3)); //A01[3][3] a01 += 1; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR; //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha value ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3); //store(B11[x][0]) } } m_remainder -= 4; i -= 4; } // if(i < 0) i = 0; if(m_remainder) ///implementation for remainder rows { dtrsm_small_XAlB_unitDiag(L, B, AlphaVal, m_remainder, n, cs_a, cs_b); } return BLIS_SUCCESS; } /*implements TRSM for the case XA = alpha * B *A is lower triangular, non-unit diagonal, no transpose *dimensions: X:mxn A:nxn B: mxn */ /* <---b11 <---a11 ***************** * *b01*b11* * * * * ^ * * * * * ^ * * | ***************** | ******* | * * * * * | * * * | * * * * * a01* * * b10 ***************** ************* * * * * * * * * * * * * * * * * * * ***************** ******************* */ static err_t bli_dtrsm_small_XAutB( side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ) { dim_t D_MR = 8; //block dimension along the rows dim_t D_NR = 4; //block dimension along the columns dim_t m = 
bli_obj_length(b); //number of rows dim_t n = bli_obj_width(b); //number of columns dim_t m_remainder = m & 7; //number of corner rows dim_t n_remainder = n & 3; //number of corner columns dim_t cs_a = bli_obj_col_stride(a); //column stride of matrix A dim_t cs_b = bli_obj_col_stride(b); //column stride of matrix B #ifdef BLIS_ENABLE_SMALL_MATRIX_ROME if((m < D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME && n > D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME) ||(m > D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME && n > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N) ) return BLIS_NOT_YET_IMPLEMENTED; #else if(bli_max(m,n) > D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES) { return BLIS_NOT_YET_IMPLEMENTED; } #endif dim_t i, j, k; //loop variablse dim_t k_iter; //determines the number of GEMM operations to be done dim_t cs_b_offset[2]; //pre-calculated strides double ones = 1.0; double AlphaVal = *(double *)AlphaObj->buffer; //value of Alpha double* restrict L = a->buffer; //pointer to matrix A double* restrict B = b->buffer; //pointer to matrix B double *a01, *a11, *b10, *b11; //pointers for GEMM and TRSM blocks double *ptr_a01_dup; cs_b_offset[0] = cs_b << 1; //cs_b_offset[0] = cs_b * 2; cs_b_offset[1] = cs_b_offset[0] + cs_b;//cs_b_offset[1] = cs_b * 3; //ymm scratch reginsters __m256d ymm0, ymm1, ymm2, ymm3; __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m256d ymm12, ymm13, ymm14, ymm15; __m256d ymm16; for(i = (m-D_MR); (i+1) > 0; i -= D_MR) //loop along 'M' direction { for(j = (n-D_NR); (j+1) > 0; j -= D_NR) //loop along 'N' direction { a01 = L + (j+D_NR)*cs_a +(j); //pointer to block of A to be used in GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i + (j+D_NR)*cs_b; //pointer to block of B to be used in GEMM b11 = B + (i) + (j)*cs_b; //pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of GEMM operations to be done(in blocks of 4x4) ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = 
_mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR)); //B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, 
ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[1][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A01 //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) 
ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[3][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A01 ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] 
B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm16 = _mm256_broadcast_sd((double const *)&AlphaVal); //load 8x4 block of B11 ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4][0] B11[5][0] B11[6][0] B11[7][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4][1] B11[5][1] B11[6][1] B11[7][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4][2] B11[5][2] B11[6][2] B11[7][2] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4][3] B11[5][3] B11[6][3] B11[7][3] ymm8 = _mm256_fmsub_pd(ymm8, ymm16, ymm0); 
//B11[0-3][0] * alpha -= ymm0 ymm9 = _mm256_fmsub_pd(ymm9, ymm16, ymm1); //B11[4-7][0] * alpha-= ymm1 ymm10 = _mm256_fmsub_pd(ymm10, ymm16, ymm2); //B11[0-3][1] * alpha-= ymm2 ymm11 = _mm256_fmsub_pd(ymm11, ymm16, ymm3); //B11[4-7][1] * alpha -= ymm3 ymm12 = _mm256_fmsub_pd(ymm12, ymm16, ymm4); //B11[0-3][2] * alpha -= ymm4 ymm13 = _mm256_fmsub_pd(ymm13, ymm16, ymm5); //B11[4-7][2] * alpha -= ymm5 ymm14 = _mm256_fmsub_pd(ymm14, ymm16, ymm6); //B11[0-3][3] * alpha -= ymm6 ymm15 = _mm256_fmsub_pd(ymm15, ymm16, ymm7); //B11[4-7][3] * alpha -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //1st col ymm0 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] a11 += cs_a; //2nd col ymm1 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] ymm2 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] a11 += cs_a; //3rd col ymm3 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] ymm4 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] ymm5 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] a11 += cs_a; //4th col ymm6 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[0][1] ymm7 = _mm256_broadcast_sd((double const *)&ones); //compute reciprocals of A(i,i) and broadcast in registers ymm0 = _mm256_unpacklo_pd(ymm0, ymm2); //A11[0][0] A11[1][1] A11[0][0] A11[1][1] ymm2 = _mm256_unpacklo_pd(ymm5, ymm6); //A11[2][2] A11[3][3] A11[2][2] A11[3][3] ymm0 = _mm256_blend_pd(ymm0, ymm2, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm0 = _mm256_div_pd(ymm7, ymm0); // 1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] ymm2 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] ymm5 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] //extract a33 ymm7 = _mm256_permute_pd(ymm0, 0x0C); //(1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3]) ymm7 = _mm256_permute2f128_pd(ymm7, ymm7, 0x11); //(1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm11 = 
_mm256_mul_pd(ymm11, ymm7); ymm15 = _mm256_mul_pd(ymm15, ymm7); //extract a22 ymm7 = _mm256_permute_pd(ymm0, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm7 = _mm256_permute2f128_pd(ymm7, ymm7, 0x11); //(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2]) //(Row 3): FMA operations ymm10 = _mm256_fnmadd_pd(ymm11, ymm6, ymm10); ymm9 = _mm256_fnmadd_pd(ymm11, ymm5, ymm9); ymm8 = _mm256_fnmadd_pd(ymm11, ymm2, ymm8); //(Row 3): FMA operations ymm14 = _mm256_fnmadd_pd(ymm15, ymm6, ymm14); ymm13 = _mm256_fnmadd_pd(ymm15, ymm5, ymm13); ymm12 = _mm256_fnmadd_pd(ymm15, ymm2, ymm12); ymm10 = _mm256_mul_pd(ymm10, ymm7); ymm14 = _mm256_mul_pd(ymm14, ymm7); //extract a11 ymm7 = _mm256_permute_pd(ymm0, 0x03); //(1/A11[1][1] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2]) ymm7 = _mm256_permute2f128_pd(ymm7, ymm7, 0x00); //(1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1]) //(ROW 2): FMA operations ymm9 = _mm256_fnmadd_pd(ymm10, ymm4, ymm9); ymm8 = _mm256_fnmadd_pd(ymm10, ymm3, ymm8); ymm13 = _mm256_fnmadd_pd(ymm14, ymm4, ymm13); ymm12 = _mm256_fnmadd_pd(ymm14, ymm3, ymm12); ymm9 = _mm256_mul_pd(ymm9, ymm7); ymm13 = _mm256_mul_pd(ymm13, ymm7); //extract A00 ymm7 = _mm256_permute_pd(ymm0, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm7 = _mm256_permute2f128_pd(ymm7, ymm7, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0]) //(Row 1):FMA operations ymm8 = _mm256_fnmadd_pd(ymm9, ymm1, ymm8); ymm12 = _mm256_fnmadd_pd(ymm13, ymm1, ymm12); ymm8 = _mm256_mul_pd(ymm8, ymm7); ymm12 = _mm256_mul_pd(ymm12, ymm7); _mm256_storeu_pd((double *)b11, ymm8); //store(B11[x][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[x][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[x][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[x][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //(store(B11[x][2])) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14); //(store(B11[x][2])) _mm256_storeu_pd((double 
*)(b11 + cs_b_offset[1]), ymm11); //store(B11[x][3]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1] + D_NR), ymm15); //store(B11[x][3]) } if(n_remainder) //implementation for remainder columns(when n is not multiple of D_NR) { a01 = L + (j+D_NR)*cs_a +(j); //pointer to block of A to be used in GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i + (j+D_NR)*cs_b; //pointer to block of B to be used in GEMM b11 = B + (i) + (j)*cs_b; //pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of GEMM operations to be done(in blocks of 4x4) ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = _mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); //load 8x4 block of B11 if(3 == n_remainder) { ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR)); //B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += 
(B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A01 //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = 
_mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A01 ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = 
_mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm8 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm9 = _mm256_loadu_pd((double const *)(b11+cs_b)); //B11[0-3][0] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4-7][0] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b*2)); //B11[0-3][1] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b*2 + D_NR)); //B11[4-7][1] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0-3][2] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4-7][2] ymm9 = _mm256_fmsub_pd(ymm9, ymm8, ymm1); //B11[4-7][0] * alpha-= ymm1 ymm10 = _mm256_fmsub_pd(ymm10, ymm8, ymm2); //B11[0-3][1] * alpha-= ymm2 ymm11 = _mm256_fmsub_pd(ymm11, ymm8, ymm3); //B11[4-7][1] * alpha -= ymm3 ymm13 = _mm256_fmsub_pd(ymm13, ymm8, ymm5); //B11[4-7][2] * alpha -= ymm5 ymm14 = _mm256_fmsub_pd(ymm14, ymm8, ymm6); //B11[0-3][3] * alpha -= ymm6 ymm15 = _mm256_fmsub_pd(ymm15, ymm8, ymm7); //B11[4-7][3] * alpha -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //1st col ymm0 = _mm256_broadcast_sd((double const *)(&ones)); //A11[0][0] a11 += cs_a; //2nd col ymm2 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] a11 += cs_a; //3rd col ymm4 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] ymm5 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] a11 += cs_a; //4th col ymm6 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[0][1] ymm7 = _mm256_broadcast_sd((double const *)&ones); //compute reciprocals of A(i,i) and broadcast in registers ymm0 = _mm256_unpacklo_pd(ymm0, ymm2); 
//A11[0][0] A11[1][1] A11[0][0] A11[1][1] ymm2 = _mm256_unpacklo_pd(ymm5, ymm6); //A11[2][2] A11[3][3] A11[2][2] A11[3][3] ymm0 = _mm256_blend_pd(ymm0, ymm2, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm0 = _mm256_div_pd(ymm7, ymm0); // 1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] ymm5 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] //extract a33 ymm7 = _mm256_permute_pd(ymm0, 0x0C); //(1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3]) ymm7 = _mm256_permute2f128_pd(ymm7, ymm7, 0x11); //(1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm11 = _mm256_mul_pd(ymm11, ymm7); ymm15 = _mm256_mul_pd(ymm15, ymm7); //extract a22 ymm7 = _mm256_permute_pd(ymm0, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm7 = _mm256_permute2f128_pd(ymm7, ymm7, 0x11); //(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2]) //(Row 3): FMA operations ymm10 = _mm256_fnmadd_pd(ymm11, ymm6, ymm10); ymm9 = _mm256_fnmadd_pd(ymm11, ymm5, ymm9); //(Row 3): FMA operations ymm14 = _mm256_fnmadd_pd(ymm15, ymm6, ymm14); ymm13 = _mm256_fnmadd_pd(ymm15, ymm5, ymm13); ymm10 = _mm256_mul_pd(ymm10, ymm7); ymm14 = _mm256_mul_pd(ymm14, ymm7); //extract a11 ymm7 = _mm256_permute_pd(ymm0, 0x03); //(1/A11[1][1] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2]) ymm7 = _mm256_permute2f128_pd(ymm7, ymm7, 0x00); //(1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1]) //(ROW 2): FMA operations ymm9 = _mm256_fnmadd_pd(ymm10, ymm4, ymm9); ymm13 = _mm256_fnmadd_pd(ymm14, ymm4, ymm13); ymm9 = _mm256_mul_pd(ymm9, ymm7); ymm13 = _mm256_mul_pd(ymm13, ymm7); _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14);//store(B11[4-7][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1]), ymm11); //store(B11[0-3][0]) 
_mm256_storeu_pd((double *)(b11 + cs_b_offset[1] + D_NR), ymm15); //store(B11[4-7][0]) } else if(2 == n_remainder) { ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR)); //B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = 
_mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A01 //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A01 ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) ymm7 = 
_mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm8 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4-7][0] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0-3][1] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4-7][1] ymm10 = _mm256_fmsub_pd(ymm10, ymm8, ymm2); //B11[0-3][1] * alpha-= ymm2 ymm11 = _mm256_fmsub_pd(ymm11, ymm8, ymm3); //B11[4-7][1] * alpha -= ymm3 ymm14 = _mm256_fmsub_pd(ymm14, ymm8, ymm6); //B11[0-3][3] * alpha -= ymm6 ymm15 = _mm256_fmsub_pd(ymm15, ymm8, ymm7); //B11[4-7][3] * alpha -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //1st col a11 += 2 * cs_a; //3rd col ymm5 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] a11 += cs_a; //4th col ymm6 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[0][1] ymm7 = _mm256_broadcast_sd((double const *)&ones); //compute reciprocals of A(i,i) and broadcast in registers ymm2 = _mm256_unpacklo_pd(ymm5, ymm6); //A11[2][2] A11[3][3] A11[2][2] A11[3][3] ymm0 = _mm256_blend_pd(ymm7, ymm2, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm0 = _mm256_div_pd(ymm7, ymm0); // 1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] //extract a33 ymm7 = _mm256_permute_pd(ymm0, 0x0C); //(1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3]) ymm7 = _mm256_permute2f128_pd(ymm7, ymm7, 0x11); //(1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm11 = _mm256_mul_pd(ymm11, ymm7); ymm15 = _mm256_mul_pd(ymm15, ymm7); //extract a22 ymm7 = _mm256_permute_pd(ymm0, 0x00); //(1/A11[0][0] 1/A11[0][0] 
1/A11[2][2] 1/A11[2][2]) ymm7 = _mm256_permute2f128_pd(ymm7, ymm7, 0x11); //(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2]) //(Row 3): FMA operations ymm10 = _mm256_fnmadd_pd(ymm11, ymm6, ymm10); //(Row 3): FMA operations ymm14 = _mm256_fnmadd_pd(ymm15, ymm6, ymm14); ymm10 = _mm256_mul_pd(ymm10, ymm7); ymm14 = _mm256_mul_pd(ymm14, ymm7); _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1]), ymm11); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1] + D_NR), ymm15); //store(B11[4-7][0]) } else if(1 == n_remainder) { ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR)); //B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] 
B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A01 //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A01 ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm8 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm11 = _mm256_loadu_pd((double const *)(b11+cs_b_offset[1])); //B11[0-3][0] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] +D_NR)); //B11[4-7][0] ymm11 = _mm256_fmsub_pd(ymm11, ymm8, ymm3); //B11[4-7][1] * alpha -= ymm3 ymm15 = _mm256_fmsub_pd(ymm15, ymm8, ymm7); //B11[4-7][3] * alpha -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// a11 += 3 * cs_a; //4th col ymm6 = _mm256_broadcast_sd((double 
const *)(a11+3)); //A11[0][1] ymm7 = _mm256_broadcast_sd((double const *)&ones); ymm0 = _mm256_div_pd(ymm7, ymm6); // 1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] ymm11 = _mm256_mul_pd(ymm11, ymm0); ymm15 = _mm256_mul_pd(ymm15, ymm0); _mm256_storeu_pd((double *)(b11+ cs_b_offset[1]), ymm11); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1] + D_NR), ymm15); //store(B11[4-7][0]) } } } if(i<0) i += D_NR; if((m & 4)) ///implementation for remainder rows(when m_remainder is greater than 4) { for(j = (n-D_NR); (j+1) > 0; j -=D_NR) //loop along n direction { a01 = L + (j+D_NR)*cs_a + (j); //pointer to block of A to be used for GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i + (j+D_NR)*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of times GEMM operations to be performed(in blocks of 4x4) ///GEMM for previous blocks /// ///load 4x4 block of b11 ymm0 = _mm256_loadu_pd((double const *)b11); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = _mm256_loadu_pd((double const 
*)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm12 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[0][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[0][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm12 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[1][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[1][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B10[0][1]*A01[1][0] B10[1][1]*A01[1][0] B10[2][1]*A01[1][0] B10[3][1]*A01[1][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B10[0][1]*A01[1][1] B10[1][1]*A01[1][1] B10[2][1]*A01[1][1] B10[3][1]*A01[1][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B10[0][1]*A01[1][2] B10[1][1]*A01[1][2] B10[2][1]*A01[1][2] B10[3][1]*A01[1][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm12 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[2][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[2][1] ymm14 = 
_mm256_broadcast_sd((double const *)(a01 + 2)); //A01[2][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B10[0][2]*A01[2][0] B10[1][2]*A01[2][0] B10[2][2]*A01[2][0] B10[3][2]*A01[2][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B10[0][2]*A01[2][1] B10[1][2]*A01[2][1] B10[2][2]*A01[2][1] B10[3][2]*A01[2][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B10[0][2]*A01[2][2] B10[1][2]*A01[2][2] B10[2][2]*A01[2][2] B10[3][2]*A01[2][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm12 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[3][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[3][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B10[0][3]*A01[3][0] B10[1][3]*A01[3][0] B10[2][3]*A01[3][0] B10[3][3]*A01[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B10[0][3]*A01[3][1] B10[1][3]*A01[3][1] B10[2][3]*A01[3][1] B10[3][3]*A01[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B10[0][3]*A01[3][2] B10[1][3]*A01[3][2] B10[2][3]*A01[3][2] B10[3][3]*A01[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR*cs_a; //pointer math to find next block of A for GEMM } ///GEMM code end/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm0 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[x][0] -=ymm4 ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[x][1] -= ymm5 ymm2 = _mm256_fmsub_pd(ymm2, 
ymm15, ymm6); //B11[x][2] -= ymm6 ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //1st col ymm4 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][0] a11 += cs_a; //2nd col ymm5 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] ymm8 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] a11 += cs_a; //3rd col ymm6 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] ymm9 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] ymm11 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] a11 += cs_a; //4th col ymm7 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] ymm12 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] ymm13 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[0][1] ymm14 = _mm256_broadcast_sd((double const *)&ones); //compute reciprocals of A(i,i) and broadcast in registers ymm4 = _mm256_unpacklo_pd(ymm4, ymm8); //A11[0][0] A11[1][1] A11[0][0] A11[1][1] ymm8 = _mm256_unpacklo_pd(ymm11, ymm13); //A11[2][2] A11[3][3] A11[2][2] A11[3][3] ymm15 = _mm256_blend_pd(ymm4, ymm8, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm14 = _mm256_div_pd(ymm14, ymm15); // 1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] //extract a33 ymm15 = _mm256_permute_pd(ymm14, 0x0C); //(1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //(1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm3 = _mm256_mul_pd(ymm3, ymm15); //extract a22 ymm15 = _mm256_permute_pd(ymm14, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2]) //(Row 3): FMA operations ymm2 = _mm256_fnmadd_pd(ymm3, ymm12, ymm2); ymm1 = _mm256_fnmadd_pd(ymm3, ymm10, ymm1); ymm0 = _mm256_fnmadd_pd(ymm3, ymm7, ymm0); ymm2 = _mm256_mul_pd(ymm2, ymm15); //extract a11 ymm15 = 
_mm256_permute_pd(ymm14, 0x03); //(1/A11[1][1] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x00); //(1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1]) //(ROW 2): FMA operations ymm1 = _mm256_fnmadd_pd(ymm2, ymm9, ymm1); ymm0 = _mm256_fnmadd_pd(ymm2, ymm6, ymm0); ymm1 = _mm256_mul_pd(ymm1, ymm15); //extract A00 ymm15 = _mm256_permute_pd(ymm14, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[0][0] 1/A11[0][0]) //(Row 1):FMA operations ymm0 = _mm256_fnmadd_pd(ymm1, ymm5, ymm0); ymm0 = _mm256_mul_pd(ymm0, ymm15); _mm256_storeu_pd((double *)b11, ymm0); //store(B11[x][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm1); //store(B11[x][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm2); //(store(B11[x][2])) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1]), ymm3); //store(B11[x][3]) } if(n_remainder) //implementation for remainder columns(when n is not a multiple of D_NR) { a01 = L + (j+D_NR)*cs_a + (j); //pointer to block of A to be used for GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i + (j+D_NR)*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of times GEMM operations to be performed(in blocks of 4x4) ymm16 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ///GEMM for previous blocks /// ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///load 4x4 block of b11 if(3 == n_remainder) { ymm1 = _mm256_loadu_pd((double const *)b11+ cs_b); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ///GEMM implementation starts/// 
for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[0][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[1][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B10[0][1]*A01[1][1] B10[1][1]*A01[1][1] B10[2][1]*A01[1][1] B10[3][1]*A01[1][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B10[0][1]*A01[1][2] B10[1][1]*A01[1][2] B10[2][1]*A01[1][2] B10[3][1]*A01[1][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[2][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 
2)); //A01[2][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B10[0][2]*A01[2][1] B10[1][2]*A01[2][1] B10[2][2]*A01[2][1] B10[3][2]*A01[2][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B10[0][2]*A01[2][2] B10[1][2]*A01[2][2] B10[2][2]*A01[2][2] B10[3][2]*A01[2][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[3][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B10[0][3]*A01[3][1] B10[1][3]*A01[3][1] B10[2][3]*A01[3][1] B10[3][3]*A01[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B10[0][3]*A01[3][2] B10[1][3]*A01[3][2] B10[2][3]*A01[3][2] B10[3][3]*A01[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code end/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[x][1] -= ymm5 ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[x][2] -= ymm6 ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //1st col ymm4 = _mm256_broadcast_sd((double const *)(&ones)); //A11[0][0] a11 += cs_a; //2nd col ymm8 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] a11 += cs_a; //3rd col ymm9 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] ymm11 = _mm256_broadcast_sd((double const 
*)(a11+2)); //A11[0][1] a11 += cs_a; //4th col ymm10 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] ymm12 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] ymm13 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[0][1] ymm14 = _mm256_broadcast_sd((double const *)&ones); //compute reciprocals of A(i,i) and broadcast in registers ymm4 = _mm256_unpacklo_pd(ymm4, ymm8); //A11[0][0] A11[1][1] A11[0][0] A11[1][1] ymm8 = _mm256_unpacklo_pd(ymm11, ymm13); //A11[2][2] A11[3][3] A11[2][2] A11[3][3] ymm15 = _mm256_blend_pd(ymm4, ymm8, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm14 = _mm256_div_pd(ymm14, ymm15); // 1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] //extract a33 ymm15 = _mm256_permute_pd(ymm14, 0x0C); //(1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //(1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm3 = _mm256_mul_pd(ymm3, ymm15); //extract a22 ymm15 = _mm256_permute_pd(ymm14, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 1/A11[2][2]) //(Row 3): FMA operations ymm2 = _mm256_fnmadd_pd(ymm3, ymm12, ymm2); ymm1 = _mm256_fnmadd_pd(ymm3, ymm10, ymm1); ymm2 = _mm256_mul_pd(ymm2, ymm15); //extract a11 ymm15 = _mm256_permute_pd(ymm14, 0x03); //(1/A11[1][1] 1/A11[1][1] 1/A11[2][2] 1/A11[2][2]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x00); //(1/A11[1][1] 1/A11[1][1] 1/A11[1][1] 1/A11[1][1]) //(ROW 2): FMA operations ymm1 = _mm256_fnmadd_pd(ymm2, ymm9, ymm1); ymm1 = _mm256_mul_pd(ymm1, ymm15); _mm256_storeu_pd((double *)(b11 + cs_b), ymm1); //store(B11[x][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm2); //(store(B11[x][2])) _mm256_storeu_pd((double *)(b11 + cs_b*3), ymm3); //store(B11[x][0]) } else if(2 == n_remainder) { ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm3 = _mm256_loadu_pd((double const *)(b11 
+ cs_b * 3)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B10[0][1]*A01[1][2] B10[1][1]*A01[1][2] B10[2][1]*A01[1][2] B10[3][1]*A01[1][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[2][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B10[0][2]*A01[2][2] B10[1][2]*A01[2][2] B10[2][2]*A01[2][2] B10[3][2]*A01[2][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] 
B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B10[0][3]*A01[3][2] B10[1][3]*A01[3][2] B10[2][3]*A01[3][2] B10[3][3]*A01[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code end/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[x][2] -= ymm6 ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //1st col a11 += 2 * cs_a; //3rd col ymm11 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] a11 += cs_a; //4th col ymm12 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] ymm13 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[0][1] ymm14 = _mm256_broadcast_sd((double const *)&ones); //compute reciprocals of A(i,i) and broadcast in registers ymm8 = _mm256_unpacklo_pd(ymm11, ymm13); //A11[2][2] A11[3][3] A11[2][2] A11[3][3] ymm15 = _mm256_blend_pd(ymm14, ymm8, 0x0C); //A11[0][0] A11[1][1] A11[2][2] A11[3][3] ymm14 = _mm256_div_pd(ymm14, ymm15); // 1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] //extract a33 ymm15 = _mm256_permute_pd(ymm14, 0x0C); //(1/A11[0][0] 1/A11[0][0] 1/A11[3][3] 1/A11[3][3]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //(1/A11[3][3] 1/A11[3][3] 1/A11[3][3] 1/A11[3][3]) ymm3 = _mm256_mul_pd(ymm3, ymm15); //extract a22 ymm15 = _mm256_permute_pd(ymm14, 0x00); //(1/A11[0][0] 1/A11[0][0] 1/A11[2][2] 1/A11[2][2]) ymm15 = _mm256_permute2f128_pd(ymm15, ymm15, 0x11); //(1/A11[2][2] 1/A11[2][2] 1/A11[2][2] 
1/A11[2][2]) //(Row 3): FMA operations ymm2 = _mm256_fnmadd_pd(ymm3, ymm12, ymm2); ymm2 = _mm256_mul_pd(ymm2, ymm15); _mm256_storeu_pd((double *)(b11+ cs_b * 2), ymm2); //store(B11[x][0]) _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3); //store(B11[x][1]) } else if(1 == n_remainder) { ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) 
b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code end/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// a11 += 3 * cs_a; //4th col ymm13 = _mm256_broadcast_sd((double const *)(a11+3)); //A11[0][1] ymm14 = _mm256_broadcast_sd((double const *)&ones); //compute reciprocals of A(i,i) and broadcast in registers ymm14 = _mm256_div_pd(ymm14, ymm13); // 1/A11[0][0] 1/A11[1][1] 1/A11[2][2] 1/A11[3][3] ymm3 = _mm256_mul_pd(ymm3, ymm14); _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3); //store(B11[x][0]) } } m_remainder -= 4; i -= 4; } if(m_remainder) ///implementation for remainder rows { dtrsm_small_XAutB(L, B, AlphaVal, m_remainder, n, cs_a, cs_b); } return BLIS_SUCCESS; } /*implements TRSM for the case X * transpose(A) = alpha * B *A is stored upper triangular, unit-diagonal, and applied transposed (the effective operand is lower triangular; the diagonal of A is never read) *dimensions: X:mxn A:nxn B: mxn */ /* <---b11 <---a11 ***************** * *b01*b11* * * * * ^ * * * * * ^ * * | ***************** | ******* | * * * * * | * * * | * * * * * a01* * * b10 ***************** ************* * * * * * * * * * * * * * * * * * * ***************** ******************* */ static err_t bli_dtrsm_small_XAutB_unitDiag( side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ) { dim_t D_MR = 8; //block dimension along the rows dim_t D_NR = 4; //block dimension along the columns dim_t m = bli_obj_length(b); //number of rows dim_t n = bli_obj_width(b); //number of columns dim_t m_remainder = m & 7; //number of corner rows dim_t n_remainder = n & 3; //number of corner columns dim_t cs_a = bli_obj_col_stride(a); //column stride of matrix A dim_t cs_b = bli_obj_col_stride(b); //column stride of matrix B #ifdef BLIS_ENABLE_SMALL_MATRIX_ROME if((m < D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME && n >
D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME) ||(m > D_BLIS_SMALL_MATRIX_THRES_TRSM_XAUTB_ROME && n > D_BLIS_SMALL_MATRIX_THRES_TRSM_XALB_ROME_COL_PANEL_N) ) return BLIS_NOT_YET_IMPLEMENTED; #else if(bli_max(m,n) > D_BLIS_SMALL_MATRIX_THRES_TRSM_NAPLES) { return BLIS_NOT_YET_IMPLEMENTED; } #endif dim_t i, j, k; //loop variablse dim_t k_iter; //determines the number of GEMM operations to be done dim_t cs_b_offset[2]; //pre-calculated strides double AlphaVal = *(double *)AlphaObj->buffer; //value of Alpha double* restrict L = a->buffer; //pointer to matrix A double* restrict B = b->buffer; //pointer to matrix B double *a01, *a11, *b10, *b11; //pointers for GEMM and TRSM blocks double *ptr_a01_dup; cs_b_offset[0] = cs_b << 1; //cs_b_offset[0] = cs_b * 2; cs_b_offset[1] = cs_b_offset[0] + cs_b;//cs_b_offset[1] = cs_b * 3; //ymm scratch reginsters __m256d ymm0, ymm1, ymm2, ymm3; __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m256d ymm12, ymm13, ymm14, ymm15; __m256d ymm16; for(i = (m-D_MR); (i+1) > 0; i -= D_MR) //loop along 'M' direction { for(j = (n-D_NR); (j+1) > 0; j -= D_NR) //loop along 'N' direction { a01 = L + (j+D_NR)*cs_a +(j); //pointer to block of A to be used in GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i + (j+D_NR)*cs_b; //pointer to block of B to be used in GEMM b11 = B + (i) + (j)*cs_b; //pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of GEMM operations to be done(in blocks of 4x4) ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = _mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[0][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); 
//A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR)); //B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][0]*A01[0][0] B10[5][0]*A01[0][0] B10[6][0]*A01[0][0] B10[7][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[1][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next 
row of A ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][1]*A01[0][0] B10[1][1]*A01[0][0] B10[2][1]*A01[0][0] B10[3][1]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][1]*A01[0][0] B10[5][1]*A01[0][0] B10[6][1]*A01[0][0] B10[7][1]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[2][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A01 //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm0 = _mm256_fmadd_pd(ymm8, ymm12, ymm0); //ymm0 += (B10[0][2]*A01[0][0] 
B10[1][2]*A01[0][0] B10[2][2]*A01[0][0] B10[3][2]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm13, ymm4); //ymm4 += (B10[4][2]*A01[0][0] B10[5][2]*A01[0][0] B10[6][2]*A01[0][0] B10[7][2]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm8 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[3][0] ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A01 ymm0 = _mm256_fmadd_pd(ymm8, ymm14, ymm0); //ymm0 += (B10[0][3]*A01[0][0] B10[1][3]*A01[0][0] B10[2][3]*A01[0][0] B10[3][3]*A01[0][0]) ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm4 = _mm256_fmadd_pd(ymm8, ymm15, ymm4); //ymm4 += (B10[4][3]*A01[0][0] B10[5][3]*A01[0][0] 
B10[6][3]*A01[0][0] B10[7][3]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm16 = _mm256_broadcast_sd((double const *)&AlphaVal); //load 8x4 block of B11 ymm8 = _mm256_loadu_pd((double const *)b11); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm12 = _mm256_loadu_pd((double const *)(b11 + D_NR)); //B11[4][0] B11[5][0] B11[6][0] B11[7][0] ymm9 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4][1] B11[5][1] B11[6][1] B11[7][1] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4][2] B11[5][2] B11[6][2] B11[7][2] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4][3] B11[5][3] B11[6][3] B11[7][3] ymm8 = _mm256_fmsub_pd(ymm8, ymm16, ymm0); //B11[0-3][0] * alpha -= ymm0 ymm9 = _mm256_fmsub_pd(ymm9, ymm16, ymm1); //B11[4-7][0] * alpha-= ymm1 ymm10 = _mm256_fmsub_pd(ymm10, ymm16, ymm2); //B11[0-3][1] * alpha-= ymm2 ymm11 = _mm256_fmsub_pd(ymm11, ymm16, ymm3); //B11[4-7][1] * alpha -= ymm3 ymm12 = _mm256_fmsub_pd(ymm12, ymm16, ymm4); //B11[0-3][2] * alpha -= ymm4 ymm13 = _mm256_fmsub_pd(ymm13, ymm16, ymm5); //B11[4-7][2] * alpha -= ymm5 ymm14 = 
_mm256_fmsub_pd(ymm14, ymm16, ymm6); //B11[0-3][3] * alpha -= ymm6 ymm15 = _mm256_fmsub_pd(ymm15, ymm16, ymm7); //B11[4-7][3] * alpha -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// //1st col a11 += cs_a; //2nd col ymm1 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] a11 += cs_a; //3rd col ymm3 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] ymm4 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] a11 += cs_a; //4th col ymm2 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] ymm5 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] //(Row 3): FMA operations ymm10 = _mm256_fnmadd_pd(ymm11, ymm6, ymm10); ymm9 = _mm256_fnmadd_pd(ymm11, ymm5, ymm9); ymm8 = _mm256_fnmadd_pd(ymm11, ymm2, ymm8); //(Row 3): FMA operations ymm14 = _mm256_fnmadd_pd(ymm15, ymm6, ymm14); ymm13 = _mm256_fnmadd_pd(ymm15, ymm5, ymm13); ymm12 = _mm256_fnmadd_pd(ymm15, ymm2, ymm12); //(ROW 2): FMA operations ymm9 = _mm256_fnmadd_pd(ymm10, ymm4, ymm9); ymm8 = _mm256_fnmadd_pd(ymm10, ymm3, ymm8); ymm13 = _mm256_fnmadd_pd(ymm14, ymm4, ymm13); ymm12 = _mm256_fnmadd_pd(ymm14, ymm3, ymm12); //(Row 1):FMA operations ymm8 = _mm256_fnmadd_pd(ymm9, ymm1, ymm8); ymm12 = _mm256_fnmadd_pd(ymm13, ymm1, ymm12); _mm256_storeu_pd((double *)b11, ymm8); //store(B11[x][0]) _mm256_storeu_pd((double *)(b11 + D_NR), ymm12); //store(B11[x][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[x][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[x][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //(store(B11[x][2])) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14); //(store(B11[x][2])) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1]), ymm11); //store(B11[x][3]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1] + D_NR), ymm15); //store(B11[x][3]) } if(n_remainder) //implementation for remainder columns(when n is not multiple of D_NR) { a01 = 
L + (j+D_NR)*cs_a +(j); //pointer to block of A to be used in GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i + (j+D_NR)*cs_b; //pointer to block of B to be used in GEMM b11 = B + (i) + (j)*cs_b; //pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of GEMM operations to be done(in blocks of 4x4) ymm0 = _mm256_setzero_pd(); ymm1 = _mm256_setzero_pd(); ymm2 = _mm256_setzero_pd(); ymm3 = _mm256_setzero_pd(); ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); //load 8x4 block of B11 if(3 == n_remainder) { ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR)); //B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][0]*A01[0][1] B10[5][0]*A01[0][1] B10[6][0]*A01[0][1] B10[7][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, 
ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[1][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][1]*A01[0][1] B10[1][1]*A01[0][1] B10[2][1]*A01[0][1] B10[3][1]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][1]*A01[0][1] B10[5][1]*A01[0][1] B10[6][1]*A01[0][1] B10[7][1]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[2][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A01 //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const 
*)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm1 = _mm256_fmadd_pd(ymm9, ymm12, ymm1); //ymm1 += (B10[0][2]*A01[0][1] B10[1][2]*A01[0][1] B10[2][2]*A01[0][1] B10[3][2]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm13, ymm5); //ymm5 += (B10[4][2]*A01[0][1] B10[5][2]*A01[0][1] B10[6][2]*A01[0][1] B10[7][2]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm9 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[3][1] ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A01 ymm1 = _mm256_fmadd_pd(ymm9, ymm14, ymm1); //ymm1 += (B10[0][3]*A01[0][1] B10[1][3]*A01[0][1] B10[2][3]*A01[0][1] B10[3][3]*A01[0][1]) ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm5 = _mm256_fmadd_pd(ymm9, ymm15, ymm5); //ymm5 += (B10[4][3]*A01[0][1] B10[5][3]*A01[0][1] B10[6][3]*A01[0][1] B10[7][3]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * 
cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm8 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm9 = _mm256_loadu_pd((double const *)(b11+cs_b)); //B11[0-3][0] ymm13 = _mm256_loadu_pd((double const *)(b11 + cs_b + D_NR)); //B11[4-7][0] ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b*2)); //B11[0-3][1] ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b*2 + D_NR)); //B11[4-7][1] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0-3][2] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4-7][2] ymm9 = _mm256_fmsub_pd(ymm9, ymm8, ymm1); //B11[4-7][0] * alpha-= ymm1 ymm10 = _mm256_fmsub_pd(ymm10, ymm8, ymm2); //B11[0-3][1] * alpha-= ymm2 ymm11 = _mm256_fmsub_pd(ymm11, ymm8, ymm3); //B11[4-7][1] * alpha -= ymm3 ymm13 = _mm256_fmsub_pd(ymm13, ymm8, ymm5); //B11[4-7][2] * alpha -= ymm5 ymm14 = _mm256_fmsub_pd(ymm14, ymm8, ymm6); //B11[0-3][3] * alpha -= ymm6 ymm15 = _mm256_fmsub_pd(ymm15, ymm8, ymm7); //B11[4-7][3] * alpha -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// a11 += 2 * cs_a; //3rd col ymm4 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] a11 += cs_a; //4th col ymm5 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] //(Row 3): FMA operations ymm10 = _mm256_fnmadd_pd(ymm11, ymm6, ymm10); ymm9 = _mm256_fnmadd_pd(ymm11, ymm5, ymm9); //(Row 3): FMA operations ymm14 = _mm256_fnmadd_pd(ymm15, ymm6, ymm14); ymm13 = _mm256_fnmadd_pd(ymm15, ymm5, ymm13); //(ROW 2): FMA operations ymm9 = _mm256_fnmadd_pd(ymm10, ymm4, ymm9); ymm13 = _mm256_fnmadd_pd(ymm14, ymm4, ymm13); _mm256_storeu_pd((double *)(b11 + cs_b), ymm9); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b + D_NR), ymm13); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][2]) 
_mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14);//store(B11[4-7][2]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1]), ymm11); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1] + D_NR), ymm15); //store(B11[4-7][0]) } else if(2 == n_remainder) { ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR)); //B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][0]*A01[0][2] B10[5][0]*A01[0][2] B10[6][0]*A01[0][2] B10[7][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][1]*A01[0][2] B10[1][1]*A01[0][2] B10[2][1]*A01[0][2] B10[3][1]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] 
B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][1]*A01[0][2] B10[5][1]*A01[0][2] B10[6][1]*A01[0][2] B10[7][1]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[2][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A01 //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm2 = _mm256_fmadd_pd(ymm10, ymm12, ymm2); //ymm2 += (B10[0][2]*A01[0][2] B10[1][2]*A01[0][2] B10[2][2]*A01[0][2] B10[3][2]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm13, ymm6); //ymm6 += (B10[4][2]*A01[0][2] B10[5][2]*A01[0][2] B10[6][2]*A01[0][2] B10[7][2]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm10 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A01 ymm2 = _mm256_fmadd_pd(ymm10, ymm14, ymm2); //ymm2 += (B10[0][3]*A01[0][2] B10[1][3]*A01[0][2] B10[2][3]*A01[0][2] B10[3][3]*A01[0][2]) ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] 
B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm6 = _mm256_fmadd_pd(ymm10, ymm15, ymm6); //ymm6 += (B10[4][3]*A01[0][2] B10[5][3]*A01[0][2] B10[6][3]*A01[0][2] B10[7][3]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm8 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm10 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0])); ymm14 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[0] + D_NR)); //B11[4-7][0] ymm11 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0-3][1] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] + D_NR)); //B11[4-7][1] ymm10 = _mm256_fmsub_pd(ymm10, ymm8, ymm2); //B11[0-3][1] * alpha-= ymm2 ymm11 = _mm256_fmsub_pd(ymm11, ymm8, ymm3); //B11[4-7][1] * alpha -= ymm3 ymm14 = _mm256_fmsub_pd(ymm14, ymm8, ymm6); //B11[0-3][3] * alpha -= ymm6 ymm15 = _mm256_fmsub_pd(ymm15, ymm8, ymm7); //B11[4-7][3] * alpha -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// a11 += 3 * cs_a; //4th col ymm6 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] //(Row 3): FMA operations ymm10 = _mm256_fnmadd_pd(ymm11, ymm6, ymm10); //(Row 3): FMA operations ymm14 = _mm256_fnmadd_pd(ymm15, ymm6, ymm14); _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm10); //store(B11[0-3][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0] + D_NR), ymm14); //store(B11[4-7][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1]), ymm11); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1] + D_NR), ymm15); //store(B11[4-7][0]) } else if(1 == n_remainder) { ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //broadcast 1st row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + 
3)); //A01[0][3] a01 += cs_a; //move to next row //load 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm13 = _mm256_loadu_pd((double const *)(b10 + D_NR)); //B10[4][0] B10[5][0] B10[6][0] B10[7][0] ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b + D_NR)); //B10[4][1] B10[5][1] B10[6][1] B10[7][1] ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][0]*A01[0][3] B10[5][0]*A01[0][3] B10[6][0]*A01[0][3] B10[7][0]*A01[0][3]) //broadcast 2nd row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][1]*A01[0][3] B10[1][1]*A01[0][3] B10[2][1]*A01[0][3] B10[3][1]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][1]*A01[0][3] B10[5][1]*A01[0][3] B10[6][1]*A01[0][3] B10[7][1]*A01[0][3]) //broadcast 3rd row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A01 //load next 8x2 block of B10 ymm12 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //(B10[0][2] B10[1][2] B10[2][2] B10[3][2]) ymm13 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + D_NR)); //(B10[4][2] B10[5][2] B10[6][2] B10[7][2]) ymm14 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b)); //(B10[0][3] B10[1][3] B10[2][3] B10[3][3]) ymm15 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0] + cs_b + D_NR)); //(B10[4][3] B10[5][3] B10[6][3] B10[7][3]) ymm3 = _mm256_fmadd_pd(ymm11, ymm12, ymm3); //ymm3 += (B10[0][2]*A01[0][3] B10[1][2]*A01[0][3] B10[2][2]*A01[0][3] B10[3][2]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm13, ymm7); //ymm7 += (B10[4][2]*A01[0][3] B10[5][2]*A01[0][3] 
B10[6][2]*A01[0][3] B10[7][2]*A01[0][3]) //broadcast 4th row of A01 ymm11 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A01 ymm3 = _mm256_fmadd_pd(ymm11, ymm14, ymm3); //ymm3 += (B10[0][3]*A01[0][3] B10[1][3]*A01[0][3] B10[2][3]*A01[0][3] B10[3][3]*A01[0][3]) ymm7 = _mm256_fmadd_pd(ymm11, ymm15, ymm7); //ymm7 += (B10[4][3]*A01[0][3] B10[5][3]*A01[0][3] B10[6][3]*A01[0][3] B10[7][3]*A01[0][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code ends/// ymm8 = _mm256_broadcast_sd((double const *)&AlphaVal); ymm11 = _mm256_loadu_pd((double const *)(b11+cs_b_offset[1])); //B11[0-3][0] ymm15 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1] +D_NR)); //B11[4-7][0] ymm11 = _mm256_fmsub_pd(ymm11, ymm8, ymm3); //B11[4-7][1] * alpha -= ymm3 ymm15 = _mm256_fmsub_pd(ymm15, ymm8, ymm7); //B11[4-7][3] * alpha -= ymm7 _mm256_storeu_pd((double *)(b11+ cs_b_offset[1]), ymm11); //store(B11[0-3][0]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1] + D_NR), ymm15); //store(B11[4-7][0]) } } } if(i<0) i += D_NR; if((m & 4)) ///implementation for remainder rows(when m_remainder is greater than 4) { for(j = (n-D_NR); (j+1) > 0; j -=D_NR) //loop along n direction { a01 = L + (j+D_NR)*cs_a + (j); //pointer to block of A to be used for GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i + (j+D_NR)*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of times GEMM operations to be performed(in blocks of 4x4) ///GEMM for previous blocks /// ///load 4x4 block of b11 ymm0 = _mm256_loadu_pd((double const *)b11); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm2 = _mm256_loadu_pd((double const *)(b11 + 
cs_b_offset[0])); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b_offset[1])); //B11[0][3] B11[1][3] B11[2][3] B11[3][3] ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm12 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[0][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[0][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm8, ymm4); //ymm4 += (B10[0][0]*A01[0][0] B10[1][0]*A01[0][0] B10[2][0]*A01[0][0] B10[3][0]*A01[0][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm12 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[1][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[1][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 
+= cs_a; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm9, ymm4); //ymm4 += (B10[0][1]*A01[1][0] B10[1][1]*A01[1][0] B10[2][1]*A01[1][0] B10[3][1]*A01[1][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B10[0][1]*A01[1][1] B10[1][1]*A01[1][1] B10[2][1]*A01[1][1] B10[3][1]*A01[1][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B10[0][1]*A01[1][2] B10[1][1]*A01[1][2] B10[2][1]*A01[1][2] B10[3][1]*A01[1][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm12 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[2][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[2][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[2][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm10, ymm4); //ymm4 += (B10[0][2]*A01[2][0] B10[1][2]*A01[2][0] B10[2][2]*A01[2][0] B10[3][2]*A01[2][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B10[0][2]*A01[2][1] B10[1][2]*A01[2][1] B10[2][2]*A01[2][1] B10[3][2]*A01[2][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B10[0][2]*A01[2][2] B10[1][2]*A01[2][2] B10[2][2]*A01[2][2] B10[3][2]*A01[2][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm12 = _mm256_broadcast_sd((double const *)(a01 + 0)); //A01[3][0] ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[3][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A ymm4 = _mm256_fmadd_pd(ymm12, ymm11, ymm4); //ymm4 += (B10[0][3]*A01[3][0] B10[1][3]*A01[3][0] B10[2][3]*A01[3][0] B10[3][3]*A01[3][0]) ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B10[0][3]*A01[3][1] 
B10[1][3]*A01[3][1] B10[2][3]*A01[3][1] B10[3][3]*A01[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B10[0][3]*A01[3][2] B10[1][3]*A01[3][2] B10[2][3]*A01[3][2] B10[3][3]*A01[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + D_NR*cs_a; //pointer math to find next block of A for GEMM } ///GEMM code end/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm0 = _mm256_fmsub_pd(ymm0, ymm15, ymm4); //B11[x][0] -=ymm4 ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[x][1] -= ymm5 ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[x][2] -= ymm6 ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// a11 += cs_a; //2nd col ymm5 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] a11 += cs_a; //3rd col ymm6 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] ymm9 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] a11 += cs_a; //4th col ymm7 = _mm256_broadcast_sd((double const *)(a11+0)); //A11[0][1] ymm10 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] ymm12 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] //(Row 3): FMA operations ymm2 = _mm256_fnmadd_pd(ymm3, ymm12, ymm2); ymm1 = _mm256_fnmadd_pd(ymm3, ymm10, ymm1); ymm0 = _mm256_fnmadd_pd(ymm3, ymm7, ymm0); //(ROW 2): FMA operations ymm1 = _mm256_fnmadd_pd(ymm2, ymm9, ymm1); ymm0 = _mm256_fnmadd_pd(ymm2, ymm6, ymm0); //(Row 1):FMA operations ymm0 = _mm256_fnmadd_pd(ymm1, ymm5, ymm0); _mm256_storeu_pd((double *)b11, ymm0); //store(B11[x][0]) _mm256_storeu_pd((double *)(b11 + cs_b), ymm1); //store(B11[x][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm2); //(store(B11[x][2])) _mm256_storeu_pd((double *)(b11 + cs_b_offset[1]), ymm3); //store(B11[x][3]) } if(n_remainder) //implementation 
for remainder columns(when n is not a multiple of D_NR) { a01 = L + (j+D_NR)*cs_a + (j); //pointer to block of A to be used for GEMM a11 = L + j*cs_a + j; //pointer to block of A to be used for TRSM b10 = B + i + (j+D_NR)*cs_b; //pointer to block of B to be used for GEMM b11 = B + i + j*cs_b; //pointer to block of B to be used for TRSM k_iter = (n-j-D_NR) / D_NR; //number of times GEMM operations to be performed(in blocks of 4x4) ///GEMM for previous blocks /// ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ///load 4x4 block of b11 if(3 == n_remainder) { ymm1 = _mm256_loadu_pd((double const *)b11+ cs_b); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //B11[0][2] B11[1][2] B11[2][2] B11[3][2] ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[0][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm8, ymm5); //ymm5 += (B10[0][0]*A01[0][1] B10[1][0]*A01[0][1] B10[2][0]*A01[0][1] B10[3][0]*A01[0][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) 
ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[1][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm9, ymm5); //ymm5 += (B10[0][1]*A01[1][1] B10[1][1]*A01[1][1] B10[2][1]*A01[1][1] B10[3][1]*A01[1][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B10[0][1]*A01[1][2] B10[1][1]*A01[1][2] B10[2][1]*A01[1][2] B10[3][1]*A01[1][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[2][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[2][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm10, ymm5); //ymm5 += (B10[0][2]*A01[2][1] B10[1][2]*A01[2][1] B10[2][2]*A01[2][1] B10[3][2]*A01[2][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B10[0][2]*A01[2][2] B10[1][2]*A01[2][2] B10[2][2]*A01[2][2] B10[3][2]*A01[2][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm13 = _mm256_broadcast_sd((double const *)(a01 + 1)); //A01[3][1] ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A ymm5 = _mm256_fmadd_pd(ymm13, ymm11, ymm5); //ymm5 += (B10[0][3]*A01[3][1] B10[1][3]*A01[3][1] B10[2][3]*A01[3][1] B10[3][3]*A01[3][1]) ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B10[0][3]*A01[3][2] B10[1][3]*A01[3][2] 
B10[2][3]*A01[3][2] B10[3][3]*A01[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code end/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm1 = _mm256_fmsub_pd(ymm1, ymm15, ymm5); //B11[x][1] -= ymm5 ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[x][2] -= ymm6 ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// a11 += 2 * cs_a; //3rd col ymm9 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] a11 += cs_a; //4th col ymm10 = _mm256_broadcast_sd((double const *)(a11+1)); //A11[0][1] ymm12 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] //(Row 3): FMA operations ymm2 = _mm256_fnmadd_pd(ymm3, ymm12, ymm2); ymm1 = _mm256_fnmadd_pd(ymm3, ymm10, ymm1); //(ROW 2): FMA operations ymm1 = _mm256_fnmadd_pd(ymm2, ymm9, ymm1); _mm256_storeu_pd((double *)(b11 + cs_b), ymm1); //store(B11[x][1]) _mm256_storeu_pd((double *)(b11 + cs_b_offset[0]), ymm2); //(store(B11[x][2])) _mm256_storeu_pd((double *)(b11 + cs_b*3), ymm3); //store(B11[x][0]) } else if(2 == n_remainder) { ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //B11[0][1] B11[1][1] B11[2][1] B11[3][1] ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = 
_mm256_loadu_pd((double const *)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[0][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm8, ymm6); //ymm6 += (B10[0][0]*A01[0][2] B10[1][0]*A01[0][2] B10[2][0]*A01[0][2] B10[3][0]*A01[0][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[1][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm9, ymm6); //ymm6 += (B10[0][1]*A01[1][2] B10[1][1]*A01[1][2] B10[2][1]*A01[1][2] B10[3][1]*A01[1][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[2][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm10, ymm6); //ymm6 += (B10[0][2]*A01[2][2] B10[1][2]*A01[2][2] B10[2][2]*A01[2][2] B10[3][2]*A01[2][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm14 = _mm256_broadcast_sd((double const *)(a01 + 2)); //A01[3][2] ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A ymm6 = _mm256_fmadd_pd(ymm14, ymm11, ymm6); //ymm6 += (B10[0][3]*A01[3][2] B10[1][3]*A01[3][2] B10[2][3]*A01[3][2] B10[3][3]*A01[3][2]) ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) b10 += D_NR * 
cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code end/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm2 = _mm256_fmsub_pd(ymm2, ymm15, ymm6); //B11[x][2] -= ymm6 ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 ///implement TRSM/// ///read 4x4 block of A11/// a11 += 3 * cs_a; //4th col ymm12 = _mm256_broadcast_sd((double const *)(a11+2)); //A11[0][1] //(Row 3): FMA operations ymm2 = _mm256_fnmadd_pd(ymm3, ymm12, ymm2); _mm256_storeu_pd((double *)(b11+ cs_b * 2), ymm2); //store(B11[x][0]) _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3); //store(B11[x][1]) } else if(1 == n_remainder) { ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); //B11[0][0] B11[1][0] B11[2][0] B11[3][0] ///GEMM implementation starts/// for(k = 0; k < k_iter; k++) //loop for number of GEMM operations { ptr_a01_dup = a01; //load 4x4 bblock of b10 ymm8 = _mm256_loadu_pd((double const *)b10); //B10[0][0] B10[1][0] B10[2][0] B10[3][0] ymm9 = _mm256_loadu_pd((double const *)(b10 + cs_b)); //B10[0][1] B10[1][1] B10[2][1] B10[3][1] ymm10 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[0])); //B10[0][2] B10[1][2] B10[2][2] B10[3][2] ymm11 = _mm256_loadu_pd((double const *)(b10 + cs_b_offset[1])); //B10[0][3] B10[1][3] B10[2][3] B10[3][3] //broadcast 1st row of A01 ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[0][3] a01 += cs_a; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm8, ymm7); //ymm7 += (B10[0][0]*A01[0][3] B10[1][0]*A01[0][3] B10[2][0]*A01[0][3] B10[3][0]*A01[0][3]) //broadcast 2nd row of A01 ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[1][3] a01 += cs_a; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm9, ymm7); //ymm7 += (B10[0][1]*A01[1][3] B10[1][1]*A01[1][3] B10[2][1]*A01[1][3] B10[3][1]*A01[1][3]) //braodcast 3rd row of A01 ymm15 = _mm256_broadcast_sd((double const 
*)(a01 + 3)); //A01[2][3] a01 += cs_a; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm10, ymm7); //ymm7 += (B10[0][2]*A01[2][3] B10[1][2]*A01[2][3] B10[2][2]*A01[2][3] B10[3][2]*A01[2][3]) //broadcast 4th row of A01 ymm15 = _mm256_broadcast_sd((double const *)(a01 + 3)); //A01[3][3] a01 += cs_a; //move to next row of A ymm7 = _mm256_fmadd_pd(ymm15, ymm11, ymm7); //ymm7 += (B10[0][3]*A01[3][3] B10[1][3]*A01[3][3] B10[2][3]*A01[3][3] B10[3][3]*A01[3][3]) b10 += D_NR * cs_b; //pointer math to find next block of B for GEMM a01 = ptr_a01_dup + (D_NR * cs_a); //pointer math to find next block of A for GEMM } ///GEMM code end/// ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); //register to store alpha ymm3 = _mm256_fmsub_pd(ymm3, ymm15, ymm7); //B11[x][3] -= ymm7 _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3); //store(B11[x][0]) } } m_remainder -= 4; i -= 4; } if(m_remainder) ///implementation for remainder rows { dtrsm_small_XAutB_unitDiag(L, B, AlphaVal, m_remainder, n, cs_a, cs_b); } return BLIS_SUCCESS; } /* * AX = Alpha*B, Single precision, A: lower triangular * This kernel implementation supports matrices A and B such that m is equal to BLI_AlXB_M_SP and n is mutiple of 8 */ static err_t bli_strsm_small_AlXB ( side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ) { obj_t alpha, beta; // gemm parameters obj_t Ga, Gb, Gc; // for GEMM int m = bli_obj_length(b); // number of rows of matrix B int n = bli_obj_width(b); // number of columns of matrix B int lda = bli_obj_col_stride(a); // column stride of A int ldb = bli_obj_col_stride(b); // column stride of B int rsa = bli_obj_row_stride(a); // row stride of A int rsb = bli_obj_row_stride(b); // row stride of B int i = 0; int j; int blk_size = 8; int isUnitDiag = bli_obj_has_unit_diag(a); float alphaVal; float* restrict L = a->buffer; float* restrict B = b->buffer; if (m != BLI_AlXB_M_SP || (n&7) != 0) { return BLIS_NOT_YET_IMPLEMENTED; } if ( (m*(m + n)) > 
BLIS_SMALL_MATRIX_THRES_TRSM ) { return BLIS_NOT_YET_IMPLEMENTED; } alphaVal = *((float *)bli_obj_buffer_for_const(BLIS_FLOAT, AlphaObj)); /* Small _GEMM preparation code */ bli_obj_create( BLIS_FLOAT, 1, 1, 0, 0, &alpha ); bli_obj_create( BLIS_FLOAT, 1, 1, 0, 0, &beta ); /* B = B - A*B */ bli_setsc( -(1.0), 0.0, &alpha ); bli_setsc( (1.0), 0.0, &beta ); bli_obj_create_with_attached_buffer( BLIS_FLOAT, blk_size, blk_size, a->buffer, rsa, lda, &Ga); bli_obj_create_with_attached_buffer( BLIS_FLOAT, blk_size, n, b->buffer, rsb, ldb, &Gb); bli_obj_create_with_attached_buffer( BLIS_FLOAT, blk_size, n, b->buffer, rsb, ldb, &Gc); bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, &Ga ); bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, &Gb ); bli_obj_set_conjtrans( BLIS_NO_TRANSPOSE, &Gc ); //first block of trsm Gb.buffer = (void*)(B + i); //trsm of first 8xn block if (alphaVal != 1) { if (isUnitDiag == 0) { blis_strsm_microkernel_alpha((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); fp_blis_strsm_microkernel = blis_strsm_microkernel; } else { blis_strsm_microkernel_alpha_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); fp_blis_strsm_microkernel = blis_strsm_microkernel_unitDiag; } bli_setsc( alphaVal, 0.0, &beta ); } else { if (isUnitDiag == 0) { blis_strsm_microkernel((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); fp_blis_strsm_microkernel = blis_strsm_microkernel; } else { blis_strsm_microkernel_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); fp_blis_strsm_microkernel = blis_strsm_microkernel_unitDiag; } } //gemm update for (j = i + blk_size; j < m; j += blk_size) // for rows upto multiple of BLOCK_HEIGHT { Ga.buffer = (void*)(L + j + i*lda); Gc.buffer = (void*)(B + j); bli_gemm_small(&alpha, &Ga, &Gb, &beta, &Gc, cntx, cntl ); // Gc = beta*Gc + alpha*Ga *Gb } //trsm of remaining blocks for (i = blk_size; i < m; i += blk_size) { Gb.buffer = (void*)(B + i); fp_blis_strsm_microkernel((L + i * lda + i), (B + i), m, n, rsa, 
rsb, lda, ldb); for (j = i + blk_size; j < m; j += blk_size) // for rows upto multiple of BLOCK_HEIGHT { Ga.buffer = (void*)(L + j + i*lda); Gc.buffer = (void*)(B + j); bli_gemm_small(&alpha, &Ga, &Gb, &beta, &Gc, cntx, cntl ); // Gc = beta*Gc + alpha*Ga *Gb } } // End of for loop - i return BLIS_SUCCESS; } /* * XA' = Alpha*B, Single precision, A: lower triangular * This kernel implementation supports matrices A and B such that * m and n are multiples of 8 and n is less than or equal to BLI_XAltB_N_SP */ static err_t bli_strsm_small_XAltB( side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ) { int m = bli_obj_length(a); // number of rows of matrix B int n = bli_obj_length(b); // number of columns of matrix B int lda = bli_obj_col_stride(a); // column stride of A int ldb = bli_obj_col_stride(b); // column stride of B int rsa = bli_obj_row_stride(a); // row stride of A int rsb = bli_obj_row_stride(b); // row stride of B int i = 0; int isUnitDiag = bli_obj_has_unit_diag(a); float alphaVal; float *L = a->buffer; float *B = b->buffer; if ((m&7) != 0 || (n&7) != 0) { return BLIS_NOT_YET_IMPLEMENTED; } if ( n > BLI_XAltB_N_SP || (m*(m + n)) > BLIS_SMALL_MATRIX_THRES_TRSM ) { return BLIS_NOT_YET_IMPLEMENTED; } alphaVal = *((float *)bli_obj_buffer_for_const(BLIS_FLOAT, AlphaObj)); if (alphaVal != 1) { if (isUnitDiag == 0) { trsm_XAtB_block_allSmallSizedMatrices_alpha((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); } else { trsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); } } else { if (isUnitDiag == 0) { trsm_XAtB_block_allSmallSizedMatrices((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); } else { trsm_XAtB_block_allSmallSizedMatrices_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); } } return BLIS_SUCCESS; } /* * A'X = Alpha*B, Single precision, A: upper triangular * This kernel implementation supports matrices A and B such that * m and n 
are multiples of 8, m is less than or equal to BLI_AutXB_M_SP and n is less than or equal to BLI_AutXB_N_SP */ static err_t bli_strsm_small_AutXB( side_t side, obj_t* AlphaObj, obj_t* a, obj_t* b, cntx_t* cntx, cntl_t* cntl ) { int m = bli_obj_width(a); // number of rows of matrix A (since At, so width is taken) int n = bli_obj_width(b); // number of columns of matrix B int lda = bli_obj_col_stride(a); // column stride of A int ldb = bli_obj_col_stride(b); // column stride of B int rsa = bli_obj_row_stride(a); // row stride of A int rsb = bli_obj_row_stride(b); // row stride of B int i = 0; int isUnitDiag = bli_obj_has_unit_diag(a); float alphaVal; float *L = a->buffer; float *B = b->buffer; if ((m&7) != 0 || (n&7) != 0) { return BLIS_NOT_YET_IMPLEMENTED; } if ( m > BLI_AutXB_M_SP || n > BLI_AutXB_N_SP || (m*(m + n)) > BLIS_SMALL_MATRIX_THRES_TRSM ) { return BLIS_NOT_YET_IMPLEMENTED; } alphaVal = *((float *)bli_obj_buffer_for_const(BLIS_FLOAT, AlphaObj)); if (alphaVal != 1) { if (isUnitDiag == 0) { trsm_AutXB_block_allSmallSizedMatrices_alpha((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); } else { trsm_AutXB_block_allSmallSizedMatrices_alpha_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb, alphaVal); } } else { if (isUnitDiag == 0) { trsm_AutXB_block_allSmallSizedMatrices((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); } else { trsm_AutXB_block_allSmallSizedMatrices_unitDiag((L + i * lda + i), (B + i), m, n, rsa, rsb, lda, ldb); } } return BLIS_SUCCESS; } ///////////////////////////// AX=B /////////////////////////////// static void blis_strsm_microkernel_alpha(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alphaVal) { float ones = 1.0; int j; int cs_b_offset[6]; //int row2, row4, row6; float *ptr_b_dup; //70 number of ymm(256 bits) registers used __m256 mat_b_col[8]; __m256 mat_b_rearr[8]; __m256 mat_a_cols[8]; __m256 mat_a_cols_rearr[36]; __m256 mat_a_diag_inv[8]; 
__m256 reciprocal_diags; __m256 alphaReg; cs_b_offset[0] = (cs_b << 1); cs_b_offset[1] = cs_b + cs_b_offset[0]; cs_b_offset[2] = (cs_b << 2); cs_b_offset[3] = cs_b + cs_b_offset[2]; cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; cs_b_offset[5] = cs_b + cs_b_offset[4]; //reciprocal_diags = _mm256_loadu_ps((float const *)ones); reciprocal_diags = _mm256_broadcast_ss((float const *)&ones); alphaReg = _mm256_broadcast_ss((float const *)&alphaVal); // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // //read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); //_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0); //row2 = (cs_l << 1); //row4 = (cs_l << 2); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); //_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); //_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); //_mm_prefetch((char*)(ptr_l + row2 + cs_l), _MM_HINT_T0); //row6 = row2 + row4; mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); //_mm_prefetch((char*)(ptr_l + row4), _MM_HINT_T0); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); //_mm_prefetch((char*)(ptr_l + row4 + cs_l), _MM_HINT_T0); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); //_mm_prefetch((char*)(ptr_l + row6), _MM_HINT_T0); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); //_mm_prefetch((char*)(ptr_l + row6 + cs_l), _MM_HINT_T0); //reciprocal_diags = _mm256_loadu_ps((float const *)ones); //read first set of 16x16 block of L, where 16 is the blk_height and 16 is the blk_width for L /*mat_a_cols[0] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[1] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; 
mat_a_cols[2] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[3] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[4] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[5] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[6] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[7] = _mm256_loadu_ps((float const *)ptr_l);*/ //Shuffle to rearrange/transpose 16x16 block of L into contiguous row-wise registers //tmpRegs[0] = _mm256_castps256_ps128(mat_a_cols[0]); //zero latency, no instruction added actually. //mat_a_cols_rearr[0] = _mm256_broadcastss_ps(tmpRegs[0]); //1st col mat_a_cols_rearr[0] = _mm256_broadcast_ss((float const *)(ptr_l+0)); mat_a_cols_rearr[1] = _mm256_broadcast_ss((float const *)(ptr_l+1)); mat_a_cols_rearr[3] = _mm256_broadcast_ss((float const *)(ptr_l+2)); mat_a_cols_rearr[6] = _mm256_broadcast_ss((float const *)(ptr_l+3)); mat_a_cols_rearr[10] = _mm256_broadcast_ss((float const *)(ptr_l+4)); mat_a_cols_rearr[15] = _mm256_broadcast_ss((float const *)(ptr_l+5)); mat_a_cols_rearr[21] = _mm256_broadcast_ss((float const *)(ptr_l+6)); mat_a_cols_rearr[28] = _mm256_broadcast_ss((float const *)(ptr_l+7)); //2nd col ptr_l += cs_l; mat_a_cols_rearr[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_cols_rearr[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_cols_rearr[7] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_cols_rearr[11] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_cols_rearr[16] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_cols_rearr[22] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[29] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //3rd col ptr_l += cs_l; mat_a_cols_rearr[5] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_cols_rearr[8] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_cols_rearr[12] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); 
mat_a_cols_rearr[17] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_cols_rearr[23] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[30] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //4rth col ptr_l += cs_l; mat_a_cols_rearr[9] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_cols_rearr[13] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_cols_rearr[18] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_cols_rearr[24] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[31] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //5th col ptr_l += cs_l; mat_a_cols_rearr[14] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_cols_rearr[19] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_cols_rearr[25] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[32] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //6th col ptr_l += cs_l; mat_a_cols_rearr[20] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_cols_rearr[26] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[33] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //7th col ptr_l += cs_l; mat_a_cols_rearr[27] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[34] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //7th col ptr_l += cs_l; mat_a_cols_rearr[35] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); numCols_b -= 8; // blk_width = 8 //compute reciprocals of L(i,i) and broadcast in registers mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[2]); mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_cols_rearr[5], mat_a_cols_rearr[9]); mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[20]); mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_cols_rearr[27], mat_a_cols_rearr[35]); //mat_a_diag_inv[1] = _mm256_permute_ps(mat_a_diag_inv[1], 0x55); //mat_a_diag_inv[3] = _mm256_permute_ps(mat_a_diag_inv[3], 0x55); mat_a_diag_inv[0] = 
_mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC); mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x20); //reciprocal of diagnol elements reciprocal_diags = _mm256_div_ps(reciprocal_diags, mat_a_diag_inv[0]); //Start loop for cols of B to be processed in size of blk_width for (j = 0; j < numCols_b; j += 8) { ptr_b_dup = ptr_b; /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ ////unpacklow//// mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); mat_b_rearr[0] = 
_mm256_mul_ps(mat_b_rearr[0], alphaReg); mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); ////unpackhigh//// mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); //Merge rearranged high elements into complete rows mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); mat_b_rearr[6] = 
_mm256_mul_ps(mat_b_rearr[6], alphaReg); mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); //extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); //extract diag a22 from a mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = 
_mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); //extract diag a33 from a mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(4, 4) 
element with 4rth row elements of B mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); //extract diag a66 from a mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); //--> Transpose and store results of columns of B block <--// 
////unpacklow//// mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); #else mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = 
_mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); //Read next set of B columns ptr_b += (cs_b + cs_b_offset[5]); mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); 
_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); //end loop of cols } //Last block trsm processing ptr_b_dup = ptr_b; /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ ////unpacklow//// mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); ////unpackhigh//// mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); mat_b_col[1] = 
_mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); //Merge rearranged high elements into complete rows mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); //extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 
0x00); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); //extract diag a22 from a mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); 
//extract diag a33 from a mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //(Row5): FMA operations of b5 with elements of indices from (5, 
0) uptill (7, 0) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); //extract diag a66 from a mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); //--> Transpose and store results of columns of B block <--// ////unpacklow//// mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 
mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); #else mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], 
mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); //end loop of cols } static void blis_strsm_microkernel_alpha_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alphaVal) { //float ones = 1.0; int j; int cs_b_offset[6]; //int row2, row4, row6; float *ptr_b_dup; //70 number of ymm(256 bits) registers used __m256 mat_b_col[8]; __m256 mat_b_rearr[8]; __m256 mat_a_cols[8]; __m256 mat_a_cols_rearr[36]; //__m256 mat_a_diag_inv[8]; //__m256 reciprocal_diags; __m256 alphaReg; cs_b_offset[0] = (cs_b << 1); cs_b_offset[1] = cs_b + cs_b_offset[0]; cs_b_offset[2] = (cs_b << 2); cs_b_offset[3] = cs_b + cs_b_offset[2]; cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; cs_b_offset[5] = cs_b + cs_b_offset[4]; //reciprocal_diags = _mm256_loadu_ps((float const *)ones); //reciprocal_diags = _mm256_broadcast_ss((float const *)&ones); alphaReg = _mm256_broadcast_ss((float 
const *)&alphaVal); // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // //read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); //_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0); //row2 = (cs_l << 1); //row4 = (cs_l << 2); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); //_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); //_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); //_mm_prefetch((char*)(ptr_l + row2 + cs_l), _MM_HINT_T0); //row6 = row2 + row4; mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); //_mm_prefetch((char*)(ptr_l + row4), _MM_HINT_T0); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); //_mm_prefetch((char*)(ptr_l + row4 + cs_l), _MM_HINT_T0); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); //_mm_prefetch((char*)(ptr_l + row6), _MM_HINT_T0); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); //_mm_prefetch((char*)(ptr_l + row6 + cs_l), _MM_HINT_T0); //reciprocal_diags = _mm256_loadu_ps((float const *)ones); //read first set of 16x16 block of L, where 16 is the blk_height and 16 is the blk_width for L /*mat_a_cols[0] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[1] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[2] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[3] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[4] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[5] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[6] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[7] = _mm256_loadu_ps((float const *)ptr_l);*/ //Shuffle to 
rearrange/transpose 16x16 block of L into contiguous row-wise registers //tmpRegs[0] = _mm256_castps256_ps128(mat_a_cols[0]); //zero latency, no instruction added actually. //mat_a_cols_rearr[0] = _mm256_broadcastss_ps(tmpRegs[0]); //1st col mat_a_cols_rearr[0] = _mm256_broadcast_ss((float const *)(ptr_l+0)); mat_a_cols_rearr[1] = _mm256_broadcast_ss((float const *)(ptr_l+1)); mat_a_cols_rearr[3] = _mm256_broadcast_ss((float const *)(ptr_l+2)); mat_a_cols_rearr[6] = _mm256_broadcast_ss((float const *)(ptr_l+3)); mat_a_cols_rearr[10] = _mm256_broadcast_ss((float const *)(ptr_l+4)); mat_a_cols_rearr[15] = _mm256_broadcast_ss((float const *)(ptr_l+5)); mat_a_cols_rearr[21] = _mm256_broadcast_ss((float const *)(ptr_l+6)); mat_a_cols_rearr[28] = _mm256_broadcast_ss((float const *)(ptr_l+7)); //2nd col ptr_l += cs_l; mat_a_cols_rearr[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_cols_rearr[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_cols_rearr[7] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_cols_rearr[11] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_cols_rearr[16] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_cols_rearr[22] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[29] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //3rd col ptr_l += cs_l; mat_a_cols_rearr[5] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_cols_rearr[8] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_cols_rearr[12] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_cols_rearr[17] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_cols_rearr[23] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[30] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //4rth col ptr_l += cs_l; mat_a_cols_rearr[9] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_cols_rearr[13] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_cols_rearr[18] = _mm256_broadcast_ss((float 
const *)(ptr_l + 5)); mat_a_cols_rearr[24] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[31] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //5th col ptr_l += cs_l; mat_a_cols_rearr[14] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_cols_rearr[19] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_cols_rearr[25] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[32] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //6th col ptr_l += cs_l; mat_a_cols_rearr[20] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_cols_rearr[26] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[33] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //7th col ptr_l += cs_l; mat_a_cols_rearr[27] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[34] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //8th col //ptr_l += cs_l; //mat_a_cols_rearr[35] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); numCols_b -= 8; // blk_width = 8 //compute reciprocals of L(i,i) and broadcast in registers //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[2]); //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_cols_rearr[5], mat_a_cols_rearr[9]); //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[20]); //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_cols_rearr[27], mat_a_cols_rearr[35]); //mat_a_diag_inv[1] = _mm256_permute_ps(mat_a_diag_inv[1], 0x55); //mat_a_diag_inv[3] = _mm256_permute_ps(mat_a_diag_inv[3], 0x55); //mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC); //mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC); //mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x20); //reciprocal of diagnol elements //reciprocal_diags = _mm256_div_ps(reciprocal_diags, mat_a_diag_inv[0]); //Start loop for cols of B to be processed in size of blk_width for (j = 0; j < 
numCols_b; j += 8) { ptr_b_dup = ptr_b; /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ ////unpacklow//// mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); ////unpackhigh//// mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], 
mat_b_col[5]); mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //extract diag a00 from a //mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); //mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B //mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); //Merge rearranged high elements into complete rows mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); //extract diag a11 from a //mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); //mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 
0) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B //mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); //extract diag a22 from a //mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); //mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B //mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); //extract diag a33 from a //mat_a_diag_inv[3] = 
_mm256_permute_ps(reciprocal_diags, 0xFF); //mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B //mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); //extract diag a44 from a //mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); //mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B //mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); //extract diag a55 from a //mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); //mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[5] 
= _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B //mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); //extract diag a66 from a //mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); //mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B //mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); //extract diag a77 from a //mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); //mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B //mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); //--> Transpose and store results of columns of B block <--// ////unpacklow//// mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_a_cols[4] = 
_mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); #else mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); 
mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); //Read next set of B columns ptr_b += (cs_b + cs_b_offset[5]); mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); //end loop of cols } //Last block trsm processing ptr_b_dup = ptr_b; /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ ////unpacklow//// mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); mat_b_rearr[1] = 
_mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); ////unpackhigh//// mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = 
_mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //extract diag a00 from a //mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); //mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B //mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); //Merge rearranged high elements into complete rows mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); //extract diag a11 from a //mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); //mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) 
mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B //mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); //extract diag a22 from a //mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); //mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B //mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); //extract diag a33 from a //mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); //mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3] = 
_mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B //mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); //extract diag a44 from a //mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); //mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B //mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); //extract diag a55 from a //mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); //mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = 
_mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B //mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); //extract diag a66 from a //mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); //mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B //mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); //extract diag a77 from a //mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); //mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B //mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); //--> Transpose and store results of columns of B block <--// ////unpacklow//// mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); mat_a_cols[7] = 
_mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); #else mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_a_cols[2] = 
/*
 * Split two registers into their "even" and "odd" 64-bit pairs per 128-bit lane:
 *   *first  = { a[0], a[1], b[0], b[1] }  (per lane)
 *   *second = { a[2], a[3], b[2], b[3] }  (per lane)
 * Both REARRANGE_SHFL variants produce the same values; they only differ in
 * the instruction mix (two shuffles vs. one shuffle plus two blends).
 */
static inline void blis_strsm_unitDiag_pair4(__m256 a, __m256 b, __m256 *first, __m256 *second)
{
#if REARRANGE_SHFL == 1
	*first = _mm256_shuffle_ps(a, b, 0x44);
	*second = _mm256_shuffle_ps(a, b, 0xEE);
#else
	__m256 mixed = _mm256_shuffle_ps(a, b, 0x4E); /* { a[2], a[3], b[0], b[1] } per lane */

	*first = _mm256_blend_ps(a, mixed, 0xCC);
	*second = _mm256_blend_ps(b, mixed, 0x33);
#endif
}

/*
 * Transpose an 8x8 float tile held in eight registers: out[r] receives
 * element r of every in[c], c = 0..7. `in` and `out` must not alias.
 */
static inline void blis_strsm_unitDiag_transpose8x8(const __m256 in[8], __m256 out[8])
{
	__m256 lo[4], hi[4], quad[8];
	int p;

	/* Interleave neighbouring registers: lo[] carries elements 0,1 | 4,5 and
	   hi[] carries elements 2,3 | 6,7 of each input pair. */
	for (p = 0; p < 4; ++p)
	{
		lo[p] = _mm256_unpacklo_ps(in[2 * p], in[2 * p + 1]);
		hi[p] = _mm256_unpackhi_ps(in[2 * p], in[2 * p + 1]);
	}

	/* Gather four inputs per 128-bit lane. */
	blis_strsm_unitDiag_pair4(lo[0], lo[1], &quad[0], &quad[1]);
	blis_strsm_unitDiag_pair4(lo[2], lo[3], &quad[2], &quad[3]);
	blis_strsm_unitDiag_pair4(hi[0], hi[1], &quad[4], &quad[5]);
	blis_strsm_unitDiag_pair4(hi[2], hi[3], &quad[6], &quad[7]);

	/* Merge 128-bit halves into complete rows: 0x20 joins the two low halves,
	   0x31 joins the two high halves. */
	out[0] = _mm256_permute2f128_ps(quad[0], quad[2], 0x20);
	out[4] = _mm256_permute2f128_ps(quad[0], quad[2], 0x31);
	out[1] = _mm256_permute2f128_ps(quad[1], quad[3], 0x20);
	out[5] = _mm256_permute2f128_ps(quad[1], quad[3], 0x31);
	out[2] = _mm256_permute2f128_ps(quad[4], quad[6], 0x20);
	out[6] = _mm256_permute2f128_ps(quad[4], quad[6], 0x31);
	out[3] = _mm256_permute2f128_ps(quad[5], quad[7], 0x20);
	out[7] = _mm256_permute2f128_ps(quad[5], quad[7], 0x31);
}

/* Load eight columns (8 contiguous floats each, cs_b floats apart) starting at col0. */
static inline void blis_strsm_unitDiag_load8(const float *col0, int cs_b, __m256 cols[8])
{
	int c;

	for (c = 0; c < 8; ++c)
		cols[c] = _mm256_loadu_ps(col0 + c * cs_b);
}

/* Store eight columns (8 contiguous floats each, cs_b floats apart) starting at col0. */
static inline void blis_strsm_unitDiag_store8(float *col0, int cs_b, const __m256 cols[8])
{
	int c;

	for (c = 0; c < 8; ++c)
		_mm256_storeu_ps(col0 + c * cs_b, cols[c]);
}

/*
 * Broadcast the 28 strictly-lower elements of the 8x8 block at ptr_l into
 * l_bcast[], column by column (column k contributes rows k+1..7).
 * Element (i, k) is read from ptr_l[k * cs_l + i]. The diagonal is never read.
 */
static inline void blis_strsm_unitDiag_load_l(const float *ptr_l, int cs_l, __m256 l_bcast[28])
{
	const float *l_col = ptr_l;
	int i, k, n = 0;

	for (k = 0; k < 7; ++k)
	{
		for (i = k + 1; i < 8; ++i)
			l_bcast[n++] = _mm256_broadcast_ss(l_col + i);
		l_col += cs_l;
	}
}

/*
 * Solve L * X = B for one 8x8 tile with an implicit unit diagonal.
 * b_cols[] holds the eight columns of B, x_cols[] receives the eight columns
 * of X. l_bcast[] must be in the order produced by blis_strsm_unitDiag_load_l,
 * which is exactly the order in which the elimination below consumes it.
 */
static inline void blis_strsm_unitDiag_solve8x8(const __m256 l_bcast[28], const __m256 b_cols[8], __m256 x_cols[8])
{
	__m256 rows[8];
	int i, k, n = 0;

	/* Work row-wise so that one register holds row i of all eight columns. */
	blis_strsm_unitDiag_transpose8x8(b_cols, rows);

	/* Forward substitution: once row k is final, eliminate it from every row
	   below. rows[0] needs no update and no row is scaled because the
	   diagonal is taken to be 1. Trip counts are compile-time constants, so
	   the compiler can fully unroll both loops. */
	for (k = 0; k < 7; ++k)
		for (i = k + 1; i < 8; ++i)
			rows[i] = _mm256_fnmadd_ps(l_bcast[n++], rows[k], rows[i]); /* rows[i] -= L(i,k) * rows[k] */

	/* Back to column layout for the store. */
	blis_strsm_unitDiag_transpose8x8(rows, x_cols);
}

/*
 * In-place triangular solve of B with an 8x8 lower-triangular block of L whose
 * diagonal is taken to be 1 (the stored diagonal and upper part are not read).
 *
 * ptr_l      : first element of the L block; element (i, k) is read from
 *              ptr_l[k * cs_l + i].
 * ptr_b      : first element of B; column c starts at ptr_b[c * cs_b] and its
 *              8 rows are contiguous. B is overwritten with the solution.
 * numCols_b  : number of columns of B. Columns are processed in blocks of 8;
 *              a count that is not a multiple of 8 is rounded up to the next
 *              full block, so B must provide storage for it. Nothing is read
 *              or written when numCols_b <= 0.
 * cs_l, cs_b : column strides of L and B, in floats.
 * numRows_lb, rs_l, rs_b : not used by this kernel (8 rows and unit row
 *              strides are implied by the contiguous 8-float loads/stores).
 */
static void blis_strsm_microkernel_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b)
{
	__m256 l_bcast[28];
	__m256 b_cols[8];
	__m256 x_cols[8];
	float *ptr_b_next;
	int remaining;
	int j;

	(void)numRows_lb;
	(void)rs_l;
	(void)rs_b;

	/* With no columns there is nothing to solve; without this check the code
	   below would still read and overwrite one full 8-column block. */
	if (numCols_b <= 0)
		return;

	blis_strsm_unitDiag_load8(ptr_b, cs_b, b_cols);
	blis_strsm_unitDiag_load_l(ptr_l, cs_l, l_bcast);

	/* All blocks except the last: the next block is loaded before the current
	   results are stored, so loads and stores are issued in the same order as
	   a software-pipelined, fully unrolled version of this loop would. */
	remaining = numCols_b - 8; /* blk_width = 8 */
	for (j = 0; j < remaining; j += 8)
	{
		blis_strsm_unitDiag_solve8x8(l_bcast, b_cols, x_cols);

		ptr_b_next = ptr_b + 8 * cs_b;
		blis_strsm_unitDiag_load8(ptr_b_next, cs_b, b_cols);
		blis_strsm_unitDiag_store8(ptr_b, cs_b, x_cols);
		ptr_b = ptr_b_next;
	}

	/* Last block: nothing further to prefetch. */
	blis_strsm_unitDiag_solve8x8(l_bcast, b_cols, x_cols);
	blis_strsm_unitDiag_store8(ptr_b, cs_b, x_cols);
}
__m256 mat_b_col[8]; __m256 mat_b_rearr[8]; __m256 mat_a_cols[8]; __m256 mat_a_cols_rearr[36]; __m256 mat_a_diag_inv[8]; __m256 reciprocal_diags; cs_b_offset[0] = (cs_b << 1); cs_b_offset[1] = cs_b + cs_b_offset[0]; cs_b_offset[2] = (cs_b << 2); cs_b_offset[3] = cs_b + cs_b_offset[2]; cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; cs_b_offset[5] = cs_b + cs_b_offset[4]; //reciprocal_diags = _mm256_loadu_ps((float const *)ones); reciprocal_diags = _mm256_broadcast_ss((float const *)&ones); // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // //read first set of 16x8 block of B into registers, where 16 is the blk_height and 8 is the blk_width for B mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); //_mm_prefetch((char*)(ptr_l + 0), _MM_HINT_T0); //row2 = (cs_l << 1); //row4 = (cs_l << 2); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); //_mm_prefetch((char*)(ptr_l + cs_l), _MM_HINT_T0); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); //_mm_prefetch((char*)(ptr_l + row2), _MM_HINT_T0); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); //_mm_prefetch((char*)(ptr_l + row2 + cs_l), _MM_HINT_T0); //row6 = row2 + row4; mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); //_mm_prefetch((char*)(ptr_l + row4), _MM_HINT_T0); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); //_mm_prefetch((char*)(ptr_l + row4 + cs_l), _MM_HINT_T0); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); //_mm_prefetch((char*)(ptr_l + row6), _MM_HINT_T0); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); //_mm_prefetch((char*)(ptr_l + row6 + cs_l), _MM_HINT_T0); //reciprocal_diags = _mm256_loadu_ps((float const *)ones); //read first set of 16x16 block of L, where 16 is the blk_height and 16 is the blk_width for L /*mat_a_cols[0] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[1] = 
_mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[2] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[3] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[4] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[5] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[6] = _mm256_loadu_ps((float const *)ptr_l); ptr_l += cs_l; mat_a_cols[7] = _mm256_loadu_ps((float const *)ptr_l);*/ //Shuffle to rearrange/transpose 16x16 block of L into contiguous row-wise registers //tmpRegs[0] = _mm256_castps256_ps128(mat_a_cols[0]); //zero latency, no instruction added actually. //mat_a_cols_rearr[0] = _mm256_broadcastss_ps(tmpRegs[0]); //1st col mat_a_cols_rearr[0] = _mm256_broadcast_ss((float const *)(ptr_l+0)); mat_a_cols_rearr[1] = _mm256_broadcast_ss((float const *)(ptr_l+1)); mat_a_cols_rearr[3] = _mm256_broadcast_ss((float const *)(ptr_l+2)); mat_a_cols_rearr[6] = _mm256_broadcast_ss((float const *)(ptr_l+3)); mat_a_cols_rearr[10] = _mm256_broadcast_ss((float const *)(ptr_l+4)); mat_a_cols_rearr[15] = _mm256_broadcast_ss((float const *)(ptr_l+5)); mat_a_cols_rearr[21] = _mm256_broadcast_ss((float const *)(ptr_l+6)); mat_a_cols_rearr[28] = _mm256_broadcast_ss((float const *)(ptr_l+7)); //2nd col ptr_l += cs_l; mat_a_cols_rearr[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_cols_rearr[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_cols_rearr[7] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_cols_rearr[11] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_cols_rearr[16] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_cols_rearr[22] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[29] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //3rd col ptr_l += cs_l; mat_a_cols_rearr[5] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_cols_rearr[8] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_cols_rearr[12] = 
_mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_cols_rearr[17] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_cols_rearr[23] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[30] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //4rth col ptr_l += cs_l; mat_a_cols_rearr[9] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_cols_rearr[13] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_cols_rearr[18] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_cols_rearr[24] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[31] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //5th col ptr_l += cs_l; mat_a_cols_rearr[14] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_cols_rearr[19] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_cols_rearr[25] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[32] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //6th col ptr_l += cs_l; mat_a_cols_rearr[20] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_cols_rearr[26] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[33] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //7th col ptr_l += cs_l; mat_a_cols_rearr[27] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_cols_rearr[34] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //7th col ptr_l += cs_l; mat_a_cols_rearr[35] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); numCols_b -= 8; // blk_width = 8 //compute reciprocals of L(i,i) and broadcast in registers mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[2]); mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_cols_rearr[5], mat_a_cols_rearr[9]); mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_cols_rearr[14], mat_a_cols_rearr[20]); mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_cols_rearr[27], mat_a_cols_rearr[35]); //mat_a_diag_inv[1] = _mm256_permute_ps(mat_a_diag_inv[1], 0x55); //mat_a_diag_inv[3] = 
_mm256_permute_ps(mat_a_diag_inv[3], 0x55); mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC); mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0x20); //reciprocal of diagnol elements reciprocal_diags = _mm256_div_ps(reciprocal_diags, mat_a_diag_inv[0]); //Start loop for cols of B to be processed in size of blk_width for (j = 0; j < numCols_b; j += 8) { ptr_b_dup = ptr_b; /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ ////unpacklow//// mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_rearr[5] = 
_mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); ////unpackhigh//// mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); //Merge rearranged high elements into complete rows mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); //extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //(Row1): FMA operations of b1 with elements of indices from (1, 0) 
uptill (7, 0) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); //extract diag a22 from a mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); //extract diag a33 from a mat_a_diag_inv[3] = 
_mm256_permute_ps(reciprocal_diags, 0xFF); mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[5] = 
_mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); //extract diag a66 from a mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); //--> Transpose and store results of columns of B block <--// ////unpacklow//// mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_a_cols[4] = 
_mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); #else mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); 
mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); //Read next set of B columns ptr_b += (cs_b + cs_b_offset[5]); mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + (cs_b))); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0])); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1])); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2])); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3])); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4])); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5])); //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); //end loop of cols } //Last block trsm processing ptr_b_dup = ptr_b; /*Shuffle to rearrange/transpose 16x8 block of B into contiguous row-wise registers*/ ////unpacklow//// mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); mat_b_rearr[1] = 
_mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); ////unpackhigh//// mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 
0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags, 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); //Merge rearranged high elements into complete rows mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); //extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags, 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_cols_rearr[1], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[3], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[6], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[10], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[15], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[21], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[28], mat_b_rearr[0], 
mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); //extract diag a22 from a mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags, 0xAA); mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_cols_rearr[4], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[7], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[11], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[16], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[22], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[29], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); //extract diag a33 from a mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags, 0xFF); mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_cols_rearr[8], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[12], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[17], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[23], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[30], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) 
//Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags, 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_cols_rearr[13], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[18], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[24], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[31], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags, 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_cols_rearr[19], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_cols_rearr[25], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[32], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); //extract diag a66 from a mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags, 0xAA); mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6] = 
_mm256_fnmadd_ps(mat_a_cols_rearr[26], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[33], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags, 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_cols_rearr[34], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); //--> Transpose and store results of columns of B block <--// ////unpacklow//// mat_a_cols[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_a_cols[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_a_cols[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_a_cols[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_a_cols[4] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x44); mat_a_cols[5] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0xEE); mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x44); mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0xEE); #else mat_a_cols[6] = _mm256_shuffle_ps(mat_a_cols[0], mat_a_cols[1], 0x4E); mat_a_cols[7] = _mm256_shuffle_ps(mat_a_cols[2], mat_a_cols[3], 0x4E); mat_a_cols[4] = _mm256_blend_ps(mat_a_cols[0], mat_a_cols[6], 0xCC); mat_a_cols[5] = _mm256_blend_ps(mat_a_cols[1], mat_a_cols[6], 0x33); mat_a_cols[6] = _mm256_blend_ps(mat_a_cols[2], mat_a_cols[7], 0xCC); mat_a_cols[7] = _mm256_blend_ps(mat_a_cols[3], mat_a_cols[7], 0x33); #endif //Merge rearranged low elements into complete 
rows mat_a_cols[0] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x20); mat_a_cols[4] = _mm256_permute2f128_ps(mat_a_cols[4], mat_a_cols[6], 0x31); mat_a_cols[1] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x20); mat_a_cols[5] = _mm256_permute2f128_ps(mat_a_cols[5], mat_a_cols[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_a_cols[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_a_cols[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_a_cols[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_a_cols[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_a_cols[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_a_cols[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_a_cols[2]); 
_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_a_cols[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_a_cols[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_a_cols[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_a_cols[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_a_cols[7]); //end loop of cols } #if OPT_CACHE_BLOCKING_L1 //new intrinsic kernels static void trsm_XAtB_block_allSmallSizedMatrices(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) { float ones = 1.0; int i, i1, i2, i3, i4, j, k, l, r; int cs_b_offset[7]; int cs_l_offset[7]; float *ptr_b_dup, *ptr_l_dup; //57 number of ymm(256 bits) registers used __m256 mat_b_col[8]; __m256 mat_b_rearr[8]; __m256 mat_a_blk_elems[8]; __m256 mat_a_diag_inv[8]; __m256 reciprocal_diags[2]; reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones)); // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // //L matrix offsets cs_l_offset[0] = (cs_l << 1); cs_l_offset[1] = cs_l + cs_l_offset[0]; cs_l_offset[2] = (cs_l << 2); cs_l_offset[3] = cs_l + cs_l_offset[2]; cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; cs_l_offset[5] = cs_l + cs_l_offset[4]; cs_l_offset[6] = (cs_l_offset[5] + cs_l); //read diag elems of L 16x16 block mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l); mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)ptr_l + cs_l); mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]); mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]); mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]); mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]); mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]); mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]); cs_b_offset[0] = (cs_b << 1); cs_b_offset[1] = cs_b + 
cs_b_offset[0]; cs_b_offset[2] = (cs_b << 2); cs_b_offset[3] = cs_b + cs_b_offset[2]; cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; cs_b_offset[5] = cs_b + cs_b_offset[4]; cs_b_offset[6] = (cs_b_offset[5] + cs_b); reciprocal_diags[1] = reciprocal_diags[0]; //pack first 8 diags together mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1 mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5 mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 //reciprocal of diagnal elements 0,1,2,3,4,5,6,7 reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]); //extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); //extract diag a22 from a mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); //extract diag a33 from a mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[3] = 
_mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); //extract diag a66 from a mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); /***************** first set of 8 rows of B processing starts *****************/ ptr_b_dup = ptr_b; i = 0; for (j = 0; j < numCols_b; j += 8) { /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A //read 8x8 block of B into registers mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_col[7] = 
_mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], mat_a_diag_inv[0]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], mat_a_diag_inv[1]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); 
mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], mat_a_diag_inv[2]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_col[3] = 
_mm256_mul_ps(mat_b_col[3], mat_a_diag_inv[3]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], mat_a_diag_inv[4]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], mat_a_diag_inv[5]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); //(Row6): FMA operations of b6 with 
elements of indices from (6, 0) uptill (7, 0) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], mat_a_diag_inv[6]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], mat_a_diag_inv[7]); //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]); //i += cs_b_offset[6]; //ptr_b_dup += cs_b_offset[6]; i += 8; ptr_b_dup += 8; } //c = 0; /***************** first set of 8 cols of B processing done *****************/ ptr_b_dup = ptr_b; i3 = 0; i1 = 0; //Start loop for cols of B to be processed in size of blk_width for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row { ptr_l += 8; //ptr_b += j; //ptr_b_dup += 8; ptr_b_dup += cs_b_offset[6]; i1 += cs_b_offset[6]; //Read next 8x8 block of A to get diag elements i3 += cs_l_offset[6]; mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l + i3); 
mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l); mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]); mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]); mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]); mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]); mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[4]); mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]); //pack 8 diags of A together reciprocal_diags[0] = reciprocal_diags[1]; mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1 mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5 mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 //reciprocal of diagnal elements of A :- 0,1,2,3,4,5,6,7 reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //mat_a_diag_inv2[0] = _mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]); //extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); //extract diag a22 from a 
mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); //extract diag a33 from a mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); //extract diag a66 from a mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); for (r = 0; r < numCols_b; r += GEMM_BLK_V1) { #if GEMM_ACCUM_A i = i1 + r; //Read 8 cols of B columns of Block-to-be-solved mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4] = _mm256_loadu_ps((float const 
*)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); #endif i = 0; i2 = 0; for (l = 0; l < j; l += 8) // move across m { //for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) { /////////////////// Partial Lower 8x8 block trsm of B ptr_l_dup = ptr_l; i4 = i2 + r; //Read current 8 cols of B columns from specified 8x8 current-block of B mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); //Broadcast A8,0 to A15,0 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); i4 = k >> 3; ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = 
_mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]); mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], mat_b_col[0]); mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]); mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], mat_b_col[0]); mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]); mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]); mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], mat_b_col[0]); mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]); #endif //Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; 
#if GEMM_ACCUM_A //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,2 to A15,2 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = 
_mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) 
mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,3 to A15,3 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) 
mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,4 to A15,4 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = 
_mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,5 to A15,5 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = 
_mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,6 to A15,6 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row14): FMA 
operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,7 to A15,7 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const 
*)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = 
_mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #endif //end loop of cols } i2 += cs_b_offset[6]; i += cs_l_offset[6]; } //trsm solve k = 0; //for (i2 = 0; i2 < numCols_b; i2 += 8) { i2 = i1 + r; /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A #if !GEMM_ACCUM_A //Read 8 cols of B columns of Block-to-be-solved mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i2); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i2)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i2)); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i2)); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i2)); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i2)); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i2)); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i2)); #endif //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; #if GEMM_ACCUM_A //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); #else mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], mat_b_rearr[0]); mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); #endif #if GEMM_ACCUM_A mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d 
= c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], mat_b_rearr[1]); mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]); mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], mat_b_rearr[3]); mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]); mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]); mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]); mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], mat_b_rearr[7]); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[1] = 
_mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A32 to A72 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = 
_mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A43 to A73 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A54 to A74 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = 
_mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A65 to A75 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A76 to register mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup + r, mat_b_rearr[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)+r), mat_b_rearr[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + r), mat_b_rearr[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + r), mat_b_rearr[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + r), mat_b_rearr[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + r), mat_b_rearr[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + r), 
mat_b_rearr[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + r), mat_b_rearr[7]); //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); k++; } } } //numRows of A ///////////////////loop ends ///////////////////// } static void trsm_XAtB_block_allSmallSizedMatrices_alpha(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha) { float ones = 1.0; int i, i1, i2, i3, i4, j, k, l, r; int cs_b_offset[7]; int cs_l_offset[7]; float *ptr_b_dup, *ptr_l_dup; //57 number of ymm(256 bits) registers used __m256 mat_b_col[8]; __m256 mat_b_rearr[8]; __m256 mat_a_blk_elems[8]; __m256 mat_a_diag_inv[8]; __m256 reciprocal_diags[2]; __m256 alphaReg; reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones)); alphaReg = _mm256_broadcast_ss((float const *)&alpha); // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // //L matrix offsets cs_l_offset[0] = (cs_l << 1); cs_l_offset[1] = cs_l + cs_l_offset[0]; cs_l_offset[2] = (cs_l << 2); cs_l_offset[3] = cs_l + cs_l_offset[2]; cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; cs_l_offset[5] = cs_l + cs_l_offset[4]; cs_l_offset[6] = (cs_l_offset[5] + cs_l); //read diag elems of L 16x16 block mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l); mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)ptr_l + cs_l); mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]); mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]); mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]); mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]); mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]); mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]); cs_b_offset[0] = (cs_b << 1); cs_b_offset[1] = cs_b + cs_b_offset[0]; cs_b_offset[2] = (cs_b << 2); cs_b_offset[3] = cs_b + cs_b_offset[2]; cs_b_offset[4] = 
cs_b_offset[0] + cs_b_offset[2]; cs_b_offset[5] = cs_b + cs_b_offset[4]; cs_b_offset[6] = (cs_b_offset[5] + cs_b); reciprocal_diags[1] = reciprocal_diags[0]; //pack first 8 diags together mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1 mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5 mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 //reciprocal of diagnal elements 0,1,2,3,4,5,6,7 reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); #if 0 //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //Broadcast A21 to A71 to registers mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + 
cs_l + 7)); //Broadcast A32 to A72 to registers mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); //Broadcast A43 to A73 to registers mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); //Broadcast A54 to A74 to registers mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); //Broadcast A65 to A75 to registers mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); //Broadcast A76 to register mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); #endif //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]); //extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //mat_a_diag_inv[1] = 
_mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); //extract diag a22 from a mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); //extract diag a33 from a mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); //extract diag a66 from a mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); /***************** first set of 8 rows of B processing starts *****************/ ptr_b_dup = ptr_b; i = 0; for (j = 0; j < numCols_b; j += 8) { /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A //read 8x8 block of B into registers mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_col[1] = _mm256_loadu_ps((float 
const *)(ptr_b + cs_b + i)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg); mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg); mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], alphaReg); mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg); mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg); mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], alphaReg); mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg); mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg); //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], mat_a_diag_inv[0]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], 
mat_b_col[0], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], mat_a_diag_inv[1]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], mat_a_diag_inv[2]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] 
+ 5)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], mat_a_diag_inv[3]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], mat_a_diag_inv[4]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const 
*)(ptr_l + cs_l_offset[2] + 6)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], mat_a_diag_inv[5]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], mat_a_diag_inv[6]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], mat_a_diag_inv[7]); //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]); 
_mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]); //i += cs_b_offset[6]; //ptr_b_dup += cs_b_offset[6]; i += 8; ptr_b_dup += 8; } //c = 0; /***************** first set of 8 cols of B processing done *****************/ ptr_b_dup = ptr_b; i3 = 0; i1 = 0; //Start loop for cols of B to be processed in size of blk_width for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row { ptr_l += 8; //ptr_b += j; //ptr_b_dup += 8; ptr_b_dup += cs_b_offset[6]; i1 += cs_b_offset[6]; //Read next 8x8 block of A to get diag elements i3 += cs_l_offset[6]; mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l + i3); mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l); mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]); mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]); mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]); mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]); mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[4]); mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]); //pack 8 diags of A together reciprocal_diags[0] = reciprocal_diags[1]; mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1 mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5 mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], 
mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 //reciprocal of diagnal elements of A :- 0,1,2,3,4,5,6,7 reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //mat_a_diag_inv2[0] = _mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]); //extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); //extract diag a22 from a mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); //extract diag a33 from a mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); //extract diag a66 from a mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], 
mat_a_diag_inv[6], 0x11); //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); for (r = 0; r < numCols_b; r += GEMM_BLK_V1) { #if GEMM_ACCUM_A i = i1 + r; //Read 8 cols of B columns of Block-to-be-solved mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); #endif i = 0; i2 = 0; for (l = 0; l < j; l += 8) // move across m { //for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) { /////////////////// Partial Lower 8x8 block trsm of B ptr_l_dup = ptr_l; i4 = i2 + r; //Read current 8 cols of B columns from specified 8x8 current-block of B mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + 
cs_b)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); //Broadcast A8,0 to A15,0 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); i4 = k >> 3; ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = 
_mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]); mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], mat_b_col[0]); mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]); mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], mat_b_col[0]); mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]); mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]); mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], mat_b_col[0]); mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]); #endif //Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) 
mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,2 to A15,2 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = 
_mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,3 to A15,3 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = 
_mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,4 to A15,4 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = 
_mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = 
_mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,5 to A15,5 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = 
_mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,6 to A15,6 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = 
_mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,7 to A15,7 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = 
_mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #endif //end loop of cols } i2 += cs_b_offset[6]; i += cs_l_offset[6]; } //trsm solve k = 0; //for (i2 = 0; i2 < numCols_b; i2 += 8) { i2 = i1 + r; /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A #if !GEMM_ACCUM_A //Read 8 cols of B columns of Block-to-be-solved mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i2); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i2)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i2)); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i2)); 
mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i2)); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i2)); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i2)); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i2)); mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg); mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg); mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], alphaReg); mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg); mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg); mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], alphaReg); mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg); mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg); #endif //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; #if GEMM_ACCUM_A //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); #else mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], mat_b_rearr[0]); mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); #endif #if GEMM_ACCUM_A mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = 
_mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], mat_b_rearr[1]); mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]); mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], mat_b_rearr[3]); mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]); mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]); mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]); mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], mat_b_rearr[7]); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[4] = 
_mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A32 to A72 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], 
mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A43 to A73 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A54 to A74 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A65 to A75 to registers mat_a_blk_elems[0] 
= _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A76 to register mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup + r, mat_b_rearr[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)+r), mat_b_rearr[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + r), mat_b_rearr[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + r), mat_b_rearr[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + r), mat_b_rearr[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + r), mat_b_rearr[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + r), mat_b_rearr[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + r), mat_b_rearr[7]); //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); k++; } } } //numRows of A ///////////////////loop ends ///////////////////// } static 
void trsm_XAtB_block_allSmallSizedMatrices_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) { //float ones = 1.0; int i, i1, i2, i3, i4, j, k, l, r; int cs_b_offset[7]; int cs_l_offset[7]; float *ptr_b_dup, *ptr_l_dup; //57 number of ymm(256 bits) registers used __m256 mat_b_col[8]; __m256 mat_b_rearr[8]; __m256 mat_a_blk_elems[8]; //__m256 mat_a_diag_inv[8]; //__m256 reciprocal_diags[2]; // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // //L matrix offsets cs_l_offset[0] = (cs_l << 1); cs_l_offset[1] = cs_l + cs_l_offset[0]; cs_l_offset[2] = (cs_l << 2); cs_l_offset[3] = cs_l + cs_l_offset[2]; cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; cs_l_offset[5] = cs_l + cs_l_offset[4]; cs_l_offset[6] = (cs_l_offset[5] + cs_l); cs_b_offset[0] = (cs_b << 1); cs_b_offset[1] = cs_b + cs_b_offset[0]; cs_b_offset[2] = (cs_b << 2); cs_b_offset[3] = cs_b + cs_b_offset[2]; cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; cs_b_offset[5] = cs_b + cs_b_offset[4]; cs_b_offset[6] = (cs_b_offset[5] + cs_b); /***************** first set of 8 rows of B processing starts *****************/ ptr_b_dup = ptr_b; i = 0; for (j = 0; j < numCols_b; j += 8) { /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A //read 8x8 block of B into registers mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); //(Row0) mat_a_blk_elems[0] = 
_mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = 
_mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b) 
mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b) //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]); _mm256_storeu_ps((float *)(ptr_b_dup + 
cs_b_offset[3]), mat_b_col[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]); //i += cs_b_offset[6]; //ptr_b_dup += cs_b_offset[6]; i += 8; ptr_b_dup += 8; } //c = 0; /***************** first set of 8 cols of B processing done *****************/ ptr_b_dup = ptr_b; i3 = 0; i1 = 0; //Start loop for cols of B to be processed in size of blk_width for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row { ptr_l += 8; //ptr_b += j; //ptr_b_dup += 8; ptr_b_dup += cs_b_offset[6]; i1 += cs_b_offset[6]; i3 += cs_l_offset[6]; i = 0; i2 = 0; for (r = 0; r < numCols_b; r += GEMM_BLK_V1) { #if GEMM_ACCUM_A i = i1 + r; //Read 8 cols of B columns of Block-to-be-solved mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); #endif i = 0; i2 = 0; for (l = 0; l < j; l += 8) // move across m { //for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) { /////////////////// Partial Lower 8x8 block trsm of B ptr_l_dup = ptr_l; i4 = i2 + r; //Read current 8 cols of B columns from specified 8x8 current-block of B mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); mat_b_col[4] = _mm256_loadu_ps((float 
const *)(ptr_b + i4 + cs_b_offset[2])); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); //Broadcast A8,0 to A15,0 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); i4 = k >> 3; ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]); mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], 
mat_b_col[0]); mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]); mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], mat_b_col[0]); mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]); mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]); mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], mat_b_col[0]); mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]); #endif //Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #else 
mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,2 to A15,2 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = 
_mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,3 to A15,3 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = 
_mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,4 to A15,4 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const 
*)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,5 
to A15,5 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = 
_mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,6 to A15,6 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = 
_mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,7 to A15,7 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = 
_mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #endif //end loop of cols } i2 += cs_b_offset[6]; i += cs_l_offset[6]; } //trsm solve k = 0; //for (i2 = 0; i2 < numCols_b; i2 += 8) { i2 = i1 + r; /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A #if !GEMM_ACCUM_A //Read 8 cols of B columns of Block-to-be-solved mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i2); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i2)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i2)); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i2)); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i2)); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i2)); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b 
+ cs_b_offset[4] + i2)); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i2)); #endif //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; #if GEMM_ACCUM_A //(Row0): already done #else mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], mat_b_rearr[0]); #endif #if GEMM_ACCUM_A mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], mat_b_rearr[1]); mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]); mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], mat_b_rearr[3]); mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]); mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]); mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]); mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], mat_b_rearr[7]); //(Row1): FMA operations of b1 with 
elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A32 to A72 to registers 
mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A43 to A73 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A54 to A74 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); 
mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A65 to A75 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A76 to register mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup + r, mat_b_rearr[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)+r), mat_b_rearr[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + r), mat_b_rearr[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + r), mat_b_rearr[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + r), mat_b_rearr[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + r), mat_b_rearr[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + r), mat_b_rearr[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + r), mat_b_rearr[7]); //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); k++; } } } 
//numRows of A ///////////////////loop ends ///////////////////// } static void trsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha) { //float ones = 1.0; int i, i1, i2, i3, i4, j, k, l, r; int cs_b_offset[7]; int cs_l_offset[7]; float *ptr_b_dup, *ptr_l_dup; //57 number of ymm(256 bits) registers used __m256 mat_b_col[8]; __m256 mat_b_rearr[8]; __m256 mat_a_blk_elems[8]; //__m256 mat_a_diag_inv[8]; //__m256 reciprocal_diags[2]; __m256 alphaReg; alphaReg = _mm256_broadcast_ss((float const *)&alpha); // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // //L matrix offsets cs_l_offset[0] = (cs_l << 1); cs_l_offset[1] = cs_l + cs_l_offset[0]; cs_l_offset[2] = (cs_l << 2); cs_l_offset[3] = cs_l + cs_l_offset[2]; cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; cs_l_offset[5] = cs_l + cs_l_offset[4]; cs_l_offset[6] = (cs_l_offset[5] + cs_l); cs_b_offset[0] = (cs_b << 1); cs_b_offset[1] = cs_b + cs_b_offset[0]; cs_b_offset[2] = (cs_b << 2); cs_b_offset[3] = cs_b + cs_b_offset[2]; cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; cs_b_offset[5] = cs_b + cs_b_offset[4]; cs_b_offset[6] = (cs_b_offset[5] + cs_b); #if 0 //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //Broadcast A21 to A71 to registers mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); 
mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); //Broadcast A32 to A72 to registers mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); //Broadcast A43 to A73 to registers mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); //Broadcast A54 to A74 to registers mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); //Broadcast A65 to A75 to registers mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); //Broadcast A76 to register mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); #endif /***************** first set of 8 rows of B processing starts *****************/ ptr_b_dup = ptr_b; i = 0; for (j = 0; j < numCols_b; j += 8) { /////////////////// Complete Upper 8x8 block 
trsm of B :- upper 8x8 block of B with upper 8x8 block of A //read 8x8 block of B into registers mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg); mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg); mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], alphaReg); mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg); mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg); mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], alphaReg); mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg); mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg); //(Row0) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = 
_mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill 
(7, 0) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 
cs_l_offset[3] + 6)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b) //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]); //i += cs_b_offset[6]; //ptr_b_dup += cs_b_offset[6]; i += 8; ptr_b_dup += 8; } //c = 0; /***************** first set of 8 cols of B processing done *****************/ ptr_b_dup = ptr_b; i3 = 0; i1 = 0; //Start loop for cols of B to be processed in size of blk_width for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row { ptr_l += 8; //ptr_b += j; //ptr_b_dup += 8; ptr_b_dup += cs_b_offset[6]; i1 += cs_b_offset[6]; i3 += cs_l_offset[6]; i = 0; i2 = 0; for (r = 0; r < numCols_b; r += GEMM_BLK_V1) { #if GEMM_ACCUM_A i = i1 + r; //Read 8 cols of B columns of Block-to-be-solved mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); 
mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); #endif i = 0; i2 = 0; for (l = 0; l < j; l += 8) // move across m { //for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) { /////////////////// Partial Lower 8x8 block trsm of B ptr_l_dup = ptr_l; i4 = i2 + r; //Read current 8 cols of B columns from specified 8x8 current-block of B mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); //Broadcast A8,0 to A15,0 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = 
_mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); i4 = k >> 3; ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]); mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], mat_b_col[0]); mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]); mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], mat_b_col[0]); mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]); mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]); mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], mat_b_col[0]); mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]); #endif 
//Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = 
_mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,2 to A15,2 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = 
_mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,3 to A15,3 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = 
_mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,4 to A15,4 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = 
_mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,5 to A15,5 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const 
*)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,6 
to A15,6 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = 
_mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,7 to A15,7 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + i + 7)); ptr_l_dup += cs_l; #if GEMM_ACCUM_A //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = 
_mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #endif //end loop of cols } i2 += cs_b_offset[6]; i += cs_l_offset[6]; } //trsm solve k = 0; //for (i2 = 0; i2 < numCols_b; i2 += 8) { i2 = i1 + r; /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A #if !GEMM_ACCUM_A //Read 8 cols of B columns of Block-to-be-solved mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i2); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i2)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i2)); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i2)); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i2)); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i2)); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i2)); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i2)); mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg); mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg); mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], alphaReg); mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg); mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg); mat_b_col[5] = 
_mm256_mul_ps(mat_b_col[5], alphaReg); mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg); mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg); #endif //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; #if GEMM_ACCUM_A //(Row0): already done #else mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], mat_b_rearr[0]); #endif #if GEMM_ACCUM_A mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], mat_b_rearr[1]); mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]); mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], mat_b_rearr[3]); mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]); mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]); mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]); mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], 
mat_b_rearr[7]); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], mat_b_rearr[7]);//d = c - 
(a*b) //Broadcast A32 to A72 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A43 to A73 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A54 to A74 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[1] = 
_mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A65 to A75 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A76 to register mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup + r, mat_b_rearr[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)+r), mat_b_rearr[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + r), mat_b_rearr[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + r), mat_b_rearr[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + r), mat_b_rearr[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + r), mat_b_rearr[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + r), mat_b_rearr[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + r), mat_b_rearr[7]); //printf("writing B => 
m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); k++; } } } //numRows of A ///////////////////loop ends ///////////////////// } #else //rel 1.0 intrisic kernels (NOT OPT_CACHE_BLOCKING_L1) static void trsm_XAtB_block_allSmallSizedMatrices(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) { float ones = 1.0; int i, i1, i2, i3, i4, j, k, l; int cs_b_offset[7]; int cs_l_offset[7]; float *ptr_b_dup; //57 number of ymm(256 bits) registers used __m256 mat_b_col[8]; __m256 mat_b_rearr[16][8]; __m256 mat_a_cols_rearr[8]; __m256 mat_a_blk_elems[64]; __m256 mat_a_diag_inv[8]; __m256 reciprocal_diags[2]; reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones)); // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // //L matrix offsets cs_l_offset[0] = (cs_l << 1); cs_l_offset[1] = cs_l + cs_l_offset[0]; cs_l_offset[2] = (cs_l << 2); cs_l_offset[3] = cs_l + cs_l_offset[2]; cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; cs_l_offset[5] = cs_l + cs_l_offset[4]; cs_l_offset[6] = (cs_l_offset[5] + cs_l); //read diag elems of L 16x16 block mat_a_cols_rearr[0] = _mm256_loadu_ps((float const *)ptr_l); mat_a_cols_rearr[1] = _mm256_loadu_ps((float const *)ptr_l + cs_l); mat_a_cols_rearr[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]); mat_a_cols_rearr[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]); mat_a_cols_rearr[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]); mat_a_cols_rearr[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]); mat_a_cols_rearr[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]); mat_a_cols_rearr[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]); cs_b_offset[0] = (cs_b << 1); cs_b_offset[1] = cs_b + cs_b_offset[0]; cs_b_offset[2] = (cs_b << 2); cs_b_offset[3] = cs_b + cs_b_offset[2]; cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; cs_b_offset[5] = cs_b + cs_b_offset[4]; cs_b_offset[6] = (cs_b_offset[5] + cs_b); 
reciprocal_diags[1] = reciprocal_diags[0]; //pack first 8 diags together mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0xAA);//diag 0,1 mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0xAA);//diag 2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[4], mat_a_cols_rearr[5], 0xAA);//diag 4,5 mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_cols_rearr[6], mat_a_cols_rearr[7], 0xAA);//diag 6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 //reciprocal of diagnal elements 0,1,2,3,4,5,6,7 reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //Broadcast A21 to A71 to registers mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); //Broadcast A32 to A72 to registers mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l 
+ cs_l_offset[0] + 3)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); //Broadcast A43 to A73 to registers mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); //Broadcast A54 to A74 to registers mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); //Broadcast A65 to A75 to registers mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); //Broadcast A76 to register mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]); //extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); //extract diag a22 from a mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 
0xAA); mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); //extract diag a33 from a mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); //extract diag a66 from a mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); /***************** first set of 8 rows of B processing starts *****************/ ptr_b_dup = ptr_b; i = 0; for (j = 0; j < numCols_b; j += 8) { /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A //read 8x8 block of B into registers mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3][0] = 
_mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_col[0] = _mm256_mul_ps(mat_b_rearr[0][0], mat_a_diag_inv[0]); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[1][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b) mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b) mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b) mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b) //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_col[1] = _mm256_mul_ps(mat_b_rearr[1][0], mat_a_diag_inv[1]); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b) mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b) mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], 
mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b) //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_col[2] = _mm256_mul_ps(mat_b_rearr[2][0], mat_a_diag_inv[2]); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b) mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b) //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_col[3] = _mm256_mul_ps(mat_b_rearr[3][0], mat_a_diag_inv[3]); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b) //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B mat_b_col[4] = _mm256_mul_ps(mat_b_rearr[4][0], mat_a_diag_inv[4]); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], 
mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b) //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_col[5] = _mm256_mul_ps(mat_b_rearr[5][0], mat_a_diag_inv[5]); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b) //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_col[6] = _mm256_mul_ps(mat_b_rearr[6][0], mat_a_diag_inv[6]); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_col[7] = _mm256_mul_ps(mat_b_rearr[7][0], mat_a_diag_inv[7]); //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]); //i += cs_b_offset[6]; //ptr_b_dup += cs_b_offset[6]; i += 8; ptr_b_dup += 8; } //c = 0; /***************** first set of 8 cols of B processing done *****************/ ptr_b_dup 
= ptr_b; i3 = 0; i1 = 0; //Start loop for cols of B to be processed in size of blk_width for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row { ptr_l += 8; //ptr_b += j; //ptr_b_dup += 8; ptr_b_dup += cs_b_offset[6]; i1 += cs_b_offset[6]; //Read next 8x8 block of A to get diag elements i3 += cs_l_offset[6]; mat_a_cols_rearr[0] = _mm256_loadu_ps((float const *)ptr_l + i3); mat_a_cols_rearr[1] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l); mat_a_cols_rearr[2] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]); mat_a_cols_rearr[3] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]); mat_a_cols_rearr[4] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]); mat_a_cols_rearr[5] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]); mat_a_cols_rearr[6] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[4]); mat_a_cols_rearr[7] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]); //pack 8 diags of A together reciprocal_diags[0] = reciprocal_diags[1]; mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0xAA);//diag 0,1 mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0xAA);//diag 2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[4], mat_a_cols_rearr[5], 0xAA);//diag 4,5 mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_cols_rearr[6], mat_a_cols_rearr[7], 0xAA);//diag 6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 //reciprocal of diagnal elements of A :- 0,1,2,3,4,5,6,7 reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); i = 0; i2 = 0; for (k = 0; k < numCols_b; k += 8) { i = i1 + k; //Read 8 cols of B columns of Block-to-be-solved mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b 
+ i); mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[i2][4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); i2++; } i = 0; i2 = 0; for (l = 0; l < j; l += 8) // move across m { //Broadcast A8,0 to A15,0 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); //Broadcast A21 to A71 to registers mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4)); mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7)); //Broadcast A8,2 to A15,2 to registers 
mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1)); mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 2)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5)); mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7)); //Broadcast A8,3 to A15,3 to registers mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i)); mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2)); mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3)); mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4)); mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 5)); mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 6)); mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7)); // _mm256_permute2f128_ps() //Broadcast A8,4 to A15,4 to registers mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i)); mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1)); mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2)); mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3)); mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + 
cs_l_offset[2] + i + 4)); mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5)); mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6)); mat_a_blk_elems[39] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 7)); //Broadcast A8,5 to A15,5 to registers mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i)); mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1)); mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2)); mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3)); mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4)); mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5)); mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6)); mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7)); //Broadcast A8,6 to A15,6 to registers mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i)); mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1)); mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2)); mat_a_blk_elems[51] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3)); mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4)); mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5)); mat_a_blk_elems[54] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6)); mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7)); //Broadcast A8,7 to A15,7 to registers mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i)); mat_a_blk_elems[57] = 
_mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1)); mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2)); mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3)); mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4)); mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5)); mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6)); mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7)); i += cs_l_offset[6]; for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) { /////////////////// Partial Lower 8x8 block trsm of B i4 = i2 + k; //Read current 8 cols of B columns from specified 8x8 current-block of B mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); i4 = k >> 3; //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b) 
mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d 
= c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], 
mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = 
_mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b) //end loop of cols } i2 += cs_b_offset[6]; } //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //mat_a_diag_inv2[0] = 
_mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]); //Broadcast A21 to A71 to registers mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); //Broadcast A32 to A72 to registers mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //extract diag a22 from a mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); //Broadcast A43 to A73 to registers mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //extract diag a33 from a mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], 
mat_a_diag_inv[3], 0x00); //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); //Broadcast A54 to A74 to registers mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); //Broadcast A65 to A75 to registers mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); //Broadcast A76 to register mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); //extract diag a66 from a mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); k = 0; for (i = 0; i < numCols_b; i+=8) { /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_rearr[k][0] = 
_mm256_mul_ps(mat_b_rearr[k][0], mat_a_diag_inv[0]); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b) mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b) mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_rearr[k][1] = _mm256_mul_ps(mat_b_rearr[k][1], mat_a_diag_inv[1]); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b) mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_rearr[k][2] = _mm256_mul_ps(mat_b_rearr[k][2], mat_a_diag_inv[2]); //(Row3): FMA 
operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_rearr[k][3] = _mm256_mul_ps(mat_b_rearr[k][3], mat_a_diag_inv[3]); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B mat_b_rearr[k][4] = _mm256_mul_ps(mat_b_rearr[k][4], mat_a_diag_inv[4]); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_rearr[k][5] = 
_mm256_mul_ps(mat_b_rearr[k][5], mat_a_diag_inv[5]); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_rearr[k][6] = _mm256_mul_ps(mat_b_rearr[k][6], mat_a_diag_inv[6]); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], mat_b_rearr[k][7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_rearr[k][7] = _mm256_mul_ps(mat_b_rearr[k][7], mat_a_diag_inv[7]); //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), mat_b_rearr[k][4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]); //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); k++; } } ///////////////////loop ends ///////////////////// } static void trsm_XAtB_block_allSmallSizedMatrices_alpha(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha) { float ones = 1.0; int i, i1, i2, i3, i4, j, k, l; int cs_b_offset[7]; int cs_l_offset[7]; float *ptr_b_dup; //57 number of 
ymm(256 bits) registers used __m256 mat_b_col[8]; __m256 mat_b_rearr[16][8]; __m256 mat_a_cols_rearr[8]; __m256 mat_a_blk_elems[64]; __m256 mat_a_diag_inv[8]; __m256 reciprocal_diags[2]; __m256 alphaReg; reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones)); alphaReg = _mm256_broadcast_ss((float const *)&alpha); // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // //L matrix offsets cs_l_offset[0] = (cs_l << 1); cs_l_offset[1] = cs_l + cs_l_offset[0]; cs_l_offset[2] = (cs_l << 2); cs_l_offset[3] = cs_l + cs_l_offset[2]; cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; cs_l_offset[5] = cs_l + cs_l_offset[4]; cs_l_offset[6] = (cs_l_offset[5] + cs_l); //read diag elems of L 16x16 block mat_a_cols_rearr[0] = _mm256_loadu_ps((float const *)ptr_l); mat_a_cols_rearr[1] = _mm256_loadu_ps((float const *)ptr_l + cs_l); mat_a_cols_rearr[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]); mat_a_cols_rearr[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]); mat_a_cols_rearr[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]); mat_a_cols_rearr[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]); mat_a_cols_rearr[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]); mat_a_cols_rearr[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]); cs_b_offset[0] = (cs_b << 1); cs_b_offset[1] = cs_b + cs_b_offset[0]; cs_b_offset[2] = (cs_b << 2); cs_b_offset[3] = cs_b + cs_b_offset[2]; cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; cs_b_offset[5] = cs_b + cs_b_offset[4]; cs_b_offset[6] = (cs_b_offset[5] + cs_b); reciprocal_diags[1] = reciprocal_diags[0]; //pack first 8 diags together mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0xAA);//diag 0,1 mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0xAA);//diag 2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[4], mat_a_cols_rearr[5], 0xAA);//diag 4,5 mat_a_diag_inv[3] = 
_mm256_blend_ps(mat_a_cols_rearr[6], mat_a_cols_rearr[7], 0xAA);//diag 6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 //reciprocal of diagnal elements 0,1,2,3,4,5,6,7 reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //Broadcast A21 to A71 to registers mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); //Broadcast A32 to A72 to registers mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); 
//Broadcast A43 to A73 to registers mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); //Broadcast A54 to A74 to registers mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); //Broadcast A65 to A75 to registers mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); //Broadcast A76 to register mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]); //extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); //extract diag a22 from a mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); //extract diag a33 from a mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //mat_a_diag_inv[3] = 
_mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); //extract diag a66 from a mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); /***************** first set of 8 rows of B processing starts *****************/ ptr_b_dup = ptr_b; i = 0; for (j = 0; j < numCols_b; j += 8) { /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A //read 8x8 block of B into registers mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + 
cs_b_offset[5] + i)); mat_b_rearr[0][0] = _mm256_mul_ps(mat_b_rearr[0][0], alphaReg); mat_b_rearr[1][0] = _mm256_mul_ps(mat_b_rearr[1][0], alphaReg); mat_b_rearr[2][0] = _mm256_mul_ps(mat_b_rearr[2][0], alphaReg); mat_b_rearr[3][0] = _mm256_mul_ps(mat_b_rearr[3][0], alphaReg); mat_b_rearr[4][0] = _mm256_mul_ps(mat_b_rearr[4][0], alphaReg); mat_b_rearr[5][0] = _mm256_mul_ps(mat_b_rearr[5][0], alphaReg); mat_b_rearr[6][0] = _mm256_mul_ps(mat_b_rearr[6][0], alphaReg); mat_b_rearr[7][0] = _mm256_mul_ps(mat_b_rearr[7][0], alphaReg); //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_col[0] = _mm256_mul_ps(mat_b_rearr[0][0], mat_a_diag_inv[0]); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[1][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b) mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b) mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b) mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b) //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_col[1] = _mm256_mul_ps(mat_b_rearr[1][0], mat_a_diag_inv[1]); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b) mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b) mat_b_rearr[4][0] = 
_mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b) //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_col[2] = _mm256_mul_ps(mat_b_rearr[2][0], mat_a_diag_inv[2]); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b) mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b) //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_col[3] = _mm256_mul_ps(mat_b_rearr[3][0], mat_a_diag_inv[3]); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b) //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B mat_b_col[4] = _mm256_mul_ps(mat_b_rearr[4][0], mat_a_diag_inv[4]); 
//(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b) //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_col[5] = _mm256_mul_ps(mat_b_rearr[5][0], mat_a_diag_inv[5]); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b) //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_col[6] = _mm256_mul_ps(mat_b_rearr[6][0], mat_a_diag_inv[6]); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_col[7] = _mm256_mul_ps(mat_b_rearr[7][0], mat_a_diag_inv[7]); //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]); //i += cs_b_offset[6]; 
//ptr_b_dup += cs_b_offset[6]; i += 8; ptr_b_dup += 8; } //c = 0; /***************** first set of 8 cols of B processing done *****************/ ptr_b_dup = ptr_b; i3 = 0; i1 = 0; //Start loop for cols of B to be processed in size of blk_width for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row { ptr_l += 8; //ptr_b += j; //ptr_b_dup += 8; ptr_b_dup += cs_b_offset[6]; i1 += cs_b_offset[6]; //Read next 8x8 block of A to get diag elements i3 += cs_l_offset[6]; mat_a_cols_rearr[0] = _mm256_loadu_ps((float const *)ptr_l + i3); mat_a_cols_rearr[1] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l); mat_a_cols_rearr[2] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]); mat_a_cols_rearr[3] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]); mat_a_cols_rearr[4] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]); mat_a_cols_rearr[5] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]); mat_a_cols_rearr[6] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[4]); mat_a_cols_rearr[7] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]); //pack 8 diags of A together reciprocal_diags[0] = reciprocal_diags[1]; mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_cols_rearr[0], mat_a_cols_rearr[1], 0xAA);//diag 0,1 mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_cols_rearr[2], mat_a_cols_rearr[3], 0xAA);//diag 2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_cols_rearr[4], mat_a_cols_rearr[5], 0xAA);//diag 4,5 mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_cols_rearr[6], mat_a_cols_rearr[7], 0xAA);//diag 6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 //reciprocal of diagnal elements of A :- 0,1,2,3,4,5,6,7 reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); i = 0; i2 = 0; 
for (k = 0; k < numCols_b; k += 8) { i = i1 + k; //Read 8 cols of B columns of Block-to-be-solved mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[i2][4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); mat_b_rearr[i2][0] = _mm256_mul_ps(mat_b_rearr[i2][0], alphaReg); mat_b_rearr[i2][1] = _mm256_mul_ps(mat_b_rearr[i2][1], alphaReg); mat_b_rearr[i2][2] = _mm256_mul_ps(mat_b_rearr[i2][2], alphaReg); mat_b_rearr[i2][3] = _mm256_mul_ps(mat_b_rearr[i2][3], alphaReg); mat_b_rearr[i2][4] = _mm256_mul_ps(mat_b_rearr[i2][4], alphaReg); mat_b_rearr[i2][5] = _mm256_mul_ps(mat_b_rearr[i2][5], alphaReg); mat_b_rearr[i2][6] = _mm256_mul_ps(mat_b_rearr[i2][6], alphaReg); mat_b_rearr[i2][7] = _mm256_mul_ps(mat_b_rearr[i2][7], alphaReg); i2++; } i = 0; i2 = 0; for (l = 0; l < j; l += 8) // move across m { //Broadcast A8,0 to A15,0 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); //Broadcast A21 to A71 to registers 
mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4)); mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7)); //Broadcast A8,2 to A15,2 to registers mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1)); mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 2)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5)); mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7)); //Broadcast A8,3 to A15,3 to registers mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i)); mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2)); mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3)); mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4)); mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 
i + 5)); mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 6)); mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7)); // _mm256_permute2f128_ps() //Broadcast A8,4 to A15,4 to registers mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i)); mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1)); mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2)); mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3)); mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 4)); mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5)); mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6)); mat_a_blk_elems[39] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 7)); //Broadcast A8,5 to A15,5 to registers mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i)); mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1)); mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2)); mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3)); mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4)); mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5)); mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6)); mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7)); //Broadcast A8,6 to A15,6 to registers mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i)); mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1)); 
mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2)); mat_a_blk_elems[51] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3)); mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4)); mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5)); mat_a_blk_elems[54] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6)); mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7)); //Broadcast A8,7 to A15,7 to registers mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i)); mat_a_blk_elems[57] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1)); mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2)); mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3)); mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4)); mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5)); mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6)); mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7)); i += cs_l_offset[6]; for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) { /////////////////// Partial Lower 8x8 block trsm of B i4 = i2 + k; //Read current 8 cols of B columns from specified 8x8 current-block of B mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + 
cs_b_offset[3])); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); i4 = k >> 3; //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = 
_mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b) 
//(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) 
mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b) //end loop of cols } i2 += cs_b_offset[6]; } //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); 
mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //mat_a_diag_inv2[0] = _mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]); //Broadcast A21 to A71 to registers mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); //Broadcast A32 to A72 to registers mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //extract diag a22 from a mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); 
mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); //Broadcast A43 to A73 to registers mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //extract diag a33 from a mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); //Broadcast A54 to A74 to registers mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); //Broadcast A65 to A75 to registers mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); //Broadcast A76 to register mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); //extract diag a66 from a mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[6] = 
_mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); k = 0; for (i = 0; i < numCols_b; i+=8) { /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_rearr[k][0] = _mm256_mul_ps(mat_b_rearr[k][0], mat_a_diag_inv[0]); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b) mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b) mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_rearr[k][1] = _mm256_mul_ps(mat_b_rearr[k][1], mat_a_diag_inv[1]); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b) mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], 
mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_rearr[k][2] = _mm256_mul_ps(mat_b_rearr[k][2], mat_a_diag_inv[2]); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_rearr[k][3] = _mm256_mul_ps(mat_b_rearr[k][3], mat_a_diag_inv[3]); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b) //Perform 
mul operation of reciprocal of L(4, 4) element with 4rth row elements of B mat_b_rearr[k][4] = _mm256_mul_ps(mat_b_rearr[k][4], mat_a_diag_inv[4]); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_rearr[k][5] = _mm256_mul_ps(mat_b_rearr[k][5], mat_a_diag_inv[5]); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_rearr[k][6] = _mm256_mul_ps(mat_b_rearr[k][6], mat_a_diag_inv[6]); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], mat_b_rearr[k][7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_rearr[k][7] = _mm256_mul_ps(mat_b_rearr[k][7], mat_a_diag_inv[7]); //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), 
mat_b_rearr[k][4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]); k++; } } ///////////////////loop ends ///////////////////// } static void trsm_XAtB_block_allSmallSizedMatrices_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) { //float ones = 1.0; int i, i1, i2, i3, i4, j, k, l; int cs_b_offset[7]; int cs_l_offset[7]; float *ptr_b_dup; //57 number of ymm(256 bits) registers used __m256 mat_b_col[8]; __m256 mat_b_rearr[16][8]; //__m256 mat_a_cols_rearr[8]; __m256 mat_a_blk_elems[64]; //__m256 mat_a_diag_inv[8]; //__m256 reciprocal_diags[2]; // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // //L matrix offsets cs_l_offset[0] = (cs_l << 1); cs_l_offset[1] = cs_l + cs_l_offset[0]; cs_l_offset[2] = (cs_l << 2); cs_l_offset[3] = cs_l + cs_l_offset[2]; cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; cs_l_offset[5] = cs_l + cs_l_offset[4]; cs_l_offset[6] = (cs_l_offset[5] + cs_l); cs_b_offset[0] = (cs_b << 1); cs_b_offset[1] = cs_b + cs_b_offset[0]; cs_b_offset[2] = (cs_b << 2); cs_b_offset[3] = cs_b + cs_b_offset[2]; cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; cs_b_offset[5] = cs_b + cs_b_offset[4]; cs_b_offset[6] = (cs_b_offset[5] + cs_b); //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //Broadcast A21 to A71 to 
registers mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); //Broadcast A32 to A72 to registers mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); //Broadcast A43 to A73 to registers mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); //Broadcast A54 to A74 to registers mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); //Broadcast A65 to A75 to registers mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); //Broadcast A76 to register mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); /***************** first set of 
8 rows of B processing starts *****************/ ptr_b_dup = ptr_b; i = 0; for (j = 0; j < numCols_b; j += 8) { /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A //read 8x8 block of B into registers mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); //(Row0) mat_b_col[0] = mat_b_rearr[0][0]; //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b) mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b) mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b) mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b) //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b) mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], 
mat_b_rearr[3][0]);//d = c - (a*b) mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b) //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b) mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b) //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b) //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b) //(Row6): FMA 
operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b) //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b) //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]); //i += cs_b_offset[6]; //ptr_b_dup += cs_b_offset[6]; i += 8; ptr_b_dup += 8; } //c = 0; /***************** first set of 8 cols of B processing done *****************/ ptr_b_dup = ptr_b; i3 = 0; i1 = 0; //Start loop for cols of B to be processed in size of blk_width for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row { ptr_l += 8; //ptr_b += j; //ptr_b_dup += 8; ptr_b_dup += cs_b_offset[6]; i1 += cs_b_offset[6]; i3 += cs_l_offset[6]; i = 0; i2 = 0; for (k = 0; k < numCols_b; k += 8) { i = i1 + k; //Read 8 cols of B columns of Block-to-be-solved mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[i2][4] = _mm256_loadu_ps((float 
const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); i2++; } i = 0; i2 = 0; for (l = 0; l < j; l += 8) // move across m { //Broadcast A8,0 to A15,0 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); //Broadcast A21 to A71 to registers mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4)); mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7)); //Broadcast A8,2 to A15,2 to registers mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1)); mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 2)); 
mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5)); mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7)); //Broadcast A8,3 to A15,3 to registers mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i)); mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2)); mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3)); mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4)); mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 5)); mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 6)); mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7)); // _mm256_permute2f128_ps() //Broadcast A8,4 to A15,4 to registers mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i)); mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1)); mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2)); mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3)); mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 4)); mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5)); mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6)); mat_a_blk_elems[39] = _mm256_broadcast_ss((float const 
*)(ptr_l + cs_l_offset[2] + i + 7)); //Broadcast A8,5 to A15,5 to registers mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i)); mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1)); mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2)); mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3)); mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4)); mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5)); mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6)); mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7)); //Broadcast A8,6 to A15,6 to registers mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i)); mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1)); mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2)); mat_a_blk_elems[51] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3)); mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4)); mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5)); mat_a_blk_elems[54] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6)); mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7)); //Broadcast A8,7 to A15,7 to registers mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i)); mat_a_blk_elems[57] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1)); mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2)); mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3)); 
mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4)); mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5)); mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6)); mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7)); i += cs_l_offset[6]; for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) { /////////////////// Partial Lower 8x8 block trsm of B i4 = i2 + k; //Read current 8 cols of B columns from specified 8x8 current-block of B mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); i4 = k >> 3; //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = 
_mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b) 
mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], 
mat_b_rearr[i4][7]);//d = c - (a*b) //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row15): FMA operations of b1 with elements of 
indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b) //end loop of cols } i2 += cs_b_offset[6]; } //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Broadcast A21 to A71 to registers mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); 
i += cs_l; //Broadcast A32 to A72 to registers mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Broadcast A43 to A73 to registers mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Broadcast A54 to A74 to registers mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Broadcast A65 to A75 to registers mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Broadcast A76 to register mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); k = 0; for (i = 0; i < numCols_b; i+=8) { /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A //(Row0): already done //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b) mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b) mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], 
mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b) //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b) mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b) //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b) //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = 
_mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b) //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b) //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b) //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], mat_b_rearr[k][7]);//d = c - (a*b) //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), mat_b_rearr[k][4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]); //printf("writing B => m[%d], n[%d], [%f]\n", j, k, 
*(ptr_b_dup + k)); k++; } } ///////////////////loop ends ///////////////////// } static void trsm_XAtB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha) { //float ones = 1.0; int i, i1, i2, i3, i4, j, k, l; int cs_b_offset[7]; int cs_l_offset[7]; float *ptr_b_dup; //57 number of ymm(256 bits) registers used __m256 mat_b_col[8]; __m256 mat_b_rearr[16][8]; //__m256 mat_a_cols_rearr[8]; __m256 mat_a_blk_elems[64]; //__m256 mat_a_diag_inv[8]; //__m256 reciprocal_diags[2]; __m256 alphaReg; alphaReg = _mm256_broadcast_ss((float const *)&alpha); // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // //L matrix offsets cs_l_offset[0] = (cs_l << 1); cs_l_offset[1] = cs_l + cs_l_offset[0]; cs_l_offset[2] = (cs_l << 2); cs_l_offset[3] = cs_l + cs_l_offset[2]; cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; cs_l_offset[5] = cs_l + cs_l_offset[4]; cs_l_offset[6] = (cs_l_offset[5] + cs_l); cs_b_offset[0] = (cs_b << 1); cs_b_offset[1] = cs_b + cs_b_offset[0]; cs_b_offset[2] = (cs_b << 2); cs_b_offset[3] = cs_b + cs_b_offset[2]; cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; cs_b_offset[5] = cs_b + cs_b_offset[4]; cs_b_offset[6] = (cs_b_offset[5] + cs_b); //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //Broadcast A21 to A71 to registers mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + 
cs_l + 3)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); //Broadcast A32 to A72 to registers mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); //Broadcast A43 to A73 to registers mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); //Broadcast A54 to A74 to registers mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); //Broadcast A65 to A75 to registers mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); //Broadcast A76 to register mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); /***************** first set of 8 rows of B processing starts *****************/ ptr_b_dup = ptr_b; i = 0; for (j = 0; j < numCols_b; j += 8) { /////////////////// Complete Upper 8x8 
block trsm of B :- upper 8x8 block of B with upper 8x8 block of A //read 8x8 block of B into registers mat_b_rearr[0][0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[2][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[5][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7][0] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); mat_b_rearr[0][0] = _mm256_mul_ps(mat_b_rearr[0][0], alphaReg); mat_b_rearr[1][0] = _mm256_mul_ps(mat_b_rearr[1][0], alphaReg); mat_b_rearr[2][0] = _mm256_mul_ps(mat_b_rearr[2][0], alphaReg); mat_b_rearr[3][0] = _mm256_mul_ps(mat_b_rearr[3][0], alphaReg); mat_b_rearr[4][0] = _mm256_mul_ps(mat_b_rearr[4][0], alphaReg); mat_b_rearr[5][0] = _mm256_mul_ps(mat_b_rearr[5][0], alphaReg); mat_b_rearr[6][0] = _mm256_mul_ps(mat_b_rearr[6][0], alphaReg); mat_b_rearr[7][0] = _mm256_mul_ps(mat_b_rearr[7][0], alphaReg); //(Row0) mat_b_col[0] = mat_b_rearr[0][0]; //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[1][0]);//d = c - (a*b) mat_b_rearr[2][0] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[2][0]);//d = c - (a*b) mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[3][0]);//d = c - (a*b) mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[6][0]);//d = c - (a*b) 
mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[7][0]);//d = c - (a*b) //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[2][0]);//d = c - (a*b) mat_b_rearr[3][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[3][0]);//d = c - (a*b) mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[7][0]);//d = c - (a*b) //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[2], mat_b_rearr[3][0]);//d = c - (a*b) mat_b_rearr[4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[2], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[2], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[7][0]);//d = c - (a*b) //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[3], mat_b_rearr[4][0]);//d = c - (a*b) mat_b_rearr[5][0] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[3], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[3], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[3], mat_b_rearr[7][0]);//d = c - (a*b) //(Row5): FMA operations of b5 with elements of indices from 
(5, 0) uptill (7, 0) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[4], mat_b_rearr[5][0]);//d = c - (a*b) mat_b_rearr[6][0] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[4], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[4], mat_b_rearr[7][0]);//d = c - (a*b) //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[5], mat_b_rearr[6][0]);//d = c - (a*b) mat_b_rearr[7][0] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[5], mat_b_rearr[7][0]);//d = c - (a*b) //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[6], mat_b_rearr[7][0]);//d = c - (a*b) //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_b_col[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_col[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_col[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_col[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_col[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_col[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_col[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_col[7]); //i += cs_b_offset[6]; //ptr_b_dup += cs_b_offset[6]; i += 8; ptr_b_dup += 8; } //c = 0; /***************** first set of 8 cols of B processing done *****************/ ptr_b_dup = ptr_b; i3 = 0; i1 = 0; //Start loop for cols of B to be processed in size of blk_width for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row { ptr_l += 8; //ptr_b += j; //ptr_b_dup += 8; ptr_b_dup += cs_b_offset[6]; i1 += cs_b_offset[6]; i3 += cs_l_offset[6]; i = 0; i2 = 0; for (k = 0; k < numCols_b; k += 8) { i = i1 + k; //Read 8 cols of B columns of Block-to-be-solved 
mat_b_rearr[i2][0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[i2][1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[i2][2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[i2][3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[i2][4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[i2][5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[i2][6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[i2][7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); mat_b_rearr[i2][0] = _mm256_mul_ps(mat_b_rearr[i2][0], alphaReg); mat_b_rearr[i2][1] = _mm256_mul_ps(mat_b_rearr[i2][1], alphaReg); mat_b_rearr[i2][2] = _mm256_mul_ps(mat_b_rearr[i2][2], alphaReg); mat_b_rearr[i2][3] = _mm256_mul_ps(mat_b_rearr[i2][3], alphaReg); mat_b_rearr[i2][4] = _mm256_mul_ps(mat_b_rearr[i2][4], alphaReg); mat_b_rearr[i2][5] = _mm256_mul_ps(mat_b_rearr[i2][5], alphaReg); mat_b_rearr[i2][6] = _mm256_mul_ps(mat_b_rearr[i2][6], alphaReg); mat_b_rearr[i2][7] = _mm256_mul_ps(mat_b_rearr[i2][7], alphaReg); i2++; } i = 0; i2 = 0; for (l = 0; l < j; l += 8) // move across m { //Broadcast A8,0 to A15,0 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); //Broadcast A21 to A71 to registers mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i)); mat_a_blk_elems[9] = 
_mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 1)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 2)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 3)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 4)); mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 5)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 6)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + i + 7)); //Broadcast A8,2 to A15,2 to registers mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 1)); mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 2)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 3)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 4)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 5)); mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 6)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + i + 7)); //Broadcast A8,3 to A15,3 to registers mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i)); mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 1)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 2)); mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 3)); mat_a_blk_elems[28] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 4)); mat_a_blk_elems[29] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 5)); mat_a_blk_elems[30] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 
6)); mat_a_blk_elems[31] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + i + 7)); // _mm256_permute2f128_ps() //Broadcast A8,4 to A15,4 to registers mat_a_blk_elems[32] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i)); mat_a_blk_elems[33] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 1)); mat_a_blk_elems[34] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 2)); mat_a_blk_elems[35] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 3)); mat_a_blk_elems[36] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 4)); mat_a_blk_elems[37] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 5)); mat_a_blk_elems[38] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 6)); mat_a_blk_elems[39] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + i + 7)); //Broadcast A8,5 to A15,5 to registers mat_a_blk_elems[40] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i)); mat_a_blk_elems[41] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 1)); mat_a_blk_elems[42] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 2)); mat_a_blk_elems[43] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 3)); mat_a_blk_elems[44] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 4)); mat_a_blk_elems[45] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 5)); mat_a_blk_elems[46] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 6)); mat_a_blk_elems[47] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + i + 7)); //Broadcast A8,6 to A15,6 to registers mat_a_blk_elems[48] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i)); mat_a_blk_elems[49] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 1)); mat_a_blk_elems[50] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 2)); mat_a_blk_elems[51] = 
_mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 3)); mat_a_blk_elems[52] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 4)); mat_a_blk_elems[53] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 5)); mat_a_blk_elems[54] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 6)); mat_a_blk_elems[55] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + i + 7)); //Broadcast A8,7 to A15,7 to registers mat_a_blk_elems[56] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i)); mat_a_blk_elems[57] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 1)); mat_a_blk_elems[58] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 2)); mat_a_blk_elems[59] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 3)); mat_a_blk_elems[60] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 4)); mat_a_blk_elems[61] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 5)); mat_a_blk_elems[62] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 6)); mat_a_blk_elems[63] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5] + i + 7)); i += cs_l_offset[6]; for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) { /////////////////// Partial Lower 8x8 block trsm of B i4 = i2 + k; //Read current 8 cols of B columns from specified 8x8 current-block of B mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i4); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); mat_b_col[7] = 
_mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); i4 = k >> 3; //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_col[1], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_col[1], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_col[1], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_col[1], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_col[1], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_col[1], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_col[1], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_col[1], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row10): FMA operations 
of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_col[2], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_col[2], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_col[2], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_col[2], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_col[2], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_col[2], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_col[2], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_col[2], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_col[3], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_col[3], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_col[3], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_col[3], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[28], mat_b_col[3], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[29], mat_b_col[3], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[30], mat_b_col[3], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[31], mat_b_col[3], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[i4][0] = 
_mm256_fnmadd_ps(mat_a_blk_elems[32], mat_b_col[4], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[33], mat_b_col[4], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[34], mat_b_col[4], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[35], mat_b_col[4], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[36], mat_b_col[4], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[37], mat_b_col[4], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[38], mat_b_col[4], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[39], mat_b_col[4], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[40], mat_b_col[5], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[41], mat_b_col[5], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[42], mat_b_col[5], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[43], mat_b_col[5], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[44], mat_b_col[5], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[45], mat_b_col[5], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[46], mat_b_col[5], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[47], mat_b_col[5], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[48], mat_b_col[6], mat_b_rearr[i4][0]);//d = c - (a*b) 
mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[49], mat_b_col[6], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[50], mat_b_col[6], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[51], mat_b_col[6], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[52], mat_b_col[6], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[53], mat_b_col[6], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[54], mat_b_col[6], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[55], mat_b_col[6], mat_b_rearr[i4][7]);//d = c - (a*b) //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[i4][0] = _mm256_fnmadd_ps(mat_a_blk_elems[56], mat_b_col[7], mat_b_rearr[i4][0]);//d = c - (a*b) mat_b_rearr[i4][1] = _mm256_fnmadd_ps(mat_a_blk_elems[57], mat_b_col[7], mat_b_rearr[i4][1]);//d = c - (a*b) mat_b_rearr[i4][2] = _mm256_fnmadd_ps(mat_a_blk_elems[58], mat_b_col[7], mat_b_rearr[i4][2]);//d = c - (a*b) mat_b_rearr[i4][3] = _mm256_fnmadd_ps(mat_a_blk_elems[59], mat_b_col[7], mat_b_rearr[i4][3]);//d = c - (a*b) mat_b_rearr[i4][4] = _mm256_fnmadd_ps(mat_a_blk_elems[60], mat_b_col[7], mat_b_rearr[i4][4]);//d = c - (a*b) mat_b_rearr[i4][5] = _mm256_fnmadd_ps(mat_a_blk_elems[61], mat_b_col[7], mat_b_rearr[i4][5]);//d = c - (a*b) mat_b_rearr[i4][6] = _mm256_fnmadd_ps(mat_a_blk_elems[62], mat_b_col[7], mat_b_rearr[i4][6]);//d = c - (a*b) mat_b_rearr[i4][7] = _mm256_fnmadd_ps(mat_a_blk_elems[63], mat_b_col[7], mat_b_rearr[i4][7]);//d = c - (a*b) //end loop of cols } i2 += cs_b_offset[6]; } //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + i + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float 
const *)(ptr_l + i + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Broadcast A21 to A71 to registers mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + i + 2)); mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Broadcast A32 to A72 to registers mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + i + 3)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Broadcast A43 to A73 to registers mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + i + 4)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Broadcast A54 to A74 to registers mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + i + 5)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Broadcast A65 to A75 to registers mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + i + 6)); mat_a_blk_elems[26] 
= _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); i += cs_l; //Broadcast A76 to register mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + i + 7)); k = 0; for (i = 0; i < numCols_b; i+=8) { /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A //(Row0): already done //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[k][1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[k][0], mat_b_rearr[k][1]);//d = c - (a*b) mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[k][0], mat_b_rearr[k][2]);//d = c - (a*b) mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[k][0], mat_b_rearr[k][3]);//d = c - (a*b) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[k][0], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[k][0], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[k][0], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[k][0], mat_b_rearr[k][7]);//d = c - (a*b) //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[k][2] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_rearr[k][1], mat_b_rearr[k][2]);//d = c - (a*b) mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[8], mat_b_rearr[k][1], mat_b_rearr[k][3]);//d = c - (a*b) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[9], mat_b_rearr[k][1], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[10], mat_b_rearr[k][1], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[11], mat_b_rearr[k][1], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[12], mat_b_rearr[k][1], mat_b_rearr[k][7]);//d = c - (a*b) //(Row3): FMA operations of b3 with elements 
of indices from (3, 0) uptill (7, 0) mat_b_rearr[k][3] = _mm256_fnmadd_ps(mat_a_blk_elems[13], mat_b_rearr[k][2], mat_b_rearr[k][3]);//d = c - (a*b) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[14], mat_b_rearr[k][2], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[15], mat_b_rearr[k][2], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[16], mat_b_rearr[k][2], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[17], mat_b_rearr[k][2], mat_b_rearr[k][7]);//d = c - (a*b) //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[k][4] = _mm256_fnmadd_ps(mat_a_blk_elems[18], mat_b_rearr[k][3], mat_b_rearr[k][4]);//d = c - (a*b) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[19], mat_b_rearr[k][3], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[20], mat_b_rearr[k][3], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[21], mat_b_rearr[k][3], mat_b_rearr[k][7]);//d = c - (a*b) //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[k][5] = _mm256_fnmadd_ps(mat_a_blk_elems[22], mat_b_rearr[k][4], mat_b_rearr[k][5]);//d = c - (a*b) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[23], mat_b_rearr[k][4], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[24], mat_b_rearr[k][4], mat_b_rearr[k][7]);//d = c - (a*b) //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[k][6] = _mm256_fnmadd_ps(mat_a_blk_elems[25], mat_b_rearr[k][5], mat_b_rearr[k][6]);//d = c - (a*b) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[26], mat_b_rearr[k][5], mat_b_rearr[k][7]);//d = c - (a*b) //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[k][7] = _mm256_fnmadd_ps(mat_a_blk_elems[27], mat_b_rearr[k][6], 
mat_b_rearr[k][7]);//d = c - (a*b) //////////////////////////////////////////////////////////////////////////////// //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup + i, mat_b_rearr[k][0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b) + i), mat_b_rearr[k][1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i), mat_b_rearr[k][2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i), mat_b_rearr[k][3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i), mat_b_rearr[k][4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i), mat_b_rearr[k][5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i), mat_b_rearr[k][6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i), mat_b_rearr[k][7]); //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); k++; } } ///////////////////loop ends ///////////////////// } #endif //OPT_CACHE_BLOCKING_L1 //////////////////////////// AutX=B /////////////////////// static void trsm_AutXB_block_allSmallSizedMatrices(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) { float ones = 1.0; int i, i1, i2, i3, i4, j, k, l, r; int cs_b_offset[7]; int cs_l_offset[7]; float *ptr_b_dup, *ptr_l_dup; //57 number of ymm(256 bits) registers used __m256 mat_b_col[8]; __m256 mat_b_rearr[8]; __m256 mat_a_blk_elems[8]; __m256 mat_a_diag_inv[8]; __m256 reciprocal_diags[2]; reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones)); // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // //L matrix offsets cs_l_offset[0] = (cs_l << 1); cs_l_offset[1] = cs_l + cs_l_offset[0]; cs_l_offset[2] = (cs_l << 2); cs_l_offset[3] = cs_l + cs_l_offset[2]; cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; cs_l_offset[5] = cs_l + cs_l_offset[4]; cs_l_offset[6] = (cs_l_offset[5] + cs_l); //read diag elems of L 16x16 block mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l); mat_a_blk_elems[1] = 
_mm256_loadu_ps((float const *)ptr_l + cs_l); mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]); mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]); mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]); mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]); mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]); mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]); cs_b_offset[0] = (cs_b << 1); cs_b_offset[1] = cs_b + cs_b_offset[0]; cs_b_offset[2] = (cs_b << 2); cs_b_offset[3] = cs_b + cs_b_offset[2]; cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; cs_b_offset[5] = cs_b + cs_b_offset[4]; cs_b_offset[6] = (cs_b_offset[5] + cs_b); reciprocal_diags[1] = reciprocal_diags[0]; //pack first 8 diags together mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1 mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5 mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 //reciprocal of diagnal elements 0,1,2,3,4,5,6,7 reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); #if 0 //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_blk_elems[4] = 
_mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //Broadcast A21 to A71 to registers mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); //Broadcast A32 to A72 to registers mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); //Broadcast A43 to A73 to registers mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); //Broadcast A54 to A74 to registers mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); //Broadcast A65 to A75 to registers mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); mat_a_blk_elems[26] = 
_mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); //Broadcast A76 to register mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); #endif //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]); //extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); //extract diag a22 from a mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); //extract diag a33 from a mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); //extract diag a66 from a mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], 
mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); /***************** first set of 8 rows of B processing starts *****************/ ptr_b_dup = ptr_b; i = 0; for (j = 0; j < numCols_b; j += 8) { /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A //read 8x8 block of B into registers mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); /* transpose steps start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], 
mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], 
mat_b_rearr[7], 0x31); /* transpose steps end */ //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], mat_a_diag_inv[0]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5])); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], mat_a_diag_inv[1]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[0])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[1])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[2])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 
1 + cs_l_offset[3])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[4])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[5])); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], mat_a_diag_inv[2]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[1])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[2])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[3])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[4])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[5])); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b) 
//Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], mat_a_diag_inv[3]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[2])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[3])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[4])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[5])); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], mat_a_diag_inv[4]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[3])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[4])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[5])); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], mat_a_diag_inv[5]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[4])); mat_a_blk_elems[1] = 
_mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[5])); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], mat_a_diag_inv[6]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 6 + cs_l_offset[5])); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], mat_a_diag_inv[7]); //////////////////////////////////////////////////////////////////////////////// /* transpose steps start */ ////unpacklow//// mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = 
_mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); ////unpackhigh//// mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); /* transpose steps end */ //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_b_rearr[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_rearr[1]); _mm256_storeu_ps((float *)(ptr_b_dup 
+ cs_b_offset[0]), mat_b_rearr[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_rearr[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_rearr[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_rearr[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_rearr[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_rearr[7]); i += cs_b_offset[6]; ptr_b_dup += cs_b_offset[6]; //i += 8; //ptr_b_dup += 8; } //c = 0; /***************** first set of 8 cols of B processing done *****************/ ptr_b_dup = ptr_b; i3 = 0; i1 = 0; //Start loop for cols of B to be processed in size of blk_width for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row { ptr_l += cs_l_offset[6]; //Read next 8x8 block of A to get diag elements i3 += 8; mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l + i3); mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l); mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]); mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]); mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]); mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]); mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[4]); mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]); //pack 8 diags of A together reciprocal_diags[0] = reciprocal_diags[1]; mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1 mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5 mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 mat_a_diag_inv[2] = 
_mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 //reciprocal of diagnal elements of A :- 0,1,2,3,4,5,6,7 reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); //ptr_b += j; //ptr_b_dup += 8; ptr_b_dup += 8; i1 += 8; i = i1; i2 = 0; //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //mat_a_diag_inv2[0] = _mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]); //extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); //extract diag a22 from a mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); //extract diag a33 from a mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); //extract diag a66 from a mat_a_diag_inv[6] = 
_mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); for (r = 0; r < numCols_b; r += GEMM_BLK_V1) { #if GEMM_ACCUM_A //Read 8 cols of B columns of Block-to-be-solved mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); /* transpose steps start */ ////unpacklow//// mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = 
_mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); ////unpackhigh//// mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 
0x31); /* transpose steps end */ #endif //i = 0; ptr_l_dup = ptr_l; i4 = i2; for (l = 0; l < j; l += 8) // move across m { //for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) //{ /////////////////// Partial Lower 8x8 block trsm of B //Read current 8 cols of B columns from specified 8x8 current-block of B mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_b + i4); mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); /* transpose steps start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]); mat_b_col[3] = _mm256_unpacklo_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); 
mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_a_blk_elems[0] = _mm256_unpackhi_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]); mat_a_blk_elems[1] = _mm256_unpackhi_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]); mat_a_blk_elems[2] = _mm256_unpackhi_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]); mat_a_blk_elems[3] = _mm256_unpackhi_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_a_blk_elems[4] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x44); mat_a_blk_elems[5] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xEE); mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x44); mat_a_blk_elems[7] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xEE); #else mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x4E); mat_a_blk_elems[7] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x4E); mat_a_blk_elems[4] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[6], 0xCC); mat_a_blk_elems[5] = _mm256_blend_ps(mat_a_blk_elems[1], mat_a_blk_elems[6], 0x33); mat_a_blk_elems[6] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[7], 0xCC); mat_a_blk_elems[7] = _mm256_blend_ps(mat_a_blk_elems[3], mat_a_blk_elems[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x20); mat_b_col[7] = 
_mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x31); /* transpose steps end */ //Broadcast A8,0 to A15,0 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); //i4 = k >> 3; ptr_l_dup++; #if GEMM_ACCUM_A //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]); mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], mat_b_col[0]); mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]); mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], 
mat_b_col[0]); mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]); mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]); mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], mat_b_col[0]); mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]); #endif //Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c 
- (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,2 to A15,2 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], 
mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,3 to A15,3 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = 
_mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,4 to A15,4 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float 
const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d 
= c - (a*b) #endif //Broadcast A8,5 to A15,5 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], 
mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,6 to A15,6 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = 
_mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,7 to A15,7 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = 
c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #endif //end loop of cols //} //i2 += cs_b_offset[6]; i4 += 8; } //trsm solve k = 0; //for (i2 = 0; i2 < numCols_b; i2 += 8) //{ //i2 = i1 + r; /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A #if !GEMM_ACCUM_A //Read 8 cols of B columns of Block-to-be-solved mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); 
mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); /* transpose steps start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = 
_mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); /* transpose steps end */ #endif //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); //i += cs_l; #if GEMM_ACCUM_A //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); #else mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], 
mat_b_rearr[0]); mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); #endif #if GEMM_ACCUM_A mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], mat_b_rearr[1]); mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]); mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], mat_b_rearr[3]); mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]); mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]); mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]); mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], mat_b_rearr[7]); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = 
_mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[0])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[1])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[2])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[3])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[4])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[5])); //i += cs_l; //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A32 to A72 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[1])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[2])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[3])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[4])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 
2 + cs_l_offset[5])); //i += cs_l; //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A43 to A73 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[2])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[3])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[4])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[5])); //i += cs_l; //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A54 to A74 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[3])); mat_a_blk_elems[1] = 
_mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[4])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[5])); //i += cs_l; //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A65 to A75 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[4])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[5])); //i += cs_l; //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A76 to register mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 6 + cs_l_offset[5])); //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); 
//////////////////////////////////////////////////////////////////////////////// /* transpose steps start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = 
_mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); /* transpose steps end */ //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup + i2, mat_b_col[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)+i2), mat_b_col[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i2), mat_b_col[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i2), mat_b_col[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i2), mat_b_col[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i2), mat_b_col[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i2), mat_b_col[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i2), mat_b_col[7]); //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); k++; //} i += cs_b_offset[6]; i2 += cs_b_offset[6]; } } //numRows of A ///////////////////loop ends ///////////////////// } static void trsm_AutXB_block_allSmallSizedMatrices_alpha(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha) { float ones = 1.0; int i, i1, i2, i3, i4, j, k, l, r; int 
cs_b_offset[7]; int cs_l_offset[7]; float *ptr_b_dup, *ptr_l_dup; //57 number of ymm(256 bits) registers used __m256 mat_b_col[8]; __m256 mat_b_rearr[8]; __m256 mat_a_blk_elems[8]; __m256 mat_a_diag_inv[8]; __m256 reciprocal_diags[2]; __m256 alphaReg; reciprocal_diags[0] = _mm256_broadcast_ss((float const *)(&ones)); alphaReg = _mm256_broadcast_ss((float const *)&alpha); // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // //L matrix offsets cs_l_offset[0] = (cs_l << 1); cs_l_offset[1] = cs_l + cs_l_offset[0]; cs_l_offset[2] = (cs_l << 2); cs_l_offset[3] = cs_l + cs_l_offset[2]; cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; cs_l_offset[5] = cs_l + cs_l_offset[4]; cs_l_offset[6] = (cs_l_offset[5] + cs_l); //read diag elems of L 16x16 block mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l); mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)ptr_l + cs_l); mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[0]); mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[1]); mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[2]); mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[3]); mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[4]); mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + cs_l_offset[5]); cs_b_offset[0] = (cs_b << 1); cs_b_offset[1] = cs_b + cs_b_offset[0]; cs_b_offset[2] = (cs_b << 2); cs_b_offset[3] = cs_b + cs_b_offset[2]; cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; cs_b_offset[5] = cs_b + cs_b_offset[4]; cs_b_offset[6] = (cs_b_offset[5] + cs_b); reciprocal_diags[1] = reciprocal_diags[0]; //pack first 8 diags together mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1 mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5 
mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 //reciprocal of diagnal elements 0,1,2,3,4,5,6,7 reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); #if 0 //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //Broadcast A21 to A71 to registers mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); //Broadcast A32 to A72 to registers mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + 
cs_l_offset[0] + 7)); //Broadcast A43 to A73 to registers mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); //Broadcast A54 to A74 to registers mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); //Broadcast A65 to A75 to registers mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); //Broadcast A76 to register mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); #endif //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //mat_a_diag_inv[0] = _mm256_unpacklo_ps(mat_a_diag_inv[0], mat_a_diag_inv[0]); //extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); //extract diag a22 from a mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); //extract diag a33 from a mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 
0x00); //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); //extract diag a66 from a mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); /***************** first set of 8 rows of B processing starts *****************/ ptr_b_dup = ptr_b; i = 0; for (j = 0; j < numCols_b; j += 8) { /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A //read 8x8 block of B into registers mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + 
cs_b_offset[5] + i)); /* transpose steps start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = 
_mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); /* transpose steps end */ mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg); mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg); mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], alphaReg); mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg); mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg); mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], alphaReg); mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg); mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg); //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], mat_a_diag_inv[0]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4])); mat_a_blk_elems[6] = 
_mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5])); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], mat_a_diag_inv[1]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[0])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[1])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[2])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[3])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[4])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[5])); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = 
_mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], mat_a_diag_inv[2]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[1])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[2])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[3])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[4])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[5])); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], mat_a_diag_inv[3]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[2])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[3])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[4])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[5])); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b) 
mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], mat_a_diag_inv[4]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[3])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[4])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[5])); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], mat_a_diag_inv[5]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[4])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[5])); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], mat_a_diag_inv[6]); mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 6 + cs_l_offset[5])); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_col[7] = 
_mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], mat_a_diag_inv[7]); //////////////////////////////////////////////////////////////////////////////// /* transpose steps start */ ////unpacklow//// mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); ////unpackhigh//// mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); mat_b_col[3] = 
_mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); /* transpose steps end */ //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_b_rearr[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_rearr[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_rearr[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_rearr[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_rearr[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_rearr[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_rearr[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_rearr[7]); i += cs_b_offset[6]; ptr_b_dup += cs_b_offset[6]; //i += 8; //ptr_b_dup += 8; } //c = 0; /***************** first set of 8 cols of B processing done *****************/ ptr_b_dup = ptr_b; i3 = 0; i1 = 0; //Start loop for cols of B to be processed in 
size of blk_width for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row { ptr_l += cs_l_offset[6]; //Read next 8x8 block of A to get diag elements i3 += 8; mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_l + i3); mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l); mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[0]); mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[1]); mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[2]); mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[3]); mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[4]); mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)ptr_l + i3 + cs_l_offset[5]); //pack 8 diags of A together reciprocal_diags[0] = reciprocal_diags[1]; mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xAA);//diag 0,1 mat_a_diag_inv[1] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xAA);//diag 2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_blk_elems[4], mat_a_blk_elems[5], 0xAA);//diag 4,5 mat_a_diag_inv[3] = _mm256_blend_ps(mat_a_blk_elems[6], mat_a_blk_elems[7], 0xAA);//diag 6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[1], 0xCC);//diag 0,1,2,3 mat_a_diag_inv[2] = _mm256_blend_ps(mat_a_diag_inv[2], mat_a_diag_inv[3], 0xCC);//diag 4,5,6,7 mat_a_diag_inv[0] = _mm256_blend_ps(mat_a_diag_inv[0], mat_a_diag_inv[2], 0xF0);//diag 0,1,2,3,4,5,6,7 //reciprocal of diagnal elements of A :- 0,1,2,3,4,5,6,7 reciprocal_diags[0] = _mm256_div_ps(reciprocal_diags[0], mat_a_diag_inv[0]); //ptr_b += j; //ptr_b_dup += 8; ptr_b_dup += 8; i1 += 8; i = i1; i2 = 0; //extract diag a00 from a mat_a_diag_inv[0] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[0] = _mm256_permute2f128_ps(mat_a_diag_inv[0], mat_a_diag_inv[0], 0x00); //mat_a_diag_inv2[0] = _mm256_unpacklo_ps(mat_a_diag_inv2[0], mat_a_diag_inv2[0]); 
//extract diag a11 from a mat_a_diag_inv[1] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[1] = _mm256_permute2f128_ps(mat_a_diag_inv[1], mat_a_diag_inv[1], 0x00); //mat_a_diag_inv[1] = _mm256_unpacklo_ps(mat_a_diag_inv[1], mat_a_diag_inv[1]); //extract diag a22 from a mat_a_diag_inv[2] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[2] = _mm256_permute2f128_ps(mat_a_diag_inv[2], mat_a_diag_inv[2], 0x00); //mat_a_diag_inv[2] = _mm256_unpacklo_ps(mat_a_diag_inv[2], mat_a_diag_inv[2]); //extract diag a33 from a mat_a_diag_inv[3] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[3] = _mm256_permute2f128_ps(mat_a_diag_inv[3], mat_a_diag_inv[3], 0x00); //mat_a_diag_inv[3] = _mm256_unpacklo_ps(mat_a_diag_inv[3], mat_a_diag_inv[3]); //extract diag a44 from a mat_a_diag_inv[4] = _mm256_permute_ps(reciprocal_diags[0], 0x00); mat_a_diag_inv[4] = _mm256_permute2f128_ps(mat_a_diag_inv[4], mat_a_diag_inv[4], 0x11); //mat_a_diag_inv[4] = _mm256_unpacklo_ps(mat_a_diag_inv[4], mat_a_diag_inv[4]); //extract diag a55 from a mat_a_diag_inv[5] = _mm256_permute_ps(reciprocal_diags[0], 0x55); mat_a_diag_inv[5] = _mm256_permute2f128_ps(mat_a_diag_inv[5], mat_a_diag_inv[5], 0x11); //mat_a_diag_inv[5] = _mm256_unpacklo_ps(mat_a_diag_inv[5], mat_a_diag_inv[5]); //extract diag a66 from a mat_a_diag_inv[6] = _mm256_permute_ps(reciprocal_diags[0], 0xAA); mat_a_diag_inv[6] = _mm256_permute2f128_ps(mat_a_diag_inv[6], mat_a_diag_inv[6], 0x11); //mat_a_diag_inv[6] = _mm256_unpacklo_ps(mat_a_diag_inv[6], mat_a_diag_inv[6]); //extract diag a77 from a mat_a_diag_inv[7] = _mm256_permute_ps(reciprocal_diags[0], 0xFF); mat_a_diag_inv[7] = _mm256_permute2f128_ps(mat_a_diag_inv[7], mat_a_diag_inv[7], 0x11); //mat_a_diag_inv[7] = _mm256_unpacklo_ps(mat_a_diag_inv[7], mat_a_diag_inv[7]); for (r = 0; r < numCols_b; r += GEMM_BLK_V1) { #if GEMM_ACCUM_A //Read 8 cols of B columns of Block-to-be-solved mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i); 
mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); /* transpose steps start */ ////unpacklow//// mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); 
////unpackhigh//// mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); /* transpose steps end */ mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); #endif //i = 0; ptr_l_dup = ptr_l; i4 = i2; for (l = 0; l < j; l += 8) // move across m { //for (k = 0; k < numCols_b; k += 8) // move across n for the 
same value of l (index of m) //{ /////////////////// Partial Lower 8x8 block trsm of B //Read current 8 cols of B columns from specified 8x8 current-block of B mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_b + i4); mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); /* transpose steps start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]); mat_b_col[3] = _mm256_unpacklo_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 
0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_a_blk_elems[0] = _mm256_unpackhi_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]); mat_a_blk_elems[1] = _mm256_unpackhi_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]); mat_a_blk_elems[2] = _mm256_unpackhi_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]); mat_a_blk_elems[3] = _mm256_unpackhi_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_a_blk_elems[4] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x44); mat_a_blk_elems[5] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xEE); mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x44); mat_a_blk_elems[7] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xEE); #else mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x4E); mat_a_blk_elems[7] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x4E); mat_a_blk_elems[4] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[6], 0xCC); mat_a_blk_elems[5] = _mm256_blend_ps(mat_a_blk_elems[1], mat_a_blk_elems[6], 0x33); mat_a_blk_elems[6] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[7], 0xCC); mat_a_blk_elems[7] = _mm256_blend_ps(mat_a_blk_elems[3], mat_a_blk_elems[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x20); mat_b_col[7] = _mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x31); /* transpose steps end */ //Broadcast A8,0 to A15,0 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const 
*)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); //i4 = k >> 3; ptr_l_dup++; #if GEMM_ACCUM_A //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]); mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], mat_b_col[0]); mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]); mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], mat_b_col[0]); mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]); mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]); mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], 
mat_b_col[0]); mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]); #endif //Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c 
- (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,2 to A15,2 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], 
mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,3 to A15,3 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = 
_mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,4 to A15,4 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 
cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,5 to A15,5 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup 
+ cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], 
mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,6 to A15,6 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = 
_mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,7 to A15,7 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - 
(a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #endif //end loop of cols //} //i2 += cs_b_offset[6]; i4 += 8; } //trsm solve k = 0; //for (i2 = 0; i2 < numCols_b; i2 += 8) //{ //i2 = i1 + r; /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A #if !GEMM_ACCUM_A //Read 8 cols of B columns of Block-to-be-solved mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7] = _mm256_loadu_ps((float const 
*)(ptr_b + cs_b_offset[5] + i)); /* transpose steps start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); 
mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); /* transpose steps end */ mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg); mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg); mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], alphaReg); mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg); mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg); mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], alphaReg); mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg); mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg); #endif //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); //i += cs_l; #if 
GEMM_ACCUM_A //(Row0): Perform mul operation of reciprocal of L(0,0) element with 1st row elements of B mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); #else mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], mat_b_rearr[0]); mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], mat_a_diag_inv[0]); #endif #if GEMM_ACCUM_A mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], mat_b_rearr[1]); mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]); mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], mat_b_rearr[3]); mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]); mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]); mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]); mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], mat_b_rearr[7]); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = 
_mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[0])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[1])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[2])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[3])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[4])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[5])); //i += cs_l; //Perform mul operation of reciprocal of L(1,1) element with 2nd row elements of B mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], mat_a_diag_inv[1]); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A32 to A72 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[1])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[2])); mat_a_blk_elems[2] = 
_mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[3])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[4])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[5])); //i += cs_l; //Perform mul operation of reciprocal of L(2, 2) element with 3rd row elements of B mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], mat_a_diag_inv[2]); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A43 to A73 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[2])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[3])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[4])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[5])); //i += cs_l; //Perform mul operation of reciprocal of L(3, 3) element with 4rth row elements of B mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], mat_a_diag_inv[3]); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = 
_mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A54 to A74 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[3])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[4])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[5])); //i += cs_l; //Perform mul operation of reciprocal of L(4, 4) element with 4rth row elements of B mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], mat_a_diag_inv[4]); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A65 to A75 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[4])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[5])); //i += cs_l; //Perform mul operation of reciprocal of L(5, 5) element with 5th row elements of B mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], mat_a_diag_inv[5]); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A76 to register mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 6 + cs_l_offset[5])); //Perform mul operation of reciprocal of L(6, 6) element with 6th row elements of B mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], mat_a_diag_inv[6]); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], 
mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) //Perform mul operation of reciprocal of L(7, 7) element with 7th row elements of B mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], mat_a_diag_inv[7]); //////////////////////////////////////////////////////////////////////////////// /* transpose steps start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if 
REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); /* transpose steps end */ //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup + i2, mat_b_col[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)+i2), mat_b_col[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i2), mat_b_col[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i2), mat_b_col[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i2), mat_b_col[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i2), mat_b_col[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i2), mat_b_col[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i2), mat_b_col[7]); //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); k++; //} i += cs_b_offset[6]; i2 += cs_b_offset[6]; } } //numRows of A ///////////////////loop ends ///////////////////// } static void 
trsm_AutXB_block_allSmallSizedMatrices_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b) { //float ones = 1.0; int i, i1, i2, i4, j, k, l, r; int cs_b_offset[7]; int cs_l_offset[7]; float *ptr_b_dup, *ptr_l_dup; //57 number of ymm(256 bits) registers used __m256 mat_b_col[8]; __m256 mat_b_rearr[8]; __m256 mat_a_blk_elems[8]; //__m256 mat_a_diag_inv[8]; //__m256 reciprocal_diags[2]; // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // //L matrix offsets cs_l_offset[0] = (cs_l << 1); cs_l_offset[1] = cs_l + cs_l_offset[0]; cs_l_offset[2] = (cs_l << 2); cs_l_offset[3] = cs_l + cs_l_offset[2]; cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; cs_l_offset[5] = cs_l + cs_l_offset[4]; cs_l_offset[6] = (cs_l_offset[5] + cs_l); cs_b_offset[0] = (cs_b << 1); cs_b_offset[1] = cs_b + cs_b_offset[0]; cs_b_offset[2] = (cs_b << 2); cs_b_offset[3] = cs_b + cs_b_offset[2]; cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; cs_b_offset[5] = cs_b + cs_b_offset[4]; cs_b_offset[6] = (cs_b_offset[5] + cs_b); #if 0 //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //Broadcast A21 to A71 to registers mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); mat_a_blk_elems[11] = 
_mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); //Broadcast A32 to A72 to registers mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); //Broadcast A43 to A73 to registers mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); mat_a_blk_elems[19] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); //Broadcast A54 to A74 to registers mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); //Broadcast A65 to A75 to registers mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); //Broadcast A76 to register mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); #endif /***************** first set of 8 rows of B processing starts *****************/ ptr_b_dup = ptr_b; i = 0; for (j = 0; j < numCols_b; j += 8) { /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A //read 8x8 block of B into registers mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1] = 
_mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); /* transpose steps start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], 
mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); /* transpose steps end */ //(Row0) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5])); //(Row1): FMA 
operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[0])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[1])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[2])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[3])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[4])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[5])); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + 
cs_l_offset[1])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[2])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[3])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[4])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[5])); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[2])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[3])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[4])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[5])); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[3])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[4])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + 
cs_l_offset[5])); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[4])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[5])); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 6 + cs_l_offset[5])); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b) //////////////////////////////////////////////////////////////////////////////// /* transpose steps start */ ////unpacklow//// mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = 
_mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); ////unpackhigh//// mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 
0x31); /* transpose steps end */ //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_b_rearr[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_rearr[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_rearr[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1]), mat_b_rearr[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_rearr[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_rearr[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_rearr[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_rearr[7]); i += cs_b_offset[6]; ptr_b_dup += cs_b_offset[6]; //i += 8; //ptr_b_dup += 8; } //c = 0; /***************** first set of 8 cols of B processing done *****************/ ptr_b_dup = ptr_b; i1 = 0; //Start loop for cols of B to be processed in size of blk_width for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row { ptr_l += cs_l_offset[6]; //ptr_b += j; //ptr_b_dup += 8; ptr_b_dup += 8; i1 += 8; i = i1; i2 = 0; for (r = 0; r < numCols_b; r += GEMM_BLK_V1) { #if GEMM_ACCUM_A //Read 8 cols of B columns of Block-to-be-solved mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); /* transpose steps start */ ////unpacklow//// mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); mat_b_rearr[3] = 
_mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); ////unpackhigh//// mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], 
mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); /* transpose steps end */ #endif //i = 0; ptr_l_dup = ptr_l; i4 = i2; for (l = 0; l < j; l += 8) // move across m { //for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) //{ /////////////////// Partial Lower 8x8 block trsm of B //Read current 8 cols of B columns from specified 8x8 current-block of B mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_b + i4); mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); /* transpose steps start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]); mat_b_col[3] = _mm256_unpacklo_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = 
_mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_a_blk_elems[0] = _mm256_unpackhi_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]); mat_a_blk_elems[1] = _mm256_unpackhi_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]); mat_a_blk_elems[2] = _mm256_unpackhi_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]); mat_a_blk_elems[3] = _mm256_unpackhi_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_a_blk_elems[4] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x44); mat_a_blk_elems[5] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xEE); mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x44); mat_a_blk_elems[7] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xEE); #else mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x4E); mat_a_blk_elems[7] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x4E); mat_a_blk_elems[4] = _mm256_blend_ps(mat_a_blk_elems[0], 
mat_a_blk_elems[6], 0xCC); mat_a_blk_elems[5] = _mm256_blend_ps(mat_a_blk_elems[1], mat_a_blk_elems[6], 0x33); mat_a_blk_elems[6] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[7], 0xCC); mat_a_blk_elems[7] = _mm256_blend_ps(mat_a_blk_elems[3], mat_a_blk_elems[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x20); mat_b_col[7] = _mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x31); /* transpose steps end */ //Broadcast A8,0 to A15,0 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); //i4 = k >> 3; ptr_l_dup++; #if GEMM_ACCUM_A //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], 
mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]); mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], mat_b_col[0]); mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]); mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], mat_b_col[0]); mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]); mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]); mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], mat_b_col[0]); mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]); #endif //Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - 
(a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,2 to A15,2 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup 
+ cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,3 to A15,3 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 
cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = 
_mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,4 to A15,4 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], 
mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,5 to A15,5 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = 
_mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,6 to A15,6 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row14): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = 
c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,7 to A15,7 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = 
_mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #endif //end loop of cols //} //i2 += 
cs_b_offset[6]; i4 += 8; } //trsm solve k = 0; //for (i2 = 0; i2 < numCols_b; i2 += 8) //{ //i2 = i1 + r; /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A #if !GEMM_ACCUM_A //Read 8 cols of B columns of Block-to-be-solved mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); /* transpose steps start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] 
= _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); /* transpose steps end */ #endif //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[2] = 
_mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); //i += cs_l; #if GEMM_ACCUM_A //(Row0): already done #else mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], mat_b_rearr[0]); #endif #if GEMM_ACCUM_A mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], mat_b_rearr[1]); mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]); mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], mat_b_rearr[3]); mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]); mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]); mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]); mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], mat_b_rearr[7]); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) 
mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[0])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[1])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[2])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[3])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[4])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[5])); //i += cs_l; //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A32 to A72 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[1])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup 
+ 2 + cs_l_offset[2])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[3])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[4])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[5])); //i += cs_l; //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A43 to A73 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[2])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[3])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[4])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[5])); //i += cs_l; //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A54 to A74 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[3])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup 
+ 4 + cs_l_offset[4])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[5])); //i += cs_l; //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A65 to A75 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[4])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[5])); //i += cs_l; //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A76 to register mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 6 + cs_l_offset[5])); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) //////////////////////////////////////////////////////////////////////////////// /* transpose steps start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = 
_mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = 
_mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); /* transpose steps end */ //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup + i2, mat_b_col[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)+i2), mat_b_col[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i2), mat_b_col[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i2), mat_b_col[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i2), mat_b_col[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i2), mat_b_col[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i2), mat_b_col[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i2), mat_b_col[7]); //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); k++; //} i += cs_b_offset[6]; i2 += cs_b_offset[6]; } } //numRows of A ///////////////////loop ends ///////////////////// } static void trsm_AutXB_block_allSmallSizedMatrices_alpha_unitDiag(float *ptr_l, float *ptr_b, int numRows_lb, int numCols_b, int rs_l, int rs_b, int cs_l, int cs_b, float alpha) { //float ones = 1.0; int i, i1, i2, i4, j, k, l, r; int cs_b_offset[7]; int cs_l_offset[7]; float *ptr_b_dup, *ptr_l_dup; //57 number of ymm(256 bits) registers used __m256 mat_b_col[8]; __m256 mat_b_rearr[8]; __m256 mat_a_blk_elems[8]; //__m256 mat_a_diag_inv[8]; //__m256 reciprocal_diags[2]; __m256 alphaReg; alphaReg = _mm256_broadcast_ss((float const *)&alpha); // ---> considering that the matrix size is multiple of 16 rows and 8 cols <--- // //L matrix offsets cs_l_offset[0] = (cs_l << 1); cs_l_offset[1] = cs_l + cs_l_offset[0]; cs_l_offset[2] = (cs_l << 2); cs_l_offset[3] = cs_l + cs_l_offset[2]; cs_l_offset[4] = cs_l_offset[0] + cs_l_offset[2]; cs_l_offset[5] = cs_l + 
cs_l_offset[4]; cs_l_offset[6] = (cs_l_offset[5] + cs_l); cs_b_offset[0] = (cs_b << 1); cs_b_offset[1] = cs_b + cs_b_offset[0]; cs_b_offset[2] = (cs_b << 2); cs_b_offset[3] = cs_b + cs_b_offset[2]; cs_b_offset[4] = cs_b_offset[0] + cs_b_offset[2]; cs_b_offset[5] = cs_b + cs_b_offset[4]; cs_b_offset[6] = (cs_b_offset[5] + cs_b); #if 0 //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3)); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 4)); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 5)); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 6)); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + 7)); //Broadcast A21 to A71 to registers mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 2)); mat_a_blk_elems[8] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 3)); mat_a_blk_elems[9] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 4)); mat_a_blk_elems[10] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 5)); mat_a_blk_elems[11] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 6)); mat_a_blk_elems[12] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l + 7)); //Broadcast A32 to A72 to registers mat_a_blk_elems[13] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 3)); mat_a_blk_elems[14] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 4)); mat_a_blk_elems[15] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 5)); mat_a_blk_elems[16] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 6)); mat_a_blk_elems[17] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0] + 7)); //Broadcast A43 to A73 to registers mat_a_blk_elems[18] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 4)); mat_a_blk_elems[19] = 
_mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 5)); mat_a_blk_elems[20] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 6)); mat_a_blk_elems[21] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1] + 7)); //Broadcast A54 to A74 to registers mat_a_blk_elems[22] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 5)); mat_a_blk_elems[23] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 6)); mat_a_blk_elems[24] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2] + 7)); //Broadcast A65 to A75 to registers mat_a_blk_elems[25] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 6)); mat_a_blk_elems[26] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3] + 7)); //Broadcast A76 to register mat_a_blk_elems[27] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4] + 7)); #endif /***************** first set of 8 rows of B processing starts *****************/ ptr_b_dup = ptr_b; i = 0; for (j = 0; j < numCols_b; j += 8) { /////////////////// Complete Upper 8x8 block trsm of B :- upper 8x8 block of B with upper 8x8 block of A //read 8x8 block of B into registers mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); /* transpose steps start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], 
mat_b_rearr[5]); mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = 
_mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); /* transpose steps end */ mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg); mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg); mat_b_col[2] = _mm256_mul_ps(mat_b_col[2], alphaReg); mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg); mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg); mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], alphaReg); mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg); mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg); //(Row0) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[0])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[1])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[2])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[3])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[4])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l + cs_l_offset[5])); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_col[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_col[1]);//d = c - (a*b) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = 
_mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[0])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[1])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[2])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[3])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[4])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l + 1 + cs_l_offset[5])); //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_col[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_col[2]);//d = c - (a*b) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[1])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[2])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[3])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[4])); 
mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l + 2 + cs_l_offset[5])); //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_col[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_col[3]);//d = c - (a*b) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[2])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[3])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[4])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l + 3 + cs_l_offset[5])); //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_col[4] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_col[4]);//d = c - (a*b) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[3])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[4])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l + 4 + cs_l_offset[5])); //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_col[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_col[5]);//d = c - (a*b) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_col[6]);//d = c - (a*b) 
mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[4])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l + 5 + cs_l_offset[5])); //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_col[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_col[6]);//d = c - (a*b) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_col[7]);//d = c - (a*b) mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l + 6 + cs_l_offset[5])); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_col[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_col[7]);//d = c - (a*b) //////////////////////////////////////////////////////////////////////////////// /* transpose steps start */ ////unpacklow//// mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged low 
elements into complete rows mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); ////unpackhigh//// mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_rearr[2] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); /* transpose steps end */ //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup, mat_b_rearr[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)), mat_b_rearr[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0]), mat_b_rearr[2]); _mm256_storeu_ps((float *)(ptr_b_dup + 
cs_b_offset[1]), mat_b_rearr[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2]), mat_b_rearr[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3]), mat_b_rearr[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4]), mat_b_rearr[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5]), mat_b_rearr[7]); i += cs_b_offset[6]; ptr_b_dup += cs_b_offset[6]; //i += 8; //ptr_b_dup += 8; } //c = 0; /***************** first set of 8 cols of B processing done *****************/ ptr_b_dup = ptr_b; i1 = 0; //Start loop for cols of B to be processed in size of blk_width for (j = 8; j < numRows_lb; j += 8)//m :- 8x8 block row { ptr_l += cs_l_offset[6]; //ptr_b += j; //ptr_b_dup += 8; ptr_b_dup += 8; i1 += 8; i = i1; i2 = 0; for (r = 0; r < numCols_b; r += GEMM_BLK_V1) { #if GEMM_ACCUM_A //Read 8 cols of B columns of Block-to-be-solved mat_b_col[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_col[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_col[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_col[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_col[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_col[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_col[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_col[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); /* transpose steps start */ ////unpacklow//// mat_b_rearr[0] = _mm256_unpacklo_ps(mat_b_col[0], mat_b_col[1]); mat_b_rearr[1] = _mm256_unpacklo_ps(mat_b_col[2], mat_b_col[3]); mat_b_rearr[2] = _mm256_unpacklo_ps(mat_b_col[4], mat_b_col[5]); mat_b_rearr[3] = _mm256_unpacklo_ps(mat_b_col[6], mat_b_col[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], 
mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_rearr[0] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_rearr[4] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_rearr[1] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_rearr[5] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); ////unpackhigh//// mat_b_col[0] = _mm256_unpackhi_ps(mat_b_col[0], mat_b_col[1]); mat_b_col[1] = _mm256_unpackhi_ps(mat_b_col[2], mat_b_col[3]); mat_b_col[2] = _mm256_unpackhi_ps(mat_b_col[4], mat_b_col[5]); mat_b_col[3] = _mm256_unpackhi_ps(mat_b_col[6], mat_b_col[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_rearr[2] = 
_mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_rearr[6] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_rearr[3] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_rearr[7] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); /* transpose steps end */ mat_b_rearr[0] = _mm256_mul_ps(mat_b_rearr[0], alphaReg); mat_b_rearr[1] = _mm256_mul_ps(mat_b_rearr[1], alphaReg); mat_b_rearr[2] = _mm256_mul_ps(mat_b_rearr[2], alphaReg); mat_b_rearr[3] = _mm256_mul_ps(mat_b_rearr[3], alphaReg); mat_b_rearr[4] = _mm256_mul_ps(mat_b_rearr[4], alphaReg); mat_b_rearr[5] = _mm256_mul_ps(mat_b_rearr[5], alphaReg); mat_b_rearr[6] = _mm256_mul_ps(mat_b_rearr[6], alphaReg); mat_b_rearr[7] = _mm256_mul_ps(mat_b_rearr[7], alphaReg); #endif //i = 0; ptr_l_dup = ptr_l; i4 = i2; for (l = 0; l < j; l += 8) // move across m { //for (k = 0; k < numCols_b; k += 8) // move across n for the same value of l (index of m) //{ /////////////////// Partial Lower 8x8 block trsm of B //Read current 8 cols of B columns from specified 8x8 current-block of B mat_a_blk_elems[0] = _mm256_loadu_ps((float const *)ptr_b + i4); mat_a_blk_elems[1] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b)); mat_a_blk_elems[2] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[0])); mat_a_blk_elems[3] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[1])); mat_a_blk_elems[4] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[2])); mat_a_blk_elems[5] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[3])); mat_a_blk_elems[6] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[4])); mat_a_blk_elems[7] = _mm256_loadu_ps((float const *)(ptr_b + i4 + cs_b_offset[5])); /* transpose steps start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]); 
mat_b_col[3] = _mm256_unpacklo_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_a_blk_elems[0] = _mm256_unpackhi_ps(mat_a_blk_elems[0], mat_a_blk_elems[1]); mat_a_blk_elems[1] = _mm256_unpackhi_ps(mat_a_blk_elems[2], mat_a_blk_elems[3]); mat_a_blk_elems[2] = _mm256_unpackhi_ps(mat_a_blk_elems[4], mat_a_blk_elems[5]); mat_a_blk_elems[3] = _mm256_unpackhi_ps(mat_a_blk_elems[6], mat_a_blk_elems[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_a_blk_elems[4] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x44); mat_a_blk_elems[5] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0xEE); mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x44); mat_a_blk_elems[7] = _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0xEE); #else mat_a_blk_elems[6] = _mm256_shuffle_ps(mat_a_blk_elems[0], mat_a_blk_elems[1], 0x4E); mat_a_blk_elems[7] 
= _mm256_shuffle_ps(mat_a_blk_elems[2], mat_a_blk_elems[3], 0x4E); mat_a_blk_elems[4] = _mm256_blend_ps(mat_a_blk_elems[0], mat_a_blk_elems[6], 0xCC); mat_a_blk_elems[5] = _mm256_blend_ps(mat_a_blk_elems[1], mat_a_blk_elems[6], 0x33); mat_a_blk_elems[6] = _mm256_blend_ps(mat_a_blk_elems[2], mat_a_blk_elems[7], 0xCC); mat_a_blk_elems[7] = _mm256_blend_ps(mat_a_blk_elems[3], mat_a_blk_elems[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_a_blk_elems[4], mat_a_blk_elems[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x20); mat_b_col[7] = _mm256_permute2f128_ps(mat_a_blk_elems[5], mat_a_blk_elems[7], 0x31); /* transpose steps end */ //Broadcast A8,0 to A15,0 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); //i4 = k >> 3; ptr_l_dup++; #if GEMM_ACCUM_A //(Row8): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[0], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = 
_mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_mul_ps(mat_a_blk_elems[0], mat_b_col[0]); mat_b_rearr[1] = _mm256_mul_ps(mat_a_blk_elems[1], mat_b_col[0]); mat_b_rearr[2] = _mm256_mul_ps(mat_a_blk_elems[2], mat_b_col[0]); mat_b_rearr[3] = _mm256_mul_ps(mat_a_blk_elems[3], mat_b_col[0]); mat_b_rearr[4] = _mm256_mul_ps(mat_a_blk_elems[4], mat_b_col[0]); mat_b_rearr[5] = _mm256_mul_ps(mat_a_blk_elems[5], mat_b_col[0]); mat_b_rearr[6] = _mm256_mul_ps(mat_a_blk_elems[6], mat_b_col[0]); mat_b_rearr[7] = _mm256_mul_ps(mat_a_blk_elems[7], mat_b_col[0]); #endif //Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row9): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], 
mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[1], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[1], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[1], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,2 to A15,2 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); 
mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row10): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[2], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[2], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[2], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[2], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,3 to A15,3 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const 
*)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row11): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[3], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[3], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[3], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[3], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[3], 
mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[3], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,4 to A15,4 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row12): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = 
_mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[4], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[4], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[4], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[4], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[4], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[4], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,5 to A15,5 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row13): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - 
(a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[5], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[5], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[5], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[5], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[5], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[5], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[5], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,6 to A15,6 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row14): FMA operations of b2 with elements of 
indices from (2, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[6], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[6], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[6], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[6], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[6], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[6], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[6], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[6], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A8,7 to A15,7 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); 
mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[7] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); ptr_l_dup++; #if GEMM_ACCUM_A //(Row15): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[0] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[0] = _mm256_fmadd_ps(mat_a_blk_elems[0], mat_b_col[7], mat_b_rearr[0]);//d = c - (a*b) mat_b_rearr[1] = _mm256_fmadd_ps(mat_a_blk_elems[1], mat_b_col[7], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fmadd_ps(mat_a_blk_elems[2], mat_b_col[7], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fmadd_ps(mat_a_blk_elems[3], mat_b_col[7], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fmadd_ps(mat_a_blk_elems[4], mat_b_col[7], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fmadd_ps(mat_a_blk_elems[5], mat_b_col[7], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fmadd_ps(mat_a_blk_elems[6], mat_b_col[7], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = 
_mm256_fmadd_ps(mat_a_blk_elems[7], mat_b_col[7], mat_b_rearr[7]);//d = c - (a*b) #endif //end loop of cols //} //i2 += cs_b_offset[6]; i4 += 8; } //trsm solve k = 0; //for (i2 = 0; i2 < numCols_b; i2 += 8) //{ //i2 = i1 + r; /////////////////// Complete Lower 8x8 block trsm of B :- lower 8x8 block of B with lower right 8x8 block of A #if !GEMM_ACCUM_A //Read 8 cols of B columns of Block-to-be-solved mat_b_rearr[0] = _mm256_loadu_ps((float const *)ptr_b + i); mat_b_rearr[1] = _mm256_loadu_ps((float const *)(ptr_b + cs_b + i)); mat_b_rearr[2] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[0] + i)); mat_b_rearr[3] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[1] + i)); mat_b_rearr[4] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[2] + i)); mat_b_rearr[5] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[3] + i)); mat_b_rearr[6] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[4] + i)); mat_b_rearr[7] = _mm256_loadu_ps((float const *)(ptr_b + cs_b_offset[5] + i)); /* transpose steps start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = 
_mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); /* transpose steps end */ mat_b_col[0] = _mm256_mul_ps(mat_b_col[0], alphaReg); mat_b_col[1] = _mm256_mul_ps(mat_b_col[1], alphaReg); mat_b_col[2] = 
_mm256_mul_ps(mat_b_col[2], alphaReg); mat_b_col[3] = _mm256_mul_ps(mat_b_col[3], alphaReg); mat_b_col[4] = _mm256_mul_ps(mat_b_col[4], alphaReg); mat_b_col[5] = _mm256_mul_ps(mat_b_col[5], alphaReg); mat_b_col[6] = _mm256_mul_ps(mat_b_col[6], alphaReg); mat_b_col[7] = _mm256_mul_ps(mat_b_col[7], alphaReg); #endif //Broadcast A10 to A70 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l)); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[0])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[1])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[2])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[3])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[4])); mat_a_blk_elems[6] = _mm256_broadcast_ss((float const *)(ptr_l_dup + cs_l_offset[5])); //i += cs_l; #if GEMM_ACCUM_A //(Row0): already done #else mat_b_rearr[0] = _mm256_sub_ps(mat_b_col[0], mat_b_rearr[0]); #endif #if GEMM_ACCUM_A mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #else mat_b_rearr[1] = _mm256_sub_ps(mat_b_col[1], mat_b_rearr[1]); mat_b_rearr[2] = _mm256_sub_ps(mat_b_col[2], mat_b_rearr[2]); mat_b_rearr[3] = _mm256_sub_ps(mat_b_col[3], 
mat_b_rearr[3]); mat_b_rearr[4] = _mm256_sub_ps(mat_b_col[4], mat_b_rearr[4]); mat_b_rearr[5] = _mm256_sub_ps(mat_b_col[5], mat_b_rearr[5]); mat_b_rearr[6] = _mm256_sub_ps(mat_b_col[6], mat_b_rearr[6]); mat_b_rearr[7] = _mm256_sub_ps(mat_b_col[7], mat_b_rearr[7]); //(Row1): FMA operations of b1 with elements of indices from (1, 0) uptill (7, 0) mat_b_rearr[1] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[0], mat_b_rearr[1]);//d = c - (a*b) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[0], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[0], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[0], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[0], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[0], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[6], mat_b_rearr[0], mat_b_rearr[7]);//d = c - (a*b) #endif //Broadcast A21 to A71 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[0])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[1])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[2])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[3])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[4])); mat_a_blk_elems[5] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 1 + cs_l_offset[5])); //i += cs_l; //(Row2): FMA operations of b2 with elements of indices from (2, 0) uptill (7, 0) mat_b_rearr[2] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[1], mat_b_rearr[2]);//d = c - (a*b) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[1], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = 
_mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[1], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[1], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[1], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[5], mat_b_rearr[1], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A32 to A72 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[1])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[2])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[3])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[4])); mat_a_blk_elems[4] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 2 + cs_l_offset[5])); //i += cs_l; //(Row3): FMA operations of b3 with elements of indices from (3, 0) uptill (7, 0) mat_b_rearr[3] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[2], mat_b_rearr[3]);//d = c - (a*b) mat_b_rearr[4] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[2], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[2], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[2], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[4], mat_b_rearr[2], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A43 to A73 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[2])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[3])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[4])); mat_a_blk_elems[3] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 3 + cs_l_offset[5])); //i += cs_l; //(Row4): FMA operations of b4 with elements of indices from (4, 0) uptill (7, 0) mat_b_rearr[4] = 
_mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[3], mat_b_rearr[4]);//d = c - (a*b) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[3], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[3], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[3], mat_b_rearr[3], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A54 to A74 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[3])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[4])); mat_a_blk_elems[2] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 4 + cs_l_offset[5])); //i += cs_l; //(Row5): FMA operations of b5 with elements of indices from (5, 0) uptill (7, 0) mat_b_rearr[5] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[4], mat_b_rearr[5]);//d = c - (a*b) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[4], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[2], mat_b_rearr[4], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A65 to A75 to registers mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[4])); mat_a_blk_elems[1] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 5 + cs_l_offset[5])); //i += cs_l; //(Row6): FMA operations of b6 with elements of indices from (6, 0) uptill (7, 0) mat_b_rearr[6] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[5], mat_b_rearr[6]);//d = c - (a*b) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[1], mat_b_rearr[5], mat_b_rearr[7]);//d = c - (a*b) //Broadcast A76 to register mat_a_blk_elems[0] = _mm256_broadcast_ss((float const *)(ptr_l_dup + 6 + cs_l_offset[5])); //(Row7): FMA operations of b7 with elements of index (7, 0) mat_b_rearr[7] = _mm256_fnmadd_ps(mat_a_blk_elems[0], mat_b_rearr[6], mat_b_rearr[7]);//d = c - (a*b) //////////////////////////////////////////////////////////////////////////////// /* transpose steps 
start */ ////unpacklow//// mat_b_col[0] = _mm256_unpacklo_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_col[1] = _mm256_unpacklo_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_col[2] = _mm256_unpacklo_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_col[3] = _mm256_unpacklo_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange low elements #if REARRANGE_SHFL == 1 mat_b_col[4] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x44); mat_b_col[5] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0xEE); mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x44); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0xEE); #else mat_b_col[6] = _mm256_shuffle_ps(mat_b_col[0], mat_b_col[1], 0x4E); mat_b_col[7] = _mm256_shuffle_ps(mat_b_col[2], mat_b_col[3], 0x4E); mat_b_col[4] = _mm256_blend_ps(mat_b_col[0], mat_b_col[6], 0xCC); mat_b_col[5] = _mm256_blend_ps(mat_b_col[1], mat_b_col[6], 0x33); mat_b_col[6] = _mm256_blend_ps(mat_b_col[2], mat_b_col[7], 0xCC); mat_b_col[7] = _mm256_blend_ps(mat_b_col[3], mat_b_col[7], 0x33); #endif //Merge rearranged low elements into complete rows mat_b_col[0] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x20); mat_b_col[4] = _mm256_permute2f128_ps(mat_b_col[4], mat_b_col[6], 0x31); mat_b_col[1] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x20); mat_b_col[5] = _mm256_permute2f128_ps(mat_b_col[5], mat_b_col[7], 0x31); ////unpackhigh//// mat_b_rearr[0] = _mm256_unpackhi_ps(mat_b_rearr[0], mat_b_rearr[1]); mat_b_rearr[1] = _mm256_unpackhi_ps(mat_b_rearr[2], mat_b_rearr[3]); mat_b_rearr[2] = _mm256_unpackhi_ps(mat_b_rearr[4], mat_b_rearr[5]); mat_b_rearr[3] = _mm256_unpackhi_ps(mat_b_rearr[6], mat_b_rearr[7]); //Rearrange high elements #if REARRANGE_SHFL == 1 mat_b_rearr[4] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x44); mat_b_rearr[5] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0xEE); mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x44); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], 
mat_b_rearr[3], 0xEE); #else mat_b_rearr[6] = _mm256_shuffle_ps(mat_b_rearr[0], mat_b_rearr[1], 0x4E); mat_b_rearr[7] = _mm256_shuffle_ps(mat_b_rearr[2], mat_b_rearr[3], 0x4E); mat_b_rearr[4] = _mm256_blend_ps(mat_b_rearr[0], mat_b_rearr[6], 0xCC); mat_b_rearr[5] = _mm256_blend_ps(mat_b_rearr[1], mat_b_rearr[6], 0x33); mat_b_rearr[6] = _mm256_blend_ps(mat_b_rearr[2], mat_b_rearr[7], 0xCC); mat_b_rearr[7] = _mm256_blend_ps(mat_b_rearr[3], mat_b_rearr[7], 0x33); #endif //Merge rearranged high elements into complete rows mat_b_col[2] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x20); mat_b_col[6] = _mm256_permute2f128_ps(mat_b_rearr[4], mat_b_rearr[6], 0x31); mat_b_col[3] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x20); mat_b_col[7] = _mm256_permute2f128_ps(mat_b_rearr[5], mat_b_rearr[7], 0x31); /* transpose steps end */ //Store the computed B columns _mm256_storeu_ps((float *)ptr_b_dup + i2, mat_b_col[0]); _mm256_storeu_ps((float *)(ptr_b_dup + (cs_b)+i2), mat_b_col[1]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[0] + i2), mat_b_col[2]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[1] + i2), mat_b_col[3]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[2] + i2), mat_b_col[4]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[3] + i2), mat_b_col[5]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[4] + i2), mat_b_col[6]); _mm256_storeu_ps((float *)(ptr_b_dup + cs_b_offset[5] + i2), mat_b_col[7]); //printf("writing B => m[%d], n[%d], [%f]\n", j, k, *(ptr_b_dup + k)); k++; //} i += cs_b_offset[6]; i2 += cs_b_offset[6]; } } //numRows of A ///////////////////loop ends ///////////////////// } #endif 
cython-blis-0.9.1/blis/_src/kernels/zen/3/sup/000077500000000000000000000000001427272030600210465ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/zen/3/sup/broken/000077500000000000000000000000001427272030600223265ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8.c000066400000000000000000002102211427272030600300630ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" // assumes beta.r, beta.i have been broadcast into ymm1, ymm2. // outputs to ymm0 #define CGEMM_INPUT_SCALE_CS_BETA_NZ \ vmovlpd(mem(rcx), xmm0, xmm0) \ vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) \ vmovlpd(mem(rcx, rsi, 2), xmm3, xmm3) \ vmovhpd(mem(rcx, r13, 1), xmm3, xmm3) \ vinsertf128(imm(1), xmm3, ymm0, ymm0) \ vpermilps(imm(0xb1), ymm0, ymm3) \ vmulps(ymm1, ymm0, ymm0) \ vmulps(ymm2, ymm3, ymm3) \ vaddsubps(ymm3, ymm0, ymm0) #define CGEMM_INPUT_SCALE_CS_BETA_NZ_128 \ vmovlpd(mem(rcx), xmm0, xmm0) \ vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) \ vpermilps(imm(0xb1), xmm0, xmm3) \ vmulps(xmm1, xmm0, xmm0) \ vmulps(xmm2, xmm3, xmm3) \ vaddsubps(xmm3, xmm0, xmm0) // assumes values to output are in ymm0 #define CGEMM_OUTPUT_GS \ vextractf128(imm(1), ymm0, xmm3) \ vmovlpd(xmm0, mem(rcx)) \ vmovhpd(xmm0, mem(rcx, rsi, 1)) \ vmovlpd(xmm3, mem(rcx, rsi, 2)) \ vmovhpd(xmm3, mem(rcx, r13, 1)) #define CGEMM_INPUT_SCALE_RS_BETA_NZ \ vmovups(mem(rcx), ymm0) \ vpermilps(imm(0xb1), ymm0, ymm3) \ vmulps(ymm1, ymm0, ymm0) \ vmulps(ymm2, ymm3, ymm3) \ vaddsubps(ymm3, ymm0, ymm0) #define CGEMM_OUTPUT_RS \ vmovups(ymm0, mem(rcx)) \ #define CGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT \ vmovups(mem(rcx, rsi, 8), ymm0) \ vpermilps(imm(0xb1), ymm0, ymm3) \ vmulps(ymm1, ymm0, ymm0) \ vmulps(ymm2, ymm3, ymm3) \ vaddsubps(ymm3, ymm0, ymm0) #define CGEMM_OUTPUT_RS_NEXT \ vmovups(ymm0, mem(rcx, rsi, 8)) \ void 
bli_cgemmsup_rv_zen_asm_2x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, scomplex* restrict alpha, scomplex* restrict a, inc_t rs_a0, inc_t cs_a0, scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* data, cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(dt) lea(mem(, r9, 8), r9) // cs_a *= sizeof(dt) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(dt) mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dt) mov(var(b), rbx) // load address of b. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rax) // reset rax to current upanel of a. vzeroall() // zero all xmm/ymm registers. cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored pre-fetching on c // not used lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; jmp(.SPOSTPFETCH) // jump to end of pre-fetching c label(.SCOLPFETCH) // column-stored pre-fetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(dt) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; label(.SPOSTPFETCH) // done prefetching c lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; lea(mem(rax, r8, 4), rdx) // use rdx for pre-fetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vbroadcastss(mem(rax, r8, 1), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 1, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vbroadcastss(mem(rax, r8, 1), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 1, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) add(r9, rax) // a += cs_a; // 
---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vbroadcastss(mem(rax, r8, 1), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 1, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 3 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vbroadcastss(mem(rax, r8, 1), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 1, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vbroadcastss(mem(rax, r8, 1), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 1, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. // permute even and odd elements // of ymm6/7, ymm10/11 vpermilps(imm(0xb1), ymm6, ymm6) vpermilps(imm(0xb1), ymm7, ymm7) vpermilps(imm(0xb1), ymm10, ymm10) vpermilps(imm(0xb1), ymm11, ymm11) // subtract/add even/odd elements vaddsubps(ymm6, ymm4, ymm4) vaddsubps(ymm7, ymm5, ymm5) vaddsubps(ymm10, ymm8, ymm8) vaddsubps(ymm11, ymm9, ymm9) /* (ar + ai) x AB */ mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate vpermilps(imm(0xb1), ymm4, ymm3) vmulps(ymm0, ymm4, ymm4) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm4, ymm4) vpermilps(imm(0xb1), ymm5, ymm3) vmulps(ymm0, ymm5, ymm5) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm5, ymm5) vpermilps(imm(0xb1), ymm8, ymm3) vmulps(ymm0, ymm8, ymm8) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm8, ymm8) vpermilps(imm(0xb1), ymm9, ymm3) vmulps(ymm0, ymm9, ymm9) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm9, ymm9) /* (r + i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm1) // set ZF if beta_r == 0. sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 ); vucomiss(xmm0, xmm2) // set ZF if beta_i == 0. sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 ); and(r13b, r15b) // set ZF if r13b & r15b == 1. jne(.SBETAZERO) // if ZF = 1, jump to beta == 0 case lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) CGEMM_INPUT_SCALE_RS_BETA_NZ vaddps(ymm4, ymm0, ymm0) CGEMM_OUTPUT_RS CGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT vaddps(ymm5, ymm0, ymm0) CGEMM_OUTPUT_RS_NEXT add(rdi, rcx) // rcx = c + 1*rs_c CGEMM_INPUT_SCALE_RS_BETA_NZ vaddps(ymm8, ymm0, ymm0) CGEMM_OUTPUT_RS CGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT vaddps(ymm9, ymm0, ymm0) CGEMM_OUTPUT_RS_NEXT add(rdi, rcx) // rcx = c + 2*rs_c jmp(.SDONE) // jump to end. label(.SCOLSTORED) /*|----------------| |-------| | | | | | | 2x4 | 2x4 | | 4x2 | | | | |-------| |----------------| | | | 4x2 | |-------| */ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm4, ymm0, ymm4) add(rdi, rcx) CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm8, ymm0, ymm8) add(rdi, rcx) lea(mem(r12, rsi, 4), rcx) CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm5, ymm0, ymm5) add(rdi, rcx) CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm9, ymm0, ymm9) add(rdi, rcx) mov(r12, rcx) // reset rcx to current utile of c. 
vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 //gamma00-10 gamma02-12 vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 //gamma01-11 gamma03-13 /******************Transpose top tile 4x3***************************/ vmovups(xmm0, mem(rcx)) // store (gamma00-10) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) // store (gamma01-11) lea(mem(rcx, rsi, 1), rcx) vextractf128(imm(0x1), ymm0, xmm0) vextractf128(imm(0x1), ymm2, xmm2) vmovups(xmm0, mem(rcx)) // store (gamma02-12) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) // store (gamma03-13) lea(mem(rcx, rsi, 1), rcx) /******************Transpose bottom tile 4x3***************************/ vunpcklpd(ymm9, ymm5, ymm0) //a8a9b8b9 a12a13b12b13 //gamma04-14 gamma06-16 vunpckhpd(ymm9, ymm5, ymm2) //a10a11b10b11 a14a15b14b15 //gamma05-15 gamma07-17 vmovups(xmm0, mem(rcx)) // store (gamma04-14) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) // store (gamma05-15) lea(mem(rcx, rsi, 1), rcx) vextractf128(imm(0x1), ymm0, xmm0) vextractf128(imm(0x1), ymm2, xmm2) vmovups(xmm0, mem(rcx)) // store (gamma06-16) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) // store (gamma07-17) jmp(.SDONE) // jump to end. label(.SBETAZERO) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) vmovups(ymm5, mem(rcx, rsi, 8)) add(rdi, rcx) vmovups(ymm8, mem(rcx)) vmovups(ymm9, mem(rcx, rsi, 8)) add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) /****2x8 tile going to save into 8x2 tile in C*****/ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 /******************Transpose top tile 4x2***************************/ vmovups(xmm0, mem(rcx)) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) lea(mem(rcx, rsi, 1), rcx) vextractf128(imm(0x1), ymm0, xmm0) vextractf128(imm(0x1), ymm2, xmm2) vmovups(xmm0, mem(rcx)) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) lea(mem(rcx, rsi, 1), rcx) /******************Transpose bottom tile 4x2***************************/ vunpcklpd(ymm9, ymm5, ymm0) //a8a9b8b9 a12a13b12b13 vunpckhpd(ymm9, ymm5, ymm2) //a10a11b10b11 a14a15b14b15 vmovups(xmm0, mem(rcx)) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) lea(mem(rcx, rsi, 1), rcx) vextractf128(imm(0x1), ymm0, xmm0) vextractf128(imm(0x1), ymm2, xmm2) vmovups(xmm0, mem(rcx)) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_cgemmsup_rv_zen_asm_1x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, scomplex* restrict alpha, scomplex* restrict a, inc_t rs_a0, inc_t cs_a0, scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* data, cntx_t* cntx ) { //void* a_next = 
bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(dt) lea(mem(, r9, 8), r9) // cs_a *= sizeof(dt) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(dt) mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dt) mov(var(b), rbx) // load address of b. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rax) // reset rax to current upanel of a. vzeroall() // zero all xmm/ymm registers. cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored pre-fetching on c // not used lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; jmp(.SPOSTPFETCH) // jump to end of pre-fetching c label(.SCOLPFETCH) // column-stored pre-fetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(dt) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; label(.SPOSTPFETCH) // done prefetching c lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; lea(mem(rax, r8, 4), rdx) // use rdx for pre-fetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 3 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. // permute even and odd elements // of ymm6/7 vpermilps(imm(0xb1), ymm6, ymm6) vpermilps(imm(0xb1), ymm7, ymm7) // subtract/add even/odd elements vaddsubps(ymm6, ymm4, ymm4) vaddsubps(ymm7, ymm5, ymm5) /* (ar + ai) x AB */ mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate vpermilps(imm(0xb1), ymm4, ymm3) vmulps(ymm0, ymm4, ymm4) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm4, ymm4) vpermilps(imm(0xb1), ymm5, ymm3) vmulps(ymm0, ymm5, ymm5) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm5, ymm5) /* (r + i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm1) // set ZF if beta_r == 0. sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 ); vucomiss(xmm0, xmm2) // set ZF if beta_i == 0. sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 ); and(r13b, r15b) // set ZF if r13b & r15b == 1. jne(.SBETAZERO) // if ZF = 1, jump to beta == 0 case lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) CGEMM_INPUT_SCALE_RS_BETA_NZ vaddps(ymm4, ymm0, ymm0) CGEMM_OUTPUT_RS CGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT vaddps(ymm5, ymm0, ymm0) CGEMM_OUTPUT_RS_NEXT add(rdi, rcx) // rcx = c + 1*rs_c jmp(.SDONE) // jump to end. label(.SCOLSTORED) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm4, ymm0, ymm4) lea(mem(r12, rsi, 4), rcx) CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm5, ymm0, ymm5) mov(r12, rcx) // reset rcx to current utile of c. /******************Transpose top tile 4x1***************************/ vmovlpd(xmm4, mem(rcx)) // store (gamma40) lea(mem(rcx, rsi, 1), rcx) vmovhpd(xmm4, mem(rcx)) // store (gamma41) lea(mem(rcx, rsi, 1), rcx) vextractf128(imm(0x1), ymm4, xmm4) vmovlpd(xmm4, mem(rcx)) // store (gamma42) lea(mem(rcx, rsi, 1), rcx) vmovhpd(xmm4, mem(rcx)) // store (gamma43) lea(mem(rcx, rsi, 1), rcx) /******************Transpose bottom tile 4x1***************************/ vmovlpd(xmm5, mem(rcx)) // store (gamma44) lea(mem(rcx, rsi, 1), rcx) vmovhpd(xmm5, mem(rcx)) // store (gamma45) lea(mem(rcx, rsi, 1), rcx) vextractf128(imm(0x1), ymm5, xmm5) vmovlpd(xmm5, mem(rcx)) // store (gamma46) lea(mem(rcx, rsi, 1), rcx) vmovhpd(xmm5, mem(rcx)) // store (gamma47) jmp(.SDONE) // jump to end. label(.SBETAZERO) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) vmovups(ymm5, mem(rcx, rsi, 8)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) /****1x8 tile going to save into 8x1 tile in C*****/ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) /******************Transpose top tile 4x1***************************/ vmovlpd(xmm4, mem(rcx)) // store (gamma40) lea(mem(rcx, rsi, 1), rcx) vmovhpd(xmm4, mem(rcx)) // store (gamma41) lea(mem(rcx, rsi, 1), rcx) vextractf128(imm(0x1), ymm4, xmm4) vmovlpd(xmm4, mem(rcx)) // store (gamma42) lea(mem(rcx, rsi, 1), rcx) vmovhpd(xmm4, mem(rcx)) // store (gamma43) lea(mem(rcx, rsi, 1), rcx) /******************Transpose bottom tile 4x1***************************/ vmovlpd(xmm5, mem(rcx)) // store (gamma44) lea(mem(rcx, rsi, 1), rcx) vmovhpd(xmm5, mem(rcx)) // store (gamma45) lea(mem(rcx, rsi, 1), rcx) vextractf128(imm(0x1), ymm5, xmm5) vmovlpd(xmm5, mem(rcx)) // store (gamma46) lea(mem(rcx, rsi, 1), rcx) vmovhpd(xmm5, mem(rcx)) // store (gamma47) label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_cgemmsup_rv_zen_asm_2x4: scomplex gemmsup kernel that updates one 2x4 tile of C. k0 is split into k_iter = k0 / 4 passes of the 4x-unrolled main loop plus k_left = k0 % 4 edge iterations. Each step loads one 32-byte row of B into ymm0 and broadcasts the real and imaginary floats of two elements of A (at a and a + rs_a): products with the real parts accumulate in ymm4 and ymm8, products with the imaginary parts in ymm6 and ymm10, and the two are combined after the loop with vpermilps(0xb1) followed by vaddsubps. The result is scaled by alpha. C is read and scaled by beta (through the CGEMM_INPUT_SCALE_* macros, which are defined outside this block) unless both beta_r and beta_i compare equal to zero. The tile is stored row-wise, or through the column-storage path when 8*rs_c == 8. conja, conjb, n0, data and cntx are not referenced in this body; m0 only feeds m_iter, which is loaded into r11 and not read again by any instruction visible here (label .SLOOP3X8I is never jumped to). */ void bli_cgemmsup_rv_zen_asm_2x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, scomplex* restrict alpha, scomplex* restrict a, inc_t rs_a0, inc_t cs_a0, scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* data, cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and 
inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_iter = m0 / 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(dt) lea(mem(, r9, 8), r9) // cs_a *= sizeof(dt) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(dt) mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dt) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.SLOOP3X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] vzeroall() // zero all xmm/ymm registers. mov(var(b), rbx) // load address of b. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored pre-fetching on c // not used lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; jmp(.SPOSTPFETCH) // jump to end of pre-fetching c label(.SCOLPFETCH) // column-stored pre-fetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(dt) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; label(.SPOSTPFETCH) // done prefetching c lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; lea(mem(rax, r8, 4), rdx) // use rdx for pre-fetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. 
mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vbroadcastss(mem(rax, r8, 1), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 1, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vbroadcastss(mem(rax, r8, 1), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 1, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vbroadcastss(mem(rax, r8, 1), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 1, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 3 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vbroadcastss(mem(rax, r8, 1), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 1, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. 
je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vbroadcastss(mem(rax, r8, 1), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 1, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. // permute even and odd elements // of ymm6/10 vpermilps(imm(0xb1), ymm6, ymm6) vpermilps(imm(0xb1), ymm10, ymm10) // subtract/add even/odd elements vaddsubps(ymm6, ymm4, ymm4) vaddsubps(ymm10, ymm8, ymm8) /* (ar + ai) x AB */ mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate vpermilps(imm(0xb1), ymm4, ymm3) vmulps(ymm0, ymm4, ymm4) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm4, ymm4) vpermilps(imm(0xb1), ymm8, ymm3) vmulps(ymm0, ymm8, ymm8) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm8, ymm8) /* (r + i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm1) // set ZF if beta_r == 0. sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 ); vucomiss(xmm0, xmm2) // set ZF if beta_i == 0. sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 ); and(r13b, r15b) // r15b &= r13b; ZF = 0 only if both flags are 1. 
jne(.SBETAZERO) // if ZF = 0 (beta_r == 0 and beta_i == 0), jump to beta == 0 case lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) CGEMM_INPUT_SCALE_RS_BETA_NZ vaddps(ymm4, ymm0, ymm0) CGEMM_OUTPUT_RS add(rdi, rcx) // rcx = c + 1*rs_c CGEMM_INPUT_SCALE_RS_BETA_NZ vaddps(ymm8, ymm0, ymm0) CGEMM_OUTPUT_RS jmp(.SDONE) // jump to end. label(.SCOLSTORED) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm4, ymm0, ymm4) add(rdi, rcx) CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm8, ymm0, ymm8) mov(r12, rcx) // reset rcx to current utile of c. vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 //gamma00-10 gamma02-12 vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 //gamma01-11 gamma03-13 /******************Transpose top tile 4x2***************************/ vmovups(xmm0, mem(rcx)) // store (gamma00-10) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) // store (gamma01-11) lea(mem(rcx, rsi, 1), rcx) vextractf128(imm(0x1), ymm0, xmm0) vextractf128(imm(0x1), ymm2, xmm2) vmovups(xmm0, mem(rcx)) // store (gamma02-12) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) // store (gamma03-13) lea(mem(rcx, rsi, 1), rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vmovups(ymm8, mem(rcx)) add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) /****2x4 tile going to save into 4x2 tile in C*****/ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 //gamma00-10 gamma02-12 vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 //gamma01-11 gamma03-13 /******************Transpose top tile 4x2***************************/ vmovups(xmm0, mem(rcx)) // store (gamma00-10) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) // store (gamma01-11) lea(mem(rcx, rsi, 1), rcx) vextractf128(imm(0x1), ymm0, xmm0) vextractf128(imm(0x1), ymm2, xmm2) vmovups(xmm0, mem(rcx)) // store (gamma02-12) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) // store (gamma03-13) label(.SDONE) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_cgemmsup_rv_zen_asm_1x4: scomplex gemmsup kernel that updates one 1x4 tile of C. k0 is split into k_iter = k0 / 4 passes of the 4x-unrolled main loop plus k_left = k0 % 4 edge iterations. Each step loads one 32-byte row of B into ymm0 and broadcasts the real and imaginary floats of one element of A: products with the real part accumulate in ymm4, products with the imaginary part in ymm6, and the two are combined after the loop with vpermilps(0xb1) followed by vaddsubps. ymm8 and ymm10 are zeroed on entry but are not accumulated into. The result is scaled by alpha. C is read and scaled by beta (through the CGEMM_INPUT_SCALE_* macros, which are defined outside this block) unless both beta_r and beta_i compare equal to zero. The tile is stored with one 32-byte store, or element by element through the column-storage path when 8*rs_c == 8. conja, conjb, n0, data and cntx are not referenced in this body; m0 only feeds m_iter, which is loaded into r11 and not read again by any instruction visible here (label .SLOOP3X8I is never jumped to). */ void bli_cgemmsup_rv_zen_asm_1x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, scomplex* restrict alpha, scomplex* restrict a, inc_t rs_a0, inc_t cs_a0, scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* data, cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_iter = m0 / 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(dt) lea(mem(, r9, 8), r9) // cs_a *= sizeof(dt) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(dt) mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dt) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.SLOOP3X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] vxorps(ymm4, ymm4, ymm4) vxorps(ymm6, ymm6, ymm6) vxorps(ymm8, ymm8, ymm8) vxorps(ymm10, ymm10, ymm10) mov(var(b), rbx) // load address of b. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored pre-fetching on c // not used lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; jmp(.SPOSTPFETCH) // jump to end of pre-fetching c label(.SCOLPFETCH) // column-stored pre-fetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(dt) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; label(.SPOSTPFETCH) // done prefetching c lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; lea(mem(rax, r8, 4), rdx) // use rdx for pre-fetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. 
mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 3 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. 
// permute even and odd elements // of ymm6 vpermilps(imm(0xb1), ymm6, ymm6) // subtract/add even/odd elements vaddsubps(ymm6, ymm4, ymm4) /* (ar + ai) x AB */ mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate vpermilps(imm(0xb1), ymm4, ymm3) vmulps(ymm0, ymm4, ymm4) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm4, ymm4) /* (r + i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm1) // set ZF if beta_r == 0. sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 ); vucomiss(xmm0, xmm2) // set ZF if beta_i == 0. sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 ); and(r13b, r15b) // r15b &= r13b; ZF = 0 only if both flags are 1. jne(.SBETAZERO) // if ZF = 0 (beta_r == 0 and beta_i == 0), jump to beta == 0 case lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) CGEMM_INPUT_SCALE_RS_BETA_NZ vaddps(ymm4, ymm0, ymm0) CGEMM_OUTPUT_RS jmp(.SDONE) // jump to end. label(.SCOLSTORED) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm4, ymm0, ymm4) mov(r12, rcx) // reset rcx to current utile of c. 
vmovlpd(xmm4, mem(rcx)) // store (gamma00-10) lea(mem(rcx, rsi, 1), rcx) vmovhpd(xmm4, mem(rcx)) // store (gamma01-11) lea(mem(rcx, rsi, 1), rcx) vextractf128(imm(0x1), ymm4, xmm4) vmovlpd(xmm4, mem(rcx)) // store (gamma02-12) lea(mem(rcx, rsi, 1), rcx) vmovhpd(xmm4, mem(rcx)) // store (gamma03-13) jmp(.SDONE) // jump to end. label(.SBETAZERO) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) /****1x4 tile going to save into 4x1 tile in C*****/ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) vmovlpd(xmm4, mem(rcx)) // store (gamma40) lea(mem(rcx, rsi, 1), rcx) vmovhpd(xmm4, mem(rcx)) // store (gamma41) lea(mem(rcx, rsi, 1), rcx) vextractf128(imm(0x1), ymm4, xmm4) vmovlpd(xmm4, mem(rcx)) // store (gamma42) lea(mem(rcx, rsi, 1), rcx) vmovhpd(xmm4, mem(rcx)) // store (gamma43) lea(mem(rcx, rsi, 1), rcx) label(.SDONE) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_cgemmsup_rv_zen_asm_2x2: scomplex gemmsup kernel that updates one 2x2 tile of C using 128-bit (xmm) registers. k0 is split into k_iter = k0 / 4 passes of the 4x-unrolled main loop plus k_left = k0 % 4 edge iterations. Each step loads 16 bytes of B into xmm0 and broadcasts the real and imaginary floats of two elements of A (at a and a + rs_a): products with the real parts accumulate in xmm4 and xmm8, products with the imaginary parts in xmm6 and xmm10, and the two are combined after the loop with vpermilps(0xb1) followed by vaddsubps. The result is scaled by alpha. Unless both beta_r and beta_i compare equal to zero, C is read and scaled by beta: inline with vmovlpd/vmovhpd in the row-storage path, and through CGEMM_INPUT_SCALE_CS_BETA_NZ_128 (defined outside this block) in the column-storage path, which is taken when 8*rs_c == 8. conja, conjb, n0, data and cntx are not referenced in this body; m0 only feeds m_iter, which is loaded into r11 and not read again by any instruction visible here (label .SLOOP3X8I is never jumped to). */ void bli_cgemmsup_rv_zen_asm_2x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, scomplex* restrict alpha, scomplex* restrict a, inc_t rs_a0, inc_t cs_a0, scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* data, cntx_t* cntx ) { 
//void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_iter = m0 / 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(dt) lea(mem(, r9, 8), r9) // cs_a *= sizeof(dt) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(dt) mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dt) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.SLOOP3X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] vxorps(xmm4, xmm4, xmm4) vxorps(xmm6, xmm6, xmm6) vxorps(xmm8, xmm8, xmm8) vxorps(xmm10, xmm10, xmm10) vxorps(xmm12, xmm12, xmm12) vxorps(xmm14, xmm14, xmm14) mov(var(b), rbx) // load address of b. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored pre-fetching on c // not used lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; jmp(.SPOSTPFETCH) // jump to end of pre-fetching c label(.SCOLPFETCH) // column-stored pre-fetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(dt) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; label(.SPOSTPFETCH) // done prefetching c lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; lea(mem(rax, r8, 4), rdx) // use rdx for pre-fetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vfmadd231ps(xmm0, xmm2, xmm4) vbroadcastss(mem(rax, r8, 1), xmm2) vfmadd231ps(xmm0, xmm2, xmm8) vbroadcastss(mem(rax, 4), xmm3) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 1, 4), xmm3) vfmadd231ps(xmm0, xmm3, xmm10) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vfmadd231ps(xmm0, xmm2, xmm4) vbroadcastss(mem(rax, r8, 1), xmm2) vfmadd231ps(xmm0, xmm2, xmm8) vbroadcastss(mem(rax, 4), xmm3) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 1, 4), xmm3) vfmadd231ps(xmm0, xmm3, xmm10) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vfmadd231ps(xmm0, xmm2, xmm4) vbroadcastss(mem(rax, r8, 1), xmm2) vfmadd231ps(xmm0, xmm2, xmm8) vbroadcastss(mem(rax, 4), xmm3) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 1, 4), xmm3) vfmadd231ps(xmm0, xmm3, 
xmm10) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 3 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vfmadd231ps(xmm0, xmm2, xmm4) vbroadcastss(mem(rax, r8, 1), xmm2) vfmadd231ps(xmm0, xmm2, xmm8) vbroadcastss(mem(rax, 4), xmm3) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 1, 4), xmm3) vfmadd231ps(xmm0, xmm3, xmm10) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vfmadd231ps(xmm0, xmm2, xmm4) vbroadcastss(mem(rax, r8, 1), xmm2) vfmadd231ps(xmm0, xmm2, xmm8) vbroadcastss(mem(rax, 4), xmm3) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 1, 4), xmm3) vfmadd231ps(xmm0, xmm3, xmm10) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. 
// permute even and odd elements // of xmm6/10 vpermilps(imm(0xb1), xmm6, xmm6) vpermilps(imm(0xb1), xmm10, xmm10) // subtract/add even/odd elements vaddsubps(xmm6, xmm4, xmm4) vaddsubps(xmm10, xmm8, xmm8) /* (ar + ai) x AB */ mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), xmm0) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), xmm1) // load alpha_i and duplicate vpermilps(imm(0xb1), xmm4, xmm3) vmulps(xmm0, xmm4, xmm4) vmulps(xmm1, xmm3, xmm3) vaddsubps(xmm3, xmm4, xmm4) vpermilps(imm(0xb1), xmm8, xmm3) vmulps(xmm0, xmm8, xmm8) vmulps(xmm1, xmm3, xmm3) vaddsubps(xmm3, xmm8, xmm8) /* (r + i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), xmm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), xmm2) // load beta_i and duplicate mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm1) // set ZF if beta_r == 0. sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 ); vucomiss(xmm0, xmm2) // set ZF if beta_i == 0. sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 ); and(r13b, r15b) // r15b &= r13b; ZF = 0 only if both flags are 1. jne(.SBETAZERO) // if ZF = 0 (beta_r == 0 and beta_i == 0), jump to beta == 0 case lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vmovlpd(mem(rcx), xmm0, xmm0) vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) vpermilps(imm(0xb1), xmm0, xmm3) vmulps(xmm1, xmm0, xmm0) vmulps(xmm2, xmm3, xmm3) vaddsubps(xmm3, xmm0, xmm0) vaddps(xmm4, xmm0, xmm0) vmovlpd(xmm0, mem(rcx)) vmovhpd(xmm0, mem(rcx, rsi, 1)) add(rdi, rcx) // rcx = c + 1*rs_c vmovlpd(mem(rcx), xmm0, xmm0) vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) vpermilps(imm(0xb1), xmm0, xmm3) vmulps(xmm1, xmm0, xmm0) vmulps(xmm2, xmm3, xmm3) vaddsubps(xmm3, xmm0, xmm0) vaddps(xmm8, xmm0, xmm0) vmovlpd(xmm0, mem(rcx)) vmovhpd(xmm0, mem(rcx, rsi, 1)) jmp(.SDONE) // jump to end. label(.SCOLSTORED) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c CGEMM_INPUT_SCALE_CS_BETA_NZ_128 vaddps(xmm4, xmm0, xmm4) add(rdi, rcx) CGEMM_INPUT_SCALE_CS_BETA_NZ_128 vaddps(xmm8, xmm0, xmm8) mov(r12, rcx) // reset rcx to current utile of c. vunpcklpd(xmm8, xmm4, xmm0) //a0a1b0b1 //gamma00-10 vunpckhpd(xmm8, xmm4, xmm2) //a2a3b2b3 //gamma01-11 /******************Transpose top tile 2x2***************************/ vmovups(xmm0, mem(rcx)) // store (gamma00-10) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) // store (gamma01-11) jmp(.SDONE) // jump to end. label(.SBETAZERO) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm8, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) /****2x2 tile going to save into 2x2 tile in C*****/ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) vunpcklpd(xmm8, xmm4, xmm0) //a0a1b0b1 //gamma00-10 vunpckhpd(xmm8, xmm4, xmm2) //a2a3b2b3 //gamma01-11 /******************Transpose top tile 2x2***************************/ vmovups(xmm0, mem(rcx)) // store (gamma00-10) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) // store (gamma01-11) label(.SDONE) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_cgemmsup_rv_zen_asm_1x2: scomplex gemmsup kernel that updates one 1x2 tile of C using 128-bit (xmm) registers. k0 is split into k_iter = k0 / 4 passes of the 4x-unrolled main loop plus k_left = k0 % 4 edge iterations. Each step loads 16 bytes of B into xmm0 and broadcasts the real and imaginary floats of one element of A: products with the real part accumulate in xmm4, products with the imaginary part in xmm6, and the two are combined after the loop with vpermilps(0xb1) followed by vaddsubps. xmm8, xmm10, xmm12 and xmm14 are zeroed on entry but are not accumulated into. The result is scaled by alpha. Unless both beta_r and beta_i compare equal to zero, C is read and scaled by beta: inline with vmovlpd/vmovhpd in the row-storage path, and through CGEMM_INPUT_SCALE_CS_BETA_NZ_128 (defined outside this block) in the column-storage path, which is taken when 8*rs_c == 8. conja, conjb, n0, data and cntx are not referenced in this body; m0 only feeds m_iter, which is loaded into r11 and not read again by any instruction visible here (label .SLOOP3X8I is never jumped to). */ void bli_cgemmsup_rv_zen_asm_1x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, scomplex* restrict alpha, scomplex* restrict a, inc_t rs_a0, inc_t cs_a0, scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* data, cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_iter = m0 / 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() mov(var(a), r14) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(dt) lea(mem(, r9, 8), r9) // cs_a *= sizeof(dt) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(dt) mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dt) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.SLOOP3X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] vxorps(xmm4, xmm4, xmm4) vxorps(xmm6, xmm6, xmm6) vxorps(xmm8, xmm8, xmm8) vxorps(xmm10, xmm10, xmm10) vxorps(xmm12, xmm12, xmm12) vxorps(xmm14, xmm14, xmm14) mov(var(b), rbx) // load address of b. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored pre-fetching on c // not used lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; jmp(.SPOSTPFETCH) // jump to end of pre-fetching c label(.SCOLPFETCH) // column-stored pre-fetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(dt) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; label(.SPOSTPFETCH) // done prefetching c lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; lea(mem(rax, r8, 4), rdx) // use rdx for pre-fetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vfmadd231ps(xmm0, xmm2, xmm4) vbroadcastss(mem(rax, 4 ), xmm3) vfmadd231ps(xmm0, xmm3, xmm6) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 1 vmovups(mem(rbx), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vfmadd231ps(xmm0, xmm2, xmm4) vbroadcastss(mem(rax, 4 ), xmm3) vfmadd231ps(xmm0, xmm3, xmm6) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 vmovups(mem(rbx), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vfmadd231ps(xmm0, xmm2, xmm4) vbroadcastss(mem(rax, 4 ), xmm3) vfmadd231ps(xmm0, xmm3, xmm6) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 3 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; vmovups(mem(rbx), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vfmadd231ps(xmm0, xmm2, xmm4) vbroadcastss(mem(rax, 4 ), xmm3) vfmadd231ps(xmm0, xmm3, xmm6) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vfmadd231ps(xmm0, xmm2, xmm4) vbroadcastss(mem(rax, 4 ), xmm3) vfmadd231ps(xmm0, xmm3, xmm6) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. 
// permute even and odd elements // of xmm6 vpermilps(imm(0xb1), xmm6, xmm6) // subtract/add even/odd elements vaddsubps(xmm6, xmm4, xmm4) /* (ar + ai) x AB */ mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), xmm0) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), xmm1) // load alpha_i and duplicate vpermilps(imm(0xb1), xmm4, xmm3) vmulps(xmm0, xmm4, xmm4) vmulps(xmm1, xmm3, xmm3) vaddsubps(xmm3, xmm4, xmm4) /* (r + i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), xmm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), xmm2) // load beta_i and duplicate mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm1) // set ZF if beta_r == 0. sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 ); vucomiss(xmm0, xmm2) // set ZF if beta_i == 0. sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 ); and(r13b, r15b) // r15b &= r13b; ZF = 0 only if both flags are 1. jne(.SBETAZERO) // if ZF = 0 (beta_r == 0 and beta_i == 0), jump to beta == 0 case lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vmovlpd(mem(rcx), xmm0, xmm0) vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) vpermilps(imm(0xb1), xmm0, xmm3) vmulps(xmm1, xmm0, xmm0) vmulps(xmm2, xmm3, xmm3) vaddsubps(xmm3, xmm0, xmm0) vaddps(xmm4, xmm0, xmm0) vmovlpd(xmm0, mem(rcx)) vmovhpd(xmm0, mem(rcx, rsi, 1)) jmp(.SDONE) // jump to end. label(.SCOLSTORED) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c CGEMM_INPUT_SCALE_CS_BETA_NZ_128 vaddps(xmm4, xmm0, xmm4) mov(r12, rcx) // reset rcx to current utile of c. 
vmovlpd(xmm4, mem(rcx)) // store (gamma40-50) lea(mem(rcx, rsi, 1), rcx) vmovhpd(xmm4, mem(rcx)) // store (gamma41-51) jmp(.SDONE) // jump to end. label(.SBETAZERO) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) /****1x2 tile going to save into 2x1 tile in C*****/ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) vmovlpd(xmm4, mem(rcx)) // store (gamma40) lea(mem(rcx, rsi, 1), rcx) vmovhpd(xmm4, mem(rcx)) // store (gamma41) label(.SDONE) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c000066400000000000000000001446011427272030600302500ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" // assumes beta.r, beta.i have been broadcast into ymm1, ymm2. 
// outputs to ymm0 #define CGEMM_INPUT_SCALE_CS_BETA_NZ \ vmovlpd(mem(rcx), xmm0, xmm0) \ vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) \ vmovlpd(mem(rcx, rsi, 2), xmm3, xmm3) \ vmovhpd(mem(rcx, r13, 1), xmm3, xmm3) \ vinsertf128(imm(1), xmm3, ymm0, ymm0) \ vpermilps(imm(0xb1), ymm0, ymm3) \ vmulps(ymm1, ymm0, ymm0) \ vmulps(ymm2, ymm3, ymm3) \ vaddsubps(ymm3, ymm0, ymm0) #define CGEMM_INPUT_SCALE_CS_BETA_NZ_128 \ vmovlpd(mem(rcx), xmm0, xmm0) \ vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) \ vpermilps(imm(0xb1), xmm0, xmm3) \ vmulps(xmm1, xmm0, xmm0) \ vmulps(xmm2, xmm3, xmm3) \ vaddsubps(xmm3, xmm0, xmm0) #define CGEMM_INPUT_SCALE_RS_BETA_NZ \ vmovups(mem(rcx), ymm0) \ vpermilps(imm(0xb1), ymm0, ymm3) \ vmulps(ymm1, ymm0, ymm0) \ vmulps(ymm2, ymm3, ymm3) \ vaddsubps(ymm3, ymm0, ymm0) #define CGEMM_OUTPUT_RS \ vmovups(ymm0, mem(rcx)) \ #define CGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT \ vmovups(mem(rcx, rsi, 8), ymm0) \ vpermilps(imm(0xb1), ymm0, ymm3) \ vmulps(ymm1, ymm0, ymm0) \ vmulps(ymm2, ymm3, ymm3) \ vaddsubps(ymm3, ymm0, ymm0) #define CGEMM_OUTPUT_RS_NEXT \ vmovups(ymm0, mem(rcx, rsi, 8)) \ /* rrr: -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : rcr: -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | += ------ ... 
--------
	 | | | | | | | |       ------        --------
	 | | | | | | | |       ------            :
*/

/*
   bli_cgemmsup_rv_zen_asm_3x8m

   Single-precision complex kernel that updates C := beta*C + alpha*A*B,
   walking down the m dimension in tiles of 3 rows by 8 columns.

   Flow:
   - If n0 is not a multiple of 8, the work is split column-wise between
     bli_cgemmsup_rv_zen_asm_3x4m, bli_cgemmsup_rv_zen_asm_3x2m and
     bli_cgemv_ex, and the function returns.
   - Otherwise the inline assembly runs m0/3 times. Each pass accumulates
     over k (k0/4 passes of a 4x unrolled loop, then k0%4 single steps),
     scales the result by alpha and writes the tile to C. C is only read
     when beta is non-zero.
   - The leftover m0%3 rows are handed to bli_cgemmsup_rv_zen_asm_1x8 or
     bli_cgemmsup_rv_zen_asm_2x8.

   Storage of C is detected inside the assembly: rs_c == 1 selects the
   column-stored path (in-register transpose before the store); any other
   rs_c selects the row-stored path.

   NOTE(review): conja and conjb are only forwarded to the other kernels;
   the inline assembly never reads them, so conjugation appears to be
   unsupported on the main path -- confirm.
   NOTE(review): cs_b is listed as an asm input but is not referenced by any
   instruction; each row of B is read as 64 contiguous bytes.
*/
void bli_cgemmsup_rv_zen_asm_3x8m
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       scomplex*    restrict alpha,
       scomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
       scomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
       scomplex*    restrict beta,
       scomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	uint64_t n_left = n0 % 8;

	// First check whether this is an edge case in the n dimension. If so,
	// dispatch other 3x?m kernels, as needed.
	if (n_left )
	{
		// cij and bj advance by the number of columns each sub-kernel
		// consumed; ai is the same for every call.
		scomplex* cij = c;
		scomplex* bj  = b;
		scomplex* ai  = a;

		if ( 4 <= n_left )
		{
			const dim_t nr_cur = 4;

			bli_cgemmsup_rv_zen_asm_3x4m
			(
			  conja, conjb, m0, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 2 <= n_left )
		{
			const dim_t nr_cur = 2;

			bli_cgemmsup_rv_zen_asm_3x2m
			(
			  conja, conjb, m0, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 1 == n_left )
		{
			// A single remaining column is computed as a matrix-vector
			// product.
			bli_cgemv_ex
			(
			  BLIS_NO_TRANSPOSE, conjb, m0, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
			  beta, cij, rs_c0, cntx, NULL
			);
		}
		return;
	}

	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t m_iter = m0 / 3;
	uint64_t m_left = m0 % 3;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// Fewer than 3 rows: skip the assembly and go straight to the m edge
	// handling.
	if ( m_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	mov(var(a), r14)  // load address of a.
	mov(var(rs_a), r8)  // load rs_a
	mov(var(cs_a), r9)  // load cs_a
	lea(mem(, r8, 8), r8)  // rs_a *= sizeof(dt)
	lea(mem(, r9, 8), r9)  // cs_a *= sizeof(dt)

	mov(var(rs_b), r10)  // load rs_b
	lea(mem(, r10, 8), r10)  // rs_b *= sizeof(dt)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), r12)  // load address of c
	mov(var(rs_c), rdi)  // load rs_c
	lea(mem(, rdi, 8), rdi)  // rs_c *= sizeof(dt)

	// During preamble and loops:
	// r12 = rcx = c
	// r14 = rax = a
	// read rbx from var(b) near beginning of loop
	// r11 = m dim index ii

	mov(var(m_iter), r11)  // ii = m_iter;

	label(.SLOOP3X8I)  // LOOP OVER ii = [ m_iter ... 1 0 ]

	// Accumulator layout for the k loops below:
	//   ymm4/5, ymm8/9, ymm12/13   : b * real(a) for rows 0, 1, 2
	//   ymm6/7, ymm10/11, ymm14/15 : b * imag(a) for rows 0, 1, 2
	vzeroall()  // zero all xmm/ymm registers.

	mov(var(b), rbx)  // load address of b.
	//mov(r12, rcx)  // reset rcx to current utile of c.
	mov(r14, rax)  // reset rax to current upanel of a.

	// The two branches below only compute an address into rdx; no prefetch
	// instruction is issued, and rdx is overwritten at .SPOSTPFETCH.
	cmp(imm(8), rdi)  // set ZF if (8*rs_c) == 8.
	jz(.SCOLPFETCH)  // jump to column storage case
	label(.SROWPFETCH)  // row-stored pre-fetching on c
	                    // not used

	lea(mem(r12, rdi, 2), rdx)  //
	lea(mem(rdx, rdi, 1), rdx)  // rdx = c + 3*rs_c;

	jmp(.SPOSTPFETCH)  // jump to end of pre-fetching c
	label(.SCOLPFETCH)  // column-stored pre-fetching c

	mov(var(cs_c), rsi)  // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 4), rsi)  // rsi = 4*cs_c (scale is 4, not the 8-byte element size)
	lea(mem(r12, rsi, 2), rdx)  //
	lea(mem(rdx, rsi, 1), rdx)  // rdx = c + 3*rsi;

	label(.SPOSTPFETCH)  // done prefetching c

	lea(mem(r9, r9, 2), rcx)  // rcx = 3*cs_a;
	lea(mem(rax, r8, 4), rdx)  // use rdx for pre-fetching lines
	lea(mem(rdx, r8, 2), rdx)  // from next upanel of a.

	mov(var(k_iter), rsi)  // i = k_iter;
	test(rsi, rsi)  // check i via logical AND.
	je(.SCONSIDKLEFT)  // if i == 0, jump to code that
	                   // contains the k_left loop.

	label(.SLOOPKITER)  // MAIN LOOP

	// Each iteration loads one row of b (two ymm registers, 64 bytes),
	// broadcasts the real part (offset 0) and imaginary part (offset 4) of
	// the a element of each of the three rows, and accumulates the
	// products.

	// ---------------------------------- iteration 0

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)

	vbroadcastss(mem(rax, r8, 1), ymm2)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)

	vbroadcastss(mem(rax, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8, 1, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8, 2, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	add(r9, rax)  // a += cs_a;

	// ---------------------------------- iteration 1

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)

	vbroadcastss(mem(rax, r8, 1), ymm2)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)

	vbroadcastss(mem(rax, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8, 1, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8, 2, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	add(r9, rax)  // a += cs_a;

	// ---------------------------------- iteration 2

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)

	vbroadcastss(mem(rax, r8, 1), ymm2)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)

	vbroadcastss(mem(rax, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8, 1, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8, 2, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	add(r9, rax)  // a += cs_a;

	// ---------------------------------- iteration 3

	lea(mem(rdx, r9, 4), rdx)  // a_prefetch += 4*cs_a;

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)

	vbroadcastss(mem(rax, r8, 1), ymm2)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)

	vbroadcastss(mem(rax, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8, 1, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8, 2, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	add(r9, rax)  // a += cs_a;

	dec(rsi)  // i -= 1;
	jne(.SLOOPKITER)  // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi)  // i = k_left;
	test(rsi, rsi)  // check i via logical AND.
	je(.SPOSTACCUM)  // if i == 0, we're done; jump to end.
	                 // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT)  // EDGE LOOP

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)  // b += rs_b;

	vbroadcastss(mem(rax ), ymm2)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)

	vbroadcastss(mem(rax, r8, 1), ymm2)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)

	vbroadcastss(mem(rax, r8, 2), ymm2)
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)

	vbroadcastss(mem(rax, 4 ), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8, 1, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8, 2, 4), ymm3)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	add(r9, rax)  // a += cs_a;

	dec(rsi)  // i -= 1;
	jne(.SLOOPKLEFT)  // iterate again if i != 0.

	label(.SPOSTACCUM)

	mov(r12, rcx)  // reset rcx to current utile of c.

	// permute even and odd elements
	// of ymm6/7, ymm10/11, ymm14/15
	vpermilps(imm(0xb1), ymm6, ymm6)
	vpermilps(imm(0xb1), ymm7, ymm7)
	vpermilps(imm(0xb1), ymm10, ymm10)
	vpermilps(imm(0xb1), ymm11, ymm11)
	vpermilps(imm(0xb1), ymm14, ymm14)
	vpermilps(imm(0xb1), ymm15, ymm15)

	// subtract/add even/odd elements
	// (combines the real-part and imaginary-part accumulators into the
	// complex products, left in ymm4/5, ymm8/9, ymm12/13)
	vaddsubps(ymm6, ymm4, ymm4)
	vaddsubps(ymm7, ymm5, ymm5)

	vaddsubps(ymm10, ymm8, ymm8)
	vaddsubps(ymm11, ymm9, ymm9)

	vaddsubps(ymm14, ymm12, ymm12)
	vaddsubps(ymm15, ymm13, ymm13)

	/* (ar + ai) x AB */
	mov(var(alpha), rax)  // load address of alpha
	vbroadcastss(mem(rax), ymm0)  // load alpha_r and duplicate
	vbroadcastss(mem(rax, 4), ymm1)  // load alpha_i and duplicate

	vpermilps(imm(0xb1), ymm4, ymm3)
	vmulps(ymm0, ymm4, ymm4)
	vmulps(ymm1, ymm3, ymm3)
	vaddsubps(ymm3, ymm4, ymm4)

	vpermilps(imm(0xb1), ymm5, ymm3)
	vmulps(ymm0, ymm5, ymm5)
	vmulps(ymm1, ymm3, ymm3)
	vaddsubps(ymm3, ymm5, ymm5)

	vpermilps(imm(0xb1), ymm8, ymm3)
	vmulps(ymm0, ymm8, ymm8)
	vmulps(ymm1, ymm3, ymm3)
	vaddsubps(ymm3, ymm8, ymm8)

	vpermilps(imm(0xb1), ymm9, ymm3)
	vmulps(ymm0, ymm9, ymm9)
	vmulps(ymm1, ymm3, ymm3)
	vaddsubps(ymm3, ymm9, ymm9)

	vpermilps(imm(0xb1), ymm12, ymm3)
	vmulps(ymm0, ymm12, ymm12)
	vmulps(ymm1, ymm3, ymm3)
	vaddsubps(ymm3, ymm12, ymm12)

	vpermilps(imm(0xb1), ymm13, ymm3)
	vmulps(ymm0, ymm13, ymm13)
	vmulps(ymm1, ymm3, ymm3)
	vaddsubps(ymm3, ymm13, ymm13)

	/* (r + i)x C + ((ar + ai) x AB) */
	mov(var(beta), rbx)  // load address of beta
	vbroadcastss(mem(rbx), ymm1)  // load beta_r and duplicate
	vbroadcastss(mem(rbx, 4), ymm2)  // load beta_i and duplicate

	// rsi = 4*cs_c here. The row-stored paths address the second four
	// columns as rcx + 8*rsi, i.e. 32*cs_c bytes past rcx.
	// NOTE(review): that is four 8-byte elements only when cs_c == 1 --
	// presumably guaranteed whenever C is row-stored; confirm.
	mov(var(cs_c), rsi)  // load cs_c
	lea(mem(, rsi, 4), rsi)  // rsi = 4*cs_c
	lea(mem(rcx, rdi, 4), rdx)  // load address of c + 4*rs_c;

	lea(mem(rsi, rsi, 2), rax)  // rax = 3*rsi;

	// now avoid loading C if beta == 0
	vxorps(ymm0, ymm0, ymm0)  // set ymm0 to zero.
	vucomiss(xmm0, xmm1)  // set ZF if beta_r == 0.
	sete(r13b)  // r13b = ( ZF == 1 ? 1 : 0 );
	vucomiss(xmm0, xmm2)  // set ZF if beta_i == 0.
	sete(r15b)  // r15b = ( ZF == 1 ? 1 : 0 );
	and(r13b, r15b)  // r15b &= r13b; ZF is cleared only if both were 1.
	jne(.SBETAZERO)  // if ZF == 0 (beta_r and beta_i both zero), jump to beta == 0 case

	lea(mem(r8, r8, 2), r13)  // r13 = 3*rs_a

	cmp(imm(8), rdi)  // set ZF if (8*rs_c) == 8.
	jz(.SCOLSTORED)  // jump to column storage case

	label(.SROWSTORED)

	// For each of the three rows: load and beta-scale four columns of C,
	// add the alpha-scaled product, store, then repeat for the next four
	// columns.
	CGEMM_INPUT_SCALE_RS_BETA_NZ
	vaddps(ymm4, ymm0, ymm0)
	CGEMM_OUTPUT_RS

	CGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
	vaddps(ymm5, ymm0, ymm0)
	CGEMM_OUTPUT_RS_NEXT
	add(rdi, rcx)  // rcx = c + 1*rs_c

	CGEMM_INPUT_SCALE_RS_BETA_NZ
	vaddps(ymm8, ymm0, ymm0)
	CGEMM_OUTPUT_RS

	CGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
	vaddps(ymm9, ymm0, ymm0)
	CGEMM_OUTPUT_RS_NEXT
	add(rdi, rcx)  // rcx = c + 2*rs_c

	CGEMM_INPUT_SCALE_RS_BETA_NZ
	vaddps(ymm12, ymm0, ymm0)
	CGEMM_OUTPUT_RS

	CGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
	vaddps(ymm13, ymm0, ymm0)
	CGEMM_OUTPUT_RS_NEXT

	jmp(.SDONE)  // jump to end.

	label(.SCOLSTORED)
	/*|----------------|   |-------|
	  |       |        |   |       |
	  |  3x4  |  3x4   |   |  4x3  |
	  |       |        |   |-------|
	  |----------------|   |       |
	                       |  4x3  |
	                       |-------|
	*/
	mov(var(cs_c), rsi)  // load cs_c
	lea(mem(, rsi, 8), rsi)  // rsi = cs_c * sizeof(dt)
	lea(mem(rsi, rsi, 2), r13)  // r13 = 3*cs_c (in bytes)

	// Gather one row of four columns of C at a time, beta-scale it and add
	// it into the matching accumulator; rcx steps down the rows by rs_c.
	CGEMM_INPUT_SCALE_CS_BETA_NZ
	vaddps(ymm4, ymm0, ymm4)

	add(rdi, rcx)

	CGEMM_INPUT_SCALE_CS_BETA_NZ
	vaddps(ymm8, ymm0, ymm8)
	add(rdi, rcx)

	CGEMM_INPUT_SCALE_CS_BETA_NZ
	vaddps(ymm12, ymm0, ymm12)

	lea(mem(r12, rsi, 4), rcx)  // rcx = c + 4*cs_c (columns 4..7)

	CGEMM_INPUT_SCALE_CS_BETA_NZ
	vaddps(ymm5, ymm0, ymm5)
	add(rdi, rcx)

	CGEMM_INPUT_SCALE_CS_BETA_NZ
	vaddps(ymm9, ymm0, ymm9)
	add(rdi, rcx)

	CGEMM_INPUT_SCALE_CS_BETA_NZ
	vaddps(ymm13, ymm0, ymm13)

	mov(r12, rcx)  // reset rcx to current utile of c.

	vunpcklpd(ymm8, ymm4, ymm0)  //a0a1b0b1 a4a5b4b5  //gamma00-10 gamma02-12
	vunpckhpd(ymm8, ymm4, ymm2)  //a2a3b2b3 a6a7b6b7  //gamma01-11 gamma03-13

	/******************Transpose top tile 4x3***************************/
	vmovups(xmm0, mem(rcx))  // store (gamma00-10)
	vmovlpd(xmm12, mem(rcx, 16))  // store (gamma20)
	lea(mem(rcx, rsi, 1), rcx)

	vmovups(xmm2, mem(rcx))  // store (gamma01-11)
	vmovhpd(xmm12, mem(rcx, 16))  // store (gamma21)
	lea(mem(rcx, rsi, 1), rcx)

	vextractf128(imm(0x1), ymm0, xmm0)
	vextractf128(imm(0x1), ymm2, xmm2)
	vextractf128(imm(0x1), ymm12, xmm12)

	vmovups(xmm0, mem(rcx))  // store (gamma02-12)
	vmovlpd(xmm12, mem(rcx, 16))  // store (gamma22)
	lea(mem(rcx, rsi, 1), rcx)

	vmovups(xmm2, mem(rcx))  // store (gamma03-13)
	vmovhpd(xmm12, mem(rcx, 16))  // store (gamma23)
	lea(mem(rcx, rsi, 1), rcx)

	/******************Transpose bottom tile 4x3***************************/
	vunpcklpd(ymm9, ymm5, ymm0)  //a8a9b8b9 a12a13b12b13  //gamma04-14 gamma06-16
	vunpckhpd(ymm9, ymm5, ymm2)  //a10a11b10b11 a14a15b14b15  //gamma05-15 gamma07-17

	vmovups(xmm0, mem(rcx))  // store (gamma04-14)
	vmovlpd(xmm13, mem(rcx, 16))  // store (gamma24)
	lea(mem(rcx, rsi, 1), rcx)

	vmovups(xmm2, mem(rcx))  // store (gamma05-15)
	vmovhpd(xmm13, mem(rcx, 16))  // store (gamma25)
	lea(mem(rcx, rsi, 1), rcx)

	vextractf128(imm(0x1), ymm0, xmm0)
	vextractf128(imm(0x1), ymm2, xmm2)
	vextractf128(imm(0x1), ymm13, xmm13)

	vmovups(xmm0, mem(rcx))  // store (gamma06-16)
	vmovlpd(xmm13, mem(rcx, 16))  // store (gamma26)
	lea(mem(rcx, rsi, 1), rcx)

	vmovups(xmm2, mem(rcx))  // store (gamma07-17)
	vmovhpd(xmm13, mem(rcx, 16))  // store (gamma27)

	jmp(.SDONE)  // jump to end.

	label(.SBETAZERO)

	lea(mem(r8, r8, 2), r13)  // r13 = 3*rs_a
	cmp(imm(8), rdi)  // set ZF if (8*rs_c) == 8.
	jz(.SCOLSTORBZ)  // jump to column storage case

	label(.SROWSTORBZ)

	// beta == 0: store the alpha-scaled products without reading C.
	vmovups(ymm4, mem(rcx))
	vmovups(ymm5, mem(rcx, rsi, 8))
	add(rdi, rcx)

	vmovups(ymm8, mem(rcx))
	vmovups(ymm9, mem(rcx, rsi, 8))
	add(rdi, rcx)

	vmovups(ymm12, mem(rcx))
	vmovups(ymm13, mem(rcx, rsi, 8))

	jmp(.SDONE)  // jump to end.

	label(.SCOLSTORBZ)

	/****3x8 tile going to save into 8x3 tile in C*****/
	mov(var(cs_c), rsi)  // load cs_c
	lea(mem(, rsi, 8), rsi)  // rsi = cs_c * sizeof(dt)

	vunpcklpd(ymm8, ymm4, ymm0)  //a0a1b0b1 a4a5b4b5
	vunpckhpd(ymm8, ymm4, ymm2)  //a2a3b2b3 a6a7b6b7

	/******************Transpose top tile 4x3***************************/
	vmovups(xmm0, mem(rcx))
	vmovlpd(xmm12, mem(rcx,16))
	lea(mem(rcx, rsi, 1), rcx)

	vmovups(xmm2, mem(rcx))
	vmovhpd(xmm12,mem(rcx,16))
	lea(mem(rcx, rsi, 1), rcx)

	vextractf128(imm(0x1), ymm0, xmm0)
	vextractf128(imm(0x1), ymm2, xmm2)
	vextractf128(imm(0x1), ymm12, xmm12)

	vmovups(xmm0, mem(rcx))
	vmovlpd(xmm12, mem(rcx, 16))
	lea(mem(rcx, rsi, 1), rcx)

	vmovups(xmm2, mem(rcx))
	vmovhpd(xmm12, mem(rcx, 16))
	lea(mem(rcx, rsi, 1), rcx)

	/******************Transpose bottom tile 4x3***************************/
	vunpcklpd(ymm9, ymm5, ymm0)  //a8a9b8b9 a12a13b12b13
	vunpckhpd(ymm9, ymm5, ymm2)  //a10a11b10b11 a14a15b14b15

	vmovups(xmm0, mem(rcx))
	vmovlpd(xmm13, mem(rcx, 16))
	lea(mem(rcx, rsi, 1), rcx)

	vmovups(xmm2, mem(rcx))
	vmovhpd(xmm13, mem(rcx, 16))
	lea(mem(rcx, rsi, 1), rcx)

	vextractf128(imm(0x1), ymm0, xmm0)
	vextractf128(imm(0x1), ymm2, xmm2)
	vextractf128(imm(0x1), ymm13, xmm13)

	vmovups(xmm0, mem(rcx))
	vmovlpd(xmm13, mem(rcx, 16))
	lea(mem(rcx, rsi, 1), rcx)

	vmovups(xmm2, mem(rcx))
	vmovhpd(xmm13, mem(rcx, 16))

	label(.SDONE)

	lea(mem(r12, rdi, 2), r12)
	lea(mem(r12, rdi, 1), r12)  // c_ii = r12 += 3*rs_c

	lea(mem(r14, r8, 2), r14)
	lea(mem(r14, r8, 1), r14)  //a_ii = r14 += 3*rs_a

	dec(r11)  // ii -= 1;
	jne(.SLOOP3X8I)  // iterate again if ii != 0.

	label(.SRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [m_iter] "m" (m_iter),
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a]      "m" (a),
	  [rs_a]   "m" (rs_a),
	  [cs_a]   "m" (cs_a),
	  [b]      "m" (b),
	  [rs_b]   "m" (rs_b),
	  [cs_b]   "m" (cs_b),
	  [alpha]  "m" (alpha),
	  [beta]   "m" (beta),
	  [c]      "m" (c),
	  [rs_c]   "m" (rs_c),
	  [cs_c]   "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	consider_edge_cases:

	// Handle edge cases in the m dimension, if they exist.
	if ( m_left )
	{
		const dim_t      nr_cur = 8;
		// First row not covered by the 3-row tiles above.
		const dim_t      i_edge = m0 - ( dim_t )m_left;

		scomplex* cij = c + i_edge*rs_c;
		scomplex* ai  = a + i_edge*rs_a;
		scomplex* bj  = b;

		// Indexed by m_left, which is 1 or 2 here; slot 0 is never used.
		cgemmsup_ker_ft ker_fps[3] =
		{
		  NULL,
		  bli_cgemmsup_rv_zen_asm_1x8,
		  bli_cgemmsup_rv_zen_asm_2x8,
		};

		cgemmsup_ker_ft ker_fp = ker_fps[ m_left ];

		ker_fp
		(
		  conja, conjb, m_left, nr_cur, k0,
		  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
		  beta, cij, rs_c0, cs_c0, data, cntx
		);
		return;
	}
}

void bli_cgemmsup_rv_zen_asm_3x4m
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       scomplex*    restrict alpha,
       scomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
       scomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
       scomplex*    restrict beta,
       scomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t m_iter = m0 / 3;
	uint64_t m_left = m0 % 3;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	if ( m_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	mov(var(a), r14)  // load address of a.
	mov(var(rs_a), r8)  // load rs_a
	mov(var(cs_a), r9)  // load cs_a
	lea(mem(, r8, 8), r8)  // rs_a *= sizeof(dt)
	lea(mem(, r9, 8), r9)  // cs_a *= sizeof(dt)

	mov(var(rs_b), r10)  // load rs_b
	lea(mem(, r10, 8), r10)  // rs_b *= sizeof(dt)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).
mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dt) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.SLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] vzeroall() // zero all xmm/ymm registers. mov(var(b), rbx) // load address of b. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored pre-fetching on c // not used lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; jmp(.SPOSTPFETCH) // jump to end of pre-fetching c label(.SCOLPFETCH) // column-stored pre-fetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(dt) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; label(.SPOSTPFETCH) // done prefetching c lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; lea(mem(rax, r8, 4), rdx) // use rdx for pre-fetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vbroadcastss(mem(rax, r8, 1), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) vbroadcastss(mem(rax, r8, 2), ymm2) vfmadd231ps(ymm0, ymm2, ymm12) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 1, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 2, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm14) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vbroadcastss(mem(rax, r8, 1), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) vbroadcastss(mem(rax, r8, 2), ymm2) vfmadd231ps(ymm0, ymm2, ymm12) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 1, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 2, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm14) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vbroadcastss(mem(rax, r8, 1), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) vbroadcastss(mem(rax, r8, 2), ymm2) vfmadd231ps(ymm0, ymm2, ymm12) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 1, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 2, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm14) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 3 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vbroadcastss(mem(rax, r8, 1), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) vbroadcastss(mem(rax, r8, 2), ymm2) vfmadd231ps(ymm0, ymm2, ymm12) vbroadcastss(mem(rax, 4), 
ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 1, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 2, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm14) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vfmadd231ps(ymm0, ymm2, ymm4) vbroadcastss(mem(rax, r8, 1), ymm2) vfmadd231ps(ymm0, ymm2, ymm8) vbroadcastss(mem(rax, r8, 2), ymm2) vfmadd231ps(ymm0, ymm2, ymm12) vbroadcastss(mem(rax, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 1, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 2, 4), ymm3) vfmadd231ps(ymm0, ymm3, ymm14) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. 
// permute even and odd elements // of ymm6/7, ymm10/11, ymm/14/15 vpermilps(imm(0xb1), ymm6, ymm6) vpermilps(imm(0xb1), ymm10, ymm10) vpermilps(imm(0xb1), ymm14, ymm14) // subtract/add even/odd elements vaddsubps(ymm6, ymm4, ymm4) vaddsubps(ymm10, ymm8, ymm8) vaddsubps(ymm14, ymm12, ymm12) /* (ar + ai) x AB */ mov(var(alpha), rax) // load address of alpha vbroadcastss(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastss(mem(rax, 4), ymm1) // load alpha_i and duplicate vpermilps(imm(0xb1), ymm4, ymm3) vmulps(ymm0, ymm4, ymm4) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm4, ymm4) vpermilps(imm(0xb1), ymm8, ymm3) vmulps(ymm0, ymm8, ymm8) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm8, ymm8) vpermilps(imm(0xb1), ymm12, ymm3) vmulps(ymm0, ymm12, ymm12) vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm12, ymm12) /* (r + i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm1) // set ZF if beta_r == 0. sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 ); vucomiss(xmm0, xmm2) // set ZF if beta_i == 0. sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 ); and(r13b, r15b) // set ZF if r13b & r15b == 1. jne(.SBETAZERO) // if ZF = 1, jump to beta == 0 case lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) CGEMM_INPUT_SCALE_RS_BETA_NZ vaddps(ymm4, ymm0, ymm0) CGEMM_OUTPUT_RS add(rdi, rcx) // rcx = c + 1*rs_c CGEMM_INPUT_SCALE_RS_BETA_NZ vaddps(ymm8, ymm0, ymm0) CGEMM_OUTPUT_RS add(rdi, rcx) // rcx = c + 2*rs_c CGEMM_INPUT_SCALE_RS_BETA_NZ vaddps(ymm12, ymm0, ymm0) CGEMM_OUTPUT_RS jmp(.SDONE) // jump to end. label(.SCOLSTORED) /*|--------| |-------| | | | | | 3x4 | | 4x3 | |--------| |-------| */ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm4, ymm0, ymm4) add(rdi, rcx) CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm8, ymm0, ymm8) add(rdi, rcx) CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm12, ymm0, ymm12) mov(r12, rcx) // reset rcx to current utile of c. vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 //gamma00-10 gamma02-12 vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 //gamma01-11 gamma03-13 /******************Transpose tile 4x3***************************/ vmovups(xmm0, mem(rcx)) // store (gamma00-10) vmovlpd(xmm12, mem(rcx, 16)) // store (gamma20) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) // store (gamma01-11) vmovhpd(xmm12, mem(rcx, 16)) // store (gamma21) lea(mem(rcx, rsi, 1), rcx) vextractf128(imm(0x1), ymm0, xmm0) vextractf128(imm(0x1), ymm2, xmm2) vextractf128(imm(0x1), ymm12, xmm12) vmovups(xmm0, mem(rcx)) // store (gamma02-12) vmovlpd(xmm12, mem(rcx, 16)) // store (gamma22) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) // store (gamma03-13) vmovhpd(xmm12, mem(rcx, 16)) // store (gamma33) lea(mem(rcx, rsi, 1), rcx) jmp(.SDONE) // jump to end. label(.SBETAZERO) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vmovups(ymm8, mem(rcx)) add(rdi, rcx) vmovups(ymm12, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) /****3x4 tile going to save into 4x3 tile in C*****/ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 vmovups(xmm0, mem(rcx)) vmovlpd(xmm12, mem(rcx, 16)) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) vmovhpd(xmm12, mem(rcx, 16)) lea(mem(rcx, rsi, 1), rcx) vextractf128(imm(0x1), ymm0, xmm0) vextractf128(imm(0x1), ymm2, xmm2) vextractf128(imm(0x1), ymm12, xmm12) vmovups(xmm0, mem(rcx)) vmovlpd(xmm12, mem(rcx, 16)) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) vmovhpd(xmm12, mem(rcx, 16)) label(.SDONE) lea(mem(r12, rdi, 2), r12) lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) lea(mem(r14, r8, 1), r14) //a_ii = r14 += 3*rs_a dec(r11) // ii -= 1; jne(.SLOOP3X4I) // iterate again if ii != 0. label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 4; const dim_t i_edge = m0 - ( dim_t )m_left; scomplex* cij = c + i_edge*rs_c; scomplex* ai = a + i_edge*rs_a; scomplex* bj = b; cgemmsup_ker_ft ker_fps[3] = { NULL, bli_cgemmsup_rv_zen_asm_1x4, bli_cgemmsup_rv_zen_asm_2x4, }; cgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } }

/*
   bli_cgemmsup_rv_zen_asm_3x2m

   Single-precision complex gemmsup kernel that updates C with
   beta*C + alpha*A*B for a panel that is two columns wide, walking down
   the m dimension three rows at a time.

   - m0 / 3 full 3x2 tiles are computed by the inline assembly below;
     the remaining m0 % 3 rows are forwarded to the 1x2 or 2x2 kernel.
   - The k dimension is processed in an unrolled-by-4 main loop (k0 / 4
     trips) followed by a one-at-a-time remainder loop (k0 % 4 trips).
   - a, b, c point at the first element of each operand; rs_x0 / cs_x0 are
     their row / column strides in elements (the assembly scales them by 8,
     the size of one scomplex).
   - alpha and beta point at the complex scalars. When both parts of beta
     compare equal to zero, C is overwritten without being read.
   - conja, conjb, data and cntx are not read here; they are only passed on
     to the edge-case kernel. n0 is not read at all (the width is fixed
     at 2), and cs_b is copied but not referenced by the assembly.
   - C is treated as column-stored when rs_c0 == 1 and as row-stored
     otherwise.
*/
void bli_cgemmsup_rv_zen_asm_3x2m
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       scomplex*    restrict alpha,
       scomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
       scomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
       scomplex*    restrict beta,
       scomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t m_iter = m0 / 3;
	uint64_t m_left = m0 % 3;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// Fewer than three rows: nothing for the assembly to do, go straight
	// to the edge-case dispatch.
	if ( m_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	mov(var(a), r14)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(scomplex)
	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(scomplex)

	mov(var(rs_b), r10)                // load rs_b
	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(scomplex)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds of allocated memory
	// (the likely result: a segmentation fault).

	mov(var(c), r12)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(scomplex)

	// During preamble and loops:
	// r12 = rcx = c
	// r14 = rax = a
	// read rbx from var(b) near beginning of loop
	// r11 = m dim index ii

	mov(var(m_iter), r11)              // ii = m_iter;

	label(.SLOOP3X2I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]

	vzeroall()                         // zero all xmm/ymm registers.

	mov(var(b), rbx)                   // load address of b.
	//mov(r12, rcx)                    // reset rcx to current utile of c.
	mov(r14, rax)                      // reset rax to current upanel of a.

	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8, i.e. rs_c == 1.
	jz(.SCOLPFETCH)                    // jump to column storage case
	label(.SROWPFETCH)                 // row-stored pre-fetching on c // not used

	// Only an address is formed here; no prefetch instruction is issued
	// and rdx is overwritten below before it is read.
	lea(mem(r12, rdi, 2), rdx)         //
	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;

	jmp(.SPOSTPFETCH)                  // jump to end of pre-fetching c
	label(.SCOLPFETCH)                 // column-stored pre-fetching c

	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	// NOTE(review): cs_c is scaled by 4 here, whereas the store paths
	// below scale it by 8. Harmless as written because rdx is
	// overwritten before any use.
	lea(mem(, rsi, 4), rsi)            // rsi = 4*cs_c
	lea(mem(r12, rsi, 2), rdx)         //
	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*rsi;

	label(.SPOSTPFETCH)                // done prefetching c

	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
	lea(mem(rax, r8, 4), rdx)          // use rdx for pre-fetching lines
	lea(mem(rdx, r8, 2), rdx)          // from next upanel of a (a + 6*rs_a).

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.

	label(.SLOOPKITER)                 // MAIN LOOP

	// Each iteration below performs one rank-1 update:
	//   xmm0            = two scomplex elements of the current row of b
	//   xmm4/xmm8/xmm12 += b * (float at byte offset 0 of a, rows 0/1/2)
	//   xmm6/xmm10/xmm14 += b * (float at byte offset 4 of a, rows 0/1/2)
	// The two sets of partial sums are combined after the k loops.

	// ---------------------------------- iteration 0

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vfmadd231ps(xmm0, xmm2, xmm4)

	vbroadcastss(mem(rax, r8, 1), xmm2)
	vfmadd231ps(xmm0, xmm2, xmm8)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vfmadd231ps(xmm0, xmm2, xmm12)

	vbroadcastss(mem(rax, 4), xmm3)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 1, 4), xmm3)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 2, 4), xmm3)
	vfmadd231ps(xmm0, xmm3, xmm14)

	add(r9, rax)                       // a += cs_a;

	// ---------------------------------- iteration 1

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vfmadd231ps(xmm0, xmm2, xmm4)

	vbroadcastss(mem(rax, r8, 1), xmm2)
	vfmadd231ps(xmm0, xmm2, xmm8)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vfmadd231ps(xmm0, xmm2, xmm12)

	vbroadcastss(mem(rax, 4), xmm3)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 1, 4), xmm3)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 2, 4), xmm3)
	vfmadd231ps(xmm0, xmm3, xmm14)

	add(r9, rax)                       // a += cs_a;

	// ---------------------------------- iteration 2

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vfmadd231ps(xmm0, xmm2, xmm4)

	vbroadcastss(mem(rax, r8, 1), xmm2)
	vfmadd231ps(xmm0, xmm2, xmm8)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vfmadd231ps(xmm0, xmm2, xmm12)

	vbroadcastss(mem(rax, 4), xmm3)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 1, 4), xmm3)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 2, 4), xmm3)
	vfmadd231ps(xmm0, xmm3, xmm14)

	add(r9, rax)                       // a += cs_a;

	// ---------------------------------- iteration 3

	lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vfmadd231ps(xmm0, xmm2, xmm4)

	vbroadcastss(mem(rax, r8, 1), xmm2)
	vfmadd231ps(xmm0, xmm2, xmm8)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vfmadd231ps(xmm0, xmm2, xmm12)

	vbroadcastss(mem(rax, 4), xmm3)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 1, 4), xmm3)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 2, 4), xmm3)
	vfmadd231ps(xmm0, xmm3, xmm14)

	add(r9, rax)                       // a += cs_a;

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKITER)                   // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT)                 // EDGE LOOP (same update, one k step per trip)

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vfmadd231ps(xmm0, xmm2, xmm4)

	vbroadcastss(mem(rax, r8, 1), xmm2)
	vfmadd231ps(xmm0, xmm2, xmm8)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vfmadd231ps(xmm0, xmm2, xmm12)

	vbroadcastss(mem(rax, 4), xmm3)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 1, 4), xmm3)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 2, 4), xmm3)
	vfmadd231ps(xmm0, xmm3, xmm14)

	add(r9, rax)                       // a += cs_a;

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKLEFT)                   // iterate again if i != 0.

	label(.SPOSTACCUM)

	mov(r12, rcx)                      // reset rcx to current utile of c.

	// swap the two floats of every complex element
	// in xmm6, xmm10 and xmm14
	vpermilps(imm(0xb1), xmm6, xmm6)
	vpermilps(imm(0xb1), xmm10, xmm10)
	vpermilps(imm(0xb1), xmm14, xmm14)

	// subtract/add even/odd elements: this merges the two sets of
	// partial sums into the complex products held in xmm4, xmm8, xmm12
	vaddsubps(xmm6, xmm4, xmm4)
	vaddsubps(xmm10, xmm8, xmm8)
	vaddsubps(xmm14, xmm12, xmm12)

	/* Scale the accumulated A*B by alpha: (alpha_r + i*alpha_i) x AB */
	mov(var(alpha), rax)               // load address of alpha
	vbroadcastss(mem(rax), xmm0)       // load alpha_r and duplicate
	vbroadcastss(mem(rax, 4), xmm1)    // load alpha_i and duplicate

	vpermilps(imm(0xb1), xmm4, xmm3)
	vmulps(xmm0, xmm4, xmm4)
	vmulps(xmm1, xmm3, xmm3)
	vaddsubps(xmm3, xmm4, xmm4)

	vpermilps(imm(0xb1), xmm8, xmm3)
	vmulps(xmm0, xmm8, xmm8)
	vmulps(xmm1, xmm3, xmm3)
	vaddsubps(xmm3, xmm8, xmm8)

	vpermilps(imm(0xb1), xmm12, xmm3)
	vmulps(xmm0, xmm12, xmm12)
	vmulps(xmm1, xmm3, xmm3)
	vaddsubps(xmm3, xmm12, xmm12)

	/* Form (beta_r + i*beta_i) x C + ((alpha_r + i*alpha_i) x AB) */
	mov(var(beta), rbx)                // load address of beta
	vbroadcastss(mem(rbx), xmm1)       // load beta_r and duplicate
	vbroadcastss(mem(rbx, 4), xmm2)    // load beta_i and duplicate

	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(scomplex)
	lea(mem(rcx, rdi, 2), rdx)         // rdx = c + 2*rs_c;
	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

	// now avoid loading C if beta == 0
	vxorps(xmm0, xmm0, xmm0)           // set xmm0 to zero.
	vucomiss(xmm0, xmm1)               // set ZF if beta_r == 0.
	sete(r13b)                         // r13b = ( ZF == 1 ? 1 : 0 );
	vucomiss(xmm0, xmm2)               // set ZF if beta_i == 0.
	sete(r15b)                         // r15b = ( ZF == 1 ? 1 : 0 );
	and(r13b, r15b)                    // r15b &= r13b; ZF is cleared only when both flags are 1.
	jne(.SBETAZERO)                    // taken when ZF == 0, i.e. beta_r == 0 and beta_i == 0

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8, i.e. rs_c == 1.
	jz(.SCOLSTORED)                    // jump to column storage case

	label(.SROWSTORED)

	// Row 0: gather c[0] and c[cs_c] into xmm0, scale by beta,
	// add the alpha-scaled product and scatter back.
	vmovlpd(mem(rcx), xmm0, xmm0)
	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
	vpermilps(imm(0xb1), xmm0, xmm3)
	vmulps(xmm1, xmm0, xmm0)
	vmulps(xmm2, xmm3, xmm3)
	vaddsubps(xmm3, xmm0, xmm0)
	vaddps(xmm4, xmm0, xmm0)
	vmovlpd(xmm0, mem(rcx))
	vmovhpd(xmm0, mem(rcx, rsi, 1))
	add(rdi, rcx)                      // rcx = c + 1*rs_c

	// Row 1
	vmovlpd(mem(rcx), xmm0, xmm0)
	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
	vpermilps(imm(0xb1), xmm0, xmm3)
	vmulps(xmm1, xmm0, xmm0)
	vmulps(xmm2, xmm3, xmm3)
	vaddsubps(xmm3, xmm0, xmm0)
	vaddps(xmm8, xmm0, xmm0)
	vmovlpd(xmm0, mem(rcx))
	vmovhpd(xmm0, mem(rcx, rsi, 1))
	add(rdi, rcx)                      // rcx = c + 2*rs_c

	// Row 2
	vmovlpd(mem(rcx), xmm0, xmm0)
	vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0)
	vpermilps(imm(0xb1), xmm0, xmm3)
	vmulps(xmm1, xmm0, xmm0)
	vmulps(xmm2, xmm3, xmm3)
	vaddsubps(xmm3, xmm0, xmm0)
	vaddps(xmm12, xmm0, xmm0)
	vmovlpd(xmm0, mem(rcx))
	vmovhpd(xmm0, mem(rcx, rsi, 1))

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORED)

	/* The 3x2 register tile (one row per register: xmm4, xmm8, xmm12)
	   is written to column-stored C as two columns of three elements,
	   i.e. the tile is transposed in registers before the stores. */
	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(scomplex)
	lea(mem(rsi, rsi, 2), r13)         // r13 = 3*cs_c

	// CGEMM_INPUT_SCALE_CS_BETA_NZ_128 is defined outside this block;
	// presumably it loads one row of C and scales it by beta into xmm0
	// -- TODO confirm against the macro definition.
	CGEMM_INPUT_SCALE_CS_BETA_NZ_128
	vaddps(xmm4, xmm0, xmm4)
	add(rdi, rcx)                      // rcx += rs_c (next row of c)

	CGEMM_INPUT_SCALE_CS_BETA_NZ_128
	vaddps(xmm8, xmm0, xmm8)
	add(rdi, rcx)                      // rcx += rs_c (next row of c)

	CGEMM_INPUT_SCALE_CS_BETA_NZ_128
	vaddps(xmm12, xmm0, xmm12)

	mov(r12, rcx)                      // reset rcx to current utile of c.

	vunpcklpd(xmm8, xmm4, xmm0)        // xmm0 = gamma00, gamma10 (column 0, rows 0-1)
	vunpckhpd(xmm8, xmm4, xmm2)        // xmm2 = gamma01, gamma11 (column 1, rows 0-1)

	vmovups(xmm0, mem(rcx))            // store (gamma00-10)
	vmovlpd(xmm12, mem(rcx, 16))       // store (gamma20)
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c (next column of c)

	vmovups(xmm2, mem(rcx))            // store (gamma01-11)
	vmovhpd(xmm12, mem(rcx, 16))       // store (gamma21)

	jmp(.SDONE)                        // jump to end.

	label(.SBETAZERO)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8, i.e. rs_c == 1.
	jz(.SCOLSTORBZ)                    // jump to column storage case

	label(.SROWSTORBZ)

	// Each row is written with one contiguous 16-byte store, so the two
	// elements of a row land next to each other in memory.
	vmovups(xmm4, mem(rcx))
	add(rdi, rcx)                      // rcx = c + 1*rs_c
	vmovups(xmm8, mem(rcx))
	add(rdi, rcx)                      // rcx = c + 2*rs_c
	vmovups(xmm12, mem(rcx))

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORBZ)

	/* 3x2 register tile stored as two 3-element columns of C (beta == 0). */
	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(scomplex)

	vunpcklpd(xmm8, xmm4, xmm0)        // xmm0 = gamma00, gamma10 (column 0, rows 0-1)
	vunpckhpd(xmm8, xmm4, xmm2)        // xmm2 = gamma01, gamma11 (column 1, rows 0-1)

	vmovups(xmm0, mem(rcx))            // store (gamma00-10)
	vmovlpd(xmm12, mem(rcx, 16))       // store (gamma20)
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c (next column of c)

	vmovups(xmm2, mem(rcx))            // store (gamma01-11)
	vmovhpd(xmm12, mem(rcx, 16))       // store (gamma21)

	label(.SDONE)

	lea(mem(r12, rdi, 2), r12)
	lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c

	lea(mem(r14, r8, 2), r14)
	lea(mem(r14, r8, 1), r14)          // a_ii = r14 += 3*rs_a

	dec(r11)                           // ii -= 1;
	jne(.SLOOP3X2I)                    // iterate again if ii != 0.

	label(.SRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [m_iter] "m" (m_iter),
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a]      "m" (a),
	  [rs_a]   "m" (rs_a),
	  [cs_a]   "m" (cs_a),
	  [b]      "m" (b),
	  [rs_b]   "m" (rs_b),
	  [cs_b]   "m" (cs_b),
	  [alpha]  "m" (alpha),
	  [beta]   "m" (beta),
	  [c]      "m" (c),
	  [rs_c]   "m" (rs_c),
	  [cs_c]   "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	consider_edge_cases:

	// Handle edge cases in the m dimension, if they exist: the last
	// m0 % 3 rows are computed by the 1x2 or 2x2 kernel, starting at
	// row i_edge of a and c.
	if ( m_left )
	{
		const dim_t      nr_cur = 2;
		const dim_t      i_edge = m0 - ( dim_t )m_left;

		scomplex* cij = c + i_edge*rs_c;
		scomplex* ai  = a + i_edge*rs_a;
		scomplex* bj  = b;

		// Indexed by m_left; slot 0 is never selected because this
		// branch is only entered when m_left != 0.
		cgemmsup_ker_ft ker_fps[3] =
		{
		  NULL,
		  bli_cgemmsup_rv_zen_asm_1x2,
		  bli_cgemmsup_rv_zen_asm_2x2,
		};

		cgemmsup_ker_ft ker_fp = ker_fps[ m_left ];

		ker_fp
		(
		  conja, conjb, m_left, nr_cur, k0,
		  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
		  beta, cij, rs_c0, cs_c0, data, cntx
		);
		return;
	}
}
cython-blis-0.9.1/blis/_src/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c000066400000000000000000001501151427272030600302460ustar00rootroot00000000000000 /* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "immintrin.h" /* rrr: -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : rcr: -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | += ------ ... 
-------- | | | | | | | | ------ -------- | | | | | | | | ------ : */ void bli_cgemmsup_rv_zen_asm_3x8n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, scomplex* restrict alpha, scomplex* restrict a, inc_t rs_a0, inc_t cs_a0, scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t m_left = m0 % 3; if ( m_left ) { cgemmsup_ker_ft ker_fps[3] = { NULL, bli_cgemmsup_rv_zen_asm_1x8n, bli_cgemmsup_rv_zen_asm_2x8n, }; cgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t n_iter = n0 / 8; uint64_t n_left = n0 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- //scratch registers __m256 ymm0, ymm1, ymm2, ymm3; __m256 ymm4, ymm5, ymm6, ymm7; __m256 ymm8, ymm9, ymm10, ymm11; __m256 ymm12, ymm13, ymm14, ymm15; __m128 xmm0, xmm3; scomplex *tA = a; float *tAimag = &a->imag; scomplex *tB = b; scomplex *tC = c; for (n_iter = 0; n_iter < n0 / 8; n_iter++) { // clear scratch registers. 
ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm11 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); ymm15 = _mm256_setzero_ps(); dim_t ta_inc_row = rs_a; dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; dim_t ta_inc_col = cs_a; dim_t tb_inc_col = cs_b; dim_t tc_inc_col = cs_c; tA = a; tAimag = &a->imag; tB = b + n_iter*tb_inc_col*8; tC = c + n_iter*tc_inc_col*8; for (k_iter = 0; k_iter imag)); // load alpha_i and duplicate ymm3 = _mm256_permute_ps(ymm4, 0xb1); ymm4 = _mm256_mul_ps(ymm0, ymm4); ymm3 =_mm256_mul_ps(ymm1, ymm3); ymm4 = _mm256_addsub_ps(ymm4, ymm3); ymm3 = _mm256_permute_ps(ymm5, 0xb1); ymm5 = _mm256_mul_ps(ymm0, ymm5); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm5 = _mm256_addsub_ps(ymm5, ymm3); ymm3 = _mm256_permute_ps(ymm8, 0xb1); ymm8 = _mm256_mul_ps(ymm0, ymm8); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm8 = _mm256_addsub_ps(ymm8, ymm3); ymm3 = _mm256_permute_ps(ymm9, 0xb1); ymm9 = _mm256_mul_ps(ymm0, ymm9); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm9 = _mm256_addsub_ps(ymm9, ymm3); ymm3 = _mm256_permute_ps(ymm12, 0xb1); ymm12 = _mm256_mul_ps(ymm0, ymm12); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm12 = _mm256_addsub_ps(ymm12, ymm3); ymm3 = _mm256_permute_ps(ymm13, 0xb1); ymm13 = _mm256_mul_ps(ymm0, ymm13); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm13 = _mm256_addsub_ps(ymm13, ymm3); if(tc_inc_row == 1) //col stored { if(beta->real == 0.0 && beta->imag == 0.0) { //transpose left 3x4 ymm0 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd (ymm4), _mm256_castps_pd (ymm8))); _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm0)); _mm_storel_pi((__m64 *)(tC+2), _mm256_castps256_ps128(ymm12)); ymm1 = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd (ymm4) , _mm256_castps_pd(ymm8))); tC += tc_inc_col; _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm1)); 
_mm_storeh_pi((__m64 *)(tC+2), _mm256_castps256_ps128(ymm12)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC ) ,_mm256_extractf128_ps (ymm0,1)); _mm_storel_pi((__m64 *)(tC+2), _mm256_extractf128_ps(ymm12, 1)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC) ,_mm256_extractf128_ps (ymm1,1)); _mm_storeh_pi((__m64 *)(tC+2), _mm256_extractf128_ps(ymm12,1)); //transpose right 3x4 tC += tc_inc_col; ymm0 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd (ymm5), _mm256_castps_pd(ymm9))); _mm_storeu_ps((float *)(tC ),_mm256_castps256_ps128(ymm0)); _mm_storel_pi((__m64 *)(tC+2), _mm256_castps256_ps128(ymm13)); ymm1 = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(ymm5), _mm256_castps_pd(ymm9))); tC += tc_inc_col; _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm1)); _mm_storeh_pi((__m64 *)(tC+2), _mm256_castps256_ps128(ymm13)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC ),_mm256_extractf128_ps (ymm0,1)); _mm_storel_pi((__m64 *)(tC+2), _mm256_extractf128_ps(ymm13,1)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC ),_mm256_extractf128_ps (ymm1,1)); _mm_storeh_pi((__m64 *)(tC+2), _mm256_extractf128_ps(ymm13,1)); } else{ ymm1 = _mm256_broadcast_ss((float const *)(beta)); // load alpha_r and duplicate ymm2 = _mm256_broadcast_ss((float const *)(&beta->imag)); // load alpha_i and duplicate //Multiply ymm4 with beta xmm0 = _mm_loadl_pi(xmm0, (__m64 const *) (tC) ); xmm0 = _mm_loadh_pi(xmm0, (__m64 const *) (tC + tc_inc_col)); xmm3 = _mm_loadl_pi(xmm3, (__m64 const *) (tC + tc_inc_col*2)); xmm3 = _mm_loadh_pi(xmm3, (__m64 const *)(tC + tc_inc_col*3)) ; ymm0 = _mm256_insertf128_ps(_mm256_castps128_ps256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_ps(ymm0, 0xb1); ymm0 = _mm256_mul_ps(ymm1, ymm0); ymm3 = _mm256_mul_ps(ymm2, ymm3); ymm0 = _mm256_addsub_ps(ymm0, ymm3); ymm4 = _mm256_add_ps(ymm4, ymm0); //Multiply ymm8 with beta xmm0 = _mm_loadl_pi(xmm0, (__m64 const *)(tC + 1)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *)(tC + 1 + tc_inc_col)) ; xmm3 = _mm_loadl_pi(xmm3, (__m64 
const *)(tC + 1 + tc_inc_col*2)) ; xmm3 = _mm_loadh_pi(xmm3, (__m64 const *)(tC + 1 + tc_inc_col*3)) ; ymm0 = _mm256_insertf128_ps(_mm256_castps128_ps256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_ps(ymm0, 0xb1); ymm0 = _mm256_mul_ps(ymm1, ymm0); ymm3 = _mm256_mul_ps(ymm2, ymm3); ymm0 = _mm256_addsub_ps(ymm0, ymm3); ymm8 = _mm256_add_ps(ymm8, ymm0); //Multiply ymm12 with beta xmm0 = _mm_loadl_pi(xmm0, (__m64 const *)(tC + 2)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *)(tC + 2 + tc_inc_col)) ; xmm3 = _mm_loadl_pi(xmm3, (__m64 const *)(tC + 2 + tc_inc_col*2)) ; xmm3 = _mm_loadh_pi(xmm3, (__m64 const *)(tC + 2 + tc_inc_col*3)) ; ymm0 = _mm256_insertf128_ps(_mm256_castps128_ps256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_ps(ymm0, 0xb1); ymm0 = _mm256_mul_ps(ymm1, ymm0); ymm3 = _mm256_mul_ps(ymm2, ymm3); ymm0 = _mm256_addsub_ps(ymm0, ymm3); ymm12 = _mm256_add_ps(ymm12, ymm0); //transpose left 3x4 ymm0 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd (ymm4), _mm256_castps_pd (ymm8))); _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm0)); _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm0)); _mm_storel_pi((__m64 *)(tC+2), _mm256_castps256_ps128(ymm12)); ymm3 = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd (ymm4) , _mm256_castps_pd(ymm8))); tC += tc_inc_col; _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm3)); _mm_storeh_pi((__m64 *)(tC+2), _mm256_castps256_ps128(ymm12)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC) ,_mm256_extractf128_ps (ymm0,1)); _mm_storel_pi((__m64 *)(tC+2), _mm256_extractf128_ps(ymm12, 1)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC ),_mm256_extractf128_ps (ymm3,1)); _mm_storeh_pi((__m64 *)(tC+2), _mm256_extractf128_ps(ymm12,1)); //Multiply ymm5 with beta tC += tc_inc_col; xmm0 = _mm_loadl_pi(xmm0, (__m64 const *) (tC)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *) (tC + tc_inc_col)); xmm3 = _mm_loadl_pi(xmm3, (__m64 const *) (tC + tc_inc_col*2)); xmm3 = _mm_loadh_pi(xmm3, (__m64 const *)(tC + tc_inc_col*3)) ; ymm0 = 
_mm256_insertf128_ps(_mm256_castps128_ps256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_ps(ymm0, 0xb1); ymm0 = _mm256_mul_ps(ymm1, ymm0); ymm3 = _mm256_mul_ps(ymm2, ymm3); ymm0 = _mm256_addsub_ps(ymm0, ymm3); ymm5 = _mm256_add_ps(ymm5, ymm0); //Multiply ymm9 with beta xmm0 = _mm_loadl_pi(xmm0, (__m64 const *)(tC+ 1)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *)(tC+ 1 + tc_inc_col)) ; xmm3 = _mm_loadl_pi(xmm3, (__m64 const *)(tC+ 1 + tc_inc_col*2)) ; xmm3 = _mm_loadh_pi(xmm3, (__m64 const *)(tC+ 1 + tc_inc_col*3)) ; ymm0 = _mm256_insertf128_ps(_mm256_castps128_ps256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_ps(ymm0, 0xb1); ymm0 = _mm256_mul_ps(ymm1, ymm0); ymm3 = _mm256_mul_ps(ymm2, ymm3); ymm0 = _mm256_addsub_ps(ymm0, ymm3); ymm9 = _mm256_add_ps(ymm9, ymm0); //Multiply ymm13 with beta xmm0 = _mm_loadl_pi(xmm0, (__m64 const *)(tC + 2)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *)(tC + 2 + tc_inc_col)) ; xmm3 = _mm_loadl_pi(xmm3, (__m64 const *)(tC + 2 + tc_inc_col*2)) ; xmm3 = _mm_loadh_pi(xmm3, (__m64 const *)(tC + 2 + tc_inc_col*3)) ; ymm0 = _mm256_insertf128_ps(_mm256_castps128_ps256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_ps(ymm0, 0xb1); ymm0 = _mm256_mul_ps(ymm1, ymm0); ymm3 = _mm256_mul_ps(ymm2, ymm3); ymm0 = _mm256_addsub_ps(ymm0, ymm3); ymm13 = _mm256_add_ps(ymm13, ymm0); //transpose right 3x4 ymm0 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd (ymm5), _mm256_castps_pd(ymm9))); _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm0)); _mm_storel_pi((__m64 *)(tC+2), _mm256_castps256_ps128(ymm13)); ymm3 = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(ymm5), _mm256_castps_pd(ymm9))); tC += tc_inc_col; _mm_storeu_ps((float *)(tC ), _mm256_castps256_ps128(ymm3)); _mm_storeh_pi((__m64 *)(tC+2), _mm256_castps256_ps128(ymm13)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC ),_mm256_extractf128_ps (ymm0,1)); _mm_storel_pi((__m64 *)(tC+2), _mm256_extractf128_ps(ymm13,1)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC ),_mm256_extractf128_ps (ymm3,1)); 
_mm_storeh_pi((__m64 *)(tC+2), _mm256_extractf128_ps(ymm13,1)); } } else { if(beta->real == 0.0 && beta->imag == 0.0) { _mm256_storeu_ps((float*)(tC), ymm4); _mm256_storeu_ps((float*)(tC + 4), ymm5); _mm256_storeu_ps((float*)(tC + tc_inc_row ), ymm8); _mm256_storeu_ps((float*)(tC + tc_inc_row + 4), ymm9); _mm256_storeu_ps((float*)(tC + tc_inc_row *2), ymm12); _mm256_storeu_ps((float*)(tC + tc_inc_row *2+ 4), ymm13); } else{ /* (br + bi) C + (ar + ai) AB */ ymm0 = _mm256_broadcast_ss((float const *)(beta)); // load beta_r and duplicate ymm1 = _mm256_broadcast_ss((float const *)(&beta->imag)); // load beta_i and duplicate ymm2 = _mm256_loadu_ps((float const *)(tC)); ymm3 = _mm256_permute_ps(ymm2, 0xb1); ymm2 = _mm256_mul_ps(ymm0, ymm2); ymm3 =_mm256_mul_ps(ymm1, ymm3); ymm4 = _mm256_add_ps(ymm4, _mm256_addsub_ps(ymm2, ymm3)); ymm2 = _mm256_loadu_ps((float const *)(tC+4)); ymm3 = _mm256_permute_ps(ymm2, 0xb1); ymm2 = _mm256_mul_ps(ymm0, ymm2); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm5 = _mm256_add_ps(ymm5, _mm256_addsub_ps(ymm2, ymm3)); ymm2 = _mm256_loadu_ps((float const *)(tC+tc_inc_row)); ymm3 = _mm256_permute_ps(ymm2, 0xb1); ymm2 = _mm256_mul_ps(ymm0, ymm2); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm8 = _mm256_add_ps(ymm8, _mm256_addsub_ps(ymm2, ymm3)); ymm2 = _mm256_loadu_ps((float const *)(tC+tc_inc_row + 4)); ymm3 = _mm256_permute_ps(ymm2, 0xb1); ymm2 = _mm256_mul_ps(ymm0, ymm2); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm9 = _mm256_add_ps(ymm9, _mm256_addsub_ps(ymm2, ymm3)); ymm2 = _mm256_loadu_ps((float const *)(tC+tc_inc_row*2)); ymm3 = _mm256_permute_ps(ymm2, 0xb1); ymm2 = _mm256_mul_ps(ymm0, ymm2); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm12 = _mm256_add_ps(ymm12, _mm256_addsub_ps(ymm2, ymm3)); ymm2 = _mm256_loadu_ps((float const *)(tC+tc_inc_row*2 +4)); ymm3 = _mm256_permute_ps(ymm2, 0xb1); ymm2 = _mm256_mul_ps(ymm0, ymm2); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm13 = _mm256_add_ps(ymm13, _mm256_addsub_ps(ymm2, ymm3)); _mm256_storeu_ps((float*)(tC), ymm4); 
_mm256_storeu_ps((float*)(tC + 4), ymm5); _mm256_storeu_ps((float*)(tC + tc_inc_row) , ymm8); _mm256_storeu_ps((float*)(tC + tc_inc_row + 4), ymm9); _mm256_storeu_ps((float*)(tC + tc_inc_row *2), ymm12); _mm256_storeu_ps((float*)(tC + tc_inc_row *2+ 4), ymm13); } } } consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 3; const dim_t j_edge = n0 - ( dim_t )n_left; scomplex* restrict cij = c + j_edge*cs_c; scomplex* restrict ai = a; scomplex* restrict bj = b + n_iter*8; if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_cgemmsup_rv_zen_asm_3x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_cgemmsup_rv_zen_asm_3x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { bli_cgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); } } } void bli_cgemmsup_rv_zen_asm_2x8n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, scomplex* restrict alpha, scomplex* restrict a, inc_t rs_a0, inc_t cs_a0, scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = 0; uint64_t n_iter = n0 / 8; uint64_t n_left = n0 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- //scratch registers __m256 ymm0, ymm1, ymm2, ymm3; __m256 ymm4, ymm5, ymm6, ymm7; __m256 ymm8, ymm9, ymm10, ymm11; __m128 xmm0, xmm3; scomplex *tA = a; float *tAimag = &a->imag; scomplex *tB = b; scomplex *tC = c; for (n_iter = 0; n_iter < n0 / 8; n_iter++) { // clear scratch registers. ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); ymm8 = _mm256_setzero_ps(); ymm9 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm11 = _mm256_setzero_ps(); dim_t ta_inc_row = rs_a; dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; dim_t ta_inc_col = cs_a; dim_t tb_inc_col = cs_b; dim_t tc_inc_col = cs_c; tA = a; tAimag = &a->imag; tB = b + n_iter*tb_inc_col*8; tC = c + n_iter*tc_inc_col*8; for (k_iter = 0; k_iter imag)); // load alpha_i and duplicate ymm3 = _mm256_permute_ps(ymm4, 0xb1); ymm4 = _mm256_mul_ps(ymm0, ymm4); ymm3 =_mm256_mul_ps(ymm1, ymm3); ymm4 = _mm256_addsub_ps(ymm4, ymm3); ymm3 = _mm256_permute_ps(ymm5, 0xb1); ymm5 = _mm256_mul_ps(ymm0, ymm5); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm5 = _mm256_addsub_ps(ymm5, ymm3); ymm3 = _mm256_permute_ps(ymm8, 0xb1); ymm8 = _mm256_mul_ps(ymm0, ymm8); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm8 = _mm256_addsub_ps(ymm8, ymm3); ymm3 = _mm256_permute_ps(ymm9, 0xb1); ymm9 = _mm256_mul_ps(ymm0, ymm9); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm9 = _mm256_addsub_ps(ymm9, ymm3); if(tc_inc_row == 1) //col stored { if(beta->real == 0.0 && beta->imag == 0.0) { //transpose left 2x4 ymm0 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd (ymm4), _mm256_castps_pd (ymm8))); _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm0)); ymm1 = 
_mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd (ymm4) , _mm256_castps_pd(ymm8))); tC += tc_inc_col; _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm1)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC) ,_mm256_extractf128_ps (ymm0,1)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC) ,_mm256_extractf128_ps (ymm1,1)); //transpose right 2x4 tC += tc_inc_col; ymm0 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd (ymm5), _mm256_castps_pd(ymm9))); _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm0)); ymm1 = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(ymm5), _mm256_castps_pd(ymm9))); tC += tc_inc_col; _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm1)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC) ,_mm256_extractf128_ps (ymm0,1)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC) ,_mm256_extractf128_ps (ymm1,1)); } else{ ymm1 = _mm256_broadcast_ss((float const *)(beta)); // load alpha_r and duplicate ymm2 = _mm256_broadcast_ss((float const *)(&beta->imag)); // load alpha_i and duplicate //Multiply ymm4 with beta xmm0 = _mm_loadl_pi(xmm0, (__m64 const *) (tC)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *) (tC + tc_inc_col)); xmm3 = _mm_loadl_pi(xmm3, (__m64 const *) (tC + tc_inc_col*2)); xmm3 = _mm_loadh_pi(xmm3, (__m64 const *)(tC + tc_inc_col*3)) ; ymm0 = _mm256_insertf128_ps(_mm256_castps128_ps256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_ps(ymm0, 0xb1); ymm0 = _mm256_mul_ps(ymm1, ymm0); ymm3 = _mm256_mul_ps(ymm2, ymm3); ymm0 = _mm256_addsub_ps(ymm0, ymm3); ymm4 = _mm256_add_ps(ymm4, ymm0); //Multiply ymm8 with beta xmm0 = _mm_loadl_pi(xmm0, (__m64 const *)(tC + 1)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *)(tC + 1 + tc_inc_col)) ; xmm3 = _mm_loadl_pi(xmm3, (__m64 const *)(tC + 1 + tc_inc_col*2)) ; xmm3 = _mm_loadh_pi(xmm3, (__m64 const *)(tC + 1 + tc_inc_col*3)) ; ymm0 = _mm256_insertf128_ps(_mm256_castps128_ps256(xmm0), xmm3, 1); ymm3 = _mm256_permute_ps(ymm0, 0xb1); ymm0 = _mm256_mul_ps(ymm1, ymm0); ymm3 = _mm256_mul_ps(ymm2, ymm3); ymm0 
= _mm256_addsub_ps(ymm0, ymm3); ymm8 = _mm256_add_ps(ymm8, ymm0); //transpose left 2x4 ymm0 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd (ymm4), _mm256_castps_pd (ymm8))); _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm0)); ymm3 = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd (ymm4) , _mm256_castps_pd(ymm8))); tC += tc_inc_col; _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm3)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC) ,_mm256_extractf128_ps (ymm0,1)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC) ,_mm256_extractf128_ps (ymm3,1)); //Multiply ymm5 with beta tC += tc_inc_col; xmm0 = _mm_loadl_pi(xmm0, (__m64 const *) (tC)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *) (tC + tc_inc_col)); xmm3 = _mm_loadl_pi(xmm3, (__m64 const *) (tC + tc_inc_col*2)); xmm3 = _mm_loadh_pi(xmm3, (__m64 const *)(tC + tc_inc_col*3)) ; ymm0 = _mm256_insertf128_ps(_mm256_castps128_ps256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_ps(ymm0, 0xb1); ymm0 = _mm256_mul_ps(ymm1, ymm0); ymm3 = _mm256_mul_ps(ymm2, ymm3); ymm0 = _mm256_addsub_ps(ymm0, ymm3); ymm5 = _mm256_add_ps(ymm5, ymm0); //Multiply ymm9 with beta xmm0 = _mm_loadl_pi(xmm0, (__m64 const *)(tC+ 1)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *)(tC+ 1 + tc_inc_col)) ; xmm3 = _mm_loadl_pi(xmm3, (__m64 const *)(tC+ 1 + tc_inc_col*2)) ; xmm3 = _mm_loadh_pi(xmm3, (__m64 const *)(tC+ 1 + tc_inc_col*3)) ; ymm0 = _mm256_insertf128_ps(_mm256_castps128_ps256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_ps(ymm0, 0xb1); ymm0 = _mm256_mul_ps(ymm1, ymm0); ymm3 = _mm256_mul_ps(ymm2, ymm3); ymm0 = _mm256_addsub_ps(ymm0, ymm3); ymm9 = _mm256_add_ps(ymm9, ymm0); //transpose right 2x4 ymm0 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd (ymm5), _mm256_castps_pd(ymm9))); _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm0)); ymm3 = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(ymm5), _mm256_castps_pd(ymm9))); tC += tc_inc_col; _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm3)); tC += tc_inc_col; 
_mm_storeu_ps((float *)(tC) ,_mm256_extractf128_ps (ymm0,1)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC) ,_mm256_extractf128_ps (ymm3,1)); } } else { if(beta->real == 0.0 && beta->imag == 0.0) { _mm256_storeu_ps((float*)(tC), ymm4); _mm256_storeu_ps((float*)(tC + 4), ymm5); _mm256_storeu_ps((float*)(tC + tc_inc_row) , ymm8); _mm256_storeu_ps((float*)(tC + tc_inc_row + 4), ymm9); } else{ /* (br + bi) C + (ar + ai) AB */ ymm0 = _mm256_broadcast_ss((float const *)(beta)); // load beta_r and duplicate ymm1 = _mm256_broadcast_ss((float const *)(&beta->imag)); // load beta_i and duplicate ymm2 = _mm256_loadu_ps((float const *)(tC)); ymm3 = _mm256_permute_ps(ymm2, 0xb1); ymm2 = _mm256_mul_ps(ymm0, ymm2); ymm3 =_mm256_mul_ps(ymm1, ymm3); ymm4 = _mm256_add_ps(ymm4, _mm256_addsub_ps(ymm2, ymm3)); ymm2 = _mm256_loadu_ps((float const *)(tC+4)); ymm3 = _mm256_permute_ps(ymm2, 0xb1); ymm2 = _mm256_mul_ps(ymm0, ymm2); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm5 = _mm256_add_ps(ymm5, _mm256_addsub_ps(ymm2, ymm3)); ymm2 = _mm256_loadu_ps((float const *)(tC+tc_inc_row)); ymm3 = _mm256_permute_ps(ymm2, 0xb1); ymm2 = _mm256_mul_ps(ymm0, ymm2); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm8 = _mm256_add_ps(ymm8, _mm256_addsub_ps(ymm2, ymm3)); ymm2 = _mm256_loadu_ps((float const *)(tC+tc_inc_row + 4)); ymm3 = _mm256_permute_ps(ymm2, 0xb1); ymm2 = _mm256_mul_ps(ymm0, ymm2); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm9 = _mm256_add_ps(ymm9, _mm256_addsub_ps(ymm2, ymm3)); _mm256_storeu_ps((float*)(tC), ymm4); _mm256_storeu_ps((float*)(tC + 4), ymm5); _mm256_storeu_ps((float*)(tC + tc_inc_row) , ymm8); _mm256_storeu_ps((float*)(tC + tc_inc_row + 4), ymm9); } } } consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( n_left ) { const dim_t mr_cur = 3; const dim_t j_edge = n0 - ( dim_t )n_left; scomplex* restrict cij = c + j_edge*cs_c; scomplex* restrict ai = a; scomplex* restrict bj = b + n_iter * 8 ; if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_cgemmsup_rv_zen_asm_2x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_cgemmsup_rv_zen_asm_2x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { bli_cgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); } } } void bli_cgemmsup_rv_zen_asm_1x8n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, scomplex* restrict alpha, scomplex* restrict a, inc_t rs_a0, inc_t cs_a0, scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = 0; uint64_t n_iter = n0 / 8; uint64_t n_left = n0 % 8; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- //scratch registers __m256 ymm0, ymm1, ymm2, ymm3; __m256 ymm4, ymm5, ymm6, ymm7; __m128 xmm0, xmm3; scomplex *tA = a; float *tAimag = &a->imag; scomplex *tB = b; scomplex *tC = c; for (n_iter = 0; n_iter < n0 / 8; n_iter++) { // clear scratch registers. 
ymm4 = _mm256_setzero_ps(); ymm5 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm7 = _mm256_setzero_ps(); dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; dim_t ta_inc_col = cs_a; dim_t tb_inc_col = cs_b; dim_t tc_inc_col = cs_c; tA = a; tAimag = &a->imag; tB = b + n_iter*tb_inc_col*8; tC = c + n_iter*tc_inc_col*8; for (k_iter = 0; k_iter imag)); // load alpha_i and duplicate ymm3 = _mm256_permute_ps(ymm4, 0xb1); ymm4 = _mm256_mul_ps(ymm0, ymm4); ymm3 =_mm256_mul_ps(ymm1, ymm3); ymm4 = _mm256_addsub_ps(ymm4, ymm3); ymm3 = _mm256_permute_ps(ymm5, 0xb1); ymm5 = _mm256_mul_ps(ymm0, ymm5); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm5 = _mm256_addsub_ps(ymm5, ymm3); if(tc_inc_row == 1) //col stored { if(beta->real == 0.0 && beta->imag == 0.0) { //transpose left 1x4 _mm_storel_pi((__m64 *)(tC), _mm256_castps256_ps128(ymm4)); tC += tc_inc_col; _mm_storeh_pi((__m64 *)(tC), _mm256_castps256_ps128(ymm4)); tC += tc_inc_col; _mm_storel_pi((__m64 *)(tC) ,_mm256_extractf128_ps (ymm4,1)); tC += tc_inc_col; _mm_storeh_pi((__m64 *)(tC) ,_mm256_extractf128_ps (ymm4,1)); //transpose right 1x4 tC += tc_inc_col; _mm_storel_pi((__m64 *)(tC), _mm256_castps256_ps128(ymm5)); tC += tc_inc_col; _mm_storeh_pi((__m64 *)(tC), _mm256_castps256_ps128(ymm5)); tC += tc_inc_col; _mm_storel_pi((__m64 *)(tC) ,_mm256_extractf128_ps (ymm5,1)); tC += tc_inc_col; _mm_storeh_pi((__m64 *)(tC) ,_mm256_extractf128_ps (ymm5,1)); } else{ ymm1 = _mm256_broadcast_ss((float const *)(beta)); // load alpha_r and duplicate ymm2 = _mm256_broadcast_ss((float const *)(&beta->imag)); // load alpha_i and duplicate //Multiply ymm4 with beta xmm0 = _mm_loadl_pi(xmm0, (__m64 const *) (tC)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *) (tC + tc_inc_col)); xmm3 = _mm_loadl_pi(xmm3, (__m64 const *) (tC + tc_inc_col*2)); xmm3 = _mm_loadh_pi(xmm3, (__m64 const *)(tC + tc_inc_col*3)) ; ymm0 = _mm256_insertf128_ps(_mm256_castps128_ps256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_ps(ymm0, 0xb1); ymm0 = _mm256_mul_ps(ymm1, ymm0); ymm3 = 
_mm256_mul_ps(ymm2, ymm3); ymm0 = _mm256_addsub_ps(ymm0, ymm3); ymm4 = _mm256_add_ps(ymm4, ymm0); _mm_storel_pi((__m64 *)(tC), _mm256_castps256_ps128(ymm4)); tC += tc_inc_col; _mm_storeh_pi((__m64 *)(tC), _mm256_castps256_ps128(ymm4)); tC += tc_inc_col; _mm_storel_pi((__m64 *)(tC) ,_mm256_extractf128_ps (ymm4,1)); tC += tc_inc_col; _mm_storeh_pi((__m64 *)(tC) ,_mm256_extractf128_ps (ymm4,1)); //Multiply ymm5 with beta tC += tc_inc_col; xmm0 = _mm_loadl_pi(xmm0, (__m64 const *) (tC)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *) (tC + tc_inc_col)); xmm3 = _mm_loadl_pi(xmm3, (__m64 const *) (tC + tc_inc_col*2)); xmm3 = _mm_loadh_pi(xmm3, (__m64 const *)(tC + tc_inc_col*3)) ; ymm0 = _mm256_insertf128_ps(_mm256_castps128_ps256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_ps(ymm0, 0xb1); ymm0 = _mm256_mul_ps(ymm1, ymm0); ymm3 = _mm256_mul_ps(ymm2, ymm3); ymm0 = _mm256_addsub_ps(ymm0, ymm3); ymm5 = _mm256_add_ps(ymm5, ymm0); _mm_storel_pi((__m64 *)(tC), _mm256_castps256_ps128(ymm5)); tC += tc_inc_col; _mm_storeh_pi((__m64 *)(tC), _mm256_castps256_ps128(ymm5)); tC += tc_inc_col; _mm_storel_pi((__m64 *)(tC) ,_mm256_extractf128_ps (ymm5,1)); tC += tc_inc_col; _mm_storeh_pi((__m64 *)(tC) ,_mm256_extractf128_ps (ymm5,1)); } } else { if(beta->real == 0.0 && beta->imag == 0.0) { _mm256_storeu_ps((float*)(tC), ymm4); _mm256_storeu_ps((float*)(tC + 4), ymm5); } else{ /* (br + bi) C + (ar + ai) AB */ ymm0 = _mm256_broadcast_ss((float const *)(beta)); // load beta_r and duplicate ymm1 = _mm256_broadcast_ss((float const *)(&beta->imag)); // load beta_i and duplicate ymm2 = _mm256_loadu_ps((float const *)(tC)); ymm3 = _mm256_permute_ps(ymm2, 0xb1); ymm2 = _mm256_mul_ps(ymm0, ymm2); ymm3 =_mm256_mul_ps(ymm1, ymm3); ymm4 = _mm256_add_ps(ymm4, _mm256_addsub_ps(ymm2, ymm3)); ymm2 = _mm256_loadu_ps((float const *)(tC+4)); ymm3 = _mm256_permute_ps(ymm2, 0xb1); ymm2 = _mm256_mul_ps(ymm0, ymm2); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm5 = _mm256_add_ps(ymm5, _mm256_addsub_ps(ymm2, ymm3)); 
_mm256_storeu_ps((float*)(tC), ymm4); _mm256_storeu_ps((float*)(tC + 4), ymm5); } } } consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 3; const dim_t j_edge = n0 - ( dim_t )n_left; scomplex* restrict cij = c + j_edge*cs_c; scomplex* restrict ai = a; scomplex* restrict bj = b + n_iter * 8; if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_cgemmsup_rv_zen_asm_1x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_cgemmsup_rv_zen_asm_1x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ){ bli_cgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); } } } void bli_cgemmsup_rv_zen_asm_3x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, scomplex* restrict alpha, scomplex* restrict a, inc_t rs_a0, inc_t cs_a0, scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t k_iter = 0; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- //scratch registers __m256 ymm0, ymm1, ymm2, ymm3; __m256 ymm4, ymm6; __m256 ymm8, ymm10; __m256 ymm12, ymm14; __m128 xmm0, xmm3; scomplex *tA = a; float *tAimag = &a->imag; scomplex *tB = b; scomplex *tC = c; // clear scratch registers. 
ymm4 = _mm256_setzero_ps(); ymm6 = _mm256_setzero_ps(); ymm8 = _mm256_setzero_ps(); ymm10 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); dim_t ta_inc_row = rs_a; dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; dim_t ta_inc_col = cs_a; dim_t tc_inc_col = cs_c; for (k_iter = 0; k_iter imag)); // load alpha_i and duplicate ymm3 = _mm256_permute_ps(ymm4, 0xb1); ymm4 = _mm256_mul_ps(ymm0, ymm4); ymm3 =_mm256_mul_ps(ymm1, ymm3); ymm4 = _mm256_addsub_ps(ymm4, ymm3); ymm3 = _mm256_permute_ps(ymm8, 0xb1); ymm8 = _mm256_mul_ps(ymm0, ymm8); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm8 = _mm256_addsub_ps(ymm8, ymm3); ymm3 = _mm256_permute_ps(ymm12, 0xb1); ymm12 = _mm256_mul_ps(ymm0, ymm12); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm12 = _mm256_addsub_ps(ymm12, ymm3); if(tc_inc_row == 1) //col stored { if(beta->real == 0.0 && beta->imag == 0.0) { //transpose 3x4 ymm0 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd (ymm4), _mm256_castps_pd (ymm8))); _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm0)); _mm_storel_pi((__m64 *)(tC+2), _mm256_castps256_ps128(ymm12)); ymm1 = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd (ymm4) , _mm256_castps_pd(ymm8))); tC += tc_inc_col; _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm1)); _mm_storeh_pi((__m64 *)(tC+2), _mm256_castps256_ps128(ymm12)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC),_mm256_extractf128_ps (ymm0,1)); _mm_storel_pi((__m64 *)(tC+2), _mm256_extractf128_ps(ymm12, 1)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC ) ,_mm256_extractf128_ps (ymm1,1)); _mm_storeh_pi((__m64 *)(tC+2), _mm256_extractf128_ps(ymm12,1)); } else{ ymm1 = _mm256_broadcast_ss((float const *)(beta)); // load alpha_r and duplicate ymm2 = _mm256_broadcast_ss((float const *)(&beta->imag)); // load alpha_i and duplicate //Multiply ymm4 with beta xmm0 = _mm_loadl_pi(xmm0, (__m64 const *) (tC)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *) (tC + tc_inc_col)); xmm3 = _mm_loadl_pi(xmm3, (__m64 const *) (tC + 
tc_inc_col*2)); xmm3 = _mm_loadh_pi(xmm3, (__m64 const *)(tC + tc_inc_col*3)) ; ymm0 = _mm256_insertf128_ps(_mm256_castps128_ps256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_ps(ymm0, 0xb1); ymm0 = _mm256_mul_ps(ymm1, ymm0); ymm3 = _mm256_mul_ps(ymm2, ymm3); ymm0 = _mm256_addsub_ps(ymm0, ymm3); ymm4 = _mm256_add_ps(ymm4, ymm0); //Multiply ymm8 with beta xmm0 = _mm_loadl_pi(xmm0, (__m64 const *)(tC + 1)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *)(tC + 1 + tc_inc_col)) ; xmm3 = _mm_loadl_pi(xmm3, (__m64 const *)(tC + 1 + tc_inc_col*2)) ; xmm3 = _mm_loadh_pi(xmm3, (__m64 const *)(tC + 1 + tc_inc_col*3)) ; ymm0 = _mm256_insertf128_ps(_mm256_castps128_ps256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_ps(ymm0, 0xb1); ymm0 = _mm256_mul_ps(ymm1, ymm0); ymm3 = _mm256_mul_ps(ymm2, ymm3); ymm0 = _mm256_addsub_ps(ymm0, ymm3); ymm8 = _mm256_add_ps(ymm8, ymm0); //Multiply ymm12 with beta xmm0 = _mm_loadl_pi(xmm0, (__m64 const *)(tC + 2)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *)(tC + 2 + tc_inc_col)) ; xmm3 = _mm_loadl_pi(xmm3, (__m64 const *)(tC + 2 + tc_inc_col*2)) ; xmm3 = _mm_loadh_pi(xmm3, (__m64 const *)(tC + 2 + tc_inc_col*3)) ; ymm0 = _mm256_insertf128_ps(_mm256_castps128_ps256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_ps(ymm0, 0xb1); ymm0 = _mm256_mul_ps(ymm1, ymm0); ymm3 = _mm256_mul_ps(ymm2, ymm3); ymm0 = _mm256_addsub_ps(ymm0, ymm3); ymm12 = _mm256_add_ps(ymm12, ymm0); //transpose 3x4 ymm0 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd (ymm4), _mm256_castps_pd (ymm8))); _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm0)); _mm_storel_pi((__m64 *)(tC+2), _mm256_castps256_ps128(ymm12)); ymm3 = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd (ymm4) , _mm256_castps_pd(ymm8))); tC += tc_inc_col; _mm_storeu_ps((float *)(tC), _mm256_castps256_ps128(ymm3)); _mm_storeh_pi((__m64 *)(tC+2), _mm256_castps256_ps128(ymm12)); tC += tc_inc_col; _mm_storeu_ps((float *)(tC) ,_mm256_extractf128_ps (ymm0,1)); _mm_storel_pi((__m64 *)(tC+2), _mm256_extractf128_ps(ymm12, 1)); 
tC += tc_inc_col; _mm_storeu_ps((float *)(tC ),_mm256_extractf128_ps (ymm3,1)); _mm_storeh_pi((__m64 *)(tC+2), _mm256_extractf128_ps(ymm12,1)); } } else { if(beta->real == 0.0 && beta->imag == 0.0) { _mm256_storeu_ps((float*)(tC), ymm4); _mm256_storeu_ps((float*)(tC + tc_inc_row) , ymm8); _mm256_storeu_ps((float*)(tC + tc_inc_row *2), ymm12); } else{ /* (br + bi) C + (ar + ai) AB */ ymm0 = _mm256_broadcast_ss((float const *)(beta)); // load beta_r and duplicate ymm1 = _mm256_broadcast_ss((float const *)(&beta->imag)); // load beta_i and duplicate ymm2 = _mm256_loadu_ps((float const *)(tC)); ymm3 = _mm256_permute_ps(ymm2, 0xb1); ymm2 = _mm256_mul_ps(ymm0, ymm2); ymm3 =_mm256_mul_ps(ymm1, ymm3); ymm4 = _mm256_add_ps(ymm4, _mm256_addsub_ps(ymm2, ymm3)); ymm2 = _mm256_loadu_ps((float const *)(tC+tc_inc_row)); ymm3 = _mm256_permute_ps(ymm2, 0xb1); ymm2 = _mm256_mul_ps(ymm0, ymm2); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm8 = _mm256_add_ps(ymm8, _mm256_addsub_ps(ymm2, ymm3)); ymm2 = _mm256_loadu_ps((float const *)(tC+tc_inc_row*2)); ymm3 = _mm256_permute_ps(ymm2, 0xb1); ymm2 = _mm256_mul_ps(ymm0, ymm2); ymm3 = _mm256_mul_ps(ymm1, ymm3); ymm12 = _mm256_add_ps(ymm12, _mm256_addsub_ps(ymm2, ymm3)); _mm256_storeu_ps((float*)(tC), ymm4); _mm256_storeu_ps((float*)(tC + tc_inc_row) , ymm8); _mm256_storeu_ps((float*)(tC + tc_inc_row *2), ymm12);; } } } void bli_cgemmsup_rv_zen_asm_3x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, scomplex* restrict alpha, scomplex* restrict a, inc_t rs_a0, inc_t cs_a0, scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = 0; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; scomplex *tA = a; float *tAimag = &a->imag; scomplex *tB = b; scomplex *tC = c; // clear scratch registers. __m128 xmm0, xmm1, xmm2, xmm3; __m128 xmm4 = _mm_setzero_ps(); __m128 xmm6 = _mm_setzero_ps(); __m128 xmm8 = _mm_setzero_ps(); __m128 xmm10 = _mm_setzero_ps(); __m128 xmm12 = _mm_setzero_ps(); __m128 xmm14 = _mm_setzero_ps(); dim_t ta_inc_row = rs_a; dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; dim_t ta_inc_col = cs_a; dim_t tc_inc_col = cs_c; for (k_iter = 0; k_iter imag)); // load alpha_i and duplicate xmm3 = _mm_permute_ps(xmm4, 0xb1); xmm4 = _mm_mul_ps(xmm0, xmm4); xmm3 =_mm_mul_ps(xmm1, xmm3); xmm4 = _mm_addsub_ps(xmm4, xmm3); xmm3 = _mm_permute_ps(xmm8, 0xb1); xmm8 = _mm_mul_ps(xmm0, xmm8); xmm3 = _mm_mul_ps(xmm1, xmm3); xmm8 = _mm_addsub_ps(xmm8, xmm3); xmm3 = _mm_permute_ps(xmm12, 0xb1); xmm12 = _mm_mul_ps(xmm0, xmm12); xmm3 = _mm_mul_ps(xmm1, xmm3); xmm12 = _mm_addsub_ps(xmm12, xmm3); if(tc_inc_row == 1) //col stored { if(beta->real == 0.0 && beta->imag == 0.0) { //transpose 3x2 xmm0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd (xmm4), _mm_castps_pd (xmm8))); _mm_storeu_ps((float *)(tC ), xmm0); _mm_storel_pi((__m64 *)(tC+2), xmm12); xmm1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd (xmm4) , _mm_castps_pd(xmm8))); tC += tc_inc_col; _mm_storeu_ps((float *)(tC ), xmm1); _mm_storeh_pi((__m64 *)(tC+2), xmm12); } else{ xmm1 = _mm_broadcast_ss((float const *)(beta)); // load alpha_r and duplicate xmm2 = _mm_broadcast_ss((float const *)(&beta->imag)); // load alpha_i and duplicate //Multiply xmm4 with beta xmm0 = _mm_loadl_pi(xmm0, (__m64 const *) (tC)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *) (tC + tc_inc_col)); xmm3 = _mm_permute_ps(xmm0, 0xb1); xmm0 = _mm_mul_ps(xmm1, xmm0); xmm3 = _mm_mul_ps(xmm2, xmm3); xmm0 = _mm_addsub_ps(xmm0, xmm3); xmm4 = _mm_add_ps(xmm4, xmm0); //Multiply xmm8 with beta xmm0 = 
_mm_loadl_pi(xmm0, (__m64 const *)(tC + 1)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *)(tC + 1 + tc_inc_col)) ; xmm3 = _mm_permute_ps(xmm0, 0xb1); xmm0 = _mm_mul_ps(xmm1, xmm0); xmm3 = _mm_mul_ps(xmm2, xmm3); xmm0 = _mm_addsub_ps(xmm0, xmm3); xmm8 = _mm_add_ps(xmm8, xmm0); //Multiply xmm12 with beta xmm0 = _mm_loadl_pi(xmm0, (__m64 const *)(tC + 2)) ; xmm0 = _mm_loadh_pi(xmm0, (__m64 const *)(tC + 2 + tc_inc_col)) ; xmm3 = _mm_permute_ps(xmm0, 0xb1); xmm0 = _mm_mul_ps(xmm1, xmm0); xmm3 = _mm_mul_ps(xmm2, xmm3); xmm0 = _mm_addsub_ps(xmm0, xmm3); xmm12 = _mm_add_ps(xmm12, xmm0); //transpose 3x2 xmm0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd (xmm4), _mm_castps_pd (xmm8))); _mm_storeu_ps((float *)(tC ), xmm0); _mm_storel_pi((__m64 *)(tC+2), xmm12); xmm3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd (xmm4) , _mm_castps_pd(xmm8))); tC += tc_inc_col; _mm_storeu_ps((float *)(tC ), xmm3); _mm_storeh_pi((__m64 *)(tC+2), xmm12); } } else { if(beta->real == 0.0 && beta->imag == 0.0) { _mm_storeu_ps((float *)(tC), xmm4); _mm_storeu_ps((float *)(tC + tc_inc_row) , xmm8); _mm_storeu_ps((float *)(tC + tc_inc_row *2), xmm12); } else{ /* (br + bi) C + (ar + ai) AB */ xmm0 = _mm_broadcast_ss((float const *)(beta)); // load beta_r and duplicate xmm1 = _mm_broadcast_ss((float const *)(&beta->imag)); // load beta_i and duplicate xmm2 = _mm_loadu_ps((float const *)(tC)); xmm3 = _mm_permute_ps(xmm2, 0xb1); xmm2 = _mm_mul_ps(xmm0, xmm2); xmm3 = _mm_mul_ps(xmm1, xmm3); xmm4 = _mm_add_ps(xmm4, _mm_addsub_ps(xmm2, xmm3)); xmm2 = _mm_loadu_ps((float const *)(tC+tc_inc_row)); xmm3 = _mm_permute_ps(xmm2, 0xb1); xmm2 = _mm_mul_ps(xmm0, xmm2); xmm3 = _mm_mul_ps(xmm1, xmm3); xmm8 = _mm_add_ps(xmm8, _mm_addsub_ps(xmm2, xmm3)); xmm2 = _mm_loadu_ps((float const *)(tC+tc_inc_row*2)); xmm3 = _mm_permute_ps(xmm2, 0xb1); xmm2 = _mm_mul_ps(xmm0, xmm2); xmm3 = _mm_mul_ps(xmm1, xmm3); xmm12 = _mm_add_ps(xmm12, _mm_addsub_ps(xmm2, xmm3)); _mm_storeu_ps((float *)(tC), xmm4); _mm_storeu_ps((float *)(tC + 
tc_inc_row) , xmm8); _mm_storeu_ps((float *)(tC + tc_inc_row *2), xmm12);; } } } cython-blis-0.9.1/blis/_src/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c000066400000000000000000001404411427272030600301140ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" // assumes beta.r, beta.i have been broadcast into ymm1, ymm2. // outputs to ymm0 #define ZGEMM_INPUT_SCALE_CS_BETA_NZ \ vmovupd(mem(rcx), xmm0) \ vmovupd(mem(rcx, rsi, 1), xmm3) \ vinsertf128(imm(1), xmm3, ymm0, ymm0) \ vpermilpd(imm(0x5), ymm0, ymm3) \ vmulpd(ymm1, ymm0, ymm0) \ vmulpd(ymm2, ymm3, ymm3) \ vaddsubpd(ymm3, ymm0, ymm0) #define ZGEMM_INPUT_SCALE_RS_BETA_NZ \ vmovupd(mem(rcx), ymm0) \ vpermilpd(imm(0x5), ymm0, ymm3) \ vmulpd(ymm1, ymm0, ymm0) \ vmulpd(ymm2, ymm3, ymm3) \ vaddsubpd(ymm3, ymm0, ymm0) #define ZGEMM_OUTPUT_RS \ vmovupd(ymm0, mem(rcx)) \ #define ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT \ vmovupd(mem(rcx, rsi, 8), ymm0) \ vpermilpd(imm(0x5), ymm0, ymm3) \ vmulpd(ymm1, ymm0, ymm0) \ vmulpd(ymm2, ymm3, ymm3) \ vaddsubpd(ymm3, ymm0, ymm0) #define ZGEMM_OUTPUT_RS_NEXT \ vmovupd(ymm0, mem(rcx, rsi, 8)) \ void bli_zgemmsup_rv_zen_asm_2x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, dcomplex* restrict alpha, dcomplex* restrict a, inc_t rs_a0, inc_t cs_a0, dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_iter = m0 / 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() mov(var(a), r14) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(dt) lea(mem(, r8, 2), r8) // rs_a *= sizeof(dt) lea(mem(, r9, 8), r9) // cs_a *= sizeof(dt) lea(mem(, r9, 2), r9) // cs_a *= sizeof(dt) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(dt) lea(mem(, r10, 2), r10) // rs_b *= sizeof(dt) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dt) lea(mem(, rdi, 2), rdi) // rs_c *= sizeof(dt) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.SLOOP3X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] vzeroall() // zero all xmm/ymm registers. mov(var(b), rbx) // load address of b. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(16), rdi) // set ZF if (16*rs_c) == 16. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored pre-fetching on c // not used lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; jmp(.SPOSTPFETCH) // jump to end of pre-fetching c label(.SCOLPFETCH) // column-stored pre-fetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(dt) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; label(.SPOSTPFETCH) // done prefetching c lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; lea(mem(rax, r8, 4), rdx) // use rdx for pre-fetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. 
mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vbroadcastsd(mem(rax, r8, 1), ymm2) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vbroadcastsd(mem(rax, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 1, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vbroadcastsd(mem(rax, r8, 1), ymm2) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vbroadcastsd(mem(rax , 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 1, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vbroadcastsd(mem(rax, r8, 1), ymm2) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vbroadcastsd(mem(rax, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 1, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 3 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) 
// b += rs_b; vbroadcastsd(mem(rax ), ymm2) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vbroadcastsd(mem(rax, r8, 1), ymm2) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vbroadcastsd(mem(rax, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 1, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vbroadcastsd(mem(rax, r8, 1), ymm2) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vbroadcastsd(mem(rax, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 1, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. 
// permute even and odd elements // of ymm6/7, ymm10/11, ymm/14/15 vpermilpd(imm(0x5), ymm6, ymm6) vpermilpd(imm(0x5), ymm7, ymm7) vpermilpd(imm(0x5), ymm10, ymm10) vpermilpd(imm(0x5), ymm11, ymm11) // subtract/add even/odd elements vaddsubpd(ymm6, ymm4, ymm4) vaddsubpd(ymm7, ymm5, ymm5) vaddsubpd(ymm10, ymm8, ymm8) vaddsubpd(ymm11, ymm9, ymm9) /* (ar + ai) x AB */ mov(var(alpha), rax) // load address of alpha vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate vpermilpd(imm(0x5), ymm4, ymm3) vmulpd(ymm0, ymm4, ymm4) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm4, ymm4) vpermilpd(imm(0x5), ymm5, ymm3) vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm5, ymm5) vpermilpd(imm(0x5), ymm8, ymm3) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm8, ymm8) vpermilpd(imm(0x5), ymm9, ymm3) vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm9, ymm9) /* (r + i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm1) // set ZF if beta_r == 0. sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 ); vucomisd(xmm0, xmm2) // set ZF if beta_i == 0. sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 ); and(r13b, r15b) // set ZF if r13b & r15b == 1. jne(.SBETAZERO) // if ZF = 1, jump to beta == 0 case lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(16), rdi) // set ZF if (16*rs_c) ==16. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) ZGEMM_INPUT_SCALE_RS_BETA_NZ vaddpd(ymm4, ymm0, ymm0) ZGEMM_OUTPUT_RS ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT vaddpd(ymm5, ymm0, ymm0) ZGEMM_OUTPUT_RS_NEXT add(rdi, rcx) // rcx = c + 1*rs_c ZGEMM_INPUT_SCALE_RS_BETA_NZ vaddpd(ymm8, ymm0, ymm0) ZGEMM_OUTPUT_RS ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT vaddpd(ymm9, ymm0, ymm0) ZGEMM_OUTPUT_RS_NEXT jmp(.SDONE) // jump to end. label(.SCOLSTORED) /*|--------| |-------| | | | | | 2x4 | | 4x2 | |--------| |-------| */ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt) lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm4, ymm0, ymm4) add(rdi, rcx) ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm8, ymm0, ymm8) add(rdi, rcx) lea(mem(r12, rsi, 2), rcx) ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm5, ymm0, ymm5) add(rdi, rcx) ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm9, ymm0, ymm9) add(rdi, rcx) mov(r12, rcx) // reset rcx to current utile of c. /****3x4 tile going to save into 4x2 tile in C*****/ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof(dt) /******************Transpose top tile 4x3***************************/ vmovups(xmm4, mem(rcx)) vmovups(xmm8, mem(rcx, 16)) add(rsi, rcx) vextractf128(imm(0x1), ymm4, xmm4) vextractf128(imm(0x1), ymm8, xmm8) vmovups(xmm4, mem(rcx)) vmovups(xmm8, mem(rcx, 16)) add(rsi, rcx) vmovups(xmm5, mem(rcx)) vmovups(xmm9, mem(rcx, 16)) add(rsi, rcx) vextractf128(imm(0x1), ymm5, xmm5) vextractf128(imm(0x1), ymm9, xmm9) vmovups(xmm5, mem(rcx)) vmovups(xmm9, mem(rcx, 16)) jmp(.SDONE) // jump to end. label(.SBETAZERO) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(16), rdi) // set ZF if (16*rs_c) == 16. 
jz(.SCOLSTORBZ) // jump to column storage case

	label(.SROWSTORBZ)

	vmovupd(ymm4, mem(rcx))
	vmovupd(ymm5, mem(rcx, rsi, 8))
	add(rdi, rcx)

	vmovupd(ymm8, mem(rcx))
	vmovupd(ymm9, mem(rcx, rsi, 8))

	jmp(.SDONE) // jump to end.

	label(.SCOLSTORBZ)

	/****2x4 tile going to save into 4x2 tile in C*****/
	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt)
	lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof(dt)

	/******************Transpose tile 2x4***************************/
	vmovups(xmm4, mem(rcx))
	vmovups(xmm8, mem(rcx, 16))

	add(rsi, rcx)

	vextractf128(imm(0x1), ymm4, xmm4)
	vextractf128(imm(0x1), ymm8, xmm8)
	vmovups(xmm4, mem(rcx))
	vmovups(xmm8, mem(rcx, 16))

	add(rsi, rcx)

	vmovups(xmm5, mem(rcx))
	vmovups(xmm9, mem(rcx, 16))

	add(rsi, rcx)

	vextractf128(imm(0x1), ymm5, xmm5)
	vextractf128(imm(0x1), ymm9, xmm9)
	vmovups(xmm5, mem(rcx))
	vmovups(xmm9, mem(rcx, 16))

	label(.SDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [m_iter] "m" (m_iter),
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

/*
   bli_zgemmsup_rv_zen_asm_1x4

   Fixed-size dcomplex edge kernel: per the in-body comments it forms
   (beta * C) + (alpha * A*B) for a tile of one row and four columns.

   What the code below actually reads and writes:
   - A: one dcomplex per k step (real part at a, imaginary part at a+8),
     advancing by cs_a each step. rs_a is scaled but only feeds address
     arithmetic whose result is never dereferenced.
   - B: two contiguous 32-byte vectors (four dcomplex) per k step, advancing
     by rs_b each step. cs_b is passed as an asm operand but never
     referenced, so the four elements of a B row are presumably required to
     be contiguous (cs_b == 1) -- confirm against the dispatching caller.
   - C: four dcomplex. The column-stored path is chosen when rs_c == 1
     (the scaled stride equals 16 bytes); otherwise the row-stored path is
     used and addresses the second pair at byte offset 32*cs_c.
   - k0 is split into k_iter = k0/4 unrolled passes plus k_left = k0%4
     single steps.

   Not used by the computation: conja, conjb, n0, data and cntx are never
   read (no conjugation is applied). m0 only feeds m_iter, which is loaded
   into r11 and never read again; the .SLOOP3X8I label has no branch back to
   it, so the body runs exactly once.

   When beta == 0 (both parts compare equal to zero) C is not loaded.

   The ZGEMM_* macros used in the store phase are defined outside the part
   of the file visible here; NOTE(review): they presumably expect beta_r in
   ymm1 and beta_i in ymm2 and leave beta*C in ymm0 -- confirm against their
   definitions.
*/
void bli_zgemmsup_rv_zen_asm_1x4
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       dcomplex*  restrict alpha,
       dcomplex*  restrict a, inc_t rs_a0, inc_t cs_a0,
       dcomplex*  restrict b, inc_t rs_b0, inc_t cs_b0,
       dcomplex*  restrict beta,
       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4; // number of 4x-unrolled k passes
	uint64_t k_left = k0 % 4; // remaining single k steps
	uint64_t m_iter = m0 / 3; // loaded into r11 below but never consulted

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0; // listed as an asm operand, never referenced
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// -------------------------------------------------------------------------
	begin_asm()

	mov(var(a), r14) // load address of a.
	mov(var(rs_a), r8) // load rs_a
	mov(var(cs_a), r9) // load cs_a
	lea(mem(, r8, 8), r8) // rs_a *= 8 (sizeof(real dt))
	lea(mem(, r8, 2), r8) // rs_a *= 2 (real+imag): byte stride = 16*rs_a
	lea(mem(, r9, 8), r9) // cs_a *= 8 (sizeof(real dt))
	lea(mem(, r9, 2), r9) // cs_a *= 2 (real+imag): byte stride = 16*cs_a

	//lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a

	mov(var(rs_b), r10) // load rs_b
	lea(mem(, r10, 8), r10) // rs_b *= 8 (sizeof(real dt))
	lea(mem(, r10, 2), r10) // rs_b *= 2 (real+imag): byte stride = 16*rs_b

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), r12) // load address of c
	mov(var(rs_c), rdi) // load rs_c
	lea(mem(, rdi, 8), rdi) // rs_c *= 8 (sizeof(real dt))
	lea(mem(, rdi, 2), rdi) // rs_c *= 2 (real+imag): byte stride = 16*rs_c

	// During preamble and loops:
	// r12 = rcx = c
	// r14 = rax = a
	// read rbx from var(b) near beginning of loop
	// r11 = m dim index ii (written here, never read in this kernel)

	mov(var(m_iter), r11) // ii = m_iter;

	label(.SLOOP3X8I) // label only; nothing in this kernel jumps back here

	vzeroall() // zero all xmm/ymm registers.

	mov(var(b), rbx) // load address of b.
	//mov(r12, rcx) // reset rcx to current utile of c.
	mov(r14, rax) // reset rax to current upanel of a.

	cmp(imm(16), rdi) // set ZF if (16*rs_c) == 16.
	jz(.SCOLPFETCH) // jump to column storage case
	label(.SROWPFETCH) // row-stored pre-fetching on c
	                   // not used: only addresses are formed, no prefetch
	                   // instruction is issued, and rdx is overwritten below.

	lea(mem(r12, rdi, 2), rdx) // rdx = c + 2*rs_c;
	lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;

	jmp(.SPOSTPFETCH) // jump to end of pre-fetching c
	label(.SCOLPFETCH) // column-stored pre-fetching c (likewise address-only)

	mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi) // rsi = 8*cs_c (scaled by 8 only on this path)
	lea(mem(r12, rsi, 2), rdx) // rdx = c + 2*rsi;
	lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*rsi;

	label(.SPOSTPFETCH) // done prefetching c

	lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; (overwritten at .SPOSTACCUM before any read)
	lea(mem(rax, r8, 4), rdx) // use rdx for pre-fetching lines
	lea(mem(rdx, r8, 2), rdx) // from next upanel of a. (rdx is never dereferenced)

	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.SCONSIDKLEFT) // if i == 0, jump to code that
	                  // contains the k_left loop.

	// Accumulator roles in the loops below:
	//   ymm4, ymm5 += B row * real part of the A element
	//   ymm6, ymm7 += B row * imaginary part of the A element

	label(.SLOOPKITER) // MAIN LOOP

	// ---------------------------------- iteration 0

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)

	vbroadcastsd(mem(rax, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	add(r9, rax) // a += cs_a;

	// ---------------------------------- iteration 1

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)

	vbroadcastsd(mem(rax , 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	add(r9, rax) // a += cs_a;

	// ---------------------------------- iteration 2

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)

	vbroadcastsd(mem(rax, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	add(r9, rax) // a += cs_a;

	// ---------------------------------- iteration 3

	lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; (address only, unused)

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)

	vbroadcastsd(mem(rax, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	add(r9, rax) // a += cs_a;

	dec(rsi) // i -= 1;
	jne(.SLOOPKITER) // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi) // i = k_left;
	test(rsi, rsi) // check i via logical AND.
	je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
	                // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT) // EDGE LOOP

	vmovupd(mem(rbx, 0*32), ymm0)
	vmovupd(mem(rbx, 1*32), ymm1)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm4)
	vfmadd231pd(ymm1, ymm2, ymm5)

	vbroadcastsd(mem(rax, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)
	vfmadd231pd(ymm1, ymm3, ymm7)

	add(r9, rax) // a += cs_a;

	dec(rsi) // i -= 1;
	jne(.SLOOPKLEFT) // iterate again if i != 0.

	label(.SPOSTACCUM)

	mov(r12, rcx) // reset rcx to current utile of c.

	// permute even and odd elements
	// of ymm6/7 (the only imaginary-part accumulators in this kernel)
	vpermilpd(imm(0x5), ymm6, ymm6)
	vpermilpd(imm(0x5), ymm7, ymm7)

	// subtract/add even/odd elements
	vaddsubpd(ymm6, ymm4, ymm4)
	vaddsubpd(ymm7, ymm5, ymm5)

	/* (ar + ai) x AB : scale the accumulated product by complex alpha */
	mov(var(alpha), rax) // load address of alpha
	vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate
	vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate

	vpermilpd(imm(0x5), ymm4, ymm3)
	vmulpd(ymm0, ymm4, ymm4)
	vmulpd(ymm1, ymm3, ymm3)
	vaddsubpd(ymm3, ymm4, ymm4)

	vpermilpd(imm(0x5), ymm5, ymm3)
	vmulpd(ymm0, ymm5, ymm5)
	vmulpd(ymm1, ymm3, ymm3)
	vaddsubpd(ymm3, ymm5, ymm5)

	/* (r + i)x C + ((ar + ai) x AB) */
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
	vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 4), rsi) // rsi = 4*cs_c (not bytes; used with scale 8 below
	                        // to reach byte offset 32*cs_c)
	lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; (not read by any
	                           // instruction visible in this kernel)

	lea(mem(rsi, rsi, 2), rax) // rax = 3*rsi = 12*cs_c; (likewise not read here)

	// now avoid loading C if beta == 0
	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm1) // set ZF if beta_r == 0.
	sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 );
	vucomisd(xmm0, xmm2) // set ZF if beta_i == 0.
	sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 );
	and(r13b, r15b) // r15b &= r13b; ZF is cleared only if both flags were 1.
	jne(.SBETAZERO) // taken when ZF == 0, i.e. beta_r == 0 and beta_i == 0.

	lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a (not read afterwards on this path)
	cmp(imm(16), rdi) // set ZF if (16*rs_c) == 16.
	jz(.SCOLSTORED) // jump to column storage case

	label(.SROWSTORED)

	// First pair of columns: ymm0 comes from the macro, then += ymm4.
	ZGEMM_INPUT_SCALE_RS_BETA_NZ
	vaddpd(ymm4, ymm0, ymm0)
	ZGEMM_OUTPUT_RS

	// Second pair of columns: same with ymm5 via the _NEXT macro variants.
	ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
	vaddpd(ymm5, ymm0, ymm0)
	ZGEMM_OUTPUT_RS_NEXT

	jmp(.SDONE) // jump to end.

	label(.SCOLSTORED)
	/*|--------|          |-------|
	  |        |          |       |
	  |   1x4  |          |  4x1  |
	  |--------|          |-------|
	*/

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt)
	lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt)
	lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c in bytes (not read afterwards here)

	ZGEMM_INPUT_SCALE_CS_BETA_NZ
	vaddpd(ymm4, ymm0, ymm4)

	lea(mem(r12, rsi, 2), rcx) // rcx = c + 2*cs_c: second pair of columns
	ZGEMM_INPUT_SCALE_CS_BETA_NZ
	vaddpd(ymm5, ymm0, ymm5)

	mov(r12, rcx) // reset rcx to current utile of c.

	/****1x4 tile going to save into 4x1 tile in C*****/
	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt)
	lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt)

	// Each 128-bit half holds one dcomplex; write them one column apart.
	vmovups(xmm4, mem(rcx))

	add(rsi, rcx)
	vextractf128(imm(0x1), ymm4, xmm4)
	vmovups(xmm4, mem(rcx))

	add(rsi, rcx)
	vmovups(xmm5, mem(rcx))

	add(rsi, rcx)
	vextractf128(imm(0x1), ymm5, xmm5)
	vmovups(xmm5, mem(rcx))

	jmp(.SDONE) // jump to end.

	label(.SBETAZERO)

	lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a (not read afterwards on this path)
	cmp(imm(16), rdi) // set ZF if (16*rs_c) == 16.
	jz(.SCOLSTORBZ) // jump to column storage case

	label(.SROWSTORBZ)

	vmovupd(ymm4, mem(rcx))
	vmovupd(ymm5, mem(rcx, rsi, 8)) // rsi = 4*cs_c here: byte offset 32*cs_c

	jmp(.SDONE) // jump to end.

	label(.SCOLSTORBZ)

	/****1x4 tile going to save into 4x1 tile in C*****/
	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt)
	lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt)

	vmovups(xmm4, mem(rcx))

	add(rsi, rcx)
	vextractf128(imm(0x1), ymm4, xmm4)
	vmovups(xmm4, mem(rcx))

	add(rsi, rcx)
	vmovups(xmm5, mem(rcx))

	add(rsi, rcx)
	vextractf128(imm(0x1), ymm5, xmm5)
	vmovups(xmm5, mem(rcx))

	label(.SDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [m_iter] "m" (m_iter),
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

/*
   bli_zgemmsup_rv_zen_asm_2x2

   Fixed-size dcomplex edge kernel: per the in-body comments it forms
   (beta * C) + (alpha * A*B) for a tile of two rows and two columns.

   What the code below actually reads and writes:
   - A: two dcomplex per k step, at a and a + rs_a (imaginary parts at +8),
     advancing by cs_a each step.
   - B: one contiguous 32-byte vector (two dcomplex) per k step, advancing
     by rs_b each step. cs_b is passed as an asm operand but never
     referenced, so the two elements of a B row are presumably required to
     be contiguous (cs_b == 1) -- confirm against the dispatching caller.
   - C: 2x2 dcomplex. The column-stored path is chosen when rs_c == 1 (the
     scaled stride equals 16 bytes); otherwise rows are written rs_c apart.
   - k0 is split into k_iter = k0/4 unrolled passes plus k_left = k0%4
     single steps.

   Not used by the computation: conja, conjb, n0, data and cntx are never
   read (no conjugation is applied). m0 only feeds m_iter, which is loaded
   into r11 and never read again; the .SLOOP3X8I label has no branch back to
   it, so the body runs exactly once.

   When beta == 0 (both parts compare equal to zero) C is not loaded.

   The ZGEMM_* macros used in the store phase are defined outside the part
   of the file visible here; NOTE(review): they presumably expect beta_r in
   ymm1 and beta_i in ymm2 and leave beta*C in ymm0 -- confirm against their
   definitions.
*/
void bli_zgemmsup_rv_zen_asm_2x2
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       dcomplex*  restrict alpha,
       dcomplex*  restrict a, inc_t rs_a0, inc_t cs_a0,
       dcomplex*  restrict b, inc_t rs_b0, inc_t cs_b0,
       dcomplex*  restrict beta,
       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4; // number of 4x-unrolled k passes
	uint64_t k_left = k0 % 4; // remaining single k steps
	uint64_t m_iter = m0 / 3; // loaded into r11 below but never consulted

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0; // listed as an asm operand, never referenced
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// -------------------------------------------------------------------------
	begin_asm()

	mov(var(a), r14) // load address of a.
	mov(var(rs_a), r8) // load rs_a
	mov(var(cs_a), r9) // load cs_a
	lea(mem(, r8, 8), r8) // rs_a *= 8 (sizeof(real dt))
	lea(mem(, r8, 2), r8) // rs_a *= 2 (real+imag): byte stride = 16*rs_a
	lea(mem(, r9, 8), r9) // cs_a *= 8 (sizeof(real dt))
	lea(mem(, r9, 2), r9) // cs_a *= 2 (real+imag): byte stride = 16*cs_a

	mov(var(rs_b), r10) // load rs_b
	lea(mem(, r10, 8), r10) // rs_b *= 8 (sizeof(real dt))
	lea(mem(, r10, 2), r10) // rs_b *= 2 (real+imag): byte stride = 16*rs_b

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), r12) // load address of c
	mov(var(rs_c), rdi) // load rs_c
	lea(mem(, rdi, 8), rdi) // rs_c *= 8 (sizeof(real dt))
	lea(mem(, rdi, 2), rdi) // rs_c *= 2 (real+imag): byte stride = 16*rs_c

	// During preamble and loops:
	// r12 = rcx = c
	// r14 = rax = a
	// read rbx from var(b) near beginning of loop
	// r11 = m dim index ii (written here, never read in this kernel)

	mov(var(m_iter), r11) // ii = m_iter;

	label(.SLOOP3X8I) // label only; nothing in this kernel jumps back here

	vzeroall() // zero all xmm/ymm registers.

	mov(var(b), rbx) // load address of b.
	//mov(r12, rcx) // reset rcx to current utile of c.
	mov(r14, rax) // reset rax to current upanel of a.

	cmp(imm(16), rdi) // set ZF if (16*rs_c) == 16.
	jz(.SCOLPFETCH) // jump to column storage case
	label(.SROWPFETCH) // row-stored pre-fetching on c
	                   // not used: only addresses are formed, no prefetch
	                   // instruction is issued, and rdx is overwritten below.

	lea(mem(r12, rdi, 2), rdx) // rdx = c + 2*rs_c;
	lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;

	jmp(.SPOSTPFETCH) // jump to end of pre-fetching c
	label(.SCOLPFETCH) // column-stored pre-fetching c (likewise address-only)

	mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi) // rsi = 8*cs_c (scaled by 8 only on this path)
	lea(mem(r12, rsi, 2), rdx) // rdx = c + 2*rsi;
	lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*rsi;

	label(.SPOSTPFETCH) // done prefetching c

	lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; (overwritten at .SPOSTACCUM before any read)
	lea(mem(rax, r8, 4), rdx) // use rdx for pre-fetching lines
	lea(mem(rdx, r8, 2), rdx) // from next upanel of a. (rdx is never dereferenced)

	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.SCONSIDKLEFT) // if i == 0, jump to code that
	                  // contains the k_left loop.

	// Accumulator roles in the loops below:
	//   ymm4  += B row * real part of A row 0   ymm6  += B row * imag part of A row 0
	//   ymm8  += B row * real part of A row 1   ymm10 += B row * imag part of A row 1

	label(.SLOOPKITER) // MAIN LOOP

	// ---------------------------------- iteration 0

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm4)

	vbroadcastsd(mem(rax, r8, 1), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm8)

	vbroadcastsd(mem(rax, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	add(r9, rax) // a += cs_a;

	// ---------------------------------- iteration 1

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm4)

	vbroadcastsd(mem(rax, r8, 1), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm8)

	vbroadcastsd(mem(rax, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	add(r9, rax) // a += cs_a;

	// ---------------------------------- iteration 2

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm4)

	vbroadcastsd(mem(rax, r8, 1), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm8)

	vbroadcastsd(mem(rax, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	add(r9, rax) // a += cs_a;

	// ---------------------------------- iteration 3

	lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; (address only, unused)

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm4)

	vbroadcastsd(mem(rax, r8, 1), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm8)

	vbroadcastsd(mem(rax, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	add(r9, rax) // a += cs_a;

	dec(rsi) // i -= 1;
	jne(.SLOOPKITER) // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi) // i = k_left;
	test(rsi, rsi) // check i via logical AND.
	je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
	                // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT) // EDGE LOOP

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm4)

	vbroadcastsd(mem(rax, r8, 1), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm8)

	vbroadcastsd(mem(rax, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm10)

	add(r9, rax) // a += cs_a;

	dec(rsi) // i -= 1;
	jne(.SLOOPKLEFT) // iterate again if i != 0.

	label(.SPOSTACCUM)

	mov(r12, rcx) // reset rcx to current utile of c.

	// permute even and odd elements
	// of ymm6 and ymm10 (the only imaginary-part accumulators in this kernel)
	vpermilpd(imm(0x5), ymm6, ymm6)
	vpermilpd(imm(0x5), ymm10, ymm10)

	// subtract/add even/odd elements
	vaddsubpd(ymm6, ymm4, ymm4)
	vaddsubpd(ymm10, ymm8, ymm8)

	/* (ar + ai) x AB : scale the accumulated product by complex alpha */
	mov(var(alpha), rax) // load address of alpha
	vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate
	vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate

	vpermilpd(imm(0x5), ymm4, ymm3)
	vmulpd(ymm0, ymm4, ymm4)
	vmulpd(ymm1, ymm3, ymm3)
	vaddsubpd(ymm3, ymm4, ymm4)

	vpermilpd(imm(0x5), ymm8, ymm3)
	vmulpd(ymm0, ymm8, ymm8)
	vmulpd(ymm1, ymm3, ymm3)
	vaddsubpd(ymm3, ymm8, ymm8)

	/* (r + i)x C + ((ar + ai) x AB) */
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
	vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 4), rsi) // rsi = 4*cs_c (not a byte stride; no instruction
	                        // visible in this kernel reads it before rsi is reloaded)
	lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; (not read by any
	                           // instruction visible in this kernel)

	lea(mem(rsi, rsi, 2), rax) // rax = 3*rsi = 12*cs_c; (likewise not read here)

	// now avoid loading C if beta == 0
	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm1) // set ZF if beta_r == 0.
	sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 );
	vucomisd(xmm0, xmm2) // set ZF if beta_i == 0.
	sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 );
	and(r13b, r15b) // r15b &= r13b; ZF is cleared only if both flags were 1.
	jne(.SBETAZERO) // taken when ZF == 0, i.e. beta_r == 0 and beta_i == 0.

	lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a (not read afterwards on this path)
	cmp(imm(16), rdi) // set ZF if (16*rs_c) == 16.
	jz(.SCOLSTORED) // jump to column storage case

	label(.SROWSTORED)

	// Row 0: ymm0 comes from the macro, then += ymm4.
	ZGEMM_INPUT_SCALE_RS_BETA_NZ
	vaddpd(ymm4, ymm0, ymm0)
	ZGEMM_OUTPUT_RS

	add(rdi, rcx) // rcx = c + 1*rs_c

	// Row 1: same with ymm8.
	ZGEMM_INPUT_SCALE_RS_BETA_NZ
	vaddpd(ymm8, ymm0, ymm0)
	ZGEMM_OUTPUT_RS

	jmp(.SDONE) // jump to end.

	label(.SCOLSTORED)
	/*|--------|          |-------|
	  |        |          |       |
	  |   2x2  |          |  2x2  |
	  |--------|          |-------|
	*/

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt)
	lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt)
	lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c in bytes (not read afterwards here)

	ZGEMM_INPUT_SCALE_CS_BETA_NZ
	vaddpd(ymm4, ymm0, ymm4)

	add(rdi, rcx) // rcx = c + 1*rs_c: second row of the tile
	ZGEMM_INPUT_SCALE_CS_BETA_NZ
	vaddpd(ymm8, ymm0, ymm8)

	mov(r12, rcx) // reset rcx to current utile of c.

	/****2x2 tile going to save into 2x2 tile in C*****/
	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt)
	lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt)

	// Low 128-bit halves form column 0 (row 0 at +0, row 1 at +16).
	vmovups(xmm4, mem(rcx))
	vmovups(xmm8, mem(rcx, 16))

	add(rsi, rcx)

	// High 128-bit halves form column 1.
	vextractf128(imm(0x1), ymm4, xmm4)
	vextractf128(imm(0x1), ymm8, xmm8)
	vmovups(xmm4, mem(rcx))
	vmovups(xmm8, mem(rcx, 16))

	jmp(.SDONE) // jump to end.

	label(.SBETAZERO)

	cmp(imm(16), rdi) // set ZF if (16*rs_c) == 16.
	jz(.SCOLSTORBZ) // jump to column storage case

	label(.SROWSTORBZ)

	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx) // rcx = c + 1*rs_c

	vmovupd(ymm8, mem(rcx))

	jmp(.SDONE) // jump to end.

	label(.SCOLSTORBZ)

	/****2x2 tile going to save into 2x2 tile in C*****/
	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt)
	lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt)

	vmovups(xmm4, mem(rcx))
	vmovups(xmm8, mem(rcx, 16))

	add(rsi, rcx)

	vextractf128(imm(0x1), ymm4, xmm4)
	vextractf128(imm(0x1), ymm8, xmm8)
	vmovups(xmm4, mem(rcx))
	vmovups(xmm8, mem(rcx, 16))

	label(.SDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [m_iter] "m" (m_iter),
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

/*
   bli_zgemmsup_rv_zen_asm_1x2

   Fixed-size dcomplex edge kernel: per the in-body comments it forms
   (beta * C) + (alpha * A*B) for a tile of one row and two columns.

   What the code below actually reads and writes:
   - A: one dcomplex per k step (real part at a, imaginary part at a+8),
     advancing by cs_a each step. rs_a is scaled but only feeds address
     arithmetic whose result is never dereferenced.
   - B: one contiguous 32-byte vector (two dcomplex) per k step, advancing
     by rs_b each step. cs_b is passed as an asm operand but never
     referenced, so the two elements of a B row are presumably required to
     be contiguous (cs_b == 1) -- confirm against the dispatching caller.
   - C: two dcomplex. The column-stored path is chosen when rs_c == 1 (the
     scaled stride equals 16 bytes) and writes them cs_c apart; otherwise a
     single 32-byte row is written.
   - k0 is split into k_iter = k0/4 unrolled passes plus k_left = k0%4
     single steps.

   Not used by the computation: conja, conjb, n0, data and cntx are never
   read (no conjugation is applied). m0 only feeds m_iter, which is loaded
   into r11 and never read again; the .SLOOP3X8I label has no branch back to
   it, so the body runs exactly once.

   When beta == 0 (both parts compare equal to zero) C is not loaded.

   The ZGEMM_* macros used in the store phase are defined outside the part
   of the file visible here; NOTE(review): they presumably expect beta_r in
   ymm1 and beta_i in ymm2 and leave beta*C in ymm0 -- confirm against their
   definitions.
*/
void bli_zgemmsup_rv_zen_asm_1x2
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       dcomplex*  restrict alpha,
       dcomplex*  restrict a, inc_t rs_a0, inc_t cs_a0,
       dcomplex*  restrict b, inc_t rs_b0, inc_t cs_b0,
       dcomplex*  restrict beta,
       dcomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4; // number of 4x-unrolled k passes
	uint64_t k_left = k0 % 4; // remaining single k steps
	uint64_t m_iter = m0 / 3; // loaded into r11 below but never consulted

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0; // listed as an asm operand, never referenced
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// -------------------------------------------------------------------------
	begin_asm()

	mov(var(a), r14) // load address of a.
	mov(var(rs_a), r8) // load rs_a
	mov(var(cs_a), r9) // load cs_a
	lea(mem(, r8, 8), r8) // rs_a *= 8 (sizeof(real dt))
	lea(mem(, r8, 2), r8) // rs_a *= 2 (real+imag): byte stride = 16*rs_a
	lea(mem(, r9, 8), r9) // cs_a *= 8 (sizeof(real dt))
	lea(mem(, r9, 2), r9) // cs_a *= 2 (real+imag): byte stride = 16*cs_a

	// lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a

	mov(var(rs_b), r10) // load rs_b
	lea(mem(, r10, 8), r10) // rs_b *= 8 (sizeof(real dt))
	lea(mem(, r10, 2), r10) // rs_b *= 2 (real+imag): byte stride = 16*rs_b

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), r12) // load address of c
	mov(var(rs_c), rdi) // load rs_c
	lea(mem(, rdi, 8), rdi) // rs_c *= 8 (sizeof(real dt))
	lea(mem(, rdi, 2), rdi) // rs_c *= 2 (real+imag): byte stride = 16*rs_c

	// During preamble and loops:
	// r12 = rcx = c
	// r14 = rax = a
	// read rbx from var(b) near beginning of loop
	// r11 = m dim index ii (written here, never read in this kernel)

	mov(var(m_iter), r11) // ii = m_iter;

	label(.SLOOP3X8I) // label only; nothing in this kernel jumps back here

	vzeroall() // zero all xmm/ymm registers.

	mov(var(b), rbx) // load address of b.
	//mov(r12, rcx) // reset rcx to current utile of c.
	mov(r14, rax) // reset rax to current upanel of a.

	cmp(imm(16), rdi) // set ZF if (16*rs_c) == 16.
	jz(.SCOLPFETCH) // jump to column storage case
	label(.SROWPFETCH) // row-stored pre-fetching on c
	                   // not used: only addresses are formed, no prefetch
	                   // instruction is issued, and rdx is overwritten below.

	lea(mem(r12, rdi, 2), rdx) // rdx = c + 2*rs_c;
	lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;

	jmp(.SPOSTPFETCH) // jump to end of pre-fetching c
	label(.SCOLPFETCH) // column-stored pre-fetching c (likewise address-only)

	mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 8), rsi) // rsi = 8*cs_c (scaled by 8 only on this path)
	lea(mem(r12, rsi, 2), rdx) // rdx = c + 2*rsi;
	lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*rsi;

	label(.SPOSTPFETCH) // done prefetching c

	lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; (overwritten at .SPOSTACCUM before any read)
	lea(mem(rax, r8, 4), rdx) // use rdx for pre-fetching lines
	lea(mem(rdx, r8, 2), rdx) // from next upanel of a. (rdx is never dereferenced)

	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.SCONSIDKLEFT) // if i == 0, jump to code that
	                  // contains the k_left loop.

	// Accumulator roles in the loops below:
	//   ymm4 += B row * real part of the A element
	//   ymm6 += B row * imaginary part of the A element

	label(.SLOOPKITER) // MAIN LOOP

	// ---------------------------------- iteration 0

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm4)

	vbroadcastsd(mem(rax, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	add(r9, rax) // a += cs_a;

	// ---------------------------------- iteration 1

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm4)

	vbroadcastsd(mem(rax, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	add(r9, rax) // a += cs_a;

	// ---------------------------------- iteration 2

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm4)

	vbroadcastsd(mem(rax, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	add(r9, rax) // a += cs_a;

	// ---------------------------------- iteration 3

	lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; (address only, unused)

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm4)

	vbroadcastsd(mem(rax, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	add(r9, rax) // a += cs_a;

	dec(rsi) // i -= 1;
	jne(.SLOOPKITER) // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi) // i = k_left;
	test(rsi, rsi) // check i via logical AND.
	je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
	                // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT) // EDGE LOOP

	vmovupd(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastsd(mem(rax ), ymm2)
	vfmadd231pd(ymm0, ymm2, ymm4)

	vbroadcastsd(mem(rax, 8), ymm3)
	vfmadd231pd(ymm0, ymm3, ymm6)

	add(r9, rax) // a += cs_a;

	dec(rsi) // i -= 1;
	jne(.SLOOPKLEFT) // iterate again if i != 0.

	label(.SPOSTACCUM)

	mov(r12, rcx) // reset rcx to current utile of c.

	// permute even and odd elements
	// of ymm6 (the only imaginary-part accumulator in this kernel)
	vpermilpd(imm(0x5), ymm6, ymm6)

	// subtract/add even/odd elements
	vaddsubpd(ymm6, ymm4, ymm4)

	/* (ar + ai) x AB : scale the accumulated product by complex alpha */
	mov(var(alpha), rax) // load address of alpha
	vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate
	vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate

	vpermilpd(imm(0x5), ymm4, ymm3)
	vmulpd(ymm0, ymm4, ymm4)
	vmulpd(ymm1, ymm3, ymm3)
	vaddsubpd(ymm3, ymm4, ymm4)

	/* (r + i)x C + ((ar + ai) x AB) */
	mov(var(beta), rbx) // load address of beta
	vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
	vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 4), rsi) // rsi = 4*cs_c (not a byte stride; no instruction
	                        // visible in this kernel reads it before rsi is reloaded)
	lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; (not read by any
	                           // instruction visible in this kernel)

	lea(mem(rsi, rsi, 2), rax) // rax = 3*rsi = 12*cs_c; (likewise not read here)

	// now avoid loading C if beta == 0
	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
	vucomisd(xmm0, xmm1) // set ZF if beta_r == 0.
	sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 );
	vucomisd(xmm0, xmm2) // set ZF if beta_i == 0.
	sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 );
	and(r13b, r15b) // r15b &= r13b; ZF is cleared only if both flags were 1.
	jne(.SBETAZERO) // taken when ZF == 0, i.e. beta_r == 0 and beta_i == 0.

	lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a (not read afterwards on this path)
	cmp(imm(16), rdi) // set ZF if (16*rs_c) == 16.
	jz(.SCOLSTORED) // jump to column storage case

	label(.SROWSTORED)

	// Single row: ymm0 comes from the macro, then += ymm4.
	ZGEMM_INPUT_SCALE_RS_BETA_NZ
	vaddpd(ymm4, ymm0, ymm0)
	ZGEMM_OUTPUT_RS

	jmp(.SDONE) // jump to end.

	label(.SCOLSTORED)
	/*|--------|          |-------|
	  |        |          |       |
	  |   1x2  |          |  2x1  |
	  |--------|          |-------|
	*/

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt)
	lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt)
	lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c in bytes (not read afterwards here)

	ZGEMM_INPUT_SCALE_CS_BETA_NZ
	vaddpd(ymm4, ymm0, ymm4)

	/****1x2 tile going to save into 2x1 tile in C*****/
	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt)
	lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt)

	/******************Transpose tile 1x2***************************/
	vmovups(xmm4, mem(rcx))

	add(rsi, rcx)
	vextractf128(imm(0x1), ymm4, xmm4)
	vmovups(xmm4, mem(rcx))

	jmp(.SDONE) // jump to end.

	label(.SBETAZERO)

	cmp(imm(16), rdi) // set ZF if (16*rs_c) == 16.
	jz(.SCOLSTORBZ) // jump to column storage case

	label(.SROWSTORBZ)

	vmovupd(ymm4, mem(rcx))
	add(rdi, rcx) // rcx advanced by rs_c; no store follows on this path

	jmp(.SDONE) // jump to end.

	label(.SCOLSTORBZ)

	/****1x2 tile going to save into 2x1 tile in C*****/
	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt)
	lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt)

	/******************Transpose tile 1x2***************************/
	vmovups(xmm4, mem(rcx))

	add(rsi, rcx)
	vextractf128(imm(0x1), ymm4, xmm4)
	vmovups(xmm4, mem(rcx))

	label(.SDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [m_iter] "m" (m_iter),
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}
cython-blis-0.9.1/blis/_src/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c000066400000000000000000001073071427272030600302750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" // assumes beta.r, beta.i have been broadcast into ymm1, ymm2. 
// outputs to ymm0 #define ZGEMM_INPUT_SCALE_CS_BETA_NZ \ vmovupd(mem(rcx), xmm0) \ vmovupd(mem(rcx, rsi, 1), xmm3) \ vinsertf128(imm(1), xmm3, ymm0, ymm0) \ vpermilpd(imm(0x5), ymm0, ymm3) \ vmulpd(ymm1, ymm0, ymm0) \ vmulpd(ymm2, ymm3, ymm3) \ vaddsubpd(ymm3, ymm0, ymm0) #define ZGEMM_INPUT_SCALE_RS_BETA_NZ \ vmovupd(mem(rcx), ymm0) \ vpermilpd(imm(0x5), ymm0, ymm3) \ vmulpd(ymm1, ymm0, ymm0) \ vmulpd(ymm2, ymm3, ymm3) \ vaddsubpd(ymm3, ymm0, ymm0) #define ZGEMM_OUTPUT_RS \ vmovupd(ymm0, mem(rcx)) \ #define ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT \ vmovupd(mem(rcx, rsi, 8), ymm0) \ vpermilpd(imm(0x5), ymm0, ymm3) \ vmulpd(ymm1, ymm0, ymm0) \ vmulpd(ymm2, ymm3, ymm3) \ vaddsubpd(ymm3, ymm0, ymm0) #define ZGEMM_OUTPUT_RS_NEXT \ vmovupd(ymm0, mem(rcx, rsi, 8)) \ /* rrr: -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : rcr: -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). 
crr: | | | | | | | | ------ -------- | | | | | | | | += ------ -------- | | | | | | | | ------ -------- | | | | | | | | ------ : */ void bli_zgemmsup_rv_zen_asm_3x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, dcomplex* restrict alpha, dcomplex* restrict a, inc_t rs_a0, inc_t cs_a0, dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t n_left = n0 % 4; // First check whether this is a edge case in the n dimension. If so, // dispatch other 3x?m kernels, as needed. if (n_left ) { dcomplex* cij = c; dcomplex* bj = b; dcomplex* ai = a; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_zgemmsup_rv_zen_asm_3x2m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { bli_zgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); } return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() mov(var(a), r14) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(real dt) lea(mem(, r8, 2), r8) // rs_a *= sizeof((real + imag) dt) lea(mem(, r9, 8), r9) // cs_a *= sizeof( real dt) lea(mem(, r9, 2), r9) // cs_a *= sizeof((real + imag) dt) //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(real dt) lea(mem(, r10, 2), r10) // rs_b *= sizeof((real +imag) dt) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dt) lea(mem(, rdi, 2), rdi) // rs_c *= sizeof(dt) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.SLOOP3X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] vzeroall() // zero all xmm/ymm registers. mov(var(b), rbx) // load address of b. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(16), rdi) // set ZF if (16*rs_c) == 16. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored pre-fetching on c // not used lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; jmp(.SPOSTPFETCH) // jump to end of pre-fetching c label(.SCOLPFETCH) // column-stored pre-fetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(dt) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; label(.SPOSTPFETCH) // done prefetching c lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; lea(mem(rax, r8, 4), rdx) // use rdx for pre-fetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. 
mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vbroadcastsd(mem(rax, r8, 1), ymm2) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vbroadcastsd(mem(rax, r8, 2), ymm2) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vbroadcastsd(mem(rax, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 1, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 2, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vbroadcastsd(mem(rax, r8, 1), ymm2) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vbroadcastsd(mem(rax, r8, 2), ymm2) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vbroadcastsd(mem(rax, 8 ), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 1, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 2, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vbroadcastsd(mem(rax, r8, 1), ymm2) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, 
ymm9) vbroadcastsd(mem(rax, r8, 2), ymm2) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vbroadcastsd(mem(rax, 8 ), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 1, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 2, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 3 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vbroadcastsd(mem(rax, r8, 1), ymm2) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vbroadcastsd(mem(rax, r8, 2), ymm2) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vbroadcastsd(mem(rax, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 1, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 2, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vbroadcastsd(mem(rax, r8, 1), ymm2) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vbroadcastsd(mem(rax, r8, 2), ymm2) vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) vbroadcastsd(mem(rax, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) vbroadcastsd(mem(rax, r8, 1, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) vbroadcastsd(mem(rax, r8, 2, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. // permute even and odd elements // of ymm6/7, ymm10/11, ymm/14/15 vpermilpd(imm(0x5), ymm6, ymm6) vpermilpd(imm(0x5), ymm7, ymm7) vpermilpd(imm(0x5), ymm10, ymm10) vpermilpd(imm(0x5), ymm11, ymm11) vpermilpd(imm(0x5), ymm14, ymm14) vpermilpd(imm(0x5), ymm15, ymm15) // subtract/add even/odd elements vaddsubpd(ymm6, ymm4, ymm4) vaddsubpd(ymm7, ymm5, ymm5) vaddsubpd(ymm10, ymm8, ymm8) vaddsubpd(ymm11, ymm9, ymm9) vaddsubpd(ymm14, ymm12, ymm12) vaddsubpd(ymm15, ymm13, ymm13) /* (ar + ai) x AB */ mov(var(alpha), rax) // load address of alpha vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate vpermilpd(imm(0x5), ymm4, ymm3) vmulpd(ymm0, ymm4, ymm4) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm4, ymm4) vpermilpd(imm(0x5), ymm5, ymm3) vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm5, ymm5) vpermilpd(imm(0x5), ymm8, ymm3) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm8, ymm8) vpermilpd(imm(0x5), ymm9, ymm3) vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm9, ymm9) vpermilpd(imm(0x5), ymm12, ymm3) 
vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm12, ymm12) vpermilpd(imm(0x5), ymm13, ymm3) vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm13, ymm13) /* (r + i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm1) // set ZF if beta_r == 0. sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 ); vucomisd(xmm0, xmm2) // set ZF if beta_i == 0. sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 ); and(r13b, r15b) // set ZF if r13b & r15b == 1. jne(.SBETAZERO) // if ZF = 1, jump to beta == 0 case lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(16), rdi) // set ZF if (16*rs_c) ==16. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) ZGEMM_INPUT_SCALE_RS_BETA_NZ vaddpd(ymm4, ymm0, ymm0) ZGEMM_OUTPUT_RS ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT vaddpd(ymm5, ymm0, ymm0) ZGEMM_OUTPUT_RS_NEXT add(rdi, rcx) // rcx = c + 1*rs_c ZGEMM_INPUT_SCALE_RS_BETA_NZ vaddpd(ymm8, ymm0, ymm0) ZGEMM_OUTPUT_RS ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT vaddpd(ymm9, ymm0, ymm0) ZGEMM_OUTPUT_RS_NEXT add(rdi, rcx) // rcx = c + 2*rs_c ZGEMM_INPUT_SCALE_RS_BETA_NZ vaddpd(ymm12, ymm0, ymm0) ZGEMM_OUTPUT_RS ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT vaddpd(ymm13, ymm0, ymm0) ZGEMM_OUTPUT_RS_NEXT jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) /*|--------| |-------| | | | | | 3x4 | | 4x3 | |--------| |-------| */ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real +imag)dt) lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm4, ymm0, ymm4) add(rdi, rcx) ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm8, ymm0, ymm8) add(rdi, rcx) ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm12, ymm0, ymm12) lea(mem(r12, rsi, 2), rcx) ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm5, ymm0, ymm5) add(rdi, rcx) ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm9, ymm0, ymm9) add(rdi, rcx) ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm13, ymm0, ymm13) mov(r12, rcx) // reset rcx to current utile of c. /****3x4 tile going to save into 4x3 tile in C*****/ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real +imag)dt) /******************Transpose top tile 4x3***************************/ vmovups(xmm4, mem(rcx)) vmovups(xmm8, mem(rcx, 16)) vmovups(xmm12, mem(rcx,32)) add(rsi, rcx) vextractf128(imm(0x1), ymm4, xmm4) vextractf128(imm(0x1), ymm8, xmm8) vextractf128(imm(0x1), ymm12, xmm12) vmovups(xmm4, mem(rcx)) vmovups(xmm8, mem(rcx, 16)) vmovups(xmm12, mem(rcx,32)) add(rsi, rcx) vmovups(xmm5, mem(rcx)) vmovups(xmm9, mem(rcx, 16)) vmovups(xmm13,mem(rcx,32)) add(rsi, rcx) vextractf128(imm(0x1), ymm5, xmm5) vextractf128(imm(0x1), ymm9, xmm9) vextractf128(imm(0x1), ymm13, xmm13) vmovups(xmm5, mem(rcx)) vmovups(xmm9, mem(rcx, 16)) vmovups(xmm13,mem(rcx,32)) jmp(.SDONE) // jump to end. label(.SBETAZERO) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(16), rdi) // set ZF if (16*rs_c) == 16. 
jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovupd(ymm4, mem(rcx)) vmovupd(ymm5, mem(rcx, rsi, 8)) add(rdi, rcx) vmovupd(ymm8, mem(rcx)) vmovupd(ymm9, mem(rcx, rsi, 8)) add(rdi, rcx) vmovupd(ymm12, mem(rcx)) vmovupd(ymm13, mem(rcx, rsi, 8)) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) /****3x4 tile going to save into 4x3 tile in C*****/ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof(dt) /******************Transpose top tile 4x3***************************/ vmovups(xmm4, mem(rcx)) vmovups(xmm8, mem(rcx, 16)) vmovups(xmm12, mem(rcx,32)) add(rsi, rcx) vextractf128(imm(0x1), ymm4, xmm4) vextractf128(imm(0x1), ymm8, xmm8) vextractf128(imm(0x1), ymm12, xmm12) vmovups(xmm4, mem(rcx)) vmovups(xmm8, mem(rcx, 16)) vmovups(xmm12, mem(rcx,32)) add(rsi, rcx) vmovups(xmm5, mem(rcx)) vmovups(xmm9, mem(rcx, 16)) vmovups(xmm13,mem(rcx,32)) add(rsi, rcx) vextractf128(imm(0x1), ymm5, xmm5) vextractf128(imm(0x1), ymm9, xmm9) vextractf128(imm(0x1), ymm13, xmm13) vmovups(xmm5, mem(rcx)) vmovups(xmm9, mem(rcx, 16)) vmovups(xmm13,mem(rcx,32)) label(.SDONE) lea(mem(r12, rdi, 2), r12) lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) lea(mem(r14, r8, 1), r14) //a_ii = r14 += 3*rs_a dec(r11) // ii -= 1; jne(.SLOOP3X8I) // iterate again if ii != 0. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( m_left ) { const dim_t nr_cur = 4; const dim_t i_edge = m0 - ( dim_t )m_left; dcomplex* cij = c + i_edge*rs_c; dcomplex* ai = a + i_edge*rs_a; dcomplex* bj = b; zgemmsup_ker_ft ker_fps[3] = { NULL, bli_zgemmsup_rv_zen_asm_1x4, bli_zgemmsup_rv_zen_asm_2x4, }; zgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } } void bli_zgemmsup_rv_zen_asm_3x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, dcomplex* restrict alpha, dcomplex* restrict a, inc_t rs_a0, inc_t cs_a0, dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(dt) lea(mem(, r8, 2), r8) // rs_a *= sizeof(dt) lea(mem(, r9, 8), r9) // cs_a *= sizeof(dt) lea(mem(, r9, 2), r9) // cs_a *= sizeof(dt) // lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(dt) lea(mem(, r10, 2), r10) // rs_b *= sizeof(dt) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(dt) lea(mem(, rdi, 2), rdi) // rs_c *= sizeof(dt) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.SLOOP3X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] vzeroall() // zero all xmm/ymm registers. mov(var(b), rbx) // load address of b. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(16), rdi) // set ZF if (16*rs_c) == 16. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored pre-fetching on c // not used lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; jmp(.SPOSTPFETCH) // jump to end of pre-fetching c label(.SCOLPFETCH) // column-stored pre-fetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 8), rsi) // cs_c *= sizeof(dt) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; label(.SPOSTPFETCH) // done prefetching c lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; lea(mem(rax, r8, 4), rdx) // use rdx for pre-fetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vfmadd231pd(ymm0, ymm2, ymm4) vbroadcastsd(mem(rax, r8, 1), ymm2) vfmadd231pd(ymm0, ymm2, ymm8) vbroadcastsd(mem(rax, r8, 2), ymm2) vfmadd231pd(ymm0, ymm2, ymm12) vbroadcastsd(mem(rax, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 1, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 2, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm14) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 1 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vfmadd231pd(ymm0, ymm2, ymm4) vbroadcastsd(mem(rax, r8, 1), ymm2) vfmadd231pd(ymm0, ymm2, ymm8) vbroadcastsd(mem(rax, r8, 2), ymm2) vfmadd231pd(ymm0, ymm2, ymm12) vbroadcastsd(mem(rax, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 1, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 2, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm14) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 2 vmovupd(mem(rbx, 0*32), ymm0) 
add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vfmadd231pd(ymm0, ymm2, ymm4) vbroadcastsd(mem(rax, r8, 1), ymm2) vfmadd231pd(ymm0, ymm2, ymm8) vbroadcastsd(mem(rax, r8, 2), ymm2) vfmadd231pd(ymm0, ymm2, ymm12) vbroadcastsd(mem(rax, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 1, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 2, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm14) add(r9, rax) // a += cs_a; // ---------------------------------- iteration 3 lea(mem(rdx, r9, 4), rdx) // a_prefetch += 4*cs_a; vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vfmadd231pd(ymm0, ymm2, ymm4) vbroadcastsd(mem(rax, r8, 1), ymm2) vfmadd231pd(ymm0, ymm2, ymm8) vbroadcastsd(mem(rax, r8, 2), ymm2) vfmadd231pd(ymm0, ymm2, ymm12) vbroadcastsd(mem(rax, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 1, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 2, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm14) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) vfmadd231pd(ymm0, ymm2, ymm4) vbroadcastsd(mem(rax, r8, 1), ymm2) vfmadd231pd(ymm0, ymm2, ymm8) vbroadcastsd(mem(rax, r8, 2), ymm2) vfmadd231pd(ymm0, ymm2, ymm12) vbroadcastsd(mem(rax, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm6) vbroadcastsd(mem(rax, r8, 1, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm10) vbroadcastsd(mem(rax, r8, 2, 8), ymm3) vfmadd231pd(ymm0, ymm3, ymm14) add(r9, rax) // a += cs_a; dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. 
// permute even and odd elements // of ymm6/7, ymm10/11, ymm/14/15 vpermilpd(imm(0x5), ymm6, ymm6) vpermilpd(imm(0x5), ymm10, ymm10) vpermilpd(imm(0x5), ymm14, ymm14) // subtract/add even/odd elements vaddsubpd(ymm6, ymm4, ymm4) vaddsubpd(ymm10, ymm8, ymm8) vaddsubpd(ymm14, ymm12, ymm12) /* (ar + ai) x AB */ mov(var(alpha), rax) // load address of alpha vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate vpermilpd(imm(0x5), ymm4, ymm3) vmulpd(ymm0, ymm4, ymm4) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm4, ymm4) vpermilpd(imm(0x5), ymm8, ymm3) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm8, ymm8) vpermilpd(imm(0x5), ymm12, ymm3) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm12, ymm12) /* (r + i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(dt) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm1) // set ZF if beta_r == 0. sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 ); vucomisd(xmm0, xmm2) // set ZF if beta_i == 0. sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 ); and(r13b, r15b) // set ZF if r13b & r15b == 1. jne(.SBETAZERO) // if ZF = 1, jump to beta == 0 case lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a cmp(imm(16), rdi) // set ZF if (16*rs_c) == 16. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) ZGEMM_INPUT_SCALE_RS_BETA_NZ vaddpd(ymm4, ymm0, ymm0) ZGEMM_OUTPUT_RS add(rdi, rcx) // rcx = c + 1*rs_c ZGEMM_INPUT_SCALE_RS_BETA_NZ vaddpd(ymm8, ymm0, ymm0) ZGEMM_OUTPUT_RS add(rdi, rcx) // rcx = c + 2*rs_c ZGEMM_INPUT_SCALE_RS_BETA_NZ vaddpd(ymm12, ymm0, ymm0) ZGEMM_OUTPUT_RS jmp(.SDONE) // jump to end. label(.SCOLSTORED) /*|--------| |-------| | | | | | 3x2 | | 2x3 | |--------| |-------| */ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt) lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm4, ymm0, ymm4) add(rdi, rcx) ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm8, ymm0, ymm8) add(rdi, rcx) ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm12, ymm0, ymm12) mov(r12, rcx) // reset rcx to current utile of c. /****3x2 tile going to save into 2x3 tile in C*****/ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof(dt) /******************Transpose top tile 2x3***************************/ vmovups(xmm4, mem(rcx)) vmovups(xmm8, mem(rcx, 16)) vmovups(xmm12, mem(rcx,32)) add(rsi, rcx) vextractf128(imm(0x1), ymm4, xmm4) vextractf128(imm(0x1), ymm8, xmm8) vextractf128(imm(0x1), ymm12, xmm12) vmovups(xmm4, mem(rcx)) vmovups(xmm8, mem(rcx, 16)) vmovups(xmm12, mem(rcx,32)) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(16), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) vmovupd(ymm8, mem(rcx)) add(rdi, rcx) vmovupd(ymm12, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) /****3x2 tile going to save into 2x3 tile in C*****/ mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof(dt) /******************Transpose tile 3x2***************************/ vmovups(xmm4, mem(rcx)) vmovups(xmm8, mem(rcx, 16)) vmovups(xmm12, mem(rcx,32)) add(rsi, rcx) vextractf128(imm(0x1), ymm4, xmm4) vextractf128(imm(0x1), ymm8, xmm8) vextractf128(imm(0x1), ymm12, xmm12) vmovups(xmm4, mem(rcx)) vmovups(xmm8, mem(rcx, 16)) vmovups(xmm12, mem(rcx,32)) label(.SDONE) lea(mem(r12, rdi, 2), r12) lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) lea(mem(r14, r8, 1), r14) //a_ii = r14 += 3*rs_a dec(r11) // ii -= 1; jne(.SLOOP3X8I) // iterate again if ii != 0. label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 4; const dim_t i_edge = m0 - ( dim_t )m_left; dcomplex* cij = c + i_edge*rs_c; dcomplex* ai = a + i_edge*rs_a; dcomplex* bj = b; zgemmsup_ker_ft ker_fps[3] = { NULL, bli_zgemmsup_rv_zen_asm_1x2, bli_zgemmsup_rv_zen_asm_2x2, }; zgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } } cython-blis-0.9.1/blis/_src/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c000066400000000000000000001147731427272030600303030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #include "immintrin.h" /* rrr: -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : rcr: -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | += ------ ... 
-------- | | | | | | | | ------ -------- | | | | | | | | ------ : */ void bli_zgemmsup_rv_zen_asm_3x4n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, dcomplex* restrict alpha, dcomplex* restrict a, inc_t rs_a0, inc_t cs_a0, dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t m_left = m0 % 3; if ( m_left ) { zgemmsup_ker_ft ker_fps[3] = { NULL, bli_zgemmsup_rv_zen_asm_1x4n, bli_zgemmsup_rv_zen_asm_2x4n, }; zgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, n0, k0, alpha, a, rs_a0, cs_a0, b, rs_b0, cs_b0, beta, c, rs_c0, cs_c0, data, cntx ); return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = 0; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- //scratch registers __m256d ymm0, ymm1, ymm2, ymm3; __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m256d ymm12, ymm13, ymm14, ymm15; __m128d xmm0, xmm3; dcomplex *tA = a; double *tAimag = &a->imag; dcomplex *tB = b; dcomplex *tC = c; for (n_iter = 0; n_iter < n0 / 4; n_iter++) { // clear scratch registers. 
ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); dim_t ta_inc_row = rs_a; dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; dim_t ta_inc_col = cs_a; dim_t tb_inc_col = cs_b; dim_t tc_inc_col = cs_c; tA = a; tAimag = &a->imag; tB = b + n_iter*tb_inc_col*4; tC = c + n_iter*tc_inc_col*4; for (k_iter = 0; k_iter imag)); // load alpha_i and duplicate ymm3 = _mm256_permute_pd(ymm4, 5); ymm4 = _mm256_mul_pd(ymm0, ymm4); ymm3 =_mm256_mul_pd(ymm1, ymm3); ymm4 = _mm256_addsub_pd(ymm4, ymm3); ymm3 = _mm256_permute_pd(ymm5, 5); ymm5 = _mm256_mul_pd(ymm0, ymm5); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm5 = _mm256_addsub_pd(ymm5, ymm3); ymm3 = _mm256_permute_pd(ymm8, 5); ymm8 = _mm256_mul_pd(ymm0, ymm8); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm8 = _mm256_addsub_pd(ymm8, ymm3); ymm3 = _mm256_permute_pd(ymm9, 5); ymm9 = _mm256_mul_pd(ymm0, ymm9); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm9 = _mm256_addsub_pd(ymm9, ymm3); ymm3 = _mm256_permute_pd(ymm12, 5); ymm12 = _mm256_mul_pd(ymm0, ymm12); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm12 = _mm256_addsub_pd(ymm12, ymm3); ymm3 = _mm256_permute_pd(ymm13, 5); ymm13 = _mm256_mul_pd(ymm0, ymm13); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm13 = _mm256_addsub_pd(ymm13, ymm3); if(tc_inc_row == 1) //col stored { if(beta->real == 0.0 && beta->imag == 0.0) { //transpose left 3x2 _mm_storeu_pd((double *)(tC ), _mm256_castpd256_pd128(ymm4)); _mm_storeu_pd((double *)(tC+1), _mm256_castpd256_pd128(ymm8)); _mm_storeu_pd((double *)(tC+2), _mm256_castpd256_pd128(ymm12)); tC += tc_inc_col; _mm_storeu_pd((double *)(tC ),_mm256_extractf128_pd (ymm4,1)); _mm_storeu_pd((double *)(tC+1) ,_mm256_extractf128_pd (ymm8,1)); _mm_storeu_pd((double *)(tC+2), _mm256_extractf128_pd(ymm12, 1)); tC += 
tc_inc_col; //transpose right 3x2 _mm_storeu_pd((double *)(tC ), _mm256_castpd256_pd128(ymm5)); _mm_storeu_pd((double *)(tC+1), _mm256_castpd256_pd128(ymm9)); _mm_storeu_pd((double *)(tC+2), _mm256_castpd256_pd128(ymm13)); tC += tc_inc_col; _mm_storeu_pd((double *)(tC ),_mm256_extractf128_pd (ymm5,1)); _mm_storeu_pd((double *)(tC+1) ,_mm256_extractf128_pd (ymm9,1)); _mm_storeu_pd((double *)(tC+2), _mm256_extractf128_pd(ymm13, 1)); } else{ ymm1 = _mm256_broadcast_sd((double const *)(beta)); // load alpha_r and duplicate ymm2 = _mm256_broadcast_sd((double const *)(&beta->imag)); // load alpha_i and duplicate //Multiply ymm4 with beta xmm0 = _mm_loadu_pd((double *)(tC)) ; xmm3 = _mm_loadu_pd((double *)(tC + tc_inc_col)) ; ymm0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_pd(ymm0, 5); ymm0 = _mm256_mul_pd(ymm1, ymm0); ymm3 = _mm256_mul_pd(ymm2, ymm3); ymm0 = _mm256_addsub_pd(ymm0, ymm3); ymm4 = _mm256_add_pd(ymm4, ymm0); //Multiply ymm8 with beta xmm0 = _mm_loadu_pd((double *)(tC + 1)) ; xmm3 = _mm_loadu_pd((double *)(tC + 1 + tc_inc_col)) ; ymm0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_pd(ymm0, 5); ymm0 = _mm256_mul_pd(ymm1, ymm0); ymm3 = _mm256_mul_pd(ymm2, ymm3); ymm0 = _mm256_addsub_pd(ymm0, ymm3); ymm8 = _mm256_add_pd(ymm8, ymm0); //Multiply ymm12 with beta xmm0 = _mm_loadu_pd((double *)(tC + 2)) ; xmm3 = _mm_loadu_pd((double *)(tC + 2 + tc_inc_col)) ; ymm0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_pd(ymm0, 5); ymm0 = _mm256_mul_pd(ymm1, ymm0); ymm3 = _mm256_mul_pd(ymm2, ymm3); ymm0 = _mm256_addsub_pd(ymm0, ymm3); ymm12 = _mm256_add_pd(ymm12, ymm0); //transpose left 3x2 _mm_storeu_pd((double *)(tC ), _mm256_castpd256_pd128(ymm4)); _mm_storeu_pd((double *)(tC+1), _mm256_castpd256_pd128(ymm8)); _mm_storeu_pd((double *)(tC+2), _mm256_castpd256_pd128(ymm12)); tC += tc_inc_col; _mm_storeu_pd((double *)(tC ),_mm256_extractf128_pd (ymm4,1)); 
_mm_storeu_pd((double *)(tC+1) ,_mm256_extractf128_pd (ymm8,1)); _mm_storeu_pd((double *)(tC+2), _mm256_extractf128_pd(ymm12, 1)); tC += tc_inc_col; //Multiply ymm5 with beta xmm0 = _mm_loadu_pd((double *)(tC)) ; xmm3 = _mm_loadu_pd((double *)(tC + tc_inc_col)) ; ymm0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_pd(ymm0, 5); ymm0 = _mm256_mul_pd(ymm1, ymm0); ymm3 = _mm256_mul_pd(ymm2, ymm3); ymm0 = _mm256_addsub_pd(ymm0, ymm3); ymm5 = _mm256_add_pd(ymm5, ymm0); //Multiply ymm9 with beta xmm0 = _mm_loadu_pd((double *)(tC + 1)) ; xmm3 = _mm_loadu_pd((double *)(tC + 1 + tc_inc_col)) ; ymm0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_pd(ymm0, 5); ymm0 = _mm256_mul_pd(ymm1, ymm0); ymm3 = _mm256_mul_pd(ymm2, ymm3); ymm0 = _mm256_addsub_pd(ymm0, ymm3); ymm9 = _mm256_add_pd(ymm9, ymm0); //Multiply ymm13 with beta xmm0 = _mm_loadu_pd((double *)(tC + 2)) ; xmm3 = _mm_loadu_pd((double *)(tC + 2 + tc_inc_col)) ; ymm0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_pd(ymm0, 5); ymm0 = _mm256_mul_pd(ymm1, ymm0); ymm3 = _mm256_mul_pd(ymm2, ymm3); ymm0 = _mm256_addsub_pd(ymm0, ymm3); ymm13 = _mm256_add_pd(ymm13, ymm0); //transpose right 3x2 _mm_storeu_pd((double *)(tC ), _mm256_castpd256_pd128(ymm5)); _mm_storeu_pd((double *)(tC+1), _mm256_castpd256_pd128(ymm9)); _mm_storeu_pd((double *)(tC+2), _mm256_castpd256_pd128(ymm13)); tC += tc_inc_col; _mm_storeu_pd((double *)(tC ),_mm256_extractf128_pd (ymm5,1)); _mm_storeu_pd((double *)(tC+1) ,_mm256_extractf128_pd (ymm9,1)); _mm_storeu_pd((double *)(tC+2), _mm256_extractf128_pd(ymm13, 1)); } } else { if(beta->real == 0.0 && beta->imag == 0.0) { _mm256_storeu_pd((double *)(tC), ymm4); _mm256_storeu_pd((double *)(tC + 2), ymm5); _mm256_storeu_pd((double *)(tC + tc_inc_row) , ymm8); _mm256_storeu_pd((double *)(tC + tc_inc_row + 2), ymm9); _mm256_storeu_pd((double *)(tC + tc_inc_row *2), ymm12); _mm256_storeu_pd((double 
*)(tC + tc_inc_row *2+ 2), ymm13); } else{ /* (br + bi) C + (ar + ai) AB */ ymm0 = _mm256_broadcast_sd((double const *)(beta)); // load beta_r and duplicate ymm1 = _mm256_broadcast_sd((double const *)(&beta->imag)); // load beta_i and duplicate ymm2 = _mm256_loadu_pd((double const *)(tC)); ymm3 = _mm256_permute_pd(ymm2, 5); ymm2 = _mm256_mul_pd(ymm0, ymm2); ymm3 =_mm256_mul_pd(ymm1, ymm3); ymm4 = _mm256_add_pd(ymm4, _mm256_addsub_pd(ymm2, ymm3)); ymm2 = _mm256_loadu_pd((double const *)(tC+2)); ymm3 = _mm256_permute_pd(ymm2, 5); ymm2 = _mm256_mul_pd(ymm0, ymm2); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm5 = _mm256_add_pd(ymm5, _mm256_addsub_pd(ymm2, ymm3)); ymm2 = _mm256_loadu_pd((double const *)(tC+tc_inc_row)); ymm3 = _mm256_permute_pd(ymm2, 5); ymm2 = _mm256_mul_pd(ymm0, ymm2); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm8 = _mm256_add_pd(ymm8, _mm256_addsub_pd(ymm2, ymm3)); ymm2 = _mm256_loadu_pd((double const *)(tC+tc_inc_row + 2)); ymm3 = _mm256_permute_pd(ymm2, 5); ymm2 = _mm256_mul_pd(ymm0, ymm2); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm9 = _mm256_add_pd(ymm9, _mm256_addsub_pd(ymm2, ymm3)); ymm2 = _mm256_loadu_pd((double const *)(tC+tc_inc_row*2)); ymm3 = _mm256_permute_pd(ymm2, 5); ymm2 = _mm256_mul_pd(ymm0, ymm2); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm12 = _mm256_add_pd(ymm12, _mm256_addsub_pd(ymm2, ymm3)); ymm2 = _mm256_loadu_pd((double const *)(tC+tc_inc_row*2 +2)); ymm3 = _mm256_permute_pd(ymm2, 5); ymm2 = _mm256_mul_pd(ymm0, ymm2); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm13 = _mm256_add_pd(ymm13, _mm256_addsub_pd(ymm2, ymm3)); _mm256_storeu_pd((double *)(tC), ymm4); _mm256_storeu_pd((double *)(tC + 2), ymm5); _mm256_storeu_pd((double *)(tC + tc_inc_row) , ymm8); _mm256_storeu_pd((double *)(tC + tc_inc_row + 2), ymm9); _mm256_storeu_pd((double *)(tC + tc_inc_row *2), ymm12); _mm256_storeu_pd((double *)(tC + tc_inc_row *2+ 2), ymm13); } } } consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( n_left ) { const dim_t mr_cur = 3; const dim_t j_edge = n0 - ( dim_t )n_left; dcomplex* restrict cij = c + j_edge*cs_c; dcomplex* restrict ai = a; dcomplex* restrict bj = b + n_iter * 4; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_zgemmsup_rv_zen_asm_3x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { bli_zgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); } } } void bli_zgemmsup_rv_zen_asm_2x4n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, dcomplex* restrict alpha, dcomplex* restrict a, inc_t rs_a0, inc_t cs_a0, dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t k_iter = 0; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- //scratch registers __m256d ymm0, ymm1, ymm2, ymm3; __m256d ymm4, ymm5, ymm6, ymm7; __m256d ymm8, ymm9, ymm10, ymm11; __m128d xmm0, xmm3; dcomplex *tA = a; double *tAimag = &a->imag; dcomplex *tB = b; dcomplex *tC = c; for (n_iter = 0; n_iter < n0 / 4; n_iter++) { // clear scratch registers. 
ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); ymm8 = _mm256_setzero_pd(); ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); dim_t ta_inc_row = rs_a; dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; dim_t ta_inc_col = cs_a; dim_t tb_inc_col = cs_b; dim_t tc_inc_col = cs_c; tA = a; tAimag = &a->imag; tB = b + n_iter*tb_inc_col*4; tC = c + n_iter*tc_inc_col*4; for (k_iter = 0; k_iter imag)); // load alpha_i and duplicate ymm3 = _mm256_permute_pd(ymm4, 5); ymm4 = _mm256_mul_pd(ymm0, ymm4); ymm3 =_mm256_mul_pd(ymm1, ymm3); ymm4 = _mm256_addsub_pd(ymm4, ymm3); ymm3 = _mm256_permute_pd(ymm5, 5); ymm5 = _mm256_mul_pd(ymm0, ymm5); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm5 = _mm256_addsub_pd(ymm5, ymm3); ymm3 = _mm256_permute_pd(ymm8, 5); ymm8 = _mm256_mul_pd(ymm0, ymm8); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm8 = _mm256_addsub_pd(ymm8, ymm3); ymm3 = _mm256_permute_pd(ymm9, 5); ymm9 = _mm256_mul_pd(ymm0, ymm9); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm9 = _mm256_addsub_pd(ymm9, ymm3); if(tc_inc_row == 1) //col stored { if(beta->real == 0.0 && beta->imag == 0.0) { //transpose left 2x2 _mm_storeu_pd((double *)(tC ), _mm256_castpd256_pd128(ymm4)); _mm_storeu_pd((double *)(tC+1), _mm256_castpd256_pd128(ymm8)); tC += tc_inc_col; _mm_storeu_pd((double *)(tC ),_mm256_extractf128_pd (ymm4,1)); _mm_storeu_pd((double *)(tC+1) ,_mm256_extractf128_pd (ymm8,1)); tC += tc_inc_col; //transpose right 2x2 _mm_storeu_pd((double *)(tC ), _mm256_castpd256_pd128(ymm5)); _mm_storeu_pd((double *)(tC+1), _mm256_castpd256_pd128(ymm9)); tC += tc_inc_col; _mm_storeu_pd((double *)(tC ),_mm256_extractf128_pd (ymm5,1)); _mm_storeu_pd((double *)(tC+1) ,_mm256_extractf128_pd (ymm9,1)); } else{ ymm1 = _mm256_broadcast_sd((double const *)(beta)); // load alpha_r and duplicate ymm2 = _mm256_broadcast_sd((double const *)(&beta->imag)); // load alpha_i and duplicate //Multiply ymm4 with beta xmm0 = _mm_loadu_pd((double 
*)(tC)) ; xmm3 = _mm_loadu_pd((double *)(tC + tc_inc_col)) ; ymm0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_pd(ymm0, 5); ymm0 = _mm256_mul_pd(ymm1, ymm0); ymm3 = _mm256_mul_pd(ymm2, ymm3); ymm0 = _mm256_addsub_pd(ymm0, ymm3); ymm4 = _mm256_add_pd(ymm4, ymm0); //Multiply ymm8 with beta xmm0 = _mm_loadu_pd((double *)(tC + 1)) ; xmm3 = _mm_loadu_pd((double *)(tC + 1 + tc_inc_col)) ; ymm0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_pd(ymm0, 5); ymm0 = _mm256_mul_pd(ymm1, ymm0); ymm3 = _mm256_mul_pd(ymm2, ymm3); ymm0 = _mm256_addsub_pd(ymm0, ymm3); ymm8 = _mm256_add_pd(ymm8, ymm0); //transpose left 2x2 _mm_storeu_pd((double *)(tC), _mm256_castpd256_pd128(ymm4)); _mm_storeu_pd((double *)(tC+1), _mm256_castpd256_pd128(ymm8)); tC += tc_inc_col; _mm_storeu_pd((double *)(tC ) ,_mm256_extractf128_pd (ymm4,1)); _mm_storeu_pd((double *)(tC+1) ,_mm256_extractf128_pd (ymm8,1)); tC += tc_inc_col; //Multiply ymm5 with beta xmm0 = _mm_loadu_pd((double *)(tC)) ; xmm3 = _mm_loadu_pd((double *)(tC + tc_inc_col)) ; ymm0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_pd(ymm0, 5); ymm0 = _mm256_mul_pd(ymm1, ymm0); ymm3 = _mm256_mul_pd(ymm2, ymm3); ymm0 = _mm256_addsub_pd(ymm0, ymm3); ymm5 = _mm256_add_pd(ymm5, ymm0); //Multiply ymm9 with beta xmm0 = _mm_loadu_pd((double *)(tC + 1)) ; xmm3 = _mm_loadu_pd((double *)(tC + 1 + tc_inc_col)) ; ymm0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_pd(ymm0, 5); ymm0 = _mm256_mul_pd(ymm1, ymm0); ymm3 = _mm256_mul_pd(ymm2, ymm3); ymm0 = _mm256_addsub_pd(ymm0, ymm3); ymm9 = _mm256_add_pd(ymm9, ymm0); //transpose right 2x2 _mm_storeu_pd((double *)(tC), _mm256_castpd256_pd128(ymm5)); _mm_storeu_pd((double *)(tC+1), _mm256_castpd256_pd128(ymm9)); tC += tc_inc_col; _mm_storeu_pd((double *)(tC ) ,_mm256_extractf128_pd (ymm5,1)); _mm_storeu_pd((double *)(tC+1) ,_mm256_extractf128_pd (ymm9,1)); } 
} else { if(beta->real == 0.0 && beta->imag == 0.0) { _mm256_storeu_pd((double *)(tC), ymm4); _mm256_storeu_pd((double *)(tC + 2), ymm5); _mm256_storeu_pd((double *)(tC + tc_inc_row) , ymm8); _mm256_storeu_pd((double *)(tC + tc_inc_row + 2), ymm9); } else{ /* (br + bi) C + (ar + ai) AB */ ymm0 = _mm256_broadcast_sd((double const *)(beta)); // load beta_r and duplicate ymm1 = _mm256_broadcast_sd((double const *)(&beta->imag)); // load beta_i and duplicate ymm2 = _mm256_loadu_pd((double const *)(tC)); ymm3 = _mm256_permute_pd(ymm2, 5); ymm2 = _mm256_mul_pd(ymm0, ymm2); ymm3 =_mm256_mul_pd(ymm1, ymm3); ymm4 = _mm256_add_pd(ymm4, _mm256_addsub_pd(ymm2, ymm3)); ymm2 = _mm256_loadu_pd((double const *)(tC+2)); ymm3 = _mm256_permute_pd(ymm2, 5); ymm2 = _mm256_mul_pd(ymm0, ymm2); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm5 = _mm256_add_pd(ymm5, _mm256_addsub_pd(ymm2, ymm3)); ymm2 = _mm256_loadu_pd((double const *)(tC+tc_inc_row)); ymm3 = _mm256_permute_pd(ymm2, 5); ymm2 = _mm256_mul_pd(ymm0, ymm2); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm8 = _mm256_add_pd(ymm8, _mm256_addsub_pd(ymm2, ymm3)); ymm2 = _mm256_loadu_pd((double const *)(tC+tc_inc_row + 2)); ymm3 = _mm256_permute_pd(ymm2, 5); ymm2 = _mm256_mul_pd(ymm0, ymm2); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm9 = _mm256_add_pd(ymm9, _mm256_addsub_pd(ymm2, ymm3)); _mm256_storeu_pd((double *)(tC), ymm4); _mm256_storeu_pd((double *)(tC + 2), ymm5); _mm256_storeu_pd((double *)(tC + tc_inc_row) , ymm8); _mm256_storeu_pd((double *)(tC + tc_inc_row + 2), ymm9); } } } consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( n_left ) { const dim_t mr_cur = 3; const dim_t j_edge = n0 - ( dim_t )n_left; dcomplex* restrict cij = c + j_edge*cs_c; dcomplex* restrict ai = a; dcomplex* restrict bj = b + n_iter * 4; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_zgemmsup_rv_zen_asm_2x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { bli_zgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); } } } void bli_zgemmsup_rv_zen_asm_1x4n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, dcomplex* restrict alpha, dcomplex* restrict a, inc_t rs_a0, inc_t cs_a0, dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = 0; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- //scratch registers __m256d ymm0, ymm1, ymm2, ymm3; __m256d ymm4, ymm5, ymm6, ymm7; __m128d xmm0, xmm3; dcomplex *tA = a; double *tAimag = &a->imag; dcomplex *tB = b; dcomplex *tC = c; for (n_iter = 0; n_iter < n0 / 4; n_iter++) { // clear scratch registers. 
ymm4 = _mm256_setzero_pd(); ymm5 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm7 = _mm256_setzero_pd(); dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; dim_t ta_inc_col = cs_a; dim_t tb_inc_col = cs_b; dim_t tc_inc_col = cs_c; tA = a; tAimag = &a->imag; tB = b + n_iter*tb_inc_col*4; tC = c + n_iter*tc_inc_col*4; for (k_iter = 0; k_iter imag)); // load alpha_i and duplicate ymm3 = _mm256_permute_pd(ymm4, 5); ymm4 = _mm256_mul_pd(ymm0, ymm4); ymm3 =_mm256_mul_pd(ymm1, ymm3); ymm4 = _mm256_addsub_pd(ymm4, ymm3); ymm3 = _mm256_permute_pd(ymm5, 5); ymm5 = _mm256_mul_pd(ymm0, ymm5); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm5 = _mm256_addsub_pd(ymm5, ymm3); if(tc_inc_row == 1) //col stored { if(beta->real == 0.0 && beta->imag == 0.0) { //transpose left 1x2 _mm_storeu_pd((double *)(tC), _mm256_castpd256_pd128(ymm4)); tC += tc_inc_col; _mm_storeu_pd((double *)(tC) ,_mm256_extractf128_pd (ymm4,1)); tC += tc_inc_col; //transpose right 1x2 _mm_storeu_pd((double *)(tC), _mm256_castpd256_pd128(ymm5)); tC += tc_inc_col; _mm_storeu_pd((double *)(tC) ,_mm256_extractf128_pd (ymm5,1)); } else{ ymm1 = _mm256_broadcast_sd((double const *)(beta)); // load alpha_r and duplicate ymm2 = _mm256_broadcast_sd((double const *)(&beta->imag)); // load alpha_i and duplicate //Multiply ymm4 with beta xmm0 = _mm_loadu_pd((double *)(tC)) ; xmm3 = _mm_loadu_pd((double *)(tC + tc_inc_col)) ; ymm0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_pd(ymm0, 5); ymm0 = _mm256_mul_pd(ymm1, ymm0); ymm3 = _mm256_mul_pd(ymm2, ymm3); ymm0 = _mm256_addsub_pd(ymm0, ymm3); ymm4 = _mm256_add_pd(ymm4, ymm0); _mm_storeu_pd((double *)(tC), _mm256_castpd256_pd128(ymm4)); tC += tc_inc_col; _mm_storeu_pd((double *)(tC ) ,_mm256_extractf128_pd (ymm4,1)); tC += tc_inc_col; //Multiply ymm5 with beta xmm0 = _mm_loadu_pd((double *)(tC)) ; xmm3 = _mm_loadu_pd((double *)(tC + tc_inc_col)) ; ymm0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xmm0), xmm3, 1) ; ymm3 = 
_mm256_permute_pd(ymm0, 5); ymm0 = _mm256_mul_pd(ymm1, ymm0); ymm3 = _mm256_mul_pd(ymm2, ymm3); ymm0 = _mm256_addsub_pd(ymm0, ymm3); ymm5 = _mm256_add_pd(ymm5, ymm0); _mm_storeu_pd((double *)(tC), _mm256_castpd256_pd128(ymm5)); tC += tc_inc_col; _mm_storeu_pd((double *)(tC) ,_mm256_extractf128_pd (ymm5,1)); } } else { if(beta->real == 0.0 && beta->imag == 0.0) { _mm256_storeu_pd((double *)(tC), ymm4); _mm256_storeu_pd((double *)(tC + 2), ymm5); } else{ /* (br + bi) C + (ar + ai) AB */ ymm0 = _mm256_broadcast_sd((double const *)(beta)); // load beta_r and duplicate ymm1 = _mm256_broadcast_sd((double const *)(&beta->imag)); // load beta_i and duplicate ymm2 = _mm256_loadu_pd((double const *)(tC)); ymm3 = _mm256_permute_pd(ymm2, 5); ymm2 = _mm256_mul_pd(ymm0, ymm2); ymm3 =_mm256_mul_pd(ymm1, ymm3); ymm4 = _mm256_add_pd(ymm4, _mm256_addsub_pd(ymm2, ymm3)); ymm2 = _mm256_loadu_pd((double const *)(tC+2)); ymm3 = _mm256_permute_pd(ymm2, 5); ymm2 = _mm256_mul_pd(ymm0, ymm2); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm5 = _mm256_add_pd(ymm5, _mm256_addsub_pd(ymm2, ymm3)); _mm256_storeu_pd((double *)(tC), ymm4); _mm256_storeu_pd((double *)(tC + 2), ymm5); } } } consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( n_left ) { const dim_t mr_cur = 3; const dim_t j_edge = n0 - ( dim_t )n_left; dcomplex* restrict cij = c + j_edge*cs_c; dcomplex* restrict ai = a; dcomplex* restrict bj = b + n_iter * 4; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_zgemmsup_rv_zen_asm_1x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { bli_zgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); } } } void bli_zgemmsup_rv_zen_asm_3x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, dcomplex* restrict alpha, dcomplex* restrict a, inc_t rs_a0, inc_t cs_a0, dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t k_iter = 0; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- //scratch registers __m256d ymm0, ymm1, ymm2, ymm3; __m256d ymm4, ymm6; __m256d ymm8, ymm10; __m256d ymm12, ymm14; __m128d xmm0, xmm3; dcomplex *tA = a; double *tAimag = &a->imag; dcomplex *tB = b; dcomplex *tC = c; // clear scratch registers. 
ymm4 = _mm256_setzero_pd(); ymm6 = _mm256_setzero_pd(); ymm8 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm12 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); dim_t ta_inc_row = rs_a; dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; dim_t ta_inc_col = cs_a; dim_t tc_inc_col = cs_c; for (k_iter = 0; k_iter imag)); // load alpha_i and duplicate ymm3 = _mm256_permute_pd(ymm4, 5); ymm4 = _mm256_mul_pd(ymm0, ymm4); ymm3 =_mm256_mul_pd(ymm1, ymm3); ymm4 = _mm256_addsub_pd(ymm4, ymm3); ymm3 = _mm256_permute_pd(ymm8, 5); ymm8 = _mm256_mul_pd(ymm0, ymm8); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm8 = _mm256_addsub_pd(ymm8, ymm3); ymm3 = _mm256_permute_pd(ymm12, 5); ymm12 = _mm256_mul_pd(ymm0, ymm12); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm12 = _mm256_addsub_pd(ymm12, ymm3); if(tc_inc_row == 1) //col stored { if(beta->real == 0.0 && beta->imag == 0.0) { //transpose left 3x2 _mm_storeu_pd((double *)(tC), _mm256_castpd256_pd128(ymm4)); _mm_storeu_pd((double *)(tC+1), _mm256_castpd256_pd128(ymm8)); _mm_storeu_pd((double *)(tC+2), _mm256_castpd256_pd128(ymm12)); tC += tc_inc_col; _mm_storeu_pd((double *)(tC ),_mm256_extractf128_pd (ymm4,1)); _mm_storeu_pd((double *)(tC+1) ,_mm256_extractf128_pd (ymm8,1)); _mm_storeu_pd((double *)(tC+2), _mm256_extractf128_pd(ymm12, 1)); } else{ ymm1 = _mm256_broadcast_sd((double const *)(beta)); // load alpha_r and duplicate ymm2 = _mm256_broadcast_sd((double const *)(&beta->imag)); // load alpha_i and duplicate //Multiply ymm4 with beta xmm0 = _mm_loadu_pd((double *)(tC)) ; xmm3 = _mm_loadu_pd((double *)(tC + tc_inc_col)) ; ymm0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_pd(ymm0, 5); ymm0 = _mm256_mul_pd(ymm1, ymm0); ymm3 = _mm256_mul_pd(ymm2, ymm3); ymm0 = _mm256_addsub_pd(ymm0, ymm3); ymm4 = _mm256_add_pd(ymm4, ymm0); //Multiply ymm8 with beta xmm0 = _mm_loadu_pd((double *)(tC + 1)) ; xmm3 = _mm_loadu_pd((double *)(tC + 1 + tc_inc_col)) ; ymm0 = 
_mm256_insertf128_pd(_mm256_castpd128_pd256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_pd(ymm0, 5); ymm0 = _mm256_mul_pd(ymm1, ymm0); ymm3 = _mm256_mul_pd(ymm2, ymm3); ymm0 = _mm256_addsub_pd(ymm0, ymm3); ymm8 = _mm256_add_pd(ymm8, ymm0); //Multiply ymm12 with beta xmm0 = _mm_loadu_pd((double *)(tC + 2)) ; xmm3 = _mm_loadu_pd((double *)(tC + 2 + tc_inc_col)) ; ymm0 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xmm0), xmm3, 1) ; ymm3 = _mm256_permute_pd(ymm0, 5); ymm0 = _mm256_mul_pd(ymm1, ymm0); ymm3 = _mm256_mul_pd(ymm2, ymm3); ymm0 = _mm256_addsub_pd(ymm0, ymm3); ymm12 = _mm256_add_pd(ymm12, ymm0); _mm_storeu_pd((double *)(tC), _mm256_castpd256_pd128(ymm4)); _mm_storeu_pd((double *)(tC+1), _mm256_castpd256_pd128(ymm8)); _mm_storeu_pd((double *)(tC+2), _mm256_castpd256_pd128(ymm12)); tC += tc_inc_col; _mm_storeu_pd((double *)(tC ),_mm256_extractf128_pd (ymm4,1)); _mm_storeu_pd((double *)(tC+1) ,_mm256_extractf128_pd (ymm8,1)); _mm_storeu_pd((double *)(tC+2), _mm256_extractf128_pd(ymm12, 1)); } } else { if(beta->real == 0.0 && beta->imag == 0.0) { _mm256_storeu_pd((double *)(tC), ymm4); _mm256_storeu_pd((double *)(tC + tc_inc_row ), ymm8); _mm256_storeu_pd((double *)(tC + tc_inc_row *2), ymm12); } else{ /* (br + bi) C + (ar + ai) AB */ ymm0 = _mm256_broadcast_sd((double const *)(beta)); // load beta_r and duplicate ymm1 = _mm256_broadcast_sd((double const *)(&beta->imag)); // load beta_i and duplicate ymm2 = _mm256_loadu_pd((double const *)(tC)); ymm3 = _mm256_permute_pd(ymm2, 5); ymm2 = _mm256_mul_pd(ymm0, ymm2); ymm3 =_mm256_mul_pd(ymm1, ymm3); ymm4 = _mm256_add_pd(ymm4, _mm256_addsub_pd(ymm2, ymm3)); ymm2 = _mm256_loadu_pd((double const *)(tC+tc_inc_row)); ymm3 = _mm256_permute_pd(ymm2, 5); ymm2 = _mm256_mul_pd(ymm0, ymm2); ymm3 = _mm256_mul_pd(ymm1, ymm3); ymm8 = _mm256_add_pd(ymm8, _mm256_addsub_pd(ymm2, ymm3)); ymm2 = _mm256_loadu_pd((double const *)(tC+tc_inc_row*2)); ymm3 = _mm256_permute_pd(ymm2, 5); ymm2 = _mm256_mul_pd(ymm0, ymm2); ymm3 = 
_mm256_mul_pd(ymm1, ymm3); ymm12 = _mm256_add_pd(ymm12, _mm256_addsub_pd(ymm2, ymm3)); _mm256_storeu_pd((double *)(tC), ymm4); _mm256_storeu_pd((double *)(tC + tc_inc_row) , ymm8); _mm256_storeu_pd((double *)(tC + tc_inc_row *2), ymm12); } } } cython-blis-0.9.1/blis/_src/kernels/zen/3/sup/other/000077500000000000000000000000001427272030600221675ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16.c000066400000000000000000003204101427272030600300060ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. */ void bli_sgemmsup_rd_zen_asm_2x16 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() mov(var(a), r14) // load address of a. 
mov(var(rs_a), r8) // load rs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) mov(var(b), rdx) // load address of b. mov(var(cs_b), r11) // load cs_b lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = unused // r15 = n dim index jj // r10 = unused mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] // zen2 can execute 4 vxorpd ipc with // a latency of 1 cycle. vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c = 1*8 lea(mem(r12, rsi, 1), rcx) // rcx = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rbx) // rbx = b + 4*jj*cs_b; lea(mem(r14), rax) // rax = a; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) 
vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_b = 1*8; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddps( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vhaddps(xmm2,xmm0,xmm4) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddps( xmm2, xmm1, xmm2 ) vhaddps(xmm2,xmm0,xmm5) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3,xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) label(.SDONE) add(imm(4), r15) // jj += 4; cmp(imm(16), r15) // compare jj to 4 jl(.SLOOP3X4J) // if jj <= 4, jump to beginning // of jj loop; otherwise, loop ends. label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rd_zen_asm_1x16 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() mov(var(a), r14) // load address of a. mov(var(b), rdx) // load address of b. 
// ---- Stride setup and top of the jj loop (one 1x4 strip of c per pass) ----
mov(var(cs_b), r11)                // r11 = cs_b (in elements)
lea(mem(, r11, 4), r11)            // r11 = cs_b * sizeof(float) (bytes)
lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b (bytes)

mov(var(c), r12)                   // r12 = base address of c

// Register roles inside the jj loop:
//   r12 = base of c,  rcx = c for the current strip
//   r14 = base of a,  rax = running pointer into a
//   rdx = base of b,  rbx = running pointer into b
//   r15 = n dim index jj (advanced by 4 at the bottom of the loop)
//   r9 is not referenced by this single-row kernel.

mov(imm(0), r15)                   // jj = 0;

label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 4 ... ]

// zen2 can execute 4 vxorpd ipc with
// a latency of 1 cycle.
// Clear the four accumulators, one per column of the strip.
vxorps(ymm4, ymm4, ymm4)
vxorps(ymm7, ymm7, ymm7)
vxorps(ymm10, ymm10, ymm10)
vxorps(ymm13, ymm13, ymm13)

lea(mem( , r15, 1), rsi)           // rsi = jj;
imul(imm(1*4), rsi)                // rsi = jj * 1 * sizeof(float); the column
                                   // stride of c is hard-coded to 1 here
                                   // (cs_c is not read).
lea(mem(r12, rsi, 1), rcx)         // rcx = c + jj floats;

lea(mem( , r15, 1), rsi)           // rsi = jj;
imul(r11, rsi)                     // rsi = jj * cs_b (bytes);
lea(mem(rdx, rsi, 1), rbx)         // rbx = b + jj*cs_b;

lea(mem(r14), rax)                 // rax = a; restart the row of a for this strip

prefetch(0, mem(rcx, 3*8))         // prefetch the row of c (address rcx + 24 bytes)

mov(var(k_iter32), rsi)            // i = k_iter32;
test(rsi, rsi)                     // check i via logical AND.
je(.SCONSIDKITER8)                 // if i == 0, jump to code that
                                   // contains the k_iter8 loop.
label(.SLOOPKITER32)               // MAIN LOOP: 4 unrolled steps of 8 floats each
                                   // (32 values of k per pass)

// Each step loads 8 consecutive floats of the row of a and of four columns
// of b (rbx, rbx+cs_b, rbx+2*cs_b, rbx+3*cs_b) and accumulates the
// element-wise products into ymm4/ymm7/ymm10/ymm13. Both pointers advance
// by a fixed 32 bytes; cs_a and rs_b are never read, so the kernel relies
// on a and b being contiguous along k.

// ---------------------------------- iteration 0

vmovups(mem(rax ), ymm0)           // ymm0 = a[k .. k+7]
add(imm(8*4), rax)                 // a += 8 floats (32 bytes);

vmovups(mem(rbx ), ymm3)           // column jj+0 of b
vfmadd231ps(ymm0, ymm3, ymm4)      // ymm4 += a * b(:,jj+0)

vmovups(mem(rbx, r11, 1), ymm3)    // column jj+1 of b
vfmadd231ps(ymm0, ymm3, ymm7)

vmovups(mem(rbx, r11, 2), ymm3)    // column jj+2 of b
vfmadd231ps(ymm0, ymm3, ymm10)

vmovups(mem(rbx, r13, 1), ymm3)    // column jj+3 of b
add(imm(8*4), rbx)                 // b += 8 floats (32 bytes);
vfmadd231ps(ymm0, ymm3, ymm13)

// ---------------------------------- iteration 1

vmovups(mem(rax ), ymm0)
add(imm(8*4), rax)                 // a += 8 floats;

vmovups(mem(rbx ), ymm3)
vfmadd231ps(ymm0, ymm3, ymm4)

vmovups(mem(rbx, r11, 1), ymm3)
vfmadd231ps(ymm0, ymm3, ymm7)

vmovups(mem(rbx, r11, 2), ymm3)
vfmadd231ps(ymm0, ymm3, ymm10)

vmovups(mem(rbx, r13, 1), ymm3)
add(imm(8*4), rbx)                 // b += 8 floats;
vfmadd231ps(ymm0, ymm3, ymm13)

// ---------------------------------- iteration 2

vmovups(mem(rax ), ymm0)
add(imm(8*4), rax)                 // a += 8 floats;

vmovups(mem(rbx ), ymm3)
vfmadd231ps(ymm0, ymm3, ymm4)

vmovups(mem(rbx, r11, 1), ymm3)
vfmadd231ps(ymm0, ymm3, ymm7)

vmovups(mem(rbx, r11, 2), ymm3)
vfmadd231ps(ymm0, ymm3, ymm10)

vmovups(mem(rbx, r13, 1), ymm3)
add(imm(8*4), rbx)                 // b += 8 floats;
vfmadd231ps(ymm0, ymm3, ymm13)

// ---------------------------------- iteration 3

vmovups(mem(rax ), ymm0)
add(imm(8*4), rax)                 // a += 8 floats;

vmovups(mem(rbx ), ymm3)
vfmadd231ps(ymm0, ymm3, ymm4)

vmovups(mem(rbx, r11, 1), ymm3)
vfmadd231ps(ymm0, ymm3, ymm7)

vmovups(mem(rbx, r11, 2), ymm3)
vfmadd231ps(ymm0, ymm3, ymm10)

vmovups(mem(rbx, r13, 1), ymm3)
add(imm(8*4), rbx)                 // b += 8 floats;
vfmadd231ps(ymm0, ymm3, ymm13)

dec(rsi)                           // i -= 1;
jne(.SLOOPKITER32)                 // iterate again if i != 0.

label(.SCONSIDKITER8)

mov(var(k_iter8), rsi)             // i = k_iter8;
test(rsi, rsi)                     // check i via logical AND.
je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
                                   // considers k_left1 loop.
                                   // else, we prepare to enter k_iter8 loop.
label(.SLOOPKITER8)                // EDGE LOOP (ymm): 8 values of k per pass

vmovups(mem(rax ), ymm0)           // ymm0 = a[k .. k+7]
add(imm(8*4), rax)                 // a += 8 floats (32 bytes);

vmovups(mem(rbx ), ymm3)           // column jj+0 of b
vfmadd231ps(ymm0, ymm3, ymm4)

vmovups(mem(rbx, r11, 1), ymm3)    // column jj+1 of b
vfmadd231ps(ymm0, ymm3, ymm7)

vmovups(mem(rbx, r11, 2), ymm3)    // column jj+2 of b
vfmadd231ps(ymm0, ymm3, ymm10)

vmovups(mem(rbx, r13, 1), ymm3)    // column jj+3 of b
add(imm(8*4), rbx)                 // b += 8 floats (32 bytes);
vfmadd231ps(ymm0, ymm3, ymm13)

dec(rsi)                           // i -= 1;
jne(.SLOOPKITER8)                  // iterate again if i != 0.

label(.SCONSIDKLEFT1)

mov(var(k_left1), rsi)             // i = k_left1;
test(rsi, rsi)                     // check i via logical AND.
je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                   // else, we prepare to enter k_left1 loop.

label(.SLOOPKLEFT1)                // EDGE LOOP (scalar): 1 value of k per pass

// NOTE: The operands are loaded with vmovss (one float into lane 0; the
// VEX load form clears the remaining lanes), but the FMAs must still name
// the ymm registers: a 128-bit (xmm) FMA would zero the upper half of the
// destination accumulator, which would destroy the partial sums built up
// by the loops above.

vmovss(mem(rax ), xmm0)            // xmm0[0] = a[k]
add(imm(1*4), rax)                 // a += 1 float (4 bytes);

vmovss(mem(rbx ), xmm3)            // column jj+0 of b
vfmadd231ps(ymm0, ymm3, ymm4)

vmovss(mem(rbx, r11, 1), xmm3)     // column jj+1 of b
vfmadd231ps(ymm0, ymm3, ymm7)

vmovss(mem(rbx, r11, 2), xmm3)     // column jj+2 of b
vfmadd231ps(ymm0, ymm3, ymm10)

vmovss(mem(rbx, r13, 1), xmm3)     // column jj+3 of b
add(imm(1*4), rbx)                 // b += 1 float (4 bytes);
vfmadd231ps(ymm0, ymm3, ymm13)

dec(rsi)                           // i -= 1;
jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
label(.SPOSTACCUM)

// Reduce the four 8-lane accumulators to one scalar each.
// ymm4 ymm7 ymm10 ymm13 hold the partial dot products for columns
// jj+0 .. jj+3 of the current strip.

vhaddps( ymm7, ymm4, ymm0 )        // pairwise sums of ymm4 / ymm7 per 128-bit half
vextractf128(imm(1), ymm0, xmm1 )  // xmm1 = upper half
vaddps( xmm0, xmm1, xmm0 )
// xmm0[0]+xmm0[1] = sum(ymm4); xmm0[2]+xmm0[3] = sum(ymm7)

vhaddps( ymm13, ymm10, ymm2 )
vextractf128(imm(1), ymm2, xmm1 )
vaddps( xmm2, xmm1, xmm2 )
// xmm2[0]+xmm2[1] = sum(ymm10); xmm2[2]+xmm2[3] = sum(ymm13)

vhaddps(xmm2,xmm0,xmm4)
// xmm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)

mov(var(alpha), rax)               // load address of alpha
mov(var(beta), rbx)                // load address of beta
vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate

vmulps(xmm0, xmm4, xmm4)           // scale by alpha

// now avoid loading C if beta == 0
// NOTE(review): vucomisd compares the low 64 bits as a double although
// beta is a float; confirm whether vucomiss was intended (the two differ
// for beta == -0.0f and for NaN).
vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

label(.SROWSTORED)

// Use the 128-bit form: only the 4 floats of this strip belong to the
// tile. A 256-bit memory operand would read 16 bytes beyond the strip
// (and, for the last strip, beyond the 16-wide row of c).
vfmadd231ps(mem(rcx), xmm3, xmm4)  // xmm4 += beta * c[jj .. jj+3]
vmovups(xmm4, mem(rcx))            // store the 4 results

jmp(.SDONE)                        // jump to end.

label(.SBETAZERO)

label(.SROWSTORBZ)

vmovups(xmm4, mem(rcx))            // beta == 0: store without reading c

label(.SDONE)

add(imm(4), r15)                   // jj += 4;
cmp(imm(16), r15)                  // compare jj to 16
jl(.SLOOP3X4J)                     // if jj < 16, jump to beginning
                                   // of jj loop; otherwise, loop ends.
label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rd_zen_asm_2x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) mov(var(b), rdx) // load address of b. 
// ---- Stride setup and top of the jj loop (one 2x4 block of c per pass) ----
mov(var(cs_b), r11)                // r11 = cs_b (in elements)
lea(mem(, r11, 4), r11)            // r11 = cs_b * sizeof(float) (bytes)
lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b (bytes)

mov(var(c), r12)                   // r12 = base address of c
mov(var(rs_c), rdi)                // rdi = rs_c (in elements)
lea(mem(, rdi, 4), rdi)            // rdi = rs_c * sizeof(float) (bytes)

// Register roles inside the jj loop:
//   r12 = base of c,  rcx = c for the current block
//   r14 = base of a,  rax = running pointer into a
//   rdx = base of b,  rbx = running pointer into b
//   r15 = n dim index jj (advanced by 4 at the bottom of the loop)
//   r9  = unused
//   r10 = unused

mov(imm(0), r15)                   // jj = 0;

label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 4 ... ]

// zen2 can execute 4 vxorpd ipc with
// a latency of 1 cycle.
// Clear the eight accumulators: ymm4/7/10/13 for the first row of a,
// ymm5/8/11/14 for the second row.
vxorps(ymm4, ymm4, ymm4)
vxorps(ymm5, ymm5, ymm5)
vxorps(ymm7, ymm7, ymm7)
vxorps(ymm8, ymm8, ymm8)
vxorps(ymm10, ymm10, ymm10)
vxorps(ymm11, ymm11, ymm11)
vxorps(ymm13, ymm13, ymm13)
vxorps(ymm14, ymm14, ymm14)

lea(mem( , r15, 1), rsi)           // rsi = jj;
imul(imm(1*4), rsi)                // rsi = jj * 1 * sizeof(float); the column
                                   // stride of c is hard-coded to 1 here
                                   // (cs_c is not read).
lea(mem(r12, rsi, 1), rcx)         // rcx = c + jj floats;

lea(mem( , r15, 1), rsi)           // rsi = jj;
imul(r11, rsi)                     // rsi = jj * cs_b (bytes);
lea(mem(rdx, rsi, 1), rbx)         // rbx = b + jj*cs_b;

lea(mem(r14), rax)                 // rax = a; restart the rows of a for this block

prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c

mov(var(k_iter32), rsi)            // i = k_iter32;
test(rsi, rsi)                     // check i via logical AND.
je(.SCONSIDKITER8)                 // if i == 0, jump to code that
                                   // contains the k_iter8 loop.
label(.SLOOPKITER32)               // MAIN LOOP: 4 unrolled steps of 8 floats each
                                   // (32 values of k per pass)

// Each step loads 8 consecutive floats from two rows of a (rax and
// rax + rs_a) and from four columns of b (rbx + {0,1,2,3}*cs_b), and
// accumulates the element-wise products:
//   row 0: ymm4 ymm7 ymm10 ymm13
//   row 1: ymm5 ymm8 ymm11 ymm14
// Both pointers advance by a fixed 32 bytes; cs_a and rs_b are never
// read, so the kernel relies on a and b being contiguous along k.

// ---------------------------------- iteration 0

vmovups(mem(rax ), ymm0)           // row 0 of a
vmovups(mem(rax, r8, 1), ymm1)     // row 1 of a
add(imm(8*4), rax)                 // a += 8 floats (32 bytes);

vmovups(mem(rbx ), ymm3)           // column jj+0 of b
vfmadd231ps(ymm0, ymm3, ymm4)
vfmadd231ps(ymm1, ymm3, ymm5)

vmovups(mem(rbx, r11, 1), ymm3)    // column jj+1 of b
vfmadd231ps(ymm0, ymm3, ymm7)
vfmadd231ps(ymm1, ymm3, ymm8)

vmovups(mem(rbx, r11, 2), ymm3)    // column jj+2 of b
vfmadd231ps(ymm0, ymm3, ymm10)
vfmadd231ps(ymm1, ymm3, ymm11)

vmovups(mem(rbx, r13, 1), ymm3)    // column jj+3 of b
add(imm(8*4), rbx)                 // b += 8 floats (32 bytes);
vfmadd231ps(ymm0, ymm3, ymm13)
vfmadd231ps(ymm1, ymm3, ymm14)

// ---------------------------------- iteration 1

vmovups(mem(rax ), ymm0)
vmovups(mem(rax, r8, 1), ymm1)
add(imm(8*4), rax)                 // a += 8 floats;

vmovups(mem(rbx ), ymm3)
vfmadd231ps(ymm0, ymm3, ymm4)
vfmadd231ps(ymm1, ymm3, ymm5)

vmovups(mem(rbx, r11, 1), ymm3)
vfmadd231ps(ymm0, ymm3, ymm7)
vfmadd231ps(ymm1, ymm3, ymm8)

vmovups(mem(rbx, r11, 2), ymm3)
vfmadd231ps(ymm0, ymm3, ymm10)
vfmadd231ps(ymm1, ymm3, ymm11)

vmovups(mem(rbx, r13, 1), ymm3)
add(imm(8*4), rbx)                 // b += 8 floats;
vfmadd231ps(ymm0, ymm3, ymm13)
vfmadd231ps(ymm1, ymm3, ymm14)

// ---------------------------------- iteration 2

vmovups(mem(rax ), ymm0)
vmovups(mem(rax, r8, 1), ymm1)
add(imm(8*4), rax)                 // a += 8 floats;

vmovups(mem(rbx ), ymm3)
vfmadd231ps(ymm0, ymm3, ymm4)
vfmadd231ps(ymm1, ymm3, ymm5)

vmovups(mem(rbx, r11, 1), ymm3)
vfmadd231ps(ymm0, ymm3, ymm7)
vfmadd231ps(ymm1, ymm3, ymm8)

vmovups(mem(rbx, r11, 2), ymm3)
vfmadd231ps(ymm0, ymm3, ymm10)
vfmadd231ps(ymm1, ymm3, ymm11)

vmovups(mem(rbx, r13, 1), ymm3)
add(imm(8*4), rbx)                 // b += 8 floats;
vfmadd231ps(ymm0, ymm3, ymm13)
vfmadd231ps(ymm1, ymm3, ymm14)

// ---------------------------------- iteration 3

vmovups(mem(rax ), ymm0)
vmovups(mem(rax, r8, 1), ymm1)
add(imm(8*4), rax)                 // a += 8 floats;

vmovups(mem(rbx ), ymm3)
vfmadd231ps(ymm0, ymm3, ymm4)
vfmadd231ps(ymm1, ymm3, ymm5)

vmovups(mem(rbx, r11, 1), ymm3)
vfmadd231ps(ymm0, ymm3, ymm7)
vfmadd231ps(ymm1, ymm3, ymm8)

vmovups(mem(rbx, r11, 2), ymm3)
vfmadd231ps(ymm0, ymm3, ymm10)
vfmadd231ps(ymm1, ymm3, ymm11)

vmovups(mem(rbx, r13, 1), ymm3)
add(imm(8*4), rbx)                 // b += 8 floats;
vfmadd231ps(ymm0, ymm3, ymm13)
vfmadd231ps(ymm1, ymm3, ymm14)

dec(rsi)                           // i -= 1;
jne(.SLOOPKITER32)                 // iterate again if i != 0.

label(.SCONSIDKITER8)

mov(var(k_iter8), rsi)             // i = k_iter8;
test(rsi, rsi)                     // check i via logical AND.
je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
                                   // considers k_left1 loop.
                                   // else, we prepare to enter k_iter8 loop.

label(.SLOOPKITER8)                // EDGE LOOP (ymm): 8 values of k per pass

vmovups(mem(rax ), ymm0)           // row 0 of a
vmovups(mem(rax, r8, 1), ymm1)     // row 1 of a
add(imm(8*4), rax)                 // a += 8 floats (32 bytes);

vmovups(mem(rbx ), ymm3)
vfmadd231ps(ymm0, ymm3, ymm4)
vfmadd231ps(ymm1, ymm3, ymm5)

vmovups(mem(rbx, r11, 1), ymm3)
vfmadd231ps(ymm0, ymm3, ymm7)
vfmadd231ps(ymm1, ymm3, ymm8)

vmovups(mem(rbx, r11, 2), ymm3)
vfmadd231ps(ymm0, ymm3, ymm10)
vfmadd231ps(ymm1, ymm3, ymm11)

vmovups(mem(rbx, r13, 1), ymm3)
add(imm(8*4), rbx)                 // b += 8 floats (32 bytes);
vfmadd231ps(ymm0, ymm3, ymm13)
vfmadd231ps(ymm1, ymm3, ymm14)

dec(rsi)                           // i -= 1;
jne(.SLOOPKITER8)                  // iterate again if i != 0.

label(.SCONSIDKLEFT1)

mov(var(k_left1), rsi)             // i = k_left1;
test(rsi, rsi)                     // check i via logical AND.
je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                   // else, we prepare to enter k_left1 loop.

label(.SLOOPKLEFT1)                // EDGE LOOP (scalar): 1 value of k per pass

// NOTE: The FMAs in this loop must name the ymm registers even though
// only one float is loaded per operand: a 128-bit (xmm) FMA would zero
// the upper half of the destination accumulator, which would destroy
// the partial sums built up by the loops above.
// Body of the scalar k_left1 loop: one float from each of the two rows of
// a and from each of the four columns of b. vmovss fills lane 0 (the VEX
// load form clears the remaining lanes), so the full-width FMAs only
// change lane 0 of each accumulator.
vmovss(mem(rax ), xmm0)            // row 0 of a
vmovss(mem(rax, r8, 1), xmm1)      // row 1 of a
add(imm(1*4), rax)                 // a += 1 float (4 bytes);

vmovss(mem(rbx ), xmm3)            // column jj+0 of b
vfmadd231ps(ymm0, ymm3, ymm4)
vfmadd231ps(ymm1, ymm3, ymm5)

vmovss(mem(rbx, r11, 1), xmm3)     // column jj+1 of b
vfmadd231ps(ymm0, ymm3, ymm7)
vfmadd231ps(ymm1, ymm3, ymm8)

vmovss(mem(rbx, r11, 2), xmm3)     // column jj+2 of b
vfmadd231ps(ymm0, ymm3, ymm10)
vfmadd231ps(ymm1, ymm3, ymm11)

vmovss(mem(rbx, r13, 1), xmm3)     // column jj+3 of b
add(imm(1*4), rbx)                 // b += 1 float (4 bytes);
vfmadd231ps(ymm0, ymm3, ymm13)
vfmadd231ps(ymm1, ymm3, ymm14)

dec(rsi)                           // i -= 1;
jne(.SLOOPKLEFT1)                  // iterate again if i != 0.

label(.SPOSTACCUM)

// Reduce each 8-lane accumulator to one scalar.
// row 0: ymm4 ymm7 ymm10 ymm13
// row 1: ymm5 ymm8 ymm11 ymm14

vhaddps( ymm7, ymm4, ymm0 )        // pairwise sums of ymm4 / ymm7 per 128-bit half
vextractf128(imm(1), ymm0, xmm1 )  // xmm1 = upper half
vaddps( xmm0, xmm1, xmm0 )
// xmm0[0]+xmm0[1] = sum(ymm4); xmm0[2]+xmm0[3] = sum(ymm7)

vhaddps( ymm13, ymm10, ymm2 )
vextractf128(imm(1), ymm2, xmm1 )
vaddps( xmm2, xmm1, xmm2 )
// xmm2[0]+xmm2[1] = sum(ymm10); xmm2[2]+xmm2[3] = sum(ymm13)

vhaddps(xmm2,xmm0,xmm4)

vhaddps( ymm8, ymm5, ymm0 )        // same reduction for row 1
vextractf128(imm(1), ymm0, xmm1 )
vaddps( xmm0, xmm1, xmm0 )

vhaddps( ymm14, ymm11, ymm2 )
vextractf128(imm(1), ymm2, xmm1 )
vaddps( xmm2, xmm1, xmm2 )

vhaddps(xmm2,xmm0,xmm5)

// xmm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
// xmm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)

mov(var(alpha), rax)               // load address of alpha
mov(var(beta), rbx)                // load address of beta
vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate

vmulps(xmm0, xmm4, xmm4)           // scale by alpha
vmulps(xmm0, xmm5, xmm5)

// now avoid loading C if beta == 0
// NOTE(review): vucomisd compares the low 64 bits as a double although
// beta is a float; confirm whether vucomiss was intended.
vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

label(.SROWSTORED)

vfmadd231ps(mem(rcx), xmm3,xmm4)   // xmm4 += beta * c(0, jj .. jj+3)
vmovups(xmm4, mem(rcx))
add(rdi, rcx)                      // advance to the second row of c

vfmadd231ps(mem(rcx), xmm3, xmm5)  // xmm5 += beta * c(1, jj .. jj+3)
vmovups(xmm5, mem(rcx))

jmp(.SDONE)                        // jump to end.
label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) label(.SDONE) add(imm(4), r15) // jj += 4; cmp(imm(8), r15) // compare jj to 4 jl(.SLOOP3X4J) // if jj <= 4, jump to beginning // of jj loop; otherwise, loop ends. label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rd_zen_asm_1x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() mov(var(a), r14) // load address of a. mov(var(b), rdx) // load address of b. 
// ---- Stride setup and top of the jj loop (one 1x4 strip of c per pass) ----
mov(var(cs_b), r11)                // r11 = cs_b (in elements)
lea(mem(, r11, 4), r11)            // r11 = cs_b * sizeof(float) (bytes)
lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b (bytes)

mov(var(c), r12)                   // r12 = base address of c

// Register roles inside the jj loop:
//   r12 = base of c,  rcx = c for the current strip
//   r14 = base of a,  rax = running pointer into a
//   rdx = base of b,  rbx = running pointer into b
//   r15 = n dim index jj (advanced by 4 at the bottom of the loop)
//   r9 is not referenced by this single-row kernel.

mov(imm(0), r15)                   // jj = 0;

label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 4 ]

// zen2 can execute 4 vxorpd ipc with
// a latency of 1 cycle.
// Clear the four accumulators, one per column of the strip.
vxorps(ymm4, ymm4, ymm4)
vxorps(ymm7, ymm7, ymm7)
vxorps(ymm10, ymm10, ymm10)
vxorps(ymm13, ymm13, ymm13)

lea(mem( , r15, 1), rsi)           // rsi = jj;
imul(imm(1*4), rsi)                // rsi = jj * 1 * sizeof(float); the column
                                   // stride of c is hard-coded to 1 here
                                   // (cs_c is not read).
lea(mem(r12, rsi, 1), rcx)         // rcx = c + jj floats;

lea(mem( , r15, 1), rsi)           // rsi = jj;
imul(r11, rsi)                     // rsi = jj * cs_b (bytes);
lea(mem(rdx, rsi, 1), rbx)         // rbx = b + jj*cs_b;

lea(mem(r14), rax)                 // rax = a; restart the row of a for this strip

prefetch(0, mem(rcx, 3*8))         // prefetch the row of c (address rcx + 24 bytes)

mov(var(k_iter32), rsi)            // i = k_iter32;
test(rsi, rsi)                     // check i via logical AND.
je(.SCONSIDKITER8)                 // if i == 0, jump to code that
                                   // contains the k_iter8 loop.
label(.SLOOPKITER32)               // MAIN LOOP: 4 unrolled steps of 8 floats each
                                   // (32 values of k per pass)

// Each step loads 8 consecutive floats of the row of a and of four columns
// of b (rbx, rbx+cs_b, rbx+2*cs_b, rbx+3*cs_b) and accumulates the
// element-wise products into ymm4/ymm7/ymm10/ymm13. Both pointers advance
// by a fixed 32 bytes; cs_a and rs_b are never read, so the kernel relies
// on a and b being contiguous along k.

// ---------------------------------- iteration 0

vmovups(mem(rax ), ymm0)           // ymm0 = a[k .. k+7]
add(imm(8*4), rax)                 // a += 8 floats (32 bytes);

vmovups(mem(rbx ), ymm3)           // column jj+0 of b
vfmadd231ps(ymm0, ymm3, ymm4)      // ymm4 += a * b(:,jj+0)

vmovups(mem(rbx, r11, 1), ymm3)    // column jj+1 of b
vfmadd231ps(ymm0, ymm3, ymm7)

vmovups(mem(rbx, r11, 2), ymm3)    // column jj+2 of b
vfmadd231ps(ymm0, ymm3, ymm10)

vmovups(mem(rbx, r13, 1), ymm3)    // column jj+3 of b
add(imm(8*4), rbx)                 // b += 8 floats (32 bytes);
vfmadd231ps(ymm0, ymm3, ymm13)

// ---------------------------------- iteration 1

vmovups(mem(rax ), ymm0)
add(imm(8*4), rax)                 // a += 8 floats;

vmovups(mem(rbx ), ymm3)
vfmadd231ps(ymm0, ymm3, ymm4)

vmovups(mem(rbx, r11, 1), ymm3)
vfmadd231ps(ymm0, ymm3, ymm7)

vmovups(mem(rbx, r11, 2), ymm3)
vfmadd231ps(ymm0, ymm3, ymm10)

vmovups(mem(rbx, r13, 1), ymm3)
add(imm(8*4), rbx)                 // b += 8 floats;
vfmadd231ps(ymm0, ymm3, ymm13)

// ---------------------------------- iteration 2

vmovups(mem(rax ), ymm0)
add(imm(8*4), rax)                 // a += 8 floats;

vmovups(mem(rbx ), ymm3)
vfmadd231ps(ymm0, ymm3, ymm4)

vmovups(mem(rbx, r11, 1), ymm3)
vfmadd231ps(ymm0, ymm3, ymm7)

vmovups(mem(rbx, r11, 2), ymm3)
vfmadd231ps(ymm0, ymm3, ymm10)

vmovups(mem(rbx, r13, 1), ymm3)
add(imm(8*4), rbx)                 // b += 8 floats;
vfmadd231ps(ymm0, ymm3, ymm13)

// ---------------------------------- iteration 3

vmovups(mem(rax ), ymm0)
add(imm(8*4), rax)                 // a += 8 floats;

vmovups(mem(rbx ), ymm3)
vfmadd231ps(ymm0, ymm3, ymm4)

vmovups(mem(rbx, r11, 1), ymm3)
vfmadd231ps(ymm0, ymm3, ymm7)

vmovups(mem(rbx, r11, 2), ymm3)
vfmadd231ps(ymm0, ymm3, ymm10)

vmovups(mem(rbx, r13, 1), ymm3)
add(imm(8*4), rbx)                 // b += 8 floats;
vfmadd231ps(ymm0, ymm3, ymm13)

dec(rsi)                           // i -= 1;
jne(.SLOOPKITER32)                 // iterate again if i != 0.

label(.SCONSIDKITER8)

mov(var(k_iter8), rsi)             // i = k_iter8;
test(rsi, rsi)                     // check i via logical AND.
je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
                                   // considers k_left1 loop.
                                   // else, we prepare to enter k_iter8 loop.
label(.SLOOPKITER8)                // EDGE LOOP (ymm): 8 values of k per pass

vmovups(mem(rax ), ymm0)           // ymm0 = a[k .. k+7]
add(imm(8*4), rax)                 // a += 8 floats (32 bytes);

vmovups(mem(rbx ), ymm3)           // column jj+0 of b
vfmadd231ps(ymm0, ymm3, ymm4)

vmovups(mem(rbx, r11, 1), ymm3)    // column jj+1 of b
vfmadd231ps(ymm0, ymm3, ymm7)

vmovups(mem(rbx, r11, 2), ymm3)    // column jj+2 of b
vfmadd231ps(ymm0, ymm3, ymm10)

vmovups(mem(rbx, r13, 1), ymm3)    // column jj+3 of b
add(imm(8*4), rbx)                 // b += 8 floats (32 bytes);
vfmadd231ps(ymm0, ymm3, ymm13)

dec(rsi)                           // i -= 1;
jne(.SLOOPKITER8)                  // iterate again if i != 0.

label(.SCONSIDKLEFT1)

mov(var(k_left1), rsi)             // i = k_left1;
test(rsi, rsi)                     // check i via logical AND.
je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                   // else, we prepare to enter k_left1 loop.

label(.SLOOPKLEFT1)                // EDGE LOOP (scalar): 1 value of k per pass

// NOTE: The operands are loaded with vmovss (one float into lane 0; the
// VEX load form clears the remaining lanes), but the FMAs must still name
// the ymm registers: a 128-bit (xmm) FMA would zero the upper half of the
// destination accumulator, which would destroy the partial sums built up
// by the loops above.

vmovss(mem(rax ), xmm0)            // xmm0[0] = a[k]
add(imm(1*4), rax)                 // a += 1 float (4 bytes);

vmovss(mem(rbx ), xmm3)            // column jj+0 of b
vfmadd231ps(ymm0, ymm3, ymm4)

vmovss(mem(rbx, r11, 1), xmm3)     // column jj+1 of b
vfmadd231ps(ymm0, ymm3, ymm7)

vmovss(mem(rbx, r11, 2), xmm3)     // column jj+2 of b
vfmadd231ps(ymm0, ymm3, ymm10)

vmovss(mem(rbx, r13, 1), xmm3)     // column jj+3 of b
add(imm(1*4), rbx)                 // b += 1 float (4 bytes);
vfmadd231ps(ymm0, ymm3, ymm13)

dec(rsi)                           // i -= 1;
jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
label(.SPOSTACCUM)

// Reduce the four 8-lane accumulators to one scalar each.
// ymm4 ymm7 ymm10 ymm13 hold the partial dot products for columns
// jj+0 .. jj+3 of the current strip.

vhaddps( ymm7, ymm4, ymm0 )        // pairwise sums of ymm4 / ymm7 per 128-bit half
vextractf128(imm(1), ymm0, xmm1 )  // xmm1 = upper half
vaddps( xmm0, xmm1, xmm0 )
// xmm0[0]+xmm0[1] = sum(ymm4); xmm0[2]+xmm0[3] = sum(ymm7)

vhaddps( ymm13, ymm10, ymm2 )
vextractf128(imm(1), ymm2, xmm1 )
vaddps( xmm2, xmm1, xmm2 )
// xmm2[0]+xmm2[1] = sum(ymm10); xmm2[2]+xmm2[3] = sum(ymm13)

vhaddps(xmm2,xmm0,xmm4)
// xmm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)

mov(var(alpha), rax)               // load address of alpha
mov(var(beta), rbx)                // load address of beta
vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate

vmulps(xmm0, xmm4, xmm4)           // scale by alpha

// now avoid loading C if beta == 0
// NOTE(review): vucomisd compares the low 64 bits as a double although
// beta is a float; confirm whether vucomiss was intended.
vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

label(.SROWSTORED)

vfmadd231ps(mem(rcx), xmm3,xmm4)   // xmm4 += beta * c[jj .. jj+3]
vmovups(xmm4, mem(rcx))            // store the 4 results

jmp(.SDONE)                        // jump to end.

label(.SBETAZERO)

label(.SROWSTORBZ)

vmovups(xmm4, mem(rcx))            // beta == 0: store without reading c

label(.SDONE)

add(imm(4), r15)                   // jj += 4;
cmp(imm(4), r15)                   // compare jj to 4
jle(.SLOOP3X4J)                    // if jj <= 4, jump to beginning
                                   // of jj loop; otherwise, loop ends.
                                   // (jj takes the values 0 and 4.)
label(.SRETURN) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rd_zen_asm_2x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() // zen2 can execute 4 vxorpd ipc with // a latency of 1 cycle vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) mov(var(b), rbx) // load address of b. 
// ---- Remaining stride setup (rax = a, r8 = rs_a in bytes and rbx = b are
// loaded just before this point), then the 32-wide and 8-wide k loops ----
mov(var(cs_b), r11)                // r11 = cs_b (in elements)
lea(mem(, r11, 4), r11)            // r11 = cs_b * sizeof(float) (bytes)
lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b (bytes)

mov(var(c), rcx)                   // rcx = address of c
mov(var(rs_c), rdi)                // rdi = rs_c (in elements)
lea(mem(, rdi, 4), rdi)            // rdi = rs_c * sizeof(float) (bytes)

prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c

mov(var(k_iter32), rsi)            // i = k_iter32;
test(rsi, rsi)                     // check i via logical AND.
je(.SCONSIDKITER8)                 // if i == 0, jump to code that
                                   // contains the k_iter8 loop.

label(.SLOOPKITER32)               // MAIN LOOP: 4 unrolled steps of 8 floats each
                                   // (32 values of k per pass)

// Each step loads 8 consecutive floats from two rows of a (rax and
// rax + rs_a) and from four columns of b (rbx + {0,1,2,3}*cs_b), and
// accumulates the element-wise products:
//   row 0: ymm4 ymm7 ymm10 ymm13
//   row 1: ymm5 ymm8 ymm11 ymm14
// Both pointers advance by a fixed 32 bytes; cs_a and rs_b are never
// read, so the kernel relies on a and b being contiguous along k.

// ---------------------------------- iteration 0

vmovups(mem(rax ), ymm0)           // row 0 of a
vmovups(mem(rax, r8, 1), ymm1)     // row 1 of a
add(imm(8*4), rax)                 // a += 8 floats (32 bytes);

vmovups(mem(rbx ), ymm3)           // column 0 of b
vfmadd231ps(ymm0, ymm3, ymm4)
vfmadd231ps(ymm1, ymm3, ymm5)

vmovups(mem(rbx, r11, 1), ymm3)    // column 1 of b
vfmadd231ps(ymm0, ymm3, ymm7)
vfmadd231ps(ymm1, ymm3, ymm8)

vmovups(mem(rbx, r11, 2), ymm3)    // column 2 of b
vfmadd231ps(ymm0, ymm3, ymm10)
vfmadd231ps(ymm1, ymm3, ymm11)

vmovups(mem(rbx, r13, 1), ymm3)    // column 3 of b
add(imm(8*4), rbx)                 // b += 8 floats (32 bytes);
vfmadd231ps(ymm0, ymm3, ymm13)
vfmadd231ps(ymm1, ymm3, ymm14)

// ---------------------------------- iteration 1

vmovups(mem(rax ), ymm0)
vmovups(mem(rax, r8, 1), ymm1)
add(imm(8*4), rax)                 // a += 8 floats;

vmovups(mem(rbx ), ymm3)
vfmadd231ps(ymm0, ymm3, ymm4)
vfmadd231ps(ymm1, ymm3, ymm5)

vmovups(mem(rbx, r11, 1), ymm3)
vfmadd231ps(ymm0, ymm3, ymm7)
vfmadd231ps(ymm1, ymm3, ymm8)

vmovups(mem(rbx, r11, 2), ymm3)
vfmadd231ps(ymm0, ymm3, ymm10)
vfmadd231ps(ymm1, ymm3, ymm11)

vmovups(mem(rbx, r13, 1), ymm3)
add(imm(8*4), rbx)                 // b += 8 floats;
vfmadd231ps(ymm0, ymm3, ymm13)
vfmadd231ps(ymm1, ymm3, ymm14)

// ---------------------------------- iteration 2

vmovups(mem(rax ), ymm0)
vmovups(mem(rax, r8, 1), ymm1)
add(imm(8*4), rax)                 // a += 8 floats;

vmovups(mem(rbx ), ymm3)
vfmadd231ps(ymm0, ymm3, ymm4)
vfmadd231ps(ymm1, ymm3, ymm5)

vmovups(mem(rbx, r11, 1), ymm3)
vfmadd231ps(ymm0, ymm3, ymm7)
vfmadd231ps(ymm1, ymm3, ymm8)

vmovups(mem(rbx, r11, 2), ymm3)
vfmadd231ps(ymm0, ymm3, ymm10)
vfmadd231ps(ymm1, ymm3, ymm11)

vmovups(mem(rbx, r13, 1), ymm3)
add(imm(8*4), rbx)                 // b += 8 floats;
vfmadd231ps(ymm0, ymm3, ymm13)
vfmadd231ps(ymm1, ymm3, ymm14)

// ---------------------------------- iteration 3

vmovups(mem(rax ), ymm0)
vmovups(mem(rax, r8, 1), ymm1)
add(imm(8*4), rax)                 // a += 8 floats;

vmovups(mem(rbx ), ymm3)
vfmadd231ps(ymm0, ymm3, ymm4)
vfmadd231ps(ymm1, ymm3, ymm5)

vmovups(mem(rbx, r11, 1), ymm3)
vfmadd231ps(ymm0, ymm3, ymm7)
vfmadd231ps(ymm1, ymm3, ymm8)

vmovups(mem(rbx, r11, 2), ymm3)
vfmadd231ps(ymm0, ymm3, ymm10)
vfmadd231ps(ymm1, ymm3, ymm11)

vmovups(mem(rbx, r13, 1), ymm3)
add(imm(8*4), rbx)                 // b += 8 floats;
vfmadd231ps(ymm0, ymm3, ymm13)
vfmadd231ps(ymm1, ymm3, ymm14)

dec(rsi)                           // i -= 1;
jne(.SLOOPKITER32)                 // iterate again if i != 0.

label(.SCONSIDKITER8)

mov(var(k_iter8), rsi)             // i = k_iter8;
test(rsi, rsi)                     // check i via logical AND.
je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
                                   // considers k_left1 loop.
                                   // else, we prepare to enter k_iter8 loop.

label(.SLOOPKITER8)                // EDGE LOOP (ymm): 8 values of k per pass

vmovups(mem(rax ), ymm0)           // row 0 of a
vmovups(mem(rax, r8, 1), ymm1)     // row 1 of a
add(imm(8*4), rax)                 // a += 8 floats (32 bytes);

vmovups(mem(rbx ), ymm3)
vfmadd231ps(ymm0, ymm3, ymm4)
vfmadd231ps(ymm1, ymm3, ymm5)

vmovups(mem(rbx, r11, 1), ymm3)
vfmadd231ps(ymm0, ymm3, ymm7)
vfmadd231ps(ymm1, ymm3, ymm8)

vmovups(mem(rbx, r11, 2), ymm3)
vfmadd231ps(ymm0, ymm3, ymm10)
vfmadd231ps(ymm1, ymm3, ymm11)

vmovups(mem(rbx, r13, 1), ymm3)
add(imm(8*4), rbx)                 // b += 8 floats (32 bytes);
vfmadd231ps(ymm0, ymm3, ymm13)
vfmadd231ps(ymm1, ymm3, ymm14)

dec(rsi)                           // i -= 1;
jne(.SLOOPKITER8)                  // iterate again if i != 0.

label(.SCONSIDKLEFT1)

mov(var(k_left1), rsi)             // i = k_left1;
test(rsi, rsi)                     // check i via logical AND.
je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
                                   // else, we prepare to enter k_left1 loop.
label(.SLOOPKLEFT1)                // EDGE LOOP (scalar): 1 value of k per pass

// NOTE: The operands are loaded with vmovss (one float into lane 0; the
// VEX load form clears the remaining lanes), but the FMAs must still name
// the ymm registers: a 128-bit (xmm) FMA would zero the upper half of the
// destination accumulator, which would destroy the partial sums built up
// by the loops above.

vmovss(mem(rax ), xmm0)            // row 0 of a
vmovss(mem(rax, r8, 1), xmm1)      // row 1 of a
add(imm(1*4), rax)                 // a += 1 float (4 bytes);

vmovss(mem(rbx ), xmm3)            // column 0 of b
vfmadd231ps(ymm0, ymm3, ymm4)
vfmadd231ps(ymm1, ymm3, ymm5)

vmovss(mem(rbx, r11, 1), xmm3)     // column 1 of b
vfmadd231ps(ymm0, ymm3, ymm7)
vfmadd231ps(ymm1, ymm3, ymm8)

vmovss(mem(rbx, r11, 2), xmm3)     // column 2 of b
vfmadd231ps(ymm0, ymm3, ymm10)
vfmadd231ps(ymm1, ymm3, ymm11)

vmovss(mem(rbx, r13, 1), xmm3)     // column 3 of b
add(imm(1*4), rbx)                 // b += 1 float (4 bytes);
vfmadd231ps(ymm0, ymm3, ymm13)
vfmadd231ps(ymm1, ymm3, ymm14)

dec(rsi)                           // i -= 1;
jne(.SLOOPKLEFT1)                  // iterate again if i != 0.

label(.SPOSTACCUM)

// Reduce each 8-lane accumulator to one scalar.
// row 0: ymm4 ymm7 ymm10 ymm13
// row 1: ymm5 ymm8 ymm11 ymm14

vhaddps( ymm7, ymm4, ymm0 )        // pairwise sums of ymm4 / ymm7 per 128-bit half
vextractf128(imm(1), ymm0, xmm1 )  // xmm1 = upper half
vaddps( xmm0, xmm1, xmm0 )
// xmm0[0]+xmm0[1] = sum(ymm4); xmm0[2]+xmm0[3] = sum(ymm7)

vhaddps( ymm13, ymm10, ymm2 )
vextractf128(imm(1), ymm2, xmm1 )
vaddps( xmm2, xmm1, xmm2 )
// xmm2[0]+xmm2[1] = sum(ymm10); xmm2[2]+xmm2[3] = sum(ymm13)

vhaddps(xmm2,xmm0,xmm4)

vhaddps( ymm8, ymm5, ymm0 )        // same reduction for row 1
vextractf128(imm(1), ymm0, xmm1 )
vaddps( xmm0, xmm1, xmm0 )

vhaddps( ymm14, ymm11, ymm2 )
vextractf128(imm(1), ymm2, xmm1 )
vaddps( xmm2, xmm1, xmm2 )

vhaddps(xmm2,xmm0,xmm5)

// xmm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
// xmm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)

mov(var(alpha), rax)               // load address of alpha
mov(var(beta), rbx)                // load address of beta
vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate

vmulps(xmm0, xmm4, xmm4)           // scale by alpha
vmulps(xmm0, xmm5, xmm5)

// now avoid loading C if beta == 0
// NOTE(review): vucomisd compares the low 64 bits as a double although
// beta is a float; confirm whether vucomiss was intended.
vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
vucomisd(xmm0, xmm3)               // set ZF if beta == 0; the flags are
                                   // consumed by the je that follows.
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case
label(.SROWSTORED)
vfmadd231ps(mem(rcx), xmm3,xmm4)
vmovups(xmm4, mem(rcx))
add(rdi, rcx)
vfmadd231ps(mem(rcx), xmm3, xmm5)
vmovups(xmm5, mem(rcx))
jmp(.SDONE) // jump to end.
label(.SBETAZERO)
label(.SROWSTORBZ)
vmovups(xmm4, mem(rcx))
add(rdi, rcx)
vmovups(xmm5, mem(rcx))
label(.SDONE)
end_asm(
: // output operands (none)
: // input operands
  [k_iter32] "m" (k_iter32),
  [k_iter8] "m" (k_iter8),
  [k_left1] "m" (k_left1),
  [a] "m" (a),
  [rs_a] "m" (rs_a),
  [cs_a] "m" (cs_a),
  [b] "m" (b),
  [rs_b] "m" (rs_b),
  [cs_b] "m" (cs_b),
  [alpha] "m" (alpha),
  [beta] "m" (beta),
  [c] "m" (c),
  [rs_c] "m" (rs_c),
  [cs_c] "m" (cs_c)/*,
  [a_next] "m" (a_next),
  [b_next] "m" (b_next)*/
: // register clobber list
  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
  "xmm0", "xmm1", "xmm2", "xmm3",
  "xmm4", "xmm5", "xmm6", "xmm7",
  "xmm8", "xmm9", "xmm10", "xmm11",
  "xmm12", "xmm13", "xmm14", "xmm15",
  "memory"
)
}

// -----------------------------------------------------------------------------
// bli_sgemmsup_rd_zen_asm_1x4
//
// Single-precision dot-product kernel for one row of A against four columns
// of B:
//
//   c[j] := beta * c[j] + alpha * sum_{p<k0} a[p] * b[p + j*cs_b0],  j = 0..3
//
// Memory access performed by the code below:
//   - a   : k0 consecutive floats starting at `a`.
//   - b   : four runs of k0 consecutive floats, starting at b + j*cs_b0.
//   - c   : four consecutive floats starting at `c`.
//   - alpha, beta: one float each, read through the pointers.
//
// The k dimension is consumed in three phases: k0/32 passes of a 4x-unrolled
// loop (8 floats per step), then (k0%32)/8 single 8-float steps, then k0%8
// scalar steps.
//
// When the beta test succeeds, c is overwritten without being read.
//
// conja, conjb, m0, n0, rs_a0, cs_a0, rs_b0, rs_c0, cs_c0, data and cntx are
// not referenced by the instructions below (the stride copies are listed as
// asm operands but no instruction names them).
// -----------------------------------------------------------------------------
void bli_sgemmsup_rd_zen_asm_1x4
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter32 = k0 / 32;       // passes of the 4x-unrolled main loop
    uint64_t k_left32 = k0 % 32;
    uint64_t k_iter8 = k_left32 / 8;   // remaining full 8-float steps
    uint64_t k_left1 = k_left32 % 8;   // remaining single elements
    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0;
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    // Zero the four accumulators, one per column of B.
    // (Original note: zen2 can execute 4 vxorpd ipc with a latency of 1 cycle.)
    vxorps(ymm4, ymm4, ymm4)
    vxorps(ymm7, ymm7, ymm7)
    vxorps(ymm10, ymm10, ymm10)
    vxorps(ymm13, ymm13, ymm13)

    mov(var(a), rax) // load address of a.
    mov(var(b), rbx) // load address of b.
    mov(var(cs_b), r11) // load cs_b
    lea(mem(, r11, 4), r11) // cs_b *= sizeof(float)
    lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b

    mov(var(c), rcx) // load address of c
    prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c

    mov(var(k_iter32), rsi) // i = k_iter32;
    test(rsi, rsi) // check i via logical AND.
    je(.SCONSIDKITER8) // if i == 0, jump to code that
                       // contains the k_iter8 loop.

    label(.SLOOPKITER32) // MAIN LOOP

    // ---------------------------------- iteration 0
    vmovups(mem(rax ), ymm0) // 8 floats of a
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vmovups(mem(rbx ), ymm3) // 8 floats of b column 0
    vfmadd231ps(ymm0, ymm3, ymm4)
    vmovups(mem(rbx, r11, 1), ymm3) // b column 1
    vfmadd231ps(ymm0, ymm3, ymm7)
    vmovups(mem(rbx, r11, 2), ymm3) // b column 2
    vfmadd231ps(ymm0, ymm3, ymm10)
    vmovups(mem(rbx, r13, 1), ymm3) // b column 3
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm13)

    // ---------------------------------- iteration 1
    vmovups(mem(rax ), ymm0)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vmovups(mem(rbx ), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vmovups(mem(rbx, r11, 1), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm7)
    vmovups(mem(rbx, r11, 2), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vmovups(mem(rbx, r13, 1), ymm3)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm13)

    // ---------------------------------- iteration 2
    vmovups(mem(rax ), ymm0)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vmovups(mem(rbx ), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vmovups(mem(rbx, r11, 1), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm7)
    vmovups(mem(rbx, r11, 2), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vmovups(mem(rbx, r13, 1), ymm3)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm13)

    // ---------------------------------- iteration 3
    vmovups(mem(rax ), ymm0)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vmovups(mem(rbx ), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vmovups(mem(rbx, r11, 1), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm7)
    vmovups(mem(rbx, r11, 2), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vmovups(mem(rbx, r13, 1), ymm3)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm13)

    dec(rsi) // i -= 1;
    jne(.SLOOPKITER32) // iterate again if i != 0.

    label(.SCONSIDKITER8)

    mov(var(k_iter8), rsi) // i = k_iter8;
    test(rsi, rsi) // check i via logical AND.
    je(.SCONSIDKLEFT1) // if i == 0, jump to code that
                       // considers k_left1 loop.
                       // else, we prepare to enter k_iter8 loop.

    label(.SLOOPKITER8) // EDGE LOOP (ymm)

    vmovups(mem(rax ), ymm0)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vmovups(mem(rbx ), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vmovups(mem(rbx, r11, 1), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm7)
    vmovups(mem(rbx, r11, 2), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vmovups(mem(rbx, r13, 1), ymm3)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm13)

    dec(rsi) // i -= 1;
    jne(.SLOOPKITER8) // iterate again if i != 0.

    label(.SCONSIDKLEFT1)

    mov(var(k_left1), rsi) // i = k_left1;
    test(rsi, rsi) // check i via logical AND.
    je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
                    // else, we prepare to enter k_left1 loop.

    label(.SLOOPKLEFT1) // EDGE LOOP (scalar)

    // NOTE: We must use ymm registers here bc
    // using the xmm registers would zero out the
    // high bits of the destination registers,
    // which would destroy intermediate results.
    vmovss(mem(rax ), xmm0) // one float of a
    add(imm(1*4), rax) // a += 1 float (4 bytes)
    vmovss(mem(rbx ), xmm3) // one float of b column 0
    vfmadd231ps(ymm0, ymm3, ymm4)
    vmovss(mem(rbx, r11, 1), xmm3)
    vfmadd231ps(ymm0, ymm3, ymm7)
    vmovss(mem(rbx, r11, 2), xmm3)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vmovss(mem(rbx, r13, 1), xmm3)
    add(imm(1*4), rbx) // b += 1 float (4 bytes)
    vfmadd231ps(ymm0, ymm3, ymm13)

    dec(rsi) // i -= 1;
    jne(.SLOOPKLEFT1) // iterate again if i != 0.

    label(.SPOSTACCUM)

    // Accumulators to reduce horizontally:
    // ymm4 ymm7 ymm10 ymm13
    vhaddps( ymm7, ymm4, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddps( xmm0, xmm1, xmm0 ) // xmm0 = pairwise partial sums of ymm4 and ymm7
    vhaddps( ymm13, ymm10, ymm2 )
    vextractf128(imm(1), ymm2, xmm1 )
    vaddps( xmm2, xmm1, xmm2 ) // xmm2 = pairwise partial sums of ymm10 and ymm13
    vhaddps(xmm2,xmm0,xmm4)
    // xmm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)

    mov(var(alpha), rax) // load address of alpha
    mov(var(beta), rbx) // load address of beta
    vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
    vbroadcastss(mem(rbx), ymm3) // load beta and duplicate

    vmulps(xmm0, xmm4, xmm4) // scale by alpha

    // now avoid loading C if beta == 0
    // NOTE(review): vucomisd is the double-precision compare, while beta is
    // a float; vucomiss looks like the intended instruction -- confirm.
    vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
    vucomisd(xmm0, xmm3) // set ZF if beta == 0.
    je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case

    label(.SROWSTORED)

    vfmadd231ps(mem(rcx), xmm3,xmm4) // xmm4 += beta * c[0..3]
    vmovups(xmm4, mem(rcx)) // store four floats of c

    jmp(.SDONE) // jump to end.

    label(.SBETAZERO)

    label(.SROWSTORBZ)

    vmovups(xmm4, mem(rcx)) // store four floats of c, c never read

    label(.SDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter32] "m" (k_iter32),
      [k_iter8] "m" (k_iter8),
      [k_left1] "m" (k_left1),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

// -----------------------------------------------------------------------------
// bli_sgemmsup_rd_zen_asm_2x2
//
// Single-precision dot-product kernel for two rows of A against two columns
// of B:
//
//   c[i*rs_c0 + j] := beta * c[i*rs_c0 + j]
//                     + alpha * sum_{p<k0} a[i*rs_a0 + p] * b[p + j*cs_b0]
//   for i = 0..1, j = 0..1
//
// Memory access performed by the code below:
//   - a   : two runs of k0 consecutive floats, starting at a + i*rs_a0.
//   - b   : two runs of k0 consecutive floats, starting at b + j*cs_b0.
//   - c   : two consecutive floats at c + i*rs_c0 for each row i.
//
// The k dimension is split exactly as in the other kernels of this family:
// k0/32 passes of a 4x-unrolled 8-float loop, (k0%32)/8 8-float steps, then
// k0%8 scalar steps.
//
// When the beta test succeeds, c is overwritten without being read.
//
// conja, conjb, m0, n0, cs_a0, rs_b0, cs_c0, data and cntx are not
// referenced by the instructions below.
// -----------------------------------------------------------------------------
void bli_sgemmsup_rd_zen_asm_2x2
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter32 = k0 / 32;       // passes of the 4x-unrolled main loop
    uint64_t k_left32 = k0 % 32;
    uint64_t k_iter8 = k_left32 / 8;   // remaining full 8-float steps
    uint64_t k_left1 = k_left32 % 8;   // remaining single elements
    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0;
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    // Zero the four accumulators:
    //   ymm4 = row 0 x col 0, ymm5 = row 0 x col 1,
    //   ymm6 = row 1 x col 0, ymm7 = row 1 x col 1.
    // (Original note: zen2 can execute 4 vxorpd ipc with a latency of 1 cycle.)
    vxorps(ymm4, ymm4, ymm4)
    vxorps(ymm5, ymm5, ymm5)
    vxorps(ymm6, ymm6, ymm6)
    vxorps(ymm7, ymm7, ymm7)

    mov(var(a), rax) // load address of a.
    mov(var(rs_a), r8) // load rs_a
    lea(mem(, r8, 4), r8) // rs_a *= sizeof(float)

    mov(var(b), rbx) // load address of b.
    mov(var(cs_b), r11) // load cs_b
    lea(mem(, r11, 4), r11) // cs_b *= sizeof(float)

    mov(var(c), rcx) // load address of c
    mov(var(rs_c), rdi) // load rs_c
    lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)

    prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c

    mov(var(k_iter32), rsi) // i = k_iter32;
    test(rsi, rsi) // check i via logical AND.
    je(.SCONSIDKITER8) // if i == 0, jump to code that
                       // contains the k_iter8 loop.

    label(.SLOOPKITER32) // MAIN LOOP

    // ---------------------------------- iteration 0
    vmovups(mem(rbx ), ymm0) // 8 floats of b column 0
    vmovups(mem(rbx, r11, 1), ymm1) // 8 floats of b column 1
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)
    vmovups(mem(rax ), ymm3) // 8 floats of a row 0
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)
    vmovups(mem(rax, r8, 1), ymm3) // 8 floats of a row 1
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    // ---------------------------------- iteration 1
    vmovups(mem(rbx ), ymm0)
    vmovups(mem(rbx, r11, 1), ymm1)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)
    vmovups(mem(rax ), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)
    vmovups(mem(rax, r8, 1), ymm3)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    // ---------------------------------- iteration 2
    vmovups(mem(rbx ), ymm0)
    vmovups(mem(rbx, r11, 1), ymm1)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)
    vmovups(mem(rax ), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)
    vmovups(mem(rax, r8, 1), ymm3)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    // ---------------------------------- iteration 3
    vmovups(mem(rbx ), ymm0)
    vmovups(mem(rbx, r11, 1), ymm1)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)
    vmovups(mem(rax ), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)
    vmovups(mem(rax, r8, 1), ymm3)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    dec(rsi) // i -= 1;
    jne(.SLOOPKITER32) // iterate again if i != 0.

    label(.SCONSIDKITER8)

    mov(var(k_iter8), rsi) // i = k_iter8;
    test(rsi, rsi) // check i via logical AND.
    je(.SCONSIDKLEFT1) // if i == 0, jump to code that
                       // considers k_left1 loop.
                       // else, we prepare to enter k_iter8 loop.

    label(.SLOOPKITER8) // EDGE LOOP (ymm)

    vmovups(mem(rbx ), ymm0)
    vmovups(mem(rbx, r11, 1), ymm1)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)
    vmovups(mem(rax ), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)
    vmovups(mem(rax, r8, 1), ymm3)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    dec(rsi) // i -= 1;
    jne(.SLOOPKITER8) // iterate again if i != 0.

    label(.SCONSIDKLEFT1)

    mov(var(k_left1), rsi) // i = k_left1;
    test(rsi, rsi) // check i via logical AND.
    je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
                    // else, we prepare to enter k_left1 loop.

    label(.SLOOPKLEFT1) // EDGE LOOP (scalar)

    // NOTE: We must use ymm registers here bc
    // using the xmm registers would zero out the
    // high bits of the destination registers,
    // which would destroy intermediate results.
    vmovss(mem(rbx ), xmm0)
    vmovss(mem(rbx, r11, 1), xmm1)
    add(imm(1*4), rbx) // b += 1 float (4 bytes)
    vmovss(mem(rax ), xmm3)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)
    vmovss(mem(rax, r8, 1), xmm3)
    add(imm(1*4), rax) // a += 1 float (4 bytes)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    dec(rsi) // i -= 1;
    jne(.SLOOPKLEFT1) // iterate again if i != 0.

    label(.SPOSTACCUM)

    // Accumulators to reduce horizontally:
    // ymm4 ymm5
    // ymm6 ymm7
    vhaddps( ymm5, ymm4, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddps( xmm0, xmm1, xmm0 ) // xmm0 = pairwise partial sums of ymm4 and ymm5
    vhaddps(xmm0,xmm0,xmm4)

    vhaddps( ymm7, ymm6, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddps( xmm0, xmm1, xmm0 ) // xmm0 = pairwise partial sums of ymm6 and ymm7
    vhaddps(xmm0,xmm0,xmm6)

    // xmm4 = sum(ymm4) sum(ymm5)
    // xmm6 = sum(ymm6) sum(ymm7)

    mov(var(alpha), rax) // load address of alpha
    mov(var(beta), rbx) // load address of beta
    vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
    vbroadcastss(mem(rbx), ymm3) // load beta and duplicate

    vmulps(xmm0, xmm4, xmm4) // scale by alpha
    vmulps(xmm0, xmm6, xmm6)

    // now avoid loading C if beta == 0
    // NOTE(review): vucomisd is the double-precision compare, while beta is
    // a float; vucomiss looks like the intended instruction -- confirm.
    vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
    vucomisd(xmm0, xmm3) // set ZF if beta == 0.
    je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case

    label(.SROWSTORED)

    vmovsd(mem(rcx), xmm0) // load two floats (64 bits) of c row 0
    vfmadd231ps(xmm0, xmm3, xmm4) // xmm4 += beta * c
    vmovsd(xmm4, mem(rcx)) // store two floats of c row 0
    add(rdi, rcx) // advance to c row 1

    vmovsd(mem(rcx), xmm0)
    vfmadd231ps(xmm0, xmm3, xmm6)
    vmovsd(xmm6, mem(rcx))

    jmp(.SDONE) // jump to end.

    label(.SBETAZERO)

    label(.SROWSTORBZ)

    vmovsd(xmm4, mem(rcx)) // store row 0, c never read
    add(rdi, rcx)
    vmovsd(xmm6, mem(rcx)) // store row 1

    label(.SDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter32] "m" (k_iter32),
      [k_iter8] "m" (k_iter8),
      [k_left1] "m" (k_left1),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

// -----------------------------------------------------------------------------
// bli_sgemmsup_rd_zen_asm_1x2
//
// Single-precision dot-product kernel for one row of A against two columns
// of B:
//
//   c[j] := beta * c[j] + alpha * sum_{p<k0} a[p] * b[p + j*cs_b0],  j = 0..1
//
// Memory access performed by the code below:
//   - a   : k0 consecutive floats starting at `a`.
//   - b   : two runs of k0 consecutive floats, starting at b + j*cs_b0.
//   - c   : two consecutive floats starting at `c`.
//
// The k dimension is split into k0/32 passes of a 4x-unrolled 8-float loop,
// (k0%32)/8 8-float steps, then k0%8 scalar steps.
//
// When the beta test succeeds, c is overwritten without being read.
//
// conja, conjb, m0, n0, rs_a0, cs_a0, rs_b0, rs_c0, cs_c0, data and cntx are
// not referenced by the instructions below.
// -----------------------------------------------------------------------------
void bli_sgemmsup_rd_zen_asm_1x2
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter32 = k0 / 32;       // passes of the 4x-unrolled main loop
    uint64_t k_left32 = k0 % 32;
    uint64_t k_iter8 = k_left32 / 8;   // remaining full 8-float steps
    uint64_t k_left1 = k_left32 % 8;   // remaining single elements
    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0;
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    // Zero the two accumulators (ymm4 = column 0, ymm5 = column 1).
    // (Original note: zen2 can execute 4 vxorpd ipc with a latency of 1 cycle.)
    vxorps(ymm4, ymm4, ymm4)
    vxorps(ymm5, ymm5, ymm5)

    mov(var(a), rax) // load address of a.

    mov(var(b), rbx) // load address of b.
    mov(var(cs_b), r11) // load cs_b
    lea(mem(, r11, 4), r11) // cs_b *= sizeof(float)

    mov(var(c), rcx) // load address of c

    prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c

    mov(var(k_iter32), rsi) // i = k_iter32;
    test(rsi, rsi) // check i via logical AND.
    je(.SCONSIDKITER8) // if i == 0, jump to code that
                       // contains the k_iter8 loop.

    label(.SLOOPKITER32) // MAIN LOOP

    // ---------------------------------- iteration 0
    vmovups(mem(rbx ), ymm0) // 8 floats of b column 0
    vmovups(mem(rbx, r11, 1), ymm1) // 8 floats of b column 1
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)
    vmovups(mem(rax ), ymm3) // 8 floats of a
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)

    // ---------------------------------- iteration 1
    vmovups(mem(rbx ), ymm0)
    vmovups(mem(rbx, r11, 1), ymm1)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)
    vmovups(mem(rax ), ymm3)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)

    // ---------------------------------- iteration 2
    vmovups(mem(rbx ), ymm0)
    vmovups(mem(rbx, r11, 1), ymm1)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)
    vmovups(mem(rax ), ymm3)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)

    // ---------------------------------- iteration 3
    vmovups(mem(rbx ), ymm0)
    vmovups(mem(rbx, r11, 1), ymm1)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)
    vmovups(mem(rax ), ymm3)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)

    dec(rsi) // i -= 1;
    jne(.SLOOPKITER32) // iterate again if i != 0.

    label(.SCONSIDKITER8)

    mov(var(k_iter8), rsi) // i = k_iter8;
    test(rsi, rsi) // check i via logical AND.
    je(.SCONSIDKLEFT1) // if i == 0, jump to code that
                       // considers k_left1 loop.
                       // else, we prepare to enter k_iter8 loop.

    label(.SLOOPKITER8) // EDGE LOOP (ymm)

    vmovups(mem(rbx ), ymm0)
    vmovups(mem(rbx, r11, 1), ymm1)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)
    vmovups(mem(rax ), ymm3)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)

    dec(rsi) // i -= 1;
    jne(.SLOOPKITER8) // iterate again if i != 0.

    label(.SCONSIDKLEFT1)

    mov(var(k_left1), rsi) // i = k_left1;
    test(rsi, rsi) // check i via logical AND.
    je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
                    // else, we prepare to enter k_left1 loop.

    label(.SLOOPKLEFT1) // EDGE LOOP (scalar)

    // NOTE: We must use ymm registers here bc
    // using the xmm registers would zero out the
    // high bits of the destination registers,
    // which would destroy intermediate results.
    vmovss(mem(rbx ), xmm0)
    vmovss(mem(rbx, r11, 1), xmm1)
    add(imm(1*4), rbx) // b += 1 float (4 bytes)
    vmovss(mem(rax ), xmm3)
    add(imm(1*4), rax) // a += 1 float (4 bytes)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)

    dec(rsi) // i -= 1;
    jne(.SLOOPKLEFT1) // iterate again if i != 0.

    label(.SPOSTACCUM)

    // Accumulators to reduce horizontally:
    // ymm4 ymm5
    vhaddps( ymm5, ymm4, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddps( xmm0, xmm1, xmm0 ) // xmm0 = pairwise partial sums of ymm4 and ymm5
    vhaddps(xmm0,xmm0,xmm4)

    // xmm4 = sum(ymm4) sum(ymm5)

    mov(var(alpha), rax) // load address of alpha
    mov(var(beta), rbx) // load address of beta
    vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
    vbroadcastss(mem(rbx), ymm3) // load beta and duplicate

    vmulps(xmm0, xmm4, xmm4) // scale by alpha

    // now avoid loading C if beta == 0
    // NOTE(review): vucomisd is the double-precision compare, while beta is
    // a float; vucomiss looks like the intended instruction -- confirm.
    vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
    vucomisd(xmm0, xmm3) // set ZF if beta == 0.
    je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case

    label(.SROWSTORED)

    vmovsd(mem(rcx), xmm0) // load two floats (64 bits) of c
    vfmadd231ps(xmm0, xmm3, xmm4) // xmm4 += beta * c
    vmovsd(xmm4, mem(rcx)) // store two floats of c

    jmp(.SDONE) // jump to end.

    label(.SBETAZERO)

    label(.SROWSTORBZ)

    vmovsd(xmm4, mem(rcx)) // store two floats of c, c never read

    label(.SDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter32] "m" (k_iter32),
      [k_iter8] "m" (k_iter8),
      [k_left1] "m" (k_left1),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

// -----------------------------------------------------------------------------
// bli_sgemmsup_rd_zen_asm_6x2
//
// Single-precision dot-product kernel for m0 rows of A against two columns
// of B. For every row i in 0..m0-1 and column j in 0..1:
//
//   c[i*rs_c0 + j] := beta * c[i*rs_c0 + j]
//                     + alpha * sum_{p<k0} a[i*rs_a0 + p] * b[p + j*cs_b0]
//
// Rows are processed in panels of six by the assembly block (m0/6 passes of
// the outer .SLOOP3X4I loop). The m0%6 rows left over are forwarded, in that
// order, to bli_sgemmsup_rd_zen_asm_3x2, bli_sgemmsup_rd_zen_asm_2x2 and
// bli_sgemmsup_rd_zen_asm_1x2. If m0 < 6 the assembly block is skipped.
//
// Inside each panel the k dimension is split into k0/32 passes of a
// 4x-unrolled 8-float loop, (k0%32)/8 8-float steps, then k0%8 scalar steps.
//
// When the beta test succeeds, c is overwritten without being read.
//
// conja, conjb, data and cntx are only passed through to the edge-case
// kernels; n0, cs_a0, rs_b0 and cs_c0 are not referenced by the assembly
// block.
// -----------------------------------------------------------------------------
void bli_sgemmsup_rd_zen_asm_6x2
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter32 = k0 / 32;       // passes of the 4x-unrolled main loop
    uint64_t k_left32 = k0 % 32;
    uint64_t k_iter8 = k_left32 / 8;   // remaining full 8-float steps
    uint64_t k_left1 = k_left32 % 8;   // remaining single elements

    uint64_t m_iter = m0 / 6;          // number of full 6-row panels
    uint64_t m_left = m0 % 6;          // rows handled by the edge kernels

    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0;
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // No full panel: go straight to the edge-case kernels.
    if ( m_iter == 0 ) goto consider_edge_cases;

    // -------------------------------------------------------------------------

    begin_asm()

    mov(var(a), r14) // load address of a.
    mov(var(rs_a), r8) // load rs_a
    lea(mem(, r8, 4), r8) // rs_a *= sizeof(float)
    lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
    lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

    mov(var(b), rdx) // load address of b.
    mov(var(cs_b), r11) // load cs_b
    lea(mem(, r11, 4), r11) // cs_b *= sizeof(float)

    mov(var(c), r12) // load address of c
    mov(var(rs_c), rdi) // load rs_c
    lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)

    // r12 = rcx = c
    // r14 = rax = a
    // rdx = rbx = b
    // r9 = m dim index ii

    mov(var(m_iter), r9) // ii = m_iter;

    label(.SLOOP3X4I) // loop over ii = m_iter, ..., 1 (one 6-row panel per pass)

    // Zero the twelve accumulators: row r of the panel uses
    // ymm(4+2r) for column 0 and ymm(5+2r) for column 1.
    // (Original note: zen2 can execute 4 vxorpd ipc with a latency of 1 cycle.)
    vxorps(ymm4, ymm4, ymm4)
    vxorps(ymm5, ymm5, ymm5)
    vxorps(ymm6, ymm6, ymm6)
    vxorps(ymm7, ymm7, ymm7)
    vxorps(ymm8, ymm8, ymm8)
    vxorps(ymm9, ymm9, ymm9)
    vxorps(ymm10, ymm10, ymm10)
    vxorps(ymm11, ymm11, ymm11)
    vxorps(ymm12, ymm12, ymm12)
    vxorps(ymm13, ymm13, ymm13)
    vxorps(ymm14, ymm14, ymm14)
    vxorps(ymm15, ymm15, ymm15)

    lea(mem(r12), rcx) // rcx = c panel pointer (r12)
    lea(mem(r14), rax) // rax = a panel pointer (r14)
    lea(mem(rdx), rbx) // rbx = b;

    lea(mem(rcx, rdi, 2), r10) // r10 = c + 2*rs_c
    lea(mem(r10, rdi, 1), r10) // r10 = c + 3*rs_c;
    prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c
    prefetch(0, mem(r10, 1*8)) // prefetch c + 3*rs_c
    prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c
    prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c

    mov(var(k_iter32), rsi) // i = k_iter32;
    test(rsi, rsi) // check i via logical AND.
    je(.SCONSIDKITER8) // if i == 0, jump to code that
                       // contains the k_iter8 loop.

    label(.SLOOPKITER32) // MAIN LOOP

    // ---------------------------------- iteration 0
    vmovups(mem(rbx ), ymm0) // 8 floats of b column 0
    vmovups(mem(rbx, r11, 1), ymm1) // 8 floats of b column 1
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)

    vmovups(mem(rax ), ymm3) // a row 0
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)

    vmovups(mem(rax, r8, 1), ymm3) // a row 1
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vmovups(mem(rax, r8, 2), ymm3) // a row 2
    vfmadd231ps(ymm0, ymm3, ymm8)
    vfmadd231ps(ymm1, ymm3, ymm9)

    vmovups(mem(rax, r13, 1), ymm3) // a row 3
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    vmovups(mem(rax, r8, 4), ymm3) // a row 4
    vfmadd231ps(ymm0, ymm3, ymm12)
    vfmadd231ps(ymm1, ymm3, ymm13)

    vmovups(mem(rax, r15, 1), ymm3) // a row 5
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm14)
    vfmadd231ps(ymm1, ymm3, ymm15)

    // ---------------------------------- iteration 1
    vmovups(mem(rbx ), ymm0)
    vmovups(mem(rbx, r11, 1), ymm1)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)

    vmovups(mem(rax ), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)

    vmovups(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vmovups(mem(rax, r8, 2), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm8)
    vfmadd231ps(ymm1, ymm3, ymm9)

    vmovups(mem(rax, r13, 1), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    vmovups(mem(rax, r8, 4), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm12)
    vfmadd231ps(ymm1, ymm3, ymm13)

    vmovups(mem(rax, r15, 1), ymm3)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm14)
    vfmadd231ps(ymm1, ymm3, ymm15)

    // ---------------------------------- iteration 2
    vmovups(mem(rbx ), ymm0)
    vmovups(mem(rbx, r11, 1), ymm1)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)

    vmovups(mem(rax ), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)

    vmovups(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vmovups(mem(rax, r8, 2), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm8)
    vfmadd231ps(ymm1, ymm3, ymm9)

    vmovups(mem(rax, r13, 1), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    vmovups(mem(rax, r8, 4), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm12)
    vfmadd231ps(ymm1, ymm3, ymm13)

    vmovups(mem(rax, r15, 1), ymm3)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm14)
    vfmadd231ps(ymm1, ymm3, ymm15)

    // ---------------------------------- iteration 3
    vmovups(mem(rbx ), ymm0)
    vmovups(mem(rbx, r11, 1), ymm1)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)

    vmovups(mem(rax ), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)

    vmovups(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vmovups(mem(rax, r8, 2), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm8)
    vfmadd231ps(ymm1, ymm3, ymm9)

    vmovups(mem(rax, r13, 1), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    vmovups(mem(rax, r8, 4), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm12)
    vfmadd231ps(ymm1, ymm3, ymm13)

    vmovups(mem(rax, r15, 1), ymm3)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm14)
    vfmadd231ps(ymm1, ymm3, ymm15)

    dec(rsi) // i -= 1;
    jne(.SLOOPKITER32) // iterate again if i != 0.

    label(.SCONSIDKITER8)

    mov(var(k_iter8), rsi) // i = k_iter8;
    test(rsi, rsi) // check i via logical AND.
    je(.SCONSIDKLEFT1) // if i == 0, jump to code that
                       // considers k_left1 loop.
                       // else, we prepare to enter k_iter8 loop.

    label(.SLOOPKITER8) // EDGE LOOP (ymm)

    vmovups(mem(rbx ), ymm0)
    vmovups(mem(rbx, r11, 1), ymm1)
    add(imm(8*4), rbx) // b += 8 floats (32 bytes)

    vmovups(mem(rax ), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)

    vmovups(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vmovups(mem(rax, r8, 2), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm8)
    vfmadd231ps(ymm1, ymm3, ymm9)

    vmovups(mem(rax, r13, 1), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    vmovups(mem(rax, r8, 4), ymm3)
    vfmadd231ps(ymm0, ymm3, ymm12)
    vfmadd231ps(ymm1, ymm3, ymm13)

    vmovups(mem(rax, r15, 1), ymm3)
    add(imm(8*4), rax) // a += 8 floats (32 bytes)
    vfmadd231ps(ymm0, ymm3, ymm14)
    vfmadd231ps(ymm1, ymm3, ymm15)

    dec(rsi) // i -= 1;
    jne(.SLOOPKITER8) // iterate again if i != 0.

    label(.SCONSIDKLEFT1)

    mov(var(k_left1), rsi) // i = k_left1;
    test(rsi, rsi) // check i via logical AND.
    je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
                    // else, we prepare to enter k_left1 loop.

    label(.SLOOPKLEFT1) // EDGE LOOP (scalar)

    // NOTE: We must use ymm registers here bc
    // using the xmm registers would zero out the
    // high bits of the destination registers,
    // which would destroy intermediate results.
    vmovss(mem(rbx ), xmm0)
    vmovss(mem(rbx, r11, 1), xmm1)
    add(imm(1*4), rbx) // b += 1 float (4 bytes)

    vmovss(mem(rax ), xmm3)
    vfmadd231ps(ymm0, ymm3, ymm4)
    vfmadd231ps(ymm1, ymm3, ymm5)

    vmovss(mem(rax, r8, 1), xmm3)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vmovss(mem(rax, r8, 2), xmm3)
    vfmadd231ps(ymm0, ymm3, ymm8)
    vfmadd231ps(ymm1, ymm3, ymm9)

    vmovss(mem(rax, r13, 1), xmm3)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    vmovss(mem(rax, r8, 4), xmm3)
    vfmadd231ps(ymm0, ymm3, ymm12)
    vfmadd231ps(ymm1, ymm3, ymm13)

    vmovss(mem(rax, r15, 1), xmm3)
    add(imm(1*4), rax) // a += 1 float (4 bytes)
    vfmadd231ps(ymm0, ymm3, ymm14)
    vfmadd231ps(ymm1, ymm3, ymm15)

    dec(rsi) // i -= 1;
    jne(.SLOOPKLEFT1) // iterate again if i != 0.

    label(.SPOSTACCUM)

    // Accumulators to reduce horizontally (one line per panel row):
    // ymm4 ymm5
    // ymm6 ymm7
    // ymm8 ymm9
    // ymm10 ymm11
    // ymm12 ymm13
    // ymm14 ymm15
    vhaddps( ymm5, ymm4, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddps( xmm0, xmm1, xmm0 ) // xmm0 = pairwise partial sums of ymm4 and ymm5
    vhaddps(xmm0,xmm0,xmm4)

    vhaddps( ymm7, ymm6, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddps( xmm0, xmm1, xmm0 ) // xmm0 = pairwise partial sums of ymm6 and ymm7
    vhaddps(xmm0,xmm0,xmm6)

    vhaddps( ymm9, ymm8, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddps( xmm0, xmm1, xmm0 ) // xmm0 = pairwise partial sums of ymm8 and ymm9
    vhaddps(xmm0,xmm0,xmm8)

    vhaddps( ymm11, ymm10, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddps( xmm0, xmm1, xmm0 ) // xmm0 = pairwise partial sums of ymm10 and ymm11
    vhaddps(xmm0,xmm0,xmm10)

    vhaddps( ymm13, ymm12, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddps( xmm0, xmm1, xmm0 ) // xmm0 = pairwise partial sums of ymm12 and ymm13
    vhaddps(xmm0,xmm0,xmm12)

    vhaddps( ymm15, ymm14, ymm0 )
    vextractf128(imm(1), ymm0, xmm1 )
    vaddps( xmm0, xmm1, xmm0 ) // xmm0 = pairwise partial sums of ymm14 and ymm15
    vhaddps(xmm0,xmm0,xmm14)

    // xmm4 = sum(ymm4) sum(ymm5)
    // xmm6 = sum(ymm6) sum(ymm7)
    // xmm8 = sum(ymm8) sum(ymm9)
    // xmm10 = sum(ymm10) sum(ymm11)
    // xmm12 = sum(ymm12) sum(ymm13)
    // xmm14 = sum(ymm14) sum(ymm15)

    mov(var(alpha), rax) // load address of alpha
    mov(var(beta), rbx) // load address of beta
    vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
    vbroadcastss(mem(rbx), ymm3) // load beta and duplicate

    vmulps(xmm0, xmm4, xmm4) // scale by alpha
    vmulps(xmm0, xmm6, xmm6)
    vmulps(xmm0, xmm8, xmm8)
    vmulps(xmm0, xmm10, xmm10)
    vmulps(xmm0, xmm12, xmm12)
    vmulps(xmm0, xmm14, xmm14)

    // now avoid loading C if beta == 0
    // NOTE(review): vucomisd is the double-precision compare, while beta is
    // a float; vucomiss looks like the intended instruction -- confirm.
    vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
    vucomisd(xmm0, xmm3) // set ZF if beta == 0.
    je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case

    label(.SROWSTORED)

    // For each of the six rows: load two floats (64 bits) of c,
    // accumulate beta * c into the scaled result, store two floats back.
    vmovsd(mem(rcx), xmm0)
    vfmadd231ps(xmm0, xmm3, xmm4)
    vmovsd(xmm4, mem(rcx))
    add(rdi, rcx) // next row of c

    vmovsd(mem(rcx), xmm0)
    vfmadd231ps(xmm0, xmm3, xmm6)
    vmovsd(xmm6, mem(rcx))
    add(rdi, rcx)

    vmovsd(mem(rcx), xmm0)
    vfmadd231ps(xmm0, xmm3, xmm8)
    vmovsd(xmm8, mem(rcx))
    add(rdi, rcx)

    vmovsd(mem(rcx), xmm0)
    vfmadd231ps(xmm0, xmm3, xmm10)
    vmovsd(xmm10, mem(rcx))
    add(rdi, rcx)

    vmovsd(mem(rcx), xmm0)
    vfmadd231ps(xmm0, xmm3, xmm12)
    vmovsd(xmm12, mem(rcx))
    add(rdi, rcx)

    vmovsd(mem(rcx), xmm0)
    vfmadd231ps(xmm0, xmm3, xmm14)
    vmovsd(xmm14, mem(rcx))
    //add(rdi, rcx)

    jmp(.SDONE) // jump to end.

    label(.SBETAZERO)

    label(.SROWSTORBZ)

    // Store two floats per row; c is never read on this path.
    vmovsd(xmm4, mem(rcx))
    add(rdi, rcx)

    vmovsd(xmm6, mem(rcx))
    add(rdi, rcx)

    vmovsd(xmm8, mem(rcx))
    add(rdi, rcx)

    vmovsd(xmm10, mem(rcx))
    add(rdi, rcx)

    vmovsd(xmm12, mem(rcx))
    add(rdi, rcx)

    vmovsd(xmm14, mem(rcx))

    label(.SDONE)

    lea(mem(r12, rdi, 4), r12) // r12 += 4*rs_c
    lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c

    lea(mem(r14, r8, 4), r14) // r14 += 4*rs_a
    lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a

    dec(r9) // ii -= 1;
    jne(.SLOOP3X4I) // iterate again if ii != 0.

    label(.SRETURN)

    end_asm(
    : // output operands (none)
    : // input operands
      [m_iter] "m" (m_iter),
      [k_iter32] "m" (k_iter32),
      [k_iter8] "m" (k_iter8),
      [k_left1] "m" (k_left1),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )

    consider_edge_cases:

    // Handle edge cases in the m dimension, if they exist.
    if ( m_left )
    {
        const dim_t nr_cur = 2;
        // Index of the first row not covered by a full 6-row panel.
        const dim_t i_edge = m0 - ( dim_t )m_left;

        float* restrict cij = c + i_edge*rs_c;
        float* restrict bj = b;
        float* restrict ai = a + i_edge*rs_a;

        // Peel off 3 rows, then 2 rows, then 1 row, advancing the a and c
        // pointers after each call so the next kernel starts where the
        // previous one stopped.
        if ( 3 <= m_left )
        {
            const dim_t mr_cur = 3;

            bli_sgemmsup_rd_zen_asm_3x2
            (
              conja, conjb, mr_cur, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
            cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
        }
        if ( 2 <= m_left )
        {
            const dim_t mr_cur = 2;

            bli_sgemmsup_rd_zen_asm_2x2
            (
              conja, conjb, mr_cur, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
            cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
        }
        if ( 1 == m_left )
        {
            const dim_t mr_cur = 1;

            bli_sgemmsup_rd_zen_asm_1x2
            (
              conja, conjb, mr_cur, nr_cur, k0,
              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
              beta, cij, rs_c0, cs_c0, data, cntx
            );
        }
    }
}

void bli_sgemmsup_rd_zen_asm_3x2
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    // Typecast local copies of
integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vzeroall() // zero all xmm/ymm registers. mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) mov(var(b), rbx) // load address of b. mov(var(cs_b), r11) // load cs_b lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // initialize loop by pre-loading // a column of a. mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) add(imm(8*4), rax) // a += 4*cs_a = 4*8; vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) add(imm(8*4), rax) // a += 4*cs_a = 4*8; vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) // ---------------------------------- iteration 2 vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) add(imm(8*4), rax) // a += 4*cs_a = 4*8; vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) // ---------------------------------- iteration 3 vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) add(imm(8*4), rax) // a += 4*cs_a = 4*8; vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. 
label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) vmovups(mem(rbx ), ymm0) vmovups(mem(rbx, r11, 1), ymm1) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vmovups(mem(rax ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovups(mem(rax, r8, 2), ymm3) add(imm(8*4), rax) // a += 4*cs_a = 4*8; vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovss(mem(rbx ), xmm0) vmovss(mem(rbx, r11, 1), xmm1) add(imm(1*4), rbx) // b += 1*rs_b = 1*8; vmovss(mem(rax ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovss(mem(rax, r8, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vmovss(mem(rax, r8, 2), xmm3) add(imm(1*4), rax) // a += 1*cs_a = 1*8; vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. 
label(.SPOSTACCUM) // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5) vhaddps(xmm0,xmm0,xmm4) vhaddps( ymm7, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) vhaddps(xmm0,xmm0,xmm6) vhaddps( ymm9, ymm8, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) vhaddps(xmm0,xmm0,xmm8) // xmm4 = sum(ymm4) sum(ymm5) // xmm6 = sum(ymm6) sum(ymm7) // xmm8 = sum(ymm8) sum(ymm9) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SBETAZERO) label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(xmm8, mem(rcx)) label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } cython-blis-0.9.1/blis/_src/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16m.c000066400000000000000000002024561427272030600301740ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. NOTE: These kernels implicitly support column-oriented IO, implemented via an a high-level transposition of the entire operation. A and B will effectively remain row- and column-stored, respectively, but C will then effectively appear column-stored. Thus, this kernel may be used for both rrc and crc cases. */ // Prototype reference microkernels. 
void bli_sgemmsup_rd_zen_asm_6x16m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t n_left = n0 % 16; // First check whether this is a edge case in the n dimension. If so, // dispatch other 6x?m kernels, as needed. if ( n_left ) { float* restrict cij = c; float* restrict bj = b; float* restrict ai = a; if ( 8 <= n_left ) { const dim_t nr_cur = 8; bli_sgemmsup_rd_zen_asm_6x8m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_sgemmsup_rd_zen_asm_6x4m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rd_zen_asm_6x2m ( conja, conjb, m0, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); } return; } // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() mov(var(rs_a), r8) // load rs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) mov(var(b), rdx) // load address of b. mov(var(cs_b), r11) // load cs_b lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] mov(var(a), r14) // load address of a mov(var(c), r12) // load address of c mov(var(b), rdx) lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c = 1*8 lea(mem(r12, rsi, 1), r12) // r12 = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rdx) // rbx = b + 4*jj*cs_b; mov(var(m_iter), r9) // ii = m_iter; label(.SLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] // zen2 can execute 4 vxorpd ipc with // a latency of 1 cycle vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; lea(mem(r8, r8, 4), rdi) // rdi = 5*rs_a mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. 
je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) 
vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. 
label(.SLOOPKITER8) // EDGE LOOP (ymm) prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_b = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddps( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vhaddps(xmm2,xmm0,xmm4) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddps( xmm2, xmm1, xmm2 ) vhaddps(xmm2,xmm0,xmm5) vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) vhaddps( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddps( xmm2, xmm1, xmm2 ) vhaddps(xmm2,xmm0,xmm6) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha 
vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) label(.SDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.SLOOP3X4I) // iterate again if ii != 0. add(imm(4), r15) // jj += 4; cmp(imm(16), r15) // compare jj to 4 jl(.SLOOP3X4J) // if jj <= 4, jump to beginning // of jj loop; otherwise, loop ends. label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 16; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; float* restrict bj = b; float* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_sgemmsup_rd_zen_asm_2x16 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_sgemmsup_rd_zen_asm_1x16 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } void bli_sgemmsup_rd_zen_asm_6x8m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() mov(var(rs_a), r8) // load rs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) mov(var(b), rdx) // load address of b. mov(var(cs_b), r11) // load cs_b lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... 
] mov(var(a), r14) // load address of a mov(var(c), r12) // load address of c mov(var(b), rdx) lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c = 1*8 lea(mem(r12, rsi, 1), r12) // r12 = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rdx) // rbx = b + 4*jj*cs_b; mov(var(m_iter), r9) // ii = m_iter; label(.SLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] // zen2 can execute 4 vxorpd ipc with // a latency of 1 cycle vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; lea(mem(r8, r8, 4), rdi) // rdi = 5*rs_a mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx 
), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. 
label(.SLOOPKITER8) // EDGE LOOP (ymm) prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_b = 1*4; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddps( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vhaddps(xmm2,xmm0,xmm4) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddps( xmm2, xmm1, xmm2 ) vhaddps(xmm2,xmm0,xmm5) vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) vhaddps( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddps( xmm2, xmm1, xmm2 ) vhaddps(xmm2,xmm0,xmm6) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha 
vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) label(.SDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.SLOOP3X4I) // iterate again if ii != 0. add(imm(4), r15) // jj += 4; cmp(imm(8), r15) // compare jj to 4 jl(.SLOOP3X4J) // if jj <= 4, jump to beginning // of jj loop; otherwise, loop ends. label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 8; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; float* restrict bj = b; float* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_sgemmsup_rd_zen_asm_2x8 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_sgemmsup_rd_zen_asm_1x8 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } /* bli_sgemmsup_rd_zen_asm_6x4m: single-precision gemmsup kernel that updates an m0 x 4 block, C := beta*C + alpha*A*B, by dot-product accumulation along k. The assembly handles m0/3 panels of three rows; each panel keeps twelve ymm accumulators (3 rows of A x 4 columns of B), reduces them horizontally, scales by alpha and adds beta*C (C is not read when beta compares equal to zero). k is consumed in unrolled steps of 32, then 8, then single elements. The leftover m0 % 3 rows are forwarded to the 2x4 or 1x4 kernel. A and B are advanced along k by fixed byte counts and each row of C is stored as four adjacent floats, so the assembly never reads cs_a, rs_b or cs_c; n0 is likewise unused (nr is fixed at 4). conja, conjb, data and cntx are only passed through to the edge kernels. */ void bli_sgemmsup_rd_zen_asm_6x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() mov(var(rs_a), r8) // load rs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) mov(var(b), rdx) // load address of b. 
mov(var(cs_b), r11) // load cs_b lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] mov(var(a), r14) // load address of a mov(var(c), r12) // load address of c mov(var(b), rdx) lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c*sizeof(float) = 1*4 (cs_c == 1 is hard-coded) lea(mem(r12, rsi, 1), r12) // r12 = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rdx) // rdx = b + 4*jj*cs_b; mov(var(m_iter), r9) // ii = m_iter; label(.SLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] // zen2 can execute 4 vxorpd ipc with // a latency of 1 cycle vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; lea(mem(r8, r8, 4), rdi) // rdi = 5*rs_a mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8 floats = 8*4 bytes; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8 floats = 8*4 bytes; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8 floats = 8*4 bytes; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8 floats = 8*4 bytes; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8 floats = 8*4 bytes; vmovups(mem(rbx 
), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8 floats = 8*4 bytes; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8 floats = 8*4 bytes; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8 floats = 8*4 bytes; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. 
label(.SLOOPKITER8) // EDGE LOOP (ymm) prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8 floats = 8*4 bytes; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 8 floats = 8*4 bytes; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. 
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1 float = 1*4 bytes; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1 float = 1*4 bytes; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddps( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vhaddps(xmm2,xmm0,xmm4) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddps( xmm2, xmm1, xmm2 ) vhaddps(xmm2,xmm0,xmm5) vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) vhaddps( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddps( xmm2, xmm1, xmm2 ) vhaddps(xmm2,xmm0,xmm6) // xmm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // xmm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // xmm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha 
vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. /* NOTE(review): this is a double-precision compare applied to a single-precision beta; confirm vucomiss was not intended. */ je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) label(.SDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.SLOOP3X4I) // iterate again if ii != 0. add(imm(4), r15) // jj += 4; cmp(imm(4), r15) // compare jj to 4 jl(.SLOOP3X4J) // if jj < 4, jump to beginning // of jj loop; otherwise, loop ends. label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 4; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; float* restrict bj = b; float* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_sgemmsup_rd_zen_asm_2x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_sgemmsup_rd_zen_asm_1x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } /* bli_sgemmsup_rd_zen_asm_6x2m: single-precision gemmsup kernel that updates an m0 x 2 block, C := beta*C + alpha*A*B, by dot-product accumulation along k. The assembly handles m0/3 panels of three rows; each panel accumulates into ymm4-ymm9 (3 rows of A x 2 columns of B), reduces them horizontally, scales by alpha and adds beta*C (C is not read when beta compares equal to zero). k is consumed in unrolled steps of 32, then 8, then single elements. The leftover m0 % 3 rows are forwarded to the 2x2 or 1x2 kernel. A and B are advanced along k by fixed byte counts and each row of C is moved as two adjacent floats with vmovsd, so the assembly never reads cs_a, rs_b or cs_c; n0 is likewise unused (nr is fixed at 2). conja, conjb, data and cntx are only passed through to the edge kernels. */ void bli_sgemmsup_rd_zen_asm_6x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t m_iter = m0 / 3; uint64_t m_left = m0 % 3; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() mov(var(rs_a), r8) // load rs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) mov(var(b), rdx) // load address of b. 
mov(var(cs_b), r11) // load cs_b lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a // r12 = rcx = c // r14 = rax = a // rdx = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r15) // jj = 0; label(.SLOOP3X4J) // LOOP OVER jj = [ 0 1 ... ] mov(var(a), r14) // load address of a mov(var(c), r12) // load address of c mov(var(b), rdx) lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(imm(1*4), rsi) // rsi *= cs_c*sizeof(float) = 1*4 (cs_c == 1 is hard-coded) lea(mem(r12, rsi, 1), r12) // r12 = c + 4*jj*cs_c; lea(mem( , r15, 1), rsi) // rsi = r15 = 4*jj; imul(r11, rsi) // rsi *= cs_b; lea(mem(rdx, rsi, 1), rdx) // rdx = b + 4*jj*cs_b; mov(var(m_iter), r9) // ii = m_iter; label(.SLOOP3X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] // zen2 can execute 4 vxorpd ipc with // a latency of 1 cycle vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(r14), rax) // rax = a_ii; lea(mem(rdx), rbx) // rbx = b_jj; lea(mem(r8, r8, 4), rdi) // rdi = 5*rs_a mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8 floats = 8*4 bytes; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) add(imm(8*4), rbx) // b += 8 floats = 8*4 bytes; // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8 floats = 8*4 bytes; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) add(imm(8*4), rbx) // b += 8 floats = 8*4 bytes; // ---------------------------------- iteration 2 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8 floats = 8*4 bytes; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) add(imm(8*4), rbx) // b += 8 floats = 8*4 bytes; // ---------------------------------- iteration 3 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8 floats = 8*4 bytes; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) 
vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) add(imm(8*4), rbx) // b += 8 floats = 8*4 bytes; dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8 floats = 8*4 bytes; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) add(imm(8*4), rbx) // b += 8 floats = 8*4 bytes; dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destroy intermediate results. 
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1 float = 1*4 bytes; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) add(imm(1*4), rbx) // b += 1 float = 1*4 bytes; dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 // ymm5 ymm8 // ymm6 ymm9 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddps(xmm0,xmm0,xmm4) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) vhaddps(xmm0,xmm0,xmm5) vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) vhaddps(xmm0,xmm0,xmm6) // xmm4 = sum(ymm4) sum(ymm7) // xmm5 = sum(ymm5) sum(ymm8) // xmm6 = sum(ymm6) sum(ymm9) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. /* NOTE(review): this is a double-precision compare applied to a single-precision beta; confirm vucomiss was not intended. */ je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vmovsd(mem(rcx), xmm0)////a0a1 vfmadd231ps(xmm0, xmm3, xmm4)//c*beta+(a0a1) vmovsd(xmm4, mem(rcx))//a0a1 add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm5) vmovsd(xmm5, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SBETAZERO) label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(xmm5, mem(rcx)) add(rdi, rcx) vmovsd(xmm6, mem(rcx)) label(.SDONE) lea(mem(r12, rdi, 2), r12) // lea(mem(r12, rdi, 1), r12) // c_ii = r12 += 3*rs_c lea(mem(r14, r8, 2), r14) // lea(mem(r14, r8, 1), r14) // a_ii = r14 += 3*rs_a dec(r9) // ii -= 1; jne(.SLOOP3X4I) // iterate again if ii != 0. add(imm(4), r15) // jj += 4; cmp(imm(4), r15) // compare jj to 4 jl(.SLOOP3X4J) // if jj < 4, jump to beginning // of jj loop; otherwise, loop ends. label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 2; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; float* restrict bj = b; float* restrict ai = a + i_edge*rs_a; if ( 2 == m_left ) { const dim_t mr_cur = 2; bli_sgemmsup_rd_zen_asm_2x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { const dim_t mr_cur = 1; bli_sgemmsup_rd_zen_asm_1x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } cython-blis-0.9.1/blis/_src/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16n.c000066400000000000000000001732421427272030600301750ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrc: -------- ------ | | | | | | | | -------- ------ | | | | | | | | -------- += ------ ... | | | | | | | | -------- ------ | | | | | | | | -------- ------ : -------- ------ : Assumptions: - C is row-stored and B is column-stored; - A is row-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential microkernel is well-suited for a dot-product-based accumulation that performs vector loads from both A and B. NOTE: These kernels implicitly support column-oriented IO, implemented via a high-level transposition of the entire operation. A and B will effectively remain row- and column-stored, respectively, but C will then effectively appear column-stored. Thus, this kernel may be used for both rrc and crc cases. */ void bli_sgemmsup_rd_zen_asm_6x16n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t m_left = m0 % 6; // First check whether this is a edge case in the n dimension. If so, // dispatch other ?x8m kernels, as needed. 
if ( m_left ) { float* restrict cij = c; float* restrict bj = b; float* restrict ai = a; // We add special handling for slightly inflated MR blocksizes // at edge cases, up to a maximum of 9. if ( 6 < m0 ) { sgemmsup_ker_ft ker_fp1 = NULL; sgemmsup_ker_ft ker_fp2 = NULL; dim_t mr1, mr2; if ( m0 == 7 ) { mr1 = 6; mr2 = 1; ker_fp1 = bli_sgemmsup_rd_zen_asm_6x16n; ker_fp2 = bli_sgemmsup_rd_zen_asm_1x16n; } else if ( m0 == 8 ) { mr1 = 6; mr2 = 2; ker_fp1 = bli_sgemmsup_rd_zen_asm_6x16n; ker_fp2 = bli_sgemmsup_rd_zen_asm_2x16n; } else // if ( m0 == 9 ) { mr1 = 6; mr2 = 3; ker_fp1 = bli_sgemmsup_rd_zen_asm_6x16n; ker_fp2 = bli_sgemmsup_rd_zen_asm_3x16n; } ker_fp1 ( conja, conjb, mr1, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr1*rs_c0; ai += mr1*rs_a0; ker_fp2 ( conja, conjb, mr2, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } if ( 3 <= m_left ) { const dim_t mr_cur = 3; bli_sgemmsup_rd_zen_asm_3x16n ( conja, conjb, mr_cur, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 2 <= m_left ) { const dim_t mr_cur = 2; bli_sgemmsup_rd_zen_asm_2x16n ( conja, conjb, mr_cur, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; } if ( 1 == m_left ) { bli_sgemv_ex ( BLIS_TRANSPOSE, conja, k0, n0, alpha, bj, rs_b0, cs_b0, ai, cs_a0, beta, cij, cs_c0, cntx, NULL ); } return; } // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) mov(var(cs_b), r11) // load cs_b lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b // r12 = rcx = c // rdx = rax = a // r14 = rbx = b // r9 = m dim index ii // r15 = n dim index jj mov(imm(0), r9) // ii = 0; label(.SLOOP3X4I) // LOOP OVER ii = [ 0 1 ... ] mov(var(b), r14) // load address of b mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) lea(mem( , r9, 1), rsi) // rsi = r9 = 3*ii; imul(rdi, rsi) // rsi *= rs_c lea(mem(r12, rsi, 1), r12) // r12 = c + 3*ii*rs_c; lea(mem( , r9, 1), rsi) // rsi = r9 = 3*ii; imul(r8, rsi) // rsi *= rs_a; lea(mem(rdx, rsi, 1), rdx) // rax = a + 3*ii*rs_a; mov(var(n_iter), r15) // jj = n_iter; label(.SLOOP3X4J) // LOOP OVER jj = [ n_iter ... 
1 0 ] vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(rdx), rax) // rax = a_ii; lea(mem(r14), rbx) // rbx = b_jj; mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c lea(mem(r11, r11, 2), rdi) // rdi = 3*cs_b lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b vmovups(mem(rax ), ymm0) vmovups(mem(rax, 
r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b add(imm(16*8), r10) // r10 += 8*rs_b = 8*8; vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) 
vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_b = 1*8; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddps( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vhaddps(xmm2,xmm0,xmm4) vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) vhaddps( ymm14, ymm11, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddps( xmm2, xmm1, xmm2 ) vhaddps(xmm2,xmm0,xmm5) vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) vhaddps( ymm15, ymm12, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddps( xmm2, xmm1, xmm2 ) vhaddps(xmm2,xmm0,xmm6) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha 
vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm5, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) label(.SDONE) add(imm(4*4), r12) // c_jj = r12 += 4*cs_c lea(mem(r14, r11, 4), r14) // b_jj = r14 += 4*cs_b dec(r15) // jj -= 1; jne(.SLOOP3X4J) // iterate again if jj != 0. add(imm(3), r9) // ii += 3; cmp(imm(3), r9) // compare ii to 3 jle(.SLOOP3X4I) // if ii <= 3, jump to beginning // of ii loop; otherwise, loop ends. label(.SRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( n_left ) { const dim_t mr_cur = 6; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; float* restrict bj = b + j_edge*cs_b; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rd_zen_asm_6x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); } } } void bli_sgemmsup_rd_zen_asm_3x16n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) mov(var(b), r14) // load address of b. 
mov(var(cs_b), r11) // load cs_b lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c // r12 = rcx = c // rdx = rax = a // r14 = rbx = b // r9 = unused // r15 = n dim index jj mov(var(n_iter), r15) // jj = n_iter; label(.SLOOP3X4J) // LOOP OVER jj = [ n_iter ... 1 0 ] // zen2 can execute 4 vxorpd ipc with // a latency of 1 cycle vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(rdx), rax) // rax = a_ii; lea(mem(r14), rbx) // rbx = b_jj; mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c lea(mem(r11, r11, 2), rdi) // rdi = 3*cs_b lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 1 prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 2 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; 
vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) // ---------------------------------- iteration 3 prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b add(imm(16*8), r10) // r10 += 8*rs_b = 8*8; vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. 
label(.SLOOPKITER8) // EDGE LOOP (ymm) vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_b = 1*8; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vfmadd231ps(ymm2, ymm3, ymm6) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vfmadd231ps(ymm2, ymm3, ymm9) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vfmadd231ps(ymm2, ymm3, ymm12) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. 
label(.SPOSTACCUM)

// Each ymm accumulator holds 8 lane-wise partial sums of one dot
// product; together they form the 3x4 micro-tile of C:
//
//   ymm4  ymm7  ymm10 ymm13
//   ymm5  ymm8  ymm11 ymm14
//   ymm6  ymm9  ymm12 ymm15
//
// Reduce every accumulator horizontally so that xmm4, xmm5 and xmm6
// each hold one finished row of four results.

// -- row 0: ymm4, ymm7, ymm10, ymm13 -> xmm4
vhaddps( ymm7, ymm4, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddps( xmm0, xmm1, xmm0 )          // xmm0[0]+xmm0[1] = sum(ymm4)
                                    // xmm0[2]+xmm0[3] = sum(ymm7)

vhaddps( ymm13, ymm10, ymm2 )
vextractf128(imm(1), ymm2, xmm1 )
vaddps( xmm2, xmm1, xmm2 )          // xmm2[0]+xmm2[1] = sum(ymm10)
                                    // xmm2[2]+xmm2[3] = sum(ymm13)

vhaddps(xmm2,xmm0,xmm4)             // final pairwise add -> row 0

// -- row 1: ymm5, ymm8, ymm11, ymm14 -> xmm5
vhaddps( ymm8, ymm5, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddps( xmm0, xmm1, xmm0 )

vhaddps( ymm14, ymm11, ymm2 )
vextractf128(imm(1), ymm2, xmm1 )
vaddps( xmm2, xmm1, xmm2 )

vhaddps(xmm2,xmm0,xmm5)

// -- row 2: ymm6, ymm9, ymm12, ymm15 -> xmm6
vhaddps( ymm9, ymm6, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddps( xmm0, xmm1, xmm0 )

vhaddps( ymm15, ymm12, ymm2 )
vextractf128(imm(1), ymm2, xmm1 )
vaddps( xmm2, xmm1, xmm2 )

vhaddps(xmm2,xmm0,xmm6)

// xmm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
// xmm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
// xmm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)

mov(var(rs_c), rdi)                 // load rs_c
lea(mem(, rdi, 4), rdi)             // rs_c *= sizeof(float)

mov(var(alpha), rax)                // load address of alpha
mov(var(beta), rbx)                 // load address of beta
vbroadcastss(mem(rax), ymm0)        // load alpha and duplicate
vbroadcastss(mem(rbx), ymm3)        // load beta and duplicate

vmulps(xmm0, xmm4, xmm4)            // scale by alpha
vmulps(xmm0, xmm5, xmm5)
vmulps(xmm0, xmm6, xmm6)

// Now avoid loading C if beta == 0.
// beta is single precision, so it must be compared as a float. A
// double-precision compare would read two packed copies of beta as one
// double and would fail to recognize beta == -0.0f as zero.
// NOTE: an unordered compare (NaN beta) also sets ZF.
vxorps(ymm0, ymm0, ymm0)            // set ymm0 to zero.
vucomiss(xmm0, xmm3)                // set ZF if beta == 0.
je(.SBETAZERO)                      // if ZF = 1, jump to beta == 0 case

label(.SROWSTORED)

// C := beta*C + alpha*A*B, one row of four floats at a time.
vfmadd231ps(mem(rcx), xmm3, xmm4)
vmovups(xmm4, mem(rcx))
add(rdi, rcx)                       // next row of C

vfmadd231ps(mem(rcx), xmm3, xmm5)
vmovups(xmm5, mem(rcx))
add(rdi, rcx)                       // next row of C

vfmadd231ps(mem(rcx), xmm3, xmm6)
vmovups(xmm6, mem(rcx))

jmp(.SDONE)                         // jump to end.

label(.SBETAZERO)

label(.SROWSTORBZ)

// beta == 0: overwrite C without reading it.
vmovups(xmm4, mem(rcx))
add(rdi, rcx)                       // next row of C

vmovups(xmm5, mem(rcx))
add(rdi, rcx)                       // next row of C

vmovups(xmm6, mem(rcx))

label(.SDONE)

// Advance to the next group of four columns.
// NOTE(review): C is advanced by a fixed 4 floats (16 bytes), i.e. a
// unit column stride is assumed here -- confirm against the caller.
add(imm(4*4), r12)                  // c_jj = r12 += 4 floats
lea(mem(r14, r11, 4), r14)          // b_jj = r14 += 4*cs_b

dec(r15)                            // jj -= 1;
jne(.SLOOP3X4J)                     // iterate again if jj != 0.

label(.SRETURN)

end_asm(
: // output operands (none)
: // input operands
  [n_iter]   "m" (n_iter),
  [k_iter32] "m" (k_iter32),
  [k_iter8]  "m" (k_iter8),
  [k_left1]  "m" (k_left1),
  [a]        "m" (a),
  [rs_a]     "m" (rs_a),
  [cs_a]     "m" (cs_a),
  [b]        "m" (b),
  [rs_b]     "m" (rs_b),
  [cs_b]     "m" (cs_b),
  [alpha]    "m" (alpha),
  [beta]     "m" (beta),
  [c]        "m" (c),
  [rs_c]     "m" (rs_c),
  [cs_c]     "m" (cs_c)/*,
  [a_next]   "m" (a_next),
  [b_next]   "m" (b_next)*/
: // register clobber list
  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
  "xmm0", "xmm1", "xmm2", "xmm3",
  "xmm4", "xmm5", "xmm6", "xmm7",
  "xmm8", "xmm9", "xmm10", "xmm11",
  "xmm12", "xmm13", "xmm14", "xmm15",
  "memory"
)

consider_edge_cases:

// Handle edge cases in the n dimension (the n0 % 4 leftover columns), if they exist.
if ( n_left ) { const dim_t mr_cur = 3; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; float* restrict bj = b + j_edge*cs_b; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rd_zen_asm_3x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); } } } void bli_sgemmsup_rd_zen_asm_2x16n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) mov(var(b), r14) // load address of b. 
mov(var(cs_b), r11) // load cs_b lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c // r12 = rcx = c // rdx = rax = a // r14 = rbx = b // r9 = unused // r15 = n dim index jj mov(var(n_iter), r15) // jj = n_iter; label(.SLOOP3X4J) // LOOP OVER jj = [ n_iter ... 1 0 ] // zen2 can execute 4 vxorpd ipc with // a latency of 1 cycle vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(rdx), rax) // rax = a_ii; lea(mem(r14), rbx) // rbx = b_jj; mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c lea(mem(r11, r11, 2), rdi) // rdi = 3*cs_b lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 1 prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 2 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; 
vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) // ---------------------------------- iteration 3 prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b add(imm(16*8), r10) // r10 += 8*rs_b = 8*8; vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vfmadd231ps(ymm1, ymm3, ymm8) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
label(.SLOOPKLEFT1)                 // EDGE LOOP (scalar)

// Handles the last k_left1 elements of k one float at a time.
// NOTE: The loads are scalar (vmovss, which zeroes the upper lanes of
// the destination), but the FMAs must still use the full ymm
// registers: an xmm-width FMA would zero the high bits of the
// accumulator and destroy the partial sums gathered so far.

vmovss(mem(rax       ), xmm0)       // one element from row 0 of a
vmovss(mem(rax, r8, 1), xmm1)       // one element from row 1 of a
add(imm(1*4), rax)                  // a += 1 float (4 bytes)

vmovss(mem(rbx        ), xmm3)      // column 0 of b
vfmadd231ps(ymm0, ymm3, ymm4)
vfmadd231ps(ymm1, ymm3, ymm5)

vmovss(mem(rbx, r11, 1), xmm3)      // column 1 of b
vfmadd231ps(ymm0, ymm3, ymm7)
vfmadd231ps(ymm1, ymm3, ymm8)

vmovss(mem(rbx, r11, 2), xmm3)      // column 2 of b
vfmadd231ps(ymm0, ymm3, ymm10)
vfmadd231ps(ymm1, ymm3, ymm11)

vmovss(mem(rbx, r13, 1), xmm3)      // column 3 of b (r13 = 3*cs_b)
add(imm(1*4), rbx)                  // b += 1 float (4 bytes)
vfmadd231ps(ymm0, ymm3, ymm13)
vfmadd231ps(ymm1, ymm3, ymm14)

dec(rsi)                            // i -= 1;
jne(.SLOOPKLEFT1)                   // iterate again if i != 0.

label(.SPOSTACCUM)

// Each ymm accumulator holds 8 lane-wise partial sums of one dot
// product; together they form the 2x4 micro-tile of C:
//
//   ymm4  ymm7  ymm10 ymm13
//   ymm5  ymm8  ymm11 ymm14
//
// Reduce every accumulator horizontally so that xmm4 and xmm5 each
// hold one finished row of four results.

// -- row 0: ymm4, ymm7, ymm10, ymm13 -> xmm4
vhaddps( ymm7, ymm4, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddps( xmm0, xmm1, xmm0 )          // xmm0[0]+xmm0[1] = sum(ymm4)
                                    // xmm0[2]+xmm0[3] = sum(ymm7)

vhaddps( ymm13, ymm10, ymm2 )
vextractf128(imm(1), ymm2, xmm1 )
vaddps( xmm2, xmm1, xmm2 )          // xmm2[0]+xmm2[1] = sum(ymm10)
                                    // xmm2[2]+xmm2[3] = sum(ymm13)

vhaddps(xmm2,xmm0,xmm4)             // final pairwise add -> row 0

// -- row 1: ymm5, ymm8, ymm11, ymm14 -> xmm5
vhaddps( ymm8, ymm5, ymm0 )
vextractf128(imm(1), ymm0, xmm1 )
vaddps( xmm0, xmm1, xmm0 )

vhaddps( ymm14, ymm11, ymm2 )
vextractf128(imm(1), ymm2, xmm1 )
vaddps( xmm2, xmm1, xmm2 )

vhaddps(xmm2,xmm0,xmm5)

// xmm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
// xmm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)

mov(var(rs_c), rdi)                 // load rs_c
lea(mem(, rdi, 4), rdi)             // rs_c *= sizeof(float)

mov(var(alpha), rax)                // load address of alpha
mov(var(beta), rbx)                 // load address of beta
vbroadcastss(mem(rax), ymm0)        // load alpha and duplicate
vbroadcastss(mem(rbx), ymm3)        // load beta and duplicate

// Only xmm4 and xmm5 carry results in this 2-row kernel.
vmulps(xmm0, xmm4, xmm4)            // scale by alpha
vmulps(xmm0, xmm5, xmm5)

// Now avoid loading C if beta == 0.
// beta is single precision, so it must be compared as a float. A
// double-precision compare would read two packed copies of beta as one
// double and would fail to recognize beta == -0.0f as zero.
// NOTE: an unordered compare (NaN beta) also sets ZF.
vxorps(ymm0, ymm0, ymm0)            // set ymm0 to zero.
vucomiss(xmm0, xmm3)                // set ZF if beta == 0.
je(.SBETAZERO)                      // if ZF = 1, jump to beta == 0 case

label(.SROWSTORED)

// C := beta*C + alpha*A*B, one row of four floats at a time.
vfmadd231ps(mem(rcx), xmm3, xmm4)
vmovups(xmm4, mem(rcx))
add(rdi, rcx)                       // next row of C

vfmadd231ps(mem(rcx), xmm3, xmm5)
vmovups(xmm5, mem(rcx))

jmp(.SDONE)                         // jump to end.

label(.SBETAZERO)

label(.SROWSTORBZ)

// beta == 0: overwrite C without reading it.
vmovups(xmm4, mem(rcx))
add(rdi, rcx)                       // next row of C

vmovups(xmm5, mem(rcx))

label(.SDONE)

// Advance to the next group of four columns.
// NOTE(review): C is advanced by a fixed 4 floats (16 bytes), i.e. a
// unit column stride is assumed here -- confirm against the caller.
add(imm(4*4), r12)                  // c_jj = r12 += 4 floats
lea(mem(r14, r11, 4), r14)          // b_jj = r14 += 4*cs_b

dec(r15)                            // jj -= 1;
jne(.SLOOP3X4J)                     // iterate again if jj != 0.

label(.SRETURN)

end_asm(
: // output operands (none)
: // input operands
  [n_iter]   "m" (n_iter),
  [k_iter32] "m" (k_iter32),
  [k_iter8]  "m" (k_iter8),
  [k_left1]  "m" (k_left1),
  [a]        "m" (a),
  [rs_a]     "m" (rs_a),
  [cs_a]     "m" (cs_a),
  [b]        "m" (b),
  [rs_b]     "m" (rs_b),
  [cs_b]     "m" (cs_b),
  [alpha]    "m" (alpha),
  [beta]     "m" (beta),
  [c]        "m" (c),
  [rs_c]     "m" (rs_c),
  [cs_c]     "m" (cs_c)/*,
  [a_next]   "m" (a_next),
  [b_next]   "m" (b_next)*/
: // register clobber list
  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
  "xmm0", "xmm1", "xmm2", "xmm3",
  "xmm4", "xmm5", "xmm6", "xmm7",
  "xmm8", "xmm9", "xmm10", "xmm11",
  "xmm12", "xmm13", "xmm14", "xmm15",
  "memory"
)

consider_edge_cases:

// Handle edge cases in the n dimension (the n0 % 4 leftover columns), if they exist.
if ( n_left ) { const dim_t mr_cur = 2; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; float* restrict bj = b + j_edge*cs_b; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rd_zen_asm_2x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, mr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, cntx, NULL ); } } } void bli_sgemmsup_rd_zen_asm_1x16n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter32 = k0 / 32; uint64_t k_left32 = k0 % 32; uint64_t k_iter8 = k_left32 / 8; uint64_t k_left1 = k_left32 % 8; uint64_t n_iter = n0 / 4; uint64_t n_left = n0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() mov(var(a), rdx) // load address of a. mov(var(b), r14) // load address of b. mov(var(cs_b), r11) // load cs_b lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b mov(var(c), r12) // load address of c // r12 = rcx = c // rdx = rax = a // r14 = rbx = b // r9 = unused // r15 = n dim index jj mov(var(n_iter), r15) // jj = n_iter; label(.SLOOP3X4J) // LOOP OVER jj = [ n_iter ... 
1 0 ] // zen2 can execute 4 vxorpd ipc with // a latency of 1 cycle vxorps(ymm4, ymm4, ymm4) vxorps(ymm7, ymm7, ymm7) vxorps(ymm10, ymm10, ymm10) vxorps(ymm13, ymm13, ymm13) lea(mem(r12), rcx) // rcx = c_iijj; lea(mem(rdx), rax) // rax = a_ii; lea(mem(r14), rbx) // rbx = b_jj; mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c lea(mem(r11, r11, 2), rdi) // rdi = 3*cs_b lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. label(.SLOOPKITER32) // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(r10, 0*8)) // prefetch rbx + 4*cs_b prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 1 prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 2 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b vmovups(mem(rax ), ymm0) 
add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) // ---------------------------------- iteration 3 prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b add(imm(16*8), r10) // r10 += 8*rs_b = 8*8; vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. label(.SCONSIDKITER8) mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. label(.SLOOPKITER8) // EDGE LOOP (ymm) vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 4*cs_b = 4*8; vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovups(mem(rbx, r11, 1), ymm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovups(mem(rbx, r11, 2), ymm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovups(mem(rbx, r13, 1), ymm3) add(imm(8*4), rbx) // b += 4*rs_b = 4*8; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. label(.SCONSIDKLEFT1) mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
label(.SLOOPKLEFT1) // EDGE LOOP (scalar) // NOTE: We must use ymm registers here bc // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. vmovss(mem(rax ), xmm0) add(imm(1*4), rax) // a += 1*cs_b = 1*8; vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vmovss(mem(rbx, r11, 1), xmm3) vfmadd231ps(ymm0, ymm3, ymm7) vmovss(mem(rbx, r11, 2), xmm3) vfmadd231ps(ymm0, ymm3, ymm10) vmovss(mem(rbx, r13, 1), xmm3) add(imm(1*4), rbx) // b += 1*rs_b = 1*8; vfmadd231ps(ymm0, ymm3, ymm13) dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. label(.SPOSTACCUM) // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddps( xmm0, xmm1, xmm0 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7) vhaddps( ymm13, ymm10, ymm2 ) vextractf128(imm(1), ymm2, xmm1 ) vaddps( xmm2, xmm1, xmm2 ) // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13) vhaddps(xmm2,xmm0,xmm4) // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) jmp(.SDONE) // jump to end. label(.SBETAZERO) label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) label(.SDONE) add(imm(4*4), r12) // c_jj = r12 += 4*cs_c lea(mem(r14, r11, 4), r14) // b_jj = r14 += 4*cs_b dec(r15) // jj -= 1; jne(.SLOOP3X4J) // iterate again if jj != 0. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter32] "m" (k_iter32), [k_iter8] "m" (k_iter8), [k_left1] "m" (k_left1), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 1; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; float* restrict bj = b + j_edge*cs_b; if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rd_zen_asm_1x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { bli_sdotxv_ex ( conja, conjb, k0, alpha, ai, cs_a0, bj, rs_b0, beta, cij, cntx, NULL ); } } } cython-blis-0.9.1/blis/_src/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16.c000066400000000000000000011275071427272030600300450ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. 
And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose).
crr:
| | | | | | | | ------ --------
| | | | | | | | ------ --------
| | | | | | | | += ------ ... --------
| | | | | | | | ------ --------
| | | | | | | | ------ :
| | | | | | | | ------ :
*/

// Prototype reference microkernels.
GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref )

/*
   bli_sgemmsup_rv_zen_asm_5x16

   Single-precision gemmsup microkernel written with the BLIS x86 asm
   macros (AT&T operand order, i.e. the destination is the last operand).
   It accumulates a 5x16 tile of A*B in ymm4..ymm13 (two 8-float registers
   per row of C), scales the tile by alpha and then either adds beta*C and
   stores, or -- when beta compares equal to zero -- stores without ever
   reading C.

   Parameters, as they are used by the code below:
   - k0:            inner dimension. Split into k0/4 passes of the 4x
                    unrolled main loop plus k0%4 passes of the edge loop.
   - alpha, beta:   pointers to the scalars; each is broadcast from memory.
   - a, rs_a0,      A is read one element at a time from a + i*rs_a for
     cs_a0:         i = 0..4 and the pointer advances by cs_a per k step.
   - b, rs_b0:      every k step loads 16 contiguous floats starting at b
                    and then advances b by rs_b.
   - cs_b0:         copied to cs_b and listed as an asm input operand, but
                    never read by the assembly.
   - c, rs_c0,      rs_c == 1 selects the column-stored path (in-register
     cs_c0:         transpose); every other rs_c takes the row-stored path,
                    which moves 8 contiguous floats per load/store
                    (presumably that path expects cs_c == 1 -- TODO confirm
                    against the caller).
   - conja, conjb,  not referenced anywhere in this function body.
     m0, n0, data,
     cntx:

   Register roles inside the asm block:
     rax = a (later: address of alpha, then 3*cs_c)   r8  = rs_a (bytes)
     rbx = b (later: address of beta)                 r9  = cs_a (bytes)
     rcx = c                                          r10 = rs_b (bytes)
     rdx = prefetch scratch, later c + 4*rs_c         r13 = 3*rs_a (bytes)
     rdi = rs_c (bytes)                               rsi = loop counter,
                                                            later cs_c (bytes)
     ymm0/ymm1 = current row of B, ymm2/ymm3 = broadcast elements of A,
     ymm4..ymm13 = accumulators (row r lives in ymm(4+2r) and ymm(5+2r)).
*/
void bli_sgemmsup_rv_zen_asm_5x16
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    // The asm operands below use "m" constraints, so these locals are read
    // from memory with 64-bit moves.
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;
    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0;
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    // Zero the accumulators. Only ymm4..ymm13 are accumulated into below;
    // xmm14 is later used as scratch and ymm15 is not used again.
    vxorps(ymm4, ymm4, ymm4)
    vxorps(ymm5, ymm5, ymm5)
    vxorps(ymm6, ymm6, ymm6)
    vxorps(ymm7, ymm7, ymm7)
    vxorps(ymm8, ymm8, ymm8)
    vxorps(ymm9, ymm9, ymm9)
    vxorps(ymm10, ymm10, ymm10)
    vxorps(ymm11, ymm11, ymm11)
    vxorps(ymm12, ymm12, ymm12)
    vxorps(ymm13, ymm13, ymm13)
    vxorps(ymm14, ymm14, ymm14)
    vxorps(ymm15, ymm15, ymm15)

    mov(var(a), rax) // load address of a.
    mov(var(rs_a), r8) // load rs_a
    mov(var(cs_a), r9) // load cs_a
    lea(mem(, r8, 4), r8) // rs_a *= sizeof(float)
    lea(mem(, r9, 4), r9) // cs_a *= sizeof(float)

    lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a

    mov(var(b), rbx) // load address of b.
    mov(var(rs_b), r10) // load rs_b
    lea(mem(, r10, 4), r10) // rs_b *= sizeof(float)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds allocated mem
    // (the likely result: a segmentation fault).

    mov(var(c), rcx) // load address of c
    mov(var(rs_c), rdi) // load rs_c
    lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)

    // rdi now holds rs_c in bytes, so comparing against 4 tests rs_c == 1.
    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
    jz(.SCOLPFETCH) // jump to column storage case
    label(.SROWPFETCH) // row-stored prefetching on c

    lea(mem(rcx, rdi, 2), rdx) // rdx = c + 2*rs_c;
    lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
    // One prefetch per row of the tile, at byte offset 7*8 = 56 into the row.
    prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c
    prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c
    prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c

    jmp(.SPOSTPFETCH) // jump to end of prefetching c
    label(.SCOLPFETCH) // column-stored prefetching c

    mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float)
    lea(mem(rcx, rsi, 2), rdx) // rdx = c + 2*cs_c;
    lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c;
    // NOTE(review): only columns 0..7 of the 16 are prefetched here, and the
    // byte offset 4*8 = 32 lies beyond the 5 floats (20 bytes) this kernel
    // touches per column; this looks inherited from a double-precision
    // variant -- confirm whether that is intentional.
    prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(rdx, 4*8)) // prefetch c + 3*cs_c
    prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 4*cs_c
    prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 5*cs_c
    lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c;
    prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 6*cs_c
    prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 7*cs_c

    label(.SPOSTPFETCH) // done prefetching c

    mov(var(k_iter), rsi) // i = k_iter;
    test(rsi, rsi) // check i via logical AND.
    je(.SCONSIDKLEFT) // if i == 0, jump to code that
    // contains the k_left loop.

    label(.SLOOPKITER) // MAIN LOOP

    // Each iteration below is one rank-1 update: load 16 floats of B into
    // ymm0/ymm1, broadcast the five elements of the current column of A
    // (rows 0..4 at rax + {0,1,2,3,4}*rs_a) and FMA them into the row's
    // accumulator pair.

    // ---------------------------------- iteration 0

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastss(mem(rax ), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vbroadcastss(mem(rax, r8, 2), ymm2)
    vbroadcastss(mem(rax, r13, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm8)
    vfmadd231ps(ymm1, ymm2, ymm9)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    vbroadcastss(mem(rax, r8, 4), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm12)
    vfmadd231ps(ymm1, ymm2, ymm13)

    // ---------------------------------- iteration 1

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastss(mem(rax ), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vbroadcastss(mem(rax, r8, 2), ymm2)
    vbroadcastss(mem(rax, r13, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm8)
    vfmadd231ps(ymm1, ymm2, ymm9)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    vbroadcastss(mem(rax, r8, 4), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm12)
    vfmadd231ps(ymm1, ymm2, ymm13)

    // ---------------------------------- iteration 2

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastss(mem(rax ), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vbroadcastss(mem(rax, r8, 2), ymm2)
    vbroadcastss(mem(rax, r13, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm8)
    vfmadd231ps(ymm1, ymm2, ymm9)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    vbroadcastss(mem(rax, r8, 4), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm12)
    vfmadd231ps(ymm1, ymm2, ymm13)

    // ---------------------------------- iteration 3

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastss(mem(rax ), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vbroadcastss(mem(rax, r8, 2), ymm2)
    vbroadcastss(mem(rax, r13, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm8)
    vfmadd231ps(ymm1, ymm2, ymm9)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    vbroadcastss(mem(rax, r8, 4), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm12)
    vfmadd231ps(ymm1, ymm2, ymm13)

    dec(rsi) // i -= 1;
    jne(.SLOOPKITER) // iterate again if i != 0.

    label(.SCONSIDKLEFT)

    mov(var(k_left), rsi) // i = k_left;
    test(rsi, rsi) // check i via logical AND.
    je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
    // else, we prepare to enter k_left loop.

    label(.SLOOPKLEFT) // EDGE LOOP

    // Same rank-1 update as one main-loop iteration, executed k0 % 4 times.

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastss(mem(rax ), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vbroadcastss(mem(rax, r8, 2), ymm2)
    vbroadcastss(mem(rax, r13, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm8)
    vfmadd231ps(ymm1, ymm2, ymm9)
    vfmadd231ps(ymm0, ymm3, ymm10)
    vfmadd231ps(ymm1, ymm3, ymm11)

    vbroadcastss(mem(rax, r8, 4), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm12)
    vfmadd231ps(ymm1, ymm2, ymm13)

    dec(rsi) // i -= 1;
    jne(.SLOOPKLEFT) // iterate again if i != 0.

    label(.SPOSTACCUM)

    // rax and rbx are free now that the k loops are done; reuse them for
    // the scalar pointers.
    mov(var(alpha), rax) // load address of alpha
    mov(var(beta), rbx) // load address of beta
    vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
    vbroadcastss(mem(rbx), ymm3) // load beta and duplicate

    vmulps(ymm0, ymm4, ymm4) // scale by alpha
    vmulps(ymm0, ymm5, ymm5)
    vmulps(ymm0, ymm6, ymm6)
    vmulps(ymm0, ymm7, ymm7)
    vmulps(ymm0, ymm8, ymm8)
    vmulps(ymm0, ymm9, ymm9)
    vmulps(ymm0, ymm10, ymm10)
    vmulps(ymm0, ymm11, ymm11)
    vmulps(ymm0, ymm12, ymm12)
    vmulps(ymm0, ymm13, ymm13)

    mov(var(cs_c), rsi) // load cs_c
    lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float)

    // rdx addresses row 4 of the tile; it is only dereferenced by the
    // column-stored paths, which write that row element by element.
    lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c;

    lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;

    // now avoid loading C if beta == 0

    vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
    vucomiss(xmm0, xmm3) // set ZF if beta == 0.
    je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case

    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
    jz(.SCOLSTORED) // jump to column storage case

    label(.SROWSTORED)

    // One row per group: C(r, 0..7) at rcx and C(r, 8..15) at rcx + 8*cs_c
    // are updated as acc += beta*C and written back, then rcx += rs_c.

    vfmadd231ps(mem(rcx), ymm3, ymm4)
    vmovups(ymm4, mem(rcx))

    vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm5)
    vmovups(ymm5, mem(rcx, rsi, 8))
    add(rdi, rcx)

    vfmadd231ps(mem(rcx), ymm3, ymm6)
    vmovups(ymm6, mem(rcx))

    vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm7)
    vmovups(ymm7, mem(rcx, rsi, 8))
    add(rdi, rcx)

    vfmadd231ps(mem(rcx), ymm3, ymm8)
    vmovups(ymm8, mem(rcx))

    vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm9)
    vmovups(ymm9, mem(rcx, rsi, 8))
    add(rdi, rcx)

    vfmadd231ps(mem(rcx), ymm3, ymm10)
    vmovups(ymm10, mem(rcx))

    vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm11)
    vmovups(ymm11, mem(rcx, rsi, 8))
    add(rdi, rcx)

    vfmadd231ps(mem(rcx), ymm3, ymm12)
    vmovups(ymm12, mem(rcx))

    vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm13)
    vmovups(ymm13, mem(rcx, rsi, 8))
    //add(rdi, rcx)

    jmp(.SDONE) // jump to end.

    label(.SCOLSTORED)

    // Column-stored C. Rows 0..3 are transposed in registers 4x4 at a time
    // (letters a..d denote rows 0..3): unpck interleaves row pairs, the
    // 0x4e shuffle plus the two blends regroup them so that each 128-bit
    // half holds four consecutive rows of a single column. The low half
    // goes to column j (rcx), the high half to column j+4 (rcx + 4*cs_c).
    // Row 4 (ymm12/ymm13) is handled separately, one scalar at a time,
    // through rdx.

    // ---- rows 0..3, columns 0..7 (ymm4, ymm6, ymm8, ymm10)

    vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a4b4a5b5
    vunpcklps(ymm10, ymm8, ymm1) //c0d0c1d1 c4d4c5d5
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vfmadd231ps(mem(rcx), xmm3, xmm0)
    vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
    vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 )
    vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 )

    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c

    vextractf128(imm(0x1), ymm1, xmm2)
    vfmadd231ps(mem(rcx), xmm3, xmm1)
    vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
    vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 )
    vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 )

    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c

    vunpckhps(ymm6, ymm4, ymm0)
    vunpckhps(ymm10, ymm8, ymm1)
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vfmadd231ps(mem(rcx), xmm3, xmm0)
    vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
    vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 )
    vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 )

    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c

    vextractf128(imm(0x1), ymm1, xmm2)
    vfmadd231ps(mem(rcx), xmm3, xmm1)
    vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
    vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 )
    vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 )

    // rcx has advanced four columns so far; one more column plus four
    // leaves it at column 8 for the second half of the tile.
    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c
    lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

    /********************************************/
    // ---- row 4, columns 0..3. ymm4/6/8/10 were consumed above, so their
    // xmm aliases are reused to hold the scalars loaded from C; xmm1, xmm2
    // and xmm14 receive lanes 1..3 of the row so that each element ends up
    // in lane 0, the only lane vmovss stores.
    vextractf128(imm(0x0), ymm12, xmm0)//e0-e3
    vmovss(mem(rdx),xmm4)
    vmovss(mem(rdx, rsi, 1),xmm6)
    vmovss(mem(rdx, rsi, 2),xmm8)
    vmovss(mem(rdx, rax, 1),xmm10)
    vshufps(imm(0x01), xmm0, xmm0,xmm1)
    vshufps(imm(0x02), xmm0, xmm0,xmm2)
    vshufps(imm(0x03), xmm0, xmm0,xmm14)
    vfmadd231ps(xmm4, xmm3, xmm0)//e0
    vfmadd231ps(xmm6, xmm3, xmm1)//e1
    vfmadd231ps(xmm8, xmm3, xmm2)//e2
    vfmadd231ps(xmm10, xmm3, xmm14)//e3
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c

    // ---- row 4, columns 4..7
    vextractf128(imm(0x1), ymm12, xmm0)//e4-e7
    vmovss(mem(rdx),xmm4)
    vmovss(mem(rdx, rsi, 1),xmm6)
    vmovss(mem(rdx, rsi, 2),xmm8)
    vmovss(mem(rdx, rax, 1),xmm10)
    vshufps(imm(0x01), xmm0, xmm0,xmm1)
    vshufps(imm(0x02), xmm0, xmm0,xmm2)
    vshufps(imm(0x03), xmm0, xmm0,xmm14)
    vfmadd231ps(xmm4, xmm3, xmm0)//e4
    vfmadd231ps(xmm6, xmm3, xmm1)//e5
    vfmadd231ps(xmm8, xmm3, xmm2)//e6
    vfmadd231ps(xmm10, xmm3, xmm14)//e7
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c
    /*********************************************/

    // ---- rows 0..3, columns 8..15 (ymm5, ymm7, ymm9, ymm11); rcx now
    // points at column 8.

    vunpcklps(ymm7, ymm5, ymm0)
    vunpcklps(ymm11, ymm9, ymm1)
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vfmadd231ps(mem(rcx), xmm3, xmm0)
    vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
    vmovups(xmm0, mem(rcx)) // store rows 0..3 of column 8
    vmovups(xmm2, mem(rcx, rsi, 4)) // store rows 0..3 of column 12

    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c

    vextractf128(imm(0x1), ymm1, xmm2)
    vfmadd231ps(mem(rcx), xmm3, xmm1)
    vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
    vmovups(xmm1, mem(rcx)) // store rows 0..3 of column 9
    vmovups(xmm2, mem(rcx, rsi, 4)) // store rows 0..3 of column 13

    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c

    vunpckhps(ymm7, ymm5, ymm0)
    vunpckhps(ymm11, ymm9, ymm1)
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vfmadd231ps(mem(rcx), xmm3, xmm0)
    vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
    vmovups(xmm0, mem(rcx)) // store rows 0..3 of column 10
    vmovups(xmm2, mem(rcx, rsi, 4)) // store rows 0..3 of column 14

    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c

    vextractf128(imm(0x1), ymm1, xmm2)
    vfmadd231ps(mem(rcx), xmm3, xmm1)
    vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
    vmovups(xmm1, mem(rcx)) // store rows 0..3 of column 11
    vmovups(xmm2, mem(rcx, rsi, 4)) // store rows 0..3 of column 15

    //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c

    // ---- row 4, columns 8..11 (rdx already points at column 8)
    vextractf128(imm(0x0), ymm13, xmm0)//e8-e11
    vmovss(mem(rdx),xmm4)
    vmovss(mem(rdx, rsi, 1),xmm6)
    vmovss(mem(rdx, rsi, 2),xmm8)
    vmovss(mem(rdx, rax, 1),xmm10)
    vshufps(imm(0x01), xmm0,xmm0, xmm1)
    vshufps(imm(0x02), xmm0,xmm0, xmm2)
    vshufps(imm(0x03), xmm0,xmm0, xmm14)
    vfmadd231ps(xmm4, xmm3, xmm0)//e8
    vfmadd231ps(xmm6, xmm3, xmm1)//e9
    vfmadd231ps(xmm8, xmm3, xmm2)//e10
    vfmadd231ps(xmm10, xmm3, xmm14)//e11
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c

    // ---- row 4, columns 12..15
    vextractf128(imm(0x1), ymm13, xmm0)//e12-e15
    vmovss(mem(rdx),xmm4)
    vmovss(mem(rdx, rsi, 1),xmm6)
    vmovss(mem(rdx, rsi, 2),xmm8)
    vmovss(mem(rdx, rax, 1),xmm10)
    vshufps(imm(0x01), xmm0,xmm0, xmm1)
    vshufps(imm(0x02), xmm0,xmm0, xmm2)
    vshufps(imm(0x03), xmm0,xmm0, xmm14)
    vfmadd231ps(xmm4, xmm3, xmm0)//e12
    vfmadd231ps(xmm6, xmm3, xmm1)//e13
    vfmadd231ps(xmm8, xmm3, xmm2)//e14
    vfmadd231ps(xmm10, xmm3, xmm14)//e15
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    jmp(.SDONE) // jump to end.

    label(.SBETAZERO)

    // beta == 0: same store patterns as above, but C is never read.

    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
    jz(.SCOLSTORBZ) // jump to column storage case

    label(.SROWSTORBZ)

    vmovups(ymm4, mem(rcx))
    vmovups(ymm5, mem(rcx, rsi, 8))
    add(rdi, rcx)

    vmovups(ymm6, mem(rcx))
    vmovups(ymm7, mem(rcx, rsi, 8))
    add(rdi, rcx)

    vmovups(ymm8, mem(rcx))
    vmovups(ymm9, mem(rcx, rsi, 8))
    add(rdi, rcx)

    vmovups(ymm10, mem(rcx))
    vmovups(ymm11, mem(rcx, rsi, 8))
    add(rdi, rcx)

    vmovups(ymm12, mem(rcx))
    vmovups(ymm13, mem(rcx, rsi, 8))

    jmp(.SDONE) // jump to end.

    label(.SCOLSTORBZ)

    // ---- rows 0..3, columns 0..7 (in-register transpose, no read of C)

    vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a4b4a5b5
    vunpcklps(ymm10, ymm8, ymm1) //c0d0c1d1 c4d4c5d5
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 )
    vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 )

    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c

    vextractf128(imm(0x1), ymm1, xmm2)
    vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 )
    vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 )

    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c

    vunpckhps(ymm6, ymm4, ymm0)
    vunpckhps(ymm10, ymm8, ymm1)
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 )
    vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 )

    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c

    vextractf128(imm(0x1), ymm1, xmm2)
    vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 )
    vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 )

    // advance rcx from column 3 to column 8.
    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c
    lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

    /********************************************/
    // ---- row 4, columns 0..3
    vextractf128(imm(0x0), ymm12, xmm0)//e0-e3
    vshufps(imm(0x01), xmm0, xmm0,xmm1)
    vshufps(imm(0x02), xmm0, xmm0,xmm2)
    vshufps(imm(0x03), xmm0, xmm0,xmm14)
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c

    // ---- row 4, columns 4..7
    vextractf128(imm(0x1), ymm12, xmm0)//e4-e7
    vshufps(imm(0x01), xmm0, xmm0,xmm1)
    vshufps(imm(0x02), xmm0, xmm0,xmm2)
    vshufps(imm(0x03), xmm0, xmm0,xmm14)
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c
    /*********************************************/

    // ---- rows 0..3, columns 8..15

    vunpcklps(ymm7, ymm5, ymm0)
    vunpcklps(ymm11, ymm9, ymm1)
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vmovups(xmm0, mem(rcx)) // store rows 0..3 of column 8
    vmovups(xmm2, mem(rcx, rsi, 4)) // store rows 0..3 of column 12

    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c

    vextractf128(imm(0x1), ymm1, xmm2)
    vmovups(xmm1, mem(rcx)) // store rows 0..3 of column 9
    vmovups(xmm2, mem(rcx, rsi, 4)) // store rows 0..3 of column 13

    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c

    vunpckhps(ymm7, ymm5, ymm0)
    vunpckhps(ymm11, ymm9, ymm1)
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vmovups(xmm0, mem(rcx)) // store rows 0..3 of column 10
    vmovups(xmm2, mem(rcx, rsi, 4)) // store rows 0..3 of column 14

    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c

    vextractf128(imm(0x1), ymm1, xmm2)
    vmovups(xmm1, mem(rcx)) // store rows 0..3 of column 11
    vmovups(xmm2, mem(rcx, rsi, 4)) // store rows 0..3 of column 15

    // ---- row 4, columns 8..11
    vextractf128(imm(0x0), ymm13, xmm0)//e8-e11
    vshufps(imm(0x01), xmm0,xmm0, xmm1)
    vshufps(imm(0x02), xmm0,xmm0, xmm2)
    vshufps(imm(0x03), xmm0,xmm0, xmm14)
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c

    // ---- row 4, columns 12..15
    vextractf128(imm(0x1), ymm13, xmm0)//e12-e15
    vshufps(imm(0x01), xmm0,xmm0, xmm1)
    vshufps(imm(0x02), xmm0,xmm0, xmm2)
    vshufps(imm(0x03), xmm0,xmm0, xmm14)
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    label(.SDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

void bli_sgemmsup_rv_zen_asm_4x16
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;
    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0;
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    vxorps(ymm4, ymm4, ymm4)
    vxorps(ymm5, ymm5, ymm5)
    vxorps(ymm6, ymm6, ymm6)
    vxorps(ymm7, ymm7, ymm7)
    vxorps(ymm8, ymm8, ymm8)
    vxorps(ymm9, ymm9, ymm9)
    vxorps(ymm10, ymm10, ymm10)
    vxorps(ymm11, ymm11, ymm11)
    vxorps(ymm12, ymm12, ymm12)
    vxorps(ymm13, ymm13, ymm13)
    vxorps(ymm14, ymm14, ymm14)
    vxorps(ymm15, ymm15, ymm15)

    mov(var(a), rax) // load address of a.
    mov(var(rs_a), r8) // load rs_a
    mov(var(cs_a), r9) // load cs_a
    lea(mem(, r8, 4), r8) // rs_a *= sizeof(float)
    lea(mem(, r9, 4), r9) // cs_a *= sizeof(float)

    lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a

    mov(var(b), rbx) // load address of b.
    mov(var(rs_b), r10) // load rs_b
    lea(mem(, r10, 4), r10) // rs_b *= sizeof(float)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds allocated mem
    // (the likely result: a segmentation fault).
mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 7*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) 
vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm11, ymm11) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), ymm3, ymm4) vmovups(ymm4, mem(rcx)) vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm5) vmovups(ymm5, mem(rcx, rsi, 8)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm6) vmovups(ymm6, mem(rcx)) vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm7) vmovups(ymm7, mem(rcx, rsi, 8)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm8) vmovups(ymm8, mem(rcx)) vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm9) vmovups(ymm9, mem(rcx, rsi, 8)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm10) vmovups(ymm10, mem(rcx)) vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm11) vmovups(ymm11, mem(rcx, rsi, 8)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORED) vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a4b4a5b5 vunpcklps(ymm10, ymm8, ymm1) //c0d0c1d1 c4d4c5d5 vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, 
xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) vmovups(ymm5, mem(rcx, rsi, 8)) add(rdi, rcx) vmovups(ymm6, mem(rcx)) vmovups(ymm7, mem(rcx, rsi, 8)) add(rdi, rcx) vmovups(ymm8, mem(rcx)) vmovups(ymm9, mem(rcx, rsi, 8)) add(rdi, rcx) vmovups(ymm10, mem(rcx)) vmovups(ymm11, mem(rcx, rsi, 8)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a4b4a5b5 vunpcklps(ymm10, ymm8, ymm1) //c0d0c1d1 c4d4c5d5 vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) 
vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 )
    vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 )
    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c

    vunpckhps(ymm7, ymm5, ymm0)
    vunpckhps(ymm11, ymm9, ymm1)
    vshufps(imm(0x4e), ymm1, ymm0, ymm2)
    vblendps(imm(0xcc), ymm2, ymm0, ymm0)
    vblendps(imm(0x33), ymm2, ymm1, ymm1)

    vextractf128(imm(0x1), ymm0, xmm2)
    vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 )
    vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 )
    lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c

    vextractf128(imm(0x1), ymm1, xmm2)
    vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 )
    vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 )

    label(.SDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

/*
 * bli_sgemmsup_rv_zen_asm_3x16
 *
 * Single-precision kernel that updates one 3x16 tile of C as
 *
 *     C := beta * C + alpha * A * B
 *
 * using AVX/FMA inline assembly. The asm macros take their operands with the
 * sources first and the destination last (e.g. mov(var(a), rax) loads rax).
 *
 * Arguments actually read by this body:
 *   k0                - inner dimension. The main loop handles k0 / 4 groups
 *                       of four unrolled steps, the edge loop the k0 % 4
 *                       remaining steps.
 *   alpha, beta       - pointers to the scalars; each is broadcast once
 *                       after accumulation.
 *   a, rs_a0, cs_a0   - A and its strides in elements (scaled by
 *                       sizeof(float) below). Each k step reads a[0],
 *                       a[rs_a] and a[2*rs_a], then advances a by cs_a.
 *   b, rs_b0          - B and its row stride in elements. Each k step loads
 *                       16 contiguous floats from b, then advances b by rs_b.
 *   c, rs_c0, cs_c0   - C and its strides in elements.
 *
 * conja, conjb, m0, n0, data and cntx are not referenced here. cs_b0 is
 * copied into a local and listed as an asm input operand, but the assembly
 * never reads it; since b is loaded with contiguous vector loads, callers
 * presumably guarantee cs_b == 1 -- TODO confirm against the dispatch code.
 *
 * Accumulator layout (each ymm holds 8 floats):
 *   ymm4 / ymm5  - row 0, columns 0-7 / 8-15
 *   ymm6 / ymm7  - row 1, columns 0-7 / 8-15
 *   ymm8 / ymm9  - row 2, columns 0-7 / 8-15
 *
 * The store phase has four variants, selected at run time:
 *   beta != 0 or beta == 0 (the latter never reads C), crossed with
 *   rs_c == 1 ("column-stored": the tile is transposed in registers and
 *   written with 2-element / 1-element scalar stores) or rs_c != 1
 *   ("row-stored": full 8-float vector accesses, which are contiguous in
 *   memory and therefore presumably rely on cs_c == 1 -- TODO confirm).
 */
void bli_sgemmsup_rv_zen_asm_3x16
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4; // number of 4x-unrolled main-loop trips
    uint64_t k_left = k0 % 4; // number of single-step edge-loop trips
    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0; // passed as an operand below but never read by the asm
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    // Zero ymm4..ymm15. Only ymm4..ymm9 are used as accumulators by this
    // kernel; ymm10..ymm14 are later reused as scratch in the store phase.
    vxorps(ymm4, ymm4, ymm4)
    vxorps(ymm5, ymm5, ymm5)
    vxorps(ymm6, ymm6, ymm6)
    vxorps(ymm7, ymm7, ymm7)
    vxorps(ymm8, ymm8, ymm8)
    vxorps(ymm9, ymm9, ymm9)
    vxorps(ymm10, ymm10, ymm10)
    vxorps(ymm11, ymm11, ymm11)
    vxorps(ymm12, ymm12, ymm12)
    vxorps(ymm13, ymm13, ymm13)
    vxorps(ymm14, ymm14, ymm14)
    vxorps(ymm15, ymm15, ymm15)

    mov(var(a), rax) // load address of a.
    mov(var(rs_a), r8) // load rs_a
    mov(var(cs_a), r9) // load cs_a
    lea(mem(, r8, 4), r8) // rs_a *= sizeof(float)
    lea(mem(, r9, 4), r9) // cs_a *= sizeof(float)

    mov(var(b), rbx) // load address of b.
    mov(var(rs_b), r10) // load rs_b
    lea(mem(, r10, 4), r10) // rs_b *= sizeof(float)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds allocated mem
    // (the likely result: a segmentation fault).

    mov(var(c), rcx) // load address of c
    mov(var(rs_c), rdi) // load rs_c
    lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)

    // rdi now holds rs_c in bytes; it equals 4 exactly when rs_c == 1,
    // which is what this kernel treats as "column-stored" C.
    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
    jz(.SCOLPFETCH) // jump to column storage case
    label(.SROWPFETCH) // row-stored prefetching on c

    //lea(mem(rcx, rdi, 2), rdx) //
    //lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
    prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c
    prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*rs_c

    jmp(.SPOSTPFETCH) // jump to end of prefetching c
    label(.SCOLPFETCH) // column-stored prefetching c

    // Only the first eight columns of the tile are prefetched here.
    mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float)
    lea(mem(rcx, rsi, 2), rdx) //
    lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c;
    prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*cs_c
    prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 4*cs_c
    prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 5*cs_c
    lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c;
    prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 6*cs_c
    prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 7*cs_c

    label(.SPOSTPFETCH) // done prefetching c

    mov(var(k_iter), rsi) // i = k_iter;
    test(rsi, rsi) // check i via logical AND.
    je(.SCONSIDKLEFT) // if i == 0, jump to code that
    // contains the k_left loop.

    label(.SLOOPKITER) // MAIN LOOP

    // ---------------------------------- iteration 0

    vmovups(mem(rbx, 0*32), ymm0) // ymm0 = current row of b, columns 0-7
    vmovups(mem(rbx, 1*32), ymm1) // ymm1 = current row of b, columns 8-15
    add(r10, rbx) // b += rs_b;

    vbroadcastss(mem(rax ), ymm2) // ymm2 = a element for row 0
    vbroadcastss(mem(rax, r8, 1), ymm3) // ymm3 = a element for row 1
    vfmadd231ps(ymm0, ymm2, ymm4) // row 0, columns 0-7
    vfmadd231ps(ymm1, ymm2, ymm5) // row 0, columns 8-15
    vfmadd231ps(ymm0, ymm3, ymm6) // row 1, columns 0-7
    vfmadd231ps(ymm1, ymm3, ymm7) // row 1, columns 8-15

    vbroadcastss(mem(rax, r8, 2), ymm2) // ymm2 = a element for row 2
    add(r9, rax) // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm8) // row 2, columns 0-7
    vfmadd231ps(ymm1, ymm2, ymm9) // row 2, columns 8-15

    // ---------------------------------- iteration 1

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastss(mem(rax ), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vbroadcastss(mem(rax, r8, 2), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm8)
    vfmadd231ps(ymm1, ymm2, ymm9)

    // ---------------------------------- iteration 2

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastss(mem(rax ), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vbroadcastss(mem(rax, r8, 2), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm8)
    vfmadd231ps(ymm1, ymm2, ymm9)

    // ---------------------------------- iteration 3

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastss(mem(rax ), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vbroadcastss(mem(rax, r8, 2), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm8)
    vfmadd231ps(ymm1, ymm2, ymm9)

    dec(rsi) // i -= 1;
    jne(.SLOOPKITER) // iterate again if i != 0.

    label(.SCONSIDKLEFT)

    mov(var(k_left), rsi) // i = k_left;
    test(rsi, rsi) // check i via logical AND.
    je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
    // else, we prepare to enter k_left loop.

    label(.SLOOPKLEFT) // EDGE LOOP

    // One k step per trip; same body as a single main-loop iteration.
    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastss(mem(rax ), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    vbroadcastss(mem(rax, r8, 2), ymm2)
    add(r9, rax) // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm8)
    vfmadd231ps(ymm1, ymm2, ymm9)

    dec(rsi) // i -= 1;
    jne(.SLOOPKLEFT) // iterate again if i != 0.

    label(.SPOSTACCUM)

    mov(var(alpha), rax) // load address of alpha
    mov(var(beta), rbx) // load address of beta
    vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
    vbroadcastss(mem(rbx), ymm3) // load beta and duplicate

    vmulps(ymm0, ymm4, ymm4) // scale by alpha
    vmulps(ymm0, ymm5, ymm5)
    vmulps(ymm0, ymm6, ymm6)
    vmulps(ymm0, ymm7, ymm7)
    vmulps(ymm0, ymm8, ymm8)
    vmulps(ymm0, ymm9, ymm9)

    mov(var(cs_c), rsi) // load cs_c
    lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float)

    //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c;
    // rdx addresses row 2 of the tile; it is only used by the
    // column-stored paths, which write row 2 separately from rows 0-1.
    lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c;

    lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;

    // now avoid loading C if beta == 0

    vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
    vucomiss(xmm0, xmm3) // set ZF if beta == 0.
    je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case

    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
    jz(.SCOLSTORED) // jump to column storage case

    label(.SROWSTORED)

    // Each vfmadd231ps computes acc += C * beta, i.e. alpha*A*B + beta*C,
    // and the result is written straight back. mem(rcx, rsi, 8) is the
    // address 8 columns to the right (8 * cs_c elements).
    vfmadd231ps(mem(rcx), ymm3, ymm4)
    vmovups(ymm4, mem(rcx)) // row 0, columns 0-7

    vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm5)
    vmovups(ymm5, mem(rcx, rsi, 8)) // row 0, columns 8-15
    add(rdi, rcx) // c += rs_c

    vfmadd231ps(mem(rcx), ymm3, ymm6)
    vmovups(ymm6, mem(rcx)) // row 1, columns 0-7

    vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm7)
    vmovups(ymm7, mem(rcx, rsi, 8)) // row 1, columns 8-15
    add(rdi, rcx) // c += rs_c

    vfmadd231ps(mem(rcx), ymm3, ymm8)
    vmovups(ymm8, mem(rcx)) // row 2, columns 0-7

    vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm9)
    vmovups(ymm9, mem(rcx, rsi, 8)) // row 2, columns 8-15
    //add(rdi, rcx)

    jmp(.SDONE) // jump to end.

    label(.SCOLSTORED)

    // Rows 0 and 1 (a = row 0, b = row 1) are interleaved so that each
    // 64-bit pair "aKbK" is the two vertically adjacent elements of column
    // K, which can be read/written with one vmovsd. Row 2 is handled
    // element by element through rdx.

    /* rows 0-1, columns 0-7 */
    vunpcklps(ymm6, ymm4, ymm0) // a0b0a1b1 a4b4a5b5
    vunpckhps(ymm6, ymm4, ymm2) // a2b2a3b3 a6b6a7b7
    vperm2f128(imm(0x01),ymm0,ymm0,ymm11) // swap halves: low = a4b4a5b5
    vperm2f128(imm(0x01),ymm2,ymm2,ymm12) // swap halves: low = a6b6a7b7
    vshufpd(imm(0x01), xmm0, xmm0, xmm1) // a1b1 in the low 64 bits
    vshufpd(imm(0x01), xmm2, xmm2, xmm10) // a3b3 in the low 64 bits
    vmovsd(mem(rcx),xmm4) // load C rows 0-1 of column 0
    vmovsd(mem(rcx, rsi, 1),xmm6) // load C rows 0-1 of column 1
    vfmadd231ps(xmm4, xmm3, xmm0)
    vfmadd231ps(xmm6, xmm3, xmm1)
    vmovsd(xmm0, mem(rcx)) // store rows 0-1 of column 0
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 1
    vmovsd(mem(rcx, rsi, 2),xmm4) // load C rows 0-1 of column 2
    vmovsd(mem(rcx, rax, 1),xmm6) // load C rows 0-1 of column 3
    vfmadd231ps(xmm4, xmm3, xmm2)
    vfmadd231ps(xmm6, xmm3, xmm10)
    vmovsd(xmm2, mem(rcx, rsi, 2)) // store rows 0-1 of column 2
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 3

    lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

    vshufpd(imm(0x01), xmm11, xmm11, xmm1) // a5b5 in the low 64 bits
    vshufpd(imm(0x01), xmm12, xmm12, xmm10) // a7b7 in the low 64 bits
    vmovsd(mem(rcx),xmm4)
    vmovsd(mem(rcx, rsi, 1),xmm6)
    vfmadd231ps(xmm4, xmm3, xmm11)
    vfmadd231ps(xmm6, xmm3, xmm1)
    vmovsd(xmm11, mem(rcx)) // store rows 0-1 of column 4
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 5
    vmovsd(mem(rcx, rsi, 2),xmm4)
    vmovsd(mem(rcx, rax, 1),xmm6)
    vfmadd231ps(xmm4, xmm3, xmm12)
    vfmadd231ps(xmm6, xmm3, xmm10)
    vmovsd(xmm12, mem(rcx, rsi, 2)) // store rows 0-1 of column 6
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 7

    lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

    /********************************************/
    /* row 2, columns 0-3 */
    vextractf128(imm(0x0), ymm8, xmm0) // xmm0 = row 2, columns 0-3
    vmovss(mem(rdx),xmm4)
    vmovss(mem(rdx, rsi, 1),xmm6)
    // xmm11 (not xmm8) is the scratch register here, presumably because
    // the upper half of ymm8 is still needed by the next extract.
    vmovss(mem(rdx, rsi, 2),xmm11)
    vmovss(mem(rdx, rax, 1),xmm10)
    vshufps(imm(0x01), xmm0, xmm0,xmm1) // element 1 -> lane 0
    vshufps(imm(0x02), xmm0, xmm0,xmm2) // element 2 -> lane 0
    vshufps(imm(0x03), xmm0, xmm0,xmm14) // element 3 -> lane 0
    vfmadd231ps(xmm4, xmm3, xmm0) // row 2, column 0
    vfmadd231ps(xmm6, xmm3, xmm1) // row 2, column 1
    vfmadd231ps(xmm11, xmm3, xmm2) // row 2, column 2
    vfmadd231ps(xmm10, xmm3, xmm14) // row 2, column 3
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c

    /* row 2, columns 4-7 */
    vextractf128(imm(0x1), ymm8, xmm0) // xmm0 = row 2, columns 4-7
    vmovss(mem(rdx),xmm4)
    vmovss(mem(rdx, rsi, 1),xmm6)
    vmovss(mem(rdx, rsi, 2),xmm8)
    vmovss(mem(rdx, rax, 1),xmm10)
    vshufps(imm(0x01), xmm0, xmm0,xmm1)
    vshufps(imm(0x02), xmm0, xmm0,xmm2)
    vshufps(imm(0x03), xmm0, xmm0,xmm14)
    vfmadd231ps(xmm4, xmm3, xmm0) // row 2, column 4
    vfmadd231ps(xmm6, xmm3, xmm1) // row 2, column 5
    vfmadd231ps(xmm8, xmm3, xmm2) // row 2, column 6
    vfmadd231ps(xmm10, xmm3, xmm14) // row 2, column 7
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c

    /*********************************************/
    /* rows 0-1, columns 8-15: same scheme as above; lane index K in the
       comments now corresponds to column 8+K. */
    vunpcklps(ymm7, ymm5, ymm0) // a0b0a1b1 a4b4a5b5
    vunpckhps(ymm7, ymm5, ymm2) // a2b2a3b3 a6b6a7b7
    vperm2f128(imm(0x01),ymm0,ymm0,ymm11)
    vperm2f128(imm(0x01),ymm2,ymm2,ymm12)
    vshufpd(imm(0x01), xmm0, xmm0, xmm1) // a1b1
    vshufpd(imm(0x01), xmm2, xmm2, xmm10) // a3b3
    vmovsd(mem(rcx),xmm4)
    vmovsd(mem(rcx, rsi, 1),xmm6)
    vfmadd231ps(xmm4, xmm3, xmm0)
    vfmadd231ps(xmm6, xmm3, xmm1)
    vmovsd(xmm0, mem(rcx)) // store rows 0-1 of column 8
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 9
    vmovsd(mem(rcx, rsi, 2),xmm4)
    vmovsd(mem(rcx, rax, 1),xmm6)
    vfmadd231ps(xmm4, xmm3, xmm2)
    vfmadd231ps(xmm6, xmm3, xmm10)
    vmovsd(xmm2, mem(rcx, rsi, 2)) // store rows 0-1 of column 10
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 11

    lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

    vshufpd(imm(0x01), xmm11, xmm11, xmm1) // a5b5
    vshufpd(imm(0x01), xmm12, xmm12, xmm10) // a7b7
    vmovsd(mem(rcx),xmm4)
    vmovsd(mem(rcx, rsi, 1),xmm6)
    vfmadd231ps(xmm4, xmm3, xmm11)
    vfmadd231ps(xmm6, xmm3, xmm1)
    vmovsd(xmm11, mem(rcx)) // store rows 0-1 of column 12
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 13
    vmovsd(mem(rcx, rsi, 2),xmm4)
    vmovsd(mem(rcx, rax, 1),xmm6)
    vfmadd231ps(xmm4, xmm3, xmm12)
    vfmadd231ps(xmm6, xmm3, xmm10)
    vmovsd(xmm12, mem(rcx, rsi, 2)) // store rows 0-1 of column 14
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 15

    /********************************************/
    /* row 2, columns 8-11 */
    vextractf128(imm(0x0), ymm9, xmm0) // xmm0 = row 2, columns 8-11
    vmovss(mem(rdx),xmm4)
    vmovss(mem(rdx, rsi, 1),xmm6)
    vmovss(mem(rdx, rsi, 2),xmm8)
    vmovss(mem(rdx, rax, 1),xmm10)
    vshufps(imm(0x01), xmm0, xmm0,xmm1)
    vshufps(imm(0x02), xmm0, xmm0,xmm2)
    vshufps(imm(0x03), xmm0, xmm0,xmm14)
    vfmadd231ps(xmm4, xmm3, xmm0) // row 2, column 8
    vfmadd231ps(xmm6, xmm3, xmm1) // row 2, column 9
    vfmadd231ps(xmm8, xmm3, xmm2) // row 2, column 10
    vfmadd231ps(xmm10, xmm3, xmm14) // row 2, column 11
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c

    /* row 2, columns 12-15 */
    vextractf128(imm(0x1), ymm9, xmm0) // xmm0 = row 2, columns 12-15
    vmovss(mem(rdx),xmm4)
    vmovss(mem(rdx, rsi, 1),xmm6)
    vmovss(mem(rdx, rsi, 2),xmm8)
    vmovss(mem(rdx, rax, 1),xmm10)
    vshufps(imm(0x01), xmm0, xmm0,xmm1)
    vshufps(imm(0x02), xmm0, xmm0,xmm2)
    vshufps(imm(0x03), xmm0, xmm0,xmm14)
    vfmadd231ps(xmm4, xmm3, xmm0) // row 2, column 12
    vfmadd231ps(xmm6, xmm3, xmm1) // row 2, column 13
    vfmadd231ps(xmm8, xmm3, xmm2) // row 2, column 14
    vfmadd231ps(xmm10, xmm3, xmm14) // row 2, column 15
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    jmp(.SDONE) // jump to end.

    label(.SBETAZERO)

    // beta == 0: C is overwritten without being read.
    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
    jz(.SCOLSTORBZ) // jump to column storage case

    label(.SROWSTORBZ)

    vmovups(ymm4, mem(rcx)) // row 0, columns 0-7
    vmovups(ymm5, mem(rcx, rsi, 8)) // row 0, columns 8-15
    add(rdi, rcx) // c += rs_c

    vmovups(ymm6, mem(rcx)) // row 1, columns 0-7
    vmovups(ymm7, mem(rcx, rsi, 8)) // row 1, columns 8-15
    add(rdi, rcx) // c += rs_c

    vmovups(ymm8, mem(rcx)) // row 2, columns 0-7
    vmovups(ymm9, mem(rcx, rsi, 8)) // row 2, columns 8-15
    //add(rdi, rcx)

    jmp(.SDONE) // jump to end.

    label(.SCOLSTORBZ)

    /* rows 0-1, columns 0-7 (a = row 0, b = row 1) */
    vunpcklps(ymm6, ymm4, ymm0) // a0b0a1b1 a4b4a5b5
    vunpckhps(ymm6, ymm4, ymm2) // a2b2a3b3 a6b6a7b7
    vshufpd(imm(0x01), xmm0, xmm0, xmm1) // a1b1
    vshufpd(imm(0x01), xmm2, xmm2, xmm10) // a3b3
    vmovsd(xmm0, mem(rcx)) // store rows 0-1 of column 0
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 1
    vmovsd(xmm2, mem(rcx, rsi, 2)) // store rows 0-1 of column 2
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 3

    lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

    vperm2f128(imm(0x01),ymm0,ymm0,ymm0) // swap halves: low = a4b4a5b5
    vperm2f128(imm(0x01),ymm2,ymm2,ymm2) // swap halves: low = a6b6a7b7
    vshufpd(imm(0x01), xmm0, xmm0, xmm1) // a5b5
    vshufpd(imm(0x01), xmm2, xmm2, xmm10) // a7b7
    vmovsd(xmm0, mem(rcx)) // store rows 0-1 of column 4
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 5
    vmovsd(xmm2, mem(rcx, rsi, 2)) // store rows 0-1 of column 6
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 7

    lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

    /********************************************/
    /* row 2, columns 0-3 */
    vextractf128(imm(0x0), ymm8, xmm0) // xmm0 = row 2, columns 0-3
    vshufps(imm(0x01), xmm0, xmm0,xmm1)
    vshufps(imm(0x02), xmm0, xmm0,xmm2)
    vshufps(imm(0x03), xmm0, xmm0,xmm14)
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c

    /* row 2, columns 4-7 */
    vextractf128(imm(0x1), ymm8, xmm0) // xmm0 = row 2, columns 4-7
    vshufps(imm(0x01), xmm0, xmm0,xmm1)
    vshufps(imm(0x02), xmm0, xmm0,xmm2)
    vshufps(imm(0x03), xmm0, xmm0,xmm14)
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c

    /*********************************************/
    /* rows 0-1, columns 8-15: lane index K corresponds to column 8+K. */
    vunpcklps(ymm7, ymm5, ymm0) // a0b0a1b1 a4b4a5b5
    vunpckhps(ymm7, ymm5, ymm2) // a2b2a3b3 a6b6a7b7
    vshufpd(imm(0x01), xmm0, xmm0, xmm1) // a1b1
    vshufpd(imm(0x01), xmm2, xmm2, xmm10) // a3b3
    vmovsd(xmm0, mem(rcx)) // store rows 0-1 of column 8
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 9
    vmovsd(xmm2, mem(rcx, rsi, 2)) // store rows 0-1 of column 10
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 11

    lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

    vperm2f128(imm(0x01),ymm0,ymm0,ymm0)
    vperm2f128(imm(0x01),ymm2,ymm2,ymm2)
    vshufpd(imm(0x01), xmm0, xmm0, xmm1) // a5b5
    vshufpd(imm(0x01), xmm2, xmm2, xmm10) // a7b7
    vmovsd(xmm0, mem(rcx)) // store rows 0-1 of column 12
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 13
    vmovsd(xmm2, mem(rcx, rsi, 2)) // store rows 0-1 of column 14
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 15

    /********************************************/
    /* row 2, columns 8-11 */
    vextractf128(imm(0x0), ymm9, xmm0) // xmm0 = row 2, columns 8-11
    vshufps(imm(0x01), xmm0, xmm0,xmm1)
    vshufps(imm(0x02), xmm0, xmm0,xmm2)
    vshufps(imm(0x03), xmm0, xmm0,xmm14)
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c

    /* row 2, columns 12-15 */
    vextractf128(imm(0x1), ymm9, xmm0) // xmm0 = row 2, columns 12-15
    vshufps(imm(0x01), xmm0, xmm0,xmm1)
    vshufps(imm(0x02), xmm0, xmm0,xmm2)
    vshufps(imm(0x03), xmm0, xmm0,xmm14)
    vmovss(xmm0, mem(rdx))
    vmovss(xmm1, mem(rdx, rsi, 1))
    vmovss(xmm2, mem(rdx, rsi, 2))
    vmovss(xmm14, mem(rdx, rax, 1))

    label(.SDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

void bli_sgemmsup_rv_zen_asm_2x16
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float*
restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    /*
     * Single-precision kernel that updates one 2x16 tile of C as
     *
     *     C := beta * C + alpha * A * B
     *
     * using AVX/FMA inline assembly. The asm macros take their operands
     * with the sources first and the destination last.
     *
     * Arguments actually read by this body:
     *   k0              - inner dimension: k0 / 4 trips of the 4x-unrolled
     *                     main loop, then k0 % 4 trips of the edge loop.
     *   alpha, beta     - pointers to the scalars; each is broadcast once
     *                     after accumulation.
     *   a, rs_a0, cs_a0 - A and its strides in elements. Each k step reads
     *                     a[0] and a[rs_a], then advances a by cs_a.
     *   b, rs_b0        - B and its row stride in elements. Each k step
     *                     loads 16 contiguous floats, then advances b by
     *                     rs_b.
     *   c, rs_c0, cs_c0 - C and its strides in elements.
     *
     * conja, conjb, m0, n0, data and cntx are not referenced here. cs_b0 is
     * copied into a local and listed as an asm input operand, but the
     * assembly never reads it; since b is loaded with contiguous vector
     * loads, callers presumably guarantee cs_b == 1 -- TODO confirm.
     *
     * Accumulator layout (each ymm holds 8 floats):
     *   ymm4 / ymm5 - row 0, columns 0-7 / 8-15
     *   ymm6 / ymm7 - row 1, columns 0-7 / 8-15
     *
     * The store phase is selected at run time: beta != 0 or beta == 0 (the
     * latter never reads C), crossed with rs_c == 1 ("column-stored": rows
     * are interleaved in registers and written as 2-element column pieces)
     * or rs_c != 1 ("row-stored": full 8-float vector accesses, which are
     * contiguous and therefore presumably rely on cs_c == 1 -- TODO confirm).
     */

    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4; // number of 4x-unrolled main-loop trips
    uint64_t k_left = k0 % 4; // number of single-step edge-loop trips
    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0; // passed as an operand below but never read by the asm
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    // Zero ymm4..ymm15. Only ymm4..ymm7 are used as accumulators by this
    // kernel; ymm10..ymm12 are later reused as scratch in the store phase.
    vxorps(ymm4, ymm4, ymm4)
    vxorps(ymm5, ymm5, ymm5)
    vxorps(ymm6, ymm6, ymm6)
    vxorps(ymm7, ymm7, ymm7)
    vxorps(ymm8, ymm8, ymm8)
    vxorps(ymm9, ymm9, ymm9)
    vxorps(ymm10, ymm10, ymm10)
    vxorps(ymm11, ymm11, ymm11)
    vxorps(ymm12, ymm12, ymm12)
    vxorps(ymm13, ymm13, ymm13)
    vxorps(ymm14, ymm14, ymm14)
    vxorps(ymm15, ymm15, ymm15)

    mov(var(a), rax) // load address of a.
    mov(var(rs_a), r8) // load rs_a
    mov(var(cs_a), r9) // load cs_a
    lea(mem(, r8, 4), r8) // rs_a *= sizeof(float)
    lea(mem(, r9, 4), r9) // cs_a *= sizeof(float)

    mov(var(b), rbx) // load address of b.
    mov(var(rs_b), r10) // load rs_b
    lea(mem(, r10, 4), r10) // rs_b *= sizeof(float)

    // NOTE: We cannot pre-load elements of a or b
    // because it could eventually, in the last
    // unrolled iter or the cleanup loop, result
    // in reading beyond the bounds allocated mem
    // (the likely result: a segmentation fault).

    mov(var(c), rcx) // load address of c
    mov(var(rs_c), rdi) // load rs_c
    lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)

    // rdi now holds rs_c in bytes; it equals 4 exactly when rs_c == 1,
    // which is what this kernel treats as "column-stored" C.
    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
    jz(.SCOLPFETCH) // jump to column storage case
    label(.SROWPFETCH) // row-stored prefetching on c

    prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c
    prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*rs_c

    jmp(.SPOSTPFETCH) // jump to end of prefetching c
    label(.SCOLPFETCH) // column-stored prefetching c

    // Only the first eight columns of the tile are prefetched here.
    mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
    lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float)
    lea(mem(rcx, rsi, 2), rdx) //
    lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c;
    prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*cs_c
    prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 4*cs_c
    prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 5*cs_c
    lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c;
    prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 6*cs_c
    prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 7*cs_c

    label(.SPOSTPFETCH) // done prefetching c

    mov(var(k_iter), rsi) // i = k_iter;
    test(rsi, rsi) // check i via logical AND.
    je(.SCONSIDKLEFT) // if i == 0, jump to code that
    // contains the k_left loop.

    label(.SLOOPKITER) // MAIN LOOP

    // ---------------------------------- iteration 0

    vmovups(mem(rbx, 0*32), ymm0) // ymm0 = current row of b, columns 0-7
    vmovups(mem(rbx, 1*32), ymm1) // ymm1 = current row of b, columns 8-15
    add(r10, rbx) // b += rs_b;

    vbroadcastss(mem(rax ), ymm2) // ymm2 = a element for row 0
    vbroadcastss(mem(rax, r8, 1), ymm3) // ymm3 = a element for row 1
    add(r9, rax) // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm4) // row 0, columns 0-7
    vfmadd231ps(ymm1, ymm2, ymm5) // row 0, columns 8-15
    vfmadd231ps(ymm0, ymm3, ymm6) // row 1, columns 0-7
    vfmadd231ps(ymm1, ymm3, ymm7) // row 1, columns 8-15

    // ---------------------------------- iteration 1

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastss(mem(rax ), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    add(r9, rax) // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    // ---------------------------------- iteration 2

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastss(mem(rax ), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    add(r9, rax) // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    // ---------------------------------- iteration 3

    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastss(mem(rax ), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    add(r9, rax) // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    dec(rsi) // i -= 1;
    jne(.SLOOPKITER) // iterate again if i != 0.

    label(.SCONSIDKLEFT)

    mov(var(k_left), rsi) // i = k_left;
    test(rsi, rsi) // check i via logical AND.
    je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
    // else, we prepare to enter k_left loop.

    label(.SLOOPKLEFT) // EDGE LOOP

    // One k step per trip; same body as a single main-loop iteration.
    vmovups(mem(rbx, 0*32), ymm0)
    vmovups(mem(rbx, 1*32), ymm1)
    add(r10, rbx) // b += rs_b;

    vbroadcastss(mem(rax ), ymm2)
    vbroadcastss(mem(rax, r8, 1), ymm3)
    add(r9, rax) // a += cs_a;
    vfmadd231ps(ymm0, ymm2, ymm4)
    vfmadd231ps(ymm1, ymm2, ymm5)
    vfmadd231ps(ymm0, ymm3, ymm6)
    vfmadd231ps(ymm1, ymm3, ymm7)

    dec(rsi) // i -= 1;
    jne(.SLOOPKLEFT) // iterate again if i != 0.

    label(.SPOSTACCUM)

    mov(var(alpha), rax) // load address of alpha
    mov(var(beta), rbx) // load address of beta
    vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
    vbroadcastss(mem(rbx), ymm3) // load beta and duplicate

    vmulps(ymm0, ymm4, ymm4) // scale by alpha
    vmulps(ymm0, ymm5, ymm5)
    vmulps(ymm0, ymm6, ymm6)
    vmulps(ymm0, ymm7, ymm7)

    mov(var(cs_c), rsi) // load cs_c
    lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float)

    //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c;
    //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c;

    lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;

    // now avoid loading C if beta == 0

    vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
    vucomiss(xmm0, xmm3) // set ZF if beta == 0.
    je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case

    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
    jz(.SCOLSTORED) // jump to column storage case

    label(.SROWSTORED)

    // Each vfmadd231ps computes acc += C * beta, i.e. alpha*A*B + beta*C,
    // and the result is written straight back. mem(rcx, rsi, 8) is the
    // address 8 columns to the right (8 * cs_c elements).
    vfmadd231ps(mem(rcx), ymm3, ymm4)
    vmovups(ymm4, mem(rcx)) // row 0, columns 0-7

    vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm5)
    vmovups(ymm5, mem(rcx, rsi, 8)) // row 0, columns 8-15
    add(rdi, rcx) // c += rs_c

    vfmadd231ps(mem(rcx), ymm3, ymm6)
    vmovups(ymm6, mem(rcx)) // row 1, columns 0-7

    vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm7)
    vmovups(ymm7, mem(rcx, rsi, 8)) // row 1, columns 8-15
    //add(rdi, rcx)

    jmp(.SDONE) // jump to end.

    label(.SCOLSTORED)

    // Rows 0 and 1 (a = row 0, b = row 1) are interleaved so that each
    // 64-bit pair "aKbK" is the two vertically adjacent elements of column
    // K, which can be read/written with one vmovsd.

    /* columns 0-7 */
    vunpcklps(ymm6, ymm4, ymm0) // a0b0a1b1 a4b4a5b5
    vunpckhps(ymm6, ymm4, ymm2) // a2b2a3b3 a6b6a7b7
    vperm2f128(imm(0x01),ymm0,ymm0,ymm11) // swap halves: low = a4b4a5b5
    vperm2f128(imm(0x01),ymm2,ymm2,ymm12) // swap halves: low = a6b6a7b7
    vshufpd(imm(0x01), xmm0, xmm0, xmm1) // a1b1 in the low 64 bits
    vshufpd(imm(0x01), xmm2, xmm2, xmm10) // a3b3 in the low 64 bits
    vmovsd(mem(rcx),xmm4) // load C rows 0-1 of column 0
    vmovsd(mem(rcx, rsi, 1),xmm6) // load C rows 0-1 of column 1
    vfmadd231ps(xmm4, xmm3, xmm0)
    vfmadd231ps(xmm6, xmm3, xmm1)
    vmovsd(xmm0, mem(rcx)) // store rows 0-1 of column 0
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 1
    vmovsd(mem(rcx, rsi, 2),xmm4) // load C rows 0-1 of column 2
    vmovsd(mem(rcx, rax, 1),xmm6) // load C rows 0-1 of column 3
    vfmadd231ps(xmm4, xmm3, xmm2)
    vfmadd231ps(xmm6, xmm3, xmm10)
    vmovsd(xmm2, mem(rcx, rsi, 2)) // store rows 0-1 of column 2
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 3

    lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

    vshufpd(imm(0x01), xmm11, xmm11, xmm1) // a5b5 in the low 64 bits
    vshufpd(imm(0x01), xmm12, xmm12, xmm10) // a7b7 in the low 64 bits
    vmovsd(mem(rcx),xmm4)
    vmovsd(mem(rcx, rsi, 1),xmm6)
    vfmadd231ps(xmm4, xmm3, xmm11)
    vfmadd231ps(xmm6, xmm3, xmm1)
    vmovsd(xmm11, mem(rcx)) // store rows 0-1 of column 4
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 5
    vmovsd(mem(rcx, rsi, 2),xmm4)
    vmovsd(mem(rcx, rax, 1),xmm6)
    vfmadd231ps(xmm4, xmm3, xmm12)
    vfmadd231ps(xmm6, xmm3, xmm10)
    vmovsd(xmm12, mem(rcx, rsi, 2)) // store rows 0-1 of column 6
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 7

    lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

    /* columns 8-15: same scheme; lane index K corresponds to column 8+K. */
    vunpcklps(ymm7, ymm5, ymm0) // a0b0a1b1 a4b4a5b5
    vunpckhps(ymm7, ymm5, ymm2) // a2b2a3b3 a6b6a7b7
    vperm2f128(imm(0x01),ymm0,ymm0,ymm11)
    vperm2f128(imm(0x01),ymm2,ymm2,ymm12)
    vshufpd(imm(0x01), xmm0, xmm0, xmm1) // a1b1
    vshufpd(imm(0x01), xmm2, xmm2, xmm10) // a3b3
    vmovsd(mem(rcx),xmm4)
    vmovsd(mem(rcx, rsi, 1),xmm6)
    vfmadd231ps(xmm4, xmm3, xmm0)
    vfmadd231ps(xmm6, xmm3, xmm1)
    vmovsd(xmm0, mem(rcx)) // store rows 0-1 of column 8
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 9
    vmovsd(mem(rcx, rsi, 2),xmm4)
    vmovsd(mem(rcx, rax, 1),xmm6)
    vfmadd231ps(xmm4, xmm3, xmm2)
    vfmadd231ps(xmm6, xmm3, xmm10)
    vmovsd(xmm2, mem(rcx, rsi, 2)) // store rows 0-1 of column 10
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 11

    lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

    vshufpd(imm(0x01), xmm11, xmm11, xmm1) // a5b5
    vshufpd(imm(0x01), xmm12, xmm12, xmm10) // a7b7
    vmovsd(mem(rcx),xmm4)
    vmovsd(mem(rcx, rsi, 1),xmm6)
    vfmadd231ps(xmm4, xmm3, xmm11)
    vfmadd231ps(xmm6, xmm3, xmm1)
    vmovsd(xmm11, mem(rcx)) // store rows 0-1 of column 12
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 13
    vmovsd(mem(rcx, rsi, 2),xmm4)
    vmovsd(mem(rcx, rax, 1),xmm6)
    vfmadd231ps(xmm4, xmm3, xmm12)
    vfmadd231ps(xmm6, xmm3, xmm10)
    vmovsd(xmm12, mem(rcx, rsi, 2)) // store rows 0-1 of column 14
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 15

    jmp(.SDONE) // jump to end.

    label(.SBETAZERO)

    // beta == 0: C is overwritten without being read.
    cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
    jz(.SCOLSTORBZ) // jump to column storage case

    label(.SROWSTORBZ)

    vmovups(ymm4, mem(rcx)) // row 0, columns 0-7
    vmovups(ymm5, mem(rcx, rsi, 8)) // row 0, columns 8-15
    add(rdi, rcx) // c += rs_c

    vmovups(ymm6, mem(rcx)) // row 1, columns 0-7
    vmovups(ymm7, mem(rcx, rsi, 8)) // row 1, columns 8-15
    //add(rdi, rcx)

    jmp(.SDONE) // jump to end.

    label(.SCOLSTORBZ)

    /* columns 0-7 (a = row 0, b = row 1) */
    vunpcklps(ymm6, ymm4, ymm0) // a0b0a1b1 a4b4a5b5
    vunpckhps(ymm6, ymm4, ymm2) // a2b2a3b3 a6b6a7b7
    vshufpd(imm(0x01), xmm0, xmm0, xmm1) // a1b1
    vshufpd(imm(0x01), xmm2, xmm2, xmm10) // a3b3
    vmovsd(xmm0, mem(rcx)) // store rows 0-1 of column 0
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 1
    vmovsd(xmm2, mem(rcx, rsi, 2)) // store rows 0-1 of column 2
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 3

    lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

    vperm2f128(imm(0x01),ymm0,ymm0,ymm0) // swap halves: low = a4b4a5b5
    vperm2f128(imm(0x01),ymm2,ymm2,ymm2) // swap halves: low = a6b6a7b7
    vshufpd(imm(0x01), xmm0, xmm0, xmm1) // a5b5
    vshufpd(imm(0x01), xmm2, xmm2, xmm10) // a7b7
    vmovsd(xmm0, mem(rcx)) // store rows 0-1 of column 4
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 5
    vmovsd(xmm2, mem(rcx, rsi, 2)) // store rows 0-1 of column 6
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 7

    lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

    /* columns 8-15: lane index K corresponds to column 8+K. */
    vunpcklps(ymm7, ymm5, ymm0) // a0b0a1b1 a4b4a5b5
    vunpckhps(ymm7, ymm5, ymm2) // a2b2a3b3 a6b6a7b7
    vshufpd(imm(0x01), xmm0, xmm0, xmm1) // a1b1
    vshufpd(imm(0x01), xmm2, xmm2, xmm10) // a3b3
    vmovsd(xmm0, mem(rcx)) // store rows 0-1 of column 8
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 9
    vmovsd(xmm2, mem(rcx, rsi, 2)) // store rows 0-1 of column 10
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 11

    lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

    vperm2f128(imm(0x01),ymm0,ymm0,ymm0)
    vperm2f128(imm(0x01),ymm2,ymm2,ymm2)
    vshufpd(imm(0x01), xmm0, xmm0, xmm1) // a5b5
    vshufpd(imm(0x01), xmm2, xmm2, xmm10) // a7b7
    vmovsd(xmm0, mem(rcx)) // store rows 0-1 of column 12
    vmovsd(xmm1, mem(rcx, rsi, 1)) // store rows 0-1 of column 13
    vmovsd(xmm2, mem(rcx, rsi, 2)) // store rows 0-1 of column 14
    vmovsd(xmm10, mem(rcx, rax, 1)) // store rows 0-1 of column 15

    label(.SDONE)

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter),
      [k_left] "m" (k_left),
      [a] "m" (a),
      [rs_a] "m" (rs_a),
      [cs_a] "m" (cs_a),
      [b] "m" (b),
      [rs_b] "m" (rs_b),
      [cs_b] "m" (cs_b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c)/*,
      [a_next] "m" (a_next),
      [b_next] "m" (b_next)*/
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

void bli_sgemmsup_rv_zen_asm_1x16
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    //void* a_next = bli_auxinfo_next_a( data );
    //void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;
    uint64_t rs_a = rs_a0;
    uint64_t cs_a = cs_a0;
    uint64_t rs_b = rs_b0;
    uint64_t cs_b = cs_b0;
    uint64_t rs_c = rs_c0;
    uint64_t cs_c = cs_c0;

    // -------------------------------------------------------------------------

    begin_asm()

    vxorps(ymm4, ymm4, ymm4)
    vxorps(ymm5, ymm5, ymm5)
    vxorps(ymm6, ymm6, ymm6)
    vxorps(ymm7, ymm7, ymm7)
    vxorps(ymm8, ymm8, ymm8)
    vxorps(ymm9, ymm9, ymm9)
    vxorps(ymm10, ymm10, ymm10)
    vxorps(ymm11, ymm11, ymm11)
    vxorps(ymm12, ymm12, ymm12)
    vxorps(ymm13, ymm13, ymm13)
    vxorps(ymm14, ymm14, ymm14)
    vxorps(ymm15, ymm15, ymm15)

    mov(var(a), rax) // load address of a.
    mov(var(rs_a), r8) // load rs_a
    mov(var(cs_a), r9) // load cs_a
    lea(mem(, r8, 4), r8) // rs_a *= sizeof(float)
    lea(mem(, r9, 4), r9) // cs_a *= sizeof(float)

    mov(var(b), rbx) // load address of b.
mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 0*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 7*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), ymm3, ymm4) vmovups(ymm4, mem(rcx)) vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm5) vmovups(ymm5, mem(rcx, rsi, 8)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORED) vextractf128(imm(0x0), ymm4, xmm0)//c0-c3 vmovss(mem(rcx),xmm7) vmovss(mem(rcx, rsi, 1),xmm6) vmovss(mem(rcx, rsi, 2),xmm11) vmovss(mem(rcx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm7, xmm3, xmm0)//e0 vfmadd231ps(xmm6, xmm3, xmm1)//e1 vfmadd231ps(xmm11, xmm3, xmm2)//e2 vfmadd231ps(xmm10, xmm3, xmm14)//e3 vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm4, xmm0)//e4-e7 vmovss(mem(rcx),xmm4) vmovss(mem(rcx, rsi, 1),xmm6) vmovss(mem(rcx, rsi, 2),xmm8) vmovss(mem(rcx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm4, xmm3, xmm0)//e4 vfmadd231ps(xmm6, xmm3, xmm1)//e5 vfmadd231ps(xmm8, xmm3, xmm2)//e6 vfmadd231ps(xmm10, xmm3, xmm14)//e7 vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) 
lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c vextractf128(imm(0x0), ymm5, xmm0)//c0-c3 vmovss(mem(rcx),xmm4) vmovss(mem(rcx, rsi, 1),xmm6) vmovss(mem(rcx, rsi, 2),xmm11) vmovss(mem(rcx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm4, xmm3, xmm0)//e0 vfmadd231ps(xmm6, xmm3, xmm1)//e1 vfmadd231ps(xmm11, xmm3, xmm2)//e2 vfmadd231ps(xmm10, xmm3, xmm14)//e3 vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c vextractf128(imm(0x1), ymm5, xmm0)//e4-e7 vmovss(mem(rcx),xmm4) vmovss(mem(rcx, rsi, 1),xmm6) vmovss(mem(rcx, rsi, 2),xmm8) vmovss(mem(rcx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm4, xmm3, xmm0)//e4 vfmadd231ps(xmm6, xmm3, xmm1)//e5 vfmadd231ps(xmm8, xmm3, xmm2)//e6 vfmadd231ps(xmm10, xmm3, xmm14)//e7 vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) vmovups(ymm5, mem(rcx, rsi, 8)) //add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) vextractf128(imm(0x0), ymm4, xmm0)//c0-c3 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm4, xmm0)//e4-e7 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) vextractf128(imm(0x0), ymm5, xmm0)//c0-c3 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm5, xmm0)//e4-e7 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_zen_asm_6x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, 
inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 5*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*16), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*16), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*16), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*16), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) 
vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*16), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm14, ymm14) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), ymm3, ymm4) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm6) vmovups(ymm6, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm8) vmovups(ymm8, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm10) vmovups(ymm10, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm12) vmovups(ymm12, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm14) vmovups(ymm14, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORED) /****6x8 tile is transposed and saved in col major as 8x6*****/ vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) vunpcklps(ymm14, 
ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vpermilpd(imm(1),xmm0,xmm5)//e1f1 vpermilpd(imm(1),xmm2,xmm6)//e5f5 vfmadd231ps(mem(rdx), xmm3, xmm0) vfmadd231ps(mem(rdx, rsi, 4), xmm3, xmm2) vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) lea(mem(rdx, rsi, 1), rdx) vfmadd231ps(mem(rdx), xmm3, xmm5) vfmadd231ps(mem(rdx, rsi, 4), xmm3, xmm6) vmovlpd(xmm5, mem(rdx)) // store ( gamma41..gamma51 ) vmovlpd(xmm6, mem(rdx, rsi, 4)) // store ( gamma45..gamma55 ) lea(mem(rdx, rsi, 1), rdx) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vpermilpd(imm(1),xmm0,xmm5) vpermilpd(imm(1),xmm2,xmm6) vfmadd231ps(mem(rdx), xmm3, xmm0) vfmadd231ps(mem(rdx, rsi, 4), xmm3, xmm2) vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma46..gamma56 ) lea(mem(rdx, rsi, 1), rdx) vfmadd231ps(mem(rdx), xmm3, xmm5) vfmadd231ps(mem(rdx, rsi, 4), xmm3, xmm6) vmovlpd(xmm5, mem(rdx)) // store ( gamma43..gamma53 ) vmovlpd(xmm6, mem(rdx, rsi, 4)) // store ( gamma47..gamma57 ) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vmovups(ymm6, mem(rcx)) add(rdi, rcx) vmovups(ymm8, mem(rcx)) add(rdi, rcx) vmovups(ymm10, mem(rcx)) add(rdi, rcx) vmovups(ymm12, mem(rcx)) add(rdi, rcx) vmovups(ymm14, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) /******************top right tile 8x2***************************/ vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) lea(mem(rdx, rsi, 1), rdx) vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma45..gamma55 ) lea(mem(rdx, rsi, 1), rdx) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma46..gamma56 ) lea(mem(rdx, rsi, 1), rdx) vmovhpd(xmm0, mem(rdx)) // store ( gamma43..gamma53 ) vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma47..gamma57 ) label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" 
(cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_zen_asm_5x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a mov(var(b), rbx) // load address of b. 
mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 4*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 5*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. 
label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), ymm3, ymm4) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm6) vmovups(ymm6, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm8) vmovups(ymm8, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm10) vmovups(ymm10, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm12) vmovups(ymm12, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a4b4a5b5 vunpcklps(ymm10, ymm8, ymm1) //c0d0c1d1 c4d4c5d5 vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c /********************************************/ vextractf128(imm(0x0), ymm12, xmm0)//e0-e3 vmovss(mem(rdx),xmm4) vmovss(mem(rdx, rsi, 1),xmm6) vmovss(mem(rdx, rsi, 2),xmm8) vmovss(mem(rdx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm4, xmm3, xmm0)//e0 vfmadd231ps(xmm6, xmm3, xmm1)//e1 vfmadd231ps(xmm8, xmm3, xmm2)//e2 vfmadd231ps(xmm10, xmm3, xmm14)//e3 vmovss(xmm0, mem(rdx)) 
vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c vextractf128(imm(0x1), ymm12, xmm0)//e4-e7 vmovss(mem(rdx),xmm4) vmovss(mem(rdx, rsi, 1),xmm6) vmovss(mem(rdx, rsi, 2),xmm8) vmovss(mem(rdx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm4, xmm3, xmm0)//e0 vfmadd231ps(xmm6, xmm3, xmm1)//e1 vfmadd231ps(xmm8, xmm3, xmm2)//e2 vfmadd231ps(xmm10, xmm3, xmm14)//e3 vmovss(xmm0, mem(rdx)) vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vmovups(ymm6, mem(rcx)) add(rdi, rcx) vmovups(ymm8, mem(rcx)) add(rdi, rcx) vmovups(ymm10, mem(rcx)) add(rdi, rcx) vmovups(ymm12, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a4b4a5b5 vunpcklps(ymm10, ymm8, ymm1) //c0d0c1d1 c4d4c5d5 vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c /********************************************/ vextractf128(imm(0x0), ymm12, xmm0)//e0-e3 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rdx)) vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c vextractf128(imm(0x1), ymm12, xmm0)//e4-e7 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rdx)) vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] 
"m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_zen_asm_4x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a mov(var(b), rbx) // load address of b. 
mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 5*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), ymm3, ymm4) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm6) vmovups(ymm6, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm8) vmovups(ymm8, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm10) vmovups(ymm10, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a4b4a5b5 vunpcklps(ymm10, ymm8, ymm1) //c0d0c1d1 c4d4c5d5 vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vmovups(ymm6, mem(rcx)) add(rdi, rcx) vmovups(ymm8, mem(rcx)) add(rdi, rcx) vmovups(ymm10, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a4b4a5b5 vunpcklps(ymm10, ymm8, ymm1) //c0d0c1d1 c4d4c5d5 vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_zen_asm_3x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* 
restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 5*8)) // prefetch c + 2*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 5*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), ymm3, ymm4) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm6) vmovups(ymm6, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm8) vmovups(ymm8, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a2b2a3b3 vunpckhps(ymm6, ymm4, ymm2) //a2b2a3b3 a6b6a7b7 vperm2f128(imm(0x01),ymm0,ymm0,ymm11) vperm2f128(imm(0x01),ymm2,ymm2,ymm12) vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a1b1 vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a3b3 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rsi, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm0) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(mem(rcx, rsi, 2),xmm4) vmovsd(mem(rcx, rax, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm10) vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) lea(mem(rcx, rsi, 4), rcx) // rcx += cs_c vshufpd(imm(0x01), xmm11, xmm11, xmm1)//a1b1 vshufpd(imm(0x01), xmm12, xmm12, xmm10)//a3b3 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rsi, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm11) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm11, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(mem(rcx, rsi, 2),xmm4) vmovsd(mem(rcx, rax, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm12) vfmadd231ps(xmm6, xmm3, xmm10) vmovsd(xmm12, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) /********************************************/ vextractf128(imm(0x0), ymm8, xmm0)//c0-c3 vmovss(mem(rdx),xmm4) vmovss(mem(rdx, rsi, 1),xmm6) vmovss(mem(rdx, rsi, 2),xmm11) vmovss(mem(rdx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm4, xmm3, xmm0)//e0 vfmadd231ps(xmm6, xmm3, xmm1)//e1 vfmadd231ps(xmm11, xmm3, xmm2)//e2 vfmadd231ps(xmm10, xmm3, xmm14)//e3 vmovss(xmm0, mem(rdx)) vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // rcx += cs_c vextractf128(imm(0x1), ymm8, 
xmm0)//c0-c3 vmovss(mem(rdx),xmm4) vmovss(mem(rdx, rsi, 1),xmm6) vmovss(mem(rdx, rsi, 2),xmm11) vmovss(mem(rdx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm4, xmm3, xmm0)//e0 vfmadd231ps(xmm6, xmm3, xmm1)//e1 vfmadd231ps(xmm11, xmm3, xmm2)//e2 vfmadd231ps(xmm10, xmm3, xmm14)//e3 vmovss(xmm0, mem(rdx)) vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vmovups(ymm6, mem(rcx)) add(rdi, rcx) vmovups(ymm8, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a2b2a3b3 vunpckhps(ymm6, ymm4, ymm2) //a2b2a3b3 a6b6a7b7 vperm2f128(imm(0x01),ymm0,ymm0,ymm11) vperm2f128(imm(0x01),ymm2,ymm2,ymm12) vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a1b1 vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a3b3 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rsi, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm0) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(mem(rcx, rsi, 2),xmm4) vmovsd(mem(rcx, rax, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm10) vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) lea(mem(rcx, rsi, 4), rcx) // rcx += cs_c vshufpd(imm(0x01), xmm11, xmm11, xmm1)//a1b1 vshufpd(imm(0x01), xmm12, xmm12, xmm10)//a3b3 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rsi, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm11) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm11, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(mem(rcx, rsi, 2),xmm4) vmovsd(mem(rcx, rax, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm12) vfmadd231ps(xmm6, 
xmm3, xmm10) vmovsd(xmm12, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) /********************************************/ vextractf128(imm(0x0), ymm8, xmm0)//c0-c3 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rdx)) vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c vextractf128(imm(0x1), ymm8, xmm0)//c4-c7 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rdx)) vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_zen_asm_2x8 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 5*8)) // prefetch c + 1*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 5*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0,xmm0,xmm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), ymm3, ymm4) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm6) vmovups(ymm6, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORED) vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a2b2a3b3 vunpckhps(ymm6, ymm4, ymm2) //a2b2a3b3 a6b6a7b7 vperm2f128(imm(0x01),ymm0,ymm0,ymm11) vperm2f128(imm(0x01),ymm2,ymm2,ymm12) vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a1b1 vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a3b3 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rsi, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm0) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(mem(rcx, rsi, 2),xmm4) vmovsd(mem(rcx, rax, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm10) vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) lea(mem(rcx, rsi, 4), rcx) // rcx += cs_c vshufpd(imm(0x01), xmm11, xmm11, xmm1)//a1b1 vshufpd(imm(0x01), xmm12, xmm12, xmm10)//a3b3 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rsi, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm11) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm11, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(mem(rcx, rsi, 
2),xmm4)
	vmovsd(mem(rcx, rax, 1),xmm6)
	vfmadd231ps(xmm4, xmm3, xmm12)
	vfmadd231ps(xmm6, xmm3, xmm10)
	vmovsd(xmm12, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 )
	vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 )

	jmp(.SDONE) // jump to end.

	label(.SBETAZERO)

	cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORBZ) // jump to column storage case

	label(.SROWSTORBZ)

	vmovups(ymm4, mem(rcx))
	add(rdi, rcx)
	vmovups(ymm6, mem(rcx))

	jmp(.SDONE) // jump to end.

	label(.SCOLSTORBZ)

	vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a2b2a3b3
	vunpckhps(ymm6, ymm4, ymm2) //a2b2a3b3 a6b6a7b7
	vperm2f128(imm(0x01),ymm0,ymm0,ymm11)
	vperm2f128(imm(0x01),ymm2,ymm2,ymm12)

	vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a1b1
	vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a3b3
	vmovsd(mem(rcx),xmm4)
	vmovsd(mem(rcx, rsi, 1),xmm6)
	vfmadd231ps(xmm4, xmm3, xmm0)
	vfmadd231ps(xmm6, xmm3, xmm1)
	vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 )
	vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 )
	vmovsd(mem(rcx, rsi, 2),xmm4)
	vmovsd(mem(rcx, rax, 1),xmm6)
	vfmadd231ps(xmm4, xmm3, xmm2)
	vfmadd231ps(xmm6, xmm3, xmm10)
	vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 )
	vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 )

	lea(mem(rcx, rsi, 4), rcx) // rcx += cs_c

	vshufpd(imm(0x01), xmm11, xmm11, xmm1)//a1b1
	vshufpd(imm(0x01), xmm12, xmm12, xmm10)//a3b3
	vmovsd(mem(rcx),xmm4)
	vmovsd(mem(rcx, rsi, 1),xmm6)
	vfmadd231ps(xmm4, xmm3, xmm11)
	vfmadd231ps(xmm6, xmm3, xmm1)
	vmovsd(xmm11, mem(rcx)) // store ( gamma00..gamma10 )
	vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 )
	vmovsd(mem(rcx, rsi, 2),xmm4)
	vmovsd(mem(rcx, rax, 1),xmm6)
	vfmadd231ps(xmm4, xmm3, xmm12)
	vfmadd231ps(xmm6, xmm3, xmm10)
	vmovsd(xmm12, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 )
	vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 )

	label(.SDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

// bli_sgemmsup_rv_zen_asm_1x8
//
// Single-precision kernel for one row of c, eight columns wide:
//
//   c[0, 0:8] := beta * c[0, 0:8] + alpha * sum_{p < k0} a[0, p] * b[p, 0:8]
//
// Each k step loads eight contiguous floats of b (then b += rs_b), broadcasts
// one element of a (then a += cs_a) and accumulates the product in ymm4. The
// k loop is unrolled by four (k_iter = k0 / 4) with a one-step cleanup loop
// (k_left = k0 % 4).
//
// conja, conjb, m0, n0, data and cntx are not referenced. rs_a0 and cs_b0 are
// copied to locals, but no instruction depends on their values (rs_a is
// loaded and scaled into r8 and never used; cs_b is only listed as an
// operand).
//
// The layout of c is selected by rs_c: rs_c == 1 takes the "column storage"
// paths (eight scalar accesses spaced cs_c apart); any other value takes the
// "row storage" paths, which access the eight floats contiguously. When
// beta == 0, c is written without being read.
//
// NOTE(review): the contiguous 8-float accesses presumably rely on the caller
// guaranteeing cs_b == 1 (and cs_c == 1 on the row storage paths) -- confirm.
void bli_sgemmsup_rv_zen_asm_1x8
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;
	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	begin_asm()

	// Only ymm4 is used as an accumulator by this kernel; the remaining
	// registers are cleared as well.
	vxorps(ymm4, ymm4, ymm4)
	vxorps(ymm5, ymm5, ymm5)
	vxorps(ymm6, ymm6, ymm6)
	vxorps(ymm7, ymm7, ymm7)
	vxorps(ymm8, ymm8, ymm8)
	vxorps(ymm9, ymm9, ymm9)
	vxorps(ymm10, ymm10, ymm10)
	vxorps(ymm11, ymm11, ymm11)
	vxorps(ymm12, ymm12, ymm12)
	vxorps(ymm13, ymm13, ymm13)
	vxorps(ymm14, ymm14, ymm14)
	vxorps(ymm15, ymm15, ymm15)

	mov(var(a), rax) // load address of a.
	mov(var(rs_a), r8) // load rs_a
	mov(var(cs_a), r9) // load cs_a
	lea(mem(, r8, 4), r8) // rs_a *= sizeof(float)
	lea(mem(, r9, 4), r9) // cs_a *= sizeof(float)

	mov(var(b), rbx) // load address of b.
	mov(var(rs_b), r10) // load rs_b
	lea(mem(, r10, 4), r10) // rs_b *= sizeof(float)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), rcx) // load address of c
	mov(var(rs_c), rdi) // load rs_c
	lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)

	prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*rs_c

	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.SCONSIDKLEFT) // if i == 0, jump to code that
	// contains the k_left loop.

	label(.SLOOPKITER) // MAIN LOOP

	// ---------------------------------- iteration 0
	vmovups(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;
	vbroadcastss(mem(rax ), ymm2)
	add(r9, rax) // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm4)

	// ---------------------------------- iteration 1
	vmovups(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;
	vbroadcastss(mem(rax ), ymm2)
	add(r9, rax) // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm4)

	// ---------------------------------- iteration 2
	vmovups(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;
	vbroadcastss(mem(rax ), ymm2)
	add(r9, rax) // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm4)

	// ---------------------------------- iteration 3
	vmovups(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;
	vbroadcastss(mem(rax ), ymm2)
	add(r9, rax) // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm4)

	dec(rsi) // i -= 1;
	jne(.SLOOPKITER) // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi) // i = k_left;
	test(rsi, rsi) // check i via logical AND.
	je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
	// else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT) // EDGE LOOP

	vmovups(mem(rbx, 0*32), ymm0)
	add(r10, rbx) // b += rs_b;
	vbroadcastss(mem(rax ), ymm2)
	add(r9, rax) // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm4)

	dec(rsi) // i -= 1;
	jne(.SLOOPKLEFT) // iterate again if i != 0.

	label(.SPOSTACCUM)

	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastss(mem(rbx), ymm3) // load beta and duplicate

	vmulps(ymm0, ymm4, ymm4) // scale by alpha

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float)
	lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;

	// now avoid loading C if beta == 0
	vxorps(xmm0,xmm0,xmm0) // set ymm0 to zero.
	vucomiss(xmm0, xmm3) // set ZF if beta == 0.
	je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case

	cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORED) // jump to column storage case

	label(.SROWSTORED)

	vfmadd231ps(mem(rcx), ymm3, ymm4) // ymm4 += beta * c[0, 0:8]
	vmovups(ymm4, mem(rcx))

	jmp(.SDONE) // jump to end.

	label(.SCOLSTORED)

	/********************************************/
	// Elements of the row are cs_c apart, so each one is updated on its
	// own: move element j of the 128-bit half into lane 0 of a scratch
	// register, add beta * c there, and store lane 0 only.
	vextractf128(imm(0x0), ymm4, xmm0)//c0-c3
	vmovss(mem(rcx),xmm8)
	vmovss(mem(rcx, rsi, 1),xmm6)
	vmovss(mem(rcx, rsi, 2),xmm11)
	vmovss(mem(rcx, rax, 1),xmm10)
	vshufps(imm(0x01), xmm0, xmm0,xmm1)
	vshufps(imm(0x02), xmm0, xmm0,xmm2)
	vshufps(imm(0x03), xmm0, xmm0,xmm14)
	vfmadd231ps(xmm8, xmm3, xmm0)//e0
	vfmadd231ps(xmm6, xmm3, xmm1)//e1
	vfmadd231ps(xmm11, xmm3, xmm2)//e2
	vfmadd231ps(xmm10, xmm3, xmm14)//e3
	vmovss(xmm0, mem(rcx))
	vmovss(xmm1, mem(rcx, rsi, 1))
	vmovss(xmm2, mem(rcx, rsi, 2))
	vmovss(xmm14, mem(rcx, rax, 1))

	lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

	// The upper half of ymm4 is extracted before xmm4 is reused as a
	// scratch register for the loads below.
	vextractf128(imm(0x1), ymm4, xmm0)//e4-e7
	vmovss(mem(rcx),xmm4)
	vmovss(mem(rcx, rsi, 1),xmm6)
	vmovss(mem(rcx, rsi, 2),xmm8)
	vmovss(mem(rcx, rax, 1),xmm10)
	vshufps(imm(0x01), xmm0, xmm0,xmm1)
	vshufps(imm(0x02), xmm0, xmm0,xmm2)
	vshufps(imm(0x03), xmm0, xmm0,xmm14)
	vfmadd231ps(xmm4, xmm3, xmm0)//e4
	vfmadd231ps(xmm6, xmm3, xmm1)//e5
	vfmadd231ps(xmm8, xmm3, xmm2)//e6
	vfmadd231ps(xmm10, xmm3, xmm14)//e7
	vmovss(xmm0, mem(rcx))
	vmovss(xmm1, mem(rcx, rsi, 1))
	vmovss(xmm2, mem(rcx, rsi, 2))
	vmovss(xmm14, mem(rcx, rax, 1))

	jmp(.SDONE) // jump to end.

	label(.SBETAZERO)

	cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORBZ) // jump to column storage case

	label(.SROWSTORBZ)

	vmovups(ymm4, mem(rcx))

	jmp(.SDONE) // jump to end.

	label(.SCOLSTORBZ)

	// Same element-by-element scatter as .SCOLSTORED, without reading c.
	vextractf128(imm(0x0), ymm4, xmm0)//c0-c3
	vshufps(imm(0x01), xmm0, xmm0,xmm1)
	vshufps(imm(0x02), xmm0, xmm0,xmm2)
	vshufps(imm(0x03), xmm0, xmm0,xmm14)
	vmovss(xmm0, mem(rcx))
	vmovss(xmm1, mem(rcx, rsi, 1))
	vmovss(xmm2, mem(rcx, rsi, 2))
	vmovss(xmm14, mem(rcx, rax, 1))

	lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c

	vextractf128(imm(0x1), ymm4, xmm0)//c4-c7
	vshufps(imm(0x01), xmm0, xmm0,xmm1)
	vshufps(imm(0x02), xmm0, xmm0,xmm2)
	vshufps(imm(0x03), xmm0, xmm0,xmm14)
	vmovss(xmm0, mem(rcx))
	vmovss(xmm1, mem(rcx, rsi, 1))
	vmovss(xmm2, mem(rcx, rsi, 2))
	vmovss(xmm14, mem(rcx, rax, 1))

	label(.SDONE)

	end_asm(
	: // output operands (none)
	: // input operands
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a] "m" (a),
	  [rs_a] "m" (rs_a),
	  [cs_a] "m" (cs_a),
	  [b] "m" (b),
	  [rs_b] "m" (rs_b),
	  [cs_b] "m" (cs_b),
	  [alpha] "m" (alpha),
	  [beta] "m" (beta),
	  [c] "m" (c),
	  [rs_c] "m" (rs_c),
	  [cs_c] "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)
}

void bli_sgemmsup_rv_zen_asm_6x4
     (
       conj_t conja,
       conj_t conjb,
       dim_t m0,
       dim_t n0,
       dim_t k0,
       float* restrict alpha,
       float* restrict a, inc_t rs_a0, inc_t cs_a0,
       float* restrict b, inc_t rs_b0, inc_t cs_b0,
       float* restrict beta,
       float* restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
	//void* a_next = bli_auxinfo_next_a( data );
	//void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;
	uint64_t rs_a = rs_a0;
	uint64_t cs_a = cs_a0;
	uint64_t rs_b = rs_b0;
	uint64_t cs_b = cs_b0;
	uint64_t rs_c = rs_c0;
	uint64_t cs_c = cs_c0;

	// -------------------------------------------------------------------------

	// 6x4 single-precision kernel body:
	//   c[0:6, 0:4] := beta * c + alpha * a[0:6, 0:k0] * b[0:k0, 0:4]
	// The six result rows accumulate in xmm4, xmm6, xmm8, xmm10, xmm12 and
	// xmm14 (four floats each). Every k step loads four contiguous floats of
	// b (then b += rs_b) and broadcasts six elements of a spaced rs_a apart
	// (then a += cs_a). The k loop is unrolled by four (k_iter) with a
	// one-step cleanup loop (k_left).
	// rs_c == 1 selects the column-storage paths; any other value selects
	// the row-storage paths. When beta == 0, c is written without being read.
	// NOTE(review): the contiguous 4-float accesses presumably rely on
	// cs_b == 1 (and cs_c == 1 on the row-storage paths) -- confirm.

	begin_asm()

	vxorps(ymm4, ymm4, ymm4)
	vxorps(ymm5, ymm5, ymm5)
	vxorps(ymm6, ymm6, ymm6)
	vxorps(ymm7, ymm7, ymm7)
	vxorps(ymm8, ymm8, ymm8)
	vxorps(ymm9, ymm9, ymm9)
	vxorps(ymm10, ymm10, ymm10)
	vxorps(ymm11, ymm11, ymm11)
	vxorps(ymm12, ymm12, ymm12)
	vxorps(ymm13, ymm13, ymm13)
	vxorps(ymm14, ymm14, ymm14)
	vxorps(ymm15, ymm15, ymm15)

	mov(var(a), rax) // load address of a.
	mov(var(rs_a), r8) // load rs_a
	mov(var(cs_a), r9) // load cs_a
	lea(mem(, r8, 4), r8) // rs_a *= sizeof(float)
	lea(mem(, r9, 4), r9) // cs_a *= sizeof(float)

	lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a
	lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a

	mov(var(b), rbx) // load address of b.
	mov(var(rs_b), r10) // load rs_b
	lea(mem(, r10, 4), r10) // rs_b *= sizeof(float)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), rcx) // load address of c
	mov(var(rs_c), rdi) // load rs_c
	lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float)

	cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
	jz(.SCOLPFETCH) // jump to column storage case
	label(.SROWPFETCH) // row-stored prefetching on c

	lea(mem(rcx, rdi, 2), rdx) //
	lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c;
	prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
	prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*rs_c
	prefetch(0, mem(rdx, rdi, 1, 3*8)) // prefetch c + 4*rs_c
	prefetch(0, mem(rdx, rdi, 2, 3*8)) // prefetch c + 5*rs_c

	jmp(.SPOSTPFETCH) // jump to end of prefetching c
	label(.SCOLPFETCH) // column-stored prefetching c

	mov(var(cs_c), rsi) // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float)
	lea(mem(rcx, rsi, 2), rdx) //
	lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c;
	prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(rcx, rsi, 2, 5*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c

	label(.SPOSTPFETCH) // done prefetching c

	// rsi and rdx were scratch during prefetching; rsi becomes the loop
	// counter here and rdx is recomputed at .SPOSTACCUM.
	mov(var(k_iter), rsi) // i = k_iter;
	test(rsi, rsi) // check i via logical AND.
	je(.SCONSIDKLEFT) // if i == 0, jump to code that
	// contains the k_left loop.

	label(.SLOOPKITER) // MAIN LOOP

	// ---------------------------------- iteration 0
	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 4), xmm2)
	vbroadcastss(mem(rax, r15, 1), xmm3)
	add(r9, rax) // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm12)
	vfmadd231ps(xmm0, xmm3, xmm14)

	// ---------------------------------- iteration 1
	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 4), xmm2)
	vbroadcastss(mem(rax, r15, 1), xmm3)
	add(r9, rax) // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm12)
	vfmadd231ps(xmm0, xmm3, xmm14)

	// ---------------------------------- iteration 2
	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 4), xmm2)
	vbroadcastss(mem(rax, r15, 1), xmm3)
	add(r9, rax) // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm12)
	vfmadd231ps(xmm0, xmm3, xmm14)

	// ---------------------------------- iteration 3
	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 4), xmm2)
	vbroadcastss(mem(rax, r15, 1), xmm3)
	add(r9, rax) // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm12)
	vfmadd231ps(xmm0, xmm3, xmm14)

	dec(rsi) // i -= 1;
	jne(.SLOOPKITER) // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi) // i = k_left;
	test(rsi, rsi) // check i via logical AND.
	je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
	// else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT) // EDGE LOOP

	vmovups(mem(rbx, 0*32), xmm0)
	add(r10, rbx) // b += rs_b;

	vbroadcastss(mem(rax ), xmm2)
	vbroadcastss(mem(rax, r8, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm4)
	vfmadd231ps(xmm0, xmm3, xmm6)

	vbroadcastss(mem(rax, r8, 2), xmm2)
	vbroadcastss(mem(rax, r13, 1), xmm3)
	vfmadd231ps(xmm0, xmm2, xmm8)
	vfmadd231ps(xmm0, xmm3, xmm10)

	vbroadcastss(mem(rax, r8, 4), xmm2)
	vbroadcastss(mem(rax, r15, 1), xmm3)
	add(r9, rax) // a += cs_a;
	vfmadd231ps(xmm0, xmm2, xmm12)
	vfmadd231ps(xmm0, xmm3, xmm14)

	dec(rsi) // i -= 1;
	jne(.SLOOPKLEFT) // iterate again if i != 0.

	label(.SPOSTACCUM)

	mov(var(alpha), rax) // load address of alpha
	mov(var(beta), rbx) // load address of beta
	vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
	vbroadcastss(mem(rbx), ymm3) // load beta and duplicate

	vmulps(xmm0, xmm4, xmm4) // scale by alpha
	vmulps(xmm0, xmm6, xmm6)
	vmulps(xmm0, xmm8, xmm8)
	vmulps(xmm0, xmm10, xmm10)
	vmulps(xmm0, xmm12, xmm12)
	vmulps(xmm0, xmm14, xmm14)

	mov(var(cs_c), rsi) // load cs_c
	lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float)

	// rdx is only used by the column-storage paths, to address rows 4-5.
	lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c;

	lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c;

	// now avoid loading C if beta == 0
	vxorps(xmm0, xmm0, xmm0) // set ymm0 to zero.
	vucomiss(xmm0, xmm3) // set ZF if beta == 0.
	je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case

	cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORED) // jump to column storage case

	label(.SROWSTORED)

	// One 4-float row per step: row += beta * c, store, advance by rs_c.
	vfmadd231ps(mem(rcx), xmm3, xmm4)
	vmovups(xmm4, mem(rcx))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx), xmm3, xmm6)
	vmovups(xmm6, mem(rcx))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx), xmm3, xmm8)
	vmovups(xmm8, mem(rcx))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx), xmm3, xmm10)
	vmovups(xmm10, mem(rcx))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx), xmm3, xmm12)
	vmovups(xmm12, mem(rcx))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx), xmm3, xmm14)
	vmovups(xmm14, mem(rcx))

	jmp(.SDONE) // jump to end.

	label(.SCOLSTORED)

	// Rows 0-3 (a..d in the lane comments) are transposed in registers so
	// that each xmm holds four consecutive rows of one column of c.
	vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1
	vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1
	vunpcklpd(xmm1, xmm0, xmm2)//a0b0c0d0
	vunpckhpd(xmm1, xmm0, xmm5)//a1b1c1d1

	vfmadd231ps(mem(rcx), xmm3, xmm2)
	vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm5)
	vmovups(xmm2, mem(rcx))
	vmovups(xmm5, mem(rcx, rsi, 1))

	lea(mem(rcx, rsi, 2), rcx) // rcx += 2*cs_c

	vunpckhps(xmm6, xmm4, xmm0)//a2b2a3b3
	vunpckhps(xmm10, xmm8, xmm1)//c2d2c3d3
	vunpcklpd(xmm1, xmm0, xmm7)//a2b2c2d2
	vunpckhpd(xmm1, xmm0, xmm9)//a3b3c3d3

	vfmadd231ps(mem(rcx), xmm3, xmm7)
	vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm9)
	vmovups(xmm7, mem(rcx))
	vmovups(xmm9, mem(rcx, rsi, 1))

	// Rows 4-5 (e, f) are interleaved into 2-float pairs, one pair per
	// column, and updated through rdx = c + 4*rs_c. xmm4, xmm6 and xmm8
	// are free to reuse as load scratch at this point.
	vunpcklps(xmm14, xmm12, xmm0)//e0f0e1f1
	vunpckhps(xmm14, xmm12, xmm1)//e2f2e3f3

	vmovsd(mem(rdx),xmm2)
	vmovsd(mem(rdx, rsi, 1),xmm4)
	vmovsd(mem(rdx, rsi, 2),xmm6)
	vmovsd(mem(rdx, rax, 1),xmm8)

	vshufpd(imm(0x01), xmm0, xmm0, xmm5)//e1f1
	vshufpd(imm(0x01), xmm1, xmm1, xmm7)//e3f3

	vfmadd231ps(xmm2, xmm3, xmm0)
	vfmadd231ps(xmm4, xmm3, xmm5)
	vfmadd231ps(xmm6, xmm3, xmm1)
	vfmadd231ps(xmm8, xmm3, xmm7)

	vmovsd(xmm0, mem(rdx)) //e0f0
	vmovsd(xmm5, mem(rdx, rsi, 1)) //e1f1
	vmovsd(xmm1, mem(rdx, rsi, 2)) //e2f2
	vmovsd(xmm7, mem(rdx, rax, 1)) //e3f3

	jmp(.SDONE) // jump to end.

	label(.SBETAZERO)

	cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) add(rdi, rcx) vmovups(xmm8, mem(rcx)) add(rdi, rcx) vmovups(xmm10, mem(rcx)) add(rdi, rcx) vmovups(xmm12, mem(rcx)) add(rdi, rcx) vmovups(xmm14, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 vunpcklpd(xmm1, xmm0, xmm2)//a0b0c0d0 vunpckhpd(xmm1, xmm0, xmm5)//a1b1c1d1 vmovups(xmm2, mem(rcx)) vmovups(xmm5, mem(rcx, rsi, 1)) lea(mem(rcx, rsi, 2), rcx) // rcx += 2*cs_c vunpckhps(xmm6, xmm4, xmm0)//a2b2a3b3 vunpckhps(xmm10, xmm8, xmm1)//c2d2c3d3 vunpcklpd(xmm1, xmm0, xmm7)//a2b2c2d2 vunpckhpd(xmm1, xmm0, xmm9)//a3b3c3d3 vmovups(xmm7, mem(rcx)) vmovups(xmm9, mem(rcx, rsi, 1)) vunpcklps(xmm14, xmm12, xmm0)//e0f0e1f1 vunpckhps(xmm14, xmm12, xmm1)//e2f2e3f3 vshufpd(imm(0x01), xmm0, xmm0, xmm5)//e1f1 vshufpd(imm(0x01), xmm1, xmm1, xmm7)//e3f3 vmovsd(xmm0, mem(rdx)) //e0f0 vmovsd(xmm5, mem(rdx, rsi, 1)) //e1f1 vmovsd(xmm1, mem(rdx, rsi, 2)) //e2f2 vmovsd(xmm7, mem(rdx, rax, 1)) //e3f3 label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_zen_asm_5x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t 
cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 3*8)) // prefetch c + 4*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 4*8)) // prefetch c + 3*cs_c label(.SPOSTPFETCH) // done prefetching c mov(r9, rsi) // rsi = rs_b; sal(imm(5), rsi) // rsi = 16*rs_b; lea(mem(rax, rsi, 1), rdx) // rdx = b + 16*rs_b; mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. 
label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm8) vmovups(xmm8, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm10) vmovups(xmm10, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm12) vmovups(xmm12, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 vunpcklpd(xmm1, xmm0, xmm2)//a0b0c0d0 vunpckhpd(xmm1, xmm0, xmm5)//a1b1c1d1 vfmadd231ps(mem(rcx), xmm3, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm5) vmovups(xmm2, mem(rcx)) vmovups(xmm5, mem(rcx, rsi, 1)) lea(mem(rcx, rsi, 2), rcx) // rcx += 2*cs_c vunpckhps(xmm6, xmm4, xmm0)//a2b2a3b3 vunpckhps(xmm10, xmm8, xmm1)//c2d2c3d3 vunpcklpd(xmm1, xmm0, xmm7)//a2b2c2d2 vunpckhpd(xmm1, xmm0, xmm9)//a3b3c3d3 vfmadd231ps(mem(rcx), xmm3, xmm7) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm9) vmovups(xmm7, mem(rcx)) vmovups(xmm9, mem(rcx, rsi, 1)) vmovss(mem(rdx),xmm2) vmovss(mem(rdx, rsi, 1),xmm4) vmovss(mem(rdx, rsi, 2),xmm6) vmovss(mem(rdx, rax, 1),xmm8) vshufps(imm(0x01), xmm12, xmm12,xmm1) vshufps(imm(0x02), xmm12, xmm12,xmm5) vshufps(imm(0x03), xmm12, xmm12,xmm7) vfmadd231ps(xmm2, xmm3, xmm12) vfmadd231ps(xmm4, xmm3, xmm1) vfmadd231ps(xmm6, xmm3, xmm5) vfmadd231ps(xmm8, xmm3, xmm7) vmovss(xmm12, mem(rdx)) //e0 vmovss(xmm1, mem(rdx, rsi, 1)) //e1 vmovss(xmm5, mem(rdx, rsi, 2)) //e2 vmovss(xmm7, mem(rdx, rax, 1)) //e3 jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) add(rdi, rcx) vmovups(xmm8, mem(rcx)) add(rdi, rcx) vmovups(xmm10, mem(rcx)) add(rdi, rcx) vmovups(xmm12, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 vunpcklpd(xmm1, xmm0, xmm2)//a0b0c0d0 vunpckhpd(xmm1, xmm0, xmm5)//a1b1c1d1 vmovups(xmm2, mem(rcx)) vmovups(xmm5, mem(rcx, rsi, 1)) lea(mem(rcx, rsi, 2), rcx) // rcx += 2*cs_c vunpckhps(xmm6, xmm4, xmm0)//a2b2a3b3 vunpckhps(xmm10, xmm8, xmm1)//c2d2c3d3 vunpcklpd(xmm1, xmm0, xmm7)//a2b2c2d2 vunpckhpd(xmm1, xmm0, xmm9)//a3b3c3d3 vmovups(xmm7, mem(rcx)) vmovups(xmm9, mem(rcx, rsi, 1)) vshufps(imm(0x01), xmm12, xmm12,xmm1) vshufps(imm(0x02), xmm12, xmm12,xmm5) vshufps(imm(0x03), xmm12, xmm12,xmm7) vmovss(xmm12, mem(rdx)) //e0 vmovss(xmm1, mem(rdx, rsi, 1)) //e1 vmovss(xmm5, mem(rdx, rsi, 2)) //e2 vmovss(xmm7, mem(rdx, rax, 1)) //e3 label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_zen_asm_4x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm8) vmovups(xmm8, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm10) vmovups(xmm10, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 vunpcklpd(xmm1, xmm0, xmm2)//a0b0c0d0 vunpckhpd(xmm1, xmm0, xmm5)//a1b1c1d1 vfmadd231ps(mem(rcx), xmm3, xmm2) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm5) vmovups(xmm2, mem(rcx)) vmovups(xmm5, mem(rcx, rsi, 1)) lea(mem(rcx, rsi, 2), rcx) // rcx += 2*cs_c vunpckhps(xmm6, xmm4, xmm0)//a2b2a3b3 vunpckhps(xmm10, xmm8, xmm1)//c2d2c3d3 vunpcklpd(xmm1, xmm0, xmm7)//a2b2c2d2 vunpckhpd(xmm1, xmm0, xmm9)//a3b3c3d3 vfmadd231ps(mem(rcx), xmm3, xmm7) vfmadd231ps(mem(rcx, rsi, 1), xmm3, xmm9) vmovups(xmm7, mem(rcx)) vmovups(xmm9, mem(rcx, rsi, 1)) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) add(rdi, rcx) vmovups(xmm8, mem(rcx)) add(rdi, rcx) vmovups(xmm10, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 vunpcklpd(xmm1, xmm0, xmm2)//a0b0c0d0 vunpckhpd(xmm1, xmm0, xmm5)//a1b1c1d1 vmovups(xmm2, mem(rcx)) vmovups(xmm5, mem(rcx, rsi, 1)) lea(mem(rcx, rsi, 2), rcx) // rcx += 2*cs_c vunpckhps(xmm6, xmm4, xmm0)//a2b2a3b3 vunpckhps(xmm10, xmm8, xmm1)//c2d2c3d3 vunpcklpd(xmm1, xmm0, xmm7)//a2b2c2d2 vunpckhpd(xmm1, xmm0, xmm9)//a3b3c3d3 vmovups(xmm7, mem(rcx)) vmovups(xmm9, mem(rcx, rsi, 1)) label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", 
"xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_zen_asm_3x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case (taken when rs_c == 1) label(.SROWPFETCH) // row-stored prefetching on c prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. /* Main loop of the 3x4 kernel, unrolled 4x (k_iter = k0 / 4 passes). Each step loads four contiguous floats of b into xmm0, broadcasts one element from each of three rows of a (rows are r8 = rs_a bytes apart) and accumulates the three rows of the product in xmm4, xmm6 and xmm8. */ label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) //
---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. /* Cleanup loop: the same single step as above, run k_left = k0 % 4 times. */ label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. /* NOTE(review): vucomiss also sets ZF when an operand is NaN, so a NaN beta takes the beta == 0 path -- confirm this is intended. */ je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case /* Row-stored path: each row of c is read and written as four contiguous floats; every accumulator becomes beta*c_row + alpha*ab_row before it is stored back. */ label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm8) vmovups(xmm8, mem(rcx)) jmp(.SDONE) // jump to end.
label(.SCOLSTORED) /* Column-stored path (rs_c == 1): rows 0 and 1 (tagged e and f in the comments below) are interleaved so that one 8-byte vmovsd covers both rows of a column; xmm4 and xmm6 are reused as load temporaries once they have been interleaved into xmm0/xmm1. Row 2 (xmm8, tagged e0..e3 on the vmovss stores) is updated one element at a time through rdx = c + 2*rs_c. */ vunpcklps(xmm6, xmm4, xmm0)//e0f0e1f1 vunpckhps(xmm6, xmm4, xmm1)//e2f2e3f3 vmovsd(mem(rcx),xmm2) vmovsd(mem(rcx, rsi, 1),xmm4) vmovsd(mem(rcx, rsi, 2),xmm6) vmovsd(mem(rcx, rax, 1),xmm10) vshufpd(imm(0x01), xmm0, xmm0, xmm5)//e1f1 vshufpd(imm(0x01), xmm1, xmm1, xmm7)//e3f3 vfmadd231ps(xmm2, xmm3, xmm0) vfmadd231ps(xmm4, xmm3, xmm5) vfmadd231ps(xmm6, xmm3, xmm1) vfmadd231ps(xmm10, xmm3, xmm7) vmovsd(xmm0, mem(rcx)) //e0f0 vmovsd(xmm5, mem(rcx, rsi, 1)) //e1f1 vmovsd(xmm1, mem(rcx, rsi, 2)) //e2f2 vmovsd(xmm7, mem(rcx, rax, 1)) //e3f3 vmovss(mem(rdx),xmm2) vmovss(mem(rdx, rsi, 1),xmm4) vmovss(mem(rdx, rsi, 2),xmm6) vmovss(mem(rdx, rax, 1),xmm10) vshufps(imm(0x01), xmm8, xmm8,xmm1) vshufps(imm(0x02), xmm8, xmm8,xmm5) vshufps(imm(0x03), xmm8, xmm8,xmm7) vfmadd231ps(xmm2, xmm3, xmm8) vfmadd231ps(xmm4, xmm3, xmm1) vfmadd231ps(xmm6, xmm3, xmm5) vfmadd231ps(xmm10, xmm3, xmm7) vmovss(xmm8, mem(rdx)) //e0 vmovss(xmm1, mem(rdx, rsi, 1)) //e1 vmovss(xmm5, mem(rdx, rsi, 2)) //e2 vmovss(xmm7, mem(rdx, rax, 1)) //e3 jmp(.SDONE) // jump to end. /* beta == 0: c is only written, never read. */ label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) add(rdi, rcx) vmovups(xmm8, mem(rcx)) jmp(.SDONE) // jump to end.
label(.SCOLSTORBZ) vunpcklps(xmm6, xmm4, xmm0)//e0f0e1f1 vunpckhps(xmm6, xmm4, xmm1)//e2f2e3f3 vshufpd(imm(0x01), xmm0, xmm0, xmm5)//e1f1 vshufpd(imm(0x01), xmm1, xmm1, xmm7)//e3f3 vmovsd(xmm0, mem(rcx)) //e0f0 vmovsd(xmm5, mem(rcx, rsi, 1)) //e1f1 vmovsd(xmm1, mem(rcx, rsi, 2)) //e2f2 vmovsd(xmm7, mem(rcx, rax, 1)) //e3f3 vshufps(imm(0x01), xmm8, xmm8,xmm1) vshufps(imm(0x02), xmm8, xmm8,xmm5) vshufps(imm(0x03), xmm8, xmm8,xmm7) vmovss(xmm8, mem(rdx)) //e0 vmovss(xmm1, mem(rdx, rsi, 1)) //e1 vmovss(xmm5, mem(rdx, rsi, 2)) //e2 vmovss(xmm7, mem(rdx, rax, 1)) //e3 label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_sgemmsup_rv_zen_asm_2x4: single-precision kernel for a 2x4 block of c, computing c := beta*c + alpha*(a*b) over k0 steps. Each step loads four contiguous floats of b (then advances b by rs_b) and broadcasts one element from each of two rows of a (rows rs_a apart, then advances a by cs_a), accumulating into xmm4 and xmm6. c is handled as column-stored when rs_c == 1 and through the row-stored path otherwise; when beta compares equal to zero, c is overwritten without being read. conja, conjb, m0, n0, data and cntx are not referenced by live code in this function; cs_b is passed to the asm block but never read by it. */ void bli_sgemmsup_rv_zen_asm_2x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions.
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() /* Zero ymm4..ymm15; only xmm4 and xmm6 are used as accumulators below. */ vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds of allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
jz(.SCOLPFETCH) // jump to column storage case (taken when rs_c == 1) label(.SROWPFETCH) // row-stored prefetching on c prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. /* Main loop, unrolled 4x: each step performs one rank-1 update of the two accumulator rows xmm4 and xmm6. */ label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) dec(rsi) // i -= 1;
jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. /* Cleanup loop: the same single step, run k_left = k0 % 4 times. */ label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. /* NOTE(review): vucomiss also sets ZF when an operand is NaN, so a NaN beta takes the beta == 0 path -- confirm this is intended. */ je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case /* Row-stored path: each row of c is read and written as four contiguous floats. */ label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) jmp(.SDONE) // jump to end.
label(.SCOLSTORED) /* Column-stored path (rs_c == 1): the two rows (tagged e and f) are interleaved so that each 8-byte vmovsd covers both rows of one column; xmm4 and xmm6 are reused as load temporaries once they have been interleaved into xmm0/xmm1. */ vunpcklps(xmm6, xmm4, xmm0)//e0f0e1f1 vunpckhps(xmm6, xmm4, xmm1)//e2f2e3f3 vmovsd(mem(rcx),xmm2) vmovsd(mem(rcx, rsi, 1),xmm4) vmovsd(mem(rcx, rsi, 2),xmm6) vmovsd(mem(rcx, rax, 1),xmm10) vshufpd(imm(0x01), xmm0, xmm0, xmm5)//e1f1 vshufpd(imm(0x01), xmm1, xmm1, xmm7)//e3f3 vfmadd231ps(xmm2, xmm3, xmm0) vfmadd231ps(xmm4, xmm3, xmm5) vfmadd231ps(xmm6, xmm3, xmm1) vfmadd231ps(xmm10, xmm3, xmm7) vmovsd(xmm0, mem(rcx)) //e0f0 vmovsd(xmm5, mem(rcx, rsi, 1)) //e1f1 vmovsd(xmm1, mem(rcx, rsi, 2)) //e2f2 vmovsd(xmm7, mem(rcx, rax, 1)) //e3f3 jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) vunpcklps(xmm6, xmm4, xmm0)//e0f0e1f1 vunpckhps(xmm6, xmm4, xmm1)//e2f2e3f3 vshufpd(imm(0x01), xmm0, xmm0, xmm5)//e1f1 vshufpd(imm(0x01), xmm1, xmm1, xmm7)//e3f3 vmovsd(xmm0, mem(rcx)) //e0f0 vmovsd(xmm5, mem(rcx, rsi, 1)) //e1f1 vmovsd(xmm1, mem(rcx, rsi, 2)) //e2f2 vmovsd(xmm7, mem(rcx, rax, 1)) //e3f3 label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_sgemmsup_rv_zen_asm_1x4: single-precision kernel for a 1x4 block of c, computing c := beta*c + alpha*(a*b) over k0 steps. Each step loads four contiguous floats of b (then advances b by rs_b) and broadcasts one element of a (then advances a by cs_a), accumulating into xmm4. c is handled as column-stored when rs_c == 1 and through the row-stored path otherwise; when beta compares equal to zero, c is overwritten without being read. conja, conjb, m0, n0, data and cntx are not referenced by live code in this function; cs_b is passed to the asm block but never read by it. */ void bli_sgemmsup_rv_zen_asm_1x4 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float*
restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() /* Zero ymm4..ymm15; only xmm4 is used as an accumulator below. */ vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds of allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
jz(.SCOLPFETCH) // jump to column storage case (taken when rs_c == 1) label(.SROWPFETCH) // row-stored prefetching on c prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 0*8)) // prefetch c + 3*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. /* Main loop, unrolled 4x: each step performs one rank-1 update of the single accumulator row xmm4. */ label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop.
label(.SLOOPKLEFT) // EDGE LOOP (same single step, run k_left = k0 % 4 times) vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta /* alpha and beta are broadcast into full ymm registers here, but only their xmm halves are read below. */ vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. /* NOTE(review): vucomiss also sets ZF when an operand is NaN, so a NaN beta takes the beta == 0 path -- confirm this is intended. */ je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) jmp(.SDONE) // jump to end. /* Column-stored path: the four results are cs_c apart in memory, so each element of xmm4 is shuffled into lane 0 and updated/stored with scalar vmovss. */ label(.SCOLSTORED) vmovss(mem(rcx),xmm2) vmovss(mem(rcx, rsi, 1),xmm6) vmovss(mem(rcx, rsi, 2),xmm8) vmovss(mem(rcx, rax, 1),xmm10) vshufps(imm(0x01), xmm4, xmm4,xmm1) vshufps(imm(0x02), xmm4, xmm4,xmm5) vshufps(imm(0x03), xmm4, xmm4,xmm7) vfmadd231ps(xmm2, xmm3, xmm4) vfmadd231ps(xmm6, xmm3, xmm1) vfmadd231ps(xmm8, xmm3, xmm5) vfmadd231ps(xmm10, xmm3, xmm7) vmovss(xmm4, mem(rcx)) //e0 vmovss(xmm1, mem(rcx, rsi, 1)) //e1 vmovss(xmm5, mem(rcx, rsi, 2)) //e2 vmovss(xmm7, mem(rcx, rax, 1)) //e3 jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) jmp(.SDONE) // jump to end.
label(.SCOLSTORBZ) vshufps(imm(0x01), xmm4, xmm4,xmm1) vshufps(imm(0x02), xmm4, xmm4,xmm5) vshufps(imm(0x03), xmm4, xmm4,xmm7) vmovss(xmm4, mem(rcx)) //e0 vmovss(xmm1, mem(rcx, rsi, 1)) //e1 vmovss(xmm5, mem(rcx, rsi, 2)) //e2 vmovss(xmm7, mem(rcx, rax, 1)) //e3 label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } /* bli_sgemmsup_rv_zen_asm_6x2: single-precision kernel for a 6x2 block of c, computing c := beta*c + alpha*(a*b) over k0 steps. Each step loads the two floats of the current row of b (then advances b by rs_b) and broadcasts one element from each of six rows of a (rows rs_a apart, then advances a by cs_a), accumulating into the low two lanes of xmm4, xmm6, xmm8, xmm10, xmm12 and xmm14. c is handled as column-stored when rs_c == 1 and through the row-stored path otherwise; when beta compares equal to zero, c is overwritten without being read. conja, conjb, m0, n0, data and cntx are not referenced by live code in this function; cs_b is passed to the asm block but never read by it. */ void bli_sgemmsup_rv_zen_asm_6x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions.
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds of allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
jz(.SCOLPFETCH) // jump to column storage case (taken when rs_c == 1) label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 1*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 1*8)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) prefetch(0, mem(rcx, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop.
label(.SLOOPKITER) /* Main loop, unrolled 4x. Only two columns of b are consumed and stored by this kernel, so each row of b is loaded with an 8-byte vmovsd (upper lanes zeroed) rather than a 16-byte vmovups, which could read past the end of b. */ // MAIN LOOP // ---------------------------------- iteration 0 vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) // ---------------------------------- iteration 1 vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) // ---------------------------------- iteration 2 vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) // ---------------------------------- iteration 3 vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10)
vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. /* Cleanup loop: the same single step, run k_left = k0 % 4 times; b is again loaded 8 bytes at a time. */ label(.SLOOPKLEFT) // EDGE LOOP vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. /* NOTE(review): vucomiss also sets ZF when an operand is NaN, so a NaN beta takes the beta == 0 path -- confirm this is intended. */ je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
jz(.SCOLSTORED) // jump to column storage case /* Row-stored path: each row of c holds two contiguous floats, read and written with 8-byte vmovsd. */ label(.SROWSTORED) vmovsd(mem(rcx), xmm0)////a0a1 vfmadd231ps(xmm0, xmm3, xmm4)//c*beta+(a0a1) vmovsd(xmm4, mem(rcx))//a0a1 add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm10) vmovsd(xmm10, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm12) vmovsd(xmm12, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm14) vmovsd(xmm14, mem(rcx)) jmp(.SDONE) // jump to end. /* Column-stored path (rs_c == 1, so rdi == 4 bytes): row pairs (a,b), (c,d), (e,f) are interleaved so that each 8-byte vmovsd covers two adjacent rows of one column; xmm4, xmm6 and xmm8 are reused as load temporaries after interleaving. */ label(.SCOLSTORED) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 vunpcklps(xmm14, xmm12, xmm2)//e0f0e1f1 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rdi, 2),xmm6) vmovsd(mem(rcx, rdi, 4),xmm8) vshufpd(imm(0x01), xmm0, xmm0, xmm5)//a1b1 vshufpd(imm(0x01), xmm1, xmm1, xmm7)//c1d1 vshufpd(imm(0x01), xmm2, xmm2, xmm9)//e1f1 vfmadd231ps(xmm4, xmm3, xmm0) vfmadd231ps(xmm6, xmm3, xmm1) vfmadd231ps(xmm8, xmm3, xmm2) vmovsd(xmm0, mem(rcx)) //a0b0 vmovsd(xmm1, mem(rcx, rdi, 2)) //c0d0 vmovsd(xmm2, mem(rcx, rdi, 4)) //e0f0 lea(mem(rcx, rsi, 1), rcx) vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rdi, 2),xmm6) vmovsd(mem(rcx, rdi, 4),xmm8) vfmadd231ps(xmm4, xmm3, xmm5) vfmadd231ps(xmm6, xmm3, xmm7) vfmadd231ps(xmm8, xmm3, xmm9) vmovsd(xmm5, mem(rcx)) //a1b1 vmovsd(xmm7, mem(rcx, rdi, 2)) //c1d1 vmovsd(xmm9, mem(rcx, rdi, 4)) //e1f1 jmp(.SDONE) // jump to end. /* beta == 0: c is only written, never read. */ label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) vmovsd(xmm10, mem(rcx)) add(rdi, rcx) vmovsd(xmm12, mem(rcx)) add(rdi, rcx) vmovsd(xmm14, mem(rcx)) jmp(.SDONE) // jump to end.
label(.SCOLSTORBZ) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 vunpcklps(xmm14, xmm12, xmm2)//e0f0e1f1 vshufpd(imm(0x01), xmm0, xmm0, xmm5)//a1b1 vshufpd(imm(0x01), xmm1, xmm1, xmm7)//c1d1 vshufpd(imm(0x01), xmm2, xmm2, xmm9)//e1f1 vmovsd(xmm0, mem(rcx)) //a0b0 vmovsd(xmm1, mem(rcx, rdi, 2)) //c0d0 vmovsd(xmm2, mem(rcx, rdi, 4)) //e0f0 lea(mem(rcx, rsi, 1), rcx) vmovsd(xmm5, mem(rcx)) //a1b1 vmovsd(xmm7, mem(rcx, rdi, 2)) //c1d1 vmovsd(xmm9, mem(rcx, rdi, 4)) //e1f1 label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_zen_asm_5x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions.
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 1*8)) // prefetch c + 4*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) prefetch(0, mem(rcx, 4*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), xmm0) 
add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vmovsd(mem(rcx), xmm0)////a0a1 vfmadd231ps(xmm0, xmm3, xmm4)//c*beta+(a0a1) vmovsd(xmm4, mem(rcx))//a0a1 add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm10) vmovsd(xmm10, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm12) vmovsd(xmm12, mem(rcx)) add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rdi, 2),xmm6) vshufpd(imm(0x01), xmm0, xmm0, xmm5)//a1b1 vshufpd(imm(0x01), xmm1, xmm1, xmm7)//c1d1 vfmadd231ps(xmm4, xmm3, xmm0) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm0, mem(rcx)) //a0b0 vmovsd(xmm1, mem(rcx, rdi, 2)) //c0d0 vmovss(mem(rcx, rdi, 4),xmm4) vshufps(imm(0x01), xmm12, xmm12, xmm9)//e1 vfmadd231ps(xmm4, xmm3, xmm12) vmovss(xmm12,mem(rcx,rdi,4))//e0 lea(mem(rcx, rsi, 1), rcx) vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rdi, 2),xmm6) vfmadd231ps(xmm4, xmm3, xmm5) vfmadd231ps(xmm6, xmm3, xmm7) vmovsd(xmm5, mem(rcx)) //a1b1 vmovsd(xmm7, mem(rcx, rdi, 2)) //c1d1 vmovss( mem(rcx, rdi, 4),xmm4) vfmadd231ps(xmm4, xmm3, xmm9) vmovss(xmm9,mem(rcx,rdi,4))//e1 jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx))//a0a1 add(rdi, rcx) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) vmovsd(xmm10, mem(rcx)) add(rdi, rcx) vmovsd(xmm12, mem(rcx)) add(rdi, rcx) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 vshufpd(imm(0x01), xmm0, xmm0, xmm5)//a1b1 vshufpd(imm(0x01), xmm1, xmm1, xmm7)//c1d1 vmovsd(xmm0, mem(rcx)) //a0b0 vmovsd(xmm1, mem(rcx, rdi, 2)) //c0d0 vshufps(imm(0x01), xmm12, xmm12, xmm9)//e1 vmovss(xmm12,mem(rcx,rdi,4))//e0 lea(mem(rcx, rsi, 1), rcx) vmovsd(xmm5, mem(rcx)) //a1b1 vmovsd(xmm7, mem(rcx, rdi, 2)) //c1d1 vmovss(xmm9,mem(rcx,rdi,4))//e1 label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_zen_asm_4x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(rcx, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(rcx, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(xmm0, 
xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(xmm0,xmm0,xmm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vmovsd(mem(rcx), xmm0)////a0a1 vfmadd231ps(xmm0, xmm3, xmm4)//c*beta+(a0a1) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm10) vmovsd(xmm10, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORED) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rdi, 2),xmm6) vshufpd(imm(0x01), xmm0, xmm0, xmm5)//a1b1 vshufpd(imm(0x01), xmm1, xmm1, xmm7)//c1d1 vfmadd231ps(xmm4, xmm3, xmm0) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm0, mem(rcx)) //a0b0 vmovsd(xmm1, mem(rcx, rdi, 2)) //c0d0 lea(mem(rcx, rsi, 1), rcx) vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rdi, 2),xmm6) vfmadd231ps(xmm4, xmm3, xmm5) vfmadd231ps(xmm6, xmm3, xmm7) vmovsd(xmm5, mem(rcx)) //a1b1 vmovsd(xmm7, mem(rcx, rdi, 2)) //c1d1 jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) vmovsd(xmm10, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 vshufpd(imm(0x01), xmm0, xmm0, xmm5)//a1b1 vshufpd(imm(0x01), xmm1, xmm1, xmm7)//c1d1 vmovsd(xmm0, mem(rcx)) //a0b0 vmovsd(xmm1, mem(rcx, rdi, 2)) //c0d0 lea(mem(rcx, rsi, 1), rcx) vmovsd(xmm5, mem(rcx)) //a1b1 vmovsd(xmm7, mem(rcx, rdi, 2)) //c1d1 label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_zen_asm_3x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) prefetch(0, mem(rcx, 2*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(xmm0, xmm2, xmm4) 
vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm8) vmovsd(xmm8, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vmovsd(mem(rcx),xmm4) vshufpd(imm(0x01), xmm0, xmm0, xmm5)//a1b1 vfmadd231ps(xmm4, xmm3, xmm0) vmovsd(xmm0, mem(rcx)) //a0b0 vmovss(mem(rcx,rdi,2),xmm4) vshufps(imm(0x01), xmm8, xmm8, xmm9)//c1 vfmadd231ps(xmm4, xmm3, xmm8) vmovss(xmm8,mem(rcx,rdi,2))//c0 lea(mem(rcx, rsi, 1), rcx) vmovsd(mem(rcx),xmm4) vfmadd231ps(xmm4, xmm3, xmm5) vmovsd(xmm5, mem(rcx)) //a1b1 vmovss(mem(rcx,rdi,2),xmm4) vfmadd231ps(xmm4, xmm3, xmm9) vmovss(xmm9,mem(rcx,rdi,2))//c1 jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) vmovsd(xmm8, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vshufpd(imm(0x01), xmm0, xmm0, xmm5)//a1b1 vmovsd(xmm0, mem(rcx)) //a0b0 vshufps(imm(0x01), xmm8, xmm8, xmm9)//c1 vmovss(xmm8,mem(rcx,rdi,2))//c0 lea(mem(rcx, rsi, 1), rcx) vmovsd(xmm5, mem(rcx)) //a1b1 vmovss(xmm9,mem(rcx,rdi,2))//c1 label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_zen_asm_2x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* 
restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. 
je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovsd(xmm6, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORED) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vmovsd(mem(rcx),xmm4) vshufpd(imm(0x01), xmm0, xmm0, xmm5)//a1b1 vfmadd231ps(xmm4, xmm3, xmm0) vmovsd(xmm0, mem(rcx)) //a0b0 lea(mem(rcx, rsi, 1), rcx) vmovsd(mem(rcx),xmm4) vfmadd231ps(xmm4, xmm3, xmm5) vmovsd(xmm5, mem(rcx)) //a1b1 jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) vmovsd(xmm6, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vshufpd(imm(0x01), xmm0, xmm0, xmm5)//a1b1 vmovsd(xmm0, mem(rcx)) //a0b0 vmovsd(xmm5, mem(rcx, rsi, 1)) //a1b1 label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } void bli_sgemmsup_rv_zen_asm_1x2 ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // ------------------------------------------------------------------------- begin_asm() vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) mov(var(b), rbx) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) prefetch(0, mem(rcx, 0*8)) // prefetch c + 0*cs_c prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) // ---------------------------------- iteration 1 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) // ---------------------------------- iteration 2 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) // ---------------------------------- iteration 3 vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovsd(xmm4, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORED) vshufps(imm(0x01), xmm4, xmm4, xmm9)//c1 vmovss(mem(rcx),xmm6) vfmadd231ps(xmm6, xmm3, xmm4) vmovss(xmm4,mem(rcx))//c0 vmovss(mem(rcx,rsi,1),xmm6) vfmadd231ps(xmm6, xmm3, xmm9) vmovss(xmm9,mem(rcx,rsi,1))//c1 jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovsd(xmm4, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) vshufps(imm(0x01), xmm4, xmm4, xmm9)//c1 vmovss(xmm4,mem(rcx))//c0 vmovss(xmm9,mem(rcx,rsi,1))//c1 label(.SDONE) end_asm( : // output operands (none) : // input operands [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) } // ----------------------------------------------------------------------------- // NOTE: Normally, for any "?x1" kernel, we would call the reference kernel. // However, at least one other subconfiguration (zen) uses this kernel set, so // we need to be able to call a set of "?x1" kernels that we know will actually // exist regardless of which subconfiguration these kernels were used by. Thus, // the compromise employed here is to inline the reference kernel so it gets // compiled as part of the zen kernel set, and hence can unconditionally be // called by other kernels within that kernel set. 
#undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mdim ) \ \ void PASTEMAC(ch,opname) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ for ( dim_t i = 0; i < mdim; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ /* for ( dim_t j = 0; j < 1; ++j ) */ \ { \ ctype* restrict cij = ci /*[ j*cs_c ]*/ ; \ ctype* restrict bj = b /*[ j*cs_b ]*/ ; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(d,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } GENTFUNC( float, s, gemmsup_r_zen_ref_6x1, 6 ) GENTFUNC( float, s, gemmsup_r_zen_ref_5x1, 5 ) GENTFUNC( float, s, gemmsup_r_zen_ref_4x1, 4 ) GENTFUNC( float, s, gemmsup_r_zen_ref_3x1, 3 ) GENTFUNC( float, s, gemmsup_r_zen_ref_2x1, 2 ) GENTFUNC( float, s, gemmsup_r_zen_ref_1x1, 1 ) cython-blis-0.9.1/blis/_src/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16m.c000066400000000000000000002303111427272030600302050ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. 
   Therefore, this (r)ow-preferential kernel is well-suited for contiguous
   (v)ector loads on B and single-element broadcasts from A.

   NOTE: These kernels explicitly support column-oriented IO, implemented
   via an in-register transpose. And thus they also support the crr and
   ccr cases, though only crr is ever utilized (because ccr is handled by
   transposing the operation and executing rcr, which does not incur the
   cost of the in-register transpose).

   crr:
	 | | | | | | | |       ------        --------
	 | | | | | | | |       ------        --------
	 | | | | | | | |  +=   ------ ...    --------
	 | | | | | | | |       ------        --------
	 | | | | | | | |       ------            :
	 | | | | | | | |       ------            :
*/

/*
   bli_sgemmsup_rv_zen_asm_6x16m

   Single-precision gemmsup kernel that walks the m dimension in strips of
   six rows and updates C := beta * C + alpha * A * B.

   Control flow, as written below:

   1. n edge case. If n0 % 16 is non-zero, the inline assembly is skipped
      entirely. The leftover columns are peeled off 8, 4, 2 and finally 1
      at a time and handed to bli_sgemmsup_rv_zen_asm_6x8m / _6x4m / _6x2m
      and bli_sgemv_ex(), advancing the B and C pointers by cs_b0 / cs_c0
      after each call, and the function returns.
      NOTE(review): only the n0 % 16 remainder is processed on this path,
      so callers are presumably expected to pass n0 <= 16 -- confirm
      against the call sites.

   2. Main loop. Otherwise, m0 / 6 full 6x16 tiles are computed in
      assembly. Each tile keeps twelve ymm accumulators (ymm4..ymm15: six
      rows x two 8-float halves), runs the k loop unrolled by four plus a
      k0 % 4 remainder loop, scales by alpha and then stores with or
      without the beta * C term. rs_c == 1 selects a column-stored output
      path that transposes the tile in registers. After each tile, C
      advances by 6*rs_c and A advances by the panel stride obtained from
      bli_auxinfo_ps_a( data ).

   3. m edge case. The m0 % 6 leftover rows are forwarded to the matching
      bli_sgemmsup_rv_zen_asm_{1..5}x16 kernel.

   conja and conjb are only forwarded to the helper kernels; the assembly
   itself does not look at them.
*/
void bli_sgemmsup_rv_zen_asm_6x16m
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	uint64_t n_left = n0 % 16;

	// First check whether this is an edge case in the n dimension. If so,
	// dispatch other 6x?m kernels, as needed.
	if (n_left )
	{
		float* cij = c;
		float* bj  = b;
		float* ai  = a;

		if ( 8 <= n_left )
		{
			const dim_t nr_cur = 8;

			bli_sgemmsup_rv_zen_asm_6x8m
			(
			  conja, conjb, m0, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 4 <= n_left )
		{
			const dim_t nr_cur = 4;

			bli_sgemmsup_rv_zen_asm_6x4m
			(
			  conja, conjb, m0, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 2 <= n_left )
		{
			const dim_t nr_cur = 2;

			bli_sgemmsup_rv_zen_asm_6x2m
			(
			  conja, conjb, m0, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}
		if ( 1 == n_left )
		{
			// A single remaining column is a matrix-vector product.
			dim_t ps_a0 = bli_auxinfo_ps_a( data );

			// When the panel stride equals six row strides, consecutive
			// panels of A are laid out like one m0 x k0 matrix, so a single
			// gemv over all m0 rows suffices.
			if ( ps_a0 == 6 * rs_a0 )
			{
				bli_sgemv_ex
				(
				  BLIS_NO_TRANSPOSE, conjb, m0, k0,
				  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
				  beta, cij, rs_c0, cntx, NULL
				);
			}
			else
			{
				const dim_t mr = 6;

				// Since A is packed into row panels, we must use a loop over
				// gemv.
				dim_t m_iter = ( m0 + mr - 1 ) / mr;
				dim_t m_left =   m0            % mr;

				float* restrict ai_ii  = ai;
				float* restrict cij_ii = cij;

				for ( dim_t ii = 0; ii < m_iter; ii += 1 )
				{
					// Every panel has mr rows except possibly the last one.
					dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left )
					                 ? mr : m_left );

					bli_sgemv_ex
					(
					  BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
					  alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0,
					  beta, cij_ii, rs_c0, cntx, NULL
					);
					cij_ii += mr*rs_c0; ai_ii += ps_a0;
				}
			}
		}
		return;
	}

	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t m_iter = m0 / 6;
	uint64_t m_left = m0 % 6;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// Query the panel stride of A and convert it to units of bytes.
	uint64_t ps_a   = bli_auxinfo_ps_a( data );
	uint64_t ps_a4  = ps_a * sizeof( float );

	// Fewer than six rows: nothing for the 6-row assembly loop to do.
	if ( m_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	mov(var(a), r14)                   // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(dt)
	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(dt)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
	lea(mem(r8, r8, 4), r15)           // r15 = 5*rs_a

	mov(var(rs_b), r10)                // load rs_b
	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(dt)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), r12)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(dt)

	// During preamble and loops:
	// r12 = rcx = c
	// r14 = rax = a
	// read rbx from var(b) near beginning of loop
	// r11 = m dim index ii

	mov(var(m_iter), r11)              // ii = m_iter;

	label(.SLOOP6X16I)                 // LOOP OVER ii = [ m_iter ... 1 0 ]

	// Clear the twelve accumulators: row r of the tile lives in
	// ymm(4+2r) (columns 0..7) and ymm(5+2r) (columns 8..15).
	vxorps(ymm4,  ymm4,  ymm4)
	vxorps(ymm5,  ymm5,  ymm5)
	vxorps(ymm6,  ymm6,  ymm6)
	vxorps(ymm7,  ymm7,  ymm7)
	vxorps(ymm8,  ymm8,  ymm8)
	vxorps(ymm9,  ymm9,  ymm9)
	vxorps(ymm10, ymm10, ymm10)
	vxorps(ymm11, ymm11, ymm11)
	vxorps(ymm12, ymm12, ymm12)
	vxorps(ymm13, ymm13, ymm13)
	vxorps(ymm14, ymm14, ymm14)
	vxorps(ymm15, ymm15, ymm15)

	mov(var(b), rbx)                   // load address of b.
	//mov(r12, rcx)                    // reset rcx to current utile of c.
	mov(r14, rax)                      // reset rax to current upanel of a.

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLPFETCH)                    // jump to column storage case
	label(.SROWPFETCH)                 // row-stored pre-fetching on c // not used

	lea(mem(r12, rdi, 2), rdx)         //
	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(r12, 7*8))         // prefetch c + 0*rs_c
	prefetch(0, mem(r12, rdi, 1, 7*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(r12, rdi, 2, 7*8)) // prefetch c + 2*rs_c
	prefetch(0, mem(rdx, 7*8))         // prefetch c + 3*rs_c
	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c
	prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c

	jmp(.SPOSTPFETCH)                  // jump to end of pre-fetching c
	label(.SCOLPFETCH)                 // column-stored pre-fetching c

	// Only the first eight columns of the tile are prefetched here.
	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 4), rsi)            // cs_c *= sizeof(dt)
	lea(mem(r12, rsi, 2), rdx)         //
	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
	prefetch(0, mem(r12, 5*8))         // prefetch c + 0*cs_c
	prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rdx, 5*8))         // prefetch c + 3*cs_c
	prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c
	prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c
	lea(mem(rdx, rsi, 2), rdx)         // rdx = c + 5*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 6*cs_c
	prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 7*cs_c

	label(.SPOSTPFETCH)                // done prefetching c

	// rcx is borrowed as 3*cs_a inside the k loop; it is reloaded with
	// the C tile address at .SPOSTACCUM.
	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
	lea(mem(rax, r8, 4), rdx)          // use rdx for pre-fetching lines
	lea(mem(rdx, r8, 2), rdx)          // from next upanel of a.

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.

	label(.SLOOPKITER)                 // MAIN LOOP

	// Each iteration below is one rank-1 update: load a 16-float row of B
	// into ymm0/ymm1, broadcast the six A elements of the current column
	// two at a time into ymm2/ymm3, and accumulate.

	// ---------------------------------- iteration 0

	prefetch(0, mem(rdx, 5*8))

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8,  4), ymm2)
	vbroadcastss(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	// ---------------------------------- iteration 1

	prefetch(0, mem(rdx, r9, 1, 5*8))

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8,  4), ymm2)
	vbroadcastss(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	// ---------------------------------- iteration 2

	prefetch(0, mem(rdx, r9, 2, 5*8))

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8,  4), ymm2)
	vbroadcastss(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	// ---------------------------------- iteration 3

	prefetch(0, mem(rdx, rcx, 1, 5*8))
	lea(mem(rdx, r9, 4), rdx)          // a_prefetch += 4*cs_a;

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8,  4), ymm2)
	vbroadcastss(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKITER)                   // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT)                 // EDGE LOOP

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8,  4), ymm2)
	vbroadcastss(mem(rax, r15, 1), ymm3)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)
	vfmadd231ps(ymm0, ymm3, ymm14)
	vfmadd231ps(ymm1, ymm3, ymm15)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKLEFT)                   // iterate again if i != 0.

	label(.SPOSTACCUM)

	mov(r12, rcx)                      // reset rcx to current utile of c.
	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate

	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
	vmulps(ymm0, ymm5, ymm5)
	vmulps(ymm0, ymm6, ymm6)
	vmulps(ymm0, ymm7, ymm7)
	vmulps(ymm0, ymm8, ymm8)
	vmulps(ymm0, ymm9, ymm9)
	vmulps(ymm0, ymm10, ymm10)
	vmulps(ymm0, ymm11, ymm11)
	vmulps(ymm0, ymm12, ymm12)
	vmulps(ymm0, ymm13, ymm13)
	vmulps(ymm0, ymm14, ymm14)
	vmulps(ymm0, ymm15, ymm15)

	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(dt)

	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;

	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

	                                   // now avoid loading C if beta == 0

	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORED)                    // jump to column storage case

	label(.SROWSTORED)

	// Row r: columns 0..7 at rcx, columns 8..15 at rcx + 8*cs_c
	// (rsi holds 4*cs_c bytes, so the scale of 8 gives 32*cs_c bytes).
	vfmadd231ps(mem(rcx), ymm3, ymm4)
	vmovups(ymm4, mem(rcx))

	vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm5)
	vmovups(ymm5, mem(rcx, rsi, 8))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx), ymm3, ymm6)
	vmovups(ymm6, mem(rcx))

	vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm7)
	vmovups(ymm7, mem(rcx, rsi, 8))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx), ymm3, ymm8)
	vmovups(ymm8, mem(rcx))

	vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm9)
	vmovups(ymm9, mem(rcx, rsi, 8))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx), ymm3, ymm10)
	vmovups(ymm10, mem(rcx))

	vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm11)
	vmovups(ymm11, mem(rcx, rsi, 8))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx), ymm3, ymm12)
	vmovups(ymm12, mem(rcx))

	vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm13)
	vmovups(ymm13, mem(rcx, rsi, 8))
	add(rdi, rcx)

	vfmadd231ps(mem(rcx), ymm3, ymm14)
	vmovups(ymm14, mem(rcx))

	vfmadd231ps(mem(rcx, rsi, 8), ymm3, ymm15)
	vmovups(ymm15, mem(rcx, rsi, 8))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORED)

	/*|-----------------|   |-----|----|
	  |        |        |   | 8x4 | 8x2|
	  |  4x8   |   4x8  |   |     |    |
	  |        |        |   |-----|----|
	  |-----------------|   | 8x4 | 8x2|
	  |  2x8   |   2x8  |   |     |    |
	  |------------------   |----------|*/

	/****6x16 tile is transposed and saved in col major as 6x16*****/
	// rcx walks the columns for rows 0..3; rdx (= c + 4*rs_c) walks the
	// columns for rows 4..5.
	/****top left tile 4x8 transposed to top left tile 8x4**********/
	vunpcklps(ymm6, ymm4, ymm0)//a0b0a1b1 a4b4a5b5
	vunpcklps(ymm10, ymm8, ymm1)//c0d0c1d1 c4d4c5d5
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)//a1b1c0d0 a5b5c4d4
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)//a0b0c0d0 a4b4c4d4
	vblendps(imm(0x33), ymm2, ymm1, ymm1)//a1b1c1d1 a5b5c5d5

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm0, mem(rcx))            // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm1, mem(rcx))            // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma05..gamma35 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vunpckhps(ymm6, ymm4, ymm0)
	vunpckhps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm0, mem(rcx))            // store ( gamma02..gamma32 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma06..gamma36 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm1, mem(rcx))            // store ( gamma03..gamma33 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma07..gamma37 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c
	lea(mem(rcx, rsi, 4), rcx)         // rcx += 4*cs_c (now at column 8)

	/***bottom left tile - 2x8 is transposed to top right tile 8x2**********/
	vunpcklps(ymm14, ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(mem(rdx), xmm1, xmm1)
	vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rdx))            // store ( gamma40..gamma50 )
	vmovhpd(xmm0, mem(rdx, rsi, 1))    // store ( gamma41..gamma51 )
	lea(mem(rdx, rsi, 4), rax)         // rax = rdx + 4*cs_c
	vmovlpd(mem(rax), xmm1, xmm1)
	vmovhpd(mem(rax, rsi, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vmovlpd(xmm2, mem(rax))            // store ( gamma44..gamma54 )
	vmovhpd(xmm2, mem(rax, rsi, 1))    // store ( gamma45..gamma55 )
	lea(mem(rdx, rsi, 2), rdx)         // rdx += 2*cs_c

	vunpckhps(ymm14, ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(mem(rdx), xmm1, xmm1)
	vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rdx))            // store ( gamma42..gamma52 )
	vmovhpd(xmm0, mem(rdx, rsi, 1))    // store ( gamma43..gamma53 )
	lea(mem(rdx, rsi, 4), rdx)         // rdx += 4*cs_c
	vmovlpd(mem(rdx), xmm1, xmm1)
	vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vmovlpd(xmm2, mem(rdx))            // store ( gamma46..gamma56 )
	vmovhpd(xmm2, mem(rdx, rsi, 1))    // store ( gamma47..gamma57 )
	lea(mem(rdx, rsi, 2), rdx)         // rdx += 2*cs_c (now at column 8)

	/***top right tile 4x8 is transposed to bottom left tile 8x4**********/
	// From here on rcx/rdx point at column 8, so the "gamma0x" labels in
	// the store comments below actually refer to columns 8..15.
	vunpcklps(ymm7, ymm5, ymm0)
	vunpcklps(ymm11, ymm9, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm0, mem(rcx))            // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm1, mem(rcx))            // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma05..gamma35 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vunpckhps(ymm7, ymm5, ymm0)
	vunpckhps(ymm11, ymm9, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm0, mem(rcx))            // store ( gamma02..gamma32 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma06..gamma36 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm1, mem(rcx))            // store ( gamma03..gamma33 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma07..gamma37 )
	//lea(mem(rcx, rsi, 8), rcx)       // rcx += 8*cs_c

	/*** bottom right 2x8 is transposed to bottom right tile 8x2*******/
	vunpcklps(ymm15, ymm13, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(mem(rdx), xmm1, xmm1)
	vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rdx))            // store ( gamma40..gamma50 )
	vmovhpd(xmm0, mem(rdx, rsi, 1))    // store ( gamma41..gamma51 )
	lea(mem(rdx, rsi, 4), rax)         // rax = rdx + 4*cs_c
	vmovlpd(mem(rax), xmm1, xmm1)
	vmovhpd(mem(rax, rsi, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vmovlpd(xmm2, mem(rax))            // store ( gamma44..gamma54 )
	vmovhpd(xmm2, mem(rax, rsi, 1))    // store ( gamma45..gamma55 )
	lea(mem(rdx, rsi, 2), rdx)         // rdx += 2*cs_c

	vunpckhps(ymm15, ymm13, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(mem(rdx), xmm1, xmm1)
	vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm0)
	vmovlpd(xmm0, mem(rdx))            // store ( gamma42..gamma52 )
	vmovhpd(xmm0, mem(rdx, rsi, 1))    // store ( gamma43..gamma53 )
	lea(mem(rdx, rsi, 4), rdx)         // rdx += 4*cs_c
	vmovlpd(mem(rdx), xmm1, xmm1)
	vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1)
	vfmadd231ps(xmm1, xmm3, xmm2)
	vmovlpd(xmm2, mem(rdx))            // store ( gamma46..gamma56 )
	vmovhpd(xmm2, mem(rdx, rsi, 1))    // store ( gamma47..gamma57 )

	jmp(.SDONE)                        // jump to end.

	label(.SBETAZERO)

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4.
	jz(.SCOLSTORBZ)                    // jump to column storage case

	label(.SROWSTORBZ)

	// beta == 0: C is overwritten without being read.
	vmovups(ymm4, mem(rcx))
	vmovups(ymm5, mem(rcx, rsi, 8))
	add(rdi, rcx)

	vmovups(ymm6, mem(rcx))
	vmovups(ymm7, mem(rcx, rsi, 8))
	add(rdi, rcx)

	vmovups(ymm8, mem(rcx))
	vmovups(ymm9, mem(rcx, rsi, 8))
	add(rdi, rcx)

	vmovups(ymm10, mem(rcx))
	vmovups(ymm11, mem(rcx, rsi, 8))
	add(rdi, rcx)

	vmovups(ymm12, mem(rcx))
	vmovups(ymm13, mem(rcx, rsi, 8))
	add(rdi, rcx)

	vmovups(ymm14, mem(rcx))
	vmovups(ymm15, mem(rcx, rsi, 8))
	//add(rdi, rcx)

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORBZ)

	/****6x16 tile going to save into 16x6 tile in C*****/
	/******************top left tile 8x4***************************/
	vunpcklps(ymm6, ymm4, ymm0)
	vunpcklps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx))            // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += 1*cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx))            // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma05..gamma35 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += 1*cs_c

	vunpckhps(ymm6, ymm4, ymm0)
	vunpckhps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx))            // store ( gamma02..gamma32 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma06..gamma36 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += 1*cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx))            // store ( gamma03..gamma33 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma07..gamma37 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += 1*cs_c
	lea(mem(rcx, rsi, 4), rcx)         // rcx += 4*cs_c (now at column 8)

	/******************top right tile 8x2***************************/
	vunpcklps(ymm14, ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(xmm0, mem(rdx))            // store ( gamma40..gamma50 )
	vmovlpd(xmm2, mem(rdx, rsi, 4))    // store ( gamma44..gamma54 )
	lea(mem(rdx, rsi, 1), rdx)
	vmovhpd(xmm0, mem(rdx))            // store ( gamma41..gamma51 )
	vmovhpd(xmm2, mem(rdx, rsi, 4))    // store ( gamma45..gamma55 )
	lea(mem(rdx, rsi, 1), rdx)

	vunpckhps(ymm14, ymm12, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(xmm0, mem(rdx))            // store ( gamma42..gamma52 )
	vmovlpd(xmm2, mem(rdx, rsi, 4))    // store ( gamma46..gamma56 )
	lea(mem(rdx, rsi, 1), rdx)
	vmovhpd(xmm0, mem(rdx))            // store ( gamma43..gamma53 )
	vmovhpd(xmm2, mem(rdx, rsi, 4))    // store ( gamma47..gamma57 )
	lea(mem(rdx, rsi, 1), rdx)
	lea(mem(rdx, rsi, 4), rdx)         // rdx += 4*cs_c (8*cs_c in total; now at column 8)

	/******************bottom left tile 8x4***************************/
	// As in the beta != 0 path, the labels below are reused for
	// columns 8..15.
	vunpcklps(ymm7, ymm5, ymm0)
	vunpcklps(ymm11, ymm9, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx))            // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += 1*cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx))            // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma05..gamma35 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += 1*cs_c

	vunpckhps(ymm7, ymm5, ymm0)
	vunpckhps(ymm11, ymm9, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx))            // store ( gamma02..gamma32 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma06..gamma36 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += 1*cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx))            // store ( gamma03..gamma33 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma07..gamma37 )

	/******************bottom right tile 8x2***************************/
	vunpcklps(ymm15, ymm13, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(xmm0, mem(rdx))            // store ( gamma40..gamma50 )
	vmovlpd(xmm2, mem(rdx, rsi, 4))    // store ( gamma44..gamma54 )
	lea(mem(rdx, rsi, 1), rdx)
	vmovhpd(xmm0, mem(rdx))            // store ( gamma41..gamma51 )
	vmovhpd(xmm2, mem(rdx, rsi, 4))    // store ( gamma45..gamma55 )
	lea(mem(rdx, rsi, 1), rdx)

	vunpckhps(ymm15, ymm13, ymm0)
	vextractf128(imm(0x1), ymm0, xmm2)
	vmovlpd(xmm0, mem(rdx))            // store ( gamma42..gamma52 )
	vmovlpd(xmm2, mem(rdx, rsi, 4))    // store ( gamma46..gamma56 )
	lea(mem(rdx, rsi, 1), rdx)
	vmovhpd(xmm0, mem(rdx))            // store ( gamma43..gamma53 )
	vmovhpd(xmm2, mem(rdx, rsi, 4))    // store ( gamma47..gamma57 )

	label(.SDONE)

	lea(mem(r12, rdi, 4), r12)         //
	lea(mem(r12, rdi, 2), r12)         // c_ii = r12 += 6*rs_c

	//lea(mem(r14, r8,  4), r14)       //
	//lea(mem(r14, r8,  2), r14)       // a_ii = r14 += 6*rs_a
	mov(var(ps_a4), rax)               // load ps_a4
	lea(mem(r14, rax, 1), r14)         // a_ii = r14 += ps_a4

	dec(r11)                           // ii -= 1;
	jne(.SLOOP6X16I)                   // iterate again if ii != 0.

	label(.SRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [m_iter] "m" (m_iter),
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a]      "m" (a),
	  [rs_a]   "m" (rs_a),
	  [cs_a]   "m" (cs_a),
	  [ps_a4]  "m" (ps_a4),
	  [b]      "m" (b),
	  [rs_b]   "m" (rs_b),
	  [cs_b]   "m" (cs_b),
	  [alpha]  "m" (alpha),
	  [beta]   "m" (beta),
	  [c]      "m" (c),
	  [rs_c]   "m" (rs_c),
	  [cs_c]   "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	consider_edge_cases:

	// Handle edge cases in the m dimension, if they exist.
	if ( m_left )
	{
		const dim_t      nr_cur = 16;
		// First row that the 6-row loop above did not cover.
		const dim_t      i_edge = m0 - ( dim_t )m_left;

		float* restrict cij = c + i_edge*rs_c;
		float* restrict ai  = a + m_iter*ps_a;
		float* restrict bj  = b;

		// Indexed by m_left (1..5); slot 0 is never used because this
		// branch is only taken when m_left != 0.
		sgemmsup_ker_ft ker_fps[6] =
		{
		  NULL,
		  bli_sgemmsup_rv_zen_asm_1x16,
		  bli_sgemmsup_rv_zen_asm_2x16,
		  bli_sgemmsup_rv_zen_asm_3x16,
		  bli_sgemmsup_rv_zen_asm_4x16,
		  bli_sgemmsup_rv_zen_asm_5x16
		};

		sgemmsup_ker_ft ker_fp = ker_fps[ m_left ];

		ker_fp
		(
		  conja, conjb, m_left, nr_cur, k0,
		  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
		  beta, cij, rs_c0, cs_c0, data, cntx
		);
		return;
	}
}

void bli_sgemmsup_rv_zen_asm_6x8m
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t m_iter = m0 / 6;
	uint64_t m_left = m0 % 6;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// Query the panel stride of A and convert it to units of bytes.
	uint64_t ps_a   = bli_auxinfo_ps_a( data );
	uint64_t ps_a4  = ps_a * sizeof( float );

	if ( m_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	mov(var(a), r14)                   // load address of a.
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.SLOOP6X8I) // LOOP OVER ii = [ m_iter ... 1 0 ] // skylake can execute 3 vxorpd ipc with // a latency of 1 cycle, while vzeroall // has a latency of 12 cycles. vxorps(ymm1, ymm1, ymm1) // zero ymm1 since we only use the lower vxorps(ymm4, ymm4, ymm4) // half (xmm1), and nans/infs may slow us down. vxorps(ymm6, ymm6, ymm6) vxorps(ymm8, ymm8, ymm8) vxorps(ymm10, ymm10, ymm10) vxorps(ymm12, ymm12, ymm12) vxorps(ymm14, ymm14, ymm14) mov(var(b), rbx) // load address of b. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c label(.SPOSTPFETCH) // done prefetching c lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(rdx, 5*8)) vmovups(mem(rbx), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) // ---------------------------------- iteration 1 prefetch(0, mem(rdx, r9, 1, 5*8)) vmovups(mem(rbx), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) // ---------------------------------- iteration 2 prefetch(0, mem(rdx, r9, 2, 5*8)) vmovups(mem(rbx), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) // ---------------------------------- iteration 3 prefetch(0, mem(rdx, rcx, 1, 5*8)) vmovups(mem(rbx), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) 
vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm14, ymm14) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), ymm3, ymm4) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm6) vmovups(ymm6, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm8) vmovups(ymm8, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm10) vmovups(ymm10, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm12) vmovups(ymm12, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm14) vmovups(ymm14, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORED) /****6x8 tile is transposed and saved in col major as 8x6*****/ vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) vunpcklps(ymm14, 
ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vpermilps(imm(0xe),xmm0,xmm5) vpermilps(imm(0xe),xmm2,xmm6) vfmadd231ps(mem(rdx), xmm3, xmm0) vfmadd231ps(mem(rdx, rsi, 4), xmm3, xmm2) vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) lea(mem(rdx, rsi, 1), rdx) vfmadd231ps(mem(rdx), xmm3, xmm5) vfmadd231ps(mem(rdx, rsi, 4), xmm3, xmm6) vmovlpd(xmm5, mem(rdx)) // store ( gamma41..gamma51 ) vmovlpd(xmm6, mem(rdx, rsi, 4)) // store ( gamma45..gamma55 ) lea(mem(rdx, rsi, 1), rdx) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vpermilps(imm(0xe),xmm0,xmm5) vpermilps(imm(0xe),xmm2,xmm6) vfmadd231ps(mem(rdx), xmm3, xmm0) vfmadd231ps(mem(rdx, rsi, 4), xmm3, xmm2) vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma46..gamma56 ) lea(mem(rdx, rsi, 1), rdx) vfmadd231ps(mem(rdx), xmm3, xmm5) vfmadd231ps(mem(rdx, rsi, 4), xmm3, xmm6) vmovlpd(xmm5, mem(rdx)) // store ( gamma43..gamma53 ) vmovlpd(xmm6, mem(rdx, rsi, 4)) // store ( gamma47..gamma57 ) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) add(rdi, rcx) vmovups(ymm6, mem(rcx)) add(rdi, rcx) vmovups(ymm8, mem(rcx)) add(rdi, rcx) vmovups(ymm10, mem(rcx)) add(rdi, rcx) vmovups(ymm12, mem(rcx)) add(rdi, rcx) vmovups(ymm14, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) /******************top right tile 8x2***************************/ vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) lea(mem(rdx, rsi, 1), rdx) vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma45..gamma55 ) lea(mem(rdx, rsi, 1), rdx) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma46..gamma56 ) lea(mem(rdx, rsi, 1), rdx) vmovhpd(xmm0, mem(rdx)) // store ( gamma43..gamma53 ) vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma47..gamma57 ) label(.SDONE) lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c //lea(mem(r14, r8, 4), r14) // //lea(mem(r14, r8, 2), r14) // a_ii = 
r14 += 6*rs_a mov(var(ps_a4), rax) // load ps_a4 lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 dec(r11) // ii -= 1; jne(.SLOOP6X8I) // iterate again if ii != 0. label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [ps_a4] "m" (ps_a4), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( m_left ) { const dim_t nr_cur = 8; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; float* restrict ai = a + m_iter*ps_a; float* restrict bj = b; sgemmsup_ker_ft ker_fps[6] = { NULL, bli_sgemmsup_rv_zen_asm_1x8, bli_sgemmsup_rv_zen_asm_2x8, bli_sgemmsup_rv_zen_asm_3x8, bli_sgemmsup_rv_zen_asm_4x8, bli_sgemmsup_rv_zen_asm_5x8 }; sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } } void bli_sgemmsup_rv_zen_asm_6x4m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load 
instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_iter = m0 / 6; uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of A and convert it to units of bytes. uint64_t ps_a = bli_auxinfo_ps_a( data ); uint64_t ps_a4 = ps_a * sizeof( float ); if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.SLOOP6X4I) // LOOP OVER ii = [ m_iter ... 1 0 ] vxorps(xmm1, xmm1, xmm1) vxorps(xmm4, xmm4, xmm4) vxorps(xmm6, xmm6, xmm6) vxorps(xmm8, xmm8, xmm8) vxorps(xmm10, xmm10, xmm10) vxorps(xmm12, xmm12, xmm12) vxorps(xmm14, xmm14, xmm14) mov(var(b), rbx) // load address of b. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c label(.SPOSTPFETCH) // done prefetching c lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(rdx, 5*8)) vmovups(mem(rbx), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) // ---------------------------------- iteration 1 prefetch(0, mem(rdx, r9, 1, 5*8)) vmovups(mem(rbx), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) // ---------------------------------- iteration 2 prefetch(0, mem(rdx, r9, 2, 5*8)) vmovups(mem(rbx), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) // ---------------------------------- iteration 3 prefetch(0, mem(rdx, rcx, 1, 5*8)) vmovups(mem(rbx), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) 
vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovups(mem(rbx), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm8) vmovups(xmm8, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm10) vmovups(xmm10, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm12) vmovups(xmm12, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm14) vmovups(xmm14, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORED) /****6x4 tile is transposed and saved in col major as 4x6*****/ vunpcklps(xmm6, xmm4, xmm0) vunpcklps(xmm10, xmm8, xmm1) vshufps(imm(0x4e), xmm1, xmm0, xmm2) vblendps(imm(0xcc), xmm2, xmm0, xmm0) vblendps(imm(0x33), xmm2, xmm1, xmm1) vfmadd231ps(mem(rcx), xmm3, xmm0) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vfmadd231ps(mem(rcx), xmm3, xmm1) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vunpckhps(xmm6, xmm4, xmm0) vunpckhps(xmm10, xmm8, xmm1) vshufps(imm(0x4e), xmm1, xmm0, xmm2) vblendps(imm(0xcc), xmm2, xmm0, xmm0) vblendps(imm(0x33), xmm2, xmm1, xmm1) vfmadd231ps(mem(rcx), xmm3, xmm0) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vfmadd231ps(mem(rcx), xmm3, xmm1) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vunpcklps(xmm14, xmm12, xmm0) vpermilps(imm(0x4e), xmm0, xmm5) vmovq(mem(rdx),xmm4) vfmadd231ps(xmm4, xmm3, xmm0) vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) lea(mem(rdx, rsi, 1), rdx) vmovq(mem(rdx),xmm4) vfmadd231ps(xmm4, xmm3, xmm5) vmovlpd(xmm5, mem(rdx)) // store ( gamma41..gamma51 ) lea(mem(rdx, rsi, 1), rdx) vunpckhps(xmm14, xmm12, xmm0) vpermilps(imm(0x4e), xmm0, xmm5) vfmadd231ps(mem(rdx), xmm3, xmm0) vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) lea(mem(rdx, rsi, 1), rdx) vfmadd231ps(mem(rdx), xmm3, xmm5) vmovlpd(xmm5, mem(rdx)) // store ( 
gamma43..gamma53 ) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(xmm4, mem(rcx)) add(rdi, rcx) vmovups(xmm6, mem(rcx)) add(rdi, rcx) vmovups(xmm8, mem(rcx)) add(rdi, rcx) vmovups(xmm10, mem(rcx)) add(rdi, rcx) vmovups(xmm12, mem(rcx)) add(rdi, rcx) vmovups(xmm14, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) vunpcklps(xmm6, xmm4, xmm0) vunpcklps(xmm10, xmm8, xmm1) vshufps(imm(0x4e), xmm1, xmm0, xmm2) vblendps(imm(0xcc), xmm2, xmm0, xmm0) vblendps(imm(0x33), xmm2, xmm1, xmm1) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vunpckhps(xmm6, xmm4, xmm0) vunpckhps(xmm10, xmm8, xmm1) vshufps(imm(0x4e), xmm1, xmm0, xmm2) vblendps(imm(0xcc), xmm2, xmm0, xmm0) vblendps(imm(0x33), xmm2, xmm1, xmm1) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vunpcklps(xmm14, xmm12, xmm0) vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) lea(mem(rdx, rsi, 1), rdx) vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) lea(mem(rdx, rsi, 1), rdx) vunpckhps(xmm14, xmm12, xmm0) vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) lea(mem(rdx, rsi, 1), rdx) vmovhpd(xmm0, mem(rdx)) // store ( gamma43..gamma53 ) label(.SDONE) lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c //lea(mem(r14, r8, 4), r14) // //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a mov(var(ps_a4), rax) // load ps_a4 lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 dec(r11) // ii -= 1; jne(.SLOOP6X4I) // iterate again if ii != 0. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [ps_a4] "m" (ps_a4), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( m_left ) { const dim_t nr_cur = 4; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; float* restrict ai = a + m_iter*ps_a; float* restrict bj = b; sgemmsup_ker_ft ker_fps[6] = { NULL, bli_sgemmsup_rv_zen_asm_1x4, bli_sgemmsup_rv_zen_asm_2x4, bli_sgemmsup_rv_zen_asm_3x4, bli_sgemmsup_rv_zen_asm_4x4, bli_sgemmsup_rv_zen_asm_5x4 }; sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } } void bli_sgemmsup_rv_zen_asm_6x2m ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t m_iter = m0 / 6; uint64_t m_left = m0 % 6; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of A and convert it to units of bytes. uint64_t ps_a = bli_auxinfo_ps_a( data ); uint64_t ps_a4 = ps_a * sizeof( float ); if ( m_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rax = a // read rbx from var(b) near beginning of loop // r11 = m dim index ii mov(var(m_iter), r11) // ii = m_iter; label(.SLOOP6X2I) // LOOP OVER ii = [ m_iter ... 1 0 ] vxorps(xmm1, xmm1, xmm1) vxorps(xmm4, xmm4, xmm4) vxorps(xmm6, xmm6, xmm6) vxorps(xmm8, xmm8, xmm8) vxorps(xmm10, xmm10, xmm10) vxorps(xmm12, xmm12, xmm12) vxorps(xmm14, xmm14, xmm14) mov(var(b), rbx) // load address of b. mov(r14, rax) // reset rax to current upanel of a. cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 5*8)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1, 5*8)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2, 5*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 5*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 5*8)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c label(.SPOSTPFETCH) // done prefetching c lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; lea(mem(rax, r8, 4), rdx) // use rdx for prefetching lines lea(mem(rdx, r8, 2), rdx) // from next upanel of a. mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(rdx, 5*8)) vmovq(mem(rbx), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) // ---------------------------------- iteration 1 prefetch(0, mem(rdx, r9, 1, 5*8)) vmovq(mem(rbx), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) // ---------------------------------- iteration 2 prefetch(0, mem(rdx, r9, 2, 5*8)) vmovq(mem(rbx), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) // ---------------------------------- iteration 3 prefetch(0, mem(rdx, rcx, 1, 5*8)) vmovq(mem(rbx), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) 
vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP vmovq(mem(rbx), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), xmm3, xmm4) vmovlpd(xmm4, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm6) vmovlpd(xmm6, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm8) vmovlpd(xmm8, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm10) vmovlpd(xmm10, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm12) vmovlpd(xmm12, mem(rcx)) add(rdi, rcx) vfmadd231ps(mem(rcx), xmm3, xmm14) vmovlpd(xmm14, mem(rcx)) jmp(.SDONE) // jump to end. label(.SCOLSTORED) /****6x2 tile is transposed and saved in col major as 2x6*****/ vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 vshufps(imm(0x44), xmm1, xmm0, xmm2) //01-00-01-00 vshufps(imm(0xee), xmm1, xmm0, xmm4) //11-10-11-10 vfmadd231ps(mem(rcx), xmm3, xmm2) vmovupd(xmm2, mem(rcx)) // store ( gamma00..gamma30 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vfmadd231ps(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) // store ( gamma01..gamma31 ) vunpcklps(xmm14, xmm12, xmm0)//eof0e1f1 vpermilps(imm(0x4e),xmm0,xmm5) vmovq(mem(rdx), xmm4) vfmadd231ps(xmm4, xmm3, xmm0) vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) lea(mem(rdx, rsi, 1), rdx) vmovq(mem(rdx), xmm4) vfmadd231ps(xmm4, xmm3, xmm5) vmovlpd(xmm5, mem(rdx)) // store ( gamma41..gamma51 ) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovlpd(xmm4, mem(rcx)) add(rdi, rcx) vmovlpd(xmm6, mem(rcx)) add(rdi, rcx) vmovlpd(xmm8, mem(rcx)) add(rdi, rcx) vmovlpd(xmm10, mem(rcx)) add(rdi, rcx) vmovlpd(xmm12, mem(rcx)) add(rdi, rcx) vmovlpd(xmm14, mem(rcx)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) vunpcklps(xmm6, xmm4, xmm0)//a0b0a1b1 vunpcklps(xmm10, xmm8, xmm1)//c0d0c1d1 vshufps(imm(0x44), xmm1, xmm0, xmm2) //01-00-01-00 vshufps(imm(0xee), xmm1, xmm0, xmm4) //11-10-11-10 vmovupd(xmm2, mem(rcx)) // store ( gamma00..gamma30 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vmovupd(xmm4, mem(rcx)) // store ( gamma01..gamma31 ) vunpcklps(xmm14, xmm12, xmm0)//eof0e1f1 vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) lea(mem(rdx, rsi, 1), rdx) vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) label(.SDONE) lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c //lea(mem(r14, r8, 4), r14) // //lea(mem(r14, r8, 2), r14) // a_ii = r14 += 6*rs_a mov(var(ps_a4), rax) // load ps_a4 lea(mem(r14, rax, 1), r14) // a_ii = r14 += ps_a4 dec(r11) // ii -= 1; jne(.SLOOP6X2I) // iterate again if ii != 0. label(.SRETURN) end_asm( : // output operands (none) : // input operands [m_iter] "m" (m_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [ps_a4] "m" (ps_a4), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( m_left ) { const dim_t nr_cur = 2; const dim_t i_edge = m0 - ( dim_t )m_left; float* restrict cij = c + i_edge*rs_c; float* restrict ai = a + m_iter*ps_a; float* restrict bj = b; sgemmsup_ker_ft ker_fps[6] = { NULL, bli_sgemmsup_rv_zen_asm_1x2, bli_sgemmsup_rv_zen_asm_2x2, bli_sgemmsup_rv_zen_asm_3x2, bli_sgemmsup_rv_zen_asm_4x2, bli_sgemmsup_rv_zen_asm_5x2 }; sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } } cython-blis-0.9.1/blis/_src/kernels/zen/3/sup/other/bli_gemmsup_rv_zen_asm_s6x16n.c000066400000000000000000003662251427272030600302240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #define BLIS_ASM_SYNTAX_ATT #include "bli_x86_asm_macros.h" /* rrr: -------- ------ -------- -------- ------ -------- -------- += ------ ... -------- -------- ------ -------- -------- ------ : -------- ------ : rcr: -------- | | | | -------- -------- | | | | -------- -------- += | | | | ... -------- -------- | | | | -------- -------- | | | | : -------- | | | | : Assumptions: - B is row-stored; - A is row- or column-stored; - m0 and n0 are at most MR and NR, respectively. Therefore, this (r)ow-preferential kernel is well-suited for contiguous (v)ector loads on B and single-element broadcasts from A. NOTE: These kernels explicitly support column-oriented IO, implemented via an in-register transpose. And thus they also support the crr and ccr cases, though only crr is ever utilized (because ccr is handled by transposing the operation and executing rcr, which does not incur the cost of the in-register transpose). crr: | | | | | | | | ------ -------- | | | | | | | | ------ -------- | | | | | | | | += ------ ... -------- | | | | | | | | ------ -------- | | | | | | | | ------ : | | | | | | | | ------ : */ // Prototype reference microkernels. 
//GEMMSUP_KER_PROT( float, f, semmsup_r_zen_ref ) void bli_sgemmsup_rv_zen_asm_6x16n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { uint64_t m_left = m0 % 6; if ( m_left ) { float* restrict cij = c; float* restrict bj = b; float* restrict ai = a; // We add special handling for slightly inflated MR blocksizes // at edge cases, up to a maximum of 9. if ( 6 < m0 ) { sgemmsup_ker_ft ker_fp1 = NULL; sgemmsup_ker_ft ker_fp2 = NULL; dim_t mr1, mr2; if ( m0 == 7 ) { mr1 = 4; mr2 = 3; ker_fp1 = bli_sgemmsup_rv_zen_asm_4x16n; ker_fp2 = bli_sgemmsup_rv_zen_asm_3x16n; } else if ( m0 == 8 ) { mr1 = 4; mr2 = 4; ker_fp1 = bli_sgemmsup_rv_zen_asm_4x16n; ker_fp2 = bli_sgemmsup_rv_zen_asm_4x16n; } else // if ( m0 == 9 ) { mr1 = 4; mr2 = 5; ker_fp1 = bli_sgemmsup_rv_zen_asm_4x16n; ker_fp2 = bli_sgemmsup_rv_zen_asm_5x16n; } ker_fp1 ( conja, conjb, mr1, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += mr1*rs_c0; ai += mr1*rs_a0; ker_fp2 ( conja, conjb, mr2, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } sgemmsup_ker_ft ker_fps[6] = { NULL, bli_sgemmsup_rv_zen_asm_1x16n, bli_sgemmsup_rv_zen_asm_2x16n, bli_sgemmsup_rv_zen_asm_3x16n, bli_sgemmsup_rv_zen_asm_4x16n, bli_sgemmsup_rv_zen_asm_5x16n }; sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; ker_fp ( conja, conjb, m_left, n0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); return; } //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. 
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t n_iter = n0 /16; uint64_t n_left = n0 % 16; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of B and convert it to units of bytes. uint64_t ps_b = bli_auxinfo_ps_b( data ); uint64_t ps_b4 = ps_b * sizeof( float ); if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rbx = b // read rax from var(a) near beginning of loop // r11 = m dim index ii mov(var(n_iter), r11) // jj = n_iter; label(.SLOOP6X16J) // LOOP OVER jj = [ n_iter ... 1 0 ] vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. 
//mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rbx) // reset rbx to current upanel of b. cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c prefetch(0, mem(rdx, rdi, 2, 7*8)) // prefetch c + 5*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(r12, 5*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 5*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 5*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 5*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 5*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 5*8)) // prefetch c + 7*cs_c label(.SPOSTPFETCH) // done pre-fetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(rbx, 11*4)) // pre-fetch line of next upanel of b vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 1 prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 2 prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) 
vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) // ---------------------------------- iteration 3 prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm11, ymm11) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm13, ymm13) vmulps(ymm0, ymm14, ymm14) vmulps(ymm0, ymm15, ymm15) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), ymm3, ymm4) vmovups(ymm4, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm5) vmovups(ymm5, mem(rcx, rsi,8)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm6) vmovups(ymm6, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm7) vmovups(ymm7, mem(rcx, rsi,8)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm8) vmovups(ymm8, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm9) vmovups(ymm9, mem(rcx, rsi,8)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm10) vmovups(ymm10, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm11) vmovups(ymm11, mem(rcx, rsi,8)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm12) vmovups(ymm12, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm13) vmovups(ymm13, mem(rcx, rsi,8)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm14) vmovups(ymm14, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm15) vmovups(ymm15, mem(rcx, rsi,8)) jmp(.SDONE) // jump to end. label(.SCOLSTORED) vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a4b4a5b5 vunpcklps(ymm10, ymm8, ymm1) //c0d0c1d1 c4d4c5d5 vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, 
xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c /***bottom left tile - 2x8 is transposed to top right tile 8x2**********/ vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) lea(mem(rdx, rsi, 4), rax) // rax += 4*cs_c vmovlpd(mem(rax), xmm1, xmm1) vmovhpd(mem(rax, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rax)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rax, rsi, 1)) // store ( gamma45..gamma55 ) lea(mem(rdx, rsi, 2), rdx) // rdx += 2*cs_c vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma43..gamma53 ) lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c vmovlpd(mem(rdx), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rsi, 1)) // store ( gamma47..gamma57 ) lea(mem(rdx, rsi, 2), rdx) // rdx += 2*cs_c /***top right tile 4x8 is transposed to bottom left tile 8x4**********/ vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) 
vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c /*** bottom right 2x8 is transposed to bottom right tile 8x2*******/ vunpcklps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma41..gamma51 ) lea(mem(rdx, rsi, 4), rax) // rax += 4*cs_c vmovlpd(mem(rax), xmm1, xmm1) vmovhpd(mem(rax, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rax)) // store ( gamma44..gamma54 ) vmovhpd(xmm2, mem(rax, rsi, 1)) // store ( gamma45..gamma55 ) lea(mem(rdx, rsi, 2), rdx) // rdx += 2*cs_c vunpckhps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(mem(rdx), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) 
vfmadd231ps(xmm1, xmm3, xmm0) vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) vmovhpd(xmm0, mem(rdx, rsi, 1)) // store ( gamma43..gamma53 ) lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c vmovlpd(mem(rdx), xmm1, xmm1) vmovhpd(mem(rdx, rsi, 1), xmm1, xmm1) vfmadd231ps(xmm1, xmm3, xmm2) vmovlpd(xmm2, mem(rdx)) // store ( gamma46..gamma56 ) vmovhpd(xmm2, mem(rdx, rsi, 1)) // store ( gamma47..gamma57 ) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) vmovups(ymm5, mem(rcx, rsi, 8)) add(rdi, rcx) vmovups(ymm6, mem(rcx)) vmovups(ymm7, mem(rcx, rsi, 8)) add(rdi, rcx) vmovups(ymm8, mem(rcx)) vmovups(ymm9, mem(rcx, rsi, 8)) add(rdi, rcx) vmovups(ymm10, mem(rcx)) vmovups(ymm11, mem(rcx, rsi, 8)) add(rdi, rcx) vmovups(ymm12, mem(rcx)) vmovups(ymm13, mem(rcx, rsi, 8)) add(rdi, rcx) vmovups(ymm14, mem(rcx)) vmovups(ymm15, mem(rcx, rsi, 8)) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) /****6x16 tile going to save into 16x6 tile in C*****/ /******************top left tile 8x4***************************/ vunpcklps(ymm6, ymm4, ymm0) vunpcklps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store 
( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c /******************top right tile 8x2***************************/ vunpcklps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) lea(mem(rdx, rsi, 1), rdx) vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma45..gamma55 ) lea(mem(rdx, rsi, 1), rdx) vunpckhps(ymm14, ymm12, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma46..gamma56 ) lea(mem(rdx, rsi, 1), rdx) vmovhpd(xmm0, mem(rdx)) // store ( gamma43..gamma53 ) vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma47..gamma57 ) lea(mem(rdx, rsi, 1), rdx) lea(mem(rdx, rsi, 4), rdx) // rdx += 8*cs_c /******************bottom left tile 8x4***************************/ vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( 
gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += 1*cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) /******************bottom right tile 8x2***************************/ vunpcklps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx)) // store ( gamma40..gamma50 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma44..gamma54 ) lea(mem(rdx, rsi, 1), rdx) vmovhpd(xmm0, mem(rdx)) // store ( gamma41..gamma51 ) vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma45..gamma55 ) lea(mem(rdx, rsi, 1), rdx) vunpckhps(ymm15, ymm13, ymm0) vextractf128(imm(0x1), ymm0, xmm2) vmovlpd(xmm0, mem(rdx)) // store ( gamma42..gamma52 ) vmovlpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma46..gamma56 ) lea(mem(rdx, rsi, 1), rdx) vmovhpd(xmm0, mem(rdx)) // store ( gamma43..gamma53 ) vmovhpd(xmm2, mem(rdx, rsi, 4)) // store ( gamma47..gamma57 ) label(.SDONE) lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c //add(imm(4*16), r14) // b_jj = r14 += 8*cs_b mov(var(ps_b4), rbx) // load ps_b4 lea(mem(r14, rbx, 1), r14) // a_ii = r14 += ps_b4 dec(r11) // jj -= 1; jne(.SLOOP6X16J) // iterate again if jj != 0. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [ps_b4] "m" (ps_b4), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 6; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; float* restrict bj = b + n_iter * ps_b; if ( 8 <= n_left ) { const dim_t nr_cur = 8; bli_sgemmsup_rv_zen_asm_6x8 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_sgemmsup_rv_zen_asm_6x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rv_zen_asm_6x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { #if 1 const dim_t nr_cur = 1; bli_sgemmsup_r_zen_ref_6x1 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); #else bli_sgemv_ex ( BLIS_NO_TRANSPOSE, conjb, m0, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, beta, cij, rs_c0, 
cntx, NULL );
#endif
} } }

/*
   bli_sgemmsup_rv_zen_asm_5x16n

   Single-precision "sup" gemm kernel that updates a block of C with five
   rows (mr_cur = 5) and n0 columns:

       C := beta * C + alpha * A * B

   The inline assembly walks the n dimension in panels of 16 columns
   (n_iter = n0 / 16). For each panel it accumulates a 5x16 tile in
   ymm4..ymm13 (two ymm registers per row), with the k loop unrolled by
   four (k_iter = k0 / 4) followed by a k_left = k0 % 4 cleanup loop. The
   tile is then scaled by alpha and written to C through one of four
   paths: {beta != 0, beta == 0} x {row-stored C, column-stored C}. The
   column-stored paths are selected when rs_c == 1.

   Any n0 % 16 leftover columns are handled after the assembly by calling
   the 5x8, 5x4, 5x2 and 5x1 kernels in turn.

   Parameters:
     conja, conjb  Not read by the assembly; forwarded to the edge kernels.
     m0            Not read by the active code (only referenced in the
                   disabled #else branch at the bottom).
     n0            Number of columns of C/B to process.
     k0            Inner (k) dimension.
     alpha, beta   Pointers to the scalars; each is broadcast from memory.
     a, rs_a0, cs_a0   A and its row/column strides, in elements.
     b, rs_b0, cs_b0   B and its row/column strides, in elements.
                   NOTE(review): cs_b is passed to the asm block but never
                   read; B is loaded as 16 contiguous floats per row, so
                   the kernel presumably requires cs_b == 1 -- confirm
                   against the caller.
     c, rs_c0, cs_c0   C and its row/column strides, in elements.
     data          auxinfo_t; only the panel stride of B is queried here.
     cntx          Forwarded to the edge kernels.
*/
void bli_sgemmsup_rv_zen_asm_5x16n
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;

	uint64_t n_iter = n0 /16;
	uint64_t n_left = n0 % 16;

	uint64_t rs_a   = rs_a0;
	uint64_t cs_a   = cs_a0;
	uint64_t rs_b   = rs_b0;
	uint64_t cs_b   = cs_b0;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

	// Query the panel stride of B and convert it to units of bytes.
	uint64_t ps_b   = bli_auxinfo_ps_b( data );
	uint64_t ps_b4  = ps_b * sizeof( float );

	// With fewer than 16 columns there is no full panel for the assembly
	// loop; go straight to the narrower edge kernels.
	if ( n_iter == 0 ) goto consider_edge_cases;

	// -------------------------------------------------------------------------

	begin_asm()

	//mov(var(a), rax)                 // load address of a.
	mov(var(rs_a), r8)                 // load rs_a
	mov(var(cs_a), r9)                 // load cs_a
	lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
	lea(mem(, r9, 4), r9)              // cs_a *= sizeof(float)

	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a

	mov(var(b), r14)                   // load address of b.
	mov(var(rs_b), r10)                // load rs_b
	lea(mem(, r10, 4), r10)            // rs_b *= sizeof(float)

	// NOTE: We cannot pre-load elements of a or b
	// because it could eventually, in the last
	// unrolled iter or the cleanup loop, result
	// in reading beyond the bounds allocated mem
	// (the likely result: a segmentation fault).

	mov(var(c), r12)                   // load address of c
	mov(var(rs_c), rdi)                // load rs_c
	lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)

	// During preamble and loops:
	// r12 = rcx = c
	// r14 = rbx = b
	// read rax from var(a) near beginning of loop
	// r11 = n dim panel counter jj

	mov(var(n_iter), r11)              // jj = n_iter;

	// NOTE: the label keeps the "6X16" name also used by the 6x16n kernel
	// above, although this kernel accumulates five rows.
	label(.SLOOP6X16J)                 // LOOP OVER jj = [ n_iter ... 1 0 ]

	// Zero the ten accumulators: rows 0..4, two ymm (8 floats each) per row.
	vxorps(ymm4,  ymm4,  ymm4)
	vxorps(ymm5,  ymm5,  ymm5)
	vxorps(ymm6,  ymm6,  ymm6)
	vxorps(ymm7,  ymm7,  ymm7)
	vxorps(ymm8,  ymm8,  ymm8)
	vxorps(ymm9,  ymm9,  ymm9)
	vxorps(ymm10, ymm10, ymm10)
	vxorps(ymm11, ymm11, ymm11)
	vxorps(ymm12, ymm12, ymm12)
	vxorps(ymm13, ymm13, ymm13)

	mov(var(a), rax)                   // load address of a.

	//mov(r12, rcx)                    // reset rcx to current utile of c.
	mov(r14, rbx)                      // reset rbx to current upanel of b.

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
	jz(.SCOLPFETCH)                    // jump to column storage case
	label(.SROWPFETCH)                 // row-stored prefetching on c

	lea(mem(r12, rdi, 2), rdx)         // rdx = c + 2*rs_c;
	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
	prefetch(0, mem(r12,         7*8)) // prefetch c + 0*rs_c
	prefetch(0, mem(r12, rdi, 1, 7*8)) // prefetch c + 1*rs_c
	prefetch(0, mem(r12, rdi, 2, 7*8)) // prefetch c + 2*rs_c
	prefetch(0, mem(rdx,         7*8)) // prefetch c + 3*rs_c
	prefetch(0, mem(rdx, rdi, 1, 7*8)) // prefetch c + 4*rs_c

	jmp(.SPOSTPFETCH)                  // jump to end of prefetching c
	label(.SCOLPFETCH)                 // column-stored prefetching c

	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
	lea(mem(, rsi, 4), rsi)            // cs_c *= sizeof(float)
	lea(mem(r12, rsi, 2), rdx)         // rdx = c + 2*cs_c;
	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
	prefetch(0, mem(r12,         4*8)) // prefetch c + 0*cs_c
	prefetch(0, mem(r12, rsi, 1, 4*8)) // prefetch c + 1*cs_c
	prefetch(0, mem(r12, rsi, 2, 4*8)) // prefetch c + 2*cs_c
	prefetch(0, mem(rdx,         4*8)) // prefetch c + 3*cs_c
	prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 4*cs_c
	prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 5*cs_c
	lea(mem(rdx, rsi, 2), rdx)         // rdx = c + 5*cs_c;
	prefetch(0, mem(rdx, rsi, 1, 4*8)) // prefetch c + 6*cs_c
	prefetch(0, mem(rdx, rsi, 2, 4*8)) // prefetch c + 7*cs_c

	label(.SPOSTPFETCH)                // done prefetching c

	lea(mem(r10, r10, 2), rcx)         // rcx = 3*rs_b;
	//lea(mem(rbx, r10, 8), rdx)       // use rdx for prefetching b.
	//lea(mem(rdx, r10, 8), rdx)       // rdx = b + 16*rs_b;

	mov(var(k_iter), rsi)              // i = k_iter;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
	                                   // contains the k_left loop.

	// Each k step below loads one row of B (16 floats) into ymm0/ymm1,
	// broadcasts one element of A per row of the tile, and accumulates the
	// products: rows 0..4 -> ymm4/5, ymm6/7, ymm8/9, ymm10/11, ymm12/13.

	label(.SLOOPKITER)                 // MAIN LOOP

	// ---------------------------------- iteration 0

	prefetch(0, mem(rbx, 11*8))        // prefetch line of next upanel of b

	vmovups(mem(rbx, 0*32), ymm0)      // b columns 0..7
	vmovups(mem(rbx, 1*32), ymm1)      // b columns 8..15
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2) // a row 0
	vbroadcastss(mem(rax, r8,  1), ymm3) // a row 1
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8,  2), ymm2) // a row 2
	vbroadcastss(mem(rax, r13, 1), ymm3) // a row 3
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8,  4), ymm2) // a row 4
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)

	// ---------------------------------- iteration 1

	prefetch(0, mem(rbx, 11*8))

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8,  4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)

	// ---------------------------------- iteration 2

	prefetch(0, mem(rbx, 11*8))

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8,  4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)

	// ---------------------------------- iteration 3

	prefetch(0, mem(rbx, 11*8))

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8,  4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKITER)                   // iterate again if i != 0.

	label(.SCONSIDKLEFT)

	mov(var(k_left), rsi)              // i = k_left;
	test(rsi, rsi)                     // check i via logical AND.
	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
	                                   // else, we prepare to enter k_left loop.

	label(.SLOOPKLEFT)                 // EDGE LOOP (one k step per pass)

	prefetch(0, mem(rbx, 11*8))

	vmovups(mem(rbx, 0*32), ymm0)
	vmovups(mem(rbx, 1*32), ymm1)
	add(r10, rbx)                      // b += rs_b;

	vbroadcastss(mem(rax        ), ymm2)
	vbroadcastss(mem(rax, r8,  1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm4)
	vfmadd231ps(ymm1, ymm2, ymm5)
	vfmadd231ps(ymm0, ymm3, ymm6)
	vfmadd231ps(ymm1, ymm3, ymm7)

	vbroadcastss(mem(rax, r8,  2), ymm2)
	vbroadcastss(mem(rax, r13, 1), ymm3)
	vfmadd231ps(ymm0, ymm2, ymm8)
	vfmadd231ps(ymm1, ymm2, ymm9)
	vfmadd231ps(ymm0, ymm3, ymm10)
	vfmadd231ps(ymm1, ymm3, ymm11)

	vbroadcastss(mem(rax, r8,  4), ymm2)
	add(r9, rax)                       // a += cs_a;
	vfmadd231ps(ymm0, ymm2, ymm12)
	vfmadd231ps(ymm1, ymm2, ymm13)

	dec(rsi)                           // i -= 1;
	jne(.SLOOPKLEFT)                   // iterate again if i != 0.

	label(.SPOSTACCUM)

	mov(r12, rcx)                      // reset rcx to current utile of c.
	mov(var(alpha), rax)               // load address of alpha
	mov(var(beta), rbx)                // load address of beta
	vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
	vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate

	vmulps(ymm0, ymm4, ymm4)           // scale by alpha
	vmulps(ymm0, ymm5, ymm5)
	vmulps(ymm0, ymm6, ymm6)
	vmulps(ymm0, ymm7, ymm7)
	vmulps(ymm0, ymm8, ymm8)
	vmulps(ymm0, ymm9, ymm9)
	vmulps(ymm0, ymm10, ymm10)
	vmulps(ymm0, ymm11, ymm11)
	vmulps(ymm0, ymm12, ymm12)
	vmulps(ymm0, ymm13, ymm13)

	mov(var(cs_c), rsi)                // load cs_c
	lea(mem(, rsi, 4), rsi)            // rsi = cs_c * sizeof(float)

	lea(mem(rcx, rdi, 4), rdx)         // load address of c + 4*rs_c;
	                                   // (row 4; used by the column-stored paths)

	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;

	                                   // now avoid loading C if beta == 0

	vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
	vucomiss(xmm0, xmm3)               // set ZF if beta == 0.
	je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
	jz(.SCOLSTORED)                    // jump to column storage case

	label(.SROWSTORED)

	// Row-stored C, beta != 0: for each of the five rows, fold beta*C into
	// the accumulators and store. mem(rcx, rsi,8) is 8 columns (8*cs_c)
	// past rcx, i.e. the second group of eight floats in the row.

	vfmadd231ps(mem(rcx), ymm3, ymm4)           // row 0
	vmovups(ymm4, mem(rcx))

	vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm5)
	vmovups(ymm5, mem(rcx, rsi,8))
	add(rdi, rcx)                               // rcx += rs_c

	vfmadd231ps(mem(rcx), ymm3, ymm6)           // row 1
	vmovups(ymm6, mem(rcx))

	vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm7)
	vmovups(ymm7, mem(rcx, rsi,8))
	add(rdi, rcx)                               // rcx += rs_c

	vfmadd231ps(mem(rcx), ymm3, ymm8)           // row 2
	vmovups(ymm8, mem(rcx))

	vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm9)
	vmovups(ymm9, mem(rcx, rsi,8))
	add(rdi, rcx)                               // rcx += rs_c

	vfmadd231ps(mem(rcx), ymm3, ymm10)          // row 3
	vmovups(ymm10, mem(rcx))

	vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm11)
	vmovups(ymm11, mem(rcx, rsi,8))
	add(rdi, rcx)                               // rcx += rs_c

	vfmadd231ps(mem(rcx), ymm3, ymm12)          // row 4
	vmovups(ymm12, mem(rcx))

	vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm13)
	vmovups(ymm13, mem(rcx, rsi,8))

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORED)

	// Column-stored C, beta != 0. Rows 0..3 are transposed in registers so
	// that each xmm holds rows 0..3 of one column and is written through
	// rcx; row 4 is written one float at a time through rdx.

	// ---- rows 0..3, columns 0..7 (from ymm4, ymm6, ymm8, ymm10)

	vunpcklps(ymm6, ymm4, ymm0)              //a0b0a1b1 a4b4a5b5
	vunpcklps(ymm10, ymm8, ymm1)             //c0d0c1d1 c4d4c5d5
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)    // ymm0 = columns 0 | 4
	vblendps(imm(0x33), ymm2, ymm1, ymm1)    // ymm1 = columns 1 | 5

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm0, mem(rcx))            // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm1, mem(rcx))            // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma05..gamma35 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vunpckhps(ymm6, ymm4, ymm0)
	vunpckhps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)    // ymm0 = columns 2 | 6
	vblendps(imm(0x33), ymm2, ymm1, ymm1)    // ymm1 = columns 3 | 7

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm0, mem(rcx))            // store ( gamma02..gamma32 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma06..gamma36 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm1, mem(rcx))            // store ( gamma03..gamma33 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma07..gamma37 )

	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c
	lea(mem(rcx, rsi, 4), rcx)         // rcx += 4*cs_c  (rcx now at column 8)

	// ---- row 4, columns 0..7 (from ymm12)
	// xmm4/xmm6/xmm8/xmm10 are free to reuse as scratch here: the row 0..3
	// accumulators ymm4/6/8/10 were consumed by the transpose above.

	vextractf128(imm(0x0), ymm12, xmm0)//e0-e3
	vmovss(mem(rdx),xmm4)              // load C(4, 0)
	vmovss(mem(rdx, rsi, 1),xmm6)      // load C(4, 1)
	vmovss(mem(rdx, rsi, 2),xmm8)      // load C(4, 2)
	vmovss(mem(rdx, rax, 1),xmm10)     // load C(4, 3)
	vshufps(imm(0x01), xmm0, xmm0,xmm1)   // move lane 1 to lane 0
	vshufps(imm(0x02), xmm0, xmm0,xmm2)   // move lane 2 to lane 0
	vshufps(imm(0x03), xmm0, xmm0,xmm14)  // move lane 3 to lane 0
	vfmadd231ps(xmm4, xmm3, xmm0)//e0
	vfmadd231ps(xmm6, xmm3, xmm1)//e1
	vfmadd231ps(xmm8, xmm3, xmm2)//e2
	vfmadd231ps(xmm10, xmm3, xmm14)//e3
	vmovss(xmm0, mem(rdx))
	vmovss(xmm1, mem(rdx, rsi, 1))
	vmovss(xmm2, mem(rdx, rsi, 2))
	vmovss(xmm14, mem(rdx, rax, 1))

	lea(mem(rdx, rsi, 4), rdx)         // rdx += 4*cs_c

	vextractf128(imm(0x1), ymm12, xmm0)//e4-e7
	vmovss(mem(rdx),xmm4)              // load C(4, 4)
	vmovss(mem(rdx, rsi, 1),xmm6)      // load C(4, 5)
	vmovss(mem(rdx, rsi, 2),xmm8)      // load C(4, 6)
	vmovss(mem(rdx, rax, 1),xmm10)     // load C(4, 7)
	vshufps(imm(0x01), xmm0, xmm0,xmm1)
	vshufps(imm(0x02), xmm0, xmm0,xmm2)
	vshufps(imm(0x03), xmm0, xmm0,xmm14)
	vfmadd231ps(xmm4, xmm3, xmm0)//e4
	vfmadd231ps(xmm6, xmm3, xmm1)//e5
	vfmadd231ps(xmm8, xmm3, xmm2)//e6
	vfmadd231ps(xmm10, xmm3, xmm14)//e7
	vmovss(xmm0, mem(rdx))
	vmovss(xmm1, mem(rdx, rsi, 1))
	vmovss(xmm2, mem(rdx, rsi, 2))
	vmovss(xmm14, mem(rdx, rax, 1))

	lea(mem(rdx, rsi, 4), rdx)         // rdx += 4*cs_c  (rdx now at column 8)

	// ---- rows 0..3, columns 8..15 (from ymm5, ymm7, ymm9, ymm11)

	vunpcklps(ymm7, ymm5, ymm0)
	vunpcklps(ymm11, ymm9, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)    // ymm0 = columns 8 | 12
	vblendps(imm(0x33), ymm2, ymm1, ymm1)    // ymm1 = columns 9 | 13

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm0, mem(rcx))            // store rows 0..3 of column 8
	vmovups(xmm2, mem(rcx, rsi, 4))    // store rows 0..3 of column 12
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm1, mem(rcx))            // store rows 0..3 of column 9
	vmovups(xmm2, mem(rcx, rsi, 4))    // store rows 0..3 of column 13
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vunpckhps(ymm7, ymm5, ymm0)
	vunpckhps(ymm11, ymm9, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)    // ymm0 = columns 10 | 14
	vblendps(imm(0x33), ymm2, ymm1, ymm1)    // ymm1 = columns 11 | 15

	vextractf128(imm(0x1), ymm0, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm0)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm0, mem(rcx))            // store rows 0..3 of column 10
	vmovups(xmm2, mem(rcx, rsi, 4))    // store rows 0..3 of column 14
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vfmadd231ps(mem(rcx), xmm3, xmm1)
	vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2)
	vmovups(xmm1, mem(rcx))            // store rows 0..3 of column 11
	vmovups(xmm2, mem(rcx, rsi, 4))    // store rows 0..3 of column 15

	//lea(mem(rcx, rsi, 8), rcx)       // rcx += 8*cs_c

	// ---- row 4, columns 8..15 (from ymm13)

	vextractf128(imm(0x0), ymm13, xmm0)//e0-e3 of ymm13 (columns 8..11)
	vmovss(mem(rdx),xmm4)
	vmovss(mem(rdx, rsi, 1),xmm6)
	vmovss(mem(rdx, rsi, 2),xmm8)
	vmovss(mem(rdx, rax, 1),xmm10)
	vshufps(imm(0x01), xmm0,xmm0, xmm1)
	vshufps(imm(0x02), xmm0,xmm0, xmm2)
	vshufps(imm(0x03), xmm0,xmm0, xmm14)
	vfmadd231ps(xmm4, xmm3, xmm0)//e0
	vfmadd231ps(xmm6, xmm3, xmm1)//e1
	vfmadd231ps(xmm8, xmm3, xmm2)//e2
	vfmadd231ps(xmm10, xmm3, xmm14)//e3
	vmovss(xmm0, mem(rdx))
	vmovss(xmm1, mem(rdx, rsi, 1))
	vmovss(xmm2, mem(rdx, rsi, 2))
	vmovss(xmm14, mem(rdx, rax, 1))

	lea(mem(rdx, rsi, 4), rdx)         // rdx += 4*cs_c

	vextractf128(imm(0x1), ymm13, xmm0)//e4-e7 of ymm13 (columns 12..15)
	vmovss(mem(rdx),xmm4)
	vmovss(mem(rdx, rsi, 1),xmm6)
	vmovss(mem(rdx, rsi, 2),xmm8)
	vmovss(mem(rdx, rax, 1),xmm10)
	vshufps(imm(0x01), xmm0,xmm0, xmm1)
	vshufps(imm(0x02), xmm0,xmm0, xmm2)
	vshufps(imm(0x03), xmm0,xmm0, xmm14)
	vfmadd231ps(xmm4, xmm3, xmm0)//e4
	vfmadd231ps(xmm6, xmm3, xmm1)//e5
	vfmadd231ps(xmm8, xmm3, xmm2)//e6
	vfmadd231ps(xmm10, xmm3, xmm14)//e7
	vmovss(xmm0, mem(rdx))
	vmovss(xmm1, mem(rdx, rsi, 1))
	vmovss(xmm2, mem(rdx, rsi, 2))
	vmovss(xmm14, mem(rdx, rax, 1))

	jmp(.SDONE)                        // jump to end.

	label(.SBETAZERO)

	// beta == 0: C is overwritten without being read.

	cmp(imm(4), rdi)                   // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
	jz(.SCOLSTORBZ)                    // jump to column storage case

	label(.SROWSTORBZ)

	vmovups(ymm4, mem(rcx))            // row 0
	vmovups(ymm5, mem(rcx, rsi,8))
	add(rdi, rcx)                      // rcx += rs_c

	vmovups(ymm6, mem(rcx))            // row 1
	vmovups(ymm7, mem(rcx, rsi,8))
	add(rdi, rcx)                      // rcx += rs_c

	vmovups(ymm8, mem(rcx))            // row 2
	vmovups(ymm9, mem(rcx, rsi,8))
	add(rdi, rcx)                      // rcx += rs_c

	vmovups(ymm10, mem(rcx))           // row 3
	vmovups(ymm11, mem(rcx, rsi,8))
	add(rdi, rcx)                      // rcx += rs_c

	vmovups(ymm12, mem(rcx))           // row 4
	vmovups(ymm13, mem(rcx, rsi,8))

	jmp(.SDONE)                        // jump to end.

	label(.SCOLSTORBZ)

	// Column-stored C, beta == 0: same register transpose and addressing
	// as .SCOLSTORED, minus the loads of C.

	// ---- rows 0..3, columns 0..7

	vunpcklps(ymm6, ymm4, ymm0)              //a0b0a1b1 a4b4a5b5
	vunpcklps(ymm10, ymm8, ymm1)             //c0d0c1d1 c4d4c5d5
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx))            // store ( gamma00..gamma30 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma04..gamma34 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx))            // store ( gamma01..gamma31 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma05..gamma35 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vunpckhps(ymm6, ymm4, ymm0)
	vunpckhps(ymm10, ymm8, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx))            // store ( gamma02..gamma32 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma06..gamma36 )
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx))            // store ( gamma03..gamma33 )
	vmovups(xmm2, mem(rcx, rsi, 4))    // store ( gamma07..gamma37 )

	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c
	lea(mem(rcx, rsi, 4), rcx)         // rcx += 4*cs_c  (rcx now at column 8)

	// ---- row 4, columns 0..7 (from ymm12)

	vextractf128(imm(0x0), ymm12, xmm0)//e0-e3
	vshufps(imm(0x01), xmm0, xmm0,xmm1)
	vshufps(imm(0x02), xmm0, xmm0,xmm2)
	vshufps(imm(0x03), xmm0, xmm0,xmm14)
	vmovss(xmm0, mem(rdx))
	vmovss(xmm1, mem(rdx, rsi, 1))
	vmovss(xmm2, mem(rdx, rsi, 2))
	vmovss(xmm14, mem(rdx, rax, 1))

	lea(mem(rdx, rsi, 4), rdx)         // rdx += 4*cs_c

	vextractf128(imm(0x1), ymm12, xmm0)//e4-e7
	vshufps(imm(0x01), xmm0, xmm0,xmm1)
	vshufps(imm(0x02), xmm0, xmm0,xmm2)
	vshufps(imm(0x03), xmm0, xmm0,xmm14)
	vmovss(xmm0, mem(rdx))
	vmovss(xmm1, mem(rdx, rsi, 1))
	vmovss(xmm2, mem(rdx, rsi, 2))
	vmovss(xmm14, mem(rdx, rax, 1))

	lea(mem(rdx, rsi, 4), rdx)         // rdx += 4*cs_c  (rdx now at column 8)

	// ---- rows 0..3, columns 8..15

	vunpcklps(ymm7, ymm5, ymm0)
	vunpcklps(ymm11, ymm9, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx))            // store rows 0..3 of column 8
	vmovups(xmm2, mem(rcx, rsi, 4))    // store rows 0..3 of column 12
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx))            // store rows 0..3 of column 9
	vmovups(xmm2, mem(rcx, rsi, 4))    // store rows 0..3 of column 13
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vunpckhps(ymm7, ymm5, ymm0)
	vunpckhps(ymm11, ymm9, ymm1)
	vshufps(imm(0x4e), ymm1, ymm0, ymm2)
	vblendps(imm(0xcc), ymm2, ymm0, ymm0)
	vblendps(imm(0x33), ymm2, ymm1, ymm1)

	vextractf128(imm(0x1), ymm0, xmm2)
	vmovups(xmm0, mem(rcx))            // store rows 0..3 of column 10
	vmovups(xmm2, mem(rcx, rsi, 4))    // store rows 0..3 of column 14
	lea(mem(rcx, rsi, 1), rcx)         // rcx += cs_c

	vextractf128(imm(0x1), ymm1, xmm2)
	vmovups(xmm1, mem(rcx))            // store rows 0..3 of column 11
	vmovups(xmm2, mem(rcx, rsi, 4))    // store rows 0..3 of column 15

	// ---- row 4, columns 8..15 (from ymm13)

	vextractf128(imm(0x0), ymm13, xmm0)//e0-e3 of ymm13 (columns 8..11)
	vshufps(imm(0x01), xmm0,xmm0, xmm1)
	vshufps(imm(0x02), xmm0,xmm0, xmm2)
	vshufps(imm(0x03), xmm0,xmm0, xmm14)
	vmovss(xmm0, mem(rdx))
	vmovss(xmm1, mem(rdx, rsi, 1))
	vmovss(xmm2, mem(rdx, rsi, 2))
	vmovss(xmm14, mem(rdx, rax, 1))

	lea(mem(rdx, rsi, 4), rdx)         // rdx += 4*cs_c

	vextractf128(imm(0x1), ymm13, xmm0)//e4-e7 of ymm13 (columns 12..15)
	vshufps(imm(0x01), xmm0,xmm0, xmm1)
	vshufps(imm(0x02), xmm0,xmm0, xmm2)
	vshufps(imm(0x03), xmm0,xmm0, xmm14)
	vmovss(xmm0, mem(rdx))
	vmovss(xmm1, mem(rdx, rsi, 1))
	vmovss(xmm2, mem(rdx, rsi, 2))
	vmovss(xmm14, mem(rdx, rax, 1))

	label(.SDONE)

	// Advance to the next 16-column panel: C by 16*cs_c (two steps of
	// 8*cs_c), B by the panel stride in bytes.

	lea(mem(r12, rsi, 8), r12)         // c_jj = r12 += 8*cs_c
	lea(mem(r12, rsi, 8), r12)         // c_jj = r12 += 8*cs_c

	//add(imm(4*16), r14)              // b_jj = r14 += 16*cs_b (in bytes)
	mov(var(ps_b4), rbx)               // load ps_b4
	lea(mem(r14, rbx, 1), r14)         // b_jj = r14 += ps_b4

	dec(r11)                           // jj -= 1;
	jne(.SLOOP6X16J)                   // iterate again if jj != 0.

	label(.SRETURN)

	end_asm(
	: // output operands (none)
	: // input operands
	  [n_iter] "m" (n_iter),
	  [k_iter] "m" (k_iter),
	  [k_left] "m" (k_left),
	  [a]      "m" (a),
	  [rs_a]   "m" (rs_a),
	  [cs_a]   "m" (cs_a),
	  [b]      "m" (b),
	  [rs_b]   "m" (rs_b),
	  [cs_b]   "m" (cs_b),
	  [ps_b4]  "m" (ps_b4),
	  [alpha]  "m" (alpha),
	  [beta]   "m" (beta),
	  [c]      "m" (c),
	  [rs_c]   "m" (rs_c),
	  [cs_c]   "m" (cs_c)/*,
	  [a_next] "m" (a_next),
	  [b_next] "m" (b_next)*/
	: // register clobber list
	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	  "xmm0", "xmm1", "xmm2", "xmm3",
	  "xmm4", "xmm5", "xmm6", "xmm7",
	  "xmm8", "xmm9", "xmm10", "xmm11",
	  "xmm12", "xmm13", "xmm14", "xmm15",
	  "memory"
	)

	consider_edge_cases:

	// Handle edge cases in the n dimension, if they exist: the n0 % 16
	// leftover columns are consumed by progressively narrower kernels
	// (8, then 4, then 2, then 1 column).
	if ( n_left )
	{
		const dim_t      mr_cur = 5;
		// First column not covered by the 16-wide panels.
		const dim_t      j_edge = n0 - ( dim_t )n_left;

		float*  restrict cij = c + j_edge*cs_c;
		float*  restrict ai  = a;
		// B is advanced by whole panels using the panel stride (in elements).
		float*  restrict bj  = b + n_iter * ps_b;

		if ( 8 <= n_left )
		{
			const dim_t nr_cur = 8;

			bli_sgemmsup_rv_zen_asm_5x8
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}

		if ( 4 <= n_left )
		{
			const dim_t nr_cur = 4;

			bli_sgemmsup_rv_zen_asm_5x4
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}

		if ( 2 <= n_left )
		{
			const dim_t nr_cur = 2;

			bli_sgemmsup_rv_zen_asm_5x2
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
		}

		if ( 1 == n_left )
		{
			// The single remaining column goes to the 5x1 reference
			// kernel; the gemv-based alternative below is compiled out.
#if 1
			const dim_t nr_cur = 1;

			bli_sgemmsup_r_zen_ref_5x1
			(
			  conja, conjb, mr_cur, nr_cur, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
			  beta, cij, rs_c0, cs_c0, data, cntx
			);
#else
			bli_sgemv_ex
			(
			  BLIS_NO_TRANSPOSE, conjb, m0, k0,
			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
			  beta, cij, rs_c0, cntx, NULL
			);
#endif
		}
	}
}

void bli_sgemmsup_rv_zen_asm_4x16n
     (
       conj_t              conja,
       conj_t              conjb,
       dim_t               m0,
       dim_t               n0,
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a, inc_t rs_a0, inc_t cs_a0,
       float*     restrict b, inc_t rs_b0, inc_t cs_b0,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	//void*    a_next = bli_auxinfo_next_a( data );
	//void*    b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t n_iter = n0 /16; uint64_t n_left = n0 % 16; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of B and convert it to units of bytes. uint64_t ps_b = bli_auxinfo_ps_b( data ); uint64_t ps_b4 = ps_b * sizeof( float ); if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //vzeroall() // zero all xmm/ymm registers. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a mov(var(b), r14) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rbx = b // read rax from var(a) near beginning of loop // r11 = m dim index ii mov(var(n_iter), r11) // jj = n_iter; label(.SLOOP4X16J) // LOOP OVER jj = [ n_iter ... 1 0 ] vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) mov(var(a), rax) // load address of a. mov(r14, rbx) // reset rbx to current upanel of b. cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c lea(mem(r12, rdi, 2), rdx) // lea(mem(rdx, rdi, 1), rdx) // rdx = c + 3*rs_c; prefetch(0, mem(r12, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2, 7*8)) // prefetch c + 2*rs_c prefetch(0, mem(rdx, 7*8)) // prefetch c + 3*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(r12, 3*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 3*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 3*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 3*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 3*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 7*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(rbx, 11*8)) // prefetch line of next upanel of b vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) // ---------------------------------- iteration 1 prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) // ---------------------------------- iteration 2 prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) // ---------------------------------- iteration 3 prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) 
vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm11, ymm11) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. 
vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), ymm3, ymm4) vmovups(ymm4, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm5) vmovups(ymm5, mem(rcx, rsi,8)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm6) vmovups(ymm6, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm7) vmovups(ymm7, mem(rcx, rsi,8)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm8) vmovups(ymm8, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm9) vmovups(ymm9, mem(rcx, rsi,8)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm10) vmovups(ymm10, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm11) vmovups(ymm11, mem(rcx, rsi,8)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORED) vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a4b4a5b5 vunpcklps(ymm10, ymm8, ymm1) //c0d0c1d1 c4d4c5d5 vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 
1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm0) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vfmadd231ps(mem(rcx), xmm3, xmm1) vfmadd231ps(mem(rcx, rsi, 4), xmm3, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) vmovups(ymm5, mem(rcx, rsi,8)) add(rdi, rcx) vmovups(ymm6, mem(rcx)) vmovups(ymm7, mem(rcx, rsi,8)) add(rdi, rcx) vmovups(ymm8, mem(rcx)) vmovups(ymm9, mem(rcx, rsi,8)) add(rdi, rcx) vmovups(ymm10, mem(rcx)) vmovups(ymm11, mem(rcx, rsi,8)) //add(rdi, rcx) jmp(.SDONE) // jump to end. label(.SCOLSTORBZ) vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a4b4a5b5 vunpcklps(ymm10, ymm8, ymm1) //c0d0c1d1 c4d4c5d5 vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vunpckhps(ymm6, ymm4, ymm0) vunpckhps(ymm10, ymm8, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c vunpcklps(ymm7, ymm5, ymm0) vunpcklps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma00..gamma30 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma04..gamma34 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) 
vmovups(xmm1, mem(rcx)) // store ( gamma01..gamma31 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma05..gamma35 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vunpckhps(ymm7, ymm5, ymm0) vunpckhps(ymm11, ymm9, ymm1) vshufps(imm(0x4e), ymm1, ymm0, ymm2) vblendps(imm(0xcc), ymm2, ymm0, ymm0) vblendps(imm(0x33), ymm2, ymm1, ymm1) vextractf128(imm(0x1), ymm0, xmm2) vmovups(xmm0, mem(rcx)) // store ( gamma02..gamma32 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma06..gamma36 ) lea(mem(rcx, rsi, 1), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm1, xmm2) vmovups(xmm1, mem(rcx)) // store ( gamma03..gamma33 ) vmovups(xmm2, mem(rcx, rsi, 4)) // store ( gamma07..gamma37 ) label(.SDONE) lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c //add(imm(4*16), r14) // b_jj = r14 += 8*cs_b mov(var(ps_b4), rbx) // load ps_b4 lea(mem(r14, rbx, 1), r14) // a_ii = r14 += ps_b4 dec(r11) // jj -= 1; jne(.SLOOP4X16J) // iterate again if jj != 0. label(.SRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [ps_b4] "m" (ps_b4), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. 
if ( n_left ) { const dim_t mr_cur = 4; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; float* restrict bj = b + n_iter * ps_b; if ( 8 <= n_left ) { const dim_t nr_cur = 8; bli_sgemmsup_rv_zen_asm_4x8 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_sgemmsup_rv_zen_asm_4x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rv_zen_asm_4x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { const dim_t nr_cur = 1; bli_sgemmsup_r_zen_ref_4x1 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } /* bli_sgemmsup_rv_zen_asm_3x16n: single-precision kernel for a 3-row strip of C. The inline assembly loops over n0/16 panels of 16 columns; for each panel it accumulates A*B over k0 (main loop unrolled by 4, then k0 % 4 leftover iterations), scales the result by alpha, and then either adds beta*C or, when beta == 0, stores without reading C. Row-stored and column-stored C use separate store paths, selected by comparing rs_c*sizeof(float) with 4. The n0 % 16 leftover columns are handed to the 3x8, 3x4, 3x2 and 3x1 kernels. m0 is not referenced in this function; conja, conjb and cntx are only forwarded to those kernels; data also supplies the panel stride of B. NOTE(review): the row-stored store paths reach columns 8-15 at rcx + 8*rsi with rsi = cs_c*sizeof(float), which is 8 floats only when cs_c == 1 - presumably guaranteed by the caller, confirm. */ void bli_sgemmsup_rv_zen_asm_3x16n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t n_iter = n0 /16; uint64_t n_left = n0 % 16; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of B and convert it to units of bytes.
uint64_t ps_b = bli_auxinfo_ps_b( data ); uint64_t ps_b4 = ps_b * sizeof( float ); if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) mov(var(b), r14) // load address of b. mov(var(rs_b), r10) // load rs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rbx = b // read rax from var(a) near beginning of loop // r11 = n dim index jj mov(var(n_iter), r11) // jj = n_iter; label(.SLOOP4X16J) // LOOP OVER jj = [ n_iter ... 1 0 ] (label keeps its 4X16 name; this kernel has 3 rows) vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) vxorps(ymm8, ymm8, ymm8) vxorps(ymm9, ymm9, ymm9) vxorps(ymm10, ymm10, ymm10) vxorps(ymm11, ymm11, ymm11) vxorps(ymm12, ymm12, ymm12) vxorps(ymm13, ymm13, ymm13) vxorps(ymm14, ymm14, ymm14) vxorps(ymm15, ymm15, ymm15) mov(var(a), rax) // load address of a. mov(r14, rbx) // reset rbx to current upanel of b. cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c prefetch(0, mem(r12, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1, 7*8)) // prefetch c + 1*rs_c prefetch(0, mem(r12, rdi, 2, 7*8)) // prefetch c + 2*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; /* NOTE(review): in this whitespace-collapsed text it is unclear whether the lea(mem(rdx, rsi, 1), rdx) just above is live code or commented out; the rdx = c + 3*cs_c comment only holds if it executes - confirm against a correctly formatted copy. */ prefetch(0, mem(r12, 2*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 2*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 2*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 2*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 2*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 7*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop.
label(.SLOOPKITER) /* Each iteration loads 16 floats of b into ymm0/ymm1 and broadcasts one element from each of the 3 rows of a: ymm4/ymm5 accumulate row 0, ymm6/ymm7 row 1, ymm8/ymm9 row 2 (columns 0-7 and 8-15 of the panel). */ // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(rbx, 11*8)) // prefetch line of next upanel of b vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) // ---------------------------------- iteration 1 prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) // ---------------------------------- iteration 2 prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) // ---------------------------------- iteration 3 prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) dec(rsi) // i -= 1;
jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4.
jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), ymm3, ymm4) vmovups(ymm4, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm5) vmovups(ymm5, mem(rcx, rsi,8)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm6) vmovups(ymm6, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm7) vmovups(ymm7, mem(rcx, rsi,8)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm8) vmovups(ymm8, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm9) vmovups(ymm9, mem(rcx, rsi,8)) jmp(.SDONE) // jump to end. label(.SCOLSTORED) /* Rows 0-1 are interleaved pairwise and written two floats at a time with vmovsd through rcx; row 2 is written one float at a time with vmovss through rdx. The gamma indices in the store comments are relative to the current rcx, which advances 4 columns per group. */ vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a4b4a5b5 vunpckhps(ymm6, ymm4, ymm2) //a2b2a3b3 a6b6a7b7 vperm2f128(imm(0x01),ymm0,ymm0,ymm11) vperm2f128(imm(0x01),ymm2,ymm2,ymm12) vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a1b1 vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a3b3 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rsi, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm0) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(mem(rcx, rsi, 2),xmm4) vmovsd(mem(rcx, rax, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm10) vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c vshufpd(imm(0x01), xmm11, xmm11, xmm1)//a5b5 vshufpd(imm(0x01), xmm12, xmm12, xmm10)//a7b7 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rsi, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm11) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm11, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(mem(rcx, rsi, 2),xmm4) vmovsd(mem(rcx, rax, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm12) vfmadd231ps(xmm6, xmm3, xmm10) vmovsd(xmm12, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c /********************************************/ vextractf128(imm(0x0), ymm8, xmm0)//c0-c3
vmovss(mem(rdx),xmm4) vmovss(mem(rdx, rsi, 1),xmm6) vmovss(mem(rdx, rsi, 2),xmm11) /* xmm11 rather than xmm8 is the temporary here: the upper half of ymm8 is still needed by the vextractf128 further down */ vmovss(mem(rdx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm4, xmm3, xmm0)//e0 vfmadd231ps(xmm6, xmm3, xmm1)//e1 vfmadd231ps(xmm11, xmm3, xmm2)//e2 vfmadd231ps(xmm10, xmm3, xmm14)//e3 vmovss(xmm0, mem(rdx)) vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c vextractf128(imm(0x1), ymm8, xmm0)//e4-e7 vmovss(mem(rdx),xmm4) vmovss(mem(rdx, rsi, 1),xmm6) vmovss(mem(rdx, rsi, 2),xmm8) vmovss(mem(rdx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm4, xmm3, xmm0)//e4 vfmadd231ps(xmm6, xmm3, xmm1)//e5 vfmadd231ps(xmm8, xmm3, xmm2)//e6 vfmadd231ps(xmm10, xmm3, xmm14)//e7 vmovss(xmm0, mem(rdx)) vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c /*********************************************/ vunpcklps(ymm7, ymm5, ymm0) //a0b0a1b1 a4b4a5b5 vunpckhps(ymm7, ymm5, ymm2) //a2b2a3b3 a6b6a7b7 vperm2f128(imm(0x01),ymm0,ymm0,ymm11) vperm2f128(imm(0x01),ymm2,ymm2,ymm12) vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a1b1 vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a3b3 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rsi, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm0) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(mem(rcx, rsi, 2),xmm4) vmovsd(mem(rcx, rax, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm10) vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c vshufpd(imm(0x01), xmm11, xmm11, xmm1)//a5b5 vshufpd(imm(0x01), xmm12, xmm12, xmm10)//a7b7
vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rsi, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm11) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm11, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(mem(rcx, rsi, 2),xmm4) vmovsd(mem(rcx, rax, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm12) vfmadd231ps(xmm6, xmm3, xmm10) vmovsd(xmm12, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) /********************************************/ vextractf128(imm(0x0), ymm9, xmm0)//c0-c3 vmovss(mem(rdx),xmm4) vmovss(mem(rdx, rsi, 1),xmm6) vmovss(mem(rdx, rsi, 2),xmm8) vmovss(mem(rdx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm4, xmm3, xmm0)//e0 vfmadd231ps(xmm6, xmm3, xmm1)//e1 vfmadd231ps(xmm8, xmm3, xmm2)//e2 vfmadd231ps(xmm10, xmm3, xmm14)//e3 vmovss(xmm0, mem(rdx)) vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c vextractf128(imm(0x1), ymm9, xmm0)//e4-e7 vmovss(mem(rdx),xmm4) vmovss(mem(rdx, rsi, 1),xmm6) vmovss(mem(rdx, rsi, 2),xmm8) vmovss(mem(rdx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm4, xmm3, xmm0)//e4 vfmadd231ps(xmm6, xmm3, xmm1)//e5 vfmadd231ps(xmm8, xmm3, xmm2)//e6 vfmadd231ps(xmm10, xmm3, xmm14)//e7 vmovss(xmm0, mem(rdx)) vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) vmovups(ymm5, mem(rcx, rsi,8)) add(rdi, rcx) vmovups(ymm6, mem(rcx)) vmovups(ymm7, mem(rcx, rsi,8)) add(rdi, rcx) vmovups(ymm8, mem(rcx)) vmovups(ymm9, mem(rcx, rsi,8)) //add(rdi, rcx) jmp(.SDONE) // jump to end.
label(.SCOLSTORBZ) /* Same column-stored layout as the beta != 0 path, but C is only written, never read. */ vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a4b4a5b5 vunpckhps(ymm6, ymm4, ymm2) //a2b2a3b3 a6b6a7b7 vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a1b1 vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a3b3 vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c vperm2f128(imm(0x01),ymm0,ymm0,ymm0) vperm2f128(imm(0x01),ymm2,ymm2,ymm2) vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a5b5 vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a7b7 vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c /********************************************/ vextractf128(imm(0x0), ymm8, xmm0)//c0-c3 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rdx)) vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c vextractf128(imm(0x1), ymm8, xmm0)//c4-c7 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rdx)) vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c /*********************************************/ vunpcklps(ymm7, ymm5, ymm0) //a0b0a1b1 a4b4a5b5 vunpckhps(ymm7, ymm5, ymm2) //a2b2a3b3 a6b6a7b7 vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a1b1 vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a3b3 vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 )
vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c vperm2f128(imm(0x01),ymm0,ymm0,ymm0) vperm2f128(imm(0x01),ymm2,ymm2,ymm2) vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a5b5 vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a7b7 vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) /********************************************/ vextractf128(imm(0x0), ymm9, xmm0)//c0-c3 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rdx)) vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) lea(mem(rdx, rsi, 4), rdx) // rdx += 4*cs_c vextractf128(imm(0x1), ymm9, xmm0)//c4-c7 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rdx)) vmovss(xmm1, mem(rdx, rsi, 1)) vmovss(xmm2, mem(rdx, rsi, 2)) vmovss(xmm14, mem(rdx, rax, 1)) label(.SDONE) lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c lea(mem(r12, rsi, 8), r12) /* second 8*cs_c step: r12 has now advanced 16 columns in total */ //add(imm(4*16), r14) // b_jj = r14 += 8*cs_b mov(var(ps_b4), rbx) // load ps_b4 lea(mem(r14, rbx, 1), r14) // b_jj = r14 += ps_b4 dec(r11) // jj -= 1; jne(.SLOOP4X16J) // iterate again if jj != 0.
label(.SRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [ps_b4] "m" (ps_b4), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the n dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 3; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; float* restrict bj = b + n_iter * ps_b; if ( 8 <= n_left ) { const dim_t nr_cur = 8; bli_sgemmsup_rv_zen_asm_3x8 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_sgemmsup_rv_zen_asm_3x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rv_zen_asm_3x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { const dim_t nr_cur = 1; bli_sgemmsup_r_zen_ref_3x1 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } void bli_sgemmsup_rv_zen_asm_2x16n ( conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict
alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t n_iter = n0 /16; uint64_t n_left = n0 % 16; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of B and convert it to units of bytes. uint64_t ps_b = bli_auxinfo_ps_b( data ); uint64_t ps_b4 = ps_b * sizeof( float ); if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) mov(var(b), r14) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rbx = b // read rax from var(a) near beginning of loop // r11 = m dim index ii mov(var(n_iter), r11) // jj = n_iter; label(.SLOOP2X16J) // LOOP OVER jj = [ n_iter ... 
1 0 ] vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) vxorps(ymm6, ymm6, ymm6) vxorps(ymm7, ymm7, ymm7) mov(var(a), rax) // load address of a. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rbx) // reset rbx to current upanel of b. cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c prefetch(0, mem(r12, 7*8)) // prefetch c + 0*rs_c prefetch(0, mem(r12, rdi, 1, 7*8)) // prefetch c + 1*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(r12, 1*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 1*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 1*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 1*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 7*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(rbx, 11*8)) // prefetch line of next upanel of b vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) // ---------------------------------- iteration 1 prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) // ---------------------------------- iteration 2 prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) // ---------------------------------- iteration 3 prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
label(.SLOOPKLEFT) // EDGE LOOP prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), ymm3, ymm4) vmovups(ymm4, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm5) vmovups(ymm5, mem(rcx, rsi,8)) add(rdi, rcx) vfmadd231ps(mem(rcx), ymm3, ymm6) vmovups(ymm6, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm7) vmovups(ymm7, mem(rcx, rsi,8)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORED) vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a2b2a3b3 vunpckhps(ymm6, ymm4, ymm2) //a2b2a3b3 a6b6a7b7 vperm2f128(imm(0x01),ymm0,ymm0,ymm11) vperm2f128(imm(0x01),ymm2,ymm2,ymm12) vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a1b1 vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a3b3 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rsi, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm0) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(mem(rcx, rsi, 2),xmm4) vmovsd(mem(rcx, rax, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, xmm10) vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) lea(mem(rcx, rsi, 4), rcx) // rcx += cs_c vshufpd(imm(0x01), xmm11, xmm11, xmm1)//a1b1 vshufpd(imm(0x01), xmm12, xmm12, xmm10)//a3b3 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rsi, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm11) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm11, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(mem(rcx, rsi, 2),xmm4) vmovsd(mem(rcx, rax, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm12) vfmadd231ps(xmm6, xmm3, xmm10) vmovsd(xmm12, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c vunpcklps(ymm7, ymm5, ymm0) //a0b0a1b1 a2b2a3b3 vunpckhps(ymm7, ymm5, ymm2) //a2b2a3b3 a6b6a7b7 vperm2f128(imm(0x01),ymm0,ymm0,ymm11) vperm2f128(imm(0x01),ymm2,ymm2,ymm12) vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a1b1 vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a3b3 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rsi, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm0) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(mem(rcx, rsi, 2),xmm4) vmovsd(mem(rcx, rax, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm2) vfmadd231ps(xmm6, xmm3, 
xmm10) vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) lea(mem(rcx, rsi, 4), rcx) // rcx += cs_c vshufpd(imm(0x01), xmm11, xmm11, xmm1)//a1b1 vshufpd(imm(0x01), xmm12, xmm12, xmm10)//a3b3 vmovsd(mem(rcx),xmm4) vmovsd(mem(rcx, rsi, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm11) vfmadd231ps(xmm6, xmm3, xmm1) vmovsd(xmm11, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(mem(rcx, rsi, 2),xmm4) vmovsd(mem(rcx, rax, 1),xmm6) vfmadd231ps(xmm4, xmm3, xmm12) vfmadd231ps(xmm6, xmm3, xmm10) vmovsd(xmm12, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) vmovups(ymm5, mem(rcx, rsi,8)) add(rdi, rcx) vmovups(ymm6, mem(rcx)) vmovups(ymm7, mem(rcx, rsi,8)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) vunpcklps(ymm6, ymm4, ymm0) //a0b0a1b1 a2b2a3b3 vunpckhps(ymm6, ymm4, ymm2) //a2b2a3b3 a6b6a7b7 vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a1b1 vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a3b3 vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) lea(mem(rcx, rsi, 4), rcx) // rcx += cs_c vperm2f128(imm(0x01),ymm0,ymm0,ymm0) vperm2f128(imm(0x01),ymm2,ymm2,ymm2) vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a2b2 vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a3b3 vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c vunpcklps(ymm7, ymm5, ymm0) //a0b0a1b1 a2b2a3b3 vunpckhps(ymm7, ymm5, ymm2) //a2b2a3b3 a6b6a7b7 vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a1b1 vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a3b3 vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) lea(mem(rcx, rsi, 4), rcx) // rcx += cs_c vperm2f128(imm(0x01),ymm0,ymm0,ymm0) vperm2f128(imm(0x01),ymm2,ymm2,ymm2) vshufpd(imm(0x01), xmm0, xmm0, xmm1)//a2b2 vshufpd(imm(0x01), xmm2, xmm2, xmm10)//a3b3 vmovsd(xmm0, mem(rcx)) // store ( gamma00..gamma10 ) vmovsd(xmm1, mem(rcx, rsi, 1)) // store ( gamma01..gamma11 ) vmovsd(xmm2, mem(rcx, rsi, 2)) // store ( gamma02..gamma12 ) vmovsd(xmm10, mem(rcx, rax, 1)) // store ( gamma03..gamma13 ) label(.SDONE) lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c lea(mem(r12, rsi, 8), r12) //add(imm(4*16), r14) // b_jj = r14 += 8*cs_b mov(var(ps_b4), rbx) // load ps_b4 lea(mem(r14, rbx, 1), r14) // a_ii = r14 += 
ps_b4 dec(r11) // jj -= 1; jne(.SLOOP2X16J) // iterate again if jj != 0. label(.SRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [ps_b4] "m" (ps_b4), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 2; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; float* restrict bj = b + n_iter * ps_b; if ( 8 <= n_left ) { const dim_t nr_cur = 8; bli_sgemmsup_rv_zen_asm_2x8 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_sgemmsup_rv_zen_asm_2x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rv_zen_asm_2x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { const dim_t nr_cur = 1; bli_sgemmsup_r_zen_ref_2x1 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } void bli_sgemmsup_rv_zen_asm_1x16n ( 
conj_t conja, conj_t conjb, dim_t m0, dim_t n0, dim_t k0, float* restrict alpha, float* restrict a, inc_t rs_a0, inc_t cs_a0, float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); //void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t n_iter = n0 /16; uint64_t n_left = n0 % 16; uint64_t rs_a = rs_a0; uint64_t cs_a = cs_a0; uint64_t rs_b = rs_b0; uint64_t cs_b = cs_b0; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; // Query the panel stride of B and convert it to units of bytes. uint64_t ps_b = bli_auxinfo_ps_b( data ); uint64_t ps_b4 = ps_b * sizeof( float ); if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- begin_asm() //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) mov(var(b), r14) // load address of b. mov(var(rs_b), r10) // load rs_b //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result // in reading beyond the bounds allocated mem // (the likely result: a segmentation fault). 
mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) // During preamble and loops: // r12 = rcx = c // r14 = rbx = b // read rax from var(a) near beginning of loop // r11 = m dim index ii mov(var(n_iter), r11) // jj = n_iter; label(.SLOOP1X16J) // LOOP OVER jj = [ n_iter ... 1 0 ] vxorps(ymm4, ymm4, ymm4) vxorps(ymm5, ymm5, ymm5) mov(var(a), rax) // load address of a. //mov(r12, rcx) // reset rcx to current utile of c. mov(r14, rbx) // reset rbx to current upanel of b. cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLPFETCH) // jump to column storage case label(.SROWPFETCH) // row-stored prefetching on c prefetch(0, mem(r12, 7*8)) // prefetch c + 0*rs_c jmp(.SPOSTPFETCH) // jump to end of prefetching c label(.SCOLPFETCH) // column-stored prefetching c mov(var(cs_c), rsi) // load cs_c to rsi (temporarily) lea(mem(, rsi, 4), rsi) // cs_c *= sizeof(float) lea(mem(r12, rsi, 2), rdx) // lea(mem(rdx, rsi, 1), rdx) // rdx = c + 3*cs_c; prefetch(0, mem(r12, 0*8)) // prefetch c + 0*cs_c prefetch(0, mem(r12, rsi, 1, 0*8)) // prefetch c + 1*cs_c prefetch(0, mem(r12, rsi, 2, 0*8)) // prefetch c + 2*cs_c prefetch(0, mem(rdx, 1*8)) // prefetch c + 3*cs_c prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 4*cs_c prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 5*cs_c lea(mem(rdx, rsi, 2), rdx) // rdx = c + 5*cs_c; prefetch(0, mem(rdx, rsi, 1, 0*8)) // prefetch c + 6*cs_c prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 7*cs_c label(.SPOSTPFETCH) // done prefetching c mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
label(.SLOOPKITER) // MAIN LOOP // ---------------------------------- iteration 0 prefetch(0, mem(rbx, 11*8)) // prefetch line of next upanel of b vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) // ---------------------------------- iteration 1 prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) // ---------------------------------- iteration 2 prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) // ---------------------------------- iteration 3 prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. label(.SCONSIDKLEFT) mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. label(.SLOOPKLEFT) // EDGE LOOP prefetch(0, mem(rbx, 11*8)) vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. label(.SPOSTACCUM) mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; // now avoid loading C if beta == 0 vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case label(.SROWSTORED) vfmadd231ps(mem(rcx), ymm3, ymm4) vmovups(ymm4, mem(rcx)) vfmadd231ps(mem(rcx, rsi,8), ymm3, ymm5) vmovups(ymm5, mem(rcx, rsi,8)) jmp(.SDONE) // jump to end. label(.SCOLSTORED) vextractf128(imm(0x0), ymm4, xmm0)//c0-c3 vmovss(mem(rcx),xmm7) vmovss(mem(rcx, rsi, 1),xmm6) vmovss(mem(rcx, rsi, 2),xmm11) vmovss(mem(rcx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm7, xmm3, xmm0)//e0 vfmadd231ps(xmm6, xmm3, xmm1)//e1 vfmadd231ps(xmm11, xmm3, xmm2)//e2 vfmadd231ps(xmm10, xmm3, xmm14)//e3 vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm4, xmm0)//e4-e7 vmovss(mem(rcx),xmm4) vmovss(mem(rcx, rsi, 1),xmm6) vmovss(mem(rcx, rsi, 2),xmm8) vmovss(mem(rcx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm4, xmm3, xmm0)//e4 vfmadd231ps(xmm6, xmm3, xmm1)//e5 vfmadd231ps(xmm8, xmm3, xmm2)//e6 vfmadd231ps(xmm10, xmm3, xmm14)//e7 vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c 
vextractf128(imm(0x0), ymm5, xmm0)//c0-c3 vmovss(mem(rcx),xmm4) vmovss(mem(rcx, rsi, 1),xmm6) vmovss(mem(rcx, rsi, 2),xmm11) vmovss(mem(rcx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm4, xmm3, xmm0)//e0 vfmadd231ps(xmm6, xmm3, xmm1)//e1 vfmadd231ps(xmm11, xmm3, xmm2)//e2 vfmadd231ps(xmm10, xmm3, xmm14)//e3 vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // rcx += 4*cs_c vextractf128(imm(0x1), ymm5, xmm0)//e4-e7 vmovss(mem(rcx),xmm4) vmovss(mem(rcx, rsi, 1),xmm6) vmovss(mem(rcx, rsi, 2),xmm8) vmovss(mem(rcx, rax, 1),xmm10) vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vfmadd231ps(xmm4, xmm3, xmm0)//e4 vfmadd231ps(xmm6, xmm3, xmm1)//e5 vfmadd231ps(xmm8, xmm3, xmm2)//e6 vfmadd231ps(xmm10, xmm3, xmm14)//e7 vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) jmp(.SDONE) // jump to end. label(.SBETAZERO) cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case label(.SROWSTORBZ) vmovups(ymm4, mem(rcx)) vmovups(ymm5, mem(rcx, rsi,8)) jmp(.SDONE) // jump to end. 
label(.SCOLSTORBZ) vextractf128(imm(0x0), ymm4, xmm0)//c0-c3 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm4, xmm0)//e4-e7 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) vextractf128(imm(0x0), ymm5, xmm0)//c0-c3 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) lea(mem(rcx, rsi, 4), rcx) // rcx += cs_c vextractf128(imm(0x1), ymm5, xmm0)//e4-e7 vshufps(imm(0x01), xmm0, xmm0,xmm1) vshufps(imm(0x02), xmm0, xmm0,xmm2) vshufps(imm(0x03), xmm0, xmm0,xmm14) vmovss(xmm0, mem(rcx)) vmovss(xmm1, mem(rcx, rsi, 1)) vmovss(xmm2, mem(rcx, rsi, 2)) vmovss(xmm14, mem(rcx, rax, 1)) label(.SDONE) lea(mem(r12, rsi, 8), r12) // c_jj = r12 += 8*cs_c lea(mem(r12, rsi, 8), r12) //add(imm(4*16), r14) // b_jj = r14 += 8*cs_b mov(var(ps_b4), rbx) // load ps_b4 lea(mem(r14, rbx, 1), r14) // a_ii = r14 += ps_b4 dec(r11) // jj -= 1; jne(.SLOOP1X16J) // iterate again if jj != 0. 
label(.SRETURN) end_asm( : // output operands (none) : // input operands [n_iter] "m" (n_iter), [k_iter] "m" (k_iter), [k_left] "m" (k_left), [a] "m" (a), [rs_a] "m" (rs_a), [cs_a] "m" (cs_a), [b] "m" (b), [rs_b] "m" (rs_b), [cs_b] "m" (cs_b), [ps_b4] "m" (ps_b4), [alpha] "m" (alpha), [beta] "m" (beta), [c] "m" (c), [rs_c] "m" (rs_c), [cs_c] "m" (cs_c)/*, [a_next] "m" (a_next), [b_next] "m" (b_next)*/ : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ) consider_edge_cases: // Handle edge cases in the m dimension, if they exist. if ( n_left ) { const dim_t mr_cur = 1; const dim_t j_edge = n0 - ( dim_t )n_left; float* restrict cij = c + j_edge*cs_c; float* restrict ai = a; float* restrict bj = b + n_iter * ps_b; if ( 8 <= n_left ) { const dim_t nr_cur = 8; bli_sgemmsup_rv_zen_asm_1x8 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 4 <= n_left ) { const dim_t nr_cur = 4; bli_sgemmsup_rv_zen_asm_1x4 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 2 <= n_left ) { const dim_t nr_cur = 2; bli_sgemmsup_rv_zen_asm_1x2 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; } if ( 1 == n_left ) { const dim_t nr_cur = 1; bli_sgemmsup_r_zen_ref_1x1 ( conja, conjb, mr_cur, nr_cur, k0, alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0, beta, cij, rs_c0, cs_c0, data, cntx ); } } } 
cython-blis-0.9.1/blis/_src/kernels/zen/bli_kernels_zen.h000066400000000000000000000211061427272030600234130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ // -- level-1m -- PACKM_KER_PROT(double, d, packm_8xk_gen_zen) PACKM_KER_PROT(double, d, packm_6xk_gen_zen) PACKM_KER_PROT(double, d, packm_8xk_nn_zen) PACKM_KER_PROT(double, d, packm_6xk_nn_zen) // -- level-1v -- // amaxv (intrinsics) AMAXV_KER_PROT( float, s, amaxv_zen_int ) AMAXV_KER_PROT( double, d, amaxv_zen_int ) // axpyv (intrinsics) AXPYV_KER_PROT( float, s, axpyv_zen_int ) AXPYV_KER_PROT( double, d, axpyv_zen_int ) // axpyv (intrinsics unrolled x10) AXPYV_KER_PROT( float, s, axpyv_zen_int10 ) AXPYV_KER_PROT( double, d, axpyv_zen_int10 ) // dotv (intrinsics) DOTV_KER_PROT( float, s, dotv_zen_int ) DOTV_KER_PROT( double, d, dotv_zen_int ) // dotv (intrinsics, unrolled x10) DOTV_KER_PROT( float, s, dotv_zen_int10 ) DOTV_KER_PROT( double, d, dotv_zen_int10 ) // dotxv (intrinsics) DOTXV_KER_PROT( float, s, dotxv_zen_int ) DOTXV_KER_PROT( double, d, dotxv_zen_int ) // scalv (intrinsics) SCALV_KER_PROT( float, s, scalv_zen_int ) SCALV_KER_PROT( double, d, scalv_zen_int ) // scalv (intrinsics unrolled x10) SCALV_KER_PROT( float, s, scalv_zen_int10 ) SCALV_KER_PROT( double, d, scalv_zen_int10 ) SCALV_KER_PROT( scomplex, c, scalv_zen_int10 ) // swapv (intrinsics) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // copyv (intrinsics) COPYV_KER_PROT( float, s, copyv_zen_int ) COPYV_KER_PROT( double, d, copyv_zen_int ) // SETV_KER_PROT(float, s, setv_zen_int) SETV_KER_PROT(double, d, setv_zen_int) // swapv (intrinsics) SWAPV_KER_PROT(float, s, swapv_zen_int8 ) SWAPV_KER_PROT(double, d, swapv_zen_int8 ) // -- level-1f -- // axpyf (intrinsics) AXPYF_KER_PROT( float, s, axpyf_zen_int_8 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_8 ) AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_16x4 ) AXPYF_KER_PROT( scomplex, c, axpyf_zen_int_4 ) // dotxf (intrinsics) DOTXF_KER_PROT( float, s, dotxf_zen_int_8 ) DOTXF_KER_PROT( double, d, dotxf_zen_int_8 ) // -- 
level-3 sup -------------------------------------------------------------- // semmsup_rv //GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_6x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_5x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_4x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_3x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_2x1 ) GEMMSUP_KER_PROT( float, s, gemmsup_r_zen_ref_1x1 ) // gemmsup_rv (mkernel in m dim) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x16m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x8m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x4m ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_6x2m ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( float, s, 
gemmsup_rv_zen_asm_6x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_5x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_4x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_3x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_2x16n ) GEMMSUP_KER_PROT( float, s, gemmsup_rv_zen_asm_1x16n ) // gemmsup_rd GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x8) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x4) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x2) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x8m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x4m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x2m) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_6x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_3x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_2x16n) GEMMSUP_KER_PROT( float, s, gemmsup_rd_zen_asm_1x16n) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2m ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4 ) 
GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x2 ) // gemmsup_rv (mkernel in n dim) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_2x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_1x8n ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x4 ) GEMMSUP_KER_PROT( scomplex, c, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_2x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_1x4n ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x2 ) GEMMSUP_KER_PROT( dcomplex, z, gemmsup_rv_zen_asm_3x1 ) cython-blis-0.9.1/blis/_src/kernels/zen2/000077500000000000000000000000001427272030600201575ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/kernels/zen2/bli_kernels_zen2.h000066400000000000000000000034571427272030600235700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ // -- level-1f -- AXPYF_KER_PROT( float, s, axpyf_zen_int_5 ) AXPYF_KER_PROT( double, d, axpyf_zen_int_5 ) cython-blis-0.9.1/blis/_src/make/000077500000000000000000000000001427272030600165535ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/make/darwin-firestorm.jsonl000066400000000000000000004123771427272030600231340ustar00rootroot00000000000000{"environment": {"SHELL": "/bin/zsh", "PYENV_HOOK_PATH": "/Users/daniel/.pyenv/pyenv.d:/opt/homebrew/Cellar/pyenv/2.2.5/pyenv.d:/opt/homebrew/etc/pyenv.d:/etc/pyenv.d:/usr/lib/pyenv/hooks", "PYENV_SHELL": "zsh", "XPC_FLAGS": "0x0", "TERM_PROGRAM_VERSION": "444", "HISTSIZE": "10000", "__CFBundleIdentifier": "com.apple.Terminal", "SSH_AUTH_SOCK": "/private/tmp/com.apple.launchd.bOd18fGMG1/Listeners", "TERM_SESSION_ID": "F4088A9F-96AC-4F7F-84E3-A64C72E33F96", "HOMEBREW_PREFIX": "/opt/homebrew", "PYENV_VERSION": "blis-0.9.0", "PWD": "/Users/daniel/projects/cython-blis/flame-blis", "LOGNAME": "daniel", "MANPATH": "/opt/homebrew/share/man::", "HOME": "/Users/daniel", "HISTFILE": "/Users/daniel/.zhistory", "KEYTIMEOUT": "1", "TMPDIR": "/var/folders/r6/0qdg7k1j2kg053mxlygwz7s40000gn/T/", "SAVEHIST": "10000", "PYENV_DIR": "/Users/daniel/projects/cython-blis/flame-blis", "INFOPATH": 
"/opt/homebrew/share/info:", "TERM": "xterm-256color", "USER": "daniel", "HOMEBREW_CELLAR": "/opt/homebrew/Cellar", "SHLVL": "1", "HOMEBREW_REPOSITORY": "/opt/homebrew", "XPC_SERVICE_NAME": "0", "LC_CTYPE": "UTF-8", "PYENV_ROOT": "/Users/daniel/.pyenv", "PATH": "/Users/daniel/.pyenv/versions/blis-0.9.0/bin:/opt/homebrew/Cellar/pyenv/2.2.5/libexec:/opt/homebrew/Cellar/pyenv/2.2.5/plugins/python-build/bin:/opt/homebrew/opt/ccache/libexec:/Users/daniel/.pyenv/shims:/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:/Applications/Little Snitch.app/Contents/Components:/Library/Apple/usr/bin:/Users/daniel/.cargo/bin", "OLDPWD": "/Users/daniel/projects/cython-blis", "TERM_PROGRAM": "Apple_Terminal", "__CF_USER_TEXT_ENCODING": "0x1F5:0x0:0x0"}} {"compiler": "gcc", "source": "config/firestorm/bli_cntx_init_firestorm.c", "target": "obj/firestorm/config/firestorm/bli_cntx_init_firestorm.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c", "target": "obj/firestorm/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c", "target": "obj/firestorm/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.o", "flags": ["-O2", "-march=armv8-a", "-O3", 
"-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c", "target": "obj/firestorm/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c", "target": "obj/firestorm/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c", "target": "obj/firestorm/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c", "target": "obj/firestorm/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c", "target": "obj/firestorm/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c", "target": "obj/firestorm/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c", "target": "obj/firestorm/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c", "target": "obj/firestorm/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c", "target": "obj/firestorm/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c", "target": "obj/firestorm/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c", "target": "obj/firestorm/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.o", "flags": ["-O2", 
"-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c", "target": "obj/firestorm/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c", "target": "obj/firestorm/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c", "target": "obj/firestorm/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c", "target": "obj/firestorm/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c", "target": "obj/firestorm/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c", "target": "obj/firestorm/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/bli_cntx_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1/bli_addv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1/bli_amaxv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1/bli_axpbyv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1/bli_axpyv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1/bli_copyv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1/bli_dotv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1/bli_dotxv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", 
"-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1/bli_invertv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1/bli_scal2v_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1/bli_scalv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1/bli_setv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1/bli_subv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1/bli_swapv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1/bli_xpbyv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1f/bli_axpy2v_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1f/bli_axpyf_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1f/bli_dotaxpyv_firestorm_ref.o", "flags": ["-O2", 
"-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1f/bli_dotxaxpyf_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1f/bli_dotxf_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1m/bli_packm_cxk_1er_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1m/bli_packm_cxk_bb_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1m/bli_packm_cxk_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/1m/bli_unpackm_cxk_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/3/bli_gemm_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/3/bli_gemmsup_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/3/bli_gemmtrsm_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": 
"obj/firestorm/ref_kernels/firestorm/3/bli_trsm_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/3/bb/bli_gemmbb_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/3/bb/bli_gemmtrsmbb_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/3/bb/bli_trsmbb_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/ind/bli_gemm1m_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/ind/bli_gemmtrsm1m_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/firestorm/ref_kernels/firestorm/ind/bli_trsm1m_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_check.c", "target": "obj/firestorm/frame/0/bli_l0_check.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_fpa.c", "target": "obj/firestorm/frame/0/bli_l0_fpa.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_oapi.c", "target": "obj/firestorm/frame/0/bli_l0_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_tapi.c", "target": "obj/firestorm/frame/0/bli_l0_tapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/copysc/bli_copysc.c", 
"target": "obj/firestorm/frame/0/copysc/bli_copysc.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_check.c", "target": "obj/firestorm/frame/1/bli_l1v_check.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/firestorm/frame/1/bli_l1v_fpa.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi.c", "target": "obj/firestorm/frame/1/bli_l1v_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/firestorm/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/firestorm/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/firestorm/frame/1/bli_l1v_tapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/firestorm/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ex.c", "target": "obj/firestorm/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_check.c", 
"target": "obj/firestorm/frame/1d/bli_l1d_check.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_fpa.c", "target": "obj/firestorm/frame/1d/bli_l1d_fpa.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/firestorm/frame/1d/bli_l1d_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/firestorm/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/firestorm/frame/1d/bli_l1d_oapi_ex.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi.c", "target": "obj/firestorm/frame/1d/bli_l1d_tapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/firestorm/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": "obj/firestorm/frame/1d/bli_l1d_tapi_ex.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_check.c", "target": "obj/firestorm/frame/1f/bli_l1f_check.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_fpa.c", 
"target": "obj/firestorm/frame/1f/bli_l1f_fpa.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi.c", "target": "obj/firestorm/frame/1f/bli_l1f_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/firestorm/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": "obj/firestorm/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi.c", "target": "obj/firestorm/frame/1f/bli_l1f_tapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/firestorm/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/firestorm/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_check.c", "target": "obj/firestorm/frame/1m/bli_l1m_check.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_fpa.c", "target": "obj/firestorm/frame/1m/bli_l1m_fpa.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi.c", 
"target": "obj/firestorm/frame/1m/bli_l1m_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/firestorm/frame/1m/bli_l1m_oapi_ba.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": "obj/firestorm/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/firestorm/frame/1m/bli_l1m_tapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/firestorm/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/firestorm/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_unb_var1.c", "target": "obj/firestorm/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_alloc.c", "target": "obj/firestorm/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/firestorm/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_check.c", "target": "obj/firestorm/frame/1m/packm/bli_packm_check.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cntl.c", "target": "obj/firestorm/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk.c", "target": "obj/firestorm/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/firestorm/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_init.c", "target": "obj/firestorm/frame/1m/packm/bli_packm_init.o", "flags": ["-O2", 
"-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/firestorm/frame/1m/packm/bli_packm_int.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_part.c", "target": "obj/firestorm/frame/1m/packm/bli_packm_part.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_scalar.c", "target": "obj/firestorm/frame/1m/packm/bli_packm_scalar.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/firestorm/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/firestorm/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": "obj/firestorm/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/firestorm/frame/1m/packm/bli_packm_thrinfo.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/firestorm/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/firestorm/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cntl.c", "target": "obj/firestorm/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": "obj/firestorm/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/firestorm/frame/1m/unpackm/bli_unpackm_int.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_check.c", "target": 
"obj/firestorm/frame/2/bli_l2_check.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_fpa.c", "target": "obj/firestorm/frame/2/bli_l2_fpa.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi.c", "target": "obj/firestorm/frame/2/bli_l2_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ba.c", "target": "obj/firestorm/frame/2/bli_l2_oapi_ba.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ex.c", "target": "obj/firestorm/frame/2/bli_l2_oapi_ex.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi.c", "target": "obj/firestorm/frame/2/bli_l2_tapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/firestorm/frame/2/bli_l2_tapi_ba.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ex.c", "target": "obj/firestorm/frame/2/bli_l2_tapi_ex.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/firestorm/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/firestorm/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/firestorm/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": "obj/firestorm/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/firestorm/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var1.c", "target": "obj/firestorm/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var2.c", "target": "obj/firestorm/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/firestorm/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": "obj/firestorm/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/firestorm/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": "obj/firestorm/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/firestorm/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/firestorm/frame/2/hemv/bli_hemv_unf_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/firestorm/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/2/hemv/bli_hemv_unf_var3.c", "target": "obj/firestorm/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/firestorm/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/firestorm/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/firestorm/frame/2/her/bli_her_unb_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var2.c", "target": "obj/firestorm/frame/2/her/bli_her_unb_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/firestorm/frame/2/her/bli_her_var_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/firestorm/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/firestorm/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var3.c", "target": "obj/firestorm/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var4.c", "target": "obj/firestorm/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/firestorm/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var4.c", "target": "obj/firestorm/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_var_oapi.c", "target": "obj/firestorm/frame/2/her2/bli_her2_var_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/firestorm/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var2.c", "target": "obj/firestorm/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/firestorm/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/firestorm/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": "obj/firestorm/frame/2/trmv/bli_trmv_var_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/firestorm/frame/2/trsv/bli_trsv_unb_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var2.c", "target": "obj/firestorm/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/firestorm/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/firestorm/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/firestorm/frame/2/trsv/bli_trsv_var_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_blocksize.c", "target": "obj/firestorm/frame/3/bli_l3_blocksize.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_check.c", "target": "obj/firestorm/frame/3/bli_l3_check.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_cntl.c", "target": "obj/firestorm/frame/3/bli_l3_cntl.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_direct.c", "target": 
"obj/firestorm/frame/3/bli_l3_direct.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ind.c", "target": "obj/firestorm/frame/3/bli_l3_ind.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_int.c", "target": "obj/firestorm/frame/3/bli_l3_int.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi.c", "target": "obj/firestorm/frame/3/bli_l3_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi_ex.c", "target": "obj/firestorm/frame/3/bli_l3_oapi_ex.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_packab.c", "target": "obj/firestorm/frame/3/bli_l3_packab.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_prune.c", "target": "obj/firestorm/frame/3/bli_l3_prune.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_schema.c", "target": "obj/firestorm/frame/3/bli_l3_schema.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup.c", "target": "obj/firestorm/frame/3/bli_l3_sup.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_int.c", "target": "obj/firestorm/frame/3/bli_l3_sup_int.o", 
"flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_a.c", "target": "obj/firestorm/frame/3/bli_l3_sup_packm_a.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_b.c", "target": "obj/firestorm/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_var.c", "target": "obj/firestorm/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/firestorm/frame/3/bli_l3_sup_ref.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var12.c", "target": "obj/firestorm/frame/3/bli_l3_sup_var12.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/firestorm/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi.c", "target": "obj/firestorm/frame/3/bli_l3_tapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/firestorm/frame/3/bli_l3_tapi_ex.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_thrinfo.c", 
"target": "obj/firestorm/frame/3/bli_l3_thrinfo.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_fpa.c", "target": "obj/firestorm/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/firestorm/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_tapi.c", "target": "obj/firestorm/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/firestorm/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/firestorm/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var3.c", "target": "obj/firestorm/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/firestorm/frame/3/gemm/bli_gemm_cntl.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/firestorm/frame/3/gemm/bli_gemm_front.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/firestorm/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/firestorm/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md.c", "target": "obj/firestorm/frame/3/gemm/bli_gemm_md.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": "obj/firestorm/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/firestorm/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O2", 
"-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/firestorm/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": "obj/firestorm/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/firestorm/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/firestorm/frame/3/hemm/bli_hemm_front.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/symm/bli_symm_front.c", "target": "obj/firestorm/frame/3/symm/bli_symm_front.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/firestorm/frame/3/trmm/bli_trmm_front.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": "obj/firestorm/frame/3/trmm/bli_trmm_ll_ker_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/firestorm/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": "obj/firestorm/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/firestorm/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/firestorm/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/firestorm/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": 
"obj/firestorm/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/firestorm/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/firestorm/frame/3/trsm/bli_trsm_blk_var3.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": "obj/firestorm/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_front.c", "target": "obj/firestorm/frame/3/trsm/bli_trsm_front.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": "obj/firestorm/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": "obj/firestorm/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/firestorm/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/firestorm/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/firestorm/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_apool.c", "target": "obj/firestorm/frame/base/bli_apool.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_arch.c", "target": "obj/firestorm/frame/base/bli_arch.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_array.c", "target": "obj/firestorm/frame/base/bli_array.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_blksz.c", "target": "obj/firestorm/frame/base/bli_blksz.o", "flags": ["-O2", 
"-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_check.c", "target": "obj/firestorm/frame/base/bli_check.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_clock.c", "target": "obj/firestorm/frame/base/bli_clock.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntl.c", "target": "obj/firestorm/frame/base/bli_cntl.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntx.c", "target": "obj/firestorm/frame/base/bli_cntx.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_const.c", "target": "obj/firestorm/frame/base/bli_const.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cpuid.c", "target": "obj/firestorm/frame/base/bli_cpuid.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_env.c", "target": "obj/firestorm/frame/base/bli_env.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_error.c", "target": "obj/firestorm/frame/base/bli_error.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_func.c", "target": "obj/firestorm/frame/base/bli_func.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_getopt.c", "target": "obj/firestorm/frame/base/bli_getopt.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_gks.c", "target": "obj/firestorm/frame/base/bli_gks.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_ind.c", "target": "obj/firestorm/frame/base/bli_ind.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_info.c", "target": "obj/firestorm/frame/base/bli_info.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_init.c", "target": "obj/firestorm/frame/base/bli_init.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_machval.c", "target": "obj/firestorm/frame/base/bli_machval.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_malloc.c", "target": "obj/firestorm/frame/base/bli_malloc.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_mbool.c", "target": "obj/firestorm/frame/base/bli_mbool.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_memsys.c", "target": "obj/firestorm/frame/base/bli_memsys.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj.c", "target": "obj/firestorm/frame/base/bli_obj.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj_scalar.c", "target": "obj/firestorm/frame/base/bli_obj_scalar.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pack.c", "target": "obj/firestorm/frame/base/bli_pack.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_param_map.c", "target": "obj/firestorm/frame/base/bli_param_map.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_part.c", "target": "obj/firestorm/frame/base/bli_part.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pba.c", "target": "obj/firestorm/frame/base/bli_pba.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pool.c", "target": "obj/firestorm/frame/base/bli_pool.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_prune.c", "target": "obj/firestorm/frame/base/bli_prune.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_query.c", "target": "obj/firestorm/frame/base/bli_query.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_rntm.c", "target": "obj/firestorm/frame/base/bli_rntm.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_sba.c", "target": "obj/firestorm/frame/base/bli_sba.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijm.c", "target": "obj/firestorm/frame/base/bli_setgetijm.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijv.c", "target": "obj/firestorm/frame/base/bli_setgetijv.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/base/bli_setri.c", "target": "obj/firestorm/frame/base/bli_setri.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_string.c", "target": "obj/firestorm/frame/base/bli_string.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_winsys.c", "target": "obj/firestorm/frame/base/bli_winsys.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castm.c", "target": "obj/firestorm/frame/base/cast/bli_castm.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/firestorm/frame/base/cast/bli_castnzm.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castv.c", "target": "obj/firestorm/frame/base/cast/bli_castv.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_obj_check.c", "target": "obj/firestorm/frame/base/check/bli_obj_check.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_part_check.c", "target": "obj/firestorm/frame/base/check/bli_part_check.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_dlamch.c", "target": "obj/firestorm/frame/base/noopt/bli_dlamch.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_lsame.c", "target": "obj/firestorm/frame/base/noopt/bli_lsame.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_slamch.c", "target": "obj/firestorm/frame/base/noopt/bli_slamch.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projm.c", "target": "obj/firestorm/frame/base/proj/bli_projm.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projv.c", "target": "obj/firestorm/frame/base/proj/bli_projv.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/firestorm/frame/thread/bli_l3_decor_openmp.o", "flags": 
["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": "obj/firestorm/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_single.c", "target": "obj/firestorm/frame/thread/bli_l3_decor_single.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": "obj/firestorm/frame/thread/bli_l3_sup_decor_openmp.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_pthreads.c", "target": "obj/firestorm/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/firestorm/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_pthread.c", "target": "obj/firestorm/frame/thread/bli_pthread.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm.c", "target": "obj/firestorm/frame/thread/bli_thrcomm.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/firestorm/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/firestorm/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/firestorm/frame/thread/bli_thrcomm_single.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thread.c", "target": "obj/firestorm/frame/thread/bli_thread.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo.c", "target": "obj/firestorm/frame/thread/bli_thrinfo.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo_sup.c", "target": "obj/firestorm/frame/thread/bli_thrinfo_sup.o", 
"flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_check.c", "target": "obj/firestorm/frame/util/bli_util_check.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_fpa.c", "target": "obj/firestorm/frame/util/bli_util_fpa.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi.c", "target": "obj/firestorm/frame/util/bli_util_oapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ba.c", "target": "obj/firestorm/frame/util/bli_util_oapi_ba.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/firestorm/frame/util/bli_util_oapi_ex.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi.c", "target": "obj/firestorm/frame/util/bli_util_tapi.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ba.c", "target": "obj/firestorm/frame/util/bli_util_tapi_ba.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ex.c", "target": "obj/firestorm/frame/util/bli_util_tapi_ex.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_unb_var1.c", 
"target": "obj/firestorm/frame/util/bli_util_unb_var1.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-firestorm", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/make/darwin-generic.jsonl000066400000000000000000003372211427272030600225300ustar00rootroot00000000000000{"environment": {"SHELL": "/bin/zsh", "PYENV_HOOK_PATH": "/Users/daniel/.pyenv/pyenv.d:/opt/homebrew/Cellar/pyenv/2.2.5/pyenv.d:/opt/homebrew/etc/pyenv.d:/etc/pyenv.d:/usr/lib/pyenv/hooks", "PYENV_SHELL": "zsh", "XPC_FLAGS": "0x0", "TERM_PROGRAM_VERSION": "444", "HISTSIZE": "10000", "__CFBundleIdentifier": "com.apple.Terminal", "SSH_AUTH_SOCK": "/private/tmp/com.apple.launchd.bOd18fGMG1/Listeners", "TERM_SESSION_ID": "5A228860-D94B-4F77-ABC7-1A33028815F1", "HOMEBREW_PREFIX": "/opt/homebrew", "PYENV_VERSION": "blis-0.9.0", "PWD": "/Users/daniel/projects/blis-0.9.0/cython-blis/flame-blis", "LOGNAME": "daniel", "MANPATH": "/opt/homebrew/share/man::", "HOME": "/Users/daniel", "HISTFILE": "/Users/daniel/.zhistory", "KEYTIMEOUT": "1", "TMPDIR": "/var/folders/r6/0qdg7k1j2kg053mxlygwz7s40000gn/T/", "SAVEHIST": "10000", "PYENV_DIR": "/Users/daniel/projects/blis-0.9.0/cython-blis/flame-blis", "INFOPATH": "/opt/homebrew/share/info:", "TERM": "xterm-256color", "USER": "daniel", "HOMEBREW_CELLAR": "/opt/homebrew/Cellar", "SHLVL": "1", "HOMEBREW_REPOSITORY": "/opt/homebrew", "XPC_SERVICE_NAME": "0", "LC_CTYPE": "UTF-8", "PYENV_ROOT": "/Users/daniel/.pyenv", "PATH": 
"/Users/daniel/.pyenv/versions/blis-0.9.0/bin:/opt/homebrew/Cellar/pyenv/2.2.5/libexec:/opt/homebrew/Cellar/pyenv/2.2.5/plugins/python-build/bin:/opt/homebrew/opt/ccache/libexec:/Users/daniel/.pyenv/shims:/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:/Applications/Little Snitch.app/Contents/Components:/Library/Apple/usr/bin:/Users/daniel/.cargo/bin", "OLDPWD": "/Users/daniel/projects/blis-0.9.0/cython-blis", "TERM_PROGRAM": "Apple_Terminal", "__CF_USER_TEXT_ENCODING": "0x1F5:0x0:0x0"}} {"compiler": "gcc", "source": "config/generic/bli_cntx_init_generic.c", "target": "obj/generic/config/generic/bli_cntx_init_generic.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/generic/ref_kernels/generic/bli_cntx_generic_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_addv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": 
"obj/generic/ref_kernels/generic/1/bli_amaxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_axpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_axpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_copyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_dotv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_dotxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_invertv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_scal2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_scalv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_setv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_subv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_swapv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_xpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/generic/ref_kernels/generic/1f/bli_axpy2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/generic/ref_kernels/generic/1f/bli_axpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/generic/ref_kernels/generic/1f/bli_dotaxpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/generic/ref_kernels/generic/1f/bli_dotxaxpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/generic/ref_kernels/generic/1f/bli_dotxf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/generic/ref_kernels/generic/1m/bli_packm_cxk_1er_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/generic/ref_kernels/generic/1m/bli_packm_cxk_bb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/generic/ref_kernels/generic/1m/bli_packm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/generic/ref_kernels/generic/1m/bli_unpackm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/generic/ref_kernels/generic/3/bli_gemm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": 
"obj/generic/ref_kernels/generic/3/bli_gemmsup_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/generic/ref_kernels/generic/3/bli_gemmtrsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/generic/ref_kernels/generic/3/bli_trsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/generic/ref_kernels/generic/3/bb/bli_gemmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/generic/ref_kernels/generic/3/bb/bli_gemmtrsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/generic/ref_kernels/generic/3/bb/bli_trsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/generic/ref_kernels/generic/ind/bli_gemm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/generic/ref_kernels/generic/ind/bli_gemmtrsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/generic/ref_kernels/generic/ind/bli_trsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_check.c", "target": "obj/generic/frame/0/bli_l0_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_fpa.c", "target": "obj/generic/frame/0/bli_l0_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_oapi.c", "target": "obj/generic/frame/0/bli_l0_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_tapi.c", "target": "obj/generic/frame/0/bli_l0_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/copysc/bli_copysc.c", "target": "obj/generic/frame/0/copysc/bli_copysc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_check.c", "target": "obj/generic/frame/1/bli_l1v_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/generic/frame/1/bli_l1v_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi.c", "target": "obj/generic/frame/1/bli_l1v_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/generic/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/generic/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/generic/frame/1/bli_l1v_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/generic/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ex.c", "target": "obj/generic/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_check.c", "target": "obj/generic/frame/1d/bli_l1d_check.o", "flags": 
["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_fpa.c", "target": "obj/generic/frame/1d/bli_l1d_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/generic/frame/1d/bli_l1d_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/generic/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/generic/frame/1d/bli_l1d_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi.c", "target": 
"obj/generic/frame/1d/bli_l1d_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/generic/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": "obj/generic/frame/1d/bli_l1d_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_check.c", "target": "obj/generic/frame/1f/bli_l1f_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_fpa.c", "target": "obj/generic/frame/1f/bli_l1f_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/1f/bli_l1f_oapi.c", "target": "obj/generic/frame/1f/bli_l1f_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/generic/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": "obj/generic/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi.c", "target": "obj/generic/frame/1f/bli_l1f_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/generic/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/generic/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_check.c", "target": "obj/generic/frame/1m/bli_l1m_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_fpa.c", "target": "obj/generic/frame/1m/bli_l1m_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi.c", "target": "obj/generic/frame/1m/bli_l1m_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/generic/frame/1m/bli_l1m_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": "obj/generic/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/generic/frame/1m/bli_l1m_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/generic/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/generic/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_unb_var1.c", "target": "obj/generic/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_alloc.c", "target": "obj/generic/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/generic/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_check.c", "target": "obj/generic/frame/1m/packm/bli_packm_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cntl.c", "target": "obj/generic/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk.c", "target": "obj/generic/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/generic/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_init.c", "target": "obj/generic/frame/1m/packm/bli_packm_init.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/generic/frame/1m/packm/bli_packm_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_part.c", "target": "obj/generic/frame/1m/packm/bli_packm_part.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_scalar.c", "target": "obj/generic/frame/1m/packm/bli_packm_scalar.o", 
"flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/generic/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/generic/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": "obj/generic/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/generic/frame/1m/packm/bli_packm_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/generic/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/generic/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cntl.c", "target": "obj/generic/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": "obj/generic/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/generic/frame/1m/unpackm/bli_unpackm_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_check.c", "target": "obj/generic/frame/2/bli_l2_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_fpa.c", "target": "obj/generic/frame/2/bli_l2_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi.c", "target": "obj/generic/frame/2/bli_l2_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ba.c", "target": "obj/generic/frame/2/bli_l2_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ex.c", "target": "obj/generic/frame/2/bli_l2_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi.c", "target": "obj/generic/frame/2/bli_l2_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/generic/frame/2/bli_l2_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ex.c", "target": "obj/generic/frame/2/bli_l2_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/generic/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/generic/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/generic/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": "obj/generic/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/generic/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var1.c", "target": "obj/generic/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var2.c", 
"target": "obj/generic/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/generic/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/generic/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/generic/frame/2/her/bli_her_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var2.c", "target": "obj/generic/frame/2/her/bli_her_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/generic/frame/2/her/bli_her_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/generic/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/generic/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var3.c", "target": "obj/generic/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var4.c", "target": "obj/generic/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/generic/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/2/her2/bli_her2_unf_var4.c", "target": "obj/generic/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_var_oapi.c", "target": "obj/generic/frame/2/her2/bli_her2_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/generic/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var2.c", "target": "obj/generic/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/generic/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/generic/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": "obj/generic/frame/2/trmv/bli_trmv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/generic/frame/2/trsv/bli_trsv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var2.c", "target": "obj/generic/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/generic/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/generic/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/generic/frame/2/trsv/bli_trsv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_blocksize.c", "target": "obj/generic/frame/3/bli_l3_blocksize.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_check.c", "target": "obj/generic/frame/3/bli_l3_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_cntl.c", "target": "obj/generic/frame/3/bli_l3_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_direct.c", "target": "obj/generic/frame/3/bli_l3_direct.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ind.c", "target": "obj/generic/frame/3/bli_l3_ind.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_int.c", "target": "obj/generic/frame/3/bli_l3_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi.c", "target": "obj/generic/frame/3/bli_l3_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi_ex.c", "target": "obj/generic/frame/3/bli_l3_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_packab.c", "target": "obj/generic/frame/3/bli_l3_packab.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_prune.c", "target": "obj/generic/frame/3/bli_l3_prune.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_schema.c", "target": "obj/generic/frame/3/bli_l3_schema.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup.c", "target": "obj/generic/frame/3/bli_l3_sup.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_int.c", "target": "obj/generic/frame/3/bli_l3_sup_int.o", "flags": ["-O2", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_a.c", "target": "obj/generic/frame/3/bli_l3_sup_packm_a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_b.c", "target": "obj/generic/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_var.c", "target": "obj/generic/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/generic/frame/3/bli_l3_sup_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var12.c", "target": 
"obj/generic/frame/3/bli_l3_sup_var12.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/generic/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi.c", "target": "obj/generic/frame/3/bli_l3_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/generic/frame/3/bli_l3_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_thrinfo.c", "target": "obj/generic/frame/3/bli_l3_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/3/bli_l3_ukr_fpa.c", "target": "obj/generic/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/generic/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_tapi.c", "target": "obj/generic/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/generic/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/generic/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var3.c", "target": "obj/generic/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/generic/frame/3/gemm/bli_gemm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/generic/frame/3/gemm/bli_gemm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/generic/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/generic/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md.c", "target": "obj/generic/frame/3/gemm/bli_gemm_md.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": "obj/generic/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/generic/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/generic/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": "obj/generic/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/generic/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/generic/frame/3/hemm/bli_hemm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/symm/bli_symm_front.c", "target": "obj/generic/frame/3/symm/bli_symm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/generic/frame/3/trmm/bli_trmm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": 
"obj/generic/frame/3/trmm/bli_trmm_ll_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/generic/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": "obj/generic/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/generic/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/generic/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/generic/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": "obj/generic/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/generic/frame/3/trsm/bli_trsm_blk_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": "obj/generic/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_front.c", "target": "obj/generic/frame/3/trsm/bli_trsm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O2", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_apool.c", "target": "obj/generic/frame/base/bli_apool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_arch.c", "target": "obj/generic/frame/base/bli_arch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_array.c", "target": "obj/generic/frame/base/bli_array.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_blksz.c", "target": 
"obj/generic/frame/base/bli_blksz.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_check.c", "target": "obj/generic/frame/base/bli_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_clock.c", "target": "obj/generic/frame/base/bli_clock.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntl.c", "target": "obj/generic/frame/base/bli_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntx.c", "target": "obj/generic/frame/base/bli_cntx.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_const.c", "target": 
"obj/generic/frame/base/bli_const.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cpuid.c", "target": "obj/generic/frame/base/bli_cpuid.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_env.c", "target": "obj/generic/frame/base/bli_env.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_error.c", "target": "obj/generic/frame/base/bli_error.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_func.c", "target": "obj/generic/frame/base/bli_func.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_getopt.c", "target": 
"obj/generic/frame/base/bli_getopt.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_gks.c", "target": "obj/generic/frame/base/bli_gks.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_ind.c", "target": "obj/generic/frame/base/bli_ind.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_info.c", "target": "obj/generic/frame/base/bli_info.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_init.c", "target": "obj/generic/frame/base/bli_init.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_machval.c", "target": 
"obj/generic/frame/base/bli_machval.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_malloc.c", "target": "obj/generic/frame/base/bli_malloc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_mbool.c", "target": "obj/generic/frame/base/bli_mbool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_memsys.c", "target": "obj/generic/frame/base/bli_memsys.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj.c", "target": "obj/generic/frame/base/bli_obj.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj_scalar.c", 
"target": "obj/generic/frame/base/bli_obj_scalar.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pack.c", "target": "obj/generic/frame/base/bli_pack.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_param_map.c", "target": "obj/generic/frame/base/bli_param_map.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_part.c", "target": "obj/generic/frame/base/bli_part.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pba.c", "target": "obj/generic/frame/base/bli_pba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/base/bli_pool.c", "target": "obj/generic/frame/base/bli_pool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_prune.c", "target": "obj/generic/frame/base/bli_prune.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_query.c", "target": "obj/generic/frame/base/bli_query.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_rntm.c", "target": "obj/generic/frame/base/bli_rntm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_sba.c", "target": "obj/generic/frame/base/bli_sba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/base/bli_setgetijm.c", "target": "obj/generic/frame/base/bli_setgetijm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijv.c", "target": "obj/generic/frame/base/bli_setgetijv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setri.c", "target": "obj/generic/frame/base/bli_setri.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_string.c", "target": "obj/generic/frame/base/bli_string.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_winsys.c", "target": "obj/generic/frame/base/bli_winsys.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/base/cast/bli_castm.c", "target": "obj/generic/frame/base/cast/bli_castm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/generic/frame/base/cast/bli_castnzm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castv.c", "target": "obj/generic/frame/base/cast/bli_castv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_obj_check.c", "target": "obj/generic/frame/base/check/bli_obj_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_part_check.c", "target": "obj/generic/frame/base/check/bli_part_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_dlamch.c", "target": "obj/generic/frame/base/noopt/bli_dlamch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_lsame.c", "target": "obj/generic/frame/base/noopt/bli_lsame.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_slamch.c", "target": "obj/generic/frame/base/noopt/bli_slamch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projm.c", "target": "obj/generic/frame/base/proj/bli_projm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projv.c", "target": "obj/generic/frame/base/proj/bli_projv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/generic/frame/thread/bli_l3_decor_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": "obj/generic/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_single.c", "target": "obj/generic/frame/thread/bli_l3_decor_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": "obj/generic/frame/thread/bli_l3_sup_decor_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_pthreads.c", "target": 
"obj/generic/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/generic/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_pthread.c", "target": "obj/generic/frame/thread/bli_pthread.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm.c", "target": "obj/generic/frame/thread/bli_thrcomm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/generic/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/generic/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/generic/frame/thread/bli_thrcomm_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thread.c", "target": "obj/generic/frame/thread/bli_thread.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo.c", "target": "obj/generic/frame/thread/bli_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo_sup.c", "target": "obj/generic/frame/thread/bli_thrinfo_sup.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_check.c", "target": "obj/generic/frame/util/bli_util_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_fpa.c", "target": "obj/generic/frame/util/bli_util_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi.c", "target": "obj/generic/frame/util/bli_util_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ba.c", "target": "obj/generic/frame/util/bli_util_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/generic/frame/util/bli_util_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi.c", "target": "obj/generic/frame/util/bli_util_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ba.c", "target": "obj/generic/frame/util/bli_util_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ex.c", "target": "obj/generic/frame/util/bli_util_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_unb_var1.c", "target": "obj/generic/frame/util/bli_util_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/make/darwin-x86_64.jsonl000066400000000000000000014634401427272030600220560ustar00rootroot00000000000000{"environment": 
{"BUILD_SOURCEBRANCH": "refs/pull/69/merge", "SYSTEM_TEAMFOUNDATIONCOLLECTIONURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_JOBTIMEOUT": "60", "SYSTEM_RESTRICTSECRETS": "True", "SYSTEM_TASKDEFINITIONSURI": "https://dev.azure.com/explosion-ai/", "AGENT_VERSION": "2.202.0", "SYSTEM_JOBATTEMPT": "1", "SYSTEM_PULLREQUEST_SOURCECOMMITID": "87f212fd7a2299cb6793812f91926d059cd8e8cb", "BUILD_QUEUEDBY": "GitHub", "XCODE_12_DEVELOPER_DIR": "/Applications/Xcode_12.5.1.app/Contents/Developer", "SYSTEM_COLLECTIONURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_HOSTTYPE": "build", "NVM_CD_FLAGS": "", "ANDROID_HOME": "/Users/runner/Library/Android/sdk", "SYSTEM_JOBPARALLELISMTAG": "Public", "BUILD_REPOSITORY_GIT_SUBMODULECHECKOUT": "False", "GOROOT_1_17_X64": "/Users/runner/hostedtoolcache/go/1.17.8/x64", "SHELL": "/bin/bash", "CHROMEWEBDRIVER": "/usr/local/Caskroom/chromedriver/99.0.4844.51", "PIPX_BIN_DIR": "/usr/local/opt/pipx_bin", "BUILD_STAGINGDIRECTORY": "/Users/runner/work/1/a", "TMPDIR": "/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/", "SYSTEM_PULLREQUEST_PULLREQUESTNUMBER": "69", "AGENT_MACHINENAME": "Mac-1649158575375", "COMMON_TESTRESULTSDIRECTORY": "/Users/runner/work/1/TestResults", "SYSTEM_WORKFOLDER": "/Users/runner/work", "AGENT_JOBNAME": "JSONL Python38Mac", "IMAGENAME": "macos-latest", "ANDROID_SDK_ROOT": "/Users/runner/Library/Android/sdk", "OLDPWD": "/Users/runner/work/1/s", "RCT_NO_LAUNCH_PACKAGER": "1", "JAVA_HOME_8_X64": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/8.0.322-6/x64/Contents/Home/", "MSDEPLOY_HTTP_USER_AGENT": "VSTS_116cc368-5c0c-4eb4-bb44-7f3fa5bdce14_build_6_0", "BUILD_SOURCEVERSIONAUTHOR": "Dani\u00ebl de Kok", "AGENT_OSARCHITECTURE": "X64", "RUNNER_PERFLOG": "/usr/local/opt/runner/perflog", "NUNIT_BASE_PATH": "/Library/Developer/nunit", "BUILD_REQUESTEDFOREMAIL": "", "LC_ALL": "en_US.UTF-8", "NUNIT3_PATH": "/Library/Developer/nunit/3.6.0", "AGENT_ACCEPTTEEEULA": "True", "SYSTEM_STAGEATTEMPT": "1", 
"AGENT_READONLYVARIABLES": "true", "JAVA_HOME_11_X64": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/11.0.14-101/x64/Contents/Home/", "RUNNER_TOOL_CACHE": "/Users/runner/hostedtoolcache", "GIT_TERMINAL_PROMPT": "0", "SYSTEM_DEFINITIONNAME": "explosion.cython-blis", "SYSTEM_CULTURE": "en-US", "USER": "runner", "NVM_DIR": "/Users/runner/.nvm", "BUILD_REPOSITORY_PROVIDER": "GitHub", "AGENT_TEMPDIRECTORY": "/Users/runner/work/_temp", "BUILD_SOURCEBRANCHNAME": "merge", "SYSTEM_JOBIDENTIFIER": "JSONL.Python38Mac", "TF_BUILD": "True", "SYSTEM_TEAMFOUNDATIONSERVERURI": "https://dev.azure.com/explosion-ai/", "ANDROID_NDK_ROOT": "/Users/runner/Library/Android/sdk/ndk-bundle", "SYSTEM_TASKDISPLAYNAME": "Generate JSONL (Mac)", "BUILD_QUEUEDBYID": "38e7e9f7-fc06-4f5a-b6dd-1782f4ef7c25", "AZURE_HTTP_USER_AGENT": "VSTS_116cc368-5c0c-4eb4-bb44-7f3fa5bdce14_build_6_0", "ImageVersion": "20220322.1", "ANDROID_NDK_LATEST_HOME": "/Users/runner/Library/Android/sdk/ndk/23.1.7779620", "SYSTEM_STAGENAME": "__default", "SSH_AUTH_SOCK": "/private/tmp/com.apple.launchd.YkqGCzkVWJ/Listeners", "AGENT_DISABLELOGPLUGIN_TESTRESULTLOGPLUGIN": "false", "__CF_USER_TEXT_ENCODING": "0x1F5:0:0", "HOMEBREW_NO_AUTO_UPDATE": "1", "SYSTEM_TEAMPROJECTID": "5c6613e9-6ccf-48bd-81de-dbc3b0a6f957", "VSTS_PROCESS_LOOKUP_ID": "vsts_85e0ec50-d80c-41f7-bde7-83a7307a7c31", "AGENT_ROOTDIRECTORY": "/Users/runner/work", "AGENT_TOOLSDIRECTORY": "/Users/runner/hostedtoolcache", "AGENT_HOMEDIRECTORY": "/Users/runner/runners/2.202.0", "SYSTEM_TEAMPROJECT": "Public", "BUILD_SOURCEVERSIONMESSAGE": "Merge 87f212fd7a2299cb6793812f91926d059cd8e8cb into 6daabf0c925bfe67f7d87874ce014eb3212711e7", "BUILD_REPOSITORY_ID": "explosion/cython-blis", "JAVA_HOME_17_X64": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/17.0.2-8/x64/Contents/Home/", "BUILD_REPOSITORY_LOCALPATH": "/Users/runner/work/1/s", "agent.jobstatus": "Succeeded", "AGENT_LOGTOBLOBSTORAGESERVICE": "true", "SYSTEM_JOBDISPLAYNAME": "JSONL Python38Mac", 
"TASK_SKIPTRANSLATORFORCHECKOUT": "False", "SYSTEM_PULLREQUEST_TARGETBRANCH": "master", "PYTHON_VERSION": "3.8", "SYSTEM": "build", "BUILD_REASON": "PullRequest", "SYSTEM_PIPELINESTARTTIME": "2022-04-05 11:45:27+00:00", "USEPYTHONVERSION_PYTHONLOCATION": "/Users/runner/hostedtoolcache/Python/3.8.12/x64", "AGENT_BUILDDIRECTORY": "/Users/runner/work/1", "AGENT_OS": "Darwin", "BUILD_SOURCESDIRECTORY": "/Users/runner/work/1/s", "PATH": "/Users/runner/hostedtoolcache/Python/3.8.12/x64/bin:/Users/runner/hostedtoolcache/Python/3.8.12/x64:/usr/local/lib/ruby/gems/2.7.0/bin:/usr/local/opt/ruby@2.7/bin:/usr/local/opt/pipx_bin:/Users/runner/.cargo/bin:/usr/local/opt/curl/bin:/usr/local/bin:/usr/local/sbin:/Users/runner/bin:/Users/runner/.yarn/bin:/Users/runner/Library/Android/sdk/tools:/Users/runner/Library/Android/sdk/platform-tools:/Users/runner/Library/Android/sdk/ndk-bundle:/Library/Frameworks/Mono.framework/Versions/Current/Commands:/usr/bin:/bin:/usr/sbin:/sbin:/Users/runner/.dotnet/tools:/Users/runner/.ghcup/bin:/Users/runner/hostedtoolcache/stack/2.7.5/x64", "SYSTEM_PHASEATTEMPT": "1", "SYSTEM_ISSCHEDULED": "False", "SYSTEM_DEBUG": "false", "PERFLOG_LOCATION_SETTING": "RUNNER_PERFLOG", "GOROOT_1_15_X64": "/Users/runner/hostedtoolcache/go/1.15.15/x64", "SYSTEM_PULLREQUEST_SOURCEREPOSITORYURI": "https://github.com/explosion/cython-blis", "VM_ASSETS": "/usr/local/opt/runner/scripts", "DOTNET_ROOT": "/Users/runner/.dotnet", "CONDA": "/usr/local/miniconda", "EDGEWEBDRIVER": "/usr/local/share/edge_driver", "PWD": "/Users/runner/work/1/s/flame-blis", "SYSTEM_PULLREQUEST_ISFORK": "True", "BUILD_BUILDURI": "vstfs:///Build/Build/16961", "JAVA_HOME": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/8.0.322-6/x64/Contents/Home/", "SYSTEM_DEFINITIONID": "6", "VCPKG_INSTALLATION_ROOT": "/usr/local/share/vcpkg", "SYSTEM_STAGEID": "96ac2280-8cb4-5df5-99de-dd2da759617d", "AGENT_DISABLELOGPLUGIN_TESTFILEPUBLISHERPLUGIN": "true", "LANG": "en_US.UTF-8", "SYSTEM_ENABLEACCESSTOKEN": 
"SecretVariable", "ImageOS": "macos11", "XCODE_13_DEVELOPER_DIR": "/Applications/Xcode_13.2.1.app/Contents/Developer", "SYSTEM_TASKINSTANCENAME": "CmdLine5", "SYSTEM_PHASEDISPLAYNAME": "JSONL", "RESOURCES_TRIGGERINGCATEGORY": "", "SYSTEM_POSTLINESSPEED": "500", "XPC_FLAGS": "0x0", "SYSTEM_SERVERTYPE": "Hosted", "BUILD_REPOSITORY_NAME": "explosion/cython-blis", "BUILD_REPOSITORY_URI": "https://github.com/explosion/cython-blis", "PIPELINE_WORKSPACE": "/Users/runner/work/1", "PIPX_HOME": "/usr/local/opt/pipx", "AGENT_WORKFOLDER": "/Users/runner/work", "BUILD_DEFINITIONNAME": "explosion.cython-blis", "SYSTEM_JOBNAME": "Python38Mac", "BUILD_REQUESTEDFOR": "GitHub", "XPC_SERVICE_NAME": "0", "GECKOWEBDRIVER": "/usr/local/opt/geckodriver/bin", "SYSTEM_TIMELINEID": "234a8612-1dea-4ff0-b476-24616de676fc", "SYSTEM_ARTIFACTSDIRECTORY": "/Users/runner/work/1/a", "HOME": "/Users/runner", "SHLVL": "3", "AGENT_ID": "92", "AGENT_RETAINDEFAULTENCODING": "false", "GRAALVM_11_ROOT": "/Library/Java/JavaVirtualMachines/graalvm-ce-java11-22.0.0.2/Contents/Home/bin", "SYSTEM_JOBPOSITIONINPHASE": "1", "BUILD_REQUESTEDFORID": "38e7e9f7-fc06-4f5a-b6dd-1782f4ef7c25", "AGENT_USEWORKSPACEID": "true", "BUILD_ARTIFACTSTAGINGDIRECTORY": "/Users/runner/work/1/a", "BUILD_BINARIESDIRECTORY": "/Users/runner/work/1/b", "BUILD_BUILDID": "16961", "RESOURCES_TRIGGERINGALIAS": "", "GOROOT_1_16_X64": "/Users/runner/hostedtoolcache/go/1.16.15/x64", "LOGNAME": "runner", "SYSTEM_TASKINSTANCEID": "476bff7d-b32a-5e93-16d5-defdc40dd1cd", "BUILD_SOURCEVERSION": "0c8497cb0eb86242df0cb0f3e46fe9d97b7fb5d8", "LC_CTYPE": "en_US.UTF-8", "HOMEBREW_CLEANUP_PERIODIC_FULL_DAYS": "3650", "SYSTEM_PULLREQUEST_MERGEDAT": "", "HOMEBREW_CASK_OPTS": "--no-quarantine", "SYSTEM_DEFAULTWORKINGDIRECTORY": "/Users/runner/work/1/s", "POWERSHELL_DISTRIBUTION_CHANNEL": "Azure-DevOps-macos11", "SYSTEM_JOBID": "82ca4189-e9ff-5ba9-3895-5b644d1542b5", "SYSTEM_PULLREQUEST_PULLREQUESTID": "899994381", "ANDROID_NDK_HOME": 
"/Users/runner/Library/Android/sdk/ndk-bundle", "BOOTSTRAP_HASKELL_NONINTERACTIVE": "1", "SYSTEM_TOTALJOBSINPHASE": "2", "SYSTEM_PULLREQUEST_SOURCEBRANCH": "update-to-blis-0.9.0", "XCODE_11_DEVELOPER_DIR": "/Applications/Xcode_11.7.app/Contents/Developer", "AGENT_NAME": "Hosted Agent", "SYSTEM_STAGEDISPLAYNAME": "__default", "SYSTEM_PHASEID": "ecb95708-c2a5-5456-f379-96cd8090c2a6", "BUILD_DEFINITIONVERSION": "1", "SYSTEM_PLANID": "234a8612-1dea-4ff0-b476-24616de676fc", "ENDPOINT_URL_SYSTEMVSSCONNECTION": "https://dev.azure.com/explosion-ai/", "AGENT_JOBSTATUS": "Succeeded", "TASK_DISPLAYNAME": "Generate JSONL (Mac)", "SYSTEM_COLLECTIONID": "116cc368-5c0c-4eb4-bb44-7f3fa5bdce14", "SYSTEM_PHASENAME": "JSONL", "BUILD_BUILDNUMBER": "20220405.9", "DOTNET_MULTILEVEL_LOOKUP": "0", "AGENT_TASKRESTRICTIONSENFORCEMENTMODE": "Enabled", "BUILD_CONTAINERID": "11756447", "SYSTEM_PARALLELEXECUTIONTYPE": "MultiConfiguration", "_": "/Users/runner/hostedtoolcache/Python/3.8.12/x64/bin/python"}} {"compiler": "gcc", "source": "config/bulldozer/bli_cntx_init_bulldozer.c", "target": "obj/x86_64/config/bulldozer/bli_cntx_init_bulldozer.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/excavator/bli_cntx_init_excavator.c", "target": "obj/x86_64/config/excavator/bli_cntx_init_excavator.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/generic/bli_cntx_init_generic.c", "target": 
"obj/x86_64/config/generic/bli_cntx_init_generic.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/haswell/bli_cntx_init_haswell.c", "target": "obj/x86_64/config/haswell/bli_cntx_init_haswell.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/knl/bli_cntx_init_knl.c", "target": "obj/x86_64/config/knl/bli_cntx_init_knl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/penryn/bli_cntx_init_penryn.c", "target": "obj/x86_64/config/penryn/bli_cntx_init_penryn.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/piledriver/bli_cntx_init_piledriver.c", "target": "obj/x86_64/config/piledriver/bli_cntx_init_piledriver.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/sandybridge/bli_cntx_init_sandybridge.c", "target": "obj/x86_64/config/sandybridge/bli_cntx_init_sandybridge.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/skx/bli_cntx_init_skx.c", "target": "obj/x86_64/config/skx/bli_cntx_init_skx.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/steamroller/bli_cntx_init_steamroller.c", "target": "obj/x86_64/config/steamroller/bli_cntx_init_steamroller.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/zen/bli_cntx_init_zen.c", "target": "obj/x86_64/config/zen/bli_cntx_init_zen.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/zen2/bli_cntx_init_zen2.c", "target": "obj/x86_64/config/zen2/bli_cntx_init_zen2.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/zen3/bli_cntx_init_zen3.c", "target": "obj/x86_64/config/zen3/bli_cntx_init_zen3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c", "target": "obj/x86_64/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_dgemm_skx_asm_16x14.c", "target": "obj/x86_64/kernels/skx/3/bli_dgemm_skx_asm_16x14.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c", "target": "obj/x86_64/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.o", "flags": ["-O2", "-O3", 
"-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/1m/bli_dpackm_knl_asm_24x8.c", "target": "obj/x86_64/kernels/knl/1m/bli_dpackm_knl_asm_24x8.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/1m/bli_spackm_knl_asm_24x16.c", "target": "obj/x86_64/kernels/knl/1m/bli_spackm_knl_asm_24x16.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/3/bli_dgemm_knl_asm_24x8.c", "target": "obj/x86_64/kernels/knl/3/bli_dgemm_knl_asm_24x8.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/3/bli_sgemm_knl_asm_24x16.c", "target": 
"obj/x86_64/kernels/knl/3/bli_sgemm_knl_asm_24x16.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c", "target": "obj/x86_64/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c", "target": "obj/x86_64/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1/bli_axpyv_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1/bli_axpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1/bli_dotv_penryn_int.c", 
"target": "obj/x86_64/kernels/penryn/1/bli_dotv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpy2v_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1f/bli_axpy2v_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpyf_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1f/bli_axpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotaxpyv_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1f/bli_dotaxpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c", "target": 
"obj/x86_64/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxf_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1f/bli_dotxf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c", "target": "obj/x86_64/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c", "target": 
"obj/x86_64/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c", "target": "obj/x86_64/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c", "target": "obj/x86_64/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c", "target": "obj/x86_64/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c", "target": "obj/x86_64/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c", "target": 
"obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", 
"-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_amaxv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_amaxv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_axpyv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int10.c", "target": "obj/x86_64/kernels/zen/1/bli_axpyv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", 
"-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_copyv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_copyv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_dotv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int10.c", "target": "obj/x86_64/kernels/zen/1/bli_dotv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotxv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_dotxv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", 
"-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_scalv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int10.c", "target": "obj/x86_64/kernels/zen/1/bli_scalv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_setv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_setv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_swapv_zen_int8.c", "target": "obj/x86_64/kernels/zen/1/bli_swapv_zen_int8.o", "flags": ["-O2", 
"-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_4.c", "target": "obj/x86_64/kernels/zen/1f/bli_axpyf_zen_int_4.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_5.c", "target": "obj/x86_64/kernels/zen/1f/bli_axpyf_zen_int_5.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_8.c", "target": "obj/x86_64/kernels/zen/1f/bli_axpyf_zen_int_8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_dotxf_zen_int_8.c", "target": 
"obj/x86_64/kernels/zen/1f/bli_dotxf_zen_int_8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemm_small.c", "target": "obj/x86_64/kernels/zen/3/bli_gemm_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemmt_small.c", "target": "obj/x86_64/kernels/zen/3/bli_gemmt_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_trsm_small.c", "target": "obj/x86_64/kernels/zen/3/bli_trsm_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c", "target": "obj/x86_64/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c", "target": "obj/x86_64/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/skx/bli_cntx_skx_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_addv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_amaxv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_axpbyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_axpyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_copyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_dotv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_dotxv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_invertv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_scal2v_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_scalv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_setv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_subv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_swapv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_xpbyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/skx/1f/bli_axpy2v_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/skx/1f/bli_axpyf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1f/bli_dotaxpyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/skx/1f/bli_dotxaxpyf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/skx/1f/bli_dotxf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/skx/1m/bli_packm_cxk_1er_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/skx/1m/bli_packm_cxk_bb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/skx/1m/bli_packm_cxk_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/skx/1m/bli_unpackm_cxk_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bli_gemm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bli_gemmsup_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bli_gemmtrsm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bli_trsm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bb/bli_gemmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bb/bli_gemmtrsmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bb/bli_trsmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/skx/ind/bli_gemm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/skx/ind/bli_gemmtrsm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/skx/ind/bli_trsm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/knl/bli_cntx_knl_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_addv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_amaxv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_axpbyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_axpyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_copyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_dotv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_dotxv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_invertv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_scal2v_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_scalv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_setv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_subv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_swapv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_xpbyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/knl/1f/bli_axpy2v_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/knl/1f/bli_axpyf_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1f/bli_dotaxpyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/knl/1f/bli_dotxaxpyf_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/knl/1f/bli_dotxf_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/knl/1m/bli_packm_cxk_1er_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/knl/1m/bli_packm_cxk_bb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", 
"-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/knl/1m/bli_packm_cxk_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/knl/1m/bli_unpackm_cxk_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bli_gemm_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bli_gemmsup_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bli_gemmtrsm_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bli_trsm_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bb/bli_gemmbb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bb/bli_gemmtrsmbb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bb/bli_trsmbb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": 
"obj/x86_64/ref_kernels/knl/ind/bli_gemm1m_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/knl/ind/bli_gemmtrsm1m_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/knl/ind/bli_trsm1m_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/haswell/bli_cntx_haswell_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_addv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_amaxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_axpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_axpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_copyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_dotv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_dotxv_haswell_ref.o", "flags": 
["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_invertv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_scal2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_scalv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_setv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_subv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_swapv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_xpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1f/bli_axpy2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1f/bli_axpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1f/bli_dotaxpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1f/bli_dotxaxpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1f/bli_dotxf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1m/bli_packm_cxk_1er_haswell_ref.o", "flags": ["-O2", "-O3", 
"-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1m/bli_packm_cxk_bb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1m/bli_packm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1m/bli_unpackm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bli_gemm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bli_gemmsup_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bli_gemmtrsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bli_trsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bb/bli_gemmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bb/bli_gemmtrsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bb/bli_trsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/haswell/ind/bli_gemm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/haswell/ind/bli_gemmtrsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/haswell/ind/bli_trsm1m_haswell_ref.o", "flags": 
["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/bli_cntx_sandybridge_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_addv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_amaxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_axpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_axpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_copyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": 
"obj/x86_64/ref_kernels/sandybridge/1/bli_dotv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_dotxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_invertv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_scal2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_scalv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_setv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_subv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_swapv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_xpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1f/bli_axpy2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1f/bli_axpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", 
"-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1f/bli_dotaxpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1f/bli_dotxaxpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1f/bli_dotxf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1m/bli_packm_cxk_1er_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1m/bli_packm_cxk_bb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1m/bli_packm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1m/bli_unpackm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bli_gemm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bli_gemmsup_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bli_gemmtrsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", 
"-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bli_trsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bb/bli_gemmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bb/bli_gemmtrsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bb/bli_trsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/ind/bli_gemm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/ind/bli_gemmtrsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/ind/bli_trsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/penryn/bli_cntx_penryn_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_addv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_amaxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_axpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_axpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_copyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": 
"obj/x86_64/ref_kernels/penryn/1/bli_dotv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_dotxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_invertv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_scal2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_scalv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_setv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_subv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": 
"obj/x86_64/ref_kernels/penryn/1/bli_swapv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_xpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1f/bli_axpy2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1f/bli_axpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1f/bli_dotaxpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1f/bli_dotxaxpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1f/bli_dotxf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1m/bli_packm_cxk_1er_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1m/bli_packm_cxk_bb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1m/bli_packm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1m/bli_unpackm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bli_gemm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bli_gemmsup_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bli_gemmtrsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bli_trsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bb/bli_gemmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bb/bli_gemmtrsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bb/bli_trsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/penryn/ind/bli_gemm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/penryn/ind/bli_gemmtrsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/penryn/ind/bli_trsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/zen3/bli_cntx_zen3_ref.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_addv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_amaxv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_axpbyv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_axpyv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_copyv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_dotv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_dotxv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_invertv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_scal2v_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": 
"obj/x86_64/ref_kernels/zen3/1/bli_scalv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_setv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_subv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_swapv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_xpbyv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1f/bli_axpy2v_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1f/bli_axpyf_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1f/bli_dotaxpyv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1f/bli_dotxaxpyf_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1f/bli_dotxf_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": 
"obj/x86_64/ref_kernels/zen3/1m/bli_packm_cxk_1er_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1m/bli_packm_cxk_bb_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1m/bli_packm_cxk_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1m/bli_unpackm_cxk_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bli_gemm_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bli_gemmsup_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bli_gemmtrsm_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bli_trsm_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bb/bli_gemmbb_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bb/bli_gemmtrsmbb_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": 
"obj/x86_64/ref_kernels/zen3/3/bb/bli_trsmbb_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen3/ind/bli_gemm1m_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen3/ind/bli_gemmtrsm1m_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen3/ind/bli_trsm1m_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/zen2/bli_cntx_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_addv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_amaxv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_axpbyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_axpyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_copyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_dotv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_dotxv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_invertv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_scal2v_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_scalv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_setv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_subv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_swapv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_xpbyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1f/bli_axpy2v_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1f/bli_axpyf_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1f/bli_dotaxpyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1f/bli_dotxaxpyf_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1f/bli_dotxf_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1m/bli_packm_cxk_1er_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1m/bli_packm_cxk_bb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1m/bli_packm_cxk_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1m/bli_unpackm_cxk_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bli_gemm_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bli_gemmsup_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bli_gemmtrsm_zen2_ref.o", 
"flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bli_trsm_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bb/bli_gemmbb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bb/bli_gemmtrsmbb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bb/bli_trsmbb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen2/ind/bli_gemm1m_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen2/ind/bli_gemmtrsm1m_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen2/ind/bli_trsm1m_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/zen/bli_cntx_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_addv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_amaxv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_axpbyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_axpyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_copyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_dotv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_dotxv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_invertv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_scal2v_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_scalv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_setv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_subv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_swapv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_xpbyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/zen/1f/bli_axpy2v_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen/1f/bli_axpyf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1f/bli_dotaxpyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen/1f/bli_dotxaxpyf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/zen/1f/bli_dotxf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/zen/1m/bli_packm_cxk_1er_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/zen/1m/bli_packm_cxk_bb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen/1m/bli_packm_cxk_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", 
"-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen/1m/bli_unpackm_cxk_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bli_gemm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bli_gemmsup_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bli_gemmtrsm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bli_trsm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bb/bli_gemmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bb/bli_gemmtrsmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bb/bli_trsmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen/ind/bli_gemm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen/ind/bli_gemmtrsm1m_zen_ref.o", "flags": ["-O2", 
"-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen/ind/bli_trsm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/excavator/bli_cntx_excavator_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_addv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_amaxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_axpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_axpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_copyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_dotv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_dotxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_invertv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_scal2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_scalv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": 
"obj/x86_64/ref_kernels/excavator/1/bli_setv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_subv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_swapv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_xpbyv_excavator_ref.o", "flags": ["-O2", "-O3", 
"-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1f/bli_axpy2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1f/bli_axpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1f/bli_dotaxpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", 
"-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1f/bli_dotxaxpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1f/bli_dotxf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1m/bli_packm_cxk_1er_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1m/bli_packm_cxk_bb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1m/bli_packm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1m/bli_unpackm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bli_gemm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bli_gemmsup_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bli_gemmtrsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bli_trsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bb/bli_gemmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bb/bli_gemmtrsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bb/bli_trsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/excavator/ind/bli_gemm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/excavator/ind/bli_gemmtrsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/excavator/ind/bli_trsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/bli_cntx_steamroller_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_addv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_amaxv_steamroller_ref.o", 
"flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_axpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_axpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_copyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", 
"-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_dotv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_dotxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_invertv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_scal2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_scalv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_setv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_subv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_swapv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_xpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1f/bli_axpy2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1f/bli_axpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1f/bli_dotaxpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1f/bli_dotxaxpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1f/bli_dotxf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1m/bli_packm_cxk_1er_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1m/bli_packm_cxk_bb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1m/bli_packm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1m/bli_unpackm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bli_gemm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bli_gemmsup_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bli_gemmtrsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bli_trsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bb/bli_gemmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bb/bli_gemmtrsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bb/bli_trsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/ind/bli_gemm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/ind/bli_gemmtrsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/ind/bli_trsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/bli_cntx_piledriver_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_addv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_amaxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", 
"-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_axpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_axpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_copyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_dotv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_dotxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_invertv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_scal2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_scalv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_setv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_subv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_swapv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_xpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1f/bli_axpy2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1f/bli_axpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1f/bli_dotaxpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1f/bli_dotxaxpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1f/bli_dotxf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1m/bli_packm_cxk_1er_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1m/bli_packm_cxk_bb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1m/bli_packm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1m/bli_unpackm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bli_gemm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bli_gemmsup_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bli_gemmtrsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": 
"obj/x86_64/ref_kernels/piledriver/3/bli_trsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bb/bli_gemmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bb/bli_gemmtrsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": 
"obj/x86_64/ref_kernels/piledriver/3/bb/bli_trsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/ind/bli_gemm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/ind/bli_gemmtrsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": 
"obj/x86_64/ref_kernels/piledriver/ind/bli_trsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/bli_cntx_bulldozer_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_addv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_amaxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_axpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_axpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_copyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_dotv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_dotxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_invertv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_scal2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_scalv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_setv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": 
"obj/x86_64/ref_kernels/bulldozer/1/bli_subv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_swapv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_xpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1f/bli_axpy2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", 
"-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1f/bli_axpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1f/bli_dotaxpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1f/bli_dotxaxpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1f/bli_dotxf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1m/bli_packm_cxk_1er_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1m/bli_packm_cxk_bb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1m/bli_packm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1m/bli_unpackm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bli_gemm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bli_gemmsup_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bli_gemmtrsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bli_trsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bb/bli_gemmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bb/bli_gemmtrsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bb/bli_trsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/ind/bli_gemm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/ind/bli_gemmtrsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/ind/bli_trsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/generic/bli_cntx_generic_ref.o", 
"flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_addv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_amaxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_axpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": 
"obj/x86_64/ref_kernels/generic/1/bli_axpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_copyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_dotv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_dotxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_invertv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_scal2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_scalv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_setv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_subv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_swapv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_xpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/generic/1f/bli_axpy2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/generic/1f/bli_axpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1f/bli_dotaxpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/generic/1f/bli_dotxaxpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": 
"obj/x86_64/ref_kernels/generic/1f/bli_dotxf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/generic/1m/bli_packm_cxk_1er_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/generic/1m/bli_packm_cxk_bb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/generic/1m/bli_packm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/generic/1m/bli_unpackm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bli_gemm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bli_gemmsup_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bli_gemmtrsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bli_trsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bb/bli_gemmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bb/bli_gemmtrsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bb/bli_trsmbb_generic_ref.o", 
"flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/generic/ind/bli_gemm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/generic/ind/bli_gemmtrsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/generic/ind/bli_trsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_check.c", "target": "obj/x86_64/frame/0/bli_l0_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_fpa.c", "target": "obj/x86_64/frame/0/bli_l0_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_oapi.c", "target": "obj/x86_64/frame/0/bli_l0_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_tapi.c", "target": "obj/x86_64/frame/0/bli_l0_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/copysc/bli_copysc.c", "target": "obj/x86_64/frame/0/copysc/bli_copysc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_check.c", "target": "obj/x86_64/frame/1/bli_l1v_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/x86_64/frame/1/bli_l1v_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi.c", "target": "obj/x86_64/frame/1/bli_l1v_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/x86_64/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/x86_64/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/x86_64/frame/1/bli_l1v_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/x86_64/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ex.c", "target": "obj/x86_64/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_check.c", "target": "obj/x86_64/frame/1d/bli_l1d_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_fpa.c", "target": "obj/x86_64/frame/1d/bli_l1d_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/x86_64/frame/1d/bli_l1d_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/x86_64/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/x86_64/frame/1d/bli_l1d_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi.c", "target": "obj/x86_64/frame/1d/bli_l1d_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/x86_64/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": "obj/x86_64/frame/1d/bli_l1d_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_check.c", "target": "obj/x86_64/frame/1f/bli_l1f_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_fpa.c", "target": "obj/x86_64/frame/1f/bli_l1f_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi.c", "target": "obj/x86_64/frame/1f/bli_l1f_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/x86_64/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": "obj/x86_64/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi.c", "target": "obj/x86_64/frame/1f/bli_l1f_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/x86_64/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/x86_64/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_check.c", "target": "obj/x86_64/frame/1m/bli_l1m_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_fpa.c", "target": "obj/x86_64/frame/1m/bli_l1m_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi.c", "target": "obj/x86_64/frame/1m/bli_l1m_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/x86_64/frame/1m/bli_l1m_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": "obj/x86_64/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/x86_64/frame/1m/bli_l1m_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/x86_64/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/x86_64/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_unb_var1.c", "target": "obj/x86_64/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_alloc.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_check.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cntl.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_init.c", 
"target": "obj/x86_64/frame/1m/packm/bli_packm_init.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_part.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_part.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_scalar.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_scalar.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/x86_64/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/x86_64/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cntl.c", "target": "obj/x86_64/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": "obj/x86_64/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/x86_64/frame/1m/unpackm/bli_unpackm_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_check.c", "target": "obj/x86_64/frame/2/bli_l2_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_fpa.c", "target": "obj/x86_64/frame/2/bli_l2_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi.c", "target": "obj/x86_64/frame/2/bli_l2_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ba.c", "target": "obj/x86_64/frame/2/bli_l2_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ex.c", "target": "obj/x86_64/frame/2/bli_l2_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi.c", "target": "obj/x86_64/frame/2/bli_l2_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/x86_64/frame/2/bli_l2_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ex.c", "target": "obj/x86_64/frame/2/bli_l2_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/x86_64/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/x86_64/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/x86_64/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": 
"obj/x86_64/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/x86_64/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var1.c", "target": "obj/x86_64/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var2.c", "target": "obj/x86_64/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/x86_64/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/x86_64/frame/2/her/bli_her_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var2.c", "target": "obj/x86_64/frame/2/her/bli_her_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/x86_64/frame/2/her/bli_her_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var3.c", "target": 
"obj/x86_64/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var4.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var4.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_var_oapi.c", "target": "obj/x86_64/frame/2/her2/bli_her2_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/x86_64/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var2.c", "target": "obj/x86_64/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/x86_64/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/x86_64/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": "obj/x86_64/frame/2/trmv/bli_trmv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/x86_64/frame/2/trsv/bli_trsv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var2.c", "target": "obj/x86_64/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/x86_64/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/x86_64/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/x86_64/frame/2/trsv/bli_trsv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_blocksize.c", "target": "obj/x86_64/frame/3/bli_l3_blocksize.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_check.c", "target": "obj/x86_64/frame/3/bli_l3_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_cntl.c", "target": "obj/x86_64/frame/3/bli_l3_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_direct.c", "target": "obj/x86_64/frame/3/bli_l3_direct.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ind.c", "target": "obj/x86_64/frame/3/bli_l3_ind.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_int.c", "target": "obj/x86_64/frame/3/bli_l3_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi.c", "target": "obj/x86_64/frame/3/bli_l3_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi_ex.c", "target": "obj/x86_64/frame/3/bli_l3_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_packab.c", "target": "obj/x86_64/frame/3/bli_l3_packab.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_prune.c", "target": "obj/x86_64/frame/3/bli_l3_prune.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_schema.c", "target": "obj/x86_64/frame/3/bli_l3_schema.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup.c", "target": "obj/x86_64/frame/3/bli_l3_sup.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_int.c", "target": "obj/x86_64/frame/3/bli_l3_sup_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_a.c", "target": "obj/x86_64/frame/3/bli_l3_sup_packm_a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_b.c", "target": "obj/x86_64/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_var.c", "target": "obj/x86_64/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/x86_64/frame/3/bli_l3_sup_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var12.c", "target": "obj/x86_64/frame/3/bli_l3_sup_var12.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/x86_64/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi.c", "target": "obj/x86_64/frame/3/bli_l3_tapi.o", "flags": 
["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/x86_64/frame/3/bli_l3_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_thrinfo.c", "target": "obj/x86_64/frame/3/bli_l3_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_fpa.c", "target": "obj/x86_64/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/x86_64/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_tapi.c", "target": 
"obj/x86_64/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var3.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_md.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/x86_64/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/x86_64/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": "obj/x86_64/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/x86_64/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/x86_64/frame/3/hemm/bli_hemm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/symm/bli_symm_front.c", "target": "obj/x86_64/frame/3/symm/bli_symm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_ll_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": 
"obj/x86_64/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/x86_64/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_blk_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_front.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_apool.c", "target": "obj/x86_64/frame/base/bli_apool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_arch.c", "target": "obj/x86_64/frame/base/bli_arch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_array.c", "target": "obj/x86_64/frame/base/bli_array.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_blksz.c", "target": "obj/x86_64/frame/base/bli_blksz.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_check.c", "target": "obj/x86_64/frame/base/bli_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_clock.c", "target": "obj/x86_64/frame/base/bli_clock.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntl.c", "target": "obj/x86_64/frame/base/bli_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntx.c", "target": "obj/x86_64/frame/base/bli_cntx.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_const.c", "target": "obj/x86_64/frame/base/bli_const.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cpuid.c", "target": "obj/x86_64/frame/base/bli_cpuid.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_env.c", "target": "obj/x86_64/frame/base/bli_env.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_error.c", "target": "obj/x86_64/frame/base/bli_error.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_func.c", "target": "obj/x86_64/frame/base/bli_func.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_getopt.c", "target": "obj/x86_64/frame/base/bli_getopt.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_gks.c", "target": "obj/x86_64/frame/base/bli_gks.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_ind.c", "target": "obj/x86_64/frame/base/bli_ind.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_info.c", "target": "obj/x86_64/frame/base/bli_info.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_init.c", "target": "obj/x86_64/frame/base/bli_init.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_machval.c", "target": "obj/x86_64/frame/base/bli_machval.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_malloc.c", "target": "obj/x86_64/frame/base/bli_malloc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_mbool.c", "target": "obj/x86_64/frame/base/bli_mbool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_memsys.c", "target": "obj/x86_64/frame/base/bli_memsys.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj.c", "target": "obj/x86_64/frame/base/bli_obj.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj_scalar.c", "target": "obj/x86_64/frame/base/bli_obj_scalar.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pack.c", "target": "obj/x86_64/frame/base/bli_pack.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_param_map.c", "target": "obj/x86_64/frame/base/bli_param_map.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_part.c", "target": "obj/x86_64/frame/base/bli_part.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pba.c", "target": "obj/x86_64/frame/base/bli_pba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pool.c", "target": "obj/x86_64/frame/base/bli_pool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_prune.c", "target": "obj/x86_64/frame/base/bli_prune.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_query.c", "target": "obj/x86_64/frame/base/bli_query.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_rntm.c", "target": "obj/x86_64/frame/base/bli_rntm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_sba.c", "target": "obj/x86_64/frame/base/bli_sba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijm.c", "target": "obj/x86_64/frame/base/bli_setgetijm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijv.c", "target": "obj/x86_64/frame/base/bli_setgetijv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setri.c", "target": "obj/x86_64/frame/base/bli_setri.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_string.c", "target": "obj/x86_64/frame/base/bli_string.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_winsys.c", "target": "obj/x86_64/frame/base/bli_winsys.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castm.c", "target": "obj/x86_64/frame/base/cast/bli_castm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/x86_64/frame/base/cast/bli_castnzm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castv.c", "target": "obj/x86_64/frame/base/cast/bli_castv.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_obj_check.c", "target": "obj/x86_64/frame/base/check/bli_obj_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_part_check.c", "target": "obj/x86_64/frame/base/check/bli_part_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_dlamch.c", "target": "obj/x86_64/frame/base/noopt/bli_dlamch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_lsame.c", "target": "obj/x86_64/frame/base/noopt/bli_lsame.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_slamch.c", "target": 
"obj/x86_64/frame/base/noopt/bli_slamch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projm.c", "target": "obj/x86_64/frame/base/proj/bli_projm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projv.c", "target": "obj/x86_64/frame/base/proj/bli_projv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/x86_64/frame/thread/bli_l3_decor_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": "obj/x86_64/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_single.c", "target": "obj/x86_64/frame/thread/bli_l3_decor_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": "obj/x86_64/frame/thread/bli_l3_sup_decor_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_pthreads.c", "target": "obj/x86_64/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/x86_64/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_pthread.c", "target": "obj/x86_64/frame/thread/bli_pthread.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm.c", "target": "obj/x86_64/frame/thread/bli_thrcomm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/x86_64/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/x86_64/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/x86_64/frame/thread/bli_thrcomm_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thread.c", "target": "obj/x86_64/frame/thread/bli_thread.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo.c", "target": "obj/x86_64/frame/thread/bli_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo_sup.c", "target": "obj/x86_64/frame/thread/bli_thrinfo_sup.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_check.c", "target": "obj/x86_64/frame/util/bli_util_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_fpa.c", "target": "obj/x86_64/frame/util/bli_util_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi.c", "target": "obj/x86_64/frame/util/bli_util_oapi.o", "flags": ["-O2", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ba.c", "target": "obj/x86_64/frame/util/bli_util_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/x86_64/frame/util/bli_util_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi.c", "target": "obj/x86_64/frame/util/bli_util_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ba.c", "target": "obj/x86_64/frame/util/bli_util_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ex.c", "target": 
"obj/x86_64/frame/util/bli_util_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_unb_var1.c", "target": "obj/x86_64/frame/util/bli_util_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/make/darwin-x86_64_no_skx.jsonl000066400000000000000000011574671427272030600234500ustar00rootroot00000000000000{"environment": {"SYSTEM_TEAMFOUNDATIONCOLLECTIONURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_JOBTIMEOUT": "60", "BUILD_SOURCEBRANCH": "refs/pull/69/merge", "SYSTEM_TASKDEFINITIONSURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_RESTRICTSECRETS": "True", "SYSTEM_JOBATTEMPT": "1", "SYSTEM_PULLREQUEST_SOURCECOMMITID": "1db78ab0302a0aaf31a97ba8553f31f01106bcdd", "AGENT_VERSION": "2.202.0", "BUILD_QUEUEDBY": "GitHub", "XCODE_12_DEVELOPER_DIR": "/Applications/Xcode_12.5.1.app/Contents/Developer", "SYSTEM_HOSTTYPE": "build", "SYSTEM_COLLECTIONURI": "https://dev.azure.com/explosion-ai/", "NVM_CD_FLAGS": "", "ANDROID_HOME": "/Users/runner/Library/Android/sdk", "BUILD_REPOSITORY_GIT_SUBMODULECHECKOUT": "False", "SYSTEM_JOBPARALLELISMTAG": "Public", "CHROMEWEBDRIVER": "/usr/local/Caskroom/chromedriver/100.0.4896.60", "GOROOT_1_17_X64": "/Users/runner/hostedtoolcache/go/1.17.8/x64", "SHELL": "/bin/bash", "PIPX_BIN_DIR": "/usr/local/opt/pipx_bin", "BUILD_STAGINGDIRECTORY": "/Users/runner/work/1/a", "TMPDIR": "/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/", 
"SYSTEM_PULLREQUEST_PULLREQUESTNUMBER": "69", "AGENT_MACHINENAME": "Mac-1649312315670", "SYSTEM_WORKFOLDER": "/Users/runner/work", "COMMON_TESTRESULTSDIRECTORY": "/Users/runner/work/1/TestResults", "AGENT_JOBNAME": "JSONL Python38Mac", "IMAGENAME": "macos-latest", "ANDROID_SDK_ROOT": "/Users/runner/Library/Android/sdk", "OLDPWD": "/Users/runner/work/1/s", "RCT_NO_LAUNCH_PACKAGER": "1", "JAVA_HOME_8_X64": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/8.0.322-6/x64/Contents/Home/", "MSDEPLOY_HTTP_USER_AGENT": "VSTS_116cc368-5c0c-4eb4-bb44-7f3fa5bdce14_build_6_0", "BUILD_SOURCEVERSIONAUTHOR": "Dani\u00ebl de Kok", "AGENT_OSARCHITECTURE": "X64", "NUNIT_BASE_PATH": "/Library/Developer/nunit", "RUNNER_PERFLOG": "/usr/local/opt/runner/perflog", "BUILD_REQUESTEDFOREMAIL": "", "LC_ALL": "en_US.UTF-8", "NUNIT3_PATH": "/Library/Developer/nunit/3.6.0", "AGENT_ACCEPTTEEEULA": "True", "AGENT_READONLYVARIABLES": "true", "SYSTEM_STAGEATTEMPT": "1", "RUNNER_TOOL_CACHE": "/Users/runner/hostedtoolcache", "JAVA_HOME_11_X64": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/11.0.14-101/x64/Contents/Home/", "SYSTEM_DEFINITIONNAME": "explosion.cython-blis", "SYSTEM_CULTURE": "en-US", "GIT_TERMINAL_PROMPT": "0", "NVM_DIR": "/Users/runner/.nvm", "USER": "runner", "BUILD_SOURCEBRANCHNAME": "merge", "AGENT_TEMPDIRECTORY": "/Users/runner/work/_temp", "BUILD_REPOSITORY_PROVIDER": "GitHub", "SYSTEM_JOBIDENTIFIER": "JSONL.Python38Mac", "TF_BUILD": "True", "SYSTEM_TEAMFOUNDATIONSERVERURI": "https://dev.azure.com/explosion-ai/", "ANDROID_NDK_ROOT": "/Users/runner/Library/Android/sdk/ndk-bundle", "AZURE_HTTP_USER_AGENT": "VSTS_116cc368-5c0c-4eb4-bb44-7f3fa5bdce14_build_6_0", "SYSTEM_TASKDISPLAYNAME": "Generate JSONL (Mac)", "BUILD_QUEUEDBYID": "38e7e9f7-fc06-4f5a-b6dd-1782f4ef7c25", "ImageVersion": "20220402.1", "ANDROID_NDK_LATEST_HOME": "/Users/runner/Library/Android/sdk/ndk/23.1.7779620", "SYSTEM_STAGENAME": "__default", "SSH_AUTH_SOCK": 
"/private/tmp/com.apple.launchd.YQrQpXKfSr/Listeners", "AGENT_DISABLELOGPLUGIN_TESTRESULTLOGPLUGIN": "false", "__CF_USER_TEXT_ENCODING": "0x1F5:0:0", "HOMEBREW_NO_AUTO_UPDATE": "1", "AGENT_ROOTDIRECTORY": "/Users/runner/work", "SYSTEM_TEAMPROJECTID": "5c6613e9-6ccf-48bd-81de-dbc3b0a6f957", "VSTS_PROCESS_LOOKUP_ID": "vsts_ebf1e834-725e-4a78-b68a-c769f181ca69", "AGENT_TOOLSDIRECTORY": "/Users/runner/hostedtoolcache", "SYSTEM_TEAMPROJECT": "Public", "AGENT_HOMEDIRECTORY": "/Users/runner/runners/2.202.0", "BUILD_SOURCEVERSIONMESSAGE": "Merge 1db78ab0302a0aaf31a97ba8553f31f01106bcdd into 6daabf0c925bfe67f7d87874ce014eb3212711e7", "BUILD_REPOSITORY_ID": "explosion/cython-blis", "JAVA_HOME_17_X64": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/17.0.2-8/x64/Contents/Home/", "SYSTEM_PULLREQUEST_TARGETBRANCH": "master", "agent.jobstatus": "Succeeded", "AGENT_LOGTOBLOBSTORAGESERVICE": "true", "SYSTEM_JOBDISPLAYNAME": "JSONL Python38Mac", "BUILD_REPOSITORY_LOCALPATH": "/Users/runner/work/1/s", "TASK_SKIPTRANSLATORFORCHECKOUT": "False", "AGENT_BUILDDIRECTORY": "/Users/runner/work/1", "PYTHON_VERSION": "3.8", "BUILD_REASON": "PullRequest", "SYSTEM_PIPELINESTARTTIME": "2022-04-07 06:44:25+00:00", "SYSTEM": "build", "USEPYTHONVERSION_PYTHONLOCATION": "/Users/runner/hostedtoolcache/Python/3.8.12/x64", "BUILD_SOURCESDIRECTORY": "/Users/runner/work/1/s", "AGENT_OS": "Darwin", "PATH": 
"/Users/runner/hostedtoolcache/Python/3.8.12/x64/bin:/Users/runner/hostedtoolcache/Python/3.8.12/x64:/usr/local/lib/ruby/gems/2.7.0/bin:/usr/local/opt/ruby@2.7/bin:/usr/local/opt/pipx_bin:/Users/runner/.cargo/bin:/usr/local/opt/curl/bin:/usr/local/bin:/usr/local/sbin:/Users/runner/bin:/Users/runner/.yarn/bin:/Users/runner/Library/Android/sdk/tools:/Users/runner/Library/Android/sdk/platform-tools:/Users/runner/Library/Android/sdk/ndk-bundle:/Library/Frameworks/Mono.framework/Versions/Current/Commands:/usr/bin:/bin:/usr/sbin:/sbin:/Users/runner/.dotnet/tools:/Users/runner/.ghcup/bin:/Users/runner/hostedtoolcache/stack/2.7.5/x64", "SYSTEM_PHASEATTEMPT": "1", "SYSTEM_ISSCHEDULED": "False", "SYSTEM_DEBUG": "false", "PERFLOG_LOCATION_SETTING": "RUNNER_PERFLOG", "GOROOT_1_15_X64": "/Users/runner/hostedtoolcache/go/1.15.15/x64", "SYSTEM_PULLREQUEST_SOURCEREPOSITORYURI": "https://github.com/explosion/cython-blis", "CONDA": "/usr/local/miniconda", "PWD": "/Users/runner/work/1/s/flame-blis", "EDGEWEBDRIVER": "/usr/local/share/edge_driver", "VM_ASSETS": "/usr/local/opt/runner/scripts", "DOTNET_ROOT": "/Users/runner/.dotnet", "SYSTEM_PULLREQUEST_ISFORK": "True", "BUILD_BUILDURI": "vstfs:///Build/Build/16987", "JAVA_HOME": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/8.0.322-6/x64/Contents/Home/", "SYSTEM_DEFINITIONID": "6", "VCPKG_INSTALLATION_ROOT": "/usr/local/share/vcpkg", "SYSTEM_STAGEID": "96ac2280-8cb4-5df5-99de-dd2da759617d", "AGENT_DISABLELOGPLUGIN_TESTFILEPUBLISHERPLUGIN": "true", "LANG": "en_US.UTF-8", "SYSTEM_ENABLEACCESSTOKEN": "SecretVariable", "XCODE_13_DEVELOPER_DIR": "/Applications/Xcode_13.2.1.app/Contents/Developer", "ImageOS": "macos11", "SYSTEM_TASKINSTANCENAME": "CmdLine5", "SYSTEM_POSTLINESSPEED": "10000", "RESOURCES_TRIGGERINGCATEGORY": "", "SYSTEM_PHASEDISPLAYNAME": "JSONL", "XPC_FLAGS": "0x0", "BUILD_REPOSITORY_NAME": "explosion/cython-blis", "SYSTEM_SERVERTYPE": "Hosted", "BUILD_REPOSITORY_URI": "https://github.com/explosion/cython-blis", 
"PIPELINE_WORKSPACE": "/Users/runner/work/1", "PIPX_HOME": "/usr/local/opt/pipx", "AGENT_WORKFOLDER": "/Users/runner/work", "BUILD_DEFINITIONNAME": "explosion.cython-blis", "SYSTEM_JOBNAME": "Python38Mac", "BUILD_REQUESTEDFOR": "GitHub", "GECKOWEBDRIVER": "/usr/local/opt/geckodriver/bin", "XPC_SERVICE_NAME": "0", "SYSTEM_TIMELINEID": "aacac669-7e0e-4d59-bd0d-94448850e4a9", "SYSTEM_ARTIFACTSDIRECTORY": "/Users/runner/work/1/a", "HOME": "/Users/runner", "SHLVL": "3", "AGENT_ID": "93", "AGENT_RETAINDEFAULTENCODING": "false", "GRAALVM_11_ROOT": "/Library/Java/JavaVirtualMachines/graalvm-ce-java11-22.0.0.2/Contents/Home/bin", "SYSTEM_JOBPOSITIONINPHASE": "1", "BUILD_BINARIESDIRECTORY": "/Users/runner/work/1/b", "BUILD_ARTIFACTSTAGINGDIRECTORY": "/Users/runner/work/1/a", "BUILD_REQUESTEDFORID": "38e7e9f7-fc06-4f5a-b6dd-1782f4ef7c25", "AGENT_USEWORKSPACEID": "true", "RESOURCES_TRIGGERINGALIAS": "", "BUILD_BUILDID": "16987", "LOGNAME": "runner", "GOROOT_1_16_X64": "/Users/runner/hostedtoolcache/go/1.16.15/x64", "SYSTEM_TASKINSTANCEID": "476bff7d-b32a-5e93-16d5-defdc40dd1cd", "BUILD_SOURCEVERSION": "2c06e79272a205923124f93e6d4ba9bc70f6a846", "LC_CTYPE": "en_US.UTF-8", "HOMEBREW_CLEANUP_PERIODIC_FULL_DAYS": "3650", "SYSTEM_PULLREQUEST_MERGEDAT": "", "HOMEBREW_CASK_OPTS": "--no-quarantine", "SYSTEM_DEFAULTWORKINGDIRECTORY": "/Users/runner/work/1/s", "POWERSHELL_DISTRIBUTION_CHANNEL": "Azure-DevOps-macos11", "SYSTEM_JOBID": "82ca4189-e9ff-5ba9-3895-5b644d1542b5", "SYSTEM_PULLREQUEST_PULLREQUESTID": "899994381", "ANDROID_NDK_HOME": "/Users/runner/Library/Android/sdk/ndk-bundle", "BOOTSTRAP_HASKELL_NONINTERACTIVE": "1", "GOROOT_1_18_X64": "/Users/runner/hostedtoolcache/go/1.18.0/x64", "SYSTEM_PULLREQUEST_SOURCEBRANCH": "update-to-blis-0.9.0", "SYSTEM_TOTALJOBSINPHASE": "2", "XCODE_11_DEVELOPER_DIR": "/Applications/Xcode_11.7.app/Contents/Developer", "SYSTEM_STAGEDISPLAYNAME": "__default", "AGENT_NAME": "Azure Pipelines 2", "SYSTEM_PLANID": "aacac669-7e0e-4d59-bd0d-94448850e4a9", 
"BUILD_DEFINITIONVERSION": "1", "SYSTEM_PHASEID": "ecb95708-c2a5-5456-f379-96cd8090c2a6", "ENDPOINT_URL_SYSTEMVSSCONNECTION": "https://dev.azure.com/explosion-ai/", "AGENT_JOBSTATUS": "Succeeded", "TASK_DISPLAYNAME": "Generate JSONL (Mac)", "SYSTEM_COLLECTIONID": "116cc368-5c0c-4eb4-bb44-7f3fa5bdce14", "BUILD_BUILDNUMBER": "20220407.1", "SYSTEM_PHASENAME": "JSONL", "DOTNET_MULTILEVEL_LOOKUP": "0", "SYSTEM_PARALLELEXECUTIONTYPE": "MultiConfiguration", "BUILD_CONTAINERID": "11784577", "AGENT_TASKRESTRICTIONSENFORCEMENTMODE": "Enabled", "_": "/Users/runner/hostedtoolcache/Python/3.8.12/x64/bin/python"}} {"compiler": "gcc", "source": "config/bulldozer/bli_cntx_init_bulldozer.c", "target": "obj/x86_64_no_skx/config/bulldozer/bli_cntx_init_bulldozer.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/excavator/bli_cntx_init_excavator.c", "target": "obj/x86_64_no_skx/config/excavator/bli_cntx_init_excavator.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/generic/bli_cntx_init_generic.c", "target": "obj/x86_64_no_skx/config/generic/bli_cntx_init_generic.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"config/haswell/bli_cntx_init_haswell.c", "target": "obj/x86_64_no_skx/config/haswell/bli_cntx_init_haswell.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/penryn/bli_cntx_init_penryn.c", "target": "obj/x86_64_no_skx/config/penryn/bli_cntx_init_penryn.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/piledriver/bli_cntx_init_piledriver.c", "target": "obj/x86_64_no_skx/config/piledriver/bli_cntx_init_piledriver.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/sandybridge/bli_cntx_init_sandybridge.c", "target": "obj/x86_64_no_skx/config/sandybridge/bli_cntx_init_sandybridge.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/steamroller/bli_cntx_init_steamroller.c", "target": "obj/x86_64_no_skx/config/steamroller/bli_cntx_init_steamroller.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_amaxv_zen_int.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_amaxv_zen_int.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_axpyv_zen_int.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int10.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_axpyv_zen_int10.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_copyv_zen_int.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_copyv_zen_int.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", 
"-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_dotv_zen_int.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int10.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_dotv_zen_int10.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotxv_zen_int.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_dotxv_zen_int.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int.c", "target": 
"obj/x86_64_no_skx/kernels/zen/1/bli_scalv_zen_int.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int10.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_scalv_zen_int10.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_setv_zen_int.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_setv_zen_int.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_swapv_zen_int8.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_swapv_zen_int8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_4.c", "target": "obj/x86_64_no_skx/kernels/zen/1f/bli_axpyf_zen_int_4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_5.c", "target": "obj/x86_64_no_skx/kernels/zen/1f/bli_axpyf_zen_int_5.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_8.c", "target": "obj/x86_64_no_skx/kernels/zen/1f/bli_axpyf_zen_int_8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_dotxf_zen_int_8.c", "target": "obj/x86_64_no_skx/kernels/zen/1f/bli_dotxf_zen_int_8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemm_small.c", "target": "obj/x86_64_no_skx/kernels/zen/3/bli_gemm_small.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemmt_small.c", "target": "obj/x86_64_no_skx/kernels/zen/3/bli_gemmt_small.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_trsm_small.c", "target": "obj/x86_64_no_skx/kernels/zen/3/bli_trsm_small.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c", "target": "obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c", "target": "obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c", "target": "obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c", "target": "obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c", "target": 
"obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c", "target": "obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c", "target": "obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c", "target": "obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", 
"-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", 
"-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", 
"-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", 
"-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c", "target": "obj/x86_64_no_skx/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c", "target": "obj/x86_64_no_skx/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1/bli_axpyv_penryn_int.c", "target": "obj/x86_64_no_skx/kernels/penryn/1/bli_axpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1/bli_dotv_penryn_int.c", "target": "obj/x86_64_no_skx/kernels/penryn/1/bli_dotv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpy2v_penryn_int.c", "target": "obj/x86_64_no_skx/kernels/penryn/1f/bli_axpy2v_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpyf_penryn_int.c", "target": "obj/x86_64_no_skx/kernels/penryn/1f/bli_axpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotaxpyv_penryn_int.c", "target": "obj/x86_64_no_skx/kernels/penryn/1f/bli_dotaxpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c", "target": "obj/x86_64_no_skx/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxf_penryn_int.c", "target": "obj/x86_64_no_skx/kernels/penryn/1f/bli_dotxf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c", "target": "obj/x86_64_no_skx/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64_no_skx/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64_no_skx/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64_no_skx/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64_no_skx/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c", "target": "obj/x86_64_no_skx/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c", "target": "obj/x86_64_no_skx/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/bli_cntx_haswell_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_addv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_amaxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_axpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_axpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_copyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_dotv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_dotxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_invertv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_scal2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_scalv_haswell_ref.o", 
"flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_setv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_subv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_swapv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_xpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1f/bli_axpy2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1f/bli_axpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1f/bli_dotaxpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1f/bli_dotxaxpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1f/bli_dotxf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1m/bli_packm_cxk_1er_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1m/bli_packm_cxk_bb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1m/bli_packm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1m/bli_unpackm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/3/bli_gemm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/3/bli_gemmsup_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/haswell/3/bli_gemmtrsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/3/bli_trsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/3/bb/bli_gemmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/3/bb/bli_gemmtrsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", 
"-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/3/bb/bli_trsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/ind/bli_gemm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/ind/bli_gemmtrsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/ind/bli_trsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/bli_cntx_sandybridge_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_addv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_amaxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_axpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_axpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_copyv_sandybridge_ref.o", 
"flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_dotv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_dotxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_invertv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_scal2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_scalv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_setv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_subv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_swapv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_xpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/sandybridge/1f/bli_axpy2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1f/bli_axpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1f/bli_dotaxpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1f/bli_dotxaxpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1f/bli_dotxf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1m/bli_packm_cxk_1er_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1m/bli_packm_cxk_bb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1m/bli_packm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1m/bli_unpackm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/3/bli_gemm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/3/bli_gemmsup_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/3/bli_gemmtrsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/3/bli_trsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/sandybridge/3/bb/bli_gemmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/3/bb/bli_gemmtrsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/3/bb/bli_trsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/ind/bli_gemm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/ind/bli_gemmtrsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/ind/bli_trsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/bli_cntx_penryn_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_addv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_amaxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_axpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_axpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", 
"-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_copyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_dotv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_dotxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_invertv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_scal2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_scalv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/penryn/1/bli_setv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_subv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_swapv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_xpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1f/bli_axpy2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1f/bli_axpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1f/bli_dotaxpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1f/bli_dotxaxpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1f/bli_dotxf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1m/bli_packm_cxk_1er_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1m/bli_packm_cxk_bb_penryn_ref.o", "flags": ["-O2", "-O3", 
"-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1m/bli_packm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1m/bli_unpackm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/3/bli_gemm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/3/bli_gemmsup_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/3/bli_gemmtrsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/3/bli_trsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/3/bb/bli_gemmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/3/bb/bli_gemmtrsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/3/bb/bli_trsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/ind/bli_gemm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/ind/bli_gemmtrsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/ind/bli_trsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/bli_cntx_excavator_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_addv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_amaxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_axpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_axpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_copyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_dotv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", 
"target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_dotxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_invertv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_scal2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/excavator/1/bli_scalv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_setv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_subv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/excavator/1/bli_swapv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_xpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1f/bli_axpy2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/excavator/1f/bli_axpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1f/bli_dotaxpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1f/bli_dotxaxpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/excavator/1f/bli_dotxf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1m/bli_packm_cxk_1er_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1m/bli_packm_cxk_bb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/excavator/1m/bli_packm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1m/bli_unpackm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/3/bli_gemm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/excavator/3/bli_gemmsup_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/3/bli_gemmtrsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/3/bli_trsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/excavator/3/bb/bli_gemmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/3/bb/bli_gemmtrsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/3/bb/bli_trsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/excavator/ind/bli_gemm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/ind/bli_gemmtrsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/ind/bli_trsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/steamroller/bli_cntx_steamroller_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_addv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_amaxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_axpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_axpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_copyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_dotv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_dotxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_invertv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_scal2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_scalv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_setv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_subv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_swapv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_xpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1f/bli_axpy2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1f/bli_axpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1f/bli_dotaxpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1f/bli_dotxaxpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1f/bli_dotxf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1m/bli_packm_cxk_1er_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1m/bli_packm_cxk_bb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1m/bli_packm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1m/bli_unpackm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/3/bli_gemm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/3/bli_gemmsup_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/3/bli_gemmtrsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/3/bli_trsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/3/bb/bli_gemmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/3/bb/bli_gemmtrsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/3/bb/bli_trsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/ind/bli_gemm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/ind/bli_gemmtrsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/ind/bli_trsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/bli_cntx_piledriver_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_addv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_amaxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_axpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_axpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_copyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_dotv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_dotxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_invertv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_scal2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_scalv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_setv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_subv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_swapv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_xpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/piledriver/1f/bli_axpy2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1f/bli_axpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1f/bli_dotaxpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/piledriver/1f/bli_dotxaxpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1f/bli_dotxf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1m/bli_packm_cxk_1er_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/piledriver/1m/bli_packm_cxk_bb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1m/bli_packm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1m/bli_unpackm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/piledriver/3/bli_gemm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/3/bli_gemmsup_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/3/bli_gemmtrsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/piledriver/3/bli_trsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/3/bb/bli_gemmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/3/bb/bli_gemmtrsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/piledriver/3/bb/bli_trsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/ind/bli_gemm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/ind/bli_gemmtrsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/piledriver/ind/bli_trsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/bli_cntx_bulldozer_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_addv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_amaxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_axpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_axpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_copyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_dotv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_dotxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_invertv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_scal2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_scalv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_setv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_subv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_swapv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_xpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1f/bli_axpy2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1f/bli_axpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1f/bli_dotaxpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1f/bli_dotxaxpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1f/bli_dotxf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1m/bli_packm_cxk_1er_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/bulldozer/1m/bli_packm_cxk_bb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1m/bli_packm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1m/bli_unpackm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/bulldozer/3/bli_gemm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/3/bli_gemmsup_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/3/bli_gemmtrsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/3/bli_trsm_bulldozer_ref.o", "flags": 
["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/3/bb/bli_gemmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/3/bb/bli_gemmtrsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/3/bb/bli_trsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", 
"-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/ind/bli_gemm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/ind/bli_gemmtrsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/ind/bli_trsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/bli_cntx_generic_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_addv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_amaxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/generic/1/bli_axpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_axpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_copyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_dotv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_dotxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_invertv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_scal2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_scalv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_setv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_subv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_swapv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", 
"target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_xpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1f/bli_axpy2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1f/bli_axpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1f/bli_dotaxpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1f/bli_dotxaxpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1f/bli_dotxf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1m/bli_packm_cxk_1er_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1m/bli_packm_cxk_bb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1m/bli_packm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1m/bli_unpackm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/3/bli_gemm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/3/bli_gemmsup_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/3/bli_gemmtrsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/3/bli_trsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/3/bb/bli_gemmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/3/bb/bli_gemmtrsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/3/bb/bli_trsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/ind/bli_gemm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/generic/ind/bli_gemmtrsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/ind/bli_trsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_check.c", "target": "obj/x86_64_no_skx/frame/0/bli_l0_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_fpa.c", "target": "obj/x86_64_no_skx/frame/0/bli_l0_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_oapi.c", "target": "obj/x86_64_no_skx/frame/0/bli_l0_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_tapi.c", "target": "obj/x86_64_no_skx/frame/0/bli_l0_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/copysc/bli_copysc.c", "target": "obj/x86_64_no_skx/frame/0/copysc/bli_copysc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_check.c", "target": "obj/x86_64_no_skx/frame/1/bli_l1v_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/x86_64_no_skx/frame/1/bli_l1v_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi.c", "target": 
"obj/x86_64_no_skx/frame/1/bli_l1v_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/x86_64_no_skx/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/x86_64_no_skx/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/x86_64_no_skx/frame/1/bli_l1v_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/x86_64_no_skx/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ex.c", "target": "obj/x86_64_no_skx/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_check.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_fpa.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_check.c", "target": "obj/x86_64_no_skx/frame/1f/bli_l1f_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_fpa.c", "target": "obj/x86_64_no_skx/frame/1f/bli_l1f_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi.c", "target": "obj/x86_64_no_skx/frame/1f/bli_l1f_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/x86_64_no_skx/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": "obj/x86_64_no_skx/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi.c", "target": 
"obj/x86_64_no_skx/frame/1f/bli_l1f_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/x86_64_no_skx/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/x86_64_no_skx/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_check.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_fpa.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_unb_var1.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_alloc.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_check.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_check.o", "flags": ["-O3", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cntl.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_init.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_init.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_part.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_part.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_scalar.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_scalar.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/x86_64_no_skx/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/x86_64_no_skx/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/1m/unpackm/bli_unpackm_cntl.c", "target": "obj/x86_64_no_skx/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": "obj/x86_64_no_skx/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/x86_64_no_skx/frame/1m/unpackm/bli_unpackm_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_check.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_fpa.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ba.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ex.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ex.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/x86_64_no_skx/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/x86_64_no_skx/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/x86_64_no_skx/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": 
"obj/x86_64_no_skx/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/x86_64_no_skx/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var1.c", "target": "obj/x86_64_no_skx/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var2.c", "target": "obj/x86_64_no_skx/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/x86_64_no_skx/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/x86_64_no_skx/frame/2/her/bli_her_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var2.c", "target": "obj/x86_64_no_skx/frame/2/her/bli_her_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/x86_64_no_skx/frame/2/her/bli_her_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/x86_64_no_skx/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/x86_64_no_skx/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var3.c", "target": "obj/x86_64_no_skx/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var4.c", "target": "obj/x86_64_no_skx/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/x86_64_no_skx/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var4.c", "target": "obj/x86_64_no_skx/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_var_oapi.c", "target": "obj/x86_64_no_skx/frame/2/her2/bli_her2_var_oapi.o", 
"flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/x86_64_no_skx/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var2.c", "target": "obj/x86_64_no_skx/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/x86_64_no_skx/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/x86_64_no_skx/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": "obj/x86_64_no_skx/frame/2/trmv/bli_trmv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/x86_64_no_skx/frame/2/trsv/bli_trsv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var2.c", "target": "obj/x86_64_no_skx/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/x86_64_no_skx/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/x86_64_no_skx/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/x86_64_no_skx/frame/2/trsv/bli_trsv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_blocksize.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_blocksize.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_check.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_cntl.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_direct.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_direct.o", "flags": 
["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ind.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_ind.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_int.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi_ex.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_packab.c", 
"target": "obj/x86_64_no_skx/frame/3/bli_l3_packab.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_prune.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_prune.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_schema.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_schema.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_int.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_a.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup_packm_a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_b.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_var.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup_ref.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var12.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup_var12.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_thrinfo.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_fpa.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_tapi.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var3.c", 
"target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_md.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/x86_64_no_skx/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/x86_64_no_skx/frame/3/hemm/bli_hemm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/symm/bli_symm_front.c", "target": "obj/x86_64_no_skx/frame/3/symm/bli_symm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/x86_64_no_skx/frame/3/trmm/bli_trmm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trmm/bli_trmm_ll_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/x86_64_no_skx/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_blk_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": 
"obj/x86_64_no_skx/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_front.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_apool.c", "target": "obj/x86_64_no_skx/frame/base/bli_apool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_arch.c", "target": "obj/x86_64_no_skx/frame/base/bli_arch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_array.c", "target": "obj/x86_64_no_skx/frame/base/bli_array.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_blksz.c", "target": "obj/x86_64_no_skx/frame/base/bli_blksz.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_check.c", "target": "obj/x86_64_no_skx/frame/base/bli_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_clock.c", "target": "obj/x86_64_no_skx/frame/base/bli_clock.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntl.c", "target": "obj/x86_64_no_skx/frame/base/bli_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntx.c", "target": "obj/x86_64_no_skx/frame/base/bli_cntx.o", 
"flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_const.c", "target": "obj/x86_64_no_skx/frame/base/bli_const.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cpuid.c", "target": "obj/x86_64_no_skx/frame/base/bli_cpuid.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_env.c", "target": "obj/x86_64_no_skx/frame/base/bli_env.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_error.c", "target": "obj/x86_64_no_skx/frame/base/bli_error.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/base/bli_func.c", "target": "obj/x86_64_no_skx/frame/base/bli_func.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_getopt.c", "target": "obj/x86_64_no_skx/frame/base/bli_getopt.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_gks.c", "target": "obj/x86_64_no_skx/frame/base/bli_gks.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_ind.c", "target": "obj/x86_64_no_skx/frame/base/bli_ind.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_info.c", "target": "obj/x86_64_no_skx/frame/base/bli_info.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_init.c", "target": "obj/x86_64_no_skx/frame/base/bli_init.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_machval.c", "target": "obj/x86_64_no_skx/frame/base/bli_machval.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_malloc.c", "target": "obj/x86_64_no_skx/frame/base/bli_malloc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_mbool.c", "target": "obj/x86_64_no_skx/frame/base/bli_mbool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_memsys.c", "target": "obj/x86_64_no_skx/frame/base/bli_memsys.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj.c", "target": "obj/x86_64_no_skx/frame/base/bli_obj.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj_scalar.c", "target": "obj/x86_64_no_skx/frame/base/bli_obj_scalar.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pack.c", "target": "obj/x86_64_no_skx/frame/base/bli_pack.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_param_map.c", "target": "obj/x86_64_no_skx/frame/base/bli_param_map.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_part.c", "target": "obj/x86_64_no_skx/frame/base/bli_part.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pba.c", "target": "obj/x86_64_no_skx/frame/base/bli_pba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pool.c", "target": "obj/x86_64_no_skx/frame/base/bli_pool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_prune.c", "target": "obj/x86_64_no_skx/frame/base/bli_prune.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_query.c", "target": "obj/x86_64_no_skx/frame/base/bli_query.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_rntm.c", "target": "obj/x86_64_no_skx/frame/base/bli_rntm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_sba.c", "target": "obj/x86_64_no_skx/frame/base/bli_sba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijm.c", "target": "obj/x86_64_no_skx/frame/base/bli_setgetijm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijv.c", "target": "obj/x86_64_no_skx/frame/base/bli_setgetijv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setri.c", "target": "obj/x86_64_no_skx/frame/base/bli_setri.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_string.c", "target": 
"obj/x86_64_no_skx/frame/base/bli_string.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_winsys.c", "target": "obj/x86_64_no_skx/frame/base/bli_winsys.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castm.c", "target": "obj/x86_64_no_skx/frame/base/cast/bli_castm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/x86_64_no_skx/frame/base/cast/bli_castnzm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castv.c", "target": "obj/x86_64_no_skx/frame/base/cast/bli_castv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_obj_check.c", "target": "obj/x86_64_no_skx/frame/base/check/bli_obj_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_part_check.c", "target": "obj/x86_64_no_skx/frame/base/check/bli_part_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_dlamch.c", "target": "obj/x86_64_no_skx/frame/base/noopt/bli_dlamch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_lsame.c", "target": "obj/x86_64_no_skx/frame/base/noopt/bli_lsame.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_slamch.c", "target": "obj/x86_64_no_skx/frame/base/noopt/bli_slamch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projm.c", "target": "obj/x86_64_no_skx/frame/base/proj/bli_projm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projv.c", "target": "obj/x86_64_no_skx/frame/base/proj/bli_projv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/x86_64_no_skx/frame/thread/bli_l3_decor_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": "obj/x86_64_no_skx/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_single.c", "target": 
"obj/x86_64_no_skx/frame/thread/bli_l3_decor_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": "obj/x86_64_no_skx/frame/thread/bli_l3_sup_decor_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_pthreads.c", "target": "obj/x86_64_no_skx/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/x86_64_no_skx/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_pthread.c", "target": "obj/x86_64_no_skx/frame/thread/bli_pthread.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm.c", "target": "obj/x86_64_no_skx/frame/thread/bli_thrcomm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/x86_64_no_skx/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/x86_64_no_skx/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/x86_64_no_skx/frame/thread/bli_thrcomm_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thread.c", "target": "obj/x86_64_no_skx/frame/thread/bli_thread.o", "flags": ["-O3", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo.c", "target": "obj/x86_64_no_skx/frame/thread/bli_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo_sup.c", "target": "obj/x86_64_no_skx/frame/thread/bli_thrinfo_sup.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_check.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_fpa.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/util/bli_util_oapi.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ba.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ba.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ex.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_unb_var1.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/make/darwin-x86_64_no_zen2.jsonl000066400000000000000000013134251427272030600235050ustar00rootroot00000000000000{"environment": {"SYSTEM_TEAMFOUNDATIONCOLLECTIONURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_JOBTIMEOUT": "60", "BUILD_SOURCEBRANCH": "refs/pull/69/merge", "SYSTEM_TASKDEFINITIONSURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_RESTRICTSECRETS": "True", "SYSTEM_JOBATTEMPT": "1", "SYSTEM_PULLREQUEST_SOURCECOMMITID": "1db78ab0302a0aaf31a97ba8553f31f01106bcdd", "AGENT_VERSION": "2.202.0", "BUILD_QUEUEDBY": "GitHub", "XCODE_12_DEVELOPER_DIR": "/Applications/Xcode_12.5.1.app/Contents/Developer", "SYSTEM_HOSTTYPE": "build", "SYSTEM_COLLECTIONURI": "https://dev.azure.com/explosion-ai/", "NVM_CD_FLAGS": "", "ANDROID_HOME": "/Users/runner/Library/Android/sdk", "BUILD_REPOSITORY_GIT_SUBMODULECHECKOUT": "False", "SYSTEM_JOBPARALLELISMTAG": "Public", "CHROMEWEBDRIVER": "/usr/local/Caskroom/chromedriver/100.0.4896.60", "GOROOT_1_17_X64": "/Users/runner/hostedtoolcache/go/1.17.8/x64", "SHELL": 
"/bin/bash", "PIPX_BIN_DIR": "/usr/local/opt/pipx_bin", "BUILD_STAGINGDIRECTORY": "/Users/runner/work/1/a", "TMPDIR": "/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/", "SYSTEM_PULLREQUEST_PULLREQUESTNUMBER": "69", "AGENT_MACHINENAME": "Mac-1649312315670", "SYSTEM_WORKFOLDER": "/Users/runner/work", "COMMON_TESTRESULTSDIRECTORY": "/Users/runner/work/1/TestResults", "AGENT_JOBNAME": "JSONL Python38Mac", "IMAGENAME": "macos-latest", "ANDROID_SDK_ROOT": "/Users/runner/Library/Android/sdk", "OLDPWD": "/Users/runner/work/1/s", "RCT_NO_LAUNCH_PACKAGER": "1", "JAVA_HOME_8_X64": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/8.0.322-6/x64/Contents/Home/", "MSDEPLOY_HTTP_USER_AGENT": "VSTS_116cc368-5c0c-4eb4-bb44-7f3fa5bdce14_build_6_0", "BUILD_SOURCEVERSIONAUTHOR": "Dani\u00ebl de Kok", "AGENT_OSARCHITECTURE": "X64", "NUNIT_BASE_PATH": "/Library/Developer/nunit", "RUNNER_PERFLOG": "/usr/local/opt/runner/perflog", "BUILD_REQUESTEDFOREMAIL": "", "LC_ALL": "en_US.UTF-8", "NUNIT3_PATH": "/Library/Developer/nunit/3.6.0", "AGENT_ACCEPTTEEEULA": "True", "AGENT_READONLYVARIABLES": "true", "SYSTEM_STAGEATTEMPT": "1", "RUNNER_TOOL_CACHE": "/Users/runner/hostedtoolcache", "JAVA_HOME_11_X64": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/11.0.14-101/x64/Contents/Home/", "SYSTEM_DEFINITIONNAME": "explosion.cython-blis", "SYSTEM_CULTURE": "en-US", "GIT_TERMINAL_PROMPT": "0", "NVM_DIR": "/Users/runner/.nvm", "USER": "runner", "BUILD_SOURCEBRANCHNAME": "merge", "AGENT_TEMPDIRECTORY": "/Users/runner/work/_temp", "BUILD_REPOSITORY_PROVIDER": "GitHub", "SYSTEM_JOBIDENTIFIER": "JSONL.Python38Mac", "TF_BUILD": "True", "SYSTEM_TEAMFOUNDATIONSERVERURI": "https://dev.azure.com/explosion-ai/", "ANDROID_NDK_ROOT": "/Users/runner/Library/Android/sdk/ndk-bundle", "AZURE_HTTP_USER_AGENT": "VSTS_116cc368-5c0c-4eb4-bb44-7f3fa5bdce14_build_6_0", "SYSTEM_TASKDISPLAYNAME": "Generate JSONL (Mac)", "BUILD_QUEUEDBYID": "38e7e9f7-fc06-4f5a-b6dd-1782f4ef7c25", "ImageVersion": "20220402.1", 
"ANDROID_NDK_LATEST_HOME": "/Users/runner/Library/Android/sdk/ndk/23.1.7779620", "SYSTEM_STAGENAME": "__default", "SSH_AUTH_SOCK": "/private/tmp/com.apple.launchd.YQrQpXKfSr/Listeners", "AGENT_DISABLELOGPLUGIN_TESTRESULTLOGPLUGIN": "false", "__CF_USER_TEXT_ENCODING": "0x1F5:0:0", "HOMEBREW_NO_AUTO_UPDATE": "1", "AGENT_ROOTDIRECTORY": "/Users/runner/work", "SYSTEM_TEAMPROJECTID": "5c6613e9-6ccf-48bd-81de-dbc3b0a6f957", "VSTS_PROCESS_LOOKUP_ID": "vsts_ebf1e834-725e-4a78-b68a-c769f181ca69", "AGENT_TOOLSDIRECTORY": "/Users/runner/hostedtoolcache", "SYSTEM_TEAMPROJECT": "Public", "AGENT_HOMEDIRECTORY": "/Users/runner/runners/2.202.0", "BUILD_SOURCEVERSIONMESSAGE": "Merge 1db78ab0302a0aaf31a97ba8553f31f01106bcdd into 6daabf0c925bfe67f7d87874ce014eb3212711e7", "BUILD_REPOSITORY_ID": "explosion/cython-blis", "JAVA_HOME_17_X64": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/17.0.2-8/x64/Contents/Home/", "SYSTEM_PULLREQUEST_TARGETBRANCH": "master", "agent.jobstatus": "Succeeded", "AGENT_LOGTOBLOBSTORAGESERVICE": "true", "SYSTEM_JOBDISPLAYNAME": "JSONL Python38Mac", "BUILD_REPOSITORY_LOCALPATH": "/Users/runner/work/1/s", "TASK_SKIPTRANSLATORFORCHECKOUT": "False", "AGENT_BUILDDIRECTORY": "/Users/runner/work/1", "PYTHON_VERSION": "3.8", "BUILD_REASON": "PullRequest", "SYSTEM_PIPELINESTARTTIME": "2022-04-07 06:44:25+00:00", "SYSTEM": "build", "USEPYTHONVERSION_PYTHONLOCATION": "/Users/runner/hostedtoolcache/Python/3.8.12/x64", "BUILD_SOURCESDIRECTORY": "/Users/runner/work/1/s", "AGENT_OS": "Darwin", "PATH": 
"/Users/runner/hostedtoolcache/Python/3.8.12/x64/bin:/Users/runner/hostedtoolcache/Python/3.8.12/x64:/usr/local/lib/ruby/gems/2.7.0/bin:/usr/local/opt/ruby@2.7/bin:/usr/local/opt/pipx_bin:/Users/runner/.cargo/bin:/usr/local/opt/curl/bin:/usr/local/bin:/usr/local/sbin:/Users/runner/bin:/Users/runner/.yarn/bin:/Users/runner/Library/Android/sdk/tools:/Users/runner/Library/Android/sdk/platform-tools:/Users/runner/Library/Android/sdk/ndk-bundle:/Library/Frameworks/Mono.framework/Versions/Current/Commands:/usr/bin:/bin:/usr/sbin:/sbin:/Users/runner/.dotnet/tools:/Users/runner/.ghcup/bin:/Users/runner/hostedtoolcache/stack/2.7.5/x64", "SYSTEM_PHASEATTEMPT": "1", "SYSTEM_ISSCHEDULED": "False", "SYSTEM_DEBUG": "false", "PERFLOG_LOCATION_SETTING": "RUNNER_PERFLOG", "GOROOT_1_15_X64": "/Users/runner/hostedtoolcache/go/1.15.15/x64", "SYSTEM_PULLREQUEST_SOURCEREPOSITORYURI": "https://github.com/explosion/cython-blis", "CONDA": "/usr/local/miniconda", "PWD": "/Users/runner/work/1/s/flame-blis", "EDGEWEBDRIVER": "/usr/local/share/edge_driver", "VM_ASSETS": "/usr/local/opt/runner/scripts", "DOTNET_ROOT": "/Users/runner/.dotnet", "SYSTEM_PULLREQUEST_ISFORK": "True", "BUILD_BUILDURI": "vstfs:///Build/Build/16987", "JAVA_HOME": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/8.0.322-6/x64/Contents/Home/", "SYSTEM_DEFINITIONID": "6", "VCPKG_INSTALLATION_ROOT": "/usr/local/share/vcpkg", "SYSTEM_STAGEID": "96ac2280-8cb4-5df5-99de-dd2da759617d", "AGENT_DISABLELOGPLUGIN_TESTFILEPUBLISHERPLUGIN": "true", "LANG": "en_US.UTF-8", "SYSTEM_ENABLEACCESSTOKEN": "SecretVariable", "XCODE_13_DEVELOPER_DIR": "/Applications/Xcode_13.2.1.app/Contents/Developer", "ImageOS": "macos11", "SYSTEM_TASKINSTANCENAME": "CmdLine5", "SYSTEM_POSTLINESSPEED": "10000", "RESOURCES_TRIGGERINGCATEGORY": "", "SYSTEM_PHASEDISPLAYNAME": "JSONL", "XPC_FLAGS": "0x0", "BUILD_REPOSITORY_NAME": "explosion/cython-blis", "SYSTEM_SERVERTYPE": "Hosted", "BUILD_REPOSITORY_URI": "https://github.com/explosion/cython-blis", 
"PIPELINE_WORKSPACE": "/Users/runner/work/1", "PIPX_HOME": "/usr/local/opt/pipx", "AGENT_WORKFOLDER": "/Users/runner/work", "BUILD_DEFINITIONNAME": "explosion.cython-blis", "SYSTEM_JOBNAME": "Python38Mac", "BUILD_REQUESTEDFOR": "GitHub", "GECKOWEBDRIVER": "/usr/local/opt/geckodriver/bin", "XPC_SERVICE_NAME": "0", "SYSTEM_TIMELINEID": "aacac669-7e0e-4d59-bd0d-94448850e4a9", "SYSTEM_ARTIFACTSDIRECTORY": "/Users/runner/work/1/a", "HOME": "/Users/runner", "SHLVL": "3", "AGENT_ID": "93", "AGENT_RETAINDEFAULTENCODING": "false", "GRAALVM_11_ROOT": "/Library/Java/JavaVirtualMachines/graalvm-ce-java11-22.0.0.2/Contents/Home/bin", "SYSTEM_JOBPOSITIONINPHASE": "1", "BUILD_BINARIESDIRECTORY": "/Users/runner/work/1/b", "BUILD_ARTIFACTSTAGINGDIRECTORY": "/Users/runner/work/1/a", "BUILD_REQUESTEDFORID": "38e7e9f7-fc06-4f5a-b6dd-1782f4ef7c25", "AGENT_USEWORKSPACEID": "true", "RESOURCES_TRIGGERINGALIAS": "", "BUILD_BUILDID": "16987", "LOGNAME": "runner", "GOROOT_1_16_X64": "/Users/runner/hostedtoolcache/go/1.16.15/x64", "SYSTEM_TASKINSTANCEID": "476bff7d-b32a-5e93-16d5-defdc40dd1cd", "BUILD_SOURCEVERSION": "2c06e79272a205923124f93e6d4ba9bc70f6a846", "LC_CTYPE": "en_US.UTF-8", "HOMEBREW_CLEANUP_PERIODIC_FULL_DAYS": "3650", "SYSTEM_PULLREQUEST_MERGEDAT": "", "HOMEBREW_CASK_OPTS": "--no-quarantine", "SYSTEM_DEFAULTWORKINGDIRECTORY": "/Users/runner/work/1/s", "POWERSHELL_DISTRIBUTION_CHANNEL": "Azure-DevOps-macos11", "SYSTEM_JOBID": "82ca4189-e9ff-5ba9-3895-5b644d1542b5", "SYSTEM_PULLREQUEST_PULLREQUESTID": "899994381", "ANDROID_NDK_HOME": "/Users/runner/Library/Android/sdk/ndk-bundle", "BOOTSTRAP_HASKELL_NONINTERACTIVE": "1", "GOROOT_1_18_X64": "/Users/runner/hostedtoolcache/go/1.18.0/x64", "SYSTEM_PULLREQUEST_SOURCEBRANCH": "update-to-blis-0.9.0", "SYSTEM_TOTALJOBSINPHASE": "2", "XCODE_11_DEVELOPER_DIR": "/Applications/Xcode_11.7.app/Contents/Developer", "SYSTEM_STAGEDISPLAYNAME": "__default", "AGENT_NAME": "Azure Pipelines 2", "SYSTEM_PLANID": "aacac669-7e0e-4d59-bd0d-94448850e4a9", 
"BUILD_DEFINITIONVERSION": "1", "SYSTEM_PHASEID": "ecb95708-c2a5-5456-f379-96cd8090c2a6", "ENDPOINT_URL_SYSTEMVSSCONNECTION": "https://dev.azure.com/explosion-ai/", "AGENT_JOBSTATUS": "Succeeded", "TASK_DISPLAYNAME": "Generate JSONL (Mac)", "SYSTEM_COLLECTIONID": "116cc368-5c0c-4eb4-bb44-7f3fa5bdce14", "BUILD_BUILDNUMBER": "20220407.1", "SYSTEM_PHASENAME": "JSONL", "DOTNET_MULTILEVEL_LOOKUP": "0", "SYSTEM_PARALLELEXECUTIONTYPE": "MultiConfiguration", "BUILD_CONTAINERID": "11784577", "AGENT_TASKRESTRICTIONSENFORCEMENTMODE": "Enabled", "_": "/Users/runner/hostedtoolcache/Python/3.8.12/x64/bin/python"}} {"compiler": "gcc", "source": "config/bulldozer/bli_cntx_init_bulldozer.c", "target": "obj/x86_64_no_zen2/config/bulldozer/bli_cntx_init_bulldozer.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/excavator/bli_cntx_init_excavator.c", "target": "obj/x86_64_no_zen2/config/excavator/bli_cntx_init_excavator.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/generic/bli_cntx_init_generic.c", "target": "obj/x86_64_no_zen2/config/generic/bli_cntx_init_generic.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"config/haswell/bli_cntx_init_haswell.c", "target": "obj/x86_64_no_zen2/config/haswell/bli_cntx_init_haswell.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/penryn/bli_cntx_init_penryn.c", "target": "obj/x86_64_no_zen2/config/penryn/bli_cntx_init_penryn.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/piledriver/bli_cntx_init_piledriver.c", "target": "obj/x86_64_no_zen2/config/piledriver/bli_cntx_init_piledriver.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/sandybridge/bli_cntx_init_sandybridge.c", "target": "obj/x86_64_no_zen2/config/sandybridge/bli_cntx_init_sandybridge.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/skx/bli_cntx_init_skx.c", "target": "obj/x86_64_no_zen2/config/skx/bli_cntx_init_skx.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/steamroller/bli_cntx_init_steamroller.c", "target": "obj/x86_64_no_zen2/config/steamroller/bli_cntx_init_steamroller.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/zen/bli_cntx_init_zen.c", "target": "obj/x86_64_no_zen2/config/zen/bli_cntx_init_zen.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c", "target": "obj/x86_64_no_zen2/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_dgemm_skx_asm_16x14.c", "target": "obj/x86_64_no_zen2/kernels/skx/3/bli_dgemm_skx_asm_16x14.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", 
"-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c", "target": "obj/x86_64_no_zen2/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c", "target": "obj/x86_64_no_zen2/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c", "target": "obj/x86_64_no_zen2/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/penryn/1/bli_axpyv_penryn_int.c", "target": "obj/x86_64_no_zen2/kernels/penryn/1/bli_axpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1/bli_dotv_penryn_int.c", "target": "obj/x86_64_no_zen2/kernels/penryn/1/bli_dotv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpy2v_penryn_int.c", "target": "obj/x86_64_no_zen2/kernels/penryn/1f/bli_axpy2v_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpyf_penryn_int.c", "target": "obj/x86_64_no_zen2/kernels/penryn/1f/bli_axpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/penryn/1f/bli_dotaxpyv_penryn_int.c", "target": "obj/x86_64_no_zen2/kernels/penryn/1f/bli_dotaxpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c", "target": "obj/x86_64_no_zen2/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxf_penryn_int.c", "target": "obj/x86_64_no_zen2/kernels/penryn/1f/bli_dotxf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen2/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen2/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen2/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen2/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen2/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c", "target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c", "target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c", "target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c", "target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c", "target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c", "target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c", "target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c", 
"target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.o", 
"flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.o", "flags": ["-O2", 
"-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c", "target": 
"obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_amaxv_zen_int.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_amaxv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_axpyv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int10.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_axpyv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_copyv_zen_int.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_copyv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_dotv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int10.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_dotv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotxv_zen_int.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_dotxv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_scalv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int10.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_scalv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_setv_zen_int.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_setv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_swapv_zen_int8.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_swapv_zen_int8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_4.c", "target": "obj/x86_64_no_zen2/kernels/zen/1f/bli_axpyf_zen_int_4.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", 
"-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_5.c", "target": "obj/x86_64_no_zen2/kernels/zen/1f/bli_axpyf_zen_int_5.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_8.c", "target": "obj/x86_64_no_zen2/kernels/zen/1f/bli_axpyf_zen_int_8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_dotxf_zen_int_8.c", "target": "obj/x86_64_no_zen2/kernels/zen/1f/bli_dotxf_zen_int_8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemm_small.c", "target": 
"obj/x86_64_no_zen2/kernels/zen/3/bli_gemm_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemmt_small.c", "target": "obj/x86_64_no_zen2/kernels/zen/3/bli_gemmt_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_trsm_small.c", "target": "obj/x86_64_no_zen2/kernels/zen/3/bli_trsm_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c", "target": "obj/x86_64_no_zen2/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c", "target": "obj/x86_64_no_zen2/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/bli_cntx_skx_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_addv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_amaxv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", 
"-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_axpbyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_axpyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_copyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", 
"-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_dotv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_dotxv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_invertv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", 
"-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_scal2v_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_scalv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_setv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", 
"-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_subv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_swapv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_xpbyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", 
"-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1f/bli_axpy2v_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1f/bli_axpyf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1f/bli_dotaxpyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", 
"-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1f/bli_dotxaxpyf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1f/bli_dotxf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1m/bli_packm_cxk_1er_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", 
"-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1m/bli_packm_cxk_bb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1m/bli_packm_cxk_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1m/bli_unpackm_cxk_skx_ref.o", "flags": ["-O2", "-O3", 
"-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/3/bli_gemm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/3/bli_gemmsup_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/3/bli_gemmtrsm_skx_ref.o", "flags": ["-O2", 
"-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/3/bli_trsm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/3/bb/bli_gemmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/skx/3/bb/bli_gemmtrsmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/3/bb/bli_trsmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/ind/bli_gemm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/ind/bli_gemmtrsm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/ind/bli_trsm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/bli_cntx_haswell_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_addv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", 
"-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_amaxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_axpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_axpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_copyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_dotv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_dotxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_invertv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_scal2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_scalv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_setv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_subv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_swapv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_xpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1f/bli_axpy2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1f/bli_axpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1f/bli_dotaxpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", 
"-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1f/bli_dotxaxpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1f/bli_dotxf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1m/bli_packm_cxk_1er_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1m/bli_packm_cxk_bb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1m/bli_packm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1m/bli_unpackm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/3/bli_gemm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/3/bli_gemmsup_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/3/bli_gemmtrsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/3/bli_trsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/3/bb/bli_gemmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/3/bb/bli_gemmtrsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/3/bb/bli_trsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/ind/bli_gemm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/ind/bli_gemmtrsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", 
"target": "obj/x86_64_no_zen2/ref_kernels/haswell/ind/bli_trsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/bli_cntx_sandybridge_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_addv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_amaxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_axpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_axpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_copyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_dotv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_dotxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_invertv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_scal2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_scalv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_setv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_subv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_swapv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_xpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1f/bli_axpy2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1f/bli_axpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1f/bli_dotaxpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1f/bli_dotxaxpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1f/bli_dotxf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1m/bli_packm_cxk_1er_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1m/bli_packm_cxk_bb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/sandybridge/1m/bli_packm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1m/bli_unpackm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/3/bli_gemm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/3/bli_gemmsup_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/3/bli_gemmtrsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/3/bli_trsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/3/bb/bli_gemmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/3/bb/bli_gemmtrsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/3/bb/bli_trsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/ind/bli_gemm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/ind/bli_gemmtrsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/ind/bli_trsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/bli_cntx_penryn_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_addv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_amaxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_axpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_axpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_copyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_dotv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_dotxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_invertv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_scal2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_scalv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_setv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_subv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_swapv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_xpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1f/bli_axpy2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1f/bli_axpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1f/bli_dotaxpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1f/bli_dotxaxpyf_penryn_ref.o", "flags": ["-O2", "-O3", 
"-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1f/bli_dotxf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1m/bli_packm_cxk_1er_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1m/bli_packm_cxk_bb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1m/bli_packm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1m/bli_unpackm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/3/bli_gemm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/3/bli_gemmsup_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/3/bli_gemmtrsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/3/bli_trsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/3/bb/bli_gemmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/3/bb/bli_gemmtrsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/3/bb/bli_trsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/ind/bli_gemm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/ind/bli_gemmtrsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/ind/bli_trsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/bli_cntx_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_addv_zen_ref.o", "flags": ["-O2", 
"-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_amaxv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_axpbyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_axpyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_copyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_dotv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_dotxv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_invertv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_scal2v_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_scalv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_setv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_subv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_swapv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_xpbyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", 
"-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1f/bli_axpy2v_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1f/bli_axpyf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1f/bli_dotaxpyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1f/bli_dotxaxpyf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1f/bli_dotxf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1m/bli_packm_cxk_1er_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1m/bli_packm_cxk_bb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1m/bli_packm_cxk_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1m/bli_unpackm_cxk_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/3/bli_gemm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/3/bli_gemmsup_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/3/bli_gemmtrsm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/3/bli_trsm_zen_ref.o", "flags": ["-O2", 
"-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/3/bb/bli_gemmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/3/bb/bli_gemmtrsmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/3/bb/bli_trsmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/ind/bli_gemm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/ind/bli_gemmtrsm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/ind/bli_trsm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/bli_cntx_excavator_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_addv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_amaxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_axpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_axpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_copyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_dotv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_dotxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_invertv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_scal2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_scalv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_setv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_subv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_swapv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_xpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/excavator/1f/bli_axpy2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1f/bli_axpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1f/bli_dotaxpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/excavator/1f/bli_dotxaxpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1f/bli_dotxf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1m/bli_packm_cxk_1er_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/excavator/1m/bli_packm_cxk_bb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1m/bli_packm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1m/bli_unpackm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/excavator/3/bli_gemm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/3/bli_gemmsup_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/3/bli_gemmtrsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/excavator/3/bli_trsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/3/bb/bli_gemmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/3/bb/bli_gemmtrsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/excavator/3/bb/bli_trsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/ind/bli_gemm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/ind/bli_gemmtrsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/excavator/ind/bli_trsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/bli_cntx_steamroller_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_addv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_amaxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_axpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_axpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_copyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_dotv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_dotxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_invertv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_scal2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_scalv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_setv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_subv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_swapv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_xpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1f/bli_axpy2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1f/bli_axpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1f/bli_dotaxpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1f/bli_dotxaxpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1f/bli_dotxf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1m/bli_packm_cxk_1er_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1m/bli_packm_cxk_bb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1m/bli_packm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1m/bli_unpackm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", 
"-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/3/bli_gemm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/3/bli_gemmsup_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/3/bli_gemmtrsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/3/bli_trsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/3/bb/bli_gemmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/3/bb/bli_gemmtrsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/3/bb/bli_trsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/ind/bli_gemm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/ind/bli_gemmtrsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/ind/bli_trsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/bli_cntx_piledriver_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_addv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_amaxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_axpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_axpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_copyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_dotv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_dotxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_invertv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_scal2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_scalv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_setv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_subv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_swapv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_xpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1f/bli_axpy2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1f/bli_axpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1f/bli_dotaxpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1f/bli_dotxaxpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1f/bli_dotxf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1m/bli_packm_cxk_1er_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1m/bli_packm_cxk_bb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1m/bli_packm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1m/bli_unpackm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/3/bli_gemm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/3/bli_gemmsup_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/3/bli_gemmtrsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/3/bli_trsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/3/bb/bli_gemmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/3/bb/bli_gemmtrsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/3/bb/bli_trsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/ind/bli_gemm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/ind/bli_gemmtrsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/ind/bli_trsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/bli_cntx_bulldozer_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_addv_bulldozer_ref.o", "flags": ["-O2", 
"-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_amaxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_axpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_axpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", 
"-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_copyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_dotv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_dotxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_invertv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_scal2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_scalv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_setv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_subv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_swapv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_xpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1f/bli_axpy2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1f/bli_axpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1f/bli_dotaxpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1f/bli_dotxaxpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1f/bli_dotxf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1m/bli_packm_cxk_1er_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1m/bli_packm_cxk_bb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1m/bli_packm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1m/bli_unpackm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/3/bli_gemm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/3/bli_gemmsup_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/3/bli_gemmtrsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/3/bli_trsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/3/bb/bli_gemmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/bulldozer/3/bb/bli_gemmtrsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/3/bb/bli_trsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/ind/bli_gemm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/bulldozer/ind/bli_gemmtrsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/ind/bli_trsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/bli_cntx_generic_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_addv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_amaxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_axpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_axpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/generic/1/bli_copyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_dotv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_dotxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_invertv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_scal2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_scalv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_setv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_subv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_swapv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_xpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1f/bli_axpy2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1f/bli_axpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1f/bli_dotaxpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1f/bli_dotxaxpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1f/bli_dotxf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1m/bli_packm_cxk_1er_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1m/bli_packm_cxk_bb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1m/bli_packm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/generic/1m/bli_unpackm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/3/bli_gemm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/3/bli_gemmsup_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/3/bli_gemmtrsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/3/bli_trsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/3/bb/bli_gemmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/3/bb/bli_gemmtrsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/3/bb/bli_trsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/ind/bli_gemm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/ind/bli_gemmtrsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/ind/bli_trsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_check.c", "target": "obj/x86_64_no_zen2/frame/0/bli_l0_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_fpa.c", "target": "obj/x86_64_no_zen2/frame/0/bli_l0_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_oapi.c", "target": "obj/x86_64_no_zen2/frame/0/bli_l0_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_tapi.c", "target": "obj/x86_64_no_zen2/frame/0/bli_l0_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/copysc/bli_copysc.c", "target": "obj/x86_64_no_zen2/frame/0/copysc/bli_copysc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_check.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ex.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_check.c", "target": "obj/x86_64_no_zen2/frame/1d/bli_l1d_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_fpa.c", "target": 
"obj/x86_64_no_zen2/frame/1d/bli_l1d_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/x86_64_no_zen2/frame/1d/bli_l1d_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/x86_64_no_zen2/frame/1d/bli_l1d_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi.c", "target": "obj/x86_64_no_zen2/frame/1d/bli_l1d_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": "obj/x86_64_no_zen2/frame/1d/bli_l1d_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_check.c", "target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_fpa.c", "target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi.c", "target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi.c", "target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_check.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_fpa.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": 
"obj/x86_64_no_zen2/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_alloc.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_check.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cntl.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_init.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_init.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_part.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_part.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/1m/packm/bli_packm_scalar.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_scalar.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/x86_64_no_zen2/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/x86_64_no_zen2/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cntl.c", "target": "obj/x86_64_no_zen2/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": "obj/x86_64_no_zen2/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/x86_64_no_zen2/frame/1m/unpackm/bli_unpackm_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_check.c", "target": "obj/x86_64_no_zen2/frame/2/bli_l2_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_fpa.c", "target": "obj/x86_64_no_zen2/frame/2/bli_l2_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/bli_l2_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ba.c", "target": "obj/x86_64_no_zen2/frame/2/bli_l2_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ex.c", "target": "obj/x86_64_no_zen2/frame/2/bli_l2_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi.c", "target": "obj/x86_64_no_zen2/frame/2/bli_l2_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/x86_64_no_zen2/frame/2/bli_l2_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ex.c", "target": "obj/x86_64_no_zen2/frame/2/bli_l2_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/x86_64_no_zen2/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/x86_64_no_zen2/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": "obj/x86_64_no_zen2/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/2/ger/bli_ger_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var2.c", "target": "obj/x86_64_no_zen2/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3.c", "target": 
"obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/2/her/bli_her_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var2.c", "target": "obj/x86_64_no_zen2/frame/2/her/bli_her_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/her/bli_her_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/x86_64_no_zen2/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var3.c", "target": "obj/x86_64_no_zen2/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var4.c", "target": "obj/x86_64_no_zen2/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/x86_64_no_zen2/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var4.c", "target": "obj/x86_64_no_zen2/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_var_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/her2/bli_her2_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var2.c", "target": "obj/x86_64_no_zen2/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/x86_64_no_zen2/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/x86_64_no_zen2/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/trmv/bli_trmv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/2/trsv/bli_trsv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var2.c", "target": "obj/x86_64_no_zen2/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/x86_64_no_zen2/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/x86_64_no_zen2/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/trsv/bli_trsv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_blocksize.c", 
"target": "obj/x86_64_no_zen2/frame/3/bli_l3_blocksize.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_check.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_cntl.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_direct.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_direct.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ind.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_ind.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_int.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi_ex.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_packab.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_packab.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_prune.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_prune.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_schema.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_schema.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_int.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_a.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup_packm_a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_b.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_var.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup_ref.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var12.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup_var12.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi.c", "target": 
"obj/x86_64_no_zen2/frame/3/bli_l3_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_thrinfo.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_fpa.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_tapi.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var3.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_md.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": 
"obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/x86_64_no_zen2/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/x86_64_no_zen2/frame/3/hemm/bli_hemm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/symm/bli_symm_front.c", "target": "obj/x86_64_no_zen2/frame/3/symm/bli_symm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/x86_64_no_zen2/frame/3/trmm/bli_trmm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trmm/bli_trmm_ll_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": 
["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/x86_64_no_zen2/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_blk_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_front.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_apool.c", "target": "obj/x86_64_no_zen2/frame/base/bli_apool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_arch.c", "target": "obj/x86_64_no_zen2/frame/base/bli_arch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_array.c", "target": "obj/x86_64_no_zen2/frame/base/bli_array.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_blksz.c", "target": "obj/x86_64_no_zen2/frame/base/bli_blksz.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_check.c", "target": "obj/x86_64_no_zen2/frame/base/bli_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_clock.c", "target": "obj/x86_64_no_zen2/frame/base/bli_clock.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntl.c", "target": "obj/x86_64_no_zen2/frame/base/bli_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntx.c", "target": "obj/x86_64_no_zen2/frame/base/bli_cntx.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_const.c", "target": "obj/x86_64_no_zen2/frame/base/bli_const.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cpuid.c", "target": "obj/x86_64_no_zen2/frame/base/bli_cpuid.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_env.c", "target": "obj/x86_64_no_zen2/frame/base/bli_env.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_error.c", "target": "obj/x86_64_no_zen2/frame/base/bli_error.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_func.c", "target": "obj/x86_64_no_zen2/frame/base/bli_func.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_getopt.c", "target": "obj/x86_64_no_zen2/frame/base/bli_getopt.o", "flags": ["-O3", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_gks.c", "target": "obj/x86_64_no_zen2/frame/base/bli_gks.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_ind.c", "target": "obj/x86_64_no_zen2/frame/base/bli_ind.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_info.c", "target": "obj/x86_64_no_zen2/frame/base/bli_info.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_init.c", "target": "obj/x86_64_no_zen2/frame/base/bli_init.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_machval.c", "target": 
"obj/x86_64_no_zen2/frame/base/bli_machval.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_malloc.c", "target": "obj/x86_64_no_zen2/frame/base/bli_malloc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_mbool.c", "target": "obj/x86_64_no_zen2/frame/base/bli_mbool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_memsys.c", "target": "obj/x86_64_no_zen2/frame/base/bli_memsys.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj.c", "target": "obj/x86_64_no_zen2/frame/base/bli_obj.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj_scalar.c", "target": "obj/x86_64_no_zen2/frame/base/bli_obj_scalar.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pack.c", "target": "obj/x86_64_no_zen2/frame/base/bli_pack.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_param_map.c", "target": "obj/x86_64_no_zen2/frame/base/bli_param_map.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_part.c", "target": "obj/x86_64_no_zen2/frame/base/bli_part.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pba.c", "target": "obj/x86_64_no_zen2/frame/base/bli_pba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pool.c", "target": "obj/x86_64_no_zen2/frame/base/bli_pool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_prune.c", "target": "obj/x86_64_no_zen2/frame/base/bli_prune.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_query.c", "target": "obj/x86_64_no_zen2/frame/base/bli_query.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_rntm.c", "target": "obj/x86_64_no_zen2/frame/base/bli_rntm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_sba.c", "target": "obj/x86_64_no_zen2/frame/base/bli_sba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijm.c", "target": "obj/x86_64_no_zen2/frame/base/bli_setgetijm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijv.c", "target": "obj/x86_64_no_zen2/frame/base/bli_setgetijv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setri.c", "target": "obj/x86_64_no_zen2/frame/base/bli_setri.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_string.c", "target": "obj/x86_64_no_zen2/frame/base/bli_string.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_winsys.c", "target": "obj/x86_64_no_zen2/frame/base/bli_winsys.o", "flags": 
["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castm.c", "target": "obj/x86_64_no_zen2/frame/base/cast/bli_castm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/x86_64_no_zen2/frame/base/cast/bli_castnzm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castv.c", "target": "obj/x86_64_no_zen2/frame/base/cast/bli_castv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_obj_check.c", "target": "obj/x86_64_no_zen2/frame/base/check/bli_obj_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_part_check.c", "target": "obj/x86_64_no_zen2/frame/base/check/bli_part_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_dlamch.c", "target": "obj/x86_64_no_zen2/frame/base/noopt/bli_dlamch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_lsame.c", "target": "obj/x86_64_no_zen2/frame/base/noopt/bli_lsame.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_slamch.c", "target": "obj/x86_64_no_zen2/frame/base/noopt/bli_slamch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projm.c", "target": "obj/x86_64_no_zen2/frame/base/proj/bli_projm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projv.c", "target": "obj/x86_64_no_zen2/frame/base/proj/bli_projv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_l3_decor_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_single.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_l3_decor_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": 
"obj/x86_64_no_zen2/frame/thread/bli_l3_sup_decor_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_pthreads.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_pthread.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_pthread.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_thrcomm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_thrcomm_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thread.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_thread.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_thrinfo.o", "flags": ["-O3", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo_sup.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_thrinfo_sup.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_check.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_fpa.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "frame/util/bli_util_oapi_ba.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ba.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ex.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/make/darwin-x86_64_no_zen3.jsonl000066400000000000000000014425421427272030600235110ustar00rootroot00000000000000{"environment": {"SYSTEM_TEAMFOUNDATIONCOLLECTIONURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_JOBTIMEOUT": "60", "BUILD_SOURCEBRANCH": "refs/pull/69/merge", "SYSTEM_TASKDEFINITIONSURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_RESTRICTSECRETS": "True", "SYSTEM_JOBATTEMPT": "1", "SYSTEM_PULLREQUEST_SOURCECOMMITID": "1db78ab0302a0aaf31a97ba8553f31f01106bcdd", "AGENT_VERSION": "2.202.0", "BUILD_QUEUEDBY": "GitHub", "XCODE_12_DEVELOPER_DIR": "/Applications/Xcode_12.5.1.app/Contents/Developer", "SYSTEM_HOSTTYPE": "build", "SYSTEM_COLLECTIONURI": "https://dev.azure.com/explosion-ai/", "NVM_CD_FLAGS": "", "ANDROID_HOME": "/Users/runner/Library/Android/sdk", "BUILD_REPOSITORY_GIT_SUBMODULECHECKOUT": "False", "SYSTEM_JOBPARALLELISMTAG": "Public", "CHROMEWEBDRIVER": "/usr/local/Caskroom/chromedriver/100.0.4896.60", "GOROOT_1_17_X64": "/Users/runner/hostedtoolcache/go/1.17.8/x64", "SHELL": "/bin/bash", "PIPX_BIN_DIR": "/usr/local/opt/pipx_bin", "BUILD_STAGINGDIRECTORY": "/Users/runner/work/1/a", "TMPDIR": "/var/folders/24/8k48jl6d249_n_qfxwsl6xvm0000gn/T/", "SYSTEM_PULLREQUEST_PULLREQUESTNUMBER": "69", "AGENT_MACHINENAME": "Mac-1649312315670", "SYSTEM_WORKFOLDER": "/Users/runner/work", "COMMON_TESTRESULTSDIRECTORY": "/Users/runner/work/1/TestResults", "AGENT_JOBNAME": "JSONL 
Python38Mac", "IMAGENAME": "macos-latest", "ANDROID_SDK_ROOT": "/Users/runner/Library/Android/sdk", "OLDPWD": "/Users/runner/work/1/s", "RCT_NO_LAUNCH_PACKAGER": "1", "JAVA_HOME_8_X64": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/8.0.322-6/x64/Contents/Home/", "MSDEPLOY_HTTP_USER_AGENT": "VSTS_116cc368-5c0c-4eb4-bb44-7f3fa5bdce14_build_6_0", "BUILD_SOURCEVERSIONAUTHOR": "Dani\u00ebl de Kok", "AGENT_OSARCHITECTURE": "X64", "NUNIT_BASE_PATH": "/Library/Developer/nunit", "RUNNER_PERFLOG": "/usr/local/opt/runner/perflog", "BUILD_REQUESTEDFOREMAIL": "", "LC_ALL": "en_US.UTF-8", "NUNIT3_PATH": "/Library/Developer/nunit/3.6.0", "AGENT_ACCEPTTEEEULA": "True", "AGENT_READONLYVARIABLES": "true", "SYSTEM_STAGEATTEMPT": "1", "RUNNER_TOOL_CACHE": "/Users/runner/hostedtoolcache", "JAVA_HOME_11_X64": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/11.0.14-101/x64/Contents/Home/", "SYSTEM_DEFINITIONNAME": "explosion.cython-blis", "SYSTEM_CULTURE": "en-US", "GIT_TERMINAL_PROMPT": "0", "NVM_DIR": "/Users/runner/.nvm", "USER": "runner", "BUILD_SOURCEBRANCHNAME": "merge", "AGENT_TEMPDIRECTORY": "/Users/runner/work/_temp", "BUILD_REPOSITORY_PROVIDER": "GitHub", "SYSTEM_JOBIDENTIFIER": "JSONL.Python38Mac", "TF_BUILD": "True", "SYSTEM_TEAMFOUNDATIONSERVERURI": "https://dev.azure.com/explosion-ai/", "ANDROID_NDK_ROOT": "/Users/runner/Library/Android/sdk/ndk-bundle", "AZURE_HTTP_USER_AGENT": "VSTS_116cc368-5c0c-4eb4-bb44-7f3fa5bdce14_build_6_0", "SYSTEM_TASKDISPLAYNAME": "Generate JSONL (Mac)", "BUILD_QUEUEDBYID": "38e7e9f7-fc06-4f5a-b6dd-1782f4ef7c25", "ImageVersion": "20220402.1", "ANDROID_NDK_LATEST_HOME": "/Users/runner/Library/Android/sdk/ndk/23.1.7779620", "SYSTEM_STAGENAME": "__default", "SSH_AUTH_SOCK": "/private/tmp/com.apple.launchd.YQrQpXKfSr/Listeners", "AGENT_DISABLELOGPLUGIN_TESTRESULTLOGPLUGIN": "false", "__CF_USER_TEXT_ENCODING": "0x1F5:0:0", "HOMEBREW_NO_AUTO_UPDATE": "1", "AGENT_ROOTDIRECTORY": "/Users/runner/work", "SYSTEM_TEAMPROJECTID": 
"5c6613e9-6ccf-48bd-81de-dbc3b0a6f957", "VSTS_PROCESS_LOOKUP_ID": "vsts_ebf1e834-725e-4a78-b68a-c769f181ca69", "AGENT_TOOLSDIRECTORY": "/Users/runner/hostedtoolcache", "SYSTEM_TEAMPROJECT": "Public", "AGENT_HOMEDIRECTORY": "/Users/runner/runners/2.202.0", "BUILD_SOURCEVERSIONMESSAGE": "Merge 1db78ab0302a0aaf31a97ba8553f31f01106bcdd into 6daabf0c925bfe67f7d87874ce014eb3212711e7", "BUILD_REPOSITORY_ID": "explosion/cython-blis", "JAVA_HOME_17_X64": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/17.0.2-8/x64/Contents/Home/", "SYSTEM_PULLREQUEST_TARGETBRANCH": "master", "agent.jobstatus": "Succeeded", "AGENT_LOGTOBLOBSTORAGESERVICE": "true", "SYSTEM_JOBDISPLAYNAME": "JSONL Python38Mac", "BUILD_REPOSITORY_LOCALPATH": "/Users/runner/work/1/s", "TASK_SKIPTRANSLATORFORCHECKOUT": "False", "AGENT_BUILDDIRECTORY": "/Users/runner/work/1", "PYTHON_VERSION": "3.8", "BUILD_REASON": "PullRequest", "SYSTEM_PIPELINESTARTTIME": "2022-04-07 06:44:25+00:00", "SYSTEM": "build", "USEPYTHONVERSION_PYTHONLOCATION": "/Users/runner/hostedtoolcache/Python/3.8.12/x64", "BUILD_SOURCESDIRECTORY": "/Users/runner/work/1/s", "AGENT_OS": "Darwin", "PATH": "/Users/runner/hostedtoolcache/Python/3.8.12/x64/bin:/Users/runner/hostedtoolcache/Python/3.8.12/x64:/usr/local/lib/ruby/gems/2.7.0/bin:/usr/local/opt/ruby@2.7/bin:/usr/local/opt/pipx_bin:/Users/runner/.cargo/bin:/usr/local/opt/curl/bin:/usr/local/bin:/usr/local/sbin:/Users/runner/bin:/Users/runner/.yarn/bin:/Users/runner/Library/Android/sdk/tools:/Users/runner/Library/Android/sdk/platform-tools:/Users/runner/Library/Android/sdk/ndk-bundle:/Library/Frameworks/Mono.framework/Versions/Current/Commands:/usr/bin:/bin:/usr/sbin:/sbin:/Users/runner/.dotnet/tools:/Users/runner/.ghcup/bin:/Users/runner/hostedtoolcache/stack/2.7.5/x64", "SYSTEM_PHASEATTEMPT": "1", "SYSTEM_ISSCHEDULED": "False", "SYSTEM_DEBUG": "false", "PERFLOG_LOCATION_SETTING": "RUNNER_PERFLOG", "GOROOT_1_15_X64": "/Users/runner/hostedtoolcache/go/1.15.15/x64", 
"SYSTEM_PULLREQUEST_SOURCEREPOSITORYURI": "https://github.com/explosion/cython-blis", "CONDA": "/usr/local/miniconda", "PWD": "/Users/runner/work/1/s/flame-blis", "EDGEWEBDRIVER": "/usr/local/share/edge_driver", "VM_ASSETS": "/usr/local/opt/runner/scripts", "DOTNET_ROOT": "/Users/runner/.dotnet", "SYSTEM_PULLREQUEST_ISFORK": "True", "BUILD_BUILDURI": "vstfs:///Build/Build/16987", "JAVA_HOME": "/Users/runner/hostedtoolcache/Java_Temurin-Hotspot_jdk/8.0.322-6/x64/Contents/Home/", "SYSTEM_DEFINITIONID": "6", "VCPKG_INSTALLATION_ROOT": "/usr/local/share/vcpkg", "SYSTEM_STAGEID": "96ac2280-8cb4-5df5-99de-dd2da759617d", "AGENT_DISABLELOGPLUGIN_TESTFILEPUBLISHERPLUGIN": "true", "LANG": "en_US.UTF-8", "SYSTEM_ENABLEACCESSTOKEN": "SecretVariable", "XCODE_13_DEVELOPER_DIR": "/Applications/Xcode_13.2.1.app/Contents/Developer", "ImageOS": "macos11", "SYSTEM_TASKINSTANCENAME": "CmdLine5", "SYSTEM_POSTLINESSPEED": "10000", "RESOURCES_TRIGGERINGCATEGORY": "", "SYSTEM_PHASEDISPLAYNAME": "JSONL", "XPC_FLAGS": "0x0", "BUILD_REPOSITORY_NAME": "explosion/cython-blis", "SYSTEM_SERVERTYPE": "Hosted", "BUILD_REPOSITORY_URI": "https://github.com/explosion/cython-blis", "PIPELINE_WORKSPACE": "/Users/runner/work/1", "PIPX_HOME": "/usr/local/opt/pipx", "AGENT_WORKFOLDER": "/Users/runner/work", "BUILD_DEFINITIONNAME": "explosion.cython-blis", "SYSTEM_JOBNAME": "Python38Mac", "BUILD_REQUESTEDFOR": "GitHub", "GECKOWEBDRIVER": "/usr/local/opt/geckodriver/bin", "XPC_SERVICE_NAME": "0", "SYSTEM_TIMELINEID": "aacac669-7e0e-4d59-bd0d-94448850e4a9", "SYSTEM_ARTIFACTSDIRECTORY": "/Users/runner/work/1/a", "HOME": "/Users/runner", "SHLVL": "3", "AGENT_ID": "93", "AGENT_RETAINDEFAULTENCODING": "false", "GRAALVM_11_ROOT": "/Library/Java/JavaVirtualMachines/graalvm-ce-java11-22.0.0.2/Contents/Home/bin", "SYSTEM_JOBPOSITIONINPHASE": "1", "BUILD_BINARIESDIRECTORY": "/Users/runner/work/1/b", "BUILD_ARTIFACTSTAGINGDIRECTORY": "/Users/runner/work/1/a", "BUILD_REQUESTEDFORID": 
"38e7e9f7-fc06-4f5a-b6dd-1782f4ef7c25", "AGENT_USEWORKSPACEID": "true", "RESOURCES_TRIGGERINGALIAS": "", "BUILD_BUILDID": "16987", "LOGNAME": "runner", "GOROOT_1_16_X64": "/Users/runner/hostedtoolcache/go/1.16.15/x64", "SYSTEM_TASKINSTANCEID": "476bff7d-b32a-5e93-16d5-defdc40dd1cd", "BUILD_SOURCEVERSION": "2c06e79272a205923124f93e6d4ba9bc70f6a846", "LC_CTYPE": "en_US.UTF-8", "HOMEBREW_CLEANUP_PERIODIC_FULL_DAYS": "3650", "SYSTEM_PULLREQUEST_MERGEDAT": "", "HOMEBREW_CASK_OPTS": "--no-quarantine", "SYSTEM_DEFAULTWORKINGDIRECTORY": "/Users/runner/work/1/s", "POWERSHELL_DISTRIBUTION_CHANNEL": "Azure-DevOps-macos11", "SYSTEM_JOBID": "82ca4189-e9ff-5ba9-3895-5b644d1542b5", "SYSTEM_PULLREQUEST_PULLREQUESTID": "899994381", "ANDROID_NDK_HOME": "/Users/runner/Library/Android/sdk/ndk-bundle", "BOOTSTRAP_HASKELL_NONINTERACTIVE": "1", "GOROOT_1_18_X64": "/Users/runner/hostedtoolcache/go/1.18.0/x64", "SYSTEM_PULLREQUEST_SOURCEBRANCH": "update-to-blis-0.9.0", "SYSTEM_TOTALJOBSINPHASE": "2", "XCODE_11_DEVELOPER_DIR": "/Applications/Xcode_11.7.app/Contents/Developer", "SYSTEM_STAGEDISPLAYNAME": "__default", "AGENT_NAME": "Azure Pipelines 2", "SYSTEM_PLANID": "aacac669-7e0e-4d59-bd0d-94448850e4a9", "BUILD_DEFINITIONVERSION": "1", "SYSTEM_PHASEID": "ecb95708-c2a5-5456-f379-96cd8090c2a6", "ENDPOINT_URL_SYSTEMVSSCONNECTION": "https://dev.azure.com/explosion-ai/", "AGENT_JOBSTATUS": "Succeeded", "TASK_DISPLAYNAME": "Generate JSONL (Mac)", "SYSTEM_COLLECTIONID": "116cc368-5c0c-4eb4-bb44-7f3fa5bdce14", "BUILD_BUILDNUMBER": "20220407.1", "SYSTEM_PHASENAME": "JSONL", "DOTNET_MULTILEVEL_LOOKUP": "0", "SYSTEM_PARALLELEXECUTIONTYPE": "MultiConfiguration", "BUILD_CONTAINERID": "11784577", "AGENT_TASKRESTRICTIONSENFORCEMENTMODE": "Enabled", "_": "/Users/runner/hostedtoolcache/Python/3.8.12/x64/bin/python"}} {"compiler": "gcc", "source": "config/bulldozer/bli_cntx_init_bulldozer.c", "target": "obj/x86_64_no_zen3/config/bulldozer/bli_cntx_init_bulldozer.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/excavator/bli_cntx_init_excavator.c", "target": "obj/x86_64_no_zen3/config/excavator/bli_cntx_init_excavator.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/generic/bli_cntx_init_generic.c", "target": "obj/x86_64_no_zen3/config/generic/bli_cntx_init_generic.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/haswell/bli_cntx_init_haswell.c", "target": "obj/x86_64_no_zen3/config/haswell/bli_cntx_init_haswell.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/knl/bli_cntx_init_knl.c", "target": "obj/x86_64_no_zen3/config/knl/bli_cntx_init_knl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/penryn/bli_cntx_init_penryn.c", "target": "obj/x86_64_no_zen3/config/penryn/bli_cntx_init_penryn.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/piledriver/bli_cntx_init_piledriver.c", "target": "obj/x86_64_no_zen3/config/piledriver/bli_cntx_init_piledriver.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/sandybridge/bli_cntx_init_sandybridge.c", "target": "obj/x86_64_no_zen3/config/sandybridge/bli_cntx_init_sandybridge.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/skx/bli_cntx_init_skx.c", "target": "obj/x86_64_no_zen3/config/skx/bli_cntx_init_skx.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/steamroller/bli_cntx_init_steamroller.c", "target": 
"obj/x86_64_no_zen3/config/steamroller/bli_cntx_init_steamroller.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/zen/bli_cntx_init_zen.c", "target": "obj/x86_64_no_zen3/config/zen/bli_cntx_init_zen.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/zen2/bli_cntx_init_zen2.c", "target": "obj/x86_64_no_zen3/config/zen2/bli_cntx_init_zen2.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c", "target": "obj/x86_64_no_zen3/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_dgemm_skx_asm_16x14.c", "target": 
"obj/x86_64_no_zen3/kernels/skx/3/bli_dgemm_skx_asm_16x14.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c", "target": "obj/x86_64_no_zen3/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/1m/bli_dpackm_knl_asm_24x8.c", "target": "obj/x86_64_no_zen3/kernels/knl/1m/bli_dpackm_knl_asm_24x8.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/1m/bli_spackm_knl_asm_24x16.c", "target": "obj/x86_64_no_zen3/kernels/knl/1m/bli_spackm_knl_asm_24x16.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/3/bli_dgemm_knl_asm_24x8.c", "target": "obj/x86_64_no_zen3/kernels/knl/3/bli_dgemm_knl_asm_24x8.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/3/bli_sgemm_knl_asm_24x16.c", "target": "obj/x86_64_no_zen3/kernels/knl/3/bli_sgemm_knl_asm_24x16.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c", "target": "obj/x86_64_no_zen3/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c", "target": "obj/x86_64_no_zen3/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1/bli_axpyv_penryn_int.c", "target": "obj/x86_64_no_zen3/kernels/penryn/1/bli_axpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1/bli_dotv_penryn_int.c", "target": "obj/x86_64_no_zen3/kernels/penryn/1/bli_dotv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpy2v_penryn_int.c", "target": "obj/x86_64_no_zen3/kernels/penryn/1f/bli_axpy2v_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpyf_penryn_int.c", "target": "obj/x86_64_no_zen3/kernels/penryn/1f/bli_axpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotaxpyv_penryn_int.c", "target": "obj/x86_64_no_zen3/kernels/penryn/1f/bli_dotaxpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c", "target": "obj/x86_64_no_zen3/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxf_penryn_int.c", "target": "obj/x86_64_no_zen3/kernels/penryn/1f/bli_dotxf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen3/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen3/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen3/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen3/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen3/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", 
"-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c", "target": 
"obj/x86_64_no_zen3/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c", "target": 
"obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", 
"-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.o", "flags": 
["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_amaxv_zen_int.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_amaxv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_axpyv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int10.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_axpyv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/zen/1/bli_copyv_zen_int.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_copyv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_dotv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int10.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_dotv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotxv_zen_int.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_dotxv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_scalv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int10.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_scalv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_setv_zen_int.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_setv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_swapv_zen_int8.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_swapv_zen_int8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_4.c", "target": "obj/x86_64_no_zen3/kernels/zen/1f/bli_axpyf_zen_int_4.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_5.c", "target": "obj/x86_64_no_zen3/kernels/zen/1f/bli_axpyf_zen_int_5.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_8.c", "target": "obj/x86_64_no_zen3/kernels/zen/1f/bli_axpyf_zen_int_8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_dotxf_zen_int_8.c", "target": "obj/x86_64_no_zen3/kernels/zen/1f/bli_dotxf_zen_int_8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemm_small.c", "target": "obj/x86_64_no_zen3/kernels/zen/3/bli_gemm_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemmt_small.c", "target": "obj/x86_64_no_zen3/kernels/zen/3/bli_gemmt_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_trsm_small.c", "target": "obj/x86_64_no_zen3/kernels/zen/3/bli_trsm_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c", "target": "obj/x86_64_no_zen3/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", 
"-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c", "target": "obj/x86_64_no_zen3/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/bli_cntx_skx_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_addv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_amaxv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_axpbyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_axpyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_copyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_dotv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_dotxv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_invertv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_scal2v_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_scalv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_setv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_subv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_swapv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_xpbyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1f/bli_axpy2v_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1f/bli_axpyf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1f/bli_dotaxpyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1f/bli_dotxaxpyf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1f/bli_dotxf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1m/bli_packm_cxk_1er_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1m/bli_packm_cxk_bb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1m/bli_packm_cxk_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1m/bli_unpackm_cxk_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/3/bli_gemm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/3/bli_gemmsup_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/3/bli_gemmtrsm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/3/bli_trsm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/3/bb/bli_gemmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/3/bb/bli_gemmtrsmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/3/bb/bli_trsmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/ind/bli_gemm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/ind/bli_gemmtrsm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/ind/bli_trsm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/bli_cntx_knl_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_addv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_amaxv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_axpbyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/knl/1/bli_axpyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_copyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_dotv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_dotxv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", 
"-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_invertv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_scal2v_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_scalv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_setv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_subv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_swapv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_xpbyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1f/bli_axpy2v_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1f/bli_axpyf_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1f/bli_dotaxpyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1f/bli_dotxaxpyf_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1f/bli_dotxf_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1m/bli_packm_cxk_1er_knl_ref.o", 
"flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1m/bli_packm_cxk_bb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1m/bli_packm_cxk_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1m/bli_unpackm_cxk_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/3/bli_gemm_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/3/bli_gemmsup_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/3/bli_gemmtrsm_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/3/bli_trsm_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/3/bb/bli_gemmbb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/3/bb/bli_gemmtrsmbb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/3/bb/bli_trsmbb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/ind/bli_gemm1m_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/ind/bli_gemmtrsm1m_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/ind/bli_trsm1m_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/bli_cntx_haswell_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_addv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_amaxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_axpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_axpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_copyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_dotv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_dotxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_invertv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_scal2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_scalv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_setv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_subv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_swapv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_xpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/haswell/1f/bli_axpy2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1f/bli_axpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1f/bli_dotaxpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1f/bli_dotxaxpyf_haswell_ref.o", "flags": ["-O2", "-O3", 
"-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1f/bli_dotxf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1m/bli_packm_cxk_1er_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1m/bli_packm_cxk_bb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1m/bli_packm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1m/bli_unpackm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/3/bli_gemm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/3/bli_gemmsup_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/3/bli_gemmtrsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/3/bli_trsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/3/bb/bli_gemmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/3/bb/bli_gemmtrsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/3/bb/bli_trsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/ind/bli_gemm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/ind/bli_gemmtrsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/ind/bli_trsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/bli_cntx_sandybridge_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_addv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_amaxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_axpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_axpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_copyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_dotv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_dotxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_invertv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_scal2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_scalv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_setv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_subv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_swapv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_xpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1f/bli_axpy2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1f/bli_axpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1f/bli_dotaxpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1f/bli_dotxaxpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1f/bli_dotxf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1m/bli_packm_cxk_1er_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1m/bli_packm_cxk_bb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1m/bli_packm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/sandybridge/1m/bli_unpackm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/3/bli_gemm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/3/bli_gemmsup_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/3/bli_gemmtrsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/3/bli_trsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/3/bb/bli_gemmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/3/bb/bli_gemmtrsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/3/bb/bli_trsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/ind/bli_gemm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/ind/bli_gemmtrsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/ind/bli_trsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/bli_cntx_penryn_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_addv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_amaxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_axpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_axpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_copyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_dotv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_dotxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_invertv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_scal2v_penryn_ref.o", "flags": 
["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_scalv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_setv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_subv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_swapv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_xpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1f/bli_axpy2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1f/bli_axpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1f/bli_dotaxpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1f/bli_dotxaxpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1f/bli_dotxf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1m/bli_packm_cxk_1er_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1m/bli_packm_cxk_bb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1m/bli_packm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1m/bli_unpackm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/3/bli_gemm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/3/bli_gemmsup_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/3/bli_gemmtrsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/3/bli_trsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/3/bb/bli_gemmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/3/bb/bli_gemmtrsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/3/bb/bli_trsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/ind/bli_gemm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/ind/bli_gemmtrsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/ind/bli_trsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/bli_cntx_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_addv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_amaxv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", 
"-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_axpbyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_axpyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_copyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_dotv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_dotxv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_invertv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_scal2v_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_scalv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_setv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/zen/1/bli_subv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_swapv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_xpbyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1f/bli_axpy2v_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1f/bli_axpyf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1f/bli_dotaxpyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1f/bli_dotxaxpyf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1f/bli_dotxf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1m/bli_packm_cxk_1er_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1m/bli_packm_cxk_bb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1m/bli_packm_cxk_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1m/bli_unpackm_cxk_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/3/bli_gemm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/3/bli_gemmsup_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/3/bli_gemmtrsm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/3/bli_trsm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/3/bb/bli_gemmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", 
"-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/3/bb/bli_gemmtrsmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/3/bb/bli_trsmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/ind/bli_gemm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/ind/bli_gemmtrsm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/ind/bli_trsm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/bli_cntx_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_addv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_amaxv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_axpbyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_axpyv_zen2_ref.o", 
"flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_copyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_dotv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_dotxv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_invertv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_scal2v_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_scalv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_setv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_subv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_swapv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_xpbyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1f/bli_axpy2v_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1f/bli_axpyf_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1f/bli_dotaxpyv_zen2_ref.o", "flags": 
["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1f/bli_dotxaxpyf_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1f/bli_dotxf_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1m/bli_packm_cxk_1er_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1m/bli_packm_cxk_bb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1m/bli_packm_cxk_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1m/bli_unpackm_cxk_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/3/bli_gemm_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/3/bli_gemmsup_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/3/bli_gemmtrsm_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/3/bli_trsm_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/3/bb/bli_gemmbb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/3/bb/bli_gemmtrsmbb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", 
"target": "obj/x86_64_no_zen3/ref_kernels/zen2/3/bb/bli_trsmbb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/ind/bli_gemm1m_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/ind/bli_gemmtrsm1m_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/ind/bli_trsm1m_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", 
"-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/bli_cntx_excavator_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_addv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_amaxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_axpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_axpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_copyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_dotv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_dotxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_invertv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_scal2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_scalv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_setv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_subv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_swapv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_xpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1f/bli_axpy2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1f/bli_axpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1f/bli_dotaxpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1f/bli_dotxaxpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1f/bli_dotxf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1m/bli_packm_cxk_1er_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1m/bli_packm_cxk_bb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1m/bli_packm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1m/bli_unpackm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/3/bli_gemm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/3/bli_gemmsup_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/3/bli_gemmtrsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/3/bli_trsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/3/bb/bli_gemmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/3/bb/bli_gemmtrsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/3/bb/bli_trsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/ind/bli_gemm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/ind/bli_gemmtrsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/ind/bli_trsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/bli_cntx_steamroller_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_addv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_amaxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", 
"-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_axpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_axpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_copyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", 
"-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_dotv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_dotxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_invertv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_scal2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_scalv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_setv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_subv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_swapv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_xpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1f/bli_axpy2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1f/bli_axpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1f/bli_dotaxpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1f/bli_dotxaxpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1f/bli_dotxf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1m/bli_packm_cxk_1er_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1m/bli_packm_cxk_bb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1m/bli_packm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1m/bli_unpackm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", 
"-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/3/bli_gemm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/3/bli_gemmsup_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/3/bli_gemmtrsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/3/bli_trsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/3/bb/bli_gemmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/3/bb/bli_gemmtrsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/3/bb/bli_trsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/ind/bli_gemm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/ind/bli_gemmtrsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/ind/bli_trsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/bli_cntx_piledriver_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_addv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_amaxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_axpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_axpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_copyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_dotv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_dotxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_invertv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_scal2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_scalv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_setv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_subv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_swapv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_xpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1f/bli_axpy2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1f/bli_axpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1f/bli_dotaxpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1f/bli_dotxaxpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1f/bli_dotxf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1m/bli_packm_cxk_1er_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1m/bli_packm_cxk_bb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1m/bli_packm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1m/bli_unpackm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/3/bli_gemm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/3/bli_gemmsup_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/3/bli_gemmtrsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/3/bli_trsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/3/bb/bli_gemmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/3/bb/bli_gemmtrsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/3/bb/bli_trsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/ind/bli_gemm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/ind/bli_gemmtrsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/ind/bli_trsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/bli_cntx_bulldozer_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_addv_bulldozer_ref.o", "flags": ["-O2", 
"-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_amaxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_axpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_axpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", 
"-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_copyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_dotv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_dotxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_invertv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_scal2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_scalv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_setv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_subv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_swapv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_xpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1f/bli_axpy2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1f/bli_axpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1f/bli_dotaxpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1f/bli_dotxaxpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1f/bli_dotxf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1m/bli_packm_cxk_1er_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1m/bli_packm_cxk_bb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1m/bli_packm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1m/bli_unpackm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/3/bli_gemm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/3/bli_gemmsup_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/3/bli_gemmtrsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/3/bli_trsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/3/bb/bli_gemmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/bulldozer/3/bb/bli_gemmtrsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/3/bb/bli_trsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/ind/bli_gemm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/bulldozer/ind/bli_gemmtrsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/ind/bli_trsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/bli_cntx_generic_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_addv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_amaxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_axpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_axpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/generic/1/bli_copyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_dotv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_dotxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_invertv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_scal2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_scalv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_setv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_subv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_swapv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_xpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1f/bli_axpy2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1f/bli_axpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1f/bli_dotaxpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1f/bli_dotxaxpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1f/bli_dotxf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1m/bli_packm_cxk_1er_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1m/bli_packm_cxk_bb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1m/bli_packm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/generic/1m/bli_unpackm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/3/bli_gemm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/3/bli_gemmsup_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/3/bli_gemmtrsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/3/bli_trsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/3/bb/bli_gemmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/3/bb/bli_gemmtrsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/3/bb/bli_trsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/ind/bli_gemm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/ind/bli_gemmtrsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/ind/bli_trsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_check.c", "target": "obj/x86_64_no_zen3/frame/0/bli_l0_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_fpa.c", "target": "obj/x86_64_no_zen3/frame/0/bli_l0_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_oapi.c", "target": "obj/x86_64_no_zen3/frame/0/bli_l0_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_tapi.c", "target": "obj/x86_64_no_zen3/frame/0/bli_l0_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/copysc/bli_copysc.c", "target": "obj/x86_64_no_zen3/frame/0/copysc/bli_copysc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_check.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ex.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_check.c", "target": "obj/x86_64_no_zen3/frame/1d/bli_l1d_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_fpa.c", "target": 
"obj/x86_64_no_zen3/frame/1d/bli_l1d_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/x86_64_no_zen3/frame/1d/bli_l1d_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/x86_64_no_zen3/frame/1d/bli_l1d_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi.c", "target": "obj/x86_64_no_zen3/frame/1d/bli_l1d_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": "obj/x86_64_no_zen3/frame/1d/bli_l1d_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_check.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_fpa.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_check.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_fpa.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": 
"obj/x86_64_no_zen3/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_alloc.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_check.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cntl.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_init.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_init.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_part.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_part.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/1m/packm/bli_packm_scalar.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_scalar.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/x86_64_no_zen3/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/x86_64_no_zen3/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cntl.c", "target": "obj/x86_64_no_zen3/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": "obj/x86_64_no_zen3/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/x86_64_no_zen3/frame/1m/unpackm/bli_unpackm_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_check.c", "target": "obj/x86_64_no_zen3/frame/2/bli_l2_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_fpa.c", "target": "obj/x86_64_no_zen3/frame/2/bli_l2_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/bli_l2_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ba.c", "target": "obj/x86_64_no_zen3/frame/2/bli_l2_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ex.c", "target": "obj/x86_64_no_zen3/frame/2/bli_l2_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi.c", "target": "obj/x86_64_no_zen3/frame/2/bli_l2_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/x86_64_no_zen3/frame/2/bli_l2_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ex.c", "target": "obj/x86_64_no_zen3/frame/2/bli_l2_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/x86_64_no_zen3/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/x86_64_no_zen3/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": "obj/x86_64_no_zen3/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/2/ger/bli_ger_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var2.c", "target": "obj/x86_64_no_zen3/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3.c", "target": 
"obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/2/her/bli_her_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var2.c", "target": "obj/x86_64_no_zen3/frame/2/her/bli_her_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/her/bli_her_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/x86_64_no_zen3/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var3.c", "target": "obj/x86_64_no_zen3/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var4.c", "target": "obj/x86_64_no_zen3/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/x86_64_no_zen3/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var4.c", "target": "obj/x86_64_no_zen3/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_var_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/her2/bli_her2_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var2.c", "target": "obj/x86_64_no_zen3/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/x86_64_no_zen3/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/x86_64_no_zen3/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/trmv/bli_trmv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/2/trsv/bli_trsv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var2.c", "target": "obj/x86_64_no_zen3/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/x86_64_no_zen3/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/x86_64_no_zen3/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/trsv/bli_trsv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_blocksize.c", 
"target": "obj/x86_64_no_zen3/frame/3/bli_l3_blocksize.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_check.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_cntl.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_direct.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_direct.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ind.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_ind.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_int.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi_ex.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_packab.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_packab.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_prune.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_prune.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_schema.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_schema.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_sup.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_int.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_sup_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_a.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_sup_packm_a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_b.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_var.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_sup_ref.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var12.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_sup_var12.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi.c", "target": 
"obj/x86_64_no_zen3/frame/3/bli_l3_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_thrinfo.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_fpa.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_tapi.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var3.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_md.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": 
"obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/x86_64_no_zen3/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/x86_64_no_zen3/frame/3/hemm/bli_hemm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/symm/bli_symm_front.c", "target": "obj/x86_64_no_zen3/frame/3/symm/bli_symm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/x86_64_no_zen3/frame/3/trmm/bli_trmm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trmm/bli_trmm_ll_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": 
["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/x86_64_no_zen3/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_blk_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_front.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_apool.c", "target": "obj/x86_64_no_zen3/frame/base/bli_apool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_arch.c", "target": "obj/x86_64_no_zen3/frame/base/bli_arch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_array.c", "target": "obj/x86_64_no_zen3/frame/base/bli_array.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_blksz.c", "target": "obj/x86_64_no_zen3/frame/base/bli_blksz.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_check.c", "target": "obj/x86_64_no_zen3/frame/base/bli_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_clock.c", "target": "obj/x86_64_no_zen3/frame/base/bli_clock.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntl.c", "target": "obj/x86_64_no_zen3/frame/base/bli_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntx.c", "target": "obj/x86_64_no_zen3/frame/base/bli_cntx.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_const.c", "target": "obj/x86_64_no_zen3/frame/base/bli_const.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cpuid.c", "target": "obj/x86_64_no_zen3/frame/base/bli_cpuid.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_env.c", "target": "obj/x86_64_no_zen3/frame/base/bli_env.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_error.c", "target": "obj/x86_64_no_zen3/frame/base/bli_error.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_func.c", "target": "obj/x86_64_no_zen3/frame/base/bli_func.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_getopt.c", "target": "obj/x86_64_no_zen3/frame/base/bli_getopt.o", "flags": ["-O3", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_gks.c", "target": "obj/x86_64_no_zen3/frame/base/bli_gks.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_ind.c", "target": "obj/x86_64_no_zen3/frame/base/bli_ind.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_info.c", "target": "obj/x86_64_no_zen3/frame/base/bli_info.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_init.c", "target": "obj/x86_64_no_zen3/frame/base/bli_init.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_machval.c", "target": 
"obj/x86_64_no_zen3/frame/base/bli_machval.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_malloc.c", "target": "obj/x86_64_no_zen3/frame/base/bli_malloc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_mbool.c", "target": "obj/x86_64_no_zen3/frame/base/bli_mbool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_memsys.c", "target": "obj/x86_64_no_zen3/frame/base/bli_memsys.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj.c", "target": "obj/x86_64_no_zen3/frame/base/bli_obj.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj_scalar.c", "target": "obj/x86_64_no_zen3/frame/base/bli_obj_scalar.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pack.c", "target": "obj/x86_64_no_zen3/frame/base/bli_pack.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_param_map.c", "target": "obj/x86_64_no_zen3/frame/base/bli_param_map.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_part.c", "target": "obj/x86_64_no_zen3/frame/base/bli_part.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pba.c", "target": "obj/x86_64_no_zen3/frame/base/bli_pba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pool.c", "target": "obj/x86_64_no_zen3/frame/base/bli_pool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_prune.c", "target": "obj/x86_64_no_zen3/frame/base/bli_prune.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_query.c", "target": "obj/x86_64_no_zen3/frame/base/bli_query.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_rntm.c", "target": "obj/x86_64_no_zen3/frame/base/bli_rntm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_sba.c", "target": "obj/x86_64_no_zen3/frame/base/bli_sba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijm.c", "target": "obj/x86_64_no_zen3/frame/base/bli_setgetijm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijv.c", "target": "obj/x86_64_no_zen3/frame/base/bli_setgetijv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setri.c", "target": "obj/x86_64_no_zen3/frame/base/bli_setri.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_string.c", "target": "obj/x86_64_no_zen3/frame/base/bli_string.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_winsys.c", "target": "obj/x86_64_no_zen3/frame/base/bli_winsys.o", "flags": 
["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castm.c", "target": "obj/x86_64_no_zen3/frame/base/cast/bli_castm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/x86_64_no_zen3/frame/base/cast/bli_castnzm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castv.c", "target": "obj/x86_64_no_zen3/frame/base/cast/bli_castv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_obj_check.c", "target": "obj/x86_64_no_zen3/frame/base/check/bli_obj_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_part_check.c", "target": "obj/x86_64_no_zen3/frame/base/check/bli_part_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_dlamch.c", "target": "obj/x86_64_no_zen3/frame/base/noopt/bli_dlamch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_lsame.c", "target": "obj/x86_64_no_zen3/frame/base/noopt/bli_lsame.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_slamch.c", "target": "obj/x86_64_no_zen3/frame/base/noopt/bli_slamch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projm.c", "target": "obj/x86_64_no_zen3/frame/base/proj/bli_projm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projv.c", "target": "obj/x86_64_no_zen3/frame/base/proj/bli_projv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_l3_decor_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_single.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_l3_decor_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": 
"obj/x86_64_no_zen3/frame/thread/bli_l3_sup_decor_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_pthreads.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_pthread.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_pthread.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_thrcomm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_thrcomm_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thread.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_thread.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_thrinfo.o", "flags": ["-O3", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo_sup.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_thrinfo_sup.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_check.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_fpa.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "frame/util/bli_util_oapi_ba.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ba.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ex.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/darwin-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/make/linux-arm64.jsonl000066400000000000000000007201551427272030600217220ustar00rootroot00000000000000{"environment": {"HOSTNAME": "9355d959bb12", "SSL_CERT_FILE": "/opt/_internal/certs.pem", "TERM": "xterm", "OLDPWD": "/usr/local/repos/cython-blis", "LC_ALL": "en_US.UTF-8", "LD_LIBRARY_PATH": "/opt/rh/devtoolset-10/root/usr/lib64:/opt/rh/devtoolset-10/root/usr/lib:/opt/rh/devtoolset-10/root/usr/lib64/dyninst:/opt/rh/devtoolset-10/root/usr/lib/dyninst:/usr/local/lib64", "LS_COLORS": 
"rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36:*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=01;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=01;36:*.xspf=01;36:", "VIRTUAL_ENV": "/usr/local/repos/cython-blis/env3.6", "PATH": "/usr/local/repos/cython-blis/env3.6/bin:/opt/rh/devtoolset-10/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", "PWD": "/usr/local/repos/cython-blis/flame-blis", "LANG": "en_US.UTF-8", "SHLVL": "2", "HOME": "/root", "DEVTOOLSET_ROOTPATH": "/opt/rh/devtoolset-10/root", "AUDITWHEEL_ARCH": "aarch64", "LANGUAGE": "en_US.UTF-8", "AUDITWHEEL_PLAT": "manylinux2014_aarch64", "PKG_CONFIG_PATH": "/usr/local/lib/pkgconfig", "AUDITWHEEL_POLICY": "manylinux2014", "_": 
"/usr/local/repos/cython-blis/env3.6/bin/python"}} {"compiler": "gcc", "source": "config/armsve/bli_cntx_init_armsve.c", "target": "obj/arm64/config/armsve/bli_cntx_init_armsve.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/cortexa53/bli_cntx_init_cortexa53.c", "target": "obj/arm64/config/cortexa53/bli_cntx_init_cortexa53.o", "flags": ["-O2", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/cortexa57/bli_cntx_init_cortexa57.c", "target": "obj/arm64/config/cortexa57/bli_cntx_init_cortexa57.o", "flags": ["-O2", "-mcpu=cortex-a57", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/firestorm/bli_cntx_init_firestorm.c", "target": "obj/arm64/config/firestorm/bli_cntx_init_firestorm.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"config/generic/bli_cntx_init_generic.c", "target": "obj/arm64/config/generic/bli_cntx_init_generic.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/thunderx2/bli_cntx_init_thunderx2.c", "target": "obj/arm64/config/thunderx2/bli_cntx_init_thunderx2.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c", "target": "obj/arm64/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c", "target": "obj/arm64/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c", "target": 
"obj/arm64/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armsve/3/bli_armsve_utils.c", "target": "obj/arm64/kernels/armsve/3/bli_armsve_utils.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c", "target": "obj/arm64/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c", "target": "obj/arm64/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c", "target": "obj/arm64/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c", "target": "obj/arm64/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c", "target": "obj/arm64/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c", "target": "obj/arm64/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c", "target": "obj/arm64/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c", "target": "obj/arm64/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c", "target": "obj/arm64/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c", "target": "obj/arm64/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c", "target": "obj/arm64/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c", "target": "obj/arm64/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c", "target": "obj/arm64/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c", "target": "obj/arm64/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.o", "flags": 
["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c", "target": "obj/arm64/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c", "target": "obj/arm64/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c", "target": "obj/arm64/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c", "target": "obj/arm64/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c", "target": "obj/arm64/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c", "target": "obj/arm64/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c", "target": "obj/arm64/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c", "target": "obj/arm64/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c", "target": "obj/arm64/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/arm64/ref_kernels/armsve/bli_cntx_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/arm64/ref_kernels/armsve/1/bli_addv_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", 
"-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/arm64/ref_kernels/armsve/1/bli_amaxv_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/arm64/ref_kernels/armsve/1/bli_axpbyv_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/arm64/ref_kernels/armsve/1/bli_axpyv_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/arm64/ref_kernels/armsve/1/bli_copyv_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/arm64/ref_kernels/armsve/1/bli_dotv_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/arm64/ref_kernels/armsve/1/bli_dotxv_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": 
"obj/arm64/ref_kernels/armsve/1/bli_invertv_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/arm64/ref_kernels/armsve/1/bli_scal2v_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/arm64/ref_kernels/armsve/1/bli_scalv_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/arm64/ref_kernels/armsve/1/bli_setv_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/arm64/ref_kernels/armsve/1/bli_subv_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/arm64/ref_kernels/armsve/1/bli_swapv_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/arm64/ref_kernels/armsve/1/bli_xpbyv_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/arm64/ref_kernels/armsve/1f/bli_axpy2v_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/arm64/ref_kernels/armsve/1f/bli_axpyf_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/arm64/ref_kernels/armsve/1f/bli_dotaxpyv_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/arm64/ref_kernels/armsve/1f/bli_dotxaxpyf_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/arm64/ref_kernels/armsve/1f/bli_dotxf_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/arm64/ref_kernels/armsve/1m/bli_packm_cxk_1er_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/arm64/ref_kernels/armsve/1m/bli_packm_cxk_bb_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/arm64/ref_kernels/armsve/1m/bli_packm_cxk_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/arm64/ref_kernels/armsve/1m/bli_unpackm_cxk_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/arm64/ref_kernels/armsve/3/bli_gemm_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/arm64/ref_kernels/armsve/3/bli_gemmsup_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", 
"-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/arm64/ref_kernels/armsve/3/bli_gemmtrsm_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/arm64/ref_kernels/armsve/3/bli_trsm_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/arm64/ref_kernels/armsve/3/bb/bli_gemmbb_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/arm64/ref_kernels/armsve/3/bb/bli_gemmtrsmbb_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/arm64/ref_kernels/armsve/3/bb/bli_trsmbb_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/arm64/ref_kernels/armsve/ind/bli_gemm1m_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": 
"obj/arm64/ref_kernels/armsve/ind/bli_gemmtrsm1m_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/arm64/ref_kernels/armsve/ind/bli_trsm1m_armsve_ref.o", "flags": ["-O3", "-ftree-vectorize", "-march=armv8-a+sve", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=armsve", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/arm64/ref_kernels/firestorm/bli_cntx_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1/bli_addv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1/bli_amaxv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1/bli_axpbyv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1/bli_axpyv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1/bli_copyv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1/bli_dotv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1/bli_dotxv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1/bli_invertv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", 
"-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1/bli_scal2v_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1/bli_scalv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1/bli_setv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1/bli_subv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1/bli_swapv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1/bli_xpbyv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1f/bli_axpy2v_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1f/bli_axpyf_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1f/bli_dotaxpyv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": 
"obj/arm64/ref_kernels/firestorm/1f/bli_dotxaxpyf_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1f/bli_dotxf_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1m/bli_packm_cxk_1er_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1m/bli_packm_cxk_bb_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1m/bli_packm_cxk_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/arm64/ref_kernels/firestorm/1m/bli_unpackm_cxk_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/arm64/ref_kernels/firestorm/3/bli_gemm_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/arm64/ref_kernels/firestorm/3/bli_gemmsup_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/arm64/ref_kernels/firestorm/3/bli_gemmtrsm_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/arm64/ref_kernels/firestorm/3/bli_trsm_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/arm64/ref_kernels/firestorm/3/bb/bli_gemmbb_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/arm64/ref_kernels/firestorm/3/bb/bli_gemmtrsmbb_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/arm64/ref_kernels/firestorm/3/bb/bli_trsmbb_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": 
"obj/arm64/ref_kernels/firestorm/ind/bli_gemm1m_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/arm64/ref_kernels/firestorm/ind/bli_gemmtrsm1m_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/arm64/ref_kernels/firestorm/ind/bli_trsm1m_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/bli_cntx_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1/bli_addv_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1/bli_amaxv_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1/bli_axpbyv_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1/bli_axpyv_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1/bli_copyv_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1/bli_dotv_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_dotxv_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1/bli_dotxv_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1/bli_invertv_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1/bli_scal2v_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1/bli_scalv_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", 
"-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1/bli_setv_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1/bli_subv_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1/bli_swapv_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1/bli_xpbyv_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1f/bli_axpy2v_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1f/bli_axpyf_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1f/bli_dotaxpyv_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1f/bli_dotxaxpyf_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1f/bli_dotxf_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1m/bli_packm_cxk_1er_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1m/bli_packm_cxk_bb_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/1m/bli_packm_cxk_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": 
"obj/arm64/ref_kernels/thunderx2/1m/bli_unpackm_cxk_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/3/bli_gemm_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/3/bli_gemmsup_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/3/bli_gemmtrsm_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", 
"-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/3/bli_trsm_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/3/bb/bli_gemmbb_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/3/bb/bli_gemmtrsmbb_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/3/bb/bli_trsmbb_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/ind/bli_gemm1m_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/ind/bli_gemmtrsm1m_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/arm64/ref_kernels/thunderx2/ind/bli_trsm1m_thunderx2_ref.o", "flags": ["-O2", "-mcpu=thunderx2t99", "-O3", "-ftree-vectorize", "-mcpu=thunderx2t99", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=thunderx2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/bli_cntx_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1/bli_addv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": 
"obj/arm64/ref_kernels/cortexa57/1/bli_amaxv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1/bli_axpbyv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1/bli_axpyv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1/bli_copyv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1/bli_dotv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1/bli_dotxv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1/bli_invertv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1/bli_scal2v_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1/bli_scalv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1/bli_setv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1/bli_subv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1/bli_swapv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1/bli_xpbyv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": 
"obj/arm64/ref_kernels/cortexa57/1f/bli_axpy2v_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1f/bli_axpyf_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1f/bli_dotaxpyv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1f/bli_dotxaxpyf_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1f/bli_dotxf_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1m/bli_packm_cxk_1er_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1m/bli_packm_cxk_bb_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1m/bli_packm_cxk_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/1m/bli_unpackm_cxk_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/3/bli_gemm_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/3/bli_gemmsup_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/3/bli_gemmtrsm_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/3/bli_trsm_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/3/bb/bli_gemmbb_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/3/bb/bli_gemmtrsmbb_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/3/bb/bli_trsmbb_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/ind/bli_gemm1m_cortexa57_ref.o", "flags": ["-O2", 
"-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/ind/bli_gemmtrsm1m_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/arm64/ref_kernels/cortexa57/ind/bli_trsm1m_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/bli_cntx_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1/bli_addv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1/bli_amaxv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1/bli_axpbyv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1/bli_axpyv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1/bli_copyv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1/bli_dotv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": 
"obj/arm64/ref_kernels/cortexa53/1/bli_dotxv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1/bli_invertv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1/bli_scal2v_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1/bli_scalv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", 
"-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1/bli_setv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1/bli_subv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1/bli_swapv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1/bli_xpbyv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1f/bli_axpy2v_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1f/bli_axpyf_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1f/bli_dotaxpyv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1f/bli_dotxaxpyf_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1f/bli_dotxf_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1m/bli_packm_cxk_1er_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1m/bli_packm_cxk_bb_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/1m/bli_packm_cxk_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": 
"obj/arm64/ref_kernels/cortexa53/1m/bli_unpackm_cxk_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/3/bli_gemm_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/3/bli_gemmsup_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/3/bli_gemmtrsm_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", 
"-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/3/bli_trsm_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/3/bb/bli_gemmbb_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/3/bb/bli_gemmtrsmbb_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/3/bb/bli_trsmbb_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/ind/bli_gemm1m_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/ind/bli_gemmtrsm1m_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/arm64/ref_kernels/cortexa53/ind/bli_trsm1m_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/arm64/ref_kernels/generic/bli_cntx_generic_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/arm64/ref_kernels/generic/1/bli_addv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/arm64/ref_kernels/generic/1/bli_amaxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/arm64/ref_kernels/generic/1/bli_axpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/arm64/ref_kernels/generic/1/bli_axpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/arm64/ref_kernels/generic/1/bli_copyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": 
"obj/arm64/ref_kernels/generic/1/bli_dotv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/arm64/ref_kernels/generic/1/bli_dotxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/arm64/ref_kernels/generic/1/bli_invertv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/arm64/ref_kernels/generic/1/bli_scal2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/arm64/ref_kernels/generic/1/bli_scalv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/arm64/ref_kernels/generic/1/bli_setv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/arm64/ref_kernels/generic/1/bli_subv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/arm64/ref_kernels/generic/1/bli_swapv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/arm64/ref_kernels/generic/1/bli_xpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/arm64/ref_kernels/generic/1f/bli_axpy2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/arm64/ref_kernels/generic/1f/bli_axpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/arm64/ref_kernels/generic/1f/bli_dotaxpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/arm64/ref_kernels/generic/1f/bli_dotxaxpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/arm64/ref_kernels/generic/1f/bli_dotxf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/arm64/ref_kernels/generic/1m/bli_packm_cxk_1er_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": 
"obj/arm64/ref_kernels/generic/1m/bli_packm_cxk_bb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/arm64/ref_kernels/generic/1m/bli_packm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/arm64/ref_kernels/generic/1m/bli_unpackm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/arm64/ref_kernels/generic/3/bli_gemm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/arm64/ref_kernels/generic/3/bli_gemmsup_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/arm64/ref_kernels/generic/3/bli_gemmtrsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/arm64/ref_kernels/generic/3/bli_trsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/arm64/ref_kernels/generic/3/bb/bli_gemmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/arm64/ref_kernels/generic/3/bb/bli_gemmtrsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/arm64/ref_kernels/generic/3/bb/bli_trsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/arm64/ref_kernels/generic/ind/bli_gemm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/arm64/ref_kernels/generic/ind/bli_gemmtrsm1m_generic_ref.o", "flags": ["-O2", "-O3", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/arm64/ref_kernels/generic/ind/bli_trsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_check.c", "target": "obj/arm64/frame/0/bli_l0_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_fpa.c", "target": "obj/arm64/frame/0/bli_l0_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_oapi.c", "target": "obj/arm64/frame/0/bli_l0_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_tapi.c", "target": "obj/arm64/frame/0/bli_l0_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/copysc/bli_copysc.c", "target": "obj/arm64/frame/0/copysc/bli_copysc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_check.c", "target": "obj/arm64/frame/1/bli_l1v_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/arm64/frame/1/bli_l1v_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/arm64/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi.c", "target": "obj/arm64/frame/1/bli_l1v_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/arm64/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/arm64/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/arm64/frame/1/bli_l1v_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ex.c", "target": 
"obj/arm64/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_check.c", "target": "obj/arm64/frame/1d/bli_l1d_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_fpa.c", "target": "obj/arm64/frame/1d/bli_l1d_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/arm64/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/arm64/frame/1d/bli_l1d_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/arm64/frame/1d/bli_l1d_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/arm64/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi.c", "target": "obj/arm64/frame/1d/bli_l1d_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": "obj/arm64/frame/1d/bli_l1d_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_check.c", "target": "obj/arm64/frame/1f/bli_l1f_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_fpa.c", "target": "obj/arm64/frame/1f/bli_l1f_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/arm64/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi.c", "target": "obj/arm64/frame/1f/bli_l1f_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": "obj/arm64/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/arm64/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi.c", "target": "obj/arm64/frame/1f/bli_l1f_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/arm64/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_check.c", "target": "obj/arm64/frame/1m/bli_l1m_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_fpa.c", "target": "obj/arm64/frame/1m/bli_l1m_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/arm64/frame/1m/bli_l1m_oapi_ba.o", 
"flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi.c", "target": "obj/arm64/frame/1m/bli_l1m_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": "obj/arm64/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/arm64/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/arm64/frame/1m/bli_l1m_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/arm64/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_unb_var1.c", "target": "obj/arm64/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_alloc.c", "target": "obj/arm64/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/arm64/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_check.c", "target": "obj/arm64/frame/1m/packm/bli_packm_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cntl.c", "target": "obj/arm64/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/arm64/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk.c", "target": "obj/arm64/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_init.c", "target": "obj/arm64/frame/1m/packm/bli_packm_init.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/arm64/frame/1m/packm/bli_packm_int.o", "flags": ["-O2", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_part.c", "target": "obj/arm64/frame/1m/packm/bli_packm_part.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_scalar.c", "target": "obj/arm64/frame/1m/packm/bli_packm_scalar.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/arm64/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/arm64/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": "obj/arm64/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/arm64/frame/1m/packm/bli_packm_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/arm64/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/arm64/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cntl.c", "target": "obj/arm64/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": "obj/arm64/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/arm64/frame/1m/unpackm/bli_unpackm_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_check.c", "target": "obj/arm64/frame/2/bli_l2_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_fpa.c", "target": "obj/arm64/frame/2/bli_l2_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ba.c", 
"target": "obj/arm64/frame/2/bli_l2_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi.c", "target": "obj/arm64/frame/2/bli_l2_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ex.c", "target": "obj/arm64/frame/2/bli_l2_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/arm64/frame/2/bli_l2_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi.c", "target": "obj/arm64/frame/2/bli_l2_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ex.c", "target": "obj/arm64/frame/2/bli_l2_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/arm64/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/arm64/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/arm64/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": "obj/arm64/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/arm64/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var1.c", "target": "obj/arm64/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var2.c", "target": "obj/arm64/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/arm64/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": 
"obj/arm64/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/arm64/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": "obj/arm64/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/arm64/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/arm64/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/arm64/frame/2/hemv/bli_hemv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/arm64/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3.c", "target": "obj/arm64/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/arm64/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/arm64/frame/2/her/bli_her_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var2.c", "target": "obj/arm64/frame/2/her/bli_her_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/arm64/frame/2/her/bli_her_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/arm64/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/arm64/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/2/her2/bli_her2_unb_var3.c", "target": "obj/arm64/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var4.c", "target": "obj/arm64/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/arm64/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var4.c", "target": "obj/arm64/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_var_oapi.c", "target": "obj/arm64/frame/2/her2/bli_her2_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/arm64/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var2.c", "target": "obj/arm64/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/arm64/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/arm64/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": 
"obj/arm64/frame/2/trmv/bli_trmv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/arm64/frame/2/trsv/bli_trsv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var2.c", "target": "obj/arm64/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/arm64/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/arm64/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/arm64/frame/2/trsv/bli_trsv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_blocksize.c", "target": "obj/arm64/frame/3/bli_l3_blocksize.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_check.c", "target": "obj/arm64/frame/3/bli_l3_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_cntl.c", "target": "obj/arm64/frame/3/bli_l3_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_direct.c", "target": "obj/arm64/frame/3/bli_l3_direct.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ind.c", "target": "obj/arm64/frame/3/bli_l3_ind.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_int.c", "target": "obj/arm64/frame/3/bli_l3_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi.c", "target": "obj/arm64/frame/3/bli_l3_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi_ex.c", "target": "obj/arm64/frame/3/bli_l3_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_packab.c", "target": "obj/arm64/frame/3/bli_l3_packab.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_prune.c", "target": "obj/arm64/frame/3/bli_l3_prune.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_schema.c", "target": "obj/arm64/frame/3/bli_l3_schema.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup.c", "target": "obj/arm64/frame/3/bli_l3_sup.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_int.c", "target": "obj/arm64/frame/3/bli_l3_sup_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_a.c", "target": 
"obj/arm64/frame/3/bli_l3_sup_packm_a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_b.c", "target": "obj/arm64/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_var.c", "target": "obj/arm64/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/arm64/frame/3/bli_l3_sup_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var12.c", "target": "obj/arm64/frame/3/bli_l3_sup_var12.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/arm64/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi.c", "target": "obj/arm64/frame/3/bli_l3_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/arm64/frame/3/bli_l3_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_thrinfo.c", "target": "obj/arm64/frame/3/bli_l3_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_fpa.c", "target": "obj/arm64/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/arm64/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_tapi.c", "target": "obj/arm64/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/arm64/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/arm64/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var3.c", "target": "obj/arm64/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O2", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/arm64/frame/3/gemm/bli_gemm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/arm64/frame/3/gemm/bli_gemm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/arm64/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/arm64/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md.c", "target": "obj/arm64/frame/3/gemm/bli_gemm_md.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": "obj/arm64/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/arm64/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/arm64/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": "obj/arm64/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/arm64/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/arm64/frame/3/hemm/bli_hemm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/symm/bli_symm_front.c", "target": "obj/arm64/frame/3/symm/bli_symm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/arm64/frame/3/trmm/bli_trmm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": 
"obj/arm64/frame/3/trmm/bli_trmm_ll_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/arm64/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": "obj/arm64/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/arm64/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/arm64/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/arm64/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": "obj/arm64/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/arm64/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/arm64/frame/3/trsm/bli_trsm_blk_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": "obj/arm64/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_front.c", "target": "obj/arm64/frame/3/trsm/bli_trsm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": "obj/arm64/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": "obj/arm64/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/arm64/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/arm64/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/arm64/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_apool.c", "target": "obj/arm64/frame/base/bli_apool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_arch.c", "target": "obj/arm64/frame/base/bli_arch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_array.c", "target": "obj/arm64/frame/base/bli_array.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_blksz.c", "target": "obj/arm64/frame/base/bli_blksz.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_check.c", "target": "obj/arm64/frame/base/bli_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_clock.c", "target": "obj/arm64/frame/base/bli_clock.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntl.c", "target": "obj/arm64/frame/base/bli_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntx.c", "target": "obj/arm64/frame/base/bli_cntx.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_const.c", "target": "obj/arm64/frame/base/bli_const.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cpuid.c", "target": "obj/arm64/frame/base/bli_cpuid.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_env.c", "target": "obj/arm64/frame/base/bli_env.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_error.c", "target": "obj/arm64/frame/base/bli_error.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_func.c", "target": "obj/arm64/frame/base/bli_func.o", "flags": ["-O2", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_getopt.c", "target": "obj/arm64/frame/base/bli_getopt.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_gks.c", "target": "obj/arm64/frame/base/bli_gks.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_ind.c", "target": "obj/arm64/frame/base/bli_ind.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_info.c", "target": "obj/arm64/frame/base/bli_info.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_init.c", "target": 
"obj/arm64/frame/base/bli_init.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_machval.c", "target": "obj/arm64/frame/base/bli_machval.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_malloc.c", "target": "obj/arm64/frame/base/bli_malloc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_mbool.c", "target": "obj/arm64/frame/base/bli_mbool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_memsys.c", "target": "obj/arm64/frame/base/bli_memsys.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj.c", "target": "obj/arm64/frame/base/bli_obj.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj_scalar.c", "target": "obj/arm64/frame/base/bli_obj_scalar.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pack.c", "target": "obj/arm64/frame/base/bli_pack.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_param_map.c", "target": "obj/arm64/frame/base/bli_param_map.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_part.c", "target": "obj/arm64/frame/base/bli_part.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pba.c", "target": "obj/arm64/frame/base/bli_pba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pool.c", "target": "obj/arm64/frame/base/bli_pool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_prune.c", "target": "obj/arm64/frame/base/bli_prune.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_query.c", "target": "obj/arm64/frame/base/bli_query.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_rntm.c", "target": "obj/arm64/frame/base/bli_rntm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_sba.c", "target": "obj/arm64/frame/base/bli_sba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijm.c", "target": "obj/arm64/frame/base/bli_setgetijm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijv.c", "target": "obj/arm64/frame/base/bli_setgetijv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setri.c", "target": "obj/arm64/frame/base/bli_setri.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_string.c", "target": "obj/arm64/frame/base/bli_string.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_winsys.c", "target": "obj/arm64/frame/base/bli_winsys.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castm.c", "target": "obj/arm64/frame/base/cast/bli_castm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/arm64/frame/base/cast/bli_castnzm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castv.c", "target": "obj/arm64/frame/base/cast/bli_castv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/base/check/bli_obj_check.c", "target": "obj/arm64/frame/base/check/bli_obj_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_part_check.c", "target": "obj/arm64/frame/base/check/bli_part_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_dlamch.c", "target": "obj/arm64/frame/base/noopt/bli_dlamch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_lsame.c", "target": "obj/arm64/frame/base/noopt/bli_lsame.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_slamch.c", "target": "obj/arm64/frame/base/noopt/bli_slamch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projm.c", "target": "obj/arm64/frame/base/proj/bli_projm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projv.c", "target": "obj/arm64/frame/base/proj/bli_projv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/arm64/frame/thread/bli_l3_decor_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": "obj/arm64/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_single.c", "target": "obj/arm64/frame/thread/bli_l3_decor_single.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": "obj/arm64/frame/thread/bli_l3_sup_decor_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_pthreads.c", "target": "obj/arm64/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/arm64/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_pthread.c", "target": "obj/arm64/frame/thread/bli_pthread.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm.c", "target": "obj/arm64/frame/thread/bli_thrcomm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/arm64/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/arm64/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/arm64/frame/thread/bli_thrcomm_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thread.c", "target": "obj/arm64/frame/thread/bli_thread.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo.c", "target": "obj/arm64/frame/thread/bli_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo_sup.c", "target": "obj/arm64/frame/thread/bli_thrinfo_sup.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_check.c", "target": "obj/arm64/frame/util/bli_util_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_fpa.c", "target": "obj/arm64/frame/util/bli_util_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ba.c", "target": "obj/arm64/frame/util/bli_util_oapi_ba.o", 
"flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi.c", "target": "obj/arm64/frame/util/bli_util_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/arm64/frame/util/bli_util_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ba.c", "target": "obj/arm64/frame/util/bli_util_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi.c", "target": "obj/arm64/frame/util/bli_util_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ex.c", "target": "obj/arm64/frame/util/bli_util_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_unb_var1.c", "target": "obj/arm64/frame/util/bli_util_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/make/linux-arm64_no_sve.jsonl000066400000000000000000006017361427272030600232760ustar00rootroot00000000000000{"environment": {"HOSTNAME": "60371680ee71", "SSL_CERT_FILE": "/opt/_internal/certs.pem", "TERM": "xterm", "OLDPWD": "/usr/local/repos/cython-blis", "LC_ALL": "en_US.UTF-8", "LD_LIBRARY_PATH": "/opt/rh/devtoolset-10/root/usr/lib64:/opt/rh/devtoolset-10/root/usr/lib:/opt/rh/devtoolset-10/root/usr/lib64/dyninst:/opt/rh/devtoolset-10/root/usr/lib/dyninst:/usr/local/lib64", "LS_COLORS": 
"rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36:*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=01;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=01;36:*.xspf=01;36:", "VIRTUAL_ENV": "/usr/local/repos/cython-blis/env3.6", "PATH": "/usr/local/repos/cython-blis/env3.6/bin:/opt/rh/devtoolset-10/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", "PWD": "/usr/local/repos/cython-blis/flame-blis", "LANG": "en_US.UTF-8", "SHLVL": "2", "HOME": "/root", "DEVTOOLSET_ROOTPATH": "/opt/rh/devtoolset-10/root", "AUDITWHEEL_ARCH": "aarch64", "LANGUAGE": "en_US.UTF-8", "AUDITWHEEL_PLAT": "manylinux2014_aarch64", "PKG_CONFIG_PATH": "/usr/local/lib/pkgconfig", "AUDITWHEEL_POLICY": "manylinux2014", "_": 
"/usr/local/repos/cython-blis/env3.6/bin/python"}} {"compiler": "gcc", "source": "config/cortexa53/bli_cntx_init_cortexa53.c", "target": "obj/arm64_no_sve/config/cortexa53/bli_cntx_init_cortexa53.o", "flags": ["-O2", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/cortexa57/bli_cntx_init_cortexa57.c", "target": "obj/arm64_no_sve/config/cortexa57/bli_cntx_init_cortexa57.o", "flags": ["-O2", "-mcpu=cortex-a57", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/firestorm/bli_cntx_init_firestorm.c", "target": "obj/arm64_no_sve/config/firestorm/bli_cntx_init_firestorm.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/generic/bli_cntx_init_generic.c", "target": "obj/arm64_no_sve/config/generic/bli_cntx_init_generic.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c", "target": "obj/arm64_no_sve/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c", "target": "obj/arm64_no_sve/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c", "target": "obj/arm64_no_sve/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c", "target": "obj/arm64_no_sve/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c", "target": "obj/arm64_no_sve/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c", "target": "obj/arm64_no_sve/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c", "target": "obj/arm64_no_sve/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c", "target": "obj/arm64_no_sve/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", 
"-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c", "target": "obj/arm64_no_sve/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c", "target": "obj/arm64_no_sve/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c", "target": "obj/arm64_no_sve/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c", "target": "obj/arm64_no_sve/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c", "target": "obj/arm64_no_sve/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c", "target": "obj/arm64_no_sve/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c", "target": "obj/arm64_no_sve/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c", "target": "obj/arm64_no_sve/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c", "target": "obj/arm64_no_sve/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c", "target": "obj/arm64_no_sve/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c", "target": "obj/arm64_no_sve/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/bli_cntx_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1/bli_addv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1/bli_amaxv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1/bli_axpbyv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1/bli_axpyv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1/bli_copyv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1/bli_dotv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1/bli_dotxv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1/bli_invertv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1/bli_scal2v_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1/bli_scalv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1/bli_setv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": 
"obj/arm64_no_sve/ref_kernels/firestorm/1/bli_subv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1/bli_swapv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1/bli_xpbyv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1f/bli_axpy2v_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", 
"-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1f/bli_axpyf_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1f/bli_dotaxpyv_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1f/bli_dotxaxpyf_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1f/bli_dotxf_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1m/bli_packm_cxk_1er_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1m/bli_packm_cxk_bb_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1m/bli_packm_cxk_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/1m/bli_unpackm_cxk_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/3/bli_gemm_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/3/bli_gemmsup_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/3/bli_gemmtrsm_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/3/bli_trsm_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/3/bb/bli_gemmbb_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/3/bb/bli_gemmtrsmbb_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/3/bb/bli_trsmbb_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": 
"obj/arm64_no_sve/ref_kernels/firestorm/ind/bli_gemm1m_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/ind/bli_gemmtrsm1m_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/arm64_no_sve/ref_kernels/firestorm/ind/bli_trsm1m_firestorm_ref.o", "flags": ["-O2", "-march=armv8-a", "-O3", "-ftree-vectorize", "-march=armv8-a", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=firestorm", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/bli_cntx_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1/bli_addv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1/bli_amaxv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1/bli_axpbyv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1/bli_axpyv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1/bli_copyv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1/bli_dotv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1/bli_dotxv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1/bli_invertv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1/bli_scal2v_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_scalv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1/bli_scalv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1/bli_setv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1/bli_subv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1/bli_swapv_cortexa57_ref.o", "flags": ["-O2", 
"-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1/bli_xpbyv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1f/bli_axpy2v_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1f/bli_axpyf_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1f/bli_dotaxpyv_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1f/bli_dotxaxpyf_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1f/bli_dotxf_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1m/bli_packm_cxk_1er_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1m/bli_packm_cxk_bb_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1m/bli_packm_cxk_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/1m/bli_unpackm_cxk_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/3/bli_gemm_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/3/bli_gemmsup_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/3/bli_gemmtrsm_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/3/bli_trsm_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/3/bb/bli_gemmbb_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/3/bb/bli_gemmtrsmbb_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/3/bb/bli_trsmbb_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/ind/bli_gemm1m_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": 
"obj/arm64_no_sve/ref_kernels/cortexa57/ind/bli_gemmtrsm1m_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa57/ind/bli_trsm1m_cortexa57_ref.o", "flags": ["-O2", "-mcpu=cortex-a57", "-O3", "-ftree-vectorize", "-mcpu=cortex-a57", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa57", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/bli_cntx_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1/bli_addv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1/bli_amaxv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1/bli_axpbyv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1/bli_axpyv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1/bli_copyv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1/bli_dotv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1/bli_dotxv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1/bli_invertv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1/bli_scal2v_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1/bli_scalv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1/bli_setv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1/bli_subv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1/bli_swapv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": 
"obj/arm64_no_sve/ref_kernels/cortexa53/1/bli_xpbyv_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1f/bli_axpy2v_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1f/bli_axpyf_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1f/bli_dotaxpyv_cortexa53_ref.o", "flags": ["-O2", 
"-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1f/bli_dotxaxpyf_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1f/bli_dotxf_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1m/bli_packm_cxk_1er_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1m/bli_packm_cxk_bb_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1m/bli_packm_cxk_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/1m/bli_unpackm_cxk_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/3/bli_gemm_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/3/bli_gemmsup_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/3/bli_gemmtrsm_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/3/bli_trsm_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/3/bb/bli_gemmbb_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/3/bb/bli_gemmtrsmbb_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/3/bb/bli_trsmbb_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/ind/bli_gemm1m_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/ind/bli_gemmtrsm1m_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/arm64_no_sve/ref_kernels/cortexa53/ind/bli_trsm1m_cortexa53_ref.o", "flags": ["-O2", "-mcpu=cortex-a53", "-O3", "-ftree-vectorize", "-O3", "-mcpu=cortex-a53", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=cortexa53", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/bli_cntx_generic_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1/bli_addv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1/bli_amaxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1/bli_axpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1/bli_axpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1/bli_copyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": 
"obj/arm64_no_sve/ref_kernels/generic/1/bli_dotv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1/bli_dotxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1/bli_invertv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1/bli_scal2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1/bli_scalv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1/bli_setv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1/bli_subv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1/bli_swapv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1/bli_xpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1f/bli_axpy2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1f/bli_axpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": 
"obj/arm64_no_sve/ref_kernels/generic/1f/bli_dotaxpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1f/bli_dotxaxpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1f/bli_dotxf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1m/bli_packm_cxk_1er_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1m/bli_packm_cxk_bb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1m/bli_packm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/1m/bli_unpackm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/3/bli_gemm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/3/bli_gemmsup_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/3/bli_gemmtrsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/3/bli_trsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/3/bb/bli_gemmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/3/bb/bli_gemmtrsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/3/bb/bli_trsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/ind/bli_gemm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/ind/bli_gemmtrsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/arm64_no_sve/ref_kernels/generic/ind/bli_trsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_check.c", "target": "obj/arm64_no_sve/frame/0/bli_l0_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_fpa.c", "target": "obj/arm64_no_sve/frame/0/bli_l0_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_oapi.c", "target": "obj/arm64_no_sve/frame/0/bli_l0_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_tapi.c", "target": "obj/arm64_no_sve/frame/0/bli_l0_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/copysc/bli_copysc.c", "target": "obj/arm64_no_sve/frame/0/copysc/bli_copysc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_check.c", "target": "obj/arm64_no_sve/frame/1/bli_l1v_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/arm64_no_sve/frame/1/bli_l1v_fpa.o", "flags": ["-O2", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/arm64_no_sve/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi.c", "target": "obj/arm64_no_sve/frame/1/bli_l1v_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/arm64_no_sve/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/arm64_no_sve/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/arm64_no_sve/frame/1/bli_l1v_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ex.c", "target": "obj/arm64_no_sve/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_check.c", "target": "obj/arm64_no_sve/frame/1d/bli_l1d_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_fpa.c", "target": "obj/arm64_no_sve/frame/1d/bli_l1d_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/arm64_no_sve/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/arm64_no_sve/frame/1d/bli_l1d_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/arm64_no_sve/frame/1d/bli_l1d_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/arm64_no_sve/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi.c", "target": "obj/arm64_no_sve/frame/1d/bli_l1d_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": 
"obj/arm64_no_sve/frame/1d/bli_l1d_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_check.c", "target": "obj/arm64_no_sve/frame/1f/bli_l1f_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_fpa.c", "target": "obj/arm64_no_sve/frame/1f/bli_l1f_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/arm64_no_sve/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi.c", "target": "obj/arm64_no_sve/frame/1f/bli_l1f_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": "obj/arm64_no_sve/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/arm64_no_sve/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi.c", "target": "obj/arm64_no_sve/frame/1f/bli_l1f_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/arm64_no_sve/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_check.c", "target": "obj/arm64_no_sve/frame/1m/bli_l1m_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_fpa.c", "target": "obj/arm64_no_sve/frame/1m/bli_l1m_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/arm64_no_sve/frame/1m/bli_l1m_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi.c", "target": "obj/arm64_no_sve/frame/1m/bli_l1m_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": "obj/arm64_no_sve/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/arm64_no_sve/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/arm64_no_sve/frame/1m/bli_l1m_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/arm64_no_sve/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_unb_var1.c", "target": "obj/arm64_no_sve/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_alloc.c", "target": "obj/arm64_no_sve/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/arm64_no_sve/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_check.c", "target": "obj/arm64_no_sve/frame/1m/packm/bli_packm_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cntl.c", "target": "obj/arm64_no_sve/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/arm64_no_sve/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/1m/packm/bli_packm_cxk.c", "target": "obj/arm64_no_sve/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_init.c", "target": "obj/arm64_no_sve/frame/1m/packm/bli_packm_init.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/arm64_no_sve/frame/1m/packm/bli_packm_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_part.c", "target": "obj/arm64_no_sve/frame/1m/packm/bli_packm_part.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_scalar.c", "target": "obj/arm64_no_sve/frame/1m/packm/bli_packm_scalar.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/arm64_no_sve/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/arm64_no_sve/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": "obj/arm64_no_sve/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/arm64_no_sve/frame/1m/packm/bli_packm_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/arm64_no_sve/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/arm64_no_sve/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cntl.c", "target": "obj/arm64_no_sve/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": "obj/arm64_no_sve/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/arm64_no_sve/frame/1m/unpackm/bli_unpackm_int.o", "flags": 
["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_check.c", "target": "obj/arm64_no_sve/frame/2/bli_l2_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_fpa.c", "target": "obj/arm64_no_sve/frame/2/bli_l2_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ba.c", "target": "obj/arm64_no_sve/frame/2/bli_l2_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi.c", "target": "obj/arm64_no_sve/frame/2/bli_l2_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ex.c", "target": "obj/arm64_no_sve/frame/2/bli_l2_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/arm64_no_sve/frame/2/bli_l2_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi.c", "target": "obj/arm64_no_sve/frame/2/bli_l2_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ex.c", "target": "obj/arm64_no_sve/frame/2/bli_l2_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/arm64_no_sve/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/arm64_no_sve/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/arm64_no_sve/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": "obj/arm64_no_sve/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/arm64_no_sve/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "frame/2/ger/bli_ger_unb_var1.c", "target": "obj/arm64_no_sve/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var2.c", "target": "obj/arm64_no_sve/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/arm64_no_sve/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": "obj/arm64_no_sve/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/arm64_no_sve/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": "obj/arm64_no_sve/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/arm64_no_sve/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/arm64_no_sve/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/arm64_no_sve/frame/2/hemv/bli_hemv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/arm64_no_sve/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3.c", "target": "obj/arm64_no_sve/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/arm64_no_sve/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/arm64_no_sve/frame/2/her/bli_her_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var2.c", "target": "obj/arm64_no_sve/frame/2/her/bli_her_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/arm64_no_sve/frame/2/her/bli_her_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/arm64_no_sve/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/arm64_no_sve/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var3.c", "target": "obj/arm64_no_sve/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var4.c", "target": "obj/arm64_no_sve/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/arm64_no_sve/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var4.c", "target": "obj/arm64_no_sve/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_var_oapi.c", "target": "obj/arm64_no_sve/frame/2/her2/bli_her2_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/arm64_no_sve/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var2.c", "target": "obj/arm64_no_sve/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/arm64_no_sve/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/arm64_no_sve/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": "obj/arm64_no_sve/frame/2/trmv/bli_trmv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/arm64_no_sve/frame/2/trsv/bli_trsv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var2.c", "target": "obj/arm64_no_sve/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/arm64_no_sve/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/arm64_no_sve/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/arm64_no_sve/frame/2/trsv/bli_trsv_var_oapi.o", "flags": 
["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_blocksize.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_blocksize.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_check.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_cntl.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_direct.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_direct.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ind.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_ind.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_int.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi_ex.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_packab.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_packab.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_prune.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_prune.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_schema.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_schema.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_sup.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_int.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_sup_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_a.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_sup_packm_a.o", "flags": 
["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_b.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_var.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_sup_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var12.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_sup_var12.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_thrinfo.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_fpa.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_tapi.c", "target": "obj/arm64_no_sve/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/arm64_no_sve/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/arm64_no_sve/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "frame/3/gemm/bli_gemm_blk_var3.c", "target": "obj/arm64_no_sve/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/arm64_no_sve/frame/3/gemm/bli_gemm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/arm64_no_sve/frame/3/gemm/bli_gemm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/arm64_no_sve/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/arm64_no_sve/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md.c", "target": "obj/arm64_no_sve/frame/3/gemm/bli_gemm_md.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": "obj/arm64_no_sve/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/arm64_no_sve/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/arm64_no_sve/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": "obj/arm64_no_sve/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/arm64_no_sve/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/arm64_no_sve/frame/3/hemm/bli_hemm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/symm/bli_symm_front.c", "target": "obj/arm64_no_sve/frame/3/symm/bli_symm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/arm64_no_sve/frame/3/trmm/bli_trmm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": "obj/arm64_no_sve/frame/3/trmm/bli_trmm_ll_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/arm64_no_sve/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": "obj/arm64_no_sve/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/arm64_no_sve/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/arm64_no_sve/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/arm64_no_sve/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": "obj/arm64_no_sve/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/arm64_no_sve/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/arm64_no_sve/frame/3/trsm/bli_trsm_blk_var3.o", "flags": 
["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": "obj/arm64_no_sve/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_front.c", "target": "obj/arm64_no_sve/frame/3/trsm/bli_trsm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": "obj/arm64_no_sve/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": "obj/arm64_no_sve/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/arm64_no_sve/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/arm64_no_sve/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/arm64_no_sve/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_apool.c", "target": "obj/arm64_no_sve/frame/base/bli_apool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_arch.c", "target": 
"obj/arm64_no_sve/frame/base/bli_arch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_array.c", "target": "obj/arm64_no_sve/frame/base/bli_array.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_blksz.c", "target": "obj/arm64_no_sve/frame/base/bli_blksz.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_check.c", "target": "obj/arm64_no_sve/frame/base/bli_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_clock.c", "target": "obj/arm64_no_sve/frame/base/bli_clock.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntl.c", "target": "obj/arm64_no_sve/frame/base/bli_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntx.c", "target": "obj/arm64_no_sve/frame/base/bli_cntx.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_const.c", "target": "obj/arm64_no_sve/frame/base/bli_const.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cpuid.c", "target": "obj/arm64_no_sve/frame/base/bli_cpuid.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_env.c", "target": "obj/arm64_no_sve/frame/base/bli_env.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_error.c", "target": "obj/arm64_no_sve/frame/base/bli_error.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_func.c", "target": "obj/arm64_no_sve/frame/base/bli_func.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_getopt.c", "target": "obj/arm64_no_sve/frame/base/bli_getopt.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_gks.c", "target": "obj/arm64_no_sve/frame/base/bli_gks.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_ind.c", "target": 
"obj/arm64_no_sve/frame/base/bli_ind.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_info.c", "target": "obj/arm64_no_sve/frame/base/bli_info.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_init.c", "target": "obj/arm64_no_sve/frame/base/bli_init.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_machval.c", "target": "obj/arm64_no_sve/frame/base/bli_machval.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_malloc.c", "target": "obj/arm64_no_sve/frame/base/bli_malloc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_mbool.c", "target": "obj/arm64_no_sve/frame/base/bli_mbool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_memsys.c", "target": "obj/arm64_no_sve/frame/base/bli_memsys.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj.c", "target": "obj/arm64_no_sve/frame/base/bli_obj.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj_scalar.c", "target": "obj/arm64_no_sve/frame/base/bli_obj_scalar.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pack.c", "target": "obj/arm64_no_sve/frame/base/bli_pack.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_param_map.c", "target": "obj/arm64_no_sve/frame/base/bli_param_map.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_part.c", "target": "obj/arm64_no_sve/frame/base/bli_part.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pba.c", "target": "obj/arm64_no_sve/frame/base/bli_pba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pool.c", "target": "obj/arm64_no_sve/frame/base/bli_pool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_prune.c", "target": 
"obj/arm64_no_sve/frame/base/bli_prune.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_query.c", "target": "obj/arm64_no_sve/frame/base/bli_query.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_rntm.c", "target": "obj/arm64_no_sve/frame/base/bli_rntm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_sba.c", "target": "obj/arm64_no_sve/frame/base/bli_sba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijm.c", "target": "obj/arm64_no_sve/frame/base/bli_setgetijm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijv.c", "target": "obj/arm64_no_sve/frame/base/bli_setgetijv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setri.c", "target": "obj/arm64_no_sve/frame/base/bli_setri.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_string.c", "target": "obj/arm64_no_sve/frame/base/bli_string.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_winsys.c", "target": "obj/arm64_no_sve/frame/base/bli_winsys.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castm.c", "target": "obj/arm64_no_sve/frame/base/cast/bli_castm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", 
"-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/arm64_no_sve/frame/base/cast/bli_castnzm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castv.c", "target": "obj/arm64_no_sve/frame/base/cast/bli_castv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_obj_check.c", "target": "obj/arm64_no_sve/frame/base/check/bli_obj_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_part_check.c", "target": "obj/arm64_no_sve/frame/base/check/bli_part_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/base/noopt/bli_dlamch.c", "target": "obj/arm64_no_sve/frame/base/noopt/bli_dlamch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_lsame.c", "target": "obj/arm64_no_sve/frame/base/noopt/bli_lsame.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_slamch.c", "target": "obj/arm64_no_sve/frame/base/noopt/bli_slamch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projm.c", "target": "obj/arm64_no_sve/frame/base/proj/bli_projm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projv.c", "target": "obj/arm64_no_sve/frame/base/proj/bli_projv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/arm64_no_sve/frame/thread/bli_l3_decor_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": "obj/arm64_no_sve/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_single.c", "target": "obj/arm64_no_sve/frame/thread/bli_l3_decor_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": "obj/arm64_no_sve/frame/thread/bli_l3_sup_decor_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_pthreads.c", "target": "obj/arm64_no_sve/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/arm64_no_sve/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_pthread.c", "target": "obj/arm64_no_sve/frame/thread/bli_pthread.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm.c", "target": "obj/arm64_no_sve/frame/thread/bli_thrcomm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/arm64_no_sve/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/arm64_no_sve/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/arm64_no_sve/frame/thread/bli_thrcomm_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thread.c", "target": "obj/arm64_no_sve/frame/thread/bli_thread.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo.c", "target": "obj/arm64_no_sve/frame/thread/bli_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo_sup.c", "target": "obj/arm64_no_sve/frame/thread/bli_thrinfo_sup.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_check.c", "target": "obj/arm64_no_sve/frame/util/bli_util_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_fpa.c", "target": "obj/arm64_no_sve/frame/util/bli_util_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ba.c", "target": "obj/arm64_no_sve/frame/util/bli_util_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi.c", "target": "obj/arm64_no_sve/frame/util/bli_util_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/arm64_no_sve/frame/util/bli_util_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ba.c", "target": "obj/arm64_no_sve/frame/util/bli_util_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi.c", "target": "obj/arm64_no_sve/frame/util/bli_util_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ex.c", "target": "obj/arm64_no_sve/frame/util/bli_util_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/util/bli_util_unb_var1.c", "target": "obj/arm64_no_sve/frame/util/bli_util_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_GNU_SOURCE", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-arm64_no_sve", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/make/linux-generic.jsonl000066400000000000000000003412341427272030600224020ustar00rootroot00000000000000{"environment": {"HOSTNAME": "a1ce18f3e0bf", "SSL_CERT_FILE": "/opt/_internal/certs.pem", "TERM": "xterm", "OLDPWD": "/usr/local/repos/cython-blis", "LC_ALL": "en_US.UTF-8", "LD_LIBRARY_PATH": "/opt/rh/devtoolset-10/root/usr/lib64:/opt/rh/devtoolset-10/root/usr/lib:/opt/rh/devtoolset-10/root/usr/lib64/dyninst:/opt/rh/devtoolset-10/root/usr/lib/dyninst:/usr/local/lib64", "LS_COLORS": "rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;
35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36:*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=01;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=01;36:*.xspf=01;36:", "VIRTUAL_ENV": "/usr/local/repos/cython-blis/env3.6", "PATH": "/usr/local/repos/cython-blis/env3.6/bin:/opt/rh/devtoolset-10/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", "PWD": "/usr/local/repos/cython-blis/flame-blis", "LANG": "en_US.UTF-8", "SHLVL": "2", "HOME": "/root", "DEVTOOLSET_ROOTPATH": "/opt/rh/devtoolset-10/root", "AUDITWHEEL_ARCH": "x86_64", "LANGUAGE": "en_US.UTF-8", "AUDITWHEEL_PLAT": "manylinux2014_x86_64", "PKG_CONFIG_PATH": "/usr/local/lib/pkgconfig", "container": "podman", "AUDITWHEEL_POLICY": "manylinux2014", "_": "/usr/local/repos/cython-blis/env3.6/bin/python"}} {"compiler": "gcc", "source": "config/generic/bli_cntx_init_generic.c", "target": "obj/generic/config/generic/bli_cntx_init_generic.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/generic/ref_kernels/generic/bli_cntx_generic_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": 
"obj/generic/ref_kernels/generic/1/bli_addv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_amaxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_axpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_axpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_copyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_dotv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_dotxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_invertv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_scal2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_scalv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_setv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_subv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_swapv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_xpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/generic/ref_kernels/generic/1f/bli_axpy2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": 
"obj/generic/ref_kernels/generic/1f/bli_axpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/generic/ref_kernels/generic/1f/bli_dotaxpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/generic/ref_kernels/generic/1f/bli_dotxaxpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/generic/ref_kernels/generic/1f/bli_dotxf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/generic/ref_kernels/generic/1m/bli_packm_cxk_1er_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/generic/ref_kernels/generic/1m/bli_packm_cxk_bb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/generic/ref_kernels/generic/1m/bli_packm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/generic/ref_kernels/generic/1m/bli_unpackm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/generic/ref_kernels/generic/3/bli_gemm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/generic/ref_kernels/generic/3/bli_gemmsup_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/generic/ref_kernels/generic/3/bli_gemmtrsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/generic/ref_kernels/generic/3/bli_trsm_generic_ref.o", "flags": 
["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/generic/ref_kernels/generic/3/bb/bli_gemmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/generic/ref_kernels/generic/3/bb/bli_gemmtrsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/generic/ref_kernels/generic/3/bb/bli_trsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/generic/ref_kernels/generic/ind/bli_gemm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/generic/ref_kernels/generic/ind/bli_gemmtrsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/generic/ref_kernels/generic/ind/bli_trsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_check.c", "target": "obj/generic/frame/0/bli_l0_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_fpa.c", "target": "obj/generic/frame/0/bli_l0_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_oapi.c", "target": "obj/generic/frame/0/bli_l0_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_tapi.c", "target": "obj/generic/frame/0/bli_l0_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/copysc/bli_copysc.c", "target": "obj/generic/frame/0/copysc/bli_copysc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_check.c", "target": "obj/generic/frame/1/bli_l1v_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/generic/frame/1/bli_l1v_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/generic/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi.c", "target": "obj/generic/frame/1/bli_l1v_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/generic/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/generic/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/generic/frame/1/bli_l1v_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ex.c", "target": "obj/generic/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_check.c", "target": "obj/generic/frame/1d/bli_l1d_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_fpa.c", "target": "obj/generic/frame/1d/bli_l1d_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/generic/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/generic/frame/1d/bli_l1d_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/generic/frame/1d/bli_l1d_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/generic/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi.c", "target": "obj/generic/frame/1d/bli_l1d_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": "obj/generic/frame/1d/bli_l1d_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_check.c", "target": "obj/generic/frame/1f/bli_l1f_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_fpa.c", "target": "obj/generic/frame/1f/bli_l1f_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/generic/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi.c", "target": "obj/generic/frame/1f/bli_l1f_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": "obj/generic/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/generic/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi.c", "target": "obj/generic/frame/1f/bli_l1f_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/generic/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_check.c", "target": "obj/generic/frame/1m/bli_l1m_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_fpa.c", "target": "obj/generic/frame/1m/bli_l1m_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/generic/frame/1m/bli_l1m_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi.c", "target": "obj/generic/frame/1m/bli_l1m_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": "obj/generic/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/generic/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/generic/frame/1m/bli_l1m_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/generic/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_unb_var1.c", "target": "obj/generic/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_alloc.c", "target": "obj/generic/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/generic/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_check.c", "target": 
"obj/generic/frame/1m/packm/bli_packm_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cntl.c", "target": "obj/generic/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/generic/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk.c", "target": "obj/generic/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_init.c", "target": "obj/generic/frame/1m/packm/bli_packm_init.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/generic/frame/1m/packm/bli_packm_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_part.c", "target": "obj/generic/frame/1m/packm/bli_packm_part.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_scalar.c", "target": "obj/generic/frame/1m/packm/bli_packm_scalar.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/generic/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/generic/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": "obj/generic/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/generic/frame/1m/packm/bli_packm_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/generic/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/generic/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cntl.c", "target": 
"obj/generic/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": "obj/generic/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/generic/frame/1m/unpackm/bli_unpackm_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_check.c", "target": "obj/generic/frame/2/bli_l2_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_fpa.c", "target": "obj/generic/frame/2/bli_l2_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "frame/2/bli_l2_oapi_ba.c", "target": "obj/generic/frame/2/bli_l2_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi.c", "target": "obj/generic/frame/2/bli_l2_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ex.c", "target": "obj/generic/frame/2/bli_l2_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/generic/frame/2/bli_l2_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi.c", "target": "obj/generic/frame/2/bli_l2_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ex.c", "target": "obj/generic/frame/2/bli_l2_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/generic/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/generic/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/generic/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": "obj/generic/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/generic/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var1.c", "target": "obj/generic/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var2.c", "target": "obj/generic/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/generic/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unf_var1.o", 
"flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/generic/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/generic/frame/2/her/bli_her_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/2/her/bli_her_unb_var2.c", "target": "obj/generic/frame/2/her/bli_her_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/generic/frame/2/her/bli_her_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/generic/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/generic/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var3.c", "target": "obj/generic/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var4.c", "target": "obj/generic/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/generic/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var4.c", "target": "obj/generic/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_var_oapi.c", "target": "obj/generic/frame/2/her2/bli_her2_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/generic/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var2.c", "target": "obj/generic/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/generic/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/generic/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": "obj/generic/frame/2/trmv/bli_trmv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/generic/frame/2/trsv/bli_trsv_unb_var1.o", 
"flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var2.c", "target": "obj/generic/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/generic/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/generic/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/generic/frame/2/trsv/bli_trsv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "frame/3/bli_l3_blocksize.c", "target": "obj/generic/frame/3/bli_l3_blocksize.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_check.c", "target": "obj/generic/frame/3/bli_l3_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_cntl.c", "target": "obj/generic/frame/3/bli_l3_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_direct.c", "target": "obj/generic/frame/3/bli_l3_direct.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ind.c", "target": "obj/generic/frame/3/bli_l3_ind.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "frame/3/bli_l3_int.c", "target": "obj/generic/frame/3/bli_l3_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi.c", "target": "obj/generic/frame/3/bli_l3_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi_ex.c", "target": "obj/generic/frame/3/bli_l3_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_packab.c", "target": "obj/generic/frame/3/bli_l3_packab.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_prune.c", "target": "obj/generic/frame/3/bli_l3_prune.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "frame/3/bli_l3_schema.c", "target": "obj/generic/frame/3/bli_l3_schema.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup.c", "target": "obj/generic/frame/3/bli_l3_sup.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_int.c", "target": "obj/generic/frame/3/bli_l3_sup_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_a.c", "target": "obj/generic/frame/3/bli_l3_sup_packm_a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_b.c", "target": "obj/generic/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_var.c", "target": "obj/generic/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/generic/frame/3/bli_l3_sup_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var12.c", "target": "obj/generic/frame/3/bli_l3_sup_var12.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/generic/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi.c", "target": "obj/generic/frame/3/bli_l3_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/generic/frame/3/bli_l3_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_thrinfo.c", "target": "obj/generic/frame/3/bli_l3_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_fpa.c", "target": "obj/generic/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/generic/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_tapi.c", "target": "obj/generic/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/generic/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/generic/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var3.c", "target": "obj/generic/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/generic/frame/3/gemm/bli_gemm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/generic/frame/3/gemm/bli_gemm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/generic/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/generic/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md.c", "target": "obj/generic/frame/3/gemm/bli_gemm_md.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": "obj/generic/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/generic/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O2", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/generic/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": "obj/generic/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/generic/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/generic/frame/3/hemm/bli_hemm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "frame/3/symm/bli_symm_front.c", "target": "obj/generic/frame/3/symm/bli_symm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/generic/frame/3/trmm/bli_trmm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": "obj/generic/frame/3/trmm/bli_trmm_ll_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/generic/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": "obj/generic/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/generic/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/generic/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/generic/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": "obj/generic/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/generic/frame/3/trsm/bli_trsm_blk_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": "obj/generic/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_front.c", "target": "obj/generic/frame/3/trsm/bli_trsm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": 
["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_apool.c", "target": "obj/generic/frame/base/bli_apool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/base/bli_arch.c", "target": "obj/generic/frame/base/bli_arch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_array.c", "target": "obj/generic/frame/base/bli_array.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_blksz.c", "target": "obj/generic/frame/base/bli_blksz.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_check.c", "target": "obj/generic/frame/base/bli_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_clock.c", "target": "obj/generic/frame/base/bli_clock.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/base/bli_cntl.c", "target": "obj/generic/frame/base/bli_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntx.c", "target": "obj/generic/frame/base/bli_cntx.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_const.c", "target": "obj/generic/frame/base/bli_const.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cpuid.c", "target": "obj/generic/frame/base/bli_cpuid.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_env.c", "target": "obj/generic/frame/base/bli_env.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/base/bli_error.c", "target": "obj/generic/frame/base/bli_error.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_func.c", "target": "obj/generic/frame/base/bli_func.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_getopt.c", "target": "obj/generic/frame/base/bli_getopt.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_gks.c", "target": "obj/generic/frame/base/bli_gks.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_ind.c", "target": "obj/generic/frame/base/bli_ind.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/base/bli_info.c", "target": "obj/generic/frame/base/bli_info.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_init.c", "target": "obj/generic/frame/base/bli_init.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_machval.c", "target": "obj/generic/frame/base/bli_machval.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_malloc.c", "target": "obj/generic/frame/base/bli_malloc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_mbool.c", "target": "obj/generic/frame/base/bli_mbool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/base/bli_memsys.c", "target": "obj/generic/frame/base/bli_memsys.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj.c", "target": "obj/generic/frame/base/bli_obj.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj_scalar.c", "target": "obj/generic/frame/base/bli_obj_scalar.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pack.c", "target": "obj/generic/frame/base/bli_pack.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_param_map.c", "target": "obj/generic/frame/base/bli_param_map.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "frame/base/bli_part.c", "target": "obj/generic/frame/base/bli_part.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pba.c", "target": "obj/generic/frame/base/bli_pba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pool.c", "target": "obj/generic/frame/base/bli_pool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_prune.c", "target": "obj/generic/frame/base/bli_prune.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_query.c", "target": "obj/generic/frame/base/bli_query.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/base/bli_rntm.c", "target": "obj/generic/frame/base/bli_rntm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_sba.c", "target": "obj/generic/frame/base/bli_sba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijm.c", "target": "obj/generic/frame/base/bli_setgetijm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijv.c", "target": "obj/generic/frame/base/bli_setgetijv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setri.c", "target": "obj/generic/frame/base/bli_setri.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "frame/base/bli_string.c", "target": "obj/generic/frame/base/bli_string.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_winsys.c", "target": "obj/generic/frame/base/bli_winsys.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castm.c", "target": "obj/generic/frame/base/cast/bli_castm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/generic/frame/base/cast/bli_castnzm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castv.c", "target": "obj/generic/frame/base/cast/bli_castv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_obj_check.c", "target": "obj/generic/frame/base/check/bli_obj_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_part_check.c", "target": "obj/generic/frame/base/check/bli_part_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_dlamch.c", "target": "obj/generic/frame/base/noopt/bli_dlamch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_lsame.c", "target": "obj/generic/frame/base/noopt/bli_lsame.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_slamch.c", "target": "obj/generic/frame/base/noopt/bli_slamch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projm.c", "target": "obj/generic/frame/base/proj/bli_projm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projv.c", "target": "obj/generic/frame/base/proj/bli_projv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/generic/frame/thread/bli_l3_decor_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": "obj/generic/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_single.c", "target": "obj/generic/frame/thread/bli_l3_decor_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": "obj/generic/frame/thread/bli_l3_sup_decor_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_pthreads.c", "target": "obj/generic/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/generic/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_pthread.c", "target": "obj/generic/frame/thread/bli_pthread.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm.c", "target": 
"obj/generic/frame/thread/bli_thrcomm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/generic/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/generic/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/generic/frame/thread/bli_thrcomm_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thread.c", "target": "obj/generic/frame/thread/bli_thread.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo.c", "target": "obj/generic/frame/thread/bli_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo_sup.c", "target": "obj/generic/frame/thread/bli_thrinfo_sup.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_check.c", "target": "obj/generic/frame/util/bli_util_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_fpa.c", "target": "obj/generic/frame/util/bli_util_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ba.c", "target": "obj/generic/frame/util/bli_util_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi.c", "target": "obj/generic/frame/util/bli_util_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/generic/frame/util/bli_util_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ba.c", "target": "obj/generic/frame/util/bli_util_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi.c", "target": "obj/generic/frame/util/bli_util_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ex.c", "target": "obj/generic/frame/util/bli_util_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_unb_var1.c", "target": "obj/generic/frame/util/bli_util_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/make/linux-power9.jsonl000066400000000000000000003372641427272030600222230ustar00rootroot00000000000000{"environment": {"_": "/usr/local/repos/cython-blis/env3.6/bin/python", "AUDITWHEEL_POLICY": "manylinux2014", "container": "podman", "PKG_CONFIG_PATH": "/usr/local/lib/pkgconfig", "AUDITWHEEL_PLAT": "manylinux2014_ppc64le", "LANGUAGE": "en_US.UTF-8", "SHLVL": "2", "AUDITWHEEL_ARCH": "ppc64le", "DEVTOOLSET_ROOTPATH": "/opt/rh/devtoolset-10/root", "HOME": "/root", "LANG": "en_US.UTF-8", "PWD": "/usr/local/repos/cython-blis/flame-blis", "PATH": "/usr/local/repos/cython-blis/env3.6/bin:/opt/rh/devtoolset-10/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", "VIRTUAL_ENV": "/usr/local/repos/cython-blis/env3.6", "LS_COLORS": 
"rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36:*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=01;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=01;36:*.xspf=01;36:", "LD_LIBRARY_PATH": "/opt/rh/devtoolset-10/root/usr/lib64:/opt/rh/devtoolset-10/root/usr/lib:/opt/rh/devtoolset-10/root/usr/lib64/dyninst:/opt/rh/devtoolset-10/root/usr/lib/dyninst:/usr/local/lib64", "LC_ALL": "en_US.UTF-8", "OLDPWD": "/usr/local/repos/cython-blis", "TERM": "xterm", "SSL_CERT_FILE": "/opt/_internal/certs.pem", "HOSTNAME": "71f7849e7599"}} {"compiler": "gcc", "source": "config/power9/bli_cntx_init_power9.c", "target": "obj/power9/config/power9/bli_cntx_init_power9.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/power9/3/bli_gemm_power9_asm_d12x6.c", "target": "obj/power9/kernels/power9/3/bli_gemm_power9_asm_d12x6.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/power9/ref_kernels/power9/bli_cntx_power9_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/power9/ref_kernels/power9/1/bli_addv_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/power9/ref_kernels/power9/1/bli_amaxv_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/power9/ref_kernels/power9/1/bli_axpbyv_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/power9/ref_kernels/power9/1/bli_axpyv_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/power9/ref_kernels/power9/1/bli_copyv_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/power9/ref_kernels/power9/1/bli_dotv_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/power9/ref_kernels/power9/1/bli_dotxv_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/power9/ref_kernels/power9/1/bli_invertv_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/power9/ref_kernels/power9/1/bli_scal2v_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/power9/ref_kernels/power9/1/bli_scalv_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/power9/ref_kernels/power9/1/bli_setv_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/power9/ref_kernels/power9/1/bli_subv_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/power9/ref_kernels/power9/1/bli_swapv_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/power9/ref_kernels/power9/1/bli_xpbyv_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/power9/ref_kernels/power9/1f/bli_axpy2v_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/power9/ref_kernels/power9/1f/bli_axpyf_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/power9/ref_kernels/power9/1f/bli_dotaxpyv_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/power9/ref_kernels/power9/1f/bli_dotxaxpyf_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/power9/ref_kernels/power9/1f/bli_dotxf_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/power9/ref_kernels/power9/1m/bli_packm_cxk_1er_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/power9/ref_kernels/power9/1m/bli_packm_cxk_bb_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/power9/ref_kernels/power9/1m/bli_packm_cxk_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/power9/ref_kernels/power9/1m/bli_unpackm_cxk_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/power9/ref_kernels/power9/3/bli_gemm_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/power9/ref_kernels/power9/3/bli_gemmsup_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/power9/ref_kernels/power9/3/bli_gemmtrsm_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/power9/ref_kernels/power9/3/bli_trsm_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/power9/ref_kernels/power9/3/bb/bli_gemmbb_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/power9/ref_kernels/power9/3/bb/bli_gemmtrsmbb_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/power9/ref_kernels/power9/3/bb/bli_trsmbb_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", 
"-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/power9/ref_kernels/power9/ind/bli_gemm1m_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/power9/ref_kernels/power9/ind/bli_gemmtrsm1m_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/power9/ref_kernels/power9/ind/bli_trsm1m_power9_ref.o", "flags": ["-O2", "-O3", "-mcpu=power9", "-mtune=power9", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-DXLC=0", "-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=power9", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_check.c", "target": "obj/power9/frame/0/bli_l0_check.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_fpa.c", "target": "obj/power9/frame/0/bli_l0_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_oapi.c", "target": "obj/power9/frame/0/bli_l0_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_tapi.c", "target": "obj/power9/frame/0/bli_l0_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/copysc/bli_copysc.c", "target": "obj/power9/frame/0/copysc/bli_copysc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_check.c", "target": "obj/power9/frame/1/bli_l1v_check.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/power9/frame/1/bli_l1v_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/power9/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi.c", "target": "obj/power9/frame/1/bli_l1v_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/power9/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/power9/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O2", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/power9/frame/1/bli_l1v_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ex.c", "target": "obj/power9/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_check.c", "target": "obj/power9/frame/1d/bli_l1d_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_fpa.c", "target": "obj/power9/frame/1d/bli_l1d_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/power9/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O2", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/power9/frame/1d/bli_l1d_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/power9/frame/1d/bli_l1d_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/power9/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi.c", "target": "obj/power9/frame/1d/bli_l1d_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": "obj/power9/frame/1d/bli_l1d_tapi_ex.o", 
"flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_check.c", "target": "obj/power9/frame/1f/bli_l1f_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_fpa.c", "target": "obj/power9/frame/1f/bli_l1f_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/power9/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi.c", "target": "obj/power9/frame/1f/bli_l1f_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": 
"obj/power9/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/power9/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi.c", "target": "obj/power9/frame/1f/bli_l1f_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/power9/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_check.c", "target": "obj/power9/frame/1m/bli_l1m_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/1m/bli_l1m_fpa.c", "target": "obj/power9/frame/1m/bli_l1m_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/power9/frame/1m/bli_l1m_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi.c", "target": "obj/power9/frame/1m/bli_l1m_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": "obj/power9/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/power9/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/power9/frame/1m/bli_l1m_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/power9/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_unb_var1.c", "target": "obj/power9/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_alloc.c", "target": "obj/power9/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/power9/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_check.c", "target": "obj/power9/frame/1m/packm/bli_packm_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cntl.c", "target": "obj/power9/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/power9/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk.c", "target": "obj/power9/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_init.c", "target": "obj/power9/frame/1m/packm/bli_packm_init.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/power9/frame/1m/packm/bli_packm_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_part.c", "target": "obj/power9/frame/1m/packm/bli_packm_part.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_scalar.c", "target": "obj/power9/frame/1m/packm/bli_packm_scalar.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/power9/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/power9/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": "obj/power9/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/power9/frame/1m/packm/bli_packm_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/power9/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/power9/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/1m/unpackm/bli_unpackm_cntl.c", "target": "obj/power9/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": "obj/power9/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/power9/frame/1m/unpackm/bli_unpackm_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_check.c", "target": "obj/power9/frame/2/bli_l2_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_fpa.c", "target": "obj/power9/frame/2/bli_l2_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ba.c", "target": "obj/power9/frame/2/bli_l2_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi.c", "target": "obj/power9/frame/2/bli_l2_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ex.c", "target": "obj/power9/frame/2/bli_l2_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/power9/frame/2/bli_l2_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi.c", "target": "obj/power9/frame/2/bli_l2_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ex.c", "target": "obj/power9/frame/2/bli_l2_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/power9/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/power9/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/power9/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": "obj/power9/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/power9/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var1.c", "target": "obj/power9/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var2.c", "target": "obj/power9/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/power9/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": "obj/power9/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/power9/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": "obj/power9/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/power9/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/power9/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/power9/frame/2/hemv/bli_hemv_unf_var1.o", "flags": 
["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/power9/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3.c", "target": "obj/power9/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/power9/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/power9/frame/2/her/bli_her_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/2/her/bli_her_unb_var2.c", "target": "obj/power9/frame/2/her/bli_her_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/power9/frame/2/her/bli_her_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/power9/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/power9/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var3.c", "target": "obj/power9/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var4.c", "target": "obj/power9/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/power9/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var4.c", "target": "obj/power9/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_var_oapi.c", "target": "obj/power9/frame/2/her2/bli_her2_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/power9/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var2.c", "target": "obj/power9/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/power9/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/power9/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": "obj/power9/frame/2/trmv/bli_trmv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/power9/frame/2/trsv/bli_trsv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var2.c", "target": "obj/power9/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/power9/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/power9/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/power9/frame/2/trsv/bli_trsv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_blocksize.c", "target": 
"obj/power9/frame/3/bli_l3_blocksize.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_check.c", "target": "obj/power9/frame/3/bli_l3_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_cntl.c", "target": "obj/power9/frame/3/bli_l3_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_direct.c", "target": "obj/power9/frame/3/bli_l3_direct.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ind.c", "target": "obj/power9/frame/3/bli_l3_ind.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_int.c", "target": 
"obj/power9/frame/3/bli_l3_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi.c", "target": "obj/power9/frame/3/bli_l3_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi_ex.c", "target": "obj/power9/frame/3/bli_l3_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_packab.c", "target": "obj/power9/frame/3/bli_l3_packab.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_prune.c", "target": "obj/power9/frame/3/bli_l3_prune.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_schema.c", "target": 
"obj/power9/frame/3/bli_l3_schema.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup.c", "target": "obj/power9/frame/3/bli_l3_sup.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_int.c", "target": "obj/power9/frame/3/bli_l3_sup_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_a.c", "target": "obj/power9/frame/3/bli_l3_sup_packm_a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_b.c", "target": "obj/power9/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/3/bli_l3_sup_packm_var.c", "target": "obj/power9/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/power9/frame/3/bli_l3_sup_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var12.c", "target": "obj/power9/frame/3/bli_l3_sup_var12.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/power9/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi.c", "target": "obj/power9/frame/3/bli_l3_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/power9/frame/3/bli_l3_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_thrinfo.c", "target": "obj/power9/frame/3/bli_l3_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_fpa.c", "target": "obj/power9/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/power9/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_tapi.c", "target": "obj/power9/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/power9/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/power9/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var3.c", "target": "obj/power9/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/power9/frame/3/gemm/bli_gemm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/power9/frame/3/gemm/bli_gemm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/power9/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/power9/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md.c", "target": "obj/power9/frame/3/gemm/bli_gemm_md.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": "obj/power9/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/power9/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/power9/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": "obj/power9/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/power9/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/power9/frame/3/hemm/bli_hemm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/symm/bli_symm_front.c", "target": "obj/power9/frame/3/symm/bli_symm_front.o", 
"flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/power9/frame/3/trmm/bli_trmm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": "obj/power9/frame/3/trmm/bli_trmm_ll_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/power9/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": "obj/power9/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/power9/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/power9/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/power9/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": "obj/power9/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/power9/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/power9/frame/3/trsm/bli_trsm_blk_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": "obj/power9/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_front.c", "target": "obj/power9/frame/3/trsm/bli_trsm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": "obj/power9/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": "obj/power9/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/power9/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/power9/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/power9/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_apool.c", "target": "obj/power9/frame/base/bli_apool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_arch.c", "target": "obj/power9/frame/base/bli_arch.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_array.c", "target": "obj/power9/frame/base/bli_array.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_blksz.c", "target": "obj/power9/frame/base/bli_blksz.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_check.c", "target": "obj/power9/frame/base/bli_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_clock.c", "target": "obj/power9/frame/base/bli_clock.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntl.c", "target": "obj/power9/frame/base/bli_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntx.c", "target": "obj/power9/frame/base/bli_cntx.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_const.c", "target": "obj/power9/frame/base/bli_const.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cpuid.c", "target": "obj/power9/frame/base/bli_cpuid.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_env.c", "target": "obj/power9/frame/base/bli_env.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_error.c", "target": "obj/power9/frame/base/bli_error.o", "flags": ["-O2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_func.c", "target": "obj/power9/frame/base/bli_func.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_getopt.c", "target": "obj/power9/frame/base/bli_getopt.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_gks.c", "target": "obj/power9/frame/base/bli_gks.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_ind.c", "target": "obj/power9/frame/base/bli_ind.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_info.c", "target": "obj/power9/frame/base/bli_info.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_init.c", "target": "obj/power9/frame/base/bli_init.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_machval.c", "target": "obj/power9/frame/base/bli_machval.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_malloc.c", "target": "obj/power9/frame/base/bli_malloc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_mbool.c", "target": "obj/power9/frame/base/bli_mbool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_memsys.c", "target": "obj/power9/frame/base/bli_memsys.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj.c", "target": "obj/power9/frame/base/bli_obj.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj_scalar.c", "target": "obj/power9/frame/base/bli_obj_scalar.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pack.c", "target": "obj/power9/frame/base/bli_pack.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_param_map.c", "target": "obj/power9/frame/base/bli_param_map.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_part.c", "target": "obj/power9/frame/base/bli_part.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pba.c", "target": "obj/power9/frame/base/bli_pba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pool.c", "target": "obj/power9/frame/base/bli_pool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_prune.c", "target": "obj/power9/frame/base/bli_prune.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_query.c", "target": "obj/power9/frame/base/bli_query.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_rntm.c", "target": "obj/power9/frame/base/bli_rntm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_sba.c", "target": "obj/power9/frame/base/bli_sba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijm.c", "target": "obj/power9/frame/base/bli_setgetijm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijv.c", "target": "obj/power9/frame/base/bli_setgetijv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setri.c", "target": "obj/power9/frame/base/bli_setri.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_string.c", "target": "obj/power9/frame/base/bli_string.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_winsys.c", "target": "obj/power9/frame/base/bli_winsys.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castm.c", "target": "obj/power9/frame/base/cast/bli_castm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/power9/frame/base/cast/bli_castnzm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castv.c", "target": "obj/power9/frame/base/cast/bli_castv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_obj_check.c", "target": "obj/power9/frame/base/check/bli_obj_check.o", "flags": ["-O2", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_part_check.c", "target": "obj/power9/frame/base/check/bli_part_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_dlamch.c", "target": "obj/power9/frame/base/noopt/bli_dlamch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_lsame.c", "target": "obj/power9/frame/base/noopt/bli_lsame.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_slamch.c", "target": "obj/power9/frame/base/noopt/bli_slamch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projm.c", "target": 
"obj/power9/frame/base/proj/bli_projm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projv.c", "target": "obj/power9/frame/base/proj/bli_projv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/power9/frame/thread/bli_l3_decor_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": "obj/power9/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_single.c", "target": "obj/power9/frame/thread/bli_l3_decor_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": "obj/power9/frame/thread/bli_l3_sup_decor_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_pthreads.c", "target": "obj/power9/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/power9/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_pthread.c", "target": "obj/power9/frame/thread/bli_pthread.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm.c", "target": "obj/power9/frame/thread/bli_thrcomm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/power9/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/power9/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/power9/frame/thread/bli_thrcomm_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thread.c", "target": "obj/power9/frame/thread/bli_thread.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo.c", "target": "obj/power9/frame/thread/bli_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo_sup.c", "target": "obj/power9/frame/thread/bli_thrinfo_sup.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_check.c", "target": "obj/power9/frame/util/bli_util_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_fpa.c", "target": "obj/power9/frame/util/bli_util_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ba.c", "target": "obj/power9/frame/util/bli_util_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi.c", "target": "obj/power9/frame/util/bli_util_oapi.o", "flags": ["-O2", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/power9/frame/util/bli_util_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ba.c", "target": "obj/power9/frame/util/bli_util_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi.c", "target": "obj/power9/frame/util/bli_util_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ex.c", "target": "obj/power9/frame/util/bli_util_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_unb_var1.c", "target": 
"obj/power9/frame/util/bli_util_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-power9", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/make/linux-x86_64.jsonl000066400000000000000000014507771427272030600217420ustar00rootroot00000000000000{"environment": {"HOSTNAME": "c4a178893595", "SSL_CERT_FILE": "/opt/_internal/certs.pem", "TERM": "xterm", "OLDPWD": "/usr/local/repos/cython-blis", "LC_ALL": "en_US.UTF-8", "LD_LIBRARY_PATH": "/opt/rh/devtoolset-11/root/usr/lib64:/opt/rh/devtoolset-11/root/usr/lib:/opt/rh/devtoolset-11/root/usr/lib64/dyninst:/opt/rh/devtoolset-11/root/usr/lib/dyninst:/usr/local/lib64", "LS_COLORS": "rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=0
1;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36:*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=01;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=01;36:*.xspf=01;36:", "VIRTUAL_ENV": "/usr/local/repos/cython-blis/env3.6", "PATH": "/usr/local/repos/cython-blis/env3.6/bin:/opt/rh/devtoolset-11/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", "PWD": "/usr/local/repos/cython-blis/flame-blis", "LANG": "en_US.UTF-8", "SHLVL": "2", "HOME": "/root", "DEVTOOLSET_ROOTPATH": "/opt/rh/devtoolset-11/root", "AUDITWHEEL_ARCH": "x86_64", "LANGUAGE": "en_US.UTF-8", "AUDITWHEEL_PLAT": "manylinux2014_x86_64", "PKG_CONFIG_PATH": "/usr/local/lib/pkgconfig", "container": "podman", "AUDITWHEEL_POLICY": "manylinux2014", "_": "/usr/local/repos/cython-blis/env3.6/bin/python"}} {"compiler": "gcc", "source": "config/bulldozer/bli_cntx_init_bulldozer.c", "target": "obj/x86_64/config/bulldozer/bli_cntx_init_bulldozer.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/excavator/bli_cntx_init_excavator.c", "target": "obj/x86_64/config/excavator/bli_cntx_init_excavator.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/generic/bli_cntx_init_generic.c", "target": "obj/x86_64/config/generic/bli_cntx_init_generic.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/haswell/bli_cntx_init_haswell.c", "target": "obj/x86_64/config/haswell/bli_cntx_init_haswell.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/knl/bli_cntx_init_knl.c", "target": "obj/x86_64/config/knl/bli_cntx_init_knl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/penryn/bli_cntx_init_penryn.c", "target": "obj/x86_64/config/penryn/bli_cntx_init_penryn.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/piledriver/bli_cntx_init_piledriver.c", "target": "obj/x86_64/config/piledriver/bli_cntx_init_piledriver.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"config/sandybridge/bli_cntx_init_sandybridge.c", "target": "obj/x86_64/config/sandybridge/bli_cntx_init_sandybridge.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/skx/bli_cntx_init_skx.c", "target": "obj/x86_64/config/skx/bli_cntx_init_skx.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/steamroller/bli_cntx_init_steamroller.c", "target": "obj/x86_64/config/steamroller/bli_cntx_init_steamroller.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/zen/bli_cntx_init_zen.c", "target": "obj/x86_64/config/zen/bli_cntx_init_zen.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/zen2/bli_cntx_init_zen2.c", "target": "obj/x86_64/config/zen2/bli_cntx_init_zen2.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/zen3/bli_cntx_init_zen3.c", "target": "obj/x86_64/config/zen3/bli_cntx_init_zen3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c", "target": "obj/x86_64/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_dgemm_skx_asm_16x14.c", "target": "obj/x86_64/kernels/skx/3/bli_dgemm_skx_asm_16x14.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c", "target": "obj/x86_64/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/1m/bli_dpackm_knl_asm_24x8.c", "target": "obj/x86_64/kernels/knl/1m/bli_dpackm_knl_asm_24x8.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/1m/bli_spackm_knl_asm_24x16.c", "target": "obj/x86_64/kernels/knl/1m/bli_spackm_knl_asm_24x16.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/3/bli_dgemm_knl_asm_24x8.c", "target": "obj/x86_64/kernels/knl/3/bli_dgemm_knl_asm_24x8.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/3/bli_sgemm_knl_asm_24x16.c", "target": "obj/x86_64/kernels/knl/3/bli_sgemm_knl_asm_24x16.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c", "target": "obj/x86_64/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c", "target": "obj/x86_64/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1/bli_axpyv_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1/bli_axpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1/bli_dotv_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1/bli_dotv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpy2v_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1f/bli_axpy2v_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpyf_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1f/bli_axpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotaxpyv_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1f/bli_dotaxpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxf_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1f/bli_dotxf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c", "target": "obj/x86_64/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c", "target": 
"obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c", "target": "obj/x86_64/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c", "target": "obj/x86_64/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c", "target": "obj/x86_64/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c", "target": "obj/x86_64/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c", "target": 
"obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", 
"-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c", 
"target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_amaxv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_amaxv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int10.c", "target": "obj/x86_64/kernels/zen/1/bli_axpyv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_axpyv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_copyv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_copyv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int10.c", "target": "obj/x86_64/kernels/zen/1/bli_dotv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_dotv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotxv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_dotxv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int10.c", "target": "obj/x86_64/kernels/zen/1/bli_scalv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_scalv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_setv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_setv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_swapv_zen_int8.c", "target": 
"obj/x86_64/kernels/zen/1/bli_swapv_zen_int8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_4.c", "target": "obj/x86_64/kernels/zen/1f/bli_axpyf_zen_int_4.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_5.c", "target": "obj/x86_64/kernels/zen/1f/bli_axpyf_zen_int_5.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_8.c", "target": "obj/x86_64/kernels/zen/1f/bli_axpyf_zen_int_8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_dotxf_zen_int_8.c", "target": "obj/x86_64/kernels/zen/1f/bli_dotxf_zen_int_8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemm_small.c", "target": "obj/x86_64/kernels/zen/3/bli_gemm_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemmt_small.c", "target": "obj/x86_64/kernels/zen/3/bli_gemmt_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_trsm_small.c", "target": "obj/x86_64/kernels/zen/3/bli_trsm_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c", "target": "obj/x86_64/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c", "target": "obj/x86_64/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/skx/bli_cntx_skx_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_addv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", 
"-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_amaxv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_axpbyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_axpyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", 
"-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_copyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_dotv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_dotxv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", 
"-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_invertv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_scal2v_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_scalv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_setv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_subv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_swapv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_xpbyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/skx/1f/bli_axpy2v_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/skx/1f/bli_axpyf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1f/bli_dotaxpyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/skx/1f/bli_dotxaxpyf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/skx/1f/bli_dotxf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/skx/1m/bli_packm_cxk_1er_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/skx/1m/bli_packm_cxk_bb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/skx/1m/bli_packm_cxk_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/skx/1m/bli_unpackm_cxk_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bli_gemm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bli_gemmsup_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bli_gemmtrsm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bli_trsm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bb/bli_gemmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bb/bli_gemmtrsmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bb/bli_trsmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/skx/ind/bli_gemm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/skx/ind/bli_gemmtrsm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/skx/ind/bli_trsm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/knl/bli_cntx_knl_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_addv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_amaxv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_axpbyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_axpyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_copyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_dotv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_dotxv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_invertv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_scal2v_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_scalv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_setv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_subv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_swapv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_xpbyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/knl/1f/bli_axpy2v_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/knl/1f/bli_axpyf_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1f/bli_dotaxpyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/knl/1f/bli_dotxaxpyf_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/knl/1f/bli_dotxf_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/knl/1m/bli_packm_cxk_1er_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/knl/1m/bli_packm_cxk_bb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/knl/1m/bli_packm_cxk_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/knl/1m/bli_unpackm_cxk_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bli_gemm_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", 
"-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bli_gemmsup_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bli_gemmtrsm_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bli_trsm_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bb/bli_gemmbb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bb/bli_gemmtrsmbb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bb/bli_trsmbb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/knl/ind/bli_gemm1m_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/knl/ind/bli_gemmtrsm1m_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/knl/ind/bli_trsm1m_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": 
"obj/x86_64/ref_kernels/haswell/bli_cntx_haswell_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_addv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_amaxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_axpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_axpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_copyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_dotv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_dotxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_invertv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_scal2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_scalv_haswell_ref.o", 
"flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_setv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_subv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_swapv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_xpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1f/bli_axpy2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1f/bli_axpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1f/bli_dotaxpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1f/bli_dotxaxpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1f/bli_dotxf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1m/bli_packm_cxk_1er_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1m/bli_packm_cxk_bb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1m/bli_packm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1m/bli_unpackm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", 
"-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bli_gemm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bli_gemmsup_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bli_gemmtrsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bli_trsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bb/bli_gemmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bb/bli_gemmtrsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bb/bli_trsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/haswell/ind/bli_gemm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/haswell/ind/bli_gemmtrsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/haswell/ind/bli_trsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/bli_cntx_sandybridge_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_addv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_amaxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_axpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_axpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_copyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_dotv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_dotxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_invertv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_scal2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_scalv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_setv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_subv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_swapv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_xpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1f/bli_axpy2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": 
"obj/x86_64/ref_kernels/sandybridge/1f/bli_axpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1f/bli_dotaxpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1f/bli_dotxaxpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1f/bli_dotxf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1m/bli_packm_cxk_1er_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1m/bli_packm_cxk_bb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1m/bli_packm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1m/bli_unpackm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bli_gemm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bli_gemmsup_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": 
"obj/x86_64/ref_kernels/sandybridge/3/bli_gemmtrsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bli_trsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bb/bli_gemmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bb/bli_gemmtrsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bb/bli_trsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/ind/bli_gemm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/ind/bli_gemmtrsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/ind/bli_trsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/penryn/bli_cntx_penryn_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_addv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_amaxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_axpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_axpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_copyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_dotv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_dotxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_invertv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_scal2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_scalv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_setv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_subv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_swapv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_xpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1f/bli_axpy2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1f/bli_axpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1f/bli_dotaxpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1f/bli_dotxaxpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1f/bli_dotxf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1m/bli_packm_cxk_1er_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1m/bli_packm_cxk_bb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1m/bli_packm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1m/bli_unpackm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bli_gemm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bli_gemmsup_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bli_gemmtrsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bli_trsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bb/bli_gemmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bb/bli_gemmtrsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bb/bli_trsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", 
"-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/penryn/ind/bli_gemm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/penryn/ind/bli_gemmtrsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/penryn/ind/bli_trsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/zen3/bli_cntx_zen3_ref.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_addv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_amaxv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_axpbyv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_axpyv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_copyv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_dotv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_dotxv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_invertv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_scal2v_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": 
"obj/x86_64/ref_kernels/zen3/1/bli_scalv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_setv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_subv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_swapv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_xpbyv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1f/bli_axpy2v_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1f/bli_axpyf_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1f/bli_dotaxpyv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1f/bli_dotxaxpyf_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1f/bli_dotxf_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1m/bli_packm_cxk_1er_zen3_ref.o", "flags": 
["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1m/bli_packm_cxk_bb_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1m/bli_packm_cxk_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1m/bli_unpackm_cxk_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bli_gemm_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bli_gemmsup_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bli_gemmtrsm_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bli_trsm_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bb/bli_gemmbb_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bb/bli_gemmtrsmbb_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bb/bli_trsmbb_zen3_ref.o", "flags": ["-O3", 
"-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen3/ind/bli_gemm1m_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen3/ind/bli_gemmtrsm1m_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen3/ind/bli_trsm1m_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/zen2/bli_cntx_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_addv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_amaxv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": 
"obj/x86_64/ref_kernels/zen2/1/bli_axpbyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_axpyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_copyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_dotv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_dotxv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_invertv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_scal2v_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_scalv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_setv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_subv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": 
"obj/x86_64/ref_kernels/zen2/1/bli_swapv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_xpbyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1f/bli_axpy2v_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1f/bli_axpyf_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1f/bli_dotaxpyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1f/bli_dotxaxpyf_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1f/bli_dotxf_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1m/bli_packm_cxk_1er_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1m/bli_packm_cxk_bb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1m/bli_packm_cxk_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1m/bli_unpackm_cxk_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bli_gemm_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bli_gemmsup_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bli_gemmtrsm_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bli_trsm_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bb/bli_gemmbb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bb/bli_gemmtrsmbb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bb/bli_trsmbb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen2/ind/bli_gemm1m_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen2/ind/bli_gemmtrsm1m_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen2/ind/bli_trsm1m_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/zen/bli_cntx_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_addv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_amaxv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_axpbyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_axpyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_copyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_dotv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_dotxv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_invertv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_scal2v_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_scalv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_setv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_subv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_swapv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_xpbyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/zen/1f/bli_axpy2v_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen/1f/bli_axpyf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1f/bli_dotaxpyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": 
"obj/x86_64/ref_kernels/zen/1f/bli_dotxaxpyf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/zen/1f/bli_dotxf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/zen/1m/bli_packm_cxk_1er_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/zen/1m/bli_packm_cxk_bb_zen_ref.o", "flags": ["-O2", 
"-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen/1m/bli_packm_cxk_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen/1m/bli_unpackm_cxk_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bli_gemm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bli_gemmsup_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bli_gemmtrsm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bli_trsm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", 
"-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bb/bli_gemmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bb/bli_gemmtrsmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bb/bli_trsmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen/ind/bli_gemm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen/ind/bli_gemmtrsm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen/ind/bli_trsm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/excavator/bli_cntx_excavator_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_addv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_amaxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", 
"target": "obj/x86_64/ref_kernels/excavator/1/bli_axpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_axpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_copyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_dotv_excavator_ref.o", "flags": ["-O2", "-O3", 
"-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_dotxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_invertv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_scal2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", 
"-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_scalv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_setv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_subv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_swapv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_xpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1f/bli_axpy2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1f/bli_axpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1f/bli_dotaxpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1f/bli_dotxaxpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1f/bli_dotxf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1m/bli_packm_cxk_1er_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1m/bli_packm_cxk_bb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1m/bli_packm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1m/bli_unpackm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bli_gemm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bli_gemmsup_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bli_gemmtrsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bli_trsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bb/bli_gemmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bb/bli_gemmtrsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bb/bli_trsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": 
"obj/x86_64/ref_kernels/excavator/ind/bli_gemm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/excavator/ind/bli_gemmtrsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/excavator/ind/bli_trsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/bli_cntx_steamroller_ref.o", "flags": 
["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_addv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_amaxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_axpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_axpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_copyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_dotv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_dotxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_invertv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_scal2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_scalv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_setv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_subv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_swapv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_xpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1f/bli_axpy2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1f/bli_axpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1f/bli_dotaxpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1f/bli_dotxaxpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": 
"obj/x86_64/ref_kernels/steamroller/1f/bli_dotxf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1m/bli_packm_cxk_1er_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1m/bli_packm_cxk_bb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": 
"obj/x86_64/ref_kernels/steamroller/1m/bli_packm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1m/bli_unpackm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bli_gemm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": 
"obj/x86_64/ref_kernels/steamroller/3/bli_gemmsup_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bli_gemmtrsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bli_trsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": 
"obj/x86_64/ref_kernels/steamroller/3/bb/bli_gemmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bb/bli_gemmtrsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bb/bli_trsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": 
"obj/x86_64/ref_kernels/steamroller/ind/bli_gemm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/ind/bli_gemmtrsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/ind/bli_trsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": 
"obj/x86_64/ref_kernels/piledriver/bli_cntx_piledriver_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_addv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_amaxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_axpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_axpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_copyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_dotv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_dotxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_invertv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_scal2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_scalv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_setv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_subv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_swapv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_xpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1f/bli_axpy2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1f/bli_axpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1f/bli_dotaxpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1f/bli_dotxaxpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": 
"obj/x86_64/ref_kernels/piledriver/1f/bli_dotxf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1m/bli_packm_cxk_1er_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1m/bli_packm_cxk_bb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": 
"obj/x86_64/ref_kernels/piledriver/1m/bli_packm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1m/bli_unpackm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bli_gemm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bli_gemmsup_piledriver_ref.o", 
"flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bli_gemmtrsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bli_trsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bb/bli_gemmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", 
"-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bb/bli_gemmtrsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bb/bli_trsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/ind/bli_gemm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/ind/bli_gemmtrsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/ind/bli_trsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/bli_cntx_bulldozer_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_addv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_amaxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_axpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_axpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_copyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_dotv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": 
"obj/x86_64/ref_kernels/bulldozer/1/bli_dotxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_invertv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_scal2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_scalv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", 
"-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_setv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_subv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_swapv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_xpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1f/bli_axpy2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1f/bli_axpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1f/bli_dotaxpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1f/bli_dotxaxpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1f/bli_dotxf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1m/bli_packm_cxk_1er_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1m/bli_packm_cxk_bb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1m/bli_packm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1m/bli_unpackm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bli_gemm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bli_gemmsup_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": 
"obj/x86_64/ref_kernels/bulldozer/3/bli_gemmtrsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bli_trsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bb/bli_gemmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bb/bli_gemmtrsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", 
"-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bb/bli_trsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/ind/bli_gemm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/ind/bli_gemmtrsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/ind/bli_trsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/generic/bli_cntx_generic_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_addv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_amaxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_axpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_axpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_copyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_dotv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_dotxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_invertv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_scal2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_scalv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_setv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_subv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_swapv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_xpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/generic/1f/bli_axpy2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/generic/1f/bli_axpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", 
"target": "obj/x86_64/ref_kernels/generic/1f/bli_dotaxpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/generic/1f/bli_dotxaxpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/generic/1f/bli_dotxf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/generic/1m/bli_packm_cxk_1er_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/generic/1m/bli_packm_cxk_bb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/generic/1m/bli_packm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/generic/1m/bli_unpackm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bli_gemm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bli_gemmsup_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bli_gemmtrsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bli_trsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bb/bli_gemmbb_generic_ref.o", "flags": ["-O2", 
"-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bb/bli_gemmtrsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bb/bli_trsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/generic/ind/bli_gemm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/generic/ind/bli_gemmtrsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/generic/ind/bli_trsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_check.c", "target": "obj/x86_64/frame/0/bli_l0_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_fpa.c", "target": "obj/x86_64/frame/0/bli_l0_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_oapi.c", "target": "obj/x86_64/frame/0/bli_l0_oapi.o", "flags": ["-O2", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_tapi.c", "target": "obj/x86_64/frame/0/bli_l0_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/copysc/bli_copysc.c", "target": "obj/x86_64/frame/0/copysc/bli_copysc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_check.c", "target": "obj/x86_64/frame/1/bli_l1v_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/x86_64/frame/1/bli_l1v_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/x86_64/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O2", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi.c", "target": "obj/x86_64/frame/1/bli_l1v_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/x86_64/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/x86_64/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/x86_64/frame/1/bli_l1v_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ex.c", "target": "obj/x86_64/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O2", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_check.c", "target": "obj/x86_64/frame/1d/bli_l1d_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_fpa.c", "target": "obj/x86_64/frame/1d/bli_l1d_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/x86_64/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/x86_64/frame/1d/bli_l1d_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/x86_64/frame/1d/bli_l1d_oapi_ex.o", "flags": 
["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/x86_64/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi.c", "target": "obj/x86_64/frame/1d/bli_l1d_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": "obj/x86_64/frame/1d/bli_l1d_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_check.c", "target": "obj/x86_64/frame/1f/bli_l1f_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_fpa.c", "target": "obj/x86_64/frame/1f/bli_l1f_fpa.o", 
"flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/x86_64/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi.c", "target": "obj/x86_64/frame/1f/bli_l1f_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": "obj/x86_64/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/x86_64/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi.c", "target": 
"obj/x86_64/frame/1f/bli_l1f_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/x86_64/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_check.c", "target": "obj/x86_64/frame/1m/bli_l1m_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_fpa.c", "target": "obj/x86_64/frame/1m/bli_l1m_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/x86_64/frame/1m/bli_l1m_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi.c", 
"target": "obj/x86_64/frame/1m/bli_l1m_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": "obj/x86_64/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/x86_64/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/x86_64/frame/1m/bli_l1m_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/x86_64/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/1m/bli_l1m_unb_var1.c", "target": "obj/x86_64/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_alloc.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_check.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cntl.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_init.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_init.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_part.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_part.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_scalar.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_scalar.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_thrinfo.o", 
"flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/x86_64/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/x86_64/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cntl.c", "target": "obj/x86_64/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": "obj/x86_64/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/x86_64/frame/1m/unpackm/bli_unpackm_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_check.c", "target": "obj/x86_64/frame/2/bli_l2_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_fpa.c", "target": "obj/x86_64/frame/2/bli_l2_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ba.c", "target": "obj/x86_64/frame/2/bli_l2_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi.c", "target": "obj/x86_64/frame/2/bli_l2_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ex.c", "target": "obj/x86_64/frame/2/bli_l2_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/x86_64/frame/2/bli_l2_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi.c", "target": "obj/x86_64/frame/2/bli_l2_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ex.c", "target": "obj/x86_64/frame/2/bli_l2_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/x86_64/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/x86_64/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/x86_64/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": "obj/x86_64/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/x86_64/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var1.c", "target": "obj/x86_64/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var2.c", "target": "obj/x86_64/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/x86_64/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3.c", "target": 
"obj/x86_64/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/x86_64/frame/2/her/bli_her_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var2.c", "target": "obj/x86_64/frame/2/her/bli_her_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/x86_64/frame/2/her/bli_her_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var3.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var4.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var4.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_var_oapi.c", "target": "obj/x86_64/frame/2/her2/bli_her2_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/x86_64/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var2.c", "target": "obj/x86_64/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/x86_64/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/x86_64/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": "obj/x86_64/frame/2/trmv/bli_trmv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/x86_64/frame/2/trsv/bli_trsv_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var2.c", "target": "obj/x86_64/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/x86_64/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O2", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/x86_64/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/x86_64/frame/2/trsv/bli_trsv_var_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_blocksize.c", "target": "obj/x86_64/frame/3/bli_l3_blocksize.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_check.c", "target": "obj/x86_64/frame/3/bli_l3_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_cntl.c", "target": 
"obj/x86_64/frame/3/bli_l3_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_direct.c", "target": "obj/x86_64/frame/3/bli_l3_direct.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ind.c", "target": "obj/x86_64/frame/3/bli_l3_ind.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_int.c", "target": "obj/x86_64/frame/3/bli_l3_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi.c", "target": "obj/x86_64/frame/3/bli_l3_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi_ex.c", "target": 
"obj/x86_64/frame/3/bli_l3_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_packab.c", "target": "obj/x86_64/frame/3/bli_l3_packab.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_prune.c", "target": "obj/x86_64/frame/3/bli_l3_prune.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_schema.c", "target": "obj/x86_64/frame/3/bli_l3_schema.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup.c", "target": "obj/x86_64/frame/3/bli_l3_sup.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_int.c", "target": 
"obj/x86_64/frame/3/bli_l3_sup_int.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_a.c", "target": "obj/x86_64/frame/3/bli_l3_sup_packm_a.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_b.c", "target": "obj/x86_64/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_var.c", "target": "obj/x86_64/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/x86_64/frame/3/bli_l3_sup_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/3/bli_l3_sup_var12.c", "target": "obj/x86_64/frame/3/bli_l3_sup_var12.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/x86_64/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi.c", "target": "obj/x86_64/frame/3/bli_l3_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/x86_64/frame/3/bli_l3_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_thrinfo.c", "target": "obj/x86_64/frame/3/bli_l3_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "frame/3/bli_l3_ukr_fpa.c", "target": "obj/x86_64/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/x86_64/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_tapi.c", "target": "obj/x86_64/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var3.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_md.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/x86_64/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/x86_64/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": "obj/x86_64/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/x86_64/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/x86_64/frame/3/hemm/bli_hemm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/symm/bli_symm_front.c", "target": "obj/x86_64/frame/3/symm/bli_symm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": 
"obj/x86_64/frame/3/trmm/bli_trmm_ll_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/x86_64/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_blk_var3.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_front.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_front.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_apool.c", "target": "obj/x86_64/frame/base/bli_apool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_arch.c", "target": "obj/x86_64/frame/base/bli_arch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_array.c", "target": "obj/x86_64/frame/base/bli_array.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_blksz.c", "target": "obj/x86_64/frame/base/bli_blksz.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_check.c", "target": "obj/x86_64/frame/base/bli_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_clock.c", "target": "obj/x86_64/frame/base/bli_clock.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntl.c", "target": "obj/x86_64/frame/base/bli_cntl.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntx.c", "target": "obj/x86_64/frame/base/bli_cntx.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_const.c", "target": "obj/x86_64/frame/base/bli_const.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cpuid.c", "target": "obj/x86_64/frame/base/bli_cpuid.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_env.c", "target": "obj/x86_64/frame/base/bli_env.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_error.c", "target": "obj/x86_64/frame/base/bli_error.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_func.c", "target": "obj/x86_64/frame/base/bli_func.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_getopt.c", "target": "obj/x86_64/frame/base/bli_getopt.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_gks.c", "target": "obj/x86_64/frame/base/bli_gks.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_ind.c", "target": "obj/x86_64/frame/base/bli_ind.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_info.c", "target": "obj/x86_64/frame/base/bli_info.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_init.c", "target": "obj/x86_64/frame/base/bli_init.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_machval.c", "target": "obj/x86_64/frame/base/bli_machval.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_malloc.c", "target": "obj/x86_64/frame/base/bli_malloc.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_mbool.c", "target": "obj/x86_64/frame/base/bli_mbool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_memsys.c", "target": "obj/x86_64/frame/base/bli_memsys.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj.c", "target": "obj/x86_64/frame/base/bli_obj.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj_scalar.c", "target": "obj/x86_64/frame/base/bli_obj_scalar.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pack.c", "target": "obj/x86_64/frame/base/bli_pack.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_param_map.c", "target": "obj/x86_64/frame/base/bli_param_map.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_part.c", "target": "obj/x86_64/frame/base/bli_part.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pba.c", "target": "obj/x86_64/frame/base/bli_pba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pool.c", "target": "obj/x86_64/frame/base/bli_pool.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_prune.c", "target": "obj/x86_64/frame/base/bli_prune.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_query.c", "target": "obj/x86_64/frame/base/bli_query.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_rntm.c", "target": "obj/x86_64/frame/base/bli_rntm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_sba.c", "target": "obj/x86_64/frame/base/bli_sba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijm.c", "target": "obj/x86_64/frame/base/bli_setgetijm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijv.c", "target": "obj/x86_64/frame/base/bli_setgetijv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setri.c", "target": "obj/x86_64/frame/base/bli_setri.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_string.c", "target": "obj/x86_64/frame/base/bli_string.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_winsys.c", "target": "obj/x86_64/frame/base/bli_winsys.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castm.c", "target": "obj/x86_64/frame/base/cast/bli_castm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/x86_64/frame/base/cast/bli_castnzm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castv.c", "target": "obj/x86_64/frame/base/cast/bli_castv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_obj_check.c", "target": "obj/x86_64/frame/base/check/bli_obj_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_part_check.c", "target": "obj/x86_64/frame/base/check/bli_part_check.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_dlamch.c", "target": 
"obj/x86_64/frame/base/noopt/bli_dlamch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_lsame.c", "target": "obj/x86_64/frame/base/noopt/bli_lsame.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_slamch.c", "target": "obj/x86_64/frame/base/noopt/bli_slamch.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projm.c", "target": "obj/x86_64/frame/base/proj/bli_projm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projv.c", "target": "obj/x86_64/frame/base/proj/bli_projv.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/x86_64/frame/thread/bli_l3_decor_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": "obj/x86_64/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_single.c", "target": "obj/x86_64/frame/thread/bli_l3_decor_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": "obj/x86_64/frame/thread/bli_l3_sup_decor_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_pthreads.c", "target": "obj/x86_64/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/x86_64/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_pthread.c", "target": "obj/x86_64/frame/thread/bli_pthread.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm.c", "target": "obj/x86_64/frame/thread/bli_thrcomm.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/x86_64/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/x86_64/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/x86_64/frame/thread/bli_thrcomm_single.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thread.c", "target": "obj/x86_64/frame/thread/bli_thread.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo.c", "target": "obj/x86_64/frame/thread/bli_thrinfo.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo_sup.c", "target": "obj/x86_64/frame/thread/bli_thrinfo_sup.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_check.c", "target": "obj/x86_64/frame/util/bli_util_check.o", "flags": ["-O2", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_fpa.c", "target": "obj/x86_64/frame/util/bli_util_fpa.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ba.c", "target": "obj/x86_64/frame/util/bli_util_oapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi.c", "target": "obj/x86_64/frame/util/bli_util_oapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/x86_64/frame/util/bli_util_oapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ba.c", "target": 
"obj/x86_64/frame/util/bli_util_tapi_ba.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi.c", "target": "obj/x86_64/frame/util/bli_util_tapi.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ex.c", "target": "obj/x86_64/frame/util/bli_util_tapi_ex.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_unb_var1.c", "target": "obj/x86_64/frame/util/bli_util_unb_var1.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/make/linux-x86_64_no_skx.jsonl000066400000000000000000011414431427272030600233060ustar00rootroot00000000000000{"environment": {"HOSTNAME": "a1ce18f3e0bf", "SSL_CERT_FILE": "/opt/_internal/certs.pem", "TERM": "xterm", "OLDPWD": "/usr/local/repos/cython-blis", "LC_ALL": "en_US.UTF-8", "LD_LIBRARY_PATH": 
"/opt/rh/devtoolset-10/root/usr/lib64:/opt/rh/devtoolset-10/root/usr/lib:/opt/rh/devtoolset-10/root/usr/lib64/dyninst:/opt/rh/devtoolset-10/root/usr/lib/dyninst:/usr/local/lib64", "LS_COLORS": "rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36:*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=01;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=01;36:*.xspf=01;36:", "VIRTUAL_ENV": "/usr/local/repos/cython-blis/env3.6", "PATH": "/usr/local/repos/cython-blis/env3.6/bin:/opt/rh/devtoolset-10/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", "PWD": "/usr/local/repos/cython-blis/flame-blis", "LANG": "en_US.UTF-8", "SHLVL": "2", "HOME": "/root", "DEVTOOLSET_ROOTPATH": "/opt/rh/devtoolset-10/root", "AUDITWHEEL_ARCH": 
"x86_64", "LANGUAGE": "en_US.UTF-8", "AUDITWHEEL_PLAT": "manylinux2014_x86_64", "PKG_CONFIG_PATH": "/usr/local/lib/pkgconfig", "container": "podman", "AUDITWHEEL_POLICY": "manylinux2014", "_": "/usr/local/repos/cython-blis/env3.6/bin/python"}} {"compiler": "gcc", "source": "config/bulldozer/bli_cntx_init_bulldozer.c", "target": "obj/x86_64_no_skx/config/bulldozer/bli_cntx_init_bulldozer.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/excavator/bli_cntx_init_excavator.c", "target": "obj/x86_64_no_skx/config/excavator/bli_cntx_init_excavator.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/generic/bli_cntx_init_generic.c", "target": "obj/x86_64_no_skx/config/generic/bli_cntx_init_generic.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/haswell/bli_cntx_init_haswell.c", "target": "obj/x86_64_no_skx/config/haswell/bli_cntx_init_haswell.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/penryn/bli_cntx_init_penryn.c", "target": "obj/x86_64_no_skx/config/penryn/bli_cntx_init_penryn.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/piledriver/bli_cntx_init_piledriver.c", "target": "obj/x86_64_no_skx/config/piledriver/bli_cntx_init_piledriver.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/sandybridge/bli_cntx_init_sandybridge.c", "target": "obj/x86_64_no_skx/config/sandybridge/bli_cntx_init_sandybridge.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/steamroller/bli_cntx_init_steamroller.c", "target": "obj/x86_64_no_skx/config/steamroller/bli_cntx_init_steamroller.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_amaxv_zen_int.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_amaxv_zen_int.o", "flags": ["-O2", 
"-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int10.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_axpyv_zen_int10.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_axpyv_zen_int.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_copyv_zen_int.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_copyv_zen_int.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int10.c", 
"target": "obj/x86_64_no_skx/kernels/zen/1/bli_dotv_zen_int10.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_dotv_zen_int.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotxv_zen_int.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_dotxv_zen_int.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int10.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_scalv_zen_int10.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_scalv_zen_int.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_setv_zen_int.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_setv_zen_int.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_swapv_zen_int8.c", "target": "obj/x86_64_no_skx/kernels/zen/1/bli_swapv_zen_int8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_4.c", "target": "obj/x86_64_no_skx/kernels/zen/1f/bli_axpyf_zen_int_4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_5.c", "target": "obj/x86_64_no_skx/kernels/zen/1f/bli_axpyf_zen_int_5.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_8.c", "target": "obj/x86_64_no_skx/kernels/zen/1f/bli_axpyf_zen_int_8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_dotxf_zen_int_8.c", "target": "obj/x86_64_no_skx/kernels/zen/1f/bli_dotxf_zen_int_8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemm_small.c", "target": "obj/x86_64_no_skx/kernels/zen/3/bli_gemm_small.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemmt_small.c", "target": "obj/x86_64_no_skx/kernels/zen/3/bli_gemmt_small.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_trsm_small.c", "target": "obj/x86_64_no_skx/kernels/zen/3/bli_trsm_small.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c", "target": "obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c", "target": "obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.o", "flags": ["-O2", "-O3", 
"-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c", "target": "obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c", "target": "obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c", "target": "obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c", "target": "obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c", "target": "obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c", "target": "obj/x86_64_no_skx/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c", "target": 
"obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c", "target": 
"obj/x86_64_no_skx/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c", "target": "obj/x86_64_no_skx/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c", "target": "obj/x86_64_no_skx/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c", "target": "obj/x86_64_no_skx/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1/bli_axpyv_penryn_int.c", "target": "obj/x86_64_no_skx/kernels/penryn/1/bli_axpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1/bli_dotv_penryn_int.c", "target": "obj/x86_64_no_skx/kernels/penryn/1/bli_dotv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpy2v_penryn_int.c", "target": "obj/x86_64_no_skx/kernels/penryn/1f/bli_axpy2v_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpyf_penryn_int.c", "target": "obj/x86_64_no_skx/kernels/penryn/1f/bli_axpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotaxpyv_penryn_int.c", "target": "obj/x86_64_no_skx/kernels/penryn/1f/bli_dotaxpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c", "target": "obj/x86_64_no_skx/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxf_penryn_int.c", "target": "obj/x86_64_no_skx/kernels/penryn/1f/bli_dotxf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c", "target": "obj/x86_64_no_skx/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64_no_skx/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64_no_skx/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64_no_skx/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64_no_skx/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c", "target": "obj/x86_64_no_skx/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c", "target": "obj/x86_64_no_skx/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/bli_cntx_haswell_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/haswell/1/bli_addv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_amaxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_axpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_axpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", 
"-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_copyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_dotv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_dotxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_invertv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_scal2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_scalv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_setv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_subv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_swapv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1/bli_xpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1f/bli_axpy2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1f/bli_axpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/haswell/1f/bli_dotaxpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1f/bli_dotxaxpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1f/bli_dotxf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1m/bli_packm_cxk_1er_haswell_ref.o", "flags": ["-O2", "-O3", 
"-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1m/bli_packm_cxk_bb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1m/bli_packm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/1m/bli_unpackm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/3/bli_gemm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/3/bli_gemmsup_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/3/bli_gemmtrsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/3/bli_trsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/3/bb/bli_gemmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/3/bb/bli_gemmtrsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/3/bb/bli_trsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/ind/bli_gemm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/ind/bli_gemmtrsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/haswell/ind/bli_trsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/bli_cntx_sandybridge_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_addv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_amaxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_axpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_axpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_copyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_dotv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_dotxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_invertv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_scal2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_scalv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_setv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_subv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_swapv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1/bli_xpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1f/bli_axpy2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1f/bli_axpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1f/bli_dotaxpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1f/bli_dotxaxpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1f/bli_dotxf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1m/bli_packm_cxk_1er_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1m/bli_packm_cxk_bb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/sandybridge/1m/bli_packm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/1m/bli_unpackm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/3/bli_gemm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/3/bli_gemmsup_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/3/bli_gemmtrsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/3/bli_trsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/3/bb/bli_gemmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/3/bb/bli_gemmtrsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/3/bb/bli_trsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/ind/bli_gemm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/ind/bli_gemmtrsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/sandybridge/ind/bli_trsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/bli_cntx_penryn_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_addv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_amaxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_axpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_axpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_copyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_dotv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_dotxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/penryn/1/bli_invertv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_scal2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_scalv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_setv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_subv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_swapv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1/bli_xpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1f/bli_axpy2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1f/bli_axpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1f/bli_dotaxpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1f/bli_dotxaxpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1f/bli_dotxf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1m/bli_packm_cxk_1er_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1m/bli_packm_cxk_bb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1m/bli_packm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/1m/bli_unpackm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/3/bli_gemm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", 
"target": "obj/x86_64_no_skx/ref_kernels/penryn/3/bli_gemmsup_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/3/bli_gemmtrsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/3/bli_trsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/3/bb/bli_gemmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/3/bb/bli_gemmtrsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/3/bb/bli_trsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/ind/bli_gemm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/ind/bli_gemmtrsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/penryn/ind/bli_trsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/bli_cntx_excavator_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_addv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_amaxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_axpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_axpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_copyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_dotv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_dotxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_invertv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_scal2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_scalv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_setv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_subv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_swapv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1/bli_xpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1f/bli_axpy2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1f/bli_axpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1f/bli_dotaxpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1f/bli_dotxaxpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1f/bli_dotxf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1m/bli_packm_cxk_1er_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1m/bli_packm_cxk_bb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1m/bli_packm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/1m/bli_unpackm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/3/bli_gemm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/3/bli_gemmsup_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/3/bli_gemmtrsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/3/bli_trsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/3/bb/bli_gemmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/3/bb/bli_gemmtrsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/3/bb/bli_trsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/ind/bli_gemm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/ind/bli_gemmtrsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/excavator/ind/bli_trsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/bli_cntx_steamroller_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_addv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_amaxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_axpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_axpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_copyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_dotv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_dotxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_invertv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_scal2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_scalv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_setv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_subv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_swapv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1/bli_xpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1f/bli_axpy2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/steamroller/1f/bli_axpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1f/bli_dotaxpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1f/bli_dotxaxpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/steamroller/1f/bli_dotxf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1m/bli_packm_cxk_1er_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1m/bli_packm_cxk_bb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", 
"target": "obj/x86_64_no_skx/ref_kernels/steamroller/1m/bli_packm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/1m/bli_unpackm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/3/bli_gemm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/steamroller/3/bli_gemmsup_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/3/bli_gemmtrsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/3/bli_trsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/steamroller/3/bb/bli_gemmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/3/bb/bli_gemmtrsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/3/bb/bli_trsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/steamroller/ind/bli_gemm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/ind/bli_gemmtrsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/steamroller/ind/bli_trsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/piledriver/bli_cntx_piledriver_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_addv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_amaxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_axpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_axpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_copyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_dotv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_dotxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_invertv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_scal2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_scalv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_setv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_subv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_swapv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1/bli_xpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1f/bli_axpy2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1f/bli_axpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1f/bli_dotaxpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1f/bli_dotxaxpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1f/bli_dotxf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1m/bli_packm_cxk_1er_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1m/bli_packm_cxk_bb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1m/bli_packm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/1m/bli_unpackm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/3/bli_gemm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/3/bli_gemmsup_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/3/bli_gemmtrsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/3/bli_trsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/3/bb/bli_gemmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/3/bb/bli_gemmtrsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/3/bb/bli_trsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/ind/bli_gemm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/ind/bli_gemmtrsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/piledriver/ind/bli_trsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/bli_cntx_bulldozer_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_addv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_amaxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_axpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_axpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_copyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_dotv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_dotxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_invertv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_scal2v_bulldozer_ref.o", "flags": 
["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_scalv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_setv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_subv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", 
"-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_swapv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1/bli_xpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1f/bli_axpy2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1f/bli_axpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1f/bli_dotaxpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1f/bli_dotxaxpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1f/bli_dotxf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1m/bli_packm_cxk_1er_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1m/bli_packm_cxk_bb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1m/bli_packm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/1m/bli_unpackm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/3/bli_gemm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/3/bli_gemmsup_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/3/bli_gemmtrsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/3/bli_trsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/3/bb/bli_gemmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/3/bb/bli_gemmtrsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/3/bb/bli_trsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/ind/bli_gemm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/ind/bli_gemmtrsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/bulldozer/ind/bli_trsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/bli_cntx_generic_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_addv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_amaxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_axpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_axpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_copyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_dotv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_dotxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_invertv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_scal2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_scalv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": 
"obj/x86_64_no_skx/ref_kernels/generic/1/bli_setv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_subv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_swapv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1/bli_xpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1f/bli_axpy2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1f/bli_axpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1f/bli_dotaxpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1f/bli_dotxaxpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1f/bli_dotxf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1m/bli_packm_cxk_1er_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1m/bli_packm_cxk_bb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1m/bli_packm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/1m/bli_unpackm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/3/bli_gemm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/3/bli_gemmsup_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/3/bli_gemmtrsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/3/bli_trsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/3/bb/bli_gemmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/3/bb/bli_gemmtrsmbb_generic_ref.o", "flags": ["-O2", 
"-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/3/bb/bli_trsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/ind/bli_gemm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/ind/bli_gemmtrsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_skx/ref_kernels/generic/ind/bli_trsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_check.c", "target": "obj/x86_64_no_skx/frame/0/bli_l0_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_fpa.c", "target": "obj/x86_64_no_skx/frame/0/bli_l0_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_oapi.c", "target": "obj/x86_64_no_skx/frame/0/bli_l0_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_tapi.c", "target": "obj/x86_64_no_skx/frame/0/bli_l0_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/copysc/bli_copysc.c", "target": "obj/x86_64_no_skx/frame/0/copysc/bli_copysc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_check.c", "target": "obj/x86_64_no_skx/frame/1/bli_l1v_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/x86_64_no_skx/frame/1/bli_l1v_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/x86_64_no_skx/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi.c", "target": 
"obj/x86_64_no_skx/frame/1/bli_l1v_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/x86_64_no_skx/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/x86_64_no_skx/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/x86_64_no_skx/frame/1/bli_l1v_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ex.c", "target": "obj/x86_64_no_skx/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_check.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_fpa.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": "obj/x86_64_no_skx/frame/1d/bli_l1d_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_check.c", "target": "obj/x86_64_no_skx/frame/1f/bli_l1f_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_fpa.c", "target": "obj/x86_64_no_skx/frame/1f/bli_l1f_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/x86_64_no_skx/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi.c", "target": "obj/x86_64_no_skx/frame/1f/bli_l1f_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": "obj/x86_64_no_skx/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/x86_64_no_skx/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi.c", "target": "obj/x86_64_no_skx/frame/1f/bli_l1f_tapi.o", "flags": ["-O3", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/x86_64_no_skx/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_check.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_fpa.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/1m/bli_l1m_oapi.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_unb_var1.c", "target": "obj/x86_64_no_skx/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_alloc.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_check.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cntl.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_init.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_init.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/1m/packm/bli_packm_part.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_part.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_scalar.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_scalar.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/x86_64_no_skx/frame/1m/packm/bli_packm_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/x86_64_no_skx/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/x86_64_no_skx/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cntl.c", "target": "obj/x86_64_no_skx/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": 
"obj/x86_64_no_skx/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/x86_64_no_skx/frame/1m/unpackm/bli_unpackm_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_check.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_fpa.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ba.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ex.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ex.c", "target": "obj/x86_64_no_skx/frame/2/bli_l2_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/x86_64_no_skx/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/x86_64_no_skx/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/x86_64_no_skx/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": "obj/x86_64_no_skx/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/x86_64_no_skx/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O3", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var1.c", "target": "obj/x86_64_no_skx/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var2.c", "target": "obj/x86_64_no_skx/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/x86_64_no_skx/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/x86_64_no_skx/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/x86_64_no_skx/frame/2/her/bli_her_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var2.c", "target": 
"obj/x86_64_no_skx/frame/2/her/bli_her_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/x86_64_no_skx/frame/2/her/bli_her_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/x86_64_no_skx/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/x86_64_no_skx/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var3.c", "target": "obj/x86_64_no_skx/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var4.c", "target": "obj/x86_64_no_skx/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/x86_64_no_skx/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var4.c", "target": "obj/x86_64_no_skx/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_var_oapi.c", "target": "obj/x86_64_no_skx/frame/2/her2/bli_her2_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/x86_64_no_skx/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var2.c", "target": "obj/x86_64_no_skx/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/x86_64_no_skx/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/x86_64_no_skx/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": "obj/x86_64_no_skx/frame/2/trmv/bli_trmv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/x86_64_no_skx/frame/2/trsv/bli_trsv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var2.c", "target": "obj/x86_64_no_skx/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/x86_64_no_skx/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/x86_64_no_skx/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/x86_64_no_skx/frame/2/trsv/bli_trsv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_blocksize.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_blocksize.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_check.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_cntl.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_direct.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_direct.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ind.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_ind.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_int.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi_ex.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_packab.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_packab.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_prune.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_prune.o", "flags": ["-O3", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_schema.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_schema.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_int.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_a.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup_packm_a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/3/bli_l3_sup_packm_b.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_var.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup_ref.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var12.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup_var12.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_thrinfo.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_fpa.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_tapi.c", "target": "obj/x86_64_no_skx/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var3.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_cntl.o", 
"flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_md.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": "obj/x86_64_no_skx/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/x86_64_no_skx/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/x86_64_no_skx/frame/3/hemm/bli_hemm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/symm/bli_symm_front.c", "target": "obj/x86_64_no_skx/frame/3/symm/bli_symm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/x86_64_no_skx/frame/3/trmm/bli_trmm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trmm/bli_trmm_ll_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/x86_64_no_skx/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_blk_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_front.c", "target": 
"obj/x86_64_no_skx/frame/3/trsm/bli_trsm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/x86_64_no_skx/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_apool.c", "target": "obj/x86_64_no_skx/frame/base/bli_apool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_arch.c", "target": "obj/x86_64_no_skx/frame/base/bli_arch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_array.c", "target": "obj/x86_64_no_skx/frame/base/bli_array.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_blksz.c", "target": "obj/x86_64_no_skx/frame/base/bli_blksz.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_check.c", "target": "obj/x86_64_no_skx/frame/base/bli_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_clock.c", "target": "obj/x86_64_no_skx/frame/base/bli_clock.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntl.c", "target": "obj/x86_64_no_skx/frame/base/bli_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntx.c", "target": "obj/x86_64_no_skx/frame/base/bli_cntx.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_const.c", "target": "obj/x86_64_no_skx/frame/base/bli_const.o", "flags": ["-O3", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cpuid.c", "target": "obj/x86_64_no_skx/frame/base/bli_cpuid.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_env.c", "target": "obj/x86_64_no_skx/frame/base/bli_env.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_error.c", "target": "obj/x86_64_no_skx/frame/base/bli_error.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_func.c", "target": "obj/x86_64_no_skx/frame/base/bli_func.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_getopt.c", "target": 
"obj/x86_64_no_skx/frame/base/bli_getopt.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_gks.c", "target": "obj/x86_64_no_skx/frame/base/bli_gks.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_ind.c", "target": "obj/x86_64_no_skx/frame/base/bli_ind.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_info.c", "target": "obj/x86_64_no_skx/frame/base/bli_info.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_init.c", "target": "obj/x86_64_no_skx/frame/base/bli_init.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "frame/base/bli_machval.c", "target": "obj/x86_64_no_skx/frame/base/bli_machval.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_malloc.c", "target": "obj/x86_64_no_skx/frame/base/bli_malloc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_mbool.c", "target": "obj/x86_64_no_skx/frame/base/bli_mbool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_memsys.c", "target": "obj/x86_64_no_skx/frame/base/bli_memsys.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj.c", "target": "obj/x86_64_no_skx/frame/base/bli_obj.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj_scalar.c", "target": "obj/x86_64_no_skx/frame/base/bli_obj_scalar.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pack.c", "target": "obj/x86_64_no_skx/frame/base/bli_pack.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_param_map.c", "target": "obj/x86_64_no_skx/frame/base/bli_param_map.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_part.c", "target": "obj/x86_64_no_skx/frame/base/bli_part.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pba.c", "target": "obj/x86_64_no_skx/frame/base/bli_pba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pool.c", "target": "obj/x86_64_no_skx/frame/base/bli_pool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_prune.c", "target": "obj/x86_64_no_skx/frame/base/bli_prune.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_query.c", "target": "obj/x86_64_no_skx/frame/base/bli_query.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_rntm.c", "target": "obj/x86_64_no_skx/frame/base/bli_rntm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_sba.c", "target": "obj/x86_64_no_skx/frame/base/bli_sba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijm.c", "target": "obj/x86_64_no_skx/frame/base/bli_setgetijm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijv.c", "target": "obj/x86_64_no_skx/frame/base/bli_setgetijv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setri.c", "target": "obj/x86_64_no_skx/frame/base/bli_setri.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_string.c", "target": "obj/x86_64_no_skx/frame/base/bli_string.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_winsys.c", "target": "obj/x86_64_no_skx/frame/base/bli_winsys.o", "flags": ["-O3", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castm.c", "target": "obj/x86_64_no_skx/frame/base/cast/bli_castm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/x86_64_no_skx/frame/base/cast/bli_castnzm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castv.c", "target": "obj/x86_64_no_skx/frame/base/cast/bli_castv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_obj_check.c", "target": "obj/x86_64_no_skx/frame/base/check/bli_obj_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "frame/base/check/bli_part_check.c", "target": "obj/x86_64_no_skx/frame/base/check/bli_part_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_dlamch.c", "target": "obj/x86_64_no_skx/frame/base/noopt/bli_dlamch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_lsame.c", "target": "obj/x86_64_no_skx/frame/base/noopt/bli_lsame.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_slamch.c", "target": "obj/x86_64_no_skx/frame/base/noopt/bli_slamch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projm.c", "target": "obj/x86_64_no_skx/frame/base/proj/bli_projm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projv.c", "target": "obj/x86_64_no_skx/frame/base/proj/bli_projv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/x86_64_no_skx/frame/thread/bli_l3_decor_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": "obj/x86_64_no_skx/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_single.c", "target": "obj/x86_64_no_skx/frame/thread/bli_l3_decor_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": "obj/x86_64_no_skx/frame/thread/bli_l3_sup_decor_openmp.o", 
"flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_pthreads.c", "target": "obj/x86_64_no_skx/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/x86_64_no_skx/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_pthread.c", "target": "obj/x86_64_no_skx/frame/thread/bli_pthread.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm.c", "target": "obj/x86_64_no_skx/frame/thread/bli_thrcomm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/x86_64_no_skx/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/x86_64_no_skx/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/x86_64_no_skx/frame/thread/bli_thrcomm_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thread.c", "target": "obj/x86_64_no_skx/frame/thread/bli_thread.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo.c", "target": "obj/x86_64_no_skx/frame/thread/bli_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo_sup.c", "target": "obj/x86_64_no_skx/frame/thread/bli_thrinfo_sup.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_check.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_fpa.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ba.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi.c", "target": 
"obj/x86_64_no_skx/frame/util/bli_util_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ba.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ex.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_unb_var1.c", "target": "obj/x86_64_no_skx/frame/util/bli_util_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_skx", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/make/linux-x86_64_no_zen2.jsonl000066400000000000000000013010271427272030600233530ustar00rootroot00000000000000{"environment": {"HOSTNAME": "a1ce18f3e0bf", "SSL_CERT_FILE": "/opt/_internal/certs.pem", "TERM": "xterm", "OLDPWD": "/usr/local/repos/cython-blis", "LC_ALL": "en_US.UTF-8", "LD_LIBRARY_PATH": "/opt/rh/devtoolset-10/root/usr/lib64:/opt/rh/devtoolset-10/root/usr/lib:/opt/rh/devtoolset-10/root/usr/lib64/dyninst:/opt/rh/devtoolset-10/root/usr/lib/dyninst:/usr/local/lib64", "LS_COLORS": 
"rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36:*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=01;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=01;36:*.xspf=01;36:", "VIRTUAL_ENV": "/usr/local/repos/cython-blis/env3.6", "PATH": "/usr/local/repos/cython-blis/env3.6/bin:/opt/rh/devtoolset-10/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", "PWD": "/usr/local/repos/cython-blis/flame-blis", "LANG": "en_US.UTF-8", "SHLVL": "2", "HOME": "/root", "DEVTOOLSET_ROOTPATH": "/opt/rh/devtoolset-10/root", "AUDITWHEEL_ARCH": "x86_64", "LANGUAGE": "en_US.UTF-8", "AUDITWHEEL_PLAT": "manylinux2014_x86_64", "PKG_CONFIG_PATH": "/usr/local/lib/pkgconfig", "container": "podman", "AUDITWHEEL_POLICY": "manylinux2014", "_": 
"/usr/local/repos/cython-blis/env3.6/bin/python"}} {"compiler": "gcc", "source": "config/bulldozer/bli_cntx_init_bulldozer.c", "target": "obj/x86_64_no_zen2/config/bulldozer/bli_cntx_init_bulldozer.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/excavator/bli_cntx_init_excavator.c", "target": "obj/x86_64_no_zen2/config/excavator/bli_cntx_init_excavator.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/generic/bli_cntx_init_generic.c", "target": "obj/x86_64_no_zen2/config/generic/bli_cntx_init_generic.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/haswell/bli_cntx_init_haswell.c", "target": "obj/x86_64_no_zen2/config/haswell/bli_cntx_init_haswell.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/penryn/bli_cntx_init_penryn.c", "target": "obj/x86_64_no_zen2/config/penryn/bli_cntx_init_penryn.o", "flags": 
["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/piledriver/bli_cntx_init_piledriver.c", "target": "obj/x86_64_no_zen2/config/piledriver/bli_cntx_init_piledriver.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/sandybridge/bli_cntx_init_sandybridge.c", "target": "obj/x86_64_no_zen2/config/sandybridge/bli_cntx_init_sandybridge.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/skx/bli_cntx_init_skx.c", "target": "obj/x86_64_no_zen2/config/skx/bli_cntx_init_skx.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/steamroller/bli_cntx_init_steamroller.c", "target": "obj/x86_64_no_zen2/config/steamroller/bli_cntx_init_steamroller.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/zen/bli_cntx_init_zen.c", "target": "obj/x86_64_no_zen2/config/zen/bli_cntx_init_zen.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c", "target": "obj/x86_64_no_zen2/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_dgemm_skx_asm_16x14.c", "target": "obj/x86_64_no_zen2/kernels/skx/3/bli_dgemm_skx_asm_16x14.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c", "target": "obj/x86_64_no_zen2/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", 
"-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c", "target": "obj/x86_64_no_zen2/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c", "target": "obj/x86_64_no_zen2/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1/bli_axpyv_penryn_int.c", "target": "obj/x86_64_no_zen2/kernels/penryn/1/bli_axpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1/bli_dotv_penryn_int.c", "target": 
"obj/x86_64_no_zen2/kernels/penryn/1/bli_dotv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpy2v_penryn_int.c", "target": "obj/x86_64_no_zen2/kernels/penryn/1f/bli_axpy2v_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpyf_penryn_int.c", "target": "obj/x86_64_no_zen2/kernels/penryn/1f/bli_axpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotaxpyv_penryn_int.c", "target": "obj/x86_64_no_zen2/kernels/penryn/1f/bli_dotaxpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c", 
"target": "obj/x86_64_no_zen2/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxf_penryn_int.c", "target": "obj/x86_64_no_zen2/kernels/penryn/1f/bli_dotxf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen2/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen2/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen2/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen2/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen2/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c", "target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c", "target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c", "target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c", "target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c", "target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c", "target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c", "target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c", "target": "obj/x86_64_no_zen2/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c", "target": 
"obj/x86_64_no_zen2/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c", "target": 
"obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c", "target": 
"obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", 
"-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c", "target": "obj/x86_64_no_zen2/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_amaxv_zen_int.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_amaxv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int10.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_axpyv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_axpyv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_copyv_zen_int.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_copyv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int10.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_dotv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_dotv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotxv_zen_int.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_dotxv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int10.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_scalv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int.c", 
"target": "obj/x86_64_no_zen2/kernels/zen/1/bli_scalv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_setv_zen_int.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_setv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_swapv_zen_int8.c", "target": "obj/x86_64_no_zen2/kernels/zen/1/bli_swapv_zen_int8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_4.c", "target": "obj/x86_64_no_zen2/kernels/zen/1f/bli_axpyf_zen_int_4.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_5.c", "target": "obj/x86_64_no_zen2/kernels/zen/1f/bli_axpyf_zen_int_5.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_8.c", "target": "obj/x86_64_no_zen2/kernels/zen/1f/bli_axpyf_zen_int_8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_dotxf_zen_int_8.c", "target": "obj/x86_64_no_zen2/kernels/zen/1f/bli_dotxf_zen_int_8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemm_small.c", "target": "obj/x86_64_no_zen2/kernels/zen/3/bli_gemm_small.o", "flags": ["-O2", 
"-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemmt_small.c", "target": "obj/x86_64_no_zen2/kernels/zen/3/bli_gemmt_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_trsm_small.c", "target": "obj/x86_64_no_zen2/kernels/zen/3/bli_trsm_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c", "target": "obj/x86_64_no_zen2/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c", "target": "obj/x86_64_no_zen2/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/bli_cntx_skx_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_addv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_amaxv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", 
"-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_axpbyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_axpyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_copyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", 
"-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_dotv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_dotxv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_invertv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", 
"-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_scal2v_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_scalv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_setv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", 
"-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_subv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_swapv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1/bli_xpbyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", 
"-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1f/bli_axpy2v_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1f/bli_axpyf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1f/bli_dotaxpyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", 
"-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1f/bli_dotxaxpyf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1f/bli_dotxf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1m/bli_packm_cxk_1er_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", 
"-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1m/bli_packm_cxk_bb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1m/bli_packm_cxk_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/1m/bli_unpackm_cxk_skx_ref.o", "flags": 
["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/3/bli_gemm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/3/bli_gemmsup_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/3/bli_gemmtrsm_skx_ref.o", 
"flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/3/bli_trsm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/3/bb/bli_gemmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/skx/3/bb/bli_gemmtrsmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/3/bb/bli_trsmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/ind/bli_gemm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/ind/bli_gemmtrsm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/skx/ind/bli_trsm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/bli_cntx_haswell_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_addv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", 
"-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_amaxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_axpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_axpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_copyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_dotv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_dotxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_invertv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_scal2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_scalv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_setv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_subv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_swapv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/haswell/1/bli_xpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1f/bli_axpy2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1f/bli_axpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1f/bli_dotaxpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", 
"-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1f/bli_dotxaxpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1f/bli_dotxf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1m/bli_packm_cxk_1er_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1m/bli_packm_cxk_bb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1m/bli_packm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/1m/bli_unpackm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/3/bli_gemm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/3/bli_gemmsup_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/3/bli_gemmtrsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/3/bli_trsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/3/bb/bli_gemmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/3/bb/bli_gemmtrsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/3/bb/bli_trsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/ind/bli_gemm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/haswell/ind/bli_gemmtrsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/haswell/ind/bli_trsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/bli_cntx_sandybridge_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_addv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_amaxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_axpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_axpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_copyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_dotv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_dotxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_invertv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_scal2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_scalv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_setv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_subv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_swapv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1/bli_xpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1f/bli_axpy2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1f/bli_axpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1f/bli_dotaxpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1f/bli_dotxaxpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1f/bli_dotxf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1m/bli_packm_cxk_1er_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1m/bli_packm_cxk_bb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/sandybridge/1m/bli_packm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/1m/bli_unpackm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/3/bli_gemm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/3/bli_gemmsup_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/3/bli_gemmtrsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/3/bli_trsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/3/bb/bli_gemmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/3/bb/bli_gemmtrsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/3/bb/bli_trsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/ind/bli_gemm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/ind/bli_gemmtrsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/sandybridge/ind/bli_trsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/bli_cntx_penryn_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_addv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_amaxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_axpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_axpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_copyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_dotv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_dotxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_invertv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_scal2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_scalv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_setv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_subv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_swapv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1/bli_xpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1f/bli_axpy2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1f/bli_axpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1f/bli_dotaxpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1f/bli_dotxaxpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", 
"-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1f/bli_dotxf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1m/bli_packm_cxk_1er_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1m/bli_packm_cxk_bb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1m/bli_packm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/1m/bli_unpackm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/3/bli_gemm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/3/bli_gemmsup_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/3/bli_gemmtrsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/3/bli_trsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/3/bb/bli_gemmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/3/bb/bli_gemmtrsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/3/bb/bli_trsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/ind/bli_gemm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/ind/bli_gemmtrsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/penryn/ind/bli_trsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/bli_cntx_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_addv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", 
"-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_amaxv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_axpbyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_axpyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_copyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_dotv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_dotxv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_invertv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_scal2v_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_scalv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_setv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_subv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_swapv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1/bli_xpbyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1f/bli_axpy2v_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1f/bli_axpyf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1f/bli_dotaxpyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1f/bli_dotxaxpyf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1f/bli_dotxf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1m/bli_packm_cxk_1er_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1m/bli_packm_cxk_bb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1m/bli_packm_cxk_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/1m/bli_unpackm_cxk_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/3/bli_gemm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/3/bli_gemmsup_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/3/bli_gemmtrsm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/3/bli_trsm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/3/bb/bli_gemmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/3/bb/bli_gemmtrsmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/3/bb/bli_trsmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/ind/bli_gemm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/ind/bli_gemmtrsm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/zen/ind/bli_trsm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/bli_cntx_excavator_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_addv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", 
"-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_amaxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_axpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_axpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", 
"-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_copyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_dotv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_dotxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_invertv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_scal2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_scalv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_setv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_subv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_swapv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1/bli_xpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1f/bli_axpy2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1f/bli_axpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1f/bli_dotaxpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1f/bli_dotxaxpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1f/bli_dotxf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1m/bli_packm_cxk_1er_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1m/bli_packm_cxk_bb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1m/bli_packm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/1m/bli_unpackm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/3/bli_gemm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/3/bli_gemmsup_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/3/bli_gemmtrsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/3/bli_trsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/3/bb/bli_gemmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/3/bb/bli_gemmtrsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/3/bb/bli_trsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/ind/bli_gemm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/ind/bli_gemmtrsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/excavator/ind/bli_trsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/bli_cntx_steamroller_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_addv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_amaxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_axpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_axpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_copyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_dotv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", 
"target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_dotxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_invertv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_scal2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_scalv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_setv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_subv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_swapv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1/bli_xpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1f/bli_axpy2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/steamroller/1f/bli_axpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1f/bli_dotaxpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1f/bli_dotxaxpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/steamroller/1f/bli_dotxf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1m/bli_packm_cxk_1er_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1m/bli_packm_cxk_bb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", 
"target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1m/bli_packm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/1m/bli_unpackm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/3/bli_gemm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/steamroller/3/bli_gemmsup_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/3/bli_gemmtrsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/3/bli_trsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/steamroller/3/bb/bli_gemmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/3/bb/bli_gemmtrsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/3/bb/bli_trsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/steamroller/ind/bli_gemm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/ind/bli_gemmtrsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/steamroller/ind/bli_trsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/piledriver/bli_cntx_piledriver_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_addv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_amaxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_axpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_axpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_copyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_dotv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_dotxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_invertv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_scal2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_scalv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_setv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_subv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_swapv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1/bli_xpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1f/bli_axpy2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1f/bli_axpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1f/bli_dotaxpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1f/bli_dotxaxpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1f/bli_dotxf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1m/bli_packm_cxk_1er_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1m/bli_packm_cxk_bb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1m/bli_packm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/1m/bli_unpackm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/3/bli_gemm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/3/bli_gemmsup_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/3/bli_gemmtrsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/3/bli_trsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/3/bb/bli_gemmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/3/bb/bli_gemmtrsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/3/bb/bli_trsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/ind/bli_gemm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/ind/bli_gemmtrsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/piledriver/ind/bli_trsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/bli_cntx_bulldozer_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_addv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_amaxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_axpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_axpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_copyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_dotv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_dotxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_invertv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_scal2v_bulldozer_ref.o", "flags": 
["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_scalv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_setv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_subv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", 
"-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_swapv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1/bli_xpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1f/bli_axpy2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1f/bli_axpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1f/bli_dotaxpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1f/bli_dotxaxpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1f/bli_dotxf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1m/bli_packm_cxk_1er_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1m/bli_packm_cxk_bb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1m/bli_packm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/1m/bli_unpackm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/3/bli_gemm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/3/bli_gemmsup_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/3/bli_gemmtrsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/3/bli_trsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/3/bb/bli_gemmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/3/bb/bli_gemmtrsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/3/bb/bli_trsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/ind/bli_gemm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/ind/bli_gemmtrsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/bulldozer/ind/bli_trsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/bli_cntx_generic_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_addv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_amaxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_axpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_axpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_copyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_dotv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_dotxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_invertv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_scal2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_scalv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/generic/1/bli_setv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_subv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_swapv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1/bli_xpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1f/bli_axpy2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1f/bli_axpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1f/bli_dotaxpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1f/bli_dotxaxpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1f/bli_dotxf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1m/bli_packm_cxk_1er_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1m/bli_packm_cxk_bb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1m/bli_packm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/1m/bli_unpackm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/3/bli_gemm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/3/bli_gemmsup_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/3/bli_gemmtrsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/3/bli_trsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/3/bb/bli_gemmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": 
"obj/x86_64_no_zen2/ref_kernels/generic/3/bb/bli_gemmtrsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/3/bb/bli_trsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/ind/bli_gemm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/ind/bli_gemmtrsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen2/ref_kernels/generic/ind/bli_trsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_check.c", "target": "obj/x86_64_no_zen2/frame/0/bli_l0_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_fpa.c", "target": "obj/x86_64_no_zen2/frame/0/bli_l0_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_oapi.c", "target": "obj/x86_64_no_zen2/frame/0/bli_l0_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/0/bli_l0_tapi.c", "target": "obj/x86_64_no_zen2/frame/0/bli_l0_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/copysc/bli_copysc.c", "target": "obj/x86_64_no_zen2/frame/0/copysc/bli_copysc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_check.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ex.c", "target": "obj/x86_64_no_zen2/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_check.c", "target": "obj/x86_64_no_zen2/frame/1d/bli_l1d_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_fpa.c", "target": "obj/x86_64_no_zen2/frame/1d/bli_l1d_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/x86_64_no_zen2/frame/1d/bli_l1d_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/x86_64_no_zen2/frame/1d/bli_l1d_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi.c", "target": "obj/x86_64_no_zen2/frame/1d/bli_l1d_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": "obj/x86_64_no_zen2/frame/1d/bli_l1d_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_check.c", "target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_fpa.c", 
"target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi.c", "target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi.c", "target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/x86_64_no_zen2/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_check.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_fpa.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_alloc.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_check.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/1m/packm/bli_packm_cntl.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_init.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_init.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_part.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_part.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_scalar.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_scalar.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": 
"obj/x86_64_no_zen2/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/x86_64_no_zen2/frame/1m/packm/bli_packm_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/x86_64_no_zen2/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/x86_64_no_zen2/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cntl.c", "target": "obj/x86_64_no_zen2/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": "obj/x86_64_no_zen2/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/x86_64_no_zen2/frame/1m/unpackm/bli_unpackm_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_check.c", "target": "obj/x86_64_no_zen2/frame/2/bli_l2_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_fpa.c", "target": "obj/x86_64_no_zen2/frame/2/bli_l2_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ba.c", "target": "obj/x86_64_no_zen2/frame/2/bli_l2_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/bli_l2_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ex.c", "target": "obj/x86_64_no_zen2/frame/2/bli_l2_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/x86_64_no_zen2/frame/2/bli_l2_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi.c", "target": "obj/x86_64_no_zen2/frame/2/bli_l2_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ex.c", "target": 
"obj/x86_64_no_zen2/frame/2/bli_l2_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/x86_64_no_zen2/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/x86_64_no_zen2/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": "obj/x86_64_no_zen2/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var2.c", "target": "obj/x86_64_no_zen2/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/2/her/bli_her_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var2.c", "target": "obj/x86_64_no_zen2/frame/2/her/bli_her_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/her/bli_her_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/x86_64_no_zen2/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var3.c", "target": 
"obj/x86_64_no_zen2/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var4.c", "target": "obj/x86_64_no_zen2/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/x86_64_no_zen2/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var4.c", "target": "obj/x86_64_no_zen2/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_var_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/her2/bli_her2_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var2.c", "target": "obj/x86_64_no_zen2/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/x86_64_no_zen2/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/x86_64_no_zen2/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/trmv/bli_trmv_var_oapi.o", "flags": 
["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/2/trsv/bli_trsv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var2.c", "target": "obj/x86_64_no_zen2/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/x86_64_no_zen2/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/x86_64_no_zen2/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/x86_64_no_zen2/frame/2/trsv/bli_trsv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_blocksize.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_blocksize.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_check.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_cntl.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_direct.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_direct.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ind.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_ind.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_int.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi_ex.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_packab.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_packab.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_prune.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_prune.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_schema.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_schema.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_int.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_a.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup_packm_a.o", "flags": ["-O3", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_b.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_var.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup_ref.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var12.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup_var12.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_thrinfo.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_fpa.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_tapi.c", "target": "obj/x86_64_no_zen2/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var3.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/3/gemm/bli_gemm_md.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_md.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": "obj/x86_64_no_zen2/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/x86_64_no_zen2/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/x86_64_no_zen2/frame/3/hemm/bli_hemm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/symm/bli_symm_front.c", "target": "obj/x86_64_no_zen2/frame/3/symm/bli_symm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/x86_64_no_zen2/frame/3/trmm/bli_trmm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": 
"obj/x86_64_no_zen2/frame/3/trmm/bli_trmm_ll_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/x86_64_no_zen2/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_blk_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O3", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_front.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/x86_64_no_zen2/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_apool.c", "target": "obj/x86_64_no_zen2/frame/base/bli_apool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_arch.c", "target": "obj/x86_64_no_zen2/frame/base/bli_arch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_array.c", "target": "obj/x86_64_no_zen2/frame/base/bli_array.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_blksz.c", "target": "obj/x86_64_no_zen2/frame/base/bli_blksz.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_check.c", "target": "obj/x86_64_no_zen2/frame/base/bli_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_clock.c", "target": "obj/x86_64_no_zen2/frame/base/bli_clock.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntl.c", "target": "obj/x86_64_no_zen2/frame/base/bli_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntx.c", "target": "obj/x86_64_no_zen2/frame/base/bli_cntx.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_const.c", "target": "obj/x86_64_no_zen2/frame/base/bli_const.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cpuid.c", "target": "obj/x86_64_no_zen2/frame/base/bli_cpuid.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_env.c", "target": "obj/x86_64_no_zen2/frame/base/bli_env.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_error.c", "target": "obj/x86_64_no_zen2/frame/base/bli_error.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_func.c", "target": "obj/x86_64_no_zen2/frame/base/bli_func.o", "flags": 
["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_getopt.c", "target": "obj/x86_64_no_zen2/frame/base/bli_getopt.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_gks.c", "target": "obj/x86_64_no_zen2/frame/base/bli_gks.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_ind.c", "target": "obj/x86_64_no_zen2/frame/base/bli_ind.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_info.c", "target": "obj/x86_64_no_zen2/frame/base/bli_info.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_init.c", 
"target": "obj/x86_64_no_zen2/frame/base/bli_init.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_machval.c", "target": "obj/x86_64_no_zen2/frame/base/bli_machval.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_malloc.c", "target": "obj/x86_64_no_zen2/frame/base/bli_malloc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_mbool.c", "target": "obj/x86_64_no_zen2/frame/base/bli_mbool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_memsys.c", "target": "obj/x86_64_no_zen2/frame/base/bli_memsys.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj.c", "target": "obj/x86_64_no_zen2/frame/base/bli_obj.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj_scalar.c", "target": "obj/x86_64_no_zen2/frame/base/bli_obj_scalar.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pack.c", "target": "obj/x86_64_no_zen2/frame/base/bli_pack.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_param_map.c", "target": "obj/x86_64_no_zen2/frame/base/bli_param_map.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_part.c", "target": "obj/x86_64_no_zen2/frame/base/bli_part.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pba.c", "target": "obj/x86_64_no_zen2/frame/base/bli_pba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pool.c", "target": "obj/x86_64_no_zen2/frame/base/bli_pool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_prune.c", "target": "obj/x86_64_no_zen2/frame/base/bli_prune.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_query.c", "target": "obj/x86_64_no_zen2/frame/base/bli_query.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_rntm.c", "target": "obj/x86_64_no_zen2/frame/base/bli_rntm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_sba.c", "target": "obj/x86_64_no_zen2/frame/base/bli_sba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijm.c", "target": "obj/x86_64_no_zen2/frame/base/bli_setgetijm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijv.c", "target": "obj/x86_64_no_zen2/frame/base/bli_setgetijv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setri.c", "target": "obj/x86_64_no_zen2/frame/base/bli_setri.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_string.c", "target": "obj/x86_64_no_zen2/frame/base/bli_string.o", "flags": ["-O3", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_winsys.c", "target": "obj/x86_64_no_zen2/frame/base/bli_winsys.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castm.c", "target": "obj/x86_64_no_zen2/frame/base/cast/bli_castm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/x86_64_no_zen2/frame/base/cast/bli_castnzm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castv.c", "target": "obj/x86_64_no_zen2/frame/base/cast/bli_castv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "frame/base/check/bli_obj_check.c", "target": "obj/x86_64_no_zen2/frame/base/check/bli_obj_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_part_check.c", "target": "obj/x86_64_no_zen2/frame/base/check/bli_part_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_dlamch.c", "target": "obj/x86_64_no_zen2/frame/base/noopt/bli_dlamch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_lsame.c", "target": "obj/x86_64_no_zen2/frame/base/noopt/bli_lsame.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_slamch.c", "target": "obj/x86_64_no_zen2/frame/base/noopt/bli_slamch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projm.c", "target": "obj/x86_64_no_zen2/frame/base/proj/bli_projm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projv.c", "target": "obj/x86_64_no_zen2/frame/base/proj/bli_projv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_l3_decor_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_single.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_l3_decor_single.o", 
"flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_l3_sup_decor_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_pthreads.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_pthread.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_pthread.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_thrcomm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_thrcomm_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thread.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_thread.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo_sup.c", "target": "obj/x86_64_no_zen2/frame/thread/bli_thrinfo_sup.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_check.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_fpa.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ba.c", "target": 
"obj/x86_64_no_zen2/frame/util/bli_util_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ba.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ex.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_unb_var1.c", "target": "obj/x86_64_no_zen2/frame/util/bli_util_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen2", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/make/linux-x86_64_no_zen3.jsonl000066400000000000000000014300321427272030600233530ustar00rootroot00000000000000{"environment": {"HOSTNAME": "a1ce18f3e0bf", "SSL_CERT_FILE": "/opt/_internal/certs.pem", "TERM": "xterm", "OLDPWD": "/usr/local/repos/cython-blis", "LC_ALL": "en_US.UTF-8", "LD_LIBRARY_PATH": "/opt/rh/devtoolset-10/root/usr/lib64:/opt/rh/devtoolset-10/root/usr/lib:/opt/rh/devtoolset-10/root/usr/lib64/dyninst:/opt/rh/devtoolset-10/root/usr/lib/dyninst:/usr/local/lib64", "LS_COLORS": 
"rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36:*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=01;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=01;36:*.xspf=01;36:", "VIRTUAL_ENV": "/usr/local/repos/cython-blis/env3.6", "PATH": "/usr/local/repos/cython-blis/env3.6/bin:/opt/rh/devtoolset-10/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", "PWD": "/usr/local/repos/cython-blis/flame-blis", "LANG": "en_US.UTF-8", "SHLVL": "2", "HOME": "/root", "DEVTOOLSET_ROOTPATH": "/opt/rh/devtoolset-10/root", "AUDITWHEEL_ARCH": "x86_64", "LANGUAGE": "en_US.UTF-8", "AUDITWHEEL_PLAT": "manylinux2014_x86_64", "PKG_CONFIG_PATH": "/usr/local/lib/pkgconfig", "container": "podman", "AUDITWHEEL_POLICY": "manylinux2014", "_": 
"/usr/local/repos/cython-blis/env3.6/bin/python"}} {"compiler": "gcc", "source": "config/bulldozer/bli_cntx_init_bulldozer.c", "target": "obj/x86_64_no_zen3/config/bulldozer/bli_cntx_init_bulldozer.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/excavator/bli_cntx_init_excavator.c", "target": "obj/x86_64_no_zen3/config/excavator/bli_cntx_init_excavator.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/generic/bli_cntx_init_generic.c", "target": "obj/x86_64_no_zen3/config/generic/bli_cntx_init_generic.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/haswell/bli_cntx_init_haswell.c", "target": "obj/x86_64_no_zen3/config/haswell/bli_cntx_init_haswell.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/knl/bli_cntx_init_knl.c", "target": "obj/x86_64_no_zen3/config/knl/bli_cntx_init_knl.o", "flags": ["-O2", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/penryn/bli_cntx_init_penryn.c", "target": "obj/x86_64_no_zen3/config/penryn/bli_cntx_init_penryn.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/piledriver/bli_cntx_init_piledriver.c", "target": "obj/x86_64_no_zen3/config/piledriver/bli_cntx_init_piledriver.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/sandybridge/bli_cntx_init_sandybridge.c", "target": "obj/x86_64_no_zen3/config/sandybridge/bli_cntx_init_sandybridge.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/skx/bli_cntx_init_skx.c", "target": "obj/x86_64_no_zen3/config/skx/bli_cntx_init_skx.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/steamroller/bli_cntx_init_steamroller.c", "target": "obj/x86_64_no_zen3/config/steamroller/bli_cntx_init_steamroller.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/zen/bli_cntx_init_zen.c", "target": "obj/x86_64_no_zen3/config/zen/bli_cntx_init_zen.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "config/zen2/bli_cntx_init_zen2.c", "target": "obj/x86_64_no_zen3/config/zen2/bli_cntx_init_zen2.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c", "target": "obj/x86_64_no_zen3/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_dgemm_skx_asm_16x14.c", "target": "obj/x86_64_no_zen3/kernels/skx/3/bli_dgemm_skx_asm_16x14.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c", "target": "obj/x86_64_no_zen3/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=skylake-avx512", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/1m/bli_dpackm_knl_asm_24x8.c", "target": "obj/x86_64_no_zen3/kernels/knl/1m/bli_dpackm_knl_asm_24x8.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/1m/bli_spackm_knl_asm_24x16.c", "target": "obj/x86_64_no_zen3/kernels/knl/1m/bli_spackm_knl_asm_24x16.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/3/bli_dgemm_knl_asm_24x8.c", "target": "obj/x86_64_no_zen3/kernels/knl/3/bli_dgemm_knl_asm_24x8.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/knl/3/bli_sgemm_knl_asm_24x16.c", "target": "obj/x86_64_no_zen3/kernels/knl/3/bli_sgemm_knl_asm_24x16.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c", "target": "obj/x86_64_no_zen3/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c", "target": "obj/x86_64_no_zen3/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.o", "flags": ["-O2", "-O3", "-mavx", 
"-mfpmath=sse", "-march=sandybridge", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1/bli_axpyv_penryn_int.c", "target": "obj/x86_64_no_zen3/kernels/penryn/1/bli_axpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1/bli_dotv_penryn_int.c", "target": "obj/x86_64_no_zen3/kernels/penryn/1/bli_dotv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpy2v_penryn_int.c", "target": "obj/x86_64_no_zen3/kernels/penryn/1f/bli_axpy2v_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_axpyf_penryn_int.c", "target": "obj/x86_64_no_zen3/kernels/penryn/1f/bli_axpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", 
"-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotaxpyv_penryn_int.c", "target": "obj/x86_64_no_zen3/kernels/penryn/1f/bli_dotaxpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c", "target": "obj/x86_64_no_zen3/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/1f/bli_dotxf_penryn_int.c", "target": "obj/x86_64_no_zen3/kernels/penryn/1f/bli_dotxf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen3/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.o", "flags": ["-O2", 
"-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen3/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen3/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64_no_zen3/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c", "target": 
"obj/x86_64_no_zen3/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c", "target": "obj/x86_64_no_zen3/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c", "target": 
"obj/x86_64_no_zen3/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c", "target": 
"obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", 
"-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c", "target": "obj/x86_64_no_zen3/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", 
"-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_amaxv_zen_int.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_amaxv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int10.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_axpyv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_axpyv_zen_int.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_axpyv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_copyv_zen_int.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_copyv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int10.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_dotv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotv_zen_int.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_dotv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_dotxv_zen_int.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_dotxv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int10.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_scalv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_scalv_zen_int.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_scalv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_setv_zen_int.c", "target": "obj/x86_64_no_zen3/kernels/zen/1/bli_setv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1/bli_swapv_zen_int8.c", 
"target": "obj/x86_64_no_zen3/kernels/zen/1/bli_swapv_zen_int8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_4.c", "target": "obj/x86_64_no_zen3/kernels/zen/1f/bli_axpyf_zen_int_4.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_5.c", "target": "obj/x86_64_no_zen3/kernels/zen/1f/bli_axpyf_zen_int_5.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_axpyf_zen_int_8.c", "target": "obj/x86_64_no_zen3/kernels/zen/1f/bli_axpyf_zen_int_8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/1f/bli_dotxf_zen_int_8.c", "target": "obj/x86_64_no_zen3/kernels/zen/1f/bli_dotxf_zen_int_8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemm_small.c", "target": "obj/x86_64_no_zen3/kernels/zen/3/bli_gemm_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_gemmt_small.c", "target": "obj/x86_64_no_zen3/kernels/zen/3/bli_gemmt_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/zen/3/bli_trsm_small.c", "target": "obj/x86_64_no_zen3/kernels/zen/3/bli_trsm_small.o", "flags": 
["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c", "target": "obj/x86_64_no_zen3/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c", "target": "obj/x86_64_no_zen3/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/bli_cntx_skx_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_addv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_amaxv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_axpbyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_axpyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_copyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_dotv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_dotxv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_invertv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_scal2v_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_scalv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_setv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_subv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_swapv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1/bli_xpbyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1f/bli_axpy2v_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1f/bli_axpyf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1f/bli_dotaxpyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1f/bli_dotxaxpyf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1f/bli_dotxf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1m/bli_packm_cxk_1er_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1m/bli_packm_cxk_bb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1m/bli_packm_cxk_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/1m/bli_unpackm_cxk_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/3/bli_gemm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/3/bli_gemmsup_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/3/bli_gemmtrsm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/3/bli_trsm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/3/bb/bli_gemmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/3/bb/bli_gemmtrsmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/3/bb/bli_trsmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/ind/bli_gemm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/ind/bli_gemmtrsm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/skx/ind/bli_trsm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=skylake-avx512", "-mno-avx512f", "-mno-avx512vl", "-mno-avx512bw", "-mno-avx512dq", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/bli_cntx_knl_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_addv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_amaxv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/knl/1/bli_axpbyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_axpyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_copyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_dotv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", 
"-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_dotxv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_invertv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_scal2v_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_scalv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_setv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_subv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_swapv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1/bli_xpbyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1f/bli_axpy2v_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", 
"target": "obj/x86_64_no_zen3/ref_kernels/knl/1f/bli_axpyf_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1f/bli_dotaxpyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1f/bli_dotxaxpyf_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1f/bli_dotxf_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", 
"-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1m/bli_packm_cxk_1er_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1m/bli_packm_cxk_bb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1m/bli_packm_cxk_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/1m/bli_unpackm_cxk_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/3/bli_gemm_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/3/bli_gemmsup_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/3/bli_gemmtrsm_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/3/bli_trsm_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/3/bb/bli_gemmbb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/3/bb/bli_gemmtrsmbb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/3/bb/bli_trsmbb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/ind/bli_gemm1m_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/knl/ind/bli_gemmtrsm1m_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/knl/ind/bli_trsm1m_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/bli_cntx_haswell_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_addv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_amaxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_axpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_axpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_copyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_dotv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_dotxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_invertv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_scal2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_scalv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_setv_haswell_ref.o", 
"flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_subv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_swapv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1/bli_xpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1f/bli_axpy2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1f/bli_axpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1f/bli_dotaxpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1f/bli_dotxaxpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1f/bli_dotxf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1m/bli_packm_cxk_1er_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1m/bli_packm_cxk_bb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1m/bli_packm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/1m/bli_unpackm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/3/bli_gemm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/3/bli_gemmsup_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/3/bli_gemmtrsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/3/bli_trsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/3/bb/bli_gemmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/3/bb/bli_gemmtrsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/haswell/3/bb/bli_trsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/ind/bli_gemm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/ind/bli_gemmtrsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/haswell/ind/bli_trsm1m_haswell_ref.o", "flags": ["-O2", "-O3", 
"-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/bli_cntx_sandybridge_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_addv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_amaxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_axpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_axpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_copyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_dotv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_dotxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_invertv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_scal2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", 
"-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_scalv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_setv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_subv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_swapv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1/bli_xpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1f/bli_axpy2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1f/bli_axpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1f/bli_dotaxpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1f/bli_dotxaxpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/sandybridge/1f/bli_dotxf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1m/bli_packm_cxk_1er_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1m/bli_packm_cxk_bb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1m/bli_packm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", 
"-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/1m/bli_unpackm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/3/bli_gemm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/3/bli_gemmsup_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/3/bli_gemmtrsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/3/bli_trsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/3/bb/bli_gemmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/3/bb/bli_gemmtrsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/3/bb/bli_trsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/ind/bli_gemm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", 
"target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/ind/bli_gemmtrsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/sandybridge/ind/bli_trsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/bli_cntx_penryn_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_addv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_amaxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_axpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_axpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_copyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_dotv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_dotxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_invertv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_scal2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_scalv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_setv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_subv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_swapv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1/bli_xpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1f/bli_axpy2v_penryn_ref.o", "flags": ["-O2", 
"-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1f/bli_axpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1f/bli_dotaxpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1f/bli_dotxaxpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1f/bli_dotxf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1m/bli_packm_cxk_1er_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1m/bli_packm_cxk_bb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1m/bli_packm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/1m/bli_unpackm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/3/bli_gemm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/3/bli_gemmsup_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", 
"-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/3/bli_gemmtrsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/3/bli_trsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/3/bb/bli_gemmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/3/bb/bli_gemmtrsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/3/bb/bli_trsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/ind/bli_gemm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/ind/bli_gemmtrsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/penryn/ind/bli_trsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/bli_cntx_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_addv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_amaxv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_axpbyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_axpyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_copyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_dotv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_dotxv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_invertv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_scal2v_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_scalv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_setv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_subv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_swapv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1/bli_xpbyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1f/bli_axpy2v_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1f/bli_axpyf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1f/bli_dotaxpyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1f/bli_dotxaxpyf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1f/bli_dotxf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1m/bli_packm_cxk_1er_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1m/bli_packm_cxk_bb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1m/bli_packm_cxk_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/1m/bli_unpackm_cxk_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/3/bli_gemm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/3/bli_gemmsup_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/zen/3/bli_gemmtrsm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/3/bli_trsm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/3/bb/bli_gemmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/zen/3/bb/bli_gemmtrsmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/3/bb/bli_trsmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/ind/bli_gemm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/zen/ind/bli_gemmtrsm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen/ind/bli_trsm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-mno-avx256-split-unaligned-store", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/bli_cntx_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_addv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_amaxv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_axpbyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_axpyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_copyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_dotv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_dotxv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_invertv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_scal2v_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_scalv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_setv_zen2_ref.o", "flags": ["-O2", 
"-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_subv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_swapv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1/bli_xpbyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1f/bli_axpy2v_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1f/bli_axpyf_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1f/bli_dotaxpyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1f/bli_dotxaxpyf_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1f/bli_dotxf_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1m/bli_packm_cxk_1er_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1m/bli_packm_cxk_bb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1m/bli_packm_cxk_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/1m/bli_unpackm_cxk_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/zen2/3/bli_gemm_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/3/bli_gemmsup_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/3/bli_gemmtrsm_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/3/bli_trsm_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/3/bb/bli_gemmbb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/3/bb/bli_gemmtrsmbb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/3/bb/bli_trsmbb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/ind/bli_gemm1m_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/ind/bli_gemmtrsm1m_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/zen2/ind/bli_trsm1m_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/bli_cntx_excavator_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_addv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_amaxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_axpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_axpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_copyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_dotv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_dotxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_invertv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_scal2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_scalv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_setv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_subv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_swapv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1/bli_xpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/excavator/1f/bli_axpy2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1f/bli_axpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1f/bli_dotaxpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/excavator/1f/bli_dotxaxpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1f/bli_dotxf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1m/bli_packm_cxk_1er_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/excavator/1m/bli_packm_cxk_bb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1m/bli_packm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/1m/bli_unpackm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/excavator/3/bli_gemm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/3/bli_gemmsup_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/3/bli_gemmtrsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/excavator/3/bli_trsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/3/bb/bli_gemmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/3/bb/bli_gemmtrsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/excavator/3/bb/bli_trsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/ind/bli_gemm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/excavator/ind/bli_gemmtrsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/excavator/ind/bli_trsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/bli_cntx_steamroller_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_addv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_amaxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_axpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_axpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_copyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_dotv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_dotxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_invertv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_scal2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_scalv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_setv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_subv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_swapv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1/bli_xpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1f/bli_axpy2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1f/bli_axpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1f/bli_dotaxpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1f/bli_dotxaxpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1f/bli_dotxf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1m/bli_packm_cxk_1er_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1m/bli_packm_cxk_bb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1m/bli_packm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/1m/bli_unpackm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", 
"-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/3/bli_gemm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/3/bli_gemmsup_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/3/bli_gemmtrsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/3/bli_trsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/3/bb/bli_gemmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/3/bb/bli_gemmtrsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/3/bb/bli_trsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/ind/bli_gemm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/ind/bli_gemmtrsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", 
"-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/steamroller/ind/bli_trsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/bli_cntx_piledriver_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_addv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_amaxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_axpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_axpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_copyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_dotv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_dotxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_invertv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_scal2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_scalv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_setv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_subv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_swapv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1/bli_xpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1f/bli_axpy2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1f/bli_axpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1f/bli_dotaxpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1f/bli_dotxaxpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1f/bli_dotxf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1m/bli_packm_cxk_1er_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1m/bli_packm_cxk_bb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1m/bli_packm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", 
"source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/1m/bli_unpackm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/3/bli_gemm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/3/bli_gemmsup_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/3/bli_gemmtrsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/3/bli_trsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/3/bb/bli_gemmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/3/bb/bli_gemmtrsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/3/bb/bli_trsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/ind/bli_gemm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/ind/bli_gemmtrsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/piledriver/ind/bli_trsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/bli_cntx_bulldozer_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_addv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", 
"-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_amaxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_axpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_axpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_copyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_dotv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_dotxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_invertv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_scal2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_scalv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_setv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_subv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_swapv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1/bli_xpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1f/bli_axpy2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1f/bli_axpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1f/bli_dotaxpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1f/bli_dotxaxpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1f/bli_dotxf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1m/bli_packm_cxk_1er_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1m/bli_packm_cxk_bb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1m/bli_packm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/1m/bli_unpackm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/3/bli_gemm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/3/bli_gemmsup_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/bulldozer/3/bli_gemmtrsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/3/bli_trsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/3/bb/bli_gemmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/bulldozer/3/bb/bli_gemmtrsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/3/bb/bli_trsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/ind/bli_gemm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/bulldozer/ind/bli_gemmtrsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/bulldozer/ind/bli_trsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/bli_cntx_generic_ref.o", "flags": ["-O2", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_addv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_amaxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_axpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_axpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_copyv_generic_ref.o", 
"flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_dotv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_dotxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_invertv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_scal2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_scalv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_setv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_subv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_swapv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1/bli_xpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1f/bli_axpy2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": 
"obj/x86_64_no_zen3/ref_kernels/generic/1f/bli_axpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1f/bli_dotaxpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1f/bli_dotxaxpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1f/bli_dotxf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1m/bli_packm_cxk_1er_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1m/bli_packm_cxk_bb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1m/bli_packm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/1m/bli_unpackm_cxk_generic_ref.o", "flags": ["-O2", "-O3", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/3/bli_gemm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/3/bli_gemmsup_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/3/bli_gemmtrsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/3/bli_trsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/3/bb/bli_gemmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/3/bb/bli_gemmtrsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/3/bb/bli_trsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/ind/bli_gemm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/ind/bli_gemmtrsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64_no_zen3/ref_kernels/generic/ind/bli_trsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-fPIC", "-std=c99", "-fopenmp-simd", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_check.c", "target": 
"obj/x86_64_no_zen3/frame/0/bli_l0_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_fpa.c", "target": "obj/x86_64_no_zen3/frame/0/bli_l0_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_oapi.c", "target": "obj/x86_64_no_zen3/frame/0/bli_l0_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/bli_l0_tapi.c", "target": "obj/x86_64_no_zen3/frame/0/bli_l0_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/0/copysc/bli_copysc.c", "target": "obj/x86_64_no_zen3/frame/0/copysc/bli_copysc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_check.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1/bli_l1v_tapi_ex.c", "target": "obj/x86_64_no_zen3/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_check.c", "target": "obj/x86_64_no_zen3/frame/1d/bli_l1d_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_fpa.c", "target": "obj/x86_64_no_zen3/frame/1d/bli_l1d_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/x86_64_no_zen3/frame/1d/bli_l1d_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/x86_64_no_zen3/frame/1d/bli_l1d_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi.c", "target": 
"obj/x86_64_no_zen3/frame/1d/bli_l1d_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": "obj/x86_64_no_zen3/frame/1d/bli_l1d_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_check.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_fpa.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/x86_64_no_zen3/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_check.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_fpa.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/bli_l1m_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_alloc.c", "target": 
"obj/x86_64_no_zen3/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_check.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cntl.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_cxk.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_init.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_init.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_part.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_part.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_scalar.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_scalar.o", "flags": ["-O3", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/x86_64_no_zen3/frame/1m/packm/bli_packm_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/x86_64_no_zen3/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/x86_64_no_zen3/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cntl.c", "target": "obj/x86_64_no_zen3/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": "obj/x86_64_no_zen3/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/x86_64_no_zen3/frame/1m/unpackm/bli_unpackm_int.o", "flags": ["-O3", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_check.c", "target": "obj/x86_64_no_zen3/frame/2/bli_l2_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_fpa.c", "target": "obj/x86_64_no_zen3/frame/2/bli_l2_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ba.c", "target": "obj/x86_64_no_zen3/frame/2/bli_l2_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/bli_l2_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_oapi_ex.c", "target": 
"obj/x86_64_no_zen3/frame/2/bli_l2_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/x86_64_no_zen3/frame/2/bli_l2_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi.c", "target": "obj/x86_64_no_zen3/frame/2/bli_l2_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/bli_l2_tapi_ex.c", "target": "obj/x86_64_no_zen3/frame/2/bli_l2_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/x86_64_no_zen3/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/x86_64_no_zen3/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": "obj/x86_64_no_zen3/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_unb_var2.c", "target": "obj/x86_64_no_zen3/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": 
"obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_unf_var3.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/2/her/bli_her_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_unb_var2.c", "target": "obj/x86_64_no_zen3/frame/2/her/bli_her_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/her/bli_her_var_oapi.o", "flags": ["-O3", "-fPIC", 
"-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/x86_64_no_zen3/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var3.c", "target": "obj/x86_64_no_zen3/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unb_var4.c", "target": "obj/x86_64_no_zen3/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/x86_64_no_zen3/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_unf_var4.c", "target": "obj/x86_64_no_zen3/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/her2/bli_her2_var_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/her2/bli_her2_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unb_var2.c", "target": "obj/x86_64_no_zen3/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/x86_64_no_zen3/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/x86_64_no_zen3/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/trmv/bli_trmv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/2/trsv/bli_trsv_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unb_var2.c", 
"target": "obj/x86_64_no_zen3/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/x86_64_no_zen3/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/x86_64_no_zen3/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/x86_64_no_zen3/frame/2/trsv/bli_trsv_var_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_blocksize.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_blocksize.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_check.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_cntl.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_direct.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_direct.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ind.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_ind.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_int.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_oapi_ex.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_packab.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_packab.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_prune.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_prune.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_schema.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_schema.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_sup.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_int.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_sup_int.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_a.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_sup_packm_a.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_b.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_packm_var.c", "target": 
"obj/x86_64_no_zen3/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_sup_ref.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var12.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_sup_var12.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_thrinfo.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_fpa.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/bli_l3_ukr_tapi.c", "target": "obj/x86_64_no_zen3/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_blk_var3.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_front.o", "flags": ["-O3", 
"-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_md.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": "obj/x86_64_no_zen3/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/x86_64_no_zen3/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/x86_64_no_zen3/frame/3/hemm/bli_hemm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/symm/bli_symm_front.c", "target": "obj/x86_64_no_zen3/frame/3/symm/bli_symm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/x86_64_no_zen3/frame/3/trmm/bli_trmm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trmm/bli_trmm_ll_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/x86_64_no_zen3/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_blk_var3.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_front.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_front.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": 
"obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/x86_64_no_zen3/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_apool.c", "target": "obj/x86_64_no_zen3/frame/base/bli_apool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_arch.c", "target": "obj/x86_64_no_zen3/frame/base/bli_arch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_array.c", "target": "obj/x86_64_no_zen3/frame/base/bli_array.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_blksz.c", "target": "obj/x86_64_no_zen3/frame/base/bli_blksz.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_check.c", "target": "obj/x86_64_no_zen3/frame/base/bli_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_clock.c", "target": "obj/x86_64_no_zen3/frame/base/bli_clock.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntl.c", "target": "obj/x86_64_no_zen3/frame/base/bli_cntl.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cntx.c", "target": "obj/x86_64_no_zen3/frame/base/bli_cntx.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_const.c", "target": "obj/x86_64_no_zen3/frame/base/bli_const.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_cpuid.c", "target": "obj/x86_64_no_zen3/frame/base/bli_cpuid.o", "flags": ["-O3", "-fPIC", "-std=c99", 
"-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_env.c", "target": "obj/x86_64_no_zen3/frame/base/bli_env.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_error.c", "target": "obj/x86_64_no_zen3/frame/base/bli_error.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_func.c", "target": "obj/x86_64_no_zen3/frame/base/bli_func.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_getopt.c", "target": "obj/x86_64_no_zen3/frame/base/bli_getopt.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_gks.c", "target": 
"obj/x86_64_no_zen3/frame/base/bli_gks.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_ind.c", "target": "obj/x86_64_no_zen3/frame/base/bli_ind.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_info.c", "target": "obj/x86_64_no_zen3/frame/base/bli_info.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_init.c", "target": "obj/x86_64_no_zen3/frame/base/bli_init.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_machval.c", "target": "obj/x86_64_no_zen3/frame/base/bli_machval.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "gcc", "source": "frame/base/bli_malloc.c", "target": "obj/x86_64_no_zen3/frame/base/bli_malloc.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_mbool.c", "target": "obj/x86_64_no_zen3/frame/base/bli_mbool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_memsys.c", "target": "obj/x86_64_no_zen3/frame/base/bli_memsys.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj.c", "target": "obj/x86_64_no_zen3/frame/base/bli_obj.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_obj_scalar.c", "target": "obj/x86_64_no_zen3/frame/base/bli_obj_scalar.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pack.c", "target": "obj/x86_64_no_zen3/frame/base/bli_pack.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_param_map.c", "target": "obj/x86_64_no_zen3/frame/base/bli_param_map.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_part.c", "target": "obj/x86_64_no_zen3/frame/base/bli_part.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pba.c", "target": "obj/x86_64_no_zen3/frame/base/bli_pba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_pool.c", "target": "obj/x86_64_no_zen3/frame/base/bli_pool.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_prune.c", "target": "obj/x86_64_no_zen3/frame/base/bli_prune.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_query.c", "target": "obj/x86_64_no_zen3/frame/base/bli_query.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_rntm.c", "target": "obj/x86_64_no_zen3/frame/base/bli_rntm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_sba.c", "target": "obj/x86_64_no_zen3/frame/base/bli_sba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijm.c", "target": "obj/x86_64_no_zen3/frame/base/bli_setgetijm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setgetijv.c", "target": "obj/x86_64_no_zen3/frame/base/bli_setgetijv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_setri.c", "target": "obj/x86_64_no_zen3/frame/base/bli_setri.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_string.c", "target": "obj/x86_64_no_zen3/frame/base/bli_string.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/bli_winsys.c", "target": "obj/x86_64_no_zen3/frame/base/bli_winsys.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castm.c", "target": 
"obj/x86_64_no_zen3/frame/base/cast/bli_castm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/x86_64_no_zen3/frame/base/cast/bli_castnzm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/cast/bli_castv.c", "target": "obj/x86_64_no_zen3/frame/base/cast/bli_castv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_obj_check.c", "target": "obj/x86_64_no_zen3/frame/base/check/bli_obj_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/check/bli_part_check.c", "target": "obj/x86_64_no_zen3/frame/base/check/bli_part_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_dlamch.c", "target": "obj/x86_64_no_zen3/frame/base/noopt/bli_dlamch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_lsame.c", "target": "obj/x86_64_no_zen3/frame/base/noopt/bli_lsame.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/noopt/bli_slamch.c", "target": "obj/x86_64_no_zen3/frame/base/noopt/bli_slamch.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projm.c", "target": "obj/x86_64_no_zen3/frame/base/proj/bli_projm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/base/proj/bli_projv.c", "target": "obj/x86_64_no_zen3/frame/base/proj/bli_projv.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_l3_decor_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_decor_single.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_l3_decor_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_l3_sup_decor_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": 
"frame/thread/bli_l3_sup_decor_pthreads.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_pthread.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_pthread.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_thrcomm.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_thrcomm_single.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thread.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_thread.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo.c", "target": "obj/x86_64_no_zen3/frame/thread/bli_thrinfo.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/thread/bli_thrinfo_sup.c", "target": 
"obj/x86_64_no_zen3/frame/thread/bli_thrinfo_sup.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_check.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_check.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_fpa.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_fpa.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ba.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_oapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_oapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_oapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ba.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_tapi_ba.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_tapi.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_tapi_ex.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_tapi_ex.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "gcc", "source": "frame/util/bli_util_unb_var1.c", "target": "obj/x86_64_no_zen3/frame/util/bli_util_unb_var1.o", "flags": ["-O3", "-fPIC", "-std=c99", "-fvisibility=hidden"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/linux-x86_64_no_zen3", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/make/windows-generic.jsonl000066400000000000000000003612561427272030600227430ustar00rootroot00000000000000{"environment": {"!EXITCODE": "00000000", "ACLOCAL_PATH": "C:\\Program Files\\Git\\mingw64\\share\\aclocal;C:\\Program Files\\Git\\usr\\share\\aclocal", "AGENT_BUILDDIRECTORY": "D:\\a\\1", "AGENT_DISABLELOGPLUGIN_TESTFILEPUBLISHERPLUGIN": "true", "AGENT_DISABLELOGPLUGIN_TESTRESULTLOGPLUGIN": "false", "AGENT_HOMEDIRECTORY": "C:\\agents\\2.202.0", "AGENT_ID": "92", "AGENT_JOBNAME": "JSONL Python38Windows", "AGENT_JOBSTATUS": "Succeeded", "AGENT_LOGTOBLOBSTORAGESERVICE": "true", "AGENT_MACHINENAME": "WIN-CU8INV6766V", "AGENT_NAME": "Hosted Agent", "AGENT_OS": "Windows_NT", "AGENT_OSARCHITECTURE": "X64", "AGENT_READONLYVARIABLES": "true", "AGENT_RETAINDEFAULTENCODING": "false", "AGENT_ROOTDIRECTORY": "D:\\a", "AGENT_SERVEROMDIRECTORY": "C:\\agents\\2.202.0\\externals\\vstsom", "AGENT_TASKRESTRICTIONSENFORCEMENTMODE": "Enabled", "AGENT_TEMPDIRECTORY": "D:\\a\\_temp", "AGENT_TOOLSDIRECTORY": "C:\\hostedtoolcache\\windows", "AGENT_USEWORKSPACEID": "true", "AGENT_VERSION": "2.202.0", "AGENT_WORKFOLDER": "D:\\a", "ALLUSERSPROFILE": "C:\\ProgramData", "ANDROID_HOME": "C:\\Android\\android-sdk", "ANDROID_NDK_HOME": "C:\\Android\\android-sdk\\ndk-bundle", "ANDROID_NDK_LATEST_HOME": "C:\\Android\\android-sdk\\ndk\\23.1.7779620", "ANDROID_NDK_PATH": "C:\\Android\\android-sdk\\ndk-bundle", "ANDROID_NDK_ROOT": "C:\\Android\\android-sdk\\ndk-bundle", "ANDROID_SDK_ROOT": "C:\\Android\\android-sdk", "ANT_HOME": "C:\\ProgramData\\chocolatey\\lib\\ant\\tools\\apache-ant-1.10.12", "APPDATA": "C:\\Users\\VssAdministrator\\AppData\\Roaming", "AR": "llvm-ar", "AS": "llvm-as", "AZURE_EXTENSION_DIR": "C:\\Program Files\\Common Files\\AzureCliExtensionDirectory", 
"AZURE_HTTP_USER_AGENT": "VSTS_116cc368-5c0c-4eb4-bb44-7f3fa5bdce14_build_6_0", "BUILD_ARTIFACTSTAGINGDIRECTORY": "D:\\a\\1\\a", "BUILD_BINARIESDIRECTORY": "D:\\a\\1\\b", "BUILD_BUILDID": "17021", "BUILD_BUILDNUMBER": "20220408.7", "BUILD_BUILDURI": "vstfs:///Build/Build/17021", "BUILD_CONTAINERID": "11809685", "BUILD_DEFINITIONNAME": "explosion.cython-blis", "BUILD_DEFINITIONVERSION": "1", "BUILD_QUEUEDBY": "GitHub", "BUILD_QUEUEDBYID": "38e7e9f7-fc06-4f5a-b6dd-1782f4ef7c25", "BUILD_REASON": "PullRequest", "BUILD_REPOSITORY_GIT_SUBMODULECHECKOUT": "False", "BUILD_REPOSITORY_ID": "explosion/cython-blis", "BUILD_REPOSITORY_LOCALPATH": "D:\\a\\1\\s", "BUILD_REPOSITORY_NAME": "explosion/cython-blis", "BUILD_REPOSITORY_PROVIDER": "GitHub", "BUILD_REPOSITORY_URI": "https://github.com/explosion/cython-blis", "BUILD_REQUESTEDFOR": "GitHub", "BUILD_REQUESTEDFOREMAIL": "", "BUILD_REQUESTEDFORID": "38e7e9f7-fc06-4f5a-b6dd-1782f4ef7c25", "BUILD_SOURCEBRANCH": "refs/pull/69/merge", "BUILD_SOURCEBRANCHNAME": "merge", "BUILD_SOURCESDIRECTORY": "D:\\a\\1\\s", "BUILD_SOURCEVERSION": "273ec162fa5f042b5d946638cedd954583ff8111", "BUILD_SOURCEVERSIONAUTHOR": "Dani\u00ebl de Kok", "BUILD_SOURCEVERSIONMESSAGE": "Merge 1de7a1931422b892af086ce69604e7e3459e9f8e into 6daabf0c925bfe67f7d87874ce014eb3212711e7", "BUILD_STAGINGDIRECTORY": "D:\\a\\1\\a", "CABAL_DIR": "C:\\cabal", "CC": "clang", "COBERTURA_HOME": "C:\\cobertura-2.1.1", "COMMONPROGRAMFILES": "C:\\Program Files\\Common Files", "COMMON_TESTRESULTSDIRECTORY": "D:\\a\\1\\TestResults", "COMPUTERNAME": "WIN-CU8INV6766V", "COMSPEC": "C:\\Windows\\system32\\cmd.exe", "CONDA": "C:\\Miniconda", "CONFIG_SITE": "C:/Program Files/Git/etc/config.site", "CHOCOLATEYINSTALL": "C:\\ProgramData\\chocolatey", "CHROMEWEBDRIVER": "C:\\SeleniumWebDrivers\\ChromeDriver", "COMMONPROGRAMFILES(X86)": "C:\\Program Files (x86)\\Common Files", "COMMONPROGRAMW6432": "C:\\Program Files\\Common Files", "DISPLAY": "needs-to-be-defined", "DOTNET_MULTILEVEL_LOOKUP": 
"0", "DRIVERDATA": "C:\\Windows\\System32\\Drivers\\DriverData", "EXEPATH": "C:\\Program Files\\Git\\bin", "EDGEWEBDRIVER": "C:\\SeleniumWebDrivers\\EdgeDriver", "GCM_INTERACTIVE": "Never", "GHCUP_INSTALL_BASE_PREFIX": "C:\\", "GHCUP_MSYS2": "C:\\msys64", "GIT_TERMINAL_PROMPT": "0", "GOROOT_1_15_X64": "C:\\hostedtoolcache\\windows\\go\\1.15.15\\x64", "GOROOT_1_16_X64": "C:\\hostedtoolcache\\windows\\go\\1.16.15\\x64", "GOROOT_1_17_X64": "C:\\hostedtoolcache\\windows\\go\\1.17.8\\x64", "GOROOT_1_18_X64": "C:\\hostedtoolcache\\windows\\go\\1.18.0\\x64", "GRADLE_HOME": "C:\\ProgramData\\chocolatey\\lib\\gradle\\tools\\gradle-7.4", "GECKOWEBDRIVER": "C:\\SeleniumWebDrivers\\GeckoDriver", "HOME": "C:\\Users\\VssAdministrator", "HOMEDRIVE": "C:", "HOMEPATH": "\\Users\\VssAdministrator", "HOSTNAME": "WIN-CU8INV6766V", "IEWEBDRIVER": "C:\\SeleniumWebDrivers\\IEDriver", "IMAGENAME": "windows-latest", "INFOPATH": "C:\\Program Files\\Git\\usr\\local\\info;C:\\Program Files\\Git\\usr\\share\\info;C:\\Program Files\\Git\\usr\\info;C:\\Program Files\\Git\\share\\info", "IMAGEOS": "win22", "IMAGEVERSION": "20220330.1", "JAVA_HOME": "C:\\hostedtoolcache\\windows\\Java_Temurin-Hotspot_jdk\\8.0.322-6\\x64", "JAVA_HOME_11_X64": "C:\\hostedtoolcache\\windows\\Java_Temurin-Hotspot_jdk\\11.0.14-101\\x64", "JAVA_HOME_17_X64": "C:\\hostedtoolcache\\windows\\Java_Temurin-Hotspot_jdk\\17.0.2-8\\x64", "JAVA_HOME_8_X64": "C:\\hostedtoolcache\\windows\\Java_Temurin-Hotspot_jdk\\8.0.322-6\\x64", "LANG": "en_US.UTF-8", "LOCALAPPDATA": "C:\\Users\\VssAdministrator\\AppData\\Local", "LOGONSERVER": "\\\\WIN-CU8INV6766V", "M2": "C:\\ProgramData\\chocolatey\\lib\\maven\\apache-maven-3.8.5\\bin", "M2_REPO": "C:\\ProgramData\\m2", "MANPATH": "C:\\Program Files\\Git\\mingw64\\local\\man;C:\\Program Files\\Git\\mingw64\\share\\man;C:\\Program Files\\Git\\usr\\local\\man;C:\\Program Files\\Git\\usr\\share\\man;C:\\Program Files\\Git\\usr\\man;C:\\Program Files\\Git\\share\\man", "MAVEN_OPTS": "-Xms256m", 
"MINGW_CHOST": "x86_64-w64-mingw32", "MINGW_PACKAGE_PREFIX": "mingw-w64-x86_64", "MINGW_PREFIX": "C:/Program Files/Git/mingw64", "MSDEPLOY_HTTP_USER_AGENT": "VSTS_116cc368-5c0c-4eb4-bb44-7f3fa5bdce14_build_6_0", "MSYSTEM": "MINGW64", "MSYSTEM_CARCH": "x86_64", "MSYSTEM_CHOST": "x86_64-w64-mingw32", "MSYSTEM_PREFIX": "C:/Program Files/Git/mingw64", "MONAGENTCLIENTLOCATION": "C:\\Packages\\Plugins\\Microsoft.Azure.Geneva.GenevaMonitoring\\2.35.0.2\\Monitoring\\Agent", "NUMBER_OF_PROCESSORS": "4", "OLDPWD": "D:/a/1/s", "ORIGINAL_PATH": "C:\\Program Files\\Git\\mingw64\\bin;C:\\Program Files\\Git\\usr\\bin;C:\\Users\\VssAdministrator\\bin;C:\\Program Files\\LLVM\\bin;C:\\Users\\VssAdministrator\\AppData\\Roaming\\Python\\Python38\\Scripts;C:\\hostedtoolcache\\windows\\Python\\3.8.10\\x64\\Scripts;C:\\hostedtoolcache\\windows\\Python\\3.8.10\\x64;C:\\agents\\2.202.0\\externals\\git\\cmd;C:\\agents\\2.202.0\\externals\\git\\mingw64\\bin;C:\\Program Files\\MongoDB\\Server\\5.0\\bin;C:\\aliyun-cli;C:\\vcpkg;C:\\Program Files (x86)\\NSIS;C:\\tools\\zstd;C:\\Program Files\\Mercurial;C:\\hostedtoolcache\\windows\\stack\\2.7.5\\x64;C:\\cabal\\bin;C:\\ghcup\\bin;C:\\tools\\ghc-9.2.2\\bin;C:\\Program Files\\dotnet;C:\\mysql\\bin;C:\\Program Files\\R\\R-4.1.3\\bin\\x64;C:\\SeleniumWebDrivers\\GeckoDriver;C:\\Program Files (x86)\\sbt\\bin;C:\\Program Files (x86)\\GitHub CLI;C:\\Program Files\\Git\\usr\\bin;C:\\Program Files (x86)\\pipx_bin;C:\\hostedtoolcache\\windows\\go\\1.16.15\\x64\\bin;C:\\hostedtoolcache\\windows\\Python\\3.9.12\\x64\\Scripts;C:\\hostedtoolcache\\windows\\Python\\3.9.12\\x64;C:\\hostedtoolcache\\windows\\Ruby\\3.0.3\\x64\\bin;C:\\tools\\kotlinc\\bin;C:\\hostedtoolcache\\windows\\Java_Temurin-Hotspot_jdk\\8.0.322-6\\x64\\bin;C:\\npm\\prefix;C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\CLI2\\wbin;C:\\ProgramData\\kind;C:\\Program 
Files\\Microsoft\\jdk-11.0.12.7-hotspot\\bin;C:\\Windows\\system32;C:\\Windows;C:\\Windows\\System32\\Wbem;C:\\Windows\\System32\\WindowsPowerShell\\v1.0;C:\\Windows\\System32\\OpenSSH;C:\\Program Files\\dotnet;C:\\ProgramData\\Chocolatey\\bin;C:\\Program Files\\Docker;C:\\Program Files\\PowerShell\\7;C:\\Program Files\\Microsoft\\Web Platform Installer;C:\\Program Files\\Microsoft SQL Server\\Client SDK\\ODBC\\170\\Tools\\Binn;C:\\Program Files\\Microsoft SQL Server\\150\\Tools\\Binn;C:\\Program Files\\nodejs;C:\\Program Files\\OpenSSL\\bin;C:\\Strawberry\\c\\bin;C:\\Strawberry\\perl\\site\\bin;C:\\Strawberry\\perl\\bin;C:\\ProgramData\\chocolatey\\lib\\pulumi\\tools\\Pulumi\\bin;C:\\Program Files\\TortoiseSVN\\bin;C:\\Program Files\\CMake\\bin;C:\\ProgramData\\chocolatey\\lib\\maven\\apache-maven-3.8.5\\bin;C:\\Program Files\\Microsoft Service Fabric\\bin\\Fabric\\Fabric.Code;C:\\Program Files\\Microsoft SDKs\\Service Fabric\\Tools\\ServiceFabricLocalClusterManager;C:\\Program Files\\Git\\cmd;C:\\Program Files\\Git\\mingw64\\bin;C:\\Program Files\\Git\\usr\\bin;C:\\Program Files\\GitHub CLI;C:\\tools\\php;C:\\Program Files (x86)\\sbt\\bin;C:\\SeleniumWebDrivers\\ChromeDriver;C:\\SeleniumWebDrivers\\EdgeDriver;C:\\Program Files\\Amazon\\AWSCLIV2;C:\\Program Files\\Amazon\\SessionManagerPlugin\\bin;C:\\Program Files\\Amazon\\AWSSAMCLI\\bin;C:\\Program Files\\Microsoft SQL Server\\130\\Tools\\Binn;C:\\Program Files\\LLVM\\bin;C:\\Users\\VssAdministrator\\.dotnet\\tools;C:\\Users\\VssAdministrator\\.cargo\\bin;C:\\Users\\VssAdministrator\\AppData\\Local\\Microsoft\\WindowsApps", "ORIGINAL_TEMP": "C:/Users/VSSADM~1/AppData/Local/Temp", "ORIGINAL_TMP": "C:/Users/VSSADM~1/AppData/Local/Temp", "OS": "windows", "PATH": "C:\\Users\\VssAdministrator\\bin;C:\\Program Files\\Git\\mingw64\\bin;C:\\Program Files\\Git\\usr\\local\\bin;C:\\Program Files\\Git\\usr\\bin;C:\\Program Files\\Git\\usr\\bin;C:\\Program Files\\Git\\mingw64\\bin;C:\\Program 
Files\\Git\\usr\\bin;C:\\Users\\VssAdministrator\\bin;C:\\Program Files\\LLVM\\bin;C:\\Users\\VssAdministrator\\AppData\\Roaming\\Python\\Python38\\Scripts;C:\\hostedtoolcache\\windows\\Python\\3.8.10\\x64\\Scripts;C:\\hostedtoolcache\\windows\\Python\\3.8.10\\x64;C:\\agents\\2.202.0\\externals\\git\\cmd;C:\\agents\\2.202.0\\externals\\git\\mingw64\\bin;C:\\Program Files\\MongoDB\\Server\\5.0\\bin;C:\\aliyun-cli;C:\\vcpkg;C:\\Program Files (x86)\\NSIS;C:\\tools\\zstd;C:\\Program Files\\Mercurial;C:\\hostedtoolcache\\windows\\stack\\2.7.5\\x64;C:\\cabal\\bin;C:\\ghcup\\bin;C:\\tools\\ghc-9.2.2\\bin;C:\\Program Files\\dotnet;C:\\mysql\\bin;C:\\Program Files\\R\\R-4.1.3\\bin\\x64;C:\\SeleniumWebDrivers\\GeckoDriver;C:\\Program Files (x86)\\sbt\\bin;C:\\Program Files (x86)\\GitHub CLI;C:\\Program Files\\Git\\usr\\bin;C:\\Program Files (x86)\\pipx_bin;C:\\hostedtoolcache\\windows\\go\\1.16.15\\x64\\bin;C:\\hostedtoolcache\\windows\\Python\\3.9.12\\x64\\Scripts;C:\\hostedtoolcache\\windows\\Python\\3.9.12\\x64;C:\\hostedtoolcache\\windows\\Ruby\\3.0.3\\x64\\bin;C:\\tools\\kotlinc\\bin;C:\\hostedtoolcache\\windows\\Java_Temurin-Hotspot_jdk\\8.0.322-6\\x64\\bin;C:\\npm\\prefix;C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\CLI2\\wbin;C:\\ProgramData\\kind;C:\\Program Files\\Microsoft\\jdk-11.0.12.7-hotspot\\bin;C:\\Windows\\system32;C:\\Windows;C:\\Windows\\System32\\Wbem;C:\\Windows\\System32\\WindowsPowerShell\\v1.0;C:\\Windows\\System32\\OpenSSH;C:\\Program Files\\dotnet;C:\\ProgramData\\Chocolatey\\bin;C:\\Program Files\\Docker;C:\\Program Files\\PowerShell\\7;C:\\Program Files\\Microsoft\\Web Platform Installer;C:\\Program Files\\Microsoft SQL Server\\Client SDK\\ODBC\\170\\Tools\\Binn;C:\\Program Files\\Microsoft SQL Server\\150\\Tools\\Binn;C:\\Program Files\\nodejs;C:\\Program Files\\OpenSSL\\bin;C:\\Strawberry\\c\\bin;C:\\Strawberry\\perl\\site\\bin;C:\\Strawberry\\perl\\bin;C:\\ProgramData\\chocolatey\\lib\\pulumi\\tools\\Pulumi\\bin;C:\\Program 
Files\\TortoiseSVN\\bin;C:\\Program Files\\CMake\\bin;C:\\ProgramData\\chocolatey\\lib\\maven\\apache-maven-3.8.5\\bin;C:\\Program Files\\Microsoft Service Fabric\\bin\\Fabric\\Fabric.Code;C:\\Program Files\\Microsoft SDKs\\Service Fabric\\Tools\\ServiceFabricLocalClusterManager;C:\\Program Files\\Git\\cmd;C:\\Program Files\\Git\\mingw64\\bin;C:\\Program Files\\Git\\usr\\bin;C:\\Program Files\\GitHub CLI;C:\\tools\\php;C:\\Program Files (x86)\\sbt\\bin;C:\\SeleniumWebDrivers\\ChromeDriver;C:\\SeleniumWebDrivers\\EdgeDriver;C:\\Program Files\\Amazon\\AWSCLIV2;C:\\Program Files\\Amazon\\SessionManagerPlugin\\bin;C:\\Program Files\\Amazon\\AWSSAMCLI\\bin;C:\\Program Files\\Microsoft SQL Server\\130\\Tools\\Binn;C:\\Program Files\\LLVM\\bin;C:\\Users\\VssAdministrator\\.dotnet\\tools;C:\\Users\\VssAdministrator\\.cargo\\bin;C:\\Users\\VssAdministrator\\AppData\\Local\\Microsoft\\WindowsApps;C:\\Program Files\\Git\\usr\\bin\\vendor_perl;C:\\Program Files\\Git\\usr\\bin\\core_perl", "PATHEXT": ".COM;.EXE;.BAT;.CMD;.VBS;.VBE;.JS;.JSE;.WSF;.WSH;.MSC;.CPL", "PGBIN": "C:\\Program Files\\PostgreSQL\\14\\bin", "PGDATA": "C:\\Program Files\\PostgreSQL\\14\\data", "PGPASSWORD": "root", "PGROOT": "C:\\Program Files\\PostgreSQL\\14", "PGUSER": "postgres", "PHPROOT": "c:\\tools\\php", "PIPELINE_WORKSPACE": "D:\\a\\1", "PIPX_BIN_DIR": "C:\\Program Files (x86)\\pipx_bin", "PIPX_HOME": "C:\\Program Files (x86)\\pipx", "PKG_CONFIG_PATH": "C:\\Program Files\\Git\\mingw64\\lib\\pkgconfig;C:\\Program Files\\Git\\mingw64\\share\\pkgconfig", "PLINK_PROTOCOL": "ssh", "POWERSHELL_DISTRIBUTION_CHANNEL": "Azure-DevOps-win22", "POWERSHELL_UPDATECHECK": "Off", "PROCESSOR_ARCHITECTURE": "AMD64", "PROCESSOR_IDENTIFIER": "Intel64 Family 6 Model 79 Stepping 1, GenuineIntel", "PROCESSOR_LEVEL": "6", "PROCESSOR_REVISION": "4f01", "PROGRAMFILES": "C:\\Program Files", "PROMPT": "$P$G", "PSEXECUTIONPOLICYPREFERENCE": "Unrestricted", "PSMODULEPATH": 
"C:\\Users\\VssAdministrator\\Documents\\WindowsPowerShell\\Modules;C:\\\\Modules\\azurerm_2.1.0;C:\\\\Modules\\azure_2.1.0;C:\\Users\\packer\\Documents\\WindowsPowerShell\\Modules;C:\\Program Files\\WindowsPowerShell\\Modules;C:\\Windows\\system32\\WindowsPowerShell\\v1.0\\Modules;C:\\Program Files\\Microsoft SQL Server\\130\\Tools\\PowerShell\\Modules\\", "PUBLIC": "C:\\Users\\Public", "PWD": "D:/a/1/s/flame-blis", "PYTHON_VERSION": "3.8", "PROGRAMDATA": "C:\\ProgramData", "PROGRAMFILES(X86)": "C:\\Program Files (x86)", "PROGRAMW6432": "C:\\Program Files", "RANLIB": "echo", "RESOURCES_TRIGGERINGALIAS": "", "RESOURCES_TRIGGERINGCATEGORY": "", "RTOOLS40_HOME": "C:\\rtools40", "RUNNER_TOOLSDIRECTORY": "C:\\hostedtoolcache\\windows", "SBT_HOME": "C:\\Program Files (x86)\\sbt\\", "SELENIUM_JAR_PATH": "C:\\selenium\\selenium-server.jar", "SHELL": "C:\\Program Files\\Git\\usr\\bin\\bash.exe", "SHLVL": "2", "SSH_ASKPASS": "C:/Program Files/Git/mingw64/bin/git-askpass.exe", "SYSTEM": "build", "SYSTEMDRIVE": "C:", "SYSTEMROOT": "C:\\Windows", "SYSTEM_ARTIFACTSDIRECTORY": "D:\\a\\1\\a", "SYSTEM_COLLECTIONID": "116cc368-5c0c-4eb4-bb44-7f3fa5bdce14", "SYSTEM_COLLECTIONURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_CULTURE": "en-US", "SYSTEM_DEBUG": "false", "SYSTEM_DEFAULTWORKINGDIRECTORY": "D:\\a\\1\\s", "SYSTEM_DEFINITIONID": "6", "SYSTEM_DEFINITIONNAME": "explosion.cython-blis", "SYSTEM_ENABLEACCESSTOKEN": "SecretVariable", "SYSTEM_HOSTTYPE": "build", "SYSTEM_ISSCHEDULED": "False", "SYSTEM_JOBATTEMPT": "1", "SYSTEM_JOBDISPLAYNAME": "JSONL Python38Windows", "SYSTEM_JOBID": "efb31c5a-ec83-5597-c79c-3c04d0eba6be", "SYSTEM_JOBIDENTIFIER": "JSONL.Python38Windows", "SYSTEM_JOBNAME": "Python38Windows", "SYSTEM_JOBPARALLELISMTAG": "Public", "SYSTEM_JOBPOSITIONINPHASE": "1", "SYSTEM_JOBTIMEOUT": "60", "SYSTEM_PARALLELEXECUTIONTYPE": "MultiConfiguration", "SYSTEM_PHASEATTEMPT": "1", "SYSTEM_PHASEDISPLAYNAME": "JSONL", "SYSTEM_PHASEID": "ecb95708-c2a5-5456-f379-96cd8090c2a6", 
"SYSTEM_PHASENAME": "JSONL", "SYSTEM_PIPELINESTARTTIME": "2022-04-08 12:45:25+00:00", "SYSTEM_PLANID": "4bcc5172-4f8e-4a4f-b00f-1b3d5e2fe9dd", "SYSTEM_POSTLINESSPEED": "10000", "SYSTEM_PULLREQUEST_ISFORK": "True", "SYSTEM_PULLREQUEST_MERGEDAT": "", "SYSTEM_PULLREQUEST_PULLREQUESTID": "899994381", "SYSTEM_PULLREQUEST_PULLREQUESTNUMBER": "69", "SYSTEM_PULLREQUEST_SOURCEBRANCH": "update-to-blis-0.9.0", "SYSTEM_PULLREQUEST_SOURCECOMMITID": "1de7a1931422b892af086ce69604e7e3459e9f8e", "SYSTEM_PULLREQUEST_SOURCEREPOSITORYURI": "https://github.com/explosion/cython-blis", "SYSTEM_PULLREQUEST_TARGETBRANCH": "master", "SYSTEM_RESTRICTSECRETS": "True", "SYSTEM_SERVERTYPE": "Hosted", "SYSTEM_STAGEATTEMPT": "1", "SYSTEM_STAGEDISPLAYNAME": "__default", "SYSTEM_STAGEID": "96ac2280-8cb4-5df5-99de-dd2da759617d", "SYSTEM_STAGENAME": "__default", "SYSTEM_TASKDEFINITIONSURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_TASKDISPLAYNAME": "Generate JSONL (Windows)", "SYSTEM_TASKINSTANCEID": "4bae54ba-656f-5414-04c0-0cf207e9f5bd", "SYSTEM_TASKINSTANCENAME": "CmdLine5", "SYSTEM_TEAMFOUNDATIONCOLLECTIONURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_TEAMFOUNDATIONSERVERURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_TEAMPROJECT": "Public", "SYSTEM_TEAMPROJECTID": "5c6613e9-6ccf-48bd-81de-dbc3b0a6f957", "SYSTEM_TIMELINEID": "4bcc5172-4f8e-4a4f-b00f-1b3d5e2fe9dd", "SYSTEM_TOTALJOBSINPHASE": "1", "SYSTEM_WORKFOLDER": "D:\\a", "TASK_DISPLAYNAME": "Generate JSONL (Windows)", "TASK_SKIPTRANSLATORFORCHECKOUT": "False", "TEMP": "C:\\Users\\VSSADM~1\\AppData\\Local\\Temp", "TERM": "xterm-256color", "TF_BUILD": "True", "TMP": "C:\\Users\\VSSADM~1\\AppData\\Local\\Temp", "TMPDIR": "C:\\Users\\VSSADM~1\\AppData\\Local\\Temp", "USEPYTHONVERSION_PYTHONLOCATION": "C:\\hostedtoolcache\\windows\\Python\\3.8.10\\x64", "USERDOMAIN": "WIN-CU8INV6766V", "USERDOMAIN_ROAMINGPROFILE": "WIN-CU8INV6766V", "USERNAME": "VssAdministrator", "USERPROFILE": "C:\\Users\\VssAdministrator", 
"VCPKG_INSTALLATION_ROOT": "C:\\vcpkg", "VSTS_AGENT_PERFLOG": "C:\\agents\\perflog", "VSTS_PROCESS_LOOKUP_ID": "vsts_175962b3-f397-42b7-b557-a072c6b9de45", "VSTS_SECRET_VARIABLES": "", "WINDIR": "C:\\Windows", "WIX": "C:\\Program Files (x86)\\WiX Toolset v3.11\\", "_": "C:/hostedtoolcache/windows/Python/3.8.10/x64/python", "AGENT.JOBSTATUS": "Succeeded", "NPM_CONFIG_PREFIX": "C:\\npm\\prefix"}} {"compiler": "clang", "source": "config/generic/bli_cntx_init_generic.c", "target": "obj/generic/config/generic/bli_cntx_init_generic.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/generic/ref_kernels/generic/bli_cntx_generic_ref.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_addv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_amaxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_axpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_axpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_copyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_dotv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_dotxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_invertv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_scal2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_scalv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", 
"-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_setv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_subv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_swapv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/generic/ref_kernels/generic/1/bli_xpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/generic/ref_kernels/generic/1f/bli_axpy2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/generic/ref_kernels/generic/1f/bli_axpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/generic/ref_kernels/generic/1f/bli_dotaxpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/generic/ref_kernels/generic/1f/bli_dotxaxpyf_generic_ref.o", "flags": ["-O2", "-O3", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/generic/ref_kernels/generic/1f/bli_dotxf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/generic/ref_kernels/generic/1m/bli_packm_cxk_1er_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/generic/ref_kernels/generic/1m/bli_packm_cxk_bb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": 
"obj/generic/ref_kernels/generic/1m/bli_packm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/generic/ref_kernels/generic/1m/bli_unpackm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/generic/ref_kernels/generic/3/bli_gemm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/generic/ref_kernels/generic/3/bli_gemmsup_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/generic/ref_kernels/generic/3/bli_gemmtrsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/generic/ref_kernels/generic/3/bli_trsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/generic/ref_kernels/generic/3/bb/bli_gemmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/generic/ref_kernels/generic/3/bb/bli_gemmtrsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/generic/ref_kernels/generic/3/bb/bli_trsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/generic/ref_kernels/generic/ind/bli_gemm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/generic/ref_kernels/generic/ind/bli_gemmtrsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/generic/ref_kernels/generic/ind/bli_trsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/0/bli_l0_check.c", "target": "obj/generic/frame/0/bli_l0_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/0/bli_l0_fpa.c", "target": "obj/generic/frame/0/bli_l0_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/0/bli_l0_oapi.c", "target": "obj/generic/frame/0/bli_l0_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/0/bli_l0_tapi.c", "target": "obj/generic/frame/0/bli_l0_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/0/copysc/bli_copysc.c", "target": "obj/generic/frame/0/copysc/bli_copysc.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_check.c", "target": 
"obj/generic/frame/1/bli_l1v_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/generic/frame/1/bli_l1v_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_oapi.c", "target": "obj/generic/frame/1/bli_l1v_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/generic/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/generic/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/generic/frame/1/bli_l1v_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/generic/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_tapi_ex.c", "target": "obj/generic/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_check.c", "target": "obj/generic/frame/1d/bli_l1d_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_fpa.c", "target": "obj/generic/frame/1d/bli_l1d_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/generic/frame/1d/bli_l1d_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/generic/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/generic/frame/1d/bli_l1d_oapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_tapi.c", "target": "obj/generic/frame/1d/bli_l1d_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/generic/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": "obj/generic/frame/1d/bli_l1d_tapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"frame/1f/bli_l1f_check.c", "target": "obj/generic/frame/1f/bli_l1f_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1f/bli_l1f_fpa.c", "target": "obj/generic/frame/1f/bli_l1f_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1f/bli_l1f_oapi.c", "target": "obj/generic/frame/1f/bli_l1f_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/generic/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": "obj/generic/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1f/bli_l1f_tapi.c", "target": "obj/generic/frame/1f/bli_l1f_tapi.o", "flags": ["-O2", "-std=c99"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/generic/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/generic/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_check.c", "target": "obj/generic/frame/1m/bli_l1m_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_fpa.c", "target": "obj/generic/frame/1m/bli_l1m_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_oapi.c", "target": "obj/generic/frame/1m/bli_l1m_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/generic/frame/1m/bli_l1m_oapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": "obj/generic/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/generic/frame/1m/bli_l1m_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/generic/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/generic/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_unb_var1.c", "target": "obj/generic/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_alloc.c", "target": "obj/generic/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/generic/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_check.c", "target": "obj/generic/frame/1m/packm/bli_packm_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_cntl.c", "target": "obj/generic/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", 
"source": "frame/1m/packm/bli_packm_cxk.c", "target": "obj/generic/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/generic/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_init.c", "target": "obj/generic/frame/1m/packm/bli_packm_init.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/generic/frame/1m/packm/bli_packm_int.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_part.c", "target": "obj/generic/frame/1m/packm/bli_packm_part.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_scalar.c", 
"target": "obj/generic/frame/1m/packm/bli_packm_scalar.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/generic/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/generic/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": "obj/generic/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/generic/frame/1m/packm/bli_packm_thrinfo.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/generic/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/generic/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/unpackm/bli_unpackm_cntl.c", "target": "obj/generic/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": "obj/generic/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/generic/frame/1m/unpackm/bli_unpackm_int.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"frame/2/bli_l2_check.c", "target": "obj/generic/frame/2/bli_l2_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/bli_l2_fpa.c", "target": "obj/generic/frame/2/bli_l2_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/bli_l2_oapi.c", "target": "obj/generic/frame/2/bli_l2_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/bli_l2_oapi_ba.c", "target": "obj/generic/frame/2/bli_l2_oapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/bli_l2_oapi_ex.c", "target": "obj/generic/frame/2/bli_l2_oapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/bli_l2_tapi.c", "target": "obj/generic/frame/2/bli_l2_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/generic/frame/2/bli_l2_tapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/bli_l2_tapi_ex.c", "target": "obj/generic/frame/2/bli_l2_tapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/generic/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/generic/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/generic/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": "obj/generic/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/generic/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/ger/bli_ger_unb_var1.c", "target": "obj/generic/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/ger/bli_ger_unb_var2.c", "target": "obj/generic/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/generic/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unf_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unf_var3.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/generic/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/generic/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/generic/frame/2/her/bli_her_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "clang", "source": "frame/2/her/bli_her_unb_var2.c", "target": "obj/generic/frame/2/her/bli_her_unb_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/generic/frame/2/her/bli_her_var_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/generic/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/generic/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her2/bli_her2_unb_var3.c", "target": "obj/generic/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"frame/2/her2/bli_her2_unb_var4.c", "target": "obj/generic/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/generic/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her2/bli_her2_unf_var4.c", "target": "obj/generic/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her2/bli_her2_var_oapi.c", "target": "obj/generic/frame/2/her2/bli_her2_var_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/generic/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trmv/bli_trmv_unb_var2.c", 
"target": "obj/generic/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/generic/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/generic/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": "obj/generic/frame/2/trmv/bli_trmv_var_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/generic/frame/2/trsv/bli_trsv_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trsv/bli_trsv_unb_var2.c", "target": 
"obj/generic/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/generic/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/generic/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/generic/frame/2/trsv/bli_trsv_var_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_blocksize.c", "target": "obj/generic/frame/3/bli_l3_blocksize.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_check.c", "target": "obj/generic/frame/3/bli_l3_check.o", "flags": ["-O2", 
"-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_cntl.c", "target": "obj/generic/frame/3/bli_l3_cntl.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_direct.c", "target": "obj/generic/frame/3/bli_l3_direct.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_ind.c", "target": "obj/generic/frame/3/bli_l3_ind.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_int.c", "target": "obj/generic/frame/3/bli_l3_int.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_oapi.c", "target": "obj/generic/frame/3/bli_l3_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_oapi_ex.c", "target": "obj/generic/frame/3/bli_l3_oapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_packab.c", "target": "obj/generic/frame/3/bli_l3_packab.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_prune.c", "target": "obj/generic/frame/3/bli_l3_prune.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_schema.c", "target": "obj/generic/frame/3/bli_l3_schema.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_sup.c", "target": "obj/generic/frame/3/bli_l3_sup.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", 
"source": "frame/3/bli_l3_sup_int.c", "target": "obj/generic/frame/3/bli_l3_sup_int.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_sup_packm_a.c", "target": "obj/generic/frame/3/bli_l3_sup_packm_a.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_sup_packm_b.c", "target": "obj/generic/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_sup_packm_var.c", "target": "obj/generic/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/generic/frame/3/bli_l3_sup_ref.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_sup_var12.c", "target": "obj/generic/frame/3/bli_l3_sup_var12.o", 
"flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/generic/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_tapi.c", "target": "obj/generic/frame/3/bli_l3_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/generic/frame/3/bli_l3_tapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_thrinfo.c", "target": "obj/generic/frame/3/bli_l3_thrinfo.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_ukr_fpa.c", "target": "obj/generic/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/generic/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_ukr_tapi.c", "target": "obj/generic/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/generic/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/generic/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_blk_var3.c", "target": "obj/generic/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/generic/frame/3/gemm/bli_gemm_cntl.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/generic/frame/3/gemm/bli_gemm_front.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/generic/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/generic/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_md.c", "target": "obj/generic/frame/3/gemm/bli_gemm_md.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": "obj/generic/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/generic/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/generic/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": "obj/generic/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/generic/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "clang", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/generic/frame/3/hemm/bli_hemm_front.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/symm/bli_symm_front.c", "target": "obj/generic/frame/3/symm/bli_symm_front.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/generic/frame/3/trmm/bli_trmm_front.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": "obj/generic/frame/3/trmm/bli_trmm_ll_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/generic/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", 
"source": "frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": "obj/generic/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/generic/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/generic/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/generic/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": "obj/generic/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/generic/frame/3/trsm/bli_trsm_blk_var3.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": "obj/generic/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_front.c", "target": "obj/generic/frame/3/trsm/bli_trsm_front.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": 
"obj/generic/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/generic/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_apool.c", "target": "obj/generic/frame/base/bli_apool.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_arch.c", "target": "obj/generic/frame/base/bli_arch.o", "flags": ["-O2", 
"-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_array.c", "target": "obj/generic/frame/base/bli_array.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_blksz.c", "target": "obj/generic/frame/base/bli_blksz.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_check.c", "target": "obj/generic/frame/base/bli_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_clock.c", "target": "obj/generic/frame/base/bli_clock.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_cntl.c", "target": "obj/generic/frame/base/bli_cntl.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_cntx.c", "target": "obj/generic/frame/base/bli_cntx.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_const.c", "target": "obj/generic/frame/base/bli_const.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_cpuid.c", "target": "obj/generic/frame/base/bli_cpuid.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_env.c", "target": "obj/generic/frame/base/bli_env.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_error.c", "target": "obj/generic/frame/base/bli_error.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"frame/base/bli_func.c", "target": "obj/generic/frame/base/bli_func.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_getopt.c", "target": "obj/generic/frame/base/bli_getopt.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_gks.c", "target": "obj/generic/frame/base/bli_gks.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_ind.c", "target": "obj/generic/frame/base/bli_ind.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_info.c", "target": "obj/generic/frame/base/bli_info.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_init.c", "target": "obj/generic/frame/base/bli_init.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_machval.c", "target": "obj/generic/frame/base/bli_machval.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_malloc.c", "target": "obj/generic/frame/base/bli_malloc.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_mbool.c", "target": "obj/generic/frame/base/bli_mbool.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_memsys.c", "target": "obj/generic/frame/base/bli_memsys.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_obj.c", "target": "obj/generic/frame/base/bli_obj.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_obj_scalar.c", "target": "obj/generic/frame/base/bli_obj_scalar.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_pack.c", "target": "obj/generic/frame/base/bli_pack.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_param_map.c", "target": "obj/generic/frame/base/bli_param_map.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_part.c", "target": "obj/generic/frame/base/bli_part.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_pba.c", "target": "obj/generic/frame/base/bli_pba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_pool.c", "target": 
"obj/generic/frame/base/bli_pool.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_prune.c", "target": "obj/generic/frame/base/bli_prune.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_query.c", "target": "obj/generic/frame/base/bli_query.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_rntm.c", "target": "obj/generic/frame/base/bli_rntm.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_sba.c", "target": "obj/generic/frame/base/bli_sba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_setgetijm.c", "target": "obj/generic/frame/base/bli_setgetijm.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_setgetijv.c", "target": "obj/generic/frame/base/bli_setgetijv.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_setri.c", "target": "obj/generic/frame/base/bli_setri.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_string.c", "target": "obj/generic/frame/base/bli_string.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_winsys.c", "target": "obj/generic/frame/base/bli_winsys.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/cast/bli_castm.c", "target": "obj/generic/frame/base/cast/bli_castm.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/generic/frame/base/cast/bli_castnzm.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/cast/bli_castv.c", "target": "obj/generic/frame/base/cast/bli_castv.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/check/bli_obj_check.c", "target": "obj/generic/frame/base/check/bli_obj_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/check/bli_part_check.c", "target": "obj/generic/frame/base/check/bli_part_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/noopt/bli_dlamch.c", "target": "obj/generic/frame/base/noopt/bli_dlamch.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", 
"source": "frame/base/noopt/bli_lsame.c", "target": "obj/generic/frame/base/noopt/bli_lsame.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/noopt/bli_slamch.c", "target": "obj/generic/frame/base/noopt/bli_slamch.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/proj/bli_projm.c", "target": "obj/generic/frame/base/proj/bli_projm.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/proj/bli_projv.c", "target": "obj/generic/frame/base/proj/bli_projv.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/generic/frame/thread/bli_l3_decor_openmp.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": 
"obj/generic/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_l3_decor_single.c", "target": "obj/generic/frame/thread/bli_l3_decor_single.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": "obj/generic/frame/thread/bli_l3_sup_decor_openmp.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_l3_sup_decor_pthreads.c", "target": "obj/generic/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/generic/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_pthread.c", "target": 
"obj/generic/frame/thread/bli_pthread.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_thrcomm.c", "target": "obj/generic/frame/thread/bli_thrcomm.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/generic/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/generic/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/generic/frame/thread/bli_thrcomm_single.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_thread.c", "target": "obj/generic/frame/thread/bli_thread.o", "flags": ["-O2", 
"-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_thrinfo.c", "target": "obj/generic/frame/thread/bli_thrinfo.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_thrinfo_sup.c", "target": "obj/generic/frame/thread/bli_thrinfo_sup.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_check.c", "target": "obj/generic/frame/util/bli_util_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_fpa.c", "target": "obj/generic/frame/util/bli_util_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_oapi.c", "target": "obj/generic/frame/util/bli_util_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_oapi_ba.c", "target": "obj/generic/frame/util/bli_util_oapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/generic/frame/util/bli_util_oapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_tapi.c", "target": "obj/generic/frame/util/bli_util_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_tapi_ba.c", "target": "obj/generic/frame/util/bli_util_tapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_tapi_ex.c", "target": "obj/generic/frame/util/bli_util_tapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_unb_var1.c", "target": "obj/generic/frame/util/bli_util_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-generic", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/make/windows-x86_64.jsonl000066400000000000000000014247251427272030600222670ustar00rootroot00000000000000{"environment": {"!EXITCODE": "00000000", "ACLOCAL_PATH": "C:\\Program Files\\Git\\mingw64\\share\\aclocal;C:\\Program Files\\Git\\usr\\share\\aclocal", "AGENT_BUILDDIRECTORY": "D:\\a\\1", "AGENT_DISABLELOGPLUGIN_TESTFILEPUBLISHERPLUGIN": "true", "AGENT_DISABLELOGPLUGIN_TESTRESULTLOGPLUGIN": "false", "AGENT_HOMEDIRECTORY": "C:\\agents\\2.202.0", "AGENT_ID": "92", "AGENT_JOBNAME": "JSONL Python38Windows", "AGENT_JOBSTATUS": "Succeeded", "AGENT_LOGTOBLOBSTORAGESERVICE": "true", "AGENT_MACHINENAME": "WIN-CU8INV6766V", "AGENT_NAME": "Hosted Agent", "AGENT_OS": "Windows_NT", "AGENT_OSARCHITECTURE": "X64", "AGENT_READONLYVARIABLES": "true", "AGENT_RETAINDEFAULTENCODING": "false", "AGENT_ROOTDIRECTORY": "D:\\a", "AGENT_SERVEROMDIRECTORY": "C:\\agents\\2.202.0\\externals\\vstsom", "AGENT_TASKRESTRICTIONSENFORCEMENTMODE": "Enabled", "AGENT_TEMPDIRECTORY": "D:\\a\\_temp", "AGENT_TOOLSDIRECTORY": "C:\\hostedtoolcache\\windows", "AGENT_USEWORKSPACEID": "true", "AGENT_VERSION": "2.202.0", "AGENT_WORKFOLDER": "D:\\a", "ALLUSERSPROFILE": "C:\\ProgramData", "ANDROID_HOME": "C:\\Android\\android-sdk", "ANDROID_NDK_HOME": "C:\\Android\\android-sdk\\ndk-bundle", "ANDROID_NDK_LATEST_HOME": "C:\\Android\\android-sdk\\ndk\\23.1.7779620", "ANDROID_NDK_PATH": "C:\\Android\\android-sdk\\ndk-bundle", "ANDROID_NDK_ROOT": "C:\\Android\\android-sdk\\ndk-bundle", "ANDROID_SDK_ROOT": 
"C:\\Android\\android-sdk", "ANT_HOME": "C:\\ProgramData\\chocolatey\\lib\\ant\\tools\\apache-ant-1.10.12", "APPDATA": "C:\\Users\\VssAdministrator\\AppData\\Roaming", "AR": "llvm-ar", "AS": "llvm-as", "AZURE_EXTENSION_DIR": "C:\\Program Files\\Common Files\\AzureCliExtensionDirectory", "AZURE_HTTP_USER_AGENT": "VSTS_116cc368-5c0c-4eb4-bb44-7f3fa5bdce14_build_6_0", "BUILD_ARTIFACTSTAGINGDIRECTORY": "D:\\a\\1\\a", "BUILD_BINARIESDIRECTORY": "D:\\a\\1\\b", "BUILD_BUILDID": "17021", "BUILD_BUILDNUMBER": "20220408.7", "BUILD_BUILDURI": "vstfs:///Build/Build/17021", "BUILD_CONTAINERID": "11809685", "BUILD_DEFINITIONNAME": "explosion.cython-blis", "BUILD_DEFINITIONVERSION": "1", "BUILD_QUEUEDBY": "GitHub", "BUILD_QUEUEDBYID": "38e7e9f7-fc06-4f5a-b6dd-1782f4ef7c25", "BUILD_REASON": "PullRequest", "BUILD_REPOSITORY_GIT_SUBMODULECHECKOUT": "False", "BUILD_REPOSITORY_ID": "explosion/cython-blis", "BUILD_REPOSITORY_LOCALPATH": "D:\\a\\1\\s", "BUILD_REPOSITORY_NAME": "explosion/cython-blis", "BUILD_REPOSITORY_PROVIDER": "GitHub", "BUILD_REPOSITORY_URI": "https://github.com/explosion/cython-blis", "BUILD_REQUESTEDFOR": "GitHub", "BUILD_REQUESTEDFOREMAIL": "", "BUILD_REQUESTEDFORID": "38e7e9f7-fc06-4f5a-b6dd-1782f4ef7c25", "BUILD_SOURCEBRANCH": "refs/pull/69/merge", "BUILD_SOURCEBRANCHNAME": "merge", "BUILD_SOURCESDIRECTORY": "D:\\a\\1\\s", "BUILD_SOURCEVERSION": "273ec162fa5f042b5d946638cedd954583ff8111", "BUILD_SOURCEVERSIONAUTHOR": "Dani\u00ebl de Kok", "BUILD_SOURCEVERSIONMESSAGE": "Merge 1de7a1931422b892af086ce69604e7e3459e9f8e into 6daabf0c925bfe67f7d87874ce014eb3212711e7", "BUILD_STAGINGDIRECTORY": "D:\\a\\1\\a", "CABAL_DIR": "C:\\cabal", "CC": "clang", "COBERTURA_HOME": "C:\\cobertura-2.1.1", "COMMONPROGRAMFILES": "C:\\Program Files\\Common Files", "COMMON_TESTRESULTSDIRECTORY": "D:\\a\\1\\TestResults", "COMPUTERNAME": "WIN-CU8INV6766V", "COMSPEC": "C:\\Windows\\system32\\cmd.exe", "CONDA": "C:\\Miniconda", "CONFIG_SITE": "C:/Program Files/Git/etc/config.site", 
"CHOCOLATEYINSTALL": "C:\\ProgramData\\chocolatey", "CHROMEWEBDRIVER": "C:\\SeleniumWebDrivers\\ChromeDriver", "COMMONPROGRAMFILES(X86)": "C:\\Program Files (x86)\\Common Files", "COMMONPROGRAMW6432": "C:\\Program Files\\Common Files", "DISPLAY": "needs-to-be-defined", "DOTNET_MULTILEVEL_LOOKUP": "0", "DRIVERDATA": "C:\\Windows\\System32\\Drivers\\DriverData", "EXEPATH": "C:\\Program Files\\Git\\bin", "EDGEWEBDRIVER": "C:\\SeleniumWebDrivers\\EdgeDriver", "GCM_INTERACTIVE": "Never", "GHCUP_INSTALL_BASE_PREFIX": "C:\\", "GHCUP_MSYS2": "C:\\msys64", "GIT_TERMINAL_PROMPT": "0", "GOROOT_1_15_X64": "C:\\hostedtoolcache\\windows\\go\\1.15.15\\x64", "GOROOT_1_16_X64": "C:\\hostedtoolcache\\windows\\go\\1.16.15\\x64", "GOROOT_1_17_X64": "C:\\hostedtoolcache\\windows\\go\\1.17.8\\x64", "GOROOT_1_18_X64": "C:\\hostedtoolcache\\windows\\go\\1.18.0\\x64", "GRADLE_HOME": "C:\\ProgramData\\chocolatey\\lib\\gradle\\tools\\gradle-7.4", "GECKOWEBDRIVER": "C:\\SeleniumWebDrivers\\GeckoDriver", "HOME": "C:\\Users\\VssAdministrator", "HOMEDRIVE": "C:", "HOMEPATH": "\\Users\\VssAdministrator", "HOSTNAME": "WIN-CU8INV6766V", "IEWEBDRIVER": "C:\\SeleniumWebDrivers\\IEDriver", "IMAGENAME": "windows-latest", "INFOPATH": "C:\\Program Files\\Git\\usr\\local\\info;C:\\Program Files\\Git\\usr\\share\\info;C:\\Program Files\\Git\\usr\\info;C:\\Program Files\\Git\\share\\info", "IMAGEOS": "win22", "IMAGEVERSION": "20220330.1", "JAVA_HOME": "C:\\hostedtoolcache\\windows\\Java_Temurin-Hotspot_jdk\\8.0.322-6\\x64", "JAVA_HOME_11_X64": "C:\\hostedtoolcache\\windows\\Java_Temurin-Hotspot_jdk\\11.0.14-101\\x64", "JAVA_HOME_17_X64": "C:\\hostedtoolcache\\windows\\Java_Temurin-Hotspot_jdk\\17.0.2-8\\x64", "JAVA_HOME_8_X64": "C:\\hostedtoolcache\\windows\\Java_Temurin-Hotspot_jdk\\8.0.322-6\\x64", "LANG": "en_US.UTF-8", "LOCALAPPDATA": "C:\\Users\\VssAdministrator\\AppData\\Local", "LOGONSERVER": "\\\\WIN-CU8INV6766V", "M2": "C:\\ProgramData\\chocolatey\\lib\\maven\\apache-maven-3.8.5\\bin", "M2_REPO": 
"C:\\ProgramData\\m2", "MANPATH": "C:\\Program Files\\Git\\mingw64\\local\\man;C:\\Program Files\\Git\\mingw64\\share\\man;C:\\Program Files\\Git\\usr\\local\\man;C:\\Program Files\\Git\\usr\\share\\man;C:\\Program Files\\Git\\usr\\man;C:\\Program Files\\Git\\share\\man", "MAVEN_OPTS": "-Xms256m", "MINGW_CHOST": "x86_64-w64-mingw32", "MINGW_PACKAGE_PREFIX": "mingw-w64-x86_64", "MINGW_PREFIX": "C:/Program Files/Git/mingw64", "MSDEPLOY_HTTP_USER_AGENT": "VSTS_116cc368-5c0c-4eb4-bb44-7f3fa5bdce14_build_6_0", "MSYSTEM": "MINGW64", "MSYSTEM_CARCH": "x86_64", "MSYSTEM_CHOST": "x86_64-w64-mingw32", "MSYSTEM_PREFIX": "C:/Program Files/Git/mingw64", "MONAGENTCLIENTLOCATION": "C:\\Packages\\Plugins\\Microsoft.Azure.Geneva.GenevaMonitoring\\2.35.0.2\\Monitoring\\Agent", "NUMBER_OF_PROCESSORS": "4", "OLDPWD": "D:/a/1/s", "ORIGINAL_PATH": "C:\\Program Files\\Git\\mingw64\\bin;C:\\Program Files\\Git\\usr\\bin;C:\\Users\\VssAdministrator\\bin;C:\\Program Files\\LLVM\\bin;C:\\Users\\VssAdministrator\\AppData\\Roaming\\Python\\Python38\\Scripts;C:\\hostedtoolcache\\windows\\Python\\3.8.10\\x64\\Scripts;C:\\hostedtoolcache\\windows\\Python\\3.8.10\\x64;C:\\agents\\2.202.0\\externals\\git\\cmd;C:\\agents\\2.202.0\\externals\\git\\mingw64\\bin;C:\\Program Files\\MongoDB\\Server\\5.0\\bin;C:\\aliyun-cli;C:\\vcpkg;C:\\Program Files (x86)\\NSIS;C:\\tools\\zstd;C:\\Program Files\\Mercurial;C:\\hostedtoolcache\\windows\\stack\\2.7.5\\x64;C:\\cabal\\bin;C:\\ghcup\\bin;C:\\tools\\ghc-9.2.2\\bin;C:\\Program Files\\dotnet;C:\\mysql\\bin;C:\\Program Files\\R\\R-4.1.3\\bin\\x64;C:\\SeleniumWebDrivers\\GeckoDriver;C:\\Program Files (x86)\\sbt\\bin;C:\\Program Files (x86)\\GitHub CLI;C:\\Program Files\\Git\\usr\\bin;C:\\Program Files 
(x86)\\pipx_bin;C:\\hostedtoolcache\\windows\\go\\1.16.15\\x64\\bin;C:\\hostedtoolcache\\windows\\Python\\3.9.12\\x64\\Scripts;C:\\hostedtoolcache\\windows\\Python\\3.9.12\\x64;C:\\hostedtoolcache\\windows\\Ruby\\3.0.3\\x64\\bin;C:\\tools\\kotlinc\\bin;C:\\hostedtoolcache\\windows\\Java_Temurin-Hotspot_jdk\\8.0.322-6\\x64\\bin;C:\\npm\\prefix;C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\CLI2\\wbin;C:\\ProgramData\\kind;C:\\Program Files\\Microsoft\\jdk-11.0.12.7-hotspot\\bin;C:\\Windows\\system32;C:\\Windows;C:\\Windows\\System32\\Wbem;C:\\Windows\\System32\\WindowsPowerShell\\v1.0;C:\\Windows\\System32\\OpenSSH;C:\\Program Files\\dotnet;C:\\ProgramData\\Chocolatey\\bin;C:\\Program Files\\Docker;C:\\Program Files\\PowerShell\\7;C:\\Program Files\\Microsoft\\Web Platform Installer;C:\\Program Files\\Microsoft SQL Server\\Client SDK\\ODBC\\170\\Tools\\Binn;C:\\Program Files\\Microsoft SQL Server\\150\\Tools\\Binn;C:\\Program Files\\nodejs;C:\\Program Files\\OpenSSL\\bin;C:\\Strawberry\\c\\bin;C:\\Strawberry\\perl\\site\\bin;C:\\Strawberry\\perl\\bin;C:\\ProgramData\\chocolatey\\lib\\pulumi\\tools\\Pulumi\\bin;C:\\Program Files\\TortoiseSVN\\bin;C:\\Program Files\\CMake\\bin;C:\\ProgramData\\chocolatey\\lib\\maven\\apache-maven-3.8.5\\bin;C:\\Program Files\\Microsoft Service Fabric\\bin\\Fabric\\Fabric.Code;C:\\Program Files\\Microsoft SDKs\\Service Fabric\\Tools\\ServiceFabricLocalClusterManager;C:\\Program Files\\Git\\cmd;C:\\Program Files\\Git\\mingw64\\bin;C:\\Program Files\\Git\\usr\\bin;C:\\Program Files\\GitHub CLI;C:\\tools\\php;C:\\Program Files (x86)\\sbt\\bin;C:\\SeleniumWebDrivers\\ChromeDriver;C:\\SeleniumWebDrivers\\EdgeDriver;C:\\Program Files\\Amazon\\AWSCLIV2;C:\\Program Files\\Amazon\\SessionManagerPlugin\\bin;C:\\Program Files\\Amazon\\AWSSAMCLI\\bin;C:\\Program Files\\Microsoft SQL Server\\130\\Tools\\Binn;C:\\Program 
Files\\LLVM\\bin;C:\\Users\\VssAdministrator\\.dotnet\\tools;C:\\Users\\VssAdministrator\\.cargo\\bin;C:\\Users\\VssAdministrator\\AppData\\Local\\Microsoft\\WindowsApps", "ORIGINAL_TEMP": "C:/Users/VSSADM~1/AppData/Local/Temp", "ORIGINAL_TMP": "C:/Users/VSSADM~1/AppData/Local/Temp", "OS": "windows", "PATH": "C:\\Users\\VssAdministrator\\bin;C:\\Program Files\\Git\\mingw64\\bin;C:\\Program Files\\Git\\usr\\local\\bin;C:\\Program Files\\Git\\usr\\bin;C:\\Program Files\\Git\\usr\\bin;C:\\Program Files\\Git\\mingw64\\bin;C:\\Program Files\\Git\\usr\\bin;C:\\Users\\VssAdministrator\\bin;C:\\Program Files\\LLVM\\bin;C:\\Users\\VssAdministrator\\AppData\\Roaming\\Python\\Python38\\Scripts;C:\\hostedtoolcache\\windows\\Python\\3.8.10\\x64\\Scripts;C:\\hostedtoolcache\\windows\\Python\\3.8.10\\x64;C:\\agents\\2.202.0\\externals\\git\\cmd;C:\\agents\\2.202.0\\externals\\git\\mingw64\\bin;C:\\Program Files\\MongoDB\\Server\\5.0\\bin;C:\\aliyun-cli;C:\\vcpkg;C:\\Program Files (x86)\\NSIS;C:\\tools\\zstd;C:\\Program Files\\Mercurial;C:\\hostedtoolcache\\windows\\stack\\2.7.5\\x64;C:\\cabal\\bin;C:\\ghcup\\bin;C:\\tools\\ghc-9.2.2\\bin;C:\\Program Files\\dotnet;C:\\mysql\\bin;C:\\Program Files\\R\\R-4.1.3\\bin\\x64;C:\\SeleniumWebDrivers\\GeckoDriver;C:\\Program Files (x86)\\sbt\\bin;C:\\Program Files (x86)\\GitHub CLI;C:\\Program Files\\Git\\usr\\bin;C:\\Program Files (x86)\\pipx_bin;C:\\hostedtoolcache\\windows\\go\\1.16.15\\x64\\bin;C:\\hostedtoolcache\\windows\\Python\\3.9.12\\x64\\Scripts;C:\\hostedtoolcache\\windows\\Python\\3.9.12\\x64;C:\\hostedtoolcache\\windows\\Ruby\\3.0.3\\x64\\bin;C:\\tools\\kotlinc\\bin;C:\\hostedtoolcache\\windows\\Java_Temurin-Hotspot_jdk\\8.0.322-6\\x64\\bin;C:\\npm\\prefix;C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\CLI2\\wbin;C:\\ProgramData\\kind;C:\\Program 
Files\\Microsoft\\jdk-11.0.12.7-hotspot\\bin;C:\\Windows\\system32;C:\\Windows;C:\\Windows\\System32\\Wbem;C:\\Windows\\System32\\WindowsPowerShell\\v1.0;C:\\Windows\\System32\\OpenSSH;C:\\Program Files\\dotnet;C:\\ProgramData\\Chocolatey\\bin;C:\\Program Files\\Docker;C:\\Program Files\\PowerShell\\7;C:\\Program Files\\Microsoft\\Web Platform Installer;C:\\Program Files\\Microsoft SQL Server\\Client SDK\\ODBC\\170\\Tools\\Binn;C:\\Program Files\\Microsoft SQL Server\\150\\Tools\\Binn;C:\\Program Files\\nodejs;C:\\Program Files\\OpenSSL\\bin;C:\\Strawberry\\c\\bin;C:\\Strawberry\\perl\\site\\bin;C:\\Strawberry\\perl\\bin;C:\\ProgramData\\chocolatey\\lib\\pulumi\\tools\\Pulumi\\bin;C:\\Program Files\\TortoiseSVN\\bin;C:\\Program Files\\CMake\\bin;C:\\ProgramData\\chocolatey\\lib\\maven\\apache-maven-3.8.5\\bin;C:\\Program Files\\Microsoft Service Fabric\\bin\\Fabric\\Fabric.Code;C:\\Program Files\\Microsoft SDKs\\Service Fabric\\Tools\\ServiceFabricLocalClusterManager;C:\\Program Files\\Git\\cmd;C:\\Program Files\\Git\\mingw64\\bin;C:\\Program Files\\Git\\usr\\bin;C:\\Program Files\\GitHub CLI;C:\\tools\\php;C:\\Program Files (x86)\\sbt\\bin;C:\\SeleniumWebDrivers\\ChromeDriver;C:\\SeleniumWebDrivers\\EdgeDriver;C:\\Program Files\\Amazon\\AWSCLIV2;C:\\Program Files\\Amazon\\SessionManagerPlugin\\bin;C:\\Program Files\\Amazon\\AWSSAMCLI\\bin;C:\\Program Files\\Microsoft SQL Server\\130\\Tools\\Binn;C:\\Program Files\\LLVM\\bin;C:\\Users\\VssAdministrator\\.dotnet\\tools;C:\\Users\\VssAdministrator\\.cargo\\bin;C:\\Users\\VssAdministrator\\AppData\\Local\\Microsoft\\WindowsApps;C:\\Program Files\\Git\\usr\\bin\\vendor_perl;C:\\Program Files\\Git\\usr\\bin\\core_perl", "PATHEXT": ".COM;.EXE;.BAT;.CMD;.VBS;.VBE;.JS;.JSE;.WSF;.WSH;.MSC;.CPL", "PGBIN": "C:\\Program Files\\PostgreSQL\\14\\bin", "PGDATA": "C:\\Program Files\\PostgreSQL\\14\\data", "PGPASSWORD": "root", "PGROOT": "C:\\Program Files\\PostgreSQL\\14", "PGUSER": "postgres", "PHPROOT": "c:\\tools\\php", 
"PIPELINE_WORKSPACE": "D:\\a\\1", "PIPX_BIN_DIR": "C:\\Program Files (x86)\\pipx_bin", "PIPX_HOME": "C:\\Program Files (x86)\\pipx", "PKG_CONFIG_PATH": "C:\\Program Files\\Git\\mingw64\\lib\\pkgconfig;C:\\Program Files\\Git\\mingw64\\share\\pkgconfig", "PLINK_PROTOCOL": "ssh", "POWERSHELL_DISTRIBUTION_CHANNEL": "Azure-DevOps-win22", "POWERSHELL_UPDATECHECK": "Off", "PROCESSOR_ARCHITECTURE": "AMD64", "PROCESSOR_IDENTIFIER": "Intel64 Family 6 Model 79 Stepping 1, GenuineIntel", "PROCESSOR_LEVEL": "6", "PROCESSOR_REVISION": "4f01", "PROGRAMFILES": "C:\\Program Files", "PROMPT": "$P$G", "PSEXECUTIONPOLICYPREFERENCE": "Unrestricted", "PSMODULEPATH": "C:\\Users\\VssAdministrator\\Documents\\WindowsPowerShell\\Modules;C:\\\\Modules\\azurerm_2.1.0;C:\\\\Modules\\azure_2.1.0;C:\\Users\\packer\\Documents\\WindowsPowerShell\\Modules;C:\\Program Files\\WindowsPowerShell\\Modules;C:\\Windows\\system32\\WindowsPowerShell\\v1.0\\Modules;C:\\Program Files\\Microsoft SQL Server\\130\\Tools\\PowerShell\\Modules\\", "PUBLIC": "C:\\Users\\Public", "PWD": "D:/a/1/s/flame-blis", "PYTHON_VERSION": "3.8", "PROGRAMDATA": "C:\\ProgramData", "PROGRAMFILES(X86)": "C:\\Program Files (x86)", "PROGRAMW6432": "C:\\Program Files", "RANLIB": "echo", "RESOURCES_TRIGGERINGALIAS": "", "RESOURCES_TRIGGERINGCATEGORY": "", "RTOOLS40_HOME": "C:\\rtools40", "RUNNER_TOOLSDIRECTORY": "C:\\hostedtoolcache\\windows", "SBT_HOME": "C:\\Program Files (x86)\\sbt\\", "SELENIUM_JAR_PATH": "C:\\selenium\\selenium-server.jar", "SHELL": "C:\\Program Files\\Git\\usr\\bin\\bash.exe", "SHLVL": "2", "SSH_ASKPASS": "C:/Program Files/Git/mingw64/bin/git-askpass.exe", "SYSTEM": "build", "SYSTEMDRIVE": "C:", "SYSTEMROOT": "C:\\Windows", "SYSTEM_ARTIFACTSDIRECTORY": "D:\\a\\1\\a", "SYSTEM_COLLECTIONID": "116cc368-5c0c-4eb4-bb44-7f3fa5bdce14", "SYSTEM_COLLECTIONURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_CULTURE": "en-US", "SYSTEM_DEBUG": "false", "SYSTEM_DEFAULTWORKINGDIRECTORY": "D:\\a\\1\\s", "SYSTEM_DEFINITIONID": 
"6", "SYSTEM_DEFINITIONNAME": "explosion.cython-blis", "SYSTEM_ENABLEACCESSTOKEN": "SecretVariable", "SYSTEM_HOSTTYPE": "build", "SYSTEM_ISSCHEDULED": "False", "SYSTEM_JOBATTEMPT": "1", "SYSTEM_JOBDISPLAYNAME": "JSONL Python38Windows", "SYSTEM_JOBID": "efb31c5a-ec83-5597-c79c-3c04d0eba6be", "SYSTEM_JOBIDENTIFIER": "JSONL.Python38Windows", "SYSTEM_JOBNAME": "Python38Windows", "SYSTEM_JOBPARALLELISMTAG": "Public", "SYSTEM_JOBPOSITIONINPHASE": "1", "SYSTEM_JOBTIMEOUT": "60", "SYSTEM_PARALLELEXECUTIONTYPE": "MultiConfiguration", "SYSTEM_PHASEATTEMPT": "1", "SYSTEM_PHASEDISPLAYNAME": "JSONL", "SYSTEM_PHASEID": "ecb95708-c2a5-5456-f379-96cd8090c2a6", "SYSTEM_PHASENAME": "JSONL", "SYSTEM_PIPELINESTARTTIME": "2022-04-08 12:45:25+00:00", "SYSTEM_PLANID": "4bcc5172-4f8e-4a4f-b00f-1b3d5e2fe9dd", "SYSTEM_POSTLINESSPEED": "10000", "SYSTEM_PULLREQUEST_ISFORK": "True", "SYSTEM_PULLREQUEST_MERGEDAT": "", "SYSTEM_PULLREQUEST_PULLREQUESTID": "899994381", "SYSTEM_PULLREQUEST_PULLREQUESTNUMBER": "69", "SYSTEM_PULLREQUEST_SOURCEBRANCH": "update-to-blis-0.9.0", "SYSTEM_PULLREQUEST_SOURCECOMMITID": "1de7a1931422b892af086ce69604e7e3459e9f8e", "SYSTEM_PULLREQUEST_SOURCEREPOSITORYURI": "https://github.com/explosion/cython-blis", "SYSTEM_PULLREQUEST_TARGETBRANCH": "master", "SYSTEM_RESTRICTSECRETS": "True", "SYSTEM_SERVERTYPE": "Hosted", "SYSTEM_STAGEATTEMPT": "1", "SYSTEM_STAGEDISPLAYNAME": "__default", "SYSTEM_STAGEID": "96ac2280-8cb4-5df5-99de-dd2da759617d", "SYSTEM_STAGENAME": "__default", "SYSTEM_TASKDEFINITIONSURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_TASKDISPLAYNAME": "Generate JSONL (Windows)", "SYSTEM_TASKINSTANCEID": "4bae54ba-656f-5414-04c0-0cf207e9f5bd", "SYSTEM_TASKINSTANCENAME": "CmdLine5", "SYSTEM_TEAMFOUNDATIONCOLLECTIONURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_TEAMFOUNDATIONSERVERURI": "https://dev.azure.com/explosion-ai/", "SYSTEM_TEAMPROJECT": "Public", "SYSTEM_TEAMPROJECTID": "5c6613e9-6ccf-48bd-81de-dbc3b0a6f957", "SYSTEM_TIMELINEID": 
"4bcc5172-4f8e-4a4f-b00f-1b3d5e2fe9dd", "SYSTEM_TOTALJOBSINPHASE": "1", "SYSTEM_WORKFOLDER": "D:\\a", "TASK_DISPLAYNAME": "Generate JSONL (Windows)", "TASK_SKIPTRANSLATORFORCHECKOUT": "False", "TEMP": "C:\\Users\\VSSADM~1\\AppData\\Local\\Temp", "TERM": "xterm-256color", "TF_BUILD": "True", "TMP": "C:\\Users\\VSSADM~1\\AppData\\Local\\Temp", "TMPDIR": "C:\\Users\\VSSADM~1\\AppData\\Local\\Temp", "USEPYTHONVERSION_PYTHONLOCATION": "C:\\hostedtoolcache\\windows\\Python\\3.8.10\\x64", "USERDOMAIN": "WIN-CU8INV6766V", "USERDOMAIN_ROAMINGPROFILE": "WIN-CU8INV6766V", "USERNAME": "VssAdministrator", "USERPROFILE": "C:\\Users\\VssAdministrator", "VCPKG_INSTALLATION_ROOT": "C:\\vcpkg", "VSTS_AGENT_PERFLOG": "C:\\agents\\perflog", "VSTS_PROCESS_LOOKUP_ID": "vsts_175962b3-f397-42b7-b557-a072c6b9de45", "VSTS_SECRET_VARIABLES": "", "WINDIR": "C:\\Windows", "WIX": "C:\\Program Files (x86)\\WiX Toolset v3.11\\", "_": "C:/hostedtoolcache/windows/Python/3.8.10/x64/python", "AGENT.JOBSTATUS": "Succeeded", "NPM_CONFIG_PREFIX": "C:\\npm\\prefix"}} {"compiler": "clang", "source": "config/bulldozer/bli_cntx_init_bulldozer.c", "target": "obj/x86_64/config/bulldozer/bli_cntx_init_bulldozer.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "config/excavator/bli_cntx_init_excavator.c", "target": "obj/x86_64/config/excavator/bli_cntx_init_excavator.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "config/generic/bli_cntx_init_generic.c", "target": "obj/x86_64/config/generic/bli_cntx_init_generic.o", 
"flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "config/haswell/bli_cntx_init_haswell.c", "target": "obj/x86_64/config/haswell/bli_cntx_init_haswell.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "config/knl/bli_cntx_init_knl.c", "target": "obj/x86_64/config/knl/bli_cntx_init_knl.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "config/penryn/bli_cntx_init_penryn.c", "target": "obj/x86_64/config/penryn/bli_cntx_init_penryn.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "config/piledriver/bli_cntx_init_piledriver.c", "target": "obj/x86_64/config/piledriver/bli_cntx_init_piledriver.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "config/sandybridge/bli_cntx_init_sandybridge.c", "target": 
"obj/x86_64/config/sandybridge/bli_cntx_init_sandybridge.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "config/skx/bli_cntx_init_skx.c", "target": "obj/x86_64/config/skx/bli_cntx_init_skx.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "config/steamroller/bli_cntx_init_steamroller.c", "target": "obj/x86_64/config/steamroller/bli_cntx_init_steamroller.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "config/zen/bli_cntx_init_zen.c", "target": "obj/x86_64/config/zen/bli_cntx_init_zen.o", "flags": ["-O2", "-fomit-frame-pointer", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "config/zen2/bli_cntx_init_zen2.c", "target": "obj/x86_64/config/zen2/bli_cntx_init_zen2.o", "flags": ["-O2", "-fomit-frame-pointer", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"config/zen3/bli_cntx_init_zen3.c", "target": "obj/x86_64/config/zen3/bli_cntx_init_zen3.o", "flags": ["-O3", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c", "target": "obj/x86_64/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/skx/3/bli_dgemm_skx_asm_16x14.c", "target": "obj/x86_64/kernels/skx/3/bli_dgemm_skx_asm_16x14.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c", "target": "obj/x86_64/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx512f", "-mavx512dq", "-mavx512bw", "-mavx512vl", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"kernels/knl/1m/bli_dpackm_knl_asm_24x8.c", "target": "obj/x86_64/kernels/knl/1m/bli_dpackm_knl_asm_24x8.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/knl/1m/bli_spackm_knl_asm_24x16.c", "target": "obj/x86_64/kernels/knl/1m/bli_spackm_knl_asm_24x16.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/knl/3/bli_dgemm_knl_asm_24x8.c", "target": "obj/x86_64/kernels/knl/3/bli_dgemm_knl_asm_24x8.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/knl/3/bli_sgemm_knl_asm_24x16.c", "target": "obj/x86_64/kernels/knl/3/bli_sgemm_knl_asm_24x16.o", "flags": ["-O2", "-O3", "-mavx512f", "-mavx512pf", "-mfpmath=sse", "-march=knl", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c", "target": 
"obj/x86_64/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c", "target": "obj/x86_64/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/penryn/1/bli_axpyv_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1/bli_axpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/penryn/1/bli_dotv_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1/bli_dotv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/penryn/1f/bli_axpy2v_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1f/bli_axpy2v_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-std=c99"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/penryn/1f/bli_axpyf_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1f/bli_axpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/penryn/1f/bli_dotaxpyv_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1f/bli_dotaxpyv_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/penryn/1f/bli_dotxf_penryn_int.c", "target": "obj/x86_64/kernels/penryn/1f/bli_dotxf_penryn_int.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c", "target": "obj/x86_64/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c", "target": "obj/x86_64/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c", "target": "obj/x86_64/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"clang", "source": "kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c", "target": "obj/x86_64/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c", "target": "obj/x86_64/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c", "target": "obj/x86_64/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c", "target": "obj/x86_64/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "clang", "source": "kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c", "target": "obj/x86_64/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c", "target": "obj/x86_64/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", 
"-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c", "target": "obj/x86_64/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c", "target": 
"obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "clang", "source": "kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c", "target": "obj/x86_64/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/1/bli_amaxv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_amaxv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/1/bli_axpyv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_axpyv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/1/bli_axpyv_zen_int10.c", "target": "obj/x86_64/kernels/zen/1/bli_axpyv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"clang", "source": "kernels/zen/1/bli_copyv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_copyv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/1/bli_dotv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_dotv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/1/bli_dotv_zen_int10.c", "target": "obj/x86_64/kernels/zen/1/bli_dotv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/1/bli_dotxv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_dotxv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/1/bli_scalv_zen_int.c", "target": 
"obj/x86_64/kernels/zen/1/bli_scalv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/1/bli_scalv_zen_int10.c", "target": "obj/x86_64/kernels/zen/1/bli_scalv_zen_int10.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/1/bli_setv_zen_int.c", "target": "obj/x86_64/kernels/zen/1/bli_setv_zen_int.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/1/bli_swapv_zen_int8.c", "target": "obj/x86_64/kernels/zen/1/bli_swapv_zen_int8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/1f/bli_axpyf_zen_int_4.c", "target": "obj/x86_64/kernels/zen/1f/bli_axpyf_zen_int_4.o", "flags": ["-O2", "-fomit-frame-pointer", 
"-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/1f/bli_axpyf_zen_int_5.c", "target": "obj/x86_64/kernels/zen/1f/bli_axpyf_zen_int_5.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/1f/bli_axpyf_zen_int_8.c", "target": "obj/x86_64/kernels/zen/1f/bli_axpyf_zen_int_8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/1f/bli_dotxf_zen_int_8.c", "target": "obj/x86_64/kernels/zen/1f/bli_dotxf_zen_int_8.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/3/bli_gemm_small.c", "target": "obj/x86_64/kernels/zen/3/bli_gemm_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/3/bli_gemmt_small.c", "target": "obj/x86_64/kernels/zen/3/bli_gemmt_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/zen/3/bli_trsm_small.c", "target": "obj/x86_64/kernels/zen/3/bli_trsm_small.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-march=znver1", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c", "target": "obj/x86_64/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c", "target": "obj/x86_64/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-std=c99"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/skx/bli_cntx_skx_ref.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_addv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_amaxv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_axpbyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_axpyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_copyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_dotv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_dotxv_skx_ref.o", "flags": ["-O2", "-O3", 
"-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_invertv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_scal2v_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_scalv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", 
"source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_setv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_subv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_swapv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1/bli_xpbyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/skx/1f/bli_axpy2v_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/skx/1f/bli_axpyf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/skx/1f/bli_dotaxpyv_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/skx/1f/bli_dotxaxpyf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", 
"-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/skx/1f/bli_dotxf_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/skx/1m/bli_packm_cxk_1er_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/skx/1m/bli_packm_cxk_bb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": 
"obj/x86_64/ref_kernels/skx/1m/bli_packm_cxk_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/skx/1m/bli_unpackm_cxk_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bli_gemm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bli_gemmsup_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bli_gemmtrsm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bli_trsm_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bb/bli_gemmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bb/bli_gemmtrsmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/skx/3/bb/bli_trsmbb_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/skx/ind/bli_gemm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/skx/ind/bli_gemmtrsm1m_skx_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/skx/ind/bli_trsm1m_skx_ref.o", 
"flags": ["-O2", "-O3", "-fomit-frame-pointer", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=skx", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/knl/bli_cntx_knl_ref.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_addv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_amaxv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_axpbyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_axpyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_copyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_dotv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_dotxv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_invertv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_scal2v_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_scalv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_setv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_subv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_swapv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1/bli_xpbyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/knl/1f/bli_axpy2v_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/knl/1f/bli_axpyf_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/knl/1f/bli_dotaxpyv_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/knl/1f/bli_dotxaxpyf_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/knl/1f/bli_dotxf_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/knl/1m/bli_packm_cxk_1er_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/knl/1m/bli_packm_cxk_bb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/knl/1m/bli_packm_cxk_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/knl/1m/bli_unpackm_cxk_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bli_gemm_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bli_gemmsup_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bli_gemmtrsm_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bli_trsm_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bb/bli_gemmbb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bb/bli_gemmtrsmbb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/knl/3/bb/bli_trsmbb_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/knl/ind/bli_gemm1m_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/knl/ind/bli_gemmtrsm1m_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/knl/ind/bli_trsm1m_knl_ref.o", "flags": ["-O2", "-O3", "-march=knl", "-mno-avx512f", "-mno-avx512pf", "-mno-avx512er", "-mno-avx512cd", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=knl", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/haswell/bli_cntx_haswell_ref.o", "flags": ["-O2", "-std=c99"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_addv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_amaxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_axpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": 
"clang", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_axpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_copyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_dotv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_dotxv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_invertv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_scal2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_scalv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_setv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_subv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_swapv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1/bli_xpbyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", 
"-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1f/bli_axpy2v_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1f/bli_axpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1f/bli_dotaxpyv_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1f/bli_dotxaxpyf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1f/bli_dotxf_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1m/bli_packm_cxk_1er_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": 
"obj/x86_64/ref_kernels/haswell/1m/bli_packm_cxk_bb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1m/bli_packm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/haswell/1m/bli_unpackm_cxk_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bli_gemm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", 
"-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bli_gemmsup_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bli_gemmtrsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bli_trsm_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bb/bli_gemmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bb/bli_gemmtrsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/haswell/3/bb/bli_trsmbb_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/haswell/ind/bli_gemm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", 
"-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/haswell/ind/bli_gemmtrsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/haswell/ind/bli_trsm1m_haswell_ref.o", "flags": ["-O2", "-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-march=haswell", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=haswell", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/bli_cntx_sandybridge_ref.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", 
"source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_addv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_amaxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_axpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_axpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_copyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_dotv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_dotxv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_invertv_ref.c", "target": 
"obj/x86_64/ref_kernels/sandybridge/1/bli_invertv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_scal2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_scalv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_setv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_subv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_swapv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1/bli_xpbyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": 
"obj/x86_64/ref_kernels/sandybridge/1f/bli_axpy2v_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1f/bli_axpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1f/bli_dotaxpyv_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1f/bli_dotxaxpyf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1f/bli_dotxf_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1m/bli_packm_cxk_1er_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1m/bli_packm_cxk_bb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": 
"obj/x86_64/ref_kernels/sandybridge/1m/bli_packm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/1m/bli_unpackm_cxk_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bli_gemm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bli_gemmsup_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bli_gemmtrsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bli_trsm_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bb/bli_gemmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": 
"obj/x86_64/ref_kernels/sandybridge/3/bb/bli_gemmtrsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/3/bb/bli_trsmbb_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/ind/bli_gemm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/ind/bli_gemmtrsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/sandybridge/ind/bli_trsm1m_sandybridge_ref.o", "flags": ["-O2", "-O3", "-mavx", "-mfpmath=sse", "-march=sandybridge", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=sandybridge", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/penryn/bli_cntx_penryn_ref.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_addv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_amaxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_axpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_axpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_copyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotv_ref.c", "target": 
"obj/x86_64/ref_kernels/penryn/1/bli_dotv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_dotxv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_invertv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_scal2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_scalv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_setv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_subv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_swapv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", 
"-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1/bli_xpbyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1f/bli_axpy2v_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1f/bli_axpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": 
"obj/x86_64/ref_kernels/penryn/1f/bli_dotaxpyv_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1f/bli_dotxaxpyf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1f/bli_dotxf_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1m/bli_packm_cxk_1er_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1m/bli_packm_cxk_bb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1m/bli_packm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/penryn/1m/bli_unpackm_cxk_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bli_gemm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bli_gemmsup_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bli_gemmtrsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bli_trsm_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bb/bli_gemmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bb/bli_gemmtrsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/penryn/3/bb/bli_trsmbb_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/penryn/ind/bli_gemm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/penryn/ind/bli_gemmtrsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/penryn/ind/bli_trsm1m_penryn_ref.o", "flags": ["-O2", "-O3", "-mssse3", "-mfpmath=sse", "-march=core2", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=penryn", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/zen3/bli_cntx_zen3_ref.o", "flags": ["-O3", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_addv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_amaxv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_axpbyv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_axpyv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_copyv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_dotv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_dotxv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_invertv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_scal2v_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_scalv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_setv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_subv_ref.c", "target": 
"obj/x86_64/ref_kernels/zen3/1/bli_subv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_swapv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1/bli_xpbyv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1f/bli_axpy2v_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1f/bli_axpyf_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1f/bli_dotaxpyv_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1f/bli_dotxaxpyf_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": 
"obj/x86_64/ref_kernels/zen3/1f/bli_dotxf_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1m/bli_packm_cxk_1er_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1m/bli_packm_cxk_bb_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1m/bli_packm_cxk_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen3/1m/bli_unpackm_cxk_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bli_gemm_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bli_gemmsup_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": 
"obj/x86_64/ref_kernels/zen3/3/bli_gemmtrsm_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bli_trsm_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bb/bli_gemmbb_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bb/bli_gemmtrsmbb_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen3/3/bb/bli_trsmbb_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen3/ind/bli_gemm1m_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen3/ind/bli_gemmtrsm1m_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_trsm1m_ref.c", 
"target": "obj/x86_64/ref_kernels/zen3/ind/bli_trsm1m_zen3_ref.o", "flags": ["-O3", "-fomit-frame-pointer", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver3", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen3", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/zen2/bli_cntx_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_addv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_amaxv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_axpbyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_axpyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_copyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_dotv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_dotxv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_invertv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_scal2v_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_scalv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_setv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_subv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_swapv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1/bli_xpbyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1f/bli_axpy2v_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1f/bli_axpyf_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1f/bli_dotaxpyv_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1f/bli_dotxaxpyf_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1f/bli_dotxf_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1m/bli_packm_cxk_1er_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", 
"-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1m/bli_packm_cxk_bb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1m/bli_packm_cxk_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen2/1m/bli_unpackm_cxk_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bli_gemm_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bli_gemmsup_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bli_gemmtrsm_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bli_trsm_zen2_ref.o", "flags": ["-O2", 
"-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bb/bli_gemmbb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bb/bli_gemmtrsmbb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen2/3/bb/bli_trsmbb_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen2/ind/bli_gemm1m_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen2/ind/bli_gemmtrsm1m_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen2/ind/bli_trsm1m_zen2_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver2", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen2", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/bli_cntx_ref.c", "target": 
"obj/x86_64/ref_kernels/zen/bli_cntx_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_addv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_amaxv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_axpbyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_axpyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_copyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_dotv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_dotxv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_invertv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_scal2v_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_scalv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_setv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_subv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_swapv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1/bli_xpbyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", 
"-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/zen/1f/bli_axpy2v_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen/1f/bli_axpyf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/zen/1f/bli_dotaxpyv_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} 
{"compiler": "clang", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/zen/1f/bli_dotxaxpyf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/zen/1f/bli_dotxf_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/zen/1m/bli_packm_cxk_1er_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/zen/1m/bli_packm_cxk_bb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen/1m/bli_packm_cxk_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/zen/1m/bli_unpackm_cxk_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bli_gemm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bli_gemmsup_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bli_gemmtrsm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bli_trsm_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bb/bli_gemmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bb/bli_gemmtrsmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/zen/3/bb/bli_trsmbb_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen/ind/bli_gemm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", 
"-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen/ind/bli_gemmtrsm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/zen/ind/bli_trsm1m_zen_ref.o", "flags": ["-O2", "-fomit-frame-pointer", "-O3", "-mavx2", "-mfma", "-mfpmath=sse", "-funsafe-math-optimizations", "-ffp-contract=fast", "-march=znver1", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=zen", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/excavator/bli_cntx_excavator_ref.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_addv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_amaxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_axpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_axpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_copyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_dotv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_dotxv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_invertv_ref.c", "target": 
"obj/x86_64/ref_kernels/excavator/1/bli_invertv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_scal2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_scalv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_setv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", 
"-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_subv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_swapv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1/bli_xpbyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1f/bli_axpy2v_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1f/bli_axpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1f/bli_dotaxpyv_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1f/bli_dotxaxpyf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1f/bli_dotxf_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1m/bli_packm_cxk_1er_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": 
"obj/x86_64/ref_kernels/excavator/1m/bli_packm_cxk_bb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1m/bli_packm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/excavator/1m/bli_unpackm_cxk_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bli_gemm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", 
"-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bli_gemmsup_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bli_gemmtrsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bli_trsm_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bb/bli_gemmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bb/bli_gemmtrsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/excavator/3/bb/bli_trsmbb_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/excavator/ind/bli_gemm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/excavator/ind/bli_gemmtrsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/excavator/ind/bli_trsm1m_excavator_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver4", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=excavator", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/bli_cntx_ref.c", "target": 
"obj/x86_64/ref_kernels/steamroller/bli_cntx_steamroller_ref.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_addv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_amaxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_axpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_axpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_copyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_dotv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_dotxv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_invertv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_scal2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scalv_ref.c", "target": 
"obj/x86_64/ref_kernels/steamroller/1/bli_scalv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_setv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_subv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_swapv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", 
"-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1/bli_xpbyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1f/bli_axpy2v_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1f/bli_axpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1f/bli_dotaxpyv_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1f/bli_dotxaxpyf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1f/bli_dotxf_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1m/bli_packm_cxk_1er_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1m/bli_packm_cxk_bb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1m/bli_packm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/1m/bli_unpackm_cxk_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bli_gemm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bli_gemmsup_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bli_gemmtrsm_steamroller_ref.o", "flags": ["-O2", "-O3", 
"-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bli_trsm_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bb/bli_gemmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bb/bli_gemmtrsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/3/bb/bli_trsmbb_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/ind/bli_gemm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/ind/bli_gemmtrsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/steamroller/ind/bli_trsm1m_steamroller_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver3", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=steamroller", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/bli_cntx_piledriver_ref.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_addv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_amaxv_piledriver_ref.o", "flags": ["-O2", 
"-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_axpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_axpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_copyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_dotv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_dotxv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_invertv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_scal2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_scalv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_setv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_subv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_swapv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1/bli_xpbyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1f/bli_axpy2v_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", 
"-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1f/bli_axpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1f/bli_dotaxpyv_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1f/bli_dotxaxpyf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1f/bli_dotxf_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1m/bli_packm_cxk_1er_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1m/bli_packm_cxk_bb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], 
"include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1m/bli_packm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/1m/bli_unpackm_cxk_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bli_gemm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", 
"source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bli_gemmsup_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bli_gemmtrsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bli_trsm_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bb/bli_gemmbb_piledriver_ref.o", "flags": ["-O2", "-O3", 
"-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bb/bli_gemmtrsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/3/bb/bli_trsmbb_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/ind/bli_gemm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/ind/bli_gemmtrsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/piledriver/ind/bli_trsm1m_piledriver_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma", "-march=bdver2", "-mno-fma4", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=piledriver", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/bli_cntx_bulldozer_ref.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", 
"source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_addv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_amaxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_axpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_axpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", 
"-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_copyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_dotv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_dotxv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_invertv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_scal2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_scalv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_setv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_subv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_swapv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1/bli_xpbyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", 
"-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1f/bli_axpy2v_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1f/bli_axpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1f/bli_dotaxpyv_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1f/bli_dotxaxpyf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1f/bli_dotxf_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1m/bli_packm_cxk_1er_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", 
"source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1m/bli_packm_cxk_bb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1m/bli_packm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/1m/bli_unpackm_cxk_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemm_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bli_gemm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", 
"-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bli_gemmsup_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bli_gemmtrsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bli_trsm_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bb/bli_gemmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bb/bli_gemmtrsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/3/bb/bli_trsmbb_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/ind/bli_gemm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/ind/bli_gemmtrsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/bulldozer/ind/bli_trsm1m_bulldozer_ref.o", "flags": ["-O2", "-O3", "-mfpmath=sse", "-mavx", "-mfma4", "-march=bdver1", "-mno-tbm", "-mno-xop", "-mno-lwp", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=bulldozer", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/bli_cntx_ref.c", "target": "obj/x86_64/ref_kernels/generic/bli_cntx_generic_ref.o", "flags": ["-O2", 
"-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_addv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_addv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_amaxv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_amaxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpbyv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_axpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_axpyv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_axpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", 
"-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_copyv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_copyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_dotv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_dotxv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_dotxv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_invertv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_invertv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scal2v_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_scal2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_scalv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_scalv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_setv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_setv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_subv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_subv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", 
"-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_swapv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_swapv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1/bli_xpbyv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1/bli_xpbyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpy2v_ref.c", "target": "obj/x86_64/ref_kernels/generic/1f/bli_axpy2v_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_axpyf_ref.c", "target": "obj/x86_64/ref_kernels/generic/1f/bli_axpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", 
"-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotaxpyv_ref.c", "target": "obj/x86_64/ref_kernels/generic/1f/bli_dotaxpyv_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxaxpyf_ref.c", "target": "obj/x86_64/ref_kernels/generic/1f/bli_dotxaxpyf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1f/bli_dotxf_ref.c", "target": "obj/x86_64/ref_kernels/generic/1f/bli_dotxf_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_1er_ref.c", "target": "obj/x86_64/ref_kernels/generic/1m/bli_packm_cxk_1er_generic_ref.o", "flags": ["-O2", 
"-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_bb_ref.c", "target": "obj/x86_64/ref_kernels/generic/1m/bli_packm_cxk_bb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_packm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/generic/1m/bli_packm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/1m/bli_unpackm_cxk_ref.c", "target": "obj/x86_64/ref_kernels/generic/1m/bli_unpackm_cxk_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemm_ref.c", "target": 
"obj/x86_64/ref_kernels/generic/3/bli_gemm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmsup_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bli_gemmsup_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_gemmtrsm_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bli_gemmtrsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bli_trsm_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bli_trsm_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"ref_kernels/3/bb/bli_gemmbb_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bb/bli_gemmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_gemmtrsmbb_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bb/bli_gemmtrsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/3/bb/bli_trsmbb_ref.c", "target": "obj/x86_64/ref_kernels/generic/3/bb/bli_trsmbb_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemm1m_ref.c", "target": "obj/x86_64/ref_kernels/generic/ind/bli_gemm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_gemmtrsm1m_ref.c", "target": "obj/x86_64/ref_kernels/generic/ind/bli_gemmtrsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "ref_kernels/ind/bli_trsm1m_ref.c", "target": "obj/x86_64/ref_kernels/generic/ind/bli_trsm1m_generic_ref.o", "flags": ["-O2", "-O3", "-funsafe-math-optimizations", "-ffp-contract=fast", "-std=c99", "-fopenmp-simd"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_CNAME=generic", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/0/bli_l0_check.c", "target": "obj/x86_64/frame/0/bli_l0_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/0/bli_l0_fpa.c", "target": "obj/x86_64/frame/0/bli_l0_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/0/bli_l0_oapi.c", "target": "obj/x86_64/frame/0/bli_l0_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/0/bli_l0_tapi.c", "target": "obj/x86_64/frame/0/bli_l0_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/0/copysc/bli_copysc.c", "target": "obj/x86_64/frame/0/copysc/bli_copysc.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_check.c", "target": "obj/x86_64/frame/1/bli_l1v_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_fpa.c", "target": "obj/x86_64/frame/1/bli_l1v_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_oapi.c", "target": "obj/x86_64/frame/1/bli_l1v_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_oapi_ba.c", "target": "obj/x86_64/frame/1/bli_l1v_oapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_oapi_ex.c", "target": "obj/x86_64/frame/1/bli_l1v_oapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_tapi.c", "target": "obj/x86_64/frame/1/bli_l1v_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_tapi_ba.c", "target": "obj/x86_64/frame/1/bli_l1v_tapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1/bli_l1v_tapi_ex.c", "target": "obj/x86_64/frame/1/bli_l1v_tapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_check.c", "target": "obj/x86_64/frame/1d/bli_l1d_check.o", 
"flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_fpa.c", "target": "obj/x86_64/frame/1d/bli_l1d_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_oapi.c", "target": "obj/x86_64/frame/1d/bli_l1d_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_oapi_ba.c", "target": "obj/x86_64/frame/1d/bli_l1d_oapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_oapi_ex.c", "target": "obj/x86_64/frame/1d/bli_l1d_oapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_tapi.c", "target": "obj/x86_64/frame/1d/bli_l1d_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_tapi_ba.c", "target": "obj/x86_64/frame/1d/bli_l1d_tapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1d/bli_l1d_tapi_ex.c", "target": "obj/x86_64/frame/1d/bli_l1d_tapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1f/bli_l1f_check.c", "target": "obj/x86_64/frame/1f/bli_l1f_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1f/bli_l1f_fpa.c", "target": "obj/x86_64/frame/1f/bli_l1f_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1f/bli_l1f_oapi.c", "target": "obj/x86_64/frame/1f/bli_l1f_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1f/bli_l1f_oapi_ba.c", "target": "obj/x86_64/frame/1f/bli_l1f_oapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1f/bli_l1f_oapi_ex.c", "target": "obj/x86_64/frame/1f/bli_l1f_oapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1f/bli_l1f_tapi.c", "target": "obj/x86_64/frame/1f/bli_l1f_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1f/bli_l1f_tapi_ba.c", "target": "obj/x86_64/frame/1f/bli_l1f_tapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1f/bli_l1f_tapi_ex.c", "target": "obj/x86_64/frame/1f/bli_l1f_tapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_check.c", "target": 
"obj/x86_64/frame/1m/bli_l1m_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_fpa.c", "target": "obj/x86_64/frame/1m/bli_l1m_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_oapi.c", "target": "obj/x86_64/frame/1m/bli_l1m_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_oapi_ba.c", "target": "obj/x86_64/frame/1m/bli_l1m_oapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_oapi_ex.c", "target": "obj/x86_64/frame/1m/bli_l1m_oapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_tapi.c", "target": "obj/x86_64/frame/1m/bli_l1m_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_tapi_ba.c", "target": "obj/x86_64/frame/1m/bli_l1m_tapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_tapi_ex.c", "target": "obj/x86_64/frame/1m/bli_l1m_tapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/bli_l1m_unb_var1.c", "target": "obj/x86_64/frame/1m/bli_l1m_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_alloc.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_alloc.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_blk_var1.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_blk_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_check.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_cntl.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_cntl.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_cxk.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_cxk.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_cxk_1er.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_cxk_1er.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_init.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_init.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_int.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_int.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_part.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_part.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_scalar.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_scalar.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_struc_cxk.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_struc_cxk.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_struc_cxk_1er.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_struc_cxk_1er.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_struc_cxk_md.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_struc_cxk_md.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/packm/bli_packm_thrinfo.c", "target": "obj/x86_64/frame/1m/packm/bli_packm_thrinfo.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/unpackm/bli_unpackm_blk_var1.c", "target": "obj/x86_64/frame/1m/unpackm/bli_unpackm_blk_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/unpackm/bli_unpackm_check.c", "target": "obj/x86_64/frame/1m/unpackm/bli_unpackm_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/unpackm/bli_unpackm_cntl.c", "target": "obj/x86_64/frame/1m/unpackm/bli_unpackm_cntl.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/unpackm/bli_unpackm_cxk.c", "target": "obj/x86_64/frame/1m/unpackm/bli_unpackm_cxk.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/1m/unpackm/bli_unpackm_int.c", "target": "obj/x86_64/frame/1m/unpackm/bli_unpackm_int.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/bli_l2_check.c", "target": "obj/x86_64/frame/2/bli_l2_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/bli_l2_fpa.c", "target": "obj/x86_64/frame/2/bli_l2_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/bli_l2_oapi.c", "target": "obj/x86_64/frame/2/bli_l2_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/bli_l2_oapi_ba.c", "target": 
"obj/x86_64/frame/2/bli_l2_oapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/bli_l2_oapi_ex.c", "target": "obj/x86_64/frame/2/bli_l2_oapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/bli_l2_tapi.c", "target": "obj/x86_64/frame/2/bli_l2_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/bli_l2_tapi_ba.c", "target": "obj/x86_64/frame/2/bli_l2_tapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/bli_l2_tapi_ex.c", "target": "obj/x86_64/frame/2/bli_l2_tapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/gemv/bli_gemv_unb_var1.c", "target": "obj/x86_64/frame/2/gemv/bli_gemv_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/gemv/bli_gemv_unb_var2.c", "target": "obj/x86_64/frame/2/gemv/bli_gemv_unb_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/gemv/bli_gemv_unf_var1.c", "target": "obj/x86_64/frame/2/gemv/bli_gemv_unf_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/gemv/bli_gemv_unf_var2.c", "target": "obj/x86_64/frame/2/gemv/bli_gemv_unf_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/gemv/bli_gemv_var_oapi.c", "target": "obj/x86_64/frame/2/gemv/bli_gemv_var_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/ger/bli_ger_unb_var1.c", "target": "obj/x86_64/frame/2/ger/bli_ger_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/ger/bli_ger_unb_var2.c", "target": "obj/x86_64/frame/2/ger/bli_ger_unb_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/ger/bli_ger_var_oapi.c", "target": "obj/x86_64/frame/2/ger/bli_ger_var_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unb_var1.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unb_var2.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unb_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unb_var3.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unb_var3.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unb_var4.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unb_var4.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unf_var1.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unf_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unf_var1a.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unf_var1a.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unf_var3.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unf_var3.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_unf_var3a.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_unf_var3a.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", 
"-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/hemv/bli_hemv_var_oapi.c", "target": "obj/x86_64/frame/2/hemv/bli_hemv_var_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her/bli_her_unb_var1.c", "target": "obj/x86_64/frame/2/her/bli_her_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her/bli_her_unb_var2.c", "target": "obj/x86_64/frame/2/her/bli_her_unb_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her/bli_her_var_oapi.c", "target": "obj/x86_64/frame/2/her/bli_her_var_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her2/bli_her2_unb_var1.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her2/bli_her2_unb_var2.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unb_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her2/bli_her2_unb_var3.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unb_var3.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her2/bli_her2_unb_var4.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unb_var4.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her2/bli_her2_unf_var1.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unf_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/her2/bli_her2_unf_var4.c", "target": "obj/x86_64/frame/2/her2/bli_her2_unf_var4.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"frame/2/her2/bli_her2_var_oapi.c", "target": "obj/x86_64/frame/2/her2/bli_her2_var_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trmv/bli_trmv_unb_var1.c", "target": "obj/x86_64/frame/2/trmv/bli_trmv_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trmv/bli_trmv_unb_var2.c", "target": "obj/x86_64/frame/2/trmv/bli_trmv_unb_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trmv/bli_trmv_unf_var1.c", "target": "obj/x86_64/frame/2/trmv/bli_trmv_unf_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trmv/bli_trmv_unf_var2.c", "target": "obj/x86_64/frame/2/trmv/bli_trmv_unf_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trmv/bli_trmv_var_oapi.c", "target": 
"obj/x86_64/frame/2/trmv/bli_trmv_var_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trsv/bli_trsv_unb_var1.c", "target": "obj/x86_64/frame/2/trsv/bli_trsv_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trsv/bli_trsv_unb_var2.c", "target": "obj/x86_64/frame/2/trsv/bli_trsv_unb_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trsv/bli_trsv_unf_var1.c", "target": "obj/x86_64/frame/2/trsv/bli_trsv_unf_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trsv/bli_trsv_unf_var2.c", "target": "obj/x86_64/frame/2/trsv/bli_trsv_unf_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/2/trsv/bli_trsv_var_oapi.c", "target": "obj/x86_64/frame/2/trsv/bli_trsv_var_oapi.o", 
"flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_blocksize.c", "target": "obj/x86_64/frame/3/bli_l3_blocksize.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_check.c", "target": "obj/x86_64/frame/3/bli_l3_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_cntl.c", "target": "obj/x86_64/frame/3/bli_l3_cntl.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_direct.c", "target": "obj/x86_64/frame/3/bli_l3_direct.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_ind.c", "target": "obj/x86_64/frame/3/bli_l3_ind.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_int.c", "target": "obj/x86_64/frame/3/bli_l3_int.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_oapi.c", "target": "obj/x86_64/frame/3/bli_l3_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_oapi_ex.c", "target": "obj/x86_64/frame/3/bli_l3_oapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_packab.c", "target": "obj/x86_64/frame/3/bli_l3_packab.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_prune.c", "target": "obj/x86_64/frame/3/bli_l3_prune.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"frame/3/bli_l3_schema.c", "target": "obj/x86_64/frame/3/bli_l3_schema.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_sup.c", "target": "obj/x86_64/frame/3/bli_l3_sup.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_sup_int.c", "target": "obj/x86_64/frame/3/bli_l3_sup_int.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_sup_packm_a.c", "target": "obj/x86_64/frame/3/bli_l3_sup_packm_a.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_sup_packm_b.c", "target": "obj/x86_64/frame/3/bli_l3_sup_packm_b.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_sup_packm_var.c", "target": "obj/x86_64/frame/3/bli_l3_sup_packm_var.o", "flags": ["-O2", "-std=c99"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_sup_ref.c", "target": "obj/x86_64/frame/3/bli_l3_sup_ref.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_sup_var12.c", "target": "obj/x86_64/frame/3/bli_l3_sup_var12.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_sup_var1n2m.c", "target": "obj/x86_64/frame/3/bli_l3_sup_var1n2m.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_tapi.c", "target": "obj/x86_64/frame/3/bli_l3_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_tapi_ex.c", "target": "obj/x86_64/frame/3/bli_l3_tapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_thrinfo.c", "target": "obj/x86_64/frame/3/bli_l3_thrinfo.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_ukr_fpa.c", "target": "obj/x86_64/frame/3/bli_l3_ukr_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_ukr_oapi.c", "target": "obj/x86_64/frame/3/bli_l3_ukr_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/bli_l3_ukr_tapi.c", "target": "obj/x86_64/frame/3/bli_l3_ukr_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_blk_var1.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_blk_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", 
"-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_blk_var2.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_blk_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_blk_var3.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_blk_var3.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_cntl.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_cntl.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_front.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_front.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_ker_var1.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_ker_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": 
"frame/3/gemm/bli_gemm_ker_var2.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_md.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_md.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemm/bli_gemm_md_c2r_ref.c", "target": "obj/x86_64/frame/3/gemm/bli_gemm_md_c2r_ref.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemmt/bli_gemmt_front.c", "target": "obj/x86_64/frame/3/gemmt/bli_gemmt_front.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemmt/bli_gemmt_l_ker_var2.c", "target": "obj/x86_64/frame/3/gemmt/bli_gemmt_l_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemmt/bli_gemmt_u_ker_var2.c", "target": 
"obj/x86_64/frame/3/gemmt/bli_gemmt_u_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/gemmt/bli_gemmt_x_ker_var2.c", "target": "obj/x86_64/frame/3/gemmt/bli_gemmt_x_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/hemm/bli_hemm_front.c", "target": "obj/x86_64/frame/3/hemm/bli_hemm_front.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/symm/bli_symm_front.c", "target": "obj/x86_64/frame/3/symm/bli_symm_front.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trmm/bli_trmm_front.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_front.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trmm/bli_trmm_ll_ker_var2.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_ll_ker_var2.o", 
"flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trmm/bli_trmm_lu_ker_var2.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_lu_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trmm/bli_trmm_rl_ker_var2.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_rl_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trmm/bli_trmm_ru_ker_var2.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_ru_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trmm/bli_trmm_xx_ker_var2.c", "target": "obj/x86_64/frame/3/trmm/bli_trmm_xx_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trmm3/bli_trmm3_front.c", "target": "obj/x86_64/frame/3/trmm3/bli_trmm3_front.o", "flags": ["-O2", "-std=c99"], 
"macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_blk_var1.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_blk_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_blk_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_blk_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_blk_var3.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_blk_var3.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_cntl.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_cntl.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_front.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_front.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_ll_ker_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_ll_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_lu_ker_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_lu_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_rl_ker_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_rl_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_ru_ker_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_ru_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/3/trsm/bli_trsm_xx_ker_var2.c", "target": "obj/x86_64/frame/3/trsm/bli_trsm_xx_ker_var2.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_apool.c", "target": "obj/x86_64/frame/base/bli_apool.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_arch.c", "target": "obj/x86_64/frame/base/bli_arch.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_array.c", "target": "obj/x86_64/frame/base/bli_array.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_blksz.c", "target": "obj/x86_64/frame/base/bli_blksz.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_check.c", "target": "obj/x86_64/frame/base/bli_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_clock.c", "target": "obj/x86_64/frame/base/bli_clock.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_cntl.c", "target": "obj/x86_64/frame/base/bli_cntl.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_cntx.c", "target": "obj/x86_64/frame/base/bli_cntx.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_const.c", "target": "obj/x86_64/frame/base/bli_const.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_cpuid.c", "target": "obj/x86_64/frame/base/bli_cpuid.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_env.c", "target": "obj/x86_64/frame/base/bli_env.o", "flags": ["-O2", 
"-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_error.c", "target": "obj/x86_64/frame/base/bli_error.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_func.c", "target": "obj/x86_64/frame/base/bli_func.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_getopt.c", "target": "obj/x86_64/frame/base/bli_getopt.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_gks.c", "target": "obj/x86_64/frame/base/bli_gks.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_ind.c", "target": "obj/x86_64/frame/base/bli_ind.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", 
"-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_info.c", "target": "obj/x86_64/frame/base/bli_info.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_init.c", "target": "obj/x86_64/frame/base/bli_init.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_machval.c", "target": "obj/x86_64/frame/base/bli_machval.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_malloc.c", "target": "obj/x86_64/frame/base/bli_malloc.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_mbool.c", "target": "obj/x86_64/frame/base/bli_mbool.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_memsys.c", 
"target": "obj/x86_64/frame/base/bli_memsys.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_obj.c", "target": "obj/x86_64/frame/base/bli_obj.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_obj_scalar.c", "target": "obj/x86_64/frame/base/bli_obj_scalar.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_pack.c", "target": "obj/x86_64/frame/base/bli_pack.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_param_map.c", "target": "obj/x86_64/frame/base/bli_param_map.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_part.c", "target": "obj/x86_64/frame/base/bli_part.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", 
"-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_pba.c", "target": "obj/x86_64/frame/base/bli_pba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_pool.c", "target": "obj/x86_64/frame/base/bli_pool.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_prune.c", "target": "obj/x86_64/frame/base/bli_prune.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_query.c", "target": "obj/x86_64/frame/base/bli_query.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_rntm.c", "target": "obj/x86_64/frame/base/bli_rntm.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_sba.c", "target": "obj/x86_64/frame/base/bli_sba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_setgetijm.c", "target": "obj/x86_64/frame/base/bli_setgetijm.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_setgetijv.c", "target": "obj/x86_64/frame/base/bli_setgetijv.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_setri.c", "target": "obj/x86_64/frame/base/bli_setri.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_string.c", "target": "obj/x86_64/frame/base/bli_string.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/bli_winsys.c", "target": 
"obj/x86_64/frame/base/bli_winsys.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/cast/bli_castm.c", "target": "obj/x86_64/frame/base/cast/bli_castm.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/cast/bli_castnzm.c", "target": "obj/x86_64/frame/base/cast/bli_castnzm.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/cast/bli_castv.c", "target": "obj/x86_64/frame/base/cast/bli_castv.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/check/bli_obj_check.c", "target": "obj/x86_64/frame/base/check/bli_obj_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/check/bli_part_check.c", "target": "obj/x86_64/frame/base/check/bli_part_check.o", "flags": ["-O2", "-std=c99"], "macros": 
["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/noopt/bli_dlamch.c", "target": "obj/x86_64/frame/base/noopt/bli_dlamch.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/noopt/bli_lsame.c", "target": "obj/x86_64/frame/base/noopt/bli_lsame.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/noopt/bli_slamch.c", "target": "obj/x86_64/frame/base/noopt/bli_slamch.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/proj/bli_projm.c", "target": "obj/x86_64/frame/base/proj/bli_projm.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/base/proj/bli_projv.c", "target": "obj/x86_64/frame/base/proj/bli_projv.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_l3_decor_openmp.c", "target": "obj/x86_64/frame/thread/bli_l3_decor_openmp.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_l3_decor_pthreads.c", "target": "obj/x86_64/frame/thread/bli_l3_decor_pthreads.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_l3_decor_single.c", "target": "obj/x86_64/frame/thread/bli_l3_decor_single.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_l3_sup_decor_openmp.c", "target": "obj/x86_64/frame/thread/bli_l3_sup_decor_openmp.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_l3_sup_decor_pthreads.c", "target": "obj/x86_64/frame/thread/bli_l3_sup_decor_pthreads.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", 
"-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_l3_sup_decor_single.c", "target": "obj/x86_64/frame/thread/bli_l3_sup_decor_single.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_pthread.c", "target": "obj/x86_64/frame/thread/bli_pthread.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_thrcomm.c", "target": "obj/x86_64/frame/thread/bli_thrcomm.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_thrcomm_openmp.c", "target": "obj/x86_64/frame/thread/bli_thrcomm_openmp.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_thrcomm_pthreads.c", "target": "obj/x86_64/frame/thread/bli_thrcomm_pthreads.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": 
["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_thrcomm_single.c", "target": "obj/x86_64/frame/thread/bli_thrcomm_single.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_thread.c", "target": "obj/x86_64/frame/thread/bli_thread.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_thrinfo.c", "target": "obj/x86_64/frame/thread/bli_thrinfo.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/thread/bli_thrinfo_sup.c", "target": "obj/x86_64/frame/thread/bli_thrinfo_sup.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_check.c", "target": "obj/x86_64/frame/util/bli_util_check.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", 
"-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_fpa.c", "target": "obj/x86_64/frame/util/bli_util_fpa.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_oapi.c", "target": "obj/x86_64/frame/util/bli_util_oapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_oapi_ba.c", "target": "obj/x86_64/frame/util/bli_util_oapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_oapi_ex.c", "target": "obj/x86_64/frame/util/bli_util_oapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_tapi.c", "target": "obj/x86_64/frame/util/bli_util_tapi.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_tapi_ba.c", 
"target": "obj/x86_64/frame/util/bli_util_tapi_ba.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_tapi_ex.c", "target": "obj/x86_64/frame/util/bli_util_tapi_ex.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} {"compiler": "clang", "source": "frame/util/bli_util_unb_var1.c", "target": "obj/x86_64/frame/util/bli_util_unb_var1.o", "flags": ["-O2", "-std=c99"], "macros": ["-D_POSIX_C_SOURCE=200112L", "-DBLIS_VERSION_STRING=\"0.9.0\"", "-DBLIS_IS_BUILDING_LIBRARY"], "include": ["-Iinclude/windows-x86_64", "-I./frame/3/", "-I./frame/1m/", "-I./frame/1f/", "-I./frame/1/", "-I./frame/include"]} cython-blis-0.9.1/blis/_src/ref_kernels/000077500000000000000000000000001427272030600201355ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/ref_kernels/1/000077500000000000000000000000001427272030600202755ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/ref_kernels/1/bli_addv_ref.c000066400000000000000000000054471427272030600230530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ \ ctype* restrict chi1 = x; \ ctype* restrict psi1 = y; \ \ if ( bli_is_conj( conjx ) ) \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,addjs)( chi1[i], psi1[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,addjs)( *chi1, *psi1 ); \ \ chi1 += incx; \ psi1 += incy; \ } \ } \ } \ else \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,adds)( chi1[i], psi1[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,adds)( *chi1, *psi1 ); \ \ chi1 += incx; \ psi1 += incy; \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( addv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1/bli_amaxv_ref.c000066400000000000000000000116431427272030600232440ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // Define BLAS-like interfaces with typed operands. // #undef GENTFUNCR #define GENTFUNCR( ctype, ctype_r, ch, chr, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict i_max, \ cntx_t* restrict cntx \ ) \ { \ ctype_r* minus_one = PASTEMAC(chr,m1); \ dim_t* zero_i = PASTEMAC(i,0); \ \ ctype_r chi1_r; \ ctype_r chi1_i; \ ctype_r abs_chi1; \ ctype_r abs_chi1_max; \ dim_t i_max_l; \ \ /* If the vector length is zero, return early. This directly emulates the behavior of netlib BLAS's i?amax() routines. */ \ if ( bli_zero_dim1( n ) ) \ { \ PASTEMAC(i,copys)( *zero_i, *i_max ); \ return; \ } \ \ /* Initialize the index of the maximum absolute value to zero. */ \ PASTEMAC(i,copys)( *zero_i, i_max_l ); \ \ /* Initialize the maximum absolute value search candidate with -1, which is guaranteed to be less than all values we will compute. */ \ PASTEMAC(chr,copys)( *minus_one, abs_chi1_max ); \ \ if ( incx == 1 ) \ { \ ctype* chi1 = x; \ \ for ( dim_t i = 0; i < n; ++i ) \ { \ /* Get the real and imaginary components of chi1. */ \ PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \ \ /* Replace chi1_r and chi1_i with their absolute values. 
*/ \ PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ \ /* Add the real and imaginary absolute values together. */ \ PASTEMAC(chr,set0s)( abs_chi1 ); \ PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ \ /* If the absolute value of the current element exceeds that of the previous largest, save it and its index. If NaN is encountered, then treat it the same as if it were a valid value that was smaller than any previously seen. This behavior mimics that of LAPACK's ?lange(). */ \ if ( abs_chi1_max < abs_chi1 || ( bli_isnan( abs_chi1 ) && !bli_isnan( abs_chi1_max ) ) ) \ { \ abs_chi1_max = abs_chi1; \ i_max_l = i; \ } \ \ chi1 += 1; \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ ctype* chi1 = x + (i )*incx; \ \ /* Get the real and imaginary components of chi1. */ \ PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \ \ /* Replace chi1_r and chi1_i with their absolute values. */ \ PASTEMAC(chr,abval2s)( chi1_r, chi1_r ); \ PASTEMAC(chr,abval2s)( chi1_i, chi1_i ); \ \ /* Add the real and imaginary absolute values together. */ \ PASTEMAC(chr,set0s)( abs_chi1 ); \ PASTEMAC(chr,adds)( chi1_r, abs_chi1 ); \ PASTEMAC(chr,adds)( chi1_i, abs_chi1 ); \ \ /* If the absolute value of the current element exceeds that of the previous largest, save it and its index. If NaN is encountered, then treat it the same as if it were a valid value that was smaller than any previously seen. This behavior mimics that of LAPACK's ?lange(). */ \ if ( abs_chi1_max < abs_chi1 || ( bli_isnan( abs_chi1 ) && !bli_isnan( abs_chi1_max ) ) ) \ { \ abs_chi1_max = abs_chi1; \ i_max_l = i; \ } \ } \ } \ \ /* Store the final index to the output variable. 
*/ \ PASTEMAC(i,copys)( i_max_l, *i_max ); \ } INSERT_GENTFUNCR_BASIC2( amaxv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1/bli_axpbyv_ref.c000066400000000000000000000142501427272030600234360ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ \ if ( PASTEMAC(ch,eq0)( *alpha ) ) \ { \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* If alpha is zero and beta is zero, set to zero. */ \ \ ctype* zero = PASTEMAC(ch,0); \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SETV_KER, cntx ); \ \ setv_p \ ( \ BLIS_NO_CONJUGATE, \ n, \ zero, \ y, incy, \ cntx \ ); \ return; \ } \ else if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ /* If alpha is zero and beta is one, return. */ \ return; \ } \ else \ { \ /* If alpha is zero, scale by beta. */ \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,scalv_ker_ft) scalv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCALV_KER, cntx ); \ \ scalv_p \ ( \ BLIS_NO_CONJUGATE, \ n, \ beta, \ y, incy, \ cntx \ ); \ return; \ } \ \ } \ else if ( PASTEMAC(ch,eq1)( *alpha ) ) \ { \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* If alpha is one and beta is zero, use copyv. */ \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ conjx, \ n, \ x, incx, \ y, incy, \ cntx \ ); \ return; \ } \ else if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ /* If alpha is one and beta is one, use addv. */ \ \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ conjx, \ n, \ x, incx, \ y, incy, \ cntx \ ); \ return; \ } \ else \ { \ /* If alpha is one and beta is something else, use xpbyv. */ \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,xpbyv_ker_ft) xpbyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_XPBYV_KER, cntx ); \ \ xpbyv_p \ ( \ conjx, \ n, \ x, incx, \ beta, \ y, incy, \ cntx \ ); \ return; \ } \ } \ else \ { \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* If alpha is something else and beta is zero, use scal2v. */ \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,scal2v_ker_ft) scal2v_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCAL2V_KER, cntx ); \ \ scal2v_p \ ( \ conjx, \ n, \ alpha, \ x, incx, \ y, incy, \ cntx \ ); \ return; \ } \ else if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ /* If alpha is something else and beta is one, use axpyv. */ \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,axpyv_ker_ft) axpyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ axpyv_p \ ( \ conjx, \ n, \ alpha, \ x, incx, \ y, incy, \ cntx \ ); \ return; \ } \ } \ \ /* If execution reaches here, alpha and beta are both non-zero/non-unit. 
*/ \ \ if ( bli_is_conj( conjx ) ) \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,axpbyjs)( *alpha, x[i], *beta, y[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,axpbyjs)( *alpha, *x, *beta, *y ); \ \ x += incx; \ y += incy; \ } \ } \ } \ else \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,axpbys)( *alpha, x[i], *beta, y[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,axpbys)( *alpha, *x, *beta, *y ); \ \ x += incx; \ y += incy; \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( axpbyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1/bli_axpyv_ref.c000066400000000000000000000122211427272030600232700ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #if 0 #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ \ /* If alpha is zero, return. */ \ if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* If alpha is one, use addv. */ \ if ( PASTEMAC(ch,eq1)( *alpha ) ) \ { \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ conjx, \ n, \ x, incx, \ y, incy, \ cntx \ ); \ return; \ } \ \ ctype* restrict chi1 = x; \ ctype* restrict psi1 = y; \ \ if ( bli_is_conj( conjx ) ) \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ /*PASTEMAC(ch,axpyjs)( *alpha, chi1[i], psi1[i] );*/ \ psi1[i] = fma( *alpha, chi1[i], psi1[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,axpyjs)( *alpha, *chi1, *psi1 ); \ \ chi1 += incx; \ psi1 += incy; \ } \ } \ } \ else \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ /*PASTEMAC(ch,axpys)( *alpha, chi1[i], psi1[i] );*/ \ psi1[i] = fma( *alpha, chi1[i], psi1[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,axpys)( *alpha, *chi1, *psi1 ); \ \ chi1 += incx; \ psi1 += incy; \ } \ } \ } \ } //INSERT_GENTFUNC_BASIC2( axpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) GENTFUNC( float, s, axpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) GENTFUNC( double, d, axpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #endif #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ \ /* If alpha is zero, return. */ \ if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* If alpha is one, use addv. */ \ if ( PASTEMAC(ch,eq1)( *alpha ) ) \ { \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ conjx, \ n, \ x, incx, \ y, incy, \ cntx \ ); \ return; \ } \ \ if ( bli_is_conj( conjx ) ) \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,axpyjs)( *alpha, x[i], y[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,axpyjs)( *alpha, *x, *y ); \ \ x += incx; \ y += incy; \ } \ } \ } \ else \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,axpys)( *alpha, x[i], y[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,axpys)( *alpha, *x, *y ); \ \ x += incx; \ y += incy; \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( axpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1/bli_copyv_ref.c000066400000000000000000000053141427272030600232660ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ \ if ( bli_is_conj( conjx ) ) \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,copyjs)( x[i], y[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,copyjs)( *x, *y ); \ \ x += incx; \ y += incy; \ } \ } \ } \ else \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,copys)( x[i], y[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,copys)( *x, *y ); \ \ x += incx; \ y += incy; \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( copyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1/bli_dotv_ref.c000066400000000000000000000063651427272030600231110ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ cntx_t* restrict cntx \ ) \ { \ ctype dotxy; \ \ if ( bli_zero_dim1( n ) ) \ { \ PASTEMAC(ch,set0s)( *rho ); \ return; \ } \ \ PASTEMAC(ch,set0s)( dotxy ); \ \ conj_t conjx_use = conjx; \ \ /* If y must be conjugated, we do so indirectly by first toggling the effective conjugation of x and then conjugating the resulting dot product. */ \ if ( bli_is_conj( conjy ) ) \ bli_toggle_conj( &conjx_use ); \ \ if ( bli_is_conj( conjx_use ) ) \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,dotjs)( *x, *y, dotxy ); \ \ x += incx; \ y += incy; \ } \ } \ } \ else \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,dots)( *x, *y, dotxy ); \ \ x += incx; \ y += incy; \ } \ } \ } \ \ if ( bli_is_conj( conjy ) ) \ PASTEMAC(ch,conjs)( dotxy ); \ \ PASTEMAC(ch,copys)( dotxy, *rho ); \ } INSERT_GENTFUNC_BASIC2( dotv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1/bli_dotxv_ref.c000066400000000000000000000070741427272030600232770ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ cntx_t* restrict cntx \ ) \ { \ ctype dotxy; \ \ /* If beta is zero, clear rho. Otherwise, scale by beta. */ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,set0s)( *rho ); \ } \ else \ { \ PASTEMAC(ch,scals)( *beta, *rho ); \ } \ \ /* If the vectors are empty or if alpha is zero, return early. 
*/ \ if ( bli_zero_dim1( n ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ PASTEMAC(ch,set0s)( dotxy ); \ \ /* If y must be conjugated, we do so indirectly by first toggling the effective conjugation of x and then conjugating the resulting dot product. */ \ conj_t conjx_use = conjx; \ \ if ( bli_is_conj( conjy ) ) \ bli_toggle_conj( &conjx_use ); \ \ if ( bli_is_conj( conjx_use ) ) \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,dotjs)( *x, *y, dotxy ); \ \ x += incx; \ y += incy; \ } \ } \ } \ else \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,dots)( *x, *y, dotxy ); \ \ x += incx; \ y += incy; \ } \ } \ } \ \ if ( bli_is_conj( conjy ) ) \ PASTEMAC(ch,conjs)( dotxy ); \ \ PASTEMAC(ch,axpys)( *alpha, dotxy, *rho ); \ } INSERT_GENTFUNC_BASIC2( dotxv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1/bli_invertv_ref.c000066400000000000000000000043511427272030600236230ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ cntx_t* restrict cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ \ if ( incx == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,inverts)( x[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,inverts)( *x ); \ \ x += incx; \ } \ } \ } INSERT_GENTFUNC_BASIC2( invertv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1/bli_scal2v_ref.c000066400000000000000000000071561427272030600233260ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include "blis.h"

/*
   Reference scal2v kernel template.

   Applies the `scal2s` scalar macro (or `scal2js` when conjx requests
   conjugation) to ( *alpha, x element, y element ) for each of the n
   element pairs. Presumably y := alpha * conjx(x); the scalar macros are
   defined outside this file.

   Shortcuts taken before the loop:
     - bli_zero_dim1( n ) holds : return, nothing touched.
     - alpha == 0               : fill y from the zero constant via the
                                  setv kernel obtained from cntx.
     - alpha == 1               : delegate to the copyv kernel obtained
                                  from cntx, forwarding conjx so that the
                                  requested conjugation of x is honoured.
*/
#undef GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       conj_t           conjx, \
       dim_t            n, \
       ctype*  restrict alpha, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict y, inc_t incy, \
       cntx_t* restrict cntx \
     ) \
{ \
	if ( bli_zero_dim1( n ) ) return; \
\
	if ( PASTEMAC(ch,eq0)( *alpha ) ) \
	{ \
		/* If alpha is zero, use setv. */ \
\
		ctype* zero = PASTEMAC(ch,0); \
\
		/* Query the context for the kernel function pointer. */ \
		const num_t             dt     = PASTEMAC(ch,type); \
		PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SETV_KER, cntx ); \
\
		setv_p \
		( \
		  BLIS_NO_CONJUGATE, \
		  n, \
		  zero, \
		  y, incy, \
		  cntx \
		); \
		return; \
	} \
	else if ( PASTEMAC(ch,eq1)( *alpha ) ) \
	{ \
		/* If alpha is one, use copyv. This test must be eq1: a second
		   eq0 test here can never succeed after the branch above, which
		   would leave the shortcut unreachable. */ \
\
		/* Query the context for the kernel function pointer. */ \
		const num_t              dt      = PASTEMAC(ch,type); \
		PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \
\
		/* Forward conjx (not BLIS_NO_CONJUGATE) so the copy applies the
		   same conjugation the general loops below would. */ \
		copyv_p \
		( \
		  conjx, \
		  n, \
		  x, incx, \
		  y, incy, \
		  cntx \
		); \
		return; \
	} \
\
	if ( bli_is_conj( conjx ) ) \
	{ \
		if ( incx == 1 && incy == 1 ) \
		{ \
			PRAGMA_SIMD \
			for ( dim_t i = 0; i < n; ++i ) \
			{ \
				PASTEMAC(ch,scal2js)( *alpha, x[i], y[i] ); \
			} \
		} \
		else \
		{ \
			/* General strides: walk both cursors by their increments. */ \
			for ( dim_t i = 0; i < n; ++i ) \
			{ \
				PASTEMAC(ch,scal2js)( *alpha, *x, *y ); \
\
				x += incx; \
				y += incy; \
			} \
		} \
	} \
	else \
	{ \
		if ( incx == 1 && incy == 1 ) \
		{ \
			PRAGMA_SIMD \
			for ( dim_t i = 0; i < n; ++i ) \
			{ \
				PASTEMAC(ch,scal2s)( *alpha, x[i], y[i] ); \
			} \
		} \
		else \
		{ \
			/* General strides: walk both cursors by their increments. */ \
			for ( dim_t i = 0; i < n; ++i ) \
			{ \
				PASTEMAC(ch,scal2s)( *alpha, *x, *y ); \
\
				x += incx; \
				y += incy; \
			} \
		} \
	} \
}

INSERT_GENTFUNC_BASIC2( scal2v, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
cython-blis-0.9.1/blis/_src/ref_kernels/1/bli_scalv_ref.c000066400000000000000000000056341427272030600232430ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ cntx_t* restrict cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ \ /* If alpha is one, return. */ \ if ( PASTEMAC(ch,eq1)( *alpha ) ) return; \ \ /* If alpha is zero, use setv. */ \ if ( PASTEMAC(ch,eq0)( *alpha ) ) \ { \ ctype* zero = PASTEMAC(ch,0); \ \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SETV_KER, cntx ); \ \ setv_p \ ( \ BLIS_NO_CONJUGATE, \ n, \ zero, \ x, incx, \ cntx \ ); \ return; \ } \ \ ctype alpha_conj; \ \ PASTEMAC(ch,copycjs)( conjalpha, *alpha, alpha_conj ); \ \ if ( incx == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,scals)( alpha_conj, x[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,scals)( alpha_conj, *x ); \ \ x += incx; \ } \ } \ } INSERT_GENTFUNC_BASIC2( scalv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1/bli_setv_ref.c000066400000000000000000000053501427272030600231070ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjalpha, \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ cntx_t* restrict cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ \ if ( PASTEMAC(ch,eq0)( *alpha ) ) \ { \ if ( incx == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,set0s)( x[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,set0s)( *x ); \ \ x += incx; \ } \ } \ } \ else \ { \ ctype alpha_conj; \ \ PASTEMAC(ch,copycjs)( conjalpha, *alpha, alpha_conj ); \ \ if ( incx == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,copys)( alpha_conj, x[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,copys)( alpha_conj, *x ); \ \ x += incx; \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( setv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1/bli_subv_ref.c000066400000000000000000000053071427272030600231070ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ \ if ( bli_is_conj( conjx ) ) \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,subjs)( x[i], y[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,subjs)( *x, *y ); \ \ x += incx; \ y += incy; \ } \ } \ } \ else \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,subs)( x[i], y[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,subs)( *x, *y ); \ \ x += incx; \ y += incy; \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( subv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1/bli_swapv_ref.c000066400000000000000000000044631427272030600232720ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,swaps)( x[i], y[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,swaps)( *x, *y ); \ \ x += incx; \ y += incy; \ } \ } \ } INSERT_GENTFUNC_BASIC2( swapv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1/bli_xpbyv_ref.c000066400000000000000000000070421427272030600232760ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjx, \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ \ /* If beta is zero, use copyv. */ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ conjx, \ n, \ x, incx, \ y, incy, \ cntx \ ); \ return; \ } \ /* If alpha is one, use addv. */ \ else if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ conjx, \ n, \ x, incx, \ y, incy, \ cntx \ ); \ return; \ } \ \ if ( bli_is_conj( conjx ) ) \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,xpbyjs)( x[i], *beta, y[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,xpbyjs)( *x, *beta, *y ); \ \ x += incx; \ y += incy; \ } \ } \ } \ else \ { \ if ( incx == 1 && incy == 1 ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,xpbys)( x[i], *beta, y[i] ); \ } \ } \ else \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,xpbys)( *x, *beta, *y ); \ \ x += incx; \ y += incy; \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( xpbyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1f/000077500000000000000000000000001427272030600204435ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/ref_kernels/1f/bli_axpy2v_ref.c000066400000000000000000000075511427272030600235320ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjx, \ conj_t conjy, \ dim_t n, \ ctype* restrict alphax, \ ctype* restrict alphay, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ \ if ( incz == 1 && incx == 1 && incy == 1 ) \ { \ ctype chic, psic; \ \ if ( bli_is_noconj( conjx ) ) \ { \ if ( bli_is_noconj( conjy ) ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,axpys)( *alphax, x[i], z[i] ); \ PASTEMAC(ch,axpys)( *alphay, y[i], z[i] ); \ } \ } \ else /* if ( bli_is_conj( conjy ) ) */ \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,axpys)( *alphax, x[i], z[i] ); \ PASTEMAC(ch,copyjs)( y[i], psic ); \ PASTEMAC(ch,axpys)( *alphay, psic, z[i] ); \ } \ } \ } \ else /* if ( bli_is_conj( conjx ) ) */ \ { \ if ( bli_is_noconj( conjy ) ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,copyjs)( x[i], chic ); \ PASTEMAC(ch,axpys)( *alphax, chic, z[i] ); \ PASTEMAC(ch,axpys)( *alphay, y[i], z[i] ); \ } \ } \ else /* if ( bli_is_conj( conjy ) ) */ \ { \ 
PRAGMA_SIMD \ for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,copyjs)( x[i], chic ); \ PASTEMAC(ch,axpys)( *alphax, chic, z[i] ); \ PASTEMAC(ch,copyjs)( y[i], psic ); \ PASTEMAC(ch,axpys)( *alphay, psic, z[i] ); \ } \ } \ } \ } \ else \ { \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,axpyv_ker_ft) kfp_av \ = \ bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ kfp_av \ ( \ conjx, \ n, \ alphax, \ x, incx, \ z, incz, \ cntx \ ); \ \ kfp_av \ ( \ conjy, \ n, \ alphay, \ y, incy, \ z, incz, \ cntx \ ); \ } \ } INSERT_GENTFUNC_BASIC2( axpy2v, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1f/bli_axpyf_ref.c000066400000000000000000000077701427272030600234330ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, ff ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ) \ { \ if ( bli_zero_dim1( m ) ) return; \ \ if ( inca == 1 && incx == 1 && incy == 1 && b_n == ff ) \ { \ ctype ax[ ff ]; \ \ /* Scale x by alpha, storing to a temporary array ax. */ \ if ( bli_is_conj( conjx ) ) \ { \ PRAGMA_SIMD \ for ( dim_t j = 0; j < ff; ++j ) \ PASTEMAC(ch,scal2js)( *alpha, x[j], ax[j] ); \ } \ else \ { \ PRAGMA_SIMD \ for ( dim_t j = 0; j < ff; ++j ) \ PASTEMAC(ch,scal2s)( *alpha, x[j], ax[j] ); \ } \ \ /* Accumulate ff separate axpyv's into y. */ \ if ( bli_is_noconj( conja ) ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < m; ++i ) \ for ( dim_t j = 0; j < ff; ++j ) \ { \ PASTEMAC(ch,axpys)( ax[j], a[i + j*lda], y[i] ); \ } \ } \ else \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < m; ++i ) \ for ( dim_t j = 0; j < ff; ++j ) \ { \ PASTEMAC(ch,axpyjs)( ax[j], a[i + j*lda], y[i] ); \ } \ } \ } \ else \ { \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,axpyv_ker_ft) kfp_av \ = \ bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( dim_t i = 0; i < b_n; ++i ) \ { \ ctype* restrict a1 = a + (0 )*inca + (i )*lda; \ ctype* restrict chi1 = x + (i )*incx; \ ctype* restrict y1 = y + (0 )*incy; \ \ ctype alpha_chi1; \ \ PASTEMAC(ch,copycjs)( conjx, *chi1, alpha_chi1 ); \ PASTEMAC(ch,scals)( *alpha, alpha_chi1 ); \ \ kfp_av \ ( \ conja, \ m, \ &alpha_chi1, \ a1, inca, \ y1, incy, \ cntx \ ); \ } \ } \ } //INSERT_GENTFUNC_BASIC2( axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) GENTFUNC( float, s, axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 8 ) GENTFUNC( double, d, axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 8 ) GENTFUNC( scomplex, c, axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 8 ) GENTFUNC( dcomplex, z, axpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 8 ) cython-blis-0.9.1/blis/_src/ref_kernels/1f/bli_dotaxpyv_ref.c000066400000000000000000000105261427272030600241530ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjxt, \ conj_t conjx, \ conj_t conjy, \ dim_t m, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ) \ { \ if ( bli_zero_dim1( m ) ) return; \ \ if ( incz == 1 && incx == 1 && incy == 1 ) \ { \ if ( bli_is_noconj( conjx ) ) \ { \ conj_t conjxt_use = conjxt; \ ctype dotxy; \ \ PASTEMAC(ch,set0s)( dotxy ); \ \ if ( bli_is_conj( conjy ) ) \ bli_toggle_conj( &conjxt_use ); \ \ if ( bli_is_noconj( conjxt_use ) ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \ PASTEMAC(ch,axpys)( *alpha, x[i], z[i] ); \ } \ } \ else /* bli_is_conj( conjxt_use ) ) */ \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \ PASTEMAC(ch,axpys)( *alpha, x[i], z[i] ); \ } \ } \ \ if ( bli_is_conj( conjy ) ) \ PASTEMAC(ch,conjs)( dotxy ); \ \ PASTEMAC(ch,copys)( dotxy, *rho ); \ } \ else /* bli_is_conj( conjx ) ) */ \ { \ conj_t conjxt_use = conjxt; \ ctype dotxy; \ \ PASTEMAC(ch,set0s)( dotxy ); \ \ 
if ( bli_is_conj( conjy ) ) \ bli_toggle_conj( &conjxt_use ); \ \ if ( bli_is_noconj( conjxt_use ) ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,dots)( x[i], y[i], dotxy ); \ PASTEMAC(ch,axpyjs)( *alpha, x[i], z[i] ); \ } \ } \ else /* bli_is_conj( conjxt_use ) ) */ \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,dotjs)( x[i], y[i], dotxy ); \ PASTEMAC(ch,axpyjs)( *alpha, x[i], z[i] ); \ } \ } \ \ if ( bli_is_conj( conjy ) ) \ PASTEMAC(ch,conjs)( dotxy ); \ \ PASTEMAC(ch,copys)( dotxy, *rho ); \ } \ } \ else \ { \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,dotv_ker_ft) kfp_dv \ = \ bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \ PASTECH(ch,axpyv_ker_ft) kfp_av \ = \ bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ kfp_dv \ ( \ conjxt, \ conjy, \ m, \ x, incx, \ y, incy, \ rho, \ cntx \ ); \ \ kfp_av \ ( \ conjx, \ m, \ alpha, \ x, incx, \ z, incz, \ cntx \ ); \ } \ } INSERT_GENTFUNC_BASIC2( dotaxpyv, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1f/bli_dotxaxpyf_ref.c000066400000000000000000000136261427272030600243270ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, ff ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* restrict cntx \ ) \ { \ /* A is m x n. */ \ /* y = beta * y + alpha * A^T w; */ \ /* z = z + alpha * A x; */ \ \ if ( 1 && inca == 1 && incw == 1 && incx == 1 && \ incy == 1 && incz == 1 && b_n == ff ) \ { \ ctype r[ ff ]; \ ctype ax[ ff ]; \ \ /* If beta is zero, clear y. Otherwise, scale by beta. */ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,set0s)( y[i] ); \ } \ else \ { \ for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,scals)( *beta, y[i] ); \ } \ \ /* If the vectors are empty or if alpha is zero, return early. 
*/ \ if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Initialize r vector to 0. */ \ for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,set0s)( r[i] ); \ \ /* Scale x by alpha, storing to a temporary array ax. */ \ if ( bli_is_conj( conjx ) ) \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < ff; ++i ) \ PASTEMAC(ch,scal2js)( *alpha, x[i], ax[i] ); \ } \ else \ { \ PRAGMA_SIMD \ for ( dim_t i = 0; i < ff; ++i ) \ PASTEMAC(ch,scal2s)( *alpha, x[i], ax[i] ); \ } \ \ /* If a must be conjugated, we do so indirectly by first toggling the effective conjugation of w and then conjugating the resulting dot products. */ \ conj_t conjw_use = conjw; \ \ if ( bli_is_conj( conjat ) ) \ bli_toggle_conj( &conjw_use ); \ \ if ( bli_is_noconj( conjw_use ) ) \ { \ if ( bli_is_noconj( conja ) ) \ { \ PRAGMA_SIMD \ for ( dim_t p = 0; p < m; ++p ) \ for ( dim_t i = 0; i < ff; ++i ) \ { \ PASTEMAC(ch,axpys)( a[p + i*lda], w[p], r[i] ); \ PASTEMAC(ch,axpys)( ax[i], a[p + i*lda], z[p] ); \ } \ } \ else \ { \ PRAGMA_SIMD \ for ( dim_t p = 0; p < m; ++p ) \ for ( dim_t i = 0; i < ff; ++i ) \ { \ PASTEMAC(ch,axpys)( a[p + i*lda], w[p], r[i] ); \ PASTEMAC(ch,axpyjs)( ax[i], a[p + i*lda], z[p] ); \ } \ } \ } \ else \ { \ if ( bli_is_noconj( conja ) ) \ { \ PRAGMA_SIMD \ for ( dim_t p = 0; p < m; ++p ) \ for ( dim_t i = 0; i < ff; ++i ) \ { \ PASTEMAC(ch,axpyjs)( a[p + i*lda], w[p], r[i] ); \ PASTEMAC(ch,axpys)( ax[i], a[p + i*lda], z[p] ); \ } \ } \ else \ { \ PRAGMA_SIMD \ for ( dim_t p = 0; p < m; ++p ) \ for ( dim_t i = 0; i < ff; ++i ) \ { \ PASTEMAC(ch,axpyjs)( a[p + i*lda], w[p], r[i] ); \ PASTEMAC(ch,axpyjs)( ax[i], a[p + i*lda], z[p] ); \ } \ } \ } \ \ if ( bli_is_conj( conjat ) ) \ for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,conjs)( r[i] ); \ \ for ( dim_t i = 0; i < ff; ++i ) \ { \ PASTEMAC(ch,axpys)( *alpha, r[i], y[i] ); \ } \ } \ else \ { \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,dotxf_ker_ft) kfp_df \ = \ bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ PASTECH(ch,axpyf_ker_ft) kfp_af \ = \ bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ \ kfp_df \ ( \ conjat, \ conjw, \ m, \ b_n, \ alpha, \ a, inca, lda, \ w, incw, \ beta, \ y, incy, \ cntx \ ); \ \ kfp_af \ ( \ conja, \ conjx, \ m, \ b_n, \ alpha, \ a, inca, lda, \ x, incx, \ z, incz, \ cntx \ ); \ } \ } //INSERT_GENTFUNC_BASIC2( dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) GENTFUNC( float, s, dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4 ) GENTFUNC( double, d, dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4 ) GENTFUNC( scomplex, c, dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4 ) GENTFUNC( dcomplex, z, dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4 ) cython-blis-0.9.1/blis/_src/ref_kernels/1f/bli_dotxf_ref.c000066400000000000000000000107461427272030600234250ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, ff ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjat, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ cntx_t* restrict cntx \ ) \ { \ if ( inca == 1 && incx == 1 && incy == 1 && b_n == ff ) \ { \ ctype r[ ff ]; \ \ /* If beta is zero, clear y. Otherwise, scale by beta. */ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,set0s)( y[i] ); \ } \ else \ { \ for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,scals)( *beta, y[i] ); \ } \ \ /* If the vectors are empty or if alpha is zero, return early. */ \ if ( bli_zero_dim1( m ) || PASTEMAC(ch,eq0)( *alpha ) ) return; \ \ /* Initialize r vector to 0. */ \ for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,set0s)( r[i] ); \ \ /* If a must be conjugated, we do so indirectly by first toggling the effective conjugation of x and then conjugating the resulting dot products. 
*/ \ conj_t conjx_use = conjx; \ \ if ( bli_is_conj( conjat ) ) \ bli_toggle_conj( &conjx_use ); \ \ if ( bli_is_noconj( conjx_use ) ) \ { \ PRAGMA_SIMD \ for ( dim_t p = 0; p < m; ++p ) \ for ( dim_t i = 0; i < ff; ++i ) \ { \ PASTEMAC(ch,axpys)( a[p + i*lda], x[p], r[i] ); \ } \ } \ else \ { \ PRAGMA_SIMD \ for ( dim_t p = 0; p < m; ++p ) \ for ( dim_t i = 0; i < ff; ++i ) \ { \ PASTEMAC(ch,axpyjs)( a[p + i*lda], x[p], r[i] ); \ } \ } \ \ if ( bli_is_conj( conjat ) ) \ for ( dim_t i = 0; i < ff; ++i ) PASTEMAC(ch,conjs)( r[i] ); \ \ for ( dim_t i = 0; i < ff; ++i ) \ { \ PASTEMAC(ch,axpys)( *alpha, r[i], y[i] ); \ } \ } \ else \ { \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,dotxv_ker_ft) kfp_dv \ = \ bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( dim_t i = 0; i < b_n; ++i ) \ { \ ctype* restrict a1 = a + (0 )*inca + (i )*lda; \ ctype* restrict x1 = x + (0 )*incx; \ ctype* restrict psi1 = y + (i )*incy; \ \ kfp_dv \ ( \ conjat, \ conjx, \ m, \ alpha, \ a1, inca, \ x1, incx, \ beta, \ psi1, \ cntx \ ); \ } \ } \ } //INSERT_GENTFUNC_BASIC2( dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) GENTFUNC( float, s, dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 6 ) GENTFUNC( double, d, dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 6 ) GENTFUNC( scomplex, c, dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 6 ) GENTFUNC( dcomplex, z, dotxf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 6 ) cython-blis-0.9.1/blis/_src/ref_kernels/1f/other/000077500000000000000000000000001427272030600215645ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c000066400000000000000000000066771427272030600263200ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conjat, \ conj_t conja, \ conj_t conjw, \ conj_t conjx, \ dim_t m, \ dim_t b_n, \ ctype* restrict alpha, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict w, inc_t incw, \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ cntx_t* cntx \ ) \ { \ ctype* a1; \ ctype* chi1; \ ctype* w1; \ ctype* psi1; \ ctype* z1; \ ctype conjx_chi1; \ ctype alpha_chi1; \ dim_t i; \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,dotxv_ft) kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ PASTECH(ch,axpyv_ft) kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ /* A is m x n. */ \ /* y = beta * y + alpha * A^T w; */ \ /* z = z + alpha * A x; */ \ for ( i = 0; i < b_n; ++i ) \ { \ a1 = a + (0 )*inca + (i )*lda; \ w1 = w + (0 )*incw; \ psi1 = y + (i )*incy; \ \ kfp_dv \ ( \ conjat, \ conjw, \ m, \ alpha, \ a1, inca, \ w1, incw, \ beta, \ psi1, \ cntx \ ); \ } \ \ for ( i = 0; i < b_n; ++i ) \ { \ a1 = a + (0 )*inca + (i )*lda; \ chi1 = x + (i )*incx; \ z1 = z + (0 )*incz; \ \ PASTEMAC(ch,copycjs)( conjx, *chi1, conjx_chi1 ); \ PASTEMAC(ch,scal2s)( *alpha, conjx_chi1, alpha_chi1 ); \ \ kfp_av \ ( \ conja, \ m, \ &alpha_chi1, \ a1, inca, \ z1, incz, \ cntx \ ); \ } \ } INSERT_GENTFUNC_BASIC0( dotxaxpyf_ref_var1 ) cython-blis-0.9.1/blis/_src/ref_kernels/1m/000077500000000000000000000000001427272030600204525ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/ref_kernels/1m/bli_packm_cxk_1er_ref.c000066400000000000000000002732721427272030600250240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ if ( cdim == mnr ) \ { \ if ( bli_is_1e_packed( schema ) ) \ { \ const inc_t inca1 = inca; \ const inc_t lda1 = lda; \ const inc_t ldp1 = ldp; \ \ ctype* restrict kappa_cast = ( ctype* )kappa; \ ctype* restrict alpha1_ri = ( ctype* )a; \ ctype* restrict pi1_ri = ( ctype* )p; \ ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ } \ } \ else /* if ( 
bli_is_1r_packed( schema ) ) */ \ { \ const inc_t inca2 = 2 * inca; \ const inc_t lda2 = 2 * lda; \ const inc_t ldp2 = 2 * ldp; \ \ ctype* kappa_cast = kappa; \ ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ } \ } \ } \ else 
/* if ( cdim < mnr ) */ \ { \ PASTEMAC(ch,scal21ms_mxn) \ ( \ schema, \ conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, ldp \ ); \ \ /* if ( cdim < mnr ) */ \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = cdim; \ const dim_t offn = 0; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = 0; \ const dim_t offn = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ } INSERT_GENTFUNCCO_BASIC3( packm_2xk_1er, 2, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ if ( cdim == mnr ) \ { \ if ( bli_is_1e_packed( schema ) ) \ { \ const inc_t inca1 = inca; \ const inc_t lda1 = lda; \ const inc_t ldp1 = ldp; \ \ ctype* restrict kappa_cast = ( ctype* )kappa; \ ctype* restrict alpha1_ri = ( ctype* )a; \ ctype* restrict pi1_ri = ( ctype* )p; \ ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ else \ 
{ \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ const inc_t inca2 = 2 * inca; \ const inc_t lda2 = 2 * lda; \ const inc_t ldp2 = 2 * ldp; \ \ ctype* kappa_cast = kappa; \ ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( 
conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), 
*(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC(ch,scal21ms_mxn) \ ( \ schema, \ conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, ldp \ ); \ \ /* if ( cdim < mnr ) */ \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = cdim; \ const dim_t offn = 0; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = 0; \ const dim_t offn = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ } INSERT_GENTFUNCCO_BASIC3( packm_4xk_1er, 4, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ if ( cdim == mnr ) \ { \ if ( bli_is_1e_packed( schema ) ) \ { \ const inc_t inca1 = inca; \ const inc_t lda1 = lda; \ const inc_t ldp1 = ldp; \ \ ctype* restrict kappa_cast = ( ctype* )kappa; \ ctype* restrict alpha1_ri = ( ctype* )a; \ ctype* restrict pi1_ri = ( ctype* )p; \ ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyj1es)( 
*(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ 
PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ const inc_t inca2 = 2 * inca; \ const inc_t lda2 = 2 * lda; \ const inc_t ldp2 = 2 * ldp; \ \ ctype* kappa_cast = kappa; \ ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,copyris)( 
*(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ 
PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC(ch,scal21ms_mxn) \ ( \ schema, \ conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, ldp \ ); \ \ /* if ( cdim < mnr ) */ \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = cdim; \ const dim_t offn = 0; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = 0; \ const dim_t offn = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ } INSERT_GENTFUNCCO_BASIC3( packm_6xk_1er, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ if ( cdim == mnr ) \ { \ if ( bli_is_1e_packed( schema ) ) \ { \ const inc_t inca1 = inca; \ const inc_t lda1 = lda; \ const inc_t ldp1 = ldp; \ \ ctype* restrict kappa_cast = ( ctype* )kappa; \ ctype* restrict alpha1_ri = ( ctype* )a; \ ctype* restrict pi1_ri = ( ctype* )p; \ ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), 
*(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ 
PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ const inc_t inca2 = 2 * inca; \ const inc_t lda2 = 2 * lda; \ const inc_t ldp2 = 2 * ldp; \ \ ctype* kappa_cast = kappa; \ ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), 
*(pi1_i + 1) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,scal2jris)( 
*kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC(ch,scal21ms_mxn) \ ( \ schema, \ conja, \ cdim, \ 
n, \ kappa, \ a, inca, lda, \ p, 1, ldp, ldp \ ); \ \ /* if ( cdim < mnr ) */ \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = cdim; \ const dim_t offn = 0; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = 0; \ const dim_t offn = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ } INSERT_GENTFUNCCO_BASIC3( packm_8xk_1er, 8, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ if ( cdim == mnr ) \ { \ if ( bli_is_1e_packed( schema ) ) \ { \ const inc_t inca1 = inca; \ const inc_t lda1 = lda; \ const inc_t ldp1 = ldp; \ \ ctype* restrict kappa_cast = ( ctype* )kappa; \ ctype* restrict alpha1_ri = ( ctype* )a; \ ctype* restrict pi1_ri = ( ctype* )p; \ ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ 
PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ PASTEMAC(ch,scal2j1es)( 
*kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ const inc_t inca2 = 2 * inca; \ const inc_t lda2 = 2 * lda; \ const inc_t ldp2 = 2 * ldp; \ \ ctype* kappa_cast = kappa; \ ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ 
if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), 
*(pi1_i + 7) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ 
PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC(ch,scal21ms_mxn) \ ( \ schema, \ conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, ldp \ ); \ \ /* if ( cdim < mnr ) */ \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = cdim; \ const dim_t offn = 0; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = 0; \ const dim_t offn = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ } INSERT_GENTFUNCCO_BASIC3( packm_10xk_1er, 10, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ \ void 
PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ if ( cdim == mnr ) \ { \ if ( bli_is_1e_packed( schema ) ) \ { \ const inc_t inca1 = inca; \ const inc_t lda1 = lda; \ const inc_t ldp1 = ldp; \ \ ctype* restrict kappa_cast = ( ctype* )kappa; \ ctype* restrict alpha1_ri = ( ctype* )a; \ ctype* restrict pi1_ri = ( ctype* )p; \ ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), 
*(pi1_ir + 2) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), 
*(pi1_ri +11), *(pi1_ir +11) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ const inc_t inca2 = 2 * inca; \ const inc_t lda2 = 2 * lda; \ const inc_t ldp2 = 2 * ldp; \ \ ctype* kappa_cast = kappa; \ ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 
0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ 
PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), 
*(pi1_i +10) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC(ch,scal21ms_mxn) \ ( \ schema, \ conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, ldp \ ); \ \ /* if ( cdim < mnr ) */ \ { \ 
ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = cdim; \ const dim_t offn = 0; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = 0; \ const dim_t offn = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ } INSERT_GENTFUNCCO_BASIC3( packm_12xk_1er, 12, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ if ( cdim == mnr ) \ { \ if ( bli_is_1e_packed( schema ) ) \ { \ const inc_t inca1 = inca; \ const inc_t lda1 = lda; \ const inc_t ldp1 = ldp; \ \ ctype* restrict kappa_cast = ( ctype* )kappa; \ ctype* restrict alpha1_ri = ( ctype* )a; \ ctype* restrict pi1_ri = ( ctype* )p; \ ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ 
PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2j1es)( *kappa_cast, 
*(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ PASTEMAC(ch,scal21es)( 
*kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ \ alpha1_ri += lda1; \ pi1_ri += ldp1; \ pi1_ir += ldp1; \ } \ } \ } \ } \ else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ const inc_t inca2 = 2 * inca; \ const inc_t lda2 = 2 * lda; \ const inc_t ldp2 = 2 * ldp; \ \ ctype* kappa_cast = kappa; \ ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ ctype_r* restrict alpha1_r = ( ctype_r* )a; \ ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ 
PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i 
+10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ 
PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, 
*(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ \ alpha1_r += lda2; \ alpha1_i += lda2; \ pi1_r += ldp2; \ pi1_i += ldp2; \ } \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC(ch,scal21ms_mxn) \ ( \ schema, \ conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, ldp \ ); \ \ /* if ( cdim < mnr ) */ \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = cdim; \ const dim_t offn = 0; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ ctype* restrict zero = PASTEMAC(ch,0); \ const dim_t offm = 0; \ const dim_t offn = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ \ PASTEMAC(ch,set1ms_mxn) \ ( \ schema, \ offm, \ offn, \ m_edge, \ n_edge, \ zero, \ p, 1, ldp, ldp \ ); \ } \ } INSERT_GENTFUNCCO_BASIC3( packm_14xk_1er, 14, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )

/*
   packm_16xk_1er: reference packing kernel for panels that are mnr = 16
   elements wide, stored in either the 1e or the 1r schema.

   Arguments
     conja         when bli_is_conj( conja ) holds, the conjugating
                   element macros (copyj.. / scal2j..) are used.
     schema        bli_is_1e_packed( schema ) selects the 1e layout; any
                   other value is handled as 1r.
     cdim          number of panel rows supplied by the caller. The
                   explicit loops below run only when cdim == mnr;
                   otherwise the generic scal21ms_mxn helper is called.
     n             number of columns read from a.
     n_max         column count of the destination panel; the columns
                   from n up to n_max are handed to set1ms_mxn together
                   with the zero constant.
     kappa         scaling factor. When eq1( *kappa ) holds the copy
                   macros are used instead of the scal2 macros.
     a, inca, lda  source matrix with its row and column strides,
                   expressed in ctype elements.
     p, ldp        destination panel and its column stride.
     cntx          not referenced by this kernel.

   Layout notes (as visible from the pointer setup below)
     1e: every source element is passed along with two destination
         slots, p[i] and p[i + ldp/2].
     1r: a and p are addressed as arrays of ctype_r. The real and
         imaginary source parts are adjacent; the two destination parts
         are ldp real elements apart, and columns are 2*ldp apart.

   The row loops run to mnr, which is a literal constant at every
   instantiation of this macro.
*/
#undef  GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     ) \
{ \
    if ( cdim != mnr ) \
    { \
        /* Partial panel (presumably cdim < mnr): let the generic helper
           pack the cdim x n region ... */ \
        PASTEMAC(ch,scal21ms_mxn) \
        ( \
          schema, \
          conja, \
          cdim, \
          n, \
          kappa, \
          a, inca, lda, \
          p, 1,    ldp, ldp  \
        ); \
\
        /* ... then pass the remaining mnr - cdim rows, across all n_max
           columns, to set1ms_mxn with the zero constant. */ \
        { \
            ctype* restrict zero_elem = PASTEMAC(ch,0); \
            const dim_t     first_row = cdim; \
            const dim_t     first_col = 0; \
            const dim_t     pad_rows  = mnr - cdim; \
            const dim_t     pad_cols  = n_max; \
\
            PASTEMAC(ch,set1ms_mxn) \
            ( \
              schema, \
              first_row, \
              first_col, \
              pad_rows, \
              pad_cols, \
              zero_elem, \
              p, 1, ldp, ldp  \
            ); \
        } \
    } \
    else if ( bli_is_1e_packed( schema ) ) \
    { \
        ctype* restrict scale  = kappa; \
        ctype* restrict src    = a; \
        ctype* restrict dst_ri = p; \
        ctype* restrict dst_ir = p + ldp/2; \
\
        if ( PASTEMAC(ch,eq1)( *scale ) ) \
        { \
            if ( bli_is_conj( conja ) ) \
            { \
                for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
                { \
                    for ( dim_t i = 0; i < mnr; ++i ) \
                    { \
                        PASTEMAC(ch,copyj1es)( src[ i*inca ], dst_ri[ i ], dst_ir[ i ] ); \
                    } \
\
                    src    += lda; \
                    dst_ri += ldp; \
                    dst_ir += ldp; \
                } \
            } \
            else \
            { \
                for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
                { \
                    for ( dim_t i = 0; i < mnr; ++i ) \
                    { \
                        PASTEMAC(ch,copy1es)( src[ i*inca ], dst_ri[ i ], dst_ir[ i ] ); \
                    } \
\
                    src    += lda; \
                    dst_ri += ldp; \
                    dst_ir += ldp; \
                } \
            } \
        } \
        else \
        { \
            if ( bli_is_conj( conja ) ) \
            { \
                for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
                { \
                    for ( dim_t i = 0; i < mnr; ++i ) \
                    { \
                        PASTEMAC(ch,scal2j1es)( *scale, src[ i*inca ], dst_ri[ i ], dst_ir[ i ] ); \
                    } \
\
                    src    += lda; \
                    dst_ri += ldp; \
                    dst_ir += ldp; \
                } \
            } \
            else \
            { \
                for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
                { \
                    for ( dim_t i = 0; i < mnr; ++i ) \
                    { \
                        PASTEMAC(ch,scal21es)( *scale, src[ i*inca ], dst_ri[ i ], dst_ir[ i ] ); \
                    } \
\
                    src    += lda; \
                    dst_ri += ldp; \
                    dst_ir += ldp; \
                } \
            } \
        } \
    } \
    else /* 1r schema */ \
    { \
        /* All strides double because the data is walked in units of
           ctype_r rather than ctype. */ \
        const inc_t       row_step     = 2 * inca; \
        const inc_t       src_col_step = 2 * lda; \
        const inc_t       dst_col_step = 2 * ldp; \
\
        ctype*            scale    = kappa; \
        ctype_r* restrict scale_re = ( ctype_r* )kappa; \
        ctype_r* restrict scale_im = ( ctype_r* )kappa + 1; \
        ctype_r* restrict src_re   = ( ctype_r* )a; \
        ctype_r* restrict src_im   = ( ctype_r* )a + 1; \
        ctype_r* restrict dst_re   = ( ctype_r* )p; \
        ctype_r* restrict dst_im   = ( ctype_r* )p + ldp; \
\
        if ( PASTEMAC(ch,eq1)( *scale ) ) \
        { \
            if ( bli_is_conj( conja ) ) \
            { \
                for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
                { \
                    for ( dim_t i = 0; i < mnr; ++i ) \
                    { \
                        PASTEMAC(ch,copyjris)( src_re[ i*row_step ], src_im[ i*row_step ], dst_re[ i ], dst_im[ i ] ); \
                    } \
\
                    src_re += src_col_step; \
                    src_im += src_col_step; \
                    dst_re += dst_col_step; \
                    dst_im += dst_col_step; \
                } \
            } \
            else \
            { \
                for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
                { \
                    for ( dim_t i = 0; i < mnr; ++i ) \
                    { \
                        PASTEMAC(ch,copyris)( src_re[ i*row_step ], src_im[ i*row_step ], dst_re[ i ], dst_im[ i ] ); \
                    } \
\
                    src_re += src_col_step; \
                    src_im += src_col_step; \
                    dst_re += dst_col_step; \
                    dst_im += dst_col_step; \
                } \
            } \
        } \
        else \
        { \
            if ( bli_is_conj( conja ) ) \
            { \
                for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
                { \
                    for ( dim_t i = 0; i < mnr; ++i ) \
                    { \
                        PASTEMAC(ch,scal2jris)( *scale_re, *scale_im, src_re[ i*row_step ], src_im[ i*row_step ], dst_re[ i ], dst_im[ i ] ); \
                    } \
\
                    src_re += src_col_step; \
                    src_im += src_col_step; \
                    dst_re += dst_col_step; \
                    dst_im += dst_col_step; \
                } \
            } \
            else \
            { \
                for ( dim_t cols_left = n; cols_left != 0; --cols_left ) \
                { \
                    for ( dim_t i = 0; i < mnr; ++i ) \
                    { \
                        PASTEMAC(ch,scal2ris)( *scale_re, *scale_im, src_re[ i*row_step ], src_im[ i*row_step ], dst_re[ i ], dst_im[ i ] ); \
                    } \
\
                    src_re += src_col_step; \
                    src_im += src_col_step; \
                    dst_re += dst_col_step; \
                    dst_im += dst_col_step; \
                } \
            } \
        } \
    } \
\
    /* Trailing columns: everything from column n up to n_max, over all mnr
       rows, is passed to set1ms_mxn with the zero constant. */ \
    if ( n < n_max ) \
    { \
        ctype* restrict zero_elem = PASTEMAC(ch,0); \
        const dim_t     first_row = 0; \
        const dim_t     first_col = n; \
        const dim_t     pad_rows  = mnr; \
        const dim_t     pad_cols  = n_max - n; \
\
        PASTEMAC(ch,set1ms_mxn) \
        ( \
          schema, \
          first_row, \
          first_col, \
          pad_rows, \
          pad_cols, \
          zero_elem, \
          p, 1, ldp, ldp  \
        ); \
    } \
}

INSERT_GENTFUNCCO_BASIC3( packm_16xk_1er, 16, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
cython-blis-0.9.1/blis/_src/ref_kernels/1m/bli_packm_cxk_bb_ref.c000066400000000000000000000565021427272030600247130ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // -- 6xk, duplication factor 2 ------------------------------------------------ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict alpha1 = a; \ ctype* restrict pi1 = p; \ \ const dim_t dfac = 2; \ \ /* Handle the packing of B (column panel schemas) separately from packing of A (row panel schemas). 
*/ \ if ( bli_is_col_packed( schema ) ) \ { \ if ( cdim == mnr ) \ { \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 1) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 2) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 3) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 4) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 5) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 6) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 7) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 8) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 9) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 10) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 11) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else /* if ( bli_is_noconj( conja ) ) */ \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 1) ); \ PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 2) ); \ PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 3) ); \ PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 4) ); \ PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 5) ); \ PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 6) ); \ PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 7) ); \ PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 8) ); \ PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 9) ); \ PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 10) ); \ PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 11) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \ 
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 2) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 3) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 4) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 5) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 6) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 7) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 8) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 9) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else /* if ( bli_is_noconj( conja ) ) */ \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 2) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 3) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 4) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 5) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 6) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 7) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 8) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 9) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC(ch,scal2bbs_mxn) \ ( \ conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, dfac, ldp \ ); \ \ /* if ( cdim < mnr ) */ \ { \ const dim_t i = cdim; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ ctype* 
restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (i )*dfac; \ \ PASTEMAC(ch,set0bbs_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, dfac, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ const dim_t j = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (j )*ldp; \ \ PASTEMAC(ch,set0bbs_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, dfac, ldp \ ); \ } \ } \ else /* if ( bli_is_row_packed( schema ) ) */ \ { \ if ( cdim == mnr ) \ { \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else /* if ( bli_is_noconj( conja ) ) */ \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 
+ 5*inca), *(pi1 + 5) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else /* if ( bli_is_noconj( conja ) ) */ \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC(ch,scal2s_mxn) \ ( \ conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp \ ); \ \ /* if ( cdim < mnr ) */ \ { \ const dim_t i = cdim; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (i )*1; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ const dim_t j = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (j )*ldp; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } \ } INSERT_GENTFUNC_BASIC3( packm_6xk_bb2, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )

// -- 6xk, duplication factor 4 ------------------------------------------------

/*
   packm_6xk_bb4 reference kernel, instantiated per datatype by
   INSERT_GENTFUNC_BASIC3 below (mnr = 6, duplication factor dfac = 4).

   Column-packed schemas ( bli_is_col_packed( schema ) ):
     - cdim == mnr: for each of the n columns, source element i (read from
       a + i*inca) is written dfac times, to p + i*dfac + 0 .. i*dfac + dfac-1.
       Elements are copied (copys/copyjs) when eq1( *kappa ) is true and
       scaled while copying (scal2s/scal2js) otherwise; the "j" variants are
       chosen when bli_is_conj( conja ) holds.
     - cdim <  mnr: the slice is packed by scal2bbs_mxn and the remaining
       mnr - cdim entries (starting at p + cdim*dfac) are cleared by
       set0bbs_mxn across n_max columns.
     - n < n_max:   the trailing n_max - n columns (starting at p + n*ldp)
       are cleared by set0bbs_mxn.

   Row-packed schemas: the same three steps without duplication, i.e. element
   i lands at p + i, edge packing uses scal2s_mxn and clearing uses set0s_mxn.

   cntx is not referenced by this kernel.
*/
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     ) \
{ \
    /* Cursors to the current source column and destination panel column. */ \
    ctype* restrict src = a; \
    ctype* restrict dst = p; \
\
    const dim_t dfac = 4; \
\
    /* Handle the packing of B (column panel schemas) separately from packing
       of A (row panel schemas). */ \
    if ( bli_is_col_packed( schema ) ) \
    { \
        if ( cdim == mnr ) \
        { \
            if ( PASTEMAC(ch,eq1)( *kappa ) ) \
            { \
                if ( bli_is_conj( conja ) ) \
                { \
                    for ( dim_t col = 0; col < n; ++col ) \
                    { \
                        for ( dim_t row = 0; row < mnr; ++row ) \
                        { \
                            /* Broadcast one source element into dfac slots. */ \
                            for ( dim_t dup = 0; dup < dfac; ++dup ) \
                            { \
                                PASTEMAC(ch,copyjs)( *(src + row*inca), *(dst + row*dfac + dup) ); \
                            } \
                        } \
                        src += lda; \
                        dst += ldp; \
                    } \
                } \
                else /* noconj */ \
                { \
                    for ( dim_t col = 0; col < n; ++col ) \
                    { \
                        for ( dim_t row = 0; row < mnr; ++row ) \
                        { \
                            for ( dim_t dup = 0; dup < dfac; ++dup ) \
                            { \
                                PASTEMAC(ch,copys)( *(src + row*inca), *(dst + row*dfac + dup) ); \
                            } \
                        } \
                        src += lda; \
                        dst += ldp; \
                    } \
                } \
            } \
            else /* kappa is not one: scale while copying */ \
            { \
                if ( bli_is_conj( conja ) ) \
                { \
                    for ( dim_t col = 0; col < n; ++col ) \
                    { \
                        for ( dim_t row = 0; row < mnr; ++row ) \
                        { \
                            for ( dim_t dup = 0; dup < dfac; ++dup ) \
                            { \
                                PASTEMAC(ch,scal2js)( *kappa, *(src + row*inca), *(dst + row*dfac + dup) ); \
                            } \
                        } \
                        src += lda; \
                        dst += ldp; \
                    } \
                } \
                else /* noconj */ \
                { \
                    for ( dim_t col = 0; col < n; ++col ) \
                    { \
                        for ( dim_t row = 0; row < mnr; ++row ) \
                        { \
                            for ( dim_t dup = 0; dup < dfac; ++dup ) \
                            { \
                                PASTEMAC(ch,scal2s)( *kappa, *(src + row*inca), *(dst + row*dfac + dup) ); \
                            } \
                        } \
                        src += lda; \
                        dst += ldp; \
                    } \
                } \
            } \
        } \
        else /* cdim < mnr */ \
        { \
            PASTEMAC(ch,scal2bbs_mxn) \
            ( \
              conja, \
              cdim, \
              n, \
              kappa, \
              a, inca, lda, \
              p, dfac, ldp  \
            ); \
\
            /* Clear the mnr - cdim entries the source did not supply. */ \
            PASTEMAC(ch,set0bbs_mxn) \
            ( \
              mnr - cdim, \
              n_max, \
              p + cdim*dfac, dfac, ldp  \
            ); \
        } \
\
        if ( n < n_max ) \
        { \
            /* Clear the columns between n and n_max. */ \
            PASTEMAC(ch,set0bbs_mxn) \
            ( \
              mnr, \
              n_max - n, \
              p + n*ldp, dfac, ldp  \
            ); \
        } \
    } \
    else /* row-packed schema */ \
    { \
        if ( cdim == mnr ) \
        { \
            if ( PASTEMAC(ch,eq1)( *kappa ) ) \
            { \
                if ( bli_is_conj( conja ) ) \
                { \
                    for ( dim_t col = 0; col < n; ++col ) \
                    { \
                        for ( dim_t row = 0; row < mnr; ++row ) \
                        { \
                            PASTEMAC(ch,copyjs)( *(src + row*inca), *(dst + row) ); \
                        } \
                        src += lda; \
                        dst += ldp; \
                    } \
                } \
                else /* noconj */ \
                { \
                    for ( dim_t col = 0; col < n; ++col ) \
                    { \
                        for ( dim_t row = 0; row < mnr; ++row ) \
                        { \
                            PASTEMAC(ch,copys)( *(src + row*inca), *(dst + row) ); \
                        } \
                        src += lda; \
                        dst += ldp; \
                    } \
                } \
            } \
            else /* kappa is not one: scale while copying */ \
            { \
                if ( bli_is_conj( conja ) ) \
                { \
                    for ( dim_t col = 0; col < n; ++col ) \
                    { \
                        for ( dim_t row = 0; row < mnr; ++row ) \
                        { \
                            PASTEMAC(ch,scal2js)( *kappa, *(src + row*inca), *(dst + row) ); \
                        } \
                        src += lda; \
                        dst += ldp; \
                    } \
                } \
                else /* noconj */ \
                { \
                    for ( dim_t col = 0; col < n; ++col ) \
                    { \
                        for ( dim_t row = 0; row < mnr; ++row ) \
                        { \
                            PASTEMAC(ch,scal2s)( *kappa, *(src + row*inca), *(dst + row) ); \
                        } \
                        src += lda; \
                        dst += ldp; \
                    } \
                } \
            } \
        } \
        else /* cdim < mnr */ \
        { \
            PASTEMAC(ch,scal2s_mxn) \
            ( \
              conja, \
              cdim, \
              n, \
              kappa, \
              a, inca, lda, \
              p, 1,    ldp  \
            ); \
\
            /* Clear the mnr - cdim entries the source did not supply. */ \
            PASTEMAC(ch,set0s_mxn) \
            ( \
              mnr - cdim, \
              n_max, \
              p + cdim, 1, ldp  \
            ); \
        } \
\
        if ( n < n_max ) \
        { \
            /* Clear the columns between n and n_max. */ \
            PASTEMAC(ch,set0s_mxn) \
            ( \
              mnr, \
              n_max - n, \
              p + n*ldp, 1, ldp  \
            ); \
        } \
    } \
}

INSERT_GENTFUNC_BASIC3( packm_6xk_bb4, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
cython-blis-0.9.1/blis/_src/ref_kernels/1m/bli_packm_cxk_ref.c000066400000000000000000001543311427272030600242470ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict alpha1 = a; \ ctype* restrict pi1 = p; \ \ dim_t n_iter = n / 4; \ dim_t n_left = n % 4; \ \ if ( cdim == mnr ) \ { \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else \ { \ for ( ; n_iter != 0; --n_iter ) \ { \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \ \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \ \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 2*lda), *(pi1 + 0 + 2*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 2*lda), *(pi1 + 1 + 2*ldp) ); \ \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 3*lda), *(pi1 + 0 + 3*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 3*lda), *(pi1 + 1 + 3*ldp) ); \ \ alpha1 += 4*lda; \ pi1 += 4*ldp; 
\ } \ \ for ( ; n_left != 0; --n_left ) \ { \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ ( \ 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ ( trans_t )conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, \ cntx, \ NULL \ ); \ \ /* if ( cdim < mnr ) */ \ { \ const dim_t i = cdim; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (i )*1; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ const dim_t j = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (j )*ldp; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } INSERT_GENTFUNC_BASIC3( packm_2xk, 2, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict alpha1 = a; \ ctype* restrict pi1 = p; \ \ dim_t n_iter = n / 4; \ dim_t n_left = n 
% 4; \ \ if ( cdim == mnr ) \ { \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else \ { \ for ( ; n_iter != 0; --n_iter ) \ { \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 0*lda), *(pi1 + 2 + 0*ldp) ); \ \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 1*lda), *(pi1 + 2 + 1*ldp) ); \ \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 2*lda), *(pi1 + 0 + 2*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 2*lda), *(pi1 + 1 + 2*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 2*lda), *(pi1 + 2 + 2*ldp) ); \ \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 3*lda), *(pi1 + 0 + 3*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 3*lda), *(pi1 + 1 + 3*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 3*lda), *(pi1 + 2 + 3*ldp) ); \ \ alpha1 += 4*lda; \ pi1 += 4*ldp; \ } \ \ for ( ; n_left != 0; --n_left ) \ { \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ \ alpha1 += lda; \ pi1 += ldp; \ 
} \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ ( \ 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ ( trans_t )conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, \ cntx, \ NULL \ ); \ \ /* if ( cdim < mnr ) */ \ { \ const dim_t i = cdim; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (i )*1; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ const dim_t j = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (j )*ldp; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } INSERT_GENTFUNC_BASIC3( packm_3xk, 3, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict alpha1 = a; \ ctype* restrict pi1 = p; \ \ dim_t n_iter = n / 2; \ dim_t n_left = n % 2; \ \ if ( cdim == mnr ) \ { \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), 
*(pi1 + 3) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else \ { \ for ( ; n_iter != 0; --n_iter ) \ { \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 0*lda), *(pi1 + 2 + 0*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca + 0*lda), *(pi1 + 3 + 0*ldp) ); \ \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 1*lda), *(pi1 + 2 + 1*ldp) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca + 1*lda), *(pi1 + 3 + 1*ldp) ); \ \ alpha1 += 2*lda; \ pi1 += 2*ldp; \ } \ \ for ( ; n_left != 0; --n_left ) \ { \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) 
\ ( \ 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ ( trans_t )conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, \ cntx, \ NULL \ ); \ \ /* if ( cdim < mnr ) */ \ { \ const dim_t i = cdim; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (i )*1; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ const dim_t j = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (j )*ldp; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } INSERT_GENTFUNC_BASIC3( packm_4xk, 4, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict alpha1 = a; \ ctype* restrict pi1 = p; \ \ if ( cdim == mnr ) \ { \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,copys)( *(alpha1 + 
4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ ( \ 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ ( trans_t )conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, \ cntx, \ NULL \ ); \ \ /* if ( cdim < mnr ) */ \ { \ const dim_t i = cdim; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (i )*1; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ const dim_t j = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (j )*ldp; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } INSERT_GENTFUNC_BASIC3( packm_6xk, 6, 
BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )

/*
   packm_8xk reference kernel, instantiated per datatype by
   INSERT_GENTFUNC_BASIC3 below (mnr = 8).

   Packs a cdim x n slice of a (element (i,j) read from a + i*inca + j*lda)
   into the panel p (element (i,j) written to p + i + j*ldp).

     - cdim == mnr: elements are copied (copys/copyjs) when eq1( *kappa ) is
       true and scaled while copying (scal2s/scal2js) otherwise; the "j"
       variants are chosen when bli_is_conj( conja ) holds.
     - cdim <  mnr: the slice is packed by scal2m and the remaining
       mnr - cdim entries (starting at p + cdim) are cleared by set0s_mxn
       across n_max columns.
     - n < n_max:   the trailing n_max - n columns (starting at p + n*ldp)
       are cleared by set0s_mxn.

   schema is not referenced here; cntx is only forwarded to scal2m.
*/
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     ) \
{ \
    /* Cursors to the current source column and destination panel column. */ \
    ctype* restrict src = a; \
    ctype* restrict dst = p; \
\
    if ( cdim == mnr ) \
    { \
        if ( PASTEMAC(ch,eq1)( *kappa ) ) \
        { \
            if ( bli_is_conj( conja ) ) \
            { \
                for ( dim_t col = 0; col < n; ++col ) \
                { \
                    for ( dim_t row = 0; row < mnr; ++row ) \
                    { \
                        PASTEMAC(ch,copyjs)( *(src + row*inca), *(dst + row) ); \
                    } \
                    src += lda; \
                    dst += ldp; \
                } \
            } \
            else /* noconj */ \
            { \
                for ( dim_t col = 0; col < n; ++col ) \
                { \
                    for ( dim_t row = 0; row < mnr; ++row ) \
                    { \
                        PASTEMAC(ch,copys)( *(src + row*inca), *(dst + row) ); \
                    } \
                    src += lda; \
                    dst += ldp; \
                } \
            } \
        } \
        else /* kappa is not one: scale while copying */ \
        { \
            if ( bli_is_conj( conja ) ) \
            { \
                for ( dim_t col = 0; col < n; ++col ) \
                { \
                    for ( dim_t row = 0; row < mnr; ++row ) \
                    { \
                        PASTEMAC(ch,scal2js)( *kappa, *(src + row*inca), *(dst + row) ); \
                    } \
                    src += lda; \
                    dst += ldp; \
                } \
            } \
            else /* noconj */ \
            { \
                for ( dim_t col = 0; col < n; ++col ) \
                { \
                    for ( dim_t row = 0; row < mnr; ++row ) \
                    { \
                        PASTEMAC(ch,scal2s)( *kappa, *(src + row*inca), *(dst + row) ); \
                    } \
                    src += lda; \
                    dst += ldp; \
                } \
            } \
        } \
    } \
    else /* cdim < mnr */ \
    { \
        PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
        ( \
          0, \
          BLIS_NONUNIT_DIAG, \
          BLIS_DENSE, \
          ( trans_t )conja, \
          cdim, \
          n, \
          kappa, \
          a, inca, lda, \
          p, 1,    ldp, \
          cntx, \
          NULL  \
        ); \
\
        /* Clear the mnr - cdim entries the source did not supply. */ \
        PASTEMAC(ch,set0s_mxn) \
        ( \
          mnr - cdim, \
          n_max, \
          p + cdim, 1, ldp  \
        ); \
    } \
\
    if ( n < n_max ) \
    { \
        /* Clear the columns between n and n_max. */ \
        PASTEMAC(ch,set0s_mxn) \
        ( \
          mnr, \
          n_max - n, \
          p + n*ldp, 1, ldp  \
        ); \
    } \
}

INSERT_GENTFUNC_BASIC3( packm_8xk, 8, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )

/*
   packm_10xk reference kernel, instantiated per datatype by
   INSERT_GENTFUNC_BASIC3 below (mnr = 10).

   Packs a cdim x n slice of a (element (i,j) read from a + i*inca + j*lda)
   into the panel p (element (i,j) written to p + i + j*ldp).

     - cdim == mnr: elements are copied (copys/copyjs) when eq1( *kappa ) is
       true and scaled while copying (scal2s/scal2js) otherwise; the "j"
       variants are chosen when bli_is_conj( conja ) holds.
     - cdim <  mnr: the slice is packed by scal2m and the remaining
       mnr - cdim entries (starting at p + cdim) are cleared by set0s_mxn
       across n_max columns.
     - n < n_max:   the trailing n_max - n columns (starting at p + n*ldp)
       are cleared by set0s_mxn.

   schema is not referenced here; cntx is only forwarded to scal2m.
*/
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       conj_t           conja, \
       pack_t           schema, \
       dim_t            cdim, \
       dim_t            n, \
       dim_t            n_max, \
       ctype*  restrict kappa, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict p,             inc_t ldp, \
       cntx_t* restrict cntx  \
     ) \
{ \
    /* Cursors to the current source column and destination panel column. */ \
    ctype* restrict src = a; \
    ctype* restrict dst = p; \
\
    if ( cdim == mnr ) \
    { \
        if ( PASTEMAC(ch,eq1)( *kappa ) ) \
        { \
            if ( bli_is_conj( conja ) ) \
            { \
                for ( dim_t col = 0; col < n; ++col ) \
                { \
                    for ( dim_t row = 0; row < mnr; ++row ) \
                    { \
                        PASTEMAC(ch,copyjs)( *(src + row*inca), *(dst + row) ); \
                    } \
                    src += lda; \
                    dst += ldp; \
                } \
            } \
            else /* noconj */ \
            { \
                for ( dim_t col = 0; col < n; ++col ) \
                { \
                    for ( dim_t row = 0; row < mnr; ++row ) \
                    { \
                        PASTEMAC(ch,copys)( *(src + row*inca), *(dst + row) ); \
                    } \
                    src += lda; \
                    dst += ldp; \
                } \
            } \
        } \
        else /* kappa is not one: scale while copying */ \
        { \
            if ( bli_is_conj( conja ) ) \
            { \
                for ( dim_t col = 0; col < n; ++col ) \
                { \
                    for ( dim_t row = 0; row < mnr; ++row ) \
                    { \
                        PASTEMAC(ch,scal2js)( *kappa, *(src + row*inca), *(dst + row) ); \
                    } \
                    src += lda; \
                    dst += ldp; \
                } \
            } \
            else /* noconj */ \
            { \
                for ( dim_t col = 0; col < n; ++col ) \
                { \
                    for ( dim_t row = 0; row < mnr; ++row ) \
                    { \
                        PASTEMAC(ch,scal2s)( *kappa, *(src + row*inca), *(dst + row) ); \
                    } \
                    src += lda; \
                    dst += ldp; \
                } \
            } \
        } \
    } \
    else /* cdim < mnr */ \
    { \
        PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
        ( \
          0, \
          BLIS_NONUNIT_DIAG, \
          BLIS_DENSE, \
          ( trans_t )conja, \
          cdim, \
          n, \
          kappa, \
          a, inca, lda, \
          p, 1,    ldp, \
          cntx, \
          NULL  \
        ); \
\
        /* Clear the mnr - cdim entries the source did not supply. */ \
        PASTEMAC(ch,set0s_mxn) \
        ( \
          mnr - cdim, \
          n_max, \
          p + cdim, 1, ldp  \
        ); \
    } \
\
    if ( n < n_max ) \
    { \
        /* Clear the columns between n and n_max. */ \
        PASTEMAC(ch,set0s_mxn) \
        ( \
          mnr, \
          n_max - n, \
          p + n*ldp, 1, ldp  \
        ); \
    } \
}

INSERT_GENTFUNC_BASIC3( packm_10xk, 10, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
#undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict alpha1 = a; \ ctype* restrict pi1 = p; \ \ if ( cdim == mnr ) \ { \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyjs)( *(alpha1 +
0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ PASTEMAC(ch,scal2js)( 
*kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ ( \ 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ ( trans_t )conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, \ cntx, \ NULL \ ); \ \ /* if ( cdim < mnr ) */ \ { \ const dim_t i = cdim; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (i )*1; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ const dim_t j = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ ctype* restrict 
p_cast = p; \ ctype* restrict p_edge = p_cast + (j )*ldp; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } INSERT_GENTFUNC_BASIC3( packm_12xk, 12, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict alpha1 = a; \ ctype* restrict pi1 = p; \ \ if ( cdim == mnr ) \ { \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); 
\ PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), 
*(pi1 + 4) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ ( \ 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ ( trans_t )conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, \ cntx, \ NULL \ ); \ \ /* if ( cdim < mnr ) */ \ { \ const dim_t i = cdim; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (i )*1; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ const dim_t j = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (j )*ldp; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } INSERT_GENTFUNC_BASIC3( packm_14xk, 14, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict alpha1 = a; \ ctype* restrict pi1 = p; \ \ if ( cdim == mnr ) \ { \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) 
) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +14*inca), *(pi1 +14) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +15*inca), *(pi1 +15) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ PASTEMAC(ch,copys)( *(alpha1 +14*inca), *(pi1 +14) ); \ PASTEMAC(ch,copys)( *(alpha1 +15*inca), *(pi1 +15) ); \ \ alpha1 += lda; \ 
pi1 += ldp; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,scal2s)( 
*kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ ( \ 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ ( trans_t )conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, \ cntx, \ NULL \ ); \ \ /* if ( cdim < mnr ) */ \ { \ const dim_t i = cdim; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (i )*1; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ const dim_t j = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (j )*ldp; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } INSERT_GENTFUNC_BASIC3( packm_16xk, 16, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ pack_t schema, \ dim_t cdim, \ dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict alpha1 = a; \ ctype* restrict pi1 = p; \ \ if ( cdim == mnr ) \ { \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 
1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +14*inca), *(pi1 +14) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +15*inca), *(pi1 +15) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +16*inca), *(pi1 +16) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +17*inca), *(pi1 +17) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +18*inca), *(pi1 +18) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +19*inca), *(pi1 +19) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +20*inca), *(pi1 +20) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +21*inca), *(pi1 +21) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +22*inca), *(pi1 +22) ); \ PASTEMAC(ch,copyjs)( *(alpha1 +23*inca), *(pi1 +23) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ 
PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ PASTEMAC(ch,copys)( *(alpha1 +14*inca), *(pi1 +14) ); \ PASTEMAC(ch,copys)( *(alpha1 +15*inca), *(pi1 +15) ); \ PASTEMAC(ch,copys)( *(alpha1 +16*inca), *(pi1 +16) ); \ PASTEMAC(ch,copys)( *(alpha1 +17*inca), *(pi1 +17) ); \ PASTEMAC(ch,copys)( *(alpha1 +18*inca), *(pi1 +18) ); \ PASTEMAC(ch,copys)( *(alpha1 +19*inca), *(pi1 +19) ); \ PASTEMAC(ch,copys)( *(alpha1 +20*inca), *(pi1 +20) ); \ PASTEMAC(ch,copys)( *(alpha1 +21*inca), *(pi1 +21) ); \ PASTEMAC(ch,copys)( *(alpha1 +22*inca), *(pi1 +22) ); \ PASTEMAC(ch,copys)( *(alpha1 +23*inca), *(pi1 +23) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ 
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +16*inca), *(pi1 +16) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +17*inca), *(pi1 +17) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +18*inca), *(pi1 +18) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +19*inca), *(pi1 +19) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +20*inca), *(pi1 +20) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +21*inca), *(pi1 +21) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +22*inca), *(pi1 +22) ); \ PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +23*inca), *(pi1 +23) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +16*inca), *(pi1 +16) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +17*inca), *(pi1 +17) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +18*inca), *(pi1 +18) ); \ 
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +19*inca), *(pi1 +19) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +20*inca), *(pi1 +20) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +21*inca), *(pi1 +21) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +22*inca), *(pi1 +22) ); \ PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +23*inca), *(pi1 +23) ); \ \ alpha1 += lda; \ pi1 += ldp; \ } \ } \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ ( \ 0, \ BLIS_NONUNIT_DIAG, \ BLIS_DENSE, \ ( trans_t )conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ p, 1, ldp, \ cntx, \ NULL \ ); \ \ /* if ( cdim < mnr ) */ \ { \ const dim_t i = cdim; \ const dim_t m_edge = mnr - cdim; \ const dim_t n_edge = n_max; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (i )*1; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } \ \ if ( n < n_max ) \ { \ const dim_t j = n; \ const dim_t m_edge = mnr; \ const dim_t n_edge = n_max - n; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (j )*ldp; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ m_edge, \ n_edge, \ p_edge, 1, ldp \ ); \ } \ } INSERT_GENTFUNC_BASIC3( packm_24xk, 24, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/1m/bli_unpackm_cxk_ref.c000066400000000000000000000754651427272030600246240ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjp, \ dim_t n, \ void* restrict kappa, \ void* restrict p, inc_t ldp, \ void* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict pi1 = p; \ ctype* restrict alpha1 = a; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( 
*kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( unpackm_2xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjp, \ dim_t n, \ void* restrict kappa, \ void* restrict p, inc_t ldp, \ void* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict pi1 = p; \ ctype* restrict alpha1 = a; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 
0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( unpackm_4xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjp, \ dim_t n, \ void* restrict kappa, \ void* restrict p, inc_t ldp, \ void* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict pi1 = p; \ ctype* restrict alpha1 = a; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 
2*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( unpackm_6xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjp, \ dim_t n, \ void* restrict kappa, \ void* restrict p, inc_t ldp, \ void* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict pi1 = p; \ ctype* restrict alpha1 = a; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 
0*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( unpackm_8xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef 
GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjp, \ dim_t n, \ void* restrict kappa, \ void* restrict p, inc_t ldp, \ void* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict pi1 = p; \ ctype* restrict alpha1 = a; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 
1), *(alpha1 + 1*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( unpackm_10xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjp, \ dim_t n, \ void* restrict kappa, \ void* restrict p, inc_t ldp, \ void* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict pi1 = p; \ ctype* restrict alpha1 = a; \ \ if ( 
PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ 
PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( unpackm_12xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjp, \ dim_t n, \ void* restrict kappa, \ 
void* restrict p, inc_t ldp, \ void* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict pi1 = p; \ ctype* restrict alpha1 = a; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ PASTEMAC2(ch,ch,copys)( 
*(pi1 + 12), *(alpha1 + 12*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 
+ 7*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( unpackm_14xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conjp, \ dim_t n, \ void* restrict kappa, \ void* restrict p, inc_t ldp, \ void* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ) \ { \ ctype* restrict kappa_cast = kappa; \ ctype* restrict pi1 = p; \ ctype* restrict alpha1 = a; \ \ if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 14), *(alpha1 + 14*inca) ); 
\ PASTEMAC2(ch,ch,copyjs)( *(pi1 + 15), *(alpha1 + 15*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 14), *(alpha1 + 14*inca) ); \ PASTEMAC2(ch,ch,copys)( *(pi1 + 15), *(alpha1 + 15*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ else \ { \ if ( bli_is_conj( conjp ) ) \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 
+ 9*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 14), *(alpha1 + 14*inca) ); \ PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 15), *(alpha1 + 15*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ else \ { \ for ( ; n != 0; --n ) \ { \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 14), *(alpha1 + 14*inca) ); \ PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 15), *(alpha1 + 15*inca) ); \ \ pi1 += ldp; \ alpha1 += lda; \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( unpackm_16xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) 
cython-blis-0.9.1/blis/_src/ref_kernels/3/000077500000000000000000000000001427272030600202775ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/ref_kernels/3/bb/000077500000000000000000000000001427272030600206625ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/ref_kernels/3/bb/bli_gemmbb_ref.c000066400000000000000000000106651427272030600237510ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" // An implementation that indexes through B with the assumption that all // elements were broadcast (duplicated) by a factor of NP/NR. #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ \ const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ const inc_t cs_a = packmr; \ \ const inc_t rs_b = packnr; \ \ /* Assume that the degree of duplication is equal to packnr / nr. */ \ const inc_t cs_b = packnr / nr; \ \ ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const inc_t rs_ab = 1; \ const inc_t cs_ab = mr; \ \ dim_t l, j, i; \ \ ctype ai; \ ctype bj; \ \ \ /* Initialize the accumulator elements in ab to zero. */ \ for ( i = 0; i < m * n; ++i ) \ { \ PASTEMAC(ch,set0s)( *(ab + i) ); \ } \ \ /* Perform a series of k rank-1 updates into ab. */ \ for ( l = 0; l < k; ++l ) \ { \ ctype* restrict abij = ab; \ \ /* In an optimized implementation, these two loops over MR and NR are typically fully unrolled. */ \ for ( j = 0; j < n; ++j ) \ { \ bj = *(b + j*cs_b); \ \ for ( i = 0; i < m; ++i ) \ { \ ai = *(a + i); \ \ PASTEMAC(ch,dots)( ai, bj, *abij ); \ \ abij += rs_ab; \ } \ } \ \ a += cs_a; \ b += rs_b; \ } \ \ /* Scale the result in ab by alpha. */ \ for ( i = 0; i < m * n; ++i ) \ { \ PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \ } \ \ /* If beta is zero, overwrite c with the scaled result in ab. 
Otherwise, scale by beta and then add the scaled redult in ab. */ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,copys_mxn)( m, \ n, \ ab, rs_ab, cs_ab, \ c, rs_c, cs_c ); \ } \ else \ { \ PASTEMAC(ch,xpbys_mxn)( m, \ n, \ ab, rs_ab, cs_ab, \ beta, \ c, rs_c, cs_c ); \ } \ } INSERT_GENTFUNC_BASIC2( gemmbb, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c000066400000000000000000000111751427272030600246540ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // An implementation that indexes through B with the assumption that all // elements were broadcast (duplicated) by a factor of NP/NR. #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, trsmkerid ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ ctype* restrict a11, \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ const inc_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const inc_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ \ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ const inc_t rs_b = packnr; \ \ /* Assume that the degree of duplication is equal to packnr / nr. 
*/ \ const inc_t cs_b = packnr / nr; \ /* printf( "bli_gemmtrsmbb_ref(): cs_b = %d\n", (int)cs_b ); \ printf( "bli_gemmtrsmbb_ref(): k nr = %d %d\n", (int)k, (int)nr ); \ */ \ \ ctype* minus_one = PASTEMAC(ch,m1); \ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ PASTECH(ch,trsm_ukr_ft) \ trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \ \ /* PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b01", k, nr, \ (double*)bx1, rs_b, cs_b, "%5.2f", "" ); \ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \ (double*)b11, rs_b, 1, "%5.2f", "" ); \ */ \ \ /* lower: b11 = alpha * b11 - a10 * b01; */ \ /* upper: b11 = alpha * b11 - a12 * b21; */ \ gemm_ukr \ ( \ mr, \ nr, \ k, \ minus_one, \ a1x, \ bx1, \ alpha, \ b11, rs_b, cs_b, \ data, \ cntx \ ); \ /* PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after gemm", mr, 2*nr, \ (double*)b11, rs_b, 1, "%5.2f", "" ); \ */ \ \ /* b11 = inv(a11) * b11; c11 = b11; */ \ trsm_ukr \ ( \ a11, \ b11, \ c11, rs_c, cs_c, \ data, \ cntx \ ); \ /* PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after trsm", mr, 2*nr, \ (double*)b11, rs_b, 1, "%5.2f", "" ); \ */ \ \ /* Broadcast the elements of the updated b11 submatrix to their duplicated neighbors. 
*/ \ PASTEMAC(ch,bcastbbs_mxn) \ ( \ mr, \ nr, \ b11, rs_b, cs_b \ ); \ \ /* PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_r after", k+3, 8, \ ( double* )b01, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_i after", k+3, 8, \ ( double* )b01 + 1, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \ */ \ } INSERT_GENTFUNC_BASIC3( gemmtrsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR ) INSERT_GENTFUNC_BASIC3( gemmtrsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR ) cython-blis-0.9.1/blis/_src/ref_kernels/3/bb/bli_trsmbb_ref.c000066400000000000000000000164251427272030600240110ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // An implementation that indexes through B with the assumption that all // elements were broadcast (duplicated) by a factor of NP/NR. #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ \ const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ const dim_t m = mr; \ const dim_t n = nr; \ \ const inc_t rs_a = 1; \ const inc_t cs_a = packmr; \ \ const inc_t rs_b = packnr; \ \ /* Assume that the degree of duplication is equal to packnr / nr. 
*/ \ const inc_t cs_b = packnr / nr; \ \ dim_t iter, i, j, l; \ dim_t n_behind; \ \ for ( iter = 0; iter < m; ++iter ) \ { \ i = iter; \ n_behind = i; \ \ ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ ctype* restrict a10t = a + (i )*rs_a + (0 )*cs_a; \ ctype* restrict B0 = b + (0 )*rs_b + (0 )*cs_b; \ ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \ \ /* b1 = b1 - a10t * B0; */ \ /* b1 = b1 / alpha11; */ \ for ( j = 0; j < n; ++j ) \ { \ ctype* restrict b01 = B0 + (0 )*rs_b + (j )*cs_b; \ ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \ ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ ctype beta11c = *beta11; \ ctype rho11; \ \ /* beta11 = beta11 - a10t * b01; */ \ PASTEMAC(ch,set0s)( rho11 ); \ for ( l = 0; l < n_behind; ++l ) \ { \ ctype* restrict alpha10 = a10t + (l )*cs_a; \ ctype* restrict beta01 = b01 + (l )*rs_b; \ \ PASTEMAC(ch,axpys)( *alpha10, *beta01, rho11 ); \ } \ PASTEMAC(ch,subs)( rho11, beta11c ); \ \ /* beta11 = beta11 / alpha11; */ \ /* NOTE: When preinversion is enabled, the INVERSE of alpha11 (1.0/alpha11) is stored during packing instead alpha11 so we can multiply rather than divide. When preinversion is disabled, alpha11 is stored and division happens below explicitly. */ \ PASTEMAC(ch,scals)( *alpha11, beta11c ); \ \ /* Output final result to matrix c. */ \ PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ \ /* Store the local value back to b11. 
*/ \ PASTEMAC(ch,copys)( beta11c, *beta11 ); \ } \ } \ } #ifdef BLIS_ENABLE_TRSM_PREINVERSION INSERT_GENTFUNC_BASIC3( trsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals ) #else INSERT_GENTFUNC_BASIC3( trsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals ) #endif #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ \ const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ const dim_t m = mr; \ const dim_t n = nr; \ \ const inc_t rs_a = 1; \ const inc_t cs_a = packmr; \ \ const inc_t rs_b = packnr; \ \ /* Assume that the degree of duplication is equal to packnr / nr. 
*/ \ const inc_t cs_b = packnr / nr; \ \ dim_t iter, i, j, l; \ dim_t n_behind; \ \ for ( iter = 0; iter < m; ++iter ) \ { \ i = m - iter - 1; \ n_behind = iter; \ \ ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ ctype* restrict a12t = a + (i )*rs_a + (i+1)*cs_a; \ ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \ ctype* restrict B2 = b + (i+1)*rs_b + (0 )*cs_b; \ \ /* b1 = b1 - a12t * B2; */ \ /* b1 = b1 / alpha11; */ \ for ( j = 0; j < n; ++j ) \ { \ ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \ ctype* restrict b21 = B2 + (0 )*rs_b + (j )*cs_b; \ ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ ctype beta11c = *beta11; \ ctype rho11; \ \ /* beta11 = beta11 - a12t * b21; */ \ PASTEMAC(ch,set0s)( rho11 ); \ for ( l = 0; l < n_behind; ++l ) \ { \ ctype* restrict alpha12 = a12t + (l )*cs_a; \ ctype* restrict beta21 = b21 + (l )*rs_b; \ \ PASTEMAC(ch,axpys)( *alpha12, *beta21, rho11 ); \ } \ PASTEMAC(ch,subs)( rho11, beta11c ); \ \ /* beta11 = beta11 / alpha11; */ \ /* NOTE: When preinversion is enabled, the INVERSE of alpha11 (1.0/alpha11) is stored during packing instead alpha11 so we can multiply rather than divide. When preinversion is disabled, alpha11 is stored and division happens below explicitly. */ \ PASTEMAC(ch,diagop)( *alpha11, beta11c ); \ \ /* Output final result to matrix c. */ \ PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ \ /* Store the local value back to b11. */ \ PASTEMAC(ch,copys)( beta11c, *beta11 ); \ } \ } \ } #ifdef BLIS_ENABLE_TRSM_PREINVERSION INSERT_GENTFUNC_BASIC3( trsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals ) #else INSERT_GENTFUNC_BASIC3( trsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals ) #endif cython-blis-0.9.1/blis/_src/ref_kernels/3/bli_gemm_ref.c000066400000000000000000000165051427272030600230610ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #if 1 // An implementation that attempts to facilitate emission of vectorized // instructions via constant loop bounds + #pragma omp simd directives. 
#undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, mr, nr ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const inc_t rs_ab = nr; \ const inc_t cs_ab = 1; \ \ const inc_t cs_a = mr; \ const inc_t rs_b = nr; \ \ \ /* Initialize the accumulator elements in ab to zero. */ \ PRAGMA_SIMD \ for ( dim_t i = 0; i < mr * nr; ++i ) \ { \ PASTEMAC(ch,set0s)( ab[ i ] ); \ } \ \ /* Perform a series of k rank-1 updates into ab. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ for ( dim_t i = 0; i < mr; ++i ) \ { \ PRAGMA_SIMD \ for ( dim_t j = 0; j < nr; ++j ) \ { \ PASTEMAC(ch,dots) \ ( \ a[ i ], \ b[ j ], \ ab[ i*rs_ab + j*cs_ab ] \ ); \ } \ } \ \ a += cs_a; \ b += rs_b; \ } \ \ /* Scale the result in ab by alpha. */ \ PRAGMA_SIMD \ for ( dim_t i = 0; i < mr * nr; ++i ) \ { \ PASTEMAC(ch,scals)( *alpha, ab[ i ] ); \ } \ \ /* Output/accumulate intermediate result ab based on the storage of c and the value of beta. */ \ if ( cs_c == 1 ) \ { \ /* C is row-stored. */ \ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ for ( dim_t i = 0; i < m; ++i ) \ for ( dim_t j = 0; j < n; ++j ) \ PASTEMAC(ch,copys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ c [ i*rs_c + j*1 ] \ ); \ } \ else \ { \ for ( dim_t i = 0; i < m; ++i ) \ for ( dim_t j = 0; j < n; ++j ) \ PASTEMAC(ch,xpbys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ *beta, \ c [ i*rs_c + j*1 ] \ ); \ } \ } \ else \ { \ /* C is column-stored or general-stored. 
*/ \ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ for ( dim_t i = 0; i < m; ++i ) \ PASTEMAC(ch,copys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ c [ i*rs_c + j*cs_c ] \ ); \ } \ else \ { \ for ( dim_t j = 0; j < n; ++j ) \ for ( dim_t i = 0; i < m; ++i ) \ PASTEMAC(ch,xpbys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ *beta, \ c [ i*rs_c + j*cs_c ] \ ); \ } \ } \ } //INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) GENTFUNC( float, s, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 16 ) GENTFUNC( double, d, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 ) GENTFUNC( scomplex, c, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 ) GENTFUNC( dcomplex, z, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 4 ) #else // An implementation that uses variable loop bounds (queried from the context) // and makes no use of #pragma omp simd. #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ \ const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ const inc_t cs_a = packmr; \ \ const inc_t rs_b = packnr; \ \ ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const inc_t rs_ab = 1; \ const inc_t cs_ab = mr; \ \ dim_t l, j, i; \ \ ctype ai; \ ctype bj; \ \ \ /* Initialize the accumulator elements in ab to zero. */ \ for ( i = 0; i < m * n; ++i ) \ { \ PASTEMAC(ch,set0s)( *(ab + i) ); \ } \ \ /* Perform a series of k rank-1 updates into ab. 
*/ \ for ( l = 0; l < k; ++l ) \ { \ ctype* restrict abij = ab; \ \ /* In an optimized implementation, these two loops over MR and NR are typically fully unrolled. */ \ for ( j = 0; j < n; ++j ) \ { \ bj = *(b + j); \ \ for ( i = 0; i < m; ++i ) \ { \ ai = *(a + i); \ \ PASTEMAC(ch,dots)( ai, bj, *abij ); \ \ abij += rs_ab; \ } \ } \ \ a += cs_a; \ b += rs_b; \ } \ \ /* Scale the result in ab by alpha. */ \ for ( i = 0; i < m * n; ++i ) \ { \ PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \ } \ \ /* If beta is zero, overwrite c with the scaled result in ab. Otherwise, scale by beta and then add the scaled redult in ab. */ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,copys_mxn)( m, \ n, \ ab, rs_ab, cs_ab, \ c, rs_c, cs_c ); \ } \ else \ { \ PASTEMAC(ch,xpbys_mxn)( m, \ n, \ ab, rs_ab, cs_ab, \ beta, \ c, rs_c, cs_c ); \ } \ } INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #endif cython-blis-0.9.1/blis/_src/ref_kernels/3/bli_gemmsup_ref.c000066400000000000000000000544231427272030600236120ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2019, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // -- Row storage case --------------------------------------------------------- // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ /* NOTE: This microkernel can actually handle arbitrarily large values of m, n, and k. */ \ \ if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ { \ /* Traverse c by rows. */ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cij = &ci[ j*cs_c ]; \ ctype* restrict bj = &b [ j*cs_b ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. 
*/ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ { \ /* Traverse c by rows. */ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cij = &ci[ j*cs_c ]; \ ctype* restrict bj = &b [ j*cs_b ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ { \ /* Traverse c by rows. */ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cij = &ci[ j*cs_c ]; \ ctype* restrict bj = &b [ j*cs_b ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. 
*/ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ { \ /* Traverse c by rows. */ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict ci = &c[ i*rs_c ]; \ ctype* restrict ai = &a[ i*rs_a ]; \ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cij = &ci[ j*cs_c ]; \ ctype* restrict bj = &b [ j*cs_b ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ PASTEMAC(ch,conjs)( ab ); \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. 
*/ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( gemmsup_r, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) // // -- Column storage case ------------------------------------------------------ // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ /* NOTE: This microkernel can actually handle arbitrarily large values of m, n, and k. */ \ \ if ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) \ { \ /* Traverse c by columns. */ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cj = &c[ j*cs_c ]; \ ctype* restrict bj = &b[ j*cs_b ]; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict cij = &cj[ i*rs_c ]; \ ctype* restrict ai = &a [ i*rs_a ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. 
*/ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else if ( bli_is_noconj( conja ) && bli_is_conj( conjb ) ) \ { \ /* Traverse c by columns. */ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cj = &c[ j*cs_c ]; \ ctype* restrict bj = &b[ j*cs_b ]; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict cij = &cj[ i*rs_c ]; \ ctype* restrict ai = &a [ i*rs_a ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,axpyjs)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else if ( bli_is_conj( conja ) && bli_is_noconj( conjb ) ) \ { \ /* Traverse c by columns. */ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cj = &c[ j*cs_c ]; \ ctype* restrict bj = &b[ j*cs_b ]; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict cij = &cj[ i*rs_c ]; \ ctype* restrict ai = &a [ i*rs_a ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dotjs)( *aij, *bij, ab ); \ } \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. 
*/ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ else /* if ( bli_is_conj( conja ) && bli_is_conj( conjb ) ) */ \ { \ /* Traverse c by columns. */ \ for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict cj = &c[ j*cs_c ]; \ ctype* restrict bj = &b[ j*cs_b ]; \ \ for ( dim_t i = 0; i < m; ++i ) \ { \ ctype* restrict cij = &cj[ i*rs_c ]; \ ctype* restrict ai = &a [ i*rs_a ]; \ ctype ab; \ \ PASTEMAC(ch,set0s)( ab ); \ \ /* Perform a dot product to update the (i,j) element of c. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ ctype* restrict aij = &ai[ l*cs_a ]; \ ctype* restrict bij = &bj[ l*rs_b ]; \ \ PASTEMAC(ch,dots)( *aij, *bij, ab ); \ } \ \ /* Conjugate the result to simulate conj(a^T) * conj(b). */ \ PASTEMAC(ch,conjs)( ab ); \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. 
*/ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ PASTEMAC(ch,axpys)( *alpha, ab, *cij ); \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ PASTEMAC(ch,scal2s)( *alpha, ab, *cij ); \ } \ else \ { \ PASTEMAC(ch,axpbys)( *alpha, ab, *beta, *cij ); \ } \ } \ } \ } \ } INSERT_GENTFUNC_BASIC2( gemmsup_c, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) // // -- General storage case ----------------------------------------------------- // INSERT_GENTFUNC_BASIC2( gemmsup_g, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #if 0 // // -- Row storage case --------------------------------------------------------- // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const dim_t mn = m * n; \ \ ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const inc_t rs_ab = n; \ const inc_t cs_ab = 1; \ \ \ /* Assumptions: m <= mr, n <= nr so that the temporary array ab is sufficiently large enough to hold the m x n microtile. The ability to handle m < mr and n < nr is being provided so that optimized ukernels can call one of these reference implementations for their edge cases, if they choose. When they do so, they will need to call the function directly, by its configuration-mangled name, since it will have been overwritten in the context when the optimized ukernel functions are registered. */ \ \ \ /* Initialize the accumulator elements in ab to zero. */ \ for ( dim_t i = 0; i < mn; ++i ) \ { \ PASTEMAC(ch,set0s)( ab[i] ); \ } \ \ /* Perform a series of k rank-1 updates into ab. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ /* Traverse ab by rows; assume cs_ab = 1. 
*/ \ for ( dim_t i = 0; i < m; ++i ) \ { \ for ( dim_t j = 0; j < n; ++j ) \ { \ PASTEMAC(ch,dots) \ ( \ a[ i*rs_a ], \ b[ j*cs_b ], \ ab[ i*rs_ab + j*cs_ab ] \ ); \ } \ } \ \ a += cs_a; \ b += rs_b; \ } \ \ /* Scale the result in ab by alpha. */ \ for ( dim_t i = 0; i < mn; ++i ) \ { \ PASTEMAC(ch,scals)( *alpha, ab[i] ); \ } \ \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ /* Traverse ab and c by rows; assume cs_a = cs_a = 1. */ \ for ( dim_t i = 0; i < m; ++i ) \ for ( dim_t j = 0; j < n; ++j ) \ { \ PASTEMAC(ch,adds) \ ( \ ab[ i*rs_ab + j*1 ], \ c[ i*rs_c + j*1 ] \ ) \ } \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ \ /* Traverse ab and c by rows; assume cs_a = cs_a = 1. */ \ for ( dim_t i = 0; i < m; ++i ) \ for ( dim_t j = 0; j < n; ++j ) \ { \ PASTEMAC(ch,copys) \ ( \ ab[ i*rs_ab + j*1 ], \ c[ i*rs_c + j*1 ] \ ) \ } \ } \ else /* beta != 0 && beta != 1 */ \ { \ /* Traverse ab and c by rows; assume cs_a = cs_a = 1. 
*/ \ for ( dim_t i = 0; i < m; ++i ) \ for ( dim_t j = 0; j < n; ++j ) \ { \ PASTEMAC(ch,xpbys) \ ( \ ab[ i*rs_ab + j*1 ], \ *beta, \ c[ i*rs_c + j*1 ] \ ) \ } \ } \ } INSERT_GENTFUNC_BASIC2( gemmsup_r, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) // // -- Column storage case ------------------------------------------------------ // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const dim_t mn = m * n; \ \ ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const inc_t rs_ab = 1; \ const inc_t cs_ab = m; \ \ \ /* Assumptions: m <= mr, n <= nr so that the temporary array ab is sufficiently large enough to hold the m x n microtile. The ability to handle m < mr and n < nr is being provided so that optimized ukernels can call one of these reference implementations for their edge cases, if they choose. When they do so, they will need to call the function directly, by its configuration-mangled name, since it will have been overwritten in the context when the optimized ukernel functions are registered. */ \ \ \ /* Initialize the accumulator elements in ab to zero. */ \ for ( dim_t i = 0; i < mn; ++i ) \ { \ PASTEMAC(ch,set0s)( ab[i] ); \ } \ \ /* Perform a series of k rank-1 updates into ab. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ /* Traverse ab by columns; assume rs_ab = 1. */ \ for ( dim_t j = 0; j < n; ++j ) \ { \ for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,dots) \ ( \ a[ i*rs_a ], \ b[ j*cs_b ], \ ab[ i*rs_ab + j*cs_ab ] \ ); \ } \ } \ \ a += cs_a; \ b += rs_b; \ } \ \ /* Scale the result in ab by alpha. 
*/ \ for ( dim_t i = 0; i < mn; ++i ) \ { \ PASTEMAC(ch,scals)( *alpha, ab[i] ); \ } \ \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ /* Traverse ab and c by columns; assume rs_a = rs_a = 1. */ \ for ( dim_t j = 0; j < n; ++j ) \ for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,adds) \ ( \ ab[ i*1 + j*cs_ab ], \ c[ i*1 + j*cs_c ] \ ) \ } \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* Traverse ab and c by columns; assume rs_a = rs_a = 1. */ \ for ( dim_t j = 0; j < n; ++j ) \ for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,copys) \ ( \ ab[ i*1 + j*cs_ab ], \ c[ i*1 + j*cs_c ] \ ) \ } \ } \ else /* beta != 0 && beta != 1 */ \ { \ /* Traverse ab and c by columns; assume rs_a = rs_a = 1. */ \ for ( dim_t j = 0; j < n; ++j ) \ for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,xpbys) \ ( \ ab[ i*1 + j*cs_ab ], \ *beta, \ c[ i*1 + j*cs_c ] \ ) \ } \ } \ } INSERT_GENTFUNC_BASIC2( gemmsup_c, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) // // -- General storage case ----------------------------------------------------- // #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ conj_t conja, \ conj_t conjb, \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, inc_t rs_a, inc_t cs_a, \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const dim_t mn = m * n; \ \ ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const inc_t rs_ab = 1; \ const inc_t cs_ab = m; \ \ \ /* Assumptions: m <= mr, n <= nr so that the temporary array ab is sufficiently large enough to hold the m x n microtile. 
The ability to handle m < mr and n < nr is being provided so that optimized ukernels can call one of these reference implementations for their edge cases, if they choose. When they do so, they will need to call the function directly, by its configuration-mangled name, since it will have been overwritten in the context when the optimized ukernel functions are registered. */ \ \ \ /* Initialize the accumulator elements in ab to zero. */ \ for ( dim_t i = 0; i < mn; ++i ) \ { \ PASTEMAC(ch,set0s)( ab[i] ); \ } \ \ /* Perform a series of k rank-1 updates into ab. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ /* General storage: doesn't matter how we traverse ab. */ \ for ( dim_t j = 0; j < n; ++j ) \ { \ for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,dots) \ ( \ a[ i*rs_a ], \ b[ j*cs_b ], \ ab[ i*rs_ab + j*cs_ab ] \ ); \ } \ } \ \ a += cs_a; \ b += rs_b; \ } \ \ /* Scale the result in ab by alpha. */ \ for ( dim_t i = 0; i < mn; ++i ) \ { \ PASTEMAC(ch,scals)( *alpha, ab[i] ); \ } \ \ \ /* If beta is one, add ab into c. If beta is zero, overwrite c with the result in ab. Otherwise, scale by beta and accumulate ab to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ /* General storage: doesn't matter how we traverse ab and c. */ \ for ( dim_t j = 0; j < n; ++j ) \ for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,adds) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ c[ i*rs_c + j*cs_c ] \ ) \ } \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ /* General storage: doesn't matter how we traverse ab and c. */ \ for ( dim_t j = 0; j < n; ++j ) \ for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,copys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ c[ i*rs_c + j*cs_c ] \ ) \ } \ } \ else /* beta != 0 && beta != 1 */ \ { \ /* General storage: doesn't matter how we traverse ab and c. 
*/ \ for ( dim_t j = 0; j < n; ++j ) \ for ( dim_t i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,xpbys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ *beta, \ c[ i*rs_c + j*cs_c ] \ ) \ } \ } \ } INSERT_GENTFUNC_BASIC2( gemmsup_g, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #endif cython-blis-0.9.1/blis/_src/ref_kernels/3/bli_gemmtrsm_ref.c000066400000000000000000000107561427272030600237710ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, trsmkerid ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a1x, \ ctype* restrict a11, \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ \ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ const inc_t rs_b = packnr; \ const inc_t cs_b = 1; \ \ ctype* minus_one = PASTEMAC(ch,m1); \ \ PASTECH(ch,gemm_ukr_ft) \ gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ PASTECH(ch,trsm_ukr_ft) \ trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ /* FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR instead? */ \ const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : nr ); \ const inc_t cs_ct = ( col_pref ? 
mr : 1 ); \ \ const bool use_ct = ( m < mr || n < nr ); \ \ ctype* restrict c11_use = c11; \ inc_t rs_c_use = rs_c; \ inc_t cs_c_use = cs_c; \ \ if ( use_ct ) \ { \ c11_use = ct; \ rs_c_use = rs_ct; \ cs_c_use = cs_ct; \ } \ \ /* lower: b11 = alpha * b11 - a10 * b01; */ \ /* upper: b11 = alpha * b11 - a12 * b21; */ \ gemm_ukr \ ( \ m, \ n, \ k, \ minus_one, \ a1x, \ bx1, \ alpha, \ b11, rs_b, cs_b, \ data, \ cntx \ ); \ \ /* b11 = inv(a11) * b11; c11 = b11; */ \ trsm_ukr \ ( \ a11, \ b11, \ c11_use, rs_c_use, cs_c_use, \ data, \ cntx \ ); \ \ if ( use_ct ) \ { \ PASTEMAC(ch,copys_mxn) \ ( \ m, n, \ ct, rs_ct, cs_ct, \ c11, rs_c, cs_c \ ); \ } \ \ /* PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_r after", k+3, 8, \ ( double* )b01, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_i after", k+3, 8, \ ( double* )b01 + 1, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \ */ \ } INSERT_GENTFUNC_BASIC3( gemmtrsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR ) INSERT_GENTFUNC_BASIC3( gemmtrsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR ) cython-blis-0.9.1/blis/_src/ref_kernels/3/bli_trsm_ref.c000066400000000000000000000164411427272030600231200ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
- Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #if 0 // An implementation that attempts to facilitate emission of vectorized // instructions via constant loop bounds + #pragma omp simd directives. // (Deleted. See 'old' directory.) #else // An implementation that uses variable loop bounds (queried from the context) // and makes no use of #pragma omp simd. 
#undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ \ const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ const dim_t m = mr; \ const dim_t n = nr; \ \ const inc_t rs_a = 1; \ const inc_t cs_a = packmr; \ \ const inc_t rs_b = packnr; \ const inc_t cs_b = 1; \ \ dim_t iter, i, j, l; \ dim_t n_behind; \ \ for ( iter = 0; iter < m; ++iter ) \ { \ i = iter; \ n_behind = i; \ \ ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ ctype* restrict a10t = a + (i )*rs_a + (0 )*cs_a; \ ctype* restrict B0 = b + (0 )*rs_b + (0 )*cs_b; \ ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \ \ /* b1 = b1 - a10t * B0; */ \ /* b1 = b1 / alpha11; */ \ for ( j = 0; j < n; ++j ) \ { \ ctype* restrict b01 = B0 + (0 )*rs_b + (j )*cs_b; \ ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \ ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ ctype beta11c = *beta11; \ ctype rho11; \ \ /* beta11 = beta11 - a10t * b01; */ \ PASTEMAC(ch,set0s)( rho11 ); \ for ( l = 0; l < n_behind; ++l ) \ { \ ctype* restrict alpha10 = a10t + (l )*cs_a; \ ctype* restrict beta01 = b01 + (l )*rs_b; \ \ PASTEMAC(ch,axpys)( *alpha10, *beta01, rho11 ); \ } \ PASTEMAC(ch,subs)( rho11, beta11c ); \ \ /* beta11 = beta11 / alpha11; */ \ /* NOTE: When preinversion is enabled, the INVERSE of alpha11 (1.0/alpha11) is stored during packing instead alpha11 so we can multiply rather than divide. When preinversion is disabled, alpha11 is stored and division happens below explicitly. 
*/ \ PASTEMAC(ch,diagop)( *alpha11, beta11c ); \ \ /* Output final result to matrix c. */ \ PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ \ /* Store the local value back to b11. */ \ PASTEMAC(ch,copys)( beta11c, *beta11 ); \ } \ } \ } #ifdef BLIS_ENABLE_TRSM_PREINVERSION INSERT_GENTFUNC_BASIC3( trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals ) #else INSERT_GENTFUNC_BASIC3( trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals ) #endif #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ \ const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ const dim_t m = mr; \ const dim_t n = nr; \ \ const inc_t rs_a = 1; \ const inc_t cs_a = packmr; \ \ const inc_t rs_b = packnr; \ const inc_t cs_b = 1; \ \ dim_t iter, i, j, l; \ dim_t n_behind; \ \ for ( iter = 0; iter < m; ++iter ) \ { \ i = m - iter - 1; \ n_behind = iter; \ \ ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ ctype* restrict a12t = a + (i )*rs_a + (i+1)*cs_a; \ ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \ ctype* restrict B2 = b + (i+1)*rs_b + (0 )*cs_b; \ \ /* b1 = b1 - a12t * B2; */ \ /* b1 = b1 / alpha11; */ \ for ( j = 0; j < n; ++j ) \ { \ ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \ ctype* restrict b21 = B2 + (0 )*rs_b + (j )*cs_b; \ ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ ctype beta11c = *beta11; \ ctype rho11; \ \ /* beta11 = beta11 - a12t * b21; */ \ PASTEMAC(ch,set0s)( rho11 ); \ for ( l = 0; l < n_behind; ++l ) \ { \ ctype* restrict alpha12 = a12t + (l )*cs_a; \ ctype* restrict beta21 = b21 + (l 
)*rs_b; \ \ PASTEMAC(ch,axpys)( *alpha12, *beta21, rho11 ); \ } \ PASTEMAC(ch,subs)( rho11, beta11c ); \ \ /* beta11 = beta11 / alpha11; */ \ /* NOTE: When preinversion is enabled, the INVERSE of alpha11 (1.0/alpha11) is stored during packing instead alpha11 so we can multiply rather than divide. When preinversion is disabled, alpha11 is stored and division happens below explicitly. */ \ PASTEMAC(ch,diagop)( *alpha11, beta11c ); \ \ /* Output final result to matrix c. */ \ PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ \ /* Store the local value back to b11. */ \ PASTEMAC(ch,copys)( beta11c, *beta11 ); \ } \ } \ } #ifdef BLIS_ENABLE_TRSM_PREINVERSION INSERT_GENTFUNC_BASIC3( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals ) #else INSERT_GENTFUNC_BASIC3( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals ) #endif #endif cython-blis-0.9.1/blis/_src/ref_kernels/3/old/000077500000000000000000000000001427272030600210555ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/ref_kernels/3/old/bli_gemm_simd_ref.c000066400000000000000000000121121427272030600246410ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, mr, nr ) \ \ void PASTEMAC4(ch,opname,arch,_simd,suf) \ ( \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const inc_t rs_ab = nr; \ const inc_t cs_ab = 1; \ \ const inc_t cs_a = mr; \ const inc_t rs_b = nr; \ \ \ /* Initialize the accumulator elements in ab to zero. 
*/ \ PRAGMA_SIMD \ for ( dim_t i = 0; i < mr * nr; ++i ) \ { \ PASTEMAC(ch,set0s)( ab[ i ] ); \ } \ \ /* const dim_t pre = 16; \ dim_t k16; \ if ( k >= pre ) { k16 = k - pre; k = pre; } \ else { k16 = 0; } \ \ for ( dim_t l = 0; l < k16; ++l ) \ { \ for ( dim_t i = 0; i < mr; ++i ) \ { \ PRAGMA_SIMD \ for ( dim_t j = 0; j < nr; ++j ) \ { \ PASTEMAC(ch,dots) \ ( \ a[ i ], \ b[ j ], \ ab[ i*rs_ab + j*cs_ab ] \ ); \ } \ } \ \ a += cs_a; \ b += rs_b; \ } \ \ __builtin_prefetch( c + 0*cs_c, 1, 0 ); \ __builtin_prefetch( c + 1*cs_c, 1, 0 ); \ __builtin_prefetch( c + 2*cs_c, 1, 0 ); \ __builtin_prefetch( c + 3*cs_c, 1, 0 ); \ */ \ \ /* Perform a series of k rank-1 updates into ab. */ \ for ( dim_t l = 0; l < k; ++l ) \ { \ for ( dim_t i = 0; i < mr; ++i ) \ { \ PRAGMA_SIMD \ for ( dim_t j = 0; j < nr; ++j ) \ { \ PASTEMAC(ch,dots) \ ( \ a[ i ], \ b[ j ], \ ab[ i*rs_ab + j*cs_ab ] \ ); \ } \ } \ \ a += cs_a; \ b += rs_b; \ } \ \ /* Scale the result in ab by alpha. */ \ PRAGMA_SIMD \ for ( dim_t i = 0; i < mr * nr; ++i ) \ { \ PASTEMAC(ch,scals)( *alpha, ab[ i ] ); \ } \ \ /* Output/accumulate intermediate result ab based on the storage of c and the value of beta. */ \ if ( cs_c == 1 ) \ { \ /* C is row-stored. */ \ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ for ( dim_t i = 0; i < mr; ++i ) \ for ( dim_t j = 0; j < nr; ++j ) \ PASTEMAC(ch,copys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ c [ i*rs_c + j*1 ] \ ); \ } \ else \ { \ for ( dim_t i = 0; i < mr; ++i ) \ for ( dim_t j = 0; j < nr; ++j ) \ PASTEMAC(ch,xpbys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ *beta, \ c [ i*rs_c + j*1 ] \ ); \ } \ } \ else \ { \ /* C is column-stored or general-stored. 
*/ \ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ for ( dim_t j = 0; j < nr; ++j ) \ for ( dim_t i = 0; i < mr; ++i ) \ PASTEMAC(ch,copys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ c [ i*rs_c + j*1 ] \ ); \ } \ else \ { \ for ( dim_t j = 0; j < nr; ++j ) \ for ( dim_t i = 0; i < mr; ++i ) \ PASTEMAC(ch,xpbys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ *beta, \ c [ i*rs_c + j*1 ] \ ); \ } \ } \ } //INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) GENTFUNC( float, s, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 16 ) GENTFUNC( double, d, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 ) GENTFUNC( scomplex, c, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 ) GENTFUNC( dcomplex, z, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 4 ) cython-blis-0.9.1/blis/_src/ref_kernels/3/old/bli_gemm_unrl_ref.c000066400000000000000000000302671427272030600247000ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" // // -- dgemm -------------------------------------------------------------------- // #undef CH #define CH d #undef CTYPE #define CTYPE double #undef ZERO #define ZERO 0.0 #undef MR #define MR 4 #undef NR #define NR 8 //void PASTEMAC4(CH,gemm,BLIS_CNAME_INFIX,BLIS_REF_SUF,_4x8) void PASTEMAC6(CH,gemm,BLIS_CNAME_REF_SUFFIX,_,MR,x,NR) ( dim_t k, CTYPE* restrict alpha, CTYPE* restrict a, CTYPE* restrict b, CTYPE* restrict beta, CTYPE* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { const dim_t cs_a = MR; const dim_t rs_b = NR; CTYPE ab00 = ZERO, ab01 = ZERO, ab02 = ZERO, ab03 = ZERO; CTYPE ab10 = ZERO, ab11 = ZERO, ab12 = ZERO, ab13 = ZERO; CTYPE ab20 = ZERO, ab21 = ZERO, ab22 = ZERO, ab23 = ZERO; CTYPE ab30 = ZERO, ab31 = ZERO, ab32 = ZERO, ab33 = ZERO; CTYPE ab04 = ZERO, ab05 = ZERO, ab06 = ZERO, ab07 = ZERO; CTYPE ab14 = ZERO, ab15 = ZERO, ab16 = ZERO, ab17 = ZERO; CTYPE ab24 = ZERO, ab25 = ZERO, ab26 = ZERO, ab27 = ZERO; CTYPE ab34 = ZERO, ab35 = ZERO, ab36 = ZERO, ab37 = ZERO; // Perform a series of k rank-1 updates into ab. 
for ( ; k != 0; --k ) { const CTYPE a0 = a[0]; ab00 += a0*b[0]; ab01 += a0*b[1]; ab02 += a0*b[2]; ab03 += a0*b[3]; ab04 += a0*b[4]; ab05 += a0*b[5]; ab06 += a0*b[6]; ab07 += a0*b[7]; const CTYPE a1 = a[1]; ab10 += a1*b[0]; ab11 += a1*b[1]; ab12 += a1*b[2]; ab13 += a1*b[3]; ab14 += a1*b[4]; ab15 += a1*b[5]; ab16 += a1*b[6]; ab17 += a1*b[7]; const CTYPE a2 = a[2]; ab20 += a2*b[0]; ab21 += a2*b[1]; ab22 += a2*b[2]; ab23 += a2*b[3]; ab24 += a2*b[4]; ab25 += a2*b[5]; ab26 += a2*b[6]; ab27 += a2*b[7]; const CTYPE a3 = a[3]; ab30 += a3*b[0]; ab31 += a3*b[1]; ab32 += a3*b[2]; ab33 += a3*b[3]; ab34 += a3*b[4]; ab35 += a3*b[5]; ab36 += a3*b[6]; ab37 += a3*b[7]; a += cs_a; b += rs_b; } // Scale each element of ab by alpha. if ( !PASTEMAC(CH,eq1)( *alpha ) ) { const CTYPE alpha0 = *alpha; PASTEMAC(CH,scals)( alpha0, ab00 ); PASTEMAC(CH,scals)( alpha0, ab01 ); PASTEMAC(CH,scals)( alpha0, ab02 ); PASTEMAC(CH,scals)( alpha0, ab02 ); PASTEMAC(CH,scals)( alpha0, ab04 ); PASTEMAC(CH,scals)( alpha0, ab05 ); PASTEMAC(CH,scals)( alpha0, ab06 ); PASTEMAC(CH,scals)( alpha0, ab07 ); PASTEMAC(CH,scals)( alpha0, ab10 ); PASTEMAC(CH,scals)( alpha0, ab11 ); PASTEMAC(CH,scals)( alpha0, ab12 ); PASTEMAC(CH,scals)( alpha0, ab12 ); PASTEMAC(CH,scals)( alpha0, ab14 ); PASTEMAC(CH,scals)( alpha0, ab15 ); PASTEMAC(CH,scals)( alpha0, ab16 ); PASTEMAC(CH,scals)( alpha0, ab17 ); PASTEMAC(CH,scals)( alpha0, ab20 ); PASTEMAC(CH,scals)( alpha0, ab21 ); PASTEMAC(CH,scals)( alpha0, ab22 ); PASTEMAC(CH,scals)( alpha0, ab22 ); PASTEMAC(CH,scals)( alpha0, ab24 ); PASTEMAC(CH,scals)( alpha0, ab25 ); PASTEMAC(CH,scals)( alpha0, ab26 ); PASTEMAC(CH,scals)( alpha0, ab27 ); PASTEMAC(CH,scals)( alpha0, ab30 ); PASTEMAC(CH,scals)( alpha0, ab31 ); PASTEMAC(CH,scals)( alpha0, ab32 ); PASTEMAC(CH,scals)( alpha0, ab32 ); PASTEMAC(CH,scals)( alpha0, ab34 ); PASTEMAC(CH,scals)( alpha0, ab35 ); PASTEMAC(CH,scals)( alpha0, ab36 ); PASTEMAC(CH,scals)( alpha0, ab37 ); } // Output/accumulate intermediate result ab based on the 
storage // of c and the value of beta. if ( cs_c == 1 ) { // C is row-stored. if ( PASTEMAC(CH,eq0)( *beta ) ) { // beta == 0: // c := ab PASTEMAC(CH,copys)( ab00, c[ 0*rs_c + 0 ] ); PASTEMAC(CH,copys)( ab01, c[ 0*rs_c + 1 ] ); PASTEMAC(CH,copys)( ab02, c[ 0*rs_c + 2 ] ); PASTEMAC(CH,copys)( ab03, c[ 0*rs_c + 3 ] ); PASTEMAC(CH,copys)( ab04, c[ 0*rs_c + 4 ] ); PASTEMAC(CH,copys)( ab05, c[ 0*rs_c + 5 ] ); PASTEMAC(CH,copys)( ab06, c[ 0*rs_c + 6 ] ); PASTEMAC(CH,copys)( ab07, c[ 0*rs_c + 7 ] ); PASTEMAC(CH,copys)( ab10, c[ 1*rs_c + 0 ] ); PASTEMAC(CH,copys)( ab11, c[ 1*rs_c + 1 ] ); PASTEMAC(CH,copys)( ab12, c[ 1*rs_c + 2 ] ); PASTEMAC(CH,copys)( ab13, c[ 1*rs_c + 3 ] ); PASTEMAC(CH,copys)( ab14, c[ 1*rs_c + 4 ] ); PASTEMAC(CH,copys)( ab15, c[ 1*rs_c + 5 ] ); PASTEMAC(CH,copys)( ab16, c[ 1*rs_c + 6 ] ); PASTEMAC(CH,copys)( ab17, c[ 1*rs_c + 7 ] ); PASTEMAC(CH,copys)( ab20, c[ 2*rs_c + 0 ] ); PASTEMAC(CH,copys)( ab21, c[ 2*rs_c + 1 ] ); PASTEMAC(CH,copys)( ab22, c[ 2*rs_c + 2 ] ); PASTEMAC(CH,copys)( ab23, c[ 2*rs_c + 3 ] ); PASTEMAC(CH,copys)( ab24, c[ 2*rs_c + 4 ] ); PASTEMAC(CH,copys)( ab25, c[ 2*rs_c + 5 ] ); PASTEMAC(CH,copys)( ab26, c[ 2*rs_c + 6 ] ); PASTEMAC(CH,copys)( ab27, c[ 2*rs_c + 7 ] ); PASTEMAC(CH,copys)( ab30, c[ 3*rs_c + 0 ] ); PASTEMAC(CH,copys)( ab31, c[ 3*rs_c + 1 ] ); PASTEMAC(CH,copys)( ab32, c[ 3*rs_c + 2 ] ); PASTEMAC(CH,copys)( ab33, c[ 3*rs_c + 3 ] ); PASTEMAC(CH,copys)( ab34, c[ 3*rs_c + 4 ] ); PASTEMAC(CH,copys)( ab35, c[ 3*rs_c + 5 ] ); PASTEMAC(CH,copys)( ab36, c[ 3*rs_c + 6 ] ); PASTEMAC(CH,copys)( ab37, c[ 3*rs_c + 7 ] ); } else { const CTYPE beta0 = *beta; // beta != 0: // c := beta * c + ab PASTEMAC(CH,xpbys)( ab00, beta0, c[ 0*rs_c + 0 ] ); PASTEMAC(CH,xpbys)( ab01, beta0, c[ 0*rs_c + 1 ] ); PASTEMAC(CH,xpbys)( ab02, beta0, c[ 0*rs_c + 2 ] ); PASTEMAC(CH,xpbys)( ab03, beta0, c[ 0*rs_c + 3 ] ); PASTEMAC(CH,xpbys)( ab04, beta0, c[ 0*rs_c + 4 ] ); PASTEMAC(CH,xpbys)( ab05, beta0, c[ 0*rs_c + 5 ] ); PASTEMAC(CH,xpbys)( ab06, beta0, c[ 
0*rs_c + 6 ] ); PASTEMAC(CH,xpbys)( ab07, beta0, c[ 0*rs_c + 7 ] ); PASTEMAC(CH,xpbys)( ab10, beta0, c[ 1*rs_c + 0 ] ); PASTEMAC(CH,xpbys)( ab11, beta0, c[ 1*rs_c + 1 ] ); PASTEMAC(CH,xpbys)( ab12, beta0, c[ 1*rs_c + 2 ] ); PASTEMAC(CH,xpbys)( ab13, beta0, c[ 1*rs_c + 3 ] ); PASTEMAC(CH,xpbys)( ab14, beta0, c[ 1*rs_c + 4 ] ); PASTEMAC(CH,xpbys)( ab15, beta0, c[ 1*rs_c + 5 ] ); PASTEMAC(CH,xpbys)( ab16, beta0, c[ 1*rs_c + 6 ] ); PASTEMAC(CH,xpbys)( ab17, beta0, c[ 1*rs_c + 7 ] ); PASTEMAC(CH,xpbys)( ab20, beta0, c[ 2*rs_c + 0 ] ); PASTEMAC(CH,xpbys)( ab21, beta0, c[ 2*rs_c + 1 ] ); PASTEMAC(CH,xpbys)( ab22, beta0, c[ 2*rs_c + 2 ] ); PASTEMAC(CH,xpbys)( ab23, beta0, c[ 2*rs_c + 3 ] ); PASTEMAC(CH,xpbys)( ab24, beta0, c[ 2*rs_c + 4 ] ); PASTEMAC(CH,xpbys)( ab25, beta0, c[ 2*rs_c + 5 ] ); PASTEMAC(CH,xpbys)( ab26, beta0, c[ 2*rs_c + 6 ] ); PASTEMAC(CH,xpbys)( ab27, beta0, c[ 2*rs_c + 7 ] ); PASTEMAC(CH,xpbys)( ab30, beta0, c[ 3*rs_c + 0 ] ); PASTEMAC(CH,xpbys)( ab31, beta0, c[ 3*rs_c + 1 ] ); PASTEMAC(CH,xpbys)( ab32, beta0, c[ 3*rs_c + 2 ] ); PASTEMAC(CH,xpbys)( ab33, beta0, c[ 3*rs_c + 3 ] ); PASTEMAC(CH,xpbys)( ab34, beta0, c[ 3*rs_c + 4 ] ); PASTEMAC(CH,xpbys)( ab35, beta0, c[ 3*rs_c + 5 ] ); PASTEMAC(CH,xpbys)( ab36, beta0, c[ 3*rs_c + 6 ] ); PASTEMAC(CH,xpbys)( ab37, beta0, c[ 3*rs_c + 7 ] ); } } else { // C is general-stored (or column-stored). 
if ( PASTEMAC(CH,eq0)( *beta ) ) { // beta == 0: // c := ab PASTEMAC(CH,copys)( ab00, c[ 0*rs_c + 0*cs_c ] ); PASTEMAC(CH,copys)( ab01, c[ 0*rs_c + 1*cs_c ] ); PASTEMAC(CH,copys)( ab02, c[ 0*rs_c + 2*cs_c ] ); PASTEMAC(CH,copys)( ab03, c[ 0*rs_c + 3*cs_c ] ); PASTEMAC(CH,copys)( ab04, c[ 0*rs_c + 4*cs_c ] ); PASTEMAC(CH,copys)( ab05, c[ 0*rs_c + 5*cs_c ] ); PASTEMAC(CH,copys)( ab06, c[ 0*rs_c + 6*cs_c ] ); PASTEMAC(CH,copys)( ab07, c[ 0*rs_c + 7*cs_c ] ); PASTEMAC(CH,copys)( ab10, c[ 1*rs_c + 0*cs_c ] ); PASTEMAC(CH,copys)( ab11, c[ 1*rs_c + 1*cs_c ] ); PASTEMAC(CH,copys)( ab12, c[ 1*rs_c + 2*cs_c ] ); PASTEMAC(CH,copys)( ab13, c[ 1*rs_c + 3*cs_c ] ); PASTEMAC(CH,copys)( ab14, c[ 1*rs_c + 4*cs_c ] ); PASTEMAC(CH,copys)( ab15, c[ 1*rs_c + 5*cs_c ] ); PASTEMAC(CH,copys)( ab16, c[ 1*rs_c + 6*cs_c ] ); PASTEMAC(CH,copys)( ab17, c[ 1*rs_c + 7*cs_c ] ); PASTEMAC(CH,copys)( ab20, c[ 2*rs_c + 0*cs_c ] ); PASTEMAC(CH,copys)( ab21, c[ 2*rs_c + 1*cs_c ] ); PASTEMAC(CH,copys)( ab22, c[ 2*rs_c + 2*cs_c ] ); PASTEMAC(CH,copys)( ab23, c[ 2*rs_c + 3*cs_c ] ); PASTEMAC(CH,copys)( ab24, c[ 2*rs_c + 4*cs_c ] ); PASTEMAC(CH,copys)( ab25, c[ 2*rs_c + 5*cs_c ] ); PASTEMAC(CH,copys)( ab26, c[ 2*rs_c + 6*cs_c ] ); PASTEMAC(CH,copys)( ab27, c[ 2*rs_c + 7*cs_c ] ); PASTEMAC(CH,copys)( ab30, c[ 3*rs_c + 0*cs_c ] ); PASTEMAC(CH,copys)( ab31, c[ 3*rs_c + 1*cs_c ] ); PASTEMAC(CH,copys)( ab32, c[ 3*rs_c + 2*cs_c ] ); PASTEMAC(CH,copys)( ab33, c[ 3*rs_c + 3*cs_c ] ); PASTEMAC(CH,copys)( ab34, c[ 3*rs_c + 4*cs_c ] ); PASTEMAC(CH,copys)( ab35, c[ 3*rs_c + 5*cs_c ] ); PASTEMAC(CH,copys)( ab36, c[ 3*rs_c + 6*cs_c ] ); PASTEMAC(CH,copys)( ab37, c[ 3*rs_c + 7*cs_c ] ); } else { const CTYPE beta0 = *beta; // beta != 0: // c := beta * c + ab PASTEMAC(CH,xpbys)( ab00, beta0, c[ 0*rs_c + 0*cs_c ] ); PASTEMAC(CH,xpbys)( ab01, beta0, c[ 0*rs_c + 1*cs_c ] ); PASTEMAC(CH,xpbys)( ab02, beta0, c[ 0*rs_c + 2*cs_c ] ); PASTEMAC(CH,xpbys)( ab03, beta0, c[ 0*rs_c + 3*cs_c ] ); PASTEMAC(CH,xpbys)( ab04, beta0, c[ 
0*rs_c + 4*cs_c ] ); PASTEMAC(CH,xpbys)( ab05, beta0, c[ 0*rs_c + 5*cs_c ] ); PASTEMAC(CH,xpbys)( ab06, beta0, c[ 0*rs_c + 6*cs_c ] ); PASTEMAC(CH,xpbys)( ab07, beta0, c[ 0*rs_c + 7*cs_c ] ); PASTEMAC(CH,xpbys)( ab10, beta0, c[ 1*rs_c + 0*cs_c ] ); PASTEMAC(CH,xpbys)( ab11, beta0, c[ 1*rs_c + 1*cs_c ] ); PASTEMAC(CH,xpbys)( ab12, beta0, c[ 1*rs_c + 2*cs_c ] ); PASTEMAC(CH,xpbys)( ab13, beta0, c[ 1*rs_c + 3*cs_c ] ); PASTEMAC(CH,xpbys)( ab14, beta0, c[ 1*rs_c + 4*cs_c ] ); PASTEMAC(CH,xpbys)( ab15, beta0, c[ 1*rs_c + 5*cs_c ] ); PASTEMAC(CH,xpbys)( ab16, beta0, c[ 1*rs_c + 6*cs_c ] ); PASTEMAC(CH,xpbys)( ab17, beta0, c[ 1*rs_c + 7*cs_c ] ); PASTEMAC(CH,xpbys)( ab20, beta0, c[ 2*rs_c + 0*cs_c ] ); PASTEMAC(CH,xpbys)( ab21, beta0, c[ 2*rs_c + 1*cs_c ] ); PASTEMAC(CH,xpbys)( ab22, beta0, c[ 2*rs_c + 2*cs_c ] ); PASTEMAC(CH,xpbys)( ab23, beta0, c[ 2*rs_c + 3*cs_c ] ); PASTEMAC(CH,xpbys)( ab24, beta0, c[ 2*rs_c + 4*cs_c ] ); PASTEMAC(CH,xpbys)( ab25, beta0, c[ 2*rs_c + 5*cs_c ] ); PASTEMAC(CH,xpbys)( ab26, beta0, c[ 2*rs_c + 6*cs_c ] ); PASTEMAC(CH,xpbys)( ab27, beta0, c[ 2*rs_c + 7*cs_c ] ); PASTEMAC(CH,xpbys)( ab30, beta0, c[ 3*rs_c + 0*cs_c ] ); PASTEMAC(CH,xpbys)( ab31, beta0, c[ 3*rs_c + 1*cs_c ] ); PASTEMAC(CH,xpbys)( ab32, beta0, c[ 3*rs_c + 2*cs_c ] ); PASTEMAC(CH,xpbys)( ab33, beta0, c[ 3*rs_c + 3*cs_c ] ); PASTEMAC(CH,xpbys)( ab34, beta0, c[ 3*rs_c + 4*cs_c ] ); PASTEMAC(CH,xpbys)( ab35, beta0, c[ 3*rs_c + 5*cs_c ] ); PASTEMAC(CH,xpbys)( ab36, beta0, c[ 3*rs_c + 6*cs_c ] ); PASTEMAC(CH,xpbys)( ab37, beta0, c[ 3*rs_c + 7*cs_c ] ); } } } cython-blis-0.9.1/blis/_src/ref_kernels/3/old/bli_trsm_simd_ref.c000066400000000000000000000130501427272030600247030ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. 
Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "blis.h" #if 1 // An implementation that attempts to facilitate emission of vectorized // instructions via constant loop bounds + #pragma omp simd directives. 
#undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, mr, nr ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const inc_t rs_a = 1; \ const inc_t cs_a = mr; \ \ const inc_t rs_b = nr; \ const inc_t cs_b = 1; \ \ PRAGMA_SIMD \ for ( dim_t i = 0; i < mr; ++i ) \ { \ /* b1 = b1 - a10t * B0; */ \ /* b1 = b1 / alpha11; */ \ for ( dim_t j = 0; j < nr; ++j ) \ { \ ctype beta11c = b[i*rs_b + j*cs_b]; \ ctype rho11; \ \ /* beta11 = beta11 - a10t * b01; */ \ PASTEMAC(ch,set0s)( rho11 ); \ for ( dim_t l = 0; l < i; ++l ) \ { \ PASTEMAC(ch,axpys)( a[i*rs_a + l*cs_a], \ b[l*rs_b + j*cs_b], rho11 ); \ } \ PASTEMAC(ch,subs)( rho11, beta11c ); \ \ /* beta11 = beta11 / alpha11; */ \ /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead of alpha11, so we can multiply rather than divide. We store the inverse of alpha11 intentionally to avoid expensive division instructions within the micro-kernel. */ \ PASTEMAC(ch,scals)( a[i*rs_a + i*cs_a], beta11c ); \ \ /* Output final result to matrix c. */ \ PASTEMAC(ch,copys)( beta11c, c[i*rs_c + j*cs_c] ); \ \ /* Store the local value back to b11. 
*/ \ PASTEMAC(ch,copys)( beta11c, b[i*rs_b + j*cs_b] ); \ } \ } \ } //INSERT_GENTFUNC_BASIC2( trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) GENTFUNC( float, s, trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 16 ) GENTFUNC( double, d, trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 ) GENTFUNC( scomplex, c, trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 ) GENTFUNC( dcomplex, z, trsm_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 4 ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, mr, nr ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const inc_t rs_a = 1; \ const inc_t cs_a = mr; \ \ const inc_t rs_b = nr; \ const inc_t cs_b = 1; \ \ PRAGMA_SIMD \ for ( dim_t iter = 0; iter < mr; ++iter ) \ { \ dim_t i = mr - iter - 1; \ \ /* b1 = b1 - a12t * B2; */ \ /* b1 = b1 / alpha11; */ \ for ( dim_t j = 0; j < nr; ++j ) \ { \ ctype beta11c = b[i*rs_b + j*cs_b]; \ ctype rho11; \ \ /* beta11 = beta11 - a12t * b21; */ \ PASTEMAC(ch,set0s)( rho11 ); \ for ( dim_t l = 0; l < iter; ++l ) \ { \ PASTEMAC(ch,axpys)( a[i*rs_a + (i+1+l)*cs_a], \ b[(i+1+l)*rs_b + j*cs_b], rho11 ); \ } \ PASTEMAC(ch,subs)( rho11, beta11c ); \ \ /* beta11 = beta11 / alpha11; */ \ /* NOTE: The INVERSE of alpha11 (1.0/alpha11) is stored instead of alpha11, so we can multiply rather than divide. We store the inverse of alpha11 intentionally to avoid expensive division instructions within the micro-kernel. */ \ PASTEMAC(ch,scals)( a[i*rs_a + i*cs_a], beta11c ); \ \ /* Output final result to matrix c. */ \ PASTEMAC(ch,copys)( beta11c, c[i*rs_c + j*cs_c] ); \ \ /* Store the local value back to b11. 
*/ \ PASTEMAC(ch,copys)( beta11c, b[i*rs_b + j*cs_b] ); \ } \ } \ } //INSERT_GENTFUNC_BASIC2( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) GENTFUNC( float, s, trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 16 ) GENTFUNC( double, d, trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 ) GENTFUNC( scomplex, c, trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 ) GENTFUNC( dcomplex, z, trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 4 ) #else #endif cython-blis-0.9.1/blis/_src/ref_kernels/bli_cntx_ref.c000066400000000000000000000643311427272030600227460ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
   IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   SUCH DAMAGE.

*/

#include "blis.h"

// -- Instantiate kernel prototypes for the current architecture ---------------

// This preamble follows one pattern throughout: each generic "*_name" macro
// is (re)defined to expand to an architecture-specific reference symbol, and
// then the corresponding API template header is included so that it emits
// prototypes for those symbols. The same "*_name" macros are used again
// further down when the kernels are registered in the context.

// Define macros to construct the full symbol name from the operation name.
#undef  GENARNAME  // architecture, _ref (no bli_)
#define GENARNAME(opname)  PASTECH2(opname,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
#undef  GENBARNAME // bli_, architecture, _ref
#define GENBARNAME(opname) PASTEMAC2(opname,BLIS_CNAME_INFIX,BLIS_REF_SUFFIX)
#undef  GENBAINAME // bli_, architecture, _ind
#define GENBAINAME(opname) PASTEMAC2(opname,BLIS_CNAME_INFIX,BLIS_IND_SUFFIX)

// -- Level-3 native micro-kernel prototype redefinitions ----------------------

// -- Prototypes for completely generic level-3 microkernels --

#undef  gemm_ukr_name
#define gemm_ukr_name       GENARNAME(gemm)
#undef  gemmtrsm_l_ukr_name
#define gemmtrsm_l_ukr_name GENARNAME(gemmtrsm_l)
#undef  gemmtrsm_u_ukr_name
#define gemmtrsm_u_ukr_name GENARNAME(gemmtrsm_u)
#undef  trsm_l_ukr_name
#define trsm_l_ukr_name     GENARNAME(trsm_l)
#undef  trsm_u_ukr_name
#define trsm_u_ukr_name     GENARNAME(trsm_u)

// Instantiate prototypes for above functions via the native micro-kernel API
// template.
#include "bli_l3_ukr.h"

// -- Level-3 virtual micro-kernel prototype redefinitions ---------------------

// -- Prototypes for induced method level-3 microkernels --

// -- 1m --

#undef  gemm1m_ukr_name
#define gemm1m_ukr_name       GENARNAME(gemm1m)
#undef  gemmtrsm1m_l_ukr_name
#define gemmtrsm1m_l_ukr_name GENARNAME(gemmtrsm1m_l)
#undef  gemmtrsm1m_u_ukr_name
#define gemmtrsm1m_u_ukr_name GENARNAME(gemmtrsm1m_u)
#undef  trsm1m_l_ukr_name
#define trsm1m_l_ukr_name     GENARNAME(trsm1m_l)
#undef  trsm1m_u_ukr_name
#define trsm1m_u_ukr_name     GENARNAME(trsm1m_u)

// Instantiate prototypes for above functions via the virtual micro-kernel API
// template.
#include "bli_l3_ind_ukr.h"

// -- Level-3 small/unpacked micro-kernel prototype definitions ----------------

// NOTE: This results in redundant prototypes for gemmsup_r and gemmsup_c
// kernels, but since they will be identical the compiler won't complain.
// (Both the rv and rg names alias gemmsup_r; both the cv and cg names alias
// gemmsup_c.)

#undef  gemmsup_rv_ukr_name
#define gemmsup_rv_ukr_name GENARNAME(gemmsup_r)
#undef  gemmsup_rg_ukr_name
#define gemmsup_rg_ukr_name GENARNAME(gemmsup_r)
#undef  gemmsup_cv_ukr_name
#define gemmsup_cv_ukr_name GENARNAME(gemmsup_c)
#undef  gemmsup_cg_ukr_name
#define gemmsup_cg_ukr_name GENARNAME(gemmsup_c)

#undef  gemmsup_gx_ukr_name
#define gemmsup_gx_ukr_name GENARNAME(gemmsup_g)

// Include the small/unpacked kernel API template.
#include "bli_l3_sup_ker.h"

// -- Level-1m (packm/unpackm) kernel prototype redefinitions ------------------

#undef  packm_2xk_ker_name
#define packm_2xk_ker_name  GENARNAME(packm_2xk)
#undef  packm_3xk_ker_name
#define packm_3xk_ker_name  GENARNAME(packm_3xk)
#undef  packm_4xk_ker_name
#define packm_4xk_ker_name  GENARNAME(packm_4xk)
#undef  packm_6xk_ker_name
#define packm_6xk_ker_name  GENARNAME(packm_6xk)
#undef  packm_8xk_ker_name
#define packm_8xk_ker_name  GENARNAME(packm_8xk)
#undef  packm_10xk_ker_name
#define packm_10xk_ker_name GENARNAME(packm_10xk)
#undef  packm_12xk_ker_name
#define packm_12xk_ker_name GENARNAME(packm_12xk)
#undef  packm_14xk_ker_name
#define packm_14xk_ker_name GENARNAME(packm_14xk)
#undef  packm_16xk_ker_name
#define packm_16xk_ker_name GENARNAME(packm_16xk)
#undef  packm_24xk_ker_name
#define packm_24xk_ker_name GENARNAME(packm_24xk)

#undef  unpackm_2xk_ker_name
#define unpackm_2xk_ker_name  GENARNAME(unpackm_2xk)
#undef  unpackm_4xk_ker_name
#define unpackm_4xk_ker_name  GENARNAME(unpackm_4xk)
#undef  unpackm_6xk_ker_name
#define unpackm_6xk_ker_name  GENARNAME(unpackm_6xk)
#undef  unpackm_8xk_ker_name
#define unpackm_8xk_ker_name  GENARNAME(unpackm_8xk)
#undef  unpackm_10xk_ker_name
#define unpackm_10xk_ker_name GENARNAME(unpackm_10xk)
#undef  unpackm_12xk_ker_name
#define unpackm_12xk_ker_name GENARNAME(unpackm_12xk)
#undef  unpackm_14xk_ker_name
#define unpackm_14xk_ker_name GENARNAME(unpackm_14xk)
#undef  unpackm_16xk_ker_name
#define unpackm_16xk_ker_name GENARNAME(unpackm_16xk)

// The "_1er" packm kernel names are the ones registered for the 1m induced
// method in the induced-method context initializer of this file.
#undef  packm_2xk_1er_ker_name
#define packm_2xk_1er_ker_name  GENARNAME(packm_2xk_1er)
#undef  packm_4xk_1er_ker_name
#define packm_4xk_1er_ker_name  GENARNAME(packm_4xk_1er)
#undef  packm_6xk_1er_ker_name
#define packm_6xk_1er_ker_name  GENARNAME(packm_6xk_1er)
#undef  packm_8xk_1er_ker_name
#define packm_8xk_1er_ker_name  GENARNAME(packm_8xk_1er)
#undef  packm_10xk_1er_ker_name
#define packm_10xk_1er_ker_name GENARNAME(packm_10xk_1er)
#undef  packm_12xk_1er_ker_name
#define packm_12xk_1er_ker_name GENARNAME(packm_12xk_1er)
#undef  packm_14xk_1er_ker_name
#define packm_14xk_1er_ker_name GENARNAME(packm_14xk_1er)
#undef  packm_16xk_1er_ker_name
#define packm_16xk_1er_ker_name GENARNAME(packm_16xk_1er)

// Instantiate prototypes for above functions via the level-1m kernel API
// template.
#include "bli_l1m_ker.h"

// -- Level-1f kernel prototype redefinitions ----------------------------------

#undef  axpy2v_ker_name
#define axpy2v_ker_name     GENARNAME(axpy2v)
#undef  dotaxpyv_ker_name
#define dotaxpyv_ker_name   GENARNAME(dotaxpyv)
#undef  axpyf_ker_name
#define axpyf_ker_name      GENARNAME(axpyf)
#undef  dotxf_ker_name
#define dotxf_ker_name      GENARNAME(dotxf)
#undef  dotxaxpyf_ker_name
#define dotxaxpyf_ker_name  GENARNAME(dotxaxpyf)

// Instantiate prototypes for above functions via the level-1f kernel API
// template.
#include "bli_l1f_ker.h"

// -- Level-1v kernel prototype redefinitions ----------------------------------

// -- prototypes for completely generic level-1v kernels --

#undef  addv_ker_name
#define addv_ker_name      GENARNAME(addv)
#undef  amaxv_ker_name
#define amaxv_ker_name     GENARNAME(amaxv)
#undef  axpbyv_ker_name
#define axpbyv_ker_name    GENARNAME(axpbyv)
#undef  axpyv_ker_name
#define axpyv_ker_name     GENARNAME(axpyv)
#undef  copyv_ker_name
#define copyv_ker_name     GENARNAME(copyv)
#undef  dotv_ker_name
#define dotv_ker_name      GENARNAME(dotv)
#undef  dotxv_ker_name
#define dotxv_ker_name     GENARNAME(dotxv)
#undef  invertv_ker_name
#define invertv_ker_name   GENARNAME(invertv)
#undef  scalv_ker_name
#define scalv_ker_name     GENARNAME(scalv)
#undef  scal2v_ker_name
#define scal2v_ker_name    GENARNAME(scal2v)
#undef  setv_ker_name
#define setv_ker_name      GENARNAME(setv)
#undef  subv_ker_name
#define subv_ker_name      GENARNAME(subv)
#undef  swapv_ker_name
#define swapv_ker_name     GENARNAME(swapv)
#undef  xpbyv_ker_name
#define xpbyv_ker_name     GENARNAME(xpbyv)

// Instantiate prototypes for above functions via the level-1v kernel API
// template.
#include "bli_l1v_ker.h"

// -- Macros to help concisely instantiate bli_func_init() ---------------------

// Initialize the func_t at func_p with only the complex (c and z) variants
// of opname; NULL is passed for the s and d arguments of bli_func_init().
#define gen_func_init_co( func_p, opname ) \
{ \
    bli_func_init( func_p, NULL,               NULL, \
                           PASTEMAC(c,opname), PASTEMAC(z,opname) ); \
}

// Initialize the func_t at func_p with the s, d, c and z variants of opname.
#define gen_func_init( func_p, opname ) \
{ \
    bli_func_init( func_p, PASTEMAC(s,opname), PASTEMAC(d,opname), \
                           PASTEMAC(c,opname), PASTEMAC(z,opname) ); \
}

// Initialize two func_t objects (func0_p and func1_p) with the same s, d, c
// and z variants of opname. Only referenced from the disabled (#if 0) sup
// mapping in this file.
#define gen_sup_func_init( func0_p, func1_p, opname ) \
{ \
    bli_func_init( func0_p, PASTEMAC(s,opname), PASTEMAC(d,opname), \
                            PASTEMAC(c,opname), PASTEMAC(z,opname) ); \
    bli_func_init( func1_p, PASTEMAC(s,opname), PASTEMAC(d,opname), \
                            PASTEMAC(c,opname), PASTEMAC(z,opname) ); \
}

// -- Helper function for 1m ---------------------------------------------------

// Forward declaration; the definition is the last function of this file.
void GENBAINAME(cntx_init_blkszs)
     (
       ind_t   method,
       num_t   dt,
       cntx_t* cntx
     );

// -----------------------------------------------------------------------------

// Populate cntx with the reference defaults for native execution.
//
// The context is first cleared and then filled, in this order, with: the
// default blocksizes and their multiples, the level-3 virtual and native
// micro-kernels (plus the native preferences), the level-3 sup thresholds,
// handlers, kernels and preferences, the level-1f and level-1v kernels, and
// the packm/unpackm kernels. Finally the method is set to BLIS_NAT.
//
// cntx: the context to initialize; any previous contents are discarded by
//       bli_cntx_clear().
void GENBARNAME(cntx_init)
     (
       cntx_t* cntx
     )
{
    blksz_t  blkszs[ BLIS_NUM_BLKSZS ];
    blksz_t  thresh[ BLIS_NUM_THRESH ];
    func_t*  funcs;
    mbool_t* mbools;
    dim_t    i;
    void**   vfuncs;


    // -- Clear the context ----------------------------------------------------

    bli_cntx_clear( cntx );


    // -- Set blocksizes -------------------------------------------------------

    //                                          s     d     c     z
    bli_blksz_init_easy( &blkszs[ BLIS_KR ],    1,    1,    1,    1 );
    bli_blksz_init_easy( &blkszs[ BLIS_MR ],    4,    4,    4,    4 );
    bli_blksz_init_easy( &blkszs[ BLIS_NR ],   16,    8,    8,    4 );
    bli_blksz_init_easy( &blkszs[ BLIS_MC ],  256,  128,  128,   64 );
    bli_blksz_init_easy( &blkszs[ BLIS_KC ],  256,  256,  256,  256 );
    bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 );
    bli_blksz_init_easy( &blkszs[ BLIS_M2 ], 1000, 1000, 1000, 1000 );
    bli_blksz_init_easy( &blkszs[ BLIS_N2 ], 1000, 1000, 1000, 1000 );
    bli_blksz_init_easy( &blkszs[ BLIS_AF ],    8,    8,    8,    8 );
    bli_blksz_init_easy( &blkszs[ BLIS_DF ],    6,    6,    6,    6 );
    bli_blksz_init_easy( &blkszs[ BLIS_XF ],    4,    4,    4,    4 );

    // Initialize the context with the default blocksize objects and their
    // multiples. Each of the 11 entries below is a triple: blocksize id,
    // blocksize object, and the id of the blocksize it must be a multiple of.
    bli_cntx_set_blkszs
    (
      BLIS_NAT, 11,
      BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR,
      BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR,
      BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR,
      BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR,
      BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR,
      BLIS_KR, &blkszs[ BLIS_KR ], BLIS_KR,
      BLIS_M2, &blkszs[ BLIS_M2 ], BLIS_M2,
      BLIS_N2, &blkszs[ BLIS_N2 ], BLIS_N2,
      BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF,
      BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF,
      BLIS_XF, &blkszs[ BLIS_XF ], BLIS_XF,
      cntx
    );


    // -- Set level-3 virtual micro-kernels ------------------------------------

    funcs = bli_cntx_l3_vir_ukrs_buf( cntx );

    // NOTE: We set the virtual micro-kernel slots to contain the addresses
    // of the native micro-kernels. In general, the ukernels in the virtual
    // ukernel slots are always called, and if the function called happens to
    // be a virtual micro-kernel, it will then know to find its native ukernel
    // (i.e., in the native ukernel slots).
    gen_func_init( &funcs[ BLIS_GEMM_UKR ],       gemm_ukr_name       );
    gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name );
    gen_func_init( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name );
    gen_func_init( &funcs[ BLIS_TRSM_L_UKR ],     trsm_l_ukr_name     );
    gen_func_init( &funcs[ BLIS_TRSM_U_UKR ],     trsm_u_ukr_name     );


    // -- Set level-3 native micro-kernels and preferences ---------------------

    funcs  = bli_cntx_l3_nat_ukrs_buf( cntx );
    mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx );

    gen_func_init( &funcs[ BLIS_GEMM_UKR ],       gemm_ukr_name       );
    gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name );
    gen_func_init( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name );
    gen_func_init( &funcs[ BLIS_TRSM_L_UKR ],     trsm_l_ukr_name     );
    gen_func_init( &funcs[ BLIS_TRSM_U_UKR ],     trsm_u_ukr_name     );

    //                                                  s      d      c      z
    bli_mbool_init( &mbools[ BLIS_GEMM_UKR ],        TRUE,  TRUE,  TRUE,  TRUE );
    bli_mbool_init( &mbools[ BLIS_GEMMTRSM_L_UKR ], FALSE, FALSE, FALSE, FALSE );
    bli_mbool_init( &mbools[ BLIS_GEMMTRSM_U_UKR ], FALSE, FALSE, FALSE, FALSE );
    bli_mbool_init( &mbools[ BLIS_TRSM_L_UKR ],     FALSE, FALSE, FALSE, FALSE );
    bli_mbool_init( &mbools[ BLIS_TRSM_U_UKR ],     FALSE, FALSE, FALSE, FALSE );


    // -- Set level-3 small/unpacked thresholds --------------------------------

    // NOTE: The default thresholds are set to zero so that the sup framework
    // does not activate by default. Note that the semantic meaning of the
    // thresholds is that the sup code path is executed if a dimension is
    // strictly less than its corresponding threshold. So actually, the
    // thresholds specify the minimum dimension size that will still dispatch
    // the non-sup/large code path. This "strictly less than" behavior was
    // chosen over "less than or equal to" so that threshold values of 0 would
    // effectively disable sup (even for matrix dimensions of 0).
    //                                          s     d     c     z
    bli_blksz_init_easy( &thresh[ BLIS_MT ],    0,    0,    0,    0 );
    bli_blksz_init_easy( &thresh[ BLIS_NT ],    0,    0,    0,    0 );
    bli_blksz_init_easy( &thresh[ BLIS_KT ],    0,    0,    0,    0 );

    // Initialize the context with the default thresholds.
    bli_cntx_set_l3_sup_thresh
    (
      3,
      BLIS_MT, &thresh[ BLIS_MT ],
      BLIS_NT, &thresh[ BLIS_NT ],
      BLIS_KT, &thresh[ BLIS_KT ],
      cntx
    );


    // -- Set level-3 small/unpacked handlers ----------------------------------

    vfuncs = bli_cntx_l3_sup_handlers_buf( cntx );

    // Initialize all of the function pointers to NULL;
    for ( i = 0; i < BLIS_NUM_LEVEL3_OPS; ++i ) vfuncs[ i ] = NULL;

    // The level-3 sup handlers are oapi-based, so we only set one slot per
    // operation.

    // Set the gemm and gemmt slots to their default (reference) sup
    // handlers. Every other operation's slot stays NULL.
    vfuncs[ BLIS_GEMM ]  = bli_gemmsup_ref;
    vfuncs[ BLIS_GEMMT ] = bli_gemmtsup_ref;


    // -- Set level-3 small/unpacked micro-kernels and preferences -------------

    funcs  = bli_cntx_l3_sup_kers_buf( cntx );
    mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx );

    // NOTE: The per-storage mapping below is compiled out; the active code
    // that follows it registers the gemmsup_rv_ukr_name kernel for all eight
    // row/column storage combinations instead.
#if 0
    // Adhere to the small/unpacked ukernel mappings:
    // - rv -> rrr, rcr
    // - rg -> rrc, rcc
    // - cv -> ccr, ccc
    // - cg -> crr, crc
    gen_sup_func_init( &funcs[ BLIS_RRR ],
                       &funcs[ BLIS_RCR ], gemmsup_rv_ukr_name );
    gen_sup_func_init( &funcs[ BLIS_RRC ],
                       &funcs[ BLIS_RCC ], gemmsup_rg_ukr_name );
    gen_sup_func_init( &funcs[ BLIS_CCR ],
                       &funcs[ BLIS_CCC ], gemmsup_cv_ukr_name );
    gen_sup_func_init( &funcs[ BLIS_CRR ],
                       &funcs[ BLIS_CRC ], gemmsup_cg_ukr_name );
#endif
    gen_func_init( &funcs[ BLIS_RRR ], gemmsup_rv_ukr_name );
    gen_func_init( &funcs[ BLIS_RRC ], gemmsup_rv_ukr_name );
    gen_func_init( &funcs[ BLIS_RCR ], gemmsup_rv_ukr_name );
    gen_func_init( &funcs[ BLIS_RCC ], gemmsup_rv_ukr_name );
    gen_func_init( &funcs[ BLIS_CRR ], gemmsup_rv_ukr_name );
    gen_func_init( &funcs[ BLIS_CRC ], gemmsup_rv_ukr_name );
    gen_func_init( &funcs[ BLIS_CCR ], gemmsup_rv_ukr_name );
    gen_func_init( &funcs[ BLIS_CCC ], gemmsup_rv_ukr_name );

    // Register the general-stride/generic ukernel to the "catch-all" slot
    // associated with the BLIS_XXX enum value. This slot will be queried if
    // *any* operand is stored with general stride.
    gen_func_init( &funcs[ BLIS_XXX ], gemmsup_gx_ukr_name );

    // Set the l3 sup ukernel storage preferences.
    //                                      s      d      c      z
    bli_mbool_init( &mbools[ BLIS_RRR ], TRUE,  TRUE,  TRUE,  TRUE );
    bli_mbool_init( &mbools[ BLIS_RRC ], TRUE,  TRUE,  TRUE,  TRUE );
    bli_mbool_init( &mbools[ BLIS_RCR ], TRUE,  TRUE,  TRUE,  TRUE );
    bli_mbool_init( &mbools[ BLIS_RCC ], TRUE,  TRUE,  TRUE,  TRUE );
    bli_mbool_init( &mbools[ BLIS_CRR ], TRUE,  TRUE,  TRUE,  TRUE );
    bli_mbool_init( &mbools[ BLIS_CRC ], TRUE,  TRUE,  TRUE,  TRUE );
    bli_mbool_init( &mbools[ BLIS_CCR ], TRUE,  TRUE,  TRUE,  TRUE );
    bli_mbool_init( &mbools[ BLIS_CCC ], TRUE,  TRUE,  TRUE,  TRUE );

    bli_mbool_init( &mbools[ BLIS_XXX ], TRUE,  TRUE,  TRUE,  TRUE );


    // -- Set level-1f kernels -------------------------------------------------

    funcs = bli_cntx_l1f_kers_buf( cntx );

    gen_func_init( &funcs[ BLIS_AXPY2V_KER ],    axpy2v_ker_name    );
    gen_func_init( &funcs[ BLIS_DOTAXPYV_KER ],  dotaxpyv_ker_name  );
    gen_func_init( &funcs[ BLIS_AXPYF_KER ],     axpyf_ker_name     );
    gen_func_init( &funcs[ BLIS_DOTXF_KER ],     dotxf_ker_name     );
    gen_func_init( &funcs[ BLIS_DOTXAXPYF_KER ], dotxaxpyf_ker_name );


    // -- Set level-1v kernels -------------------------------------------------

    funcs = bli_cntx_l1v_kers_buf( cntx );

    gen_func_init( &funcs[ BLIS_ADDV_KER ],    addv_ker_name    );
    gen_func_init( &funcs[ BLIS_AMAXV_KER ],   amaxv_ker_name   );
    gen_func_init( &funcs[ BLIS_AXPBYV_KER ],  axpbyv_ker_name  );
    gen_func_init( &funcs[ BLIS_AXPYV_KER ],   axpyv_ker_name   );
    gen_func_init( &funcs[ BLIS_COPYV_KER ],   copyv_ker_name   );
    gen_func_init( &funcs[ BLIS_DOTV_KER ],    dotv_ker_name    );
    gen_func_init( &funcs[ BLIS_DOTXV_KER ],   dotxv_ker_name   );
    gen_func_init( &funcs[ BLIS_INVERTV_KER ], invertv_ker_name );
    gen_func_init( &funcs[ BLIS_SCALV_KER ],   scalv_ker_name   );
    gen_func_init( &funcs[ BLIS_SCAL2V_KER ],  scal2v_ker_name  );
    gen_func_init( &funcs[ BLIS_SETV_KER ],    setv_ker_name    );
    gen_func_init( &funcs[ BLIS_SUBV_KER ],    subv_ker_name    );
    gen_func_init( &funcs[ BLIS_SWAPV_KER ],   swapv_ker_name   );
    gen_func_init( &funcs[ BLIS_XPBYV_KER ],   xpbyv_ker_name   );


    // -- Set level-1m (packm/unpackm) kernels ---------------------------------

    funcs = bli_cntx_packm_kers_buf( cntx );

    // Initialize all packm kernel func_t entries to NULL.
    for ( i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i )
    {
        bli_func_init_null( &funcs[ i ] );
    }

    gen_func_init( &funcs[ BLIS_PACKM_2XK_KER ],  packm_2xk_ker_name  );
    gen_func_init( &funcs[ BLIS_PACKM_3XK_KER ],  packm_3xk_ker_name  );
    gen_func_init( &funcs[ BLIS_PACKM_4XK_KER ],  packm_4xk_ker_name  );
    gen_func_init( &funcs[ BLIS_PACKM_6XK_KER ],  packm_6xk_ker_name  );
    gen_func_init( &funcs[ BLIS_PACKM_8XK_KER ],  packm_8xk_ker_name  );
    gen_func_init( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_ker_name );
    gen_func_init( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_ker_name );
    gen_func_init( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_ker_name );
    gen_func_init( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_ker_name );
    gen_func_init( &funcs[ BLIS_PACKM_24XK_KER ], packm_24xk_ker_name );

    funcs = bli_cntx_unpackm_kers_buf( cntx );

    // Initialize all unpackm kernel func_t entries to NULL.
    for ( i = BLIS_UNPACKM_0XK_KER; i <= BLIS_UNPACKM_31XK_KER; ++i )
    {
        bli_func_init_null( &funcs[ i ] );
    }

    gen_func_init( &funcs[ BLIS_UNPACKM_2XK_KER ],  unpackm_2xk_ker_name  );
    gen_func_init( &funcs[ BLIS_UNPACKM_4XK_KER ],  unpackm_4xk_ker_name  );
    gen_func_init( &funcs[ BLIS_UNPACKM_6XK_KER ],  unpackm_6xk_ker_name  );
    gen_func_init( &funcs[ BLIS_UNPACKM_8XK_KER ],  unpackm_8xk_ker_name  );
    gen_func_init( &funcs[ BLIS_UNPACKM_10XK_KER ], unpackm_10xk_ker_name );
    gen_func_init( &funcs[ BLIS_UNPACKM_12XK_KER ], unpackm_12xk_ker_name );
    gen_func_init( &funcs[ BLIS_UNPACKM_14XK_KER ], unpackm_14xk_ker_name );
    gen_func_init( &funcs[ BLIS_UNPACKM_16XK_KER ], unpackm_16xk_ker_name );


    // -- Set miscellaneous fields ---------------------------------------------

    bli_cntx_set_method( BLIS_NAT, cntx );
}

// -----------------------------------------------------------------------------

// Adapt an already-populated context for execution via an induced method.
//
// method: the induced method; BLIS_1M selects the 1m kernels, and every other
//         value takes the branches annotated "BLIS_NAT" below.
// cntx:   the context to modify in place.
void GENBAINAME(cntx_init)
     (
       ind_t   method,
       cntx_t* cntx
     )
{
    func_t* funcs;
    dim_t   i;

    // This function is designed to modify a copy of an existing native
    // context to enable computation via an induced method for complex
    // domain level-3 operations. It is called by bli_gks_query_ind_cntx()
    // on a context after its contents are set by copying from the
    // architecture's native context.

    // -- Set induced method level-3 virtual micro-kernels ---------------------

    funcs = bli_cntx_l3_vir_ukrs_buf( cntx );

    // NOTE: Both branches use gen_func_init_co(), which passes NULL for the
    // s and d arguments of bli_func_init(); only the c and z arguments carry
    // kernel addresses here.
    if ( method == BLIS_1M )
    {
        gen_func_init_co( &funcs[ BLIS_GEMM_UKR ],       gemm1m_ukr_name       );
        gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm1m_l_ukr_name );
        gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm1m_u_ukr_name );
        gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ],     trsm1m_l_ukr_name     );
        gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ],     trsm1m_u_ukr_name     );
    }
    else // if ( method == BLIS_NAT )
    {
        gen_func_init_co( &funcs[ BLIS_GEMM_UKR ],       gemm_ukr_name       );
        gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name );
        gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name );
        gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ],     trsm_l_ukr_name     );
        gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ],     trsm_u_ukr_name     );
    }

    // For 1m, we employ an optimization which requires that we copy the native
    // real domain gemm ukernel function pointers to the corresponding real
    // domain slots in the virtual gemm ukernel func_t. This optimization allows
    // us to, under certain conditions, adjust various parameters within the gemm
    // macrokernel so that the real-domain macrokernel (which will query and use
    // the real-domain virtual gemm ukernel) can be called instead of calling the
    // complex-domain macrokernel and the corresponding complex-domain virtual
    // microkernel. The non-optimized code path would require an extra level of
    // function call overhead, which can be avoided in most cases (i.e., when
    // beta has a zero imaginary component and C is either row- or column-stored).
    if ( method == BLIS_1M )
    {
        func_t* gemm_nat_ukrs = bli_cntx_get_l3_nat_ukrs( BLIS_GEMM_UKR, cntx );
        func_t* gemm_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, cntx );

        bli_func_copy_dt( BLIS_FLOAT,  gemm_nat_ukrs, BLIS_FLOAT,  gemm_vir_ukrs );
        bli_func_copy_dt( BLIS_DOUBLE, gemm_nat_ukrs, BLIS_DOUBLE, gemm_vir_ukrs );
    }


    // -- Set induced method packm kernels -------------------------------------

    funcs = bli_cntx_packm_kers_buf( cntx );

    // Initialize all packm kernel func_t entries to NULL.
    for ( i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i )
    {
        bli_func_init_null( &funcs[ i ] );
    }

    if ( method == BLIS_1M )
    {
        // Only the even panel widths 2 through 16 get "_1er" kernels.
        gen_func_init_co( &funcs[ BLIS_PACKM_2XK_KER ],  packm_2xk_1er_ker_name  );
        gen_func_init_co( &funcs[ BLIS_PACKM_4XK_KER ],  packm_4xk_1er_ker_name  );
        gen_func_init_co( &funcs[ BLIS_PACKM_6XK_KER ],  packm_6xk_1er_ker_name  );
        gen_func_init_co( &funcs[ BLIS_PACKM_8XK_KER ],  packm_8xk_1er_ker_name  );
        gen_func_init_co( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_1er_ker_name );
        gen_func_init_co( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_1er_ker_name );
        gen_func_init_co( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_1er_ker_name );
        gen_func_init_co( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_1er_ker_name );
    }
    else // if ( method == BLIS_NAT )
    {
        gen_func_init( &funcs[ BLIS_PACKM_2XK_KER ],  packm_2xk_ker_name  );
        gen_func_init( &funcs[ BLIS_PACKM_3XK_KER ],  packm_3xk_ker_name  );
        gen_func_init( &funcs[ BLIS_PACKM_4XK_KER ],  packm_4xk_ker_name  );
        gen_func_init( &funcs[ BLIS_PACKM_6XK_KER ],  packm_6xk_ker_name  );
        gen_func_init( &funcs[ BLIS_PACKM_8XK_KER ],  packm_8xk_ker_name  );
        gen_func_init( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_ker_name );
        gen_func_init( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_ker_name );
        gen_func_init( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_ker_name );
        gen_func_init( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_ker_name );
        gen_func_init( &funcs[ BLIS_PACKM_24XK_KER ], packm_24xk_ker_name );
    }


    // -- Set induced method cache and register blocksizes ---------------------

    // Modify the context with cache and register blocksizes (and multiples)
    // appropriate for the current induced method.
    if ( method == BLIS_1M )
    {
        //const bool is_pb = FALSE;

        // Call a helper function to initialize blocksizes for each complex
        // datatype.
        GENBAINAME(cntx_init_blkszs)( method, BLIS_SCOMPLEX, cntx );
        GENBAINAME(cntx_init_blkszs)( method, BLIS_DCOMPLEX, cntx );
    }
    else // if ( method == BLIS_NAT )
    {
        // No change in blocksizes needed for native execution.
    }
}

// -----------------------------------------------------------------------------

// Set the induced-method blocksizes of one complex datatype in cntx.
//
// method: the induced method; it is stored into cntx before anything else.
// dt:     the complex datatype whose blocksizes are adjusted.
// cntx:   the context to modify in place.
//
// The scaling factors depend on whether the virtual gemm micro-kernel for dt
// prefers column storage (algorithm 1m_c_bp) or not (algorithm 1m_r_bp).
void GENBAINAME(cntx_init_blkszs)
     (
       ind_t   method,
       num_t   dt,
       cntx_t* cntx
     )
{
    // We MUST set the induced method in the context prior to calling
    // bli_cntx_l3_vir_ukr_prefers_cols_dt() because that function queries
    // the induced method. That function needs the induced method value in
    // order to determine whether to evaluate the "prefers column storage"
    // predicate using the storage preference of the kernel for dt, or
    // the storage preference of the kernel for the real projection of
    // dt. Failing to set the induced method here can lead to strange
    // undefined behavior at runtime if the native complex kernel's
    // storage preference happens to not equal that of the native real
    // kernel.
    bli_cntx_set_method( method, cntx );

    // Initialize the blocksizes according to the micro-kernel preference as
    // well as the algorithm.
    if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) )
    {
        // This branch is used for algorithm 1m_c_bp.

        bli_cntx_set_ind_blkszs
        (
          method, dt, 6,
          BLIS_NC, 1.0, 1.0,
          BLIS_KC, 2.0, 2.0, // halve kc...
          BLIS_MC, 2.0, 2.0, // halve mc...
          BLIS_NR, 1.0, 1.0,
          BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr)
          BLIS_KR, 1.0, 1.0,
          cntx
        );
    }
    else // if ( bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) )
    {
        // This branch is used for algorithm 1m_r_bp.

        bli_cntx_set_ind_blkszs
        (
          method, dt, 6,
          BLIS_NC, 2.0, 2.0, // halve nc...
          BLIS_KC, 2.0, 2.0, // halve kc...
          BLIS_MC, 1.0, 1.0,
          BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr)
          BLIS_MR, 1.0, 1.0,
          BLIS_KR, 1.0, 1.0,
          cntx
        );
    }
}

cython-blis-0.9.1/blis/_src/ref_kernels/ind/000077500000000000000000000000001427272030600207075ustar00rootroot00000000000000cython-blis-0.9.1/blis/_src/ref_kernels/ind/bli_gemm1m_ref.c000066400000000000000000000213671427272030600237310ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "blis.h" #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ dim_t m, \ dim_t n, \ dim_t k, \ ctype* restrict alpha, \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ auxinfo_t* restrict data, \ cntx_t* restrict cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ const num_t dt_r = PASTEMAC(chr,type); \ \ PASTECH(chr,gemm_ukr_ft) \ rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ const bool row_pref = !col_pref; \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ \ const dim_t mr_r = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \ const dim_t nr_r = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \ \ const dim_t k2 = 2 * k; \ \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype_r ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ inc_t rs_ct; \ inc_t cs_ct; \ \ ctype_r* restrict a_r = ( ctype_r* )a; \ \ ctype_r* restrict b_r = ( ctype_r* )b; \ \ ctype_r* restrict zero_r = PASTEMAC(chr,0); \ \ ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ \ ctype_r* restrict beta_r = &PASTEMAC(ch,real)( *beta ); \ ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \ \ ctype_r* c_use; \ inc_t rs_c_use; \ inc_t cs_c_use; \ \ bool using_ct; \ \ /* PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: a", mr, 2*k, \ a_r, 1, mr, "%5.2f", "" ); \ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: b", 2*k, 2*nr, \ b_r, 2*nr, 1, "%5.2f", "" ); \ PASTEMAC(chr,fprintm)( stdout, "gemm_ukr: c after", mr, 2*nr, \ c_use, rs_c_use, cs_c_use, "%5.2f", "" ); \ */ \ \ /* SAFETY CHECK: The higher level implementation should never allow an alpha with non-zero imaginary component to be 
passed in, because it can't be applied properly using the 1m method. If alpha is not real, then something is very wrong. */ \ if ( !PASTEMAC(chr,eq0)( *alpha_i ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ \ /* If beta has a non-zero imaginary component OR if c is stored with general stride, then we compute the alpha*a*b product into temporary storage and then accumulate that result into c afterwards. Note that the other two cases concerning disagreement between the storage of C and the output preference of the micro-kernel, should ONLY occur in the context of trsm, whereby this virtual micro-kernel is called directly from the trsm macro-kernel to update the micro-tile b11 that exists within the packed row-panel of B. Indeed that is the reason those cases MUST be explicitly handled. */ \ if ( !PASTEMAC(chr,eq0)( *beta_i ) ) using_ct = TRUE; \ else if ( bli_is_col_stored( rs_c, cs_c ) && row_pref ) using_ct = TRUE; \ else if ( bli_is_row_stored( rs_c, cs_c ) && col_pref ) using_ct = TRUE; \ else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ else using_ct = FALSE; \ \ \ /* If we are not computing a full micro-tile, then we must write to ct and then accumulate to c afterwards. */ \ if ( mr != m || nr != n ) using_ct = TRUE; \ \ \ if ( using_ct ) \ { \ /* In the atypical cases, we compute the result into temporary workspace ct and then accumulated it back to c at the end. */ \ \ /* Set the strides of ct based on the preference of the underlying native real domain gemm micro-kernel. Note that we set the ct strides in units of complex elements. */ \ if ( col_pref ) { rs_ct = 1; cs_ct = mr; } \ else { rs_ct = nr; cs_ct = 1; } \ \ c_use = ( ctype_r* )ct; \ rs_c_use = rs_ct; \ cs_c_use = cs_ct; \ \ /* Convert the strides from being in units of complex elements to be in units of real elements. 
Note that we don't need to check for general storage here because that case corresponds to the scenario where we are using the ct buffer and its rs_ct/cs_ct strides. */ \ if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ else rs_c_use *= 2; \ \ /* The following gemm micro-kernel call implements the 1m method, which induces a complex matrix multiplication by calling the real matrix micro-kernel on micro-panels that have been packed according to the 1e and 1r formats. */ \ \ /* c = beta * c + alpha_r * a * b; */ \ rgemm_ukr \ ( \ mr_r, \ nr_r, \ k2, \ alpha_r, \ a_r, \ b_r, \ zero_r, \ c_use, rs_c_use, cs_c_use, \ data, \ cntx \ ); \ \ dim_t i, j; \ \ /* Accumulate the final result in ct back to c. */ \ if ( PASTEMAC(ch,eq1)( *beta ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,adds)( *(ct + i*rs_ct + j*cs_ct), \ *(c + i*rs_c + j*cs_c ) ); \ } \ } \ else if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,copys)( *(ct + i*rs_ct + j*cs_ct), \ *(c + i*rs_c + j*cs_c ) ); \ } \ } \ else \ { \ for ( j = 0; j < n; ++j ) \ for ( i = 0; i < m; ++i ) \ { \ PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \ *beta, \ *(c + i*rs_c + j*cs_c ) ); \ } \ } \ } \ else \ { \ /* In the typical cases, we use the real part of beta and accumulate directly into the output matrix c. */ \ \ c_use = ( ctype_r* )c; \ rs_c_use = rs_c; \ cs_c_use = cs_c; \ \ /* Convert the strides from being in units of complex elements to be in units of real elements. Note that we don't need to check for general storage here because that case corresponds to the scenario where we are using the ct buffer and its rs_ct/cs_ct strides. 
*/ \ if ( bli_is_col_stored( rs_c_use, cs_c_use ) ) cs_c_use *= 2; \ else rs_c_use *= 2; \ \ /* The following gemm micro-kernel call implements the 1m method, which induces a complex matrix multiplication by calling the real matrix micro-kernel on micro-panels that have been packed according to the 1e and 1r formats. */ \ \ /* c = beta * c + alpha_r * a * b; */ \ rgemm_ukr \ ( \ mr_r, \ nr_r, \ k2, \ alpha_r, \ a_r, \ b_r, \ beta_r, \ c_use, rs_c_use, cs_c_use, \ data, \ cntx \ ); \ } \ } INSERT_GENTFUNCCO_BASIC2( gemm1m, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) cython-blis-0.9.1/blis/_src/ref_kernels/ind/bli_gemmtrsm1m_ref.c000066400000000000000000000214721427272030600246340ustar00rootroot00000000000000/* BLIS An object-based framework for developing high-performance BLAS-like libraries. Copyright (C) 2014, The University of Texas at Austin Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name(s) of the copyright holder(s) nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
   IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
   THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

/*
   gemmtrsm1m reference micro-kernel, instantiated below once for the lower
   (BLIS_TRSM_L_UKR) and once for the upper (BLIS_TRSM_U_UKR) case.

   Steps performed on the packed microtile b11:
     1. bt  = -1.0 * a1x * bx1, computed by the native real-domain gemm
        micro-kernel on the operands reinterpreted as ctype_r matrices
        (dimensions mr_r x nr_r, inner dimension k2 = 2*k), into the local
        buffer bt.
     2. b11 = alpha * b11 + bt, written back element by element in the
        layout of the pack schema reported by the auxinfo object (1e or 1r).
        Only the real part of alpha is used.
     3. b11 = inv(a11) * b11 and c11 = b11, via the virtual trsm micro-kernel
        looked up with trsmkerid.
   When m < mr or n < nr, step 3 writes to the local mr x nr buffer ct and
   only its leading m x n part is copied to c11 afterwards.
*/
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf, trsmkerid ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       dim_t m, \
       dim_t n, \
       dim_t k, \
       ctype* restrict alpha, \
       ctype* restrict a1x, \
       ctype* restrict a11, \
       ctype* restrict bx1, \
       ctype* restrict b11, \
       ctype* restrict c11, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
    const num_t dt_r = PASTEMAC(chr,type); \
\
    PASTECH(chr,gemm_ukr_ft) \
    rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
\
    PASTECH(ch,trsm_ukr_ft) \
    ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \
\
    const bool col_pref_r = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \
\
    const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
    const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
\
    const dim_t mr_r = bli_cntx_get_blksz_def_dt( dt_r, BLIS_MR, cntx ); \
    const dim_t nr_r = bli_cntx_get_blksz_def_dt( dt_r, BLIS_NR, cntx ); \
\
    /* Local microtile that receives the gemm product. */ \
    ctype bt[ BLIS_STACK_BUF_MAX_SIZE \
    / sizeof( ctype ) ] \
    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
    inc_t rs_bt; \
    inc_t cs_bt; \
\
    inc_t rs_bt_r; \
    inc_t cs_bt_r; \
\
    const dim_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
\
    const pack_t schema_b = bli_auxinfo_schema_b( data ); \
\
    /* Inner dimension seen by the real-domain gemm micro-kernel. */ \
    const dim_t k2 = 2 * k; \
\
    ctype_r* restrict a1x_r = ( ctype_r* )a1x; \
\
    ctype_r* restrict bx1_r = ( ctype_r* )bx1; \
\
    const inc_t rs_b = packnr; \
    const inc_t cs_b = 1; \
\
    ctype_r* restrict zero_r = PASTEMAC(chr,0); \
    ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \
\
    const ctype_r alpha_r = PASTEMAC(ch,real)( *alpha ); \
    const ctype_r alpha_i = PASTEMAC(ch,imag)( *alpha ); \
\
    ctype_r* b_use; \
    inc_t rs_b_use; \
    inc_t cs_b_use; \
\
    /* Local output microtile, used only for edge cases (see use_ct). */ \
    ctype ct[ BLIS_STACK_BUF_MAX_SIZE \
    / sizeof( ctype ) ] \
    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
    /* FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR instead? */ \
    const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
    const inc_t rs_ct = ( col_pref ? 1 : nr ); \
    const inc_t cs_ct = ( col_pref ? mr : 1 ); \
\
    /* A partial microtile is routed through ct and copied out at the end. */ \
    const bool use_ct = ( m < mr || n < nr ); \
\
    ctype* restrict c11_use = c11; \
    inc_t rs_c_use = rs_c; \
    inc_t cs_c_use = cs_c; \
\
    if ( use_ct ) \
    { \
        c11_use = ct; \
        rs_c_use = rs_ct; \
        cs_c_use = cs_ct; \
    } \
\
\
    /* Handle alphas with non-zero imaginary components. */ \
    /* NOTE: This branch should never execute because alphas with \
       non-zero imaginary components should be applied during packing, \
       and so the only alphas we should see here are those exclusively \
       in the real domain, either because the value originally had no \
       imaginary component (e.g. 4.0) or because a 1.0 was sent in as a \
       placeholder since the alpha was applied during packing. */ \
    if ( 0 ) \
    if ( !PASTEMAC(chr,eq0)( alpha_i ) ) \
    { \
        bli_abort(); \
\
/*
        ctype_r* restrict one_r = PASTEMAC(chr,1); \
\
        const inc_t ld_b = rs_b; \
\
        PASTEMAC(ch,scal1ms_mxn)( schema_b, \
        mr, \
        nr, \
        alpha, \
        b11, rs_b, cs_b, ld_b ); \
\
        alpha_r = *one_r; \
*/ \
    } \
\
\
    { \
        /* Set the strides for the temporary bt matrix based on the native \
           real domain micro-kernel storage preferences. */ \
        if ( col_pref_r ) { rs_bt = 1; cs_bt = mr; \
        rs_bt_r = 1; cs_bt_r = mr_r; } \
        else { rs_bt = nr; cs_bt = 1; \
        rs_bt_r = nr_r; cs_bt_r = 1; } \
\
        b_use = ( ctype_r* )bt; \
        rs_b_use = rs_bt_r; \
        cs_b_use = cs_bt_r; \
    } \
\
\
    /* Since b11 is stored in the 1e or 1r schema, we cannot update it \
       directly, and instead must compute the matrix product in a local \
       temporary microtile and then accumulate it into b11 according to \
       its schema. */ \
\
\
    /* lower: bt = -1.0 * a10 * b01;
       upper: bt = -1.0 * a12 * b21; */ \
    rgemm_ukr \
    ( \
      mr_r, \
      nr_r, \
      k2, \
      minus_one_r, \
      a1x_r, \
      bx1_r, \
      zero_r, \
      b_use, rs_b_use, cs_b_use, \
      data, \
      cntx \
    ); \
\
\
    if ( bli_is_1e_packed( schema_b ) ) \
    { \
        const inc_t ld_b = rs_b; \
\
        ctype* restrict b11_ri = ( ctype* )b11; \
        ctype* restrict b11_ir = ( ctype* )b11 + ld_b/2; \
\
        dim_t i, j; \
\
        /* b11 = alpha * b11 + bt; */ \
        for ( j = 0; j < nr; ++j ) \
        for ( i = 0; i < mr; ++i ) \
        { \
            ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \
            ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \
            ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \
            ctype* restrict beta11_ri = b11_ri + i*rs_b + j*cs_b; \
            ctype_r* restrict beta11_r = &PASTEMAC(ch,real)( *beta11_ri ); \
            ctype_r* restrict beta11_i = &PASTEMAC(ch,imag)( *beta11_ri ); \
            ctype* restrict beta11_ir = b11_ir + i*rs_b + j*cs_b; \
\
            PASTEMAC3(ch,chr,ch,xpbyris) \
            ( \
              *beta11t_r, \
              *beta11t_i, \
              alpha_r, \
              alpha_i, /* alpha_i not referenced */ \
              *beta11_r, \
              *beta11_i \
            ); \
\
            /* Keep the second (-imag, real) copy of the element in sync. */ \
            PASTEMAC(ch,sets)( -*beta11_i, \
            *beta11_r, *beta11_ir ); \
        } \
    } \
    else /* if ( bli_is_1r_packed( schema_b ) ) */ \
    { \
        const inc_t ld_b = rs_b; \
        const inc_t rs_b2 = 2 * rs_b; \
        const inc_t cs_b2 = cs_b; \
\
        ctype_r* restrict b11_r = ( ctype_r* )b11; \
        ctype_r* restrict b11_i = ( ctype_r* )b11 + ld_b; \
\
        dim_t i, j; \
\
        /* b11 = alpha * b11 + bt; */ \
        for ( j = 0; j < nr; ++j ) \
        for ( i = 0; i < mr; ++i ) \
        { \
            ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \
            ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \
            ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \
            ctype_r* restrict beta11_r = b11_r + i*rs_b2 + j*cs_b2; \
            ctype_r* restrict beta11_i = b11_i + i*rs_b2 + j*cs_b2; \
\
            PASTEMAC3(ch,chr,ch,xpbyris) \
            ( \
              *beta11t_r, \
              *beta11t_i, \
              alpha_r, \
              alpha_i, /* alpha_i not referenced */ \
              *beta11_r, \
              *beta11_i \
            ); \
        } \
    } \
\
\
    /* b11 = inv(a11) * b11;
       c11 = b11; */ \
    ctrsm_vir_ukr \
    ( \
      a11, \
      b11, \
      c11_use, rs_c_use, cs_c_use, \
      data, \
      cntx \
    ); \
\
    /* Edge case: copy only the valid m x n part of ct back to c11. */ \
    if ( use_ct ) \
    { \
        PASTEMAC(ch,copys_mxn) \
        ( \
          m, n, \
          ct, rs_ct, cs_ct, \
          c11, rs_c, cs_c \
        ); \
    } \
}

INSERT_GENTFUNCCO_BASIC3( gemmtrsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR )
INSERT_GENTFUNCCO_BASIC3( gemmtrsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR )
cython-blis-0.9.1/blis/_src/ref_kernels/ind/bli_trsm1m_ref.c000066400000000000000000000417041427272030600237660ustar00rootroot00000000000000/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
   IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
   THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

/*
   trsm1m_l reference micro-kernel (forward substitution).

   Rows of the mr x nr microtile b are solved in the order i = 0 .. mr-1.
   For every element of row i, the dot product of row i of a with the
   n_behind = i rows of b that were already solved is subtracted, the
   result is combined with the diagonal entry alpha11 through diagop, and
   the value is stored both to c and back to b.

   Layout handled per pack schema of b:
     - 1e: a is read as separate real/imaginary planes ld_a apart; each
           solved element is stored to b twice, as (real, imag) and as
           (-imag, real) at an offset of ld_b/2 complex elements.
     - 1r: a is read as complex elements; the real and imaginary parts of
           b live in separate rows ld_b real elements apart.

   diagop is scalris when BLIS_ENABLE_TRSM_PREINVERSION is defined and
   invscalris otherwise (see the instantiation below the macro).
*/
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf, diagop ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
    const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
\
    const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
    const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
\
    const dim_t m = mr; \
    const dim_t n = nr; \
\
    const inc_t rs_a = 1; \
    const inc_t cs_a = packmr; \
\
    const inc_t rs_b = packnr; \
    const inc_t cs_b = 1; \
\
    const inc_t ld_a = cs_a; \
    const inc_t ld_b = rs_b; \
\
    const pack_t schema_b = bli_auxinfo_schema_b( data ); \
\
    dim_t iter, i, j, l; \
    dim_t n_behind; \
\
\
    if ( bli_is_1e_packed( schema_b ) ) \
    { \
        const inc_t rs_a2 = 1 * rs_a; \
        const inc_t cs_a2 = 2 * cs_a; \
\
        ctype_r* restrict a_r = ( ctype_r* )a; \
        ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \
\
        ctype* restrict b_ri = ( ctype* )b; \
        ctype* restrict b_ir = ( ctype* )b + ld_b/2; \
\
        for ( iter = 0; iter < m; ++iter ) \
        { \
            i = iter; \
            n_behind = i; \
\
            ctype_r* restrict alpha11_r = a_r + (i )*rs_a2 + (i )*cs_a2; \
            ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \
            ctype_r* restrict a10t_r = a_r + (i )*rs_a2 + (0 )*cs_a2; \
            ctype_r* restrict a10t_i = a_i + (i )*rs_a2 + (0 )*cs_a2; \
            ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \
            ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \
            ctype* restrict B0_ri = b_ri + (0 )*rs_b + (0 )*cs_b; \
\
            /* b1 = b1 - a10t * B0; */ \
            /* b1 = b1 / alpha11; */ \
            for ( j = 0; j < n; ++j ) \
            { \
                ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \
                ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \
                ctype* restrict b01_ri = B0_ri + (0 )*rs_b + (j )*cs_b; \
                ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \
                ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \
                ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \
                ctype_r rho11_r; \
                ctype_r rho11_i; \
\
                /* beta11 = beta11 - a10t * b01; */ \
                PASTEMAC(ch,set0ris)( rho11_r, \
                rho11_i ); \
                for ( l = 0; l < n_behind; ++l ) \
                { \
                    ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a2; \
                    ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a2; \
                    ctype* restrict beta01_ri = b01_ri + (l )*rs_b; \
                    ctype_r* restrict beta01_r = &PASTEMAC(ch,real)( *beta01_ri ); \
                    ctype_r* restrict beta01_i = &PASTEMAC(ch,imag)( *beta01_ri ); \
\
                    PASTEMAC(ch,axpyris)( *alpha10_r, \
                    *alpha10_i, \
                    *beta01_r, \
                    *beta01_i, \
                    rho11_r, \
                    rho11_i ); \
                } \
                PASTEMAC(ch,subris)( rho11_r, \
                rho11_i, \
                beta11c_r, \
                beta11c_i ); \
\
                /* beta11 = beta11 / alpha11; */ \
                /* NOTE: When preinversion is enabled, the INVERSE of alpha11 \
                   (1.0/alpha11) is stored during packing instead of alpha11 so we \
                   can multiply rather than divide. When preinversion is disabled, \
                   alpha11 is stored and division happens below explicitly. */ \
                PASTEMAC(ch,diagop)( *alpha11_r, \
                *alpha11_i, \
                beta11c_r, \
                beta11c_i ); \
\
                /* Output final result to matrix c. */ \
                PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \
\
                /* Store the local values back to b11. */ \
                PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \
                PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \
            } \
        } \
    } \
    else /* ( bli_is_1r_packed( schema_b ) ) */ \
    { \
        const inc_t rs_b2 = 2 * rs_b; \
        const inc_t cs_b2 = 1 * cs_b; \
\
        ctype* restrict a_ri = ( ctype* )a; \
        /*ctype* restrict a_ir = ( ctype* )a + ld_a/2;*/ \
\
        ctype_r* restrict b_r = ( ctype_r* )b; \
        ctype_r* restrict b_i = ( ctype_r* )b + ld_b; \
\
        for ( iter = 0; iter < m; ++iter ) \
        { \
            i = iter; \
            n_behind = i; \
\
            ctype* restrict alpha11_ri = a_ri + (i )*rs_a + (i )*cs_a; \
            ctype_r* restrict alpha11_r = &PASTEMAC(ch,real)( *alpha11_ri ); \
            ctype_r* restrict alpha11_i = &PASTEMAC(ch,imag)( *alpha11_ri ); \
            ctype* restrict a10t_ri = a_ri + (i )*rs_a + (0 )*cs_a; \
            ctype_r* restrict b1_r = b_r + (i )*rs_b2 + (0 )*cs_b2; \
            ctype_r* restrict b1_i = b_i + (i )*rs_b2 + (0 )*cs_b2; \
            ctype_r* restrict B0_r = b_r + (0 )*rs_b2 + (0 )*cs_b2; \
            ctype_r* restrict B0_i = b_i + (0 )*rs_b2 + (0 )*cs_b2; \
\
            /* b1 = b1 - a10t * B0; */ \
            /* b1 = b1 / alpha11; */ \
            for ( j = 0; j < n; ++j ) \
            { \
                ctype_r* restrict beta11_r = b1_r + (0 )*rs_b2 + (j )*cs_b2; \
                ctype_r* restrict beta11_i = b1_i + (0 )*rs_b2 + (j )*cs_b2; \
                ctype_r* restrict b01_r = B0_r + (0 )*rs_b2 + (j )*cs_b2; \
                ctype_r* restrict b01_i = B0_i + (0 )*rs_b2 + (j )*cs_b2; \
                ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \
                ctype_r beta11c_r = *beta11_r; \
                ctype_r beta11c_i = *beta11_i; \
                ctype_r rho11_r; \
                ctype_r rho11_i; \
\
                /* beta11 = beta11 - a10t * b01; */ \
                PASTEMAC(ch,set0ris)( rho11_r, \
                rho11_i ); \
                for ( l = 0; l < n_behind; ++l ) \
                { \
                    ctype* restrict alpha10_ri = a10t_ri + (l )*cs_a; \
                    ctype_r* restrict alpha10_r = &PASTEMAC(ch,real)( *alpha10_ri ); \
                    ctype_r* restrict alpha10_i = &PASTEMAC(ch,imag)( *alpha10_ri ); \
                    ctype_r* restrict beta01_r = b01_r + (l )*rs_b2; \
                    ctype_r* restrict beta01_i = b01_i + (l )*rs_b2; \
\
                    PASTEMAC(ch,axpyris)( *alpha10_r, \
                    *alpha10_i, \
                    *beta01_r, \
                    *beta01_i, \
                    rho11_r, \
                    rho11_i ); \
                } \
                PASTEMAC(ch,subris)( rho11_r, \
                rho11_i, \
                beta11c_r, \
                beta11c_i ); \
\
                /* beta11 = beta11 / alpha11; */ \
                /* NOTE: When preinversion is enabled, the INVERSE of alpha11 \
                   (1.0/alpha11) is stored during packing instead of alpha11 so we \
                   can multiply rather than divide. When preinversion is disabled, \
                   alpha11 is stored and division happens below explicitly. */ \
                PASTEMAC(ch,diagop)( *alpha11_r, \
                *alpha11_i, \
                beta11c_r, \
                beta11c_i ); \
\
                /* Output final result to matrix c. */ \
                PASTEMAC(ch,sets)( beta11c_r, \
                beta11c_i, *gamma11 ); \
\
                /* Store the local values back to b11. */ \
                PASTEMAC(ch,copyris)( beta11c_r, \
                beta11c_i, \
                *beta11_r, \
                *beta11_i ); \
            } \
        } \
    } \
}

#ifdef BLIS_ENABLE_TRSM_PREINVERSION
INSERT_GENTFUNCCO_BASIC3( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris )
#else
INSERT_GENTFUNCCO_BASIC3( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris )
#endif

/*
   trsm1m_u reference micro-kernel (backward substitution).

   Same structure as the lower-triangular kernel above, except that rows
   are solved in the order i = mr-1 .. 0 and the n_behind = iter rows that
   were already solved are the ones below row i (a12t and B2 start at
   column/row i+1).
*/
#undef GENTFUNCCO
#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf, diagop ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       ctype* restrict a, \
       ctype* restrict b, \
       ctype* restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t* restrict cntx \
     ) \
{ \
    const num_t dt = PASTEMAC(ch,type); \
\
    const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
    const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
\
    const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
    const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
\
    const dim_t m = mr; \
    const dim_t n = nr; \
\
    const inc_t rs_a = 1; \
    const inc_t cs_a = packmr; \
\
    const inc_t rs_b = packnr; \
    const inc_t cs_b = 1; \
\
    const inc_t ld_a = cs_a; \
    const inc_t ld_b = rs_b; \
\
    const pack_t schema_b = bli_auxinfo_schema_b( data ); \
\
    dim_t iter, i, j, l; \
    dim_t n_behind; \
\
\
    if ( bli_is_1e_packed( schema_b ) ) \
    { \
        const inc_t rs_a2 = 1 * rs_a; \
        const inc_t cs_a2 = 2 * cs_a; \
\
        ctype_r* restrict a_r = ( ctype_r* )a; \
        ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \
\
        ctype* restrict b_ri = ( ctype* )b; \
        ctype* restrict b_ir = ( ctype* )b + ld_b/2; \
\
        for ( iter = 0; iter < m; ++iter ) \
        { \
            i = m - iter - 1; \
            n_behind = iter; \
\
            ctype_r* restrict alpha11_r = a_r + (i )*rs_a2 + (i )*cs_a2; \
            ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \
            ctype_r* restrict a12t_r = a_r + (i )*rs_a2 + (i+1)*cs_a2; \
            ctype_r* restrict a12t_i = a_i + (i )*rs_a2 + (i+1)*cs_a2; \
            ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \
            ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \
            ctype* restrict B2_ri = b_ri + (i+1)*rs_b + (0 )*cs_b; \
\
            /* b1 = b1 - a12t * B2; */ \
            /* b1 = b1 / alpha11; */ \
            for ( j = 0; j < n; ++j ) \
            { \
                ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \
                ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \
                ctype* restrict b21_ri = B2_ri + (0 )*rs_b + (j )*cs_b; \
                ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \
                ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \
                ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \
                ctype_r rho11_r; \
                ctype_r rho11_i; \
\
                /* beta11 = beta11 - a12t * b21; */ \
                PASTEMAC(ch,set0ris)( rho11_r, \
                rho11_i ); \
                for ( l = 0; l < n_behind; ++l ) \
                { \
                    ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a2; \
                    ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a2; \
                    ctype* restrict beta21_ri = b21_ri + (l )*rs_b; \
                    ctype_r* restrict beta21_r = &PASTEMAC(ch,real)( *beta21_ri ); \
                    ctype_r* restrict beta21_i = &PASTEMAC(ch,imag)( *beta21_ri ); \
\
                    PASTEMAC(ch,axpyris)( *alpha12_r, \
                    *alpha12_i, \
                    *beta21_r, \
                    *beta21_i, \
                    rho11_r, \
                    rho11_i ); \
                } \
                PASTEMAC(ch,subris)( rho11_r, \
                rho11_i, \
                beta11c_r, \
                beta11c_i ); \
\
                /* beta11 = beta11 / alpha11; */ \
                /* NOTE: When preinversion is enabled, the INVERSE of alpha11 \
                   (1.0/alpha11) is stored during packing instead of alpha11 so we \
                   can multiply rather than divide. When preinversion is disabled, \
                   alpha11 is stored and division happens below explicitly. */ \
                PASTEMAC(ch,diagop)( *alpha11_r, \
                *alpha11_i, \
                beta11c_r, \
                beta11c_i ); \
\
                /* Output final result to matrix c. */ \
                PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \
\
                /* Store the local values back to b11. */ \
                PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \
                PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \
            } \
        } \
    } \
    else /* if ( bli_is_1r_packed( schema_b ) ) */ \
    { \
        const inc_t rs_b2 = 2 * rs_b; \
        const inc_t cs_b2 = 1 * cs_b; \
\
        ctype* restrict a_ri = ( ctype* )a; \
        /*ctype* restrict a_ir = ( ctype* )a + ld_a/2;*/ \
\
        ctype_r* restrict b_r = ( ctype_r* )b; \
        ctype_r* restrict b_i = ( ctype_r* )b + ld_b; \
\
        for ( iter = 0; iter < m; ++iter ) \
        { \
            i = m - iter - 1; \
            n_behind = iter; \
\
            ctype* restrict alpha11_ri = a_ri + (i )*rs_a + (i )*cs_a; \
            ctype_r* restrict alpha11_r = &PASTEMAC(ch,real)( *alpha11_ri ); \
            ctype_r* restrict alpha11_i = &PASTEMAC(ch,imag)( *alpha11_ri ); \
            ctype* restrict a12t_ri = a_ri + (i )*rs_a + (i+1)*cs_a; \
            ctype_r* restrict b1_r = b_r + (i )*rs_b2 + (0 )*cs_b2; \
            ctype_r* restrict b1_i = b_i + (i )*rs_b2 + (0 )*cs_b2; \
            ctype_r* restrict B2_r = b_r + (i+1)*rs_b2 + (0 )*cs_b2; \
            ctype_r* restrict B2_i = b_i + (i+1)*rs_b2 + (0 )*cs_b2; \
\
            /* b1 = b1 - a12t * B2; */ \
            /* b1 = b1 / alpha11; */ \
            for ( j = 0; j < n; ++j ) \
            { \
                ctype_r* restrict beta11_r = b1_r + (0 )*rs_b2 + (j )*cs_b2; \
                ctype_r* restrict beta11_i = b1_i + (0 )*rs_b2 + (j )*cs_b2; \
                ctype_r* restrict b21_r = B2_r + (0 )*rs_b2 + (j )*cs_b2; \
                ctype_r* restrict b21_i = B2_i + (0 )*rs_b2 + (j )*cs_b2; \
                ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \
                ctype_r beta11c_r = *beta11_r; \
                ctype_r beta11c_i = *beta11_i; \
                ctype_r rho11_r; \
                ctype_r rho11_i; \
\
                /* beta11 = beta11 - a12t * b21; */ \
                PASTEMAC(ch,set0ris)( rho11_r, \
                rho11_i ); \
                for ( l = 0; l < n_behind; ++l ) \
                { \
                    ctype* restrict alpha12_ri = a12t_ri + (l )*cs_a; \
                    ctype_r* restrict alpha12_r = &PASTEMAC(ch,real)( *alpha12_ri ); \
                    ctype_r* restrict alpha12_i = &PASTEMAC(ch,imag)( *alpha12_ri ); \
                    ctype_r* restrict beta21_r = b21_r + (l )*rs_b2; \
                    ctype_r* restrict beta21_i = b21_i + (l )*rs_b2; \
\
                    PASTEMAC(ch,axpyris)( *alpha12_r, \
                    *alpha12_i, \
                    *beta21_r, \
                    *beta21_i, \
                    rho11_r, \
                    rho11_i ); \
                } \
                PASTEMAC(ch,subris)( rho11_r, \
                rho11_i, \
                beta11c_r, \
                beta11c_i ); \
\
                /* beta11 = beta11 / alpha11; */ \
                /* NOTE: When preinversion is enabled, the INVERSE of alpha11 \
                   (1.0/alpha11) is stored during packing instead of alpha11 so we \
                   can multiply rather than divide. When preinversion is disabled, \
                   alpha11 is stored and division happens below explicitly. */ \
                PASTEMAC(ch,diagop)( *alpha11_r, \
                *alpha11_i, \
                beta11c_r, \
                beta11c_i ); \
\
                /* Output final result to matrix c. */ \
                PASTEMAC(ch,sets)( beta11c_r, \
                beta11c_i, *gamma11 ); \
\
                /* Store the local values back to b11. */ \
                PASTEMAC(ch,copyris)( beta11c_r, \
                beta11c_i, \
                *beta11_r, \
                *beta11_i ); \
            } \
        } \
    } \
}

#ifdef BLIS_ENABLE_TRSM_PREINVERSION
INSERT_GENTFUNCCO_BASIC3( trsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris )
#else
INSERT_GENTFUNCCO_BASIC3( trsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris )
#endif
cython-blis-0.9.1/blis/about.py000066400000000000000000000010311427272030600163670ustar00rootroot00000000000000# Copyright ExplosionAI GmbH, released under BSD
# Package metadata kept in a single module so it has one source of truth.
# inspired from:
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

__name__ = "blis"
__version__ = "0.9.1"
__summary__ = (
    "The Blis BLAS-like linear algebra library, as a self-contained C-extension."
)
__uri__ = "https://github.com/explosion/cython-blis"
__author__ = "Explosion"
__email__ = "contact@explosion.ai"
__license__ = "BSD"
__title__ = "blis"
__release__ = True
cython-blis-0.9.1/blis/benchmark.py000066400000000000000000000051551427272030600172220ustar00rootroot00000000000000# Copyright ExplosionAI GmbH, released under BSD.
import numpy import numpy.random from blis.py import gemm, einsum from timeit import default_timer as timer numpy.random.seed(0) def create_data(nO, nI, batch_size): X = numpy.zeros((batch_size, nI), dtype="f") X += numpy.random.uniform(-1.0, 1.0, X.shape) W = numpy.zeros((nO, nI), dtype="f") W += numpy.random.uniform(-1.0, 1.0, W.shape) return X, W def get_numpy_blas(): blas_libs = numpy.__config__.blas_ilp64_opt_info["libraries"] return blas_libs[0] def numpy_gemm(X, W, n=1000): nO, nI = W.shape batch_size = X.shape[0] total = 0.0 y = numpy.zeros((batch_size, nO), dtype="f") for i in range(n): numpy.dot(X, W, out=y) total += y.sum() y.fill(0) print("Total:", total) def blis_gemm(X, W, n=1000): nO, nI = W.shape batch_size = X.shape[0] total = 0.0 y = numpy.zeros((batch_size, nO), dtype="f") for i in range(n): gemm(X, W, out=y) total += y.sum() y.fill(0.0) print("Total:", total) def numpy_einsum(X, W, n=1000): nO, nI = W.shape batch_size = X.shape[0] total = 0.0 y = numpy.zeros((nO, batch_size), dtype="f") for i in range(n): numpy.einsum("ab,cb->ca", X, W, out=y) total += y.sum() y.fill(0.0) print("Total:", total) def blis_einsum(X, W, n=1000): nO, nI = W.shape batch_size = X.shape[0] total = 0.0 y = numpy.zeros((nO, batch_size), dtype="f") for i in range(n): einsum("ab,cb->ca", X, W, out=y) total += y.sum() y.fill(0.0) print("Total:", total) def main(nI=128 * 3, nO=128 * 3, batch_size=2000): print( "Setting up data for gemm. 1000 iters, " "nO={nO} nI={nI} batch_size={batch_size}".format(**locals()) ) numpy_blas = get_numpy_blas() X1, W1 = create_data(nI, nO, batch_size) X2 = X1.copy() W2 = W1.copy() print("Blis gemm...") start = timer() blis_gemm(X2, W2, n=1000) end = timer() blis_time = end - start print("%.2f seconds" % blis_time) print("Numpy (%s) gemm..." 
% numpy_blas) start = timer() numpy_gemm(X1, W1) end = timer() numpy_time = end - start print("%.2f seconds" % numpy_time) print("Blis einsum ab,cb->ca") start = timer() blis_einsum(X2, W2, n=1000) end = timer() blis_time = end - start print("%.2f seconds" % blis_time) print("Numpy (%s) einsum ab,cb->ca" % numpy_blas) start = timer() numpy_einsum(X2, W2) end = timer() numpy_time = end - start print("%.2f seconds" % numpy_time) if __name__: main() cython-blis-0.9.1/blis/cy.pxd000066400000000000000000000070271427272030600160460ustar00rootroot00000000000000# Copyright ExplsionAI GmbH, released under BSD. from cython cimport view from libc.stdint cimport int64_t ctypedef float[::1] float1d_t ctypedef double[::1] double1d_t ctypedef float[:, ::1] float2d_t ctypedef double[:, ::1] double2d_t ctypedef float* floats_t ctypedef double* doubles_t ctypedef const float[::1] const_float1d_t ctypedef const double[::1] const_double1d_t ctypedef const float[:, ::1] const_float2d_t ctypedef const double[:, ::1] const_double2d_t ctypedef const float* const_floats_t ctypedef const double* const_doubles_t cdef fused reals_ft: floats_t doubles_t float1d_t double1d_t cdef fused const_reals_ft: const_floats_t const_doubles_t const_float1d_t const_double1d_t cdef fused reals1d_ft: float1d_t double1d_t cdef fused const_reals1d_ft: const_float1d_t const_double1d_t cdef fused reals2d_ft: float2d_t double2d_t cdef fused const_reals2d_ft: const_float2d_t const_double2d_t cdef fused real_ft: float double ctypedef int64_t dim_t ctypedef int64_t inc_t ctypedef int64_t doff_t # Sucks to set these from magic numbers, but it's better than dragging # the header into our header. # We get some piece of mind from checking the values on init. 
# The numeric values of the enums below are hard-coded rather than taken
# from blis.h; init() in cy.pyx asserts that each one equals the matching
# BLIS_* constant.

# Transposition / conjugation applied to a matrix operand.
cpdef enum trans_t:
    NO_TRANSPOSE = 0
    TRANSPOSE = 8
    CONJ_NO_TRANSPOSE = 16
    CONJ_TRANSPOSE = 24


# Conjugation applied to a vector or scalar operand.
cpdef enum conj_t:
    NO_CONJUGATE = 0
    CONJUGATE = 16


cpdef enum side_t:
    LEFT = 0
    RIGHT = 1


cpdef enum uplo_t:
    LOWER = 192
    UPPER = 96
    DENSE = 224


cpdef enum diag_t:
    NONUNIT_DIAG = 0
    UNIT_DIAG = 256


# Routines taking a reals_ft operand accept raw float*/double* pointers or
# contiguous 1-D float/double memoryviews. rs*/cs* are row and column
# strides, inc* are vector increments. Note that gemm and ger take their
# scalars as double, while gemv/axpyv/scalv take them as real_ft.
cdef void gemm(
    trans_t transa,
    trans_t transb,
    dim_t m,
    dim_t n,
    dim_t k,
    double alpha,
    reals_ft a, inc_t rsa, inc_t csa,
    reals_ft b, inc_t rsb, inc_t csb,
    double beta,
    reals_ft c, inc_t rsc, inc_t csc,
) nogil


cdef void ger(
    conj_t conjx,
    conj_t conjy,
    dim_t m,
    dim_t n,
    double alpha,
    reals_ft x, inc_t incx,
    reals_ft y, inc_t incy,
    reals_ft a, inc_t rsa, inc_t csa
) nogil


cdef void gemv(
    trans_t transa,
    conj_t conjx,
    dim_t m,
    dim_t n,
    real_ft alpha,
    reals_ft a, inc_t rsa, inc_t csa,
    reals_ft x, inc_t incx,
    real_ft beta,
    reals_ft y, inc_t incy
) nogil


cdef void axpyv(
    conj_t conjx,
    dim_t m,
    real_ft alpha,
    reals_ft x, inc_t incx,
    reals_ft y, inc_t incy
) nogil


cdef void scalv(
    conj_t conjalpha,
    dim_t m,
    real_ft alpha,
    reals_ft x, inc_t incx
) nogil


# NOTE(review): dotv lists both vectors before both increments, unlike the
# (x, incx, y, incy) order used by the other routines here; keep call sites
# in sync with this order.
cdef double dotv(
    conj_t conjx,
    conj_t conjy,
    dim_t m,
    reals_ft x,
    reals_ft y,
    inc_t incx,
    inc_t incy,
) nogil


cdef double norm_L1(
    dim_t n,
    reals_ft x, inc_t incx
) nogil


cdef double norm_L2(
    dim_t n,
    reals_ft x, inc_t incx
) nogil


cdef double norm_inf(
    dim_t n,
    reals_ft x, inc_t incx
) nogil


cdef void randv(
    dim_t m,
    reals_ft x, inc_t incx
) nogil


# Pointer-based entry points with plain int dimensions and leading
# dimensions (lda/ldb/ldc) instead of separate row/column strides.
cdef void dgemm(bint transA, bint transB, int M, int N, int K,
                double alpha, const double* A, int lda, const double* B,
                int ldb, double beta, double* C, int ldc) nogil


cdef void sgemm(bint transA, bint transB, int M, int N, int K,
                float alpha, const float* A, int lda, const float* B,
                int ldb, float beta, float* C, int ldc) nogil


cdef void daxpy(int N, double alpha, const double* X, int incX,
                double* Y, int incY) nogil


cdef void saxpy(int N, float alpha, const float* X, int incX,
                float* Y, int incY) nogil
cython-blis-0.9.1/blis/cy.pyx000066400000000000000000000410261427272030600160700ustar00rootroot00000000000000# cython: infer_types=True
# cython: boundscheck=False
# Copyright ExplosionAI GmbH, released under BSD.
import atexit


# Declarations of the parts of the BLIS C API used by this module. The
# enum, struct and function names on the right-hand side of each cname
# string are the names as they appear in blis.h.
cdef extern from "blis.h" nogil:
    enum blis_err_t "err_t":
        pass

    cdef struct blis_cntx_t "cntx_t":
        pass

    cdef struct blis_rntm_t "rntm_s":
        pass

    ctypedef enum blis_trans_t "trans_t":
        BLIS_NO_TRANSPOSE
        BLIS_TRANSPOSE
        BLIS_CONJ_NO_TRANSPOSE
        BLIS_CONJ_TRANSPOSE

    ctypedef enum blis_conj_t "conj_t":
        BLIS_NO_CONJUGATE
        BLIS_CONJUGATE

    ctypedef enum blis_side_t "side_t":
        BLIS_LEFT
        BLIS_RIGHT

    ctypedef enum blis_uplo_t "uplo_t":
        BLIS_LOWER
        BLIS_UPPER
        BLIS_DENSE

    ctypedef enum blis_diag_t "diag_t":
        BLIS_NONUNIT_DIAG
        BLIS_UNIT_DIAG

    char* bli_info_get_int_type_size_str()

    blis_err_t bli_init()
    blis_err_t bli_finalize()

    blis_err_t bli_rntm_init(blis_rntm_t* rntm)

    # Level 3: matrix-matrix multiply
    void bli_dgemm_ex(
        blis_trans_t transa,
        blis_trans_t transb,
        dim_t m,
        dim_t n,
        dim_t k,
        double* alpha,
        double* a, inc_t rsa, inc_t csa,
        double* b, inc_t rsb, inc_t csb,
        double* beta,
        double* c, inc_t rsc, inc_t csc,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_sgemm_ex(
        blis_trans_t transa,
        blis_trans_t transb,
        dim_t m,
        dim_t n,
        dim_t k,
        float* alpha,
        float* a, inc_t rsa, inc_t csa,
        float* b, inc_t rsb, inc_t csb,
        float* beta,
        float* c, inc_t rsc, inc_t csc,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    # Level 2: rank-1 update and matrix-vector multiply
    void bli_dger_ex(
        blis_conj_t conjx,
        blis_conj_t conjy,
        dim_t m,
        dim_t n,
        double* alpha,
        double* x, inc_t incx,
        double* y, inc_t incy,
        double* a, inc_t rsa, inc_t csa,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_sger_ex(
        blis_conj_t conjx,
        blis_conj_t conjy,
        dim_t m,
        dim_t n,
        float* alpha,
        float* x, inc_t incx,
        float* y, inc_t incy,
        float* a, inc_t rsa, inc_t csa,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_dgemv_ex(
        blis_trans_t transa,
        blis_conj_t conjx,
        dim_t m,
        dim_t n,
        double* alpha,
        double* a, inc_t rsa, inc_t csa,
        double* x, inc_t incx,
        double* beta,
        double* y, inc_t incy,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_sgemv_ex(
        blis_trans_t transa,
        blis_conj_t conjx,
        dim_t m,
        dim_t n,
        float* alpha,
        float* a, inc_t rsa, inc_t csa,
        float* x, inc_t incx,
        float* beta,
        float* y, inc_t incy,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    # Level 1: vector operations
    void bli_daxpyv_ex(
        blis_conj_t conjx,
        dim_t m,
        double* alpha,
        double* x, inc_t incx,
        double* y, inc_t incy,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_saxpyv_ex(
        blis_conj_t conjx,
        dim_t m,
        float* alpha,
        float* x, inc_t incx,
        float* y, inc_t incy,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_dscalv_ex(
        blis_conj_t conjalpha,
        dim_t m,
        double* alpha,
        double* x, inc_t incx,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_sscalv_ex(
        blis_conj_t conjalpha,
        dim_t m,
        float* alpha,
        float* x, inc_t incx,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_ddotv_ex(
        blis_conj_t conjx,
        blis_conj_t conjy,
        dim_t m,
        double* x, inc_t incx,
        double* y, inc_t incy,
        double* rho,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_sdotv_ex(
        blis_conj_t conjx,
        blis_conj_t conjy,
        dim_t m,
        float* x, inc_t incx,
        float* y, inc_t incy,
        float* rho,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_snorm1v_ex(
        dim_t n,
        float* x, inc_t incx,
        float* norm,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_dnorm1v_ex(
        dim_t n,
        double* x, inc_t incx,
        double* norm,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_snormfv_ex(
        dim_t n,
        float* x, inc_t incx,
        float* norm,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_dnormfv_ex(
        dim_t n,
        double* x, inc_t incx,
        double* norm,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_snormiv_ex(
        dim_t n,
        float* x, inc_t incx,
        float* norm,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_dnormiv_ex(
        dim_t n,
        double* x, inc_t incx,
        double* norm,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_srandv_ex(
        dim_t m,
        float* x, inc_t incx,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_drandv_ex(
        dim_t m,
        double* x, inc_t incx,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    )

    void bli_ssumsqv_ex(
        dim_t m,
        float* x, inc_t incx,
        float* scale,
        float* sumsq,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    ) nogil

    void bli_dsumsqv_ex(
        dim_t m,
        double* x, inc_t incx,
        double* scale,
        double* sumsq,
        blis_cntx_t* cntx,
        blis_rntm_t* rntm,
    ) nogil


# BLIS is initialised as soon as the module is imported.
bli_init()

# Module-wide runtime object; its address is passed to every BLIS call below.
cdef blis_rntm_t rntm


def init():
    """Initialise BLIS and the shared runtime object, then sanity-check enums.

    The enum values declared in cy.pxd are hard-coded, so each one is
    compared against the corresponding constant from blis.h. An
    ``AssertionError`` is raised on any mismatch.
    """
    bli_init()
    bli_rntm_init(&rntm)
    assert BLIS_NO_TRANSPOSE == NO_TRANSPOSE
    assert BLIS_TRANSPOSE == TRANSPOSE
    assert BLIS_CONJ_NO_TRANSPOSE == CONJ_NO_TRANSPOSE
    assert BLIS_CONJ_TRANSPOSE == CONJ_TRANSPOSE
    assert BLIS_NO_CONJUGATE == NO_CONJUGATE
    assert BLIS_CONJUGATE == CONJUGATE
    assert BLIS_LEFT == LEFT
    assert BLIS_RIGHT == RIGHT
    assert BLIS_LOWER == LOWER
    assert BLIS_UPPER == UPPER
    assert BLIS_DENSE == DENSE
    assert BLIS_NONUNIT_DIAG == NONUNIT_DIAG
    assert BLIS_UNIT_DIAG == UNIT_DIAG


def get_int_type_size():
    """Return the string reported by ``bli_info_get_int_type_size_str()``.

    The C function returns a NUL-terminated string, so the whole string is
    decoded. Formatting only ``int_size[0]`` with ``%d`` would yield the
    character code of its first byte rather than the text itself.
    """
    cdef char* int_size = bli_info_get_int_type_size_str()
    return int_size.decode("ascii")


# Level 3: matrix-matrix multiply
cdef void gemm(
    trans_t trans_a,
    trans_t trans_b,
    dim_t m,
    dim_t n,
    dim_t k,
    double alpha,
    reals_ft a, inc_t rsa, inc_t csa,
    reals_ft b, inc_t rsb, inc_t csb,
    double beta,
    reals_ft c, inc_t rsc, inc_t csc
) nogil:
    # Dispatch to bli_sgemm_ex / bli_dgemm_ex according to the element type
    # of the fused operand type. BLIS takes its scalars by address, so
    # alpha and beta are first copied into locals of the matching precision.
    cdef float alpha_f = alpha
    cdef float beta_f = beta
    cdef double alpha_d = alpha
    cdef double beta_d = beta
    if reals_ft is floats_t:
        bli_sgemm_ex(
            trans_a, trans_b,
            m, n, k,
            &alpha_f, a, rsa, csa, b, rsb, csb, &beta_f, c, rsc, csc,
            NULL, &rntm)
    elif reals_ft is doubles_t:
        bli_dgemm_ex(
            trans_a, trans_b,
            m, n, k,
            &alpha_d, a, rsa, csa, b, rsb, csb, &beta_d, c, rsc, csc,
            NULL, &rntm)
    elif reals_ft is float1d_t:
        # Memoryview operands are passed as a pointer to their first element.
        bli_sgemm_ex(
            trans_a, trans_b,
            m, n, k,
            &alpha_f, &a[0], rsa, csa, &b[0], rsb, csb, &beta_f, &c[0],
            rsc, csc, NULL, &rntm)
    elif reals_ft is double1d_t:
        bli_dgemm_ex(
            trans_a, trans_b,
            m, n, k,
            &alpha_d, &a[0], rsa, csa, &b[0], rsb, csb, &beta_d, &c[0],
            rsc, csc, NULL, &rntm)
    else:
        # Impossible --- panic?
        pass


cdef void ger(
    conj_t conjx,
    conj_t conjy,
    dim_t m,
    dim_t n,
    double alpha,
    reals_ft x, inc_t incx,
    reals_ft y, inc_t incy,
    reals_ft a, inc_t rsa, inc_t csa
) nogil:
    # Dispatch to bli_sger_ex / bli_dger_ex according to the element type
    # of the fused operand type; alpha is passed to BLIS by address.
    cdef float alpha_f = alpha
    cdef double alpha_d = alpha
    if reals_ft is floats_t:
        bli_sger_ex(
            conjx, conjy,
            m, n,
            &alpha_f, x, incx, y, incy, a, rsa, csa,
            NULL, &rntm)
    elif reals_ft is doubles_t:
        bli_dger_ex(
            conjx, conjy,
            m, n,
            &alpha_d, x, incx, y, incy, a, rsa, csa,
            NULL, &rntm)
    elif reals_ft is float1d_t:
        bli_sger_ex(
            conjx, conjy,
            m, n,
            &alpha_f, &x[0], incx, &y[0], incy, &a[0], rsa, csa,
            NULL, &rntm)
    elif reals_ft is double1d_t:
        bli_dger_ex(
            conjx, conjy,
            m, n,
            &alpha_d, &x[0], incx, &y[0], incy, &a[0], rsa, csa,
            NULL, &rntm)
    else:
        # Impossible --- panic?
        pass


cdef void gemv(
    trans_t transa,
    conj_t conjx,
    dim_t m,
    dim_t n,
    real_ft alpha,
    reals_ft a, inc_t rsa, inc_t csa,
    reals_ft x, inc_t incx,
    real_ft beta,
    reals_ft y, inc_t incy
) nogil:
    # Dispatch to bli_sgemv_ex / bli_dgemv_ex according to the element type
    # of the fused operand type. The beta locals must be taken from beta:
    # initialising them from alpha would make BLIS scale y by alpha and
    # silently ignore the caller's beta.
    cdef float alpha_f = alpha
    cdef double alpha_d = alpha
    cdef float beta_f = beta
    cdef double beta_d = beta
    if reals_ft is floats_t:
        bli_sgemv_ex(
            transa, conjx,
            m, n,
            &alpha_f, a, rsa, csa,
            x, incx, &beta_f,
            y, incy, NULL, &rntm)
    elif reals_ft is doubles_t:
        bli_dgemv_ex(
            transa, conjx,
            m, n,
            &alpha_d, a, rsa, csa,
            x, incx, &beta_d,
            y, incy, NULL, &rntm)
    elif reals_ft is float1d_t:
        bli_sgemv_ex(
            transa, conjx,
            m, n,
            &alpha_f, &a[0], rsa, csa,
            &x[0], incx, &beta_f,
            &y[0], incy, NULL, &rntm)
    elif reals_ft is double1d_t:
        bli_dgemv_ex(
            transa, conjx,
            m, n,
            &alpha_d, &a[0], rsa, csa,
            &x[0], incx, &beta_d,
            &y[0], incy, NULL, &rntm)
    else:
        # Impossible --- panic?
pass cdef void axpyv( conj_t conjx, dim_t m, real_ft alpha, reals_ft x, inc_t incx, reals_ft y, inc_t incy ) nogil: cdef float alpha_f = alpha cdef double alpha_d = alpha if reals_ft is floats_t: bli_saxpyv_ex(conjx, m, &alpha_f, x, incx, y, incy, NULL, &rntm) elif reals_ft is doubles_t: bli_daxpyv_ex(conjx, m, &alpha_d, x, incx, y, incy, NULL, &rntm) elif reals_ft is float1d_t: bli_saxpyv_ex(conjx, m, &alpha_f, &x[0], incx, &y[0], incy, NULL, &rntm) elif reals_ft is double1d_t: bli_daxpyv_ex(conjx, m, &alpha_d, &x[0], incx, &y[0], incy, NULL, &rntm) else: # Impossible --- panic? pass cdef void scalv( conj_t conjalpha, dim_t m, real_ft alpha, reals_ft x, inc_t incx ) nogil: cdef float alpha_f = alpha cdef double alpha_d = alpha if reals_ft is floats_t: bli_sscalv_ex(conjalpha, m, &alpha_f, x, incx, NULL, &rntm) elif reals_ft is doubles_t: bli_dscalv_ex(conjalpha, m, &alpha_d, x, incx, NULL, &rntm) elif reals_ft is float1d_t: bli_sscalv_ex(conjalpha, m, &alpha_f, &x[0], incx, NULL, &rntm) elif reals_ft is double1d_t: bli_dscalv_ex(conjalpha, m, &alpha_d, &x[0], incx, NULL, &rntm) else: # Impossible --- panic? pass cdef double norm_L1( dim_t n, reals_ft x, inc_t incx ) nogil: cdef double dnorm = 0 cdef float snorm = 0 if reals_ft is floats_t: bli_snorm1v_ex(n, x, incx, &snorm, NULL, &rntm) dnorm = snorm elif reals_ft is doubles_t: bli_dnorm1v_ex(n, x, incx, &dnorm, NULL, &rntm) elif reals_ft is float1d_t: bli_snorm1v_ex(n, &x[0], incx, &snorm, NULL, &rntm) dnorm = snorm elif reals_ft is double1d_t: bli_dnorm1v_ex(n, &x[0], incx, &dnorm, NULL, &rntm) else: # Impossible --- panic? 
pass return dnorm cdef double norm_L2( dim_t n, reals_ft x, inc_t incx ) nogil: cdef double dnorm = 0 cdef float snorm = 0 if reals_ft is floats_t: bli_snormfv_ex(n, x, incx, &snorm, NULL, &rntm) dnorm = snorm elif reals_ft is doubles_t: bli_dnormfv_ex(n, x, incx, &dnorm, NULL, &rntm) elif reals_ft is float1d_t: bli_snormfv_ex(n, &x[0], incx, &snorm, NULL, &rntm) dnorm = snorm elif reals_ft is double1d_t: bli_dnormfv_ex(n, &x[0], incx, &dnorm, NULL, &rntm) else: # Impossible --- panic? pass return dnorm cdef double norm_inf( dim_t n, reals_ft x, inc_t incx ) nogil: cdef double dnorm = 0 cdef float snorm = 0 if reals_ft is floats_t: bli_snormiv_ex(n, x, incx, &snorm, NULL, &rntm) dnorm = snorm elif reals_ft is doubles_t: bli_dnormiv_ex(n, x, incx, &dnorm, NULL, &rntm) elif reals_ft is float1d_t: bli_snormiv_ex(n, &x[0], incx, &snorm, NULL, &rntm) dnorm = snorm elif reals_ft is double1d_t: bli_dnormiv_ex(n, &x[0], incx, &dnorm, NULL, &rntm) else: # Impossible --- panic? pass return dnorm cdef double dotv( conj_t conjx, conj_t conjy, dim_t m, reals_ft x, reals_ft y, inc_t incx, inc_t incy, ) nogil: cdef double rho_d = 0.0 cdef float rho_f = 0.0 if reals_ft is floats_t: bli_sdotv_ex(conjx, conjy, m, x, incx, y, incy, &rho_f, NULL, &rntm) return rho_f elif reals_ft is doubles_t: bli_ddotv_ex(conjx, conjy, m, x, incx, y, incy, &rho_d, NULL, &rntm) return rho_d elif reals_ft is float1d_t: bli_sdotv_ex(conjx, conjy, m, &x[0], incx, &y[0], incy, &rho_f, NULL, &rntm) return rho_f elif reals_ft is double1d_t: bli_ddotv_ex(conjx, conjy, m, &x[0], incx, &y[0], incy, &rho_d, NULL, &rntm) return rho_d else: raise ValueError("Unhandled fused type") cdef void randv(dim_t m, reals_ft x, inc_t incx) nogil: if reals_ft is floats_t: bli_srandv_ex(m, x, incx, NULL, &rntm) elif reals_ft is float1d_t: bli_srandv_ex(m, &x[0], incx, NULL, &rntm) if reals_ft is doubles_t: bli_drandv_ex(m, x, incx, NULL, &rntm) elif reals_ft is double1d_t: bli_drandv_ex(m, &x[0], incx, NULL, &rntm) else: with 
gil: raise ValueError("Unhandled fused type") cdef void sumsqv(dim_t m, reals_ft x, inc_t incx, reals_ft scale, reals_ft sumsq) nogil: if reals_ft is floats_t: bli_ssumsqv_ex(m, &x[0], incx, scale, sumsq, NULL, &rntm) elif reals_ft is float1d_t: bli_ssumsqv_ex(m, &x[0], incx, &scale[0], &sumsq[0], NULL, &rntm) if reals_ft is doubles_t: bli_dsumsqv_ex(m, x, incx, scale, sumsq, NULL, &rntm) elif reals_ft is double1d_t: bli_dsumsqv_ex(m, &x[0], incx, &scale[0], &sumsq[0], NULL, &rntm) else: with gil: raise ValueError("Unhandled fused type") cdef void dgemm(bint transA, bint transB, int M, int N, int K, double alpha, const double* A, int lda, const double* B, int ldb, double beta, double* C, int ldc) nogil: gemm( TRANSPOSE if transA else NO_TRANSPOSE, TRANSPOSE if transB else NO_TRANSPOSE, M, N, K, alpha, A, lda, 1, B, ldb, 1, beta, C, ldc, 1 ) cdef void sgemm(bint transA, bint transB, int M, int N, int K, float alpha, const float* A, int lda, const float* B, int ldb, float beta, float* C, int ldc) nogil: gemm( TRANSPOSE if transA else NO_TRANSPOSE, TRANSPOSE if transB else NO_TRANSPOSE, M, N, K, alpha, A, lda, 1, B, ldb, 1, beta, C, ldc, 1 ) cdef void saxpy(int N, float alpha, const float* X, int incX, float* Y, int incY) nogil: axpyv(NO_CONJUGATE, N, alpha, X, incX, Y, incY) cdef void daxpy(int N, double alpha, const double* X, int incX, double* Y, int incY) nogil: axpyv(NO_CONJUGATE, N, alpha, X, incX, Y, incY) @atexit.register def finalize(): bli_finalize() cython-blis-0.9.1/blis/py.pyx000066400000000000000000000156171427272030600161140ustar00rootroot00000000000000# cython: boundscheck=False # Copyright ExplsionAI GmbH, released under BSD. cimport numpy as np from . 
cimport cy from .cy cimport reals1d_ft, reals2d_ft, float1d_t, float2d_t from .cy cimport const_reals1d_ft, const_reals2d_ft, const_float1d_t, const_float2d_t from .cy cimport const_double1d_t, const_double2d_t import numpy def axpy(const_reals1d_ft A, double scale=1., np.ndarray out=None): if const_reals1d_ft is const_float1d_t: if out is None: out = numpy.zeros((A.shape[0],), dtype='f') B = out.data return out elif const_reals1d_ft is const_double1d_t: if out is None: out = numpy.zeros((A.shape[0],), dtype='d') B = out.data with nogil: cy.axpyv(cy.NO_CONJUGATE, A.shape[0], scale, &A[0], 1, B, 1) return out else: B = NULL raise TypeError("Unhandled fused type") def batch_axpy(reals2d_ft A, reals1d_ft B, np.ndarray out=None): pass def ger(const_reals2d_ft A, const_reals1d_ft B, double scale=1., np.ndarray out=None): if const_reals2d_ft is const_float2d_t and const_reals1d_ft is const_float1d_t: if out is None: out = numpy.zeros((A.shape[0], B.shape[0]), dtype='f') with nogil: cy.ger( cy.NO_CONJUGATE, cy.NO_CONJUGATE, A.shape[0], B.shape[0], scale, &A[0,0], 1, &B[0], 1, out.data, out.shape[1], 1) return out elif const_reals2d_ft is const_double2d_t and const_reals1d_ft is const_double1d_t: if out is None: out = numpy.zeros((A.shape[0], B.shape[0]), dtype='d') with nogil: cy.ger( cy.NO_CONJUGATE, cy.NO_CONJUGATE, A.shape[0], B.shape[0], scale, &A[0,0], 1, &B[0], 1, out.data, out.shape[1], 1) return out else: C = NULL raise TypeError("Unhandled fused type") def gemm(const_reals2d_ft A, const_reals2d_ft B, np.ndarray out=None, bint trans1=False, bint trans2=False, double alpha=1., double beta=1.): cdef cy.dim_t nM = A.shape[0] if not trans1 else A.shape[1] cdef cy.dim_t nK = A.shape[1] if not trans1 else A.shape[0] cdef cy.dim_t nK_b = B.shape[0] if not trans2 else B.shape[1] cdef cy.dim_t nN = B.shape[1] if not trans2 else B.shape[0] if nK != nK_b: msg = "Shape mismatch for blis.gemm: (%d, %d), (%d, %d)" raise ValueError(msg % (nM, nK, nK_b, nN)) if const_reals2d_ft 
is const_float2d_t: if out is None: if beta == 0.: out = numpy.empty((nM, nN), dtype='f') else: out = numpy.zeros((nM, nN), dtype='f') C = out.data with nogil: cy.gemm( cy.TRANSPOSE if trans1 else cy.NO_TRANSPOSE, cy.TRANSPOSE if trans2 else cy.NO_TRANSPOSE, nM, nN, nK, alpha, &A[0,0], A.shape[1], 1, &B[0,0], B.shape[1], 1, beta, C, out.shape[1], 1) return out elif const_reals2d_ft is const_double2d_t: if out is None: out = numpy.zeros((A.shape[0], B.shape[1]), dtype='d') C = out.data with nogil: cy.gemm( cy.TRANSPOSE if trans1 else cy.NO_TRANSPOSE, cy.TRANSPOSE if trans2 else cy.NO_TRANSPOSE, A.shape[0], B.shape[1], A.shape[1], alpha, &A[0,0], A.shape[1], 1, &B[0,0], B.shape[1], 1, beta, C, out.shape[1], 1) return out else: C = NULL raise TypeError("Unhandled fused type") def gemv(const_reals2d_ft A, const_reals1d_ft B, bint trans1=False, double alpha=1., double beta=1., np.ndarray out=None): if const_reals1d_ft is const_float1d_t and const_reals2d_ft is const_float2d_t: if out is None: out = numpy.zeros((A.shape[0],), dtype='f') with nogil: cy.gemv( cy.TRANSPOSE if trans1 else cy.NO_TRANSPOSE, cy.NO_CONJUGATE, A.shape[0], A.shape[1], alpha, &A[0,0], A.shape[1], 1, &B[0], 1, beta, out.data, 1) return out elif const_reals1d_ft is const_double1d_t and const_reals2d_ft is const_double2d_t: if out is None: out = numpy.zeros((A.shape[0],), dtype='d') with nogil: cy.gemv( cy.TRANSPOSE if trans1 else cy.NO_TRANSPOSE, cy.NO_CONJUGATE, A.shape[0], A.shape[1], alpha, &A[0,0], A.shape[1], 1, &B[0], 1, beta, out.data, 1) return out else: raise TypeError("Unhandled fused type") def dotv(const_reals1d_ft X, const_reals1d_ft Y, bint conjX=False, bint conjY=False): if X.shape[0] != Y.shape[0]: msg = "Shape mismatch for blis.dotv: (%d,), (%d,)" raise ValueError(msg % (X.shape[0], Y.shape[0])) return cy.dotv( cy.CONJUGATE if conjX else cy.NO_CONJUGATE, cy.CONJUGATE if conjY else cy.NO_CONJUGATE, X.shape[0], &X[0], &Y[0], 1, 1 ) def einsum(todo, A, B, out=None): if todo == 'a,a->a': 
return axpy(A, B, out=out) elif todo == 'a,b->ab': return ger(A, B, out=out) elif todo == 'a,b->ba': return ger(B, A, out=out) elif todo == 'ab,a->ab': return batch_axpy(A, B, out=out) elif todo == 'ab,a->ba': return batch_axpy(A, B, trans1=True, out=out) elif todo == 'ab,b->a': return gemv(A, B, out=out) elif todo == 'ab,a->b': return gemv(A, B, trans1=True, out=out) # The rule here is, look at the first dimension of the output. That must # occur in arg1. Set trans1 if it's dimension 2. # E.g. bc is output, b occurs in ab, so that must be arg1. So we need # trans1=True, to make ba,ac->bc elif todo == 'ab,ac->bc': return gemm(A, B, trans1=True, trans2=False, out=out) elif todo == 'ab,ac->cb': return gemm(B, A, out=out, trans1=True, trans2=True) elif todo == 'ab,bc->ac': return gemm(A, B, out=out, trans1=False, trans2=False) elif todo == 'ab,bc->ca': return gemm(B, A, out=out, trans1=True, trans2=True) elif todo == 'ab,ca->bc': return gemm(A, B, out=out, trans1=True, trans2=True) elif todo == 'ab,ca->cb': return gemm(B, A, out=out, trans1=False, trans2=False) elif todo == 'ab,cb->ac': return gemm(A, B, out=out, trans1=False, trans2=True) elif todo == 'ab,cb->ca': return gemm(B, A, out=out, trans1=False, trans2=True) else: raise ValueError("Invalid einsum: %s" % todo) cython-blis-0.9.1/blis/tests/000077500000000000000000000000001427272030600160525ustar00rootroot00000000000000cython-blis-0.9.1/blis/tests/__init__.py000066400000000000000000000000001427272030600201510ustar00rootroot00000000000000cython-blis-0.9.1/blis/tests/common.py000066400000000000000000000050211427272030600177120ustar00rootroot00000000000000# Copyright ExplsionAI GmbH, released under BSD. 
from __future__ import print_function import numpy as np np.random.seed(0) from numpy.testing import assert_allclose from hypothesis import assume from hypothesis.strategies import tuples, integers, floats from hypothesis.extra.numpy import arrays def lengths(lo=1, hi=10): return integers(min_value=lo, max_value=hi) def shapes(min_rows=1, max_rows=100, min_cols=1, max_cols=100): return tuples(lengths(lo=min_rows, hi=max_rows), lengths(lo=min_cols, hi=max_cols)) def ndarrays_of_shape(shape, lo=-1000.0, hi=1000.0, dtype="float64"): width = 64 if dtype == "float64" else 32 return arrays( dtype, shape=shape, elements=floats(min_value=lo, max_value=hi, width=width) ) def ndarrays( min_len=0, max_len=10, min_val=-10000000.0, max_val=1000000.0, dtype="float64" ): return lengths(lo=min_len, hi=max_len).flatmap( lambda n: ndarrays_of_shape(n, lo=min_val, hi=max_val, dtype=dtype) ) def matrices( min_rows=1, max_rows=10, min_cols=1, max_cols=10, min_value=-10000000.0, max_value=1000000.0, dtype="float64", ): return shapes( min_rows=min_rows, max_rows=max_rows, min_cols=min_cols, max_cols=max_cols ).flatmap(lambda mn: ndarrays_of_shape(mn, lo=min_value, hi=max_value, dtype=dtype)) def positive_ndarrays(min_len=0, max_len=10, max_val=100000.0, dtype="float64"): return ndarrays( min_len=min_len, max_len=max_len, min_val=0, max_val=max_val, dtype=dtype ) def negative_ndarrays(min_len=0, max_len=10, min_val=-100000.0, dtype="float64"): return ndarrays( min_len=min_len, max_len=max_len, min_val=min_val, max_val=-1e-10, dtype=dtype ) def parse_layer(layer_data): # Get the first row, excluding the first column x = layer_data[0, 1:] # Get the first column, excluding the first row # .ascontiguousarray is support important here!!!! 
b = np.ascontiguousarray(layer_data[1:, 0], dtype="float64") # Slice out the row and the column used for the X and the bias W = layer_data[1:, 1:] assert x.ndim == 1 assert b.ndim == 1 assert b.shape[0] == W.shape[0] assert x.shape[0] == W.shape[1] assume(not np.isnan(W.sum())) assume(not np.isnan(x.sum())) assume(not np.isnan(b.sum())) assume(not any(np.isinf(val) for val in W.flatten())) assume(not any(np.isinf(val) for val in x)) assume(not any(np.isinf(val) for val in b)) return x, b, W def split_row(layer_data): return (layer_data[0, :], layer_data[:, :]) cython-blis-0.9.1/blis/tests/test_dotv.py000066400000000000000000000021561427272030600204430ustar00rootroot00000000000000# Copyright ExplosionAI GmbH, released under BSD. from __future__ import division from hypothesis import given, assume from blis.tests.common import * from blis.py import dotv @given( ndarrays(min_len=10, max_len=100, min_val=-100.0, max_val=100.0, dtype="float64"), ndarrays(min_len=10, max_len=100, min_val=-100.0, max_val=100.0, dtype="float64"), ) def test_memoryview_double_noconj(A, B): if len(A) < len(B): B = B[: len(A)] else: A = A[: len(B)] assume(A is not None) assume(B is not None) numpy_result = A.dot(B) result = dotv(A, B) assert_allclose([numpy_result], result, atol=1e-4, rtol=1e-4) @given( ndarrays(min_len=10, max_len=100, min_val=-100.0, max_val=100.0, dtype="float32"), ndarrays(min_len=10, max_len=100, min_val=-100.0, max_val=100.0, dtype="float32"), ) def test_memoryview_float_noconj(A, B): if len(A) < len(B): B = B[: len(A)] else: A = A[: len(B)] assume(A is not None) assume(B is not None) numpy_result = A.dot(B) result = dotv(A, B) assert_allclose([numpy_result], result, atol=1e-4, rtol=1e-3) cython-blis-0.9.1/blis/tests/test_gemm.py000066400000000000000000000046751427272030600204240ustar00rootroot00000000000000# Copyright ExplosionAI GmbH, released under BSD. 
from __future__ import division from hypothesis import given, assume from math import sqrt, floor from blis.tests.common import * from blis.py import gemm def _stretch_matrix(data, m, n): orig_len = len(data) orig_m = m orig_n = n ratio = sqrt(len(data) / (m * n)) m = int(floor(m * ratio)) n = int(floor(n * ratio)) data = np.ascontiguousarray(data[: m * n], dtype=data.dtype) return data.reshape((m, n)), m, n def _reshape_for_gemm( A, B, a_rows, a_cols, out_cols, dtype, trans_a=False, trans_b=False ): A, a_rows, a_cols = _stretch_matrix(A, a_rows, a_cols) if len(B) < a_cols or a_cols < 1: return (None, None, None) b_cols = int(floor(len(B) / a_cols)) B = np.ascontiguousarray(B.flatten()[: a_cols * b_cols], dtype=dtype) B = B.reshape((a_cols, b_cols)) out_cols = B.shape[1] C = np.zeros(shape=(A.shape[0], B.shape[1]), dtype=dtype) if trans_a: A = np.ascontiguousarray(A.T, dtype=dtype) return A, B, C @given( ndarrays(min_len=10, max_len=100, min_val=-100.0, max_val=100.0, dtype="float64"), ndarrays(min_len=10, max_len=100, min_val=-100.0, max_val=100.0, dtype="float64"), integers(min_value=2, max_value=1000), integers(min_value=2, max_value=1000), integers(min_value=2, max_value=1000), ) def test_memoryview_double_notrans(A, B, a_rows, a_cols, out_cols): A, B, C = _reshape_for_gemm(A, B, a_rows, a_cols, out_cols, "float64") assume(A is not None) assume(B is not None) assume(C is not None) assume(A.size >= 1) assume(B.size >= 1) assume(C.size >= 1) gemm(A, B, out=C) numpy_result = A.dot(B) assert_allclose(numpy_result, C, atol=1e-4, rtol=1e-4) @given( ndarrays(min_len=10, max_len=100, min_val=-100.0, max_val=100.0, dtype="float32"), ndarrays(min_len=10, max_len=100, min_val=-100.0, max_val=100.0, dtype="float32"), integers(min_value=2, max_value=1000), integers(min_value=2, max_value=1000), integers(min_value=2, max_value=1000), ) def test_memoryview_float_notrans(A, B, a_rows, a_cols, out_cols): A, B, C = _reshape_for_gemm(A, B, a_rows, a_cols, out_cols, 
dtype="float32") assume(A is not None) assume(B is not None) assume(C is not None) assume(A.size >= 1) assume(B.size >= 1) assume(C.size >= 1) gemm(A, B, out=C) numpy_result = A.dot(B) assert_allclose(numpy_result, C, atol=1e-3, rtol=1e-3) cython-blis-0.9.1/build-constraints.txt000066400000000000000000000006771427272030600201760ustar00rootroot00000000000000# build version constraints for use with wheelwright + multibuild numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64' numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64' numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64' numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64' numpy==1.19.3; python_version=='3.9' numpy==1.21.3; python_version=='3.10' numpy; python_version>='3.11' cython-blis-0.9.1/dev-requirements.txt000066400000000000000000000000371427272030600200170ustar00rootroot00000000000000pytest cython hypothesis wheel cython-blis-0.9.1/extra-include/000077500000000000000000000000001427272030600165235ustar00rootroot00000000000000cython-blis-0.9.1/extra-include/configure000077500000000000000000002624651427272030600204510ustar00rootroot00000000000000#!/usr/bin/env bash # # BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # # Copyright (C) 2014, The University of Texas at Austin # Copyright (C) 2018, Advanced Micro Devices, Inc. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # - Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # - Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
# - Neither the name of The University of Texas at Austin nor the names # of its contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # # # -- Helper functions ---------------------------------------------------------- # print_usage() { # Use the version string in the 'version' file since we don't have # the patched version string yet. if [ -z "${version}" ]; then version=$(cat "${version_filepath}") fi # Echo usage info. echo " " echo " ${script_name} (BLIS ${version})" #echo " " #echo " BLIS ${version}" echo " " echo " Field G. Van Zee" echo " " echo " Configure BLIS's build system for compilation using a specified" echo " configuration directory." echo " " echo " Usage:" echo " " echo " ${script_name} [options] [env. vars.] confname" echo " " echo " Arguments:" echo " " echo " confname The name of the sub-directory inside of the 'config'" echo " directory containing the desired BLIS configuration." echo " Note that confname MUST be specified; if it is not," echo " configure will complain. 
To build a completely generic" echo " implementation, use the 'generic' configuration" echo " " echo " Options:" echo " " echo " -p PREFIX, --prefix=PREFIX" echo " " echo " The path to which make will install all build products." echo " If given, this option implies the following options:" echo " --libdir=PREFIX/lib" echo " --incdir=PREFIX/include" echo " --sharedir=PREFIX/share" echo " If not given, PREFIX defaults to \$(HOME)/blis. If PREFIX" echo " refers to a directory that does not exist, it will be" echo " created." echo " " echo " --libdir=LIBDIR" echo " " echo " The path to which make will install libraries. If given," echo " LIBDIR will override the corresponding directory implied" echo " by --prefix; if not not given, LIBDIR defaults to" echo " PREFIX/lib. If LIBDIR refers to a directory that does" echo " not exist, it will be created." echo " " echo " --includedir=INCDIR" echo " " echo " The path to which make will install development header" echo " files. If given, INCDIR will override the corresponding" echo " directory implied by --prefix; if not given, INCDIR" echo " defaults to PREFIX/include. If INCDIR refers to a" echo " directory that does not exist, it will be created." echo " " echo " --sharedir=SHAREDIR" echo " " echo " The path to which make will makefile fragments containing" echo " make variables determined by configure (e.g. CC, CFLAGS," echo " and LDFLAGS). These files allow certain BLIS makefiles," echo " such as those in the examples or testsuite directories, to" echo " operate on an installed copy of BLIS rather than a local" echo " (and possibly uninstalled) copy. If given, SHAREDIR will" echo " override the corresponding directory implied by --prefix;" echo " if not given, SHAREDIR defaults to PREFIX/share. If" echo " SHAREDIR refers to a directory that does not exist, it" echo " will be created." echo " " echo " -d DEBUG, --enable-debug[=DEBUG]" echo " " echo " Enable debugging symbols in the library. 
If argument" echo " DEBUG is given as 'opt', then optimization flags are" echo " kept in the framework, otherwise optimization is" echo " turned off." echo " " echo " --enable-verbose-make, --disable-verbose-make" echo " " echo " Enable (disabled by default) verbose compilation output" echo " during make." echo " " echo " --enable-arg-max-hack --disable-arg-max-hack" echo " " echo " Enable (disabled by default) build system logic that" echo " will allow archiving/linking the static/shared library" echo " even if the command plus command line arguments exceeds" echo " the operating system limit (ARG_MAX)." echo " " echo " --disable-static, --enable-static" echo " " echo " Disable (enabled by default) building BLIS as a static" echo " library. If the static library build is disabled, the" echo " shared library build must remain enabled." echo " " echo " --disable-shared, --enable-shared" echo " " echo " Disable (enabled by default) building BLIS as a shared" echo " library. If the shared library build is disabled, the" echo " static library build must remain enabled." echo " " echo " -t MODEL, --enable-threading[=MODEL], --disable-threading" echo " " echo " Enable threading in the library, using threading model" echo " MODEL={openmp,pthreads,no}. If MODEL=no or " echo " --disable-threading is specified, threading will be" echo " disabled. The default is 'no'." echo " " echo " --disable-packbuf-pools, --enable-packbuf-pools" echo " " echo " Disable (enabled by default) use of internal memory" echo " pools for managing packing buffers. When disabled," echo " the function specified by BLIS_MALLOC_POOL is called" echo " on-demand, whenever a packing buffer is needed, and" echo " the buffer is released via the function specified by" echo " BLIS_FREE_POOL() when the loop in which it was" echo " allocated terminates. 
When enabled, the memory pools" echo " minimize calls to both BLIS_MALLOC_POOL() and" echo " BLIS_FREE_POOL(), especially in a multithreaded" echo " environment, but does so through a mechanism that may" echo " incur additional overhead in some (but not all)" echo " situations." echo " " echo " -q, --quiet Suppress informational output. By default, configure" echo " is verbose. (NOTE: -q is not yet implemented)" echo " " echo " -i SIZE, --int-size=SIZE" echo " " echo " Set the size (in bits) of internal BLIS integers and" echo " integer types used in native BLIS interfaces. The" echo " default inteter type size is architecture dependent." echo " (Hint: You can always find this value printed at the" echo " beginning of the testsuite output.)" echo " " echo " -b SIZE, --blas-int-size=SIZE" echo " " echo " Set the size (in bits) of integer types in external" echo " BLAS and CBLAS interfaces, if enabled. The default" echo " integer type size used in BLAS/CBLAS is 32 bits." echo " " echo " --disable-blas, --enable-blas" echo " " echo " Disable (enabled by default) building the BLAS" echo " compatibility layer." echo " " echo " --enable-cblas, --disable-cblas" echo " " echo " Enable (disabled by default) building the CBLAS" echo " compatibility layer. This automatically enables the" echo " BLAS compatibility layer as well." echo " " echo " -s NAME --enable-sandbox=NAME" echo " " echo " Enable a separate sandbox implementation of gemm. This" echo " option disables BLIS's conventional gemm implementation" echo " (which shares common infrastructure with other level-3" echo " operations) and instead compiles and uses the code in" echo " the NAME directory, which is expected to be a sub-" echo " directory of 'sandbox'. By default, no sandboxes are" echo " enabled." 
echo " " echo " --with-memkind, --without-memkind" echo " " echo " Forcibly enable or disable the use of libmemkind's" echo " hbw_malloc() and hbw_free() as substitutes for malloc()" echo " and free(), respectively, when allocating memory for" echo " BLIS's memory pools, which are used to manage buffers" echo " into which matrices are packed. The default behavior" echo " for this option is environment-dependent; if configure" echo " detects the presence of libmemkind, libmemkind is used" echo " by default, and otherwise it is not used by default." echo " " echo " --force-version=STRING" echo " " echo " Force configure to use an arbitrary version string" echo " STRING. This option may be useful when repackaging" echo " custom versions of BLIS by outside organizations." echo " " echo " -c, --show-config-lists" echo " " echo " Print the config and kernel lists, and kernel-to-config" echo " map after they are read from file. This can be useful" echo " when debugging certain configuration issues, and/or as" echo " a sanity check to make sure these lists are constituted" echo " as expected." echo " " echo " -h, --help Output this information and quit." echo " " echo " Environment Variables:" echo " " echo " CC Specifies the C compiler to use." echo " RANLIB Specifies the ranlib executable to use." echo " CFLAGS Specifies additional compiler flags to use (prepended)." echo " LDFLAGS Specifies additional linker flags to use (prepended)." echo " " echo " Environment variables may also be specified as command line" echo " options, e.g.:" echo " " echo " ./configure [options] CC=gcc haswell" echo " " echo " Note that not all compilers are compatible with a given" echo " configuration." 
echo " " # Exit with non-zero exit status exit 1 } query_array() { local arr key var_name arr="$1" key="$2" var_name="${arr}_${key}" echo "${!var_name}" } assign_key_value() { local arr key val arr="$1" key="$2" val="$3" printf -v "${arr}_${key}" %s "${val}" } # # FGVZ: This commented-out function is being kept as an example how how # to effectively "pass by reference" in bash. That is, pass the name of # a variable, instead of its conents, and then let the function use the # variable by prepending a $, at which time it can evaluate the string # as if it were a literal variable occurance. # #filteradd_to_list() #{ # local dlist ditem list_c item_c is_blacklisted # # # Add $1 to the list identified by $2, but only if $1 is not # # found in a blacklist. # # # Note: $2 can actually be a list of items. # dlist=\$"$1" # ditem=\$"$2" # # # Acquire the contents of $list and $item and store them in list_c # # and item_c, respectively. # list_c=$(eval "expr \"$dlist\" ") # item_c=$(eval "expr \"$ditem\" ") # # # Iterate over $item_c in case it is actually multiple items. # for cur_item in $item_c; do # # is_blacklisted=$(is_in_list "${cur_item}" "${config_blist}") # if [ ${is_blacklisted} == "false" ]; then # # # If cur_item is not blacklisted, add it to list_c. # list_c="${list_c} ${cur_item}" # fi # done # # # Update the argument. # eval "$1=\"${list_c}\"" #} pass_config_kernel_registries() { local filename passnum local all_blist local curline list item config kernels local cname clist klist # Read function arguments: # first argument: the file containing the configuration registry. # second argument: the pass number: 0 or 1. Pass 0 builds the # indirect config blacklist (indirect_blist) ONLY. Pass 1 actually # begins populating the config and kernel registries, and assumes # the indirect_blist has already been created. filename="$1" passnum="$2" # Initialize a list of indirect blacklisted configurations for the # current iteration. 
These are configurations that are invalidated by # the removal of blacklisted configurations. For example, if haswell # is registered as needing the 'haswell' and 'zen' kernel sets: # # haswell: haswell/haswell/zen # # and 'zen' was blacklisted because of the compiler version, then the # 'haswell' configuration must be omitted from the registry, as it no # longer has all of the kernel sets it was expecting. if [ "${passnum}" == "0" ]; then indirect_blist="" fi # For convenience, merge the original and indirect blacklists. # NOTE: During pass 0, all_blist is equal to config_blist, since # indirect_blist is still empty. all_blist="${config_blist} ${indirect_blist}" # Disable support for indirect blacklisting by returning early during # pass 0. See issue #214 for details [1]. Basically, I realized that # indirect blacklisting is not needed in the use case that I envisioned # in the real-life example above. If a subconfiguration such as haswell # is defined to require the zen kernel set, it implies that the zen # kernels can be compiled with haswell compiler flags. That is, just # because the zen subconfig (and its compiler flags) is blacklisted # does not mean that the haswell subconfig cannot compile the zen # kernels with haswell-specific flags. # # [1] https://github.com/flame/blis/issues/214 # if [ "${passnum}" == "0" ]; then return fi while read -r line do curline="${line}" # Remove everything after comment character '#'. curline=${curline%%#*} # We've stripped out leading whitespace and trailing comments. If # the line is now empty, then we can skip it altogether. if [ "x${curline}" = "x" ]; then continue; fi # Read the config name and config list for the current line. cname=${curline%%:*} list=${curline##*:} # If we encounter a slash, it means the name of the configuration # and the kernel set needed by that configuration are different. if [[ "${list}" == *[/]* ]]; then #echo "Slash found." 
klist="" clist="" for item in "${list}"; do # The sub-configuration name is always the first sub-word in # the slash-separated compound word. config=${item%%/*} # Delete the sub-configuration name from the front of the # string, leaving the slash-separated kernel names (or just # the kernel name, if there is only one). kernels=${list#*/} # Replace the slashes with spaces to transform the string # into a space-separated list of kernel names. kernels=$(echo -e ${kernels} | sed -e "s/\// /g") clist="${clist} ${config}" klist="${klist} ${kernels}" done else #echo "Slash not found." clist=${list} klist=${list} fi # Strip out whitespace from the config name and config/kernel list # on each line. cname=$(canonicalize_ws "${cname}") clist=$(canonicalize_ws "${clist}") klist=$(canonicalize_ws "${klist}") # Next, we prepare to: # - pass 0: inspect klist for blacklisted configurations, which may # reveal configurations as needing to be indirectly blacklisted. # - pass 1: compare cname to the blacklists and commit clist/klist # to their respective registries, as appropriate. # Handle singleton and umbrella configuration entries separately. if [ $(is_singleton_family "${cname}" "${clist}") == "true" ]; then # Singleton configurations/families. # Note: for singleton families, clist contains one item, which # always equals cname, but klist could contain more than one # item. # Only consider updating the indirect blacklist (pass 0) or # committing clist and klist to the registries (pass 1) if the # configuration name (cname) is not blacklisted. if [ $(is_in_list "${cname}" "${all_blist}") == "false" ]; then if [ "${passnum}" == "0" ]; then # Even if the cname isn't blacklisted, one of the requisite # kernels might be, so we need to check klist for blacklisted # items. If we find one, we must assume that the entire entry # must be thrown out. 
(Ideally, we would simply fall back to # reference code for the blacklisted kernels, but that is not # at all straightforward under the current configuration # system architecture.) Thus, we add cname to the indirect # blacklist. for item in ${klist}; do if [ $(is_in_list "${item}" "${config_blist}") == "true" ]; then indirect_blist="${indirect_blist} ${cname}" break fi done fi if [ "${passnum}" == "1" ]; then # Store the clist to the cname key of the config registry. #config_registry[${cname}]=${clist} #printf -v "config_registry_${cname}" %s "${clist}" assign_key_value "config_registry" "${cname}" "${clist}" fi fi if [ "${passnum}" == "1" ]; then # Store the klist to the cname key of the kernel registry. #kernel_registry[${cname}]=${klist} #printf -v "kernel_registry_${cname}" %s "${klist}" assign_key_value "kernel_registry" "${cname}" "${klist}" fi else # Umbrella configurations/families. # First we check cname, which should generally not be blacklisted # for umbrella families, but we check anyway just to be safe. if [ $(is_in_list "${cname}" "${all_blist}") == "false" ]; then if [ "${passnum}" == "1" ]; then # Check each item in the clist and klist. (At this point, # clist == klist.) If any sub-config is blacklisted, we # omit it from clist and klist. for item in ${clist}; do if [ $(is_in_list "${item}" "${all_blist}") == "true" ]; then clist=$(remove_from_list "${item}" "${clist}") klist=$(remove_from_list "${item}" "${klist}") fi done # Store the config and kernel lists to entries that # corresponds to the config name. #config_registry[${cname}]=${clist} #kernel_registry[${cname}]=${klist} #printf -v "config_registry_${cname}" %s "${clist}" #printf -v "kernel_registry_${cname}" %s "${klist}" assign_key_value "config_registry" "${cname}" "${clist}" assign_key_value "kernel_registry" "${cname}" "${klist}" fi fi fi done < "${filename}" if [ "${passnum}" == "0" ]; then # Assign the final indirect blacklist (with whitespace removed). 
indirect_blist="$(canonicalize_ws ${indirect_blist})" fi } read_registry_file() { local filename local clist klist local iterate_again config local cr_var mem mems_mem newclist local kr_var ker kers_ker newklist filename="$1" # Execute an initial pass through the config_registry file so that # we can accumulate a list of indirectly blacklisted configurations, # if any. pass_config_kernel_registries "${filename}" "0" # Now that the indirect_blist has been created, make a second pass # through the 'config_registry' file, this time creating the actual # config and kernel registry data structures. pass_config_kernel_registries "${filename}" "1" # Now we must go back through the config_registry and subsitute any # configuration families with their constituents' members. Each time # one of these substitutions occurs, we set a flag that causes us to # make one more pass. (Subsituting a singleton definition does not # prompt additional iterations.) This process stops when a full pass # does not result in any subsitution. iterate_again="1" while [ "${iterate_again}" == "1" ]; do iterate_again="0" #for config in "${!config_registry[@]}"; do for cr_var in ${!config_registry_*}; do config=${cr_var##config_registry_} clist=$(query_array "config_registry" ${config}) # The entries that define singleton families should never need # any substitution. if [ $(is_singleton_family "${config}" "${clist}") == "true" ]; then continue fi #for mem in ${config_registry[$config]}; do #for mem in ${!cr_var}; do for mem in ${clist}; do #mems_mem="${config_registry[${mem}]}" mems_mem=$(query_array "config_registry" ${mem}) # If mems_mem is empty string, then mem was not found as a key # in the config list associative array. In that case, we continue # and will echo an error later in the script. if [ "${mems_mem}" == "" ]; then #echo " config for ${mem} is empty string! no entry in config list." 
continue; fi if [ "${mem}" != "${mems_mem}" ]; then #clist="${config_registry[$config]}" clist=$(query_array "config_registry" ${config}) # Replace the current config with its constituent config set, # canonicalize whitespace, and then remove duplicate config # set names, if they exist. Finally, update the config registry # with the new config list. newclist=$(echo -e "${clist}" | sed -e "s/${mem}/${mems_mem}/g") newclist=$(canonicalize_ws "${newclist}") newclist=$(rm_duplicate_words "${newclist}") #config_registry[${config}]=${newclist} #printf -v "config_registry_${config}" %s "${newclist}" assign_key_value "config_registry" "${config}" "${newclist}" # Since we performed a substitution and changed the config # list, mark the iteration flag to continue another round, # but only if the config (mem) value is NOT present # in the list of sub-configs. If it is present, then further # substitution may not necessarily be needed this round. if [ $(is_in_list "${mem}" "${mems_mem}") == "false" ]; then iterate_again="1" fi fi done done done # Similar to what we just did for the config_registry, we now iterate # through the kernel_registry and substitute any configuration families # in the kernel list (right side of ':') with the members of that # family's kernel set. This process continues iteratively, as before, # until all families have been replaced with singleton configurations' # kernel sets. iterate_again="1" while [ "${iterate_again}" == "1" ]; do iterate_again="0" #for config in "${!kernel_registry[@]}"; do for kr_var in ${!kernel_registry_*}; do config=${kr_var##kernel_registry_} klist=$(query_array "kernel_registry" ${config}) # The entries that define singleton families should never need # any substitution. In the kernel registry, we know it's a # singleton entry when the cname occurs somewhere in the klist. # (This is slightly different than the same test in the config # registry, where we test that clist is one word and that # clist == cname.) 
if [ $(is_in_list "${config}" "${klist}") == "true" ]; then #echo "debug: '${config}' not found in '${klist}'; skipping." continue fi #for ker in ${kernel_registry[$config]}; do #for ker in ${!kr_var}; do for ker in ${klist}; do #kers_ker="${kernel_registry[${ker}]}" kers_ker=$(query_array "kernel_registry" ${ker}) # If kers_ker is empty string, then ker was not found as a key # in the kernel registry. While not common, this can happen # when ker identifies a kernel set that does not correspond to # any configuration. (Example: armv7a and armv8a kernel sets are # used by cortexa* configurations, but do not corresond to their # own configurations.) if [ "${kers_ker}" == "" ]; then #echo "debug: ${ker} not found in kernel registry." continue fi # If the current config/kernel (ker) differs from its singleton kernel # entry (kers_ker), then that singleton entry was specified to use # a different configuration's kernel set. Thus, we need to replace the # occurrence in the current config/kernel name with that of the kernel # set it needs. if [ "${ker}" != "${kers_ker}" ]; then #klisttmp="${kernel_registry[$config]}" klisttmp=$(query_array "kernel_registry" ${config}) # Replace the current config with its requisite kernels, # canonicalize whitespace, and then remove duplicate kernel # set names, if they exist. Finally, update the kernel registry # with the new kernel list. newklist=$(echo -e "${klisttmp}" | sed -e "s/${ker}/${kers_ker}/g") newklist=$(canonicalize_ws "${newklist}") newklist=$(rm_duplicate_words "${newklist}") #kernel_registry[${config}]=${newklist} #printf -v "kernel_registry_${config}" %s "${newklist}" assign_key_value "kernel_registry" "${config}" "${newklist}" # Since we performed a substitution and changed the kernel # list, mark the iteration flag to continue another round, # unless we just substituted using a singleton family # definition, in which case we don't necessarily need to # iterate further this round. 
if [ $(is_in_list "${ker}" "${kers_ker}") == "false" ]; then iterate_again="1" fi fi done done done } build_kconfig_registry() { local familyname clist config kernels kernel cur_configs newvalue familyname="$1" #clist="${config_registry[${familyname}]}" clist=$(query_array "config_registry" ${familyname}) for config in ${clist}; do # Look up the kernels for the current sub-configuration. #kernels="${kernel_registry[${config}]}" kernels=$(query_array "kernel_registry" ${config}) for kernel in ${kernels}; do # Add the sub-configuration to the list associated with the # kernel. # Query the current sub-configs for the current ${kernel}. #cur_configs="${kconfig_registry[${kernel}]}" cur_configs=$(query_array "kconfig_registry" ${kernel}) # Add the current sub-configuration to the list of sub-configs # we just queried. newvalue=$(canonicalize_ws "${cur_configs} ${config}") # Update the array. #kconfig_registry[${kernel}]="${newvalue}" #printf -v "kconfig_registry_${kernel}" %s "${newvalue}" assign_key_value "kconfig_registry" "${kernel}" "${newvalue}" done done } is_in_list() { local word list rval item word="$1" list="$2" rval="false" for item in ${list}; do if [ "${item}" == "${word}" ]; then rval="true" break fi done echo "${rval}" } is_singleton() { local list rval count_str item list="$1" rval="false" count_str="" for item in ${list}; do count_str="${count_str}x" done if [ "${count_str}" == "x" ]; then rval="true" fi echo "${rval}" } is_singleton_family() { local familyname memberlist rval familyname="$1" memberlist="$2" rval="false" if [ $(is_singleton "${memberlist}") ]; then if [ "${memberlist}" == "${familyname}" ]; then rval="true" fi fi echo "${rval}" } remove_from_list() { local strike_list list flist item strike_words="$1" list="$2" flist="" for item in ${list}; do # Filter out any list item that matches any of the strike words. 
if [ $(is_in_list "${item}" "${strike_words}") == "false" ]; then flist="${flist} ${item}" fi done flist=$(canonicalize_ws "${flist}") # Return the filtered list. echo "${flist}" } canonicalize_ws() { local str str="$1" # Remove leading and trailing whitespace. str=$(echo -e "${str}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//') # Remove duplicate spaces between words. str=$(echo -e "${str}" | tr -s " ") # Update the input argument. echo "${str}" } rm_duplicate_words() { local str revstr revres res str="$1" # We reverse the initial string, THEN remove duplicates, then reverse # the de-duplicated result so that only the last instance is kept after # removing duplicates (rather than keeping only the first). This is # totally unnecessary but works well for the kinds of duplicates that # show up in certain use cases of the config and kernel registries. # For example, these gymnastics allow us to keep only the last instance # of the 'generic' configuration in a configuration family that # includes it twice or more. revstr=$(echo "${str}" | awk '{ for (i=NF; i>1; i--) printf("%s ",$i); print $1; }') revres=$(echo "${revstr}" | awk '{for (i=1;i<=NF;i++) if (!a[$i]++) printf("%s%s",$i,FS)}{printf("\n")}') res=$(echo "${revres}" | awk '{ for (i=NF; i>1; i--) printf("%s ",$i); print $1; }') echo "${res}" } get_cc_search_list() { local list # For Linux, Darwin (OS X), and generic OSes, prioritize gcc. list="gcc clang cc" # For OpenBSD and FreeBSD, prioritize cc and clang over gcc. if [ "${os_name}" = "OpenBSD" ]; then list="cc clang gcc" elif [ "${os_name}" = "FreeBSD" ]; then list="cc clang gcc" fi echo "${list}" } get_cxx_search_list() { local list # For Linux, Darwin (OS X), and generic OSes, prioritize g++. list="g++ clang++ c++" # For OpenBSD and FreeBSD, prioritize cc and clang over gcc. 
if [ "${os_name}" = "OpenBSD" ]; then list="c++ clang++ g++" elif [ "${os_name}" = "FreeBSD" ]; then list="c++ clang++ g++" fi echo "${list}" } select_cc() { local search_list CC_env the_cc cc # This is the list of compilers to search for, and the order in which # to search for them. search_list=$1 # The environment variable associated with the compiler type we # are searching (e.g. CC, CXX). CC_env=$2 # If CC contains something, add it to the beginning of our default # search list. if [ -n "${CC_env}" ]; then search_list="${CC_env} ${search_list}" fi # Initialize our selected compiler to empty. the_cc="" # Try each compiler in the list and select the first one we find that # works. for cc in ${search_list}; do # See if the current compiler works and/or is present. ${cc} --version > /dev/null 2>&1 if [ "$?" == 0 ]; then the_cc=${cc} break fi done # Return the selected compiler. echo "${the_cc}" } auto_detect() { local cc cflags config_defines detected_config rval # Use the same compiler that was found earlier. cc="${found_cc}" # For debugging: reveal what compiler was chosen for auto-detection. #touch "${cc}.txt" # Tweak the flags we use based on the compiler. This is mostly just # an opportunity to turn off annoying warnings that some compilers # may throw off. if [ "${cc}" == "clang" ]; then cflags="-Wno-tautological-compare" else cflags= fi # Locate our source files. bli_arch_c="bli_arch.c" bli_cpuid_c="bli_cpuid.c" main_c="config_detect.c" bli_arch_c_filepath=$(find ${dist_path}/frame -name "${bli_arch_c}") bli_cpuid_c_filepath=$(find ${dist_path}/frame -name "${bli_cpuid_c}") main_c_filepath=$(find ${dist_path}/build -name "${main_c}") # Locate headers needed directly by the above files. 
bli_arch_h="bli_arch.h" bli_cpuid_h="bli_cpuid.h" bli_typed_h="bli_type_defs.h" bli_arch_h_filepath=$(find ${dist_path}/frame -name "${bli_arch_h}") bli_cpuid_h_filepath=$(find ${dist_path}/frame -name "${bli_cpuid_h}") bli_typed_h_filepath=$(find ${dist_path}/frame -name "${bli_typed_h}") bli_arch_h_path=${bli_arch_h_filepath%/${bli_arch_h}} bli_cpuid_h_path=${bli_cpuid_h_filepath%/${bli_cpuid_h}} bli_typed_h_path=${bli_typed_h_filepath%/${bli_typed_h}} # Locate other headers needed by bli_type_defs.h. bli_malloc_h="bli_malloc.h" bli_malloc_h_filepath=$(find ${dist_path}/frame -name "${bli_malloc_h}") bli_malloc_h_path=${bli_malloc_h_filepath%/${bli_malloc_h}} # Define the executable name. autodetect_x="auto-detect.x" # Create #defines for all of the BLIS_CONFIG_ macros in bli_cpuid.c. config_defines=$(grep BLIS_CONFIG_ ${bli_cpuid_c_filepath} \ | sed -e 's/#ifdef /-D/g') # Set the linker flags. We need pthreads because it is needed for # parts of bli_arch.c unrelated to bli_arch_string(), which is called # by the main() function in ${main_c}. ldflags="${LIBPTHREAD:--lpthread}" # Compile the auto-detect program using source code inside the # framework. ${cc} ${config_defines} \ -DBLIS_CONFIGURETIME_CPUID \ -I${bli_cpuid_h_path} \ -I${bli_arch_h_path} \ -I${bli_typed_h_path} \ -I${bli_malloc_h_path} \ -std=c99 \ ${cflags} \ ${bli_arch_c_filepath} \ ${bli_cpuid_c_filepath} \ ${ldflags} \ ${main_c_filepath} \ -o ${autodetect_x} # Run the auto-detect program. detected_config=$(./${autodetect_x}) # Remove the executable file. rm -f ./${autodetect_x} # Return the detected sub-configuration name. echo "${detected_config}" } has_libmemkind() { local main_c main_c_filepath LDFLAGS_mk binname rval # Path to libmemkind detection source file. main_c="libmemkind_detect.c" main_c_filepath=$(find ${dist_path}/build -name "${main_c}") # Add libmemkind to LDFLAGS. LDFLAGS_mk="${LDFLAGS} -lmemkind" # Binary executable filename. 
binname="libmemkind-detect.x" # Attempt to compile a simple main() program that contains a call # to hbw_malloc() and that links to libmemkind. ${found_cc} -o ${binname} ${main_c_filepath} ${LDFLAGS_mk} 2> /dev/null # Depending on the return code from the compile step above, we set # enable_memkind accordingly. if [ "$?" == 0 ]; then rval='yes' else rval='no' fi # Remove the executable generated above. rm -f ./${binname} echo "${rval}" } echoerr() { printf "${script_name}: error: %s\n" "$*" #>&2; } echowarn() { printf "${script_name}: warning: %s\n" "$*" #>&2; } blacklistcc_add() { # Check whether we've already blacklisted the given sub-config so # we don't output redundant messages. if [ $(is_in_list "$1" "${config_blist}") == "false" ]; then echowarn "${cc_vendor} ${cc_version} does not support '$1'; adding to blacklist." config_blist="${config_blist} $1" fi } blacklistbu_add() { # Check whether we've already blacklisted the given sub-config so # we don't output redundant messages. if [ $(is_in_list "$1" "${config_blist}") == "false" ]; then echowarn "assembler ('as' ${bu_version}) does not support '$1'; adding to blacklist." config_blist="${config_blist} $1" fi } blacklist_init() { config_blist="" } blacklist_cleanup() { # Remove duplicates and whitespace from the blacklist. config_blist=$(rm_duplicate_words "${config_blist}") config_blist=$(canonicalize_ws "${config_blist}") } echoerr_unsupportedcc() { echoerr "${script_name}: *** Unsupported compiler version: ${cc_vendor} ${cc_version}." exit 1 } get_binutils_version() { binutil=${AS:-as} # Query the full binutils version string output. This includes the # version string along with (potentially) a bunch of other textual # clutter. if [ "$(uname -s)" == "Darwin" ]; then # The default OS X assembler uses a trifecta of brain-dead # conventions: responding only to '-v', hanging indefinitely if # not given an argument, and outputing the result to stderr. 
# (And if you still weren't convinced, it creates an 'a.out' # by default. So yeah.) bu_string=$(${binutil} -v /dev/null -o /dev/null 2>&1) else bu_string=$(${binutil} --version 2>/dev/null) fi # Query the binutils version number. # The last part ({ read first rest ; echo $first ; }) is a workaround # to OS X's egrep only returning the first match. bu_version=$(echo "${bu_string}" | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' | { read first rest ; echo ${first} ; }) # Parse the version number into its major, minor, and revision # components. bu_major=$(echo "${bu_version}" | cut -d. -f1) bu_minor=$(echo "${bu_version}" | cut -d. -f2) bu_revision=$(echo "${bu_version}" | cut -d. -f3) echo "${script_name}: found assembler ('as') version ${bu_version} (maj: ${bu_major}, min: ${bu_minor}, rev: ${bu_revision})." } get_compiler_version() { local cc vendor_string cc="${found_cc}" # Query the full vendor version string output. This includes the # version number along with (potentially) a bunch of other textual # clutter. # NOTE: This maybe should use merged stdout/stderr rather than only # stdout. But it works for now. vendor_string="$(${cc} --version 2>/dev/null)" # Query the compiler "vendor" (ie: the compiler's simple name) and # isolate the version number. # The last part ({ read first rest ; echo $first ; }) is a workaround # to OS X's egrep only returning the first match. cc_vendor=$(echo "${vendor_string}" | egrep -o 'icc|gcc|clang|emcc|pnacl|IBM' | { read first rest ; echo $first ; }) if [ "$cc_vendor" = "icc" -o "$cc_vendor" = "gcc" -o "$cc_vendor" = "clang" ] then cc_version=$(${cc} -dumpversion) else cc_version=$(echo "${vendor_string}" | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*' | { read first rest ; echo ${first} ; }) fi # Parse the version number into its major, minor, and revision # components. cc_major=$(echo "${cc_version}" | cut -d. -f1) cc_minor=$(echo "${cc_version}" | cut -d. -f2) cc_revision=$(echo "${cc_version}" | cut -d. 
-f3) echo "${script_name}: found ${cc_vendor} version ${cc_version} (maj: ${cc_major}, min: ${cc_minor}, rev: ${cc_revision})." } check_compiler() { local cc cc="${found_cc}" # # Compiler requirements # # General: # # icc 15+, gcc 4.7+, clang 3.3+ # # Specific: # # skx: icc 15.0.1+, gcc 6.0+, clang 3.9+ # knl: icc 14.0.1+, gcc 5.0+, clang 3.5+ # haswell: any # sandybridge: any # penryn: any # # zen: gcc 6.0+[1], clang 4.0+ # excavator: gcc 4.9+, clang 3.5+ # steamroller: any # piledriver: any # bulldozer: any # # cortexa57: any # cortexa15: any # cortexa9: any # # generic: any # # Note: These compiler requirements were originally modeled after similar # requirements encoded into TBLIS's configure.ac [2]. # # [1] While gcc 6.0 or newer is needed for zen support (-march=znver1), # we relax this compiler version constraint a bit by targeting bdver4 # and then disabling the instruction sets that were removed in the # transition from bdver4 to znver1. (See config/zen/make_defs.mk for # the specific compiler flags used.) # [2] https://github.com/devinamatthews/tblis/ # echo "${script_name}: checking for blacklisted configurations due to ${cc} ${cc_version}." # gcc if [ "x${cc_vendor}" = "xgcc" ]; then if [ ${cc_major} -lt 4 ]; then echoerr_unsupportedcc fi if [ ${cc_major} -eq 4 ]; then blacklistcc_add "knl" if [ ${cc_minor} -lt 7 ]; then echoerr_unsupportedcc fi if [ ${cc_minor} -lt 9 ]; then blacklistcc_add "excavator" blacklistcc_add "zen" fi fi if [ ${cc_major} -lt 5 ]; then blacklistcc_add "knl" fi if [ ${cc_major} -lt 6 ]; then # Normally, zen would be blacklisted for gcc prior to 6.0. # However, we have a workaround in place in the zen # configuration's make_defs.mk file that starts with bdver4 # and disables the instructions that were removed in znver1. # Thus, this "blacklistcc_add" statement has been moved above. 
#blacklistcc_add "zen" blacklistcc_add "skx" fi fi # icc if [ "x${cc_vendor}" = "xicc" ]; then if [ ${cc_major} -lt 15 ]; then echoerr_unsupportedcc fi if [ ${cc_major} -eq 15 ]; then if [ ${cc_revision} -lt 1 ]; then blacklistcc_add "skx" fi fi fi # clang if [ "x${cc_vendor}" = "xclang" ]; then if [ ${cc_major} -lt 3 ]; then echoerr_unsupportedcc fi if [ ${cc_major} -eq 3 ]; then if [ ${cc_minor} -lt 3 ]; then echoerr_unsupportedcc fi if [ ${cc_minor} -lt 5 ]; then blacklistcc_add "excavator" blacklistcc_add "zen" blacklistcc_add "knl" fi if [ ${cc_minor} -lt 9 ]; then blacklistcc_add "skx" fi fi if [ ${cc_major} -lt 4 ]; then # See comment above regarding zen support. #blacklistcc_add "zen" : # explicit no-op since bash can't handle empty loop bodies. fi fi } check_assembler() { local cc asm_dir cflags asm_fp cc="${found_cc}" # The directory where the assembly files will be. asm_dir="${dist_path}/build" # Most of the time, we won't need any additional compiler flags. cflags="" echo "${script_name}: checking for blacklisted configurations due to as ${bu_version}." # # Check support for FMA4 (amd: bulldozer). # asm_fp=$(find ${asm_dir} -name "fma4.s") knows_fma4=$(try_assemble "${cc}" "${cflags}" "${asm_fp}") if [ "x${knows_fma4}" == "xno" ]; then blacklistbu_add "bulldozer" fi # # Check support for AVX (intel: sandybridge+, amd: piledriver+). # asm_fp=$(find ${asm_dir} -name "avx.s") knows_avx=$(try_assemble "${cc}" "${cflags}" "${asm_fp}") if [ "x${knows_avx}" == "xno" ]; then blacklistbu_add "sandybridge" fi # # Check support for FMA3 (intel: haswell+, amd: piledriver+). # asm_fp=$(find ${asm_dir} -name "fma3.s") knows_fma3=$(try_assemble "${cc}" "${cflags}" "${asm_fp}") if [ "x${knows_fma3}" == "xno" ]; then blacklistbu_add "haswell" blacklistbu_add "piledriver" blacklistbu_add "steamroller" blacklistbu_add "excavator" blacklistbu_add "skx" fi # # Check support for AVX-512f (knl, skx). # # The assembler on OS X won't recognize AVX-512 without help. 
if [ "$(uname -s)" == "Darwin" ]; then cflags="-Wa,-march=knl" fi asm_fp=$(find ${asm_dir} -name "avx512f.s") knows_avx512f=$(try_assemble "${cc}" "${cflags}" "${asm_fp}") if [ "x${knows_avx512f}" == "xno" ]; then blacklistbu_add "knl" blacklistbu_add "skx" fi # # Check support for AVX-512dq (skx). # # The assembler on OS X won't recognize AVX-512 without help. if [ "$(uname -s)" == "Darwin" ]; then cflags="-Wa,-march=skylake-avx512" fi asm_fp=$(find ${asm_dir} -name "avx512dq.s") knows_avx512dq=$(try_assemble "${cc}" "${cflags}" "${asm_fp}") if [ "x${knows_avx512dq}" == "xno" ]; then blacklistbu_add "skx" fi } try_assemble() { local cc cflags asm_src asm_base asm_bin rval cc="$1" cflags="$2" asm_src="$3" # Construct the filename to the .o file corresponding to asm_src. # (Strip the filepath, then the file extension, and then add ".o".) asm_base=${asm_src##*/} asm_base=${asm_base%.*} asm_bin="${asm_base}.o" # Try to assemble the file. ${cc} ${cflags} -c ${asm_src} -o ${asm_bin} > /dev/null 2>&1 if [ "$?" == 0 ]; then rval='yes' else rval='no' fi # Remove the object file. rm -f "${asm_bin}" # Return the result. echo "${rval}" } set_default_version() { local gitdir version_file gd_stderr git_describe_str git_error new_version_str gitdir='.git' # The path to the version file. version_file=$1 echo "${script_name}: determining default version string." # Check if the .git dir exists; if it does not, we do nothing. if [ -d "${dist_path}/${gitdir}" ]; then echo "${script_name}: found '${gitdir}' directory; assuming git clone." echo "${script_name}: executing: git describe --tags." gd_stderr="git_describe_stderr.txt" # Query git for the version string, which is simply the current tag, # followed by a number signifying how many commits have transpired # since the tag, followed by a 'g' and a shortened hash tab. Capture # stderr to a file. 
git_describe_str=$(git -C ${dist_path} describe --tags 2> ${gd_stderr}) # Pull in whatever error message was generated, if any, and delete # the file. git_error=$(cat ${gd_stderr}) # Remove the stderr file. rm -f ${gd_stderr} # If git returned an error, don't do anything. if [ -n "${git_error}" ]; then echo "${script_name}: git returned an error: '${git_error}'." echo "${script_name}: using string from unmodified version file." # Use what's in the version file as-is. version=$(cat "${version_file}") else echo "${script_name}: got back ${git_describe_str}." # Strip off the commit hash label. new_version_str=$(echo ${git_describe_str} | cut -d- -f-2) echo "${script_name}: truncating to ${new_version_str}." # Write the new version string to the version file. #echo "${new_version_str}" > ${version_file} # Set the version variable. version="${new_version_str}" fi else echo "${script_name}: could not find '${gitdir}' directory; using unmodified version file." # Use what's in the version file as-is. version=$(cat "${version_file}") fi } # # -- main function ------------------------------------------------------------- # main() { #declare -A config_registry #declare -A kernel_registry #declare -A kconfig_registry # -- Basic names and paths -- # The name of the script, stripped of any preceeding path. script_name=${0##*/} # The path to the script. We need this to find the top-level directory # of the source distribution in the event that the user has chosen to # build elsewhere. dist_path=${0%/${script_name}} # The path to the directory in which we are building. We do this to # make explicit that we distinguish between the top-level directory # of the distribution and the directory in which we are building. cur_dirpath="." # The file in which the version string is kept. version_file="version" version_filepath="${dist_path}/${version_file}" # The name of and path to the directory named "build" in the top-level # directory of the source distribution. 
build_dir='build' build_dirpath="${dist_path}/${build_dir}" # The name/path to the registry (master list) of supported configurations. registry_file="config_registry" registry_filepath=${dist_path}/${registry_file} # The names/paths for the template config.mk.in and its instantiated # counterpart. config_mk_in='config.mk.in' config_mk_out='config.mk' config_mk_in_path="${build_dirpath}/${config_mk_in}" config_mk_out_path="${cur_dirpath}/${config_mk_out}" # The names/paths for the template bli_config.h.in and its instantiated # counterpart. bli_config_h_in='bli_config.h.in' bli_config_h_out='bli_config.h' bli_config_h_in_path="${build_dirpath}/${bli_config_h_in}" bli_config_h_out_path="${cur_dirpath}/${bli_config_h_out}" # Path to 'mirror-tree.sh' script. mirror_tree_sh="${build_dirpath}/mirror-tree.sh" # Path to 'gen-make-frags.sh' script and directory. gen_make_frags_dirpath="${build_dirpath}/gen-make-frags" gen_make_frags_sh="${gen_make_frags_dirpath}/gen-make-frag.sh" # The name of the (top-level) configuration directory. config_dir='config' config_dirpath="${dist_path}/${config_dir}" # The name of the (top-level) kernels directory. kernels_dir='kernels' kernels_dirpath="${dist_path}/${kernels_dir}" # The name of the (top-level) reference kernels directory. refkern_dir='ref_kernels' refkern_dirpath="${dist_path}/${refkern_dir}" # The root directory of the BLIS framework. frame_dir='frame' frame_dirpath="${dist_path}/${frame_dir}" # The name of the sandbox directory. sandbox_dir='sandbox' sandbox_dirpath="${dist_path}/${sandbox_dir}" # The name of the directory in which object files will be kept. obj_dir='obj' obj_dirpath="${cur_dirpath}/${obj_dir}" # The name of the directory in which libraries will be kept. lib_dir='lib' lib_dirpath="${cur_dirpath}/${lib_dir}" # The name of the directory in which headers will be kept. include_dir='include' include_dirpath="${cur_dirpath}/${include_dir}" # The name of the directory in which the BLAS test suite is kept. 
blastest_dir='blastest' # The name of the directory in which the BLIS test suite is kept. testsuite_dir='testsuite' # -- Version-related -- # The shared library (.so) version file. so_version_file='so_version' so_version_filepath="${dist_path}/${so_version_file}" # The major and minor/build .so version numbers. so_version_major='' so_version_minorbuild='' # -- configure options -- # The user-given install prefix and a flag indicating it was given. #install_prefix_def="${HOME}/blis" install_prefix_user=${HOME}/blis # default to this directory. prefix_flag='' # The user-given install libdir and a flag indicating it was given. install_libdir_user='' libdir_flag='' # The user-given install includedir and a flag indicating it was given. install_incdir_user='' incdir_flag='' # The user-given install sharedir and a flag indicating it was given. install_sharedir_user='' sharedir_flag='' # The preset value of CFLAGS and LDFLAGS (ie: compiler and linker flags # to use in addition to those determined by the build system). cflags_preset='' ldflags_preset='' # The user-given debug type and a flag indicating it was given. debug_type='' debug_flag='' # The threading flag. threading_model='no' # Option variables. quiet_flag='' show_config_list='' # Additional flags. enable_verbose='no' enable_arg_max_hack='no' enable_static='yes' enable_shared='yes' enable_packbuf_pools='yes' int_type_size=0 blas_int_type_size=32 enable_blas='yes' enable_cblas='no' enable_memkind='' # The default memkind value is determined later on. force_version='no' # The sandbox flag and name. sandbox_flag='' sandbox='' # -- Configuration registry -- # The name of the chosen configuration (the configuration "family"). config_name='' # The list of sub-configurations associated with config_name. config_list='' # The list of kernel sets that will be needed by the sub-configurations # in config_list.. kernel_list='' # The list of kernel:sub-configuration pairs for all kernels contained # in kernel_list. 
kconfig_map='' # -- Out-of-tree -- # Whether we are building out-of-tree. configured_oot="no" # Dummy file. Used to check whether the cwd is the same as the top-level # source distribution directory. dummy_file='_blis_dir_detect.tmp' # -- Command line option/argument parsing ---------------------------------- # Process our command line options. while getopts ":hp:d:s:t:qci:b:-:" opt; do case $opt in -) case "$OPTARG" in help) print_usage ;; quiet) quiet_flag=1 ;; prefix=*) prefix_flag=1 install_prefix_user=${OPTARG#*=} ;; libdir=*) libdir_flag=1 install_libdir_user=${OPTARG#*=} ;; includedir=*) incdir_flag=1 install_incdir_user=${OPTARG#*=} ;; sharedir=*) sharedir_flag=1 install_sharedir_user=${OPTARG#*=} ;; enable-debug) debug_flag=1 debug_type=noopt ;; enable-debug=*) debug_flag=1 debug_type=${OPTARG#*=} ;; disable-debug) debug_flag=0 ;; enable-verbose-make) enable_verbose='yes' ;; disable-verbose-make) enable_verbose='no' ;; enable-arg-max-hack) enable_arg_max_hack='yes' ;; disable-arg-max-hack) enable_arg_max_hack='no' ;; enable-static) enable_static='yes' ;; disable-static) enable_static='no' ;; enable-shared) enable_shared='yes' ;; disable-shared) enable_shared='no' ;; enable-threading=*) threading_model=${OPTARG#*=} ;; disable-threading) threading_model='no' ;; enable-packbuf-pools) enable_packbuf_pools='yes' ;; disable-packbuf-pools) enable_packbuf_pools='no' ;; enable-sandbox=*) sandbox_flag=1 sandbox=${OPTARG#*=} ;; disable-sandbox) sandbox_flag=0 ;; int-size=*) int_type_size=${OPTARG#*=} ;; blas-int-size=*) blas_int_type_size=${OPTARG#*=} ;; enable-blas) enable_blas='yes' ;; disable-blas) enable_blas='no' ;; enable-cblas) enable_cblas='yes' ;; disable-cblas) enable_cblas='no' ;; with-memkind) enable_memkind='yes' ;; without-memkind) enable_memkind='no' ;; force-version=*) force_version=${OPTARG#*=} ;; show-config-list) show_config_list=1 ;; *) print_usage ;; esac;; h) print_usage ;; p) prefix_flag=1 install_prefix_user=$OPTARG ;; d) debug_flag=1 
debug_type=$OPTARG ;; s) sandbox_flag=1 sandbox=$OPTARG ;; q) quiet_flag=1 ;; t) threading_model=$OPTARG ;; i) int_type_size=$OPTARG ;; b) blas_int_type_size=$OPTARG ;; c) show_config_list=1 ;; \?) print_usage ;; esac done shift $(($OPTIND - 1)) # Parse environment variables while [ $# -gt 0 ]; do case $1 in CC=*) CC=${1#*=} shift ;; RANLIB=*) RANLIB=${1#*=} shift ;; *=*) print_usage ;; *) break ;; esac done # -- Check the operating system -------------------------------------------- os_name=$(uname -s) os_vers=$(uname -r) echo "${script_name}: detected ${os_name} kernel version ${os_vers}." # -- Find a C compiler ----------------------------------------------------- # Acquire the compiler search order. This will vary based on the os # found above. cc_search_list=$(get_cc_search_list) echo "${script_name}: C compiler search list is: ${cc_search_list}." # Find a working C compiler. found_cc=$(select_cc "${cc_search_list}" "${CC}") # If we didn't find any working C compilers, we print an error message. if [ -z "${found_cc}" ]; then echo "${script_name}: *** Could not find working C compiler! Cannot continue." exit 1 fi echo "${script_name}: using '${found_cc}' C compiler." # -- Find a C++ compiler --------------------------------------------------- # Acquire the compiler search order. This will vary based on the os # found above. cxx_search_list=$(get_cxx_search_list) echo "${script_name}: C++ compiler search list is: ${cxx_search_list}." # Find a working C++ compiler. NOTE: We can reuse the select_cc() # function since it is written in a way that is general-purpose. found_cxx=$(select_cc "${cxx_search_list}" "${CXX}") # If we didn't find any working C++ compilers, we print an error message. if [ -z "${found_cxx}" ]; then echo "${script_name}: Could not find working C++ compiler! C++ will not be available in sandbox." found_cxx="c++notfound" fi echo "${script_name}: using '${found_cxx}' C++ compiler (for sandbox only)." 
# -- Check the compiler version -------------------------------------------- # Initialize the blacklist to empty. blacklist_init # Check the compiler's version. Certain versions of certain compilers # will preclude building certain sub-configurations, which are added # to a blacklist. get_compiler_version check_compiler # Now check the assembler's ability to assemble code. Older versions # of binutils may not be aware of certain instruction sets. Those # sub-configurations employing kernels that use such instruction sets # will also be blacklisted. get_binutils_version check_assembler # Remove duplicates and whitespace from the blacklist. blacklist_cleanup if [ -n "${config_blist}" ]; then echo "${script_name}: configuration blacklist:" echo "${script_name}: ${config_blist}" fi # -- Read the configuration registry --------------------------------------- # Make sure the config registry file exists and can be opened. if [ ! -f "${registry_filepath}" ]; then echo "${script_name}: could not open '${registry_file}' file; cannot continue." echo "${script_name}: BLIS distribution appears to be incomplete." echo "${script_name}: *** Please verify source distribution." exit 1 fi # Read the registered configuration names and lists into associative # arrays. echo -n "${script_name}: reading configuration registry..." read_registry_file ${registry_filepath} echo "done." # Report if additional configurations needed to be blacklisted. # NOTE: This branch should never execute so long as indirect blacklisting # is disabled. See comment regarding issue #214 in the definition of # pass_config_kernel_registries(). 
if [ -n "${indirect_blist}" ]; then echo "${script_name}: needed to indirectly blacklist additional configurations:" echo "${script_name}: ${indirect_blist}" fi # -- Acquire the BLIS version ---------------------------------------------- # Set the 'version' variable to the default value (the 'git describe' # augmented instance of whatever is in the 'version' file if this is a git # clone, or whatever is in the 'version' file unmodified if it is a bare # source release). set_default_version "${version_filepath}" # Initial message. echo "${script_name}: starting configuration of BLIS ${version}." # Check if the user requested a custom version string. if [ "x${force_version}" = "xno" ]; then echo "${script_name}: configuring with official version string." else echo "${script_name}: configuring with custom version string '${force_version}'." version="${force_version}" fi # -- Acquire the shared library (.so) versions ----------------------------- # The first line of the 'so_version' file contains the .so major version. so_version_major=$(cat ${so_version_filepath} | sed -n "1p") # The second line contains the minor and build .so version numbers # (separated by a '.'). so_version_minorbuild=$(cat ${so_version_filepath} | sed -n "2p") echo "${script_name}: found shared library .so version '${so_version_major}.${so_version_minorbuild}'." echo "${script_name}: .so major version: ${so_version_major}" echo "${script_name}: .so minor.build version: ${so_version_minorbuild}" # -- Various pre-configuration checks -------------------------------------- # Set config_name based on the number of arguments leftover (after command # line option processing). if [ $# = "0" ]; then #configs_avail="auto "$(ls ${config_dirpath}) echo "${script_name}: " echo "${script_name}: *** No configuration given! ***" echo "${script_name}: " echo "${script_name}: Default configuration behavior is not implemented (for your" echo "${script_name}: own safety). 
Please re-run '${script_name}' and specify one" echo "${script_name}: of the existing configurations in the source distribution's" echo "${script_name} '${registry_file}' file:" echo "${script_name}: " #for k in "${!config_registry[@]}"; do for cr_var in ${!config_registry_*}; do #v=${config_registry[$k]} k=${cr_var##config_registry_}; v=${!cr_var} echo "${script_name}: $k (${v})" done echo "${script_name}: " exit 1 elif [ $# != "1" ]; then # more than one configuration argument given. print_usage fi if [ $1 = "auto" ]; then echo "${script_name}: automatic configuration requested." # Call the auto_detect() function and save the returned string in # config_name. config_name=$(auto_detect) echo "${script_name}: hardware detection driver returned '${config_name}'." else # Use the command line argument as the configuration name. config_name=$1 #echo "${script_name}: manual configuration requested." echo "${script_name}: manual configuration requested; configuring with '${config_name}'." fi # Use the selected config name to look up the list of configurations # and kernels associated with that name. #config_list=${config_registry[${config_name}]} #kernel_list=${kernel_registry[${config_name}]} config_list=$(query_array "config_registry" ${config_name}) kernel_list=$(query_array "kernel_registry" ${config_name}) # Use the config_registry and kernel_registry to build a kconfig_registry # for the selected config_name. build_kconfig_registry "${config_name}" # Print the configuration list and kernel list, if requested. 
if [ "${show_config_list}" == "1" ]; then echo "${script_name}: configuration list:" #for k in "${!config_registry[@]}"; do for cr_var in ${!config_registry_*}; do #v=${config_registry[$k]} k=${cr_var##config_registry_}; v=${!cr_var} echo "${script_name}: $k: ${v}" done echo "${script_name}: kernel list:" #for k in "${!kernel_registry[@]}"; do for kr_var in ${!kernel_registry_*}; do #v=${kernel_registry[$k]} k=${kr_var##kernel_registry_}; v=${!kr_var} echo "${script_name}: $k: ${v}" done echo "${script_name}: kernel-to-config map for '${config_name}':" #for k in "${!kconfig_registry[@]}"; do for kc_var in ${!kconfig_registry_*}; do #v=${kconfig_registry[$k]} k=${kc_var##kconfig_registry_}; v=${!kc_var} echo "${script_name}: $k: ${v}" done fi # For each kernel in the kernel list, reduce the list of associated # sub-configurations (in the kconfig_registry) to a singleton using # the following rules: # 1. If the list is a singleton, use that name. # 2. If the list contains a sub-configuration name that matches the # kernel name, use that name. # 3. Otherwise, use the first name in the list. # We use the chosen singleton to ceate a "kernel:subconfig" pair, which # we accumulate into a list. This list is the kernel-to-config map, or # kconfig_map. # We use a sorted version of kernel_list so that it ends up matching the # display order of the kconfig_registry above. kernel_list_sort=$(echo ${kernel_list} | xargs -n1 | sort -u) kconfig_map="" for kernel in ${kernel_list_sort}; do #configs="${kconfig_registry[$kernel]}" configs=$(query_array "kconfig_registry" ${kernel}) has_one_kernel=$(is_singleton "${configs}") contains_kernel=$(is_in_list "${kernel}" "${configs}") # Check if the list is a singleton. if [ "${has_one_kernel}" == "true" ]; then reducedclist="${configs}" # Check if the list contains a sub-config name that matches the kernel. elif [ "${contains_kernel}" == "true" ]; then reducedclist="${kernel}" # Otherwise, use the first name. 
else first_config=${configs%% *} reducedclist="${first_config}" fi # Create a new "kernel:subconfig" pair and add it to the kconfig_map # list, removing whitespace. new_pair="${kernel}:${reducedclist}" kconfig_map=$(canonicalize_ws "${kconfig_map} ${new_pair}") done if [ "${show_config_list}" == "1" ]; then echo "${script_name}: kernel-to-config map for '${config_name}' (chosen pairs):" for k in ${kconfig_map}; do echo "${script_name}: $k" done fi echo "${script_name}: checking configuration against contents of '${registry_file}'." # First, ensure that the config name is registered (ie: it is present # in the config_registry file). if [ -z "${config_list}" ]; then # NOTE: This branch should never execute when using auto-detection, # but we have it here just in case. if [ $1 = "auto" ]; then echo "${script_name}: 'auto-detected configuration '${conf}' is NOT registered!" echo "${script_name}: " echo "${script_name}: *** Cannot continue with unregistered configuration '${conf}'. ***" echo "${script_name}: " exit 1; else echo "${script_name}: 'user-specified configuration '${conf}' is NOT registered!" echo "${script_name}: " echo "${script_name}: *** Cannot continue with unregistered configuration '${conf}'. ***" echo "${script_name}: " exit 1; fi else # This branch executes when the configuration is found to be present # (i.e. registered) in the config_registry file. echo "${script_name}: configuration '${config_name}' is registered." echo "${script_name}: '${config_name}' is defined as having the following sub-configurations:" echo "${script_name}: ${config_list}" echo "${script_name}: which collectively require the following kernels:" echo "${script_name}: ${kernel_list}" fi echo "${script_name}: checking sub-configurations:" # Now, verify that the constituent configurations associated with the # config name are all valid. for conf in ${config_list}; do # First confirm that the current configuration is registered. 
#this_clist=${config_registry[${conf}]} this_clist=$(query_array "config_registry" ${conf}) # If the config_list associated with conf is empty, then it was # never entered into the config_registry to begin with. Thus, # conf must be unregistered. if [ -z "${this_clist}" ]; then echo "${script_name}: '${conf}' is NOT registered!" echo "${script_name}: " echo "${script_name}: *** Cannot continue with unregistered configuration '${conf}'. ***" echo "${script_name}: " exit 1; else echo -n "${script_name}: '${conf}' is registered." fi # Then confirm that the current sub-configuration directory exists. if [ ! -d "${config_dirpath}/${conf}" ]; then echo "..but does NOT exist!" echo "${script_name}: " echo "${script_name}: *** Cannot continue with nonexistent configuration '${conf}'. ***" echo "${script_name}: " exit 1; else echo "..and exists." fi done echo "${script_name}: checking sub-configurations' requisite kernels:" # Also, let's verify that the requisite kernel sets associated with # the config name all correspond to directories that exist. for kernel in ${kernel_list}; do echo -n "${script_name}: '${kernel}' kernels..." # Confirm that the current kernel sub-directory exists. if [ ! -d "${kernels_dirpath}/${kernel}" ]; then echo "do NOT exist!" echo "${script_name}: " echo "${script_name}: *** Cannot continue with nonexistent kernel '${kernel}'. ***" echo "${script_name}: " exit 1; else echo "exist." fi done # In order to determine the default behavior of the --with[out]-memkind # option, we try to detect whether libmemkind is available. If it is, # the default implied option will be --with-memkind; otherwise, will be # --without-memkind. has_memkind=$(has_libmemkind) # -- Prepare variables for subsitution into template files ----------------- # Parse the status of the install prefix and echo feedback. if [ -n "${prefix_flag}" ]; then echo "${script_name}: detected --prefix='${install_prefix_user}'." 
else echo "${script_name}: no install prefix option given; defaulting to '${install_prefix_user}'." fi # Set initial (candidate) values for the libdir and includedir using the # install prefix that was determined above. install_libdir=${install_prefix_user}/lib install_incdir=${install_prefix_user}/include install_sharedir=${install_prefix_user}/share # Set the install libdir, if it was specified. Note that this will override # the default libdir implied by the install prefix, even if both options # were given. if [ -n "${libdir_flag}" ]; then echo "${script_name}: detected --libdir='${install_libdir_user}'." install_libdir=${install_libdir_user} else echo "${script_name}: no install libdir option given; defaulting to PREFIX/lib." fi # Set the install includedir, if it was specified. Note that this will # override the default includedir implied by the install prefix, even if # both options were given. if [ -n "${incdir_flag}" ]; then echo "${script_name}: detected --includedir='${install_incdir_user}'." install_incdir=${install_incdir_user} else echo "${script_name}: no install includedir option given; defaulting to PREFIX/include." fi # Set the install sharedir, if it was specified. Note that this will # override the default sharedir implied by the install prefix, even if # both options were given. if [ -n "${sharedir_flag}" ]; then echo "${script_name}: detected --sharedir='${install_sharedir_user}'." install_sharedir=${install_sharedir_user} else echo "${script_name}: no install sharedir option given; defaulting to PREFIX/share." fi # Echo the installation directories that we settled on. echo "${script_name}: final installation directories:" echo "${script_name}: libdir: ${install_libdir}" echo "${script_name}: includedir: ${install_incdir}" echo "${script_name}: sharedir: ${install_sharedir}" # Check if CFLAGS is non-empty. 
if [ -n "${CFLAGS}" ]; then cflags_preset="${CFLAGS}" echo "${script_name}: detected preset CFLAGS; prepending:" echo "${script_name}: ${cflags_preset}" else cflags_preset='' echo "${script_name}: no preset CFLAGS detected." fi # Check if LDFLAGS is non-empty. if [ -n "${LDFLAGS}" ]; then ldflags_preset="${LDFLAGS}" echo "${script_name}: detected preset LDFLAGS; prepending:" echo "${script_name}: ${ldflags_preset}" else ldflags_preset='' echo "${script_name}: no preset LDFLAGS detected." fi # Check if the debug flag was specified. if [ -n "${debug_flag}" ]; then if [ "x${debug_type}" = "xopt" ]; then echo "${script_name}: enabling debug symbols with optimizations." elif [ "x${debug_type}" = "xsde" ]; then debug_type='sde' echo "${script_name}: enabling SDE processor emulation." else debug_type='noopt' echo "${script_name}: enabling debug symbols; optimizations disabled." fi else debug_type='off' echo "${script_name}: debug symbols disabled." fi # Check if the verbose make flag was specified. if [ "x${enable_verbose}" = "xyes" ]; then echo "${script_name}: enabling verbose make output. (disable with 'make V=0'.)" else echo "${script_name}: disabling verbose make output. (enable with 'make V=1'.)" fi # Check if the ARG_MAX hack was requested. if [ "x${enable_arg_max_hack}" = "xyes" ]; then echo "${script_name}: enabling ARG_MAX hack." else echo "${script_name}: disabling ARG_MAX hack." fi # Check if the static lib flag was specified. if [ "x${enable_static}" = "xyes" -a "x${enable_shared}" = "xyes" ]; then echo "${script_name}: building BLIS as both static and shared libraries." elif [ "x${enable_static}" = "xyes" -a "x${enable_shared}" = "xno" ]; then echo "${script_name}: building BLIS as a static library (shared library disabled)." elif [ "x${enable_static}" = "xno" -a "x${enable_shared}" = "xyes" ]; then echo "${script_name}: building BLIS as a shared library (static library disabled)." 
fi #else # echo "${script_name}: Both static and shared libraries were disabled." # echo "${script_name}: *** Please enable one (or both) to continue." # exit 1 # Check the threading model flag and standardize its value, if needed. # NOTE: 'omp' is deprecated but still supported; 'openmp' is preferred. enable_openmp='no' enable_openmp_01=0 enable_pthreads='no' enable_pthreads_01=0 if [ "x${threading_model}" = "xauto" ]; then echo "${script_name}: determining the threading model automatically." elif [ "x${threading_model}" = "xopenmp" ] || [ "x${threading_model}" = "xomp" ]; then echo "${script_name}: using OpenMP for threading." enable_openmp='yes' enable_openmp_01=1 threading_model="openmp" # Standardize the value. elif [ "x${threading_model}" = "xpthreads" ] || [ "x${threading_model}" = "xpthread" ] || [ "x${threading_model}" = "xposix" ]; then echo "${script_name}: using Pthreads for threading." enable_pthreads='yes' enable_pthreads_01=1 threading_model="pthreads" # Standardize the value. elif [ "x${threading_model}" = "xno" ] || [ "x${threading_model}" = "xnone" ]; then echo "${script_name}: threading is disabled." else echo "${script_name}: *** Unsupported threading model: ${threading_model}." exit 1 fi # Convert 'yes' and 'no' flags to booleans. if [ "x${enable_packbuf_pools}" = "xyes" ]; then echo "${script_name}: internal memory pools for packing buffers are enabled." enable_packbuf_pools_01=1 else echo "${script_name}: internal memory pools for packing buffers are disabled." enable_packbuf_pools_01=0 fi if [ "x${has_memkind}" = "xyes" ]; then # If no explicit option was given for libmemkind one way or the other, # we use the value returned previously by has_libmemkind() to determine # the default. if [ "x${enable_memkind}" = "x" ]; then enable_memkind="yes" fi echo "${script_name}: libmemkind found; default is to enable use." if [ "x${enable_memkind}" = "xyes" ]; then echo "${script_name}: received explicit request to enable libmemkind." 
enable_memkind_01=1 else echo "${script_name}: received explicit request to disable libmemkind." enable_memkind_01=0 fi else echo "${script_name}: libmemkind not found; disabling." if [ "x${enable_memkind}" = "xyes" ]; then echo "${script_name}: cannot honor explicit request to enable libmemkind." fi enable_memkind="no" enable_memkind_01=0 fi if [ "x${enable_blas}" = "xyes" ]; then echo "${script_name}: the BLAS compatibility layer is enabled." enable_blas_01=1 else echo "${script_name}: the BLAS compatibility layer is disabled." enable_blas_01=0 fi if [ "x${enable_cblas}" = "xyes" ]; then echo "${script_name}: the CBLAS compatibility layer is enabled." enable_cblas_01=1 # Force BLAS layer when CBLAS is enabled enable_blas='yes' else echo "${script_name}: the CBLAS compatibility layer is disabled." enable_cblas_01=0 fi # Report integer sizes if [ "x${int_type_size}" = "x32" ]; then echo "${script_name}: the internal integer size is 32-bit." elif [ "x${int_type_size}" = "x64" ]; then echo "${script_name}: the internal integer size is 64-bit." else echo "${script_name}: the internal integer size is automatically determined." fi if [ "x${blas_int_type_size}" = "x32" ]; then echo "${script_name}: the BLAS/CBLAS interface integer size is 32-bit." elif [ "x${blas_int_type_size}" = "x64" ]; then echo "${script_name}: the BLAS/CBLAS interface integer size is 64-bit." else echo "${script_name}: the BLAS/CBLAS interface integer size is automatically determined." fi # Check if a sandbox was given. if [ -n "${sandbox_flag}" ]; then #sandbox_relpath="${sandbox_dir}/${sandbox}" echo "${script_name}: configuring for alternate gemm implementation:" echo "${script_name}: ${sandbox_dir}/${sandbox}" sandbox_fullpath="${sandbox_dirpath}/${sandbox}" if [ ! -d "${sandbox_fullpath}" ]; then echo "${script_name}: requested sandbox sub-directory does not exist! Cannot continue." echo "${script_name}: *** Please verify sandbox existence and name." 
exit 1 fi enable_sandbox_01=1 else echo "${script_name}: configuring for conventional gemm implementation." enable_sandbox_01=0 fi # Variables that contain forward slashes, such as paths, need extra # escaping when used in sed commands. We insert those extra escape # characters here so that the sed commands below do the right thing. install_libdir_esc=$(echo "${install_libdir}" | sed 's/\//\\\//g') install_incdir_esc=$(echo "${install_incdir}" | sed 's/\//\\\//g') install_sharedir_esc=$(echo "${install_sharedir}" | sed 's/\//\\\//g') dist_path_esc=$(echo "${dist_path}" | sed 's/\//\\\//g') cc_esc=$(echo "${found_cc}" | sed 's/\//\\\//g') cxx_esc=$(echo "${found_cxx}" | sed 's/\//\\\//g') #sandbox_relpath_esc=$(echo "${sandbox_relpath}" | sed 's/\//\\\//g') # For RANLIB, if the variable is not set, we use a default value of # 'ranlib'. ranlib_esc=$(echo "${RANLIB:-ranlib}" | sed 's/\//\\\//g') cflags_preset_esc=$(echo "${cflags_preset}" | sed 's/\//\\\//g') ldflags_preset_esc=$(echo "${ldflags_preset}" | sed 's/\//\\\//g') # Create a #define for the configuration family (config_name). uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]') config_name_define="#define BLIS_FAMILY_${uconf}\n" # Create a list of #defines, one for each configuration in config_list. config_list_defines="" for conf in ${config_list}; do # Convert the current config name to uppercase. uconf=$(echo ${conf} | tr '[:lower:]' '[:upper:]') # Create a #define and add it to the running list. config_define="BLIS_CONFIG_${uconf}" config_list_defines="${config_list_defines}#define ${config_define}\n" done # Create a list of #defines, one for each kernel set in kernel_list. kernel_list_defines="" for kern in ${kernel_list}; do # Convert the current config name to uppercase. uconf=$(echo ${kern} | tr '[:lower:]' '[:upper:]') # Create a #define and add it to the running list. 
kernel_define="BLIS_KERNELS_${uconf}" kernel_list_defines="${kernel_list_defines}#define ${kernel_define}\n" done # -- Determine whether we are performing an out-of-tree build -------------- if [ ${dist_path} != "./" ]; then # At this point, we know the user did not run "./configure". But we # have not yet ruled out "/configure" or some # equivalent # that uses relative paths. To further rule out these possibilities, # we create a dummy file in the current build directory. touch ./${dummy_file} # If the dummy file we just created in the current directory does not # appear in the source distribution path, then we are in a different # directory and thus we must create a symbolic link. if [ ! -f "${dist_path}/${dummy_file}" ]; then configured_oot="yes" #echo "${script_name}: detected out-of-tree build directory." else configured_oot="no" #echo "${script_name}: detected in-tree build directory." fi # Remove the dummy file. rm -f "./${dummy_file}" fi # -- Instantiate config.mk, bli_config.h files from templates -------------- # Begin substituting information into the config_mk_in file, outputting # to config_mk_out. 
echo "${script_name}: creating ${config_mk_out_path} from ${config_mk_in_path}" cat "${config_mk_in_path}" \ | sed -e "s/@version@/${version}/g" \ | sed -e "s/@so_version_major@/${so_version_major}/g" \ | sed -e "s/@so_version_minorbuild@/${so_version_minorbuild}/g" \ | sed -e "s/@config_name@/${config_name}/g" \ | sed -e "s/@config_list@/${config_list}/g" \ | sed -e "s/@kernel_list@/${kernel_list}/g" \ | sed -e "s/@kconfig_map@/${kconfig_map}/g" \ | sed -e "s/@os_name@/${os_name}/g" \ | sed -e "s/@dist_path@/${dist_path_esc}/g" \ | sed -e "s/@CC_VENDOR@/${cc_vendor}/g" \ | sed -e "s/@CC@/${cc_esc}/g" \ | sed -e "s/@CXX@/${cxx_esc}/g" \ | sed -e "s/@RANLIB@/${ranlib_esc}/g" \ | sed -e "s/@cflags_preset@/${cflags_preset_esc}/g" \ | sed -e "s/@ldflags_preset@/${ldflags_preset_esc}/g" \ | sed -e "s/@debug_type@/${debug_type}/g" \ | sed -e "s/@threading_model@/${threading_model}/g" \ | sed -e "s/@install_libdir@/${install_libdir_esc}/g" \ | sed -e "s/@install_incdir@/${install_incdir_esc}/g" \ | sed -e "s/@install_sharedir@/${install_sharedir_esc}/g" \ | sed -e "s/@enable_verbose@/${enable_verbose}/g" \ | sed -e "s/@configured_oot@/${configured_oot}/g" \ | sed -e "s/@enable_arg_max_hack@/${enable_arg_max_hack}/g" \ | sed -e "s/@enable_static@/${enable_static}/g" \ | sed -e "s/@enable_shared@/${enable_shared}/g" \ | sed -e "s/@enable_blas@/${enable_blas}/g" \ | sed -e "s/@enable_cblas@/${enable_cblas}/g" \ | sed -e "s/@enable_memkind@/${enable_memkind}/g" \ | sed -e "s/@sandbox@/${sandbox}/g" \ > "${config_mk_out_path}" # Begin substituting information into the bli_config_h_in file, outputting # to bli_config_h_out. NOTE: We use perl instead of sed because the version # of sed used on OS X is old and does not handle the '\n' character # intuitively, which was used when constructing ${config_name_define}, # ${config_list_defines}, and ${kernel_list_defines}. 
echo "${script_name}: creating ${bli_config_h_out_path} from ${bli_config_h_in_path}" cat "${bli_config_h_in_path}" \ | perl -pe "s/\@config_name_define\@/${config_name_define}/g" \ | perl -pe "s/\@config_list_defines\@/${config_list_defines}/g" \ | perl -pe "s/\@kernel_list_defines\@/${kernel_list_defines}/g" \ | sed -e "s/@enable_openmp@/${enable_openmp_01}/g" \ | sed -e "s/@enable_pthreads@/${enable_pthreads_01}/g" \ | sed -e "s/@enable_packbuf_pools@/${enable_packbuf_pools_01}/g" \ | sed -e "s/@int_type_size@/${int_type_size}/g" \ | sed -e "s/@blas_int_type_size@/${blas_int_type_size}/g" \ | sed -e "s/@enable_blas@/${enable_blas_01}/g" \ | sed -e "s/@enable_cblas@/${enable_cblas_01}/g" \ | sed -e "s/@enable_memkind@/${enable_memkind_01}/g" \ | sed -e "s/@enable_sandbox@/${enable_sandbox_01}/g" \ > "${bli_config_h_out_path}" # -- Create top-level object directories ----------------------------------- # Create obj sub-directories (if they do not already exist). base_obj_dirpath="${obj_dirpath}/${config_name}" echo "${script_name}: creating ${base_obj_dirpath}" mkdir -p ${base_obj_dirpath} obj_config_dirpath="${base_obj_dirpath}/${config_dir}" #echo "${script_name}: creating ${obj_config_dirpath}" mkdir -p ${obj_config_dirpath} for conf in ${config_list}; do echo "${script_name}: creating ${obj_config_dirpath}/${conf}" mkdir -p ${obj_config_dirpath}/${conf} done obj_kernels_dirpath="${base_obj_dirpath}/${kernels_dir}" #echo "${script_name}: creating ${obj_kernels_dirpath}" mkdir -p ${obj_kernels_dirpath} for kern in ${kernel_list}; do echo "${script_name}: creating ${obj_kernels_dirpath}/${kern}" mkdir -p ${obj_kernels_dirpath}/${kern} done obj_refkern_dirpath="${base_obj_dirpath}/${refkern_dir}" #echo "${script_name}: creating ${obj_refkern_dirpath}" mkdir -p ${obj_refkern_dirpath} for conf in ${config_list}; do echo "${script_name}: creating ${obj_refkern_dirpath}/${conf}" mkdir -p ${obj_refkern_dirpath}/${conf} done 
obj_frame_dirpath="${base_obj_dirpath}/${frame_dir}" echo "${script_name}: creating ${obj_frame_dirpath}" mkdir -p ${obj_frame_dirpath} if [ -n "${sandbox_flag}" ]; then obj_sandbox_dirpath="${base_obj_dirpath}/${sandbox_dir}" echo "${script_name}: creating ${obj_sandbox_dirpath}/${sandbox}" mkdir -p ${obj_sandbox_dirpath}/${sandbox} fi obj_blastest_dirpath="${base_obj_dirpath}/${blastest_dir}" echo "${script_name}: creating ${obj_blastest_dirpath}" mkdir -p ${obj_blastest_dirpath} obj_testsuite_dirpath="${base_obj_dirpath}/${testsuite_dir}" echo "${script_name}: creating ${obj_testsuite_dirpath}" mkdir -p ${obj_testsuite_dirpath} # Create lib directory (if it does not already exist). base_lib_dirpath="${lib_dirpath}/${config_name}" echo "${script_name}: creating ${base_lib_dirpath}" mkdir -p ${base_lib_dirpath} # Create include directory (if it does not already exist). base_include_dirpath="${include_dirpath}/${config_name}" echo "${script_name}: creating ${base_include_dirpath}" mkdir -p ${base_include_dirpath} # -- Mirror source directory hierarchies to object directories ------------- # Combine the config_list with the config_name and then remove duplicates. config_list_plus_name=$(rm_duplicate_words "${config_list} ${config_name}") # Mirror each of the sub-configuration directories to the object directory. for conf in ${config_list_plus_name}; do echo "${script_name}: mirroring ${config_dirpath}/${conf} to ${obj_config_dirpath}/${conf}" ${mirror_tree_sh} "${config_dirpath}/${conf}" "${obj_config_dirpath}/${conf}" done # Mirror optimized kernels source tree to its object sub-directory. # We perform the mirroring on each configuration/kernel sub-directory # within 'kernels'. for kern in ${kernel_list}; do # Only mirror the optimized kernels source directory if it exists. 
# There are occasions where one of the sub-configurations in the # config_list does not correspond to a kernels sub-directory, such # as when architecture B is so close to architecture A that B can # use A's kernel source code unmodified (though perhaps with # different blocksizes). #if [ -d "${kernels_dirpath}/${conf}" ]; then echo "${script_name}: mirroring ${kernels_dirpath}/${kern} to ${obj_kernels_dirpath}/${kern}" ${mirror_tree_sh} "${kernels_dirpath}/${kern}" "${obj_kernels_dirpath}/${kern}" #else # echo "${script_name}: mirroring ${kernels_dirpath}/${conf} skipped... directory does not exist" #fi done # Mirror reference kernel source tree to its object sub-directory. echo "${script_name}: mirroring ${refkern_dirpath} to ${obj_refkern_dirpath}" ${mirror_tree_sh} ${refkern_dirpath} ${obj_refkern_dirpath} # Mirror reference kernels source tree to its object sub-directory. for conf in ${config_list}; do echo "${script_name}: mirroring ${refkern_dirpath} to ${obj_refkern_dirpath}/${conf}" ${mirror_tree_sh} "${refkern_dirpath}" "${obj_refkern_dirpath}/${conf}" done # Mirror framework source tree to its object sub-directory. echo "${script_name}: mirroring ${frame_dirpath} to ${obj_frame_dirpath}" ${mirror_tree_sh} ${frame_dirpath} ${obj_frame_dirpath} # Mirror the chosen sandbox source tree to its object sub-directory. if [ -n "${sandbox_flag}" ]; then echo "${script_name}: mirroring ${sandbox_dirpath}/${sandbox} to ${obj_sandbox_dirpath}/${sandbox}" ${mirror_tree_sh} "${sandbox_dirpath}/${sandbox}" "${obj_sandbox_dirpath}/${sandbox}" fi # -- Generate makefile fragements ------------------------------------------ clist_contains_cname=$(is_in_list "${config_name}" "${config_list}") # If the config_list does not already contain the config_name (i.e., # if config_name is an umbrella family), generate makefiles in that # directory. (In the next step, we will loop over the actual sub- # configurations and create fragments there as well.) 
if [ "${clist_contains_cname}" == "false" ]; then echo "${script_name}: creating makefile fragments in ${obj_config_dirpath}/${config_name}" ${gen_make_frags_sh} \ -h -r -v0 \ -o ${script_name} \ -p 'CONFIG' \ ${config_dirpath}/${config_name} \ ${obj_config_dirpath}/${config_name} \ ${gen_make_frags_dirpath}/fragment.mk \ ${gen_make_frags_dirpath}/suffix_list \ ${gen_make_frags_dirpath}/ignore_list fi # Generate makefile fragments for each of the sub-configurations present # in the configuration list. for conf in ${config_list}; do echo "${script_name}: creating makefile fragments in ${obj_config_dirpath}/${conf}" ${gen_make_frags_sh} \ -h -r -v0 \ -o ${script_name} \ -p 'CONFIG' \ ${config_dirpath}/${conf} \ ${obj_config_dirpath}/${conf} \ ${gen_make_frags_dirpath}/fragment.mk \ ${gen_make_frags_dirpath}/suffix_list \ ${gen_make_frags_dirpath}/ignore_list done # Generate makefile fragments for each of the kernel sets required by # the configuration list (in the kernel list). for kern in ${kernel_list}; do echo "${script_name}: creating makefile fragments in ${obj_kernels_dirpath}/${kern}" ${gen_make_frags_sh} \ -h -r -v0 \ -o ${script_name} \ -p 'KERNELS' \ ${kernels_dirpath}/${kern} \ ${obj_kernels_dirpath}/${kern} \ ${gen_make_frags_dirpath}/fragment.mk \ ${gen_make_frags_dirpath}/suffix_list \ ${gen_make_frags_dirpath}/ignore_list done # Generate makefile fragments in the reference kernels directory. echo "${script_name}: creating makefile fragments in ${obj_refkern_dirpath}" ${gen_make_frags_sh} \ -h -r -v0 \ -o ${script_name} \ -p 'REFKERN' \ ${refkern_dirpath} \ ${obj_refkern_dirpath} \ ${gen_make_frags_dirpath}/fragment.mk \ ${gen_make_frags_dirpath}/suffix_list \ ${gen_make_frags_dirpath}/ignore_list # Generate makefile fragments in the framework directory. 
echo "${script_name}: creating makefile fragments in ${obj_frame_dirpath}" ${gen_make_frags_sh} \ -h -r -v0 \ -o ${script_name} \ -p 'FRAME' \ ${frame_dirpath} \ ${obj_frame_dirpath} \ ${gen_make_frags_dirpath}/fragment.mk \ ${gen_make_frags_dirpath}/suffix_list \ ${gen_make_frags_dirpath}/ignore_list # Generate makefile fragments in the sandbox sub-directory. if [ -n "${sandbox_flag}" ]; then echo "${script_name}: creating makefile fragments in ${obj_sandbox_dirpath}/${sandbox}" ${gen_make_frags_sh} \ -h -r -v0 \ -o ${script_name} \ -p 'SANDBOX' \ ${sandbox_dirpath}/${sandbox} \ ${obj_sandbox_dirpath}/${sandbox} \ ${gen_make_frags_dirpath}/fragment.mk \ ${gen_make_frags_dirpath}/suffix_list \ ${gen_make_frags_dirpath}/ignore_list fi # -- Handle out-of-tree builds --------------------------------------------- # Under some circumstances, we need to create some symbolic links to # properly handle out-of-tree builds. if [ "${configured_oot}" = "yes" ]; then # If 'Makefile' symlink does not already exist in the current # directory, create a symbolic link to it. If one does exist, we # use -f to force creation of a new link. if [ ! -e "./Makefile" ]; then echo "${script_name}: creating symbolic link to Makefile." ln -s "${dist_path}/Makefile" elif [ -h "./Makefile" ]; then echo "${script_name}: symbolic link to Makefile already exists; forcing creation of new link." ln -sf "${dist_path}/Makefile" else echo "${script_name}: Non-symbolic link file or directory 'Makefile' blocks creation of symlink." echo "${script_name}: *** Please remove this entity and re-run configure." exit 1 fi # If 'common.mk' symlink does not already exist in the current # directory, create a symbolic link to it. If one does exist, we # use -f to force creation of a new link. if [ ! -e "./common.mk" ]; then echo "${script_name}: creating symbolic link to common.mk." 
ln -s "${dist_path}/common.mk" elif [ -h "./common.mk" ]; then echo "${script_name}: symbolic link to common.mk already exists; forcing creation of new link." ln -sf "${dist_path}/common.mk" else echo "${script_name}: Non-symbolic link file or directory 'common.mk' blocks creation of symlink." echo "${script_name}: *** Please remove this entity and re-run configure." exit 1 fi # If 'config' symlink does not already exist in the current # directory, create a symbolic link to it. If one does exist, we # use -f to force creation of a new link. if [ ! -e "./config" ]; then echo "${script_name}: creating symbolic link to 'config' directory." ln -s "${dist_path}/config" elif [ -h "./config" ]; then echo "${script_name}: symbolic link to 'config' directory already exists; forcing creation of new link." ln -sf "${dist_path}/config" else echo "${script_name}: Non-symbolic link file or directory 'config' blocks creation of symlink." echo "${script_name}: *** Please remove this entity and re-run configure." exit 1 fi echo "${script_name}: configured to build outside of source distribution." else echo "${script_name}: configured to build within top-level directory of source distribution." fi # Exit peacefully. return 0 } # The script's main entry point, passing all parameters given. main "$@" cython-blis-0.9.1/extra-include/pthread.h000066400000000000000000001217071427272030600203330ustar00rootroot00000000000000/* This is an implementation of the threads API of POSIX 1003.1-2001. * * -------------------------------------------------------------------------- * * Pthreads-win32 - POSIX Threads Library for Win32 * Copyright(C) 1998 John E. Bossom * Copyright(C) 1999,2005 Pthreads-win32 contributors * * Contact Email: rpj@callisto.canberra.edu.au * * The current list of contributors is contained * in the file CONTRIBUTORS included with the source * code distribution. 
The list can also be seen at the * following World Wide Web location: * http://sources.redhat.com/pthreads-win32/contributors.html * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library in the file COPYING.LIB; * if not, write to the Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ // MH: We hack this in, see // https://stackoverflow.com/questions/33557506/timespec-redefinition-error #define HAVE_STRUCT_TIMESPEC #if !defined( PTHREAD_H ) #define PTHREAD_H /* * See the README file for an explanation of the pthreads-win32 version * numbering scheme and how the DLL is named etc. */ #define PTW32_VERSION 2,8,0,0 #define PTW32_VERSION_STRING "2, 8, 0, 0\0" /* There are three implementations of cancel cleanup. * Note that pthread.h is included in both application * compilation units and also internally for the library. * The code here and within the library aims to work * for all reasonable combinations of environments. * * The three implementations are: * * WIN32 SEH * C * C++ * * Please note that exiting a push/pop block via * "return", "exit", "break", or "continue" will * lead to different behaviour amongst applications * depending upon whether the library was built * using SEH, C++, or C. For example, a library built * with SEH will call the cleanup routine, while both * C++ and C built versions will not. */ /* * Define defaults for cleanup code. 
* Note: Unless the build explicitly defines one of the following, then * we default to standard C style cleanup. This style uses setjmp/longjmp * in the cancelation and thread exit implementations and therefore won't * do stack unwinding if linked to applications that have it (e.g. * C++ apps). This is currently consistent with most/all commercial Unix * POSIX threads implementations. */ #if !defined( __CLEANUP_SEH ) && !defined( __CLEANUP_CXX ) && !defined( __CLEANUP_C ) # define __CLEANUP_C #endif #if defined( __CLEANUP_SEH ) && ( !defined( _MSC_VER ) && !defined(PTW32_RC_MSC)) #error ERROR [__FILE__, line __LINE__]: SEH is not supported for this compiler. #endif /* * Stop here if we are being included by the resource compiler. */ #ifndef RC_INVOKED #undef PTW32_LEVEL #if defined(_POSIX_SOURCE) #define PTW32_LEVEL 0 /* Early POSIX */ #endif #if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309 #undef PTW32_LEVEL #define PTW32_LEVEL 1 /* Include 1b, 1c and 1d */ #endif #if defined(INCLUDE_NP) #undef PTW32_LEVEL #define PTW32_LEVEL 2 /* Include Non-Portable extensions */ #endif #define PTW32_LEVEL_MAX 3 #if !defined(PTW32_LEVEL) #define PTW32_LEVEL PTW32_LEVEL_MAX /* Include everything */ #endif #ifdef _UWIN # define HAVE_STRUCT_TIMESPEC 1 # define HAVE_SIGNAL_H 1 # undef HAVE_CONFIG_H # pragma comment(lib, "pthread") #endif /* * ------------------------------------------------------------- * * * Module: pthread.h * * Purpose: * Provides an implementation of PThreads based upon the * standard: * * POSIX 1003.1-2001 * and * The Single Unix Specification version 3 * * (these two are equivalent) * * in order to enhance code portability between Windows, * various commercial Unix implementations, and Linux. * * See the ANNOUNCE file for a full list of conforming * routines and defined constants, and a list of missing * routines and constants not defined in this implementation. * * Authors: * There have been many contributors to this library. 
* The initial implementation was contributed by * John Bossom, and several others have provided major * sections or revisions of parts of the implementation. * Often significant effort has been contributed to * find and fix important bugs and other problems to * improve the reliability of the library, which sometimes * is not reflected in the amount of code which changed as a * result. * As much as possible, the contributors are acknowledged * in the ChangeLog file in the source code distribution * where their changes are noted in detail. * * Contributors are listed in the CONTRIBUTORS file. * * As usual, all bouquets go to the contributors, and all * brickbats go to the project maintainer. * * Maintainer: * The code base for this project is coordinated and * eventually pre-tested, packaged, and made available by * * Ross Johnson * * QA Testers: * Ultimately, the library is tested in the real world by * a host of competent and demanding scientists and * engineers who report bugs and/or provide solutions * which are then fixed or incorporated into subsequent * versions of the library. Each time a bug is fixed, a * test case is written to prove the fix and ensure * that later changes to the code don't reintroduce the * same error. The number of test cases is slowly growing * and therefore so is the code reliability. * * Compliance: * See the file ANNOUNCE for the list of implemented * and not-implemented routines and defined options. * Of course, these are all defined in this file as well.
* * Web site: * The source code and other information about this library * are available from * * http://sources.redhat.com/pthreads-win32/ * * ------------------------------------------------------------- */ /* Try to avoid including windows.h */ #if defined(__MINGW32__) && defined(__cplusplus) #define PTW32_INCLUDE_WINDOWS_H #endif #ifdef PTW32_INCLUDE_WINDOWS_H #include #endif #if defined(_MSC_VER) && _MSC_VER < 1300 || defined(__DMC__) /* * VC++6.0 or early compiler's header has no DWORD_PTR type. */ typedef unsigned long DWORD_PTR; #endif /* * ----------------- * autoconf switches * ----------------- */ #if HAVE_CONFIG_H #include "config.h" #endif /* HAVE_CONFIG_H */ #ifndef NEED_FTIME #include #else /* NEED_FTIME */ /* use native WIN32 time API */ #endif /* NEED_FTIME */ #if HAVE_SIGNAL_H #include #endif /* HAVE_SIGNAL_H */ #include #include /* * Boolean values to make us independent of system includes. */ enum { PTW32_FALSE = 0, PTW32_TRUE = (! PTW32_FALSE) }; /* * This is a duplicate of what is in the autoconf config.h, * which is only used when building the pthread-win32 libraries. */ #ifndef PTW32_CONFIG_H # if defined(WINCE) # define NEED_ERRNO # define NEED_SEM # endif # if defined(_UWIN) || defined(__MINGW32__) # define HAVE_MODE_T # endif #endif /* * */ #if PTW32_LEVEL >= PTW32_LEVEL_MAX #ifdef NEED_ERRNO #include "need_errno.h" #else #include #endif #endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */ /* * Several systems don't define some error numbers. */ #ifndef ENOTSUP # define ENOTSUP 48 /* This is the value in Solaris. */ #endif #ifndef ETIMEDOUT # define ETIMEDOUT 10060 /* This is the value in winsock.h. */ #endif #ifndef ENOSYS # define ENOSYS 140 /* Semi-arbitrary value */ #endif #ifndef EDEADLK # ifdef EDEADLOCK # define EDEADLK EDEADLOCK # else # define EDEADLK 36 /* This is the value in MSVC. */ # endif #endif #include /* * To avoid including windows.h we define only those things that we * actually need from it. 
*/ /* Stand-ins for the HANDLE and DWORD names, supplied only when windows.h is not being included and the name is not already a macro; PTW32__HANDLE_DEF / PTW32__DWORD_DEF record that this header supplied them. */ #ifndef PTW32_INCLUDE_WINDOWS_H #ifndef HANDLE # define PTW32__HANDLE_DEF # define HANDLE void * #endif #ifndef DWORD # define PTW32__DWORD_DEF # define DWORD unsigned long #endif #endif /* Fallback struct timespec, declared only when HAVE_STRUCT_TIMESPEC is not already defined. */ #ifndef HAVE_STRUCT_TIMESPEC #define HAVE_STRUCT_TIMESPEC 1 struct timespec { long tv_sec; long tv_nsec; }; #endif /* HAVE_STRUCT_TIMESPEC */ /* Supply SIG_BLOCK, SIG_UNBLOCK and SIG_SETMASK when nothing earlier has defined them. */ #ifndef SIG_BLOCK #define SIG_BLOCK 0 #endif /* SIG_BLOCK */ #ifndef SIG_UNBLOCK #define SIG_UNBLOCK 1 #endif /* SIG_UNBLOCK */ #ifndef SIG_SETMASK #define SIG_SETMASK 2 #endif /* SIG_SETMASK */ #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /* * ------------------------------------------------------------- * * POSIX 1003.1-2001 Options * ========================= * * Options are normally set in <unistd.h>, which is not provided * with pthreads-win32. * * For conformance with the Single Unix Specification (version 3), all of the * options below are defined, and have a value of either -1 (not supported) * or 200112L (supported). * * These options can neither be left undefined nor have a value of 0, because * either indicates that sysconf(), which is not implemented, may be used at * runtime to check the status of the option. * * _POSIX_THREADS (== 200112L) * If == 200112L, you can use threads * * _POSIX_THREAD_ATTR_STACKSIZE (== 200112L) * If == 200112L, you can control the size of a thread's * stack * pthread_attr_getstacksize * pthread_attr_setstacksize * * _POSIX_THREAD_ATTR_STACKADDR (== -1) * If == 200112L, you can allocate and control a thread's * stack. If not supported, the following functions * will return ENOSYS, indicating they are not * supported: * pthread_attr_getstackaddr * pthread_attr_setstackaddr * * _POSIX_THREAD_PRIORITY_SCHEDULING (== -1) * If == 200112L, you can use realtime scheduling. * This option indicates that the behaviour of some * implemented functions conforms to the additional TPS * requirements in the standard. E.g. rwlocks favour * writers over readers when threads have equal priority.
* * _POSIX_THREAD_PRIO_INHERIT (== -1) * If == 200112L, you can create priority inheritance * mutexes. * pthread_mutexattr_getprotocol + * pthread_mutexattr_setprotocol + * * _POSIX_THREAD_PRIO_PROTECT (== -1) * If == 200112L, you can create priority ceiling mutexes * Indicates the availability of: * pthread_mutex_getprioceiling * pthread_mutex_setprioceiling * pthread_mutexattr_getprioceiling * pthread_mutexattr_getprotocol + * pthread_mutexattr_setprioceiling * pthread_mutexattr_setprotocol + * * _POSIX_THREAD_PROCESS_SHARED (== -1) * If set, you can create mutexes and condition * variables that can be shared with another * process. If set, indicates the availability * of: * pthread_mutexattr_getpshared * pthread_mutexattr_setpshared * pthread_condattr_getpshared * pthread_condattr_setpshared * * _POSIX_THREAD_SAFE_FUNCTIONS (== 200112L) * If == 200112L you can use the special *_r library * functions that provide thread-safe behaviour * * _POSIX_READER_WRITER_LOCKS (== 200112L) * If == 200112L, you can use read/write locks * * _POSIX_SPIN_LOCKS (== 200112L) * If == 200112L, you can use spin locks * * _POSIX_BARRIERS (== 200112L) * If == 200112L, you can use barriers * * + These functions provide both 'inherit' and/or * 'protect' protocol, based upon these macro * settings.
* * ------------------------------------------------------------- */ /* * POSIX Options * * Each option is #undef'd before it is defined, so any earlier definition is replaced by the value given here. */ #undef _POSIX_THREADS #define _POSIX_THREADS 200112L #undef _POSIX_READER_WRITER_LOCKS #define _POSIX_READER_WRITER_LOCKS 200112L #undef _POSIX_SPIN_LOCKS #define _POSIX_SPIN_LOCKS 200112L #undef _POSIX_BARRIERS #define _POSIX_BARRIERS 200112L #undef _POSIX_THREAD_SAFE_FUNCTIONS #define _POSIX_THREAD_SAFE_FUNCTIONS 200112L #undef _POSIX_THREAD_ATTR_STACKSIZE #define _POSIX_THREAD_ATTR_STACKSIZE 200112L /* * The following options are not supported */ #undef _POSIX_THREAD_ATTR_STACKADDR #define _POSIX_THREAD_ATTR_STACKADDR -1 #undef _POSIX_THREAD_PRIO_INHERIT #define _POSIX_THREAD_PRIO_INHERIT -1 #undef _POSIX_THREAD_PRIO_PROTECT #define _POSIX_THREAD_PRIO_PROTECT -1 /* TPS is not fully supported. */ #undef _POSIX_THREAD_PRIORITY_SCHEDULING #define _POSIX_THREAD_PRIORITY_SCHEDULING -1 #undef _POSIX_THREAD_PROCESS_SHARED #define _POSIX_THREAD_PROCESS_SHARED -1 /* * POSIX 1003.1-2001 Limits * =========================== * * These limits are normally set in <limits.h>, which is not provided with * pthreads-win32. * * PTHREAD_DESTRUCTOR_ITERATIONS * Maximum number of attempts to destroy * a thread's thread-specific data on * termination (must be at least 4) * * PTHREAD_KEYS_MAX * Maximum number of thread-specific data keys * available per process (must be at least 128) * * PTHREAD_STACK_MIN * Minimum supported stack size for a thread * * PTHREAD_THREADS_MAX * Maximum number of threads supported per * process (must be at least 64). * * SEM_NSEMS_MAX * The maximum number of semaphores a process can have. * (must be at least 256) * * SEM_VALUE_MAX * The maximum value a semaphore can have.
* (must be at least 32767) * */ #undef _POSIX_THREAD_DESTRUCTOR_ITERATIONS #define _POSIX_THREAD_DESTRUCTOR_ITERATIONS 4 #undef PTHREAD_DESTRUCTOR_ITERATIONS #define PTHREAD_DESTRUCTOR_ITERATIONS _POSIX_THREAD_DESTRUCTOR_ITERATIONS #undef _POSIX_THREAD_KEYS_MAX #define _POSIX_THREAD_KEYS_MAX 128 #undef PTHREAD_KEYS_MAX #define PTHREAD_KEYS_MAX _POSIX_THREAD_KEYS_MAX #undef PTHREAD_STACK_MIN #define PTHREAD_STACK_MIN 0 #undef _POSIX_THREAD_THREADS_MAX #define _POSIX_THREAD_THREADS_MAX 64 /* Arbitrary value */ #undef PTHREAD_THREADS_MAX #define PTHREAD_THREADS_MAX 2019 #undef _POSIX_SEM_NSEMS_MAX #define _POSIX_SEM_NSEMS_MAX 256 /* Arbitrary value */ #undef SEM_NSEMS_MAX #define SEM_NSEMS_MAX 1024 #undef _POSIX_SEM_VALUE_MAX #define _POSIX_SEM_VALUE_MAX 32767 #undef SEM_VALUE_MAX #define SEM_VALUE_MAX INT_MAX /* PTW32_DLLPORT below expands to __declspec (...) unless PTW32_STATIC_LIB is defined, so GNU compilers that do not define __declspec as a macro are rejected here. */ #if __GNUC__ && ! defined (__declspec) # error Please upgrade your GNU compiler to one that supports __declspec. #endif /* * When building the DLL code, you should define PTW32_BUILD so that * the variables/functions are exported correctly. When using the DLL, * do NOT define PTW32_BUILD, and then the variables/functions will * be imported correctly. */ #ifndef PTW32_STATIC_LIB # ifdef PTW32_BUILD # define PTW32_DLLPORT __declspec (dllexport) # else # define PTW32_DLLPORT __declspec (dllimport) # endif #else # define PTW32_DLLPORT #endif /* * The Open Watcom C/C++ compiler uses a non-standard calling convention * that passes function args in registers unless __cdecl is explicitly specified * in exposed function prototypes. * * We force all calls to cdecl even though this could slow Watcom code down * slightly. If you know that the Watcom compiler will be used to build both * the DLL and application, then you can probably define this as a null string. * Remember that pthread.h (this file) is used for both the DLL and application builds.
*/ #define PTW32_CDECL __cdecl #if defined(_UWIN) && PTW32_LEVEL >= PTW32_LEVEL_MAX # include #else /* * Generic handle type - intended to extend uniqueness beyond * that available with a simple pointer. It should scale for either * IA-32 or IA-64. */ typedef struct { void * p; /* Pointer to actual object */ unsigned int x; /* Extra information - reuse count etc */ } ptw32_handle_t; typedef ptw32_handle_t pthread_t; typedef struct pthread_attr_t_ * pthread_attr_t; typedef struct pthread_once_t_ pthread_once_t; typedef struct pthread_key_t_ * pthread_key_t; typedef struct pthread_mutex_t_ * pthread_mutex_t; typedef struct pthread_mutexattr_t_ * pthread_mutexattr_t; typedef struct pthread_cond_t_ * pthread_cond_t; typedef struct pthread_condattr_t_ * pthread_condattr_t; #endif typedef struct pthread_rwlock_t_ * pthread_rwlock_t; typedef struct pthread_rwlockattr_t_ * pthread_rwlockattr_t; typedef struct pthread_spinlock_t_ * pthread_spinlock_t; typedef struct pthread_barrier_t_ * pthread_barrier_t; typedef struct pthread_barrierattr_t_ * pthread_barrierattr_t; /* * ==================== * ==================== * POSIX Threads * ==================== * ==================== */ enum { /* * pthread_attr_{get,set}detachstate */ PTHREAD_CREATE_JOINABLE = 0, /* Default */ PTHREAD_CREATE_DETACHED = 1, /* * pthread_attr_{get,set}inheritsched */ PTHREAD_INHERIT_SCHED = 0, PTHREAD_EXPLICIT_SCHED = 1, /* Default */ /* * pthread_{get,set}scope */ PTHREAD_SCOPE_PROCESS = 0, PTHREAD_SCOPE_SYSTEM = 1, /* Default */ /* * pthread_setcancelstate paramters */ PTHREAD_CANCEL_ENABLE = 0, /* Default */ PTHREAD_CANCEL_DISABLE = 1, /* * pthread_setcanceltype parameters */ PTHREAD_CANCEL_ASYNCHRONOUS = 0, PTHREAD_CANCEL_DEFERRED = 1, /* Default */ /* * pthread_mutexattr_{get,set}pshared * pthread_condattr_{get,set}pshared */ PTHREAD_PROCESS_PRIVATE = 0, PTHREAD_PROCESS_SHARED = 1, /* * pthread_barrier_wait */ PTHREAD_BARRIER_SERIAL_THREAD = -1 }; /* * ==================== * ==================== 
* Cancelation * ==================== * ==================== */ #define PTHREAD_CANCELED ((void *) -1) /* * ==================== * ==================== * Once Key * ==================== * ==================== * * The PTHREAD_ONCE_INIT initialiser lists its values in the member order of struct pthread_once_t_: done, lock, reserved1, reserved2. */ #define PTHREAD_ONCE_INIT { PTW32_FALSE, 0, 0, 0} struct pthread_once_t_ { int done; /* indicates if user function has been executed */ void * lock; int reserved1; int reserved2; }; /* * ==================== * ==================== * Object initialisers * ==================== * ==================== * * Each static initialiser is a small negative integer (-1, -2 or -3) cast to the corresponding handle type. */ #define PTHREAD_MUTEX_INITIALIZER ((pthread_mutex_t) -1) #define PTHREAD_RECURSIVE_MUTEX_INITIALIZER ((pthread_mutex_t) -2) #define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER ((pthread_mutex_t) -3) /* * Compatibility with LinuxThreads */ #define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP PTHREAD_RECURSIVE_MUTEX_INITIALIZER #define PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP PTHREAD_ERRORCHECK_MUTEX_INITIALIZER #define PTHREAD_COND_INITIALIZER ((pthread_cond_t) -1) #define PTHREAD_RWLOCK_INITIALIZER ((pthread_rwlock_t) -1) #define PTHREAD_SPINLOCK_INITIALIZER ((pthread_spinlock_t) -1) /* * Mutex types.
*/ enum { /* Compatibility with LinuxThreads */ PTHREAD_MUTEX_FAST_NP, PTHREAD_MUTEX_RECURSIVE_NP, PTHREAD_MUTEX_ERRORCHECK_NP, PTHREAD_MUTEX_TIMED_NP = PTHREAD_MUTEX_FAST_NP, PTHREAD_MUTEX_ADAPTIVE_NP = PTHREAD_MUTEX_FAST_NP, /* For compatibility with POSIX */ PTHREAD_MUTEX_NORMAL = PTHREAD_MUTEX_FAST_NP, PTHREAD_MUTEX_RECURSIVE = PTHREAD_MUTEX_RECURSIVE_NP, PTHREAD_MUTEX_ERRORCHECK = PTHREAD_MUTEX_ERRORCHECK_NP, PTHREAD_MUTEX_DEFAULT = PTHREAD_MUTEX_NORMAL }; typedef struct ptw32_cleanup_t ptw32_cleanup_t; #if defined(_MSC_VER) /* Disable MSVC 'anachronism used' warning */ #pragma warning( disable : 4229 ) #endif typedef void (* PTW32_CDECL ptw32_cleanup_callback_t)(void *); #if defined(_MSC_VER) #pragma warning( default : 4229 ) #endif struct ptw32_cleanup_t { ptw32_cleanup_callback_t routine; void *arg; struct ptw32_cleanup_t *prev; }; /* In every variant below, pthread_cleanup_push opens a brace scope that the matching pthread_cleanup_pop closes, so the two macros must be used as a pair within the same block. */ #ifdef __CLEANUP_SEH /* * WIN32 SEH version of cancel cleanup. */ #define pthread_cleanup_push( _rout, _arg ) \ { \ ptw32_cleanup_t _cleanup; \ \ _cleanup.routine = (ptw32_cleanup_callback_t)(_rout); \ _cleanup.arg = (_arg); \ __try \ { \ #define pthread_cleanup_pop( _execute ) \ } \ __finally \ { \ if( _execute || AbnormalTermination()) \ { \ (*(_cleanup.routine))( _cleanup.arg ); \ } \ } \ } #else /* __CLEANUP_SEH */ #ifdef __CLEANUP_C /* * C implementation of PThreads cancel cleanup */ #define pthread_cleanup_push( _rout, _arg ) \ { \ ptw32_cleanup_t _cleanup; \ \ ptw32_push_cleanup( &_cleanup, (ptw32_cleanup_callback_t) (_rout), (_arg) ); \ #define pthread_cleanup_pop( _execute ) \ (void) ptw32_pop_cleanup( _execute ); \ } #else /* __CLEANUP_C */ #ifdef __CLEANUP_CXX /* * C++ version of cancel cleanup. * - John E. Bossom. */ class PThreadCleanup { /* * PThreadCleanup * * Purpose * This class is a C++ helper class that is * used to implement pthread_cleanup_push/ * pthread_cleanup_pop. * The destructor of this class automatically * pops the pushed cleanup routine regardless * of how the code exits the scope * (i.e.
such as by an exception) */ ptw32_cleanup_callback_t cleanUpRout; void * obj; int executeIt; public: PThreadCleanup() : cleanUpRout( 0 ), obj( 0 ), executeIt( 0 ) /* * No cleanup performed */ { } PThreadCleanup( ptw32_cleanup_callback_t routine, void * arg ) : cleanUpRout( routine ), obj( arg ), executeIt( 1 ) /* * Registers a cleanup routine for 'arg' */ { } ~PThreadCleanup() { if ( executeIt && ((void *) cleanUpRout != (void *) 0) ) { (void) (*cleanUpRout)( obj ); } } void execute( int exec ) { executeIt = exec; } }; /* * C++ implementation of PThreads cancel cleanup; * This implementation takes advantage of a helper * class whose destructor automatically calls the * cleanup routine if we exit our scope weirdly */ #define pthread_cleanup_push( _rout, _arg ) \ { \ PThreadCleanup cleanup((ptw32_cleanup_callback_t)(_rout), \ (void *) (_arg) ); #define pthread_cleanup_pop( _execute ) \ cleanup.execute( _execute ); \ } #else #error ERROR [__FILE__, line __LINE__]: Cleanup type undefined. #endif /* __CLEANUP_CXX */ #endif /* __CLEANUP_C */ #endif /* __CLEANUP_SEH */ /* * =============== * =============== * Methods * =============== * =============== */ /* * PThread Attribute Functions */ PTW32_DLLPORT int PTW32_CDECL pthread_attr_init (pthread_attr_t * attr); PTW32_DLLPORT int PTW32_CDECL pthread_attr_destroy (pthread_attr_t * attr); PTW32_DLLPORT int PTW32_CDECL pthread_attr_getdetachstate (const pthread_attr_t * attr, int *detachstate); PTW32_DLLPORT int PTW32_CDECL pthread_attr_getstackaddr (const pthread_attr_t * attr, void **stackaddr); PTW32_DLLPORT int PTW32_CDECL pthread_attr_getstacksize (const pthread_attr_t * attr, size_t * stacksize); PTW32_DLLPORT int PTW32_CDECL pthread_attr_setdetachstate (pthread_attr_t * attr, int detachstate); PTW32_DLLPORT int PTW32_CDECL pthread_attr_setstackaddr (pthread_attr_t * attr, void *stackaddr); PTW32_DLLPORT int PTW32_CDECL pthread_attr_setstacksize (pthread_attr_t * attr, size_t stacksize); PTW32_DLLPORT int PTW32_CDECL
pthread_attr_getschedparam (const pthread_attr_t *attr, struct sched_param *param); PTW32_DLLPORT int PTW32_CDECL pthread_attr_setschedparam (pthread_attr_t *attr, const struct sched_param *param); PTW32_DLLPORT int PTW32_CDECL pthread_attr_setschedpolicy (pthread_attr_t *, int); PTW32_DLLPORT int PTW32_CDECL pthread_attr_getschedpolicy (pthread_attr_t *, int *); PTW32_DLLPORT int PTW32_CDECL pthread_attr_setinheritsched(pthread_attr_t * attr, int inheritsched); PTW32_DLLPORT int PTW32_CDECL pthread_attr_getinheritsched(pthread_attr_t * attr, int * inheritsched); PTW32_DLLPORT int PTW32_CDECL pthread_attr_setscope (pthread_attr_t *, int); PTW32_DLLPORT int PTW32_CDECL pthread_attr_getscope (const pthread_attr_t *, int *); /* * PThread Functions */ PTW32_DLLPORT int PTW32_CDECL pthread_create (pthread_t * tid, const pthread_attr_t * attr, void *(*start) (void *), void *arg); PTW32_DLLPORT int PTW32_CDECL pthread_detach (pthread_t tid); PTW32_DLLPORT int PTW32_CDECL pthread_equal (pthread_t t1, pthread_t t2); PTW32_DLLPORT void PTW32_CDECL pthread_exit (void *value_ptr); PTW32_DLLPORT int PTW32_CDECL pthread_join (pthread_t thread, void **value_ptr); PTW32_DLLPORT pthread_t PTW32_CDECL pthread_self (void); PTW32_DLLPORT int PTW32_CDECL pthread_cancel (pthread_t thread); PTW32_DLLPORT int PTW32_CDECL pthread_setcancelstate (int state, int *oldstate); PTW32_DLLPORT int PTW32_CDECL pthread_setcanceltype (int type, int *oldtype); PTW32_DLLPORT void PTW32_CDECL pthread_testcancel (void); PTW32_DLLPORT int PTW32_CDECL pthread_once (pthread_once_t * once_control, void (*init_routine) (void)); #if PTW32_LEVEL >= PTW32_LEVEL_MAX PTW32_DLLPORT ptw32_cleanup_t * PTW32_CDECL ptw32_pop_cleanup (int execute); PTW32_DLLPORT void PTW32_CDECL ptw32_push_cleanup (ptw32_cleanup_t * cleanup, void (*routine) (void *), void *arg); #endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */ /* * Thread Specific Data Functions */ PTW32_DLLPORT int PTW32_CDECL pthread_key_create (pthread_key_t * key, void
(*destructor) (void *)); PTW32_DLLPORT int PTW32_CDECL pthread_key_delete (pthread_key_t key); PTW32_DLLPORT int PTW32_CDECL pthread_setspecific (pthread_key_t key, const void *value); PTW32_DLLPORT void * PTW32_CDECL pthread_getspecific (pthread_key_t key); /* * Mutex Attribute Functions */ PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_init (pthread_mutexattr_t * attr); PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_destroy (pthread_mutexattr_t * attr); PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_getpshared (const pthread_mutexattr_t * attr, int *pshared); PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_setpshared (pthread_mutexattr_t * attr, int pshared); PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_settype (pthread_mutexattr_t * attr, int kind); PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_gettype (pthread_mutexattr_t * attr, int *kind); /* * Barrier Attribute Functions */ PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_init (pthread_barrierattr_t * attr); PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_destroy (pthread_barrierattr_t * attr); PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_getpshared (const pthread_barrierattr_t * attr, int *pshared); PTW32_DLLPORT int PTW32_CDECL pthread_barrierattr_setpshared (pthread_barrierattr_t * attr, int pshared); /* * Mutex Functions */ PTW32_DLLPORT int PTW32_CDECL pthread_mutex_init (pthread_mutex_t * mutex, const pthread_mutexattr_t * attr); PTW32_DLLPORT int PTW32_CDECL pthread_mutex_destroy (pthread_mutex_t * mutex); PTW32_DLLPORT int PTW32_CDECL pthread_mutex_lock (pthread_mutex_t * mutex); PTW32_DLLPORT int PTW32_CDECL pthread_mutex_timedlock(pthread_mutex_t *mutex, const struct timespec *abstime); PTW32_DLLPORT int PTW32_CDECL pthread_mutex_trylock (pthread_mutex_t * mutex); PTW32_DLLPORT int PTW32_CDECL pthread_mutex_unlock (pthread_mutex_t * mutex); /* * Spinlock Functions */ PTW32_DLLPORT int PTW32_CDECL pthread_spin_init (pthread_spinlock_t * lock, int pshared); PTW32_DLLPORT int
PTW32_CDECL pthread_spin_destroy (pthread_spinlock_t * lock); PTW32_DLLPORT int PTW32_CDECL pthread_spin_lock (pthread_spinlock_t * lock); PTW32_DLLPORT int PTW32_CDECL pthread_spin_trylock (pthread_spinlock_t * lock); PTW32_DLLPORT int PTW32_CDECL pthread_spin_unlock (pthread_spinlock_t * lock); /* * Barrier Functions */ PTW32_DLLPORT int PTW32_CDECL pthread_barrier_init (pthread_barrier_t * barrier, const pthread_barrierattr_t * attr, unsigned int count); PTW32_DLLPORT int PTW32_CDECL pthread_barrier_destroy (pthread_barrier_t * barrier); PTW32_DLLPORT int PTW32_CDECL pthread_barrier_wait (pthread_barrier_t * barrier); /* * Condition Variable Attribute Functions */ PTW32_DLLPORT int PTW32_CDECL pthread_condattr_init (pthread_condattr_t * attr); PTW32_DLLPORT int PTW32_CDECL pthread_condattr_destroy (pthread_condattr_t * attr); PTW32_DLLPORT int PTW32_CDECL pthread_condattr_getpshared (const pthread_condattr_t * attr, int *pshared); PTW32_DLLPORT int PTW32_CDECL pthread_condattr_setpshared (pthread_condattr_t * attr, int pshared); /* * Condition Variable Functions */ PTW32_DLLPORT int PTW32_CDECL pthread_cond_init (pthread_cond_t * cond, const pthread_condattr_t * attr); PTW32_DLLPORT int PTW32_CDECL pthread_cond_destroy (pthread_cond_t * cond); PTW32_DLLPORT int PTW32_CDECL pthread_cond_wait (pthread_cond_t * cond, pthread_mutex_t * mutex); PTW32_DLLPORT int PTW32_CDECL pthread_cond_timedwait (pthread_cond_t * cond, pthread_mutex_t * mutex, const struct timespec *abstime); PTW32_DLLPORT int PTW32_CDECL pthread_cond_signal (pthread_cond_t * cond); PTW32_DLLPORT int PTW32_CDECL pthread_cond_broadcast (pthread_cond_t * cond); /* * Scheduling */ PTW32_DLLPORT int PTW32_CDECL pthread_setschedparam (pthread_t thread, int policy, const struct sched_param *param); PTW32_DLLPORT int PTW32_CDECL pthread_getschedparam (pthread_t thread, int *policy, struct sched_param *param); PTW32_DLLPORT int PTW32_CDECL pthread_setconcurrency (int); PTW32_DLLPORT int PTW32_CDECL
pthread_getconcurrency (void); /* * Read-Write Lock Functions */ PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_init(pthread_rwlock_t *lock, const pthread_rwlockattr_t *attr); PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_destroy(pthread_rwlock_t *lock); PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_tryrdlock(pthread_rwlock_t *); PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_trywrlock(pthread_rwlock_t *); PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_rdlock(pthread_rwlock_t *lock); PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_timedrdlock(pthread_rwlock_t *lock, const struct timespec *abstime); PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_wrlock(pthread_rwlock_t *lock); PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_timedwrlock(pthread_rwlock_t *lock, const struct timespec *abstime); PTW32_DLLPORT int PTW32_CDECL pthread_rwlock_unlock(pthread_rwlock_t *lock); PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_init (pthread_rwlockattr_t * attr); PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_destroy (pthread_rwlockattr_t * attr); PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_getpshared (const pthread_rwlockattr_t * attr, int *pshared); PTW32_DLLPORT int PTW32_CDECL pthread_rwlockattr_setpshared (pthread_rwlockattr_t * attr, int pshared); #if PTW32_LEVEL >= PTW32_LEVEL_MAX - 1 /* * Signal Functions. Should be defined in <signal.h> but MSVC and MinGW32 * already have signal.h that don't define these. */ PTW32_DLLPORT int PTW32_CDECL pthread_kill(pthread_t thread, int sig); /* * Non-portable functions */ /* * Compatibility with Linux.
*/ PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_setkind_np(pthread_mutexattr_t * attr, int kind); PTW32_DLLPORT int PTW32_CDECL pthread_mutexattr_getkind_np(pthread_mutexattr_t * attr, int *kind); /* * Possibly supported by other POSIX threads implementations */ PTW32_DLLPORT int PTW32_CDECL pthread_delay_np (struct timespec * interval); PTW32_DLLPORT int PTW32_CDECL pthread_num_processors_np(void); /* * Useful if an application wants to statically link * the lib rather than load the DLL at run-time. */ PTW32_DLLPORT int PTW32_CDECL pthread_win32_process_attach_np(void); PTW32_DLLPORT int PTW32_CDECL pthread_win32_process_detach_np(void); PTW32_DLLPORT int PTW32_CDECL pthread_win32_thread_attach_np(void); PTW32_DLLPORT int PTW32_CDECL pthread_win32_thread_detach_np(void); /* * Features that are auto-detected at load/run time. */ PTW32_DLLPORT int PTW32_CDECL pthread_win32_test_features_np(int); enum ptw32_features { PTW32_SYSTEM_INTERLOCKED_COMPARE_EXCHANGE = 0x0001, /* System provides it. */ PTW32_ALERTABLE_ASYNC_CANCEL = 0x0002 /* Can cancel blocked threads. */ }; /* * Register a system time change with the library. * Causes the library to perform various functions * in response to the change. Should be called whenever * the application's top level window receives a * WM_TIMECHANGE message. It can be passed directly to * pthread_create() as a new thread if desired. */ PTW32_DLLPORT void * PTW32_CDECL pthread_timechange_handler_np(void *); #endif /*PTW32_LEVEL >= PTW32_LEVEL_MAX - 1 */ #if PTW32_LEVEL >= PTW32_LEVEL_MAX /* * Returns the Win32 HANDLE for the POSIX thread. */ PTW32_DLLPORT HANDLE PTW32_CDECL pthread_getw32threadhandle_np(pthread_t thread); /* * Protected Methods * * This function blocks until the given WIN32 handle * is signaled or pthread_cancel had been called. * This function allows the caller to hook into the * PThreads cancel mechanism. 
It is implemented using * * WaitForMultipleObjects * * on 'waitHandle' and a manually reset WIN32 Event * used to implement pthread_cancel. The 'timeout' * argument to TimedWait is simply passed to * WaitForMultipleObjects. */ PTW32_DLLPORT int PTW32_CDECL pthreadCancelableWait (HANDLE waitHandle); PTW32_DLLPORT int PTW32_CDECL pthreadCancelableTimedWait (HANDLE waitHandle, DWORD timeout); #endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */ /* * Thread-Safe C Runtime Library Mappings. */ #ifndef _UWIN # if defined(NEED_ERRNO) PTW32_DLLPORT int * PTW32_CDECL _errno( void ); # else # ifndef errno # if (defined(_MT) || defined(_DLL)) __declspec(dllimport) extern int * __cdecl _errno(void); # define errno (*_errno()) # endif # endif # endif #endif /* * WIN32 C runtime library had been made thread-safe * without affecting the user interface. Provide * mappings from the UNIX thread-safe versions to * the standard C runtime library calls. * Only provide function mappings for functions that * actually exist on WIN32. */ #if !defined(__MINGW32__) #define strtok_r( _s, _sep, _lasts ) \ ( *(_lasts) = strtok( (_s), (_sep) ) ) #endif /* !__MINGW32__ */ #define asctime_r( _tm, _buf ) \ ( strcpy( (_buf), asctime( (_tm) ) ), \ (_buf) ) #define ctime_r( _clock, _buf ) \ ( strcpy( (_buf), ctime( (_clock) ) ), \ (_buf) ) #define gmtime_r( _clock, _result ) \ ( *(_result) = *gmtime( (_clock) ), \ (_result) ) #define localtime_r( _clock, _result ) \ ( *(_result) = *localtime( (_clock) ), \ (_result) ) #define rand_r( _seed ) \ ( _seed == _seed? rand() : rand() ) /* * Some compiler environments don't define some things. 
*/ #if defined(__BORLANDC__) # define _ftime ftime # define _timeb timeb #endif #ifdef __cplusplus /* * Internal exceptions */ class ptw32_exception {}; class ptw32_exception_cancel : public ptw32_exception {}; class ptw32_exception_exit : public ptw32_exception {}; #endif #if PTW32_LEVEL >= PTW32_LEVEL_MAX /* FIXME: This is only required if the library was built using SEH */ /* * Get internal SEH tag */ PTW32_DLLPORT DWORD PTW32_CDECL ptw32_get_exception_services_code(void); #endif /* PTW32_LEVEL >= PTW32_LEVEL_MAX */ #ifndef PTW32_BUILD #ifdef __CLEANUP_SEH /* * Redefine the SEH __except keyword to ensure that applications * propagate our internal exceptions up to the library's internal handlers. */ #define __except( E ) \ __except( ( GetExceptionCode() == ptw32_get_exception_services_code() ) \ ? EXCEPTION_CONTINUE_SEARCH : ( E ) ) #endif /* __CLEANUP_SEH */ #ifdef __CLEANUP_CXX /* * Redefine the C++ catch keyword to ensure that applications * propagate our internal exceptions up to the library's internal handlers. */ #ifdef _MSC_VER /* * WARNING: Replace any 'catch( ... )' with 'PtW32CatchAll' * if you want Pthread-Win32 cancelation and pthread_exit to work. */ #ifndef PtW32NoCatchWarn #pragma message("Specify \"/DPtW32NoCatchWarn\" compiler flag to skip this message.") #pragma message("------------------------------------------------------------------") #pragma message("When compiling applications with MSVC++ and C++ exception handling:") #pragma message(" Replace any 'catch( ... )' in routines called from POSIX threads") #pragma message(" with 'PtW32CatchAll' or 'CATCHALL' if you want POSIX thread") #pragma message(" cancelation and pthread_exit to work. 
For example:") #pragma message("") #pragma message(" #ifdef PtW32CatchAll") #pragma message(" PtW32CatchAll") #pragma message(" #else") #pragma message(" catch(...)") #pragma message(" #endif") #pragma message(" {") #pragma message(" /* Catchall block processing */") #pragma message(" }") #pragma message("------------------------------------------------------------------") #endif #define PtW32CatchAll \ catch( ptw32_exception & ) { throw; } \ catch( ... ) #else /* _MSC_VER */ #define catch( E ) \ catch( ptw32_exception & ) { throw; } \ catch( E ) #endif /* _MSC_VER */ #endif /* __CLEANUP_CXX */ #endif /* ! PTW32_BUILD */ #ifdef __cplusplus } /* End of extern "C" */ #endif /* __cplusplus */ #ifdef PTW32__HANDLE_DEF # undef HANDLE #endif #ifdef PTW32__DWORD_DEF # undef DWORD #endif #undef PTW32_LEVEL #undef PTW32_LEVEL_MAX #endif /* ! RC_INVOKED */ #endif /* PTHREAD_H */ cython-blis-0.9.1/extra-include/sched.h000066400000000000000000000206651427272030600177730ustar00rootroot00000000000000/* * Module: sched.h * * Purpose: * Provides an implementation of POSIX realtime extensions * as defined in * * POSIX 1003.1b-1993 (POSIX.1b) * * -------------------------------------------------------------------------- * * Pthreads-win32 - POSIX Threads Library for Win32 * Copyright(C) 1998 John E. Bossom * Copyright(C) 1999,2012 Pthreads-win32 contributors * * Homepage1: http://sourceware.org/pthreads-win32/ * Homepage2: http://sourceforge.net/projects/pthreads4w/ * * The current list of contributors is contained * in the file CONTRIBUTORS included with the source * code distribution. The list can also be seen at the * following World Wide Web location: * http://sources.redhat.com/pthreads-win32/contributors.html * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library in the file COPYING.LIB; * if not, write to the Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ #if !defined(_SCHED_H) #define _SCHED_H #if defined(_MSC_VER) # if _MSC_VER < 1300 # define PTW32_CONFIG_MSVC6 # endif # if _MSC_VER < 1400 # define PTW32_CONFIG_MSVC7 # endif #endif #undef PTW32_SCHED_LEVEL #if defined(_POSIX_SOURCE) #define PTW32_SCHED_LEVEL 0 /* Early POSIX */ #endif #if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 199309 #undef PTW32_SCHED_LEVEL #define PTW32_SCHED_LEVEL 1 /* Include 1b, 1c and 1d */ #endif #if defined(INCLUDE_NP) #undef PTW32_SCHED_LEVEL #define PTW32_SCHED_LEVEL 2 /* Include Non-Portable extensions */ #endif #define PTW32_SCHED_LEVEL_MAX 3 // MH: Hack this out to avoid an annoying warning //#if ( defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112 ) || !defined(PTW32_SCHED_LEVEL) //#define PTW32_SCHED_LEVEL PTW32_SCHED_LEVEL_MAX ///* Include everything */ //#endif #if defined(__GNUC__) && !defined(__declspec) # error Please upgrade your GNU compiler to one that supports __declspec. #endif /* * When building the library, you should define PTW32_BUILD so that * the variables/functions are exported correctly. When using the library, * do NOT define PTW32_BUILD, and then the variables/functions will * be imported correctly. 
*/ #if !defined(PTW32_STATIC_LIB) # if defined(PTW32_BUILD) # define PTW32_DLLPORT __declspec (dllexport) # else # define PTW32_DLLPORT __declspec (dllimport) # endif #else # define PTW32_DLLPORT #endif /* * The Open Watcom C/C++ compiler uses a non-standard calling convention * that passes function args in registers unless __cdecl is explicitly specified * in exposed function prototypes. * * We force all calls to cdecl even though this could slow Watcom code down * slightly. If you know that the Watcom compiler will be used to build both * the DLL and application, then you can probably define this as a null string. * Remember that sched.h (this file) is used for both the DLL and application builds. */ #if !defined(PTW32_CDECL) # define PTW32_CDECL __cdecl #endif /* * This is a duplicate of what is in the autoconf config.h, * which is only used when building the pthread-win32 libraries. */ #if !defined(PTW32_CONFIG_H) # if defined(WINCE) # define NEED_ERRNO # define NEED_SEM # endif # if defined(__MINGW64__) # define HAVE_STRUCT_TIMESPEC # define HAVE_MODE_T # elif defined(_UWIN) || defined(__MINGW32__) # define HAVE_MODE_T # endif #endif /* * */ #include #if PTW32_SCHED_LEVEL >= PTW32_SCHED_LEVEL_MAX #if defined(NEED_ERRNO) #include "need_errno.h" #else #include #endif #endif /* PTW32_SCHED_LEVEL >= PTW32_SCHED_LEVEL_MAX */ #if (defined(__MINGW64__) || defined(__MINGW32__)) || defined(_UWIN) # if PTW32_SCHED_LEVEL >= PTW32_SCHED_LEVEL_MAX /* For pid_t */ # include /* Required by Unix 98 */ # include # else typedef int pid_t; # endif #else /* [i_a] fix for using pthread_win32 with mongoose code, which #define's its own pid_t akin to typedef HANDLE pid_t; */ #undef pid_t # if defined(_MSC_VER) typedef void *pid_t; # else typedef int pid_t; # endif #endif /* * Microsoft VC++6.0 lacks these *_PTR types */ #if defined(_MSC_VER) && _MSC_VER < 1300 && !defined(PTW32_HAVE_DWORD_PTR) typedef unsigned long ULONG_PTR; typedef ULONG_PTR DWORD_PTR; #endif /* Thread scheduling 
policies */ enum { SCHED_OTHER = 0, SCHED_FIFO, SCHED_RR, SCHED_MIN = SCHED_OTHER, SCHED_MAX = SCHED_RR }; struct sched_param { int sched_priority; }; /* * CPU affinity * * cpu_set_t: * Considered opaque but cannot be an opaque pointer * due to the need for compatibility with GNU systems * and sched_setaffinity() et.al. which include the * cpusetsize parameter "normally set to sizeof(cpu_set_t)". */ #define CPU_SETSIZE (sizeof(size_t)*8) #define CPU_COUNT(setptr) (_sched_affinitycpucount(setptr)) #define CPU_ZERO(setptr) (_sched_affinitycpuzero(setptr)) #define CPU_SET(cpu, setptr) (_sched_affinitycpuset((cpu),(setptr))) #define CPU_CLR(cpu, setptr) (_sched_affinitycpuclr((cpu),(setptr))) #define CPU_ISSET(cpu, setptr) (_sched_affinitycpuisset((cpu),(setptr))) #define CPU_AND(destsetptr, srcset1ptr, srcset2ptr) (_sched_affinitycpuand((destsetptr),(srcset1ptr),(srcset2ptr))) #define CPU_OR(destsetptr, srcset1ptr, srcset2ptr) (_sched_affinitycpuor((destsetptr),(srcset1ptr),(srcset2ptr))) #define CPU_XOR(destsetptr, srcset1ptr, srcset2ptr) (_sched_affinitycpuxor((destsetptr),(srcset1ptr),(srcset2ptr))) #define CPU_EQUAL(set1ptr, set2ptr) (_sched_affinitycpuequal((set1ptr),(set2ptr))) typedef union { char cpuset[CPU_SETSIZE/8]; size_t _align; } cpu_set_t; #if defined(__cplusplus) extern "C" { #endif /* __cplusplus */ PTW32_DLLPORT int PTW32_CDECL sched_yield (void); PTW32_DLLPORT int PTW32_CDECL sched_get_priority_min (int policy); PTW32_DLLPORT int PTW32_CDECL sched_get_priority_max (int policy); PTW32_DLLPORT int PTW32_CDECL sched_setscheduler (pid_t pid, int policy); PTW32_DLLPORT int PTW32_CDECL sched_getscheduler (pid_t pid); /* Compatibility with Linux - not standard */ PTW32_DLLPORT int PTW32_CDECL sched_setaffinity (pid_t pid, size_t cpusetsize, cpu_set_t *mask); PTW32_DLLPORT int PTW32_CDECL sched_getaffinity (pid_t pid, size_t cpusetsize, cpu_set_t *mask); /* * Support routines and macros for cpu_set_t */ PTW32_DLLPORT int PTW32_CDECL _sched_affinitycpucount 
(const cpu_set_t *set); PTW32_DLLPORT void PTW32_CDECL _sched_affinitycpuzero (cpu_set_t *pset); PTW32_DLLPORT void PTW32_CDECL _sched_affinitycpuset (int cpu, cpu_set_t *pset); PTW32_DLLPORT void PTW32_CDECL _sched_affinitycpuclr (int cpu, cpu_set_t *pset); PTW32_DLLPORT int PTW32_CDECL _sched_affinitycpuisset (int cpu, const cpu_set_t *pset); PTW32_DLLPORT void PTW32_CDECL _sched_affinitycpuand(cpu_set_t *pdestset, const cpu_set_t *psrcset1, const cpu_set_t *psrcset2); PTW32_DLLPORT void PTW32_CDECL _sched_affinitycpuor(cpu_set_t *pdestset, const cpu_set_t *psrcset1, const cpu_set_t *psrcset2); PTW32_DLLPORT void PTW32_CDECL _sched_affinitycpuxor(cpu_set_t *pdestset, const cpu_set_t *psrcset1, const cpu_set_t *psrcset2); PTW32_DLLPORT int PTW32_CDECL _sched_affinitycpuequal (const cpu_set_t *pset1, const cpu_set_t *pset2); /* * Note that this macro returns ENOTSUP rather than * ENOSYS as might be expected. However, returning ENOSYS * should mean that sched_get_priority_{min,max} are * not implemented as well as sched_rr_get_interval. * This is not the case, since we just don't support * round-robin scheduling. Therefore I have chosen to * return the same value as sched_setscheduler when * SCHED_RR is passed to it. 
*/ #define sched_rr_get_interval(_pid, _interval) \ ( errno = ENOTSUP, (int) -1 ) #if defined(__cplusplus) } /* End of extern "C" */ #endif /* __cplusplus */ #undef PTW32_SCHED_LEVEL #undef PTW32_SCHED_LEVEL_MAX #endif /* !_SCHED_H */ cython-blis-0.9.1/fabfile.py000066400000000000000000000053011427272030600157200ustar00rootroot00000000000000# coding: utf-8 from __future__ import unicode_literals, print_function import contextlib from pathlib import Path from fabric.api import local, lcd, env, settings, prefix from os import path, environ import shutil import sys PWD = path.dirname(__file__) ENV = environ['VENV_DIR'] if 'VENV_DIR' in environ else '.env' VENV_DIR = Path(PWD) / ENV @contextlib.contextmanager def virtualenv(name, create=False, python='/usr/bin/python3.6'): python = Path(python).resolve() env_path = VENV_DIR if create: if env_path.exists(): shutil.rmtree(str(env_path)) local('{python} -m venv {env_path}'.format(python=python, env_path=VENV_DIR)) def wrapped_local(cmd, env_vars=[], capture=False, direct=False): return local('source {}/bin/activate && {}'.format(env_path, cmd), shell='/bin/bash', capture=False) yield wrapped_local def env(lang='python3.6'): if VENV_DIR.exists(): local('rm -rf {env}'.format(env=VENV_DIR)) if lang.startswith('python3'): local('{lang} -m venv {env}'.format(lang=lang, env=VENV_DIR)) else: local('{lang} -m pip install virtualenv --no-cache-dir'.format(lang=lang)) local('{lang} -m virtualenv {env} --no-cache-dir'.format(lang=lang, env=VENV_DIR)) with virtualenv(VENV_DIR) as venv_local: print(venv_local('python --version', capture=True)) venv_local('pip install --upgrade setuptools --no-cache-dir') venv_local('pip install pytest --no-cache-dir') venv_local('pip install wheel --no-cache-dir') venv_local('pip install -r requirements.txt --no-cache-dir') venv_local('pip install pex --no-cache-dir') def install(): with virtualenv(VENV_DIR) as venv_local: venv_local('pip install dist/*.tar.gz') def make(): with 
lcd(path.dirname(__file__)): local('export PYTHONPATH=`pwd` && source .env/bin/activate && python setup.py build_ext --inplace', shell='/bin/bash') def sdist(): with virtualenv(VENV_DIR) as venv_local: with lcd(path.dirname(__file__)): venv_local('python -m pip install -U setuptools') venv_local('python setup.py sdist') def wheel(): with virtualenv(VENV_DIR) as venv_local: with lcd(path.dirname(__file__)): venv_local('python setup.py bdist_wheel') def clean(): with lcd(path.dirname(__file__)): local('rm -f dist/*.whl') local('rm -f dist/*.pex') with virtualenv(VENV_DIR) as venv_local: venv_local('python setup.py clean --all') def test(): with virtualenv(VENV_DIR) as venv_local: with lcd(path.dirname(__file__)): venv_local('PYTHONPATH=`pwd` pytest -x tests') cython-blis-0.9.1/flame-blis/000077500000000000000000000000001427272030600157725ustar00rootroot00000000000000cython-blis-0.9.1/pyproject.toml000066400000000000000000000002011427272030600166640ustar00rootroot00000000000000[build-system] requires = [ "setuptools", "cython>=0.25", "numpy>=1.15.0", ] build-backend = "setuptools.build_meta" cython-blis-0.9.1/requirements.txt000066400000000000000000000001011427272030600172330ustar00rootroot00000000000000# Test requirements numpy pytest cython hypothesis>=4.0.0,<6.0.0 cython-blis-0.9.1/setup.py000066400000000000000000000324761427272030600155050ustar00rootroot00000000000000#!/usr/bin/env python import shutil import os # This is maybe not the best place to put this, # but we need to tell OSX to build for 10.7. # Otherwise, wheels don't work. We can't use 10.6, # it doesn't compile. 
# if "MACOSX_DEPLOYMENT_TARGET" not in os.environ: # os.environ["MACOSX_DEPLOYMENT_TARGET"] = "10.7" from setuptools import Extension, setup import contextlib import io import os.path import json import tempfile import distutils.command.build_ext from distutils.ccompiler import new_compiler from Cython.Build import cythonize import subprocess import sys import platform import numpy PLATFORM_TO_ARCH = { "linux": { "ppc64le": "power9", }, "darwin": {"arm64": "firestorm"}, "windows": { "ARM64": "generic", }, } MOD_NAMES = ["blis.cy", "blis.py"] print("BLIS_COMPILER?", os.environ.get("BLIS_COMPILER", "None")) def clean(path): if os.path.exists(os.path.join(PWD, "build")): shutil.rmtree(os.path.join(PWD, "build")) for name in MOD_NAMES: name = name.replace(".", "/") for ext in [".so", ".html", ".cpp", ".c"]: file_path = os.path.join(path, name + ext) if os.path.exists(file_path): os.unlink(file_path) def locate_windows_llvm(): # first check if the LLVM_HOME env variable is in use if "LLVM_HOME" in os.environ: home = os.environ["LLVM_HOME"] return os.path.join(home, "bin", "clang.exe") else: # otherwise, search the PATH for clang.exe clang = find_in_path("clang.exe", os.environ["PATH"]) if clang is None: clang = r"C:\Program Files\LLVM\bin\clang.exe" return clang def find_in_path(name, path): "Find a file in a search path" # adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ for dir in path.split(os.pathsep): binpath = os.path.join(dir, name) if os.path.exists(binpath): return os.path.abspath(binpath) return None # By subclassing build_extensions we have the actual compiler that will be used # which is really known only after finalize_options # http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used class build_ext_options: def build_options(self): if hasattr(self.compiler, "initialize"): self.compiler.initialize() self.compiler.platform = sys.platform[:6] print("Build options", 
self.compiler.platform, self.compiler.compiler_type) self.compiler.include_dirs = [numpy.get_include()] + self.compiler.include_dirs if self.compiler.compiler_type == "msvc": include_dirs = list(self.compiler.include_dirs) library_dirs = list(self.compiler.library_dirs) self.compiler = new_compiler(plat="nt", compiler="unix") self.compiler.platform = "nt" self.compiler.compiler_type = "msvc" self.compiler.compiler = [locate_windows_llvm()] self.compiler.compiler_so = list(self.compiler.compiler) self.compiler.preprocessor = list(self.compiler.compiler) self.compiler.linker = list(self.compiler.compiler) + ["-shared"] self.compiler.linker_so = list(self.compiler.linker) self.compiler.linker_exe = list(self.compiler.linker) self.compiler.archiver = ["llvm-ar"] self.compiler.library_dirs.extend(library_dirs) self.compiler.include_dirs = include_dirs class ExtensionBuilder(distutils.command.build_ext.build_ext, build_ext_options): def build_extensions(self): build_ext_options.build_options(self) if sys.platform in ("msvc", "win32"): platform_name = "windows" elif sys.platform == "darwin": platform_name = "darwin" else: platform_name = "linux" arch = self.get_arch_name(platform_name) print("BUILD ARCH:", arch) objects = self.compile_objects(platform_name, arch, OBJ_DIR) # Work around max line length in Windows, by making a local directory # for the objects short_dir = "z" if not os.path.exists(short_dir): os.mkdir(short_dir) short_paths = [] for object_path in objects: assert os.path.exists(object_path), object_path dir_name, filename = os.path.split(object_path) new_path = os.path.join(short_dir, filename) shutil.copyfile(object_path, new_path) assert os.path.exists(new_path), new_path short_paths.append(new_path) root = os.path.abspath(os.path.dirname(__file__)) for e in self.extensions: e.include_dirs.append(os.path.join(root, "include")) e.include_dirs.append( os.path.join(INCLUDE, "%s-%s" % (platform_name, arch)) ) e.extra_objects = list(short_paths) 
distutils.command.build_ext.build_ext.build_extensions(self) shutil.rmtree(short_dir) def get_arch_name(self, platform_name): platform_machine = platform.machine() # User-defined if "BLIS_ARCH" in os.environ: return os.environ["BLIS_ARCH"] # Lookup try: return PLATFORM_TO_ARCH[platform_name][platform_machine] except KeyError: pass # Windows has various names for x86_64 :( if platform_name == "windows": return "x86_64" # Check if gcc/clang supports SVE. if platform_name == "linux" and platform_machine == "aarch64": if self.check_compiler_arch("armv8-a+sve") and self.check_header( "arm_sve.h" ): return "arm64" else: return "arm64_no_sve" # Unknown CPU architecture. if platform_machine != "x86_64": return "generic" # Linux/Darwin x86_64 # Try to detect which compiler flags are supported supports_znver1 = self.check_compiler_arch("znver1") supports_znver2 = self.check_compiler_arch("znver2") supports_znver3 = self.check_compiler_arch("znver3") supports_skx = self.check_compiler_arch("skylake-avx512") if supports_znver3 and supports_skx: return "x86_64" elif supports_znver2 and supports_skx: return "x86_64_no_zen3" elif supports_znver1 and supports_skx: return "x86_64_no_zen2" elif not supports_znver1 or not supports_skx: return "x86_64_no_skx" else: return "generic" def _check_compiler_flag(self, flag): supports_flag = True DEVNULL = os.open(os.devnull, os.O_RDWR) try: subprocess.check_call( " ".join(self.compiler.compiler) + " {flag} -E -xc - -o -".format(flag=flag), stdin=DEVNULL, stdout=DEVNULL, stderr=DEVNULL, shell=True, ) except Exception: supports_flag = False os.close(DEVNULL) return supports_flag def check_compiler_arch(self, arch): return self._check_compiler_flag("-march={arch}".format(arch=arch)) def check_header(self, header): return self._check_compiler_flag("-include {header}".format(header=header)) def get_compiler_name(self): if "BLIS_COMPILER" in os.environ: return os.environ["BLIS_COMPILER"] elif "CC" in os.environ: return os.environ["CC"] else: 
return None def compile_objects(self, platform, py_arch, obj_dir): objects = [] platform_arch = platform + "-" + py_arch compiler = self.get_compiler_name() with open(os.path.join(BLIS_DIR, "make", "%s.jsonl" % platform_arch)) as file_: env = {} for line in file_: spec = json.loads(line) if "environment" in spec: env = spec["environment"] print(env) continue _, target_name = os.path.split(spec["target"]) if platform == "windows": target_name = target_name.replace("/", "\\") spec["source"] = spec["source"].replace("/", "\\") spec["include"] = [ inc.replace("/", "\\") for inc in spec["include"] ] spec["include"].append( "-I" + os.path.join(INCLUDE, "%s" % platform_arch) ) spec["target"] = os.path.join(obj_dir, target_name) spec["source"] = os.path.join(BLIS_DIR, spec["source"]) if compiler is not None: spec["compiler"] = compiler if platform == "windows": spec["compiler"] = locate_windows_llvm() if compiler is not None and "clang" in compiler: spec["flags"] = [ f for f in spec["flags"] if "no-avx256-split-unaligned-store" not in f ] # Ensure that symbols are visible to aid debugging and profiling. spec["flags"] = [ f for f in spec["flags"] if "visibility=hidden" not in f ] objects.append(self.build_object(env=env, **spec)) return objects def build_object(self, compiler, source, target, flags, macros, include, env=None): if os.path.exists(target): return target if not os.path.exists(source): raise IOError("Cannot find source file: %s" % source) command = compiler.split() command.extend(["-c", source, "-o", target]) command.extend(flags) command.extend(macros) command.extend(include) print("[COMMAND]", " ".join(command)) # TODO: change this to subprocess.run etc. 
once we drop 2.7 subprocess.check_call(command, cwd=BLIS_DIR) return target @contextlib.contextmanager def chdir(new_dir): old_dir = os.getcwd() try: os.chdir(new_dir) sys.path.insert(0, new_dir) yield finally: del sys.path[0] os.chdir(old_dir) PWD = os.path.join(os.path.abspath(os.path.dirname("."))) SRC = os.path.join(PWD, "blis") BLIS_DIR = os.path.join(SRC, "_src") INCLUDE = os.path.join(PWD, "blis", "_src", "include") COMPILER = os.environ.get("BLIS_COMPILER", "gcc") BLIS_REALLY_COMPILE = os.environ.get("BLIS_REALLY_COMPILE", 0) if not BLIS_REALLY_COMPILE: try: import pip version_parts = pip.__version__.split(".") major = int(version_parts[0]) minor = int(version_parts[1]) if major < 19 or (major == 19 and minor < 3): print( "WARNING: pip versions <19.3 (currently installed: " + pip.__version__ + ") are unable to detect binary wheel compatibility for blis. To avoid a source install with a very long compilation time, please upgrade pip with `pip install --upgrade pip`.\n\nIf you know what you're doing and you really want to compile blis from source, please set the environment variable BLIS_REALLY_COMPILE=1." 
) sys.exit(1) except Exception: pass if len(sys.argv) > 1 and sys.argv[1] == "clean": clean(PWD) OBJ_DIR = tempfile.mkdtemp() root = os.path.abspath(os.path.dirname(__file__)) with chdir(root): with open(os.path.join(root, "blis", "about.py")) as f: about = {} exec(f.read(), about) with io.open(os.path.join(root, "README.md"), encoding="utf8") as f: readme = f.read() setup( setup_requires=[ "cython>=0.25", "numpy>=1.15.0", ], install_requires=["numpy>=1.15.0"], ext_modules=cythonize( [ Extension( "blis.cy", [os.path.join("blis", "cy.pyx")], extra_compile_args=["-std=c99"], ), Extension( "blis.py", [os.path.join("blis", "py.pyx")], extra_compile_args=["-std=c99"], ), ], language_level=2, ), cmdclass={"build_ext": ExtensionBuilder}, package_data={"": ["*.json", "*.jsonl", "*.pyx", "*.pxd"]}, name="blis", packages=["blis", "blis.tests"], author=about["__author__"], author_email=about["__email__"], version=about["__version__"], url=about["__uri__"], license=about["__license__"], description=about["__summary__"], long_description=readme, long_description_content_type="text/markdown", classifiers=[ "Development Status :: 4 - Beta", "Environment :: Console", "Intended Audience :: Developers", "Intended Audience :: Information Technology", "License :: OSI Approved :: BSD License", "Operating System :: POSIX :: Linux", "Operating System :: MacOS :: MacOS X", "Programming Language :: Cython", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Topic :: Scientific/Engineering", ], ) shutil.rmtree(OBJ_DIR) cython-blis-0.9.1/tests/000077500000000000000000000000001427272030600151215ustar00rootroot00000000000000cython-blis-0.9.1/tests/blis_tests_common.py000066400000000000000000000047631427272030600212300ustar00rootroot00000000000000from __future__ import print_function import numpy as 
def parse_layer(layer_data):
    """Split a packed 2-D array into an input vector, a bias and a weight matrix.

    The first row (without its first element) is used as the input ``x``,
    the first column (without its first element) as the bias ``b``, and the
    remaining lower-right sub-matrix as the weights ``W``.

    Uses hypothesis' ``assume`` to reject examples whose values sum to NaN or
    contain an infinity, so it is meant to be called from inside a
    hypothesis-driven test.

    Args:
        layer_data: 2-D numpy array holding ``x``, ``b`` and ``W`` as
            described above.

    Returns:
        Tuple ``(x, b, W)`` with ``x.shape[0] == W.shape[1]`` and
        ``b.shape[0] == W.shape[0]``.
    """
    # Get the first row, excluding the first column
    x = layer_data[0, 1:]
    # Get the first column, excluding the first row.
    # np.ascontiguousarray is super important here: a column slice is a
    # strided view (assuming layer_data is C-ordered), and presumably the
    # consumers of b need a contiguous float64 buffer -- confirm with callers.
    b = np.ascontiguousarray(layer_data[1:, 0], dtype='float64')
    # Slice out the row and the column used for the X and the bias
    W = layer_data[1:, 1:]
    assert x.ndim == 1
    assert b.ndim == 1
    assert b.shape[0] == W.shape[0]
    assert x.shape[0] == W.shape[1]
    # Reject examples whose sum is NaN.
    for values in (W, x, b):
        assume(not np.isnan(values.sum()))
    # Reject examples containing +/-inf; vectorised instead of testing each
    # element in a Python-level loop.
    for values in (W, x, b):
        assume(not np.isinf(values).any())
    return x, b, W
* from blis.py import gemm def _stretch_matrix(data, m, n): orig_len = len(data) orig_m = m orig_n = n ratio = sqrt(len(data) / (m * n)) m = int(floor(m * ratio)) n = int(floor(n * ratio)) data = np.ascontiguousarray(data[:m*n], dtype=data.dtype) return data.reshape((m, n)), m, n def _reshape_for_gemm(A, B, a_rows, a_cols, out_cols, dtype, trans_a=False, trans_b=False): A, a_rows, a_cols = _stretch_matrix(A, a_rows, a_cols) if len(B) < a_cols or a_cols < 1: return (None, None, None) b_cols = int(floor(len(B) / a_cols)) B = np.ascontiguousarray(B.flatten()[:a_cols*b_cols], dtype=dtype) B = B.reshape((a_cols, b_cols)) out_cols = B.shape[1] C = np.zeros(shape=(A.shape[0], B.shape[1]), dtype=dtype) if trans_a: A = np.ascontiguousarray(A.T, dtype=dtype) return A, B, C def test_incompatible_shape(): with pytest.raises(ValueError): gemm(np.zeros((2, 2)), np.zeros((3, 2))) with pytest.raises(ValueError): gemm(np.zeros((3, 2)), np.zeros((2, 2)), trans1=True) with pytest.raises(ValueError): gemm(np.zeros((2, 2)), np.zeros((2, 3)), trans2=True) with pytest.raises(ValueError): gemm(np.zeros((3, 2)), np.zeros((3, 2)), trans1=True, trans2=True) @given( ndarrays(min_len=10, max_len=100, min_val=-100.0, max_val=100.0, dtype='float64'), ndarrays(min_len=10, max_len=100, min_val=-100.0, max_val=100.0, dtype='float64'), integers(min_value=2, max_value=1000), integers(min_value=2, max_value=1000), integers(min_value=2, max_value=1000)) def test_memoryview_double_notrans(A, B, a_rows, a_cols, out_cols): A, B, C = _reshape_for_gemm(A, B, a_rows, a_cols, out_cols, 'float64') assume(A is not None) assume(B is not None) assume(C is not None) assume(A.size >= 1) assume(B.size >= 1) assume(C.size >= 1) gemm(A, B, out=C) numpy_result = A.dot(B) assert_allclose(numpy_result, C, atol=1e-3, rtol=1e-3) @given( ndarrays(min_len=10, max_len=100, min_val=-100.0, max_val=100.0, dtype='float32'), ndarrays(min_len=10, max_len=100, min_val=-100.0, max_val=100.0, dtype='float32'), integers(min_value=2, 
max_value=1000), integers(min_value=2, max_value=1000), integers(min_value=2, max_value=1000)) def test_memoryview_float_notrans(A, B, a_rows, a_cols, out_cols): A, B, C = _reshape_for_gemm(A, B, a_rows, a_cols, out_cols, dtype='float32') assume(A is not None) assume(B is not None) assume(C is not None) assume(A.size >= 1) assume(B.size >= 1) assume(C.size >= 1) gemm(A, B, out=C) numpy_result = A.dot(B) assert_allclose(numpy_result, C, atol=1e-3, rtol=1e-3)